From 849369d6c66d3054688672f97d31fceb8e8230fb Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 25 Dec 2015 04:40:36 +0000
Subject: initial_commit

---
 fs/9p/Kconfig | 34 +
 fs/9p/Makefile | 17 +
 fs/9p/acl.c | 407 +
 fs/9p/acl.h | 49 +
 fs/9p/cache.c | 415 +
 fs/9p/cache.h | 139 +
 fs/9p/fid.c | 309 +
 fs/9p/fid.h | 50 +
 fs/9p/v9fs.c | 624 ++
 fs/9p/v9fs.h | 227 +
 fs/9p/v9fs_vfs.h | 87 +
 fs/9p/vfs_addr.c | 353 +
 fs/9p/vfs_dentry.c | 147 +
 fs/9p/vfs_dir.c | 318 +
 fs/9p/vfs_file.c | 771 ++
 fs/9p/vfs_inode.c | 1487 ++++
 fs/9p/vfs_inode_dotl.c | 1003 +++
 fs/9p/vfs_super.c | 368 +
 fs/9p/xattr.c | 172 +
 fs/9p/xattr.h | 33 +
 fs/9p/xattr_user.c | 80 +
 fs/Kconfig | 277 +
 fs/Kconfig.binfmt | 163 +
 fs/Makefile | 129 +
 fs/adfs/Kconfig | 27 +
 fs/adfs/Makefile | 7 +
 fs/adfs/adfs.h | 204 +
 fs/adfs/dir.c | 296 +
 fs/adfs/dir_f.c | 486 ++
 fs/adfs/dir_f.h | 65 +
 fs/adfs/dir_fplus.c | 265 +
 fs/adfs/dir_fplus.h | 45 +
 fs/adfs/file.c | 37 +
 fs/adfs/inode.c | 366 +
 fs/adfs/map.c | 290 +
 fs/adfs/super.c | 543 ++
 fs/affs/Changes | 343 +
 fs/affs/Kconfig | 21 +
 fs/affs/Makefile | 9 +
 fs/affs/affs.h | 305 +
 fs/affs/amigaffs.c | 515 ++
 fs/affs/bitmap.c | 390 +
 fs/affs/dir.c | 156 +
 fs/affs/file.c | 936 +++
 fs/affs/inode.c | 413 +
 fs/affs/namei.c | 458 ++
 fs/affs/super.c | 626 ++
 fs/affs/symlink.c | 81 +
 fs/afs/Kconfig | 30 +
 fs/afs/Makefile | 32 +
 fs/afs/afs.h | 177 +
 fs/afs/afs_cm.h | 33 +
 fs/afs/afs_fs.h | 56 +
 fs/afs/afs_vl.h | 84 +
 fs/afs/cache.c | 402 +
 fs/afs/callback.c | 477 ++
 fs/afs/cell.c | 460 ++
 fs/afs/cmservice.c | 585 ++
 fs/afs/dir.c | 1184 +++
 fs/afs/file.c | 378 +
 fs/afs/flock.c | 588 ++
 fs/afs/fsclient.c | 1905 +++++
 fs/afs/inode.c | 498 ++
 fs/afs/internal.h | 890 +++
 fs/afs/main.c | 184 +
 fs/afs/misc.c | 75 +
 fs/afs/mntpt.c | 284 +
 fs/afs/netdevices.c | 68 +
 fs/afs/proc.c | 744 ++
 fs/afs/rxrpc.c | 859 ++
 fs/afs/security.c | 363 +
 fs/afs/server.c | 328 +
 fs/afs/super.c | 550 +
 fs/afs/vlclient.c | 219 +
 fs/afs/vlocation.c | 726 ++
 fs/afs/vnode.c | 1025 +++
 fs/afs/volume.c | 401 +
 fs/afs/write.c | 754 ++
 fs/aio.c | 1795 +++++
 fs/anon_inodes.c | 244 +
 fs/attr.c | 252 +
 fs/autofs4/Kconfig | 20 +
 fs/autofs4/Makefile | 7 +
 fs/autofs4/autofs_i.h | 350 +
 fs/autofs4/dev-ioctl.c | 762 ++
 fs/autofs4/expire.c | 588 +
 fs/autofs4/init.c | 51 +
 fs/autofs4/inode.c | 353 +
 fs/autofs4/root.c | 895 +++
 fs/autofs4/symlink.c | 24 +
 fs/autofs4/waitq.c | 552 +
 fs/bad_inode.c | 360 +
 fs/befs/ChangeLog | 417 +
 fs/befs/Kconfig | 26 +
 fs/befs/Makefile | 7 +
 fs/befs/TODO | 14 +
 fs/befs/befs.h | 156 +
 fs/befs/befs_fs_types.h | 255 +
 fs/befs/btree.c | 787 ++
 fs/befs/btree.h | 13 +
 fs/befs/datastream.c | 526 ++
 fs/befs/datastream.h | 19 +
 fs/befs/debug.c | 282 +
 fs/befs/endian.h | 125 +
 fs/befs/inode.c | 52 +
 fs/befs/inode.h | 8 +
 fs/befs/io.c | 83 +
 fs/befs/io.h | 9 +
 fs/befs/linuxvfs.c | 979 +++
 fs/befs/super.c | 112 +
 fs/befs/super.h | 8 +
 fs/bfs/Kconfig | 19 +
 fs/bfs/Makefile | 7 +
 fs/bfs/bfs.h | 59 +
 fs/bfs/dir.c | 374 +
 fs/bfs/file.c | 194 +
 fs/bfs/inode.c | 496 ++
 fs/binfmt_aout.c | 467 ++
 fs/binfmt_elf.c | 2092 +++++
 fs/binfmt_elf_fdpic.c | 1875 +++++
 fs/binfmt_em86.c | 113 +
 fs/binfmt_flat.c | 960 +++
 fs/binfmt_misc.c | 746 +
 fs/binfmt_script.c | 118 +
 fs/binfmt_som.c | 304 +
 fs/bio-integrity.c | 807 ++
 fs/bio.c | 1692 ++++
 fs/block_dev.c | 1671 ++++
 fs/btrfs/Kconfig | 33 +
 fs/btrfs/Makefile | 10 +
 fs/btrfs/acl.c | 335 +
 fs/btrfs/async-thread.c | 718 ++
 fs/btrfs/async-thread.h | 119 +
 fs/btrfs/btrfs_inode.h | 187 +
 fs/btrfs/compat.h | 7 +
 fs/btrfs/compression.c | 1029 +++
 fs/btrfs/compression.h | 83 +
 fs/btrfs/ctree.c | 4402
+++++++++++ fs/btrfs/ctree.h | 2683 +++++++ fs/btrfs/delayed-inode.c | 1773 +++++ fs/btrfs/delayed-inode.h | 145 + fs/btrfs/delayed-ref.c | 711 ++ fs/btrfs/delayed-ref.h | 205 + fs/btrfs/dir-item.c | 453 ++ fs/btrfs/disk-io.c | 3096 ++++++++ fs/btrfs/disk-io.h | 97 + fs/btrfs/export.c | 317 + fs/btrfs/export.h | 19 + fs/btrfs/extent-tree.c | 7347 +++++++++++++++++ fs/btrfs/extent_io.c | 3890 +++++++++ fs/btrfs/extent_io.h | 300 + fs/btrfs/extent_map.c | 418 + fs/btrfs/extent_map.h | 66 + fs/btrfs/file-item.c | 869 ++ fs/btrfs/file.c | 1683 ++++ fs/btrfs/free-space-cache.c | 2693 +++++++ fs/btrfs/free-space-cache.h | 113 + fs/btrfs/hash.h | 27 + fs/btrfs/inode-item.c | 234 + fs/btrfs/inode-map.c | 545 ++ fs/btrfs/inode-map.h | 13 + fs/btrfs/inode.c | 7459 ++++++++++++++++++ fs/btrfs/ioctl.c | 2914 +++++++ fs/btrfs/ioctl.h | 251 + fs/btrfs/locking.c | 208 + fs/btrfs/locking.h | 29 + fs/btrfs/lzo.c | 427 + fs/btrfs/ordered-data.c | 983 +++ fs/btrfs/ordered-data.h | 179 + fs/btrfs/orphan.c | 91 + fs/btrfs/print-tree.c | 338 + fs/btrfs/print-tree.h | 23 + fs/btrfs/ref-cache.c | 68 + fs/btrfs/ref-cache.h | 52 + fs/btrfs/relocation.c | 4413 +++++++++++ fs/btrfs/root-tree.c | 450 ++ fs/btrfs/scrub.c | 1395 ++++ fs/btrfs/struct-funcs.c | 139 + fs/btrfs/super.c | 1305 +++ fs/btrfs/sysfs.c | 46 + fs/btrfs/transaction.c | 1463 ++++ fs/btrfs/transaction.h | 117 + fs/btrfs/tree-defrag.c | 145 + fs/btrfs/tree-log.c | 3339 ++++++++ fs/btrfs/tree-log.h | 52 + fs/btrfs/version.h | 4 + fs/btrfs/volumes.c | 3718 +++++++++ fs/btrfs/volumes.h | 218 + fs/btrfs/xattr.c | 395 + fs/btrfs/xattr.h | 43 + fs/btrfs/zlib.c | 399 + fs/buffer.c | 3326 ++++++++ fs/cachefiles/Kconfig | 39 + fs/cachefiles/Makefile | 18 + fs/cachefiles/bind.c | 283 + fs/cachefiles/daemon.c | 752 ++ fs/cachefiles/interface.c | 470 ++ fs/cachefiles/internal.h | 354 + fs/cachefiles/key.c | 159 + fs/cachefiles/main.c | 106 + fs/cachefiles/namei.c | 988 +++ fs/cachefiles/proc.c | 134 + fs/cachefiles/rdwr.c | 984 +++ fs/cachefiles/security.c | 120 + fs/cachefiles/xattr.c | 292 + fs/ceph/Kconfig | 18 + fs/ceph/Makefile | 11 + fs/ceph/addr.c | 1181 +++ fs/ceph/caps.c | 3087 ++++++++ fs/ceph/ceph_frag.c | 22 + fs/ceph/debugfs.c | 273 + fs/ceph/dir.c | 1275 +++ fs/ceph/export.c | 249 + fs/ceph/file.c | 828 ++ fs/ceph/inode.c | 1842 +++++ fs/ceph/ioctl.c | 255 + fs/ceph/ioctl.h | 44 + fs/ceph/locks.c | 286 + fs/ceph/mds_client.c | 3454 ++++++++ fs/ceph/mds_client.h | 379 + fs/ceph/mdsmap.c | 179 + fs/ceph/snap.c | 916 +++ fs/ceph/strings.c | 117 + fs/ceph/super.c | 921 +++ fs/ceph/super.h | 833 ++ fs/ceph/xattr.c | 854 ++ fs/char_dev.c | 575 ++ fs/cifs/AUTHORS | 55 + fs/cifs/CHANGES | 1065 +++ fs/cifs/Kconfig | 161 + fs/cifs/Makefile | 17 + fs/cifs/README | 748 ++ fs/cifs/TODO | 129 + fs/cifs/asn1.c | 666 ++ fs/cifs/cache.c | 333 + fs/cifs/cifs_debug.c | 782 ++ fs/cifs/cifs_debug.h | 96 + fs/cifs/cifs_dfs_ref.c | 370 + fs/cifs/cifs_fs_sb.h | 65 + fs/cifs/cifs_spnego.c | 175 + fs/cifs/cifs_spnego.h | 47 + fs/cifs/cifs_unicode.c | 332 + fs/cifs/cifs_unicode.h | 383 + fs/cifs/cifs_uniupr.h | 253 + fs/cifs/cifsacl.c | 1142 +++ fs/cifs/cifsacl.h | 103 + fs/cifs/cifsencrypt.c | 762 ++ fs/cifs/cifsfs.c | 1215 +++ fs/cifs/cifsfs.h | 133 + fs/cifs/cifsglob.h | 951 +++ fs/cifs/cifspdu.h | 2641 +++++++ fs/cifs/cifsproto.h | 460 ++ fs/cifs/cifssmb.c | 6088 ++++++++++++++ fs/cifs/connect.c | 3719 +++++++++ fs/cifs/dir.c | 742 ++ fs/cifs/dns_resolve.c | 98 + fs/cifs/dns_resolve.h | 30 + fs/cifs/export.c | 67 + fs/cifs/file.c | 2471 ++++++ fs/cifs/fscache.c | 235 + 
fs/cifs/fscache.h | 136 + fs/cifs/inode.c | 2272 ++++++ fs/cifs/ioctl.c | 102 + fs/cifs/link.c | 593 ++ fs/cifs/misc.c | 685 ++ fs/cifs/netmisc.c | 1008 +++ fs/cifs/nterr.c | 687 ++ fs/cifs/nterr.h | 561 ++ fs/cifs/ntlmssp.h | 128 + fs/cifs/readdir.c | 887 +++ fs/cifs/rfc1002pdu.h | 74 + fs/cifs/sess.c | 952 +++ fs/cifs/smbencrypt.c | 401 + fs/cifs/smberr.h | 184 + fs/cifs/smbfsctl.h | 84 + fs/cifs/transport.c | 931 +++ fs/cifs/xattr.c | 420 + fs/coda/Kconfig | 21 + fs/coda/Makefile | 12 + fs/coda/cache.c | 122 + fs/coda/cnode.c | 174 + fs/coda/coda_cache.h | 22 + fs/coda/coda_fs_i.h | 58 + fs/coda/coda_int.h | 20 + fs/coda/coda_linux.c | 192 + fs/coda/coda_linux.h | 101 + fs/coda/dir.c | 650 ++ fs/coda/file.c | 234 + fs/coda/inode.c | 329 + fs/coda/pioctl.c | 90 + fs/coda/psdev.c | 430 + fs/coda/symlink.c | 50 + fs/coda/sysctl.c | 73 + fs/coda/upcall.c | 883 +++ fs/compat.c | 2061 +++++ fs/compat_binfmt_elf.c | 133 + fs/compat_ioctl.c | 1656 ++++ fs/configfs/Kconfig | 11 + fs/configfs/Makefile | 7 + fs/configfs/configfs_internal.h | 161 + fs/configfs/dir.c | 1766 +++++ fs/configfs/file.c | 344 + fs/configfs/inode.c | 297 + fs/configfs/item.c | 216 + fs/configfs/mount.c | 187 + fs/configfs/symlink.c | 320 + fs/cramfs/Kconfig | 19 + fs/cramfs/Makefile | 7 + fs/cramfs/README | 168 + fs/cramfs/inode.c | 601 ++ fs/cramfs/uncompress.c | 77 + fs/dcache.c | 3124 ++++++++ fs/dcookies.c | 345 + fs/debugfs/Makefile | 4 + fs/debugfs/file.c | 527 ++ fs/debugfs/inode.c | 544 ++ fs/devpts/Makefile | 7 + fs/devpts/inode.c | 572 ++ fs/direct-io.c | 1256 +++ fs/dlm/Kconfig | 16 + fs/dlm/Makefile | 21 + fs/dlm/ast.c | 346 + fs/dlm/ast.h | 31 + fs/dlm/config.c | 1010 +++ fs/dlm/config.h | 47 + fs/dlm/debug_fs.c | 731 ++ fs/dlm/dir.c | 441 ++ fs/dlm/dir.h | 30 + fs/dlm/dlm_internal.h | 616 ++ fs/dlm/lock.c | 5115 ++++++++++++ fs/dlm/lock.h | 70 + fs/dlm/lockspace.c | 844 ++ fs/dlm/lockspace.h | 26 + fs/dlm/lowcomms.c | 1572 ++++ fs/dlm/lowcomms.h | 25 + fs/dlm/lvb_table.h | 18 + fs/dlm/main.c | 95 + fs/dlm/member.c | 390 + fs/dlm/member.h | 25 + fs/dlm/memory.c | 92 + fs/dlm/memory.h | 27 + fs/dlm/midcomms.c | 137 + fs/dlm/midcomms.h | 21 + fs/dlm/netlink.c | 139 + fs/dlm/plock.c | 503 ++ fs/dlm/rcom.c | 511 ++ fs/dlm/rcom.h | 25 + fs/dlm/recover.c | 789 ++ fs/dlm/recover.h | 34 + fs/dlm/recoverd.c | 323 + fs/dlm/recoverd.h | 24 + fs/dlm/requestqueue.c | 188 + fs/dlm/requestqueue.h | 22 + fs/dlm/user.c | 1011 +++ fs/dlm/user.h | 19 + fs/dlm/util.c | 154 + fs/dlm/util.h | 22 + fs/drop_caches.c | 67 + fs/ecryptfs/Kconfig | 14 + fs/ecryptfs/Makefile | 7 + fs/ecryptfs/crypto.c | 2238 ++++++ fs/ecryptfs/debug.c | 121 + fs/ecryptfs/dentry.c | 105 + fs/ecryptfs/ecryptfs_kernel.h | 771 ++ fs/ecryptfs/file.c | 379 + fs/ecryptfs/inode.c | 1228 +++ fs/ecryptfs/keystore.c | 2530 ++++++ fs/ecryptfs/kthread.c | 197 + fs/ecryptfs/main.c | 867 ++ fs/ecryptfs/messaging.c | 578 ++ fs/ecryptfs/miscdev.c | 547 ++ fs/ecryptfs/mmap.c | 566 ++ fs/ecryptfs/read_write.c | 357 + fs/ecryptfs/super.c | 181 + fs/efs/Kconfig | 14 + fs/efs/Makefile | 7 + fs/efs/dir.c | 110 + fs/efs/efs.h | 140 + fs/efs/file.c | 60 + fs/efs/inode.c | 313 + fs/efs/namei.c | 118 + fs/efs/super.c | 359 + fs/efs/symlink.c | 54 + fs/eventfd.c | 439 ++ fs/eventpoll.c | 1818 +++++ fs/exec.c | 2247 ++++++ fs/exofs/BUGS | 3 + fs/exofs/Kbuild | 16 + fs/exofs/Kconfig | 13 + fs/exofs/common.h | 262 + fs/exofs/dir.c | 673 ++ fs/exofs/exofs.h | 308 + fs/exofs/file.c | 77 + fs/exofs/inode.c | 1374 ++++ fs/exofs/ios.c | 803 ++ fs/exofs/namei.c | 336 + fs/exofs/pnfs.h | 45 
+ fs/exofs/super.c | 1001 +++ fs/exofs/symlink.c | 55 + fs/exportfs/Makefile | 6 + fs/exportfs/expfs.c | 497 ++ fs/ext2/Kconfig | 55 + fs/ext2/Makefile | 13 + fs/ext2/acl.c | 445 ++ fs/ext2/acl.h | 78 + fs/ext2/balloc.c | 1555 ++++ fs/ext2/dir.c | 733 ++ fs/ext2/ext2.h | 182 + fs/ext2/file.c | 107 + fs/ext2/ialloc.c | 674 ++ fs/ext2/inode.c | 1552 ++++ fs/ext2/ioctl.c | 178 + fs/ext2/namei.c | 427 + fs/ext2/super.c | 1525 ++++ fs/ext2/symlink.c | 54 + fs/ext2/xattr.c | 1031 +++ fs/ext2/xattr.h | 127 + fs/ext2/xattr_security.c | 76 + fs/ext2/xattr_trusted.c | 58 + fs/ext2/xattr_user.c | 62 + fs/ext2/xip.c | 92 + fs/ext2/xip.h | 26 + fs/ext3/Kconfig | 89 + fs/ext3/Makefile | 12 + fs/ext3/acl.c | 481 ++ fs/ext3/acl.h | 77 + fs/ext3/balloc.c | 2156 +++++ fs/ext3/bitmap.c | 32 + fs/ext3/dir.c | 519 ++ fs/ext3/ext3_jbd.c | 59 + fs/ext3/file.c | 85 + fs/ext3/fsync.c | 95 + fs/ext3/hash.c | 208 + fs/ext3/ialloc.c | 768 ++ fs/ext3/inode.c | 3516 +++++++++ fs/ext3/ioctl.c | 352 + fs/ext3/namei.c | 2550 ++++++ fs/ext3/namei.h | 8 + fs/ext3/resize.c | 1120 +++ fs/ext3/super.c | 3084 ++++++++ fs/ext3/symlink.c | 56 + fs/ext3/xattr.c | 1336 ++++ fs/ext3/xattr.h | 138 + fs/ext3/xattr_security.c | 78 + fs/ext3/xattr_trusted.c | 59 + fs/ext3/xattr_user.c | 62 + fs/ext4/Kconfig | 85 + fs/ext4/Makefile | 14 + fs/ext4/acl.c | 479 ++ fs/ext4/acl.h | 77 + fs/ext4/balloc.c | 622 ++ fs/ext4/bitmap.c | 31 + fs/ext4/block_validity.c | 248 + fs/ext4/dir.c | 542 ++ fs/ext4/ext4.h | 2236 ++++++ fs/ext4/ext4_extents.h | 294 + fs/ext4/ext4_jbd2.c | 152 + fs/ext4/ext4_jbd2.h | 327 + fs/ext4/extents.c | 4364 +++++++++++ fs/ext4/file.c | 286 + fs/ext4/fsync.c | 225 + fs/ext4/hash.c | 208 + fs/ext4/ialloc.c | 1338 ++++ fs/ext4/inode.c | 5957 ++++++++++++++ fs/ext4/ioctl.c | 442 ++ fs/ext4/mballoc.c | 4958 ++++++++++++ fs/ext4/mballoc.h | 222 + fs/ext4/migrate.c | 634 ++ fs/ext4/mmp.c | 351 + fs/ext4/move_extent.c | 1424 ++++ fs/ext4/namei.c | 2612 ++++++ fs/ext4/page-io.c | 447 ++ fs/ext4/resize.c | 1076 +++ fs/ext4/super.c | 5027 ++++++++++++ fs/ext4/symlink.c | 56 + fs/ext4/xattr.c | 1612 ++++ fs/ext4/xattr.h | 155 + fs/ext4/xattr_security.c | 78 + fs/ext4/xattr_trusted.c | 59 + fs/ext4/xattr_user.c | 62 + fs/fat/Kconfig | 100 + fs/fat/Makefile | 11 + fs/fat/cache.c | 351 + fs/fat/dir.c | 1371 ++++ fs/fat/fat.h | 351 + fs/fat/fatent.c | 685 ++ fs/fat/file.c | 446 ++ fs/fat/inode.c | 1619 ++++ fs/fat/misc.c | 278 + fs/fat/namei_msdos.c | 702 ++ fs/fat/namei_vfat.c | 1110 +++ fs/fcntl.c | 853 ++ fs/fhandle.c | 266 + fs/fifo.c | 154 + fs/file.c | 486 ++ fs/file_table.c | 554 ++ fs/filesystems.c | 287 + fs/freevxfs/Kconfig | 16 + fs/freevxfs/Makefile | 8 + fs/freevxfs/vxfs.h | 263 + fs/freevxfs/vxfs_bmap.c | 281 + fs/freevxfs/vxfs_dir.h | 92 + fs/freevxfs/vxfs_extern.h | 81 + fs/freevxfs/vxfs_fshead.c | 203 + fs/freevxfs/vxfs_fshead.h | 67 + fs/freevxfs/vxfs_immed.c | 115 + fs/freevxfs/vxfs_inode.c | 361 + fs/freevxfs/vxfs_inode.h | 180 + fs/freevxfs/vxfs_lookup.c | 322 + fs/freevxfs/vxfs_olt.c | 130 + fs/freevxfs/vxfs_olt.h | 145 + fs/freevxfs/vxfs_subr.c | 183 + fs/freevxfs/vxfs_super.c | 287 + fs/fs-writeback.c | 1373 ++++ fs/fs_struct.c | 201 + fs/fscache/Kconfig | 61 + fs/fscache/Makefile | 20 + fs/fscache/cache.c | 420 + fs/fscache/cookie.c | 514 ++ fs/fscache/fsdef.c | 144 + fs/fscache/histogram.c | 109 + fs/fscache/internal.h | 438 ++ fs/fscache/main.c | 218 + fs/fscache/netfs.c | 103 + fs/fscache/object-list.c | 432 + fs/fscache/object.c | 892 +++ fs/fscache/operation.c | 463 ++ fs/fscache/page.c | 996 +++ 
fs/fscache/proc.c | 81 + fs/fscache/stats.c | 280 + fs/fuse/Kconfig | 15 + fs/fuse/Makefile | 8 + fs/fuse/control.c | 359 + fs/fuse/cuse.c | 620 ++ fs/fuse/dev.c | 2049 +++++ fs/fuse/dir.c | 1638 ++++ fs/fuse/file.c | 2186 ++++++ fs/fuse/fuse_i.h | 769 ++ fs/fuse/inode.c | 1263 +++ fs/generic_acl.c | 225 + fs/gfs2/Kconfig | 38 + fs/gfs2/Makefile | 10 + fs/gfs2/acl.c | 354 + fs/gfs2/acl.h | 24 + fs/gfs2/aops.c | 1182 +++ fs/gfs2/bmap.c | 1297 +++ fs/gfs2/bmap.h | 59 + fs/gfs2/dentry.c | 141 + fs/gfs2/dir.c | 2006 +++++ fs/gfs2/dir.h | 68 + fs/gfs2/export.c | 206 + fs/gfs2/file.c | 1112 +++ fs/gfs2/gfs2.h | 26 + fs/gfs2/glock.c | 1870 +++++ fs/gfs2/glock.h | 244 + fs/gfs2/glops.c | 605 ++ fs/gfs2/glops.h | 28 + fs/gfs2/incore.h | 686 ++ fs/gfs2/inode.c | 1892 +++++ fs/gfs2/inode.h | 143 + fs/gfs2/lock_dlm.c | 235 + fs/gfs2/log.c | 972 +++ fs/gfs2/log.h | 69 + fs/gfs2/lops.c | 795 ++ fs/gfs2/lops.h | 113 + fs/gfs2/main.c | 215 + fs/gfs2/meta_io.c | 459 ++ fs/gfs2/meta_io.h | 82 + fs/gfs2/ops_fstype.c | 1407 ++++ fs/gfs2/quota.c | 1644 ++++ fs/gfs2/quota.h | 59 + fs/gfs2/recovery.c | 610 ++ fs/gfs2/recovery.h | 36 + fs/gfs2/rgrp.c | 1885 +++++ fs/gfs2/rgrp.h | 78 + fs/gfs2/super.c | 1585 ++++ fs/gfs2/super.h | 60 + fs/gfs2/sys.c | 653 ++ fs/gfs2/sys.h | 23 + fs/gfs2/trace_gfs2.h | 438 ++ fs/gfs2/trans.c | 192 + fs/gfs2/trans.h | 47 + fs/gfs2/util.c | 285 + fs/gfs2/util.h | 172 + fs/gfs2/xattr.c | 1549 ++++ fs/gfs2/xattr.h | 67 + fs/hfs/Kconfig | 12 + fs/hfs/Makefile | 10 + fs/hfs/attr.c | 121 + fs/hfs/bfind.c | 222 + fs/hfs/bitmap.c | 243 + fs/hfs/bnode.c | 478 ++ fs/hfs/brec.c | 519 ++ fs/hfs/btree.c | 366 + fs/hfs/btree.h | 168 + fs/hfs/catalog.c | 356 + fs/hfs/dir.c | 316 + fs/hfs/extent.c | 525 ++ fs/hfs/hfs.h | 289 + fs/hfs/hfs_fs.h | 276 + fs/hfs/inode.c | 673 ++ fs/hfs/mdb.c | 354 + fs/hfs/part_tbl.c | 117 + fs/hfs/string.c | 116 + fs/hfs/super.c | 493 ++ fs/hfs/sysdep.c | 45 + fs/hfs/trans.c | 150 + fs/hfsplus/Kconfig | 13 + fs/hfsplus/Makefile | 9 + fs/hfsplus/bfind.c | 224 + fs/hfsplus/bitmap.c | 235 + fs/hfsplus/bnode.c | 656 ++ fs/hfsplus/brec.c | 516 ++ fs/hfsplus/btree.c | 377 + fs/hfsplus/catalog.c | 425 + fs/hfsplus/dir.c | 516 ++ fs/hfsplus/extents.c | 561 ++ fs/hfsplus/hfsplus_fs.h | 463 ++ fs/hfsplus/hfsplus_raw.h | 337 + fs/hfsplus/inode.c | 617 ++ fs/hfsplus/ioctl.c | 221 + fs/hfsplus/options.c | 230 + fs/hfsplus/part_tbl.c | 157 + fs/hfsplus/super.c | 593 ++ fs/hfsplus/tables.c | 3245 ++++++++ fs/hfsplus/unicode.c | 452 ++ fs/hfsplus/wrapper.c | 283 + fs/hostfs/Makefile | 11 + fs/hostfs/hostfs.h | 96 + fs/hostfs/hostfs_kern.c | 1004 +++ fs/hostfs/hostfs_user.c | 387 + fs/hpfs/Kconfig | 14 + fs/hpfs/Makefile | 8 + fs/hpfs/alloc.c | 424 + fs/hpfs/anode.c | 491 ++ fs/hpfs/buffer.c | 168 + fs/hpfs/dentry.c | 64 + fs/hpfs/dir.c | 322 + fs/hpfs/dnode.c | 1084 +++ fs/hpfs/ea.c | 367 + fs/hpfs/file.c | 166 + fs/hpfs/hpfs.h | 568 ++ fs/hpfs/hpfs_fn.h | 360 + fs/hpfs/inode.c | 308 + fs/hpfs/map.c | 287 + fs/hpfs/name.c | 112 + fs/hpfs/namei.c | 628 ++ fs/hpfs/super.c | 712 ++ fs/hppfs/Makefile | 6 + fs/hppfs/hppfs.c | 773 ++ fs/hugetlbfs/Makefile | 7 + fs/hugetlbfs/inode.c | 1033 +++ fs/inode.c | 1697 ++++ fs/internal.h | 137 + fs/ioctl.c | 623 ++ fs/ioprio.c | 250 + fs/isofs/Kconfig | 39 + fs/isofs/Makefile | 10 + fs/isofs/compress.c | 380 + fs/isofs/dir.c | 288 + fs/isofs/export.c | 196 + fs/isofs/inode.c | 1579 ++++ fs/isofs/isofs.h | 184 + fs/isofs/joliet.c | 69 + fs/isofs/namei.c | 196 + fs/isofs/rock.c | 778 ++ fs/isofs/rock.h | 122 + fs/isofs/util.c | 80 + fs/isofs/zisofs.h 
| 21 + fs/jbd/Kconfig | 30 + fs/jbd/Makefile | 7 + fs/jbd/checkpoint.c | 757 ++ fs/jbd/commit.c | 953 +++ fs/jbd/journal.c | 2081 +++++ fs/jbd/recovery.c | 587 ++ fs/jbd/revoke.c | 706 ++ fs/jbd/transaction.c | 2198 ++++++ fs/jbd2/Kconfig | 33 + fs/jbd2/Makefile | 7 + fs/jbd2/checkpoint.c | 798 ++ fs/jbd2/commit.c | 1048 +++ fs/jbd2/journal.c | 2471 ++++++ fs/jbd2/recovery.c | 738 ++ fs/jbd2/revoke.c | 714 ++ fs/jbd2/transaction.c | 2227 ++++++ fs/jffs2/Kconfig | 188 + fs/jffs2/LICENCE | 30 + fs/jffs2/Makefile | 21 + fs/jffs2/README.Locking | 172 + fs/jffs2/TODO | 37 + fs/jffs2/acl.c | 440 ++ fs/jffs2/acl.h | 44 + fs/jffs2/background.c | 159 + fs/jffs2/build.c | 392 + fs/jffs2/compr.c | 360 + fs/jffs2/compr.h | 103 + fs/jffs2/compr_lzo.c | 111 + fs/jffs2/compr_rtime.c | 130 + fs/jffs2/compr_rubin.c | 455 ++ fs/jffs2/compr_zlib.c | 218 + fs/jffs2/debug.c | 860 ++ fs/jffs2/debug.h | 291 + fs/jffs2/dir.c | 873 +++ fs/jffs2/erase.c | 502 ++ fs/jffs2/file.c | 321 + fs/jffs2/fs.c | 748 ++ fs/jffs2/gc.c | 1326 ++++ fs/jffs2/ioctl.c | 22 + fs/jffs2/jffs2_fs_i.h | 56 + fs/jffs2/jffs2_fs_sb.h | 147 + fs/jffs2/malloc.c | 318 + fs/jffs2/nodelist.c | 771 ++ fs/jffs2/nodelist.h | 480 ++ fs/jffs2/nodemgmt.c | 787 ++ fs/jffs2/os-linux.h | 204 + fs/jffs2/read.c | 216 + fs/jffs2/readinode.c | 1461 ++++ fs/jffs2/scan.c | 1148 +++ fs/jffs2/security.c | 86 + fs/jffs2/summary.c | 869 ++ fs/jffs2/summary.h | 213 + fs/jffs2/super.c | 316 + fs/jffs2/symlink.c | 64 + fs/jffs2/wbuf.c | 1310 ++++ fs/jffs2/write.c | 707 ++ fs/jffs2/writev.c | 79 + fs/jffs2/xattr.c | 1330 ++++ fs/jffs2/xattr.h | 131 + fs/jffs2/xattr_trusted.c | 55 + fs/jffs2/xattr_user.c | 55 + fs/jfs/Kconfig | 50 + fs/jfs/Makefile | 16 + fs/jfs/acl.c | 211 + fs/jfs/endian24.h | 49 + fs/jfs/file.c | 154 + fs/jfs/inode.c | 414 + fs/jfs/ioctl.c | 148 + fs/jfs/jfs_acl.h | 41 + fs/jfs/jfs_btree.h | 172 + fs/jfs/jfs_debug.c | 109 + fs/jfs/jfs_debug.h | 122 + fs/jfs/jfs_dinode.h | 176 + fs/jfs/jfs_dmap.c | 3992 ++++++++++ fs/jfs/jfs_dmap.h | 314 + fs/jfs/jfs_dtree.c | 4567 +++++++++++ fs/jfs/jfs_dtree.h | 269 + fs/jfs/jfs_extent.c | 651 ++ fs/jfs/jfs_extent.h | 31 + fs/jfs/jfs_filsys.h | 282 + fs/jfs/jfs_imap.c | 3187 ++++++++ fs/jfs/jfs_imap.h | 175 + fs/jfs/jfs_incore.h | 224 + fs/jfs/jfs_inode.c | 166 + fs/jfs/jfs_inode.h | 53 + fs/jfs/jfs_lock.h | 52 + fs/jfs/jfs_logmgr.c | 2529 ++++++ fs/jfs/jfs_logmgr.h | 513 ++ fs/jfs/jfs_metapage.c | 843 ++ fs/jfs/jfs_metapage.h | 155 + fs/jfs/jfs_mount.c | 507 ++ fs/jfs/jfs_superblock.h | 121 + fs/jfs/jfs_txnmgr.c | 3101 ++++++++ fs/jfs/jfs_txnmgr.h | 311 + fs/jfs/jfs_types.h | 159 + fs/jfs/jfs_umount.c | 168 + fs/jfs/jfs_unicode.c | 138 + fs/jfs/jfs_unicode.h | 156 + fs/jfs/jfs_uniupr.c | 134 + fs/jfs/jfs_xattr.h | 75 + fs/jfs/jfs_xtree.c | 3905 +++++++++ fs/jfs/jfs_xtree.h | 132 + fs/jfs/namei.c | 1639 ++++ fs/jfs/resize.c | 543 ++ fs/jfs/super.c | 901 +++ fs/jfs/symlink.c | 52 + fs/jfs/xattr.c | 1128 +++ fs/libfs.c | 997 +++ fs/lockd/Makefile | 10 + fs/lockd/clnt4xdr.c | 605 ++ fs/lockd/clntlock.c | 279 + fs/lockd/clntproc.c | 848 ++ fs/lockd/clntxdr.c | 627 ++ fs/lockd/grace.c | 59 + fs/lockd/host.c | 649 ++ fs/lockd/mon.c | 555 ++ fs/lockd/svc.c | 568 ++ fs/lockd/svc4proc.c | 503 ++ fs/lockd/svclock.c | 966 +++ fs/lockd/svcproc.c | 544 ++ fs/lockd/svcshare.c | 106 + fs/lockd/svcsubs.c | 446 ++ fs/lockd/xdr.c | 343 + fs/lockd/xdr4.c | 334 + fs/locks.c | 2341 ++++++ fs/logfs/Kconfig | 17 + fs/logfs/Makefile | 13 + fs/logfs/compr.c | 95 + fs/logfs/dev_bdev.c | 342 + fs/logfs/dev_mtd.c | 278 + fs/logfs/dir.c | 825 
++ fs/logfs/file.c | 275 + fs/logfs/gc.c | 732 ++ fs/logfs/inode.c | 405 + fs/logfs/journal.c | 895 +++ fs/logfs/logfs.h | 734 ++ fs/logfs/logfs_abi.h | 629 ++ fs/logfs/readwrite.c | 2277 ++++++ fs/logfs/segment.c | 932 +++ fs/logfs/super.c | 671 ++ fs/mbcache.c | 620 ++ fs/minix/Kconfig | 25 + fs/minix/Makefile | 7 + fs/minix/bitmap.c | 279 + fs/minix/dir.c | 477 ++ fs/minix/file.c | 51 + fs/minix/inode.c | 661 ++ fs/minix/itree_common.c | 364 + fs/minix/itree_v1.c | 67 + fs/minix/itree_v2.c | 75 + fs/minix/minix.h | 165 + fs/minix/namei.c | 269 + fs/mpage.c | 718 ++ fs/namei.c | 3402 ++++++++ fs/namespace.c | 2730 +++++++ fs/ncpfs/Kconfig | 108 + fs/ncpfs/Makefile | 16 + fs/ncpfs/dir.c | 1280 +++ fs/ncpfs/file.c | 297 + fs/ncpfs/getopt.c | 74 + fs/ncpfs/getopt.h | 16 + fs/ncpfs/inode.c | 1072 +++ fs/ncpfs/ioctl.c | 918 +++ fs/ncpfs/mmap.c | 128 + fs/ncpfs/ncp_fs.h | 98 + fs/ncpfs/ncp_fs_i.h | 29 + fs/ncpfs/ncp_fs_sb.h | 176 + fs/ncpfs/ncplib_kernel.c | 1323 ++++ fs/ncpfs/ncplib_kernel.h | 254 + fs/ncpfs/ncpsign_kernel.c | 127 + fs/ncpfs/ncpsign_kernel.h | 26 + fs/ncpfs/sock.c | 880 +++ fs/ncpfs/symlink.c | 181 + fs/nfs/Kconfig | 144 + fs/nfs/Makefile | 25 + fs/nfs/cache_lib.c | 141 + fs/nfs/cache_lib.h | 27 + fs/nfs/callback.c | 403 + fs/nfs/callback.h | 213 + fs/nfs/callback_proc.c | 561 ++ fs/nfs/callback_xdr.c | 989 +++ fs/nfs/client.c | 2004 +++++ fs/nfs/delegation.c | 716 ++ fs/nfs/delegation.h | 80 + fs/nfs/dir.c | 2350 ++++++ fs/nfs/direct.c | 1039 +++ fs/nfs/dns_resolve.c | 372 + fs/nfs/dns_resolve.h | 26 + fs/nfs/file.c | 921 +++ fs/nfs/fscache-index.c | 337 + fs/nfs/fscache.c | 535 ++ fs/nfs/fscache.h | 222 + fs/nfs/getroot.c | 267 + fs/nfs/idmap.c | 779 ++ fs/nfs/inode.c | 1656 ++++ fs/nfs/internal.h | 456 ++ fs/nfs/iostat.h | 71 + fs/nfs/mount_clnt.c | 518 ++ fs/nfs/namespace.c | 371 + fs/nfs/nfs2xdr.c | 1157 +++ fs/nfs/nfs3acl.c | 444 ++ fs/nfs/nfs3proc.c | 890 +++ fs/nfs/nfs3xdr.c | 2498 ++++++ fs/nfs/nfs4_fs.h | 383 + fs/nfs/nfs4filelayout.c | 914 +++ fs/nfs/nfs4filelayout.h | 105 + fs/nfs/nfs4filelayoutdev.c | 636 ++ fs/nfs/nfs4namespace.c | 265 + fs/nfs/nfs4proc.c | 6114 +++++++++++++++ fs/nfs/nfs4renewd.c | 130 + fs/nfs/nfs4state.c | 1766 +++++ fs/nfs/nfs4xdr.c | 6679 ++++++++++++++++ fs/nfs/nfsroot.c | 309 + fs/nfs/objlayout/Kbuild | 5 + fs/nfs/objlayout/objio_osd.c | 1056 +++ fs/nfs/objlayout/objlayout.c | 716 ++ fs/nfs/objlayout/objlayout.h | 187 + fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 415 + fs/nfs/pagelist.c | 453 ++ fs/nfs/pnfs.c | 1319 ++++ fs/nfs/pnfs.h | 427 + fs/nfs/pnfs_dev.c | 277 + fs/nfs/proc.c | 746 ++ fs/nfs/read.c | 704 ++ fs/nfs/super.c | 3099 ++++++++ fs/nfs/symlink.c | 78 + fs/nfs/sysctl.c | 92 + fs/nfs/unlink.c | 580 ++ fs/nfs/write.c | 1732 ++++ fs/nfs_common/Makefile | 7 + fs/nfs_common/nfsacl.c | 286 + fs/nfsctl.c | 100 + fs/nfsd/Kconfig | 94 + fs/nfsd/Makefile | 13 + fs/nfsd/acl.h | 59 + fs/nfsd/auth.c | 95 + fs/nfsd/auth.h | 22 + fs/nfsd/cache.h | 83 + fs/nfsd/export.c | 1693 ++++ fs/nfsd/idmap.h | 62 + fs/nfsd/lockd.c | 79 + fs/nfsd/nfs2acl.c | 356 + fs/nfsd/nfs3acl.c | 267 + fs/nfsd/nfs3proc.c | 896 +++ fs/nfsd/nfs3xdr.c | 1112 +++ fs/nfsd/nfs4acl.c | 838 ++ fs/nfsd/nfs4callback.c | 1026 +++ fs/nfsd/nfs4idmap.c | 587 ++ fs/nfsd/nfs4proc.c | 1472 ++++ fs/nfsd/nfs4recover.c | 402 + fs/nfsd/nfs4state.c | 4485 +++++++++++ fs/nfsd/nfs4xdr.c | 3408 ++++++++ fs/nfsd/nfscache.c | 322 + fs/nfsd/nfsctl.c | 1527 ++++ fs/nfsd/nfsd.h | 340 + fs/nfsd/nfsfh.c | 693 ++ fs/nfsd/nfsfh.h | 206 + fs/nfsd/nfsproc.c | 755 ++ fs/nfsd/nfssvc.c | 663 ++ 
fs/nfsd/nfsxdr.c | 545 ++ fs/nfsd/state.h | 492 ++ fs/nfsd/stats.c | 104 + fs/nfsd/vfs.c | 2273 ++++++ fs/nfsd/vfs.h | 109 + fs/nfsd/xdr.h | 173 + fs/nfsd/xdr3.h | 344 + fs/nfsd/xdr4.h | 573 ++ fs/nilfs2/Kconfig | 25 + fs/nilfs2/Makefile | 5 + fs/nilfs2/alloc.c | 721 ++ fs/nilfs2/alloc.h | 100 + fs/nilfs2/bmap.c | 567 ++ fs/nilfs2/bmap.h | 270 + fs/nilfs2/btnode.c | 297 + fs/nilfs2/btnode.h | 53 + fs/nilfs2/btree.c | 2310 ++++++ fs/nilfs2/btree.h | 77 + fs/nilfs2/cpfile.c | 965 +++ fs/nilfs2/cpfile.h | 46 + fs/nilfs2/dat.c | 511 ++ fs/nilfs2/dat.h | 59 + fs/nilfs2/dir.c | 688 ++ fs/nilfs2/direct.c | 369 + fs/nilfs2/direct.h | 51 + fs/nilfs2/export.h | 17 + fs/nilfs2/file.c | 159 + fs/nilfs2/gcinode.c | 196 + fs/nilfs2/ifile.c | 201 + fs/nilfs2/ifile.h | 56 + fs/nilfs2/inode.c | 1064 +++ fs/nilfs2/ioctl.c | 863 ++ fs/nilfs2/mdt.c | 589 ++ fs/nilfs2/mdt.h | 110 + fs/nilfs2/namei.c | 594 ++ fs/nilfs2/nilfs.h | 326 + fs/nilfs2/page.c | 551 ++ fs/nilfs2/page.h | 80 + fs/nilfs2/recovery.c | 963 +++ fs/nilfs2/segbuf.c | 536 ++ fs/nilfs2/segbuf.h | 184 + fs/nilfs2/segment.c | 2707 +++++++ fs/nilfs2/segment.h | 246 + fs/nilfs2/sufile.c | 921 +++ fs/nilfs2/sufile.h | 144 + fs/nilfs2/super.c | 1462 ++++ fs/nilfs2/the_nilfs.c | 784 ++ fs/nilfs2/the_nilfs.h | 356 + fs/nls/Kconfig | 464 ++ fs/nls/Makefile | 44 + fs/nls/nls_ascii.c | 167 + fs/nls/nls_base.c | 524 ++ fs/nls/nls_cp1250.c | 347 + fs/nls/nls_cp1251.c | 302 + fs/nls/nls_cp1255.c | 385 + fs/nls/nls_cp437.c | 388 + fs/nls/nls_cp737.c | 351 + fs/nls/nls_cp775.c | 320 + fs/nls/nls_cp850.c | 316 + fs/nls/nls_cp852.c | 338 + fs/nls/nls_cp855.c | 300 + fs/nls/nls_cp857.c | 302 + fs/nls/nls_cp860.c | 365 + fs/nls/nls_cp861.c | 388 + fs/nls/nls_cp862.c | 422 + fs/nls/nls_cp863.c | 382 + fs/nls/nls_cp864.c | 408 + fs/nls/nls_cp865.c | 388 + fs/nls/nls_cp866.c | 306 + fs/nls/nls_cp869.c | 316 + fs/nls/nls_cp874.c | 276 + fs/nls/nls_cp932.c | 7934 +++++++++++++++++++ fs/nls/nls_cp936.c | 11112 ++++++++++++++++++++++++++ fs/nls/nls_cp949.c | 13947 +++++++++++++++++++++++++++++++++ fs/nls/nls_cp950.c | 9483 ++++++++++++++++++++++ fs/nls/nls_euc-jp.c | 581 ++ fs/nls/nls_iso8859-1.c | 258 + fs/nls/nls_iso8859-13.c | 286 + fs/nls/nls_iso8859-14.c | 342 + fs/nls/nls_iso8859-15.c | 308 + fs/nls/nls_iso8859-2.c | 309 + fs/nls/nls_iso8859-3.c | 309 + fs/nls/nls_iso8859-4.c | 309 + fs/nls/nls_iso8859-5.c | 273 + fs/nls/nls_iso8859-6.c | 264 + fs/nls/nls_iso8859-7.c | 318 + fs/nls/nls_iso8859-9.c | 273 + fs/nls/nls_koi8-r.c | 324 + fs/nls/nls_koi8-ru.c | 83 + fs/nls/nls_koi8-u.c | 331 + fs/nls/nls_utf8.c | 68 + fs/no-block.c | 23 + fs/notify/Kconfig | 6 + fs/notify/Makefile | 6 + fs/notify/dnotify/Kconfig | 11 + fs/notify/dnotify/Makefile | 1 + fs/notify/dnotify/dnotify.c | 411 + fs/notify/fanotify/Kconfig | 26 + fs/notify/fanotify/Makefile | 1 + fs/notify/fanotify/fanotify.c | 228 + fs/notify/fanotify/fanotify_user.c | 891 +++ fs/notify/fsnotify.c | 310 + fs/notify/fsnotify.h | 47 + fs/notify/group.c | 104 + fs/notify/inode_mark.c | 315 + fs/notify/inotify/Kconfig | 17 + fs/notify/inotify/Makefile | 1 + fs/notify/inotify/inotify.h | 21 + fs/notify/inotify/inotify_fsnotify.c | 222 + fs/notify/inotify/inotify_user.c | 867 ++ fs/notify/mark.c | 372 + fs/notify/notification.c | 464 ++ fs/notify/vfsmount_mark.c | 190 + fs/ntfs/Kconfig | 78 + fs/ntfs/Makefile | 14 + fs/ntfs/aops.c | 1638 ++++ fs/ntfs/aops.h | 107 + fs/ntfs/attrib.c | 2615 ++++++ fs/ntfs/attrib.h | 116 + fs/ntfs/bitmap.c | 193 + fs/ntfs/bitmap.h | 118 + fs/ntfs/collate.c | 124 + fs/ntfs/collate.h | 50 + 
fs/ntfs/compress.c | 969 +++ fs/ntfs/debug.c | 183 + fs/ntfs/debug.h | 63 + fs/ntfs/dir.c | 1575 ++++ fs/ntfs/dir.h | 48 + fs/ntfs/endian.h | 93 + fs/ntfs/file.c | 2227 ++++++ fs/ntfs/index.c | 454 ++ fs/ntfs/index.h | 148 + fs/ntfs/inode.c | 3112 ++++++++ fs/ntfs/inode.h | 321 + fs/ntfs/layout.h | 2435 ++++++ fs/ntfs/lcnalloc.c | 1014 +++ fs/ntfs/lcnalloc.h | 145 + fs/ntfs/logfile.c | 863 ++ fs/ntfs/logfile.h | 309 + fs/ntfs/malloc.h | 96 + fs/ntfs/mft.c | 2917 +++++++ fs/ntfs/mft.h | 124 + fs/ntfs/mst.c | 203 + fs/ntfs/namei.c | 405 + fs/ntfs/ntfs.h | 164 + fs/ntfs/quota.c | 117 + fs/ntfs/quota.h | 35 + fs/ntfs/runlist.c | 1907 +++++ fs/ntfs/runlist.h | 102 + fs/ntfs/super.c | 3206 ++++++++ fs/ntfs/sysctl.c | 83 + fs/ntfs/sysctl.h | 41 + fs/ntfs/time.h | 100 + fs/ntfs/types.h | 69 + fs/ntfs/unistr.c | 398 + fs/ntfs/upcase.c | 87 + fs/ntfs/usnjrnl.c | 84 + fs/ntfs/usnjrnl.h | 205 + fs/ntfs/volume.h | 177 + fs/ocfs2/Kconfig | 76 + fs/ocfs2/Makefile | 54 + fs/ocfs2/acl.c | 535 ++ fs/ocfs2/acl.h | 36 + fs/ocfs2/alloc.c | 7352 +++++++++++++++++ fs/ocfs2/alloc.h | 324 + fs/ocfs2/aops.c | 2048 +++++ fs/ocfs2/aops.h | 94 + fs/ocfs2/blockcheck.c | 645 ++ fs/ocfs2/blockcheck.h | 107 + fs/ocfs2/buffer_head_io.c | 429 + fs/ocfs2/buffer_head_io.h | 77 + fs/ocfs2/cluster/Makefile | 4 + fs/ocfs2/cluster/heartbeat.c | 2628 +++++++ fs/ocfs2/cluster/heartbeat.h | 89 + fs/ocfs2/cluster/masklog.c | 155 + fs/ocfs2/cluster/masklog.h | 219 + fs/ocfs2/cluster/netdebug.c | 543 ++ fs/ocfs2/cluster/nodemanager.c | 989 +++ fs/ocfs2/cluster/nodemanager.h | 88 + fs/ocfs2/cluster/ocfs2_heartbeat.h | 38 + fs/ocfs2/cluster/ocfs2_nodemanager.h | 45 + fs/ocfs2/cluster/quorum.c | 331 + fs/ocfs2/cluster/quorum.h | 36 + fs/ocfs2/cluster/sys.c | 82 + fs/ocfs2/cluster/sys.h | 33 + fs/ocfs2/cluster/tcp.c | 2143 +++++ fs/ocfs2/cluster/tcp.h | 152 + fs/ocfs2/cluster/tcp_internal.h | 242 + fs/ocfs2/cluster/ver.c | 42 + fs/ocfs2/cluster/ver.h | 31 + fs/ocfs2/dcache.c | 537 ++ fs/ocfs2/dcache.h | 69 + fs/ocfs2/dir.c | 4555 +++++++++++ fs/ocfs2/dir.h | 117 + fs/ocfs2/dlm/Makefile | 7 + fs/ocfs2/dlm/dlmapi.h | 220 + fs/ocfs2/dlm/dlmast.c | 502 ++ fs/ocfs2/dlm/dlmcommon.h | 1181 +++ fs/ocfs2/dlm/dlmconvert.c | 548 ++ fs/ocfs2/dlm/dlmconvert.h | 35 + fs/ocfs2/dlm/dlmdebug.c | 1017 +++ fs/ocfs2/dlm/dlmdebug.h | 81 + fs/ocfs2/dlm/dlmdomain.c | 2397 ++++++ fs/ocfs2/dlm/dlmdomain.h | 36 + fs/ocfs2/dlm/dlmlock.c | 767 ++ fs/ocfs2/dlm/dlmmaster.c | 3413 ++++++++ fs/ocfs2/dlm/dlmrecovery.c | 2886 +++++++ fs/ocfs2/dlm/dlmthread.c | 762 ++ fs/ocfs2/dlm/dlmunlock.c | 692 ++ fs/ocfs2/dlm/dlmver.c | 42 + fs/ocfs2/dlm/dlmver.h | 31 + fs/ocfs2/dlmfs/Makefile | 5 + fs/ocfs2/dlmfs/dlmfs.c | 725 ++ fs/ocfs2/dlmfs/dlmfsver.c | 42 + fs/ocfs2/dlmfs/dlmfsver.h | 31 + fs/ocfs2/dlmfs/userdlm.c | 688 ++ fs/ocfs2/dlmfs/userdlm.h | 113 + fs/ocfs2/dlmglue.c | 4033 ++++++++++ fs/ocfs2/dlmglue.h | 172 + fs/ocfs2/export.c | 276 + fs/ocfs2/export.h | 33 + fs/ocfs2/extent_map.c | 902 +++ fs/ocfs2/extent_map.h | 90 + fs/ocfs2/file.c | 2688 +++++++ fs/ocfs2/file.h | 76 + fs/ocfs2/heartbeat.c | 134 + fs/ocfs2/heartbeat.h | 45 + fs/ocfs2/inode.c | 1447 ++++ fs/ocfs2/inode.h | 180 + fs/ocfs2/ioctl.c | 1031 +++ fs/ocfs2/ioctl.h | 16 + fs/ocfs2/journal.c | 2192 ++++++ fs/ocfs2/journal.h | 619 ++ fs/ocfs2/localalloc.c | 1307 +++ fs/ocfs2/localalloc.h | 62 + fs/ocfs2/locks.c | 139 + fs/ocfs2/locks.h | 32 + fs/ocfs2/mmap.c | 197 + fs/ocfs2/mmap.h | 6 + fs/ocfs2/move_extents.c | 1152 +++ fs/ocfs2/move_extents.h | 22 + fs/ocfs2/namei.c | 2501 ++++++ fs/ocfs2/namei.h | 45 + 
fs/ocfs2/ocfs1_fs_compat.h | 109 + fs/ocfs2/ocfs2.h | 853 ++ fs/ocfs2/ocfs2_fs.h | 1640 ++++ fs/ocfs2/ocfs2_ioctl.h | 242 + fs/ocfs2/ocfs2_lockid.h | 128 + fs/ocfs2/ocfs2_lockingver.h | 32 + fs/ocfs2/ocfs2_trace.h | 2764 +++++++ fs/ocfs2/quota.h | 117 + fs/ocfs2/quota_global.c | 922 +++ fs/ocfs2/quota_local.c | 1316 ++++ fs/ocfs2/refcounttree.c | 4514 +++++++++++ fs/ocfs2/refcounttree.h | 118 + fs/ocfs2/reservations.c | 839 ++ fs/ocfs2/reservations.h | 159 + fs/ocfs2/resize.c | 589 ++ fs/ocfs2/resize.h | 32 + fs/ocfs2/slot_map.c | 538 ++ fs/ocfs2/slot_map.h | 44 + fs/ocfs2/stack_o2cb.c | 393 + fs/ocfs2/stack_user.c | 908 +++ fs/ocfs2/stackglue.c | 726 ++ fs/ocfs2/stackglue.h | 292 + fs/ocfs2/suballoc.c | 2908 +++++++ fs/ocfs2/suballoc.h | 221 + fs/ocfs2/super.c | 2651 +++++++ fs/ocfs2/super.h | 55 + fs/ocfs2/symlink.c | 173 + fs/ocfs2/symlink.h | 42 + fs/ocfs2/sysfile.c | 180 + fs/ocfs2/sysfile.h | 33 + fs/ocfs2/uptodate.c | 638 ++ fs/ocfs2/uptodate.h | 84 + fs/ocfs2/ver.c | 43 + fs/ocfs2/ver.h | 31 + fs/ocfs2/xattr.c | 7388 +++++++++++++++++ fs/ocfs2/xattr.h | 100 + fs/omfs/Kconfig | 13 + fs/omfs/Makefile | 4 + fs/omfs/bitmap.c | 193 + fs/omfs/dir.c | 475 ++ fs/omfs/file.c | 378 + fs/omfs/inode.c | 585 ++ fs/omfs/omfs.h | 68 + fs/omfs/omfs_fs.h | 81 + fs/open.c | 1161 +++ fs/openpromfs/Makefile | 7 + fs/openpromfs/inode.c | 473 ++ fs/partitions/Kconfig | 251 + fs/partitions/Makefile | 20 + fs/partitions/acorn.c | 556 ++ fs/partitions/acorn.h | 14 + fs/partitions/amiga.c | 139 + fs/partitions/amiga.h | 6 + fs/partitions/atari.c | 149 + fs/partitions/atari.h | 34 + fs/partitions/check.c | 730 ++ fs/partitions/check.h | 49 + fs/partitions/efi.c | 675 ++ fs/partitions/efi.h | 134 + fs/partitions/ibm.c | 275 + fs/partitions/ibm.h | 1 + fs/partitions/karma.c | 57 + fs/partitions/karma.h | 8 + fs/partitions/ldm.c | 1568 ++++ fs/partitions/ldm.h | 215 + fs/partitions/mac.c | 134 + fs/partitions/mac.h | 44 + fs/partitions/msdos.c | 552 ++ fs/partitions/msdos.h | 8 + fs/partitions/osf.c | 86 + fs/partitions/osf.h | 7 + fs/partitions/sgi.c | 82 + fs/partitions/sgi.h | 8 + fs/partitions/sun.c | 122 + fs/partitions/sun.h | 8 + fs/partitions/sysv68.c | 95 + fs/partitions/sysv68.h | 1 + fs/partitions/ultrix.c | 48 + fs/partitions/ultrix.h | 5 + fs/pipe.c | 1326 ++++ fs/pnode.c | 371 + fs/pnode.h | 39 + fs/posix_acl.c | 388 + fs/proc/Kconfig | 69 + fs/proc/Makefile | 30 + fs/proc/array.c | 546 ++ fs/proc/base.c | 3429 ++++++++ fs/proc/cmdline.c | 29 + fs/proc/consoles.c | 114 + fs/proc/cpuinfo.c | 24 + fs/proc/devices.c | 70 + fs/proc/generic.c | 853 ++ fs/proc/inode.c | 497 ++ fs/proc/internal.h | 147 + fs/proc/interrupts.c | 53 + fs/proc/kcore.c | 635 ++ fs/proc/kmsg.c | 64 + fs/proc/loadavg.c | 45 + fs/proc/meminfo.c | 194 + fs/proc/mmu.c | 60 + fs/proc/namespaces.c | 201 + fs/proc/nommu.c | 136 + fs/proc/page.c | 212 + fs/proc/proc_devtree.c | 241 + fs/proc/proc_net.c | 241 + fs/proc/proc_sysctl.c | 436 ++ fs/proc/proc_tty.c | 189 + fs/proc/root.c | 213 + fs/proc/softirqs.c | 44 + fs/proc/stat.c | 170 + fs/proc/task_mmu.c | 1125 +++ fs/proc/task_nommu.c | 271 + fs/proc/uptime.c | 53 + fs/proc/version.c | 34 + fs/proc/vmcore.c | 701 ++ fs/pstore/Kconfig | 13 + fs/pstore/Makefile | 7 + fs/pstore/inode.c | 311 + fs/pstore/internal.h | 6 + fs/pstore/platform.c | 207 + fs/qnx4/Kconfig | 14 + fs/qnx4/Makefile | 7 + fs/qnx4/README | 9 + fs/qnx4/bitmap.c | 58 + fs/qnx4/dir.c | 85 + fs/qnx4/inode.c | 504 ++ fs/qnx4/namei.c | 132 + fs/qnx4/qnx4.h | 49 + fs/quota/Kconfig | 74 + fs/quota/Makefile | 7 + 
fs/quota/compat.c | 118 + fs/quota/dquot.c | 2661 +++++++ fs/quota/netlink.c | 96 + fs/quota/quota.c | 375 + fs/quota/quota_tree.c | 659 ++ fs/quota/quota_tree.h | 25 + fs/quota/quota_v1.c | 233 + fs/quota/quota_v2.c | 336 + fs/quota/quotaio_v1.h | 33 + fs/quota/quotaio_v2.h | 73 + fs/ramfs/Makefile | 9 + fs/ramfs/file-mmu.c | 55 + fs/ramfs/file-nommu.c | 266 + fs/ramfs/inode.c | 315 + fs/ramfs/internal.h | 14 + fs/read_write.c | 948 +++ fs/read_write.h | 14 + fs/readdir.c | 311 + fs/reiserfs/Kconfig | 88 + fs/reiserfs/Makefile | 38 + fs/reiserfs/README | 161 + fs/reiserfs/bitmap.c | 1300 +++ fs/reiserfs/dir.c | 310 + fs/reiserfs/do_balan.c | 2074 +++++ fs/reiserfs/file.c | 315 + fs/reiserfs/fix_node.c | 2593 ++++++ fs/reiserfs/hashes.c | 182 + fs/reiserfs/ibalance.c | 1089 +++ fs/reiserfs/inode.c | 3225 ++++++++ fs/reiserfs/ioctl.c | 226 + fs/reiserfs/item_ops.c | 756 ++ fs/reiserfs/journal.c | 4319 ++++++++++ fs/reiserfs/lbalance.c | 1311 ++++ fs/reiserfs/lock.c | 97 + fs/reiserfs/namei.c | 1562 ++++ fs/reiserfs/objectid.c | 203 + fs/reiserfs/prints.c | 768 ++ fs/reiserfs/procfs.c | 576 ++ fs/reiserfs/resize.c | 212 + fs/reiserfs/stree.c | 2120 +++++ fs/reiserfs/super.c | 2271 ++++++ fs/reiserfs/tail_conversion.c | 280 + fs/reiserfs/xattr.c | 1051 +++ fs/reiserfs/xattr_acl.c | 531 ++ fs/reiserfs/xattr_security.c | 120 + fs/reiserfs/xattr_trusted.c | 56 + fs/reiserfs/xattr_user.c | 52 + fs/romfs/Kconfig | 62 + fs/romfs/Makefile | 12 + fs/romfs/internal.h | 47 + fs/romfs/mmap-nommu.c | 79 + fs/romfs/storage.c | 293 + fs/romfs/super.c | 662 ++ fs/select.c | 999 +++ fs/seq_file.c | 821 ++ fs/signalfd.c | 295 + fs/splice.c | 2047 +++++ fs/squashfs/Kconfig | 89 + fs/squashfs/Makefile | 10 + fs/squashfs/block.c | 210 + fs/squashfs/cache.c | 428 + fs/squashfs/decompressor.c | 110 + fs/squashfs/decompressor.h | 59 + fs/squashfs/dir.c | 244 + fs/squashfs/export.c | 163 + fs/squashfs/file.c | 501 ++ fs/squashfs/fragment.c | 99 + fs/squashfs/id.c | 102 + fs/squashfs/inode.c | 425 + fs/squashfs/lzo_wrapper.c | 135 + fs/squashfs/namei.c | 257 + fs/squashfs/squashfs.h | 102 + fs/squashfs/squashfs_fs.h | 459 ++ fs/squashfs/squashfs_fs_i.h | 54 + fs/squashfs/squashfs_fs_sb.h | 79 + fs/squashfs/super.c | 497 ++ fs/squashfs/symlink.c | 127 + fs/squashfs/xattr.c | 324 + fs/squashfs/xattr.h | 47 + fs/squashfs/xattr_id.c | 95 + fs/squashfs/xz_wrapper.c | 180 + fs/squashfs/zlib_wrapper.c | 149 + fs/stack.c | 79 + fs/stat.c | 473 ++ fs/statfs.c | 229 + fs/super.c | 1061 +++ fs/sync.c | 402 + fs/sysfs/Kconfig | 23 + fs/sysfs/Makefile | 6 + fs/sysfs/bin.c | 506 ++ fs/sysfs/dir.c | 964 +++ fs/sysfs/file.c | 747 ++ fs/sysfs/group.c | 207 + fs/sysfs/inode.c | 367 + fs/sysfs/mount.c | 201 + fs/sysfs/symlink.c | 307 + fs/sysfs/sysfs.h | 231 + fs/sysv/Kconfig | 36 + fs/sysv/Makefile | 8 + fs/sysv/balloc.c | 239 + fs/sysv/dir.c | 377 + fs/sysv/file.c | 58 + fs/sysv/ialloc.c | 234 + fs/sysv/inode.c | 381 + fs/sysv/itree.c | 494 ++ fs/sysv/namei.c | 301 + fs/sysv/super.c | 590 ++ fs/sysv/symlink.c | 20 + fs/sysv/sysv.h | 248 + fs/timerfd.c | 369 + fs/ubifs/Kconfig | 60 + fs/ubifs/Makefile | 9 + fs/ubifs/budget.c | 732 ++ fs/ubifs/commit.c | 738 ++ fs/ubifs/compress.c | 251 + fs/ubifs/debug.c | 2933 +++++++ fs/ubifs/debug.h | 445 ++ fs/ubifs/dir.c | 1206 +++ fs/ubifs/file.c | 1593 ++++ fs/ubifs/find.c | 977 +++ fs/ubifs/gc.c | 985 +++ fs/ubifs/io.c | 1054 +++ fs/ubifs/ioctl.c | 205 + fs/ubifs/journal.c | 1466 ++++ fs/ubifs/key.h | 548 ++ fs/ubifs/log.c | 773 ++ fs/ubifs/lprops.c | 1319 ++++ fs/ubifs/lpt.c | 2277 ++++++ 
fs/ubifs/lpt_commit.c | 2050 +++++ fs/ubifs/master.c | 396 + fs/ubifs/misc.h | 360 + fs/ubifs/orphan.c | 972 +++ fs/ubifs/recovery.c | 1567 ++++ fs/ubifs/replay.c | 1080 +++ fs/ubifs/sb.c | 803 ++ fs/ubifs/scan.c | 380 + fs/ubifs/shrinker.c | 325 + fs/ubifs/super.c | 2321 ++++++ fs/ubifs/tnc.c | 3349 ++++++++ fs/ubifs/tnc_commit.c | 1103 +++ fs/ubifs/tnc_misc.c | 494 ++ fs/ubifs/ubifs-media.h | 784 ++ fs/ubifs/ubifs.h | 1777 +++++ fs/ubifs/xattr.c | 572 ++ fs/udf/Kconfig | 18 + fs/udf/Makefile | 9 + fs/udf/balloc.c | 815 ++ fs/udf/dir.c | 210 + fs/udf/directory.c | 241 + fs/udf/ecma_167.h | 796 ++ fs/udf/file.c | 249 + fs/udf/ialloc.c | 132 + fs/udf/inode.c | 2166 +++++ fs/udf/lowlevel.c | 67 + fs/udf/misc.c | 296 + fs/udf/namei.c | 1339 ++++ fs/udf/osta_udf.h | 279 + fs/udf/partition.c | 334 + fs/udf/super.c | 2324 ++++++ fs/udf/symlink.c | 119 + fs/udf/truncate.c | 289 + fs/udf/udf_i.h | 45 + fs/udf/udf_sb.h | 182 + fs/udf/udfdecl.h | 241 + fs/udf/udfend.h | 77 + fs/udf/udftime.c | 171 + fs/udf/unicode.c | 493 ++ fs/ufs/Kconfig | 43 + fs/ufs/Makefile | 8 + fs/ufs/balloc.c | 953 +++ fs/ufs/cylinder.c | 201 + fs/ufs/dir.c | 666 ++ fs/ufs/file.c | 46 + fs/ufs/ialloc.c | 354 + fs/ufs/inode.c | 906 +++ fs/ufs/namei.c | 360 + fs/ufs/super.c | 1511 ++++ fs/ufs/swab.h | 115 + fs/ufs/symlink.c | 53 + fs/ufs/truncate.c | 523 ++ fs/ufs/ufs.h | 161 + fs/ufs/ufs_fs.h | 959 +++ fs/ufs/util.c | 283 + fs/ufs/util.h | 592 ++ fs/utimes.c | 224 + fs/xattr.c | 698 ++ fs/xattr_acl.c | 98 + fs/xfs/Kconfig | 82 + fs/xfs/Makefile | 111 + fs/xfs/linux-2.6/kmem.c | 132 + fs/xfs/linux-2.6/kmem.h | 124 + fs/xfs/linux-2.6/mrlock.h | 90 + fs/xfs/linux-2.6/time.h | 36 + fs/xfs/linux-2.6/xfs_acl.c | 464 ++ fs/xfs/linux-2.6/xfs_aops.c | 1506 ++++ fs/xfs/linux-2.6/xfs_aops.h | 68 + fs/xfs/linux-2.6/xfs_buf.c | 1899 +++++ fs/xfs/linux-2.6/xfs_buf.h | 351 + fs/xfs/linux-2.6/xfs_discard.c | 222 + fs/xfs/linux-2.6/xfs_discard.h | 10 + fs/xfs/linux-2.6/xfs_export.c | 250 + fs/xfs/linux-2.6/xfs_export.h | 72 + fs/xfs/linux-2.6/xfs_file.c | 1114 +++ fs/xfs/linux-2.6/xfs_fs_subr.c | 96 + fs/xfs/linux-2.6/xfs_globals.c | 43 + fs/xfs/linux-2.6/xfs_ioctl.c | 1556 ++++ fs/xfs/linux-2.6/xfs_ioctl.h | 85 + fs/xfs/linux-2.6/xfs_ioctl32.c | 672 ++ fs/xfs/linux-2.6/xfs_ioctl32.h | 237 + fs/xfs/linux-2.6/xfs_iops.c | 778 ++ fs/xfs/linux-2.6/xfs_iops.h | 30 + fs/xfs/linux-2.6/xfs_linux.h | 307 + fs/xfs/linux-2.6/xfs_message.c | 108 + fs/xfs/linux-2.6/xfs_message.h | 39 + fs/xfs/linux-2.6/xfs_quotaops.c | 139 + fs/xfs/linux-2.6/xfs_stats.c | 122 + fs/xfs/linux-2.6/xfs_stats.h | 223 + fs/xfs/linux-2.6/xfs_super.c | 1720 ++++ fs/xfs/linux-2.6/xfs_super.h | 87 + fs/xfs/linux-2.6/xfs_sync.c | 1132 +++ fs/xfs/linux-2.6/xfs_sync.h | 62 + fs/xfs/linux-2.6/xfs_sysctl.c | 252 + fs/xfs/linux-2.6/xfs_sysctl.h | 102 + fs/xfs/linux-2.6/xfs_trace.c | 56 + fs/xfs/linux-2.6/xfs_trace.h | 1776 +++++ fs/xfs/linux-2.6/xfs_vnode.h | 64 + fs/xfs/linux-2.6/xfs_xattr.c | 241 + fs/xfs/quota/xfs_dquot.c | 1496 ++++ fs/xfs/quota/xfs_dquot.h | 143 + fs/xfs/quota/xfs_dquot_item.c | 533 ++ fs/xfs/quota/xfs_dquot_item.h | 48 + fs/xfs/quota/xfs_qm.c | 2462 ++++++ fs/xfs/quota/xfs_qm.h | 172 + fs/xfs/quota/xfs_qm_bhv.c | 176 + fs/xfs/quota/xfs_qm_stats.c | 105 + fs/xfs/quota/xfs_qm_stats.h | 53 + fs/xfs/quota/xfs_qm_syscalls.c | 1259 +++ fs/xfs/quota/xfs_quota_priv.h | 53 + fs/xfs/quota/xfs_trans_dquot.c | 895 +++ fs/xfs/support/uuid.c | 63 + fs/xfs/support/uuid.h | 29 + fs/xfs/xfs.h | 29 + fs/xfs/xfs_acl.h | 62 + fs/xfs/xfs_ag.h | 289 + fs/xfs/xfs_alloc.c | 3047 +++++++ 
fs/xfs/xfs_alloc.h | 253 + fs/xfs/xfs_alloc_btree.c | 458 ++ fs/xfs/xfs_alloc_btree.h | 110 + fs/xfs/xfs_arch.h | 136 + fs/xfs/xfs_attr.c | 2230 ++++++ fs/xfs/xfs_attr.h | 146 + fs/xfs/xfs_attr_leaf.c | 2979 +++++++ fs/xfs/xfs_attr_leaf.h | 265 + fs/xfs/xfs_attr_sf.h | 70 + fs/xfs/xfs_bit.c | 120 + fs/xfs/xfs_bit.h | 84 + fs/xfs/xfs_bmap.c | 6139 +++++++++++++++ fs/xfs/xfs_bmap.h | 402 + fs/xfs/xfs_bmap_btree.c | 919 +++ fs/xfs/xfs_bmap_btree.h | 240 + fs/xfs/xfs_btree.c | 3679 +++++++++ fs/xfs/xfs_btree.h | 455 ++ fs/xfs/xfs_btree_trace.c | 249 + fs/xfs/xfs_btree_trace.h | 99 + fs/xfs/xfs_buf_item.c | 1064 +++ fs/xfs/xfs_buf_item.h | 130 + fs/xfs/xfs_da_btree.c | 2463 ++++++ fs/xfs/xfs_da_btree.h | 281 + fs/xfs/xfs_dfrag.c | 457 ++ fs/xfs/xfs_dfrag.h | 53 + fs/xfs/xfs_dinode.h | 216 + fs/xfs/xfs_dir2.c | 764 ++ fs/xfs/xfs_dir2.h | 106 + fs/xfs/xfs_dir2_block.c | 1233 +++ fs/xfs/xfs_dir2_block.h | 92 + fs/xfs/xfs_dir2_data.c | 833 ++ fs/xfs/xfs_dir2_data.h | 184 + fs/xfs/xfs_dir2_leaf.c | 1881 +++++ fs/xfs/xfs_dir2_leaf.h | 253 + fs/xfs/xfs_dir2_node.c | 2076 +++++ fs/xfs/xfs_dir2_node.h | 100 + fs/xfs/xfs_dir2_sf.c | 1267 +++ fs/xfs/xfs_dir2_sf.h | 171 + fs/xfs/xfs_error.c | 185 + fs/xfs/xfs_error.h | 161 + fs/xfs/xfs_extfree_item.c | 520 ++ fs/xfs/xfs_extfree_item.h | 161 + fs/xfs/xfs_filestream.c | 824 ++ fs/xfs/xfs_filestream.h | 74 + fs/xfs/xfs_fs.h | 501 ++ fs/xfs/xfs_fsops.c | 671 ++ fs/xfs/xfs_fsops.h | 30 + fs/xfs/xfs_ialloc.c | 1563 ++++ fs/xfs/xfs_ialloc.h | 164 + fs/xfs/xfs_ialloc_btree.c | 346 + fs/xfs/xfs_ialloc_btree.h | 112 + fs/xfs/xfs_iget.c | 727 ++ fs/xfs/xfs_inode.c | 4145 ++++++++++ fs/xfs/xfs_inode.h | 600 ++ fs/xfs/xfs_inode_item.c | 1060 +++ fs/xfs/xfs_inode_item.h | 166 + fs/xfs/xfs_inum.h | 80 + fs/xfs/xfs_iomap.c | 748 ++ fs/xfs/xfs_iomap.h | 32 + fs/xfs/xfs_itable.c | 736 ++ fs/xfs/xfs_itable.h | 106 + fs/xfs/xfs_log.c | 3767 +++++++++ fs/xfs/xfs_log.h | 198 + fs/xfs/xfs_log_cil.c | 804 ++ fs/xfs/xfs_log_priv.h | 668 ++ fs/xfs/xfs_log_recover.c | 3843 +++++++++ fs/xfs/xfs_log_recover.h | 66 + fs/xfs/xfs_mount.c | 2585 ++++++ fs/xfs/xfs_mount.h | 402 + fs/xfs/xfs_mru_cache.c | 576 ++ fs/xfs/xfs_mru_cache.h | 53 + fs/xfs/xfs_quota.h | 390 + fs/xfs/xfs_rename.c | 358 + fs/xfs/xfs_rtalloc.c | 2325 ++++++ fs/xfs/xfs_rtalloc.h | 166 + fs/xfs/xfs_rw.c | 175 + fs/xfs/xfs_rw.h | 49 + fs/xfs/xfs_sb.h | 543 ++ fs/xfs/xfs_trans.c | 2026 +++++ fs/xfs/xfs_trans.h | 506 ++ fs/xfs/xfs_trans_ail.c | 890 +++ fs/xfs/xfs_trans_buf.c | 879 +++ fs/xfs/xfs_trans_extfree.c | 134 + fs/xfs/xfs_trans_inode.c | 190 + fs/xfs/xfs_trans_priv.h | 143 + fs/xfs/xfs_trans_space.h | 86 + fs/xfs/xfs_types.h | 156 + fs/xfs/xfs_utils.c | 321 + fs/xfs/xfs_utils.h | 27 + fs/xfs/xfs_vnodeops.c | 2852 +++++++ fs/xfs/xfs_vnodeops.h | 63 + fs/yaffs2/Kconfig | 161 + fs/yaffs2/Makefile | 17 + fs/yaffs2/yaffs_allocator.c | 396 + fs/yaffs2/yaffs_allocator.h | 30 + fs/yaffs2/yaffs_attribs.c | 124 + fs/yaffs2/yaffs_attribs.h | 28 + fs/yaffs2/yaffs_bitmap.c | 98 + fs/yaffs2/yaffs_bitmap.h | 33 + fs/yaffs2/yaffs_checkptrw.c | 415 + fs/yaffs2/yaffs_checkptrw.h | 33 + fs/yaffs2/yaffs_ecc.c | 298 + fs/yaffs2/yaffs_ecc.h | 44 + fs/yaffs2/yaffs_getblockinfo.h | 35 + fs/yaffs2/yaffs_guts.c | 5164 ++++++++++++ fs/yaffs2/yaffs_guts.h | 915 +++ fs/yaffs2/yaffs_linux.h | 41 + fs/yaffs2/yaffs_mtdif.c | 54 + fs/yaffs2/yaffs_mtdif.h | 23 + fs/yaffs2/yaffs_mtdif1.c | 330 + fs/yaffs2/yaffs_mtdif1.h | 29 + fs/yaffs2/yaffs_mtdif2.c | 225 + fs/yaffs2/yaffs_mtdif2.h | 29 + fs/yaffs2/yaffs_nameval.c | 201 + fs/yaffs2/yaffs_nameval.h 
| 28 + fs/yaffs2/yaffs_nand.c | 127 + fs/yaffs2/yaffs_nand.h | 38 + fs/yaffs2/yaffs_packedtags1.c | 53 + fs/yaffs2/yaffs_packedtags1.h | 39 + fs/yaffs2/yaffs_packedtags2.c | 196 + fs/yaffs2/yaffs_packedtags2.h | 47 + fs/yaffs2/yaffs_tagscompat.c | 422 + fs/yaffs2/yaffs_tagscompat.h | 36 + fs/yaffs2/yaffs_tagsvalidity.c | 27 + fs/yaffs2/yaffs_tagsvalidity.h | 23 + fs/yaffs2/yaffs_trace.h | 57 + fs/yaffs2/yaffs_verify.c | 535 ++ fs/yaffs2/yaffs_verify.h | 43 + fs/yaffs2/yaffs_vfs.c | 2792 +++++++ fs/yaffs2/yaffs_yaffs1.c | 433 + fs/yaffs2/yaffs_yaffs1.h | 22 + fs/yaffs2/yaffs_yaffs2.c | 1598 ++++ fs/yaffs2/yaffs_yaffs2.h | 39 + fs/yaffs2/yportenv.h | 70 + 1680 files changed, 1011662 insertions(+) create mode 100644 fs/9p/Kconfig create mode 100644 fs/9p/Makefile create mode 100644 fs/9p/acl.c create mode 100644 fs/9p/acl.h create mode 100644 fs/9p/cache.c create mode 100644 fs/9p/cache.h create mode 100644 fs/9p/fid.c create mode 100644 fs/9p/fid.h create mode 100644 fs/9p/v9fs.c create mode 100644 fs/9p/v9fs.h create mode 100644 fs/9p/v9fs_vfs.h create mode 100644 fs/9p/vfs_addr.c create mode 100644 fs/9p/vfs_dentry.c create mode 100644 fs/9p/vfs_dir.c create mode 100644 fs/9p/vfs_file.c create mode 100644 fs/9p/vfs_inode.c create mode 100644 fs/9p/vfs_inode_dotl.c create mode 100644 fs/9p/vfs_super.c create mode 100644 fs/9p/xattr.c create mode 100644 fs/9p/xattr.h create mode 100644 fs/9p/xattr_user.c create mode 100644 fs/Kconfig create mode 100644 fs/Kconfig.binfmt create mode 100644 fs/Makefile create mode 100644 fs/adfs/Kconfig create mode 100644 fs/adfs/Makefile create mode 100644 fs/adfs/adfs.h create mode 100644 fs/adfs/dir.c create mode 100644 fs/adfs/dir_f.c create mode 100644 fs/adfs/dir_f.h create mode 100644 fs/adfs/dir_fplus.c create mode 100644 fs/adfs/dir_fplus.h create mode 100644 fs/adfs/file.c create mode 100644 fs/adfs/inode.c create mode 100644 fs/adfs/map.c create mode 100644 fs/adfs/super.c create mode 100644 fs/affs/Changes create mode 100644 fs/affs/Kconfig create mode 100644 fs/affs/Makefile create mode 100644 fs/affs/affs.h create mode 100644 fs/affs/amigaffs.c create mode 100644 fs/affs/bitmap.c create mode 100644 fs/affs/dir.c create mode 100644 fs/affs/file.c create mode 100644 fs/affs/inode.c create mode 100644 fs/affs/namei.c create mode 100644 fs/affs/super.c create mode 100644 fs/affs/symlink.c create mode 100644 fs/afs/Kconfig create mode 100644 fs/afs/Makefile create mode 100644 fs/afs/afs.h create mode 100644 fs/afs/afs_cm.h create mode 100644 fs/afs/afs_fs.h create mode 100644 fs/afs/afs_vl.h create mode 100644 fs/afs/cache.c create mode 100644 fs/afs/callback.c create mode 100644 fs/afs/cell.c create mode 100644 fs/afs/cmservice.c create mode 100644 fs/afs/dir.c create mode 100644 fs/afs/file.c create mode 100644 fs/afs/flock.c create mode 100644 fs/afs/fsclient.c create mode 100644 fs/afs/inode.c create mode 100644 fs/afs/internal.h create mode 100644 fs/afs/main.c create mode 100644 fs/afs/misc.c create mode 100644 fs/afs/mntpt.c create mode 100644 fs/afs/netdevices.c create mode 100644 fs/afs/proc.c create mode 100644 fs/afs/rxrpc.c create mode 100644 fs/afs/security.c create mode 100644 fs/afs/server.c create mode 100644 fs/afs/super.c create mode 100644 fs/afs/vlclient.c create mode 100644 fs/afs/vlocation.c create mode 100644 fs/afs/vnode.c create mode 100644 fs/afs/volume.c create mode 100644 fs/afs/write.c create mode 100644 fs/aio.c create mode 100644 fs/anon_inodes.c create mode 100644 fs/attr.c create mode 100644 fs/autofs4/Kconfig create 
mode 100644 fs/autofs4/Makefile create mode 100644 fs/autofs4/autofs_i.h create mode 100644 fs/autofs4/dev-ioctl.c create mode 100644 fs/autofs4/expire.c create mode 100644 fs/autofs4/init.c create mode 100644 fs/autofs4/inode.c create mode 100644 fs/autofs4/root.c create mode 100644 fs/autofs4/symlink.c create mode 100644 fs/autofs4/waitq.c create mode 100644 fs/bad_inode.c create mode 100644 fs/befs/ChangeLog create mode 100644 fs/befs/Kconfig create mode 100644 fs/befs/Makefile create mode 100644 fs/befs/TODO create mode 100644 fs/befs/befs.h create mode 100644 fs/befs/befs_fs_types.h create mode 100644 fs/befs/btree.c create mode 100644 fs/befs/btree.h create mode 100644 fs/befs/datastream.c create mode 100644 fs/befs/datastream.h create mode 100644 fs/befs/debug.c create mode 100644 fs/befs/endian.h create mode 100644 fs/befs/inode.c create mode 100644 fs/befs/inode.h create mode 100644 fs/befs/io.c create mode 100644 fs/befs/io.h create mode 100644 fs/befs/linuxvfs.c create mode 100644 fs/befs/super.c create mode 100644 fs/befs/super.h create mode 100644 fs/bfs/Kconfig create mode 100644 fs/bfs/Makefile create mode 100644 fs/bfs/bfs.h create mode 100644 fs/bfs/dir.c create mode 100644 fs/bfs/file.c create mode 100644 fs/bfs/inode.c create mode 100644 fs/binfmt_aout.c create mode 100644 fs/binfmt_elf.c create mode 100644 fs/binfmt_elf_fdpic.c create mode 100644 fs/binfmt_em86.c create mode 100644 fs/binfmt_flat.c create mode 100644 fs/binfmt_misc.c create mode 100644 fs/binfmt_script.c create mode 100644 fs/binfmt_som.c create mode 100644 fs/bio-integrity.c create mode 100644 fs/bio.c create mode 100644 fs/block_dev.c create mode 100644 fs/btrfs/Kconfig create mode 100644 fs/btrfs/Makefile create mode 100644 fs/btrfs/acl.c create mode 100644 fs/btrfs/async-thread.c create mode 100644 fs/btrfs/async-thread.h create mode 100644 fs/btrfs/btrfs_inode.h create mode 100644 fs/btrfs/compat.h create mode 100644 fs/btrfs/compression.c create mode 100644 fs/btrfs/compression.h create mode 100644 fs/btrfs/ctree.c create mode 100644 fs/btrfs/ctree.h create mode 100644 fs/btrfs/delayed-inode.c create mode 100644 fs/btrfs/delayed-inode.h create mode 100644 fs/btrfs/delayed-ref.c create mode 100644 fs/btrfs/delayed-ref.h create mode 100644 fs/btrfs/dir-item.c create mode 100644 fs/btrfs/disk-io.c create mode 100644 fs/btrfs/disk-io.h create mode 100644 fs/btrfs/export.c create mode 100644 fs/btrfs/export.h create mode 100644 fs/btrfs/extent-tree.c create mode 100644 fs/btrfs/extent_io.c create mode 100644 fs/btrfs/extent_io.h create mode 100644 fs/btrfs/extent_map.c create mode 100644 fs/btrfs/extent_map.h create mode 100644 fs/btrfs/file-item.c create mode 100644 fs/btrfs/file.c create mode 100644 fs/btrfs/free-space-cache.c create mode 100644 fs/btrfs/free-space-cache.h create mode 100644 fs/btrfs/hash.h create mode 100644 fs/btrfs/inode-item.c create mode 100644 fs/btrfs/inode-map.c create mode 100644 fs/btrfs/inode-map.h create mode 100644 fs/btrfs/inode.c create mode 100644 fs/btrfs/ioctl.c create mode 100644 fs/btrfs/ioctl.h create mode 100644 fs/btrfs/locking.c create mode 100644 fs/btrfs/locking.h create mode 100644 fs/btrfs/lzo.c create mode 100644 fs/btrfs/ordered-data.c create mode 100644 fs/btrfs/ordered-data.h create mode 100644 fs/btrfs/orphan.c create mode 100644 fs/btrfs/print-tree.c create mode 100644 fs/btrfs/print-tree.h create mode 100644 fs/btrfs/ref-cache.c create mode 100644 fs/btrfs/ref-cache.h create mode 100644 fs/btrfs/relocation.c create mode 100644 fs/btrfs/root-tree.c 
create mode 100644 fs/btrfs/scrub.c create mode 100644 fs/btrfs/struct-funcs.c create mode 100644 fs/btrfs/super.c create mode 100644 fs/btrfs/sysfs.c create mode 100644 fs/btrfs/transaction.c create mode 100644 fs/btrfs/transaction.h create mode 100644 fs/btrfs/tree-defrag.c create mode 100644 fs/btrfs/tree-log.c create mode 100644 fs/btrfs/tree-log.h create mode 100644 fs/btrfs/version.h create mode 100644 fs/btrfs/volumes.c create mode 100644 fs/btrfs/volumes.h create mode 100644 fs/btrfs/xattr.c create mode 100644 fs/btrfs/xattr.h create mode 100644 fs/btrfs/zlib.c create mode 100644 fs/buffer.c create mode 100644 fs/cachefiles/Kconfig create mode 100644 fs/cachefiles/Makefile create mode 100644 fs/cachefiles/bind.c create mode 100644 fs/cachefiles/daemon.c create mode 100644 fs/cachefiles/interface.c create mode 100644 fs/cachefiles/internal.h create mode 100644 fs/cachefiles/key.c create mode 100644 fs/cachefiles/main.c create mode 100644 fs/cachefiles/namei.c create mode 100644 fs/cachefiles/proc.c create mode 100644 fs/cachefiles/rdwr.c create mode 100644 fs/cachefiles/security.c create mode 100644 fs/cachefiles/xattr.c create mode 100644 fs/ceph/Kconfig create mode 100644 fs/ceph/Makefile create mode 100644 fs/ceph/addr.c create mode 100644 fs/ceph/caps.c create mode 100644 fs/ceph/ceph_frag.c create mode 100644 fs/ceph/debugfs.c create mode 100644 fs/ceph/dir.c create mode 100644 fs/ceph/export.c create mode 100644 fs/ceph/file.c create mode 100644 fs/ceph/inode.c create mode 100644 fs/ceph/ioctl.c create mode 100644 fs/ceph/ioctl.h create mode 100644 fs/ceph/locks.c create mode 100644 fs/ceph/mds_client.c create mode 100644 fs/ceph/mds_client.h create mode 100644 fs/ceph/mdsmap.c create mode 100644 fs/ceph/snap.c create mode 100644 fs/ceph/strings.c create mode 100644 fs/ceph/super.c create mode 100644 fs/ceph/super.h create mode 100644 fs/ceph/xattr.c create mode 100644 fs/char_dev.c create mode 100644 fs/cifs/AUTHORS create mode 100644 fs/cifs/CHANGES create mode 100644 fs/cifs/Kconfig create mode 100644 fs/cifs/Makefile create mode 100644 fs/cifs/README create mode 100644 fs/cifs/TODO create mode 100644 fs/cifs/asn1.c create mode 100644 fs/cifs/cache.c create mode 100644 fs/cifs/cifs_debug.c create mode 100644 fs/cifs/cifs_debug.h create mode 100644 fs/cifs/cifs_dfs_ref.c create mode 100644 fs/cifs/cifs_fs_sb.h create mode 100644 fs/cifs/cifs_spnego.c create mode 100644 fs/cifs/cifs_spnego.h create mode 100644 fs/cifs/cifs_unicode.c create mode 100644 fs/cifs/cifs_unicode.h create mode 100644 fs/cifs/cifs_uniupr.h create mode 100644 fs/cifs/cifsacl.c create mode 100644 fs/cifs/cifsacl.h create mode 100644 fs/cifs/cifsencrypt.c create mode 100644 fs/cifs/cifsfs.c create mode 100644 fs/cifs/cifsfs.h create mode 100644 fs/cifs/cifsglob.h create mode 100644 fs/cifs/cifspdu.h create mode 100644 fs/cifs/cifsproto.h create mode 100644 fs/cifs/cifssmb.c create mode 100644 fs/cifs/connect.c create mode 100644 fs/cifs/dir.c create mode 100644 fs/cifs/dns_resolve.c create mode 100644 fs/cifs/dns_resolve.h create mode 100644 fs/cifs/export.c create mode 100644 fs/cifs/file.c create mode 100644 fs/cifs/fscache.c create mode 100644 fs/cifs/fscache.h create mode 100644 fs/cifs/inode.c create mode 100644 fs/cifs/ioctl.c create mode 100644 fs/cifs/link.c create mode 100644 fs/cifs/misc.c create mode 100644 fs/cifs/netmisc.c create mode 100644 fs/cifs/nterr.c create mode 100644 fs/cifs/nterr.h create mode 100644 fs/cifs/ntlmssp.h create mode 100644 fs/cifs/readdir.c create mode 100644 
fs/cifs/rfc1002pdu.h create mode 100644 fs/cifs/sess.c create mode 100644 fs/cifs/smbencrypt.c create mode 100644 fs/cifs/smberr.h create mode 100644 fs/cifs/smbfsctl.h create mode 100644 fs/cifs/transport.c create mode 100644 fs/cifs/xattr.c create mode 100644 fs/coda/Kconfig create mode 100644 fs/coda/Makefile create mode 100644 fs/coda/cache.c create mode 100644 fs/coda/cnode.c create mode 100644 fs/coda/coda_cache.h create mode 100644 fs/coda/coda_fs_i.h create mode 100644 fs/coda/coda_int.h create mode 100644 fs/coda/coda_linux.c create mode 100644 fs/coda/coda_linux.h create mode 100644 fs/coda/dir.c create mode 100644 fs/coda/file.c create mode 100644 fs/coda/inode.c create mode 100644 fs/coda/pioctl.c create mode 100644 fs/coda/psdev.c create mode 100644 fs/coda/symlink.c create mode 100644 fs/coda/sysctl.c create mode 100644 fs/coda/upcall.c create mode 100644 fs/compat.c create mode 100644 fs/compat_binfmt_elf.c create mode 100644 fs/compat_ioctl.c create mode 100644 fs/configfs/Kconfig create mode 100644 fs/configfs/Makefile create mode 100644 fs/configfs/configfs_internal.h create mode 100644 fs/configfs/dir.c create mode 100644 fs/configfs/file.c create mode 100644 fs/configfs/inode.c create mode 100644 fs/configfs/item.c create mode 100644 fs/configfs/mount.c create mode 100644 fs/configfs/symlink.c create mode 100644 fs/cramfs/Kconfig create mode 100644 fs/cramfs/Makefile create mode 100644 fs/cramfs/README create mode 100644 fs/cramfs/inode.c create mode 100644 fs/cramfs/uncompress.c create mode 100644 fs/dcache.c create mode 100644 fs/dcookies.c create mode 100644 fs/debugfs/Makefile create mode 100644 fs/debugfs/file.c create mode 100644 fs/debugfs/inode.c create mode 100644 fs/devpts/Makefile create mode 100644 fs/devpts/inode.c create mode 100644 fs/direct-io.c create mode 100644 fs/dlm/Kconfig create mode 100644 fs/dlm/Makefile create mode 100644 fs/dlm/ast.c create mode 100644 fs/dlm/ast.h create mode 100644 fs/dlm/config.c create mode 100644 fs/dlm/config.h create mode 100644 fs/dlm/debug_fs.c create mode 100644 fs/dlm/dir.c create mode 100644 fs/dlm/dir.h create mode 100644 fs/dlm/dlm_internal.h create mode 100644 fs/dlm/lock.c create mode 100644 fs/dlm/lock.h create mode 100644 fs/dlm/lockspace.c create mode 100644 fs/dlm/lockspace.h create mode 100644 fs/dlm/lowcomms.c create mode 100644 fs/dlm/lowcomms.h create mode 100644 fs/dlm/lvb_table.h create mode 100644 fs/dlm/main.c create mode 100644 fs/dlm/member.c create mode 100644 fs/dlm/member.h create mode 100644 fs/dlm/memory.c create mode 100644 fs/dlm/memory.h create mode 100644 fs/dlm/midcomms.c create mode 100644 fs/dlm/midcomms.h create mode 100644 fs/dlm/netlink.c create mode 100644 fs/dlm/plock.c create mode 100644 fs/dlm/rcom.c create mode 100644 fs/dlm/rcom.h create mode 100644 fs/dlm/recover.c create mode 100644 fs/dlm/recover.h create mode 100644 fs/dlm/recoverd.c create mode 100644 fs/dlm/recoverd.h create mode 100644 fs/dlm/requestqueue.c create mode 100644 fs/dlm/requestqueue.h create mode 100644 fs/dlm/user.c create mode 100644 fs/dlm/user.h create mode 100644 fs/dlm/util.c create mode 100644 fs/dlm/util.h create mode 100644 fs/drop_caches.c create mode 100644 fs/ecryptfs/Kconfig create mode 100644 fs/ecryptfs/Makefile create mode 100644 fs/ecryptfs/crypto.c create mode 100644 fs/ecryptfs/debug.c create mode 100644 fs/ecryptfs/dentry.c create mode 100644 fs/ecryptfs/ecryptfs_kernel.h create mode 100644 fs/ecryptfs/file.c create mode 100644 fs/ecryptfs/inode.c create mode 100644 
fs/ecryptfs/keystore.c create mode 100644 fs/ecryptfs/kthread.c create mode 100644 fs/ecryptfs/main.c create mode 100644 fs/ecryptfs/messaging.c create mode 100644 fs/ecryptfs/miscdev.c create mode 100644 fs/ecryptfs/mmap.c create mode 100644 fs/ecryptfs/read_write.c create mode 100644 fs/ecryptfs/super.c create mode 100644 fs/efs/Kconfig create mode 100644 fs/efs/Makefile create mode 100644 fs/efs/dir.c create mode 100644 fs/efs/efs.h create mode 100644 fs/efs/file.c create mode 100644 fs/efs/inode.c create mode 100644 fs/efs/namei.c create mode 100644 fs/efs/super.c create mode 100644 fs/efs/symlink.c create mode 100644 fs/eventfd.c create mode 100644 fs/eventpoll.c create mode 100644 fs/exec.c create mode 100644 fs/exofs/BUGS create mode 100644 fs/exofs/Kbuild create mode 100644 fs/exofs/Kconfig create mode 100644 fs/exofs/common.h create mode 100644 fs/exofs/dir.c create mode 100644 fs/exofs/exofs.h create mode 100644 fs/exofs/file.c create mode 100644 fs/exofs/inode.c create mode 100644 fs/exofs/ios.c create mode 100644 fs/exofs/namei.c create mode 100644 fs/exofs/pnfs.h create mode 100644 fs/exofs/super.c create mode 100644 fs/exofs/symlink.c create mode 100644 fs/exportfs/Makefile create mode 100644 fs/exportfs/expfs.c create mode 100644 fs/ext2/Kconfig create mode 100644 fs/ext2/Makefile create mode 100644 fs/ext2/acl.c create mode 100644 fs/ext2/acl.h create mode 100644 fs/ext2/balloc.c create mode 100644 fs/ext2/dir.c create mode 100644 fs/ext2/ext2.h create mode 100644 fs/ext2/file.c create mode 100644 fs/ext2/ialloc.c create mode 100644 fs/ext2/inode.c create mode 100644 fs/ext2/ioctl.c create mode 100644 fs/ext2/namei.c create mode 100644 fs/ext2/super.c create mode 100644 fs/ext2/symlink.c create mode 100644 fs/ext2/xattr.c create mode 100644 fs/ext2/xattr.h create mode 100644 fs/ext2/xattr_security.c create mode 100644 fs/ext2/xattr_trusted.c create mode 100644 fs/ext2/xattr_user.c create mode 100644 fs/ext2/xip.c create mode 100644 fs/ext2/xip.h create mode 100644 fs/ext3/Kconfig create mode 100644 fs/ext3/Makefile create mode 100644 fs/ext3/acl.c create mode 100644 fs/ext3/acl.h create mode 100644 fs/ext3/balloc.c create mode 100644 fs/ext3/bitmap.c create mode 100644 fs/ext3/dir.c create mode 100644 fs/ext3/ext3_jbd.c create mode 100644 fs/ext3/file.c create mode 100644 fs/ext3/fsync.c create mode 100644 fs/ext3/hash.c create mode 100644 fs/ext3/ialloc.c create mode 100644 fs/ext3/inode.c create mode 100644 fs/ext3/ioctl.c create mode 100644 fs/ext3/namei.c create mode 100644 fs/ext3/namei.h create mode 100644 fs/ext3/resize.c create mode 100644 fs/ext3/super.c create mode 100644 fs/ext3/symlink.c create mode 100644 fs/ext3/xattr.c create mode 100644 fs/ext3/xattr.h create mode 100644 fs/ext3/xattr_security.c create mode 100644 fs/ext3/xattr_trusted.c create mode 100644 fs/ext3/xattr_user.c create mode 100644 fs/ext4/Kconfig create mode 100644 fs/ext4/Makefile create mode 100644 fs/ext4/acl.c create mode 100644 fs/ext4/acl.h create mode 100644 fs/ext4/balloc.c create mode 100644 fs/ext4/bitmap.c create mode 100644 fs/ext4/block_validity.c create mode 100644 fs/ext4/dir.c create mode 100644 fs/ext4/ext4.h create mode 100644 fs/ext4/ext4_extents.h create mode 100644 fs/ext4/ext4_jbd2.c create mode 100644 fs/ext4/ext4_jbd2.h create mode 100644 fs/ext4/extents.c create mode 100644 fs/ext4/file.c create mode 100644 fs/ext4/fsync.c create mode 100644 fs/ext4/hash.c create mode 100644 fs/ext4/ialloc.c create mode 100644 fs/ext4/inode.c create mode 100644 fs/ext4/ioctl.c create 
mode 100644 fs/ext4/mballoc.c create mode 100644 fs/ext4/mballoc.h create mode 100644 fs/ext4/migrate.c create mode 100644 fs/ext4/mmp.c create mode 100644 fs/ext4/move_extent.c create mode 100644 fs/ext4/namei.c create mode 100644 fs/ext4/page-io.c create mode 100644 fs/ext4/resize.c create mode 100644 fs/ext4/super.c create mode 100644 fs/ext4/symlink.c create mode 100644 fs/ext4/xattr.c create mode 100644 fs/ext4/xattr.h create mode 100644 fs/ext4/xattr_security.c create mode 100644 fs/ext4/xattr_trusted.c create mode 100644 fs/ext4/xattr_user.c create mode 100644 fs/fat/Kconfig create mode 100644 fs/fat/Makefile create mode 100644 fs/fat/cache.c create mode 100644 fs/fat/dir.c create mode 100644 fs/fat/fat.h create mode 100644 fs/fat/fatent.c create mode 100644 fs/fat/file.c create mode 100644 fs/fat/inode.c create mode 100644 fs/fat/misc.c create mode 100644 fs/fat/namei_msdos.c create mode 100644 fs/fat/namei_vfat.c create mode 100644 fs/fcntl.c create mode 100644 fs/fhandle.c create mode 100644 fs/fifo.c create mode 100644 fs/file.c create mode 100644 fs/file_table.c create mode 100644 fs/filesystems.c create mode 100644 fs/freevxfs/Kconfig create mode 100644 fs/freevxfs/Makefile create mode 100644 fs/freevxfs/vxfs.h create mode 100644 fs/freevxfs/vxfs_bmap.c create mode 100644 fs/freevxfs/vxfs_dir.h create mode 100644 fs/freevxfs/vxfs_extern.h create mode 100644 fs/freevxfs/vxfs_fshead.c create mode 100644 fs/freevxfs/vxfs_fshead.h create mode 100644 fs/freevxfs/vxfs_immed.c create mode 100644 fs/freevxfs/vxfs_inode.c create mode 100644 fs/freevxfs/vxfs_inode.h create mode 100644 fs/freevxfs/vxfs_lookup.c create mode 100644 fs/freevxfs/vxfs_olt.c create mode 100644 fs/freevxfs/vxfs_olt.h create mode 100644 fs/freevxfs/vxfs_subr.c create mode 100644 fs/freevxfs/vxfs_super.c create mode 100644 fs/fs-writeback.c create mode 100644 fs/fs_struct.c create mode 100644 fs/fscache/Kconfig create mode 100644 fs/fscache/Makefile create mode 100644 fs/fscache/cache.c create mode 100644 fs/fscache/cookie.c create mode 100644 fs/fscache/fsdef.c create mode 100644 fs/fscache/histogram.c create mode 100644 fs/fscache/internal.h create mode 100644 fs/fscache/main.c create mode 100644 fs/fscache/netfs.c create mode 100644 fs/fscache/object-list.c create mode 100644 fs/fscache/object.c create mode 100644 fs/fscache/operation.c create mode 100644 fs/fscache/page.c create mode 100644 fs/fscache/proc.c create mode 100644 fs/fscache/stats.c create mode 100644 fs/fuse/Kconfig create mode 100644 fs/fuse/Makefile create mode 100644 fs/fuse/control.c create mode 100644 fs/fuse/cuse.c create mode 100644 fs/fuse/dev.c create mode 100644 fs/fuse/dir.c create mode 100644 fs/fuse/file.c create mode 100644 fs/fuse/fuse_i.h create mode 100644 fs/fuse/inode.c create mode 100644 fs/generic_acl.c create mode 100644 fs/gfs2/Kconfig create mode 100644 fs/gfs2/Makefile create mode 100644 fs/gfs2/acl.c create mode 100644 fs/gfs2/acl.h create mode 100644 fs/gfs2/aops.c create mode 100644 fs/gfs2/bmap.c create mode 100644 fs/gfs2/bmap.h create mode 100644 fs/gfs2/dentry.c create mode 100644 fs/gfs2/dir.c create mode 100644 fs/gfs2/dir.h create mode 100644 fs/gfs2/export.c create mode 100644 fs/gfs2/file.c create mode 100644 fs/gfs2/gfs2.h create mode 100644 fs/gfs2/glock.c create mode 100644 fs/gfs2/glock.h create mode 100644 fs/gfs2/glops.c create mode 100644 fs/gfs2/glops.h create mode 100644 fs/gfs2/incore.h create mode 100644 fs/gfs2/inode.c create mode 100644 fs/gfs2/inode.h create mode 100644 fs/gfs2/lock_dlm.c create 
mode 100644 fs/gfs2/log.c create mode 100644 fs/gfs2/log.h create mode 100644 fs/gfs2/lops.c create mode 100644 fs/gfs2/lops.h create mode 100644 fs/gfs2/main.c create mode 100644 fs/gfs2/meta_io.c create mode 100644 fs/gfs2/meta_io.h create mode 100644 fs/gfs2/ops_fstype.c create mode 100644 fs/gfs2/quota.c create mode 100644 fs/gfs2/quota.h create mode 100644 fs/gfs2/recovery.c create mode 100644 fs/gfs2/recovery.h create mode 100644 fs/gfs2/rgrp.c create mode 100644 fs/gfs2/rgrp.h create mode 100644 fs/gfs2/super.c create mode 100644 fs/gfs2/super.h create mode 100644 fs/gfs2/sys.c create mode 100644 fs/gfs2/sys.h create mode 100644 fs/gfs2/trace_gfs2.h create mode 100644 fs/gfs2/trans.c create mode 100644 fs/gfs2/trans.h create mode 100644 fs/gfs2/util.c create mode 100644 fs/gfs2/util.h create mode 100644 fs/gfs2/xattr.c create mode 100644 fs/gfs2/xattr.h create mode 100644 fs/hfs/Kconfig create mode 100644 fs/hfs/Makefile create mode 100644 fs/hfs/attr.c create mode 100644 fs/hfs/bfind.c create mode 100644 fs/hfs/bitmap.c create mode 100644 fs/hfs/bnode.c create mode 100644 fs/hfs/brec.c create mode 100644 fs/hfs/btree.c create mode 100644 fs/hfs/btree.h create mode 100644 fs/hfs/catalog.c create mode 100644 fs/hfs/dir.c create mode 100644 fs/hfs/extent.c create mode 100644 fs/hfs/hfs.h create mode 100644 fs/hfs/hfs_fs.h create mode 100644 fs/hfs/inode.c create mode 100644 fs/hfs/mdb.c create mode 100644 fs/hfs/part_tbl.c create mode 100644 fs/hfs/string.c create mode 100644 fs/hfs/super.c create mode 100644 fs/hfs/sysdep.c create mode 100644 fs/hfs/trans.c create mode 100644 fs/hfsplus/Kconfig create mode 100644 fs/hfsplus/Makefile create mode 100644 fs/hfsplus/bfind.c create mode 100644 fs/hfsplus/bitmap.c create mode 100644 fs/hfsplus/bnode.c create mode 100644 fs/hfsplus/brec.c create mode 100644 fs/hfsplus/btree.c create mode 100644 fs/hfsplus/catalog.c create mode 100644 fs/hfsplus/dir.c create mode 100644 fs/hfsplus/extents.c create mode 100644 fs/hfsplus/hfsplus_fs.h create mode 100644 fs/hfsplus/hfsplus_raw.h create mode 100644 fs/hfsplus/inode.c create mode 100644 fs/hfsplus/ioctl.c create mode 100644 fs/hfsplus/options.c create mode 100644 fs/hfsplus/part_tbl.c create mode 100644 fs/hfsplus/super.c create mode 100644 fs/hfsplus/tables.c create mode 100644 fs/hfsplus/unicode.c create mode 100644 fs/hfsplus/wrapper.c create mode 100644 fs/hostfs/Makefile create mode 100644 fs/hostfs/hostfs.h create mode 100644 fs/hostfs/hostfs_kern.c create mode 100644 fs/hostfs/hostfs_user.c create mode 100644 fs/hpfs/Kconfig create mode 100644 fs/hpfs/Makefile create mode 100644 fs/hpfs/alloc.c create mode 100644 fs/hpfs/anode.c create mode 100644 fs/hpfs/buffer.c create mode 100644 fs/hpfs/dentry.c create mode 100644 fs/hpfs/dir.c create mode 100644 fs/hpfs/dnode.c create mode 100644 fs/hpfs/ea.c create mode 100644 fs/hpfs/file.c create mode 100644 fs/hpfs/hpfs.h create mode 100644 fs/hpfs/hpfs_fn.h create mode 100644 fs/hpfs/inode.c create mode 100644 fs/hpfs/map.c create mode 100644 fs/hpfs/name.c create mode 100644 fs/hpfs/namei.c create mode 100644 fs/hpfs/super.c create mode 100644 fs/hppfs/Makefile create mode 100644 fs/hppfs/hppfs.c create mode 100644 fs/hugetlbfs/Makefile create mode 100644 fs/hugetlbfs/inode.c create mode 100644 fs/inode.c create mode 100644 fs/internal.h create mode 100644 fs/ioctl.c create mode 100644 fs/ioprio.c create mode 100644 fs/isofs/Kconfig create mode 100644 fs/isofs/Makefile create mode 100644 fs/isofs/compress.c create mode 100644 fs/isofs/dir.c 
create mode 100644 fs/isofs/export.c create mode 100644 fs/isofs/inode.c create mode 100644 fs/isofs/isofs.h create mode 100644 fs/isofs/joliet.c create mode 100644 fs/isofs/namei.c create mode 100644 fs/isofs/rock.c create mode 100644 fs/isofs/rock.h create mode 100644 fs/isofs/util.c create mode 100644 fs/isofs/zisofs.h create mode 100644 fs/jbd/Kconfig create mode 100644 fs/jbd/Makefile create mode 100644 fs/jbd/checkpoint.c create mode 100644 fs/jbd/commit.c create mode 100644 fs/jbd/journal.c create mode 100644 fs/jbd/recovery.c create mode 100644 fs/jbd/revoke.c create mode 100644 fs/jbd/transaction.c create mode 100644 fs/jbd2/Kconfig create mode 100644 fs/jbd2/Makefile create mode 100644 fs/jbd2/checkpoint.c create mode 100644 fs/jbd2/commit.c create mode 100644 fs/jbd2/journal.c create mode 100644 fs/jbd2/recovery.c create mode 100644 fs/jbd2/revoke.c create mode 100644 fs/jbd2/transaction.c create mode 100644 fs/jffs2/Kconfig create mode 100644 fs/jffs2/LICENCE create mode 100644 fs/jffs2/Makefile create mode 100644 fs/jffs2/README.Locking create mode 100644 fs/jffs2/TODO create mode 100644 fs/jffs2/acl.c create mode 100644 fs/jffs2/acl.h create mode 100644 fs/jffs2/background.c create mode 100644 fs/jffs2/build.c create mode 100644 fs/jffs2/compr.c create mode 100644 fs/jffs2/compr.h create mode 100644 fs/jffs2/compr_lzo.c create mode 100644 fs/jffs2/compr_rtime.c create mode 100644 fs/jffs2/compr_rubin.c create mode 100644 fs/jffs2/compr_zlib.c create mode 100644 fs/jffs2/debug.c create mode 100644 fs/jffs2/debug.h create mode 100644 fs/jffs2/dir.c create mode 100644 fs/jffs2/erase.c create mode 100644 fs/jffs2/file.c create mode 100644 fs/jffs2/fs.c create mode 100644 fs/jffs2/gc.c create mode 100644 fs/jffs2/ioctl.c create mode 100644 fs/jffs2/jffs2_fs_i.h create mode 100644 fs/jffs2/jffs2_fs_sb.h create mode 100644 fs/jffs2/malloc.c create mode 100644 fs/jffs2/nodelist.c create mode 100644 fs/jffs2/nodelist.h create mode 100644 fs/jffs2/nodemgmt.c create mode 100644 fs/jffs2/os-linux.h create mode 100644 fs/jffs2/read.c create mode 100644 fs/jffs2/readinode.c create mode 100644 fs/jffs2/scan.c create mode 100644 fs/jffs2/security.c create mode 100644 fs/jffs2/summary.c create mode 100644 fs/jffs2/summary.h create mode 100644 fs/jffs2/super.c create mode 100644 fs/jffs2/symlink.c create mode 100644 fs/jffs2/wbuf.c create mode 100644 fs/jffs2/write.c create mode 100644 fs/jffs2/writev.c create mode 100644 fs/jffs2/xattr.c create mode 100644 fs/jffs2/xattr.h create mode 100644 fs/jffs2/xattr_trusted.c create mode 100644 fs/jffs2/xattr_user.c create mode 100644 fs/jfs/Kconfig create mode 100644 fs/jfs/Makefile create mode 100644 fs/jfs/acl.c create mode 100644 fs/jfs/endian24.h create mode 100644 fs/jfs/file.c create mode 100644 fs/jfs/inode.c create mode 100644 fs/jfs/ioctl.c create mode 100644 fs/jfs/jfs_acl.h create mode 100644 fs/jfs/jfs_btree.h create mode 100644 fs/jfs/jfs_debug.c create mode 100644 fs/jfs/jfs_debug.h create mode 100644 fs/jfs/jfs_dinode.h create mode 100644 fs/jfs/jfs_dmap.c create mode 100644 fs/jfs/jfs_dmap.h create mode 100644 fs/jfs/jfs_dtree.c create mode 100644 fs/jfs/jfs_dtree.h create mode 100644 fs/jfs/jfs_extent.c create mode 100644 fs/jfs/jfs_extent.h create mode 100644 fs/jfs/jfs_filsys.h create mode 100644 fs/jfs/jfs_imap.c create mode 100644 fs/jfs/jfs_imap.h create mode 100644 fs/jfs/jfs_incore.h create mode 100644 fs/jfs/jfs_inode.c create mode 100644 fs/jfs/jfs_inode.h create mode 100644 fs/jfs/jfs_lock.h create mode 100644 
fs/jfs/jfs_logmgr.c create mode 100644 fs/jfs/jfs_logmgr.h create mode 100644 fs/jfs/jfs_metapage.c create mode 100644 fs/jfs/jfs_metapage.h create mode 100644 fs/jfs/jfs_mount.c create mode 100644 fs/jfs/jfs_superblock.h create mode 100644 fs/jfs/jfs_txnmgr.c create mode 100644 fs/jfs/jfs_txnmgr.h create mode 100644 fs/jfs/jfs_types.h create mode 100644 fs/jfs/jfs_umount.c create mode 100644 fs/jfs/jfs_unicode.c create mode 100644 fs/jfs/jfs_unicode.h create mode 100644 fs/jfs/jfs_uniupr.c create mode 100644 fs/jfs/jfs_xattr.h create mode 100644 fs/jfs/jfs_xtree.c create mode 100644 fs/jfs/jfs_xtree.h create mode 100644 fs/jfs/namei.c create mode 100644 fs/jfs/resize.c create mode 100644 fs/jfs/super.c create mode 100644 fs/jfs/symlink.c create mode 100644 fs/jfs/xattr.c create mode 100644 fs/libfs.c create mode 100644 fs/lockd/Makefile create mode 100644 fs/lockd/clnt4xdr.c create mode 100644 fs/lockd/clntlock.c create mode 100644 fs/lockd/clntproc.c create mode 100644 fs/lockd/clntxdr.c create mode 100644 fs/lockd/grace.c create mode 100644 fs/lockd/host.c create mode 100644 fs/lockd/mon.c create mode 100644 fs/lockd/svc.c create mode 100644 fs/lockd/svc4proc.c create mode 100644 fs/lockd/svclock.c create mode 100644 fs/lockd/svcproc.c create mode 100644 fs/lockd/svcshare.c create mode 100644 fs/lockd/svcsubs.c create mode 100644 fs/lockd/xdr.c create mode 100644 fs/lockd/xdr4.c create mode 100644 fs/locks.c create mode 100644 fs/logfs/Kconfig create mode 100644 fs/logfs/Makefile create mode 100644 fs/logfs/compr.c create mode 100644 fs/logfs/dev_bdev.c create mode 100644 fs/logfs/dev_mtd.c create mode 100644 fs/logfs/dir.c create mode 100644 fs/logfs/file.c create mode 100644 fs/logfs/gc.c create mode 100644 fs/logfs/inode.c create mode 100644 fs/logfs/journal.c create mode 100644 fs/logfs/logfs.h create mode 100644 fs/logfs/logfs_abi.h create mode 100644 fs/logfs/readwrite.c create mode 100644 fs/logfs/segment.c create mode 100644 fs/logfs/super.c create mode 100644 fs/mbcache.c create mode 100644 fs/minix/Kconfig create mode 100644 fs/minix/Makefile create mode 100644 fs/minix/bitmap.c create mode 100644 fs/minix/dir.c create mode 100644 fs/minix/file.c create mode 100644 fs/minix/inode.c create mode 100644 fs/minix/itree_common.c create mode 100644 fs/minix/itree_v1.c create mode 100644 fs/minix/itree_v2.c create mode 100644 fs/minix/minix.h create mode 100644 fs/minix/namei.c create mode 100644 fs/mpage.c create mode 100644 fs/namei.c create mode 100644 fs/namespace.c create mode 100644 fs/ncpfs/Kconfig create mode 100644 fs/ncpfs/Makefile create mode 100644 fs/ncpfs/dir.c create mode 100644 fs/ncpfs/file.c create mode 100644 fs/ncpfs/getopt.c create mode 100644 fs/ncpfs/getopt.h create mode 100644 fs/ncpfs/inode.c create mode 100644 fs/ncpfs/ioctl.c create mode 100644 fs/ncpfs/mmap.c create mode 100644 fs/ncpfs/ncp_fs.h create mode 100644 fs/ncpfs/ncp_fs_i.h create mode 100644 fs/ncpfs/ncp_fs_sb.h create mode 100644 fs/ncpfs/ncplib_kernel.c create mode 100644 fs/ncpfs/ncplib_kernel.h create mode 100644 fs/ncpfs/ncpsign_kernel.c create mode 100644 fs/ncpfs/ncpsign_kernel.h create mode 100644 fs/ncpfs/sock.c create mode 100644 fs/ncpfs/symlink.c create mode 100644 fs/nfs/Kconfig create mode 100644 fs/nfs/Makefile create mode 100644 fs/nfs/cache_lib.c create mode 100644 fs/nfs/cache_lib.h create mode 100644 fs/nfs/callback.c create mode 100644 fs/nfs/callback.h create mode 100644 fs/nfs/callback_proc.c create mode 100644 fs/nfs/callback_xdr.c create mode 100644 fs/nfs/client.c create 
mode 100644 fs/nfs/delegation.c create mode 100644 fs/nfs/delegation.h create mode 100644 fs/nfs/dir.c create mode 100644 fs/nfs/direct.c create mode 100644 fs/nfs/dns_resolve.c create mode 100644 fs/nfs/dns_resolve.h create mode 100644 fs/nfs/file.c create mode 100644 fs/nfs/fscache-index.c create mode 100644 fs/nfs/fscache.c create mode 100644 fs/nfs/fscache.h create mode 100644 fs/nfs/getroot.c create mode 100644 fs/nfs/idmap.c create mode 100644 fs/nfs/inode.c create mode 100644 fs/nfs/internal.h create mode 100644 fs/nfs/iostat.h create mode 100644 fs/nfs/mount_clnt.c create mode 100644 fs/nfs/namespace.c create mode 100644 fs/nfs/nfs2xdr.c create mode 100644 fs/nfs/nfs3acl.c create mode 100644 fs/nfs/nfs3proc.c create mode 100644 fs/nfs/nfs3xdr.c create mode 100644 fs/nfs/nfs4_fs.h create mode 100644 fs/nfs/nfs4filelayout.c create mode 100644 fs/nfs/nfs4filelayout.h create mode 100644 fs/nfs/nfs4filelayoutdev.c create mode 100644 fs/nfs/nfs4namespace.c create mode 100644 fs/nfs/nfs4proc.c create mode 100644 fs/nfs/nfs4renewd.c create mode 100644 fs/nfs/nfs4state.c create mode 100644 fs/nfs/nfs4xdr.c create mode 100644 fs/nfs/nfsroot.c create mode 100644 fs/nfs/objlayout/Kbuild create mode 100644 fs/nfs/objlayout/objio_osd.c create mode 100644 fs/nfs/objlayout/objlayout.c create mode 100644 fs/nfs/objlayout/objlayout.h create mode 100644 fs/nfs/objlayout/pnfs_osd_xdr_cli.c create mode 100644 fs/nfs/pagelist.c create mode 100644 fs/nfs/pnfs.c create mode 100644 fs/nfs/pnfs.h create mode 100644 fs/nfs/pnfs_dev.c create mode 100644 fs/nfs/proc.c create mode 100644 fs/nfs/read.c create mode 100644 fs/nfs/super.c create mode 100644 fs/nfs/symlink.c create mode 100644 fs/nfs/sysctl.c create mode 100644 fs/nfs/unlink.c create mode 100644 fs/nfs/write.c create mode 100644 fs/nfs_common/Makefile create mode 100644 fs/nfs_common/nfsacl.c create mode 100644 fs/nfsctl.c create mode 100644 fs/nfsd/Kconfig create mode 100644 fs/nfsd/Makefile create mode 100644 fs/nfsd/acl.h create mode 100644 fs/nfsd/auth.c create mode 100644 fs/nfsd/auth.h create mode 100644 fs/nfsd/cache.h create mode 100644 fs/nfsd/export.c create mode 100644 fs/nfsd/idmap.h create mode 100644 fs/nfsd/lockd.c create mode 100644 fs/nfsd/nfs2acl.c create mode 100644 fs/nfsd/nfs3acl.c create mode 100644 fs/nfsd/nfs3proc.c create mode 100644 fs/nfsd/nfs3xdr.c create mode 100644 fs/nfsd/nfs4acl.c create mode 100644 fs/nfsd/nfs4callback.c create mode 100644 fs/nfsd/nfs4idmap.c create mode 100644 fs/nfsd/nfs4proc.c create mode 100644 fs/nfsd/nfs4recover.c create mode 100644 fs/nfsd/nfs4state.c create mode 100644 fs/nfsd/nfs4xdr.c create mode 100644 fs/nfsd/nfscache.c create mode 100644 fs/nfsd/nfsctl.c create mode 100644 fs/nfsd/nfsd.h create mode 100644 fs/nfsd/nfsfh.c create mode 100644 fs/nfsd/nfsfh.h create mode 100644 fs/nfsd/nfsproc.c create mode 100644 fs/nfsd/nfssvc.c create mode 100644 fs/nfsd/nfsxdr.c create mode 100644 fs/nfsd/state.h create mode 100644 fs/nfsd/stats.c create mode 100644 fs/nfsd/vfs.c create mode 100644 fs/nfsd/vfs.h create mode 100644 fs/nfsd/xdr.h create mode 100644 fs/nfsd/xdr3.h create mode 100644 fs/nfsd/xdr4.h create mode 100644 fs/nilfs2/Kconfig create mode 100644 fs/nilfs2/Makefile create mode 100644 fs/nilfs2/alloc.c create mode 100644 fs/nilfs2/alloc.h create mode 100644 fs/nilfs2/bmap.c create mode 100644 fs/nilfs2/bmap.h create mode 100644 fs/nilfs2/btnode.c create mode 100644 fs/nilfs2/btnode.h create mode 100644 fs/nilfs2/btree.c create mode 100644 fs/nilfs2/btree.h create mode 100644 
fs/nilfs2/cpfile.c create mode 100644 fs/nilfs2/cpfile.h create mode 100644 fs/nilfs2/dat.c create mode 100644 fs/nilfs2/dat.h create mode 100644 fs/nilfs2/dir.c create mode 100644 fs/nilfs2/direct.c create mode 100644 fs/nilfs2/direct.h create mode 100644 fs/nilfs2/export.h create mode 100644 fs/nilfs2/file.c create mode 100644 fs/nilfs2/gcinode.c create mode 100644 fs/nilfs2/ifile.c create mode 100644 fs/nilfs2/ifile.h create mode 100644 fs/nilfs2/inode.c create mode 100644 fs/nilfs2/ioctl.c create mode 100644 fs/nilfs2/mdt.c create mode 100644 fs/nilfs2/mdt.h create mode 100644 fs/nilfs2/namei.c create mode 100644 fs/nilfs2/nilfs.h create mode 100644 fs/nilfs2/page.c create mode 100644 fs/nilfs2/page.h create mode 100644 fs/nilfs2/recovery.c create mode 100644 fs/nilfs2/segbuf.c create mode 100644 fs/nilfs2/segbuf.h create mode 100644 fs/nilfs2/segment.c create mode 100644 fs/nilfs2/segment.h create mode 100644 fs/nilfs2/sufile.c create mode 100644 fs/nilfs2/sufile.h create mode 100644 fs/nilfs2/super.c create mode 100644 fs/nilfs2/the_nilfs.c create mode 100644 fs/nilfs2/the_nilfs.h create mode 100644 fs/nls/Kconfig create mode 100644 fs/nls/Makefile create mode 100644 fs/nls/nls_ascii.c create mode 100644 fs/nls/nls_base.c create mode 100644 fs/nls/nls_cp1250.c create mode 100644 fs/nls/nls_cp1251.c create mode 100644 fs/nls/nls_cp1255.c create mode 100644 fs/nls/nls_cp437.c create mode 100644 fs/nls/nls_cp737.c create mode 100644 fs/nls/nls_cp775.c create mode 100644 fs/nls/nls_cp850.c create mode 100644 fs/nls/nls_cp852.c create mode 100644 fs/nls/nls_cp855.c create mode 100644 fs/nls/nls_cp857.c create mode 100644 fs/nls/nls_cp860.c create mode 100644 fs/nls/nls_cp861.c create mode 100644 fs/nls/nls_cp862.c create mode 100644 fs/nls/nls_cp863.c create mode 100644 fs/nls/nls_cp864.c create mode 100644 fs/nls/nls_cp865.c create mode 100644 fs/nls/nls_cp866.c create mode 100644 fs/nls/nls_cp869.c create mode 100644 fs/nls/nls_cp874.c create mode 100644 fs/nls/nls_cp932.c create mode 100644 fs/nls/nls_cp936.c create mode 100644 fs/nls/nls_cp949.c create mode 100644 fs/nls/nls_cp950.c create mode 100644 fs/nls/nls_euc-jp.c create mode 100644 fs/nls/nls_iso8859-1.c create mode 100644 fs/nls/nls_iso8859-13.c create mode 100644 fs/nls/nls_iso8859-14.c create mode 100644 fs/nls/nls_iso8859-15.c create mode 100644 fs/nls/nls_iso8859-2.c create mode 100644 fs/nls/nls_iso8859-3.c create mode 100644 fs/nls/nls_iso8859-4.c create mode 100644 fs/nls/nls_iso8859-5.c create mode 100644 fs/nls/nls_iso8859-6.c create mode 100644 fs/nls/nls_iso8859-7.c create mode 100644 fs/nls/nls_iso8859-9.c create mode 100644 fs/nls/nls_koi8-r.c create mode 100644 fs/nls/nls_koi8-ru.c create mode 100644 fs/nls/nls_koi8-u.c create mode 100644 fs/nls/nls_utf8.c create mode 100644 fs/no-block.c create mode 100644 fs/notify/Kconfig create mode 100644 fs/notify/Makefile create mode 100644 fs/notify/dnotify/Kconfig create mode 100644 fs/notify/dnotify/Makefile create mode 100644 fs/notify/dnotify/dnotify.c create mode 100644 fs/notify/fanotify/Kconfig create mode 100644 fs/notify/fanotify/Makefile create mode 100644 fs/notify/fanotify/fanotify.c create mode 100644 fs/notify/fanotify/fanotify_user.c create mode 100644 fs/notify/fsnotify.c create mode 100644 fs/notify/fsnotify.h create mode 100644 fs/notify/group.c create mode 100644 fs/notify/inode_mark.c create mode 100644 fs/notify/inotify/Kconfig create mode 100644 fs/notify/inotify/Makefile create mode 100644 fs/notify/inotify/inotify.h create mode 100644 
fs/notify/inotify/inotify_fsnotify.c create mode 100644 fs/notify/inotify/inotify_user.c create mode 100644 fs/notify/mark.c create mode 100644 fs/notify/notification.c create mode 100644 fs/notify/vfsmount_mark.c create mode 100644 fs/ntfs/Kconfig create mode 100644 fs/ntfs/Makefile create mode 100644 fs/ntfs/aops.c create mode 100644 fs/ntfs/aops.h create mode 100644 fs/ntfs/attrib.c create mode 100644 fs/ntfs/attrib.h create mode 100644 fs/ntfs/bitmap.c create mode 100644 fs/ntfs/bitmap.h create mode 100644 fs/ntfs/collate.c create mode 100644 fs/ntfs/collate.h create mode 100644 fs/ntfs/compress.c create mode 100644 fs/ntfs/debug.c create mode 100644 fs/ntfs/debug.h create mode 100644 fs/ntfs/dir.c create mode 100644 fs/ntfs/dir.h create mode 100644 fs/ntfs/endian.h create mode 100644 fs/ntfs/file.c create mode 100644 fs/ntfs/index.c create mode 100644 fs/ntfs/index.h create mode 100644 fs/ntfs/inode.c create mode 100644 fs/ntfs/inode.h create mode 100644 fs/ntfs/layout.h create mode 100644 fs/ntfs/lcnalloc.c create mode 100644 fs/ntfs/lcnalloc.h create mode 100644 fs/ntfs/logfile.c create mode 100644 fs/ntfs/logfile.h create mode 100644 fs/ntfs/malloc.h create mode 100644 fs/ntfs/mft.c create mode 100644 fs/ntfs/mft.h create mode 100644 fs/ntfs/mst.c create mode 100644 fs/ntfs/namei.c create mode 100644 fs/ntfs/ntfs.h create mode 100644 fs/ntfs/quota.c create mode 100644 fs/ntfs/quota.h create mode 100644 fs/ntfs/runlist.c create mode 100644 fs/ntfs/runlist.h create mode 100644 fs/ntfs/super.c create mode 100644 fs/ntfs/sysctl.c create mode 100644 fs/ntfs/sysctl.h create mode 100644 fs/ntfs/time.h create mode 100644 fs/ntfs/types.h create mode 100644 fs/ntfs/unistr.c create mode 100644 fs/ntfs/upcase.c create mode 100644 fs/ntfs/usnjrnl.c create mode 100644 fs/ntfs/usnjrnl.h create mode 100644 fs/ntfs/volume.h create mode 100644 fs/ocfs2/Kconfig create mode 100644 fs/ocfs2/Makefile create mode 100644 fs/ocfs2/acl.c create mode 100644 fs/ocfs2/acl.h create mode 100644 fs/ocfs2/alloc.c create mode 100644 fs/ocfs2/alloc.h create mode 100644 fs/ocfs2/aops.c create mode 100644 fs/ocfs2/aops.h create mode 100644 fs/ocfs2/blockcheck.c create mode 100644 fs/ocfs2/blockcheck.h create mode 100644 fs/ocfs2/buffer_head_io.c create mode 100644 fs/ocfs2/buffer_head_io.h create mode 100644 fs/ocfs2/cluster/Makefile create mode 100644 fs/ocfs2/cluster/heartbeat.c create mode 100644 fs/ocfs2/cluster/heartbeat.h create mode 100644 fs/ocfs2/cluster/masklog.c create mode 100644 fs/ocfs2/cluster/masklog.h create mode 100644 fs/ocfs2/cluster/netdebug.c create mode 100644 fs/ocfs2/cluster/nodemanager.c create mode 100644 fs/ocfs2/cluster/nodemanager.h create mode 100644 fs/ocfs2/cluster/ocfs2_heartbeat.h create mode 100644 fs/ocfs2/cluster/ocfs2_nodemanager.h create mode 100644 fs/ocfs2/cluster/quorum.c create mode 100644 fs/ocfs2/cluster/quorum.h create mode 100644 fs/ocfs2/cluster/sys.c create mode 100644 fs/ocfs2/cluster/sys.h create mode 100644 fs/ocfs2/cluster/tcp.c create mode 100644 fs/ocfs2/cluster/tcp.h create mode 100644 fs/ocfs2/cluster/tcp_internal.h create mode 100644 fs/ocfs2/cluster/ver.c create mode 100644 fs/ocfs2/cluster/ver.h create mode 100644 fs/ocfs2/dcache.c create mode 100644 fs/ocfs2/dcache.h create mode 100644 fs/ocfs2/dir.c create mode 100644 fs/ocfs2/dir.h create mode 100644 fs/ocfs2/dlm/Makefile create mode 100644 fs/ocfs2/dlm/dlmapi.h create mode 100644 fs/ocfs2/dlm/dlmast.c create mode 100644 fs/ocfs2/dlm/dlmcommon.h create mode 100644 fs/ocfs2/dlm/dlmconvert.c create mode 
100644 fs/ocfs2/dlm/dlmconvert.h create mode 100644 fs/ocfs2/dlm/dlmdebug.c create mode 100644 fs/ocfs2/dlm/dlmdebug.h create mode 100644 fs/ocfs2/dlm/dlmdomain.c create mode 100644 fs/ocfs2/dlm/dlmdomain.h create mode 100644 fs/ocfs2/dlm/dlmlock.c create mode 100644 fs/ocfs2/dlm/dlmmaster.c create mode 100644 fs/ocfs2/dlm/dlmrecovery.c create mode 100644 fs/ocfs2/dlm/dlmthread.c create mode 100644 fs/ocfs2/dlm/dlmunlock.c create mode 100644 fs/ocfs2/dlm/dlmver.c create mode 100644 fs/ocfs2/dlm/dlmver.h create mode 100644 fs/ocfs2/dlmfs/Makefile create mode 100644 fs/ocfs2/dlmfs/dlmfs.c create mode 100644 fs/ocfs2/dlmfs/dlmfsver.c create mode 100644 fs/ocfs2/dlmfs/dlmfsver.h create mode 100644 fs/ocfs2/dlmfs/userdlm.c create mode 100644 fs/ocfs2/dlmfs/userdlm.h create mode 100644 fs/ocfs2/dlmglue.c create mode 100644 fs/ocfs2/dlmglue.h create mode 100644 fs/ocfs2/export.c create mode 100644 fs/ocfs2/export.h create mode 100644 fs/ocfs2/extent_map.c create mode 100644 fs/ocfs2/extent_map.h create mode 100644 fs/ocfs2/file.c create mode 100644 fs/ocfs2/file.h create mode 100644 fs/ocfs2/heartbeat.c create mode 100644 fs/ocfs2/heartbeat.h create mode 100644 fs/ocfs2/inode.c create mode 100644 fs/ocfs2/inode.h create mode 100644 fs/ocfs2/ioctl.c create mode 100644 fs/ocfs2/ioctl.h create mode 100644 fs/ocfs2/journal.c create mode 100644 fs/ocfs2/journal.h create mode 100644 fs/ocfs2/localalloc.c create mode 100644 fs/ocfs2/localalloc.h create mode 100644 fs/ocfs2/locks.c create mode 100644 fs/ocfs2/locks.h create mode 100644 fs/ocfs2/mmap.c create mode 100644 fs/ocfs2/mmap.h create mode 100644 fs/ocfs2/move_extents.c create mode 100644 fs/ocfs2/move_extents.h create mode 100644 fs/ocfs2/namei.c create mode 100644 fs/ocfs2/namei.h create mode 100644 fs/ocfs2/ocfs1_fs_compat.h create mode 100644 fs/ocfs2/ocfs2.h create mode 100644 fs/ocfs2/ocfs2_fs.h create mode 100644 fs/ocfs2/ocfs2_ioctl.h create mode 100644 fs/ocfs2/ocfs2_lockid.h create mode 100644 fs/ocfs2/ocfs2_lockingver.h create mode 100644 fs/ocfs2/ocfs2_trace.h create mode 100644 fs/ocfs2/quota.h create mode 100644 fs/ocfs2/quota_global.c create mode 100644 fs/ocfs2/quota_local.c create mode 100644 fs/ocfs2/refcounttree.c create mode 100644 fs/ocfs2/refcounttree.h create mode 100644 fs/ocfs2/reservations.c create mode 100644 fs/ocfs2/reservations.h create mode 100644 fs/ocfs2/resize.c create mode 100644 fs/ocfs2/resize.h create mode 100644 fs/ocfs2/slot_map.c create mode 100644 fs/ocfs2/slot_map.h create mode 100644 fs/ocfs2/stack_o2cb.c create mode 100644 fs/ocfs2/stack_user.c create mode 100644 fs/ocfs2/stackglue.c create mode 100644 fs/ocfs2/stackglue.h create mode 100644 fs/ocfs2/suballoc.c create mode 100644 fs/ocfs2/suballoc.h create mode 100644 fs/ocfs2/super.c create mode 100644 fs/ocfs2/super.h create mode 100644 fs/ocfs2/symlink.c create mode 100644 fs/ocfs2/symlink.h create mode 100644 fs/ocfs2/sysfile.c create mode 100644 fs/ocfs2/sysfile.h create mode 100644 fs/ocfs2/uptodate.c create mode 100644 fs/ocfs2/uptodate.h create mode 100644 fs/ocfs2/ver.c create mode 100644 fs/ocfs2/ver.h create mode 100644 fs/ocfs2/xattr.c create mode 100644 fs/ocfs2/xattr.h create mode 100644 fs/omfs/Kconfig create mode 100644 fs/omfs/Makefile create mode 100644 fs/omfs/bitmap.c create mode 100644 fs/omfs/dir.c create mode 100644 fs/omfs/file.c create mode 100644 fs/omfs/inode.c create mode 100644 fs/omfs/omfs.h create mode 100644 fs/omfs/omfs_fs.h create mode 100644 fs/open.c create mode 100644 fs/openpromfs/Makefile create mode 100644 
fs/openpromfs/inode.c create mode 100644 fs/partitions/Kconfig create mode 100644 fs/partitions/Makefile create mode 100644 fs/partitions/acorn.c create mode 100644 fs/partitions/acorn.h create mode 100644 fs/partitions/amiga.c create mode 100644 fs/partitions/amiga.h create mode 100644 fs/partitions/atari.c create mode 100644 fs/partitions/atari.h create mode 100644 fs/partitions/check.c create mode 100644 fs/partitions/check.h create mode 100644 fs/partitions/efi.c create mode 100644 fs/partitions/efi.h create mode 100644 fs/partitions/ibm.c create mode 100644 fs/partitions/ibm.h create mode 100644 fs/partitions/karma.c create mode 100644 fs/partitions/karma.h create mode 100644 fs/partitions/ldm.c create mode 100644 fs/partitions/ldm.h create mode 100644 fs/partitions/mac.c create mode 100644 fs/partitions/mac.h create mode 100644 fs/partitions/msdos.c create mode 100644 fs/partitions/msdos.h create mode 100644 fs/partitions/osf.c create mode 100644 fs/partitions/osf.h create mode 100644 fs/partitions/sgi.c create mode 100644 fs/partitions/sgi.h create mode 100644 fs/partitions/sun.c create mode 100644 fs/partitions/sun.h create mode 100644 fs/partitions/sysv68.c create mode 100644 fs/partitions/sysv68.h create mode 100644 fs/partitions/ultrix.c create mode 100644 fs/partitions/ultrix.h create mode 100644 fs/pipe.c create mode 100644 fs/pnode.c create mode 100644 fs/pnode.h create mode 100644 fs/posix_acl.c create mode 100644 fs/proc/Kconfig create mode 100644 fs/proc/Makefile create mode 100644 fs/proc/array.c create mode 100644 fs/proc/base.c create mode 100644 fs/proc/cmdline.c create mode 100644 fs/proc/consoles.c create mode 100644 fs/proc/cpuinfo.c create mode 100644 fs/proc/devices.c create mode 100644 fs/proc/generic.c create mode 100644 fs/proc/inode.c create mode 100644 fs/proc/internal.h create mode 100644 fs/proc/interrupts.c create mode 100644 fs/proc/kcore.c create mode 100644 fs/proc/kmsg.c create mode 100644 fs/proc/loadavg.c create mode 100644 fs/proc/meminfo.c create mode 100644 fs/proc/mmu.c create mode 100644 fs/proc/namespaces.c create mode 100644 fs/proc/nommu.c create mode 100644 fs/proc/page.c create mode 100644 fs/proc/proc_devtree.c create mode 100644 fs/proc/proc_net.c create mode 100644 fs/proc/proc_sysctl.c create mode 100644 fs/proc/proc_tty.c create mode 100644 fs/proc/root.c create mode 100644 fs/proc/softirqs.c create mode 100644 fs/proc/stat.c create mode 100644 fs/proc/task_mmu.c create mode 100644 fs/proc/task_nommu.c create mode 100644 fs/proc/uptime.c create mode 100644 fs/proc/version.c create mode 100644 fs/proc/vmcore.c create mode 100644 fs/pstore/Kconfig create mode 100644 fs/pstore/Makefile create mode 100644 fs/pstore/inode.c create mode 100644 fs/pstore/internal.h create mode 100644 fs/pstore/platform.c create mode 100644 fs/qnx4/Kconfig create mode 100644 fs/qnx4/Makefile create mode 100644 fs/qnx4/README create mode 100644 fs/qnx4/bitmap.c create mode 100644 fs/qnx4/dir.c create mode 100644 fs/qnx4/inode.c create mode 100644 fs/qnx4/namei.c create mode 100644 fs/qnx4/qnx4.h create mode 100644 fs/quota/Kconfig create mode 100644 fs/quota/Makefile create mode 100644 fs/quota/compat.c create mode 100644 fs/quota/dquot.c create mode 100644 fs/quota/netlink.c create mode 100644 fs/quota/quota.c create mode 100644 fs/quota/quota_tree.c create mode 100644 fs/quota/quota_tree.h create mode 100644 fs/quota/quota_v1.c create mode 100644 fs/quota/quota_v2.c create mode 100644 fs/quota/quotaio_v1.h create mode 100644 fs/quota/quotaio_v2.h create mode 
100644 fs/ramfs/Makefile create mode 100644 fs/ramfs/file-mmu.c create mode 100644 fs/ramfs/file-nommu.c create mode 100644 fs/ramfs/inode.c create mode 100644 fs/ramfs/internal.h create mode 100644 fs/read_write.c create mode 100644 fs/read_write.h create mode 100644 fs/readdir.c create mode 100644 fs/reiserfs/Kconfig create mode 100644 fs/reiserfs/Makefile create mode 100644 fs/reiserfs/README create mode 100644 fs/reiserfs/bitmap.c create mode 100644 fs/reiserfs/dir.c create mode 100644 fs/reiserfs/do_balan.c create mode 100644 fs/reiserfs/file.c create mode 100644 fs/reiserfs/fix_node.c create mode 100644 fs/reiserfs/hashes.c create mode 100644 fs/reiserfs/ibalance.c create mode 100644 fs/reiserfs/inode.c create mode 100644 fs/reiserfs/ioctl.c create mode 100644 fs/reiserfs/item_ops.c create mode 100644 fs/reiserfs/journal.c create mode 100644 fs/reiserfs/lbalance.c create mode 100644 fs/reiserfs/lock.c create mode 100644 fs/reiserfs/namei.c create mode 100644 fs/reiserfs/objectid.c create mode 100644 fs/reiserfs/prints.c create mode 100644 fs/reiserfs/procfs.c create mode 100644 fs/reiserfs/resize.c create mode 100644 fs/reiserfs/stree.c create mode 100644 fs/reiserfs/super.c create mode 100644 fs/reiserfs/tail_conversion.c create mode 100644 fs/reiserfs/xattr.c create mode 100644 fs/reiserfs/xattr_acl.c create mode 100644 fs/reiserfs/xattr_security.c create mode 100644 fs/reiserfs/xattr_trusted.c create mode 100644 fs/reiserfs/xattr_user.c create mode 100644 fs/romfs/Kconfig create mode 100644 fs/romfs/Makefile create mode 100644 fs/romfs/internal.h create mode 100644 fs/romfs/mmap-nommu.c create mode 100644 fs/romfs/storage.c create mode 100644 fs/romfs/super.c create mode 100644 fs/select.c create mode 100644 fs/seq_file.c create mode 100644 fs/signalfd.c create mode 100644 fs/splice.c create mode 100644 fs/squashfs/Kconfig create mode 100644 fs/squashfs/Makefile create mode 100644 fs/squashfs/block.c create mode 100644 fs/squashfs/cache.c create mode 100644 fs/squashfs/decompressor.c create mode 100644 fs/squashfs/decompressor.h create mode 100644 fs/squashfs/dir.c create mode 100644 fs/squashfs/export.c create mode 100644 fs/squashfs/file.c create mode 100644 fs/squashfs/fragment.c create mode 100644 fs/squashfs/id.c create mode 100644 fs/squashfs/inode.c create mode 100644 fs/squashfs/lzo_wrapper.c create mode 100644 fs/squashfs/namei.c create mode 100644 fs/squashfs/squashfs.h create mode 100644 fs/squashfs/squashfs_fs.h create mode 100644 fs/squashfs/squashfs_fs_i.h create mode 100644 fs/squashfs/squashfs_fs_sb.h create mode 100644 fs/squashfs/super.c create mode 100644 fs/squashfs/symlink.c create mode 100644 fs/squashfs/xattr.c create mode 100644 fs/squashfs/xattr.h create mode 100644 fs/squashfs/xattr_id.c create mode 100644 fs/squashfs/xz_wrapper.c create mode 100644 fs/squashfs/zlib_wrapper.c create mode 100644 fs/stack.c create mode 100644 fs/stat.c create mode 100644 fs/statfs.c create mode 100644 fs/super.c create mode 100644 fs/sync.c create mode 100644 fs/sysfs/Kconfig create mode 100644 fs/sysfs/Makefile create mode 100644 fs/sysfs/bin.c create mode 100644 fs/sysfs/dir.c create mode 100644 fs/sysfs/file.c create mode 100644 fs/sysfs/group.c create mode 100644 fs/sysfs/inode.c create mode 100644 fs/sysfs/mount.c create mode 100644 fs/sysfs/symlink.c create mode 100644 fs/sysfs/sysfs.h create mode 100644 fs/sysv/Kconfig create mode 100644 fs/sysv/Makefile create mode 100644 fs/sysv/balloc.c create mode 100644 fs/sysv/dir.c create mode 100644 fs/sysv/file.c create mode 
100644 fs/sysv/ialloc.c create mode 100644 fs/sysv/inode.c create mode 100644 fs/sysv/itree.c create mode 100644 fs/sysv/namei.c create mode 100644 fs/sysv/super.c create mode 100644 fs/sysv/symlink.c create mode 100644 fs/sysv/sysv.h create mode 100644 fs/timerfd.c create mode 100644 fs/ubifs/Kconfig create mode 100644 fs/ubifs/Makefile create mode 100644 fs/ubifs/budget.c create mode 100644 fs/ubifs/commit.c create mode 100644 fs/ubifs/compress.c create mode 100644 fs/ubifs/debug.c create mode 100644 fs/ubifs/debug.h create mode 100644 fs/ubifs/dir.c create mode 100644 fs/ubifs/file.c create mode 100644 fs/ubifs/find.c create mode 100644 fs/ubifs/gc.c create mode 100644 fs/ubifs/io.c create mode 100644 fs/ubifs/ioctl.c create mode 100644 fs/ubifs/journal.c create mode 100644 fs/ubifs/key.h create mode 100644 fs/ubifs/log.c create mode 100644 fs/ubifs/lprops.c create mode 100644 fs/ubifs/lpt.c create mode 100644 fs/ubifs/lpt_commit.c create mode 100644 fs/ubifs/master.c create mode 100644 fs/ubifs/misc.h create mode 100644 fs/ubifs/orphan.c create mode 100644 fs/ubifs/recovery.c create mode 100644 fs/ubifs/replay.c create mode 100644 fs/ubifs/sb.c create mode 100644 fs/ubifs/scan.c create mode 100644 fs/ubifs/shrinker.c create mode 100644 fs/ubifs/super.c create mode 100644 fs/ubifs/tnc.c create mode 100644 fs/ubifs/tnc_commit.c create mode 100644 fs/ubifs/tnc_misc.c create mode 100644 fs/ubifs/ubifs-media.h create mode 100644 fs/ubifs/ubifs.h create mode 100644 fs/ubifs/xattr.c create mode 100644 fs/udf/Kconfig create mode 100644 fs/udf/Makefile create mode 100644 fs/udf/balloc.c create mode 100644 fs/udf/dir.c create mode 100644 fs/udf/directory.c create mode 100644 fs/udf/ecma_167.h create mode 100644 fs/udf/file.c create mode 100644 fs/udf/ialloc.c create mode 100644 fs/udf/inode.c create mode 100644 fs/udf/lowlevel.c create mode 100644 fs/udf/misc.c create mode 100644 fs/udf/namei.c create mode 100644 fs/udf/osta_udf.h create mode 100644 fs/udf/partition.c create mode 100644 fs/udf/super.c create mode 100644 fs/udf/symlink.c create mode 100644 fs/udf/truncate.c create mode 100644 fs/udf/udf_i.h create mode 100644 fs/udf/udf_sb.h create mode 100644 fs/udf/udfdecl.h create mode 100644 fs/udf/udfend.h create mode 100644 fs/udf/udftime.c create mode 100644 fs/udf/unicode.c create mode 100644 fs/ufs/Kconfig create mode 100644 fs/ufs/Makefile create mode 100644 fs/ufs/balloc.c create mode 100644 fs/ufs/cylinder.c create mode 100644 fs/ufs/dir.c create mode 100644 fs/ufs/file.c create mode 100644 fs/ufs/ialloc.c create mode 100644 fs/ufs/inode.c create mode 100644 fs/ufs/namei.c create mode 100644 fs/ufs/super.c create mode 100644 fs/ufs/swab.h create mode 100644 fs/ufs/symlink.c create mode 100644 fs/ufs/truncate.c create mode 100644 fs/ufs/ufs.h create mode 100644 fs/ufs/ufs_fs.h create mode 100644 fs/ufs/util.c create mode 100644 fs/ufs/util.h create mode 100644 fs/utimes.c create mode 100644 fs/xattr.c create mode 100644 fs/xattr_acl.c create mode 100644 fs/xfs/Kconfig create mode 100644 fs/xfs/Makefile create mode 100644 fs/xfs/linux-2.6/kmem.c create mode 100644 fs/xfs/linux-2.6/kmem.h create mode 100644 fs/xfs/linux-2.6/mrlock.h create mode 100644 fs/xfs/linux-2.6/time.h create mode 100644 fs/xfs/linux-2.6/xfs_acl.c create mode 100644 fs/xfs/linux-2.6/xfs_aops.c create mode 100644 fs/xfs/linux-2.6/xfs_aops.h create mode 100644 fs/xfs/linux-2.6/xfs_buf.c create mode 100644 fs/xfs/linux-2.6/xfs_buf.h create mode 100644 fs/xfs/linux-2.6/xfs_discard.c create mode 100644 
fs/xfs/linux-2.6/xfs_discard.h create mode 100644 fs/xfs/linux-2.6/xfs_export.c create mode 100644 fs/xfs/linux-2.6/xfs_export.h create mode 100644 fs/xfs/linux-2.6/xfs_file.c create mode 100644 fs/xfs/linux-2.6/xfs_fs_subr.c create mode 100644 fs/xfs/linux-2.6/xfs_globals.c create mode 100644 fs/xfs/linux-2.6/xfs_ioctl.c create mode 100644 fs/xfs/linux-2.6/xfs_ioctl.h create mode 100644 fs/xfs/linux-2.6/xfs_ioctl32.c create mode 100644 fs/xfs/linux-2.6/xfs_ioctl32.h create mode 100644 fs/xfs/linux-2.6/xfs_iops.c create mode 100644 fs/xfs/linux-2.6/xfs_iops.h create mode 100644 fs/xfs/linux-2.6/xfs_linux.h create mode 100644 fs/xfs/linux-2.6/xfs_message.c create mode 100644 fs/xfs/linux-2.6/xfs_message.h create mode 100644 fs/xfs/linux-2.6/xfs_quotaops.c create mode 100644 fs/xfs/linux-2.6/xfs_stats.c create mode 100644 fs/xfs/linux-2.6/xfs_stats.h create mode 100644 fs/xfs/linux-2.6/xfs_super.c create mode 100644 fs/xfs/linux-2.6/xfs_super.h create mode 100644 fs/xfs/linux-2.6/xfs_sync.c create mode 100644 fs/xfs/linux-2.6/xfs_sync.h create mode 100644 fs/xfs/linux-2.6/xfs_sysctl.c create mode 100644 fs/xfs/linux-2.6/xfs_sysctl.h create mode 100644 fs/xfs/linux-2.6/xfs_trace.c create mode 100644 fs/xfs/linux-2.6/xfs_trace.h create mode 100644 fs/xfs/linux-2.6/xfs_vnode.h create mode 100644 fs/xfs/linux-2.6/xfs_xattr.c create mode 100644 fs/xfs/quota/xfs_dquot.c create mode 100644 fs/xfs/quota/xfs_dquot.h create mode 100644 fs/xfs/quota/xfs_dquot_item.c create mode 100644 fs/xfs/quota/xfs_dquot_item.h create mode 100644 fs/xfs/quota/xfs_qm.c create mode 100644 fs/xfs/quota/xfs_qm.h create mode 100644 fs/xfs/quota/xfs_qm_bhv.c create mode 100644 fs/xfs/quota/xfs_qm_stats.c create mode 100644 fs/xfs/quota/xfs_qm_stats.h create mode 100644 fs/xfs/quota/xfs_qm_syscalls.c create mode 100644 fs/xfs/quota/xfs_quota_priv.h create mode 100644 fs/xfs/quota/xfs_trans_dquot.c create mode 100644 fs/xfs/support/uuid.c create mode 100644 fs/xfs/support/uuid.h create mode 100644 fs/xfs/xfs.h create mode 100644 fs/xfs/xfs_acl.h create mode 100644 fs/xfs/xfs_ag.h create mode 100644 fs/xfs/xfs_alloc.c create mode 100644 fs/xfs/xfs_alloc.h create mode 100644 fs/xfs/xfs_alloc_btree.c create mode 100644 fs/xfs/xfs_alloc_btree.h create mode 100644 fs/xfs/xfs_arch.h create mode 100644 fs/xfs/xfs_attr.c create mode 100644 fs/xfs/xfs_attr.h create mode 100644 fs/xfs/xfs_attr_leaf.c create mode 100644 fs/xfs/xfs_attr_leaf.h create mode 100644 fs/xfs/xfs_attr_sf.h create mode 100644 fs/xfs/xfs_bit.c create mode 100644 fs/xfs/xfs_bit.h create mode 100644 fs/xfs/xfs_bmap.c create mode 100644 fs/xfs/xfs_bmap.h create mode 100644 fs/xfs/xfs_bmap_btree.c create mode 100644 fs/xfs/xfs_bmap_btree.h create mode 100644 fs/xfs/xfs_btree.c create mode 100644 fs/xfs/xfs_btree.h create mode 100644 fs/xfs/xfs_btree_trace.c create mode 100644 fs/xfs/xfs_btree_trace.h create mode 100644 fs/xfs/xfs_buf_item.c create mode 100644 fs/xfs/xfs_buf_item.h create mode 100644 fs/xfs/xfs_da_btree.c create mode 100644 fs/xfs/xfs_da_btree.h create mode 100644 fs/xfs/xfs_dfrag.c create mode 100644 fs/xfs/xfs_dfrag.h create mode 100644 fs/xfs/xfs_dinode.h create mode 100644 fs/xfs/xfs_dir2.c create mode 100644 fs/xfs/xfs_dir2.h create mode 100644 fs/xfs/xfs_dir2_block.c create mode 100644 fs/xfs/xfs_dir2_block.h create mode 100644 fs/xfs/xfs_dir2_data.c create mode 100644 fs/xfs/xfs_dir2_data.h create mode 100644 fs/xfs/xfs_dir2_leaf.c create mode 100644 fs/xfs/xfs_dir2_leaf.h create mode 100644 fs/xfs/xfs_dir2_node.c create mode 100644 
fs/xfs/xfs_dir2_node.h create mode 100644 fs/xfs/xfs_dir2_sf.c create mode 100644 fs/xfs/xfs_dir2_sf.h create mode 100644 fs/xfs/xfs_error.c create mode 100644 fs/xfs/xfs_error.h create mode 100644 fs/xfs/xfs_extfree_item.c create mode 100644 fs/xfs/xfs_extfree_item.h create mode 100644 fs/xfs/xfs_filestream.c create mode 100644 fs/xfs/xfs_filestream.h create mode 100644 fs/xfs/xfs_fs.h create mode 100644 fs/xfs/xfs_fsops.c create mode 100644 fs/xfs/xfs_fsops.h create mode 100644 fs/xfs/xfs_ialloc.c create mode 100644 fs/xfs/xfs_ialloc.h create mode 100644 fs/xfs/xfs_ialloc_btree.c create mode 100644 fs/xfs/xfs_ialloc_btree.h create mode 100644 fs/xfs/xfs_iget.c create mode 100644 fs/xfs/xfs_inode.c create mode 100644 fs/xfs/xfs_inode.h create mode 100644 fs/xfs/xfs_inode_item.c create mode 100644 fs/xfs/xfs_inode_item.h create mode 100644 fs/xfs/xfs_inum.h create mode 100644 fs/xfs/xfs_iomap.c create mode 100644 fs/xfs/xfs_iomap.h create mode 100644 fs/xfs/xfs_itable.c create mode 100644 fs/xfs/xfs_itable.h create mode 100644 fs/xfs/xfs_log.c create mode 100644 fs/xfs/xfs_log.h create mode 100644 fs/xfs/xfs_log_cil.c create mode 100644 fs/xfs/xfs_log_priv.h create mode 100644 fs/xfs/xfs_log_recover.c create mode 100644 fs/xfs/xfs_log_recover.h create mode 100644 fs/xfs/xfs_mount.c create mode 100644 fs/xfs/xfs_mount.h create mode 100644 fs/xfs/xfs_mru_cache.c create mode 100644 fs/xfs/xfs_mru_cache.h create mode 100644 fs/xfs/xfs_quota.h create mode 100644 fs/xfs/xfs_rename.c create mode 100644 fs/xfs/xfs_rtalloc.c create mode 100644 fs/xfs/xfs_rtalloc.h create mode 100644 fs/xfs/xfs_rw.c create mode 100644 fs/xfs/xfs_rw.h create mode 100644 fs/xfs/xfs_sb.h create mode 100644 fs/xfs/xfs_trans.c create mode 100644 fs/xfs/xfs_trans.h create mode 100644 fs/xfs/xfs_trans_ail.c create mode 100644 fs/xfs/xfs_trans_buf.c create mode 100644 fs/xfs/xfs_trans_extfree.c create mode 100644 fs/xfs/xfs_trans_inode.c create mode 100644 fs/xfs/xfs_trans_priv.h create mode 100644 fs/xfs/xfs_trans_space.h create mode 100644 fs/xfs/xfs_types.h create mode 100644 fs/xfs/xfs_utils.c create mode 100644 fs/xfs/xfs_utils.h create mode 100644 fs/xfs/xfs_vnodeops.c create mode 100644 fs/xfs/xfs_vnodeops.h create mode 100644 fs/yaffs2/Kconfig create mode 100644 fs/yaffs2/Makefile create mode 100644 fs/yaffs2/yaffs_allocator.c create mode 100644 fs/yaffs2/yaffs_allocator.h create mode 100644 fs/yaffs2/yaffs_attribs.c create mode 100644 fs/yaffs2/yaffs_attribs.h create mode 100644 fs/yaffs2/yaffs_bitmap.c create mode 100644 fs/yaffs2/yaffs_bitmap.h create mode 100644 fs/yaffs2/yaffs_checkptrw.c create mode 100644 fs/yaffs2/yaffs_checkptrw.h create mode 100644 fs/yaffs2/yaffs_ecc.c create mode 100644 fs/yaffs2/yaffs_ecc.h create mode 100644 fs/yaffs2/yaffs_getblockinfo.h create mode 100644 fs/yaffs2/yaffs_guts.c create mode 100644 fs/yaffs2/yaffs_guts.h create mode 100644 fs/yaffs2/yaffs_linux.h create mode 100644 fs/yaffs2/yaffs_mtdif.c create mode 100644 fs/yaffs2/yaffs_mtdif.h create mode 100644 fs/yaffs2/yaffs_mtdif1.c create mode 100644 fs/yaffs2/yaffs_mtdif1.h create mode 100644 fs/yaffs2/yaffs_mtdif2.c create mode 100644 fs/yaffs2/yaffs_mtdif2.h create mode 100644 fs/yaffs2/yaffs_nameval.c create mode 100644 fs/yaffs2/yaffs_nameval.h create mode 100644 fs/yaffs2/yaffs_nand.c create mode 100644 fs/yaffs2/yaffs_nand.h create mode 100644 fs/yaffs2/yaffs_packedtags1.c create mode 100644 fs/yaffs2/yaffs_packedtags1.h create mode 100644 fs/yaffs2/yaffs_packedtags2.c create mode 100644 fs/yaffs2/yaffs_packedtags2.h 
create mode 100644 fs/yaffs2/yaffs_tagscompat.c
create mode 100644 fs/yaffs2/yaffs_tagscompat.h
create mode 100644 fs/yaffs2/yaffs_tagsvalidity.c
create mode 100644 fs/yaffs2/yaffs_tagsvalidity.h
create mode 100644 fs/yaffs2/yaffs_trace.h
create mode 100644 fs/yaffs2/yaffs_verify.c
create mode 100644 fs/yaffs2/yaffs_verify.h
create mode 100644 fs/yaffs2/yaffs_vfs.c
create mode 100644 fs/yaffs2/yaffs_yaffs1.c
create mode 100644 fs/yaffs2/yaffs_yaffs1.h
create mode 100644 fs/yaffs2/yaffs_yaffs2.c
create mode 100644 fs/yaffs2/yaffs_yaffs2.h
create mode 100644 fs/yaffs2/yportenv.h

(limited to 'fs')

diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
new file mode 100644
index 00000000..0a93dc1c
--- /dev/null
+++ b/fs/9p/Kconfig
@@ -0,0 +1,34 @@
+config 9P_FS
+	tristate "Plan 9 Resource Sharing Support (9P2000)"
+	depends on INET && NET_9P
+	help
+	  If you say Y here, you will get experimental support for
+	  Plan 9 resource sharing via the 9P2000 protocol.
+
+	  See for more information.
+
+	  If unsure, say N.
+
+if 9P_FS
+config 9P_FSCACHE
+	bool "Enable 9P client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
+	help
+	  Choose Y here to enable persistent, read-only local
+	  caching support for 9p clients using FS-Cache
+
+
+config 9P_FS_POSIX_ACL
+	bool "9P POSIX Access Control Lists"
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website .
+
+	  If you don't know what Access Control Lists are, say N
+
+endif

diff --git a/fs/9p/Makefile b/fs/9p/Makefile
new file mode 100644
index 00000000..ab8c1278
--- /dev/null
+++ b/fs/9p/Makefile
@@ -0,0 +1,17 @@
+obj-$(CONFIG_9P_FS) := 9p.o
+
+9p-objs := \
+	vfs_super.o \
+	vfs_inode.o \
+	vfs_inode_dotl.o \
+	vfs_addr.o \
+	vfs_file.o \
+	vfs_dir.o \
+	vfs_dentry.o \
+	v9fs.o \
+	fid.o \
+	xattr.o \
+	xattr_user.o
+
+9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 00000000..4a866cd0
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,407 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "xattr.h"
+#include "acl.h"
+#include "v9fs.h"
+#include "v9fs_vfs.h"
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+	ssize_t size;
+	void *value = NULL;
+	struct posix_acl *acl = NULL;
+
+	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = v9fs_fid_xattr_get(fid, name, value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				goto err_out;
+		}
+	} else if (size == -ENODATA || size == 0 ||
+		   size == -ENOSYS || size == -EOPNOTSUPP) {
+		acl = NULL;
+	} else
+		acl = ERR_PTR(-EIO);
+
+err_out:
+	kfree(value);
+	return acl;
+}
+
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	int retval = 0;
+	struct posix_acl *pacl, *dacl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+	    ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+		return 0;
+	}
+	/* get the default/access acl values and cache them */
+	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+
+	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+	} else
+		retval = -EIO;
+
+	if (!IS_ERR(dacl))
+		posix_acl_release(dacl);
+
+	if (!IS_ERR(pacl))
+		posix_acl_release(pacl);
+
+	return retval;
+}
+
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	/*
+	 * 9p always caches the acl value when
+	 * instantiating the inode (v9fs_inode_from_fid)
+	 */
+	acl = get_cached_acl(inode, type);
+	BUG_ON(acl == ACL_NOT_CACHED);
+	return acl;
+}
+
+int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
+{
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+	    ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
+		/*
+		 * On access = client and acl = on mode get the acl
+		 * values from the server
+		 */
+		return 0;
+	}
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+	return -EAGAIN;
+}
+
+static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+{
+	int retval;
+	char *name;
+	size_t size;
+	void *buffer;
+	struct inode *inode = dentry->d_inode;
+
+	set_cached_acl(inode, type, acl);
+
+	if (!acl)
+		return 0;
+
+	/* Send a setxattr request to the server */
+	size = posix_acl_xattr_size(acl->a_count);
+	buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+	retval = posix_acl_to_xattr(acl, buffer, size);
+	if (retval < 0)
+		goto err_free_out;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+err_free_out:
+	kfree(buffer);
+	return retval;
+}
+
+int v9fs_acl_chmod(struct dentry *dentry)
+{
+	int retval = 0;
+	struct posix_acl *acl, *clone;
+	struct inode *inode = dentry->d_inode;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+	if (acl) {
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+		retval = posix_acl_chmod_masq(clone, inode->i_mode);
+		if (!retval)
+			retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone);
+		posix_acl_release(clone);
+	}
+	return retval;
+}
+
+int v9fs_set_create_acl(struct dentry *dentry,
+			struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	if (dentry) {
+		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl);
+		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl);
+	}
+	posix_acl_release(*dpacl);
+	posix_acl_release(*pacl);
+	*dpacl = *pacl = NULL;
+	return 0;
+}
+
+int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+		  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	int retval = 0;
+	mode_t mode = *modep;
+	struct posix_acl *acl = NULL;
+
+	if (!S_ISLNK(mode)) {
+		acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			mode &= ~current_umask();
+	}
+	if (acl) {
+		struct posix_acl *clone;
+
+		if (S_ISDIR(mode))
+			*dpacl = posix_acl_dup(acl);
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+
+		retval = posix_acl_create_masq(clone, &mode);
+		if (retval < 0) {
+			posix_acl_release(clone);
+			goto cleanup;
+		}
+		if (retval > 0)
+			*pacl = clone;
+		else
+			posix_acl_release(clone);
+	}
+	*modep = mode;
+	return 0;
+cleanup:
+	return retval;
+
+}
+
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+			       void *buffer, size_t size, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+			      void *buffer, size_t size, int type)
+{
+	struct v9fs_session_info *v9ses;
+	struct posix_acl *acl;
+	int error;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_dentry2v9ses(dentry);
+	/*
+	 * We allow set/get/list of acl when access=client is not specified
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+
+	acl = v9fs_get_cached_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+			       const void *value, size_t size,
+			       int flags, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
We leave it to the server to validate + */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_remote_set_acl(dentry, name, + value, size, flags, type); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (value) { + /* update the cached acl value */ + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + retval = posix_acl_valid(acl); + if (retval) + goto err_out; + } + } else + acl = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + retval = posix_acl_equiv_mode(acl, &mode); + if (retval < 0) + goto err_out; + else { + struct iattr iattr; + if (retval == 0) { + /* + * ACL can be represented + * by the mode bits. So don't + * update ACL. + */ + acl = NULL; + value = NULL; + size = 0; + } + /* Updte the mode bits */ + iattr.ia_mode = ((mode & S_IALLUGO) | + (inode->i_mode & ~S_IALLUGO)); + iattr.ia_valid = ATTR_MODE; + /* FIXME should we update ctime ? + * What is the following setxattr update the + * mode ? + */ + v9fs_vfs_setattr_dotl(dentry, &iattr); + } + } + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) { + retval = acl ? -EINVAL : 0; + goto err_out; + } + break; + default: + BUG(); + } + retval = v9fs_xattr_set(dentry, name, value, size, flags); + if (!retval) + set_cached_acl(inode, type, acl); +err_out: + posix_acl_release(acl); + return retval; +} + +const struct xattr_handler v9fs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = v9fs_xattr_get_acl, + .set = v9fs_xattr_set_acl, +}; + +const struct xattr_handler v9fs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = v9fs_xattr_get_acl, + .set = v9fs_xattr_set_acl, +}; diff --git a/fs/9p/acl.h b/fs/9p/acl.h new file mode 100644 index 00000000..c47ea9cf --- /dev/null +++ b/fs/9p/acl.h @@ -0,0 +1,49 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ +#ifndef FS_9P_ACL_H +#define FS_9P_ACL_H + +#ifdef CONFIG_9P_FS_POSIX_ACL +extern int v9fs_get_acl(struct inode *, struct p9_fid *); +extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags); +extern int v9fs_acl_chmod(struct dentry *); +extern int v9fs_set_create_acl(struct dentry *, + struct posix_acl **, struct posix_acl **); +extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, + struct posix_acl **dpacl, struct posix_acl **pacl); +#else +#define v9fs_check_acl NULL +static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) +{ + return 0; +} +static inline int v9fs_acl_chmod(struct dentry *dentry) +{ + return 0; +} +static inline int v9fs_set_create_acl(struct dentry *dentry, + struct posix_acl **dpacl, + struct posix_acl **pacl) +{ + return 0; +} +static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep, + struct posix_acl **dpacl, + struct posix_acl **pacl) +{ + return 0; +} + +#endif +#endif /* FS_9P_XATTR_H */ diff --git a/fs/9p/cache.c b/fs/9p/cache.c new file mode 100644 index 00000000..945aa5f0 --- /dev/null +++ b/fs/9p/cache.c @@ -0,0 +1,415 @@ +/* + * V9FS cache definitions. + * + * Copyright (C) 2009 by Abhishek Kulkarni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "cache.h" + +#define CACHETAG_LEN 11 + +struct fscache_netfs v9fs_cache_netfs = { + .name = "9p", + .version = 0, +}; + +/** + * v9fs_random_cachetag - Generate a random tag to be associated + * with a new cache session. + * + * The value of jiffies is used for a fairly randomly cache tag. + */ + +static +int v9fs_random_cachetag(struct v9fs_session_info *v9ses) +{ + v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL); + if (!v9ses->cachetag) + return -ENOMEM; + + return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies); +} + +static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct v9fs_session_info *v9ses; + uint16_t klen = 0; + + v9ses = (struct v9fs_session_info *)cookie_netfs_data; + P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses, + buffer, bufmax); + + if (v9ses->cachetag) + klen = strlen(v9ses->cachetag); + + if (klen > bufmax) + return 0; + + memcpy(buffer, v9ses->cachetag, klen); + P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag); + return klen; +} + +const struct fscache_cookie_def v9fs_cache_session_index_def = { + .name = "9P.session", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = v9fs_cache_session_get_key, +}; + +void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) +{ + /* If no cache session tag was specified, we generate a random one. 
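For example (value assumed), the generated tag is simply jiffies printed as a decimal string, such as "4294937296"; a fixed tag can instead be supplied with the cachetag= mount option.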
*/ + if (!v9ses->cachetag) + v9fs_random_cachetag(v9ses); + + v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, + &v9fs_cache_session_index_def, + v9ses); + P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses, + v9ses->fscache); +} + +void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) +{ + P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses, + v9ses->fscache); + fscache_relinquish_cookie(v9ses->fscache, 0); + v9ses->fscache = NULL; +} + + +static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, + v9inode->qid.path); + return sizeof(v9inode->qid.path); +} + +static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + *size = i_size_read(&v9inode->vfs_inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode, + *size); +} + +static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, + v9inode->qid.version); + return sizeof(v9inode->qid.version); +} + +static enum +fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + const struct v9fs_inode *v9inode = cookie_netfs_data; + + if (buflen != sizeof(v9inode->qid.version)) + return FSCACHE_CHECKAUX_OBSOLETE; + + if (memcmp(buffer, &v9inode->qid.version, + sizeof(v9inode->qid.version))) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) +{ + struct v9fs_inode *v9inode = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + for (;;) { + nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def v9fs_cache_inode_index_def = { + .name = "9p.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = v9fs_cache_inode_get_key, + .get_attr = v9fs_cache_inode_get_attr, + .get_aux = v9fs_cache_inode_get_aux, + .check_aux = v9fs_cache_inode_check_aux, + .now_uncached = v9fs_cache_inode_now_uncached, +}; + +void v9fs_cache_inode_get_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + + if (!S_ISREG(inode->i_mode)) + return; + + v9inode = V9FS_I(inode); + if (v9inode->fscache) + return; + + v9ses = v9fs_inode2v9ses(inode); + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, + &v9fs_cache_inode_index_def, + v9inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, + v9inode->fscache); +} + +void v9fs_cache_inode_put_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + if (!v9inode->fscache) + return; + P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, + v9inode->fscache); + + 
fscache_relinquish_cookie(v9inode->fscache, 0); + v9inode->fscache = NULL; +} + +void v9fs_cache_inode_flush_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + if (!v9inode->fscache) + return; + P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, + v9inode->fscache); + + fscache_relinquish_cookie(v9inode->fscache, 1); + v9inode->fscache = NULL; +} + +void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_fid *fid; + + if (!v9inode->fscache) + return; + + spin_lock(&v9inode->fscache_lock); + fid = filp->private_data; + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + v9fs_cache_inode_flush_cookie(inode); + else + v9fs_cache_inode_get_cookie(inode); + + spin_unlock(&v9inode->fscache_lock); +} + +void v9fs_cache_inode_reset_cookie(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct v9fs_session_info *v9ses; + struct fscache_cookie *old; + + if (!v9inode->fscache) + return; + + old = v9inode->fscache; + + spin_lock(&v9inode->fscache_lock); + fscache_relinquish_cookie(v9inode->fscache, 1); + + v9ses = v9fs_inode2v9ses(inode); + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, + &v9fs_cache_inode_index_def, + v9inode); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", + inode, old, v9inode->fscache); + + spin_unlock(&v9inode->fscache_lock); +} + +int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) +{ + struct inode *inode = page->mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); + + BUG_ON(!v9inode->fscache); + + return fscache_maybe_release_page(v9inode->fscache, page, gfp); +} + +void __v9fs_fscache_invalidate_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); + + BUG_ON(!v9inode->fscache); + + if (PageFsCache(page)) { + fscache_wait_on_page_write(v9inode->fscache, page); + BUG_ON(!PageLocked(page)); + fscache_uncache_page(v9inode->fscache, page); + } +} + +static void v9fs_vfs_readpage_complete(struct page *page, void *data, + int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +/** + * __v9fs_readpage_from_fscache - read a page from cache + * + * Returns 0 if the pages are in cache and a BIO is submitted, + * 1 if the pages are not in cache and -error otherwise. + */ + +int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + const struct v9fs_inode *v9inode = V9FS_I(inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + if (!v9inode->fscache) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(v9inode->fscache, + page, + v9fs_vfs_readpage_complete, + NULL, + GFP_KERNEL); + switch (ret) { + case -ENOBUFS: + case -ENODATA: + P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret); + return 1; + case 0: + P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + return ret; + default: + P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + return ret; + } +} + +/** + * __v9fs_readpages_from_fscache - read multiple pages from cache + * + * Returns 0 if the pages are in cache and a BIO is submitted, + * 1 if the pages are not in cache and -error otherwise. 
+ */ + +int __v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + const struct v9fs_inode *v9inode = V9FS_I(inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); + if (!v9inode->fscache) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(v9inode->fscache, + mapping, pages, nr_pages, + v9fs_vfs_readpage_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case -ENOBUFS: + case -ENODATA: + P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret); + return 1; + case 0: + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + return ret; + default: + P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + return ret; + } +} + +/** + * __v9fs_readpage_to_fscache - write a page to the cache + * + */ + +void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + const struct v9fs_inode *v9inode = V9FS_I(inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); + P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); + if (ret != 0) + v9fs_uncache_page(inode, page); +} + +/* + * wait for a page to complete writing to the cache + */ +void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) +{ + const struct v9fs_inode *v9inode = V9FS_I(inode); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + if (PageFsCache(page)) + fscache_wait_on_page_write(v9inode->fscache, page); +} diff --git a/fs/9p/cache.h b/fs/9p/cache.h new file mode 100644 index 00000000..40cc54ce --- /dev/null +++ b/fs/9p/cache.h @@ -0,0 +1,139 @@ +/* + * V9FS cache definitions. + * + * Copyright (C) 2009 by Abhishek Kulkarni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#ifndef _9P_CACHE_H +#ifdef CONFIG_9P_FSCACHE +#include +#include + +extern struct fscache_netfs v9fs_cache_netfs; +extern const struct fscache_cookie_def v9fs_cache_session_index_def; +extern const struct fscache_cookie_def v9fs_cache_inode_index_def; + +extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses); +extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses); + +extern void v9fs_cache_inode_get_cookie(struct inode *inode); +extern void v9fs_cache_inode_put_cookie(struct inode *inode); +extern void v9fs_cache_inode_flush_cookie(struct inode *inode); +extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp); +extern void v9fs_cache_inode_reset_cookie(struct inode *inode); + +extern int __v9fs_cache_register(void); +extern void __v9fs_cache_unregister(void); + +extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp); +extern void __v9fs_fscache_invalidate_page(struct page *page); +extern int __v9fs_readpage_from_fscache(struct inode *inode, + struct page *page); +extern int __v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); +extern void __v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page); + +static inline int v9fs_fscache_release_page(struct page *page, + gfp_t gfp) +{ + return __v9fs_fscache_release_page(page, gfp); +} + +static inline void v9fs_fscache_invalidate_page(struct page *page) +{ + __v9fs_fscache_invalidate_page(page); +} + +static inline int v9fs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + return __v9fs_readpage_from_fscache(inode, page); +} + +static inline int v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return __v9fs_readpages_from_fscache(inode, mapping, pages, + nr_pages); +} + +static inline void v9fs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __v9fs_readpage_to_fscache(inode, page); +} + +static inline void v9fs_uncache_page(struct inode *inode, struct page *page) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + fscache_uncache_page(v9inode->fscache, page); + BUG_ON(PageFsCache(page)); +} + +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) +{ + return __v9fs_fscache_wait_on_page_write(inode, page); +} + +#else /* CONFIG_9P_FSCACHE */ + +static inline int v9fs_fscache_release_page(struct page *page, + gfp_t gfp) { + return 1; +} + +static inline void v9fs_fscache_invalidate_page(struct page *page) {} + +static inline int v9fs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} + +static inline int v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void v9fs_readpage_to_fscache(struct inode *inode, + struct page *page) +{} + +static inline void v9fs_uncache_page(struct inode *inode, struct page *page) +{} + +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) +{ + return; +} + 
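+
+/*
+ * Note (illustrative): with CONFIG_9P_FSCACHE disabled the wrappers above
+ * compile to trivial no-ops, so callers in vfs_addr.c can invoke
+ * v9fs_readpage_from_fscache() unconditionally and simply get -ENOBUFS
+ * back, with no #ifdef needed at the call site.
+ */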
+#endif /* CONFIG_9P_FSCACHE */ +#endif /* _9P_CACHE_H */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c new file mode 100644 index 00000000..85b67ffa --- /dev/null +++ b/fs/9p/fid.c @@ -0,0 +1,309 @@ +/* + * V9FS FID Management + * + * Copyright (C) 2007 by Latchesar Ionkov + * Copyright (C) 2005, 2006 by Eric Van Hensbergen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +/** + * v9fs_fid_add - add a fid to a dentry + * @dentry: dentry that the fid is being added to + * @fid: fid to add + * + */ + +int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) +{ + struct v9fs_dentry *dent; + + P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n", + fid->fid, dentry->d_name.name); + + dent = dentry->d_fsdata; + if (!dent) { + dent = kmalloc(sizeof(struct v9fs_dentry), GFP_KERNEL); + if (!dent) + return -ENOMEM; + + spin_lock_init(&dent->lock); + INIT_LIST_HEAD(&dent->fidlist); + dentry->d_fsdata = dent; + } + + spin_lock(&dent->lock); + list_add(&fid->dlist, &dent->fidlist); + spin_unlock(&dent->lock); + + return 0; +} + +/** + * v9fs_fid_find - retrieve a fid that belongs to the specified uid + * @dentry: dentry to look for fid in + * @uid: return fid that belongs to the specified user + * @any: if non-zero, return any fid associated with the dentry + * + */ + +static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) +{ + struct v9fs_dentry *dent; + struct p9_fid *fid, *ret; + + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", + dentry->d_name.name, dentry, uid, any); + dent = (struct v9fs_dentry *) dentry->d_fsdata; + ret = NULL; + if (dent) { + spin_lock(&dent->lock); + list_for_each_entry(fid, &dent->fidlist, dlist) { + if (any || fid->uid == uid) { + ret = fid; + break; + } + } + spin_unlock(&dent->lock); + } + + return ret; +} + +/* + * We need to hold v9ses->rename_sem as long as we hold references + * to returned path array. Array element contain pointers to + * dentry names. 
+ */ +static int build_path_from_dentry(struct v9fs_session_info *v9ses, + struct dentry *dentry, char ***names) +{ + int n = 0, i; + char **wnames; + struct dentry *ds; + + for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) + n++; + + wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); + if (!wnames) + goto err_out; + + for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent) + wnames[i] = (char *)ds->d_name.name; + + *names = wnames; + return n; +err_out: + return -ENOMEM; +} + +static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, + uid_t uid, int any) +{ + struct dentry *ds; + char **wnames, *uname; + int i, n, l, clone, access; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *old_fid = NULL; + + v9ses = v9fs_dentry2v9ses(dentry); + access = v9ses->flags & V9FS_ACCESS_MASK; + fid = v9fs_fid_find(dentry, uid, any); + if (fid) + return fid; + /* + * We don't have a matching fid. To do a TWALK we need the + * parent fid. We need to prevent rename when we want to + * look at the parent. + */ + down_read(&v9ses->rename_sem); + ds = dentry->d_parent; + fid = v9fs_fid_find(ds, uid, any); + if (fid) { + /* Found the parent fid; do a lookup with that */ + fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1); + goto fid_out; + } + up_read(&v9ses->rename_sem); + + /* start from the root and try to do a lookup */ + fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any); + if (!fid) { + /* the user is not attached to the fs yet */ + if (access == V9FS_ACCESS_SINGLE) + return ERR_PTR(-EPERM); + + if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) + uname = NULL; + else + uname = v9ses->uname; + + fid = p9_client_attach(v9ses->clnt, NULL, uname, uid, + v9ses->aname); + if (IS_ERR(fid)) + return fid; + + v9fs_fid_add(dentry->d_sb->s_root, fid); + } + /* If we are the root ourselves, just return that */ + if (dentry->d_sb->s_root == dentry) + return fid; + /* + * Do a multipath walk with the attached root. + * When walking the parent we need to make sure we + * don't have a parallel rename happening. + */ + down_read(&v9ses->rename_sem); + n = build_path_from_dentry(v9ses, dentry, &wnames); + if (n < 0) { + fid = ERR_PTR(n); + goto err_out; + } + clone = 1; + i = 0; + while (i < n) { + l = min(n - i, P9_MAXWELEM); + /* + * We need to hold the rename lock when doing a multipath + * walk to ensure none of the path components change. + */ + fid = p9_client_walk(fid, l, &wnames[i], clone); + if (IS_ERR(fid)) { + if (old_fid) { + /* + * If we fail, clunk the fids that map to intermediate + * path components, not the last component + * of the path. + */ + p9_client_clunk(old_fid); + } + kfree(wnames); + goto err_out; + } + old_fid = fid; + i += l; + clone = 0; + } + kfree(wnames); +fid_out: + if (!IS_ERR(fid)) + v9fs_fid_add(dentry, fid); +err_out: + up_read(&v9ses->rename_sem); + return fid; +} + +/** + * v9fs_fid_lookup - look up a fid, try to walk if not found + * @dentry: dentry to look for fid in + * + * Look for a fid in the specified dentry for the current user. + * If no fid is found, try to create one by walking from a fid of the parent + * dentry (if it has one), or the root dentry. If the user hasn't accessed + * the fs yet, attach now and walk from the root.
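+ *
+ * For illustration (path assumed): with only the root fid attached, a lookup
+ * of /usr/share reduces to a single multi-component walk,
+ *	fid = p9_client_walk(root_fid, 2, (char *[]){"usr", "share"}, 1);
+ * and longer paths are walked in chunks of at most P9_MAXWELEM components.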
+ */ + +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) +{ + uid_t uid; + int any, access; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_dentry2v9ses(dentry); + access = v9ses->flags & V9FS_ACCESS_MASK; + switch (access) { + case V9FS_ACCESS_SINGLE: + case V9FS_ACCESS_USER: + case V9FS_ACCESS_CLIENT: + uid = current_fsuid(); + any = 0; + break; + + case V9FS_ACCESS_ANY: + uid = v9ses->uid; + any = 1; + break; + + default: + uid = ~0; + any = 0; + break; + } + return v9fs_fid_lookup_with_uid(dentry, uid, any); +} + +struct p9_fid *v9fs_fid_clone(struct dentry *dentry) +{ + struct p9_fid *fid, *ret; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return fid; + + ret = p9_client_walk(fid, 0, NULL, 1); + return ret; +} + +static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid) +{ + struct p9_fid *fid, *ret; + + fid = v9fs_fid_lookup_with_uid(dentry, uid, 0); + if (IS_ERR(fid)) + return fid; + + ret = p9_client_walk(fid, 0, NULL, 1); + return ret; +} + +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) +{ + int err; + struct p9_fid *fid; + + fid = v9fs_fid_clone_with_uid(dentry, 0); + if (IS_ERR(fid)) + goto error_out; + /* + * writeback fid will only be used to write back the + * dirty pages. We always request for the open fid in read-write + * mode so that a partial page write which result in page + * read can work. + */ + err = p9_client_open(fid, O_RDWR); + if (err < 0) { + p9_client_clunk(fid); + fid = ERR_PTR(err); + goto error_out; + } +error_out: + return fid; +} diff --git a/fs/9p/fid.h b/fs/9p/fid.h new file mode 100644 index 00000000..bb0b6e7f --- /dev/null +++ b/fs/9p/fid.h @@ -0,0 +1,50 @@ +/* + * V9FS FID Management + * + * Copyright (C) 2005 by Eric Van Hensbergen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ +#ifndef FS_9P_FID_H +#define FS_9P_FID_H +#include + +/** + * struct v9fs_dentry - 9p private data stored in dentry d_fsdata + * @lock: protects the fidlist + * @fidlist: list of FIDs currently associated with this dentry + * + * This structure defines the 9p private data associated with + * a particular dentry. In particular, this private data is used + * to lookup which 9P FID handle should be used for a particular VFS + * operation. FID handles are associated with dentries instead of + * inodes in order to more closely map functionality to the Plan 9 + * expected behavior for FID reclaimation and tracking. 
+ * + * See Also: Mapping FIDs to Linux VFS model in + * Design and Implementation of the Linux 9P File System documentation + */ +struct v9fs_dentry { + spinlock_t lock; /* protect fidlist */ + struct list_head fidlist; +}; + +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); +struct p9_fid *v9fs_fid_clone(struct dentry *dentry); +int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); +#endif diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c new file mode 100644 index 00000000..ef966188 --- /dev/null +++ b/fs/9p/v9fs.c @@ -0,0 +1,624 @@ +/* + * linux/fs/9p/v9fs.c + * + * This file contains functions assisting in mapping VFS to 9P2000 + * + * Copyright (C) 2004-2008 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "cache.h" + +static DEFINE_SPINLOCK(v9fs_sessionlist_lock); +static LIST_HEAD(v9fs_sessionlist); +struct kmem_cache *v9fs_inode_cache; + +/* + * Option Parsing (code inspired by NFS code) + * NOTE: each transport will parse its own options + */ + +enum { + /* Options that take integer arguments */ + Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid, + /* String options */ + Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag, + /* Options that take no arguments */ + Opt_nodevmap, + /* Cache options */ + Opt_cache_loose, Opt_fscache, + /* Access options */ + Opt_access, Opt_posixacl, + /* Error token */ + Opt_err +}; + +static const match_table_t tokens = { + {Opt_debug, "debug=%x"}, + {Opt_dfltuid, "dfltuid=%u"}, + {Opt_dfltgid, "dfltgid=%u"}, + {Opt_afid, "afid=%u"}, + {Opt_uname, "uname=%s"}, + {Opt_remotename, "aname=%s"}, + {Opt_nodevmap, "nodevmap"}, + {Opt_cache, "cache=%s"}, + {Opt_cache_loose, "loose"}, + {Opt_fscache, "fscache"}, + {Opt_cachetag, "cachetag=%s"}, + {Opt_access, "access=%s"}, + {Opt_posixacl, "posixacl"}, + {Opt_err, NULL} +}; + +/* Interpret mount options for cache mode */ +static int get_cache_mode(char *s) +{ + int version = -EINVAL; + + if (!strcmp(s, "loose")) { + version = CACHE_LOOSE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n"); + } else if (!strcmp(s, "fscache")) { + version = CACHE_FSCACHE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n"); + } else if (!strcmp(s, "none")) { + version = CACHE_NONE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n"); + } else + printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s); + return version; +} + +/** + * v9fs_parse_options - parse mount options into session structure + * @v9ses: existing v9fs session information + * + * Return 0 upon success, -ERRNO upon failure. 
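+ *
+ * An illustrative (assumed) option string handled here would be
+ *	"cache=loose,access=user,posixacl,dfltuid=1000,dfltgid=1000"
+ * while transport-level options such as trans= are consumed by the 9p
+ * client rather than by this parser.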
+ */ + +static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) +{ + char *options, *tmp_options; + substring_t args[MAX_OPT_ARGS]; + char *p; + int option = 0; + char *s, *e; + int ret = 0; + + /* setup defaults */ + v9ses->afid = ~0; + v9ses->debug = 0; + v9ses->cache = CACHE_NONE; +#ifdef CONFIG_9P_FSCACHE + v9ses->cachetag = NULL; +#endif + + if (!opts) + return 0; + + tmp_options = kstrdup(opts, GFP_KERNEL); + if (!tmp_options) { + ret = -ENOMEM; + goto fail_option_alloc; + } + options = tmp_options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + token = match_token(p, tokens, args); + if (token < Opt_uname) { + int r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + } + switch (token) { + case Opt_debug: + v9ses->debug = option; +#ifdef CONFIG_NET_9P_DEBUG + p9_debug_level = option; +#endif + break; + + case Opt_dfltuid: + v9ses->dfltuid = option; + break; + case Opt_dfltgid: + v9ses->dfltgid = option; + break; + case Opt_afid: + v9ses->afid = option; + break; + case Opt_uname: + match_strlcpy(v9ses->uname, &args[0], PATH_MAX); + break; + case Opt_remotename: + match_strlcpy(v9ses->aname, &args[0], PATH_MAX); + break; + case Opt_nodevmap: + v9ses->nodev = 1; + break; + case Opt_cache_loose: + v9ses->cache = CACHE_LOOSE; + break; + case Opt_fscache: + v9ses->cache = CACHE_FSCACHE; + break; + case Opt_cachetag: +#ifdef CONFIG_9P_FSCACHE + v9ses->cachetag = match_strdup(&args[0]); +#endif + break; + case Opt_cache: + s = match_strdup(&args[0]); + if (!s) { + ret = -ENOMEM; + P9_DPRINTK(P9_DEBUG_ERROR, + "problem allocating copy of cache arg\n"); + goto free_and_return; + } + ret = get_cache_mode(s); + if (ret == -EINVAL) { + kfree(s); + goto free_and_return; + } + + v9ses->cache = ret; + kfree(s); + break; + + case Opt_access: + s = match_strdup(&args[0]); + if (!s) { + ret = -ENOMEM; + P9_DPRINTK(P9_DEBUG_ERROR, + "problem allocating copy of access arg\n"); + goto free_and_return; + } + + v9ses->flags &= ~V9FS_ACCESS_MASK; + if (strcmp(s, "user") == 0) + v9ses->flags |= V9FS_ACCESS_USER; + else if (strcmp(s, "any") == 0) + v9ses->flags |= V9FS_ACCESS_ANY; + else if (strcmp(s, "client") == 0) { + v9ses->flags |= V9FS_ACCESS_CLIENT; + } else { + v9ses->flags |= V9FS_ACCESS_SINGLE; + v9ses->uid = simple_strtoul(s, &e, 10); + if (*e != '\0') { + ret = -EINVAL; + printk(KERN_INFO "9p: Unknown access " + "argument %s.\n", s); + kfree(s); + goto free_and_return; + } + } + + kfree(s); + break; + + case Opt_posixacl: +#ifdef CONFIG_9P_FS_POSIX_ACL + v9ses->flags |= V9FS_POSIX_ACL; +#else + P9_DPRINTK(P9_DEBUG_ERROR, + "Not defined CONFIG_9P_FS_POSIX_ACL. 
" + "Ignoring posixacl option\n"); +#endif + break; + + default: + continue; + } + } + +free_and_return: + kfree(tmp_options); +fail_option_alloc: + return ret; +} + +/** + * v9fs_session_init - initialize session + * @v9ses: session information structure + * @dev_name: device being mounted + * @data: options + * + */ + +struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, + const char *dev_name, char *data) +{ + int retval = -EINVAL; + struct p9_fid *fid; + int rc; + + v9ses->uname = __getname(); + if (!v9ses->uname) + return ERR_PTR(-ENOMEM); + + v9ses->aname = __getname(); + if (!v9ses->aname) { + __putname(v9ses->uname); + return ERR_PTR(-ENOMEM); + } + init_rwsem(&v9ses->rename_sem); + + rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); + if (rc) { + __putname(v9ses->aname); + __putname(v9ses->uname); + return ERR_PTR(rc); + } + + spin_lock(&v9fs_sessionlist_lock); + list_add(&v9ses->slist, &v9fs_sessionlist); + spin_unlock(&v9fs_sessionlist_lock); + + strcpy(v9ses->uname, V9FS_DEFUSER); + strcpy(v9ses->aname, V9FS_DEFANAME); + v9ses->uid = ~0; + v9ses->dfltuid = V9FS_DEFUID; + v9ses->dfltgid = V9FS_DEFGID; + + v9ses->clnt = p9_client_create(dev_name, data); + if (IS_ERR(v9ses->clnt)) { + retval = PTR_ERR(v9ses->clnt); + v9ses->clnt = NULL; + P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n"); + goto error; + } + + v9ses->flags = V9FS_ACCESS_USER; + + if (p9_is_proto_dotl(v9ses->clnt)) { + v9ses->flags = V9FS_ACCESS_CLIENT; + v9ses->flags |= V9FS_PROTO_2000L; + } else if (p9_is_proto_dotu(v9ses->clnt)) { + v9ses->flags |= V9FS_PROTO_2000U; + } + + rc = v9fs_parse_options(v9ses, data); + if (rc < 0) { + retval = rc; + goto error; + } + + v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; + + if (!v9fs_proto_dotl(v9ses) && + ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACCESS_CLIENT only for dotl. + * Fall back to ACCESS_USER + */ + v9ses->flags &= ~V9FS_ACCESS_MASK; + v9ses->flags |= V9FS_ACCESS_USER; + } + /*FIXME !! */ + /* for legacy mode, fall back to V9FS_ACCESS_ANY */ + if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && + ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { + + v9ses->flags &= ~V9FS_ACCESS_MASK; + v9ses->flags |= V9FS_ACCESS_ANY; + v9ses->uid = ~0; + } + if (!v9fs_proto_dotl(v9ses) || + !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACL checks on clinet only if the protocol is + * 9P2000.L and access is V9FS_ACCESS_CLIENT. 
+ */ + v9ses->flags &= ~V9FS_ACL_MASK; + } + + fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, + v9ses->aname); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n"); + goto error; + } + + if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) + fid->uid = v9ses->uid; + else + fid->uid = ~0; + +#ifdef CONFIG_9P_FSCACHE + /* register the session for caching */ + v9fs_cache_session_get_cookie(v9ses); +#endif + + return fid; + +error: + bdi_destroy(&v9ses->bdi); + return ERR_PTR(retval); +} + +/** + * v9fs_session_close - shutdown a session + * @v9ses: session information structure + * + */ + +void v9fs_session_close(struct v9fs_session_info *v9ses) +{ + if (v9ses->clnt) { + p9_client_destroy(v9ses->clnt); + v9ses->clnt = NULL; + } + +#ifdef CONFIG_9P_FSCACHE + if (v9ses->fscache) { + v9fs_cache_session_put_cookie(v9ses); + kfree(v9ses->cachetag); + } +#endif + __putname(v9ses->uname); + __putname(v9ses->aname); + + bdi_destroy(&v9ses->bdi); + + spin_lock(&v9fs_sessionlist_lock); + list_del(&v9ses->slist); + spin_unlock(&v9fs_sessionlist_lock); +} + +/** + * v9fs_session_cancel - terminate a session + * @v9ses: session to terminate + * + * mark transport as disconnected and cancel all pending requests. + */ + +void v9fs_session_cancel(struct v9fs_session_info *v9ses) { + P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); + p9_client_disconnect(v9ses->clnt); +} + +/** + * v9fs_session_begin_cancel - Begin terminate of a session + * @v9ses: session to terminate + * + * After this call we don't allow any request other than clunk. + */ + +void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) +{ + P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); + p9_client_begin_disconnect(v9ses->clnt); +} + +extern int v9fs_error_init(void); + +static struct kobject *v9fs_kobj; + +#ifdef CONFIG_9P_FSCACHE +/** + * caches_show - list caches associated with a session + * + * Returns the size of buffer written. 
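+ *
+ * With the "9p" kobject registered under fs_kobj in v9fs_sysfs_init(), this
+ * attribute is expected to show up as /sys/fs/9p/caches (path assumed),
+ * listing one cachetag per line for the fscache-enabled sessions.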
+ */ + +static ssize_t caches_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t n = 0, count = 0, limit = PAGE_SIZE; + struct v9fs_session_info *v9ses; + + spin_lock(&v9fs_sessionlist_lock); + list_for_each_entry(v9ses, &v9fs_sessionlist, slist) { + if (v9ses->cachetag) { + n = snprintf(buf, limit, "%s\n", v9ses->cachetag); + if (n < 0) { + count = n; + break; + } + + count += n; + limit -= n; + } + } + + spin_unlock(&v9fs_sessionlist_lock); + return count; +} + +static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches); +#endif /* CONFIG_9P_FSCACHE */ + +static struct attribute *v9fs_attrs[] = { +#ifdef CONFIG_9P_FSCACHE + &v9fs_attr_cache.attr, +#endif + NULL, +}; + +static struct attribute_group v9fs_attr_group = { + .attrs = v9fs_attrs, +}; + +/** + * v9fs_sysfs_init - Initialize the v9fs sysfs interface + * + */ + +static int v9fs_sysfs_init(void) +{ + v9fs_kobj = kobject_create_and_add("9p", fs_kobj); + if (!v9fs_kobj) + return -ENOMEM; + + if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) { + kobject_put(v9fs_kobj); + return -ENOMEM; + } + + return 0; +} + +/** + * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface + * + */ + +static void v9fs_sysfs_cleanup(void) +{ + sysfs_remove_group(v9fs_kobj, &v9fs_attr_group); + kobject_put(v9fs_kobj); +} + +static void v9fs_inode_init_once(void *foo) +{ + struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; +#endif + memset(&v9inode->qid, 0, sizeof(v9inode->qid)); + inode_init_once(&v9inode->vfs_inode); +} + +/** + * v9fs_init_inode_cache - initialize a cache for 9P + * Returns 0 on success. + */ +static int v9fs_init_inode_cache(void) +{ + v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", + sizeof(struct v9fs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + v9fs_inode_init_once); + if (!v9fs_inode_cache) + return -ENOMEM; + + return 0; +} + +/** + * v9fs_destroy_inode_cache - destroy the cache of 9P inode + * + */ +static void v9fs_destroy_inode_cache(void) +{ + kmem_cache_destroy(v9fs_inode_cache); +} + +static int v9fs_cache_register(void) +{ + int ret; + ret = v9fs_init_inode_cache(); + if (ret < 0) + return ret; +#ifdef CONFIG_9P_FSCACHE + return fscache_register_netfs(&v9fs_cache_netfs); +#else + return ret; +#endif +} + +static void v9fs_cache_unregister(void) +{ + v9fs_destroy_inode_cache(); +#ifdef CONFIG_9P_FSCACHE + fscache_unregister_netfs(&v9fs_cache_netfs); +#endif +} + +/** + * init_v9fs - Initialize module + * + */ + +static int __init init_v9fs(void) +{ + int err; + printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); + /* TODO: Setup list of registered trasnport modules */ + err = register_filesystem(&v9fs_fs_type); + if (err < 0) { + printk(KERN_ERR "Failed to register filesystem\n"); + return err; + } + + err = v9fs_cache_register(); + if (err < 0) { + printk(KERN_ERR "Failed to register v9fs for caching\n"); + goto out_fs_unreg; + } + + err = v9fs_sysfs_init(); + if (err < 0) { + printk(KERN_ERR "Failed to register with sysfs\n"); + goto out_sysfs_cleanup; + } + + return 0; + +out_sysfs_cleanup: + v9fs_sysfs_cleanup(); + +out_fs_unreg: + unregister_filesystem(&v9fs_fs_type); + + return err; +} + +/** + * exit_v9fs - shutdown module + * + */ + +static void __exit exit_v9fs(void) +{ + v9fs_sysfs_cleanup(); + v9fs_cache_unregister(); + unregister_filesystem(&v9fs_fs_type); +} + +module_init(init_v9fs) +module_exit(exit_v9fs) + +MODULE_AUTHOR("Latchesar Ionkov "); +MODULE_AUTHOR("Eric Van 
Hensbergen "); +MODULE_AUTHOR("Ron Minnich "); +MODULE_LICENSE("GPL"); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h new file mode 100644 index 00000000..e78956cb --- /dev/null +++ b/fs/9p/v9fs.h @@ -0,0 +1,227 @@ +/* + * V9FS definitions. + * + * Copyright (C) 2004-2008 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ +#ifndef FS_9P_V9FS_H +#define FS_9P_V9FS_H + +#include + +/** + * enum p9_session_flags - option flags for each 9P session + * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions + * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions + * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy + * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) + * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client. + * @V9FS_ACCESS_ANY: use a single attach for all users + * @V9FS_ACCESS_MASK: bit mask of different ACCESS options + * @V9FS_POSIX_ACL: POSIX ACLs are enforced + * + * Session flags reflect options selected by users at mount time + */ +#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \ + V9FS_ACCESS_USER | \ + V9FS_ACCESS_CLIENT) +#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY +#define V9FS_ACL_MASK V9FS_POSIX_ACL + +enum p9_session_flags { + V9FS_PROTO_2000U = 0x01, + V9FS_PROTO_2000L = 0x02, + V9FS_ACCESS_SINGLE = 0x04, + V9FS_ACCESS_USER = 0x08, + V9FS_ACCESS_CLIENT = 0x10, + V9FS_POSIX_ACL = 0x20 +}; + +/* possible values of ->cache */ +/** + * enum p9_cache_modes - user specified cache preferences + * @CACHE_NONE: do not cache data, dentries, or directory contents (default) + * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency + * + * eventually support loose, tight, time, session, default always none + */ + +enum p9_cache_modes { + CACHE_NONE, + CACHE_LOOSE, + CACHE_FSCACHE, +}; + +/** + * struct v9fs_session_info - per-instance session information + * @flags: session options of type &p9_session_flags + * @nodev: set to 1 to disable device mapping + * @debug: debug level + * @afid: authentication handle + * @cache: cache mode of type &p9_cache_modes + * @cachetag: the tag of the cache associated with this session + * @fscache: session cookie associated with FS-Cache + * @options: copy of options string given by user + * @uname: string user name to mount hierarchy as + * @aname: mount specifier for remote hierarchy + * @maxdata: maximum data to be sent/recvd per protocol message + * @dfltuid: default numeric userid to mount hierarchy as + * @dfltgid: default numeric groupid to mount hierarchy as + * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy + * @clnt: reference to 9P network client instantiated for this session + * @slist: reference to list of registered 9p sessions + * + * This structure holds state for each session instance established during + * a sys_mount() . 
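+ *
+ * For illustration (server address and options assumed), a mount such as
+ *	mount -t 9p -o trans=tcp,access=user,cache=loose 192.168.0.2 /mnt/9p
+ * creates exactly one of these structures.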
+ * + * Bugs: there seems to be a lot of state which could be condensed and/or + * removed. + */ + +struct v9fs_session_info { + /* options */ + unsigned char flags; + unsigned char nodev; + unsigned short debug; + unsigned int afid; + unsigned int cache; +#ifdef CONFIG_9P_FSCACHE + char *cachetag; + struct fscache_cookie *fscache; +#endif + + char *uname; /* user name to mount as */ + char *aname; /* name of remote hierarchy being mounted */ + unsigned int maxdata; /* max data for client interface */ + unsigned int dfltuid; /* default uid/muid for legacy support */ + unsigned int dfltgid; /* default gid for legacy support */ + u32 uid; /* if ACCESS_SINGLE, the uid that has access */ + struct p9_client *clnt; /* 9p client */ + struct list_head slist; /* list of sessions registered with v9fs */ + struct backing_dev_info bdi; + struct rw_semaphore rename_sem; +}; + +/* cache_validity flags */ +#define V9FS_INO_INVALID_ATTR 0x01 + +struct v9fs_inode { +#ifdef CONFIG_9P_FSCACHE + spinlock_t fscache_lock; + struct fscache_cookie *fscache; +#endif + struct p9_qid qid; + unsigned int cache_validity; + struct p9_fid *writeback_fid; + struct mutex v_mutex; + struct inode vfs_inode; +}; + +static inline struct v9fs_inode *V9FS_I(const struct inode *inode) +{ + return container_of(inode, struct v9fs_inode, vfs_inode); +} + +struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, + char *); +extern void v9fs_session_close(struct v9fs_session_info *v9ses); +extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); +extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); +extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nameidata); +extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); +extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *p); +extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); +extern const struct inode_operations v9fs_dir_inode_operations_dotl; +extern const struct inode_operations v9fs_file_inode_operations_dotl; +extern const struct inode_operations v9fs_symlink_inode_operations_dotl; +extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); + +/* other default globals */ +#define V9FS_PORT 564 +#define V9FS_DEFUSER "nobody" +#define V9FS_DEFANAME "" +#define V9FS_DEFUID (-2) +#define V9FS_DEFGID (-2) + +static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) +{ + return (inode->i_sb->s_fs_info); +} + +static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) +{ + return dentry->d_sb->s_fs_info; +} + +static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) +{ + return v9ses->flags & V9FS_PROTO_2000U; +} + +static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) +{ + return v9ses->flags & V9FS_PROTO_2000L; +} + +/** + * v9fs_get_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct 
p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 0); +} + +/** + * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 1); +} + +#endif diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h new file mode 100644 index 00000000..f9a28eab --- /dev/null +++ b/fs/9p/v9fs_vfs.h @@ -0,0 +1,87 @@ +/* + * V9FS VFS extensions. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ +#ifndef FS_9P_V9FS_VFS_H +#define FS_9P_V9FS_VFS_H + +/* plan9 semantics are that created files are implicitly opened. + * But linux semantics are that you call create, then open. + * the plan9 approach is superior as it provides an atomic + * open. + * we track the create fid here. When the file is opened, if fidopen is + * non-zero, we use the fid and can skip some steps. + * there may be a better way to do this, but I don't know it. + * one BAD way is to clunk the fid on create, then open it again: + * you lose the atomicity of file open + */ + +/* special case: + * unlink calls remove, which is an implicit clunk. So we have to track + * that kind of thing so that we don't try to clunk a dead fid. 
+ */ +#define P9_LOCK_TIMEOUT (30*HZ) + +extern struct file_system_type v9fs_fs_type; +extern const struct address_space_operations v9fs_addr_operations; +extern const struct file_operations v9fs_file_operations; +extern const struct file_operations v9fs_file_operations_dotl; +extern const struct file_operations v9fs_dir_operations; +extern const struct file_operations v9fs_dir_operations_dotl; +extern const struct dentry_operations v9fs_dentry_operations; +extern const struct dentry_operations v9fs_cached_dentry_operations; +extern const struct file_operations v9fs_cached_file_operations; +extern const struct file_operations v9fs_cached_file_operations_dotl; +extern struct kmem_cache *v9fs_inode_cache; + +struct inode *v9fs_alloc_inode(struct super_block *sb); +void v9fs_destroy_inode(struct inode *inode); +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t); +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, int mode, dev_t); +void v9fs_evict_inode(struct inode *inode); +ino_t v9fs_qid2ino(struct p9_qid *qid); +void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); +int v9fs_dir_release(struct inode *inode, struct file *filp); +int v9fs_file_open(struct inode *inode, struct file *file); +void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); +int v9fs_uflags2omode(int uflags, int extended); + +ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); +ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64); +void v9fs_blank_wstat(struct p9_wstat *wstat); +int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); +int v9fs_file_fsync_dotl(struct file *filp, int datasync); +ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *, + const char __user *, size_t, loff_t *, int); +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode); +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode); +static inline void v9fs_invalidate_inode_attr(struct inode *inode) +{ + struct v9fs_inode *v9inode; + v9inode = V9FS_I(inode); + v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; + return; +} + +int v9fs_open_to_dotl_flags(int flags); +#endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c new file mode 100644 index 00000000..2524e4cb --- /dev/null +++ b/fs/9p/vfs_addr.c @@ -0,0 +1,353 @@ +/* + * linux/fs/9p/vfs_addr.c + * + * This file contians vfs address (mmap) ops for 9P2000. + * + * Copyright (C) 2005 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "cache.h" +#include "fid.h" + +/** + * v9fs_fid_readpage - read an entire page in from 9P + * + * @fid: fid being read + * @page: structure to page + * + */ +static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) +{ + int retval; + loff_t offset; + char *buffer; + struct inode *inode; + + inode = page->mapping->host; + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + BUG_ON(!PageLocked(page)); + + retval = v9fs_readpage_from_fscache(inode, page); + if (retval == 0) + return retval; + + buffer = kmap(page); + offset = page_offset(page); + + retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset); + if (retval < 0) { + v9fs_uncache_page(inode, page); + goto done; + } + + memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval); + flush_dcache_page(page); + SetPageUptodate(page); + + v9fs_readpage_to_fscache(inode, page); + retval = 0; + +done: + kunmap(page); + unlock_page(page); + return retval; +} + +/** + * v9fs_vfs_readpage - read an entire page in from 9P + * + * @filp: file being read + * @page: structure to page + * + */ + +static int v9fs_vfs_readpage(struct file *filp, struct page *page) +{ + return v9fs_fid_readpage(filp->private_data, page); +} + +/** + * v9fs_vfs_readpages - read a set of pages from 9P + * + * @filp: file being read + * @mapping: the address space + * @pages: list of pages to read + * @nr_pages: count of pages to read + * + */ + +static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret = 0; + struct inode *inode; + + inode = mapping->host; + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); + + ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages); + if (ret == 0) + return ret; + + ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); + P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret); + return ret; +} + +/** + * v9fs_release_page - release the private state associated with a page + * + * Returns 1 if the page can be released, false otherwise. 
+ */ + +static int v9fs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + return v9fs_fscache_release_page(page, gfp); +} + +/** + * v9fs_invalidate_page - Invalidate a page completely or partially + * + * @page: structure to page + * @offset: offset in the page + */ + +static void v9fs_invalidate_page(struct page *page, unsigned long offset) +{ + /* + * If called with zero offset, we should release + * the private state assocated with the page + */ + if (offset == 0) + v9fs_fscache_invalidate_page(page); +} + +static int v9fs_vfs_writepage_locked(struct page *page) +{ + char *buffer; + int retval, len; + loff_t offset, size; + mm_segment_t old_fs; + struct v9fs_inode *v9inode; + struct inode *inode = page->mapping->host; + + v9inode = V9FS_I(inode); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + set_page_writeback(page); + + buffer = kmap(page); + offset = page_offset(page); + + old_fs = get_fs(); + set_fs(get_ds()); + /* We should have writeback_fid always set */ + BUG_ON(!v9inode->writeback_fid); + + retval = v9fs_file_write_internal(inode, + v9inode->writeback_fid, + (__force const char __user *)buffer, + len, &offset, 0); + if (retval > 0) + retval = 0; + + set_fs(old_fs); + kunmap(page); + end_page_writeback(page); + return retval; +} + +static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int retval; + + retval = v9fs_vfs_writepage_locked(page); + if (retval < 0) { + if (retval == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + retval = 0; + } else { + SetPageError(page); + mapping_set_error(page->mapping, retval); + } + } else + retval = 0; + + unlock_page(page); + return retval; +} + +/** + * v9fs_launder_page - Writeback a dirty page + * Returns 0 on success. + */ + +static int v9fs_launder_page(struct page *page) +{ + int retval; + struct inode *inode = page->mapping->host; + + v9fs_fscache_wait_on_page_write(inode, page); + if (clear_page_dirty_for_io(page)) { + retval = v9fs_vfs_writepage_locked(page); + if (retval) + return retval; + } + return 0; +} + +/** + * v9fs_direct_IO - 9P address space operation for direct I/O + * @rw: direction (read or write) + * @iocb: target I/O control block + * @iov: array of vectors that define I/O buffer + * @pos: offset in file to begin the operation + * @nr_segs: size of iovec array + * + * The presence of v9fs_direct_IO() in the address space ops vector + * allowes open() O_DIRECT flags which would have failed otherwise. + * + * In the non-cached mode, we shunt off direct read and write requests before + * the VFS gets them, so this method should never be called. + * + * Direct IO is not 'yet' supported in the cached mode. Hence when + * this routine is called through generic_file_aio_read(), the read/write fails + * with an error. 
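+ *
+ * Illustrative effect for applications: open(path, O_RDWR | O_DIRECT)
+ * succeeds, but in cached mode the subsequent read()/write() is routed
+ * through here and fails with -EINVAL.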
+ * + */ +static ssize_t +v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t pos, unsigned long nr_segs) +{ + /* + * FIXME + * Now that we do caching with cache mode enabled, We need + * to support direct IO + */ + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " + "off/no(%lld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long) pos, nr_segs); + + return -EINVAL; +} + +static int v9fs_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int retval = 0; + struct page *page; + struct v9fs_inode *v9inode; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = mapping->host; + + v9inode = V9FS_I(inode); +start: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + retval = -ENOMEM; + goto out; + } + BUG_ON(!v9inode->writeback_fid); + if (PageUptodate(page)) + goto out; + + if (len == PAGE_CACHE_SIZE) + goto out; + + retval = v9fs_fid_readpage(v9inode->writeback_fid, page); + page_cache_release(page); + if (!retval) + goto start; +out: + *pagep = page; + return retval; +} + +static int v9fs_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + loff_t last_pos = pos + copied; + struct inode *inode = page->mapping->host; + + if (unlikely(copied < len)) { + /* + * zero out the rest of the area + */ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + flush_dcache_page(page); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) { + inode_add_bytes(inode, last_pos - inode->i_size); + i_size_write(inode, last_pos); + } + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + + +const struct address_space_operations v9fs_addr_operations = { + .readpage = v9fs_vfs_readpage, + .readpages = v9fs_vfs_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = v9fs_vfs_writepage, + .write_begin = v9fs_write_begin, + .write_end = v9fs_write_end, + .releasepage = v9fs_release_page, + .invalidatepage = v9fs_invalidate_page, + .launder_page = v9fs_launder_page, + .direct_IO = v9fs_direct_IO, +}; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c new file mode 100644 index 00000000..e022890c --- /dev/null +++ b/fs/9p/vfs_dentry.c @@ -0,0 +1,147 @@ +/* + * linux/fs/9p/vfs_dentry.c + * + * This file contians vfs dentry ops for the 9P2000 protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +/** + * v9fs_dentry_delete - called when dentry refcount equals 0 + * @dentry: dentry in question + * + * By returning 1 here we should remove cacheing of unused + * dentry components. + * + */ + +static int v9fs_dentry_delete(const struct dentry *dentry) +{ + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, + dentry); + + return 1; +} + +/** + * v9fs_cached_dentry_delete - called when dentry refcount equals 0 + * @dentry: dentry in question + * + */ +static int v9fs_cached_dentry_delete(const struct dentry *dentry) +{ + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); + + /* Don't cache negative dentries */ + if (!dentry->d_inode) + return 1; + return 0; +} + +/** + * v9fs_dentry_release - called when dentry is going to be freed + * @dentry: dentry that is being release + * + */ + +static void v9fs_dentry_release(struct dentry *dentry) +{ + struct v9fs_dentry *dent; + struct p9_fid *temp, *current_fid; + + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, + dentry); + dent = dentry->d_fsdata; + if (dent) { + list_for_each_entry_safe(current_fid, temp, &dent->fidlist, + dlist) { + p9_client_clunk(current_fid); + } + + kfree(dent); + dentry->d_fsdata = NULL; + } +} + +static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct p9_fid *fid; + struct inode *inode; + struct v9fs_inode *v9inode; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + if (!inode) + goto out_valid; + + v9inode = V9FS_I(inode); + if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { + int retval; + struct v9fs_session_info *v9ses; + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9ses = v9fs_inode2v9ses(inode); + if (v9fs_proto_dotl(v9ses)) + retval = v9fs_refresh_inode_dotl(fid, inode); + else + retval = v9fs_refresh_inode(fid, inode); + if (retval == -ENOENT) + return 0; + if (retval < 0) + return retval; + } +out_valid: + return 1; +} + +const struct dentry_operations v9fs_cached_dentry_operations = { + .d_revalidate = v9fs_lookup_revalidate, + .d_delete = v9fs_cached_dentry_delete, + .d_release = v9fs_dentry_release, +}; + +const struct dentry_operations v9fs_dentry_operations = { + .d_delete = v9fs_dentry_delete, + .d_release = v9fs_dentry_release, +}; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c new file mode 100644 index 00000000..9c2bdda5 --- /dev/null +++ b/fs/9p/vfs_dir.c @@ -0,0 +1,318 @@ +/* + * linux/fs/9p/vfs_dir.c + * + * This file contains vfs directory ops for the 9P2000 protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" + +/** + * struct p9_rdir - readdir accounting + * @mutex: mutex protecting readdir + * @head: start offset of current dirread buffer + * @tail: end offset of current dirread buffer + * @buf: dirread buffer + * + * private structure for keeping track of readdir + * allocated on demand + */ + +struct p9_rdir { + struct mutex mutex; + int head; + int tail; + uint8_t *buf; +}; + +/** + * dt_type - return file type + * @mistat: mistat structure + * + */ + +static inline int dt_type(struct p9_wstat *mistat) +{ + unsigned long perm = mistat->mode; + int rettype = DT_REG; + + if (perm & P9_DMDIR) + rettype = DT_DIR; + if (perm & P9_DMSYMLINK) + rettype = DT_LNK; + + return rettype; +} + +static void p9stat_init(struct p9_wstat *stbuf) +{ + stbuf->name = NULL; + stbuf->uid = NULL; + stbuf->gid = NULL; + stbuf->muid = NULL; + stbuf->extension = NULL; +} + +/** + * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir + * @filp: opened file structure + * @buflen: Length in bytes of buffer to allocate + * + */ + +static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) +{ + struct p9_rdir *rdir; + struct p9_fid *fid; + int err = 0; + + fid = filp->private_data; + if (!fid->rdir) { + rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + + if (rdir == NULL) { + err = -ENOMEM; + goto exit; + } + spin_lock(&filp->f_dentry->d_lock); + if (!fid->rdir) { + rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir); + mutex_init(&rdir->mutex); + rdir->head = rdir->tail = 0; + fid->rdir = (void *) rdir; + rdir = NULL; + } + spin_unlock(&filp->f_dentry->d_lock); + kfree(rdir); + } +exit: + return err; +} + +/** + * v9fs_dir_readdir - read a directory + * @filp: opened file structure + * @dirent: directory structure ??? + * @filldir: function to populate directory structure ??? 
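+ * + * Reads entries from the server into the per-fid rdir buffer and + * passes them to @filldir until the buffer or the directory is + * exhausted.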
+ * + */ + +static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + int over; + struct p9_wstat st; + int err = 0; + struct p9_fid *fid; + int buflen; + int reclen = 0; + struct p9_rdir *rdir; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_IOHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + if (err) + return err; + while (err == 0) { + if (rdir->tail == rdir->head) { + err = v9fs_file_readn(filp, rdir->buf, NULL, + buflen, filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + while (rdir->head < rdir->tail) { + p9stat_init(&st); + err = p9stat_read(rdir->buf + rdir->head, + rdir->tail - rdir->head, &st, + fid->clnt->proto_version); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + err = -EIO; + p9stat_free(&st); + goto unlock_and_exit; + } + reclen = st.size+2; + + over = filldir(dirent, st.name, strlen(st.name), + filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); + + p9stat_free(&st); + + if (over) { + err = 0; + goto unlock_and_exit; + } + rdir->head += reclen; + filp->f_pos += reclen; + } + } + +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: + return err; +} + +/** + * v9fs_dir_readdir_dotl - read a directory + * @filp: opened file structure + * @dirent: buffer to fill dirent structures + * @filldir: function to populate dirent structures + * + */ +static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, + filldir_t filldir) +{ + int over; + int err = 0; + struct p9_fid *fid; + int buflen; + struct p9_rdir *rdir; + struct p9_dirent curdirent; + u64 oldoffset = 0; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_READDIRHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + if (err) + return err; + + while (err == 0) { + if (rdir->tail == rdir->head) { + err = p9_client_readdir(fid, rdir->buf, buflen, + filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + + while (rdir->head < rdir->tail) { + + err = p9dirent_read(rdir->buf + rdir->head, + rdir->tail - rdir->head, + &curdirent, + fid->clnt->proto_version); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + err = -EIO; + goto unlock_and_exit; + } + + /* d_off in dirent structure tracks the offset into + * the next dirent in the dir. However, filldir() + * expects offset into the current dirent. Hence + * while calling filldir send the offset from the + * previous dirent structure. 
+ */ + over = filldir(dirent, curdirent.d_name, + strlen(curdirent.d_name), + oldoffset, v9fs_qid2ino(&curdirent.qid), + curdirent.d_type); + oldoffset = curdirent.d_off; + + if (over) { + err = 0; + goto unlock_and_exit; + } + + filp->f_pos = curdirent.d_off; + rdir->head += err; + } + } + +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: + return err; +} + + +/** + * v9fs_dir_release - close a directory + * @inode: inode of the directory + * @filp: file pointer to a directory + * + */ + +int v9fs_dir_release(struct inode *inode, struct file *filp) +{ + struct p9_fid *fid; + + fid = filp->private_data; + P9_DPRINTK(P9_DEBUG_VFS, + "v9fs_dir_release: inode: %p filp: %p fid: %d\n", + inode, filp, fid ? fid->fid : -1); + if (fid) + p9_client_clunk(fid); + return 0; +} + +const struct file_operations v9fs_dir_operations = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = v9fs_dir_readdir, + .open = v9fs_file_open, + .release = v9fs_dir_release, +}; + +const struct file_operations v9fs_dir_operations_dotl = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = v9fs_dir_readdir_dotl, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c new file mode 100644 index 00000000..9d6e1685 --- /dev/null +++ b/fs/9p/vfs_file.c @@ -0,0 +1,771 @@ +/* + * linux/fs/9p/vfs_file.c + * + * This file contians vfs file ops for 9P2000. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" + +static const struct vm_operations_struct v9fs_file_vm_ops; + +/** + * v9fs_file_open - open a file (or directory) + * @inode: inode to be opened + * @file: file being opened + * + */ + +int v9fs_file_open(struct inode *inode, struct file *file) +{ + int err; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + int omode; + + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); + v9inode = V9FS_I(inode); + v9ses = v9fs_inode2v9ses(inode); + if (v9fs_proto_dotl(v9ses)) + omode = v9fs_open_to_dotl_flags(file->f_flags); + else + omode = v9fs_uflags2omode(file->f_flags, + v9fs_proto_dotu(v9ses)); + fid = file->private_data; + if (!fid) { + fid = v9fs_fid_clone(file->f_path.dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + err = p9_client_open(fid, omode); + if (err < 0) { + p9_client_clunk(fid); + return err; + } + if (file->f_flags & O_TRUNC) { + i_size_write(inode, 0); + inode->i_blocks = 0; + } + if ((file->f_flags & O_APPEND) && + (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) + generic_file_llseek(file, 0, SEEK_END); + } + + file->private_data = fid; + mutex_lock(&v9inode->v_mutex); + if (v9ses->cache && !v9inode->writeback_fid && + ((file->f_flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. 
+ */ + fid = v9fs_writeback_fid(file->f_path.dentry); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + mutex_unlock(&v9inode->v_mutex); + goto out_error; + } + v9inode->writeback_fid = (void *) fid; + } + mutex_unlock(&v9inode->v_mutex); +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(inode, file); +#endif + return 0; +out_error: + p9_client_clunk(file->private_data); + file->private_data = NULL; + return err; +} + +/** + * v9fs_file_lock - lock a file (or directory) + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + * Bugs: this looks like a local only lock, we should extend into 9P + * by using open exclusive + */ + +static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + int res = 0; + struct inode *inode = filp->f_path.dentry->d_inode; + + P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + + return res; +} + +static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct p9_flock flock; + struct p9_fid *fid; + uint8_t status; + int res = 0; + unsigned char fl_type; + + fid = filp->private_data; + BUG_ON(fid == NULL); + + if ((fl->fl_flags & FL_POSIX) != FL_POSIX) + BUG(); + + res = posix_lock_file_wait(filp, fl); + if (res < 0) + goto out; + + /* convert posix lock to p9 tlock args */ + memset(&flock, 0, sizeof(flock)); + /* map the lock type */ + switch (fl->fl_type) { + case F_RDLCK: + flock.type = P9_LOCK_TYPE_RDLCK; + break; + case F_WRLCK: + flock.type = P9_LOCK_TYPE_WRLCK; + break; + case F_UNLCK: + flock.type = P9_LOCK_TYPE_UNLCK; + break; + } + flock.start = fl->fl_start; + if (fl->fl_end == OFFSET_MAX) + flock.length = 0; + else + flock.length = fl->fl_end - fl->fl_start + 1; + flock.proc_id = fl->fl_pid; + flock.client_id = utsname()->nodename; + if (IS_SETLKW(cmd)) + flock.flags = P9_LOCK_FLAGS_BLOCK; + + /* + * if its a blocked request and we get P9_LOCK_BLOCKED as the status + * for lock request, keep on trying + */ + for (;;) { + res = p9_client_lock_dotl(fid, &flock, &status); + if (res < 0) + break; + + if (status != P9_LOCK_BLOCKED) + break; + if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) + break; + schedule_timeout_interruptible(P9_LOCK_TIMEOUT); + } + + /* map 9p status to VFS status */ + switch (status) { + case P9_LOCK_SUCCESS: + res = 0; + break; + case P9_LOCK_BLOCKED: + res = -EAGAIN; + break; + case P9_LOCK_ERROR: + case P9_LOCK_GRACE: + res = -ENOLCK; + break; + default: + BUG(); + } + + /* + * incase server returned error for lock request, revert + * it locally + */ + if (res < 0 && fl->fl_type != F_UNLCK) { + fl_type = fl->fl_type; + fl->fl_type = F_UNLCK; + res = posix_lock_file_wait(filp, fl); + fl->fl_type = fl_type; + } +out: + return res; +} + +static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) +{ + struct p9_getlock glock; + struct p9_fid *fid; + int res = 0; + + fid = filp->private_data; + BUG_ON(fid == NULL); + + posix_test_lock(filp, fl); + /* + * if we have a conflicting lock locally, no need to validate + * with server + */ + if (fl->fl_type != F_UNLCK) + return res; + + /* convert posix lock to p9 tgetlock args */ + memset(&glock, 0, sizeof(glock)); + glock.type = P9_LOCK_TYPE_UNLCK; + glock.start = fl->fl_start; + if (fl->fl_end == 
OFFSET_MAX) + glock.length = 0; + else + glock.length = fl->fl_end - fl->fl_start + 1; + glock.proc_id = fl->fl_pid; + glock.client_id = utsname()->nodename; + + res = p9_client_getlock_dotl(fid, &glock); + if (res < 0) + return res; + /* map 9p lock type to os lock type */ + switch (glock.type) { + case P9_LOCK_TYPE_RDLCK: + fl->fl_type = F_RDLCK; + break; + case P9_LOCK_TYPE_WRLCK: + fl->fl_type = F_WRLCK; + break; + case P9_LOCK_TYPE_UNLCK: + fl->fl_type = F_UNLCK; + break; + } + if (glock.type != P9_LOCK_TYPE_UNLCK) { + fl->fl_start = glock.start; + if (glock.length == 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = glock.start + glock.length - 1; + fl->fl_pid = glock.proc_id; + } + return res; +} + +/** + * v9fs_file_lock_dotl - lock a file (or directory) + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + */ + +static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = -ENOLCK; + + P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, + cmd, fl, filp->f_path.dentry->d_name.name); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + + if (IS_SETLK(cmd) || IS_SETLKW(cmd)) + ret = v9fs_file_do_lock(filp, cmd, fl); + else if (IS_GETLK(cmd)) + ret = v9fs_file_getlock(filp, fl); + else + ret = -EINVAL; +out_err: + return ret; +} + +/** + * v9fs_file_flock_dotl - lock a file + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure + * + */ + +static int v9fs_file_flock_dotl(struct file *filp, int cmd, + struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = -ENOLCK; + + P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, + cmd, fl, filp->f_path.dentry->d_name.name); + + /* No mandatory locks */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if (!(fl->fl_flags & FL_FLOCK)) + goto out_err; + + if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { + filemap_write_and_wait(inode->i_mapping); + invalidate_mapping_pages(&inode->i_data, 0, -1); + } + /* Convert flock to posix lock */ + fl->fl_owner = (fl_owner_t)filp; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + fl->fl_flags |= FL_POSIX; + fl->fl_flags ^= FL_FLOCK; + + if (IS_SETLK(cmd) | IS_SETLKW(cmd)) + ret = v9fs_file_do_lock(filp, cmd, fl); + else + ret = -EINVAL; +out_err: + return ret; +} + +/** + * v9fs_fid_readn - read from a fid + * @fid: fid to read + * @data: data buffer to read data into + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +ssize_t +v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count, + u64 offset) +{ + int n, total, size; + + P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, + (long long unsigned) offset, count); + n = 0; + total = 0; + size = fid->iounit ? 
fid->iounit : fid->clnt->msize - P9_IOHDRSZ; + do { + n = p9_client_read(fid, data, udata, offset, count); + if (n <= 0) + break; + + if (data) + data += n; + if (udata) + udata += n; + + offset += n; + count -= n; + total += n; + } while (count > 0 && n == size); + + if (n < 0) + total = n; + + return total; +} + +/** + * v9fs_file_readn - read from a file + * @filp: file pointer to read + * @data: data buffer to read data into + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +ssize_t +v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, + u64 offset) +{ + return v9fs_fid_readn(filp->private_data, data, udata, count, offset); +} + +/** + * v9fs_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ + +static ssize_t +v9fs_file_read(struct file *filp, char __user *udata, size_t count, + loff_t * offset) +{ + int ret; + struct p9_fid *fid; + size_t size; + + P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); + fid = filp->private_data; + + size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; + if (count > size) + ret = v9fs_file_readn(filp, NULL, udata, count, *offset); + else + ret = p9_client_read(fid, NULL, udata, *offset, count); + + if (ret > 0) + *offset += ret; + + return ret; +} + +ssize_t +v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, + const char __user *data, size_t count, + loff_t *offset, int invalidate) +{ + int n; + loff_t i_size; + size_t total = 0; + struct p9_client *clnt; + loff_t origin = *offset; + unsigned long pg_start, pg_end; + + P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, + (int)count, (int)*offset); + + clnt = fid->clnt; + do { + n = p9_client_write(fid, NULL, data+total, origin+total, count); + if (n <= 0) + break; + count -= n; + total += n; + } while (count > 0); + + if (invalidate && (total > 0)) { + pg_start = origin >> PAGE_CACHE_SHIFT; + pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; + if (inode->i_mapping && inode->i_mapping->nrpages) + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + *offset += total; + i_size = i_size_read(inode); + if (*offset > i_size) { + inode_add_bytes(inode, *offset - i_size); + i_size_write(inode, *offset); + } + } + if (n < 0) + return n; + + return total; +} + +/** + * v9fs_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_file_write(struct file *filp, const char __user * data, + size_t count, loff_t *offset) +{ + ssize_t retval = 0; + loff_t origin = *offset; + + + retval = generic_write_checks(filp, &origin, &count, 0); + if (retval) + goto out; + + retval = -EINVAL; + if ((ssize_t) count < 0) + goto out; + retval = 0; + if (!count) + goto out; + + retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode, + filp->private_data, + data, count, &origin, 1); + /* update offset on successful write */ + if (retval > 0) + *offset = origin; +out: + return retval; +} + + +static int v9fs_file_fsync(struct file *filp, int datasync) +{ + struct p9_fid *fid; + struct p9_wstat wstat; + int retval; + + P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); + + fid = filp->private_data; + v9fs_blank_wstat(&wstat); + + retval = p9_client_wstat(fid, 
&wstat); + return retval; +} + +int v9fs_file_fsync_dotl(struct file *filp, int datasync) +{ + struct p9_fid *fid; + int retval; + + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n", + filp, datasync); + + fid = filp->private_data; + + retval = p9_client_fsync(fid, datasync); + return retval; +} + +static int +v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int retval; + + retval = generic_file_mmap(file, vma); + if (!retval) + vma->vm_ops = &v9fs_file_vm_ops; + + return retval; +} + +static int +v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct v9fs_inode *v9inode; + struct page *page = vmf->page; + struct file *filp = vma->vm_file; + struct inode *inode = filp->f_path.dentry->d_inode; + + + P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n", + page, (unsigned long)filp->private_data); + + v9inode = V9FS_I(inode); + /* make sure the cache has finished storing the page */ + v9fs_fscache_wait_on_page_write(inode, page); + BUG_ON(!v9inode->writeback_fid); + lock_page(page); + if (page->mapping != inode->i_mapping) + goto out_unlock; + + return VM_FAULT_LOCKED; +out_unlock: + unlock_page(page); + return VM_FAULT_NOPAGE; +} + +static ssize_t +v9fs_direct_read(struct file *filp, char __user *udata, size_t count, + loff_t *offsetp) +{ + loff_t size, offset; + struct inode *inode; + struct address_space *mapping; + + offset = *offsetp; + mapping = filp->f_mapping; + inode = mapping->host; + if (!count) + return 0; + size = i_size_read(inode); + if (offset < size) + filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + + return v9fs_file_read(filp, udata, count, offsetp); +} + +/** + * v9fs_cached_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +static ssize_t +v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, + loff_t *offset) +{ + if (filp->f_flags & O_DIRECT) + return v9fs_direct_read(filp, data, count, offset); + return do_sync_read(filp, data, count, offset); +} + +static ssize_t +v9fs_direct_write(struct file *filp, const char __user * data, + size_t count, loff_t *offsetp) +{ + loff_t offset; + ssize_t retval; + struct inode *inode; + struct address_space *mapping; + + offset = *offsetp; + mapping = filp->f_mapping; + inode = mapping->host; + if (!count) + return 0; + + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + if (retval) + goto err_out; + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that if we fail + * here we fall back to buffered write + */ + if (mapping->nrpages) { + pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT; + pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + + retval = invalidate_inode_pages2_range(mapping, + pg_start, pg_end); + /* + * If a page can not be invalidated, fall back + * to buffered write. 
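+ * Only -EBUSY (page still in use) falls back to the buffered + * path; any other error aborts the direct write.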
+ */ + if (retval) { + if (retval == -EBUSY) + goto buff_write; + goto err_out; + } + } + retval = v9fs_file_write(filp, data, count, offsetp); +err_out: + mutex_unlock(&inode->i_mutex); + return retval; + +buff_write: + mutex_unlock(&inode->i_mutex); + return do_sync_write(filp, data, count, offsetp); +} + +/** + * v9fs_cached_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_cached_file_write(struct file *filp, const char __user * data, + size_t count, loff_t *offset) +{ + + if (filp->f_flags & O_DIRECT) + return v9fs_direct_write(filp, data, count, offset); + return do_sync_write(filp, data, count, offset); +} + +static const struct vm_operations_struct v9fs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = v9fs_vm_page_mkwrite, +}; + + +const struct file_operations v9fs_cached_file_operations = { + .llseek = generic_file_llseek, + .read = v9fs_cached_file_read, + .write = v9fs_cached_file_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = v9fs_file_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_cached_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_cached_file_read, + .write = v9fs_cached_file_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = v9fs_file_mmap, + .fsync = v9fs_file_fsync_dotl, +}; + +const struct file_operations v9fs_file_operations = { + .llseek = generic_file_llseek, + .read = v9fs_file_read, + .write = v9fs_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_file_read, + .write = v9fs_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c new file mode 100644 index 00000000..c72e20cd --- /dev/null +++ b/fs/9p/vfs_inode.c @@ -0,0 +1,1487 @@ +/* + * linux/fs/9p/vfs_inode.c + * + * This file contains vfs inode ops for the 9P2000 protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" +#include "xattr.h" +#include "acl.h" + +static const struct inode_operations v9fs_dir_inode_operations; +static const struct inode_operations v9fs_dir_inode_operations_dotu; +static const struct inode_operations v9fs_file_inode_operations; +static const struct inode_operations v9fs_symlink_inode_operations; + +/** + * unixmode2p9mode - convert unix mode bits to plan 9 + * @v9ses: v9fs session information + * @mode: mode to convert + * + */ + +static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) +{ + int res; + res = mode & 0777; + if (S_ISDIR(mode)) + res |= P9_DMDIR; + if (v9fs_proto_dotu(v9ses)) { + if (S_ISLNK(mode)) + res |= P9_DMSYMLINK; + if (v9ses->nodev == 0) { + if (S_ISSOCK(mode)) + res |= P9_DMSOCKET; + if (S_ISFIFO(mode)) + res |= P9_DMNAMEDPIPE; + if (S_ISBLK(mode)) + res |= P9_DMDEVICE; + if (S_ISCHR(mode)) + res |= P9_DMDEVICE; + } + + if ((mode & S_ISUID) == S_ISUID) + res |= P9_DMSETUID; + if ((mode & S_ISGID) == S_ISGID) + res |= P9_DMSETGID; + if ((mode & S_ISVTX) == S_ISVTX) + res |= P9_DMSETVTX; + if ((mode & P9_DMLINK)) + res |= P9_DMLINK; + } + + return res; +} + +/** + * p9mode2unixmode- convert plan9 mode bits to unix mode bits + * @v9ses: v9fs session information + * @stat: p9_wstat from which mode need to be derived + * @rdev: major number, minor number in case of device files. 
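+ * + * Returns the unix mode bits; for device files *rdev is filled in + * from the "b major minor" / "c major minor" extension string.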
+ * + */ +static int p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) +{ + int res; + int mode = stat->mode; + + res = mode & S_IALLUGO; + *rdev = 0; + + if ((mode & P9_DMDIR) == P9_DMDIR) + res |= S_IFDIR; + else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses))) + res |= S_IFLNK; + else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses)) + && (v9ses->nodev == 0)) + res |= S_IFSOCK; + else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses)) + && (v9ses->nodev == 0)) + res |= S_IFIFO; + else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) + && (v9ses->nodev == 0)) { + char type = 0, ext[32]; + int major = -1, minor = -1; + + strncpy(ext, stat->extension, sizeof(ext)); + sscanf(ext, "%c %u %u", &type, &major, &minor); + switch (type) { + case 'c': + res |= S_IFCHR; + break; + case 'b': + res |= S_IFBLK; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, + "Unknown special type %c %s\n", type, + stat->extension); + }; + *rdev = MKDEV(major, minor); + } else + res |= S_IFREG; + + if (v9fs_proto_dotu(v9ses)) { + if ((mode & P9_DMSETUID) == P9_DMSETUID) + res |= S_ISUID; + + if ((mode & P9_DMSETGID) == P9_DMSETGID) + res |= S_ISGID; + + if ((mode & P9_DMSETVTX) == P9_DMSETVTX) + res |= S_ISVTX; + } + return res; +} + +/** + * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits + * @uflags: flags to convert + * @extended: if .u extensions are active + */ + +int v9fs_uflags2omode(int uflags, int extended) +{ + int ret; + + ret = 0; + switch (uflags&3) { + default: + case O_RDONLY: + ret = P9_OREAD; + break; + + case O_WRONLY: + ret = P9_OWRITE; + break; + + case O_RDWR: + ret = P9_ORDWR; + break; + } + + if (uflags & O_TRUNC) + ret |= P9_OTRUNC; + + if (extended) { + if (uflags & O_EXCL) + ret |= P9_OEXCL; + + if (uflags & O_APPEND) + ret |= P9_OAPPEND; + } + + return ret; +} + +/** + * v9fs_blank_wstat - helper function to setup a 9P stat structure + * @wstat: structure to initialize + * + */ + +void +v9fs_blank_wstat(struct p9_wstat *wstat) +{ + wstat->type = ~0; + wstat->dev = ~0; + wstat->qid.type = ~0; + wstat->qid.version = ~0; + *((long long *)&wstat->qid.path) = ~0; + wstat->mode = ~0; + wstat->atime = ~0; + wstat->mtime = ~0; + wstat->length = ~0; + wstat->name = NULL; + wstat->uid = NULL; + wstat->gid = NULL; + wstat->muid = NULL; + wstat->n_uid = ~0; + wstat->n_gid = ~0; + wstat->n_muid = ~0; + wstat->extension = NULL; +} + +/** + * v9fs_alloc_inode - helper function to allocate an inode + * + */ +struct inode *v9fs_alloc_inode(struct super_block *sb) +{ + struct v9fs_inode *v9inode; + v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache, + GFP_KERNEL); + if (!v9inode) + return NULL; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; + spin_lock_init(&v9inode->fscache_lock); +#endif + v9inode->writeback_fid = NULL; + v9inode->cache_validity = 0; + mutex_init(&v9inode->v_mutex); + return &v9inode->vfs_inode; +} + +/** + * v9fs_destroy_inode - destroy an inode + * + */ + +static void v9fs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); +} + +void v9fs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, v9fs_i_callback); +} + +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, int mode, dev_t rdev) +{ + int err = 0; + + inode_init_owner(inode, NULL, mode); + inode->i_blocks = 0; + inode->i_rdev = rdev; + inode->i_atime = inode->i_mtime = 
inode->i_ctime = CURRENT_TIME; + inode->i_mapping->a_ops = &v9fs_addr_operations; + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; + } else if (v9fs_proto_dotu(v9ses)) { + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + } else { + P9_DPRINTK(P9_DEBUG_ERROR, + "special files without extended mode\n"); + err = -EINVAL; + goto error; + } + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + case S_IFREG: + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + if (v9ses->cache) + inode->i_fop = + &v9fs_cached_file_operations_dotl; + else + inode->i_fop = &v9fs_file_operations_dotl; + } else { + inode->i_op = &v9fs_file_inode_operations; + if (v9ses->cache) + inode->i_fop = &v9fs_cached_file_operations; + else + inode->i_fop = &v9fs_file_operations; + } + + break; + case S_IFLNK: + if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " + "legacy protocol.\n"); + err = -EINVAL; + goto error; + } + + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_symlink_inode_operations_dotl; + else + inode->i_op = &v9fs_symlink_inode_operations; + + break; + case S_IFDIR: + inc_nlink(inode); + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotl; + else if (v9fs_proto_dotu(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotu; + else + inode->i_op = &v9fs_dir_inode_operations; + + if (v9fs_proto_dotl(v9ses)) + inode->i_fop = &v9fs_dir_operations_dotl; + else + inode->i_fop = &v9fs_dir_operations; + + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", + mode, mode & S_IFMT); + err = -EINVAL; + goto error; + } +error: + return err; + +} + +/** + * v9fs_get_inode - helper function to setup an inode + * @sb: superblock + * @mode: mode to setup inode with + * + */ + +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) +{ + int err; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + + P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); + + inode = new_inode(sb); + if (!inode) { + P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); + return ERR_PTR(-ENOMEM); + } + err = v9fs_init_inode(v9ses, inode, mode, rdev); + if (err) { + iput(inode); + return ERR_PTR(err); + } + return inode; +} + +/* +static struct v9fs_fid* +v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) +{ + int err; + int nfid; + struct v9fs_fid *ret; + struct v9fs_fcall *fcall; + + nfid = v9fs_get_idpool(&v9ses->fidpool); + if (nfid < 0) { + eprintk(KERN_WARNING, "no free fids available\n"); + return ERR_PTR(-ENOSPC); + } + + err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name, + &fcall); + + if (err < 0) { + if (fcall && fcall->id == RWALK) + goto clunk_fid; + + PRINT_FCALL_ERROR("walk error", fcall); + v9fs_put_idpool(nfid, &v9ses->fidpool); + goto error; + } + + kfree(fcall); + fcall = NULL; + ret = v9fs_fid_create(v9ses, nfid); + if (!ret) { + err = -ENOMEM; + goto clunk_fid; + } + + err = v9fs_fid_insert(ret, dentry); + if (err < 0) { + v9fs_fid_destroy(ret); + goto clunk_fid; + } + + return ret; + +clunk_fid: + v9fs_t_clunk(v9ses, nfid); + +error: + kfree(fcall); + return ERR_PTR(err); +} +*/ + + +/** + * v9fs_clear_inode - release an inode + * @inode: inode to release + * + */ 
+void v9fs_evict_inode(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + + truncate_inode_pages(inode->i_mapping, 0); + end_writeback(inode); + filemap_fdatawrite(inode->i_mapping); + +#ifdef CONFIG_9P_FSCACHE + v9fs_cache_inode_put_cookie(inode); +#endif + /* clunk the fid stashed in writeback_fid */ + if (v9inode->writeback_fid) { + p9_client_clunk(v9inode->writeback_fid); + v9inode->writeback_fid = NULL; + } +} + +static int v9fs_test_inode(struct inode *inode, void *data) +{ + int umode; + dev_t rdev; + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + + umode = p9mode2unixmode(v9ses, st, &rdev); + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +static int v9fs_test_new_inode(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + return 0; +} + +static struct inode *v9fs_qid_iget(struct super_block *sb, + struct p9_qid *qid, + struct p9_wstat *st, + int new) +{ + dev_t rdev; + int retval, umode; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode; + else + test = v9fs_test_inode; + + i_ino = v9fs_qid2ino(qid); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. 
+ */ + inode->i_ino = i_ino; + umode = p9mode2unixmode(v9ses, st, &rdev); + retval = v9fs_init_inode(v9ses, inode, umode, rdev); + if (retval) + goto error; + + v9fs_stat2inode(st, inode, sb); +#ifdef CONFIG_9P_FSCACHE + v9fs_cache_inode_get_cookie(inode); +#endif + unlock_new_inode(inode); + return inode; +error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_wstat *st; + struct inode *inode = NULL; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget(sb, &st->qid, st, new); + p9stat_free(st); + kfree(st); + return inode; +} + +/** + * v9fs_remove - helper function to remove files and directories + * @dir: directory inode that is being deleted + * @file: dentry that is being deleted + * @rmdir: removing a directory + * + */ + +static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) +{ + int retval; + struct p9_fid *v9fid; + struct inode *file_inode; + + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, + rmdir); + + file_inode = file->d_inode; + v9fid = v9fs_fid_clone(file); + if (IS_ERR(v9fid)) + return PTR_ERR(v9fid); + + retval = p9_client_remove(v9fid); + if (!retval) { + /* + * directories on unlink should have zero + * link count + */ + if (rmdir) { + clear_nlink(file_inode); + drop_nlink(dir); + } else + drop_nlink(file_inode); + + v9fs_invalidate_inode_attr(file_inode); + v9fs_invalidate_inode_attr(dir); + } + return retval; +} + +/** + * v9fs_create - Create a file + * @v9ses: session information + * @dir: directory that dentry is being created in + * @dentry: dentry that is being created + * @extension: 9p2000.u extension string to support devices, etc. 
+ * @perm: create permissions + * @mode: open mode + * + */ +static struct p9_fid * +v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, + struct dentry *dentry, char *extension, u32 perm, u8 mode) +{ + int err; + char *name; + struct p9_fid *dfid, *ofid, *fid; + struct inode *inode; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + + err = 0; + ofid = NULL; + fid = NULL; + name = (char *) dentry->d_name.name; + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return ERR_PTR(err); + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return ERR_PTR(err); + } + + err = p9_client_fcreate(ofid, name, perm, mode, extension); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); + goto error; + } + + /* now walk from the parent so we can get unopened fid */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to the dentry */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + return ofid; +error: + if (ofid) + p9_client_clunk(ofid); + + if (fid) + p9_client_clunk(fid); + + return ERR_PTR(err); +} + +/** + * v9fs_vfs_create - VFS hook to create files + * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ + +static int +v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int err; + u32 perm; + int flags; + struct file *filp; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *inode_fid; + + err = 0; + fid = NULL; + v9ses = v9fs_inode2v9ses(dir); + perm = unixmode2p9mode(v9ses, mode); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else + flags = O_RDWR; + + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, + v9fs_uflags2omode(flags, + v9fs_proto_dotu(v9ses))); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + goto error; + } + + v9fs_invalidate_inode_attr(dir); + /* if we are opening a file, assign the open fid to the file */ + if (nd && nd->flags & LOOKUP_OPEN) { + v9inode = V9FS_I(dentry->d_inode); + mutex_lock(&v9inode->v_mutex); + if (v9ses->cache && !v9inode->writeback_fid && + ((flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. 
+ */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + mutex_unlock(&v9inode->v_mutex); + goto error; + } + v9inode->writeback_fid = (void *) inode_fid; + } + mutex_unlock(&v9inode->v_mutex); + filp = lookup_instantiate_filp(nd, dentry, generic_file_open); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto error; + } + + filp->private_data = fid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(dentry->d_inode, filp); +#endif + } else + p9_client_clunk(fid); + + return 0; + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_mkdir - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int err; + u32 perm; + struct p9_fid *fid; + struct v9fs_session_info *v9ses; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + perm = unixmode2p9mode(v9ses, mode | S_IFDIR); + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_OREAD); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + } else { + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); + } + + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode + * @dir: inode that is being walked from + * @dentry: dentry that is being walked to? + * @nameidata: path data + * + */ + +struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nameidata) +{ + struct dentry *res; + struct super_block *sb; + struct v9fs_session_info *v9ses; + struct p9_fid *dfid, *fid; + struct inode *inode; + char *name; + int result = 0; + + P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", + dir, dentry->d_name.name, dentry, nameidata); + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + sb = dir->i_sb; + v9ses = v9fs_inode2v9ses(dir); + /* We can walk d_parent because we hold the dir->i_mutex */ + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) + return ERR_CAST(dfid); + + name = (char *) dentry->d_name.name; + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + result = PTR_ERR(fid); + if (result == -ENOENT) { + inode = NULL; + goto inst_out; + } + + return ERR_PTR(result); + } + /* + * Make sure we don't use a wrong inode due to parallel + * unlink. For cached mode create calls request for new + * inode. But with cache disabled, lookup should do this. + */ + if (v9ses->cache) + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + else + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + result = PTR_ERR(inode); + inode = NULL; + goto error; + } + result = v9fs_fid_add(dentry, fid); + if (result < 0) + goto error_iput; +inst_out: + /* + * If we had a rename on the server and a parallel lookup + * for the new name, then make sure we instantiate with + * the new name. ie look up for a/b, while on server somebody + * moved b under k and client parallely did a lookup for + * k/b. 
+ */ + res = d_materialise_unique(dentry, inode); + if (!IS_ERR(res)) + return res; + result = PTR_ERR(res); +error_iput: + iput(inode); +error: + p9_client_clunk(fid); + + return ERR_PTR(result); +} + +/** + * v9fs_vfs_unlink - VFS unlink hook to delete an inode + * @i: inode that is being unlinked + * @d: dentry that is being unlinked + * + */ + +int v9fs_vfs_unlink(struct inode *i, struct dentry *d) +{ + return v9fs_remove(i, d, 0); +} + +/** + * v9fs_vfs_rmdir - VFS unlink hook to delete a directory + * @i: inode that is being unlinked + * @d: dentry that is being unlinked + * + */ + +int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) +{ + return v9fs_remove(i, d, 1); +} + +/** + * v9fs_vfs_rename - VFS hook to rename an inode + * @old_dir: old dir inode + * @old_dentry: old dentry + * @new_dir: new dir inode + * @new_dentry: new dentry + * + */ + +int +v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int retval; + struct inode *old_inode; + struct inode *new_inode; + struct v9fs_session_info *v9ses; + struct p9_fid *oldfid; + struct p9_fid *olddirfid; + struct p9_fid *newdirfid; + struct p9_wstat wstat; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + retval = 0; + old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; + v9ses = v9fs_inode2v9ses(old_inode); + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + olddirfid = v9fs_fid_clone(old_dentry->d_parent); + if (IS_ERR(olddirfid)) { + retval = PTR_ERR(olddirfid); + goto done; + } + + newdirfid = v9fs_fid_clone(new_dentry->d_parent); + if (IS_ERR(newdirfid)) { + retval = PTR_ERR(newdirfid); + goto clunk_olddir; + } + + down_write(&v9ses->rename_sem); + if (v9fs_proto_dotl(v9ses)) { + retval = p9_client_rename(oldfid, newdirfid, + (char *) new_dentry->d_name.name); + if (retval != -ENOSYS) + goto clunk_newdir; + } + if (old_dentry->d_parent != new_dentry->d_parent) { + /* + * 9P .u can only handle file rename in the same directory + */ + + P9_DPRINTK(P9_DEBUG_ERROR, + "old dir and new dir are different\n"); + retval = -EXDEV; + goto clunk_newdir; + } + v9fs_blank_wstat(&wstat); + wstat.muid = v9ses->uname; + wstat.name = (char *) new_dentry->d_name.name; + retval = p9_client_wstat(oldfid, &wstat); + +clunk_newdir: + if (!retval) { + if (new_inode) { + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else + drop_nlink(new_inode); + /* + * Work around vfs rename rehash bug with + * FS_RENAME_DOES_D_MOVE + */ + v9fs_invalidate_inode_attr(new_inode); + } + if (S_ISDIR(old_inode->i_mode)) { + if (!new_inode) + inc_nlink(new_dir); + drop_nlink(old_dir); + } + v9fs_invalidate_inode_attr(old_inode); + v9fs_invalidate_inode_attr(old_dir); + v9fs_invalidate_inode_attr(new_dir); + + /* successful rename */ + d_move(old_dentry, new_dentry); + } + up_write(&v9ses->rename_sem); + p9_client_clunk(newdirfid); + +clunk_olddir: + p9_client_clunk(olddirfid); + +done: + return retval; +} + +/** + * v9fs_vfs_getattr - retrieve file metadata + * @mnt: mount information + * @dentry: file to get attributes on + * @stat: metadata structure to populate + * + */ + +static int +v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + 
generic_fillattr(dentry->d_inode, stat); + return 0; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); + generic_fillattr(dentry->d_inode, stat); + + p9stat_free(st); + kfree(st); + return 0; +} + +/** + * v9fs_vfs_setattr - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat wstat; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + retval = inode_change_ok(dentry->d_inode, iattr); + if (retval) + return retval; + + retval = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); + if(IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_blank_wstat(&wstat); + if (iattr->ia_valid & ATTR_MODE) + wstat.mode = unixmode2p9mode(v9ses, iattr->ia_mode); + + if (iattr->ia_valid & ATTR_MTIME) + wstat.mtime = iattr->ia_mtime.tv_sec; + + if (iattr->ia_valid & ATTR_ATIME) + wstat.atime = iattr->ia_atime.tv_sec; + + if (iattr->ia_valid & ATTR_SIZE) + wstat.length = iattr->ia_size; + + if (v9fs_proto_dotu(v9ses)) { + if (iattr->ia_valid & ATTR_UID) + wstat.n_uid = iattr->ia_uid; + + if (iattr->ia_valid & ATTR_GID) + wstat.n_gid = iattr->ia_gid; + } + + /* Write all dirty data */ + if (S_ISREG(dentry->d_inode->i_mode)) + filemap_write_and_wait(dentry->d_inode->i_mapping); + + retval = p9_client_wstat(fid, &wstat); + if (retval < 0) + return retval; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(dentry->d_inode)) + truncate_setsize(dentry->d_inode, iattr->ia_size); + + v9fs_invalidate_inode_attr(dentry->d_inode); + + setattr_copy(dentry->d_inode, iattr); + mark_inode_dirty(dentry->d_inode); + return 0; +} + +/** + * v9fs_stat2inode - populate an inode structure with mistat info + * @stat: Plan 9 metadata (mistat) structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void +v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, + struct super_block *sb) +{ + mode_t mode; + char ext[32]; + char tag_name[14]; + unsigned int i_nlink; + struct v9fs_session_info *v9ses = sb->s_fs_info; + struct v9fs_inode *v9inode = V9FS_I(inode); + + inode->i_nlink = 1; + + inode->i_atime.tv_sec = stat->atime; + inode->i_mtime.tv_sec = stat->mtime; + inode->i_ctime.tv_sec = stat->mtime; + + inode->i_uid = v9ses->dfltuid; + inode->i_gid = v9ses->dfltgid; + + if (v9fs_proto_dotu(v9ses)) { + inode->i_uid = stat->n_uid; + inode->i_gid = stat->n_gid; + } + if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) { + if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) { + /* + * Hadlink support got added later to + * to the .u extension. So there can be + * server out there that doesn't support + * this even with .u extension. So check + * for non NULL stat->extension + */ + strncpy(ext, stat->extension, sizeof(ext)); + /* HARDLINKCOUNT %u */ + sscanf(ext, "%13s %u", tag_name, &i_nlink); + if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) + inode->i_nlink = i_nlink; + } + } + mode = stat->mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + i_size_write(inode, stat->length); + + /* not real number of blocks, but 512 byte ones ... 
*/ + inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; +} + +/** + * v9fs_qid2ino - convert qid into inode number + * @qid: qid to hash + * + * BUG: potential for inode number collisions? + */ + +ino_t v9fs_qid2ino(struct p9_qid *qid) +{ + u64 path = qid->path + 2; + ino_t i = 0; + + if (sizeof(ino_t) == sizeof(path)) + memcpy(&i, &path, sizeof(ino_t)); + else + i = (ino_t) (path ^ (path >> 32)); + + return i; +} + +/** + * v9fs_readlink - read a symlink's location (internal version) + * @dentry: dentry for symlink + * @buffer: buffer to load symlink location into + * @buflen: length of buffer + * + */ + +static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + int retval; + + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_wstat *st; + + P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); + retval = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + if (!v9fs_proto_dotu(v9ses)) + return -EBADF; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + + if (!(st->mode & P9_DMSYMLINK)) { + retval = -EINVAL; + goto done; + } + + /* copy extension buffer into buffer */ + strncpy(buffer, st->extension, buflen); + + P9_DPRINTK(P9_DEBUG_VFS, + "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); + + retval = strnlen(buffer, buflen); +done: + p9stat_free(st); + kfree(st); + return retval; +} + +/** + * v9fs_vfs_follow_link - follow a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * + */ + +static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int len = 0; + char *link = __getname(); + + P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); + + if (!link) + link = ERR_PTR(-ENOMEM); + else { + len = v9fs_readlink(dentry, link, PATH_MAX); + + if (len < 0) { + __putname(link); + link = ERR_PTR(len); + } else + link[min(len, PATH_MAX-1)] = 0; + } + nd_set_link(nd, link); + + return NULL; +} + +/** + * v9fs_vfs_put_link - release a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * @p: unused + * + */ + +void +v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + char *s = nd_get_link(nd); + + P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, + IS_ERR(s) ? 
"" : s); + if (!IS_ERR(s)) + __putname(s); +} + +/** + * v9fs_vfs_mkspecial - create a special file + * @dir: inode to create special file in + * @dentry: dentry to create + * @mode: mode to create special file + * @extension: 9p2000.u format extension string representing special file + * + */ + +static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, + int mode, const char *extension) +{ + u32 perm; + struct p9_fid *fid; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(dir); + if (!v9fs_proto_dotu(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); + return -EPERM; + } + + perm = unixmode2p9mode(v9ses, mode); + fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm, + P9_OREAD); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_invalidate_inode_attr(dir); + p9_client_clunk(fid); + return 0; +} + +/** + * v9fs_vfs_symlink - helper function to create symlinks + * @dir: directory inode containing symlink + * @dentry: dentry for symlink + * @symname: symlink data + * + * See Also: 9P2000.u RFC for more information + * + */ + +static int +v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, + dentry->d_name.name, symname); + + return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); +} + +/** + * v9fs_vfs_link - create a hardlink + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + char *name; + struct p9_fid *oldfid; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, + old_dentry->d_name.name); + + oldfid = v9fs_fid_clone(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = __getname(); + if (unlikely(!name)) { + retval = -ENOMEM; + goto clunk_fid; + } + + sprintf(name, "%d\n", oldfid->fid); + retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); + __putname(name); + if (!retval) { + v9fs_refresh_inode(oldfid, old_dentry->d_inode); + v9fs_invalidate_inode_attr(dir); + } +clunk_fid: + p9_client_clunk(oldfid); + return retval; +} + +/** + * v9fs_vfs_mknod - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ + +static int +v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + int retval; + char *name; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + name = __getname(); + if (!name) + return -ENOMEM; + /* build extension */ + if (S_ISBLK(mode)) + sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev)); + else if (S_ISCHR(mode)) + sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); + else if (S_ISFIFO(mode)) + *name = 0; + else if (S_ISSOCK(mode)) + *name = 0; + else { + __putname(name); + return -EINVAL; + } + + retval = v9fs_vfs_mkspecial(dir, dentry, mode, name); + __putname(name); + + return retval; +} + +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) +{ + int umode; + dev_t rdev; + loff_t i_size; + struct p9_wstat *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + 
umode = p9mode2unixmode(v9ses, st, &rdev); + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + goto out; + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode(st, inode, inode->i_sb); + if (v9ses->cache) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); +out: + p9stat_free(st); + kfree(st); + return 0; +} + +static const struct inode_operations v9fs_dir_inode_operations_dotu = { + .create = v9fs_vfs_create, + .lookup = v9fs_vfs_lookup, + .symlink = v9fs_vfs_symlink, + .link = v9fs_vfs_link, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_dir_inode_operations = { + .create = v9fs_vfs_create, + .lookup = v9fs_vfs_lookup, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_file_inode_operations = { + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c new file mode 100644 index 00000000..c873172a --- /dev/null +++ b/fs/9p/vfs_inode_dotl.c @@ -0,0 +1,1003 @@ +/* + * linux/fs/9p/vfs_inode_dotl.c + * + * This file contains vfs inode ops for the 9P2000.L protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" +#include "xattr.h" +#include "acl.h" + +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, + dev_t rdev); + +/** + * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * new file system object. This checks the S_ISGID to determine the owning + * group of the new file system object. + */ + +static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +{ + BUG_ON(dir_inode == NULL); + + if (dir_inode->i_mode & S_ISGID) { + /* set_gid bit is set.*/ + return dir_inode->i_gid; + } + return current_fsgid(); +} + +/** + * v9fs_dentry_from_dir_inode - helper function to get the dentry from + * dir inode. 
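+ * @inode: directory inode; a directory is expected to carry exactly one
+ * dentry alias, and that alias is the dentry returned here.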
+ * + */ + +static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + /* Directory should have only one entry. */ + BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); + dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); + spin_unlock(&inode->i_lock); + return dentry; +} + +static int v9fs_test_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + return 0; + + if (inode->i_generation != st->st_gen) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +/* Always get a new inode */ +static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + inode->i_generation = st->st_gen; + return 0; +} + +static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, + struct p9_qid *qid, + struct p9_fid *fid, + struct p9_stat_dotl *st, + int new) +{ + int retval; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode_dotl; + else + test = v9fs_test_inode_dotl; + + i_ino = v9fs_qid2ino(qid); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. 
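+ * (v9fs_test_inode_dotl above refuses to match an existing inode whose
+ * file type, qid or i_generation differ, so a qid that the server has
+ * recycled should come through here as a freshly allocated inode.)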
+ */ + inode->i_ino = i_ino; + retval = v9fs_init_inode(v9ses, inode, + st->st_mode, new_decode_dev(st->st_rdev)); + if (retval) + goto error; + + v9fs_stat2inode_dotl(st, inode); +#ifdef CONFIG_9P_FSCACHE + v9fs_cache_inode_get_cookie(inode); +#endif + retval = v9fs_get_acl(inode, fid); + if (retval) + goto error; + + unlock_new_inode(inode); + return inode; +error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_stat_dotl *st; + struct inode *inode = NULL; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); + kfree(st); + return inode; +} + +struct dotl_openflag_map { + int open_flag; + int dotl_flag; +}; + +static int v9fs_mapped_dotl_flags(int flags) +{ + int i; + int rflags = 0; + struct dotl_openflag_map dotl_oflag_map[] = { + { O_CREAT, P9_DOTL_CREATE }, + { O_EXCL, P9_DOTL_EXCL }, + { O_NOCTTY, P9_DOTL_NOCTTY }, + { O_TRUNC, P9_DOTL_TRUNC }, + { O_APPEND, P9_DOTL_APPEND }, + { O_NONBLOCK, P9_DOTL_NONBLOCK }, + { O_DSYNC, P9_DOTL_DSYNC }, + { FASYNC, P9_DOTL_FASYNC }, + { O_DIRECT, P9_DOTL_DIRECT }, + { O_LARGEFILE, P9_DOTL_LARGEFILE }, + { O_DIRECTORY, P9_DOTL_DIRECTORY }, + { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, + { O_NOATIME, P9_DOTL_NOATIME }, + { O_CLOEXEC, P9_DOTL_CLOEXEC }, + { O_SYNC, P9_DOTL_SYNC}, + }; + for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { + if (flags & dotl_oflag_map[i].open_flag) + rflags |= dotl_oflag_map[i].dotl_flag; + } + return rflags; +} + +/** + * v9fs_open_to_dotl_flags- convert Linux specific open flags to + * plan 9 open flag. + * @flags: flags to convert + */ +int v9fs_open_to_dotl_flags(int flags) +{ + int rflags = 0; + + /* + * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY + * and P9_DOTL_NOACCESS + */ + rflags |= flags & O_ACCMODE; + rflags |= v9fs_mapped_dotl_flags(flags); + + return rflags; +} + +/** + * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. + * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ + +static int +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, + struct nameidata *nd) +{ + int err = 0; + gid_t gid; + int flags; + mode_t mode; + char *name = NULL; + struct file *filp; + struct p9_qid qid; + struct inode *inode; + struct p9_fid *fid = NULL; + struct v9fs_inode *v9inode; + struct p9_fid *dfid, *ofid, *inode_fid; + struct v9fs_session_info *v9ses; + struct posix_acl *pacl = NULL, *dacl = NULL; + + v9ses = v9fs_inode2v9ses(dir); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else { + /* + * create call without LOOKUP_OPEN is due + * to mknod of regular files. So use mknod + * operation. 
+ */ + return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); + } + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " + "mode:0x%x\n", name, flags, omode); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in creat %d\n", err); + goto error; + } + err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), + mode, gid, &qid); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_open_dotl failed in creat %d\n", + err); + goto error; + } + v9fs_invalidate_inode_attr(dir); + + /* instantiate inode and assign the unopened fid to the dentry */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, &dacl, &pacl); + + v9inode = V9FS_I(inode); + mutex_lock(&v9inode->v_mutex); + if (v9ses->cache && !v9inode->writeback_fid && + ((flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. 
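+ * The fid is stashed on the v9fs inode rather than on the struct file,
+ * so dirty pages can still be flushed to the server after the last open
+ * file for the path has been closed.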
+ */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + mutex_unlock(&v9inode->v_mutex); + goto err_clunk_old_fid; + } + v9inode->writeback_fid = (void *) inode_fid; + } + mutex_unlock(&v9inode->v_mutex); + /* Since we are opening a file, assign the open fid to the file */ + filp = lookup_instantiate_filp(nd, dentry, generic_file_open); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto err_clunk_old_fid; + } + filp->private_data = ofid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(inode, filp); +#endif + return 0; + +error: + if (fid) + p9_client_clunk(fid); +err_clunk_old_fid: + if (ofid) + p9_client_clunk(ofid); + v9fs_set_create_acl(NULL, &dacl, &pacl); + return err; +} + +/** + * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir_dotl(struct inode *dir, + struct dentry *dentry, int omode) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + gid_t gid; + char *name; + mode_t mode; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + + omode |= S_IFDIR; + if (dir->i_mode & S_ISGID) + omode |= S_ISGID; + + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mkdir %d\n", err); + goto error; + } + name = (char *) dentry->d_name.name; + err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate + * inode with stat. 
We need to get an inode + * so that we can set the acl with dentry + */ + inode = v9fs_get_inode(dir->i_sb, mode, 0); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, &dacl, &pacl); + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); +error: + if (fid) + p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); + return err; +} + +static int +v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat_dotl *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + generic_fillattr(dentry->d_inode, stat); + return 0; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Ask for all the fields in stat structure. Server will return + * whatever it supports + */ + + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, dentry->d_inode); + generic_fillattr(dentry->d_inode, stat); + /* Change block size to what the server returned */ + stat->blksize = st->st_blksize; + + kfree(st); + return 0; +} + +/** + * v9fs_vfs_setattr_dotl - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + retval = inode_change_ok(dentry->d_inode, iattr); + if (retval) + return retval; + + p9attr.valid = iattr->ia_valid; + p9attr.mode = iattr->ia_mode; + p9attr.uid = iattr->ia_uid; + p9attr.gid = iattr->ia_gid; + p9attr.size = iattr->ia_size; + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + + retval = -EPERM; + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Write all dirty data */ + if (S_ISREG(dentry->d_inode->i_mode)) + filemap_write_and_wait(dentry->d_inode->i_mapping); + + retval = p9_client_setattr(fid, &p9attr); + if (retval < 0) + return retval; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(dentry->d_inode)) + truncate_setsize(dentry->d_inode, iattr->ia_size); + + v9fs_invalidate_inode_attr(dentry->d_inode); + setattr_copy(dentry->d_inode, iattr); + mark_inode_dirty(dentry->d_inode); + if (iattr->ia_valid & ATTR_MODE) { + /* We also want to update ACL when we update mode bits */ + retval = v9fs_acl_chmod(dentry); + if (retval < 0) + return retval; + } + return 0; +} + +/** + * v9fs_stat2inode_dotl - populate an inode structure with stat info + * @stat: stat structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +{ + mode_t mode; + struct v9fs_inode *v9inode = V9FS_I(inode); + + if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + 
inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode->i_uid = stat->st_uid; + inode->i_gid = stat->st_gid; + inode->i_nlink = stat->st_nlink; + + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + + i_size_write(inode, stat->st_size); + inode->i_blocks = stat->st_blocks; + } else { + if (stat->st_result_mask & P9_STATS_ATIME) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + } + if (stat->st_result_mask & P9_STATS_MTIME) { + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + } + if (stat->st_result_mask & P9_STATS_CTIME) { + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + } + if (stat->st_result_mask & P9_STATS_UID) + inode->i_uid = stat->st_uid; + if (stat->st_result_mask & P9_STATS_GID) + inode->i_gid = stat->st_gid; + if (stat->st_result_mask & P9_STATS_NLINK) + inode->i_nlink = stat->st_nlink; + if (stat->st_result_mask & P9_STATS_MODE) { + inode->i_mode = stat->st_mode; + if ((S_ISBLK(inode->i_mode)) || + (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } + if (stat->st_result_mask & P9_STATS_RDEV) + inode->i_rdev = new_decode_dev(stat->st_rdev); + if (stat->st_result_mask & P9_STATS_SIZE) + i_size_write(inode, stat->st_size); + if (stat->st_result_mask & P9_STATS_BLOCKS) + inode->i_blocks = stat->st_blocks; + } + if (stat->st_result_mask & P9_STATS_GEN) + inode->i_generation = stat->st_gen; + + /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION + * because the inode structure does not have fields for them. + */ + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; +} + +static int +v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + int err; + gid_t gid; + char *name; + struct p9_qid qid; + struct inode *inode; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct v9fs_session_info *v9ses; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", + dir->i_ino, name, symname); + v9ses = v9fs_inode2v9ses(dir); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ + err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + goto error; + } + + v9fs_invalidate_inode_attr(dir); + if (v9ses->cache) { + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + fid = NULL; + } else { + /* Not in cached mode. 
No need to populate inode with stat */ + inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_link_dotl - create a hardlink for dotl + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int err; + char *name; + struct dentry *dir_dentry; + struct p9_fid *dfid, *oldfid; + struct v9fs_session_info *v9ses; + + P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, + dentry->d_name.name); + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) + return PTR_ERR(dfid); + + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = (char *) dentry->d_name.name; + + err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + return err; + } + + v9fs_invalidate_inode_attr(dir); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + /* Get the latest stat info from server. */ + struct p9_fid *fid; + fid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9fs_refresh_inode_dotl(fid, old_dentry->d_inode); + } + ihold(old_dentry->d_inode); + d_instantiate(dentry, old_dentry->d_inode); + + return err; +} + +/** + * v9fs_vfs_mknod_dotl - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, + dev_t rdev) +{ + int err; + gid_t gid; + char *name; + mode_t mode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mknod %d\n", err); + goto error; + } + name = (char *) dentry->d_name.name; + + err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); + if (err < 0) + goto error; + + v9fs_invalidate_inode_attr(dir); + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + 
P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate inode with stat. + * socket syscall returns a fd, so we need instantiate + */ + inode = v9fs_get_inode(dir->i_sb, mode, rdev); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, &dacl, &pacl); +error: + if (fid) + p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); + return err; +} + +/** + * v9fs_vfs_follow_link_dotl - follow a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * + */ + +static void * +v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) +{ + int retval; + struct p9_fid *fid; + char *link = __getname(); + char *target; + + P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + + if (!link) { + link = ERR_PTR(-ENOMEM); + goto ndset; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + __putname(link); + link = ERR_CAST(fid); + goto ndset; + } + retval = p9_client_readlink(fid, &target); + if (!retval) { + strcpy(link, target); + kfree(target); + goto ndset; + } + __putname(link); + link = ERR_PTR(retval); +ndset: + nd_set_link(nd, link); + return NULL; +} + +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) +{ + loff_t i_size; + struct p9_stat_dotl *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + goto out; + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode_dotl(st, inode); + if (v9ses->cache) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); +out: + kfree(st); + return 0; +} + +const struct inode_operations v9fs_dir_inode_operations_dotl = { + .create = v9fs_vfs_create_dotl, + .lookup = v9fs_vfs_lookup, + .link = v9fs_vfs_link_dotl, + .symlink = v9fs_vfs_symlink_dotl, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir_dotl, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod_dotl, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .check_acl = v9fs_check_acl, +}; + +const struct inode_operations v9fs_file_inode_operations_dotl = { + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .check_acl = v9fs_check_acl, +}; + +const struct inode_operations v9fs_symlink_inode_operations_dotl = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link_dotl, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, +}; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c new file mode 100644 index 00000000..c70251d4 --- /dev/null +++ b/fs/9p/vfs_super.c @@ 
-0,0 +1,368 @@ +/* + * linux/fs/9p/vfs_super.c + * + * This file contians superblock ops for 9P2000. It is intended that + * you mount this file system on directories. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "xattr.h" +#include "acl.h" + +static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; + +/** + * v9fs_set_super - set the superblock + * @s: super block + * @data: file system specific data + * + */ + +static int v9fs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + return set_anon_super(s, data); +} + +/** + * v9fs_fill_super - populate superblock with info + * @sb: superblock + * @v9ses: session information + * @flags: flags propagated from v9fs_mount() + * + */ + +static void +v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, + int flags, void *data) +{ + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize_bits = fls(v9ses->maxdata - 1); + sb->s_blocksize = 1 << sb->s_blocksize_bits; + sb->s_magic = V9FS_MAGIC; + if (v9fs_proto_dotl(v9ses)) { + sb->s_op = &v9fs_super_ops_dotl; + sb->s_xattr = v9fs_xattr_handlers; + } else + sb->s_op = &v9fs_super_ops; + sb->s_bdi = &v9ses->bdi; + if (v9ses->cache) + sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE; + + sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + if (!v9ses->cache) + sb->s_flags |= MS_SYNCHRONOUS; + +#ifdef CONFIG_9P_FS_POSIX_ACL + if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) + sb->s_flags |= MS_POSIXACL; +#endif + + save_mount_options(sb, data); +} + +/** + * v9fs_mount - mount a superblock + * @fs_type: file system type + * @flags: mount flags + * @dev_name: device name that was mounted + * @data: mount options + * + */ + +static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct super_block *sb = NULL; + struct inode *inode = NULL; + struct dentry *root = NULL; + struct v9fs_session_info *v9ses = NULL; + int mode = S_IRWXUGO | S_ISVTX; + struct p9_fid *fid; + int retval = 0; + + P9_DPRINTK(P9_DEBUG_VFS, " \n"); + + v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); + if (!v9ses) + return ERR_PTR(-ENOMEM); + + fid = v9fs_session_init(v9ses, dev_name, data); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + /* + * we need to call session_close to tear down some + * of the data structure setup by session_init + */ + goto close_session; + } + + sb = sget(fs_type, NULL, v9fs_set_super, v9ses); + if (IS_ERR(sb)) { + retval = PTR_ERR(sb); + goto clunk_fid; + } + v9fs_fill_super(sb, v9ses, flags, 
data); + + if (v9ses->cache) + sb->s_d_op = &v9fs_cached_dentry_operations; + else + sb->s_d_op = &v9fs_dentry_operations; + + inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); + if (IS_ERR(inode)) { + retval = PTR_ERR(inode); + goto release_sb; + } + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + retval = -ENOMEM; + goto release_sb; + } + sb->s_root = root; + if (v9fs_proto_dotl(v9ses)) { + struct p9_stat_dotl *st = NULL; + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto release_sb; + } + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode_dotl(st, root->d_inode); + kfree(st); + } else { + struct p9_wstat *st = NULL; + st = p9_client_stat(fid); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto release_sb; + } + + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, root->d_inode, sb); + + p9stat_free(st); + kfree(st); + } + retval = v9fs_get_acl(inode, fid); + if (retval) + goto release_sb; + v9fs_fid_add(root, fid); + + P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + return dget(sb->s_root); + +clunk_fid: + p9_client_clunk(fid); +close_session: + v9fs_session_close(v9ses); + kfree(v9ses); + return ERR_PTR(retval); + +release_sb: + /* + * we will do the session_close and root dentry release + * in the below call. But we need to clunk fid, because we haven't + * attached the fid to dentry so it won't get clunked + * automatically. + */ + p9_client_clunk(fid); + deactivate_locked_super(sb); + return ERR_PTR(retval); +} + +/** + * v9fs_kill_super - Kill Superblock + * @s: superblock + * + */ + +static void v9fs_kill_super(struct super_block *s) +{ + struct v9fs_session_info *v9ses = s->s_fs_info; + + P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); + + kill_anon_super(s); + + v9fs_session_cancel(v9ses); + v9fs_session_close(v9ses); + kfree(v9ses); + s->s_fs_info = NULL; + P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); +} + +static void +v9fs_umount_begin(struct super_block *sb) +{ + struct v9fs_session_info *v9ses; + + v9ses = sb->s_fs_info; + v9fs_session_begin_cancel(v9ses); +} + +static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_rstatfs rs; + int res; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + res = PTR_ERR(fid); + goto done; + } + + v9ses = v9fs_dentry2v9ses(dentry); + if (v9fs_proto_dotl(v9ses)) { + res = p9_client_statfs(fid, &rs); + if (res == 0) { + buf->f_type = V9FS_MAGIC; + buf->f_bsize = rs.bsize; + buf->f_blocks = rs.blocks; + buf->f_bfree = rs.bfree; + buf->f_bavail = rs.bavail; + buf->f_files = rs.files; + buf->f_ffree = rs.ffree; + buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = rs.namelen; + } + if (res != -ENOSYS) + goto done; + } + res = simple_statfs(dentry, buf); +done: + return res; +} + +static int v9fs_drop_inode(struct inode *inode) +{ + struct v9fs_session_info *v9ses; + v9ses = v9fs_inode2v9ses(inode); + if (v9ses->cache) + return generic_drop_inode(inode); + /* + * in case of non cached mode always drop the + * the inode because we want the inode attribute + * to always match that on the server. + */ + return 1; +} + +static int v9fs_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + int ret; + struct p9_wstat wstat; + struct v9fs_inode *v9inode; + /* + * send an fsync request to server irrespective of + * wbc->sync_mode. 
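+ * Legacy 9P has no TFSYNC, so a completely blank wstat is written
+ * instead: by convention, a Twstat whose fields are all "don't touch"
+ * values asks the server to commit the file to stable storage.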
+ */ + P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + v9inode = V9FS_I(inode); + if (!v9inode->writeback_fid) + return 0; + v9fs_blank_wstat(&wstat); + + ret = p9_client_wstat(v9inode->writeback_fid, &wstat); + if (ret < 0) { + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; + } + return 0; +} + +static int v9fs_write_inode_dotl(struct inode *inode, + struct writeback_control *wbc) +{ + int ret; + struct v9fs_inode *v9inode; + /* + * send an fsync request to server irrespective of + * wbc->sync_mode. + */ + P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + v9inode = V9FS_I(inode); + if (!v9inode->writeback_fid) + return 0; + ret = p9_client_fsync(v9inode->writeback_fid, 0); + if (ret < 0) { + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; + } + return 0; +} + +static const struct super_operations v9fs_super_ops = { + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, + .statfs = simple_statfs, + .evict_inode = v9fs_evict_inode, + .show_options = generic_show_options, + .umount_begin = v9fs_umount_begin, + .write_inode = v9fs_write_inode, +}; + +static const struct super_operations v9fs_super_ops_dotl = { + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, + .statfs = v9fs_statfs, + .drop_inode = v9fs_drop_inode, + .evict_inode = v9fs_evict_inode, + .show_options = generic_show_options, + .umount_begin = v9fs_umount_begin, + .write_inode = v9fs_write_inode_dotl, +}; + +struct file_system_type v9fs_fs_type = { + .name = "9p", + .mount = v9fs_mount, + .kill_sb = v9fs_kill_super, + .owner = THIS_MODULE, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT, +}; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c new file mode 100644 index 00000000..d2887738 --- /dev/null +++ b/fs/9p/xattr.c @@ -0,0 +1,172 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + +#include +#include +#include +#include +#include + +#include "fid.h" +#include "xattr.h" + +ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, + void *buffer, size_t buffer_size) +{ + ssize_t retval; + int msize, read_count; + u64 offset = 0, attr_size; + struct p9_fid *attr_fid; + + attr_fid = p9_client_xattrwalk(fid, name, &attr_size); + if (IS_ERR(attr_fid)) { + retval = PTR_ERR(attr_fid); + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_attrwalk failed %zd\n", retval); + attr_fid = NULL; + goto error; + } + if (!buffer_size) { + /* request to get the attr_size */ + retval = attr_size; + goto error; + } + if (attr_size > buffer_size) { + retval = -ERANGE; + goto error; + } + msize = attr_fid->clnt->msize; + while (attr_size) { + if (attr_size > (msize - P9_IOHDRSZ)) + read_count = msize - P9_IOHDRSZ; + else + read_count = attr_size; + read_count = p9_client_read(attr_fid, ((char *)buffer)+offset, + NULL, offset, read_count); + if (read_count < 0) { + /* error in xattr read */ + retval = read_count; + goto error; + } + offset += read_count; + attr_size -= read_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (attr_fid) + p9_client_clunk(attr_fid); + return retval; + +} + + +/* + * v9fs_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t buffer_size) +{ + struct p9_fid *fid; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", + __func__, name, buffer_size); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + return v9fs_fid_xattr_get(fid, name, buffer, buffer_size); +} + +/* + * v9fs_xattr_set() + * + * Create, replace or remove an extended attribute for this inode. Buffer + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. 
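+ * Purely illustrative example, as the user.* xattr handler would issue
+ * it for a setxattr(2) of "user.mime_type" with a 10-byte value and no
+ * XATTR_CREATE/XATTR_REPLACE flag:
+ *
+ *	v9fs_xattr_set(dentry, "user.mime_type", "text/plain", 10, 0);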
+ */ +int v9fs_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t value_len, int flags) +{ + u64 offset = 0; + int retval, msize, write_count; + struct p9_fid *fid = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", + __func__, name, value_len, flags); + + fid = v9fs_fid_clone(dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + goto error; + } + /* + * On success fid points to xattr + */ + retval = p9_client_xattrcreate(fid, name, value_len, flags); + if (retval < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_xattrcreate failed %d\n", retval); + goto error; + } + msize = fid->clnt->msize; + while (value_len) { + if (value_len > (msize - P9_IOHDRSZ)) + write_count = msize - P9_IOHDRSZ; + else + write_count = value_len; + write_count = p9_client_write(fid, ((char *)value)+offset, + NULL, offset, write_count); + if (write_count < 0) { + /* error in xattr write */ + retval = write_count; + goto error; + } + offset += write_count; + value_len -= write_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (fid) + retval = p9_client_clunk(fid); + return retval; +} + +ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + return v9fs_xattr_get(dentry, NULL, buffer, buffer_size); +} + +const struct xattr_handler *v9fs_xattr_handlers[] = { + &v9fs_xattr_user_handler, +#ifdef CONFIG_9P_FS_POSIX_ACL + &v9fs_xattr_acl_access_handler, + &v9fs_xattr_acl_default_handler, +#endif + NULL +}; diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h new file mode 100644 index 00000000..eaa837c5 --- /dev/null +++ b/fs/9p/xattr.h @@ -0,0 +1,33 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ +#ifndef FS_9P_XATTR_H +#define FS_9P_XATTR_H + +#include +#include +#include + +extern const struct xattr_handler *v9fs_xattr_handlers[]; +extern struct xattr_handler v9fs_xattr_user_handler; +extern const struct xattr_handler v9fs_xattr_acl_access_handler; +extern const struct xattr_handler v9fs_xattr_acl_default_handler; + +extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *, + void *, size_t); +extern ssize_t v9fs_xattr_get(struct dentry *, const char *, + void *, size_t); +extern int v9fs_xattr_set(struct dentry *, const char *, + const void *, size_t, int); +extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t); +#endif /* FS_9P_XATTR_H */ diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c new file mode 100644 index 00000000..d0b701b7 --- /dev/null +++ b/fs/9p/xattr_user.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + + +#include +#include +#include +#include +#include "xattr.h" + +static int v9fs_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = v9fs_xattr_user_get, + .set = v9fs_xattr_user_set, +}; diff --git a/fs/Kconfig b/fs/Kconfig new file mode 100644 index 00000000..88701cc0 --- /dev/null +++ b/fs/Kconfig @@ -0,0 +1,277 @@ +# +# File system configuration +# + +menu "File systems" + +if BLOCK + +source "fs/ext2/Kconfig" +source "fs/ext3/Kconfig" +source "fs/ext4/Kconfig" + +config FS_XIP +# execute in place + bool + depends on EXT2_FS_XIP + default y + +source "fs/jbd/Kconfig" +source "fs/jbd2/Kconfig" + +config FS_MBCACHE +# Meta block cache for Extended Attributes (ext2/ext3/ext4) + tristate + default y if EXT2_FS=y && EXT2_FS_XATTR + default y if EXT3_FS=y && EXT3_FS_XATTR + default y if EXT4_FS=y && EXT4_FS_XATTR + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR + +source "fs/reiserfs/Kconfig" +source "fs/jfs/Kconfig" + +source "fs/xfs/Kconfig" +source "fs/gfs2/Kconfig" +source "fs/ocfs2/Kconfig" +source "fs/btrfs/Kconfig" +source "fs/nilfs2/Kconfig" + +endif # BLOCK + +# Posix ACL utility routines +# +# Note: Posix ACLs can be implemented without these helpers. Never use +# this symbol for ifdefs in core code. +# +config FS_POSIX_ACL + def_bool n + +config EXPORTFS + tristate + +config FILE_LOCKING + bool "Enable POSIX file locking API" if EXPERT + default y + help + This option enables standard file locking support, required + for filesystems like NFS and for the flock() system + call. Disabling this option saves about 11k. + +source "fs/notify/Kconfig" + +source "fs/quota/Kconfig" + +source "fs/autofs4/Kconfig" +source "fs/fuse/Kconfig" + +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use userspace character device + based on CUSE, answer Y or M. 
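+
+# Illustrative only: with the FUSE stack above built as modules, the
+# resulting .config carries lines such as
+#	CONFIG_FUSE_FS=m
+#	CONFIG_CUSE=m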
+ +config GENERIC_ACL + bool + select FS_POSIX_ACL + +menu "Caches" + +source "fs/fscache/Kconfig" +source "fs/cachefiles/Kconfig" + +endmenu + +if BLOCK +menu "CD-ROM/DVD Filesystems" + +source "fs/isofs/Kconfig" +source "fs/udf/Kconfig" + +endmenu +endif # BLOCK + +if BLOCK +menu "DOS/FAT/NT Filesystems" + +source "fs/fat/Kconfig" +source "fs/ntfs/Kconfig" + +endmenu +endif # BLOCK + +menu "Pseudo filesystems" + +source "fs/proc/Kconfig" +source "fs/sysfs/Kconfig" + +config TMPFS + bool "Virtual memory file system support (former shm fs)" + depends on SHMEM + help + Tmpfs is a file system which keeps all files in virtual memory. + + Everything in tmpfs is temporary in the sense that no files will be + created on your hard drive. The files live in memory and swap + space. If you unmount a tmpfs instance, everything stored therein is + lost. + + See for details. + +config TMPFS_POSIX_ACL + bool "Tmpfs POSIX Access Control Lists" + depends on TMPFS + select TMPFS_XATTR + select GENERIC_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N. + +config TMPFS_XATTR + bool "Tmpfs extended attributes" + depends on TMPFS + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + Currently this enables support for the trusted.* and + security.* namespaces. + + You need this for POSIX ACL support on tmpfs. + + If unsure, say N. + +config HUGETLBFS + bool "HugeTLB file system support" + depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ + SYS_SUPPORTS_HUGETLBFS || BROKEN + help + hugetlbfs is a filesystem backing for HugeTLB pages, based on + ramfs. For architectures that support it, say Y here and read + for details. + + If unsure, say N. + +config HUGETLB_PAGE + def_bool HUGETLBFS + +source "fs/configfs/Kconfig" + +endmenu + +menuconfig MISC_FILESYSTEMS + bool "Miscellaneous filesystems" + default y + ---help--- + Say Y here to get to see options for various miscellaneous + filesystems, such as filesystems that came from other + operating systems. + + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled; if unsure, say Y here. + +if MISC_FILESYSTEMS + +source "fs/adfs/Kconfig" +source "fs/affs/Kconfig" +source "fs/ecryptfs/Kconfig" +source "fs/hfs/Kconfig" +source "fs/hfsplus/Kconfig" +source "fs/befs/Kconfig" +source "fs/bfs/Kconfig" +source "fs/efs/Kconfig" + +# Patched by YAFFS +source "fs/yaffs2/Kconfig" + +source "fs/jffs2/Kconfig" +# UBIFS File system configuration +source "fs/ubifs/Kconfig" +source "fs/logfs/Kconfig" +source "fs/cramfs/Kconfig" +source "fs/squashfs/Kconfig" +source "fs/freevxfs/Kconfig" +source "fs/minix/Kconfig" +source "fs/omfs/Kconfig" +source "fs/hpfs/Kconfig" +source "fs/qnx4/Kconfig" +source "fs/romfs/Kconfig" +source "fs/pstore/Kconfig" +source "fs/sysv/Kconfig" +source "fs/ufs/Kconfig" +source "fs/exofs/Kconfig" + +endif # MISC_FILESYSTEMS + +menuconfig NETWORK_FILESYSTEMS + bool "Network File Systems" + default y + depends on NET + ---help--- + Say Y here to get to see options for network filesystems and + filesystem-related networking code, such as NFS daemon and + RPCSEC security modules. + + This option alone does not add any kernel code. 
+ + If you say N, all options in this submenu will be skipped and + disabled; if unsure, say Y here. + +if NETWORK_FILESYSTEMS + +source "fs/nfs/Kconfig" +source "fs/nfsd/Kconfig" + +config LOCKD + tristate + depends on FILE_LOCKING + +config LOCKD_V4 + bool + depends on NFSD_V3 || NFS_V3 + depends on FILE_LOCKING + default y + +config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL + +config NFS_COMMON + bool + depends on NFSD || NFS_FS + default y + +source "net/sunrpc/Kconfig" +source "fs/ceph/Kconfig" +source "fs/cifs/Kconfig" +source "fs/ncpfs/Kconfig" +source "fs/coda/Kconfig" +source "fs/afs/Kconfig" +source "fs/9p/Kconfig" + +endif # NETWORK_FILESYSTEMS + +if BLOCK +menu "Partition Types" + +source "fs/partitions/Kconfig" + +endmenu +endif + +source "fs/nls/Kconfig" +source "fs/dlm/Kconfig" + +endmenu diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt new file mode 100644 index 00000000..79e2ca79 --- /dev/null +++ b/fs/Kconfig.binfmt @@ -0,0 +1,163 @@ +config BINFMT_ELF + bool "Kernel support for ELF binaries" + depends on MMU && (BROKEN || !FRV) + default y + ---help--- + ELF (Executable and Linkable Format) is a format for libraries and + executables used across different architectures and operating + systems. Saying Y here will enable your kernel to run ELF binaries + and enlarge it by about 13 KB. ELF support under Linux has now all + but replaced the traditional Linux a.out formats (QMAGIC and ZMAGIC) + because it is portable (this does *not* mean that you will be able + to run executables from different architectures or operating systems + however) and makes building run-time libraries very easy. Many new + executables are distributed solely in ELF format. You definitely + want to say Y here. + + Information about ELF is contained in the ELF HOWTO available from + . + + If you find that after upgrading from Linux kernel 1.2 and saying Y + here, you still can't run any ELF binaries (they just crash), then + you'll have to install the newest ELF runtime libraries, including + ld.so (check the file for location and + latest version). + +config COMPAT_BINFMT_ELF + bool + depends on COMPAT && BINFMT_ELF + +config BINFMT_ELF_FDPIC + bool "Kernel support for FDPIC ELF binaries" + default y + depends on (FRV || BLACKFIN || (SUPERH32 && !MMU)) + help + ELF FDPIC binaries are based on ELF, but allow the individual load + segments of a binary to be located in memory independently of each + other. This makes this format ideal for use in environments where no + MMU is available as it still permits text segments to be shared, + even if data segments are not. + + It is also possible to run FDPIC ELF binaries on MMU linux also. + +config CORE_DUMP_DEFAULT_ELF_HEADERS + bool "Write ELF core dumps with partial segments" + default y + depends on BINFMT_ELF && ELF_CORE + help + ELF core dump files describe each memory mapping of the crashed + process, and can contain or omit the memory contents of each one. + The contents of an unmodified text mapping are omitted by default. + + For an unmodified text mapping of an ELF object, including just + the first page of the file in a core dump makes it possible to + identify the build ID bits in the file, without paying the i/o + cost and disk space to dump all the text. However, versions of + GDB before 6.7 are confused by ELF core dump files in this format. + + The core dump behavior can be controlled per process using + the /proc/PID/coredump_filter pseudo-file; this setting is + inherited. See Documentation/filesystems/proc.txt for details. 
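+
+	  For example (an illustrative recipe mirroring the one in that
+	  document), a shell can opt a child into dumping anonymous memory
+	  and file-backed private mappings with:
+
+	    echo 0x7 > /proc/self/coredump_filter
+	    ./some_program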
+ + This config option changes the default setting of coredump_filter + seen at boot time. If unsure, say Y. + +config BINFMT_FLAT + bool "Kernel support for flat binaries" + depends on !MMU && (!FRV || BROKEN) + help + Support uClinux FLAT format binaries. + +config BINFMT_ZFLAT + bool "Enable ZFLAT support" + depends on BINFMT_FLAT + select ZLIB_INFLATE + help + Support FLAT format compressed binaries + +config BINFMT_SHARED_FLAT + bool "Enable shared FLAT support" + depends on BINFMT_FLAT + help + Support FLAT shared libraries + +config HAVE_AOUT + def_bool n + +config BINFMT_AOUT + tristate "Kernel support for a.out and ECOFF binaries" + depends on HAVE_AOUT + ---help--- + A.out (Assembler.OUTput) is a set of formats for libraries and + executables used in the earliest versions of UNIX. Linux used + the a.out formats QMAGIC and ZMAGIC until they were replaced + with the ELF format. + + The conversion to ELF started in 1995. This option is primarily + provided for historical interest and for the benefit of those + who need to run binaries from that era. + + Most people should answer N here. If you think you may have + occasional use for this format, enable module support above + and answer M here to compile this support as a module called + binfmt_aout. + + If any crucial components of your system (such as /sbin/init + or /lib/ld.so) are still in a.out format, you will have to + say Y here. + +config OSF4_COMPAT + bool "OSF/1 v4 readv/writev compatibility" + depends on ALPHA && BINFMT_AOUT + help + Say Y if you are using OSF/1 binaries (like Netscape and Acrobat) + with v4 shared libraries freely available from Compaq. If you're + going to use shared libraries from Tru64 version 5.0 or later, say N. + +config BINFMT_EM86 + tristate "Kernel support for Linux/Intel ELF binaries" + depends on ALPHA + ---help--- + Say Y here if you want to be able to execute Linux/Intel ELF + binaries just like native Alpha binaries on your Alpha machine. For + this to work, you need to have the emulator /usr/bin/em86 in place. + + You can get the same functionality by saying N here and saying Y to + "Kernel support for MISC binaries". + + You may answer M to compile the emulation support as a module and + later load the module when you want to use a Linux/Intel binary. The + module will be called binfmt_em86. If unsure, say Y. + +config BINFMT_SOM + tristate "Kernel support for SOM binaries" + depends on PARISC && HPUX + help + SOM is a binary executable format inherited from HP/UX. Say + Y here to be able to load and execute SOM binaries directly. + +config BINFMT_MISC + tristate "Kernel support for MISC binaries" + ---help--- + If you say Y here, it will be possible to plug wrapper-driven binary + formats into the kernel. You will like this especially when you use + programs that need an interpreter to run like Java, Python, .NET or + Emacs-Lisp. It's also useful if you often run DOS executables under + the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, available from + ). Once you have + registered such a binary class with the kernel, you can start one of + those programs simply by typing in its name at a shell prompt; Linux + will automatically feed it to the correct interpreter. + + You can do other nice things, too. Read the file + to learn how to use this + feature, for information about how + to include Java support. and for + information about how to include Mono-based .NET support. 
+ + To use binfmt_misc, you will need to mount it: + mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc + + You may say M here for module support and later load the module when + you have use for it; the module is called binfmt_misc. If you + don't know what to answer at this point, say Y. diff --git a/fs/Makefile b/fs/Makefile new file mode 100644 index 00000000..2999b4d4 --- /dev/null +++ b/fs/Makefile @@ -0,0 +1,129 @@ +# +# Makefile for the Linux filesystems. +# +# 14 Sep 2000, Christoph Hellwig +# Rewritten to use lists instead of if-statements. +# + +obj-y := open.o read_write.o file_table.o super.o \ + char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ + ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ + attr.o bad_inode.o file.o filesystems.o namespace.o \ + seq_file.o xattr.o libfs.o fs-writeback.o \ + pnode.o drop_caches.o splice.o sync.o utimes.o \ + stack.o fs_struct.o statfs.o + +ifeq ($(CONFIG_BLOCK),y) +obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o +else +obj-y += no-block.o +endif + +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o +obj-y += notify/ +obj-$(CONFIG_EPOLL) += eventpoll.o +obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-$(CONFIG_SIGNALFD) += signalfd.o +obj-$(CONFIG_TIMERFD) += timerfd.o +obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FILE_LOCKING) += locks.o +obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o +obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o +obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o +obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o +obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o + +# binfmt_script is always there +obj-y += binfmt_script.o + +obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o +obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o +obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o +obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o +obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o + +obj-$(CONFIG_FS_MBCACHE) += mbcache.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o +obj-$(CONFIG_NFS_COMMON) += nfs_common/ +obj-$(CONFIG_GENERIC_ACL) += generic_acl.o + +obj-$(CONFIG_FHANDLE) += fhandle.o + +obj-y += quota/ + +obj-$(CONFIG_PROC_FS) += proc/ +obj-y += partitions/ +obj-$(CONFIG_SYSFS) += sysfs/ +obj-$(CONFIG_CONFIGFS_FS) += configfs/ +obj-y += devpts/ + +obj-$(CONFIG_PROFILING) += dcookies.o +obj-$(CONFIG_DLM) += dlm/ + +# Do not add any filesystems before this line +obj-$(CONFIG_FSCACHE) += fscache/ +obj-$(CONFIG_REISERFS_FS) += reiserfs/ +obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 +obj-$(CONFIG_EXT2_FS) += ext2/ +# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 +# unless explicitly requested by rootfstype +obj-$(CONFIG_EXT4_FS) += ext4/ +obj-$(CONFIG_JBD) += jbd/ +obj-$(CONFIG_JBD2) += jbd2/ +obj-$(CONFIG_CRAMFS) += cramfs/ +obj-$(CONFIG_SQUASHFS) += squashfs/ +obj-y += ramfs/ +obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ +obj-$(CONFIG_CODA_FS) += coda/ +obj-$(CONFIG_MINIX_FS) += minix/ +obj-$(CONFIG_FAT_FS) += fat/ +obj-$(CONFIG_BFS_FS) += bfs/ +obj-$(CONFIG_ISO9660_FS) += isofs/ +obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ +obj-$(CONFIG_HFS_FS) += hfs/ +obj-$(CONFIG_ECRYPT_FS) += ecryptfs/ +obj-$(CONFIG_VXFS_FS) += freevxfs/ +obj-$(CONFIG_NFS_FS) += nfs/ +obj-$(CONFIG_EXPORTFS) += exportfs/ +obj-$(CONFIG_NFSD) += nfsd/ +obj-$(CONFIG_LOCKD) += lockd/ +obj-$(CONFIG_NLS) += nls/ +obj-$(CONFIG_SYSV_FS) += sysv/ +obj-$(CONFIG_CIFS) += cifs/ +obj-$(CONFIG_NCP_FS) += ncpfs/ +obj-$(CONFIG_HPFS_FS) += hpfs/ +obj-$(CONFIG_NTFS_FS) += ntfs/ 
+obj-$(CONFIG_UFS_FS) += ufs/ +obj-$(CONFIG_EFS_FS) += efs/ +obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_LOGFS) += logfs/ +obj-$(CONFIG_UBIFS_FS) += ubifs/ +obj-$(CONFIG_AFFS_FS) += affs/ +obj-$(CONFIG_ROMFS_FS) += romfs/ +obj-$(CONFIG_QNX4FS_FS) += qnx4/ +obj-$(CONFIG_AUTOFS4_FS) += autofs4/ +obj-$(CONFIG_ADFS_FS) += adfs/ +obj-$(CONFIG_FUSE_FS) += fuse/ +obj-$(CONFIG_UDF_FS) += udf/ +obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ +obj-$(CONFIG_OMFS_FS) += omfs/ +obj-$(CONFIG_JFS_FS) += jfs/ +obj-$(CONFIG_XFS_FS) += xfs/ +obj-$(CONFIG_9P_FS) += 9p/ +obj-$(CONFIG_AFS_FS) += afs/ +obj-$(CONFIG_NILFS2_FS) += nilfs2/ +obj-$(CONFIG_BEFS_FS) += befs/ +obj-$(CONFIG_HOSTFS) += hostfs/ +obj-$(CONFIG_HPPFS) += hppfs/ +obj-$(CONFIG_CACHEFILES) += cachefiles/ +obj-$(CONFIG_DEBUG_FS) += debugfs/ +obj-$(CONFIG_OCFS2_FS) += ocfs2/ +obj-$(CONFIG_BTRFS_FS) += btrfs/ +obj-$(CONFIG_GFS2_FS) += gfs2/ +obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-$(CONFIG_CEPH_FS) += ceph/ +obj-$(CONFIG_PSTORE) += pstore/ + +# Patched by YAFFS +obj-$(CONFIG_YAFFS_FS) += yaffs2/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig new file mode 100644 index 00000000..e55182a7 --- /dev/null +++ b/fs/adfs/Kconfig @@ -0,0 +1,27 @@ +config ADFS_FS + tristate "ADFS file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + The Acorn Disc Filing System is the standard file system of the + RiscOS operating system which runs on Acorn's ARM-based Risc PC + systems and the Acorn Archimedes range of machines. If you say Y + here, Linux will be able to read from ADFS partitions on hard drives + and from ADFS-formatted floppy discs. If you also want to be able to + write to those devices, say Y to "ADFS write support" below. + + The ADFS partition should be the first partition (i.e., + /dev/[hs]d?1) on each of your drives. Please read the file + for further details. + + To compile this code as a module, choose M here: the module will be + called adfs. + + If unsure, say N. + +config ADFS_FS_RW + bool "ADFS write support (DANGEROUS)" + depends on ADFS_FS + help + If you say Y here, you will be able to write to ADFS partitions on + hard drives and ADFS-formatted floppy disks. This is experimental + codes, so if you're unsure, say N. diff --git a/fs/adfs/Makefile b/fs/adfs/Makefile new file mode 100644 index 00000000..9b2d71a9 --- /dev/null +++ b/fs/adfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux adfs filesystem routines. 
+# + +obj-$(CONFIG_ADFS_FS) += adfs.o + +adfs-objs := dir.o dir_f.o dir_fplus.o file.o inode.o map.o super.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h new file mode 100644 index 00000000..718ac1f4 --- /dev/null +++ b/fs/adfs/adfs.h @@ -0,0 +1,204 @@ +#include +#include + +/* Internal data structures for ADFS */ + +#define ADFS_FREE_FRAG 0 +#define ADFS_BAD_FRAG 1 +#define ADFS_ROOT_FRAG 2 + +#define ADFS_NDA_OWNER_READ (1 << 0) +#define ADFS_NDA_OWNER_WRITE (1 << 1) +#define ADFS_NDA_LOCKED (1 << 2) +#define ADFS_NDA_DIRECTORY (1 << 3) +#define ADFS_NDA_EXECUTE (1 << 4) +#define ADFS_NDA_PUBLIC_READ (1 << 5) +#define ADFS_NDA_PUBLIC_WRITE (1 << 6) + +#include "dir_f.h" + +struct buffer_head; + +/* + * adfs file system inode data in memory + */ +struct adfs_inode_info { + loff_t mmu_private; + unsigned long parent_id; /* object id of parent */ + __u32 loadaddr; /* RISC OS load address */ + __u32 execaddr; /* RISC OS exec address */ + unsigned int filetype; /* RISC OS file type */ + unsigned int attr; /* RISC OS permissions */ + unsigned int stamped:1; /* RISC OS file has date/time */ + struct inode vfs_inode; +}; + +/* + * Forward-declare this + */ +struct adfs_discmap; +struct adfs_dir_ops; + +/* + * ADFS file system superblock data in memory + */ +struct adfs_sb_info { + struct adfs_discmap *s_map; /* bh list containing map */ + struct adfs_dir_ops *s_dir; /* directory operations */ + + uid_t s_uid; /* owner uid */ + gid_t s_gid; /* owner gid */ + umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ + umode_t s_other_mask; /* ADFS other perm -> unix perm */ + int s_ftsuffix; /* ,xyz hex filetype suffix option */ + + __u32 s_ids_per_zone; /* max. no ids in one zone */ + __u32 s_idlen; /* length of ID in map */ + __u32 s_map_size; /* sector size of a map */ + unsigned long s_size; /* total size (in blocks) of this fs */ + signed int s_map2blk; /* shift left by this for map->sector */ + unsigned int s_log2sharesize;/* log2 share size */ + __le32 s_version; /* disc format version */ + unsigned int s_namelen; /* maximum number of characters in name */ +}; + +static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct adfs_inode_info *ADFS_I(struct inode *inode) +{ + return container_of(inode, struct adfs_inode_info, vfs_inode); +} + +/* + * Directory handling + */ +struct adfs_dir { + struct super_block *sb; + + int nr_buffers; + struct buffer_head *bh[4]; + + /* big directories need allocated buffers */ + struct buffer_head **bh_fplus; + + unsigned int pos; + unsigned int parent_id; + + struct adfs_dirheader dirhead; + union adfs_dirtail dirtail; +}; + +/* + * This is the overall maximum name length + */ +#define ADFS_MAX_NAME_LEN (256 + 4) /* +4 for ,xyz hex filetype suffix */ +struct object_info { + __u32 parent_id; /* parent object id */ + __u32 file_id; /* object id */ + __u32 loadaddr; /* load address */ + __u32 execaddr; /* execution address */ + __u32 size; /* size */ + __u8 attr; /* RISC OS attributes */ + unsigned int name_len; /* name length */ + char name[ADFS_MAX_NAME_LEN];/* file name */ + + /* RISC OS file type (12-bit: derived from loadaddr) */ + __u16 filetype; +}; + +/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */ +static inline int append_filetype_suffix(char *buf, __u16 filetype) +{ + if (filetype == 0xffff) /* no explicit 12-bit file type was set */ + return 0; + + *buf++ = ','; + *buf++ = hex_asc_lo(filetype >> 8); + *buf++ = hex_asc_lo(filetype >> 4); + *buf++ = hex_asc_lo(filetype >> 
0); + return 4; +} + +struct adfs_dir_ops { + int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir); + int (*setpos)(struct adfs_dir *dir, unsigned int fpos); + int (*getnext)(struct adfs_dir *dir, struct object_info *obj); + int (*update)(struct adfs_dir *dir, struct object_info *obj); + int (*create)(struct adfs_dir *dir, struct object_info *obj); + int (*remove)(struct adfs_dir *dir, struct object_info *obj); + int (*sync)(struct adfs_dir *dir); + void (*free)(struct adfs_dir *dir); +}; + +struct adfs_discmap { + struct buffer_head *dm_bh; + __u32 dm_startblk; + unsigned int dm_startbit; + unsigned int dm_endbit; +}; + +/* Inode stuff */ +struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); +int adfs_notify_change(struct dentry *dentry, struct iattr *attr); + +/* map.c */ +extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigned int offset); +extern unsigned int adfs_map_free(struct super_block *sb); + +/* Misc */ +void __adfs_error(struct super_block *sb, const char *function, + const char *fmt, ...); +#define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt) + +/* super.c */ + +/* + * Inodes and file operations + */ + +/* dir_*.c */ +extern const struct inode_operations adfs_dir_inode_operations; +extern const struct file_operations adfs_dir_operations; +extern const struct dentry_operations adfs_dentry_operations; +extern struct adfs_dir_ops adfs_f_dir_ops; +extern struct adfs_dir_ops adfs_fplus_dir_ops; + +extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, + int wait); + +/* file.c */ +extern const struct inode_operations adfs_file_inode_operations; +extern const struct file_operations adfs_file_operations; + +static inline __u32 signed_asl(__u32 val, signed int shift) +{ + if (shift >= 0) + val <<= shift; + else + val >>= -shift; + return val; +} + +/* + * Calculate the address of a block in an object given the block offset + * and the object identity. + * + * The root directory ID should always be looked up in the map [3.4] + */ +static inline int +__adfs_block_map(struct super_block *sb, unsigned int object_id, + unsigned int block) +{ + if (object_id & 255) { + unsigned int off; + + off = (object_id & 255) - 1; + block += off << ADFS_SB(sb)->s_log2sharesize; + } + + return adfs_map_lookup(sb, object_id >> 8, block); +} diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c new file mode 100644 index 00000000..3d83075a --- /dev/null +++ b/fs/adfs/dir.c @@ -0,0 +1,296 @@ +/* + * linux/fs/adfs/dir.c + * + * Copyright (C) 1999-2000 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Common directory handling for ADFS + */ +#include "adfs.h" + +/* + * For future. This should probably be per-directory. 
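+ * Until then a single global rwlock serialises all ADFS directory
+ * access: readdir and lookup take it for reading, while directory
+ * updates take it for writing.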
+ */ +static DEFINE_RWLOCK(adfs_dir_lock); + +static int +adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + struct object_info obj; + struct adfs_dir dir; + int ret = 0; + + if (filp->f_pos >> 32) + goto out; + + ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + if (ret) + goto out; + + switch ((unsigned long)filp->f_pos) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto free_out; + filp->f_pos += 1; + + case 1: + if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0) + goto free_out; + filp->f_pos += 1; + + default: + break; + } + + read_lock(&adfs_dir_lock); + + ret = ops->setpos(&dir, filp->f_pos - 2); + if (ret) + goto unlock_out; + while (ops->getnext(&dir, &obj) == 0) { + if (filldir(dirent, obj.name, obj.name_len, + filp->f_pos, obj.file_id, DT_UNKNOWN) < 0) + goto unlock_out; + filp->f_pos += 1; + } + +unlock_out: + read_unlock(&adfs_dir_lock); + +free_out: + ops->free(&dir); + +out: + return ret; +} + +int +adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) +{ + int ret = -EINVAL; +#ifdef CONFIG_ADFS_FS_RW + struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + struct adfs_dir dir; + + printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n", + obj->file_id, obj->parent_id); + + if (!ops->update) { + ret = -EINVAL; + goto out; + } + + ret = ops->read(sb, obj->parent_id, 0, &dir); + if (ret) + goto out; + + write_lock(&adfs_dir_lock); + ret = ops->update(&dir, obj); + write_unlock(&adfs_dir_lock); + + if (wait) { + int err = ops->sync(&dir); + if (!ret) + ret = err; + } + + ops->free(&dir); +out: +#endif + return ret; +} + +static int +adfs_match(struct qstr *name, struct object_info *obj) +{ + int i; + + if (name->len != obj->name_len) + return 0; + + for (i = 0; i < name->len; i++) { + char c1, c2; + + c1 = name->name[i]; + c2 = obj->name[i]; + + if (c1 >= 'A' && c1 <= 'Z') + c1 += 'a' - 'A'; + if (c2 >= 'A' && c2 <= 'Z') + c2 += 'a' - 'A'; + + if (c1 != c2) + return 0; + } + return 1; +} + +static int +adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj) +{ + struct super_block *sb = inode->i_sb; + struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + struct adfs_dir dir; + int ret; + + ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + if (ret) + goto out; + + if (ADFS_I(inode)->parent_id != dir.parent_id) { + adfs_error(sb, "parent directory changed under me! (%lx but got %lx)\n", + ADFS_I(inode)->parent_id, dir.parent_id); + ret = -EIO; + goto free_out; + } + + obj->parent_id = inode->i_ino; + + /* + * '.' is handled by reserved_lookup() in fs/namei.c + */ + if (name->len == 2 && name->name[0] == '.' && name->name[1] == '.') { + /* + * Currently unable to fill in the rest of 'obj', + * but this is better than nothing. We need to + * ascend one level to find it's parent. 
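+ * (Filling it in properly would mean reading the parent directory
+ * itself, which we do not do here.)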
+ */ + obj->name_len = 0; + obj->file_id = obj->parent_id; + goto free_out; + } + + read_lock(&adfs_dir_lock); + + ret = ops->setpos(&dir, 0); + if (ret) + goto unlock_out; + + ret = -ENOENT; + while (ops->getnext(&dir, obj) == 0) { + if (adfs_match(name, obj)) { + ret = 0; + break; + } + } + +unlock_out: + read_unlock(&adfs_dir_lock); + +free_out: + ops->free(&dir); +out: + return ret; +} + +const struct file_operations adfs_dir_operations = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = adfs_readdir, + .fsync = generic_file_fsync, +}; + +static int +adfs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr) +{ + const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; + const unsigned char *name; + unsigned long hash; + int i; + + if (qstr->len < name_len) + return 0; + + /* + * Truncate the name in place, avoids + * having to define a compare function. + */ + qstr->len = i = name_len; + name = qstr->name; + hash = init_name_hash(); + while (i--) { + char c; + + c = *name++; + if (c >= 'A' && c <= 'Z') + c += 'a' - 'A'; + + hash = partial_name_hash(c, hash); + } + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare two names, taking note of the name length + * requirements of the underlying filesystem. + */ +static int +adfs_compare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + int i; + + if (len != name->len) + return 1; + + for (i = 0; i < name->len; i++) { + char a, b; + + a = str[i]; + b = name->name[i]; + + if (a >= 'A' && a <= 'Z') + a += 'a' - 'A'; + if (b >= 'A' && b <= 'Z') + b += 'a' - 'A'; + + if (a != b) + return 1; + } + return 0; +} + +const struct dentry_operations adfs_dentry_operations = { + .d_hash = adfs_hash, + .d_compare = adfs_compare, +}; + +static struct dentry * +adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = NULL; + struct object_info obj; + int error; + + error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); + if (error == 0) { + error = -EACCES; + /* + * This only returns NULL if get_empty_inode + * fails. + */ + inode = adfs_iget(dir->i_sb, &obj); + if (inode) + error = 0; + } + d_add(dentry, inode); + return ERR_PTR(error); +} + +/* + * directories can handle most operations... + */ +const struct inode_operations adfs_dir_inode_operations = { + .lookup = adfs_lookup, + .setattr = adfs_notify_change, +}; diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c new file mode 100644 index 00000000..4bbe853e --- /dev/null +++ b/fs/adfs/dir_f.c @@ -0,0 +1,486 @@ +/* + * linux/fs/adfs/dir_f.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * E and F format directory handling + */ +#include +#include "adfs.h" +#include "dir_f.h" + +static void adfs_f_free(struct adfs_dir *dir); + +/* + * Read an (unaligned) value of length 1..4 bytes + */ +static inline unsigned int adfs_readval(unsigned char *p, int len) +{ + unsigned int val = 0; + + switch (len) { + case 4: val |= p[3] << 24; + case 3: val |= p[2] << 16; + case 2: val |= p[1] << 8; + default: val |= p[0]; + } + return val; +} + +static inline void adfs_writeval(unsigned char *p, int len, unsigned int val) +{ + switch (len) { + case 4: p[3] = val >> 24; + case 3: p[2] = val >> 16; + case 2: p[1] = val >> 8; + default: p[0] = val; + } +} + +static inline int adfs_readname(char *buf, char *ptr, int maxlen) +{ + char *old_buf = buf; + + while ((unsigned char)*ptr >= ' ' && maxlen--) { + if (*ptr == '/') + *buf++ = '.'; + else + *buf++ = *ptr; + ptr++; + } + + return buf - old_buf; +} + +#define ror13(v) ((v >> 13) | (v << 19)) + +#define dir_u8(idx) \ + ({ int _buf = idx >> blocksize_bits; \ + int _off = idx - (_buf << blocksize_bits);\ + *(u8 *)(bh[_buf]->b_data + _off); \ + }) + +#define dir_u32(idx) \ + ({ int _buf = idx >> blocksize_bits; \ + int _off = idx - (_buf << blocksize_bits);\ + *(__le32 *)(bh[_buf]->b_data + _off); \ + }) + +#define bufoff(_bh,_idx) \ + ({ int _buf = _idx >> blocksize_bits; \ + int _off = _idx - (_buf << blocksize_bits);\ + (u8 *)(_bh[_buf]->b_data + _off); \ + }) + +/* + * There are some algorithms that are nice in + * assembler, but a bitch in C... This is one + * of them. + */ +static u8 +adfs_dir_checkbyte(const struct adfs_dir *dir) +{ + struct buffer_head * const *bh = dir->bh; + const int blocksize_bits = dir->sb->s_blocksize_bits; + union { __le32 *ptr32; u8 *ptr8; } ptr, end; + u32 dircheck = 0; + int last = 5 - 26; + int i = 0; + + /* + * Accumulate each word up to the last whole + * word of the last directory entry. This + * can spread across several buffer heads. + */ + do { + last += 26; + do { + dircheck = le32_to_cpu(dir_u32(i)) ^ ror13(dircheck); + + i += sizeof(u32); + } while (i < (last & ~3)); + } while (dir_u8(last) != 0); + + /* + * Accumulate the last few bytes. These + * bytes will be within the same bh. + */ + if (i != last) { + ptr.ptr8 = bufoff(bh, i); + end.ptr8 = ptr.ptr8 + last - i; + + do { + dircheck = *ptr.ptr8++ ^ ror13(dircheck); + } while (ptr.ptr8 < end.ptr8); + } + + /* + * The directory tail is in the final bh + * Note that contary to the RISC OS PRMs, + * the first few bytes are NOT included + * in the check. All bytes are in the + * same bh. 
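+ * (The tail occupies the last 41 bytes of the 2048-byte directory;
+ * the accumulation below therefore starts at byte 2008, just past
+ * the dirlastmask byte, and stops before the check byte itself.)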
+ */ + ptr.ptr8 = bufoff(bh, 2008); + end.ptr8 = ptr.ptr8 + 36; + + do { + __le32 v = *ptr.ptr32++; + dircheck = le32_to_cpu(v) ^ ror13(dircheck); + } while (ptr.ptr32 < end.ptr32); + + return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff; +} + +/* + * Read and check that a directory is valid + */ +static int +adfs_dir_read(struct super_block *sb, unsigned long object_id, + unsigned int size, struct adfs_dir *dir) +{ + const unsigned int blocksize_bits = sb->s_blocksize_bits; + int blk = 0; + + /* + * Directories which are not a multiple of 2048 bytes + * are considered bad v2 [3.6] + */ + if (size & 2047) + goto bad_dir; + + size >>= blocksize_bits; + + dir->nr_buffers = 0; + dir->sb = sb; + + for (blk = 0; blk < size; blk++) { + int phys; + + phys = __adfs_block_map(sb, object_id, blk); + if (!phys) { + adfs_error(sb, "dir object %lX has a hole at offset %d", + object_id, blk); + goto release_buffers; + } + + dir->bh[blk] = sb_bread(sb, phys); + if (!dir->bh[blk]) + goto release_buffers; + } + + memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); + memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); + + if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || + memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) + goto bad_dir; + + if (memcmp(&dir->dirhead.startname, "Nick", 4) && + memcmp(&dir->dirhead.startname, "Hugo", 4)) + goto bad_dir; + + if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) + goto bad_dir; + + dir->nr_buffers = blk; + + return 0; + +bad_dir: + adfs_error(sb, "corrupted directory fragment %lX", + object_id); +release_buffers: + for (blk -= 1; blk >= 0; blk -= 1) + brelse(dir->bh[blk]); + + dir->sb = NULL; + + return -EIO; +} + +/* + * convert a disk-based directory entry to a Linux ADFS directory entry + */ +static inline void +adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj, + struct adfs_direntry *de) +{ + obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN); + obj->file_id = adfs_readval(de->dirinddiscadd, 3); + obj->loadaddr = adfs_readval(de->dirload, 4); + obj->execaddr = adfs_readval(de->direxec, 4); + obj->size = adfs_readval(de->dirlen, 4); + obj->attr = de->newdiratts; + obj->filetype = -1; + + /* + * object is a file and is filetyped and timestamped? + * RISC OS 12-bit filetype is stored in load_address[19:8] + */ + if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) && + (0xfff00000 == (0xfff00000 & obj->loadaddr))) { + obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8); + + /* optionally append the ,xyz hex filetype suffix */ + if (ADFS_SB(dir->sb)->s_ftsuffix) + obj->name_len += + append_filetype_suffix( + &obj->name[obj->name_len], + obj->filetype); + } +} + +/* + * convert a Linux ADFS directory entry to a disk-based directory entry + */ +static inline void +adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj) +{ + adfs_writeval(de->dirinddiscadd, 3, obj->file_id); + adfs_writeval(de->dirload, 4, obj->loadaddr); + adfs_writeval(de->direxec, 4, obj->execaddr); + adfs_writeval(de->dirlen, 4, obj->size); + de->newdiratts = obj->attr; +} + +/* + * get a directory entry. Note that the caller is responsible + * for holding the relevant locks. 
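+ * Each on-disk entry is 26 bytes (struct adfs_direntry) and may
+ * straddle a block boundary, hence the two-part copy below.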
+ */ +static int +__adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj) +{ + struct super_block *sb = dir->sb; + struct adfs_direntry de; + int thissize, buffer, offset; + + buffer = pos >> sb->s_blocksize_bits; + + if (buffer > dir->nr_buffers) + return -EINVAL; + + offset = pos & (sb->s_blocksize - 1); + thissize = sb->s_blocksize - offset; + if (thissize > 26) + thissize = 26; + + memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); + if (thissize != 26) + memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, + 26 - thissize); + + if (!de.dirobname[0]) + return -ENOENT; + + adfs_dir2obj(dir, obj, &de); + + return 0; +} + +static int +__adfs_dir_put(struct adfs_dir *dir, int pos, struct object_info *obj) +{ + struct super_block *sb = dir->sb; + struct adfs_direntry de; + int thissize, buffer, offset; + + buffer = pos >> sb->s_blocksize_bits; + + if (buffer > dir->nr_buffers) + return -EINVAL; + + offset = pos & (sb->s_blocksize - 1); + thissize = sb->s_blocksize - offset; + if (thissize > 26) + thissize = 26; + + /* + * Get the entry in total + */ + memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); + if (thissize != 26) + memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, + 26 - thissize); + + /* + * update it + */ + adfs_obj2dir(&de, obj); + + /* + * Put the new entry back + */ + memcpy(dir->bh[buffer]->b_data + offset, &de, thissize); + if (thissize != 26) + memcpy(dir->bh[buffer + 1]->b_data, ((char *)&de) + thissize, + 26 - thissize); + + return 0; +} + +/* + * the caller is responsible for holding the necessary + * locks. + */ +static int +adfs_dir_find_entry(struct adfs_dir *dir, unsigned long object_id) +{ + int pos, ret; + + ret = -ENOENT; + + for (pos = 5; pos < ADFS_NUM_DIR_ENTRIES * 26 + 5; pos += 26) { + struct object_info obj; + + if (!__adfs_dir_get(dir, pos, &obj)) + break; + + if (obj.file_id == object_id) { + ret = pos; + break; + } + } + + return ret; +} + +static int +adfs_f_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir) +{ + int ret; + + if (sz != ADFS_NEWDIR_SIZE) + return -EIO; + + ret = adfs_dir_read(sb, id, sz, dir); + if (ret) + adfs_error(sb, "unable to read directory"); + else + dir->parent_id = adfs_readval(dir->dirtail.new.dirparent, 3); + + return ret; +} + +static int +adfs_f_setpos(struct adfs_dir *dir, unsigned int fpos) +{ + if (fpos >= ADFS_NUM_DIR_ENTRIES) + return -ENOENT; + + dir->pos = 5 + fpos * 26; + return 0; +} + +static int +adfs_f_getnext(struct adfs_dir *dir, struct object_info *obj) +{ + unsigned int ret; + + ret = __adfs_dir_get(dir, dir->pos, obj); + if (ret == 0) + dir->pos += 26; + + return ret; +} + +static int +adfs_f_update(struct adfs_dir *dir, struct object_info *obj) +{ + struct super_block *sb = dir->sb; + int ret, i; + + ret = adfs_dir_find_entry(dir, obj->file_id); + if (ret < 0) { + adfs_error(dir->sb, "unable to locate entry to update"); + goto out; + } + + __adfs_dir_put(dir, ret, obj); + + /* + * Increment directory sequence number + */ + dir->bh[0]->b_data[0] += 1; + dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 6] += 1; + + ret = adfs_dir_checkbyte(dir); + /* + * Update directory check byte + */ + dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 1] = ret; + +#if 1 + { + const unsigned int blocksize_bits = sb->s_blocksize_bits; + + memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); + memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); + + if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq 
|| + memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) + goto bad_dir; + + if (memcmp(&dir->dirhead.startname, "Nick", 4) && + memcmp(&dir->dirhead.startname, "Hugo", 4)) + goto bad_dir; + + if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) + goto bad_dir; + } +#endif + for (i = dir->nr_buffers - 1; i >= 0; i--) + mark_buffer_dirty(dir->bh[i]); + + ret = 0; +out: + return ret; +#if 1 +bad_dir: + adfs_error(dir->sb, "whoops! I broke a directory!"); + return -EIO; +#endif +} + +static int +adfs_f_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bh[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} + +static void +adfs_f_free(struct adfs_dir *dir) +{ + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + brelse(dir->bh[i]); + dir->bh[i] = NULL; + } + + dir->nr_buffers = 0; + dir->sb = NULL; +} + +struct adfs_dir_ops adfs_f_dir_ops = { + .read = adfs_f_read, + .setpos = adfs_f_setpos, + .getnext = adfs_f_getnext, + .update = adfs_f_update, + .sync = adfs_f_sync, + .free = adfs_f_free +}; diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h new file mode 100644 index 00000000..e4713404 --- /dev/null +++ b/fs/adfs/dir_f.h @@ -0,0 +1,65 @@ +/* + * linux/fs/adfs/dir_f.h + * + * Copyright (C) 1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Structures of directories on the F format disk + */ +#ifndef ADFS_DIR_F_H +#define ADFS_DIR_F_H + +/* + * Directory header + */ +struct adfs_dirheader { + unsigned char startmasseq; + unsigned char startname[4]; +}; + +#define ADFS_NEWDIR_SIZE 2048 +#define ADFS_NUM_DIR_ENTRIES 77 + +/* + * Directory entries + */ +struct adfs_direntry { +#define ADFS_F_NAME_LEN 10 + char dirobname[ADFS_F_NAME_LEN]; + __u8 dirload[4]; + __u8 direxec[4]; + __u8 dirlen[4]; + __u8 dirinddiscadd[3]; + __u8 newdiratts; +}; + +/* + * Directory tail + */ +union adfs_dirtail { + struct { + unsigned char dirlastmask; + char dirname[10]; + unsigned char dirparent[3]; + char dirtitle[19]; + unsigned char reserved[14]; + unsigned char endmasseq; + unsigned char endname[4]; + unsigned char dircheckbyte; + } old; + struct { + unsigned char dirlastmask; + unsigned char reserved[2]; + unsigned char dirparent[3]; + char dirtitle[19]; + char dirname[10]; + unsigned char endmasseq; + unsigned char endname[4]; + unsigned char dircheckbyte; + } new; +}; + +#endif diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c new file mode 100644 index 00000000..d9e3bee4 --- /dev/null +++ b/fs/adfs/dir_fplus.c @@ -0,0 +1,265 @@ +/* + * linux/fs/adfs/dir_fplus.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include "adfs.h" +#include "dir_fplus.h" + +static int +adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir) +{ + struct adfs_bigdirheader *h; + struct adfs_bigdirtail *t; + unsigned long block; + unsigned int blk, size; + int i, ret = -EIO; + + dir->nr_buffers = 0; + + /* start off using fixed bh set - only alloc for big dirs */ + dir->bh_fplus = &dir->bh[0]; + + block = __adfs_block_map(sb, id, 0); + if (!block) { + adfs_error(sb, "dir object %X has a hole at offset 0", id); + goto out; + } + + dir->bh_fplus[0] = sb_bread(sb, block); + if (!dir->bh_fplus[0]) + goto out; + dir->nr_buffers += 1; + + h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data; + size = le32_to_cpu(h->bigdirsize); + if (size != sz) { + printk(KERN_WARNING "adfs: adfs_fplus_read:" + " directory header size %X\n" + " does not match directory size %X\n", + size, sz); + } + + if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || + h->bigdirversion[2] != 0 || size & 2047 || + h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) { + printk(KERN_WARNING "adfs: dir object %X has" + " malformed dir header\n", id); + goto out; + } + + size >>= sb->s_blocksize_bits; + if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) { + /* this directory is too big for fixed bh set, must allocate */ + struct buffer_head **bh_fplus = + kzalloc(size * sizeof(struct buffer_head *), + GFP_KERNEL); + if (!bh_fplus) { + adfs_error(sb, "not enough memory for" + " dir object %X (%d blocks)", id, size); + goto out; + } + dir->bh_fplus = bh_fplus; + /* copy over the pointer to the block that we've already read */ + dir->bh_fplus[0] = dir->bh[0]; + } + + for (blk = 1; blk < size; blk++) { + block = __adfs_block_map(sb, id, blk); + if (!block) { + adfs_error(sb, "dir object %X has a hole at offset %d", id, blk); + goto out; + } + + dir->bh_fplus[blk] = sb_bread(sb, block); + if (!dir->bh_fplus[blk]) { + adfs_error(sb, "dir object %X failed read for" + " offset %d, mapped block %X", + id, blk, block); + goto out; + } + + dir->nr_buffers += 1; + } + + t = (struct adfs_bigdirtail *) + (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8)); + + if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || + t->bigdirendmasseq != h->startmasseq || + t->reserved[0] != 0 || t->reserved[1] != 0) { + printk(KERN_WARNING "adfs: dir object %X has " + "malformed dir end\n", id); + goto out; + } + + dir->parent_id = le32_to_cpu(h->bigdirparent); + dir->sb = sb; + return 0; + +out: + if (dir->bh_fplus) { + for (i = 0; i < dir->nr_buffers; i++) + brelse(dir->bh_fplus[i]); + + if (&dir->bh[0] != dir->bh_fplus) + kfree(dir->bh_fplus); + + dir->bh_fplus = NULL; + } + + dir->nr_buffers = 0; + dir->sb = NULL; + return ret; +} + +static int +adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) +{ + struct adfs_bigdirheader *h = + (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; + int ret = -ENOENT; + + if (fpos <= le32_to_cpu(h->bigdirentries)) { + dir->pos = fpos; + ret = 0; + } + + return ret; +} + +static void +dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len) +{ + struct super_block *sb = dir->sb; + unsigned int buffer, partial, remainder; + + buffer = offset >> sb->s_blocksize_bits; + offset &= sb->s_blocksize - 1; + + partial = sb->s_blocksize - offset; + + if (partial >= len) + memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len); + else { + char *c = (char *)to; + + remainder = len - partial; + + memcpy(c, + dir->bh_fplus[buffer]->b_data + offset, + partial); 
+ + memcpy(c + partial, + dir->bh_fplus[buffer + 1]->b_data, + remainder); + } +} + +static int +adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) +{ + struct adfs_bigdirheader *h = + (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; + struct adfs_bigdirentry bde; + unsigned int offset; + int i, ret = -ENOENT; + + if (dir->pos >= le32_to_cpu(h->bigdirentries)) + goto out; + + offset = offsetof(struct adfs_bigdirheader, bigdirname); + offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); + offset += dir->pos * sizeof(struct adfs_bigdirentry); + + dir_memcpy(dir, offset, &bde, sizeof(struct adfs_bigdirentry)); + + obj->loadaddr = le32_to_cpu(bde.bigdirload); + obj->execaddr = le32_to_cpu(bde.bigdirexec); + obj->size = le32_to_cpu(bde.bigdirlen); + obj->file_id = le32_to_cpu(bde.bigdirindaddr); + obj->attr = le32_to_cpu(bde.bigdirattr); + obj->name_len = le32_to_cpu(bde.bigdirobnamelen); + + offset = offsetof(struct adfs_bigdirheader, bigdirname); + offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); + offset += le32_to_cpu(h->bigdirentries) * sizeof(struct adfs_bigdirentry); + offset += le32_to_cpu(bde.bigdirobnameptr); + + dir_memcpy(dir, offset, obj->name, obj->name_len); + for (i = 0; i < obj->name_len; i++) + if (obj->name[i] == '/') + obj->name[i] = '.'; + + obj->filetype = -1; + + /* + * object is a file and is filetyped and timestamped? + * RISC OS 12-bit filetype is stored in load_address[19:8] + */ + if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) && + (0xfff00000 == (0xfff00000 & obj->loadaddr))) { + obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8); + + /* optionally append the ,xyz hex filetype suffix */ + if (ADFS_SB(dir->sb)->s_ftsuffix) + obj->name_len += + append_filetype_suffix( + &obj->name[obj->name_len], + obj->filetype); + } + + dir->pos += 1; + ret = 0; +out: + return ret; +} + +static int +adfs_fplus_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bh_fplus[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} + +static void +adfs_fplus_free(struct adfs_dir *dir) +{ + int i; + + if (dir->bh_fplus) { + for (i = 0; i < dir->nr_buffers; i++) + brelse(dir->bh_fplus[i]); + + if (&dir->bh[0] != dir->bh_fplus) + kfree(dir->bh_fplus); + + dir->bh_fplus = NULL; + } + + dir->nr_buffers = 0; + dir->sb = NULL; +} + +struct adfs_dir_ops adfs_fplus_dir_ops = { + .read = adfs_fplus_read, + .setpos = adfs_fplus_setpos, + .getnext = adfs_fplus_getnext, + .sync = adfs_fplus_sync, + .free = adfs_fplus_free +}; diff --git a/fs/adfs/dir_fplus.h b/fs/adfs/dir_fplus.h new file mode 100644 index 00000000..b55aa41a --- /dev/null +++ b/fs/adfs/dir_fplus.h @@ -0,0 +1,45 @@ +/* + * linux/fs/adfs/dir_fplus.h + * + * Copyright (C) 1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Structures of directories on the F+ format disk + */ + +#define ADFS_FPLUS_NAME_LEN 255 + +#define BIGDIRSTARTNAME ('S' | 'B' << 8 | 'P' << 16 | 'r' << 24) +#define BIGDIRENDNAME ('o' | 'v' << 8 | 'e' << 16 | 'n' << 24) + +struct adfs_bigdirheader { + __u8 startmasseq; + __u8 bigdirversion[3]; + __le32 bigdirstartname; + __le32 bigdirnamelen; + __le32 bigdirsize; + __le32 bigdirentries; + __le32 bigdirnamesize; + __le32 bigdirparent; + char bigdirname[1]; +}; + +struct adfs_bigdirentry { + __le32 bigdirload; + __le32 bigdirexec; + __le32 bigdirlen; + __le32 bigdirindaddr; + __le32 bigdirattr; + __le32 bigdirobnamelen; + __le32 bigdirobnameptr; +}; + +struct adfs_bigdirtail { + __le32 bigdirendname; + __u8 bigdirendmasseq; + __u8 reserved[2]; + __u8 bigdircheckbyte; +}; diff --git a/fs/adfs/file.c b/fs/adfs/file.c new file mode 100644 index 00000000..a36da538 --- /dev/null +++ b/fs/adfs/file.c @@ -0,0 +1,37 @@ +/* + * linux/fs/adfs/file.c + * + * Copyright (C) 1997-1999 Russell King + * from: + * + * linux/fs/ext2/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * adfs regular file handling primitives + */ +#include "adfs.h" + +const struct file_operations adfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations adfs_file_inode_operations = { + .setattr = adfs_notify_change, +}; diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c new file mode 100644 index 00000000..d5250c5a --- /dev/null +++ b/fs/adfs/inode.c @@ -0,0 +1,366 @@ +/* + * linux/fs/adfs/inode.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include "adfs.h" + +/* + * Lookup/Create a block at offset 'block' into 'inode'. We currently do + * not support creation of new blocks, so we return -EIO for this case. 
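+ * A lookup beyond the end of the file leaves the buffer_head unmapped
+ * and returns success, so the caller sees a hole rather than an error.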
+ */ +static int +adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, + int create) +{ + if (!create) { + if (block >= inode->i_blocks) + goto abort_toobig; + + block = __adfs_block_map(inode->i_sb, inode->i_ino, block); + if (block) + map_bh(bh, inode->i_sb, block); + return 0; + } + /* don't support allocation of blocks yet */ + return -EIO; + +abort_toobig: + return 0; +} + +static int adfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, adfs_get_block, wbc); +} + +static int adfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, adfs_get_block); +} + +static int adfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + adfs_get_block, + &ADFS_I(mapping->host)->mmu_private); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, adfs_get_block); +} + +static const struct address_space_operations adfs_aops = { + .readpage = adfs_readpage, + .writepage = adfs_writepage, + .write_begin = adfs_write_begin, + .write_end = generic_write_end, + .bmap = _adfs_bmap +}; + +/* + * Convert ADFS attributes and filetype to Linux permission. + */ +static umode_t +adfs_atts2mode(struct super_block *sb, struct inode *inode) +{ + unsigned int attr = ADFS_I(inode)->attr; + umode_t mode, rmask; + struct adfs_sb_info *asb = ADFS_SB(sb); + + if (attr & ADFS_NDA_DIRECTORY) { + mode = S_IRUGO & asb->s_owner_mask; + return S_IFDIR | S_IXUGO | mode; + } + + switch (ADFS_I(inode)->filetype) { + case 0xfc0: /* LinkFS */ + return S_IFLNK|S_IRWXUGO; + + case 0xfe6: /* UnixExec */ + rmask = S_IRUGO | S_IXUGO; + break; + + default: + rmask = S_IRUGO; + } + + mode = S_IFREG; + + if (attr & ADFS_NDA_OWNER_READ) + mode |= rmask & asb->s_owner_mask; + + if (attr & ADFS_NDA_OWNER_WRITE) + mode |= S_IWUGO & asb->s_owner_mask; + + if (attr & ADFS_NDA_PUBLIC_READ) + mode |= rmask & asb->s_other_mask; + + if (attr & ADFS_NDA_PUBLIC_WRITE) + mode |= S_IWUGO & asb->s_other_mask; + return mode; +} + +/* + * Convert Linux permission to ADFS attribute. We try to do the reverse + * of atts2mode, but there is not a 1:1 translation. + */ +static int +adfs_mode2atts(struct super_block *sb, struct inode *inode) +{ + umode_t mode; + int attr; + struct adfs_sb_info *asb = ADFS_SB(sb); + + /* FIXME: should we be able to alter a link? */ + if (S_ISLNK(inode->i_mode)) + return ADFS_I(inode)->attr; + + if (S_ISDIR(inode->i_mode)) + attr = ADFS_NDA_DIRECTORY; + else + attr = 0; + + mode = inode->i_mode & asb->s_owner_mask; + if (mode & S_IRUGO) + attr |= ADFS_NDA_OWNER_READ; + if (mode & S_IWUGO) + attr |= ADFS_NDA_OWNER_WRITE; + + mode = inode->i_mode & asb->s_other_mask; + mode &= ~asb->s_owner_mask; + if (mode & S_IRUGO) + attr |= ADFS_NDA_PUBLIC_READ; + if (mode & S_IWUGO) + attr |= ADFS_NDA_PUBLIC_WRITE; + + return attr; +} + +/* + * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time + * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds + * of time to convert from RISC OS epoch to Unix epoch. 
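+ * The 40-bit value is split across the RISC OS load and exec
+ * addresses: the low byte of the load address holds the top 8 bits
+ * and the exec address holds the remaining 32 bits.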
+ */ +static void +adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) +{ + unsigned int high, low; + /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since + * 01 Jan 1900 00:00:00 (RISC OS epoch) + */ + static const s64 nsec_unix_epoch_diff_risc_os_epoch = + 2208988800000000000LL; + s64 nsec; + + if (ADFS_I(inode)->stamped == 0) + goto cur_time; + + high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */ + low = ADFS_I(inode)->execaddr; /* bottom 32 bits of timestamp */ + + /* convert 40-bit centi-seconds to 32-bit seconds + * going via nanoseconds to retain precision + */ + nsec = (((s64) high << 32) | (s64) low) * 10000000; /* cs to ns */ + + /* Files dated pre 01 Jan 1970 00:00:00. */ + if (nsec < nsec_unix_epoch_diff_risc_os_epoch) + goto too_early; + + /* convert from RISC OS to Unix epoch */ + nsec -= nsec_unix_epoch_diff_risc_os_epoch; + + *tv = ns_to_timespec(nsec); + return; + + cur_time: + *tv = CURRENT_TIME; + return; + + too_early: + tv->tv_sec = tv->tv_nsec = 0; + return; +} + +/* + * Convert an Unix time to ADFS time. We only do this if the entry has a + * time/date stamp already. + */ +static void +adfs_unix2adfs_time(struct inode *inode, unsigned int secs) +{ + unsigned int high, low; + + if (ADFS_I(inode)->stamped) { + /* convert 32-bit seconds to 40-bit centi-seconds */ + low = (secs & 255) * 100; + high = (secs / 256) * 100 + (low >> 8) + 0x336e996a; + + ADFS_I(inode)->loadaddr = (high >> 24) | + (ADFS_I(inode)->loadaddr & ~0xff); + ADFS_I(inode)->execaddr = (low & 255) | (high << 8); + } +} + +/* + * Fill in the inode information from the object information. + * + * Note that this is an inode-less filesystem, so we can't use the inode + * number to reference the metadata on the media. Instead, we use the + * inode number to hold the object ID, which in turn will tell us where + * the data is held. We also save the parent object ID, and with these + * two, we can locate the metadata. + * + * This does mean that we rely on an objects parent remaining the same at + * all times - we cannot cope with a cross-directory rename (yet). + */ +struct inode * +adfs_iget(struct super_block *sb, struct object_info *obj) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + goto out; + + inode->i_uid = ADFS_SB(sb)->s_uid; + inode->i_gid = ADFS_SB(sb)->s_gid; + inode->i_ino = obj->file_id; + inode->i_size = obj->size; + inode->i_nlink = 2; + inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + /* + * we need to save the parent directory ID so that + * write_inode can update the directory information + * for this file. This will need special handling + * for cross-directory renames. 
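+ * An object is treated as date/time stamped when the top twelve bits
+ * of its load address are all set; the 'stamped' flag derived below
+ * records this for the time conversion helpers.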
+ */ + ADFS_I(inode)->parent_id = obj->parent_id; + ADFS_I(inode)->loadaddr = obj->loadaddr; + ADFS_I(inode)->execaddr = obj->execaddr; + ADFS_I(inode)->attr = obj->attr; + ADFS_I(inode)->filetype = obj->filetype; + ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); + + inode->i_mode = adfs_atts2mode(sb, inode); + adfs_adfs2unix_time(&inode->i_mtime, inode); + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + + if (S_ISDIR(inode->i_mode)) { + inode->i_op = &adfs_dir_inode_operations; + inode->i_fop = &adfs_dir_operations; + } else if (S_ISREG(inode->i_mode)) { + inode->i_op = &adfs_file_inode_operations; + inode->i_fop = &adfs_file_operations; + inode->i_mapping->a_ops = &adfs_aops; + ADFS_I(inode)->mmu_private = inode->i_size; + } + + insert_inode_hash(inode); + +out: + return inode; +} + +/* + * Validate and convert a changed access mode/time to their ADFS equivalents. + * adfs_write_inode will actually write the information back to the directory + * later. + */ +int +adfs_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + + /* + * we can't change the UID or GID of any file - + * we have a global UID/GID in the superblock + */ + if ((ia_valid & ATTR_UID && attr->ia_uid != ADFS_SB(sb)->s_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != ADFS_SB(sb)->s_gid)) + error = -EPERM; + + if (error) + goto out; + + /* XXX: this is missing some actual on-disk truncation.. */ + if (ia_valid & ATTR_SIZE) + truncate_setsize(inode, attr->ia_size); + + if (ia_valid & ATTR_MTIME) { + inode->i_mtime = attr->ia_mtime; + adfs_unix2adfs_time(inode, attr->ia_mtime.tv_sec); + } + /* + * FIXME: should we make these == to i_mtime since we don't + * have the ability to represent them in our filesystem? + */ + if (ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + if (ia_valid & ATTR_MODE) { + ADFS_I(inode)->attr = adfs_mode2atts(sb, inode); + inode->i_mode = adfs_atts2mode(sb, inode); + } + + /* + * FIXME: should we be marking this inode dirty even if + * we don't have any metadata to write back? + */ + if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE)) + mark_inode_dirty(inode); +out: + return error; +} + +/* + * write an existing inode back to the directory, and therefore the disk. + * The adfs-specific inode data has already been updated by + * adfs_notify_change() + */ +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct super_block *sb = inode->i_sb; + struct object_info obj; + int ret; + + obj.file_id = inode->i_ino; + obj.name_len = 0; + obj.parent_id = ADFS_I(inode)->parent_id; + obj.loadaddr = ADFS_I(inode)->loadaddr; + obj.execaddr = ADFS_I(inode)->execaddr; + obj.attr = ADFS_I(inode)->attr; + obj.size = inode->i_size; + + ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); + return ret; +} diff --git a/fs/adfs/map.c b/fs/adfs/map.c new file mode 100644 index 00000000..6935f052 --- /dev/null +++ b/fs/adfs/map.c @@ -0,0 +1,290 @@ +/* + * linux/fs/adfs/map.c + * + * Copyright (C) 1997-2002 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include "adfs.h" + +/* + * The ADFS map is basically a set of sectors. Each sector is called a + * zone which contains a bitstream made up of variable sized fragments. + * Each bit refers to a set of bytes in the filesystem, defined by + * log2bpmb. This may be larger or smaller than the sector size, but + * the overall size it describes will always be a round number of + * sectors. A fragment id is always idlen bits long. + * + * < idlen > < n > <1> + * +---------+-------//---------+---+ + * | frag id | 0000....000000 | 1 | + * +---------+-------//---------+---+ + * + * The physical disk space used by a fragment is taken from the start of + * the fragment id up to and including the '1' bit - ie, idlen + n + 1 + * bits. + * + * A fragment id can be repeated multiple times in the whole map for + * large or fragmented files. The first map zone a fragment starts in + * is given by fragment id / ids_per_zone - this allows objects to start + * from any zone on the disk. + * + * Free space is described by a linked list of fragments. Each free + * fragment describes free space in the same way as the other fragments, + * however, the frag id specifies an offset (in map bits) from the end + * of this fragment to the start of the next free fragment. + * + * Objects stored on the disk are allocated object ids (we use these as + * our inode numbers.) Object ids contain a fragment id and an optional + * offset. This allows a directory fragment to contain small files + * associated with that directory. + */ + +/* + * For the future... + */ +static DEFINE_RWLOCK(adfs_map_lock); + +/* + * This is fun. We need to load up to 19 bits from the map at an + * arbitrary bit alignment. (We're limited to 19 bits by F+ version 2). + */ +#define GET_FRAG_ID(_map,_start,_idmask) \ + ({ \ + unsigned char *_m = _map + (_start >> 3); \ + u32 _frag = get_unaligned_le32(_m); \ + _frag >>= (_start & 7); \ + _frag & _idmask; \ + }) + +/* + * return the map bit offset of the fragment frag_id in the zone dm. + * Note that the loop is optimised for best asm code - look at the + * output of: + * gcc -D__KERNEL__ -O2 -I../../include -o - -S map.c + */ +static int +lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen, + const unsigned int frag_id, unsigned int *offset) +{ + const unsigned int mapsize = dm->dm_endbit; + const u32 idmask = (1 << idlen) - 1; + unsigned char *map = dm->dm_bh->b_data + 4; + unsigned int start = dm->dm_startbit; + unsigned int mapptr; + u32 frag; + + do { + frag = GET_FRAG_ID(map, start, idmask); + mapptr = start + idlen; + + /* + * find end of fragment + */ + { + __le32 *_map = (__le32 *)map; + u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); + while (v == 0) { + mapptr = (mapptr & ~31) + 32; + if (mapptr >= mapsize) + goto error; + v = le32_to_cpu(_map[mapptr >> 5]); + } + + mapptr += 1 + ffz(~v); + } + + if (frag == frag_id) + goto found; +again: + start = mapptr; + } while (mapptr < mapsize); + return -1; + +error: + printk(KERN_ERR "adfs: oversized fragment 0x%x at 0x%x-0x%x\n", + frag, start, mapptr); + return -1; + +found: + { + int length = mapptr - start; + if (*offset >= length) { + *offset -= length; + goto again; + } + } + return start + *offset; +} + +/* + * Scan the free space map, for this zone, calculating the total + * number of map bits in each free space fragment. 
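+ * The caller sums the per-zone totals and converts map bits to
+ * blocks with signed_asl(), so the value returned here is still in
+ * map-bit units.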
+ * + * Note: idmask is limited to 15 bits [3.2] + */ +static unsigned int +scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm) +{ + const unsigned int mapsize = dm->dm_endbit + 32; + const unsigned int idlen = asb->s_idlen; + const unsigned int frag_idlen = idlen <= 15 ? idlen : 15; + const u32 idmask = (1 << frag_idlen) - 1; + unsigned char *map = dm->dm_bh->b_data; + unsigned int start = 8, mapptr; + u32 frag; + unsigned long total = 0; + + /* + * get fragment id + */ + frag = GET_FRAG_ID(map, start, idmask); + + /* + * If the freelink is null, then no free fragments + * exist in this zone. + */ + if (frag == 0) + return 0; + + do { + start += frag; + + /* + * get fragment id + */ + frag = GET_FRAG_ID(map, start, idmask); + mapptr = start + idlen; + + /* + * find end of fragment + */ + { + __le32 *_map = (__le32 *)map; + u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); + while (v == 0) { + mapptr = (mapptr & ~31) + 32; + if (mapptr >= mapsize) + goto error; + v = le32_to_cpu(_map[mapptr >> 5]); + } + + mapptr += 1 + ffz(~v); + } + + total += mapptr - start; + } while (frag >= idlen + 1); + + if (frag != 0) + printk(KERN_ERR "adfs: undersized free fragment\n"); + + return total; +error: + printk(KERN_ERR "adfs: oversized free fragment\n"); + return 0; +} + +static int +scan_map(struct adfs_sb_info *asb, unsigned int zone, + const unsigned int frag_id, unsigned int mapoff) +{ + const unsigned int idlen = asb->s_idlen; + struct adfs_discmap *dm, *dm_end; + int result; + + dm = asb->s_map + zone; + zone = asb->s_map_size; + dm_end = asb->s_map + zone; + + do { + result = lookup_zone(dm, idlen, frag_id, &mapoff); + + if (result != -1) + goto found; + + dm ++; + if (dm == dm_end) + dm = asb->s_map; + } while (--zone > 0); + + return -1; +found: + result -= dm->dm_startbit; + result += dm->dm_startblk; + + return result; +} + +/* + * calculate the amount of free blocks in the map. + * + * n=1 + * total_free = E(free_in_zone_n) + * nzones + */ +unsigned int +adfs_map_free(struct super_block *sb) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discmap *dm; + unsigned int total = 0; + unsigned int zone; + + dm = asb->s_map; + zone = asb->s_map_size; + + do { + total += scan_free_map(asb, dm++); + } while (--zone > 0); + + return signed_asl(total, asb->s_map2blk); +} + +int +adfs_map_lookup(struct super_block *sb, unsigned int frag_id, + unsigned int offset) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + unsigned int zone, mapoff; + int result; + + /* + * map & root fragment is special - it starts in the center of the + * disk. 
The other fragments start at zone (frag / ids_per_zone) + */ + if (frag_id == ADFS_ROOT_FRAG) + zone = asb->s_map_size >> 1; + else + zone = frag_id / asb->s_ids_per_zone; + + if (zone >= asb->s_map_size) + goto bad_fragment; + + /* Convert sector offset to map offset */ + mapoff = signed_asl(offset, -asb->s_map2blk); + + read_lock(&adfs_map_lock); + result = scan_map(asb, zone, frag_id, mapoff); + read_unlock(&adfs_map_lock); + + if (result > 0) { + unsigned int secoff; + + /* Calculate sector offset into map block */ + secoff = offset - signed_asl(mapoff, asb->s_map2blk); + return secoff + signed_asl(result, asb->s_map2blk); + } + + adfs_error(sb, "fragment 0x%04x at offset %d not found in map", + frag_id, offset); + return 0; + +bad_fragment: + adfs_error(sb, "invalid fragment 0x%04x (zone = %d, max = %d)", + frag_id, zone, asb->s_map_size); + return 0; +} diff --git a/fs/adfs/super.c b/fs/adfs/super.c new file mode 100644 index 00000000..c8bf36a1 --- /dev/null +++ b/fs/adfs/super.c @@ -0,0 +1,543 @@ +/* + * linux/fs/adfs/super.c + * + * Copyright (C) 1997-1999 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "adfs.h" +#include "dir_f.h" +#include "dir_fplus.h" + +#define ADFS_DEFAULT_OWNER_MASK S_IRWXU +#define ADFS_DEFAULT_OTHER_MASK (S_IRWXG | S_IRWXO) + +void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...) +{ + char error_buf[128]; + va_list args; + + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + + printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %s\n", + sb->s_id, function ? ": " : "", + function ? function : "", error_buf); +} + +static int adfs_checkdiscrecord(struct adfs_discrecord *dr) +{ + int i; + + /* sector size must be 256, 512 or 1024 bytes */ + if (dr->log2secsize != 8 && + dr->log2secsize != 9 && + dr->log2secsize != 10) + return 1; + + /* idlen must be at least log2secsize + 3 */ + if (dr->idlen < dr->log2secsize + 3) + return 1; + + /* we cannot have such a large disc that we + * are unable to represent sector offsets in + * 32 bits. This works out at 2.0 TB. 
+ */ + if (le32_to_cpu(dr->disc_size_high) >> dr->log2secsize) + return 1; + + /* idlen must be no greater than 19 v2 [1.0] */ + if (dr->idlen > 19) + return 1; + + /* reserved bytes should be zero */ + for (i = 0; i < sizeof(dr->unused52); i++) + if (dr->unused52[i] != 0) + return 1; + + return 0; +} + +static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) +{ + unsigned int v0, v1, v2, v3; + int i; + + v0 = v1 = v2 = v3 = 0; + for (i = sb->s_blocksize - 4; i; i -= 4) { + v0 += map[i] + (v3 >> 8); + v3 &= 0xff; + v1 += map[i + 1] + (v0 >> 8); + v0 &= 0xff; + v2 += map[i + 2] + (v1 >> 8); + v1 &= 0xff; + v3 += map[i + 3] + (v2 >> 8); + v2 &= 0xff; + } + v0 += v3 >> 8; + v1 += map[1] + (v0 >> 8); + v2 += map[2] + (v1 >> 8); + v3 += map[3] + (v2 >> 8); + + return v0 ^ v1 ^ v2 ^ v3; +} + +static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) +{ + unsigned char crosscheck = 0, zonecheck = 1; + int i; + + for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { + unsigned char *map; + + map = dm[i].dm_bh->b_data; + + if (adfs_calczonecheck(sb, map) != map[0]) { + adfs_error(sb, "zone %d fails zonecheck", i); + zonecheck = 0; + } + crosscheck ^= map[3]; + } + if (crosscheck != 0xff) + adfs_error(sb, "crosscheck != 0xff"); + return crosscheck == 0xff && zonecheck; +} + +static void adfs_put_super(struct super_block *sb) +{ + int i; + struct adfs_sb_info *asb = ADFS_SB(sb); + + for (i = 0; i < asb->s_map_size; i++) + brelse(asb->s_map[i].dm_bh); + kfree(asb->s_map); + kfree(asb); + sb->s_fs_info = NULL; +} + +static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct adfs_sb_info *asb = ADFS_SB(mnt->mnt_sb); + + if (asb->s_uid != 0) + seq_printf(seq, ",uid=%u", asb->s_uid); + if (asb->s_gid != 0) + seq_printf(seq, ",gid=%u", asb->s_gid); + if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK) + seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); + if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) + seq_printf(seq, ",othmask=%o", asb->s_other_mask); + if (asb->s_ftsuffix != 0) + seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix); + + return 0; +} + +enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_ownmask, "ownmask=%o"}, + {Opt_othmask, "othmask=%o"}, + {Opt_ftsuffix, "ftsuffix=%u"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options) +{ + char *p; + struct adfs_sb_info *asb = ADFS_SB(sb); + int option; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(args, &option)) + return -EINVAL; + asb->s_uid = option; + break; + case Opt_gid: + if (match_int(args, &option)) + return -EINVAL; + asb->s_gid = option; + break; + case Opt_ownmask: + if (match_octal(args, &option)) + return -EINVAL; + asb->s_owner_mask = option; + break; + case Opt_othmask: + if (match_octal(args, &option)) + return -EINVAL; + asb->s_other_mask = option; + break; + case Opt_ftsuffix: + if (match_int(args, &option)) + return -EINVAL; + asb->s_ftsuffix = option; + break; + default: + printk("ADFS-fs: unrecognised mount option \"%s\" " + "or missing value\n", p); + return -EINVAL; + } + } + return 0; +} + +static int adfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NODIRATIME; + return parse_options(sb, 
data); +} + +static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct adfs_sb_info *sbi = ADFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ADFS_SUPER_MAGIC; + buf->f_namelen = sbi->s_namelen; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_size; + buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size; + buf->f_bavail = + buf->f_bfree = adfs_map_free(sb); + buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static struct kmem_cache *adfs_inode_cachep; + +static struct inode *adfs_alloc_inode(struct super_block *sb) +{ + struct adfs_inode_info *ei; + ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void adfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); +} + +static void adfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, adfs_i_callback); +} + +static void init_once(void *foo) +{ + struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", + sizeof(struct adfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (adfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(adfs_inode_cachep); +} + +static const struct super_operations adfs_sops = { + .alloc_inode = adfs_alloc_inode, + .destroy_inode = adfs_destroy_inode, + .write_inode = adfs_write_inode, + .put_super = adfs_put_super, + .statfs = adfs_statfs, + .remount_fs = adfs_remount, + .show_options = adfs_show_options, +}; + +static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) +{ + struct adfs_discmap *dm; + unsigned int map_addr, zone_size, nzones; + int i, zone; + struct adfs_sb_info *asb = ADFS_SB(sb); + + nzones = asb->s_map_size; + zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); + map_addr = (nzones >> 1) * zone_size - + ((nzones > 1) ? 
ADFS_DR_SIZE_BITS : 0); + map_addr = signed_asl(map_addr, asb->s_map2blk); + + asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); + + dm = kmalloc(nzones * sizeof(*dm), GFP_KERNEL); + if (dm == NULL) { + adfs_error(sb, "not enough memory"); + return NULL; + } + + for (zone = 0; zone < nzones; zone++, map_addr++) { + dm[zone].dm_startbit = 0; + dm[zone].dm_endbit = zone_size; + dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; + dm[zone].dm_bh = sb_bread(sb, map_addr); + + if (!dm[zone].dm_bh) { + adfs_error(sb, "unable to read map"); + goto error_free; + } + } + + /* adjust the limits for the first and last map zones */ + i = zone - 1; + dm[0].dm_startblk = 0; + dm[0].dm_startbit = ADFS_DR_SIZE_BITS; + dm[i].dm_endbit = (le32_to_cpu(dr->disc_size_high) << (32 - dr->log2bpmb)) + + (le32_to_cpu(dr->disc_size) >> dr->log2bpmb) + + (ADFS_DR_SIZE_BITS - i * zone_size); + + if (adfs_checkmap(sb, dm)) + return dm; + + adfs_error(sb, "map corrupted"); + +error_free: + while (--zone >= 0) + brelse(dm[zone].dm_bh); + + kfree(dm); + return NULL; +} + +static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int block_bits) +{ + unsigned long discsize; + + discsize = le32_to_cpu(dr->disc_size_high) << (32 - block_bits); + discsize |= le32_to_cpu(dr->disc_size) >> block_bits; + + return discsize; +} + +static int adfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct adfs_discrecord *dr; + struct buffer_head *bh; + struct object_info root_obj; + unsigned char *b_data; + struct adfs_sb_info *asb; + struct inode *root; + + sb->s_flags |= MS_NODIRATIME; + + asb = kzalloc(sizeof(*asb), GFP_KERNEL); + if (!asb) + return -ENOMEM; + sb->s_fs_info = asb; + + /* set default options */ + asb->s_uid = 0; + asb->s_gid = 0; + asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; + asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; + asb->s_ftsuffix = 0; + + if (parse_options(sb, data)) + goto error; + + sb_set_blocksize(sb, BLOCK_SIZE); + if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) { + adfs_error(sb, "unable to read superblock"); + goto error; + } + + b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE); + + if (adfs_checkbblk(b_data)) { + if (!silent) + printk("VFS: Can't find an adfs filesystem on dev " + "%s.\n", sb->s_id); + goto error_free_bh; + } + + dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); + + /* + * Do some sanity checks on the ADFS disc record + */ + if (adfs_checkdiscrecord(dr)) { + if (!silent) + printk("VPS: Can't find an adfs filesystem on dev " + "%s.\n", sb->s_id); + goto error_free_bh; + } + + brelse(bh); + if (sb_set_blocksize(sb, 1 << dr->log2secsize)) { + bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize); + if (!bh) { + adfs_error(sb, "couldn't read superblock on " + "2nd try."); + goto error; + } + b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); + if (adfs_checkbblk(b_data)) { + adfs_error(sb, "disc record mismatch, very weird!"); + goto error_free_bh; + } + dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); + } else { + if (!silent) + printk(KERN_ERR "VFS: Unsupported blocksize on dev " + "%s.\n", sb->s_id); + goto error; + } + + /* + * blocksize on this device should now be set to the ADFS log2secsize + */ + + sb->s_magic = ADFS_SUPER_MAGIC; + asb->s_idlen = dr->idlen; + asb->s_map_size = dr->nzones | (dr->nzones_high << 8); + asb->s_map2blk = dr->log2bpmb - dr->log2secsize; + asb->s_size = adfs_discsize(dr, sb->s_blocksize_bits); + asb->s_version = dr->format_version; + asb->s_log2sharesize = dr->log2sharesize; 
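+	/*
+	 * Editor's note -- an illustrative aside, not part of the original
+	 * commit.  s_map2blk, set above as log2bpmb - log2secsize, is the
+	 * signed log2 ratio between the map granularity and the sector size.
+	 * As a hypothetical example, with log2secsize = 9 (512-byte sectors)
+	 * and log2bpmb = 12 (each map bit describing 4096 bytes), s_map2blk
+	 * is 3, so one map bit spans 2^3 = 8 sectors.  signed_asl(x, s_map2blk)
+	 * then converts map units to sectors (shift left), and
+	 * signed_asl(x, -s_map2blk) converts sectors back to map units (shift
+	 * right), as used by adfs_map_lookup() and adfs_map_free() earlier in
+	 * this patch; a negative s_map2blk simply reverses the shifts.
+	 */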
+ + asb->s_map = adfs_read_map(sb, dr); + if (!asb->s_map) + goto error_free_bh; + + brelse(bh); + + /* + * set up enough so that we can read an inode + */ + sb->s_op = &adfs_sops; + + dr = (struct adfs_discrecord *)(asb->s_map[0].dm_bh->b_data + 4); + + root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root); + root_obj.name_len = 0; + /* Set root object date as 01 Jan 1987 00:00:00 */ + root_obj.loadaddr = 0xfff0003f; + root_obj.execaddr = 0xec22c000; + root_obj.size = ADFS_NEWDIR_SIZE; + root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ | + ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ; + root_obj.filetype = -1; + + /* + * If this is a F+ disk with variable length directories, + * get the root_size from the disc record. + */ + if (asb->s_version) { + root_obj.size = le32_to_cpu(dr->root_size); + asb->s_dir = &adfs_fplus_dir_ops; + asb->s_namelen = ADFS_FPLUS_NAME_LEN; + } else { + asb->s_dir = &adfs_f_dir_ops; + asb->s_namelen = ADFS_F_NAME_LEN; + } + /* + * ,xyz hex filetype suffix may be added by driver + * to files that have valid RISC OS filetype + */ + if (asb->s_ftsuffix) + asb->s_namelen += 4; + + sb->s_d_op = &adfs_dentry_operations; + root = adfs_iget(sb, &root_obj); + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + int i; + iput(root); + for (i = 0; i < asb->s_map_size; i++) + brelse(asb->s_map[i].dm_bh); + kfree(asb->s_map); + adfs_error(sb, "get root inode failed\n"); + goto error; + } + return 0; + +error_free_bh: + brelse(bh); +error: + sb->s_fs_info = NULL; + kfree(asb); + return -EINVAL; +} + +static struct dentry *adfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super); +} + +static struct file_system_type adfs_fs_type = { + .owner = THIS_MODULE, + .name = "adfs", + .mount = adfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_adfs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&adfs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_adfs_fs(void) +{ + unregister_filesystem(&adfs_fs_type); + destroy_inodecache(); +} + +module_init(init_adfs_fs) +module_exit(exit_adfs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/affs/Changes b/fs/affs/Changes new file mode 100644 index 00000000..a29409c1 --- /dev/null +++ b/fs/affs/Changes @@ -0,0 +1,343 @@ +(Note: I consider version numbers as cheap. That means +that I do not like numbers like 0.1 and the like for +things that can be used since quite some time. But +then, 3.1 doesn't mean 'perfectly stable', too.) + +Known bugs: +----------- + +- Doesn't work on the alpha. The only 64/32-bit + problem that I'm aware of (pointer/int conversion + in readdir()) gives compiler warnings but is + apparently not causing the failure, as directory + reads basically work (but all files are of size 0). + Alas, I've got no alpha to debug. :-( + +- The partition checker (drivers/block/genhd.c) + doesn't work with devices which have 256 byte + blocks (some very old SCSI drives). + +- The feature to automatically make the fs clean + might leave a trashed file system with the + bitmap flag set valid. + +- When a file is truncated to a size that is not + a multiple of the blocksize, the rest of the + last allocated block is not cleared. Well, + this fs never claimed to be Posix conformant. 
+ +Please direct bug reports to: zippel@linux-m68k.org + +Version 3.20 +------------ +- kill kernel lock +- fix for a possible bitmap corruption + +Version 3.19 +------------ + +- sizeof changes from Kernel Janitor Project +- several bug fixes found with fsx + +Version 3.18 +------------ + +- change to global min macro + warning fixes +- add module tags + +Version 3.17 +------------ + +- locking fixes +- wrong sign in __affs_hash_dentry +- remove unnecessary check in affs_new_inode +- enable international mode for dircache fs + +Version 3.16 +------------ + +- use mark_buffer_dirty_inode instead of mark_buffer_dirty. +- introduce affs_lock_{link|dir|ext}. + +Version 3.15 +------------ + +- disable link to directories until we can properly support them. +- locking fixes for link creation/removal. + +Version 3.14 +------------ + +- correctly cut off long file names for compares +- correctly initialize s_last_bmap + +Version 3.13 +------------ + +Major cleanup for 2.4 [Roman Zippel] +- new extended block handling +- new bitmap allocation functions +- locking should be safe for the future +- cleanup of some interfaces + +Version 3.12 +------------ + +more 2.4 fixes: [Roman Zippel] +- s_lock changes +- increased getblock mess +- clear meta blocks + +Version 3.11 +------------ + +- Converted to use 2.3.x page cache [Dave Jones ] +- Corruption in truncate() bugfix [Ken Tyler ] + +Version 3.10 +------------ + +- Changed partition checker to allow devices + with physical blocks != 512 bytes. + +- The partition checker now also ignores the + word at 0xd0 that Windows likes to write to. + +Version 3.9 +----------- + +- Moved cleanup from release_file() to put_inode(). + This makes the first one obsolete. + +- truncate() zeroes the unused remainder of a + partially used last block when a file is truncated. + It also marks the inode dirty now (which is not + really necessary as notify_change() will do + it anyway). + +- Added a few comments, fixed some typos (and + introduced some new ones), made the debug messages + more consistent. Changed a bad example in the + doc file (affs.txt). + +- Sets the NOEXEC flag in read_super() for old file + systems, since you can't run programs on them. + +Version 3.8 +----------- +Bill Hawes kindly reviewed the affs and sent me the +patches he did. They're marked (BH). Thanks, Bill! + +- Cleanup of error handling in read_super(). + Didn't release all resources in case of an + error. (BH) + +- put_inode() releases the ext cache only if it's + no longer needed. (BH) + +- One set of dentry callbacks is enough. (BH) + +- Cleanup of error handling in namei.c. (BH) + +- Cleanup of error handling in file.c. (BH) + +- The original blocksize of the device is + restored when the fs is unmounted. (BH) + +- getblock() did not invalidate the key cache + when it allocated a new block. + +- Removed some unnecessary locks as Bill + suggested. + +- Simplified match_name(), changed all hashing + and case insensitive name comparisons to use + uppercase. This makes the tolower() routines + obsolete. + +- Added mount option 'mufs' to force muFS + uid/gid interpretation. + +- File mode changes were not updated on disk. + This was fixed before, but somehow got lost. + +Version 3.7 +----------- + +- Added dentry callbacks to allow the dcache to + operate case insensitive and length ignorant + like the affs itself. + +- getblock() didn't update the lastblock field in the + inode if the fs was not an OFS. 
This bug only shows + up if a file was enlarged via truncate() and there + was not enough space. + +- Remove some more superfluous code left over from + the old link days ... + +- Fixed some oversights which were in patch 2.1.78. + +- Fixed a few typos. + +Version 3.6 +----------- + +- dentry changes. (Thanks to Jes Sorensen for his help.) + +- Fixed bug in balloc(): Superblock was not set dirty after + the bitmap was changed, so the bitmap wasn't sync'd. + +- Fixed nasty bug in find_new_zone(): If the current + zone number was zero, the loop didn't terminate, + causing a solid lock-up. + +- Removed support for old-style directory reads. + +- Fixed bug in add_entry(): When doing a sorted insert, + the pointer to the next entry in the hash chain wasn't + correctly byte-swapped. Since most of the users of the + affs use it on a 68k, they didn't notice. But why did + I not find this during my tests? + +- Fixed some oversights (version wasn't updated on some + directory changes). + +- Handling of hard links rewritten. To the VFS + they appear now as normal Unix links. They are + now resolved only once in lookup(). The backside + is that unlink(), rename() and rmdir() have to + be smart about them, but the result is worth the + effort. This also led to some code cleanup. + +- Changed name type to unsigned char; the test for + invalid filenames didn't work correctly. + (Thanks to Michael Krause for pointing at this.) + +- Changed mapping of executable flag. + +- Changed all network byte-order macros to the + recommended ones. + +- Added a remount function, so attempts to remount + a dircache filesystem or one with errors read/write + can be trapped. Previously, ro remounts didn't + flush the super block, and rw remounts didn't + create allocation zones ... + +- Call shrink_dcache_parent() in rmdir(). + (Thanks to Bill Hawes.) + +- Permission checks in unlink(). + +- Allow mounting of volumes with superfluous + bitmap pointers read only, also allows them + to be remounted read/write. + +- Owner/Group defaults now to the fs user (i.e. + the one that mounted it) instead of root. This + obsoletes the mount options uid and gid. + +- Argument to volume option could overflow the + name buffer. It is now silently truncated to + 30 characters. (Damn it! This kind of bug + is too embarrassing.) + +- Split inode.c into 2 files, the superblock + routines desperately wanted their own file. + +- truncate() didn't allocate an extension block + cache. If a file was extended by means of + truncate(), this led to an Oops. + +- fsuser is now checked last. + +- rename() will not ignore changes in filename + casing any more (though mv(1) still won't allow + you to do "mv oldname OldName"). + +Version 3.5 +----------- + +- Extension block caches are now allocated on + demand instead of when a file is opened, as + files can be read and written without opening + them (e. g. the loopback device does this). + +- Removed an unused function. + +Version 3.4 +----------- + +- Hash chains are now sorted by block numbers. + (Thanks to Kars de Jong for finding this.) +- Removed all unnecessary external symbols. + +Version 3.3 +----------- + +- Tried to make all types 'correct' and consistent. +- Errors and warnings are now reported via a + function. They are all prefixed by a severity + and have the same appearance: + "AFFS: : " + (There's one exception to this, as in that function + is no pointer to the super block available.) +- The filesystem is remounted read-only after an + error. 
+- The names of newly created filesystem objects are + now checked for validity. +- Minor cleanups in comments. +- Added this Changes file. At last! + +Version 3.2 +----------- + +- Extension block cache: Reading/writing of huge files + (several MB) is much faster (of course the added + overhead slows down opening, but this is hardly + noticeable). +- The same get_block()-routine can now be used for + both OFS and FFS. +- The super block is now searched in the block that + was calculated and in the one following. This + should remedy the round-off error introduced by + the 1-k blocks that Linux uses. +- Minor changes to adhere to the new VFS interface. +- The number of used blocks is now also calculated + if the filesystem is mounted read-only. +- Prefixed some constants with AFFS_ to avoid name + clashes. +- Removed 'EXPERIMENTAL' status. + +Version 3.1 +----------- + +- Fixed a nasty bug which didn't allow read-only + mounts. +- Allow dir-cache filesystems to be mounted + read only. +- OFS support. +- Several other changes I just cannot remember + any more. + +Version 3.0 +----------- + +- Almost complete rewrite for the new VFS + interface in Linux 1.3. +- Write support. +- Support for hard and symbolic links. +- Lots of things I remember even less ... + +Version 2.0 +----------- + +- Fixed a few things to get it compiled. +- Automatic root block calculation. +- Partition checker for genhd.c + +======================================== + +Let's just call Ray Burr's original affs +'Version 1.0'. diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig new file mode 100644 index 00000000..cfad9afb --- /dev/null +++ b/fs/affs/Kconfig @@ -0,0 +1,21 @@ +config AFFS_FS + tristate "Amiga FFS file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + The Fast File System (FFS) is the common file system used on hard + disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y + if you want to be able to read and write files from and to an Amiga + FFS partition on your hard drive. Amiga floppies however cannot be + read with this driver due to an incompatibility of the floppy + controller used in an Amiga and the standard floppy controller in + PCs and workstations. Read + and . + + With this driver you can also mount disk files used by Bernd + Schmidt's Un*X Amiga Emulator + (). + If you want to do this, you will also need to say Y or M to "Loop + device support", above. + + To compile this file system support as a module, choose M here: the + module will be called affs. If unsure, say N. diff --git a/fs/affs/Makefile b/fs/affs/Makefile new file mode 100644 index 00000000..3988b4a7 --- /dev/null +++ b/fs/affs/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux affs filesystem routines. +# + +#ccflags-y := -DDEBUG=1 + +obj-$(CONFIG_AFFS_FS) += affs.o + +affs-objs := super.o namei.o inode.o file.o dir.o amigaffs.o bitmap.o symlink.o diff --git a/fs/affs/affs.h b/fs/affs/affs.h new file mode 100644 index 00000000..0e95f73a --- /dev/null +++ b/fs/affs/affs.h @@ -0,0 +1,305 @@ +#include +#include +#include +#include +#include + +/* AmigaOS allows file names with up to 30 characters length. + * Names longer than that will be silently truncated. If you + * want to disallow this, comment out the following #define. + * Creating filesystem objects with longer names will then + * result in an error (ENAMETOOLONG). + */ +/*#define AFFS_NO_TRUNCATE */ + +/* Ugly macros make the code more pretty. 
*/ + +#define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) +#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey]) +#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) + +#ifdef __LITTLE_ENDIAN +#define BO_EXBITS 0x18UL +#elif defined(__BIG_ENDIAN) +#define BO_EXBITS 0x00UL +#else +#error Endianness must be known for affs to work. +#endif + +#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) +#define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail))) +#define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data) +#define AFFS_ROOT_TAIL(sb, bh) ((struct affs_root_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_root_tail))) +#define AFFS_DATA_HEAD(bh) ((struct affs_data_head *)(bh)->b_data) +#define AFFS_DATA(bh) (((struct affs_data_head *)(bh)->b_data)->data) + +#define AFFS_CACHE_SIZE PAGE_SIZE + +#define AFFS_MAX_PREALLOC 32 +#define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) +#define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) +#define AFFS_AC_MASK (AFFS_AC_SIZE-1) + +struct affs_ext_key { + u32 ext; /* idx of the extended block */ + u32 key; /* block number */ +}; + +/* + * affs fs inode data in memory + */ +struct affs_inode_info { + atomic_t i_opencnt; + struct semaphore i_link_lock; /* Protects internal inode access. */ + struct semaphore i_ext_lock; /* Protects internal inode access. */ +#define i_hash_lock i_ext_lock + u32 i_blkcnt; /* block count */ + u32 i_extcnt; /* extended block count */ + u32 *i_lc; /* linear cache of extended blocks */ + u32 i_lc_size; + u32 i_lc_shift; + u32 i_lc_mask; + struct affs_ext_key *i_ac; /* associative cache of extended blocks */ + u32 i_ext_last; /* last accessed extended block */ + struct buffer_head *i_ext_bh; /* bh of last extended block */ + loff_t mmu_private; + u32 i_protect; /* unused attribute bits */ + u32 i_lastalloc; /* last allocated block */ + int i_pa_cnt; /* number of preallocated blocks */ + struct inode vfs_inode; +}; + +/* short cut to get to the affs specific inode data */ +static inline struct affs_inode_info *AFFS_I(struct inode *inode) +{ + return list_entry(inode, struct affs_inode_info, vfs_inode); +} + +/* + * super-block data in memory + * + * Block numbers are adjusted for their actual size + * + */ + +struct affs_bm_info { + u32 bm_key; /* Disk block number */ + u32 bm_free; /* Free blocks in here */ +}; + +struct affs_sb_info { + int s_partition_size; /* Partition size in blocks. */ + int s_reserved; /* Number of reserved blocks. */ + //u32 s_blksize; /* Initial device blksize */ + u32 s_data_blksize; /* size of the data block w/o header */ + u32 s_root_block; /* FFS root block number. */ + int s_hashsize; /* Size of hash table. */ + unsigned long s_flags; /* See below. */ + uid_t s_uid; /* uid to override */ + gid_t s_gid; /* gid to override */ + umode_t s_mode; /* mode to override */ + struct buffer_head *s_root_bh; /* Cached root block. */ + struct mutex s_bmlock; /* Protects bitmap access. */ + struct affs_bm_info *s_bitmap; /* Bitmap infos. */ + u32 s_bmap_count; /* # of bitmap blocks. */ + u32 s_bmap_bits; /* # of bits in one bitmap blocks */ + u32 s_last_bmap; + struct buffer_head *s_bmap_bh; + char *s_prefix; /* Prefix for volumes and assigns. */ + char s_volume[32]; /* Volume prefix for absolute symlinks. */ + spinlock_t symlink_lock; /* protects the previous two */ +}; + +#define SF_INTL 0x0001 /* International filesystem. 
*/ +#define SF_BM_VALID 0x0002 /* Bitmap is valid. */ +#define SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */ +#define SF_QUIET 0x0008 /* chmod errors will be not reported */ +#define SF_SETUID 0x0010 /* Ignore Amiga uid */ +#define SF_SETGID 0x0020 /* Ignore Amiga gid */ +#define SF_SETMODE 0x0040 /* Ignore Amiga protection bits */ +#define SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */ +#define SF_OFS 0x0200 /* Old filesystem */ +#define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ +#define SF_VERBOSE 0x0800 /* Talk about fs when mounting */ + +/* short cut to get to the affs specific sb data */ +static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* amigaffs.c */ + +extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh); +extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh); +extern int affs_remove_header(struct dentry *dentry); +extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); +extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); +extern void secs_to_datestamp(time_t secs, struct affs_date *ds); +extern mode_t prot_to_mode(u32 prot); +extern void mode_to_prot(struct inode *inode); +extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); +extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); +extern int affs_check_name(const unsigned char *name, int len); +extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); + +/* bitmap. c */ + +extern u32 affs_count_free_blocks(struct super_block *s); +extern void affs_free_block(struct super_block *sb, u32 block); +extern u32 affs_alloc_block(struct inode *inode, u32 goal); +extern int affs_init_bitmap(struct super_block *sb, int *flags); +extern void affs_free_bitmap(struct super_block *sb); + +/* namei.c */ + +extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); +extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); +extern int affs_unlink(struct inode *dir, struct dentry *dentry); +extern int affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *); +extern int affs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +extern int affs_rmdir(struct inode *dir, struct dentry *dentry); +extern int affs_link(struct dentry *olddentry, struct inode *dir, + struct dentry *dentry); +extern int affs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname); +extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); + +/* inode.c */ + +extern unsigned long affs_parent_ino(struct inode *dir); +extern struct inode *affs_new_inode(struct inode *dir); +extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); +extern void affs_evict_inode(struct inode *inode); +extern struct inode *affs_iget(struct super_block *sb, + unsigned long ino); +extern int affs_write_inode(struct inode *inode, + struct writeback_control *wbc); +extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); + +/* file.c */ + +void affs_free_prealloc(struct inode *inode); +extern void affs_truncate(struct inode *); +int affs_file_fsync(struct file *, int); + +/* dir.c */ + +extern void affs_dir_truncate(struct inode *); + +/* jump tables */ + +extern const struct inode_operations 
affs_file_inode_operations; +extern const struct inode_operations affs_dir_inode_operations; +extern const struct inode_operations affs_symlink_inode_operations; +extern const struct file_operations affs_file_operations; +extern const struct file_operations affs_file_operations_ofs; +extern const struct file_operations affs_dir_operations; +extern const struct address_space_operations affs_symlink_aops; +extern const struct address_space_operations affs_aops; +extern const struct address_space_operations affs_aops_ofs; + +extern const struct dentry_operations affs_dentry_operations; +extern const struct dentry_operations affs_intl_dentry_operations; + +static inline void +affs_set_blocksize(struct super_block *sb, int size) +{ + sb_set_blocksize(sb, size); +} +static inline struct buffer_head * +affs_bread(struct super_block *sb, int block) +{ + pr_debug("affs_bread: %d\n", block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) + return sb_bread(sb, block); + return NULL; +} +static inline struct buffer_head * +affs_getblk(struct super_block *sb, int block) +{ + pr_debug("affs_getblk: %d\n", block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) + return sb_getblk(sb, block); + return NULL; +} +static inline struct buffer_head * +affs_getzeroblk(struct super_block *sb, int block) +{ + struct buffer_head *bh; + pr_debug("affs_getzeroblk: %d\n", block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { + bh = sb_getblk(sb, block); + lock_buffer(bh); + memset(bh->b_data, 0 , sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + return bh; + } + return NULL; +} +static inline struct buffer_head * +affs_getemptyblk(struct super_block *sb, int block) +{ + struct buffer_head *bh; + pr_debug("affs_getemptyblk: %d\n", block); + if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { + bh = sb_getblk(sb, block); + wait_on_buffer(bh); + set_buffer_uptodate(bh); + return bh; + } + return NULL; +} +static inline void +affs_brelse(struct buffer_head *bh) +{ + if (bh) + pr_debug("affs_brelse: %lld\n", (long long) bh->b_blocknr); + brelse(bh); +} + +static inline void +affs_adjust_checksum(struct buffer_head *bh, u32 val) +{ + u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[5]); + ((__be32 *)bh->b_data)[5] = cpu_to_be32(tmp - val); +} +static inline void +affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val) +{ + u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[0]); + ((__be32 *)bh->b_data)[0] = cpu_to_be32(tmp - val); +} + +static inline void +affs_lock_link(struct inode *inode) +{ + down(&AFFS_I(inode)->i_link_lock); +} +static inline void +affs_unlock_link(struct inode *inode) +{ + up(&AFFS_I(inode)->i_link_lock); +} +static inline void +affs_lock_dir(struct inode *inode) +{ + down(&AFFS_I(inode)->i_hash_lock); +} +static inline void +affs_unlock_dir(struct inode *inode) +{ + up(&AFFS_I(inode)->i_hash_lock); +} +static inline void +affs_lock_ext(struct inode *inode) +{ + down(&AFFS_I(inode)->i_ext_lock); +} +static inline void +affs_unlock_ext(struct inode *inode) +{ + up(&AFFS_I(inode)->i_ext_lock); +} diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c new file mode 100644 index 00000000..3a4557e8 --- /dev/null +++ b/fs/affs/amigaffs.c @@ -0,0 +1,515 @@ +/* + * linux/fs/affs/amigaffs.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Amiga FFS filesystem. 
+ * + * Please send bug reports to: hjw@zvw.de + */ + +#include "affs.h" + +extern struct timezone sys_tz; + +static char ErrorBuffer[256]; + +/* + * Functions for accessing Amiga-FFS structures. + */ + + +/* Insert a header block bh into the directory dir + * caller must hold AFFS_DIR->i_hash_lock! + */ + +int +affs_insert_hash(struct inode *dir, struct buffer_head *bh) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *dir_bh; + u32 ino, hash_ino; + int offset; + + ino = bh->b_blocknr; + offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); + + pr_debug("AFFS: insert_hash(dir=%u, ino=%d)\n", (u32)dir->i_ino, ino); + + dir_bh = affs_bread(sb, dir->i_ino); + if (!dir_bh) + return -EIO; + + hash_ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[offset]); + while (hash_ino) { + affs_brelse(dir_bh); + dir_bh = affs_bread(sb, hash_ino); + if (!dir_bh) + return -EIO; + hash_ino = be32_to_cpu(AFFS_TAIL(sb, dir_bh)->hash_chain); + } + AFFS_TAIL(sb, bh)->parent = cpu_to_be32(dir->i_ino); + AFFS_TAIL(sb, bh)->hash_chain = 0; + affs_fix_checksum(sb, bh); + + if (dir->i_ino == dir_bh->b_blocknr) + AFFS_HEAD(dir_bh)->table[offset] = cpu_to_be32(ino); + else + AFFS_TAIL(sb, dir_bh)->hash_chain = cpu_to_be32(ino); + + affs_adjust_checksum(dir_bh, ino); + mark_buffer_dirty_inode(dir_bh, dir); + affs_brelse(dir_bh); + + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_version++; + mark_inode_dirty(dir); + + return 0; +} + +/* Remove a header block from its directory. + * caller must hold AFFS_DIR->i_hash_lock! + */ + +int +affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) +{ + struct super_block *sb; + struct buffer_head *bh; + u32 rem_ino, hash_ino; + __be32 ino; + int offset, retval; + + sb = dir->i_sb; + rem_ino = rem_bh->b_blocknr; + offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); + pr_debug("AFFS: remove_hash(dir=%d, ino=%d, hashval=%d)\n", (u32)dir->i_ino, rem_ino, offset); + + bh = affs_bread(sb, dir->i_ino); + if (!bh) + return -EIO; + + retval = -ENOENT; + hash_ino = be32_to_cpu(AFFS_HEAD(bh)->table[offset]); + while (hash_ino) { + if (hash_ino == rem_ino) { + ino = AFFS_TAIL(sb, rem_bh)->hash_chain; + if (dir->i_ino == bh->b_blocknr) + AFFS_HEAD(bh)->table[offset] = ino; + else + AFFS_TAIL(sb, bh)->hash_chain = ino; + affs_adjust_checksum(bh, be32_to_cpu(ino) - hash_ino); + mark_buffer_dirty_inode(bh, dir); + AFFS_TAIL(sb, rem_bh)->parent = 0; + retval = 0; + break; + } + affs_brelse(bh); + bh = affs_bread(sb, hash_ino); + if (!bh) + return -EIO; + hash_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); + } + + affs_brelse(bh); + + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_version++; + mark_inode_dirty(dir); + + return retval; +} + +static void +affs_fix_dcache(struct dentry *dentry, u32 entry_ino) +{ + struct inode *inode = dentry->d_inode; + void *data = dentry->d_fsdata; + struct list_head *head, *next; + + spin_lock(&inode->i_lock); + head = &inode->i_dentry; + next = head->next; + while (next != head) { + dentry = list_entry(next, struct dentry, d_alias); + if (entry_ino == (u32)(long)dentry->d_fsdata) { + dentry->d_fsdata = data; + break; + } + next = next->next; + } + spin_unlock(&inode->i_lock); +} + + +/* Remove header from link chain */ + +static int +affs_remove_link(struct dentry *dentry) +{ + struct inode *dir, *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL, *link_bh = NULL; + u32 link_ino, ino; + int retval; + + 
pr_debug("AFFS: remove_link(key=%ld)\n", inode->i_ino); + retval = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto done; + + link_ino = (u32)(long)dentry->d_fsdata; + if (inode->i_ino == link_ino) { + /* we can't remove the head of the link, as its blocknr is still used as ino, + * so we remove the block of the first link instead. + */ + link_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain); + link_bh = affs_bread(sb, link_ino); + if (!link_bh) + goto done; + + dir = affs_iget(sb, be32_to_cpu(AFFS_TAIL(sb, link_bh)->parent)); + if (IS_ERR(dir)) { + retval = PTR_ERR(dir); + goto done; + } + + affs_lock_dir(dir); + affs_fix_dcache(dentry, link_ino); + retval = affs_remove_hash(dir, link_bh); + if (retval) { + affs_unlock_dir(dir); + goto done; + } + mark_buffer_dirty_inode(link_bh, inode); + + memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32); + retval = affs_insert_hash(dir, bh); + if (retval) { + affs_unlock_dir(dir); + goto done; + } + mark_buffer_dirty_inode(bh, inode); + + affs_unlock_dir(dir); + iput(dir); + } else { + link_bh = affs_bread(sb, link_ino); + if (!link_bh) + goto done; + } + + while ((ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain)) != 0) { + if (ino == link_ino) { + __be32 ino2 = AFFS_TAIL(sb, link_bh)->link_chain; + AFFS_TAIL(sb, bh)->link_chain = ino2; + affs_adjust_checksum(bh, be32_to_cpu(ino2) - link_ino); + mark_buffer_dirty_inode(bh, inode); + retval = 0; + /* Fix the link count, if bh is a normal header block without links */ + switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { + case ST_LINKDIR: + case ST_LINKFILE: + break; + default: + if (!AFFS_TAIL(sb, bh)->link_chain) + inode->i_nlink = 1; + } + affs_free_block(sb, link_ino); + goto done; + } + affs_brelse(bh); + bh = affs_bread(sb, ino); + if (!bh) + goto done; + } + retval = -ENOENT; +done: + affs_brelse(link_bh); + affs_brelse(bh); + return retval; +} + + +static int +affs_empty_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + int retval, size; + + retval = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto done; + + retval = -ENOTEMPTY; + for (size = AFFS_SB(sb)->s_hashsize - 1; size >= 0; size--) + if (AFFS_HEAD(bh)->table[size]) + goto not_empty; + retval = 0; +not_empty: + affs_brelse(bh); +done: + return retval; +} + + +/* Remove a filesystem object. If the object to be removed has + * links to it, one of the links must be changed to inherit + * the file or directory. As above, any inode will do. + * The buffer will not be freed. If the header is a link, the + * block will be marked as free. + * This function returns a negative error number in case of + * an error, else 0 if the inode is to be deleted or 1 if not. 
+ */ + +int +affs_remove_header(struct dentry *dentry) +{ + struct super_block *sb; + struct inode *inode, *dir; + struct buffer_head *bh = NULL; + int retval; + + dir = dentry->d_parent->d_inode; + sb = dir->i_sb; + + retval = -ENOENT; + inode = dentry->d_inode; + if (!inode) + goto done; + + pr_debug("AFFS: remove_header(key=%ld)\n", inode->i_ino); + retval = -EIO; + bh = affs_bread(sb, (u32)(long)dentry->d_fsdata); + if (!bh) + goto done; + + affs_lock_link(inode); + affs_lock_dir(dir); + switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { + case ST_USERDIR: + /* if we ever want to support links to dirs + * i_hash_lock of the inode must only be + * taken after some checks + */ + affs_lock_dir(inode); + retval = affs_empty_dir(inode); + affs_unlock_dir(inode); + if (retval) + goto done_unlock; + break; + default: + break; + } + + retval = affs_remove_hash(dir, bh); + if (retval) + goto done_unlock; + mark_buffer_dirty_inode(bh, inode); + + affs_unlock_dir(dir); + + if (inode->i_nlink > 1) + retval = affs_remove_link(dentry); + else + inode->i_nlink = 0; + affs_unlock_link(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + +done: + affs_brelse(bh); + return retval; + +done_unlock: + affs_unlock_dir(dir); + affs_unlock_link(inode); + goto done; +} + +/* Checksum a block, do various consistency checks and optionally return + the blocks type number. DATA points to the block. If their pointers + are non-null, *PTYPE and *STYPE are set to the primary and secondary + block types respectively, *HASHSIZE is set to the size of the hashtable + (which lets us calculate the block size). + Returns non-zero if the block is not consistent. */ + +u32 +affs_checksum_block(struct super_block *sb, struct buffer_head *bh) +{ + __be32 *ptr = (__be32 *)bh->b_data; + u32 sum; + int bsize; + + sum = 0; + for (bsize = sb->s_blocksize / sizeof(__be32); bsize > 0; bsize--) + sum += be32_to_cpu(*ptr++); + return sum; +} + +/* + * Calculate the checksum of a disk block and store it + * at the indicated position. 
+ */ + +void +affs_fix_checksum(struct super_block *sb, struct buffer_head *bh) +{ + int cnt = sb->s_blocksize / sizeof(__be32); + __be32 *ptr = (__be32 *)bh->b_data; + u32 checksum; + __be32 *checksumptr; + + checksumptr = ptr + 5; + *checksumptr = 0; + for (checksum = 0; cnt > 0; ptr++, cnt--) + checksum += be32_to_cpu(*ptr); + *checksumptr = cpu_to_be32(-checksum); +} + +void +secs_to_datestamp(time_t secs, struct affs_date *ds) +{ + u32 days; + u32 minute; + + secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60); + if (secs < 0) + secs = 0; + days = secs / 86400; + secs -= days * 86400; + minute = secs / 60; + secs -= minute * 60; + + ds->days = cpu_to_be32(days); + ds->mins = cpu_to_be32(minute); + ds->ticks = cpu_to_be32(secs * 50); +} + +mode_t +prot_to_mode(u32 prot) +{ + int mode = 0; + + if (!(prot & FIBF_NOWRITE)) + mode |= S_IWUSR; + if (!(prot & FIBF_NOREAD)) + mode |= S_IRUSR; + if (!(prot & FIBF_NOEXECUTE)) + mode |= S_IXUSR; + if (prot & FIBF_GRP_WRITE) + mode |= S_IWGRP; + if (prot & FIBF_GRP_READ) + mode |= S_IRGRP; + if (prot & FIBF_GRP_EXECUTE) + mode |= S_IXGRP; + if (prot & FIBF_OTR_WRITE) + mode |= S_IWOTH; + if (prot & FIBF_OTR_READ) + mode |= S_IROTH; + if (prot & FIBF_OTR_EXECUTE) + mode |= S_IXOTH; + + return mode; +} + +void +mode_to_prot(struct inode *inode) +{ + u32 prot = AFFS_I(inode)->i_protect; + mode_t mode = inode->i_mode; + + if (!(mode & S_IXUSR)) + prot |= FIBF_NOEXECUTE; + if (!(mode & S_IRUSR)) + prot |= FIBF_NOREAD; + if (!(mode & S_IWUSR)) + prot |= FIBF_NOWRITE; + if (mode & S_IXGRP) + prot |= FIBF_GRP_EXECUTE; + if (mode & S_IRGRP) + prot |= FIBF_GRP_READ; + if (mode & S_IWGRP) + prot |= FIBF_GRP_WRITE; + if (mode & S_IXOTH) + prot |= FIBF_OTR_EXECUTE; + if (mode & S_IROTH) + prot |= FIBF_OTR_READ; + if (mode & S_IWOTH) + prot |= FIBF_OTR_WRITE; + + AFFS_I(inode)->i_protect = prot; +} + +void +affs_error(struct super_block *sb, const char *function, const char *fmt, ...) +{ + va_list args; + + va_start(args,fmt); + vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); + va_end(args); + + printk(KERN_CRIT "AFFS error (device %s): %s(): %s\n", sb->s_id, + function,ErrorBuffer); + if (!(sb->s_flags & MS_RDONLY)) + printk(KERN_WARNING "AFFS: Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; +} + +void +affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) +{ + va_list args; + + va_start(args,fmt); + vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); + va_end(args); + + printk(KERN_WARNING "AFFS warning (device %s): %s(): %s\n", sb->s_id, + function,ErrorBuffer); +} + +/* Check if the name is valid for a affs object. */ + +int +affs_check_name(const unsigned char *name, int len) +{ + int i; + + if (len > 30) +#ifdef AFFS_NO_TRUNCATE + return -ENAMETOOLONG; +#else + len = 30; +#endif + + for (i = 0; i < len; i++) { + if (name[i] < ' ' || name[i] == ':' + || (name[i] > 0x7e && name[i] < 0xa0)) + return -EINVAL; + } + + return 0; +} + +/* This function copies name to bstr, with at most 30 + * characters length. The bstr will be prepended by + * a length byte. + * NOTE: The name will must be already checked by + * affs_check_name()! 
+ */ + +int +affs_copy_name(unsigned char *bstr, struct dentry *dentry) +{ + int len = min(dentry->d_name.len, 30u); + + *bstr++ = len; + memcpy(bstr, dentry->d_name.name, len); + return len; +} diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c new file mode 100644 index 00000000..3e262711 --- /dev/null +++ b/fs/affs/bitmap.c @@ -0,0 +1,390 @@ +/* + * linux/fs/affs/bitmap.c + * + * (c) 1996 Hans-Joachim Widmaier + * + * bitmap.c contains the code that handles all bitmap related stuff - + * block allocation, deallocation, calculation of free space. + */ + +#include +#include "affs.h" + +/* This is, of course, shamelessly stolen from fs/minix */ + +static const int nibblemap[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 }; + +static u32 +affs_count_free_bits(u32 blocksize, const void *data) +{ + const u32 *map; + u32 free; + u32 tmp; + + map = data; + free = 0; + for (blocksize /= 4; blocksize > 0; blocksize--) { + tmp = *map++; + while (tmp) { + free += nibblemap[tmp & 0xf]; + tmp >>= 4; + } + } + + return free; +} + +u32 +affs_count_free_blocks(struct super_block *sb) +{ + struct affs_bm_info *bm; + u32 free; + int i; + + pr_debug("AFFS: count_free_blocks()\n"); + + if (sb->s_flags & MS_RDONLY) + return 0; + + mutex_lock(&AFFS_SB(sb)->s_bmlock); + + bm = AFFS_SB(sb)->s_bitmap; + free = 0; + for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--) + free += bm->bm_free; + + mutex_unlock(&AFFS_SB(sb)->s_bmlock); + + return free; +} + +void +affs_free_block(struct super_block *sb, u32 block) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct affs_bm_info *bm; + struct buffer_head *bh; + u32 blk, bmap, bit, mask, tmp; + __be32 *data; + + pr_debug("AFFS: free_block(%u)\n", block); + + if (block > sbi->s_partition_size) + goto err_range; + + blk = block - sbi->s_reserved; + bmap = blk / sbi->s_bmap_bits; + bit = blk % sbi->s_bmap_bits; + bm = &sbi->s_bitmap[bmap]; + + mutex_lock(&sbi->s_bmlock); + + bh = sbi->s_bmap_bh; + if (sbi->s_last_bmap != bmap) { + affs_brelse(bh); + bh = affs_bread(sb, bm->bm_key); + if (!bh) + goto err_bh_read; + sbi->s_bmap_bh = bh; + sbi->s_last_bmap = bmap; + } + + mask = 1 << (bit & 31); + data = (__be32 *)bh->b_data + bit / 32 + 1; + + /* mark block free */ + tmp = be32_to_cpu(*data); + if (tmp & mask) + goto err_free; + *data = cpu_to_be32(tmp | mask); + + /* fix checksum */ + tmp = be32_to_cpu(*(__be32 *)bh->b_data); + *(__be32 *)bh->b_data = cpu_to_be32(tmp - mask); + + mark_buffer_dirty(bh); + sb->s_dirt = 1; + bm->bm_free++; + + mutex_unlock(&sbi->s_bmlock); + return; + +err_free: + affs_warning(sb,"affs_free_block","Trying to free block %u which is already free", block); + mutex_unlock(&sbi->s_bmlock); + return; + +err_bh_read: + affs_error(sb,"affs_free_block","Cannot read bitmap block %u", bm->bm_key); + sbi->s_bmap_bh = NULL; + sbi->s_last_bmap = ~0; + mutex_unlock(&sbi->s_bmlock); + return; + +err_range: + affs_error(sb, "affs_free_block","Block %u outside partition", block); + return; +} + +/* + * Allocate a block in the given allocation zone. + * Since we have to byte-swap the bitmap on little-endian + * machines, this is rather expensive. Therefore we will + * preallocate up to 16 blocks from the same word, if + * possible. We are not doing preallocations in the + * header zone, though. 
+ */ + +u32 +affs_alloc_block(struct inode *inode, u32 goal) +{ + struct super_block *sb; + struct affs_sb_info *sbi; + struct affs_bm_info *bm; + struct buffer_head *bh; + __be32 *data, *enddata; + u32 blk, bmap, bit, mask, mask2, tmp; + int i; + + sb = inode->i_sb; + sbi = AFFS_SB(sb); + + pr_debug("AFFS: balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); + + if (AFFS_I(inode)->i_pa_cnt) { + pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1); + AFFS_I(inode)->i_pa_cnt--; + return ++AFFS_I(inode)->i_lastalloc; + } + + if (!goal || goal > sbi->s_partition_size) { + if (goal) + affs_warning(sb, "affs_balloc", "invalid goal %d", goal); + //if (!AFFS_I(inode)->i_last_block) + // affs_warning(sb, "affs_balloc", "no last alloc block"); + goal = sbi->s_reserved; + } + + blk = goal - sbi->s_reserved; + bmap = blk / sbi->s_bmap_bits; + bm = &sbi->s_bitmap[bmap]; + + mutex_lock(&sbi->s_bmlock); + + if (bm->bm_free) + goto find_bmap_bit; + +find_bmap: + /* search for the next bmap buffer with free bits */ + i = sbi->s_bmap_count; + do { + if (--i < 0) + goto err_full; + bmap++; + bm++; + if (bmap < sbi->s_bmap_count) + continue; + /* restart search at zero */ + bmap = 0; + bm = sbi->s_bitmap; + } while (!bm->bm_free); + blk = bmap * sbi->s_bmap_bits; + +find_bmap_bit: + + bh = sbi->s_bmap_bh; + if (sbi->s_last_bmap != bmap) { + affs_brelse(bh); + bh = affs_bread(sb, bm->bm_key); + if (!bh) + goto err_bh_read; + sbi->s_bmap_bh = bh; + sbi->s_last_bmap = bmap; + } + + /* find an unused block in this bitmap block */ + bit = blk % sbi->s_bmap_bits; + data = (__be32 *)bh->b_data + bit / 32 + 1; + enddata = (__be32 *)((u8 *)bh->b_data + sb->s_blocksize); + mask = ~0UL << (bit & 31); + blk &= ~31UL; + + tmp = be32_to_cpu(*data); + if (tmp & mask) + goto find_bit; + + /* scan the rest of the buffer */ + do { + blk += 32; + if (++data >= enddata) + /* didn't find something, can only happen + * if scan didn't start at 0, try next bmap + */ + goto find_bmap; + } while (!*data); + tmp = be32_to_cpu(*data); + mask = ~0; + +find_bit: + /* finally look for a free bit in the word */ + bit = ffs(tmp & mask) - 1; + blk += bit + sbi->s_reserved; + mask2 = mask = 1 << (bit & 31); + AFFS_I(inode)->i_lastalloc = blk; + + /* prealloc as much as possible within this word */ + while ((mask2 <<= 1)) { + if (!(tmp & mask2)) + break; + AFFS_I(inode)->i_pa_cnt++; + mask |= mask2; + } + bm->bm_free -= AFFS_I(inode)->i_pa_cnt + 1; + + *data = cpu_to_be32(tmp & ~mask); + + /* fix checksum */ + tmp = be32_to_cpu(*(__be32 *)bh->b_data); + *(__be32 *)bh->b_data = cpu_to_be32(tmp + mask); + + mark_buffer_dirty(bh); + sb->s_dirt = 1; + + mutex_unlock(&sbi->s_bmlock); + + pr_debug("%d\n", blk); + return blk; + +err_bh_read: + affs_error(sb,"affs_read_block","Cannot read bitmap block %u", bm->bm_key); + sbi->s_bmap_bh = NULL; + sbi->s_last_bmap = ~0; +err_full: + mutex_unlock(&sbi->s_bmlock); + pr_debug("failed\n"); + return 0; +} + +int affs_init_bitmap(struct super_block *sb, int *flags) +{ + struct affs_bm_info *bm; + struct buffer_head *bmap_bh = NULL, *bh = NULL; + __be32 *bmap_blk; + u32 size, blk, end, offset, mask; + int i, res = 0; + struct affs_sb_info *sbi = AFFS_SB(sb); + + if (*flags & MS_RDONLY) + return 0; + + if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { + printk(KERN_NOTICE "AFFS: Bitmap invalid - mounting %s read only\n", + sb->s_id); + *flags |= MS_RDONLY; + return 0; + } + + sbi->s_last_bmap = ~0; + sbi->s_bmap_bh = NULL; + sbi->s_bmap_bits = sb->s_blocksize * 8 - 32; + sbi->s_bmap_count = (sbi->s_partition_size - 
sbi->s_reserved + + sbi->s_bmap_bits - 1) / sbi->s_bmap_bits; + size = sbi->s_bmap_count * sizeof(*bm); + bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL); + if (!sbi->s_bitmap) { + printk(KERN_ERR "AFFS: Bitmap allocation failed\n"); + return -ENOMEM; + } + + bmap_blk = (__be32 *)sbi->s_root_bh->b_data; + blk = sb->s_blocksize / 4 - 49; + end = blk + 25; + + for (i = sbi->s_bmap_count; i > 0; bm++, i--) { + affs_brelse(bh); + + bm->bm_key = be32_to_cpu(bmap_blk[blk]); + bh = affs_bread(sb, bm->bm_key); + if (!bh) { + printk(KERN_ERR "AFFS: Cannot read bitmap\n"); + res = -EIO; + goto out; + } + if (affs_checksum_block(sb, bh)) { + printk(KERN_WARNING "AFFS: Bitmap %u invalid - mounting %s read only.\n", + bm->bm_key, sb->s_id); + *flags |= MS_RDONLY; + goto out; + } + pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key); + bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4); + + /* Don't try read the extension if this is the last block, + * but we also need the right bm pointer below + */ + if (++blk < end || i == 1) + continue; + if (bmap_bh) + affs_brelse(bmap_bh); + bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk])); + if (!bmap_bh) { + printk(KERN_ERR "AFFS: Cannot read bitmap extension\n"); + res = -EIO; + goto out; + } + bmap_blk = (__be32 *)bmap_bh->b_data; + blk = 0; + end = sb->s_blocksize / 4 - 1; + } + + offset = (sbi->s_partition_size - sbi->s_reserved) % sbi->s_bmap_bits; + mask = ~(0xFFFFFFFFU << (offset & 31)); + pr_debug("last word: %d %d %d\n", offset, offset / 32 + 1, mask); + offset = offset / 32 + 1; + + if (mask) { + u32 old, new; + + /* Mark unused bits in the last word as allocated */ + old = be32_to_cpu(((__be32 *)bh->b_data)[offset]); + new = old & mask; + //if (old != new) { + ((__be32 *)bh->b_data)[offset] = cpu_to_be32(new); + /* fix checksum */ + //new -= old; + //old = be32_to_cpu(*(__be32 *)bh->b_data); + //*(__be32 *)bh->b_data = cpu_to_be32(old - new); + //mark_buffer_dirty(bh); + //} + /* correct offset for the bitmap count below */ + //offset++; + } + while (++offset < sb->s_blocksize / 4) + ((__be32 *)bh->b_data)[offset] = 0; + ((__be32 *)bh->b_data)[0] = 0; + ((__be32 *)bh->b_data)[0] = cpu_to_be32(-affs_checksum_block(sb, bh)); + mark_buffer_dirty(bh); + + /* recalculate bitmap count for last block */ + bm--; + bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4); + +out: + affs_brelse(bh); + affs_brelse(bmap_bh); + return res; +} + +void affs_free_bitmap(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + + if (!sbi->s_bitmap) + return; + + affs_brelse(sbi->s_bmap_bh); + sbi->s_bmap_bh = NULL; + sbi->s_last_bmap = ~0; + kfree(sbi->s_bitmap); + sbi->s_bitmap = NULL; +} diff --git a/fs/affs/dir.c b/fs/affs/dir.c new file mode 100644 index 00000000..8ca8f3a5 --- /dev/null +++ b/fs/affs/dir.c @@ -0,0 +1,156 @@ +/* + * linux/fs/affs/dir.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + * + * affs directory handling functions + * + */ + +#include "affs.h" + +static int affs_readdir(struct file *, void *, filldir_t); + +const struct file_operations affs_dir_operations = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = affs_readdir, + .fsync = affs_file_fsync, +}; + +/* + * directories can handle most operations... 
+ */ +const struct inode_operations affs_dir_inode_operations = { + .create = affs_create, + .lookup = affs_lookup, + .link = affs_link, + .unlink = affs_unlink, + .symlink = affs_symlink, + .mkdir = affs_mkdir, + .rmdir = affs_rmdir, + .rename = affs_rename, + .setattr = affs_notify_change, +}; + +static int +affs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct buffer_head *dir_bh; + struct buffer_head *fh_bh; + unsigned char *name; + int namelen; + u32 i; + int hash_pos; + int chain_pos; + u32 f_pos; + u32 ino; + int stored; + int res; + + pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos); + + stored = 0; + res = -EIO; + dir_bh = NULL; + fh_bh = NULL; + f_pos = filp->f_pos; + + if (f_pos == 0) { + filp->private_data = (void *)0; + if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0) + return 0; + filp->f_pos = f_pos = 1; + stored++; + } + if (f_pos == 1) { + if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0) + return stored; + filp->f_pos = f_pos = 2; + stored++; + } + + affs_lock_dir(inode); + chain_pos = (f_pos - 2) & 0xffff; + hash_pos = (f_pos - 2) >> 16; + if (chain_pos == 0xffff) { + affs_warning(sb, "readdir", "More than 65535 entries in chain"); + chain_pos = 0; + hash_pos++; + filp->f_pos = ((hash_pos << 16) | chain_pos) + 2; + } + dir_bh = affs_bread(sb, inode->i_ino); + if (!dir_bh) + goto readdir_out; + + /* If the directory hasn't changed since the last call to readdir(), + * we can jump directly to where we left off. + */ + ino = (u32)(long)filp->private_data; + if (ino && filp->f_version == inode->i_version) { + pr_debug("AFFS: readdir() left off=%d\n", ino); + goto inside; + } + + ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); + for (i = 0; ino && i < chain_pos; i++) { + fh_bh = affs_bread(sb, ino); + if (!fh_bh) { + affs_error(sb, "readdir","Cannot read block %d", i); + goto readdir_out; + } + ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); + affs_brelse(fh_bh); + fh_bh = NULL; + } + if (ino) + goto inside; + hash_pos++; + + for (; hash_pos < AFFS_SB(sb)->s_hashsize; hash_pos++) { + ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); + if (!ino) + continue; + f_pos = (hash_pos << 16) + 2; +inside: + do { + fh_bh = affs_bread(sb, ino); + if (!fh_bh) { + affs_error(sb, "readdir","Cannot read block %d", ino); + goto readdir_done; + } + + namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); + name = AFFS_TAIL(sb, fh_bh)->name + 1; + pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", + namelen, name, ino, hash_pos, f_pos); + if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0) + goto readdir_done; + stored++; + f_pos++; + ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); + affs_brelse(fh_bh); + fh_bh = NULL; + } while (ino); + } +readdir_done: + filp->f_pos = f_pos; + filp->f_version = inode->i_version; + filp->private_data = (void *)(long)ino; + res = stored; + +readdir_out: + affs_brelse(dir_bh); + affs_brelse(fh_bh); + affs_unlock_dir(inode); + pr_debug("AFFS: readdir()=%d\n", stored); + return res; +} diff --git a/fs/affs/file.c b/fs/affs/file.c new file mode 100644 index 00000000..acf321b7 --- /dev/null +++ b/fs/affs/file.c @@ -0,0 +1,936 @@ +/* + * linux/fs/affs/file.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. 
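A note on the f_pos encoding used by affs_readdir() above: positions 0 and 1 are reserved for "." and "..", the low 16 bits of everything after that count entries along one hash chain, and the high bits select the hash-table slot. An illustrative stand-alone version of the same packing:

#include <stdio.h>
#include <stdint.h>

/* pack/unpack a directory position the way affs_readdir() does */
static uint32_t affs_pos_pack(uint32_t hash_pos, uint32_t chain_pos)
{
        return ((hash_pos << 16) | chain_pos) + 2;
}

static void affs_pos_unpack(uint32_t f_pos, uint32_t *hash_pos, uint32_t *chain_pos)
{
        *chain_pos = (f_pos - 2) & 0xffff;
        *hash_pos  = (f_pos - 2) >> 16;
}

int main(void)
{
        uint32_t h, c;

        affs_pos_unpack(affs_pos_pack(5, 3), &h, &c);
        printf("hash slot %u, entry %u within that chain\n", h, c); /* 5, 3 */
        return 0;
}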
+ * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + * + * affs regular file handling primitives + */ + +#include "affs.h" + +#if PAGE_SIZE < 4096 +#error PAGE_SIZE must be at least 4096 +#endif + +static int affs_grow_extcache(struct inode *inode, u32 lc_idx); +static struct buffer_head *affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext); +static inline struct buffer_head *affs_get_extblock(struct inode *inode, u32 ext); +static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); +static int affs_file_open(struct inode *inode, struct file *filp); +static int affs_file_release(struct inode *inode, struct file *filp); + +const struct file_operations affs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .open = affs_file_open, + .release = affs_file_release, + .fsync = affs_file_fsync, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations affs_file_inode_operations = { + .truncate = affs_truncate, + .setattr = affs_notify_change, +}; + +static int +affs_file_open(struct inode *inode, struct file *filp) +{ + pr_debug("AFFS: open(%lu,%d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + atomic_inc(&AFFS_I(inode)->i_opencnt); + return 0; +} + +static int +affs_file_release(struct inode *inode, struct file *filp) +{ + pr_debug("AFFS: release(%lu, %d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + + if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { + mutex_lock(&inode->i_mutex); + if (inode->i_size != AFFS_I(inode)->mmu_private) + affs_truncate(inode); + affs_free_prealloc(inode); + mutex_unlock(&inode->i_mutex); + } + + return 0; +} + +static int +affs_grow_extcache(struct inode *inode, u32 lc_idx) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + u32 lc_max; + int i, j, key; + + if (!AFFS_I(inode)->i_lc) { + char *ptr = (char *)get_zeroed_page(GFP_NOFS); + if (!ptr) + return -ENOMEM; + AFFS_I(inode)->i_lc = (u32 *)ptr; + AFFS_I(inode)->i_ac = (struct affs_ext_key *)(ptr + AFFS_CACHE_SIZE / 2); + } + + lc_max = AFFS_LC_SIZE << AFFS_I(inode)->i_lc_shift; + + if (AFFS_I(inode)->i_extcnt > lc_max) { + u32 lc_shift, lc_mask, tmp, off; + + /* need to recalculate linear cache, start from old size */ + lc_shift = AFFS_I(inode)->i_lc_shift; + tmp = (AFFS_I(inode)->i_extcnt / AFFS_LC_SIZE) >> lc_shift; + for (; tmp; tmp >>= 1) + lc_shift++; + lc_mask = (1 << lc_shift) - 1; + + /* fix idx and old size to new shift */ + lc_idx >>= (lc_shift - AFFS_I(inode)->i_lc_shift); + AFFS_I(inode)->i_lc_size >>= (lc_shift - AFFS_I(inode)->i_lc_shift); + + /* first shrink old cache to make more space */ + off = 1 << (lc_shift - AFFS_I(inode)->i_lc_shift); + for (i = 1, j = off; j < AFFS_LC_SIZE; i++, j += off) + AFFS_I(inode)->i_ac[i] = AFFS_I(inode)->i_ac[j]; + + AFFS_I(inode)->i_lc_shift = lc_shift; + AFFS_I(inode)->i_lc_mask = lc_mask; + } + + /* fill cache to the needed index */ + i = AFFS_I(inode)->i_lc_size; + AFFS_I(inode)->i_lc_size = lc_idx + 1; + for (; i <= lc_idx; i++) { + if (!i) { + AFFS_I(inode)->i_lc[0] = inode->i_ino; + continue; + } + key = AFFS_I(inode)->i_lc[i - 1]; + j = AFFS_I(inode)->i_lc_mask + 1; + // unlock cache + for (; j > 0; j--) { + bh = affs_bread(sb, key); + if (!bh) + goto err; + key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + 
affs_brelse(bh); + } + // lock cache + AFFS_I(inode)->i_lc[i] = key; + } + + return 0; + +err: + // lock cache + return -EIO; +} + +static struct buffer_head * +affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh; + u32 blocknr, tmp; + + blocknr = affs_alloc_block(inode, bh->b_blocknr); + if (!blocknr) + return ERR_PTR(-ENOSPC); + + new_bh = affs_getzeroblk(sb, blocknr); + if (!new_bh) { + affs_free_block(sb, blocknr); + return ERR_PTR(-EIO); + } + + AFFS_HEAD(new_bh)->ptype = cpu_to_be32(T_LIST); + AFFS_HEAD(new_bh)->key = cpu_to_be32(blocknr); + AFFS_TAIL(sb, new_bh)->stype = cpu_to_be32(ST_FILE); + AFFS_TAIL(sb, new_bh)->parent = cpu_to_be32(inode->i_ino); + affs_fix_checksum(sb, new_bh); + + mark_buffer_dirty_inode(new_bh, inode); + + tmp = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + if (tmp) + affs_warning(sb, "alloc_ext", "previous extension set (%x)", tmp); + AFFS_TAIL(sb, bh)->extension = cpu_to_be32(blocknr); + affs_adjust_checksum(bh, blocknr - tmp); + mark_buffer_dirty_inode(bh, inode); + + AFFS_I(inode)->i_extcnt++; + mark_inode_dirty(inode); + + return new_bh; +} + +static inline struct buffer_head * +affs_get_extblock(struct inode *inode, u32 ext) +{ + /* inline the simplest case: same extended block as last time */ + struct buffer_head *bh = AFFS_I(inode)->i_ext_bh; + if (ext == AFFS_I(inode)->i_ext_last) + get_bh(bh); + else + /* we have to do more (not inlined) */ + bh = affs_get_extblock_slow(inode, ext); + + return bh; +} + +static struct buffer_head * +affs_get_extblock_slow(struct inode *inode, u32 ext) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + u32 ext_key; + u32 lc_idx, lc_off, ac_idx; + u32 tmp, idx; + + if (ext == AFFS_I(inode)->i_ext_last + 1) { + /* read the next extended block from the current one */ + bh = AFFS_I(inode)->i_ext_bh; + ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + if (ext < AFFS_I(inode)->i_extcnt) + goto read_ext; + if (ext > AFFS_I(inode)->i_extcnt) + BUG(); + bh = affs_alloc_extblock(inode, bh, ext); + if (IS_ERR(bh)) + return bh; + goto store_ext; + } + + if (ext == 0) { + /* we seek back to the file header block */ + ext_key = inode->i_ino; + goto read_ext; + } + + if (ext >= AFFS_I(inode)->i_extcnt) { + struct buffer_head *prev_bh; + + /* allocate a new extended block */ + if (ext > AFFS_I(inode)->i_extcnt) + BUG(); + + /* get previous extended block */ + prev_bh = affs_get_extblock(inode, ext - 1); + if (IS_ERR(prev_bh)) + return prev_bh; + bh = affs_alloc_extblock(inode, prev_bh, ext); + affs_brelse(prev_bh); + if (IS_ERR(bh)) + return bh; + goto store_ext; + } + +again: + /* check if there is an extended cache and whether it's large enough */ + lc_idx = ext >> AFFS_I(inode)->i_lc_shift; + lc_off = ext & AFFS_I(inode)->i_lc_mask; + + if (lc_idx >= AFFS_I(inode)->i_lc_size) { + int err; + + err = affs_grow_extcache(inode, lc_idx); + if (err) + return ERR_PTR(err); + goto again; + } + + /* every n'th key we find in the linear cache */ + if (!lc_off) { + ext_key = AFFS_I(inode)->i_lc[lc_idx]; + goto read_ext; + } + + /* maybe it's still in the associative cache */ + ac_idx = (ext - lc_idx - 1) & AFFS_AC_MASK; + if (AFFS_I(inode)->i_ac[ac_idx].ext == ext) { + ext_key = AFFS_I(inode)->i_ac[ac_idx].key; + goto read_ext; + } + + /* try to find one of the previous extended blocks */ + tmp = ext; + idx = ac_idx; + while (--tmp, --lc_off > 0) { + idx = (idx - 1) & AFFS_AC_MASK; + if (AFFS_I(inode)->i_ac[idx].ext == tmp) 
{ + ext_key = AFFS_I(inode)->i_ac[idx].key; + goto find_ext; + } + } + + /* fall back to the linear cache */ + ext_key = AFFS_I(inode)->i_lc[lc_idx]; +find_ext: + /* read all extended blocks until we find the one we need */ + //unlock cache + do { + bh = affs_bread(sb, ext_key); + if (!bh) + goto err_bread; + ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); + affs_brelse(bh); + tmp++; + } while (tmp < ext); + //lock cache + + /* store it in the associative cache */ + // recalculate ac_idx? + AFFS_I(inode)->i_ac[ac_idx].ext = ext; + AFFS_I(inode)->i_ac[ac_idx].key = ext_key; + +read_ext: + /* finally read the right extended block */ + //unlock cache + bh = affs_bread(sb, ext_key); + if (!bh) + goto err_bread; + //lock cache + +store_ext: + /* release old cached extended block and store the new one */ + affs_brelse(AFFS_I(inode)->i_ext_bh); + AFFS_I(inode)->i_ext_last = ext; + AFFS_I(inode)->i_ext_bh = bh; + get_bh(bh); + + return bh; + +err_bread: + affs_brelse(bh); + return ERR_PTR(-EIO); +} + +static int +affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *ext_bh; + u32 ext; + + pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block); + + BUG_ON(block > (sector_t)0x7fffffffUL); + + if (block >= AFFS_I(inode)->i_blkcnt) { + if (block > AFFS_I(inode)->i_blkcnt || !create) + goto err_big; + } else + create = 0; + + //lock cache + affs_lock_ext(inode); + + ext = (u32)block / AFFS_SB(sb)->s_hashsize; + block -= ext * AFFS_SB(sb)->s_hashsize; + ext_bh = affs_get_extblock(inode, ext); + if (IS_ERR(ext_bh)) + goto err_ext; + map_bh(bh_result, sb, (sector_t)be32_to_cpu(AFFS_BLOCK(sb, ext_bh, block))); + + if (create) { + u32 blocknr = affs_alloc_block(inode, ext_bh->b_blocknr); + if (!blocknr) + goto err_alloc; + set_buffer_new(bh_result); + AFFS_I(inode)->mmu_private += AFFS_SB(sb)->s_data_blksize; + AFFS_I(inode)->i_blkcnt++; + + /* store new block */ + if (bh_result->b_blocknr) + affs_warning(sb, "get_block", "block already set (%x)", bh_result->b_blocknr); + AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); + AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); + affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); + bh_result->b_blocknr = blocknr; + + if (!block) { + /* insert first block into header block */ + u32 tmp = be32_to_cpu(AFFS_HEAD(ext_bh)->first_data); + if (tmp) + affs_warning(sb, "get_block", "first block already set (%d)", tmp); + AFFS_HEAD(ext_bh)->first_data = cpu_to_be32(blocknr); + affs_adjust_checksum(ext_bh, blocknr - tmp); + } + } + + affs_brelse(ext_bh); + //unlock cache + affs_unlock_ext(inode); + return 0; + +err_big: + affs_error(inode->i_sb,"get_block","strange block request %d", block); + return -EIO; +err_ext: + // unlock cache + affs_unlock_ext(inode); + return PTR_ERR(ext_bh); +err_alloc: + brelse(ext_bh); + clear_buffer_mapped(bh_result); + bh_result->b_bdev = NULL; + // unlock cache + affs_unlock_ext(inode); + return -ENOSPC; +} + +static int affs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, affs_get_block, wbc); +} + +static int affs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, affs_get_block); +} + +static int affs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = 
cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + affs_get_block, + &AFFS_I(mapping->host)->mmu_private); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t _affs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,affs_get_block); +} + +const struct address_space_operations affs_aops = { + .readpage = affs_readpage, + .writepage = affs_writepage, + .write_begin = affs_write_begin, + .write_end = generic_write_end, + .bmap = _affs_bmap +}; + +static inline struct buffer_head * +affs_bread_ino(struct inode *inode, int block, int create) +{ + struct buffer_head *bh, tmp_bh; + int err; + + tmp_bh.b_state = 0; + err = affs_get_block(inode, block, &tmp_bh, create); + if (!err) { + bh = affs_bread(inode->i_sb, tmp_bh.b_blocknr); + if (bh) { + bh->b_state |= tmp_bh.b_state; + return bh; + } + err = -EIO; + } + return ERR_PTR(err); +} + +static inline struct buffer_head * +affs_getzeroblk_ino(struct inode *inode, int block) +{ + struct buffer_head *bh, tmp_bh; + int err; + + tmp_bh.b_state = 0; + err = affs_get_block(inode, block, &tmp_bh, 1); + if (!err) { + bh = affs_getzeroblk(inode->i_sb, tmp_bh.b_blocknr); + if (bh) { + bh->b_state |= tmp_bh.b_state; + return bh; + } + err = -EIO; + } + return ERR_PTR(err); +} + +static inline struct buffer_head * +affs_getemptyblk_ino(struct inode *inode, int block) +{ + struct buffer_head *bh, tmp_bh; + int err; + + tmp_bh.b_state = 0; + err = affs_get_block(inode, block, &tmp_bh, 1); + if (!err) { + bh = affs_getemptyblk(inode->i_sb, tmp_bh.b_blocknr); + if (bh) { + bh->b_state |= tmp_bh.b_state; + return bh; + } + err = -EIO; + } + return ERR_PTR(err); +} + +static int +affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + char *data; + u32 bidx, boff, bsize; + u32 tmp; + + pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to); + BUG_ON(from > to || to > PAGE_CACHE_SIZE); + kmap(page); + data = page_address(page); + bsize = AFFS_SB(sb)->s_data_blksize; + tmp = (page->index << PAGE_CACHE_SHIFT) + from; + bidx = tmp / bsize; + boff = tmp % bsize; + + while (from < to) { + bh = affs_bread_ino(inode, bidx, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + tmp = min(bsize - boff, to - from); + BUG_ON(from + tmp > to || tmp > bsize); + memcpy(data + from, AFFS_DATA(bh) + boff, tmp); + affs_brelse(bh); + bidx++; + from += tmp; + boff = 0; + } + flush_dcache_page(page); + kunmap(page); + return 0; +} + +static int +affs_extent_file_ofs(struct inode *inode, u32 newsize) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh, *prev_bh; + u32 bidx, boff; + u32 size, bsize; + u32 tmp; + + pr_debug("AFFS: extent_file(%u, %d)\n", (u32)inode->i_ino, newsize); + bsize = AFFS_SB(sb)->s_data_blksize; + bh = NULL; + size = AFFS_I(inode)->mmu_private; + bidx = size / bsize; + boff = size % bsize; + if (boff) { + bh = affs_bread_ino(inode, bidx, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + tmp = min(bsize - boff, newsize - size); + BUG_ON(boff + tmp > bsize || tmp > bsize); + memset(AFFS_DATA(bh) + boff, 0, tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + size += tmp; + bidx++; + } else if (bidx) { + bh = affs_bread_ino(inode, bidx - 1, 0); 
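The ext/block split performed by affs_get_block() above is plain modular arithmetic: every file header or extension block carries s_hashsize block pointers (blocksize/4 - 56, i.e. 72 on a 512-byte volume). A small stand-alone sketch with assumed example values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t blocksize  = 512;                 /* assumed example value */
        uint32_t hashsize   = blocksize / 4 - 56;  /* pointers per header/extension block: 72 */
        uint32_t file_block = 1000;                /* logical block within the file */

        /* the same split affs_get_block() performs before walking the extension chain */
        uint32_t ext  = file_block / hashsize;     /* extension index (0 = the file header) */
        uint32_t slot = file_block % hashsize;     /* entry in that block's pointer table */

        printf("file block %u -> extension %u, slot %u\n", file_block, ext, slot); /* 13, 64 */
        return 0;
}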
+ if (IS_ERR(bh)) + return PTR_ERR(bh); + } + + while (size < newsize) { + prev_bh = bh; + bh = affs_getzeroblk_ino(inode, bidx); + if (IS_ERR(bh)) + goto out; + tmp = min(bsize, newsize - size); + BUG_ON(tmp > bsize); + AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); + AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); + affs_fix_checksum(sb, bh); + bh->b_state &= ~(1UL << BH_New); + mark_buffer_dirty_inode(bh, inode); + if (prev_bh) { + u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + if (tmp) + affs_warning(sb, "extent_file_ofs", "next block already set for %d (%d)", bidx, tmp); + AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp); + mark_buffer_dirty_inode(prev_bh, inode); + affs_brelse(prev_bh); + } + size += bsize; + bidx++; + } + affs_brelse(bh); + inode->i_size = AFFS_I(inode)->mmu_private = newsize; + return 0; + +out: + inode->i_size = AFFS_I(inode)->mmu_private = newsize; + return PTR_ERR(bh); +} + +static int +affs_readpage_ofs(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + u32 to; + int err; + + pr_debug("AFFS: read_page(%u, %ld)\n", (u32)inode->i_ino, page->index); + to = PAGE_CACHE_SIZE; + if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) { + to = inode->i_size & ~PAGE_CACHE_MASK; + memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to); + } + + err = affs_do_readpage_ofs(file, page, 0, to); + if (!err) + SetPageUptodate(page); + unlock_page(page); + return err; +} + +static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index; + int err = 0; + + pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len); + if (pos > AFFS_I(inode)->mmu_private) { + /* XXX: this probably leaves a too-big i_size in case of + * failure. Should really be updating i_size at write_end time + */ + err = affs_extent_file_ofs(inode, pos); + if (err) + return err; + } + + index = pos >> PAGE_CACHE_SHIFT; + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if (PageUptodate(page)) + return 0; + + /* XXX: inefficient but safe in the face of short writes */ + err = affs_do_readpage_ofs(file, page, 0, PAGE_CACHE_SIZE); + if (err) { + unlock_page(page); + page_cache_release(page); + } + return err; +} + +static int affs_write_end_ofs(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh, *prev_bh; + char *data; + u32 bidx, boff, bsize; + unsigned from, to; + u32 tmp; + int written; + + from = pos & (PAGE_CACHE_SIZE - 1); + to = pos + len; + /* + * XXX: not sure if this can handle short copies (len < copied), but + * we don't have to, because the page should always be uptodate here, + * due to write_begin. 
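For reference, the per-block header that affs_extent_file_ofs() and affs_write_end_ofs() fill in through AFFS_DATA_HEAD() on OFS volumes takes six 32-bit words, which is why affs_fill_super() later subtracts 24 bytes from s_data_blksize. A rough user-space mirror of that layout (illustrative; field names follow the accessors used in the patch):

#include <stdio.h>
#include <stdint.h>

/* on-disk OFS data block header; all fields are big-endian on disk */
struct ofs_data_head {
        uint32_t ptype;      /* T_DATA */
        uint32_t key;        /* block number of the owning file header */
        uint32_t sequence;   /* index of this data block within the file */
        uint32_t size;       /* number of valid payload bytes in this block */
        uint32_t next;       /* next data block in the chain, 0 at the end */
        uint32_t checksum;
};

int main(void)
{
        uint32_t blocksize = 512;  /* assumed example value */
        uint32_t payload   = blocksize - (uint32_t)sizeof(struct ofs_data_head);

        /* matches s_data_blksize = s_blocksize - 24 on SF_OFS mounts */
        printf("payload per %u-byte OFS data block: %u bytes\n", blocksize, payload); /* 488 */
        return 0;
}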
+ */ + + pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len); + bsize = AFFS_SB(sb)->s_data_blksize; + data = page_address(page); + + bh = NULL; + written = 0; + tmp = (page->index << PAGE_CACHE_SHIFT) + from; + bidx = tmp / bsize; + boff = tmp % bsize; + if (boff) { + bh = affs_bread_ino(inode, bidx, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + tmp = min(bsize - boff, to - from); + BUG_ON(boff + tmp > bsize || tmp > bsize); + memcpy(AFFS_DATA(bh) + boff, data + from, tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + written += tmp; + from += tmp; + bidx++; + } else if (bidx) { + bh = affs_bread_ino(inode, bidx - 1, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + } + while (from + bsize <= to) { + prev_bh = bh; + bh = affs_getemptyblk_ino(inode, bidx); + if (IS_ERR(bh)) + goto out; + memcpy(AFFS_DATA(bh), data + from, bsize); + if (buffer_new(bh)) { + AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); + AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize); + AFFS_DATA_HEAD(bh)->next = 0; + bh->b_state &= ~(1UL << BH_New); + if (prev_bh) { + u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + if (tmp) + affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp); + AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp); + mark_buffer_dirty_inode(prev_bh, inode); + } + } + affs_brelse(prev_bh); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + written += bsize; + from += bsize; + bidx++; + } + if (from < to) { + prev_bh = bh; + bh = affs_bread_ino(inode, bidx, 1); + if (IS_ERR(bh)) + goto out; + tmp = min(bsize, to - from); + BUG_ON(tmp > bsize); + memcpy(AFFS_DATA(bh), data + from, tmp); + if (buffer_new(bh)) { + AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); + AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); + AFFS_DATA_HEAD(bh)->next = 0; + bh->b_state &= ~(1UL << BH_New); + if (prev_bh) { + u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + if (tmp) + affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp); + AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp); + mark_buffer_dirty_inode(prev_bh, inode); + } + } else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp) + AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); + affs_brelse(prev_bh); + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + written += tmp; + from += tmp; + bidx++; + } + SetPageUptodate(page); + +done: + affs_brelse(bh); + tmp = (page->index << PAGE_CACHE_SHIFT) + from; + if (tmp > inode->i_size) + inode->i_size = AFFS_I(inode)->mmu_private = tmp; + + unlock_page(page); + page_cache_release(page); + + return written; + +out: + bh = prev_bh; + if (!written) + written = PTR_ERR(bh); + goto done; +} + +const struct address_space_operations affs_aops_ofs = { + .readpage = affs_readpage_ofs, + //.writepage = affs_writepage_ofs, + .write_begin = affs_write_begin_ofs, + .write_end = affs_write_end_ofs +}; + +/* Free any preallocated blocks. 
*/ + +void +affs_free_prealloc(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + pr_debug("AFFS: free_prealloc(ino=%lu)\n", inode->i_ino); + + while (AFFS_I(inode)->i_pa_cnt) { + AFFS_I(inode)->i_pa_cnt--; + affs_free_block(sb, ++AFFS_I(inode)->i_lastalloc); + } +} + +/* Truncate (or enlarge) a file to the requested size. */ + +void +affs_truncate(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + u32 ext, ext_key; + u32 last_blk, blkcnt, blk; + u32 size; + struct buffer_head *ext_bh; + int i; + + pr_debug("AFFS: truncate(inode=%d, oldsize=%u, newsize=%u)\n", + (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size); + + last_blk = 0; + ext = 0; + if (inode->i_size) { + last_blk = ((u32)inode->i_size - 1) / AFFS_SB(sb)->s_data_blksize; + ext = last_blk / AFFS_SB(sb)->s_hashsize; + } + + if (inode->i_size > AFFS_I(inode)->mmu_private) { + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + u32 size = inode->i_size; + int res; + + res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); + if (!res) + res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata); + else + inode->i_size = AFFS_I(inode)->mmu_private; + mark_inode_dirty(inode); + return; + } else if (inode->i_size == AFFS_I(inode)->mmu_private) + return; + + // lock cache + ext_bh = affs_get_extblock(inode, ext); + if (IS_ERR(ext_bh)) { + affs_warning(sb, "truncate", "unexpected read error for ext block %u (%d)", + ext, PTR_ERR(ext_bh)); + return; + } + if (AFFS_I(inode)->i_lc) { + /* clear linear cache */ + i = (ext + 1) >> AFFS_I(inode)->i_lc_shift; + if (AFFS_I(inode)->i_lc_size > i) { + AFFS_I(inode)->i_lc_size = i; + for (; i < AFFS_LC_SIZE; i++) + AFFS_I(inode)->i_lc[i] = 0; + } + /* clear associative cache */ + for (i = 0; i < AFFS_AC_SIZE; i++) + if (AFFS_I(inode)->i_ac[i].ext >= ext) + AFFS_I(inode)->i_ac[i].ext = 0; + } + ext_key = be32_to_cpu(AFFS_TAIL(sb, ext_bh)->extension); + + blkcnt = AFFS_I(inode)->i_blkcnt; + i = 0; + blk = last_blk; + if (inode->i_size) { + i = last_blk % AFFS_SB(sb)->s_hashsize + 1; + blk++; + } else + AFFS_HEAD(ext_bh)->first_data = 0; + AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(i); + size = AFFS_SB(sb)->s_hashsize; + if (size > blkcnt - blk + i) + size = blkcnt - blk + i; + for (; i < size; i++, blk++) { + affs_free_block(sb, be32_to_cpu(AFFS_BLOCK(sb, ext_bh, i))); + AFFS_BLOCK(sb, ext_bh, i) = 0; + } + AFFS_TAIL(sb, ext_bh)->extension = 0; + affs_fix_checksum(sb, ext_bh); + mark_buffer_dirty_inode(ext_bh, inode); + affs_brelse(ext_bh); + + if (inode->i_size) { + AFFS_I(inode)->i_blkcnt = last_blk + 1; + AFFS_I(inode)->i_extcnt = ext + 1; + if (AFFS_SB(sb)->s_flags & SF_OFS) { + struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); + u32 tmp; + if (IS_ERR(bh)) { + affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", + ext, PTR_ERR(bh)); + return; + } + tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); + AFFS_DATA_HEAD(bh)->next = 0; + affs_adjust_checksum(bh, -tmp); + affs_brelse(bh); + } + } else { + AFFS_I(inode)->i_blkcnt = 0; + AFFS_I(inode)->i_extcnt = 1; + } + AFFS_I(inode)->mmu_private = inode->i_size; + // unlock cache + + while (ext_key) { + ext_bh = affs_bread(sb, ext_key); + size = AFFS_SB(sb)->s_hashsize; + if (size > blkcnt - blk) + size = blkcnt - blk; + for (i = 0; i < size; i++, blk++) + affs_free_block(sb, be32_to_cpu(AFFS_BLOCK(sb, ext_bh, i))); + affs_free_block(sb, ext_key); + ext_key = be32_to_cpu(AFFS_TAIL(sb, 
ext_bh)->extension); + affs_brelse(ext_bh); + } + affs_free_prealloc(inode); +} + +int affs_file_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int ret, err; + + ret = write_inode_now(inode, 0); + err = sync_blockdev(inode->i_sb->s_bdev); + if (!ret) + ret = err; + return ret; +} diff --git a/fs/affs/inode.c b/fs/affs/inode.c new file mode 100644 index 00000000..5d828903 --- /dev/null +++ b/fs/affs/inode.c @@ -0,0 +1,413 @@ +/* + * linux/fs/affs/inode.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ +#include +#include +#include "affs.h" + +extern const struct inode_operations affs_symlink_inode_operations; +extern struct timezone sys_tz; + +struct inode *affs_iget(struct super_block *sb, unsigned long ino) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct buffer_head *bh; + struct affs_head *head; + struct affs_tail *tail; + struct inode *inode; + u32 block; + u32 size; + u32 prot; + u16 id; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + pr_debug("AFFS: affs_iget(%lu)\n", inode->i_ino); + + block = inode->i_ino; + bh = affs_bread(sb, block); + if (!bh) { + affs_warning(sb, "read_inode", "Cannot read block %d", block); + goto bad_inode; + } + if (affs_checksum_block(sb, bh) || be32_to_cpu(AFFS_HEAD(bh)->ptype) != T_SHORT) { + affs_warning(sb,"read_inode", + "Checksum or type (ptype=%d) error on inode %d", + AFFS_HEAD(bh)->ptype, block); + goto bad_inode; + } + + head = AFFS_HEAD(bh); + tail = AFFS_TAIL(sb, bh); + prot = be32_to_cpu(tail->protect); + + inode->i_size = 0; + inode->i_nlink = 1; + inode->i_mode = 0; + AFFS_I(inode)->i_extcnt = 1; + AFFS_I(inode)->i_ext_last = ~1; + AFFS_I(inode)->i_protect = prot; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); + AFFS_I(inode)->i_blkcnt = 0; + AFFS_I(inode)->i_lc = NULL; + AFFS_I(inode)->i_lc_size = 0; + AFFS_I(inode)->i_lc_shift = 0; + AFFS_I(inode)->i_lc_mask = 0; + AFFS_I(inode)->i_ac = NULL; + AFFS_I(inode)->i_ext_bh = NULL; + AFFS_I(inode)->mmu_private = 0; + AFFS_I(inode)->i_lastalloc = 0; + AFFS_I(inode)->i_pa_cnt = 0; + + if (sbi->s_flags & SF_SETMODE) + inode->i_mode = sbi->s_mode; + else + inode->i_mode = prot_to_mode(prot); + + id = be16_to_cpu(tail->uid); + if (id == 0 || sbi->s_flags & SF_SETUID) + inode->i_uid = sbi->s_uid; + else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) + inode->i_uid = 0; + else + inode->i_uid = id; + + id = be16_to_cpu(tail->gid); + if (id == 0 || sbi->s_flags & SF_SETGID) + inode->i_gid = sbi->s_gid; + else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) + inode->i_gid = 0; + else + inode->i_gid = id; + + switch (be32_to_cpu(tail->stype)) { + case ST_ROOT: + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + /* fall through */ + case ST_USERDIR: + if (be32_to_cpu(tail->stype) == ST_USERDIR || + sbi->s_flags & SF_SETMODE) { + if (inode->i_mode & S_IRUSR) + inode->i_mode |= S_IXUSR; + if (inode->i_mode & S_IRGRP) + inode->i_mode |= S_IXGRP; + if (inode->i_mode & S_IROTH) + inode->i_mode |= S_IXOTH; + inode->i_mode |= S_IFDIR; + } else + inode->i_mode = S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR; + /* Maybe it should be controlled by mount parameter? 
*/ + //inode->i_mode |= S_ISVTX; + inode->i_op = &affs_dir_inode_operations; + inode->i_fop = &affs_dir_operations; + break; + case ST_LINKDIR: +#if 0 + affs_warning(sb, "read_inode", "inode is LINKDIR"); + goto bad_inode; +#else + inode->i_mode |= S_IFDIR; + /* ... and leave ->i_op and ->i_fop pointing to empty */ + break; +#endif + case ST_LINKFILE: + affs_warning(sb, "read_inode", "inode is LINKFILE"); + goto bad_inode; + case ST_FILE: + size = be32_to_cpu(tail->size); + inode->i_mode |= S_IFREG; + AFFS_I(inode)->mmu_private = inode->i_size = size; + if (inode->i_size) { + AFFS_I(inode)->i_blkcnt = (size - 1) / + sbi->s_data_blksize + 1; + AFFS_I(inode)->i_extcnt = (AFFS_I(inode)->i_blkcnt - 1) / + sbi->s_hashsize + 1; + } + if (tail->link_chain) + inode->i_nlink = 2; + inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; + inode->i_op = &affs_file_inode_operations; + inode->i_fop = &affs_file_operations; + break; + case ST_SOFTLINK: + inode->i_mode |= S_IFLNK; + inode->i_op = &affs_symlink_inode_operations; + inode->i_data.a_ops = &affs_symlink_aops; + break; + } + + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec + = (be32_to_cpu(tail->change.days) * (24 * 60 * 60) + + be32_to_cpu(tail->change.mins) * 60 + + be32_to_cpu(tail->change.ticks) / 50 + + ((8 * 365 + 2) * 24 * 60 * 60)) + + sys_tz.tz_minuteswest * 60; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0; + affs_brelse(bh); + unlock_new_inode(inode); + return inode; + +bad_inode: + affs_brelse(bh); + iget_failed(inode); + return ERR_PTR(-EIO); +} + +int +affs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + struct affs_tail *tail; + uid_t uid; + gid_t gid; + + pr_debug("AFFS: write_inode(%lu)\n",inode->i_ino); + + if (!inode->i_nlink) + // possibly free block + return 0; + bh = affs_bread(sb, inode->i_ino); + if (!bh) { + affs_error(sb,"write_inode","Cannot read block %lu",inode->i_ino); + return -EIO; + } + tail = AFFS_TAIL(sb, bh); + if (tail->stype == cpu_to_be32(ST_ROOT)) { + secs_to_datestamp(inode->i_mtime.tv_sec,&AFFS_ROOT_TAIL(sb, bh)->root_change); + } else { + tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect); + tail->size = cpu_to_be32(inode->i_size); + secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change); + if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) { + uid = inode->i_uid; + gid = inode->i_gid; + if (AFFS_SB(sb)->s_flags & SF_MUFS) { + if (inode->i_uid == 0 || inode->i_uid == 0xFFFF) + uid = inode->i_uid ^ ~0; + if (inode->i_gid == 0 || inode->i_gid == 0xFFFF) + gid = inode->i_gid ^ ~0; + } + if (!(AFFS_SB(sb)->s_flags & SF_SETUID)) + tail->uid = cpu_to_be16(uid); + if (!(AFFS_SB(sb)->s_flags & SF_SETGID)) + tail->gid = cpu_to_be16(gid); + } + } + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + affs_brelse(bh); + affs_free_prealloc(inode); + return 0; +} + +int +affs_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + pr_debug("AFFS: notify_change(%lu,0x%x)\n",inode->i_ino,attr->ia_valid); + + error = inode_change_ok(inode,attr); + if (error) + goto out; + + if (((attr->ia_valid & ATTR_UID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETUID)) || + ((attr->ia_valid & ATTR_GID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETGID)) || + ((attr->ia_valid & ATTR_MODE) && + (AFFS_SB(inode->i_sb)->s_flags & (SF_SETMODE | SF_IMMUTABLE)))) { + if (!(AFFS_SB(inode->i_sb)->s_flags 
& SF_QUIET)) + error = -EPERM; + goto out; + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + if (attr->ia_valid & ATTR_MODE) + mode_to_prot(inode); +out: + return error; +} + +void +affs_evict_inode(struct inode *inode) +{ + unsigned long cache_page; + pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); + truncate_inode_pages(&inode->i_data, 0); + + if (!inode->i_nlink) { + inode->i_size = 0; + affs_truncate(inode); + } + + invalidate_inode_buffers(inode); + end_writeback(inode); + affs_free_prealloc(inode); + cache_page = (unsigned long)AFFS_I(inode)->i_lc; + if (cache_page) { + pr_debug("AFFS: freeing ext cache\n"); + AFFS_I(inode)->i_lc = NULL; + AFFS_I(inode)->i_ac = NULL; + free_page(cache_page); + } + affs_brelse(AFFS_I(inode)->i_ext_bh); + AFFS_I(inode)->i_ext_last = ~1; + AFFS_I(inode)->i_ext_bh = NULL; + + if (!inode->i_nlink) + affs_free_block(inode->i_sb, inode->i_ino); +} + +struct inode * +affs_new_inode(struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + u32 block; + struct buffer_head *bh; + + if (!(inode = new_inode(sb))) + goto err_inode; + + if (!(block = affs_alloc_block(dir, dir->i_ino))) + goto err_block; + + bh = affs_getzeroblk(sb, block); + if (!bh) + goto err_bh; + mark_buffer_dirty_inode(bh, inode); + affs_brelse(bh); + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_ino = block; + inode->i_nlink = 1; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); + AFFS_I(inode)->i_blkcnt = 0; + AFFS_I(inode)->i_lc = NULL; + AFFS_I(inode)->i_lc_size = 0; + AFFS_I(inode)->i_lc_shift = 0; + AFFS_I(inode)->i_lc_mask = 0; + AFFS_I(inode)->i_ac = NULL; + AFFS_I(inode)->i_ext_bh = NULL; + AFFS_I(inode)->mmu_private = 0; + AFFS_I(inode)->i_protect = 0; + AFFS_I(inode)->i_lastalloc = 0; + AFFS_I(inode)->i_pa_cnt = 0; + AFFS_I(inode)->i_extcnt = 1; + AFFS_I(inode)->i_ext_last = ~1; + + insert_inode_hash(inode); + + return inode; + +err_bh: + affs_free_block(sb, block); +err_block: + iput(inode); +err_inode: + return NULL; +} + +/* + * Add an entry to a directory. Create the header block + * and insert it into the hash table. 
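Looking back at the timestamp handling in affs_iget() above: AFFS stores times as days/minutes/ticks (1/50 s) since the Amiga epoch of 1978-01-01, and the (8 * 365 + 2) * 24 * 60 * 60 term is the 1970-to-1978 offset (two leap days). A stand-alone sketch of the same conversion, ignoring the sys_tz correction for simplicity:

#include <stdio.h>
#include <stdint.h>

/* Amiga datestamp -> Unix time, mirroring the sum in affs_iget();
 * the timezone term (sys_tz.tz_minuteswest * 60) is assumed to be 0 here */
static long long amiga_to_unix(uint32_t days, uint32_t mins, uint32_t ticks)
{
        const long long amiga_epoch = (8LL * 365 + 2) * 24 * 60 * 60;  /* 252460800 */

        return (long long)days * 24 * 60 * 60 + (long long)mins * 60 + ticks / 50 + amiga_epoch;
}

int main(void)
{
        /* assumed example datestamp: day 0, minute 0, tick 0 is 1978-01-01 00:00 */
        printf("%lld\n", amiga_to_unix(0, 0, 0));  /* 252460800 */
        return 0;
}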
+ */ + +int +affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *inode_bh = NULL; + struct buffer_head *bh = NULL; + u32 block = 0; + int retval; + + pr_debug("AFFS: add_entry(dir=%u, inode=%u, \"%*s\", type=%d)\n", (u32)dir->i_ino, + (u32)inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name, type); + + retval = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto done; + + affs_lock_link(inode); + switch (type) { + case ST_LINKFILE: + case ST_LINKDIR: + retval = -ENOSPC; + block = affs_alloc_block(dir, dir->i_ino); + if (!block) + goto err; + retval = -EIO; + inode_bh = bh; + bh = affs_getzeroblk(sb, block); + if (!bh) + goto err; + break; + default: + break; + } + + AFFS_HEAD(bh)->ptype = cpu_to_be32(T_SHORT); + AFFS_HEAD(bh)->key = cpu_to_be32(bh->b_blocknr); + affs_copy_name(AFFS_TAIL(sb, bh)->name, dentry); + AFFS_TAIL(sb, bh)->stype = cpu_to_be32(type); + AFFS_TAIL(sb, bh)->parent = cpu_to_be32(dir->i_ino); + + if (inode_bh) { + __be32 chain; + chain = AFFS_TAIL(sb, inode_bh)->link_chain; + AFFS_TAIL(sb, bh)->original = cpu_to_be32(inode->i_ino); + AFFS_TAIL(sb, bh)->link_chain = chain; + AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); + affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); + mark_buffer_dirty_inode(inode_bh, inode); + inode->i_nlink = 2; + ihold(inode); + } + affs_fix_checksum(sb, bh); + mark_buffer_dirty_inode(bh, inode); + dentry->d_fsdata = (void *)(long)bh->b_blocknr; + + affs_lock_dir(dir); + retval = affs_insert_hash(dir, bh); + mark_buffer_dirty_inode(bh, inode); + affs_unlock_dir(dir); + affs_unlock_link(inode); + + d_instantiate(dentry, inode); +done: + affs_brelse(inode_bh); + affs_brelse(bh); + return retval; +err: + if (block) + affs_free_block(sb, block); + affs_unlock_link(inode); + goto done; +} diff --git a/fs/affs/namei.c b/fs/affs/namei.c new file mode 100644 index 00000000..e3e9efc1 --- /dev/null +++ b/fs/affs/namei.c @@ -0,0 +1,458 @@ +/* + * linux/fs/affs/namei.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ + +#include "affs.h" + +typedef int (*toupper_t)(int); + +static int affs_toupper(int ch); +static int affs_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); +static int affs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int affs_intl_toupper(int ch); +static int affs_intl_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); +static int affs_intl_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); + +const struct dentry_operations affs_dentry_operations = { + .d_hash = affs_hash_dentry, + .d_compare = affs_compare_dentry, +}; + +const struct dentry_operations affs_intl_dentry_operations = { + .d_hash = affs_intl_hash_dentry, + .d_compare = affs_intl_compare_dentry, +}; + + +/* Simple toupper() for DOS\1 */ + +static int +affs_toupper(int ch) +{ + return ch >= 'a' && ch <= 'z' ? 
ch -= ('a' - 'A') : ch; +} + +/* International toupper() for DOS\3 ("international") */ + +static int +affs_intl_toupper(int ch) +{ + return (ch >= 'a' && ch <= 'z') || (ch >= 0xE0 + && ch <= 0xFE && ch != 0xF7) ? + ch - ('a' - 'A') : ch; +} + +static inline toupper_t +affs_get_toupper(struct super_block *sb) +{ + return AFFS_SB(sb)->s_flags & SF_INTL ? affs_intl_toupper : affs_toupper; +} + +/* + * Note: the dentry argument is the parent dentry. + */ +static inline int +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) +{ + const u8 *name = qstr->name; + unsigned long hash; + int i; + + i = affs_check_name(qstr->name, qstr->len); + if (i) + return i; + + hash = init_name_hash(); + i = min(qstr->len, 30u); + for (; i > 0; name++, i--) + hash = partial_name_hash(toupper(*name), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +static int +affs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return __affs_hash_dentry(qstr, affs_toupper); +} +static int +affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return __affs_hash_dentry(qstr, affs_intl_toupper); +} + +static inline int __affs_compare_dentry(unsigned int len, + const char *str, const struct qstr *name, toupper_t toupper) +{ + const u8 *aname = str; + const u8 *bname = name->name; + + /* + * 'str' is the name of an already existing dentry, so the name + * must be valid. 'name' must be validated first. + */ + + if (affs_check_name(name->name, name->len)) + return 1; + + /* + * If the names are longer than the allowed 30 chars, + * the excess is ignored, so their length may differ. + */ + if (len >= 30) { + if (name->len < 30) + return 1; + len = 30; + } else if (len != name->len) + return 1; + + for (; len > 0; len--) + if (toupper(*aname++) != toupper(*bname++)) + return 1; + + return 0; +} + +static int +affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return __affs_compare_dentry(len, str, name, affs_toupper); +} +static int +affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return __affs_compare_dentry(len, str, name, affs_intl_toupper); +} + +/* + * NOTE! unlike strncmp, affs_match returns 1 for success, 0 for failure. 
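The comparison rule implemented by __affs_compare_dentry() above (case-insensitive under the simple DOS\1 toupper, with only the first 30 characters significant) can be written as a stand-alone helper. An illustrative sketch, not kernel code:

#include <stdio.h>
#include <string.h>

static int toupper_dos1(int ch)
{
        return ch >= 'a' && ch <= 'z' ? ch - ('a' - 'A') : ch;
}

/* returns 0 on match, 1 otherwise, like __affs_compare_dentry() */
static int affs_name_cmp(const char *a, const char *b)
{
        size_t la = strlen(a), lb = strlen(b);

        if (la >= 30) {
                if (lb < 30)
                        return 1;
                la = 30;              /* anything past 30 characters is ignored */
        } else if (la != lb) {
                return 1;
        }

        while (la--)
                if (toupper_dos1(*a++) != toupper_dos1(*b++))
                        return 1;
        return 0;
}

int main(void)
{
        printf("%d\n", affs_name_cmp("Readme", "README"));   /* 0: match */
        printf("%d\n", affs_name_cmp("Readme", "README.1")); /* 1: different length */
        return 0;
}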
+ */ + +static inline int +affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) +{ + const u8 *name = dentry->d_name.name; + int len = dentry->d_name.len; + + if (len >= 30) { + if (*name2 < 30) + return 0; + len = 30; + } else if (len != *name2) + return 0; + + for (name2++; len > 0; len--) + if (toupper(*name++) != toupper(*name2++)) + return 0; + return 1; +} + +int +affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len) +{ + toupper_t toupper = affs_get_toupper(sb); + int hash; + + hash = len = min(len, 30u); + for (; len > 0; len--) + hash = (hash * 13 + toupper(*name++)) & 0x7ff; + + return hash % AFFS_SB(sb)->s_hashsize; +} + +static struct buffer_head * +affs_find_entry(struct inode *dir, struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + toupper_t toupper = affs_get_toupper(sb); + u32 key; + + pr_debug("AFFS: find_entry(\"%.*s\")\n", (int)dentry->d_name.len, dentry->d_name.name); + + bh = affs_bread(sb, dir->i_ino); + if (!bh) + return ERR_PTR(-EIO); + + key = be32_to_cpu(AFFS_HEAD(bh)->table[affs_hash_name(sb, dentry->d_name.name, dentry->d_name.len)]); + + for (;;) { + affs_brelse(bh); + if (key == 0) + return NULL; + bh = affs_bread(sb, key); + if (!bh) + return ERR_PTR(-EIO); + if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, toupper)) + return bh; + key = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); + } +} + +struct dentry * +affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + struct inode *inode = NULL; + + pr_debug("AFFS: lookup(\"%.*s\")\n",(int)dentry->d_name.len,dentry->d_name.name); + + affs_lock_dir(dir); + bh = affs_find_entry(dir, dentry); + affs_unlock_dir(dir); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (bh) { + u32 ino = bh->b_blocknr; + + /* store the real header ino in d_fsdata for faster lookups */ + dentry->d_fsdata = (void *)(long)ino; + switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { + //link to dirs disabled + //case ST_LINKDIR: + case ST_LINKFILE: + ino = be32_to_cpu(AFFS_TAIL(sb, bh)->original); + } + affs_brelse(bh); + inode = affs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +int +affs_unlink(struct inode *dir, struct dentry *dentry) +{ + pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, + (int)dentry->d_name.len, dentry->d_name.name); + + return affs_remove_header(dentry); +} + +int +affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + int error; + + pr_debug("AFFS: create(%lu,\"%.*s\",0%o)\n",dir->i_ino,(int)dentry->d_name.len, + dentry->d_name.name,mode); + + inode = affs_new_inode(dir); + if (!inode) + return -ENOSPC; + + inode->i_mode = mode; + mode_to_prot(inode); + mark_inode_dirty(inode); + + inode->i_op = &affs_file_inode_operations; + inode->i_fop = &affs_file_operations; + inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? 
&affs_aops_ofs : &affs_aops; + error = affs_add_entry(dir, inode, dentry, ST_FILE); + if (error) { + inode->i_nlink = 0; + iput(inode); + return error; + } + return 0; +} + +int +affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + int error; + + pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%o)\n",dir->i_ino, + (int)dentry->d_name.len,dentry->d_name.name,mode); + + inode = affs_new_inode(dir); + if (!inode) + return -ENOSPC; + + inode->i_mode = S_IFDIR | mode; + mode_to_prot(inode); + + inode->i_op = &affs_dir_inode_operations; + inode->i_fop = &affs_dir_operations; + + error = affs_add_entry(dir, inode, dentry, ST_USERDIR); + if (error) { + inode->i_nlink = 0; + mark_inode_dirty(inode); + iput(inode); + return error; + } + return 0; +} + +int +affs_rmdir(struct inode *dir, struct dentry *dentry) +{ + pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, + (int)dentry->d_name.len, dentry->d_name.name); + + return affs_remove_header(dentry); +} + +int +affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + struct inode *inode; + char *p; + int i, maxlen, error; + char c, lc; + + pr_debug("AFFS: symlink(%lu,\"%.*s\" -> \"%s\")\n",dir->i_ino, + (int)dentry->d_name.len,dentry->d_name.name,symname); + + maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1; + inode = affs_new_inode(dir); + if (!inode) + return -ENOSPC; + + inode->i_op = &affs_symlink_inode_operations; + inode->i_data.a_ops = &affs_symlink_aops; + inode->i_mode = S_IFLNK | 0777; + mode_to_prot(inode); + + error = -EIO; + bh = affs_bread(sb, inode->i_ino); + if (!bh) + goto err; + i = 0; + p = (char *)AFFS_HEAD(bh)->table; + lc = '/'; + if (*symname == '/') { + struct affs_sb_info *sbi = AFFS_SB(sb); + while (*symname == '/') + symname++; + spin_lock(&sbi->symlink_lock); + while (sbi->s_volume[i]) /* Cannot overflow */ + *p++ = sbi->s_volume[i++]; + spin_unlock(&sbi->symlink_lock); + } + while (i < maxlen && (c = *symname++)) { + if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') { + *p++ = '/'; + i++; + symname += 2; + lc = '/'; + } else if (c == '.' 
&& lc == '/' && *symname == '/') { + symname++; + lc = '/'; + } else { + *p++ = c; + lc = c; + i++; + } + if (lc == '/') + while (*symname == '/') + symname++; + } + *p = 0; + mark_buffer_dirty_inode(bh, inode); + affs_brelse(bh); + mark_inode_dirty(inode); + + error = affs_add_entry(dir, inode, dentry, ST_SOFTLINK); + if (error) + goto err; + + return 0; + +err: + inode->i_nlink = 0; + mark_inode_dirty(inode); + iput(inode); + return error; +} + +int +affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + pr_debug("AFFS: link(%u, %u, \"%.*s\")\n", (u32)inode->i_ino, (u32)dir->i_ino, + (int)dentry->d_name.len,dentry->d_name.name); + + return affs_add_entry(dir, inode, dentry, ST_LINKFILE); +} + +int +affs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + struct buffer_head *bh = NULL; + int retval; + + pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", + (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, + (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); + + retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len); + if (retval) + return retval; + + /* Unlink destination if it already exists */ + if (new_dentry->d_inode) { + retval = affs_remove_header(new_dentry); + if (retval) + return retval; + } + + bh = affs_bread(sb, old_dentry->d_inode->i_ino); + if (!bh) + return -EIO; + + /* Remove header from its parent directory. */ + affs_lock_dir(old_dir); + retval = affs_remove_hash(old_dir, bh); + affs_unlock_dir(old_dir); + if (retval) + goto done; + + /* And insert it into the new directory with the new name. */ + affs_copy_name(AFFS_TAIL(sb, bh)->name, new_dentry); + affs_fix_checksum(sb, bh); + affs_lock_dir(new_dir); + retval = affs_insert_hash(new_dir, bh); + affs_unlock_dir(new_dir); + /* TODO: move it back to old_dir, if error? */ + +done: + mark_buffer_dirty_inode(bh, retval ? old_dir : new_dir); + affs_brelse(bh); + return retval; +} diff --git a/fs/affs/super.c b/fs/affs/super.c new file mode 100644 index 00000000..b31507d0 --- /dev/null +++ b/fs/affs/super.c @@ -0,0 +1,626 @@ +/* + * linux/fs/affs/inode.c + * + * (c) 1996 Hans-Joachim Widmaier - Rewritten + * + * (C) 1993 Ray Burr - Modified for Amiga FFS filesystem. + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. 
+ * + * (C) 1991 Linus Torvalds - minix filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include "affs.h" + +extern struct timezone sys_tz; + +static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); +static int affs_remount (struct super_block *sb, int *flags, char *data); + +static void +affs_commit_super(struct super_block *sb, int wait, int clean) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct buffer_head *bh = sbi->s_root_bh; + struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); + + tail->bm_flag = cpu_to_be32(clean); + secs_to_datestamp(get_seconds(), &tail->disk_change); + affs_fix_checksum(sb, bh); + mark_buffer_dirty(bh); + if (wait) + sync_dirty_buffer(bh); +} + +static void +affs_put_super(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + pr_debug("AFFS: put_super()\n"); + + if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt) + affs_commit_super(sb, 1, 1); + + kfree(sbi->s_prefix); + affs_free_bitmap(sb); + affs_brelse(sbi->s_root_bh); + kfree(sbi); + sb->s_fs_info = NULL; +} + +static void +affs_write_super(struct super_block *sb) +{ + lock_super(sb); + if (!(sb->s_flags & MS_RDONLY)) + affs_commit_super(sb, 1, 2); + sb->s_dirt = 0; + unlock_super(sb); + + pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds()); +} + +static int +affs_sync_fs(struct super_block *sb, int wait) +{ + lock_super(sb); + affs_commit_super(sb, wait, 2); + sb->s_dirt = 0; + unlock_super(sb); + return 0; +} + +static struct kmem_cache * affs_inode_cachep; + +static struct inode *affs_alloc_inode(struct super_block *sb) +{ + struct affs_inode_info *i; + + i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); + if (!i) + return NULL; + + i->vfs_inode.i_version = 1; + i->i_lc = NULL; + i->i_ext_bh = NULL; + i->i_pa_cnt = 0; + + return &i->vfs_inode; +} + +static void affs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); +} + +static void affs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, affs_i_callback); +} + +static void init_once(void *foo) +{ + struct affs_inode_info *ei = (struct affs_inode_info *) foo; + + sema_init(&ei->i_link_lock, 1); + sema_init(&ei->i_ext_lock, 1); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + affs_inode_cachep = kmem_cache_create("affs_inode_cache", + sizeof(struct affs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (affs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(affs_inode_cachep); +} + +static const struct super_operations affs_sops = { + .alloc_inode = affs_alloc_inode, + .destroy_inode = affs_destroy_inode, + .write_inode = affs_write_inode, + .evict_inode = affs_evict_inode, + .put_super = affs_put_super, + .write_super = affs_write_super, + .sync_fs = affs_sync_fs, + .statfs = affs_statfs, + .remount_fs = affs_remount, + .show_options = generic_show_options, +}; + +enum { + Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect, + Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, + Opt_verbose, Opt_volume, Opt_ignore, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_bs, "bs=%u"}, + {Opt_mode, "mode=%o"}, + {Opt_mufs, "mufs"}, + {Opt_prefix, "prefix=%s"}, + {Opt_protect, "protect"}, + {Opt_reserved, "reserved=%u"}, + {Opt_root, "root=%u"}, + {Opt_setgid, "setgid=%u"}, + {Opt_setuid, 
"setuid=%u"}, + {Opt_verbose, "verbose"}, + {Opt_volume, "volume=%s"}, + {Opt_ignore, "grpquota"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, + {Opt_err, NULL}, +}; + +static int +parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s32 *root, + int *blocksize, char **prefix, char *volume, unsigned long *mount_opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + /* Fill in defaults */ + + *uid = current_uid(); + *gid = current_gid(); + *reserved = 2; + *root = -1; + *blocksize = -1; + volume[0] = ':'; + volume[1] = 0; + *mount_opts = 0; + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token, n, option; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bs: + if (match_int(&args[0], &n)) + return 0; + if (n != 512 && n != 1024 && n != 2048 + && n != 4096) { + printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); + return 0; + } + *blocksize = n; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return 0; + *mode = option & 0777; + *mount_opts |= SF_SETMODE; + break; + case Opt_mufs: + *mount_opts |= SF_MUFS; + break; + case Opt_prefix: + *prefix = match_strdup(&args[0]); + if (!*prefix) + return 0; + *mount_opts |= SF_PREFIX; + break; + case Opt_protect: + *mount_opts |= SF_IMMUTABLE; + break; + case Opt_reserved: + if (match_int(&args[0], reserved)) + return 0; + break; + case Opt_root: + if (match_int(&args[0], root)) + return 0; + break; + case Opt_setgid: + if (match_int(&args[0], &option)) + return 0; + *gid = option; + *mount_opts |= SF_SETGID; + break; + case Opt_setuid: + if (match_int(&args[0], &option)) + return 0; + *uid = option; + *mount_opts |= SF_SETUID; + break; + case Opt_verbose: + *mount_opts |= SF_VERBOSE; + break; + case Opt_volume: { + char *vol = match_strdup(&args[0]); + if (!vol) + return 0; + strlcpy(volume, vol, 32); + kfree(vol); + break; + } + case Opt_ignore: + /* Silently ignore the quota options */ + break; + default: + printk("AFFS: Unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + return 1; +} + +/* This function definitely needs to be split up. Some fine day I'll + * hopefully have the guts to do so. Until then: sorry for the mess. + */ + +static int affs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct affs_sb_info *sbi; + struct buffer_head *root_bh = NULL; + struct buffer_head *boot_bh; + struct inode *root_inode = NULL; + s32 root_block; + int size, blocksize; + u32 chksum; + int num_bm; + int i, j; + s32 key; + uid_t uid; + gid_t gid; + int reserved; + unsigned long mount_flags; + int tmp_flags; /* fix remount prototype... */ + u8 sig[4]; + int ret = -EINVAL; + + save_mount_options(sb, data); + + pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options"); + + sb->s_magic = AFFS_SUPER_MAGIC; + sb->s_op = &affs_sops; + sb->s_flags |= MS_NODIRATIME; + + sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + mutex_init(&sbi->s_bmlock); + spin_lock_init(&sbi->symlink_lock); + + if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, + &blocksize,&sbi->s_prefix, + sbi->s_volume, &mount_flags)) { + printk(KERN_ERR "AFFS: Error parsing options\n"); + kfree(sbi->s_prefix); + kfree(sbi); + return -EINVAL; + } + /* N.B. 
after this point s_prefix must be released */ + + sbi->s_flags = mount_flags; + sbi->s_mode = i; + sbi->s_uid = uid; + sbi->s_gid = gid; + sbi->s_reserved= reserved; + + /* Get the size of the device in 512-byte blocks. + * If we later see that the partition uses bigger + * blocks, we will have to change it. + */ + + size = sb->s_bdev->bd_inode->i_size >> 9; + pr_debug("AFFS: initial blocksize=%d, #blocks=%d\n", 512, size); + + affs_set_blocksize(sb, PAGE_SIZE); + /* Try to find root block. Its location depends on the block size. */ + + i = 512; + j = 4096; + if (blocksize > 0) { + i = j = blocksize; + size = size / (blocksize / 512); + } + for (blocksize = i, key = 0; blocksize <= j; blocksize <<= 1, size >>= 1) { + sbi->s_root_block = root_block; + if (root_block < 0) + sbi->s_root_block = (reserved + size - 1) / 2; + pr_debug("AFFS: setting blocksize to %d\n", blocksize); + affs_set_blocksize(sb, blocksize); + sbi->s_partition_size = size; + + /* The root block location that was calculated above is not + * correct if the partition size is an odd number of 512- + * byte blocks, which will be rounded down to a number of + * 1024-byte blocks, and if there were an even number of + * reserved blocks. Ideally, all partition checkers should + * report the real number of blocks of the real blocksize, + * but since this just cannot be done, we have to try to + * find the root block anyways. In the above case, it is one + * block behind the calculated one. So we check this one, too. + */ + for (num_bm = 0; num_bm < 2; num_bm++) { + pr_debug("AFFS: Dev %s, trying root=%u, bs=%d, " + "size=%d, reserved=%d\n", + sb->s_id, + sbi->s_root_block + num_bm, + blocksize, size, reserved); + root_bh = affs_bread(sb, sbi->s_root_block + num_bm); + if (!root_bh) + continue; + if (!affs_checksum_block(sb, root_bh) && + be32_to_cpu(AFFS_ROOT_HEAD(root_bh)->ptype) == T_SHORT && + be32_to_cpu(AFFS_ROOT_TAIL(sb, root_bh)->stype) == ST_ROOT) { + sbi->s_hashsize = blocksize / 4 - 56; + sbi->s_root_block += num_bm; + key = 1; + goto got_root; + } + affs_brelse(root_bh); + root_bh = NULL; + } + } + if (!silent) + printk(KERN_ERR "AFFS: No valid root block on device %s\n", + sb->s_id); + goto out_error; + + /* N.B. after this point bh must be released */ +got_root: + root_block = sbi->s_root_block; + + /* Find out which kind of FS we have */ + boot_bh = sb_bread(sb, 0); + if (!boot_bh) { + printk(KERN_ERR "AFFS: Cannot read boot block\n"); + goto out_error; + } + memcpy(sig, boot_bh->b_data, 4); + brelse(boot_bh); + chksum = be32_to_cpu(*(__be32 *)sig); + + /* Dircache filesystems are compatible with non-dircache ones + * when reading. As long as they aren't supported, writing is + * not recommended. 
+ */ + if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS + || chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) { + printk(KERN_NOTICE "AFFS: Dircache FS - mounting %s read only\n", + sb->s_id); + sb->s_flags |= MS_RDONLY; + } + switch (chksum) { + case MUFS_FS: + case MUFS_INTLFFS: + case MUFS_DCFFS: + sbi->s_flags |= SF_MUFS; + /* fall thru */ + case FS_INTLFFS: + case FS_DCFFS: + sbi->s_flags |= SF_INTL; + break; + case MUFS_FFS: + sbi->s_flags |= SF_MUFS; + break; + case FS_FFS: + break; + case MUFS_OFS: + sbi->s_flags |= SF_MUFS; + /* fall thru */ + case FS_OFS: + sbi->s_flags |= SF_OFS; + sb->s_flags |= MS_NOEXEC; + break; + case MUFS_DCOFS: + case MUFS_INTLOFS: + sbi->s_flags |= SF_MUFS; + case FS_DCOFS: + case FS_INTLOFS: + sbi->s_flags |= SF_INTL | SF_OFS; + sb->s_flags |= MS_NOEXEC; + break; + default: + printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n", + sb->s_id, chksum); + goto out_error; + } + + if (mount_flags & SF_VERBOSE) { + u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; + printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", + len > 31 ? 31 : len, + AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1, + sig, sig[3] + '0', blocksize); + } + + sb->s_flags |= MS_NODEV | MS_NOSUID; + + sbi->s_data_blksize = sb->s_blocksize; + if (sbi->s_flags & SF_OFS) + sbi->s_data_blksize -= 24; + + /* Keep super block in cache */ + sbi->s_root_bh = root_bh; + /* N.B. after this point s_root_bh must be released */ + + tmp_flags = sb->s_flags; + if (affs_init_bitmap(sb, &tmp_flags)) + goto out_error; + sb->s_flags = tmp_flags; + + /* set up enough so that it can read an inode */ + + root_inode = affs_iget(sb, root_block); + if (IS_ERR(root_inode)) { + ret = PTR_ERR(root_inode); + goto out_error_noinode; + } + + if (AFFS_SB(sb)->s_flags & SF_INTL) + sb->s_d_op = &affs_intl_dentry_operations; + else + sb->s_d_op = &affs_dentry_operations; + + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) { + printk(KERN_ERR "AFFS: Get root inode failed\n"); + goto out_error; + } + + pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); + return 0; + + /* + * Begin the cascaded cleanup ... 
+ */ +out_error: + if (root_inode) + iput(root_inode); +out_error_noinode: + kfree(sbi->s_bitmap); + affs_brelse(root_bh); + kfree(sbi->s_prefix); + kfree(sbi); + sb->s_fs_info = NULL; + return ret; +} + +static int +affs_remount(struct super_block *sb, int *flags, char *data) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + int blocksize; + uid_t uid; + gid_t gid; + int mode; + int reserved; + int root_block; + unsigned long mount_flags; + int res = 0; + char *new_opts = kstrdup(data, GFP_KERNEL); + char volume[32]; + char *prefix = NULL; + + pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); + + *flags |= MS_NODIRATIME; + + memcpy(volume, sbi->s_volume, 32); + if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, + &blocksize, &prefix, volume, + &mount_flags)) { + kfree(prefix); + kfree(new_opts); + return -EINVAL; + } + + replace_mount_options(sb, new_opts); + + sbi->s_flags = mount_flags; + sbi->s_mode = mode; + sbi->s_uid = uid; + sbi->s_gid = gid; + /* protect against readers */ + spin_lock(&sbi->symlink_lock); + if (prefix) { + kfree(sbi->s_prefix); + sbi->s_prefix = prefix; + } + memcpy(sbi->s_volume, volume, 32); + spin_unlock(&sbi->symlink_lock); + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) { + affs_write_super(sb); + affs_free_bitmap(sb); + } else + res = affs_init_bitmap(sb, flags); + + return res; +} + +static int +affs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + int free; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size, + AFFS_SB(sb)->s_reserved); + + free = affs_count_free_blocks(sb); + buf->f_type = AFFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved; + buf->f_bfree = free; + buf->f_bavail = free; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = 30; + return 0; +} + +static struct dentry *affs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); +} + +static struct file_system_type affs_fs_type = { + .owner = THIS_MODULE, + .name = "affs", + .mount = affs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_affs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&affs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_affs_fs(void) +{ + unregister_filesystem(&affs_fs_type); + destroy_inodecache(); +} + +MODULE_DESCRIPTION("Amiga filesystem support for Linux"); +MODULE_LICENSE("GPL"); + +module_init(init_affs_fs) +module_exit(exit_affs_fs) diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c new file mode 100644 index 00000000..ee00f08c --- /dev/null +++ b/fs/affs/symlink.c @@ -0,0 +1,81 @@ +/* + * linux/fs/affs/symlink.c + * + * 1995 Hans-Joachim Widmaier - Modified for affs. 
+ * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * affs symlink handling code + */ + +#include "affs.h" + +static int affs_symlink_readpage(struct file *file, struct page *page) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + char *link = kmap(page); + struct slink_front *lf; + int err; + int i, j; + char c; + char lc; + + pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino); + + err = -EIO; + bh = affs_bread(inode->i_sb, inode->i_ino); + if (!bh) + goto fail; + i = 0; + j = 0; + lf = (struct slink_front *)bh->b_data; + lc = 0; + + if (strchr(lf->symname,':')) { /* Handle assign or volume name */ + struct affs_sb_info *sbi = AFFS_SB(inode->i_sb); + char *pf; + spin_lock(&sbi->symlink_lock); + pf = sbi->s_prefix ? sbi->s_prefix : "/"; + while (i < 1023 && (c = pf[i])) + link[i++] = c; + spin_unlock(&sbi->symlink_lock); + while (i < 1023 && lf->symname[j] != ':') + link[i++] = lf->symname[j++]; + if (i < 1023) + link[i++] = '/'; + j++; + lc = '/'; + } + while (i < 1023 && (c = lf->symname[j])) { + if (c == '/' && lc == '/' && i < 1020) { /* parent dir */ + link[i++] = '.'; + link[i++] = '.'; + } + link[i++] = c; + lc = c; + j++; + } + link[i] = '\0'; + affs_brelse(bh); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +const struct address_space_operations affs_symlink_aops = { + .readpage = affs_symlink_readpage, +}; + +const struct inode_operations affs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = affs_notify_change, +}; diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig new file mode 100644 index 00000000..8f975f25 --- /dev/null +++ b/fs/afs/Kconfig @@ -0,0 +1,30 @@ +config AFS_FS + tristate "Andrew File System support (AFS) (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select AF_RXRPC + select DNS_RESOLVER + help + If you say Y here, you will get an experimental Andrew File System + driver. It currently only supports unsecured read-only AFS access. + + See for more information. + + If unsure, say N. + +config AFS_DEBUG + bool "AFS dynamic debugging" + depends on AFS_FS + help + Say Y here to make runtime controllable debugging messages appear. + + See for more information. + + If unsure, say N. + +config AFS_FSCACHE + bool "Provide AFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y + help + Say Y here if you want AFS data to be cached locally on disk through + the generic filesystem cache manager diff --git a/fs/afs/Makefile b/fs/afs/Makefile new file mode 100644 index 00000000..4f64b95d --- /dev/null +++ b/fs/afs/Makefile @@ -0,0 +1,32 @@ +# +# Makefile for Red Hat Linux AFS client. +# + +afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o + +kafs-objs := \ + $(afs-cache-y) \ + callback.o \ + cell.o \ + cmservice.o \ + dir.o \ + file.o \ + flock.o \ + fsclient.o \ + inode.o \ + main.o \ + misc.o \ + mntpt.o \ + proc.o \ + rxrpc.o \ + security.o \ + server.o \ + super.o \ + netdevices.o \ + vlclient.o \ + vlocation.o \ + vnode.o \ + volume.o \ + write.o + +obj-$(CONFIG_AFS_FS) := kafs.o diff --git a/fs/afs/afs.h b/fs/afs/afs.h new file mode 100644 index 00000000..c548aa34 --- /dev/null +++ b/fs/afs/afs.h @@ -0,0 +1,177 @@ +/* AFS common types + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_H +#define AFS_H + +#include + +#define AFS_MAXCELLNAME 64 /* maximum length of a cell name */ +#define AFS_MAXVOLNAME 64 /* maximum length of a volume name */ +#define AFSNAMEMAX 256 /* maximum length of a filename plus NUL */ +#define AFSPATHMAX 1024 /* maximum length of a pathname plus NUL */ +#define AFSOPAQUEMAX 1024 /* maximum length of an opaque field */ + +typedef unsigned afs_volid_t; +typedef unsigned afs_vnodeid_t; +typedef unsigned long long afs_dataversion_t; + +typedef enum { + AFSVL_RWVOL, /* read/write volume */ + AFSVL_ROVOL, /* read-only volume */ + AFSVL_BACKVOL, /* backup volume */ +} __attribute__((packed)) afs_voltype_t; + +typedef enum { + AFS_FTYPE_INVALID = 0, + AFS_FTYPE_FILE = 1, + AFS_FTYPE_DIR = 2, + AFS_FTYPE_SYMLINK = 3, +} afs_file_type_t; + +typedef enum { + AFS_LOCK_READ = 0, /* read lock request */ + AFS_LOCK_WRITE = 1, /* write lock request */ +} afs_lock_type_t; + +#define AFS_LOCKWAIT (5 * 60) /* time until a lock times out (seconds) */ + +/* + * AFS file identifier + */ +struct afs_fid { + afs_volid_t vid; /* volume ID */ + afs_vnodeid_t vnode; /* file index within volume */ + unsigned unique; /* unique ID number (file index version) */ +}; + +/* + * AFS callback notification + */ +typedef enum { + AFSCM_CB_UNTYPED = 0, /* no type set on CB break */ + AFSCM_CB_EXCLUSIVE = 1, /* CB exclusive to CM [not implemented] */ + AFSCM_CB_SHARED = 2, /* CB shared by other CM's */ + AFSCM_CB_DROPPED = 3, /* CB promise cancelled by file server */ +} afs_callback_type_t; + +struct afs_callback { + struct afs_fid fid; /* file identifier */ + unsigned version; /* callback version */ + unsigned expiry; /* time at which expires */ + afs_callback_type_t type; /* type of callback */ +}; + +#define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ + +/* + * AFS volume information + */ +struct afs_volume_info { + afs_volid_t vid; /* volume ID */ + afs_voltype_t type; /* type of this volume */ + afs_volid_t type_vids[5]; /* volume ID's for possible types for this vol */ + + /* list of fileservers serving this volume */ + size_t nservers; /* number of entries used in servers[] */ + struct { + struct in_addr addr; /* fileserver address */ + } servers[8]; +}; + +/* + * AFS security ACE access mask + */ +typedef u32 afs_access_t; +#define AFS_ACE_READ 0x00000001U /* - permission to read a file/dir */ +#define AFS_ACE_WRITE 0x00000002U /* - permission to write/chmod a file */ +#define AFS_ACE_INSERT 0x00000004U /* - permission to create dirent in a dir */ +#define AFS_ACE_LOOKUP 0x00000008U /* - permission to lookup a file/dir in a dir */ +#define AFS_ACE_DELETE 0x00000010U /* - permission to delete a dirent from a dir */ +#define AFS_ACE_LOCK 0x00000020U /* - permission to lock a file */ +#define AFS_ACE_ADMINISTER 0x00000040U /* - permission to change ACL */ +#define AFS_ACE_USER_A 0x01000000U /* - 'A' user-defined permission */ +#define AFS_ACE_USER_B 0x02000000U /* - 'B' user-defined permission */ +#define AFS_ACE_USER_C 0x04000000U /* - 'C' user-defined permission */ +#define AFS_ACE_USER_D 0x08000000U /* - 'D' user-defined permission */ +#define AFS_ACE_USER_E 0x10000000U /* - 'E' user-defined permission */ +#define AFS_ACE_USER_F 
0x20000000U /* - 'F' user-defined permission */ +#define AFS_ACE_USER_G 0x40000000U /* - 'G' user-defined permission */ +#define AFS_ACE_USER_H 0x80000000U /* - 'H' user-defined permission */ + +/* + * AFS file status information + */ +struct afs_file_status { + unsigned if_version; /* interface version */ +#define AFS_FSTATUS_VERSION 1 + + afs_file_type_t type; /* file type */ + unsigned nlink; /* link count */ + u64 size; /* file size */ + afs_dataversion_t data_version; /* current data version */ + u32 author; /* author ID */ + u32 owner; /* owner ID */ + u32 group; /* group ID */ + afs_access_t caller_access; /* access rights for authenticated caller */ + afs_access_t anon_access; /* access rights for unauthenticated caller */ + umode_t mode; /* UNIX mode */ + struct afs_fid parent; /* parent dir ID for non-dirs only */ + time_t mtime_client; /* last time client changed data */ + time_t mtime_server; /* last time server changed data */ + s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ +}; + +/* + * AFS file status change request + */ +struct afs_store_status { + u32 mask; /* which bits of the struct are set */ + u32 mtime_client; /* last time client changed data */ + u32 owner; /* owner ID */ + u32 group; /* group ID */ + umode_t mode; /* UNIX mode */ +}; + +#define AFS_SET_MTIME 0x01 /* set the mtime */ +#define AFS_SET_OWNER 0x02 /* set the owner ID */ +#define AFS_SET_GROUP 0x04 /* set the group ID (unsupported?) */ +#define AFS_SET_MODE 0x08 /* set the UNIX mode */ +#define AFS_SET_SEG_SIZE 0x10 /* set the segment size (unsupported) */ + +/* + * AFS volume synchronisation information + */ +struct afs_volsync { + time_t creation; /* volume creation time */ +}; + +/* + * AFS volume status record + */ +struct afs_volume_status { + u32 vid; /* volume ID */ + u32 parent_id; /* parent volume ID */ + u8 online; /* true if volume currently online and available */ + u8 in_service; /* true if volume currently in service */ + u8 blessed; /* same as in_service */ + u8 needs_salvage; /* true if consistency checking required */ + u32 type; /* volume type (afs_voltype_t) */ + u32 min_quota; /* minimum space set aside (blocks) */ + u32 max_quota; /* maximum space this volume may occupy (blocks) */ + u32 blocks_in_use; /* space this volume currently occupies (blocks) */ + u32 part_blocks_avail; /* space available in volume's partition */ + u32 part_max_blocks; /* size of volume's partition */ +}; + +#define AFS_BLOCK_SIZE 1024 + +#endif /* AFS_H */ diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h new file mode 100644 index 00000000..255f5dd6 --- /dev/null +++ b/fs/afs/afs_cm.h @@ -0,0 +1,33 @@ +/* AFS Cache Manager definitions + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef AFS_CM_H +#define AFS_CM_H + +#define AFS_CM_PORT 7001 /* AFS cache manager port */ +#define CM_SERVICE 1 /* AFS Cache Manager service ID */ + +enum AFS_CM_Operations { + CBCallBack = 204, /* break callback promises */ + CBInitCallBackState = 205, /* initialise callback state */ + CBProbe = 206, /* probe client */ + CBGetLock = 207, /* get contents of CM lock table */ + CBGetCE = 208, /* get cache file description */ + CBGetXStatsVersion = 209, /* get version of extended statistics */ + CBGetXStats = 210, /* get contents of extended statistics data */ + CBInitCallBackState3 = 213, /* initialise callback state, version 3 */ + CBProbeUuid = 214, /* check the client hasn't rebooted */ + CBTellMeAboutYourself = 65538, /* get client capabilities */ +}; + +#define AFS_CAP_ERROR_TRANSLATION 0x1 + +#endif /* AFS_CM_H */ diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h new file mode 100644 index 00000000..eb647323 --- /dev/null +++ b/fs/afs/afs_fs.h @@ -0,0 +1,56 @@ +/* AFS File Service definitions + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_FS_H +#define AFS_FS_H + +#define AFS_FS_PORT 7000 /* AFS file server port */ +#define FS_SERVICE 1 /* AFS File Service ID */ + +enum AFS_FS_Operations { + FSFETCHDATA = 130, /* AFS Fetch file data */ + FSFETCHSTATUS = 132, /* AFS Fetch file status */ + FSSTOREDATA = 133, /* AFS Store file data */ + FSSTORESTATUS = 135, /* AFS Store file status */ + FSREMOVEFILE = 136, /* AFS Remove a file */ + FSCREATEFILE = 137, /* AFS Create a file */ + FSRENAME = 138, /* AFS Rename or move a file or directory */ + FSSYMLINK = 139, /* AFS Create a symbolic link */ + FSLINK = 140, /* AFS Create a hard link */ + FSMAKEDIR = 141, /* AFS Create a directory */ + FSREMOVEDIR = 142, /* AFS Remove a directory */ + FSGIVEUPCALLBACKS = 147, /* AFS Discard callback promises */ + FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */ + FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */ + FSGETROOTVOLUME = 151, /* AFS Get root volume name */ + FSSETLOCK = 156, /* AFS Request a file lock */ + FSEXTENDLOCK = 157, /* AFS Extend a file lock */ + FSRELEASELOCK = 158, /* AFS Release a file lock */ + FSLOOKUP = 161, /* AFS lookup file in directory */ + FSFETCHDATA64 = 65537, /* AFS Fetch file data */ + FSSTOREDATA64 = 65538, /* AFS Store file data */ +}; + +enum AFS_FS_Errors { + VSALVAGE = 101, /* volume needs salvaging */ + VNOVNODE = 102, /* no such file/dir (vnode) */ + VNOVOL = 103, /* no such volume or volume unavailable */ + VVOLEXISTS = 104, /* volume name already exists */ + VNOSERVICE = 105, /* volume not currently in service */ + VOFFLINE = 106, /* volume is currently offline (more info available [VVL-spec]) */ + VONLINE = 107, /* volume is already online */ + VDISKFULL = 108, /* disk partition is full */ + VOVERQUOTA = 109, /* volume's maximum quota exceeded */ + VBUSY = 110, /* volume is temporarily unavailable */ + VMOVED = 111, /* volume moved to new server - ask this FS where */ +}; + +#endif /* AFS_FS_H */ diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h new file mode 100644 index 00000000..8bbefe00 --- /dev/null +++ b/fs/afs/afs_vl.h @@ -0,0 +1,84 @@ +/* AFS Volume Location Service client interface + * + * 
Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef AFS_VL_H +#define AFS_VL_H + +#include "afs.h" + +#define AFS_VL_PORT 7003 /* volume location service port */ +#define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ + +enum AFSVL_Operations { + VLGETENTRYBYID = 503, /* AFS Get Cache Entry By ID operation ID */ + VLGETENTRYBYNAME = 504, /* AFS Get Cache Entry By Name operation ID */ + VLPROBE = 514, /* AFS Probe Volume Location Service operation ID */ +}; + +enum AFSVL_Errors { + AFSVL_IDEXIST = 363520, /* Volume Id entry exists in vl database */ + AFSVL_IO = 363521, /* I/O related error */ + AFSVL_NAMEEXIST = 363522, /* Volume name entry exists in vl database */ + AFSVL_CREATEFAIL = 363523, /* Internal creation failure */ + AFSVL_NOENT = 363524, /* No such entry */ + AFSVL_EMPTY = 363525, /* Vl database is empty */ + AFSVL_ENTDELETED = 363526, /* Entry is deleted (soft delete) */ + AFSVL_BADNAME = 363527, /* Volume name is illegal */ + AFSVL_BADINDEX = 363528, /* Index is out of range */ + AFSVL_BADVOLTYPE = 363529, /* Bad volume type */ + AFSVL_BADSERVER = 363530, /* Illegal server number (out of range) */ + AFSVL_BADPARTITION = 363531, /* Bad partition number */ + AFSVL_REPSFULL = 363532, /* Run out of space for Replication sites */ + AFSVL_NOREPSERVER = 363533, /* No such Replication server site exists */ + AFSVL_DUPREPSERVER = 363534, /* Replication site already exists */ + AFSVL_RWNOTFOUND = 363535, /* Parent R/W entry not found */ + AFSVL_BADREFCOUNT = 363536, /* Illegal Reference Count number */ + AFSVL_SIZEEXCEEDED = 363537, /* Vl size for attributes exceeded */ + AFSVL_BADENTRY = 363538, /* Bad incoming vl entry */ + AFSVL_BADVOLIDBUMP = 363539, /* Illegal max volid increment */ + AFSVL_IDALREADYHASHED = 363540, /* RO/BACK id already hashed */ + AFSVL_ENTRYLOCKED = 363541, /* Vl entry is already locked */ + AFSVL_BADVOLOPER = 363542, /* Bad volume operation code */ + AFSVL_BADRELLOCKTYPE = 363543, /* Bad release lock type */ + AFSVL_RERELEASE = 363544, /* Status report: last release was aborted */ + AFSVL_BADSERVERFLAG = 363545, /* Invalid replication site server flag */ + AFSVL_PERM = 363546, /* No permission access */ + AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */ +}; + +/* + * maps to "struct vldbentry" in vvl-spec.pdf + */ +struct afs_vldbentry { + char name[65]; /* name of volume (with NUL char) */ + afs_voltype_t type; /* volume type */ + unsigned num_servers; /* num servers that hold instances of this vol */ + unsigned clone_id; /* cloning ID */ + + unsigned flags; +#define AFS_VLF_RWEXISTS 0x1000 /* R/W volume exists */ +#define AFS_VLF_ROEXISTS 0x2000 /* R/O volume exists */ +#define AFS_VLF_BACKEXISTS 0x4000 /* backup volume exists */ + + afs_volid_t volume_ids[3]; /* volume IDs */ + + struct { + struct in_addr addr; /* server address */ + unsigned partition; /* partition ID on this server */ + unsigned flags; /* server specific flags */ +#define AFS_VLSF_NEWREPSITE 0x0001 /* unused */ +#define AFS_VLSF_ROVOL 0x0002 /* this server holds a R/O instance of the volume */ +#define AFS_VLSF_RWVOL 0x0004 /* this server holds a R/W instance of the volume */ +#define AFS_VLSF_BACKVOL 0x0008 /* this 
server holds a backup instance of the volume */ + } servers[8]; +}; + +#endif /* AFS_VL_H */ diff --git a/fs/afs/cache.c b/fs/afs/cache.c new file mode 100644 index 00000000..577763c3 --- /dev/null +++ b/fs/afs/cache.c @@ -0,0 +1,402 @@ +/* AFS caching stuff + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include "internal.h" + +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); + +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vlocation_cache_check_aux( + void *cookie_netfs_data, const void *buffer, uint16_t buflen); + +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); + +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size); +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen); +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen); +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); + +struct fscache_netfs afs_cache_netfs = { + .name = "afs", + .version = 0, +}; + +struct fscache_cookie_def afs_cell_cache_index_def = { + .name = "AFS.cell", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_cell_cache_get_key, + .get_aux = afs_cell_cache_get_aux, + .check_aux = afs_cell_cache_check_aux, +}; + +struct fscache_cookie_def afs_vlocation_cache_index_def = { + .name = "AFS.vldb", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_vlocation_cache_get_key, + .get_aux = afs_vlocation_cache_get_aux, + .check_aux = afs_vlocation_cache_check_aux, +}; + +struct fscache_cookie_def afs_volume_cache_index_def = { + .name = "AFS.volume", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = afs_volume_cache_get_key, +}; + +struct fscache_cookie_def afs_vnode_cache_index_def = { + .name = "AFS.vnode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = afs_vnode_cache_get_key, + .get_attr = afs_vnode_cache_get_attr, + .get_aux = afs_vnode_cache_get_aux, + .check_aux = afs_vnode_cache_check_aux, + .now_uncached = afs_vnode_cache_now_uncached, +}; + +/* + * set the key for the index entry + */ +static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_cell *cell = cookie_netfs_data; + uint16_t klen; + + _enter("%p,%p,%u", cell, buffer, bufmax); + + klen = strlen(cell->name); + if (klen > bufmax) + return 0; + + memcpy(buffer, cell->name, klen); + return klen; +} + +/* + * provide new auxiliary cache data + */ +static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, + void *buffer, 
uint16_t bufmax) +{ + const struct afs_cell *cell = cookie_netfs_data; + uint16_t dlen; + + _enter("%p,%p,%u", cell, buffer, bufmax); + + dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]); + dlen = min(dlen, bufmax); + dlen &= ~(sizeof(cell->vl_addrs[0]) - 1); + + memcpy(buffer, cell->vl_addrs, dlen); + return dlen; +} + +/* + * check that the auxiliary data indicates that the entry is still valid + */ +static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} + +/*****************************************************************************/ +/* + * set the key for the index entry + */ +static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t klen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name)); + if (klen > bufmax) + return 0; + + memcpy(buffer, vlocation->vldb.name, klen); + + _leave(" = %u", klen); + return klen; +} + +/* + * provide new auxiliary cache data + */ +static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); + + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen > bufmax) + return 0; + + memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxiliary data indicates that the entry is still valid + */ +static +enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + const struct afs_cache_vlocation *cvldb; + struct afs_vlocation *vlocation = cookie_netfs_data; + uint16_t dlen; + + _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(struct afs_cache_vlocation); + dlen -= offsetof(struct afs_cache_vlocation, nservers); + if (dlen != buflen) + return FSCACHE_CHECKAUX_OBSOLETE; + + cvldb = container_of(buffer, struct afs_cache_vlocation, nservers); + + /* if what's on disk is more valid than what's in memory, then use the + * VL record from the cache */ + if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) { + memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen); + vlocation->valid = 1; + _leave(" = SUCCESS [c->m]"); + return FSCACHE_CHECKAUX_OKAY; + } + + /* need to update the cache if the cached info differs */ + if (memcmp(&vlocation->vldb, buffer, dlen) != 0) { + /* delete if the volume IDs for this name differ */ + if (memcmp(&vlocation->vldb.vid, &cvldb->vid, + sizeof(cvldb->vid)) != 0 + ) { + _leave(" = OBSOLETE"); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = UPDATE"); + return FSCACHE_CHECKAUX_NEEDS_UPDATE; + } + + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} + +/*****************************************************************************/ +/* + * set the key for the volume index entry + */ +static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_volume *volume = cookie_netfs_data; + uint16_t klen; + + _enter("{%u},%p,%u", volume->type, buffer, bufmax); + + klen 
= sizeof(volume->type); + if (klen > bufmax) + return 0; + + memcpy(buffer, &volume->type, sizeof(volume->type)); + + _leave(" = %u", klen); + return klen; + +} + +/*****************************************************************************/ +/* + * set the key for the index entry + */ +static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t klen; + + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); + + klen = sizeof(vnode->fid.vnode); + if (klen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode)); + + _leave(" = %u", klen); + return klen; +} + +/* + * provide updated file attributes + */ +static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + + _enter("{%x,%x,%llx},", + vnode->fid.vnode, vnode->fid.unique, + vnode->status.data_version); + + *size = vnode->status.size; +} + +/* + * provide new auxiliary cache data + */ +static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%Lx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, bufmax); + + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique)); + buffer += sizeof(vnode->fid.unique); + memcpy(buffer, &vnode->status.data_version, + sizeof(vnode->status.data_version)); + + _leave(" = %u", dlen); + return dlen; +} + +/* + * check that the auxiliary data indicates that the entry is still valid + */ +static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + struct afs_vnode *vnode = cookie_netfs_data; + uint16_t dlen; + + _enter("{%x,%x,%llx},%p,%u", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, + buffer, buflen); + + /* check the size of the data is what we're expecting */ + dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); + if (dlen != buflen) { + _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + if (memcmp(buffer, + &vnode->fid.unique, + sizeof(vnode->fid.unique) + ) != 0) { + unsigned unique; + + memcpy(&unique, buffer, sizeof(unique)); + + _leave(" = OBSOLETE [uniq %x != %x]", + unique, vnode->fid.unique); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + if (memcmp(buffer + sizeof(vnode->fid.unique), + &vnode->status.data_version, + sizeof(vnode->status.data_version) + ) != 0) { + afs_dataversion_t version; + + memcpy(&version, buffer + sizeof(vnode->fid.unique), + sizeof(version)); + + _leave(" = OBSOLETE [vers %llx != %llx]", + version, vnode->status.data_version); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = SUCCESS"); + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * indication the cookie is no longer uncached + * - this function is called when the backing store currently caching a cookie + * is removed + * - the netfs should use this to clean up any markers indicating cached pages + * - this is mandatory for any object that may have data + */ +static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) +{ + struct afs_vnode *vnode = cookie_netfs_data; + struct pagevec pvec; + pgoff_t 
first; + int loop, nr_pages; + + _enter("{%x,%x,%Lx}", + vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); + + pagevec_init(&pvec, 0); + first = 0; + + for (;;) { + /* grab a bunch of pages to clean */ + nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } + + _leave(""); +} diff --git a/fs/afs/callback.c b/fs/afs/callback.c new file mode 100644 index 00000000..587ef512 --- /dev/null +++ b/fs/afs/callback.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Authors: David Woodhouse + * David Howells + * + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +#if 0 +unsigned afs_vnode_update_timeout = 10; +#endif /* 0 */ + +#define afs_breakring_space(server) \ + CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \ + ARRAY_SIZE((server)->cb_break)) + +//static void afs_callback_updater(struct work_struct *); + +static struct workqueue_struct *afs_callback_update_worker; + +/* + * allow the fileserver to request callback state (re-)initialisation + */ +void afs_init_callback_state(struct afs_server *server) +{ + struct afs_vnode *vnode; + + _enter("{%p}", server); + + spin_lock(&server->cb_lock); + + /* kill all the promises on record from this server */ + while (!RB_EMPTY_ROOT(&server->cb_promises)) { + vnode = rb_entry(server->cb_promises.rb_node, + struct afs_vnode, cb_promise); + _debug("UNPROMISE { vid=%x:%u uq=%u}", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * handle the data invalidation side of a callback being broken + */ +void afs_broken_callback_work(struct work_struct *work) +{ + struct afs_vnode *vnode = + container_of(work, struct afs_vnode, cb_broken_work); + + _enter(""); + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + return; + + /* we're only interested in dealing with a broken callback on *this* + * vnode and only if no-one else has dealt with it yet */ + if (!mutex_trylock(&vnode->validate_lock)) + return; /* someone else is dealing with it */ + + if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + if (S_ISDIR(vnode->vfs_inode.i_mode)) + afs_clear_permits(vnode); + + if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0) + goto out; + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto out; + + /* if the vnode's data version number changed then its contents + * are different */ + if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + afs_zap_data(vnode); + } + +out: + mutex_unlock(&vnode->validate_lock); + + /* avoid the potential race whereby the mutex_trylock() in this + * function happens again between the clear_bit() and the + * mutex_unlock() */ + if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + _debug("requeue"); + queue_work(afs_callback_update_worker, &vnode->cb_broken_work); + } + _leave(""); +} + +/* + * 
actually break a callback + */ +static void afs_break_callback(struct afs_server *server, + struct afs_vnode *vnode) +{ + _enter(""); + + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + + if (vnode->cb_promised) { + spin_lock(&vnode->lock); + + _debug("break callback"); + + spin_lock(&server->cb_lock); + if (vnode->cb_promised) { + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&server->cb_lock); + + queue_work(afs_callback_update_worker, &vnode->cb_broken_work); + if (list_empty(&vnode->granted_locks) && + !list_empty(&vnode->pending_locks)) + afs_lock_may_be_available(vnode); + spin_unlock(&vnode->lock); + } +} + +/* + * allow the fileserver to explicitly break one callback + * - happens when + * - the backing file is changed + * - a lock is released + */ +static void afs_break_one_callback(struct afs_server *server, + struct afs_fid *fid) +{ + struct afs_vnode *vnode; + struct rb_node *p; + + _debug("find"); + spin_lock(&server->fs_lock); + p = server->fs_vnodes.rb_node; + while (p) { + vnode = rb_entry(p, struct afs_vnode, server_rb); + if (fid->vid < vnode->fid.vid) + p = p->rb_left; + else if (fid->vid > vnode->fid.vid) + p = p->rb_right; + else if (fid->vnode < vnode->fid.vnode) + p = p->rb_left; + else if (fid->vnode > vnode->fid.vnode) + p = p->rb_right; + else if (fid->unique < vnode->fid.unique) + p = p->rb_left; + else if (fid->unique > vnode->fid.unique) + p = p->rb_right; + else + goto found; + } + + /* not found so we just ignore it (it may have moved to another + * server) */ +not_available: + _debug("not avail"); + spin_unlock(&server->fs_lock); + _leave(""); + return; + +found: + _debug("found"); + ASSERTCMP(server, ==, vnode->server); + + if (!igrab(AFS_VNODE_TO_I(vnode))) + goto not_available; + spin_unlock(&server->fs_lock); + + afs_break_callback(server, vnode); + iput(&vnode->vfs_inode); + _leave(""); +} + +/* + * allow the fileserver to break callback promises + */ +void afs_break_callbacks(struct afs_server *server, size_t count, + struct afs_callback callbacks[]) +{ + _enter("%p,%zu,", server, count); + + ASSERT(server != NULL); + ASSERTCMP(count, <=, AFSCBMAX); + + for (; count > 0; callbacks++, count--) { + _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }", + callbacks->fid.vid, + callbacks->fid.vnode, + callbacks->fid.unique, + callbacks->version, + callbacks->expiry, + callbacks->type + ); + afs_break_one_callback(server, &callbacks->fid); + } + + _leave(""); + return; +} + +/* + * record the callback for breaking + * - the caller must hold server->cb_lock + */ +static void afs_do_give_up_callback(struct afs_server *server, + struct afs_vnode *vnode) +{ + struct afs_callback *cb; + + _enter("%p,%p", server, vnode); + + cb = &server->cb_break[server->cb_break_head]; + cb->fid = vnode->fid; + cb->version = vnode->cb_version; + cb->expiry = vnode->cb_expiry; + cb->type = vnode->cb_type; + smp_wmb(); + server->cb_break_head = + (server->cb_break_head + 1) & + (ARRAY_SIZE(server->cb_break) - 1); + + /* defer the breaking of callbacks to try and collect as many as + * possible to ship in one operation */ + switch (atomic_inc_return(&server->cb_break_n)) { + case 1 ... 
AFSCBMAX - 1: + queue_delayed_work(afs_callback_update_worker, + &server->cb_break_work, HZ * 2); + break; + case AFSCBMAX: + afs_flush_callback_breaks(server); + break; + default: + break; + } + + ASSERT(server->cb_promises.rb_node != NULL); + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + _leave(""); +} + +/* + * discard the callback on a deleted item + */ +void afs_discard_callback_on_delete(struct afs_vnode *vnode) +{ + struct afs_server *server = vnode->server; + + _enter("%d", vnode->cb_promised); + + if (!vnode->cb_promised) { + _leave(" [not promised]"); + return; + } + + ASSERT(server != NULL); + + spin_lock(&server->cb_lock); + if (vnode->cb_promised) { + ASSERT(server->cb_promises.rb_node != NULL); + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * give up the callback registered for a vnode on the file server when the + * inode is being cleared + */ +void afs_give_up_callback(struct afs_vnode *vnode) +{ + struct afs_server *server = vnode->server; + + DECLARE_WAITQUEUE(myself, current); + + _enter("%d", vnode->cb_promised); + + _debug("GIVE UP INODE %p", &vnode->vfs_inode); + + if (!vnode->cb_promised) { + _leave(" [not promised]"); + return; + } + + ASSERT(server != NULL); + + spin_lock(&server->cb_lock); + if (vnode->cb_promised && afs_breakring_space(server) == 0) { + add_wait_queue(&server->cb_break_waitq, &myself); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!vnode->cb_promised || + afs_breakring_space(server) != 0) + break; + spin_unlock(&server->cb_lock); + schedule(); + spin_lock(&server->cb_lock); + } + remove_wait_queue(&server->cb_break_waitq, &myself); + __set_current_state(TASK_RUNNING); + } + + /* of course, it's always possible for the server to break this vnode's + * callback first... 
*/ + if (vnode->cb_promised) + afs_do_give_up_callback(server, vnode); + + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * dispatch a deferred give up callbacks operation + */ +void afs_dispatch_give_up_callbacks(struct work_struct *work) +{ + struct afs_server *server = + container_of(work, struct afs_server, cb_break_work.work); + + _enter(""); + + /* tell the fileserver to discard the callback promises it has + * - in the event of ENOMEM or some other error, we just forget that we + * had callbacks entirely, and the server will call us later to break + * them + */ + afs_fs_give_up_callbacks(server, &afs_async_call); +} + +/* + * flush the outstanding callback breaks on a server + */ +void afs_flush_callback_breaks(struct afs_server *server) +{ + cancel_delayed_work(&server->cb_break_work); + queue_delayed_work(afs_callback_update_worker, + &server->cb_break_work, 0); +} + +#if 0 +/* + * update a bunch of callbacks + */ +static void afs_callback_updater(struct work_struct *work) +{ + struct afs_server *server; + struct afs_vnode *vnode, *xvnode; + time_t now; + long timeout; + int ret; + + server = container_of(work, struct afs_server, updater); + + _enter(""); + + now = get_seconds(); + + /* find the first vnode to update */ + spin_lock(&server->cb_lock); + for (;;) { + if (RB_EMPTY_ROOT(&server->cb_promises)) { + spin_unlock(&server->cb_lock); + _leave(" [nothing]"); + return; + } + + vnode = rb_entry(rb_first(&server->cb_promises), + struct afs_vnode, cb_promise); + if (atomic_read(&vnode->usage) > 0) + break; + rb_erase(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = false; + } + + timeout = vnode->update_at - now; + if (timeout > 0) { + queue_delayed_work(afs_vnode_update_worker, + &afs_vnode_update, timeout * HZ); + spin_unlock(&server->cb_lock); + _leave(" [nothing]"); + return; + } + + list_del_init(&vnode->update); + atomic_inc(&vnode->usage); + spin_unlock(&server->cb_lock); + + /* we can now perform the update */ + _debug("update %s", vnode->vldb.name); + vnode->state = AFS_VL_UPDATING; + vnode->upd_rej_cnt = 0; + vnode->upd_busy_cnt = 0; + + ret = afs_vnode_update_record(vl, &vldb); + switch (ret) { + case 0: + afs_vnode_apply_update(vl, &vldb); + vnode->state = AFS_VL_UPDATING; + break; + case -ENOMEDIUM: + vnode->state = AFS_VL_VOLUME_DELETED; + break; + default: + vnode->state = AFS_VL_UNCERTAIN; + break; + } + + /* and then reschedule */ + _debug("reschedule"); + vnode->update_at = get_seconds() + afs_vnode_update_timeout; + + spin_lock(&server->cb_lock); + + if (!list_empty(&server->cb_promises)) { + /* next update in 10 minutes, but wait at least 1 second more + * than the newest record already queued so that we don't spam + * the VL server suddenly with lots of requests + */ + xvnode = list_entry(server->cb_promises.prev, + struct afs_vnode, update); + if (vnode->update_at <= xvnode->update_at) + vnode->update_at = xvnode->update_at + 1; + xvnode = list_entry(server->cb_promises.next, + struct afs_vnode, update); + timeout = xvnode->update_at - now; + if (timeout < 0) + timeout = 0; + } else { + timeout = afs_vnode_update_timeout; + } + + list_add_tail(&vnode->update, &server->cb_promises); + + _debug("timeout %ld", timeout); + queue_delayed_work(afs_vnode_update_worker, + &afs_vnode_update, timeout * HZ); + spin_unlock(&server->cb_lock); + afs_put_vnode(vl); +} +#endif + +/* + * initialise the callback update process + */ +int __init afs_callback_update_init(void) +{ + afs_callback_update_worker = + 
create_singlethread_workqueue("kafs_callbackd"); + return afs_callback_update_worker ? 0 : -ENOMEM; +} + +/* + * shut down the callback update process + */ +void afs_callback_update_kill(void) +{ + destroy_workqueue(afs_callback_update_worker); +} diff --git a/fs/afs/cell.c b/fs/afs/cell.c new file mode 100644 index 00000000..3c090b75 --- /dev/null +++ b/fs/afs/cell.c @@ -0,0 +1,460 @@ +/* AFS cell and server record management + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +DECLARE_RWSEM(afs_proc_cells_sem); +LIST_HEAD(afs_proc_cells); + +static LIST_HEAD(afs_cells); +static DEFINE_RWLOCK(afs_cells_lock); +static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */ +static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq); +static struct afs_cell *afs_cell_root; + +/* + * allocate a cell record and fill in its name, VL server address list and + * allocate an anonymous key + */ +static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen, + char *vllist) +{ + struct afs_cell *cell; + struct key *key; + char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; + char *dvllist = NULL, *_vllist = NULL; + char delimiter = ':'; + int ret; + + _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist); + + BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ + + if (namelen > AFS_MAXCELLNAME) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + /* allocate and initialise a cell record */ + cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); + if (!cell) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + memcpy(cell->name, name, namelen); + cell->name[namelen] = 0; + + atomic_set(&cell->usage, 1); + INIT_LIST_HEAD(&cell->link); + rwlock_init(&cell->servers_lock); + INIT_LIST_HEAD(&cell->servers); + init_rwsem(&cell->vl_sem); + INIT_LIST_HEAD(&cell->vl_list); + spin_lock_init(&cell->vl_lock); + + /* if the ip address is invalid, try dns query */ + if (!vllist || strlen(vllist) < 7) { + ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); + if (ret < 0) { + if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY) + /* translate these errors into something + * userspace might understand */ + ret = -EDESTADDRREQ; + _leave(" = %d", ret); + return ERR_PTR(ret); + } + _vllist = dvllist; + + /* change the delimiter for user-space reply */ + delimiter = ','; + + } else { + _vllist = vllist; + } + + /* fill in the VL server list from the rest of the string */ + do { + unsigned a, b, c, d; + + next = strchr(_vllist, delimiter); + if (next) + *next++ = 0; + + if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) + goto bad_address; + + if (a > 255 || b > 255 || c > 255 || d > 255) + goto bad_address; + + cell->vl_addrs[cell->vl_naddrs++].s_addr = + htonl((a << 24) | (b << 16) | (c << 8) | d); + + } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); + + /* create a key to represent an anonymous user */ + memcpy(keyname, "afs@", 4); + dp = keyname + 4; + cp = cell->name; + do { + *dp++ = toupper(*cp); + } while (*cp++); + + key = rxrpc_get_null_key(keyname); + if (IS_ERR(key)) { + 
_debug("no key"); + ret = PTR_ERR(key); + goto error; + } + cell->anonymous_key = key; + + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + + _leave(" = %p", cell); + return cell; + +bad_address: + printk(KERN_ERR "kAFS: bad VL server IP address\n"); + ret = -EINVAL; +error: + key_put(cell->anonymous_key); + kfree(dvllist); + kfree(cell); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * afs_cell_crate() - create a cell record + * @name: is the name of the cell. + * @namsesz: is the strlen of the cell name. + * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format. + * @retref: is T to return the cell reference when the cell exists. + */ +struct afs_cell *afs_cell_create(const char *name, unsigned namesz, + char *vllist, bool retref) +{ + struct afs_cell *cell; + int ret; + + _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist); + + down_write(&afs_cells_sem); + read_lock(&afs_cells_lock); + list_for_each_entry(cell, &afs_cells, link) { + if (strncasecmp(cell->name, name, namesz) == 0) + goto duplicate_name; + } + read_unlock(&afs_cells_lock); + + cell = afs_cell_alloc(name, namesz, vllist); + if (IS_ERR(cell)) { + _leave(" = %ld", PTR_ERR(cell)); + up_write(&afs_cells_sem); + return cell; + } + + /* add a proc directory for this cell */ + ret = afs_proc_cell_setup(cell); + if (ret < 0) + goto error; + +#ifdef CONFIG_AFS_FSCACHE + /* put it up for caching (this never returns an error) */ + cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell); +#endif + + /* add to the cell lists */ + write_lock(&afs_cells_lock); + list_add_tail(&cell->link, &afs_cells); + write_unlock(&afs_cells_lock); + + down_write(&afs_proc_cells_sem); + list_add_tail(&cell->proc_link, &afs_proc_cells); + up_write(&afs_proc_cells_sem); + up_write(&afs_cells_sem); + + _leave(" = %p", cell); + return cell; + +error: + up_write(&afs_cells_sem); + key_put(cell->anonymous_key); + kfree(cell); + _leave(" = %d", ret); + return ERR_PTR(ret); + +duplicate_name: + if (retref && !IS_ERR(cell)) + afs_get_cell(cell); + + read_unlock(&afs_cells_lock); + up_write(&afs_cells_sem); + + if (retref) { + _leave(" = %p", cell); + return cell; + } + + _leave(" = -EEXIST"); + return ERR_PTR(-EEXIST); +} + +/* + * set the root cell information + * - can be called with a module parameter string + * - can be called from a write to /proc/fs/afs/rootcell + */ +int afs_cell_init(char *rootcell) +{ + struct afs_cell *old_root, *new_root; + char *cp; + + _enter(""); + + if (!rootcell) { + /* module is loaded with no parameters, or built statically. + * - in the future we might initialize cell DB here. 
+ */ + _leave(" = 0 [no root]"); + return 0; + } + + cp = strchr(rootcell, ':'); + if (!cp) + _debug("kAFS: no VL server IP addresses specified"); + else + *cp++ = 0; + + /* allocate a cell record for the root cell */ + new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false); + if (IS_ERR(new_root)) { + _leave(" = %ld", PTR_ERR(new_root)); + return PTR_ERR(new_root); + } + + /* install the new cell */ + write_lock(&afs_cells_lock); + old_root = afs_cell_root; + afs_cell_root = new_root; + write_unlock(&afs_cells_lock); + afs_put_cell(old_root); + + _leave(" = 0"); + return 0; +} + +/* + * lookup a cell record + */ +struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz, + bool dns_cell) +{ + struct afs_cell *cell; + + _enter("\"%*.*s\",", namesz, namesz, name ?: ""); + + down_read(&afs_cells_sem); + read_lock(&afs_cells_lock); + + if (name) { + /* if the cell was named, look for it in the cell record list */ + list_for_each_entry(cell, &afs_cells, link) { + if (strncmp(cell->name, name, namesz) == 0) { + afs_get_cell(cell); + goto found; + } + } + cell = ERR_PTR(-ENOENT); + if (dns_cell) + goto create_cell; + found: + ; + } else { + cell = afs_cell_root; + if (!cell) { + /* this should not happen unless user tries to mount + * when root cell is not set. Return an impossibly + * bizarre errno to alert the user. Things like + * ENOENT might be "more appropriate" but they happen + * for other reasons. + */ + cell = ERR_PTR(-EDESTADDRREQ); + } else { + afs_get_cell(cell); + } + + } + + read_unlock(&afs_cells_lock); + up_read(&afs_cells_sem); + _leave(" = %p", cell); + return cell; + +create_cell: + read_unlock(&afs_cells_lock); + up_read(&afs_cells_sem); + + cell = afs_cell_create(name, namesz, NULL, true); + + _leave(" = %p", cell); + return cell; +} + +#if 0 +/* + * try and get a cell record + */ +struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell) +{ + write_lock(&afs_cells_lock); + + if (cell && !list_empty(&cell->link)) + afs_get_cell(cell); + else + cell = NULL; + + write_unlock(&afs_cells_lock); + return cell; +} +#endif /* 0 */ + +/* + * destroy a cell record + */ +void afs_put_cell(struct afs_cell *cell) +{ + if (!cell) + return; + + _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + + ASSERTCMP(atomic_read(&cell->usage), >, 0); + + /* to prevent a race, the decrement and the dequeue must be effectively + * atomic */ + write_lock(&afs_cells_lock); + + if (likely(!atomic_dec_and_test(&cell->usage))) { + write_unlock(&afs_cells_lock); + _leave(""); + return; + } + + ASSERT(list_empty(&cell->servers)); + ASSERT(list_empty(&cell->vl_list)); + + write_unlock(&afs_cells_lock); + + wake_up(&afs_cells_freeable_wq); + + _leave(" [unused]"); +} + +/* + * destroy a cell record + * - must be called with the afs_cells_sem write-locked + * - cell->link should have been broken by the caller + */ +static void afs_cell_destroy(struct afs_cell *cell) +{ + _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + + ASSERTCMP(atomic_read(&cell->usage), >=, 0); + ASSERT(list_empty(&cell->link)); + + /* wait for everyone to stop using the cell */ + if (atomic_read(&cell->usage) > 0) { + DECLARE_WAITQUEUE(myself, current); + + _debug("wait for cell %s", cell->name); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&afs_cells_freeable_wq, &myself); + + while (atomic_read(&cell->usage) > 0) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&afs_cells_freeable_wq, &myself); + set_current_state(TASK_RUNNING); + 
} + + _debug("cell dead"); + ASSERTCMP(atomic_read(&cell->usage), ==, 0); + ASSERT(list_empty(&cell->servers)); + ASSERT(list_empty(&cell->vl_list)); + + afs_proc_cell_remove(cell); + + down_write(&afs_proc_cells_sem); + list_del_init(&cell->proc_link); + up_write(&afs_proc_cells_sem); + +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(cell->cache, 0); +#endif + key_put(cell->anonymous_key); + kfree(cell); + + _leave(" [destroyed]"); +} + +/* + * purge in-memory cell database on module unload or afs_init() failure + * - the timeout daemon is stopped before calling this + */ +void afs_cell_purge(void) +{ + struct afs_cell *cell; + + _enter(""); + + afs_put_cell(afs_cell_root); + + down_write(&afs_cells_sem); + + while (!list_empty(&afs_cells)) { + cell = NULL; + + /* remove the next cell from the front of the list */ + write_lock(&afs_cells_lock); + + if (!list_empty(&afs_cells)) { + cell = list_entry(afs_cells.next, + struct afs_cell, link); + list_del_init(&cell->link); + } + + write_unlock(&afs_cells_lock); + + if (cell) { + _debug("PURGING CELL %s (%d)", + cell->name, atomic_read(&cell->usage)); + + /* now the cell should be left with no references */ + afs_cell_destroy(cell); + } + } + + up_write(&afs_cells_sem); + _leave(""); +} diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c new file mode 100644 index 00000000..1c8c6cc6 --- /dev/null +++ b/fs/afs/cmservice.c @@ -0,0 +1,585 @@ +/* AFS Cache Manager Service + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "internal.h" +#include "afs_cm.h" + +#if 0 +struct workqueue_struct *afs_cm_workqueue; +#endif /* 0 */ + +static int afs_deliver_cb_init_call_back_state(struct afs_call *, + struct sk_buff *, bool); +static int afs_deliver_cb_init_call_back_state3(struct afs_call *, + struct sk_buff *, bool); +static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, + struct sk_buff *, bool); +static void afs_cm_destructor(struct afs_call *); + +/* + * CB.CallBack operation type + */ +static const struct afs_call_type afs_SRXCBCallBack = { + .name = "CB.CallBack", + .deliver = afs_deliver_cb_callback, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.InitCallBackState operation type + */ +static const struct afs_call_type afs_SRXCBInitCallBackState = { + .name = "CB.InitCallBackState", + .deliver = afs_deliver_cb_init_call_back_state, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.InitCallBackState3 operation type + */ +static const struct afs_call_type afs_SRXCBInitCallBackState3 = { + .name = "CB.InitCallBackState3", + .deliver = afs_deliver_cb_init_call_back_state3, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.Probe operation type + */ +static const struct afs_call_type afs_SRXCBProbe = { + .name = "CB.Probe", + .deliver = afs_deliver_cb_probe, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.ProbeUuid operation type + */ +static const struct afs_call_type afs_SRXCBProbeUuid = { + .name = "CB.ProbeUuid", + .deliver = afs_deliver_cb_probe_uuid, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.TellMeAboutYourself operation type + */ +static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { + .name = "CB.TellMeAboutYourself", + .deliver = afs_deliver_cb_tell_me_about_yourself, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * route an incoming cache manager call + * - return T if supported, F if not + */ +bool afs_cm_incoming_call(struct afs_call *call) +{ + u32 operation_id = ntohl(call->operation_ID); + + _enter("{CB.OP %u}", operation_id); + + switch (operation_id) { + case CBCallBack: + call->type = &afs_SRXCBCallBack; + return true; + case CBInitCallBackState: + call->type = &afs_SRXCBInitCallBackState; + return true; + case CBInitCallBackState3: + call->type = &afs_SRXCBInitCallBackState3; + return true; + case CBProbe: + call->type = &afs_SRXCBProbe; + return true; + case CBTellMeAboutYourself: + call->type = &afs_SRXCBTellMeAboutYourself; + return true; + default: + return false; + } +} + +/* + * clean up a cache manager call + */ +static void afs_cm_destructor(struct afs_call *call) +{ + _enter(""); + + afs_put_server(call->server); + call->server = NULL; + kfree(call->buffer); + call->buffer = NULL; +} + +/* + * allow the fileserver to see if the cache manager is still alive + */ +static void SRXAFSCB_CallBack(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + + _enter(""); + + /* be sure to send the reply *before* attempting to spam the AFS server + * with FSFetchStatus 
requests on the vnodes with broken callbacks lest + * the AFS server get into a vicious cycle of trying to break further + * callbacks because it hadn't received completion of the CBCallBack op + * yet */ + afs_send_empty_reply(call); + + afs_break_callbacks(call->server, call->count, call->request); + _leave(""); +} + +/* + * deliver request data to a CB.CallBack call + */ +static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + struct afs_callback *cb; + struct afs_server *server; + struct in_addr addr; + __be32 *bp; + u32 tmp; + int ret, loop; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* extract the FID array and its count in two steps */ + case 1: + _debug("extract FID count"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("FID count: %u", call->count); + if (call->count > AFSCBMAX) + return -EBADMSG; + + call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->offset = 0; + call->unmarshall++; + + case 2: + _debug("extract FID array"); + ret = afs_extract_data(call, skb, last, call->buffer, + call->count * 3 * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall FID array"); + call->request = kcalloc(call->count, + sizeof(struct afs_callback), + GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + cb = call->request; + bp = call->buffer; + for (loop = call->count; loop > 0; loop--, cb++) { + cb->fid.vid = ntohl(*bp++); + cb->fid.vnode = ntohl(*bp++); + cb->fid.unique = ntohl(*bp++); + cb->type = AFSCM_CB_UNTYPED; + } + + call->offset = 0; + call->unmarshall++; + + /* extract the callback array and its count in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != call->count && tmp != 0) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + if (tmp == 0) + goto empty_cb_array; + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, skb, last, call->request, + call->count * 3 * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall CB array"); + cb = call->request; + bp = call->buffer; + for (loop = call->count; loop > 0; loop--, cb++) { + cb->version = ntohl(*bp++); + cb->expiry = ntohl(*bp++); + cb->type = ntohl(*bp++); + } + + empty_cb_array: + call->offset = 0; + call->unmarshall++; + + case 5: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + call->state = AFS_CALL_REPLYING; + + /* we'll need the file server record as that tells us which set of + * vnodes to operate upon */ + memcpy(&addr, &ip_hdr(skb)->saddr, 4); + server = afs_find_server(&addr); + if (!server) + return -ENOTCONN; + call->server = server; + + INIT_WORK(&call->work, SRXAFSCB_CallBack); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to request callback state (re-)initialisation + */ +static void SRXAFSCB_InitCallBackState(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + + 
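/*
 * Illustrative sketch (userspace, not part of the kAFS code):
 * afs_deliver_cb_callback() above unmarshals its request in stages, recording
 * its position in call->unmarshall so it can resume when more socket data
 * arrives.  The stage that converts the wire-format FID array (three
 * big-endian 32-bit words per entry) into host-order values boils down to the
 * loop below; the struct and function names are invented for illustration.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct fid { uint32_t vid, vnode, unique; };

void unmarshal_fids(struct fid *out, const uint32_t *wire, unsigned count)
{
	for (unsigned i = 0; i < count; i++) {
		out[i].vid    = ntohl(*wire++);
		out[i].vnode  = ntohl(*wire++);
		out[i].unique = ntohl(*wire++);
	}
}

int main(void)
{
	/* one FID - vid=1, vnode=2, unique=3 - as it would appear on the wire */
	uint32_t wire[3] = { htonl(1), htonl(2), htonl(3) };
	struct fid f;

	unmarshal_fids(&f, wire, 1);
	printf("vid=%u vnode=%u unique=%u\n", f.vid, f.vnode, f.unique);
	return 0;
}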
_enter("{%p}", call->server); + + afs_init_callback_state(call->server); + afs_send_empty_reply(call); + _leave(""); +} + +/* + * deliver request data to a CB.InitCallBackState call + */ +static int afs_deliver_cb_init_call_back_state(struct afs_call *call, + struct sk_buff *skb, + bool last) +{ + struct afs_server *server; + struct in_addr addr; + + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + /* we'll need the file server record as that tells us which set of + * vnodes to operate upon */ + memcpy(&addr, &ip_hdr(skb)->saddr, 4); + server = afs_find_server(&addr); + if (!server) + return -ENOTCONN; + call->server = server; + + INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * deliver request data to a CB.InitCallBackState3 call + */ +static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, + struct sk_buff *skb, + bool last) +{ + struct afs_server *server; + struct in_addr addr; + + _enter(",{%u},%d", skb->len, last); + + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + /* we'll need the file server record as that tells us which set of + * vnodes to operate upon */ + memcpy(&addr, &ip_hdr(skb)->saddr, 4); + server = afs_find_server(&addr); + if (!server) + return -ENOTCONN; + call->server = server; + + INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to see if the cache manager is still alive + */ +static void SRXAFSCB_Probe(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + + _enter(""); + afs_send_empty_reply(call); + _leave(""); +} + +/* + * deliver request data to a CB.Probe call + */ +static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_Probe); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to quickly find out if the fileserver has been rebooted + */ +static void SRXAFSCB_ProbeUuid(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + struct afs_uuid *r = call->request; + + struct { + __be32 match; + } reply; + + _enter(""); + + + if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) + reply.match = htonl(0); + else + reply.match = htonl(1); + + afs_send_simple_reply(call, &reply, sizeof(reply)); + _leave(""); +} + +/* + * deliver request data to a CB.ProbeUuid call + */ +static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; + + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, skb, last, call->buffer, + 11 * sizeof(__be32)); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = 
kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); + queue_work(afs_wq, &call->work); + return 0; +} + +/* + * allow the fileserver to ask about the cache manager's capabilities + */ +static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) +{ + struct afs_interface *ifs; + struct afs_call *call = container_of(work, struct afs_call, work); + int loop, nifs; + + struct { + struct /* InterfaceAddr */ { + __be32 nifs; + __be32 uuid[11]; + __be32 ifaddr[32]; + __be32 netmask[32]; + __be32 mtu[32]; + } ia; + struct /* Capabilities */ { + __be32 capcount; + __be32 caps[1]; + } cap; + } reply; + + _enter(""); + + nifs = 0; + ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL); + if (ifs) { + nifs = afs_get_ipv4_interfaces(ifs, 32, false); + if (nifs < 0) { + kfree(ifs); + ifs = NULL; + nifs = 0; + } + } + + memset(&reply, 0, sizeof(reply)); + reply.ia.nifs = htonl(nifs); + + reply.ia.uuid[0] = htonl(afs_uuid.time_low); + reply.ia.uuid[1] = htonl(afs_uuid.time_mid); + reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version); + reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved); + reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low); + for (loop = 0; loop < 6; loop++) + reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]); + + if (ifs) { + for (loop = 0; loop < nifs; loop++) { + reply.ia.ifaddr[loop] = ifs[loop].address.s_addr; + reply.ia.netmask[loop] = ifs[loop].netmask.s_addr; + reply.ia.mtu[loop] = htonl(ifs[loop].mtu); + } + kfree(ifs); + } + + reply.cap.capcount = htonl(1); + reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION); + afs_send_simple_reply(call, &reply, sizeof(reply)); + + _leave(""); +} + +/* + * deliver request data to a CB.TellMeAboutYourself call + */ +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + /* no unmarshalling required */ + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); + queue_work(afs_wq, &call->work); + return 0; +} diff --git a/fs/afs/dir.c b/fs/afs/dir.c new file mode 100644 index 00000000..1b0b1955 --- /dev/null +++ b/fs/afs/dir.c @@ -0,0 +1,1184 @@ +/* dir.c: AFS filesystem directory handling + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd); +static int afs_dir_open(struct inode *inode, struct file *file); +static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); +static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); +static int afs_d_delete(const struct dentry *dentry); +static void afs_d_release(struct dentry *dentry); +static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, + loff_t fpos, u64 ino, unsigned dtype); +static int afs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd); +static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int afs_rmdir(struct inode *dir, struct dentry *dentry); +static int afs_unlink(struct inode *dir, struct dentry *dentry); +static int afs_link(struct dentry *from, struct inode *dir, + struct dentry *dentry); +static int afs_symlink(struct inode *dir, struct dentry *dentry, + const char *content); +static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); + +const struct file_operations afs_dir_file_operations = { + .open = afs_dir_open, + .release = afs_release, + .readdir = afs_readdir, + .lock = afs_lock, + .llseek = generic_file_llseek, +}; + +const struct inode_operations afs_dir_inode_operations = { + .create = afs_create, + .lookup = afs_lookup, + .link = afs_link, + .unlink = afs_unlink, + .symlink = afs_symlink, + .mkdir = afs_mkdir, + .rmdir = afs_rmdir, + .rename = afs_rename, + .permission = afs_permission, + .getattr = afs_getattr, + .setattr = afs_setattr, +}; + +const struct dentry_operations afs_fs_dentry_operations = { + .d_revalidate = afs_d_revalidate, + .d_delete = afs_d_delete, + .d_release = afs_d_release, + .d_automount = afs_d_automount, +}; + +#define AFS_DIR_HASHTBL_SIZE 128 +#define AFS_DIR_DIRENT_SIZE 32 +#define AFS_DIRENT_PER_BLOCK 64 + +union afs_dirent { + struct { + uint8_t valid; + uint8_t unused[1]; + __be16 hash_next; + __be32 vnode; + __be32 unique; + uint8_t name[16]; + uint8_t overflow[4]; /* if any char of the name (inc + * NUL) reaches here, consume + * the next dirent too */ + } u; + uint8_t extended_name[32]; +}; + +/* AFS directory page header (one at the beginning of every 2048-byte chunk) */ +struct afs_dir_pagehdr { + __be16 npages; + __be16 magic; +#define AFS_DIR_MAGIC htons(1234) + uint8_t nentries; + uint8_t bitmap[8]; + uint8_t pad[19]; +}; + +/* directory block layout */ +union afs_dir_block { + + struct afs_dir_pagehdr pagehdr; + + struct { + struct afs_dir_pagehdr pagehdr; + uint8_t alloc_ctrs[128]; + /* dir hash table */ + uint16_t hashtable[AFS_DIR_HASHTBL_SIZE]; + } hdr; + + union afs_dirent dirents[AFS_DIRENT_PER_BLOCK]; +}; + +/* layout on a linux VM page */ +struct afs_dir_page { + union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)]; +}; + +struct afs_lookup_cookie { + struct afs_fid fid; + const char *name; + size_t nlen; + int found; +}; + +/* + * check that a directory page is valid + */ +static inline void afs_dir_check_page(struct inode *dir, struct page *page) +{ + struct afs_dir_page *dbuf; + loff_t latter; + int tmp, qty; + +#if 0 + /* check the page count */ + qty = desc.size / sizeof(dbuf->blocks[0]); + if (qty == 0) + goto error; + + if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) { + 
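/*
 * Illustrative sketch (userspace, not part of the kAFS code):
 * afs_dir_check_page() validates each 2048-byte directory block held in a
 * page by checking its big-endian magic number before the page is trusted
 * (the loop just below).  Stripped of the kernel page handling, the core of
 * that validation is roughly the following; the struct shape and names are
 * simplified for illustration.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DIR_BLOCK_SIZE	2048
#define DIR_MAGIC	1234		/* stored big-endian on disk */

struct dir_blockhdr {
	uint16_t npages;
	uint16_t magic;
	/* the real header also carries an entry count and allocation bitmap */
};

/* returns 0 if every complete block in the buffer carries the right magic */
int check_dir_buffer(const unsigned char *buf, size_t len)
{
	for (size_t off = 0; off + DIR_BLOCK_SIZE <= len; off += DIR_BLOCK_SIZE) {
		struct dir_blockhdr hdr;

		memcpy(&hdr, buf + off, sizeof(hdr));
		if (ntohs(hdr.magic) != DIR_MAGIC) {
			fprintf(stderr, "bad magic in block %zu\n",
				off / DIR_BLOCK_SIZE);
			return -1;
		}
	}
	return 0;
}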
printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", + __func__, dir->i_ino, qty, + ntohs(dbuf->blocks[0].pagehdr.npages)); + goto error; + } +#endif + + /* determine how many magic numbers there should be in this page */ + latter = dir->i_size - page_offset(page); + if (latter >= PAGE_SIZE) + qty = PAGE_SIZE; + else + qty = latter; + qty /= sizeof(union afs_dir_block); + + /* check them */ + dbuf = page_address(page); + for (tmp = 0; tmp < qty; tmp++) { + if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { + printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n", + __func__, dir->i_ino, tmp, qty, + ntohs(dbuf->blocks[tmp].pagehdr.magic)); + goto error; + } + } + + SetPageChecked(page); + return; + +error: + SetPageChecked(page); + SetPageError(page); +} + +/* + * discard a page cached in the pagecache + */ +static inline void afs_dir_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* + * get a page into the pagecache + */ +static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, + struct key *key) +{ + struct page *page; + _enter("{%lu},%lu", dir->i_ino, index); + + page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + afs_dir_check_page(dir, page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + afs_dir_put_page(page); + _leave(" = -EIO"); + return ERR_PTR(-EIO); +} + +/* + * open an AFS directory file + */ +static int afs_dir_open(struct inode *inode, struct file *file) +{ + _enter("{%lu}", inode->i_ino); + + BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) + return -ENOENT; + + return afs_open(inode, file); +} + +/* + * deal with one block in an AFS directory + */ +static int afs_dir_iterate_block(unsigned *fpos, + union afs_dir_block *block, + unsigned blkoff, + void *cookie, + filldir_t filldir) +{ + union afs_dirent *dire; + unsigned offset, next, curr; + size_t nlen; + int tmp, ret; + + _enter("%u,%x,%p,,",*fpos,blkoff,block); + + curr = (*fpos - blkoff) / sizeof(union afs_dirent); + + /* walk through the block, an entry at a time */ + for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; + offset < AFS_DIRENT_PER_BLOCK; + offset = next + ) { + next = offset + 1; + + /* skip entries marked unused in the bitmap */ + if (!(block->pagehdr.bitmap[offset / 8] & + (1 << (offset % 8)))) { + _debug("ENT[%Zu.%u]: unused", + blkoff / sizeof(union afs_dir_block), offset); + if (offset >= curr) + *fpos = blkoff + + next * sizeof(union afs_dirent); + continue; + } + + /* got a valid entry */ + dire = &block->dirents[offset]; + nlen = strnlen(dire->u.name, + sizeof(*block) - + offset * sizeof(union afs_dirent)); + + _debug("ENT[%Zu.%u]: %s %Zu \"%s\"", + blkoff / sizeof(union afs_dir_block), offset, + (offset < curr ? 
"skip" : "fill"), + nlen, dire->u.name); + + /* work out where the next possible entry is */ + for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) { + if (next >= AFS_DIRENT_PER_BLOCK) { + _debug("ENT[%Zu.%u]:" + " %u travelled beyond end dir block" + " (len %u/%Zu)", + blkoff / sizeof(union afs_dir_block), + offset, next, tmp, nlen); + return -EIO; + } + if (!(block->pagehdr.bitmap[next / 8] & + (1 << (next % 8)))) { + _debug("ENT[%Zu.%u]:" + " %u unmarked extension (len %u/%Zu)", + blkoff / sizeof(union afs_dir_block), + offset, next, tmp, nlen); + return -EIO; + } + + _debug("ENT[%Zu.%u]: ext %u/%Zu", + blkoff / sizeof(union afs_dir_block), + next, tmp, nlen); + next++; + } + + /* skip if starts before the current position */ + if (offset < curr) + continue; + + /* found the next entry */ + ret = filldir(cookie, + dire->u.name, + nlen, + blkoff + offset * sizeof(union afs_dirent), + ntohl(dire->u.vnode), + filldir == afs_lookup_filldir ? + ntohl(dire->u.unique) : DT_UNKNOWN); + if (ret < 0) { + _leave(" = 0 [full]"); + return 0; + } + + *fpos = blkoff + next * sizeof(union afs_dirent); + } + + _leave(" = 1 [more]"); + return 1; +} + +/* + * iterate through the data blob that lists the contents of an AFS directory + */ +static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, + filldir_t filldir, struct key *key) +{ + union afs_dir_block *dblock; + struct afs_dir_page *dbuf; + struct page *page; + unsigned blkoff, limit; + int ret; + + _enter("{%lu},%u,,", dir->i_ino, *fpos); + + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { + _leave(" = -ESTALE"); + return -ESTALE; + } + + /* round the file position up to the next entry boundary */ + *fpos += sizeof(union afs_dirent) - 1; + *fpos &= ~(sizeof(union afs_dirent) - 1); + + /* walk through the blocks in sequence */ + ret = 0; + while (*fpos < dir->i_size) { + blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1); + + /* fetch the appropriate page from the directory */ + page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + break; + } + + limit = blkoff & ~(PAGE_SIZE - 1); + + dbuf = page_address(page); + + /* deal with the individual blocks stashed on this page */ + do { + dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / + sizeof(union afs_dir_block)]; + ret = afs_dir_iterate_block(fpos, dblock, blkoff, + cookie, filldir); + if (ret != 1) { + afs_dir_put_page(page); + goto out; + } + + blkoff += sizeof(union afs_dir_block); + + } while (*fpos < dir->i_size && blkoff < limit); + + afs_dir_put_page(page); + ret = 0; + } + +out: + _leave(" = %d", ret); + return ret; +} + +/* + * read an AFS directory + */ +static int afs_readdir(struct file *file, void *cookie, filldir_t filldir) +{ + unsigned fpos; + int ret; + + _enter("{%Ld,{%lu}}", + file->f_pos, file->f_path.dentry->d_inode->i_ino); + + ASSERT(file->private_data != NULL); + + fpos = file->f_pos; + ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, + cookie, filldir, file->private_data); + file->f_pos = fpos; + + _leave(" = %d", ret); + return ret; +} + +/* + * search the directory for a name + * - if afs_dir_iterate_block() spots this function, it'll pass the FID + * uniquifier through dtype + */ +static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, + loff_t fpos, u64 ino, unsigned dtype) +{ + struct afs_lookup_cookie *cookie = _cookie; + + _enter("{%s,%Zu},%s,%u,,%llu,%u", + cookie->name, cookie->nlen, name, nlen, + (unsigned long long) ino, dtype); + + /* insanity checks first */ 
+ BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + + if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) { + _leave(" = 0 [no]"); + return 0; + } + + cookie->fid.vnode = ino; + cookie->fid.unique = dtype; + cookie->found = 1; + + _leave(" = -1 [found]"); + return -1; +} + +/* + * do a lookup in a directory + * - just returns the FID the dentry name maps to if found + */ +static int afs_do_lookup(struct inode *dir, struct dentry *dentry, + struct afs_fid *fid, struct key *key) +{ + struct afs_lookup_cookie cookie; + struct afs_super_info *as; + unsigned fpos; + int ret; + + _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name); + + as = dir->i_sb->s_fs_info; + + /* search the directory */ + cookie.name = dentry->d_name.name; + cookie.nlen = dentry->d_name.len; + cookie.fid.vid = as->volume->vid; + cookie.found = 0; + + fpos = 0; + ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir, + key); + if (ret < 0) { + _leave(" = %d [iter]", ret); + return ret; + } + + ret = -ENOENT; + if (!cookie.found) { + _leave(" = -ENOENT [not found]"); + return -ENOENT; + } + + *fid = cookie.fid; + _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique); + return 0; +} + +/* + * Try to auto mount the mountpoint with pseudo directory, if the autocell + * operation is setted. + */ +static struct inode *afs_try_auto_mntpt( + int ret, struct dentry *dentry, struct inode *dir, struct key *key, + struct afs_fid *fid) +{ + const char *devname = dentry->d_name.name; + struct afs_vnode *vnode = AFS_FS_I(dir); + struct inode *inode; + + _enter("%d, %p{%s}, {%x:%u}, %p", + ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key); + + if (ret != -ENOENT || + !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) + goto out; + + inode = afs_iget_autocell(dir, devname, strlen(devname), key); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + + *fid = AFS_FS_I(inode)->fid; + _leave("= %p", inode); + return inode; + +out: + _leave("= %d", ret); + return ERR_PTR(ret); +} + +/* + * look up an entry in a directory + */ +static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct afs_vnode *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + vnode = AFS_FS_I(dir); + + _enter("{%x:%u},%p{%s},", + vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name); + + ASSERTCMP(dentry->d_inode, ==, NULL); + + if (dentry->d_name.len >= AFSNAMEMAX) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _leave(" = -ESTALE"); + return ERR_PTR(-ESTALE); + } + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return ERR_CAST(key); + } + + ret = afs_validate(vnode, key); + if (ret < 0) { + key_put(key); + _leave(" = %d [val]", ret); + return ERR_PTR(ret); + } + + ret = afs_do_lookup(dir, dentry, &fid, key); + if (ret < 0) { + inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid); + if (!IS_ERR(inode)) { + key_put(key); + goto success; + } + + ret = PTR_ERR(inode); + key_put(key); + if (ret == -ENOENT) { + d_add(dentry, NULL); + _leave(" = NULL [negative]"); + return NULL; + } + _leave(" = %d [do]", ret); + return ERR_PTR(ret); + } + dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version; + + /* instantiate the dentry */ + inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL); + key_put(key); + if (IS_ERR(inode)) { + _leave(" 
= %ld", PTR_ERR(inode)); + return ERR_CAST(inode); + } + +success: + d_add(dentry, inode); + _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", + fid.vnode, + fid.unique, + dentry->d_inode->i_ino, + dentry->d_inode->i_generation); + + return NULL; +} + +/* + * check that a dentry lookup hit has found a valid entry + * - NOTE! the hit can be a negative hit too, so we can't assume we have an + * inode + */ +static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct afs_vnode *vnode, *dir; + struct afs_fid uninitialized_var(fid); + struct dentry *parent; + struct key *key; + void *dir_version; + int ret; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + vnode = AFS_FS_I(dentry->d_inode); + + if (dentry->d_inode) + _enter("{v={%x:%u} n=%s fl=%lx},", + vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + vnode->flags); + else + _enter("{neg n=%s}", dentry->d_name.name); + + key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell); + if (IS_ERR(key)) + key = NULL; + + /* lock down the parent dentry so we can peer at it */ + parent = dget_parent(dentry); + if (!parent->d_inode) + goto out_bad; + + dir = AFS_FS_I(parent->d_inode); + + /* validate the parent directory */ + if (test_bit(AFS_VNODE_MODIFIED, &dir->flags)) + afs_validate(dir, key); + + if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { + _debug("%s: parent dir deleted", dentry->d_name.name); + goto out_bad; + } + + dir_version = (void *) (unsigned long) dir->status.data_version; + if (dentry->d_fsdata == dir_version) + goto out_valid; /* the dir contents are unchanged */ + + _debug("dir modified"); + + /* search the directory for this vnode */ + ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key); + switch (ret) { + case 0: + /* the filename maps to something */ + if (!dentry->d_inode) + goto out_bad; + if (is_bad_inode(dentry->d_inode)) { + printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n", + parent->d_name.name, dentry->d_name.name); + goto out_bad; + } + + /* if the vnode ID has changed, then the dirent points to a + * different file */ + if (fid.vnode != vnode->fid.vnode) { + _debug("%s: dirent changed [%u != %u]", + dentry->d_name.name, fid.vnode, + vnode->fid.vnode); + goto not_found; + } + + /* if the vnode ID uniqifier has changed, then the file has + * been deleted and replaced, and the original vnode ID has + * been reused */ + if (fid.unique != vnode->fid.unique) { + _debug("%s: file deleted (uq %u -> %u I:%u)", + dentry->d_name.name, fid.unique, + vnode->fid.unique, + dentry->d_inode->i_generation); + spin_lock(&vnode->lock); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + spin_unlock(&vnode->lock); + goto not_found; + } + goto out_valid; + + case -ENOENT: + /* the filename is unknown */ + _debug("%s: dirent not found", dentry->d_name.name); + if (dentry->d_inode) + goto not_found; + goto out_valid; + + default: + _debug("failed to iterate dir %s: %d", + parent->d_name.name, ret); + goto out_bad; + } + +out_valid: + dentry->d_fsdata = dir_version; +out_skip: + dput(parent); + key_put(key); + _leave(" = 1 [valid]"); + return 1; + + /* the dirent, if it exists, now points to a different vnode */ +not_found: + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_NFSFS_RENAMED; + spin_unlock(&dentry->d_lock); + +out_bad: + if (dentry->d_inode) { + /* don't unhash if we have submounts */ + if (have_submounts(dentry)) + goto out_skip; + } + + _debug("dropping dentry %s/%s", + parent->d_name.name, dentry->d_name.name); + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(parent); + 
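/*
 * Illustrative sketch (userspace, not part of the kAFS code): the
 * revalidation logic here avoids re-searching the directory when nothing has
 * changed - each dentry remembers, in d_fsdata, the directory data version it
 * was looked up under, and the cached name is only re-checked once the
 * directory's version has moved on.  The idea in miniature (hypothetical
 * names):
 */
#include <stdint.h>
#include <stdio.h>

struct dir_state   { uint64_t data_version; };
struct cached_name { uint64_t seen_version; const char *name; };

/* returns 1 if the cached entry can be trusted without a fresh lookup */
int name_still_valid(const struct cached_name *c, const struct dir_state *dir)
{
	return c->seen_version == dir->data_version;
}

int main(void)
{
	struct dir_state dir = { .data_version = 7 };
	struct cached_name entry = { .seen_version = 7, .name = "example" };

	printf("valid before change: %d\n", name_still_valid(&entry, &dir));
	dir.data_version++;		/* directory modified on the server */
	printf("valid after change:  %d\n", name_still_valid(&entry, &dir));
	return 0;
}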
key_put(key); + + _leave(" = 0 [bad]"); + return 0; +} + +/* + * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't + * sleep) + * - called from dput() when d_count is going to 0. + * - return 1 to request dentry be unhashed, 0 otherwise + */ +static int afs_d_delete(const struct dentry *dentry) +{ + _enter("%s", dentry->d_name.name); + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto zap; + + if (dentry->d_inode && + (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) || + test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags))) + goto zap; + + _leave(" = 0 [keep]"); + return 0; + +zap: + _leave(" = 1 [zap]"); + return 1; +} + +/* + * handle dentry release + */ +static void afs_d_release(struct dentry *dentry) +{ + _enter("%s", dentry->d_name.name); +} + +/* + * create a directory on an AFS filesystem + */ +static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct afs_file_status status; + struct afs_callback cb; + struct afs_server *server; + struct afs_vnode *dvnode, *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s},%o", + dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + mode |= S_IFDIR; + ret = afs_vnode_create(dvnode, key, dentry->d_name.name, + mode, &fid, &status, &cb, &server); + if (ret < 0) + goto mkdir_error; + + inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); + if (IS_ERR(inode)) { + /* ENOMEM at a really inconvenient time - just abandon the new + * directory on the server */ + ret = PTR_ERR(inode); + goto iget_error; + } + + /* apply the status report we've got for the new vnode */ + vnode = AFS_FS_I(inode); + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) { + _debug("not hashed"); + d_rehash(dentry); + } + key_put(key); + _leave(" = 0"); + return 0; + +iget_error: + afs_put_server(server); +mkdir_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * remove a directory from an AFS filesystem + */ +static int afs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s}", + dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true); + if (ret < 0) + goto rmdir_error; + + if (dentry->d_inode) { + vnode = AFS_FS_I(dentry->d_inode); + clear_nlink(&vnode->vfs_inode); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + afs_discard_callback_on_delete(vnode); + } + + key_put(key); + _leave(" = 0"); + return 0; + +rmdir_error: + key_put(key); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * remove a file from an AFS filesystem + */ +static int afs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s}", + dvnode->fid.vid, 
dvnode->fid.vnode, dentry->d_name.name); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + if (dentry->d_inode) { + vnode = AFS_FS_I(dentry->d_inode); + + /* make sure we have a callback promise on the victim */ + ret = afs_validate(vnode, key); + if (ret < 0) + goto error; + } + + ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false); + if (ret < 0) + goto remove_error; + + if (dentry->d_inode) { + /* if the file wasn't deleted due to excess hard links, the + * fileserver will break the callback promise on the file - if + * it had one - before it returns to us, and if it was deleted, + * it won't + * + * however, if we didn't have a callback promise outstanding, + * or it was outstanding on a different server, then it won't + * break it either... + */ + vnode = AFS_FS_I(dentry->d_inode); + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + _debug("AFS_VNODE_DELETED"); + if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) + _debug("AFS_VNODE_CB_BROKEN"); + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + ret = afs_validate(vnode, key); + _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); + } + + key_put(key); + _leave(" = 0"); + return 0; + +remove_error: + key_put(key); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * create a regular file on an AFS filesystem + */ +static int afs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct afs_file_status status; + struct afs_callback cb; + struct afs_server *server; + struct afs_vnode *dvnode, *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s},%o,", + dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + mode |= S_IFREG; + ret = afs_vnode_create(dvnode, key, dentry->d_name.name, + mode, &fid, &status, &cb, &server); + if (ret < 0) + goto create_error; + + inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); + if (IS_ERR(inode)) { + /* ENOMEM at a really inconvenient time - just abandon the new + * directory on the server */ + ret = PTR_ERR(inode); + goto iget_error; + } + + /* apply the status report we've got for the new vnode */ + vnode = AFS_FS_I(inode); + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) { + _debug("not hashed"); + d_rehash(dentry); + } + key_put(key); + _leave(" = 0"); + return 0; + +iget_error: + afs_put_server(server); +create_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * create a hard link between files in an AFS filesystem + */ +static int afs_link(struct dentry *from, struct inode *dir, + struct dentry *dentry) +{ + struct afs_vnode *dvnode, *vnode; + struct key *key; + int ret; + + vnode = AFS_FS_I(from->d_inode); + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%x:%u},{%s}", + vnode->fid.vid, vnode->fid.vnode, + dvnode->fid.vid, dvnode->fid.vnode, + dentry->d_name.name); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) 
{ + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name); + if (ret < 0) + goto link_error; + + ihold(&vnode->vfs_inode); + d_instantiate(dentry, &vnode->vfs_inode); + key_put(key); + _leave(" = 0"); + return 0; + +link_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * create a symlink in an AFS filesystem + */ +static int afs_symlink(struct inode *dir, struct dentry *dentry, + const char *content) +{ + struct afs_file_status status; + struct afs_server *server; + struct afs_vnode *dvnode, *vnode; + struct afs_fid fid; + struct inode *inode; + struct key *key; + int ret; + + dvnode = AFS_FS_I(dir); + + _enter("{%x:%u},{%s},%s", + dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, + content); + + ret = -ENAMETOOLONG; + if (dentry->d_name.len >= AFSNAMEMAX) + goto error; + + ret = -EINVAL; + if (strlen(content) >= AFSPATHMAX) + goto error; + + key = afs_request_key(dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content, + &fid, &status, &server); + if (ret < 0) + goto create_error; + + inode = afs_iget(dir->i_sb, key, &fid, &status, NULL); + if (IS_ERR(inode)) { + /* ENOMEM at a really inconvenient time - just abandon the new + * directory on the server */ + ret = PTR_ERR(inode); + goto iget_error; + } + + /* apply the status report we've got for the new vnode */ + vnode = AFS_FS_I(inode); + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) { + _debug("not hashed"); + d_rehash(dentry); + } + key_put(key); + _leave(" = 0"); + return 0; + +iget_error: + afs_put_server(server); +create_error: + key_put(key); +error: + d_drop(dentry); + _leave(" = %d", ret); + return ret; +} + +/* + * rename a file in an AFS filesystem and/or move it between directories + */ +static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; + struct key *key; + int ret; + + vnode = AFS_FS_I(old_dentry->d_inode); + orig_dvnode = AFS_FS_I(old_dir); + new_dvnode = AFS_FS_I(new_dir); + + _enter("{%x:%u},{%x:%u},{%x:%u},{%s}", + orig_dvnode->fid.vid, orig_dvnode->fid.vnode, + vnode->fid.vid, vnode->fid.vnode, + new_dvnode->fid.vid, new_dvnode->fid.vnode, + new_dentry->d_name.name); + + ret = -ENAMETOOLONG; + if (new_dentry->d_name.len >= AFSNAMEMAX) + goto error; + + key = afs_request_key(orig_dvnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + + ret = afs_vnode_rename(orig_dvnode, new_dvnode, key, + old_dentry->d_name.name, + new_dentry->d_name.name); + if (ret < 0) + goto rename_error; + key_put(key); + _leave(" = 0"); + return 0; + +rename_error: + key_put(key); +error: + d_drop(new_dentry); + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/file.c b/fs/afs/file.c new file mode 100644 index 00000000..14d89fa5 --- /dev/null +++ b/fs/afs/file.c @@ -0,0 +1,378 @@ +/* AFS filesystem file handling + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int afs_readpage(struct file *file, struct page *page); +static void afs_invalidatepage(struct page *page, unsigned long offset); +static int afs_releasepage(struct page *page, gfp_t gfp_flags); +static int afs_launder_page(struct page *page); + +static int afs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + +const struct file_operations afs_file_operations = { + .open = afs_open, + .release = afs_release, + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = afs_file_write, + .mmap = generic_file_readonly_mmap, + .splice_read = generic_file_splice_read, + .fsync = afs_fsync, + .lock = afs_lock, + .flock = afs_flock, +}; + +const struct inode_operations afs_file_inode_operations = { + .getattr = afs_getattr, + .setattr = afs_setattr, + .permission = afs_permission, +}; + +const struct address_space_operations afs_fs_aops = { + .readpage = afs_readpage, + .readpages = afs_readpages, + .set_page_dirty = afs_set_page_dirty, + .launder_page = afs_launder_page, + .releasepage = afs_releasepage, + .invalidatepage = afs_invalidatepage, + .write_begin = afs_write_begin, + .write_end = afs_write_end, + .writepage = afs_writepage, + .writepages = afs_writepages, +}; + +/* + * open an AFS file or directory and attach a key to it + */ +int afs_open(struct inode *inode, struct file *file) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key; + int ret; + + _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return PTR_ERR(key); + } + + ret = afs_validate(vnode, key); + if (ret < 0) { + _leave(" = %d [val]", ret); + return ret; + } + + file->private_data = key; + _leave(" = 0"); + return 0; +} + +/* + * release an AFS file or directory and discard its key + */ +int afs_release(struct inode *inode, struct file *file) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + + _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + + key_put(file->private_data); + _leave(" = 0"); + return 0; +} + +#ifdef CONFIG_AFS_FSCACHE +/* + * deal with notification that a page was read from the cache + */ +static void afs_file_readpage_read_complete(struct page *page, + void *data, + int error) +{ + _enter("%p,%p,%d", page, data, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) + SetPageUptodate(page); + unlock_page(page); +} +#endif + +/* + * read page from file, directory or symlink, given a key to use + */ +int afs_page_filler(void *data, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key = data; + size_t len; + off_t offset; + int ret; + + _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); + + BUG_ON(!PageLocked(page)); + + ret = -ESTALE; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto error; + + /* is it cached? 
*/ +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_page(vnode->cache, + page, + afs_file_readpage_read_complete, + NULL, + GFP_KERNEL); +#else + ret = -ENOBUFS; +#endif + switch (ret) { + /* read BIO submitted (page in cache) */ + case 0: + break; + + /* page not yet cached */ + case -ENODATA: + _debug("cache said ENODATA"); + goto go_on; + + /* page will not be cached */ + case -ENOBUFS: + _debug("cache said ENOBUFS"); + default: + go_on: + offset = page->index << PAGE_CACHE_SHIFT; + len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE); + + /* read the contents of the file from the server into the + * page */ + ret = afs_vnode_fetch_data(vnode, key, offset, len, page); + if (ret < 0) { + if (ret == -ENOENT) { + _debug("got NOENT from server" + " - marking file deleted and stale"); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + +#ifdef CONFIG_AFS_FSCACHE + fscache_uncache_page(vnode->cache, page); +#endif + BUG_ON(PageFsCache(page)); + goto error; + } + + SetPageUptodate(page); + + /* send the page to the cache */ +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page) && + fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_uncache_page(vnode->cache, page); + BUG_ON(PageFsCache(page)); + } +#endif + unlock_page(page); + } + + _leave(" = 0"); + return 0; + +error: + SetPageError(page); + unlock_page(page); + _leave(" = %d", ret); + return ret; +} + +/* + * read page from file, directory or symlink, given a file to nominate the key + * to be used + */ +static int afs_readpage(struct file *file, struct page *page) +{ + struct key *key; + int ret; + + if (file) { + key = file->private_data; + ASSERT(key != NULL); + ret = afs_page_filler(key, page); + } else { + struct inode *inode = page->mapping->host; + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + } else { + ret = afs_page_filler(key, page); + key_put(key); + } + } + return ret; +} + +/* + * read a set of pages + */ +static int afs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct key *key = file->private_data; + struct afs_vnode *vnode; + int ret = 0; + + _enter("{%d},{%lu},,%d", + key_serial(key), mapping->host->i_ino, nr_pages); + + ASSERT(key != NULL); + + vnode = AFS_FS_I(mapping->host); + if (vnode->flags & AFS_VNODE_DELETED) { + _leave(" = -ESTALE"); + return -ESTALE; + } + + /* attempt to read as many of the pages as possible */ +#ifdef CONFIG_AFS_FSCACHE + ret = fscache_read_or_alloc_pages(vnode->cache, + mapping, + pages, + &nr_pages, + afs_file_readpage_read_complete, + NULL, + mapping_gfp_mask(mapping)); +#else + ret = -ENOBUFS; +#endif + + switch (ret) { + /* all pages are being read from the cache */ + case 0: + BUG_ON(!list_empty(pages)); + BUG_ON(nr_pages != 0); + _leave(" = 0 [reading all]"); + return 0; + + /* there were pages that couldn't be read from the cache */ + case -ENODATA: + case -ENOBUFS: + break; + + /* other error */ + default: + _leave(" = %d", ret); + return ret; + } + + /* load the missing pages from the network */ + ret = read_cache_pages(mapping, pages, afs_page_filler, key); + + _leave(" = %d [netting]", ret); + return ret; +} + +/* + * write back a dirty page + */ +static int afs_launder_page(struct page *page) +{ + _enter("{%lu}", page->index); + + return 0; +} + +/* + * invalidate part or all of a page + * - release a page and clean up its private data if offset is 0 (indicating + * the entire page) + */ +static void 
afs_invalidatepage(struct page *page, unsigned long offset) +{ + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + + _enter("{%lu},%lu", page->index, offset); + + BUG_ON(!PageLocked(page)); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0) { +#ifdef CONFIG_AFS_FSCACHE + if (PageFsCache(page)) { + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + fscache_wait_on_page_write(vnode->cache, page); + fscache_uncache_page(vnode->cache, page); + } +#endif + + if (PagePrivate(page)) { + if (wb && !PageWriteback(page)) { + set_page_private(page, 0); + afs_put_writeback(wb); + } + + if (!page_private(page)) + ClearPagePrivate(page); + } + } + + _leave(""); +} + +/* + * release a page and clean up its private state if it's not busy + * - return true if the page can now be released, false if not + */ +static int afs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + + _enter("{{%x:%u}[%lu],%lx},%x", + vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, + gfp_flags); + + /* deny if page is being written to the cache and the caller hasn't + * elected to wait */ +#ifdef CONFIG_AFS_FSCACHE + if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) { + _leave(" = F [cache busy]"); + return 0; + } +#endif + + if (PagePrivate(page)) { + if (wb) { + set_page_private(page, 0); + afs_put_writeback(wb); + } + ClearPagePrivate(page); + } + + /* indicate that the page can be released */ + _leave(" = T"); + return 1; +} diff --git a/fs/afs/flock.c b/fs/afs/flock.c new file mode 100644 index 00000000..757d6645 --- /dev/null +++ b/fs/afs/flock.c @@ -0,0 +1,588 @@ +/* AFS file locking support + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include "internal.h" + +#define AFS_LOCK_GRANTED 0 +#define AFS_LOCK_PENDING 1 + +static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl); +static void afs_fl_release_private(struct file_lock *fl); + +static struct workqueue_struct *afs_lock_manager; +static DEFINE_MUTEX(afs_lock_manager_mutex); + +static const struct file_lock_operations afs_lock_ops = { + .fl_copy_lock = afs_fl_copy_lock, + .fl_release_private = afs_fl_release_private, +}; + +/* + * initialise the lock manager thread if it isn't already running + */ +static int afs_init_lock_manager(void) +{ + int ret; + + ret = 0; + if (!afs_lock_manager) { + mutex_lock(&afs_lock_manager_mutex); + if (!afs_lock_manager) { + afs_lock_manager = + create_singlethread_workqueue("kafs_lockd"); + if (!afs_lock_manager) + ret = -ENOMEM; + } + mutex_unlock(&afs_lock_manager_mutex); + } + return ret; +} + +/* + * destroy the lock manager thread if it's running + */ +void __exit afs_kill_lock_manager(void) +{ + if (afs_lock_manager) + destroy_workqueue(afs_lock_manager); +} + +/* + * if the callback is broken on this vnode, then the lock may now be available + */ +void afs_lock_may_be_available(struct afs_vnode *vnode) +{ + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0); +} + +/* + * the lock will time out in 5 minutes unless we extend it, so schedule + * extension in a bit less than that time + */ +static void afs_schedule_lock_extension(struct afs_vnode *vnode) +{ + queue_delayed_work(afs_lock_manager, &vnode->lock_work, + AFS_LOCKWAIT * HZ / 2); +} + +/* + * grant one or more locks (readlocks are allowed to jump the queue if the + * first lock in the queue is itself a readlock) + * - the caller must hold the vnode lock + */ +static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl) +{ + struct file_lock *p, *_p; + + list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); + if (fl->fl_type == F_RDLCK) { + list_for_each_entry_safe(p, _p, &vnode->pending_locks, + fl_u.afs.link) { + if (p->fl_type == F_RDLCK) { + p->fl_u.afs.state = AFS_LOCK_GRANTED; + list_move_tail(&p->fl_u.afs.link, + &vnode->granted_locks); + wake_up(&p->fl_wait); + } + } + } +} + +/* + * do work for a lock, including: + * - probing for a lock we're waiting on but didn't get immediately + * - extending a lock that's close to timing out + */ +void afs_lock_work(struct work_struct *work) +{ + struct afs_vnode *vnode = + container_of(work, struct afs_vnode, lock_work.work); + struct file_lock *fl; + afs_lock_type_t type; + struct key *key; + int ret; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + spin_lock(&vnode->lock); + + if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) { + _debug("unlock"); + spin_unlock(&vnode->lock); + + /* attempt to release the server lock; if it fails, we just + * wait 5 minutes and it'll time out anyway */ + ret = afs_vnode_release_lock(vnode, vnode->unlock_key); + if (ret < 0) + printk(KERN_WARNING "AFS:" + " Failed to release lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + key_put(vnode->unlock_key); + vnode->unlock_key = NULL; + clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags); + } + + /* if we've got a lock, then it must be time to extend that lock as AFS + * locks time out after 5 minutes */ + if (!list_empty(&vnode->granted_locks)) { + _debug("extend"); + + if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) + BUG(); + fl = list_entry(vnode->granted_locks.next, + 
struct file_lock, fl_u.afs.link); + key = key_get(fl->fl_file->private_data); + spin_unlock(&vnode->lock); + + ret = afs_vnode_extend_lock(vnode, key); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + key_put(key); + switch (ret) { + case 0: + afs_schedule_lock_extension(vnode); + break; + default: + /* ummm... we failed to extend the lock - retry + * extension shortly */ + printk(KERN_WARNING "AFS:" + " Failed to extend lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + queue_delayed_work(afs_lock_manager, &vnode->lock_work, + HZ * 10); + break; + } + _leave(" [extend]"); + return; + } + + /* if we don't have a granted lock, then we must've been called back by + * the server, and so if might be possible to get a lock we're + * currently waiting for */ + if (!list_empty(&vnode->pending_locks)) { + _debug("get"); + + if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) + BUG(); + fl = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + key = key_get(fl->fl_file->private_data); + type = (fl->fl_type == F_RDLCK) ? + AFS_LOCK_READ : AFS_LOCK_WRITE; + spin_unlock(&vnode->lock); + + ret = afs_vnode_set_lock(vnode, key, type); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + switch (ret) { + case -EWOULDBLOCK: + _debug("blocked"); + break; + case 0: + _debug("acquired"); + if (type == AFS_LOCK_READ) + set_bit(AFS_VNODE_READLOCKED, &vnode->flags); + else + set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); + ret = AFS_LOCK_GRANTED; + default: + spin_lock(&vnode->lock); + /* the pending lock may have been withdrawn due to a + * signal */ + if (list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link) == fl) { + fl->fl_u.afs.state = ret; + if (ret == AFS_LOCK_GRANTED) + afs_grant_locks(vnode, fl); + else + list_del_init(&fl->fl_u.afs.link); + wake_up(&fl->fl_wait); + spin_unlock(&vnode->lock); + } else { + _debug("withdrawn"); + clear_bit(AFS_VNODE_READLOCKED, &vnode->flags); + clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); + spin_unlock(&vnode->lock); + afs_vnode_release_lock(vnode, key); + if (!list_empty(&vnode->pending_locks)) + afs_lock_may_be_available(vnode); + } + break; + } + key_put(key); + _leave(" [pend]"); + return; + } + + /* looks like the lock request was withdrawn on a signal */ + spin_unlock(&vnode->lock); + _leave(" [no locks]"); +} + +/* + * pass responsibility for the unlocking of a vnode on the server to the + * manager thread, lest a pending signal in the calling thread interrupt + * AF_RXRPC + * - the caller must hold the vnode lock + */ +static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) +{ + cancel_delayed_work(&vnode->lock_work); + if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) && + !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) + BUG(); + if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) + BUG(); + vnode->unlock_key = key_get(key); + afs_lock_may_be_available(vnode); +} + +/* + * request a lock on a file on the server + */ +static int afs_do_setlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + afs_lock_type_t type; + struct key *key = file->private_data; + int ret; + + _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + + /* only whole-file locks are supported */ + if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) + return -EINVAL; + + ret = afs_init_lock_manager(); + if (ret < 0) + return ret; + + fl->fl_ops = &afs_lock_ops; + INIT_LIST_HEAD(&fl->fl_u.afs.link); + 
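/*
 * Illustrative sketch (userspace, not part of the kAFS code): the setlk path
 * here queues the lock on vnode->pending_locks and, once the server grants
 * it, afs_grant_locks() above moves it to granted_locks, promoting any other
 * pending read locks at the same time because the server-side lock is shared.
 * A toy model of that promotion rule, using an array in place of the kernel
 * list_heads (names invented):
 */
#include <stdio.h>

enum lock_type { LOCK_READ, LOCK_WRITE };

struct pending { enum lock_type type; int granted; };

/* grant the lock at the head of the queue; if it is a read lock, every other
 * pending read lock can be granted along with it */
void grant_locks(struct pending *queue, int n)
{
	if (n == 0)
		return;
	queue[0].granted = 1;
	if (queue[0].type != LOCK_READ)
		return;
	for (int i = 1; i < n; i++)
		if (queue[i].type == LOCK_READ)
			queue[i].granted = 1;
}

int main(void)
{
	struct pending q[] = {
		{ LOCK_READ, 0 }, { LOCK_WRITE, 0 }, { LOCK_READ, 0 },
	};

	grant_locks(q, 3);
	for (int i = 0; i < 3; i++)
		printf("lock %d: %s\n", i, q[i].granted ? "granted" : "pending");
	return 0;
}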
fl->fl_u.afs.state = AFS_LOCK_PENDING; + + type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + + lock_flocks(); + + /* make sure we've got a callback on this file and that our view of the + * data version is up to date */ + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + + if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) { + ret = -EAGAIN; + goto error; + } + + spin_lock(&vnode->lock); + + /* if we've already got a readlock on the server then we can instantly + * grant another readlock, irrespective of whether there are any + * pending writelocks */ + if (type == AFS_LOCK_READ && + vnode->flags & (1 << AFS_VNODE_READLOCKED)) { + _debug("instant readlock"); + ASSERTCMP(vnode->flags & + ((1 << AFS_VNODE_LOCKING) | + (1 << AFS_VNODE_WRITELOCKED)), ==, 0); + ASSERT(!list_empty(&vnode->granted_locks)); + goto sharing_existing_lock; + } + + /* if there's no-one else with a lock on this vnode, then we need to + * ask the server for a lock */ + if (list_empty(&vnode->pending_locks) && + list_empty(&vnode->granted_locks)) { + _debug("not locked"); + ASSERTCMP(vnode->flags & + ((1 << AFS_VNODE_LOCKING) | + (1 << AFS_VNODE_READLOCKED) | + (1 << AFS_VNODE_WRITELOCKED)), ==, 0); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); + set_bit(AFS_VNODE_LOCKING, &vnode->flags); + spin_unlock(&vnode->lock); + + ret = afs_vnode_set_lock(vnode, key, type); + clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + switch (ret) { + case 0: + _debug("acquired"); + goto acquired_server_lock; + case -EWOULDBLOCK: + _debug("would block"); + spin_lock(&vnode->lock); + ASSERT(list_empty(&vnode->granted_locks)); + ASSERTCMP(vnode->pending_locks.next, ==, + &fl->fl_u.afs.link); + goto wait; + default: + spin_lock(&vnode->lock); + list_del_init(&fl->fl_u.afs.link); + spin_unlock(&vnode->lock); + goto error; + } + } + + /* otherwise, we need to wait for a local lock to become available */ + _debug("wait local"); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); +wait: + if (!(fl->fl_flags & FL_SLEEP)) { + _debug("noblock"); + ret = -EAGAIN; + goto abort_attempt; + } + spin_unlock(&vnode->lock); + + /* now we need to sleep and wait for the lock manager thread to get the + * lock from the server */ + _debug("sleep"); + ret = wait_event_interruptible(fl->fl_wait, + fl->fl_u.afs.state <= AFS_LOCK_GRANTED); + if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { + ret = fl->fl_u.afs.state; + if (ret < 0) + goto error; + spin_lock(&vnode->lock); + goto given_lock; + } + + /* we were interrupted, but someone may still be in the throes of + * giving us the lock */ + _debug("intr"); + ASSERTCMP(ret, ==, -ERESTARTSYS); + + spin_lock(&vnode->lock); + if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { + ret = fl->fl_u.afs.state; + if (ret < 0) { + spin_unlock(&vnode->lock); + goto error; + } + goto given_lock; + } + +abort_attempt: + /* we aren't going to get the lock, either because we're unwilling to + * wait, or because some signal happened */ + _debug("abort"); + if (list_empty(&vnode->granted_locks) && + vnode->pending_locks.next == &fl->fl_u.afs.link) { + if (vnode->pending_locks.prev != &fl->fl_u.afs.link) { + /* kick the next pending lock into having a go */ + list_del_init(&fl->fl_u.afs.link); + afs_lock_may_be_available(vnode); + } + } else { + list_del_init(&fl->fl_u.afs.link); + } + spin_unlock(&vnode->lock); + goto error; + +acquired_server_lock: + /* we've acquired a server lock, but it needs to be renewed after 5 + * mins */ + spin_lock(&vnode->lock); + 
afs_schedule_lock_extension(vnode); + if (type == AFS_LOCK_READ) + set_bit(AFS_VNODE_READLOCKED, &vnode->flags); + else + set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); +sharing_existing_lock: + /* the lock has been granted as far as we're concerned... */ + fl->fl_u.afs.state = AFS_LOCK_GRANTED; + list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); +given_lock: + /* ... but we do still need to get the VFS's blessing */ + ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING))); + ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) | + (1 << AFS_VNODE_WRITELOCKED))) != 0); + ret = posix_lock_file(file, fl, NULL); + if (ret < 0) + goto vfs_rejected_lock; + spin_unlock(&vnode->lock); + + /* again, make sure we've got a callback on this file and, again, make + * sure that our view of the data version is up to date (we ignore + * errors incurred here and deal with the consequences elsewhere) */ + afs_vnode_fetch_status(vnode, NULL, key); + +error: + unlock_flocks(); + _leave(" = %d", ret); + return ret; + +vfs_rejected_lock: + /* the VFS rejected the lock we just obtained, so we have to discard + * what we just got */ + _debug("vfs refused %d", ret); + list_del_init(&fl->fl_u.afs.link); + if (list_empty(&vnode->granted_locks)) + afs_defer_unlock(vnode, key); + goto abort_attempt; +} + +/* + * unlock on a file on the server + */ +static int afs_do_unlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct key *key = file->private_data; + int ret; + + _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + + /* only whole-file unlocks are supported */ + if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) + return -EINVAL; + + fl->fl_ops = &afs_lock_ops; + INIT_LIST_HEAD(&fl->fl_u.afs.link); + fl->fl_u.afs.state = AFS_LOCK_PENDING; + + spin_lock(&vnode->lock); + ret = posix_lock_file(file, fl, NULL); + if (ret < 0) { + spin_unlock(&vnode->lock); + _leave(" = %d [vfs]", ret); + return ret; + } + + /* discard the server lock only if all granted locks are gone */ + if (list_empty(&vnode->granted_locks)) + afs_defer_unlock(vnode, key); + spin_unlock(&vnode->lock); + _leave(" = 0"); + return 0; +} + +/* + * return information about a lock we currently hold, if indeed we hold one + */ +static int afs_do_getlk(struct file *file, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct key *key = file->private_data; + int ret, lock_count; + + _enter(""); + + fl->fl_type = F_UNLCK; + + mutex_lock(&vnode->vfs_inode.i_mutex); + + /* check local lock records first */ + ret = 0; + posix_test_lock(file, fl); + if (fl->fl_type == F_UNLCK) { + /* no local locks; consult the server */ + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + lock_count = vnode->status.lock_count; + if (lock_count) { + if (lock_count > 0) + fl->fl_type = F_RDLCK; + else + fl->fl_type = F_WRLCK; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + } + } + +error: + mutex_unlock(&vnode->vfs_inode.i_mutex); + _leave(" = %d [%hd]", ret, fl->fl_type); + return ret; +} + +/* + * manage POSIX locks on a file + */ +int afs_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + + _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", + vnode->fid.vid, vnode->fid.vnode, cmd, + fl->fl_type, fl->fl_flags, + (long long) fl->fl_start, (long long) fl->fl_end); + + /* AFS doesn't support mandatory locks */ + if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type 
!= F_UNLCK) + return -ENOLCK; + + if (IS_GETLK(cmd)) + return afs_do_getlk(file, fl); + if (fl->fl_type == F_UNLCK) + return afs_do_unlk(file, fl); + return afs_do_setlk(file, fl); +} + +/* + * manage FLOCK locks on a file + */ +int afs_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + + _enter("{%x:%u},%d,{t=%x,fl=%x}", + vnode->fid.vid, vnode->fid.vnode, cmd, + fl->fl_type, fl->fl_flags); + + /* + * No BSD flocks over NFS allowed. + * Note: we could try to fake a POSIX lock request here by + * using ((u32) filp | 0x80000000) or some such as the pid. + * Not sure whether that would be unique, though, or whether + * that would break in other places. + */ + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + /* we're simulating flock() locks using posix locks on the server */ + fl->fl_owner = (fl_owner_t) file; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + + if (fl->fl_type == F_UNLCK) + return afs_do_unlk(file, fl); + return afs_do_setlk(file, fl); +} + +/* + * the POSIX lock management core VFS code copies the lock record and adds the + * copy into its own list, so we need to add that copy to the vnode's lock + * queue in the same place as the original (which will be deleted shortly + * after) + */ +static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + _enter(""); + + list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link); +} + +/* + * need to remove this lock from the vnode queue when it's removed from the + * VFS's list + */ +static void afs_fl_release_private(struct file_lock *fl) +{ + _enter(""); + + list_del_init(&fl->fl_u.afs.link); +} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c new file mode 100644 index 00000000..346e3289 --- /dev/null +++ b/fs/afs/fsclient.c @@ -0,0 +1,1905 @@ +/* AFS File Server client stubs + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/circ_buf.h>
+#include "internal.h"
+#include "afs_fs.h"
+
+/*
+ * decode an AFSFid block
+ */
+static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
+{
+	const __be32 *bp = *_bp;
+
+	fid->vid = ntohl(*bp++);
+	fid->vnode = ntohl(*bp++);
+	fid->unique = ntohl(*bp++);
+	*_bp = bp;
+}
+
+/*
+ * decode an AFSFetchStatus block
+ */
+static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
+				      struct afs_file_status *status,
+				      struct afs_vnode *vnode,
+				      afs_dataversion_t *store_version)
+{
+	afs_dataversion_t expected_version;
+	const __be32 *bp = *_bp;
+	umode_t mode;
+	u64 data_version, size;
+	u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+
+#define EXTRACT(DST)			\
+	do {				\
+		u32 x = ntohl(*bp++);	\
+		changed |= DST - x;	\
+		DST = x;		\
+	} while (0)
+
+	status->if_version = ntohl(*bp++);
+	EXTRACT(status->type);
+	EXTRACT(status->nlink);
+	size = ntohl(*bp++);
+	data_version = ntohl(*bp++);
+	EXTRACT(status->author);
+	EXTRACT(status->owner);
+	EXTRACT(status->caller_access); /* call ticket dependent */
+	EXTRACT(status->anon_access);
+	EXTRACT(status->mode);
+	EXTRACT(status->parent.vnode);
+	EXTRACT(status->parent.unique);
+	bp++; /* seg size */
+	status->mtime_client = ntohl(*bp++);
+	status->mtime_server = ntohl(*bp++);
+	EXTRACT(status->group);
+	bp++; /* sync counter */
+	data_version |= (u64) ntohl(*bp++) << 32;
+	EXTRACT(status->lock_count);
+	size |= (u64) ntohl(*bp++) << 32;
+	bp++; /* spare 4 */
+	*_bp = bp;
+
+	if (size != status->size) {
+		status->size = size;
+		changed |= true;
+	}
+	status->mode &= S_IALLUGO;
+
+	_debug("vnode time %lx, %lx",
+	       status->mtime_client, status->mtime_server);
+
+	if (vnode) {
+		status->parent.vid = vnode->fid.vid;
+		if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+			_debug("vnode changed");
+			i_size_write(&vnode->vfs_inode, size);
+			vnode->vfs_inode.i_uid = status->owner;
+			vnode->vfs_inode.i_gid = status->group;
+			vnode->vfs_inode.i_generation = vnode->fid.unique;
+			vnode->vfs_inode.i_nlink = status->nlink;
+
+			mode = vnode->vfs_inode.i_mode;
+			mode &= ~S_IALLUGO;
+			mode |= status->mode;
+			barrier();
+			vnode->vfs_inode.i_mode = mode;
+		}
+
+		vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server;
+		vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime;
+		vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
+		vnode->vfs_inode.i_version = data_version;
+	}
+
+	expected_version = status->data_version;
+	if (store_version)
+		expected_version = *store_version;
+
+	if (expected_version != data_version) {
+		status->data_version = data_version;
+		if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+			_debug("vnode modified %llx on {%x:%u}",
+			       (unsigned long long) data_version,
+			       vnode->fid.vid, vnode->fid.vnode);
+			set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+		}
+	} else if (store_version) {
+		status->data_version = data_version;
+	}
+}
+
+/*
+ * decode an AFSCallBack block
+ */
+static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
+{
+	const __be32 *bp = *_bp;
+
+	vnode->cb_version = ntohl(*bp++);
+	vnode->cb_expiry = ntohl(*bp++);
+	vnode->cb_type = ntohl(*bp++);
+	vnode->cb_expires = vnode->cb_expiry + get_seconds();
+	*_bp = bp;
+}
+
+static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
+				       struct afs_callback *cb)
+{
+	const __be32 *bp = *_bp;
+
+	cb->version = ntohl(*bp++);
+	cb->expiry = ntohl(*bp++);
+	cb->type = ntohl(*bp++);
+	*_bp = bp;
+}
+
+/*
+ * decode an AFSVolSync block
+ */
+static void xdr_decode_AFSVolSync(const __be32 **_bp, + struct afs_volsync *volsync) +{ + const __be32 *bp = *_bp; + + volsync->creation = ntohl(*bp++); + bp++; /* spare2 */ + bp++; /* spare3 */ + bp++; /* spare4 */ + bp++; /* spare5 */ + bp++; /* spare6 */ + *_bp = bp; +} + +/* + * encode the requested attributes into an AFSStoreStatus block + */ +static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr) +{ + __be32 *bp = *_bp; + u32 mask = 0, mtime = 0, owner = 0, group = 0, mode = 0; + + mask = 0; + if (attr->ia_valid & ATTR_MTIME) { + mask |= AFS_SET_MTIME; + mtime = attr->ia_mtime.tv_sec; + } + + if (attr->ia_valid & ATTR_UID) { + mask |= AFS_SET_OWNER; + owner = attr->ia_uid; + } + + if (attr->ia_valid & ATTR_GID) { + mask |= AFS_SET_GROUP; + group = attr->ia_gid; + } + + if (attr->ia_valid & ATTR_MODE) { + mask |= AFS_SET_MODE; + mode = attr->ia_mode & S_IALLUGO; + } + + *bp++ = htonl(mask); + *bp++ = htonl(mtime); + *bp++ = htonl(owner); + *bp++ = htonl(group); + *bp++ = htonl(mode); + *bp++ = 0; /* segment size */ + *_bp = bp; +} + +/* + * decode an AFSFetchVolumeStatus block + */ +static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, + struct afs_volume_status *vs) +{ + const __be32 *bp = *_bp; + + vs->vid = ntohl(*bp++); + vs->parent_id = ntohl(*bp++); + vs->online = ntohl(*bp++); + vs->in_service = ntohl(*bp++); + vs->blessed = ntohl(*bp++); + vs->needs_salvage = ntohl(*bp++); + vs->type = ntohl(*bp++); + vs->min_quota = ntohl(*bp++); + vs->max_quota = ntohl(*bp++); + vs->blocks_in_use = ntohl(*bp++); + vs->part_blocks_avail = ntohl(*bp++); + vs->part_max_blocks = ntohl(*bp++); + *_bp = bp; +} + +/* + * deliver reply data to an FS.FetchStatus + */ +static int afs_deliver_fs_fetch_status(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSCallBack(&bp, vnode); + if (call->reply2) + xdr_decode_AFSVolSync(&bp, call->reply2); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchStatus operation type + */ +static const struct afs_call_type afs_RXFSFetchStatus = { + .name = "FS.FetchStatus", + .deliver = afs_deliver_fs_fetch_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * fetch the status information for a file + */ +int afs_fs_fetch_file_status(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + struct afs_volsync *volsync, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = volsync; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHSTATUS); + bp[1] = htonl(vnode->fid.vid); + bp[2] = htonl(vnode->fid.vnode); + bp[3] = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.FetchData + */ +static int afs_deliver_fs_fetch_data(struct afs_call *call, + 
struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + struct page *page; + void *buffer; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + if (call->operation_ID != FSFETCHDATA64) { + call->unmarshall++; + goto no_msw; + } + + /* extract the upper part of the returned data length of an + * FSFETCHDATA64 op (which should always be 0 using this + * client) */ + case 1: + _debug("extract data length (MSW)"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("DATA length MSW: %u", call->count); + if (call->count > 0) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + no_msw: + /* extract the returned data length */ + case 2: + _debug("extract data length"); + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("DATA length: %u", call->count); + if (call->count > PAGE_SIZE) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the returned data */ + case 3: + _debug("extract data"); + if (call->count > 0) { + page = call->reply3; + buffer = kmap_atomic(page, KM_USER0); + ret = afs_extract_data(call, skb, last, buffer, + call->count); + kunmap_atomic(buffer, KM_USER0); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + call->offset = 0; + call->unmarshall++; + + /* extract the metadata */ + case 4: + ret = afs_extract_data(call, skb, last, call->buffer, + (21 + 3 + 6) * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSCallBack(&bp, vnode); + if (call->reply2) + xdr_decode_AFSVolSync(&bp, call->reply2); + + call->offset = 0; + call->unmarshall++; + + case 5: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + if (call->count < PAGE_SIZE) { + _debug("clear"); + page = call->reply3; + buffer = kmap_atomic(page, KM_USER0); + memset(buffer + call->count, 0, PAGE_SIZE - call->count); + kunmap_atomic(buffer, KM_USER0); + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchData operation type + */ +static const struct afs_call_type afs_RXFSFetchData = { + .name = "FS.FetchData", + .deliver = afs_deliver_fs_fetch_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSFetchData64 = { + .name = "FS.FetchData64", + .deliver = afs_deliver_fs_fetch_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * fetch data from a very large file + */ +static int afs_fs_fetch_data64(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + off_t offset, size_t length, + struct page *buffer, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + ASSERTCMP(length, <, ULONG_MAX); + + call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = NULL; /* volsync */ + call->reply3 = buffer; + call->service_id = FS_SERVICE; + 
call->port = htons(AFS_FS_PORT); + call->operation_ID = FSFETCHDATA64; + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHDATA64); + bp[1] = htonl(vnode->fid.vid); + bp[2] = htonl(vnode->fid.vnode); + bp[3] = htonl(vnode->fid.unique); + bp[4] = htonl(upper_32_bits(offset)); + bp[5] = htonl((u32) offset); + bp[6] = 0; + bp[7] = htonl((u32) length); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * fetch data from a file + */ +int afs_fs_fetch_data(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + off_t offset, size_t length, + struct page *buffer, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + if (upper_32_bits(offset) || upper_32_bits(offset + length)) + return afs_fs_fetch_data64(server, key, vnode, offset, length, + buffer, wait_mode); + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = NULL; /* volsync */ + call->reply3 = buffer; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->operation_ID = FSFETCHDATA; + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHDATA); + bp[1] = htonl(vnode->fid.vid); + bp[2] = htonl(vnode->fid.vnode); + bp[3] = htonl(vnode->fid.unique); + bp[4] = htonl(offset); + bp[5] = htonl(length); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.GiveUpCallBacks + */ +static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + _enter(",{%u},%d", skb->len, last); + + if (skb->len > 0) + return -EBADMSG; /* shouldn't be any reply data */ + return 0; +} + +/* + * FS.GiveUpCallBacks operation type + */ +static const struct afs_call_type afs_RXFSGiveUpCallBacks = { + .name = "FS.GiveUpCallBacks", + .deliver = afs_deliver_fs_give_up_callbacks, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * give up a set of callbacks + * - the callbacks are held in the server->cb_break ring + */ +int afs_fs_give_up_callbacks(struct afs_server *server, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t ncallbacks; + __be32 *bp, *tp; + int loop; + + ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail, + ARRAY_SIZE(server->cb_break)); + + _enter("{%zu},", ncallbacks); + + if (ncallbacks == 0) + return 0; + if (ncallbacks > AFSCBMAX) + ncallbacks = AFSCBMAX; + + _debug("break %zu callbacks", ncallbacks); + + call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks, + 12 + ncallbacks * 6 * 4, 0); + if (!call) + return -ENOMEM; + + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + tp = bp + 2 + ncallbacks * 3; + *bp++ = htonl(FSGIVEUPCALLBACKS); + *bp++ = htonl(ncallbacks); + *tp++ = htonl(ncallbacks); + + atomic_sub(ncallbacks, &server->cb_break_n); + for (loop = ncallbacks; loop > 0; loop--) { + struct afs_callback *cb = + &server->cb_break[server->cb_break_tail]; + + *bp++ = htonl(cb->fid.vid); + *bp++ = htonl(cb->fid.vnode); + *bp++ = htonl(cb->fid.unique); + *tp++ = htonl(cb->version); + *tp++ = htonl(cb->expiry); + *tp++ = htonl(cb->type); + smp_mb(); + server->cb_break_tail = + (server->cb_break_tail + 1) & + (ARRAY_SIZE(server->cb_break) - 1); + } + + ASSERT(ncallbacks > 0); + wake_up_nr(&server->cb_break_waitq, ncallbacks); + + 
return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.CreateFile or an FS.MakeDir + */ +static int afs_deliver_fs_create_vnode(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFid(&bp, call->reply2); + xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSCallBack_raw(&bp, call->reply4); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.CreateFile and FS.MakeDir operation type + */ +static const struct afs_call_type afs_RXFSCreateXXXX = { + .name = "FS.CreateXXXX", + .deliver = afs_deliver_fs_create_vnode, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * create a file or make a directory + */ +int afs_fs_create(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const char *name, + umode_t mode, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz + (6 * 4); + + call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz, + (3 + 21 + 21 + 3 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->reply2 = newfid; + call->reply3 = newstatus; + call->reply4 = newcb; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(S_ISDIR(mode) ? 
FSMAKEDIR : FSCREATEFILE); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + *bp++ = htonl(AFS_SET_MODE); + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = htonl(mode & S_IALLUGO); /* unix mode */ + *bp++ = 0; /* segment size */ + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.RemoveFile or FS.RemoveDir + */ +static int afs_deliver_fs_remove(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.RemoveDir/FS.RemoveFile operation type + */ +static const struct afs_call_type afs_RXFSRemoveXXXX = { + .name = "FS.RemoveXXXX", + .deliver = afs_deliver_fs_remove, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * remove a file or directory + */ +int afs_fs_remove(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const char *name, + bool isdir, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz; + + call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(isdir ? 
FSREMOVEDIR : FSREMOVEFILE); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.Link + */ +static int afs_deliver_fs_link(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.Link operation type + */ +static const struct afs_call_type afs_RXFSLink = { + .name = "FS.Link", + .deliver = afs_deliver_fs_link, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * make a hard link + */ +int afs_fs_link(struct afs_server *server, + struct key *key, + struct afs_vnode *dvnode, + struct afs_vnode *vnode, + const char *name, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz + (3 * 4); + + call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = dvnode; + call->reply2 = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSLINK); + *bp++ = htonl(dvnode->fid.vid); + *bp++ = htonl(dvnode->fid.vnode); + *bp++ = htonl(dvnode->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.Symlink + */ +static int afs_deliver_fs_symlink(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFid(&bp, call->reply2); + xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.Symlink operation type + */ +static const struct afs_call_type afs_RXFSSymlink = { + .name = "FS.Symlink", + .deliver = afs_deliver_fs_symlink, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * create a symbolic link + */ +int afs_fs_symlink(struct afs_server *server, + struct key *key, + 
		   struct afs_vnode *vnode,
+		   const char *name,
+		   const char *contents,
+		   struct afs_fid *newfid,
+		   struct afs_file_status *newstatus,
+		   const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+
+	c_namesz = strlen(contents);
+	c_padsz = (4 - (c_namesz & 3)) & 3;
+
+	reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+				   (3 + 21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSYMLINK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(c_namesz);
+	memcpy(bp, contents, c_namesz);
+	bp = (void *) bp + c_namesz;
+	if (c_padsz > 0) {
+		memset(bp, 0, c_padsz);
+		bp = (void *) bp + c_padsz;
+	}
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(S_IRWXUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.Rename
+ */
+static int afs_deliver_fs_rename(struct afs_call *call,
+				 struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL);
+	if (new_dvnode != orig_dvnode)
+		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode,
+					  NULL);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Rename operation type
+ */
+static const struct afs_call_type afs_RXFSRename = {
+	.name		= "FS.Rename",
+	.deliver	= afs_deliver_fs_rename,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * rename a file or directory
+ */
+int afs_fs_rename(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *orig_dvnode,
+		  const char *orig_name,
+		  struct afs_vnode *new_dvnode,
+		  const char *new_name,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	o_namesz = strlen(orig_name);
+	o_padsz = (4 - (o_namesz & 3)) & 3;
+
+	n_namesz = strlen(new_name);
+	n_padsz = (4 - (n_namesz & 3)) & 3;
+
+	reqsz = (4 * 4) +
+		4 + o_namesz + o_padsz +
+		(3 * 4) +
+		4 + n_namesz + n_padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = orig_dvnode;
+	call->reply2 = new_dvnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRENAME);
+	*bp++ = htonl(orig_dvnode->fid.vid);
+	*bp++ =
htonl(orig_dvnode->fid.vnode); + *bp++ = htonl(orig_dvnode->fid.unique); + *bp++ = htonl(o_namesz); + memcpy(bp, orig_name, o_namesz); + bp = (void *) bp + o_namesz; + if (o_padsz > 0) { + memset(bp, 0, o_padsz); + bp = (void *) bp + o_padsz; + } + + *bp++ = htonl(new_dvnode->fid.vid); + *bp++ = htonl(new_dvnode->fid.vnode); + *bp++ = htonl(new_dvnode->fid.unique); + *bp++ = htonl(n_namesz); + memcpy(bp, new_name, n_namesz); + bp = (void *) bp + n_namesz; + if (n_padsz > 0) { + memset(bp, 0, n_padsz); + bp = (void *) bp + n_padsz; + } + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.StoreData + */ +static int afs_deliver_fs_store_data(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) { + _leave(" = 0 [more]"); + return 0; + } + + if (call->reply_size != call->reply_max) { + _leave(" = -EBADMSG [%u != %u]", + call->reply_size, call->reply_max); + return -EBADMSG; + } + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, + &call->store_version); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + afs_pages_written_back(vnode, call); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.StoreData operation type + */ +static const struct afs_call_type afs_RXFSStoreData = { + .name = "FS.StoreData", + .deliver = afs_deliver_fs_store_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSStoreData64 = { + .name = "FS.StoreData64", + .deliver = afs_deliver_fs_store_data, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * store a set of pages to a very large file + */ +static int afs_fs_store_data64(struct afs_server *server, + struct afs_writeback *wb, + pgoff_t first, pgoff_t last, + unsigned offset, unsigned to, + loff_t size, loff_t pos, loff_t i_size, + const struct afs_wait_mode *wait_mode) +{ + struct afs_vnode *vnode = wb->vnode; + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(&afs_RXFSStoreData64, + (4 + 6 + 3 * 2) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->wb = wb; + call->key = wb->key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->mapping = vnode->vfs_inode.i_mapping; + call->first = first; + call->last = last; + call->first_offset = offset; + call->last_to = to; + call->send_pages = true; + call->store_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA64); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + *bp++ = 0; /* mask */ + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = 0; /* unix mode */ + *bp++ = 0; /* segment size */ + + *bp++ = htonl(pos >> 32); + *bp++ = htonl((u32) pos); + *bp++ = htonl(size >> 32); + *bp++ = htonl((u32) size); + *bp++ = htonl(i_size >> 32); + *bp++ = htonl((u32) i_size); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * store a set of pages + */ +int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, + pgoff_t first, pgoff_t last, + unsigned offset, 
unsigned to, + const struct afs_wait_mode *wait_mode) +{ + struct afs_vnode *vnode = wb->vnode; + struct afs_call *call; + loff_t size, pos, i_size; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); + + size = to - offset; + if (first != last) + size += (loff_t)(last - first) << PAGE_SHIFT; + pos = (loff_t)first << PAGE_SHIFT; + pos += offset; + + i_size = i_size_read(&vnode->vfs_inode); + if (pos + size > i_size) + i_size = size + pos; + + _debug("size %llx, at %llx, i_size %llx", + (unsigned long long) size, (unsigned long long) pos, + (unsigned long long) i_size); + + if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32) + return afs_fs_store_data64(server, wb, first, last, offset, to, + size, pos, i_size, wait_mode); + + call = afs_alloc_flat_call(&afs_RXFSStoreData, + (4 + 6 + 3) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->wb = wb; + call->key = wb->key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->mapping = vnode->vfs_inode.i_mapping; + call->first = first; + call->last = last; + call->first_offset = offset; + call->last_to = to; + call->send_pages = true; + call->store_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + *bp++ = 0; /* mask */ + *bp++ = 0; /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = 0; /* unix mode */ + *bp++ = 0; /* segment size */ + + *bp++ = htonl(pos); + *bp++ = htonl(size); + *bp++ = htonl(i_size); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.StoreStatus + */ +static int afs_deliver_fs_store_status(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + afs_dataversion_t *store_version; + struct afs_vnode *vnode = call->reply; + const __be32 *bp; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) { + _leave(" = 0 [more]"); + return 0; + } + + if (call->reply_size != call->reply_max) { + _leave(" = -EBADMSG [%u != %u]", + call->reply_size, call->reply_max); + return -EBADMSG; + } + + /* unmarshall the reply once we've received all of it */ + store_version = NULL; + if (call->operation_ID == FSSTOREDATA) + store_version = &call->store_version; + + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version); + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.StoreStatus operation type + */ +static const struct afs_call_type afs_RXFSStoreStatus = { + .name = "FS.StoreStatus", + .deliver = afs_deliver_fs_store_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSStoreData_as_Status = { + .name = "FS.StoreData", + .deliver = afs_deliver_fs_store_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type afs_RXFSStoreData64_as_Status = { + .name = "FS.StoreData64", + .deliver = afs_deliver_fs_store_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * set the attributes on a very large file, using FS.StoreData rather than + * FS.StoreStatus so as to alter the file size also + */ +static int afs_fs_setattr_size64(struct afs_server *server, struct key 
*key, + struct afs_vnode *vnode, struct iattr *attr, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + ASSERT(attr->ia_valid & ATTR_SIZE); + + call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status, + (4 + 6 + 3 * 2) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->store_version = vnode->status.data_version + 1; + call->operation_ID = FSSTOREDATA; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA64); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + xdr_encode_AFS_StoreStatus(&bp, attr); + + *bp++ = 0; /* position of start of write */ + *bp++ = 0; + *bp++ = 0; /* size of write */ + *bp++ = 0; + *bp++ = htonl(attr->ia_size >> 32); /* new file length */ + *bp++ = htonl((u32) attr->ia_size); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus + * so as to alter the file size also + */ +static int afs_fs_setattr_size(struct afs_server *server, struct key *key, + struct afs_vnode *vnode, struct iattr *attr, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + ASSERT(attr->ia_valid & ATTR_SIZE); + if (attr->ia_size >> 32) + return afs_fs_setattr_size64(server, key, vnode, attr, + wait_mode); + + call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status, + (4 + 6 + 3) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->store_version = vnode->status.data_version + 1; + call->operation_ID = FSSTOREDATA; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTOREDATA); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + xdr_encode_AFS_StoreStatus(&bp, attr); + + *bp++ = 0; /* position of start of write */ + *bp++ = 0; /* size of write */ + *bp++ = htonl(attr->ia_size); /* new file length */ + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * set the attributes on a file, using FS.StoreData if there's a change in file + * size, and FS.StoreStatus otherwise + */ +int afs_fs_setattr(struct afs_server *server, struct key *key, + struct afs_vnode *vnode, struct iattr *attr, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + if (attr->ia_valid & ATTR_SIZE) + return afs_fs_setattr_size(server, key, vnode, attr, + wait_mode); + + _enter(",%x,{%x:%u},,", + key_serial(key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(&afs_RXFSStoreStatus, + (4 + 6) * 4, + (21 + 6) * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + call->operation_ID = FSSTORESTATUS; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSTORESTATUS); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + xdr_encode_AFS_StoreStatus(&bp, attr); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an 
FS.GetVolumeStatus + */ +static int afs_deliver_fs_get_volume_status(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + const __be32 *bp; + char *p; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* extract the returned status record */ + case 1: + _debug("extract status"); + ret = afs_extract_data(call, skb, last, call->buffer, + 12 * 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + bp = call->buffer; + xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2); + call->offset = 0; + call->unmarshall++; + + /* extract the volume name length */ + case 2: + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("volname length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the volume name */ + case 3: + _debug("extract volname"); + if (call->count > 0) { + ret = afs_extract_data(call, skb, last, call->reply3, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + p = call->reply3; + p[call->count] = 0; + _debug("volname '%s'", p); + + call->offset = 0; + call->unmarshall++; + + /* extract the volume name padding */ + if ((call->count & 3) == 0) { + call->unmarshall++; + goto no_volname_padding; + } + call->count = 4 - (call->count & 3); + + case 4: + ret = afs_extract_data(call, skb, last, call->buffer, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->offset = 0; + call->unmarshall++; + no_volname_padding: + + /* extract the offline message length */ + case 5: + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("offline msg length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the offline message */ + case 6: + _debug("extract offline"); + if (call->count > 0) { + ret = afs_extract_data(call, skb, last, call->reply3, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + p = call->reply3; + p[call->count] = 0; + _debug("offline '%s'", p); + + call->offset = 0; + call->unmarshall++; + + /* extract the offline message padding */ + if ((call->count & 3) == 0) { + call->unmarshall++; + goto no_offline_padding; + } + call->count = 4 - (call->count & 3); + + case 7: + ret = afs_extract_data(call, skb, last, call->buffer, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->offset = 0; + call->unmarshall++; + no_offline_padding: + + /* extract the message of the day length */ + case 8: + ret = afs_extract_data(call, skb, last, &call->tmp, 4); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->count = ntohl(call->tmp); + _debug("motd length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return -EBADMSG; + call->offset = 0; + call->unmarshall++; + + /* extract the message of the day */ + case 9: + _debug("extract motd"); + if (call->count > 0) { + ret = afs_extract_data(call, skb, last, call->reply3, + call->count); + switch 
(ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + } + + p = call->reply3; + p[call->count] = 0; + _debug("motd '%s'", p); + + call->offset = 0; + call->unmarshall++; + + /* extract the message of the day padding */ + if ((call->count & 3) == 0) { + call->unmarshall++; + goto no_motd_padding; + } + call->count = 4 - (call->count & 3); + + case 10: + ret = afs_extract_data(call, skb, last, call->buffer, + call->count); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + call->offset = 0; + call->unmarshall++; + no_motd_padding: + + case 11: + _debug("trailer %d", skb->len); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * destroy an FS.GetVolumeStatus call + */ +static void afs_get_volume_status_call_destructor(struct afs_call *call) +{ + kfree(call->reply3); + call->reply3 = NULL; + afs_flat_call_destructor(call); +} + +/* + * FS.GetVolumeStatus operation type + */ +static const struct afs_call_type afs_RXFSGetVolumeStatus = { + .name = "FS.GetVolumeStatus", + .deliver = afs_deliver_fs_get_volume_status, + .abort_to_error = afs_abort_to_error, + .destructor = afs_get_volume_status_call_destructor, +}; + +/* + * fetch the status of a volume + */ +int afs_fs_get_volume_status(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + struct afs_volume_status *vs, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + void *tmpbuf; + + _enter(""); + + tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + + call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4); + if (!call) { + kfree(tmpbuf); + return -ENOMEM; + } + + call->key = key; + call->reply = vnode; + call->reply2 = vs; + call->reply3 = tmpbuf; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSGETVOLUMESTATUS); + bp[1] = htonl(vnode->fid.vid); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock + */ +static int afs_deliver_fs_xxxx_lock(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + const __be32 *bp; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.SetLock operation type + */ +static const struct afs_call_type afs_RXFSSetLock = { + .name = "FS.SetLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * FS.ExtendLock operation type + */ +static const struct afs_call_type afs_RXFSExtendLock = { + .name = "FS.ExtendLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * FS.ReleaseLock operation type + */ +static const struct afs_call_type afs_RXFSReleaseLock = { + .name = "FS.ReleaseLock", + .deliver = afs_deliver_fs_xxxx_lock, + .abort_to_error = afs_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * get a lock on a file + */ +int afs_fs_set_lock(struct afs_server *server, + struct key *key, + struct 
afs_vnode *vnode, + afs_lock_type_t type, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSSETLOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(type); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * extend a lock on a file + */ +int afs_fs_extend_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSEXTENDLOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} + +/* + * release a lock on a file + */ +int afs_fs_release_lock(struct afs_server *server, + struct key *key, + struct afs_vnode *vnode, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = vnode; + call->service_id = FS_SERVICE; + call->port = htons(AFS_FS_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSRELEASELOCK); + *bp++ = htonl(vnode->fid.vid); + *bp++ = htonl(vnode->fid.vnode); + *bp++ = htonl(vnode->fid.unique); + + return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); +} diff --git a/fs/afs/inode.c b/fs/afs/inode.c new file mode 100644 index 00000000..0fdab6e0 --- /dev/null +++ b/fs/afs/inode.c @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2002 Red Hat, Inc. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Authors: David Woodhouse + * David Howells + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +struct afs_iget_data { + struct afs_fid fid; + struct afs_volume *volume; /* volume on which resides */ +}; + +/* + * map the AFS file status to the inode member variables + */ +static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) +{ + struct inode *inode = AFS_VNODE_TO_I(vnode); + + _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", + vnode->status.type, + vnode->status.nlink, + (unsigned long long) vnode->status.size, + vnode->status.data_version, + vnode->status.mode); + + switch (vnode->status.type) { + case AFS_FTYPE_FILE: + inode->i_mode = S_IFREG | vnode->status.mode; + inode->i_op = &afs_file_inode_operations; + inode->i_fop = &afs_file_operations; + break; + case AFS_FTYPE_DIR: + inode->i_mode = S_IFDIR | vnode->status.mode; + inode->i_op = &afs_dir_inode_operations; + inode->i_fop = &afs_dir_file_operations; + break; + case AFS_FTYPE_SYMLINK: + inode->i_mode = S_IFLNK | vnode->status.mode; + inode->i_op = &page_symlink_inode_operations; + break; + default: + printk("kAFS: AFS vnode with undefined type\n"); + return -EBADMSG; + } + +#ifdef CONFIG_AFS_FSCACHE + if (vnode->status.size != inode->i_size) + fscache_attr_changed(vnode->cache); +#endif + + inode->i_nlink = vnode->status.nlink; + inode->i_uid = vnode->status.owner; + inode->i_gid = 0; + inode->i_size = vnode->status.size; + inode->i_ctime.tv_sec = vnode->status.mtime_server; + inode->i_ctime.tv_nsec = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime; + inode->i_blocks = 0; + inode->i_generation = vnode->fid.unique; + inode->i_version = vnode->status.data_version; + inode->i_mapping->a_ops = &afs_fs_aops; + + /* check to see whether a symbolic link is really a mountpoint */ + if (vnode->status.type == AFS_FTYPE_SYMLINK) { + afs_mntpt_check_symlink(vnode, key); + + if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) { + inode->i_mode = S_IFDIR | vnode->status.mode; + inode->i_op = &afs_mntpt_inode_operations; + inode->i_fop = &afs_mntpt_file_operations; + } + } + + return 0; +} + +/* + * iget5() comparator + */ +static int afs_iget5_test(struct inode *inode, void *opaque) +{ + struct afs_iget_data *data = opaque; + + return inode->i_ino == data->fid.vnode && + inode->i_generation == data->fid.unique; +} + +/* + * iget5() comparator for inode created by autocell operations + * + * These pseudo inodes don't match anything. 
+ */ +static int afs_iget5_autocell_test(struct inode *inode, void *opaque) +{ + return 0; +} + +/* + * iget5() inode initialiser + */ +static int afs_iget5_set(struct inode *inode, void *opaque) +{ + struct afs_iget_data *data = opaque; + struct afs_vnode *vnode = AFS_FS_I(inode); + + inode->i_ino = data->fid.vnode; + inode->i_generation = data->fid.unique; + vnode->fid = data->fid; + vnode->volume = data->volume; + + return 0; +} + +/* + * inode retrieval for autocell + */ +struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, + int namesz, struct key *key) +{ + struct afs_iget_data data; + struct afs_super_info *as; + struct afs_vnode *vnode; + struct super_block *sb; + struct inode *inode; + static atomic_t afs_autocell_ino; + + _enter("{%x:%u},%*.*s,", + AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode, + namesz, namesz, dev_name ?: ""); + + sb = dir->i_sb; + as = sb->s_fs_info; + data.volume = as->volume; + data.fid.vid = as->volume->vid; + data.fid.unique = 0; + data.fid.vnode = 0; + + inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino), + afs_iget5_autocell_test, afs_iget5_set, + &data); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }", + inode, inode->i_ino, data.fid.vid, data.fid.vnode, + data.fid.unique); + + vnode = AFS_FS_I(inode); + + /* there shouldn't be an existing inode */ + BUG_ON(!(inode->i_state & I_NEW)); + + inode->i_size = 0; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_op = &afs_autocell_inode_operations; + inode->i_nlink = 2; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_ctime.tv_sec = get_seconds(); + inode->i_ctime.tv_nsec = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime; + inode->i_blocks = 0; + inode->i_version = 0; + inode->i_generation = 0; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; + unlock_new_inode(inode); + _leave(" = %p", inode); + return inode; +} + +/* + * inode retrieval + */ +struct inode *afs_iget(struct super_block *sb, struct key *key, + struct afs_fid *fid, struct afs_file_status *status, + struct afs_callback *cb) +{ + struct afs_iget_data data = { .fid = *fid }; + struct afs_super_info *as; + struct afs_vnode *vnode; + struct inode *inode; + int ret; + + _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique); + + as = sb->s_fs_info; + data.volume = as->volume; + + inode = iget5_locked(sb, fid->vnode, afs_iget5_test, afs_iget5_set, + &data); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + _debug("GOT INODE %p { vl=%x vn=%x, u=%x }", + inode, fid->vid, fid->vnode, fid->unique); + + vnode = AFS_FS_I(inode); + + /* deal with an existing inode */ + if (!(inode->i_state & I_NEW)) { + _leave(" = %p", inode); + return inode; + } + + if (!status) { + /* it's a remotely extant inode */ + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto bad_inode; + } else { + /* it's an inode we just created */ + memcpy(&vnode->status, status, sizeof(vnode->status)); + + if (!cb) { + /* it's a symlink we just created (the fileserver + * didn't give us a callback) */ + vnode->cb_version = 0; + vnode->cb_expiry = 0; + vnode->cb_type = 0; + vnode->cb_expires = get_seconds(); + } else { + vnode->cb_version = cb->version; + vnode->cb_expiry = cb->expiry; + vnode->cb_type = cb->type; + vnode->cb_expires = vnode->cb_expiry + get_seconds(); + } + } + + 
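+ /* Note: cb_expiry as returned by the fileserver is a duration in
+ * seconds, so adding get_seconds() above converts it into an absolute
+ * expiry time; when no callback was returned, cb_expires is set to
+ * "now" so the promise counts as already expired and afs_validate()
+ * will refetch the status on next use.
+ */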
/* set up caching before mapping the status, as map-status reads the + * first page of symlinks to see if they're really mountpoints */ + inode->i_size = vnode->status.size; +#ifdef CONFIG_AFS_FSCACHE + vnode->cache = fscache_acquire_cookie(vnode->volume->cache, + &afs_vnode_cache_index_def, + vnode); +#endif + + ret = afs_inode_map_status(vnode, key); + if (ret < 0) + goto bad_inode; + + /* success */ + clear_bit(AFS_VNODE_UNSET, &vnode->flags); + inode->i_flags |= S_NOATIME; + unlock_new_inode(inode); + _leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type); + return inode; + + /* failure */ +bad_inode: +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); + vnode->cache = NULL; +#endif + iget_failed(inode); + _leave(" = %d [bad]", ret); + return ERR_PTR(ret); +} + +/* + * mark the data attached to an inode as obsolete due to a write on the server + * - might also want to ditch all the outstanding writes and dirty pages + */ +void afs_zap_data(struct afs_vnode *vnode) +{ + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + /* nuke all the non-dirty pages that aren't locked, mapped or being + * written back in a regular file and completely discard the pages in a + * directory or symlink */ + if (S_ISREG(vnode->vfs_inode.i_mode)) + invalidate_remote_inode(&vnode->vfs_inode); + else + invalidate_inode_pages2(vnode->vfs_inode.i_mapping); +} + +/* + * validate a vnode/inode + * - there are several things we need to check + * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, + * symlink) + * - parent dir metadata changed (security changes) + * - dentry data changed (write, truncate) + * - dentry metadata changed (security changes) + */ +int afs_validate(struct afs_vnode *vnode, struct key *key) +{ + int ret; + + _enter("{v={%x:%u} fl=%lx},%x", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, + key_serial(key)); + + if (vnode->cb_promised && + !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && + !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) && + !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { + if (vnode->cb_expires < get_seconds() + 10) { + _debug("callback expired"); + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + } else { + goto valid; + } + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + goto valid; + + mutex_lock(&vnode->validate_lock); + + /* if the promise has expired, we need to check the server again to get + * a new promise - note that if the (parent) directory's metadata was + * changed then the security may be different and we may no longer have + * access */ + if (!vnode->cb_promised || + test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { + _debug("not promised"); + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error_unlock; + _debug("new promise [fl=%lx]", vnode->flags); + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _debug("file already deleted"); + ret = -ESTALE; + goto error_unlock; + } + + /* if the vnode's data version number changed then its contents are + * different */ + if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + afs_zap_data(vnode); + + clear_bit(AFS_VNODE_MODIFIED, &vnode->flags); + mutex_unlock(&vnode->validate_lock); +valid: + _leave(" = 0"); + return 0; + +error_unlock: + mutex_unlock(&vnode->validate_lock); + _leave(" = %d", ret); + return ret; +} + +/* + * read the attributes of an inode + */ +int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + + inode = dentry->d_inode; + + _enter("{ 
ino=%lu v=%u }", inode->i_ino, inode->i_generation); + + generic_fillattr(inode, stat); + return 0; +} + +/* + * discard an AFS inode + */ +int afs_drop_inode(struct inode *inode) +{ + _enter(""); + + if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) + return generic_delete_inode(inode); + else + return generic_drop_inode(inode); +} + +/* + * clear an AFS inode + */ +void afs_evict_inode(struct inode *inode) +{ + struct afs_permits *permits; + struct afs_vnode *vnode; + + vnode = AFS_FS_I(inode); + + _enter("{%x:%u.%d} v=%u x=%u t=%u }", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + vnode->cb_version, + vnode->cb_expiry, + vnode->cb_type); + + _debug("CLEAR INODE %p", inode); + + ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + afs_give_up_callback(vnode); + + if (vnode->server) { + spin_lock(&vnode->server->fs_lock); + rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes); + spin_unlock(&vnode->server->fs_lock); + afs_put_server(vnode->server); + vnode->server = NULL; + } + + ASSERT(list_empty(&vnode->writebacks)); + ASSERT(!vnode->cb_promised); + +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vnode->cache, 0); + vnode->cache = NULL; +#endif + + mutex_lock(&vnode->permits_lock); + permits = vnode->permits; + rcu_assign_pointer(vnode->permits, NULL); + mutex_unlock(&vnode->permits_lock); + if (permits) + call_rcu(&permits->rcu, afs_zap_permits); + + _leave(""); +} + +/* + * set the attributes of an inode + */ +int afs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct key *key; + int ret; + + _enter("{%x:%u},{n=%s},%x", + vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + attr->ia_valid); + + if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | + ATTR_MTIME))) { + _leave(" = 0 [unsupported]"); + return 0; + } + + /* flush any dirty data outstanding on a regular file */ + if (S_ISREG(vnode->vfs_inode.i_mode)) { + filemap_write_and_wait(vnode->vfs_inode.i_mapping); + afs_writeback_all(vnode); + } + + if (attr->ia_valid & ATTR_FILE) { + key = attr->ia_file->private_data; + } else { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error; + } + } + + ret = afs_vnode_setattr(vnode, key, attr); + if (!(attr->ia_valid & ATTR_FILE)) + key_put(key); + +error: + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h new file mode 100644 index 00000000..1f3624d3 --- /dev/null +++ b/fs/afs/internal.h @@ -0,0 +1,890 @@ +/* internal AFS stuff + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "afs.h" +#include "afs_vl.h" + +#define AFS_CELL_MAX_ADDRS 15 + +struct pagevec; +struct afs_call; + +typedef enum { + AFS_VL_NEW, /* new, uninitialised record */ + AFS_VL_CREATING, /* creating record */ + AFS_VL_VALID, /* record is pending */ + AFS_VL_NO_VOLUME, /* no such volume available */ + AFS_VL_UPDATING, /* update in progress */ + AFS_VL_VOLUME_DELETED, /* volume was deleted */ + AFS_VL_UNCERTAIN, /* uncertain state (update failed) */ +} __attribute__((packed)) afs_vlocation_state_t; + +struct afs_mount_params { + bool rwpath; /* T if the parent should be considered R/W */ + bool force; /* T to force cell type */ + bool autocell; /* T if set auto mount operation */ + afs_voltype_t type; /* type of volume requested */ + int volnamesz; /* size of volume name */ + const char *volname; /* name of volume to mount */ + struct afs_cell *cell; /* cell in which to find volume */ + struct afs_volume *volume; /* volume record */ + struct key *key; /* key to use for secure mounting */ +}; + +/* + * definition of how to wait for the completion of an operation + */ +struct afs_wait_mode { + /* RxRPC received message notification */ + void (*rx_wakeup)(struct afs_call *call); + + /* synchronous call waiter and call dispatched notification */ + int (*wait)(struct afs_call *call); + + /* asynchronous call completion */ + void (*async_complete)(void *reply, int error); +}; + +extern const struct afs_wait_mode afs_sync_call; +extern const struct afs_wait_mode afs_async_call; + +/* + * a record of an in-progress RxRPC call + */ +struct afs_call { + const struct afs_call_type *type; /* type of call */ + const struct afs_wait_mode *wait_mode; /* completion wait mode */ + wait_queue_head_t waitq; /* processes awaiting completion */ + struct work_struct async_work; /* asynchronous work processor */ + struct work_struct work; /* actual work processor */ + struct sk_buff_head rx_queue; /* received packets */ + struct rxrpc_call *rxcall; /* RxRPC call handle */ + struct key *key; /* security for this call */ + struct afs_server *server; /* server affected by incoming CM call */ + void *request; /* request data (first part) */ + struct address_space *mapping; /* page set */ + struct afs_writeback *wb; /* writeback being performed */ + void *buffer; /* reply receive buffer */ + void *reply; /* reply buffer (first part) */ + void *reply2; /* reply buffer (second part) */ + void *reply3; /* reply buffer (third part) */ + void *reply4; /* reply buffer (fourth part) */ + pgoff_t first; /* first page in mapping to deal with */ + pgoff_t last; /* last page in mapping to deal with */ + enum { /* call state */ + AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ + AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ + AFS_CALL_AWAIT_OP_ID, /* awaiting op ID on incoming call */ + AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ + AFS_CALL_REPLYING, /* replying to incoming call */ + AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ + AFS_CALL_COMPLETE, /* successfully completed */ + AFS_CALL_BUSY, /* server was busy */ + AFS_CALL_ABORTED, /* call was aborted */ + AFS_CALL_ERROR, /* call failed due to error */ + } state; + int error; /* error code */ + unsigned request_size; /* size of request data */ + unsigned reply_max; /* maximum size of reply */ + unsigned reply_size; /* current size of reply */ + unsigned first_offset; /* offset into 
mapping[first] */ + unsigned last_to; /* amount of mapping[last] */ + unsigned offset; /* offset into received data store */ + unsigned char unmarshall; /* unmarshalling phase */ + bool incoming; /* T if incoming call */ + bool send_pages; /* T if data from mapping should be sent */ + u16 service_id; /* RxRPC service ID to call */ + __be16 port; /* target UDP port */ + __be32 operation_ID; /* operation ID for an incoming call */ + u32 count; /* count for use in unmarshalling */ + __be32 tmp; /* place to extract temporary data */ + afs_dataversion_t store_version; /* updated version expected from store */ +}; + +struct afs_call_type { + const char *name; + + /* deliver request or reply data to an call + * - returning an error will cause the call to be aborted + */ + int (*deliver)(struct afs_call *call, struct sk_buff *skb, + bool last); + + /* map an abort code to an error number */ + int (*abort_to_error)(u32 abort_code); + + /* clean up a call */ + void (*destructor)(struct afs_call *call); +}; + +/* + * record of an outstanding writeback on a vnode + */ +struct afs_writeback { + struct list_head link; /* link in vnode->writebacks */ + struct work_struct writer; /* work item to perform the writeback */ + struct afs_vnode *vnode; /* vnode to which this write applies */ + struct key *key; /* owner of this write */ + wait_queue_head_t waitq; /* completion and ready wait queue */ + pgoff_t first; /* first page in batch */ + pgoff_t point; /* last page in current store op */ + pgoff_t last; /* last page in batch (inclusive) */ + unsigned offset_first; /* offset into first page of start of write */ + unsigned to_last; /* offset into last page of end of write */ + int num_conflicts; /* count of conflicting writes in list */ + int usage; + bool conflicts; /* T if has dependent conflicts */ + enum { + AFS_WBACK_SYNCING, /* synchronisation being performed */ + AFS_WBACK_PENDING, /* write pending */ + AFS_WBACK_CONFLICTING, /* conflicting writes posted */ + AFS_WBACK_WRITING, /* writing back */ + AFS_WBACK_COMPLETE /* the writeback record has been unlinked */ + } state __attribute__((packed)); +}; + +/* + * AFS superblock private data + * - there's one superblock per volume + */ +struct afs_super_info { + struct afs_volume *volume; /* volume record */ + char rwparent; /* T if parent is R/W AFS volume */ +}; + +static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) +{ + return sb->s_fs_info; +} + +extern struct file_system_type afs_fs_type; + +/* + * entry in the cached cell catalogue + */ +struct afs_cache_cell { + char name[AFS_MAXCELLNAME]; /* cell name (padded with NULs) */ + struct in_addr vl_servers[15]; /* cached cell VL servers */ +}; + +/* + * AFS cell record + */ +struct afs_cell { + atomic_t usage; + struct list_head link; /* main cell list link */ + struct key *anonymous_key; /* anonymous user key for this cell */ + struct list_head proc_link; /* /proc cell list link */ + struct proc_dir_entry *proc_dir; /* /proc dir for this cell */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + + /* server record management */ + rwlock_t servers_lock; /* active server list lock */ + struct list_head servers; /* active server list */ + + /* volume location record management */ + struct rw_semaphore vl_sem; /* volume management serialisation semaphore */ + struct list_head vl_list; /* cell's active VL record list */ + spinlock_t vl_lock; /* vl_list lock */ + unsigned short vl_naddrs; /* number of VL servers in addr list */ + unsigned short 
vl_curr_svix; /* current server index */ + struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */ + + char name[0]; /* cell name - must go last */ +}; + +/* + * entry in the cached volume location catalogue + */ +struct afs_cache_vlocation { + /* volume name (lowercase, padded with NULs) */ + uint8_t name[AFS_MAXVOLNAME + 1]; + + uint8_t nservers; /* number of entries used in servers[] */ + uint8_t vidmask; /* voltype mask for vid[] */ + uint8_t srvtmask[8]; /* voltype masks for servers[] */ +#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ +#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ +#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ + + afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */ + struct in_addr servers[8]; /* fileserver addresses */ + time_t rtime; /* last retrieval time */ +}; + +/* + * volume -> vnode hash table entry + */ +struct afs_cache_vhash { + afs_voltype_t vtype; /* which volume variation */ + uint8_t hash_bucket; /* which hash bucket this represents */ +} __attribute__((packed)); + +/* + * AFS volume location record + */ +struct afs_vlocation { + atomic_t usage; + time_t time_of_death; /* time at which put reduced usage to 0 */ + struct list_head link; /* link in cell volume location list */ + struct list_head grave; /* link in master graveyard list */ + struct list_head update; /* link in master update list */ + struct afs_cell *cell; /* cell to which volume belongs */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + struct afs_cache_vlocation vldb; /* volume information DB record */ + struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ + wait_queue_head_t waitq; /* status change waitqueue */ + time_t update_at; /* time at which record should be updated */ + spinlock_t lock; /* access lock */ + afs_vlocation_state_t state; /* volume location state */ + unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */ + unsigned short upd_busy_cnt; /* EBUSY count during update */ + bool valid; /* T if valid */ +}; + +/* + * AFS fileserver record + */ +struct afs_server { + atomic_t usage; + time_t time_of_death; /* time at which put reduced usage to 0 */ + struct in_addr addr; /* server address */ + struct afs_cell *cell; /* cell in which server resides */ + struct list_head link; /* link in cell's server list */ + struct list_head grave; /* link in master graveyard list */ + struct rb_node master_rb; /* link in master by-addr tree */ + struct rw_semaphore sem; /* access lock */ + + /* file service access */ + struct rb_root fs_vnodes; /* vnodes backed by this server (ordered by FID) */ + unsigned long fs_act_jif; /* time at which last activity occurred */ + unsigned long fs_dead_jif; /* time at which no longer to be considered dead */ + spinlock_t fs_lock; /* access lock */ + int fs_state; /* 0 or reason FS currently marked dead (-errno) */ + + /* callback promise management */ + struct rb_root cb_promises; /* vnode expiration list (ordered earliest first) */ + struct delayed_work cb_updater; /* callback updater */ + struct delayed_work cb_break_work; /* collected break dispatcher */ + wait_queue_head_t cb_break_waitq; /* space available in cb_break waitqueue */ + spinlock_t cb_lock; /* access lock */ + struct afs_callback cb_break[64]; /* ring of callbacks awaiting breaking */ + atomic_t cb_break_n; /* number of pending breaks */ + u8 
cb_break_head; /* head of callback breaking ring */ + u8 cb_break_tail; /* tail of callback breaking ring */ +}; + +/* + * AFS volume access record + */ +struct afs_volume { + atomic_t usage; + struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */ + struct afs_vlocation *vlocation; /* volume location */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + afs_volid_t vid; /* volume ID */ + afs_voltype_t type; /* type of volume */ + char type_force; /* force volume type (suppress R/O -> R/W) */ + unsigned short nservers; /* number of server slots filled */ + unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ + struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ + struct rw_semaphore server_sem; /* lock for accessing current server */ + struct backing_dev_info bdi; +}; + +/* + * vnode catalogue entry + */ +struct afs_cache_vnode { + afs_vnodeid_t vnode_id; /* vnode ID */ + unsigned vnode_unique; /* vnode ID uniquifier */ + afs_dataversion_t data_version; /* data version */ +}; + +/* + * AFS inode private data + */ +struct afs_vnode { + struct inode vfs_inode; /* the VFS's inode record */ + + struct afs_volume *volume; /* volume on which vnode resides */ + struct afs_server *server; /* server currently supplying this file */ + struct afs_fid fid; /* the file identifier for this inode */ + struct afs_file_status status; /* AFS status info for this file */ +#ifdef CONFIG_AFS_FSCACHE + struct fscache_cookie *cache; /* caching cookie */ +#endif + struct afs_permits *permits; /* cache of permits so far obtained */ + struct mutex permits_lock; /* lock for altering permits list */ + struct mutex validate_lock; /* lock for validating this vnode */ + wait_queue_head_t update_waitq; /* status fetch waitqueue */ + int update_cnt; /* number of outstanding ops that will update the + * status */ + spinlock_t writeback_lock; /* lock for writebacks */ + spinlock_t lock; /* waitqueue/flags lock */ + unsigned long flags; +#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */ +#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ +#define AFS_VNODE_MODIFIED 2 /* set if vnode's data modified */ +#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ +#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ +#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ +#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */ +#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ +#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ +#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ +#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */ +#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */ + + long acl_order; /* ACL check count (callback break count) */ + + struct list_head writebacks; /* alterations in pagecache that need writing */ + struct list_head pending_locks; /* locks waiting to be granted */ + struct list_head granted_locks; /* locks granted on this file */ + struct delayed_work lock_work; /* work to be done in locking */ + struct key *unlock_key; /* key to be used in unlocking */ + + /* outstanding callback notification on this file */ + struct rb_node server_rb; /* link in server->fs_vnodes */ + struct rb_node cb_promise; /* link in server->cb_promises */ + struct work_struct cb_broken_work; /* work to 
be done on callback break */ + time_t cb_expires; /* time at which callback expires */ + time_t cb_expires_at; /* time used to order cb_promise */ + unsigned cb_version; /* callback version */ + unsigned cb_expiry; /* callback expiry time */ + afs_callback_type_t cb_type; /* type of callback */ + bool cb_promised; /* true if promise still holds */ +}; + +/* + * cached security record for one user's attempt to access a vnode + */ +struct afs_permit { + struct key *key; /* RxRPC ticket holding a security context */ + afs_access_t access_mask; /* access mask for this key */ +}; + +/* + * cache of security records from attempts to access a vnode + */ +struct afs_permits { + struct rcu_head rcu; /* disposal procedure */ + int count; /* number of records */ + struct afs_permit permits[0]; /* the permits so far examined */ +}; + +/* + * record of one of a system's set of network interfaces + */ +struct afs_interface { + struct in_addr address; /* IPv4 address bound to interface */ + struct in_addr netmask; /* netmask applied to address */ + unsigned mtu; /* MTU of interface */ +}; + +/* + * UUID definition [internet draft] + * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns + * increments since midnight 15th October 1582 + * - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID + * time + * - the clock sequence is a 14-bit counter to avoid duplicate times + */ +struct afs_uuid { + u32 time_low; /* low part of timestamp */ + u16 time_mid; /* mid part of timestamp */ + u16 time_hi_and_version; /* high part of timestamp and version */ +#define AFS_UUID_TO_UNIX_TIME 0x01b21dd213814000ULL +#define AFS_UUID_TIMEHI_MASK 0x0fff +#define AFS_UUID_VERSION_TIME 0x1000 /* time-based UUID */ +#define AFS_UUID_VERSION_NAME 0x3000 /* name-based UUID */ +#define AFS_UUID_VERSION_RANDOM 0x4000 /* (pseudo-)random generated UUID */ + u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ +#define AFS_UUID_CLOCKHI_MASK 0x3f +#define AFS_UUID_VARIANT_STD 0x80 + u8 clock_seq_low; /* clock seq low */ + u8 node[6]; /* spatially unique node ID (MAC addr) */ +}; + +/*****************************************************************************/ +/* + * cache.c + */ +#ifdef CONFIG_AFS_FSCACHE +extern struct fscache_netfs afs_cache_netfs; +extern struct fscache_cookie_def afs_cell_cache_index_def; +extern struct fscache_cookie_def afs_vlocation_cache_index_def; +extern struct fscache_cookie_def afs_volume_cache_index_def; +extern struct fscache_cookie_def afs_vnode_cache_index_def; +#else +#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vlocation_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) +#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) +#endif + +/* + * callback.c + */ +extern void afs_init_callback_state(struct afs_server *); +extern void afs_broken_callback_work(struct work_struct *); +extern void afs_break_callbacks(struct afs_server *, size_t, + struct afs_callback[]); +extern void afs_discard_callback_on_delete(struct afs_vnode *); +extern void afs_give_up_callback(struct afs_vnode *); +extern void afs_dispatch_give_up_callbacks(struct work_struct *); +extern void afs_flush_callback_breaks(struct afs_server *); +extern int __init afs_callback_update_init(void); +extern void afs_callback_update_kill(void); + +/* + * cell.c + */ +extern struct rw_semaphore afs_proc_cells_sem; +extern struct list_head afs_proc_cells; + +#define 
afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) +extern int afs_cell_init(char *); +extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool); +extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool); +extern struct afs_cell *afs_grab_cell(struct afs_cell *); +extern void afs_put_cell(struct afs_cell *); +extern void afs_cell_purge(void); + +/* + * cmservice.c + */ +extern bool afs_cm_incoming_call(struct afs_call *); + +/* + * dir.c + */ +extern const struct inode_operations afs_dir_inode_operations; +extern const struct dentry_operations afs_fs_dentry_operations; +extern const struct file_operations afs_dir_file_operations; + +/* + * file.c + */ +extern const struct address_space_operations afs_fs_aops; +extern const struct inode_operations afs_file_inode_operations; +extern const struct file_operations afs_file_operations; + +extern int afs_open(struct inode *, struct file *); +extern int afs_release(struct inode *, struct file *); +extern int afs_page_filler(void *, struct page *); + +/* + * flock.c + */ +extern void __exit afs_kill_lock_manager(void); +extern void afs_lock_work(struct work_struct *); +extern void afs_lock_may_be_available(struct afs_vnode *); +extern int afs_lock(struct file *, int, struct file_lock *); +extern int afs_flock(struct file *, int, struct file_lock *); + +/* + * fsclient.c + */ +extern int afs_fs_fetch_file_status(struct afs_server *, struct key *, + struct afs_vnode *, struct afs_volsync *, + const struct afs_wait_mode *); +extern int afs_fs_give_up_callbacks(struct afs_server *, + const struct afs_wait_mode *); +extern int afs_fs_fetch_data(struct afs_server *, struct key *, + struct afs_vnode *, off_t, size_t, struct page *, + const struct afs_wait_mode *); +extern int afs_fs_create(struct afs_server *, struct key *, + struct afs_vnode *, const char *, umode_t, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, + const struct afs_wait_mode *); +extern int afs_fs_remove(struct afs_server *, struct key *, + struct afs_vnode *, const char *, bool, + const struct afs_wait_mode *); +extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *, + struct afs_vnode *, const char *, + const struct afs_wait_mode *); +extern int afs_fs_symlink(struct afs_server *, struct key *, + struct afs_vnode *, const char *, const char *, + struct afs_fid *, struct afs_file_status *, + const struct afs_wait_mode *); +extern int afs_fs_rename(struct afs_server *, struct key *, + struct afs_vnode *, const char *, + struct afs_vnode *, const char *, + const struct afs_wait_mode *); +extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *, + pgoff_t, pgoff_t, unsigned, unsigned, + const struct afs_wait_mode *); +extern int afs_fs_setattr(struct afs_server *, struct key *, + struct afs_vnode *, struct iattr *, + const struct afs_wait_mode *); +extern int afs_fs_get_volume_status(struct afs_server *, struct key *, + struct afs_vnode *, + struct afs_volume_status *, + const struct afs_wait_mode *); +extern int afs_fs_set_lock(struct afs_server *, struct key *, + struct afs_vnode *, afs_lock_type_t, + const struct afs_wait_mode *); +extern int afs_fs_extend_lock(struct afs_server *, struct key *, + struct afs_vnode *, + const struct afs_wait_mode *); +extern int afs_fs_release_lock(struct afs_server *, struct key *, + struct afs_vnode *, + const struct afs_wait_mode *); + +/* + * inode.c + */ +extern struct inode *afs_iget_autocell(struct inode *, const char *, int, + struct key *); 
+extern struct inode *afs_iget(struct super_block *, struct key *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *); +extern void afs_zap_data(struct afs_vnode *); +extern int afs_validate(struct afs_vnode *, struct key *); +extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int afs_setattr(struct dentry *, struct iattr *); +extern void afs_evict_inode(struct inode *); +extern int afs_drop_inode(struct inode *); + +/* + * main.c + */ +extern struct workqueue_struct *afs_wq; +extern struct afs_uuid afs_uuid; + +/* + * misc.c + */ +extern int afs_abort_to_error(u32); + +/* + * mntpt.c + */ +extern const struct inode_operations afs_mntpt_inode_operations; +extern const struct inode_operations afs_autocell_inode_operations; +extern const struct file_operations afs_mntpt_file_operations; + +extern struct vfsmount *afs_d_automount(struct path *); +extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); +extern void afs_mntpt_kill_timer(void); + +/* + * proc.c + */ +extern int afs_proc_init(void); +extern void afs_proc_cleanup(void); +extern int afs_proc_cell_setup(struct afs_cell *); +extern void afs_proc_cell_remove(struct afs_cell *); + +/* + * rxrpc.c + */ +extern int afs_open_socket(void); +extern void afs_close_socket(void); +extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, + const struct afs_wait_mode *); +extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, + size_t, size_t); +extern void afs_flat_call_destructor(struct afs_call *); +extern void afs_transfer_reply(struct afs_call *, struct sk_buff *); +extern void afs_send_empty_reply(struct afs_call *); +extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); +extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, + size_t); + +/* + * security.c + */ +extern void afs_clear_permits(struct afs_vnode *); +extern void afs_cache_permit(struct afs_vnode *, struct key *, long); +extern void afs_zap_permits(struct rcu_head *); +extern struct key *afs_request_key(struct afs_cell *); +extern int afs_permission(struct inode *, int, unsigned int); + +/* + * server.c + */ +extern spinlock_t afs_server_peer_lock; + +#define afs_get_server(S) \ +do { \ + _debug("GET SERVER %d", atomic_read(&(S)->usage)); \ + atomic_inc(&(S)->usage); \ +} while(0) + +extern struct afs_server *afs_lookup_server(struct afs_cell *, + const struct in_addr *); +extern struct afs_server *afs_find_server(const struct in_addr *); +extern void afs_put_server(struct afs_server *); +extern void __exit afs_purge_servers(void); + +/* + * super.c + */ +extern int afs_fs_init(void); +extern void afs_fs_exit(void); + +/* + * use-rtnetlink.c + */ +extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool); +extern int afs_get_MAC_address(u8 *, size_t); + +/* + * vlclient.c + */ +extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, + const char *, struct afs_cache_vlocation *, + const struct afs_wait_mode *); +extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *, + afs_volid_t, afs_voltype_t, + struct afs_cache_vlocation *, + const struct afs_wait_mode *); + +/* + * vlocation.c + */ +#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0) + +extern int __init afs_vlocation_update_init(void); +extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *, + struct key *, + const char *, size_t); +extern void afs_put_vlocation(struct afs_vlocation *); +extern void 
afs_vlocation_purge(void); + +/* + * vnode.c + */ +static inline struct afs_vnode *AFS_FS_I(struct inode *inode) +{ + return container_of(inode, struct afs_vnode, vfs_inode); +} + +static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) +{ + return &vnode->vfs_inode; +} + +extern void afs_vnode_finalise_status_update(struct afs_vnode *, + struct afs_server *); +extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *, + struct key *); +extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *, + off_t, size_t, struct page *); +extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *, + umode_t, struct afs_fid *, struct afs_file_status *, + struct afs_callback *, struct afs_server **); +extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *, + bool); +extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *, + const char *); +extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *, + const char *, struct afs_fid *, + struct afs_file_status *, struct afs_server **); +extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *, + struct key *, const char *, const char *); +extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t, + unsigned, unsigned); +extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *); +extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *, + struct afs_volume_status *); +extern int afs_vnode_set_lock(struct afs_vnode *, struct key *, + afs_lock_type_t); +extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *); +extern int afs_vnode_release_lock(struct afs_vnode *, struct key *); + +/* + * volume.c + */ +#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) + +extern void afs_put_volume(struct afs_volume *); +extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *); +extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *); +extern int afs_volume_release_fileserver(struct afs_vnode *, + struct afs_server *, int); + +/* + * write.c + */ +extern int afs_set_page_dirty(struct page *); +extern void afs_put_writeback(struct afs_writeback *); +extern int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); +extern int afs_writepage(struct page *, struct writeback_control *); +extern int afs_writepages(struct address_space *, struct writeback_control *); +extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); +extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, + unsigned long, loff_t); +extern int afs_writeback_all(struct afs_vnode *); +extern int afs_fsync(struct file *, int); + + +/*****************************************************************************/ +/* + * debug tracing + */ +extern unsigned afs_debug; + +#define dbgprintk(FMT,...) \ + printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) + +#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) +#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) +#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) +#define _debug(FMT,...) 
kdebug(FMT,##__VA_ARGS__) + +#elif defined(CONFIG_AFS_DEBUG) +#define AFS_DEBUG_KENTER 0x01 +#define AFS_DEBUG_KLEAVE 0x02 +#define AFS_DEBUG_KDEBUG 0x04 + +#define _enter(FMT,...) \ +do { \ + if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \ + kenter(FMT,##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT,...) \ +do { \ + if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \ + kleave(FMT,##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT,...) \ +do { \ + if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \ + kdebug(FMT,##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) +#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) +#endif + +/* + * debug assertion checking + */ +#if 1 // defined(__KDEBUGALL) + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTRANGE(L, OP1, N, OP2, H) \ +do { \ + if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n", \ + (unsigned long)(L), (unsigned long)(N), \ + (unsigned long)(H)); \ + printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \ + (unsigned long)(L), (unsigned long)(N), \ + (unsigned long)(H)); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "AFS: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#else + +#define ASSERT(X) \ +do { \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ +} while(0) + +#define ASSERTRANGE(L, OP1, N, OP2, H) \ +do { \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ +} while(0) + +#endif /* __KDEBUGALL */ diff --git a/fs/afs/main.c b/fs/afs/main.c new file mode 100644 index 00000000..42dd2e49 --- /dev/null +++ b/fs/afs/main.c @@ -0,0 +1,184 @@ +/* AFS client file system + * + * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "internal.h" + +MODULE_DESCRIPTION("AFS Client File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned afs_debug; +module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "AFS debugging mask"); + +static char *rootcell; + +module_param(rootcell, charp, 0); +MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); + +struct afs_uuid afs_uuid; +struct workqueue_struct *afs_wq; + +/* + * get a client UUID + */ +static int __init afs_get_client_UUID(void) +{ + struct timespec ts; + u64 uuidtime; + u16 clockseq; + int ret; + + /* read the MAC address of one of the external interfaces and construct + * a UUID from it */ + ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node)); + if (ret < 0) + return ret; + + getnstimeofday(&ts); + uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10; + uuidtime += ts.tv_nsec / 100; + uuidtime += AFS_UUID_TO_UNIX_TIME; + afs_uuid.time_low = uuidtime; + afs_uuid.time_mid = uuidtime >> 32; + afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK; + afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME; + + get_random_bytes(&clockseq, 2); + afs_uuid.clock_seq_low = clockseq; + afs_uuid.clock_seq_hi_and_reserved = + (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK; + afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD; + + _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", + afs_uuid.time_low, + afs_uuid.time_mid, + afs_uuid.time_hi_and_version, + afs_uuid.clock_seq_hi_and_reserved, + afs_uuid.clock_seq_low, + afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2], + afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]); + + return 0; +} + +/* + * initialise the AFS client FS module + */ +static int __init afs_init(void) +{ + int ret; + + printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); + + ret = afs_get_client_UUID(); + if (ret < 0) + return ret; + + /* create workqueue */ + ret = -ENOMEM; + afs_wq = alloc_workqueue("afs", 0, 0); + if (!afs_wq) + return ret; + + /* register the /proc stuff */ + ret = afs_proc_init(); + if (ret < 0) + goto error_proc; + +#ifdef CONFIG_AFS_FSCACHE + /* we want to be able to cache */ + ret = fscache_register_netfs(&afs_cache_netfs); + if (ret < 0) + goto error_cache; +#endif + + /* initialise the cell DB */ + ret = afs_cell_init(rootcell); + if (ret < 0) + goto error_cell_init; + + /* initialise the VL update process */ + ret = afs_vlocation_update_init(); + if (ret < 0) + goto error_vl_update_init; + + /* initialise the callback update process */ + ret = afs_callback_update_init(); + if (ret < 0) + goto error_callback_update_init; + + /* create the RxRPC transport */ + ret = afs_open_socket(); + if (ret < 0) + goto error_open_socket; + + /* register the filesystems */ + ret = afs_fs_init(); + if (ret < 0) + goto error_fs; + + return ret; + +error_fs: + afs_close_socket(); +error_open_socket: + afs_callback_update_kill(); +error_callback_update_init: + afs_vlocation_purge(); +error_vl_update_init: + afs_cell_purge(); +error_cell_init: +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); +error_cache: +#endif + afs_proc_cleanup(); +error_proc: + destroy_workqueue(afs_wq); + rcu_barrier(); + printk(KERN_ERR "kAFS: failed to register: %d\n", ret); + return ret; +} + +/* XXX late_initcall is kludgy, but the only alternative seems to create + * a transport upon the first mount, which is worse. Or is it? 
+ */ +late_initcall(afs_init); /* must be called after net/ to create socket */ + +/* + * clean up on module removal + */ +static void __exit afs_exit(void) +{ + printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); + + afs_fs_exit(); + afs_kill_lock_manager(); + afs_close_socket(); + afs_purge_servers(); + afs_callback_update_kill(); + afs_vlocation_purge(); + destroy_workqueue(afs_wq); + afs_cell_purge(); +#ifdef CONFIG_AFS_FSCACHE + fscache_unregister_netfs(&afs_cache_netfs); +#endif + afs_proc_cleanup(); + rcu_barrier(); +} + +module_exit(afs_exit); diff --git a/fs/afs/misc.c b/fs/afs/misc.c new file mode 100644 index 00000000..0dd4dafe --- /dev/null +++ b/fs/afs/misc.c @@ -0,0 +1,75 @@ +/* miscellaneous bits + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "internal.h" +#include "afs_fs.h" + +/* + * convert an AFS abort code to a Linux error number + */ +int afs_abort_to_error(u32 abort_code) +{ + switch (abort_code) { + case 13: return -EACCES; + case 27: return -EFBIG; + case 30: return -EROFS; + case VSALVAGE: return -EIO; + case VNOVNODE: return -ENOENT; + case VNOVOL: return -ENOMEDIUM; + case VVOLEXISTS: return -EEXIST; + case VNOSERVICE: return -EIO; + case VOFFLINE: return -ENOENT; + case VONLINE: return -EEXIST; + case VDISKFULL: return -ENOSPC; + case VOVERQUOTA: return -EDQUOT; + case VBUSY: return -EBUSY; + case VMOVED: return -ENXIO; + case 0x2f6df0a: return -EWOULDBLOCK; + case 0x2f6df0c: return -EACCES; + case 0x2f6df0f: return -EBUSY; + case 0x2f6df10: return -EEXIST; + case 0x2f6df11: return -EXDEV; + case 0x2f6df13: return -ENOTDIR; + case 0x2f6df14: return -EISDIR; + case 0x2f6df15: return -EINVAL; + case 0x2f6df1a: return -EFBIG; + case 0x2f6df1b: return -ENOSPC; + case 0x2f6df1d: return -EROFS; + case 0x2f6df1e: return -EMLINK; + case 0x2f6df20: return -EDOM; + case 0x2f6df21: return -ERANGE; + case 0x2f6df22: return -EDEADLK; + case 0x2f6df23: return -ENAMETOOLONG; + case 0x2f6df24: return -ENOLCK; + case 0x2f6df26: return -ENOTEMPTY; + case 0x2f6df78: return -EDQUOT; + + case RXKADINCONSISTENCY: return -EPROTO; + case RXKADPACKETSHORT: return -EPROTO; + case RXKADLEVELFAIL: return -EKEYREJECTED; + case RXKADTICKETLEN: return -EKEYREJECTED; + case RXKADOUTOFSEQUENCE: return -EPROTO; + case RXKADNOAUTH: return -EKEYREJECTED; + case RXKADBADKEY: return -EKEYREJECTED; + case RXKADBADTICKET: return -EKEYREJECTED; + case RXKADUNKNOWNKEY: return -EKEYREJECTED; + case RXKADEXPIRED: return -EKEYEXPIRED; + case RXKADSEALEDINCON: return -EKEYREJECTED; + case RXKADDATALEN: return -EKEYREJECTED; + case RXKADILLEGALLEVEL: return -EKEYREJECTED; + + default: return -EREMOTEIO; + } +} diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c new file mode 100644 index 00000000..aa591841 --- /dev/null +++ b/fs/afs/mntpt.c @@ -0,0 +1,284 @@ +/* mountpoint management + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + + +static struct dentry *afs_mntpt_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd); +static int afs_mntpt_open(struct inode *inode, struct file *file); +static void afs_mntpt_expiry_timed_out(struct work_struct *work); + +const struct file_operations afs_mntpt_file_operations = { + .open = afs_mntpt_open, + .llseek = noop_llseek, +}; + +const struct inode_operations afs_mntpt_inode_operations = { + .lookup = afs_mntpt_lookup, + .readlink = page_readlink, + .getattr = afs_getattr, +}; + +const struct inode_operations afs_autocell_inode_operations = { + .getattr = afs_getattr, +}; + +static LIST_HEAD(afs_vfsmounts); +static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); + +static unsigned long afs_mntpt_expiry_timeout = 10 * 60; + +/* + * check a symbolic link to see whether it actually encodes a mountpoint + * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately + */ +int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) +{ + struct page *page; + size_t size; + char *buf; + int ret; + + _enter("{%x:%u,%u}", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + + /* read the contents of the symlink into the pagecache */ + page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, + afs_page_filler, key); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + + ret = -EIO; + if (PageError(page)) + goto out_free; + + buf = kmap(page); + + /* examine the symlink's contents */ + size = vnode->status.size; + _debug("symlink to %*.*s", (int) size, (int) size, buf); + + if (size > 2 && + (buf[0] == '%' || buf[0] == '#') && + buf[size - 1] == '.' + ) { + _debug("symlink is a mountpoint"); + spin_lock(&vnode->lock); + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + vnode->vfs_inode.i_flags |= S_AUTOMOUNT; + spin_unlock(&vnode->lock); + } + + ret = 0; + + kunmap(page); +out_free: + page_cache_release(page); +out: + _leave(" = %d", ret); + return ret; +} + +/* + * no valid lookup procedure on this sort of dir + */ +static struct dentry *afs_mntpt_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + _enter("%p,%p{%p{%s},%s}", + dir, + dentry, + dentry->d_parent, + dentry->d_parent ? + dentry->d_parent->d_name.name : (const unsigned char *) "", + dentry->d_name.name); + + return ERR_PTR(-EREMOTE); +} + +/* + * no valid open procedure on this sort of dir + */ +static int afs_mntpt_open(struct inode *inode, struct file *file) +{ + _enter("%p,%p{%p{%s},%s}", + inode, file, + file->f_path.dentry->d_parent, + file->f_path.dentry->d_parent ? 
+ file->f_path.dentry->d_parent->d_name.name : + (const unsigned char *) "", + file->f_path.dentry->d_name.name); + + return -EREMOTE; +} + +/* + * create a vfsmount to be automounted + */ +static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +{ + struct afs_super_info *super; + struct vfsmount *mnt; + struct afs_vnode *vnode; + struct page *page; + char *devname, *options; + bool rwpath = false; + int ret; + + _enter("{%s}", mntpt->d_name.name); + + BUG_ON(!mntpt->d_inode); + + ret = -ENOMEM; + devname = (char *) get_zeroed_page(GFP_KERNEL); + if (!devname) + goto error_no_devname; + + options = (char *) get_zeroed_page(GFP_KERNEL); + if (!options) + goto error_no_options; + + vnode = AFS_FS_I(mntpt->d_inode); + if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { + /* if the directory is a pseudo directory, use the d_name */ + static const char afs_root_cell[] = ":root.cell."; + unsigned size = mntpt->d_name.len; + + ret = -ENOENT; + if (size < 2 || size > AFS_MAXCELLNAME) + goto error_no_page; + + if (mntpt->d_name.name[0] == '.') { + devname[0] = '#'; + memcpy(devname + 1, mntpt->d_name.name, size - 1); + memcpy(devname + size, afs_root_cell, + sizeof(afs_root_cell)); + rwpath = true; + } else { + devname[0] = '%'; + memcpy(devname + 1, mntpt->d_name.name, size); + memcpy(devname + size + 1, afs_root_cell, + sizeof(afs_root_cell)); + } + } else { + /* read the contents of the AFS special symlink */ + loff_t size = i_size_read(mntpt->d_inode); + char *buf; + + ret = -EINVAL; + if (size > PAGE_SIZE - 1) + goto error_no_page; + + page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto error_no_page; + } + + ret = -EIO; + if (PageError(page)) + goto error; + + buf = kmap_atomic(page, KM_USER0); + memcpy(devname, buf, size); + kunmap_atomic(buf, KM_USER0); + page_cache_release(page); + page = NULL; + } + + /* work out what options we want */ + super = AFS_FS_S(mntpt->d_sb); + memcpy(options, "cell=", 5); + strcpy(options + 5, super->volume->cell->name); + if (super->volume->type == AFSVL_RWVOL || rwpath) + strcat(options, ",rwpath"); + + /* try and do the mount */ + _debug("--- attempting mount %s -o %s ---", devname, options); + mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); + _debug("--- mount result %p ---", mnt); + + free_page((unsigned long) devname); + free_page((unsigned long) options); + _leave(" = %p", mnt); + return mnt; + +error: + page_cache_release(page); +error_no_page: + free_page((unsigned long) options); +error_no_options: + free_page((unsigned long) devname); +error_no_devname: + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * handle an automount point + */ +struct vfsmount *afs_d_automount(struct path *path) +{ + struct vfsmount *newmnt; + + _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); + + newmnt = afs_mntpt_do_automount(path->dentry); + if (IS_ERR(newmnt)) + return newmnt; + + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + return newmnt; +} + +/* + * handle mountpoint expiry timer going off + */ +static void afs_mntpt_expiry_timed_out(struct work_struct *work) +{ + _enter(""); + + if (!list_empty(&afs_vfsmounts)) { + mark_mounts_for_expiry(&afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + } + + _leave(""); +} + 
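+/*
+ * Summary of the automount path above: afs_d_automount() has
+ * afs_mntpt_do_automount() build a device name and then mounts it with
+ * vfs_kern_mount().  For a pseudo directory the directory name itself is
+ * turned into a "#cell:root.cell." device name (a leading '.' selects the
+ * '#' form and forces rwpath) or a "%cell:root.cell." one; for a real
+ * mountpoint symlink the symlink body is used verbatim.  The option string
+ * is "cell=<cellname>", with ",rwpath" appended for R/W volumes.  New
+ * mounts are placed on afs_vfsmounts via mnt_set_expiry(), and the delayed
+ * work item above runs mark_mounts_for_expiry() every
+ * afs_mntpt_expiry_timeout seconds (10 minutes) so that unused automounts
+ * are reaped.
+ */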
+/* + * kill the AFS mountpoint timer if it's still running + */ +void afs_mntpt_kill_timer(void) +{ + _enter(""); + + ASSERT(list_empty(&afs_vfsmounts)); + cancel_delayed_work_sync(&afs_mntpt_expiry_timer); +} diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c new file mode 100644 index 00000000..7ad36506 --- /dev/null +++ b/fs/afs/netdevices.c @@ -0,0 +1,68 @@ +/* AFS network device helpers + * + * Copyright (c) 2007 Patrick McHardy + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * get a MAC address from a random ethernet interface that has a real one + * - the buffer will normally be 6 bytes in size + */ +int afs_get_MAC_address(u8 *mac, size_t maclen) +{ + struct net_device *dev; + int ret = -ENODEV; + + BUG_ON(maclen != ETH_ALEN); + + rtnl_lock(); + dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); + if (dev) { + memcpy(mac, dev->dev_addr, maclen); + ret = 0; + } + rtnl_unlock(); + return ret; +} + +/* + * get a list of this system's interface IPv4 addresses, netmasks and MTUs + * - maxbufs must be at least 1 + * - returns the number of interface records in the buffer + */ +int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs, + bool wantloopback) +{ + struct net_device *dev; + struct in_device *idev; + int n = 0; + + ASSERT(maxbufs > 0); + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + if (dev->type == ARPHRD_LOOPBACK && !wantloopback) + continue; + idev = __in_dev_get_rtnl(dev); + if (!idev) + continue; + for_primary_ifa(idev) { + bufs[n].address.s_addr = ifa->ifa_address; + bufs[n].netmask.s_addr = ifa->ifa_mask; + bufs[n].mtu = dev->mtu; + n++; + if (n >= maxbufs) + goto out; + } endfor_ifa(idev); + } +out: + rtnl_unlock(); + return n; +} diff --git a/fs/afs/proc.c b/fs/afs/proc.c new file mode 100644 index 00000000..096b23f8 --- /dev/null +++ b/fs/afs/proc.c @@ -0,0 +1,744 @@ +/* /proc interface for AFS + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct proc_dir_entry *proc_afs; + + +static int afs_proc_cells_open(struct inode *inode, struct file *file); +static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos); +static void afs_proc_cells_stop(struct seq_file *p, void *v); +static int afs_proc_cells_show(struct seq_file *m, void *v); +static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, + size_t size, loff_t *_pos); + +static const struct seq_operations afs_proc_cells_ops = { + .start = afs_proc_cells_start, + .next = afs_proc_cells_next, + .stop = afs_proc_cells_stop, + .show = afs_proc_cells_show, +}; + +static const struct file_operations afs_proc_cells_fops = { + .open = afs_proc_cells_open, + .read = seq_read, + .write = afs_proc_cells_write, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int afs_proc_rootcell_open(struct inode *inode, struct file *file); +static int afs_proc_rootcell_release(struct inode *inode, struct file *file); +static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, + size_t size, loff_t *_pos); +static ssize_t afs_proc_rootcell_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos); + +static const struct file_operations afs_proc_rootcell_fops = { + .open = afs_proc_rootcell_open, + .read = afs_proc_rootcell_read, + .write = afs_proc_rootcell_write, + .llseek = no_llseek, + .release = afs_proc_rootcell_release, + .owner = THIS_MODULE, +}; + +static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); +static int afs_proc_cell_volumes_release(struct inode *inode, + struct file *file); +static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v); +static int afs_proc_cell_volumes_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_cell_volumes_ops = { + .start = afs_proc_cell_volumes_start, + .next = afs_proc_cell_volumes_next, + .stop = afs_proc_cell_volumes_stop, + .show = afs_proc_cell_volumes_show, +}; + +static const struct file_operations afs_proc_cell_volumes_fops = { + .open = afs_proc_cell_volumes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = afs_proc_cell_volumes_release, + .owner = THIS_MODULE, +}; + +static int afs_proc_cell_vlservers_open(struct inode *inode, + struct file *file); +static int afs_proc_cell_vlservers_release(struct inode *inode, + struct file *file); +static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v); +static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_cell_vlservers_ops = { + .start = afs_proc_cell_vlservers_start, + .next = afs_proc_cell_vlservers_next, + .stop = afs_proc_cell_vlservers_stop, + .show = afs_proc_cell_vlservers_show, +}; + +static const struct file_operations afs_proc_cell_vlservers_fops = { + .open = afs_proc_cell_vlservers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = afs_proc_cell_vlservers_release, + .owner = THIS_MODULE, +}; + +static int 
afs_proc_cell_servers_open(struct inode *inode, struct file *file); +static int afs_proc_cell_servers_release(struct inode *inode, + struct file *file); +static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); +static int afs_proc_cell_servers_show(struct seq_file *m, void *v); + +static const struct seq_operations afs_proc_cell_servers_ops = { + .start = afs_proc_cell_servers_start, + .next = afs_proc_cell_servers_next, + .stop = afs_proc_cell_servers_stop, + .show = afs_proc_cell_servers_show, +}; + +static const struct file_operations afs_proc_cell_servers_fops = { + .open = afs_proc_cell_servers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = afs_proc_cell_servers_release, + .owner = THIS_MODULE, +}; + +/* + * initialise the /proc/fs/afs/ directory + */ +int afs_proc_init(void) +{ + struct proc_dir_entry *p; + + _enter(""); + + proc_afs = proc_mkdir("fs/afs", NULL); + if (!proc_afs) + goto error_dir; + + p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); + if (!p) + goto error_cells; + + p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops); + if (!p) + goto error_rootcell; + + _leave(" = 0"); + return 0; + +error_rootcell: + remove_proc_entry("cells", proc_afs); +error_cells: + remove_proc_entry("fs/afs", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/afs/ directory + */ +void afs_proc_cleanup(void) +{ + remove_proc_entry("rootcell", proc_afs); + remove_proc_entry("cells", proc_afs); + remove_proc_entry("fs/afs", NULL); +} + +/* + * open "/proc/fs/afs/cells" which provides a summary of extant cells + */ +static int afs_proc_cells_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &afs_proc_cells_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) +{ + /* lock the list against modification */ + down_read(&afs_proc_cells_sem); + return seq_list_start_head(&afs_proc_cells, *_pos); +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos) +{ + return seq_list_next(v, &afs_proc_cells, pos); +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cells_stop(struct seq_file *p, void *v) +{ + up_read(&afs_proc_cells_sem); +} + +/* + * display a header line followed by a load of cell lines + */ +static int afs_proc_cells_show(struct seq_file *m, void *v) +{ + struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); + + if (v == &afs_proc_cells) { + /* display header on line 1 */ + seq_puts(m, "USE NAME\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + seq_printf(m, "%3d %s\n", + atomic_read(&cell->usage), cell->name); + return 0; +} + +/* + * handle writes to /proc/fs/afs/cells + * - to add cells: echo "add [:][:]" + */ +static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, + size_t size, loff_t *_pos) +{ + char *kbuf, *name, *args; + int ret; + + /* start by dragging the command into memory */ + if (size <= 1 || size >= PAGE_SIZE) + return -EINVAL; + + kbuf = kmalloc(size + 1, GFP_KERNEL); + if 
(!kbuf) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(kbuf, buf, size) != 0) + goto done; + kbuf[size] = 0; + + /* trim to first NL */ + name = memchr(kbuf, '\n', size); + if (name) + *name = 0; + + /* split into command, name and argslist */ + name = strchr(kbuf, ' '); + if (!name) + goto inval; + do { + *name++ = 0; + } while(*name == ' '); + if (!*name) + goto inval; + + args = strchr(name, ' '); + if (!args) + goto inval; + do { + *args++ = 0; + } while(*args == ' '); + if (!*args) + goto inval; + + /* determine command to perform */ + _debug("cmd=%s name=%s args=%s", kbuf, name, args); + + if (strcmp(kbuf, "add") == 0) { + struct afs_cell *cell; + + cell = afs_cell_create(name, strlen(name), args, false); + if (IS_ERR(cell)) { + ret = PTR_ERR(cell); + goto done; + } + + afs_put_cell(cell); + printk("kAFS: Added new cell '%s'\n", name); + } else { + goto inval; + } + + ret = size; + +done: + kfree(kbuf); + _leave(" = %d", ret); + return ret; + +inval: + ret = -EINVAL; + printk("kAFS: Invalid Command on /proc/fs/afs/cells file\n"); + goto done; +} + +/* + * Stubs for /proc/fs/afs/rootcell + */ +static int afs_proc_rootcell_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int afs_proc_rootcell_release(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, + size_t size, loff_t *_pos) +{ + return 0; +} + +/* + * handle writes to /proc/fs/afs/rootcell + * - to initialize rootcell: echo "cell.name:192.168.231.14" + */ +static ssize_t afs_proc_rootcell_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos) +{ + char *kbuf, *s; + int ret; + + /* start by dragging the command into memory */ + if (size <= 1 || size >= PAGE_SIZE) + return -EINVAL; + + ret = -ENOMEM; + kbuf = kmalloc(size + 1, GFP_KERNEL); + if (!kbuf) + goto nomem; + + ret = -EFAULT; + if (copy_from_user(kbuf, buf, size) != 0) + goto infault; + kbuf[size] = 0; + + /* trim to first NL */ + s = memchr(kbuf, '\n', size); + if (s) + *s = 0; + + /* determine command to perform */ + _debug("rootcell=%s", kbuf); + + ret = afs_cell_init(kbuf); + if (ret >= 0) + ret = size; /* consume everything, always */ + +infault: + kfree(kbuf); +nomem: + _leave(" = %d", ret); + return ret; +} + +/* + * initialise /proc/fs/afs// + */ +int afs_proc_cell_setup(struct afs_cell *cell) +{ + struct proc_dir_entry *p; + + _enter("%p{%s}", cell, cell->name); + + cell->proc_dir = proc_mkdir(cell->name, proc_afs); + if (!cell->proc_dir) + goto error_dir; + + p = proc_create_data("servers", 0, cell->proc_dir, + &afs_proc_cell_servers_fops, cell); + if (!p) + goto error_servers; + + p = proc_create_data("vlservers", 0, cell->proc_dir, + &afs_proc_cell_vlservers_fops, cell); + if (!p) + goto error_vlservers; + + p = proc_create_data("volumes", 0, cell->proc_dir, + &afs_proc_cell_volumes_fops, cell); + if (!p) + goto error_volumes; + + _leave(" = 0"); + return 0; + +error_volumes: + remove_proc_entry("vlservers", cell->proc_dir); +error_vlservers: + remove_proc_entry("servers", cell->proc_dir); +error_servers: + remove_proc_entry(cell->name, proc_afs); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * remove /proc/fs/afs// + */ +void afs_proc_cell_remove(struct afs_cell *cell) +{ + _enter(""); + + remove_proc_entry("volumes", cell->proc_dir); + remove_proc_entry("vlservers", cell->proc_dir); + remove_proc_entry("servers", cell->proc_dir); + remove_proc_entry(cell->name, proc_afs); + + _leave(""); +} + +/* + 
* open "/proc/fs/afs//volumes" which provides a summary of extant cells + */ +static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) +{ + struct afs_cell *cell; + struct seq_file *m; + int ret; + + cell = PDE(inode)->data; + if (!cell) + return -ENOENT; + + ret = seq_open(file, &afs_proc_cell_volumes_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = cell; + + return 0; +} + +/* + * close the file and release the ref to the cell + */ +static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) +{ + struct afs_cell *cell = m->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + + /* lock the list against modification */ + down_read(&cell->vl_sem); + return seq_list_start_head(&cell->vl_list, *_pos); +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, + loff_t *_pos) +{ + struct afs_cell *cell = p->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + return seq_list_next(v, &cell->vl_list, _pos); +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) +{ + struct afs_cell *cell = p->private; + + up_read(&cell->vl_sem); +} + +static const char afs_vlocation_states[][4] = { + [AFS_VL_NEW] = "New", + [AFS_VL_CREATING] = "Crt", + [AFS_VL_VALID] = "Val", + [AFS_VL_NO_VOLUME] = "NoV", + [AFS_VL_UPDATING] = "Upd", + [AFS_VL_VOLUME_DELETED] = "Del", + [AFS_VL_UNCERTAIN] = "Unc", +}; + +/* + * display a header line followed by a load of volume lines + */ +static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) +{ + struct afs_cell *cell = m->private; + struct afs_vlocation *vlocation = + list_entry(v, struct afs_vlocation, link); + + /* display header on line 1 */ + if (v == &cell->vl_list) { + seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + seq_printf(m, "%3d %s %08x %08x %08x %s\n", + atomic_read(&vlocation->usage), + afs_vlocation_states[vlocation->state], + vlocation->vldb.vid[0], + vlocation->vldb.vid[1], + vlocation->vldb.vid[2], + vlocation->vldb.name); + + return 0; +} + +/* + * open "/proc/fs/afs//vlservers" which provides a list of volume + * location server + */ +static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) +{ + struct afs_cell *cell; + struct seq_file *m; + int ret; + + cell = PDE(inode)->data; + if (!cell) + return -ENOENT; + + ret = seq_open(file, &afs_proc_cell_vlservers_ops); + if (ret<0) + return ret; + + m = file->private_data; + m->private = cell; + + return 0; +} + +/* + * close the file and release the ref to the cell + */ +static int afs_proc_cell_vlservers_release(struct inode *inode, + struct file *file) +{ + return seq_release(inode, file); +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) +{ + struct afs_cell *cell = m->private; + loff_t pos = *_pos; + + _enter("cell=%p pos=%Ld", cell, *_pos); + + /* lock the list against modification */ + down_read(&cell->vl_sem); + + /* allow for the header line */ + if (!pos) + return (void *) 1; + pos--; + + if (pos >= cell->vl_naddrs) + return NULL; + + 
return &cell->vl_addrs[pos]; +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, + loff_t *_pos) +{ + struct afs_cell *cell = p->private; + loff_t pos; + + _enter("cell=%p{nad=%u} pos=%Ld", cell, cell->vl_naddrs, *_pos); + + pos = *_pos; + (*_pos)++; + if (pos >= cell->vl_naddrs) + return NULL; + + return &cell->vl_addrs[pos]; +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) +{ + struct afs_cell *cell = p->private; + + up_read(&cell->vl_sem); +} + +/* + * display a header line followed by a load of volume lines + */ +static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) +{ + struct in_addr *addr = v; + + /* display header on line 1 */ + if (v == (struct in_addr *) 1) { + seq_puts(m, "ADDRESS\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + seq_printf(m, "%pI4\n", &addr->s_addr); + return 0; +} + +/* + * open "/proc/fs/afs//servers" which provides a summary of active + * servers + */ +static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) +{ + struct afs_cell *cell; + struct seq_file *m; + int ret; + + cell = PDE(inode)->data; + if (!cell) + return -ENOENT; + + ret = seq_open(file, &afs_proc_cell_servers_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = cell; + return 0; +} + +/* + * close the file and release the ref to the cell + */ +static int afs_proc_cell_servers_release(struct inode *inode, + struct file *file) +{ + return seq_release(inode, file); +} + +/* + * set up the iterator to start reading from the cells list and return the + * first item + */ +static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) + __acquires(m->private->servers_lock) +{ + struct afs_cell *cell = m->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + + /* lock the list against modification */ + read_lock(&cell->servers_lock); + return seq_list_start_head(&cell->servers, *_pos); +} + +/* + * move to next cell in cells list + */ +static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, + loff_t *_pos) +{ + struct afs_cell *cell = p->private; + + _enter("cell=%p pos=%Ld", cell, *_pos); + return seq_list_next(v, &cell->servers, _pos); +} + +/* + * clean up after reading from the cells list + */ +static void afs_proc_cell_servers_stop(struct seq_file *p, void *v) + __releases(p->private->servers_lock) +{ + struct afs_cell *cell = p->private; + + read_unlock(&cell->servers_lock); +} + +/* + * display a header line followed by a load of volume lines + */ +static int afs_proc_cell_servers_show(struct seq_file *m, void *v) +{ + struct afs_cell *cell = m->private; + struct afs_server *server = list_entry(v, struct afs_server, link); + char ipaddr[20]; + + /* display header on line 1 */ + if (v == &cell->servers) { + seq_puts(m, "USE ADDR STATE\n"); + return 0; + } + + /* display one cell per line on subsequent lines */ + sprintf(ipaddr, "%pI4", &server->addr); + seq_printf(m, "%3d %-15.15s %5d\n", + atomic_read(&server->usage), ipaddr, server->fs_state); + + return 0; +} diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c new file mode 100644 index 00000000..8ad8c2a0 --- /dev/null +++ b/fs/afs/rxrpc.c @@ -0,0 +1,859 @@ +/* Maintain an RxRPC server socket to do AFS communications through + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "internal.h" +#include "afs_cm.h" + +static struct socket *afs_socket; /* my RxRPC socket */ +static struct workqueue_struct *afs_async_calls; +static atomic_t afs_outstanding_calls; +static atomic_t afs_outstanding_skbs; + +static void afs_wake_up_call_waiter(struct afs_call *); +static int afs_wait_for_call_to_complete(struct afs_call *); +static void afs_wake_up_async_call(struct afs_call *); +static int afs_dont_wait_for_call_to_complete(struct afs_call *); +static void afs_process_async_call(struct work_struct *); +static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); +static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); + +/* synchronous call management */ +const struct afs_wait_mode afs_sync_call = { + .rx_wakeup = afs_wake_up_call_waiter, + .wait = afs_wait_for_call_to_complete, +}; + +/* asynchronous call management */ +const struct afs_wait_mode afs_async_call = { + .rx_wakeup = afs_wake_up_async_call, + .wait = afs_dont_wait_for_call_to_complete, +}; + +/* asynchronous incoming call management */ +static const struct afs_wait_mode afs_async_incoming_call = { + .rx_wakeup = afs_wake_up_async_call, +}; + +/* asynchronous incoming call initial processing */ +static const struct afs_call_type afs_RXCMxxxx = { + .name = "CB.xxxx", + .deliver = afs_deliver_cm_op_id, + .abort_to_error = afs_abort_to_error, +}; + +static void afs_collect_incoming_call(struct work_struct *); + +static struct sk_buff_head afs_incoming_calls; +static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); + +/* + * open an RxRPC socket and bind it to be a server for callback notifications + * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT + */ +int afs_open_socket(void) +{ + struct sockaddr_rxrpc srx; + struct socket *socket; + int ret; + + _enter(""); + + skb_queue_head_init(&afs_incoming_calls); + + afs_async_calls = create_singlethread_workqueue("kafsd"); + if (!afs_async_calls) { + _leave(" = -ENOMEM [wq]"); + return -ENOMEM; + } + + ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); + if (ret < 0) { + destroy_workqueue(afs_async_calls); + _leave(" = %d [socket]", ret); + return ret; + } + + socket->sk->sk_allocation = GFP_NOFS; + + /* bind the callback manager's address to make this a server socket */ + srx.srx_family = AF_RXRPC; + srx.srx_service = CM_SERVICE; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = htons(AFS_CM_PORT); + memset(&srx.transport.sin.sin_addr, 0, + sizeof(srx.transport.sin.sin_addr)); + + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + if (ret < 0) { + sock_release(socket); + destroy_workqueue(afs_async_calls); + _leave(" = %d [bind]", ret); + return ret; + } + + rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); + + afs_socket = socket; + _leave(" = 0"); + return 0; +} + +/* + * close the RxRPC socket AFS was using + */ +void afs_close_socket(void) +{ + _enter(""); + + sock_release(afs_socket); + + _debug("dework"); + destroy_workqueue(afs_async_calls); + + 
ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); + ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0); + _leave(""); +} + +/* + * note that the data in a socket buffer is now delivered and that the buffer + * should be freed + */ +static void afs_data_delivered(struct sk_buff *skb) +{ + if (!skb) { + _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); + dump_stack(); + } else { + _debug("DLVR %p{%u} [%d]", + skb, skb->mark, atomic_read(&afs_outstanding_skbs)); + if (atomic_dec_return(&afs_outstanding_skbs) == -1) + BUG(); + rxrpc_kernel_data_delivered(skb); + } +} + +/* + * free a socket buffer + */ +static void afs_free_skb(struct sk_buff *skb) +{ + if (!skb) { + _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs)); + dump_stack(); + } else { + _debug("FREE %p{%u} [%d]", + skb, skb->mark, atomic_read(&afs_outstanding_skbs)); + if (atomic_dec_return(&afs_outstanding_skbs) == -1) + BUG(); + rxrpc_kernel_free_skb(skb); + } +} + +/* + * free a call + */ +static void afs_free_call(struct afs_call *call) +{ + _debug("DONE %p{%s} [%d]", + call, call->type->name, atomic_read(&afs_outstanding_calls)); + if (atomic_dec_return(&afs_outstanding_calls) == -1) + BUG(); + + ASSERTCMP(call->rxcall, ==, NULL); + ASSERT(!work_pending(&call->async_work)); + ASSERT(skb_queue_empty(&call->rx_queue)); + ASSERT(call->type->name != NULL); + + kfree(call->request); + kfree(call); +} + +/* + * allocate a call with flat request and reply buffers + */ +struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, + size_t request_size, size_t reply_size) +{ + struct afs_call *call; + + call = kzalloc(sizeof(*call), GFP_NOFS); + if (!call) + goto nomem_call; + + _debug("CALL %p{%s} [%d]", + call, type->name, atomic_read(&afs_outstanding_calls)); + atomic_inc(&afs_outstanding_calls); + + call->type = type; + call->request_size = request_size; + call->reply_max = reply_size; + + if (request_size) { + call->request = kmalloc(request_size, GFP_NOFS); + if (!call->request) + goto nomem_free; + } + + if (reply_size) { + call->buffer = kmalloc(reply_size, GFP_NOFS); + if (!call->buffer) + goto nomem_free; + } + + init_waitqueue_head(&call->waitq); + skb_queue_head_init(&call->rx_queue); + return call; + +nomem_free: + afs_free_call(call); +nomem_call: + return NULL; +} + +/* + * clean up a call with flat buffer + */ +void afs_flat_call_destructor(struct afs_call *call) +{ + _enter(""); + + kfree(call->request); + call->request = NULL; + kfree(call->buffer); + call->buffer = NULL; +} + +/* + * attach the data from a bunch of pages on an inode to a call + */ +static int afs_send_pages(struct afs_call *call, struct msghdr *msg, + struct kvec *iov) +{ + struct page *pages[8]; + unsigned count, n, loop, offset, to; + pgoff_t first = call->first, last = call->last; + int ret; + + _enter(""); + + offset = call->first_offset; + call->first_offset = 0; + + do { + _debug("attach %lx-%lx", first, last); + + count = last - first + 1; + if (count > ARRAY_SIZE(pages)) + count = ARRAY_SIZE(pages); + n = find_get_pages_contig(call->mapping, first, count, pages); + ASSERTCMP(n, ==, count); + + loop = 0; + do { + msg->msg_flags = 0; + to = PAGE_SIZE; + if (first + loop >= last) + to = call->last_to; + else + msg->msg_flags = MSG_MORE; + iov->iov_base = kmap(pages[loop]) + offset; + iov->iov_len = to - offset; + offset = 0; + + _debug("- range %u-%u%s", + offset, to, msg->msg_flags ? 
" [more]" : ""); + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = 1; + + /* have to change the state *before* sending the last + * packet as RxRPC might give us the reply before it + * returns from sending the request */ + if (first + loop >= last) + call->state = AFS_CALL_AWAIT_REPLY; + ret = rxrpc_kernel_send_data(call->rxcall, msg, + to - offset); + kunmap(pages[loop]); + if (ret < 0) + break; + } while (++loop < count); + first += count; + + for (loop = 0; loop < count; loop++) + put_page(pages[loop]); + if (ret < 0) + break; + } while (first <= last); + + _leave(" = %d", ret); + return ret; +} + +/* + * initiate a call + */ +int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, + const struct afs_wait_mode *wait_mode) +{ + struct sockaddr_rxrpc srx; + struct rxrpc_call *rxcall; + struct msghdr msg; + struct kvec iov[1]; + int ret; + struct sk_buff *skb; + + _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); + + ASSERT(call->type != NULL); + ASSERT(call->type->name != NULL); + + _debug("____MAKE %p{%s,%x} [%d]____", + call, call->type->name, key_serial(call->key), + atomic_read(&afs_outstanding_calls)); + + call->wait_mode = wait_mode; + INIT_WORK(&call->async_work, afs_process_async_call); + + memset(&srx, 0, sizeof(srx)); + srx.srx_family = AF_RXRPC; + srx.srx_service = call->service_id; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = call->port; + memcpy(&srx.transport.sin.sin_addr, addr, 4); + + /* create a call */ + rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, + (unsigned long) call, gfp); + call->key = NULL; + if (IS_ERR(rxcall)) { + ret = PTR_ERR(rxcall); + goto error_kill_call; + } + + call->rxcall = rxcall; + + /* send the request */ + iov[0].iov_base = call->request; + iov[0].iov_len = call->request_size; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = (struct iovec *) iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = (call->send_pages ? 
MSG_MORE : 0); + + /* have to change the state *before* sending the last packet as RxRPC + * might give us the reply before it returns from sending the + * request */ + if (!call->send_pages) + call->state = AFS_CALL_AWAIT_REPLY; + ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); + if (ret < 0) + goto error_do_abort; + + if (call->send_pages) { + ret = afs_send_pages(call, &msg, iov); + if (ret < 0) + goto error_do_abort; + } + + /* at this point, an async call may no longer exist as it may have + * already completed */ + return wait_mode->wait(call); + +error_do_abort: + rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); + rxrpc_kernel_end_call(rxcall); + call->rxcall = NULL; +error_kill_call: + call->type->destructor(call); + afs_free_call(call); + _leave(" = %d", ret); + return ret; +} + +/* + * handles intercepted messages that were arriving in the socket's Rx queue + * - called with the socket receive queue lock held to ensure message ordering + * - called with softirqs disabled + */ +static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, + struct sk_buff *skb) +{ + struct afs_call *call = (struct afs_call *) user_call_ID; + + _enter("%p,,%u", call, skb->mark); + + _debug("ICPT %p{%u} [%d]", + skb, skb->mark, atomic_read(&afs_outstanding_skbs)); + + ASSERTCMP(sk, ==, afs_socket->sk); + atomic_inc(&afs_outstanding_skbs); + + if (!call) { + /* its an incoming call for our callback service */ + skb_queue_tail(&afs_incoming_calls, skb); + queue_work(afs_wq, &afs_collect_incoming_call_work); + } else { + /* route the messages directly to the appropriate call */ + skb_queue_tail(&call->rx_queue, skb); + call->wait_mode->rx_wakeup(call); + } + + _leave(""); +} + +/* + * deliver messages to a call + */ +static void afs_deliver_to_call(struct afs_call *call) +{ + struct sk_buff *skb; + bool last; + u32 abort_code; + int ret; + + _enter(""); + + while ((call->state == AFS_CALL_AWAIT_REPLY || + call->state == AFS_CALL_AWAIT_OP_ID || + call->state == AFS_CALL_AWAIT_REQUEST || + call->state == AFS_CALL_AWAIT_ACK) && + (skb = skb_dequeue(&call->rx_queue))) { + switch (skb->mark) { + case RXRPC_SKB_MARK_DATA: + _debug("Rcv DATA"); + last = rxrpc_kernel_is_data_last(skb); + ret = call->type->deliver(call, skb, last); + switch (ret) { + case 0: + if (last && + call->state == AFS_CALL_AWAIT_REPLY) + call->state = AFS_CALL_COMPLETE; + break; + case -ENOTCONN: + abort_code = RX_CALL_DEAD; + goto do_abort; + case -ENOTSUPP: + abort_code = RX_INVALID_OPERATION; + goto do_abort; + default: + abort_code = RXGEN_CC_UNMARSHAL; + if (call->state != AFS_CALL_AWAIT_REPLY) + abort_code = RXGEN_SS_UNMARSHAL; + do_abort: + rxrpc_kernel_abort_call(call->rxcall, + abort_code); + call->error = ret; + call->state = AFS_CALL_ERROR; + break; + } + afs_data_delivered(skb); + skb = NULL; + continue; + case RXRPC_SKB_MARK_FINAL_ACK: + _debug("Rcv ACK"); + call->state = AFS_CALL_COMPLETE; + break; + case RXRPC_SKB_MARK_BUSY: + _debug("Rcv BUSY"); + call->error = -EBUSY; + call->state = AFS_CALL_BUSY; + break; + case RXRPC_SKB_MARK_REMOTE_ABORT: + abort_code = rxrpc_kernel_get_abort_code(skb); + call->error = call->type->abort_to_error(abort_code); + call->state = AFS_CALL_ABORTED; + _debug("Rcv ABORT %u -> %d", abort_code, call->error); + break; + case RXRPC_SKB_MARK_NET_ERROR: + call->error = -rxrpc_kernel_get_error_number(skb); + call->state = AFS_CALL_ERROR; + _debug("Rcv NET ERROR %d", call->error); + break; + case 
RXRPC_SKB_MARK_LOCAL_ERROR: + call->error = -rxrpc_kernel_get_error_number(skb); + call->state = AFS_CALL_ERROR; + _debug("Rcv LOCAL ERROR %d", call->error); + break; + default: + BUG(); + break; + } + + afs_free_skb(skb); + } + + /* make sure the queue is empty if the call is done with (we might have + * aborted the call early because of an unmarshalling error) */ + if (call->state >= AFS_CALL_COMPLETE) { + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); + if (call->incoming) { + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + call->type->destructor(call); + afs_free_call(call); + } + } + + _leave(""); +} + +/* + * wait synchronously for a call to complete + */ +static int afs_wait_for_call_to_complete(struct afs_call *call) +{ + struct sk_buff *skb; + int ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter(""); + + add_wait_queue(&call->waitq, &myself); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + /* deliver any messages that are in the queue */ + if (!skb_queue_empty(&call->rx_queue)) { + __set_current_state(TASK_RUNNING); + afs_deliver_to_call(call); + continue; + } + + ret = call->error; + if (call->state >= AFS_CALL_COMPLETE) + break; + ret = -EINTR; + if (signal_pending(current)) + break; + schedule(); + } + + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); + + /* kill the call */ + if (call->state < AFS_CALL_COMPLETE) { + _debug("call incomplete"); + rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); + } + + _debug("call complete"); + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + call->type->destructor(call); + afs_free_call(call); + _leave(" = %d", ret); + return ret; +} + +/* + * wake up a waiting call + */ +static void afs_wake_up_call_waiter(struct afs_call *call) +{ + wake_up(&call->waitq); +} + +/* + * wake up an asynchronous call + */ +static void afs_wake_up_async_call(struct afs_call *call) +{ + _enter(""); + queue_work(afs_async_calls, &call->async_work); +} + +/* + * put a call into asynchronous mode + * - mustn't touch the call descriptor as the call my have completed by the + * time we get here + */ +static int afs_dont_wait_for_call_to_complete(struct afs_call *call) +{ + _enter(""); + return -EINPROGRESS; +} + +/* + * delete an asynchronous call + */ +static void afs_delete_async_call(struct work_struct *work) +{ + struct afs_call *call = + container_of(work, struct afs_call, async_work); + + _enter(""); + + afs_free_call(call); + + _leave(""); +} + +/* + * perform processing on an asynchronous call + * - on a multiple-thread workqueue this work item may try to run on several + * CPUs at the same time + */ +static void afs_process_async_call(struct work_struct *work) +{ + struct afs_call *call = + container_of(work, struct afs_call, async_work); + + _enter(""); + + if (!skb_queue_empty(&call->rx_queue)) + afs_deliver_to_call(call); + + if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) { + if (call->wait_mode->async_complete) + call->wait_mode->async_complete(call->reply, + call->error); + call->reply = NULL; + + /* kill the call */ + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + if (call->type->destructor) + call->type->destructor(call); + + /* we can't just delete the call because the work item may be + * queued */ + PREPARE_WORK(&call->async_work, afs_delete_async_call); + queue_work(afs_async_calls, &call->async_work); + } + + _leave(""); +} + +/* + * empty a socket buffer into 
a flat reply buffer + */ +void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) +{ + size_t len = skb->len; + + if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0) + BUG(); + call->reply_size += len; +} + +/* + * accept the backlog of incoming calls + */ +static void afs_collect_incoming_call(struct work_struct *work) +{ + struct rxrpc_call *rxcall; + struct afs_call *call = NULL; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&afs_incoming_calls))) { + _debug("new call"); + + /* don't need the notification */ + afs_free_skb(skb); + + if (!call) { + call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); + if (!call) { + rxrpc_kernel_reject_call(afs_socket); + return; + } + + INIT_WORK(&call->async_work, afs_process_async_call); + call->wait_mode = &afs_async_incoming_call; + call->type = &afs_RXCMxxxx; + init_waitqueue_head(&call->waitq); + skb_queue_head_init(&call->rx_queue); + call->state = AFS_CALL_AWAIT_OP_ID; + + _debug("CALL %p{%s} [%d]", + call, call->type->name, + atomic_read(&afs_outstanding_calls)); + atomic_inc(&afs_outstanding_calls); + } + + rxcall = rxrpc_kernel_accept_call(afs_socket, + (unsigned long) call); + if (!IS_ERR(rxcall)) { + call->rxcall = rxcall; + call = NULL; + } + } + + if (call) + afs_free_call(call); +} + +/* + * grab the operation ID from an incoming cache manager call + */ +static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + size_t len = skb->len; + void *oibuf = (void *) &call->operation_ID; + + _enter("{%u},{%zu},%d", call->offset, len, last); + + ASSERTCMP(call->offset, <, 4); + + /* the operation ID forms the first four bytes of the request data */ + len = min_t(size_t, len, 4 - call->offset); + if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0) + BUG(); + if (!pskb_pull(skb, len)) + BUG(); + call->offset += len; + + if (call->offset < 4) { + if (last) { + _leave(" = -EBADMSG [op ID short]"); + return -EBADMSG; + } + _leave(" = 0 [incomplete]"); + return 0; + } + + call->state = AFS_CALL_AWAIT_REQUEST; + + /* ask the cache manager to route the call (it'll change the call type + * if successful) */ + if (!afs_cm_incoming_call(call)) + return -ENOTSUPP; + + /* pass responsibility for the remainer of this message off to the + * cache manager op */ + return call->type->deliver(call, skb, last); +} + +/* + * send an empty reply + */ +void afs_send_empty_reply(struct afs_call *call) +{ + struct msghdr msg; + struct iovec iov[1]; + + _enter(""); + + iov[0].iov_base = NULL; + iov[0].iov_len = 0; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + call->state = AFS_CALL_AWAIT_ACK; + switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) { + case 0: + _leave(" [replied]"); + return; + + case -ENOMEM: + _debug("oom"); + rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + default: + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + call->type->destructor(call); + afs_free_call(call); + _leave(" [error]"); + return; + } +} + +/* + * send a simple reply + */ +void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) +{ + struct msghdr msg; + struct iovec iov[1]; + int n; + + _enter(""); + + iov[0].iov_base = (void *) buf; + iov[0].iov_len = len; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + call->state = 
AFS_CALL_AWAIT_ACK; + n = rxrpc_kernel_send_data(call->rxcall, &msg, len); + if (n >= 0) { + _leave(" [replied]"); + return; + } + if (n == -ENOMEM) { + _debug("oom"); + rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + } + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + call->type->destructor(call); + afs_free_call(call); + _leave(" [error]"); +} + +/* + * extract a piece of data from the received data socket buffers + */ +int afs_extract_data(struct afs_call *call, struct sk_buff *skb, + bool last, void *buf, size_t count) +{ + size_t len = skb->len; + + _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); + + ASSERTCMP(call->offset, <, count); + + len = min_t(size_t, len, count - call->offset); + if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 || + !pskb_pull(skb, len)) + BUG(); + call->offset += len; + + if (call->offset < count) { + if (last) { + _leave(" = -EBADMSG [%d < %zu]", call->offset, count); + return -EBADMSG; + } + _leave(" = -EAGAIN"); + return -EAGAIN; + } + return 0; +} diff --git a/fs/afs/security.c b/fs/afs/security.c new file mode 100644 index 00000000..f44b9d35 --- /dev/null +++ b/fs/afs/security.c @@ -0,0 +1,363 @@ +/* AFS security handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * get a key + */ +struct key *afs_request_key(struct afs_cell *cell) +{ + struct key *key; + + _enter("{%x}", key_serial(cell->anonymous_key)); + + _debug("key %s", cell->anonymous_key->description); + key = request_key(&key_type_rxrpc, cell->anonymous_key->description, + NULL); + if (IS_ERR(key)) { + if (PTR_ERR(key) != -ENOKEY) { + _leave(" = %ld", PTR_ERR(key)); + return key; + } + + /* act as anonymous user */ + _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); + return key_get(cell->anonymous_key); + } else { + /* act as authorised user */ + _leave(" = {%x} [auth]", key_serial(key)); + return key; + } +} + +/* + * dispose of a permits list + */ +void afs_zap_permits(struct rcu_head *rcu) +{ + struct afs_permits *permits = + container_of(rcu, struct afs_permits, rcu); + int loop; + + _enter("{%d}", permits->count); + + for (loop = permits->count - 1; loop >= 0; loop--) + key_put(permits->permits[loop].key); + kfree(permits); +} + +/* + * dispose of a permits list in which all the key pointers have been copied + */ +static void afs_dispose_of_permits(struct rcu_head *rcu) +{ + struct afs_permits *permits = + container_of(rcu, struct afs_permits, rcu); + + _enter("{%d}", permits->count); + + kfree(permits); +} + +/* + * get the authorising vnode - this is the specified inode itself if it's a + * directory or it's the parent directory if the specified inode is a file or + * symlink + * - the caller must release the ref on the inode + */ +static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode, + struct key *key) +{ + struct afs_vnode *auth_vnode; + struct inode *auth_inode; + + _enter(""); + + if (S_ISDIR(vnode->vfs_inode.i_mode)) { + auth_inode = igrab(&vnode->vfs_inode); + ASSERT(auth_inode != NULL); + } else { + auth_inode = afs_iget(vnode->vfs_inode.i_sb, key, + &vnode->status.parent, NULL, NULL); + if 
(IS_ERR(auth_inode)) + return ERR_CAST(auth_inode); + } + + auth_vnode = AFS_FS_I(auth_inode); + _leave(" = {%x}", auth_vnode->fid.vnode); + return auth_vnode; +} + +/* + * clear the permit cache on a directory vnode + */ +void afs_clear_permits(struct afs_vnode *vnode) +{ + struct afs_permits *permits; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + mutex_lock(&vnode->permits_lock); + permits = vnode->permits; + rcu_assign_pointer(vnode->permits, NULL); + mutex_unlock(&vnode->permits_lock); + + if (permits) + call_rcu(&permits->rcu, afs_zap_permits); + _leave(""); +} + +/* + * add the result obtained for a vnode to its or its parent directory's cache + * for the key used to access it + */ +void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order) +{ + struct afs_permits *permits, *xpermits; + struct afs_permit *permit; + struct afs_vnode *auth_vnode; + int count, loop; + + _enter("{%x:%u},%x,%lx", + vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order); + + auth_vnode = afs_get_auth_inode(vnode, key); + if (IS_ERR(auth_vnode)) { + _leave(" [get error %ld]", PTR_ERR(auth_vnode)); + return; + } + + mutex_lock(&auth_vnode->permits_lock); + + /* guard against a rename being detected whilst we waited for the + * lock */ + if (memcmp(&auth_vnode->fid, &vnode->status.parent, + sizeof(struct afs_fid)) != 0) { + _debug("renamed"); + goto out_unlock; + } + + /* have to be careful as the directory's callback may be broken between + * us receiving the status we're trying to cache and us getting the + * lock to update the cache for the status */ + if (auth_vnode->acl_order - acl_order > 0) { + _debug("ACL changed?"); + goto out_unlock; + } + + /* always update the anonymous mask */ + _debug("anon access %x", vnode->status.anon_access); + auth_vnode->status.anon_access = vnode->status.anon_access; + if (key == vnode->volume->cell->anonymous_key) + goto out_unlock; + + xpermits = auth_vnode->permits; + count = 0; + if (xpermits) { + /* see if the permit is already in the list + * - if it is then we just amend the list + */ + count = xpermits->count; + permit = xpermits->permits; + for (loop = count; loop > 0; loop--) { + if (permit->key == key) { + permit->access_mask = + vnode->status.caller_access; + goto out_unlock; + } + permit++; + } + } + + permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1), + GFP_NOFS); + if (!permits) + goto out_unlock; + + if (xpermits) + memcpy(permits->permits, xpermits->permits, + count * sizeof(struct afs_permit)); + + _debug("key %x access %x", + key_serial(key), vnode->status.caller_access); + permits->permits[count].access_mask = vnode->status.caller_access; + permits->permits[count].key = key_get(key); + permits->count = count + 1; + + rcu_assign_pointer(auth_vnode->permits, permits); + if (xpermits) + call_rcu(&xpermits->rcu, afs_dispose_of_permits); + +out_unlock: + mutex_unlock(&auth_vnode->permits_lock); + iput(&auth_vnode->vfs_inode); + _leave(""); +} + +/* + * check with the fileserver to see if the directory or parent directory is + * permitted to be accessed with this authorisation, and if so, what access it + * is granted + */ +static int afs_check_permit(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) +{ + struct afs_permits *permits; + struct afs_permit *permit; + struct afs_vnode *auth_vnode; + bool valid; + int loop, ret; + + _enter("{%x:%u},%x", + vnode->fid.vid, vnode->fid.vnode, key_serial(key)); + + auth_vnode = afs_get_auth_inode(vnode, key); + if (IS_ERR(auth_vnode)) { + 
*_access = 0; + _leave(" = %ld", PTR_ERR(auth_vnode)); + return PTR_ERR(auth_vnode); + } + + ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode)); + + /* check the permits to see if we've got one yet */ + if (key == auth_vnode->volume->cell->anonymous_key) { + _debug("anon"); + *_access = auth_vnode->status.anon_access; + valid = true; + } else { + valid = false; + rcu_read_lock(); + permits = rcu_dereference(auth_vnode->permits); + if (permits) { + permit = permits->permits; + for (loop = permits->count; loop > 0; loop--) { + if (permit->key == key) { + _debug("found in cache"); + *_access = permit->access_mask; + valid = true; + break; + } + permit++; + } + } + rcu_read_unlock(); + } + + if (!valid) { + /* check the status on the file we're actually interested in + * (the post-processing will cache the result on auth_vnode) */ + _debug("no valid permit"); + + set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + ret = afs_vnode_fetch_status(vnode, auth_vnode, key); + if (ret < 0) { + iput(&auth_vnode->vfs_inode); + *_access = 0; + _leave(" = %d", ret); + return ret; + } + *_access = vnode->status.caller_access; + } + + iput(&auth_vnode->vfs_inode); + _leave(" = 0 [access %x]", *_access); + return 0; +} + +/* + * check the permissions on an AFS file + * - AFS ACLs are attached to directories only, and a file is controlled by its + * parent directory's ACL + */ +int afs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + afs_access_t uninitialized_var(access); + struct key *key; + int ret; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + _enter("{{%x:%u},%lx},%x,", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); + + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return PTR_ERR(key); + } + + /* if the promise has expired, we need to check the server again */ + if (!vnode->cb_promised) { + _debug("not promised"); + ret = afs_vnode_fetch_status(vnode, NULL, key); + if (ret < 0) + goto error; + _debug("new promise [fl=%lx]", vnode->flags); + } + + /* check the permits to see if we've got one yet */ + ret = afs_check_permit(vnode, key, &access); + if (ret < 0) + goto error; + + /* interpret the access mask */ + _debug("REQ %x ACC %x on %s", + mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file"); + + if (S_ISDIR(inode->i_mode)) { + if (mask & MAY_EXEC) { + if (!(access & AFS_ACE_LOOKUP)) + goto permission_denied; + } else if (mask & MAY_READ) { + if (!(access & AFS_ACE_READ)) + goto permission_denied; + } else if (mask & MAY_WRITE) { + if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ + AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */ + AFS_ACE_WRITE))) /* chmod */ + goto permission_denied; + } else { + BUG(); + } + } else { + if (!(access & AFS_ACE_LOOKUP)) + goto permission_denied; + if (mask & (MAY_EXEC | MAY_READ)) { + if (!(access & AFS_ACE_READ)) + goto permission_denied; + } else if (mask & MAY_WRITE) { + if (!(access & AFS_ACE_WRITE)) + goto permission_denied; + } + } + + key_put(key); + ret = generic_permission(inode, mask, flags, NULL); + _leave(" = %d", ret); + return ret; + +permission_denied: + ret = -EACCES; +error: + key_put(key); + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/server.c b/fs/afs/server.c new file mode 100644 index 00000000..d59b7516 --- /dev/null +++ b/fs/afs/server.c @@ -0,0 +1,328 @@ +/* AFS server record management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "internal.h" + +static unsigned afs_server_timeout = 10; /* server timeout in seconds */ + +static void afs_reap_server(struct work_struct *); + +/* tree of all the servers, indexed by IP address */ +static struct rb_root afs_servers = RB_ROOT; +static DEFINE_RWLOCK(afs_servers_lock); + +/* LRU list of all the servers not currently in use */ +static LIST_HEAD(afs_server_graveyard); +static DEFINE_SPINLOCK(afs_server_graveyard_lock); +static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server); + +/* + * install a server record in the master tree + */ +static int afs_install_server(struct afs_server *server) +{ + struct afs_server *xserver; + struct rb_node **pp, *p; + int ret; + + _enter("%p", server); + + write_lock(&afs_servers_lock); + + ret = -EEXIST; + pp = &afs_servers.rb_node; + p = NULL; + while (*pp) { + p = *pp; + _debug("- consider %p", p); + xserver = rb_entry(p, struct afs_server, master_rb); + if (server->addr.s_addr < xserver->addr.s_addr) + pp = &(*pp)->rb_left; + else if (server->addr.s_addr > xserver->addr.s_addr) + pp = &(*pp)->rb_right; + else + goto error; + } + + rb_link_node(&server->master_rb, p, pp); + rb_insert_color(&server->master_rb, &afs_servers); + ret = 0; + +error: + write_unlock(&afs_servers_lock); + return ret; +} + +/* + * allocate a new server record + */ +static struct afs_server *afs_alloc_server(struct afs_cell *cell, + const struct in_addr *addr) +{ + struct afs_server *server; + + _enter(""); + + server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); + if (server) { + atomic_set(&server->usage, 1); + server->cell = cell; + + INIT_LIST_HEAD(&server->link); + INIT_LIST_HEAD(&server->grave); + init_rwsem(&server->sem); + spin_lock_init(&server->fs_lock); + server->fs_vnodes = RB_ROOT; + server->cb_promises = RB_ROOT; + spin_lock_init(&server->cb_lock); + init_waitqueue_head(&server->cb_break_waitq); + INIT_DELAYED_WORK(&server->cb_break_work, + afs_dispatch_give_up_callbacks); + + memcpy(&server->addr, addr, sizeof(struct in_addr)); + server->addr.s_addr = addr->s_addr; + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + } else { + _leave(" = NULL [nomem]"); + } + return server; +} + +/* + * get an FS-server record for a cell + */ +struct afs_server *afs_lookup_server(struct afs_cell *cell, + const struct in_addr *addr) +{ + struct afs_server *server, *candidate; + + _enter("%p,%pI4", cell, &addr->s_addr); + + /* quick scan of the list to see if we already have the server */ + read_lock(&cell->servers_lock); + + list_for_each_entry(server, &cell->servers, link) { + if (server->addr.s_addr == addr->s_addr) + goto found_server_quickly; + } + read_unlock(&cell->servers_lock); + + candidate = afs_alloc_server(cell, addr); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + write_lock(&cell->servers_lock); + + /* check the cell's server list again */ + list_for_each_entry(server, &cell->servers, link) { + if (server->addr.s_addr == addr->s_addr) + goto found_server; + } + + _debug("new"); + server = candidate; + if (afs_install_server(server) < 0) + goto server_in_two_cells; + + afs_get_cell(cell); + list_add_tail(&server->link, &cell->servers); + + 
write_unlock(&cell->servers_lock); + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + return server; + + /* found a matching server quickly */ +found_server_quickly: + _debug("found quickly"); + afs_get_server(server); + read_unlock(&cell->servers_lock); +no_longer_unused: + if (!list_empty(&server->grave)) { + spin_lock(&afs_server_graveyard_lock); + list_del_init(&server->grave); + spin_unlock(&afs_server_graveyard_lock); + } + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + return server; + + /* found a matching server on the second pass */ +found_server: + _debug("found"); + afs_get_server(server); + write_unlock(&cell->servers_lock); + kfree(candidate); + goto no_longer_unused; + + /* found a server that seems to be in two cells */ +server_in_two_cells: + write_unlock(&cell->servers_lock); + kfree(candidate); + printk(KERN_NOTICE "kAFS: Server %pI4 appears to be in two cells\n", + addr); + _leave(" = -EEXIST"); + return ERR_PTR(-EEXIST); +} + +/* + * look up a server by its IP address + */ +struct afs_server *afs_find_server(const struct in_addr *_addr) +{ + struct afs_server *server = NULL; + struct rb_node *p; + struct in_addr addr = *_addr; + + _enter("%pI4", &addr.s_addr); + + read_lock(&afs_servers_lock); + + p = afs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, master_rb); + + _debug("- consider %p", p); + + if (addr.s_addr < server->addr.s_addr) { + p = p->rb_left; + } else if (addr.s_addr > server->addr.s_addr) { + p = p->rb_right; + } else { + afs_get_server(server); + goto found; + } + } + + server = NULL; +found: + read_unlock(&afs_servers_lock); + ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr); + _leave(" = %p", server); + return server; +} + +/* + * destroy a server record + * - removes from the cell list + */ +void afs_put_server(struct afs_server *server) +{ + if (!server) + return; + + _enter("%p{%d}", server, atomic_read(&server->usage)); + + _debug("PUT SERVER %d", atomic_read(&server->usage)); + + ASSERTCMP(atomic_read(&server->usage), >, 0); + + if (likely(!atomic_dec_and_test(&server->usage))) { + _leave(""); + return; + } + + afs_flush_callback_breaks(server); + + spin_lock(&afs_server_graveyard_lock); + if (atomic_read(&server->usage) == 0) { + list_move_tail(&server->grave, &afs_server_graveyard); + server->time_of_death = get_seconds(); + queue_delayed_work(afs_wq, &afs_server_reaper, + afs_server_timeout * HZ); + } + spin_unlock(&afs_server_graveyard_lock); + _leave(" [dead]"); +} + +/* + * destroy a dead server + */ +static void afs_destroy_server(struct afs_server *server) +{ + _enter("%p", server); + + ASSERTIF(server->cb_break_head != server->cb_break_tail, + delayed_work_pending(&server->cb_break_work)); + + ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL); + ASSERTCMP(server->cb_promises.rb_node, ==, NULL); + ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail); + ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0); + + afs_put_cell(server->cell); + kfree(server); +} + +/* + * reap dead server records + */ +static void afs_reap_server(struct work_struct *work) +{ + LIST_HEAD(corpses); + struct afs_server *server; + unsigned long delay, expiry; + time_t now; + + now = get_seconds(); + spin_lock(&afs_server_graveyard_lock); + + while (!list_empty(&afs_server_graveyard)) { + server = list_entry(afs_server_graveyard.next, + struct afs_server, grave); + + /* the queue is ordered most dead first */ + expiry = server->time_of_death + afs_server_timeout; + if (expiry > now) { + delay = (expiry - 
now) * HZ; + if (!queue_delayed_work(afs_wq, &afs_server_reaper, + delay)) { + cancel_delayed_work(&afs_server_reaper); + queue_delayed_work(afs_wq, &afs_server_reaper, + delay); + } + break; + } + + write_lock(&server->cell->servers_lock); + write_lock(&afs_servers_lock); + if (atomic_read(&server->usage) > 0) { + list_del_init(&server->grave); + } else { + list_move_tail(&server->grave, &corpses); + list_del_init(&server->link); + rb_erase(&server->master_rb, &afs_servers); + } + write_unlock(&afs_servers_lock); + write_unlock(&server->cell->servers_lock); + } + + spin_unlock(&afs_server_graveyard_lock); + + /* now reap the corpses we've extracted */ + while (!list_empty(&corpses)) { + server = list_entry(corpses.next, struct afs_server, grave); + list_del(&server->grave); + afs_destroy_server(server); + } +} + +/* + * discard all the server records for rmmod + */ +void __exit afs_purge_servers(void) +{ + afs_server_timeout = 0; + cancel_delayed_work(&afs_server_reaper); + queue_delayed_work(afs_wq, &afs_server_reaper, 0); +} diff --git a/fs/afs/super.c b/fs/afs/super.c new file mode 100644 index 00000000..356dcf09 --- /dev/null +++ b/fs/afs/super.c @@ -0,0 +1,550 @@ +/* AFS superblock handling + * + * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * + * This software may be freely redistributed under the terms of the + * GNU General Public License. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Authors: David Howells + * David Woodhouse + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ + +static void afs_i_init_once(void *foo); +static struct dentry *afs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data); +static void afs_kill_super(struct super_block *sb); +static struct inode *afs_alloc_inode(struct super_block *sb); +static void afs_destroy_inode(struct inode *inode); +static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); + +struct file_system_type afs_fs_type = { + .owner = THIS_MODULE, + .name = "afs", + .mount = afs_mount, + .kill_sb = afs_kill_super, + .fs_flags = 0, +}; + +static const struct super_operations afs_super_ops = { + .statfs = afs_statfs, + .alloc_inode = afs_alloc_inode, + .drop_inode = afs_drop_inode, + .destroy_inode = afs_destroy_inode, + .evict_inode = afs_evict_inode, + .show_options = generic_show_options, +}; + +static struct kmem_cache *afs_inode_cachep; +static atomic_t afs_count_active_inodes; + +enum { + afs_no_opt, + afs_opt_cell, + afs_opt_rwpath, + afs_opt_vol, + afs_opt_autocell, +}; + +static const match_table_t afs_options_list = { + { afs_opt_cell, "cell=%s" }, + { afs_opt_rwpath, "rwpath" }, + { afs_opt_vol, "vol=%s" }, + { afs_opt_autocell, "autocell" }, + { afs_no_opt, NULL }, +}; + +/* + * initialise the filesystem + */ +int __init afs_fs_init(void) +{ + int ret; + + _enter(""); + + /* create ourselves an inode cache */ + atomic_set(&afs_count_active_inodes, 0); + + ret = -ENOMEM; + afs_inode_cachep = kmem_cache_create("afs_inode_cache", + sizeof(struct afs_vnode), + 0, + SLAB_HWCACHE_ALIGN, + afs_i_init_once); + if (!afs_inode_cachep) { + printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); + return ret; + } + + /* now export our filesystem to lesser mortals */ + ret = 
register_filesystem(&afs_fs_type); + if (ret < 0) { + kmem_cache_destroy(afs_inode_cachep); + _leave(" = %d", ret); + return ret; + } + + _leave(" = 0"); + return 0; +} + +/* + * clean up the filesystem + */ +void __exit afs_fs_exit(void) +{ + _enter(""); + + afs_mntpt_kill_timer(); + unregister_filesystem(&afs_fs_type); + + if (atomic_read(&afs_count_active_inodes) != 0) { + printk("kAFS: %d active inode objects still present\n", + atomic_read(&afs_count_active_inodes)); + BUG(); + } + + kmem_cache_destroy(afs_inode_cachep); + _leave(""); +} + +/* + * parse the mount options + * - this function has been shamelessly adapted from the ext3 fs which + * shamelessly adapted it from the msdos fs + */ +static int afs_parse_options(struct afs_mount_params *params, + char *options, const char **devname) +{ + struct afs_cell *cell; + substring_t args[MAX_OPT_ARGS]; + char *p; + int token; + + _enter("%s", options); + + options[PAGE_SIZE - 1] = 0; + + while ((p = strsep(&options, ","))) { + if (!*p) + continue; + + token = match_token(p, afs_options_list, args); + switch (token) { + case afs_opt_cell: + cell = afs_cell_lookup(args[0].from, + args[0].to - args[0].from, + false); + if (IS_ERR(cell)) + return PTR_ERR(cell); + afs_put_cell(params->cell); + params->cell = cell; + break; + + case afs_opt_rwpath: + params->rwpath = 1; + break; + + case afs_opt_vol: + *devname = args[0].from; + break; + + case afs_opt_autocell: + params->autocell = 1; + break; + + default: + printk(KERN_ERR "kAFS:" + " Unknown or invalid mount option: '%s'\n", p); + return -EINVAL; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * parse a device name to get cell name, volume name, volume type and R/W + * selector + * - this can be one of the following: + * "%[cell:]volume[.]" R/W volume + * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0), + * or R/W (rwpath=1) volume + * "%[cell:]volume.readonly" R/O volume + * "#[cell:]volume.readonly" R/O volume + * "%[cell:]volume.backup" Backup volume + * "#[cell:]volume.backup" Backup volume + */ +static int afs_parse_device_name(struct afs_mount_params *params, + const char *name) +{ + struct afs_cell *cell; + const char *cellname, *suffix; + int cellnamesz; + + _enter(",%s", name); + + if (!name) { + printk(KERN_ERR "kAFS: no volume name specified\n"); + return -EINVAL; + } + + if ((name[0] != '%' && name[0] != '#') || !name[1]) { + printk(KERN_ERR "kAFS: unparsable volume name\n"); + return -EINVAL; + } + + /* determine the type of volume we're looking for */ + params->type = AFSVL_ROVOL; + params->force = false; + if (params->rwpath || name[0] == '%') { + params->type = AFSVL_RWVOL; + params->force = true; + } + name++; + + /* split the cell name out if there is one */ + params->volname = strchr(name, ':'); + if (params->volname) { + cellname = name; + cellnamesz = params->volname - name; + params->volname++; + } else { + params->volname = name; + cellname = NULL; + cellnamesz = 0; + } + + /* the volume type is further affected by a possible suffix */ + suffix = strrchr(params->volname, '.'); + if (suffix) { + if (strcmp(suffix, ".readonly") == 0) { + params->type = AFSVL_ROVOL; + params->force = true; + } else if (strcmp(suffix, ".backup") == 0) { + params->type = AFSVL_BACKVOL; + params->force = true; + } else if (suffix[1] == 0) { + } else { + suffix = NULL; + } + } + + params->volnamesz = suffix ? 
+		suffix - params->volname : strlen(params->volname);
+
+	_debug("cell %*.*s [%p]",
+	       cellnamesz, cellnamesz, cellname ?: "", params->cell);
+
+	/* lookup the cell record */
+	if (cellname || !params->cell) {
+		cell = afs_cell_lookup(cellname, cellnamesz, true);
+		if (IS_ERR(cell)) {
+			printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+			       cellnamesz, cellnamesz, cellname ?: "");
+			return PTR_ERR(cell);
+		}
+		afs_put_cell(params->cell);
+		params->cell = cell;
+	}
+
+	_debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
+	       params->cell->name, params->cell,
+	       params->volnamesz, params->volnamesz, params->volname,
+	       suffix ?: "-", params->type, params->force ? " FORCE" : "");
+
+	return 0;
+}
+
+/*
+ * check a superblock to see if it's the one we're looking for
+ */
+static int afs_test_super(struct super_block *sb, void *data)
+{
+	struct afs_super_info *as1 = data;
+	struct afs_super_info *as = sb->s_fs_info;
+
+	return as->volume == as1->volume;
+}
+
+static int afs_set_super(struct super_block *sb, void *data)
+{
+	sb->s_fs_info = data;
+	return set_anon_super(sb, NULL);
+}
+
+/*
+ * fill in the superblock
+ */
+static int afs_fill_super(struct super_block *sb,
+			  struct afs_mount_params *params)
+{
+	struct afs_super_info *as = sb->s_fs_info;
+	struct afs_fid fid;
+	struct dentry *root = NULL;
+	struct inode *inode = NULL;
+	int ret;
+
+	_enter("");
+
+	/* fill in the superblock */
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = AFS_FS_MAGIC;
+	sb->s_op = &afs_super_ops;
+	sb->s_bdi = &as->volume->bdi;
+	strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
+
+	/* allocate the root inode and dentry */
+	fid.vid = as->volume->vid;
+	fid.vnode = 1;
+	fid.unique = 1;
+	inode = afs_iget(sb, params->key, &fid, NULL, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (params->autocell)
+		set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
+
+	ret = -ENOMEM;
+	root = d_alloc_root(inode);
+	if (!root)
+		goto error;
+
+	sb->s_d_op = &afs_fs_dentry_operations;
+	sb->s_root = root;
+
+	_leave(" = 0");
+	return 0;
+
+error:
+	iput(inode);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * get an AFS superblock
+ */
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *options)
+{
+	struct afs_mount_params params;
+	struct super_block *sb;
+	struct afs_volume *vol;
+	struct key *key;
+	char *new_opts = kstrdup(options, GFP_KERNEL);
+	struct afs_super_info *as;
+	int ret;
+
+	_enter(",,%s,%p", dev_name, options);
+
+	memset(&params, 0, sizeof(params));
+
+	/* parse the options and device name */
+	if (options) {
+		ret = afs_parse_options(&params, options, &dev_name);
+		if (ret < 0)
+			goto error;
+	}
+
+	ret = afs_parse_device_name(&params, dev_name);
+	if (ret < 0)
+		goto error;
+
+	/* try and do the mount securely */
+	key = afs_request_key(params.cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		ret = PTR_ERR(key);
+		goto error;
+	}
+	params.key = key;
+
+	/* parse the device name */
+	vol = afs_volume_lookup(&params);
+	if (IS_ERR(vol)) {
+		ret = PTR_ERR(vol);
+		goto error;
+	}
+
+	/* allocate a superblock info record */
+	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
+	if (!as) {
+		ret = -ENOMEM;
+		afs_put_volume(vol);
+		goto error;
+	}
+	as->volume = vol;
+
+	/* allocate a deviceless superblock */
+	sb = sget(fs_type, afs_test_super, afs_set_super, as);
+	if (IS_ERR(sb)) {
+		ret = PTR_ERR(sb);
+		afs_put_volume(vol);
+		kfree(as);
+		goto error;
+	}
+
+	if (!sb->s_root) {
+		/* initial superblock/root creation */
+		_debug("create");
+		sb->s_flags = flags;
+		ret = afs_fill_super(sb, &params);
+		if (ret < 0) {
+			deactivate_locked_super(sb);
+			goto error;
+		}
+		save_mount_options(sb, new_opts);
+		sb->s_flags |= MS_ACTIVE;
+	} else {
+		_debug("reuse");
+		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+		afs_put_volume(vol);
+		kfree(as);
+	}
+
+	afs_put_cell(params.cell);
+	kfree(new_opts);
+	_leave(" = 0 [%p]", sb);
+	return dget(sb->s_root);
+
+error:
+	afs_put_cell(params.cell);
+	key_put(params.key);
+	kfree(new_opts);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+static void afs_kill_super(struct super_block *sb)
+{
+	struct afs_super_info *as = sb->s_fs_info;
+	kill_anon_super(sb);
+	afs_put_volume(as->volume);
+	kfree(as);
+}
+
+/*
+ * initialise an inode cache slab element prior to any use
+ */
+static void afs_i_init_once(void *_vnode)
+{
+	struct afs_vnode *vnode = _vnode;
+
+	memset(vnode, 0, sizeof(*vnode));
+	inode_init_once(&vnode->vfs_inode);
+	init_waitqueue_head(&vnode->update_waitq);
+	mutex_init(&vnode->permits_lock);
+	mutex_init(&vnode->validate_lock);
+	spin_lock_init(&vnode->writeback_lock);
+	spin_lock_init(&vnode->lock);
+	INIT_LIST_HEAD(&vnode->writebacks);
+	INIT_LIST_HEAD(&vnode->pending_locks);
+	INIT_LIST_HEAD(&vnode->granted_locks);
+	INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
+	INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
+}
+
+/*
+ * allocate an AFS inode struct from our slab cache
+ */
+static struct inode *afs_alloc_inode(struct super_block *sb)
+{
+	struct afs_vnode *vnode;
+
+	vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
+	if (!vnode)
+		return NULL;
+
+	atomic_inc(&afs_count_active_inodes);
+
+	memset(&vnode->fid, 0, sizeof(vnode->fid));
+	memset(&vnode->status, 0, sizeof(vnode->status));
+
+	vnode->volume = NULL;
+	vnode->update_cnt = 0;
+	vnode->flags = 1 << AFS_VNODE_UNSET;
+	vnode->cb_promised = false;
+
+	_leave(" = %p", &vnode->vfs_inode);
+	return &vnode->vfs_inode;
+}
+
+static void afs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(afs_inode_cachep, vnode);
+}
+
+/*
+ * destroy an AFS inode struct
+ */
+static void afs_destroy_inode(struct inode *inode)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+
+	_enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode);
+
+	_debug("DESTROY INODE %p", inode);
+
+	ASSERTCMP(vnode->server, ==, NULL);
+
+	call_rcu(&inode->i_rcu, afs_i_callback);
+	atomic_dec(&afs_count_active_inodes);
+}
+
+/*
+ * return information about an AFS volume
+ */
+static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct afs_volume_status vs;
+	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+	struct key *key;
+	int ret;
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	ret = afs_vnode_get_volume_status(vnode, key, &vs);
+	key_put(key);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	buf->f_type = dentry->d_sb->s_magic;
+	buf->f_bsize = AFS_BLOCK_SIZE;
+	buf->f_namelen = AFSNAMEMAX - 1;
+
+	if (vs.max_quota == 0)
+		buf->f_blocks = vs.part_max_blocks;
+	else
+		buf->f_blocks = vs.max_quota;
+	buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use;
+	return 0;
+}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
new file mode 100644
index 00000000..340afd0c
--- /dev/null
+++ b/fs/afs/vlclient.c
@@ -0,0 +1,219 
@@ +/* AFS Volume Location Service client + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +/* + * map volume locator abort codes to error codes + */ +static int afs_vl_abort_to_error(u32 abort_code) +{ + _enter("%u", abort_code); + + switch (abort_code) { + case AFSVL_IDEXIST: return -EEXIST; + case AFSVL_IO: return -EREMOTEIO; + case AFSVL_NAMEEXIST: return -EEXIST; + case AFSVL_CREATEFAIL: return -EREMOTEIO; + case AFSVL_NOENT: return -ENOMEDIUM; + case AFSVL_EMPTY: return -ENOMEDIUM; + case AFSVL_ENTDELETED: return -ENOMEDIUM; + case AFSVL_BADNAME: return -EINVAL; + case AFSVL_BADINDEX: return -EINVAL; + case AFSVL_BADVOLTYPE: return -EINVAL; + case AFSVL_BADSERVER: return -EINVAL; + case AFSVL_BADPARTITION: return -EINVAL; + case AFSVL_REPSFULL: return -EFBIG; + case AFSVL_NOREPSERVER: return -ENOENT; + case AFSVL_DUPREPSERVER: return -EEXIST; + case AFSVL_RWNOTFOUND: return -ENOENT; + case AFSVL_BADREFCOUNT: return -EINVAL; + case AFSVL_SIZEEXCEEDED: return -EINVAL; + case AFSVL_BADENTRY: return -EINVAL; + case AFSVL_BADVOLIDBUMP: return -EINVAL; + case AFSVL_IDALREADYHASHED: return -EINVAL; + case AFSVL_ENTRYLOCKED: return -EBUSY; + case AFSVL_BADVOLOPER: return -EBADRQC; + case AFSVL_BADRELLOCKTYPE: return -EINVAL; + case AFSVL_RERELEASE: return -EREMOTEIO; + case AFSVL_BADSERVERFLAG: return -EINVAL; + case AFSVL_PERM: return -EACCES; + case AFSVL_NOMEM: return -EREMOTEIO; + default: + return afs_abort_to_error(abort_code); + } +} + +/* + * deliver reply data to a VL.GetEntryByXXX call + */ +static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, + struct sk_buff *skb, bool last) +{ + struct afs_cache_vlocation *entry; + __be32 *bp; + u32 tmp; + int loop; + + _enter(",,%u", last); + + afs_transfer_reply(call, skb); + if (!last) + return 0; + + if (call->reply_size != call->reply_max) + return -EBADMSG; + + /* unmarshall the reply once we've received all of it */ + entry = call->reply; + bp = call->buffer; + + for (loop = 0; loop < 64; loop++) + entry->name[loop] = ntohl(*bp++); + entry->name[loop] = 0; + bp++; /* final NUL */ + + bp++; /* type */ + entry->nservers = ntohl(*bp++); + + for (loop = 0; loop < 8; loop++) + entry->servers[loop].s_addr = *bp++; + + bp += 8; /* partition IDs */ + + for (loop = 0; loop < 8; loop++) { + tmp = ntohl(*bp++); + entry->srvtmask[loop] = 0; + if (tmp & AFS_VLSF_RWVOL) + entry->srvtmask[loop] |= AFS_VOL_VTM_RW; + if (tmp & AFS_VLSF_ROVOL) + entry->srvtmask[loop] |= AFS_VOL_VTM_RO; + if (tmp & AFS_VLSF_BACKVOL) + entry->srvtmask[loop] |= AFS_VOL_VTM_BAK; + } + + entry->vid[0] = ntohl(*bp++); + entry->vid[1] = ntohl(*bp++); + entry->vid[2] = ntohl(*bp++); + + bp++; /* clone ID */ + + tmp = ntohl(*bp++); /* flags */ + entry->vidmask = 0; + if (tmp & AFS_VLF_RWEXISTS) + entry->vidmask |= AFS_VOL_VTM_RW; + if (tmp & AFS_VLF_ROEXISTS) + entry->vidmask |= AFS_VOL_VTM_RO; + if (tmp & AFS_VLF_BACKEXISTS) + entry->vidmask |= AFS_VOL_VTM_BAK; + if (!entry->vidmask) + return -EBADMSG; + + _leave(" = 0 [done]"); + return 0; +} + +/* + * VL.GetEntryByName operation type + */ +static const struct afs_call_type afs_RXVLGetEntryByName = { + .name = "VL.GetEntryByName", + 
.deliver = afs_deliver_vl_get_entry_by_xxx, + .abort_to_error = afs_vl_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * VL.GetEntryById operation type + */ +static const struct afs_call_type afs_RXVLGetEntryById = { + .name = "VL.GetEntryById", + .deliver = afs_deliver_vl_get_entry_by_xxx, + .abort_to_error = afs_vl_abort_to_error, + .destructor = afs_flat_call_destructor, +}; + +/* + * dispatch a get volume entry by name operation + */ +int afs_vl_get_entry_by_name(struct in_addr *addr, + struct key *key, + const char *volname, + struct afs_cache_vlocation *entry, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + size_t volnamesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + volnamesz = strlen(volname); + padsz = (4 - (volnamesz & 3)) & 3; + reqsz = 8 + volnamesz + padsz; + + call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = entry; + call->service_id = VL_SERVICE; + call->port = htons(AFS_VL_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETENTRYBYNAME); + *bp++ = htonl(volnamesz); + memcpy(bp, volname, volnamesz); + if (padsz > 0) + memset((void *) bp + volnamesz, 0, padsz); + + /* initiate the call */ + return afs_make_call(addr, call, GFP_KERNEL, wait_mode); +} + +/* + * dispatch a get volume entry by ID operation + */ +int afs_vl_get_entry_by_id(struct in_addr *addr, + struct key *key, + afs_volid_t volid, + afs_voltype_t voltype, + struct afs_cache_vlocation *entry, + const struct afs_wait_mode *wait_mode) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384); + if (!call) + return -ENOMEM; + + call->key = key; + call->reply = entry; + call->service_id = VL_SERVICE; + call->port = htons(AFS_VL_PORT); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETENTRYBYID); + *bp++ = htonl(volid); + *bp = htonl(voltype); + + /* initiate the call */ + return afs_make_call(addr, call, GFP_KERNEL, wait_mode); +} diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c new file mode 100644 index 00000000..431984d2 --- /dev/null +++ b/fs/afs/vlocation.c @@ -0,0 +1,726 @@ +/* AFS volume location management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "internal.h" + +static unsigned afs_vlocation_timeout = 10; /* volume location timeout in seconds */ +static unsigned afs_vlocation_update_timeout = 10 * 60; + +static void afs_vlocation_reaper(struct work_struct *); +static void afs_vlocation_updater(struct work_struct *); + +static LIST_HEAD(afs_vlocation_updates); +static LIST_HEAD(afs_vlocation_graveyard); +static DEFINE_SPINLOCK(afs_vlocation_updates_lock); +static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock); +static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper); +static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater); +static struct workqueue_struct *afs_vlocation_update_worker; + +/* + * iterate through the VL servers in a cell until one of them admits knowing + * about the volume in question + */ +static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl, + struct key *key, + struct afs_cache_vlocation *vldb) +{ + struct afs_cell *cell = vl->cell; + struct in_addr addr; + int count, ret; + + _enter("%s,%s", cell->name, vl->vldb.name); + + down_write(&vl->cell->vl_sem); + ret = -ENOMEDIUM; + for (count = cell->vl_naddrs; count > 0; count--) { + addr = cell->vl_addrs[cell->vl_curr_svix]; + + _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); + + /* attempt to access the VL server */ + ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb, + &afs_sync_call); + switch (ret) { + case 0: + goto out; + case -ENOMEM: + case -ENONET: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + if (ret == -ENOMEM || ret == -ENONET) + goto out; + goto rotate; + case -ENOMEDIUM: + case -EKEYREJECTED: + case -EKEYEXPIRED: + goto out; + default: + ret = -EIO; + goto rotate; + } + + /* rotate the server records upon lookup failure */ + rotate: + cell->vl_curr_svix++; + cell->vl_curr_svix %= cell->vl_naddrs; + } + +out: + up_write(&vl->cell->vl_sem); + _leave(" = %d", ret); + return ret; +} + +/* + * iterate through the VL servers in a cell until one of them admits knowing + * about the volume in question + */ +static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, + struct key *key, + afs_volid_t volid, + afs_voltype_t voltype, + struct afs_cache_vlocation *vldb) +{ + struct afs_cell *cell = vl->cell; + struct in_addr addr; + int count, ret; + + _enter("%s,%x,%d,", cell->name, volid, voltype); + + down_write(&vl->cell->vl_sem); + ret = -ENOMEDIUM; + for (count = cell->vl_naddrs; count > 0; count--) { + addr = cell->vl_addrs[cell->vl_curr_svix]; + + _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); + + /* attempt to access the VL server */ + ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb, + &afs_sync_call); + switch (ret) { + case 0: + goto out; + case -ENOMEM: + case -ENONET: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + if (ret == -ENOMEM || ret == -ENONET) + goto out; + goto rotate; + case -EBUSY: + vl->upd_busy_cnt++; + if (vl->upd_busy_cnt <= 3) { + if (vl->upd_busy_cnt > 1) { + /* second+ BUSY - sleep a little bit */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1); + __set_current_state(TASK_RUNNING); + } + continue; + } + break; + case -ENOMEDIUM: + vl->upd_rej_cnt++; + goto rotate; + default: + ret = -EIO; + goto rotate; + } + + /* rotate the server records upon lookup failure */ + rotate: + cell->vl_curr_svix++; + cell->vl_curr_svix %= cell->vl_naddrs; + vl->upd_busy_cnt = 0; + } + +out: + if (ret < 0 && vl->upd_rej_cnt > 
0) { + printk(KERN_NOTICE "kAFS:" + " Active volume no longer valid '%s'\n", + vl->vldb.name); + vl->valid = 0; + ret = -ENOMEDIUM; + } + + up_write(&vl->cell->vl_sem); + _leave(" = %d", ret); + return ret; +} + +/* + * allocate a volume location record + */ +static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell, + const char *name, + size_t namesz) +{ + struct afs_vlocation *vl; + + vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL); + if (vl) { + vl->cell = cell; + vl->state = AFS_VL_NEW; + atomic_set(&vl->usage, 1); + INIT_LIST_HEAD(&vl->link); + INIT_LIST_HEAD(&vl->grave); + INIT_LIST_HEAD(&vl->update); + init_waitqueue_head(&vl->waitq); + spin_lock_init(&vl->lock); + memcpy(vl->vldb.name, name, namesz); + } + + _leave(" = %p", vl); + return vl; +} + +/* + * update record if we found it in the cache + */ +static int afs_vlocation_update_record(struct afs_vlocation *vl, + struct key *key, + struct afs_cache_vlocation *vldb) +{ + afs_voltype_t voltype; + afs_volid_t vid; + int ret; + + /* try to look up a cached volume in the cell VL databases by ID */ + _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", + vl->vldb.name, + vl->vldb.vidmask, + ntohl(vl->vldb.servers[0].s_addr), + vl->vldb.srvtmask[0], + ntohl(vl->vldb.servers[1].s_addr), + vl->vldb.srvtmask[1], + ntohl(vl->vldb.servers[2].s_addr), + vl->vldb.srvtmask[2]); + + _debug("Vids: %08x %08x %08x", + vl->vldb.vid[0], + vl->vldb.vid[1], + vl->vldb.vid[2]); + + if (vl->vldb.vidmask & AFS_VOL_VTM_RW) { + vid = vl->vldb.vid[0]; + voltype = AFSVL_RWVOL; + } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) { + vid = vl->vldb.vid[1]; + voltype = AFSVL_ROVOL; + } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) { + vid = vl->vldb.vid[2]; + voltype = AFSVL_BACKVOL; + } else { + BUG(); + vid = 0; + voltype = 0; + } + + /* contact the server to make sure the volume is still available + * - TODO: need to handle disconnected operation here + */ + ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb); + switch (ret) { + /* net error */ + default: + printk(KERN_WARNING "kAFS:" + " failed to update volume '%s' (%x) up in '%s': %d\n", + vl->vldb.name, vid, vl->cell->name, ret); + _leave(" = %d", ret); + return ret; + + /* pulled from local cache into memory */ + case 0: + _leave(" = 0"); + return 0; + + /* uh oh... 
looks like the volume got deleted */ + case -ENOMEDIUM: + printk(KERN_ERR "kAFS:" + " volume '%s' (%x) does not exist '%s'\n", + vl->vldb.name, vid, vl->cell->name); + + /* TODO: make existing record unavailable */ + _leave(" = %d", ret); + return ret; + } +} + +/* + * apply the update to a VL record + */ +static void afs_vlocation_apply_update(struct afs_vlocation *vl, + struct afs_cache_vlocation *vldb) +{ + _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", + vldb->name, vldb->vidmask, + ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0], + ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1], + ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]); + + _debug("Vids: %08x %08x %08x", + vldb->vid[0], vldb->vid[1], vldb->vid[2]); + + if (strcmp(vldb->name, vl->vldb.name) != 0) + printk(KERN_NOTICE "kAFS:" + " name of volume '%s' changed to '%s' on server\n", + vl->vldb.name, vldb->name); + + vl->vldb = *vldb; + +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); +#endif +} + +/* + * fill in a volume location record, consulting the cache and the VL server + * both + */ +static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, + struct key *key) +{ + struct afs_cache_vlocation vldb; + int ret; + + _enter(""); + + ASSERTCMP(vl->valid, ==, 0); + + memset(&vldb, 0, sizeof(vldb)); + + /* see if we have an in-cache copy (will set vl->valid if there is) */ +#ifdef CONFIG_AFS_FSCACHE + vl->cache = fscache_acquire_cookie(vl->cell->cache, + &afs_vlocation_cache_index_def, vl); +#endif + + if (vl->valid) { + /* try to update a known volume in the cell VL databases by + * ID as the name may have changed */ + _debug("found in cache"); + ret = afs_vlocation_update_record(vl, key, &vldb); + } else { + /* try to look up an unknown volume in the cell VL databases by + * name */ + ret = afs_vlocation_access_vl_by_name(vl, key, &vldb); + if (ret < 0) { + printk("kAFS: failed to locate '%s' in cell '%s'\n", + vl->vldb.name, vl->cell->name); + return ret; + } + } + + afs_vlocation_apply_update(vl, &vldb); + _leave(" = 0"); + return 0; +} + +/* + * queue a vlocation record for updates + */ +static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl) +{ + struct afs_vlocation *xvl; + + /* wait at least 10 minutes before updating... */ + vl->update_at = get_seconds() + afs_vlocation_update_timeout; + + spin_lock(&afs_vlocation_updates_lock); + + if (!list_empty(&afs_vlocation_updates)) { + /* ... 
but wait at least 1 second more than the newest record + * already queued so that we don't spam the VL server suddenly + * with lots of requests + */ + xvl = list_entry(afs_vlocation_updates.prev, + struct afs_vlocation, update); + if (vl->update_at <= xvl->update_at) + vl->update_at = xvl->update_at + 1; + } else { + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, + afs_vlocation_update_timeout * HZ); + } + + list_add_tail(&vl->update, &afs_vlocation_updates); + spin_unlock(&afs_vlocation_updates_lock); +} + +/* + * lookup volume location + * - iterate through the VL servers in a cell until one of them admits knowing + * about the volume in question + * - lookup in the local cache if not able to find on the VL server + * - insert/update in the local cache if did get a VL response + */ +struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell, + struct key *key, + const char *name, + size_t namesz) +{ + struct afs_vlocation *vl; + int ret; + + _enter("{%s},{%x},%*.*s,%zu", + cell->name, key_serial(key), + (int) namesz, (int) namesz, name, namesz); + + if (namesz >= sizeof(vl->vldb.name)) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + /* see if we have an in-memory copy first */ + down_write(&cell->vl_sem); + spin_lock(&cell->vl_lock); + list_for_each_entry(vl, &cell->vl_list, link) { + if (vl->vldb.name[namesz] != '\0') + continue; + if (memcmp(vl->vldb.name, name, namesz) == 0) + goto found_in_memory; + } + spin_unlock(&cell->vl_lock); + + /* not in the cell's in-memory lists - create a new record */ + vl = afs_vlocation_alloc(cell, name, namesz); + if (!vl) { + up_write(&cell->vl_sem); + return ERR_PTR(-ENOMEM); + } + + afs_get_cell(cell); + + list_add_tail(&vl->link, &cell->vl_list); + vl->state = AFS_VL_CREATING; + up_write(&cell->vl_sem); + +fill_in_record: + ret = afs_vlocation_fill_in_record(vl, key); + if (ret < 0) + goto error_abandon; + spin_lock(&vl->lock); + vl->state = AFS_VL_VALID; + spin_unlock(&vl->lock); + wake_up(&vl->waitq); + + /* update volume entry in local cache */ +#ifdef CONFIG_AFS_FSCACHE + fscache_update_cookie(vl->cache); +#endif + + /* schedule for regular updates */ + afs_vlocation_queue_for_updates(vl); + goto success; + +found_in_memory: + /* found in memory */ + _debug("found in memory"); + atomic_inc(&vl->usage); + spin_unlock(&cell->vl_lock); + if (!list_empty(&vl->grave)) { + spin_lock(&afs_vlocation_graveyard_lock); + list_del_init(&vl->grave); + spin_unlock(&afs_vlocation_graveyard_lock); + } + up_write(&cell->vl_sem); + + /* see if it was an abandoned record that we might try filling in */ + spin_lock(&vl->lock); + while (vl->state != AFS_VL_VALID) { + afs_vlocation_state_t state = vl->state; + + _debug("invalid [state %d]", state); + + if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) { + vl->state = AFS_VL_CREATING; + spin_unlock(&vl->lock); + goto fill_in_record; + } + + /* must now wait for creation or update by someone else to + * complete */ + _debug("wait"); + + spin_unlock(&vl->lock); + ret = wait_event_interruptible(vl->waitq, + vl->state == AFS_VL_NEW || + vl->state == AFS_VL_VALID || + vl->state == AFS_VL_NO_VOLUME); + if (ret < 0) + goto error; + spin_lock(&vl->lock); + } + spin_unlock(&vl->lock); + +success: + _leave(" = %p", vl); + return vl; + +error_abandon: + spin_lock(&vl->lock); + vl->state = AFS_VL_NEW; + spin_unlock(&vl->lock); + wake_up(&vl->waitq); +error: + ASSERT(vl != NULL); + afs_put_vlocation(vl); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * 
finish using a volume location record + */ +void afs_put_vlocation(struct afs_vlocation *vl) +{ + if (!vl) + return; + + _enter("%s", vl->vldb.name); + + ASSERTCMP(atomic_read(&vl->usage), >, 0); + + if (likely(!atomic_dec_and_test(&vl->usage))) { + _leave(""); + return; + } + + spin_lock(&afs_vlocation_graveyard_lock); + if (atomic_read(&vl->usage) == 0) { + _debug("buried"); + list_move_tail(&vl->grave, &afs_vlocation_graveyard); + vl->time_of_death = get_seconds(); + queue_delayed_work(afs_wq, &afs_vlocation_reap, + afs_vlocation_timeout * HZ); + + /* suspend updates on this record */ + if (!list_empty(&vl->update)) { + spin_lock(&afs_vlocation_updates_lock); + list_del_init(&vl->update); + spin_unlock(&afs_vlocation_updates_lock); + } + } + spin_unlock(&afs_vlocation_graveyard_lock); + _leave(" [killed?]"); +} + +/* + * destroy a dead volume location record + */ +static void afs_vlocation_destroy(struct afs_vlocation *vl) +{ + _enter("%p", vl); + +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(vl->cache, 0); +#endif + afs_put_cell(vl->cell); + kfree(vl); +} + +/* + * reap dead volume location records + */ +static void afs_vlocation_reaper(struct work_struct *work) +{ + LIST_HEAD(corpses); + struct afs_vlocation *vl; + unsigned long delay, expiry; + time_t now; + + _enter(""); + + now = get_seconds(); + spin_lock(&afs_vlocation_graveyard_lock); + + while (!list_empty(&afs_vlocation_graveyard)) { + vl = list_entry(afs_vlocation_graveyard.next, + struct afs_vlocation, grave); + + _debug("check %p", vl); + + /* the queue is ordered most dead first */ + expiry = vl->time_of_death + afs_vlocation_timeout; + if (expiry > now) { + delay = (expiry - now) * HZ; + _debug("delay %lu", delay); + if (!queue_delayed_work(afs_wq, &afs_vlocation_reap, + delay)) { + cancel_delayed_work(&afs_vlocation_reap); + queue_delayed_work(afs_wq, &afs_vlocation_reap, + delay); + } + break; + } + + spin_lock(&vl->cell->vl_lock); + if (atomic_read(&vl->usage) > 0) { + _debug("no reap"); + list_del_init(&vl->grave); + } else { + _debug("reap"); + list_move_tail(&vl->grave, &corpses); + list_del_init(&vl->link); + } + spin_unlock(&vl->cell->vl_lock); + } + + spin_unlock(&afs_vlocation_graveyard_lock); + + /* now reap the corpses we've extracted */ + while (!list_empty(&corpses)) { + vl = list_entry(corpses.next, struct afs_vlocation, grave); + list_del(&vl->grave); + afs_vlocation_destroy(vl); + } + + _leave(""); +} + +/* + * initialise the VL update process + */ +int __init afs_vlocation_update_init(void) +{ + afs_vlocation_update_worker = + create_singlethread_workqueue("kafs_vlupdated"); + return afs_vlocation_update_worker ? 
0 : -ENOMEM; +} + +/* + * discard all the volume location records for rmmod + */ +void afs_vlocation_purge(void) +{ + afs_vlocation_timeout = 0; + + spin_lock(&afs_vlocation_updates_lock); + list_del_init(&afs_vlocation_updates); + spin_unlock(&afs_vlocation_updates_lock); + cancel_delayed_work(&afs_vlocation_update); + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, 0); + destroy_workqueue(afs_vlocation_update_worker); + + cancel_delayed_work(&afs_vlocation_reap); + queue_delayed_work(afs_wq, &afs_vlocation_reap, 0); +} + +/* + * update a volume location + */ +static void afs_vlocation_updater(struct work_struct *work) +{ + struct afs_cache_vlocation vldb; + struct afs_vlocation *vl, *xvl; + time_t now; + long timeout; + int ret; + + _enter(""); + + now = get_seconds(); + + /* find a record to update */ + spin_lock(&afs_vlocation_updates_lock); + for (;;) { + if (list_empty(&afs_vlocation_updates)) { + spin_unlock(&afs_vlocation_updates_lock); + _leave(" [nothing]"); + return; + } + + vl = list_entry(afs_vlocation_updates.next, + struct afs_vlocation, update); + if (atomic_read(&vl->usage) > 0) + break; + list_del_init(&vl->update); + } + + timeout = vl->update_at - now; + if (timeout > 0) { + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, timeout * HZ); + spin_unlock(&afs_vlocation_updates_lock); + _leave(" [nothing]"); + return; + } + + list_del_init(&vl->update); + atomic_inc(&vl->usage); + spin_unlock(&afs_vlocation_updates_lock); + + /* we can now perform the update */ + _debug("update %s", vl->vldb.name); + vl->state = AFS_VL_UPDATING; + vl->upd_rej_cnt = 0; + vl->upd_busy_cnt = 0; + + ret = afs_vlocation_update_record(vl, NULL, &vldb); + spin_lock(&vl->lock); + switch (ret) { + case 0: + afs_vlocation_apply_update(vl, &vldb); + vl->state = AFS_VL_VALID; + break; + case -ENOMEDIUM: + vl->state = AFS_VL_VOLUME_DELETED; + break; + default: + vl->state = AFS_VL_UNCERTAIN; + break; + } + spin_unlock(&vl->lock); + wake_up(&vl->waitq); + + /* and then reschedule */ + _debug("reschedule"); + vl->update_at = get_seconds() + afs_vlocation_update_timeout; + + spin_lock(&afs_vlocation_updates_lock); + + if (!list_empty(&afs_vlocation_updates)) { + /* next update in 10 minutes, but wait at least 1 second more + * than the newest record already queued so that we don't spam + * the VL server suddenly with lots of requests + */ + xvl = list_entry(afs_vlocation_updates.prev, + struct afs_vlocation, update); + if (vl->update_at <= xvl->update_at) + vl->update_at = xvl->update_at + 1; + xvl = list_entry(afs_vlocation_updates.next, + struct afs_vlocation, update); + timeout = xvl->update_at - now; + if (timeout < 0) + timeout = 0; + } else { + timeout = afs_vlocation_update_timeout; + } + + ASSERT(list_empty(&vl->update)); + + list_add_tail(&vl->update, &afs_vlocation_updates); + + _debug("timeout %ld", timeout); + queue_delayed_work(afs_vlocation_update_worker, + &afs_vlocation_update, timeout * HZ); + spin_unlock(&afs_vlocation_updates_lock); + afs_put_vlocation(vl); +} diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c new file mode 100644 index 00000000..25cf4c3f --- /dev/null +++ b/fs/afs/vnode.c @@ -0,0 +1,1025 @@ +/* AFS vnode management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +#if 0 +static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent, + int depth, char lr) +{ + struct afs_vnode *vnode; + bool bad = false; + + if (!node) + return false; + + if (node->rb_left) + bad = dump_tree_aux(node->rb_left, node, depth + 2, '/'); + + vnode = rb_entry(node, struct afs_vnode, cb_promise); + _debug("%c %*.*s%c%p {%d}", + rb_is_red(node) ? 'R' : 'B', + depth, depth, "", lr, + vnode, vnode->cb_expires_at); + if (rb_parent(node) != parent) { + printk("BAD: %p != %p\n", rb_parent(node), parent); + bad = true; + } + + if (node->rb_right) + bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\'); + + return bad; +} + +static noinline void dump_tree(const char *name, struct afs_server *server) +{ + _enter("%s", name); + if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-')) + BUG(); +} +#endif + +/* + * insert a vnode into the backing server's vnode tree + */ +static void afs_install_vnode(struct afs_vnode *vnode, + struct afs_server *server) +{ + struct afs_server *old_server = vnode->server; + struct afs_vnode *xvnode; + struct rb_node *parent, **p; + + _enter("%p,%p", vnode, server); + + if (old_server) { + spin_lock(&old_server->fs_lock); + rb_erase(&vnode->server_rb, &old_server->fs_vnodes); + spin_unlock(&old_server->fs_lock); + } + + afs_get_server(server); + vnode->server = server; + afs_put_server(old_server); + + /* insert into the server's vnode tree in FID order */ + spin_lock(&server->fs_lock); + + parent = NULL; + p = &server->fs_vnodes.rb_node; + while (*p) { + parent = *p; + xvnode = rb_entry(parent, struct afs_vnode, server_rb); + if (vnode->fid.vid < xvnode->fid.vid) + p = &(*p)->rb_left; + else if (vnode->fid.vid > xvnode->fid.vid) + p = &(*p)->rb_right; + else if (vnode->fid.vnode < xvnode->fid.vnode) + p = &(*p)->rb_left; + else if (vnode->fid.vnode > xvnode->fid.vnode) + p = &(*p)->rb_right; + else if (vnode->fid.unique < xvnode->fid.unique) + p = &(*p)->rb_left; + else if (vnode->fid.unique > xvnode->fid.unique) + p = &(*p)->rb_right; + else + BUG(); /* can't happen unless afs_iget() malfunctions */ + } + + rb_link_node(&vnode->server_rb, parent, p); + rb_insert_color(&vnode->server_rb, &server->fs_vnodes); + + spin_unlock(&server->fs_lock); + _leave(""); +} + +/* + * insert a vnode into the promising server's update/expiration tree + * - caller must hold vnode->lock + */ +static void afs_vnode_note_promise(struct afs_vnode *vnode, + struct afs_server *server) +{ + struct afs_server *old_server; + struct afs_vnode *xvnode; + struct rb_node *parent, **p; + + _enter("%p,%p", vnode, server); + + ASSERT(server != NULL); + + old_server = vnode->server; + if (vnode->cb_promised) { + if (server == old_server && + vnode->cb_expires == vnode->cb_expires_at) { + _leave(" [no change]"); + return; + } + + spin_lock(&old_server->cb_lock); + if (vnode->cb_promised) { + _debug("delete"); + rb_erase(&vnode->cb_promise, &old_server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&old_server->cb_lock); + } + + if (vnode->server != server) + afs_install_vnode(vnode, server); + + vnode->cb_expires_at = vnode->cb_expires; + _debug("PROMISE on 
%p {%lu}", + vnode, (unsigned long) vnode->cb_expires_at); + + /* abuse an RB-tree to hold the expiration order (we may have multiple + * items with the same expiration time) */ + spin_lock(&server->cb_lock); + + parent = NULL; + p = &server->cb_promises.rb_node; + while (*p) { + parent = *p; + xvnode = rb_entry(parent, struct afs_vnode, cb_promise); + if (vnode->cb_expires_at < xvnode->cb_expires_at) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&vnode->cb_promise, parent, p); + rb_insert_color(&vnode->cb_promise, &server->cb_promises); + vnode->cb_promised = true; + + spin_unlock(&server->cb_lock); + _leave(""); +} + +/* + * handle remote file deletion by discarding the callback promise + */ +static void afs_vnode_deleted_remotely(struct afs_vnode *vnode) +{ + struct afs_server *server; + + _enter("{%p}", vnode->server); + + set_bit(AFS_VNODE_DELETED, &vnode->flags); + + server = vnode->server; + if (server) { + if (vnode->cb_promised) { + spin_lock(&server->cb_lock); + if (vnode->cb_promised) { + rb_erase(&vnode->cb_promise, + &server->cb_promises); + vnode->cb_promised = false; + } + spin_unlock(&server->cb_lock); + } + + spin_lock(&server->fs_lock); + rb_erase(&vnode->server_rb, &server->fs_vnodes); + spin_unlock(&server->fs_lock); + + vnode->server = NULL; + afs_put_server(server); + } else { + ASSERT(!vnode->cb_promised); + } + + _leave(""); +} + +/* + * finish off updating the recorded status of a file after a successful + * operation completion + * - starts callback expiry timer + * - adds to server's callback list + */ +void afs_vnode_finalise_status_update(struct afs_vnode *vnode, + struct afs_server *server) +{ + struct afs_server *oldserver = NULL; + + _enter("%p,%p", vnode, server); + + spin_lock(&vnode->lock); + clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + afs_vnode_note_promise(vnode, server); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + + wake_up_all(&vnode->update_waitq); + afs_put_server(oldserver); + _leave(""); +} + +/* + * finish off updating the recorded status of a file after an operation failed + */ +static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret) +{ + _enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + + clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); + + if (ret == -ENOENT) { + /* the file was deleted on the server */ + _debug("got NOENT from server - marking file deleted"); + afs_vnode_deleted_remotely(vnode); + } + + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + + wake_up_all(&vnode->update_waitq); + _leave(""); +} + +/* + * fetch file status from the volume + * - don't issue a fetch if: + * - the changed bit is not set and there's a valid callback + * - there are any outstanding ops that will fetch the status + * - TODO implement local caching + */ +int afs_vnode_fetch_status(struct afs_vnode *vnode, + struct afs_vnode *auth_vnode, struct key *key) +{ + struct afs_server *server; + unsigned long acl_order; + int ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter("%s,{%x:%u.%u}", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + + if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && + vnode->cb_promised) { + _leave(" [unchanged]"); + return 0; + } + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _leave(" [deleted]"); + return -ENOENT; + } + + acl_order = 0; + if (auth_vnode) + acl_order = auth_vnode->acl_order; + + 
spin_lock(&vnode->lock); + + if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && + vnode->cb_promised) { + spin_unlock(&vnode->lock); + _leave(" [unchanged]"); + return 0; + } + + ASSERTCMP(vnode->update_cnt, >=, 0); + + if (vnode->update_cnt > 0) { + /* someone else started a fetch */ + _debug("wait on fetch %d", vnode->update_cnt); + + set_current_state(TASK_UNINTERRUPTIBLE); + ASSERT(myself.func != NULL); + add_wait_queue(&vnode->update_waitq, &myself); + + /* wait for the status to be updated */ + for (;;) { + if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) + break; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + break; + + /* check to see if it got updated and invalidated all + * before we saw it */ + if (vnode->update_cnt == 0) { + remove_wait_queue(&vnode->update_waitq, + &myself); + set_current_state(TASK_RUNNING); + goto get_anyway; + } + + spin_unlock(&vnode->lock); + + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + + spin_lock(&vnode->lock); + } + + remove_wait_queue(&vnode->update_waitq, &myself); + spin_unlock(&vnode->lock); + set_current_state(TASK_RUNNING); + + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? + -ENOENT : 0; + } + +get_anyway: + /* okay... we're going to have to initiate the op */ + vnode->update_cnt++; + + spin_unlock(&vnode->lock); + + /* merge AFS status fetches and clear outstanding callback on this + * vnode */ + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %p{%08x}", + server, ntohl(server->addr.s_addr)); + + ret = afs_fs_fetch_file_status(server, key, vnode, NULL, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + _debug("adjust"); + if (auth_vnode) + afs_cache_permit(vnode, key, acl_order); + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + _debug("failed [%d]", ret); + afs_vnode_status_update_failed(vnode, ret); + } + + ASSERTCMP(vnode->update_cnt, >=, 0); + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * fetch file data from the volume + * - TODO implement caching + */ +int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key, + off_t offset, size_t length, struct page *page) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,,,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + /* this op will fetch the status */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + /* merge in AFS status fetches and clear outstanding callback on this + * vnode */ + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_fetch_data(server, key, vnode, offset, length, + page, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d", ret); + return ret; + +no_server: + 
spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + return PTR_ERR(server); +} + +/* + * make a file or a directory + */ +int afs_vnode_create(struct afs_vnode *vnode, struct key *key, + const char *name, umode_t mode, struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb, struct afs_server **_server) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%s,,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name); + + /* this op will fetch the status on the directory we're creating in */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_create(server, key, vnode, name, mode, newfid, + newstatus, newcb, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + *_server = server; + } else { + afs_vnode_status_update_failed(vnode, ret); + *_server = NULL; + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * remove a file or directory + */ +int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name, + bool isdir) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%s", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name); + + /* this op will fetch the status on the directory we're removing from */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_remove(server, key, vnode, name, isdir, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * create a hard link + */ +int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, + struct key *key, const char *name) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s", + dvnode->volume->vlocation->vldb.name, + dvnode->fid.vid, + dvnode->fid.vnode, + dvnode->fid.unique, + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name); + + /* this op will fetch the status on the directory we're removing from */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + 
spin_lock(&dvnode->lock); + dvnode->update_cnt++; + spin_unlock(&dvnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(dvnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_link(server, key, dvnode, vnode, name, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(dvnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_vnode_finalise_status_update(dvnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + afs_vnode_status_update_failed(dvnode, ret); + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + spin_lock(&dvnode->lock); + dvnode->update_cnt--; + ASSERTCMP(dvnode->update_cnt, >=, 0); + spin_unlock(&dvnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * create a symbolic link + */ +int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key, + const char *name, const char *content, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_server **_server) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%s,%s,,,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), + name, content); + + /* this op will fetch the status on the directory we're creating in */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_symlink(server, key, vnode, name, content, + newfid, newstatus, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + *_server = server; + } else { + afs_vnode_status_update_failed(vnode, ret); + *_server = NULL; + } + + _leave(" = %d [cnt %d]", ret, vnode->update_cnt); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); + return PTR_ERR(server); +} + +/* + * rename a file + */ +int afs_vnode_rename(struct afs_vnode *orig_dvnode, + struct afs_vnode *new_dvnode, + struct key *key, + const char *orig_name, + const char *new_name) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s", + orig_dvnode->volume->vlocation->vldb.name, + orig_dvnode->fid.vid, + orig_dvnode->fid.vnode, + orig_dvnode->fid.unique, + new_dvnode->volume->vlocation->vldb.name, + new_dvnode->fid.vid, + new_dvnode->fid.vnode, + new_dvnode->fid.unique, + key_serial(key), + orig_name, + new_name); + + /* this op will fetch the status on both the directories we're dealing + * with */ + spin_lock(&orig_dvnode->lock); + orig_dvnode->update_cnt++; + spin_unlock(&orig_dvnode->lock); + if (new_dvnode != orig_dvnode) { + spin_lock(&new_dvnode->lock); + new_dvnode->update_cnt++; + spin_unlock(&new_dvnode->lock); + } + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(orig_dvnode); + if (IS_ERR(server)) 
+ goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_rename(server, key, orig_dvnode, orig_name, + new_dvnode, new_name, &afs_sync_call); + + } while (!afs_volume_release_fileserver(orig_dvnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(orig_dvnode, server); + if (new_dvnode != orig_dvnode) + afs_vnode_finalise_status_update(new_dvnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(orig_dvnode, ret); + if (new_dvnode != orig_dvnode) + afs_vnode_status_update_failed(new_dvnode, ret); + } + + _leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt); + return ret; + +no_server: + spin_lock(&orig_dvnode->lock); + orig_dvnode->update_cnt--; + ASSERTCMP(orig_dvnode->update_cnt, >=, 0); + spin_unlock(&orig_dvnode->lock); + if (new_dvnode != orig_dvnode) { + spin_lock(&new_dvnode->lock); + new_dvnode->update_cnt--; + ASSERTCMP(new_dvnode->update_cnt, >=, 0); + spin_unlock(&new_dvnode->lock); + } + _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt); + return PTR_ERR(server); +} + +/* + * write to a file + */ +int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last, + unsigned offset, unsigned to) +{ + struct afs_server *server; + struct afs_vnode *vnode = wb->vnode; + int ret; + + _enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(wb->key), + first, last, offset, to); + + /* this op will fetch the status */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_store_data(server, wb, first, last, offset, to, + &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d", ret); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + return PTR_ERR(server); +} + +/* + * set the attributes on a file + */ +int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key, + struct iattr *attr) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + /* this op will fetch the status */ + spin_lock(&vnode->lock); + vnode->update_cnt++; + spin_unlock(&vnode->lock); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) { + afs_vnode_finalise_status_update(vnode, server); + afs_put_server(server); + } else { + afs_vnode_status_update_failed(vnode, ret); + } + + _leave(" = %d", ret); + return ret; + +no_server: + spin_lock(&vnode->lock); + vnode->update_cnt--; + ASSERTCMP(vnode->update_cnt, >=, 0); + spin_unlock(&vnode->lock); + return PTR_ERR(server); +} + +/* + * 
get the status of a volume + */ +int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key, + struct afs_volume_status *vs) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_get_volume_status(server, key, vnode, vs, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * get a lock on a file + */ +int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x,%u", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), type); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * extend a lock on a file + */ +int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} + +/* + * release a lock on a file + */ +int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_server *server; + int ret; + + _enter("%s{%x:%u.%u},%x", + vnode->volume->vlocation->vldb.name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + do { + /* pick a server to query */ + server = afs_volume_pick_fileserver(vnode); + if (IS_ERR(server)) + goto no_server; + + _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); + + ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call); + + } while (!afs_volume_release_fileserver(vnode, server, ret)); + + /* adjust the flags */ + if (ret == 0) + afs_put_server(server); + + _leave(" = %d", ret); + return ret; + +no_server: + return PTR_ERR(server); +} diff --git a/fs/afs/volume.c b/fs/afs/volume.c new file mode 100644 index 00000000..401eeb21 --- /dev/null +++ b/fs/afs/volume.c @@ -0,0 +1,401 @@ +/* AFS volume management + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" }; + +/* + * lookup a volume by name + * - this can be one of the following: + * "%[cell:]volume[.]" R/W volume + * "#[cell:]volume[.]" R/O or R/W volume (rwparent=0), + * or R/W (rwparent=1) volume + * "%[cell:]volume.readonly" R/O volume + * "#[cell:]volume.readonly" R/O volume + * "%[cell:]volume.backup" Backup volume + * "#[cell:]volume.backup" Backup volume + * + * The cell name is optional, and defaults to the current cell. + * + * See "The Rules of Mount Point Traversal" in Chapter 5 of the AFS SysAdmin + * Guide + * - Rule 1: Explicit type suffix forces access of that type or nothing + * (no suffix, then use Rule 2 & 3) + * - Rule 2: If parent volume is R/O, then mount R/O volume by preference, R/W + * if not available + * - Rule 3: If parent volume is R/W, then only mount R/W volume unless + * explicitly told otherwise + */ +struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) +{ + struct afs_vlocation *vlocation = NULL; + struct afs_volume *volume = NULL; + struct afs_server *server = NULL; + char srvtmask; + int ret, loop; + + _enter("{%*.*s,%d}", + params->volnamesz, params->volnamesz, params->volname, params->rwpath); + + /* lookup the volume location record */ + vlocation = afs_vlocation_lookup(params->cell, params->key, + params->volname, params->volnamesz); + if (IS_ERR(vlocation)) { + ret = PTR_ERR(vlocation); + vlocation = NULL; + goto error; + } + + /* make the final decision on the type we want */ + ret = -ENOMEDIUM; + if (params->force && !(vlocation->vldb.vidmask & (1 << params->type))) + goto error; + + srvtmask = 0; + for (loop = 0; loop < vlocation->vldb.nservers; loop++) + srvtmask |= vlocation->vldb.srvtmask[loop]; + + if (params->force) { + if (!(srvtmask & (1 << params->type))) + goto error; + } else if (srvtmask & AFS_VOL_VTM_RO) { + params->type = AFSVL_ROVOL; + } else if (srvtmask & AFS_VOL_VTM_RW) { + params->type = AFSVL_RWVOL; + } else { + goto error; + } + + down_write(¶ms->cell->vl_sem); + + /* is the volume already active? 
*/ + if (vlocation->vols[params->type]) { + /* yes - re-use it */ + volume = vlocation->vols[params->type]; + afs_get_volume(volume); + goto success; + } + + /* create a new volume record */ + _debug("creating new volume record"); + + ret = -ENOMEM; + volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); + if (!volume) + goto error_up; + + atomic_set(&volume->usage, 1); + volume->type = params->type; + volume->type_force = params->force; + volume->cell = params->cell; + volume->vid = vlocation->vldb.vid[params->type]; + + ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY); + if (ret) + goto error_bdi; + + init_rwsem(&volume->server_sem); + + /* look up all the applicable server records */ + for (loop = 0; loop < 8; loop++) { + if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) { + server = afs_lookup_server( + volume->cell, &vlocation->vldb.servers[loop]); + if (IS_ERR(server)) { + ret = PTR_ERR(server); + goto error_discard; + } + + volume->servers[volume->nservers] = server; + volume->nservers++; + } + } + + /* attach the cache and volume location */ +#ifdef CONFIG_AFS_FSCACHE + volume->cache = fscache_acquire_cookie(vlocation->cache, + &afs_volume_cache_index_def, + volume); +#endif + afs_get_vlocation(vlocation); + volume->vlocation = vlocation; + + vlocation->vols[volume->type] = volume; + +success: + _debug("kAFS selected %s volume %08x", + afs_voltypes[volume->type], volume->vid); + up_write(¶ms->cell->vl_sem); + afs_put_vlocation(vlocation); + _leave(" = %p", volume); + return volume; + + /* clean up */ +error_up: + up_write(¶ms->cell->vl_sem); +error: + afs_put_vlocation(vlocation); + _leave(" = %d", ret); + return ERR_PTR(ret); + +error_discard: + bdi_destroy(&volume->bdi); +error_bdi: + up_write(¶ms->cell->vl_sem); + + for (loop = volume->nservers - 1; loop >= 0; loop--) + afs_put_server(volume->servers[loop]); + + kfree(volume); + goto error; +} + +/* + * destroy a volume record + */ +void afs_put_volume(struct afs_volume *volume) +{ + struct afs_vlocation *vlocation; + int loop; + + if (!volume) + return; + + _enter("%p", volume); + + ASSERTCMP(atomic_read(&volume->usage), >, 0); + + vlocation = volume->vlocation; + + /* to prevent a race, the decrement and the dequeue must be effectively + * atomic */ + down_write(&vlocation->cell->vl_sem); + + if (likely(!atomic_dec_and_test(&volume->usage))) { + up_write(&vlocation->cell->vl_sem); + _leave(""); + return; + } + + vlocation->vols[volume->type] = NULL; + + up_write(&vlocation->cell->vl_sem); + + /* finish cleaning up the volume */ +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_cookie(volume->cache, 0); +#endif + afs_put_vlocation(vlocation); + + for (loop = volume->nservers - 1; loop >= 0; loop--) + afs_put_server(volume->servers[loop]); + + bdi_destroy(&volume->bdi); + kfree(volume); + + _leave(" [destroyed]"); +} + +/* + * pick a server to use to try accessing this volume + * - returns with an elevated usage count on the server chosen + */ +struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode) +{ + struct afs_volume *volume = vnode->volume; + struct afs_server *server; + int ret, state, loop; + + _enter("%s", volume->vlocation->vldb.name); + + /* stick with the server we're already using if we can */ + if (vnode->server && vnode->server->fs_state == 0) { + afs_get_server(vnode->server); + _leave(" = %p [current]", vnode->server); + return vnode->server; + } + + down_read(&volume->server_sem); + + /* handle the no-server case */ + if (volume->nservers == 0) { + ret = 
volume->rjservers ? -ENOMEDIUM : -ESTALE; + up_read(&volume->server_sem); + _leave(" = %d [no servers]", ret); + return ERR_PTR(ret); + } + + /* basically, just search the list for the first live server and use + * that */ + ret = 0; + for (loop = 0; loop < volume->nservers; loop++) { + server = volume->servers[loop]; + state = server->fs_state; + + _debug("consider %d [%d]", loop, state); + + switch (state) { + /* found an apparently healthy server */ + case 0: + afs_get_server(server); + up_read(&volume->server_sem); + _leave(" = %p (picked %08x)", + server, ntohl(server->addr.s_addr)); + return server; + + case -ENETUNREACH: + if (ret == 0) + ret = state; + break; + + case -EHOSTUNREACH: + if (ret == 0 || + ret == -ENETUNREACH) + ret = state; + break; + + case -ECONNREFUSED: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH) + ret = state; + break; + + default: + case -EREMOTEIO: + if (ret == 0 || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH || + ret == -ECONNREFUSED) + ret = state; + break; + } + } + + /* no available servers + * - TODO: handle the no active servers case better + */ + up_read(&volume->server_sem); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * release a server after use + * - releases the ref on the server struct that was acquired by picking + * - records result of using a particular server to access a volume + * - return 0 to try again, 1 if okay or to issue error + * - the caller must release the server struct if result was 0 + */ +int afs_volume_release_fileserver(struct afs_vnode *vnode, + struct afs_server *server, + int result) +{ + struct afs_volume *volume = vnode->volume; + unsigned loop; + + _enter("%s,%08x,%d", + volume->vlocation->vldb.name, ntohl(server->addr.s_addr), + result); + + switch (result) { + /* success */ + case 0: + server->fs_act_jif = jiffies; + server->fs_state = 0; + _leave(""); + return 1; + + /* the fileserver denied all knowledge of the volume */ + case -ENOMEDIUM: + server->fs_act_jif = jiffies; + down_write(&volume->server_sem); + + /* firstly, find where the server is in the active list (if it + * is) */ + for (loop = 0; loop < volume->nservers; loop++) + if (volume->servers[loop] == server) + goto present; + + /* no longer there - may have been discarded by another op */ + goto try_next_server_upw; + + present: + volume->nservers--; + memmove(&volume->servers[loop], + &volume->servers[loop + 1], + sizeof(volume->servers[loop]) * + (volume->nservers - loop)); + volume->servers[volume->nservers] = NULL; + afs_put_server(server); + volume->rjservers++; + + if (volume->nservers > 0) + /* another server might acknowledge its existence */ + goto try_next_server_upw; + + /* handle the case where all the fileservers have rejected the + * volume + * - TODO: try asking the fileservers for volume information + * - TODO: contact the VL server again to see if the volume is + * no longer registered + */ + up_write(&volume->server_sem); + afs_put_server(server); + _leave(" [completely rejected]"); + return 1; + + /* problem reaching the server */ + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIME: + case -ETIMEDOUT: + case -EREMOTEIO: + /* mark the server as dead + * TODO: vary dead timeout depending on error + */ + spin_lock(&server->fs_lock); + if (!server->fs_state) { + server->fs_dead_jif = jiffies + HZ * 10; + server->fs_state = result; + printk("kAFS: SERVER DEAD state=%d\n", result); + } + spin_unlock(&server->fs_lock); + goto try_next_server; + + /* miscellaneous error */ + 
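	/*
	 * Note: the "default" label below deliberately falls through into
	 * the -ENOMEM/-ENONET cases: unlisted errors update fs_act_jif
	 * first and then share the same "accept the result" clean-up.
	 */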
default: + server->fs_act_jif = jiffies; + case -ENOMEM: + case -ENONET: + /* tell the caller to accept the result */ + afs_put_server(server); + _leave(" [local failure]"); + return 1; + } + + /* tell the caller to loop around and try the next server */ +try_next_server_upw: + up_write(&volume->server_sem); +try_next_server: + afs_put_server(server); + _leave(" [try next server]"); + return 0; +} diff --git a/fs/afs/write.c b/fs/afs/write.c new file mode 100644 index 00000000..b806285f --- /dev/null +++ b/fs/afs/write.c @@ -0,0 +1,754 @@ +/* handling of writes to regular files and writing back to the server + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int afs_write_back_from_locked_page(struct afs_writeback *wb, + struct page *page); + +/* + * mark a page as having been made dirty and thus needing writeback + */ +int afs_set_page_dirty(struct page *page) +{ + _enter(""); + return __set_page_dirty_nobuffers(page); +} + +/* + * unlink a writeback record because its usage has reached zero + * - must be called with the wb->vnode->writeback_lock held + */ +static void afs_unlink_writeback(struct afs_writeback *wb) +{ + struct afs_writeback *front; + struct afs_vnode *vnode = wb->vnode; + + list_del_init(&wb->link); + if (!list_empty(&vnode->writebacks)) { + /* if an fsync rises to the front of the queue then wake it + * up */ + front = list_entry(vnode->writebacks.next, + struct afs_writeback, link); + if (front->state == AFS_WBACK_SYNCING) { + _debug("wake up sync"); + front->state = AFS_WBACK_COMPLETE; + wake_up(&front->waitq); + } + } +} + +/* + * free a writeback record + */ +static void afs_free_writeback(struct afs_writeback *wb) +{ + _enter(""); + key_put(wb->key); + kfree(wb); +} + +/* + * dispose of a reference to a writeback record + */ +void afs_put_writeback(struct afs_writeback *wb) +{ + struct afs_vnode *vnode = wb->vnode; + + _enter("{%d}", wb->usage); + + spin_lock(&vnode->writeback_lock); + if (--wb->usage == 0) + afs_unlink_writeback(wb); + else + wb = NULL; + spin_unlock(&vnode->writeback_lock); + if (wb) + afs_free_writeback(wb); +} + +/* + * partly or wholly fill a page that's under preparation for writing + */ +static int afs_fill_page(struct afs_vnode *vnode, struct key *key, + loff_t pos, struct page *page) +{ + loff_t i_size; + int ret; + int len; + + _enter(",,%llu", (unsigned long long)pos); + + i_size = i_size_read(&vnode->vfs_inode); + if (pos + PAGE_CACHE_SIZE > i_size) + len = i_size - pos; + else + len = PAGE_CACHE_SIZE; + + ret = afs_vnode_fetch_data(vnode, key, pos, len, page); + if (ret < 0) { + if (ret == -ENOENT) { + _debug("got NOENT from server" + " - marking file deleted and stale"); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + } + + _leave(" = %d", ret); + return ret; +} + +/* + * prepare to perform part of a write to a page + */ +int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct afs_writeback *candidate, *wb; + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + struct page *page; + struct 
key *key = file->private_data; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int ret; + + _enter("{%x:%u},{%lx},%u,%u", + vnode->fid.vid, vnode->fid.vnode, index, from, to); + + candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); + if (!candidate) + return -ENOMEM; + candidate->vnode = vnode; + candidate->first = candidate->last = index; + candidate->offset_first = from; + candidate->to_last = to; + INIT_LIST_HEAD(&candidate->link); + candidate->usage = 1; + candidate->state = AFS_WBACK_PENDING; + init_waitqueue_head(&candidate->waitq); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + kfree(candidate); + return -ENOMEM; + } + *pagep = page; + /* page won't leak in error case: it eventually gets cleaned off LRU */ + + if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { + ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); + if (ret < 0) { + kfree(candidate); + _leave(" = %d [prep]", ret); + return ret; + } + SetPageUptodate(page); + } + +try_again: + spin_lock(&vnode->writeback_lock); + + /* see if this page is already pending a writeback under a suitable key + * - if so we can just join onto that one */ + wb = (struct afs_writeback *) page_private(page); + if (wb) { + if (wb->key == key && wb->state == AFS_WBACK_PENDING) + goto subsume_in_current_wb; + goto flush_conflicting_wb; + } + + if (index > 0) { + /* see if we can find an already pending writeback that we can + * append this page to */ + list_for_each_entry(wb, &vnode->writebacks, link) { + if (wb->last == index - 1 && wb->key == key && + wb->state == AFS_WBACK_PENDING) + goto append_to_previous_wb; + } + } + + list_add_tail(&candidate->link, &vnode->writebacks); + candidate->key = key_get(key); + spin_unlock(&vnode->writeback_lock); + SetPagePrivate(page); + set_page_private(page, (unsigned long) candidate); + _leave(" = 0 [new]"); + return 0; + +subsume_in_current_wb: + _debug("subsume"); + ASSERTRANGE(wb->first, <=, index, <=, wb->last); + if (index == wb->first && from < wb->offset_first) + wb->offset_first = from; + if (index == wb->last && to > wb->to_last) + wb->to_last = to; + spin_unlock(&vnode->writeback_lock); + kfree(candidate); + _leave(" = 0 [sub]"); + return 0; + +append_to_previous_wb: + _debug("append into %lx-%lx", wb->first, wb->last); + wb->usage++; + wb->last++; + wb->to_last = to; + spin_unlock(&vnode->writeback_lock); + SetPagePrivate(page); + set_page_private(page, (unsigned long) wb); + kfree(candidate); + _leave(" = 0 [app]"); + return 0; + + /* the page is currently bound to another context, so if it's dirty we + * need to flush it before we can use the new context */ +flush_conflicting_wb: + _debug("flush conflict"); + if (wb->state == AFS_WBACK_PENDING) + wb->state = AFS_WBACK_CONFLICTING; + spin_unlock(&vnode->writeback_lock); + if (PageDirty(page)) { + ret = afs_write_back_from_locked_page(wb, page); + if (ret < 0) { + afs_put_writeback(candidate); + _leave(" = %d", ret); + return ret; + } + } + + /* the page holds a ref on the writeback record */ + afs_put_writeback(wb); + set_page_private(page, 0); + ClearPagePrivate(page); + goto try_again; +} + +/* + * finalise part of a write to a page + */ +int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + loff_t i_size, maybe_i_size; + + _enter("{%x:%u},{%lx}", + vnode->fid.vid, 
vnode->fid.vnode, page->index); + + maybe_i_size = pos + copied; + + i_size = i_size_read(&vnode->vfs_inode); + if (maybe_i_size > i_size) { + spin_lock(&vnode->writeback_lock); + i_size = i_size_read(&vnode->vfs_inode); + if (maybe_i_size > i_size) + i_size_write(&vnode->vfs_inode, maybe_i_size); + spin_unlock(&vnode->writeback_lock); + } + + set_page_dirty(page); + if (PageDirty(page)) + _debug("dirtied"); + unlock_page(page); + page_cache_release(page); + + return copied; +} + +/* + * kill all the pages in the given range + */ +static void afs_kill_pages(struct afs_vnode *vnode, bool error, + pgoff_t first, pgoff_t last) +{ + struct pagevec pv; + unsigned count, loop; + + _enter("{%x:%u},%lx-%lx", + vnode->fid.vid, vnode->fid.vnode, first, last); + + pagevec_init(&pv, 0); + + do { + _debug("kill %lx-%lx", first, last); + + count = last - first + 1; + if (count > PAGEVEC_SIZE) + count = PAGEVEC_SIZE; + pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, + first, count, pv.pages); + ASSERTCMP(pv.nr, ==, count); + + for (loop = 0; loop < count; loop++) { + ClearPageUptodate(pv.pages[loop]); + if (error) + SetPageError(pv.pages[loop]); + end_page_writeback(pv.pages[loop]); + } + + __pagevec_release(&pv); + } while (first < last); + + _leave(""); +} + +/* + * synchronously write back the locked page and any subsequent non-locked dirty + * pages also covered by the same writeback record + */ +static int afs_write_back_from_locked_page(struct afs_writeback *wb, + struct page *primary_page) +{ + struct page *pages[8], *page; + unsigned long count; + unsigned n, offset, to; + pgoff_t start, first, last; + int loop, ret; + + _enter(",%lx", primary_page->index); + + count = 1; + if (!clear_page_dirty_for_io(primary_page)) + BUG(); + if (test_set_page_writeback(primary_page)) + BUG(); + + /* find all consecutive lockable dirty pages, stopping when we find a + * page that is not immediately lockable, is not dirty or is missing, + * or we reach the end of the range */ + start = primary_page->index; + if (start >= wb->last) + goto no_more; + start++; + do { + _debug("more %lx [%lx]", start, count); + n = wb->last - start + 1; + if (n > ARRAY_SIZE(pages)) + n = ARRAY_SIZE(pages); + n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, + start, n, pages); + _debug("fgpc %u", n); + if (n == 0) + goto no_more; + if (pages[0]->index != start) { + do { + put_page(pages[--n]); + } while (n > 0); + goto no_more; + } + + for (loop = 0; loop < n; loop++) { + page = pages[loop]; + if (page->index > wb->last) + break; + if (!trylock_page(page)) + break; + if (!PageDirty(page) || + page_private(page) != (unsigned long) wb) { + unlock_page(page); + break; + } + if (!clear_page_dirty_for_io(page)) + BUG(); + if (test_set_page_writeback(page)) + BUG(); + unlock_page(page); + put_page(page); + } + count += loop; + if (loop < n) { + for (; loop < n; loop++) + put_page(pages[loop]); + goto no_more; + } + + start += loop; + } while (start <= wb->last && count < 65536); + +no_more: + /* we now have a contiguous set of dirty pages, each with writeback set + * and the dirty mark cleared; the first page is locked and must remain + * so, all the rest are unlocked */ + first = primary_page->index; + last = first + count - 1; + + offset = (first == wb->first) ? wb->offset_first : 0; + to = (last == wb->last) ? wb->to_last : PAGE_SIZE; + + _debug("write back %lx[%u..] 
to %lx[..%u]", first, offset, last, to); + + ret = afs_vnode_store_data(wb, first, last, offset, to); + if (ret < 0) { + switch (ret) { + case -EDQUOT: + case -ENOSPC: + set_bit(AS_ENOSPC, + &wb->vnode->vfs_inode.i_mapping->flags); + break; + case -EROFS: + case -EIO: + case -EREMOTEIO: + case -EFBIG: + case -ENOENT: + case -ENOMEDIUM: + case -ENXIO: + afs_kill_pages(wb->vnode, true, first, last); + set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); + break; + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + afs_kill_pages(wb->vnode, false, first, last); + break; + default: + break; + } + } else { + ret = count; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * write a page back to the server + * - the caller locked the page for us + */ +int afs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct afs_writeback *wb; + int ret; + + _enter("{%lx},", page->index); + + wb = (struct afs_writeback *) page_private(page); + ASSERT(wb != NULL); + + ret = afs_write_back_from_locked_page(wb, page); + unlock_page(page); + if (ret < 0) { + _leave(" = %d", ret); + return 0; + } + + wbc->nr_to_write -= ret; + + _leave(" = 0"); + return 0; +} + +/* + * write a region of pages back to the server + */ +static int afs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + pgoff_t index, pgoff_t end, pgoff_t *_next) +{ + struct afs_writeback *wb; + struct page *page; + int ret, n; + + _enter(",,%lx,%lx,", index, end); + + do { + n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!n) + break; + + _debug("wback %lx", page->index); + + if (page->index > end) { + *_next = index; + page_cache_release(page); + _leave(" = 0 [%lx]", *_next); + return 0; + } + + /* at this point we hold neither mapping->tree_lock nor lock on + * the page itself: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled back from + * swapper_space to tmpfs file mapping + */ + lock_page(page); + + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || !PageDirty(page)) { + unlock_page(page); + continue; + } + + wb = (struct afs_writeback *) page_private(page); + ASSERT(wb != NULL); + + spin_lock(&wb->vnode->writeback_lock); + wb->state = AFS_WBACK_WRITING; + spin_unlock(&wb->vnode->writeback_lock); + + ret = afs_write_back_from_locked_page(wb, page); + unlock_page(page); + page_cache_release(page); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } + + wbc->nr_to_write -= ret; + + cond_resched(); + } while (index < end && wbc->nr_to_write > 0); + + *_next = index; + _leave(" = 0 [%lx]", *_next); + return 0; +} + +/* + * write some of the pending data back to the server + */ +int afs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + pgoff_t start, end, next; + int ret; + + _enter(""); + + if (wbc->range_cyclic) { + start = mapping->writeback_index; + end = -1; + ret = afs_writepages_region(mapping, wbc, start, end, &next); + if (start > 0 && wbc->nr_to_write > 0 && ret == 0) + ret = afs_writepages_region(mapping, wbc, 0, start, + &next); + mapping->writeback_index = next; + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); + ret = afs_writepages_region(mapping, wbc, 0, end, &next); + if (wbc->nr_to_write > 
0) + mapping->writeback_index = next; + } else { + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + ret = afs_writepages_region(mapping, wbc, start, end, &next); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * completion of write to server + */ +void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) +{ + struct afs_writeback *wb = call->wb; + struct pagevec pv; + unsigned count, loop; + pgoff_t first = call->first, last = call->last; + bool free_wb; + + _enter("{%x:%u},{%lx-%lx}", + vnode->fid.vid, vnode->fid.vnode, first, last); + + ASSERT(wb != NULL); + + pagevec_init(&pv, 0); + + do { + _debug("done %lx-%lx", first, last); + + count = last - first + 1; + if (count > PAGEVEC_SIZE) + count = PAGEVEC_SIZE; + pv.nr = find_get_pages_contig(call->mapping, first, count, + pv.pages); + ASSERTCMP(pv.nr, ==, count); + + spin_lock(&vnode->writeback_lock); + for (loop = 0; loop < count; loop++) { + struct page *page = pv.pages[loop]; + end_page_writeback(page); + if (page_private(page) == (unsigned long) wb) { + set_page_private(page, 0); + ClearPagePrivate(page); + wb->usage--; + } + } + free_wb = false; + if (wb->usage == 0) { + afs_unlink_writeback(wb); + free_wb = true; + } + spin_unlock(&vnode->writeback_lock); + first += count; + if (free_wb) { + afs_free_writeback(wb); + wb = NULL; + } + + __pagevec_release(&pv); + } while (first <= last); + + _leave(""); +} + +/* + * write to an AFS file + */ +ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct dentry *dentry = iocb->ki_filp->f_path.dentry; + struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + ssize_t result; + size_t count = iov_length(iov, nr_segs); + + _enter("{%x.%u},{%zu},%lu,", + vnode->fid.vid, vnode->fid.vnode, count, nr_segs); + + if (IS_SWAPFILE(&vnode->vfs_inode)) { + printk(KERN_INFO + "AFS: Attempt to write to active swap file!\n"); + return -EBUSY; + } + + if (!count) + return 0; + + result = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (IS_ERR_VALUE(result)) { + _leave(" = %zd", result); + return result; + } + + _leave(" = %zd", result); + return result; +} + +/* + * flush the vnode to the fileserver + */ +int afs_writeback_all(struct afs_vnode *vnode) +{ + struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_cyclic = 1, + }; + int ret; + + _enter(""); + + ret = mapping->a_ops->writepages(mapping, &wbc); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + _leave(" = %d", ret); + return ret; +} + +/* + * flush any dirty pages for this process, and check for write errors. + * - the return status from this call provides a reliable indication of + * whether any write errors occurred for this process. 
+ */ +int afs_fsync(struct file *file, int datasync) +{ + struct dentry *dentry = file->f_path.dentry; + struct afs_writeback *wb, *xwb; + struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + int ret; + + _enter("{%x:%u},{n=%s},%d", + vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + datasync); + + /* use a writeback record as a marker in the queue - when this reaches + * the front of the queue, all the outstanding writes are either + * completed or rejected */ + wb = kzalloc(sizeof(*wb), GFP_KERNEL); + if (!wb) + return -ENOMEM; + wb->vnode = vnode; + wb->first = 0; + wb->last = -1; + wb->offset_first = 0; + wb->to_last = PAGE_SIZE; + wb->usage = 1; + wb->state = AFS_WBACK_SYNCING; + init_waitqueue_head(&wb->waitq); + + spin_lock(&vnode->writeback_lock); + list_for_each_entry(xwb, &vnode->writebacks, link) { + if (xwb->state == AFS_WBACK_PENDING) + xwb->state = AFS_WBACK_CONFLICTING; + } + list_add_tail(&wb->link, &vnode->writebacks); + spin_unlock(&vnode->writeback_lock); + + /* push all the outstanding writebacks to the server */ + ret = afs_writeback_all(vnode); + if (ret < 0) { + afs_put_writeback(wb); + _leave(" = %d [wb]", ret); + return ret; + } + + /* wait for the preceding writes to actually complete */ + ret = wait_event_interruptible(wb->waitq, + wb->state == AFS_WBACK_COMPLETE || + vnode->writebacks.next == &wb->link); + afs_put_writeback(wb); + _leave(" = %d", ret); + return ret; +} + +/* + * notification that a previously read-only page is about to become writable + * - if it returns an error, the caller will deliver a bus error signal + */ +int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); + + _enter("{{%x:%u}},{%lx}", + vnode->fid.vid, vnode->fid.vnode, page->index); + + /* wait for the page to be written to the cache before we allow it to + * be modified */ +#ifdef CONFIG_AFS_FSCACHE + fscache_wait_on_page_write(vnode->cache, page); +#endif + + _leave(" = 0"); + return 0; +} diff --git a/fs/aio.c b/fs/aio.c new file mode 100644 index 00000000..278ed7dc --- /dev/null +++ b/fs/aio.c @@ -0,0 +1,1795 @@ +/* + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements an efficient asynchronous io interface. + * + * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * + * See ../COPYING for licensing terms. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG 0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if DEBUG > 1 +#define dprintk printk +#else +#define dprintk(x...) do { ; } while (0) +#endif + +/*------ sysctl variables----*/ +static DEFINE_SPINLOCK(aio_nr_lock); +unsigned long aio_nr; /* current system wide number of aio requests */ +unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +/*----end sysctl variables---*/ + +static struct kmem_cache *kiocb_cachep; +static struct kmem_cache *kioctx_cachep; + +static struct workqueue_struct *aio_wq; + +/* Used for rare fput completion. 
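 * When an iocb holds the last reference to its struct file, __aio_put_req()
 * cannot drop it directly (it runs under ctx_lock, possibly from completion
 * context), so the request is parked on fput_head and fput_work runs
 * aio_fput_routine() to do the fput() and the final really_put_req() from
 * process context.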
*/ +static void aio_fput_routine(struct work_struct *); +static DECLARE_WORK(fput_work, aio_fput_routine); + +static DEFINE_SPINLOCK(fput_lock); +static LIST_HEAD(fput_head); + +static void aio_kick_handler(struct work_struct *); +static void aio_queue_work(struct kioctx *); + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. + */ +static int __init aio_setup(void) +{ + kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); + + aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ + BUG_ON(!aio_wq); + + pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + + return 0; +} +__initcall(aio_setup); + +static void aio_free_ring(struct kioctx *ctx) +{ + struct aio_ring_info *info = &ctx->ring_info; + long i; + + for (i=0; i<info->nr_pages; i++) + put_page(info->ring_pages[i]); + + if (info->mmap_size) { + down_write(&ctx->mm->mmap_sem); + do_munmap(ctx->mm, info->mmap_base, info->mmap_size); + up_write(&ctx->mm->mmap_sem); + } + + if (info->ring_pages && info->ring_pages != info->internal_pages) + kfree(info->ring_pages); + info->ring_pages = NULL; + info->nr = 0; +} + +static int aio_setup_ring(struct kioctx *ctx) +{ + struct aio_ring *ring; + struct aio_ring_info *info = &ctx->ring_info; + unsigned nr_events = ctx->max_reqs; + unsigned long size; + int nr_pages; + + /* Compensate for the ring buffer's head/tail overlap entry */ + nr_events += 2; /* 1 is required, 2 for good luck */ + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_events; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages < 0) + return -EINVAL; + + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + } + + info->mmap_size = nr_pages * PAGE_SIZE; + dprintk("attempting mmap of %lu bytes\n", info->mmap_size); + down_write(&ctx->mm->mmap_sem); + info->mmap_base = do_mmap(NULL, 0, info->mmap_size, + PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, + 0); + if (IS_ERR((void *)info->mmap_base)) { + up_write(&ctx->mm->mmap_sem); + info->mmap_size = 0; + aio_free_ring(ctx); + return -EAGAIN; + } + + dprintk("mmap address: 0x%08lx\n", info->mmap_base); + info->nr_pages = get_user_pages(current, ctx->mm, + info->mmap_base, nr_pages, + 1, 0, info->ring_pages, NULL); + up_write(&ctx->mm->mmap_sem); + + if (unlikely(info->nr_pages != nr_pages)) { + aio_free_ring(ctx); + return -EAGAIN; + } + + ctx->user_id = info->mmap_base; + + info->nr = nr_events; /* trusted copy */ + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring->nr = nr_events; /* user copy */ + ring->id = ctx->user_id; + ring->head = ring->tail = 0; + ring->magic = AIO_RING_MAGIC; + ring->compat_features = AIO_RING_COMPAT_FEATURES; + ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; + ring->header_length = sizeof(struct aio_ring); + kunmap_atomic(ring, KM_USER0); + + return 0; +} + + +/* aio_ring_event: returns a pointer to the event at the given index from + * kmap_atomic(, km). 
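 * As a worked example (assuming 4 KiB pages; struct aio_ring and
 * struct io_event are 32 bytes each): AIO_EVENTS_FIRST_PAGE = 127,
 * AIO_EVENTS_PER_PAGE = 128 and AIO_EVENTS_OFFSET = 1, so event nr 126
 * occupies the last slot of ring page 0 and event nr 127 maps to slot 0
 * of ring page 1.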
Release the pointer with put_aio_ring_event(); + */ +#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) +#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) + +#define aio_ring_event(info, nr, km) ({ \ + unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ + struct io_event *__event; \ + __event = kmap_atomic( \ + (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ + __event += pos % AIO_EVENTS_PER_PAGE; \ + __event; \ +}) + +#define put_aio_ring_event(event, km) do { \ + struct io_event *__event = (event); \ + (void)__event; \ + kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ +} while(0) + +static void ctx_rcu_free(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + unsigned nr_events = ctx->max_reqs; + + kmem_cache_free(kioctx_cachep, ctx); + + if (nr_events) { + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - nr_events > aio_nr); + aio_nr -= nr_events; + spin_unlock(&aio_nr_lock); + } +} + +/* __put_ioctx + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. + */ +static void __put_ioctx(struct kioctx *ctx) +{ + BUG_ON(ctx->reqs_active); + + cancel_delayed_work(&ctx->wq); + cancel_work_sync(&ctx->wq.work); + aio_free_ring(ctx); + mmdrop(ctx->mm); + ctx->mm = NULL; + pr_debug("__put_ioctx: freeing %p\n", ctx); + call_rcu(&ctx->rcu_head, ctx_rcu_free); +} + +static inline int try_get_ioctx(struct kioctx *kioctx) +{ + return atomic_inc_not_zero(&kioctx->users); +} + +static inline void put_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + if (unlikely(atomic_dec_and_test(&kioctx->users))) + __put_ioctx(kioctx); +} + +/* ioctx_alloc + * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. + */ +static struct kioctx *ioctx_alloc(unsigned nr_events) +{ + struct mm_struct *mm; + struct kioctx *ctx; + int did_sync = 0; + + /* Prevent overflows */ + if ((nr_events > (0x10000000U / sizeof(struct io_event))) || + (nr_events > (0x10000000U / sizeof(struct kiocb)))) { + pr_debug("ENOMEM: nr_events too high\n"); + return ERR_PTR(-EINVAL); + } + + if ((unsigned long)nr_events > aio_max_nr) + return ERR_PTR(-EAGAIN); + + ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + ctx->max_reqs = nr_events; + mm = ctx->mm = current->mm; + atomic_inc(&mm->mm_count); + + atomic_set(&ctx->users, 2); + spin_lock_init(&ctx->ctx_lock); + spin_lock_init(&ctx->ring_info.ring_lock); + init_waitqueue_head(&ctx->wait); + + INIT_LIST_HEAD(&ctx->active_reqs); + INIT_LIST_HEAD(&ctx->run_list); + INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); + + if (aio_setup_ring(ctx) < 0) + goto out_freectx; + + /* limit the number of system wide aios */ + do { + spin_lock_bh(&aio_nr_lock); + if (aio_nr + nr_events > aio_max_nr || + aio_nr + nr_events < aio_nr) + ctx->max_reqs = 0; + else + aio_nr += ctx->max_reqs; + spin_unlock_bh(&aio_nr_lock); + if (ctx->max_reqs || did_sync) + break; + + /* wait for rcu callbacks to have completed before giving up */ + synchronize_rcu(); + did_sync = 1; + ctx->max_reqs = nr_events; + } while (1); + + if (ctx->max_reqs == 0) + goto out_cleanup; + + /* now link into global list. 
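 * (the mm->ioctx_list hlist: lookup_ioctx() walks it under RCU, hence
 * hlist_add_head_rcu() here and the call_rcu() free in __put_ioctx())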
*/ + spin_lock(&mm->ioctx_lock); + hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); + spin_unlock(&mm->ioctx_lock); + + dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, current->mm, ctx->ring_info.nr); + return ctx; + +out_cleanup: + __put_ioctx(ctx); + return ERR_PTR(-EAGAIN); + +out_freectx: + mmdrop(mm); + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + + dprintk("aio: error allocating ioctx %p\n", ctx); + return ctx; +} + +/* aio_cancel_all + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. + */ +static void aio_cancel_all(struct kioctx *ctx) +{ + int (*cancel)(struct kiocb *, struct io_event *); + struct io_event res; + spin_lock_irq(&ctx->ctx_lock); + ctx->dead = 1; + while (!list_empty(&ctx->active_reqs)) { + struct list_head *pos = ctx->active_reqs.next; + struct kiocb *iocb = list_kiocb(pos); + list_del_init(&iocb->ki_list); + cancel = iocb->ki_cancel; + kiocbSetCancelled(iocb); + if (cancel) { + iocb->ki_users++; + spin_unlock_irq(&ctx->ctx_lock); + cancel(iocb, &res); + spin_lock_irq(&ctx->ctx_lock); + } + } + spin_unlock_irq(&ctx->ctx_lock); +} + +static void wait_for_all_aios(struct kioctx *ctx) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + spin_lock_irq(&ctx->ctx_lock); + if (!ctx->reqs_active) + goto out; + + add_wait_queue(&ctx->wait, &wait); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (ctx->reqs_active) { + spin_unlock_irq(&ctx->ctx_lock); + io_schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + spin_lock_irq(&ctx->ctx_lock); + } + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + +out: + spin_unlock_irq(&ctx->ctx_lock); +} + +/* wait_on_sync_kiocb: + * Waits on the given sync kiocb to complete. + */ +ssize_t wait_on_sync_kiocb(struct kiocb *iocb) +{ + while (iocb->ki_users) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!iocb->ki_users) + break; + io_schedule(); + } + __set_current_state(TASK_RUNNING); + return iocb->ki_user_data; +} +EXPORT_SYMBOL(wait_on_sync_kiocb); + +/* exit_aio: called when the last user of mm goes away. At this point, + * there is no way for any new requests to be submited or any of the + * io_* syscalls to be called on the context. However, there may be + * outstanding requests which hold references to the context; as they + * go away, they will call put_ioctx and release any pinned memory + * associated with the request (held via struct page * references). + */ +void exit_aio(struct mm_struct *mm) +{ + struct kioctx *ctx; + + while (!hlist_empty(&mm->ioctx_list)) { + ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); + hlist_del_rcu(&ctx->list); + + aio_cancel_all(ctx); + + wait_for_all_aios(ctx); + /* + * Ensure we don't leave the ctx on the aio_wq + */ + cancel_work_sync(&ctx->wq.work); + + if (1 != atomic_read(&ctx->users)) + printk(KERN_DEBUG + "exit_aio:ioctx still alive: %d %d %d\n", + atomic_read(&ctx->users), ctx->dead, + ctx->reqs_active); + put_ioctx(ctx); + } +} + +/* aio_get_req + * Allocate a slot for an aio request. Increments the users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns NULL if no requests are free. + * + * Returns with kiocb->users set to 2. The io submit code path holds + * an extra reference while submitting the i/o. 
+ * This prevents races between the aio code path referencing the + * req (after submitting it) and aio_complete() freeing the req. + */ +static struct kiocb *__aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req = NULL; + struct aio_ring *ring; + int okay = 0; + + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); + if (unlikely(!req)) + return NULL; + + req->ki_flags = 0; + req->ki_users = 2; + req->ki_key = 0; + req->ki_ctx = ctx; + req->ki_cancel = NULL; + req->ki_retry = NULL; + req->ki_dtor = NULL; + req->private = NULL; + req->ki_iovec = NULL; + INIT_LIST_HEAD(&req->ki_run_list); + req->ki_eventfd = NULL; + + /* Check if the completion queue has enough free space to + * accept an event from this io. + */ + spin_lock_irq(&ctx->ctx_lock); + ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); + if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { + list_add(&req->ki_list, &ctx->active_reqs); + ctx->reqs_active++; + okay = 1; + } + kunmap_atomic(ring, KM_USER0); + spin_unlock_irq(&ctx->ctx_lock); + + if (!okay) { + kmem_cache_free(kiocb_cachep, req); + req = NULL; + } + + return req; +} + +static inline struct kiocb *aio_get_req(struct kioctx *ctx) +{ + struct kiocb *req; + /* Handle a potential starvation case -- should be exceedingly rare as + * requests will be stuck on fput_head only if the aio_fput_routine is + * delayed and the requests were the last user of the struct file. + */ + req = __aio_get_req(ctx); + if (unlikely(NULL == req)) { + aio_fput_routine(NULL); + req = __aio_get_req(ctx); + } + return req; +} + +static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +{ + assert_spin_locked(&ctx->ctx_lock); + + if (req->ki_eventfd != NULL) + eventfd_ctx_put(req->ki_eventfd); + if (req->ki_dtor) + req->ki_dtor(req); + if (req->ki_iovec != &req->ki_inline_vec) + kfree(req->ki_iovec); + kmem_cache_free(kiocb_cachep, req); + ctx->reqs_active--; + + if (unlikely(!ctx->reqs_active && ctx->dead)) + wake_up_all(&ctx->wait); +} + +static void aio_fput_routine(struct work_struct *data) +{ + spin_lock_irq(&fput_lock); + while (likely(!list_empty(&fput_head))) { + struct kiocb *req = list_kiocb(fput_head.next); + struct kioctx *ctx = req->ki_ctx; + + list_del(&req->ki_list); + spin_unlock_irq(&fput_lock); + + /* Complete the fput(s) */ + if (req->ki_filp != NULL) + fput(req->ki_filp); + + /* Link the iocb into the context's free list */ + rcu_read_lock(); + spin_lock_irq(&ctx->ctx_lock); + really_put_req(ctx, req); + /* + * at that point ctx might've been killed, but actual + * freeing is RCU'd + */ + spin_unlock_irq(&ctx->ctx_lock); + rcu_read_unlock(); + + spin_lock_irq(&fput_lock); + } + spin_unlock_irq(&fput_lock); +} + +/* __aio_put_req + * Returns true if this put was the last user of the request. + */ +static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +{ + dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", + req, atomic_long_read(&req->ki_filp->f_count)); + + assert_spin_locked(&ctx->ctx_lock); + + req->ki_users--; + BUG_ON(req->ki_users < 0); + if (likely(req->ki_users)) + return 0; + list_del(&req->ki_list); /* remove from active_reqs */ + req->ki_cancel = NULL; + req->ki_retry = NULL; + + /* + * Try to optimize the aio and eventfd file* puts, by avoiding to + * schedule work in case it is not final fput() time. In normal cases, + * we would not be holding the last reference to the file*, so + * this function will be executed w/out any aio kthread wakeup. 
+ */ + if (unlikely(!fput_atomic(req->ki_filp))) { + spin_lock(&fput_lock); + list_add(&req->ki_list, &fput_head); + spin_unlock(&fput_lock); + schedule_work(&fput_work); + } else { + req->ki_filp = NULL; + really_put_req(ctx, req); + } + return 1; +} + +/* aio_put_req + * Returns true if this put was the last user of the kiocb, + * false if the request is still in use. + */ +int aio_put_req(struct kiocb *req) +{ + struct kioctx *ctx = req->ki_ctx; + int ret; + spin_lock_irq(&ctx->ctx_lock); + ret = __aio_put_req(ctx, req); + spin_unlock_irq(&ctx->ctx_lock); + return ret; +} +EXPORT_SYMBOL(aio_put_req); + +static struct kioctx *lookup_ioctx(unsigned long ctx_id) +{ + struct mm_struct *mm = current->mm; + struct kioctx *ctx, *ret = NULL; + struct hlist_node *n; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { + /* + * RCU protects us against accessing freed memory but + * we have to be careful not to get a reference when the + * reference count already dropped to 0 (ctx->dead test + * is unreliable because of races). + */ + if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ + ret = ctx; + break; + } + } + + rcu_read_unlock(); + return ret; +} + +/* + * Queue up a kiocb to be retried. Assumes that the kiocb + * has already been marked as kicked, and places it on + * the retry run list for the corresponding ioctx, if it + * isn't already queued. Returns 1 if it actually queued + * the kiocb (to tell the caller to activate the work + * queue to process it), or 0, if it found that it was + * already queued. + */ +static inline int __queue_kicked_iocb(struct kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + + assert_spin_locked(&ctx->ctx_lock); + + if (list_empty(&iocb->ki_run_list)) { + list_add_tail(&iocb->ki_run_list, + &ctx->run_list); + return 1; + } + return 0; +} + +/* aio_run_iocb + * This is the core aio execution routine. It is + * invoked both for initial i/o submission and + * subsequent retries via the aio_kick_handler. + * Expects to be invoked with iocb->ki_ctx->lock + * already held. The lock is released and reacquired + * as needed during processing. + * + * Calls the iocb retry method (already setup for the + * iocb on initial submission) for operation specific + * handling, but takes care of most of common retry + * execution details for a given iocb. The retry method + * needs to be non-blocking as far as possible, to avoid + * holding up other iocbs waiting to be serviced by the + * retry kernel thread. + * + * The trickier parts in this code have to do with + * ensuring that only one retry instance is in progress + * for a given iocb at any time. Providing that guarantee + * simplifies the coding of individual aio operations as + * it avoids various potential races. + */ +static ssize_t aio_run_iocb(struct kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + ssize_t (*retry)(struct kiocb *); + ssize_t ret; + + if (!(retry = iocb->ki_retry)) { + printk("aio_run_iocb: iocb->ki_retry = NULL\n"); + return 0; + } + + /* + * We don't want the next retry iteration for this + * operation to start until this one has returned and + * updated the iocb state. However, wait_queue functions + * can trigger a kick_iocb from interrupt context in the + * meantime, indicating that data is available for the next + * iteration. We want to remember that and enable the + * next retry iteration _after_ we are through with + * this one. 
+ * + * So, in order to be able to register a "kick", but + * prevent it from being queued now, we clear the kick + * flag, but make the kick code *think* that the iocb is + * still on the run list until we are actually done. + * When we are done with this iteration, we check if + * the iocb was kicked in the meantime and if so, queue + * it up afresh. + */ + + kiocbClearKicked(iocb); + + /* + * This is so that aio_complete knows it doesn't need to + * pull the iocb off the run list (We can't just call + * INIT_LIST_HEAD because we don't want a kick_iocb to + * queue this on the run list yet) + */ + iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; + spin_unlock_irq(&ctx->ctx_lock); + + /* Quit retrying if the i/o has been cancelled */ + if (kiocbIsCancelled(iocb)) { + ret = -EINTR; + aio_complete(iocb, ret, 0); + /* must not access the iocb after this */ + goto out; + } + + /* + * Now we are all set to call the retry method in async + * context. + */ + ret = retry(iocb); + + if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(iocb, ret, 0); + } +out: + spin_lock_irq(&ctx->ctx_lock); + + if (-EIOCBRETRY == ret) { + /* + * OK, now that we are done with this iteration + * and know that there is more left to go, + * this is where we let go so that a subsequent + * "kick" can start the next iteration + */ + + /* will make __queue_kicked_iocb succeed from here on */ + INIT_LIST_HEAD(&iocb->ki_run_list); + /* we must queue the next iteration ourselves, if it + * has already been kicked */ + if (kiocbIsKicked(iocb)) { + __queue_kicked_iocb(iocb); + + /* + * __queue_kicked_iocb will always return 1 here, because + * iocb->ki_run_list is empty at this point so it should + * be safe to unconditionally queue the context into the + * work queue. + */ + aio_queue_work(ctx); + } + } + return ret; +} + +/* + * __aio_run_iocbs: + * Process all pending retries queued on the ioctx + * run list. + * Assumes it is operating within the aio issuer's mm + * context. + */ +static int __aio_run_iocbs(struct kioctx *ctx) +{ + struct kiocb *iocb; + struct list_head run_list; + + assert_spin_locked(&ctx->ctx_lock); + + list_replace_init(&ctx->run_list, &run_list); + while (!list_empty(&run_list)) { + iocb = list_entry(run_list.next, struct kiocb, + ki_run_list); + list_del(&iocb->ki_run_list); + /* + * Hold an extra reference while retrying i/o. + */ + iocb->ki_users++; /* grab extra reference */ + aio_run_iocb(iocb); + __aio_put_req(ctx, iocb); + } + if (!list_empty(&ctx->run_list)) + return 1; + return 0; +} + +static void aio_queue_work(struct kioctx * ctx) +{ + unsigned long timeout; + /* + * if someone is waiting, get the work started right + * away, otherwise, use a longer delay + */ + smp_mb(); + if (waitqueue_active(&ctx->wait)) + timeout = 1; + else + timeout = HZ/10; + queue_delayed_work(aio_wq, &ctx->wq, timeout); +} + +/* + * aio_run_all_iocbs: + * Process all pending retries queued on the ioctx + * run list, and keep running them until the list + * stays empty. + * Assumes it is operating within the aio issuer's mm context. 
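 * (read_events() calls this once, before it starts waiting, to flush
 * any retries already queued on ctx->run_list.)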
+ */ +static inline void aio_run_all_iocbs(struct kioctx *ctx) +{ + spin_lock_irq(&ctx->ctx_lock); + while (__aio_run_iocbs(ctx)) + ; + spin_unlock_irq(&ctx->ctx_lock); +} + +/* + * aio_kick_handler: + * Work queue handler triggered to process pending + * retries on an ioctx. Takes on the aio issuer's + * mm context before running the iocbs, so that + * copy_xxx_user operates on the issuer's address + * space. + * Run on aiod's context. + */ +static void aio_kick_handler(struct work_struct *work) +{ + struct kioctx *ctx = container_of(work, struct kioctx, wq.work); + mm_segment_t oldfs = get_fs(); + struct mm_struct *mm; + int requeue; + + set_fs(USER_DS); + use_mm(ctx->mm); + spin_lock_irq(&ctx->ctx_lock); + requeue =__aio_run_iocbs(ctx); + mm = ctx->mm; + spin_unlock_irq(&ctx->ctx_lock); + unuse_mm(mm); + set_fs(oldfs); + /* + * we're in a worker thread already, don't use queue_delayed_work, + */ + if (requeue) + queue_delayed_work(aio_wq, &ctx->wq, 0); +} + + +/* + * Called by kick_iocb to queue the kiocb for retry + * and if required activate the aio work queue to process + * it + */ +static void try_queue_kicked_iocb(struct kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + unsigned long flags; + int run = 0; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + /* set this inside the lock so that we can't race with aio_run_iocb() + * testing it and putting the iocb on the run list under the lock */ + if (!kiocbTryKick(iocb)) + run = __queue_kicked_iocb(iocb); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + if (run) + aio_queue_work(ctx); +} + +/* + * kick_iocb: + * Called typically from a wait queue callback context + * to trigger a retry of the iocb. + * The retry is usually executed by aio workqueue + * threads (See aio_kick_handler). + */ +void kick_iocb(struct kiocb *iocb) +{ + /* sync iocbs are easy: they can only ever be executing from a + * single context. */ + if (is_sync_kiocb(iocb)) { + kiocbSetKicked(iocb); + wake_up_process(iocb->ki_obj.tsk); + return; + } + + try_queue_kicked_iocb(iocb); +} +EXPORT_SYMBOL(kick_iocb); + +/* aio_complete + * Called when the io request on the given iocb is complete. + * Returns true if this is the last user of the request. The + * only other user of the request can be the cancellation code. + */ +int aio_complete(struct kiocb *iocb, long res, long res2) +{ + struct kioctx *ctx = iocb->ki_ctx; + struct aio_ring_info *info; + struct aio_ring *ring; + struct io_event *event; + unsigned long flags; + unsigned long tail; + int ret; + + /* + * Special case handling for sync iocbs: + * - events go directly into the iocb for fast handling + * - the sync task with the iocb in its stack holds the single iocb + * ref, no other paths have a way to get another ref + * - the sync task helpfully left a reference to itself in the iocb + */ + if (is_sync_kiocb(iocb)) { + BUG_ON(iocb->ki_users != 1); + iocb->ki_user_data = res; + iocb->ki_users = 0; + wake_up_process(iocb->ki_obj.tsk); + return 1; + } + + info = &ctx->ring_info; + + /* add a completion event to the ring buffer. + * must be done holding ctx->ctx_lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. + */ + spin_lock_irqsave(&ctx->ctx_lock, flags); + + if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) + list_del_init(&iocb->ki_run_list); + + /* + * cancelled requests don't get events, userland was given one + * when the event got cancelled. 
+ */ + if (kiocbIsCancelled(iocb)) + goto put_rq; + + ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); + + tail = info->tail; + event = aio_ring_event(info, tail, KM_IRQ0); + if (++tail >= info->nr) + tail = 0; + + event->obj = (u64)(unsigned long)iocb->ki_obj.user; + event->data = iocb->ki_user_data; + event->res = res; + event->res2 = res2; + + dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, + res, res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + smp_wmb(); /* make event visible before updating tail */ + + info->tail = tail; + ring->tail = tail; + + put_aio_ring_event(event, KM_IRQ0); + kunmap_atomic(ring, KM_IRQ1); + + pr_debug("added to ring %p at [%lu]\n", iocb, tail); + + /* + * Check if the user asked us to deliver the result through an + * eventfd. The eventfd_signal() function is safe to be called + * from IRQ context. + */ + if (iocb->ki_eventfd != NULL) + eventfd_signal(iocb->ki_eventfd, 1); + +put_rq: + /* everything turned out well, dispose of the aiocb. */ + ret = __aio_put_req(ctx, iocb); + + /* + * We have to order our ring_info tail store above and test + * of the wait list below outside the wait lock. This is + * like in wake_up_bit() where clearing a bit has to be + * ordered with the unlocked test. + */ + smp_mb(); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + return ret; +} +EXPORT_SYMBOL(aio_complete); + +/* aio_read_evt + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched (0 or 1 ;-) + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). + */ +static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +{ + struct aio_ring_info *info = &ioctx->ring_info; + struct aio_ring *ring; + unsigned long head; + int ret = 0; + + ring = kmap_atomic(info->ring_pages[0], KM_USER0); + dprintk("in aio_read_evt h%lu t%lu m%lu\n", + (unsigned long)ring->head, (unsigned long)ring->tail, + (unsigned long)ring->nr); + + if (ring->head == ring->tail) + goto out; + + spin_lock(&info->ring_lock); + + head = ring->head % info->nr; + if (head != ring->tail) { + struct io_event *evp = aio_ring_event(info, head, KM_USER1); + *ent = *evp; + head = (head + 1) % info->nr; + smp_mb(); /* finish reading the event before updatng the head */ + ring->head = head; + ret = 1; + put_aio_ring_event(evp, KM_USER1); + } + spin_unlock(&info->ring_lock); + +out: + kunmap_atomic(ring, KM_USER0); + dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, + (unsigned long)ring->head, (unsigned long)ring->tail); + return ret; +} + +struct aio_timeout { + struct timer_list timer; + int timed_out; + struct task_struct *p; +}; + +static void timeout_func(unsigned long data) +{ + struct aio_timeout *to = (struct aio_timeout *)data; + + to->timed_out = 1; + wake_up_process(to->p); +} + +static inline void init_timeout(struct aio_timeout *to) +{ + setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); + to->timed_out = 0; + to->p = current; +} + +static inline void set_timeout(long start_jiffies, struct aio_timeout *to, + const struct timespec *ts) +{ + to->timer.expires = start_jiffies + timespec_to_jiffies(ts); + if (time_after(to->timer.expires, jiffies)) + add_timer(&to->timer); + else + to->timed_out = 1; +} + +static inline void clear_timeout(struct aio_timeout *to) +{ + del_singleshot_timer_sync(&to->timer); +} + +static int 
read_events(struct kioctx *ctx, + long min_nr, long nr, + struct io_event __user *event, + struct timespec __user *timeout) +{ + long start_jiffies = jiffies; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + int ret; + int i = 0; + struct io_event ent; + struct aio_timeout to; + int retry = 0; + + /* needed to zero any padding within an entry (there shouldn't be + * any, but C is fun! + */ + memset(&ent, 0, sizeof(ent)); +retry: + ret = 0; + while (likely(i < nr)) { + ret = aio_read_evt(ctx, &ent); + if (unlikely(ret <= 0)) + break; + + dprintk("read event: %Lx %Lx %Lx %Lx\n", + ent.data, ent.obj, ent.res, ent.res2); + + /* Could we split the check in two? */ + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + ret = 0; + + /* Good, event copied to userland, update counts. */ + event ++; + i ++; + } + + if (min_nr <= i) + return i; + if (ret) + return ret; + + /* End fast path */ + + /* racey check, but it gets redone */ + if (!retry && unlikely(!list_empty(&ctx->run_list))) { + retry = 1; + aio_run_all_iocbs(ctx); + goto retry; + } + + init_timeout(&to); + if (timeout) { + struct timespec ts; + ret = -EFAULT; + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + goto out; + + set_timeout(start_jiffies, &to, &ts); + } + + while (likely(i < nr)) { + add_wait_queue_exclusive(&ctx->wait, &wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + ret = aio_read_evt(ctx, &ent); + if (ret) + break; + if (min_nr <= i) + break; + if (unlikely(ctx->dead)) { + ret = -EINVAL; + break; + } + if (to.timed_out) /* Only check after read evt */ + break; + /* Try to only show up in io wait if there are ops + * in flight */ + if (ctx->reqs_active) + io_schedule(); + else + schedule(); + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + /*ret = aio_read_evt(ctx, &ent);*/ + } while (1) ; + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + + if (unlikely(ret <= 0)) + break; + + ret = -EFAULT; + if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { + dprintk("aio: lost an event due to EFAULT.\n"); + break; + } + + /* Good, event copied to userland, update counts. */ + event ++; + i ++; + } + + if (timeout) + clear_timeout(&to); +out: + destroy_timer_on_stack(&to.timer); + return i ? i : ret; +} + +/* Take an ioctx and remove it from the list of ioctx's. Protects + * against races with itself via ->dead. + */ +static void io_destroy(struct kioctx *ioctx) +{ + struct mm_struct *mm = current->mm; + int was_dead; + + /* delete the entry from the list is someone else hasn't already */ + spin_lock(&mm->ioctx_lock); + was_dead = ioctx->dead; + ioctx->dead = 1; + hlist_del_rcu(&ioctx->list); + spin_unlock(&mm->ioctx_lock); + + dprintk("aio_release(%p)\n", ioctx); + if (likely(!was_dead)) + put_ioctx(ioctx); /* twice for the list */ + + aio_cancel_all(ioctx); + wait_for_all_aios(ioctx); + + /* + * Wake up any waiters. The setting of ctx->dead must be seen + * by other CPUs at this point. Right now, we rely on the + * locking done by the above calls to ensure this consistency. + */ + wake_up_all(&ioctx->wait); + put_ioctx(ioctx); /* once for the lookup */ +} + +/* sys_io_setup: + * Create an aio_context capable of receiving at least nr_events. + * ctxp must not point to an aio_context that already exists, and + * must be initialized to 0 prior to the call. On successful + * creation of the aio_context, *ctxp is filled in with the resulting + * handle. 
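The aio_timeout helpers above are an instance of the timer-on-stack idiom. The sketch below is an illustration under the same kernel APIs, not code from this patch (bounded_wait, bounded_wait_fire and cond_is_true are invented): arm a stack timer that wakes the sleeping task, loop until the condition holds or the timer fires, then tear the timer down with del_singleshot_timer_sync() and destroy_timer_on_stack(), in the same order read_events() uses.

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/timer.h>

struct bounded_wait {
        struct timer_list timer;
        int timed_out;
        struct task_struct *p;
};

static void bounded_wait_fire(unsigned long data)
{
        struct bounded_wait *bw = (struct bounded_wait *)data;

        bw->timed_out = 1;
        wake_up_process(bw->p);
}

static int wait_with_bound(int (*cond_is_true)(void *), void *arg,
                           const struct timespec *ts)
{
        struct bounded_wait bw = { .timed_out = 0, .p = current };

        setup_timer_on_stack(&bw.timer, bounded_wait_fire, (unsigned long)&bw);
        bw.timer.expires = jiffies + timespec_to_jiffies(ts);
        add_timer(&bw.timer);

        while (!cond_is_true(arg) && !bw.timed_out) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (!cond_is_true(arg) && !bw.timed_out)
                        schedule();
                __set_current_state(TASK_RUNNING);
        }

        /* Same teardown as read_events(): sync-delete, then destroy. */
        del_singleshot_timer_sync(&bw.timer);
        destroy_timer_on_stack(&bw.timer);

        return bw.timed_out ? -ETIME : 0;
}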
May fail with -EINVAL if *ctxp is not initialized, + * if the specified nr_events exceeds internal limits. May fail + * with -EAGAIN if the specified nr_events exceeds the user's limit + * of available events. May fail with -ENOMEM if insufficient kernel + * resources are available. May fail with -EFAULT if an invalid + * pointer is passed for ctxp. Will fail with -ENOSYS if not + * implemented. + */ +SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctxp); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || nr_events == 0)) { + pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", + ctx, nr_events); + goto out; + } + + ioctx = ioctx_alloc(nr_events); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + ret = put_user(ioctx->user_id, ctxp); + if (!ret) { + put_ioctx(ioctx); + return 0; + } + io_destroy(ioctx); + } + +out: + return ret; +} + +/* sys_io_destroy: + * Destroy the aio_context specified. May cancel any outstanding + * AIOs and block on completion. Will fail with -ENOSYS if not + * implemented. May fail with -EINVAL if the context pointed to + * is invalid. + */ +SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) +{ + struct kioctx *ioctx = lookup_ioctx(ctx); + if (likely(NULL != ioctx)) { + io_destroy(ioctx); + return 0; + } + pr_debug("EINVAL: io_destroy: invalid context id\n"); + return -EINVAL; +} + +static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) +{ + struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; + + BUG_ON(ret <= 0); + + while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { + ssize_t this = min((ssize_t)iov->iov_len, ret); + iov->iov_base += this; + iov->iov_len -= this; + iocb->ki_left -= this; + ret -= this; + if (iov->iov_len == 0) { + iocb->ki_cur_seg++; + iov++; + } + } + + /* the caller should not have done more io than what fit in + * the remaining iovecs */ + BUG_ON(ret > 0 && iocb->ki_left == 0); +} + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t (*rw_op)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + ssize_t ret = 0; + unsigned short opcode; + + if ((iocb->ki_opcode == IOCB_CMD_PREADV) || + (iocb->ki_opcode == IOCB_CMD_PREAD)) { + rw_op = file->f_op->aio_read; + opcode = IOCB_CMD_PREADV; + } else { + rw_op = file->f_op->aio_write; + opcode = IOCB_CMD_PWRITEV; + } + + /* This matches the pread()/pwrite() logic */ + if (iocb->ki_pos < 0) + return -EINVAL; + + do { + ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], + iocb->ki_nr_segs - iocb->ki_cur_seg, + iocb->ki_pos); + if (ret > 0) + aio_advance_iovec(iocb, ret); + + /* retry all partial writes. retry partial reads as long as its a + * regular file. */ + } while (ret > 0 && iocb->ki_left > 0 && + (opcode == IOCB_CMD_PWRITEV || + (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); + + /* This means we must have transferred all that we could */ + /* No need to retry anymore */ + if ((ret == 0) || (iocb->ki_left == 0)) + ret = iocb->ki_nbytes - iocb->ki_left; + + /* If we managed to write some out we return that, rather than + * the eventual error. 
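From user space the syscalls above are normally reached through syscall(2), since glibc does not wrap them. Below is a minimal, self-contained sketch, not part of the patch, that exercises io_setup(), io_submit(), io_getevents() and io_destroy(); "testfile" is an arbitrary example path.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

int main(void)
{
        aio_context_t ctx = 0;          /* must be zero before io_setup() */
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        char buf[4096];
        int fd = open("testfile", O_RDONLY);

        if (fd < 0 || syscall(SYS_io_setup, 128, &ctx) < 0)
                return 1;

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_lio_opcode = IOCB_CMD_PREAD;
        cb.aio_buf = (unsigned long)buf;
        cb.aio_nbytes = sizeof(buf);
        cb.aio_offset = 0;

        if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
                return 1;

        /* Wait for exactly one completion; a NULL timeout blocks indefinitely. */
        if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
                printf("read %lld bytes\n", (long long)ev.res);

        syscall(SYS_io_destroy, ctx);
        close(fd);
        return 0;
}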
*/ + if (opcode == IOCB_CMD_PWRITEV + && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY + && iocb->ki_nbytes - iocb->ki_left) + ret = iocb->ki_nbytes - iocb->ki_left; + + return ret; +} + +static ssize_t aio_fdsync(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = -EINVAL; + + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(iocb, 1); + return ret; +} + +static ssize_t aio_fsync(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret = -EINVAL; + + if (file->f_op->aio_fsync) + ret = file->f_op->aio_fsync(iocb, 0); + return ret; +} + +static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) +{ + ssize_t ret; + +#ifdef CONFIG_COMPAT + if (compat) + ret = compat_rw_copy_check_uvector(type, + (struct compat_iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + &kiocb->ki_iovec); + else +#endif + ret = rw_copy_check_uvector(type, + (struct iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + &kiocb->ki_iovec); + if (ret < 0) + goto out; + + ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); + if (ret < 0) + goto out; + + kiocb->ki_nr_segs = kiocb->ki_nbytes; + kiocb->ki_cur_seg = 0; + /* ki_nbytes/left now reflect bytes instead of segs */ + kiocb->ki_nbytes = ret; + kiocb->ki_left = ret; + + ret = 0; +out: + return ret; +} + +static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) +{ + int bytes; + + bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); + if (bytes < 0) + return bytes; + + kiocb->ki_iovec = &kiocb->ki_inline_vec; + kiocb->ki_iovec->iov_base = kiocb->ki_buf; + kiocb->ki_iovec->iov_len = bytes; + kiocb->ki_nr_segs = 1; + kiocb->ki_cur_seg = 0; + return 0; +} + +/* + * aio_setup_iocb: + * Performs the initial checks and aio retry method + * setup for the kiocb at the time of io submission. 
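For the vectored opcodes that reach aio_setup_vectored_rw(), the user-side layout differs from the single-buffer case: aio_buf points at an iovec array and aio_nbytes initially carries the segment count, which the code above then converts into a byte count. A hedged user-space sketch, not part of the patch (ctx and fd are assumed to come from io_setup() and open() as in the earlier example):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <linux/aio_abi.h>

static int submit_preadv(aio_context_t ctx, int fd,
                         struct iovec *iov, int nr_segs, long long off)
{
        struct iocb cb, *cbs[1] = { &cb };

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_lio_opcode = IOCB_CMD_PREADV;
        cb.aio_buf = (unsigned long)iov;        /* iovec array, not a data buffer */
        cb.aio_nbytes = nr_segs;                /* segment count for PREADV/PWRITEV */
        cb.aio_offset = off;

        return syscall(SYS_io_submit, ctx, 1, cbs) == 1 ? 0 : -1;
}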
+ */ +static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) +{ + struct file *file = kiocb->ki_filp; + ssize_t ret = 0; + + switch (kiocb->ki_opcode) { + case IOCB_CMD_PREAD: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + break; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, + kiocb->ki_left))) + break; + ret = aio_setup_single_vector(READ, file, kiocb); + if (ret) + break; + ret = -EINVAL; + if (file->f_op->aio_read) + kiocb->ki_retry = aio_rw_vect_retry; + break; + case IOCB_CMD_PWRITE: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + break; + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, + kiocb->ki_left))) + break; + ret = aio_setup_single_vector(WRITE, file, kiocb); + if (ret) + break; + ret = -EINVAL; + if (file->f_op->aio_write) + kiocb->ki_retry = aio_rw_vect_retry; + break; + case IOCB_CMD_PREADV: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + break; + ret = aio_setup_vectored_rw(READ, kiocb, compat); + if (ret) + break; + ret = -EINVAL; + if (file->f_op->aio_read) + kiocb->ki_retry = aio_rw_vect_retry; + break; + case IOCB_CMD_PWRITEV: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + break; + ret = aio_setup_vectored_rw(WRITE, kiocb, compat); + if (ret) + break; + ret = -EINVAL; + if (file->f_op->aio_write) + kiocb->ki_retry = aio_rw_vect_retry; + break; + case IOCB_CMD_FDSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + kiocb->ki_retry = aio_fdsync; + break; + case IOCB_CMD_FSYNC: + ret = -EINVAL; + if (file->f_op->aio_fsync) + kiocb->ki_retry = aio_fsync; + break; + default: + dprintk("EINVAL: io_submit: no operation provided\n"); + ret = -EINVAL; + } + + if (!kiocb->ki_retry) + return ret; + + return 0; +} + +static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, + struct iocb *iocb, bool compat) +{ + struct kiocb *req; + struct file *file; + ssize_t ret; + + /* enforce forwards compatibility on users */ + if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + pr_debug("EINVAL: io_submit: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) + )) { + pr_debug("EINVAL: io_submit: overflow check\n"); + return -EINVAL; + } + + file = fget(iocb->aio_fildes); + if (unlikely(!file)) + return -EBADF; + + req = aio_get_req(ctx); /* returns with 2 references to req */ + if (unlikely(!req)) { + fput(file); + return -EAGAIN; + } + req->ki_filp = file; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { + /* + * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an + * instance of the file* now. The file descriptor must be + * an eventfd() fd, and will be signaled for each completed + * event using the eventfd_signal() function. 
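The IOCB_FLAG_RESFD path handled below is driven from user space by attaching an eventfd to the iocb; each completion then adds one to the eventfd counter, so the descriptor can be watched with read(2), poll(2) or epoll before reaping events with io_getevents(). A hedged sketch of that usage, not part of the patch:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

static int submit_with_eventfd(aio_context_t ctx, int fd, void *buf,
                               size_t len, int efd)
{
        struct iocb cb, *cbs[1] = { &cb };

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_lio_opcode = IOCB_CMD_PREAD;
        cb.aio_buf = (unsigned long)buf;
        cb.aio_nbytes = len;
        cb.aio_flags = IOCB_FLAG_RESFD;         /* ask for eventfd signalling */
        cb.aio_resfd = efd;                     /* must be an eventfd descriptor */

        return syscall(SYS_io_submit, ctx, 1, cbs) == 1 ? 0 : -1;
}

/* Consumer side: efd = eventfd(0, 0); poll or block, then read the counter. */
static uint64_t completions_ready(int efd)
{
        uint64_t n = 0;

        read(efd, &n, sizeof(n));       /* blocks until at least one completion */
        return n;
}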
+	 */
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
+		if (IS_ERR(req->ki_eventfd)) {
+			ret = PTR_ERR(req->ki_eventfd);
+			req->ki_eventfd = NULL;
+			goto out_put_req;
+		}
+	}
+
+	ret = put_user(req->ki_key, &user_iocb->aio_key);
+	if (unlikely(ret)) {
+		dprintk("EFAULT: aio_key\n");
+		goto out_put_req;
+	}
+
+	req->ki_obj.user = user_iocb;
+	req->ki_user_data = iocb->aio_data;
+	req->ki_pos = iocb->aio_offset;
+
+	req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
+	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
+	req->ki_opcode = iocb->aio_lio_opcode;
+
+	ret = aio_setup_iocb(req, compat);
+
+	if (ret)
+		goto out_put_req;
+
+	spin_lock_irq(&ctx->ctx_lock);
+	/*
+	 * We could have raced with io_destroy() and are currently holding a
+	 * reference to ctx which should be destroyed. We cannot submit IO
+	 * since ctx gets freed as soon as io_submit() puts its reference. The
+	 * check here is reliable: io_destroy() sets ctx->dead before waiting
+	 * for outstanding IO and the barrier between these two is realized by
+	 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
+	 * increment ctx->reqs_active before checking for ctx->dead and the
+	 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
+	 * don't see ctx->dead set here, io_destroy() waits for our IO to
+	 * finish.
+	 */
+	if (ctx->dead) {
+		spin_unlock_irq(&ctx->ctx_lock);
+		ret = -EINVAL;
+		goto out_put_req;
+	}
+	aio_run_iocb(req);
+	if (!list_empty(&ctx->run_list)) {
+		/* drain the run list */
+		while (__aio_run_iocbs(ctx))
+			;
+	}
+	spin_unlock_irq(&ctx->ctx_lock);
+
+	aio_put_req(req);	/* drop extra ref to req */
+	return 0;
+
+out_put_req:
+	aio_put_req(req);	/* drop extra ref to req */
+	aio_put_req(req);	/* drop i/o ref to req */
+	return ret;
+}
+
+long do_io_submit(aio_context_t ctx_id, long nr,
+		  struct iocb __user *__user *iocbpp, bool compat)
+{
+	struct kioctx *ctx;
+	long ret = 0;
+	int i;
+	struct blk_plug plug;
+
+	if (unlikely(nr < 0))
+		return -EINVAL;
+
+	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
+		nr = LONG_MAX/sizeof(*iocbpp);
+
+	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
+		return -EFAULT;
+
+	ctx = lookup_ioctx(ctx_id);
+	if (unlikely(!ctx)) {
+		pr_debug("EINVAL: io_submit: invalid context id\n");
+		return -EINVAL;
+	}
+
+	blk_start_plug(&plug);
+
+	/*
+	 * AKPM: should this return a partial result if some of the IOs were
+	 * successfully submitted?
+	 */
+	for (i=0; i<nr; i++) {
+		struct iocb __user *user_iocb;
+		struct iocb tmp;
+
+		if (unlikely(__get_user(user_iocb, iocbpp + i))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
+		if (ret)
+			break;
+	}
+	blk_finish_plug(&plug);
+
+	put_ioctx(ctx);
+	return i ? i : ret;
+}
+
+/* sys_io_submit:
+ *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+ *	the number of iocbs queued.  May return -EINVAL if the aio_context
+ *	specified by ctx_id is invalid, if nr is out of range, if the iocb
+ *	at *iocbpp[0] is not properly initialized, or if the operation
+ *	specified is invalid for the file descriptor in the iocb.  May fail
+ *	with -EFAULT if any of the data structures point to invalid data.
+ *	May fail with -EBADF if the file descriptor specified in the first
+ *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
+ *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+ *	fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
+{
+	return do_io_submit(ctx_id, nr, iocbpp, 0);
+}
+
+/* lookup_kiocb
+ *	Finds a given iocb for cancellation.
+ */
+static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
+				  u32 key)
+{
+	struct list_head *pos;
+
+	assert_spin_locked(&ctx->ctx_lock);
+
+	/* TODO: use a hash or array, this sucks. */
+	list_for_each(pos, &ctx->active_reqs) {
+		struct kiocb *kiocb = list_kiocb(pos);
+		if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
+			return kiocb;
+	}
+	return NULL;
+}
+
+/* sys_io_cancel:
+ *	Attempts to cancel an iocb previously passed to io_submit.  If
+ *	the operation is successfully cancelled, the resulting event is
+ *	copied into the memory pointed to by result without being placed
+ *	into the completion queue and 0 is returned.  May fail with
+ *	-EFAULT if any of the data structures pointed to are invalid.
+ *	May fail with -EINVAL if aio_context specified by ctx_id is
+ *	invalid.  May fail with -EAGAIN if the iocb specified was not
+ *	cancelled.  Will fail with -ENOSYS if not implemented.
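A user-space view of the cancellation interface documented above, as a hedged sketch that is not part of the patch: the iocb passed to io_cancel() must be the very same struct iocb handed to io_submit() (the kernel matches both its address and the aio_key it wrote back), and because most requests never install a ki_cancel method the call usually fails, so callers should treat success as the exception.

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

static int try_cancel(aio_context_t ctx, struct iocb *cb)
{
        struct io_event ev;

        if (syscall(SYS_io_cancel, ctx, cb, &ev) == 0) {
                /* Cancelled: the terminating event is in ev, not in the ring. */
                printf("cancelled, res=%lld\n", (long long)ev.res);
                return 0;
        }
        /* Typically EAGAIN or EINVAL: the request is running or not cancellable. */
        perror("io_cancel");
        return -1;
}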
+ */ +SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, + struct io_event __user *, result) +{ + int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct kioctx *ctx; + struct kiocb *kiocb; + u32 key; + int ret; + + ret = get_user(key, &iocb->aio_key); + if (unlikely(ret)) + return -EFAULT; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) + return -EINVAL; + + spin_lock_irq(&ctx->ctx_lock); + ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); + if (kiocb && kiocb->ki_cancel) { + cancel = kiocb->ki_cancel; + kiocb->ki_users ++; + kiocbSetCancelled(kiocb); + } else + cancel = NULL; + spin_unlock_irq(&ctx->ctx_lock); + + if (NULL != cancel) { + struct io_event tmp; + pr_debug("calling cancel\n"); + memset(&tmp, 0, sizeof(tmp)); + tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; + tmp.data = kiocb->ki_user_data; + ret = cancel(kiocb, &tmp); + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. + */ + if (copy_to_user(result, &tmp, sizeof(tmp))) + ret = -EFAULT; + } + } else + ret = -EINVAL; + + put_ioctx(ctx); + + return ret; +} + +/* io_getevents: + * Attempts to read at least min_nr events and up to nr events from + * the completion queue for the aio_context specified by ctx_id. If + * it succeeds, the number of read events is returned. May fail with + * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is + * out of range, if timeout is out of range. May fail with -EFAULT + * if any of the memory specified is invalid. May return 0 or + * < min_nr if the timeout specified by timeout has elapsed + * before sufficient events are available, where timeout == NULL + * specifies an infinite timeout. Note that the timeout pointed to by + * timeout is relative and will be updated if not NULL and the + * operation blocks. Will fail with -ENOSYS if not implemented. + */ +SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, + long, min_nr, + long, nr, + struct io_event __user *, events, + struct timespec __user *, timeout) +{ + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (likely(ioctx)) { + if (likely(min_nr <= nr && min_nr >= 0)) + ret = read_events(ioctx, min_nr, nr, events, timeout); + put_ioctx(ioctx); + } + + asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); + return ret; +} diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c new file mode 100644 index 00000000..c5567cb7 --- /dev/null +++ b/fs/anon_inodes.c @@ -0,0 +1,244 @@ +/* + * fs/anon_inodes.c + * + * Copyright (C) 2007 Davide Libenzi + * + * Thanks to Arnd Bergmann for code review and suggestions. + * More changes for Thomas Gleixner suggestions. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static struct vfsmount *anon_inode_mnt __read_mostly; +static struct inode *anon_inode_inode; +static const struct file_operations anon_inode_fops; + +/* + * anon_inodefs_dname() is called from d_path(). 
+ */ +static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s", + dentry->d_name.name); +} + +static const struct dentry_operations anon_inodefs_dentry_operations = { + .d_dname = anon_inodefs_dname, +}; + +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "anon_inode:", NULL, + &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); +} + +static struct file_system_type anon_inode_fs_type = { + .name = "anon_inodefs", + .mount = anon_inodefs_mount, + .kill_sb = kill_anon_super, +}; + +/* + * nop .set_page_dirty method so that people can use .page_mkwrite on + * anon inodes. + */ +static int anon_set_page_dirty(struct page *page) +{ + return 0; +}; + +static const struct address_space_operations anon_aops = { + .set_page_dirty = anon_set_page_dirty, +}; + +/** + * anon_inode_getfile - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * Creates a new file by hooking it on a single inode. This is useful for files + * that do not need to have a full-fledged inode in order to operate correctly. + * All the files created with anon_inode_getfile() will share a single inode, + * hence saving memory and avoiding code duplication for the file/inode/dentry + * setup. Returns the newly created file* or an error pointer. + */ +struct file *anon_inode_getfile(const char *name, + const struct file_operations *fops, + void *priv, int flags) +{ + struct qstr this; + struct path path; + struct file *file; + int error; + + if (IS_ERR(anon_inode_inode)) + return ERR_PTR(-ENODEV); + + if (fops->owner && !try_module_get(fops->owner)) + return ERR_PTR(-ENOENT); + + /* + * Link the inode to a directory entry by creating a unique name + * using the inode sequence number. + */ + error = -ENOMEM; + this.name = name; + this.len = strlen(name); + this.hash = 0; + path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); + if (!path.dentry) + goto err_module; + + path.mnt = mntget(anon_inode_mnt); + /* + * We know the anon_inode inode count is always greater than zero, + * so ihold() is safe. + */ + ihold(anon_inode_inode); + + d_instantiate(path.dentry, anon_inode_inode); + + error = -ENFILE; + file = alloc_file(&path, OPEN_FMODE(flags), fops); + if (!file) + goto err_dput; + file->f_mapping = anon_inode_inode->i_mapping; + + file->f_pos = 0; + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); + file->f_version = 0; + file->private_data = priv; + + return file; + +err_dput: + path_put(&path); +err_module: + module_put(fops->owner); + return ERR_PTR(error); +} +EXPORT_SYMBOL_GPL(anon_inode_getfile); + +/** + * anon_inode_getfd - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * Creates a new file by hooking it on a single inode. This is useful for files + * that do not need to have a full-fledged inode in order to operate correctly. 
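The usual consumer of this API is a driver that wants to hand user space a descriptor backed by the shared anonymous inode, the way eventfd, timerfd and perf do. A hedged sketch follows; my_state, my_file_fops and my_create_handle are invented names, and only anon_inode_getfd() and its flag handling come from this file.

#include <linux/anon_inodes.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/slab.h>

struct my_state {
        int dummy;      /* whatever per-object bookkeeping the driver needs */
};

static const struct file_operations my_file_fops;       /* read/poll/release... */

static long my_create_handle(unsigned int flags)
{
        struct my_state *st;
        int fd;

        st = kzalloc(sizeof(*st), GFP_KERNEL);
        if (!st)
                return -ENOMEM;

        /*
         * One call allocates an fd plus a file bound to the shared anonymous
         * inode, and stores st in file->private_data for my_file_fops.
         */
        fd = anon_inode_getfd("[my_handle]", &my_file_fops, st,
                              flags & (O_CLOEXEC | O_NONBLOCK));
        if (fd < 0)
                kfree(st);
        return fd;
}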
+ * All the files created with anon_inode_getfd() will share a single inode, + * hence saving memory and avoiding code duplication for the file/inode/dentry + * setup. Returns new descriptor or an error code. + */ +int anon_inode_getfd(const char *name, const struct file_operations *fops, + void *priv, int flags) +{ + int error, fd; + struct file *file; + + error = get_unused_fd_flags(flags); + if (error < 0) + return error; + fd = error; + + file = anon_inode_getfile(name, fops, priv, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + return error; +} +EXPORT_SYMBOL_GPL(anon_inode_getfd); + +/* + * A single inode exists for all anon_inode files. Contrary to pipes, + * anon_inode inodes have no associated per-instance data, so we need + * only allocate one of them. + */ +static struct inode *anon_inode_mkinode(void) +{ + struct inode *inode = new_inode(anon_inode_mnt->mnt_sb); + + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_ino = get_next_ino(); + inode->i_fop = &anon_inode_fops; + + inode->i_mapping->a_ops = &anon_aops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because mark_inode_dirty() will think + * that it already _is_ on the dirty list. + */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_flags |= S_PRIVATE; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + return inode; +} + +static int __init anon_inode_init(void) +{ + int error; + + error = register_filesystem(&anon_inode_fs_type); + if (error) + goto err_exit; + anon_inode_mnt = kern_mount(&anon_inode_fs_type); + if (IS_ERR(anon_inode_mnt)) { + error = PTR_ERR(anon_inode_mnt); + goto err_unregister_filesystem; + } + anon_inode_inode = anon_inode_mkinode(); + if (IS_ERR(anon_inode_inode)) { + error = PTR_ERR(anon_inode_inode); + goto err_mntput; + } + + return 0; + +err_mntput: + mntput(anon_inode_mnt); +err_unregister_filesystem: + unregister_filesystem(&anon_inode_fs_type); +err_exit: + panic(KERN_ERR "anon_inode_init() failed (%d)\n", error); +} + +fs_initcall(anon_inode_init); + diff --git a/fs/attr.c b/fs/attr.c new file mode 100644 index 00000000..caf2aa52 --- /dev/null +++ b/fs/attr.c @@ -0,0 +1,252 @@ +/* + * linux/fs/attr.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * changes by Thomas Schoebel-Theuer + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * inode_change_ok - check if attribute changes to an inode are allowed + * @inode: inode to check + * @attr: attributes to change + * + * Check if we are allowed to change the attributes contained in @attr + * in the given inode. This includes the normal unix access permission + * checks, as well as checks for rlimits and others. + * + * Should be called as the first thing in ->setattr implementations, + * possibly after taking additional locks. + */ +int inode_change_ok(const struct inode *inode, struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + /* + * First check size constraints. These can't be overriden using + * ATTR_FORCE. + */ + if (ia_valid & ATTR_SIZE) { + int error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + } + + /* If force is set do it anyway. */ + if (ia_valid & ATTR_FORCE) + return 0; + + /* Make sure a caller can chown. 
*/ + if ((ia_valid & ATTR_UID) && + (current_fsuid() != inode->i_uid || + attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN)) + return -EPERM; + + /* Make sure caller can chgrp. */ + if ((ia_valid & ATTR_GID) && + (current_fsuid() != inode->i_uid || + (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) && + !capable(CAP_CHOWN)) + return -EPERM; + + /* Make sure a caller can chmod. */ + if (ia_valid & ATTR_MODE) { + if (!inode_owner_or_capable(inode)) + return -EPERM; + /* Also check the setgid bit! */ + if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : + inode->i_gid) && !capable(CAP_FSETID)) + attr->ia_mode &= ~S_ISGID; + } + + /* Check for setting the inode time. */ + if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { + if (!inode_owner_or_capable(inode)) + return -EPERM; + } + + return 0; +} +EXPORT_SYMBOL(inode_change_ok); + +/** + * inode_newsize_ok - may this inode be truncated to a given size + * @inode: the inode to be truncated + * @offset: the new size to assign to the inode + * @Returns: 0 on success, -ve errno on failure + * + * inode_newsize_ok must be called with i_mutex held. + * + * inode_newsize_ok will check filesystem limits and ulimits to check that the + * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ + * when necessary. Caller must not proceed with inode size change if failure is + * returned. @inode must be a file (not directory), with appropriate + * permissions to allow truncate (inode_newsize_ok does NOT check these + * conditions). + */ +int inode_newsize_ok(const struct inode *inode, loff_t offset) +{ + if (inode->i_size < offset) { + unsigned long limit; + + limit = rlimit(RLIMIT_FSIZE); + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + } else { + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + } + + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} +EXPORT_SYMBOL(inode_newsize_ok); + +/** + * setattr_copy - copy simple metadata updates into the generic inode + * @inode: the inode to be updated + * @attr: the new attributes + * + * setattr_copy must be called with i_mutex held. + * + * setattr_copy updates the inode's metadata with that specified + * in attr. Noticeably missing is inode size update, which is more complex + * as it requires pagecache updates. + * + * The inode is not marked as dirty after this operation. The rationale is + * that for "simple" filesystems, the struct inode is the inode storage. + * The caller is free to mark the inode dirty afterwards if needed. 
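Taken together, inode_change_ok(), inode_newsize_ok() and setattr_copy() are the building blocks of a minimal ->setattr for a filesystem whose inode storage is the in-core inode itself. A hedged sketch of that pattern (simple_fs_setattr is an invented name; compare simple_setattr() in fs/libfs.c, which notify_change() falls back to when a filesystem provides no ->setattr):

#include <linux/fs.h>
#include <linux/mm.h>

static int simple_fs_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        /* Permission and size checks first, as the comment above requires. */
        error = inode_change_ok(inode, attr);
        if (error)
                return error;

        if (attr->ia_valid & ATTR_SIZE)
                truncate_setsize(inode, attr->ia_size);

        setattr_copy(inode, attr);      /* uid/gid/times/mode, but not size */
        mark_inode_dirty(inode);
        return 0;
}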
+ */ +void setattr_copy(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +} +EXPORT_SYMBOL(setattr_copy); + +int notify_change(struct dentry * dentry, struct iattr * attr) +{ + struct inode *inode = dentry->d_inode; + mode_t mode = inode->i_mode; + int error; + struct timespec now; + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + if ((ia_valid & ATTR_MODE)) { + mode_t amode = attr->ia_mode; + /* Flag setting protected by i_mutex */ + if (is_sxid(amode)) + inode->i_flags &= ~S_NOSEC; + } + + now = current_fs_time(inode->i_sb); + + attr->ia_ctime = now; + if (!(ia_valid & ATTR_ATIME_SET)) + attr->ia_atime = now; + if (!(ia_valid & ATTR_MTIME_SET)) + attr->ia_mtime = now; + if (ia_valid & ATTR_KILL_PRIV) { + attr->ia_valid &= ~ATTR_KILL_PRIV; + ia_valid &= ~ATTR_KILL_PRIV; + error = security_inode_need_killpriv(dentry); + if (error > 0) + error = security_inode_killpriv(dentry); + if (error) + return error; + } + + /* + * We now pass ATTR_KILL_S*ID to the lower level setattr function so + * that the function has the ability to reinterpret a mode change + * that's due to these bits. This adds an implicit restriction that + * no function will ever call notify_change with both ATTR_MODE and + * ATTR_KILL_S*ID set. + */ + if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) && + (ia_valid & ATTR_MODE)) + BUG(); + + if (ia_valid & ATTR_KILL_SUID) { + if (mode & S_ISUID) { + ia_valid = attr->ia_valid |= ATTR_MODE; + attr->ia_mode = (inode->i_mode & ~S_ISUID); + } + } + if (ia_valid & ATTR_KILL_SGID) { + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (!(ia_valid & ATTR_MODE)) { + ia_valid = attr->ia_valid |= ATTR_MODE; + attr->ia_mode = inode->i_mode; + } + attr->ia_mode &= ~S_ISGID; + } + } + if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) + return 0; + + error = security_inode_setattr(dentry, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE) + down_write(&dentry->d_inode->i_alloc_sem); + + if (inode->i_op->setattr) + error = inode->i_op->setattr(dentry, attr); + else + error = simple_setattr(dentry, attr); + + if (ia_valid & ATTR_SIZE) + up_write(&dentry->d_inode->i_alloc_sem); + + if (!error) + fsnotify_change(dentry, ia_valid); + + return error; +} + +EXPORT_SYMBOL(notify_change); diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig new file mode 100644 index 00000000..1204d638 --- /dev/null +++ b/fs/autofs4/Kconfig @@ -0,0 +1,20 @@ +config AUTOFS4_FS + tristate "Kernel automounter version 4 support (also supports v3)" + help + The automounter is a tool to automatically mount remote file systems + on demand. 
This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from + ; you also + want to answer Y to "NFS file system support", below. + + To compile this support as a module, choose M here: the module will be + called autofs4. You will need to add "alias autofs autofs4" to your + modules configuration file. + + If you are not a part of a fairly large, distributed network or + don't have a laptop which needs to dynamically reconfigure to the + local network, you probably do not need an automounter, and can say + N here. diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile new file mode 100644 index 00000000..a811c1f7 --- /dev/null +++ b/fs/autofs4/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux autofs-filesystem routines. +# + +obj-$(CONFIG_AUTOFS4_FS) += autofs4.o + +autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h new file mode 100644 index 00000000..756d3286 --- /dev/null +++ b/fs/autofs4/autofs_i.h @@ -0,0 +1,350 @@ +/* -*- c -*- ------------------------------------------------------------- * + * + * linux/fs/autofs/autofs_i.h + * + * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved + * Copyright 2005-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* Internal header file for autofs */ + +#include +#include +#include +#include +#include + +/* This is the range of ioctl() numbers we claim as ours */ +#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY +#define AUTOFS_IOC_COUNT 32 + +#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) +#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* #define DEBUG */ + +#ifdef DEBUG +#define DPRINTK(fmt, args...) \ +do { \ + printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) +#else +#define DPRINTK(fmt, args...) do {} while (0) +#endif + +#define AUTOFS_WARN(fmt, args...) \ +do { \ + printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + +#define AUTOFS_ERROR(fmt, args...) \ +do { \ + printk(KERN_ERR "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + +/* Unified info structure. This is pointed to by both the dentry and + inode structures. Each file in the filesystem has an instance of this + structure. It holds a reference to the dentry, so dentries are never + flushed while the file exists. All name lookups are dealt with at the + dentry level, although the filesystem can interfere in the validation + process. Readdir is implemented by traversing the dentry lists. 
*/ +struct autofs_info { + struct dentry *dentry; + struct inode *inode; + + int flags; + + struct completion expire_complete; + + struct list_head active; + int active_count; + + struct list_head expiring; + + struct autofs_sb_info *sbi; + unsigned long last_used; + atomic_t count; + + uid_t uid; + gid_t gid; +}; + +#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ +#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ + +struct autofs_wait_queue { + wait_queue_head_t queue; + struct autofs_wait_queue *next; + autofs_wqt_t wait_queue_token; + /* We use the following to see what we are waiting for */ + struct qstr name; + u32 dev; + u64 ino; + uid_t uid; + gid_t gid; + pid_t pid; + pid_t tgid; + /* This is for status reporting upon return */ + int status; + unsigned int wait_ctr; +}; + +#define AUTOFS_SBI_MAGIC 0x6d4a556d + +struct autofs_sb_info { + u32 magic; + int pipefd; + struct file *pipe; + pid_t oz_pgrp; + int catatonic; + int version; + int sub_version; + int min_proto; + int max_proto; + unsigned long exp_timeout; + unsigned int type; + int reghost_enabled; + int needs_reghost; + struct super_block *sb; + struct mutex wq_mutex; + spinlock_t fs_lock; + struct autofs_wait_queue *queues; /* Wait queue pointer */ + spinlock_t lookup_lock; + struct list_head active_list; + struct list_head expiring_list; +}; + +static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) +{ + return (struct autofs_sb_info *)(sb->s_fs_info); +} + +static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) +{ + return (struct autofs_info *)(dentry->d_fsdata); +} + +/* autofs4_oz_mode(): do we see the man behind the curtain? (The + processes which do manipulations for us in user space sees the raw + filesystem without "magic".) */ + +static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { + return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; +} + +/* Does a dentry have some pending activity? 
*/ +static inline int autofs4_ispending(struct dentry *dentry) +{ + struct autofs_info *inf = autofs4_dentry_ino(dentry); + + if (inf->flags & AUTOFS_INF_PENDING) + return 1; + + if (inf->flags & AUTOFS_INF_EXPIRING) + return 1; + + return 0; +} + +struct inode *autofs4_get_inode(struct super_block *, mode_t); +void autofs4_free_ino(struct autofs_info *); + +/* Expiration */ +int is_autofs4_dentry(struct dentry *); +int autofs4_expire_wait(struct dentry *dentry); +int autofs4_expire_run(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, + struct autofs_packet_expire __user *); +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); +int autofs4_expire_multi(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, int __user *); +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); + +/* Device node initialization */ + +int autofs_dev_ioctl_init(void); +void autofs_dev_ioctl_exit(void); + +/* Operations structures */ + +extern const struct inode_operations autofs4_symlink_inode_operations; +extern const struct inode_operations autofs4_dir_inode_operations; +extern const struct file_operations autofs4_dir_operations; +extern const struct file_operations autofs4_root_operations; +extern const struct dentry_operations autofs4_dentry_operations; + +/* VFS automount flags management functions */ + +static inline void __managed_dentry_set_automount(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_set_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_automount(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_clear_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_transit(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_set_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_transit(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_clear_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_managed(struct dentry *dentry) +{ + dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_set_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_managed(struct dentry *dentry) +{ + dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_clear_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +/* Initializing function */ + +int 
autofs4_fill_super(struct super_block *, void *, int); +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); +void autofs4_clean_ino(struct autofs_info *); + +static inline int autofs_prepare_pipe(struct file *pipe) +{ + if (!pipe->f_op || !pipe->f_op->write) + return -EINVAL; + if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode)) + return -EINVAL; + /* We want a packet pipe */ + pipe->f_flags |= O_DIRECT; + return 0; +} + +/* Queue management functions */ + +int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); +int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); +void autofs4_catatonic_mode(struct autofs_sb_info *); + +static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) +{ + return new_encode_dev(sbi->sb->s_dev); +} + +static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) +{ + return sbi->sb->s_root->d_inode->i_ino; +} + +static inline int simple_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +static inline void __autofs4_add_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + } + return; +} + +static inline void autofs4_add_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static inline void autofs4_del_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + } + return; +} + +extern void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c new file mode 100644 index 00000000..de542716 --- /dev/null +++ b/fs/autofs4/dev-ioctl.c @@ -0,0 +1,762 @@ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "autofs_i.h" + +/* + * This module implements an interface for routing autofs ioctl control + * commands via a miscellaneous device file. + * + * The alternate interface is needed because we need to be able open + * an ioctl file descriptor on an autofs mount that may be covered by + * another mount. This situation arises when starting automount(8) + * or other user space daemon which uses direct mounts or offset + * mounts (used for autofs lazy mount/umount of nested mount trees), + * which have been left busy at at service shutdown. 
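From user space this control interface is reached by opening the misc device (typically /dev/autofs) and issuing AUTOFS_DEV_IOCTL_* requests with a struct autofs_dev_ioctl parameter block. Below is a hedged sketch of the smallest possible client, a version query; it assumes <linux/auto_dev-ioctl.h> exports the structure and the AUTOFS_DEV_IOCTL_VERSION request number, and is not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

int main(void)
{
        struct autofs_dev_ioctl param;
        int fd = open("/dev/autofs", O_RDONLY);

        if (fd < 0)
                return 1;

        memset(&param, 0, sizeof(param));
        param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param.size = sizeof(param);     /* no path appended */
        param.ioctlfd = -1;             /* not needed for the version query */

        if (ioctl(fd, AUTOFS_DEV_IOCTL_VERSION, &param) == 0)
                printf("autofs ioctl interface %u.%u\n",
                       param.ver_major, param.ver_minor);

        close(fd);
        return 0;
}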
+ */ + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, + struct autofs_dev_ioctl *); + +static int check_name(const char *name) +{ + if (!strchr(name, '/')) + return -EINVAL; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from user land. + */ +static int invalid_str(char *str, size_t size) +{ + if (memchr(str, 0, size)) + return 0; + return -EINVAL; +} + +/* + * Check that the user compiled against correct version of autofs + * misc device code. + * + * As well as checking the version compatibility this always copies + * the kernel interface version out. + */ +static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) +{ + int err = 0; + + if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) || + (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) { + AUTOFS_WARN("ioctl control interface version mismatch: " + "kernel(%u.%u), user(%u.%u), cmd(%d)", + AUTOFS_DEV_IOCTL_VERSION_MAJOR, + AUTOFS_DEV_IOCTL_VERSION_MINOR, + param->ver_major, param->ver_minor, cmd); + err = -EINVAL; + } + + /* Fill in the kernel version. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + + return err; +} + +/* + * Copy parameter control struct, including a possible path allocated + * at the end of the struct. + */ +static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +{ + struct autofs_dev_ioctl tmp; + + if (copy_from_user(&tmp, in, sizeof(tmp))) + return ERR_PTR(-EFAULT); + + if (tmp.size < sizeof(tmp)) + return ERR_PTR(-EINVAL); + + return memdup_user(in, tmp.size); +} + +static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) +{ + kfree(param); + return; +} + +/* + * Check sanity of parameter control fields and if a path is present + * check that it is terminated and contains at least one "/". + */ +static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) +{ + int err; + + err = check_dev_ioctl_version(cmd, param); + if (err) { + AUTOFS_WARN("invalid device control module version " + "supplied for cmd(0x%08x)", cmd); + goto out; + } + + if (param->size > sizeof(*param)) { + err = invalid_str(param->path, param->size - sizeof(*param)); + if (err) { + AUTOFS_WARN( + "path string terminator missing for cmd(0x%08x)", + cmd); + goto out; + } + + err = check_name(param->path); + if (err) { + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + cmd); + goto out; + } + } + + err = 0; +out: + return err; +} + +/* + * Get the autofs super block info struct from the file opened on + * the autofs mount point. 
+ */ +static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) +{ + struct autofs_sb_info *sbi = NULL; + struct inode *inode; + + if (f) { + inode = f->f_path.dentry->d_inode; + sbi = autofs4_sbi(inode->i_sb); + } + return sbi; +} + +/* Return autofs module protocol version */ +static int autofs_dev_ioctl_protover(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->protover.version = sbi->version; + return 0; +} + +/* Return autofs module protocol sub version */ +static int autofs_dev_ioctl_protosubver(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->protosubver.sub_version = sbi->sub_version; + return 0; +} + +static int find_autofs_mount(const char *pathname, + struct path *res, + int test(struct path *path, void *data), + void *data) +{ + struct path path; + int err = kern_path(pathname, 0, &path); + if (err) + return err; + err = -ENOENT; + while (path.dentry == path.mnt->mnt_root) { + if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) { + if (test(&path, data)) { + path_get(&path); + if (!err) /* already found some */ + path_put(res); + *res = path; + err = 0; + } + } + if (!follow_up(&path)) + break; + } + path_put(&path); + return err; +} + +static int test_by_dev(struct path *path, void *p) +{ + return path->mnt->mnt_sb->s_dev == *(dev_t *)p; +} + +static int test_by_type(struct path *path, void *p) +{ + struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + return ino && ino->sbi->type & *(unsigned *)p; +} + +static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + FD_SET(fd, fdt->close_on_exec); + spin_unlock(&files->file_lock); +} + + +/* + * Open a file descriptor on the autofs mount point corresponding + * to the given path and device number (aka. new_encode_dev(sb->s_dev)). + */ +static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) +{ + int err, fd; + + fd = get_unused_fd(); + if (likely(fd >= 0)) { + struct file *filp; + struct path path; + + err = find_autofs_mount(name, &path, test_by_dev, &devid); + if (err) + goto out; + + /* + * Find autofs super block that has the device number + * corresponding to the autofs fs we want to open. + */ + + filp = dentry_open(path.dentry, path.mnt, O_RDONLY, + current_cred()); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto out; + } + + autofs_dev_ioctl_fd_install(fd, filp); + } + + return fd; + +out: + put_unused_fd(fd); + return err; +} + +/* Open a file descriptor on an autofs mount point */ +static int autofs_dev_ioctl_openmount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + const char *path; + dev_t devid; + int err, fd; + + /* param->path has already been checked */ + if (!param->openmount.devid) + return -EINVAL; + + param->ioctlfd = -1; + + path = param->path; + devid = new_decode_dev(param->openmount.devid); + + err = 0; + fd = autofs_dev_ioctl_open_mountpoint(path, devid); + if (unlikely(fd < 0)) { + err = fd; + goto out; + } + + param->ioctlfd = fd; +out: + return err; +} + +/* Close file descriptor allocated above (user can also use close(2)). 
*/ +static int autofs_dev_ioctl_closemount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + return sys_close(param->ioctlfd); +} + +/* + * Send "ready" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_ready(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + + token = (autofs_wqt_t) param->ready.token; + return autofs4_wait_release(sbi, token, 0); +} + +/* + * Send "fail" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_fail(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + int status; + + token = (autofs_wqt_t) param->fail.token; + status = param->fail.status ? param->fail.status : -ENOENT; + return autofs4_wait_release(sbi, token, status); +} + +/* + * Set the pipe fd for kernel communication to the daemon. + * + * Normally this is set at mount using an option but if we + * are reconnecting to a busy mount then we need to use this + * to tell the autofs mount about the new kernel pipe fd. In + * order to protect mounts against incorrectly setting the + * pipefd we also require that the autofs mount be catatonic. + * + * This also sets the process group id used to identify the + * controlling process (eg. the owning automount(8) daemon). + */ +static int autofs_dev_ioctl_setpipefd(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + int pipefd; + int err = 0; + + if (param->setpipefd.pipefd == -1) + return -EINVAL; + + pipefd = param->setpipefd.pipefd; + + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return -EBUSY; + } else { + struct file *pipe = fget(pipefd); + if (!pipe) { + err = -EBADF; + goto out; + } + if (autofs_prepare_pipe(pipe) < 0) { + err = -EPIPE; + fput(pipe); + goto out; + } + sbi->oz_pgrp = task_pgrp_nr(current); + sbi->pipefd = pipefd; + sbi->pipe = pipe; + sbi->catatonic = 0; + } +out: + mutex_unlock(&sbi->wq_mutex); + return err; +} + +/* + * Make the autofs mount point catatonic, no longer responsive to + * mount requests. Also closes the kernel pipe file descriptor. + */ +static int autofs_dev_ioctl_catatonic(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs4_catatonic_mode(sbi); + return 0; +} + +/* Set the autofs mount timeout */ +static int autofs_dev_ioctl_timeout(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + unsigned long timeout; + + timeout = param->timeout.timeout; + param->timeout.timeout = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + return 0; +} + +/* + * Return the uid and gid of the last request for the mount + * + * When reconstructing an autofs mount tree with active mounts + * we need to re-connect to mounts that may have used the original + * process uid and gid (or string variations of them) for mount + * lookups within the map entry. 
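The READY and FAIL commands above are how the automount daemon answers a wait-queue token it received in an autofs packet. A hedged user-space sketch, not part of the patch: devfd is the open /dev/autofs device, ioctlfd a descriptor on the autofs mount (for example from the OPENMOUNT command), and the AUTOFS_DEV_IOCTL_READY/FAIL request macros are assumed to come from <linux/auto_dev-ioctl.h>.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

static int send_wait_result(int devfd, int ioctlfd,
                            unsigned long token, int status)
{
        struct autofs_dev_ioctl param;

        memset(&param, 0, sizeof(param));
        param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param.size = sizeof(param);
        param.ioctlfd = ioctlfd;

        if (status == 0) {
                param.ready.token = token;      /* mount succeeded */
                return ioctl(devfd, AUTOFS_DEV_IOCTL_READY, &param);
        }
        param.fail.token = token;               /* mount failed */
        param.fail.status = status;             /* e.g. -ENOENT */
        return ioctl(devfd, AUTOFS_DEV_IOCTL_FAIL, &param);
}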
+ */ +static int autofs_dev_ioctl_requester(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct autofs_info *ino; + struct path path; + dev_t devid; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + devid = sbi->sb->s_dev; + + param->requester.uid = param->requester.gid = -1; + + err = find_autofs_mount(param->path, &path, test_by_dev, &devid); + if (err) + goto out; + + ino = autofs4_dentry_ino(path.dentry); + if (ino) { + err = 0; + autofs4_expire_wait(path.dentry); + spin_lock(&sbi->fs_lock); + param->requester.uid = ino->uid; + param->requester.gid = ino->gid; + spin_unlock(&sbi->fs_lock); + } + path_put(&path); +out: + return err; +} + +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more that can be done. + */ +static int autofs_dev_ioctl_expire(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct vfsmount *mnt; + int how; + + how = param->expire.how; + mnt = fp->f_path.mnt; + + return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how); +} + +/* Check if autofs mount point is in use */ +static int autofs_dev_ioctl_askumount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->askumount.may_umount = 0; + if (may_umount(fp->f_path.mnt)) + param->askumount.may_umount = 1; + return 0; +} + +/* + * Check if the given path is a mountpoint. + * + * If we are supplied with the file descriptor of an autofs + * mount we're looking for a specific mount. In this case + * the path is considered a mountpoint if it is itself a + * mountpoint or contains a mount, such as a multi-mount + * without a root mount. In this case we return 1 if the + * path is a mount point and the super magic of the covering + * mount if there is one or 0 if it isn't a mountpoint. + * + * If we aren't supplied with a file descriptor then we + * lookup the nameidata of the path and check if it is the + * root of a mount. If a type is given we are looking for + * a particular autofs mount and if we don't find a match + * we return fail. If the located nameidata path is the + * root of a mount we return 1 along with the super magic + * of the mount or 0 otherwise. + * + * In both cases the the device number (as returned by + * new_encode_dev()) is also returned. 
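The expire command above is meant to be called in a loop, as its comment says, until -EAGAIN reports that nothing further can be expired. A hedged user-space sketch of that loop, not part of the patch, under the same /dev/autofs and <linux/auto_dev-ioctl.h> assumptions as the earlier sketches; `how` selects the AUTOFS_EXP_* expiry mode.

#include <errno.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

static int expire_all(int devfd, int ioctlfd, unsigned int how)
{
        struct autofs_dev_ioctl param;
        int expired = 0;

        for (;;) {
                memset(&param, 0, sizeof(param));
                param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
                param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
                param.size = sizeof(param);
                param.ioctlfd = ioctlfd;
                param.expire.how = how;

                if (ioctl(devfd, AUTOFS_DEV_IOCTL_EXPIRE, &param) < 0)
                        return errno == EAGAIN ? expired : -1;
                expired++;      /* one more dentry/mount was expired */
        }
}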
+ */ +static int autofs_dev_ioctl_ismountpoint(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct path path; + const char *name; + unsigned int type; + unsigned int devid, magic; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + name = param->path; + type = param->ismountpoint.in.type; + + param->ismountpoint.out.devid = devid = 0; + param->ismountpoint.out.magic = magic = 0; + + if (!fp || param->ioctlfd == -1) { + if (autofs_type_any(type)) + err = kern_path(name, LOOKUP_FOLLOW, &path); + else + err = find_autofs_mount(name, &path, test_by_type, &type); + if (err) + goto out; + devid = new_encode_dev(path.mnt->mnt_sb->s_dev); + err = 0; + if (path.mnt->mnt_root == path.dentry) { + err = 1; + magic = path.mnt->mnt_sb->s_magic; + } + } else { + dev_t dev = sbi->sb->s_dev; + + err = find_autofs_mount(name, &path, test_by_dev, &dev); + if (err) + goto out; + + devid = new_encode_dev(dev); + + err = have_submounts(path.dentry); + + if (follow_down_one(&path)) + magic = path.mnt->mnt_sb->s_magic; + } + + param->ismountpoint.out.devid = devid; + param->ismountpoint.out.magic = magic; + path_put(&path); +out: + return err; +} + +/* + * Our range of ioctl numbers isn't 0 based so we need to shift + * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table + * lookup. + */ +#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) + +static ioctl_fn lookup_dev_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), + autofs_dev_ioctl_protover}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), + autofs_dev_ioctl_protosubver}, + {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD), + autofs_dev_ioctl_openmount}, + {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD), + autofs_dev_ioctl_closemount}, + {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD), + autofs_dev_ioctl_ready}, + {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD), + autofs_dev_ioctl_fail}, + {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD), + autofs_dev_ioctl_setpipefd}, + {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD), + autofs_dev_ioctl_catatonic}, + {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD), + autofs_dev_ioctl_timeout}, + {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD), + autofs_dev_ioctl_requester}, + {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD), + autofs_dev_ioctl_expire}, + {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD), + autofs_dev_ioctl_askumount}, + {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD), + autofs_dev_ioctl_ismountpoint} + }; + unsigned int idx = cmd_idx(cmd); + + return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn; +} + +/* ioctl dispatcher */ +static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) +{ + struct autofs_dev_ioctl *param; + struct file *fp; + struct autofs_sb_info *sbi; + unsigned int cmd_first, cmd; + ioctl_fn fn = NULL; + int err = 0; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); + cmd = _IOC_NR(command); + + if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || + cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) { + return -ENOTTY; + } + + /* Copy the parameters into kernel space. 
*/ + param = copy_dev_ioctl(user); + if (IS_ERR(param)) + return PTR_ERR(param); + + err = validate_dev_ioctl(command, param); + if (err) + goto out; + + /* The validate routine above always sets the version */ + if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD) + goto done; + + fn = lookup_dev_ioctl(cmd); + if (!fn) { + AUTOFS_WARN("unknown command 0x%08x", command); + return -ENOTTY; + } + + fp = NULL; + sbi = NULL; + + /* + * For obvious reasons the openmount can't have a file + * descriptor yet. We don't take a reference to the + * file during close to allow for immediate release. + */ + if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { + fp = fget(param->ioctlfd); + if (!fp) { + if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) + goto cont; + err = -EBADF; + goto out; + } + + if (!fp->f_op) { + err = -ENOTTY; + fput(fp); + goto out; + } + + sbi = autofs_dev_ioctl_sbi(fp); + if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { + err = -EINVAL; + fput(fp); + goto out; + } + + /* + * Admin needs to be able to set the mount catatonic in + * order to be able to perform the re-open. + */ + if (!autofs4_oz_mode(sbi) && + cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { + err = -EACCES; + fput(fp); + goto out; + } + } +cont: + err = fn(fp, sbi, param); + + if (fp) + fput(fp); +done: + if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) + err = -EFAULT; +out: + free_dev_ioctl(param); + return err; +} + +static long autofs_dev_ioctl(struct file *file, uint command, ulong u) +{ + int err; + err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); + return (long) err; +} + +#ifdef CONFIG_COMPAT +static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u) +{ + return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define autofs_dev_ioctl_compat NULL +#endif + +static const struct file_operations _dev_ioctl_fops = { + .unlocked_ioctl = autofs_dev_ioctl, + .compat_ioctl = autofs_dev_ioctl_compat, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice _autofs_dev_ioctl_misc = { + .minor = AUTOFS_MINOR, + .name = AUTOFS_DEVICE_NAME, + .fops = &_dev_ioctl_fops +}; + +MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); +MODULE_ALIAS("devname:autofs"); + +/* Register/deregister misc character device */ +int autofs_dev_ioctl_init(void) +{ + int r; + + r = misc_register(&_autofs_dev_ioctl_misc); + if (r) { + AUTOFS_ERROR("misc_register failed for control device"); + return r; + } + + return 0; +} + +void autofs_dev_ioctl_exit(void) +{ + misc_deregister(&_autofs_dev_ioctl_misc); + return; +} + diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c new file mode 100644 index 00000000..450f529a --- /dev/null +++ b/fs/autofs4/expire.c @@ -0,0 +1,588 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/expire.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. 
+ * + * ------------------------------------------------------------------------- */ + +#include "autofs_i.h" + +static unsigned long now; + +/* Check if a dentry can be expired */ +static inline int autofs4_can_expire(struct dentry *dentry, + unsigned long timeout, int do_now) +{ + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* dentry in the process of being deleted */ + if (ino == NULL) + return 0; + + if (!do_now) { + /* Too young to die */ + if (!timeout || time_after(ino->last_used + timeout, now)) + return 0; + + /* update last_used here :- + - obviously makes sense if it is in use now + - less obviously, prevents rapid-fire expire + attempts if expire fails the first time */ + ino->last_used = now; + } + return 1; +} + +/* Check a mount point for busyness */ +static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +{ + struct dentry *top = dentry; + struct path path = {.mnt = mnt, .dentry = dentry}; + int status = 1; + + DPRINTK("dentry %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + + path_get(&path); + + if (!follow_down_one(&path)) + goto done; + + if (is_autofs4_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); + + /* This is an autofs submount, we can't expire it */ + if (autofs_type_indirect(sbi->type)) + goto done; + + /* + * Otherwise it's an offset mount and we need to check + * if we can umount its mount, if there is one. + */ + if (!d_mountpoint(path.dentry)) { + status = 0; + goto done; + } + } + + /* Update the expiry counter if fs is busy */ + if (!may_umount_tree(path.mnt)) { + struct autofs_info *ino = autofs4_dentry_ino(top); + ino->last_used = jiffies; + goto done; + } + + status = 0; +done: + DPRINTK("returning = %d", status); + path_put(&path); + return status; +} + +/* + * Calculate and dget next entry in the subdirs list under root. + */ +static struct dentry *get_next_positive_subdir(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct list_head *next; + struct dentry *p, *q; + + spin_lock(&sbi->lookup_lock); + + if (prev == NULL) { + spin_lock(&root->d_lock); + prev = dget_dlock(root); + next = prev->d_subdirs.next; + p = prev; + goto start; + } + + p = prev; + spin_lock(&p->d_lock); +again: + next = p->d_u.d_child.next; +start: + if (next == &root->d_subdirs) { + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + q = list_entry(next, struct dentry, d_u.d_child); + + spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(q)) { + spin_unlock(&p->d_lock); + p = q; + goto again; + } + dget_dlock(q); + spin_unlock(&q->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return q; +} + +/* + * Calculate and dget next entry in top down tree traversal. 
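+ *
+ * Note on locking: when the walk has to move back up to a parent, the
+ * parent's d_lock is taken with spin_trylock() while the child's lock
+ * is still held (the reverse of the usual parent-then-child order);
+ * on contention everything is dropped, cpu_relax() is called and the
+ * walk restarts from prev rather than risking a deadlock.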
+ */ +static struct dentry *get_next_positive_dentry(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct list_head *next; + struct dentry *p, *ret; + + if (prev == NULL) + return dget(root); + + spin_lock(&sbi->lookup_lock); +relock: + p = prev; + spin_lock(&p->d_lock); +again: + next = p->d_subdirs.next; + if (next == &p->d_subdirs) { + while (1) { + struct dentry *parent; + + if (p == root) { + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + parent = p->d_parent; + if (!spin_trylock(&parent->d_lock)) { + spin_unlock(&p->d_lock); + cpu_relax(); + goto relock; + } + spin_unlock(&p->d_lock); + next = p->d_u.d_child.next; + p = parent; + if (next != &parent->d_subdirs) + break; + } + } + ret = list_entry(next, struct dentry, d_u.d_child); + + spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(ret)) { + spin_unlock(&p->d_lock); + p = ret; + goto again; + } + dget_dlock(ret); + spin_unlock(&ret->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return ret; +} + +/* + * Check a direct mount point for busyness. + * Direct mounts have similar expiry semantics to tree mounts. + * The tree is not busy iff no mountpoints are busy and there are no + * autofs submounts. + */ +static int autofs4_direct_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) +{ + DPRINTK("top %p %.*s", + top, (int) top->d_name.len, top->d_name.name); + + /* If it's busy update the expiry counters */ + if (!may_umount_tree(mnt)) { + struct autofs_info *ino = autofs4_dentry_ino(top); + if (ino) + ino->last_used = jiffies; + return 1; + } + + /* Timeout of a direct mount is determined by its top dentry */ + if (!autofs4_can_expire(top, timeout, do_now)) + return 1; + + return 0; +} + +/* Check a directory tree of mount points for busyness + * The tree is not busy iff no mountpoints are busy + */ +static int autofs4_tree_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) +{ + struct autofs_info *top_ino = autofs4_dentry_ino(top); + struct dentry *p; + + DPRINTK("top %p %.*s", + top, (int)top->d_name.len, top->d_name.name); + + /* Negative dentry - give up */ + if (!simple_positive(top)) + return 1; + + p = NULL; + while ((p = get_next_positive_dentry(p, top))) { + DPRINTK("dentry %p %.*s", + p, (int) p->d_name.len, p->d_name.name); + + /* + * Is someone visiting anywhere in the subtree ? + * If there's no mount we need to check the usage + * count for the autofs dentry. + * If the fs is busy update the expiry counter. 
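+	 *
+	 * The d_count checks below compare against the references we can
+	 * account for ourselves: ino->count, the dget taken by
+	 * get_next_positive_dentry(), and one extra for top, which the
+	 * caller already holds; anything above that means some other
+	 * process is currently using the dentry.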
+ */ + if (d_mountpoint(p)) { + if (autofs4_mount_busy(mnt, p)) { + top_ino->last_used = jiffies; + dput(p); + return 1; + } + } else { + struct autofs_info *ino = autofs4_dentry_ino(p); + unsigned int ino_count = atomic_read(&ino->count); + + /* + * Clean stale dentries below that have not been + * invalidated after a mount fail during lookup + */ + d_invalidate(p); + + /* allow for dget above and top is already dgot */ + if (p == top) + ino_count += 2; + else + ino_count++; + + if (p->d_count > ino_count) { + top_ino->last_used = jiffies; + dput(p); + return 1; + } + } + } + + /* Timeout of a tree mount is ultimately determined by its top dentry */ + if (!autofs4_can_expire(top, timeout, do_now)) + return 1; + + return 0; +} + +static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, + struct dentry *parent, + unsigned long timeout, + int do_now) +{ + struct dentry *p; + + DPRINTK("parent %p %.*s", + parent, (int)parent->d_name.len, parent->d_name.name); + + p = NULL; + while ((p = get_next_positive_dentry(p, parent))) { + DPRINTK("dentry %p %.*s", + p, (int) p->d_name.len, p->d_name.name); + + if (d_mountpoint(p)) { + /* Can we umount this guy */ + if (autofs4_mount_busy(mnt, p)) + continue; + + /* Can we expire this guy */ + if (autofs4_can_expire(p, timeout, do_now)) + return p; + } + } + return NULL; +} + +/* Check if we can expire a direct mount (possibly a tree) */ +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) +{ + unsigned long timeout; + struct dentry *root = dget(sb->s_root); + int do_now = how & AUTOFS_EXP_IMMEDIATE; + struct autofs_info *ino; + + if (!root) + return NULL; + + now = jiffies; + timeout = sbi->exp_timeout; + + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + goto out; + if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + struct autofs_info *ino = autofs4_dentry_ino(root); + ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + return root; + } +out: + spin_unlock(&sbi->fs_lock); + dput(root); + + return NULL; +} + +/* + * Find an eligible tree to time-out + * A tree is eligible if :- + * - it is unused by any user process + * - it has been unused for exp_timeout time + */ +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) +{ + unsigned long timeout; + struct dentry *root = sb->s_root; + struct dentry *dentry; + struct dentry *expired = NULL; + int do_now = how & AUTOFS_EXP_IMMEDIATE; + int exp_leaves = how & AUTOFS_EXP_LEAVES; + struct autofs_info *ino; + unsigned int ino_count; + + if (!root) + return NULL; + + now = jiffies; + timeout = sbi->exp_timeout; + + dentry = NULL; + while ((dentry = get_next_positive_subdir(dentry, root))) { + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + goto next; + + /* + * Case 1: (i) indirect mount or top level pseudo direct mount + * (autofs-4.1). + * (ii) indirect mount with offset mount, check the "/" + * offset (autofs-5.0+). + */ + if (d_mountpoint(dentry)) { + DPRINTK("checking mountpoint %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + + /* Path walk currently on this dentry? 
*/ + ino_count = atomic_read(&ino->count) + 2; + if (dentry->d_count > ino_count) + goto next; + + /* Can we umount this guy */ + if (autofs4_mount_busy(mnt, dentry)) + goto next; + + /* Can we expire this guy */ + if (autofs4_can_expire(dentry, timeout, do_now)) { + expired = dentry; + goto found; + } + goto next; + } + + if (simple_empty(dentry)) + goto next; + + /* Case 2: tree mount, expire iff entire tree is not busy */ + if (!exp_leaves) { + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (dentry->d_count > ino_count) + goto next; + + if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { + expired = dentry; + goto found; + } + /* + * Case 3: pseudo direct mount, expire individual leaves + * (autofs-4.1). + */ + } else { + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (dentry->d_count > ino_count) + goto next; + + expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); + if (expired) { + dput(dentry); + goto found; + } + } +next: + spin_unlock(&sbi->fs_lock); + } + return NULL; + +found: + DPRINTK("returning %p %.*s", + expired, (int)expired->d_name.len, expired->d_name.name); + ino = autofs4_dentry_ino(expired); + ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + spin_lock(&sbi->lookup_lock); + spin_lock(&expired->d_parent->d_lock); + spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); + list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + spin_unlock(&expired->d_lock); + spin_unlock(&expired->d_parent->d_lock); + spin_unlock(&sbi->lookup_lock); + return expired; +} + +int autofs4_expire_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + /* Block on any pending expire */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + + DPRINTK("waiting for expire %p name=%.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + status = autofs4_wait(sbi, dentry, NFY_NONE); + wait_for_completion(&ino->expire_complete); + + DPRINTK("expire done status=%d", status); + + if (d_unhashed(dentry)) + return -EAGAIN; + + return status; + } + spin_unlock(&sbi->fs_lock); + + return 0; +} + +/* Perform an expiry operation */ +int autofs4_expire_run(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) +{ + struct autofs_packet_expire pkt; + struct autofs_info *ino; + struct dentry *dentry; + int ret = 0; + + memset(&pkt,0,sizeof pkt); + + pkt.hdr.proto_version = sbi->version; + pkt.hdr.type = autofs_ptype_expire; + + if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL) + return -EAGAIN; + + pkt.len = dentry->d_name.len; + memcpy(pkt.name, dentry->d_name.name, pkt.len); + pkt.name[pkt.len] = '\0'; + dput(dentry); + + if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) + ret = -EFAULT; + + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + + return ret; +} + +int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) +{ + struct dentry *dentry; + int ret = -EAGAIN; + + if (autofs_type_trigger(sbi->type)) + dentry = autofs4_expire_direct(sb, mnt, sbi, when); + else + dentry = autofs4_expire_indirect(sb, 
mnt, sbi, when); + + if (dentry) { + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* This is synchronous because it makes the daemon a + little easier */ + ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_EXPIRING; + spin_lock(&dentry->d_lock); + if (!ret) { + if ((IS_ROOT(dentry) || + (autofs_type_indirect(sbi->type) && + IS_ROOT(dentry->d_parent))) && + !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + __managed_dentry_set_automount(dentry); + } + spin_unlock(&dentry->d_lock); + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + dput(dentry); + } + + return ret; +} + +/* Call repeatedly until it returns -EAGAIN, meaning there's nothing + more to be done */ +int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int __user *arg) +{ + int do_now = 0; + + if (arg && get_user(do_now, arg)) + return -EFAULT; + + return autofs4_do_expire_multi(sb, mnt, sbi, do_now); +} + diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c new file mode 100644 index 00000000..c038727b --- /dev/null +++ b/fs/autofs4/init.c @@ -0,0 +1,51 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/init.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include "autofs_i.h" + +static struct dentry *autofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, autofs4_fill_super); +} + +static struct file_system_type autofs_fs_type = { + .owner = THIS_MODULE, + .name = "autofs", + .mount = autofs_mount, + .kill_sb = autofs4_kill_sb, +}; + +static int __init init_autofs4_fs(void) +{ + int err; + + err = register_filesystem(&autofs_fs_type); + if (err) + return err; + + autofs_dev_ioctl_init(); + + return err; +} + +static void __exit exit_autofs4_fs(void) +{ + autofs_dev_ioctl_exit(); + unregister_filesystem(&autofs_fs_type); +} + +module_init(init_autofs4_fs) +module_exit(exit_autofs4_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c new file mode 100644 index 00000000..7c26678e --- /dev/null +++ b/fs/autofs4/inode.c @@ -0,0 +1,353 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/inode.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2005-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. 
+ * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "autofs_i.h" +#include + +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) +{ + struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL); + if (ino) { + INIT_LIST_HEAD(&ino->active); + INIT_LIST_HEAD(&ino->expiring); + ino->last_used = jiffies; + ino->sbi = sbi; + } + return ino; +} + +void autofs4_clean_ino(struct autofs_info *ino) +{ + ino->uid = 0; + ino->gid = 0; + ino->last_used = jiffies; +} + +void autofs4_free_ino(struct autofs_info *ino) +{ + kfree(ino); +} + +void autofs4_kill_sb(struct super_block *sb) +{ + struct autofs_sb_info *sbi = autofs4_sbi(sb); + + /* + * In the event of a failure in get_sb_nodev the superblock + * info is not present so nothing else has been setup, so + * just call kill_anon_super when we are called from + * deactivate_super. + */ + if (!sbi) + goto out_kill_sb; + + /* Free wait queues, close pipe */ + autofs4_catatonic_mode(sbi); + + sb->s_fs_info = NULL; + kfree(sbi); + +out_kill_sb: + DPRINTK("shutting down"); + kill_litter_super(sb); +} + +static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct autofs_sb_info *sbi = autofs4_sbi(mnt->mnt_sb); + struct inode *root_inode = mnt->mnt_sb->s_root->d_inode; + + if (!sbi) + return 0; + + seq_printf(m, ",fd=%d", sbi->pipefd); + if (root_inode->i_uid != 0) + seq_printf(m, ",uid=%u", root_inode->i_uid); + if (root_inode->i_gid != 0) + seq_printf(m, ",gid=%u", root_inode->i_gid); + seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); + seq_printf(m, ",minproto=%d", sbi->min_proto); + seq_printf(m, ",maxproto=%d", sbi->max_proto); + + if (autofs_type_offset(sbi->type)) + seq_printf(m, ",offset"); + else if (autofs_type_direct(sbi->type)) + seq_printf(m, ",direct"); + else + seq_printf(m, ",indirect"); + + return 0; +} + +static void autofs4_evict_inode(struct inode *inode) +{ + end_writeback(inode); + kfree(inode->i_private); +} + +static const struct super_operations autofs4_sops = { + .statfs = simple_statfs, + .show_options = autofs4_show_options, + .evict_inode = autofs4_evict_inode, +}; + +enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, + Opt_indirect, Opt_direct, Opt_offset}; + +static const match_table_t tokens = { + {Opt_fd, "fd=%u"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_pgrp, "pgrp=%u"}, + {Opt_minproto, "minproto=%u"}, + {Opt_maxproto, "maxproto=%u"}, + {Opt_indirect, "indirect"}, + {Opt_direct, "direct"}, + {Opt_offset, "offset"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, + pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + *uid = current_uid(); + *gid = current_gid(); + *pgrp = task_pgrp_nr(current); + + *minproto = AUTOFS_MIN_PROTO_VERSION; + *maxproto = AUTOFS_MAX_PROTO_VERSION; + + *pipefd = -1; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_fd: + if (match_int(args, pipefd)) + return 1; + break; + case Opt_uid: + if (match_int(args, &option)) + return 1; + *uid = option; + break; + case Opt_gid: + if (match_int(args, &option)) + return 1; + *gid = option; + break; + case Opt_pgrp: + if (match_int(args, &option)) + 
return 1; + *pgrp = option; + break; + case Opt_minproto: + if (match_int(args, &option)) + return 1; + *minproto = option; + break; + case Opt_maxproto: + if (match_int(args, &option)) + return 1; + *maxproto = option; + break; + case Opt_indirect: + set_autofs_type_indirect(type); + break; + case Opt_direct: + set_autofs_type_direct(type); + break; + case Opt_offset: + set_autofs_type_offset(type); + break; + default: + return 1; + } + } + return (*pipefd < 0); +} + +int autofs4_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode * root_inode; + struct dentry * root; + struct file * pipe; + int pipefd; + struct autofs_sb_info *sbi; + struct autofs_info *ino; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto fail_unlock; + DPRINTK("starting up, sbi = %p",sbi); + + s->s_fs_info = sbi; + sbi->magic = AUTOFS_SBI_MAGIC; + sbi->pipefd = -1; + sbi->pipe = NULL; + sbi->catatonic = 1; + sbi->exp_timeout = 0; + sbi->oz_pgrp = task_pgrp_nr(current); + sbi->sb = s; + sbi->version = 0; + sbi->sub_version = 0; + set_autofs_type_indirect(&sbi->type); + sbi->min_proto = 0; + sbi->max_proto = 0; + mutex_init(&sbi->wq_mutex); + spin_lock_init(&sbi->fs_lock); + sbi->queues = NULL; + spin_lock_init(&sbi->lookup_lock); + INIT_LIST_HEAD(&sbi->active_list); + INIT_LIST_HEAD(&sbi->expiring_list); + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = AUTOFS_SUPER_MAGIC; + s->s_op = &autofs4_sops; + s->s_d_op = &autofs4_dentry_operations; + s->s_time_gran = 1; + + /* + * Get the root inode and dentry, but defer checking for errors. + */ + ino = autofs4_new_ino(sbi); + if (!ino) + goto fail_free; + root_inode = autofs4_get_inode(s, S_IFDIR | 0755); + if (!root_inode) + goto fail_ino; + + root = d_alloc_root(root_inode); + if (!root) + goto fail_iput; + pipe = NULL; + + root->d_fsdata = ino; + + /* Can this call block? */ + if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, + &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { + printk("autofs: called with bogus options\n"); + goto fail_dput; + } + + if (autofs_type_trigger(sbi->type)) + __managed_dentry_set_managed(root); + + root_inode->i_fop = &autofs4_root_operations; + root_inode->i_op = &autofs4_dir_inode_operations; + + /* Couldn't this be tested earlier? */ + if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || + sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { + printk("autofs: kernel does not match daemon version " + "daemon (%d, %d) kernel (%d, %d)\n", + sbi->min_proto, sbi->max_proto, + AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); + goto fail_dput; + } + + /* Establish highest kernel protocol version */ + if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) + sbi->version = AUTOFS_MAX_PROTO_VERSION; + else + sbi->version = sbi->max_proto; + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; + + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + pipe = fget(pipefd); + + if (!pipe) { + printk("autofs: could not open pipe file descriptor\n"); + goto fail_dput; + } + if (autofs_prepare_pipe(pipe) < 0) + goto fail_fput; + sbi->pipe = pipe; + sbi->pipefd = pipefd; + sbi->catatonic = 0; + + /* + * Success! Install the root dentry now to indicate completion. + */ + s->s_root = root; + return 0; + + /* + * Failure ... clean up. 
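+	 * Each fail_* label below releases only the resources that were
+	 * successfully set up before the failure that jumps to it.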
+ */ +fail_fput: + printk("autofs: pipe file descriptor does not contain proper ops\n"); + fput(pipe); + /* fall through */ +fail_dput: + dput(root); + goto fail_free; +fail_iput: + printk("autofs: get root dentry failed\n"); + iput(root_inode); +fail_ino: + kfree(ino); +fail_free: + kfree(sbi); + s->s_fs_info = NULL; +fail_unlock: + return -EINVAL; +} + +struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) +{ + struct inode *inode = new_inode(sb); + + if (inode == NULL) + return NULL; + + inode->i_mode = mode; + if (sb->s_root) { + inode->i_uid = sb->s_root->d_inode->i_uid; + inode->i_gid = sb->s_root->d_inode->i_gid; + } + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = get_next_ino(); + + if (S_ISDIR(mode)) { + inode->i_nlink = 2; + inode->i_op = &autofs4_dir_inode_operations; + inode->i_fop = &autofs4_dir_operations; + } else if (S_ISLNK(mode)) { + inode->i_op = &autofs4_symlink_inode_operations; + } + + return inode; +} diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c new file mode 100644 index 00000000..f55ae23b --- /dev/null +++ b/fs/autofs4/root.c @@ -0,0 +1,895 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/root.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "autofs_i.h" + +static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); +static int autofs4_dir_unlink(struct inode *,struct dentry *); +static int autofs4_dir_rmdir(struct inode *,struct dentry *); +static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); +static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); +#ifdef CONFIG_COMPAT +static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); +#endif +static int autofs4_dir_open(struct inode *inode, struct file *file); +static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); +static struct vfsmount *autofs4_d_automount(struct path *); +static int autofs4_d_manage(struct dentry *, bool); +static void autofs4_dentry_release(struct dentry *); + +const struct file_operations autofs4_root_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .read = generic_read_dir, + .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, + .unlocked_ioctl = autofs4_root_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = autofs4_root_compat_ioctl, +#endif +}; + +const struct file_operations autofs4_dir_operations = { + .open = autofs4_dir_open, + .release = dcache_dir_close, + .read = generic_read_dir, + .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, +}; + +const struct inode_operations autofs4_dir_inode_operations = { + .lookup = autofs4_lookup, + .unlink = autofs4_dir_unlink, + .symlink = autofs4_dir_symlink, + .mkdir = autofs4_dir_mkdir, + .rmdir = autofs4_dir_rmdir, +}; + +const struct dentry_operations autofs4_dentry_operations = { + .d_automount = autofs4_d_automount, + .d_manage = autofs4_d_manage, + .d_release = autofs4_dentry_release, +}; + +static void 
autofs4_add_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!ino->active_count) { + if (list_empty(&ino->active)) + list_add(&ino->active, &sbi->active_list); + } + ino->active_count++; + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static void autofs4_del_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + ino->active_count--; + if (!ino->active_count) { + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + } + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static int autofs4_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + DPRINTK("file=%p dentry=%p %.*s", + file, dentry, dentry->d_name.len, dentry->d_name.name); + + if (autofs4_oz_mode(sbi)) + goto out; + + /* + * An empty directory in an autofs file system is always a + * mount point. The daemon must have failed to mount this + * during lookup so it doesn't exist. This can happen, for + * example, if user space returns an incorrect status for a + * mount request. Otherwise we're doing a readdir on the + * autofs file system so just let the libfs routines handle + * it. + */ + spin_lock(&sbi->lookup_lock); + spin_lock(&dentry->d_lock); + if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + return -ENOENT; + } + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + +out: + return dcache_dir_open(inode, file); +} + +static void autofs4_dentry_release(struct dentry *de) +{ + struct autofs_info *ino = autofs4_dentry_ino(de); + struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); + + DPRINTK("releasing %p", de); + + if (!ino) + return; + + if (sbi) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del(&ino->active); + if (!list_empty(&ino->expiring)) + list_del(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + } + + autofs4_free_ino(ino); +} + +static struct dentry *autofs4_lookup_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + struct qstr *name = &dentry->d_name; + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + spin_lock(&sbi->lookup_lock); + head = &sbi->active_list; + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *active; + struct qstr *qstr; + + ino = list_entry(p, struct autofs_info, active); + active = ino->dentry; + + spin_lock(&active->d_lock); + + /* Already gone? 
*/ + if (active->d_count == 0) + goto next; + + qstr = &active->d_name; + + if (active->d_name.hash != hash) + goto next; + if (active->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(active)) { + dget_dlock(active); + spin_unlock(&active->d_lock); + spin_unlock(&sbi->lookup_lock); + return active; + } +next: + spin_unlock(&active->d_lock); + } + spin_unlock(&sbi->lookup_lock); + + return NULL; +} + +static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + struct qstr *name = &dentry->d_name; + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + spin_lock(&sbi->lookup_lock); + head = &sbi->expiring_list; + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *expiring; + struct qstr *qstr; + + ino = list_entry(p, struct autofs_info, expiring); + expiring = ino->dentry; + + spin_lock(&expiring->d_lock); + + /* Bad luck, we've already been dentry_iput */ + if (!expiring->d_inode) + goto next; + + qstr = &expiring->d_name; + + if (expiring->d_name.hash != hash) + goto next; + if (expiring->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(expiring)) { + dget_dlock(expiring); + spin_unlock(&expiring->d_lock); + spin_unlock(&sbi->lookup_lock); + return expiring; + } +next: + spin_unlock(&expiring->d_lock); + } + spin_unlock(&sbi->lookup_lock); + + return NULL; +} + +static int autofs4_mount_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status = 0; + + if (ino->flags & AUTOFS_INF_PENDING) { + DPRINTK("waiting for mount name=%.*s", + dentry->d_name.len, dentry->d_name.name); + status = autofs4_wait(sbi, dentry, NFY_MOUNT); + DPRINTK("mount wait done status=%d", status); + } + ino->last_used = jiffies; + return status; +} + +static int do_expire_wait(struct dentry *dentry) +{ + struct dentry *expiring; + + expiring = autofs4_lookup_expiring(dentry); + if (!expiring) + return autofs4_expire_wait(dentry); + else { + /* + * If we are racing with expire the request might not + * be quite complete, but the directory has been removed + * so it must have been successful, just wait for it. + */ + autofs4_expire_wait(expiring); + autofs4_del_expiring(expiring); + dput(expiring); + } + return 0; +} + +static struct dentry *autofs4_mountpoint_changed(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + /* + * If this is an indirect mount the dentry could have gone away + * as a result of an expire and a new one created. 
+ */ + if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; + struct dentry *new = d_lookup(parent, &dentry->d_name); + if (!new) + return NULL; + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; + } + return path->dentry; +} + +static struct vfsmount *autofs4_d_automount(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* The daemon never triggers a mount. */ + if (autofs4_oz_mode(sbi)) + return NULL; + + /* + * If an expire request is pending everyone must wait. + * If the expire fails we're still mounted so continue + * the follow and return. A return of -EAGAIN (which only + * happens with indirect mounts) means the expire completed + * and the directory was removed, so just go ahead and try + * the mount. + */ + status = do_expire_wait(dentry); + if (status && status != -EAGAIN) + return NULL; + + /* Callback to the daemon to perform the mount or wait */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + goto done; + } + + /* + * If the dentry is a symlink it's equivalent to a directory + * having d_mountpoint() true, so there's no need to call back + * to the daemon. + */ + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) + goto done; + if (!d_mountpoint(dentry)) { + /* + * It's possible that user space hasn't removed directories + * after umounting a rootless multi-mount, although it + * should. For v5 have_submounts() is sufficient to handle + * this because the leaves of the directory tree under the + * mount never trigger mounts themselves (they have an autofs + * trigger mount mounted on them). But v4 pseudo direct mounts + * do need the leaves to to trigger mounts. In this case we + * have no choice but to use the list_empty() check and + * require user space behave. + */ + if (sbi->version > 4) { + if (have_submounts(dentry)) + goto done; + } else { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + goto done; + } + spin_unlock(&dentry->d_lock); + } + ino->flags |= AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + } +done: + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { + /* + * Any needed mounting has been completed and the path + * updated so clear DCACHE_NEED_AUTOMOUNT so we don't + * call ->d_automount() on rootless multi-mounts since + * it can lead to an incorrect ELOOP error return. + * + * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and + * symlinks as in all other cases the dentry will be covered by + * an actual mount so ->d_automount() won't be called during + * the follow. 
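+	 *
+	 * (DCACHE_NEED_AUTOMOUNT is what makes the VFS call ->d_automount()
+	 * from follow_managed() during a path walk, so clearing it here
+	 * simply stops further automount callbacks for this dentry.)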
+ */ + spin_lock(&dentry->d_lock); + if ((!d_mountpoint(dentry) && + !list_empty(&dentry->d_subdirs)) || + (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); + } + spin_unlock(&sbi->fs_lock); + + /* Mount succeeded, check if we ended up with a new dentry */ + dentry = autofs4_mountpoint_changed(path); + if (!dentry) + return ERR_PTR(-ENOENT); + + return NULL; +} + +int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* The daemon never waits. */ + if (autofs4_oz_mode(sbi)) { + if (rcu_walk) + return 0; + if (!d_mountpoint(dentry)) + return -EISDIR; + return 0; + } + + /* We need to sleep, so we need pathwalk to be in ref-mode */ + if (rcu_walk) + return -ECHILD; + + /* Wait for pending expires */ + do_expire_wait(dentry); + + /* + * This dentry may be under construction so wait on mount + * completion. + */ + return autofs4_mount_wait(dentry); +} + +/* Lookups in the root directory */ +static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct autofs_sb_info *sbi; + struct autofs_info *ino; + struct dentry *active; + + DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name); + + /* File name too long to exist */ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + sbi = autofs4_sbi(dir->i_sb); + + DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", + current->pid, task_pgrp_nr(current), sbi->catatonic, + autofs4_oz_mode(sbi)); + + active = autofs4_lookup_active(dentry); + if (active) { + return active; + } else { + /* + * A dentry that is not within the root can never trigger a + * mount operation, unless the directory already exists, so we + * can return fail immediately. The daemon however does need + * to create directories within the file system. + */ + if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + return ERR_PTR(-ENOENT); + + /* Mark entries in the root as mount triggers */ + if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) + __managed_dentry_set_managed(dentry); + + ino = autofs4_new_ino(sbi); + if (!ino) + return ERR_PTR(-ENOMEM); + + dentry->d_fsdata = ino; + ino->dentry = dentry; + + autofs4_add_active(dentry); + + d_instantiate(dentry, NULL); + } + return NULL; +} + +static int autofs4_dir_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + struct inode *inode; + size_t size = strlen(symname); + char *cp; + + DPRINTK("%s <- %.*s", symname, + dentry->d_name.len, dentry->d_name.name); + + if (!autofs4_oz_mode(sbi)) + return -EACCES; + + BUG_ON(!ino); + + autofs4_clean_ino(ino); + + autofs4_del_active(dentry); + + cp = kmalloc(size + 1, GFP_KERNEL); + if (!cp) + return -ENOMEM; + + strcpy(cp, symname); + + inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); + if (!inode) { + kfree(cp); + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; + } + inode->i_private = cp; + inode->i_size = size; + d_add(dentry, inode); + + dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_inc(&p_ino->count); + + dir->i_mtime = CURRENT_TIME; + + return 0; +} + +/* + * NOTE! 
+ * + * Normal filesystems would do a "d_delete()" to tell the VFS dcache + * that the file no longer exists. However, doing that means that the + * VFS layer can turn the dentry into a negative dentry. We don't want + * this, because the unlink is probably the result of an expire. + * We simply d_drop it and add it to a expiring list in the super block, + * which allows the dentry lookup to check for an incomplete expire. + * + * If a process is blocked on the dentry waiting for the expire to finish, + * it will invalidate the dentry and try to mount with a new one. + * + * Also see autofs4_dir_rmdir().. + */ +static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + + /* This allows root to remove symlinks */ + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_dec(&p_ino->count); + } + dput(ino->dentry); + + dentry->d_inode->i_size = 0; + clear_nlink(dentry->d_inode); + + dir->i_mtime = CURRENT_TIME; + + spin_lock(&sbi->lookup_lock); + __autofs4_add_expiring(dentry); + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + + return 0; +} + +/* + * Version 4 of autofs provides a pseudo direct mount implementation + * that relies on directories at the leaves of a directory tree under + * an indirect mount to trigger mounts. To allow for this we need to + * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves + * of the directory tree. There is no need to clear the automount flag + * following a mount or restore it after an expire because these mounts + * are always covered. However, it is necessary to ensure that these + * flags are clear on non-empty directories to avoid unnecessary calls + * during path walks. 
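+ *
+ * For example (illustrative layout only): a v4 multi-mount entry for a
+ * key "pkg" in an indirect mount on /mnt, with offsets /usr and
+ * /usr/bin, ends up with the trigger flags set on the leaf
+ * /mnt/pkg/usr/bin, while the intermediate /mnt/pkg/usr has them
+ * cleared again once it gains a child, so path walks only call back
+ * into autofs at the leaves.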
+ */ +static void autofs_set_leaf_automount_flags(struct dentry *dentry) +{ + struct dentry *parent; + + /* root and dentrys in the root are already handled */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_set_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + managed_dentry_clear_managed(parent); + return; +} + +static void autofs_clear_leaf_automount_flags(struct dentry *dentry) +{ + struct list_head *d_child; + struct dentry *parent; + + /* flags for dentrys in the root are handled elsewhere */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_clear_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + d_child = &dentry->d_u.d_child; + /* Set parent managed if it's becoming empty */ + if (d_child->next == &parent->d_subdirs && + d_child->prev == &parent->d_subdirs) + managed_dentry_set_managed(parent); + return; +} + +static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + + DPRINTK("dentry %p, removing %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + if (!autofs4_oz_mode(sbi)) + return -EACCES; + + spin_lock(&sbi->lookup_lock); + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + return -ENOTEMPTY; + } + __autofs4_add_expiring(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + + if (sbi->version < 5) + autofs_clear_leaf_automount_flags(dentry); + + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_dec(&p_ino->count); + } + dput(ino->dentry); + dentry->d_inode->i_size = 0; + clear_nlink(dentry->d_inode); + + if (dir->i_nlink) + drop_nlink(dir); + + return 0; +} + +static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *p_ino; + struct inode *inode; + + if (!autofs4_oz_mode(sbi)) + return -EACCES; + + DPRINTK("dentry %p, creating %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + BUG_ON(!ino); + + autofs4_clean_ino(ino); + + autofs4_del_active(dentry); + + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + if (!inode) + return -ENOMEM; + d_add(dentry, inode); + + if (sbi->version < 5) + autofs_set_leaf_automount_flags(dentry); + + dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs4_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_inc(&p_ino->count); + inc_nlink(dir); + dir->i_mtime = CURRENT_TIME; + + return 0; +} + +/* Get/set timeout ioctl() operation */ +#ifdef CONFIG_COMPAT +static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, + compat_ulong_t __user *p) +{ + int rv; + unsigned long ntimeout; + + if ((rv = get_user(ntimeout, p)) || + (rv = put_user(sbi->exp_timeout/HZ, p))) + return rv; + + if (ntimeout > UINT_MAX/HZ) + sbi->exp_timeout = 0; + else + sbi->exp_timeout = ntimeout * HZ; + + return 0; +} +#endif + +static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, + unsigned long __user *p) +{ + int rv; + unsigned long 
ntimeout; + + if ((rv = get_user(ntimeout, p)) || + (rv = put_user(sbi->exp_timeout/HZ, p))) + return rv; + + if (ntimeout > ULONG_MAX/HZ) + sbi->exp_timeout = 0; + else + sbi->exp_timeout = ntimeout * HZ; + + return 0; +} + +/* Return protocol version */ +static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p) +{ + return put_user(sbi->version, p); +} + +/* Return protocol sub version */ +static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p) +{ + return put_user(sbi->sub_version, p); +} + +/* +* Tells the daemon whether it can umount the autofs mount. +*/ +static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) +{ + int status = 0; + + if (may_umount(mnt)) + status = 1; + + DPRINTK("returning %d", status); + + status = put_user(status, p); + + return status; +} + +/* Identify autofs4_dentries - this is so we can tell if there's + an extra dentry refcount or not. We only hold a refcount on the + dentry if its non-negative (ie, d_inode != NULL) +*/ +int is_autofs4_dentry(struct dentry *dentry) +{ + return dentry && dentry->d_inode && + dentry->d_op == &autofs4_dentry_operations && + dentry->d_fsdata != NULL; +} + +/* + * ioctl()'s on the root directory is the chief method for the daemon to + * generate kernel reactions + */ +static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); + void __user *p = (void __user *)arg; + + DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", + cmd,arg,sbi,task_pgrp_nr(current)); + + if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || + _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) + return -ENOTTY; + + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch(cmd) { + case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ + return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0); + case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ + return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT); + case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ + autofs4_catatonic_mode(sbi); + return 0; + case AUTOFS_IOC_PROTOVER: /* Get protocol version */ + return autofs4_get_protover(sbi, p); + case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ + return autofs4_get_protosubver(sbi, p); + case AUTOFS_IOC_SETTIMEOUT: + return autofs4_get_set_timeout(sbi, p); +#ifdef CONFIG_COMPAT + case AUTOFS_IOC_SETTIMEOUT32: + return autofs4_compat_get_set_timeout(sbi, p); +#endif + + case AUTOFS_IOC_ASKUMOUNT: + return autofs4_ask_umount(filp->f_path.mnt, p); + + /* return a single thing to expire */ + case AUTOFS_IOC_EXPIRE: + return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p); + /* same as above, but can send multiple expires through pipe */ + case AUTOFS_IOC_EXPIRE_MULTI: + return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p); + + default: + return -ENOSYS; + } +} + +static long autofs4_root_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); +} + +#ifdef CONFIG_COMPAT +static long autofs4_root_compat_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret; + + if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) + ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + else + ret 
= autofs4_root_ioctl_unlocked(inode, filp, cmd, + (unsigned long)compat_ptr(arg)); + + return ret; +} +#endif diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c new file mode 100644 index 00000000..f27c094a --- /dev/null +++ b/fs/autofs4/symlink.c @@ -0,0 +1,24 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/symlink.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include "autofs_i.h" + +static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, dentry->d_inode->i_private); + return NULL; +} + +const struct inode_operations autofs4_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = autofs4_follow_link +}; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c new file mode 100644 index 00000000..813ea10f --- /dev/null +++ b/fs/autofs4/waitq.c @@ -0,0 +1,552 @@ +/* -*- c -*- --------------------------------------------------------------- * + * + * linux/fs/autofs/waitq.c + * + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "autofs_i.h" + +/* We make this a static variable rather than a part of the superblock; it + is better if we don't reassign numbers easily even across filesystems */ +static autofs_wqt_t autofs4_next_wait_queue = 1; + +/* These are the signals we allow interrupting a pending mount */ +#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT)) + +void autofs4_catatonic_mode(struct autofs_sb_info *sbi) +{ + struct autofs_wait_queue *wq, *nwq; + + mutex_lock(&sbi->wq_mutex); + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } + + DPRINTK("entering catatonic mode"); + + sbi->catatonic = 1; + wq = sbi->queues; + sbi->queues = NULL; /* Erase all wait queues */ + while (wq) { + nwq = wq->next; + wq->status = -ENOENT; /* Magic is gone - report failure */ + if (wq->name.name) { + kfree(wq->name.name); + wq->name.name = NULL; + } + wq->wait_ctr--; + wake_up_interruptible(&wq->queue); + wq = nwq; + } + fput(sbi->pipe); /* Close the pipe */ + sbi->pipe = NULL; + sbi->pipefd = -1; + mutex_unlock(&sbi->wq_mutex); +} + +static int autofs4_write(struct file *file, const void *addr, int bytes) +{ + unsigned long sigpipe, flags; + mm_segment_t fs; + const char *data = (const char *)addr; + ssize_t wr = 0; + + /** WARNING: this is not safe for writing more than PIPE_BUF bytes! 
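+	 ** (pipe writes of at most PIPE_BUF bytes are atomic, and all the
+	 ** packets written by autofs4_notify_daemon() below are much
+	 ** smaller than PIPE_BUF, so they reach the daemon without being
+	 ** interleaved with other packets)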
**/
+
+	sigpipe = sigismember(&current->pending.signal, SIGPIPE);
+
+	/* Save pointer to user space and point back to kernel space */
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	while (bytes &&
+	       (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
+		data += wr;
+		bytes -= wr;
+	}
+
+	set_fs(fs);
+
+	/* Keep the currently executing process from receiving a
+	   SIGPIPE unless it was already supposed to get one */
+	if (wr == -EPIPE && !sigpipe) {
+		spin_lock_irqsave(&current->sighand->siglock, flags);
+		sigdelset(&current->pending.signal, SIGPIPE);
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	}
+
+	return (bytes > 0);
+}
+
+static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
+				 struct autofs_wait_queue *wq,
+				 int type)
+{
+	union {
+		struct autofs_packet_hdr hdr;
+		union autofs_packet_union v4_pkt;
+		union autofs_v5_packet_union v5_pkt;
+	} pkt;
+	struct file *pipe = NULL;
+	size_t pktsz;
+
+	DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
+		wq->wait_queue_token, wq->name.len, wq->name.name, type);
+
+	memset(&pkt,0,sizeof pkt); /* For security reasons */
+
+	pkt.hdr.proto_version = sbi->version;
+	pkt.hdr.type = type;
+	switch (type) {
+	/* Kernel protocol v4 missing and expire packets */
+	case autofs_ptype_missing:
+	{
+		struct autofs_packet_missing *mp = &pkt.v4_pkt.missing;
+
+		pktsz = sizeof(*mp);
+
+		mp->wait_queue_token = wq->wait_queue_token;
+		mp->len = wq->name.len;
+		memcpy(mp->name, wq->name.name, wq->name.len);
+		mp->name[wq->name.len] = '\0';
+		break;
+	}
+	case autofs_ptype_expire_multi:
+	{
+		struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
+
+		pktsz = sizeof(*ep);
+
+		ep->wait_queue_token = wq->wait_queue_token;
+		ep->len = wq->name.len;
+		memcpy(ep->name, wq->name.name, wq->name.len);
+		ep->name[wq->name.len] = '\0';
+		break;
+	}
+	/*
+	 * Kernel protocol v5 packet for handling indirect and direct
+	 * mount missing and expire requests
+	 */
+	case autofs_ptype_missing_indirect:
+	case autofs_ptype_expire_indirect:
+	case autofs_ptype_missing_direct:
+	case autofs_ptype_expire_direct:
+	{
+		struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
+
+		pktsz = sizeof(*packet);
+
+		packet->wait_queue_token = wq->wait_queue_token;
+		packet->len = wq->name.len;
+		memcpy(packet->name, wq->name.name, wq->name.len);
+		packet->name[wq->name.len] = '\0';
+		packet->dev = wq->dev;
+		packet->ino = wq->ino;
+		packet->uid = wq->uid;
+		packet->gid = wq->gid;
+		packet->pid = wq->pid;
+		packet->tgid = wq->tgid;
+		break;
+	}
+	default:
+		printk("autofs4_notify_daemon: bad type %d!\n", type);
+		return;
+	}
+
+	/* Check if we have become catatonic */
+	mutex_lock(&sbi->wq_mutex);
+	if (!sbi->catatonic) {
+		pipe = sbi->pipe;
+		get_file(pipe);
+	}
+	mutex_unlock(&sbi->wq_mutex);
+
+	if (pipe) {
+		if (autofs4_write(pipe, &pkt, pktsz))
+			autofs4_catatonic_mode(sbi);
+		fput(pipe);
+	}
+}
+
+static int autofs4_getpath(struct autofs_sb_info *sbi,
+			   struct dentry *dentry, char **name)
+{
+	struct dentry *root = sbi->sb->s_root;
+	struct dentry *tmp;
+	char *buf;
+	char *p;
+	int len;
+	unsigned seq;
+
+rename_retry:
+	buf = *name;
+	len = 0;
+
+	seq = read_seqbegin(&rename_lock);
+	rcu_read_lock();
+	spin_lock(&sbi->fs_lock);
+	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
+		len += tmp->d_name.len + 1;
+
+	if (!len || --len > NAME_MAX) {
+		spin_unlock(&sbi->fs_lock);
+		rcu_read_unlock();
+		if (read_seqretry(&rename_lock, seq))
+			goto rename_retry;
+		return 0;
+	}
+
+	*(buf + len) = '\0';
+	p = buf + len - dentry->d_name.len;
+	strncpy(p, dentry->d_name.name, dentry->d_name.len);
dentry->d_name.name, dentry->d_name.len); + + for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { + *(--p) = '/'; + p -= tmp->d_name.len; + strncpy(p, tmp->d_name.name, tmp->d_name.len); + } + spin_unlock(&sbi->fs_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; + + return len; +} + +static struct autofs_wait_queue * +autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) +{ + struct autofs_wait_queue *wq; + + for (wq = sbi->queues; wq; wq = wq->next) { + if (wq->name.hash == qstr->hash && + wq->name.len == qstr->len && + wq->name.name && + !memcmp(wq->name.name, qstr->name, qstr->len)) + break; + } + return wq; +} + +/* + * Check if we have a valid request. + * Returns + * 1 if the request should continue. + * In this case we can return an autofs_wait_queue entry if one is + * found or NULL to idicate a new wait needs to be created. + * 0 or a negative errno if the request shouldn't continue. + */ +static int validate_request(struct autofs_wait_queue **wait, + struct autofs_sb_info *sbi, + struct qstr *qstr, + struct dentry*dentry, enum autofs_notify notify) +{ + struct autofs_wait_queue *wq; + struct autofs_info *ino; + + /* Wait in progress, continue; */ + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + + *wait = NULL; + + /* If we don't yet have any info this is a new request */ + ino = autofs4_dentry_ino(dentry); + if (!ino) + return 1; + + /* + * If we've been asked to wait on an existing expire (NFY_NONE) + * but there is no wait in the queue ... + */ + if (notify == NFY_NONE) { + /* + * Either we've betean the pending expire to post it's + * wait or it finished while we waited on the mutex. + * So we need to wait till either, the wait appears + * or the expire finishes. + */ + + while (ino->flags & AUTOFS_INF_EXPIRING) { + mutex_unlock(&sbi->wq_mutex); + schedule_timeout_interruptible(HZ/10); + if (mutex_lock_interruptible(&sbi->wq_mutex)) + return -EINTR; + + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + } + + /* + * Not ideal but the status has already gone. Of the two + * cases where we wait on NFY_NONE neither depend on the + * return status of the wait. + */ + return 0; + } + + /* + * If we've been asked to trigger a mount and the request + * completed while we waited on the mutex ... + */ + if (notify == NFY_MOUNT) { + struct dentry *new = NULL; + int valid = 1; + + /* + * If the dentry was successfully mounted while we slept + * on the wait queue mutex we can return success. If it + * isn't mounted (doesn't have submounts for the case of + * a multi-mount with no mount at it's base) we can + * continue on and create a new request. + */ + if (!IS_ROOT(dentry)) { + if (dentry->d_inode && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + new = d_lookup(parent, &dentry->d_name); + if (new) + dentry = new; + } + } + if (have_submounts(dentry)) + valid = 0; + + if (new) + dput(new); + return valid; + } + + return 1; +} + +int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, + enum autofs_notify notify) +{ + struct autofs_wait_queue *wq; + struct qstr qstr; + char *name; + int status, ret, type; + + /* In catatonic mode, we don't wait for nobody */ + if (sbi->catatonic) + return -ENOENT; + + if (!dentry->d_inode) { + /* + * A wait for a negative dentry is invalid for certain + * cases. 
A direct or offset mount "always" has its mount + * point directory created and so the request dentry must + * be positive or the map key doesn't exist. The situation + * is very similar for indirect mounts except only dentrys + * in the root of the autofs file system may be negative. + */ + if (autofs_type_trigger(sbi->type)) + return -ENOENT; + else if (!IS_ROOT(dentry->d_parent)) + return -ENOENT; + } + + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + /* If this is a direct mount request create a dummy name */ + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) + qstr.len = sprintf(name, "%p", dentry); + else { + qstr.len = autofs4_getpath(sbi, dentry, &name); + if (!qstr.len) { + kfree(name); + return -ENOENT; + } + } + qstr.name = name; + qstr.hash = full_name_hash(name, qstr.len); + + if (mutex_lock_interruptible(&sbi->wq_mutex)) { + kfree(qstr.name); + return -EINTR; + } + + ret = validate_request(&wq, sbi, &qstr, dentry, notify); + if (ret <= 0) { + if (ret == 0) + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + return ret; + } + + if (!wq) { + /* Create a new wait queue */ + wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); + if (!wq) { + kfree(qstr.name); + mutex_unlock(&sbi->wq_mutex); + return -ENOMEM; + } + + wq->wait_queue_token = autofs4_next_wait_queue; + if (++autofs4_next_wait_queue == 0) + autofs4_next_wait_queue = 1; + wq->next = sbi->queues; + sbi->queues = wq; + init_waitqueue_head(&wq->queue); + memcpy(&wq->name, &qstr, sizeof(struct qstr)); + wq->dev = autofs4_get_dev(sbi); + wq->ino = autofs4_get_ino(sbi); + wq->uid = current_uid(); + wq->gid = current_gid(); + wq->pid = current->pid; + wq->tgid = current->tgid; + wq->status = -EINTR; /* Status return if interrupted */ + wq->wait_ctr = 2; + mutex_unlock(&sbi->wq_mutex); + + if (sbi->version < 5) { + if (notify == NFY_MOUNT) + type = autofs_ptype_missing; + else + type = autofs_ptype_expire_multi; + } else { + if (notify == NFY_MOUNT) + type = autofs_type_trigger(sbi->type) ? + autofs_ptype_missing_direct : + autofs_ptype_missing_indirect; + else + type = autofs_type_trigger(sbi->type) ? + autofs_ptype_expire_direct : + autofs_ptype_expire_indirect; + } + + DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); + + /* autofs4_notify_daemon() may block */ + autofs4_notify_daemon(sbi, wq, type); + } else { + wq->wait_ctr++; + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); + } + + /* + * wq->name.name is NULL iff the lock is already released + * or the mount has been made catatonic. + */ + if (wq->name.name) { + /* Block all but "shutdown" signals while waiting */ + sigset_t oldset; + unsigned long irqflags; + + spin_lock_irqsave(¤t->sighand->siglock, irqflags); + oldset = current->blocked; + siginitsetinv(¤t->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); + + wait_event_interruptible(wq->queue, wq->name.name == NULL); + + spin_lock_irqsave(¤t->sighand->siglock, irqflags); + current->blocked = oldset; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); + } else { + DPRINTK("skipped sleeping"); + } + + status = wq->status; + + /* + * For direct and offset mounts we need to track the requester's + * uid and gid in the dentry info struct. 
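 *
 * (For reference, the user-space half of this handshake is roughly the
 * following sketch; it is not part of this patch. pipefd is the pipe the
 * daemon handed over at mount time, ioctlfd is a descriptor opened on the
 * automount point, and do_mount() stands in for however the daemon
 * satisfies the request; an expire packet would be answered by unmounting
 * instead. The packet layout comes from <linux/auto_fs4.h>.
 *
 *	struct autofs_v5_packet pkt;
 *
 *	if (read(pipefd, &pkt, sizeof(pkt)) > 0) {
 *		int ok = (do_mount(pkt.name) == 0);
 *
 *		ioctl(ioctlfd, ok ? AUTOFS_IOC_READY : AUTOFS_IOC_FAIL,
 *		      pkt.wait_queue_token);
 *	}
 *
 * The ioctl ends up in autofs4_wait_release() below, which records the
 * status for this token and wakes the waiter.)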
This is so it can be + * supplied, on request, by the misc device ioctl interface. + * This is needed during daemon resatart when reconnecting + * to existing, active, autofs mounts. The uid and gid (and + * related string values) may be used for macro substitution + * in autofs mount maps. + */ + if (!status) { + struct autofs_info *ino; + struct dentry *de = NULL; + + /* direct mount or browsable map */ + ino = autofs4_dentry_ino(dentry); + if (!ino) { + /* If not lookup actual dentry used */ + de = d_lookup(dentry->d_parent, &dentry->d_name); + if (de) + ino = autofs4_dentry_ino(de); + } + + /* Set mount requester */ + if (ino) { + spin_lock(&sbi->fs_lock); + ino->uid = wq->uid; + ino->gid = wq->gid; + spin_unlock(&sbi->fs_lock); + } + + if (de) + dput(de); + } + + /* Are we the last process to need status? */ + mutex_lock(&sbi->wq_mutex); + if (!--wq->wait_ctr) + kfree(wq); + mutex_unlock(&sbi->wq_mutex); + + return status; +} + + +int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +{ + struct autofs_wait_queue *wq, **wql; + + mutex_lock(&sbi->wq_mutex); + for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { + if (wq->wait_queue_token == wait_queue_token) + break; + } + + if (!wq) { + mutex_unlock(&sbi->wq_mutex); + return -EINVAL; + } + + *wql = wq->next; /* Unlink from chain */ + kfree(wq->name.name); + wq->name.name = NULL; /* Do not wait on this queue */ + wq->status = status; + wake_up_interruptible(&wq->queue); + if (!--wq->wait_ctr) + kfree(wq); + mutex_unlock(&sbi->wq_mutex); + + return 0; +} + diff --git a/fs/bad_inode.c b/fs/bad_inode.c new file mode 100644 index 00000000..bfcb18fe --- /dev/null +++ b/fs/bad_inode.c @@ -0,0 +1,360 @@ +/* + * linux/fs/bad_inode.c + * + * Copyright (C) 1997, Stephen Tweedie + * + * Provide stub functions for unreadable inodes + * + * Fabian Frederick : August 2003 - All file operations assigned to EIO + */ + +#include +#include +#include +#include +#include +#include + + +static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin) +{ + return -EIO; +} + +static ssize_t bad_file_read(struct file *filp, char __user *buf, + size_t size, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_write(struct file *filp, const char __user *buf, + size_t siz, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return -EIO; +} + +static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return -EIO; +} + +static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + return -EIO; +} + +static unsigned int bad_file_poll(struct file *filp, poll_table *wait) +{ + return POLLERR; +} + +static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + return -EIO; +} + +static long bad_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return -EIO; +} + +static int bad_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + return -EIO; +} + +static int bad_file_open(struct inode *inode, struct file *filp) +{ + return -EIO; +} + +static int bad_file_flush(struct file *file, fl_owner_t id) +{ + return -EIO; +} + +static int bad_file_release(struct inode *inode, struct file *filp) +{ + return -EIO; +} + +static int bad_file_fsync(struct file *file, int datasync) +{ + return -EIO; +} + +static int bad_file_aio_fsync(struct kiocb 
*iocb, int datasync) +{ + return -EIO; +} + +static int bad_file_fasync(int fd, struct file *filp, int on) +{ + return -EIO; +} + +static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + return -EIO; +} + +static ssize_t bad_file_sendpage(struct file *file, struct page *page, + int off, size_t len, loff_t *pos, int more) +{ + return -EIO; +} + +static unsigned long bad_file_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return -EIO; +} + +static int bad_file_check_flags(int flags) +{ + return -EIO; +} + +static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) +{ + return -EIO; +} + +static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags) +{ + return -EIO; +} + +static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + return -EIO; +} + +static const struct file_operations bad_file_ops = +{ + .llseek = bad_file_llseek, + .read = bad_file_read, + .write = bad_file_write, + .aio_read = bad_file_aio_read, + .aio_write = bad_file_aio_write, + .readdir = bad_file_readdir, + .poll = bad_file_poll, + .unlocked_ioctl = bad_file_unlocked_ioctl, + .compat_ioctl = bad_file_compat_ioctl, + .mmap = bad_file_mmap, + .open = bad_file_open, + .flush = bad_file_flush, + .release = bad_file_release, + .fsync = bad_file_fsync, + .aio_fsync = bad_file_aio_fsync, + .fasync = bad_file_fasync, + .lock = bad_file_lock, + .sendpage = bad_file_sendpage, + .get_unmapped_area = bad_file_get_unmapped_area, + .check_flags = bad_file_check_flags, + .flock = bad_file_flock, + .splice_write = bad_file_splice_write, + .splice_read = bad_file_splice_read, +}; + +static int bad_inode_create (struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + return -EIO; +} + +static struct dentry *bad_inode_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + return ERR_PTR(-EIO); +} + +static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, + const char *symname) +{ + return -EIO; +} + +static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, + int mode) +{ + return -EIO; +} + +static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + return -EIO; +} + +static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + return -EIO; +} + +static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + return -EIO; +} + +static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) +{ + return -EIO; +} + +static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + return -EIO; +} + +static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) +{ + return -EIO; +} + +static int bad_inode_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EIO; +} + +static ssize_t bad_inode_getxattr(struct dentry *dentry, const 
char *name, + void *buffer, size_t size) +{ + return -EIO; +} + +static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size) +{ + return -EIO; +} + +static int bad_inode_removexattr(struct dentry *dentry, const char *name) +{ + return -EIO; +} + +static const struct inode_operations bad_inode_ops = +{ + .create = bad_inode_create, + .lookup = bad_inode_lookup, + .link = bad_inode_link, + .unlink = bad_inode_unlink, + .symlink = bad_inode_symlink, + .mkdir = bad_inode_mkdir, + .rmdir = bad_inode_rmdir, + .mknod = bad_inode_mknod, + .rename = bad_inode_rename, + .readlink = bad_inode_readlink, + /* follow_link must be no-op, otherwise unmounting this inode + won't work */ + /* put_link returns void */ + /* truncate returns void */ + .permission = bad_inode_permission, + .getattr = bad_inode_getattr, + .setattr = bad_inode_setattr, + .setxattr = bad_inode_setxattr, + .getxattr = bad_inode_getxattr, + .listxattr = bad_inode_listxattr, + .removexattr = bad_inode_removexattr, + /* truncate_range returns void */ +}; + + +/* + * When a filesystem is unable to read an inode due to an I/O error in + * its read_inode() function, it can call make_bad_inode() to return a + * set of stubs which will return EIO errors as required. + * + * We only need to do limited initialisation: all other fields are + * preinitialised to zero automatically. + */ + +/** + * make_bad_inode - mark an inode bad due to an I/O error + * @inode: Inode to mark bad + * + * When an inode cannot be read due to a media or remote network + * failure this function makes the inode "bad" and causes I/O operations + * on it to fail from this point on. + */ + +void make_bad_inode(struct inode *inode) +{ + remove_inode_hash(inode); + + inode->i_mode = S_IFREG; + inode->i_atime = inode->i_mtime = inode->i_ctime = + current_fs_time(inode->i_sb); + inode->i_op = &bad_inode_ops; + inode->i_fop = &bad_file_ops; +} +EXPORT_SYMBOL(make_bad_inode); + +/* + * This tests whether an inode has been flagged as bad. The test uses + * &bad_inode_ops to cover the case of invalidated inodes as well as + * those created by make_bad_inode() above. + */ + +/** + * is_bad_inode - is an inode errored + * @inode: inode to test + * + * Returns true if the inode in question has been marked as bad. + */ + +int is_bad_inode(struct inode *inode) +{ + return (inode->i_op == &bad_inode_ops); +} + +EXPORT_SYMBOL(is_bad_inode); + +/** + * iget_failed - Mark an under-construction inode as dead and release it + * @inode: The inode to discard + * + * Mark an under-construction inode as dead and release it. + */ +void iget_failed(struct inode *inode) +{ + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); +} +EXPORT_SYMBOL(iget_failed); diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog new file mode 100644 index 00000000..75a461cf --- /dev/null +++ b/fs/befs/ChangeLog @@ -0,0 +1,417 @@ +Version 0.92 (2002-03-29) +========== +* Minor cleanup. Ran Lindent on the sources. + +Version 0.92 (2002-03-27) +========== +* Fixed module makefile problem. It was not compiling all the correct + source files! +* Removed duplicated function definition +* Fixed potential null pointer dereference when reporting an error + +Version 0.91 (2002-03-26) +========== +* Oy! Fixed stupid bug that would cause an unresolved symbol error. + Thanks to Laszlo Boszormenyi for pointing this out to me. + +Version 0.9 (2002-03-14) +========== +* Added Sergey S. Kostyliov's patch to eliminate memcpy() overhead + from b+tree operations. 
Changes the befs_read_datastream() interface. + +* Segregated the functions that interface directly with the linux vfs + interface into their own file called linuxvfs.c. [WD] + +Version 0.64 (2002-02-07) +========== +* Did the string comparison really right this time (btree.c) [WD] + +* Fixed up some places where I assumed that a long int could hold + a pointer value. (btree.c) [WD] + +* Andrew Farnham pointed out that the module + wouldn't work on older (<2.4.10) kernels due to an unresolved symbol. + This is bad, since 2.4.9 is still the current RedHat kernel. I added + a workaround for this problem (compatibility.h) [WD] + +* Sergey S. Kostyliov made befs_find_key() use a binary search to find + keys within btree nodes, rather than the linear search we were using + before. (btree.c) [Sergey S. Kostyliov ] + +* Made a debian package of the source for use with kernel-package. [WD] + + +Version 0.63 (2002-01-31) +========== +* Fixed bug in befs_find_brun_indirect() that would result in the wrong + block being read. It was introduced when adding byteswapping in + 0.61. (datastream.c) [WD] + +* Fixed a longstanding bug in befs_find_key() that would result in it + finding the first key that is a substring of the string it is searching + for. For example, this would cause files in the same directory with + names like file1 and file2 to mysteriously be duplicates of each other + (because they have the same inode number). Many thanks to Pavel Roskin + for reporting this serious bug!!! + (btree.c) [WD] + +* Added support for long symlinks, after Axel Dorfler explained up how + they work. I had forgotten all about them. (inode.c, symlink.c) [WD] + +* Documentation improvements in source. [WD] + +* Makefile fix for independent module when CONFIG_MODVERSION is set in + kernel config [Pavel Roskin ] + +* Compile warning fix for namei.c. [Sergey S. Kostyliov ] + + +Version 0.62 +========== +* Fixed makefile for module install [WD] + + +Version 0.61 (2002-01-20) +========== +* Made functions in endian.h to do the correct byteswapping, no matter + the arch. [WD] + +* Abbandoned silly checks for a NULL superblock pointer in debug.c. [WD] + +* Misc code cleanups. Also cleanup of this changelog file. [WD] + +* Added byteswapping to all metadata reads from disk. + Uses the functions from endian.h [WD] + +* Remove the typedef of struct super_block to vfs_sb, as it offended + certain peoples' aesthetic sense. [WD] + +* Ditto with the befs_read_block() interface. [WD] + + +Version 0.6 (2001-12-15) +========== +* Cleanup of NLS functions (util.c) [WD] + +* Make directory lookup/read use the NLS if an iocharset is provided. [WD] + +* Fixed stupid bug where specifying the uid or gid mount options as '0' + would result in the filesystem using the on-disk uid and gid. [WD] + +* Added mount option to control debug printing. + The option is, simply enough, 'debug'. + (super.c, debug.c) [WD] + +* Removed notion of btree handle from btree.c. It was unnecessary, as the + linux VFS doesn't allow us to keep any state between calls. Updated + dir.c, namei.c befs_fs.h to account for it. [WD] + +* Improved handleing of overflow nodes when listing directories. + Now works for overflow nodes hanging off of nodes other than the root + node. This is the cleaner solution to Brent Miszalaski's problem. [WD] + +* Added new debug/warning/error print functions in debug.c. + More flexible. Will soon be controllable at mount time + (see TODO). [WD] + +* Rewrote datastream position lookups. 
+ (datastream.c) [WD] + +* Moved the TODO list to its own file. + + +Version 0.50 (2001-11-13) +========== +* Added workaround for mis-understanding of the nature of the b+trees used + in directories. A cleaner solution will come after I've thought about it + for a while. Thanks to Brent Miszalaski for finding and reporting this bug. + (btree.c) [WD] + +* Minor cleanups + +* Added test for "impossible" condition of empty internal nodes in + seekleaf() in btree.c [WD] + +* Implemented the abstracted read_block() in io.c [WD] + +* Cleaned up the inode validation in inode.c [WD] + +* Anton Altaparmakov figured out (by asking Linus :) ) what was causing the + hanging disk io problem. It turns out you need to have the sync_pages + callback defined in your address_space_ops, even if it just uses the + default linux-supplied implementation. Fixed. Works now. + (file.c) [WD] + +* Anton Altaparmakov and Christoph Hellwig alerted me to the fact that + filesystem code should be using GFP_NOFS instead of GFP_KERNEL as the + priority parameter to kmalloc(). Fixed. + (datastream.c, btree.c super.c inode.c) [WD] + +* Anton also told me that the blocksize is not allowed to be larger than + the page size in linux, which is 4k i386. Oops. Added a test for + (blocksize > PAGE_SIZE), and refuse to mount in that case. What this + practically means is that 8k blocksize volumes won't work without a major + restructuring of the driver (or an alpha or other 64bit hardware). [WD] + +* Cleaned up the befs_count_blocks() function. Much smarter now. + And somewhat smaller too. [WD] + +* Made inode allocations use a slab cache + (super.c inode.c) [WD] + +* Moved the freeing of the private inode section from put_inode() to + clear_inode(). This fixes a potential free twice type bug. Put_inode() + can be called multiple times for each inode struct. [WD] + +* Converted all non vfs-callback functions to use befs_sb_info as the + superblock type, rather than struct super_block. This is for + portablity. [WD] + +* Fixed a couple of compile warnings due to use of malloc.h, when slab.h + is the new way. (inode.c, super.c) [WD] + +* Fixed erronous includes of linux/befs_fs_i.h and linux/befs_fs_sb.h + in inode.c [WD] + +Version 0.45 (2001-10-29) +========== +* Added functions to get the private superblock and inode structures from + their enclosing public structures. Switched all references to the + private portions to use them. (many files) [WD] + +* Made read_super and read_inode allocate the private portions of those + structures into the generic pointer fields of the public structures + with kmalloc(). put_super and put_inode free them. This allows us not + to have to touch the definitions of the public structures in + include/linux/fs.h. Also, befs_inode_info is huge (because of the + symlink string). (super.c, inode.c, befs_fs.h) [WD] + +* Fixed a thinko that was corrupting file reads after the first block_run + is done being read. (datastream.c) [WD] + +* Removed fsync() hooks, since a read-only filesystem doesn't need them. + [Christoph Hellwig]. + +* Fixed befs_readlink() (symlink.c) [Christoph Hellwig]. + +* Removed all the Read-Write stuff. I'll redo it when it is time to add + write support (various files) [WD]. + +* Removed prototypes for functions who's definitions have been removed + (befs_fs.h) [WD]. + + +Version 0.4 (2001-10-28) +========== +* Made it an option to use the old non-pagecache befs_file_read() for + testing purposes. 
(fs/Config.in) + +* Fixed unused variable warnings when compiling without debugging. + +* Fixed a bug where the inode and super_block didn't get their blockbits + fields set (inode.c and super.c). + +* Release patch version 11. AKA befs-driver version 0.4. + +* Thats right. New versioning scheme. + I've done some serious testing on it now (on my box anyhow), and it + seems stable and not outragously slow. Existing features are more-or-less + correct (see TODO list). But it isn't 1.0 yet. I think 0.4 gives me some + headroom before the big 1.0. + + +2001-10-26 +========== +* Fixed date format in this file. Was I smoking crack? + +* Removed old datastream code from file.c, since it is nolonger used. + +* Generic_read_file() is now used to read regular file data. + It doesn't chew up the buffer cache (it does page io instead), and seems + to be about as fast (even though it has to look up each file block + indivdualy). And it knows about doing readahead, which is a major plus. + So it does i/o in much larger chunks. It is the correct linux way. It + uses befs_get_block() by way of befs_readpage() to find the disk offsets + of blocks, which in turn calls befs_fpos2brun() in datastream.c to do + the hard work of finding the disk block number. + +* Changed method of checking for a dirty filesystem in befs_read_super + (super.c). Now we check to see if log_start and log_end differ. If so, + the journal needs to be replayed, and the filesystem cannot be mounted. + +* Fixed an extra instance of MOD_DEC_USE_COUNT in super.c + +* Fixed a problem with reading the superblock on devices with large sector + sizes (such as cdroms) on linux 2.4.10 and up. + +2001-10-24 +========== +* Fix nasty bug in converting block numbers to struct befs_inode_addr. + Subtle, because the old version was only sometimes wrong. + Probably responsible for lots of problems. (inode.c) + +* Fix bug with reading an empty directory. (btree.c and dir.c) + +* This one looks good. Release patch version 10 + +2001-10-23 +========== +* Added btree searching function. + +* Use befs_btree_find in befs_lookup (namei.c) + +* Additional comments in btree.c + +2001-10-22 +========== +* Added B+tree reading functions (in btree.c). + Made befs_readdir() use them them instead of the cruft in index.c. + +2001-09-11 +========== +* Converted befs_read_file() to use the new datastream code. + +* Finally updated the README file. + +* Added many comments. + +* Posted version 6 + +* Removed byte-order conversion code. + I have no intention of supporting it, and it was very ugly. + Flow control with #ifdef (ugh). Maybe I'll redo it once + native byteorder works 100%. + +2001-09-10 +========== +* Finished implementing read_datastream() + +* made befs_read_brun() more general + Supports an offset to start at and a max bytes to read + Added a wrapper function to give the old call + +2001-09-30 +========== +* Discovered that the datastream handleing code in file.c is quite deficient + in several respects. For one thing, it doesn't deal with indirect blocks + +* Rewrote datastream handleing. + +* Created io.c, for io related functions. + Previously, the befs_bread() funtions lived in file.c + Created the befs_read_brun() function. + + +2001-09-07 +========== +* Made a function to actually count the number of fs blocks used by a file. + And helper functions. + (fs/befs/inode.c) + +2001-09-05 +========== +* Fixed a misunderstanding of the inode fields. + This fixed the problmem with wrong file sizes from du and others. 
+ The i_blocks field of the inode struct is not the number of blocks for the + inode, it is the number of blocks for the file. Also, i_blksize is not + necessarily the size of the inode, although in practice it works out. + Changed to blocksize of filesystem. + (fs/befs/inode.c) + +* Permanently removed code that had been provisionally ifdefed out of befs_fs.h + +* Since we don't support access time, make that field zero, instead of + copying m_time. + (fs/befs/inode.c) + +* Added sanity check for inode reading + Make sure inode we got was the one we asked for. + (fs/befs/inode.c) + +* Code cleanup + Local pointers to commonly used structures in inode.c. + Got rid of abominations befs_iaddr2inode() and befs_inode2ino(). + Replaced with single function iaddr2blockno(). + (fs/befs/super.c) (fs/befs/inode.c) + +2001-09-01 +========== +* Fixed the problem with statfs where it would always claim the disk was + half full, due to improper understanding of the statfs fields. + (fs/befs/super.c) + +* Posted verion 4 of the patch + +2001-09-01 +========== +* Changed the macros in befs_fs.h to inline functions. + More readable. Typesafe. Better + (include/linux/befs_fs.h) + +* Moved type definitions from befs_fs.h to a new file, befs_fs_types.h + Because befs_fs_i.h and befs_fs_sb.h were including befs_fs.h for the + typedefs, and they are inlcuded in , which has definitions + that I want the inline functions in befs_fs.h to be able to see. Nasty + circularity. + (include/linux/befs_fs.h) + +2001-08-30 +========== +* Cleaned up some wording. + +* Added additional consitency checks on mount + Check block_size agrees with block_shift + Check flags == BEFS_CLEAN + (fs/befs/super.c) + +* Tell the kernel to only mount befs read-only. + By setting the MS_RDONLY flag in befs_read_super(). + Not that it was possible to write before. But now the kernel won't even try. + (fs/befs/super.c) + +* Got rid of kernel warning on mount. + The kernel doesn't like it if you call set_blocksize() on a device when + you have some of its blocks open. Moved the second set_blocksize() to the + very end of befs_read_super(), after we are done with the disk superblock. + (fs/befs/super.c) + +* Fixed wrong number of args bug in befs_dump_inode + (fs/befs/debug.c) + +* Solved lots of type mismatches in kprint()s + (everwhere) + +2001-08-27 +========== +* Cleaned up the fs/Config.in entries a bit, now slightly more descriptive. + +* BeFS depends on NLS, so I made activating BeFS enable the NLS questions + (fs/nls/Config.in) + +* Added Configure.help entries for CONFIG_BEFS_FS and CONFIG_DEBUG_BEFS + (Documentation/Configure.help) + +2001-08-?? +========== +* Removed superblock locking calls in befs_read_super(). In 2.4, the VFS + hands us a super_block struct that is already locked. + +2001-08-13 +========== +* Will Dyson is now attempting to maintain this module + Makoto Kato is original author.Daniel Berlin + also did some work on it (fixing it up for the later 2.3.x kernels, IIRC). 
+ +* Fixed compile errors on 2.4.1 kernel (WD) + Resolve rejected patches + Accommodate changed NLS interface (util.h) + Needed to include in most files + Makefile changes + fs/Config.in changes + +* Tried to niceify the code using the ext2 fs as a guide + Declare befs_fs_type using the DECLARE_FSTYPE_DEV() macro + +* Made it a configure option to turn on debugging (fs/Config.in) + +* Compiles on 2.4.7 diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig new file mode 100644 index 00000000..7835d30f --- /dev/null +++ b/fs/befs/Kconfig @@ -0,0 +1,26 @@ +config BEFS_FS + tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select NLS + help + The BeOS File System (BeFS) is the native file system of Be, Inc's + BeOS. Notable features include support for arbitrary attributes + on files and directories, and database-like indices on selected + attributes. (Also note that this driver doesn't make those features + available at this time). It is a 64 bit filesystem, so it supports + extremely large volumes and files. + + If you use this filesystem, you should also say Y to at least one + of the NLS (native language support) options below. + + If you don't know what this is about, say N. + + To compile this as a module, choose M here: the module will be + called befs. + +config BEFS_DEBUG + bool "Debug BeFS" + depends on BEFS_FS + help + If you say Y here, you can use the 'debug' mount option to enable + debugging output from the driver. diff --git a/fs/befs/Makefile b/fs/befs/Makefile new file mode 100644 index 00000000..2f370bd7 --- /dev/null +++ b/fs/befs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux BeOS filesystem routines. +# + +obj-$(CONFIG_BEFS_FS) += befs.o + +befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o diff --git a/fs/befs/TODO b/fs/befs/TODO new file mode 100644 index 00000000..3250921a --- /dev/null +++ b/fs/befs/TODO @@ -0,0 +1,14 @@ +TODO +========== + +* Convert comments to the Kernel-Doc format. + +* Befs_fs.h has gotten big and messy. No reason not to break it up into + smaller peices. + +* See if Alexander Viro's option parser made it into the kernel tree. + Use that if we can. (include/linux/parser.h) + +* See if we really need separate types for on-disk and in-memory + representations of the superblock and inode. 
+ diff --git a/fs/befs/befs.h b/fs/befs/befs.h new file mode 100644 index 00000000..d9a40abd --- /dev/null +++ b/fs/befs/befs.h @@ -0,0 +1,156 @@ +/* + * befs.h + * + * Copyright (C) 2001-2002 Will Dyson + * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) + */ + +#ifndef _LINUX_BEFS_H +#define _LINUX_BEFS_H + +#include "befs_fs_types.h" + +/* used in debug.c */ +#define BEFS_VERSION "0.9.3" + + +typedef u64 befs_blocknr_t; +/* + * BeFS in memory structures + */ + +typedef struct befs_mount_options { + gid_t gid; + uid_t uid; + int use_gid; + int use_uid; + int debug; + char *iocharset; +} befs_mount_options; + +typedef struct befs_sb_info { + u32 magic1; + u32 block_size; + u32 block_shift; + int byte_order; + befs_off_t num_blocks; + befs_off_t used_blocks; + u32 inode_size; + u32 magic2; + + /* Allocation group information */ + u32 blocks_per_ag; + u32 ag_shift; + u32 num_ags; + + /* jornal log entry */ + befs_block_run log_blocks; + befs_off_t log_start; + befs_off_t log_end; + + befs_inode_addr root_dir; + befs_inode_addr indices; + u32 magic3; + + befs_mount_options mount_opts; + struct nls_table *nls; + +} befs_sb_info; + +typedef struct befs_inode_info { + u32 i_flags; + u32 i_type; + + befs_inode_addr i_inode_num; + befs_inode_addr i_parent; + befs_inode_addr i_attribute; + + union { + befs_data_stream ds; + char symlink[BEFS_SYMLINK_LEN]; + } i_data; + + struct inode vfs_inode; + +} befs_inode_info; + +enum befs_err { + BEFS_OK, + BEFS_ERR, + BEFS_BAD_INODE, + BEFS_BT_END, + BEFS_BT_EMPTY, + BEFS_BT_MATCH, + BEFS_BT_PARMATCH, + BEFS_BT_NOT_FOUND +}; + + +/****************************/ +/* debug.c */ +void befs_error(const struct super_block *sb, const char *fmt, ...); +void befs_warning(const struct super_block *sb, const char *fmt, ...); +void befs_debug(const struct super_block *sb, const char *fmt, ...); + +void befs_dump_super_block(const struct super_block *sb, befs_super_block *); +void befs_dump_inode(const struct super_block *sb, befs_inode *); +void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *); +void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *); +/****************************/ + + +/* Gets a pointer to the private portion of the super_block + * structure from the public part + */ +static inline befs_sb_info * +BEFS_SB(const struct super_block *super) +{ + return (befs_sb_info *) super->s_fs_info; +} + +static inline befs_inode_info * +BEFS_I(const struct inode *inode) +{ + return list_entry(inode, struct befs_inode_info, vfs_inode); +} + +static inline befs_blocknr_t +iaddr2blockno(struct super_block *sb, befs_inode_addr * iaddr) +{ + return ((iaddr->allocation_group << BEFS_SB(sb)->ag_shift) + + iaddr->start); +} + +static inline befs_inode_addr +blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno) +{ + befs_inode_addr iaddr; + iaddr.allocation_group = blockno >> BEFS_SB(sb)->ag_shift; + iaddr.start = + blockno - (iaddr.allocation_group << BEFS_SB(sb)->ag_shift); + iaddr.len = 1; + + return iaddr; +} + +static inline unsigned int +befs_iaddrs_per_block(struct super_block *sb) +{ + return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr); +} + +static inline int +befs_iaddr_is_empty(befs_inode_addr * iaddr) +{ + return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len); +} + +static inline size_t +befs_brun_size(struct super_block *sb, befs_block_run run) +{ + return BEFS_SB(sb)->block_size * run.len; +} + +#include "endian.h" + +#endif /* _LINUX_BEFS_H */ diff --git 
a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h new file mode 100644 index 00000000..eb557d9d --- /dev/null +++ b/fs/befs/befs_fs_types.h @@ -0,0 +1,255 @@ +/* + * fs/befs/befs_fs_types.h + * + * Copyright (C) 2001 Will Dyson (will@cs.earlham.edu) + * + * + * + * from linux/include/linux/befs_fs.h + * + * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) + * + */ + +#ifndef _LINUX_BEFS_FS_TYPES +#define _LINUX_BEFS_FS_TYPES + +#ifdef __KERNEL__ +#include +#endif /*__KERNEL__*/ + +#define PACKED __attribute__ ((__packed__)) + +/* + * Max name lengths of BFS + */ + +#define BEFS_NAME_LEN 255 + +#define BEFS_SYMLINK_LEN 144 +#define BEFS_NUM_DIRECT_BLOCKS 12 +#define B_OS_NAME_LENGTH 32 + +/* The datastream blocks mapped by the double-indirect + * block are always 4 fs blocks long. + * This eliminates the need for linear searches among + * the potentially huge number of indirect blocks + * + * Err. Should that be 4 fs blocks or 4k??? + * It matters on large blocksize volumes + */ +#define BEFS_DBLINDIR_BRUN_LEN 4 + +/* + * Flags of superblock + */ + +enum super_flags { + BEFS_BYTESEX_BE, + BEFS_BYTESEX_LE, + BEFS_CLEAN = 0x434c454e, + BEFS_DIRTY = 0x44495254, + BEFS_SUPER_MAGIC1 = 0x42465331, /* BFS1 */ + BEFS_SUPER_MAGIC2 = 0xdd121031, + BEFS_SUPER_MAGIC3 = 0x15b6830e, +}; + +#define BEFS_BYTEORDER_NATIVE 0x42494745 +#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE) +#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE) + +#define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1 +#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1) +#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1) + +/* + * Flags of inode + */ + +#define BEFS_INODE_MAGIC1 0x3bbe0ad9 + +enum inode_flags { + BEFS_INODE_IN_USE = 0x00000001, + BEFS_ATTR_INODE = 0x00000004, + BEFS_INODE_LOGGED = 0x00000008, + BEFS_INODE_DELETED = 0x00000010, + BEFS_LONG_SYMLINK = 0x00000040, + BEFS_PERMANENT_FLAG = 0x0000ffff, + BEFS_INODE_NO_CREATE = 0x00010000, + BEFS_INODE_WAS_WRITTEN = 0x00020000, + BEFS_NO_TRANSACTION = 0x00040000, +}; +/* + * On-Disk datastructures of BeFS + */ + +typedef u64 __bitwise fs64; +typedef u32 __bitwise fs32; +typedef u16 __bitwise fs16; + +typedef u64 befs_off_t; +typedef fs64 befs_time_t; + +/* Block runs */ +typedef struct { + fs32 allocation_group; + fs16 start; + fs16 len; +} PACKED befs_disk_block_run; + +typedef struct { + u32 allocation_group; + u16 start; + u16 len; +} PACKED befs_block_run; + +typedef befs_disk_block_run befs_disk_inode_addr; +typedef befs_block_run befs_inode_addr; + +/* + * The Superblock Structure + */ +typedef struct { + char name[B_OS_NAME_LENGTH]; + fs32 magic1; + fs32 fs_byte_order; + + fs32 block_size; + fs32 block_shift; + + fs64 num_blocks; + fs64 used_blocks; + + fs32 inode_size; + + fs32 magic2; + fs32 blocks_per_ag; + fs32 ag_shift; + fs32 num_ags; + + fs32 flags; + + befs_disk_block_run log_blocks; + fs64 log_start; + fs64 log_end; + + fs32 magic3; + befs_disk_inode_addr root_dir; + befs_disk_inode_addr indices; + +} PACKED befs_super_block; + +/* + * Note: the indirect and dbl_indir block_runs may + * be longer than one block! 
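 *
 * (A block_run names a contiguous extent of file-system blocks: for
 * example, with ag_shift == 11 the run { allocation_group = 3, start = 10,
 * len = 4 } covers the four consecutive blocks starting at
 * (3 << 11) + 10 = 6154, exactly as iaddr2blockno() in befs.h computes.)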
+ */ +typedef struct { + befs_disk_block_run direct[BEFS_NUM_DIRECT_BLOCKS]; + fs64 max_direct_range; + befs_disk_block_run indirect; + fs64 max_indirect_range; + befs_disk_block_run double_indirect; + fs64 max_double_indirect_range; + fs64 size; +} PACKED befs_disk_data_stream; + +typedef struct { + befs_block_run direct[BEFS_NUM_DIRECT_BLOCKS]; + befs_off_t max_direct_range; + befs_block_run indirect; + befs_off_t max_indirect_range; + befs_block_run double_indirect; + befs_off_t max_double_indirect_range; + befs_off_t size; +} PACKED befs_data_stream; + +/* Attribute */ +typedef struct { + fs32 type; + fs16 name_size; + fs16 data_size; + char name[1]; +} PACKED befs_small_data; + +/* Inode structure */ +typedef struct { + fs32 magic1; + befs_disk_inode_addr inode_num; + fs32 uid; + fs32 gid; + fs32 mode; + fs32 flags; + befs_time_t create_time; + befs_time_t last_modified_time; + befs_disk_inode_addr parent; + befs_disk_inode_addr attributes; + fs32 type; + + fs32 inode_size; + fs32 etc; /* not use */ + + union { + befs_disk_data_stream datastream; + char symlink[BEFS_SYMLINK_LEN]; + } data; + + fs32 pad[4]; /* not use */ + befs_small_data small_data[1]; +} PACKED befs_inode; + +/* + * B+tree superblock + */ + +#define BEFS_BTREE_MAGIC 0x69f6c2e8 + +enum btree_types { + BTREE_STRING_TYPE = 0, + BTREE_INT32_TYPE = 1, + BTREE_UINT32_TYPE = 2, + BTREE_INT64_TYPE = 3, + BTREE_UINT64_TYPE = 4, + BTREE_FLOAT_TYPE = 5, + BTREE_DOUBLE_TYPE = 6 +}; + +typedef struct { + fs32 magic; + fs32 node_size; + fs32 max_depth; + fs32 data_type; + fs64 root_node_ptr; + fs64 free_node_ptr; + fs64 max_size; +} PACKED befs_disk_btree_super; + +typedef struct { + u32 magic; + u32 node_size; + u32 max_depth; + u32 data_type; + befs_off_t root_node_ptr; + befs_off_t free_node_ptr; + befs_off_t max_size; +} PACKED befs_btree_super; + +/* + * Header structure of each btree node + */ +typedef struct { + fs64 left; + fs64 right; + fs64 overflow; + fs16 all_key_count; + fs16 all_key_length; +} PACKED befs_btree_nodehead; + +typedef struct { + befs_off_t left; + befs_off_t right; + befs_off_t overflow; + u16 all_key_count; + u16 all_key_length; +} PACKED befs_host_btree_nodehead; + +#endif /* _LINUX_BEFS_FS_TYPES */ diff --git a/fs/befs/btree.c b/fs/befs/btree.c new file mode 100644 index 00000000..a66c9b11 --- /dev/null +++ b/fs/befs/btree.c @@ -0,0 +1,787 @@ +/* + * linux/fs/befs/btree.c + * + * Copyright (C) 2001-2002 Will Dyson + * + * Licensed under the GNU GPL. See the file COPYING for details. + * + * 2002-02-05: Sergey S. Kostyliov added binary search within + * btree nodes. + * + * Many thanks to: + * + * Dominic Giampaolo, author of "Practical File System + * Design with the Be File System", for such a helpful book. + * + * Marcus J. Ranum, author of the b+tree package in + * comp.sources.misc volume 10. This code is not copied from that + * work, but it is partially based on it. + * + * Makoto Kato, author of the original BeFS for linux filesystem + * driver. + */ + +#include +#include +#include +#include +#include + +#include "befs.h" +#include "btree.h" +#include "datastream.h" + +/* + * The btree functions in this file are built on top of the + * datastream.c interface, which is in turn built on top of the + * io.c interface. + */ + +/* Befs B+tree structure: + * + * The first thing in the tree is the tree superblock. It tells you + * all kinds of useful things about the tree, like where the rootnode + * is located, and the size of the nodes (always 1024 with current version + * of BeOS). 
+ * + * The rest of the tree consists of a series of nodes. Nodes contain a header + * (struct befs_btree_nodehead), the packed key data, an array of shorts + * containing the ending offsets for each of the keys, and an array of + * befs_off_t values. In interior nodes, the keys are the ending keys for + * the childnode they point to, and the values are offsets into the + * datastream containing the tree. + */ + +/* Note: + * + * The book states 2 confusing things about befs b+trees. First, + * it states that the overflow field of node headers is used by internal nodes + * to point to another node that "effectively continues this one". Here is what + * I believe that means. Each key in internal nodes points to another node that + * contains key values less than itself. Inspection reveals that the last key + * in the internal node is not the last key in the index. Keys that are + * greater than the last key in the internal node go into the overflow node. + * I imagine there is a performance reason for this. + * + * Second, it states that the header of a btree node is sufficient to + * distinguish internal nodes from leaf nodes. Without saying exactly how. + * After figuring out the first, it becomes obvious that internal nodes have + * overflow nodes and leafnodes do not. + */ + +/* + * Currently, this code is only good for directory B+trees. + * In order to be used for other BFS indexes, it needs to be extended to handle + * duplicate keys and non-string keytypes (int32, int64, float, double). + */ + +/* + * In memory structure of each btree node + */ +typedef struct { + befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */ + struct buffer_head *bh; + befs_btree_nodehead *od_node; /* on disk node */ +} befs_btree_node; + +/* local constants */ +static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; + +/* local functions */ +static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * bt_super, + befs_btree_node * this_node, + befs_off_t * node_off); + +static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * sup); + +static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, + befs_btree_node * node, befs_off_t node_off); + +static int befs_leafnode(befs_btree_node * node); + +static fs16 *befs_bt_keylen_index(befs_btree_node * node); + +static fs64 *befs_bt_valarray(befs_btree_node * node); + +static char *befs_bt_keydata(befs_btree_node * node); + +static int befs_find_key(struct super_block *sb, befs_btree_node * node, + const char *findkey, befs_off_t * value); + +static char *befs_bt_get_key(struct super_block *sb, befs_btree_node * node, + int index, u16 * keylen); + +static int befs_compare_strings(const void *key1, int keylen1, + const void *key2, int keylen2); + +/** + * befs_bt_read_super - read in btree superblock convert to cpu byteorder + * @sb: Filesystem superblock + * @ds: Datastream to read from + * @sup: Buffer in which to place the btree superblock + * + * Calls befs_read_datastream to read in the btree superblock and + * makes sure it is in cpu byteorder, byteswapping if necessary. + * + * On success, returns BEFS_OK and *@sup contains the btree superblock, + * in cpu byte order. + * + * On failure, BEFS_ERR is returned. 
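 *
 * Typical use, as in befs_btree_find() below:
 *
 *	befs_btree_super bt_super;
 *
 *	if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK)
 *		return BEFS_ERR;
 *	node_off = bt_super.root_node_ptr;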
+ */ +static int +befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * sup) +{ + struct buffer_head *bh = NULL; + befs_disk_btree_super *od_sup = NULL; + + befs_debug(sb, "---> befs_btree_read_super()"); + + bh = befs_read_datastream(sb, ds, 0, NULL); + + if (!bh) { + befs_error(sb, "Couldn't read index header."); + goto error; + } + od_sup = (befs_disk_btree_super *) bh->b_data; + befs_dump_index_entry(sb, od_sup); + + sup->magic = fs32_to_cpu(sb, od_sup->magic); + sup->node_size = fs32_to_cpu(sb, od_sup->node_size); + sup->max_depth = fs32_to_cpu(sb, od_sup->max_depth); + sup->data_type = fs32_to_cpu(sb, od_sup->data_type); + sup->root_node_ptr = fs64_to_cpu(sb, od_sup->root_node_ptr); + sup->free_node_ptr = fs64_to_cpu(sb, od_sup->free_node_ptr); + sup->max_size = fs64_to_cpu(sb, od_sup->max_size); + + brelse(bh); + if (sup->magic != BEFS_BTREE_MAGIC) { + befs_error(sb, "Index header has bad magic."); + goto error; + } + + befs_debug(sb, "<--- befs_btree_read_super()"); + return BEFS_OK; + + error: + befs_debug(sb, "<--- befs_btree_read_super() ERROR"); + return BEFS_ERR; +} + +/** + * befs_bt_read_node - read in btree node and convert to cpu byteorder + * @sb: Filesystem superblock + * @ds: Datastream to read from + * @node: Buffer in which to place the btree node + * @node_off: Starting offset (in bytes) of the node in @ds + * + * Calls befs_read_datastream to read in the indicated btree node and + * makes sure its header fields are in cpu byteorder, byteswapping if + * necessary. + * Note: node->bh must be NULL when this function called first + * time. Don't forget brelse(node->bh) after last call. + * + * On success, returns BEFS_OK and *@node contains the btree node that + * starts at @node_off, with the node->head fields in cpu byte order. + * + * On failure, BEFS_ERR is returned. + */ + +static int +befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, + befs_btree_node * node, befs_off_t node_off) +{ + uint off = 0; + + befs_debug(sb, "---> befs_bt_read_node()"); + + if (node->bh) + brelse(node->bh); + + node->bh = befs_read_datastream(sb, ds, node_off, &off); + if (!node->bh) { + befs_error(sb, "befs_bt_read_node() failed to read " + "node at %Lu", node_off); + befs_debug(sb, "<--- befs_bt_read_node() ERROR"); + + return BEFS_ERR; + } + node->od_node = + (befs_btree_nodehead *) ((void *) node->bh->b_data + off); + + befs_dump_index_node(sb, node->od_node); + + node->head.left = fs64_to_cpu(sb, node->od_node->left); + node->head.right = fs64_to_cpu(sb, node->od_node->right); + node->head.overflow = fs64_to_cpu(sb, node->od_node->overflow); + node->head.all_key_count = + fs16_to_cpu(sb, node->od_node->all_key_count); + node->head.all_key_length = + fs16_to_cpu(sb, node->od_node->all_key_length); + + befs_debug(sb, "<--- befs_btree_read_node()"); + return BEFS_OK; +} + +/** + * befs_btree_find - Find a key in a befs B+tree + * @sb: Filesystem superblock + * @ds: Datastream containing btree + * @key: Key string to lookup in btree + * @value: Value stored with @key + * + * On success, returns BEFS_OK and sets *@value to the value stored + * with @key (usually the disk block number of an inode). + * + * On failure, returns BEFS_ERR or BEFS_BT_NOT_FOUND. + * + * Algorithm: + * Read the superblock and rootnode of the b+tree. + * Drill down through the interior nodes using befs_find_key(). + * Once at the correct leaf node, use befs_find_key() again to get the + * actuall value stored with the key. 
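 *
 * For example, a name lookup in a directory amounts to:
 *
 *	befs_off_t inode_blk;
 *
 *	if (befs_btree_find(sb, &BEFS_I(dir)->i_data.ds, "hello.txt",
 *			    &inode_blk) == BEFS_OK)
 *		... inode_blk is the fs block holding the entry's inode ...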
+ */ +int +befs_btree_find(struct super_block *sb, befs_data_stream * ds, + const char *key, befs_off_t * value) +{ + befs_btree_node *this_node = NULL; + befs_btree_super bt_super; + befs_off_t node_off; + int res; + + befs_debug(sb, "---> befs_btree_find() Key: %s", key); + + if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { + befs_error(sb, + "befs_btree_find() failed to read index superblock"); + goto error; + } + + this_node = kmalloc(sizeof (befs_btree_node), + GFP_NOFS); + if (!this_node) { + befs_error(sb, "befs_btree_find() failed to allocate %u " + "bytes of memory", sizeof (befs_btree_node)); + goto error; + } + + this_node->bh = NULL; + + /* read in root node */ + node_off = bt_super.root_node_ptr; + if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_find() failed to read " + "node at %Lu", node_off); + goto error_alloc; + } + + while (!befs_leafnode(this_node)) { + res = befs_find_key(sb, this_node, key, &node_off); + if (res == BEFS_BT_NOT_FOUND) + node_off = this_node->head.overflow; + /* if no match, go to overflow node */ + if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_find() failed to read " + "node at %Lu", node_off); + goto error_alloc; + } + } + + /* at the correct leaf node now */ + + res = befs_find_key(sb, this_node, key, value); + + brelse(this_node->bh); + kfree(this_node); + + if (res != BEFS_BT_MATCH) { + befs_debug(sb, "<--- befs_btree_find() Key %s not found", key); + *value = 0; + return BEFS_BT_NOT_FOUND; + } + befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu", + key, *value); + return BEFS_OK; + + error_alloc: + kfree(this_node); + error: + *value = 0; + befs_debug(sb, "<--- befs_btree_find() ERROR"); + return BEFS_ERR; +} + +/** + * befs_find_key - Search for a key within a node + * @sb: Filesystem superblock + * @node: Node to find the key within + * @key: Keystring to search for + * @value: If key is found, the value stored with the key is put here + * + * finds exact match if one exists, and returns BEFS_BT_MATCH + * If no exact match, finds first key in node that is greater + * (alphabetically) than the search key and returns BEFS_BT_PARMATCH + * (for partial match, I guess). Can you think of something better to + * call it? + * + * If no key was a match or greater than the search key, return + * BEFS_BT_NOT_FOUND. + * + * Use binary search instead of a linear. 
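 *
 * For example, if a node holds the sorted keys { "bar", "foo", "quux" }
 * and the search key is "baz", there is no exact match; the first key
 * greater than "baz" is "foo", so *value is set to the value stored with
 * "foo" and BEFS_BT_PARMATCH is returned. In an interior node that value
 * is the offset of the child node to descend into.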
+ */ +static int +befs_find_key(struct super_block *sb, befs_btree_node * node, + const char *findkey, befs_off_t * value) +{ + int first, last, mid; + int eq; + u16 keylen; + int findkey_len; + char *thiskey; + fs64 *valarray; + + befs_debug(sb, "---> befs_find_key() %s", findkey); + + *value = 0; + + findkey_len = strlen(findkey); + + /* if node can not contain key, just skeep this node */ + last = node->head.all_key_count - 1; + thiskey = befs_bt_get_key(sb, node, last, &keylen); + + eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); + if (eq < 0) { + befs_debug(sb, "<--- befs_find_key() %s not found", findkey); + return BEFS_BT_NOT_FOUND; + } + + valarray = befs_bt_valarray(node); + + /* simple binary search */ + first = 0; + mid = 0; + while (last >= first) { + mid = (last + first) / 2; + befs_debug(sb, "first: %d, last: %d, mid: %d", first, last, + mid); + thiskey = befs_bt_get_key(sb, node, mid, &keylen); + eq = befs_compare_strings(thiskey, keylen, findkey, + findkey_len); + + if (eq == 0) { + befs_debug(sb, "<--- befs_find_key() found %s at %d", + thiskey, mid); + + *value = fs64_to_cpu(sb, valarray[mid]); + return BEFS_BT_MATCH; + } + if (eq > 0) + last = mid - 1; + else + first = mid + 1; + } + if (eq < 0) + *value = fs64_to_cpu(sb, valarray[mid + 1]); + else + *value = fs64_to_cpu(sb, valarray[mid]); + befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid); + return BEFS_BT_PARMATCH; +} + +/** + * befs_btree_read - Traverse leafnodes of a btree + * @sb: Filesystem superblock + * @ds: Datastream containing btree + * @key_no: Key number (alphabetical order) of key to read + * @bufsize: Size of the buffer to return key in + * @keybuf: Pointer to a buffer to put the key in + * @keysize: Length of the returned key + * @value: Value stored with the returned key + * + * Heres how it works: Key_no is the index of the key/value pair to + * return in keybuf/value. + * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is + * the number of charecters in the key (just a convenience). + * + * Algorithm: + * Get the first leafnode of the tree. See if the requested key is in that + * node. If not, follow the node->right link to the next leafnode. Repeat + * until the (key_no)th key is found or the tree is out of keys. 
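 *
 * A directory listing, for instance, can simply walk the keys in order
 * until the call stops returning BEFS_OK (BEFS_BT_END once the keys run
 * out, BEFS_BT_EMPTY for an empty tree):
 *
 *	char keybuf[BEFS_NAME_LEN + 1];
 *	size_t keysize;
 *	befs_off_t value;
 *	loff_t i;
 *
 *	for (i = 0; ; i++) {
 *		if (befs_btree_read(sb, ds, i, sizeof(keybuf), keybuf,
 *				    &keysize, &value) != BEFS_OK)
 *			break;
 *		... emit the entry keybuf / value ...
 *	}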
+ */ +int +befs_btree_read(struct super_block *sb, befs_data_stream * ds, + loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize, + befs_off_t * value) +{ + befs_btree_node *this_node; + befs_btree_super bt_super; + befs_off_t node_off = 0; + int cur_key; + fs64 *valarray; + char *keystart; + u16 keylen; + int res; + + uint key_sum = 0; + + befs_debug(sb, "---> befs_btree_read()"); + + if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { + befs_error(sb, + "befs_btree_read() failed to read index superblock"); + goto error; + } + + if ((this_node = (befs_btree_node *) + kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { + befs_error(sb, "befs_btree_read() failed to allocate %u " + "bytes of memory", sizeof (befs_btree_node)); + goto error; + } + + node_off = bt_super.root_node_ptr; + this_node->bh = NULL; + + /* seeks down to first leafnode, reads it into this_node */ + res = befs_btree_seekleaf(sb, ds, &bt_super, this_node, &node_off); + if (res == BEFS_BT_EMPTY) { + brelse(this_node->bh); + kfree(this_node); + *value = 0; + *keysize = 0; + befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY"); + return BEFS_BT_EMPTY; + } else if (res == BEFS_ERR) { + goto error_alloc; + } + + /* find the leaf node containing the key_no key */ + + while (key_sum + this_node->head.all_key_count <= key_no) { + + /* no more nodes to look in: key_no is too large */ + if (this_node->head.right == befs_bt_inval) { + *keysize = 0; + *value = 0; + befs_debug(sb, + "<--- befs_btree_read() END of keys at %Lu", + key_sum + this_node->head.all_key_count); + brelse(this_node->bh); + kfree(this_node); + return BEFS_BT_END; + } + + key_sum += this_node->head.all_key_count; + node_off = this_node->head.right; + + if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_read() failed to read " + "node at %Lu", node_off); + goto error_alloc; + } + } + + /* how many keys into this_node is key_no */ + cur_key = key_no - key_sum; + + /* get pointers to datastructures within the node body */ + valarray = befs_bt_valarray(this_node); + + keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen); + + befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen); + + if (bufsize < keylen + 1) { + befs_error(sb, "befs_btree_read() keybuf too small (%u) " + "for key of size %d", bufsize, keylen); + brelse(this_node->bh); + goto error_alloc; + }; + + strncpy(keybuf, keystart, keylen); + *value = fs64_to_cpu(sb, valarray[cur_key]); + *keysize = keylen; + keybuf[keylen] = '\0'; + + befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off, + cur_key, keylen, keybuf, *value); + + brelse(this_node->bh); + kfree(this_node); + + befs_debug(sb, "<--- befs_btree_read()"); + + return BEFS_OK; + + error_alloc: + kfree(this_node); + + error: + *keysize = 0; + *value = 0; + befs_debug(sb, "<--- befs_btree_read() ERROR"); + return BEFS_ERR; +} + +/** + * befs_btree_seekleaf - Find the first leafnode in the btree + * @sb: Filesystem superblock + * @ds: Datastream containing btree + * @bt_super: Pointer to the superblock of the btree + * @this_node: Buffer to return the leafnode in + * @node_off: Pointer to offset of current node within datastream. Modified + * by the function. + * + * + * Helper function for btree traverse. Moves the current position to the + * start of the first leaf node. + * + * Also checks for an empty tree. If there are no keys, returns BEFS_BT_EMPTY. 
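 *
 * (A freshly created index, for example, has a root node that is itself a
 * leaf with all_key_count == 0, so the function returns BEFS_BT_EMPTY
 * without descending any further.)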
+ */ +static int +befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, + befs_btree_super * bt_super, befs_btree_node * this_node, + befs_off_t * node_off) +{ + + befs_debug(sb, "---> befs_btree_seekleaf()"); + + if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_seekleaf() failed to read " + "node at %Lu", *node_off); + goto error; + } + befs_debug(sb, "Seekleaf to root node %Lu", *node_off); + + if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) { + befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY"); + return BEFS_BT_EMPTY; + } + + while (!befs_leafnode(this_node)) { + + if (this_node->head.all_key_count == 0) { + befs_debug(sb, "befs_btree_seekleaf() encountered " + "an empty interior node: %Lu. Using Overflow " + "node: %Lu", *node_off, + this_node->head.overflow); + *node_off = this_node->head.overflow; + } else { + fs64 *valarray = befs_bt_valarray(this_node); + *node_off = fs64_to_cpu(sb, valarray[0]); + } + if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { + befs_error(sb, "befs_btree_seekleaf() failed to read " + "node at %Lu", *node_off); + goto error; + } + + befs_debug(sb, "Seekleaf to child node %Lu", *node_off); + } + befs_debug(sb, "Node %Lu is a leaf node", *node_off); + + return BEFS_OK; + + error: + befs_debug(sb, "<--- befs_btree_seekleaf() ERROR"); + return BEFS_ERR; +} + +/** + * befs_leafnode - Determine if the btree node is a leaf node or an + * interior node + * @node: Pointer to node structure to test + * + * Return 1 if leaf, 0 if interior + */ +static int +befs_leafnode(befs_btree_node * node) +{ + /* all interior nodes (and only interior nodes) have an overflow node */ + if (node->head.overflow == befs_bt_inval) + return 1; + else + return 0; +} + +/** + * befs_bt_keylen_index - Finds start of keylen index in a node + * @node: Pointer to the node structure to find the keylen index within + * + * Returns a pointer to the start of the key length index array + * of the B+tree node *@node + * + * "The length of all the keys in the node is added to the size of the + * header and then rounded up to a multiple of four to get the beginning + * of the key length index" (p.88, practical filesystem design). + * + * Except that rounding up to 8 works, and rounding up to 4 doesn't. 
+ */ +static fs16 * +befs_bt_keylen_index(befs_btree_node * node) +{ + const int keylen_align = 8; + unsigned long int off = + (sizeof (befs_btree_nodehead) + node->head.all_key_length); + ulong tmp = off % keylen_align; + + if (tmp) + off += keylen_align - tmp; + + return (fs16 *) ((void *) node->od_node + off); +} + +/** + * befs_bt_valarray - Finds the start of value array in a node + * @node: Pointer to the node structure to find the value array within + * + * Returns a pointer to the start of the value array + * of the node pointed to by the node header + */ +static fs64 * +befs_bt_valarray(befs_btree_node * node) +{ + void *keylen_index_start = (void *) befs_bt_keylen_index(node); + size_t keylen_index_size = node->head.all_key_count * sizeof (fs16); + + return (fs64 *) (keylen_index_start + keylen_index_size); +} + +/** + * befs_bt_keydata - Finds start of keydata array in a node + * @node: Pointer to the node structure to find the keydata array within + * + * Returns a pointer to the start of the keydata array + * of the node pointed to by the node header + */ +static char * +befs_bt_keydata(befs_btree_node * node) +{ + return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead)); +} + +/** + * befs_bt_get_key - returns a pointer to the start of a key + * @sb: filesystem superblock + * @node: node in which to look for the key + * @index: the index of the key to get + * @keylen: modified to be the length of the key at @index + * + * Returns a valid pointer into @node on success. + * Returns NULL on failure (bad input) and sets *@keylen = 0 + */ +static char * +befs_bt_get_key(struct super_block *sb, befs_btree_node * node, + int index, u16 * keylen) +{ + int prev_key_end; + char *keystart; + fs16 *keylen_index; + + if (index < 0 || index > node->head.all_key_count) { + *keylen = 0; + return NULL; + } + + keystart = befs_bt_keydata(node); + keylen_index = befs_bt_keylen_index(node); + + if (index == 0) + prev_key_end = 0; + else + prev_key_end = fs16_to_cpu(sb, keylen_index[index - 1]); + + *keylen = fs16_to_cpu(sb, keylen_index[index]) - prev_key_end; + + return keystart + prev_key_end; +} + +/** + * befs_compare_strings - compare two strings + * @key1: pointer to the first key to be compared + * @keylen1: length in bytes of key1 + * @key2: pointer to the second key to be compared + * @kelen2: length in bytes of key2 + * + * Returns 0 if @key1 and @key2 are equal. + * Returns >0 if @key1 is greater. + * Returns <0 if @key2 is greater.. 
+ */ +static int +befs_compare_strings(const void *key1, int keylen1, + const void *key2, int keylen2) +{ + int len = min_t(int, keylen1, keylen2); + int result = strncmp(key1, key2, len); + if (result == 0) + result = keylen1 - keylen2; + return result; +} + +/* These will be used for non-string keyed btrees */ +#if 0 +static int +btree_compare_int32(cont void *key1, int keylen1, const void *key2, int keylen2) +{ + return *(int32_t *) key1 - *(int32_t *) key2; +} + +static int +btree_compare_uint32(cont void *key1, int keylen1, + const void *key2, int keylen2) +{ + if (*(u_int32_t *) key1 == *(u_int32_t *) key2) + return 0; + else if (*(u_int32_t *) key1 > *(u_int32_t *) key2) + return 1; + + return -1; +} +static int +btree_compare_int64(cont void *key1, int keylen1, const void *key2, int keylen2) +{ + if (*(int64_t *) key1 == *(int64_t *) key2) + return 0; + else if (*(int64_t *) key1 > *(int64_t *) key2) + return 1; + + return -1; +} + +static int +btree_compare_uint64(cont void *key1, int keylen1, + const void *key2, int keylen2) +{ + if (*(u_int64_t *) key1 == *(u_int64_t *) key2) + return 0; + else if (*(u_int64_t *) key1 > *(u_int64_t *) key2) + return 1; + + return -1; +} + +static int +btree_compare_float(cont void *key1, int keylen1, const void *key2, int keylen2) +{ + float result = *(float *) key1 - *(float *) key2; + if (result == 0.0f) + return 0; + + return (result < 0.0f) ? -1 : 1; +} + +static int +btree_compare_double(cont void *key1, int keylen1, + const void *key2, int keylen2) +{ + double result = *(double *) key1 - *(double *) key2; + if (result == 0.0) + return 0; + + return (result < 0.0) ? -1 : 1; +} +#endif //0 diff --git a/fs/befs/btree.h b/fs/befs/btree.h new file mode 100644 index 00000000..92e781e5 --- /dev/null +++ b/fs/befs/btree.h @@ -0,0 +1,13 @@ +/* + * btree.h + * + */ + + +int befs_btree_find(struct super_block *sb, befs_data_stream * ds, + const char *key, befs_off_t * value); + +int befs_btree_read(struct super_block *sb, befs_data_stream * ds, + loff_t key_no, size_t bufsize, char *keybuf, + size_t * keysize, befs_off_t * value); + diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c new file mode 100644 index 00000000..59096b5e --- /dev/null +++ b/fs/befs/datastream.c @@ -0,0 +1,526 @@ +/* + * linux/fs/befs/datastream.c + * + * Copyright (C) 2001 Will Dyson + * + * Based on portions of file.c by Makoto Kato + * + * Many thanks to Dominic Giampaolo, author of "Practical File System + * Design with the Be File System", for such a helpful book. + * + */ + +#include +#include +#include + +#include "befs.h" +#include "datastream.h" +#include "io.h" + +const befs_inode_addr BAD_IADDR = { 0, 0, 0 }; + +static int befs_find_brun_direct(struct super_block *sb, + befs_data_stream * data, + befs_blocknr_t blockno, befs_block_run * run); + +static int befs_find_brun_indirect(struct super_block *sb, + befs_data_stream * data, + befs_blocknr_t blockno, + befs_block_run * run); + +static int befs_find_brun_dblindirect(struct super_block *sb, + befs_data_stream * data, + befs_blocknr_t blockno, + befs_block_run * run); + +/** + * befs_read_datastream - get buffer_head containing data, starting from pos. + * @sb: Filesystem superblock + * @ds: datastrem to find data with + * @pos: start of data + * @off: offset of data in buffer_head->b_data + * + * Returns pointer to buffer_head containing data starting with offset @off, + * if you don't need to know offset just set @off = NULL. 
+ */ +struct buffer_head * +befs_read_datastream(struct super_block *sb, befs_data_stream * ds, + befs_off_t pos, uint * off) +{ + struct buffer_head *bh = NULL; + befs_block_run run; + befs_blocknr_t block; /* block corresponding to pos */ + + befs_debug(sb, "---> befs_read_datastream() %Lu", pos); + block = pos >> BEFS_SB(sb)->block_shift; + if (off) + *off = pos - (block << BEFS_SB(sb)->block_shift); + + if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) { + befs_error(sb, "BeFS: Error finding disk addr of block %lu", + block); + befs_debug(sb, "<--- befs_read_datastream() ERROR"); + return NULL; + } + bh = befs_bread_iaddr(sb, run); + if (!bh) { + befs_error(sb, "BeFS: Error reading block %lu from datastream", + block); + return NULL; + } + + befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu", + pos); + + return bh; +} + +/* + * Takes a file position and gives back a brun whose starting block + * is block number fblock of the file. + * + * Returns BEFS_OK or BEFS_ERR. + * + * Calls specialized functions for each of the three possible + * datastream regions. + * + * 2001-11-15 Will Dyson + */ +int +befs_fblock2brun(struct super_block *sb, befs_data_stream * data, + befs_blocknr_t fblock, befs_block_run * run) +{ + int err; + befs_off_t pos = fblock << BEFS_SB(sb)->block_shift; + + if (pos < data->max_direct_range) { + err = befs_find_brun_direct(sb, data, fblock, run); + + } else if (pos < data->max_indirect_range) { + err = befs_find_brun_indirect(sb, data, fblock, run); + + } else if (pos < data->max_double_indirect_range) { + err = befs_find_brun_dblindirect(sb, data, fblock, run); + + } else { + befs_error(sb, + "befs_fblock2brun() was asked to find block %lu, " + "which is not mapped by the datastream\n", fblock); + err = BEFS_ERR; + } + return err; +} + +/** + * befs_read_lsymlink - read long symlink from datastream. + * @sb: Filesystem superblock + * @ds: Datastream to read from + * @buff: Buffer in which to place long symlink data + * @len: Length of the long symlink in bytes + * + * Returns the number of bytes read + */ +size_t +befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, + befs_off_t len) +{ + befs_off_t bytes_read = 0; /* bytes read so far */ + u16 plen; + struct buffer_head *bh = NULL; + befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len); + + while (bytes_read < len) { + bh = befs_read_datastream(sb, ds, bytes_read, NULL); + if (!bh) { + befs_error(sb, "BeFS: Error reading datastream block " + "starting from %Lu", bytes_read); + befs_debug(sb, "<--- befs_read_lsymlink() ERROR"); + return bytes_read; + + } + plen = ((bytes_read + BEFS_SB(sb)->block_size) < len) ? 
+ BEFS_SB(sb)->block_size : len - bytes_read; + memcpy(buff + bytes_read, bh->b_data, plen); + brelse(bh); + bytes_read += plen; + } + + befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read); + return bytes_read; +} + +/** + * befs_count_blocks - blocks used by a file + * @sb: Filesystem superblock + * @ds: Datastream of the file + * + * Counts the number of fs blocks that the file represented by + * inode occupies on the filesystem, counting both regular file + * data and filesystem metadata (and eventually attribute data + * when we support attributes) +*/ + +befs_blocknr_t +befs_count_blocks(struct super_block * sb, befs_data_stream * ds) +{ + befs_blocknr_t blocks; + befs_blocknr_t datablocks; /* File data blocks */ + befs_blocknr_t metablocks; /* FS metadata blocks */ + befs_sb_info *befs_sb = BEFS_SB(sb); + + befs_debug(sb, "---> befs_count_blocks()"); + + datablocks = ds->size >> befs_sb->block_shift; + if (ds->size & (befs_sb->block_size - 1)) + datablocks += 1; + + metablocks = 1; /* Start with 1 block for inode */ + + /* Size of indirect block */ + if (ds->size > ds->max_direct_range) + metablocks += ds->indirect.len; + + /* + Double indir block, plus all the indirect blocks it mapps + In the double-indirect range, all block runs of data are + BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know + how many data block runs are in the double-indirect region, + and from that we know how many indirect blocks it takes to + map them. We assume that the indirect blocks are also + BEFS_DBLINDIR_BRUN_LEN blocks long. + */ + if (ds->size > ds->max_indirect_range && ds->max_indirect_range != 0) { + uint dbl_bytes; + uint dbl_bruns; + uint indirblocks; + + dbl_bytes = + ds->max_double_indirect_range - ds->max_indirect_range; + dbl_bruns = + dbl_bytes / (befs_sb->block_size * BEFS_DBLINDIR_BRUN_LEN); + indirblocks = dbl_bruns / befs_iaddrs_per_block(sb); + + metablocks += ds->double_indirect.len; + metablocks += indirblocks; + } + + blocks = datablocks + metablocks; + befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks); + + return blocks; +} + +/* + Finds the block run that starts at file block number blockno + in the file represented by the datastream data, if that + blockno is in the direct region of the datastream. + + sb: the superblock + data: the datastream + blockno: the blocknumber to find + run: The found run is passed back through this pointer + + Return value is BEFS_OK if the blockrun is found, BEFS_ERR + otherwise. + + Algorithm: + Linear search. Checks each element of array[] to see if it + contains the blockno-th filesystem block. This is necessary + because the block runs map variable amounts of data. Simply + keeps a count of the number of blocks searched so far (sum), + incrementing this by the length of each block run as we come + across it. Adds sum to *count before returning (this is so + you can search multiple arrays that are logicaly one array, + as in the indirect region code). + + When/if blockno is found, if blockno is inside of a block + run as stored on disk, we offset the start and length members + of the block run, so that blockno is the start and len is + still valid (the run ends in the same place). 
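+
+   A hypothetical example with made-up numbers: if direct[0] maps 8 blocks
+   and direct[1] maps 12, then file block 10 falls inside direct[1]
+   (8 <= 10 < 20) with offset = 2.  The run passed back starts two blocks
+   into direct[1] and its length is reported as 12 - 2 = 10, so it still
+   ends exactly where the on-disk run ends.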
+ + 2001-11-15 Will Dyson +*/ +static int +befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, + befs_blocknr_t blockno, befs_block_run * run) +{ + int i; + befs_block_run *array = data->direct; + befs_blocknr_t sum; + befs_blocknr_t max_block = + data->max_direct_range >> BEFS_SB(sb)->block_shift; + + befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno); + + if (blockno > max_block) { + befs_error(sb, "befs_find_brun_direct() passed block outside of" + "direct region"); + return BEFS_ERR; + } + + for (i = 0, sum = 0; i < BEFS_NUM_DIRECT_BLOCKS; + sum += array[i].len, i++) { + if (blockno >= sum && blockno < sum + (array[i].len)) { + int offset = blockno - sum; + run->allocation_group = array[i].allocation_group; + run->start = array[i].start + offset; + run->len = array[i].len - offset; + + befs_debug(sb, "---> befs_find_brun_direct(), " + "found %lu at direct[%d]", blockno, i); + return BEFS_OK; + } + } + + befs_debug(sb, "---> befs_find_brun_direct() ERROR"); + return BEFS_ERR; +} + +/* + Finds the block run that starts at file block number blockno + in the file represented by the datastream data, if that + blockno is in the indirect region of the datastream. + + sb: the superblock + data: the datastream + blockno: the blocknumber to find + run: The found run is passed back through this pointer + + Return value is BEFS_OK if the blockrun is found, BEFS_ERR + otherwise. + + Algorithm: + For each block in the indirect run of the datastream, read + it in and search through it for search_blk. + + XXX: + Really should check to make sure blockno is inside indirect + region. + + 2001-11-15 Will Dyson +*/ +static int +befs_find_brun_indirect(struct super_block *sb, + befs_data_stream * data, befs_blocknr_t blockno, + befs_block_run * run) +{ + int i, j; + befs_blocknr_t sum = 0; + befs_blocknr_t indir_start_blk; + befs_blocknr_t search_blk; + struct buffer_head *indirblock; + befs_disk_block_run *array; + + befs_block_run indirect = data->indirect; + befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect); + int arraylen = befs_iaddrs_per_block(sb); + + befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno); + + indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift; + search_blk = blockno - indir_start_blk; + + /* Examine blocks of the indirect run one at a time */ + for (i = 0; i < indirect.len; i++) { + indirblock = befs_bread(sb, indirblockno + i); + if (indirblock == NULL) { + befs_debug(sb, + "---> befs_find_brun_indirect() failed to " + "read disk block %lu from the indirect brun", + indirblockno + i); + return BEFS_ERR; + } + + array = (befs_disk_block_run *) indirblock->b_data; + + for (j = 0; j < arraylen; ++j) { + int len = fs16_to_cpu(sb, array[j].len); + + if (search_blk >= sum && search_blk < sum + len) { + int offset = search_blk - sum; + run->allocation_group = + fs32_to_cpu(sb, array[j].allocation_group); + run->start = + fs16_to_cpu(sb, array[j].start) + offset; + run->len = + fs16_to_cpu(sb, array[j].len) - offset; + + brelse(indirblock); + befs_debug(sb, + "<--- befs_find_brun_indirect() found " + "file block %lu at indirect[%d]", + blockno, j + (i * arraylen)); + return BEFS_OK; + } + sum += len; + } + + brelse(indirblock); + } + + /* Only fallthrough is an error */ + befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find " + "file block %lu", blockno); + + befs_debug(sb, "<--- befs_find_brun_indirect() ERROR"); + return BEFS_ERR; +} + +/* + Finds the block run that starts at file block number blockno + 
in the file represented by the datastream data, if that + blockno is in the double-indirect region of the datastream. + + sb: the superblock + data: the datastream + blockno: the blocknumber to find + run: The found run is passed back through this pointer + + Return value is BEFS_OK if the blockrun is found, BEFS_ERR + otherwise. + + Algorithm: + The block runs in the double-indirect region are different. + They are always allocated 4 fs blocks at a time, so each + block run maps a constant amount of file data. This means + that we can directly calculate how many block runs into the + double-indirect region we need to go to get to the one that + maps a particular filesystem block. + + We do this in two stages. First we calculate which of the + inode addresses in the double-indirect block will point us + to the indirect block that contains the mapping for the data, + then we calculate which of the inode addresses in that + indirect block maps the data block we are after. + + Oh, and once we've done that, we actually read in the blocks + that contain the inode addresses we calculated above. Even + though the double-indirect run may be several blocks long, + we can calculate which of those blocks will contain the index + we are after and only read that one. We then follow it to + the indirect block and perform a similar process to find + the actual block run that maps the data block we are interested + in. + + Then we offset the run as in befs_find_brun_array() and we are + done. + + 2001-11-15 Will Dyson +*/ +static int +befs_find_brun_dblindirect(struct super_block *sb, + befs_data_stream * data, befs_blocknr_t blockno, + befs_block_run * run) +{ + int dblindir_indx; + int indir_indx; + int offset; + int dbl_which_block; + int which_block; + int dbl_block_indx; + int block_indx; + off_t dblindir_leftover; + befs_blocknr_t blockno_at_run_start; + struct buffer_head *dbl_indir_block; + struct buffer_head *indir_block; + befs_block_run indir_run; + befs_disk_inode_addr *iaddr_array = NULL; + befs_sb_info *befs_sb = BEFS_SB(sb); + + befs_blocknr_t indir_start_blk = + data->max_indirect_range >> befs_sb->block_shift; + + off_t dbl_indir_off = blockno - indir_start_blk; + + /* number of data blocks mapped by each of the iaddrs in + * the indirect block pointed to by the double indirect block + */ + size_t iblklen = BEFS_DBLINDIR_BRUN_LEN; + + /* number of data blocks mapped by each of the iaddrs in + * the double indirect block + */ + size_t diblklen = iblklen * befs_iaddrs_per_block(sb) + * BEFS_DBLINDIR_BRUN_LEN; + + befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno); + + /* First, discover which of the double_indir->indir blocks + * contains pos. Then figure out how much of pos that + * accounted for. Then discover which of the iaddrs in + * the indirect block contains pos. 
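+ *
+ * To make the sizes concrete (assuming, for illustration, a 1 KiB
+ * block size and an 8-byte on-disk inode address, so 128 iaddrs fit
+ * in one block): each iaddr inside an indirect block then maps
+ * iblklen = 4 file blocks, and each iaddr inside the double-indirect
+ * block maps diblklen = 4 * 128 * 4 = 2048 file blocks.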
+ */ + + dblindir_indx = dbl_indir_off / diblklen; + dblindir_leftover = dbl_indir_off % diblklen; + indir_indx = dblindir_leftover / diblklen; + + /* Read double indirect block */ + dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb); + if (dbl_which_block > data->double_indirect.len) { + befs_error(sb, "The double-indirect index calculated by " + "befs_read_brun_dblindirect(), %d, is outside the range " + "of the double-indirect block", dblindir_indx); + return BEFS_ERR; + } + + dbl_indir_block = + befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + + dbl_which_block); + if (dbl_indir_block == NULL) { + befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " + "double-indirect block at blockno %lu", + iaddr2blockno(sb, + &data->double_indirect) + + dbl_which_block); + brelse(dbl_indir_block); + return BEFS_ERR; + } + + dbl_block_indx = + dblindir_indx - (dbl_which_block * befs_iaddrs_per_block(sb)); + iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data; + indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]); + brelse(dbl_indir_block); + iaddr_array = NULL; + + /* Read indirect block */ + which_block = indir_indx / befs_iaddrs_per_block(sb); + if (which_block > indir_run.len) { + befs_error(sb, "The indirect index calculated by " + "befs_read_brun_dblindirect(), %d, is outside the range " + "of the indirect block", indir_indx); + return BEFS_ERR; + } + + indir_block = + befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); + if (indir_block == NULL) { + befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " + "indirect block at blockno %lu", + iaddr2blockno(sb, &indir_run) + which_block); + brelse(indir_block); + return BEFS_ERR; + } + + block_indx = indir_indx - (which_block * befs_iaddrs_per_block(sb)); + iaddr_array = (befs_disk_inode_addr *) indir_block->b_data; + *run = fsrun_to_cpu(sb, iaddr_array[block_indx]); + brelse(indir_block); + iaddr_array = NULL; + + blockno_at_run_start = indir_start_blk; + blockno_at_run_start += diblklen * dblindir_indx; + blockno_at_run_start += iblklen * indir_indx; + offset = blockno - blockno_at_run_start; + + run->start += offset; + run->len -= offset; + + befs_debug(sb, "Found file block %lu in double_indirect[%d][%d]," + " double_indirect_leftover = %lu", + blockno, dblindir_indx, indir_indx, dblindir_leftover); + + return BEFS_OK; +} diff --git a/fs/befs/datastream.h b/fs/befs/datastream.h new file mode 100644 index 00000000..45e8a3c9 --- /dev/null +++ b/fs/befs/datastream.h @@ -0,0 +1,19 @@ +/* + * datastream.h + * + */ + +struct buffer_head *befs_read_datastream(struct super_block *sb, + befs_data_stream * ds, befs_off_t pos, + uint * off); + +int befs_fblock2brun(struct super_block *sb, befs_data_stream * data, + befs_blocknr_t fblock, befs_block_run * run); + +size_t befs_read_lsymlink(struct super_block *sb, befs_data_stream * data, + void *buff, befs_off_t len); + +befs_blocknr_t befs_count_blocks(struct super_block *sb, befs_data_stream * ds); + +extern const befs_inode_addr BAD_IADDR; + diff --git a/fs/befs/debug.c b/fs/befs/debug.c new file mode 100644 index 00000000..622e7377 --- /dev/null +++ b/fs/befs/debug.c @@ -0,0 +1,282 @@ +/* + * linux/fs/befs/debug.c + * + * Copyright (C) 2001 Will Dyson (will_dyson at pobox.com) + * + * With help from the ntfs-tng driver by Anton Altparmakov + * + * Copyright (C) 1999 Makoto Kato (m_kato@ga2.so-net.ne.jp) + * + * debug functions + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +#endif /* __KERNEL__ 
*/ + +#include "befs.h" + +#define ERRBUFSIZE 1024 + +void +befs_error(const struct super_block *sb, const char *fmt, ...) +{ + va_list args; + char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); + if (err_buf == NULL) { + printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); + return; + } + + va_start(args, fmt); + vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + va_end(args); + + printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf); + kfree(err_buf); +} + +void +befs_warning(const struct super_block *sb, const char *fmt, ...) +{ + va_list args; + char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); + if (err_buf == NULL) { + printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); + return; + } + + va_start(args, fmt); + vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + va_end(args); + + printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf); + + kfree(err_buf); +} + +void +befs_debug(const struct super_block *sb, const char *fmt, ...) +{ +#ifdef CONFIG_BEFS_DEBUG + + va_list args; + char *err_buf = NULL; + + if (BEFS_SB(sb)->mount_opts.debug) { + err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); + if (err_buf == NULL) { + printk(KERN_ERR "could not allocate %d bytes\n", + ERRBUFSIZE); + return; + } + + va_start(args, fmt); + vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + va_end(args); + + printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf); + + kfree(err_buf); + } + +#endif //CONFIG_BEFS_DEBUG +} + +void +befs_dump_inode(const struct super_block *sb, befs_inode * inode) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_block_run tmp_run; + + befs_debug(sb, "befs_inode information"); + + befs_debug(sb, " magic1 %08x", fs32_to_cpu(sb, inode->magic1)); + + tmp_run = fsrun_to_cpu(sb, inode->inode_num); + befs_debug(sb, " inode_num %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + befs_debug(sb, " uid %u", fs32_to_cpu(sb, inode->uid)); + befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); + befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); + befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); + befs_debug(sb, " create_time %Lu", + fs64_to_cpu(sb, inode->create_time)); + befs_debug(sb, " last_modified_time %Lu", + fs64_to_cpu(sb, inode->last_modified_time)); + + tmp_run = fsrun_to_cpu(sb, inode->parent); + befs_debug(sb, " parent [%u, %hu, %hu]", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + tmp_run = fsrun_to_cpu(sb, inode->attributes); + befs_debug(sb, " attributes [%u, %hu, %hu]", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + befs_debug(sb, " type %08x", fs32_to_cpu(sb, inode->type)); + befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, inode->inode_size)); + + if (S_ISLNK(fs32_to_cpu(sb, inode->mode))) { + befs_debug(sb, " Symbolic link [%s]", inode->data.symlink); + } else { + int i; + + for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; i++) { + tmp_run = + fsrun_to_cpu(sb, inode->data.datastream.direct[i]); + befs_debug(sb, " direct %d [%u, %hu, %hu]", i, + tmp_run.allocation_group, tmp_run.start, + tmp_run.len); + } + befs_debug(sb, " max_direct_range %Lu", + fs64_to_cpu(sb, + inode->data.datastream. + max_direct_range)); + + tmp_run = fsrun_to_cpu(sb, inode->data.datastream.indirect); + befs_debug(sb, " indirect [%u, %hu, %hu]", + tmp_run.allocation_group, + tmp_run.start, tmp_run.len); + + befs_debug(sb, " max_indirect_range %Lu", + fs64_to_cpu(sb, + inode->data.datastream. 
+ max_indirect_range)); + + tmp_run = + fsrun_to_cpu(sb, inode->data.datastream.double_indirect); + befs_debug(sb, " double indirect [%u, %hu, %hu]", + tmp_run.allocation_group, tmp_run.start, + tmp_run.len); + + befs_debug(sb, " max_double_indirect_range %Lu", + fs64_to_cpu(sb, + inode->data.datastream. + max_double_indirect_range)); + + befs_debug(sb, " size %Lu", + fs64_to_cpu(sb, inode->data.datastream.size)); + } + +#endif //CONFIG_BEFS_DEBUG +} + +/* + * Display super block structure for debug. + */ + +void +befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_block_run tmp_run; + + befs_debug(sb, "befs_super_block information"); + + befs_debug(sb, " name %s", sup->name); + befs_debug(sb, " magic1 %08x", fs32_to_cpu(sb, sup->magic1)); + befs_debug(sb, " fs_byte_order %08x", + fs32_to_cpu(sb, sup->fs_byte_order)); + + befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); + befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); + + befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks)); + befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks)); + + befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); + befs_debug(sb, " blocks_per_ag %u", + fs32_to_cpu(sb, sup->blocks_per_ag)); + befs_debug(sb, " ag_shift %u", fs32_to_cpu(sb, sup->ag_shift)); + befs_debug(sb, " num_ags %u", fs32_to_cpu(sb, sup->num_ags)); + + befs_debug(sb, " flags %08x", fs32_to_cpu(sb, sup->flags)); + + tmp_run = fsrun_to_cpu(sb, sup->log_blocks); + befs_debug(sb, " log_blocks %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start)); + befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end)); + + befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); + + tmp_run = fsrun_to_cpu(sb, sup->root_dir); + befs_debug(sb, " root_dir %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + + tmp_run = fsrun_to_cpu(sb, sup->indices); + befs_debug(sb, " indices %u, %hu, %hu", + tmp_run.allocation_group, tmp_run.start, tmp_run.len); + +#endif //CONFIG_BEFS_DEBUG +} + +#if 0 +/* unused */ +void +befs_dump_small_data(const struct super_block *sb, befs_small_data * sd) +{ +} + +/* unused */ +void +befs_dump_run(const struct super_block *sb, befs_disk_block_run run) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_block_run n = fsrun_to_cpu(sb, run); + + befs_debug(sb, "[%u, %hu, %hu]", n.allocation_group, n.start, n.len); + +#endif //CONFIG_BEFS_DEBUG +} +#endif /* 0 */ + +void +befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_debug(sb, "Btree super structure"); + befs_debug(sb, " magic %08x", fs32_to_cpu(sb, super->magic)); + befs_debug(sb, " node_size %u", fs32_to_cpu(sb, super->node_size)); + befs_debug(sb, " max_depth %08x", fs32_to_cpu(sb, super->max_depth)); + + befs_debug(sb, " data_type %08x", fs32_to_cpu(sb, super->data_type)); + befs_debug(sb, " root_node_pointer %016LX", + fs64_to_cpu(sb, super->root_node_ptr)); + befs_debug(sb, " free_node_pointer %016LX", + fs64_to_cpu(sb, super->free_node_ptr)); + befs_debug(sb, " maximum size %016LX", + fs64_to_cpu(sb, super->max_size)); + +#endif //CONFIG_BEFS_DEBUG +} + +void +befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead * node) +{ +#ifdef CONFIG_BEFS_DEBUG + + befs_debug(sb, "Btree node structure"); + befs_debug(sb, " left %016LX", fs64_to_cpu(sb, node->left)); 
+ befs_debug(sb, " right %016LX", fs64_to_cpu(sb, node->right)); + befs_debug(sb, " overflow %016LX", fs64_to_cpu(sb, node->overflow)); + befs_debug(sb, " all_key_count %hu", + fs16_to_cpu(sb, node->all_key_count)); + befs_debug(sb, " all_key_length %hu", + fs16_to_cpu(sb, node->all_key_length)); + +#endif //CONFIG_BEFS_DEBUG +} diff --git a/fs/befs/endian.h b/fs/befs/endian.h new file mode 100644 index 00000000..27223878 --- /dev/null +++ b/fs/befs/endian.h @@ -0,0 +1,125 @@ +/* + * linux/fs/befs/endian.h + * + * Copyright (C) 2001 Will Dyson + * + * Partially based on similar funtions in the sysv driver. + */ + +#ifndef LINUX_BEFS_ENDIAN +#define LINUX_BEFS_ENDIAN + +#include + +static inline u64 +fs64_to_cpu(const struct super_block *sb, fs64 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return le64_to_cpu((__force __le64)n); + else + return be64_to_cpu((__force __be64)n); +} + +static inline fs64 +cpu_to_fs64(const struct super_block *sb, u64 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return (__force fs64)cpu_to_le64(n); + else + return (__force fs64)cpu_to_be64(n); +} + +static inline u32 +fs32_to_cpu(const struct super_block *sb, fs32 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline fs32 +cpu_to_fs32(const struct super_block *sb, u32 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return (__force fs32)cpu_to_le32(n); + else + return (__force fs32)cpu_to_be32(n); +} + +static inline u16 +fs16_to_cpu(const struct super_block *sb, fs16 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline fs16 +cpu_to_fs16(const struct super_block *sb, u16 n) +{ + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) + return (__force fs16)cpu_to_le16(n); + else + return (__force fs16)cpu_to_be16(n); +} + +/* Composite types below here */ + +static inline befs_block_run +fsrun_to_cpu(const struct super_block *sb, befs_disk_block_run n) +{ + befs_block_run run; + + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) { + run.allocation_group = le32_to_cpu((__force __le32)n.allocation_group); + run.start = le16_to_cpu((__force __le16)n.start); + run.len = le16_to_cpu((__force __le16)n.len); + } else { + run.allocation_group = be32_to_cpu((__force __be32)n.allocation_group); + run.start = be16_to_cpu((__force __be16)n.start); + run.len = be16_to_cpu((__force __be16)n.len); + } + return run; +} + +static inline befs_disk_block_run +cpu_to_fsrun(const struct super_block *sb, befs_block_run n) +{ + befs_disk_block_run run; + + if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) { + run.allocation_group = cpu_to_le32(n.allocation_group); + run.start = cpu_to_le16(n.start); + run.len = cpu_to_le16(n.len); + } else { + run.allocation_group = cpu_to_be32(n.allocation_group); + run.start = cpu_to_be16(n.start); + run.len = cpu_to_be16(n.len); + } + return run; +} + +static inline befs_data_stream +fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n) +{ + befs_data_stream data; + int i; + + for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) + data.direct[i] = fsrun_to_cpu(sb, n->direct[i]); + + data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range); + data.indirect = fsrun_to_cpu(sb, n->indirect); + data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range); + data.double_indirect = fsrun_to_cpu(sb, n->double_indirect); + 
data.max_double_indirect_range = fs64_to_cpu(sb, + n-> + max_double_indirect_range); + data.size = fs64_to_cpu(sb, n->size); + + return data; +} + +#endif //LINUX_BEFS_ENDIAN diff --git a/fs/befs/inode.c b/fs/befs/inode.c new file mode 100644 index 00000000..94c17f9a --- /dev/null +++ b/fs/befs/inode.c @@ -0,0 +1,52 @@ +/* + * inode.c + * + * Copyright (C) 2001 Will Dyson + */ + +#include + +#include "befs.h" +#include "inode.h" + +/* + Validates the correctness of the befs inode + Returns BEFS_OK if the inode should be used, otherwise + returns BEFS_BAD_INODE +*/ +int +befs_check_inode(struct super_block *sb, befs_inode * raw_inode, + befs_blocknr_t inode) +{ + u32 magic1 = fs32_to_cpu(sb, raw_inode->magic1); + befs_inode_addr ino_num = fsrun_to_cpu(sb, raw_inode->inode_num); + u32 flags = fs32_to_cpu(sb, raw_inode->flags); + + /* check magic header. */ + if (magic1 != BEFS_INODE_MAGIC1) { + befs_error(sb, + "Inode has a bad magic header - inode = %lu", inode); + return BEFS_BAD_INODE; + } + + /* + * Sanity check2: inodes store their own block address. Check it. + */ + if (inode != iaddr2blockno(sb, &ino_num)) { + befs_error(sb, "inode blocknr field disagrees with vfs " + "VFS: %lu, Inode %lu", + inode, iaddr2blockno(sb, &ino_num)); + return BEFS_BAD_INODE; + } + + /* + * check flag + */ + + if (!(flags & BEFS_INODE_IN_USE)) { + befs_error(sb, "inode is not used - inode = %lu", inode); + return BEFS_BAD_INODE; + } + + return BEFS_OK; +} diff --git a/fs/befs/inode.h b/fs/befs/inode.h new file mode 100644 index 00000000..9dc7fd9b --- /dev/null +++ b/fs/befs/inode.h @@ -0,0 +1,8 @@ +/* + * inode.h + * + */ + +int befs_check_inode(struct super_block *sb, befs_inode * raw_inode, + befs_blocknr_t inode); + diff --git a/fs/befs/io.c b/fs/befs/io.c new file mode 100644 index 00000000..ddef98aa --- /dev/null +++ b/fs/befs/io.c @@ -0,0 +1,83 @@ +/* + * linux/fs/befs/io.c + * + * Copyright (C) 2001 Will Dyson + +#include "befs.h" +#include "io.h" + +/* + * Converts befs notion of disk addr to a disk offset and uses + * linux kernel function sb_bread() to get the buffer containing + * the offset. 
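+ *
+ * The conversion itself is roughly (allocation_group << ag_shift) + start:
+ * an inode address names an allocation group plus a starting block within
+ * that group, and iaddr2blockno() performs this arithmetic for us.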
-Will Dyson + * + */ + +struct buffer_head * +befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) +{ + struct buffer_head *bh = NULL; + befs_blocknr_t block = 0; + befs_sb_info *befs_sb = BEFS_SB(sb); + + befs_debug(sb, "---> Enter befs_read_iaddr() " + "[%u, %hu, %hu]", + iaddr.allocation_group, iaddr.start, iaddr.len); + + if (iaddr.allocation_group > befs_sb->num_ags) { + befs_error(sb, "BEFS: Invalid allocation group %u, max is %u", + iaddr.allocation_group, befs_sb->num_ags); + goto error; + } + + block = iaddr2blockno(sb, &iaddr); + + befs_debug(sb, "befs_read_iaddr: offset = %lu", block); + + bh = sb_bread(sb, block); + + if (bh == NULL) { + befs_error(sb, "Failed to read block %lu", block); + goto error; + } + + befs_debug(sb, "<--- befs_read_iaddr()"); + return bh; + + error: + befs_debug(sb, "<--- befs_read_iaddr() ERROR"); + return NULL; +} + +struct buffer_head * +befs_bread(struct super_block *sb, befs_blocknr_t block) +{ + struct buffer_head *bh = NULL; + + befs_debug(sb, "---> Enter befs_read() %Lu", block); + + bh = sb_bread(sb, block); + + if (bh == NULL) { + befs_error(sb, "Failed to read block %lu", block); + goto error; + } + + befs_debug(sb, "<--- befs_read()"); + + return bh; + + error: + befs_debug(sb, "<--- befs_read() ERROR"); + return NULL; +} diff --git a/fs/befs/io.h b/fs/befs/io.h new file mode 100644 index 00000000..9b78266b --- /dev/null +++ b/fs/befs/io.h @@ -0,0 +1,9 @@ +/* + * io.h + */ + +struct buffer_head *befs_bread_iaddr(struct super_block *sb, + befs_inode_addr iaddr); + +struct buffer_head *befs_bread(struct super_block *sb, befs_blocknr_t block); + diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c new file mode 100644 index 00000000..720d885e --- /dev/null +++ b/fs/befs/linuxvfs.c @@ -0,0 +1,979 @@ +/* + * linux/fs/befs/linuxvfs.c + * + * Copyright (C) 2001 Will Dyson +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "befs.h" +#include "btree.h" +#include "inode.h" +#include "datastream.h" +#include "super.h" +#include "io.h" + +MODULE_DESCRIPTION("BeOS File System (BeFS) driver"); +MODULE_AUTHOR("Will Dyson"); +MODULE_LICENSE("GPL"); + +/* The units the vfs expects inode->i_blocks to be in */ +#define VFS_BLOCK_SIZE 512 + +static int befs_readdir(struct file *, void *, filldir_t); +static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); +static int befs_readpage(struct file *file, struct page *page); +static sector_t befs_bmap(struct address_space *mapping, sector_t block); +static struct dentry *befs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct inode *befs_iget(struct super_block *, unsigned long); +static struct inode *befs_alloc_inode(struct super_block *sb); +static void befs_destroy_inode(struct inode *inode); +static int befs_init_inodecache(void); +static void befs_destroy_inodecache(void); +static void *befs_follow_link(struct dentry *, struct nameidata *); +static void befs_put_link(struct dentry *, struct nameidata *, void *); +static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, + char **out, int *out_len); +static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, + char **out, int *out_len); +static void befs_put_super(struct super_block *); +static int befs_remount(struct super_block *, int *, char *); +static int befs_statfs(struct dentry *, struct kstatfs *); +static int parse_options(char *, befs_mount_options *); + +static const struct super_operations befs_sops = { + 
.alloc_inode = befs_alloc_inode, /* allocate a new inode */ + .destroy_inode = befs_destroy_inode, /* deallocate an inode */ + .put_super = befs_put_super, /* uninit super */ + .statfs = befs_statfs, /* statfs */ + .remount_fs = befs_remount, + .show_options = generic_show_options, +}; + +/* slab cache for befs_inode_info objects */ +static struct kmem_cache *befs_inode_cachep; + +static const struct file_operations befs_dir_operations = { + .read = generic_read_dir, + .readdir = befs_readdir, + .llseek = generic_file_llseek, +}; + +static const struct inode_operations befs_dir_inode_operations = { + .lookup = befs_lookup, +}; + +static const struct address_space_operations befs_aops = { + .readpage = befs_readpage, + .bmap = befs_bmap, +}; + +static const struct inode_operations befs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = befs_follow_link, + .put_link = befs_put_link, +}; + +/* + * Called by generic_file_read() to read a page of data + * + * In turn, simply calls a generic block read function and + * passes it the address of befs_get_block, for mapping file + * positions to disk blocks. + */ +static int +befs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, befs_get_block); +} + +static sector_t +befs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, befs_get_block); +} + +/* + * Generic function to map a file position (block) to a + * disk offset (passed back in bh_result). + * + * Used by many higher level functions. + * + * Calls befs_fblock2brun() in datastream.c to do the real work. + * + * -WD 10-26-01 + */ + +static int +befs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; + befs_block_run run = BAD_IADDR; + int res = 0; + ulong disk_off; + + befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", + inode->i_ino, block); + + if (block < 0) { + befs_error(sb, "befs_get_block() was asked for a block " + "number less than zero: block %ld in inode %lu", + block, inode->i_ino); + return -EIO; + } + + if (create) { + befs_error(sb, "befs_get_block() was asked to write to " + "block %ld in inode %lu", block, inode->i_ino); + return -EPERM; + } + + res = befs_fblock2brun(sb, ds, block, &run); + if (res != BEFS_OK) { + befs_error(sb, + "<--- befs_get_block() for inode %lu, block " + "%ld ERROR", inode->i_ino, block); + return -EFBIG; + } + + disk_off = (ulong) iaddr2blockno(sb, &run); + + map_bh(bh_result, inode->i_sb, disk_off); + + befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, " + "disk address %lu", inode->i_ino, block, disk_off); + + return 0; +} + +static struct dentry * +befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = NULL; + struct super_block *sb = dir->i_sb; + befs_data_stream *ds = &BEFS_I(dir)->i_data.ds; + befs_off_t offset; + int ret; + int utfnamelen; + char *utfname; + const char *name = dentry->d_name.name; + + befs_debug(sb, "---> befs_lookup() " + "name %s inode %ld", dentry->d_name.name, dir->i_ino); + + /* Convert to UTF-8 */ + if (BEFS_SB(sb)->nls) { + ret = + befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); + if (ret < 0) { + befs_debug(sb, "<--- befs_lookup() ERROR"); + return ERR_PTR(ret); + } + ret = befs_btree_find(sb, ds, utfname, &offset); + kfree(utfname); + + } else { + ret = befs_btree_find(sb, ds, 
dentry->d_name.name, &offset); + } + + if (ret == BEFS_BT_NOT_FOUND) { + befs_debug(sb, "<--- befs_lookup() %s not found", + dentry->d_name.name); + return ERR_PTR(-ENOENT); + + } else if (ret != BEFS_OK || offset == 0) { + befs_warning(sb, "<--- befs_lookup() Error"); + return ERR_PTR(-ENODATA); + } + + inode = befs_iget(dir->i_sb, (ino_t) offset); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + d_add(dentry, inode); + + befs_debug(sb, "<--- befs_lookup()"); + + return NULL; +} + +static int +befs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; + befs_off_t value; + int result; + size_t keysize; + unsigned char d_type; + char keybuf[BEFS_NAME_LEN + 1]; + char *nlsname; + int nlsnamelen; + const char *dirname = filp->f_path.dentry->d_name.name; + + befs_debug(sb, "---> befs_readdir() " + "name %s, inode %ld, filp->f_pos %Ld", + dirname, inode->i_ino, filp->f_pos); + + result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1, + keybuf, &keysize, &value); + + if (result == BEFS_ERR) { + befs_debug(sb, "<--- befs_readdir() ERROR"); + befs_error(sb, "IO error reading %s (inode %lu)", + dirname, inode->i_ino); + return -EIO; + + } else if (result == BEFS_BT_END) { + befs_debug(sb, "<--- befs_readdir() END"); + return 0; + + } else if (result == BEFS_BT_EMPTY) { + befs_debug(sb, "<--- befs_readdir() Empty directory"); + return 0; + } + + d_type = DT_UNKNOWN; + + /* Convert to NLS */ + if (BEFS_SB(sb)->nls) { + result = + befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); + if (result < 0) { + befs_debug(sb, "<--- befs_readdir() ERROR"); + return result; + } + result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos, + (ino_t) value, d_type); + kfree(nlsname); + + } else { + result = filldir(dirent, keybuf, keysize, filp->f_pos, + (ino_t) value, d_type); + } + + filp->f_pos++; + + befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos); + + return 0; +} + +static struct inode * +befs_alloc_inode(struct super_block *sb) +{ + struct befs_inode_info *bi; + bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep, + GFP_KERNEL); + if (!bi) + return NULL; + return &bi->vfs_inode; +} + +static void befs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); +} + +static void befs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, befs_i_callback); +} + +static void init_once(void *foo) +{ + struct befs_inode_info *bi = (struct befs_inode_info *) foo; + + inode_init_once(&bi->vfs_inode); +} + +static struct inode *befs_iget(struct super_block *sb, unsigned long ino) +{ + struct buffer_head *bh = NULL; + befs_inode *raw_inode = NULL; + + befs_sb_info *befs_sb = BEFS_SB(sb); + befs_inode_info *befs_ino = NULL; + struct inode *inode; + long ret = -EIO; + + befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); + + inode = iget_locked(sb, ino); + if (IS_ERR(inode)) + return inode; + if (!(inode->i_state & I_NEW)) + return inode; + + befs_ino = BEFS_I(inode); + + /* convert from vfs's inode number to befs's inode number */ + befs_ino->i_inode_num = blockno2iaddr(sb, inode->i_ino); + + befs_debug(sb, " real inode number [%u, %hu, %hu]", + befs_ino->i_inode_num.allocation_group, + befs_ino->i_inode_num.start, befs_ino->i_inode_num.len); + + bh = befs_bread(sb, 
inode->i_ino); + if (!bh) { + befs_error(sb, "unable to read inode block - " + "inode = %lu", inode->i_ino); + goto unacquire_none; + } + + raw_inode = (befs_inode *) bh->b_data; + + befs_dump_inode(sb, raw_inode); + + if (befs_check_inode(sb, raw_inode, inode->i_ino) != BEFS_OK) { + befs_error(sb, "Bad inode: %lu", inode->i_ino); + goto unacquire_bh; + } + + inode->i_mode = (umode_t) fs32_to_cpu(sb, raw_inode->mode); + + /* + * set uid and gid. But since current BeOS is single user OS, so + * you can change by "uid" or "gid" options. + */ + + inode->i_uid = befs_sb->mount_opts.use_uid ? + befs_sb->mount_opts.uid : (uid_t) fs32_to_cpu(sb, raw_inode->uid); + inode->i_gid = befs_sb->mount_opts.use_gid ? + befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); + + inode->i_nlink = 1; + + /* + * BEFS's time is 64 bits, but current VFS is 32 bits... + * BEFS don't have access time. Nor inode change time. VFS + * doesn't have creation time. + * Also, the lower 16 bits of the last_modified_time and + * create_time are just a counter to help ensure uniqueness + * for indexing purposes. (PFD, page 54) + */ + + inode->i_mtime.tv_sec = + fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16; + inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ + inode->i_ctime = inode->i_mtime; + inode->i_atime = inode->i_mtime; + + befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num); + befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent); + befs_ino->i_attribute = fsrun_to_cpu(sb, raw_inode->attributes); + befs_ino->i_flags = fs32_to_cpu(sb, raw_inode->flags); + + if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){ + inode->i_size = 0; + inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; + strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, + BEFS_SYMLINK_LEN - 1); + befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0'; + } else { + int num_blks; + + befs_ino->i_data.ds = + fsds_to_cpu(sb, &raw_inode->data.datastream); + + num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); + inode->i_blocks = + num_blks * (befs_sb->block_size / VFS_BLOCK_SIZE); + inode->i_size = befs_ino->i_data.ds.size; + } + + inode->i_mapping->a_ops = &befs_aops; + + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &befs_dir_inode_operations; + inode->i_fop = &befs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &befs_symlink_inode_operations; + } else { + befs_error(sb, "Inode %lu is not a regular file, " + "directory or symlink. THAT IS WRONG! BeFS has no " + "on disk special files", inode->i_ino); + goto unacquire_bh; + } + + brelse(bh); + befs_debug(sb, "<--- befs_read_inode()"); + unlock_new_inode(inode); + return inode; + + unacquire_bh: + brelse(bh); + + unacquire_none: + iget_failed(inode); + befs_debug(sb, "<--- befs_read_inode() - Bad inode"); + return ERR_PTR(ret); +} + +/* Initialize the inode cache. Called at fs setup. + * + * Taken from NFS implementation by Al Viro. + */ +static int +befs_init_inodecache(void) +{ + befs_inode_cachep = kmem_cache_create("befs_inode_cache", + sizeof (struct befs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (befs_inode_cachep == NULL) { + printk(KERN_ERR "befs_init_inodecache: " + "Couldn't initialize inode slabcache\n"); + return -ENOMEM; + } + + return 0; +} + +/* Called at fs teardown. + * + * Taken from NFS implementation by Al Viro. 
+ */ +static void +befs_destroy_inodecache(void) +{ + kmem_cache_destroy(befs_inode_cachep); +} + +/* + * The inode of symbolic link is different to data stream. + * The data stream become link name. Unless the LONG_SYMLINK + * flag is set. + */ +static void * +befs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + char *link; + + if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { + struct super_block *sb = dentry->d_sb; + befs_data_stream *data = &befs_ino->i_data.ds; + befs_off_t len = data->size; + + if (len == 0) { + befs_error(sb, "Long symlink with illegal length"); + link = ERR_PTR(-EIO); + } else { + befs_debug(sb, "Follow long symlink"); + + link = kmalloc(len, GFP_NOFS); + if (!link) { + link = ERR_PTR(-ENOMEM); + } else if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + link = ERR_PTR(-EIO); + } else { + link[len - 1] = '\0'; + } + } + } else { + link = befs_ino->i_data.symlink; + } + + nd_set_link(nd, link); + return NULL; +} + +static void befs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { + char *link = nd_get_link(nd); + if (!IS_ERR(link)) + kfree(link); + } +} + +/* + * UTF-8 to NLS charset convert routine + * + * + * Changed 8/10/01 by Will Dyson. Now use uni2char() / char2uni() rather than + * the nls tables directly + */ + +static int +befs_utf2nls(struct super_block *sb, const char *in, + int in_len, char **out, int *out_len) +{ + struct nls_table *nls = BEFS_SB(sb)->nls; + int i, o; + unicode_t uni; + int unilen, utflen; + char *result; + /* The utf8->nls conversion won't make the final nls string bigger + * than the utf one, but if the string is pure ascii they'll have the + * same width and an extra char is needed to save the additional \0 + */ + int maxlen = in_len + 1; + + befs_debug(sb, "---> utf2nls()"); + + if (!nls) { + befs_error(sb, "befs_utf2nls called with no NLS table loaded"); + return -EINVAL; + } + + *out = result = kmalloc(maxlen, GFP_NOFS); + if (!*out) { + befs_error(sb, "befs_utf2nls() cannot allocate memory"); + *out_len = 0; + return -ENOMEM; + } + + for (i = o = 0; i < in_len; i += utflen, o += unilen) { + + /* convert from UTF-8 to Unicode */ + utflen = utf8_to_utf32(&in[i], in_len - i, &uni); + if (utflen < 0) + goto conv_err; + + /* convert from Unicode to nls */ + if (uni > MAX_WCHAR_T) + goto conv_err; + unilen = nls->uni2char(uni, &result[o], in_len - o); + if (unilen < 0) + goto conv_err; + } + result[o] = '\0'; + *out_len = o; + + befs_debug(sb, "<--- utf2nls()"); + + return o; + + conv_err: + befs_error(sb, "Name using character set %s contains a character that " + "cannot be converted to unicode.", nls->charset); + befs_debug(sb, "<--- utf2nls()"); + kfree(result); + return -EILSEQ; +} + +/** + * befs_nls2utf - Convert NLS string to utf8 encodeing + * @sb: Superblock + * @src: Input string buffer in NLS format + * @srclen: Length of input string in bytes + * @dest: The output string in UTF-8 format + * @destlen: Length of the output buffer + * + * Converts input string @src, which is in the format of the loaded NLS map, + * into a utf8 string. + * + * The destination string @dest is allocated by this function and the caller is + * responsible for freeing it with kfree() + * + * On return, *@destlen is the length of @dest in bytes. 
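+ *
+ * A minimal sketch of a caller, following befs_lookup() above (variable
+ * names taken from that caller):
+ *
+ *	char *utfname;
+ *	int utfnamelen;
+ *
+ *	if (befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen) >= 0) {
+ *		... the buffer is kmalloc()ed for us, so use utfname ...
+ *		kfree(utfname);
+ *	}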
+ * + * On success, the return value is the number of utf8 characters written to + * the output buffer @dest. + * + * On Failure, a negative number coresponding to the error code is returned. + */ + +static int +befs_nls2utf(struct super_block *sb, const char *in, + int in_len, char **out, int *out_len) +{ + struct nls_table *nls = BEFS_SB(sb)->nls; + int i, o; + wchar_t uni; + int unilen, utflen; + char *result; + /* There're nls characters that will translate to 3-chars-wide UTF-8 + * characters, a additional byte is needed to save the final \0 + * in special cases */ + int maxlen = (3 * in_len) + 1; + + befs_debug(sb, "---> nls2utf()\n"); + + if (!nls) { + befs_error(sb, "befs_nls2utf called with no NLS table loaded."); + return -EINVAL; + } + + *out = result = kmalloc(maxlen, GFP_NOFS); + if (!*out) { + befs_error(sb, "befs_nls2utf() cannot allocate memory"); + *out_len = 0; + return -ENOMEM; + } + + for (i = o = 0; i < in_len; i += unilen, o += utflen) { + + /* convert from nls to unicode */ + unilen = nls->char2uni(&in[i], in_len - i, &uni); + if (unilen < 0) + goto conv_err; + + /* convert from unicode to UTF-8 */ + utflen = utf32_to_utf8(uni, &result[o], 3); + if (utflen <= 0) + goto conv_err; + } + + result[o] = '\0'; + *out_len = o; + + befs_debug(sb, "<--- nls2utf()"); + + return i; + + conv_err: + befs_error(sb, "Name using charecter set %s contains a charecter that " + "cannot be converted to unicode.", nls->charset); + befs_debug(sb, "<--- nls2utf()"); + kfree(result); + return -EILSEQ; +} + +/** + * Use the + * + */ +enum { + Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, +}; + +static const match_table_t befs_tokens = { + {Opt_uid, "uid=%d"}, + {Opt_gid, "gid=%d"}, + {Opt_charset, "iocharset=%s"}, + {Opt_debug, "debug"}, + {Opt_err, NULL} +}; + +static int +parse_options(char *options, befs_mount_options * opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + /* Initialize options */ + opts->uid = 0; + opts->gid = 0; + opts->use_uid = 0; + opts->use_gid = 0; + opts->iocharset = NULL; + opts->debug = 0; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, befs_tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) { + printk(KERN_ERR "BeFS: Invalid uid %d, " + "using default\n", option); + break; + } + opts->uid = option; + opts->use_uid = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) { + printk(KERN_ERR "BeFS: Invalid gid %d, " + "using default\n", option); + break; + } + opts->gid = option; + opts->use_gid = 1; + break; + case Opt_charset: + kfree(opts->iocharset); + opts->iocharset = match_strdup(&args[0]); + if (!opts->iocharset) { + printk(KERN_ERR "BeFS: allocation failure for " + "iocharset string\n"); + return 0; + } + break; + case Opt_debug: + opts->debug = 1; + break; + default: + printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + return 1; +} + +/* This function has the responsibiltiy of getting the + * filesystem ready for unmounting. 
+ * Basically, we free everything that we allocated in + * befs_read_inode + */ +static void +befs_put_super(struct super_block *sb) +{ + kfree(BEFS_SB(sb)->mount_opts.iocharset); + BEFS_SB(sb)->mount_opts.iocharset = NULL; + unload_nls(BEFS_SB(sb)->nls); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +/* Allocate private field of the superblock, fill it. + * + * Finish filling the public superblock fields + * Make the root directory + * Load a set of NLS translations if needed. + */ +static int +befs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh; + befs_sb_info *befs_sb; + befs_super_block *disk_sb; + struct inode *root; + long ret = -EINVAL; + const unsigned long sb_block = 0; + const off_t x86_sb_off = 512; + + save_mount_options(sb, data); + + sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); + if (sb->s_fs_info == NULL) { + printk(KERN_ERR + "BeFS(%s): Unable to allocate memory for private " + "portion of superblock. Bailing.\n", sb->s_id); + goto unacquire_none; + } + befs_sb = BEFS_SB(sb); + memset(befs_sb, 0, sizeof(befs_sb_info)); + + if (!parse_options((char *) data, &befs_sb->mount_opts)) { + befs_error(sb, "cannot parse mount options"); + goto unacquire_priv_sbp; + } + + befs_debug(sb, "---> befs_fill_super()"); + +#ifndef CONFIG_BEFS_RW + if (!(sb->s_flags & MS_RDONLY)) { + befs_warning(sb, + "No write support. Marking filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } +#endif /* CONFIG_BEFS_RW */ + + /* + * Set dummy blocksize to read super block. + * Will be set to real fs blocksize later. + * + * Linux 2.4.10 and later refuse to read blocks smaller than + * the hardsect size for the device. But we also need to read at + * least 1k to get the second 512 bytes of the volume. + * -WD 10-26-01 + */ + sb_min_blocksize(sb, 1024); + + if (!(bh = sb_bread(sb, sb_block))) { + befs_error(sb, "unable to read superblock"); + goto unacquire_priv_sbp; + } + + /* account for offset of super block on x86 */ + disk_sb = (befs_super_block *) bh->b_data; + if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) || + (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) { + befs_debug(sb, "Using PPC superblock location"); + } else { + befs_debug(sb, "Using x86 superblock location"); + disk_sb = + (befs_super_block *) ((void *) bh->b_data + x86_sb_off); + } + + if (befs_load_sb(sb, disk_sb) != BEFS_OK) + goto unacquire_bh; + + befs_dump_super_block(sb, disk_sb); + + brelse(bh); + + if (befs_check_sb(sb) != BEFS_OK) + goto unacquire_priv_sbp; + + if( befs_sb->num_blocks > ~((sector_t)0) ) { + befs_error(sb, "blocks count: %Lu " + "is larger than the host can use", + befs_sb->num_blocks); + goto unacquire_priv_sbp; + } + + /* + * set up enough so that it can read an inode + * Fill in kernel superblock fields from private sb + */ + sb->s_magic = BEFS_SUPER_MAGIC; + /* Set real blocksize of fs */ + sb_set_blocksize(sb, (ulong) befs_sb->block_size); + sb->s_op = &befs_sops; + root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir))); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto unacquire_priv_sbp; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + befs_error(sb, "get root inode failed"); + goto unacquire_priv_sbp; + } + + /* load nls library */ + if (befs_sb->mount_opts.iocharset) { + befs_debug(sb, "Loading nls: %s", + befs_sb->mount_opts.iocharset); + befs_sb->nls = load_nls(befs_sb->mount_opts.iocharset); + if (!befs_sb->nls) { + befs_warning(sb, "Cannot load nls %s" + " loading default nls", + 
befs_sb->mount_opts.iocharset); + befs_sb->nls = load_nls_default(); + } + /* load default nls if none is specified in mount options */ + } else { + befs_debug(sb, "Loading default nls"); + befs_sb->nls = load_nls_default(); + } + + return 0; +/*****************/ + unacquire_bh: + brelse(bh); + + unacquire_priv_sbp: + kfree(befs_sb->mount_opts.iocharset); + kfree(sb->s_fs_info); + + unacquire_none: + sb->s_fs_info = NULL; + return ret; +} + +static int +befs_remount(struct super_block *sb, int *flags, char *data) +{ + if (!(*flags & MS_RDONLY)) + return -EINVAL; + return 0; +} + +static int +befs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + befs_debug(sb, "---> befs_statfs()"); + + buf->f_type = BEFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = BEFS_SB(sb)->num_blocks; + buf->f_bfree = BEFS_SB(sb)->num_blocks - BEFS_SB(sb)->used_blocks; + buf->f_bavail = buf->f_bfree; + buf->f_files = 0; /* UNKNOWN */ + buf->f_ffree = 0; /* UNKNOWN */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = BEFS_NAME_LEN; + + befs_debug(sb, "<--- befs_statfs()"); + + return 0; +} + +static struct dentry * +befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super); +} + +static struct file_system_type befs_fs_type = { + .owner = THIS_MODULE, + .name = "befs", + .mount = befs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init +init_befs_fs(void) +{ + int err; + + printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION); + + err = befs_init_inodecache(); + if (err) + goto unacquire_none; + + err = register_filesystem(&befs_fs_type); + if (err) + goto unacquire_inodecache; + + return 0; + +unacquire_inodecache: + befs_destroy_inodecache(); + +unacquire_none: + return err; +} + +static void __exit +exit_befs_fs(void) +{ + befs_destroy_inodecache(); + + unregister_filesystem(&befs_fs_type); +} + +/* +Macros that typecheck the init and exit functions, +ensures that they are called at init and cleanup, +and eliminates warnings about unused functions. +*/ +module_init(init_befs_fs) +module_exit(exit_befs_fs) diff --git a/fs/befs/super.c b/fs/befs/super.c new file mode 100644 index 00000000..ca40f828 --- /dev/null +++ b/fs/befs/super.c @@ -0,0 +1,112 @@ +/* + * super.c + * + * Copyright (C) 2001-2002 Will Dyson + * + * Licensed under the GNU GPL. See the file COPYING for details. 
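+ *
+ * befs_load_sb() copies the on-disk superblock into the in-core
+ * befs_sb_info, byteswapping each field as needed; befs_check_sb()
+ * then sanity-checks the result before the mount proceeds.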
+ * + */ + +#include +#include /* for PAGE_SIZE */ + +#include "befs.h" +#include "super.h" + +/** + * load_befs_sb -- Read from disk and properly byteswap all the fields + * of the befs superblock + * + * + * + * + */ +int +befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) +{ + befs_sb_info *befs_sb = BEFS_SB(sb); + + /* Check the byte order of the filesystem */ + if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) + befs_sb->byte_order = BEFS_BYTESEX_LE; + else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE) + befs_sb->byte_order = BEFS_BYTESEX_BE; + + befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); + befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); + befs_sb->magic3 = fs32_to_cpu(sb, disk_sb->magic3); + befs_sb->block_size = fs32_to_cpu(sb, disk_sb->block_size); + befs_sb->block_shift = fs32_to_cpu(sb, disk_sb->block_shift); + befs_sb->num_blocks = fs64_to_cpu(sb, disk_sb->num_blocks); + befs_sb->used_blocks = fs64_to_cpu(sb, disk_sb->used_blocks); + befs_sb->inode_size = fs32_to_cpu(sb, disk_sb->inode_size); + + befs_sb->blocks_per_ag = fs32_to_cpu(sb, disk_sb->blocks_per_ag); + befs_sb->ag_shift = fs32_to_cpu(sb, disk_sb->ag_shift); + befs_sb->num_ags = fs32_to_cpu(sb, disk_sb->num_ags); + + befs_sb->log_blocks = fsrun_to_cpu(sb, disk_sb->log_blocks); + befs_sb->log_start = fs64_to_cpu(sb, disk_sb->log_start); + befs_sb->log_end = fs64_to_cpu(sb, disk_sb->log_end); + + befs_sb->root_dir = fsrun_to_cpu(sb, disk_sb->root_dir); + befs_sb->indices = fsrun_to_cpu(sb, disk_sb->indices); + befs_sb->nls = NULL; + + return BEFS_OK; +} + +int +befs_check_sb(struct super_block *sb) +{ + befs_sb_info *befs_sb = BEFS_SB(sb); + + /* Check magic headers of super block */ + if ((befs_sb->magic1 != BEFS_SUPER_MAGIC1) + || (befs_sb->magic2 != BEFS_SUPER_MAGIC2) + || (befs_sb->magic3 != BEFS_SUPER_MAGIC3)) { + befs_error(sb, "invalid magic header"); + return BEFS_ERR; + } + + /* + * Check blocksize of BEFS. + * + * Blocksize of BEFS is 1024, 2048, 4096 or 8192. + */ + + if ((befs_sb->block_size != 1024) + && (befs_sb->block_size != 2048) + && (befs_sb->block_size != 4096) + && (befs_sb->block_size != 8192)) { + befs_error(sb, "invalid blocksize: %u", befs_sb->block_size); + return BEFS_ERR; + } + + if (befs_sb->block_size > PAGE_SIZE) { + befs_error(sb, "blocksize(%u) cannot be larger" + "than system pagesize(%lu)", befs_sb->block_size, + PAGE_SIZE); + return BEFS_ERR; + } + + /* + * block_shift and block_size encode the same information + * in different ways as a consistency check. + */ + + if ((1 << befs_sb->block_shift) != befs_sb->block_size) { + befs_error(sb, "block_shift disagrees with block_size. " + "Corruption likely."); + return BEFS_ERR; + } + + if (befs_sb->log_start != befs_sb->log_end) { + befs_error(sb, "Filesystem not clean! There are blocks in the " + "journal. 
You must boot into BeOS and mount this volume " + "to make it clean."); + return BEFS_ERR; + } + + return BEFS_OK; +} diff --git a/fs/befs/super.h b/fs/befs/super.h new file mode 100644 index 00000000..dc455637 --- /dev/null +++ b/fs/befs/super.h @@ -0,0 +1,8 @@ +/* + * super.h + */ + +int befs_load_sb(struct super_block *sb, befs_super_block * disk_sb); + +int befs_check_sb(struct super_block *sb); + diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig new file mode 100644 index 00000000..c2336c62 --- /dev/null +++ b/fs/bfs/Kconfig @@ -0,0 +1,19 @@ +config BFS_FS + tristate "BFS file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + Boot File System (BFS) is a file system used under SCO UnixWare to + allow the bootloader access to the kernel image and other important + files during the boot process. It is usually mounted under /stand + and corresponds to the slice marked as "STAND" in the UnixWare + partition. You should say Y if you want to read or write the files + on your /stand slice from within Linux. You then also need to say Y + to "UnixWare slices support", below. More information about the BFS + file system is contained in the file + . + + If you don't know what this is about, say N. + + To compile this as a module, choose M here: the module will be called + bfs. Note that the file system of your root partition (the one + containing the directory /) cannot be compiled as a module. diff --git a/fs/bfs/Makefile b/fs/bfs/Makefile new file mode 100644 index 00000000..c787b36d --- /dev/null +++ b/fs/bfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for BFS filesystem. +# + +obj-$(CONFIG_BFS_FS) += bfs.o + +bfs-objs := inode.o file.o dir.o diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h new file mode 100644 index 00000000..f7f87e23 --- /dev/null +++ b/fs/bfs/bfs.h @@ -0,0 +1,59 @@ +/* + * fs/bfs/bfs.h + * Copyright (C) 1999 Tigran Aivazian + */ +#ifndef _FS_BFS_BFS_H +#define _FS_BFS_BFS_H + +#include + +/* + * BFS file system in-core superblock info + */ +struct bfs_sb_info { + unsigned long si_blocks; + unsigned long si_freeb; + unsigned long si_freei; + unsigned long si_lf_eblk; + unsigned long si_lasti; + unsigned long *si_imap; + struct mutex bfs_lock; +}; + +/* + * BFS file system in-core inode info + */ +struct bfs_inode_info { + unsigned long i_dsk_ino; /* inode number from the disk, can be 0 */ + unsigned long i_sblock; + unsigned long i_eblock; + struct inode vfs_inode; +}; + +static inline struct bfs_sb_info *BFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct bfs_inode_info *BFS_I(struct inode *inode) +{ + return container_of(inode, struct bfs_inode_info, vfs_inode); +} + + +#define printf(format, args...) \ + printk(KERN_ERR "BFS-fs: %s(): " format, __func__, ## args) + +/* inode.c */ +extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); + +/* file.c */ +extern const struct inode_operations bfs_file_inops; +extern const struct file_operations bfs_file_operations; +extern const struct address_space_operations bfs_aops; + +/* dir.c */ +extern const struct inode_operations bfs_dir_inops; +extern const struct file_operations bfs_dir_operations; + +#endif /* _FS_BFS_BFS_H */ diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c new file mode 100644 index 00000000..b14cebfd --- /dev/null +++ b/fs/bfs/dir.c @@ -0,0 +1,374 @@ +/* + * fs/bfs/dir.c + * BFS directory operations. 
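+ * A directory is a flat array of fixed-size BFS_DIRENT_SIZE records, each
+ * holding a little-endian inode number and a NUL-padded name; a record with
+ * ino == 0 is a free slot.
+ *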
+ * Copyright (C) 1999,2000 Tigran Aivazian + * Made endianness-clean by Andrew Stribblehill 2005 + */ + +#include +#include +#include +#include +#include +#include "bfs.h" + +#undef DEBUG + +#ifdef DEBUG +#define dprintf(x...) printf(x) +#else +#define dprintf(x...) +#endif + +static int bfs_add_entry(struct inode *dir, const unsigned char *name, + int namelen, int ino); +static struct buffer_head *bfs_find_entry(struct inode *dir, + const unsigned char *name, int namelen, + struct bfs_dirent **res_dir); + +static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) +{ + struct inode *dir = f->f_path.dentry->d_inode; + struct buffer_head *bh; + struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(dir->i_sb); + unsigned int offset; + int block; + + mutex_lock(&info->bfs_lock); + + if (f->f_pos & (BFS_DIRENT_SIZE - 1)) { + printf("Bad f_pos=%08lx for %s:%08lx\n", + (unsigned long)f->f_pos, + dir->i_sb->s_id, dir->i_ino); + mutex_unlock(&info->bfs_lock); + return -EBADF; + } + + while (f->f_pos < dir->i_size) { + offset = f->f_pos & (BFS_BSIZE - 1); + block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS); + bh = sb_bread(dir->i_sb, block); + if (!bh) { + f->f_pos += BFS_BSIZE - offset; + continue; + } + do { + de = (struct bfs_dirent *)(bh->b_data + offset); + if (de->ino) { + int size = strnlen(de->name, BFS_NAMELEN); + if (filldir(dirent, de->name, size, f->f_pos, + le16_to_cpu(de->ino), + DT_UNKNOWN) < 0) { + brelse(bh); + mutex_unlock(&info->bfs_lock); + return 0; + } + } + offset += BFS_DIRENT_SIZE; + f->f_pos += BFS_DIRENT_SIZE; + } while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size)); + brelse(bh); + } + + mutex_unlock(&info->bfs_lock); + return 0; +} + +const struct file_operations bfs_dir_operations = { + .read = generic_read_dir, + .readdir = bfs_readdir, + .fsync = generic_file_fsync, + .llseek = generic_file_llseek, +}; + +extern void dump_imap(const char *, struct super_block *); + +static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int err; + struct inode *inode; + struct super_block *s = dir->i_sb; + struct bfs_sb_info *info = BFS_SB(s); + unsigned long ino; + + inode = new_inode(s); + if (!inode) + return -ENOSPC; + mutex_lock(&info->bfs_lock); + ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1); + if (ino > info->si_lasti) { + mutex_unlock(&info->bfs_lock); + iput(inode); + return -ENOSPC; + } + set_bit(ino, info->si_imap); + info->si_freei--; + inode_init_owner(inode, dir, mode); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_blocks = 0; + inode->i_op = &bfs_file_inops; + inode->i_fop = &bfs_file_operations; + inode->i_mapping->a_ops = &bfs_aops; + inode->i_ino = ino; + BFS_I(inode)->i_dsk_ino = ino; + BFS_I(inode)->i_sblock = 0; + BFS_I(inode)->i_eblock = 0; + insert_inode_hash(inode); + mark_inode_dirty(inode); + dump_imap("create", s); + + err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, + inode->i_ino); + if (err) { + inode_dec_link_count(inode); + mutex_unlock(&info->bfs_lock); + iput(inode); + return err; + } + mutex_unlock(&info->bfs_lock); + d_instantiate(dentry, inode); + return 0; +} + +static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + struct buffer_head *bh; + struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(dir->i_sb); + + if (dentry->d_name.len > BFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + mutex_lock(&info->bfs_lock); + bh 
= bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + if (bh) { + unsigned long ino = (unsigned long)le16_to_cpu(de->ino); + brelse(bh); + inode = bfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + mutex_unlock(&info->bfs_lock); + return ERR_CAST(inode); + } + } + mutex_unlock(&info->bfs_lock); + d_add(dentry, inode); + return NULL; +} + +static int bfs_link(struct dentry *old, struct inode *dir, + struct dentry *new) +{ + struct inode *inode = old->d_inode; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); + int err; + + mutex_lock(&info->bfs_lock); + err = bfs_add_entry(dir, new->d_name.name, new->d_name.len, + inode->i_ino); + if (err) { + mutex_unlock(&info->bfs_lock); + return err; + } + inc_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + ihold(inode); + d_instantiate(new, inode); + mutex_unlock(&info->bfs_lock); + return 0; +} + +static int bfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error = -ENOENT; + struct inode *inode = dentry->d_inode; + struct buffer_head *bh; + struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); + + mutex_lock(&info->bfs_lock); + bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) + goto out_brelse; + + if (!inode->i_nlink) { + printf("unlinking non-existent file %s:%lu (nlink=%d)\n", + inode->i_sb->s_id, inode->i_ino, + inode->i_nlink); + inode->i_nlink = 1; + } + de->ino = 0; + mark_buffer_dirty_inode(bh, dir); + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + error = 0; + +out_brelse: + brelse(bh); + mutex_unlock(&info->bfs_lock); + return error; +} + +static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode, *new_inode; + struct buffer_head *old_bh, *new_bh; + struct bfs_dirent *old_de, *new_de; + struct bfs_sb_info *info; + int error = -ENOENT; + + old_bh = new_bh = NULL; + old_inode = old_dentry->d_inode; + if (S_ISDIR(old_inode->i_mode)) + return -EINVAL; + + info = BFS_SB(old_inode->i_sb); + + mutex_lock(&info->bfs_lock); + old_bh = bfs_find_entry(old_dir, + old_dentry->d_name.name, + old_dentry->d_name.len, &old_de); + + if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino)) + goto end_rename; + + error = -EPERM; + new_inode = new_dentry->d_inode; + new_bh = bfs_find_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len, &new_de); + + if (new_bh && !new_inode) { + brelse(new_bh); + new_bh = NULL; + } + if (!new_bh) { + error = bfs_add_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_inode->i_ino); + if (error) + goto end_rename; + } + old_de->ino = 0; + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(old_dir); + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME_SEC; + inode_dec_link_count(new_inode); + } + mark_buffer_dirty_inode(old_bh, old_dir); + error = 0; + +end_rename: + mutex_unlock(&info->bfs_lock); + brelse(old_bh); + brelse(new_bh); + return error; +} + +const struct inode_operations bfs_dir_inops = { + .create = bfs_create, + .lookup = bfs_lookup, + .link = bfs_link, + .unlink = bfs_unlink, + .rename = bfs_rename, +}; + +static int bfs_add_entry(struct inode *dir, const unsigned char *name, + int namelen, int ino) +{ + struct buffer_head *bh; + struct bfs_dirent *de; + int block, sblock, eblock, off, pos; + int i; + + 
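+ /*
+  * Scan the directory blocks linearly for a free slot (de->ino == 0).
+  * If the free slot lies past the current i_size, the directory grows
+  * by one BFS_DIRENT_SIZE entry; the name is stored NUL-padded out to
+  * BFS_NAMELEN bytes.
+  */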
dprintf("name=%s, namelen=%d\n", name, namelen); + + if (!namelen) + return -ENOENT; + if (namelen > BFS_NAMELEN) + return -ENAMETOOLONG; + + sblock = BFS_I(dir)->i_sblock; + eblock = BFS_I(dir)->i_eblock; + for (block = sblock; block <= eblock; block++) { + bh = sb_bread(dir->i_sb, block); + if (!bh) + return -ENOSPC; + for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) { + de = (struct bfs_dirent *)(bh->b_data + off); + if (!de->ino) { + pos = (block - sblock) * BFS_BSIZE + off; + if (pos >= dir->i_size) { + dir->i_size += BFS_DIRENT_SIZE; + dir->i_ctime = CURRENT_TIME_SEC; + } + dir->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + de->ino = cpu_to_le16((u16)ino); + for (i = 0; i < BFS_NAMELEN; i++) + de->name[i] = + (i < namelen) ? name[i] : 0; + mark_buffer_dirty_inode(bh, dir); + brelse(bh); + return 0; + } + } + brelse(bh); + } + return -ENOSPC; +} + +static inline int bfs_namecmp(int len, const unsigned char *name, + const char *buffer) +{ + if ((len < BFS_NAMELEN) && buffer[len]) + return 0; + return !memcmp(name, buffer, len); +} + +static struct buffer_head *bfs_find_entry(struct inode *dir, + const unsigned char *name, int namelen, + struct bfs_dirent **res_dir) +{ + unsigned long block = 0, offset = 0; + struct buffer_head *bh = NULL; + struct bfs_dirent *de; + + *res_dir = NULL; + if (namelen > BFS_NAMELEN) + return NULL; + + while (block * BFS_BSIZE + offset < dir->i_size) { + if (!bh) { + bh = sb_bread(dir->i_sb, BFS_I(dir)->i_sblock + block); + if (!bh) { + block++; + continue; + } + } + de = (struct bfs_dirent *)(bh->b_data + offset); + offset += BFS_DIRENT_SIZE; + if (le16_to_cpu(de->ino) && + bfs_namecmp(namelen, name, de->name)) { + *res_dir = de; + return bh; + } + if (offset < bh->b_size) + continue; + brelse(bh); + bh = NULL; + offset = 0; + block++; + } + brelse(bh); + return NULL; +} diff --git a/fs/bfs/file.c b/fs/bfs/file.c new file mode 100644 index 00000000..f20e8a71 --- /dev/null +++ b/fs/bfs/file.c @@ -0,0 +1,194 @@ +/* + * fs/bfs/file.c + * BFS file operations. + * Copyright (C) 1999,2000 Tigran Aivazian + * + * Make the file block allocation algorithm understand the size + * of the underlying block device. + * Copyright (C) 2007 Dmitri Vorobiev + * + */ + +#include +#include +#include "bfs.h" + +#undef DEBUG + +#ifdef DEBUG +#define dprintf(x...) printf(x) +#else +#define dprintf(x...) 
+#endif + +const struct file_operations bfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, +}; + +static int bfs_move_block(unsigned long from, unsigned long to, + struct super_block *sb) +{ + struct buffer_head *bh, *new; + + bh = sb_bread(sb, from); + if (!bh) + return -EIO; + new = sb_getblk(sb, to); + memcpy(new->b_data, bh->b_data, bh->b_size); + mark_buffer_dirty(new); + bforget(bh); + brelse(new); + return 0; +} + +static int bfs_move_blocks(struct super_block *sb, unsigned long start, + unsigned long end, unsigned long where) +{ + unsigned long i; + + dprintf("%08lx-%08lx->%08lx\n", start, end, where); + for (i = start; i <= end; i++) + if(bfs_move_block(i, where + i, sb)) { + dprintf("failed to move block %08lx -> %08lx\n", i, + where + i); + return -EIO; + } + return 0; +} + +static int bfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + unsigned long phys; + int err; + struct super_block *sb = inode->i_sb; + struct bfs_sb_info *info = BFS_SB(sb); + struct bfs_inode_info *bi = BFS_I(inode); + + phys = bi->i_sblock + block; + if (!create) { + if (phys <= bi->i_eblock) { + dprintf("c=%d, b=%08lx, phys=%09lx (granted)\n", + create, (unsigned long)block, phys); + map_bh(bh_result, sb, phys); + } + return 0; + } + + /* + * If the file is not empty and the requested block is within the + * range of blocks allocated for this file, we can grant it. + */ + if (bi->i_sblock && (phys <= bi->i_eblock)) { + dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", + create, (unsigned long)block, phys); + map_bh(bh_result, sb, phys); + return 0; + } + + /* The file will be extended, so let's see if there is enough space. */ + if (phys >= info->si_blocks) + return -ENOSPC; + + /* The rest has to be protected against itself. */ + mutex_lock(&info->bfs_lock); + + /* + * If the last data block for this file is the last allocated + * block, we can extend the file trivially, without moving it + * anywhere. + */ + if (bi->i_eblock == info->si_lf_eblk) { + dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", + create, (unsigned long)block, phys); + map_bh(bh_result, sb, phys); + info->si_freeb -= phys - bi->i_eblock; + info->si_lf_eblk = bi->i_eblock = phys; + mark_inode_dirty(inode); + err = 0; + goto out; + } + + /* Ok, we have to move this entire file to the next free block. */ + phys = info->si_lf_eblk + 1; + if (phys + block >= info->si_blocks) { + err = -ENOSPC; + goto out; + } + + if (bi->i_sblock) { + err = bfs_move_blocks(inode->i_sb, bi->i_sblock, + bi->i_eblock, phys); + if (err) { + dprintf("failed to move ino=%08lx -> fs corruption\n", + inode->i_ino); + goto out; + } + } else + err = 0; + + dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n", + create, (unsigned long)block, phys); + bi->i_sblock = phys; + phys += block; + info->si_lf_eblk = bi->i_eblock = phys; + + /* + * This assumes nothing can write the inode back while we are here + * and thus update inode->i_blocks! 
(XXX) + */ + info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks; + mark_inode_dirty(inode); + map_bh(bh_result, sb, phys); +out: + mutex_unlock(&info->bfs_lock); + return err; +} + +static int bfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, bfs_get_block, wbc); +} + +static int bfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, bfs_get_block); +} + +static int bfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + bfs_get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t bfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, bfs_get_block); +} + +const struct address_space_operations bfs_aops = { + .readpage = bfs_readpage, + .writepage = bfs_writepage, + .write_begin = bfs_write_begin, + .write_end = generic_write_end, + .bmap = bfs_bmap, +}; + +const struct inode_operations bfs_file_inops; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c new file mode 100644 index 00000000..a8e37f81 --- /dev/null +++ b/fs/bfs/inode.c @@ -0,0 +1,496 @@ +/* + * fs/bfs/inode.c + * BFS superblock and inode operations. + * Copyright (C) 1999-2006 Tigran Aivazian + * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. + * + * Made endianness-clean by Andrew Stribblehill , 2005. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bfs.h" + +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); +MODULE_LICENSE("GPL"); + +#undef DEBUG + +#ifdef DEBUG +#define dprintf(x...) printf(x) +#else +#define dprintf(x...) 
+#endif + +void dump_imap(const char *prefix, struct super_block *s); + +struct inode *bfs_iget(struct super_block *sb, unsigned long ino) +{ + struct bfs_inode *di; + struct inode *inode; + struct buffer_head *bh; + int block, off; + + inode = iget_locked(sb, ino); + if (IS_ERR(inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { + printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino); + goto error; + } + + block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; + bh = sb_bread(inode->i_sb, block); + if (!bh) { + printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, + ino); + goto error; + } + + off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; + di = (struct bfs_inode *)bh->b_data + off; + + inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); + if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { + inode->i_mode |= S_IFDIR; + inode->i_op = &bfs_dir_inops; + inode->i_fop = &bfs_dir_operations; + } else if (le32_to_cpu(di->i_vtype) == BFS_VREG) { + inode->i_mode |= S_IFREG; + inode->i_op = &bfs_file_inops; + inode->i_fop = &bfs_file_operations; + inode->i_mapping->a_ops = &bfs_aops; + } + + BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); + BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock); + BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); + inode->i_uid = le32_to_cpu(di->i_uid); + inode->i_gid = le32_to_cpu(di->i_gid); + inode->i_nlink = le32_to_cpu(di->i_nlink); + inode->i_size = BFS_FILESIZE(di); + inode->i_blocks = BFS_FILEBLOCKS(di); + inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); + inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); + inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + + brelse(bh); + unlock_new_inode(inode); + return inode; + +error: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p) +{ + if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) { + printf("Bad inode number %s:%08x\n", sb->s_id, ino); + return ERR_PTR(-EIO); + } + + ino -= BFS_ROOT_INO; + + *p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK); + if (!*p) { + printf("Unable to read inode %s:%08x\n", sb->s_id, ino); + return ERR_PTR(-EIO); + } + + return (struct bfs_inode *)(*p)->b_data + ino % BFS_INODES_PER_BLOCK; +} + +static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct bfs_sb_info *info = BFS_SB(inode->i_sb); + unsigned int ino = (u16)inode->i_ino; + unsigned long i_sblock; + struct bfs_inode *di; + struct buffer_head *bh; + int err = 0; + + dprintf("ino=%08x\n", ino); + + di = find_inode(inode->i_sb, ino, &bh); + if (IS_ERR(di)) + return PTR_ERR(di); + + mutex_lock(&info->bfs_lock); + + if (ino == BFS_ROOT_INO) + di->i_vtype = cpu_to_le32(BFS_VDIR); + else + di->i_vtype = cpu_to_le32(BFS_VREG); + + di->i_ino = cpu_to_le16(ino); + di->i_mode = cpu_to_le32(inode->i_mode); + di->i_uid = cpu_to_le32(inode->i_uid); + di->i_gid = cpu_to_le32(inode->i_gid); + di->i_nlink = cpu_to_le32(inode->i_nlink); + di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + i_sblock = BFS_I(inode)->i_sblock; + di->i_sblock = cpu_to_le32(i_sblock); + di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); + di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); + + 
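+ /*
+  * Push the updated on-disk inode out through the buffer cache; for
+  * WB_SYNC_ALL writeback the buffer is flushed synchronously and any
+  * I/O error is reported back to the caller.
+  */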
mark_buffer_dirty(bh); + if (wbc->sync_mode == WB_SYNC_ALL) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + brelse(bh); + mutex_unlock(&info->bfs_lock); + return err; +} + +static void bfs_evict_inode(struct inode *inode) +{ + unsigned long ino = inode->i_ino; + struct bfs_inode *di; + struct buffer_head *bh; + struct super_block *s = inode->i_sb; + struct bfs_sb_info *info = BFS_SB(s); + struct bfs_inode_info *bi = BFS_I(inode); + + dprintf("ino=%08lx\n", ino); + + truncate_inode_pages(&inode->i_data, 0); + invalidate_inode_buffers(inode); + end_writeback(inode); + + if (inode->i_nlink) + return; + + di = find_inode(s, inode->i_ino, &bh); + if (IS_ERR(di)) + return; + + mutex_lock(&info->bfs_lock); + /* clear on-disk inode */ + memset(di, 0, sizeof(struct bfs_inode)); + mark_buffer_dirty(bh); + brelse(bh); + + if (bi->i_dsk_ino) { + if (bi->i_sblock) + info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; + info->si_freei++; + clear_bit(ino, info->si_imap); + dump_imap("delete_inode", s); + } + + /* + * If this was the last file, make the previous block + * "last block of the last file" even if there is no + * real file there, saves us 1 gap. + */ + if (info->si_lf_eblk == bi->i_eblock) + info->si_lf_eblk = bi->i_sblock - 1; + mutex_unlock(&info->bfs_lock); +} + +static void bfs_put_super(struct super_block *s) +{ + struct bfs_sb_info *info = BFS_SB(s); + + if (!info) + return; + + mutex_destroy(&info->bfs_lock); + kfree(info->si_imap); + kfree(info); + s->s_fs_info = NULL; +} + +static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct bfs_sb_info *info = BFS_SB(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + buf->f_type = BFS_MAGIC; + buf->f_bsize = s->s_blocksize; + buf->f_blocks = info->si_blocks; + buf->f_bfree = buf->f_bavail = info->si_freeb; + buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO; + buf->f_ffree = info->si_freei; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = BFS_NAMELEN; + return 0; +} + +static struct kmem_cache *bfs_inode_cachep; + +static struct inode *bfs_alloc_inode(struct super_block *sb) +{ + struct bfs_inode_info *bi; + bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL); + if (!bi) + return NULL; + return &bi->vfs_inode; +} + +static void bfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); +} + +static void bfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bfs_i_callback); +} + +static void init_once(void *foo) +{ + struct bfs_inode_info *bi = foo; + + inode_init_once(&bi->vfs_inode); +} + +static int init_inodecache(void) +{ + bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", + sizeof(struct bfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (bfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(bfs_inode_cachep); +} + +static const struct super_operations bfs_sops = { + .alloc_inode = bfs_alloc_inode, + .destroy_inode = bfs_destroy_inode, + .write_inode = bfs_write_inode, + .evict_inode = bfs_evict_inode, + .put_super = bfs_put_super, + .statfs = bfs_statfs, +}; + +void dump_imap(const char *prefix, struct super_block *s) +{ +#ifdef DEBUG + int i; + char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!tmpbuf) + return; + for (i = 
BFS_SB(s)->si_lasti; i >= 0; i--) { + if (i > PAGE_SIZE - 100) break; + if (test_bit(i, BFS_SB(s)->si_imap)) + strcat(tmpbuf, "1"); + else + strcat(tmpbuf, "0"); + } + printf("BFS-fs: %s: lasti=%08lx <%s>\n", + prefix, BFS_SB(s)->si_lasti, tmpbuf); + free_page((unsigned long)tmpbuf); +#endif +} + +static int bfs_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh, *sbh; + struct bfs_super_block *bfs_sb; + struct inode *inode; + unsigned i, imap_len; + struct bfs_sb_info *info; + int ret = -EINVAL; + unsigned long i_sblock, i_eblock, i_eoff, s_size; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + mutex_init(&info->bfs_lock); + s->s_fs_info = info; + + sb_set_blocksize(s, BFS_BSIZE); + + sbh = sb_bread(s, 0); + if (!sbh) + goto out; + bfs_sb = (struct bfs_super_block *)sbh->b_data; + if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { + if (!silent) + printf("No BFS filesystem on %s (magic=%08x)\n", + s->s_id, le32_to_cpu(bfs_sb->s_magic)); + goto out1; + } + if (BFS_UNCLEAN(bfs_sb, s) && !silent) + printf("%s is unclean, continuing\n", s->s_id); + + s->s_magic = BFS_MAGIC; + + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { + printf("Superblock is corrupted\n"); + goto out1; + } + + info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / + sizeof(struct bfs_inode) + + BFS_ROOT_INO - 1; + imap_len = (info->si_lasti / 8) + 1; + info->si_imap = kzalloc(imap_len, GFP_KERNEL); + if (!info->si_imap) + goto out1; + for (i = 0; i < BFS_ROOT_INO; i++) + set_bit(i, info->si_imap); + + s->s_op = &bfs_sops; + inode = bfs_iget(s, BFS_ROOT_INO); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out2; + } + s->s_root = d_alloc_root(inode); + if (!s->s_root) { + iput(inode); + ret = -ENOMEM; + goto out2; + } + + info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; + info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 + - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; + info->si_freei = 0; + info->si_lf_eblk = 0; + + /* can we read the last block? 
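+ (if not, the device is smaller than the superblock claims and the
+ mount is refused with -EIO)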
*/ + bh = sb_bread(s, info->si_blocks - 1); + if (!bh) { + printf("Last block not available: %lu\n", info->si_blocks - 1); + ret = -EIO; + goto out3; + } + brelse(bh); + + bh = NULL; + for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { + struct bfs_inode *di; + int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; + int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; + unsigned long eblock; + + if (!off) { + brelse(bh); + bh = sb_bread(s, block); + } + + if (!bh) + continue; + + di = (struct bfs_inode *)bh->b_data + off; + + /* test if filesystem is not corrupted */ + + i_eoff = le32_to_cpu(di->i_eoffset); + i_sblock = le32_to_cpu(di->i_sblock); + i_eblock = le32_to_cpu(di->i_eblock); + s_size = le32_to_cpu(bfs_sb->s_end); + + if (i_sblock > info->si_blocks || + i_eblock > info->si_blocks || + i_sblock > i_eblock || + i_eoff > s_size || + i_sblock * BFS_BSIZE > i_eoff) { + + printf("Inode 0x%08x corrupted\n", i); + + brelse(bh); + ret = -EIO; + goto out3; + } + + if (!di->i_ino) { + info->si_freei++; + continue; + } + set_bit(i, info->si_imap); + info->si_freeb -= BFS_FILEBLOCKS(di); + + eblock = le32_to_cpu(di->i_eblock); + if (eblock > info->si_lf_eblk) + info->si_lf_eblk = eblock; + } + brelse(bh); + brelse(sbh); + dump_imap("read_super", s); + return 0; + +out3: + dput(s->s_root); + s->s_root = NULL; +out2: + kfree(info->si_imap); +out1: + brelse(sbh); +out: + mutex_destroy(&info->bfs_lock); + kfree(info); + s->s_fs_info = NULL; + return ret; +} + +static struct dentry *bfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super); +} + +static struct file_system_type bfs_fs_type = { + .owner = THIS_MODULE, + .name = "bfs", + .mount = bfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_bfs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&bfs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_bfs_fs(void) +{ + unregister_filesystem(&bfs_fs_type); + destroy_inodecache(); +} + +module_init(init_bfs_fs) +module_exit(exit_bfs_fs) diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c new file mode 100644 index 00000000..a6395bdb --- /dev/null +++ b/fs/binfmt_aout.c @@ -0,0 +1,467 @@ +/* + * linux/fs/binfmt_aout.c + * + * Copyright (C) 1991, 1992, 1996 Linus Torvalds + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); +static int load_aout_library(struct file*); +static int aout_core_dump(struct coredump_params *cprm); + +static struct linux_binfmt aout_format = { + .module = THIS_MODULE, + .load_binary = load_aout_binary, + .load_shlib = load_aout_library, + .core_dump = aout_core_dump, + .min_coredump = PAGE_SIZE +}; + +#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) + +static int set_brk(unsigned long start, unsigned long end) +{ + start = PAGE_ALIGN(start); + end = PAGE_ALIGN(end); + if (end > start) { + unsigned long addr; + down_write(¤t->mm->mmap_sem); + addr = do_brk(start, end - start); + up_write(¤t->mm->mmap_sem); + if (BAD_ADDR(addr)) + return addr; + } + return 0; +} + +/* + * Routine writes a core dump image in the current 
directory. + * Currently only a stub-function. + * + * Note that setuid/setgid files won't make a core-dump if the uid/gid + * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable" + * field, which also makes sure the core-dumps won't be recursive if the + * dumping of the process results in another error.. + */ + +static int aout_core_dump(struct coredump_params *cprm) +{ + struct file *file = cprm->file; + mm_segment_t fs; + int has_dumped = 0; + void __user *dump_start; + int dump_size; + struct user dump; +#ifdef __alpha__ +# define START_DATA(u) ((void __user *)u.start_data) +#else +# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \ + u.start_code)) +#endif +# define START_STACK(u) ((void __user *)u.start_stack) + + fs = get_fs(); + set_fs(KERNEL_DS); + has_dumped = 1; + current->flags |= PF_DUMPCORE; + strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); + dump.u_ar0 = offsetof(struct user, regs); + dump.signal = cprm->signr; + aout_dump_thread(cprm->regs, &dump); + +/* If the size of the dump file exceeds the rlimit, then see what would happen + if we wrote the stack, but not the data area. */ + if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit) + dump.u_dsize = 0; + +/* Make sure we have enough room to write the stack and data areas. */ + if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) + dump.u_ssize = 0; + +/* make sure we actually have a data and stack area to dump */ + set_fs(USER_DS); + if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) + dump.u_dsize = 0; + if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) + dump.u_ssize = 0; + + set_fs(KERNEL_DS); +/* struct user */ + if (!dump_write(file, &dump, sizeof(dump))) + goto end_coredump; +/* Now dump all of the user data. Include malloced stuff as well */ + if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump))) + goto end_coredump; +/* now we start writing out the user space info */ + set_fs(USER_DS); +/* Dump the data area */ + if (dump.u_dsize != 0) { + dump_start = START_DATA(dump); + dump_size = dump.u_dsize << PAGE_SHIFT; + if (!dump_write(file, dump_start, dump_size)) + goto end_coredump; + } +/* Now prepare to dump the stack area */ + if (dump.u_ssize != 0) { + dump_start = START_STACK(dump); + dump_size = dump.u_ssize << PAGE_SHIFT; + if (!dump_write(file, dump_start, dump_size)) + goto end_coredump; + } +end_coredump: + set_fs(fs); + return has_dumped; +} + +/* + * create_aout_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ +static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm) +{ + char __user * __user *argv; + char __user * __user *envp; + unsigned long __user *sp; + int argc = bprm->argc; + int envc = bprm->envc; + + sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); +#ifdef __alpha__ +/* whee.. test-programs are so much fun. 
*/ + put_user(0, --sp); + put_user(0, --sp); + if (bprm->loader) { + put_user(0, --sp); + put_user(1003, --sp); + put_user(bprm->loader, --sp); + put_user(1002, --sp); + } + put_user(bprm->exec, --sp); + put_user(1001, --sp); +#endif + sp -= envc+1; + envp = (char __user * __user *) sp; + sp -= argc+1; + argv = (char __user * __user *) sp; +#ifndef __alpha__ + put_user((unsigned long) envp,--sp); + put_user((unsigned long) argv,--sp); +#endif + put_user(argc,--sp); + current->mm->arg_start = (unsigned long) p; + while (argc-->0) { + char c; + put_user(p,argv++); + do { + get_user(c,p++); + } while (c); + } + put_user(NULL,argv); + current->mm->arg_end = current->mm->env_start = (unsigned long) p; + while (envc-->0) { + char c; + put_user(p,envp++); + do { + get_user(c,p++); + } while (c); + } + put_user(NULL,envp); + current->mm->env_end = (unsigned long) p; + return sp; +} + +/* + * These are the functions used to load a.out style executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) +{ + struct exec ex; + unsigned long error; + unsigned long fd_offset; + unsigned long rlim; + int retval; + + ex = *((struct exec *) bprm->buf); /* exec-header */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && + N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || + N_TRSIZE(ex) || N_DRSIZE(ex) || + i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + return -ENOEXEC; + } + + /* + * Requires a mmap handler. This prevents people from using a.out + * as part of an exploit attack against /proc-related vulnerabilities. + */ + if (!bprm->file->f_op || !bprm->file->f_op->mmap) + return -ENOEXEC; + + fd_offset = N_TXTOFF(ex); + + /* Check initial limits. This avoids letting people circumvent + * size limits imposed on them by creating programs with large + * arrays in the data or bss. 
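+ * In practice this means a_data + a_bss together must fit inside
+ * RLIMIT_DATA (treated as unlimited when set to RLIM_INFINITY).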
+ */ + rlim = rlimit(RLIMIT_DATA); + if (rlim >= RLIM_INFINITY) + rlim = ~0; + if (ex.a_data + ex.a_bss > rlim) + return -ENOMEM; + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + return retval; + + /* OK, This is the point of no return */ +#ifdef __alpha__ + SET_AOUT_PERSONALITY(bprm, ex); +#else + set_personality(PER_LINUX); +#endif + setup_new_exec(bprm); + + current->mm->end_code = ex.a_text + + (current->mm->start_code = N_TXTADDR(ex)); + current->mm->end_data = ex.a_data + + (current->mm->start_data = N_DATADDR(ex)); + current->mm->brk = ex.a_bss + + (current->mm->start_brk = N_BSSADDR(ex)); + current->mm->free_area_cache = current->mm->mmap_base; + current->mm->cached_hole_size = 0; + + install_exec_creds(bprm); + current->flags &= ~PF_FORKNOEXEC; + + if (N_MAGIC(ex) == OMAGIC) { + unsigned long text_addr, map_size; + loff_t pos; + + text_addr = N_TXTADDR(ex); + +#ifdef __alpha__ + pos = fd_offset; + map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; +#else + pos = 32; + map_size = ex.a_text+ex.a_data; +#endif + down_write(¤t->mm->mmap_sem); + error = do_brk(text_addr & PAGE_MASK, map_size); + up_write(¤t->mm->mmap_sem); + if (error != (text_addr & PAGE_MASK)) { + send_sig(SIGKILL, current, 0); + return error; + } + + error = bprm->file->f_op->read(bprm->file, + (char __user *)text_addr, + ex.a_text+ex.a_data, &pos); + if ((signed long)error < 0) { + send_sig(SIGKILL, current, 0); + return error; + } + + flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); + } else { + if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && + (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) + { + printk(KERN_NOTICE "executable not page aligned\n"); + } + + if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) + { + printk(KERN_WARNING + "fd_offset is not page aligned. Please convert program: %s\n", + bprm->file->f_path.dentry->d_name.name); + } + + if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { + loff_t pos = fd_offset; + down_write(¤t->mm->mmap_sem); + do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + up_write(¤t->mm->mmap_sem); + bprm->file->f_op->read(bprm->file, + (char __user *)N_TXTADDR(ex), + ex.a_text+ex.a_data, &pos); + flush_icache_range((unsigned long) N_TXTADDR(ex), + (unsigned long) N_TXTADDR(ex) + + ex.a_text+ex.a_data); + goto beyond_if; + } + + down_write(¤t->mm->mmap_sem); + error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, + PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + fd_offset); + up_write(¤t->mm->mmap_sem); + + if (error != N_TXTADDR(ex)) { + send_sig(SIGKILL, current, 0); + return error; + } + + down_write(¤t->mm->mmap_sem); + error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + fd_offset + ex.a_text); + up_write(¤t->mm->mmap_sem); + if (error != N_DATADDR(ex)) { + send_sig(SIGKILL, current, 0); + return error; + } + } +beyond_if: + set_binfmt(&aout_format); + + retval = set_brk(current->mm->start_brk, current->mm->brk); + if (retval < 0) { + send_sig(SIGKILL, current, 0); + return retval; + } + + retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is this error path enough? 
*/ + send_sig(SIGKILL, current, 0); + return retval; + } + + current->mm->start_stack = + (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); +#ifdef __alpha__ + regs->gp = ex.a_gpvalue; +#endif + start_thread(regs, ex.a_entry, current->mm->start_stack); + return 0; +} + +static int load_aout_library(struct file *file) +{ + struct inode * inode; + unsigned long bss, start_addr, len; + unsigned long error; + int retval; + struct exec ex; + + inode = file->f_path.dentry->d_inode; + + retval = -ENOEXEC; + error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); + if (error != sizeof(ex)) + goto out; + + /* We come in here for the regular a.out style of shared libraries */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || + N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || + i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + goto out; + } + + /* + * Requires a mmap handler. This prevents people from using a.out + * as part of an exploit attack against /proc-related vulnerabilities. + */ + if (!file->f_op || !file->f_op->mmap) + goto out; + + if (N_FLAGS(ex)) + goto out; + + /* For QMAGIC, the starting address is 0x20 into the page. We mask + this off to get the starting address for the page */ + + start_addr = ex.a_entry & 0xfffff000; + + if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { + loff_t pos = N_TXTOFF(ex); + + if (printk_ratelimit()) + { + printk(KERN_WARNING + "N_TXTOFF is not page aligned. Please convert library: %s\n", + file->f_path.dentry->d_name.name); + } + down_write(¤t->mm->mmap_sem); + do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + up_write(¤t->mm->mmap_sem); + + file->f_op->read(file, (char __user *)start_addr, + ex.a_text + ex.a_data, &pos); + flush_icache_range((unsigned long) start_addr, + (unsigned long) start_addr + ex.a_text + ex.a_data); + + retval = 0; + goto out; + } + /* Now use mmap to map the library into memory. */ + down_write(¤t->mm->mmap_sem); + error = do_mmap(file, start_addr, ex.a_text + ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + N_TXTOFF(ex)); + up_write(¤t->mm->mmap_sem); + retval = error; + if (error != start_addr) + goto out; + + len = PAGE_ALIGN(ex.a_text + ex.a_data); + bss = ex.a_text + ex.a_data + ex.a_bss; + if (bss > len) { + down_write(¤t->mm->mmap_sem); + error = do_brk(start_addr + len, bss - len); + up_write(¤t->mm->mmap_sem); + retval = error; + if (error != start_addr + len) + goto out; + } + retval = 0; +out: + return retval; +} + +static int __init init_aout_binfmt(void) +{ + return register_binfmt(&aout_format); +} + +static void __exit exit_aout_binfmt(void) +{ + unregister_binfmt(&aout_format); +} + +core_initcall(init_aout_binfmt); +module_exit(exit_aout_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c new file mode 100644 index 00000000..618493e4 --- /dev/null +++ b/fs/binfmt_elf.c @@ -0,0 +1,2092 @@ +/* + * linux/fs/binfmt_elf.c + * + * These are the functions used to load ELF format executables as used + * on SVr4 machines. Information on the format may be found in the book + * "UNIX SYSTEM V RELEASE 4 Programmers Guide: Ansi C and Programming Support + * Tools". + * + * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); +static int load_elf_library(struct file *); +static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, + int, int, unsigned long); + +/* + * If we don't support core dumping, then supply a NULL so we + * don't even try. + */ +#ifdef CONFIG_ELF_CORE +static int elf_core_dump(struct coredump_params *cprm); +#else +#define elf_core_dump NULL +#endif + +#if ELF_EXEC_PAGESIZE > PAGE_SIZE +#define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE +#else +#define ELF_MIN_ALIGN PAGE_SIZE +#endif + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) +#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) + +static struct linux_binfmt elf_format = { + .module = THIS_MODULE, + .load_binary = load_elf_binary, + .load_shlib = load_elf_library, + .core_dump = elf_core_dump, + .min_coredump = ELF_EXEC_PAGESIZE, +}; + +#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) + +static int set_brk(unsigned long start, unsigned long end) +{ + start = ELF_PAGEALIGN(start); + end = ELF_PAGEALIGN(end); + if (end > start) { + unsigned long addr; + down_write(¤t->mm->mmap_sem); + addr = do_brk(start, end - start); + up_write(¤t->mm->mmap_sem); + if (BAD_ADDR(addr)) + return addr; + } + current->mm->start_brk = current->mm->brk = end; + return 0; +} + +/* We need to explicitly zero any fractional pages + after the data section (i.e. bss). This would + contain the junk from the file that should not + be in memory + */ +static int padzero(unsigned long elf_bss) +{ + unsigned long nbyte; + + nbyte = ELF_PAGEOFFSET(elf_bss); + if (nbyte) { + nbyte = ELF_MIN_ALIGN - nbyte; + if (clear_user((void __user *) elf_bss, nbyte)) + return -EFAULT; + } + return 0; +} + +/* Let's use some macros to make this stack manipulation a little clearer */ +#ifdef CONFIG_STACK_GROWSUP +#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items)) +#define STACK_ROUND(sp, items) \ + ((15 + (unsigned long) ((sp) + (items))) &~ 15UL) +#define STACK_ALLOC(sp, len) ({ \ + elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \ + old_sp; }) +#else +#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items)) +#define STACK_ROUND(sp, items) \ + (((unsigned long) (sp - items)) &~ 15UL) +#define STACK_ALLOC(sp, len) ({ sp -= len ; sp; }) +#endif + +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. 
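+ * Architectures that do not define it get a NULL ELF_BASE_PLATFORM here,
+ * and create_elf_tables() simply omits the AT_BASE_PLATFORM entry.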
+ */ +#define ELF_BASE_PLATFORM NULL +#endif + +static int +create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, + unsigned long load_addr, unsigned long interp_load_addr) +{ + unsigned long p = bprm->p; + int argc = bprm->argc; + int envc = bprm->envc; + elf_addr_t __user *argv; + elf_addr_t __user *envp; + elf_addr_t __user *sp; + elf_addr_t __user *u_platform; + elf_addr_t __user *u_base_platform; + elf_addr_t __user *u_rand_bytes; + const char *k_platform = ELF_PLATFORM; + const char *k_base_platform = ELF_BASE_PLATFORM; + unsigned char k_rand_bytes[16]; + int items; + elf_addr_t *elf_info; + int ei_index = 0; + const struct cred *cred = current_cred(); + struct vm_area_struct *vma; + + /* + * In some cases (e.g. Hyper-Threading), we want to avoid L1 + * evictions by the processes running on the same package. One + * thing we can do is to shuffle the initial stack for them. + */ + + p = arch_align_stack(p); + + /* + * If this architecture has a platform capability string, copy it + * to userspace. In some cases (Sparc), this info is impossible + * for userspace to get any other way, in others (i386) it is + * merely difficult. + */ + u_platform = NULL; + if (k_platform) { + size_t len = strlen(k_platform) + 1; + + u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_platform, k_platform, len)) + return -EFAULT; + } + + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. + */ + u_base_platform = NULL; + if (k_base_platform) { + size_t len = strlen(k_base_platform) + 1; + + u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_base_platform, k_base_platform, len)) + return -EFAULT; + } + + /* + * Generate 16 random bytes for userspace PRNG seeding. + */ + get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + u_rand_bytes = (elf_addr_t __user *) + STACK_ALLOC(p, sizeof(k_rand_bytes)); + if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) + return -EFAULT; + + /* Create the ELF interpreter info */ + elf_info = (elf_addr_t *)current->mm->saved_auxv; + /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ +#define NEW_AUX_ENT(id, val) \ + do { \ + elf_info[ei_index++] = id; \ + elf_info[ei_index++] = val; \ + } while (0) + +#ifdef ARCH_DLINFO + /* + * ARCH_DLINFO must come first so PPC can do its special alignment of + * AUXV. 
+ * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in + * ARCH_DLINFO changes + */ + ARCH_DLINFO; +#endif + NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); + NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); + NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); + NEW_AUX_ENT(AT_BASE, interp_load_addr); + NEW_AUX_ENT(AT_FLAGS, 0); + NEW_AUX_ENT(AT_ENTRY, exec->e_entry); + NEW_AUX_ENT(AT_UID, cred->uid); + NEW_AUX_ENT(AT_EUID, cred->euid); + NEW_AUX_ENT(AT_GID, cred->gid); + NEW_AUX_ENT(AT_EGID, cred->egid); + NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); + if (k_platform) { + NEW_AUX_ENT(AT_PLATFORM, + (elf_addr_t)(unsigned long)u_platform); + } + if (k_base_platform) { + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t)(unsigned long)u_base_platform); + } + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + } +#undef NEW_AUX_ENT + /* AT_NULL is zero; clear the rest too */ + memset(&elf_info[ei_index], 0, + sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]); + + /* And advance past the AT_NULL entry. */ + ei_index += 2; + + sp = STACK_ADD(p, ei_index); + + items = (argc + 1) + (envc + 1) + 1; + bprm->p = STACK_ROUND(sp, items); + + /* Point sp at the lowest address on the stack */ +#ifdef CONFIG_STACK_GROWSUP + sp = (elf_addr_t __user *)bprm->p - items - ei_index; + bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */ +#else + sp = (elf_addr_t __user *)bprm->p; +#endif + + + /* + * Grow the stack manually; some architectures have a limit on how + * far ahead a user-space access may be in order to grow the stack. + */ + vma = find_extend_vma(current->mm, bprm->p); + if (!vma) + return -EFAULT; + + /* Now, let's put argc (and argv, envp if appropriate) on the stack */ + if (__put_user(argc, sp++)) + return -EFAULT; + argv = sp; + envp = argv + argc + 1; + + /* Populate argv and envp */ + p = current->mm->arg_end = current->mm->arg_start; + while (argc-- > 0) { + size_t len; + if (__put_user((elf_addr_t)p, argv++)) + return -EFAULT; + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + if (__put_user(0, argv)) + return -EFAULT; + current->mm->arg_end = current->mm->env_start = p; + while (envc-- > 0) { + size_t len; + if (__put_user((elf_addr_t)p, envp++)) + return -EFAULT; + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + if (__put_user(0, envp)) + return -EFAULT; + current->mm->env_end = p; + + /* Put the elf_info on the stack in the right place. 
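+ The auxiliary vector lands just above envp's terminating NULL and
+ holds the entries assembled in current->mm->saved_auxv above.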
+	 */
+	sp = (elf_addr_t __user *)envp + 1;
+	if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t)))
+		return -EFAULT;
+	return 0;
+}
+
+static unsigned long elf_map(struct file *filep, unsigned long addr,
+		struct elf_phdr *eppnt, int prot, int type,
+		unsigned long total_size)
+{
+	unsigned long map_addr;
+	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
+	addr = ELF_PAGESTART(addr);
+	size = ELF_PAGEALIGN(size);
+
+	/* mmap() will return -EINVAL if given a zero size, but a
+	 * segment with zero filesize is perfectly valid */
+	if (!size)
+		return addr;
+
+	down_write(&current->mm->mmap_sem);
+	/*
+	 * total_size is the size of the ELF (interpreter) image.
+	 * The _first_ mmap needs to know the full size, otherwise
+	 * randomization might put this image into an overlapping
+	 * position with the ELF binary image. (since size < total_size)
+	 * So we first map the 'big' image - and unmap the remainder at
+	 * the end. (which unmap is needed for ELF images with holes.)
+	 */
+	if (total_size) {
+		total_size = ELF_PAGEALIGN(total_size);
+		map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+		if (!BAD_ADDR(map_addr))
+			do_munmap(current->mm, map_addr+size, total_size-size);
+	} else
+		map_addr = do_mmap(filep, addr, size, prot, type, off);
+
+	up_write(&current->mm->mmap_sem);
+	return(map_addr);
+}
+
+static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+{
+	int i, first_idx = -1, last_idx = -1;
+
+	for (i = 0; i < nr; i++) {
+		if (cmds[i].p_type == PT_LOAD) {
+			last_idx = i;
+			if (first_idx == -1)
+				first_idx = i;
+		}
+	}
+	if (first_idx == -1)
+		return 0;
+
+	return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
+				ELF_PAGESTART(cmds[first_idx].p_vaddr);
+}
+
+
+/* This is much more generalized than the library routine read function,
+   so we keep this separate.  Technically the library read function
+   is only provided so that we can read a.out libraries that have
+   an ELF header */
+
+static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
+		struct file *interpreter, unsigned long *interp_map_addr,
+		unsigned long no_base)
+{
+	struct elf_phdr *elf_phdata;
+	struct elf_phdr *eppnt;
+	unsigned long load_addr = 0;
+	int load_addr_set = 0;
+	unsigned long last_bss = 0, elf_bss = 0;
+	unsigned long error = ~0UL;
+	unsigned long total_size;
+	int retval, i, size;
+
+	/* First of all, some simple consistency checks */
+	if (interp_elf_ex->e_type != ET_EXEC &&
+	    interp_elf_ex->e_type != ET_DYN)
+		goto out;
+	if (!elf_check_arch(interp_elf_ex))
+		goto out;
+	if (!interpreter->f_op || !interpreter->f_op->mmap)
+		goto out;
+
+	/*
+	 * If the size of this structure has changed, then punt, since
+	 * we will be doing the wrong thing.
+	 */
+	if (interp_elf_ex->e_phentsize != sizeof(struct elf_phdr))
+		goto out;
+	if (interp_elf_ex->e_phnum < 1 ||
+		interp_elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
+		goto out;
+
+	/* Now read in all of the header information */
+	size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum;
+	if (size > ELF_MIN_ALIGN)
+		goto out;
+	elf_phdata = kmalloc(size, GFP_KERNEL);
+	if (!elf_phdata)
+		goto out;
+
+	retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
+			     (char *)elf_phdata, size);
+	error = -EIO;
+	if (retval != size) {
+		if (retval < 0)
+			error = retval;
+		goto out_close;
+	}
+
+	total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
+	if (!total_size) {
+		error = -EINVAL;
+		goto out_close;
+	}
+
+	eppnt = elf_phdata;
+	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+		if (eppnt->p_type == PT_LOAD) {
+			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
+			int elf_prot = 0;
+			unsigned long vaddr = 0;
+			unsigned long k, map_addr;
+
+			if (eppnt->p_flags & PF_R)
+				elf_prot = PROT_READ;
+			if (eppnt->p_flags & PF_W)
+				elf_prot |= PROT_WRITE;
+			if (eppnt->p_flags & PF_X)
+				elf_prot |= PROT_EXEC;
+			vaddr = eppnt->p_vaddr;
+			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
+				elf_type |= MAP_FIXED;
+			else if (no_base && interp_elf_ex->e_type == ET_DYN)
+				load_addr = -vaddr;
+
+			map_addr = elf_map(interpreter, load_addr + vaddr,
+					eppnt, elf_prot, elf_type, total_size);
+			total_size = 0;
+			if (!*interp_map_addr)
+				*interp_map_addr = map_addr;
+			error = map_addr;
+			if (BAD_ADDR(map_addr))
+				goto out_close;
+
+			if (!load_addr_set &&
+			    interp_elf_ex->e_type == ET_DYN) {
+				load_addr = map_addr - ELF_PAGESTART(vaddr);
+				load_addr_set = 1;
+			}
+
+			/*
+			 * Check to see if the section's size will overflow the
+			 * allowed task size. Note that p_filesz must always be
+			 * <= p_memsize so it's only necessary to check p_memsz.
+			 */
+			k = load_addr + eppnt->p_vaddr;
+			if (BAD_ADDR(k) ||
+			    eppnt->p_filesz > eppnt->p_memsz ||
+			    eppnt->p_memsz > TASK_SIZE ||
+			    TASK_SIZE - eppnt->p_memsz < k) {
+				error = -ENOMEM;
+				goto out_close;
+			}
+
+			/*
+			 * Find the end of the file mapping for this phdr, and
+			 * keep track of the largest address we see for this.
+			 */
+			k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
+			if (k > elf_bss)
+				elf_bss = k;
+
+			/*
+			 * Do the same thing for the memory mapping - between
+			 * elf_bss and last_bss is the bss section.
+			 */
+			k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+			if (k > last_bss)
+				last_bss = k;
+		}
+	}
+
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section.  First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out_close;
+		}
+
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+
+		/* Map the last of the bss segment */
+		down_write(&current->mm->mmap_sem);
+		error = do_brk(elf_bss, last_bss - elf_bss);
+		up_write(&current->mm->mmap_sem);
+		if (BAD_ADDR(error))
+			goto out_close;
+	}
+
+	error = load_addr;
+
+out_close:
+	kfree(elf_phdata);
+out:
+	return error;
+}
+
+/*
+ * These are the functions used to load ELF style executables and shared
+ * libraries.  There is no binary dependent code anywhere else.
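+ *
+ * A worked example of the elf_map() arithmetic above, assuming 4 KiB
+ * pages and a hypothetical PT_LOAD segment with p_vaddr 0x08049f40,
+ * p_offset 0xf40 and p_filesz 0x100: ELF_PAGEOFFSET(p_vaddr) is 0xf40,
+ * so the mapping is placed at ELF_PAGESTART(0x08049f40) == 0x08049000,
+ * covers ELF_PAGEALIGN(0x100 + 0xf40) == 0x2000 bytes and starts at
+ * file offset 0xf40 - 0xf40 == 0; the segment bytes therefore land at
+ * 0x08049f40, exactly as the program header requested.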
+ */ + +#define INTERPRETER_NONE 0 +#define INTERPRETER_ELF 2 + +#ifndef STACK_RND_MASK +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ +#endif + +static unsigned long randomize_stack_top(unsigned long stack_top) +{ + unsigned int random_variable = 0; + + if ((current->flags & PF_RANDOMIZE) && + !(current->personality & ADDR_NO_RANDOMIZE)) { + random_variable = get_random_int() & STACK_RND_MASK; + random_variable <<= PAGE_SHIFT; + } +#ifdef CONFIG_STACK_GROWSUP + return PAGE_ALIGN(stack_top) + random_variable; +#else + return PAGE_ALIGN(stack_top) - random_variable; +#endif +} + +static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) +{ + struct file *interpreter = NULL; /* to shut gcc up */ + unsigned long load_addr = 0, load_bias = 0; + int load_addr_set = 0; + char * elf_interpreter = NULL; + unsigned long error; + struct elf_phdr *elf_ppnt, *elf_phdata; + unsigned long elf_bss, elf_brk; + int retval, i; + unsigned int size; + unsigned long elf_entry; + unsigned long interp_load_addr = 0; + unsigned long start_code, end_code, start_data, end_data; + unsigned long reloc_func_desc __maybe_unused = 0; + int executable_stack = EXSTACK_DEFAULT; + unsigned long def_flags = 0; + struct { + struct elfhdr elf_ex; + struct elfhdr interp_elf_ex; + } *loc; + + loc = kmalloc(sizeof(*loc), GFP_KERNEL); + if (!loc) { + retval = -ENOMEM; + goto out_ret; + } + + /* Get the exec-header */ + loc->elf_ex = *((struct elfhdr *)bprm->buf); + + retval = -ENOEXEC; + /* First of all, some simple consistency checks */ + if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) + goto out; + if (!elf_check_arch(&loc->elf_ex)) + goto out; + if (!bprm->file->f_op || !bprm->file->f_op->mmap) + goto out; + + /* Now read in all of the header information */ + if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr)) + goto out; + if (loc->elf_ex.e_phnum < 1 || + loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr)) + goto out; + size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr); + retval = -ENOMEM; + elf_phdata = kmalloc(size, GFP_KERNEL); + if (!elf_phdata) + goto out; + + retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, + (char *)elf_phdata, size); + if (retval != size) { + if (retval >= 0) + retval = -EIO; + goto out_free_ph; + } + + elf_ppnt = elf_phdata; + elf_bss = 0; + elf_brk = 0; + + start_code = ~0UL; + end_code = 0; + start_data = 0; + end_data = 0; + + for (i = 0; i < loc->elf_ex.e_phnum; i++) { + if (elf_ppnt->p_type == PT_INTERP) { + /* This is the program interpreter used for + * shared libraries - for now assume that this + * is an a.out format binary + */ + retval = -ENOEXEC; + if (elf_ppnt->p_filesz > PATH_MAX || + elf_ppnt->p_filesz < 2) + goto out_free_ph; + + retval = -ENOMEM; + elf_interpreter = kmalloc(elf_ppnt->p_filesz, + GFP_KERNEL); + if (!elf_interpreter) + goto out_free_ph; + + retval = kernel_read(bprm->file, elf_ppnt->p_offset, + elf_interpreter, + elf_ppnt->p_filesz); + if (retval != elf_ppnt->p_filesz) { + if (retval >= 0) + retval = -EIO; + goto out_free_interp; + } + /* make sure path is NULL terminated */ + retval = -ENOEXEC; + if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') + goto out_free_interp; + + interpreter = open_exec(elf_interpreter); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) + goto out_free_interp; + + /* + * If the binary is not readable then enforce + * mm->dumpable = 0 regardless of the interpreter's + * permissions. 
+ */ + if (file_permission(interpreter, MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + retval = kernel_read(interpreter, 0, bprm->buf, + BINPRM_BUF_SIZE); + if (retval != BINPRM_BUF_SIZE) { + if (retval >= 0) + retval = -EIO; + goto out_free_dentry; + } + + /* Get the exec headers */ + loc->interp_elf_ex = *((struct elfhdr *)bprm->buf); + break; + } + elf_ppnt++; + } + + elf_ppnt = elf_phdata; + for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) + if (elf_ppnt->p_type == PT_GNU_STACK) { + if (elf_ppnt->p_flags & PF_X) + executable_stack = EXSTACK_ENABLE_X; + else + executable_stack = EXSTACK_DISABLE_X; + break; + } + + /* Some simple consistency checks for the interpreter */ + if (elf_interpreter) { + retval = -ELIBBAD; + /* Not an ELF interpreter */ + if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + goto out_free_dentry; + /* Verify the interpreter has a valid arch */ + if (!elf_check_arch(&loc->interp_elf_ex)) + goto out_free_dentry; + } + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + goto out_free_dentry; + + /* OK, This is the point of no return */ + current->flags &= ~PF_FORKNOEXEC; + current->mm->def_flags = def_flags; + + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. */ + SET_PERSONALITY(loc->elf_ex); + if (elf_read_implies_exec(loc->elf_ex, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + current->flags |= PF_RANDOMIZE; + + setup_new_exec(bprm); + + /* Do this so that we can load the interpreter, if need be. We will + change some of these later */ + current->mm->free_area_cache = current->mm->mmap_base; + current->mm->cached_hole_size = 0; + retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), + executable_stack); + if (retval < 0) { + send_sig(SIGKILL, current, 0); + goto out_free_dentry; + } + + current->mm->start_stack = bprm->p; + + /* Now we do a little grungy work by mmapping the ELF image into + the correct location in memory. */ + for(i = 0, elf_ppnt = elf_phdata; + i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + int elf_prot = 0, elf_flags; + unsigned long k, vaddr; + + if (elf_ppnt->p_type != PT_LOAD) + continue; + + if (unlikely (elf_brk > elf_bss)) { + unsigned long nbyte; + + /* There was a PT_LOAD segment with p_memsz > p_filesz + before this one. Map anonymous pages, if needed, + and clear the area. */ + retval = set_brk(elf_bss + load_bias, + elf_brk + load_bias); + if (retval) { + send_sig(SIGKILL, current, 0); + goto out_free_dentry; + } + nbyte = ELF_PAGEOFFSET(elf_bss); + if (nbyte) { + nbyte = ELF_MIN_ALIGN - nbyte; + if (nbyte > elf_brk - elf_bss) + nbyte = elf_brk - elf_bss; + if (clear_user((void __user *)elf_bss + + load_bias, nbyte)) { + /* + * This bss-zeroing can fail if the ELF + * file specifies odd protections. So + * we don't check the return value + */ + } + } + } + + if (elf_ppnt->p_flags & PF_R) + elf_prot |= PROT_READ; + if (elf_ppnt->p_flags & PF_W) + elf_prot |= PROT_WRITE; + if (elf_ppnt->p_flags & PF_X) + elf_prot |= PROT_EXEC; + + elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + + vaddr = elf_ppnt->p_vaddr; + if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { + elf_flags |= MAP_FIXED; + } else if (loc->elf_ex.e_type == ET_DYN) { + /* Try and get dynamic programs out of the way of the + * default mmap base, as well as whatever program they + * might try to exec. 
This is because the brk will + * follow the loader, and is not movable. */ +#if defined(CONFIG_X86) || defined(CONFIG_ARM) + /* Memory randomization might have been switched off + * in runtime via sysctl. + * If that is the case, retain the original non-zero + * load_bias value in order to establish proper + * non-randomized mappings. + */ + if (current->flags & PF_RANDOMIZE) + load_bias = 0; + else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); +#else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); +#endif + } + + error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, + elf_prot, elf_flags, 0); + if (BAD_ADDR(error)) { + send_sig(SIGKILL, current, 0); + retval = IS_ERR((void *)error) ? + PTR_ERR((void*)error) : -EINVAL; + goto out_free_dentry; + } + + if (!load_addr_set) { + load_addr_set = 1; + load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); + if (loc->elf_ex.e_type == ET_DYN) { + load_bias += error - + ELF_PAGESTART(load_bias + vaddr); + load_addr += load_bias; + reloc_func_desc = load_bias; + } + } + k = elf_ppnt->p_vaddr; + if (k < start_code) + start_code = k; + if (start_data < k) + start_data = k; + + /* + * Check to see if the section's size will overflow the + * allowed task size. Note that p_filesz must always be + * <= p_memsz so it is only necessary to check p_memsz. + */ + if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz || + elf_ppnt->p_memsz > TASK_SIZE || + TASK_SIZE - elf_ppnt->p_memsz < k) { + /* set_brk can never work. Avoid overflows. */ + send_sig(SIGKILL, current, 0); + retval = -EINVAL; + goto out_free_dentry; + } + + k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; + + if (k > elf_bss) + elf_bss = k; + if ((elf_ppnt->p_flags & PF_X) && end_code < k) + end_code = k; + if (end_data < k) + end_data = k; + k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; + if (k > elf_brk) + elf_brk = k; + } + + loc->elf_ex.e_entry += load_bias; + elf_bss += load_bias; + elf_brk += load_bias; + start_code += load_bias; + end_code += load_bias; + start_data += load_bias; + end_data += load_bias; + + /* Calling set_brk effectively mmaps the pages that we need + * for the bss and break sections. We must do this before + * mapping in the interpreter, to make sure it doesn't wind + * up getting placed where the bss needs to go. + */ + retval = set_brk(elf_bss, elf_brk); + if (retval) { + send_sig(SIGKILL, current, 0); + goto out_free_dentry; + } + if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { + send_sig(SIGSEGV, current, 0); + retval = -EFAULT; /* Nobody gets to see this, but.. */ + goto out_free_dentry; + } + + if (elf_interpreter) { + unsigned long uninitialized_var(interp_map_addr); + + elf_entry = load_elf_interp(&loc->interp_elf_ex, + interpreter, + &interp_map_addr, + load_bias); + if (!IS_ERR((void *)elf_entry)) { + /* + * load_elf_interp() returns relocation + * adjustment + */ + interp_load_addr = elf_entry; + elf_entry += loc->interp_elf_ex.e_entry; + } + if (BAD_ADDR(elf_entry)) { + force_sig(SIGSEGV, current); + retval = IS_ERR((void *)elf_entry) ? 
+				(int)elf_entry : -EINVAL;
+			goto out_free_dentry;
+		}
+		reloc_func_desc = interp_load_addr;
+
+		allow_write_access(interpreter);
+		fput(interpreter);
+		kfree(elf_interpreter);
+	} else {
+		elf_entry = loc->elf_ex.e_entry;
+		if (BAD_ADDR(elf_entry)) {
+			force_sig(SIGSEGV, current);
+			retval = -EINVAL;
+			goto out_free_dentry;
+		}
+	}
+
+	kfree(elf_phdata);
+
+	set_binfmt(&elf_format);
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
+	if (retval < 0) {
+		send_sig(SIGKILL, current, 0);
+		goto out;
+	}
+#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
+
+	install_exec_creds(bprm);
+	current->flags &= ~PF_FORKNOEXEC;
+	retval = create_elf_tables(bprm, &loc->elf_ex,
+			  load_addr, interp_load_addr);
+	if (retval < 0) {
+		send_sig(SIGKILL, current, 0);
+		goto out;
+	}
+	/* N.B. passed_fileno might not be initialized? */
+	current->mm->end_code = end_code;
+	current->mm->start_code = start_code;
+	current->mm->start_data = start_data;
+	current->mm->end_data = end_data;
+	current->mm->start_stack = bprm->p;
+
+#ifdef arch_randomize_brk
+	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+		current->mm->brk = current->mm->start_brk =
+			arch_randomize_brk(current->mm);
+#ifdef CONFIG_COMPAT_BRK
+		current->brk_randomized = 1;
+#endif
+	}
+#endif
+
+	if (current->personality & MMAP_PAGE_ZERO) {
+		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
+		   and some applications "depend" upon this behavior.
+		   Since we do not have the power to recompile these, we
+		   emulate the SVr4 behavior. Sigh. */
+		down_write(&current->mm->mmap_sem);
+		error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
+				MAP_FIXED | MAP_PRIVATE, 0);
+		up_write(&current->mm->mmap_sem);
+	}
+
+#ifdef ELF_PLAT_INIT
+	/*
+	 * The ABI may specify that certain registers be set up in special
+	 * ways (on i386 %edx is the address of a DT_FINI function, for
+	 * example.  In addition, it may also specify (eg, PowerPC64 ELF)
+	 * that the e_entry field is the address of the function descriptor
+	 * for the startup routine, rather than the address of the startup
+	 * routine itself.  This macro performs whatever initialization to
+	 * the regs structure is required as well as any relocations to the
+	 * function descriptor entries when executing dynamically linked apps.
+	 */
+	ELF_PLAT_INIT(regs, reloc_func_desc);
+#endif
+
+	start_thread(regs, elf_entry, bprm->p);
+	retval = 0;
+out:
+	kfree(loc);
+out_ret:
+	return retval;
+
+	/* error cleanup */
+out_free_dentry:
+	allow_write_access(interpreter);
+	if (interpreter)
+		fput(interpreter);
+out_free_interp:
+	kfree(elf_interpreter);
+out_free_ph:
+	kfree(elf_phdata);
+	goto out;
+}
+
+/* This is really simpleminded and specialized - we are loading an
+   a.out library that is given an ELF header.
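+   It backs the legacy uselib(2) interface; as the checks below show,
+   only a library image with exactly one PT_LOAD segment is accepted.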
+ */
+static int load_elf_library(struct file *file)
+{
+	struct elf_phdr *elf_phdata;
+	struct elf_phdr *eppnt;
+	unsigned long elf_bss, bss, len;
+	int retval, error, i, j;
+	struct elfhdr elf_ex;
+
+	error = -ENOEXEC;
+	retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex));
+	if (retval != sizeof(elf_ex))
+		goto out;
+
+	if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	/* First of all, some simple consistency checks */
+	if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
+	    !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
+		goto out;
+
+	/* Now read in all of the header information */
+
+	j = sizeof(struct elf_phdr) * elf_ex.e_phnum;
+	/* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */
+
+	error = -ENOMEM;
+	elf_phdata = kmalloc(j, GFP_KERNEL);
+	if (!elf_phdata)
+		goto out;
+
+	eppnt = elf_phdata;
+	error = -ENOEXEC;
+	retval = kernel_read(file, elf_ex.e_phoff, (char *)eppnt, j);
+	if (retval != j)
+		goto out_free_ph;
+
+	for (j = 0, i = 0; i < elf_ex.e_phnum; i++)
+		if ((eppnt + i)->p_type == PT_LOAD)
+			j++;
+	if (j != 1)
+		goto out_free_ph;
+
+	while (eppnt->p_type != PT_LOAD)
+		eppnt++;
+
+	/* Now use mmap to map the library into memory. */
+	down_write(&current->mm->mmap_sem);
+	error = do_mmap(file,
+			ELF_PAGESTART(eppnt->p_vaddr),
+			(eppnt->p_filesz +
+			 ELF_PAGEOFFSET(eppnt->p_vaddr)),
+			PROT_READ | PROT_WRITE | PROT_EXEC,
+			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
+			(eppnt->p_offset -
+			 ELF_PAGEOFFSET(eppnt->p_vaddr)));
+	up_write(&current->mm->mmap_sem);
+	if (error != ELF_PAGESTART(eppnt->p_vaddr))
+		goto out_free_ph;
+
+	elf_bss = eppnt->p_vaddr + eppnt->p_filesz;
+	if (padzero(elf_bss)) {
+		error = -EFAULT;
+		goto out_free_ph;
+	}
+
+	len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
+			    ELF_MIN_ALIGN - 1);
+	bss = eppnt->p_memsz + eppnt->p_vaddr;
+	if (bss > len) {
+		down_write(&current->mm->mmap_sem);
+		do_brk(len, bss - len);
+		up_write(&current->mm->mmap_sem);
+	}
+	error = 0;
+
+out_free_ph:
+	kfree(elf_phdata);
+out:
+	return error;
+}
+
+#ifdef CONFIG_ELF_CORE
+/*
+ * ELF core dumper
+ *
+ * Modelled on fs/exec.c:aout_core_dump()
+ * Jeremy Fitzhardinge
+ */
+
+/*
+ * Decide what to dump of a segment, part, all or none.
+ */
+static unsigned long vma_dump_size(struct vm_area_struct *vma,
+				   unsigned long mm_flags)
+{
+#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
+
+	/* The vma can be set up to tell us the answer directly.  */
+	if (vma->vm_flags & VM_ALWAYSDUMP)
+		goto whole;
+
+	/* Hugetlb memory check */
+	if (vma->vm_flags & VM_HUGETLB) {
+		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+			goto whole;
+		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+			goto whole;
+	}
+
+	/* Do not dump I/O mapped devices or special mappings */
+	if (vma->vm_flags & (VM_IO | VM_RESERVED))
+		return 0;
+
+	/* By default, dump shared memory if mapped from an anonymous file. */
+	if (vma->vm_flags & VM_SHARED) {
+		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
+			FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
+			goto whole;
+		return 0;
+	}
+
+	/* Dump segments that have been written to.  */
+	if (vma->anon_vma && FILTER(ANON_PRIVATE))
+		goto whole;
+	if (vma->vm_file == NULL)
+		return 0;
+
+	if (FILTER(MAPPED_PRIVATE))
+		goto whole;
+
+	/*
+	 * If this looks like the beginning of a DSO or executable mapping,
+	 * check for an ELF header.  If we find one, dump the first page to
+	 * aid in determining what was mapped here.
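+	 * That first page typically carries the e_ident bytes and the
+	 * program header table, which is usually enough for post-mortem
+	 * tools to identify the file backing the mapping even though the
+	 * rest of its text is not written to the core file.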
+ */ + if (FILTER(ELF_HEADERS) && + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { + u32 __user *header = (u32 __user *) vma->vm_start; + u32 word; + mm_segment_t fs = get_fs(); + /* + * Doing it this way gets the constant folded by GCC. + */ + union { + u32 cmp; + char elfmag[SELFMAG]; + } magic; + BUILD_BUG_ON(SELFMAG != sizeof word); + magic.elfmag[EI_MAG0] = ELFMAG0; + magic.elfmag[EI_MAG1] = ELFMAG1; + magic.elfmag[EI_MAG2] = ELFMAG2; + magic.elfmag[EI_MAG3] = ELFMAG3; + /* + * Switch to the user "segment" for get_user(), + * then put back what elf_core_dump() had in place. + */ + set_fs(USER_DS); + if (unlikely(get_user(word, header))) + word = 0; + set_fs(fs); + if (word == magic.cmp) + return PAGE_SIZE; + } + +#undef FILTER + + return 0; + +whole: + return vma->vm_end - vma->vm_start; +} + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup(strlen(en->name) + 1, 4); + sz += roundup(en->datasz, 4); + + return sz; +} + +#define DUMP_WRITE(addr, nr, foffset) \ + do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0) + +static int alignfile(struct file *file, loff_t *foffset) +{ + static const char buf[4] = { 0, }; + DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset); + return 1; +} + +static int writenote(struct memelfnote *men, struct file *file, + loff_t *foffset) +{ + struct elf_note en; + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + DUMP_WRITE(&en, sizeof(en), foffset); + DUMP_WRITE(men->name, en.n_namesz, foffset); + if (!alignfile(file, foffset)) + return 0; + DUMP_WRITE(men->data, men->datasz, foffset); + if (!alignfile(file, foffset)) + return 0; + + return 1; +} +#undef DUMP_WRITE + +static void fill_elf_header(struct elfhdr *elf, int segs, + u16 machine, u32 flags, u8 osabi) +{ + memset(elf, 0, sizeof(*elf)); + + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + + elf->e_type = ET_CORE; + elf->e_machine = machine; + elf->e_version = EV_CURRENT; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_flags = flags; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + + return; +} + +static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static void fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except + * registers which need to be filled up separately. 
+ */ +static void fill_prstatus(struct elf_prstatus *prstatus, + struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + rcu_read_lock(); + prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + prstatus->pr_pid = task_pid_vnr(p); + prstatus->pr_pgrp = task_pgrp_vnr(p); + prstatus->pr_sid = task_session_vnr(p); + if (thread_group_leader(p)) { + struct task_cputime cputime; + + /* + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. + */ + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); + } else { + cputime_to_timeval(p->utime, &prstatus->pr_utime); + cputime_to_timeval(p->stime, &prstatus->pr_stime); + } + cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); +} + +static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, + struct mm_struct *mm) +{ + const struct cred *cred; + unsigned int i, len; + + /* first copy the parameters from user space */ + memset(psinfo, 0, sizeof(struct elf_prpsinfo)); + + len = mm->arg_end - mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ-1; + if (copy_from_user(&psinfo->pr_psargs, + (const char __user *)mm->arg_start, len)) + return -EFAULT; + for(i = 0; i < len; i++) + if (psinfo->pr_psargs[i] == 0) + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + + rcu_read_lock(); + psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + psinfo->pr_pid = task_pid_vnr(p); + psinfo->pr_pgrp = task_pgrp_vnr(p); + psinfo->pr_sid = task_session_vnr(p); + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, cred->uid); + SET_GID(psinfo->pr_gid, cred->gid); + rcu_read_unlock(); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + + return 0; +} + +static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) +{ + elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv; + int i = 0; + do + i += 2; + while (auxv[i - 2] != AT_NULL); + fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); +} + +#ifdef CORE_DUMP_USE_REGSET +#include + +struct elf_thread_core_info { + struct elf_thread_core_info *next; + struct task_struct *task; + struct elf_prstatus prstatus; + struct memelfnote notes[0]; +}; + +struct elf_note_info { + struct elf_thread_core_info *thread; + struct memelfnote psinfo; + struct memelfnote auxv; + size_t size; + int thread_notes; +}; + +/* + * When a regset has a writeback hook, we call it on each thread before + * dumping user memory. On register window machines, this makes sure the + * user memory backing the register data is up to date before we read it. 
+ */ +static void do_thread_regset_writeback(struct task_struct *task, + const struct user_regset *regset) +{ + if (regset->writeback) + regset->writeback(task, regset, 1); +} + +static int fill_thread_core_info(struct elf_thread_core_info *t, + const struct user_regset_view *view, + long signr, size_t *total) +{ + unsigned int i; + + /* + * NT_PRSTATUS is the one special case, because the regset data + * goes into the pr_reg field inside the note contents, rather + * than being the whole note contents. We fill the reset in here. + * We assume that regset 0 is NT_PRSTATUS. + */ + fill_prstatus(&t->prstatus, t->task, signr); + (void) view->regsets[0].get(t->task, &view->regsets[0], + 0, sizeof(t->prstatus.pr_reg), + &t->prstatus.pr_reg, NULL); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, + sizeof(t->prstatus), &t->prstatus); + *total += notesize(&t->notes[0]); + + do_thread_regset_writeback(t->task, &view->regsets[0]); + + /* + * Each other regset might generate a note too. For each regset + * that has no core_note_type or is inactive, we leave t->notes[i] + * all zero and we'll know to skip writing it later. + */ + for (i = 1; i < view->n; ++i) { + const struct user_regset *regset = &view->regsets[i]; + do_thread_regset_writeback(t->task, regset); + if (regset->core_note_type && regset->get && + (!regset->active || regset->active(t->task, regset))) { + int ret; + size_t size = regset->n * regset->size; + void *data = kmalloc(size, GFP_KERNEL); + if (unlikely(!data)) + return 0; + ret = regset->get(t->task, regset, + 0, size, data, NULL); + if (unlikely(ret)) + kfree(data); + else { + if (regset->core_note_type != NT_PRFPREG) + fill_note(&t->notes[i], "LINUX", + regset->core_note_type, + size, data); + else { + t->prstatus.pr_fpvalid = 1; + fill_note(&t->notes[i], "CORE", + NT_PRFPREG, size, data); + } + *total += notesize(&t->notes[i]); + } + } + } + + return 1; +} + +static int fill_note_info(struct elfhdr *elf, int phdrs, + struct elf_note_info *info, + long signr, struct pt_regs *regs) +{ + struct task_struct *dump_task = current; + const struct user_regset_view *view = task_user_regset_view(dump_task); + struct elf_thread_core_info *t; + struct elf_prpsinfo *psinfo; + struct core_thread *ct; + unsigned int i; + + info->size = 0; + info->thread = NULL; + + psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); + if (psinfo == NULL) + return 0; + + fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + + /* + * Figure out how many notes we're going to need for each thread. + */ + info->thread_notes = 0; + for (i = 0; i < view->n; ++i) + if (view->regsets[i].core_note_type != 0) + ++info->thread_notes; + + /* + * Sanity check. We rely on regset 0 being in NT_PRSTATUS, + * since it is our one special case. + */ + if (unlikely(info->thread_notes == 0) || + unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) { + WARN_ON(1); + return 0; + } + + /* + * Initialize the ELF file header. + */ + fill_elf_header(elf, phdrs, + view->e_machine, view->e_flags, view->ei_osabi); + + /* + * Allocate a structure for each thread. + */ + for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) { + t = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!t)) + return 0; + + t->task = ct->task; + if (ct->task == dump_task || !info->thread) { + t->next = info->thread; + info->thread = t; + } else { + /* + * Make sure to keep the original task at + * the head of the list. 
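+			 * Keeping the dumping task first means its
+			 * NT_PRSTATUS note is emitted before any other
+			 * thread's, so write_note_info() can slot the
+			 * process-wide NT_PRPSINFO and NT_AUXV notes
+			 * directly after it.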
+ */ + t->next = info->thread->next; + info->thread->next = t; + } + } + + /* + * Now fill in each thread's information. + */ + for (t = info->thread; t != NULL; t = t->next) + if (!fill_thread_core_info(t, view, signr, &info->size)) + return 0; + + /* + * Fill in the two process-wide notes. + */ + fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); + info->size += notesize(&info->psinfo); + + fill_auxv_note(&info->auxv, current->mm); + info->size += notesize(&info->auxv); + + return 1; +} + +static size_t get_note_info_size(struct elf_note_info *info) +{ + return info->size; +} + +/* + * Write all the notes for each thread. When writing the first thread, the + * process-wide notes are interleaved after the first thread-specific note. + */ +static int write_note_info(struct elf_note_info *info, + struct file *file, loff_t *foffset) +{ + bool first = 1; + struct elf_thread_core_info *t = info->thread; + + do { + int i; + + if (!writenote(&t->notes[0], file, foffset)) + return 0; + + if (first && !writenote(&info->psinfo, file, foffset)) + return 0; + if (first && !writenote(&info->auxv, file, foffset)) + return 0; + + for (i = 1; i < info->thread_notes; ++i) + if (t->notes[i].data && + !writenote(&t->notes[i], file, foffset)) + return 0; + + first = 0; + t = t->next; + } while (t); + + return 1; +} + +static void free_note_info(struct elf_note_info *info) +{ + struct elf_thread_core_info *threads = info->thread; + while (threads) { + unsigned int i; + struct elf_thread_core_info *t = threads; + threads = t->next; + WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus); + for (i = 1; i < info->thread_notes; ++i) + kfree(t->notes[i].data); + kfree(t); + } + kfree(info->psinfo.data); +} + +#else + +/* Here is the structure in which status of each thread is captured. */ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ + struct task_struct *thread; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* ELF_CORE_XFPREG_TYPE */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every threads pr_status and then create + * a single section for them in the final core file. 
+ */ +static int elf_dump_thread_status(long signr, struct elf_thread_status *t) +{ + int sz = 0; + struct task_struct *p = t->thread; + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &(t->prstatus)); + t->num_notes++; + sz += notesize(&t->notes[0]); + + if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, + &t->fpu))) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + &(t->fpu)); + t->num_notes++; + sz += notesize(&t->notes[1]); + } + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE, + sizeof(t->xfpu), &t->xfpu); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + return sz; +} + +struct elf_note_info { + struct memelfnote *notes; + struct elf_prstatus *prstatus; /* NT_PRSTATUS */ + struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */ + struct list_head thread_list; + elf_fpregset_t *fpu; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t *xfpu; +#endif + int thread_status_size; + int numnote; +}; + +static int elf_note_info_init(struct elf_note_info *info) +{ + memset(info, 0, sizeof(*info)); + INIT_LIST_HEAD(&info->thread_list); + + /* Allocate space for six ELF notes */ + info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL); + if (!info->notes) + return 0; + info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); + if (!info->psinfo) + goto notes_free; + info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); + if (!info->prstatus) + goto psinfo_free; + info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); + if (!info->fpu) + goto prstatus_free; +#ifdef ELF_CORE_COPY_XFPREGS + info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); + if (!info->xfpu) + goto fpu_free; +#endif + return 1; +#ifdef ELF_CORE_COPY_XFPREGS + fpu_free: + kfree(info->fpu); +#endif + prstatus_free: + kfree(info->prstatus); + psinfo_free: + kfree(info->psinfo); + notes_free: + kfree(info->notes); + return 0; +} + +static int fill_note_info(struct elfhdr *elf, int phdrs, + struct elf_note_info *info, + long signr, struct pt_regs *regs) +{ + struct list_head *t; + + if (!elf_note_info_init(info)) + return 0; + + if (signr) { + struct core_thread *ct; + struct elf_thread_status *ets; + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + ets = kzalloc(sizeof(*ets), GFP_KERNEL); + if (!ets) + return 0; + + ets->thread = ct->task; + list_add(&ets->list, &info->thread_list); + } + + list_for_each(t, &info->thread_list) { + int sz; + + ets = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(signr, ets); + info->thread_status_size += sz; + } + } + /* now collect the dump for the current */ + memset(info->prstatus, 0, sizeof(*info->prstatus)); + fill_prstatus(info->prstatus, current, signr); + elf_core_copy_regs(&info->prstatus->pr_reg, regs); + + /* Set up header */ + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI); + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + + fill_note(info->notes + 0, "CORE", NT_PRSTATUS, + sizeof(*info->prstatus), info->prstatus); + fill_psinfo(info->psinfo, current->group_leader, current->mm); + fill_note(info->notes + 1, "CORE", NT_PRPSINFO, + sizeof(*info->psinfo), info->psinfo); + + info->numnote = 2; + + fill_auxv_note(&info->notes[info->numnote++], current->mm); + + /* Try to dump the FPU. 
*/ + info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, + info->fpu); + if (info->prstatus->pr_fpvalid) + fill_note(info->notes + info->numnote++, + "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, info->xfpu)) + fill_note(info->notes + info->numnote++, + "LINUX", ELF_CORE_XFPREG_TYPE, + sizeof(*info->xfpu), info->xfpu); +#endif + + return 1; +} + +static size_t get_note_info_size(struct elf_note_info *info) +{ + int sz = 0; + int i; + + for (i = 0; i < info->numnote; i++) + sz += notesize(info->notes + i); + + sz += info->thread_status_size; + + return sz; +} + +static int write_note_info(struct elf_note_info *info, + struct file *file, loff_t *foffset) +{ + int i; + struct list_head *t; + + for (i = 0; i < info->numnote; i++) + if (!writenote(info->notes + i, file, foffset)) + return 0; + + /* write out the thread status notes section */ + list_for_each(t, &info->thread_list) { + struct elf_thread_status *tmp = + list_entry(t, struct elf_thread_status, list); + + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], file, foffset)) + return 0; + } + + return 1; +} + +static void free_note_info(struct elf_note_info *info) +{ + while (!list_empty(&info->thread_list)) { + struct list_head *tmp = info->thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + + kfree(info->prstatus); + kfree(info->psinfo); + kfree(info->notes); + kfree(info->fpu); +#ifdef ELF_CORE_COPY_XFPREGS + kfree(info->xfpu); +#endif +} + +#endif + +static struct vm_area_struct *first_vma(struct task_struct *tsk, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret = tsk->mm->mmap; + + if (ret) + return ret; + return gate_vma; +} +/* + * Helper function for iterating across a vma list. It ensures that the caller + * will visit `gate_vma' prior to terminating the search. + */ +static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret; + + ret = this_vma->vm_next; + if (ret) + return ret; + if (this_vma == gate_vma) + return NULL; + return gate_vma; +} + +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + +static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma, + unsigned long mm_flags) +{ + struct vm_area_struct *vma; + size_t size = 0; + + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) + size += vma_dump_size(vma, mm_flags); + return size; +} + +/* + * Actual dumper + * + * This is a two-pass process; first we find the offsets of the bits, + * and then they are actually written out. If we run out of core limit + * we just truncate. 
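+ *
+ * The resulting file layout is: the ELF header, the program header
+ * table, the PT_NOTE payload, padding up to ELF_EXEC_PAGESIZE, the
+ * PT_LOAD segment data in vma order and, when more than PN_XNUM
+ * segments exist, one extra section header carrying the real segment
+ * count.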
+ */ +static int elf_core_dump(struct coredump_params *cprm) +{ + int has_dumped = 0; + mm_segment_t fs; + int segs; + size_t size = 0; + struct vm_area_struct *vma, *gate_vma; + struct elfhdr *elf = NULL; + loff_t offset = 0, dataoff, foffset; + struct elf_note_info info; + struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; + + /* + * We no longer stop all VM operations. + * + * This is because those proceses that could possibly change map_count + * or the mmap / vma pages are now blocked in do_exit on current + * finishing this core dump. + * + * Only ptrace can touch these memory addresses, but it doesn't change + * the map_count or the pages allocated. So no possibility of crashing + * exists while dumping the mm->vm_next areas to the core file. + */ + + /* alloc memory for large data structures: too large to be on stack */ + elf = kmalloc(sizeof(*elf), GFP_KERNEL); + if (!elf) + goto out; + /* + * The number of segs are recored into ELF header as 16bit value. + * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. + */ + segs = current->mm->map_count; + segs += elf_core_extra_phdrs(); + + gate_vma = get_gate_vma(current->mm); + if (gate_vma != NULL) + segs++; + + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ? PN_XNUM : segs; + + /* + * Collect all the non-memory information about the process for the + * notes. This also sets up the file header. + */ + if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs)) + goto cleanup; + + has_dumped = 1; + current->flags |= PF_DUMPCORE; + + fs = get_fs(); + set_fs(KERNEL_DS); + + offset += sizeof(*elf); /* Elf header */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ + foffset = offset; + + /* Write notes phdr entry */ + { + size_t sz = get_note_info_size(&info); + + sz += elf_coredump_extra_notes_size(); + + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) + goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); + offset += sz; + } + + dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); + + offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags); + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + + size += sizeof(*elf); + if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + + size += sizeof(*phdr4note); + if (size > cprm->limit + || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + goto end_coredump; + + /* Write program headers for segments dump */ + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { + struct elf_phdr phdr; + + phdr.p_type = PT_LOAD; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); + phdr.p_memsz = vma->vm_end - vma->vm_start; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; + if (vma->vm_flags & VM_WRITE) + phdr.p_flags |= PF_W; + if (vma->vm_flags & VM_EXEC) + phdr.p_flags |= PF_X; + phdr.p_align = ELF_EXEC_PAGESIZE; + + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; + } + + if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + goto end_coredump; + + /* write out the notes section */ + if (!write_note_info(&info, cprm->file, &foffset)) + goto end_coredump; + + if (elf_coredump_extra_notes_write(cprm->file, &foffset)) + goto end_coredump; + + /* Align to page */ + if (!dump_seek(cprm->file, dataoff - foffset)) + goto end_coredump; + + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { + unsigned long addr; + unsigned long end; + + end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags); + + for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { + struct page *page; + int stop; + + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + stop = ((size += PAGE_SIZE) > cprm->limit) || + !dump_write(cprm->file, kaddr, + PAGE_SIZE); + kunmap(page); + page_cache_release(page); + } else + stop = !dump_seek(cprm->file, PAGE_SIZE); + if (stop) + goto end_coredump; + } + } + + if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + goto end_coredump; + + if (e_phnum == PN_XNUM) { + size += sizeof(*shdr4extnum); + if (size > cprm->limit + || !dump_write(cprm->file, shdr4extnum, + sizeof(*shdr4extnum))) + goto end_coredump; + } + +end_coredump: + set_fs(fs); + +cleanup: + free_note_info(&info); + kfree(shdr4extnum); + kfree(phdr4note); + kfree(elf); +out: + return has_dumped; +} + +#endif /* CONFIG_ELF_CORE */ + +static int __init init_elf_binfmt(void) +{ + return register_binfmt(&elf_format); +} + +static void __exit exit_elf_binfmt(void) +{ + /* Remove the COFF and ELF loaders. */ + unregister_binfmt(&elf_format); +} + +core_initcall(init_elf_binfmt); +module_exit(exit_elf_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c new file mode 100644 index 00000000..2bc5dc64 --- /dev/null +++ b/fs/binfmt_elf_fdpic.c @@ -0,0 +1,1875 @@ +/* binfmt_elf_fdpic.c: FDPIC ELF binary format + * + * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * Derived from binfmt_elf.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +typedef char *elf_caddr_t; + +#if 0 +#define kdebug(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) +#else +#define kdebug(fmt, ...) do {} while(0) +#endif + +#if 0 +#define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ ) +#else +#define kdcore(fmt, ...) 
do {} while(0) +#endif + +MODULE_LICENSE("GPL"); + +static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); +static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, + struct mm_struct *, const char *); + +static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, + struct elf_fdpic_params *, + struct elf_fdpic_params *); + +#ifndef CONFIG_MMU +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, + unsigned long *); +static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, + struct file *, + struct mm_struct *); +#endif + +static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, + struct file *, struct mm_struct *); + +#ifdef CONFIG_ELF_CORE +static int elf_fdpic_core_dump(struct coredump_params *cprm); +#endif + +static struct linux_binfmt elf_fdpic_format = { + .module = THIS_MODULE, + .load_binary = load_elf_fdpic_binary, +#ifdef CONFIG_ELF_CORE + .core_dump = elf_fdpic_core_dump, +#endif + .min_coredump = ELF_EXEC_PAGESIZE, +}; + +static int __init init_elf_fdpic_binfmt(void) +{ + return register_binfmt(&elf_fdpic_format); +} + +static void __exit exit_elf_fdpic_binfmt(void) +{ + unregister_binfmt(&elf_fdpic_format); +} + +core_initcall(init_elf_fdpic_binfmt); +module_exit(exit_elf_fdpic_binfmt); + +static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) +{ + if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0) + return 0; + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) + return 0; + if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr)) + return 0; + if (!file->f_op || !file->f_op->mmap) + return 0; + return 1; +} + +/*****************************************************************************/ +/* + * read the program headers table into memory + */ +static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, + struct file *file) +{ + struct elf32_phdr *phdr; + unsigned long size; + int retval, loop; + + if (params->hdr.e_phentsize != sizeof(struct elf_phdr)) + return -ENOMEM; + if (params->hdr.e_phnum > 65536U / sizeof(struct elf_phdr)) + return -ENOMEM; + + size = params->hdr.e_phnum * sizeof(struct elf_phdr); + params->phdrs = kmalloc(size, GFP_KERNEL); + if (!params->phdrs) + return -ENOMEM; + + retval = kernel_read(file, params->hdr.e_phoff, + (char *) params->phdrs, size); + if (unlikely(retval != size)) + return retval < 0 ? 
retval : -ENOEXEC; + + /* determine stack size for this binary */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_GNU_STACK) + continue; + + if (phdr->p_flags & PF_X) + params->flags |= ELF_FDPIC_FLAG_EXEC_STACK; + else + params->flags |= ELF_FDPIC_FLAG_NOEXEC_STACK; + + params->stack_size = phdr->p_memsz; + break; + } + + return 0; +} + +/*****************************************************************************/ +/* + * load an fdpic binary into various bits of memory + */ +static int load_elf_fdpic_binary(struct linux_binprm *bprm, + struct pt_regs *regs) +{ + struct elf_fdpic_params exec_params, interp_params; + struct elf_phdr *phdr; + unsigned long stack_size, entryaddr; +#ifdef ELF_FDPIC_PLAT_INIT + unsigned long dynaddr; +#endif +#ifndef CONFIG_MMU + unsigned long stack_prot; +#endif + struct file *interpreter = NULL; /* to shut gcc up */ + char *interpreter_name = NULL; + int executable_stack; + int retval, i; + + kdebug("____ LOAD %d ____", current->pid); + + memset(&exec_params, 0, sizeof(exec_params)); + memset(&interp_params, 0, sizeof(interp_params)); + + exec_params.hdr = *(struct elfhdr *) bprm->buf; + exec_params.flags = ELF_FDPIC_FLAG_PRESENT | ELF_FDPIC_FLAG_EXECUTABLE; + + /* check that this is a binary we know how to deal with */ + retval = -ENOEXEC; + if (!is_elf_fdpic(&exec_params.hdr, bprm->file)) + goto error; + + /* read the program header table */ + retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file); + if (retval < 0) + goto error; + + /* scan for a program header that specifies an interpreter */ + phdr = exec_params.phdrs; + + for (i = 0; i < exec_params.hdr.e_phnum; i++, phdr++) { + switch (phdr->p_type) { + case PT_INTERP: + retval = -ENOMEM; + if (phdr->p_filesz > PATH_MAX) + goto error; + retval = -ENOENT; + if (phdr->p_filesz < 2) + goto error; + + /* read the name of the interpreter into memory */ + interpreter_name = kmalloc(phdr->p_filesz, GFP_KERNEL); + if (!interpreter_name) + goto error; + + retval = kernel_read(bprm->file, + phdr->p_offset, + interpreter_name, + phdr->p_filesz); + if (unlikely(retval != phdr->p_filesz)) { + if (retval >= 0) + retval = -ENOEXEC; + goto error; + } + + retval = -ENOENT; + if (interpreter_name[phdr->p_filesz - 1] != '\0') + goto error; + + kdebug("Using ELF interpreter %s", interpreter_name); + + /* replace the program with the interpreter */ + interpreter = open_exec(interpreter_name); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) { + interpreter = NULL; + goto error; + } + + /* + * If the binary is not readable then enforce + * mm->dumpable = 0 regardless of the interpreter's + * permissions. 
+ */ + if (file_permission(interpreter, MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + retval = kernel_read(interpreter, 0, bprm->buf, + BINPRM_BUF_SIZE); + if (unlikely(retval != BINPRM_BUF_SIZE)) { + if (retval >= 0) + retval = -ENOEXEC; + goto error; + } + + interp_params.hdr = *((struct elfhdr *) bprm->buf); + break; + + case PT_LOAD: +#ifdef CONFIG_MMU + if (exec_params.load_addr == 0) + exec_params.load_addr = phdr->p_vaddr; +#endif + break; + } + + } + + if (elf_check_const_displacement(&exec_params.hdr)) + exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; + + /* perform insanity checks on the interpreter */ + if (interpreter_name) { + retval = -ELIBBAD; + if (!is_elf_fdpic(&interp_params.hdr, interpreter)) + goto error; + + interp_params.flags = ELF_FDPIC_FLAG_PRESENT; + + /* read the interpreter's program header table */ + retval = elf_fdpic_fetch_phdrs(&interp_params, interpreter); + if (retval < 0) + goto error; + } + + stack_size = exec_params.stack_size; + if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) + executable_stack = EXSTACK_ENABLE_X; + else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) + executable_stack = EXSTACK_DISABLE_X; + else + executable_stack = EXSTACK_DEFAULT; + + if (stack_size == 0) { + stack_size = interp_params.stack_size; + if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) + executable_stack = EXSTACK_ENABLE_X; + else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) + executable_stack = EXSTACK_DISABLE_X; + else + executable_stack = EXSTACK_DEFAULT; + } + + retval = -ENOEXEC; + if (stack_size == 0) + goto error; + + if (elf_check_const_displacement(&interp_params.hdr)) + interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; + + /* flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + goto error; + + /* there's now no turning back... the old userspace image is dead, + * defunct, deceased, etc. 
after this point we have to exit via
+	 * error_kill */
+	set_personality(PER_LINUX_FDPIC);
+	if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
+		current->personality |= READ_IMPLIES_EXEC;
+
+	setup_new_exec(bprm);
+
+	set_binfmt(&elf_fdpic_format);
+
+	current->mm->start_code = 0;
+	current->mm->end_code = 0;
+	current->mm->start_stack = 0;
+	current->mm->start_data = 0;
+	current->mm->end_data = 0;
+	current->mm->context.exec_fdpic_loadmap = 0;
+	current->mm->context.interp_fdpic_loadmap = 0;
+
+	current->flags &= ~PF_FORKNOEXEC;
+
+#ifdef CONFIG_MMU
+	elf_fdpic_arch_lay_out_mm(&exec_params,
+				  &interp_params,
+				  &current->mm->start_stack,
+				  &current->mm->start_brk);
+
+	retval = setup_arg_pages(bprm, current->mm->start_stack,
+				 executable_stack);
+	if (retval < 0) {
+		send_sig(SIGKILL, current, 0);
+		goto error_kill;
+	}
+#endif
+
+	/* load the executable and interpreter into memory */
+	retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm,
+				    "executable");
+	if (retval < 0)
+		goto error_kill;
+
+	if (interpreter_name) {
+		retval = elf_fdpic_map_file(&interp_params, interpreter,
+					    current->mm, "interpreter");
+		if (retval < 0) {
+			printk(KERN_ERR "Unable to load interpreter\n");
+			goto error_kill;
+		}
+
+		allow_write_access(interpreter);
+		fput(interpreter);
+		interpreter = NULL;
+	}
+
+#ifdef CONFIG_MMU
+	if (!current->mm->start_brk)
+		current->mm->start_brk = current->mm->end_data;
+
+	current->mm->brk = current->mm->start_brk =
+		PAGE_ALIGN(current->mm->start_brk);
+
+#else
+	/* create a stack and brk area big enough for everyone
+	 * - the brk heap starts at the bottom and works up
+	 * - the stack starts at the top and works down
+	 */
+	stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK;
+	if (stack_size < PAGE_SIZE * 2)
+		stack_size = PAGE_SIZE * 2;
+
+	stack_prot = PROT_READ | PROT_WRITE;
+	if (executable_stack == EXSTACK_ENABLE_X ||
+	    (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
+		stack_prot |= PROT_EXEC;
+
+	down_write(&current->mm->mmap_sem);
+	current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
+					 MAP_PRIVATE | MAP_ANONYMOUS |
+					 MAP_UNINITIALIZED | MAP_GROWSDOWN,
+					 0);
+
+	if (IS_ERR_VALUE(current->mm->start_brk)) {
+		up_write(&current->mm->mmap_sem);
+		retval = current->mm->start_brk;
+		current->mm->start_brk = 0;
+		goto error_kill;
+	}
+
+	up_write(&current->mm->mmap_sem);
+
+	current->mm->brk = current->mm->start_brk;
+	current->mm->context.end_brk = current->mm->start_brk;
+	current->mm->context.end_brk +=
+		(stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0;
+	current->mm->start_stack = current->mm->start_brk + stack_size;
+#endif
+
+	install_exec_creds(bprm);
+	current->flags &= ~PF_FORKNOEXEC;
+	if (create_elf_fdpic_tables(bprm, current->mm,
+				    &exec_params, &interp_params) < 0)
+		goto error_kill;
+
+	kdebug("- start_code %lx", current->mm->start_code);
+	kdebug("- end_code %lx", current->mm->end_code);
+	kdebug("- start_data %lx", current->mm->start_data);
+	kdebug("- end_data %lx", current->mm->end_data);
+	kdebug("- start_brk %lx", current->mm->start_brk);
+	kdebug("- brk %lx", current->mm->brk);
+	kdebug("- start_stack %lx", current->mm->start_stack);
+
+#ifdef ELF_FDPIC_PLAT_INIT
+	/*
+	 * The ABI may specify that certain registers be set up in special
+	 * ways (on i386 %edx is the address of a DT_FINI function, for
+	 * example.  This macro performs whatever initialization to
+	 * the regs structure is required.
+ */ + dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr; + ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr, + dynaddr); +#endif + + /* everything is now ready... get the userspace context ready to roll */ + entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; + start_thread(regs, entryaddr, current->mm->start_stack); + + retval = 0; + +error: + if (interpreter) { + allow_write_access(interpreter); + fput(interpreter); + } + kfree(interpreter_name); + kfree(exec_params.phdrs); + kfree(exec_params.loadmap); + kfree(interp_params.phdrs); + kfree(interp_params.loadmap); + return retval; + + /* unrecoverable error - kill the process */ +error_kill: + send_sig(SIGSEGV, current, 0); + goto error; + +} + +/*****************************************************************************/ + +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + +/* + * present useful information to the program by shovelling it onto the new + * process's stack + */ +static int create_elf_fdpic_tables(struct linux_binprm *bprm, + struct mm_struct *mm, + struct elf_fdpic_params *exec_params, + struct elf_fdpic_params *interp_params) +{ + const struct cred *cred = current_cred(); + unsigned long sp, csp, nitems; + elf_caddr_t __user *argv, *envp; + size_t platform_len = 0, len; + char *k_platform, *k_base_platform; + char __user *u_platform, *u_base_platform, *p; + long hwcap; + int loop; + int nr; /* reset for each csp adjustment */ + +#ifdef CONFIG_MMU + /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions + * by the processes running on the same package. One thing we can do is + * to shuffle the initial stack for them, so we give the architecture + * an opportunity to do so here. + */ + sp = arch_align_stack(bprm->p); +#else + sp = mm->start_stack; + + /* stack the program arguments and environment */ + if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0) + return -EFAULT; +#endif + + hwcap = ELF_HWCAP; + + /* + * If this architecture has a platform capability string, copy it + * to userspace. In some cases (Sparc), this info is impossible + * for userspace to get any other way, in others (i386) it is + * merely difficult. + */ + k_platform = ELF_PLATFORM; + u_platform = NULL; + + if (k_platform) { + platform_len = strlen(k_platform) + 1; + sp -= platform_len; + u_platform = (char __user *) sp; + if (__copy_to_user(u_platform, k_platform, platform_len) != 0) + return -EFAULT; + } + + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. 
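The base-platform string is pushed exactly like the plain platform string just above: drop the stack pointer by the string length (NUL included), copy the string there, and remember the resulting user address so it can be advertised later through AT_PLATFORM / AT_BASE_PLATFORM. A standalone sketch of that push, where the flat stack_image buffer is an illustrative stand-in for the user stack that __copy_to_user() really targets:

    #include <string.h>

    static unsigned long push_platform_string(char *stack_image, unsigned long sp,
                                              const char *str, unsigned long *where)
    {
        size_t len = strlen(str) + 1;           /* include the NUL */

        sp -= len;
        memcpy(stack_image + sp, str, len);     /* models __copy_to_user() */
        *where = sp;                            /* address later placed in the auxv */
        return sp;
    }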
+ */ + k_base_platform = ELF_BASE_PLATFORM; + u_base_platform = NULL; + + if (k_base_platform) { + platform_len = strlen(k_base_platform) + 1; + sp -= platform_len; + u_base_platform = (char __user *) sp; + if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0) + return -EFAULT; + } + + sp &= ~7UL; + + /* stack the load map(s) */ + len = sizeof(struct elf32_fdpic_loadmap); + len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs; + sp = (sp - len) & ~7UL; + exec_params->map_addr = sp; + + if (copy_to_user((void __user *) sp, exec_params->loadmap, len) != 0) + return -EFAULT; + + current->mm->context.exec_fdpic_loadmap = (unsigned long) sp; + + if (interp_params->loadmap) { + len = sizeof(struct elf32_fdpic_loadmap); + len += sizeof(struct elf32_fdpic_loadseg) * + interp_params->loadmap->nsegs; + sp = (sp - len) & ~7UL; + interp_params->map_addr = sp; + + if (copy_to_user((void __user *) sp, interp_params->loadmap, + len) != 0) + return -EFAULT; + + current->mm->context.interp_fdpic_loadmap = (unsigned long) sp; + } + + /* force 16 byte _final_ alignment here for generality */ +#define DLINFO_ITEMS 15 + + nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + + (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; + + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) + nitems++; + + csp = sp; + sp -= nitems * 2 * sizeof(unsigned long); + sp -= (bprm->envc + 1) * sizeof(char *); /* envv[] */ + sp -= (bprm->argc + 1) * sizeof(char *); /* argv[] */ + sp -= 1 * sizeof(unsigned long); /* argc */ + + csp -= sp & 15UL; + sp -= sp & 15UL; + + /* put the ELF interpreter info on the stack */ +#define NEW_AUX_ENT(id, val) \ + do { \ + struct { unsigned long _id, _val; } __user *ent; \ + \ + ent = (void __user *) csp; \ + __put_user((id), &ent[nr]._id); \ + __put_user((val), &ent[nr]._val); \ + nr++; \ + } while (0) + + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_NULL, 0); + if (k_platform) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_PLATFORM, + (elf_addr_t) (unsigned long) u_platform); + } + + if (k_base_platform) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t) (unsigned long) u_base_platform); + } + + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + } + + nr = 0; + csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_HWCAP, hwcap); + NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); + NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); + NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum); + NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); + NEW_AUX_ENT(AT_FLAGS, 0); + NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); + NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid); + NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid); + NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); + +#ifdef ARCH_DLINFO + nr = 0; + csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); + + /* ARCH_DLINFO must come last so platform specific code can enforce + * special alignment requirements on the AUXV if necessary (eg. PPC). 
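The frame accounting a few lines above can be read as a small formula: auxv (id, value) pairs are counted first, then the envp[] and argv[] pointer arrays plus argc, and the whole frame is finally forced down to a 16-byte boundary. A standalone sketch of the same arithmetic, using the DLINFO_ITEMS value defined above (the have_* parameters are illustrative toggles for the optional entries):

    static unsigned long layout_initial_stack(unsigned long sp, int argc, int envc,
                                              int have_platform, int have_base_platform,
                                              int have_execfd, int arch_entries)
    {
        unsigned long nitems = 1 /* AT_NULL */ + 15 /* DLINFO_ITEMS */ +
            have_platform + have_base_platform + have_execfd + arch_entries;

        sp -= nitems * 2 * sizeof(unsigned long);       /* auxv pairs */
        sp -= (envc + 1) * sizeof(char *);              /* envp[] + NULL */
        sp -= (argc + 1) * sizeof(char *);              /* argv[] + NULL */
        sp -= sizeof(unsigned long);                    /* argc itself */
        sp -= sp & 15UL;                                /* force 16-byte alignment */
        return sp;
    }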
+ */ + ARCH_DLINFO; +#endif +#undef NEW_AUX_ENT + + /* allocate room for argv[] and envv[] */ + csp -= (bprm->envc + 1) * sizeof(elf_caddr_t); + envp = (elf_caddr_t __user *) csp; + csp -= (bprm->argc + 1) * sizeof(elf_caddr_t); + argv = (elf_caddr_t __user *) csp; + + /* stack argc */ + csp -= sizeof(unsigned long); + __put_user(bprm->argc, (unsigned long __user *) csp); + + BUG_ON(csp != sp); + + /* fill in the argv[] array */ +#ifdef CONFIG_MMU + current->mm->arg_start = bprm->p; +#else + current->mm->arg_start = current->mm->start_stack - + (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); +#endif + + p = (char __user *) current->mm->arg_start; + for (loop = bprm->argc; loop > 0; loop--) { + __put_user((elf_caddr_t) p, argv++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(NULL, argv); + current->mm->arg_end = (unsigned long) p; + + /* fill in the envv[] array */ + current->mm->env_start = (unsigned long) p; + for (loop = bprm->envc; loop > 0; loop--) { + __put_user((elf_caddr_t)(unsigned long) p, envp++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(NULL, envp); + current->mm->env_end = (unsigned long) p; + + mm->start_stack = (unsigned long) sp; + return 0; +} + +/*****************************************************************************/ +/* + * transfer the program arguments and environment from the holding pages onto + * the stack + */ +#ifndef CONFIG_MMU +static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *_sp) +{ + unsigned long index, stop, sp; + char *src; + int ret = 0; + + stop = bprm->p >> PAGE_SHIFT; + sp = *_sp; + + for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { + src = kmap(bprm->page[index]); + sp -= PAGE_SIZE; + if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0) + ret = -EFAULT; + kunmap(bprm->page[index]); + if (ret < 0) + goto out; + } + + *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; + +out: + return ret; +} +#endif + +/*****************************************************************************/ +/* + * load the appropriate binary image (executable or interpreter) into memory + * - we assume no MMU is available + * - if no other PIC bits are set in params->hdr->e_flags + * - we assume that the LOADable segments in the binary are independently relocatable + * - we assume R/O executable segments are shareable + * - else + * - we assume the loadable parts of the image to require fixed displacement + * - the image is not shareable + */ +static int elf_fdpic_map_file(struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm, + const char *what) +{ + struct elf32_fdpic_loadmap *loadmap; +#ifdef CONFIG_MMU + struct elf32_fdpic_loadseg *mseg; +#endif + struct elf32_fdpic_loadseg *seg; + struct elf32_phdr *phdr; + unsigned long load_addr, stop; + unsigned nloads, tmp; + size_t size; + int loop, ret; + + /* allocate a load map table */ + nloads = 0; + for (loop = 0; loop < params->hdr.e_phnum; loop++) + if (params->phdrs[loop].p_type == PT_LOAD) + nloads++; + + if (nloads == 0) + return -ELIBBAD; + + size = sizeof(*loadmap) + nloads * sizeof(*seg); + loadmap = kzalloc(size, GFP_KERNEL); + if (!loadmap) + return -ENOMEM; + + params->loadmap = loadmap; + + loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; + loadmap->nsegs = nloads; + + load_addr = params->load_addr; + seg = loadmap->segs; + + /* map the requested LOADs into the memory space */ + 
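Every later address question in this function ("where did the entry point land?", "where are the PHDRs?", "where is DYNAMIC?") is answered by walking the load map built here: find the segment whose p_vaddr range covers the file's virtual address and add that segment's displacement. A standalone sketch of the walk done just below, with a struct mirroring the elf32_fdpic_loadseg fields used in this file (the _example names are only for illustration):

    struct fdpic_seg_example { unsigned long addr, p_vaddr, p_memsz; };

    static unsigned long fdpic_translate(const struct fdpic_seg_example *seg,
                                         int nsegs, unsigned long vaddr)
    {
        int i;

        for (i = 0; i < nsegs; i++, seg++)
            if (vaddr >= seg->p_vaddr &&
                vaddr < seg->p_vaddr + seg->p_memsz)
                return vaddr - seg->p_vaddr + seg->addr;
        return 0;       /* not covered by any PT_LOAD */
    }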
switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { + case ELF_FDPIC_FLAG_CONSTDISP: + case ELF_FDPIC_FLAG_CONTIGUOUS: +#ifndef CONFIG_MMU + ret = elf_fdpic_map_file_constdisp_on_uclinux(params, file, mm); + if (ret < 0) + return ret; + break; +#endif + default: + ret = elf_fdpic_map_file_by_direct_mmap(params, file, mm); + if (ret < 0) + return ret; + break; + } + + /* map the entry point */ + if (params->hdr.e_entry) { + seg = loadmap->segs; + for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { + if (params->hdr.e_entry >= seg->p_vaddr && + params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) { + params->entry_addr = + (params->hdr.e_entry - seg->p_vaddr) + + seg->addr; + break; + } + } + } + + /* determine where the program header table has wound up if mapped */ + stop = params->hdr.e_phoff; + stop += params->hdr.e_phnum * sizeof (struct elf_phdr); + phdr = params->phdrs; + + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_LOAD) + continue; + + if (phdr->p_offset > params->hdr.e_phoff || + phdr->p_offset + phdr->p_filesz < stop) + continue; + + seg = loadmap->segs; + for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { + if (phdr->p_vaddr >= seg->p_vaddr && + phdr->p_vaddr + phdr->p_filesz <= + seg->p_vaddr + seg->p_memsz) { + params->ph_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr + + params->hdr.e_phoff - phdr->p_offset; + break; + } + } + break; + } + + /* determine where the dynamic section has wound up if there is one */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (phdr->p_type != PT_DYNAMIC) + continue; + + seg = loadmap->segs; + for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { + if (phdr->p_vaddr >= seg->p_vaddr && + phdr->p_vaddr + phdr->p_memsz <= + seg->p_vaddr + seg->p_memsz) { + params->dynamic_addr = + (phdr->p_vaddr - seg->p_vaddr) + + seg->addr; + + /* check the dynamic section contains at least + * one item, and that the last item is a NULL + * entry */ + if (phdr->p_memsz == 0 || + phdr->p_memsz % sizeof(Elf32_Dyn) != 0) + goto dynamic_error; + + tmp = phdr->p_memsz / sizeof(Elf32_Dyn); + if (((Elf32_Dyn *) + params->dynamic_addr)[tmp - 1].d_tag != 0) + goto dynamic_error; + break; + } + } + break; + } + + /* now elide adjacent segments in the load map on MMU linux + * - on uClinux the holes between may actually be filled with system + * stuff or stuff from other processes + */ +#ifdef CONFIG_MMU + nloads = loadmap->nsegs; + mseg = loadmap->segs; + seg = mseg + 1; + for (loop = 1; loop < nloads; loop++) { + /* see if we have a candidate for merging */ + if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) { + load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz); + if (load_addr == (seg->addr & PAGE_MASK)) { + mseg->p_memsz += + load_addr - + (mseg->addr + mseg->p_memsz); + mseg->p_memsz += seg->addr & ~PAGE_MASK; + mseg->p_memsz += seg->p_memsz; + loadmap->nsegs--; + continue; + } + } + + mseg++; + if (mseg != seg) + *mseg = *seg; + } +#endif + + kdebug("Mapped Object [%s]:", what); + kdebug("- elfhdr : %lx", params->elfhdr_addr); + kdebug("- entry : %lx", params->entry_addr); + kdebug("- PHDR[] : %lx", params->ph_addr); + kdebug("- DYNAMIC[]: %lx", params->dynamic_addr); + seg = loadmap->segs; + for (loop = 0; loop < loadmap->nsegs; loop++, seg++) + kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]", + loop, + seg->addr, seg->addr + seg->p_memsz - 1, + seg->p_vaddr, seg->p_memsz); + + return 0; + +dynamic_error: + printk("ELF FDPIC %s with invalid DYNAMIC section 
(inode=%lu)\n", + what, file->f_path.dentry->d_inode->i_ino); + return -ELIBBAD; +} + +/*****************************************************************************/ +/* + * map a file with constant displacement under uClinux + */ +#ifndef CONFIG_MMU +static int elf_fdpic_map_file_constdisp_on_uclinux( + struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm) +{ + struct elf32_fdpic_loadseg *seg; + struct elf32_phdr *phdr; + unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; + loff_t fpos; + int loop, ret; + + load_addr = params->load_addr; + seg = params->loadmap->segs; + + /* determine the bounds of the contiguous overall allocation we must + * make */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (params->phdrs[loop].p_type != PT_LOAD) + continue; + + if (base > phdr->p_vaddr) + base = phdr->p_vaddr; + if (top < phdr->p_vaddr + phdr->p_memsz) + top = phdr->p_vaddr + phdr->p_memsz; + } + + /* allocate one big anon block for everything */ + mflags = MAP_PRIVATE; + if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) + mflags |= MAP_EXECUTABLE; + + down_write(&mm->mmap_sem); + maddr = do_mmap(NULL, load_addr, top - base, + PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); + up_write(&mm->mmap_sem); + if (IS_ERR_VALUE(maddr)) + return (int) maddr; + + if (load_addr != 0) + load_addr += PAGE_ALIGN(top - base); + + /* and then load the file segments into it */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + if (params->phdrs[loop].p_type != PT_LOAD) + continue; + + fpos = phdr->p_offset; + + seg->addr = maddr + (phdr->p_vaddr - base); + seg->p_vaddr = phdr->p_vaddr; + seg->p_memsz = phdr->p_memsz; + + ret = file->f_op->read(file, (void *) seg->addr, + phdr->p_filesz, &fpos); + if (ret < 0) + return ret; + + /* map the ELF header address if in this segment */ + if (phdr->p_offset == 0) + params->elfhdr_addr = seg->addr; + + /* clear any space allocated but not loaded */ + if (phdr->p_filesz < phdr->p_memsz) { + if (clear_user((void *) (seg->addr + phdr->p_filesz), + phdr->p_memsz - phdr->p_filesz)) + return -EFAULT; + } + + if (mm) { + if (phdr->p_flags & PF_X) { + if (!mm->start_code) { + mm->start_code = seg->addr; + mm->end_code = seg->addr + + phdr->p_memsz; + } + } else if (!mm->start_data) { + mm->start_data = seg->addr; + mm->end_data = seg->addr + phdr->p_memsz; + } + } + + seg++; + } + + return 0; +} +#endif + +/*****************************************************************************/ +/* + * map a binary by direct mmap() of the individual PT_LOAD segments + */ +static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, + struct file *file, + struct mm_struct *mm) +{ + struct elf32_fdpic_loadseg *seg; + struct elf32_phdr *phdr; + unsigned long load_addr, delta_vaddr; + int loop, dvset; + + load_addr = params->load_addr; + delta_vaddr = 0; + dvset = 0; + + seg = params->loadmap->segs; + + /* deal with each load segment separately */ + phdr = params->phdrs; + for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { + unsigned long maddr, disp, excess, excess1; + int prot = 0, flags; + + if (phdr->p_type != PT_LOAD) + continue; + + kdebug("[LOAD] va=%lx of=%lx fs=%lx ms=%lx", + (unsigned long) phdr->p_vaddr, + (unsigned long) phdr->p_offset, + (unsigned long) phdr->p_filesz, + (unsigned long) phdr->p_memsz); + + /* determine the mapping parameters */ + if (phdr->p_flags & PF_R) prot |= PROT_READ; + if (phdr->p_flags & PF_W) prot |= 
PROT_WRITE; + if (phdr->p_flags & PF_X) prot |= PROT_EXEC; + + flags = MAP_PRIVATE | MAP_DENYWRITE; + if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) + flags |= MAP_EXECUTABLE; + + maddr = 0; + + switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { + case ELF_FDPIC_FLAG_INDEPENDENT: + /* PT_LOADs are independently locatable */ + break; + + case ELF_FDPIC_FLAG_HONOURVADDR: + /* the specified virtual address must be honoured */ + maddr = phdr->p_vaddr; + flags |= MAP_FIXED; + break; + + case ELF_FDPIC_FLAG_CONSTDISP: + /* constant displacement + * - can be mapped anywhere, but must be mapped as a + * unit + */ + if (!dvset) { + maddr = load_addr; + delta_vaddr = phdr->p_vaddr; + dvset = 1; + } else { + maddr = load_addr + phdr->p_vaddr - delta_vaddr; + flags |= MAP_FIXED; + } + break; + + case ELF_FDPIC_FLAG_CONTIGUOUS: + /* contiguity handled later */ + break; + + default: + BUG(); + } + + maddr &= PAGE_MASK; + + /* create the mapping */ + disp = phdr->p_vaddr & ~PAGE_MASK; + down_write(&mm->mmap_sem); + maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, + phdr->p_offset - disp); + up_write(&mm->mmap_sem); + + kdebug("mmap[%d] sz=%lx pr=%x fl=%x of=%lx --> %08lx", + loop, phdr->p_memsz + disp, prot, flags, + phdr->p_offset - disp, maddr); + + if (IS_ERR_VALUE(maddr)) + return (int) maddr; + + if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == + ELF_FDPIC_FLAG_CONTIGUOUS) + load_addr += PAGE_ALIGN(phdr->p_memsz + disp); + + seg->addr = maddr + disp; + seg->p_vaddr = phdr->p_vaddr; + seg->p_memsz = phdr->p_memsz; + + /* map the ELF header address if in this segment */ + if (phdr->p_offset == 0) + params->elfhdr_addr = seg->addr; + + /* clear the bit between beginning of mapping and beginning of + * PT_LOAD */ + if (prot & PROT_WRITE && disp > 0) { + kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); + if (clear_user((void __user *) maddr, disp)) + return -EFAULT; + maddr += disp; + } + + /* clear any space allocated but not loaded + * - on uClinux we can just clear the lot + * - on MMU linux we'll get a SIGBUS beyond the last page + * extant in the file + */ + excess = phdr->p_memsz - phdr->p_filesz; + excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); + +#ifdef CONFIG_MMU + if (excess > excess1) { + unsigned long xaddr = maddr + phdr->p_filesz + excess1; + unsigned long xmaddr; + + flags |= MAP_FIXED | MAP_ANONYMOUS; + down_write(&mm->mmap_sem); + xmaddr = do_mmap(NULL, xaddr, excess - excess1, + prot, flags, 0); + up_write(&mm->mmap_sem); + + kdebug("mmap[%d] " + " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", + loop, xaddr, excess - excess1, prot, flags, + xmaddr); + + if (xmaddr != xaddr) + return -ENOMEM; + } + + if (prot & PROT_WRITE && excess1 > 0) { + kdebug("clear[%d] ad=%lx sz=%lx", + loop, maddr + phdr->p_filesz, excess1); + if (clear_user((void __user *) maddr + phdr->p_filesz, + excess1)) + return -EFAULT; + } + +#else + if (excess > 0) { + kdebug("clear[%d] ad=%lx sz=%lx", + loop, maddr + phdr->p_filesz, excess); + if (clear_user((void *) maddr + phdr->p_filesz, excess)) + return -EFAULT; + } +#endif + + if (mm) { + if (phdr->p_flags & PF_X) { + if (!mm->start_code) { + mm->start_code = maddr; + mm->end_code = maddr + phdr->p_memsz; + } + } else if (!mm->start_data) { + mm->start_data = maddr; + mm->end_data = maddr + phdr->p_memsz; + } + } + + seg++; + } + + return 0; +} + +/*****************************************************************************/ +/* + * ELF-FDPIC core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge 
+ * + * Modelled on fs/binfmt_elf.c core dumper + */ +#ifdef CONFIG_ELF_CORE + +/* + * Decide whether a segment is worth dumping; default is yes to be + * sure (missing info is worse than too much; etc). + * Personally I'd include everything, and use the coredump limit... + * + * I think we should skip something. But I am not sure how. H.J. + */ +static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) +{ + int dump_ok; + + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* If we may not read the contents, don't allow us to dump + * them either. "dump_write()" can't handle it anyway. + */ + if (!(vma->vm_flags & VM_READ)) { + kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); + return 0; + } + + /* By default, dump shared memory if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) { + dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); + kdcore("%08lx: %08lx: %s (share)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + return dump_ok; + } + + dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); + kdcore("%08lx: %08lx: %s (share)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + return dump_ok; + } + +#ifdef CONFIG_MMU + /* By default, if it hasn't been written to, don't write it out */ + if (!vma->anon_vma) { + dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); + kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + return dump_ok; + } +#endif + + dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); + kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, + dump_ok ? 
"yes" : "no"); + return dump_ok; +} + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup(strlen(en->name) + 1, 4); + sz += roundup(en->datasz, 4); + + return sz; +} + +/* #define DEBUG */ + +#define DUMP_WRITE(addr, nr, foffset) \ + do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0) + +static int alignfile(struct file *file, loff_t *foffset) +{ + static const char buf[4] = { 0, }; + DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset); + return 1; +} + +static int writenote(struct memelfnote *men, struct file *file, + loff_t *foffset) +{ + struct elf_note en; + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + DUMP_WRITE(&en, sizeof(en), foffset); + DUMP_WRITE(men->name, en.n_namesz, foffset); + if (!alignfile(file, foffset)) + return 0; + DUMP_WRITE(men->data, men->datasz, foffset); + if (!alignfile(file, foffset)) + return 0; + + return 1; +} +#undef DUMP_WRITE + +static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) +{ + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_FDPIC_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + return; +} + +static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static inline void fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except + * registers which need to be filled up separately. + */ +static void fill_prstatus(struct elf_prstatus *prstatus, + struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + rcu_read_lock(); + prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + prstatus->pr_pid = task_pid_vnr(p); + prstatus->pr_pgrp = task_pgrp_vnr(p); + prstatus->pr_sid = task_session_vnr(p); + if (thread_group_leader(p)) { + struct task_cputime cputime; + + /* + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. 
+ */ + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); + } else { + cputime_to_timeval(p->utime, &prstatus->pr_utime); + cputime_to_timeval(p->stime, &prstatus->pr_stime); + } + cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); + cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + + prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; + prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; +} + +static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, + struct mm_struct *mm) +{ + const struct cred *cred; + unsigned int i, len; + + /* first copy the parameters from user space */ + memset(psinfo, 0, sizeof(struct elf_prpsinfo)); + + len = mm->arg_end - mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ - 1; + if (copy_from_user(&psinfo->pr_psargs, + (const char __user *) mm->arg_start, len)) + return -EFAULT; + for (i = 0; i < len; i++) + if (psinfo->pr_psargs[i] == 0) + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + + rcu_read_lock(); + psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + psinfo->pr_pid = task_pid_vnr(p); + psinfo->pr_pgrp = task_pgrp_vnr(p); + psinfo->pr_sid = task_session_vnr(p); + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, cred->uid); + SET_GID(psinfo->pr_gid, cred->gid); + rcu_read_unlock(); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + + return 0; +} + +/* Here is the structure in which status of each thread is captured. */ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ + struct task_struct *thread; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* ELF_CORE_XFPREG_TYPE */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every thread's pr_status and then create + * a single section for them in the final core file. 
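Referring back to notesize() and writenote() above: each note in the PT_NOTE segment is a fixed (n_namesz, n_descsz, n_type) triple followed by the NUL-terminated name and then the payload, each padded out to four bytes. A standalone sketch of the size calculation, assuming the 4-byte-per-field Elf32 note header used here:

    #include <string.h>

    static unsigned int note_record_size(const char *name, unsigned int datasz)
    {
        unsigned int sz = 3 * 4;                /* n_namesz, n_descsz, n_type */

        sz += (strlen(name) + 1 + 3) & ~3u;     /* name + NUL, padded */
        sz += (datasz + 3) & ~3u;               /* payload, padded */
        return sz;
    }

For a "CORE" note with a 148-byte payload (a plausible elf_prstatus size on a 32-bit target, used here only as an example) this gives 12 + 8 + 148 = 168 bytes.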
+ */ +static int elf_dump_thread_status(long signr, struct elf_thread_status *t) +{ + struct task_struct *p = t->thread; + int sz = 0; + + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &t->prstatus); + t->num_notes++; + sz += notesize(&t->notes[0]); + + t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu); + if (t->prstatus.pr_fpvalid) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + &t->fpu); + t->num_notes++; + sz += notesize(&t->notes[1]); + } + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE, + sizeof(t->xfpu), &t->xfpu); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + return sz; +} + +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + +/* + * dump the segments for an MMU process + */ +#ifdef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, size_t *size, + unsigned long *limit, unsigned long mm_flags) +{ + struct vm_area_struct *vma; + int err = 0; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + unsigned long addr; + + if (!maydump(vma, mm_flags)) + continue; + + for (addr = vma->vm_start; addr < vma->vm_end; + addr += PAGE_SIZE) { + struct page *page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + *size += PAGE_SIZE; + if (*size > *limit) + err = -EFBIG; + else if (!dump_write(file, kaddr, PAGE_SIZE)) + err = -EIO; + kunmap(page); + page_cache_release(page); + } else if (!dump_seek(file, PAGE_SIZE)) + err = -EFBIG; + if (err) + goto out; + } + } +out: + return err; +} +#endif + +/* + * dump the segments for a NOMMU process + */ +#ifndef CONFIG_MMU +static int elf_fdpic_dump_segments(struct file *file, size_t *size, + unsigned long *limit, unsigned long mm_flags) +{ + struct vm_area_struct *vma; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + if (!maydump(vma, mm_flags)) + continue; + + if ((*size += PAGE_SIZE) > *limit) + return -EFBIG; + + if (!dump_write(file, (void *) vma->vm_start, + vma->vm_end - vma->vm_start)) + return -EIO; + } + + return 0; +} +#endif + +static size_t elf_core_vma_data_size(unsigned long mm_flags) +{ + struct vm_area_struct *vma; + size_t size = 0; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if (maydump(vma, mm_flags)) + size += vma->vm_end - vma->vm_start; + return size; +} + +/* + * Actual dumper + * + * This is a two-pass process; first we find the offsets of the bits, + * and then they are actually written out. If we run out of core limit + * we just truncate. 
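The first pass of the dumper below only does offset arithmetic: ELF header, then one program header per segment, then the notes, then the page-aligned memory contents. A rough standalone sketch of that layout pass, assuming Elf32 sizes (52-byte ELF header, 32-byte program headers) and leaving out the arch-specific extra phdrs/data and the PN_XNUM section header:

    static unsigned long long plan_core_layout(int segs, unsigned int notes_sz,
                                               unsigned long long vma_bytes,
                                               unsigned long pagesize)
    {
        unsigned long long offset = 0;

        offset += 52;                                   /* ELF header */
        offset += (unsigned long long)segs * 32;        /* program header table */
        offset += notes_sz;                             /* PT_NOTE payload */
        offset = (offset + pagesize - 1) & ~(unsigned long long)(pagesize - 1);
        offset += vma_bytes;                            /* dumped segment data */
        return offset;                                  /* approximate core file size */
    }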
+ */ +static int elf_fdpic_core_dump(struct coredump_params *cprm) +{ +#define NUM_NOTES 6 + int has_dumped = 0; + mm_segment_t fs; + int segs; + size_t size = 0; + int i; + struct vm_area_struct *vma; + struct elfhdr *elf = NULL; + loff_t offset = 0, dataoff, foffset; + int numnote; + struct memelfnote *notes = NULL; + struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ + struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ + LIST_HEAD(thread_list); + struct list_head *t; + elf_fpregset_t *fpu = NULL; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t *xfpu = NULL; +#endif + int thread_status_size = 0; + elf_addr_t *auxv; + struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; + + /* + * We no longer stop all VM operations. + * + * This is because those proceses that could possibly change map_count + * or the mmap / vma pages are now blocked in do_exit on current + * finishing this core dump. + * + * Only ptrace can touch these memory addresses, but it doesn't change + * the map_count or the pages allocated. So no possibility of crashing + * exists while dumping the mm->vm_next areas to the core file. + */ + + /* alloc memory for large data structures: too large to be on stack */ + elf = kmalloc(sizeof(*elf), GFP_KERNEL); + if (!elf) + goto cleanup; + prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); + if (!prstatus) + goto cleanup; + psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); + if (!psinfo) + goto cleanup; + notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL); + if (!notes) + goto cleanup; + fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); + if (!fpu) + goto cleanup; +#ifdef ELF_CORE_COPY_XFPREGS + xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); + if (!xfpu) + goto cleanup; +#endif + + if (cprm->signr) { + struct core_thread *ct; + struct elf_thread_status *tmp; + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + goto cleanup; + + tmp->thread = ct->task; + list_add(&tmp->list, &thread_list); + } + + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp; + int sz; + + tmp = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(cprm->signr, tmp); + thread_status_size += sz; + } + } + + /* now collect the dump for the current */ + fill_prstatus(prstatus, current, cprm->signr); + elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); + + segs = current->mm->map_count; + segs += elf_core_extra_phdrs(); + + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ? PN_XNUM : segs; + + /* Set up header */ + fill_elf_fdpic_header(elf, e_phnum); + + has_dumped = 1; + current->flags |= PF_DUMPCORE; + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + + fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); + fill_psinfo(psinfo, current->group_leader, current->mm); + fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + + numnote = 2; + + auxv = (elf_addr_t *) current->mm->saved_auxv; + + i = 0; + do + i += 2; + while (auxv[i - 2] != AT_NULL); + fill_note(¬es[numnote++], "CORE", NT_AUXV, + i * sizeof(elf_addr_t), auxv); + + /* Try to dump the FPU. 
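The do/while just above sizes the NT_AUXV note: saved_auxv is a flat array of (id, value) pairs and the scan counts elements up to and including the terminating AT_NULL pair. The same scan as a standalone helper (AT_NULL is 0):

    static int auxv_element_count(const unsigned long *auxv)
    {
        int i = 0;

        do
            i += 2;                     /* one (id, value) pair */
        while (auxv[i - 2] != 0);       /* stop after the AT_NULL pair */

        return i;                       /* unsigned longs placed in the note */
    }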
*/ + if ((prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, cprm->regs, fpu))) + fill_note(notes + numnote++, + "CORE", NT_PRFPREG, sizeof(*fpu), fpu); +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, xfpu)) + fill_note(notes + numnote++, + "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu); +#endif + + fs = get_fs(); + set_fs(KERNEL_DS); + + offset += sizeof(*elf); /* Elf header */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ + foffset = offset; + + /* Write notes phdr entry */ + { + int sz = 0; + + for (i = 0; i < numnote; i++) + sz += notesize(notes + i); + + sz += thread_status_size; + + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) + goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); + offset += sz; + } + + /* Page-align dumped data */ + dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); + + offset += elf_core_vma_data_size(cprm->mm_flags); + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + + size += sizeof(*elf); + if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + + size += sizeof(*phdr4note); + if (size > cprm->limit + || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + goto end_coredump; + + /* write program headers for segments dump */ + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + struct elf_phdr phdr; + size_t sz; + + sz = vma->vm_end - vma->vm_start; + + phdr.p_type = PT_LOAD; + phdr.p_offset = offset; + phdr.p_vaddr = vma->vm_start; + phdr.p_paddr = 0; + phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; + phdr.p_memsz = sz; + offset += phdr.p_filesz; + phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; + if (vma->vm_flags & VM_WRITE) + phdr.p_flags |= PF_W; + if (vma->vm_flags & VM_EXEC) + phdr.p_flags |= PF_X; + phdr.p_align = ELF_EXEC_PAGESIZE; + + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; + } + + if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + goto end_coredump; + + /* write out the notes section */ + for (i = 0; i < numnote; i++) + if (!writenote(notes + i, cprm->file, &foffset)) + goto end_coredump; + + /* write out the thread status notes section */ + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp = + list_entry(t, struct elf_thread_status, list); + + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], cprm->file, &foffset)) + goto end_coredump; + } + + if (!dump_seek(cprm->file, dataoff - foffset)) + goto end_coredump; + + if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, + cprm->mm_flags) < 0) + goto end_coredump; + + if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + goto end_coredump; + + if (e_phnum == PN_XNUM) { + size += sizeof(*shdr4extnum); + if (size > cprm->limit + || !dump_write(cprm->file, shdr4extnum, + sizeof(*shdr4extnum))) + goto end_coredump; + } + + if (cprm->file->f_pos != offset) { + /* Sanity check */ + printk(KERN_WARNING + "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", + cprm->file->f_pos, offset); + } + +end_coredump: + set_fs(fs); + +cleanup: + while (!list_empty(&thread_list)) { + struct list_head *tmp = thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + kfree(phdr4note); + kfree(elf); + kfree(prstatus); + kfree(psinfo); + kfree(notes); + kfree(fpu); + kfree(shdr4extnum); +#ifdef ELF_CORE_COPY_XFPREGS + kfree(xfpu); +#endif + return has_dumped; +#undef NUM_NOTES +} + +#endif /* CONFIG_ELF_CORE */ diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c new file mode 100644 index 00000000..b8e8b0ac --- /dev/null +++ b/fs/binfmt_em86.c @@ -0,0 +1,113 @@ +/* + * linux/fs/binfmt_em86.c + * + * Based on linux/fs/binfmt_script.c + * Copyright (C) 1996 Martin von Löwis + * original #!-checking implemented by tytso. + * + * em86 changes Copyright (C) 1997 Jim Paradis + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define EM86_INTERP "/usr/bin/em86" +#define EM86_I_NAME "em86" + +static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) +{ + char *interp, *i_name, *i_arg; + struct file * file; + int retval; + struct elfhdr elf_ex; + + /* Make sure this is a Linux/Intel ELF executable... */ + elf_ex = *((struct elfhdr *)bprm->buf); + + if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + return -ENOEXEC; + + /* First of all, some simple consistency checks */ + if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) || + (!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) || + (!bprm->file->f_op || !bprm->file->f_op->mmap)) { + return -ENOEXEC; + } + + bprm->recursion_depth++; /* Well, the bang-shell is implicit... */ + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + + /* Unlike in the script case, we don't have to do any hairy + * parsing to find our interpreter... it's hardcoded! 
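What the argv splice below amounts to, for a hypothetical invocation of an x86 binary ./prog with one argument (strings are copied in reverse, so the last one copied ends up as argv[0]):

    before:  argv = { "./prog", "-x" }
    after :  argv = { "em86", "/full/path/to/prog", "-x" }

so that search_binary_handler() ends up executing /usr/bin/em86, which receives the emulated binary's path as its first argument. The "./prog" and "-x" values are of course only an example.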
+ */ + interp = EM86_INTERP; + i_name = EM86_I_NAME; + i_arg = NULL; /* We reserve the right to add an arg later */ + + /* + * Splice in (1) the interpreter's name for argv[0] + * (2) (optional) argument to interpreter + * (3) filename of emulated file (replace argv[0]) + * + * This is done in reverse order, because of how the + * user environment and arguments are stored. + */ + remove_arg_zero(bprm); + retval = copy_strings_kernel(1, &bprm->filename, bprm); + if (retval < 0) return retval; + bprm->argc++; + if (i_arg) { + retval = copy_strings_kernel(1, &i_arg, bprm); + if (retval < 0) return retval; + bprm->argc++; + } + retval = copy_strings_kernel(1, &i_name, bprm); + if (retval < 0) return retval; + bprm->argc++; + + /* + * OK, now restart the process with the interpreter's inode. + * Note that we use open_exec() as the name is now in kernel + * space, and we don't need to copy it. + */ + file = open_exec(interp); + if (IS_ERR(file)) + return PTR_ERR(file); + + bprm->file = file; + + retval = prepare_binprm(bprm); + if (retval < 0) + return retval; + + return search_binary_handler(bprm, regs); +} + +static struct linux_binfmt em86_format = { + .module = THIS_MODULE, + .load_binary = load_em86, +}; + +static int __init init_em86_binfmt(void) +{ + return register_binfmt(&em86_format); +} + +static void __exit exit_em86_binfmt(void) +{ + unregister_binfmt(&em86_format); +} + +core_initcall(init_em86_binfmt); +module_exit(exit_em86_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c new file mode 100644 index 00000000..1bffbe0e --- /dev/null +++ b/fs/binfmt_flat.c @@ -0,0 +1,960 @@ +/****************************************************************************/ +/* + * linux/fs/binfmt_flat.c + * + * Copyright (C) 2000-2003 David McCullough + * Copyright (C) 2002 Greg Ungerer + * Copyright (C) 2002 SnapGear, by Paul Dale + * Copyright (C) 2000, 2001 Lineo, by David McCullough + * based heavily on: + * + * linux/fs/binfmt_aout.c: + * Copyright (C) 1991, 1992, 1996 Linus Torvalds + * linux/fs/binfmt_flat.c for 2.0 kernel + * Copyright (C) 1998 Kenneth Albanowski + * JAN/99 -- coded full program relocation (gerg@snapgear.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/****************************************************************************/ + +#if 0 +#define DEBUG 1 +#endif + +#ifdef DEBUG +#define DBG_FLT(a...) printk(a) +#else +#define DBG_FLT(a...) +#endif + +/* + * User data (data section and bss) needs to be aligned. + * We pick 0x20 here because it is the max value elf2flt has always + * used in producing FLAT files, and because it seems to be large + * enough to make all the gcc alignment related tests happy. + */ +#define FLAT_DATA_ALIGN (0x20) + +/* + * User data (stack) also needs to be aligned. + * Here we can be a bit looser than the data sections since this + * needs to only meet arch ABI requirements. 
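For orientation before load_flat_file() further down: the loader reads everything it needs from the bFLT header at the start of the file, and all fields are stored big-endian (hence the ntohl() calls there). A standalone mock of the fields actually consumed by the loader; the layout shown is illustrative, the authoritative definition lives in the flat.h headers:

    #include <stdint.h>
    #include <arpa/inet.h>

    struct flat_hdr_example {
        char     magic[4];      /* "bFLT" */
        uint32_t rev;           /* FLAT_VERSION or OLD_FLAT_VERSION */
        uint32_t entry;         /* offset of the first instruction */
        uint32_t data_start;    /* where data begins, i.e. the text length */
        uint32_t data_end;
        uint32_t bss_end;
        uint32_t stack_size;
        uint32_t reloc_start;   /* offset of the relocation records */
        uint32_t reloc_count;
        uint32_t flags;         /* FLAT_FLAG_RAM, _GZIP, _GOTPIC, ... */
        uint32_t build_date;
    };

    static void flat_segment_sizes(const struct flat_hdr_example *h,
                                   unsigned long *text, unsigned long *data,
                                   unsigned long *bss)
    {
        *text = ntohl(h->data_start);                   /* text runs from 0 */
        *data = ntohl(h->data_end) - ntohl(h->data_start);
        *bss  = ntohl(h->bss_end) - ntohl(h->data_end);
    }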
+ */ +#define FLAT_STACK_ALIGN max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN) + +#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ +#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ + +struct lib_info { + struct { + unsigned long start_code; /* Start of text segment */ + unsigned long start_data; /* Start of data segment */ + unsigned long start_brk; /* End of data segment */ + unsigned long text_len; /* Length of text segment */ + unsigned long entry; /* Start address for this module */ + unsigned long build_date; /* When this one was compiled */ + short loaded; /* Has this library been loaded? */ + } lib_list[MAX_SHARED_LIBS]; +}; + +#ifdef CONFIG_BINFMT_SHARED_FLAT +static int load_flat_shared_library(int id, struct lib_info *p); +#endif + +static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); +static int flat_core_dump(struct coredump_params *cprm); + +static struct linux_binfmt flat_format = { + .module = THIS_MODULE, + .load_binary = load_flat_binary, + .core_dump = flat_core_dump, + .min_coredump = PAGE_SIZE +}; + +/****************************************************************************/ +/* + * Routine writes a core dump image in the current directory. + * Currently only a stub-function. + */ + +static int flat_core_dump(struct coredump_params *cprm) +{ + printk("Process %s:%d received signr %d and should have core dumped\n", + current->comm, current->pid, (int) cprm->signr); + return(1); +} + +/****************************************************************************/ +/* + * create_flat_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ + +static unsigned long create_flat_tables( + unsigned long pp, + struct linux_binprm * bprm) +{ + unsigned long *argv,*envp; + unsigned long * sp; + char * p = (char*)pp; + int argc = bprm->argc; + int envc = bprm->envc; + char uninitialized_var(dummy); + + sp = (unsigned long *)p; + sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); + sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN); + argv = sp + 1 + (flat_argvp_envp_on_stack() ? 
2 : 0); + envp = argv + (argc + 1); + + if (flat_argvp_envp_on_stack()) { + put_user((unsigned long) envp, sp + 2); + put_user((unsigned long) argv, sp + 1); + } + + put_user(argc, sp); + current->mm->arg_start = (unsigned long) p; + while (argc-->0) { + put_user((unsigned long) p, argv++); + do { + get_user(dummy, p); p++; + } while (dummy); + } + put_user((unsigned long) NULL, argv); + current->mm->arg_end = current->mm->env_start = (unsigned long) p; + while (envc-->0) { + put_user((unsigned long)p, envp); envp++; + do { + get_user(dummy, p); p++; + } while (dummy); + } + put_user((unsigned long) NULL, envp); + current->mm->env_end = (unsigned long) p; + return (unsigned long)sp; +} + +/****************************************************************************/ + +#ifdef CONFIG_BINFMT_ZFLAT + +#include + +#define LBUFSIZE 4000 + +/* gzip flag byte */ +#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ +#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ +#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ +#define COMMENT 0x10 /* bit 4 set: file comment present */ +#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ +#define RESERVED 0xC0 /* bit 6,7: reserved */ + +static int decompress_exec( + struct linux_binprm *bprm, + unsigned long offset, + char *dst, + long len, + int fd) +{ + unsigned char *buf; + z_stream strm; + loff_t fpos; + int ret, retval; + + DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len); + + memset(&strm, 0, sizeof(strm)); + strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); + if (strm.workspace == NULL) { + DBG_FLT("binfmt_flat: no memory for decompress workspace\n"); + return -ENOMEM; + } + buf = kmalloc(LBUFSIZE, GFP_KERNEL); + if (buf == NULL) { + DBG_FLT("binfmt_flat: no memory for read buffer\n"); + retval = -ENOMEM; + goto out_free; + } + + /* Read in first chunk of data and parse gzip header. 
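The header walk performed below follows the gzip member format: a fixed 10-byte header, then an optional EXTRA field (2-byte little-endian length plus payload), then optional NUL-terminated ORIG_NAME and COMMENT strings, after which the raw deflate stream starts (the loader rejects ENCRYPTED, CONTINUATION and RESERVED flags outright). A standalone sketch of locating the compressed payload, with bounds handling simplified relative to the loader; the flag masks match the #defines above:

    static int gzip_payload_offset(const unsigned char *buf, int len)
    {
        int off = 10;                           /* fixed gzip header */
        unsigned char flg;

        if (len < 10 || buf[0] != 0x1f ||
            (buf[1] != 0x8b && buf[1] != 0x9e) || buf[2] != 8)
            return -1;                          /* bad magic or method */
        flg = buf[3];

        if (flg & 0x04) {                       /* EXTRA_FIELD */
            if (off + 2 > len)
                return -1;
            off += 2 + buf[10] + (buf[11] << 8);
        }
        if (flg & 0x08)                         /* ORIG_NAME */
            while (off < len && buf[off++] != 0)
                ;
        if (flg & 0x10)                         /* COMMENT */
            while (off < len && buf[off++] != 0)
                ;
        return off <= len ? off : -1;           /* start of deflate data */
    }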
*/ + fpos = offset; + ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + + strm.next_in = buf; + strm.avail_in = ret; + strm.total_in = 0; + + retval = -ENOEXEC; + + /* Check minimum size -- gzip header */ + if (ret < 10) { + DBG_FLT("binfmt_flat: file too small?\n"); + goto out_free_buf; + } + + /* Check gzip magic number */ + if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) { + DBG_FLT("binfmt_flat: unknown compression magic?\n"); + goto out_free_buf; + } + + /* Check gzip method */ + if (buf[2] != 8) { + DBG_FLT("binfmt_flat: unknown compression method?\n"); + goto out_free_buf; + } + /* Check gzip flags */ + if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) || + (buf[3] & RESERVED)) { + DBG_FLT("binfmt_flat: unknown flags?\n"); + goto out_free_buf; + } + + ret = 10; + if (buf[3] & EXTRA_FIELD) { + ret += 2 + buf[10] + (buf[11] << 8); + if (unlikely(LBUFSIZE <= ret)) { + DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); + goto out_free_buf; + } + } + if (buf[3] & ORIG_NAME) { + while (ret < LBUFSIZE && buf[ret++] != 0) + ; + if (unlikely(LBUFSIZE == ret)) { + DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); + goto out_free_buf; + } + } + if (buf[3] & COMMENT) { + while (ret < LBUFSIZE && buf[ret++] != 0) + ; + if (unlikely(LBUFSIZE == ret)) { + DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); + goto out_free_buf; + } + } + + strm.next_in += ret; + strm.avail_in -= ret; + + strm.next_out = dst; + strm.avail_out = len; + strm.total_out = 0; + + if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) { + DBG_FLT("binfmt_flat: zlib init failed?\n"); + goto out_free_buf; + } + + while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { + ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + if (ret <= 0) + break; + len -= ret; + + strm.next_in = buf; + strm.avail_in = ret; + strm.total_in = 0; + } + + if (ret < 0) { + DBG_FLT("binfmt_flat: decompression failed (%d), %s\n", + ret, strm.msg); + goto out_zlib; + } + + retval = 0; +out_zlib: + zlib_inflateEnd(&strm); +out_free_buf: + kfree(buf); +out_free: + kfree(strm.workspace); + return retval; +} + +#endif /* CONFIG_BINFMT_ZFLAT */ + +/****************************************************************************/ + +static unsigned long +calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) +{ + unsigned long addr; + int id; + unsigned long start_brk; + unsigned long start_data; + unsigned long text_len; + unsigned long start_code; + +#ifdef CONFIG_BINFMT_SHARED_FLAT + if (r == 0) + id = curid; /* Relocs of 0 are always self referring */ + else { + id = (r >> 24) & 0xff; /* Find ID for this reloc */ + r &= 0x00ffffff; /* Trim ID off here */ + } + if (id >= MAX_SHARED_LIBS) { + printk("BINFMT_FLAT: reference 0x%x to shared library %d", + (unsigned) r, id); + goto failed; + } + if (curid != id) { + if (internalp) { + printk("BINFMT_FLAT: reloc address 0x%x not in same module " + "(%d != %d)", (unsigned) r, curid, id); + goto failed; + } else if ( ! p->lib_list[id].loaded && + IS_ERR_VALUE(load_flat_shared_library(id, p))) { + printk("BINFMT_FLAT: failed to load library %d", id); + goto failed; + } + /* Check versioning information (i.e. 
time stamps) */ + if (p->lib_list[id].build_date && p->lib_list[curid].build_date && + p->lib_list[curid].build_date < p->lib_list[id].build_date) { + printk("BINFMT_FLAT: library %d is younger than %d", id, curid); + goto failed; + } + } +#else + id = 0; +#endif + + start_brk = p->lib_list[id].start_brk; + start_data = p->lib_list[id].start_data; + start_code = p->lib_list[id].start_code; + text_len = p->lib_list[id].text_len; + + if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { + printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", + (int) r,(int)(start_brk-start_data+text_len),(int)text_len); + goto failed; + } + + if (r < text_len) /* In text segment */ + addr = r + start_code; + else /* In data segment */ + addr = r - text_len + start_data; + + /* Range checked already above so doing the range tests is redundant...*/ + return(addr); + +failed: + printk(", killing %s!\n", current->comm); + send_sig(SIGSEGV, current, 0); + + return RELOC_FAILED; +} + +/****************************************************************************/ + +void old_reloc(unsigned long rl) +{ +#ifdef DEBUG + char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; +#endif + flat_v2_reloc_t r; + unsigned long *ptr; + + r.value = rl; +#if defined(CONFIG_COLDFIRE) + ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset); +#else + ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset); +#endif + +#ifdef DEBUG + printk("Relocation of variable at DATASEG+%x " + "(address %p, currently %x) into segment %s\n", + r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]); +#endif + + switch (r.reloc.type) { + case OLD_FLAT_RELOC_TYPE_TEXT: + *ptr += current->mm->start_code; + break; + case OLD_FLAT_RELOC_TYPE_DATA: + *ptr += current->mm->start_data; + break; + case OLD_FLAT_RELOC_TYPE_BSS: + *ptr += current->mm->end_data; + break; + default: + printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type); + break; + } + +#ifdef DEBUG + printk("Relocation became %x\n", (int)*ptr); +#endif +} + +/****************************************************************************/ + +static int load_flat_file(struct linux_binprm * bprm, + struct lib_info *libinfo, int id, unsigned long *extra_stack) +{ + struct flat_hdr * hdr; + unsigned long textpos = 0, datapos = 0, result; + unsigned long realdatastart = 0; + unsigned long text_len, data_len, bss_len, stack_len, flags; + unsigned long len, memp = 0; + unsigned long memp_size, extra, rlim; + unsigned long *reloc = 0, *rp; + struct inode *inode; + int i, rev, relocs = 0; + loff_t fpos; + unsigned long start_code, end_code; + int ret; + + hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ + inode = bprm->file->f_path.dentry->d_inode; + + text_len = ntohl(hdr->data_start); + data_len = ntohl(hdr->data_end) - ntohl(hdr->data_start); + bss_len = ntohl(hdr->bss_end) - ntohl(hdr->data_end); + stack_len = ntohl(hdr->stack_size); + if (extra_stack) { + stack_len += *extra_stack; + *extra_stack = stack_len; + } + relocs = ntohl(hdr->reloc_count); + flags = ntohl(hdr->flags); + rev = ntohl(hdr->rev); + + if (strncmp(hdr->magic, "bFLT", 4)) { + /* + * Previously, here was a printk to tell people + * "BINFMT_FLAT: bad header magic". + * But for the kernel which also use ELF FD-PIC format, this + * error message is confusing. 
+ * because a lot of people do not manage to produce good + */ + ret = -ENOEXEC; + goto err; + } + + if (flags & FLAT_FLAG_KTRACE) + printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); + + if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) { + printk("BINFMT_FLAT: bad flat file version 0x%x (supported " + "0x%lx and 0x%lx)\n", + rev, FLAT_VERSION, OLD_FLAT_VERSION); + ret = -ENOEXEC; + goto err; + } + + /* Don't allow old format executables to use shared libraries */ + if (rev == OLD_FLAT_VERSION && id != 0) { + printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n", + (int) FLAT_VERSION); + ret = -ENOEXEC; + goto err; + } + + /* + * fix up the flags for the older format, there were all kinds + * of endian hacks, this only works for the simple cases + */ + if (rev == OLD_FLAT_VERSION && flat_old_ram_flag(flags)) + flags = FLAT_FLAG_RAM; + +#ifndef CONFIG_BINFMT_ZFLAT + if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) { + printk("Support for ZFLAT executables is not enabled.\n"); + ret = -ENOEXEC; + goto err; + } +#endif + + /* + * Check initial limits. This avoids letting people circumvent + * size limits imposed on them by creating programs with large + * arrays in the data or bss. + */ + rlim = rlimit(RLIMIT_DATA); + if (rlim >= RLIM_INFINITY) + rlim = ~0; + if (data_len + bss_len > rlim) { + ret = -ENOMEM; + goto err; + } + + /* Flush all traces of the currently running executable */ + if (id == 0) { + result = flush_old_exec(bprm); + if (result) { + ret = result; + goto err; + } + + /* OK, This is the point of no return */ + set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); + } + + /* + * calculate the extra space we need to map in + */ + extra = max_t(unsigned long, bss_len + stack_len, + relocs * sizeof(unsigned long)); + + /* + * there are a couple of cases here, the separate code/data + * case, and then the fully copied to RAM case which lumps + * it all together. 
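Concretely, the choice described here only changes how much anonymous memory has to be allocated: in the separate (possibly ROM/XIP) case the text stays file-backed and only data plus the bss/stack-or-relocation slack ("extra", computed just above) plus the MAX_SHARED_LIBS pointer slots kept in front of the data segment need RAM; in the fully-copied case the text length is added on top. A sketch of that size calculation, mirroring the two len computations below (parameter names are illustrative):

    static unsigned long flat_alloc_len(int copy_text_to_ram, unsigned long text_len,
                                        unsigned long data_len, unsigned long extra,
                                        unsigned long max_shared_libs)
    {
        unsigned long len = data_len + extra +
                            max_shared_libs * sizeof(unsigned long);

        if (copy_text_to_ram)
            len += text_len;
        return len;     /* the caller page-aligns this, as PAGE_ALIGN() does below */
    }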
+ */ + if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) { + /* + * this should give us a ROM ptr, but if it doesn't we don't + * really care + */ + DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); + + down_write(¤t->mm->mmap_sem); + textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, + MAP_PRIVATE|MAP_EXECUTABLE, 0); + up_write(¤t->mm->mmap_sem); + if (!textpos || IS_ERR_VALUE(textpos)) { + if (!textpos) + textpos = (unsigned long) -ENOMEM; + printk("Unable to mmap process text, errno %d\n", (int)-textpos); + ret = textpos; + goto err; + } + + len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); + down_write(¤t->mm->mmap_sem); + realdatastart = do_mmap(0, 0, len, + PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); + up_write(¤t->mm->mmap_sem); + + if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { + if (!realdatastart) + realdatastart = (unsigned long) -ENOMEM; + printk("Unable to allocate RAM for process data, errno %d\n", + (int)-realdatastart); + do_munmap(current->mm, textpos, text_len); + ret = realdatastart; + goto err; + } + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); + + DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", + (int)(data_len + bss_len + stack_len), (int)datapos); + + fpos = ntohl(hdr->data_start); +#ifdef CONFIG_BINFMT_ZFLAT + if (flags & FLAT_FLAG_GZDATA) { + result = decompress_exec(bprm, fpos, (char *) datapos, + data_len + (relocs * sizeof(unsigned long)), 0); + } else +#endif + { + result = bprm->file->f_op->read(bprm->file, (char *) datapos, + data_len + (relocs * sizeof(unsigned long)), &fpos); + } + if (IS_ERR_VALUE(result)) { + printk("Unable to read data+bss, errno %d\n", (int)-result); + do_munmap(current->mm, textpos, text_len); + do_munmap(current->mm, realdatastart, len); + ret = result; + goto err; + } + + reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); + memp = realdatastart; + memp_size = len; + } else { + + len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); + down_write(¤t->mm->mmap_sem); + textpos = do_mmap(0, 0, len, + PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); + up_write(¤t->mm->mmap_sem); + + if (!textpos || IS_ERR_VALUE(textpos)) { + if (!textpos) + textpos = (unsigned long) -ENOMEM; + printk("Unable to allocate RAM for process text/data, errno %d\n", + (int)-textpos); + ret = textpos; + goto err; + } + + realdatastart = textpos + ntohl(hdr->data_start); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); + + reloc = (unsigned long *) + (datapos + (ntohl(hdr->reloc_start) - text_len)); + memp = textpos; + memp_size = len; +#ifdef CONFIG_BINFMT_ZFLAT + /* + * load it all in and treat it like a RAM load from now on + */ + if (flags & FLAT_FLAG_GZIP) { + result = decompress_exec(bprm, sizeof (struct flat_hdr), + (((char *) textpos) + sizeof (struct flat_hdr)), + (text_len + data_len + (relocs * sizeof(unsigned long)) + - sizeof (struct flat_hdr)), + 0); + memmove((void *) datapos, (void *) realdatastart, + data_len + (relocs * sizeof(unsigned long))); + } else if (flags & FLAT_FLAG_GZDATA) { + fpos = 0; + result = bprm->file->f_op->read(bprm->file, + (char *) textpos, text_len, &fpos); + if (!IS_ERR_VALUE(result)) + result = decompress_exec(bprm, text_len, (char *) datapos, + data_len + (relocs * sizeof(unsigned long)), 0); + } + else +#endif + { + fpos = 0; + result = 
bprm->file->f_op->read(bprm->file, + (char *) textpos, text_len, &fpos); + if (!IS_ERR_VALUE(result)) { + fpos = ntohl(hdr->data_start); + result = bprm->file->f_op->read(bprm->file, (char *) datapos, + data_len + (relocs * sizeof(unsigned long)), &fpos); + } + } + if (IS_ERR_VALUE(result)) { + printk("Unable to read code+data+bss, errno %d\n",(int)-result); + do_munmap(current->mm, textpos, text_len + data_len + extra + + MAX_SHARED_LIBS * sizeof(unsigned long)); + ret = result; + goto err; + } + } + + if (flags & FLAT_FLAG_KTRACE) + printk("Mapping is %x, Entry point is %x, data_start is %x\n", + (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); + + /* The main program needs a little extra setup in the task structure */ + start_code = textpos + sizeof (struct flat_hdr); + end_code = textpos + text_len; + if (id == 0) { + current->mm->start_code = start_code; + current->mm->end_code = end_code; + current->mm->start_data = datapos; + current->mm->end_data = datapos + data_len; + /* + * set up the brk stuff, uses any slack left in data/bss/stack + * allocation. We put the brk after the bss (between the bss + * and stack) like other platforms. + * Userspace code relies on the stack pointer starting out at + * an address right at the end of a page. + */ + current->mm->start_brk = datapos + data_len + bss_len; + current->mm->brk = (current->mm->start_brk + 3) & ~3; + current->mm->context.end_brk = memp + memp_size - stack_len; + } + + if (flags & FLAT_FLAG_KTRACE) + printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n", + id ? "Lib" : "Load", bprm->filename, + (int) start_code, (int) end_code, + (int) datapos, + (int) (datapos + data_len), + (int) (datapos + data_len), + (int) (((datapos + data_len + bss_len) + 3) & ~3)); + + text_len -= sizeof(struct flat_hdr); /* the real code len */ + + /* Store the current module values into the global library structure */ + libinfo->lib_list[id].start_code = start_code; + libinfo->lib_list[id].start_data = datapos; + libinfo->lib_list[id].start_brk = datapos + data_len + bss_len; + libinfo->lib_list[id].text_len = text_len; + libinfo->lib_list[id].loaded = 1; + libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; + libinfo->lib_list[id].build_date = ntohl(hdr->build_date); + + /* + * We just load the allocations into some temporary memory to + * help simplify all this mumbo jumbo + * + * We've got two different sections of relocation entries. + * The first is the GOT which resides at the beginning of the data segment + * and is terminated with a -1. This one can be relocated in place. + * The second is the extra relocation entries tacked after the image's + * data segment. These require a little more processing as the entry is + * really an offset into the image which contains an offset into the + * image. + */ + if (flags & FLAT_FLAG_GOTPIC) { + for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) { + unsigned long addr; + if (*rp) { + addr = calc_reloc(*rp, libinfo, id, 0); + if (addr == RELOC_FAILED) { + ret = -ENOEXEC; + goto err; + } + *rp = addr; + } + } + } + + /* + * Now run through the relocation entries. + * We've got to be careful here as C++ produces relocatable zero + * entries in the constructor and destructor tables which are then + * tested for being not zero (which will always occur unless we're + * based from address zero). This causes an endless loop as __start + * is at zero. The solution used is to not relocate zero addresses. 
+ * This has the negative side effect of not allowing a global data + * reference to be statically initialised to _stext (I've moved + * __start to address 4 so that is okay). + */ + if (rev > OLD_FLAT_VERSION) { + unsigned long persistent = 0; + for (i=0; i < relocs; i++) { + unsigned long addr, relval; + + /* Get the address of the pointer to be + relocated (of course, the address has to be + relocated first). */ + relval = ntohl(reloc[i]); + if (flat_set_persistent (relval, &persistent)) + continue; + addr = flat_get_relocate_addr(relval); + rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); + if (rp == (unsigned long *)RELOC_FAILED) { + ret = -ENOEXEC; + goto err; + } + + /* Get the pointer's value. */ + addr = flat_get_addr_from_rp(rp, relval, flags, + &persistent); + if (addr != 0) { + /* + * Do the relocation. PIC relocs in the data section are + * already in target order + */ + if ((flags & FLAT_FLAG_GOTPIC) == 0) + addr = ntohl(addr); + addr = calc_reloc(addr, libinfo, id, 0); + if (addr == RELOC_FAILED) { + ret = -ENOEXEC; + goto err; + } + + /* Write back the relocated pointer. */ + flat_put_addr_at_rp(rp, addr, relval); + } + } + } else { + for (i=0; i < relocs; i++) + old_reloc(ntohl(reloc[i])); + } + + flush_icache_range(start_code, end_code); + + /* zero the BSS, BRK and stack areas */ + memset((void*)(datapos + data_len), 0, bss_len + + (memp + memp_size - stack_len - /* end brk */ + libinfo->lib_list[id].start_brk) + /* start brk */ + stack_len); + + return 0; +err: + return ret; +} + + +/****************************************************************************/ +#ifdef CONFIG_BINFMT_SHARED_FLAT + +/* + * Load a shared library into memory. The library gets its own data + * segment (including bss) but not argv/argc/environ. + */ + +static int load_flat_shared_library(int id, struct lib_info *libs) +{ + struct linux_binprm bprm; + int res; + char buf[16]; + + memset(&bprm, 0, sizeof(bprm)); + + /* Create the file name */ + sprintf(buf, "/lib/lib%d.so", id); + + /* Open the file up */ + bprm.filename = buf; + bprm.file = open_exec(bprm.filename); + res = PTR_ERR(bprm.file); + if (IS_ERR(bprm.file)) + return res; + + bprm.cred = prepare_exec_creds(); + res = -ENOMEM; + if (!bprm.cred) + goto out; + + /* We don't really care about recalculating credentials at this point + * as we're past the point of no return and are dealing with shared + * libraries. + */ + bprm.cred_prepared = 1; + + res = prepare_binprm(&bprm); + + if (!IS_ERR_VALUE(res)) + res = load_flat_file(&bprm, libs, id, NULL); + + abort_creds(bprm.cred); + +out: + allow_write_access(bprm.file); + fput(bprm.file); + + return(res); +} + +#endif /* CONFIG_BINFMT_SHARED_FLAT */ +/****************************************************************************/ + +/* + * These are the functions used to load flat style executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) +{ + struct lib_info libinfo; + unsigned long p = bprm->p; + unsigned long stack_len; + unsigned long start_addr; + unsigned long *sp; + int res; + int i, j; + + memset(&libinfo, 0, sizeof(libinfo)); + /* + * We have to add the size of our arguments to our stack size + * otherwise it's too easy for users to create stack overflows + * by passing in a huge argument list. And yes, we have to be + * pedantic and include space for the argv/envp array as it may have + * a lot of entries. 
+ */ +#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *)) + stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ + stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ + stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ + stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ + + res = load_flat_file(bprm, &libinfo, 0, &stack_len); + if (IS_ERR_VALUE(res)) + return res; + + /* Update data segment pointers for all libraries */ + for (i=0; i<MAX_SHARED_LIBS; i++) + if (libinfo.lib_list[i].loaded) + for (j=0; j<MAX_SHARED_LIBS; j++) + (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] = + (libinfo.lib_list[j].loaded)? + libinfo.lib_list[j].start_data:UNLOADED_LIB; + + install_exec_creds(bprm); + current->flags &= ~PF_FORKNOEXEC; + + set_binfmt(&flat_format); + + p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; + DBG_FLT("p=%x\n", (int)p); + + /* copy the arg pages onto the stack, this could be more efficient :-) */ + for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--) + * (char *) --p = + ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE]; + + sp = (unsigned long *) create_flat_tables(p, bprm); + + /* Fake some return addresses to ensure the call chain will + * initialise library in order for us. We are required to call + * lib 1 first, then 2, ... and finally the main program (id 0). + */ + start_addr = libinfo.lib_list[0].entry; + +#ifdef CONFIG_BINFMT_SHARED_FLAT + for (i = MAX_SHARED_LIBS-1; i>0; i--) { + if (libinfo.lib_list[i].loaded) { + /* Push previous first to call address */ + --sp; put_user(start_addr, sp); + start_addr = libinfo.lib_list[i].entry; + } + } +#endif + + /* Stash our initial stack pointer into the mm structure */ + current->mm->start_stack = (unsigned long )sp; + +#ifdef FLAT_PLAT_INIT + FLAT_PLAT_INIT(regs); +#endif + DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", + (int)regs, (int)start_addr, (int)current->mm->start_stack); + + start_thread(regs, start_addr, current->mm->start_stack); + + return 0; +} + +/****************************************************************************/ + +static int __init init_flat_binfmt(void) +{ + return register_binfmt(&flat_format); +} + +/****************************************************************************/ + +core_initcall(init_flat_binfmt); + +/****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c new file mode 100644 index 00000000..1befe2ec --- /dev/null +++ b/fs/binfmt_misc.c @@ -0,0 +1,746 @@ +/* + * binfmt_misc.c + * + * Copyright (C) 1997 Richard Günther + * + * binfmt_misc detects binaries via a magic or filename extension and invokes + * a specified wrapper. This should obsolete binfmt_java, binfmt_em86 and + * binfmt_mz. + * + * 1997-04-25 first version + * [...] + * 1997-05-19 cleanup + * 1997-06-26 hpa: pass the real filename rather than argv[0] + * 1997-06-30 minor cleanup + * 1997-08-09 removed extension stripping, locking cleanup + * 2001-02-28 AV: rewritten into something that resembles C. Original didn't. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +enum { + VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ +}; + +static LIST_HEAD(entries); +static int enabled = 1; + +enum {Enabled, Magic}; +#define MISC_FMT_PRESERVE_ARGV0 (1<<31) +#define MISC_FMT_OPEN_BINARY (1<<30) +#define MISC_FMT_CREDENTIALS (1<<29) + +typedef struct { + struct list_head list; + unsigned long flags; /* type, status, etc.
*/ + int offset; /* offset of magic */ + int size; /* size of magic/mask */ + char *magic; /* magic or filename extension */ + char *mask; /* mask, NULL for exact match */ + char *interpreter; /* filename of interpreter */ + char *name; + struct dentry *dentry; +} Node; + +static DEFINE_RWLOCK(entries_lock); +static struct file_system_type bm_fs_type; +static struct vfsmount *bm_mnt; +static int entry_count; + +/* + * Check if we support the binfmt + * if we do, return the node, else NULL + * locking is done in load_misc_binary + */ +static Node *check_file(struct linux_binprm *bprm) +{ + char *p = strrchr(bprm->interp, '.'); + struct list_head *l; + + list_for_each(l, &entries) { + Node *e = list_entry(l, Node, list); + char *s; + int j; + + if (!test_bit(Enabled, &e->flags)) + continue; + + if (!test_bit(Magic, &e->flags)) { + if (p && !strcmp(e->magic, p + 1)) + return e; + continue; + } + + s = bprm->buf + e->offset; + if (e->mask) { + for (j = 0; j < e->size; j++) + if ((*s++ ^ e->magic[j]) & e->mask[j]) + break; + } else { + for (j = 0; j < e->size; j++) + if ((*s++ ^ e->magic[j])) + break; + } + if (j == e->size) + return e; + } + return NULL; +} + +/* + * the loader itself + */ +static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) +{ + Node *fmt; + struct file * interp_file = NULL; + char iname[BINPRM_BUF_SIZE]; + const char *iname_addr = iname; + int retval; + int fd_binary = -1; + + retval = -ENOEXEC; + if (!enabled) + goto _ret; + + retval = -ENOEXEC; + if (bprm->recursion_depth > BINPRM_MAX_RECURSION) + goto _ret; + + /* to keep locking time low, we copy the interpreter string */ + read_lock(&entries_lock); + fmt = check_file(bprm); + if (fmt) + strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); + read_unlock(&entries_lock); + if (!fmt) + goto _ret; + + if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { + retval = remove_arg_zero(bprm); + if (retval) + goto _ret; + } + + if (fmt->flags & MISC_FMT_OPEN_BINARY) { + + /* if the binary should be opened on behalf of the + * interpreter than keep it open and assign descriptor + * to it */ + fd_binary = get_unused_fd(); + if (fd_binary < 0) { + retval = fd_binary; + goto _ret; + } + fd_install(fd_binary, bprm->file); + + /* if the binary is not readable than enforce mm->dumpable=0 + regardless of the interpreter's permissions */ + if (file_permission(bprm->file, MAY_READ)) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + allow_write_access(bprm->file); + bprm->file = NULL; + + /* mark the bprm that fd should be passed to interp */ + bprm->interp_flags |= BINPRM_FLAGS_EXECFD; + bprm->interp_data = fd_binary; + + } else { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + } + /* make argv[1] be the path to the binary */ + retval = copy_strings_kernel (1, &bprm->interp, bprm); + if (retval < 0) + goto _error; + bprm->argc++; + + /* add the interp as argv[0] */ + retval = copy_strings_kernel (1, &iname_addr, bprm); + if (retval < 0) + goto _error; + bprm->argc ++; + + bprm->interp = iname; /* for binfmt_script */ + + interp_file = open_exec (iname); + retval = PTR_ERR (interp_file); + if (IS_ERR (interp_file)) + goto _error; + + bprm->file = interp_file; + if (fmt->flags & MISC_FMT_CREDENTIALS) { + /* + * No need to call prepare_binprm(), it's already been + * done. bprm->buf is stale, update from interp_file. 
+ */ + memset(bprm->buf, 0, BINPRM_BUF_SIZE); + retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); + } else + retval = prepare_binprm (bprm); + + if (retval < 0) + goto _error; + + bprm->recursion_depth++; + + retval = search_binary_handler (bprm, regs); + if (retval < 0) + goto _error; + +_ret: + return retval; +_error: + if (fd_binary > 0) + sys_close(fd_binary); + bprm->interp_flags = 0; + bprm->interp_data = 0; + goto _ret; +} + +/* Command parsers */ + +/* + * parses and copies one argument enclosed in del from *sp to *dp, + * recognising the \x special. + * returns pointer to the copied argument or NULL in case of an + * error (and sets err) or null argument length. + */ +static char *scanarg(char *s, char del) +{ + char c; + + while ((c = *s++) != del) { + if (c == '\\' && *s == 'x') { + s++; + if (!isxdigit(*s++)) + return NULL; + if (!isxdigit(*s++)) + return NULL; + } + } + return s; +} + +static int unquote(char *from) +{ + char c = 0, *s = from, *p = from; + + while ((c = *s++) != '\0') { + if (c == '\\' && *s == 'x') { + s++; + c = toupper(*s++); + *p = (c - (isdigit(c) ? '0' : 'A' - 10)) << 4; + c = toupper(*s++); + *p++ |= c - (isdigit(c) ? '0' : 'A' - 10); + continue; + } + *p++ = c; + } + return p - from; +} + +static char * check_special_flags (char * sfs, Node * e) +{ + char * p = sfs; + int cont = 1; + + /* special flags */ + while (cont) { + switch (*p) { + case 'P': + p++; + e->flags |= MISC_FMT_PRESERVE_ARGV0; + break; + case 'O': + p++; + e->flags |= MISC_FMT_OPEN_BINARY; + break; + case 'C': + p++; + /* this flags also implies the + open-binary flag */ + e->flags |= (MISC_FMT_CREDENTIALS | + MISC_FMT_OPEN_BINARY); + break; + default: + cont = 0; + } + } + + return p; +} +/* + * This registers a new binary format, it recognises the syntax + * ':name:type:offset:magic:mask:interpreter:flags' + * where the ':' is the IFS, that can be chosen with the first char + */ +static Node *create_entry(const char __user *buffer, size_t count) +{ + Node *e; + int memsize, err; + char *buf, *p; + char del; + + /* some sanity checks */ + err = -EINVAL; + if ((count < 11) || (count > 256)) + goto out; + + err = -ENOMEM; + memsize = sizeof(Node) + count + 8; + e = kmalloc(memsize, GFP_USER); + if (!e) + goto out; + + p = buf = (char *)e + sizeof(Node); + + memset(e, 0, sizeof(Node)); + if (copy_from_user(buf, buffer, count)) + goto Efault; + + del = *p++; /* delimiter */ + + memset(buf+count, del, 8); + + e->name = p; + p = strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + if (!e->name[0] || + !strcmp(e->name, ".") || + !strcmp(e->name, "..") || + strchr(e->name, '/')) + goto Einval; + switch (*p++) { + case 'E': e->flags = 1<<Enabled; + break; + case 'M': e->flags = (1<<Enabled) | (1<<Magic); + break; + default: goto Einval; + } + if (*p++ != del) + goto Einval; + + if (test_bit(Magic, &e->flags)) { + char *s = strchr(p, del); + if (!s) + goto Einval; + *s++ = '\0'; + e->offset = simple_strtoul(p, &p, 10); + if (*p++) + goto Einval; + e->magic = p; + p = scanarg(p, del); + if (!p) + goto Einval; + p[-1] = '\0'; + if (!e->magic[0]) + goto Einval; + e->mask = p; + p = scanarg(p, del); + if (!p) + goto Einval; + p[-1] = '\0'; + if (!e->mask[0]) + e->mask = NULL; + e->size = unquote(e->magic); + if (e->mask && unquote(e->mask) != e->size) + goto Einval; + if (e->size + e->offset > BINPRM_BUF_SIZE) + goto Einval; + } else { + p = strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + e->magic = p; + p = strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + if (!e->magic[0] || strchr(e->magic, '/')) + goto Einval; + p = strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + } + e->interpreter = p; + p =
strchr(p, del); + if (!p) + goto Einval; + *p++ = '\0'; + if (!e->interpreter[0]) + goto Einval; + + + p = check_special_flags (p, e); + + if (*p == '\n') + p++; + if (p != buf + count) + goto Einval; + return e; + +out: + return ERR_PTR(err); + +Efault: + kfree(e); + return ERR_PTR(-EFAULT); +Einval: + kfree(e); + return ERR_PTR(-EINVAL); +} + +/* + * Set status of entry/binfmt_misc: + * '1' enables, '0' disables and '-1' clears entry/binfmt_misc + */ +static int parse_command(const char __user *buffer, size_t count) +{ + char s[4]; + + if (!count) + return 0; + if (count > 3) + return -EINVAL; + if (copy_from_user(s, buffer, count)) + return -EFAULT; + if (s[count-1] == '\n') + count--; + if (count == 1 && s[0] == '0') + return 1; + if (count == 1 && s[0] == '1') + return 2; + if (count == 2 && s[0] == '-' && s[1] == '1') + return 3; + return -EINVAL; +} + +/* generic stuff */ + +static void entry_status(Node *e, char *page) +{ + char *dp; + char *status = "disabled"; + const char * flags = "flags: "; + + if (test_bit(Enabled, &e->flags)) + status = "enabled"; + + if (!VERBOSE_STATUS) { + sprintf(page, "%s\n", status); + return; + } + + sprintf(page, "%s\ninterpreter %s\n", status, e->interpreter); + dp = page + strlen(page); + + /* print the special flags */ + sprintf (dp, "%s", flags); + dp += strlen (flags); + if (e->flags & MISC_FMT_PRESERVE_ARGV0) { + *dp ++ = 'P'; + } + if (e->flags & MISC_FMT_OPEN_BINARY) { + *dp ++ = 'O'; + } + if (e->flags & MISC_FMT_CREDENTIALS) { + *dp ++ = 'C'; + } + *dp ++ = '\n'; + + + if (!test_bit(Magic, &e->flags)) { + sprintf(dp, "extension .%s\n", e->magic); + } else { + int i; + + sprintf(dp, "offset %i\nmagic ", e->offset); + dp = page + strlen(page); + for (i = 0; i < e->size; i++) { + sprintf(dp, "%02x", 0xff & (int) (e->magic[i])); + dp += 2; + } + if (e->mask) { + sprintf(dp, "\nmask "); + dp += 6; + for (i = 0; i < e->size; i++) { + sprintf(dp, "%02x", 0xff & (int) (e->mask[i])); + dp += 2; + } + } + *dp++ = '\n'; + *dp = '\0'; + } +} + +static struct inode *bm_get_inode(struct super_block *sb, int mode) +{ + struct inode * inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = + current_fs_time(inode->i_sb); + } + return inode; +} + +static void bm_evict_inode(struct inode *inode) +{ + end_writeback(inode); + kfree(inode->i_private); +} + +static void kill_node(Node *e) +{ + struct dentry *dentry; + + write_lock(&entries_lock); + dentry = e->dentry; + if (dentry) { + list_del_init(&e->list); + e->dentry = NULL; + } + write_unlock(&entries_lock); + + if (dentry) { + dentry->d_inode->i_nlink--; + d_drop(dentry); + dput(dentry); + simple_release_fs(&bm_mnt, &entry_count); + } +} + +/* / */ + +static ssize_t +bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) +{ + Node *e = file->f_path.dentry->d_inode->i_private; + ssize_t res; + char *page; + + if (!(page = (char*) __get_free_page(GFP_KERNEL))) + return -ENOMEM; + + entry_status(e, page); + + res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page)); + + free_page((unsigned long) page); + return res; +} + +static ssize_t bm_entry_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct dentry *root; + Node *e = file->f_path.dentry->d_inode->i_private; + int res = parse_command(buffer, count); + + switch (res) { + case 1: clear_bit(Enabled, &e->flags); + break; + case 2: set_bit(Enabled, &e->flags); + break; + case 3: root = 
dget(file->f_path.mnt->mnt_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + + kill_node(e); + + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + break; + default: return res; + } + return count; +} + +static const struct file_operations bm_entry_operations = { + .read = bm_entry_read, + .write = bm_entry_write, + .llseek = default_llseek, +}; + +/* /register */ + +static ssize_t bm_register_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + Node *e; + struct inode *inode; + struct dentry *root, *dentry; + struct super_block *sb = file->f_path.mnt->mnt_sb; + int err = 0; + + e = create_entry(buffer, count); + + if (IS_ERR(e)) + return PTR_ERR(e); + + root = dget(sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + dentry = lookup_one_len(e->name, root, strlen(e->name)); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out; + + err = -EEXIST; + if (dentry->d_inode) + goto out2; + + inode = bm_get_inode(sb, S_IFREG | 0644); + + err = -ENOMEM; + if (!inode) + goto out2; + + err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count); + if (err) { + iput(inode); + inode = NULL; + goto out2; + } + + e->dentry = dget(dentry); + inode->i_private = e; + inode->i_fop = &bm_entry_operations; + + d_instantiate(dentry, inode); + write_lock(&entries_lock); + list_add(&e->list, &entries); + write_unlock(&entries_lock); + + err = 0; +out2: + dput(dentry); +out: + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + + if (err) { + kfree(e); + return -EINVAL; + } + return count; +} + +static const struct file_operations bm_register_operations = { + .write = bm_register_write, + .llseek = noop_llseek, +}; + +/* /status */ + +static ssize_t +bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + char *s = enabled ? 
"enabled\n" : "disabled\n"; + + return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); +} + +static ssize_t bm_status_write(struct file * file, const char __user * buffer, + size_t count, loff_t *ppos) +{ + int res = parse_command(buffer, count); + struct dentry *root; + + switch (res) { + case 1: enabled = 0; break; + case 2: enabled = 1; break; + case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + + while (!list_empty(&entries)) + kill_node(list_entry(entries.next, Node, list)); + + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + default: return res; + } + return count; +} + +static const struct file_operations bm_status_operations = { + .read = bm_status_read, + .write = bm_status_write, + .llseek = default_llseek, +}; + +/* Superblock handling */ + +static const struct super_operations s_ops = { + .statfs = simple_statfs, + .evict_inode = bm_evict_inode, +}; + +static int bm_fill_super(struct super_block * sb, void * data, int silent) +{ + static struct tree_descr bm_files[] = { + [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, + [3] = {"register", &bm_register_operations, S_IWUSR}, + /* last one */ {""} + }; + int err = simple_fill_super(sb, 0x42494e4d, bm_files); + if (!err) + sb->s_op = &s_ops; + return err; +} + +static struct dentry *bm_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, bm_fill_super); +} + +static struct linux_binfmt misc_format = { + .module = THIS_MODULE, + .load_binary = load_misc_binary, +}; + +static struct file_system_type bm_fs_type = { + .owner = THIS_MODULE, + .name = "binfmt_misc", + .mount = bm_mount, + .kill_sb = kill_litter_super, +}; + +static int __init init_misc_binfmt(void) +{ + int err = register_filesystem(&bm_fs_type); + if (!err) { + err = insert_binfmt(&misc_format); + if (err) + unregister_filesystem(&bm_fs_type); + } + return err; +} + +static void __exit exit_misc_binfmt(void) +{ + unregister_binfmt(&misc_format); + unregister_filesystem(&bm_fs_type); +} + +core_initcall(init_misc_binfmt); +module_exit(exit_misc_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c new file mode 100644 index 00000000..396a9884 --- /dev/null +++ b/fs/binfmt_script.c @@ -0,0 +1,118 @@ +/* + * linux/fs/binfmt_script.c + * + * Copyright (C) 1996 Martin von Löwis + * original #!-checking implemented by tytso. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) +{ + const char *i_arg, *i_name; + char *cp; + struct file *file; + char interp[BINPRM_BUF_SIZE]; + int retval; + + if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || + (bprm->recursion_depth > BINPRM_MAX_RECURSION)) + return -ENOEXEC; + /* + * This section does the #! interpretation. + * Sorta complicated, but hopefully it will work. 
-TYT + */ + + bprm->recursion_depth++; + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + + bprm->buf[BINPRM_BUF_SIZE - 1] = '\0'; + if ((cp = strchr(bprm->buf, '\n')) == NULL) + cp = bprm->buf+BINPRM_BUF_SIZE-1; + *cp = '\0'; + while (cp > bprm->buf) { + cp--; + if ((*cp == ' ') || (*cp == '\t')) + *cp = '\0'; + else + break; + } + for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++); + if (*cp == '\0') + return -ENOEXEC; /* No interpreter name found */ + i_name = cp; + i_arg = NULL; + for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) + /* nothing */ ; + while ((*cp == ' ') || (*cp == '\t')) + *cp++ = '\0'; + if (*cp) + i_arg = cp; + strcpy (interp, i_name); + /* + * OK, we've parsed out the interpreter name and + * (optional) argument. + * Splice in (1) the interpreter's name for argv[0] + * (2) (optional) argument to interpreter + * (3) filename of shell script (replace argv[0]) + * + * This is done in reverse order, because of how the + * user environment and arguments are stored. + */ + retval = remove_arg_zero(bprm); + if (retval) + return retval; + retval = copy_strings_kernel(1, &bprm->interp, bprm); + if (retval < 0) return retval; + bprm->argc++; + if (i_arg) { + retval = copy_strings_kernel(1, &i_arg, bprm); + if (retval < 0) return retval; + bprm->argc++; + } + retval = copy_strings_kernel(1, &i_name, bprm); + if (retval) return retval; + bprm->argc++; + bprm->interp = interp; + + /* + * OK, now restart the process with the interpreter's dentry. + */ + file = open_exec(interp); + if (IS_ERR(file)) + return PTR_ERR(file); + + bprm->file = file; + retval = prepare_binprm(bprm); + if (retval < 0) + return retval; + return search_binary_handler(bprm,regs); +} + +static struct linux_binfmt script_format = { + .module = THIS_MODULE, + .load_binary = load_script, +}; + +static int __init init_script_binfmt(void) +{ + return register_binfmt(&script_format); +} + +static void __exit exit_script_binfmt(void) +{ + unregister_binfmt(&script_format); +} + +core_initcall(init_script_binfmt); +module_exit(exit_script_binfmt); +MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c new file mode 100644 index 00000000..cc8560f6 --- /dev/null +++ b/fs/binfmt_som.c @@ -0,0 +1,304 @@ +/* + * linux/fs/binfmt_som.c + * + * These are the functions used to load SOM format executables as used + * by HP-UX. + * + * Copyright 1999 Matthew Wilcox + * based on binfmt_elf which is + * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#include + +static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs); +static int load_som_library(struct file *); + +/* + * If we don't support core dumping, then supply a NULL so we + * don't even try. 
+ */ +#if 0 +static int som_core_dump(struct coredump_params *cprm); +#else +#define som_core_dump NULL +#endif + +#define SOM_PAGESTART(_v) ((_v) & ~(unsigned long)(SOM_PAGESIZE-1)) +#define SOM_PAGEOFFSET(_v) ((_v) & (SOM_PAGESIZE-1)) +#define SOM_PAGEALIGN(_v) (((_v) + SOM_PAGESIZE - 1) & ~(SOM_PAGESIZE - 1)) + +static struct linux_binfmt som_format = { + .module = THIS_MODULE, + .load_binary = load_som_binary, + .load_shlib = load_som_library, + .core_dump = som_core_dump, + .min_coredump = SOM_PAGESIZE +}; + +/* + * create_som_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ +static void create_som_tables(struct linux_binprm *bprm) +{ + char **argv, **envp; + int argc = bprm->argc; + int envc = bprm->envc; + unsigned long p; + unsigned long *sp; + + /* Word-align the stack pointer */ + sp = (unsigned long *)((bprm->p + 3) & ~3); + + envp = (char **) sp; + sp += envc + 1; + argv = (char **) sp; + sp += argc + 1; + + __put_user((unsigned long) envp,++sp); + __put_user((unsigned long) argv,++sp); + + __put_user(argc, ++sp); + + bprm->p = (unsigned long) sp; + + p = current->mm->arg_start; + while (argc-- > 0) { + __put_user((char *)p,argv++); + p += strlen_user((char *)p); + } + __put_user(NULL, argv); + current->mm->arg_end = current->mm->env_start = p; + while (envc-- > 0) { + __put_user((char *)p,envp++); + p += strlen_user((char *)p); + } + __put_user(NULL, envp); + current->mm->env_end = p; +} + +static int check_som_header(struct som_hdr *som_ex) +{ + int *buf = (int *)som_ex; + int i, ck; + + if (som_ex->system_id != SOM_SID_PARISC_1_0 && + som_ex->system_id != SOM_SID_PARISC_1_1 && + som_ex->system_id != SOM_SID_PARISC_2_0) + return -ENOEXEC; + + if (som_ex->a_magic != SOM_EXEC_NONSHARE && + som_ex->a_magic != SOM_EXEC_SHARE && + som_ex->a_magic != SOM_EXEC_DEMAND) + return -ENOEXEC; + + if (som_ex->version_id != SOM_ID_OLD && + som_ex->version_id != SOM_ID_NEW) + return -ENOEXEC; + + ck = 0; + for (i=0; i<32; i++) + ck ^= buf[i]; + if (ck != 0) + return -ENOEXEC; + + return 0; +} + +static int map_som_binary(struct file *file, + const struct som_exec_auxhdr *hpuxhdr) +{ + unsigned long code_start, code_size, data_start, data_size; + unsigned long bss_start, som_brk; + int retval; + int prot = PROT_READ | PROT_EXEC; + int flags = MAP_FIXED|MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE; + + mm_segment_t old_fs = get_fs(); + set_fs(get_ds()); + + code_start = SOM_PAGESTART(hpuxhdr->exec_tmem); + code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize); + current->mm->start_code = code_start; + current->mm->end_code = code_start + code_size; + down_write(&current->mm->mmap_sem); + retval = do_mmap(file, code_start, code_size, prot, + flags, SOM_PAGESTART(hpuxhdr->exec_tfile)); + up_write(&current->mm->mmap_sem); + if (retval < 0 && retval > -1024) + goto out; + + data_start = SOM_PAGESTART(hpuxhdr->exec_dmem); + data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize); + current->mm->start_data = data_start; + current->mm->end_data = bss_start = data_start + data_size; + down_write(&current->mm->mmap_sem); + retval = do_mmap(file, data_start, data_size, + prot | PROT_WRITE, flags, + SOM_PAGESTART(hpuxhdr->exec_dfile)); + up_write(&current->mm->mmap_sem); + if (retval < 0 && retval > -1024) + goto out; + + som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize); + current->mm->start_brk = current->mm->brk = som_brk; + down_write(&current->mm->mmap_sem); + retval = do_mmap(NULL, bss_start,
som_brk - bss_start, + prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0); + up_write(&current->mm->mmap_sem); + if (retval > 0 || retval < -1024) + retval = 0; +out: + set_fs(old_fs); + return retval; +} + + +/* + * These are the functions used to load SOM executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +static int +load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) +{ + int retval; + unsigned int size; + unsigned long som_entry; + struct som_hdr *som_ex; + struct som_exec_auxhdr *hpuxhdr; + + /* Get the exec-header */ + som_ex = (struct som_hdr *) bprm->buf; + + retval = check_som_header(som_ex); + if (retval != 0) + goto out; + + /* Now read in the auxiliary header information */ + + retval = -ENOMEM; + size = som_ex->aux_header_size; + if (size > SOM_PAGESIZE) + goto out; + hpuxhdr = kmalloc(size, GFP_KERNEL); + if (!hpuxhdr) + goto out; + + retval = kernel_read(bprm->file, som_ex->aux_header_location, + (char *) hpuxhdr, size); + if (retval != size) { + if (retval >= 0) + retval = -EIO; + goto out_free; + } + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + goto out_free; + + /* OK, This is the point of no return */ + current->flags &= ~PF_FORKNOEXEC; + current->personality = PER_HPUX; + setup_new_exec(bprm); + + /* Set the task size for HP-UX processes such that + * the gateway page is outside the address space. + * This can be fixed later, but for now, this is much + * easier. + */ + + current->thread.task_size = 0xc0000000; + + /* Set map base to allow enough room for hp-ux heap growth */ + + current->thread.map_base = 0x80000000; + + retval = map_som_binary(bprm->file, hpuxhdr); + if (retval < 0) + goto out_free; + + som_entry = hpuxhdr->exec_entry; + kfree(hpuxhdr); + + set_binfmt(&som_format); + install_exec_creds(bprm); + setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + + create_som_tables(bprm); + + current->mm->start_stack = bprm->p; + +#if 0 + printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); + printk("(end_code) %08lx\n" , (unsigned long) current->mm->end_code); + printk("(start_code) %08lx\n" , (unsigned long) current->mm->start_code); + printk("(end_data) %08lx\n" , (unsigned long) current->mm->end_data); + printk("(start_stack) %08lx\n" , (unsigned long) current->mm->start_stack); + printk("(brk) %08lx\n" , (unsigned long) current->mm->brk); +#endif + + map_hpux_gateway_page(current,current->mm); + + start_thread_som(regs, som_entry, bprm->p); + return 0; + + /* error cleanup */ +out_free: + kfree(hpuxhdr); +out: + return retval; +} + +static int load_som_library(struct file *f) +{ +/* No lib support in SOM yet. gizza chance.. */ + return -ENOEXEC; +} + /* Install the SOM loader. + * N.B. We *rely* on the table being the right size with the + * right number of free slots... + */ + +static int __init init_som_binfmt(void) +{ + return register_binfmt(&som_format); +} + +static void __exit exit_som_binfmt(void) +{ + /* Remove the SOM loader. */ + unregister_binfmt(&som_format); +} + +core_initcall(init_som_binfmt); +module_exit(exit_som_binfmt); + +MODULE_LICENSE("GPL"); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 00000000..9c5e6b2c --- /dev/null +++ b/fs/bio-integrity.c @@ -0,0 +1,807 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K.
Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include +#include + +struct integrity_slab { + struct kmem_cache *slab; + unsigned short nr_vecs; + char name[8]; +}; + +#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) } +struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = { + IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES), +}; +#undef IS + +static struct workqueue_struct *kintegrityd_wq; + +static inline unsigned int vecs_to_idx(unsigned int nr) +{ + switch (nr) { + case 1: + return 0; + case 2 ... 4: + return 1; + case 5 ... 16: + return 2; + case 17 ... 64: + return 3; + case 65 ... 128: + return 4; + case 129 ... BIO_MAX_PAGES: + return 5; + default: + BUG(); + } +} + +static inline int use_bip_pool(unsigned int idx) +{ + if (idx == BIOVEC_MAX_IDX) + return 1; + + return 0; +} + +/** + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs, + struct bio_set *bs) +{ + struct bio_integrity_payload *bip; + unsigned int idx = vecs_to_idx(nr_vecs); + + BUG_ON(bio == NULL); + bip = NULL; + + /* Lower order allocations come straight from slab */ + if (!use_bip_pool(idx)) + bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask); + + /* Use mempool if lower order alloc failed or max vecs were requested */ + if (bip == NULL) { + idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } + } + + memset(bip, 0, sizeof(*bip)); + + bip->bip_slab = idx; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +} +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. 
+ */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio, struct bio_set *bs) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip == NULL); + + /* A cloned bio doesn't own the integrity metadata */ + if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) + && bip->bip_buf != NULL) + kfree(bip->bip_buf); + + if (use_bip_pool(bip->bip_slab)) + mempool_free(bip, bs->bio_integrity_pool); + else + kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip_vec_idx(bip, bip->bip_vcnt); + BUG_ON(iv == NULL); + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +static int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return 1; + + return 0; +} + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The functions honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + /* Already protected? */ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. 
+ */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, + unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. + */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_size == 0); + + return bi->tag_size * (bio->bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, + DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", + __func__, nr_sectors * bi->tuple_size, bip->bip_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. 
+ */ +static void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_sector; + unsigned int i, sectors, total; + void *prot_buf = bio->bi_integrity->bip_buf; + + total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + bi->generate_fn(&bix); + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } +} + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set priot to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -ENOMEM; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * 
Description: This function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_integrity->bip_sector; + unsigned int i, sectors, total, ret; + void *prot_buf = bio->bi_integrity->bip_buf; + + ret = total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + ret = bi->verify_fn(&bix); + + if (ret) { + kunmap_atomic(kaddr, KM_USER0); + return ret; + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } + + return ret; +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error; + + error = bio_integrity_verify(bio); + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + /* In case of an I/O error there is no point in verifying the + * integrity metadata. Restore original bio end_io handler + * and run it. 
+ */ + if (error) { + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio, error); + + return; + } + + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_mark_head - Advance bip_vec skip bytes + * @bip: Integrity vector to advance + * @skip: Number of bytes to advance it + */ +void bio_integrity_mark_head(struct bio_integrity_payload *bip, + unsigned int skip) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (skip == 0) { + bip->bip_idx = i; + return; + } else if (skip >= iv->bv_len) { + skip -= iv->bv_len; + } else { /* skip < iv->bv_len) */ + iv->bv_offset += skip; + iv->bv_len -= skip; + bip->bip_idx = i; + return; + } + } +} + +/** + * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long + * @bip: Integrity vector to truncate + * @len: New length of integrity vector + */ +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, + unsigned int len) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (len == 0) { + bip->bip_vcnt = i; + return; + } else if (len >= iv->bv_len) { + len -= iv->bv_len; + } else { /* len < iv->bv_len) */ + iv->bv_len = len; + len = 0; + } + } +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + + nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); + bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, + unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + BUG_ON(!bio_flagged(bio, BIO_CLONED)); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + bip->bip_sector = bip->bip_sector + offset; + bio_integrity_mark_head(bip, offset * bi->tuple_size); + bio_integrity_mark_tail(bip, sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_split - Split integrity metadata + * @bio: Protected bio + * @bp: Resulting bio_pair + * @sectors: Offset + * + * Description: Splits an integrity page into a bio_pair. 
+ */ +void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip = bio->bi_integrity; + unsigned int nr_sectors; + + if (bio_integrity(bio) == 0) + return; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bip->bip_vcnt != 1); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + + bp->bio1.bi_integrity = &bp->bip1; + bp->bio2.bi_integrity = &bp->bip2; + + bp->iv1 = bip->bip_vec[0]; + bp->iv2 = bip->bip_vec[0]; + + bp->bip1.bip_vec[0] = bp->iv1; + bp->bip2.bip_vec[0] = bp->iv2; + + bp->iv1.bv_len = sectors * bi->tuple_size; + bp->iv2.bv_offset += sectors * bi->tuple_size; + bp->iv2.bv_len -= sectors * bi->tuple_size; + + bp->bip1.bip_sector = bio->bi_integrity->bip_sector; + bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + + bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; + bp->bip1.bip_idx = bp->bip2.bip_idx = 0; +} +EXPORT_SYMBOL(bio_integrity_split); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @gfp_mask: Memory allocation mask + * @bs: bio_set to allocate bip from + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + gfp_t gfp_mask, struct bio_set *bs) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_sector = bip_src->bip_sector; + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_idx = bip_src->bip_idx; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); + + if (bs->bio_integrity_pool) + return 0; + + bs->bio_integrity_pool = + mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); + + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init(void) +{ + unsigned int i; + + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. + */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { + unsigned int size; + + size = sizeof(struct bio_integrity_payload) + + bip_slab[i].nr_vecs * sizeof(struct bio_vec); + + bip_slab[i].slab = + kmem_cache_create(bip_slab[i].name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + } +} diff --git a/fs/bio.c b/fs/bio.c new file mode 100644 index 00000000..9bfade8a --- /dev/null +++ b/fs/bio.c @@ -0,0 +1,1692 @@ +/* + * Copyright (C) 2001 Jens Axboe + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for struct sg_iovec */ + +#include + +/* + * Test patch to inline a certain number of bi_io_vec's inside the bio + * itself, to shrink a bio data allocation from two mempool calls to one + */ +#define BIO_INLINE_VECS 4 + +static mempool_t *bio_split_pool __read_mostly; + +/* + * if you change this list, also change bvec_alloc or things will + * break badly! cannot be bigger than what you can fit into an + * unsigned short + */ +#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } +static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { + BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), +}; +#undef BV + +/* + * fs_bio_set is the bio_set containing bio and iovec memory pools used by + * IO code that does not need private memory pools. + */ +struct bio_set *fs_bio_set; + +/* + * Our slab pool management + */ +struct bio_slab { + struct kmem_cache *slab; + unsigned int slab_ref; + unsigned int slab_size; + char name[8]; +}; +static DEFINE_MUTEX(bio_slab_lock); +static struct bio_slab *bio_slabs; +static unsigned int bio_slab_nr, bio_slab_max; + +static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +{ + unsigned int sz = sizeof(struct bio) + extra_size; + struct kmem_cache *slab = NULL; + struct bio_slab *bslab; + unsigned int i, entry = -1; + + mutex_lock(&bio_slab_lock); + + i = 0; + while (i < bio_slab_nr) { + bslab = &bio_slabs[i]; + + if (!bslab->slab && entry == -1) + entry = i; + else if (bslab->slab_size == sz) { + slab = bslab->slab; + bslab->slab_ref++; + break; + } + i++; + } + + if (slab) + goto out_unlock; + + if (bio_slab_nr == bio_slab_max && entry == -1) { + bio_slab_max <<= 1; + bio_slabs = krealloc(bio_slabs, + bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!bio_slabs) + goto out_unlock; + } + if (entry == -1) + entry = bio_slab_nr++; + + bslab = &bio_slabs[entry]; + + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); + slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); + if (!slab) + goto out_unlock; + + printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry); + bslab->slab = slab; + bslab->slab_ref = 1; + bslab->slab_size = sz; +out_unlock: + mutex_unlock(&bio_slab_lock); + return slab; +} + +static void bio_put_slab(struct bio_set *bs) +{ + struct bio_slab *bslab = NULL; + unsigned int i; + + mutex_lock(&bio_slab_lock); + + for (i = 0; i < bio_slab_nr; i++) { + if (bs->bio_slab == bio_slabs[i].slab) { + bslab = &bio_slabs[i]; + break; + } + } + + if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) + goto out; + + WARN_ON(!bslab->slab_ref); + + if (--bslab->slab_ref) + goto out; + + kmem_cache_destroy(bslab->slab); + bslab->slab = NULL; + +out: + mutex_unlock(&bio_slab_lock); +} + +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} + +void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) +{ + BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); 
+ + if (idx == BIOVEC_MAX_IDX) + mempool_free(bv, bs->bvec_pool); + else { + struct biovec_slab *bvs = bvec_slabs + idx; + + kmem_cache_free(bvs->slab, bv); + } +} + +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, + struct bio_set *bs) +{ + struct bio_vec *bvl; + + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; + default: + return NULL; + } + + /* + * idx now points to the pool we want to allocate from. only the + * 1-vec entry pool is mempool backed. + */ + if (*idx == BIOVEC_MAX_IDX) { +fallback: + bvl = mempool_alloc(bs->bvec_pool, gfp_mask); + } else { + struct biovec_slab *bvs = bvec_slabs + *idx; + gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + + /* + * Make this allocation restricted and don't dump info on + * allocation failures, since we'll fallback to the mempool + * in case of failure. + */ + __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + /* + * Try a slab allocation. If this fails and __GFP_WAIT + * is set, retry with the 1-entry mempool + */ + bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + *idx = BIOVEC_MAX_IDX; + goto fallback; + } + } + + return bvl; +} + +void bio_free(struct bio *bio, struct bio_set *bs) +{ + void *p; + + if (bio_has_allocated_vec(bio)) + bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + + if (bio_integrity(bio)) + bio_integrity_free(bio, bs); + + /* + * If we have front padding, adjust the bio pointer before freeing + */ + p = bio; + if (bs->front_pad) + p -= bs->front_pad; + + mempool_free(p, bs->bio_pool); +} +EXPORT_SYMBOL(bio_free); + +void bio_init(struct bio *bio) +{ + memset(bio, 0, sizeof(*bio)); + bio->bi_flags = 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; + atomic_set(&bio->bi_cnt, 1); +} +EXPORT_SYMBOL(bio_init); + +/** + * bio_alloc_bioset - allocate a bio for I/O + * @gfp_mask: the GFP_ mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * @bs: the bio_set to allocate from. + * + * Description: + * bio_alloc_bioset will try its own mempool to satisfy the allocation. + * If %__GFP_WAIT is set then we will block on the internal pool waiting + * for a &struct bio to become free. + * + * Note that the caller must set ->bi_destructor on successful return + * of a bio, to do the appropriate freeing of the bio once the reference + * count drops to zero. 
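As the note above says, callers of bio_alloc_bioset() must install a destructor themselves. A minimal sketch of a driver-private pool that does so follows; my_bio_set and both function names are hypothetical.

static struct bio_set *my_bio_set;	/* e.g. from bioset_create(BIO_POOL_SIZE, 0) */

static void my_bio_destructor(struct bio *bio)
{
	bio_free(bio, my_bio_set);
}

static struct bio *my_bio_alloc(gfp_t gfp_mask, int nr_iovecs)
{
	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, my_bio_set);

	if (bio)
		bio->bi_destructor = my_bio_destructor;
	return bio;
}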
+ **/ +struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) +{ + unsigned long idx = BIO_POOL_NONE; + struct bio_vec *bvl = NULL; + struct bio *bio; + void *p; + + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (unlikely(!p)) + return NULL; + bio = p + bs->front_pad; + + bio_init(bio); + + if (unlikely(!nr_iovecs)) + goto out_set; + + if (nr_iovecs <= BIO_INLINE_VECS) { + bvl = bio->bi_inline_vecs; + nr_iovecs = BIO_INLINE_VECS; + } else { + bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + if (unlikely(!bvl)) + goto err_free; + + nr_iovecs = bvec_nr_vecs(idx); + } +out_set: + bio->bi_flags |= idx << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bvl; + return bio; + +err_free: + mempool_free(p, bs->bio_pool); + return NULL; +} +EXPORT_SYMBOL(bio_alloc_bioset); + +static void bio_fs_destructor(struct bio *bio) +{ + bio_free(bio, fs_bio_set); +} + +/** + * bio_alloc - allocate a new bio, memory pool backed + * @gfp_mask: allocation mask to use + * @nr_iovecs: number of iovecs + * + * bio_alloc will allocate a bio and associated bio_vec array that can hold + * at least @nr_iovecs entries. Allocations will be done from the + * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc. + * + * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate + * a bio. This is due to the mempool guarantees. To make this work, callers + * must never allocate more than 1 bio at a time from this pool. Callers + * that need to allocate more than 1 bio must always submit the previously + * allocated bio for IO before attempting to allocate a new one. Failure to + * do so can cause livelocks under memory pressure. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); + + if (bio) + bio->bi_destructor = bio_fs_destructor; + + return bio; +} +EXPORT_SYMBOL(bio_alloc); + +static void bio_kmalloc_destructor(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_free(bio, fs_bio_set); + kfree(bio); +} + +/** + * bio_kmalloc - allocate a bio for I/O using kmalloc() + * @gfp_mask: the GFP_ mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Description: + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains + * %__GFP_WAIT, the allocation is guaranteed to succeed. + * + **/ +struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio; + + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + if (unlikely(!bio)) + return NULL; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_destructor = bio_kmalloc_destructor; + + return bio; +} +EXPORT_SYMBOL(bio_kmalloc); + +void zero_fill_bio(struct bio *bio) +{ + unsigned long flags; + struct bio_vec *bv; + int i; + + bio_for_each_segment(bv, bio, i) { + char *data = bvec_kmap_irq(bv, &flags); + memset(data, 0, bv->bv_len); + flush_dcache_page(bv->bv_page); + bvec_kunmap_irq(data, &flags); + } +} +EXPORT_SYMBOL(zero_fill_bio); + +/** + * bio_put - release a reference to a bio + * @bio: bio to release reference to + * + * Description: + * Put a reference to a &struct bio, either one you have gotten with + * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. 
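Taken together with bio_alloc(), the reference counting for a simple synchronous read looks roughly like the sketch below; sync_read_endio() and sync_read_page() are invented names, and the single bio_put() balances the reference taken at allocation time.

static void sync_read_endio(struct bio *bio, int error)
{
	complete(bio->bi_private);
}

static int sync_read_page(struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int err;

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
		bio_put(bio);
		return -EIO;
	}
	bio->bi_private = &done;
	bio->bi_end_io = sync_read_endio;

	submit_bio(READ, bio);
	wait_for_completion(&done);

	err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
	bio_put(bio);		/* last put: frees the bio */
	return err;
}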
+ **/ +void bio_put(struct bio *bio) +{ + BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->bi_cnt)) { + bio->bi_next = NULL; + bio->bi_destructor(bio); + } +} +EXPORT_SYMBOL(bio_put); + +inline int bio_phys_segments(struct request_queue *q, struct bio *bio) +{ + if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) + blk_recount_segments(q, bio); + + return bio->bi_phys_segments; +} +EXPORT_SYMBOL(bio_phys_segments); + +/** + * __bio_clone - clone a bio + * @bio: destination bio + * @bio_src: bio to clone + * + * Clone a &bio. Caller will own the returned bio, but not + * the actual data it points to. Reference count of returned + * bio will be one. + */ +void __bio_clone(struct bio *bio, struct bio *bio_src) +{ + memcpy(bio->bi_io_vec, bio_src->bi_io_vec, + bio_src->bi_max_vecs * sizeof(struct bio_vec)); + + /* + * most users will be overriding ->bi_bdev with a new target, + * so we don't set nor calculate new physical/hw segment counts here + */ + bio->bi_sector = bio_src->bi_sector; + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_flags |= 1 << BIO_CLONED; + bio->bi_rw = bio_src->bi_rw; + bio->bi_vcnt = bio_src->bi_vcnt; + bio->bi_size = bio_src->bi_size; + bio->bi_idx = bio_src->bi_idx; +} +EXPORT_SYMBOL(__bio_clone); + +/** + * bio_clone - clone a bio + * @bio: bio to clone + * @gfp_mask: allocation priority + * + * Like __bio_clone, only also allocates the returned bio + */ +struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); + + if (!b) + return NULL; + + b->bi_destructor = bio_fs_destructor; + __bio_clone(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); + + if (ret < 0) { + bio_put(b); + return NULL; + } + } + + return b; +} +EXPORT_SYMBOL(bio_clone); + +/** + * bio_get_nr_vecs - return approx number of vecs + * @bdev: I/O target + * + * Return the approximate number of pages we can send to this target. + * There's no guarantee that you will be able to fit this number of pages + * into a bio, it does not account for dynamic restrictions that vary + * on offset. + */ +int bio_get_nr_vecs(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + int nr_pages; + + nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > queue_max_segments(q)) + nr_pages = queue_max_segments(q); + + return nr_pages; +} +EXPORT_SYMBOL(bio_get_nr_vecs); + +static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page + *page, unsigned int len, unsigned int offset, + unsigned short max_sectors) +{ + int retried_segments = 0; + struct bio_vec *bvec; + + /* + * cloned bio must not modify vec list + */ + if (unlikely(bio_flagged(bio, BIO_CLONED))) + return 0; + + if (((bio->bi_size + len) >> 9) > max_sectors) + return 0; + + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. + */ + if (bio->bi_vcnt > 0) { + struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page == prev->bv_page && + offset == prev->bv_offset + prev->bv_len) { + unsigned int prev_bv_len = prev->bv_len; + prev->bv_len += len; + + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + /* prev_bvec is already charged in + bi_size, discharge it in order to + simulate merging updated prev_bvec + as new bvec. 
*/ + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size - prev_bv_len, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { + prev->bv_len -= len; + return 0; + } + } + + goto done; + } + } + + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + + /* + * we might lose a segment or two here, but rather that than + * make this too complex. + */ + + while (bio->bi_phys_segments >= queue_max_segments(q)) { + + if (retried_segments) + return 0; + + retried_segments = 1; + blk_recount_segments(q, bio); + } + + /* + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + + /* + * if queue has other restrictions (eg varying max sector size + * depending on offset), it can specify a merge_bvec_fn in the + * queue to get further control + */ + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + + /* + * merge_bvec_fn() returns number of bytes it can accept + * at this offset + */ + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + return 0; + } + } + + /* If we may be able to merge these biovecs, force a recount */ + if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + bio->bi_flags &= ~(1 << BIO_SEG_VALID); + + bio->bi_vcnt++; + bio->bi_phys_segments++; + done: + bio->bi_size += len; + return len; +} + +/** + * bio_add_pc_page - attempt to add page to bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. + */ +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return __bio_add_page(q, bio, page, len, offset, + queue_max_hw_sectors(q)); +} +EXPORT_SYMBOL(bio_add_pc_page); + +/** + * bio_add_page - attempt to add page to bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. 
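Since bio_add_page() simply refuses pages once the bio or the device limits are reached, callers loop until it returns less than they asked for, submit what they have, and start a new bio. A sketch of that pattern, sized with bio_get_nr_vecs() from above (pack_pages is a hypothetical helper):

static struct bio *pack_pages(struct block_device *bdev, sector_t sector,
			      struct page **pages, int nr_pages)
{
	int nr_vecs = min(nr_pages, bio_get_nr_vecs(bdev));
	struct bio *bio = bio_alloc(GFP_NOIO, nr_vecs);
	int i;

	if (!bio)
		return NULL;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;

	for (i = 0; i < nr_pages; i++) {
		/* a short return means the bio is full; stop and submit */
		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
			break;
	}
	return bio;	/* caller submits it and packs the rest into a new bio */
}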
+ */ +int bio_add_page(struct bio *bio, struct page *page, unsigned int len, + unsigned int offset) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); +} +EXPORT_SYMBOL(bio_add_page); + +struct bio_map_data { + struct bio_vec *iovecs; + struct sg_iovec *sgvecs; + int nr_sgvecs; + int is_our_pages; +}; + +static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, + struct sg_iovec *iov, int iov_count, + int is_our_pages) +{ + memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); + memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); + bmd->nr_sgvecs = iov_count; + bmd->is_our_pages = is_our_pages; + bio->bi_private = bmd; +} + +static void bio_free_map_data(struct bio_map_data *bmd) +{ + kfree(bmd->iovecs); + kfree(bmd->sgvecs); + kfree(bmd); +} + +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + + if (iov_count > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(sizeof(*bmd), gfp_mask); + if (!bmd) + return NULL; + + bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); + if (!bmd->iovecs) { + kfree(bmd); + return NULL; + } + + bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); + if (bmd->sgvecs) + return bmd; + + kfree(bmd->iovecs); + kfree(bmd); + return NULL; +} + +static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, + struct sg_iovec *iov, int iov_count, + int to_user, int from_user, int do_free_page) +{ + int ret = 0, i; + struct bio_vec *bvec; + int iov_idx = 0; + unsigned int iov_off = 0; + + __bio_for_each_segment(bvec, bio, i, 0) { + char *bv_addr = page_address(bvec->bv_page); + unsigned int bv_len = iovecs[i].bv_len; + + while (bv_len && iov_idx < iov_count) { + unsigned int bytes; + char __user *iov_addr; + + bytes = min_t(unsigned int, + iov[iov_idx].iov_len - iov_off, bv_len); + iov_addr = iov[iov_idx].iov_base + iov_off; + + if (!ret) { + if (to_user) + ret = copy_to_user(iov_addr, bv_addr, + bytes); + + if (from_user) + ret = copy_from_user(bv_addr, iov_addr, + bytes); + + if (ret) + ret = -EFAULT; + } + + bv_len -= bytes; + bv_addr += bytes; + iov_addr += bytes; + iov_off += bytes; + + if (iov[iov_idx].iov_len == iov_off) { + iov_idx++; + iov_off = 0; + } + } + + if (do_free_page) + __free_page(bvec->bv_page); + } + + return ret; +} + +/** + * bio_uncopy_user - finish previously mapped bio + * @bio: bio being terminated + * + * Free pages allocated from bio_copy_user() and write back data + * to user space in case of a read. + */ +int bio_uncopy_user(struct bio *bio) +{ + struct bio_map_data *bmd = bio->bi_private; + int ret = 0; + + if (!bio_flagged(bio, BIO_NULL_MAPPED)) + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, + bmd->nr_sgvecs, bio_data_dir(bio) == READ, + 0, bmd->is_our_pages); + bio_free_map_data(bmd); + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(bio_uncopy_user); + +/** + * bio_copy_user_iov - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iov: the iovec. + * @iov_count: number of elements in the iovec + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. 
+ */ +struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, + struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + struct bio_vec *bvec; + struct page *page; + struct bio *bio; + int i, ret; + int nr_pages = 0; + unsigned int len = 0; + unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; + + for (i = 0; i < iov_count; i++) { + unsigned long uaddr; + unsigned long end; + unsigned long start; + + uaddr = (unsigned long)iov[i].iov_base; + end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages += end - start; + len += iov[i].iov_len; + } + + if (offset) + nr_pages++; + + bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + goto out_bmd; + + if (!write_to_vm) + bio->bi_rw |= REQ_WRITE; + + ret = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } + while (len) { + unsigned int bytes = PAGE_SIZE; + + bytes -= offset; + + if (bytes > len) + bytes = len; + + if (map_data) { + if (i == map_data->nr_entries * nr_pages) { + ret = -ENOMEM; + break; + } + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) + break; + + len -= bytes; + offset = 0; + } + + if (ret) + goto cleanup; + + /* + * success + */ + if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); + if (ret) + goto cleanup; + } + + bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); + return bio; +cleanup: + if (!map_data) + bio_for_each_segment(bvec, bio, i) + __free_page(bvec->bv_page); + + bio_put(bio); +out_bmd: + bio_free_map_data(bmd); + return ERR_PTR(ret); +} + +/** + * bio_copy_user - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @uaddr: start of user address + * @len: length in bytes + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. 
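From the caller's side the bounce-buffer pairing looks roughly like this; bounce_write_user is an invented name and the submission step in the middle is elided.

static int bounce_write_user(struct request_queue *q, unsigned long uaddr,
			     unsigned int len)
{
	struct bio *bio;

	/* write_to_vm == 0: data flows from user space towards the device */
	bio = bio_copy_user(q, NULL, uaddr, len, 0, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	/* ... attach the bio to a request, submit it and wait ... */

	/* frees the bounce pages and drops the bio reference */
	return bio_uncopy_user(bio);
}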
+ */ +struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, + unsigned long uaddr, unsigned int len, + int write_to_vm, gfp_t gfp_mask) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)uaddr; + iov.iov_len = len; + + return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); +} +EXPORT_SYMBOL(bio_copy_user); + +static struct bio *__bio_map_user_iov(struct request_queue *q, + struct block_device *bdev, + struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) +{ + int i, j; + int nr_pages = 0; + struct page **pages; + struct bio *bio; + int cur_page = 0; + int ret, offset; + + for (i = 0; i < iov_count; i++) { + unsigned long uaddr = (unsigned long)iov[i].iov_base; + unsigned long len = iov[i].iov_len; + unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = uaddr >> PAGE_SHIFT; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages += end - start; + /* + * buffer must be aligned to at least hardsector size for now + */ + if (uaddr & queue_dma_alignment(q)) + return ERR_PTR(-EINVAL); + } + + if (!nr_pages) + return ERR_PTR(-EINVAL); + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; + pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); + if (!pages) + goto out; + + for (i = 0; i < iov_count; i++) { + unsigned long uaddr = (unsigned long)iov[i].iov_base; + unsigned long len = iov[i].iov_len; + unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = uaddr >> PAGE_SHIFT; + const int local_nr_pages = end - start; + const int page_limit = cur_page + local_nr_pages; + + ret = get_user_pages_fast(uaddr, local_nr_pages, + write_to_vm, &pages[cur_page]); + if (ret < local_nr_pages) { + ret = -EFAULT; + goto out_unmap; + } + + offset = uaddr & ~PAGE_MASK; + for (j = cur_page; j < page_limit; j++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + /* + * sorry... + */ + if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < + bytes) + break; + + len -= bytes; + offset = 0; + } + + cur_page = j; + /* + * release the pages we didn't map into the bio, if any + */ + while (j < page_limit) + page_cache_release(pages[j++]); + } + + kfree(pages); + + /* + * set data direction, and check if mapped pages need bouncing + */ + if (!write_to_vm) + bio->bi_rw |= REQ_WRITE; + + bio->bi_bdev = bdev; + bio->bi_flags |= (1 << BIO_USER_MAPPED); + return bio; + + out_unmap: + for (i = 0; i < nr_pages; i++) { + if(!pages[i]) + break; + page_cache_release(pages[i]); + } + out: + kfree(pages); + bio_put(bio); + return ERR_PTR(ret); +} + +/** + * bio_map_user - map user address into bio + * @q: the struct request_queue for the bio + * @bdev: destination block device + * @uaddr: start of user address + * @len: length in bytes + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. 
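The zero-copy variant pins the user pages instead of bouncing them, so the unmap call is mandatory once the I/O is done. A hedged sketch of the pairing (read_into_user_buffer is a made-up caller; submission is elided):

static int read_into_user_buffer(struct request_queue *q,
				 struct block_device *bdev,
				 unsigned long uaddr, unsigned int len)
{
	struct bio *bio;

	/* write_to_vm == 1: the device writes into the user pages */
	bio = bio_map_user(q, bdev, uaddr, len, 1, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);	/* e.g. -EINVAL if uaddr is misaligned */

	/* ... submit the bio and wait for completion ... */

	bio_unmap_user(bio);	/* dirties and unpins the pages, frees the bio */
	return 0;
}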
+ */ +struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, + unsigned long uaddr, unsigned int len, int write_to_vm, + gfp_t gfp_mask) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)uaddr; + iov.iov_len = len; + + return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); +} +EXPORT_SYMBOL(bio_map_user); + +/** + * bio_map_user_iov - map user sg_iovec table into bio + * @q: the struct request_queue for the bio + * @bdev: destination block device + * @iov: the iovec. + * @iov_count: number of elements in the iovec + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, + struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) +{ + struct bio *bio; + + bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, + gfp_mask); + if (IS_ERR(bio)) + return bio; + + /* + * subtle -- if __bio_map_user() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); + + return bio; +} + +static void __bio_unmap_user(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + /* + * make sure we dirty pages we wrote to + */ + __bio_for_each_segment(bvec, bio, i, 0) { + if (bio_data_dir(bio) == READ) + set_page_dirty_lock(bvec->bv_page); + + page_cache_release(bvec->bv_page); + } + + bio_put(bio); +} + +/** + * bio_unmap_user - unmap a bio + * @bio: the bio being unmapped + * + * Unmap a bio previously mapped by bio_map_user(). Must be called with + * a process context. + * + * bio_unmap_user() may sleep. + */ +void bio_unmap_user(struct bio *bio) +{ + __bio_unmap_user(bio); + bio_put(bio); +} +EXPORT_SYMBOL(bio_unmap_user); + +static void bio_map_kern_endio(struct bio *bio, int err) +{ + bio_put(bio); +} + +static struct bio *__bio_map_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + int offset, i; + struct bio *bio; + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, + offset) < bytes) + break; + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_map_kern_endio; + return bio; +} + +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask) +{ + struct bio *bio; + + bio = __bio_map_kern(q, data, len, gfp_mask); + if (IS_ERR(bio)) + return bio; + + if (bio->bi_size == len) + return bio; + + /* + * Don't support partial mappings. 
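bio_map_kern() plays the same role for kernel buffers; the buffer must be kmalloc'ed (it is resolved with virt_to_page()) and must stay allocated until the I/O completes. A sketch of a fire-and-forget write, with write_kernel_buffer as an invented name:

static int write_kernel_buffer(struct request_queue *q,
			       struct block_device *bdev, sector_t sector,
			       void *buf, unsigned int len)
{
	struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);

	if (IS_ERR(bio))
		return PTR_ERR(bio);	/* -EINVAL if only part of buf fit */

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	/* bio_map_kern_endio() drops the reference when the write finishes */
	submit_bio(WRITE, bio);
	return 0;
}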
+ */ + bio_put(bio); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL(bio_map_kern); + +static void bio_copy_kern_endio(struct bio *bio, int err) +{ + struct bio_vec *bvec; + const int read = bio_data_dir(bio) == READ; + struct bio_map_data *bmd = bio->bi_private; + int i; + char *p = bmd->sgvecs[0].iov_base; + + __bio_for_each_segment(bvec, bio, i, 0) { + char *addr = page_address(bvec->bv_page); + int len = bmd->iovecs[i].bv_len; + + if (read) + memcpy(p, addr, len); + + __free_page(bvec->bv_page); + p += len; + } + + bio_free_map_data(bmd); + bio_put(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask, int reading) +{ + struct bio *bio; + struct bio_vec *bvec; + int i; + + bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); + if (IS_ERR(bio)) + return bio; + + if (!reading) { + void *p = data; + + bio_for_each_segment(bvec, bio, i) { + char *addr = page_address(bvec->bv_page); + + memcpy(addr, p, bvec->bv_len); + p += bvec->bv_len; + } + } + + bio->bi_end_io = bio_copy_kern_endio; + + return bio; +} +EXPORT_SYMBOL(bio_copy_kern); + +/* + * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions + * for performing direct-IO in BIOs. + * + * The problem is that we cannot run set_page_dirty() from interrupt context + * because the required locks are not interrupt-safe. So what we can do is to + * mark the pages dirty _before_ performing IO. And in interrupt context, + * check that the pages are still dirty. If so, fine. If not, redirty them + * in process context. + * + * We special-case compound pages here: normally this means reads into hugetlb + * pages. The logic in here doesn't really work right for compound pages + * because the VM does not uniformly chase down the head page in all cases. + * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't + * handle them at all. So we skip compound pages here at an early stage. + * + * Note that this code is very hard to test under normal circumstances because + * direct-io pins the pages with get_user_pages(). This makes + * is_page_cache_freeable return false, and the VM will not clean the pages. + * But other code (eg, pdflush) could clean the pages if they are mapped + * pagecache. + * + * Simply disabling the call to bio_set_pages_dirty() is a good way to test the + * deferred bio dirtying paths. + */ + +/* + * bio_set_pages_dirty() will mark all the bio's pages as dirty. + */ +void bio_set_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int i; + + for (i = 0; i < bio->bi_vcnt; i++) { + struct page *page = bvec[i].bv_page; + + if (page && !PageCompound(page)) + set_page_dirty_lock(page); + } +} + +static void bio_release_pages(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int i; + + for (i = 0; i < bio->bi_vcnt; i++) { + struct page *page = bvec[i].bv_page; + + if (page) + put_page(page); + } +} + +/* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. + * If they are, then fine. 
If, however, some pages are clean then they must + * have been written out during the direct-IO read. So we take another ref on + * the BIO and the offending pages and re-dirty the pages in process context. + * + * It is expected that bio_check_pages_dirty() will wholly own the BIO from + * here on. It will run one page_cache_release() against each page and will + * run one bio_put() against the BIO. + */ + +static void bio_dirty_fn(struct work_struct *work); + +static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); +static DEFINE_SPINLOCK(bio_dirty_lock); +static struct bio *bio_dirty_list; + +/* + * This runs in process context + */ +static void bio_dirty_fn(struct work_struct *work) +{ + unsigned long flags; + struct bio *bio; + + spin_lock_irqsave(&bio_dirty_lock, flags); + bio = bio_dirty_list; + bio_dirty_list = NULL; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + + while (bio) { + struct bio *next = bio->bi_private; + + bio_set_pages_dirty(bio); + bio_release_pages(bio); + bio_put(bio); + bio = next; + } +} + +void bio_check_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int nr_clean_pages = 0; + int i; + + for (i = 0; i < bio->bi_vcnt; i++) { + struct page *page = bvec[i].bv_page; + + if (PageDirty(page) || PageCompound(page)) { + page_cache_release(page); + bvec[i].bv_page = NULL; + } else { + nr_clean_pages++; + } + } + + if (nr_clean_pages) { + unsigned long flags; + + spin_lock_irqsave(&bio_dirty_lock, flags); + bio->bi_private = bio_dirty_list; + bio_dirty_list = bio; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } else { + bio_put(bio); + } +} + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +void bio_flush_dcache_pages(struct bio *bi) +{ + int i; + struct bio_vec *bvec; + + bio_for_each_segment(bvec, bi, i) + flush_dcache_page(bvec->bv_page); +} +EXPORT_SYMBOL(bio_flush_dcache_pages); +#endif + +/** + * bio_endio - end I/O on a bio + * @bio: bio + * @error: error, if any + * + * Description: + * bio_endio() will end I/O on the whole bio. bio_endio() is the + * preferred way to end I/O on a bio, it takes care of clearing + * BIO_UPTODATE on error. @error is 0 on success, and and one of the + * established -Exxxx (-EIO, for instance) error values in case + * something went wrong. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io + * function. 
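A completion handler that follows this convention typically records the error, wakes the submitter and drops its reference; the context structure and names below are hypothetical.

struct my_io_done {			/* per-I/O context supplied via bi_private */
	struct completion event;
	int error;
};

static void my_end_io(struct bio *bio, int error)
{
	struct my_io_done *d = bio->bi_private;

	/* 0 on success, otherwise a negative errno as described above */
	d->error = error;
	complete(&d->event);
	bio_put(bio);		/* this handler owns the last reference */
}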
+ **/ +void bio_endio(struct bio *bio, int error) +{ + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if (bio->bi_end_io) + bio->bi_end_io(bio, error); +} +EXPORT_SYMBOL(bio_endio); + +void bio_pair_release(struct bio_pair *bp) +{ + if (atomic_dec_and_test(&bp->cnt)) { + struct bio *master = bp->bio1.bi_private; + + bio_endio(master, bp->error); + mempool_free(bp, bp->bio2.bi_private); + } +} +EXPORT_SYMBOL(bio_pair_release); + +static void bio_pair_end_1(struct bio *bi, int err) +{ + struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); + + if (err) + bp->error = err; + + bio_pair_release(bp); +} + +static void bio_pair_end_2(struct bio *bi, int err) +{ + struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); + + if (err) + bp->error = err; + + bio_pair_release(bp); +} + +/* + * split a bio - only worry about a bio with a single page in its iovec + */ +struct bio_pair *bio_split(struct bio *bi, int first_sectors) +{ + struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); + + if (!bp) + return bp; + + trace_block_split(bdev_get_queue(bi->bi_bdev), bi, + bi->bi_sector + first_sectors); + + BUG_ON(bi->bi_vcnt != 1); + BUG_ON(bi->bi_idx != 0); + atomic_set(&bp->cnt, 3); + bp->error = 0; + bp->bio1 = *bi; + bp->bio2 = *bi; + bp->bio2.bi_sector += first_sectors; + bp->bio2.bi_size -= first_sectors << 9; + bp->bio1.bi_size = first_sectors << 9; + + bp->bv1 = bi->bi_io_vec[0]; + bp->bv2 = bi->bi_io_vec[0]; + bp->bv2.bv_offset += first_sectors << 9; + bp->bv2.bv_len -= first_sectors << 9; + bp->bv1.bv_len = first_sectors << 9; + + bp->bio1.bi_io_vec = &bp->bv1; + bp->bio2.bi_io_vec = &bp->bv2; + + bp->bio1.bi_max_vecs = 1; + bp->bio2.bi_max_vecs = 1; + + bp->bio1.bi_end_io = bio_pair_end_1; + bp->bio2.bi_end_io = bio_pair_end_2; + + bp->bio1.bi_private = bi; + bp->bio2.bi_private = bio_split_pool; + + if (bio_integrity(bi)) + bio_integrity_split(bi, bp, first_sectors); + + return bp; +} +EXPORT_SYMBOL(bio_split); + +/** + * bio_sector_offset - Find hardware sector offset in bio + * @bio: bio to inspect + * @index: bio_vec index + * @offset: offset in bv_page + * + * Return the number of hardware sectors between beginning of bio + * and an end point indicated by a bio_vec index and an offset + * within that vector's page. + */ +sector_t bio_sector_offset(struct bio *bio, unsigned short index, + unsigned int offset) +{ + unsigned int sector_sz; + struct bio_vec *bv; + sector_t sectors; + int i; + + sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); + sectors = 0; + + if (index >= bio->bi_idx) + index = bio->bi_vcnt - 1; + + __bio_for_each_segment(bv, bio, i, 0) { + if (i == index) { + if (offset > bv->bv_offset) + sectors += (offset - bv->bv_offset) / sector_sz; + break; + } + + sectors += bv->bv_len / sector_sz; + } + + return sectors; +} +EXPORT_SYMBOL(bio_sector_offset); + +/* + * create memory pools for biovec's in a bio_set. + * use the global biovec slabs created for general use. 
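A sketch of how a striping driver uses bio_split() from above: it only works for bios with a single bio_vec (the BUG_ONs enforce this), first_sectors is in 512-byte units, and split_and_submit is an invented name. The final bio_pair_release() balances the count of 3 set up in bio_split().

static void split_and_submit(struct bio *bio, int first_sectors)
{
	struct bio_pair *bp = bio_split(bio, first_sectors);

	if (!bp) {
		bio_endio(bio, -ENOMEM);
		return;
	}

	/* the original bio is completed by bio_pair_release() once both
	 * halves have finished and this reference has been dropped */
	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);
}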
+ */ +static int biovec_create_pools(struct bio_set *bs, int pool_entries) +{ + struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; + + bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); + if (!bs->bvec_pool) + return -ENOMEM; + + return 0; +} + +static void biovec_free_pools(struct bio_set *bs) +{ + mempool_destroy(bs->bvec_pool); +} + +void bioset_free(struct bio_set *bs) +{ + if (bs->bio_pool) + mempool_destroy(bs->bio_pool); + + bioset_integrity_free(bs); + biovec_free_pools(bs); + bio_put_slab(bs); + + kfree(bs); +} +EXPORT_SYMBOL(bioset_free); + +/** + * bioset_create - Create a bio_set + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. + * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. + * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. + */ +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) +{ + unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + struct bio_set *bs; + + bs = kzalloc(sizeof(*bs), GFP_KERNEL); + if (!bs) + return NULL; + + bs->front_pad = front_pad; + + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + if (!bs->bio_slab) { + kfree(bs); + return NULL; + } + + bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); + if (!bs->bio_pool) + goto bad; + + if (!biovec_create_pools(bs, pool_size)) + return bs; + +bad: + bioset_free(bs); + return NULL; +} +EXPORT_SYMBOL(bioset_create); + +static void __init biovec_init_slabs(void) +{ + int i; + + for (i = 0; i < BIOVEC_NR_POOLS; i++) { + int size; + struct biovec_slab *bvs = bvec_slabs + i; + + if (bvs->nr_vecs <= BIO_INLINE_VECS) { + bvs->slab = NULL; + continue; + } + + size = bvs->nr_vecs * sizeof(struct bio_vec); + bvs->slab = kmem_cache_create(bvs->name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + } +} + +static int __init init_bio(void) +{ + bio_slab_max = 2; + bio_slab_nr = 0; + bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); + if (!bio_slabs) + panic("bio: can't allocate bios\n"); + + bio_integrity_init(); + biovec_init_slabs(); + + fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!fs_bio_set) + panic("bio: can't allocate bios\n"); + + if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) + panic("bio: can't create integrity pool\n"); + + bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, + sizeof(struct bio_pair)); + if (!bio_split_pool) + panic("bio: can't create split pool\n"); + + return 0; +} +subsys_initcall(init_bio); diff --git a/fs/block_dev.c b/fs/block_dev.c new file mode 100644 index 00000000..a580028e --- /dev/null +++ b/fs/block_dev.c @@ -0,0 +1,1671 @@ +/* + * linux/fs/block_dev.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2001 Andrea Arcangeli SuSE + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +struct bdev_inode { + struct block_device bdev; + struct inode vfs_inode; +}; + +static const struct address_space_operations def_blk_aops; + +static inline struct bdev_inode 
*BDEV_I(struct inode *inode) +{ + return container_of(inode, struct bdev_inode, vfs_inode); +} + +inline struct block_device *I_BDEV(struct inode *inode) +{ + return &BDEV_I(inode)->bdev; +} + +EXPORT_SYMBOL(I_BDEV); + +/* + * move the inode from it's current bdi to the a new bdi. if the inode is dirty + * we need to move it onto the dirty list of @dst so that the inode is always + * on the right list. + */ +static void bdev_inode_switch_bdi(struct inode *inode, + struct backing_dev_info *dst) +{ + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + inode->i_data.backing_dev_info = dst; + if (inode->i_state & I_DIRTY) + list_move(&inode->i_wb_list, &dst->wb.b_dirty); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); +} + +sector_t blkdev_max_block(struct block_device *bdev) +{ + sector_t retval = ~((sector_t)0); + loff_t sz = i_size_read(bdev->bd_inode); + + if (sz) { + unsigned int size = block_size(bdev); + unsigned int sizebits = blksize_bits(size); + retval = (sz >> sizebits); + } + return retval; +} + +/* Kill _all_ buffers and pagecache , dirty or not.. */ +static void kill_bdev(struct block_device *bdev) +{ + if (bdev->bd_inode->i_mapping->nrpages == 0) + return; + invalidate_bh_lrus(); + truncate_inode_pages(bdev->bd_inode->i_mapping, 0); +} + +int set_blocksize(struct block_device *bdev, int size) +{ + /* Size must be a power of two, and between 512 and PAGE_SIZE */ + if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + return -EINVAL; + + /* Size cannot be smaller than the size supported by the device */ + if (size < bdev_logical_block_size(bdev)) + return -EINVAL; + + /* Don't change the size if it is same as current */ + if (bdev->bd_block_size != size) { + sync_blockdev(bdev); + bdev->bd_block_size = size; + bdev->bd_inode->i_blkbits = blksize_bits(size); + kill_bdev(bdev); + } + return 0; +} + +EXPORT_SYMBOL(set_blocksize); + +int sb_set_blocksize(struct super_block *sb, int size) +{ + if (set_blocksize(sb->s_bdev, size)) + return 0; + /* If we get here, we know size is power of two + * and it's value is between 512 and PAGE_SIZE */ + sb->s_blocksize = size; + sb->s_blocksize_bits = blksize_bits(size); + return sb->s_blocksize; +} + +EXPORT_SYMBOL(sb_set_blocksize); + +int sb_min_blocksize(struct super_block *sb, int size) +{ + int minsize = bdev_logical_block_size(sb->s_bdev); + if (size < minsize) + size = minsize; + return sb_set_blocksize(sb, size); +} + +EXPORT_SYMBOL(sb_min_blocksize); + +static int +blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + if (iblock >= blkdev_max_block(I_BDEV(inode))) { + if (create) + return -EIO; + + /* + * for reads, we're just trying to fill a partial page. + * return a hole, they will have to call get_block again + * before they can fill it, and they will get -EIO at that + * time + */ + return 0; + } + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + +static int +blkdev_get_blocks(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + sector_t end_block = blkdev_max_block(I_BDEV(inode)); + unsigned long max_blocks = bh->b_size >> inode->i_blkbits; + + if ((iblock + max_blocks) > end_block) { + max_blocks = end_block - iblock; + if ((long)max_blocks <= 0) { + if (create) + return -EIO; /* write fully beyond EOF */ + /* + * It is a read which is fully beyond EOF. 
We return + * a !buffer_mapped buffer + */ + max_blocks = 0; + } + } + + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + bh->b_size = max_blocks << inode->i_blkbits; + if (max_blocks) + set_buffer_mapped(bh); + return 0; +} + +static ssize_t +blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, + nr_segs, blkdev_get_blocks, NULL, NULL, 0); +} + +int __sync_blockdev(struct block_device *bdev, int wait) +{ + if (!bdev) + return 0; + if (!wait) + return filemap_flush(bdev->bd_inode->i_mapping); + return filemap_write_and_wait(bdev->bd_inode->i_mapping); +} + +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + return __sync_blockdev(bdev, 1); +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + if (sb) { + int res = sync_filesystem(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +} +EXPORT_SYMBOL(fsync_bdev); + +/** + * freeze_bdev -- lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. + */ +struct super_block *freeze_bdev(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (++bdev->bd_fsfreeze_count > 1) { + /* + * We don't even need to grab a reference - the first call + * to freeze_bdev grab an active reference and only the last + * thaw_bdev drops it. + */ + sb = get_super(bdev); + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + + sb = get_active_super(bdev); + if (!sb) + goto out; + error = freeze_super(sb); + if (error) { + deactivate_super(sb); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + deactivate_super(sb); + out: + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; /* thaw_bdev releases s->s_umount */ +} +EXPORT_SYMBOL(freeze_bdev); + +/** + * thaw_bdev -- unlock filesystem + * @bdev: blockdevice to unlock + * @sb: associated superblock + * + * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 
+ */ +int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + int error = -EINVAL; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) + goto out; + + error = 0; + if (--bdev->bd_fsfreeze_count > 0) + goto out; + + if (!sb) + goto out; + + error = thaw_super(sb); + if (error) { + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } +out: + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; +} +EXPORT_SYMBOL(thaw_bdev); + +static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, blkdev_get_block, wbc); +} + +static int blkdev_readpage(struct file * file, struct page * page) +{ + return block_read_full_page(page, blkdev_get_block); +} + +static int blkdev_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + return block_write_begin(mapping, pos, len, flags, pagep, + blkdev_get_block); +} + +static int blkdev_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + unlock_page(page); + page_cache_release(page); + + return ret; +} + +/* + * private llseek: + * for a block special file file->f_path.dentry->d_inode->i_size is zero + * so we compute the size by hand (just as in block_read/write above) + */ +static loff_t block_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *bd_inode = file->f_mapping->host; + loff_t size; + loff_t retval; + + mutex_lock(&bd_inode->i_mutex); + size = i_size_read(bd_inode); + + switch (origin) { + case 2: + offset += size; + break; + case 1: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= size) { + if (offset != file->f_pos) { + file->f_pos = offset; + } + retval = offset; + } + mutex_unlock(&bd_inode->i_mutex); + return retval; +} + +int blkdev_fsync(struct file *filp, int datasync) +{ + struct inode *bd_inode = filp->f_mapping->host; + struct block_device *bdev = I_BDEV(bd_inode); + int error; + + /* + * There is no need to serialise calls to blkdev_issue_flush with + * i_mutex and doing so causes performance issues with concurrent + * O_SYNC writers to a block device. 
+ */ + mutex_unlock(&bd_inode->i_mutex); + + error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); + if (error == -EOPNOTSUPP) + error = 0; + + mutex_lock(&bd_inode->i_mutex); + + return error; +} +EXPORT_SYMBOL(blkdev_fsync); + +/* + * pseudo-fs + */ + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); +static struct kmem_cache * bdev_cachep __read_mostly; + +static struct inode *bdev_alloc_inode(struct super_block *sb) +{ + struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void bdev_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct bdev_inode *bdi = BDEV_I(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(bdev_cachep, bdi); +} + +static void bdev_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bdev_i_callback); +} + +static void init_once(void *foo) +{ + struct bdev_inode *ei = (struct bdev_inode *) foo; + struct block_device *bdev = &ei->bdev; + + memset(bdev, 0, sizeof(*bdev)); + mutex_init(&bdev->bd_mutex); + INIT_LIST_HEAD(&bdev->bd_inodes); + INIT_LIST_HEAD(&bdev->bd_list); +#ifdef CONFIG_SYSFS + INIT_LIST_HEAD(&bdev->bd_holder_disks); +#endif + inode_init_once(&ei->vfs_inode); + /* Initialize mutex for freeze. */ + mutex_init(&bdev->bd_fsfreeze_mutex); +} + +static inline void __bd_forget(struct inode *inode) +{ + list_del_init(&inode->i_devices); + inode->i_bdev = NULL; + inode->i_mapping = &inode->i_data; +} + +static void bdev_evict_inode(struct inode *inode) +{ + struct block_device *bdev = &BDEV_I(inode)->bdev; + struct list_head *p; + truncate_inode_pages(&inode->i_data, 0); + invalidate_inode_buffers(inode); /* is it needed here? */ + end_writeback(inode); + spin_lock(&bdev_lock); + while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { + __bd_forget(list_entry(p, struct inode, i_devices)); + } + list_del_init(&bdev->bd_list); + spin_unlock(&bdev_lock); +} + +static const struct super_operations bdev_sops = { + .statfs = simple_statfs, + .alloc_inode = bdev_alloc_inode, + .destroy_inode = bdev_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = bdev_evict_inode, +}; + +static struct dentry *bd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); +} + +static struct file_system_type bd_type = { + .name = "bdev", + .mount = bd_mount, + .kill_sb = kill_anon_super, +}; + +struct super_block *blockdev_superblock __read_mostly; + +void __init bdev_cache_init(void) +{ + int err; + struct vfsmount *bd_mnt; + + bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + init_once); + err = register_filesystem(&bd_type); + if (err) + panic("Cannot register bdev pseudo-fs"); + bd_mnt = kern_mount(&bd_type); + if (IS_ERR(bd_mnt)) + panic("Cannot create bdev pseudo-fs"); + /* + * This vfsmount structure is only used to obtain the + * blockdev_superblock, so tell kmemleak not to report it. + */ + kmemleak_not_leak(bd_mnt); + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ +} + +/* + * Most likely _very_ bad one - but then it's hardly critical for small + * /dev and can be fixed when somebody will need really large one. + * Keep in mind that it will be fed through icache hash function too. 
+ */ +static inline unsigned long hash(dev_t dev) +{ + return MAJOR(dev)+MINOR(dev); +} + +static int bdev_test(struct inode *inode, void *data) +{ + return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; +} + +static int bdev_set(struct inode *inode, void *data) +{ + BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; + return 0; +} + +static LIST_HEAD(all_bdevs); + +struct block_device *bdget(dev_t dev) +{ + struct block_device *bdev; + struct inode *inode; + + inode = iget5_locked(blockdev_superblock, hash(dev), + bdev_test, bdev_set, &dev); + + if (!inode) + return NULL; + + bdev = &BDEV_I(inode)->bdev; + + if (inode->i_state & I_NEW) { + bdev->bd_contains = NULL; + bdev->bd_inode = inode; + bdev->bd_block_size = (1 << inode->i_blkbits); + bdev->bd_part_count = 0; + bdev->bd_invalidated = 0; + inode->i_mode = S_IFBLK; + inode->i_rdev = dev; + inode->i_bdev = bdev; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + inode->i_data.backing_dev_info = &default_backing_dev_info; + spin_lock(&bdev_lock); + list_add(&bdev->bd_list, &all_bdevs); + spin_unlock(&bdev_lock); + unlock_new_inode(inode); + } + return bdev; +} + +EXPORT_SYMBOL(bdget); + +/** + * bdgrab -- Grab a reference to an already referenced block device + * @bdev: Block device to grab a reference to. + */ +struct block_device *bdgrab(struct block_device *bdev) +{ + ihold(bdev->bd_inode); + return bdev; +} + +long nr_blockdev_pages(void) +{ + struct block_device *bdev; + long ret = 0; + spin_lock(&bdev_lock); + list_for_each_entry(bdev, &all_bdevs, bd_list) { + ret += bdev->bd_inode->i_mapping->nrpages; + } + spin_unlock(&bdev_lock); + return ret; +} + +void bdput(struct block_device *bdev) +{ + iput(bdev->bd_inode); +} + +EXPORT_SYMBOL(bdput); + +static struct block_device *bd_acquire(struct inode *inode) +{ + struct block_device *bdev; + + spin_lock(&bdev_lock); + bdev = inode->i_bdev; + if (bdev) { + ihold(bdev->bd_inode); + spin_unlock(&bdev_lock); + return bdev; + } + spin_unlock(&bdev_lock); + + bdev = bdget(inode->i_rdev); + if (bdev) { + spin_lock(&bdev_lock); + if (!inode->i_bdev) { + /* + * We take an additional reference to bd_inode, + * and it's released in clear_inode() of inode. + * So, we can access it via ->i_mapping always + * without igrab(). + */ + ihold(bdev->bd_inode); + inode->i_bdev = bdev; + inode->i_mapping = bdev->bd_inode->i_mapping; + list_add(&inode->i_devices, &bdev->bd_inodes); + } + spin_unlock(&bdev_lock); + } + return bdev; +} + +/* Call when you free inode */ + +void bd_forget(struct inode *inode) +{ + struct block_device *bdev = NULL; + + spin_lock(&bdev_lock); + if (inode->i_bdev) { + if (!sb_is_blkdev_sb(inode->i_sb)) + bdev = inode->i_bdev; + __bd_forget(inode); + } + spin_unlock(&bdev_lock); + + if (bdev) + iput(bdev->bd_inode); +} + +/** + * bd_may_claim - test whether a block device can be claimed + * @bdev: block device of interest + * @whole: whole block device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Test whether @bdev can be claimed by @holder. + * + * CONTEXT: + * spin_lock(&bdev_lock). + * + * RETURNS: + * %true if @bdev can be claimed, %false otherwise. 
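The claiming helpers below are internal; exclusive opens normally reach them through blkdev_get()/blkdev_get_by_path() with FMODE_EXCL, which are assumed here (they live elsewhere in this file) rather than shown. A hedged sketch:

static struct block_device *claim_by_path(const char *path, void *holder)
{
	/* FMODE_EXCL routes the open through the claiming logic below; a
	 * second claim with a different holder fails with -EBUSY. Release
	 * with blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL). */
	return blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
}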
+ */ +static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + if (bdev->bd_holder == holder) + return true; /* already a holder */ + else if (bdev->bd_holder != NULL) + return false; /* held by someone else */ + else if (bdev->bd_contains == bdev) + return true; /* is a whole device which isn't held */ + + else if (whole->bd_holder == bd_may_claim) + return true; /* is a partition of a device that is being partitioned */ + else if (whole->bd_holder != NULL) + return false; /* is a partition of a held device */ + else + return true; /* is a partition of an un-held device */ +} + +/** + * bd_prepare_to_claim - prepare to claim a block device + * @bdev: block device of interest + * @whole: the whole device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Prepare to claim @bdev. This function fails if @bdev is already + * claimed by another holder and waits if another claiming is in + * progress. This function doesn't actually claim. On successful + * return, the caller has ownership of bd_claiming and bd_holder[s]. + * + * CONTEXT: + * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab + * it multiple times. + * + * RETURNS: + * 0 if @bdev can be claimed, -EBUSY otherwise. + */ +static int bd_prepare_to_claim(struct block_device *bdev, + struct block_device *whole, void *holder) +{ +retry: + /* if someone else claimed, fail */ + if (!bd_may_claim(bdev, whole, holder)) + return -EBUSY; + + /* if claiming is already in progress, wait for it to finish */ + if (whole->bd_claiming) { + wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + DEFINE_WAIT(wait); + + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&bdev_lock); + schedule(); + finish_wait(wq, &wait); + spin_lock(&bdev_lock); + goto retry; + } + + /* yay, all mine */ + return 0; +} + +/** + * bd_start_claiming - start claiming a block device + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * + * @bdev is about to be opened exclusively. Check @bdev can be opened + * exclusively and mark that an exclusive open is in progress. Each + * successful call to this function must be matched with a call to + * either bd_finish_claiming() or bd_abort_claiming() (which do not + * fail). + * + * This function is used to gain exclusive access to the block device + * without actually causing other exclusive open attempts to fail. It + * should be used when the open sequence itself requires exclusive + * access but may subsequently fail. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to the block device containing @bdev on success, ERR_PTR() + * value on failure. + */ +static struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder) +{ + struct gendisk *disk; + struct block_device *whole; + int partno, err; + + might_sleep(); + + /* + * @bdev might not have been initialized properly yet, look up + * and grab the outer block device the hard way. + */ + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + return ERR_PTR(-ENXIO); + + /* + * Normally, @bdev should equal what's returned from bdget_disk() + * if partno is 0; however, some drivers (floppy) use multiple + * bdev's for the same physical device and @bdev may be one of the + * aliases. Keep @bdev if partno is 0. This means claimer + * tracking is broken for those devices but it has always been that + * way. 
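+ *
+ * Put differently (illustrative): claiming sda3 makes @whole the
+ * whole-disk bdev of sda obtained via bdget_disk(disk, 0), while
+ * claiming the whole disk sda itself just takes an extra reference
+ * and keeps @whole == @bdev.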
+ */ + if (partno) + whole = bdget_disk(disk, 0); + else + whole = bdgrab(bdev); + + module_put(disk->fops->owner); + put_disk(disk); + if (!whole) + return ERR_PTR(-ENOMEM); + + /* prepare to claim, if successful, mark claiming in progress */ + spin_lock(&bdev_lock); + + err = bd_prepare_to_claim(bdev, whole, holder); + if (err == 0) { + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); + return whole; + } else { + spin_unlock(&bdev_lock); + bdput(whole); + return ERR_PTR(err); + } +} + +#ifdef CONFIG_SYSFS +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; +} + +static int add_symlink(struct kobject *from, struct kobject *to) +{ + return sysfs_create_link(from, to, kobject_name(to)); +} + +static void del_symlink(struct kobject *from, struct kobject *to) +{ + sysfs_remove_link(from, kobject_name(to)); +} + +/** + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * This functions creates the following sysfs symlinks. + * + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret = 0; + + mutex_lock(&bdev->bd_mutex); + + WARN_ON_ONCE(!bdev->bd_holder); + + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + goto out_unlock; + + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; + goto out_unlock; + } + + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; + } + + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; + + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + goto out_free; + + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it. + */ + kobject_get(bdev->bd_part->holder_dir); + + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; + +out_del: + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +out_free: + kfree(holder); +out_unlock: + mutex_unlock(&bdev->bd_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(bd_link_disk_holder); + +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the calimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. 
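+ *
+ * A minimal pairing sketch for a stacking driver; sda_bdev and dm_disk
+ * are placeholder names for the claimed slave bdev and the holding
+ * gendisk:
+ *
+ *	err = bd_link_disk_holder(sda_bdev, dm_disk);
+ *	if (err)
+ *		return err;
+ *	...
+ *	bd_unlink_disk_holder(sda_bdev, dm_disk);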
+ */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + mutex_lock(&bdev->bd_mutex); + + holder = bd_find_holder_disk(bdev, disk); + + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, + &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_part->holder_dir); + list_del_init(&holder->list); + kfree(holder); + } + + mutex_unlock(&bdev->bd_mutex); +} +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); +#endif + +/** + * flush_disk - invalidates all buffer-cache entries on a disk + * + * @bdev: struct block device to be flushed + * @kill_dirty: flag to guide handling of dirty inodes + * + * Invalidates all buffer-cache entries on a disk. It should be called + * when a disk has been changed -- either by a media change or online + * resize. + */ +static void flush_disk(struct block_device *bdev, bool kill_dirty) +{ + if (__invalidate_device(bdev, kill_dirty)) { + char name[BDEVNAME_SIZE] = ""; + + if (bdev->bd_disk) + disk_name(bdev->bd_disk, 0, name); + printk(KERN_WARNING "VFS: busy inodes on changed media or " + "resized disk %s\n", name); + } + + if (!bdev->bd_disk) + return; + if (disk_partitionable(bdev->bd_disk)) + bdev->bd_invalidated = 1; +} + +/** + * check_disk_size_change - checks for disk size change and adjusts bdev size. + * @disk: struct gendisk to check + * @bdev: struct bdev to adjust. + * + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. + */ +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +{ + loff_t disk_size, bdev_size; + + disk_size = (loff_t)get_capacity(disk) << 9; + bdev_size = i_size_read(bdev->bd_inode); + if (disk_size != bdev_size) { + char name[BDEVNAME_SIZE]; + + disk_name(disk, 0, name); + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + name, bdev_size, disk_size); + i_size_write(bdev->bd_inode, disk_size); + flush_disk(bdev, false); + } +} +EXPORT_SYMBOL(check_disk_size_change); + +/** + * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back + * @disk: struct gendisk to be revalidated + * + * This routine is a wrapper for lower-level driver's revalidate_disk + * call-backs. It is used to do common pre and post operations needed + * for all revalidate_disk operations. + */ +int revalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev; + int ret = 0; + + if (disk->fops->revalidate_disk) + ret = disk->fops->revalidate_disk(disk); + + bdev = bdget_disk(disk, 0); + if (!bdev) + return ret; + + mutex_lock(&bdev->bd_mutex); + check_disk_size_change(disk, bdev); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} +EXPORT_SYMBOL(revalidate_disk); + +/* + * This routine checks whether a removable media has been changed, + * and invalidates all buffer-cache-entries in that case. This + * is a relatively slow routine, so we have to try to minimize using + * it. Thus it is called only upon a 'mount' or 'open'. This + * is the best way of combining speed and utility, I think. 
+ * People changing diskettes in the middle of an operation deserve + * to lose :-) + */ +int check_disk_change(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + const struct block_device_operations *bdops = disk->fops; + unsigned int events; + + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return 0; + + flush_disk(bdev, true); + if (bdops->revalidate_disk) + bdops->revalidate_disk(bdev->bd_disk); + return 1; +} + +EXPORT_SYMBOL(check_disk_change); + +void bd_set_size(struct block_device *bdev, loff_t size) +{ + unsigned bsize = bdev_logical_block_size(bdev); + + bdev->bd_inode->i_size = size; + while (bsize < PAGE_CACHE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} +EXPORT_SYMBOL(bd_set_size); + +static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); + +/* + * bd_mutex locking: + * + * mutex_lock(part->bd_mutex) + * mutex_lock_nested(whole->bd_mutex, 1) + */ + +static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) +{ + struct gendisk *disk; + struct module *owner; + int ret; + int partno; + int perm = 0; + + if (mode & FMODE_READ) + perm |= MAY_READ; + if (mode & FMODE_WRITE) + perm |= MAY_WRITE; + /* + * hooks: /n/, see "layering violations". + */ + if (!for_part) { + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret != 0) { + bdput(bdev); + return ret; + } + } + + restart: + + ret = -ENXIO; + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + goto out; + owner = disk->fops->owner; + + disk_block_events(disk); + mutex_lock_nested(&bdev->bd_mutex, for_part); + if (!bdev->bd_openers) { + bdev->bd_disk = disk; + bdev->bd_contains = bdev; + if (!partno) { + struct backing_dev_info *bdi; + + ret = -ENXIO; + bdev->bd_part = disk_get_part(disk, partno); + if (!bdev->bd_part) + goto out_clear; + + ret = 0; + if (disk->fops->open) { + ret = disk->fops->open(bdev, mode); + if (ret == -ERESTARTSYS) { + /* Lost a race with 'disk' being + * deleted, try again. + * See md.c + */ + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + bdev->bd_disk = NULL; + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + put_disk(disk); + module_put(owner); + goto restart; + } + } + + if (!ret && !bdev->bd_openers) { + bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + bdi = blk_get_backing_dev_info(bdev); + if (bdi == NULL) + bdi = &default_backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, bdi); + } + + /* + * If the device is invalidated, rescan partition + * if open succeeded or failed with -ENOMEDIUM. + * The latter is necessary to prevent ghost + * partitions on a removed medium. 
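+	 * (Illustrative case: a removable-media drive whose medium was
+	 *  pulled typically fails ->open() with -ENOMEDIUM, so the
+	 *  invalidate_partitions() branch below drops the stale
+	 *  partition entries instead of leaving ghosts behind.)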
+ */ + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } + if (ret) + goto out_clear; + } else { + struct block_device *whole; + whole = bdget_disk(disk, 0); + ret = -ENOMEM; + if (!whole) + goto out_clear; + BUG_ON(for_part); + ret = __blkdev_get(whole, mode, 1); + if (ret) + goto out_clear; + bdev->bd_contains = whole; + bdev_inode_switch_bdi(bdev->bd_inode, + whole->bd_inode->i_data.backing_dev_info); + bdev->bd_part = disk_get_part(disk, partno); + if (!(disk->flags & GENHD_FL_UP) || + !bdev->bd_part || !bdev->bd_part->nr_sects) { + ret = -ENXIO; + goto out_clear; + } + bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + } + } else { + if (bdev->bd_contains == bdev) { + ret = 0; + if (bdev->bd_disk->fops->open) + ret = bdev->bd_disk->fops->open(bdev, mode); + /* the same as first opener case, read comment there */ + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); + } + if (ret) + goto out_unlock_bdev; + } + /* only one opener holds refs to the module and disk */ + put_disk(disk); + module_put(owner); + } + bdev->bd_openers++; + if (for_part) + bdev->bd_part_count++; + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + return 0; + + out_clear: + disk_put_part(bdev->bd_part); + bdev->bd_disk = NULL; + bdev->bd_part = NULL; + bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); + if (bdev != bdev->bd_contains) + __blkdev_put(bdev->bd_contains, mode, 1); + bdev->bd_contains = NULL; + out_unlock_bdev: + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + put_disk(disk); + module_put(owner); + out: + bdput(bdev); + + return ret; +} + +/** + * blkdev_get - open a block device + * @bdev: block_device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is + * open with exclusive access. Specifying %FMODE_EXCL with %NULL + * @holder is invalid. Exclusive opens may nest for the same @holder. + * + * On success, the reference count of @bdev is unchanged. On failure, + * @bdev is put. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) +{ + struct block_device *whole = NULL; + int res; + + WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); + + if ((mode & FMODE_EXCL) && holder) { + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + + res = __blkdev_get(bdev, mode, 0); + + if (whole) { + struct gendisk *disk = whole->bd_disk; + + /* finish claiming */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + if (!res) { + BUG_ON(!bd_may_claim(bdev, whole, holder)); + /* + * Note that for a whole device bd_holders + * will be incremented twice, and bd_holder + * will be set to bd_may_claim before being + * set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_may_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; + } + + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + + /* + * Block event polling for write claims if requested. Any + * write holder makes the write_holder state stick until + * all are released. 
This is good enough and tracking + * individual writeable reference is too fragile given the + * way @mode is used in blkdev_get/put(). + */ + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { + bdev->bd_write_holder = true; + disk_block_events(disk); + } + + mutex_unlock(&bdev->bd_mutex); + bdput(whole); + } + + return res; +} +EXPORT_SYMBOL(blkdev_get); + +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by the device file at @path. @mode + * and @holder are identical to blkdev_get(). + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder) +{ + struct block_device *bdev; + int err; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return bdev; + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, mode); + return ERR_PTR(-EACCES); + } + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by device number @dev. @mode and + * @holder are identical to blkdev_get(). + * + * Use it ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a + * device number. _Never_ to be used for internal purposes. If you + * ever need it - reconsider your API. + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ + struct block_device *bdev; + int err; + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_dev); + +static int blkdev_open(struct inode * inode, struct file * filp) +{ + struct block_device *bdev; + + /* + * Preserve backwards compatibility and allow large file access + * even if userspace doesn't ask for it explicitly. Some mkfs + * binary needs it. We might want to drop this workaround + * during an unstable branch. 
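+ *
+ * Illustrative flow (the device name is made up): a userspace
+ *
+ *	open("/dev/vda", O_RDWR | O_EXCL);
+ *
+ * arrives here; O_EXCL is translated into FMODE_EXCL below, and
+ * blkdev_get() then claims the device with the struct file itself as
+ * the exclusive holder.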
+ */ + filp->f_flags |= O_LARGEFILE; + + if (filp->f_flags & O_NDELAY) + filp->f_mode |= FMODE_NDELAY; + if (filp->f_flags & O_EXCL) + filp->f_mode |= FMODE_EXCL; + if ((filp->f_flags & O_ACCMODE) == 3) + filp->f_mode |= FMODE_WRITE_IOCTL; + + bdev = bd_acquire(inode); + if (bdev == NULL) + return -ENOMEM; + + filp->f_mapping = bdev->bd_inode->i_mapping; + + return blkdev_get(bdev, filp->f_mode, filp); +} + +static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +{ + int ret = 0; + struct gendisk *disk = bdev->bd_disk; + struct block_device *victim = NULL; + + mutex_lock_nested(&bdev->bd_mutex, for_part); + if (for_part) + bdev->bd_part_count--; + + if (!--bdev->bd_openers) { + WARN_ON_ONCE(bdev->bd_holders); + sync_blockdev(bdev); + kill_bdev(bdev); + /* ->release can cause the old bdi to disappear, + * so must switch it out first + */ + bdev_inode_switch_bdi(bdev->bd_inode, + &default_backing_dev_info); + } + if (bdev->bd_contains == bdev) { + if (disk->fops->release) + ret = disk->fops->release(disk, mode); + } + if (!bdev->bd_openers) { + struct module *owner = disk->fops->owner; + + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + bdev->bd_disk = NULL; + if (bdev != bdev->bd_contains) + victim = bdev->bd_contains; + bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); + } + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + if (victim) + __blkdev_put(victim, mode, 1); + return ret; +} + +int blkdev_put(struct block_device *bdev, fmode_t mode) +{ + if (mode & FMODE_EXCL) { + bool bdev_free; + + /* + * Release a claim on the device. The holder fields + * are protected with bdev_lock. bd_mutex is to + * synchronize disk_holder unlinking. + */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); + + /* bd_contains might point to self, check in a separate step */ + if ((bdev_free = !bdev->bd_holders)) + bdev->bd_holder = NULL; + if (!bdev->bd_contains->bd_holders) + bdev->bd_contains->bd_holder = NULL; + + spin_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and + * unblock evpoll if it was a write holder. + */ + if (bdev_free) { + if (bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + disk_check_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } + } + + mutex_unlock(&bdev->bd_mutex); + } + + return __blkdev_put(bdev, mode, 0); +} +EXPORT_SYMBOL(blkdev_put); + +static int blkdev_close(struct inode * inode, struct file * filp) +{ + struct block_device *bdev = I_BDEV(filp->f_mapping->host); + + return blkdev_put(bdev, filp->f_mode); +} + +static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct block_device *bdev = I_BDEV(file->f_mapping->host); + fmode_t mode = file->f_mode; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to updated it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; + + return blkdev_ioctl(bdev, mode, cmd, arg); +} + +/* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. 
+ */ +ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + if (ret > 0 || ret == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_write); + +/* + * Try to release a page associated with block device when the system + * is under memory pressure. + */ +static int blkdev_releasepage(struct page *page, gfp_t wait) +{ + struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; + + if (super && super->s_op->bdev_try_to_free_page) + return super->s_op->bdev_try_to_free_page(super, page, wait); + + return try_to_free_buffers(page); +} + +static const struct address_space_operations def_blk_aops = { + .readpage = blkdev_readpage, + .writepage = blkdev_writepage, + .write_begin = blkdev_write_begin, + .write_end = blkdev_write_end, + .writepages = generic_writepages, + .releasepage = blkdev_releasepage, + .direct_IO = blkdev_direct_IO, +}; + +const struct file_operations def_blk_fops = { + .open = blkdev_open, + .release = blkdev_close, + .llseek = block_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = blkdev_aio_write, + .mmap = generic_file_mmap, + .fsync = blkdev_fsync, + .unlocked_ioctl = block_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_blkdev_ioctl, +#endif + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) +{ + int res; + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + res = blkdev_ioctl(bdev, 0, cmd, arg); + set_fs(old_fs); + return res; +} + +EXPORT_SYMBOL(ioctl_by_bdev); + +/** + * lookup_bdev - lookup a struct block_device by name + * @pathname: special file representing the block device + * + * Get a reference to the blockdevice at @pathname in the current + * namespace if possible and return it. Return ERR_PTR(error) + * otherwise. + */ +struct block_device *lookup_bdev(const char *pathname) +{ + struct block_device *bdev; + struct inode *inode; + struct path path; + int error; + + if (!pathname || !*pathname) + return ERR_PTR(-EINVAL); + + error = kern_path(pathname, LOOKUP_FOLLOW, &path); + if (error) + return ERR_PTR(error); + + inode = path.dentry->d_inode; + error = -ENOTBLK; + if (!S_ISBLK(inode->i_mode)) + goto fail; + error = -EACCES; + if (path.mnt->mnt_flags & MNT_NODEV) + goto fail; + error = -ENOMEM; + bdev = bd_acquire(inode); + if (!bdev) + goto fail; +out: + path_put(&path); + return bdev; +fail: + bdev = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL(lookup_bdev); + +int __invalidate_device(struct block_device *bdev, bool kill_dirty) +{ + struct super_block *sb = get_super(bdev); + int res = 0; + + if (sb) { + /* + * no need to lock the super, get_super holds the + * read mutex so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). 
+ */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb, kill_dirty); + drop_super(sb); + } + invalidate_bdev(bdev); + return res; +} +EXPORT_SYMBOL(__invalidate_device); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig new file mode 100644 index 00000000..ecb9fd3b --- /dev/null +++ b/fs/btrfs/Kconfig @@ -0,0 +1,33 @@ +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS + help + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. + + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + +config BTRFS_FS_POSIX_ACL + bool "Btrfs POSIX Access Control Lists" + depends on BTRFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile new file mode 100644 index 00000000..9b72dcf1 --- /dev/null +++ b/fs/btrfs/Makefile @@ -0,0 +1,10 @@ + +obj-$(CONFIG_BTRFS_FS) := btrfs.o + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o inode.o file.o tree-defrag.o \ + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 00000000..f66fc995 --- /dev/null +++ b/fs/btrfs/acl.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + +static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __btrfs_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __btrfs_getxattr(inode, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) { + kfree(value); + return acl; + } + set_cached_acl(inode, type, acl); + } + kfree(value); + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ + acl = NULL; + set_cached_acl(inode, type, acl); + } else { + acl = ERR_PTR(-EIO); + } + + return acl; +} + +static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int ret = 0; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + + acl = btrfs_get_acl(dentry->d_inode, type); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return ret; +} + +/* + * Needs to be called with fs_mutex held + */ +static int btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + char *value = NULL; + mode_t mode; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + return ret; + ret = 0; + } + + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + inode->i_mode = mode; + } + ret = 0; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EINVAL : 0; + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(acl, value, size); + if (ret < 0) + goto out; + } + + ret = __btrfs_setxattr(trans, inode, name, value, size, 0); +out: + kfree(value); + + if (!ret) + set_cached_acl(inode, type, acl); + + return ret; +} + +static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int ret; + struct posix_acl *acl = NULL; + + if (!inode_owner_or_capable(dentry->d_inode)) + return -EPERM; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; + } + } + + ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); +out: + posix_acl_release(acl); + + return ret; +} + +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + int error = -EAGAIN; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + error = -ECHILD; + + } else { + struct posix_acl *acl; + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + } + + return error; +} + +/* + * btrfs_init_acl is already generally called under fs_mutex, so the locking + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. + */ +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + /* this happens with subvols */ + if (!dir) + return 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + if (!acl) + inode->i_mode &= ~current_umask(); + } + + if (IS_POSIXACL(dir) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = btrfs_set_acl(trans, inode, acl, + ACL_TYPE_DEFAULT); + if (ret) + goto failed; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto failed; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(trans, inode, clone, + ACL_TYPE_ACCESS); + } + } + posix_acl_release(clone); + } +failed: + posix_acl_release(acl); + + return ret; +} + +int btrfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int ret = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!IS_POSIXACL(inode)) + return 0; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); + + posix_acl_release(clone); + + return ret; +} + +const struct xattr_handler btrfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, +}; + +const struct xattr_handler 
btrfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, +}; + +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ + +int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + return 0; +} + +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 00000000..7ec14097 --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,718 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include "async-thread.h" + +#define WORK_QUEUED_BIT 0 +#define WORK_DONE_BIT 1 +#define WORK_ORDER_DONE_BIT 2 +#define WORK_HIGH_PRIO_BIT 3 + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. + */ +struct btrfs_worker_thread { + /* pool we belong to */ + struct btrfs_workers *workers; + + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + struct list_head prio_pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + /* reference counter for this struct */ + atomic_t refs; + + unsigned long sequence; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; + + /* are we currently idle */ + int idle; +}; + +/* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. 
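+ *
+ * Concretely (a sketch of the indirection implemented below): rather
+ * than calling btrfs_start_workers(queue, 1) directly from a worker
+ * thread, start_new_worker() queues a worker_start item on
+ * queue->atomic_worker_start, and that single helper thread performs
+ * the blocking kthread_run() on the caller's behalf.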
+ */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + +/* + * helper function to move a thread onto the idle list after it + * has finished some requests. + */ +static void check_idle_worker(struct btrfs_worker_thread *worker) +{ + if (!worker->idle && atomic_read(&worker->num_pending) < + worker->workers->idle_thresh / 2) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; + + /* the list may be empty if the worker is just starting */ + if (!list_empty(&worker->worker_list)) { + list_move(&worker->worker_list, + &worker->workers->idle_list); + } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +/* + * helper function to move a thread off the idle list after new + * pending work is added. + */ +static void check_busy_worker(struct btrfs_worker_thread *worker) +{ + if (worker->idle && atomic_read(&worker->num_pending) >= + worker->workers->idle_thresh) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + + if (!list_empty(&worker->worker_list)) { + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + } + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +static void check_pending_worker_creates(struct btrfs_worker_thread *worker) +{ + struct btrfs_workers *workers = worker->workers; + unsigned long flags; + + rmb(); + if (!workers->atomic_start_pending) + return; + + spin_lock_irqsave(&workers->lock, flags); + if (!workers->atomic_start_pending) + goto out; + + workers->atomic_start_pending = 0; + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) + goto out; + + workers->num_workers_starting += 1; + spin_unlock_irqrestore(&workers->lock, flags); + start_new_worker(workers); + return; + +out: + spin_unlock_irqrestore(&workers->lock, flags); +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + + spin_lock(&workers->order_lock); + + while (1) { + if (!list_empty(&workers->prio_order_list)) { + work = list_entry(workers->prio_order_list.next, + struct btrfs_work, order_list); + } else if (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + } else { + break; + } + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + + spin_unlock(&workers->order_lock); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ + spin_lock(&workers->order_lock); + list_del(&work->order_list); + work->ordered_free(work); 
+ } + + spin_unlock(&workers->order_lock); + return 0; +} + +static void put_worker(struct btrfs_worker_thread *worker) +{ + if (atomic_dec_and_test(&worker->refs)) + kfree(worker); +} + +static int try_worker_shutdown(struct btrfs_worker_thread *worker) +{ + int freeit = 0; + + spin_lock_irq(&worker->lock); + spin_lock(&worker->workers->lock); + if (worker->workers->num_workers > 1 && + worker->idle && + !worker->working && + !list_empty(&worker->worker_list) && + list_empty(&worker->prio_pending) && + list_empty(&worker->pending) && + atomic_read(&worker->num_pending) == 0) { + freeit = 1; + list_del_init(&worker->worker_list); + worker->workers->num_workers--; + } + spin_unlock(&worker->workers->lock); + spin_unlock_irq(&worker->lock); + + if (freeit) + put_worker(worker); + return freeit; +} + +static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, + struct list_head *prio_head, + struct list_head *head) +{ + struct btrfs_work *work = NULL; + struct list_head *cur = NULL; + + if(!list_empty(prio_head)) + cur = prio_head->next; + + smp_mb(); + if (!list_empty(&worker->prio_pending)) + goto refill; + + if (!list_empty(head)) + cur = head->next; + + if (cur) + goto out; + +refill: + spin_lock_irq(&worker->lock); + list_splice_tail_init(&worker->prio_pending, prio_head); + list_splice_tail_init(&worker->pending, head); + + if (!list_empty(prio_head)) + cur = prio_head->next; + else if (!list_empty(head)) + cur = head->next; + spin_unlock_irq(&worker->lock); + + if (!cur) + goto out_fail; + +out: + work = list_entry(cur, struct btrfs_work, list); + +out_fail: + return work; +} + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head head; + struct list_head prio_head; + struct btrfs_work *work; + + INIT_LIST_HEAD(&head); + INIT_LIST_HEAD(&prio_head); + + do { +again: + while (1) { + + + work = get_next_work(worker, &prio_head, &head); + if (!work) + break; + + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; + + work->func(work); + + atomic_dec(&worker->num_pending); + /* + * unless this is an ordered work queue, + * 'work' was probably freed by func above. + */ + run_ordered_completions(worker->workers, work); + + check_pending_worker_creates(worker); + + } + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + + if (freezing(current)) { + worker->working = 0; + spin_unlock_irq(&worker->lock); + refrigerator(); + } else { + spin_unlock_irq(&worker->lock); + if (!kthread_should_stop()) { + cpu_relax(); + /* + * we've dropped the lock, did someone else + * jump_in? 
+ */ + smp_mb(); + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) + continue; + + /* + * this short schedule allows more work to + * come in without the queue functions + * needing to go through wake_up_process() + * + * worker->working is still 1, so nobody + * is going to try and wake us up + */ + schedule_timeout(1); + smp_mb(); + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) + continue; + + if (kthread_should_stop()) + break; + + /* still no more work?, sleep for real */ + spin_lock_irq(&worker->lock); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) { + spin_unlock_irq(&worker->lock); + set_current_state(TASK_RUNNING); + goto again; + } + + /* + * this makes sure we get a wakeup when someone + * adds something new to the queue + */ + worker->working = 0; + spin_unlock_irq(&worker->lock); + + if (!kthread_should_stop()) { + schedule_timeout(HZ * 120); + if (!worker->working && + try_worker_shutdown(worker)) { + return 0; + } + } + } + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + int can_stop; + + spin_lock_irq(&workers->lock); + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + + atomic_inc(&worker->refs); + workers->num_workers -= 1; + if (!list_empty(&worker->worker_list)) { + list_del_init(&worker->worker_list); + put_worker(worker); + can_stop = 1; + } else + can_stop = 0; + spin_unlock_irq(&workers->lock); + if (can_stop) + kthread_stop(worker->task); + spin_lock_irq(&workers->lock); + put_worker(worker); + } + spin_unlock_irq(&workers->lock); + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) +{ + workers->num_workers = 0; + workers->num_workers_starting = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); + spin_lock_init(&workers->lock); + spin_lock_init(&workers->order_lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; + workers->atomic_start_pending = 0; + workers->atomic_worker_start = async_helper; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. 
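+ *
+ * A minimal setup sketch using the public wrappers (the "demo" name,
+ * the max of 4 threads and the some_work item are placeholders; the
+ * work item must have ->func set and ->flags zeroed before queueing):
+ *
+ *	struct btrfs_workers workers;
+ *
+ *	btrfs_init_workers(&workers, "demo", 4, NULL);
+ *	btrfs_start_workers(&workers, 1);
+ *	btrfs_queue_worker(&workers, &some_work);
+ *	...
+ *	btrfs_stop_workers(&workers);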
+ */ +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + + atomic_set(&worker->num_pending, 0); + atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + i); + if (IS_ERR(worker->task)) { + ret = PTR_ERR(worker->task); + kfree(worker); + goto fail; + } + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the + * idle list. This improves the chance that the next submission + * will reuse the same thread, and maybe catch it while it is still + * working + */ + if (!list_empty(&workers->idle_list)) { + next = workers->idle_list.next; + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + return worker; + } + if (enforce_min || list_empty(&workers->worker_list)) + return NULL; + + /* + * if we pick a busy task, move the task to the end of the list. + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) + list_move_tail(next, &workers->worker_list); + return worker; +} + +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. 
+ */ +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + struct list_head *fallback; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + + if (!worker) { + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { + goto fallback; + } else if (workers->atomic_worker_start) { + workers->atomic_start_pending = 1; + goto fallback; + } else { + workers->num_workers_starting++; + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + __btrfs_start_workers(workers, 1); + goto again; + } + } + goto found; + +fallback: + fallback = NULL; + /* + * we have failed to find any workers, just + * return the first one we can find. + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); +found: + /* + * this makes sure the worker doesn't exit before it is placed + * onto a busy/idle list + */ + atomic_inc(&worker->num_pending); + spin_unlock_irqrestore(&workers->lock, flags); + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. + */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + int wake = 0; + + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); + atomic_inc(&worker->num_pending); + + /* by definition we're busy, take ourselves off the idle + * list + */ + if (worker->idle) { + spin_lock(&worker->workers->lock); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock(&worker->workers->lock); + } + if (!worker->working) { + wake = 1; + worker->working = 1; + } + + if (wake) + wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); +out: + + return 0; +} + +void btrfs_set_work_high_prio(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + worker = find_worker(workers); + if (workers->ordered) { + /* + * you're not allowed to do ordered queues from an + * interrupt handler + */ + spin_lock(&workers->order_lock); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } + spin_unlock(&workers->order_lock); + } else { + INIT_LIST_HEAD(&work->order_list); + } + + spin_lock_irqsave(&worker->lock, flags); + + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + 
list_add_tail(&work->list, &worker->pending); + check_busy_worker(worker); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + if (wake) + wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); + +out: + return 0; +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h new file mode 100644 index 00000000..5077746c --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * func should be set to the function you want called + * your work struct is passed as the only arg + * + * ordered_func must be set for work sent to an ordered work queue, + * and it is called to complete a given work item in the same + * order they were sent to the queue. + */ + void (*func)(struct btrfs_work *work); + void (*ordered_func)(struct btrfs_work *work); + void (*ordered_free)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; + struct list_head order_list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + int num_workers_starting; + + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + + /* once a worker has this many requests or fewer, it is idle */ + int idle_thresh; + + /* force completions in the order they were queued */ + int ordered; + + /* more workers required, but in an interrupt handler */ + int atomic_start_pending; + + /* + * are we allowed to sleep while starting workers or are we required + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. + */ + struct btrfs_workers *atomic_worker_start; + + /* list with all the work threads. 
The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. + */ + struct list_head worker_list; + struct list_head idle_list; + + /* + * when operating in ordered mode, this maintains the list + * of work items waiting for completion + */ + struct list_head order_list; + struct list_head prio_order_list; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + + /* lock for the ordered lists */ + spinlock_t order_lock; + + /* extra name for this worker, used for current->name */ + char *name; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); +int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_set_work_high_prio(struct btrfs_work *work); +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h new file mode 100644 index 00000000..52d7eca8 --- /dev/null +++ b/fs/btrfs/btrfs_inode.h @@ -0,0 +1,187 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_I__ +#define __BTRFS_I__ + +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" +#include "delayed-inode.h" + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ + struct extent_io_tree io_failure_tree; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* for keeping track of orphaned inodes */ + struct list_head i_orphan; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. 
+ */ + struct list_head delalloc_inodes; + + /* + * list for tracking inodes that must be sent to disk before a + * rename or truncate commit + */ + struct list_head ordered_operations; + + /* node for the red-black tree that links inodes in subvolume root */ + struct rb_node rb_node; + + /* the space_info for where this inode's data allocations are done */ + struct btrfs_space_info *space_info; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ + u64 generation; + + /* sequence number for NFS changes */ + u64 sequence; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ + u64 delalloc_bytes; + + /* total number of bytes that may be used for this inode for + * delalloc + */ + u64 reserved_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* flags field from the on disk inode */ + u32 flags; + + /* + * if this is a directory then index_cnt is the counter for the index + * number for new files that are created + */ + u64 index_cnt; + + /* the fsync log has some corner cases that mean we have to check + * directories to see if any unlinks have been done before + * the directory was logged. See tree-log.c for all the + * details + */ + u64 last_unlink_trans; + + /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. + */ + atomic_t outstanding_extents; + atomic_t reserved_extents; + + /* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + * + * yes, its silly to have a single bitflag, but we might grow more + * of these. 
+ */ + unsigned ordered_data_close:1; + unsigned orphan_meta_reserved:1; + unsigned dummy_inode:1; + unsigned in_defrag:1; + + /* + * always compress this one file + */ + unsigned force_compress:4; + + struct btrfs_delayed_node *delayed_node; + + struct inode vfs_inode; +}; + +extern unsigned char btrfs_filetype_table[]; + +static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline u64 btrfs_ino(struct inode *inode) +{ + u64 ino = BTRFS_I(inode)->location.objectid; + + if (ino <= BTRFS_FIRST_FREE_OBJECTID) + ino = inode->i_ino; + return ino; +} + +static inline void btrfs_i_size_write(struct inode *inode, u64 size) +{ + i_size_write(inode, size); + BTRFS_I(inode)->disk_i_size = size; +} + +#endif diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h new file mode 100644 index 00000000..7c4503ef --- /dev/null +++ b/fs/btrfs/compat.h @@ -0,0 +1,7 @@ +#ifndef _COMPAT_H_ +#define _COMPAT_H_ + +#define btrfs_drop_nlink(inode) drop_nlink(inode) +#define btrfs_inc_nlink(inode) inc_nlink(inode) + +#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 00000000..bfe42b03 --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,1029 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* the compression algorithm for this bio */ + int compress_type; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + +static inline int compressed_bio_size(struct btrfs_root *root, + unsigned long disk_size) +{ + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + return sizeof(struct compressed_bio) + + ((disk_size + root->sectorsize - 1) / root->sectorsize) * + csum_size; +} + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); +} + +static int check_compressed_csum(struct inode *inode, + struct compressed_bio *cb, + u64 disk_start) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *page; + unsigned long i; + char *kaddr; + u32 csum; + u32 *cb_sum = &cb->sums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return 0; + + for (i = 0; i < cb->nr_pages; i++) { + page = cb->compressed_pages[i]; + csum = ~(u32)0; + + kaddr = kmap_atomic(page, KM_USER0); + csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_USER0); + + if (csum != *cb_sum) { + printk(KERN_INFO "btrfs csum failed ino %llu " + "extent %llu csum %u " + "wanted %u mirror %d\n", + (unsigned long long)btrfs_ino(inode), + (unsigned long long)disk_start, + csum, *cb_sum, cb->mirror_num); + ret = -EIO; + goto fail; + } + cb_sum++; + + } + ret = 0; +fail: + return ret; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). 
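As a worked example of the compressed_bio_size() arithmetic above (assuming the usual 4 KiB sectors and 4-byte crc32c checksums): a 128 KiB compressed extent needs (131072 + 4095) / 4096 = 32 checksum slots, so the allocation is sizeof(struct compressed_bio) + 32 * 4 bytes. Those 128 extra bytes become the variable-length array that cb->sums points into and that check_compressed_csum() walks, one u32 per compressed page.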
+ * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + inode = cb->inode; + ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + if (ret) + goto csum_failed; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + ret = btrfs_decompress_biovec(cb->compress_type, + cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); +csum_failed: + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + int bio_index = 0; + struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + + /* + * we have verified the checksum already, set page + * checked so the end_io handlers know about it + */ + while (bio_index < cb->orig_bio->bi_vcnt) { + SetPageChecked(bvec->bv_page); + bvec++; + bio_index++; + } + bio_endio(cb->orig_bio, 0); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. 
+ */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + cb->compressed_pages[0]->mapping = NULL; + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int pg_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + return -ENOMEM; + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->mirror_num = 0; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + if(!bio) { + kfree(cb); + return -ENOMEM; + } + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + page = compressed_pages[pg_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. 
Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + if (bytes_left < PAGE_CACHE_SIZE) { + printk("bytes left %lu compress len %lu nr %lu\n", + bytes_left, cb->compressed_len, cb->nr_pages); + } + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + cond_resched(); + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb) +{ + unsigned long end_index; + unsigned long pg_index; + u64 last_offset; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + unsigned long nr_pages = 0; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + u64 end; + int misses = 0; + + page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; + last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + while (last_offset < compressed_end) { + pg_index = last_offset >> PAGE_CACHE_SHIFT; + + if (pg_index > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, pg_index); + rcu_read_unlock(); + if (page) { + misses++; + if (misses > 4) + break; + goto next; + } + + page = __page_cache_alloc(mapping_gfp_mask(mapping) & + ~__GFP_FS); + if (!page) + break; + + if (add_to_page_cache_lru(page, mapping, pg_index, + GFP_NOFS)) { + page_cache_release(page); + goto next; + } + + end = last_offset + PAGE_CACHE_SIZE - 1; + /* + * at this point, we have a locked page in the page cache + * for these bytes in the file. But, we have to make + * sure they map to this compressed extent on disk. 
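The inc-before-submit rule that btrfs_submit_compressed_write() follows above is a general completion-counting idiom: keep the shared control block's count above zero for every bio that is outstanding (and for work not yet submitted), so that only the very last completion tears the block down. A simplified stand-alone sketch of the idiom, not a line-for-line copy of the btrfs code, with made-up names (struct ctl, complete_one):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ctl {				/* stand-in for struct compressed_bio */
	atomic_int pending;
	int errors;
};

static void complete_one(struct ctl *cb, int err)
{
	if (err)
		cb->errors = 1;
	/* only the final completion may free the control block */
	if (atomic_fetch_sub(&cb->pending, 1) == 1) {
		printf("all pieces done, errors=%d\n", cb->errors);
		free(cb);
	}
}

int main(void)
{
	struct ctl *cb = calloc(1, sizeof(*cb));
	int i;

	if (!cb)
		return 1;
	atomic_store(&cb->pending, 1);		/* submitter's own reference */
	for (i = 0; i < 4; i++) {
		atomic_fetch_add(&cb->pending, 1);	/* count the piece before it can complete */
		complete_one(cb, 0);			/* completion may run at any time after submit */
	}
	complete_one(cb, 0);	/* drop the submitter's reference; the last drop frees */
	return 0;
}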
+ */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (em->block_start >> 9) != cb->orig_bio->bi_sector) { + free_extent_map(em); + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + char *userpage; + size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + int zeros; + zeros = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, zeros); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + + ret = bio_add_page(cb->orig_bio, page, + PAGE_CACHE_SIZE, 0); + + if (ret == PAGE_CACHE_SIZE) { + nr_pages++; + page_cache_release(page); + } else { + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } +next: + last_offset += PAGE_CACHE_SIZE; + } + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long pg_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 em_len; + u64 em_start; + struct extent_map *em; + int ret = -ENOMEM; + u32 *sums; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + goto out; + + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->mirror_num = mirror_num; + sums = &cb->sums; + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + free_extent_map(em); + em = NULL; + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->compress_type = extent_compress_type(bio_flags); + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + if (!cb->compressed_pages) + goto fail1; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (pg_index = 0; pg_index < nr_pages; pg_index++) { + cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if 
(!cb->compressed_pages[pg_index]) + goto fail2; + } + cb->nr_pages = nr_pages; + + add_ra_bio_pages(inode, em_start + em_len, cb); + + /* include any pages we added in add_ra-bio_pages */ + uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + cb->len = uncompressed_len; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + if (!comp_bio) + goto fail2; + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (pg_index = 0; pg_index < nr_pages; pg_index++) { + page = cb->compressed_pages[pg_index]; + page->mapping = inode->i_mapping; + page->index = em_start >> PAGE_CACHE_SHIFT; + + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, + comp_bio, sums); + BUG_ON(ret); + } + sums += (comp_bio->bi_size + root->sectorsize - 1) / + root->sectorsize; + + ret = btrfs_map_bio(root, READ, comp_bio, + mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + + bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + BUG_ON(ret); + } + + ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; + +fail2: + for (pg_index = 0; pg_index < nr_pages; pg_index++) + free_page((unsigned long)cb->compressed_pages[pg_index]); + + kfree(cb->compressed_pages); +fail1: + kfree(cb); +out: + free_extent_map(em); + return ret; +} + +static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; +static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; +static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; +static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; +static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; + +struct btrfs_compress_op *btrfs_compress_op[] = { + &btrfs_zlib_compress, + &btrfs_lzo_compress, +}; + +int __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + INIT_LIST_HEAD(&comp_idle_workspace[i]); + spin_lock_init(&comp_workspace_lock[i]); + atomic_set(&comp_alloc_workspace[i], 0); + init_waitqueue_head(&comp_workspace_wait[i]); + } + return 0; +} + +/* + * this finds an available workspace or allocates a new one + * ERR_PTR is returned if things go bad. 
+ */ +static struct list_head *find_workspace(int type) +{ + struct list_head *workspace; + int cpus = num_online_cpus(); + int idx = type - 1; + + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; +again: + spin_lock(workspace_lock); + if (!list_empty(idle_workspace)) { + workspace = idle_workspace->next; + list_del(workspace); + (*num_workspace)--; + spin_unlock(workspace_lock); + return workspace; + + } + if (atomic_read(alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(workspace_lock); + prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + schedule(); + finish_wait(workspace_wait, &wait); + goto again; + } + atomic_inc(alloc_workspace); + spin_unlock(workspace_lock); + + workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (IS_ERR(workspace)) { + atomic_dec(alloc_workspace); + wake_up(workspace_wait); + } + return workspace; +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static void free_workspace(int type, struct list_head *workspace) +{ + int idx = type - 1; + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; + + spin_lock(workspace_lock); + if (*num_workspace < num_online_cpus()) { + list_add_tail(workspace, idle_workspace); + (*num_workspace)++; + spin_unlock(workspace_lock); + goto wake; + } + spin_unlock(workspace_lock); + + btrfs_compress_op[idx]->free_workspace(workspace); + atomic_dec(alloc_workspace); +wake: + if (waitqueue_active(workspace_wait)) + wake_up(workspace_wait); +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct list_head *workspace; + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + while (!list_empty(&comp_idle_workspace[i])) { + workspace = comp_idle_workspace[i].next; + list_del(workspace); + btrfs_compress_op[i]->free_workspace(workspace); + atomic_dec(&comp_alloc_workspace[i]); + } + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. 
+ * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -1; + + ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + start, len, pages, + nr_dest_pages, out_pages, + total_in, total_out, + max_out); + free_workspace(type, workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, + disk_start, + bvec, vcnt, srclen); + free_workspace(type, workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + dest_page, start_byte, + srclen, destlen); + + free_workspace(type, workspace); + return ret; +} + +void btrfs_exit_compress(void) +{ + free_workspaces(); +} + +/* + * Copy uncompressed data from working buffer to pages. + * + * buf_start is the byte offset we're of the start of our workspace buffer. + * + * total_out is the last byte of the buffer + */ +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *pg_index, + unsigned long *pg_offset) +{ + unsigned long buf_offset; + unsigned long current_buf_start; + unsigned long start_byte; + unsigned long working_bytes = total_out - buf_start; + unsigned long bytes; + char *kaddr; + struct page *page_out = bvec[*pg_index].bv_page; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. 
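A concrete walk-through of the offset arithmetic in btrfs_decompress_buf2page() below, assuming 4 KiB pages: say the decompressor has just produced uncompressed bytes [buf_start = 8192, total_out = 12288) in its one-page working buffer, and the destination page sits 8192 bytes into the extent, so start_byte = 8192. The early total_out <= start_byte check does not fire, buf_start is not below start_byte so buf_offset stays 0, working_bytes = 4096, and the loop copies one full page of data; *pg_offset then reaches PAGE_CACHE_SIZE and the code advances to the next page in the biovec. If the working buffer had only reached total_out = 8192, the function would instead return 1 straight away, telling the caller that more decompressed output is needed before this page can be filled.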
+ */ + start_byte = page_offset(page_out) - disk_start; + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + return 1; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - *pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + *pg_offset += bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (*pg_offset == PAGE_CACHE_SIZE) { + (*pg_index)++; + if (*pg_index >= vcnt) + return 0; + + page_out = bvec[*pg_index].bv_page; + *pg_offset = 0; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; + + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } + } + } + + return 1; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 00000000..a12059f4 --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_init_compress(void); +void btrfs_exit_compress(void); + +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen); +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *pg_index, + unsigned long *pg_offset); + +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); + +struct btrfs_compress_op { + struct list_head *(*alloc_workspace)(void); + + void (*free_workspace)(struct list_head *workspace); + + int (*compress_pages)(struct list_head *workspace, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); + + int (*decompress_biovec)(struct list_head *workspace, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); + + int (*decompress)(struct list_head *workspace, + unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +}; + +extern struct btrfs_compress_op btrfs_zlib_compress; +extern struct btrfs_compress_op btrfs_lzo_compress; + +#endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c new file mode 100644 index 00000000..2e667868 --- /dev/null +++ b/fs/btrfs/ctree.c @@ -0,0 +1,4402 @@ +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); + +struct btrfs_path *btrfs_alloc_path(void) +{ + struct btrfs_path *path; + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + return path; +} + +/* + * set all locked nodes in the path to blocking locks. This should + * be done before scheduling + */ +noinline void btrfs_set_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (p->nodes[i] && p->locks[i]) + btrfs_set_lock_blocking(p->nodes[i]); + } +} + +/* + * reset all the locked nodes in the patch to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held + */ +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held) +{ + int i; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* lockdep really cares that we take all of these spinlocks + * in the right order. If any of the locks in the path are not + * currently blocking, it is going to complain. So, make really + * really sure by forcing the path to blocking before we clear + * the path blocking. + */ + if (held) + btrfs_set_lock_blocking(held); + btrfs_set_path_blocking(p); +#endif + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { + if (p->nodes[i] && p->locks[i]) + btrfs_clear_lock_blocking(p->nodes[i]); + } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (held) + btrfs_clear_lock_blocking(held); +#endif +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + if (!p) + return; + btrfs_release_path(p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock(p->nodes[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. 
+ */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = rcu_dereference(root->node); + extent_buffer_get(eb); + rcu_read_unlock(); + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + if (eb == root->node) + break; + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (root->track_dirty && list_empty(&root->dirty_list)) { + list_add(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct extent_buffer *cow; + int ret = 0; + int level; + struct btrfs_disk_key disk_key; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + cow = btrfs_alloc_free_block(trans, root, buf->len, 0, + new_root_objectid, &disk_key, level, + buf->start, 0); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, new_root_objectid); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + + if (ret) + return ret; + + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * check if the tree block can be shared by multiple trees + */ +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf) +{ + /* + * Tree blocks not in refernece counted trees and tree roots + * are never shared. If a block was allocated after the last + * snapshot and the block was not allocated by tree relocation, + * we know the block is not shared. 
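To make the sharing test below concrete: in a reference-counted (ref_cows) subvolume whose root item records last_snapshot = 60, a tree block with header generation 50 already existed when that snapshot was taken, so generation <= last_snapshot holds and the block may still be referenced by the snapshot; a block written in generation 70 without the RELOC header flag postdates every snapshot, so btrfs_block_can_be_shared() returns 0 and update_ref_for_cow() can assume a single reference instead of looking the count up in the extent tree.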
+ */ + if (root->ref_cows && + buf != root->node && buf != root->commit_root && + (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item) || + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 1; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (root->ref_cows && + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + return 1; +#endif + return 0; +} + +static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *cow, + int *last_ref) +{ + u64 refs; + u64 owner; + u64 flags; + u64 new_flags = 0; + int ret; + + /* + * Backrefs update rules: + * + * Always use full backrefs for extent pointers in tree block + * allocated by tree relocation. + * + * If a shared tree block is no longer referenced by its owner + * tree (btrfs_header_owner(buf) == root->root_key.objectid), + * use full backrefs for extent pointers in tree block. + * + * If a tree block is been relocating + * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), + * use full backrefs for extent pointers in tree block. + * The reason for this is some operations (such as drop tree) + * are only allowed for blocks use full backrefs. + */ + + if (btrfs_block_can_be_shared(root, buf)) { + ret = btrfs_lookup_extent_info(trans, root, buf->start, + buf->len, &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + } else { + refs = 1; + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; + else + flags = 0; + } + + owner = btrfs_header_owner(buf); + BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + + if (refs > 1) { + if ((owner == root->root_key.objectid || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + ret = btrfs_inc_ref(trans, root, buf, 1); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_dec_ref(trans, root, buf, 0); + BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 1); + BUG_ON(ret); + } + new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else { + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + } + if (new_flags != 0) { + ret = btrfs_set_disk_extent_flags(trans, root, + buf->start, + buf->len, + new_flags, 0); + BUG_ON(ret); + } + } else { + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, buf, 1); + BUG_ON(ret); + } + clean_tree_block(trans, root, buf); + *last_ref = 1; + } + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. 
+ */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *cow; + int level; + int last_ref = 0; + int unlock_orig = 0; + u64 parent_start; + + if (*cow_ret == buf) + unlock_orig = 1; + + btrfs_assert_tree_locked(buf); + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + else + parent_start = 0; + } else + parent_start = 0; + + cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, + root->root_key.objectid, &disk_key, + level, search_start, empty_size); + if (IS_ERR(cow)) + return PTR_ERR(cow); + + /* cow is set to blocking by btrfs_init_new_buffer */ + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, root->root_key.objectid); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + update_ref_for_cow(trans, root, buf, cow, &last_ref); + + if (root->ref_cows) + btrfs_reloc_cow_block(trans, root, buf, cow); + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + parent_start = buf->start; + else + parent_start = 0; + + extent_buffer_get(cow); + rcu_assign_pointer(root->node, cow); + + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(trans->transid != btrfs_header_generation(parent)); + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +static inline int should_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) +{ + if (btrfs_header_generation(buf) == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && + !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 0; + return 1; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. 
+ * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret) +{ + u64 search_start; + int ret; + + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long) + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + + if (!should_cow_block(trans, root, buf)) { + *cow_ret = buf; + return 0; + } + + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + + if (parent) + btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking(buf); + + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0); + + trace_btrfs_cow_block(root, buf, *cow_ret); + + return ret; +} + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + return btrfs_comp_cpu_keys(&k1, k2); +} + +/* + * same as comp_keys only with two btrfs_key's + */ +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress) +{ + struct extent_buffer *cur; + u64 blocknr; + u64 gen; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + int parent_level; + int uptodate; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + parent_level = btrfs_header_level(parent); + if (cache_only && parent_level != 1) + return 0; + + if (trans->transaction != root->fs_info->running_transaction) + WARN_ON(1); + if (trans->transid != root->fs_info->generation) + WARN_ON(1); + + parent_nritems = btrfs_header_nritems(parent); + blocksize = btrfs_level_size(root, parent_level - 1); + end_slot = parent_nritems; + + if (parent_nritems == 1) + return 0; + + btrfs_set_lock_blocking(parent); + + for (i = start_slot; i < end_slot; i++) { + int close = 1; + + if (!parent->map_token) { + map_extent_buffer(parent, + btrfs_node_key_ptr_offset(i), + sizeof(struct btrfs_key_ptr), + &parent->map_token, &parent->kaddr, + 
&parent->map_start, &parent->map_len, + KM_USER1); + } + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + gen = btrfs_node_ptr_generation(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot - 2) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + + cur = btrfs_find_tree_block(root, blocknr, blocksize); + if (cur) + uptodate = btrfs_buffer_uptodate(cur, gen); + else + uptodate = 0; + if (!cur || !uptodate) { + if (cache_only) { + free_extent_buffer(cur); + continue; + } + if (!cur) { + cur = read_tree_block(root, blocknr, + blocksize, gen); + if (!cur) + return -EIO; + } else if (!uptodate) { + btrfs_read_buffer(cur, gen); + } + } + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + btrfs_set_lock_blocking(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize)); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + return err; +} + +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + + +/* + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * + * the slot in the array is returned via slot, and it points to + * the place where you would insert key if it is not found in + * the array. 
+ * + * slot may point to max if the key is bigger than all of the keys + */ +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) +{ + int low = 0; + int high = max; + int mid; + int ret; + struct btrfs_disk_key *tmp = NULL; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *map_token = NULL; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; + int err; + + while (low < high) { + mid = (low + high) / 2; + offset = p + mid * item_size; + + if (!map_token || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + if (map_token) { + unmap_extent_buffer(eb, map_token, KM_USER0); + map_token = NULL; + } + + err = map_private_extent_buffer(eb, offset, + sizeof(struct btrfs_disk_key), + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + + if (!err) { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } else { + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + tmp = &unaligned; + } + + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 0; + } + } + *slot = low; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 1; +} + +/* + * simple bin_search frontend that does the right thing for + * leaves vs nodes + */ +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + if (level == 0) { + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), + sizeof(struct btrfs_item), + key, btrfs_header_nritems(eb), + slot); + } else { + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), + sizeof(struct btrfs_key_ptr), + key, btrfs_header_nritems(eb), + slot); + } + return -1; +} + +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + return bin_search(eb, key, level, slot); +} + +static void root_add_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) + size); + spin_unlock(&root->accounting_lock); +} + +static void root_sub_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) - size); + spin_unlock(&root->accounting_lock); +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) +{ + int level = btrfs_header_level(parent); + if (slot < 0) + return NULL; + if (slot >= btrfs_header_nritems(parent)) + return NULL; + + BUG_ON(level == 0); + + return read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. 
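The binary-search helpers earlier in this file (generic_bin_search()/bin_search()) return 0 with *slot at an exact match, or 1 with *slot at the position where the key would have to be inserted, possibly one past the last item. A stand-alone sketch of that contract over a plain int array, leaving out the extent-buffer page mapping the real code has to do:

#include <stdio.h>

/* Return 0 and set *slot to the match, or return 1 and set *slot to the
 * index where key would be inserted to keep the array sorted. */
static int bin_search_slot(const int *items, int nritems, int key, int *slot)
{
	int low = 0, high = nritems;

	while (low < high) {
		int mid = (low + high) / 2;

		if (items[mid] < key)
			low = mid + 1;
		else if (items[mid] > key)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;		/* equals nritems if key is larger than every item */
	return 1;
}

int main(void)
{
	int keys[] = { 10, 20, 30, 40 };
	int slot;

	printf("%d %d\n", bin_search_slot(keys, 4, 30, &slot), slot);	/* 0 2 */
	printf("%d %d\n", bin_search_slot(keys, 4, 25, &slot), slot);	/* 1 2 */
	printf("%d %d\n", bin_search_slot(keys, 4, 99, &slot), slot);	/* 1 4 */
	return 0;
}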
+ */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + if (level == 0) + return 0; + + mid = path->nodes[level]; + + WARN_ON(!path->locks[level]); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = read_node_slot(root, mid, 0); + BUG_ON(!child); + btrfs_tree_lock(child); + btrfs_set_lock_blocking(child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + goto enospc; + } + + rcu_assign_pointer(root->node, child); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + + path->locks[level] = 0; + path->nodes[level] = NULL; + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + /* once for the root ptr */ + free_extent_buffer(mid); + return 0; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + return 0; + + btrfs_header_nritems(mid); + + left = read_node_slot(root, parent, pslot - 1); + if (left) { + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left); + if (wret) { + ret = wret; + goto enospc; + } + } + right = read_node_slot(root, parent, pslot + 1); + if (right) { + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right); + if (wret) { + ret = wret; + goto enospc; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + btrfs_header_nritems(mid); + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, root, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + clean_tree_block(trans, root, right); + btrfs_tree_unlock(right); + wret = del_ptr(trans, root, path, level + 1, pslot + + 1); + if (wret) + ret = wret; + root_sub_used(root, right->len); + btrfs_free_tree_block(trans, root, right, 0, 1); + free_extent_buffer(right); + right = NULL; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. 
+ * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + BUG_ON(!left); + wret = balance_node_right(trans, root, mid, left); + if (wret < 0) { + ret = wret; + goto enospc; + } + if (wret == 1) { + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + wret = del_ptr(trans, root, path, level + 1, pslot); + if (wret) + ret = wret; + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + free_extent_buffer(mid); + mid = NULL; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +enospc: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. 
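As a rough guide to when these two balancing paths fire: balance_level above only does work once a node has fallen to a quarter of its pointer capacity or less, while the insertion path (push_nodes_for_insert below, and the split check later in setup_nodes_for_search) only acts when a node is within a few slots of full. A toy sketch of those thresholds; DEMO_PTRS_PER_BLOCK is a made-up stand-in for BTRFS_NODEPTRS_PER_BLOCK(root):

#include <stdbool.h>
#include <stdio.h>

#define DEMO_PTRS_PER_BLOCK 121         /* illustrative capacity only */

/* deletion path: rebalance only once a node is down to 1/4 full or less */
static bool demo_needs_merge(unsigned int nritems)
{
        return nritems <= DEMO_PTRS_PER_BLOCK / 4;
}

/* insertion path: split or push only when the node is nearly full */
static bool demo_needs_split(unsigned int nritems)
{
        return nritems >= DEMO_PTRS_PER_BLOCK - 3;
}

int main(void)
{
        printf("30 items:  merge=%d split=%d\n",
               demo_needs_merge(30), demo_needs_split(30));     /* merge=1 split=0 */
        printf("119 items: merge=%d split=%d\n",
               demo_needs_merge(119), demo_needs_split(119));   /* merge=0 split=1 */
        return 0;
}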
+ */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + if (!parent) + return 1; + + left = read_node_slot(root, parent, pslot - 1); + + /* first, try to make some room in the middle buffer */ + if (left) { + u32 left_nr; + + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, root, + left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + right = read_node_slot(root, parent, pslot + 1); + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + u32 right_nr; + + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, root, + right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. 
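The readahead walk sketched by this comment only touches pointers whose blocks sit within 64KiB of the block about to be read, and it gives up after queueing roughly 64KiB or scanning 32 neighbouring slots. A standalone sketch of that filter; the constants mirror the loop in reada_for_search below, everything else is illustrative:

#include <stdint.h>
#include <stdio.h>

#define DEMO_READA_WINDOW   65536u      /* only prefetch blocks within 64KiB of the target */
#define DEMO_READA_MAX_READ 65536u      /* stop once this much has been queued */
#define DEMO_READA_MAX_SCAN 32u         /* or once this many slots were scanned */

static int demo_should_reada(uint64_t target, uint64_t candidate,
                             uint64_t nread, unsigned int nscan)
{
        uint64_t dist = candidate > target ? candidate - target
                                           : target - candidate;

        if (nread > DEMO_READA_MAX_READ || nscan > DEMO_READA_MAX_SCAN)
                return 0;               /* already queued/scanned enough */
        return dist <= DEMO_READA_WINDOW;
}

int main(void)
{
        printf("%d\n", demo_should_reada(1 << 20, (1 << 20) + 4096, 0, 0));    /* 1: close by */
        printf("%d\n", demo_should_reada(1 << 20, (1 << 20) * 2, 0, 0));       /* 0: too far */
        return 0;
}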
+ */ +static void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 target; + u64 nread = 0; + u64 gen; + int direction = path->reada; + struct extent_buffer *eb; + u32 nr; + u32 blocksize; + u32 nscan = 0; + bool map = true; + + if (level != 1) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + search = btrfs_node_blockptr(node, slot); + blocksize = btrfs_level_size(root, level - 1); + eb = btrfs_find_tree_block(root, search, blocksize); + if (eb) { + free_extent_buffer(eb); + return; + } + + target = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + if (node->map_token || path->skip_locking) + map = false; + + while (1) { + if (map && !node->map_token) { + unsigned long offset = btrfs_node_key_ptr_offset(nr); + map_private_extent_buffer(node, offset, + sizeof(struct btrfs_key_ptr), + &node->map_token, + &node->kaddr, + &node->map_start, + &node->map_len, KM_USER1); + } + if (direction < 0) { + if (nr == 0) + break; + nr--; + } else if (direction > 0) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada < 0 && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if ((search <= target && target - search <= 65536) || + (search > target && search - target <= 65536)) { + gen = btrfs_node_ptr_generation(node, nr); + if (map && node->map_token) { + unmap_extent_buffer(node, node->map_token, + KM_USER1); + node->map_token = NULL; + } + readahead_tree_block(root, search, blocksize, gen); + nread += blocksize; + } + nscan++; + if ((nread > 65536 || nscan > 32)) + break; + } + if (map && node->map_token) { + unmap_extent_buffer(node, node->map_token, KM_USER1); + node->map_token = NULL; + } +} + +/* + * returns -EAGAIN if it had to drop the path, or zero if everything was in + * cache + */ +static noinline int reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + int slot; + int nritems; + struct extent_buffer *parent; + struct extent_buffer *eb; + u64 gen; + u64 block1 = 0; + u64 block2 = 0; + int ret = 0; + int blocksize; + + parent = path->nodes[level + 1]; + if (!parent) + return 0; + + nritems = btrfs_header_nritems(parent); + slot = path->slots[level + 1]; + blocksize = btrfs_level_size(root, level); + + if (slot > 0) { + block1 = btrfs_node_blockptr(parent, slot - 1); + gen = btrfs_node_ptr_generation(parent, slot - 1); + eb = btrfs_find_tree_block(root, block1, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block1 = 0; + free_extent_buffer(eb); + } + if (slot + 1 < nritems) { + block2 = btrfs_node_blockptr(parent, slot + 1); + gen = btrfs_node_ptr_generation(parent, slot + 1); + eb = btrfs_find_tree_block(root, block2, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block2 = 0; + free_extent_buffer(eb); + } + if (block1 || block2) { + ret = -EAGAIN; + + /* release the whole path */ + btrfs_release_path(path); + + /* read the blocks */ + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); + + if (block1) { + eb = read_tree_block(root, block1, blocksize, 0); + free_extent_buffer(eb); + } + if (block2) { + eb = read_tree_block(root, block2, blocksize, 0); + free_extent_buffer(eb); + } + } + return ret; +} + + +/* + * when we walk down the tree, it 
is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock) +{ + int i; + int skip_level = level; + int no_skips = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (!no_skips && path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (!no_skips && path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + if (skip_level < i && i >= lowest_unlock) + no_skips = 1; + + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock(t); + path->locks[i] = 0; + } + } +} + +/* + * This releases any locks held in the path starting at level and + * going all the way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few + * corner cases, such as COW of the block at slot zero in the node. This + * ignores those rules, and it should only be called when there are no + * more updates to be done higher up in the tree. + */ +noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + continue; + if (!path->locks[i]) + continue; + btrfs_tree_unlock(path->nodes[i]); + path->locks[i] = 0; + } +} + +/* + * helper function for btrfs_search_slot. The goal is to find a block + * in cache without setting the path to blocking. If we find the block + * we return zero and the path is unchanged. + * + * If we can't find the block, we set the path blocking and do some + * reada. -EAGAIN is returned and the search must be repeated. + */ +static int +read_block_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + struct btrfs_key *key) +{ + u64 blocknr; + u64 gen; + u32 blocksize; + struct extent_buffer *b = *eb_ret; + struct extent_buffer *tmp; + int ret; + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp) { + if (btrfs_buffer_uptodate(tmp, 0)) { + if (btrfs_buffer_uptodate(tmp, gen)) { + /* + * we found an up to date block without + * sleeping, return + * right away + */ + *eb_ret = tmp; + return 0; + } + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. 
+ * We must do this without dropping locks so + * we can trust our generation number + */ + free_extent_buffer(tmp); + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + *eb_ret = tmp; + return 0; + } + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; + } + } + + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. Don't release the lock on the current + * level because we need to walk this node to figure + * out which blocks to read. + */ + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + + free_extent_buffer(tmp); + if (p->reada) + reada_for_search(root, p, level, slot, key->objectid); + + btrfs_release_path(p); + + ret = -EAGAIN; + tmp = read_tree_block(root, blocknr, blocksize, 0); + if (tmp) { + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!btrfs_buffer_uptodate(tmp, 0)) + ret = -EIO; + free_extent_buffer(tmp); + } + return ret; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. + * + * If no extra work was required, zero is returned. If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len) +{ + int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(p); + goto again; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return 0; + +again: + ret = -EAGAIN; +done: + return ret; +} + +/* + * look for key in the tree. path is filled in with nodes along the way + * if key is found, we return zero and you can find the item in the leaf + * level of the path (level 0) + * + * If the key isn't found, the path points to the slot where it should + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. + * + * if ins_len > 0, nodes and leaves will be split as we walk down the + * tree. 
if ins_len < 0, nodes will be merged as we walk down the tree (if + * possible) + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + + if (ins_len < 0) + lowest_unlock = 2; + +again: + if (p->search_commit_root) { + b = root->commit_root; + extent_buffer_get(b); + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + } + + while (b) { + level = btrfs_header_level(b); + + /* + * setup the path here so we can release it under lock + * contention with the cow code + */ + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + if (cow) { + /* + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here + */ + if (!should_cow_block(trans, root, b)) + goto cow_done; + + btrfs_set_path_blocking(p); + + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); + if (err) { + ret = err; + goto done; + } + } +cow_done: + BUG_ON(!cow && ins_len); + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + btrfs_clear_path_blocking(p, NULL); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + * + * If cow is true, then we might be changing slot zero, + * which may require changing the parent. So, we can't + * drop the lock until after we know which slot we're + * operating on. 
+ */ + if (!cow) + btrfs_unlock_up_safe(p, level + 1); + + ret = bin_search(b, key, level, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + err = setup_nodes_for_search(trans, root, p, b, level, + ins_len); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + + unlock_up(p, level, lowest_unlock); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(trans, root, p, + &b, level, slot, key); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + if (!p->skip_locking) { + btrfs_clear_path_blocking(p, NULL); + err = btrfs_try_spin_lock(b); + + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b); + } + } + } else { + p->slots[level] = slot; + if (ins_len > 0 && + btrfs_leaf_free_space(root, b) < ins_len) { + btrfs_set_path_blocking(p); + err = split_leaf(trans, root, key, + p, ins_len, ret == 0); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(err > 0); + if (err) { + ret = err; + goto done; + } + } + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock); + goto done; + } + } + ret = 1; +done: + /* + * we don't really know what they plan on doing with the path + * from here on, so for now just mark it as blocking + */ + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(p); + return ret; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + * If this fails to write a tree block, it returns -1, but continues + * fixing up the blocks in ram so the tree is consistent. + */ +static int fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + int ret = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + if (!path->nodes[i]) + break; + t = path->nodes[i]; + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(path->nodes[i]); + if (tslot != 0) + break; + } + return ret; +} + +/* + * update item key. + * + * This function isn't completely safe. It's the caller's responsibility + * that the new key won't break the order + */ +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *eb; + int slot; + + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot > 0) { + btrfs_item_key(eb, &disk_key, slot - 1); + if (comp_keys(&disk_key, new_key) >= 0) + return -1; + } + if (slot < btrfs_header_nritems(eb) - 1) { + btrfs_item_key(eb, &disk_key, slot + 1); + if (comp_keys(&disk_key, new_key) <= 0) + return -1; + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(eb, &disk_key, slot); + btrfs_mark_buffer_dirty(eb); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + return 0; +} + +/* + * try to push data from one node into the next node left in the + * tree. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. 
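btrfs_set_item_key_safe above relies on a simple invariant: a key may be rewritten in place only if the new value still sorts strictly between its neighbours, so the leaf never needs re-sorting. The same guard over a plain array, as a sketch with a simplified key type (the demo_* names are not from this patch):

#include <stdint.h>
#include <stdio.h>

struct demo_key {
        uint64_t objectid;
        uint8_t type;
        uint64_t offset;
};

static int demo_comp_keys(const struct demo_key *a, const struct demo_key *b)
{
        if (a->objectid != b->objectid)
                return a->objectid < b->objectid ? -1 : 1;
        if (a->type != b->type)
                return a->type < b->type ? -1 : 1;
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return 0;
}

/*
 * Overwrite keys[slot] with *new_key only if the array stays sorted,
 * mirroring the neighbour checks above.  Returns 0 on success, -1 otherwise.
 */
static int demo_set_key_safe(struct demo_key *keys, unsigned int nritems,
                             unsigned int slot, const struct demo_key *new_key)
{
        if (slot > 0 && demo_comp_keys(&keys[slot - 1], new_key) >= 0)
                return -1;
        if (slot + 1 < nritems && demo_comp_keys(&keys[slot + 1], new_key) <= 0)
                return -1;
        keys[slot] = *new_key;
        return 0;
}

int main(void)
{
        struct demo_key keys[3] = { {1, 0, 0}, {5, 0, 0}, {9, 0, 0} };
        struct demo_key ok = {6, 0, 0}, bad = {10, 0, 0};

        printf("%d\n", demo_set_key_safe(keys, 3, 1, &ok));   /* 0: still sorted */
        printf("%d\n", demo_set_key_safe(keys, 3, 1, &bad));  /* -1: would pass keys[2] */
        return 0;
}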
+ */ +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty) +{ + int push_items = 0; + int src_nritems; + int dst_nritems; + int ret = 0; + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + if (!empty && src_nritems <= 8) + return 1; + + if (push_items <= 0) + return 1; + + if (empty) { + push_items = min(src_nritems, push_items); + if (push_items < src_nritems) { + /* leave at least 8 pointers in the node if + * we aren't going to empty it + */ + if (src_nritems - push_items < 8) { + if (push_items <= 8) + return 1; + push_items -= 8; + } + } + } else + push_items = min(src_nritems - 8, push_items); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst_nritems), + btrfs_node_key_ptr_offset(0), + push_items * sizeof(struct btrfs_key_ptr)); + + if (push_items < src_nritems) { + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + return ret; +} + +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. + * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst, + struct extent_buffer *src) +{ + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + if (push_items <= 0) + return 1; + + if (src_nritems < 4) + return 1; + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push >= src_nritems) + return 1; + + if (max_push < push_items) + push_items = max_push; + + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), + btrfs_node_key_ptr_offset(0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); + + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + return ret; +} + +/* + * helper function to insert a new root level in the tree. + * A new node is allocated, and a single item is inserted to + * point to the existing root + * + * returns zero on success or < 0 on failure. 
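push_node_left and balance_node_right above move whole key pointers between two siblings: copy a run into the destination, memmove the source to close (or open) the gap, then fix up both item counts. The same mechanics on ordinary arrays, as a simplified sketch (the capacity and element type are invented, and the real code additionally refuses to drain a source node below a minimum unless it is deliberately emptying it):

#include <stdio.h>
#include <string.h>

#define DEMO_CAP 8

struct demo_ptr {
        unsigned long long blockptr;
        unsigned long long gen;
};

/*
 * Move up to the first 'want' entries of src onto the end of dst and close
 * the hole in src.  Returns how many entries actually moved.
 */
static unsigned int demo_push_left(struct demo_ptr *dst, unsigned int *dst_nr,
                                   struct demo_ptr *src, unsigned int *src_nr,
                                   unsigned int want)
{
        unsigned int room = DEMO_CAP - *dst_nr;
        unsigned int push = want < room ? want : room;

        if (push > *src_nr)
                push = *src_nr;
        if (!push)
                return 0;

        memcpy(dst + *dst_nr, src, push * sizeof(*src));
        memmove(src, src + push, (*src_nr - push) * sizeof(*src));
        *dst_nr += push;
        *src_nr -= push;
        return push;
}

int main(void)
{
        struct demo_ptr left[DEMO_CAP] = { {1, 1}, {2, 1} };
        struct demo_ptr mid[DEMO_CAP]  = { {5, 1}, {6, 1}, {7, 1} };
        unsigned int left_nr = 2, mid_nr = 3;
        unsigned int moved = demo_push_left(left, &left_nr, mid, &mid_nr, 2);

        printf("moved=%u left_nr=%u mid_nr=%u left[2]=%llu\n",
               moved, left_nr, mid_nr, left[2].blockptr);
        /* moved=2 left_nr=4 mid_nr=1 left[2]=5 */
        return 0;
}

The keys stay sorted without any comparison because everything in the left sibling is already smaller than everything in the source node.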
+ */ +static noinline int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + u64 lower_gen; + struct extent_buffer *lower; + struct extent_buffer *c; + struct extent_buffer *old; + struct btrfs_disk_key lower_key; + + BUG_ON(path->nodes[level]); + BUG_ON(path->nodes[level-1] != root->node); + + lower = path->nodes[level-1]; + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); + else + btrfs_node_key(lower, &lower_key, 0); + + c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, &lower_key, + level, root->node->start, 0); + if (IS_ERR(c)) + return PTR_ERR(c); + + root_add_used(root, root->nodesize); + + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_nritems(c, 1); + btrfs_set_header_level(c, level); + btrfs_set_header_bytenr(c, c->start); + btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(c, root->root_key.objectid); + + write_extent_buffer(c, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(c), + BTRFS_FSID_SIZE); + + write_extent_buffer(c, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(c), + BTRFS_UUID_SIZE); + + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, lower->start); + lower_gen = btrfs_header_generation(lower); + WARN_ON(lower_gen != trans->transid); + + btrfs_set_node_ptr_generation(c, 0, lower_gen); + + btrfs_mark_buffer_dirty(c); + + old = root->node; + rcu_assign_pointer(root->node, c); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + + add_root_to_dirty_list(root); + extent_buffer_get(c); + path->nodes[level] = c; + path->locks[level] = 1; + path->slots[level] = 0; + return 0; +} + +/* + * worker function to insert a single pointer in a node. + * the node should have enough room for the pointer already + * + * slot and level indicate where you want the key to go, and + * blocknr is the block the key points to. + * + * returns zero on success and < 0 on any error + */ +static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_disk_key + *key, u64 bytenr, int slot, int level) +{ + struct extent_buffer *lower; + int nritems; + + BUG_ON(!path->nodes[level]); + btrfs_assert_tree_locked(path->nodes[level]); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); + BUG_ON(slot > nritems); + if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) + BUG(); + if (slot != nritems) { + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_key_ptr)); + } + btrfs_set_node_key(lower, key, slot); + btrfs_set_node_blockptr(lower, slot, bytenr); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(lower, slot, trans->transid); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(lower); + return 0; +} + +/* + * split the node at the specified level in path in two. + * The path is corrected to point to the appropriate node after the split + * + * Before splitting this tries to make some room in the node by pushing + * left and right, if either one works, it returns right away. 
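split_node, defined below, cuts a full node at mid = (c_nritems + 1) / 2, copies the upper half into a fresh sibling, and then points the path at whichever half now contains the original slot. The slot bookkeeping in isolation, as a sketch with illustrative names:

#include <stdio.h>

struct demo_split {
        unsigned int mid;       /* first index that moved to the new right node */
        unsigned int slot;      /* slot within whichever node the path now uses */
        int use_right;          /* 1 if the path should follow the new node */
};

static struct demo_split demo_split_node(unsigned int nritems, unsigned int slot)
{
        struct demo_split s;

        s.mid = (nritems + 1) / 2;
        if (slot >= s.mid) {            /* our position moved to the new node */
                s.use_right = 1;
                s.slot = slot - s.mid;
        } else {                        /* our position stays in the original */
                s.use_right = 0;
                s.slot = slot;
        }
        return s;
}

int main(void)
{
        struct demo_split s = demo_split_node(121, 80);

        printf("mid=%u right=%d slot=%u\n", s.mid, s.use_right, s.slot);
        /* mid=61 right=1 slot=19 */
        return 0;
}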
+ * + * returns 0 on success and < 0 on failure + */ +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; + int mid; + int ret; + int wret; + u32 c_nritems; + + c = path->nodes[level]; + WARN_ON(btrfs_header_generation(c) != trans->transid); + if (c == root->node) { + /* trying to split the root, lets make a new one */ + ret = insert_new_root(trans, root, path, level + 1); + if (ret) + return ret; + } else { + ret = push_nodes_for_insert(trans, root, path, level); + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + return 0; + if (ret < 0) + return ret; + } + + c_nritems = btrfs_header_nritems(c); + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); + + split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, + &disk_key, level, c->start, 0); + if (IS_ERR(split)) + return PTR_ERR(split); + + root_add_used(root, root->nodesize); + + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_level(split, btrfs_header_level(c)); + btrfs_set_header_bytenr(split, split->start); + btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(split, root->root_key.objectid); + write_extent_buffer(split, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(split), + BTRFS_FSID_SIZE); + write_extent_buffer(split, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(split), + BTRFS_UUID_SIZE); + + + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); + ret = 0; + + btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(split); + + wret = insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, + level + 1); + if (wret) + ret = wret; + + if (path->slots[level] >= mid) { + path->slots[level] -= mid; + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = split; + path->slots[level + 1] += 1; + } else { + btrfs_tree_unlock(split); + free_extent_buffer(split); + } + return ret; +} + +/* + * how many bytes are required to store the items in a leaf. start + * and nr indicate which items in the leaf to check. This totals up the + * space used both by the item structs and the item data + */ +static int leaf_space_used(struct extent_buffer *l, int start, int nr) +{ + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; + data_len = btrfs_item_end_nr(l, start); + data_len = data_len - btrfs_item_offset_nr(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +} + +/* + * The space between the end of the leaf items and + * the start of the leaf data. 
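A leaf keeps its item headers growing forward from the start of the data area and the item payloads growing backward from the end, so the space a run of items uses is their headers plus the span from the last payload's offset to the end of the first payload, and free space is whatever is left of the block. leaf_space_used above computes exactly that; here is the same accounting over a plain array, with a made-up block size and header size standing in for BTRFS_LEAF_DATA_SIZE() and sizeof(struct btrfs_item):

#include <stdio.h>

#define DEMO_LEAF_DATA_SIZE 4096u       /* illustrative block data area */
#define DEMO_ITEM_HDR_SIZE  25u         /* illustrative per-item header */

struct demo_item {
        unsigned int offset;            /* start of this item's data in the block */
        unsigned int size;              /* length of that data */
};

/* bytes used by item headers plus item data for items [start, start + nr) */
static unsigned int demo_space_used(const struct demo_item *items,
                                    unsigned int nritems,
                                    unsigned int start, unsigned int nr)
{
        unsigned int end, data_len;

        if (!nr)
                return 0;
        end = (start + nr < nritems ? start + nr : nritems) - 1;
        /* data spans from the last item's offset to the end of the first */
        data_len = (items[start].offset + items[start].size) - items[end].offset;
        return data_len + nr * DEMO_ITEM_HDR_SIZE;
}

static int demo_free_space(const struct demo_item *items, unsigned int nritems)
{
        return (int)DEMO_LEAF_DATA_SIZE -
               (int)demo_space_used(items, nritems, 0, nritems);
}

int main(void)
{
        /* two items whose data is packed against the end of the block */
        struct demo_item items[2] = {
                { 4096 - 100, 100 },            /* item 0 owns the last 100 bytes */
                { 4096 - 100 - 300, 300 },      /* item 1 owns the 300 bytes before it */
        };

        printf("free=%d\n", demo_free_space(items, 2)); /* 4096 - 400 - 2*25 = 3646 */
        return 0;
}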
IOW, how much room + * the leaf has left for both items and data + */ +noinline int btrfs_leaf_free_space(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + int nritems = btrfs_header_nritems(leaf); + int ret; + ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " + "used %d nritems %d\n", + ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; +} + +/* + * min slot controls the lowest index we're willing to push to the + * right. We'll push up to and including min_slot, but no lower + */ +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems, + u32 min_slot) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *upper = path->nodes[1]; + struct btrfs_disk_key disk_key; + int slot; + u32 i; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 nr; + u32 right_nritems; + u32 data_end; + u32 this_item_size; + + if (empty) + nr = 0; + else + nr = max_t(u32, 1, min_slot); + + if (path->slots[0] >= left_nritems) + push_space += data_size; + + slot = path->slots[1]; + i = left_nritems - 1; + while (i >= nr) { + item = btrfs_item_nr(left, i); + + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, left); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + this_item_size = btrfs_item_size(left, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + if (i == 0) + break; + i--; + } + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + if (push_items == 0) + goto out_unlock; + + if (!empty && push_items == left_nritems) + WARN_ON(1); + + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + + push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space -= leaf_data_end(root, left); + + /* make room in the right data area */ + data_end = leaf_data_end(root, right); + memmove_extent_buffer(right, + btrfs_leaf_data(right) + data_end - push_space, + btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_SIZE(root) - data_end); + + /* copy from the left data area */ + copy_extent_buffer(right, left, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(left) + leaf_data_end(root, left), + push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), + btrfs_item_nr_offset(0), + right_nritems * sizeof(struct btrfs_item)); + + /* copy the items from left to right */ + copy_extent_buffer(right, left, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left_nritems - push_items), + push_items * sizeof(struct btrfs_item)); + + /* update the item pointers */ + right_nritems += push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + if 
(!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + push_space -= btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + left_nritems -= push_items; + btrfs_set_header_nritems(left, left_nritems); + + if (left_nritems) + btrfs_mark_buffer_dirty(left); + else + clean_tree_block(trans, root, left); + + btrfs_mark_buffer_dirty(right); + + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); + btrfs_mark_buffer_dirty(upper); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + * + * this will push starting from min_slot to the end of the leaf. It won't + * push any slot lower than min_slot + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + int min_data_size, int data_size, + int empty, u32 min_slot) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + int slot; + int free_space; + u32 left_nritems; + int ret; + + if (!path->nodes[1]) + return 1; + + slot = path->slots[1]; + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + if (right == NULL) + return 1; + + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + return __push_leaf_right(trans, root, path, min_data_size, empty, + right, free_space, left_nritems, min_slot); +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the + * items + */ +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, u32 right_nritems, + u32 max_slot) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + int i; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 old_left_nritems; + u32 nr; + int ret = 0; + int wret; + u32 this_item_size; + u32 old_left_item_size; + + if (empty) + nr = min(right_nritems, max_slot); + else + nr = min(right_nritems - 1, max_slot); + + for (i = 0; i < nr; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, right); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(right, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + if (push_items == 0) { + ret = 1; + goto out; + } + if (!empty && push_items == btrfs_header_nritems(right)) + WARN_ON(1); + + /* push data from right to left */ + copy_extent_buffer(left, right, + btrfs_item_nr_offset(btrfs_header_nritems(left)), + btrfs_item_nr_offset(0), + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_offset_nr(right, push_items - 1); + + copy_extent_buffer(left, right, btrfs_leaf_data(left) + + leaf_data_end(root, left) - push_space, + btrfs_leaf_data(right) + + btrfs_item_offset_nr(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + + item = btrfs_item_nr(left, i); + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(left, item); + btrfs_set_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + /* fixup right node */ + if (push_items > right_nritems) { + printk(KERN_CRIT "push items %d nr %u\n", push_items, + right_nritems); + WARN_ON(1); + } + + if (push_items < right_nritems) { + push_space = btrfs_item_offset_nr(right, push_items - 1) - + leaf_data_end(root, right); + memmove_extent_buffer(right, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(right) + + leaf_data_end(root, right), push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(push_items), + (btrfs_header_nritems(right) - push_items) * + sizeof(struct btrfs_item)); + } + right_nritems -= push_items; + 
btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + push_space = push_space - btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_mark_buffer_dirty(left); + if (right_nritems) + btrfs_mark_buffer_dirty(right); + else + clean_tree_block(trans, root, right); + + btrfs_item_key(right, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, 1); + if (wret) + ret = wret; + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] < push_items) { + path->slots[0] += old_left_nritems; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; + path->slots[1] -= 1; + } else { + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->slots[0] -= push_items; + } + BUG_ON(path->slots[0] < 0); + return ret; +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the + * items + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) +{ + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int free_space; + u32 right_nritems; + int ret = 0; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + left = read_node_slot(root, path->nodes[1], slot - 1); + if (left == NULL) + return 1; + + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + return __push_leaf_left(trans, root, path, min_data_size, + empty, left, free_space, right_nritems, + max_slot); +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. 
+ */ +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) +{ + int data_copy_size; + int rt_data_off; + int i; + int ret = 0; + int wret; + struct btrfs_disk_key disk_key; + + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + return ret; +} + +/* + * double splits happen when we need to insert a big item in the middle + * of a leaf. A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + + slot = path->slots[0]; + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. 
+ * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + int ret = 0; + int wret; + int split; + int num_doubles = 0; + int tried_avoid_double = 0; + + l = path->nodes[0]; + slot = path->slots[0]; + if (extend && data_size + btrfs_item_size_nr(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) + return -EOVERFLOW; + + /* first try to make some room by pushing left and right */ + if (data_size) { + wret = push_leaf_right(trans, root, path, data_size, + data_size, 0, 0); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_left(trans, root, path, data_size, + data_size, 0, (u32)-1); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? */ + if (btrfs_leaf_free_space(root, l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + split = 1; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + split = 0; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; + split = 2; + } + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + split = 0; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; + split = 2 ; + } + } + } + } + + if (split == 0) + btrfs_cpu_key_to_disk(&disk_key, ins_key); + else + btrfs_item_key(l, &disk_key, mid); + + right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + root->root_key.objectid, + &disk_key, 0, l->start, 0); + if (IS_ERR(right)) + return PTR_ERR(right); + + root_add_used(root, root->leafsize); + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); + + if (split == 0) { + if (mid <= slot) { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + } else { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + 
ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + } + btrfs_mark_buffer_dirty(right); + return ret; + } + + ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); + BUG_ON(ret); + + if (split == 2) { + BUG_ON(num_doubles != 0); + num_doubles++; + goto again; + } + + return ret; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + goto again; +} + +static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int ins_len) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_len = 0; + u32 item_size; + int ret; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_EXTENT_CSUM_KEY); + + if (btrfs_leaf_free_space(root, leaf) >= ins_len) + return 0; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + } + btrfs_release_path(path); + + path->keep_locks = 1; + path->search_for_split = 1; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + path->search_for_split = 0; + if (ret < 0) + goto err; + + ret = -EAGAIN; + leaf = path->nodes[0]; + /* if our item isn't there or got smaller, return now */ + if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + goto err; + + /* the leaf has changed, it now has room. 
return now */ + if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len) + goto err; + + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) + goto err; + } + + btrfs_set_path_blocking(path); + ret = split_leaf(trans, root, &key, path, ins_len, 1); + if (ret) + goto err; + + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + return 0; +err: + path->keep_locks = 0; + return ret; +} + +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_item *new_item; + int slot; + char *buf; + u32 nritems; + u32 item_size; + u32 orig_offset; + struct btrfs_disk_key disk_key; + + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + + btrfs_set_path_blocking(path); + + item = btrfs_item_nr(leaf, path->slots[0]); + orig_offset = btrfs_item_offset(leaf, item); + item_size = btrfs_item_size(leaf, item); + + buf = kmalloc(item_size, GFP_NOFS); + if (!buf) + return -ENOMEM; + + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); + + slot = path->slots[0] + 1; + nritems = btrfs_header_nritems(leaf); + if (slot != nritems) { + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + + new_item = btrfs_item_nr(leaf, slot); + + btrfs_set_item_offset(leaf, new_item, orig_offset); + btrfs_set_item_size(leaf, new_item, item_size - split_offset); + + btrfs_set_item_offset(leaf, item, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, item, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + + /* write the data for the start of the original item */ + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + split_offset); + + /* write the data for the new item */ + write_extent_buffer(leaf, buf + split_offset, + btrfs_item_ptr_offset(leaf, slot), + item_size - split_offset); + btrfs_mark_buffer_dirty(leaf); + + BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); + kfree(buf); + return 0; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + int ret; + ret = setup_leaf_for_split(trans, root, path, + sizeof(struct btrfs_item)); + if (ret) + return ret; + + ret = split_item(trans, root, path, new_key, split_offset); + return ret; +} + +/* + * This function duplicate a item, giving 'new_key' to the new item. 
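split_item above divides one item's payload in two without moving the rest of the leaf: the new entry takes the tail (item_size - split_offset bytes) at the original data offset, and the old entry keeps the first split_offset bytes, relocated so the two ranges stay contiguous. The offset arithmetic on its own, as a sketch with illustrative names:

#include <stdio.h>

struct demo_item {
        unsigned int offset;    /* start of the item's data in the leaf */
        unsigned int size;      /* length of that data */
};

/*
 * Given an item occupying [offset, offset + size) and a split point measured
 * from the start of the item, compute the two resulting entries the way
 * split_item assigns them.
 */
static void demo_split_item(struct demo_item orig, unsigned int split_offset,
                            struct demo_item *first, struct demo_item *second)
{
        /* new item: the tail of the data, left where the data already is */
        second->offset = orig.offset;
        second->size = orig.size - split_offset;

        /* old item: the head of the data, shifted up next to the tail */
        first->offset = orig.offset + orig.size - split_offset;
        first->size = split_offset;
}

int main(void)
{
        struct demo_item orig = { 1000, 64 }, first, second;

        demo_split_item(orig, 40, &first, &second);
        printf("first:  offset=%u size=%u\n", first.offset, first.size);    /* 1024, 40 */
        printf("second: offset=%u size=%u\n", second.offset, second.size);  /* 1000, 24 */
        return 0;
}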
+ * It guarantees both items live in the same tree leaf and the new item + * is contiguous with the original item. + * + * This allows us to split file extent in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + ret = setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */ +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) +{ + int slot; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; + unsigned int old_size; + unsigned int size_diff; + int i; + + leaf = path->nodes[0]; + slot = path->slots[0]; + + old_size = btrfs_item_size_nr(leaf, slot); + if (old_size == new_size) + return 0; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + old_data_start = btrfs_item_offset_nr(leaf, slot); + + size_diff = old_size - new_size; + + BUG_ON(slot < 0); + BUG_ON(slot >= nritems); + + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + size_diff); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + if (from_end) { + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start + new_size - data_end); + } else { + struct btrfs_disk_key disk_key; + u64 offset; + + btrfs_item_key(leaf, &disk_key, slot); + + if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { + unsigned long ptr; + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + fi = (struct btrfs_file_extent_item *)( + (unsigned long)fi - size_diff); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + ptr = btrfs_item_ptr_offset(leaf, slot); + memmove_extent_buffer(leaf, ptr, + (unsigned long)fi, + offsetof(struct btrfs_file_extent_item, + disk_bytenr)); + } + } + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start - data_end); + + offset = btrfs_disk_key_offset(&disk_key); + btrfs_set_disk_key_offset(&disk_key, offset + size_diff); + btrfs_set_item_key(leaf, &disk_key, slot); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + } + + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, new_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return 0; +} + +/* + * make the item pointed to by the path bigger, data_size is the new size. + */ +int btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) +{ + int slot; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; + unsigned int old_size; + int i; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < data_size) { + btrfs_print_leaf(root, leaf); + BUG(); + } + slot = path->slots[0]; + old_data = btrfs_item_end_nr(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d too large, nritems %d\n", + slot, nritems); + BUG_ON(1); + } + + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - data_size, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + + data_end = old_data; + old_size = btrfs_item_size_nr(leaf, slot); + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return 0; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + * Returns the number of keys that were inserted. + */ +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int i; + u32 nritems; + u32 total_data = 0; + u32 total_size = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + struct btrfs_key found_key; + + for (i = 0; i < nr; i++) { + if (total_size + data_size[i] + sizeof(struct btrfs_item) > + BTRFS_LEAF_DATA_SIZE(root)) { + break; + nr = i; + } + total_data += data_size[i]; + total_size += data_size[i] + sizeof(struct btrfs_item); + } + BUG_ON(nr == 0); + + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + for (i = nr; i >= 0; i--) { + total_data -= data_size[i]; + total_size -= data_size[i] + sizeof(struct btrfs_item); + if (total_size < btrfs_leaf_free_space(root, leaf)) + break; + } + nr = i; + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* figure out how many keys we can insert in here */ + total_data = data_size[0]; + for (i = 1; i < nr; i++) { + if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) + break; + total_data += data_size[i]; + } + nr = i; + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } else { + /* + * this sucks but it has to be done, if we are inserting at + * the end of the leaf only insert 1 of the items, since we + * have no way of knowing whats on the next leaf and we'd have + * to drop our current locks to figure it out + */ + nr = 1; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + if (!ret) + ret = nr; + return ret; +} + +/* + * this is a helper for btrfs_insert_empty_items, the main goal here is + * to save stack depth by doing the bulk of the work in a function + * that doesn't call btrfs_search_slot + */ +int setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) +{ + struct btrfs_item *item; + int i; + u32 nritems; + unsigned int data_end; + struct btrfs_disk_key disk_key; + int ret; + struct extent_buffer *leaf; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "not enough freespace need %u have %d\n", + total_size, btrfs_leaf_free_space(root, leaf)); + BUG(); + } + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + + btrfs_set_header_nritems(leaf, nritems + nr); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + int ret = 0; + int slot; + int i; + u32 total_size = 0; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + slot = path->slots[0]; + BUG_ON(slot < 0); + + ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + total_data, total_size, nr); + +out: + return ret; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *cpu_key, void *data, u32 + data_size) +{ + int ret = 0; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (!ret) { + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_free_path(path); + return ret; +} + +/* + * delete the pointer from a given node. + * + * the tree should have been previously balanced so the deletion does not + * empty a node. 
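/*
 * A hedged sketch of the usual caller-side pattern for the insert helpers
 * above; it is not code from this patch.  A fixed-size item is reserved
 * with btrfs_insert_empty_item() (the single-item wrapper declared in
 * ctree.h) and then filled in through the returned path.  The function
 * name example_insert and the choice of an inode item are made up, and
 * error handling is abbreviated.
 */
static int example_insert(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, u64 objectid)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_inode_item *item;
        struct extent_buffer *leaf;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        /* search, make room if needed and reserve sizeof(*item) bytes */
        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*item));
        if (ret == 0) {
                leaf = path->nodes[0];
                item = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_inode_item);
                /* ... fill in *item with the btrfs_set_ helpers ... */
                btrfs_mark_buffer_dirty(leaf);
        }
        btrfs_free_path(path);
        return ret;
}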
+ */ +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) +{ + struct extent_buffer *parent = path->nodes[level]; + u32 nritems; + int ret = 0; + int wret; + + nritems = btrfs_header_nritems(parent); + if (slot != nritems - 1) { + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(slot + 1), + sizeof(struct btrfs_key_ptr) * + (nritems - slot - 1)); + } + nritems--; + btrfs_set_header_nritems(parent, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(root->node) != 1); + /* just turn the root into a leaf and break */ + btrfs_set_header_level(root->node, 0); + } else if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(parent); + return ret; +} + +/* + * a helper function to delete the leaf pointed to by path->slots[1] and + * path->nodes[1]. + * + * This deletes the pointer in path->nodes[1] and frees the leaf + * block extent. zero is returned if it all worked out, < 0 otherwise. + * + * The path must have already been setup for deleting the leaf, including + * all the proper balancing. path->nodes[1] must be locked. + */ +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) +{ + int ret; + + WARN_ON(btrfs_header_generation(leaf) != trans->transid); + ret = del_ptr(trans, root, path, 1, path->slots[1]); + if (ret) + return ret; + + /* + * btrfs_free_extent is expensive, we want to make sure we + * aren't holding any locks when we call it + */ + btrfs_unlock_up_safe(path, 0); + + root_sub_used(root, leaf->len); + + btrfs_free_tree_block(trans, root, leaf, 0, 1); + return 0; +} +/* + * delete the item at the leaf level in path. 
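/*
 * A minimal illustrative sketch, not taken from this patch: del_ptr()
 * above removes one key pointer by sliding the tail of the ptrs[] array
 * down a slot and decrementing nritems.  The same move on a plain array;
 * toy_del_ptr is a made-up name.
 */
#include <assert.h>
#include <string.h>

static unsigned toy_del_ptr(unsigned long *ptrs, unsigned nritems,
                            unsigned slot)
{
        if (slot != nritems - 1)
                memmove(&ptrs[slot], &ptrs[slot + 1],
                        (nritems - slot - 1) * sizeof(ptrs[0]));
        return nritems - 1;     /* the new pointer count */
}

int main(void)
{
        unsigned long ptrs[] = { 10, 20, 30, 40 };
        unsigned n = toy_del_ptr(ptrs, 4, 1);

        assert(n == 3 && ptrs[0] == 10 && ptrs[1] == 30 && ptrs[2] == 40);
        return 0;
}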
If that empties + * the leaf, remove it from the tree + */ +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int last_off; + int dsize = 0; + int ret = 0; + int wret; + int i; + u32 nritems; + + leaf = path->nodes[0]; + last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size_nr(leaf, slot + i); + + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { + int data_end = leaf_data_end(root, leaf); + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + dsize, + btrfs_leaf_data(leaf) + data_end, + last_off - data_end); + + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + dsize); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), + btrfs_item_nr_offset(slot + nr), + sizeof(struct btrfs_item) * + (nritems - slot - nr)); + } + btrfs_set_header_nritems(leaf, nritems - nr); + nritems -= nr; + + /* delete the leaf if we've emptied it */ + if (nritems == 0) { + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { + btrfs_set_path_blocking(path); + clean_tree_block(trans, root, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + BUG_ON(ret); + } + } else { + int used = leaf_space_used(leaf, 0, nritems); + if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, + &disk_key, 1); + if (wret) + ret = wret; + } + + /* delete the leaf if it is mostly empty */ + if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { + /* push_leaf_left fixes the path. + * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + extent_buffer_get(leaf); + + btrfs_set_path_blocking(path); + wret = push_leaf_left(trans, root, path, 1, 1, + 1, (u32)-1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { + wret = push_leaf_right(trans, root, path, 1, + 1, 1, 0); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } + + if (btrfs_header_nritems(leaf) == 0) { + path->slots[1] = slot; + ret = btrfs_del_leaf(trans, root, path, leaf); + BUG_ON(ret); + free_extent_buffer(leaf); + } else { + /* if we're still in the path, make sure + * we're dirty. Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(leaf); + free_extent_buffer(leaf); + } + } else { + btrfs_mark_buffer_dirty(leaf); + } + } + return ret; +} + +/* + * search the tree again to find a leaf with lesser keys + * returns 0 if it found something or 1 if there are no lesser leaves. + * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. 
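/*
 * A hedged sketch of how the deletion helper above is normally driven; it
 * is not code from this patch.  btrfs_del_item() is the nr == 1 wrapper
 * around btrfs_del_items() declared in ctree.h.  The name example_delete
 * is made up and error handling is abbreviated.
 */
static int example_delete(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct btrfs_key *key)
{
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* ins_len == -1 asks search_slot to prepare the path for deletion */
        ret = btrfs_search_slot(trans, root, key, path, -1, 1);
        if (ret == 0)                   /* exact match found */
                ret = btrfs_del_item(trans, root, path);
        else if (ret > 0)               /* key not present */
                ret = -ENOENT;

        btrfs_free_path(path);
        return ret;
}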
+ */ +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + + if (key.offset > 0) + key.offset--; + else if (key.type > 0) + key.type--; + else if (key.objectid > 0) + key.objectid--; + else + return 1; + + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + if (ret < 0) + return 0; + return 1; +} + +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are either in cache or have a minimum + * transaction id. This is used by the btree defrag code, and tree logging + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This does lock as it descends, and path->keep_locks should be set + * to 1 by the caller. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. + */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + int sret; + u32 nritems; + int level; + int ret = 1; + + WARN_ON(!path->keep_locks); +again: + cur = btrfs_lock_root_node(root); + level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); + path->nodes[level] = cur; + path->locks[level] = 1; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + sret = bin_search(cur, min_key, level, &slot); + + /* at the lowest level, we're done, setup the path and exit */ + if (level == path->lowest_level) { + if (slot >= nritems) + goto find_next_key; + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + if (sret && slot > 0) + slot--; + /* + * check this node pointer against the cache_only and + * min_trans parameters. If it isn't in cache or is too + * old, skip to the next one. 
+ */ + while (slot < nritems) { + u64 blockptr; + u64 gen; + struct extent_buffer *tmp; + struct btrfs_disk_key disk_key; + + blockptr = btrfs_node_blockptr(cur, slot); + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + if (!cache_only) + break; + + if (max_key) { + btrfs_node_key(cur, &disk_key, slot); + if (comp_keys(&disk_key, max_key) >= 0) { + ret = 1; + goto out; + } + } + + tmp = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + free_extent_buffer(tmp); + break; + } + if (tmp) + free_extent_buffer(tmp); + slot++; + } +find_next_key: + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + path->slots[level] = slot; + btrfs_set_path_blocking(path); + sret = btrfs_find_next_key(root, path, min_key, level, + cache_only, min_trans); + if (sret == 0) { + btrfs_release_path(path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + unlock_up(path, level, 1); + goto out; + } + btrfs_set_path_blocking(path); + cur = read_node_slot(root, cur, slot); + BUG_ON(!cur); + + btrfs_tree_lock(cur); + + path->locks[level - 1] = 1; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1); + btrfs_clear_path_blocking(path, NULL); + } +out: + if (ret == 0) + memcpy(min_key, &found_key, sizeof(found_key)); + btrfs_set_path_blocking(path); + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. It looks for and returns the next key in the + * tree based on the current path and the cache_only and min_trans + * parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. 
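/*
 * A hedged sketch of the walking pattern btrfs_search_forward() above is
 * built for (the btree defrag and tree logging code do variants of this):
 * visit keys in order, skipping anything older than min_trans.  The name
 * example_walk and the simple key stepping are made up; it is not code
 * from this patch.
 */
static int example_walk(struct btrfs_root *root, u64 min_trans)
{
        struct btrfs_key min_key = { 0 };
        struct btrfs_key max_key = { (u64)-1, (u8)-1, (u64)-1 };
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->keep_locks = 1;           /* required by btrfs_search_forward */

        while (1) {
                ret = btrfs_search_forward(root, &min_key, &max_key, path,
                                           0, min_trans);
                if (ret)        /* < 0 is an error, 1 means nothing newer */
                        break;

                /* ... min_key now names the next interesting item ... */

                btrfs_release_path(path);

                /* step one key past what we just handled */
                if (min_key.offset < (u64)-1) {
                        min_key.offset++;
                } else if (min_key.type < (u8)-1) {
                        min_key.offset = 0;
                        min_key.type++;
                } else if (min_key.objectid < (u64)-1) {
                        min_key.offset = 0;
                        min_key.type = 0;
                        min_key.objectid++;
                } else {
                        break;
                }
        }
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
}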
+ */ +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int level, + int cache_only, u64 min_trans) +{ + int slot; + struct extent_buffer *c; + + WARN_ON(!path->keep_locks); + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; +next: + if (slot >= btrfs_header_nritems(c)) { + int ret; + int orig_lowest; + struct btrfs_key cur_key; + if (level + 1 >= BTRFS_MAX_LEVEL || + !path->nodes[level + 1]) + return 1; + + if (path->locks[level + 1]) { + level++; + continue; + } + + slot = btrfs_header_nritems(c) - 1; + if (level == 0) + btrfs_item_key_to_cpu(c, &cur_key, slot); + else + btrfs_node_key_to_cpu(c, &cur_key, slot); + + orig_lowest = path->lowest_level; + btrfs_release_path(path); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &cur_key, path, + 0, 0); + path->lowest_level = orig_lowest; + if (ret < 0) + return ret; + + c = path->nodes[level]; + slot = path->slots[level]; + if (ret == 0) + slot++; + goto next; + } + + if (level == 0) + btrfs_item_key_to_cpu(c, key, slot); + else { + u64 blockptr = btrfs_node_blockptr(c, slot); + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (cache_only) { + struct extent_buffer *cur; + cur = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + slot++; + if (cur) + free_extent_buffer(cur); + goto next; + } + free_extent_buffer(cur); + } + if (gen < min_trans) { + slot++; + goto next; + } + btrfs_node_key_to_cpu(c, key, slot); + } + return 0; + } + return 1; +} + +/* + * search the tree again to find a leaf with greater keys + * returns 0 if it found something or 1 if there are no greater leaves. + * returns < 0 on io errors. + */ +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + int slot; + int level; + struct extent_buffer *c; + struct extent_buffer *next; + struct btrfs_key key; + u32 nritems; + int ret; + int old_spinning = path->leave_spinning; + int force_blocking = 0; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) + return 1; + + /* + * we take the blocks in an order that upsets lockdep. Using + * blocking mode is the only way around it. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + force_blocking = 1; +#endif + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; + btrfs_release_path(path); + + path->keep_locks = 1; + + if (!force_blocking) + path->leave_spinning = 1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->keep_locks = 0; + + if (ret < 0) + return ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * by releasing the path above we dropped all our locks. A balance + * could have added more items next to the key that used to be + * at the very end of the block. So, check again here and + * advance the path if there are now more items available. 
+ */ + if (nritems > 0 && path->slots[0] < nritems - 1) { + if (ret == 0) + path->slots[0]++; + ret = 0; + goto done; + } + + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) { + ret = 1; + goto done; + } + + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } + continue; + } + + if (next) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + } + + next = c; + ret = read_block_for_search(NULL, root, path, &next, level, + slot, &key); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(path); + goto done; + } + + if (!path->skip_locking) { + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); + } + break; + } + path->slots[level] = slot; + while (1) { + level--; + c = path->nodes[level]; + if (path->locks[level]) + btrfs_tree_unlock(c); + + free_extent_buffer(c); + path->nodes[level] = next; + path->slots[level] = 0; + if (!path->skip_locking) + path->locks[level] = 1; + + if (!level) + break; + + ret = read_block_for_search(NULL, root, path, &next, level, + 0, &key); + if (ret == -EAGAIN) + goto again; + + if (ret < 0) { + btrfs_release_path(path); + goto done; + } + + if (!path->skip_locking) { + btrfs_assert_tree_locked(path->nodes[level]); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); + } + } + ret = 0; +done: + unlock_up(path, 0, 1); + path->leave_spinning = old_spinning; + if (!old_spinning) + btrfs_set_path_blocking(path); + + return ret; +} + +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == type) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 00000000..66179bcb --- /dev/null +++ b/fs/btrfs/ctree.h @@ -0,0 +1,2683 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
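/*
 * A hedged sketch of the canonical read-only scan built from
 * btrfs_search_slot() plus btrfs_next_leaf() above; it is not code from
 * this patch.  It walks every item of one objectid; the name example_scan
 * is made up and the per-item work is elided.
 */
static int example_scan(struct btrfs_root *root, u64 objectid)
{
        struct btrfs_path *path;
        struct btrfs_key key, found;
        struct extent_buffer *leaf;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.type = 0;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        while (1) {
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret)        /* 1 == no more leaves, < 0 == error */
                                break;
                        continue;
                }
                btrfs_item_key_to_cpu(leaf, &found, path->slots[0]);
                if (found.objectid != objectid)
                        break;          /* past the object we care about */

                /* ... examine the item in slot path->slots[0] ... */

                path->slots[0]++;
        }
        ret = ret < 0 ? ret : 0;
out:
        btrfs_free_path(path);
        return ret;
}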
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CTREE__ +#define __BTRFS_CTREE__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent_io.h" +#include "extent_map.h" +#include "async-thread.h" +#include "ioctl.h" + +struct btrfs_trans_handle; +struct btrfs_transaction; +struct btrfs_pending_snapshot; +extern struct kmem_cache *btrfs_trans_handle_cachep; +extern struct kmem_cache *btrfs_transaction_cachep; +extern struct kmem_cache *btrfs_bit_radix_cachep; +extern struct kmem_cache *btrfs_path_cachep; +extern struct kmem_cache *btrfs_free_space_cachep; +struct btrfs_ordered_sum; + +#define BTRFS_MAGIC "_BHRfS_M" + +#define BTRFS_MAX_LEVEL 8 + +#define BTRFS_COMPAT_EXTENT_TREE_V0 + +/* + * files bigger than this get some pre-flushing when they are added + * to the ordered operations list. That way we limit the total + * work done by the commit + */ +#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* orhpan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* For storing free space cache */ +#define BTRFS_FREE_SPACE_OBJECTID -11ULL + +/* + * The inode number assigned to the special inode for sotring + * free ino cache + */ +#define BTRFS_FREE_INO_OBJECTID -12ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. 
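/*
 * A minimal illustrative sketch, not taken from this patch: the ordering
 * the key comment above describes.  Keys sort by objectid, then type,
 * then offset; toy_comp_keys is a made-up name that mirrors the in-kernel
 * cpu-key comparison.
 */
static int toy_comp_keys(const struct btrfs_key *a, const struct btrfs_key *b)
{
        if (a->objectid != b->objectid)
                return a->objectid < b->objectid ? -1 : 1;
        if (a->type != b->type)
                return a->type < b->type ? -1 : 1;
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return 0;
}
/*
 * So every item belonging to one inode (one objectid) is adjacent in the
 * tree, and within that range the type groups the streams: the inode item
 * sorts before, say, its extent data items.
 */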
The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +#define BTRFS_BTREE_INODE_OBJECTID 1 + +#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 + +/* + * we can actually store much bigger names, but lets not confuse the rest + * of linux + */ +#define BTRFS_NAME_LEN 255 + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +#define BTRFS_CSUM_TYPE_CRC32 0 + +static int btrfs_csum_sizes[] = { 4, 0 }; + +/* four bytes for CRC32 */ +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 + +/* + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. + * + * offset is the starting byte offset for this key in the stream. + * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + u64 objectid; + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_mapping_tree { + struct extent_map_tree map_tree; +}; + +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allow for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + + /* btrfs generated uuid for this device */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +#define 
BTRFS_FREE_SPACE_EXTENT 1 +#define BTRFS_FREE_SPACE_BITMAP 2 + +struct btrfs_free_space_entry { + __le64 offset; + __le64 bytes; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_header { + struct btrfs_disk_key location; + __le64 generation; + __le64 num_entries; + __le64 num_bitmaps; +} __attribute__ ((__packed__)); + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + BUG_ON(num_stripes == 0); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + +#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) +#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* + * File system states + */ + +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) +#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) + +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 + +/* + * every tree block (leaf or node) starts with this header. + */ +struct btrfs_header { + /* these first four must match the super block */ + u8 csum[BTRFS_CSUM_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* which block this node is supposed to live in */ + __le64 flags; + + /* allowed to be different from the super from here on down */ + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + u8 level; +} __attribute__ ((__packed__)); + +#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ + sizeof(struct btrfs_header)) / \ + sizeof(struct btrfs_key_ptr)) +#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) - \ + sizeof(struct btrfs_file_extent_item)) +#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) -\ + sizeof(struct btrfs_dir_item)) + + +/* + * this is a very generous portion of the super block, giving us + * room to translate 14 chunks with 3 stripes each. 
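/*
 * A worked example, not taken from this patch: what BTRFS_LEAF_DATA_SIZE
 * and BTRFS_NODEPTRS_PER_BLOCK above come out to for a 4KiB block,
 * assuming the packed sizes of the structures in this header
 * (btrfs_header is 101 bytes, btrfs_key_ptr 33, btrfs_item 25).
 */
#include <stdio.h>

int main(void)
{
        unsigned blocksize = 4096;
        unsigned header = 101, key_ptr = 33;
        unsigned leaf_data = blocksize - header;     /* 3995 usable bytes */
        unsigned node_fanout = leaf_data / key_ptr;  /* 121 key pointers  */

        /*
         * Item headers fill that area from the front and their payload
         * fills it from the back, so a payload of S bytes really consumes
         * S + 25 bytes; leaf free space is whatever is left between the
         * last item header and the lowest payload offset.
         */
        printf("leaf data %u bytes, node fanout %u\n", leaf_data, node_fanout);
        return 0;
}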
+ */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 +#define BTRFS_LABEL_SIZE 256 + +/* + * the super block basically lists the main trees of the FS + * it currently lacks any block count etc etc + */ +struct btrfs_super_block { + u8 csum[BTRFS_CSUM_SIZE]; + /* the first 4 fields must match struct btrfs_header */ + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* this block number */ + __le64 flags; + + /* allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* this will help find the new super based on the log root */ + __le64 log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + u8 root_level; + u8 chunk_root_level; + u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + __le64 cache_generation; + + /* future expansion */ + __le64 reserved[31]; + u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; +} __attribute__ ((__packed__)); + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) +#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) +#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) + +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) + +/* + * A leaf is full of items. offset and size tell us where to find + * the item in the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together + * during searches. + */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * all non-leaf blocks are nodes, they hold only keys and pointers to + * other blocks + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + +/* + * btrfs_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. 
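/*
 * A hedged sketch of how the path structure defined below is consumed
 * after a search; it is not code from this patch.  nodes[0]/slots[0]
 * always name the leaf and the slot within it, higher levels record the
 * route taken down from the root.  The name example_lookup is made up.
 */
static int example_lookup(struct btrfs_root *root, struct btrfs_key *key,
                          struct btrfs_path *path,
                          struct btrfs_inode_item **item_ret)
{
        int ret;

        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        if (ret < 0)
                return ret;     /* IO or other error */
        if (ret > 0)
                return -ENOENT; /* slot is where the key would be inserted */

        *item_ret = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                   struct btrfs_inode_item);
        return 0;               /* the caller releases the path when done */
}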
+ */ +struct btrfs_path { + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; + int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + int locks[BTRFS_MAX_LEVEL]; + int reada; + /* keep some upper locks as we walk down */ + int lowest_level; + + /* + * set by btrfs_split_item, tells search_slot to keep all locks + * and to force calls to keep space in the nodes + */ + unsigned int search_for_split:1; + unsigned int keep_locks:1; + unsigned int skip_locking:1; + unsigned int leave_spinning:1; + unsigned int search_commit_root:1; +}; + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ + +struct btrfs_extent_item { + __le64 refs; + __le64 generation; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_extent_item_v0 { + __le32 refs; +} __attribute__ ((__packed__)); + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ + sizeof(struct btrfs_item)) + +#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) +#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) + +/* following flags only apply to tree blocks */ + +/* use full backrefs for extent pointers in the block */ +#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) + +/* + * this flag is only used internally by scrub and may be changed at any time + * it is only declared here to avoid collisions + */ +#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) + +struct btrfs_tree_block_info { + struct btrfs_disk_key key; + u8 level; +} __attribute__ ((__packed__)); + +struct btrfs_extent_data_ref { + __le64 root; + __le64 objectid; + __le64 offset; + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_shared_data_ref { + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_extent_inline_ref { + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +/* old style backrefs item */ +struct btrfs_extent_ref_v0 { + __le64 root; + __le64 generation; + __le64 objectid; + __le32 count; +} __attribute__ ((__packed__)); + + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent. 
The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, +}; + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + u8 type; +} __attribute__ ((__packed__)); + +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 last_snapshot; + __le64 flags; + __le32 refs; + struct btrfs_disk_key drop_progress; + u8 drop_level; + u8 level; +} __attribute__ ((__packed__)); + +/* + * this is used for both forward and backward root refs + */ +struct btrfs_root_ref { + __le64 dirid; + __le64 sequence; + __le16 name_len; +} __attribute__ ((__packed__)); + +#define BTRFS_FILE_EXTENT_INLINE 0 +#define BTRFS_FILE_EXTENT_REG 1 +#define BTRFS_FILE_EXTENT_PREALLOC 2 + +struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ + __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + u8 compression; + u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ + u8 type; + + /* + * disk space consumed by the extent, checksum blocks are included + * in these numbers + */ + __le64 disk_bytenr; + __le64 disk_num_bytes; + /* + * the logical offset in file blocks (no csums) + * this extent record is for. This allows a file extent to point + * into the middle of an existing extent on disk, sharing it + * between two snapshots (useful if some bytes in the middle of the + * extent have changed + */ + __le64 offset; + /* + * the logical number of file blocks (no csums included). 
This + * always reflects the size uncompressed and without encoding. + */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + u8 csum; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) +#define BTRFS_BLOCK_GROUP_DUP (1 << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_NR_RAID_TYPES 5 + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_space_info { + u64 flags; + + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ + u64 bytes_used; /* total bytes used, + this doesn't take mirrors into account */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ + + /* + * we bump reservation progress every time we decrement + * bytes_reserved. This way people waiting for reservations + * know something good has happened and they can check + * for progress. The number here isn't to be trusted, it + * just shows reclaim activity + */ + unsigned long reservation_progress; + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + + struct list_head list; + + /* for block groups in our same type */ + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; + spinlock_t lock; + struct rw_semaphore groups_sem; + atomic_t caching_threads; +}; + +struct btrfs_block_rsv { + u64 size; + u64 reserved; + u64 freed[2]; + struct btrfs_space_info *space_info; + struct list_head list; + spinlock_t lock; + atomic_t usage; + unsigned int priority:8; + unsigned int durable:1; + unsigned int refill_used:1; + unsigned int full:1; +}; + +/* + * free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations and data allocations in ssd mode. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* largest extent in this cluster */ + u64 max_size; + + /* first extent starting offset */ + u64 window_start; + + struct btrfs_block_group_cache *block_group; + /* + * when a cluster is allocated from a block group, we put the + * cluster onto a list in the block group so that it can + * be freed before the block group is freed. 
+ */ + struct list_head block_group_list; +}; + +enum btrfs_caching_type { + BTRFS_CACHE_NO = 0, + BTRFS_CACHE_STARTED = 1, + BTRFS_CACHE_FINISHED = 2, +}; + +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN = 0, + BTRFS_DC_ERROR = 1, + BTRFS_DC_CLEAR = 2, + BTRFS_DC_SETUP = 3, + BTRFS_DC_NEED_WRITE = 4, +}; + +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_block_group_cache *block_group; + u64 progress; + atomic_t count; +}; + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; + struct inode *inode; + spinlock_t lock; + u64 pinned; + u64 reserved; + u64 reserved_pinned; + u64 bytes_super; + u64 flags; + u64 sectorsize; + unsigned int ro:1; + unsigned int dirty:1; + unsigned int iref:1; + + int disk_cache_state; + + /* cache tracking stuff */ + int cached; + struct btrfs_caching_control *caching_ctl; + u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + + /* free space cache stuff */ + struct btrfs_free_space_ctl *free_space_ctl; + + /* block group cache stuff */ + struct rb_node cache_node; + + /* for block groups in the same raid type */ + struct list_head list; + + /* usage count */ + atomic_t count; + + /* List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; +}; + +struct reloc_control; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_delayed_root; +struct btrfs_fs_info { + u8 fsid[BTRFS_FSID_SIZE]; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + struct btrfs_root *extent_root; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *csum_root; + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + + struct extent_io_tree freed_extents[2]; + struct extent_io_tree *pinned_extents; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; + + /* + * block reservation for extent, checksum, root tree and + * delayed dir index item + */ + struct btrfs_block_rsv global_block_rsv; + /* block reservation for delay allocation */ + struct btrfs_block_rsv delalloc_block_rsv; + /* block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + /* list of block reservations that cross multiple transactions */ + struct list_head durable_block_rsv_list; + + struct mutex durable_block_rsv_mutex; + + u64 generation; + u64 last_trans_committed; + + /* + * this is updated to the current trans every time a full commit + * is required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + unsigned long mount_opt:20; + unsigned long compress_type:4; + u64 max_inline; + u64 alloc_start; + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; + wait_queue_head_t async_submit_wait; + + struct btrfs_super_block super_copy; + struct btrfs_super_block super_for_commit; + struct 
block_device *__bdev; + struct super_block *sb; + struct inode *btree_inode; + struct backing_dev_info bdi; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + struct mutex volume_mutex; + /* + * this protects the ordered operations list only while we are + * processing all of the entries on it. This way we make + * sure the commit code doesn't find the list temporarily empty + * because another function happens to be doing non-waiting preflush + * before jumping into the main commit. + */ + struct mutex ordered_operations_mutex; + struct rw_semaphore extent_commit_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + struct srcu_struct subvol_srcu; + + spinlock_t trans_lock; + /* + * the reloc mutex goes with the trans lock, it is taken + * during commit to protect us from the relocation code + */ + struct mutex reloc_mutex; + + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; + atomic_t nr_async_bios; + atomic_t async_delalloc_pages; + atomic_t open_ioctl_trans; + + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_extents; + + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ + struct list_head delalloc_inodes; + + /* + * special rename and truncate targets that must be on disk before + * we're allowed to commit. This is basically the ext3 style + * data=ordered list. + */ + struct list_head ordered_operations; + + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ + struct btrfs_workers generic_worker; + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; + struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers endio_freespace_worker; + struct btrfs_workers submit_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. 
It happens + * for the sys_munmap function call path + */ + struct btrfs_workers fixup_workers; + struct btrfs_workers delayed_workers; + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + int thread_pool_size; + + struct kobject super_kobj; + struct completion kobj_unregister; + int do_barriers; + int closing; + int log_root_recovering; + int enospc_unlink; + int trans_no_join; + + u64 total_pinned; + + /* protected by the delalloc lock, used to keep from writing + * metadata until there is a nice batch + */ + u64 dirty_metadata_bytes; + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * the space_info list is almost entirely read only. It only changes + * when we add a new raid type to the FS, and that happens + * very rarely. RCU is used to protect it. + */ + struct list_head space_info; + + struct reloc_control *reloc_ctl; + + spinlock_t delalloc_lock; + u64 delalloc_bytes; + + /* data_alloc_cluster is only used in ssd mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* all metadata allocations go through this cluster */ + struct btrfs_free_cluster meta_alloc_cluster; + + /* auto defrag inodes go here */ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + + spinlock_t ref_cache_lock; + u64 total_ref_cache_size; + + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + u64 data_alloc_profile; + u64 metadata_alloc_profile; + u64 system_alloc_profile; + + unsigned data_chunk_allocations; + unsigned metadata_ratio; + + void *bdev_holder; + + /* private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + struct rw_semaphore scrub_super_lock; + int scrub_workers_refcnt; + struct btrfs_workers scrub_workers; + + /* filesystem state */ + u64 fs_state; + + struct btrfs_delayed_root *delayed_root; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. 
+ */ +struct btrfs_root { + struct extent_buffer *node; + + struct extent_buffer *commit_root; + struct btrfs_root *log_root; + struct btrfs_root *reloc_root; + + struct btrfs_root_item root_item; + struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; + struct extent_io_tree dirty_log_pages; + + struct kobject root_kobj; + struct completion kobj_unregister; + struct mutex objectid_mutex; + + spinlock_t accounting_lock; + struct btrfs_block_rsv *block_rsv; + + /* free ino cache stuff */ + struct mutex fs_commit_mutex; + struct btrfs_free_space_ctl *free_ino_ctl; + enum btrfs_caching_type cached; + spinlock_t cache_lock; + wait_queue_head_t cache_wait; + struct btrfs_free_space_ctl *free_ino_pinned; + u64 cache_progress; + struct inode *cache_inode; + + struct mutex log_mutex; + wait_queue_head_t log_writer_wait; + wait_queue_head_t log_commit_wait[2]; + atomic_t log_writers; + atomic_t log_commit[2]; + unsigned long log_transid; + unsigned long last_log_commit; + unsigned long log_batch; + pid_t log_start_pid; + bool log_multiple_pids; + + u64 objectid; + u64 last_trans; + + /* data allocations are done in sectorsize units */ + u32 sectorsize; + + /* node allocations are done in nodesize units */ + u32 nodesize; + + /* leaf allocations are done in leafsize units */ + u32 leafsize; + + u32 stripesize; + + u32 type; + + u64 highest_objectid; + + /* btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So in_trans_setup + * is used to tell us when more checks are required + */ + unsigned long in_trans_setup; + int ref_cows; + int track_dirty; + int in_radix; + + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + int defrag_running; + char *name; + + /* the dirty list is only used by non-reference counted roots */ + struct list_head dirty_list; + + struct list_head root_list; + + spinlock_t orphan_lock; + struct list_head orphan_list; + struct btrfs_block_rsv *orphan_block_rsv; + int orphan_item_inserted; + int orphan_cleanup_state; + + spinlock_t inode_lock; + /* red-black tree that keeps track of in-memory inodes */ + struct rb_root inode_tree; + + /* + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock + */ + struct radix_tree_root delayed_nodes_tree; + /* + * right now this just gets used so that a root has its own devid + * for stat. It may be used for more later + */ + struct super_block anon_super; +}; + +struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; + + /* number of bytes to defrag, use (u64)-1 to say all */ + __u64 len; + + /* + * flags for the operation, which can include turning + * on compression for this one defrag + */ + __u64 flags; + + /* + * any extent bigger than this will be considered + * already defragged. Use 0 to take the kernel default + * Use 1 to say every single extent must be rewritten + */ + __u32 extent_thresh; + + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + + /* spare for later */ + __u32 unused[4]; +}; + + +/* + * inode items have the data typically returned from stat and store other + * info about object characteristics. 
There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_XATTR_ITEM_KEY 24 +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. They are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 + +#define BTRFS_TREE_BLOCK_REF_KEY 176 + +#define BTRFS_EXTENT_DATA_REF_KEY 178 + +#define BTRFS_EXTENT_REF_V0_KEY 180 + +#define BTRFS_SHARED_BLOCK_REF_KEY 182 + +#define BTRFS_SHARED_DATA_REF_KEY 184 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * string items are for debugging. They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + +/* + * Flags for mount options. 
+ * + * Note: don't forget to add new options to btrfs_show_options() + */ +#define BTRFS_MOUNT_NODATASUM (1 << 0) +#define BTRFS_MOUNT_NODATACOW (1 << 1) +#define BTRFS_MOUNT_NOBARRIER (1 << 2) +#define BTRFS_MOUNT_SSD (1 << 3) +#define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) +#define BTRFS_MOUNT_NOTREELOG (1 << 6) +#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) +#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) +#define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) +#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) +#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) +#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) +#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) +#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) +#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ + BTRFS_MOUNT_##opt) +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1 << 0) +#define BTRFS_INODE_NODATACOW (1 << 1) +#define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) +#define BTRFS_INODE_PREALLOC (1 << 4) +#define BTRFS_INODE_SYNC (1 << 5) +#define BTRFS_INODE_IMMUTABLE (1 << 6) +#define BTRFS_INODE_APPEND (1 << 7) +#define BTRFS_INODE_NODUMP (1 << 8) +#define BTRFS_INODE_NOATIME (1 << 9) +#define BTRFS_INODE_DIRSYNC (1 << 10) +#define BTRFS_INODE_COMPRESS (1 << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) + +/* some macros to generate set/get funcs for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple + * one for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +#define read_eb_member(eb, ptr, type, member, result) ( \ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) ( \ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#ifndef BTRFS_SETGET_FUNCS +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); +#endif + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + u##bits res = le##bits##_to_cpu(p->member); \ + kunmap_atomic(p, KM_USER0); \ + return res; \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + p->member = cpu_to_le##bits(val); \ + kunmap_atomic(p, KM_USER0); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(type *s) \ +{ \ + return le##bits##_to_cpu(s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + s->member = cpu_to_le##bits(val); \ +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct 
btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, + start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, + dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct 
btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, + int nr) +{ + unsigned long offset = (unsigned long)c; + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_flags, + struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); + +static inline struct btrfs_timespec * +btrfs_inode_atime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, atime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_mtime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, mtime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_ctime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, ctime); + return (struct btrfs_timespec *)ptr; +} + +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); 
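As a quick illustration of how the generated accessors above fit together, here is a minimal sketch only: the function name is hypothetical, the slot is assumed to hold a BTRFS_INODE_ITEM_KEY item, and btrfs_item_ptr() is defined further down in this header.

static void example_read_inode_item(struct extent_buffer *leaf, int slot)
{
	struct btrfs_inode_item *ii;
	struct btrfs_timespec *atime;
	u64 size, atime_sec;
	u32 mode;

	/* map the item payload inside the leaf */
	ii = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);

	/* each accessor reads a little-endian field out of the extent buffer */
	size = btrfs_inode_size(leaf, ii);
	mode = btrfs_inode_mode(leaf, ii);

	/* timestamps are nested structs, reached through the offset helpers */
	atime = btrfs_inode_atime(ii);
	atime_sec = btrfs_timespec_sec(leaf, atime);

	printk(KERN_INFO "inode item: size=%llu mode=%o atime=%llu\n",
	       (unsigned long long)size, mode, (unsigned long long)atime_sec);
}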
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); + +static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +{ + unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); + return (u8 *)((unsigned long)dev + ptr); +} + +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); + + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, + root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + BUG(); + return 0; +} + +BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); +BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, + generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); +BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); + +static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + 
sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, + int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +static inline u32 btrfs_item_end(struct extent_buffer *eb, + struct btrfs_item *item) +{ + return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +} + +static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); +} + +static inline void btrfs_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* + * struct btrfs_root_ref + */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct 
btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + + +static inline u8 btrfs_key_type(struct btrfs_key *key) +{ + return key->type; +} + +static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) +{ + key->type = val; +} + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); + +static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags | flag); + return (flags & flag) == flag; +} + +static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags & ~flag); + return (flags & flag) == flag; +} + +static inline int btrfs_header_backref_rev(struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, + int rev) +{ + u64 flags = btrfs_header_flags(eb); + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << 
BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); + return (u8 *)ptr; +} + +static inline int btrfs_is_leaf(struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); + +static inline bool btrfs_root_readonly(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +/* struct btrfs_super_block */ + +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, + log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, + log_root_transid, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, + leafsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); 
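The *_STACK_* variants operate on an ordinary in-memory struct and do the little-endian conversion directly, rather than reading through an extent buffer; this is how the superblock copies embedded in btrfs_fs_info are accessed. A minimal sketch follows, with a hypothetical helper name and using the super_copy field declared earlier in this header.

static void example_dump_super(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *sb = &fs_info->super_copy;

	/* stack accessors convert endianness on the in-memory copy */
	u64 total = btrfs_super_total_bytes(sb);
	u32 sectorsize = btrfs_super_sectorsize(sb);
	u64 devices = btrfs_super_num_devices(sb);

	printk(KERN_INFO "btrfs super: total_bytes=%llu sectorsize=%u devices=%llu\n",
	       (unsigned long long)total, sectorsize,
	       (unsigned long long)devices);
}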
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_ro_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); + +static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +{ + int t = btrfs_super_csum_type(s); + BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + return btrfs_csum_sizes[t]; +} + +static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) +{ + return offsetof(struct btrfs_leaf, items); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); + +static inline unsigned long +btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +{ + unsigned long offset = (unsigned long)e; + offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); + return offset; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_file_extent_item *e) +{ + return btrfs_file_extent_ram_bytes(eb, e); +} + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); + return btrfs_item_size(eb, e) - offset; +} + +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline u32 btrfs_level_size(struct btrfs_root *root, int level) +{ + if (level == 0) + return root->leafsize; + return root->nodesize; +} + +/* helper function to cast into the data area of the leaf. 
*/ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +static inline struct dentry *fdentry(struct file *file) +{ + return file->f_path.dentry; +} + +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + +/* extent-tree.c */ +static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, + int num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + 3 * num_items; +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count); +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num, int reserved); +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner); +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref); +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + int level); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data); +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset); + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo); +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct 
btrfs_root *root); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset); + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); +int btrfs_free_block_groups(struct btrfs_fs_info *info); +int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start); +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items); +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode); +void btrfs_orphan_release_metadata(struct inode *inode); +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes); +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_error_unpin_extent_range(struct btrfs_root *root, + u64 start, u64 end); +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, 
u64 *actual_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +/* ctree.c */ +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); +struct extent_buffer *btrfs_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans); +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans); +int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret); +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u32 data_size); +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset); +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress); +void btrfs_release_path(struct btrfs_path *p); +struct btrfs_path *btrfs_alloc_path(void); +void btrfs_free_path(struct btrfs_path *p); +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held); +void btrfs_unlock_up_safe(struct btrfs_path *p, int level); + +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr); +static inline int btrfs_del_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + return btrfs_del_items(trans, root, path, path->slots[0], 1); +} + +int setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, void *data, u32 data_size); +int 
btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, int nr); + +static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u32 data_size) +{ + return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); +} + +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* + * Get synced with close_ctree() + */ + smp_mb(); + return fs_info->closing; +} + +/* root-item.c */ +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len); +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + btrfs_root_item *item, struct btrfs_key *key); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_find_orphan_roots(struct btrfs_root *tree_root); +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); + +/* dir-item.c */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const 
void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item); + +/* orphan.c */ +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); + +/* inode-item.c */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +/* file-item.c */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 logical_offset, u32 *dst); +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig); +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow); +int btrfs_csum_truncate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u64 isize); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit); +/* inode.c */ + +/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ +#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) +#define ClearPageChecked ClearPageFsMisc +#define SetPageChecked SetPageFsMisc +#define PageChecked PageFsMisc +#endif + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, 
struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, + u32 min_type); + +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + struct extent_state **cached_state); +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, u64 new_dirid); +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); + +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +int btrfs_readpage(struct file *file, struct page *page); +void btrfs_evict_inode(struct inode *inode); +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); +void btrfs_dirty_inode(struct inode *inode, int flags); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); +int btrfs_init_cachep(void); +void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *was_new); +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 end, + int create); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_cleanup(struct btrfs_root *root); +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); +int btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_root *root); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +extern const struct dentry_operations btrfs_dentry_operations; + +/* ioctl.c */ +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void btrfs_update_iflags(struct inode *inode); +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); +int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_pages); +/* file.c */ +int 
btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct inode *inode); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_sync_file(struct file *file, int datasync); +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +extern const struct file_operations btrfs_file_operations; +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end); +int btrfs_release_file(struct inode *inode, struct file *file); +void btrfs_drop_pages(struct page **pages, size_t num_pages); +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached); + +/* tree-defrag.c */ +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only); + +/* sysfs.c */ +int btrfs_init_sysfs(void); +void btrfs_exit_sysfs(void); + +/* xattr.c */ +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* super.c */ +int btrfs_parse_options(struct btrfs_root *root, char *options); +int btrfs_sync_fs(struct super_block *sb, int wait); +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno); + +#define btrfs_std_error(fs_info, errno) \ +do { \ + if ((errno)) \ + __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ +} while (0) + +/* acl.c */ +#ifdef CONFIG_BTRFS_FS_POSIX_ACL +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags); +#else +#define btrfs_check_acl NULL +#endif +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); +int btrfs_acl_chmod(struct inode *inode); + +/* relocation.c */ +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); + +/* scrub.c */ +int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, + struct btrfs_scrub_progress *progress, int readonly); +int btrfs_scrub_pause(struct btrfs_root *root); +int btrfs_scrub_pause_super(struct btrfs_root *root); +int btrfs_scrub_continue(struct btrfs_root *root); +int btrfs_scrub_continue_super(struct btrfs_root *root); +int btrfs_scrub_cancel(struct btrfs_root *root); +int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); +int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); +int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, + struct btrfs_scrub_progress *progress); + +#endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c new file mode 100644 index 00000000..98c68e65 --- /dev/null +++ b/fs/btrfs/delayed-inode.c @@ -0,0 +1,1773 @@ +/* + * Copyright 
(C) 2011 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "delayed-inode.h" +#include "disk-io.h" +#include "transaction.h" + +#define BTRFS_DELAYED_WRITEBACK 400 +#define BTRFS_DELAYED_BACKGROUND 100 + +static struct kmem_cache *delayed_node_cache; + +int __init btrfs_delayed_inode_init(void) +{ + delayed_node_cache = kmem_cache_create("delayed_node", + sizeof(struct btrfs_delayed_node), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!delayed_node_cache) + return -ENOMEM; + return 0; +} + +void btrfs_delayed_inode_exit(void) +{ + if (delayed_node_cache) + kmem_cache_destroy(delayed_node_cache); +} + +static inline void btrfs_init_delayed_node( + struct btrfs_delayed_node *delayed_node, + struct btrfs_root *root, u64 inode_id) +{ + delayed_node->root = root; + delayed_node->inode_id = inode_id; + atomic_set(&delayed_node->refs, 0); + delayed_node->count = 0; + delayed_node->in_list = 0; + delayed_node->inode_dirty = 0; + delayed_node->ins_root = RB_ROOT; + delayed_node->del_root = RB_ROOT; + mutex_init(&delayed_node->mutex); + delayed_node->index_cnt = 0; + INIT_LIST_HEAD(&delayed_node->n_list); + INIT_LIST_HEAD(&delayed_node->p_list); + delayed_node->bytes_reserved = 0; +} + +static inline int btrfs_is_continuous_delayed_item( + struct btrfs_delayed_item *item1, + struct btrfs_delayed_item *item2) +{ + if (item1->key.type == BTRFS_DIR_INDEX_KEY && + item1->key.objectid == item2->key.objectid && + item1->key.type == item2->key.type && + item1->key.offset + 1 == item2->key.offset) + return 1; + return 0; +} + +static inline struct btrfs_delayed_root *btrfs_get_delayed_root( + struct btrfs_root *root) +{ + return root->fs_info->delayed_root; +} + +static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) +{ + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + struct btrfs_delayed_node *node; + + node = ACCESS_ONCE(btrfs_inode->delayed_node); + if (node) { + atomic_inc(&node->refs); + return node; + } + + spin_lock(&root->inode_lock); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + if (node) { + if (btrfs_inode->delayed_node) { + atomic_inc(&node->refs); /* can be accessed */ + BUG_ON(btrfs_inode->delayed_node != node); + spin_unlock(&root->inode_lock); + return node; + } + btrfs_inode->delayed_node = node; + atomic_inc(&node->refs); /* can be accessed */ + atomic_inc(&node->refs); /* cached in the inode */ + spin_unlock(&root->inode_lock); + return node; + } + spin_unlock(&root->inode_lock); + + return NULL; +} + +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( + struct inode *inode) +{ + struct btrfs_delayed_node *node; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + int ret; + 
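/*
 * Lookup-or-create with a retry: the radix tree insert further down can race
 * with another task instantiating the same delayed node.  On -EEXIST the
 * freshly allocated node is freed and we jump back to "again" to pick up the
 * node the other task installed.  On success two references are held: one
 * for the inode's cached delayed_node pointer and one handed to the caller.
 */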
+again: + node = btrfs_get_delayed_node(inode); + if (node) + return node; + + node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); + + atomic_inc(&node->refs); /* cached in the btrfs inode */ + atomic_inc(&node->refs); /* can be accessed */ + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + kmem_cache_free(delayed_node_cache, node); + spin_unlock(&root->inode_lock); + radix_tree_preload_end(); + goto again; + } + btrfs_inode->delayed_node = node; + spin_unlock(&root->inode_lock); + radix_tree_preload_end(); + + return node; +} + +/* + * Call it when holding delayed_node->mutex + * + * If mod = 1, add this node into the prepared list. + */ +static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node, + int mod) +{ + spin_lock(&root->lock); + if (node->in_list) { + if (!list_empty(&node->p_list)) + list_move_tail(&node->p_list, &root->prepare_list); + else if (mod) + list_add_tail(&node->p_list, &root->prepare_list); + } else { + list_add_tail(&node->n_list, &root->node_list); + list_add_tail(&node->p_list, &root->prepare_list); + atomic_inc(&node->refs); /* inserted into list */ + root->nodes++; + node->in_list = 1; + } + spin_unlock(&root->lock); +} + +/* Call it when holding delayed_node->mutex */ +static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, + struct btrfs_delayed_node *node) +{ + spin_lock(&root->lock); + if (node->in_list) { + root->nodes--; + atomic_dec(&node->refs); /* not in the list */ + list_del_init(&node->n_list); + if (!list_empty(&node->p_list)) + list_del_init(&node->p_list); + node->in_list = 0; + } + spin_unlock(&root->lock); +} + +struct btrfs_delayed_node *btrfs_first_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->node_list)) + goto out; + + p = delayed_root->node_list.next; + node = list_entry(p, struct btrfs_delayed_node, n_list); + atomic_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +struct btrfs_delayed_node *btrfs_next_delayed_node( + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_root *delayed_root; + struct list_head *p; + struct btrfs_delayed_node *next = NULL; + + delayed_root = node->root->fs_info->delayed_root; + spin_lock(&delayed_root->lock); + if (!node->in_list) { /* not in the list */ + if (list_empty(&delayed_root->node_list)) + goto out; + p = delayed_root->node_list.next; + } else if (list_is_last(&node->n_list, &delayed_root->node_list)) + goto out; + else + p = node->n_list.next; + + next = list_entry(p, struct btrfs_delayed_node, n_list); + atomic_inc(&next->refs); +out: + spin_unlock(&delayed_root->lock); + + return next; +} + +static void __btrfs_release_delayed_node( + struct btrfs_delayed_node *delayed_node, + int mod) +{ + struct btrfs_delayed_root *delayed_root; + + if (!delayed_node) + return; + + delayed_root = delayed_node->root->fs_info->delayed_root; + + mutex_lock(&delayed_node->mutex); + if (delayed_node->count) + btrfs_queue_delayed_node(delayed_root, delayed_node, mod); + else + btrfs_dequeue_delayed_node(delayed_root, delayed_node); + mutex_unlock(&delayed_node->mutex); + + if 
(atomic_dec_and_test(&delayed_node->refs)) { + struct btrfs_root *root = delayed_node->root; + spin_lock(&root->inode_lock); + if (atomic_read(&delayed_node->refs) == 0) { + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); + kmem_cache_free(delayed_node_cache, delayed_node); + } + spin_unlock(&root->inode_lock); + } +} + +static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 0); +} + +struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( + struct btrfs_delayed_root *delayed_root) +{ + struct list_head *p; + struct btrfs_delayed_node *node = NULL; + + spin_lock(&delayed_root->lock); + if (list_empty(&delayed_root->prepare_list)) + goto out; + + p = delayed_root->prepare_list.next; + list_del_init(p); + node = list_entry(p, struct btrfs_delayed_node, p_list); + atomic_inc(&node->refs); +out: + spin_unlock(&delayed_root->lock); + + return node; +} + +static inline void btrfs_release_prepared_delayed_node( + struct btrfs_delayed_node *node) +{ + __btrfs_release_delayed_node(node, 1); +} + +struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) +{ + struct btrfs_delayed_item *item; + item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); + if (item) { + item->data_len = data_len; + item->ins_or_del = 0; + item->bytes_reserved = 0; + item->delayed_node = NULL; + atomic_set(&item->refs, 1); + } + return item; +} + +/* + * __btrfs_lookup_delayed_item - look up the delayed item by key + * @delayed_node: pointer to the delayed node + * @key: the key to look up + * @prev: used to store the prev item if the right item isn't found + * @next: used to store the next item if the right item isn't found + * + * Note: if we don't find the right item, we will return the prev item and + * the next item. 
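 * For example, with items keyed {10, 30} in the tree, a lookup of 20 returns
 * NULL and sets *prev to the key-10 item and *next to the key-30 item.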
+ */ +static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( + struct rb_root *root, + struct btrfs_key *key, + struct btrfs_delayed_item **prev, + struct btrfs_delayed_item **next) +{ + struct rb_node *node, *prev_node = NULL; + struct btrfs_delayed_item *delayed_item = NULL; + int ret = 0; + + node = root->rb_node; + + while (node) { + delayed_item = rb_entry(node, struct btrfs_delayed_item, + rb_node); + prev_node = node; + ret = btrfs_comp_cpu_keys(&delayed_item->key, key); + if (ret < 0) + node = node->rb_right; + else if (ret > 0) + node = node->rb_left; + else + return delayed_item; + } + + if (prev) { + if (!prev_node) + *prev = NULL; + else if (ret < 0) + *prev = delayed_item; + else if ((node = rb_prev(prev_node)) != NULL) { + *prev = rb_entry(node, struct btrfs_delayed_item, + rb_node); + } else + *prev = NULL; + } + + if (next) { + if (!prev_node) + *next = NULL; + else if (ret > 0) + *next = delayed_item; + else if ((node = rb_next(prev_node)) != NULL) { + *next = rb_entry(node, struct btrfs_delayed_item, + rb_node); + } else + *next = NULL; + } + return NULL; +} + +struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, + NULL, NULL); + return item; +} + +struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key, + NULL, NULL); + return item; +} + +struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item, *next; + + item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, + NULL, &next); + if (!item) + item = next; + + return item; +} + +struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item( + struct btrfs_delayed_node *delayed_node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item, *next; + + item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key, + NULL, &next); + if (!item) + item = next; + + return item; +} + +static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, + struct btrfs_delayed_item *ins, + int action) +{ + struct rb_node **p, *node; + struct rb_node *parent_node = NULL; + struct rb_root *root; + struct btrfs_delayed_item *item; + int cmp; + + if (action == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_node->ins_root; + else if (action == BTRFS_DELAYED_DELETION_ITEM) + root = &delayed_node->del_root; + else + BUG(); + p = &root->rb_node; + node = &ins->rb_node; + + while (*p) { + parent_node = *p; + item = rb_entry(parent_node, struct btrfs_delayed_item, + rb_node); + + cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); + if (cmp < 0) + p = &(*p)->rb_right; + else if (cmp > 0) + p = &(*p)->rb_left; + else + return -EEXIST; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + ins->delayed_node = delayed_node; + ins->ins_or_del = action; + + if (ins->key.type == BTRFS_DIR_INDEX_KEY && + action == BTRFS_DELAYED_INSERTION_ITEM && + ins->key.offset >= delayed_node->index_cnt) + delayed_node->index_cnt = ins->key.offset + 1; + + delayed_node->count++; + atomic_inc(&delayed_node->root->fs_info->delayed_root->items); + return 0; +} + +static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node 
*node, + struct btrfs_delayed_item *item) +{ + return __btrfs_add_delayed_item(node, item, + BTRFS_DELAYED_INSERTION_ITEM); +} + +static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, + struct btrfs_delayed_item *item) +{ + return __btrfs_add_delayed_item(node, item, + BTRFS_DELAYED_DELETION_ITEM); +} + +static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) +{ + struct rb_root *root; + struct btrfs_delayed_root *delayed_root; + + delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; + + BUG_ON(!delayed_root); + BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM && + delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM); + + if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) + root = &delayed_item->delayed_node->ins_root; + else + root = &delayed_item->delayed_node->del_root; + + rb_erase(&delayed_item->rb_node, root); + delayed_item->delayed_node->count--; + atomic_dec(&delayed_root->items); + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && + waitqueue_active(&delayed_root->wait)) + wake_up(&delayed_root->wait); +} + +static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) +{ + if (item) { + __btrfs_remove_delayed_item(item); + if (atomic_dec_and_test(&item->refs)) + kfree(item); + } +} + +struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first(&delayed_node->ins_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( + struct btrfs_delayed_node *delayed_node) +{ + struct rb_node *p; + struct btrfs_delayed_item *item = NULL; + + p = rb_first(&delayed_node->del_root); + if (p) + item = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return item; +} + +struct btrfs_delayed_item *__btrfs_next_delayed_item( + struct btrfs_delayed_item *item) +{ + struct rb_node *p; + struct btrfs_delayed_item *next = NULL; + + p = rb_next(&item->rb_node); + if (p) + next = rb_entry(p, struct btrfs_delayed_item, rb_node); + + return next; +} + +static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, + u64 root_id) +{ + struct btrfs_key root_key; + + if (root->objectid == root_id) + return root; + + root_key.objectid = root_id; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + return btrfs_read_fs_root_no_name(root->fs_info, &root_key); +} + +static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + u64 num_bytes; + int ret; + + if (!trans->bytes_reserved) + return 0; + + src_rsv = trans->block_rsv; + dst_rsv = &root->fs_info->global_block_rsv; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + item->bytes_reserved = num_bytes; + + return ret; +} + +static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, + struct btrfs_delayed_item *item) +{ + struct btrfs_block_rsv *rsv; + + if (!item->bytes_reserved) + return; + + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, + item->bytes_reserved); +} + +static int btrfs_delayed_inode_reserve_metadata( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct 
btrfs_delayed_node *node) +{ + struct btrfs_block_rsv *src_rsv; + struct btrfs_block_rsv *dst_rsv; + u64 num_bytes; + int ret; + + if (!trans->bytes_reserved) + return 0; + + src_rsv = trans->block_rsv; + dst_rsv = &root->fs_info->global_block_rsv; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + node->bytes_reserved = num_bytes; + + return ret; +} + +static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_block_rsv *rsv; + + if (!node->bytes_reserved) + return; + + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, + node->bytes_reserved); + node->bytes_reserved = 0; +} + +/* + * This helper will insert some continuous items into the same leaf according + * to the free space of the leaf. + */ +static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) +{ + struct btrfs_delayed_item *curr, *next; + int free_space; + int total_data_size = 0, total_size = 0; + struct extent_buffer *leaf; + char *data_ptr; + struct btrfs_key *keys; + u32 *data_size; + struct list_head head; + int slot; + int nitems; + int i; + int ret = 0; + + BUG_ON(!path->nodes[0]); + + leaf = path->nodes[0]; + free_space = btrfs_leaf_free_space(root, leaf); + INIT_LIST_HEAD(&head); + + next = item; + nitems = 0; + + /* + * count the number of the continuous items that we can insert in batch + */ + while (total_size + next->data_len + sizeof(struct btrfs_item) <= + free_space) { + total_data_size += next->data_len; + total_size += next->data_len + sizeof(struct btrfs_item); + list_add_tail(&next->tree_list, &head); + nitems++; + + curr = next; + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + if (!btrfs_is_continuous_delayed_item(curr, next)) + break; + } + + if (!nitems) { + ret = 0; + goto out; + } + + /* + * we need allocate some memory space, but it might cause the task + * to sleep, so we set all locked nodes in the path to blocking locks + * first. + */ + btrfs_set_path_blocking(path); + + keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS); + if (!keys) { + ret = -ENOMEM; + goto out; + } + + data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS); + if (!data_size) { + ret = -ENOMEM; + goto error; + } + + /* get keys of all the delayed items */ + i = 0; + list_for_each_entry(next, &head, tree_list) { + keys[i] = next->key; + data_size[i] = next->data_len; + i++; + } + + /* reset all the locked nodes in the patch to spinning locks. */ + btrfs_clear_path_blocking(path, NULL); + + /* insert the keys of the items */ + ret = setup_items_for_insert(trans, root, path, keys, data_size, + total_data_size, total_size, nitems); + if (ret) + goto error; + + /* insert the dir index items */ + slot = path->slots[0]; + list_for_each_entry_safe(curr, next, &head, tree_list) { + data_ptr = btrfs_item_ptr(leaf, slot, char); + write_extent_buffer(leaf, &curr->data, + (unsigned long)data_ptr, + curr->data_len); + slot++; + + btrfs_delayed_item_release_metadata(root, curr); + + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } + +error: + kfree(data_size); + kfree(keys); +out: + return ret; +} + +/* + * This helper can just do simple insertion that needn't extend item for new + * data, such as directory name index insertion, inode insertion. 
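 * -EEXIST from the empty-item insertion is not treated as a failure below:
 * the search leaves the path at the existing item and its payload is simply
 * rewritten in place.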
+ */ +static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *delayed_item) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + char *ptr; + int ret; + + ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, + delayed_item->data_len); + if (ret < 0 && ret != -EEXIST) + return ret; + + leaf = path->nodes[0]; + + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + + write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, + delayed_item->data_len); + btrfs_mark_buffer_dirty(leaf); + + btrfs_delayed_item_release_metadata(root, delayed_item); + return 0; +} + +/* + * we insert an item first, then if there are some continuous items, we try + * to insert those items into the same leaf. + */ +static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_item *curr, *prev; + int ret = 0; + +do_again: + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_insertion_item(node); + if (!curr) + goto insert_end; + + ret = btrfs_insert_delayed_item(trans, root, path, curr); + if (ret < 0) { + btrfs_release_path(path); + goto insert_end; + } + + prev = curr; + curr = __btrfs_next_delayed_item(prev); + if (curr && btrfs_is_continuous_delayed_item(prev, curr)) { + /* insert the continuous items into the same leaf */ + path->slots[0]++; + btrfs_batch_insert_items(trans, root, path, curr); + } + btrfs_release_delayed_item(prev); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_release_path(path); + mutex_unlock(&node->mutex); + goto do_again; + +insert_end: + mutex_unlock(&node->mutex); + return ret; +} + +static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) +{ + struct btrfs_delayed_item *curr, *next; + struct extent_buffer *leaf; + struct btrfs_key key; + struct list_head head; + int nitems, i, last_item; + int ret = 0; + + BUG_ON(!path->nodes[0]); + + leaf = path->nodes[0]; + + i = path->slots[0]; + last_item = btrfs_header_nritems(leaf) - 1; + if (i > last_item) + return -ENOENT; /* FIXME: Is errno suitable? 
*/ + + next = item; + INIT_LIST_HEAD(&head); + btrfs_item_key_to_cpu(leaf, &key, i); + nitems = 0; + /* + * count the number of the dir index items that we can delete in batch + */ + while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { + list_add_tail(&next->tree_list, &head); + nitems++; + + curr = next; + next = __btrfs_next_delayed_item(curr); + if (!next) + break; + + if (!btrfs_is_continuous_delayed_item(curr, next)) + break; + + i++; + if (i > last_item) + break; + btrfs_item_key_to_cpu(leaf, &key, i); + } + + if (!nitems) + return 0; + + ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); + if (ret) + goto out; + + list_for_each_entry_safe(curr, next, &head, tree_list) { + btrfs_delayed_item_release_metadata(root, curr); + list_del(&curr->tree_list); + btrfs_release_delayed_item(curr); + } + +out: + return ret; +} + +static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_delayed_node *node) +{ + struct btrfs_delayed_item *curr, *prev; + int ret = 0; + +do_again: + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_deletion_item(node); + if (!curr) + goto delete_fail; + + ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); + if (ret < 0) + goto delete_fail; + else if (ret > 0) { + /* + * can't find the item which the node points to, so this node + * is invalid, just drop it. + */ + prev = curr; + curr = __btrfs_next_delayed_item(prev); + btrfs_release_delayed_item(prev); + ret = 0; + btrfs_release_path(path); + if (curr) + goto do_again; + else + goto delete_fail; + } + + btrfs_batch_delete_items(trans, root, path, curr); + btrfs_release_path(path); + mutex_unlock(&node->mutex); + goto do_again; + +delete_fail: + btrfs_release_path(path); + mutex_unlock(&node->mutex); + return ret; +} + +static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + if (delayed_node && delayed_node->inode_dirty) { + BUG_ON(!delayed_node->root); + delayed_node->inode_dirty = 0; + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + atomic_dec(&delayed_root->items); + if (atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND && + waitqueue_active(&delayed_root->wait)) + wake_up(&delayed_root->wait); + } +} + +static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + struct btrfs_key key; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + int ret; + + mutex_lock(&node->mutex); + if (!node->inode_dirty) { + mutex_unlock(&node->mutex); + return 0; + } + + key.objectid = node->inode_id; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + ret = btrfs_lookup_inode(trans, root, path, &key, 1); + if (ret > 0) { + btrfs_release_path(path); + mutex_unlock(&node->mutex); + return -ENOENT; + } else if (ret < 0) { + mutex_unlock(&node->mutex); + return ret; + } + + btrfs_unlock_up_safe(path, 1); + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item)); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + btrfs_delayed_inode_release_metadata(root, node); + btrfs_release_delayed_inode(node); + mutex_unlock(&node->mutex); + + return 0; +} + +/* Called when committing the transaction. 
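 * Every delayed node on the fs-wide list is processed in turn: pending
 * insertions are flushed first, then pending deletions, then the deferred
 * inode item update, with trans->block_rsv temporarily switched to the
 * global block reserve for the duration.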
*/ +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + + delayed_root = btrfs_get_delayed_root(root); + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node) { + root = curr_node->root; + ret = btrfs_insert_delayed_items(trans, path, root, + curr_node); + if (!ret) + ret = btrfs_delete_delayed_items(trans, path, root, + curr_node); + if (!ret) + ret = btrfs_update_delayed_inode(trans, root, path, + curr_node); + if (ret) { + btrfs_release_delayed_node(curr_node); + break; + } + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } + + btrfs_free_path(path); + trans->block_rsv = block_rsv; + return ret; +} + +static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_delayed_node *node) +{ + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &node->root->fs_info->global_block_rsv; + + ret = btrfs_insert_delayed_items(trans, path, node->root, node); + if (!ret) + ret = btrfs_delete_delayed_items(trans, path, node->root, node); + if (!ret) + ret = btrfs_update_delayed_inode(trans, node->root, path, node); + btrfs_free_path(path); + + trans->block_rsv = block_rsv; + return ret; +} + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + int ret; + + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->count) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + + ret = __btrfs_commit_inode_delayed_items(trans, delayed_node); + btrfs_release_delayed_node(delayed_node); + return ret; +} + +void btrfs_remove_delayed_node(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node); + if (!delayed_node) + return; + + BTRFS_I(inode)->delayed_node = NULL; + btrfs_release_delayed_node(delayed_node); +} + +struct btrfs_async_delayed_node { + struct btrfs_root *root; + struct btrfs_delayed_node *delayed_node; + struct btrfs_work work; +}; + +static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +{ + struct btrfs_async_delayed_node *async_node; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_delayed_node *delayed_node = NULL; + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + unsigned long nr = 0; + int need_requeue = 0; + int ret; + + async_node = container_of(work, struct btrfs_async_delayed_node, work); + + path = btrfs_alloc_path(); + if (!path) + goto out; + path->leave_spinning = 1; + + delayed_node = async_node->delayed_node; + root = delayed_node->root; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + goto free_path; + + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + + ret = 
btrfs_insert_delayed_items(trans, path, root, delayed_node); + if (!ret) + ret = btrfs_delete_delayed_items(trans, path, root, + delayed_node); + + if (!ret) + btrfs_update_delayed_inode(trans, root, path, delayed_node); + + /* + * Maybe new delayed items have been inserted, so we need requeue + * the work. Besides that, we must dequeue the empty delayed nodes + * to avoid the race between delayed items balance and the worker. + * The race like this: + * Task1 Worker thread + * count == 0, needn't requeue + * also needn't insert the + * delayed node into prepare + * list again. + * add lots of delayed items + * queue the delayed node + * already in the list, + * and not in the prepare + * list, it means the delayed + * node is being dealt with + * by the worker. + * do delayed items balance + * the delayed node is being + * dealt with by the worker + * now, just wait. + * the worker goto idle. + * Task1 will sleep until the transaction is commited. + */ + mutex_lock(&delayed_node->mutex); + if (delayed_node->count) + need_requeue = 1; + else + btrfs_dequeue_delayed_node(root->fs_info->delayed_root, + delayed_node); + mutex_unlock(&delayed_node->mutex); + + nr = trans->blocks_used; + + trans->block_rsv = block_rsv; + btrfs_end_transaction_dmeta(trans, root); + __btrfs_btree_balance_dirty(root, nr); +free_path: + btrfs_free_path(path); +out: + if (need_requeue) + btrfs_requeue_work(&async_node->work); + else { + btrfs_release_prepared_delayed_node(delayed_node); + kfree(async_node); + } +} + +static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, + struct btrfs_root *root, int all) +{ + struct btrfs_async_delayed_node *async_node; + struct btrfs_delayed_node *curr; + int count = 0; + +again: + curr = btrfs_first_prepared_delayed_node(delayed_root); + if (!curr) + return 0; + + async_node = kmalloc(sizeof(*async_node), GFP_NOFS); + if (!async_node) { + btrfs_release_prepared_delayed_node(curr); + return -ENOMEM; + } + + async_node->root = root; + async_node->delayed_node = curr; + + async_node->work.func = btrfs_async_run_delayed_node_done; + async_node->work.flags = 0; + + btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work); + count++; + + if (all || count < 4) + goto again; + + return 0; +} + +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + +void btrfs_balance_delayed_items(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + + delayed_root = btrfs_get_delayed_root(root); + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + return; + + if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { + int ret; + ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); + if (ret) + return; + + wait_event_interruptible_timeout( + delayed_root->wait, + (atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND), + HZ); + return; + } + + btrfs_wq_run_delayed_node(delayed_root, root, 0); +} + +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_disk_key *disk_key, u8 type, + u64 index) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *delayed_item; + struct btrfs_dir_item *dir_item; + int ret; + + delayed_node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + 
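/*
 * A delayed item carries its payload inline in item->data[] (the zero-length
 * array at the end of struct btrfs_delayed_item), so the allocation below
 * reserves room for the struct btrfs_dir_item plus the name bytes, which are
 * copied directly behind the dir item:
 *
 *	[ struct btrfs_delayed_item | struct btrfs_dir_item | name ]
 */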
delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len); + if (!delayed_item) { + ret = -ENOMEM; + goto release_node; + } + + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + delayed_item->key.objectid = btrfs_ino(dir); + btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); + delayed_item->key.offset = index; + + dir_item = (struct btrfs_dir_item *)delayed_item->data; + dir_item->location = *disk_key; + dir_item->transid = cpu_to_le64(trans->transid); + dir_item->data_len = 0; + dir_item->name_len = cpu_to_le16(name_len); + dir_item->type = type; + memcpy((char *)(dir_item + 1), name, name_len); + + mutex_lock(&delayed_node->mutex); + ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); + if (unlikely(ret)) { + printk(KERN_ERR "err add delayed dir index item(name: %s) into " + "the insertion tree of the delayed node" + "(root id: %llu, inode id: %llu, errno: %d)\n", + name, + (unsigned long long)delayed_node->root->objectid, + (unsigned long long)delayed_node->inode_id, + ret); + BUG(); + } + mutex_unlock(&delayed_node->mutex); + +release_node: + btrfs_release_delayed_node(delayed_node); + return ret; +} + +static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root, + struct btrfs_delayed_node *node, + struct btrfs_key *key) +{ + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_lookup_delayed_insertion_item(node, key); + if (!item) { + mutex_unlock(&node->mutex); + return 1; + } + + btrfs_delayed_item_release_metadata(root, item); + btrfs_release_delayed_item(item); + mutex_unlock(&node->mutex); + return 0; +} + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *dir, + u64 index) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + struct btrfs_key item_key; + int ret; + + node = btrfs_get_or_create_delayed_node(dir); + if (IS_ERR(node)) + return PTR_ERR(node); + + item_key.objectid = btrfs_ino(dir); + btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); + item_key.offset = index; + + ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); + if (!ret) + goto end; + + item = btrfs_alloc_delayed_item(0); + if (!item) { + ret = -ENOMEM; + goto end; + } + + item->key = item_key; + + ret = btrfs_delayed_item_reserve_metadata(trans, root, item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible. + */ + BUG_ON(ret); + + mutex_lock(&node->mutex); + ret = __btrfs_add_delayed_deletion_item(node, item); + if (unlikely(ret)) { + printk(KERN_ERR "err add delayed dir index item(index: %llu) " + "into the deletion tree of the delayed node" + "(root id: %llu, inode id: %llu, errno: %d)\n", + (unsigned long long)index, + (unsigned long long)node->root->objectid, + (unsigned long long)node->inode_id, + ret); + BUG(); + } + mutex_unlock(&node->mutex); +end: + btrfs_release_delayed_node(node); + return ret; +} + +int btrfs_inode_delayed_dir_index_count(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + + if (!delayed_node) + return -ENOENT; + + /* + * Since we have held i_mutex of this directory, it is impossible that + * a new directory index is added into the delayed node and index_cnt + * is updated now. So we needn't lock the delayed node. 
+ */ + if (!delayed_node->index_cnt) { + btrfs_release_delayed_node(delayed_node); + return -EINVAL; + } + + BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; + btrfs_release_delayed_node(delayed_node); + return 0; +} + +void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_delayed_item *item; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return; + + mutex_lock(&delayed_node->mutex); + item = __btrfs_first_delayed_insertion_item(delayed_node); + while (item) { + atomic_inc(&item->refs); + list_add_tail(&item->readdir_list, ins_list); + item = __btrfs_next_delayed_item(item); + } + + item = __btrfs_first_delayed_deletion_item(delayed_node); + while (item) { + atomic_inc(&item->refs); + list_add_tail(&item->readdir_list, del_list); + item = __btrfs_next_delayed_item(item); + } + mutex_unlock(&delayed_node->mutex); + /* + * This delayed node is still cached in the btrfs inode, so refs + * must be > 1 now, and we needn't check it is going to be freed + * or not. + * + * Besides that, this function is used to read dir, we do not + * insert/delete delayed items in this period. So we also needn't + * requeue or dequeue this delayed node. + */ + atomic_dec(&delayed_node->refs); +} + +void btrfs_put_delayed_items(struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_item *curr, *next; + + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + } + + list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_del(&curr->readdir_list); + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + } +} + +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index) +{ + struct btrfs_delayed_item *curr, *next; + int ret; + + if (list_empty(del_list)) + return 0; + + list_for_each_entry_safe(curr, next, del_list, readdir_list) { + if (curr->key.offset > index) + break; + + list_del(&curr->readdir_list); + ret = (curr->key.offset == index); + + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + + if (ret) + return 1; + else + continue; + } + return 0; +} + +/* + * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree + * + */ +int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, + filldir_t filldir, + struct list_head *ins_list) +{ + struct btrfs_dir_item *di; + struct btrfs_delayed_item *curr, *next; + struct btrfs_key location; + char *name; + int name_len; + int over = 0; + unsigned char d_type; + + if (list_empty(ins_list)) + return 0; + + /* + * Changing the data of the delayed item is impossible. So + * we needn't lock them. And we have held i_mutex of the + * directory, nobody can delete any directory indexes now. 
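 * Each cached insertion whose index is at or past filp->f_pos is handed to
 * filldir() below exactly as if it were already committed to the on-disk
 * directory, and f_pos is advanced to that index as we go.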
+ */ + list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + list_del(&curr->readdir_list); + + if (curr->key.offset < filp->f_pos) { + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + continue; + } + + filp->f_pos = curr->key.offset; + + di = (struct btrfs_dir_item *)curr->data; + name = (char *)(di + 1); + name_len = le16_to_cpu(di->name_len); + + d_type = btrfs_filetype_table[di->type]; + btrfs_disk_key_to_cpu(&location, &di->location); + + over = filldir(dirent, name, name_len, curr->key.offset, + location.objectid, d_type); + + if (atomic_dec_and_test(&curr->refs)) + kfree(curr); + + if (over) + return 1; + } + return 0; +} + +BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, + nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); + +static void fill_stack_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_inode_item *inode_item, + struct inode *inode) +{ + btrfs_set_stack_inode_uid(inode_item, inode->i_uid); + btrfs_set_stack_inode_gid(inode_item, inode->i_gid); + btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); + btrfs_set_stack_inode_generation(inode_item, + BTRFS_I(inode)->generation); + btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); + btrfs_set_stack_inode_transid(inode_item, trans->transid); + btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); + btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); + btrfs_set_stack_inode_block_group(inode_item, 0); + + btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), + inode->i_atime.tv_sec); + btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), + inode->i_atime.tv_nsec); + + btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), + inode->i_mtime.tv_sec); + btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), + inode->i_mtime.tv_nsec); + + btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), + inode->i_ctime.tv_sec); + btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), + inode->i_ctime.tv_nsec); +} + +int btrfs_fill_inode(struct inode *inode, u32 *rdev) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return -ENOENT; + + 
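/*
 * A delayed node whose inode item is not dirty holds nothing newer than the
 * on-disk copy, so -ENOENT below lets the caller fall back to reading the
 * btrfs_inode_item from the btree.  Otherwise the cached item is copied into
 * the VFS inode fields under the node's mutex.
 */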
mutex_lock(&delayed_node->mutex); + if (!delayed_node->inode_dirty) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return -ENOENT; + } + + inode_item = &delayed_node->inode_item; + + inode->i_uid = btrfs_stack_inode_uid(inode_item); + inode->i_gid = btrfs_stack_inode_gid(inode_item); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + inode->i_mode = btrfs_stack_inode_mode(inode_item); + inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); + BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); + inode->i_rdev = 0; + *rdev = btrfs_stack_inode_rdev(inode_item); + BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + inode->i_generation = BTRFS_I(inode)->generation; + BTRFS_I(inode)->index_cnt = (u64)-1; + + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + int ret = 0; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + mutex_lock(&delayed_node->mutex); + if (delayed_node->inode_dirty) { + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + goto release_node; + } + + ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); + /* + * we must reserve enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + delayed_node->inode_dirty = 1; + delayed_node->count++; + atomic_inc(&root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return ret; +} + +static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_root *root = delayed_node->root; + struct btrfs_delayed_item *curr_item, *prev_item; + + mutex_lock(&delayed_node->mutex); + curr_item = __btrfs_first_delayed_insertion_item(delayed_node); + while (curr_item) { + btrfs_delayed_item_release_metadata(root, curr_item); + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); + while (curr_item) { + btrfs_delayed_item_release_metadata(root, curr_item); + prev_item = curr_item; + curr_item = __btrfs_next_delayed_item(prev_item); + btrfs_release_delayed_item(prev_item); + } + + if (delayed_node->inode_dirty) { + btrfs_delayed_inode_release_metadata(root, delayed_node); + btrfs_release_delayed_inode(delayed_node); + } + mutex_unlock(&delayed_node->mutex); +} + +void btrfs_kill_delayed_inode_items(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = 
btrfs_get_delayed_node(inode); + if (!delayed_node) + return; + + __btrfs_kill_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node); +} + +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) +{ + u64 inode_id = 0; + struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; + + while (1) { + spin_lock(&root->inode_lock); + n = radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { + spin_unlock(&root->inode_lock); + break; + } + + inode_id = delayed_nodes[n - 1]->inode_id + 1; + + for (i = 0; i < n; i++) + atomic_inc(&delayed_nodes[i]->refs); + spin_unlock(&root->inode_lock); + + for (i = 0; i < n; i++) { + __btrfs_kill_delayed_node(delayed_nodes[i]); + btrfs_release_delayed_node(delayed_nodes[i]); + } + } +} diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h new file mode 100644 index 00000000..8d27af4b --- /dev/null +++ b/fs/btrfs/delayed-inode.h @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2011 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DELAYED_TREE_OPERATION_H +#define __DELAYED_TREE_OPERATION_H + +#include +#include +#include +#include +#include +#include + +#include "ctree.h" + +/* types of the delayed item */ +#define BTRFS_DELAYED_INSERTION_ITEM 1 +#define BTRFS_DELAYED_DELETION_ITEM 2 + +struct btrfs_delayed_root { + spinlock_t lock; + struct list_head node_list; + /* + * Used for delayed nodes which is waiting to be dealt with by the + * worker. If the delayed node is inserted into the work queue, we + * drop it from this list. + */ + struct list_head prepare_list; + atomic_t items; /* for delayed items */ + int nodes; /* for delayed nodes */ + wait_queue_head_t wait; +}; + +struct btrfs_delayed_node { + u64 inode_id; + u64 bytes_reserved; + struct btrfs_root *root; + /* Used to add the node into the delayed root's node list. */ + struct list_head n_list; + /* + * Used to add the node into the prepare list, the nodes in this list + * is waiting to be dealt with by the async worker. 
+ */ + struct list_head p_list; + struct rb_root ins_root; + struct rb_root del_root; + struct mutex mutex; + struct btrfs_inode_item inode_item; + atomic_t refs; + u64 index_cnt; + bool in_list; + bool inode_dirty; + int count; +}; + +struct btrfs_delayed_item { + struct rb_node rb_node; + struct btrfs_key key; + struct list_head tree_list; /* used for batch insert/delete items */ + struct list_head readdir_list; /* used for readdir items */ + u64 bytes_reserved; + struct btrfs_delayed_node *delayed_node; + atomic_t refs; + int ins_or_del; + u32 data_len; + char data[0]; +}; + +static inline void btrfs_init_delayed_root( + struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + +int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, struct inode *dir, + struct btrfs_disk_key *disk_key, u8 type, + u64 index); + +int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *dir, + u64 index); + +int btrfs_inode_delayed_dir_index_count(struct inode *inode); + +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +void btrfs_balance_delayed_items(struct btrfs_root *root); + +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode); +/* Used for evicting the inode. */ +void btrfs_remove_delayed_node(struct inode *inode); +void btrfs_kill_delayed_inode_items(struct inode *inode); + + +int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); +int btrfs_fill_inode(struct inode *inode, u32 *rdev); + +/* Used for drop dead root */ +void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); + +/* Used for readdir() */ +void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, + struct list_head *del_list); +void btrfs_put_delayed_items(struct list_head *ins_list, + struct list_head *del_list); +int btrfs_should_delete_dir_index(struct list_head *del_list, + u64 index); +int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, + filldir_t filldir, + struct list_head *ins_list); + +/* for init */ +int __init btrfs_delayed_inode_init(void); +void btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c new file mode 100644 index 00000000..125cf76f --- /dev/null +++ b/fs/btrfs/delayed-ref.c @@ -0,0 +1,711 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include "ctree.h" +#include "delayed-ref.h" +#include "transaction.h" + +/* + * delayed back reference update tracking. For subvolume trees + * we queue up extent allocations and backref maintenance for + * delayed processing. This avoids deep call chains where we + * add extents in the middle of btrfs_search_slot, and it allows + * us to buffer up frequently modified backrefs in an rb tree instead + * of hammering updates on the extent allocation tree. + */ + +/* + * compare two delayed tree backrefs with same bytenr and type + */ +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, + struct btrfs_delayed_tree_ref *ref1) +{ + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * compare two delayed data backrefs with same bytenr and type + */ +static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, + struct btrfs_delayed_data_ref *ref1) +{ + if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->objectid < ref2->objectid) + return -1; + if (ref1->objectid > ref2->objectid) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * entries in the rb tree are ordered by the byte number of the extent, + * type of the delayed backrefs and content of delayed backrefs. + */ +static int comp_entry(struct btrfs_delayed_ref_node *ref2, + struct btrfs_delayed_ref_node *ref1) +{ + if (ref1->bytenr < ref2->bytenr) + return -1; + if (ref1->bytenr > ref2->bytenr) + return 1; + if (ref1->is_head && ref2->is_head) + return 0; + if (ref2->is_head) + return -1; + if (ref1->is_head) + return 1; + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { + return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), + btrfs_delayed_node_to_tree_ref(ref1)); + } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), + btrfs_delayed_node_to_data_ref(ref1)); + } + BUG(); + return 0; +} + +/* + * insert a new ref into the rbtree. This returns any existing refs + * for the same (bytenr,parent) tuple, or NULL if the new node was properly + * inserted. + */ +static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_node *ins; + int cmp; + + ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + rb_node); + + cmp = comp_entry(entry, ins); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * find an head entry based on bytenr. 
This returns the delayed ref + * head if it was able to find one, or NULL if nothing was in that spot + */ +static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, + u64 bytenr, + struct btrfs_delayed_ref_node **last) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int cmp; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + if (last) + *last = entry; + + if (bytenr < entry->bytenr) + cmp = -1; + else if (bytenr > entry->bytenr) + cmp = 1; + else if (!btrfs_delayed_ref_is_head(entry)) + cmp = 1; + else + cmp = 0; + + if (cmp < 0) + n = n->rb_left; + else if (cmp > 0) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + assert_spin_locked(&delayed_refs->lock); + if (mutex_trylock(&head->mutex)) + return 0; + + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + spin_lock(&delayed_refs->lock); + if (!head->node.in_tree) { + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + btrfs_put_delayed_ref(&head->node); + return 0; +} + +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 start) +{ + int count = 0; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + + delayed_refs = &trans->transaction->delayed_refs; + if (start == 0) { + node = rb_first(&delayed_refs->root); + } else { + ref = NULL; + find_ref_head(&delayed_refs->root, start, &ref); + if (ref) { + struct btrfs_delayed_ref_node *tmp; + + node = rb_prev(&ref->rb_node); + while (node) { + tmp = rb_entry(node, + struct btrfs_delayed_ref_node, + rb_node); + if (tmp->bytenr < start) + break; + ref = tmp; + node = rb_prev(&ref->rb_node); + } + node = &ref->rb_node; + } else + node = rb_first(&delayed_refs->root); + } +again: + while (node && count < 32) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + if (list_empty(&head->cluster)) { + list_add_tail(&head->cluster, cluster); + delayed_refs->run_delayed_start = + head->node.bytenr; + count++; + + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + } else if (count) { + /* the goal of the clustering is to find extents + * that are likely to end up in the same extent + * leaf on disk. So, we don't want them spread + * all over the tree. Stop now if we've hit + * a head that was already in use + */ + break; + } + } + node = rb_next(node); + } + if (count) { + return 0; + } else if (start) { + /* + * we've gone to the end of the rbtree without finding any + * clusters. start from the beginning and try again + */ + start = 0; + node = rb_first(&delayed_refs->root); + goto again; + } + return 1; +} + +/* + * helper function to update an extent delayed ref in the + * rbtree. existing and update must both have the same + * bytenr and parent + * + * This may free existing if the update cancels out whatever + * operation it was doing. 
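 * For example, an add followed by a drop of the same backref in one
 * transaction cancels out: ref_mod reaches zero and the node is erased
 * without the extent allocation tree ever being touched.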
+ */ +static noinline void +update_existing_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + if (update->action != existing->action) { + /* + * this is effectively undoing either an add or a + * drop. We decrement the ref_mod, and if it goes + * down to zero we just delete the entry without + * every changing the extent allocation tree. + */ + existing->ref_mod--; + if (existing->ref_mod == 0) { + rb_erase(&existing->rb_node, + &delayed_refs->root); + existing->in_tree = 0; + btrfs_put_delayed_ref(existing); + delayed_refs->num_entries--; + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + } + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + /* + * the action on the existing ref matches + * the action on the ref we're trying to add. + * Bump the ref_mod by one so the backref that + * is eventually added/removed has the correct + * reference count + */ + existing->ref_mod += update->ref_mod; + } +} + +/* + * helper function to update the accounting in the head ref + * existing and update must have the same bytenr + */ +static noinline void +update_existing_head_ref(struct btrfs_delayed_ref_node *existing, + struct btrfs_delayed_ref_node *update) +{ + struct btrfs_delayed_ref_head *existing_ref; + struct btrfs_delayed_ref_head *ref; + + existing_ref = btrfs_delayed_node_to_head(existing); + ref = btrfs_delayed_node_to_head(update); + BUG_ON(existing_ref->is_data != ref->is_data); + + if (ref->must_insert_reserved) { + /* if the extent was freed and then + * reallocated before the delayed ref + * entries were processed, we can end up + * with an existing head ref without + * the must_insert_reserved flag set. + * Set it again here + */ + existing_ref->must_insert_reserved = ref->must_insert_reserved; + + /* + * update the num_bytes so we make sure the accounting + * is done correctly + */ + existing->num_bytes = update->num_bytes; + + } + + if (ref->extent_op) { + if (!existing_ref->extent_op) { + existing_ref->extent_op = ref->extent_op; + } else { + if (ref->extent_op->update_key) { + memcpy(&existing_ref->extent_op->key, + &ref->extent_op->key, + sizeof(ref->extent_op->key)); + existing_ref->extent_op->update_key = 1; + } + if (ref->extent_op->update_flags) { + existing_ref->extent_op->flags_to_set |= + ref->extent_op->flags_to_set; + existing_ref->extent_op->update_flags = 1; + } + kfree(ref->extent_op); + } + } + /* + * update the reference mod on the head to reflect this new operation + */ + existing->ref_mod += update->ref_mod; +} + +/* + * helper function to actually insert a head node into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count. + */ +static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, + int action, int is_data) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref_head *head_ref = NULL; + struct btrfs_delayed_ref_root *delayed_refs; + int count_mod = 1; + int must_insert_reserved = 0; + + /* + * the head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. 
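For orientation, a standalone toy (all names invented; only the +1/-1/0 weights come from add_delayed_ref_head() just below) showing how the head ref accumulates the net count of every queued operation:

#include <stdio.h>

int main(void)
{
	/* add, add-extent, drop, update-head */
	int count_mods[] = { +1, +1, -1, 0 };
	int head_ref_mod = 0;
	unsigned int i;

	for (i = 0; i < sizeof(count_mods) / sizeof(count_mods[0]); i++)
		head_ref_mod += count_mods[i];

	printf("net modification carried by the head ref: %+d\n", head_ref_mod);
	return 0;
}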
+ */ + if (action == BTRFS_UPDATE_DELAYED_HEAD) + count_mod = 0; + else if (action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; + + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update + * the reserved accounting when the extent is finally added, or + * if a later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record + * that accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not required. + */ + if (action == BTRFS_ADD_DELAYED_EXTENT) + must_insert_reserved = 1; + else + must_insert_reserved = 0; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = count_mod; + ref->type = 0; + ref->action = 0; + ref->is_head = 1; + ref->in_tree = 1; + + head_ref = btrfs_delayed_node_to_head(ref); + head_ref->must_insert_reserved = must_insert_reserved; + head_ref->is_data = is_data; + + INIT_LIST_HEAD(&head_ref->cluster); + mutex_init(&head_ref->mutex); + + trace_btrfs_delayed_ref_head(ref, head_ref, action); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_head_ref(existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed tree ref into the rbtree. + */ +static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_tree_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_tree_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_BLOCK_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_TREE_BLOCK_REF_KEY; + } + full_ref->level = level; + + trace_btrfs_delayed_tree_ref(ref, full_ref, action); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed data ref into the rbtree. 
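Aside (not from the patch): add_delayed_tree_ref() above and add_delayed_data_ref() below select the ref type with the same test. A toy sketch of that decision, with invented constants standing in for the key types from ctree.h:

#include <stdio.h>

/* toy stand-ins for BTRFS_TREE_BLOCK_REF_KEY / BTRFS_SHARED_BLOCK_REF_KEY */
enum { TREE_BLOCK_REF, SHARED_BLOCK_REF };

/* a non-zero parent means the block is shared between snapshots and the
 * backref is keyed by the referencing parent block; otherwise it is keyed
 * by the owning subvolume root.
 */
static int pick_type(unsigned long long parent)
{
	return parent ? SHARED_BLOCK_REF : TREE_BLOCK_REF;
}

int main(void)
{
	printf("parent=0      -> %s\n", pick_type(0) ? "shared ref" : "keyed by root");
	printf("parent=0x4000 -> %s\n", pick_type(0x4000) ? "shared ref" : "keyed by root");
	return 0;
}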
+ */ +static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_data_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_data_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_DATA_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_EXTENT_DATA_REF_KEY; + } + full_ref->objectid = owner; + full_ref->offset = offset; + + trace_btrfs_delayed_data_ref(ref, full_ref, action); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * add a delayed tree ref. This does all of the accounting required + * to make sure the delayed ref is eventually processed before this + * transaction commits. + */ +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_tree_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 0); + BUG_ON(ret); + + ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, level, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 
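For orientation, a hypothetical caller sketch (note_cow() and its parameters are invented; only btrfs_add_delayed_tree_ref() and the action constants come from this patch): when a tree block is COWed, the caller can queue a full-extent add for the new block and a drop for the old one, and neither call touches the extent allocation tree directly:

/* assumes ctree.h and delayed-ref.h from this patch are included */
static int note_cow(struct btrfs_trans_handle *trans, u64 new_bytenr,
		    u64 old_bytenr, u64 blocksize, u64 root_objectid, int level)
{
	int ret;

	/* new block: reserved allocation, extent item inserted later */
	ret = btrfs_add_delayed_tree_ref(trans, new_bytenr, blocksize, 0,
					 root_objectid, level,
					 BTRFS_ADD_DELAYED_EXTENT, NULL);
	if (ret)
		return ret;

	/* old block: queue the matching drop */
	return btrfs_add_delayed_tree_ref(trans, old_bytenr, blocksize, 0,
					  root_objectid, level,
					  BTRFS_DROP_DELAYED_REF, NULL);
}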
+ */ +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_data_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && !extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + /* + * insert both the head node and the new ref without dropping + * the spin lock + */ + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 1); + BUG_ON(ret); + + ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, owner, offset, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) + return -ENOMEM; + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); + BUG_ON(ret); + + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * this does a simple search for the head node for a given extent. + * It must be called with the delayed ref spinlock held, and it returns + * the head node if any where found, or NULL if not. + */ +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) +{ + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + if (ref) + return btrfs_delayed_node_to_head(ref); + return NULL; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h new file mode 100644 index 00000000..e287e3b0 --- /dev/null +++ b/fs/btrfs/delayed-ref.h @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#ifndef __DELAYED_REF__ +#define __DELAYED_REF__ + +/* these are the possible values of struct btrfs_delayed_ref->action */ +#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ +#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ +#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ +#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ + +struct btrfs_delayed_ref_node { + struct rb_node rb_node; + + /* the starting bytenr of the extent */ + u64 bytenr; + + /* the size of the extent */ + u64 num_bytes; + + /* ref count on this data structure */ + atomic_t refs; + + /* + * how many refs is this entry adding or deleting. For + * head refs, this may be a negative number because it is keeping + * track of the total mods done to the reference count. + * For individual refs, this will always be a positive number + * + * It may be more than one, since it is possible for a single + * parent to have more than one ref on an extent + */ + int ref_mod; + + unsigned int action:8; + unsigned int type:8; + /* is this node still in the rbtree? */ + unsigned int is_head:1; + unsigned int in_tree:1; +}; + +struct btrfs_delayed_extent_op { + struct btrfs_disk_key key; + u64 flags_to_set; + unsigned int update_key:1; + unsigned int update_flags:1; + unsigned int is_data:1; +}; + +/* + * the head refs are used to hold a lock on a given extent, which allows us + * to make sure that only one process is running the delayed refs + * at a time for a single extent. They also store the sum of all the + * reference count modifications we've queued up. + */ +struct btrfs_delayed_ref_head { + struct btrfs_delayed_ref_node node; + + /* + * the mutex is held while running the refs, and it is also + * held when checking the sum of reference modifications. + */ + struct mutex mutex; + + struct list_head cluster; + + struct btrfs_delayed_extent_op *extent_op; + /* + * when a new extent is allocated, it is just reserved in memory + * The actual extent isn't inserted into the extent allocation tree + * until the delayed ref is processed. must_insert_reserved is + * used to flag a delayed ref so the accounting can be updated + * when a full insert is done. + * + * It is possible the extent will be freed before it is ever + * inserted into the extent allocation tree. In this case + * we need to update the in ram accounting to properly reflect + * the free has happened. 
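Illustration (toy types, not from the patch) of the must_insert_reserved life cycle described in the comment above: the flag rides on the head ref so that whoever eventually runs it knows whether an extent item still has to be inserted, or whether only the reservation needs releasing because the add was cancelled before it ever hit disk:

#include <stdio.h>

struct toy_head { int ref_mod; unsigned int must_insert_reserved:1; };

int main(void)
{
	/* BTRFS_ADD_DELAYED_EXTENT: +1 and the reserved flag raised */
	struct toy_head h = { .ref_mod = 1, .must_insert_reserved = 1 };

	h.ref_mod--;	/* a later free cancels the add */
	if (h.ref_mod == 0 && h.must_insert_reserved)
		printf("nothing to insert; just release the reservation\n");
	return 0;
}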
+ */ + unsigned int must_insert_reserved:1; + unsigned int is_data:1; +}; + +struct btrfs_delayed_tree_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + int level; +}; + +struct btrfs_delayed_data_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + u64 objectid; + u64 offset; +}; + +struct btrfs_delayed_ref_root { + struct rb_root root; + + /* this spin lock protects the rbtree and the entries inside */ + spinlock_t lock; + + /* how many delayed ref updates we've queued, used by the + * throttling code + */ + unsigned long num_entries; + + /* total number of head nodes in tree */ + unsigned long num_heads; + + /* total number of head nodes ready for processing */ + unsigned long num_heads_ready; + + /* + * set when the tree is flushing before a transaction commit, + * used by the throttling code to decide if new updates need + * to be run right away + */ + int flushing; + + u64 run_delayed_start; +}; + +static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + WARN_ON(atomic_read(&ref->refs) == 0); + if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(ref->in_tree); + kfree(ref); + } +} + +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); + +struct btrfs_delayed_ref_head * +btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head); +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 search_start); +/* + * a node might live in a head or a regular ref, this lets you + * test for the proper type to use. + */ +static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) +{ + return node->is_head; +} + +/* + * helper functions to cast a node into its container + */ +static inline struct btrfs_delayed_tree_ref * +btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_tree_ref, node); +} + +static inline struct btrfs_delayed_data_ref * +btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_data_ref, node); +} + +static inline struct btrfs_delayed_ref_head * +btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(!btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_ref_head, node); +} +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 00000000..685f2593 --- /dev/null +++ b/fs/btrfs/dir-item.c @@ -0,0 +1,453 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" + +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ +static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, + u32 data_size, + const char *name, + int name_len) +{ + int ret; + char *ptr; + struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (ret == -EEXIST) { + struct btrfs_dir_item *di; + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return ERR_PTR(-EEXIST); + ret = btrfs_extend_item(trans, root, path, data_size); + } + if (ret < 0) + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + BUG_ON(data_size > btrfs_item_size(leaf, item)); + ptr += btrfs_item_size(leaf, item) - data_size; + return (struct btrfs_dir_item *)ptr; +} + +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len) +{ + int ret = 0; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + /* + * FIXME: at some point we should handle xattr's that are larger than + * what we can fit in our leaf. We set location to NULL b/c we arent + * pointing at anything else, that will change if we store the xattr + * data in a separate inode. 
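Aside (standalone toy, not part of the patch): the data_size / name_ptr / data_ptr arithmetic in btrfs_insert_xattr_item() mirrors the on-leaf layout of a dir/xattr item, a fixed header immediately followed by the name bytes and then the value bytes:

#include <stdio.h>
#include <string.h>

/* stand-in for the fixed struct btrfs_dir_item header */
struct toy_dir_item_hdr { unsigned short data_len, name_len; };

int main(void)
{
	const char *name = "user.comment";
	const char *value = "hello";
	size_t name_len = strlen(name), data_len = strlen(value);
	size_t data_size = sizeof(struct toy_dir_item_hdr) + name_len + data_len;
	unsigned char item[64];
	unsigned char *name_ptr = item + sizeof(struct toy_dir_item_hdr);
	unsigned char *data_ptr = name_ptr + name_len;

	memcpy(name_ptr, name, name_len);	/* name follows the header */
	memcpy(data_ptr, value, data_len);	/* value follows the name */
	printf("item size on the leaf: %zu bytes\n", data_size);
	return 0;
}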
+ */ + BUG_ON(IS_ERR(dir_item)); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + + return ret; +} + +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, const char *name, int name_len, + struct inode *dir, struct btrfs_key *location, + u8 type, u64 index) +{ + int ret = 0; + int ret2 = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + struct btrfs_disk_key disk_key; + u32 data_size; + + key.objectid = btrfs_ino(dir); + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + btrfs_cpu_key_to_disk(&disk_key, location); + + data_size = sizeof(*dir_item) + name_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + if (ret == -EEXIST) + goto second_insert; + goto out_free; + } + + leaf = path->nodes[0]; + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + +second_insert: + /* FIXME, use some real flag for selecting the extra index */ + if (root == root->fs_info->tree_root) { + ret = 0; + goto out_free; + } + btrfs_release_path(path); + + ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir, + &disk_key, type, index); +out_free: + btrfs_free_path(path); + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len) +{ + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u32 nritems; + int ret; + + key.objectid = dirid; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) + break; + + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return di; + + path->slots[0]++; + } + return NULL; +} + +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */ +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) +{ + struct btrfs_dir_item *dir_item; + unsigned long name_ptr; + u32 total_len; + u32 cur = 0; + u32 this_len; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + if (verify_dir_item(root, leaf, dir_item)) + return NULL; + + total_len = btrfs_item_size_nr(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); + + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return dir_item; + + cur += this_len; + dir_item = (struct btrfs_dir_item *)((char *)dir_item + + this_len); + } + return NULL; +} + +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. 
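Illustration (invented sizes, not from the patch): names that hash to the same key share a single leaf item, which is why btrfs_match_dir_item_name() above walks the entries with cur += this_len and why the delete helper below shifts the tail of the item instead of removing the whole item:

#include <stdio.h>

struct toy_entry { unsigned int name_len, data_len; };

int main(void)
{
	/* two colliding names packed back to back in one item */
	struct toy_entry entries[] = { { 3, 0 }, { 8, 0 } };
	unsigned int hdr = 30;	/* stand-in for sizeof(struct btrfs_dir_item) */
	unsigned int total = 0, cur = 0, i;

	for (i = 0; i < 2; i++)
		total += hdr + entries[i].name_len + entries[i].data_len;

	for (i = 0; cur < total && i < 2; i++) {
		printf("entry %u starts at offset %u within the item\n", i, cur);
		cur += hdr + entries[i].name_len + entries[i].data_len;
	}
	return 0;
}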
+ */ +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di) +{ + + struct extent_buffer *leaf; + u32 sub_item_len; + u32 item_len; + int ret = 0; + + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); + item_len = btrfs_item_size_nr(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_len - (ptr + sub_item_len - start)); + ret = btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); + } + return ret; +} + +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item) +{ + u16 namelen = BTRFS_NAME_LEN; + u8 type = btrfs_dir_type(leaf, dir_item); + + if (type >= BTRFS_FT_MAX) { + printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", + (int)type); + return 1; + } + + if (type == BTRFS_FT_XATTR) + namelen = XATTR_NAME_MAX; + + if (btrfs_dir_name_len(leaf, dir_item) > namelen) { + printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ + if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) { + printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + return 0; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 00000000..1ac8db5d --- /dev/null +++ b/fs/btrfs/disk-io.c @@ -0,0 +1,3096 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "print-tree.h" +#include "async-thread.h" +#include "locking.h" +#include "tree-log.h" +#include "free-space-cache.h" +#include "inode-map.h" + +static struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); +static void free_fs_root(struct btrfs_root *root); +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only); +static int btrfs_destroy_ordered_operations(struct btrfs_root *root); +static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root); +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark); +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); + +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete. This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ +struct end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + int metadata; + struct list_head list; + struct btrfs_work work; +}; + +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ +struct async_submit_bio { + struct inode *inode; + struct bio *bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_start; + extent_submit_bio_hook_t *submit_bio_done; + int rw; + int mirror_num; + unsigned long bio_flags; + /* + * bio_offset is optional, can be used if the pages in the bio + * can't tell us where in the file the bio should go + */ + u64 bio_offset; + struct btrfs_work work; +}; + +/* These are used to set the lockdep class on the extent buffer locks. + * The class is set by the readpage_end_io_hook after the buffer has + * passed csum validation but before the pages are unlocked. + * + * The lockdep class is also set by btrfs_init_new_buffer on freshly + * allocated blocks. + * + * The class is based on the level in the tree block, which allows lockdep + * to know that lower nodes nest inside the locks of higher nodes. + * + * We also add a check to make sure the highest level of the tree is + * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this + * code needs update as well. 
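Aside (walk_down_example() is hypothetical; btrfs_tree_lock()/btrfs_tree_unlock() are the helpers from locking.h elsewhere in this patch): the per-level classes matter because walking down the tree takes a child's lock while the parent's lock is still held; with a single shared class, lockdep would flag that nesting as possible recursive locking:

/* assumes ctree.h and locking.h from this patch are included */
static void walk_down_example(struct extent_buffer *parent,
			      struct extent_buffer *child)
{
	btrfs_tree_lock(parent);
	btrfs_tree_lock(child);		/* nested, but a different lockdep class */
	btrfs_tree_unlock(parent);
	/* ... work on child ... */
	btrfs_tree_unlock(child);
}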
+ */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# if BTRFS_MAX_LEVEL != 8 +# error +# endif +static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; +static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { + /* leaf */ + "btrfs-extent-00", + "btrfs-extent-01", + "btrfs-extent-02", + "btrfs-extent-03", + "btrfs-extent-04", + "btrfs-extent-05", + "btrfs-extent-06", + "btrfs-extent-07", + /* highest possible level */ + "btrfs-extent-08", +}; +#endif + +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ +static struct extent_map *btree_get_extent(struct inode *inode, + struct page *page, size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + read_unlock(&em_tree->lock); + goto out; + } + read_unlock(&em_tree->lock); + + em = alloc_extent_map(); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + em->start = 0; + em->len = (u64)-1; + em->block_len = (u64)-1; + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; + u64 failed_len = em->len; + + free_extent_map(em); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + ret = 0; + } else { + em = lookup_extent_mapping(em_tree, failed_start, + failed_len); + ret = -EIO; + } + } else if (ret) { + free_extent_map(em); + em = NULL; + } + write_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +out: + return em; +} + +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +{ + return crc32c(seed, data, len); +} + +void btrfs_csum_final(u32 crc, char *result) +{ + put_unaligned_le32(~crc, result); +} + +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. 
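Sketch (not part of the patch; assumes the kernel's crc32c(), put_unaligned_le32() and BTRFS_CSUM_SIZE, and a contiguously mapped buffer rather than the page-by-page mapping csum_tree_block() does): the recipe is a crc32c seeded with ~0 over everything past the checksum area, inverted and stored little-endian at offset 0:

static void toy_csum_block(char *block, size_t blocksize)
{
	u32 crc = ~(u32)0;

	crc = crc32c(crc, block + BTRFS_CSUM_SIZE, blocksize - BTRFS_CSUM_SIZE);
	put_unaligned_le32(~crc, block);	/* same ending as btrfs_csum_final() */
}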
+ */ +static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, + int verify) +{ + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + char *result = NULL; + unsigned long len; + unsigned long cur_len; + unsigned long offset = BTRFS_CSUM_SIZE; + char *map_token = NULL; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + int err; + u32 crc = ~(u32)0; + unsigned long inline_result; + + len = buf->len - offset; + while (len > 0) { + err = map_private_extent_buffer(buf, offset, 32, + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + if (err) + return 1; + cur_len = min(len, map_len - (offset - map_start)); + crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc, cur_len); + len -= cur_len; + offset += cur_len; + unmap_extent_buffer(buf, map_token, KM_USER0); + } + if (csum_size > sizeof(inline_result)) { + result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + if (!result) + return 1; + } else { + result = (char *)&inline_result; + } + + btrfs_csum_final(crc, result); + + if (verify) { + if (memcmp_extent_buffer(buf, result, 0, csum_size)) { + u32 val; + u32 found = 0; + memcpy(&found, result, csum_size); + + read_extent_buffer(buf, &val, 0, csum_size); + printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + if (result != (char *)&inline_result) + kfree(result); + return 1; + } + } else { + write_extent_buffer(buf, result, 0, csum_size); + } + if (result != (char *)&inline_result) + kfree(result); + return 0; +} + +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid) +{ + struct extent_state *cached_state = NULL; + int ret; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, + 0, &cached_state, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb, cached_state) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + printk_ratelimited("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + ret = 1; + clear_extent_buffer_uptodate(io_tree, eb, &cached_state); +out: + unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state, GFP_NOFS); + return ret; +} + +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. 
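Toy trace (standalone, not from the patch) of the retry policy used just below for metadata stored with two copies: mirror_num 0 lets the mapping layer pick a copy, and on failure the specific mirrors 1..num_copies are tried before giving up:

#include <stdio.h>

int main(void)
{
	int num_copies = 2, mirror_num = 0;

	for (;;) {
		printf("read attempt with mirror_num=%d\n", mirror_num);
		/* pretend every attempt fails its checksum/transid check */
		if (num_copies == 1)
			break;
		mirror_num++;
		if (mirror_num > num_copies)
			break;
	}
	return 0;
}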
+ */ +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int ret; + int num_copies = 0; + int mirror_num = 0; + + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; + + /* + * This buffer's crc is fine, but its contents are corrupted, so + * there is no reason to read the other copies, they won't be + * any less wrong. + */ + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + return ret; + + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); + if (num_copies == 1) + return ret; + + mirror_num++; + if (mirror_num > num_copies) + return ret; + } + return -EIO; +} + +/* + * checksum a dirty tree block before IO. This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block + */ + +static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +{ + struct extent_io_tree *tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 found_start; + unsigned long len; + struct extent_buffer *eb; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (page->private == EXTENT_PAGE_PRIVATE) { + WARN_ON(1); + goto out; + } + if (!page->private) { + WARN_ON(1); + goto out; + } + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) { + WARN_ON(1); + goto out; + } + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); + WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); + + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + WARN_ON(1); + goto err; + } + if (eb->first_page != page) { + WARN_ON(1); + goto err; + } + if (!PageUptodate(page)) { + WARN_ON(1); + goto err; + } + csum_tree_block(root, eb, 0); +err: + free_extent_buffer(eb); +out: + return 0; +} + +static int check_tree_block_fsid(struct btrfs_root *root, + struct extent_buffer *eb) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + u8 fsid[BTRFS_UUID_SIZE]; + int ret = 1; + + read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), + BTRFS_FSID_SIZE); + while (fs_devices) { + if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + ret = 0; + break; + } + fs_devices = fs_devices->seed; + } + return ret; +} + +#define CORRUPT(reason, eb, root, slot) \ + printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d\n", reason, \ + (unsigned long long)btrfs_header_bytenr(eb), \ + (unsigned long long)root->objectid, slot) + +static noinline int check_leaf(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_key key; + struct btrfs_key leaf_key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + if (nritems == 0) + return 0; + + /* Check the 0 item */ + if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("invalid item offset size pair", leaf, root, 0); + return -EIO; + } + + /* + * Check to make sure each items keys are in the correct order and their + * offsets make sense. 
We only have to loop through nritems-1 because + * we check the current slot against the next slot, which verifies the + * next slot's offset+size makes sense and that the current's slot + * offset is correct. + */ + for (slot = 0; slot < nritems - 1; slot++) { + btrfs_item_key_to_cpu(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &key, slot + 1); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { + CORRUPT("bad key order", leaf, root, slot); + return -EIO; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + CORRUPT("slot offset bad", leaf, root, slot); + return -EIO; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just incase all the items are consistent to eachother, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("slot end outside of leaf", leaf, root, slot); + return -EIO; + } + } + + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) +{ + lockdep_set_class_and_name(&eb->lock, + &btrfs_eb_class[level], + btrfs_eb_name[level]); +} +#endif + +static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) { + ret = -EIO; + goto out; + } + + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + printk_ratelimited(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + if (eb->first_page != page) { + printk(KERN_INFO "btrfs bad first page %lu %lu\n", + eb->first_page->index, page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } + if (check_tree_block_fsid(root, eb)) { + printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + + btrfs_set_buffer_lockdep_class(eb, found_level); + + ret = csum_tree_block(root, eb, 1); + if (ret) { + ret = -EIO; + goto err; + } + + /* + * If this is a leaf block and it is corrupt, set the corrupt bit so + * that we don't try and read the other copies of this block, just + * return -EIO. 
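Worked example (toy numbers, not from the patch) of the invariants check_leaf() enforces above: item data is packed from the end of the leaf towards the front, so item 0 must end exactly at the leaf data size and each item must begin where the next one ends:

#include <stdio.h>

struct toy_item { unsigned int offset, size; };

int main(void)
{
	const unsigned int leaf_data_size = 4096;
	struct toy_item items[2] = { { 3996, 100 }, { 3946, 50 } };

	printf("item 0 ends at %u (expect %u)\n",
	       items[0].offset + items[0].size, leaf_data_size);
	printf("item 0 starts where item 1 ends: %s\n",
	       items[0].offset == items[1].offset + items[1].size ? "yes" : "no");
	return 0;
}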
+ */ + if (found_level == 0 && check_leaf(root, eb)) { + set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + ret = -EIO; + } + + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; +err: + free_extent_buffer(eb); +out: + return ret; +} + +static void end_workqueue_bio(struct bio *bio, int err) +{ + struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + + fs_info = end_io_wq->info; + end_io_wq->error = err; + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + + if (bio->bi_rw & REQ_WRITE) { + if (end_io_wq->metadata == 1) + btrfs_queue_worker(&fs_info->endio_meta_write_workers, + &end_io_wq->work); + else if (end_io_wq->metadata == 2) + btrfs_queue_worker(&fs_info->endio_freespace_worker, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_write_workers, + &end_io_wq->work); + } else { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + } +} + +/* + * For the metadata arg you want + * + * 0 - if data + * 1 - if normal metadta + * 2 - if writing to the free space cache area + */ +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata) +{ + struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) +{ + unsigned long limit = min_t(unsigned long, + info->workers.max_workers, + info->fs_devices->open_devices); + return 256 * limit; +} + +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); +} + +static void run_one_async_done(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + int limit; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + + atomic_dec(&fs_info->nr_async_submits); + + if (atomic_read(&fs_info->nr_async_submits) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + async->submit_bio_done(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_start = submit_bio_start; + async->submit_bio_done = 
submit_bio_done; + + async->work.func = run_one_async_start; + async->work.ordered_func = run_one_async_done; + async->work.ordered_free = run_one_async_free; + + async->work.flags = 0; + async->bio_flags = bio_flags; + async->bio_offset = bio_offset; + + atomic_inc(&fs_info->nr_async_submits); + + if (rw & REQ_SYNC) + btrfs_set_work_high_prio(&async->work); + + btrfs_queue_worker(&fs_info->workers, &async->work); + + while (atomic_read(&fs_info->async_submit_draining) && + atomic_read(&fs_info->nr_async_submits)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0)); + } + + return 0; +} + +static int btree_csum_one_bio(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + struct btrfs_root *root; + + WARN_ON(bio->bi_vcnt <= 0); + while (bio_index < bio->bi_vcnt) { + root = BTRFS_I(bvec->bv_page->mapping->host)->root; + csum_dirty_buffer(root, bvec->bv_page); + bio_index++; + bvec++; + } + return 0; +} + +static int __btree_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, + u64 bio_offset) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + btree_csum_one_bio(bio); + return 0; +} + +static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + int ret; + + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + BUG_ON(ret); + + if (!(rw & REQ_WRITE)) { + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } + + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + bio_offset, + __btree_submit_bio_start, + __btree_submit_bio_done); +} + +#ifdef CONFIG_MIGRATION +static int btree_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + /* + * we can't safely write a btree page from here, + * we haven't done the locking hook + */ + if (PageDirty(page)) + return -EAGAIN; + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. 
+ */ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + return migrate_page(mapping, newpage, page); +} +#endif + +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct extent_buffer *eb; + int was_dirty; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (!(current->flags & PF_MEMALLOC)) { + return extent_write_full_page(tree, page, + btree_get_extent, wbc); + } + + redirty_page_for_writepage(wbc, page); + eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); + WARN_ON(!eb); + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; + spin_unlock(&root->fs_info->delalloc_lock); + } + free_extent_buffer(eb); + + unlock_page(page); + return 0; +} + +static int btree_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_root *root = BTRFS_I(mapping->host)->root; + u64 num_dirty; + unsigned long thresh = 32 * 1024 * 1024; + + if (wbc->for_kupdate) + return 0; + + /* this is a bit racy, but that's ok */ + num_dirty = root->fs_info->dirty_metadata_bytes; + if (num_dirty < thresh) + return 0; + } + return extent_writepages(tree, mapping, btree_get_extent, wbc); +} + +static int btree_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btree_get_extent); +} + +static int btree_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + if (PageWriteback(page) || PageDirty(page)) + return 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + + ret = try_release_extent_state(map, tree, page, gfp_flags); + if (!ret) + return 0; + + ret = try_release_extent_buffer(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + + return ret; +} + +static void btree_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + extent_invalidatepage(tree, page, offset); + btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + printk(KERN_WARNING "btrfs warning page private not zero " + "on page %llu\n", (unsigned long long)page_offset(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +static const struct address_space_operations btree_aops = { + .readpage = btree_readpage, + .writepage = btree_writepage, + .writepages = btree_writepages, + .releasepage = btree_releasepage, + .invalidatepage = btree_invalidatepage, +#ifdef CONFIG_MIGRATION + .migratepage = btree_migratepage, +#endif +}; + +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + int ret = 0; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, + buf, 0, 0, btree_get_extent, 0); + free_extent_buffer(buf); + 
return ret; +} + +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize); + return eb; +} + +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + + eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, NULL); + return eb; +} + + +int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + return filemap_fdatawait_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); +} + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return NULL; + + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); + return buf; + +} + +int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + if (btrfs_header_generation(buf) == + root->fs_info->running_transaction->transid) { + btrfs_assert_tree_locked(buf); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= buf->len) + root->fs_info->dirty_metadata_bytes -= buf->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + } + return 0; +} + +static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid) +{ + root->node = NULL; + root->commit_root = NULL; + root->sectorsize = sectorsize; + root->nodesize = nodesize; + root->leafsize = leafsize; + root->stripesize = stripesize; + root->ref_cows = 0; + root->track_dirty = 0; + root->in_radix = 0; + root->orphan_item_inserted = 0; + root->orphan_cleanup_state = 0; + + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; + root->highest_objectid = 0; + root->name = NULL; + root->inode_tree = RB_ROOT; + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + root->block_rsv = NULL; + root->orphan_block_rsv = NULL; + + INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->orphan_list); + INIT_LIST_HEAD(&root->root_list); + spin_lock_init(&root->orphan_lock); + spin_lock_init(&root->inode_lock); + spin_lock_init(&root->accounting_lock); + mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); + init_waitqueue_head(&root->log_writer_wait); + init_waitqueue_head(&root->log_commit_wait[0]); + init_waitqueue_head(&root->log_commit_wait[1]); + atomic_set(&root->log_commit[0], 0); + atomic_set(&root->log_commit[1], 0); + atomic_set(&root->log_writers, 0); + root->log_batch = 0; + root->log_transid = 0; + root->last_log_commit = 0; + 
extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping); + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + memset(&root->root_kobj, 0, sizeof(root->root_kobj)); + root->defrag_trans_start = fs_info->generation; + init_completion(&root->kobj_unregister); + root->defrag_running = 0; + root->root_key.objectid = objectid; + root->anon_super.s_root = NULL; + root->anon_super.s_dev = 0; + INIT_LIST_HEAD(&root->anon_super.s_list); + INIT_LIST_HEAD(&root->anon_super.s_instances); + init_rwsem(&root->anon_super.s_umount); + + return 0; +} + +static int find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) +{ + int ret; + u32 blocksize; + u64 generation; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); + if (ret > 0) + return -ENOENT; + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { + free_extent_buffer(root->node); + return -EIO; + } + root->commit_root = btrfs_root_node(root); + return 0; +} + +static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct btrfs_root *tree_root = fs_info->tree_root; + struct extent_buffer *leaf; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). 
+ */ + root->ref_cows = 0; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + kfree(root); + return ERR_CAST(leaf); + } + + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + root->node = leaf; + + write_extent_buffer(root->node, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(root->node), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(root->node); + btrfs_tree_unlock(root->node); + return root; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *log_root; + + log_root = alloc_log_tree(trans, fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + WARN_ON(fs_info->log_root_tree); + fs_info->log_root_tree = log_root; + return 0; +} + +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *log_root; + struct btrfs_inode_item *inode_item; + + log_root = alloc_log_tree(trans, root->fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + + log_root->last_trans = trans->transid; + log_root->root_key.offset = root->root_key.objectid; + + inode_item = &log_root->root_item.inode; + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_node(&log_root->root_item, log_root->node); + + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; + root->last_log_commit = 0; + return 0; +} + +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; + u64 generation; + u32 blocksize; + int ret = 0; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + if (location->offset == (u64)-1) { + ret = find_and_setup_root(tree_root, fs_info, + location->objectid, root); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + goto out; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, location->objectid); + + path = btrfs_alloc_path(); + if (!path) { + kfree(root); + return ERR_PTR(-ENOMEM); + } + ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); + if (ret == 0) { + l = path->nodes[0]; + read_extent_buffer(l, &root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), + sizeof(root->root_item)); + memcpy(&root->root_key, location, sizeof(*location)); + } + btrfs_free_path(path); + if (ret) { + kfree(root); + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); + } + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + root->commit_root = btrfs_root_node(root); + BUG_ON(!root->node); +out: + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + root->ref_cows = 1; + btrfs_check_and_init_root_item(&root->root_item); + } + + return root; +} + +struct btrfs_root 
*btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + struct btrfs_root *root; + int ret; + + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; +again: + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); + if (root) + return root; + + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + + root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); + root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), + GFP_NOFS); + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; + goto fail; + } + + btrfs_init_free_ino_ctl(root); + mutex_init(&root->fs_commit_mutex); + spin_lock_init(&root->cache_lock); + init_waitqueue_head(&root->cache_wait); + + ret = set_anon_super(&root->anon_super, NULL); + if (ret) + goto fail; + + if (btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; + goto fail; + } + + ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + if (ret < 0) + goto fail; + if (ret == 0) + root->orphan_item_inserted = 1; + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto fail; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret == 0) + root->in_radix = 1; + + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + if (ret) { + if (ret == -EEXIST) { + free_fs_root(root); + goto again; + } + goto fail; + } + + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid); + WARN_ON(ret); + return root; +fail: + free_fs_root(root); + return ERR_PTR(ret); +} + +static int btrfs_congested_fn(void *congested_data, int bdi_bits) +{ + struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; + int ret = 0; + struct btrfs_device *device; + struct backing_dev_info *bdi; + + rcu_read_lock(); + list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi && bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + return ret; +} + +/* + * If this fails, caller must call bdi_destroy() to get rid of the + * bdi again. 
+ */ +static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) +{ + int err; + + bdi->capabilities = BDI_CAP_MAP_COPY; + err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); + if (err) + return err; + + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->congested_fn = btrfs_congested_fn; + bdi->congested_data = info; + return 0; +} + +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + return ret; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; + int error; + + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; + + /* metadata bio reads are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. For + * blocksize <= pagesize, it is basically a noop + */ + if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && + !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); + bio_endio(bio, error); +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_root *root = arg; + + do { + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && + mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_run_delayed_iputs(root); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + btrfs_run_defrag_inodes(root->fs_info); + } + + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + u64 transid; + unsigned long now; + unsigned long delay; + int ret; + + do { + delay = HZ * 30; + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + spin_lock(&root->fs_info->trans_lock); + cur = root->fs_info->running_transaction; + if (!cur) { + spin_unlock(&root->fs_info->trans_lock); + goto sleep; + } + + now = get_seconds(); + if (!cur->blocked && + (now < cur->start_time || now - cur->start_time < 30)) { + spin_unlock(&root->fs_info->trans_lock); + delay = HZ * 5; + goto sleep; + } + transid = cur->transid; + 
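+ /* remember which transaction we sampled before dropping trans_lock; the commit below only happens if the handle we join still belongs to it */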
spin_unlock(&root->fs_info->trans_lock); + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + if (transid == trans->transid) { + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } else { + btrfs_end_transaction(trans, root); + } +sleep: + wake_up_process(root->fs_info->cleaner_kthread); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop() && + !btrfs_transaction_blocked(root->fs_info)) + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 leafsize; + u32 blocksize; + u32 stripesize; + u64 generation; + u64 features; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *tree_root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = NULL; + struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *log_tree_root; + + int ret; + int err = -EINVAL; + + struct btrfs_super_block *disk_super; + + if (!extent_root || !tree_root || !tree_root->fs_info || + !chunk_root || !dev_root || !csum_root) { + err = -ENOMEM; + goto fail; + } + fs_info = tree_root->fs_info; + + ret = init_srcu_struct(&fs_info->subvol_srcu); + if (ret) { + err = ret; + goto fail; + } + + ret = setup_bdi(fs_info, &fs_info->bdi); + if (ret) { + err = ret; + goto fail_srcu; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bdi; + } + + fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; + + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->delayed_iputs); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); + INIT_LIST_HEAD(&fs_info->caching_block_groups); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->delayed_iput_lock); + spin_lock_init(&fs_info->defrag_inodes_lock); + mutex_init(&fs_info->reloc_mutex); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; + fs_info->extent_root = extent_root; + fs_info->csum_root = csum_root; + fs_info->chunk_root = chunk_root; + fs_info->dev_root = dev_root; + fs_info->fs_devices = fs_devices; + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->space_info); + btrfs_mapping_init(&fs_info->mapping_tree); + btrfs_init_block_rsv(&fs_info->global_block_rsv); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); + btrfs_init_block_rsv(&fs_info->trans_block_rsv); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv); + btrfs_init_block_rsv(&fs_info->empty_block_rsv); + INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); + mutex_init(&fs_info->durable_block_rsv_mutex); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->async_delalloc_pages, 0); + 
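+ /* these atomic counters throttle async bio submission and delalloc flushing */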
atomic_set(&fs_info->async_submit_draining, 0); + atomic_set(&fs_info->nr_async_bios, 0); + atomic_set(&fs_info->defrag_running, 0); + fs_info->sb = sb; + fs_info->max_inline = 8192 * 1024; + fs_info->metadata_ratio = 0; + fs_info->defrag_inodes = RB_ROOT; + fs_info->trans_no_join = 0; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); + + INIT_LIST_HEAD(&fs_info->ordered_extents); + spin_lock_init(&fs_info->ordered_extent_lock); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_NOFS); + if (!fs_info->delayed_root) { + err = -ENOMEM; + goto fail_iput; + } + btrfs_init_delayed_root(fs_info->delayed_root); + + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->scrub_pause_wait); + init_rwsem(&fs_info->scrub_super_lock); + fs_info->scrub_workers_refcnt = 0; + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + sb->s_bdi = &fs_info->bdi; + + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + fs_info->btree_inode->i_nlink = 1; + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping); + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; + insert_inode_hash(fs_info->btree_inode); + + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree = RB_ROOT; + + extent_io_tree_init(&fs_info->freed_extents[0], + fs_info->btree_inode->i_mapping); + extent_io_tree_init(&fs_info->freed_extents[1], + fs_info->btree_inode->i_mapping); + fs_info->pinned_extents = &fs_info->freed_extents[0]; + fs_info->do_barriers = 1; + + + mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->tree_log_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); + init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->cleanup_work_sem); + init_rwsem(&fs_info->subvol_sem); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + + init_waitqueue_head(&fs_info->transaction_throttle); + init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->transaction_blocked_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + + __setup_root(4096, 4096, 4096, 4096, tree_root, + fs_info, BTRFS_ROOT_TREE_OBJECTID); + + bh = btrfs_read_dev_super(fs_devices->latest_bdev); + if (!bh) { + err = -EINVAL; + goto fail_alloc; + } + + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_for_commit, &fs_info->super_copy, + sizeof(fs_info->super_for_commit)); + brelse(bh); + + 
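+ /* super_copy now holds the newest super block found on the latest device; super_for_commit is the scratch copy stamped and written back at commit time */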
memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + goto fail_alloc; + + /* check FS state, whether FS is broken. */ + fs_info->fs_state |= btrfs_super_flags(disk_super); + + btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + + /* + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + + ret = btrfs_parse_options(tree_root, options); + if (ret) { + err = ret; + goto fail_alloc; + } + + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "BTRFS: couldn't mount because of " + "unsupported optional features (%Lx).\n", + (unsigned long long)features); + err = -EINVAL; + goto fail_alloc; + } + + features = btrfs_super_incompat_flags(disk_super); + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); + + features = btrfs_super_compat_ro_flags(disk_super) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " + "unsupported option features (%Lx).\n", + (unsigned long long)features); + err = -EINVAL; + goto fail_alloc; + } + + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + + btrfs_init_workers(&fs_info->workers, "worker", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, + fs_info->thread_pool_size), + &fs_info->generic_worker); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers.idle_thresh = 64; + + fs_info->workers.idle_thresh = 16; + fs_info->workers.ordered = 1; + + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_workers, "endio", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_meta_write_workers, + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", + 1, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", + fs_info->thread_pool_size, + &fs_info->generic_worker); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_meta_workers.idle_thresh = 4; + + fs_info->endio_write_workers.idle_thresh = 2; + fs_info->endio_meta_write_workers.idle_thresh = 2; + + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + 
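+ /* each worker pool is started with a single thread; more are spawned on demand up to the sizes passed to btrfs_init_workers() */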
btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); + btrfs_start_workers(&fs_info->endio_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); + btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->endio_freespace_worker, 1); + btrfs_start_workers(&fs_info->delayed_workers, 1); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + tree_root->nodesize = nodesize; + tree_root->leafsize = leafsize; + tree_root->sectorsize = sectorsize; + tree_root->stripesize = stripesize; + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); + + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read the system " + "array on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + blocksize = btrfs_level_size(tree_root, + btrfs_super_chunk_root_level(disk_super)); + generation = btrfs_super_chunk_root_generation(disk_super); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + blocksize, generation); + BUG_ON(!chunk_root->node); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + sb->s_id); + goto fail_chunk_root; + } + btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); + chunk_root->commit_root = btrfs_root_node(chunk_root); + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + sb->s_id); + goto fail_chunk_root; + } + + btrfs_close_extra_devices(fs_devices); + + blocksize = btrfs_level_size(tree_root, + btrfs_super_root_level(disk_super)); + generation = btrfs_super_generation(disk_super); + + tree_root->node = read_tree_block(tree_root, + btrfs_super_root(disk_super), + blocksize, generation); + if (!tree_root->node) + goto fail_chunk_root; + if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + sb->s_id); + goto fail_tree_root; + } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); + tree_root->commit_root = btrfs_root_node(tree_root); + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_EXTENT_TREE_OBJECTID, extent_root); + if (ret) + goto fail_tree_root; + extent_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_DEV_TREE_OBJECTID, dev_root); + if (ret) + goto fail_extent_root; + dev_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + 
BTRFS_CSUM_TREE_OBJECTID, csum_root); + if (ret) + goto fail_dev_root; + + csum_root->track_dirty = 1; + + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "Failed to initial space info: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_read_block_groups(extent_root); + if (ret) { + printk(KERN_ERR "Failed to read block groups: %d\n", ret); + goto fail_block_groups; + } + + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + "btrfs-cleaner"); + if (IS_ERR(fs_info->cleaner_kthread)) + goto fail_block_groups; + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (IS_ERR(fs_info->transaction_kthread)) + goto fail_cleaner; + + if (!btrfs_test_opt(tree_root, SSD) && + !btrfs_test_opt(tree_root, NOSSD) && + !fs_info->fs_devices->rotating) { + printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + "mode\n"); + btrfs_set_opt(fs_info->mount_opt, SSD); + } + + /* do not make disk changes in broken FS */ + if (btrfs_super_log_root(disk_super) != 0 && + !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "Btrfs log replay required " + "on RO media\n"); + err = -EIO; + goto fail_trans_kthread; + } + blocksize = + btrfs_level_size(tree_root, + btrfs_super_log_root_level(disk_super)); + + log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!log_tree_root) { + err = -ENOMEM; + goto fail_trans_kthread; + } + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + blocksize, + generation + 1); + ret = btrfs_recover_log_trees(log_tree_root); + BUG_ON(ret); + + if (sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + BUG_ON(ret); + } + } + + ret = btrfs_find_orphan_roots(tree_root); + BUG_ON(ret); + + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_fs_roots(fs_info); + BUG_ON(ret); + + ret = btrfs_recover_relocation(tree_root); + if (ret < 0) { + printk(KERN_WARNING + "btrfs: failed to recover relocation\n"); + err = -EINVAL; + goto fail_trans_kthread; + } + } + + location.objectid = BTRFS_FS_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + if (!fs_info->fs_root) + goto fail_trans_kthread; + if (IS_ERR(fs_info->fs_root)) { + err = PTR_ERR(fs_info->fs_root); + goto fail_trans_kthread; + } + + if (!(sb->s_flags & MS_RDONLY)) { + down_read(&fs_info->cleanup_work_sem); + err = btrfs_orphan_cleanup(fs_info->fs_root); + if (!err) + err = btrfs_orphan_cleanup(fs_info->tree_root); + up_read(&fs_info->cleanup_work_sem); + if (err) { + close_ctree(tree_root); + return ERR_PTR(err); + } + } + + return tree_root; + +fail_trans_kthread: + kthread_stop(fs_info->transaction_kthread); +fail_cleaner: + kthread_stop(fs_info->cleaner_kthread); + + /* + * make sure we're done with the btree inode before we stop our + * kthreads + */ + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_block_groups: + btrfs_free_block_groups(fs_info); + 
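+ /* error unwind: free the cached tree roots, stop the worker pools, then release the btree inode, devices, bdi and srcu state */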
free_extent_buffer(csum_root->node); + free_extent_buffer(csum_root->commit_root); +fail_dev_root: + free_extent_buffer(dev_root->node); + free_extent_buffer(dev_root->commit_root); +fail_extent_root: + free_extent_buffer(extent_root->node); + free_extent_buffer(extent_root->commit_root); +fail_tree_root: + free_extent_buffer(tree_root->node); + free_extent_buffer(tree_root->commit_root); +fail_chunk_root: + free_extent_buffer(chunk_root->node); + free_extent_buffer(chunk_root->commit_root); +fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); + btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->delayed_workers); +fail_alloc: + kfree(fs_info->delayed_root); +fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); +fail_bdi: + bdi_destroy(&fs_info->bdi); +fail_srcu: + cleanup_srcu_struct(&fs_info->subvol_srcu); +fail: + kfree(extent_root); + kfree(tree_root); + kfree(fs_info); + kfree(chunk_root); + kfree(dev_root); + kfree(csum_root); + return ERR_PTR(err); +} + +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + printk_ratelimited(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + /* note, we don't set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +{ + struct buffer_head *bh; + struct buffer_head *latest = NULL; + struct btrfs_super_block *super; + int i; + u64 transid = 0; + u64 bytenr; + + /* we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + for (i = 0; i < 1; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + break; + bh = __bread(bdev, bytenr / 4096, 4096); + if (!bh) + continue; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + strncmp((char *)(&super->magic), BTRFS_MAGIC, + sizeof(super->magic))) { + brelse(bh); + continue; + } + + if (!latest || btrfs_super_generation(super) > transid) { + brelse(latest); + latest = bh; + transid = btrfs_super_generation(super); + } else { + brelse(bh); + } + } + return latest; +} + +/* + * this should be called twice, once with wait == 0 and + * once with wait == 1. When wait == 0 is done, all the buffer heads + * we write are pinned. + * + * They are released when wait == 1 is done. + * max_mirrors must be the same for both runs, and it indicates how + * many supers on this one device should be written. + * + * max_mirrors == 0 means to write them all. 
+ */ +static int write_dev_supers(struct btrfs_device *device, + struct btrfs_super_block *sb, + int do_barriers, int wait, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int ret; + int errors = 0; + u32 crc; + u64 bytenr; + int last_barrier = 0; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + /* make sure only the last submit_bh does a barrier */ + if (do_barriers) { + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->total_bytes) + break; + last_barrier = i; + } + } + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + break; + + if (wait) { + bh = __find_get_block(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + BUG_ON(!bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + + /* drop our reference */ + brelse(bh); + + /* drop the reference from the wait == 0 run */ + brelse(bh); + continue; + } else { + btrfs_set_super_bytenr(sb, bytenr); + + crc = ~(u32)0; + crc = btrfs_csum_data(NULL, (char *)sb + + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - + BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + /* + * one reference for us, and we leave it for the + * caller + */ + bh = __getblk(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + + /* one reference for submit_bh */ + get_bh(bh); + + set_buffer_uptodate(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + } + + if (i == last_barrier && do_barriers) + ret = submit_bh(WRITE_FLUSH_FUA, bh); + else + ret = submit_bh(WRITE_SYNC, bh); + + if (ret) + errors++; + } + return errors < i ? 0 : -1; +} + +int write_all_supers(struct btrfs_root *root, int max_mirrors) +{ + struct list_head *head; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u64 flags; + + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + btrfs_set_stack_device_generation(dev_item, 0); + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + + total_errors = 0; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata || !dev->writeable) 
+ continue; + + ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); + if (ret) + total_errors++; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + return 0; +} + +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors) +{ + int ret; + + ret = write_all_supers(root, max_mirrors); + return ret; +} + +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (btrfs_root_refs(&root->root_item) == 0) + synchronize_srcu(&fs_info->subvol_srcu); + + __btrfs_remove_free_space_cache(root->free_ino_pinned); + __btrfs_remove_free_space_cache(root->free_ino_ctl); + free_fs_root(root); + return 0; +} + +static void free_fs_root(struct btrfs_root *root) +{ + iput(root->cache_inode); + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); + kfree(root->name); + kfree(root); +} + +static int del_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (gang[0]->in_radix) { + btrfs_free_fs_root(fs_info, gang[0]); + } else { + free_extent_buffer(gang[0]->node); + free_extent_buffer(gang[0]->commit_root); + kfree(gang[0]); + } + } + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_free_fs_root(fs_info, gang[i]); + } + return 0; +} + +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i; + int ret; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) + break; + + root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { + int err; + + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) + return err; + } + root_objectid++; + } + return 0; +} + +int btrfs_commit_super(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + /* wait until ongoing cleanup work done */ + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + /* run commit again to drop the original snapshot */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_commit_transaction(trans, root); + ret = btrfs_write_and_wait_transaction(NULL, root); + BUG_ON(ret); + + ret = write_ctree_super(NULL, root, 0); + return ret; +} + +int close_ctree(struct btrfs_root *root) 
+{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + fs_info->closing = 1; + smp_mb(); + + btrfs_scrub_cancel(root); + + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + + /* clear out the rbtree of defraggable inodes */ + btrfs_run_defrag_inodes(root->fs_info); + + btrfs_put_block_group_cache(fs_info); + + /* + * Here come 2 situations when btrfs is broken to flip readonly: + * + * 1. when btrfs flips readonly somewhere else before + * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, + * and btrfs will skip to write sb directly to keep + * ERROR state on disk. + * + * 2. when btrfs flips readonly just in btrfs_commit_super, + * and in such case, btrfs cannot write sb via btrfs_commit_super, + * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, + * btrfs will cleanup all FS resources first and write sb then. + */ + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + ret = btrfs_error_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + + fs_info->closing = 2; + smp_mb(); + + if (fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", + (unsigned long long)fs_info->delalloc_bytes); + } + if (fs_info->total_ref_cache_size) { + printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", + (unsigned long long)fs_info->total_ref_cache_size); + } + + free_extent_buffer(fs_info->extent_root->node); + free_extent_buffer(fs_info->extent_root->commit_root); + free_extent_buffer(fs_info->tree_root->node); + free_extent_buffer(fs_info->tree_root->commit_root); + free_extent_buffer(root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + free_extent_buffer(root->fs_info->dev_root->node); + free_extent_buffer(root->fs_info->dev_root->commit_root); + free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(root->fs_info->csum_root->commit_root); + + btrfs_free_block_groups(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + kfree(fs_info->delayed_root); + + btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); + btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->delayed_workers); + + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); + cleanup_srcu_struct(&fs_info->subvol_srcu); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info); + + return 0; +} + +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +{ + int ret; + struct inode *btree_inode = buf->first_page->mapping->host; + + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, + NULL); + if (!ret) + 
return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid); + return !ret; +} + +int btrfs_set_buffer_uptodate(struct extent_buffer *buf) +{ + struct inode *btree_inode = buf->first_page->mapping->host; + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, + buf); +} + +void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + u64 transid = btrfs_header_generation(buf); + struct inode *btree_inode = root->fs_info->btree_inode; + int was_dirty; + + btrfs_assert_tree_locked(buf); + if (transid != root->fs_info->generation) { + printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + "found %llu running %llu\n", + (unsigned long long)buf->start, + (unsigned long long)transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + if (!was_dirty) { + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->dirty_metadata_bytes += buf->len; + spin_unlock(&root->fs_info->delalloc_lock); + } +} + +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + u64 num_dirty; + unsigned long thresh = 32 * 1024 * 1024; + + if (current->flags & PF_MEMALLOC) + return; + + btrfs_balance_delayed_items(root); + + num_dirty = root->fs_info->dirty_metadata_bytes; + + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + u64 num_dirty; + unsigned long thresh = 32 * 1024 * 1024; + + if (current->flags & PF_MEMALLOC) + return; + + num_dirty = root->fs_info->dirty_metadata_bytes; + + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); + return ret; +} + +int btree_lock_page_hook(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_buffer *eb; + unsigned long len; + u64 bytenr = page_offset(page); + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + + len = page->private >> 2; + eb = find_extent_buffer(io_tree, bytenr, len); + if (!eb) + goto out; + + btrfs_tree_lock(eb); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (root->fs_info->dirty_metadata_bytes >= eb->len) + root->fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&root->fs_info->delalloc_lock); + } + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); +out: + lock_page(page); + return 0; +} + +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only) +{ + if (read_only) + return; + + if (fs_info->fs_state & 
BTRFS_SUPER_FLAG_ERROR) + printk(KERN_WARNING "warning: mount fs with errors, " + "running btrfsck is recommended\n"); +} + +int btrfs_error_commit_super(struct btrfs_root *root) +{ + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(root); + + ret = write_ctree_super(NULL, root, 0); + + return ret; +} + +static int btrfs_destroy_ordered_operations(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_operations, &splice); + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + list_del_init(&btrfs_inode->ordered_operations); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +static int btrfs_destroy_ordered_extents(struct btrfs_root *root) +{ + struct list_head splice; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + ordered = list_entry(splice.next, struct btrfs_ordered_extent, + root_extent_list); + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* the inode may be getting freed (in sys_unlink path). 
*/ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + if (inode) + iput(inode); + + atomic_set(&ordered->refs, 1); + btrfs_put_ordered_extent(ordered); + + spin_lock(&root->fs_info->ordered_extent_lock); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + int ret = 0; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (delayed_refs->num_entries == 0) { + spin_unlock(&delayed_refs->lock); + printk(KERN_INFO "delayed_refs has NO entry\n"); + return ret; + } + + node = rb_first(&delayed_refs->root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + atomic_set(&ref->refs, 1); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + mutex_lock(&head->mutex); + kfree(head->extent_op); + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + list_del_init(&head->cluster); + mutex_unlock(&head->mutex); + } + + spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref(ref); + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + spin_unlock(&delayed_refs->lock); + + return ret; +} + +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +{ + struct btrfs_pending_snapshot *snapshot; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&t->pending_snapshots, &splice); + + while (!list_empty(&splice)) { + snapshot = list_entry(splice.next, + struct btrfs_pending_snapshot, + list); + + list_del_init(&snapshot->list); + + kfree(snapshot); + } + + return 0; +} + +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->delalloc_lock); + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + delalloc_inodes); + + list_del_init(&btrfs_inode->delalloc_inodes); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->delalloc_lock); + + return 0; +} + +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark) +{ + int ret; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + u64 start = 0; + u64 end; + u64 offset; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + offset = page_offset(page); + + spin_lock(&dirty_pages->buffer_lock); + eb = radix_tree_lookup( + &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, + offset >> PAGE_CACHE_SHIFT); + spin_unlock(&dirty_pages->buffer_lock); + if (eb) { + ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags); + atomic_set(&eb->refs, 1); + } + if 
(PageWriteback(page)) + end_page_writeback(page); + + lock_page(page); + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&page->mapping->tree_lock); + } + + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + } + } + + return ret; +} + +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents) +{ + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + unpin = pinned_extents; + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + /* opt_discard */ + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_error_discard_extent(root, start, + end + 1 - start, + NULL); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); + } + + return 0; +} + +static int btrfs_cleanup_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *t; + LIST_HEAD(list); + + WARN_ON(1); + + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + spin_lock(&root->fs_info->trans_lock); + list_splice_init(&root->fs_info->trans_list, &list); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + + while (!list_empty(&list)) { + t = list_entry(list.next, struct btrfs_transaction, list); + if (!t) + break; + + btrfs_destroy_ordered_operations(root); + + btrfs_destroy_ordered_extents(root); + + btrfs_destroy_delayed_refs(t, root); + + btrfs_block_rsv_release(root, + &root->fs_info->trans_block_rsv, + t->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + t->in_commit = 1; + t->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + t->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + + t->commit_done = 1; + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + + btrfs_destroy_pending_snapshots(t); + + btrfs_destroy_delalloc_inodes(root); + + spin_lock(&root->fs_info->trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->trans_lock); + + btrfs_destroy_marked_extents(root, &t->dirty_pages, + EXTENT_DIRTY); + + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); + + atomic_set(&t->use_count, 0); + list_del_init(&t->list); + memset(t, 0, sizeof(*t)); + kmem_cache_free(btrfs_transaction_cachep, t); + } + + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + return 0; +} + +static struct extent_io_ops btree_extent_io_ops = { + .write_cache_pages_lock_hook = btree_lock_page_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, + .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, +}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 00000000..a0b610a6 --- /dev/null +++ b/fs/btrfs/disk-io.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DISKIO__ +#define __DISKIO__ + +#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_SIZE 4096 + +#define BTRFS_SUPER_MIRROR_MAX 3 +#define BTRFS_SUPER_MIRROR_SHIFT 12 + +static inline u64 btrfs_sb_offset(int mirror) +{ + u64 start = 16 * 1024; + if (mirror) + return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); + return BTRFS_SUPER_INFO_OFFSET; +} + +struct btrfs_device; +struct btrfs_fs_devices; + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid); +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid); +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); +int close_ctree(struct btrfs_root *root); +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors); +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_commit_super(struct btrfs_root *root); +int btrfs_error_commit_super(struct btrfs_root *root); +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location); +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_set_buffer_uptodate(struct extent_buffer *buf); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +void btrfs_csum_final(u32 crc, char *result); +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata); +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btree_lock_page_hook(struct page *page); + + 
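+ /* lockdep class annotation for extent buffer locks; the helper below compiles to a no-op when CONFIG_DEBUG_LOCK_ALLOC is not set */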
+#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +#else +static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, + int level) +{ +} +#endif +#endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 00000000..1b8dc337 --- /dev/null +++ b/fs/btrfs/export.c @@ -0,0 +1,317 @@ +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_root_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type; + + if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; + return 255; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return 255; + } + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = btrfs_ino(inode); + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + u64 parent_root_id; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + spin_unlock(&dentry->d_lock); + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_root *root; + struct inode *inode; + struct btrfs_key key; + int index; + int err = 0; + + if (objectid < BTRFS_FIRST_FREE_OBJECTID) + return ERR_PTR(-ESTALE); + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto fail; + } + + if (btrfs_root_refs(&root->root_item) == 0) { + err = -ENOENT; + goto fail; + } + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + if (check_generation && generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return d_obtain_alias(inode); +fail: + srcu_read_unlock(&fs_info->subvol_srcu, index); + return ERR_PTR(err); +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; 
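+ /*
+ * The BTRFS_FID_SIZE_* values checked below are counted in 32-bit
+ * words, the unit exportfs uses for file handle lengths.  With the
+ * packed struct btrfs_fid from export.h that works out to:
+ *
+ *	NON_CONNECTABLE:  objectid + root_objectid + gen  = 20 bytes = 5
+ *	CONNECTABLE:      + parent_objectid + parent_gen  = 32 bytes = 8
+ *	CONNECTABLE_ROOT: + parent_root_objectid          = 40 bytes = 10
+ */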
+ + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_root_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = btrfs_ino(dir); + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + + BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = -ENOENT; + goto fail; + } + + path->slots[0]--; + leaf = path->nodes[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != key.objectid || found_key.type != key.type) { + ret = -ENOENT; + goto fail; + } + + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + key.objectid = btrfs_root_ref_dirid(leaf, ref); + } else { + key.objectid = found_key.offset; + } + btrfs_free_path(path); + + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + return btrfs_get_dentry(root->fs_info->sb, key.objectid, + found_key.offset, 0, 0); + } + + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); +fail: + btrfs_free_path(path); + return ERR_PTR(ret); +} + +static int btrfs_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = child->d_inode; + struct inode *dir = parent->d_inode; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + int name_len; + int ret; + u64 ino; + + if (!dir || !inode) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode)) + return -EINVAL; + + ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = 
BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = ino; + key.offset = btrfs_ino(dir); + key.type = BTRFS_INODE_REF_KEY; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } else if (ret > 0) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + path->slots[0]--; + } else { + btrfs_free_path(path); + return -ENOENT; + } + } + leaf = path->nodes[0]; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) { + rref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + name_ptr = (unsigned long)(rref + 1); + name_len = btrfs_root_ref_name_len(leaf, rref); + } else { + iref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_ref); + name_ptr = (unsigned long)(iref + 1); + name_len = btrfs_inode_ref_name_len(leaf, iref); + } + + read_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_free_path(path); + + /* + * have to add the null termination to make sure that reconnect_path + * gets the right len for strlen + */ + name[name_len] = '\0'; + + return 0; +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, + .get_name = btrfs_get_name, +}; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 00000000..074348a9 --- /dev/null +++ b/fs/btrfs/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 00000000..7e20a65d --- /dev/null +++ b/fs/btrfs/extent-tree.c @@ -0,0 +1,7347 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "hash.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "free-space-cache.h" + +/* control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated. 
This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + */ +enum { + CHUNK_ALLOC_NO_FORCE = 0, + CHUNK_ALLOC_FORCE = 1, + CHUNK_ALLOC_LIMITED = 2, +}; + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc); +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extra_op); +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei); +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod); +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); + +static noinline int +block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED; +} + +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + WARN_ON(cache->reserved_pinned > 0); + kfree(cache->free_space_ctl); + kfree(cache); + } +} + +/* + * this adds the block group to the fs_info rb tree for the block group + * cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache * +block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + 
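+ /*
+ * Any group returned here has had its refcount raised with
+ * btrfs_get_block_group(); a typical caller (sketch) looks like:
+ *
+ *	cache = btrfs_lookup_block_group(info, bytenr);
+ *	if (cache) {
+ *		... use cache ...
+ *		btrfs_put_block_group(cache);
+ *	}
+ */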
spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) + btrfs_get_block_group(ret); + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +static int add_excluded_extent(struct btrfs_root *root, + u64 start, u64 num_bytes) +{ + u64 end = start + num_bytes - 1; + set_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + set_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); + return 0; +} + +static void free_excluded_extents(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 start, end; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + + clear_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + clear_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); +} + +static int exclude_super_stripes(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, cache->key.objectid, + stripe_len); + BUG_ON(ret); + } + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); + + while (nr--) { + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, logical[nr], + stripe_len); + BUG_ON(ret); + } + + kfree(logical); + } + return 0; +} + +static struct btrfs_caching_control * +get_caching_control(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_STARTED) { + spin_unlock(&cache->lock); + return NULL; + } + + /* We're loading it the fast way, so we don't have a caching_ctl. */ + if (!cache->caching_ctl) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +static void put_caching_control(struct btrfs_caching_control *ctl) +{ + if (atomic_dec_and_test(&ctl->count)) + kfree(ctl); +} + +/* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet + * since their free space will be released as soon as the transaction commits. 
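+ *
+ * Worked example (illustrative numbers): caching the range [0, 100) while
+ * the extents [20, 29] and [60, 69] are still pinned adds [0, 20), [30, 60)
+ * and [70, 100) as free space, 80 bytes in total.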
+ */ +static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) +{ + u64 extent_start, extent_end, size, total_added = 0; + int ret; + + while (start < end) { + ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE); + if (ret) + break; + + if (extent_start <= start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); + } + + return total_added; +} + +static int caching_kthread(void *data) +{ + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 total_found = 0; + u64 last = 0; + u32 nritems; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 1; + + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; +again: + mutex_lock(&caching_ctl->mutex); + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto err; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (btrfs_fs_closing(fs_info) > 1) { + last = (u64)-1; + break; + } + + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = find_next_key(path, 0, &key); + if (ret) + break; + + if (need_resched() || + btrfs_next_leaf(extent_root, path)) { + caching_ctl->progress = last; + btrfs_release_path(path); + up_read(&fs_info->extent_commit_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + goto again; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + if (key.objectid < block_group->key.objectid) { + path->slots[0]++; + continue; + } + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); + last = key.objectid + key.offset; + + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + } + path->slots[0]++; + } + ret = 0; + + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); + caching_ctl->progress = (u64)-1; + + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); + +err: + btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); + + 
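+ /*
+ * Caching is finished (or the search was abandoned): drop the
+ * excluded-extent markers set up by exclude_super_stripes() and wake
+ * anyone sleeping on caching_ctl->wait before dropping our references.
+ */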
free_excluded_extents(extent_root, block_group); + + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); + + put_caching_control(caching_ctl); + atomic_dec(&block_group->space_info->caching_threads); + btrfs_put_block_group(block_group); + + return 0; +} + +static int cache_block_group(struct btrfs_block_group_cache *cache, + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int load_cache_only) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl; + struct task_struct *tsk; + int ret = 0; + + smp_mb(); + if (cache->cached != BTRFS_CACHE_NO) + return 0; + + /* + * We can't do the read from on-disk cache during a commit since we need + * to have the normal tree locking. Also if we are currently trying to + * allocate blocks for the tree root we can't do the fast caching since + * we likely hold important locks. + */ + if (trans && (!trans->transaction->in_commit) && + (root && root != root->fs_info->tree_root)) { + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + return 0; + } + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + ret = load_free_space_cache(fs_info, cache); + + spin_lock(&cache->lock); + if (ret == 1) { + cache->cached = BTRFS_CACHE_FINISHED; + cache->last_byte_to_unpin = (u64)-1; + } else { + cache->cached = BTRFS_CACHE_NO; + } + spin_unlock(&cache->lock); + if (ret == 1) { + free_excluded_extents(fs_info->extent_root, cache); + return 0; + } + } + + if (load_cache_only) + return 0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + /* one for caching kthread, one for caching block group list */ + atomic_set(&caching_ctl->count, 2); + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); + return 0; + } + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + down_write(&fs_info->extent_commit_sem); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + up_write(&fs_info->extent_commit_sem); + + atomic_inc(&cache->space_info->caching_threads); + btrfs_get_block_group(cache); + + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + printk(KERN_ERR "error running thread %d\n", ret); + BUG(); + } + + return ret; +} + +/* + * return the block group that starts at or after bytenr + */ +static struct btrfs_block_group_cache * +btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 0); + + return cache; +} + +/* + * return the block group that contains the given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 1); + + return cache; +} + +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA; + + rcu_read_lock(); + 
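+ /*
+ * Only the type bits of the requested flags take part in the match, so
+ * e.g. a lookup for BTRFS_BLOCK_GROUP_DATA also matches a mixed
+ * DATA|METADATA space_info -- the first entry with any requested type
+ * bit set wins.
+ */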
list_for_each_entry_rcu(found, head, list) { + if (found->flags & flags) { + rcu_read_unlock(); + return found; + } + } + rcu_read_unlock(); + return NULL; +} + +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. + */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +static u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + do_div(num, 100); + return num; +} + +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner) +{ + struct btrfs_block_group_cache *cache; + u64 used; + u64 last = max(search_hint, search_start); + u64 group_start = 0; + int full_search = 0; + int factor = 9; + int wrapped = 0; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + if (!cache) + break; + + spin_lock(&cache->lock); + last = cache->key.objectid + cache->key.offset; + used = btrfs_block_group_used(&cache->item); + + if ((full_search || !cache->ro) && + block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { + if (used + cache->pinned + cache->reserved < + div_factor(cache->key.offset, factor)) { + group_start = cache->key.objectid; + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + goto found; + } + } + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + cond_resched(); + } + if (!wrapped) { + last = search_start; + wrapped = 1; + goto again; + } + if (!full_search && factor < 10) { + last = search_start; + full_search = 1; + factor = 10; + goto again; + } +found: + return group_start; +} + +/* simple helper to search for an existing extent at a given offset */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +{ + int ret; + struct btrfs_key key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = start; + key.offset = len; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, + 0, 0); + btrfs_free_path(path); + return ret; +} + +/* + * helper function to lookup reference count and flags of extent. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. 
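+ *
+ * For example, if the extent item on disk records N references and the
+ * delayed ref head has a net ref_mod of +2 queued up, *refs comes back
+ * as N + 2.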
+ */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out_free; + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + if (!trans) + goto out; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's released and try + * again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += head->node.ref_mod; + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); +out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out_free: + btrfs_free_path(path); + return ret; +} + +/* + * Back reference rules. Back refs have three main goals: + * + * 1) differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * There are two kinds of back refs. The implicit back refs is optimized + * for pointers in non-shared tree blocks. For a given pointer in a block, + * back refs of this kind provide information about the block's owner tree + * and the pointer's key. These information allow us to find the block by + * b-tree searching. The full back refs is for pointers in tree blocks not + * referenced by their owner trees. The location of tree block is recorded + * in the back refs. 
Actually the full back refs is generic, and can be + * used in all cases the implicit back refs is used. The major shortcoming + * of the full back refs is its overhead. Every time a tree block gets + * COWed, we have to update back refs entry for all pointers in it. + * + * For a newly allocated tree block, we use implicit back refs for + * pointers in it. This means most tree related operations only involve + * implicit back refs. For a tree block created in old transaction, the + * only way to drop a reference to it is COW it. So we can detect the + * event that tree block loses its owner tree's reference and do the + * back refs conversion. + * + * When a tree block is COW'd through a tree, there are four cases: + * + * The reference count of the block is one and the tree is the block's + * owner tree. Nothing to do in this case. + * + * The reference count of the block is one and the tree is not the + * block's owner tree. In this case, full back refs is used for pointers + * in the block. Remove these full back refs, add implicit back refs for + * every pointers in the new block. + * + * The reference count of the block is greater than one and the tree is + * the block's owner tree. In this case, implicit back refs is used for + * pointers in the block. Add full back refs for every pointers in the + * block, increase lower level extents' reference counts. The original + * implicit back refs are entailed to the new block. + * + * The reference count of the block is greater than one and the tree is + * not the block's owner tree. Add implicit back refs for every pointer in + * the new block, increase lower level extents' reference count. + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, + * The key type is used to differentiate between types of back refs. + * There are different meanings of the key offset for different types + * of back refs. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure for the implicit back refs has fields for: + * + * - Objectid of the subvolume root + * - objectid of the file holding the reference + * - original offset in the file + * - how many bookend extents + * + * The key offset for the implicit back refs is hash of the first + * three fields. + * + * The extent ref structure for the full back refs has field for: + * + * - number of pointers in the tree leaf + * + * The key offset for the implicit back refs is the first byte of + * the tree leaf + * + * When a file extent is allocated, The implicit back refs is used. + * the fields are filled in: + * + * (root_key.objectid, inode objectid, offset in file, 1) + * + * When a file extent is removed file truncation, we find the + * corresponding implicit back refs and check the following fields: + * + * (btrfs_header_owner(leaf), inode objectid, offset in file) + * + * Btree extents can be referenced by: + * + * - Different subvolumes + * + * Both the implicit back refs and the full back refs for tree blocks + * only consist of key. The key offset for the implicit back refs is + * objectid of block's owner tree. The key offset for the full back refs + * is the first byte of parent block. + * + * When implicit back refs is used, information about the lowest key and + * level of the tree block are required. 
These information are stored in + * tree block info structure. + */ + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int convert_extent_item_v0(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 owner, u32 extra_size) +{ + struct btrfs_extent_item *item; + struct btrfs_extent_item_v0 *ei0; + struct btrfs_extent_ref_v0 *ref0; + struct btrfs_tree_block_info *bi; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + u32 new_size = sizeof(*item); + u64 refs; + int ret; + + leaf = path->nodes[0]; + BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + refs = btrfs_extent_refs_v0(leaf, ei0); + + if (owner == (u64)-1) { + while (1) { + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0]); + BUG_ON(key.objectid != found_key.objectid); + if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { + path->slots[0]++; + continue; + } + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + owner = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + } + btrfs_release_path(path); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) + new_size += sizeof(*bi); + + new_size -= sizeof(*ei0); + ret = btrfs_search_slot(trans, root, &key, path, + new_size + extra_size, 1); + if (ret < 0) + return ret; + BUG_ON(ret); + + ret = btrfs_extend_item(trans, root, path, new_size); + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, refs); + /* FIXME: get real generation */ + btrfs_set_extent_generation(leaf, item, 0); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_set_extent_flags(leaf, item, + BTRFS_EXTENT_FLAG_TREE_BLOCK | + BTRFS_BLOCK_FLAG_FULL_BACKREF); + bi = (struct btrfs_tree_block_info *)(item + 1); + /* FIXME: get first key of the block */ + memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); + btrfs_set_tree_block_level(leaf, bi, (int)owner); + } else { + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} +#endif + +static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +{ + u32 high_crc = ~(u32)0; + u32 low_crc = ~(u32)0; + __le64 lenum; + + lenum = cpu_to_le64(root_objectid); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(owner); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(offset); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + + return ((u64)high_crc << 31) ^ (u64)low_crc; +} + +static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref) +{ + return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), + btrfs_extent_data_ref_objectid(leaf, ref), + btrfs_extent_data_ref_offset(leaf, ref)); +} + +static int match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) +{ + if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != owner || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + return 0; + return 1; +} + +static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root 
*root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, + u64 owner, u64 offset) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref; + struct extent_buffer *leaf; + u32 nritems; + int ret; + int recow; + int err = -ENOENT; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + } +again: + recow = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + + if (parent) { + if (!ret) + return 0; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + key.type = BTRFS_EXTENT_REF_V0_KEY; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (!ret) + return 0; +#endif + goto fail; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + err = ret; + if (ret) + goto fail; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != bytenr || + key.type != BTRFS_EXTENT_DATA_REF_KEY) + goto fail; + + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + err = 0; + break; + } + path->slots[0]++; + } +fail: + return err; +} + +static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u32 size; + u32 num_refs; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + size = sizeof(struct btrfs_shared_data_ref); + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + size = sizeof(struct btrfs_extent_data_ref); + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + if (parent) { + struct btrfs_shared_data_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + if (ret == 0) { + btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_shared_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_shared_data_ref_count(leaf, ref, num_refs); + } + } else { + struct btrfs_extent_data_ref *ref; + while (ret == -EEXIST) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) + break; + btrfs_release_path(path); + key.offset++; + ret = btrfs_insert_empty_item(trans, root, path, &key, + size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + } + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (ret == 0) { + btrfs_set_extent_data_ref_root(leaf, ref, + root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = 
btrfs_extent_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_extent_data_ref_count(leaf, ref, num_refs); + } + } + btrfs_mark_buffer_dirty(leaf); + ret = 0; +fail: + btrfs_release_path(path); + return ret; +} + +static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref1 = NULL; + struct btrfs_shared_data_ref *ref2 = NULL; + struct extent_buffer *leaf; + u32 num_refs = 0; + int ret = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + BUG(); + } + + BUG_ON(num_refs < refs_to_drop); + num_refs -= refs_to_drop; + + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + } else { + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); + else if (key.type == BTRFS_SHARED_DATA_REF_KEY) + btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + else { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + btrfs_set_ref_count_v0(leaf, ref0, num_refs); + } +#endif + btrfs_mark_buffer_dirty(leaf); + } + return ret; +} + +static noinline u32 extent_data_ref_count(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + struct btrfs_shared_data_ref *ref2; + u32 num_refs = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (iref) { + if (btrfs_extent_inline_ref_type(leaf, iref) == + BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else { + ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + WARN_ON(1); + } + return num_refs; +} + +static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { 
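+ /*
+ * Full backref: the block is no longer referenced by its owner tree,
+ * so the ref item is keyed by the parent block's bytenr,
+ * (bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent).  The else branch below
+ * builds the implicit form keyed by the owner root,
+ * (bytenr, BTRFS_TREE_BLOCK_REF_KEY, root_objectid).
+ */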
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ret == -ENOENT && parent) { + btrfs_release_path(path); + key.type = BTRFS_EXTENT_REF_V0_KEY; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + } +#endif + return ret; +} + +static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + btrfs_release_path(path); + return ret; +} + +static inline int extent_ref_type(u64 parent, u64 owner) +{ + int type; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (parent > 0) + type = BTRFS_SHARED_BLOCK_REF_KEY; + else + type = BTRFS_TREE_BLOCK_REF_KEY; + } else { + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + } + return type; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 >= + btrfs_header_nritems(path->nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + else + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + return 1; +} + +/* + * look for inline back ref. if back ref is found, *ref_ret is set + * to the address of inline back ref, and 0 is returned. + * + * if back ref isn't found, *ref_ret is set to the address where it + * should be inserted, and -ENOENT is returned. + * + * if insert is true and there are too many inline back refs, the path + * points to the extent item, and -EAGAIN is returned. + * + * NOTE: inline back refs are ordered in the same way that back ref + * items in the tree are ordered. 
+ */ +static noinline_for_stack +int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int insert) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + u64 flags; + u64 item_size; + unsigned long ptr; + unsigned long end; + int extra_size; + int type; + int want; + int ret; + int err = 0; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + want = extent_ref_type(parent, owner); + if (insert) { + extra_size = btrfs_extent_inline_ref_size(want); + path->keep_locks = 1; + } else + extra_size = -1; + ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + if (!insert) { + err = -ENOENT; + goto out; + } + ret = convert_extent_item_v0(trans, root, path, owner, + extra_size); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + err = -ENOENT; + while (1) { + if (ptr >= end) { + WARN_ON(ptr > end); + break; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + if (want < type) + break; + if (want > type) { + ptr += btrfs_extent_inline_ref_size(type); + continue; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (match_extent_data_ref(leaf, dref, root_objectid, + owner, offset)) { + err = 0; + break; + } + if (hash_extent_data_ref_item(leaf, dref) < + hash_extent_data_ref(root_objectid, owner, offset)) + break; + } else { + u64 ref_offset; + ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (parent > 0) { + if (parent == ref_offset) { + err = 0; + break; + } + if (ref_offset < parent) + break; + } else { + if (root_objectid == ref_offset) { + err = 0; + break; + } + if (ref_offset < root_objectid) + break; + } + } + ptr += btrfs_extent_inline_ref_size(type); + } + if (err == -ENOENT && insert) { + if (item_size + extra_size >= + BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { + err = -EAGAIN; + goto out; + } + /* + * To add new inline back ref, we have to make sure + * there is no corresponding back ref item. 
+ * For simplicity, we just do not add new inline back + * ref if there is any kind of item for this block + */ + if (find_next_key(path, 0, &key) == 0 && + key.objectid == bytenr && + key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + err = -EAGAIN; + goto out; + } + } + *ref_ret = (struct btrfs_extent_inline_ref *)ptr; +out: + if (insert) { + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + } + return err; +} + +/* + * helper to add new inline back ref + */ +static noinline_for_stack +int setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + unsigned long ptr; + unsigned long end; + unsigned long item_offset; + u64 refs; + int size; + int type; + int ret; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + item_offset = (unsigned long)iref - (unsigned long)ei; + + type = extent_ref_type(parent, owner); + size = btrfs_extent_inline_ref_size(type); + + ret = btrfs_extend_item(trans, root, path, size); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + refs += refs_to_add; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; + end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); + + iref = (struct btrfs_extent_inline_ref *)ptr; + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, dref, owner); + btrfs_set_extent_data_ref_offset(leaf, dref, offset); + btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + sref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, ref_ret, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 0); + if (ret != -ENOENT) + return ret; + + btrfs_release_path(path); + *ref_ret = NULL; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, + root_objectid); + } else { + ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, + root_objectid, owner, offset); + } + return ret; +} + +/* + * helper to update/remove inline back ref + */ +static noinline_for_stack +int update_inline_extent_backref(struct btrfs_trans_handle 
*trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_data_ref *dref = NULL; + struct btrfs_shared_data_ref *sref = NULL; + unsigned long ptr; + unsigned long end; + u32 item_size; + int size; + int type; + int ret; + u64 refs; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + refs += refs_to_mod; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + type = btrfs_extent_inline_ref_type(leaf, iref); + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + refs = btrfs_extent_data_ref_count(leaf, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + sref = (struct btrfs_shared_data_ref *)(iref + 1); + refs = btrfs_shared_data_ref_count(leaf, sref); + } else { + refs = 1; + BUG_ON(refs_to_mod != -1); + } + + BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + refs += refs_to_mod; + + if (refs > 0) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, dref, refs); + else + btrfs_set_shared_data_ref_count(leaf, sref, refs); + } else { + size = btrfs_extent_inline_ref_size(type); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) + memmove_extent_buffer(leaf, ptr, ptr + size, + end - ptr - size); + item_size -= size; + ret = btrfs_truncate_item(trans, root, path, item_size, 1); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int insert_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_extent_inline_ref *iref; + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 1); + if (ret == 0) { + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + ret = update_inline_extent_backref(trans, root, path, iref, + refs_to_add, extent_op); + } else if (ret == -ENOENT) { + ret = setup_inline_extent_backref(trans, root, path, iref, + parent, root_objectid, + owner, offset, refs_to_add, + extent_op); + } + return ret; +} + +static int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add) +{ + int ret; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, root, path, bytenr, + parent, root_objectid); + } else { + ret = insert_extent_data_ref(trans, root, path, bytenr, + parent, root_objectid, + owner, offset, refs_to_add); + } + return ret; +} + +static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_drop, int is_data) +{ + int ret; + + BUG_ON(!is_data && refs_to_drop != 1); + if (iref) { + ret = update_inline_extent_backref(trans, root, path, iref, + -refs_to_drop, NULL); + } else if (is_data) { + ret = 
remove_extent_data_ref(trans, root, path, refs_to_drop); + } else { + ret = btrfs_del_item(trans, root, path); + } + return ret; +} + +static int btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) +{ + return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); +} + +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) +{ + int ret; + u64 discarded_bytes = 0; + struct btrfs_multi_bio *multi = NULL; + + + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, + bytenr, &num_bytes, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; + + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + if (!stripe->dev->can_discard) + continue; + + ret = btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + stripe->length); + if (!ret) + discarded_bytes += stripe->length; + else if (ret != -EOPNOTSUPP) + break; + + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. + */ + ret = 0; + } + kfree(multi); + } + + if (actual_bytes) + *actual_bytes = discarded_bytes; + + + return ret; +} + +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && + root_objectid == BTRFS_TREE_LOG_OBJECTID); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_REF, NULL); + } + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_item *item; + u64 refs; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; + /* this will setup the path even if it fails to insert the back ref */ + ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, num_bytes, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + if (ret == 0) + goto out; + + if (ret != -EAGAIN) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, item); + btrfs_set_extent_refs(leaf, item, refs + refs_to_add); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, item); + + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + path->reada = 1; + path->leave_spinning = 1; + + /* now insert the actual backref */ + ret = insert_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, parent, root_objectid, + owner, offset, refs_to_add); + BUG_ON(ret); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + 
struct btrfs_delayed_data_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + u64 flags = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_data_ref(node); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + if (extent_op) { + BUG_ON(extent_op->update_key); + flags |= extent_op->flags_to_set; + } + ret = alloc_reserved_file_extent(trans, root, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else { + BUG(); + } + return ret; +} + +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei) +{ + u64 flags = btrfs_extent_flags(leaf, ei); + if (extent_op->update_flags) { + flags |= extent_op->flags_to_set; + btrfs_set_extent_flags(leaf, ei, flags); + } + + if (extent_op->update_key) { + struct btrfs_tree_block_info *bi; + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_set_tree_block_key(leaf, bi, &extent_op->key); + } +} + +static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + u32 item_size; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + path->reada = 1; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + err = -EIO; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + ret = convert_extent_item_v0(trans, root->fs_info->extent_root, + path, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + __run_delayed_extent_op(extent_op, leaf, ei); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return err; +} + +static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_tree_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_tree_ref(node); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) + parent = ref->parent; + else + ref_root 
= ref->root; + + BUG_ON(node->ref_mod != 1); + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + BUG_ON(!extent_op || !extent_op->update_flags || + !extent_op->update_key); + ret = alloc_reserved_tree_block(trans, root, + parent, ref_root, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else { + BUG(); + } + return ret; +} + +/* helper function to actually process a single delayed ref entry */ +static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret; + if (btrfs_delayed_ref_is_head(node)) { + struct btrfs_delayed_ref_head *head; + /* + * we've hit the end of the chain and we were supposed + * to insert this extent into the tree. But, it got + * deleted before we ever needed to insert it, so all + * we have to do is clean up the accounting + */ + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); + if (insert_reserved) { + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } + } + mutex_unlock(&head->mutex); + return 0; + } + + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = run_delayed_tree_ref(trans, root, node, extent_op, + insert_reserved); + else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + ret = run_delayed_data_ref(trans, root, node, extent_op, + insert_reserved); + else + BUG(); + return ret; +} + +static noinline struct btrfs_delayed_ref_node * +select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + int action = BTRFS_ADD_DELAYED_REF; +again: + /* + * select delayed ref of type BTRFS_ADD_DELAYED_REF first. + * this prevents ref count from going down to zero when + * there still are pending delayed ref. 
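+ * For example, if a head still has both an ADD_DELAYED_REF and a + * DROP_DELAYED_REF queued, the loop below hands back the ADD first; + * the DROP is only picked up on the second pass once no ADDs remain.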
+ */ + node = rb_prev(&head->node.rb_node); + while (1) { + if (!node) + break; + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr != head->node.bytenr) + break; + if (ref->action == action) + return ref; + node = rb_prev(node); + } + if (action == BTRFS_ADD_DELAYED_REF) { + action = BTRFS_DROP_DELAYED_REF; + goto again; + } + return NULL; +} + +static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct list_head *cluster) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *locked_ref = NULL; + struct btrfs_delayed_extent_op *extent_op; + int ret; + int count = 0; + int must_insert_reserved = 0; + + delayed_refs = &trans->transaction->delayed_refs; + while (1) { + if (!locked_ref) { + /* pick a new head ref from the cluster list */ + if (list_empty(cluster)) + break; + + locked_ref = list_entry(cluster->next, + struct btrfs_delayed_ref_head, cluster); + + /* grab the lock that says we are going to process + * all the refs for this head */ + ret = btrfs_delayed_ref_lock(trans, locked_ref); + + /* + * we may have dropped the spin lock to get the head + * mutex lock, and that might have given someone else + * time to free the head. If that's true, it has been + * removed from our list and we can move on. + */ + if (ret == -EAGAIN) { + locked_ref = NULL; + count++; + continue; + } + } + + /* + * record the must insert reserved flag before we + * drop the spin lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; + + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + if (!ref) { + /* All delayed refs have been processed, Go ahead + * and send the head node to run_one_delayed_ref, + * so that any accounting fixes can happen + */ + ref = &locked_ref->node; + + if (extent_op && must_insert_reserved) { + kfree(extent_op); + extent_op = NULL; + } + + if (extent_op) { + spin_unlock(&delayed_refs->lock); + + ret = run_delayed_extent_op(trans, root, + ref, extent_op); + BUG_ON(ret); + kfree(extent_op); + + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } + + list_del_init(&locked_ref->cluster); + locked_ref = NULL; + } + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + spin_unlock(&delayed_refs->lock); + + ret = run_one_delayed_ref(trans, root, ref, extent_op, + must_insert_reserved); + BUG_ON(ret); + + btrfs_put_delayed_ref(ref); + kfree(extent_op); + count++; + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + return count; +} + +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. 
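+ * As a rough illustration, count == 0 becomes about twice the number + * of entries queued when the run starts, while (unsigned long)-1 + * keeps looping (and briefly sleeping) until the ref tree is empty.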
+ */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct list_head cluster; + int ret; + int run_all = count == (unsigned long)-1; + int run_most = 0; + + if (root == root->fs_info->extent_root) + root = root->fs_info->tree_root; + + delayed_refs = &trans->transaction->delayed_refs; + INIT_LIST_HEAD(&cluster); +again: + spin_lock(&delayed_refs->lock); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + while (1) { + if (!(run_all || run_most) && + delayed_refs->num_heads_ready < 64) + break; + + /* + * go find something we can process in the rbtree. We start at + * the beginning of the tree, and then build a cluster + * of refs to process starting at the first one we are able to + * lock + */ + ret = btrfs_find_ref_cluster(trans, &cluster, + delayed_refs->run_delayed_start); + if (ret) + break; + + ret = run_clustered_refs(trans, root, &cluster); + BUG_ON(ret < 0); + + count -= min_t(unsigned long, ret, count); + + if (count == 0) + break; + } + + if (run_all) { + node = rb_first(&delayed_refs->root); + if (!node) + goto out; + count = (unsigned long)-1; + + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + atomic_inc(&ref->refs); + + spin_unlock(&delayed_refs->lock); + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + + btrfs_put_delayed_ref(ref); + cond_resched(); + goto again; + } + node = rb_next(node); + } + spin_unlock(&delayed_refs->lock); + schedule_timeout(1); + goto again; + } +out: + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + if (!extent_op) + return -ENOMEM; + + extent_op->flags_to_set = flags; + extent_op->update_flags = 1; + extent_op->update_key = 0; + extent_op->is_data = is_data ? 
1 : 0; + + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + if (ret) + kfree(extent_op); + return ret; +} + +static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_data_ref *data_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + int ret = 0; + + ret = -ENOENT; + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; + + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's released and let + * caller try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + + node = rb_prev(&head->node.rb_node); + if (!node) + goto out_unlock; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + if (ref->bytenr != bytenr) + goto out_unlock; + + ret = 1; + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) + goto out_unlock; + + data_ref = btrfs_delayed_node_to_data_ref(ref); + + node = rb_prev(node); + if (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (ref->bytenr == bytenr) + goto out_unlock; + } + + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || data_ref->offset != offset) + goto out_unlock; + + ret = 0; +out_unlock: + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + +static noinline int check_committed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u32 item_size; + int ret; + + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; + + ret = 1; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + goto out; + } +#endif + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + + if (item_size != sizeof(*ei) + + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + goto out; + + if (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + if (btrfs_extent_inline_ref_type(leaf, iref) != + BTRFS_EXTENT_DATA_REF_KEY) + goto out; + + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (btrfs_extent_refs(leaf, ei) != + btrfs_extent_data_ref_count(leaf, ref) || + btrfs_extent_data_ref_root(leaf, ref) != + root->root_key.objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + 
btrfs_extent_data_ref_offset(leaf, ref) != offset) + goto out; + + ret = 0; +out: + return ret; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_path *path; + int ret; + int ret2; + + path = btrfs_alloc_path(); + if (!path) + return -ENOENT; + + do { + ret = check_committed_ref(trans, root, path, objectid, + offset, bytenr); + if (ret && ret != -ENOENT) + goto out; + + ret2 = check_delayed_ref(trans, root, path, objectid, + offset, bytenr); + } while (ret2 == -EAGAIN); + + if (ret2 && ret2 != -ENOENT) { + ret = ret2; + goto out; + } + + if (ret != -ENOENT || ret2 != -ENOENT) + ret = 0; +out: + btrfs_free_path(path); + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + WARN_ON(ret > 0); + return ret; +} + +static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + int full_backref, int inc) +{ + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 ref_root; + u32 nritems; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int level; + int ret = 0; + int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, + u64, u64, u64, u64, u64, u64); + + ref_root = btrfs_header_owner(buf); + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); + + if (!root->ref_cows && level == 0) + return 0; + + if (inc) + process_func = btrfs_inc_extent_ref; + else + process_func = btrfs_free_extent; + + if (full_backref) + parent = buf->start; + else + parent = 0; + + for (i = 0; i < nritems; i++) { + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + key.offset -= btrfs_file_extent_offset(buf, fi); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, key.objectid, + key.offset); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, i); + num_bytes = btrfs_level_size(root, level - 1); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, level - 1, 0); + if (ret) + goto fail; + } + } + return 0; +fail: + BUG(); + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +} + +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + int ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret < 0) + goto fail; + BUG_ON(ret); + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); +fail: + if (ret) + return ret; + return 0; + +} + +static struct 
btrfs_block_group_cache * +next_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + btrfs_get_block_group(cache); + } else + cache = NULL; + spin_unlock(&root->fs_info->block_group_cache_lock); + return cache; +} + +static int cache_save_setup(struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_root *root = block_group->fs_info->tree_root; + struct inode *inode = NULL; + u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; + int num_pages = 0; + int retries = 0; + int ret = 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->key.offset < (100 * 1024 * 1024)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + +again: + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(root, trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. + */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + WARN_ON(ret); + + if (i_size_read(inode) > 0) { + ret = btrfs_truncate_free_space_cache(root, trans, path, + inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED) { + /* We're not cached, don't bother trying to write stuff out */ + dcs = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); + if (!num_pages) + num_pages = 1; + + /* + * Just to make absolutely sure we have enough space, we're going to + * preallocate 12 pages worth of space for each block group. In + * practice we ought to use at most 8, but we need extra space so we can + * add our header and have a terminator between the extents and the + * bitmaps. 
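+ * As a rough worked example, a block group of up to 1GB starts with + * num_pages == 1, which the multiplication below turns into 16 pages, + * i.e. 64KB of preallocated cache space assuming 4KB pages.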
+ */ + num_pages *= 16; + num_pages *= PAGE_CACHE_SIZE; + + ret = btrfs_check_data_free_space(inode, num_pages); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, + num_pages, num_pages, + &alloc_hint); + if (!ret) + dcs = BTRFS_DC_SETUP; + btrfs_free_reserved_data_space(inode, num_pages); +out_put: + iput(inode); +out_free: + btrfs_release_path(path); +out: + spin_lock(&block_group->lock); + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); + + return ret; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + int err = 0; + struct btrfs_path *path; + u64 last = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + err = cache_save_setup(cache, trans, path); + last = cache->key.objectid + cache->key.offset; + btrfs_put_block_group(cache); + } + + while (1) { + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); + } + + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) { + btrfs_put_block_group(cache); + goto again; + } + + if (cache->dirty) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + + if (cache->disk_cache_state == BTRFS_DC_SETUP) + cache->disk_cache_state = BTRFS_DC_NEED_WRITE; + cache->dirty = 0; + last = cache->key.objectid + cache->key.offset; + + err = write_one_cache_group(trans, root, path, cache); + BUG_ON(err); + btrfs_put_block_group(cache); + } + + while (1) { + /* + * I don't think this is needed since we're just marking our + * preallocated extent as written, but just in case it can't + * hurt. + */ + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); + } + + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + /* + * Really this shouldn't happen, but it could if we + * couldn't write the entire preallocated extent and + * splitting the extent resulted in a new block. + */ + if (cache->dirty) { + btrfs_put_block_group(cache); + goto again; + } + if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + + btrfs_write_out_cache(root, trans, cache, path); + + /* + * If we didn't have an error then the cache state is still + * NEED_WRITE, so we can set it to WRITTEN. 
+ */ + if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + cache->disk_cache_state = BTRFS_DC_WRITTEN; + last = cache->key.objectid + cache->key.offset; + btrfs_put_block_group(cache); + } + + btrfs_free_path(path); + return 0; +} + +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + int readonly = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = 1; + if (block_group) + btrfs_put_block_group(block_group); + return readonly; +} + +static int update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int i; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + found = __find_space_info(info, flags); + if (found) { + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->full = 0; + spin_unlock(&found->lock); + *space_info = found; + return 0; + } + found = kzalloc(sizeof(*found), GFP_NOFS); + if (!found) + return -ENOMEM; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&found->block_groups[i]); + init_rwsem(&found->groups_sem); + spin_lock_init(&found->lock); + found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | + BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA); + found->total_bytes = total_bytes; + found->disk_total = total_bytes * factor; + found->bytes_used = bytes_used; + found->disk_used = bytes_used * factor; + found->bytes_pinned = 0; + found->bytes_reserved = 0; + found->bytes_readonly = 0; + found->bytes_may_use = 0; + found->full = 0; + found->force_alloc = CHUNK_ALLOC_NO_FORCE; + found->chunk_alloc = 0; + *space_info = found; + list_add_rcu(&found->list, &info->space_info); + atomic_set(&found->caching_threads, 0); + return 0; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } +} + +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +{ + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. 
+ */ + u64 num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; + + if (num_devices == 1) + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + if (num_devices < 4) + flags &= ~BTRFS_BLOCK_GROUP_RAID10; + + if ((flags & BTRFS_BLOCK_GROUP_DUP) && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) { + flags &= ~BTRFS_BLOCK_GROUP_DUP; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID1) && + (flags & BTRFS_BLOCK_GROUP_RAID10)) { + flags &= ~BTRFS_BLOCK_GROUP_RAID1; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID0) && + ((flags & BTRFS_BLOCK_GROUP_RAID1) | + (flags & BTRFS_BLOCK_GROUP_RAID10) | + (flags & BTRFS_BLOCK_GROUP_DUP))) + flags &= ~BTRFS_BLOCK_GROUP_RAID0; + return flags; +} + +static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits & + root->fs_info->data_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits & + root->fs_info->system_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits & + root->fs_info->metadata_alloc_profile; + return btrfs_reduce_alloc_profile(root, flags); +} + +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +{ + u64 flags; + + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == root->fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + + return get_alloc_profile(root, flags); +} + +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) +{ + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_DATA); +} + +/* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 used; + int ret = 0, committed = 0, alloc_chunk = 1; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + if (root == root->fs_info->tree_root || + BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { + alloc_chunk = 0; + committed = 1; + } + + data_sinfo = BTRFS_I(inode)->space_info; + if (!data_sinfo) + goto alloc; + +again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + + data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + + data_sinfo->bytes_may_use; + + if (used + bytes > data_sinfo->total_bytes) { + struct btrfs_trans_handle *trans; + + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. 
+ */ + if (!data_sinfo->full && alloc_chunk) { + u64 alloc_target; + + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&data_sinfo->lock); +alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + bytes + 2 * 1024 * 1024, + alloc_target, + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else + goto commit_trans; + } + + if (!data_sinfo) { + btrfs_set_inode_space_info(root, inode); + data_sinfo = BTRFS_I(inode)->space_info; + } + goto again; + } + + /* + * If we have less pinned bytes than we want to allocate then + * don't bother committing the transaction, it won't help us. + */ + if (data_sinfo->bytes_pinned < bytes) + committed = 1; + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ +commit_trans: + if (!committed && + !atomic_read(&root->fs_info->open_ioctl_trans)) { + committed = 1; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + + return -ENOSPC; + } + data_sinfo->bytes_may_use += bytes; + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + + return 0; +} + +/* + * called when we are clearing an delalloc extent from the + * inode's io_tree or there was an error for whatever reason + * after calling btrfs_check_data_free_space + */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_space_info *data_sinfo; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + spin_unlock(&data_sinfo->lock); +} + +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = CHUNK_ALLOC_FORCE; + } + rcu_read_unlock(); +} + +static int should_alloc_chunk(struct btrfs_root *root, + struct btrfs_space_info *sinfo, u64 alloc_bytes, + int force) +{ + u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; + u64 thresh; + + if (force == CHUNK_ALLOC_FORCE) + return 1; + + /* + * in limited mode, we want to have some free space up to + * about 1% of the FS size. + */ + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = max_t(u64, 64 * 1024 * 1024, + div_factor_fine(thresh, 1)); + + if (num_bytes - num_allocated < thresh) + return 1; + } + + /* + * we have two similar checks here, one based on percentage + * and once based on a hard number of 256MB. The idea + * is that if we have a good amount of free + * room, don't allocate a chunk. 
A good mount is + * less than 80% utilized of the chunks we have allocated, + * or more than 256MB free + */ + if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) + return 0; + + if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) + return 0; + + thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + + /* 256MB or 5% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + return 0; + return 1; +} + +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force) +{ + struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; + int wait_for_alloc = 0; + int ret = 0; + + flags = btrfs_reduce_alloc_profile(extent_root, flags); + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); + } + BUG_ON(!space_info); + +again: + spin_lock(&space_info->lock); + if (space_info->force_alloc) + force = space_info->force_alloc; + if (space_info->full) { + spin_unlock(&space_info->lock); + return 0; + } + + if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { + spin_unlock(&space_info->lock); + return 0; + } else if (space_info->chunk_alloc) { + wait_for_alloc = 1; + } else { + space_info->chunk_alloc = 1; + } + + spin_unlock(&space_info->lock); + + mutex_lock(&fs_info->chunk_mutex); + + /* + * The chunk_mutex is held throughout the entirety of a chunk + * allocation, so once we've acquired the chunk_mutex we know that the + * other guy is done and we need to recheck and see if we should + * allocate. + */ + if (wait_for_alloc) { + mutex_unlock(&fs_info->chunk_mutex); + wait_for_alloc = 0; + goto again; + } + + /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. 
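+ * For example, with metadata_ratio set to 4, every 4th data chunk + * allocation below also flips the metadata space_info to + * CHUNK_ALLOC_FORCE via force_metadata_allocation().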
+ */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + + ret = btrfs_alloc_chunk(trans, extent_root, flags); + spin_lock(&space_info->lock); + if (ret) + space_info->full = 1; + else + ret = 1; + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +} + +/* + * shrink metadata reservation for delalloc + */ +static int shrink_delalloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 to_reclaim, int sync) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_space_info *space_info; + u64 reserved; + u64 max_reclaim; + u64 reclaimed = 0; + long time_left; + int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + int loops = 0; + unsigned long progress; + + block_rsv = &root->fs_info->delalloc_block_rsv; + space_info = block_rsv->space_info; + + smp_mb(); + reserved = space_info->bytes_reserved; + progress = space_info->reservation_progress; + + if (reserved == 0) + return 0; + + max_reclaim = min(reserved, to_reclaim); + + while (loops < 1024) { + /* have the flusher threads jump in and do some IO */ + smp_mb(); + nr_pages = min_t(unsigned long, nr_pages, + root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + + spin_lock(&space_info->lock); + if (reserved > space_info->bytes_reserved) + reclaimed += reserved - space_info->bytes_reserved; + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); + + loops++; + + if (reserved == 0 || reclaimed >= max_reclaim) + break; + + if (trans && trans->transaction->blocked) + return -EAGAIN; + + time_left = schedule_timeout_interruptible(1); + + /* We were interrupted, exit */ + if (time_left) + break; + + /* we've kicked the IO a few times, if anything has been freed, + * exit. There is no sense in looping here for a long time + * when we really need to commit the transaction, or there are + * just too many writers without enough free space + */ + + if (loops > 3) { + smp_mb(); + if (progress != space_info->reservation_progress) + break; + } + + } + return reclaimed >= to_reclaim; +} + +/* + * Retries tells us how many times we've called reserve_metadata_bytes. The + * idea is if this is the first call (retries == 0) then we will add to our + * reserved count if we can't make the allocation in order to hold our place + * while we go and try and free up space. That way for retries > 1 we don't try + * and add space, we just check to see if the amount of unused space is >= the + * total space, meaning that our reservation is valid. + * + * However if we don't intend to retry this reservation, pass -1 as retries so + * that it short circuits this logic. 
+ */ +static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, int flush) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 unused; + u64 num_bytes = orig_bytes; + int retries = 0; + int ret = 0; + bool reserved = false; + bool committed = false; + +again: + ret = -ENOSPC; + if (reserved) + num_bytes = 0; + + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + + /* + * The idea here is that we've not already over-reserved the block group + * then we can go ahead and save our reservation first and then start + * flushing if we need to. Otherwise if we've already overcommitted + * lets start flushing stuff first and then come back and try to make + * our reservation. + */ + if (unused <= space_info->total_bytes) { + unused = space_info->total_bytes - unused; + if (unused >= num_bytes) { + if (!reserved) + space_info->bytes_reserved += orig_bytes; + ret = 0; + } else { + /* + * Ok set num_bytes to orig_bytes since we aren't + * overocmmitted, this way we only try and reclaim what + * we need. + */ + num_bytes = orig_bytes; + } + } else { + /* + * Ok we're over committed, set num_bytes to the overcommitted + * amount plus the amount of bytes that we need for this + * reservation. + */ + num_bytes = unused - space_info->total_bytes + + (orig_bytes * (retries + 1)); + } + + /* + * Couldn't make our reservation, save our place so while we're trying + * to reclaim space we can actually use it instead of somebody else + * stealing it from us. + */ + if (ret && !reserved) { + space_info->bytes_reserved += orig_bytes; + reserved = true; + } + + spin_unlock(&space_info->lock); + + if (!ret) + return 0; + + if (!flush) + goto out; + + /* + * We do synchronous shrinking since we don't actually unreserve + * metadata until after the IO is completed. + */ + ret = shrink_delalloc(trans, root, num_bytes, 1); + if (ret > 0) + return 0; + else if (ret < 0) + goto out; + + /* + * So if we were overcommitted it's possible that somebody else flushed + * out enough space and we simply didn't have enough space to reclaim, + * so go back around and try again. + */ + if (retries < 2) { + retries++; + goto again; + } + + spin_lock(&space_info->lock); + /* + * Not enough space to be reclaimed, don't bother committing the + * transaction. 
+ */ + if (space_info->bytes_pinned < orig_bytes) + ret = -ENOSPC; + spin_unlock(&space_info->lock); + if (ret) + goto out; + + ret = -EAGAIN; + if (trans || committed) + goto out; + + ret = -ENOSPC; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + goto out; + ret = btrfs_commit_transaction(trans, root); + if (!ret) { + trans = NULL; + committed = true; + goto again; + } + +out: + if (reserved) { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= orig_bytes; + spin_unlock(&space_info->lock); + } + + return ret; +} + +static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + if (root->ref_cows) + block_rsv = trans->block_rsv; + else + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &root->fs_info->empty_block_rsv; + + return block_rsv; +} + +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret = -ENOSPC; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} + +static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) + num_bytes = block_rsv->size; + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (num_bytes > 0) { + if (dest) { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; + spin_unlock(&space_info->lock); + } + } +} + +static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes) +{ + int ret; + + ret = block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + block_rsv_add_bytes(dst, num_bytes, 1); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +{ + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + atomic_set(&rsv->usage, 1); + rsv->priority = 6; + INIT_LIST_HEAD(&rsv->list); +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_block_rsv(block_rsv); + block_rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_root *root, + 
struct btrfs_block_rsv *rsv) +{ + if (rsv && atomic_dec_and_test(&rsv->usage)) { + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (!rsv->durable) + kfree(rsv); + } +} + +/* + * make the block_rsv struct be able to capture freed space. + * the captured space will re-add to the the block_rsv struct + * after transaction commit + */ +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv) +{ + block_rsv->durable = 1; + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); + mutex_unlock(&fs_info->durable_block_rsv_mutex); +} + +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor) +{ + u64 num_bytes = 0; + int commit_trans = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + if (min_factor > 0) + num_bytes = div_factor(block_rsv->size, min_factor); + if (min_reserved > num_bytes) + num_bytes = min_reserved; + + if (block_rsv->reserved >= num_bytes) { + ret = 0; + } else { + num_bytes -= block_rsv->reserved; + if (block_rsv->durable && + block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) + commit_trans = 1; + } + spin_unlock(&block_rsv->lock); + if (!ret) + return 0; + + if (block_rsv->refill_used) { + ret = reserve_metadata_bytes(trans, root, block_rsv, + num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; + } + } + + if (commit_trans) { + if (trans) + return -EAGAIN; + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + return 0; + } + + return -ENOSPC; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes) +{ + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + if (global_rsv->full || global_rsv == block_rsv || + block_rsv->space_info != global_rsv->space_info) + global_rsv = NULL; + block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); +} + +/* + * helper to calculate size of global block reservation. 
+ * the desired value is sum of space used by extent tree, + * checksum tree and root tree + */ +static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *sinfo; + u64 num_bytes; + u64 meta_used; + u64 data_used; + int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + spin_lock(&sinfo->lock); + data_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&sinfo->lock); + if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) + data_used = 0; + meta_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * + csum_size * 2; + num_bytes += div64_u64(data_used + meta_used, 50); + + if (num_bytes * 3 > meta_used) + num_bytes = div64_u64(meta_used, 3); + + return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); +} + +static void update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; + + num_bytes = calc_global_metadata_size(fs_info); + + spin_lock(&block_rsv->lock); + spin_lock(&sinfo->lock); + + block_rsv->size = num_bytes; + + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + + sinfo->bytes_reserved + sinfo->bytes_readonly + + sinfo->bytes_may_use; + + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + block_rsv->reserved += num_bytes; + sinfo->bytes_reserved += num_bytes; + } + + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + sinfo->bytes_reserved -= num_bytes; + sinfo->reservation_progress++; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } + + spin_unlock(&sinfo->lock); + spin_unlock(&block_rsv->lock); +} + +static void init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + fs_info->chunk_block_rsv.priority = 10; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->global_block_rsv.priority = 10; + fs_info->global_block_rsv.refill_used = 1; + fs_info->delalloc_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.priority = 10; + + fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; + fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); + + btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); + + update_global_block_rsv(fs_info); +} + +static void release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + WARN_ON(fs_info->delalloc_block_rsv.size > 0); + WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 
+} + +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; + u64 num_bytes; + int ret; + + /* + * Truncate should be freeing data, but give us 2 items just in case it + * needs to use some space. We may want to be smarter about this in the + * future. + */ + num_bytes = btrfs_calc_trans_metadata_size(root, 2); + + /* We already have enough bytes, just return */ + if (rsv->reserved >= num_bytes) + return 0; + + num_bytes -= rsv->reserved; + + /* + * You should have reserved enough space before hand to do this, so this + * should not fail. + */ + ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); + BUG_ON(ret); + + return 0; +} + +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items) +{ + u64 num_bytes; + int ret; + + if (num_items == 0 || root->fs_info->chunk_root == root) + return 0; + + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, + num_bytes); + if (!ret) { + trans->bytes_reserved += num_bytes; + trans->block_rsv = &root->fs_info->trans_block_rsv; + } + return ret; +} + +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans->bytes_reserved) + return; + + BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->bytes_reserved = 0; +} + +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; + + /* + * We need to hold space in order to delete our orphan item once we've + * added it, so this takes the reservation so we can release it later + * when we are truly done with the orphan item. + */ + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_orphan_release_metadata(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); +} + +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; + /* + * two for root back/forward refs, two for directory entries + * and one for root of the snapshot. 
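+ * That is 2 + 2 + 1 = 5 items, hence the metadata size for 5 items + * below, migrated from the source reservation into the pending + * snapshot's block_rsv.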
+ */ + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); + dst_rsv->space_info = src_rsv->space_info; + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +{ + return num_bytes >>= 3; +} + +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + u64 to_reserve; + int nr_extents; + int reserved_extents; + int ret; + + if (btrfs_transaction_in_commit(root->fs_info)) + schedule_timeout(1); + + num_bytes = ALIGN(num_bytes, root->sectorsize); + + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + + if (nr_extents > reserved_extents) { + nr_extents -= reserved_extents; + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + } else { + nr_extents = 0; + to_reserve = 0; + } + + to_reserve += calc_csum_metadata_size(inode, num_bytes); + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + if (ret) + return ret; + + atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents); + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + + block_rsv_add_bytes(block_rsv, to_reserve, 1); + + if (block_rsv->size > 512 * 1024 * 1024) + shrink_delalloc(NULL, root, to_reserve, 0); + + return 0; +} + +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 to_free; + int nr_extents; + int reserved_extents; + + num_bytes = ALIGN(num_bytes, root->sectorsize); + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); + + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + do { + int old, new; + + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents >= reserved_extents) { + nr_extents = 0; + break; + } + old = reserved_extents; + nr_extents = reserved_extents - nr_extents; + new = reserved_extents - nr_extents; + old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents, + reserved_extents, new); + if (likely(old == reserved_extents)) + break; + reserved_extents = old; + } while (1); + + to_free = calc_csum_metadata_size(inode, num_bytes); + if (nr_extents > 0) + to_free += btrfs_calc_trans_metadata_size(root, nr_extents); + + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, + to_free); +} + +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, num_bytes); + if (ret) + return ret; + + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); + if (ret) { + btrfs_free_reserved_data_space(inode, num_bytes); + return ret; + } + + return 0; +} + +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +{ + btrfs_delalloc_release_metadata(inode, num_bytes); + btrfs_free_reserved_data_space(inode, num_bytes); +} + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc) +{ + struct btrfs_block_group_cache *cache = NULL; + struct btrfs_fs_info *info = root->fs_info; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + int factor; + + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + old_val = btrfs_super_bytes_used(&info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= 
num_bytes; + btrfs_set_super_bytes_used(&info->super_copy, old_val); + spin_unlock(&info->delalloc_lock); + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -1; + if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. + */ + if (!alloc && cache->cached == BTRFS_CACHE_NO) + cache_block_group(cache, trans, NULL, 1); + + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + + if (btrfs_super_cache_generation(&info->super_copy) != 0 && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + + cache->dirty = 1; + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); + } + btrfs_put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + return 0; +} + +static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +{ + struct btrfs_block_group_cache *cache; + u64 bytenr; + + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + if (!cache) + return 0; + + bytenr = cache->key.objectid; + btrfs_put_block_group(cache); + + return bytenr; +} + +static int pin_down_extent(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 bytenr, u64 num_bytes, int reserved) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + if (reserved) { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + set_extent_dirty(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + return 0; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + pin_down_extent(root, cache, bytenr, num_bytes, reserved); + + btrfs_put_block_group(cache); + return 0; +} + +/* + * update size of reserved extents. this function may return -EAGAIN + * if 'reserve' is true or 'sinfo' is false. 
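For the DUP, RAID1 and RAID10 profiles every logical byte occupies two bytes on disk, which is why update_block_group() above applies a factor of 2 when it adjusts disk_used. A small sketch of that accounting, using illustrative flag values rather than the real btrfs constants:

#include <stdio.h>
#include <stdint.h>

#define BG_DUP     (1ULL << 0)	/* hypothetical bit assignments for the demo */
#define BG_RAID1   (1ULL << 1)
#define BG_RAID10  (1ULL << 2)

static int mirror_factor(uint64_t flags)
{
	/* two copies on disk for DUP/RAID1/RAID10, one copy otherwise */
	return (flags & (BG_DUP | BG_RAID1 | BG_RAID10)) ? 2 : 1;
}

int main(void)
{
	uint64_t alloc = 1 << 20;	/* allocate 1 MiB of logical space */
	uint64_t flags = BG_RAID1;

	printf("bytes_used grows by %llu, disk_used grows by %llu\n",
	       (unsigned long long)alloc,
	       (unsigned long long)(alloc * mirror_factor(flags)));
	return 0;
}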
+ */ +int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo) +{ + int ret = 0; + if (sinfo) { + struct btrfs_space_info *space_info = cache->space_info; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + } + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + } else { + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + } else { + if (reserve) + cache->reserved += num_bytes; + else + cache->reserved -= num_bytes; + } + spin_unlock(&cache->lock); + } + return ret; +} + +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_caching_control *next; + struct btrfs_caching_control *caching_ctl; + struct btrfs_block_group_cache *cache; + + down_write(&fs_info->extent_commit_sem); + + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + cache = caching_ctl->block_group; + if (block_group_cache_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + put_caching_control(caching_ctl); + } else { + cache->last_byte_to_unpin = caching_ctl->progress; + } + } + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + fs_info->pinned_extents = &fs_info->freed_extents[1]; + else + fs_info->pinned_extents = &fs_info->freed_extents[0]; + + up_write(&fs_info->extent_commit_sem); + + update_global_block_rsv(fs_info); + return 0; +} + +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 len; + + while (start <= end) { + if (!cache || + start >= cache->key.objectid + cache->key.offset) { + if (cache) + btrfs_put_block_group(cache); + cache = btrfs_lookup_block_group(fs_info, start); + BUG_ON(!cache); + } + + len = cache->key.objectid + cache->key.offset - start; + len = min(len, end + 1 - start); + + if (start < cache->last_byte_to_unpin) { + len = min(len, cache->last_byte_to_unpin - start); + btrfs_add_free_space(cache, start, len); + } + + start += len; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + if (cache->ro) { + cache->space_info->bytes_readonly += len; + } else if (cache->reserved_pinned > 0) { + len = min(len, cache->reserved_pinned); + cache->reserved_pinned -= len; + cache->space_info->bytes_reserved += len; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } + + if (cache) + btrfs_put_block_group(cache); + return 0; +} + +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *unpin; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *next_rsv; + u64 start; + u64 end; + int idx; + int ret; + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + unpin = &fs_info->freed_extents[1]; + else + unpin = &fs_info->freed_extents[0]; + + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + if 
(btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, + end + 1 - start, NULL); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + unpin_extent_range(root, start, end); + cond_resched(); + } + + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_for_each_entry_safe(block_rsv, next_rsv, + &fs_info->durable_block_rsv_list, list) { + + idx = trans->transid & 0x1; + if (block_rsv->freed[idx] > 0) { + block_rsv_add_bytes(block_rsv, + block_rsv->freed[idx], 0); + block_rsv->freed[idx] = 0; + } + if (atomic_read(&block_rsv->usage) == 0) { + btrfs_block_rsv_release(root, block_rsv, (u64)-1); + + if (block_rsv->freed[0] == 0 && + block_rsv->freed[1] == 0) { + list_del_init(&block_rsv->list); + kfree(block_rsv); + } + } else { + btrfs_block_rsv_release(root, block_rsv, 0); + } + } + mutex_unlock(&fs_info->durable_block_rsv_mutex); + + return 0; +} + +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + int ret; + int is_data; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + u32 item_size; + u64 refs; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; + + is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; + BUG_ON(!is_data && refs_to_drop != 1); + + ret = lookup_extent_backref(trans, extent_root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner_objectid, + owner_offset); + if (ret == 0) { + extent_slot = path->slots[0]; + while (extent_slot >= 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + extent_slot); + if (key.objectid != bytenr) + break; + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) { + found_extent = 1; + break; + } + if (path->slots[0] - extent_slot > 5) + break; + extent_slot--; + } +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); + if (found_extent && item_size < sizeof(*ei)) + found_extent = 0; +#endif + if (!found_extent) { + BUG_ON(iref); + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, + is_data); + BUG_ON(ret); + btrfs_release_path(path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + } + } else { + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + printk(KERN_ERR "btrfs unable to find ref byte nr %llu " + "parent %llu root %llu owner %llu offset %llu\n", + (unsigned long long)bytenr, + (unsigned long long)parent, + (unsigned long long)root_objectid, + (unsigned long long)owner_objectid, + (unsigned long long)owner_offset); + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + BUG_ON(found_extent || extent_slot != 
path->slots[0]); + ret = convert_extent_item_v0(trans, extent_root, path, + owner_objectid, 0); + BUG_ON(ret < 0); + + btrfs_release_path(path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + struct btrfs_tree_block_info *bi; + BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); + } + + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs < refs_to_drop); + refs -= refs_to_drop; + + if (refs > 0) { + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + /* + * In the case of inline back ref, reference count will + * be updated by remove_extent_backref + */ + if (iref) { + BUG_ON(!found_extent); + } else { + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); + } + if (found_extent) { + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, + is_data); + BUG_ON(ret); + } + } else { + if (found_extent) { + BUG_ON(is_data && refs_to_drop != + extent_data_ref_count(root, path, iref)); + if (iref) { + BUG_ON(path->slots[0] != extent_slot); + } else { + BUG_ON(path->slots[0] != extent_slot + 1); + path->slots[0] = extent_slot; + num_to_del = 2; + } + } + + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + BUG_ON(ret); + btrfs_release_path(path); + + if (is_data) { + ret = btrfs_del_csums(trans, root, bytenr, num_bytes); + BUG_ON(ret); + } else { + invalidate_mapping_pages(info->btree_inode->i_mapping, + bytenr >> PAGE_CACHE_SHIFT, + (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); + } + + ret = update_block_group(trans, root, bytenr, num_bytes, 0); + BUG_ON(ret); + } + btrfs_free_path(path); + return ret; +} + +/* + * when we free an block, it is possible (and likely) that we free the last + * delayed ref for that extent as well. This searches the delayed ref tree for + * a given extent, and if there are no other delayed refs to be processed, it + * removes it from the tree. + */ +static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct rb_node *node; + int ret = 0; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; + + node = rb_prev(&head->node.rb_node); + if (!node) + goto out; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + /* there are still entries for this ref, we can't drop it */ + if (ref->bytenr == bytenr) + goto out; + + if (head->extent_op) { + if (!head->must_insert_reserved) + goto out; + kfree(head->extent_op); + head->extent_op = NULL; + } + + /* + * waiting for the lock here would deadlock. 
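At its core, __btrfs_free_extent() is a reference-count drop: if references remain after subtracting refs_to_drop, only a backref is removed and the extent item is kept; once the count reaches zero the item itself is deleted and the block group accounting is updated. A compact sketch of that decision, using an invented in-memory record:

#include <stdio.h>
#include <stdint.h>

struct extent_rec {
	uint64_t bytenr;
	uint64_t num_bytes;
	uint64_t refs;
	int      present;	/* still has an item in the (pretend) extent tree */
};

/* Returns 1 if the extent item was deleted outright, 0 if refs remain. */
static int drop_refs(struct extent_rec *rec, uint64_t refs_to_drop)
{
	rec->refs -= refs_to_drop;
	if (rec->refs > 0)
		return 0;	/* drop one backref, keep the item */
	rec->present = 0;	/* delete the item and free the space */
	return 1;
}

int main(void)
{
	struct extent_rec rec = { .bytenr = 4096, .num_bytes = 8192,
				  .refs = 2, .present = 1 };

	printf("deleted=%d refs=%llu\n", drop_refs(&rec, 1),
	       (unsigned long long)rec.refs);
	printf("deleted=%d refs=%llu\n", drop_refs(&rec, 1),
	       (unsigned long long)rec.refs);
	return 0;
}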
If someone else has it + * locked they are already in the process of dropping it anyway + */ + if (!mutex_trylock(&head->mutex)) + goto out; + + /* + * at this point we have a head with no other entries. Go + * ahead and process it. + */ + head->node.in_tree = 0; + rb_erase(&head->node.rb_node, &delayed_refs->root); + + delayed_refs->num_entries--; + + /* + * we don't take a ref on the node because we're removing it from the + * tree, so we just steal the ref the tree was holding. + */ + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + + list_del_init(&head->cluster); + spin_unlock(&delayed_refs->lock); + + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; + + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return ret; +out: + spin_unlock(&delayed_refs->lock); + return 0; +} + +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_group_cache *cache = NULL; + int ret; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } + + if (!last_ref) + return; + + block_rsv = get_block_rsv(trans, root); + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + if (block_rsv->space_info != cache->space_info) + goto out; + + if (btrfs_header_generation(buf) == trans->transid) { + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, root, buf->start); + if (!ret) + goto pin; + } + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(root, cache, buf->start, buf->len, 1); + goto pin; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); + if (ret == -EAGAIN) { + /* block group became read-only */ + btrfs_update_reserved_bytes(cache, buf->len, 0, 1); + goto out; + } + + ret = 1; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + block_rsv->reserved += buf->len; + ret = 0; + } + spin_unlock(&block_rsv->lock); + + if (ret) { + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_reserved -= buf->len; + cache->space_info->reservation_progress++; + spin_unlock(&cache->space_info->lock); + } + goto out; + } +pin: + if (block_rsv->durable && !cache->ro) { + ret = 0; + spin_lock(&cache->lock); + if (!cache->ro) { + cache->reserved_pinned += buf->len; + ret = 1; + } + spin_unlock(&cache->lock); + + if (ret) { + spin_lock(&block_rsv->lock); + block_rsv->freed[trans->transid & 0x1] += buf->len; + spin_unlock(&block_rsv->lock); + } + } +out: + /* + * Deleting the buffer, clear the corrupt flag since it doesn't matter + * anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + btrfs_put_block_group(cache); +} + +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + /* + * tree log blocks never actually go into the extent allocation + * tree, just update pinning info and exit early. 
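btrfs_free_tree_block() above separates blocks created in the current transaction and never written (which can go straight back to free space) from older or already-written blocks (which must stay pinned until the commit so the tree that is still on disk remains readable). A sketch of that choice, assuming two simplified predicates:

#include <stdio.h>
#include <stdint.h>

enum disposal { RETURN_TO_FREE_SPACE, PIN_UNTIL_COMMIT };

struct buf_info {
	uint64_t generation;	/* transaction that allocated the block */
	int      written;	/* already written to disk in this transaction */
};

static enum disposal dispose(const struct buf_info *buf, uint64_t cur_transid)
{
	/* Blocks from older transactions, or already-written blocks, stay
	 * pinned so the committed tree can be read until the next commit. */
	if (buf->generation != cur_transid || buf->written)
		return PIN_UNTIL_COMMIT;
	return RETURN_TO_FREE_SPACE;
}

int main(void)
{
	struct buf_info fresh = { .generation = 100, .written = 0 };
	struct buf_info old   = { .generation = 99,  .written = 1 };

	printf("fresh block: %d, old block: %d\n",
	       dispose(&fresh, 100), dispose(&old, 100));
	return 0;
}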
+ */ + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + /* unlocks the pinned mutex */ + btrfs_pin_extent(root, bytenr, num_bytes, 1); + ret = 0; + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } + return ret; +} + +static u64 stripe_align(struct btrfs_root *root, u64 val) +{ + u64 mask = ((u64)root->stripesize - 1); + u64 ret = (val + mask) & ~mask; + return ret; +} + +/* + * when we wait for progress in the block group caching, its because + * our allocation attempt failed at least once. So, we must sleep + * and let some progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to + * show up, and then it will check the block group free space numbers + * for our min num_bytes. Another option is to have it go ahead + * and look in the rbtree for a free extent of a given size, but this + * is a good start. + */ +static noinline int +wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) +{ + struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache) || + (cache->free_space_ctl->free_space >= num_bytes)); + + put_caching_control(caching_ctl); + return 0; +} + +static noinline int +wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache)); + + put_caching_control(caching_ctl); + return 0; +} + +static int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + int index; + if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) + index = 0; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) + index = 1; + else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) + index = 2; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) + index = 3; + else + index = 4; + return index; +} + +enum btrfs_loop_type { + LOOP_FIND_IDEAL = 0, + LOOP_CACHING_NOWAIT = 1, + LOOP_CACHING_WAIT = 2, + LOOP_ALLOC_CHUNK = 3, + LOOP_NO_EMPTY_SIZE = 4, +}; + +/* + * walks the btree of allocated extents and find a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == block start + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == number of blocks + * Any available blocks before search_start are skipped. 
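stripe_align() uses the usual round-up-to-a-power-of-two idiom: add stripesize - 1, then clear the low bits. A worked example, assuming a hypothetical 64 KiB stripe size:

#include <stdio.h>
#include <stdint.h>

static uint64_t align_up(uint64_t val, uint64_t stripesize)
{
	uint64_t mask = stripesize - 1;	/* stripesize must be a power of two */

	return (val + mask) & ~mask;
}

int main(void)
{
	/* 64 KiB stripes: 100000 rounds up to 131072, 131072 stays put. */
	printf("%llu %llu\n",
	       (unsigned long long)align_up(100000, 65536),
	       (unsigned long long)align_up(131072, 65536));
	return 0;
}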
+ */ +static noinline int find_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *orig_root, + u64 num_bytes, u64 empty_size, + u64 search_start, u64 search_end, + u64 hint_byte, struct btrfs_key *ins, + u64 data) +{ + int ret = 0; + struct btrfs_root *root = orig_root->fs_info->extent_root; + struct btrfs_free_cluster *last_ptr = NULL; + struct btrfs_block_group_cache *block_group = NULL; + int empty_cluster = 2 * 1024 * 1024; + int allowed_chunk_alloc = 0; + int done_chunk_alloc = 0; + struct btrfs_space_info *space_info; + int last_ptr_loop = 0; + int loop = 0; + int index = 0; + bool found_uncached_bg = false; + bool failed_cluster_refill = false; + bool failed_alloc = false; + bool use_cluster = true; + u64 ideal_cache_percent = 0; + u64 ideal_cache_offset = 0; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->objectid = 0; + ins->offset = 0; + + space_info = __find_space_info(root->fs_info, data); + if (!space_info) { + printk(KERN_ERR "No space info for %llu\n", data); + return -ENOSPC; + } + + /* + * If the space info is for both data and metadata it means we have a + * small filesystem and we can't use the clustering stuff. + */ + if (btrfs_mixed_space_info(space_info)) + use_cluster = false; + + if (orig_root->ref_cows || empty_size) + allowed_chunk_alloc = 1; + + if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { + last_ptr = &root->fs_info->meta_alloc_cluster; + if (!btrfs_test_opt(root, SSD)) + empty_cluster = 64 * 1024; + } + + if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && + btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->data_alloc_cluster; + } + + if (last_ptr) { + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + spin_unlock(&last_ptr->lock); + } + + search_start = max(search_start, first_logical_byte(root, 0)); + search_start = max(search_start, hint_byte); + + if (!last_ptr) + empty_cluster = 0; + + if (search_start == hint_byte) { +ideal_cache: + block_group = btrfs_lookup_block_group(root->fs_info, + search_start); + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if its not cached. + * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. + */ + if (block_group && block_group_bits(block_group, data) && + (block_group->cached != BTRFS_CACHE_NO || + search_start == ideal_cache_offset)) { + down_read(&space_info->groups_sem); + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else { + index = get_block_group_index(block_group); + goto have_block_group; + } + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } +search: + down_read(&space_info->groups_sem); + list_for_each_entry(block_group, &space_info->block_groups[index], + list) { + u64 offset; + int cached; + + btrfs_get_block_group(block_group); + search_start = block_group->key.objectid; + + /* + * this can happen if we end up cycling through all the + * raid types, but we want to make sure we only allocate + * for the proper type. 
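The first check inside the loop that follows skips block groups that cannot satisfy a request for redundant storage: if the wanted profile includes any of the mirrored flags and the group provides none of them, the group is passed over. A minimal sketch of that mask test; the bit values are illustrative, not the real btrfs definitions:

#include <stdio.h>
#include <stdint.h>

#define PROFILE_DUP     (1ULL << 0)	/* illustrative bit assignments */
#define PROFILE_RAID1   (1ULL << 1)
#define PROFILE_RAID10  (1ULL << 2)

static int group_is_usable(uint64_t request_flags, uint64_t group_flags)
{
	uint64_t extra = PROFILE_DUP | PROFILE_RAID1 | PROFILE_RAID10;

	/* requested redundancy the group cannot provide: skip it */
	if ((request_flags & extra) && !(group_flags & extra))
		return 0;
	return 1;
}

int main(void)
{
	printf("%d %d\n",
	       group_is_usable(PROFILE_RAID1, 0),	/* 0: skip this group */
	       group_is_usable(0, PROFILE_RAID1));	/* 1: usable */
	return 0;
}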
+ */ + if (!block_group_bits(block_group, data)) { + u64 extra = BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + + /* + * if they asked for extra copies and this block group + * doesn't provide them, bail. This does allow us to + * fill raid0 from raid1. + */ + if ((data & extra) && !(block_group->flags & extra)) + goto loop; + } + +have_block_group: + if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + u64 free_percent; + + ret = cache_block_group(block_group, trans, + orig_root, 1); + if (block_group->cached == BTRFS_CACHE_FINISHED) + goto have_block_group; + + free_percent = btrfs_block_group_used(&block_group->item); + free_percent *= 100; + free_percent = div64_u64(free_percent, + block_group->key.offset); + free_percent = 100 - free_percent; + if (free_percent > ideal_cache_percent && + likely(!block_group->ro)) { + ideal_cache_offset = block_group->key.objectid; + ideal_cache_percent = free_percent; + } + + /* + * We only want to start kthread caching if we are at + * the point where we will wait for caching to make + * progress, or if our ideal search is over and we've + * found somebody to start caching. + */ + if (loop > LOOP_CACHING_NOWAIT || + (loop > LOOP_FIND_IDEAL && + atomic_read(&space_info->caching_threads) < 2)) { + ret = cache_block_group(block_group, trans, + orig_root, 0); + BUG_ON(ret); + } + found_uncached_bg = true; + + /* + * If loop is set for cached only, try the next block + * group. + */ + if (loop == LOOP_FIND_IDEAL) + goto loop; + } + + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) + found_uncached_bg = true; + + if (unlikely(block_group->ro)) + goto loop; + + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + + /* + * Ok we want to try and use the cluster allocator, so lets look + * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will + * have tried the cluster allocator plenty of times at this + * point and not have found anything, so we are likely way too + * fragmented for the clustering stuff to find anything, so lets + * just skip it and let the allocator find whatever block it can + * find + */ + if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + /* + * the refill lock keeps out other + * people trying to start a new cluster + */ + spin_lock(&last_ptr->refill_lock); + if (last_ptr->block_group && + (last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data))) { + offset = 0; + goto refill_cluster; + } + + offset = btrfs_alloc_from_cluster(block_group, last_ptr, + num_bytes, search_start); + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } + + spin_lock(&last_ptr->lock); + /* + * whoops, this cluster doesn't actually point to + * this block group. 
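The ideal-cache heuristic above works out how empty a block group is: used bytes times 100, divided by the group size, subtracted from 100. A quick arithmetic check with made-up sizes:

#include <stdio.h>
#include <stdint.h>

static uint64_t free_percent(uint64_t used, uint64_t size)
{
	return 100 - (used * 100 / size);	/* mirrors the div64_u64() math */
}

int main(void)
{
	/* A hypothetical 1 GiB block group with 256 MiB used is 75% free. */
	printf("%llu%% free\n",
	       (unsigned long long)free_percent(256ULL << 20, 1ULL << 30));
	return 0;
}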
Get a ref on the block + * group is does point to and try again + */ + if (!last_ptr_loop && last_ptr->block_group && + last_ptr->block_group != block_group) { + + btrfs_put_block_group(block_group); + block_group = last_ptr->block_group; + btrfs_get_block_group(block_group); + spin_unlock(&last_ptr->lock); + spin_unlock(&last_ptr->refill_lock); + + last_ptr_loop = 1; + search_start = block_group->key.objectid; + /* + * we know this block group is properly + * in the list because + * btrfs_remove_block_group, drops the + * cluster before it removes the block + * group from the list + */ + goto have_block_group; + } + spin_unlock(&last_ptr->lock); +refill_cluster: + /* + * this cluster didn't work out, free it and + * start over + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + last_ptr_loop = 0; + + /* allocate a cluster in this block group */ + ret = btrfs_find_space_cluster(trans, root, + block_group, last_ptr, + offset, num_bytes, + empty_cluster + empty_size); + if (ret == 0) { + /* + * now pull our allocation out of this + * cluster + */ + offset = btrfs_alloc_from_cluster(block_group, + last_ptr, num_bytes, + search_start); + if (offset) { + /* we found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } + } else if (!cached && loop > LOOP_CACHING_NOWAIT + && !failed_cluster_refill) { + spin_unlock(&last_ptr->refill_lock); + + failed_cluster_refill = true; + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; + } + + /* + * at this point we either didn't find a cluster + * or we weren't able to allocate a block from our + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + goto loop; + } + + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. 
+ */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(block_group, + num_bytes + empty_size); + failed_alloc = true; + goto have_block_group; + } else if (!offset) { + goto loop; + } +checks: + search_start = stripe_align(root, offset); + /* move on to the next group */ + if (search_start + num_bytes >= search_end) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + /* move on to the next group */ + if (search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + + ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, + (data & BTRFS_BLOCK_GROUP_DATA)); + if (ret == -EAGAIN) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } + + /* we are all good, lets return */ + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + btrfs_put_block_group(block_group); + break; +loop: + failed_cluster_refill = false; + failed_alloc = false; + BUG_ON(index != get_block_group_index(block_group)); + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); + + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + goto search; + + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for + * for them to make caching progress. Also + * determine the best possible bg to cache + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { + index = 0; + if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { + found_uncached_bg = false; + loop++; + if (!ideal_cache_percent && + atomic_read(&space_info->caching_threads)) + goto search; + + /* + * 1 of the following 2 things have happened so far + * + * 1) We found an ideal block group for caching that + * is mostly full and will cache quickly, so we might + * as well wait for it. + * + * 2) We searched for cached only and we didn't find + * anything, and we didn't start any caching kthreads + * either, so chances are we will loop through and + * start a couple caching kthreads, and then come back + * around and just wait for them. This will be slower + * because we will have 2 caching kthreads reading at + * the same time when we could have just started one + * and waited for it to get far enough to give us an + * allocation, so go ahead and go to the wait caching + * loop. + */ + loop = LOOP_CACHING_WAIT; + search_start = ideal_cache_offset; + ideal_cache_percent = 0; + goto ideal_cache; + } else if (loop == LOOP_FIND_IDEAL) { + /* + * Didn't find a uncached bg, wait on anything we find + * next. 
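The comment above lists the allocator's escalation order, from searching only cached block groups to finally dropping the empty-size requirement. A toy sketch of that staged fallback, reusing the same stage names and a stand-in try_alloc() that only succeeds once chunk allocation is permitted:

#include <stdio.h>

enum loop_type {
	LOOP_FIND_IDEAL,
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

/* Pretend allocation that only works once a new chunk may be allocated. */
static int try_alloc(enum loop_type stage)
{
	return stage >= LOOP_ALLOC_CHUNK;
}

int main(void)
{
	enum loop_type stage;

	for (stage = LOOP_FIND_IDEAL; stage <= LOOP_NO_EMPTY_SIZE; stage++) {
		if (try_alloc(stage)) {
			printf("allocated at stage %d\n", stage);
			return 0;
		}
		printf("stage %d failed, escalating\n", stage);
	}
	printf("ENOSPC\n");
	return 1;
}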
+ */ + loop = LOOP_CACHING_WAIT; + goto search; + } + + loop++; + + if (loop == LOOP_ALLOC_CHUNK) { + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, + CHUNK_ALLOC_LIMITED); + allowed_chunk_alloc = 0; + if (ret == 1) + done_chunk_alloc = 1; + } else if (!done_chunk_alloc && + space_info->force_alloc == + CHUNK_ALLOC_NO_FORCE) { + space_info->force_alloc = CHUNK_ALLOC_LIMITED; + } + + /* + * We didn't allocate a chunk, go ahead and drop the + * empty size and loop again. + */ + if (!done_chunk_alloc) + loop = LOOP_NO_EMPTY_SIZE; + } + + if (loop == LOOP_NO_EMPTY_SIZE) { + empty_size = 0; + empty_cluster = 0; + } + + goto search; + } else if (!ins->objectid) { + ret = -ENOSPC; + } else if (ins->objectid) { + ret = 0; + } + + return ret; +} + +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group_cache *cache; + int index = 0; + + spin_lock(&info->lock); + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - + info->bytes_pinned - info->bytes_reserved - + info->bytes_readonly), + (info->full) ? "" : "not "); + printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + "reserved=%llu, may_use=%llu, readonly=%llu\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_used, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_reserved, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_readonly); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; + + down_read(&info->groups_sem); +again: + list_for_each_entry(cache, &info->block_groups[index], list) { + spin_lock(&cache->lock); + printk(KERN_INFO "block group %llu has %llu bytes, %llu used " + "%llu pinned %llu reserved\n", + (unsigned long long)cache->key.objectid, + (unsigned long long)cache->key.offset, + (unsigned long long)btrfs_block_group_used(&cache->item), + (unsigned long long)cache->pinned, + (unsigned long long)cache->reserved); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; + up_read(&info->groups_sem); +} + +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + u64 search_start = 0; + + data = btrfs_get_alloc_profile(root, data); +again: + /* + * the only place that sets empty_size is btrfs_realloc_node, which + * is not called recursively on allocations + */ + if (empty_size || root->ref_cows) + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, data, + CHUNK_ALLOC_NO_FORCE); + + WARN_ON(num_bytes < root->sectorsize); + ret = find_free_extent(trans, root, num_bytes, empty_size, + search_start, search_end, hint_byte, + ins, data); + + if (ret == -ENOSPC && num_bytes > min_alloc_size) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, CHUNK_ALLOC_FORCE); + goto again; + } + if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + 
(unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes, 1); + } + + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); + + return ret; +} + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(root->fs_info, start); + if (!cache) { + printk(KERN_ERR "Unable to find block group for %llu\n", + (unsigned long long)start); + return -ENOSPC; + } + + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); + + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, 0, 1); + btrfs_put_block_group(cache); + + trace_btrfs_reserved_extent_free(root, start, len); + + return ret; +} + +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + int type; + u32 size; + + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + + size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, ref_mod); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_DATA); + + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { + struct btrfs_shared_data_ref *ref; + ref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); + } else { + struct btrfs_extent_data_ref *ref; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } + return ret; +} + +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins) +{ + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_tree_block_info *block_info; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, 
fs_info->extent_root, path, + ins, size); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, 1); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } + return ret; +} + +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins) +{ + int ret; + + BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + + ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, + 0, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL); + return ret; +} + +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + u64 start = ins->objectid; + u64 num_bytes = ins->offset; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + cache_block_group(block_group, trans, NULL, 0); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + BUG_ON(ret); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + + start = caching_ctl->progress; + num_bytes = ins->objectid + ins->offset - + caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } + + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + + ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); + BUG_ON(ret); + btrfs_put_block_group(block_group); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); + return ret; +} + +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 
bytenr, u32 blocksize, + int level) +{ + struct extent_buffer *buf; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return ERR_PTR(-ENOMEM); + btrfs_set_header_generation(buf, trans->transid); + btrfs_set_buffer_lockdep_class(buf, level); + btrfs_tree_lock(buf); + clean_tree_block(trans, root, buf); + + btrfs_set_lock_blocking(buf); + btrfs_set_buffer_uptodate(buf); + + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + /* + * we allow two log transactions at a time, use different + * EXENT bit to differentiate dirty pages. + */ + if (root->log_transid % 2 == 0) + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + else + set_extent_new(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } else { + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } + trans->blocks_used++; + /* this returns a buffer locked for blocking */ + return buf; +} + +static struct btrfs_block_rsv * +use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + int ret; + + block_rsv = get_block_rsv(trans, root); + + if (block_rsv->size == 0) { + ret = reserve_metadata_bytes(trans, root, block_rsv, + blocksize, 0); + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve. + */ + if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + return ERR_PTR(ret); + } else if (ret) { + return ERR_PTR(ret); + } + return block_rsv; + } + + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + if (ret) { + WARN_ON(1); + ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, + 0); + if (!ret) { + spin_lock(&block_rsv->lock); + block_rsv->size += blocksize; + spin_unlock(&block_rsv->lock); + return block_rsv; + } else if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + } + + return ERR_PTR(-ENOSPC); +} + +static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +{ + block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_release_bytes(block_rsv, NULL, 0); +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * + * returns the tree buffer or NULL. 
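use_block_rsv() above prefers the transaction's own reserve and falls back to the global reserve when that fails. A condensed sketch of the fallback order, with plain counters in place of the real block_rsv structures; the retry through reserve_metadata_bytes() is left out:

#include <stdio.h>
#include <stdint.h>

struct rsv { uint64_t reserved; };

static int rsv_use(struct rsv *rsv, uint64_t bytes)
{
	if (rsv->reserved < bytes)
		return -1;	/* stands in for -ENOSPC */
	rsv->reserved -= bytes;
	return 0;
}

/* Try the primary reserve first, then fall back to the global one. */
static struct rsv *pick_rsv(struct rsv *primary, struct rsv *global,
			    uint64_t blocksize)
{
	if (rsv_use(primary, blocksize) == 0)
		return primary;
	if (rsv_use(global, blocksize) == 0)
		return global;
	return NULL;
}

int main(void)
{
	struct rsv primary = { .reserved = 0 };
	struct rsv global  = { .reserved = 16384 };
	struct rsv *used = pick_rsv(&primary, &global, 4096);

	printf("used %s, global now %llu\n",
	       used == &global ? "global" : "primary",
	       (unsigned long long)global.reserved);
	return 0;
}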
+ */ +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size) +{ + struct btrfs_key ins; + struct btrfs_block_rsv *block_rsv; + struct extent_buffer *buf; + u64 flags = 0; + int ret; + + + block_rsv = use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); + + ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, + empty_size, hint, (u64)-1, &ins, 0); + if (ret) { + unuse_block_rsv(block_rsv, blocksize); + return ERR_PTR(ret); + } + + buf = btrfs_init_new_buffer(trans, root, ins.objectid, + blocksize, level); + BUG_ON(IS_ERR(buf)); + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ins.offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); + BUG_ON(ret); + } + return buf; +} + +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; + int reada_slot; + int reada_count; +}; + +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 + +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) +{ + u64 bytenr; + u64 generation; + u64 refs; + u64 flags; + u32 nritems; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *eb; + int ret; + int slot; + int nread = 0; + + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); + } else { + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(root)); + } + + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); + blocksize = btrfs_level_size(root, wc->level - 1); + + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + if (nread >= wc->reada_count) + break; + + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); + + if (slot == path->slots[wc->level]) + goto reada; + + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) + continue; + + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + + if (wc->stage == DROP_REFERENCE) { + if (refs == 1) + goto reada; + + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + 
continue; + } +reada: + ret = readahead_tree_block(root, bytenr, blocksize, + generation); + if (ret) + break; + nread++; + } + wc->reada_slot = slot; +} + +/* + * hepler to process tree block while walking down the tree. + * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int lookup_info) +{ + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; + int ret; + + if (wc->stage == UPDATE_BACKREF && + btrfs_header_owner(eb) != root->root_key.objectid) + return 1; + + /* + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + } + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; + + if (path->locks[level] && !wc->keep_locks) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; + } + + /* wc->stage == UPDATE_BACKREF */ + if (!(wc->flags[level] & flag)) { + BUG_ON(!path->locks[level]); + ret = btrfs_inc_ref(trans, root, eb, 1); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + ret = btrfs_set_disk_extent_flags(trans, root, eb->start, + eb->len, flag, 0); + BUG_ON(ret); + wc->flags[level] |= flag; + } + + /* + * the block is shared by multiple trees, so it's not good to + * keep the tree lock + */ + if (path->locks[level] && level > 0) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; +} + +/* + * hepler to process tree block pointer. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block pointed to. if the block + * is shared and we need update back refs for the subtree + * rooted at the block, this function changes wc->stage to + * UPDATE_BACKREF. if the block is shared and there is no + * need to update back, this function drops the reference + * to the block. + * + * NOTE: return value 1 means we should stop walking down. 
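reada_walk_down(), earlier in this hunk, adapts its readahead window: it shrinks the count by a third when the previous prefetch window has not been consumed yet and grows it by half otherwise, clamped between 2 and the number of pointers per node. A small sketch of that adjustment, with a made-up upper bound in place of BTRFS_NODEPTRS_PER_BLOCK():

#include <stdio.h>

#define MIN_READA	2
#define MAX_READA	493	/* hypothetical pointers-per-node upper bound */

static int adjust_reada(int count, int prev_window_unconsumed)
{
	if (prev_window_unconsumed) {
		count = count * 2 / 3;		/* shrink the window */
		return count < MIN_READA ? MIN_READA : count;
	}
	count = count * 3 / 2;			/* grow the window */
	return count > MAX_READA ? MAX_READA : count;
}

int main(void)
{
	printf("%d %d\n", adjust_reada(30, 1), adjust_reada(30, 0));
	return 0;
}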
+ */ +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int *lookup_info) +{ + u64 bytenr; + u64 generation; + u64 parent; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; + int ret = 0; + + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + /* + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) { + *lookup_info = 1; + return 1; + } + + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + blocksize = btrfs_level_size(root, level - 1); + + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next) { + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!next) + return -ENOMEM; + reada = 1; + } + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + *lookup_info = 0; + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level - 1] > 1) { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; + + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; + } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + } + + if (!btrfs_buffer_uptodate(next, generation)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + *lookup_info = 1; + } + + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(root, bytenr, blocksize, generation); + if (!next) + return -EIO; + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + } + + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = 1; + wc->level = level; + if (wc->level == 1) + wc->reada_slot = 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); + } + btrfs_tree_unlock(next); + free_extent_buffer(next); + *lookup_info = 1; + return 1; +} + +/* + * hepler to process tree block while walking up the tree. + * + * when wc->stage == DROP_REFERENCE, this function drops + * reference count on the block. + * + * when wc->stage == UPDATE_BACKREF, this function changes + * wc->stage back to DROP_REFERENCE if we changed wc->stage + * to UPDATE_BACKREF previously while processing the block. + * + * NOTE: return value 1 means we should stop walking up. 
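Taken together, walk_down_proc() and do_walk_down() implement the DROP_REFERENCE rule: a child whose reference count is still above one only loses a reference and is not descended into, while a child holding its last reference is walked and freed. A toy in-memory version of that rule, with an invented node type and a fan-out of two instead of a full btree node:

#include <stdio.h>
#include <stddef.h>

struct node {
	int refs;
	struct node *child[2];	/* toy fan-out of 2 */
};

/* Drop one reference; only walk into the subtree if we held the last one. */
static void drop_subtree(struct node *n)
{
	size_t i;

	if (!n)
		return;
	if (--n->refs > 0)
		return;		/* shared with another tree: stop here */
	for (i = 0; i < 2; i++)
		drop_subtree(n->child[i]);
	printf("freeing node %p\n", (void *)n);
}

int main(void)
{
	struct node shared = { .refs = 2 };
	struct node leaf   = { .refs = 1 };
	struct node root   = { .refs = 1, .child = { &shared, &leaf } };

	drop_subtree(&root);	/* frees root and leaf, leaves shared alone */
	return 0;
}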
+ */ +static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int ret; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 parent = 0; + + if (wc->stage == UPDATE_BACKREF) { + BUG_ON(wc->shared_level < level); + if (level < wc->shared_level) + goto out; + + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; + + wc->stage = DROP_REFERENCE; + wc->shared_level = -1; + path->slots[level] = 0; + + /* + * check reference count again if the block isn't locked. + * we should start walking down the tree again if reference + * count is one. + */ + if (!path->locks[level]) { + BUG_ON(level == 0); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + if (wc->refs[level] == 1) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + return 1; + } + } + } + + /* wc->stage == DROP_REFERENCE */ + BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + + if (wc->refs[level] == 1) { + if (level == 0) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = btrfs_dec_ref(trans, root, eb, 1); + else + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + } + /* make block locked assertion in clean_tree_block happy */ + if (!path->locks[level] && + btrfs_header_generation(eb) == trans->transid) { + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + } + clean_tree_block(trans, root, eb); + } + + if (eb == root->node) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = eb->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(eb)); + } else { + if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = path->nodes[level + 1]->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])); + } + + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); +out: + wc->refs[level] = 0; + wc->flags[level] = 0; + return 0; +} + +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int level = wc->level; + int lookup_info = 1; + int ret; + + while (level >= 0) { + ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret > 0) + break; + + if (level == 0) + break; + + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; + + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; + continue; + } else if (ret < 0) + return ret; + level = wc->level; + } + return 0; +} + +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int max_level) +{ + int level = wc->level; + int ret; + + path->slots[level] = btrfs_header_nritems(path->nodes[level]); + while (level < max_level && path->nodes[level]) { + wc->level = level; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + path->slots[level]++; + return 0; + } else { + ret = walk_up_proc(trans, root, path, wc); + if (ret > 0) + return 0; + + if (path->locks[level]) { + btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; + } + free_extent_buffer(path->nodes[level]); + path->nodes[level] = NULL; + 
level++; + } + } + return 1; +} + +/* + * drop a subvolume tree. + * + * this function traverses the tree freeing any blocks that only + * referenced by the tree. + * + * when a shared tree block is found. this function decreases its + * reference count by one. if update_ref is true, this function + * also make sure backrefs for the shared block and all lower level + * blocks are properly updated. + */ +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref) +{ + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; + struct btrfs_root_item *root_item = &root->root_item; + struct walk_control *wc; + struct btrfs_key key; + int err = 0; + int ret; + int level; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + BUG_ON(!wc); + + trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); + + if (block_rsv) + trans->block_rsv = block_rsv; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_header_level(root->node); + path->nodes[level] = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(path->nodes[level]); + path->slots[level] = 0; + path->locks[level] = 1; + memset(&wc->update_progress, 0, + sizeof(wc->update_progress)); + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + memcpy(&wc->update_progress, &key, + sizeof(wc->update_progress)); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + WARN_ON(ret > 0); + + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + btrfs_unlock_up_safe(path, 0); + + level = btrfs_header_level(root->node); + while (1) { + btrfs_tree_lock(path->nodes[level]); + btrfs_set_lock_blocking(path->nodes[level]); + + ret = btrfs_lookup_extent_info(trans, root, + path->nodes[level]->start, + path->nodes[level]->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + + if (level == root_item->drop_level) + break; + + btrfs_tree_unlock(path->nodes[level]); + WARN_ON(wc->refs[level] != 1); + level--; + } + } + + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { + err = ret; + break; + } + + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { + err = ret; + break; + } + + if (ret > 0) { + BUG_ON(wc->stage != DROP_REFERENCE); + break; + } + + if (wc->stage == DROP_REFERENCE) { + level = wc->level; + btrfs_node_key(path->nodes[level], + &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + } + + BUG_ON(wc->level == 0); + if (btrfs_should_end_transaction(trans, tree_root)) { + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + root_item); + BUG_ON(ret); + + btrfs_end_transaction_throttle(trans, tree_root); + trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); + if (block_rsv) + trans->block_rsv = block_rsv; + } + } + btrfs_release_path(path); + BUG_ON(err); + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + ret = 
btrfs_find_last_root(tree_root, root->root_key.objectid, + NULL, NULL); + BUG_ON(ret < 0); + if (ret > 0) { + /* if we fail to delete the orphan item this time + * around, it'll get picked up the next time. + * + * The most common failure here is just -ENOENT. + */ + btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); + } + } + + if (root->in_radix) { + btrfs_free_fs_root(tree_root->fs_info, root); + } else { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + } +out: + btrfs_end_transaction_throttle(trans, tree_root); + kfree(wc); + btrfs_free_path(path); + return err; +} + +/* + * drop subtree rooted at tree block 'node'. + * + * NOTE: this function will unlock and release tree block 'node' + */ +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_path *path; + struct walk_control *wc; + int level; + int parent_level; + int ret = 0; + int wret; + + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + wc = kzalloc(sizeof(*wc), GFP_NOFS); + if (!wc) { + btrfs_free_path(path); + return -ENOMEM; + } + + btrfs_assert_tree_locked(parent); + parent_level = btrfs_header_level(parent); + extent_buffer_get(parent); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); + + btrfs_assert_tree_locked(node); + level = btrfs_header_level(node); + path->nodes[level] = node; + path->slots[level] = 0; + path->locks[level] = 1; + + wc->refs[parent_level] = 1; + wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + + while (1) { + wret = walk_down_tree(trans, root, path, wc); + if (wret < 0) { + ret = wret; + break; + } + + wret = walk_up_tree(trans, root, path, wc, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + } + + kfree(wc); + btrfs_free_path(path); + return ret; +} + +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +{ + u64 num_devices; + u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. 
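+ * (e.g. a two device raid1 with one device missing should still be
+ * treated as a multi-device filesystem below).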
+ */ + num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; + + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + return flags; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* turn single device chunks into raid0 */ + return stripped | BTRFS_BLOCK_GROUP_RAID0; + } + return flags; +} + +static int set_block_group_ro(struct btrfs_block_group_cache *cache) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + int ret = -ENOSPC; + + if (cache->ro) + return 0; + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + + if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + + sinfo->bytes_may_use + sinfo->bytes_readonly + + cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + sinfo->bytes_reserved += cache->reserved_pinned; + cache->reserved_pinned = 0; + cache->ro = 1; + ret = 0; + } + + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + return ret; +} + +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) + +{ + struct btrfs_trans_handle *trans; + u64 alloc_flags; + int ret; + + BUG_ON(cache->ro); + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) + do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); + + ret = set_block_group_ro(cache); + if (!ret) + goto out; + alloc_flags = get_alloc_profile(root, cache->space_info->flags); + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + ret = set_block_group_ro(cache); +out: + btrfs_end_transaction(trans, root); + return ret; +} + +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); +} + +/* + * helper to account the unused space of all the readonly block group in the + * list. takes mirrors into account. 
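+ * (raid1, raid10 and dup groups keep two copies of every byte, so their
+ * unused space is counted twice to reflect raw disk usage).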
+ */ +static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +{ + struct btrfs_block_group_cache *block_group; + u64 free_bytes = 0; + int factor; + + list_for_each_entry(block_group, groups_list, list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) + factor = 2; + else + factor = 1; + + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; + + spin_unlock(&block_group->lock); + } + + return free_bytes; +} + +/* + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + int i; + u64 free_bytes = 0; + + spin_lock(&sinfo->lock); + + for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + if (!list_empty(&sinfo->block_groups[i])) + free_bytes += __btrfs_get_ro_block_group_free_space( + &sinfo->block_groups[i]); + + spin_unlock(&sinfo->lock); + + return free_bytes; +} + +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + cache->ro = 0; + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + return 0; +} + +/* + * checks to see if its even possible to relocate this block group. + * + * @return - -1 if it's not a good idea to relocate this block group, 0 if its + * ok to go ahead and try. + */ +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_device *device; + int full = 0; + int ret = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + + /* odd, couldn't find the block group, leave it alone */ + if (!block_group) + return -1; + + /* no bytes used, we're good */ + if (!btrfs_block_group_used(&block_group->item)) + goto out; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + + full = space_info->full; + + /* + * if this is the last block group we have in this space, we can't + * relocate it unless we're able to allocate a new chunk below. + * + * Otherwise, we need to make sure we have room in the space to handle + * all of the extents from this block group. If we can, we're good + */ + if ((space_info->total_bytes != block_group->key.offset) && + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + btrfs_block_group_used(&block_group->item) < + space_info->total_bytes)) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + /* + * ok we don't have enough space, but maybe we have free space on our + * devices to allocate new chunks for relocation, so loop through our + * alloc devices and guess if we have enough space. However, if we + * were marked as full, then we know there aren't enough chunks, and we + * can just return. 
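+ * (the loop below only checks whether some device has a free extent
+ * large enough to hold the used bytes of this block group).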
+ */ + ret = -1; + if (full) + goto out; + + mutex_lock(&root->fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + u64 min_free = btrfs_block_group_used(&block_group->item); + u64 dev_offset; + + /* + * check to make sure we can actually find a chunk with enough + * space to fit our block group in. + */ + if (device->total_bytes > device->bytes_used + min_free) { + ret = find_free_dev_extent(NULL, device, min_free, + &dev_offset, NULL); + if (!ret) + break; + ret = -1; + } + } + mutex_unlock(&root->fs_info->chunk_mutex); +out: + btrfs_put_block_group(block_group); + return ret; +} + +static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +{ + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = 0; + goto out; + } + path->slots[0]++; + } +out: + return ret; +} + +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + u64 last = 0; + + while (1) { + struct inode *inode; + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { + spin_lock(&block_group->lock); + if (block_group->iref) + break; + spin_unlock(&block_group->lock); + block_group = next_block_group(info->tree_root, + block_group); + } + if (!block_group) { + if (last == 0) + break; + last = 0; + continue; + } + + inode = block_group->inode; + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + last = block_group->key.objectid + block_group->key.offset; + btrfs_put_block_group(block_group); + } +} + +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + + down_write(&info->extent_commit_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + put_caching_control(caching_ctl); + } + up_write(&info->extent_commit_sem); + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_block_group_cache_done(block_group); + + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. 
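+ * Drop those excluded ranges here before the block group is freed.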
+ */ + if (block_group->cached == BTRFS_CACHE_NO) + free_excluded_extents(info->extent_root, block_group); + + btrfs_remove_free_space_cache(block_group); + btrfs_put_block_group(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + + /* now that all the block groups are freed, go through and + * free all the space_info structs. This is only called during + * the final stages of unmount, and so we know nobody is + * using them. We call synchronize_rcu() once before we start, + * just to be on the safe side. + */ + synchronize_rcu(); + + release_global_block_rsv(info); + + while(!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } + list_del(&space_info->list); + kfree(space_info); + } + return 0; +} + +static void __link_block_group(struct btrfs_space_info *space_info, + struct btrfs_block_group_cache *cache) +{ + int index = get_block_group_index(cache); + + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); +} + +int btrfs_read_block_groups(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int need_clear = 0; + u64 cache_gen; + + root = info->extent_root; + key.objectid = 0; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + if (cache_gen != 0 && + btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(root, CLEAR_CACHE)) + need_clear = 1; + if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) + printk(KERN_INFO "btrfs: disk space caching is enabled\n"); + + while (1) { + ret = find_first_block_group(root, path, &key); + if (ret > 0) + break; + if (ret != 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + ret = -ENOMEM; + goto error; + } + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + ret = -ENOMEM; + goto error; + } + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + cache->fs_info = info; + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + + if (need_clear) + cache->disk_cache_state = BTRFS_DC_CLEAR; + + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + memcpy(&cache->key, &found_key, sizeof(found_key)); + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(path); + cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + + btrfs_init_free_space_ctl(cache); + + /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. 
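+ * (exclude_super_stripes() also fills in cache->bytes_super, which is
+ * added to the space_info's readonly bytes below).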
+ */ + exclude_super_stripes(root, cache); + + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _alot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + free_excluded_extents(root, cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); + free_excluded_extents(root, cache); + } + + ret = update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + &space_info); + BUG_ON(ret); + cache->space_info = space_info; + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_readonly += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + + __link_block_group(space_info, cache); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + set_avail_alloc_bits(root->fs_info, cache->flags); + if (btrfs_chunk_readonly(root, cache->key.objectid)) + set_block_group_ro(cache); + } + + list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { + if (!(get_alloc_profile(root, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, &space_info->block_groups[3], list) + set_block_group_ro(cache); + list_for_each_entry(cache, &space_info->block_groups[4], list) + set_block_group_ro(cache); + } + + init_global_block_rsv(info); + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size) +{ + int ret; + struct btrfs_root *extent_root; + struct btrfs_block_group_cache *cache; + + extent_root = root->fs_info->extent_root; + + root->fs_info->last_trans_log_full_commit = trans->transid; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return -ENOMEM; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return -ENOMEM; + } + + cache->key.objectid = chunk_offset; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + + btrfs_init_free_space_ctl(cache); + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + exclude_super_stripes(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + + free_excluded_extents(root, cache); + + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + BUG_ON(ret); + + spin_lock(&cache->space_info->lock); 
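+ /* account the new group's super stripe bytes as readonly space */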
+ cache->space_info->bytes_readonly += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + + __link_block_group(cache->space_info, cache); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, + sizeof(cache->item)); + BUG_ON(ret); + + set_avail_alloc_bits(extent_root->fs_info, type); + + return 0; +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start) +{ + struct btrfs_path *path; + struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; + struct btrfs_root *tree_root = root->fs_info->tree_root; + struct btrfs_key key; + struct inode *inode; + int ret; + int factor; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(root->fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + /* + * Free the reserved super bytes from this block group before + * remove it. + */ + free_excluded_extents(root, block_group); + + memcpy(&key, &block_group->key, sizeof(key)); + if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + /* make sure this block group isn't part of an allocation cluster */ + cluster = &root->fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &root->fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + inode = lookup_free_space_inode(root, block_group, path); + if (!IS_ERR(inode)) { + btrfs_orphan_add(trans, inode); + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for our lookup ref */ + iput(inode); + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) + btrfs_release_path(path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + btrfs_release_path(path); + } + + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&block_group->cache_node, + &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); + + spin_lock(&block_group->space_info->lock); + block_group->space_info->total_bytes -= block_group->key.offset; + block_group->space_info->bytes_readonly -= block_group->key.offset; + block_group->space_info->disk_total -= block_group->key.offset * factor; + spin_unlock(&block_group->space_info->lock); + + 
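+ /* restore the block group key, it is used below to delete the
+ * block group item from the extent tree */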
memcpy(&key, &block_group->key, sizeof(key)); + + btrfs_clear_space_info_full(root->fs_info); + + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; + int ret; + + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return 1; + + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + if (ret) + goto out; + + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } +out: + return ret; +} + +int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + return unpin_extent_range(root, start, end); +} + +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) +{ + return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); +} + +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 group_trimmed; + u64 start; + u64 end; + u64 trimmed = 0; + int ret = 0; + + cache = btrfs_lookup_block_group(fs_info, range->start); + + while (cache) { + if (cache->key.objectid >= (range->start + range->len)) { + btrfs_put_block_group(cache); + break; + } + + start = max(range->start, cache->key.objectid); + end = min(range->start + range->len, + cache->key.objectid + cache->key.offset); + + if (end - start >= range->minlen) { + if (!block_group_cache_done(cache)) { + ret = cache_block_group(cache, NULL, root, 0); + if (!ret) + wait_block_group_cache_done(cache); + } + ret = btrfs_trim_block_group(cache, + &group_trimmed, + start, + end, + range->minlen); + + trimmed += group_trimmed; + if (ret) { + btrfs_put_block_group(cache); + break; + } + } + + cache = next_block_group(fs_info->tree_root, cache); + } + + range->len = trimmed; + return ret; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 00000000..7055d11c --- /dev/null +++ b/fs/btrfs/extent_io.c @@ -0,0 +1,3890 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent_io.h" +#include "extent_map.h" +#include "compat.h" +#include "ctree.h" +#include "btrfs_inode.h" + +static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; + +static LIST_HEAD(buffers); +static LIST_HEAD(states); + +#define LEAK_DEBUG 0 +#if LEAK_DEBUG +static DEFINE_SPINLOCK(leak_lock); +#endif + +#define BUFFER_LRU_MAX 64 + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +struct extent_page_data { + struct bio *bio; + struct extent_io_tree *tree; + get_extent_t *get_extent; + + /* 
tells writepage not to lock the state bits for this range + * it still does the unlocking + */ + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; +}; + +int __init extent_io_init(void) +{ + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_state_cache) + return -ENOMEM; + + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_buffer_cache) + goto free_state_cache; + return 0; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); + return -ENOMEM; +} + +void extent_io_exit(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); +} + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping) +{ + tree->state = RB_ROOT; + INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); + tree->ops = NULL; + tree->dirty_bytes = 0; + spin_lock_init(&tree->lock); + spin_lock_init(&tree->buffer_lock); + tree->mapping = mapping; +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; +#if LEAK_DEBUG + unsigned long flags; +#endif + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + state->private = 0; + state->tree = NULL; +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + return state; +} + +void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { +#if LEAK_DEBUG + unsigned long flags; +#endif + WARN_ON(state->tree); +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_state_cache, state); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct tree_entry, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, + 
struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset > entry->end) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_node *prev = NULL; + struct rb_node *ret; + + ret = __etree_search(tree, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, + struct extent_state *other) +{ + if (tree->ops && tree->ops->merge_extent_hook) + tree->ops->merge_extent_hook(tree->mapping->host, new, + other); +} + +/* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static int merge_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + return 0; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + merge_cb(tree, state, other); + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + merge_cb(tree, state, other); + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); + state = NULL; + } + } + + return 0; +} + +static int set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, int *bits) +{ + if (tree->ops && tree->ops->set_bit_hook) { + return tree->ops->set_bit_hook(tree->mapping->host, + state, bits); + } + + return 0; +} + +static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, int *bits) +{ + if (tree->ops && tree->ops->clear_bit_hook) + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); +} + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. 
This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, u64 start, u64 end, + int *bits) +{ + struct rb_node *node; + int bits_to_set = *bits & ~EXTENT_CTLBITS; + int ret; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", + (unsigned long long)end, + (unsigned long long)start); + WARN_ON(1); + } + state->start = start; + state->end = end; + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + + if (bits_to_set & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; + state->state |= bits_to_set; + node = tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk(KERN_ERR "btrfs found node %llu %llu on insert of " + "%llu %llu\n", (unsigned long long)found->start, + (unsigned long long)found->end, + (unsigned long long)start, (unsigned long long)end); + free_extent_state(state); + return -EEXIST; + } + state->tree = tree; + merge_state(tree, state); + return 0; +} + +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, + u64 split) +{ + if (tree->ops && tree->ops->split_extent_hook) + return tree->ops->split_extent_hook(tree->mapping->host, + orig, split); + return 0; +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + + split_cb(tree, orig, split); + + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + if (node) { + free_extent_state(prealloc); + return -EEXIST; + } + prealloc->tree = tree; + return 0; +} + +/* + * utility function to clear some bits in an extent state struct. + * it will optionally wake up any one waiting on this state (wake == 1), or + * forcibly remove the state from the tree (delete == 1). 
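+ * the return value is the subset of the requested bits that were
+ * actually set on the state.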
+ * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + int *bits, int wake) +{ + int bits_to_clear = *bits & ~EXTENT_CTLBITS; + int ret = state->state & bits_to_clear; + + if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); + state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (state->state == 0) { + if (state->tree) { + rb_erase(&state->rb_node, &tree->state); + state->tree = NULL; + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + } + return ret; +} + +static struct extent_state * +alloc_extent_state_atomic(struct extent_state *prealloc) +{ + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + + return prealloc; +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns < 0 on error, > 0 if any of the + * bits were already set, or zero if none of the bits were already set. + */ +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *cached; + struct extent_state *prealloc = NULL; + struct rb_node *next_node; + struct rb_node *node; + u64 last_end; + int err; + int set = 0; + int clear = 0; + + if (delete) + bits |= ~EXTENT_CTLBITS; + bits |= EXTENT_FIRST_DELALLOC; + + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + clear = 1; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + + if (clear) { + *cached_state = NULL; + cached_state = NULL; + } + + if (cached && cached->tree && cached->start == start) { + if (clear) + atomic_dec(&cached->refs); + state = cached; + goto hit_next; + } + if (clear) + free_extent_state(cached); + } + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. 
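+ *
+ * Each split consumes 'prealloc', so it is reallocated on the next pass
+ * through the loop when another split is needed.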
+ */ + + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set |= clear_state_bit(tree, state, &bits, wake); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + if (wake) + wake_up(&state->wq); + + set |= clear_state_bit(tree, prealloc, &bits, wake); + + prealloc = NULL; + goto out; + } + + if (state->end < end && prealloc && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + + set |= clear_state_bit(tree, state, &bits, wake); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start <= end && next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return set; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +static int wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); + return 0; +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. 
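+ * Each matching state is waited on in turn until none of the given bits
+ * remain set anywhere in the range.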
+ * The tree lock is taken by this function + */ +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +{ + struct extent_state *state; + struct rb_node *node; + + spin_lock(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } +out: + spin_unlock(&tree->lock); + return 0; +} + +static int set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int *bits) +{ + int ret; + int bits_to_set = *bits & ~EXTENT_CTLBITS; + + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + state->state |= bits_to_set; + + return 0; +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + if (cached_ptr && !(*cached_ptr)) { + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + *cached_ptr = state; + atomic_inc(&state->refs); + } + } +} + +static void uncache_state(struct extent_state **cached_ptr) +{ + if (cached_ptr && (*cached_ptr)) { + struct extent_state *state = *cached_ptr; + *cached_ptr = NULL; + free_extent_state(state); + } +} + +/* + * set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. + * + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. + * + * [start, end] is inclusive This takes the tree lock. + */ + +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + u64 last_start; + u64 last_end; + + bits |= EXTENT_FIRST_DELALLOC; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + BUG_ON(!prealloc); + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start == start && state->tree) { + node = &state->rb_node; + goto hit_next; + } + } + /* + * this search will find all the extents that end after + * our range starts. 
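+ * tree_search() returns the first state whose end is >= start, or NULL
+ * when no such state exists and a new one must be inserted.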
+ */ + node = tree_search(tree, start); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = insert_state(tree, prealloc, start, end, &bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + struct rb_node *next_node; + if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + + err = set_state_bits(tree, state, &bits); + if (err) + goto out; + + next_node = rb_next(node); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + if (next_node && start < end && prealloc && !need_resched()) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + err = set_state_bits(tree, state, &bits); + if (err) + goto out; + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. 
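+ * The extra reference taken below keeps 'prealloc' alive across
+ * insert_state() so it can still be passed to cache_state() afterwards.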
+ */ + atomic_inc(&prealloc->refs); + err = insert_state(tree, prealloc, start, this_end, + &bits); + BUG_ON(err == -EEXIST); + if (err) { + free_extent_state(prealloc); + prealloc = NULL; + goto out; + } + cache_state(prealloc, cached_state); + free_extent_state(prealloc); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + err = set_state_bits(tree, prealloc, &bits); + if (err) { + prealloc = NULL; + goto out; + } + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + NULL, mask); +} + +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, 0, NULL, + NULL, mask); +} + +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); +} + +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + 0, NULL, cached_state, mask); +} + +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); +} + +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + NULL, mask); +} + +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, + NULL, cached_state, mask); +} + +static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, mask); +} + +/* + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. 
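+ * On -EEXIST with a sleeping mask, wait for EXTENT_LOCKED to clear at the
+ * conflicting offset and retry from there.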
+ */ +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached_state, gfp_t mask) +{ + int err; + u64 failed_start; + while (1) { + err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else { + break; + } + WARN_ON(start > end); + } + return err; +} + +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + return lock_extent_bits(tree, start, end, 0, NULL, mask); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int err; + u64 failed_start; + + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, 1, 0, NULL, mask); + return 0; + } + return 1; +} + +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + mask); +} + +/* + * helper function to set both pages and extents in the tree writeback + */ +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); + page_cache_release(page); + index++; + } + return 0; +} + +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' + */ +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits) +{ + struct rb_node *node; + struct extent_state *state; + + /* + * this search will find all the extents that end after + * our range starts. 
+ */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) + return state; + + node = rb_next(node); + if (!node) + break; + } +out: + return NULL; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes, + struct extent_state **cached_state) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 found = 0; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) { + if (!found) + *end = (u64)-1; + goto out; + } + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) { + *start = state->start; + *cached_state = state; + atomic_inc(&state->refs); + } + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + if (!node) + break; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + } +out: + spin_unlock(&tree->lock); + return found; +} + +static noinline int __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while (nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) { + lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(pages[i]); + page_cache_release(pages[i]); + goto done; + } + } + 
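+ /* drop the reference taken by find_get_pages_contig(), the page
+ * itself stays locked */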
page_cache_release(pages[i]); + pages_locked++; + } + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; +done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, + u64 *start, u64 *end, + u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + struct extent_state *cached_state = NULL; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes, &cached_state); + if (!found || delalloc_end <= *start) { + *start = delalloc_start; + *end = delalloc_end; + free_extent_state(cached_state); + return found; + } + + /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* + * make sure to limit the number of pages we try to lock down + * if we're looping. + */ + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) + delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + free_extent_state(cached_state); + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ + lock_extent_bits(tree, delalloc_start, delalloc_end, + 0, &cached_state, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1, cached_state); + if (!ret) { + unlock_extent_cached(tree, delalloc_start, delalloc_end, + &cached_state, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + free_extent_state(cached_state); + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + unsigned long op) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int clear_bits = 0; + + if (op & EXTENT_CLEAR_UNLOCK) + clear_bits |= EXTENT_LOCKED; + if (op & EXTENT_CLEAR_DIRTY) + clear_bits |= EXTENT_DIRTY; + + if (op & EXTENT_CLEAR_DELALLOC) + clear_bits |= EXTENT_DELALLOC; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 
+ EXTENT_SET_PRIVATE2))) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + + if (op & EXTENT_SET_PRIVATE2) + SetPagePrivate2(pages[i]); + + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } + if (op & EXTENT_CLEAR_DIRTY) + clear_page_dirty_for_io(pages[i]); + if (op & EXTENT_SET_WRITEBACK) + set_page_writeback(pages[i]); + if (op & EXTENT_END_WRITEBACK) + end_page_writeback(pages[i]); + if (op & EXTENT_CLEAR_UNLOCK_PAGE) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + unsigned long bits, int contig) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + u64 last = 0; + int found = 0; + + if (search_end <= cur_start) { + WARN_ON(1); + return 0; + } + + spin_lock(&tree->lock); + if (cur_start == 0 && bits == EXTENT_DIRTY) { + total_bytes = tree->dirty_bytes; + goto out; + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > search_end) + break; + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = max(cur_start, state->start); + found = 1; + } + last = state->end; + } else if (contig && found) { + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return total_bytes; +} + +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing. + */ +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + spin_unlock(&tree->lock); + return ret; +} + +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if every extent in the tree + * has the bits set. 
Otherwise, 1 is returned if any bit in the + * range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled, struct extent_state *cached) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); + if (cached && cached->tree && cached->start == start) + node = &cached->rb_node; + else + node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + + if (state->end == (u64)-1) + break; + + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static int check_page_uptodate(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) + SetPageUptodate(page); + return 0; +} + +/* + * helper function to unlock a page if all the extents in the tree + * for that page are unlocked + */ +static int check_page_locked(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) + unlock_page(page); + return 0; +} + +/* + * helper function to end page writeback if all the extents + * in the tree for that page are done with writeback + */ +static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) +{ + end_page_writeback(page); + return 0; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. 
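+ *
+ * If a writepage_io_failed_hook is present it may claim a failed
+ * range, in which case the error handling below is skipped for it.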
+ */ +static void end_bio_extent_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = (err == 0); + continue; + } + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + + if (whole_page) + end_page_writeback(page); + else + check_page_writeback(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_readpage(struct bio *bio, int err) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + if (err) + uptodate = 0; + + do { + struct page *page = bvec->bv_page; + struct extent_state *cached = NULL; + struct extent_state *state; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (++bvec <= bvec_end) + prefetchw(&bvec->bv_page->flags); + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); + if (state && state->start == start) { + /* + * take a reference on the state, unlock will drop + * the ref + */ + cache_state(state, &cached); + } + spin_unlock(&tree->lock); + + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { + ret = tree->ops->readpage_end_io_hook(page, start, end, + state); + if (ret) + uptodate = 0; + } + if (!uptodate && tree->ops && + tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = + test_bit(BIO_UPTODATE, &bio->bi_flags); + if (err) + uptodate = 0; + uncache_state(&cached); + continue; + } + } + + if (uptodate) { + set_extent_uptodate(tree, start, end, &cached, + GFP_ATOMIC); + } + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); + + if (whole_page) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { + 
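+ /*
+ * this bio covered only part of the page: per-extent state
+ * decides uptodate, and check_page_locked unlocks the page
+ * once no locked extents remain for it
+ */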
if (uptodate) { + check_page_uptodate(tree, page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + check_page_locked(tree, page); + } + } while (bvec <= bvec_end); + + bio_put(bio); +} + +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +static int submit_one_bio(int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + int ret = 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + + bio->bi_private = NULL; + + bio_get(bio); + + if (tree->ops && tree->ops->submit_bio_hook) + ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num, bio_flags, start); + else + submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + bio_put(bio); + return ret; +} + +static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + unsigned long max_pages, + bio_end_io_t end_io_func, + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + int ret = 0; + struct bio *bio; + int nr; + int contig = 0; + int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (old_compressed) + contig = bio->bi_sector == sector; + else + contig = bio->bi_sector + (bio->bi_size >> 9) == + sector; + + if (prev_bio_flags != bio_flags || !contig || + (tree->ops && tree->ops->merge_bio_hook && + tree->ops->merge_bio_hook(page, offset, page_size, bio, + bio_flags)) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); + bio = NULL; + } else { + return 0; + } + } + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + + bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) + return -ENOMEM; + + bio_add_page(bio, page, page_size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + + if (bio_ret) + *bio_ret = bio; + else + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + + return ret; +} + +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } +} + +static void set_page_extent_head(struct page *page, unsigned long len) +{ + WARN_ON(!PagePrivate(page)); + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); +} + +/* + * basic readpage implementation. 
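Ordered extents covering the page are waited on first, and holes are zeroed without issuing reads.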
Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + */ +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags) +{ + struct inode *inode = page->mapping->host; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + struct btrfs_ordered_extent *ordered; + int ret; + int nr = 0; + size_t pg_offset = 0; + size_t iosize; + size_t disk_io_size; + size_t blocksize = inode->i_sb->s_blocksize; + unsigned long this_bio_flag = 0; + + set_page_extent_mapped(page); + + if (!PageUptodate(page)) { + if (cleancache_get_page(page) == 0) { + BUG_ON(blocksize != PAGE_SIZE); + goto out; + } + } + + end = page_end; + while (1) { + lock_extent(tree, start, end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + while (cur <= end) { + if (cur >= last_byte) { + char *userpage; + struct extent_state *cached = NULL; + + iosize = PAGE_CACHE_SIZE - pg_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + break; + } + em = get_extent(inode, page, pg_offset, cur, + end - cur + 1, 0); + if (IS_ERR_OR_NULL(em)) { + SetPageError(page); + unlock_extent(tree, cur, end, GFP_NOFS); + break; + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + this_bio_flag = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&this_bio_flag, + em->compress_type); + } + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + cur_end = min(extent_map_end(em) - 1, end); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } + bdev = em->bdev; + block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + char *userpage; + struct extent_state *cached = NULL; + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + + set_extent_uptodate(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); + cur = cur 
+ iosize; + pg_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, + EXTENT_UPTODATE, 1, NULL)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + pg_offset += iosize; + continue; + } + /* we have an inline extent but it didn't get marked up + * to date. Error out + */ + if (block_start == EXTENT_MAP_INLINE) { + SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + ret = 0; + if (tree->ops && tree->ops->readpage_io_hook) { + ret = tree->ops->readpage_io_hook(page, cur, + cur + iosize - 1); + } + if (!ret) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, + sector, disk_io_size, pg_offset, + bdev, bio, pnr, + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); + nr++; + *bio_flags = this_bio_flag; + } + if (ret) + SetPageError(page); + cur = cur + iosize; + pg_offset += iosize; + } +out: + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} + +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + &bio_flags); + if (bio) + ret = submit_one_bio(READ, bio, 0, bio_flags); + return ret; +} + +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. 
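Delalloc ranges are locked and passed to the fill_delalloc hook before the dirty extents are mapped and submitted.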
Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + struct extent_io_tree *tree = epd->tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 delalloc_start; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 iosize; + sector_t sector; + struct extent_state *cached_state = NULL; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t pg_offset = 0; + size_t blocksize; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + u64 nr_delalloc; + u64 delalloc_end; + int page_started; + int compressed; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC; + else + write_flags = WRITE; + + trace___extent_writepage(page, inode, wbc); + + WARN_ON(!PageLocked(page)); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage, KM_USER0); + flush_dcache_page(page); + } + pg_offset = 0; + + set_page_extent_mapped(page); + + delalloc_start = start; + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { + u64 delalloc_to_write = 0; + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + ret = 0; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. 
+ */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; + } + } + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); + unlock_page(page); + ret = 0; + goto done_unlocked; + } + } + + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); + + end = page_end; + if (last_byte <= start) { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); + goto done; + } + + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); + break; + } + em = epd->get_extent(inode, page, pg_offset, cur, + end - cur + 1, 1); + if (IS_ERR_OR_NULL(em)) { + SetPageError(page); + break; + } + + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + free_extent_map(em); + em = NULL; + + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + cur + iosize - 1, + NULL, 1); + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. this happens + * elsewhere + */ + nr++; + } + + cur += iosize; + pg_offset += iosize; + continue; + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, + EXTENT_DIRTY, 0, NULL)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); + } else { + ret = 0; + } + if (ret) { + SetPageError(page); + } else { + unsigned long max_nr = end_index + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + printk(KERN_ERR "btrfs warning page %lu not " + "writeback, cur %llu end %llu\n", + page->index, (unsigned long long)cur, + (unsigned long long)end); + } + + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0); + if (ret) + SetPageError(page); + } + cur = cur + iosize; + pg_offset += iosize; + nr++; + } +done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } + unlock_page(page); + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return 0; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 
+ * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +static int extent_write_cache_pages(struct extent_io_tree *tree, + struct address_space *mapping, + struct writeback_control *wbc, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) +{ + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } +retry: + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + if (tree->ops && tree->ops->write_cache_pages_lock_hook) + tree->ops->write_cache_pages_lock_hook(page); + else + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(page)) + flush_fn(data); + wait_on_page_writeback(page); + } + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret) + done = 1; + + /* + * the filesystem may choose to bump up nr_to_write. 
+ * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + return ret; +} + +static void flush_epd_write_bio(struct extent_page_data *epd) +{ + if (epd->bio) { + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); + epd->bio = NULL; + } +} + +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct address_space *mapping = page->mapping; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + struct writeback_control wbc_writepages = { + .sync_mode = wbc->sync_mode, + .older_than_this = NULL, + .nr_to_write = 64, + .range_start = page_offset(page) + PAGE_CACHE_SIZE, + .range_end = (loff_t)-1, + }; + + ret = __extent_writepage(page, wbc, &epd); + + extent_write_cache_pages(tree, mapping, &wbc_writepages, + __extent_writepage, &epd, flush_write_bio); + flush_epd_write_bio(&epd); + return ret; +} + +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode) +{ + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, + }; + struct writeback_control wbc_writepages = { + .sync_mode = mode, + .older_than_this = NULL, + .nr_to_write = nr_pages * 2, + .range_start = start, + .range_end = end + 1, + }; + + while (start <= end) { + page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + if (clear_page_dirty_for_io(page)) + ret = __extent_writepage(page, &wbc_writepages, &epd); + else { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + start + PAGE_CACHE_SIZE - 1, + NULL, 1); + unlock_page(page); + } + page_cache_release(page); + start += PAGE_CACHE_SIZE; + } + + flush_epd_write_bio(&epd); + return ret; +} + +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret = 0; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + + ret = extent_write_cache_pages(tree, mapping, wbc, + __extent_writepage, &epd, + flush_write_bio); + flush_epd_write_bio(&epd); + return ret; +} + +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent) +{ + struct bio *bio = NULL; + unsigned page_idx; + unsigned long bio_flags = 0; + + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + if (!add_to_page_cache_lru(page, mapping, 
+ page->index, GFP_NOFS)) { + __extent_read_full_page(tree, page, get_extent, + &bio, 0, &bio_flags); + } + page_cache_release(page); + } + BUG_ON(!list_empty(pages)); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return 0; +} + +/* + * basic invalidatepage code, this waits on any locked or writeback + * ranges corresponding to the page, and then deletes any extent state + * records from the tree + */ +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset) +{ + struct extent_state *cached_state = NULL; + u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 end = start + PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += (offset + blocksize - 1) & ~(blocksize - 1); + if (start > end) + return 0; + + lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); + wait_on_page_writeback(page); + clear_extent_bit(tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, + 1, 1, &cached_state, GFP_NOFS); + return 0; +} + +/* + * a helper for releasepage, this tests for areas of the page that + * are locked or under IO and drops the related state bits if it is safe + * to drop the page. + */ +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret = 1; + + if (test_range_bit(tree, start, end, + EXTENT_IOBITS, 0, NULL)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; + /* + * at this point we can safely clear everything except the + * locked bit and the nodatasum bit + */ + ret = clear_extent_bit(tree, start, end, + ~(EXTENT_LOCKED | EXTENT_NODATASUM), + 0, 0, NULL, mask); + + /* if clear_extent_bit failed for enomem reasons, + * we can't allow the release to continue. + */ + if (ret < 0) + ret = 0; + else + ret = 1; + } + return ret; +} + +/* + * a helper for releasepage. As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + struct extent_map *em; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + + if ((mask & __GFP_WAIT) && + page->mapping->host->i_size > 16 * 1024 * 1024) { + u64 len; + while (start <= end) { + len = end - start + 1; + write_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (IS_ERR_OR_NULL(em)) { + write_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { + write_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED | EXTENT_WRITEBACK, + 0, NULL)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); + write_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); + } + } + return try_release_extent_state(map, tree, page, mask); +} + +/* + * helper function for fiemap, which doesn't want to see any holes. 
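Hole extents returned by get_extent are freed and skipped over.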
+ * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) +{ + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + struct extent_map *em; + u64 len; + + if (offset >= last) + return NULL; + + while(1) { + len = last - offset; + if (len == 0) + break; + len = (len + sectorsize - 1) & ~(sectorsize - 1); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (IS_ERR_OR_NULL(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } + + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent) +{ + int ret = 0; + u64 off = start; + u64 max = start + len; + u32 flags = 0; + u32 found_type; + u64 last; + u64 last_for_get_extent = 0; + u64 disko = 0; + u64 isize = i_size_read(inode); + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_file_extent_item *item; + int end = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; + unsigned long emflags; + + if (len == 0) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + /* + * lookup the last file extent. We're not using i_size here + * because there might be preallocation past i_size + */ + ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, + path, btrfs_ino(inode), -1, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + WARN_ON(!ret); + path->slots[0]--; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + + /* No extents, but there might be delalloc bits */ + if (found_key.objectid != btrfs_ino(inode) || + found_type != BTRFS_EXTENT_DATA_KEY) { + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; + } + btrfs_free_path(path); + + /* + * we might have some extents allocated but more delalloc past those + * extents. so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, + &cached_state, GFP_NOFS); + + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + while (!end) { + u64 offset_in_extent; + + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; + + /* + * get_extent may return an extent that starts before our + * requested range. 
We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); + + /* + * record the offset from the start of the extent + * for adjusting the disk offset below + */ + offset_in_extent = em_start - em->start; + em_end = extent_map_end(em); + em_len = em_end - em_start; + emflags = em->flags; + disko = 0; + flags = 0; + + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + + if (em->block_start == EXTENT_MAP_LAST_BYTE) { + end = 1; + flags |= FIEMAP_EXTENT_LAST; + } else if (em->block_start == EXTENT_MAP_INLINE) { + flags |= (FIEMAP_EXTENT_DATA_INLINE | + FIEMAP_EXTENT_NOT_ALIGNED); + } else if (em->block_start == EXTENT_MAP_DELALLOC) { + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + } else { + disko = em->block_start + offset_in_extent; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + flags |= FIEMAP_EXTENT_ENCODED; + + free_extent_map(em); + em = NULL; + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + /* now scan forward to see if this is really the last extent. */ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + if (!em) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } +out_free: + free_extent_map(em); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + &cached_state, GFP_NOFS); + return ret; +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + struct page *p; + struct address_space *mapping; + + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + mapping = eb->first_page->mapping; + if (!mapping) + return NULL; + + /* + * extent_buffer_page is only called after pinning the page + * by increasing the reference count. So we know the page must + * be in the radix tree. + */ + rcu_read_lock(); + p = radix_tree_lookup(&mapping->page_tree, i); + rcu_read_unlock(); + + return p; +} + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, + unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb = NULL; +#if LEAK_DEBUG + unsigned long flags; +#endif + + eb = kmem_cache_zalloc(extent_buffer_cache, mask); + if (eb == NULL) + return NULL; + eb->start = start; + eb->len = len; + spin_lock_init(&eb->lock); + init_waitqueue_head(&eb->lock_wq); + +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&eb->leak_list, &buffers); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&eb->refs, 1); + + return eb; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#if LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_buffer_cache, eb); +} + +/* + * Helper for releasing extent buffer page. 
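+ * Drops the page cache reference of every page from start_idx to
+ * the end of the buffer.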
+ */ +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, + unsigned long start_idx) +{ + unsigned long index; + struct page *page; + + if (!eb->first_page) + return; + + index = num_extent_pages(eb->start, eb->len); + if (start_idx >= index) + return; + + do { + index--; + page = extent_buffer_page(eb, index); + if (page) + page_cache_release(page); + } while (index != start_idx); +} + +/* + * Helper for releasing the extent buffer. + */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) +{ + btrfs_release_extent_buffer_page(eb, 0); + __free_extent_buffer(eb); +} + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0) +{ + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct extent_buffer *exists = NULL; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 1; + int ret; + + rcu_read_lock(); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + mark_page_accessed(eb->first_page); + return eb; + } + rcu_read_unlock(); + + eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); + if (!eb) + return NULL; + + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + mark_page_accessed(page0); + set_page_extent_mapped(page0); + set_page_extent_head(page0, len); + uptodate = PageUptodate(page0); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); + if (!p) { + WARN_ON(1); + goto free_eb; + } + set_page_extent_mapped(p); + mark_page_accessed(p); + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + if (!PageUptodate(p)) + uptodate = 0; + + /* + * see below about how we avoid a nasty race with release page + * and why we unlock later + */ + if (i != 0) + unlock_page(p); + } + if (uptodate) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + + spin_lock(&tree->buffer_lock); + ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); + if (ret == -EEXIST) { + exists = radix_tree_lookup(&tree->buffer, + start >> PAGE_CACHE_SHIFT); + /* add one reference for the caller */ + atomic_inc(&exists->refs); + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); + goto free_eb; + } + /* add one reference for the tree */ + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); + + /* + * there is a race where release page may have + * tried to find this extent buffer in the radix + * but failed. It will tell the VM it is safe to + * reclaim the, and it will clear the page private bit. 
+ * We must make sure to set the page private bit properly + * after the extent buffer is in the radix tree so + * it doesn't get lost + */ + set_page_extent_mapped(eb->first_page); + set_page_extent_head(eb->first_page, eb->len); + if (!page0) + unlock_page(eb->first_page); + return eb; + +free_eb: + if (eb->first_page && !page0) + unlock_page(eb->first_page); + + if (!atomic_dec_and_test(&eb->refs)) + return exists; + btrfs_release_extent_buffer(eb); + return exists; +} + +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + mark_page_accessed(eb->first_page); + return eb; + } + rcu_read_unlock(); + + return NULL; +} + +void free_extent_buffer(struct extent_buffer *eb) +{ + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + WARN_ON(1); +} + +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + struct page *page; + + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageDirty(page)) + continue; + + lock_page(page); + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&page->mapping->tree_lock); + unlock_page(page); + } + return 0; +} + +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + int was_dirty = 0; + + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + return was_dirty; +} + +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state **cached_state) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (page) + ClearPageUptodate(page); + } + return 0; +} + +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + NULL, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} + +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct page *page; + int ret; + int pg_uptodate = 1; + int uptodate; + unsigned long index; + + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); + if (ret) + 
return 1; + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + page = find_get_page(tree->mapping, index); + uptodate = PageUptodate(page); + page_cache_release(page); + if (!uptodate) { + pg_uptodate = 0; + break; + } + start += PAGE_CACHE_SIZE; + } + return pg_uptodate; +} + +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state *cached_state) +{ + int ret = 0; + unsigned long num_pages; + unsigned long i; + struct page *page; + int pg_uptodate = 1; + + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, cached_state); + if (ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + pg_uptodate = 0; + break; + } + } + return pg_uptodate; +} + +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, + u64 start, int wait, + get_extent_t *get_extent, int mirror_num) +{ + unsigned long i; + unsigned long start_i; + struct page *page; + int err; + int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + int inc_all_pages = 0; + unsigned long num_pages; + struct bio *bio = NULL; + unsigned long bio_flags = 0; + + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL)) { + return 0; + } + + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!wait) { + if (!trylock_page(page)) + goto unlock_exit; + } else { + lock_page(page); + } + locked_pages++; + if (!PageUptodate(page)) + all_uptodate = 0; + } + if (all_uptodate) { + if (start_i == 0) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + goto unlock_exit; + } + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + + if (inc_all_pages) + page_cache_get(page); + if (!PageUptodate(page)) { + if (start_i == 0) + inc_all_pages = 1; + ClearPageError(page); + err = __extent_read_full_page(tree, page, + get_extent, &bio, + mirror_num, &bio_flags); + if (err) + ret = err; + } else { + unlock_page(page); + } + } + + if (bio) + submit_one_bio(READ, bio, mirror_num, bio_flags); + + if (ret || !wait) + return ret; + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + wait_on_page_locked(page); + if (!PageUptodate(page)) + ret = -EIO; + } + + if (!ret) + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + return ret; + +unlock_exit: + i = start_i; + while (locked_pages > 0) { + page = extent_buffer_page(eb, i); + i++; + unlock_page(page); + locked_pages--; + } + return ret; +} + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 
1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(dst, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER1); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} + +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset = start & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct page *p; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len - 1) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + } + + if (start + min_len > eb->len) { + printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", (unsigned long long)eb->start, + eb->len, start, min_len); + WARN_ON(1); + return -EINVAL; + } + + p = extent_buffer_page(eb, i); + kaddr = kmap_atomic(p, km); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + int err; + int save = 0; + if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, km); + eb->map_token = NULL; + save = 1; + } + err = map_private_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); + if (!err && save) { + eb->map_token = *token; + eb->kaddr = *map; + eb->map_start = *map_start; + eb->map_len = *map_len; + } + return err; +} + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + kunmap_atomic(token, km); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + ret = memcmp(ptr, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(kaddr + offset, src, cur); + 
kunmap_atomic(kaddr, KM_USER1); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, c, cur); + kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + } +} + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(dst, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + kunmap_atomic(kaddr, KM_USER0); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *p = dst_kaddr + dst_off + len; + char *s = src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + kunmap_atomic(src_kaddr, KM_USER1); + } + kunmap_atomic(dst_kaddr, KM_USER0); +} + +static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) +{ + unsigned long distance = (src > dst) ? 
src - dst : dst - src; + return distance < len; +} + +static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *src_kaddr; + + if (dst_page != src_page) { + src_kaddr = kmap_atomic(src_page, KM_USER1); + } else { + src_kaddr = dst_kaddr; + BUG_ON(areas_overlap(src_off, dst_off, len)); + } + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu dst len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu dst len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + + while (len > 0) { + dst_off_in_page = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + + copy_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + if (!areas_overlap(src_offset, dst_offset, len)) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while (len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = (start_offset + dst_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + move_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} + +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, 
rcu_head); + + btrfs_release_extent_buffer(eb); +} + +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +{ + u64 start = page_offset(page); + struct extent_buffer *eb; + int ret = 1; + + spin_lock(&tree->buffer_lock); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (!eb) { + spin_unlock(&tree->buffer_lock); + return ret; + } + + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + ret = 0; + goto out; + } + + /* + * set @eb->refs to 0 if it is already 1, and then release the @eb. + * Or go back. + */ + if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { + ret = 0; + goto out; + } + + radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); +out: + spin_unlock(&tree->buffer_lock); + + /* at this point we can safely release the extent buffer */ + if (atomic_read(&eb->refs) == 0) + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return ret; +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 00000000..a11a92ee --- /dev/null +++ b/fs/btrfs/extent_io.h @@ -0,0 +1,300 @@ +#ifndef __EXTENTIO__ +#define __EXTENTIO__ + +#include + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) +#define EXTENT_DEFRAG (1 << 6) +#define EXTENT_DEFRAG_DONE (1 << 7) +#define EXTENT_BUFFER_FILLED (1 << 8) +#define EXTENT_BOUNDARY (1 << 9) +#define EXTENT_NODATASUM (1 << 10) +#define EXTENT_DO_ACCOUNTING (1 << 11) +#define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) + +/* + * flags for bio submission. The high bits indicate the compression + * type for this bio + */ +#define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_FLAG_SHIFT 16 + +/* these are bit numbers for test/set bit */ +#define EXTENT_BUFFER_UPTODATE 0 +#define EXTENT_BUFFER_BLOCKING 1 +#define EXTENT_BUFFER_DIRTY 2 +#define EXTENT_BUFFER_CORRUPT 3 + +/* these are flags for extent_clear_unlock_delalloc */ +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 +#define EXTENT_CLEAR_UNLOCK 0x2 +#define EXTENT_CLEAR_DELALLOC 0x4 +#define EXTENT_CLEAR_DIRTY 0x8 +#define EXTENT_SET_WRITEBACK 0x10 +#define EXTENT_END_WRITEBACK 0x20 +#define EXTENT_SET_PRIVATE2 0x40 +#define EXTENT_CLEAR_ACCOUNTING 0x80 + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. 
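+ *
+ * (Roughly, as the helpers in extent_io.c appear to use these values:
+ * set_page_extent_mapped() tags an ordinary data page with
+ * EXTENT_PAGE_PRIVATE, while set_page_extent_head() tags the first page
+ * of an extent buffer with EXTENT_PAGE_PRIVATE_FIRST_PAGE.)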
+ */ +#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 + +struct extent_state; + +typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset); +struct extent_io_ops { + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + int (*writepage_io_hook)(struct page *page, u64 start, u64 end); + extent_submit_bio_hook_t *submit_bio_hook; + int (*merge_bio_hook)(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); + int (*readpage_io_hook)(struct page *page, u64 start, u64 end); + int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state); + int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); + int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); + int (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + int (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); + int (*write_cache_pages_lock_hook)(struct page *page); +}; + +struct extent_io_tree { + struct rb_root state; + struct radix_tree_root buffer; + struct address_space *mapping; + u64 dirty_bytes; + spinlock_t lock; + spinlock_t buffer_lock; + struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; + u64 split_start; + u64 split_end; + + /* for use by the FS */ + u64 private; + + struct list_head leak_list; +}; + +struct extent_buffer { + u64 start; + unsigned long len; + char *map_token; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + struct page *first_page; + unsigned long bflags; + struct list_head leak_list; + struct rcu_head rcu_head; + atomic_t refs; + + /* the spinlock is used to protect most operations */ + spinlock_t lock; + + /* + * when we keep the lock held while blocking, waiters go onto + * the wq + */ + wait_queue_head_t lock_wq; +}; + +static inline void extent_set_compress_type(unsigned long *bio_flags, + int compress_type) +{ + *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; +} + +static inline int extent_compress_type(unsigned long bio_flags) +{ + return bio_flags >> EXTENT_BIO_FLAG_SHIFT; +} + +struct extent_map_tree; + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t pg_offset, + u64 start, u64 len, + int create); + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping); +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + 
gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached, gfp_t mask); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent); +int __init extent_io_init(void); +void extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned long bits, int contig); + +void free_extent_state(struct extent_state *state); +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled, struct extent_state *cached_state); +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, struct extent_state **cached, + gfp_t mask); +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits); +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode); +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent); +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent); +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); +void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0); +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len); +void free_extent_buffer(struct extent_buffer *eb); +int 
read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait, + get_extent_t *get_extent, int mirror_num); + +static inline void extent_buffer_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->refs); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state **cached_state); +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb, + struct extent_state *cached_state); +int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + unsigned long op); +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags); +#endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 00000000..2d041034 --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,418 @@ +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent_map.h" + + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void extent_map_exit(void) +{ + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); +} + +/** + * extent_map_tree_init - initialize extent map tree + * @tree: tree to initialize + * + * Initialize the extent tree @tree. Should be called for each new inode + * or other user of the extent_map interface. 
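+ *
+ * A minimal caller sketch (illustrative; the embedding variable belongs to
+ * the caller and is not defined in this file):
+ *
+ *	struct extent_map_tree em_tree;
+ *
+ *	extent_map_tree_init(&em_tree);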
+ */ +void extent_map_tree_init(struct extent_map_tree *tree) +{ + tree->map = RB_ROOT; + rwlock_init(&tree->lock); +} + +/** + * alloc_extent_map - allocate new extent map structure + * + * Allocate a new extent_map structure. The new structure is + * returned with a reference count of one and needs to be + * freed using free_extent_map() + */ +struct extent_map *alloc_extent_map(void) +{ + struct extent_map *em; + em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); + if (!em) + return NULL; + em->in_tree = 0; + em->flags = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + atomic_set(&em->refs, 1); + return em; +} + +/** + * free_extent_map - drop reference count of an extent_map + * @em: extent map beeing releasead + * + * Drops the reference out on @em by one and free the structure + * if the reference count hits zero. + */ +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + WARN_ON(atomic_read(&em->refs) == 0); + if (atomic_dec_and_test(&em->refs)) { + WARN_ON(em->in_tree); + kmem_cache_free(extent_map_cache, em); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_map *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset >= extent_map_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct extent_map, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * search through the tree for an extent_map with a given offset. If + * it can't be found, try to find some neighboring extents + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= extent_map_end(entry)) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +/* check to see if two extent_map structs are adjacent and safe to merge */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + 
(next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *em; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(!em || em->start != start); + + if (!em) + goto out; + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } + + free_extent_map(em); +out: + write_unlock(&tree->lock); + return ret; + +} + +/** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in + * @em: map to insert + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was successful. + */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *exist; + + exist = lookup_extent_mapping(tree, em->start, em->len); + if (exist) { + free_extent_map(exist); + ret = -EEXIST; + goto out; + } + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; + goto out; + } + atomic_inc(&em->refs); + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } +out: + return ret; +} + +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +/** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. 
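+ *
+ * A minimal caller sketch (illustrative only; "em_tree", "start" and "len"
+ * stand for the caller's own tree and byte range):
+ *
+ *	read_lock(&em_tree->lock);
+ *	em = lookup_extent_mapping(em_tree, start, len);
+ *	read_unlock(&em_tree->lock);
+ *	if (em) {
+ *		... use em->block_start up to extent_map_end(em) ...
+ *		free_extent_map(em);
+ *	}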
+ */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_CAST(rb_node); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} + +/** + * search_extent_mapping - find a nearby extent map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. + * + * If one can't be found, any nearby extent may be returned + */ +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_CAST(rb_node); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} + +/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed + * + * Removes @em from @tree. 
No reference counts are dropped, and no checks + * are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 00000000..33a7890b --- /dev/null +++ b/fs/btrfs/extent_map.h @@ -0,0 +1,66 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include + +#define EXTENT_MAP_LAST_BYTE (u64)-4 +#define EXTENT_MAP_HOLE (u64)-3 +#define EXTENT_MAP_INLINE (u64)-2 +#define EXTENT_MAP_DELALLOC (u64)-1 + +/* bits for the flags field */ +#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 +#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ +#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ + +struct extent_map { + struct rb_node rb_node; + + /* all of these are in bytes */ + u64 start; + u64 len; + u64 orig_start; + u64 block_start; + u64 block_len; + unsigned long flags; + struct block_device *bdev; + atomic_t refs; + unsigned int in_tree:1; + unsigned int compress_type:4; +}; + +struct extent_map_tree { + struct rb_root map; + rwlock_t lock; +}; + +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return (u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + +void extent_map_tree_init(struct extent_map_tree *tree); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); + +struct extent_map *alloc_extent_map(void); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void extent_map_exit(void); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 00000000..90d4ee52 --- /dev/null +++ b/fs/btrfs/file-item.c @@ -0,0 +1,869 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" + +#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) * 2) / \ + size) - 1)) + +#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ + sizeof(struct btrfs_ordered_sum)) / \ + sizeof(struct btrfs_sector_sum) * \ + (r)->sectorsize - (r)->sectorsize) + +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) +{ + int ret = 0; + struct btrfs_file_extent_item *item; + struct btrfs_key file_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + file_key.objectid = objectid; + file_key.offset = pos; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + sizeof(*item)); + if (ret < 0) + goto out; + BUG_ON(ret); + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) +{ + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_csum_item *item; + struct extent_buffer *leaf; + u64 csum_offset = 0; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int csums_in_item; + + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); + if (ret < 0) + goto fail; + leaf = path->nodes[0]; + if (ret > 0) { + ret = 1; + if (path->slots[0] == 0) + goto fail; + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + goto fail; + + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset >= csums_in_item) { + ret = -EFBIG; + goto fail; + } + } + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + return item; +fail: + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); +} + + +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 offset, int mod) +{ + int ret; + struct btrfs_key file_key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + + file_key.objectid = objectid; + file_key.offset = offset; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); + return ret; +} + + +static int __btrfs_lookup_bio_sums(struct btrfs_root *root, + struct inode *inode, struct bio *bio, + u64 logical_offset, u32 *dst, int dio) +{ + u32 sum; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + u64 offset = 0; + u64 item_start_offset = 0; + u64 item_last_offset = 0; + u64 disk_bytenr; + u32 diff; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int ret; + struct btrfs_path *path; + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + if (bio->bi_size > PAGE_CACHE_SIZE * 8) + path->reada = 2; + + WARN_ON(bio->bi_vcnt <= 0); + + disk_bytenr = (u64)bio->bi_sector << 9; + if (dio) + offset = logical_offset; + while (bio_index < bio->bi_vcnt) { + if (!dio) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); + if (ret == 0) + goto found; + + if (!item || disk_bytenr < item_start_offset || + disk_bytenr >= item_last_offset) { + struct btrfs_key found_key; + u32 item_size; + + if (item) + btrfs_release_path(path); + item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, + path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + if (ret == -ENOENT || ret == -EFBIG) + ret = 0; + sum = 0; + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_extent_bits(io_tree, offset, + offset + bvec->bv_len - 1, + EXTENT_NODATASUM, GFP_NOFS); + } else { + printk(KERN_INFO "btrfs no csum found " + "for inode %llu start %llu\n", + (unsigned long long) + btrfs_ino(inode), + (unsigned long long)offset); + } + item = NULL; + btrfs_release_path(path); + goto found; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + item_start_offset = found_key.offset; + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + item_last_offset = item_start_offset + + (item_size / csum_size) * + root->sectorsize; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + } + /* + * this byte range must be able to fit inside + * a single leaf so it will also fit inside a u32 + */ + diff = disk_bytenr - item_start_offset; + diff = diff / root->sectorsize; + diff = diff * csum_size; + + read_extent_buffer(path->nodes[0], &sum, + ((unsigned long)item) + diff, + csum_size); +found: + if (dst) + *dst++ = sum; + else + set_state_private(io_tree, offset, sum); + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bio_index++; + bvec++; + } + btrfs_free_path(path); + return 0; +} + +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); +} + +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 offset, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); +} + +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_csum_item *item; + unsigned long 
offset; + int ret; + size_t size; + u64 csum_end; + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + if (search_commit) { + path->skip_locking = 1; + path->reada = 2; + path->search_commit_root = 1; + } + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + if (offset * csum_size < + btrfs_item_size_nr(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) + break; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + size = btrfs_item_size_nr(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * root->sectorsize; + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + size = min_t(size_t, csum_end - start, + MAX_ORDERED_SUM_BYTES(root)); + sums = kzalloc(btrfs_ordered_sum_size(root, size), + GFP_NOFS); + BUG_ON(!sums); + + sector_sum = sums->sums; + sums->bytenr = start; + sums->len = size; + + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + offset *= csum_size; + + while (size > 0) { + read_extent_buffer(path->nodes[0], + §or_sum->sum, + ((unsigned long)item) + + offset, csum_size); + sector_sum->bytenr = start; + + size -= root->sectorsize; + start += root->sectorsize; + offset += csum_size; + sector_sum++; + } + list_add_tail(&sums->list, list); + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + char *data; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + u64 offset; + u64 disk_bytenr; + + WARN_ON(bio->bi_vcnt <= 0); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + if (!sums) + return -ENOMEM; + + sector_sum = sums->sums; + disk_bytenr = (u64)bio->bi_sector << 9; + sums->len = bio->bi_size; + INIT_LIST_HEAD(&sums->list); + + if (contig) + offset = file_start; + else + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + + while (bio_index < bio->bi_vcnt) { + if (!contig) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + if (!contig && (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset)) { + unsigned long bytes_left; + 
sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + bytes_left = bio->bi_size - total_bytes; + + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), + GFP_NOFS); + BUG_ON(!sums); + sector_sum = sums->sums; + sums->len = bytes_left; + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + } + + data = kmap_atomic(bvec->bv_page, KM_USER0); + sector_sum->sum = ~(u32)0; + sector_sum->sum = btrfs_csum_data(root, + data + bvec->bv_offset, + sector_sum->sum, + bvec->bv_len); + kunmap_atomic(data, KM_USER0); + btrfs_csum_final(sector_sum->sum, + (char *)§or_sum->sum); + sector_sum->bytenr = disk_bytenr; + + sector_sum++; + bio_index++; + total_bytes += bvec->bv_len; + this_sum_bytes += bvec->bv_len; + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * helper function for csum removal, this expects the + * key to describe the csum pointed to by the path, and it expects + * the csum to overlap the range [bytenr, len] + * + * The csum should not be entirely contained in the range and the + * range should not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the + * overlap, and fixes up the key as required. + */ +static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) +{ + struct extent_buffer *leaf; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + u64 csum_end; + u64 end_byte = bytenr + len; + u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; + int ret; + + leaf = path->nodes[0]; + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= root->fs_info->sb->s_blocksize_bits; + csum_end += key->offset; + + if (key->offset < bytenr && csum_end <= end_byte) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * A simple truncate off the end of the item + */ + u32 new_size = (bytenr - key->offset) >> blocksize_bits; + new_size *= csum_size; + ret = btrfs_truncate_item(trans, root, path, new_size, 1); + } else if (key->offset >= bytenr && csum_end > end_byte && + end_byte > key->offset) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * we need to truncate from the beginning of the csum + */ + u32 new_size = (csum_end - end_byte) >> blocksize_bits; + new_size *= csum_size; + + ret = btrfs_truncate_item(trans, root, path, new_size, 0); + + key->offset = end_byte; + ret = btrfs_set_item_key_safe(trans, root, path, key); + BUG_ON(ret); + } else { + BUG(); + } + return 0; +} + +/* + * deletes the csum items from the csum tree for a given + * range of bytes. 
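+ *
+ * Roughly three cases are handled for each csum item that overlaps
+ * [bytenr, bytenr + len):
+ *   - the item lies entirely inside the range: it is deleted outright;
+ *   - the range sits in the middle of the item: the covered csum bytes are
+ *     zeroed and the item is split with btrfs_split_item(), then the loop
+ *     comes around and truncates the now well-formed pieces;
+ *   - the item overlaps only one end of the range: truncate_one_csum()
+ *     shrinks it from the appropriate side.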
+ */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; + int ret; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int blocksize_bits = root->fs_info->sb->s_blocksize_bits; + + root = root->fs_info->csum_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = end_byte - 1; + key.type = BTRFS_EXTENT_CSUM_KEY; + + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + break; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) { + break; + } + + if (key.offset >= end_byte) + break; + + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + + /* this csum ends before we start, we're done */ + if (csum_end <= bytenr) + break; + + /* delete the entire item, it is inside our range */ + if (key.offset >= bytenr && csum_end <= end_byte) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { + unsigned long offset; + unsigned long shift_len; + unsigned long item_offset; + /* + * [ bytenr - len ] + * [csum ] + * + * Our bytes are in the middle of the csum, + * we need to split this item and insert a new one. + * + * But we can't drop the path because the + * csum could change, get removed, extended etc. + * + * The trick here is the max size of a csum item leaves + * enough room in the tree block for a single + * item header. So, we split the item in place, + * adding a new header pointing to the existing + * bytes. Then we loop around again and we have + * a nicely formed csum item that we can neatly + * truncate. 
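+ *
+ * As a worked example (illustrative numbers only): with 4K blocks and
+ * 4-byte crc32c checksums, this zeroes shift_len = (len >> 12) * 4 bytes
+ * of csum data, starting offset = ((bytenr - key.offset) >> 12) * 4 bytes
+ * into the item, and then splits the item at that offset.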
+ */ + offset = (bytenr - key.offset) >> blocksize_bits; + offset *= csum_size; + + shift_len = (len >> blocksize_bits) * csum_size; + + item_offset = btrfs_item_ptr_offset(leaf, + path->slots[0]); + + memset_extent_buffer(leaf, 0, item_offset + offset, + shift_len); + key.offset = bytenr; + + /* + * btrfs_split_item returns -EAGAIN when the + * item changed size or key + */ + ret = btrfs_split_item(trans, root, path, &key, offset); + BUG_ON(ret && ret != -EAGAIN); + + key.offset = end_byte - 1; + } else { + ret = truncate_one_csum(trans, root, path, + &key, bytenr, len); + BUG_ON(ret); + if (key.offset < bytenr) + break; + } + btrfs_release_path(path); + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +{ + u64 bytenr; + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + u64 next_offset; + u64 total_bytes = 0; + int found_next; + struct btrfs_path *path; + struct btrfs_csum_item *item; + struct btrfs_csum_item *item_end; + struct extent_buffer *leaf = NULL; + u64 csum_offset; + struct btrfs_sector_sum *sector_sum; + u32 nritems; + u32 ins_size; + char *eb_map; + char *eb_token; + unsigned long map_len; + unsigned long map_start; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + sector_sum = sums->sums; +again: + next_offset = (u64)-1; + found_next = 0; + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = sector_sum->bytenr; + bytenr = sector_sum->bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + + item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + if (!IS_ERR(item)) { + leaf = path->nodes[0]; + ret = 0; + goto found; + } + ret = PTR_ERR(item); + if (ret != -EFBIG && ret != -ENOENT) + goto fail_unlock; + + if (ret == -EFBIG) { + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(root, csum_size)) { + /* already at max size, make a new one */ + goto insert; + } + } else { + int slot = path->slots[0] + 1; + /* we didn't find a csum item, insert one */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret == 1) + found_next = 1; + if (ret != 0) + goto insert; + slot = 0; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) { + found_next = 1; + goto insert; + } + next_offset = found_key.offset; + found_next = 1; + goto insert; + } + + /* + * at this point, we know the tree has an item, but it isn't big + * enough yet to put our csum in. 
Grow it + */ + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &file_key, path, + csum_size, 1); + if (ret < 0) + goto fail_unlock; + + if (ret > 0) { + if (path->slots[0] == 0) + goto insert; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { + goto insert; + } + + if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + csum_size) { + u32 diff = (csum_offset + 1) * csum_size; + + /* + * is the item big enough already? we dropped our lock + * before and need to recheck + */ + if (diff < btrfs_item_size_nr(leaf, path->slots[0])) + goto csum; + + diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + if (diff != csum_size) + goto insert; + + ret = btrfs_extend_item(trans, root, path, diff); + goto csum; + } + +insert: + btrfs_release_path(path); + csum_offset = 0; + if (found_next) { + u64 tmp = total_bytes + root->sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while (tmp < sums->len) { + if (next_sector + root->sectorsize != next->bytenr) + break; + tmp += root->sectorsize; + next_sector = next->bytenr; + next++; + } + tmp = min(tmp, next_offset - file_key.offset); + tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = max((u64)1, tmp); + tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); + ins_size = csum_size * tmp; + } else { + ins_size = csum_size; + } + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + ins_size); + path->leave_spinning = 0; + if (ret < 0) + goto fail_unlock; + if (ret != 0) { + WARN_ON(1); + goto fail_unlock; + } +csum: + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + ret = 0; + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); +found: + item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); + eb_token = NULL; +next_sector: + + if (!eb_token || + (unsigned long)item + csum_size >= map_start + map_len) { + int err; + + if (eb_token) + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + err = map_private_extent_buffer(leaf, (unsigned long)item, + csum_size, + &eb_token, &eb_map, + &map_start, &map_len, KM_USER1); + if (err) + eb_token = NULL; + } + if (eb_token) { + memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), + §or_sum->sum, csum_size); + } else { + write_extent_buffer(leaf, §or_sum->sum, + (unsigned long)item, csum_size); + } + + total_bytes += root->sectorsize; + sector_sum++; + if (total_bytes < sums->len) { + item = (struct btrfs_csum_item *)((char *)item + + csum_size); + if (item < item_end && bytenr + PAGE_CACHE_SIZE == + sector_sum->bytenr) { + bytenr = sector_sum->bytenr; + goto next_sector; + } + } + if (eb_token) { + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + } + btrfs_mark_buffer_dirty(path->nodes[0]); + if (total_bytes < sums->len) { + btrfs_release_path(path); + cond_resched(); + goto again; + } +out: + btrfs_free_path(path); + return ret; + +fail_unlock: + goto out; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 
00000000..fa4ef18b --- /dev/null +++ b/fs/btrfs/file.c @@ -0,0 +1,1683 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "tree-log.h" +#include "locking.h" +#include "compat.h" + +/* + * when auto defrag is enabled we + * queue up these defrag structs to remember which + * inodes need defragging passes + */ +struct inode_defrag { + struct rb_node rb_node; + /* objectid */ + u64 ino; + /* + * transid where the defrag was added, we search for + * extents newer than this + */ + u64 transid; + + /* root objectid */ + u64 root; + + /* last offset we were able to defrag */ + u64 last_offset; + + /* if we've wrapped around back to zero once already */ + int cycled; +}; + +/* pop a record for an inode into the defrag tree. The lock + * must be held already + * + * If you're inserting a record for an older transid than an + * existing record, the transid already in the tree is lowered + * + * If an existing record is found the defrag item you + * pass in is freed + */ +static int __btrfs_add_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode_defrag *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + + p = &root->fs_info->defrag_inodes.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + if (defrag->ino < entry->ino) + p = &parent->rb_left; + else if (defrag->ino > entry->ino) + p = &parent->rb_right; + else { + /* if we're reinserting an entry for + * an old defrag run, make sure to + * lower the transid of our existing record + */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + if (defrag->last_offset > entry->last_offset) + entry->last_offset = defrag->last_offset; + goto exists; + } + } + BTRFS_I(inode)->in_defrag = 1; + rb_link_node(&defrag->rb_node, parent, p); + rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); + return 0; + +exists: + kfree(defrag); + return 0; + +} + +/* + * insert a defrag record for this inode if auto defrag is + * enabled + */ +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode_defrag *defrag; + int ret = 0; + u64 transid; + + if (!btrfs_test_opt(root, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(root->fs_info)) + return 0; + + if (BTRFS_I(inode)->in_defrag) + return 0; + + if (trans) + transid = trans->transid; + else + transid = BTRFS_I(inode)->root->last_trans; + + defrag = kzalloc(sizeof(*defrag), GFP_NOFS); + if (!defrag) + 
return -ENOMEM; + + defrag->ino = btrfs_ino(inode); + defrag->transid = transid; + defrag->root = root->root_key.objectid; + + spin_lock(&root->fs_info->defrag_inodes_lock); + if (!BTRFS_I(inode)->in_defrag) + ret = __btrfs_add_inode_defrag(inode, defrag); + spin_unlock(&root->fs_info->defrag_inodes_lock); + return ret; +} + +/* + * must be called with the defrag_inodes lock held + */ +struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, + struct rb_node **next) +{ + struct inode_defrag *entry = NULL; + struct rb_node *p; + struct rb_node *parent = NULL; + + p = info->defrag_inodes.rb_node; + while (p) { + parent = p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + if (ino < entry->ino) + p = parent->rb_left; + else if (ino > entry->ino) + p = parent->rb_right; + else + return entry; + } + + if (next) { + while (parent && ino > entry->ino) { + parent = rb_next(parent); + entry = rb_entry(parent, struct inode_defrag, rb_node); + } + *next = parent; + } + return NULL; +} + +/* + * run through the list of inodes in the FS that need + * defragging + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + struct btrfs_root *inode_root; + struct inode *inode; + struct rb_node *n; + struct btrfs_key key; + struct btrfs_ioctl_defrag_range_args range; + u64 first_ino = 0; + int num_defrag; + int defrag_batch = 1024; + + memset(&range, 0, sizeof(range)); + range.len = (u64)-1; + + atomic_inc(&fs_info->defrag_running); + spin_lock(&fs_info->defrag_inodes_lock); + while(1) { + n = NULL; + + /* find an inode to defrag */ + defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); + if (!defrag) { + if (n) + defrag = rb_entry(n, struct inode_defrag, rb_node); + else if (first_ino) { + first_ino = 0; + continue; + } else { + break; + } + } + + /* remove it from the rbtree */ + first_ino = defrag->ino + 1; + rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); + + if (btrfs_fs_closing(fs_info)) + goto next_free; + + spin_unlock(&fs_info->defrag_inodes_lock); + + /* get the inode */ + key.objectid = defrag->root; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(inode_root)) + goto next; + + key.objectid = defrag->ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + if (IS_ERR(inode)) + goto next; + + /* do a chunk of defrag */ + BTRFS_I(inode)->in_defrag = 0; + range.start = defrag->last_offset; + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + defrag_batch); + /* + * if we filled the whole defrag batch, there + * must be more work to do. Queue this defrag + * again + */ + if (num_defrag == defrag_batch) { + defrag->last_offset = range.start; + __btrfs_add_inode_defrag(inode, defrag); + /* + * we don't want to kfree defrag, we added it back to + * the rbtree + */ + defrag = NULL; + } else if (defrag->last_offset && !defrag->cycled) { + /* + * we didn't fill our defrag batch, but + * we didn't start at zero. Make sure we loop + * around to the start of the file. 
+ */ + defrag->last_offset = 0; + defrag->cycled = 1; + __btrfs_add_inode_defrag(inode, defrag); + defrag = NULL; + } + + iput(inode); +next: + spin_lock(&fs_info->defrag_inodes_lock); +next_free: + kfree(defrag); + } + spin_unlock(&fs_info->defrag_inodes_lock); + + atomic_dec(&fs_info->defrag_running); + + /* + * during unmount, we use the transaction_wait queue to + * wait for the defragger to stop + */ + wake_up(&fs_info->transaction_wait); + return 0; +} + +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ +static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, + size_t write_bytes, + struct page **prepared_pages, + struct iov_iter *i) +{ + size_t copied = 0; + size_t total_copied = 0; + int pg = 0; + int offset = pos & (PAGE_CACHE_SIZE - 1); + + while (write_bytes > 0) { + size_t count = min_t(size_t, + PAGE_CACHE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[pg]; + /* + * Copy data from userspace to the current page + * + * Disable pagefault to avoid recursive lock since + * the pages are already locked + */ + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, count); + pagefault_enable(); + + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. + */ + if (!PageUptodate(page) && copied < count) + copied = 0; + + iov_iter_advance(i, copied); + write_bytes -= copied; + total_copied += copied; + + /* Return to btrfs_file_aio_write to fault page */ + if (unlikely(copied == 0)) + break; + + if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + offset += copied; + } else { + pg++; + offset = 0; + } + } + return total_copied; +} + +/* + * unlocks pages after btrfs_file_write is done with them + */ +void btrfs_drop_pages(struct page **pages, size_t num_pages) +{ + size_t i; + for (i = 0; i < num_pages; i++) { + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here + */ + ClearPageChecked(pages[i]); + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } +} + +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. 
+ */ +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached) +{ + int err = 0; + int i; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(inode); + + start_pos = pos & ~((u64)root->sectorsize - 1); + num_bytes = (write_bytes + pos - start_pos + + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + cached); + if (err) + return err; + + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); + } + + /* + * we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ + if (end_pos > isize) + i_size_write(inode, end_pos); + return 0; +} + +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) +{ + struct extent_map *em; + struct extent_map *split = NULL; + struct extent_map *split2 = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 len = end - start + 1; + int ret; + int testend = 1; + unsigned long flags; + int compressed = 0; + + WARN_ON(end < start); + if (end == (u64)-1) { + len = (u64)-1; + testend = 0; + } + while (1) { + if (!split) + split = alloc_extent_map(); + if (!split2) + split2 = alloc_extent_map(); + BUG_ON(!split || !split2); + + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + write_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + if (testend && em->start + em->len >= start + len) { + free_extent_map(em); + write_unlock(&em_tree->lock); + break; + } + start = em->start + em->len; + if (testend) + len = start + len - (em->start + em->len); + free_extent_map(em); + write_unlock(&em_tree->lock); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + remove_extent_mapping(em_tree, em); + + if (em->block_start < EXTENT_MAP_LAST_BYTE && + em->start < start) { + split->start = em->start; + split->len = start - em->start; + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + + split->bdev = em->bdev; + split->flags = flags; + split->compress_type = em->compress_type; + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em->block_start < EXTENT_MAP_LAST_BYTE && + testend && em->start + em->len > start + len) { + u64 diff = start + len - em->start; + + split->start = start + len; + split->len = em->start + em->len - (start + len); + split->bdev = em->bdev; + split->flags = flags; + split->compress_type = em->compress_type; + + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + diff; + split->orig_start = split->start; + } + + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + 
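/*
 * A rough worked example of the split arithmetic above, with made-up
 * numbers and an uncompressed extent: if the cached mapping has
 * em->start = 0, em->len = 8192, em->block_start = 1M and the range
 * being dropped is start = 2048, end = 4095 (len = 2048), the first
 * split keeps [0, 2048) with block_start = 1M, and this second split
 * keeps [4096, 8192) with diff = start + len - em->start = 4096 and
 * block_start = 1M + 4096.  Compressed extents instead keep the
 * original block_start/block_len, since the compressed on-disk bytes
 * can't be sliced at an arbitrary logical offset.
 */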
free_extent_map(split); + split = NULL; + } + write_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); + /* once for the tree*/ + free_extent_map(em); + } + if (split) + free_extent_map(split); + if (split2) + free_extent_map(split2); + return 0; +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. + */ +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key new_key; + u64 ino = btrfs_ino(inode); + u64 search_start = start; + u64 disk_bytenr = 0; + u64 num_bytes = 0; + u64 extent_offset = 0; + u64 extent_end = 0; + int del_nr = 0; + int del_slot = 0; + int extent_type; + int recow; + int ret; + + if (drop_cache) + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + recow = 0; + ret = btrfs_lookup_file_extent(trans, root, path, ino, + search_start, -1); + if (ret < 0) + break; + if (ret > 0 && path->slots[0] > 0 && search_start == start) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + ret = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + BUG_ON(del_nr > 0); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + leaf = path->nodes[0]; + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid > ino || + key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); + } else { + WARN_ON(1); + extent_end = search_start; + } + + if (extent_end <= search_start) { + path->slots[0]++; + goto next_slot; + } + + search_start = max(key.offset, start); + if (recow) { + btrfs_release_path(path); + continue; + } + + /* + * | - range to drop - | + * | -------- extent -------- | + */ + if (start > key.offset && end < extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = start; + ret = btrfs_duplicate_item(trans, root, path, + &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + continue; + } + if (ret < 0) + break; + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + 
start - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + extent_offset += start - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - start); + btrfs_mark_buffer_dirty(leaf); + + if (disk_bytenr > 0) { + ret = btrfs_inc_extent_ref(trans, root, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + new_key.objectid, + start - extent_offset); + BUG_ON(ret); + *hint_byte = disk_bytenr; + } + key.offset = start; + } + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start <= key.offset && end < extent_end) { + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + + extent_offset += end - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, end - key.offset); + *hint_byte = disk_bytenr; + } + break; + } + + search_start = extent_end; + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start > key.offset && end >= extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, extent_end - start); + *hint_byte = disk_bytenr; + } + if (end == extent_end) + break; + + path->slots[0]++; + goto next_slot; + } + + /* + * | ---- range to drop ----- | + * | ------ extent ------ | + */ + if (start <= key.offset && end >= extent_end) { + if (del_nr == 0) { + del_slot = path->slots[0]; + del_nr = 1; + } else { + BUG_ON(del_slot + del_nr != path->slots[0]); + del_nr++; + } + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + inode_sub_bytes(inode, + extent_end - key.offset); + extent_end = ALIGN(extent_end, + root->sectorsize); + } else if (disk_bytenr > 0) { + ret = btrfs_free_extent(trans, root, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + key.objectid, key.offset - + extent_offset); + BUG_ON(ret); + inode_sub_bytes(inode, + extent_end - key.offset); + *hint_byte = disk_bytenr; + } + + if (end == extent_end) + break; + + if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { + path->slots[0]++; + goto next_slot; + } + + ret = btrfs_del_items(trans, root, path, del_slot, + del_nr); + BUG_ON(ret); + + del_nr = 0; + del_slot = 0; + + btrfs_release_path(path); + continue; + } + + BUG_ON(1); + } + + if (del_nr > 0) { + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + } + + btrfs_free_path(path); + return ret; +} + +static int extent_mergeable(struct extent_buffer *leaf, int slot, + u64 objectid, u64 bytenr, u64 orig_offset, + u64 *start, u64 *end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || + btrfs_file_extent_compression(leaf, fi) || + 
btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if ((*start && *start != key.offset) || (*end && *end != extent_end)) + return 0; + + *start = key.offset; + *end = extent_end; + return 1; +} + +/* + * Mark extent in the range start - end as written. + * + * This changes extent type from 'pre-allocated' to 'regular'. If only + * part of extent is marked as written, the extent will be split into + * two or three. + */ +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key new_key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 orig_offset; + u64 other_start; + u64 other_end; + u64 split; + int del_nr = 0; + int del_slot = 0; + int recow; + int ret; + u64 ino = btrfs_ino(inode); + + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + recow = 0; + split = start; + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = split; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + BUG_ON(btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_PREALLOC); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + BUG_ON(key.offset > start || extent_end < end); + + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + memcpy(&new_key, &key, sizeof(new_key)); + + if (start == key.offset && end < extent_end) { + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_set_file_extent_offset(leaf, fi, + end - orig_offset); + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + end - other_start); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + } + + if (start > key.offset && end == extent_end) { + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + path->slots[0]++; + new_key.offset = start; + btrfs_set_item_key_safe(trans, root, path, &new_key); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - start); + btrfs_set_file_extent_offset(leaf, fi, + start - orig_offset); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + } + + while (start > key.offset || end < extent_end) { + if (key.offset == start) + split = end; + 
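/*
 * A small worked example of this loop, with made-up offsets: for a
 * preallocated extent at key.offset = 0, extent_end = 16384 and a
 * written range start = 4096, end = 8192, the first pass duplicates
 * the item at split = start, the second pass (key.offset is now 4096,
 * so split becomes end) duplicates again at 8192, and we end up with
 * three items: [0, 4096) prealloc, [4096, 8192) about to be flipped
 * to regular below, and [8192, 16384) prealloc.  If the written range
 * touches an edge of the extent the loop runs only once, and the
 * extent_mergeable() checks before and after the loop try to fold the
 * written piece into a neighbouring regular extent instead.
 */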
+ new_key.offset = split; + ret = btrfs_duplicate_item(trans, root, path, &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + goto again; + } + BUG_ON(ret < 0); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + split - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - split); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + ino, orig_offset); + BUG_ON(ret); + + if (split == start) { + key.offset = start; + } else { + BUG_ON(start != key.offset); + path->slots[0]--; + extent_end = end; + } + recow = 1; + } + + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + ino, orig_offset); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + ino, orig_offset); + BUG_ON(ret); + } + if (del_nr == 0) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_mark_buffer_dirty(leaf); + } else { + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + } +out: + btrfs_free_path(path); + return 0; +} + +/* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct page *page, u64 pos) +{ + int ret = 0; + + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + +/* + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified. 
+ */ +static noinline int prepare_pages(struct btrfs_root *root, struct file *file, + struct page **pages, size_t num_pages, + loff_t pos, unsigned long first_index, + unsigned long last_index, size_t write_bytes) +{ + struct extent_state *cached_state = NULL; + int i; + unsigned long index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = fdentry(file)->d_inode; + int err = 0; + int faili = 0; + u64 start_pos; + u64 last_pos; + + start_pos = pos & ~((u64)root->sectorsize - 1); + last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + + if (start_pos > inode->i_size) { + err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + if (err) + return err; + } + +again: + for (i = 0; i < num_pages; i++) { + pages[i] = grab_cache_page(inode->i_mapping, index + i); + if (!pages[i]) { + faili = i - 1; + err = -ENOMEM; + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; + } + wait_on_page_writeback(pages[i]); + } + err = 0; + if (start_pos < inode->i_size) { + struct btrfs_ordered_extent *ordered; + lock_extent_bits(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, 0, &cached_state, + GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + last_pos - 1); + if (ordered && + ordered->file_offset + ordered->len > start_pos && + ordered->file_offset < last_pos) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, + &cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, + GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, &cached_state, + GFP_NOFS); + } + for (i = 0; i < num_pages; i++) { + clear_page_dirty_for_io(pages[i]); + set_page_extent_mapped(pages[i]); + WARN_ON(!PageLocked(pages[i])); + } + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +static noinline ssize_t __btrfs_buffered_write(struct file *file, + struct iov_iter *i, + loff_t pos) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page **pages = NULL; + unsigned long first_index; + unsigned long last_index; + size_t num_written = 0; + int nrptrs; + int ret = 0; + + nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / + (sizeof(struct page *))); + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT; + + while (iov_iter_count(i) > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(iov_iter_count(i), + nrptrs * (size_t)PAGE_CACHE_SIZE - + offset); + size_t num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t dirty_pages; + size_t copied; + + WARN_ON(num_pages > nrptrs); + + /* + * Fault pages before locking them in prepare_pages + * to avoid recursive lock + */ + if 
(unlikely(iov_iter_fault_in_readable(i, write_bytes))) { + ret = -EFAULT; + break; + } + + ret = btrfs_delalloc_reserve_space(inode, + num_pages << PAGE_CACHE_SHIFT); + if (ret) + break; + + /* + * This is going to setup the pages array with the number of + * pages we want, so we don't really need to worry about the + * contents of pages from loop to loop + */ + ret = prepare_pages(root, file, pages, num_pages, + pos, first_index, last_index, + write_bytes); + if (ret) { + btrfs_delalloc_release_space(inode, + num_pages << PAGE_CACHE_SHIFT); + break; + } + + copied = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, i); + + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) + dirty_pages = 0; + else + dirty_pages = (copied + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + /* + * If we had a short copy we need to release the excess delaloc + * bytes we reserved. We need to increment outstanding_extents + * because btrfs_delalloc_release_space will decrement it, but + * we still have an outstanding extent for the chunk we actually + * managed to copy. + */ + if (num_pages > dirty_pages) { + if (copied > 0) + atomic_inc( + &BTRFS_I(inode)->outstanding_extents); + btrfs_delalloc_release_space(inode, + (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT); + } + + if (copied > 0) { + ret = btrfs_dirty_pages(root, inode, pages, + dirty_pages, pos, copied, + NULL); + if (ret) { + btrfs_delalloc_release_space(inode, + dirty_pages << PAGE_CACHE_SHIFT); + btrfs_drop_pages(pages, num_pages); + break; + } + } + + btrfs_drop_pages(pages, num_pages); + + cond_resched(); + + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_pages); + if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + + pos += copied; + num_written += copied; + } + + kfree(pages); + + return num_written ? num_written : ret; +} + +static ssize_t __btrfs_direct_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos, + loff_t *ppos, size_t count, size_t ocount) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct iov_iter i; + ssize_t written; + ssize_t written_buffered; + loff_t endbyte; + int err; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, + count, ocount); + + /* + * the generic O_DIRECT will update in-memory i_size after the + * DIOs are done. But our endio handlers that update the on + * disk i_size never update past the in memory i_size. So we + * need one more update here to catch any additions to the + * file + */ + if (inode->i_size != BTRFS_I(inode)->disk_i_size) { + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } + + if (written < 0 || written == count) + return written; + + pos += written; + count -= written; + iov_iter_init(&i, iov, nr_segs, count, written); + written_buffered = __btrfs_buffered_write(file, &i, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + endbyte = pos + written_buffered - 1; + err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + if (err) + goto out; + written += written_buffered; + *ppos = pos + written_buffered; + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); +out: + return written ? 
written : err; +} + +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + loff_t *ppos = &iocb->ki_pos; + ssize_t num_written = 0; + ssize_t err = 0; + size_t count, ocount; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + mutex_lock(&inode->i_mutex); + + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + count = ocount; + + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + if (count == 0) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + err = file_remove_suid(file); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. + */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + mutex_unlock(&inode->i_mutex); + err = -EROFS; + goto out; + } + + file_update_time(file); + BTRFS_I(inode)->sequence++; + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = __btrfs_direct_write(iocb, iov, nr_segs, + pos, ppos, count, ocount); + } else { + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, num_written); + + num_written = __btrfs_buffered_write(file, &i, pos); + if (num_written > 0) + *ppos = pos + num_written; + } + + mutex_unlock(&inode->i_mutex); + + /* + * we want to make sure fsync finds this change + * but we haven't joined a transaction running right now. + * + * Later on, someone is sure to update the inode and get the + * real transid recorded. + * + * We set last_trans now to the fs_info generation + 1, + * this will either be one more than the running transaction + * or the generation used for the next transaction if there isn't + * one running right now. + */ + BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + if (num_written > 0 || num_written == -EIOCBQUEUED) { + err = generic_write_sync(file, pos, num_written); + if (err < 0 && num_written > 0) + num_written = err; + } +out: + current->backing_dev_info = NULL; + return num_written ? num_written : err; +} + +int btrfs_release_file(struct inode *inode, struct file *filp) +{ + /* + * ordered_data_close is set by settattr when we are about to truncate + * a file from a non-zero size to a zero size. This tries to + * flush down new bytes that may have been written if the + * application were using truncate to replace a file in place. + */ + if (BTRFS_I(inode)->ordered_data_close) { + BTRFS_I(inode)->ordered_data_close = 0; + btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(inode->i_mapping); + } + if (filp->private_data) + btrfs_ioctl_trans_end(filp); + return 0; +} + +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. 
This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */ +int btrfs_sync_file(struct file *file, int datasync) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + struct btrfs_trans_handle *trans; + + trace_btrfs_sync_file(file, datasync); + + /* we wait first, since the writeback may change the inode */ + root->log_batch++; + /* the VFS called filemap_fdatawrite for us */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->log_batch++; + + /* + * check the transaction that last modified this inode + * and see if its already been committed + */ + if (!BTRFS_I(inode)->last_trans) + goto out; + + /* + * if the last transaction that changed this file was before + * the current transaction, we can bail out now without any + * syncing + */ + smp_mb(); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { + BTRFS_I(inode)->last_trans = 0; + goto out; + } + + /* + * ok we haven't committed the transaction yet, lets do a commit + */ + if (file->private_data) + btrfs_ioctl_trans_end(file); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_log_dentry_safe(trans, root, dentry); + if (ret < 0) + goto out; + + /* we've logged all the items and now have a consistent + * version of the file in the log. It is possible that + * someone will come in and modify the file, but that's + * fine because the log is consistent on disk, and we + * have references to all of the file's extents + * + * It is possible that someone will come in and log the + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ + mutex_unlock(&dentry->d_inode->i_mutex); + + if (ret != BTRFS_NO_LOG_SYNC) { + if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + } else { + ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&dentry->d_inode->i_mutex); +out: + return ret > 0 ? -EIO : ret; +} + +static const struct vm_operations_struct btrfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = btrfs_page_mkwrite, +}; + +static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct address_space *mapping = filp->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + + file_accessed(filp); + vma->vm_ops = &btrfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + + return 0; +} + +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. 
+ */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, i_size_read(inode), + alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR_OR_NULL(em)); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +const struct file_operations btrfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .aio_write = btrfs_file_aio_write, + .mmap = btrfs_file_mmap, + .open = generic_file_open, + .release = btrfs_release_file, + .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif +}; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 00000000..bf0d6156 --- /dev/null +++ b/fs/btrfs/free-space-cache.c @@ -0,0 +1,2693 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include "ctree.h" +#include "free-space-cache.h" +#include "transaction.h" +#include "disk-io.h" +#include "extent_io.h" +#include "inode-map.h" + +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) + +static int link_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); + +static struct inode *__lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_path *path, + u64 offset) +{ + struct btrfs_key key; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode = NULL; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + btrfs_release_path(path); + return ERR_PTR(-ENOENT); + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_free_space_key(leaf, header, &disk_key); + btrfs_disk_key_to_cpu(&location, &disk_key); + btrfs_release_path(path); + + inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); + if (!inode) + return ERR_PTR(-ENOENT); + if (IS_ERR(inode)) + return inode; + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } + + inode->i_mapping->flags &= ~__GFP_FS; + + return inode; +} + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path) +{ + struct inode *inode = NULL; + + spin_lock(&block_group->lock); + if (block_group->inode) + inode = igrab(block_group->inode); + spin_unlock(&block_group->lock); + if (inode) + return inode; + + inode = __lookup_free_space_inode(root, path, + block_group->key.objectid); + if (IS_ERR(inode)) + return inode; + + spin_lock(&block_group->lock); + if (!btrfs_fs_closing(root->fs_info)) { + block_group->inode = igrab(inode); + block_group->iref = 1; + } + spin_unlock(&block_group->lock); + + return inode; +} + +int __create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 ino, u64 offset) +{ + struct btrfs_key key; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + int ret; + + ret = btrfs_insert_empty_inode(trans, root, path, ino); + if (ret) + return ret; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + btrfs_item_key(leaf, &disk_key, path->slots[0]); + memset_extent_buffer(leaf, 0, (unsigned long)inode_item, + sizeof(*inode_item)); + btrfs_set_inode_generation(leaf, inode_item, trans->transid); + btrfs_set_inode_size(leaf, inode_item, 0); + btrfs_set_inode_nbytes(leaf, inode_item, 0); + btrfs_set_inode_uid(leaf, inode_item, 0); + btrfs_set_inode_gid(leaf, inode_item, 0); + btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); + btrfs_set_inode_nlink(leaf, inode_item, 1); + btrfs_set_inode_transid(leaf, inode_item, trans->transid); + 
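/*
 * A brief sketch of what ends up on disk for a space cache, as far as
 * this file uses it: the inode item built here describes a plain 0600
 * regular file flagged NOCOMPRESS | PREALLOC | NODATASUM, so the cache
 * data is never compressed or covered by the csum tree -- it carries
 * its own per-page checksums in its first page instead.  The second
 * item, inserted just below with objectid BTRFS_FREE_SPACE_OBJECTID,
 * type 0 and offset equal to the block group offset, is the
 * btrfs_free_space_header recording the entry count, bitmap count and
 * generation that __load_free_space_cache later verifies.
 */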
btrfs_set_inode_block_group(leaf, inode_item, offset); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_free_space_header)); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); + btrfs_set_free_space_key(leaf, header, &disk_key); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; +} + +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + int ret; + u64 ino; + + ret = btrfs_find_free_objectid(root, &ino); + if (ret < 0) + return ret; + + return __create_free_space_inode(root, trans, path, ino, + block_group->key.objectid); +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; + + trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, + 0, 5); + if (ret) + return ret; + + oldsize = i_size_read(inode); + btrfs_i_size_write(inode, 0); + truncate_pagecache(inode, oldsize, 0); + + /* + * We don't need an orphan item because truncating the free space cache + * will never be split across transactions. + */ + ret = btrfs_truncate_inode_items(trans, root, inode, + 0, BTRFS_EXTENT_DATA_KEY); + if (ret) { + WARN_ON(1); + return ret; + } + + ret = btrfs_update_inode(trans, root, inode); + return ret; +} + +static int readahead_cache(struct inode *inode) +{ + struct file_ra_state *ra; + unsigned long last_index; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + file_ra_state_init(ra, inode->i_mapping); + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); + + kfree(ra); + + return 0; +} + +int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_path *path, u64 offset) +{ + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct page *page; + u32 *checksums = NULL, *crc; + char *disk_crcs = NULL; + struct btrfs_key key; + struct list_head bitmaps; + u64 num_entries; + u64 num_bitmaps; + u64 generation; + u32 cur_crc = ~(u32)0; + pgoff_t index = 0; + unsigned long first_page_offset; + int num_checksums; + int ret = 0; + + INIT_LIST_HEAD(&bitmaps); + + /* Nothing in the space cache, goodbye */ + if (!i_size_read(inode)) + goto out; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + else if (ret > 0) { + btrfs_release_path(path); + ret = 0; + goto out; + } + + ret = -1; + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + num_entries = btrfs_free_space_entries(leaf, header); + num_bitmaps = btrfs_free_space_bitmaps(leaf, header); + generation = btrfs_free_space_generation(leaf, header); + btrfs_release_path(path); + + if (BTRFS_I(inode)->generation != generation) { + printk(KERN_ERR "btrfs: free space inode generation (%llu) did" + " not match free 
space cache generation (%llu)\n", + (unsigned long long)BTRFS_I(inode)->generation, + (unsigned long long)generation); + goto out; + } + + if (!num_entries) + goto out; + + /* Setup everything for doing checksumming */ + num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; + checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); + if (!checksums) + goto out; + first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); + disk_crcs = kzalloc(first_page_offset, GFP_NOFS); + if (!disk_crcs) + goto out; + + ret = readahead_cache(inode); + if (ret) + goto out; + + while (1) { + struct btrfs_free_space_entry *entry; + struct btrfs_free_space *e; + void *addr; + unsigned long offset = 0; + unsigned long start_offset = 0; + int need_loop = 0; + + if (!num_entries && !num_bitmaps) + break; + + if (index == 0) { + start_offset = first_page_offset; + offset = start_offset; + } + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + goto free_cache; + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + printk(KERN_ERR "btrfs: error reading free " + "space cache\n"); + goto free_cache; + } + } + addr = kmap(page); + + if (index == 0) { + u64 *gen; + + memcpy(disk_crcs, addr, first_page_offset); + gen = addr + (sizeof(u32) * num_checksums); + if (*gen != BTRFS_I(inode)->generation) { + printk(KERN_ERR "btrfs: space cache generation" + " (%llu) does not match inode (%llu)\n", + (unsigned long long)*gen, + (unsigned long long) + BTRFS_I(inode)->generation); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + crc = (u32 *)disk_crcs; + } + entry = addr + start_offset; + + /* First lets check our crc before we do anything fun */ + cur_crc = ~(u32)0; + cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, + PAGE_CACHE_SIZE - start_offset); + btrfs_csum_final(cur_crc, (char *)&cur_crc); + if (cur_crc != *crc) { + printk(KERN_ERR "btrfs: crc mismatch for page %lu\n", + index); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + crc++; + + while (1) { + if (!num_entries) + break; + + need_loop = 1; + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) { + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + + e->offset = le64_to_cpu(entry->offset); + e->bytes = le64_to_cpu(entry->bytes); + if (!e->bytes) { + kunmap(page); + kmem_cache_free(btrfs_free_space_cachep, e); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + + if (entry->type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + } else { + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kunmap(page); + kmem_cache_free( + btrfs_free_space_cachep, e); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + list_add_tail(&e->list, &bitmaps); + } + + 
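/*
 * Layout of the records being parsed here, as this loop interprets
 * them (fields of struct btrfs_free_space_entry):
 *
 *	__le64 offset;	start of the free range
 *	__le64 bytes;	byte count for the entry
 *	u8 type;	BTRFS_FREE_SPACE_EXTENT or BTRFS_FREE_SPACE_BITMAP
 *
 * Extent entries are self-contained; a bitmap entry only queues a
 * placeholder on the local 'bitmaps' list, and its PAGE_CACHE_SIZE
 * payload is copied from a later page that holds nothing but that
 * bitmap (the !need_loop case below).
 */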
num_entries--; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + break; + entry++; + } + + /* + * We read an entry out of this page, we need to move on to the + * next page. + */ + if (need_loop) { + kunmap(page); + goto next; + } + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. + */ + e = list_entry(bitmaps.next, struct btrfs_free_space, list); + list_del_init(&e->list); + memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); + kunmap(page); + num_bitmaps--; +next: + unlock_page(page); + page_cache_release(page); + index++; + } + + ret = 1; +out: + kfree(checksums); + kfree(disk_crcs); + return ret; +free_cache: + __btrfs_remove_free_space_cache(ctl); + goto out; +} + +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode; + struct btrfs_path *path; + int ret; + bool matched; + u64 used = btrfs_block_group_used(&block_group->item); + + /* + * If we're unmounting then just return, since this does a search on the + * normal root and not the commit root and we could deadlock. + */ + if (btrfs_fs_closing(fs_info)) + return 0; + + /* + * If this block group has been marked to be cleared for one reason or + * another then we can't trust the on disk cache, so just return. + */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + path = btrfs_alloc_path(); + if (!path) + return 0; + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) { + btrfs_free_path(path); + return 0; + } + + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, + path, block_group->key.objectid); + btrfs_free_path(path); + if (ret <= 0) + goto out; + + spin_lock(&ctl->tree_lock); + matched = (ctl->free_space == (block_group->key.offset - used - + block_group->bytes_super)); + spin_unlock(&ctl->tree_lock); + + if (!matched) { + __btrfs_remove_free_space_cache(ctl); + printk(KERN_ERR "block group %llu has an wrong amount of free " + "space\n", block_group->key.objectid); + ret = -1; + } +out: + if (ret < 0) { + /* This cache is bogus, make sure it gets cleared */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + ret = 0; + + printk(KERN_ERR "btrfs: failed to load free space cache " + "for block group %llu\n", block_group->key.objectid); + } + + iput(inode); + return ret; +} + +int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) +{ + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct rb_node *node; + struct list_head *pos, *n; + struct page **pages; + struct page *page; + struct extent_state *cached_state = NULL; + struct btrfs_free_cluster *cluster = NULL; + struct extent_io_tree *unpin = NULL; + struct list_head bitmap_list; + struct btrfs_key key; + u64 start, end, len; + u64 bytes = 0; + u32 *crc, *checksums; + unsigned long first_page_offset; + int index = 0, num_pages = 0; + int entries = 0; + int bitmaps = 0; + int ret = -1; + bool next_page = false; + bool 
out_of_space = false; + + INIT_LIST_HEAD(&bitmap_list); + + node = rb_first(&ctl->free_space_offset); + if (!node) + return 0; + + if (!i_size_read(inode)) + return -1; + + num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + /* Since the first page has all of our checksums and our generation we + * need to calculate the offset into the page that we can start writing + * our entries. + */ + first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); + + filemap_write_and_wait(inode->i_mapping); + btrfs_wait_ordered_range(inode, inode->i_size & + ~(root->sectorsize - 1), (u64)-1); + + /* make sure we don't overflow that first page */ + if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { + /* this is really the same as running out of space, where we also return 0 */ + printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); + ret = 0; + goto out_update; + } + + /* We need a checksum per page. */ + crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); + if (!crc) + return -1; + + pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); + if (!pages) { + kfree(crc); + return -1; + } + + /* Get the cluster for this block_group if it exists */ + if (block_group && !list_empty(&block_group->cluster_list)) + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + + /* + * Lock all pages first so we can lock the extent safely. + * + * NOTE: Because we hold the ref the entire time we're going to write to + * the page find_get_page should never fail, so we don't do a check + * after find_get_page at this point. Just putting this here so people + * know and don't freak out. + */ + while (index < num_pages) { + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + int i; + + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + goto out_free; + } + pages[index] = page; + index++; + } + + index = 0; + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state, GFP_NOFS); + + /* + * When searching for pinned extents, we need to start at our start + * offset. 
+ */ + if (block_group) + start = block_group->key.objectid; + + /* Write out the extent entries */ + do { + struct btrfs_free_space_entry *entry; + void *addr; + unsigned long offset = 0; + unsigned long start_offset = 0; + + next_page = false; + + if (index == 0) { + start_offset = first_page_offset; + offset = start_offset; + } + + if (index >= num_pages) { + out_of_space = true; + break; + } + + page = pages[index]; + + addr = kmap(page); + entry = addr + start_offset; + + memset(addr, 0, PAGE_CACHE_SIZE); + while (node && !next_page) { + struct btrfs_free_space *e; + + e = rb_entry(node, struct btrfs_free_space, offset_index); + entries++; + + entry->offset = cpu_to_le64(e->offset); + entry->bytes = cpu_to_le64(e->bytes); + if (e->bitmap) { + entry->type = BTRFS_FREE_SPACE_BITMAP; + list_add_tail(&e->list, &bitmap_list); + bitmaps++; + } else { + entry->type = BTRFS_FREE_SPACE_EXTENT; + } + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + next_page = true; + entry++; + } + + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + */ + while (block_group && !next_page && + (start < block_group->key.objectid + + block_group->key.offset)) { + ret = find_first_extent_bit(unpin, start, &start, &end, + EXTENT_DIRTY); + if (ret) { + ret = 0; + break; + } + + /* This pinned extent is out of our range */ + if (start >= block_group->key.objectid + + block_group->key.offset) + break; + + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); + + entries++; + entry->offset = cpu_to_le64(start); + entry->bytes = cpu_to_le64(len); + entry->type = BTRFS_FREE_SPACE_EXTENT; + + start = end + 1; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + next_page = true; + entry++; + } + *crc = ~(u32)0; + *crc = btrfs_csum_data(root, addr + start_offset, *crc, + PAGE_CACHE_SIZE - start_offset); + kunmap(page); + + btrfs_csum_final(*crc, (char *)crc); + crc++; + + bytes += PAGE_CACHE_SIZE; + + index++; + } while (node || next_page); + + /* Write out the bitmaps */ + list_for_each_safe(pos, n, &bitmap_list) { + void *addr; + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + + if (index >= num_pages) { + out_of_space = true; + break; + } + page = pages[index]; + + addr = kmap(page); + memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); + *crc = ~(u32)0; + *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); + kunmap(page); + btrfs_csum_final(*crc, (char *)crc); + crc++; + bytes += PAGE_CACHE_SIZE; + + list_del_init(&entry->list); + index++; + } + + if (out_of_space) { + btrfs_drop_pages(pages, num_pages); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, + GFP_NOFS); + ret = 0; + goto out_free; + } + + /* Zero out the rest of the pages just to make sure */ + while (index < num_pages) { + void *addr; + + page = pages[index]; + addr = kmap(page); + memset(addr, 0, PAGE_CACHE_SIZE); + kunmap(page); + bytes += PAGE_CACHE_SIZE; + index++; + } + + /* Write the checksums and trans id to the first page */ + { + void *addr; + u64 *gen; + + page = pages[0]; + + addr = kmap(page); + memcpy(addr, checksums, sizeof(u32) * num_pages); + gen = addr + (sizeof(u32) * num_pages); + *gen = trans->transid; + 
kunmap(page); + } + + ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, + bytes, &cached_state); + btrfs_drop_pages(pages, num_pages); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + if (ret) { + ret = 0; + goto out_free; + } + + BTRFS_I(inode)->generation = trans->transid; + + filemap_write_and_wait(inode->i_mapping); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + if (ret < 0) { + ret = -1; + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + goto out_free; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + BUG_ON(!path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != offset) { + ret = -1; + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, + GFP_NOFS); + btrfs_release_path(path); + goto out_free; + } + } + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + ret = 1; + +out_free: + kfree(checksums); + kfree(pages); + +out_update: + if (ret != 1) { + invalidate_inode_pages2_range(inode->i_mapping, 0, index); + BTRFS_I(inode)->generation = 0; + } + btrfs_update_inode(trans, root, inode); + return ret; +} + +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct inode *inode; + int ret = 0; + + root = root->fs_info->tree_root; + + spin_lock(&block_group->lock); + if (block_group->disk_cache_state < BTRFS_DC_SETUP) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) + return 0; + + ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, + path, block_group->key.objectid); + if (ret < 0) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + ret = 0; + + printk(KERN_ERR "btrfs: failed to write free space cache " + "for block group %llu\n", block_group->key.objectid); + } + + iput(inode); + return ret; +} + +static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, + u64 offset) +{ + BUG_ON(offset < bitmap_start); + offset -= bitmap_start; + return (unsigned long)(div_u64(offset, unit)); +} + +static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) +{ + return (unsigned long)(div_u64(bytes, unit)); +} + +static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, + u64 offset) +{ + u64 bitmap_start; + u64 bytes_per_bitmap; + + bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; + bitmap_start = offset - ctl->start; + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start *= bytes_per_bitmap; + bitmap_start += ctl->start; + + return bitmap_start; +} + +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node, int 
bitmap) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (offset < info->offset) { + p = &(*p)->rb_left; + } else if (offset > info->offset) { + p = &(*p)->rb_right; + } else { + /* + * we could have a bitmap entry and an extent entry + * share the same offset. If this is the case, we want + * the extent entry to always be found first if we do a + * linear search through the tree, since we want to have + * the quickest allocation time, and allocating from an + * extent is faster than allocating from a bitmap. So + * if we're inserting a bitmap and we find an entry at + * this offset, we want to go right, or after this entry + * logically. If we are inserting an extent and we've + * found a bitmap, we want to go left, or before + * logically. + */ + if (bitmap) { + if (info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } + p = &(*p)->rb_right; + } else { + if (!info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } + p = &(*p)->rb_left; + } + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +/* + * searches the tree for the given offset. + * + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset. + */ +static struct btrfs_free_space * +tree_search_offset(struct btrfs_free_space_ctl *ctl, + u64 offset, int bitmap_only, int fuzzy) +{ + struct rb_node *n = ctl->free_space_offset.rb_node; + struct btrfs_free_space *entry, *prev = NULL; + + /* find entry that is closest to the 'offset' */ + while (1) { + if (!n) { + entry = NULL; + break; + } + + entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; + + if (offset < entry->offset) + n = n->rb_left; + else if (offset > entry->offset) + n = n->rb_right; + else + break; + } + + if (bitmap_only) { + if (!entry) + return NULL; + if (entry->bitmap) + return entry; + + /* + * bitmap entry and extent entry may share same offset, + * in that case, bitmap entry comes after extent entry. 
+ */ + n = rb_next(n); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->offset != offset) + return NULL; + + WARN_ON(!entry->bitmap); + return entry; + } else if (entry) { + if (entry->bitmap) { + /* + * if previous extent entry covers the offset, + * we should return it instead of the bitmap entry + */ + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + entry = prev; + break; + } + } + } + return entry; + } + + if (!prev) + return NULL; + + /* find last entry before the 'offset' */ + entry = prev; + if (entry->offset > offset) { + n = rb_prev(&entry->offset_index); + if (n) { + entry = rb_entry(n, struct btrfs_free_space, + offset_index); + BUG_ON(entry->offset > offset); + } else { + if (fuzzy) + return entry; + else + return NULL; + } + } + + if (entry->bitmap) { + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + return prev; + break; + } + } + if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) + return entry; + } else if (entry->offset + entry->bytes > offset) + return entry; + + if (!fuzzy) + return NULL; + + while (1) { + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + ctl->unit > offset) + break; + } else { + if (entry->offset + entry->bytes > offset) + break; + } + + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; +} + +static inline void +__unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &ctl->free_space_offset); + ctl->free_extents--; +} + +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + __unlink_free_space(ctl, info); + ctl->free_space -= info->bytes; +} + +static int link_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + int ret = 0; + + BUG_ON(!info->bitmap && !info->bytes); + ret = tree_insert_offset(&ctl->free_space_offset, info->offset, + &info->offset_index, (info->bitmap != NULL)); + if (ret) + return ret; + + ctl->free_space += info->bytes; + ctl->free_extents++; + return ret; +} + +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_block_group_cache *block_group = ctl->private; + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; + u64 size = block_group->key.offset; + u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + + BUG_ON(ctl->total_bitmaps > max_bitmaps); + + /* + * The goal is to keep the total amount of memory used per 1gb of space + * at or below 32k, so we need to adjust how much memory we allow to be + * used by extent based free space tracking + */ + if (size < 1024 * 1024 * 1024) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * + div64_u64(size, 1024 * 1024 * 1024); + + /* + * we want to account for 1 more bitmap than what we have so we can make + * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as + * we add more bitmaps. 
+ */ + bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE; + + if (bitmap_bytes >= max_bytes) { + ctl->extents_thresh = 0; + return; + } + + /* + * we want the extent entry threshold to always be at most 1/2 the maxw + * bytes we can have, or whatever is less than that. + */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + + ctl->extents_thresh = + div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); +} + +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, count; + + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + BUG_ON(start + count > BITS_PER_BITMAP); + + bitmap_clear(info->bitmap, start, count); + + info->bytes -= bytes; + ctl->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, count; + + start = offset_to_bit(info->offset, ctl->unit, offset); + count = bytes_to_bits(bytes, ctl->unit); + BUG_ON(start + count > BITS_PER_BITMAP); + + bitmap_set(info->bitmap, start, count); + + info->bytes += bytes; + ctl->free_space += bytes; +} + +static int search_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes) +{ + unsigned long found_bits = 0; + unsigned long bits, i; + unsigned long next_zero; + + i = offset_to_bit(bitmap_info->offset, ctl->unit, + max_t(u64, *offset, bitmap_info->offset)); + bits = bytes_to_bits(*bytes, ctl->unit); + + for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(bitmap_info->bitmap, + BITS_PER_BITMAP, i); + if ((next_zero - i) >= bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (found_bits) { + *offset = (u64)(i * ctl->unit) + bitmap_info->offset; + *bytes = (u64)(found_bits) * ctl->unit; + return 0; + } + + return -1; +} + +static struct btrfs_free_space * +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + int ret; + + if (!ctl->free_space_offset.rb_node) + return NULL; + + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); + if (!entry) + return NULL; + + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (entry->bytes < *bytes) + continue; + + if (entry->bitmap) { + ret = search_bitmap(ctl, entry, offset, bytes); + if (!ret) + return entry; + continue; + } + + *offset = entry->offset; + *bytes = entry->bytes; + return entry; + } + + return NULL; +} + +static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset) +{ + info->offset = offset_to_bitmap(ctl, offset); + info->bytes = 0; + link_free_space(ctl, info); + ctl->total_bitmaps++; + + ctl->op->recalc_thresholds(ctl); +} + +static void free_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info) +{ + unlink_free_space(ctl, bitmap_info); + kfree(bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); +} + +static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, + 
u64 *offset, u64 *bytes) +{ + u64 end; + u64 search_start, search_bytes; + int ret; + +again: + end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; + + /* + * XXX - this can go away after a few releases. + * + * since the only user of btrfs_remove_free_space is the tree logging + * stuff, and the only way to test that is under crash conditions, we + * want to have this debug stuff here just in case something's not + * working. Search the bitmap for the space we are trying to use to + * make sure it's actually there. If it's not there then we need to stop + * because something has gone wrong. + */ + search_start = *offset; + search_bytes = *bytes; + search_bytes = min(search_bytes, end - search_start + 1); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); + BUG_ON(ret < 0 || search_start != *offset); + + if (*offset > bitmap_info->offset && *offset + *bytes > end) { + bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1); + *bytes -= end - *offset + 1; + *offset = end + 1; + } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { + bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes); + *bytes = 0; + } + + if (*bytes) { + struct rb_node *next = rb_next(&bitmap_info->offset_index); + if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); + + /* + * no entry after this bitmap, but we still have bytes to + * remove, so something has gone wrong. + */ + if (!next) + return -EINVAL; + + bitmap_info = rb_entry(next, struct btrfs_free_space, + offset_index); + + /* + * if the next entry isn't a bitmap we need to return to let the + * extent stuff do its work. + */ + if (!bitmap_info->bitmap) + return -EAGAIN; + + /* + * Ok the next item is a bitmap, but it may not actually hold + * the information for the rest of this free space stuff, so + * look for it, and if we don't find it return so we can try + * everything over again. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(ctl, bitmap_info, &search_start, + &search_bytes); + if (ret < 0 || search_start != *offset) + return -EAGAIN; + + goto again; + } else if (!bitmap_info->bytes) + free_bitmap(ctl, bitmap_info); + + return 0; +} + +static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + u64 bytes_to_set = 0; + u64 end; + + end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); + + bytes_to_set = min(end - offset, bytes); + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + + return bytes_to_set; + +} + +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_block_group_cache *block_group = ctl->private; + + /* + * If we are below the extents threshold then we can add this as an + * extent, and don't have to deal with the bitmap + */ + if (ctl->free_extents < ctl->extents_thresh) { + /* + * If this block group has some small extents we don't want to + * use up all of our free slots in the cache with them, we want + * to reserve them to larger extents, however if we have plenty + * of cache left then go ahead and add them, no sense in adding + * the overhead of a bitmap if we don't have to. 
+ */ + if (info->bytes <= block_group->sectorsize * 4) { + if (ctl->free_extents * 2 <= ctl->extents_thresh) + return false; + } else { + return false; + } + } + + /* + * some block groups are so tiny they can't be enveloped by a bitmap, so + * don't even bother to create a bitmap for this + */ + if (BITS_PER_BITMAP * block_group->sectorsize > + block_group->key.offset) + return false; + + return true; +} + +static struct btrfs_free_space_op free_space_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + +static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + struct btrfs_block_group_cache *block_group = NULL; + int added = 0; + u64 bytes, offset, bytes_added; + int ret; + + bytes = info->bytes; + offset = info->offset; + + if (!ctl->op->use_bitmap(ctl, info)) + return 0; + + if (ctl->op == &free_space_op) + block_group = ctl->private; +again: + /* + * Since we link bitmaps right into the cluster we need to see if we + * have a cluster here, and if so and it has our bitmap we need to add + * the free space to that bitmap. + */ + if (block_group && !list_empty(&block_group->cluster_list)) { + struct btrfs_free_cluster *cluster; + struct rb_node *node; + struct btrfs_free_space *entry; + + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + spin_lock(&cluster->lock); + node = rb_first(&cluster->root); + if (!node) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (!entry->bitmap) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + if (entry->offset == offset_to_bitmap(ctl, offset)) { + bytes_added = add_bytes_to_bitmap(ctl, entry, + offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + } + spin_unlock(&cluster->lock); + if (!bytes) { + ret = 1; + goto out; + } + } + +no_cluster_bitmap: + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + BUG_ON(added); + goto new_bitmap; + } + + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + added = 0; + + if (!bytes) { + ret = 1; + goto out; + } else + goto again; + +new_bitmap: + if (info && info->bitmap) { + add_new_bitmap(ctl, info, offset); + added = 1; + info = NULL; + goto again; + } else { + spin_unlock(&ctl->tree_lock); + + /* no pre-allocated info, allocate a new one */ + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!info) { + spin_lock(&ctl->tree_lock); + ret = -ENOMEM; + goto out; + } + } + + /* allocate the bitmap */ + info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + spin_lock(&ctl->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; + goto out; + } + goto again; + } + +out: + if (info) { + if (info->bitmap) + kfree(info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, info); + } + + return ret; +} + +static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, bool update_stat) +{ + struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info; + bool merged = false; + u64 offset = info->offset; + u64 bytes = info->bytes; + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + right_info = tree_search_offset(ctl, offset + 
bytes, 0, 0); + if (right_info && rb_prev(&right_info->offset_index)) + left_info = rb_entry(rb_prev(&right_info->offset_index), + struct btrfs_free_space, offset_index); + else + left_info = tree_search_offset(ctl, offset - 1, 0, 0); + + if (right_info && !right_info->bitmap) { + if (update_stat) + unlink_free_space(ctl, right_info); + else + __unlink_free_space(ctl, right_info); + info->bytes += right_info->bytes; + kmem_cache_free(btrfs_free_space_cachep, right_info); + merged = true; + } + + if (left_info && !left_info->bitmap && + left_info->offset + left_info->bytes == offset) { + if (update_stat) + unlink_free_space(ctl, left_info); + else + __unlink_free_space(ctl, left_info); + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kmem_cache_free(btrfs_free_space_cachep, left_info); + merged = true; + } + + return merged; +} + +int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + + spin_lock(&ctl->tree_lock); + + if (try_merge_free_space(ctl, info, true)) + goto link; + + /* + * There was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + ret = insert_into_bitmap(ctl, info); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } +link: + ret = link_free_space(ctl, info); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); +out: + spin_unlock(&ctl->tree_lock); + + if (ret) { + printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); + BUG_ON(ret == -EEXIST); + } + + return ret; +} + +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct btrfs_free_space *next_info = NULL; + int ret = 0; + + spin_lock(&ctl->tree_lock); + +again: + info = tree_search_offset(ctl, offset, 0, 0); + if (!info) { + /* + * oops didn't find an extent that matched the space we wanted + * to remove, look for a bitmap instead + */ + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!info) { + WARN_ON(1); + goto out_lock; + } + } + + if (info->bytes < bytes && rb_next(&info->offset_index)) { + u64 end; + next_info = rb_entry(rb_next(&info->offset_index), + struct btrfs_free_space, + offset_index); + + if (next_info->bitmap) + end = next_info->offset + + BITS_PER_BITMAP * ctl->unit - 1; + else + end = next_info->offset + next_info->bytes; + + if (next_info->bytes < bytes || + next_info->offset > offset || offset > end) { + printk(KERN_CRIT "Found free space at %llu, size %llu," + " trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); + WARN_ON(1); + ret = -EINVAL; + goto out_lock; + } + + info = next_info; + } + + if (info->bytes == bytes) { + unlink_free_space(ctl, info); + if (info->bitmap) { + kfree(info->bitmap); + ctl->total_bitmaps--; + } + kmem_cache_free(btrfs_free_space_cachep, info); + goto out_lock; + } + + if (!info->bitmap && info->offset == offset) { + unlink_free_space(ctl, info); + info->offset += bytes; + info->bytes -= bytes; + link_free_space(ctl, info); + goto out_lock; + } + + if (!info->bitmap && info->offset <= offset 
&& + info->offset + info->bytes >= offset + bytes) { + u64 old_start = info->offset; + /* + * we're freeing space in the middle of the info, + * this can happen during tree log replay + * + * first unlink the old info and then + * insert it again after the hole we're creating + */ + unlink_free_space(ctl, info); + if (offset + bytes < info->offset + info->bytes) { + u64 old_end = info->offset + info->bytes; + + info->offset = offset + bytes; + info->bytes = old_end - info->offset; + ret = link_free_space(ctl, info); + WARN_ON(ret); + if (ret) + goto out_lock; + } else { + /* the hole we're creating ends at the end + * of the info struct, just free the info + */ + kmem_cache_free(btrfs_free_space_cachep, info); + } + spin_unlock(&ctl->tree_lock); + + /* step two, insert a new info struct to cover + * anything before the hole + */ + ret = btrfs_add_free_space(block_group, old_start, + offset - old_start); + WARN_ON(ret); + goto out; + } + + ret = remove_from_bitmap(ctl, info, &offset, &bytes); + if (ret == -EAGAIN) + goto again; + BUG_ON(ret); +out_lock: + spin_unlock(&ctl->tree_lock); +out: + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes) + count++; + printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (info->bitmap) ? "yes" : "no"); + } + printk(KERN_INFO "block group has cluster?: %s\n", + list_empty(&block_group->cluster_list) ? "no" : "yes"); + printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" + "\n", count); +} + +void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + + spin_lock_init(&ctl->tree_lock); + ctl->unit = block_group->sectorsize; + ctl->start = block_group->key.objectid; + ctl->private = block_group; + ctl->op = &free_space_op; + + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + ctl->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); +} + +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. 
In that case, we just return without changing anything + */ +static int +__btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) + goto out; + + cluster->block_group = NULL; + cluster->window_start = 0; + list_del_init(&cluster->block_group_list); + + node = rb_first(&cluster->root); + while (node) { + bool bitmap; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + + bitmap = (entry->bitmap != NULL); + if (!bitmap) + try_merge_free_space(ctl, entry, false); + tree_insert_offset(&ctl->free_space_offset, + entry->offset, &entry->offset_index, bitmap); + } + cluster->root = RB_ROOT; + +out: + spin_unlock(&cluster->lock); + btrfs_put_block_group(block_group); + return 0; +} + +void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + while ((node = rb_last(&ctl->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } + if (need_resched()) { + spin_unlock(&ctl->tree_lock); + cond_resched(); + spin_lock(&ctl->tree_lock); + } + } +} + +void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ + spin_lock(&ctl->tree_lock); + __btrfs_remove_free_space_cache_locked(ctl); + spin_unlock(&ctl->tree_lock); +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_cluster *cluster; + struct list_head *head; + + spin_lock(&ctl->tree_lock); + while ((head = block_group->cluster_list.next) != + &block_group->cluster_list) { + cluster = list_entry(head, struct btrfs_free_cluster, + block_group_list); + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + if (need_resched()) { + spin_unlock(&ctl->tree_lock); + cond_resched(); + spin_lock(&ctl->tree_lock); + } + } + __btrfs_remove_free_space_cache_locked(ctl); + spin_unlock(&ctl->tree_lock); + +} + +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry = NULL; + u64 bytes_search = bytes + empty_size; + u64 ret = 0; + + spin_lock(&ctl->tree_lock); + entry = find_free_space(ctl, &offset, &bytes_search); + if (!entry) + goto out; + + ret = offset; + if (entry->bitmap) { + bitmap_clear_bits(ctl, entry, offset, bytes); + if (!entry->bytes) + free_bitmap(ctl, entry); + } else { + unlink_free_space(ctl, entry); + entry->offset += bytes; + entry->bytes -= bytes; + if (!entry->bytes) + kmem_cache_free(btrfs_free_space_cachep, entry); + else + link_free_space(ctl, entry); + } + +out: + spin_unlock(&ctl->tree_lock); + + return ret; +} + +/* + * given a cluster, put all of its extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. 
+ * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. + */ +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space_ctl *ctl; + int ret; + + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return 0; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return 0; + } + atomic_inc(&block_group->count); + spin_unlock(&cluster->lock); + + ctl = block_group->free_space_ctl; + + /* now return any extents the cluster had on it */ + spin_lock(&ctl->tree_lock); + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&ctl->tree_lock); + + /* finally drop our ref */ + btrfs_put_block_group(block_group); + return ret; +} + +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct btrfs_free_space *entry, + u64 bytes, u64 min_start) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + int err; + u64 search_start = cluster->window_start; + u64 search_bytes = bytes; + u64 ret = 0; + + search_start = min_start; + search_bytes = bytes; + + err = search_bitmap(ctl, entry, &search_start, &search_bytes); + if (err) + return 0; + + ret = search_start; + bitmap_clear_bits(ctl, entry, ret, bytes); + + return ret; +} + +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; + + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + + if (cluster->block_group != block_group) + goto out; + + node = rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + while(1) { + if (entry->bytes < bytes || + (!entry->bitmap && entry->offset < min_start)) { + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + + if (entry->bitmap) { + ret = btrfs_alloc_from_bitmap(block_group, + cluster, entry, bytes, + min_start); + if (ret == 0) { + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + } else { + + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + } + + if (entry->bytes == 0) + rb_erase(&entry->offset_index, &cluster->root); + break; + } +out: + spin_unlock(&cluster->lock); + + if (!ret) + return 0; + + spin_lock(&ctl->tree_lock); + + ctl->free_space -= bytes; + if (entry->bytes == 0) { + ctl->free_extents--; + if (entry->bitmap) { + kfree(entry->bitmap); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); + } + kmem_cache_free(btrfs_free_space_cachep, entry); + } + + spin_unlock(&ctl->tree_lock); + + return ret; +} + +static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, + struct 
btrfs_free_space *entry, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + unsigned long next_zero; + unsigned long i; + unsigned long search_bits; + unsigned long total_bits; + unsigned long found_bits; + unsigned long start = 0; + unsigned long total_found = 0; + int ret; + bool found = false; + + i = offset_to_bit(entry->offset, block_group->sectorsize, + max_t(u64, offset, entry->offset)); + search_bits = bytes_to_bits(bytes, block_group->sectorsize); + total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + +again: + found_bits = 0; + for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(entry->bitmap, + BITS_PER_BITMAP, i); + if (next_zero - i >= search_bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (!found_bits) + return -ENOSPC; + + if (!found) { + start = i; + found = true; + } + + total_found += found_bits; + + if (cluster->max_size < found_bits * block_group->sectorsize) + cluster->max_size = found_bits * block_group->sectorsize; + + if (total_found < total_bits) { + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); + if (i - start > total_bits * 2) { + total_found = 0; + cluster->max_size = 0; + found = false; + } + goto again; + } + + cluster->window_start = start * block_group->sectorsize + + entry->offset; + rb_erase(&entry->offset_index, &ctl->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 1); + BUG_ON(ret); + + return 0; +} + +/* + * This searches the block group for just extents to fill the cluster with. + */ +static noinline int +setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *first = NULL; + struct btrfs_free_space *entry = NULL; + struct btrfs_free_space *prev = NULL; + struct btrfs_free_space *last; + struct rb_node *node; + u64 window_start; + u64 window_free; + u64 max_extent; + u64 max_gap = 128 * 1024; + + entry = tree_search_offset(ctl, offset, 0, 1); + if (!entry) + return -ENOSPC; + + /* + * We don't want bitmaps, so just move along until we find a normal + * extent entry. + */ + while (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + window_start = entry->offset; + window_free = entry->bytes; + max_extent = entry->bytes; + first = entry; + last = entry; + prev = entry; + + while (window_free <= min_bytes) { + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + if (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); + continue; + } + + /* + * we haven't filled the empty size and the window is + * very large. 
reset and try again + */ + if (entry->offset - (prev->offset + prev->bytes) > max_gap || + entry->offset - window_start > (min_bytes * 2)) { + first = entry; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = entry->bytes; + } else { + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + prev = entry; + } + + cluster->window_start = first->offset; + + node = &first->offset_index; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + */ + do { + int ret; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + if (entry->bitmap) + continue; + + rb_erase(&entry->offset_index, &ctl->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 0); + BUG_ON(ret); + } while (node && entry != last); + + cluster->max_size = max_extent; + + return 0; +} + +/* + * This specifically looks for bitmaps that may work in the cluster, we assume + * that we have already failed to find extents that will work. + */ +static noinline int +setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = -ENOSPC; + + if (ctl->total_bitmaps == 0) + return -ENOSPC; + + /* + * First check our cached list of bitmaps and see if there is an entry + * here that will work. + */ + list_for_each_entry(entry, bitmaps, list) { + if (entry->bytes < min_bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, min_bytes); + if (!ret) + return 0; + } + + /* + * If we do have entries on our list and we are here then we didn't find + * anything, so go ahead and get the next entry after the last entry in + * this list and start the search from there. + */ + if (!list_empty(bitmaps)) { + entry = list_entry(bitmaps->prev, struct btrfs_free_space, + list); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + goto search; + } + + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); + if (!entry) + return -ENOSPC; + +search: + node = &entry->offset_index; + do { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + if (!entry->bitmap) + continue; + if (entry->bytes < min_bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, min_bytes); + } while (ret && node); + + return ret; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. 
+ * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct list_head bitmaps; + struct btrfs_free_space *entry, *tmp; + u64 min_bytes; + int ret; + + /* for metadata, allow allocates with more holes */ + if (btrfs_test_opt(root, SSD_SPREAD)) { + min_bytes = bytes + empty_size; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. + */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&ctl->tree_lock); + + /* + * If we know we don't have enough space to make a cluster don't even + * bother doing all the work to try and find one. + */ + if (ctl->free_space < min_bytes) { + spin_unlock(&ctl->tree_lock); + return -ENOSPC; + } + + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } + + INIT_LIST_HEAD(&bitmaps); + ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, + bytes, min_bytes); + if (ret) + ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, + offset, bytes, min_bytes); + + /* Clear our temporary list */ + list_for_each_entry_safe(entry, tmp, &bitmaps, list) + list_del_init(&entry->list); + + if (!ret) { + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, + &block_group->cluster_list); + cluster->block_group = block_group; + } +out: + spin_unlock(&cluster->lock); + spin_unlock(&ctl->tree_lock); + + return ret; +} + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root = RB_ROOT; + cluster->max_size = 0; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry = NULL; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 bytes = 0; + u64 actually_trimmed; + int ret = 0; + + *trimmed = 0; + + while (start < end) { + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + break; + } + + entry = tree_search_offset(ctl, start, 0, 1); + if (!entry) + entry = tree_search_offset(ctl, + offset_to_bitmap(ctl, start), + 1, 1); + + if (!entry || entry->offset >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + + if (entry->bitmap) { + ret = search_bitmap(ctl, entry, &start, &bytes); + if (!ret) { + if (start >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + bytes = min(bytes, end - start); + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + } else { + start = entry->offset + BITS_PER_BITMAP * + block_group->sectorsize; + spin_unlock(&ctl->tree_lock); + ret = 0; + continue; + } + } else { + start = 
entry->offset; + bytes = min(entry->bytes, end - start); + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + } + + spin_unlock(&ctl->tree_lock); + + if (bytes >= minlen) { + int update_ret; + update_ret = btrfs_update_reserved_bytes(block_group, + bytes, 1, 1); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, + bytes, + &actually_trimmed); + + btrfs_add_free_space(block_group, start, bytes); + if (!update_ret) + btrfs_update_reserved_bytes(block_group, + bytes, 0, 1); + + if (ret) + break; + *trimmed += actually_trimmed; + } + start += bytes; + bytes = 0; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + +/* + * Find the left-most item in the cache tree, and then return the + * smallest inode number in the item. + * + * Note: the returned inode number may not be the smallest one in + * the tree, if the left-most item is a bitmap. + */ +u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) +{ + struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; + struct btrfs_free_space *entry = NULL; + u64 ino = 0; + + spin_lock(&ctl->tree_lock); + + if (RB_EMPTY_ROOT(&ctl->free_space_offset)) + goto out; + + entry = rb_entry(rb_first(&ctl->free_space_offset), + struct btrfs_free_space, offset_index); + + if (!entry->bitmap) { + ino = entry->offset; + + unlink_free_space(ctl, entry); + entry->offset++; + entry->bytes--; + if (!entry->bytes) + kmem_cache_free(btrfs_free_space_cachep, entry); + else + link_free_space(ctl, entry); + } else { + u64 offset = 0; + u64 count = 1; + int ret; + + ret = search_bitmap(ctl, entry, &offset, &count); + BUG_ON(ret); + + ino = offset; + bitmap_clear_bits(ctl, entry, offset, 1); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + } +out: + spin_unlock(&ctl->tree_lock); + + return ino; +} + +struct inode *lookup_free_ino_inode(struct btrfs_root *root, + struct btrfs_path *path) +{ + struct inode *inode = NULL; + + spin_lock(&root->cache_lock); + if (root->cache_inode) + inode = igrab(root->cache_inode); + spin_unlock(&root->cache_lock); + if (inode) + return inode; + + inode = __lookup_free_space_inode(root, path, 0); + if (IS_ERR(inode)) + return inode; + + spin_lock(&root->cache_lock); + if (!btrfs_fs_closing(root->fs_info)) + root->cache_inode = igrab(inode); + spin_unlock(&root->cache_lock); + + return inode; +} + +int create_free_ino_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + return __create_free_space_inode(root, trans, path, + BTRFS_FREE_INO_OBJECTID, 0); +} + +int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_path *path; + struct inode *inode; + int ret = 0; + u64 root_gen = btrfs_root_generation(&root->root_item); + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + /* + * If we're unmounting then just return, since this does a search on the + * normal root and not the commit root and we could deadlock. 
+ */ + if (btrfs_fs_closing(fs_info)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return 0; + + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode)) + goto out; + + if (root_gen != BTRFS_I(inode)->generation) + goto out_put; + + ret = __load_free_space_cache(root, inode, ctl, path, 0); + + if (ret < 0) + printk(KERN_ERR "btrfs: failed to load free ino cache for " + "root %llu\n", root->root_key.objectid); +out_put: + iput(inode); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_write_out_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct inode *inode; + int ret; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode)) + return 0; + + ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); + if (ret < 0) + printk(KERN_ERR "btrfs: failed to write free ino cache " + "for root %llu\n", root->root_key.objectid); + + iput(inode); + return ret; +} diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h new file mode 100644 index 00000000..8f2613f7 --- /dev/null +++ b/fs/btrfs/free-space-cache.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_FREE_SPACE_CACHE +#define __BTRFS_FREE_SPACE_CACHE + +struct btrfs_free_space { + struct rb_node offset_index; + u64 offset; + u64 bytes; + unsigned long *bitmap; + struct list_head list; +}; + +struct btrfs_free_space_ctl { + spinlock_t tree_lock; + struct rb_root free_space_offset; + u64 free_space; + int extents_thresh; + int free_extents; + int total_bitmaps; + int unit; + u64 start; + struct btrfs_free_space_op *op; + void *private; +}; + +struct btrfs_free_space_op { + void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl); + bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); +}; + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path); +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode); +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +struct inode *lookup_free_ino_inode(struct btrfs_root *root, + struct btrfs_path *path); +int create_free_ino_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path); +int load_free_ino_cache(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +int btrfs_write_out_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path); + +void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); +int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, + u64 bytenr, u64 size); +static inline int +btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size) +{ + return __btrfs_add_free_space(block_group->free_space_ctl, + bytenr, size); +} +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size); +u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start); +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster); +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen); +#endif diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h new file mode 100644 index 00000000..db2ff977 --- /dev/null +++ b/fs/btrfs/hash.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007 Oracle. 
All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __HASH__ +#define __HASH__ + +#include +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 00000000..baa74f3d --- /dev/null +++ b/fs/btrfs/inode-item.c @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static int find_name_in_backref(struct btrfs_path *path, const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); + len = btrfs_inode_ref_name_len(leaf, ref); + name_ptr = (unsigned long)(ref + 1); + cur_offset += len + sizeof(*ref); + if (len != name_len) + continue; + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { + *ref_ret = ref; + return 1; + } + } + return 0; +} + +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod) +{ + struct btrfs_key key; + struct btrfs_inode_ref *ref; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + int ret; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + if (!find_name_in_backref(path, name, name_len, &ref)) + return NULL; + return ref; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + unsigned long item_start; + u32 item_size; + u32 sub_item_len; + int ret; + int del_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = -ENOENT; + goto out; + } else if (ret < 0) { + goto out; + } + if (!find_name_in_backref(path, name, name_len, &ref)) { + ret = -ENOENT; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); + + if (del_len == item_size) { + ret = btrfs_del_item(trans, root, path); + goto out; + } + ptr = (unsigned long)ref; + sub_item_len = name_len + sizeof(*ref); + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_size - (ptr + sub_item_len - item_start)); + ret = btrfs_truncate_item(trans, root, path, + item_size - sub_item_len, 1); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + unsigned long ptr; + int ret; + int ins_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + u32 old_size; + + if (find_name_in_backref(path, name, name_len, &ref)) + goto out; + + old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ret = btrfs_extend_item(trans, root, path, ins_len); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { + if (ret == -EOVERFLOW) + ret = -EMLINK; + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + } + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root 
*root, + struct btrfs_path *path, u64 objectid) +{ + struct btrfs_key key; + int ret; + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); + return ret; +} + +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod) +{ + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); + if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + location->offset == (u64)-1 && path->slots[0] != 0) { + slot = path->slots[0] - 1; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid == location->objectid && + btrfs_key_type(&found_key) == btrfs_key_type(location)) { + path->slots[0]--; + return 0; + } + } + return ret; +} diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c new file mode 100644 index 00000000..b4087e0f --- /dev/null +++ b/fs/btrfs/inode-map.c @@ -0,0 +1,545 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include "ctree.h" +#include "disk-io.h" +#include "free-space-cache.h" +#include "inode-map.h" +#include "transaction.h" + +static int caching_kthread(void *data) +{ + struct btrfs_root *root = data; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + u64 last = (u64)-1; + int slot; + int ret; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Since the commit root is read-only, we can safely skip locking. */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 2; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; +again: + /* need to make sure the commit_root doesn't disappear */ + mutex_lock(&root->fs_commit_mutex); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + if (btrfs_fs_closing(fs_info)) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + + if (need_resched() || + btrfs_transaction_in_commit(fs_info)) { + leaf = path->nodes[0]; + + if (btrfs_header_nritems(leaf) == 0) { + WARN_ON(1); + break; + } + + /* + * Save the key so we can advances forward + * in the next search. 
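[Editorial sketch, not part of the commit.] The caching thread here cannot keep the commit-root path held across a reschedule or a transaction commit, so it remembers the last key it visited, drops the path, and later re-searches from that key (the lines that follow). A minimal user-space sketch of the resume-by-key pattern, with a sorted array standing in for the tree; all names and numbers are illustrative, not btrfs APIs:

#include <stdio.h>

/* Toy "commit root": a sorted set of keys.  Re-searching by key rather
 * than remembering a slot index keeps the scan correct even if the
 * structure changed while we were not holding it. */
static const unsigned long keys[] = { 3, 7, 8, 15, 42, 99 };
static const int nkeys = sizeof(keys) / sizeof(keys[0]);

/* what a re-search does: first slot whose key is >= want */
static int search_slot(unsigned long want)
{
    int i;

    for (i = 0; i < nkeys; i++)
        if (keys[i] >= want)
            return i;
    return nkeys;
}

int main(void)
{
    unsigned long resume = 0;   /* saved key, like root->cache_progress */
    const int batch = 2;        /* pretend we must drop the path every 2 items */

    for (;;) {
        int slot = search_slot(resume);
        int done;

        for (done = 0; slot < nkeys && done < batch; slot++, done++) {
            printf("visit %lu\n", keys[slot]);
            resume = keys[slot] + 1;   /* where to pick up next time */
        }
        if (slot >= nkeys)
            break;
        /* here the kernel code unlocks, sleeps, then searches again */
    }
    return 0;
}

Resuming by key rather than by slot index is what makes it safe to release the path and take the mutex again later.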
+ */ + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(path); + root->cache_progress = last; + mutex_unlock(&root->fs_commit_mutex); + schedule_timeout(1); + goto again; + } else + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_INODE_ITEM_KEY) + goto next; + + if (key.objectid >= root->highest_objectid) + break; + + if (last != (u64)-1 && last + 1 != key.objectid) { + __btrfs_add_free_space(ctl, last + 1, + key.objectid - last - 1); + wake_up(&root->cache_wait); + } + + last = key.objectid; +next: + path->slots[0]++; + } + + if (last < root->highest_objectid - 1) { + __btrfs_add_free_space(ctl, last + 1, + root->highest_objectid - last - 1); + } + + spin_lock(&root->cache_lock); + root->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&root->cache_lock); + + root->cache_progress = (u64)-1; + btrfs_unpin_free_ino(root); +out: + wake_up(&root->cache_wait); + mutex_unlock(&root->fs_commit_mutex); + + btrfs_free_path(path); + + return ret; +} + +static void start_caching(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct task_struct *tsk; + int ret; + u64 objectid; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + + spin_lock(&root->cache_lock); + if (root->cached != BTRFS_CACHE_NO) { + spin_unlock(&root->cache_lock); + return; + } + + root->cached = BTRFS_CACHE_STARTED; + spin_unlock(&root->cache_lock); + + ret = load_free_ino_cache(root->fs_info, root); + if (ret == 1) { + spin_lock(&root->cache_lock); + root->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&root->cache_lock); + return; + } + + /* + * It can be quite time-consuming to fill the cache by searching + * through the extent tree, and this can keep ino allocation path + * waiting. Therefore at start we quickly find out the highest + * inode number and we know we can use inode numbers which fall in + * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID]. + */ + ret = btrfs_find_free_objectid(root, &objectid); + if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { + __btrfs_add_free_space(ctl, objectid, + BTRFS_LAST_FREE_OBJECTID - objectid + 1); + } + + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", + root->root_key.objectid); + BUG_ON(IS_ERR(tsk)); +} + +int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) +{ + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return btrfs_find_free_objectid(root, objectid); + +again: + *objectid = btrfs_find_ino_for_alloc(root); + + if (*objectid != 0) + return 0; + + start_caching(root); + + wait_event(root->cache_wait, + root->cached == BTRFS_CACHE_FINISHED || + root->free_ino_ctl->free_space > 0); + + if (root->cached == BTRFS_CACHE_FINISHED && + root->free_ino_ctl->free_space == 0) + return -ENOSPC; + else + goto again; +} + +void btrfs_return_ino(struct btrfs_root *root, u64 objectid) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + +again: + if (root->cached == BTRFS_CACHE_FINISHED) { + __btrfs_add_free_space(ctl, objectid, 1); + } else { + /* + * If we are in the process of caching free ino chunks, + * to avoid adding the same inode number to the free_ino + * tree twice due to cross transaction, we'll leave it + * in the pinned tree until a transaction is committed + * or the caching work is done. 
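[Editorial sketch, not part of the commit.] caching_kthread turns the inode items it finds in the commit root into free ranges: it remembers the previous objectid and hands every gap, plus the tail up to the highest allocated inode number, to __btrfs_add_free_space. The same bookkeeping as a stand-alone program (the inode numbers below are made up):

#include <stdio.h>

/* In-use inode numbers, in the order a commit-root scan would return them. */
static const unsigned long long in_use[] = { 256, 257, 260, 300 };
static const unsigned long long highest = 305;   /* plays root->highest_objectid */

int main(void)
{
    unsigned long long last = (unsigned long long)-1;
    int i;

    for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); i++) {
        unsigned long long cur = in_use[i];

        if (last != (unsigned long long)-1 && last + 1 != cur)
            printf("free range [%llu, %llu]\n", last + 1, cur - 1);
        last = cur;
    }
    /* tail: everything between the last item and the highest objectid */
    if (last < highest - 1)
        printf("free range [%llu, %llu]\n", last + 1, highest - 1);
    return 0;
}

With the numbers above this prints [258, 259], [261, 299] and [301, 304], which is exactly what the kernel loop feeds into the free-ino cache.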
+ */ + + mutex_lock(&root->fs_commit_mutex); + spin_lock(&root->cache_lock); + if (root->cached == BTRFS_CACHE_FINISHED) { + spin_unlock(&root->cache_lock); + mutex_unlock(&root->fs_commit_mutex); + goto again; + } + spin_unlock(&root->cache_lock); + + start_caching(root); + + if (objectid <= root->cache_progress || + objectid > root->highest_objectid) + __btrfs_add_free_space(ctl, objectid, 1); + else + __btrfs_add_free_space(pinned, objectid, 1); + + mutex_unlock(&root->fs_commit_mutex); + } +} + +/* + * When a transaction is committed, we'll move those inode numbers which + * are smaller than root->cache_progress from pinned tree to free_ino tree, + * and others will just be dropped, because the commit root we were + * searching has changed. + * + * Must be called with root->fs_commit_mutex held + */ +void btrfs_unpin_free_ino(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; + struct btrfs_free_space *info; + struct rb_node *n; + u64 count; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + + while (1) { + n = rb_first(rbroot); + if (!n) + break; + + info = rb_entry(n, struct btrfs_free_space, offset_index); + BUG_ON(info->bitmap); + + if (info->offset > root->cache_progress) + goto free; + else if (info->offset + info->bytes > root->cache_progress) + count = root->cache_progress - info->offset + 1; + else + count = info->bytes; + + __btrfs_add_free_space(ctl, info->offset, count); +free: + rb_erase(&info->offset_index, rbroot); + kfree(info); + } +} + +#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) +#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) + +/* + * The goal is to keep the memory used by the free_ino tree won't + * exceed the memory if we use bitmaps only. + */ +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int max_ino; + int max_bitmaps; + + n = rb_last(&ctl->free_space_offset); + if (!n) { + ctl->extents_thresh = INIT_THRESHOLD; + return; + } + info = rb_entry(n, struct btrfs_free_space, offset_index); + + /* + * Find the maximum inode number in the filesystem. Note we + * ignore the fact that this can be a bitmap, because we are + * not doing precise calculation. + */ + max_ino = info->bytes - 1; + + max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP; + if (max_bitmaps <= ctl->total_bitmaps) { + ctl->extents_thresh = 0; + return; + } + + ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * + PAGE_CACHE_SIZE / sizeof(*info); +} + +/* + * We don't fall back to bitmap, if we are below the extents threshold + * or this chunk of inode numbers is a big one. + */ +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + if (ctl->free_extents < ctl->extents_thresh || + info->bytes > INODES_PER_BITMAP / 10) + return false; + + return true; +} + +static struct btrfs_free_space_op free_ino_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + +static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl) +{ +} + +static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + /* + * We always use extents for two reasons: + * + * - The pinned tree is only used during the process of caching + * work. + * - Make code simpler. See btrfs_unpin_free_ino(). 
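[Editorial sketch, not part of the commit.] recalculate_thresholds and use_bitmap above control when the free-ino cache stops tracking each free range as a separate rb-tree entry and switches to bitmaps, one page covering 32768 inode numbers for 4K pages. The predicate restated on its own; the page size and the example values are assumptions for illustration, in the kernel the counts come from struct btrfs_free_space_ctl:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ             4096UL
#define INODES_PER_BITMAP   (PAGE_SZ * 8)   /* 32768 inode numbers per bitmap page */

/* Mirror of use_bitmap(): stay with extents while we are under the
 * threshold, or when the free chunk is large (a big chunk is cheaper
 * as a single [offset, bytes] entry than as bitmap bits). */
static bool use_bitmap(int free_extents, int extents_thresh,
                       unsigned long long chunk_len)
{
    if (free_extents < extents_thresh || chunk_len > INODES_PER_BITMAP / 10)
        return false;
    return true;
}

int main(void)
{
    printf("%d\n", use_bitmap(10, 256, 5));       /* 0: few extents, keep extents      */
    printf("%d\n", use_bitmap(300, 256, 50000));  /* 0: huge chunk, keep extents       */
    printf("%d\n", use_bitmap(300, 256, 100));    /* 1: fragmented, switch to a bitmap */
    return 0;
}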
+ */ + return false; +} + +static struct btrfs_free_space_op pinned_free_ino_op = { + .recalc_thresholds = pinned_recalc_thresholds, + .use_bitmap = pinned_use_bitmap, +}; + +void btrfs_init_free_ino_ctl(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + spin_lock_init(&ctl->tree_lock); + ctl->unit = 1; + ctl->start = 0; + ctl->private = NULL; + ctl->op = &free_ino_op; + + /* + * Initially we allow to use 16K of ram to cache chunks of + * inode numbers before we resort to bitmaps. This is somewhat + * arbitrary, but it will be adjusted in runtime. + */ + ctl->extents_thresh = INIT_THRESHOLD; + + spin_lock_init(&pinned->tree_lock); + pinned->unit = 1; + pinned->start = 0; + pinned->private = NULL; + pinned->extents_thresh = 0; + pinned->op = &pinned_free_ino_op; +} + +int btrfs_save_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_path *path; + struct inode *inode; + u64 alloc_hint = 0; + int ret; + int prealloc; + bool retry = false; + + /* only fs tree and subvol/snap needs ino cache */ + if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && + (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || + root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) + return 0; + + /* Don't save inode cache if we are deleting this root */ + if (btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) + return 0; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retry); + retry = true; + + ret = create_free_ino_inode(root, trans, path); + if (ret) + goto out; + goto again; + } + + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + WARN_ON(ret); + + if (i_size_read(inode) > 0) { + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + if (ret) + goto out_put; + } + + spin_lock(&root->cache_lock); + if (root->cached != BTRFS_CACHE_FINISHED) { + ret = -1; + spin_unlock(&root->cache_lock); + goto out_put; + } + spin_unlock(&root->cache_lock); + + spin_lock(&ctl->tree_lock); + prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; + prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE); + prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE; + spin_unlock(&ctl->tree_lock); + + /* Just to make sure we have enough space */ + prealloc += 8 * PAGE_CACHE_SIZE; + + ret = btrfs_check_data_free_space(inode, prealloc); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, + prealloc, prealloc, &alloc_hint); + if (ret) + goto out_put; + btrfs_free_reserved_data_space(inode, prealloc); + +out_put: + iput(inode); +out: + if (ret == 0) + ret = btrfs_write_out_ino_cache(root, trans, path); + + btrfs_free_path(path); + return ret; +} + +static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 
0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = max_t(u64, found_key.objectid, + BTRFS_FIRST_FREE_OBJECTID - 1); + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + mutex_lock(&root->objectid_mutex); + + if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + if (ret) + goto out; + } + + if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + ret = -ENOSPC; + goto out; + } + + *objectid = ++root->highest_objectid; + ret = 0; +out: + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h new file mode 100644 index 00000000..ddb347bf --- /dev/null +++ b/fs/btrfs/inode-map.h @@ -0,0 +1,13 @@ +#ifndef __BTRFS_INODE_MAP +#define __BTRFS_INODE_MAP + +void btrfs_init_free_ino_ctl(struct btrfs_root *root); +void btrfs_unpin_free_ino(struct btrfs_root *root); +void btrfs_return_ino(struct btrfs_root *root, u64 objectid); +int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid); +int btrfs_save_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans); + +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 00000000..d42e6bfd --- /dev/null +++ b/fs/btrfs/inode.c @@ -0,0 +1,7459 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "ordered-data.h" +#include "xattr.h" +#include "tree-log.h" +#include "compression.h" +#include "locking.h" +#include "free-space-cache.h" +#include "inode-map.h" + +struct btrfs_iget_args { + u64 ino; + struct btrfs_root *root; +}; + +static const struct inode_operations btrfs_dir_inode_operations; +static const struct inode_operations btrfs_symlink_inode_operations; +static const struct inode_operations btrfs_dir_ro_inode_operations; +static const struct inode_operations btrfs_special_inode_operations; +static const struct inode_operations btrfs_file_inode_operations; +static const struct address_space_operations btrfs_aops; +static const struct address_space_operations btrfs_symlink_aops; +static const struct file_operations btrfs_dir_file_operations; +static struct extent_io_ops btrfs_extent_io_ops; + +static struct kmem_cache *btrfs_inode_cachep; +struct kmem_cache *btrfs_trans_handle_cachep; +struct kmem_cache *btrfs_transaction_cachep; +struct kmem_cache *btrfs_path_cachep; +struct kmem_cache *btrfs_free_space_cachep; + +#define S_SHIFT 12 +static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, +}; + +static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); + +static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + + err = btrfs_init_acl(trans, inode, dir); + if (!err) + err = btrfs_xattr_security_init(trans, inode, dir, qstr); + return err; +} + +/* + * this does all the hard work for inserting an inline extent into + * the btree. 
The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + int compress_type, + struct page **compressed_pages) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + size_t datasize; + unsigned long offset; + + if (compressed_size && compressed_pages) + cur_size = compressed_size; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + key.objectid = btrfs_ino(inode); + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(cur_size); + + inode_add_bytes(inode, size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (compress_type != BTRFS_COMPRESS_NONE) { + struct page *cpage; + int i = 0; + while (compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min_t(unsigned long, compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap_atomic(cpage, KM_USER0); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_atomic(kaddr, KM_USER0); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + compress_type); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page, KM_USER0); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + /* + * we're an inline extent, so nobody can + * extend the file past i_size without locking + * a page we already have locked. + * + * We must do any isize and inode updates + * before we unlock the pages. Otherwise we + * could end up racing with unlink. + */ + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); + + return 0; +fail: + btrfs_free_path(path); + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. 
+ */ +static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end, + size_t compressed_size, int compress_type, + struct page **compressed_pages) +{ + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = (end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + u64 hint_byte; + u64 data_len = inline_len; + int ret; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + actual_end >= PAGE_CACHE_SIZE || + data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + ret = btrfs_drop_extents(trans, inode, start, aligned_end, + &hint_byte, 1); + BUG_ON(ret); + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, root, inode, start, + inline_len, compressed_size, + compress_type, compressed_pages); + BUG_ON(ret); + btrfs_delalloc_release_metadata(inode, end + 1 - start); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); + return 0; +} + +struct async_extent { + u64 start; + u64 ram_size; + u64 compressed_size; + struct page **pages; + unsigned long nr_pages; + int compress_type; + struct list_head list; +}; + +struct async_cow { + struct inode *inode; + struct btrfs_root *root; + struct page *locked_page; + u64 start; + u64 end; + struct list_head extents; + struct btrfs_work work; +}; + +static noinline int add_async_extent(struct async_cow *cow, + u64 start, u64 ram_size, + u64 compressed_size, + struct page **pages, + unsigned long nr_pages, + int compress_type) +{ + struct async_extent *async_extent; + + async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + BUG_ON(!async_extent); + async_extent->start = start; + async_extent->ram_size = ram_size; + async_extent->compressed_size = compressed_size; + async_extent->pages = pages; + async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; + list_add_tail(&async_extent->list, &cow->extents); + return 0; +} + +/* + * we create compressed extents in two phases. The first + * phase compresses a range of pages that have already been + * locked (both pages and state bits are locked). + * + * This is done inside an ordered work queue, and the compression + * is spread across many cpus. The actual IO submission is step + * two, and the ordered work queue takes care of making sure that + * happens in the same order things were put onto the queue by + * writepages and friends. + * + * If this code finds it can't get good compression, it puts an + * entry onto the work queue to write the uncompressed bytes. This + * makes sure that both compressed inodes and uncompressed inodes + * are written in the same order that pdflush sent them down. 
+ */ +static noinline int compress_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, + struct async_cow *async_cow, + int *num_added) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 num_bytes; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 128 * 1024; + int i; + int will_compress; + int compress_type = root->fs_info->compress_type; + + /* if this is a small write inside eof, kick off a defragbot */ + if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024) + btrfs_add_inode_defrag(NULL, inode); + + actual_end = min_t(u64, isize, end + 1); +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + + /* + * we don't want to send crud past the end of i_size through + * compression, that's just a waste of CPU time. So, if the + * end of the file is before the start of our current + * requested range of bytes, we bail out to the uncompressed + * cleanup code that can deal with all of this. + * + * It isn't really the fastest way to fix things, but this is a + * very uncommon corner. + */ + if (actual_end <= start) + goto cleanup_and_bail_uncompressed; + + total_compressed = actual_end - start; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 128k. This is a crucial number + * because it also controls how easily we can spread reads across + * cpus for decompression. + * + * We also want to make sure the amount of IO required to do + * a random read is reasonably small, so we limit the size of + * a compressed extent to 128k. + */ + total_compressed = min(total_compressed, max_uncompressed); + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + total_in = 0; + ret = 0; + + /* + * we do compression for mount -o compress and when the + * inode has not been flagged as nocompress. This flag can + * change at any time if we discover bad compression ratios. 
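[Editorial aside, not part of the commit.] This function, and cow_file_range further down, rounds the length of the inclusive [start, end] range up to the block size with the expression (end - start + blocksize) & ~(blocksize - 1). For a power-of-two block size that is an ordinary round-up; a quick stand-alone check:

#include <stdio.h>

static unsigned long long roundup_len(unsigned long long start,
                                      unsigned long long end,   /* inclusive */
                                      unsigned long long blocksize)
{
    /* same expression the btrfs code uses; blocksize must be a power of two */
    return (end - start + blocksize) & ~(blocksize - 1);
}

int main(void)
{
    printf("%llu\n", roundup_len(0, 4999, 4096));    /* 8192: 5000 bytes need two blocks */
    printf("%llu\n", roundup_len(0, 4095, 4096));    /* 4096: exactly one block          */
    printf("%llu\n", roundup_len(8192, 8192, 4096)); /* 4096: one byte still costs a block */
    return 0;
}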
+ */ + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && + (btrfs_test_opt(root, COMPRESS) || + (BTRFS_I(inode)->force_compress) || + (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { + WARN_ON(pages); + pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + BUG_ON(!pages); + + if (BTRFS_I(inode)->force_compress) + compress_type = BTRFS_I(inode)->force_compress; + + ret = btrfs_compress_pages(compress_type, + inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr, KM_USER0); + } + will_compress = 1; + } + } + if (start == 0) { + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + /* lets try to make an inline extent */ + if (ret || total_in < (actual_end - start)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. + */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, 0, NULL); + } else { + /* try making a compressed inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, + total_compressed, + compress_type, pages); + } + if (ret == 0) { + /* + * inline extent creation worked, we don't need + * to create any more async work items. Unlock + * and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + + btrfs_end_transaction(trans, root); + goto free_pages_out; + } + btrfs_end_transaction(trans, root); + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = (total_compressed + blocksize - 1) & + ~(blocksize - 1); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = (total_in + PAGE_CACHE_SIZE - 1) & + ~(PAGE_CACHE_SIZE - 1); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + if (!btrfs_test_opt(root, FORCE_COMPRESS) && + !(BTRFS_I(inode)->force_compress)) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } + } + if (will_compress) { + *num_added += 1; + + /* the async work queues will take care of doing actual + * allocation on disk for these compressed pages, + * and will submit them to the elevator. 
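[Editorial sketch, not part of the commit.] The will_compress check just above only keeps the compressed copy if it is still smaller after both sizes are rounded to the granularity they would actually occupy: the compressed bytes up to the block size, the plain bytes up to whole pages. The same comparison with concrete numbers, assuming 4K blocks and pages:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ 4096ULL

static bool compression_wins(unsigned long long total_in,          /* bytes read      */
                             unsigned long long total_compressed,  /* bytes produced  */
                             unsigned long long blocksize)
{
    total_compressed = (total_compressed + blocksize - 1) & ~(blocksize - 1);
    total_in = (total_in + PAGE_SZ - 1) & ~(PAGE_SZ - 1);
    return total_compressed < total_in;   /* kernel bails out when >= */
}

int main(void)
{
    /* 16K that compressed to 13K: 13K rounds to 16K, no win, write it plain */
    printf("%d\n", compression_wins(16384, 13312, 4096));  /* 0 */
    /* 16K down to 6K: 8K on disk beats 16K, keep the compressed copy */
    printf("%d\n", compression_wins(16384, 6144, 4096));   /* 1 */
    return 0;
}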
+ */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret, + compress_type); + + if (start + num_bytes < end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + } else { +cleanup_and_bail_uncompressed: + /* + * No compression, but we still need to write the pages in + * the file we've been given so far. redirty the locked + * page if it corresponds to our extent and set things up + * for the async work queue to run cow_file_range to do + * the normal delalloc dance + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) { + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + } + add_async_extent(async_cow, start, end - start + 1, + 0, NULL, 0, BTRFS_COMPRESS_NONE); + *num_added += 1; + } + +out: + return 0; + +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + + goto out; +} + +/* + * phase two of compressed writeback. This is the ordered portion + * of the code, which only gets called in the order the work was + * queued. We walk all the async extents created by compress_file_range + * and send them down to the disk. + */ +static noinline int submit_compressed_extents(struct inode *inode, + struct async_cow *async_cow) +{ + struct async_extent *async_extent; + u64 alloc_hint = 0; + struct btrfs_trans_handle *trans; + struct btrfs_key ins; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree; + int ret = 0; + + if (list_empty(&async_cow->extents)) + return 0; + + + while (!list_empty(&async_cow->extents)) { + async_extent = list_entry(async_cow->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + + io_tree = &BTRFS_I(inode)->io_tree; + +retry: + /* did the compression code fall back to uncompressed IO? */ + if (!async_extent->pages) { + int page_started = 0; + unsigned long nr_written = 0; + + lock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + + /* allocate blocks */ + ret = cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); + + /* + * if page_started, cow_file_range inserted an + * inline extent and took care of all the unlocking + * and IO for us. Otherwise, we need to submit + * all those pages down to the drive. 
+ */ + if (!page_started && !ret) + extent_write_locked_range(io_tree, + inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + btrfs_get_extent, + WB_SYNC_ALL); + kfree(async_extent); + cond_resched(); + continue; + } + + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1, + GFP_NOFS); + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_reserve_extent(trans, root, + async_extent->compressed_size, + async_extent->compressed_size, + 0, alloc_hint, + (u64)-1, &ins, 1); + btrfs_end_transaction(trans, root); + + if (ret) { + int i; + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + goto retry; + } + + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + + em = alloc_extent_map(); + BUG_ON(!em); + em->start = async_extent->start; + em->len = async_extent->ram_size; + em->orig_start = em->start; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->compress_type = async_extent->compress_type; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + } + + ret = btrfs_add_ordered_extent_compress(inode, + async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); + BUG_ON(ret); + + /* + * clear dirty, set writeback and unlock the pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, + async_extent->ram_size, + ins.objectid, + ins.offset, async_extent->pages, + async_extent->nr_pages); + + BUG_ON(ret); + alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + cond_resched(); + } + + return 0; +} + +static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. 
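[Editorial sketch, not part of the commit.] submit_compressed_extents just above, and cow_file_range and run_delalloc_nocow further down, all insert new mappings with the same retry idiom: call add_extent_mapping, and on -EEXIST drop the stale cached range and try again. The shape of that loop with a one-slot stand-in for the extent map tree; everything here is illustrative, not the btrfs structures:

#include <errno.h>
#include <stdio.h>

struct range { unsigned long long start, len; int valid; };

static struct range cache;                 /* stands in for the extent_map tree */

static int add_mapping(const struct range *em)
{
    if (cache.valid &&
        em->start < cache.start + cache.len &&
        cache.start < em->start + em->len)
        return -EEXIST;                    /* overlaps an already-cached mapping */
    cache = *em;
    return 0;
}

static void drop_cached_range(unsigned long long start, unsigned long long len)
{
    if (cache.valid &&
        start < cache.start + cache.len &&
        cache.start < start + len)
        cache.valid = 0;                   /* evict the stale overlapping entry */
}

int main(void)
{
    struct range stale = { 0, 8192, 1 };   /* left over from an earlier write  */
    struct range fresh = { 4096, 4096, 1 };

    cache = stale;
    while (add_mapping(&fresh) == -EEXIST)
        drop_cached_range(fresh.start, fresh.len);

    printf("cached [%llu, +%llu)\n", cache.start, cache.len);
    return 0;
}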
+ */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; +} + +static inline bool is_free_space_inode(struct btrfs_root *root, + struct inode *inode) +{ + if (root == root->fs_info->tree_root || + BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + return true; + return false; +} + +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. + */ +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; + u64 disk_num_bytes; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + struct btrfs_key ins; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 0; + + BUG_ON(is_free_space_inode(root, inode)); + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + ret = 0; + + /* if this is a small write inside eof, kick off defrag */ + if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024) + btrfs_add_inode_defrag(trans, inode); + + if (start == 0) { + /* lets try to make an inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; + ret = 0; + goto out; + } + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { + unsigned long op; + + cur_alloc_size = disk_num_bytes; + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + + em = alloc_extent_map(); + BUG_ON(!em); + em->start = start; + em->orig_start = em->start; + ram_size = ins.offset; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + 
write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, + start + ram_size - 1, 0); + } + + cur_alloc_size = ins.offset; + ret = btrfs_add_ordered_extent(inode, start, ins.objectid, + ram_size, cur_alloc_size, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, start, + cur_alloc_size); + BUG_ON(ret); + } + + if (disk_num_bytes < cur_alloc_size) + break; + + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + * + * Do set the Private2 bit so we know this page was properly + * setup for writepage + */ + op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; + op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2; + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, + locked_page, op); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + } +out: + ret = 0; + btrfs_end_transaction(trans, root); + + return ret; +} + +/* + * work queue call back to started compression on a file and pages + */ +static noinline void async_cow_start(struct btrfs_work *work) +{ + struct async_cow *async_cow; + int num_added = 0; + async_cow = container_of(work, struct async_cow, work); + + compress_file_range(async_cow->inode, async_cow->locked_page, + async_cow->start, async_cow->end, async_cow, + &num_added); + if (num_added == 0) + async_cow->inode = NULL; +} + +/* + * work queue call back to submit previously compressed pages + */ +static noinline void async_cow_submit(struct btrfs_work *work) +{ + struct async_cow *async_cow; + struct btrfs_root *root; + unsigned long nr_pages; + + async_cow = container_of(work, struct async_cow, work); + + root = async_cow->root; + nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); + + if (atomic_read(&root->fs_info->async_delalloc_pages) < + 5 * 1042 * 1024 && + waitqueue_active(&root->fs_info->async_submit_wait)) + wake_up(&root->fs_info->async_submit_wait); + + if (async_cow->inode) + submit_compressed_extents(async_cow->inode, async_cow); +} + +static noinline void async_cow_free(struct btrfs_work *work) +{ + struct async_cow *async_cow; + async_cow = container_of(work, struct async_cow, work); + kfree(async_cow); +} + +static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + struct async_cow *async_cow; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr_pages; + u64 cur_end; + int limit = 10 * 1024 * 1042; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, + 1, 0, NULL, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + BUG_ON(!async_cow); + async_cow->inode = inode; + async_cow->root = root; + async_cow->locked_page = locked_page; + async_cow->start = start; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + cur_end = end; + else + cur_end = min(end, start + 512 * 1024 - 1); + + async_cow->end = cur_end; + INIT_LIST_HEAD(&async_cow->extents); + + async_cow->work.func = async_cow_start; + async_cow->work.ordered_func = async_cow_submit; + async_cow->work.ordered_free = async_cow_free; + 
async_cow->work.flags = 0; + + nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); + + btrfs_queue_worker(&root->fs_info->delalloc_workers, + &async_cow->work); + + if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) < + limit)); + } + + while (atomic_read(&root->fs_info->async_submit_draining) && + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) == + 0)); + } + + *nr_written += nr_pages; + start = cur_end + 1; + } + *page_started = 1; + return 0; +} + +static noinline int csum_exist_in_range(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + int ret; + struct btrfs_ordered_sum *sums; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, + bytenr + num_bytes - 1, &list, 0); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + return 1; +} + +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. + * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ +static noinline int run_delalloc_nocow(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, int force, + unsigned long *nr_written) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key found_key; + u64 cow_start; + u64 cur_offset; + u64 extent_end; + u64 extent_offset; + u64 disk_bytenr; + u64 num_bytes; + int extent_type; + int ret; + int type; + int nocow; + int check_prev = 1; + bool nolock; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + nolock = is_free_space_inode(root, inode); + + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + cow_start = (u64)-1; + cur_offset = start; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, ino, + cur_offset, 0); + BUG_ON(ret < 0); + if (ret > 0 && path->slots[0] > 0 && check_prev) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0] - 1); + if (found_key.objectid == ino && + found_key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + check_prev = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + BUG_ON(1); + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + nocow = 0; + disk_bytenr = 0; + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid > ino || + found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; + extent_type = 0; + goto out_check; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == 
BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (disk_bytenr == 0) + goto out_check; + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out_check; + if (extent_type == BTRFS_FILE_EXTENT_REG && !force) + goto out_check; + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out_check; + if (btrfs_cross_ref_exist(trans, root, ino, + found_key.offset - + extent_offset, disk_bytenr)) + goto out_check; + disk_bytenr += extent_offset; + disk_bytenr += cur_offset - found_key.offset; + num_bytes = min(end + 1, extent_end) - cur_offset; + /* + * force cow if csum exists in the range. + * this ensure that csum for a given extent are + * either valid or do not exist. + */ + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out_check; + nocow = 1; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, fi); + extent_end = ALIGN(extent_end, root->sectorsize); + } else { + BUG_ON(1); + } +out_check: + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (!nocow) { + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = extent_end; + if (cur_offset > end) + break; + path->slots[0]++; + goto next_slot; + } + + btrfs_release_path(path); + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, + found_key.offset - 1, page_started, + nr_written, 1); + BUG_ON(ret); + cow_start = (u64)-1; + } + + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + struct extent_map *em; + struct extent_map_tree *em_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(); + BUG_ON(!em); + em->start = cur_offset; + em->orig_start = em->start; + em->len = num_bytes; + em->block_len = num_bytes; + em->block_start = disk_bytenr; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + type = BTRFS_ORDERED_PREALLOC; + } else { + type = BTRFS_ORDERED_NOCOW; + } + + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, + num_bytes, num_bytes, type); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, cur_offset, + num_bytes); + BUG_ON(ret); + } + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + cur_offset, cur_offset + num_bytes - 1, + locked_page, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2); + cur_offset = extent_end; + if (cur_offset > end) + break; + } + btrfs_release_path(path); + + if (cur_offset <= end && cow_start == (u64)-1) + cow_start = cur_offset; + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); + BUG_ON(ret); + } + + if (nolock) { + ret = btrfs_end_transaction_nolock(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + } + btrfs_free_path(path); + return 0; +} + +/* + * 
extent_io.c call back to do delayed allocation processing + */ +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 1, nr_written); + else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 0, nr_written); + else if (!btrfs_test_opt(root, COMPRESS) && + !(BTRFS_I(inode)->force_compress) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + else + ret = cow_file_range_async(inode, locked_page, start, end, + page_started, nr_written); + return ret; +} + +static int btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) +{ + /* not delalloc, ignore it */ + if (!(orig->state & EXTENT_DELALLOC)) + return 0; + + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + return 0; +} + +/* + * extent_io.c merge_extent_hook, used to track merged delayed allocation + * extents so we can keep track of new extents that are just merged onto old + * extents, such as when we are doing sequential writes, so we can properly + * account for the metadata space we'll need. + */ +static int btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) +{ + /* not delalloc, ignore it */ + if (!(other->state & EXTENT_DELALLOC)) + return 0; + + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + return 0; +} + +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. 
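[Editorial sketch, not part of the commit.] run_delalloc_range above picks one of three write-back strategies per inode from its flags and the mount options. The same decision as a small stand-alone helper, with the flag spellings shortened and the values purely illustrative:

#include <stdbool.h>
#include <stdio.h>

enum path { NOCOW_FORCED, NOCOW_PREALLOC, PLAIN_COW, ASYNC_COMPRESSED_COW };

static enum path pick_delalloc_path(bool nodatacow, bool prealloc,
                                    bool mount_compress, bool force_compress,
                                    bool inode_compress)
{
    if (nodatacow)
        return NOCOW_FORCED;         /* run_delalloc_nocow(..., force = 1, ...) */
    if (prealloc)
        return NOCOW_PREALLOC;       /* run_delalloc_nocow(..., force = 0, ...) */
    if (!mount_compress && !force_compress && !inode_compress)
        return PLAIN_COW;            /* cow_file_range()                        */
    return ASYNC_COMPRESSED_COW;     /* cow_file_range_async()                  */
}

int main(void)
{
    /* mount -o compress, ordinary inode: take the async compression path */
    printf("%d\n", pick_delalloc_path(false, false, true, false, false)); /* 3 */
    /* NODATACOW wins over everything else */
    printf("%d\n", pick_delalloc_path(true,  false, true, false, false)); /* 0 */
    return 0;
}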
+ */ +static int btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) +{ + + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; + bool do_list = !is_free_space_inode(root, inode); + + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += len; + root->fs_info->delalloc_bytes += len; + if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +static int btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; + bool do_list = !is_free_space_inode(root, inode); + + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else if (!(*bits & EXTENT_DO_ACCOUNTING)) + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + if (*bits & EXTENT_DO_ACCOUNTING) + btrfs_delalloc_release_metadata(inode, len); + + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && do_list) + btrfs_free_reserved_data_space(inode, len); + + spin_lock(&root->fs_info->delalloc_lock); + root->fs_info->delalloc_bytes -= len; + BTRFS_I(inode)->delalloc_bytes -= len; + + if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_mapping_tree *map_tree; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + int ret; + + if (bio_flags & EXTENT_BIO_COMPRESSED) + return 0; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + ret = btrfs_map_block(map_tree, READ, logical, + &map_length, NULL, 0); + + if (map_length < length + size) + return 1; + return ret; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. 
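[Editorial sketch, not part of the commit.] btrfs_merge_bio_hook above asks btrfs_map_block how many contiguous bytes the bio's logical address can cover and refuses to grow the bio past that. A user-space sketch of the comparison, under the simplifying assumption (for illustration only) that the mapped length is just the distance to the next stripe boundary:

#include <stdbool.h>
#include <stdio.h>

/* Would adding "size" more bytes make this bio span a stripe boundary?
 * stripe_len must be a power of two for the rounding below. */
static bool must_break_bio(unsigned long long logical,    /* bio start, in bytes     */
                           unsigned long long bio_size,   /* bytes already in the bio */
                           unsigned long long size,       /* bytes we want to add     */
                           unsigned long long stripe_len)
{
    /* distance from the bio start to the end of its stripe */
    unsigned long long map_length =
        (logical | (stripe_len - 1)) + 1 - logical;

    return map_length < bio_size + size;   /* same comparison the hook makes */
}

int main(void)
{
    /* 64K stripes: a bio at 60K already holding 4K may not grow by another 4K */
    printf("%d\n", must_break_bio(61440, 4096, 4096, 65536));  /* 1 */
    printf("%d\n", must_break_bio(0,     4096, 4096, 65536));  /* 0 */
    return 0;
}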
+ * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, + u64 bio_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + BUG_ON(ret); + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + return btrfs_map_bio(root, rw, bio, mirror_num, 1); +} + +/* + * extent_io.c submission hook. This does the right thing for csum calculation + * on write, or reading the csums from the tree before a read + */ +static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + int skip_sum; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + if (is_free_space_inode(root, inode)) + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); + else + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + if (!(rw & REQ_WRITE)) { + if (bio_flags & EXTENT_BIO_COMPRESSED) { + return btrfs_submit_compressed_read(inode, bio, + mirror_num, bio_flags); + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); + if (ret) + return ret; + } + goto mapit; + } else if (!skip_sum) { + /* csum items have already been cloned */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + goto mapit; + /* we're doing a write, do the async checksumming */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + bio_flags, bio_offset, + __btrfs_submit_bio_start, + __btrfs_submit_bio_done); + } + +mapit: + return btrfs_map_bio(root, rw, bio, mirror_num, 0); +} + +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. 
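+ * (Each entry on the ordered extent's sum list is simply written into
+ * the csum root with btrfs_csum_file_blocks(); nothing is recomputed
+ * at completion time.)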
+ */ +static noinline int add_pending_csums(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_offset, + struct list_head *list) +{ + struct btrfs_ordered_sum *sum; + + list_for_each_entry(sum, list, list) { + btrfs_csum_file_blocks(trans, + BTRFS_I(inode)->root->fs_info->csum_root, sum); + } + return 0; +} + +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + if ((end & (PAGE_CACHE_SIZE - 1)) == 0) + WARN_ON(1); + return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, + cached_state, GFP_NOFS); +} + +/* see btrfs_writepage_start_hook for details on why this is required */ +struct btrfs_writepage_fixup { + struct page *page; + struct btrfs_work work; +}; + +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +{ + struct btrfs_writepage_fixup *fixup; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct page *page; + struct inode *inode; + u64 page_start; + u64 page_end; + + fixup = container_of(work, struct btrfs_writepage_fixup, work); + page = fixup->page; +again: + lock_page(page); + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + ClearPageChecked(page); + goto out_page; + } + + inode = page->mapping->host; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, + &cached_state, GFP_NOFS); + + /* already ordered? We're done */ + if (PagePrivate2(page)) + goto out; + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, + page_end, &cached_state, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + goto again; + } + + BUG(); + btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); + ClearPageChecked(page); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, + &cached_state, GFP_NOFS); +out_page: + unlock_page(page); + page_cache_release(page); + kfree(fixup); +} + +/* + * There are a few paths in the higher layers of the kernel that directly + * set the page dirty bit without asking the filesystem if it is a + * good idea. This causes problems because we want to make sure COW + * properly happens and the data=ordered rules are followed. + * + * In our case any range that doesn't have the ORDERED bit set + * hasn't been properly setup for IO. We kick off an async process + * to fix it up. The async helper will wait for ordered extents, set + * the delalloc bit and make it safe to write the page. 
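+ *
+ * Rough sequence, restating btrfs_writepage_start_hook() below and the
+ * fixup worker above:
+ *
+ *   btrfs_writepage_start_hook()
+ *       page already covered by an ordered extent -> write it normally
+ *       otherwise SetPageChecked(), take a page reference, queue the
+ *       fixup worker and return -EAGAIN so writepage skips the page
+ *
+ *   btrfs_writepage_fixup_worker()
+ *       lock the page and the extent range, wait out any ordered
+ *       extent, then btrfs_set_extent_delalloc() so the page is
+ *       properly accounted before it is written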
+ */ +static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +{ + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* this page is properly in the ordered list */ + if (TestClearPagePrivate2(page)) + return 0; + + if (PageChecked(page)) + return -EAGAIN; + + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + return -EAGAIN; + + SetPageChecked(page); + page_cache_get(page); + fixup->work.func = btrfs_writepage_fixup_worker; + fixup->page = page; + btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + return -EAGAIN; +} + +static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_pos, + u64 disk_bytenr, u64 disk_num_bytes, + u64 num_bytes, u64 ram_bytes, + u8 compression, u8 encryption, + u16 other_encoding, int extent_type) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u64 hint; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + path->leave_spinning = 1; + + /* + * we may be replacing one extent in the tree with another. + * The new extent is pinned in the extent map, and we don't want + * to drop it from the cache until it is completely in the btree. + * + * So, tell btrfs_drop_extents to leave this extent in the cache. + * the caller is expected to unpin it and allow it to be merged + * with the others. + */ + ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, + &hint, 0); + BUG_ON(ret); + + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); + BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + + btrfs_unlock_up_safe(path, 1); + btrfs_set_lock_blocking(leaf); + + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_alloc_reserved_file_extent(trans, root, + root->root_key.objectid, + btrfs_ino(inode), file_pos, &ins); + BUG_ON(ret); + btrfs_free_path(path); + + return 0; +} + +/* + * helper function for btrfs_finish_ordered_io, this + * just reads in some of the csum leaves to prime them into ram + * before we start the transaction. It limits the amount of btree + * reads required while inside the transaction. + */ +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. 
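+ *
+ * In outline (restating btrfs_finish_ordered_io below):
+ *
+ *   1. btrfs_dec_test_ordered_pending() - bail out unless this range
+ *      completes the whole ordered extent
+ *   2. NOCOW ordered extents only need the i_size/inode item update
+ *   3. otherwise join a transaction and either mark the preallocated
+ *      extent as written or insert_reserved_file_extent() for the
+ *      newly allocated one
+ *   4. add_pending_csums() records the checksums gathered at submit time
+ *   5. update the inode item and drop the ordered extent references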
+ */ +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_ordered_extent *ordered_extent = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; + int compress_type = 0; + int ret; + bool nolock; + + ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1); + if (!ret) + return 0; + BUG_ON(!ordered_extent); + + nolock = is_free_space_inode(root, inode); + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + BUG_ON(!list_empty(&ordered_extent->list)); + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + goto out; + } + + lock_extent_bits(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + 0, &cached_state, GFP_NOFS); + + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compress_type = ordered_extent->compress_type; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compress_type); + ret = btrfs_mark_extent_written(trans, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len); + BUG_ON(ret); + } else { + BUG_ON(root == root->fs_info->tree_root); + ret = insert_reserved_file_extent(trans, inode, + ordered_extent->file_offset, + ordered_extent->start, + ordered_extent->disk_len, + ordered_extent->len, + ordered_extent->len, + compress_type, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, + ordered_extent->len); + BUG_ON(ret); + } + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); + + add_pending_csums(trans, inode, ordered_extent->file_offset, + &ordered_extent->list); + + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + ret = 0; +out: + if (nolock) { + if (trans) + btrfs_end_transaction_nolock(trans, root); + } else { + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); + } + + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + + return 0; +} + +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) +{ + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); + + ClearPagePrivate2(page); + return btrfs_finish_ordered_io(page->mapping->host, start, end); +} + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. 
If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int last_mirror; +}; + +static int btrfs_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int rw; + u64 logical; + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kmalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->last_mirror = 0; + failrec->bio_flags = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (IS_ERR_OR_NULL(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + failrec->logical = logical; + free_extent_map(em); + set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | + EXTENT_DIRTY, GFP_NOFS); + set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + failrec->last_mirror++; + if (!state) { + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + } + if (!state || failrec->last_mirror > num_copies) { + set_state_private(failure_tree, failrec->start, 0); + clear_extent_bits(failure_tree, failrec->start, + failrec->start + failrec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + kfree(failrec); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = failed_bio->bi_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + if (failed_bio->bi_rw & REQ_WRITE) + rw = WRITE; + else + rw = READ; + + ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + failrec->last_mirror, + failrec->bio_flags, 0); + return ret; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int btrfs_clean_io_failures(struct inode *inode, u64 start) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failure; + int ret; + + private = 0; + if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0)) { + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, + start, &private_failure); + if (ret == 0) { + 
failure = (struct io_failure_record *)(unsigned long) + private_failure; + set_state_private(&BTRFS_I(inode)->io_failure_tree, + failure->start, 0); + clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, + failure->start, + failure->start + failure->len - 1, + EXTENT_DIRTY | EXTENT_LOCKED, + GFP_NOFS); + kfree(failure); + } + } + return 0; +} + +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, we go through + * the io_failure_record routines to find good copies + */ +static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + struct inode *inode = page->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + char *kaddr; + u64 private = ~(u32)0; + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum = ~(u32)0; + + if (PageChecked(page)) { + ClearPageChecked(page); + goto good; + } + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + goto good; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; + } + + if (state && state->start == start) { + private = state->private; + ret = 0; + } else { + ret = get_state_private(io_tree, start, &private); + } + kaddr = kmap_atomic(page, KM_USER0); + if (ret) + goto zeroit; + + csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + btrfs_csum_final(csum, (char *)&csum); + if (csum != private) + goto zeroit; + + kunmap_atomic(kaddr, KM_USER0); +good: + /* if the io failure tree for this inode is non-empty, + * check to see if we've recovered from a failed IO + */ + btrfs_clean_io_failures(inode, start); + return 0; + +zeroit: + printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " + "private %llu\n", + (unsigned long long)btrfs_ino(page->mapping->host), + (unsigned long long)start, csum, + (unsigned long long)private); + memset(kaddr + offset, 1, end - start + 1); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + if (private == 0) + return 0; + return -EIO; +} + +struct delayed_iput { + struct list_head list; + struct inode *inode; +}; + +void btrfs_add_delayed_iput(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct delayed_iput *delayed; + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); + delayed->inode = inode; + + spin_lock(&fs_info->delayed_iput_lock); + list_add_tail(&delayed->list, &fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); +} + +void btrfs_run_delayed_iputs(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + struct delayed_iput *delayed; + int empty; + + spin_lock(&fs_info->delayed_iput_lock); + empty = list_empty(&fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); + if (empty) + return; + + down_read(&root->fs_info->cleanup_work_sem); + spin_lock(&fs_info->delayed_iput_lock); + list_splice_init(&fs_info->delayed_iputs, &list); + spin_unlock(&fs_info->delayed_iput_lock); + + while (!list_empty(&list)) { + delayed = list_entry(list.next, struct delayed_iput, list); + list_del(&delayed->list); + iput(delayed->inode); + kfree(delayed); + } + 
up_read(&root->fs_info->cleanup_work_sem); +} + +/* + * calculate extra metadata reservation when snapshotting a subvolume + * contains orphan files. + */ +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + + root = pending->root; + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + block_rsv = root->orphan_block_rsv; + + /* orphan block reservation for the snapshot */ + num_bytes = block_rsv->size; + + /* + * after the snapshot is created, COWing tree blocks may use more + * space than it frees. So we should make sure there is enough + * reserved space. + */ + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes += block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + } + + *bytes_to_reserve += num_bytes; +} + +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *snap = pending->snap; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + int ret; + + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + /* refill source subvolume's orphan block reservation */ + block_rsv = root->orphan_block_rsv; + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes = block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + root->orphan_block_rsv, + num_bytes); + BUG_ON(ret); + } + + /* setup orphan block reservation for the snapshot */ + block_rsv = btrfs_alloc_block_rsv(snap); + BUG_ON(!block_rsv); + + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); + snap->orphan_block_rsv = block_rsv; + + num_bytes = root->orphan_block_rsv->size; + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + block_rsv, num_bytes); + BUG_ON(ret); + +#if 0 + /* insert orphan item for the snapshot */ + WARN_ON(!root->orphan_item_inserted); + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + snap->root_key.objectid); + BUG_ON(ret); + snap->orphan_item_inserted = 1; +#endif +} + +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* + * This is called in transaction commmit time. If there are no orphan + * files in the subvolume, it removes orphan item and frees block_rsv + * structure. + */ +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + if (!list_empty(&root->orphan_list) || + root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) + return; + + if (root->orphan_item_inserted && + btrfs_root_refs(&root->root_item) > 0) { + ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + root->orphan_item_inserted = 0; + } + + if (root->orphan_block_rsv) { + WARN_ON(root->orphan_block_rsv->size > 0); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + } +} + +/* + * This creates an orphan entry for the given inode in case something goes + * wrong in the middle of an unlink/truncate. + * + * NOTE: caller of this function should reserve 5 units of metadata for + * this function. 
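+ *
+ * In short (descriptive summary of btrfs_orphan_add/btrfs_orphan_del
+ * below):
+ *
+ *   add: put the inode on root->orphan_list, reserve metadata for the
+ *        orphan item and insert an orphan item recording the inode
+ *        number, so an interrupted unlink/truncate can be cleaned up
+ *        later by btrfs_orphan_cleanup()
+ *   del: once the truncate/delete is done, drop the list entry, delete
+ *        the orphan item and release the reservation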
+ */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = NULL; + int reserve = 0; + int insert = 0; + int ret; + + if (!root->orphan_block_rsv) { + block_rsv = btrfs_alloc_block_rsv(root); + BUG_ON(!block_rsv); + } + + spin_lock(&root->orphan_lock); + if (!root->orphan_block_rsv) { + root->orphan_block_rsv = block_rsv; + } else if (block_rsv) { + btrfs_free_block_rsv(root, block_rsv); + block_rsv = NULL; + } + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); +#if 0 + /* + * For proper ENOSPC handling, we should do orphan + * cleanup when mounting. But this introduces backward + * compatibility issue. + */ + if (!xchg(&root->orphan_item_inserted, 1)) + insert = 2; + else + insert = 1; +#endif + insert = 1; + } + + if (!BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 1; + reserve = 1; + } + spin_unlock(&root->orphan_lock); + + if (block_rsv) + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); + + /* grab metadata reservation from transaction handle */ + if (reserve) { + ret = btrfs_orphan_reserve_metadata(trans, inode); + BUG_ON(ret); + } + + /* insert an orphan item to track this unlinked/truncated file */ + if (insert >= 1) { + ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); + BUG_ON(ret); + } + + /* insert an orphan item to track subvolume contains orphan files */ + if (insert >= 2) { + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + return 0; +} + +/* + * We have done the truncate/delete so we can go ahead and remove the orphan + * item for this particular inode. + */ +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int delete_item = 0; + int release_rsv = 0; + int ret = 0; + + spin_lock(&root->orphan_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + list_del_init(&BTRFS_I(inode)->i_orphan); + delete_item = 1; + } + + if (BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 0; + release_rsv = 1; + } + spin_unlock(&root->orphan_lock); + + if (trans && delete_item) { + ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); + BUG_ON(ret); + } + + if (release_rsv) + btrfs_orphan_release_metadata(inode); + + return 0; +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. 
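+ *
+ * Rough shape of the scan (restating the function below):
+ *
+ *   search the tree for (BTRFS_ORPHAN_OBJECTID, ORPHAN_ITEM_KEY, *);
+ *   for each item the key offset holds the inode number, so iget the
+ *   inode, put it back on root->orphan_list, and then
+ *       - bad inode:    delete the stale orphan item
+ *       - i_nlink > 0:  finish the interrupted truncate
+ *       - i_nlink == 0: iput() and let eviction delete the file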
+ */ +int btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + + if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didn't + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. + */ + found_key.objectid = found_key.offset; + found_key.type = BTRFS_INODE_ITEM_KEY; + found_key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->orphan_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->orphan_lock); + + /* + * if this is a bad inode, means we actually succeeded in + * removing the inode, but not the orphan record, which means + * we need to manually delete the orphan since iput will just + * do a destroy_inode + */ + if (is_bad_inode(inode)) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_orphan_del(trans, inode); + btrfs_end_transaction(trans, root); + iput(inode); + continue; + } + + /* if we have links, this was a truncate, lets do that */ + if (inode->i_nlink) { + if (!S_ISREG(inode->i_mode)) { + WARN_ON(1); + iput(inode); + continue; + } + nr_truncate++; + ret = btrfs_truncate(inode); + } else { + nr_unlink++; + } + + /* this will do delete_inode and everything for us */ + iput(inode); + if (ret) + goto out; + } + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; + + if (root->orphan_block_rsv) + btrfs_block_rsv_release(root, root->orphan_block_rsv, + (u64)-1); + + if (root->orphan_block_rsv || root->orphan_item_inserted) { + trans = btrfs_join_transaction(root); + if (!IS_ERR(trans)) + btrfs_end_transaction(trans, root); + } + + if (nr_unlink) + printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + if (nr_truncate) + printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + +out: + if (ret) + printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); + btrfs_free_path(path); + return ret; +} + +/* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. 
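+ *
+ * Items belonging to one inode sit together in key order inside a leaf
+ * (inode item, inode backrefs, xattrs, then extents), so a short
+ * forward scan that stops at the first key past the xattr range is
+ * enough to decide this.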
+ * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + +/* + * read an inode from the btree into the in-memory inode + */ +static void btrfs_read_locked_inode(struct inode *inode) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + int maybe_acls; + u32 rdev; + int ret; + bool filled = false; + + ret = btrfs_fill_inode(inode, &rdev); + if (!ret) + filled = true; + + path = btrfs_alloc_path(); + BUG_ON(!path); + path->leave_spinning = 1; + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) + goto make_bad; + + leaf = path->nodes[0]; + + if (filled) + goto cache_acl; + + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + if (!leaf->map_token) + map_private_extent_buffer(leaf, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + inode->i_uid = btrfs_inode_uid(leaf, inode_item); + inode->i_gid = btrfs_inode_gid(leaf, inode_item); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); +cache_acl: + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = 
acls_after_inode_item(leaf, path->slots[0], + btrfs_ino(inode)); + if (!maybe_acls) + cache_no_acl(inode); + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + btrfs_free_path(path); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + if (root == root->fs_info->tree_root) + inode->i_op = &btrfs_dir_ro_inode_operations; + else + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + break; + default: + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + + btrfs_update_iflags(inode); + return; + +make_bad: + btrfs_free_path(path); + make_bad_inode(inode); +} + +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode) +{ + if (!leaf->map_token) + map_private_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_inode_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, 0); + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } +} + +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. 
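+ * Such inodes are therefore updated in place below, by looking the
+ * inode item up with btrfs_lookup_inode() and rewriting it through
+ * fill_inode_item() on the leaf.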
+ * + * The data relocation inode should also be directly updated + * without delay + */ + if (!is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, + 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + btrfs_unlock_up_safe(path, 1); + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(trans, leaf, inode_item, inode); + btrfs_mark_buffer_dirty(leaf); + btrfs_set_inode_last_trans(trans, inode); + ret = 0; +failed: + btrfs_free_path(path); + return ret; +} + +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ +static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + u64 ino = btrfs_ino(inode); + u64 dir_ino = btrfs_ino(dir); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + path->leave_spinning = 1; + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto err; + btrfs_release_path(path); + + ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, + dir_ino, &index); + if (ret) { + printk(KERN_INFO "btrfs failed to delete reference to %.*s, " + "inode %llu parent %llu\n", name_len, name, + (unsigned long long)ino, (unsigned long long)dir_ino); + goto err; + } + + ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); + if (ret) + goto err; + + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, + inode, dir_ino); + BUG_ON(ret != 0 && ret != -ENOENT); + + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, + dir, index); + if (ret == -ENOENT) + ret = 0; +err: + btrfs_free_path(path); + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + btrfs_update_inode(trans, root, dir); +out: + return ret; +} + +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + int ret; + ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + if (!ret) { + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + } + return ret; +} + + +/* helper to check if there is any shared block in the path */ +static int check_path_shared(struct btrfs_root *root, + struct btrfs_path *path) +{ + struct extent_buffer *eb; + int level; + u64 refs = 1; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + int ret; + + if (!path->nodes[level]) + break; + eb = path->nodes[level]; + if (!btrfs_block_can_be_shared(root, eb)) + continue; 
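+ /*
+ * a reference count above one means this tree block is shared
+ * with another root (a snapshot, for example), which is exactly
+ * what the callers of check_path_shared() need to know about
+ */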
+ ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, + &refs, NULL); + if (refs > 1) + return 1; + } + return 0; +} + +/* + * helper to start transaction for unlink and rmdir. + * + * unlink and rmdir are special in btrfs, they do not always free space. + * so in enospc case, we should make sure they will free space before + * allowing them to use the global metadata reservation. + */ +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *inode = dentry->d_inode; + u64 index; + int check_link = 1; + int err = -ENOSPC; + int ret; + u64 ino = btrfs_ino(inode); + u64 dir_ino = btrfs_ino(dir); + + trans = btrfs_start_transaction(root, 10); + if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) + return trans; + + if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return ERR_PTR(-ENOSPC); + + /* check if there is someone else holds reference */ + if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) + return ERR_PTR(-ENOSPC); + + if (atomic_read(&inode->i_count) > 2) + return ERR_PTR(-ENOSPC); + + if (xchg(&root->fs_info->enospc_unlink, 1)) + return ERR_PTR(-ENOSPC); + + path = btrfs_alloc_path(); + if (!path) { + root->fs_info->enospc_unlink = 0; + return ERR_PTR(-ENOMEM); + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + root->fs_info->enospc_unlink = 0; + return trans; + } + + path->skip_locking = 1; + path->search_commit_root = 1; + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(dir)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(path); + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(path); + + if (ret == 0 && S_ISREG(inode->i_mode)) { + ret = btrfs_lookup_file_extent(trans, root, path, + ino, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret == 0); + if (check_path_shared(root, path)) + goto out; + btrfs_release_path(path); + } + + if (!check_link) { + err = 0; + goto out; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + if (di) { + if (check_path_shared(root, path)) + goto out; + } else { + err = 0; + goto out; + } + btrfs_release_path(path); + + ref = btrfs_lookup_inode_ref(trans, root, path, + dentry->d_name.name, dentry->d_name.len, + ino, dir_ino, 0); + if (IS_ERR(ref)) { + err = PTR_ERR(ref); + goto out; + } + BUG_ON(!ref); + if (check_path_shared(root, path)) + goto out; + index = btrfs_inode_ref_index(path->nodes[0], ref); + btrfs_release_path(path); + + /* + * This is a commit root search, if we can lookup inode item and other + * relative items in the commit root, it means the transaction of + * dir/file creation has been committed, and the dir index item that we + * delay to insert has also been inserted into the commit root. So + * we needn't worry about the delayed insertion of the dir index item + * here. 
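+ *
+ * To recap the ENOSPC path: every item this unlink will touch - the
+ * directory inode, the inode itself, its file extents, the dir item,
+ * the inode ref and, below, the dir index - is looked up in the commit
+ * root, and check_path_shared() must see only unshared blocks before
+ * the global block reservation is handed to the transaction.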
+ */ + di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + BUG_ON(ret == -ENOENT); + if (check_path_shared(root, path)) + goto out; + + err = 0; +out: + btrfs_free_path(path); + if (err) { + btrfs_end_transaction(trans, root); + root->fs_info->enospc_unlink = 0; + return ERR_PTR(err); + } + + trans->block_rsv = &root->fs_info->global_block_rsv; + return trans; +} + +static void __unlink_end_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (trans->block_rsv == &root->fs_info->global_block_rsv) { + BUG_ON(!root->fs_info->enospc_unlink); + root->fs_info->enospc_unlink = 0; + } + btrfs_end_transaction_throttle(trans, root); +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + BUG_ON(ret); + + if (inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + } + + nr = trans->blocks_used; + __unlink_end_trans(trans, root); + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + int ret; + u64 dir_ino = btrfs_ino(dir); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + name, name_len, -1); + BUG_ON(IS_ERR_OR_NULL(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(path); + + ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + dir_ino, &index, name, name_len); + if (ret < 0) { + BUG_ON(ret != -ENOENT); + di = btrfs_search_dir_index_item(root, path, dir_ino, + name, name_len); + BUG_ON(IS_ERR_OR_NULL(di)); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + index = key.offset; + } + btrfs_release_path(path); + + ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); + BUG_ON(ret); + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || + btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) + return -ENOTEMPTY; + + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + err = btrfs_unlink_subvol(trans, root, dir, + 
BTRFS_I(inode)->location.objectid, + dentry->d_name.name, + dentry->d_name.len); + goto out; + } + + err = btrfs_orphan_add(trans, inode); + if (err) + goto out; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); +out: + nr = trans->blocks_used; + __unlink_end_trans(trans, root); + btrfs_btree_balance_dirty(root, nr); + + return err; +} + +/* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find + * any higher than new_size + * + * csum items that cross the new i_size are truncated to the new size + * as well. + * + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. + */ +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + u64 extent_start = 0; + u64 extent_num_bytes = 0; + u64 extent_offset = 0; + u64 item_end = 0; + u64 mask = root->sectorsize - 1; + u32 found_type = (u8)-1; + int found_extent; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int encoding; + int ret; + int err = 0; + u64 ino = btrfs_ino(inode); + + BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + + if (root->ref_cows || root == root->fs_info->tree_root) + btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + + /* + * This function is also used to drop the items in the log tree before + * we relog the inode, so if root != BTRFS_I(inode)->root, it means + * it is used to drop the loged items. So we shouldn't kill the delayed + * items. 
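+ *
+ * Shape of the main loop below, for reference:
+ *
+ *   start a search at key (ino, (u8)-1, (u64)-1) and walk backwards;
+ *   items entirely beyond new_size are queued for deletion, contiguous
+ *   slots being batched through pending_del_slot/pending_del_nr and
+ *   removed with a single btrfs_del_items() call, while the extent
+ *   item that straddles new_size is shrunk in place instead.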
+ */ + if (min_type == 0 && root == BTRFS_I(inode)->root) + btrfs_kill_delayed_inode_items(inode); + + path = btrfs_alloc_path(); + BUG_ON(!path); + path->reada = -1; + + key.objectid = ino; + key.offset = (u64)-1; + key.type = (u8)-1; + +search_again: + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + /* there are no items in the tree for us to truncate, we're + * done + */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (1) { + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + encoding = 0; + + if (found_key.objectid != ino) + break; + + if (found_type < min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + encoding = btrfs_file_extent_compression(leaf, fi); + encoding |= btrfs_file_extent_encryption(leaf, fi); + encoding |= btrfs_file_extent_other_encoding(leaf, fi); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + item_end += btrfs_file_extent_inline_len(leaf, + fi); + } + item_end--; + } + if (found_type > min_type) { + del_item = 1; + } else { + if (item_end < new_size) + break; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + } + found_extent = 0; + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item && !encoding) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = new_size - + found_key.offset + root->sectorsize - 1; + extent_num_bytes = extent_num_bytes & + ~((u64)root->sectorsize - 1); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - + extent_num_bytes); + if (root->ref_cows && extent_start != 0) + inode_sub_bytes(inode, num_dec); + btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, + fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if (extent_start != 0) { + found_extent = 1; + if (root->ref_cows) + inode_sub_bytes(inode, num_dec); + } + } + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { + u32 size = new_size - found_key.offset; + + if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + new_size); + } + size = + btrfs_file_extent_calc_inline_size(size); + ret = btrfs_truncate_item(trans, root, path, + size, 1); + } else if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + found_key.offset); + } + } +delete: + if (del_item) { + if (!pending_del_nr) { + /* no pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* hop on the 
pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } else { + BUG(); + } + } else { + break; + } + if (found_extent && (root->ref_cows || + root == root->fs_info->tree_root)) { + btrfs_set_path_blocking(path); + ret = btrfs_free_extent(trans, root, extent_start, + extent_num_bytes, 0, + btrfs_header_owner(leaf), + ino, extent_offset); + BUG_ON(ret); + } + + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot) { + if (root->ref_cows && + BTRFS_I(inode)->location.objectid != + BTRFS_FREE_INO_OBJECTID) { + err = -EAGAIN; + goto out; + } + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + } + btrfs_release_path(path); + goto search_again; + } else { + path->slots[0]--; + } + } +out: + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + BUG_ON(ret); + } + btrfs_free_path(path); + return err; +} + +/* + * taken from block_truncate_page, but does cow as it zeros out + * any bytes left in the last page in the file. + */ +static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + char *kaddr; + u32 blocksize = root->sectorsize; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + int ret = 0; + u64 page_start; + u64 page_end; + + if ((offset & (blocksize - 1)) == 0) + goto out; + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + ret = -ENOMEM; +again: + page = grab_cache_page(mapping, index); + if (!page) { + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + goto out; + } + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if (!PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); + set_page_extent_mapped(page); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + 0, 0, &cached_state, GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + &cached_state); + if (ret) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + goto out_unlock; + } + + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state, + GFP_NOFS); + +out_unlock: + if (ret) + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + unlock_page(page); + 
page_cache_release(page); +out: + return ret; +} + +/* + * This function puts in dummy file extents for the area we're creating a hole + * for. So if we are truncating this file to a larger size we need to insert + * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for + * the range between oldsize and size + */ +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_state *cached_state = NULL; + u64 mask = root->sectorsize - 1; + u64 hole_start = (oldsize + mask) & ~mask; + u64 block_end = (size + mask) & ~mask; + u64 last_byte; + u64 cur_offset; + u64 hole_size; + int err = 0; + + if (size <= hole_start) + return 0; + + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, hole_start, + block_end - hole_start); + lock_extent_bits(io_tree, hole_start, block_end - 1, 0, + &cached_state, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, hole_start); + if (!ordered) + break; + unlock_extent_cached(io_tree, hole_start, block_end - 1, + &cached_state, GFP_NOFS); + btrfs_put_ordered_extent(ordered); + } + + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + block_end - cur_offset, 0); + BUG_ON(IS_ERR_OR_NULL(em)); + last_byte = min(extent_map_end(em), block_end); + last_byte = (last_byte + mask) & ~mask; + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + u64 hint_byte = 0; + hole_size = last_byte - cur_offset; + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + break; + } + + err = btrfs_drop_extents(trans, inode, cur_offset, + cur_offset + hole_size, + &hint_byte, 1); + if (err) + break; + + err = btrfs_insert_file_extent(trans, root, + btrfs_ino(inode), cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + if (err) + break; + + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); + + btrfs_end_transaction(trans, root); + } + free_extent_map(em); + em = NULL; + cur_offset = last_byte; + if (cur_offset >= block_end) + break; + } + + free_extent_map(em); + unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, + GFP_NOFS); + return err; +} + +static int btrfs_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize = i_size_read(inode); + int ret; + + if (newsize == oldsize) + return 0; + + if (newsize > oldsize) { + i_size_write(inode, newsize); + btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + truncate_pagecache(inode, oldsize, newsize); + ret = btrfs_cont_expand(inode, oldsize, newsize); + if (ret) { + btrfs_setsize(inode, oldsize); + return ret; + } + + mark_inode_dirty(inode); + } else { + + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. 
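+ * That is what the ordered_data_close flag set just below requests;
+ * the shrink itself is then done by truncate_setsize() followed by
+ * btrfs_truncate().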
+ */ + if (newsize == 0) + BTRFS_I(inode)->ordered_data_close = 1; + + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + truncate_setsize(inode, newsize); + ret = btrfs_truncate(inode); + } + + return ret; +} + +static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int err; + + if (btrfs_root_readonly(root)) + return -EROFS; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + err = btrfs_setsize(inode, attr->ia_size); + if (err) + return err; + } + + if (attr->ia_valid) { + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + if (attr->ia_valid & ATTR_MODE) + err = btrfs_acl_chmod(inode); + } + + return err; +} + +void btrfs_evict_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr; + int ret; + + trace_btrfs_inode_evict(inode); + + truncate_inode_pages(&inode->i_data, 0); + if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || + is_free_space_inode(root, inode))) + goto no_delete; + + if (is_bad_inode(inode)) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + + if (root->fs_info->log_root_recovering) { + BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + goto no_delete; + } + + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0); + goto no_delete; + } + + btrfs_i_size_write(inode, 0); + + while (1) { + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = root->orphan_block_rsv; + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; + } + + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + if (ret != -EAGAIN) + break; + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + trans = NULL; + btrfs_btree_balance_dirty(root, nr); + + } + + if (ret == 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + + if (!(root == root->fs_info->tree_root || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) + btrfs_return_ino(root, btrfs_ino(inode)); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +no_delete: + end_writeback(inode); + return; +} + +/* + * this returns the key found in the dir entry in the location pointer. + * If no dir entries were found, location->objectid is 0. 
+ */ +static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, + struct btrfs_key *location) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret = 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, + namelen, 0); + if (IS_ERR(di)) + ret = PTR_ERR(di); + + if (IS_ERR_OR_NULL(di)) + goto out_err; + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); +out: + btrfs_free_path(path); + return ret; +out_err: + location->objectid = 0; + goto out; +} + +/* + * when we hit a tree root in a directory, the btrfs part of the inode + * needs to be changed to reflect the root directory of the tree root. This + * is kind of like crossing a mount point. + */ +static int fixup_tree_root_location(struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry, + struct btrfs_key *location, + struct btrfs_root **sub_root) +{ + struct btrfs_path *path; + struct btrfs_root *new_root; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + err = -ENOENT; + ret = btrfs_find_root_ref(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid); + if (ret) { + if (ret < 0) + err = ret; + goto out; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || + btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + goto out; + + ret = memcmp_extent_buffer(leaf, dentry->d_name.name, + (unsigned long)(ref + 1), + dentry->d_name.len); + if (ret) + goto out; + + btrfs_release_path(path); + + new_root = btrfs_read_fs_root_no_name(root->fs_info, location); + if (IS_ERR(new_root)) { + err = PTR_ERR(new_root); + goto out; + } + + if (btrfs_root_refs(&new_root->root_item) == 0) { + err = -ENOENT; + goto out; + } + + *sub_root = new_root; + location->objectid = btrfs_root_dirid(&new_root->root_item); + location->type = BTRFS_INODE_ITEM_KEY; + location->offset = 0; + err = 0; +out: + btrfs_free_path(path); + return err; +} + +static void inode_tree_add(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *entry; + struct rb_node **p; + struct rb_node *parent; + u64 ino = btrfs_ino(inode); +again: + p = &root->inode_tree.rb_node; + parent = NULL; + + if (inode_unhashed(inode)) + return; + + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_inode, rb_node); + + if (ino < btrfs_ino(&entry->vfs_inode)) + p = &parent->rb_left; + else if (ino > btrfs_ino(&entry->vfs_inode)) + p = &parent->rb_right; + else { + WARN_ON(!(entry->vfs_inode.i_state & + (I_WILL_FREE | I_FREEING))); + rb_erase(parent, &root->inode_tree); + RB_CLEAR_NODE(parent); + spin_unlock(&root->inode_lock); + goto again; + } + } + rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); + rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); + spin_unlock(&root->inode_lock); +} + +static void inode_tree_del(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int empty = 0; + + spin_lock(&root->inode_lock); + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + empty = 
RB_EMPTY_ROOT(&root->inode_tree); + } + spin_unlock(&root->inode_lock); + + /* + * Free space cache has inodes in the tree root, but the tree root has a + * root_refs of 0, so this could end up dropping the tree root as a + * snapshot, so we need the extra !root->fs_info->tree_root check to + * make sure we don't drop it. + */ + if (empty && btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) { + synchronize_srcu(&root->fs_info->subvol_srcu); + spin_lock(&root->inode_lock); + empty = RB_EMPTY_ROOT(&root->inode_tree); + spin_unlock(&root->inode_lock); + if (empty) + btrfs_add_dead_root(root); + } +} + +int btrfs_invalidate_inodes(struct btrfs_root *root) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + u64 objectid = 0; + + WARN_ON(btrfs_root_refs(&root->root_item) != 0); + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(&entry->vfs_inode)) + node = node->rb_left; + else if (objectid > btrfs_ino(&entry->vfs_inode)) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(&entry->vfs_inode)) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + objectid = btrfs_ino(&entry->vfs_inode) + 1; + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + if (atomic_read(&inode->i_count) > 1) + d_prune_aliases(inode); + /* + * btrfs_drop_inode will have it removed from + * the inode cache when its usage count + * hits zero. + */ + iput(inode); + cond_resched(); + spin_lock(&root->inode_lock); + goto again; + } + + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return 0; +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + inode->i_ino = args->ino; + BTRFS_I(inode)->root = args->root; + btrfs_set_inode_space_info(args->root, inode); + return 0; +} + +static int btrfs_find_actor(struct inode *inode, void *opaque) +{ + struct btrfs_iget_args *args = opaque; + return args->ino == btrfs_ino(inode) && + args->root == BTRFS_I(inode)->root; +} + +static struct inode *btrfs_iget_locked(struct super_block *s, + u64 objectid, + struct btrfs_root *root) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + inode = iget5_locked(s, objectid, btrfs_find_actor, + btrfs_init_locked_inode, + (void *)&args); + return inode; +} + +/* Get an inode object given its location and corresponding root. 
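The invalidation walk in btrfs_invalidate_inodes() above descends the per-root red-black tree to the first inode whose number is not below the resume point. The same ordering search expressed over a plain sorted array, as a stand-alone sketch rather than the rb-tree code itself:

/* Index of the first element >= objectid in a sorted array of inode numbers,
 * or n when every element is smaller (the walk then stops). */
static int first_at_or_after(const unsigned long long *inos, int n,
                             unsigned long long objectid)
{
        int lo = 0, hi = n;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (inos[mid] < objectid)
                        lo = mid + 1;
                else
                        hi = mid;
        }
        return lo;
}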
+ * Returns in *is_new if the inode was read from disk + */ +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, location->objectid, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); + btrfs_read_locked_inode(inode); + inode_tree_add(inode); + unlock_new_inode(inode); + if (new) + *new = 1; + } + + return inode; +} + +static struct inode *new_simple_dir(struct super_block *s, + struct btrfs_key *key, + struct btrfs_root *root) +{ + struct inode *inode = new_inode(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + BTRFS_I(inode)->dummy_inode = 1; + + inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + return inode; +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; + int index; + int ret; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(dir, dentry, &location); + + if (ret < 0) + return ERR_PTR(ret); + + if (location.objectid == 0) + return NULL; + + if (location.type == BTRFS_INODE_ITEM_KEY) { + inode = btrfs_iget(dir->i_sb, &location, root, NULL); + return inode; + } + + BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); + + index = srcu_read_lock(&root->fs_info->subvol_srcu); + ret = fixup_tree_root_location(root, dir, dentry, + &location, &sub_root); + if (ret < 0) { + if (ret != -ENOENT) + inode = ERR_PTR(ret); + else + inode = new_simple_dir(dir->i_sb, &location, sub_root); + } else { + inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); + } + srcu_read_unlock(&root->fs_info->subvol_srcu, index); + + if (!IS_ERR(inode) && root != sub_root) { + down_read(&root->fs_info->cleanup_work_sem); + if (!(inode->i_sb->s_flags & MS_RDONLY)) + ret = btrfs_orphan_cleanup(sub_root); + up_read(&root->fs_info->cleanup_work_sem); + if (ret) + inode = ERR_PTR(ret); + } + + return inode; +} + +static int btrfs_dentry_delete(const struct dentry *dentry) +{ + struct btrfs_root *root; + + if (!dentry->d_inode && !IS_ROOT(dentry)) + dentry = dentry->d_parent; + + if (dentry->d_inode) { + root = BTRFS_I(dentry->d_inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + } + return 0; +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_splice_alias(inode, dentry); +} + +unsigned char btrfs_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int btrfs_real_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct list_head ins_list; + struct 
list_head del_list; + int ret; + struct extent_buffer *leaf; + int slot; + unsigned char d_type; + int over = 0; + u32 di_cur; + u32 di_total; + u32 di_len; + int key_type = BTRFS_DIR_INDEX_KEY; + char tmp_name[32]; + char *name_ptr; + int name_len; + int is_curr = 0; /* filp->f_pos points to the current index? */ + + /* FIXME, use a real flag for deciding about the key type */ + if (root->fs_info->tree_root == root) + key_type = BTRFS_DIR_ITEM_KEY; + + /* special case for "." */ + if (filp->f_pos == 0) { + over = filldir(dirent, ".", 1, + filp->f_pos, btrfs_ino(inode), DT_DIR); + if (over) + return 0; + filp->f_pos = 1; + } + /* special case for .., just use the back ref */ + if (filp->f_pos == 1) { + u64 pino = parent_ino(filp->f_path.dentry); + over = filldir(dirent, "..", 2, + filp->f_pos, pino, DT_DIR); + if (over) + return 0; + filp->f_pos = 2; + } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + + if (key_type == BTRFS_DIR_INDEX_KEY) { + INIT_LIST_HEAD(&ins_list); + INIT_LIST_HEAD(&del_list); + btrfs_get_delayed_items(inode, &ins_list, &del_list); + } + + btrfs_set_key_type(&key, key_type); + key.offset = filp->f_pos; + key.objectid = btrfs_ino(inode); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; + } + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != key_type) + break; + if (found_key.offset < filp->f_pos) + goto next; + if (key_type == BTRFS_DIR_INDEX_KEY && + btrfs_should_delete_dir_index(&del_list, + found_key.offset)) + goto next; + + filp->f_pos = found_key.offset; + is_curr = 1; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + di_cur = 0; + di_total = btrfs_item_size(leaf, item); + + while (di_cur < di_total) { + struct btrfs_key location; + + if (verify_dir_item(root, leaf, di)) + break; + + name_len = btrfs_dir_name_len(leaf, di); + if (name_len <= sizeof(tmp_name)) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_NOFS); + if (!name_ptr) { + ret = -ENOMEM; + goto err; + } + } + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + + d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + /* is this a reference to our own snapshot? If so + * skip it + */ + if (location.type == BTRFS_ROOT_ITEM_KEY && + location.objectid == root->root_key.objectid) { + over = 0; + goto skip; + } + over = filldir(dirent, name_ptr, name_len, + found_key.offset, location.objectid, + d_type); + +skip: + if (name_ptr != tmp_name) + kfree(name_ptr); + + if (over) + goto nopos; + di_len = btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di) + sizeof(*di); + di_cur += di_len; + di = (struct btrfs_dir_item *)((char *)di + di_len); + } +next: + path->slots[0]++; + } + + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (is_curr) + filp->f_pos++; + ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, + &ins_list); + if (ret) + goto nopos; + } + + /* Reached end of directory/root. Bump pos past the last item. */ + if (key_type == BTRFS_DIR_INDEX_KEY) + /* + * 32-bit glibc will use getdents64, but then strtol - + * so the last number we can serve is this. 
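The readdir code here follows a small f_pos protocol: position 0 serves ".", position 1 serves "..", real directory index items begin at offset 2, and 0x7fffffff marks end of directory for 32-bit getdents users. A toy emitter over a pre-sorted offset list (hypothetical fill_fn type, not the kernel filldir API) could read:

typedef int (*fill_fn)(const char *name, unsigned long long pos);

/* Emit ".", "..", then every index item at or past *f_pos; offsets[] is
 * assumed sorted ascending.  A non-zero return from 'fill' means the
 * caller's buffer is full and we stop where we are, so the next call
 * resumes at the same position. */
static void emit_dir(fill_fn fill, unsigned long long *f_pos,
                     const char * const *names,
                     const unsigned long long *offsets, int n)
{
        int i;

        if (*f_pos == 0) {
                if (fill(".", 0))
                        return;
                *f_pos = 1;
        }
        if (*f_pos == 1) {
                if (fill("..", 1))
                        return;
                *f_pos = 2;
        }
        for (i = 0; i < n; i++) {
                if (offsets[i] < *f_pos)
                        continue;
                *f_pos = offsets[i];
                if (fill(names[i], offsets[i]))
                        return;
        }
        *f_pos = 0x7fffffff;    /* past the last index a 32-bit getdents user can ask for */
}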
+ */ + filp->f_pos = 0x7fffffff; + else + filp->f_pos++; +nopos: + ret = 0; +err: + if (key_type == BTRFS_DIR_INDEX_KEY) + btrfs_put_delayed_items(&ins_list, &del_list); + btrfs_free_path(path); + return ret; +} + +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + bool nolock = false; + + if (BTRFS_I(inode)->dummy_inode) + return 0; + + if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) + nolock = true; + + if (wbc->sync_mode == WB_SYNC_ALL) { + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + if (nolock) + ret = btrfs_end_transaction_nolock(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + return ret; +} + +/* + * This is somewhat expensive, updating the tree every time the + * inode changes. But, it is most likely to find the inode in cache. + * FIXME, needs more benchmarking...there are no reasons other than performance + * to keep or drop this code. + */ +void btrfs_dirty_inode(struct inode *inode, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret; + + if (BTRFS_I(inode)->dummy_inode) + return; + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret == -ENOSPC) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans, root); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + printk_ratelimited(KERN_ERR "btrfs: fail to " + "dirty inode %llu error %ld\n", + (unsigned long long)btrfs_ino(inode), + PTR_ERR(trans)); + return; + } + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + printk_ratelimited(KERN_ERR "btrfs: fail to " + "dirty inode %llu error %d\n", + (unsigned long long)btrfs_ino(inode), + ret); + } + } + btrfs_end_transaction(trans, root); + if (BTRFS_I(inode)->delayed_node) + btrfs_balance_delayed_items(root); +} + +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ +static int btrfs_set_inode_index_count(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = btrfs_ino(inode); + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + /* + * MAGIC NUMBER EXPLANATION: + * since we search a directory based on f_pos we have to start at 2 + * since '.' and '..' 
have f_pos of 0 and 1 respectively, so everybody + * else has to start at 2 + */ + if (path->slots[0] == 0) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != btrfs_ino(inode) || + btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + BTRFS_I(inode)->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ +int btrfs_set_inode_index(struct inode *dir, u64 *index) +{ + int ret = 0; + + if (BTRFS_I(dir)->index_cnt == (u64)-1) { + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + } + + *index = BTRFS_I(dir)->index_cnt; + BTRFS_I(dir)->index_cnt++; + + return ret; +} + +static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + const char *name, int name_len, + u64 ref_objectid, u64 objectid, int mode, + u64 *index) +{ + struct inode *inode; + struct btrfs_inode_item *inode_item; + struct btrfs_key *location; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_key key[2]; + u32 sizes[2]; + unsigned long ptr; + int ret; + int owner; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + inode = new_inode(root->fs_info->sb); + if (!inode) { + btrfs_free_path(path); + return ERR_PTR(-ENOMEM); + } + + /* + * we have to initialize this early, so we can reclaim the inode + * number if we fail afterwards in this function. + */ + inode->i_ino = objectid; + + if (dir) { + trace_btrfs_inode_request(dir); + + ret = btrfs_set_inode_index(dir, index); + if (ret) { + btrfs_free_path(path); + iput(inode); + return ERR_PTR(ret); + } + } + /* + * index_cnt is ignored for everything but a dir, + * btrfs_get_inode_index_count has an explanation for the magic + * number + */ + BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->generation = trans->transid; + inode->i_generation = BTRFS_I(inode)->generation; + btrfs_set_inode_space_info(root, inode); + + if (mode & S_IFDIR) + owner = 0; + else + owner = 1; + + key[0].objectid = objectid; + btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].offset = 0; + + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[0] = sizeof(struct btrfs_inode_item); + sizes[1] = name_len + sizeof(*ref); + + path->leave_spinning = 1; + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + if (ret != 0) + goto fail; + + inode_init_owner(inode, dir, mode); + inode_set_bytes(inode, 0); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode); + + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; 
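btrfs_set_inode_index_count() above boils down to "one past the highest existing DIR_INDEX offset, or 2 for a directory that only holds '.' and '..'". As a sketch over a plain list of offsets rather than the btree search itself:

/* Next free directory index for a new entry. */
static unsigned long long next_dir_index(const unsigned long long *offsets, int n)
{
        unsigned long long highest = 0;
        int i;

        if (n == 0)
                return 2;               /* "." and ".." occupy positions 0 and 1 */

        for (i = 0; i < n; i++)
                if (offsets[i] > highest)
                        highest = offsets[i];

        return highest + 1;
}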
+ location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + btrfs_inherit_iflags(inode, dir); + + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(root, NODATACOW) || + (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + } + + insert_inode_hash(inode); + inode_tree_add(inode); + + trace_btrfs_inode_new(inode); + btrfs_set_inode_last_trans(trans, inode); + + return inode; +fail: + if (dir) + BTRFS_I(dir)->index_cnt--; + btrfs_free_path(path); + iput(inode); + return ERR_PTR(ret); +} + +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; +} + +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) +{ + int ret = 0; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + u64 ino = btrfs_ino(inode); + u64 parent_ino = btrfs_ino(parent_inode); + + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); + } else { + key.objectid = ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + } + + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_ino, index, name, name_len); + } else if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, + parent_ino, index); + } + + if (ret == 0) { + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode, &key, + btrfs_inode_type(inode), index); + BUG_ON(ret); + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + } + return ret; +} + +static int btrfs_add_nondir(struct btrfs_trans_handle *trans, + struct inode *dir, struct dentry *dentry, + struct inode *inode, int backref, u64 index) +{ + int err = btrfs_add_link(trans, dir, inode, + dentry->d_name.name, dentry->d_name.len, + backref, index); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + if (err > 0) + err = -EEXIST; + return err; +} + +static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + unsigned long nr = 0; + u64 index = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + err = 
btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + btrfs_update_inode(trans, root, inode); + } +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + return err; +} + +static int btrfs_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int drop_inode = 0; + int err; + unsigned long nr = 0; + u64 objectid; + u64 index = 0; + + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = old_dentry->d_inode; + u64 index; + unsigned long nr = 0; + int err; + int drop_inode = 0; + + /* do not allow sys_link's with other subvols of the same device */ + if (root->objectid != BTRFS_I(inode)->root->objectid) + return -EXDEV; + + if (inode->i_nlink == ~0U) + return -EMLINK; + + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; + + /* + * 2 items for inode and inode ref + * 2 items for dir items + * 1 item for parent inode + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto fail; + } + + btrfs_inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; + ihold(inode); + + err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); + + if (err) { + drop_inode = 1; + } else { + struct dentry *parent = dget_parent(dentry); + err = btrfs_update_inode(trans, root, inode); + BUG_ON(err); + btrfs_log_new_name(trans, inode, NULL, parent); + dput(parent); + } + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int err = 0; + int drop_on_err = 0; + u64 objectid = 0; + u64 index = 0; + unsigned long nr = 1; 
+ + /* + * 2 items for inode and ref + * 2 items for dir items + * 1 for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_fail; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + S_IFDIR | mode, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_fail; + } + + drop_on_err = 1; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_fail; + + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + btrfs_i_size_write(inode, 0); + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_fail; + + err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, + dentry->d_name.len, 0, index); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); + drop_on_err = 0; + +out_fail: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); + return err; +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree. + */ +static int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start, u64 map_len) +{ + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + start_diff = map_start - em->start; + em->start = map_start; + em->len = map_len; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len -= start_diff; + } + return add_extent_mapping(em_tree, em); +} + +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + int compress_type; + + WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(leaf, path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + if (!tmp) + return -ENOMEM; + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + ret = btrfs_decompress(compress_type, tmp, page, + extent_offset, inline_size, max_size); + if (ret) { + char *kaddr = kmap_atomic(page, KM_USER0); + unsigned long copy_size = min_t(u64, + PAGE_CACHE_SIZE - pg_offset, + max_size - extent_offset); + memset(kaddr + pg_offset, 0, copy_size); + kunmap_atomic(kaddr, KM_USER0); + } + kfree(tmp); + return 0; +} + +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the in-ram + * representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. 
+ */ + +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + int ret; + int err = 0; + u64 bytenr; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = btrfs_ino(inode); + u32 found_type; + struct btrfs_path *path = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *item; + struct extent_buffer *leaf; + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_trans_handle *trans = NULL; + int compress_type; + +again: + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; + read_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) + free_extent_map(em); + else if (em->block_start == EXTENT_MAP_INLINE && page) + free_extent_map(em); + else + goto out; + } + em = alloc_extent_map(); + if (!em) { + err = -ENOMEM; + goto out; + } + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->start = EXTENT_MAP_HOLE; + em->orig_start = EXTENT_MAP_HOLE; + em->len = (u64)-1; + em->block_len = (u64)-1; + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + /* + * Chances are we'll be called again, so go ahead and do + * readahead + */ + path->reada = 1; + } + + ret = btrfs_lookup_file_extent(trans, root, path, + objectid, start, trans != NULL); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret != 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + /* are we inside the extent that was found? 
*/ + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + if (found_key.objectid != objectid || + found_type != BTRFS_EXTENT_DATA_KEY) { + goto not_found; + } + + found_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; + compress_type = btrfs_file_extent_compression(leaf, item); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, item); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, item); + extent_end = (extent_start + size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + + if (start >= extent_end) { + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + goto not_found; + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) + goto not_found; + if (start + len <= found_key.offset) + goto not_found; + em->start = start; + em->len = found_key.offset - start; + goto not_found_em; + } + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, item); + bytenr = btrfs_file_extent_disk_bytenr(leaf, item); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + goto insert; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + em->block_start = bytenr; + em->block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); + } else { + bytenr += btrfs_file_extent_offset(leaf, item); + em->block_start = bytenr; + em->block_len = em->len; + if (found_type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + goto insert; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + unsigned long ptr; + char *map; + size_t size; + size_t extent_offset; + size_t copy_size; + + em->block_start = EXTENT_MAP_INLINE; + if (!page || create) { + em->start = extent_start; + em->len = extent_end - extent_start; + goto out; + } + + size = btrfs_file_extent_inline_len(leaf, item); + extent_offset = page_offset(page) + pg_offset - extent_start; + copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, + size - extent_offset); + em->start = extent_start + extent_offset; + em->len = (copy_size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + em->orig_start = EXTENT_MAP_INLINE; + if (compress_type) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } + ptr = btrfs_file_extent_inline_start(item) + extent_offset; + if (create == 0 && !PageUptodate(page)) { + if (btrfs_file_extent_compression(leaf, item) != + BTRFS_COMPRESS_NONE) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + BUG_ON(ret); + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + memset(map + pg_offset + copy_size, 0, + PAGE_CACHE_SIZE - pg_offset - + copy_size); + } + kunmap(page); + } + flush_dcache_page(page); + } else if (create && PageUptodate(page)) { + WARN_ON(1); + if (!trans) { + kunmap(page); + free_extent_map(em); + em 
= NULL; + + btrfs_release_path(path); + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) + return ERR_CAST(trans); + goto again; + } + map = kmap(page); + write_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + btrfs_mark_buffer_dirty(leaf); + } + set_extent_uptodate(io_tree, em->start, + extent_map_end(em) - 1, NULL, GFP_NOFS); + goto insert; + } else { + printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); + WARN_ON(1); + } +not_found: + em->start = start; + em->len = len; +not_found_em: + em->block_start = EXTENT_MAP_HOLE; + set_bit(EXTENT_FLAG_VACANCY, &em->flags); +insert: + btrfs_release_path(path); + if (em->start > start || extent_map_end(em) <= start) { + printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " + "[%llu %llu]\n", (unsigned long long)em->start, + (unsigned long long)em->len, + (unsigned long long)start, + (unsigned long long)len); + err = -EIO; + goto out; + } + + err = 0; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = lookup_extent_mapping(em_tree, start, len); + if (existing && (existing->start > start || + existing->start + existing->len <= start)) { + free_extent_map(existing); + existing = NULL; + } + if (!existing) { + existing = lookup_extent_mapping(em_tree, em->start, + em->len); + if (existing) { + err = merge_extent_mapping(em_tree, existing, + em, start, + root->sectorsize); + free_extent_map(existing); + if (err) { + free_extent_map(em); + em = NULL; + } + } else { + err = -EIO; + free_extent_map(em); + em = NULL; + } + } else { + free_extent_map(em); + em = existing; + err = 0; + } + } + write_unlock(&em_tree->lock); +out: + + trace_btrfs_get_extent(root, em); + + if (path) + btrfs_free_path(path); + if (trans) { + ret = btrfs_end_transaction(trans, root); + if (!err) + err = ret; + } + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to a hole, there might + * actually be delalloc bytes behind it + */ + if (em->block_start != EXTENT_MAP_HOLE) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len == -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start,range_start); + found = found_end - range_start; + + if (found > 0) { + u64 
hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + struct extent_map *em, + u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + bool insert = false; + + /* + * Ok if the extent map we looked up is a hole and is for the exact + * range we want, there is no reason to allocate a new one, however if + * it is not right then we need to free this one and drop the cache for + * our range. + */ + if (em->block_start != EXTENT_MAP_HOLE || em->start != start || + em->len != len) { + free_extent_map(em); + em = NULL; + insert = true; + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return ERR_CAST(trans); + + if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) + btrfs_add_inode_defrag(trans, inode); + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + alloc_hint, (u64)-1, &ins, 1); + if (ret) { + em = ERR_PTR(ret); + goto out; + } + + if (!em) { + em = alloc_extent_map(); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + } + + em->start = start; + em->orig_start = em->start; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + + /* + * We need to do this because if we're using the original em we searched + * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 
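The fiemap helper above has to decide how much of a hole to report before delalloc bytes take over. The length computation on its own, as a sketch with plain integers rather than extent maps:

/* Length of the hole to report when delalloc data begins at range_start. */
static unsigned long long hole_len_before_delalloc(unsigned long long hole_start,
                                                   unsigned long long hole_len,
                                                   unsigned long long range_start)
{
        unsigned long long until_delalloc;

        if (range_start <= hole_start)
                return 0;                       /* delalloc starts first, no hole to report */

        until_delalloc = range_start - hole_start;
        return until_delalloc < hole_len ? until_delalloc : hole_len;
}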
+ */ + em->flags = 0; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (insert) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + em = ERR_PTR(ret); + } +out: + btrfs_end_transaction(trans, root); + return em; +} + +/* + * returns 1 when the nocow is safe, < 1 on error, 0 if the + * block must be cow'd + */ +static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 len) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_bytenr; + u64 backref_offset; + u64 extent_end; + u64 num_bytes; + int slot; + int found_type; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), + offset, 0); + if (ret < 0) + goto out; + + slot = path->slots[0]; + if (ret == 1) { + if (slot == 0) { + /* can't find the item, must cow */ + ret = 0; + goto out; + } + slot--; + } + ret = 0; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + /* not our file or wrong item type, must cow */ + goto out; + } + + if (key.offset > offset) { + /* Wrong offset, must cow */ + goto out; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (found_type != BTRFS_FILE_EXTENT_REG && + found_type != BTRFS_FILE_EXTENT_PREALLOC) { + /* not a regular extent, must cow */ + goto out; + } + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + backref_offset = btrfs_file_extent_offset(leaf, fi); + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end < offset + len) { + /* extent doesn't include our full range, must cow */ + goto out; + } + + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out; + + /* + * look for other files referencing this extent, if we + * find any we must cow + */ + if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), + key.offset - backref_offset, disk_bytenr)) + goto out; + + /* + * adjust disk_bytenr and num_bytes to cover just the bytes + * in this extent we are about to write. If there + * are any csums in that range we have to cow in order + * to keep the csums correct + */ + disk_bytenr += backref_offset; + disk_bytenr += offset - key.offset; + num_bytes = min(offset + len, extent_end) - offset; + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out; + /* + * all of the above have passed, it is safe to overwrite this extent + * without cow + */ + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start = iblock << inode->i_blkbits; + u64 len = bh_result->b_size; + struct btrfs_trans_handle *trans; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. 
INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + return -ENOTBLK; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + /* DIO will do one hole at a time, so just unlock a sector */ + unlock_extent(&BTRFS_I(inode)->io_tree, start, + start + root->sectorsize - 1, GFP_NOFS); + return 0; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) { + len = em->len - (start - em->start); + goto map; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + int ret; + u64 block_start; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + /* + * we're not going to log anything, but we do need + * to make sure the current transaction stays open + * while we look for nocow cross refs + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + goto must_cow; + + if (can_nocow_odirect(trans, inode, start, len) == 1) { + ret = btrfs_add_ordered_extent_dio(inode, start, + block_start, len, len, type); + btrfs_end_transaction(trans, root); + if (ret) { + free_extent_map(em); + return ret; + } + goto unlock; + } + btrfs_end_transaction(trans, root); + } +must_cow: + /* + * this will cow the extent, reset the len in case we changed + * it above + */ + len = bh_result->b_size; + em = btrfs_new_extent_direct(inode, em, start, len); + if (IS_ERR(em)) + return PTR_ERR(em); + len = min(len, em->len - (start - em->start)); +unlock: + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, + 0, NULL, GFP_NOFS); +map: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + free_extent_map(em); + + return 0; +} + +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + u32 *csums; + void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + struct bio *orig_bio; +}; + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct inode 
*inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start; + u32 *private = dip->csums; + + start = dip->logical_offset; + do { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + struct page *page = bvec->bv_page; + char *kaddr; + u32 csum = ~(u32)0; + unsigned long flags; + + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_IRQ0); + csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum, bvec->bv_len); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_IRQ0); + local_irq_restore(flags); + + flush_dcache_page(bvec->bv_page); + if (csum != *private) { + printk(KERN_ERR "btrfs csum failed ino %llu off" + " %llu csum %u private %u\n", + (unsigned long long)btrfs_ino(inode), + (unsigned long long)start, + csum, *private); + err = -EIO; + } + } + + start += bvec->bv_len; + private++; + bvec++; + } while (bvec <= bvec_end); + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1, GFP_NOFS); + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + + /* If we had a csum failure make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + dio_end_io(bio, err); +} + +static void btrfs_endio_direct_write(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered = NULL; + struct extent_state *cached_state = NULL; + u64 ordered_offset = dip->logical_offset; + u64 ordered_bytes = dip->bytes; + int ret; + + if (err) + goto out_done; +again: + ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes); + if (!ret) + goto out_test; + + BUG_ON(!ordered); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + err = -ENOMEM; + goto out; + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + ret = btrfs_update_inode(trans, root, inode); + err = ret; + goto out; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, 0, + &cached_state, GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + ret = btrfs_mark_extent_written(trans, inode, + ordered->file_offset, + ordered->file_offset + + ordered->len); + if (ret) { + err = ret; + goto out_unlock; + } + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered->file_offset, + ordered->start, + ordered->disk_len, + ordered->len, + ordered->len, + 0, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered->file_offset, ordered->len); + if (ret) { + err = ret; + WARN_ON(1); + goto out_unlock; + } + } + + add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + btrfs_update_inode(trans, root, inode); + ret = 0; +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, + &cached_state, GFP_NOFS); +out: + btrfs_delalloc_release_metadata(inode, ordered->len); + btrfs_end_transaction(trans, root); + ordered_offset = ordered->file_offset + ordered->len; + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + +out_test: + /* + * our bio might span multiple ordered 
extents. If we haven't + * completed the accounting for the whole dio, go back and try again + */ + if (ordered_offset < dip->logical_offset + dip->bytes) { + ordered_bytes = dip->logical_offset + dip->bytes - + ordered_offset; + goto again; + } +out_done: + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + + /* If we had an error make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + dio_end_io(bio, err); +} + +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 offset) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + BUG_ON(ret); + return 0; +} + +static void btrfs_end_dio_bio(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + + if (err) { + printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " + "sector %#Lx len %u err no %d\n", + (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw, + (unsigned long long)bio->bi_sector, bio->bi_size, err); + dip->errors = 1; + + /* + * before atomic variable goto zero, we must make sure + * dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + } + + /* if there are more bios still pending for this dio, just exit */ + if (!atomic_dec_and_test(&dip->pending_bios)) + goto out; + + if (dip->errors) + bio_io_error(dip->orig_bio); + else { + set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); + bio_endio(dip->orig_bio, 0); + } +out: + bio_put(bio); +} + +static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, + u64 first_sector, gfp_t gfp_flags) +{ + int nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); +} + +static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + int rw, u64 file_offset, int skip_sum, + u32 *csums, int async_submit) +{ + int write = rw & REQ_WRITE; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + bio_get(bio); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + + if (skip_sum) + goto map; + + if (write && async_submit) { + ret = btrfs_wq_submit_bio(root->fs_info, + inode, rw, bio, 0, 0, + file_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + goto err; + } else if (write) { + /* + * If we aren't doing async submit, calculate the csum of the + * bio now. 
+ */ + ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); + if (ret) + goto err; + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums_dio(root, inode, bio, + file_offset, csums); + if (ret) + goto err; + } + +map: + ret = btrfs_map_bio(root, rw, bio, 0, async_submit); +err: + bio_put(bio); + return ret; +} + +static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, + int skip_sum) +{ + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct bio *bio; + struct bio *orig_bio = dip->orig_bio; + struct bio_vec *bvec = orig_bio->bi_io_vec; + u64 start_sector = orig_bio->bi_sector; + u64 file_offset = dip->logical_offset; + u64 submit_len = 0; + u64 map_length; + int nr_pages = 0; + u32 *csums = dip->csums; + int ret = 0; + int async_submit = 0; + int write = rw & REQ_WRITE; + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(orig_bio); + return -EIO; + } + + if (map_length >= orig_bio->bi_size) { + bio = orig_bio; + goto submit; + } + + async_submit = 1; + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + atomic_inc(&dip->pending_bios); + + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { + if (unlikely(map_length < submit_len + bvec->bv_len || + bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len)) { + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the dip might get freed + * before we're done setting it up + */ + atomic_inc(&dip->pending_bios); + ret = __btrfs_submit_dio_bio(bio, inode, rw, + file_offset, skip_sum, + csums, async_submit); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } + + /* Write's use the ordered csums */ + if (!write && !skip_sum) + csums = csums + nr_pages; + start_sector += submit_len >> 9; + file_offset += submit_len; + + submit_len = 0; + nr_pages = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, + start_sector, GFP_NOFS); + if (!bio) + goto out_err; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + goto out_err; + } + } else { + submit_len += bvec->bv_len; + nr_pages ++; + bvec++; + } + } + +submit: + ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + csums, async_submit); + if (!ret) + return 0; + + bio_put(bio); +out_err: + dip->errors = 1; + /* + * before atomic variable goto zero, we must + * make sure dip->errors is perceived to be set. 
+ */ + smp_mb__before_atomic_dec(); + if (atomic_dec_and_test(&dip->pending_bios)) + bio_io_error(dip->orig_bio); + + /* bio_end_io() will handle error, so we needn't return it */ + return 0; +} + +static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio_vec *bvec = bio->bi_io_vec; + int skip_sum; + int write = rw & REQ_WRITE; + int ret = 0; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_ordered; + } + dip->csums = NULL; + + /* Write's use the ordered csum stuff, so we don't need dip->csums */ + if (!write && !skip_sum) { + dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); + if (!dip->csums) { + kfree(dip); + ret = -ENOMEM; + goto free_ordered; + } + } + + dip->private = bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + + dip->bytes = 0; + do { + dip->bytes += bvec->bv_len; + bvec++; + } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + + dip->disk_bytenr = (u64)bio->bi_sector << 9; + bio->bi_private = dip; + dip->errors = 0; + dip->orig_bio = bio; + atomic_set(&dip->pending_bios, 0); + + if (write) + bio->bi_end_io = btrfs_endio_direct_write; + else + bio->bi_end_io = btrfs_endio_direct_read; + + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + if (!ret) + return; +free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(bio, ret); +} + +static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + int seg; + int i; + size_t size; + unsigned long addr; + unsigned blocksize_mask = root->sectorsize - 1; + ssize_t retval = -EINVAL; + loff_t end = offset; + + if (offset & blocksize_mask) + goto out; + + /* Check the memory alignment. Blocks cannot straddle pages */ + for (seg = 0; seg < nr_segs; seg++) { + addr = (unsigned long)iov[seg].iov_base; + size = iov[seg].iov_len; + end += size; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + goto out; + + /* If this is a write we don't need to check anymore */ + if (rw & WRITE) + continue; + + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. 
+ */ + for (i = seg + 1; i < nr_segs; i++) { + if (iov[seg].iov_base == iov[i].iov_base) + goto out; + } + } + retval = 0; +out: + return retval; +} +static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + u64 lockstart, lockend; + ssize_t ret; + int writing = rw & WRITE; + int write_bits = 0; + size_t count = iov_length(iov, nr_segs); + + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, + offset, nr_segs)) { + return 0; + } + + lockstart = offset; + lockend = offset + count - 1; + + if (writing) { + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } + + while (1) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state, GFP_NOFS); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) + break; + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + + /* + * we don't use btrfs_set_extent_delalloc because we don't want + * the dirty or uptodate bits + */ + if (writing) { + write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DELALLOC, 0, NULL, &cached_state, + GFP_NOFS); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_LOCKED | write_bits, + 1, 0, &cached_state, GFP_NOFS); + goto out; + } + } + + free_extent_state(cached_state); + cached_state = NULL; + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (ret < 0 && ret != -EIOCBQUEUED) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { + /* + * We're falling back to buffered, unlock the section we didn't + * do IO on. 
+ */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } +out: + free_extent_state(cached_state); + return ret; +} + +static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); +} + +int btrfs_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btrfs_get_extent); +} + +static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_write_full_page(tree, page, btrfs_get_extent, wbc); +} + +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_writepages(tree, mapping, btrfs_get_extent, wbc); +} + +static int +btrfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_readpages(tree, mapping, pages, nr_pages, + btrfs_get_extent); +} +static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page, gfp_flags); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + return ret; +} + +static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + if (PageWriteback(page) || PageDirty(page)) + return 0; + return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); +} + +static void btrfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + + /* + * we have the page locked, so new writeback can't start, + * and the dirty bit won't be cleared while we are here. 
+ * + * Wait for IO on this page so that we can safely clear + * the PagePrivate2 bit and do ordered accounting + */ + wait_on_page_writeback(page); + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } + lock_extent_bits(tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); + if (ordered) { + /* + * IO on this page will never be started, so we need + * to account for any ordered extents now + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, + &cached_state, GFP_NOFS); + /* + * whoever cleared the private bit is responsible + * for the finish_ordered_io + */ + if (TestClearPagePrivate2(page)) { + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + } + btrfs_put_ordered_extent(ordered); + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + char *kaddr; + unsigned long zero_start; + loff_t size; + int ret; + u64 page_start; + u64 page_end; + + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; + goto out; + } + + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ +again: + lock_page(page); + size = i_size_read(inode); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); + set_page_extent_mapped(page); + + /* + * we can't set the delalloc bits if there are pending ordered + * extents. 
Drop our locks and wait for them to finish + */ + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + /* + * XXX - page_mkwrite gets called every time the page is dirtied, even + * if it was already dirty, so for space accounting reasons we need to + * clear any delalloc bits for the range we are fixing to save. There + * is probably a better way to do this, but for now keep consistent with + * prepare_pages in the normal write path. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + 0, 0, &cached_state, GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + &cached_state); + if (ret) { + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); + ret = VM_FAULT_SIGBUS; + goto out_unlock; + } + ret = 0; + + /* page is wholly or partially inside EOF */ + if (page_start + PAGE_CACHE_SIZE > size) + zero_start = size & ~PAGE_CACHE_MASK; + else + zero_start = PAGE_CACHE_SIZE; + + if (zero_start != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + SetPageUptodate(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + + unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); + +out_unlock: + if (!ret) + return VM_FAULT_LOCKED; + unlock_page(page); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out: + return ret; +} + +static int btrfs_truncate(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv; + int ret; + int err = 0; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 mask = root->sectorsize - 1; + + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return ret; + + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + + /* + * Yes ladies and gentlemen, this is indeed ugly. The fact is we have + * 3 things going on here: + * + * 1) We need to reserve space for our orphan item and the space to + * delete our orphan item. Lord knows we don't want to have a dangling + * orphan item because we didn't reserve space to remove it. + * + * 2) We need to reserve space to update our inode. + * + * 3) We need to have something to cache all the space that is going to + * be free'd up by the truncate operation, but also have some slack + * space reserved in case it uses space during the truncate (thank you + * very much snapshotting). + * + * And we need these to all be separate. The fact is we can use a lot of + * space doing the truncate, and we have no earthly idea how much space + * we will use, so we need the truncate reservation to be separate so it + * doesn't end up using space reserved for updating the inode or + * removing the orphan item.
We also need to be able to stop the + * transaction and start a new one, which means we need to be able to + * update the inode several times, and we have no way of knowing how + * many times that will be, so we can't just reserve 1 item for the + * entirety of the operation, so that has to be done separately as well. + * Then there is the orphan item, which does indeed need to be held on + * to for the whole operation, and we need nobody to touch this reserved + * space except the orphan code. + * + * So that leaves us with: + * + * 1) root->orphan_block_rsv - for the orphan deletion. + * 2) rsv - for the truncate reservation, which we will steal from the + * transaction reservation. + * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for + * updating the inode. + */ + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) + return -ENOMEM; + btrfs_add_durable_block_rsv(root->fs_info, rsv); + + trans = btrfs_start_transaction(root, 4); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + + /* + * Reserve space for the truncate process. Truncate should be adding + * space, but if there are snapshots it may end up using space. + */ + ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + BUG_ON(ret); + + ret = btrfs_orphan_add(trans, inode); + if (ret) { + btrfs_end_transaction(trans, root); + goto out; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + /* + * Ok so we've already migrated our bytes over for the truncate, so here + * just reserve the one slot we need for updating the inode. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + trans->block_rsv = rsv; + + /* + * setattr is responsible for setting the ordered_data_close flag, + * but that is only tested during the last file release. That + * could happen well after the next commit, leaving a great big + * window where new writes may get lost if someone chooses to write + * to this file after truncating to zero. + * + * The inode doesn't have any dirty data here, and so if we commit + * this is a noop. If someone immediately starts writing to the inode + * it is very likely we'll catch some of their writes in this + * transaction, and the commit will find this file on the ordered + * data list with good things to send down. + * + * This is a best effort solution; there is still a window where + * using truncate to replace the contents of the file will + * end up with a zero length file after a crash.
+ */ + if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + btrfs_add_ordered_operation(trans, root, inode); + + while (1) { + if (!trans) { + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + + ret = btrfs_truncate_reserve_metadata(trans, root, + rsv); + BUG_ON(ret); + + trans->block_rsv = rsv; + } + + ret = btrfs_truncate_inode_items(trans, root, inode, + inode->i_size, + BTRFS_EXTENT_DATA_KEY); + if (ret != -EAGAIN) { + err = ret; + break; + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + err = ret; + break; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + trans = NULL; + btrfs_btree_balance_dirty(root, nr); + } + + if (ret == 0 && inode->i_nlink > 0) { + trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_orphan_del(trans, inode); + if (ret) + err = ret; + } else if (ret && inode->i_nlink > 0) { + /* + * Failed to do the truncate, remove us from the in memory + * orphan list. + */ + ret = btrfs_orphan_del(NULL, inode); + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; + + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + +out: + btrfs_free_block_rsv(root, rsv); + + if (ret && !err) + err = ret; + + return err; +} + +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, u64 new_dirid) +{ + struct inode *inode; + int err; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, + new_dirid, S_IFDIR | 0700, &index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + + err = btrfs_update_inode(trans, new_root, inode); + BUG_ON(err); + + iput(inode); + return 0; +} + +/* helper function for file defrag and space balancing. 
This + * forces readahead on a given range of bytes in an inode + */ +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index) +{ + pgoff_t req_size = last_index - offset + 1; + + page_cache_sync_readahead(mapping, ra, file, offset, req_size); + return offset + req_size; +} + +struct inode *btrfs_alloc_inode(struct super_block *sb) +{ + struct btrfs_inode *ei; + struct inode *inode; + + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + ei->root = NULL; + ei->space_info = NULL; + ei->generation = 0; + ei->sequence = 0; + ei->last_trans = 0; + ei->last_sub_trans = 0; + ei->logged_trans = 0; + ei->delalloc_bytes = 0; + ei->reserved_bytes = 0; + ei->disk_i_size = 0; + ei->flags = 0; + ei->index_cnt = (u64)-1; + ei->last_unlink_trans = 0; + + atomic_set(&ei->outstanding_extents, 0); + atomic_set(&ei->reserved_extents, 0); + + ei->ordered_data_close = 0; + ei->orphan_meta_reserved = 0; + ei->dummy_inode = 0; + ei->in_defrag = 0; + ei->force_compress = BTRFS_COMPRESS_NONE; + + ei->delayed_node = NULL; + + inode = &ei->vfs_inode; + extent_map_tree_init(&ei->extent_tree); + extent_io_tree_init(&ei->io_tree, &inode->i_data); + extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + mutex_init(&ei->log_mutex); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->delalloc_inodes); + INIT_LIST_HEAD(&ei->ordered_operations); + RB_CLEAR_NODE(&ei->rb_node); + + return inode; +} + +static void btrfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + +void btrfs_destroy_inode(struct inode *inode) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + + WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(inode->i_data.nrpages); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); + WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); + + /* + * This can happen where we create an inode, but somebody else also + * created the same inode and we need to destroy the one we already + * created. + */ + if (!root) + goto free; + + /* + * Make sure we're properly removed from the ordered operation + * lists. 
+ */ + smp_mb(); + if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { + spin_lock(&root->fs_info->ordered_extent_lock); + list_del_init(&BTRFS_I(inode)->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + } + + spin_lock(&root->orphan_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", + (unsigned long long)btrfs_ino(inode)); + list_del_init(&BTRFS_I(inode)->i_orphan); + } + spin_unlock(&root->orphan_lock); + + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (!ordered) + break; + else { + printk(KERN_ERR "btrfs found ordered " + "extent %llu %llu on inode cleanup\n", + (unsigned long long)ordered->file_offset, + (unsigned long long)ordered->len); + btrfs_remove_ordered_extent(inode, ordered); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); +free: + btrfs_remove_delayed_node(inode); + call_rcu(&inode->i_rcu, btrfs_i_callback); +} + +int btrfs_drop_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_refs(&root->root_item) == 0 && + !is_free_space_inode(root, inode)) + return 1; + else + return generic_drop_inode(inode); +} + +static void init_once(void *foo) +{ + struct btrfs_inode *ei = (struct btrfs_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void btrfs_destroy_cachep(void) +{ + if (btrfs_inode_cachep) + kmem_cache_destroy(btrfs_inode_cachep); + if (btrfs_trans_handle_cachep) + kmem_cache_destroy(btrfs_trans_handle_cachep); + if (btrfs_transaction_cachep) + kmem_cache_destroy(btrfs_transaction_cachep); + if (btrfs_path_cachep) + kmem_cache_destroy(btrfs_path_cachep); + if (btrfs_free_space_cachep) + kmem_cache_destroy(btrfs_free_space_cachep); +} + +int btrfs_init_cachep(void) +{ + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + if (!btrfs_inode_cachep) + goto fail; + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + goto fail; + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_transaction_cachep) + goto fail; + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + goto fail; + + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", + sizeof(struct btrfs_free_space), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + goto fail; + + return 0; +fail: + btrfs_destroy_cachep(); + return -ENOMEM; +} + +static int btrfs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; + stat->blksize = PAGE_CACHE_SIZE; + stat->blocks = (inode_get_bytes(inode) + + BTRFS_I(inode)->delalloc_bytes) >> 9; + return 0; +} + +/* + * If a file is moved, it will inherit the cow and compression flags of the new + * directory. 
+ */ +static void fixup_inode_flags(struct inode *dir, struct inode *inode) +{ + struct btrfs_inode *b_dir = BTRFS_I(dir); + struct btrfs_inode *b_inode = BTRFS_I(inode); + + if (b_dir->flags & BTRFS_INODE_NODATACOW) + b_inode->flags |= BTRFS_INODE_NODATACOW; + else + b_inode->flags &= ~BTRFS_INODE_NODATACOW; + + if (b_dir->flags & BTRFS_INODE_COMPRESS) + b_inode->flags |= BTRFS_INODE_COMPRESS; + else + b_inode->flags &= ~BTRFS_INODE_COMPRESS; +} + +static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; + u64 root_objectid; + int ret; + u64 old_ino = btrfs_ino(old_inode); + + if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return -EPERM; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || + (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) + return -ENOTEMPTY; + + if (S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) + return -ENOTEMPTY; + /* + * we're using rename to replace one file with another. + * and the replacement file is large. Start IO on it now so + * we don't add too much work to the end of the transaction + */ + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && + old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + filemap_flush(old_inode->i_mapping); + + /* close the racy window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 20); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + if (dest != root) + btrfs_record_root_in_trans(trans, dest); + + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { + /* force full log commit if subvolume involved. */ + root->fs_info->last_trans_log_full_commit = trans->transid; + } else { + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_ino, + btrfs_ino(new_dir), index); + if (ret) + goto out_fail; + /* + * this is an ugly little race, but the rename is required + * to make sure that if we crash, the inode is either at the + * old name or the new one. pinning the log transaction lets + * us make sure we don't allow a log commit to come in after + * we unlink the name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + } + /* + * make sure the inode gets flushed if it is replacing + * something. 
+ */ + if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) + btrfs_add_ordered_operation(trans, root, old_inode); + + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + + if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + BUG_ON(ret); + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; + if (unlikely(btrfs_ino(new_inode) == + BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + root_objectid = BTRFS_I(new_inode)->location.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + BUG_ON(new_inode->i_nlink == 0); + } else { + ret = btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + } + BUG_ON(ret); + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); + BUG_ON(ret); + } + } + + fixup_inode_flags(new_dir, old_inode); + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, index); + BUG_ON(ret); + + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + struct dentry *parent = dget_parent(new_dentry); + btrfs_log_new_name(trans, old_inode, old_dir, parent); + dput(parent); + btrfs_end_log_trans(root); + } +out_fail: + btrfs_end_transaction_throttle(trans, root); +out_notrans: + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + return ret; +} + +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. 
+ */ +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ + struct list_head *head = &root->fs_info->delalloc_inodes; + struct btrfs_inode *binode; + struct inode *inode; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(head)) { + binode = list_entry(head->next, struct btrfs_inode, + delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (!inode) + list_del_init(&binode->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + if (inode) { + filemap_flush(inode->i_mapping); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + } + cond_resched(); + spin_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + return 0; +} + +static int btrfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + u64 index = 0 ; + int name_len; + int datasize; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; + unsigned long nr = 0; + + name_len = strlen(symname) + 1; + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + + /* + * 2 items for inode item and ref + * 2 items for dir items + * 1 item for xattr if selinux is on + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + err = btrfs_find_free_ino(root, &objectid); + if (err) + goto out_unlock; + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, btrfs_ino(dir), objectid, + S_IFLNK|S_IRWXUGO, &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + if (drop_inode) + goto out_unlock; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = btrfs_ino(inode); + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (err) { + drop_inode = 1; + btrfs_free_path(path); + goto out_unlock; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, + 
BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + + ptr = btrfs_file_extent_inline_start(ei); + write_extent_buffer(leaf, symname, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode_set_bytes(inode, name_len); + btrfs_i_size_write(inode, name_len - 1); + err = btrfs_update_inode(trans, root, inode); + if (err) + drop_inode = 1; + +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int __btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint, + struct btrfs_trans_handle *trans) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 cur_offset = start; + u64 i_size; + int ret = 0; + bool own_trans = true; + + if (trans) + own_trans = false; + while (num_bytes > 0) { + if (own_trans) { + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + } + + ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, + 0, *alloc_hint, (u64)-1, &ins, 1); + if (ret) { + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } + + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, + ins.offset, 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset -1, 0); + + num_bytes -= ins.offset; + cur_offset += ins.offset; + *alloc_hint = ins.objectid + ins.offset; + + inode->i_ctime = CURRENT_TIME; + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { + if (cur_offset > actual_len) + i_size = actual_len; + else + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); + } + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + if (own_trans) + btrfs_end_transaction(trans, root); + } + return ret; +} + +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, + NULL); +} + +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, trans); +} + +static int btrfs_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) + return -EROFS; + if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) + return -EACCES; + return generic_permission(inode, mask, flags, btrfs_check_acl); +} + +static 
const struct inode_operations btrfs_dir_inode_operations = { + .getattr = btrfs_getattr, + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename = btrfs_rename, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, + .mknod = btrfs_mknod, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, +}; +static const struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, +}; + +static const struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = btrfs_real_readdir, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif + .release = btrfs_release_file, + .fsync = btrfs_sync_file, +}; + +static struct extent_io_ops btrfs_extent_io_ops = { + .fill_delalloc = run_delalloc_range, + .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .writepage_end_io_hook = btrfs_writepage_end_io_hook, + .writepage_start_hook = btrfs_writepage_start_hook, + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, + .merge_extent_hook = btrfs_merge_extent_hook, + .split_extent_hook = btrfs_split_extent_hook, +}; + +/* + * btrfs doesn't support the bmap operation because swapfiles + * use bmap to make a mapping of extents in the file. They assume + * these extents won't change over the life of the file and they + * use the bmap result to do IO directly to the drive. + * + * the btrfs bmap call would return logical addresses that aren't + * suitable for IO and they also will change frequently as COW + * operations happen. So, swapfile + btrfs == corruption. + * + * For now we're avoiding this by dropping bmap. 
+ */ +static const struct address_space_operations btrfs_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .writepages = btrfs_writepages, + .readpages = btrfs_readpages, + .direct_IO = btrfs_direct_IO, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, + .set_page_dirty = btrfs_set_page_dirty, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations btrfs_symlink_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, +}; + +static const struct inode_operations btrfs_file_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, + .fiemap = btrfs_fiemap, +}; +static const struct inode_operations btrfs_special_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; +static const struct inode_operations btrfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = btrfs_getattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; + +const struct dentry_operations btrfs_dentry_operations = { + .d_delete = btrfs_dentry_delete, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 00000000..a3c4751e --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,2914 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "locking.h" +#include "inode-map.h" + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~FS_DIRSYNC_FL; + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* + * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. 
+ */ +static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +{ + unsigned int iflags = 0; + + if (flags & BTRFS_INODE_SYNC) + iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) + iflags |= FS_NODUMP_FL; + if (flags & BTRFS_INODE_NOATIME) + iflags |= FS_NOATIME_FL; + if (flags & BTRFS_INODE_DIRSYNC) + iflags |= FS_DIRSYNC_FL; + if (flags & BTRFS_INODE_NODATACOW) + iflags |= FS_NOCOW_FL; + + if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) + iflags |= FS_COMPR_FL; + else if (flags & BTRFS_INODE_NOCOMPRESS) + iflags |= FS_NOCOMP_FL; + + return iflags; +} + +/* + * Update inode->i_flags based on the btrfs internal flags. + */ +void btrfs_update_iflags(struct inode *inode) +{ + struct btrfs_inode *ip = BTRFS_I(inode); + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + + if (ip->flags & BTRFS_INODE_SYNC) + inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + if (ip->flags & BTRFS_INODE_APPEND) + inode->i_flags |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + inode->i_flags |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + inode->i_flags |= S_DIRSYNC; +} + +/* + * Inherit flags from the parent inode. + * + * Unlike extN we don't have any flags we don't want to inherit currently. + */ +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +{ + unsigned int flags; + + if (!dir) + return; + + flags = BTRFS_I(dir)->flags; + + if (S_ISREG(inode->i_mode)) + flags &= ~BTRFS_INODE_DIRSYNC; + else if (!S_ISDIR(inode->i_mode)) + flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + + BTRFS_I(inode)->flags = flags; + btrfs_update_iflags(inode); +} + +static int btrfs_ioctl_getflags(struct file *file, void __user *arg) +{ + struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); + unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int check_flags(unsigned int flags) +{ + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL | \ + FS_NOCOMP_FL | FS_COMPR_FL | + FS_NOCOW_FL)) + return -EOPNOTSUPP; + + if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) + return -EINVAL; + + return 0; +} + +static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_root *root = ip->root; + struct btrfs_trans_handle *trans; + unsigned int flags, oldflags; + int ret; + + if (btrfs_root_readonly(root)) + return -EROFS; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + ret = check_flags(flags); + if (ret) + return ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + mutex_lock(&inode->i_mutex); + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out_unlock; + + if (flags & FS_SYNC_FL) + ip->flags |= BTRFS_INODE_SYNC; + else + ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; + if (flags & FS_APPEND_FL) + ip->flags 
|= BTRFS_INODE_APPEND; + else + ip->flags &= ~BTRFS_INODE_APPEND; + if (flags & FS_NODUMP_FL) + ip->flags |= BTRFS_INODE_NODUMP; + else + ip->flags &= ~BTRFS_INODE_NODUMP; + if (flags & FS_NOATIME_FL) + ip->flags |= BTRFS_INODE_NOATIME; + else + ip->flags &= ~BTRFS_INODE_NOATIME; + if (flags & FS_DIRSYNC_FL) + ip->flags |= BTRFS_INODE_DIRSYNC; + else + ip->flags &= ~BTRFS_INODE_DIRSYNC; + if (flags & FS_NOCOW_FL) + ip->flags |= BTRFS_INODE_NODATACOW; + else + ip->flags &= ~BTRFS_INODE_NODATACOW; + + /* + * The COMPRESS flag can only be changed by users, while the NOCOMPRESS + * flag may be changed automatically if compression code won't make + * things smaller. + */ + if (flags & FS_NOCOMP_FL) { + ip->flags &= ~BTRFS_INODE_COMPRESS; + ip->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & FS_COMPR_FL) { + ip->flags |= BTRFS_INODE_COMPRESS; + ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + } else { + ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + } + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + btrfs_update_iflags(inode); + inode->i_ctime = CURRENT_TIME; + btrfs_end_transaction(trans, root); + + mnt_drop_write(file->f_path.mnt); + + ret = 0; + out_unlock: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + return put_user(inode->i_generation, arg); +} + +static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) +{ + struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device *device; + struct request_queue *q; + struct fstrim_range range; + u64 minlen = ULLONG_MAX; + u64 num_devices = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, + dev_list) { + if (!device->bdev) + continue; + q = bdev_get_queue(device->bdev); + if (blk_queue_discard(q)) { + num_devices++; + minlen = min((u64)q->limits.discard_granularity, + minlen); + } + } + rcu_read_unlock(); + if (!num_devices) + return -EOPNOTSUPP; + + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + + range.minlen = max(range.minlen, minlen); + ret = btrfs_trim_fs(root, &range); + if (ret < 0) + return ret; + + if (copy_to_user(arg, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +static noinline int create_subvol(struct btrfs_root *root, + struct dentry *dentry, + char *name, int namelen, + u64 *async_transid) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root; + struct dentry *parent = dget_parent(dentry); + struct inode *dir; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; + + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); + if (ret) { + dput(parent); + return ret; + } + + dir = parent->d_inode; + + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) { + dput(parent); + return PTR_ERR(trans); + } + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + memset_extent_buffer(leaf, 0, 0, 
sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + root_item.flags = 0; + root_item.byte_limit = 0; + inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, leaf->len); + btrfs_set_root_last_snapshot(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + key.offset = (u64)-1; + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(new_root)); + + btrfs_record_root_in_trans(trans, new_root); + + ret = btrfs_create_subvol_root(trans, new_root, new_dirid); + /* + * insert the directory item + */ + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + + ret = btrfs_insert_dir_item(trans, root, + name, namelen, dir, &key, + BTRFS_FT_DIR, index); + if (ret) + goto fail; + + btrfs_i_size_write(dir, dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + btrfs_ino(dir), index, name, namelen); + + BUG_ON(ret); + + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); +fail: + dput(parent); + if (async_transid) { + *async_transid = trans->transid; + err = btrfs_commit_transaction_async(trans, root, 1); + } else { + err = btrfs_commit_transaction(trans, root); + } + if (err && !ret) + ret = err; + return ret; +} + +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen, u64 *async_transid, + bool readonly) +{ + struct inode *inode; + struct dentry *parent; + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret; + + if (!root->ref_cows) + return -EINVAL; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) + return -ENOMEM; + + btrfs_init_block_rsv(&pending_snapshot->block_rsv); + pending_snapshot->dentry = dentry; + pending_snapshot->root = root; + pending_snapshot->readonly = readonly; + + trans = btrfs_start_transaction(root->fs_info->extent_root, 5); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fail; + } + + ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); + BUG_ON(ret); + + spin_lock(&root->fs_info->trans_lock); + 
list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + spin_unlock(&root->fs_info->trans_lock); + if (async_transid) { + *async_transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, + root->fs_info->extent_root, 1); + } else { + ret = btrfs_commit_transaction(trans, + root->fs_info->extent_root); + } + BUG_ON(ret); + + ret = pending_snapshot->error; + if (ret) + goto fail; + + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; + + parent = dget_parent(dentry); + inode = btrfs_lookup_dentry(parent->d_inode, dentry); + dput(parent); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto fail; + } + BUG_ON(!inode); + d_instantiate(dentry, inode); + ret = 0; +fail: + kfree(pending_snapshot); + return ret; +} + +/* copy of check_sticky in fs/namei.c() +* It's inline, so penalty for filesystems that don't use sticky bit is +* minimal. +*/ +static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) +{ + uid_t fsuid = current_fsuid(); + + if (!(dir->i_mode & S_ISVTX)) + return 0; + if (inode->i_uid == fsuid) + return 0; + if (dir->i_uid == fsuid) + return 0; + return !capable(CAP_FOWNER); +} + +/* copy of may_delete in fs/namei.c() + * Check whether we can remove a link victim from directory dir, check + * whether the type of victim is right. + * 1. We can't do it if dir is read-only (done in permission()) + * 2. We should have write and exec permissions on dir + * 3. We can't remove anything from append-only dir + * 4. We can't do anything with immutable dir (done in permission()) + * 5. If the sticky bit on dir is set we should either + * a. be owner of dir, or + * b. be owner of victim, or + * c. have CAP_FOWNER capability + * 6. If the victim is append-only or immutable we can't do antyhing with + * links pointing to it. + * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 9. We can't remove a root or mountpoint. + * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * nfs_async_unlink(). + */ + +static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) +{ + int error; + + if (!victim->d_inode) + return -ENOENT; + + BUG_ON(victim->d_parent->d_inode != dir); + audit_inode_child(victim, dir); + + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + return error; + if (IS_APPEND(dir)) + return -EPERM; + if (btrfs_check_sticky(dir, victim->d_inode)|| + IS_APPEND(victim->d_inode)|| + IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + return -EPERM; + if (isdir) { + if (!S_ISDIR(victim->d_inode->i_mode)) + return -ENOTDIR; + if (IS_ROOT(victim)) + return -EBUSY; + } else if (S_ISDIR(victim->d_inode->i_mode)) + return -EISDIR; + if (IS_DEADDIR(dir)) + return -ENOENT; + if (victim->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + return 0; +} + +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. 
+ */ +static noinline int btrfs_mksubvol(struct path *parent, + char *name, int namelen, + struct btrfs_root *snap_src, + u64 *async_transid, bool readonly) +{ + struct inode *dir = parent->dentry->d_inode; + struct dentry *dentry; + int error; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = -EEXIST; + if (dentry->d_inode) + goto out_dput; + + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + + error = btrfs_may_create(dir, dentry); + if (error) + goto out_drop_write; + + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + + if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) + goto out_up_read; + + if (snap_src) { + error = create_snapshot(snap_src, dentry, + name, namelen, async_transid, readonly); + } else { + error = create_subvol(BTRFS_I(dir)->root, dentry, + name, namelen, async_transid); + } + if (!error) + fsnotify_mkdir(dir, dentry); +out_up_read: + up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); +out_drop_write: + mnt_drop_write(parent->mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&dir->i_mutex); + return error; +} + +/* + * When we're defragging a range, we don't want to kick it off again + * if it is really just waiting for delalloc to send it down. + * If we find a nice big extent or delalloc range for the bytes in the + * file you want to defrag, we return 0 to let you know to skip this + * part of the file + */ +static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 end; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + if (em) { + end = extent_map_end(em); + free_extent_map(em); + if (end - offset > thresh) + return 0; + } + /* if we already have a nice delalloc here, just stop */ + thresh /= 2; + end = count_range_bits(io_tree, &offset, offset + thresh, + thresh, EXTENT_DELALLOC, 1); + if (end >= thresh) + return 0; + return 1; +} + +/* + * helper function to walk through a file and find extents + * newer than a specific transid, and smaller than thresh. 
+ * + * This is used by the defragging code to find new and small + * extents + */ +static int find_new_extents(struct btrfs_root *root, + struct inode *inode, u64 newer_than, + u64 *off, int thresh) +{ + struct btrfs_path *path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + int type; + int ret; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + min_key.objectid = ino; + min_key.type = BTRFS_EXTENT_DATA_KEY; + min_key.offset = *off; + + max_key.objectid = ino; + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + path->keep_locks = 1; + + while(1) { + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, newer_than); + if (ret != 0) + goto none; + if (min_key.objectid != ino) + goto none; + if (min_key.type != BTRFS_EXTENT_DATA_KEY) + goto none; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_num_bytes(leaf, extent) < thresh && + check_defrag_in_cache(inode, min_key.offset, thresh)) { + *off = min_key.offset; + btrfs_free_path(path); + return 0; + } + + if (min_key.offset == (u64)-1) + goto none; + + min_key.offset++; + btrfs_release_path(path); + } +none: + btrfs_free_path(path); + return -ENOENT; +} + +static int should_defrag_range(struct inode *inode, u64 start, u64 len, + int thresh, u64 *last_len, u64 *skip, + u64 *defrag_end) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 1; + + /* + * make sure that once we start defragging and extent, we keep on + * defragging it + */ + if (start < *defrag_end) + return 1; + + *skip = 0; + + /* + * hopefully we have this extent in the tree already, try without + * the full extent lock + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + read_unlock(&em_tree->lock); + + if (!em) { + /* get the big lock and read metadata off disk */ + lock_extent(io_tree, start, start + len - 1, GFP_NOFS); + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + unlock_extent(io_tree, start, start + len - 1, GFP_NOFS); + + if (IS_ERR(em)) + return 0; + } + + /* this will cover holes, and inline extents */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) + ret = 0; + + /* + * we hit a real extent, if it is big don't bother defragging it again + */ + if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) + ret = 0; + + /* + * last_len ends up being a counter of how many bytes we've defragged. + * every time we choose not to defrag an extent, we reset *last_len + * so that the next tiny extent will force a defrag. + * + * The end result of this is that tiny extents before a single big + * extent will force at least part of that big extent to be defragged. + */ + if (ret) { + *last_len += len; + *defrag_end = extent_map_end(em); + } else { + *last_len = 0; + *skip = extent_map_end(em); + *defrag_end = 0; + } + + free_extent_map(em); + return ret; +} + +/* + * it doesn't do much good to defrag one or two pages + * at a time. This pulls in a nice chunk of pages + * to COW and defrag. + * + * It also makes sure the delalloc code has enough + * dirty data to avoid making new small extents as part + * of the defrag + * + * It's a good idea to start RA on this range + * before calling this. 
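Before the clustering helper below, it is worth making the last_len bookkeeping in should_defrag_range() concrete: small extents accumulate into *last_len, and once such a run precedes a large extent, that large extent is defragged too so the small ones in front of it can be merged into it. A minimal stand-alone sketch of just that decision rule (the toy_* names and extent sizes are invented for illustration; this is not part of the patch):

#include <stdio.h>

/* toy model of the decision rule used by should_defrag_range() above */
struct toy_extent { unsigned long long len; };

static int toy_should_defrag(unsigned long long thresh,
                             unsigned long long *last_len,
                             unsigned long long len)
{
        int defrag = 1;

        /* a big extent is normally left alone, unless a run of small
         * extents (tracked in *last_len) immediately precedes it */
        if ((*last_len == 0 || *last_len >= thresh) && len >= thresh)
                defrag = 0;

        if (defrag)
                *last_len += len;       /* keep counting the small run */
        else
                *last_len = 0;          /* big extent skipped, reset the run */
        return defrag;
}

int main(void)
{
        struct toy_extent file[] = { {4096}, {8192}, {4096}, {1048576}, {2097152} };
        unsigned long long thresh = 256 * 1024, last_len = 0;
        unsigned int i;

        for (i = 0; i < sizeof(file) / sizeof(file[0]); i++)
                printf("extent %u (%llu bytes): %s\n", i, file[i].len,
                       toy_should_defrag(thresh, &last_len, file[i].len) ?
                       "defrag" : "skip");
        return 0;
}

With a 256KiB threshold, the 1MiB extent that follows the run of 4-8KiB extents is reported as "defrag", while the 2MiB extent after it is skipped.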
+ */ +static int cluster_pages_for_defrag(struct inode *inode, + struct page **pages, + unsigned long start_index, + int num_pages) +{ + unsigned long file_end; + u64 isize = i_size_read(inode); + u64 page_start; + u64 page_end; + int ret; + int i; + int i_done; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + + if (isize == 0) + return 0; + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + + ret = btrfs_delalloc_reserve_space(inode, + num_pages << PAGE_CACHE_SHIFT); + if (ret) + return ret; +again: + ret = 0; + i_done = 0; + + /* step one, lock all the pages */ + for (i = 0; i < num_pages; i++) { + struct page *page; + page = grab_cache_page(inode->i_mapping, + start_index + i); + if (!page) + break; + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + break; + } + } + isize = i_size_read(inode); + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + if (!isize || page->index > file_end || + page->mapping != inode->i_mapping) { + /* whoops, we blew past eof, skip this page */ + unlock_page(page); + page_cache_release(page); + break; + } + pages[i] = page; + i_done++; + } + if (!i_done || ret) + goto out; + + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + goto out; + + /* + * so now we have a nice long stream of locked + * and up to date pages, lets wait on them + */ + for (i = 0; i < i_done; i++) + wait_on_page_writeback(pages[i]); + + page_start = page_offset(pages[0]); + page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, 0, &cached_state, + GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1); + if (ordered && + ordered->file_offset + ordered->len > page_start && + ordered->file_offset < page_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, + &cached_state, GFP_NOFS); + for (i = 0; i < i_done; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, page_start, + page_end - page_start); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, + page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, + GFP_NOFS); + + if (i_done != num_pages) { + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + btrfs_delalloc_release_space(inode, + (num_pages - i_done) << PAGE_CACHE_SHIFT); + } + + + btrfs_set_extent_delalloc(inode, page_start, page_end - 1, + &cached_state); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, &cached_state, + GFP_NOFS); + + for (i = 0; i < i_done; i++) { + clear_page_dirty_for_io(pages[i]); + ClearPageChecked(pages[i]); + set_page_extent_mapped(pages[i]); + set_page_dirty(pages[i]); + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + return i_done; +out: + for (i = 0; i < i_done; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); + return ret; + +} + +int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_super_block *disk_super; + struct file_ra_state *ra = NULL; + unsigned long last_index; + u64 features; + 
u64 last_len = 0; + u64 skip = 0; + u64 defrag_end = 0; + u64 newer_off = range->start; + int newer_left = 0; + unsigned long i; + int ret; + int defrag_count = 0; + int compress_type = BTRFS_COMPRESS_ZLIB; + int extent_thresh = range->extent_thresh; + int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + u64 new_align = ~((u64)128 * 1024 - 1); + struct page **pages = NULL; + + if (extent_thresh == 0) + extent_thresh = 256 * 1024; + + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (range->compress_type > BTRFS_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } + + if (inode->i_size == 0) + return 0; + + /* + * if we were not given a file, allocate a readahead + * context + */ + if (!file) { + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + file_ra_state_init(ra, inode->i_mapping); + } else { + ra = &file->f_ra; + } + + pages = kmalloc(sizeof(struct page *) * newer_cluster, + GFP_NOFS); + if (!pages) { + ret = -ENOMEM; + goto out_ra; + } + + /* find the last page to defrag */ + if (range->start + range->len > range->start) { + last_index = min_t(u64, inode->i_size - 1, + range->start + range->len - 1) >> PAGE_CACHE_SHIFT; + } else { + last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + } + + if (newer_than) { + ret = find_new_extents(root, inode, newer_than, + &newer_off, 64 * 1024); + if (!ret) { + range->start = newer_off; + /* + * we always align our defrag to help keep + * the extents in the file evenly spaced + */ + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + newer_left = newer_cluster; + } else + goto out_ra; + } else { + i = range->start >> PAGE_CACHE_SHIFT; + } + if (!max_to_defrag) + max_to_defrag = last_index - 1; + + while (i <= last_index && defrag_count < max_to_defrag) { + /* + * make sure we stop running if someone unmounts + * the FS + */ + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + break; + + if (!newer_than && + !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, + extent_thresh, + &last_len, &skip, + &defrag_end)) { + unsigned long next; + /* + * the should_defrag function tells us how much to skip + * bump our counter by the suggested amount + */ + next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + i = max(i + 1, next); + continue; + } + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) + BTRFS_I(inode)->force_compress = compress_type; + + btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); + + ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); + if (ret < 0) + goto out_ra; + + defrag_count += ret; + balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); + i += ret; + + if (newer_than) { + if (newer_off == (u64)-1) + break; + + newer_off = max(newer_off + 1, + (u64)i << PAGE_CACHE_SHIFT); + + ret = find_new_extents(root, inode, + newer_than, &newer_off, + 64 * 1024); + if (!ret) { + range->start = newer_off; + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + newer_left = newer_cluster; + } else { + break; + } + } else { + i++; + } + } + + if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) + filemap_flush(inode->i_mapping); + + if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + 
wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + mutex_unlock(&inode->i_mutex); + } + + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (range->compress_type == BTRFS_COMPRESS_LZO) { + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); + } + + if (!file) + kfree(ra); + return defrag_count; + +out_ra: + if (!file) + kfree(ra); + kfree(pages); + return ret; +} + +static noinline int btrfs_ioctl_resize(struct btrfs_root *root, + void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; + char *sizestr; + char *devstr = NULL; + int ret = 0; + int mod = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + mutex_lock(&root->fs_info->volume_mutex); + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); + } + device = btrfs_find_device(root, devid, NULL, NULL); + if (!device) { + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); + ret = -EINVAL; + goto out_unlock; + } + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = memparse(sizestr, NULL); + if (new_size == 0) { + ret = -EINVAL; + goto out_unlock; + } + } + + old_size = device->total_bytes; + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_unlock; + } + new_size = old_size - new_size; + } else if (mod > 0) { + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_unlock; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_unlock; + } + + do_div(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk(KERN_INFO "new size for %s is %llu\n", + device->name, (unsigned long long)new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_shrink_device(device, new_size); + } + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create_transid(struct file *file, + char *name, + unsigned long fd, + int subvol, + u64 *transid, + bool readonly) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct file *src_file; + int namelen; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + namelen = strlen(name); + if (strchr(name, '/')) { + ret = -EINVAL; + goto out; + } + + if 
(subvol) { + ret = btrfs_mksubvol(&file->f_path, name, namelen, + NULL, transid, readonly); + } else { + struct inode *src_inode; + src_file = fget(fd); + if (!src_file) { + ret = -EINVAL; + goto out; + } + + src_inode = src_file->f_path.dentry->d_inode; + if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { + printk(KERN_INFO "btrfs: Snapshot src from " + "another FS\n"); + ret = -EINVAL; + fput(src_file); + goto out; + } + ret = btrfs_mksubvol(&file->f_path, name, namelen, + BTRFS_I(src_inode)->root, + transid, readonly); + fput(src_file); + } +out: + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + NULL, false); + + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + u64 transid = 0; + u64 *ptr = NULL; + bool readonly = false; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + + if (vol_args->flags & + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + ptr, readonly); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; +out: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_subvol_getflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&root->fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= BTRFS_SUBVOL_RDONLY; + up_read(&root->fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; + + return ret; +} + +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) + return -EINVAL; + + if (flags & ~BTRFS_SUBVOL_RDONLY) + return -EOPNOTSUPP; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + down_write(&root->fs_info->subvol_sem); + + /* nothing to do */ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out; + + root_flags = btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + else + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + + trans = 
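A hedged user-space sketch of driving the snapshot path above through BTRFS_IOC_SNAP_CREATE_V2: the ioctl is issued on the destination directory and args.fd names the subvolume to snapshot. The struct and flag definitions come from the fs/btrfs/ioctl.h added later in this patch; the local header name "btrfs_ioctl.h" and the /mnt/btrfs paths are assumptions.

/* sketch: create a read-only snapshot of the subvolume open on src,
 * named "snap", inside the directory open on dir */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

static int make_ro_snapshot(int dest_dir_fd, int src_fd, const char *name)
{
        struct btrfs_ioctl_vol_args_v2 args;

        memset(&args, 0, sizeof(args));
        args.fd = src_fd;                      /* subvolume to snapshot */
        args.flags = BTRFS_SUBVOL_RDONLY;      /* handled by snap_create_v2() */
        strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);

        /* the ioctl is issued on the destination directory */
        return ioctl(dest_dir_fd, BTRFS_IOC_SNAP_CREATE_V2, &args);
}

int main(void)
{
        int dir = open("/mnt/btrfs", O_RDONLY);          /* assumption */
        int src = open("/mnt/btrfs/subvol", O_RDONLY);   /* assumption */

        if (dir < 0 || src < 0 || make_ro_snapshot(dir, src, "snap") < 0)
                perror("snapshot");
        return 0;
}

A plain empty subvolume would instead use BTRFS_IOC_SUBVOL_CREATE with struct btrfs_ioctl_vol_args, which takes the create_subvol() branch above.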
btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + + btrfs_commit_transaction(trans, root); +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out: + up_write(&root->fs_info->subvol_sem); + return ret; +} + +/* + * helper to check if the subvolume references other subvolumes + */ +static noinline int may_destroy_subvol(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == root->root_key.objectid && + key.type == BTRFS_ROOT_REF_KEY) + ret = -ENOTEMPTY; + } +out: + btrfs_free_path(path); + return ret; +} + +static noinline int key_in_sk(struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk) +{ + struct btrfs_key test; + int ret; + + test.objectid = sk->min_objectid; + test.type = sk->min_type; + test.offset = sk->min_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret < 0) + return 0; + + test.objectid = sk->max_objectid; + test.type = sk->max_type; + test.offset = sk->max_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret > 0) + return 0; + return 1; +} + +static noinline int copy_to_sk(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk, + char *buf, + unsigned long *sk_offset, + int *num_found) +{ + u64 found_transid; + struct extent_buffer *leaf; + struct btrfs_ioctl_search_header sh; + unsigned long item_off; + unsigned long item_len; + int nritems; + int i; + int slot; + int ret = 0; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + + if (btrfs_header_generation(leaf) > sk->max_transid) { + i = nritems; + goto advance_key; + } + found_transid = btrfs_header_generation(leaf); + + for (i = slot; i < nritems; i++) { + item_off = btrfs_item_ptr_offset(leaf, i); + item_len = btrfs_item_size_nr(leaf, i); + + if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + item_len = 0; + + if (sizeof(sh) + item_len + *sk_offset > + BTRFS_SEARCH_ARGS_BUFSIZE) { + ret = 1; + goto overflow; + } + + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + sh.objectid = key->objectid; + sh.offset = key->offset; + sh.type = key->type; + sh.len = item_len; + sh.transid = found_transid; + + /* copy search result header */ + memcpy(buf + *sk_offset, &sh, sizeof(sh)); + *sk_offset += sizeof(sh); + + if (item_len) { + char *p = buf + *sk_offset; + /* copy the item */ + read_extent_buffer(leaf, p, + item_off, item_len); + *sk_offset += item_len; + } + (*num_found)++; + + if (*num_found >= sk->nr_items) + break; + } +advance_key: + ret = 0; + if (key->offset < (u64)-1 && key->offset < sk->max_offset) + key->offset++; + else if (key->type < (u8)-1 && key->type < sk->max_type) { + key->offset = 0; + key->type++; + } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) { + key->offset = 0; + key->type = 0; + key->objectid++; + } else + ret = 1; +overflow: + return ret; +} + +static noinline int search_ioctl(struct inode 
*inode, + struct btrfs_ioctl_search_args *args) +{ + struct btrfs_root *root; + struct btrfs_key key; + struct btrfs_key max_key; + struct btrfs_path *path; + struct btrfs_ioctl_search_key *sk = &args->key; + struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; + int ret; + int num_found = 0; + unsigned long sk_offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (sk->tree_id == 0) { + /* search the root of the inode that was passed */ + root = BTRFS_I(inode)->root; + } else { + key.objectid = sk->tree_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(info, &key); + if (IS_ERR(root)) { + printk(KERN_ERR "could not find root %llu\n", + sk->tree_id); + btrfs_free_path(path); + return -ENOENT; + } + } + + key.objectid = sk->min_objectid; + key.type = sk->min_type; + key.offset = sk->min_offset; + + max_key.objectid = sk->max_objectid; + max_key.type = sk->max_type; + max_key.offset = sk->max_offset; + + path->keep_locks = 1; + + while(1) { + ret = btrfs_search_forward(root, &key, &max_key, path, 0, + sk->min_transid); + if (ret != 0) { + if (ret > 0) + ret = 0; + goto err; + } + ret = copy_to_sk(root, path, &key, sk, args->buf, + &sk_offset, &num_found); + btrfs_release_path(path); + if (ret || num_found >= sk->nr_items) + break; + + } + ret = 0; +err: + sk->nr_items = num_found; + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_tree_search(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_search_args *args; + struct inode *inode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + inode = fdentry(file)->d_inode; + ret = search_ioctl(inode, args); + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + kfree(args); + return ret; +} + +/* + * Search INODE_REFs to identify path name of 'dirid' directory + * in a 'tree_id' tree. and sets path name to 'name'. 
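The tree-search ioctl above packs btrfs_ioctl_search_header records, each immediately followed by its raw item bytes, into the 4k args->buf, and reports the count back through key.nr_items. A hedged user-space sketch of issuing one search and walking the returned buffer (struct definitions from the ioctl.h added later in this patch; the local header name and mount point are assumptions):

/* sketch: dump the first buffer-full of items from the subvolume tree
 * that the open file descriptor belongs to (tree_id 0, see search_ioctl) */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

int main(void)
{
        struct btrfs_ioctl_search_args args;
        struct btrfs_ioctl_search_header sh;
        unsigned long off = 0;
        __u32 i;
        int fd = open("/mnt/btrfs", O_RDONLY);   /* mount point: assumption */

        if (fd < 0)
                return 1;

        memset(&args, 0, sizeof(args));
        /* min/max describe one lexicographic key range (see key_in_sk()),
         * so this asks for every key; filter on sh.type afterwards if needed */
        args.key.tree_id = 0;                    /* tree of the fd's subvolume */
        args.key.max_objectid = (__u64)-1;
        args.key.max_type = (__u32)-1;
        args.key.max_offset = (__u64)-1;
        args.key.max_transid = (__u64)-1;
        args.key.nr_items = (__u32)-1;           /* as many as fit in buf */

        if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
                return 1;

        /* on return, nr_items holds how many headers copy_to_sk() packed in;
         * each header is immediately followed by sh.len bytes of raw item */
        for (i = 0; i < args.key.nr_items; i++) {
                memcpy(&sh, args.buf + off, sizeof(sh));
                printf("objectid %llu type %u len %u\n",
                       (unsigned long long)sh.objectid, sh.type, sh.len);
                off += sizeof(sh) + sh.len;
        }
        return 0;
}

A complete walk of a large tree would repeat the call, restarting the min key just past the last key returned in the previous batch.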
+ */ +static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, + u64 tree_id, u64 dirid, char *name) +{ + struct btrfs_root *root; + struct btrfs_key key; + char *ptr; + int ret = -1; + int slot; + int len; + int total_len = 0; + struct btrfs_inode_ref *iref; + struct extent_buffer *l; + struct btrfs_path *path; + + if (dirid == BTRFS_FIRST_FREE_OBJECTID) { + name[0]='\0'; + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX]; + + key.objectid = tree_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(info, &key); + if (IS_ERR(root)) { + printk(KERN_ERR "could not find root %llu\n", tree_id); + ret = -ENOENT; + goto out; + } + + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + while(1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + if (ret > 0 && slot > 0) + slot--; + btrfs_item_key_to_cpu(l, &key, slot); + + if (ret > 0 && (key.objectid != dirid || + key.type != BTRFS_INODE_REF_KEY)) { + ret = -ENOENT; + goto out; + } + + iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(l, iref); + ptr -= len + 1; + total_len += len + 1; + if (ptr < name) + goto out; + + *(ptr + len) = '/'; + read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len); + + if (key.offset == BTRFS_FIRST_FREE_OBJECTID) + break; + + btrfs_release_path(path); + key.objectid = key.offset; + key.offset = (u64)-1; + dirid = key.objectid; + + } + if (ptr < name) + goto out; + memcpy(name, ptr, total_len); + name[total_len]='\0'; + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_ino_lookup(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_ino_lookup_args *args; + struct inode *inode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + inode = fdentry(file)->d_inode; + + if (args->treeid == 0) + args->treeid = BTRFS_I(inode)->root->root_key.objectid; + + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, + args->treeid, args->objectid, + args->name); + + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + + kfree(args); + return ret; +} + +static noinline int btrfs_ioctl_snap_destroy(struct file *file, + void __user *arg) +{ + struct dentry *parent = fdentry(file); + struct dentry *dentry; + struct inode *dir = parent->d_inode; + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *dest = NULL; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + int namelen; + int ret; + int err = 0; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/') || + strncmp(vol_args->name, "..", namelen) == 0) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(vol_args->name, parent, namelen); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_unlock_dir; + } + + if (!dentry->d_inode) { + err = -ENOENT; + goto out_dput; + } + + inode = dentry->d_inode; + dest = BTRFS_I(inode)->root; + if 
(!capable(CAP_SYS_ADMIN)){ + /* + * Regular user. Only allow this with a special mount + * option, when the user has write+exec access to the + * subvol root, and when rmdir(2) would have been + * allowed. + * + * Note that this is _not_ check that the subvol is + * empty or doesn't contain data that we wouldn't + * otherwise be able to delete. + * + * Users who want to delete empty subvols should try + * rmdir(2). + */ + err = -EPERM; + if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + goto out_dput; + + /* + * Do not allow deletion if the parent dir is the same + * as the dir to be deleted. That means the ioctl + * must be called on the dentry referencing the root + * of the subvol, not a random directory contained + * within it. + */ + err = -EINVAL; + if (root == dest) + goto out_dput; + + err = inode_permission(inode, MAY_WRITE | MAY_EXEC); + if (err) + goto out_dput; + + /* check if subvolume may be deleted by a non-root user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + } + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out_dput; + } + + mutex_lock(&inode->i_mutex); + err = d_invalidate(dentry); + if (err) + goto out_unlock; + + down_write(&root->fs_info->subvol_sem); + + err = may_destroy_subvol(dest); + if (err) + goto out_up_write; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_up_write; + } + trans->block_rsv = &root->fs_info->global_block_rsv; + + ret = btrfs_unlink_subvol(trans, root, dir, + dest->root_key.objectid, + dentry->d_name.name, + dentry->d_name.len); + BUG_ON(ret); + + btrfs_record_root_in_trans(trans, dest); + + memset(&dest->root_item.drop_progress, 0, + sizeof(dest->root_item.drop_progress)); + dest->root_item.drop_level = 0; + btrfs_set_root_refs(&dest->root_item, 0); + + if (!xchg(&dest->orphan_item_inserted, 1)) { + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + BUG_ON(ret); + } + + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + inode->i_flags |= S_DEAD; +out_up_write: + up_write(&root->fs_info->subvol_sem); +out_unlock: + mutex_unlock(&inode->i_mutex); + if (!err) { + shrink_dcache_sb(root->fs_info->sb); + btrfs_invalidate_inodes(dest); + d_delete(dentry); + } +out_dput: + dput(dentry); +out_unlock_dir: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(file->f_path.mnt); +out: + kfree(vol_args); + return err; +} + +static int btrfs_ioctl_defrag(struct file *file, void __user *argp) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_defrag_range_args *range; + int ret; + + if (btrfs_root_readonly(root)) + return -EROFS; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_defrag_root(root, 0); + if (ret) + goto out; + ret = btrfs_defrag_root(root->fs_info->extent_root, 0); + break; + case S_IFREG: + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EINVAL; + goto out; + } + + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + ret = -ENOMEM; + goto out; + } + + if (argp) { + if (copy_from_user(range, argp, + sizeof(*range))) { + ret = -EFAULT; + kfree(range); + goto out; + } + /* compression requires us to start the IO */ + if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + range->flags |= BTRFS_DEFRAG_RANGE_START_IO; + range->extent_thresh = (u32)-1; + } + 
} else { + /* the rest are all set to zero by kzalloc */ + range->len = (u64)-1; + } + ret = btrfs_defrag_file(fdentry(file)->d_inode, file, + range, 0, 0); + if (ret > 0) + ret = 0; + kfree(range); + break; + default: + ret = -EINVAL; + } +out: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(root, vol_args->name); + + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name); + + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_fs_info_args *fi_args; + struct btrfs_device *device; + struct btrfs_device *next; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); + if (!fi_args) + return -ENOMEM; + + fi_args->num_devices = fs_devices->num_devices; + memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (device->devid > fi_args->max_id) + fi_args->max_id = device->devid; + } + mutex_unlock(&fs_devices->device_list_mutex); + + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) + ret = -EFAULT; + + kfree(fi_args); + return ret; +} + +static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_info_args *di_args; + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; + char *s_uuid = NULL; + char empty_uuid[BTRFS_UUID_SIZE] = {0}; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + di_args = memdup_user(arg, sizeof(*di_args)); + if (IS_ERR(di_args)) + return PTR_ERR(di_args); + + if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0) + s_uuid = di_args->uuid; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + ret = -ENODEV; + goto out; + } + + di_args->devid = dev->devid; + di_args->bytes_used = dev->bytes_used; + di_args->total_bytes = dev->total_bytes; + memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); + strncpy(di_args->path, dev->name, sizeof(di_args->path)); + +out: + if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) + ret = -EFAULT; + + kfree(di_args); + return ret; +} + +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct extent_buffer 
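btrfs_ioctl_fs_info() above only reports the device count and the highest devid in use, so user space that wants per-device details probes each devid up to max_id with BTRFS_IOC_DEV_INFO and skips the gaps. A hedged sketch of that pattern (args structs from the ioctl.h added later in this patch; the local header name and mount point are assumptions):

/* sketch: enumerate devices of a mounted btrfs via FS_INFO + DEV_INFO */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

int main(void)
{
        struct btrfs_ioctl_fs_info_args fi;
        struct btrfs_ioctl_dev_info_args di;
        __u64 devid;
        int fd = open("/mnt/btrfs", O_RDONLY);   /* mount point: assumption */

        if (fd < 0 || ioctl(fd, BTRFS_IOC_FS_INFO, &fi) < 0)
                return 1;

        printf("%llu device(s), highest devid %llu\n",
               (unsigned long long)fi.num_devices,
               (unsigned long long)fi.max_id);

        for (devid = 1; devid <= fi.max_id; devid++) {
                memset(&di, 0, sizeof(di));
                di.devid = devid;           /* uuid left zeroed: match by devid */
                if (ioctl(fd, BTRFS_IOC_DEV_INFO, &di) < 0) {
                        if (errno == ENODEV)
                                continue;   /* devid not in use, keep probing */
                        return 1;
                }
                printf("devid %llu: %s, %llu / %llu bytes used\n",
                       (unsigned long long)di.devid, (char *)di.path,
                       (unsigned long long)di.bytes_used,
                       (unsigned long long)di.total_bytes);
        }
        return 0;
}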
*leaf; + char *buf; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + u64 hint_byte; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? + */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) + return -EINVAL; + + if (btrfs_root_readonly(root)) + return -EROFS; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + src_file = fget(srcfd); + if (!src_file) { + ret = -EBADF; + goto out_drop_write; + } + + src = src_file->f_dentry->d_inode; + + ret = -EINVAL; + if (src == inode) + goto out_fput; + + /* the src must be open for reading */ + if (!(src_file->f_mode & FMODE_READ)) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) + goto out_fput; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + goto out_fput; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + goto out_fput; + } + path->reada = 2; + + if (inode < src) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off + len > src->i_size || off + len < off) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(src, off+len); + if (!ordered && + !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, + EXTENT_DELALLOC, 0, NULL)) + break; + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(src, off, len); + } + + /* clone data */ + key.objectid = btrfs_ino(src); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + while (1) { + /* + * note the key will change type as we walk through the + * tree. 
+ */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != btrfs_ino(src)) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + u64 endoff; + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, + extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, + extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, + extent); + } + btrfs_release_path(path); + + if (key.offset + datal <= off || + key.offset >= off+len) + goto next; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = btrfs_ino(inode); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + BUG_ON(ret); + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; + + btrfs_set_file_extent_offset(leaf, extent, + datao); + btrfs_set_file_extent_num_bytes(leaf, extent, + datal); + if (disko) { + inode_add_bytes(inode, datal); + ret = btrfs_inc_extent_ref(trans, root, + disko, diskl, 0, + root->root_key.objectid, + btrfs_ino(inode), + new_key.offset - datao); + BUG_ON(ret); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } + + if (key.offset + datal > off+len) + trim = key.offset + datal - (off+len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + btrfs_end_transaction(trans, root); + goto out; + } + size -= skip + trim; + datal -= skip + trim; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + BUG_ON(ret); + + if (skip) { + u32 
start = + btrfs_file_extent_calc_inline_size(0); + memmove(buf+start, buf+start+skip, + datal); + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + inode_add_bytes(inode, datal); + } + + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + /* + * we round up to the block size at eof when + * determining which extents to clone above, + * but shouldn't round up the file size + */ + endoff = new_key.offset + datal; + if (endoff > destoff+olen) + endoff = destoff+olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); + } +next: + btrfs_release_path(path); + key.offset++; + } + ret = 0; +out: + btrfs_release_path(path); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + vfree(buf); + btrfs_free_path(path); +out_fput: + fput(src_file); +out_drop_write: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. 
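The clone path above requires the destination to be open for writing, both files to live on the same btrfs, and off/len/destoff to be block aligned (it only rounds up by itself when the source range ends at EOF; a length of 0 means "from src_offset through EOF"). A hedged user-space sketch of BTRFS_IOC_CLONE_RANGE under those rules (clone_range_args comes from the ioctl.h added later in this patch; the local header name and file paths are assumptions):

/* sketch: reflink all of src into dst starting at offset 0; extents are
 * shared, no data is copied */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

int main(void)
{
        struct btrfs_ioctl_clone_range_args args;
        int src = open("/mnt/btrfs/src", O_RDONLY);                /* assumption */
        int dst = open("/mnt/btrfs/dst", O_WRONLY | O_CREAT, 0644); /* assumption */

        if (src < 0 || dst < 0)
                return 1;

        memset(&args, 0, sizeof(args));
        args.src_fd = src;
        args.src_offset = 0;     /* must be block aligned */
        args.src_length = 0;     /* 0: clone from src_offset through EOF */
        args.dest_offset = 0;    /* must be block aligned */

        /* the ioctl is issued on the destination file */
        if (ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args) < 0) {
                perror("clone range");
                return 1;
        }
        return 0;
}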
+ */ +static long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret; + + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EINPROGRESS; + if (file->private_data) + goto out; + + ret = -EROFS; + if (btrfs_root_readonly(root)) + goto out; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out; + + atomic_inc(&root->fs_info->open_ioctl_trans); + + ret = -ENOMEM; + trans = btrfs_start_ioctl_transaction(root); + if (IS_ERR(trans)) + goto out_drop; + + file->private_data = trans; + return 0; + +out_drop: + atomic_dec(&root->fs_info->open_ioctl_trans); + mnt_drop_write(file->f_path.mnt); +out: + return ret; +} + +static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *new_root; + struct btrfs_dir_item *di; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_super_block *disk_super; + u64 features; + u64 objectid = 0; + u64 dir_id; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&objectid, argp, sizeof(objectid))) + return -EFAULT; + + if (!objectid) + objectid = root->root_key.objectid; + + location.objectid = objectid; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + if (IS_ERR(new_root)) + return PTR_ERR(new_root); + + if (btrfs_root_refs(&new_root->root_item) == 0) + return -ENOENT; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, + dir_id, "default", 7, 1); + if (IS_ERR_OR_NULL(di)) { + btrfs_free_path(path); + btrfs_end_transaction(trans, root); + printk(KERN_ERR "Umm, you don't have the default dir item, " + "this isn't going to work\n"); + return -ENOENT; + } + + btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); + btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { + features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; + btrfs_set_super_incompat_flags(disk_super, features); + } + btrfs_end_transaction(trans, root); + + return 0; +} + +static void get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) +{ + struct btrfs_block_group_cache *block_group; + + space->total_bytes = 0; + space->used_bytes = 0; + space->flags = 0; + list_for_each_entry(block_group, groups_list, list) { + space->flags = block_group->flags; + space->total_bytes += block_group->key.offset; + space->used_bytes += + btrfs_block_group_used(&block_group->item); + } +} + +long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_info space; + struct btrfs_ioctl_space_info *dest; + struct btrfs_ioctl_space_info *dest_orig; + struct btrfs_ioctl_space_info __user *user_dest; + 
struct btrfs_space_info *info; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; + int alloc_size; + int ret = 0; + u64 slot_count = 0; + int i, c; + + if (copy_from_user(&space_args, + (struct btrfs_ioctl_space_args __user *)arg, + sizeof(space_args))) + return -EFAULT; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) + slot_count++; + } + up_read(&info->groups_sem); + } + + /* space_slots == 0 means they are asking for a count */ + if (space_args.space_slots == 0) { + space_args.total_spaces = slot_count; + goto out; + } + + slot_count = min_t(u64, space_args.space_slots, slot_count); + + alloc_size = sizeof(*dest) * slot_count; + + /* we generally have at most 6 or so space infos, one for each raid + * level. So, a whole page should be more than enough for everyone + */ + if (alloc_size > PAGE_CACHE_SIZE) + return -ENOMEM; + + space_args.total_spaces = 0; + dest = kmalloc(alloc_size, GFP_NOFS); + if (!dest) + return -ENOMEM; + dest_orig = dest; + + /* now we have a buffer to copy into */ + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + if (!slot_count) + break; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) { + get_block_group_info(&info->block_groups[c], + &space); + memcpy(dest, &space, sizeof(space)); + dest++; + space_args.total_spaces++; + slot_count--; + } + if (!slot_count) + break; + } + up_read(&info->groups_sem); + } + + user_dest = (struct btrfs_ioctl_space_info *) + (arg + sizeof(struct btrfs_ioctl_space_args)); + + if (copy_to_user(user_dest, dest_orig, alloc_size)) + ret = -EFAULT; + + kfree(dest_orig); +out: + if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) + ret = -EFAULT; + + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. 
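The space-info ioctl above is a two-step protocol: a first call with space_slots == 0 only reports, via total_spaces, how many entries exist, and a second call with that many spaces[] slots appended to the args gets them filled in. A hedged user-space sketch (struct definitions from the ioctl.h added later in this patch; the local header name and mount point are assumptions):

/* sketch: print allocated/used bytes per block group type */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of fs/btrfs/ioctl.h (assumption) */

int main(void)
{
        struct btrfs_ioctl_space_args probe, *args;
        __u64 i, slots;
        int fd = open("/mnt/btrfs", O_RDONLY);   /* mount point: assumption */

        if (fd < 0)
                return 1;

        /* step 1: space_slots == 0 just asks "how many entries are there?" */
        memset(&probe, 0, sizeof(probe));
        if (ioctl(fd, BTRFS_IOC_SPACE_INFO, &probe) < 0)
                return 1;
        slots = probe.total_spaces;

        /* step 2: pass a buffer with that many spaces[] entries after the header */
        args = calloc(1, sizeof(*args) + slots * sizeof(args->spaces[0]));
        if (!args)
                return 1;
        args->space_slots = slots;
        if (ioctl(fd, BTRFS_IOC_SPACE_INFO, args) < 0)
                return 1;

        for (i = 0; i < args->total_spaces; i++)
                printf("flags 0x%llx: %llu of %llu bytes used\n",
                       (unsigned long long)args->spaces[i].flags,
                       (unsigned long long)args->spaces[i].used_bytes,
                       (unsigned long long)args->spaces[i].total_bytes);
        free(args);
        return 0;
}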
+ */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + + trans = file->private_data; + if (!trans) + return -EINVAL; + file->private_data = NULL; + + btrfs_end_transaction(trans, root); + + atomic_dec(&root->fs_info->open_ioctl_trans); + + mnt_drop_write(file->f_path.mnt); + return 0; +} + +static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) +{ + struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; + struct btrfs_trans_handle *trans; + u64 transid; + int ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, root, 0); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + + if (argp) + if (copy_to_user(argp, &transid, sizeof(transid))) + return -EFAULT; + return 0; +} + +static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) +{ + struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; + u64 transid; + + if (argp) { + if (copy_from_user(&transid, argp, sizeof(transid))) + return -EFAULT; + } else { + transid = 0; /* current trans */ + } + return btrfs_wait_for_commit(root, transid); +} + +static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) +{ + int ret; + struct btrfs_ioctl_scrub_args *sa; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_scrub_cancel(root); +} + +static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_ioctl_scrub_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return btrfs_ioctl_getflags(file, argp); + case FS_IOC_SETFLAGS: + return btrfs_ioctl_setflags(file, argp); + case FS_IOC_GETVERSION: + return btrfs_ioctl_getversion(file, argp); + case FITRIM: + return btrfs_ioctl_fitrim(file, argp); + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SNAP_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SNAP_DESTROY: + return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(file, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); + case BTRFS_IOC_DEFAULT_SUBVOL: + return btrfs_ioctl_default_subvol(file, argp); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file, NULL); + case BTRFS_IOC_DEFRAG_RANGE: + 
return btrfs_ioctl_defrag(file, argp); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(root, argp); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, argp); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, argp); + case BTRFS_IOC_FS_INFO: + return btrfs_ioctl_fs_info(root, argp); + case BTRFS_IOC_DEV_INFO: + return btrfs_ioctl_dev_info(root, argp); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg, 0, 0, 0); + case BTRFS_IOC_CLONE_RANGE: + return btrfs_ioctl_clone_range(file, argp); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_TREE_SEARCH: + return btrfs_ioctl_tree_search(file, argp); + case BTRFS_IOC_INO_LOOKUP: + return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_SPACE_INFO: + return btrfs_ioctl_space_info(root, argp); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + case BTRFS_IOC_START_SYNC: + return btrfs_ioctl_start_sync(file, argp); + case BTRFS_IOC_WAIT_SYNC: + return btrfs_ioctl_wait_sync(file, argp); + case BTRFS_IOC_SCRUB: + return btrfs_ioctl_scrub(root, argp); + case BTRFS_IOC_SCRUB_CANCEL: + return btrfs_ioctl_scrub_cancel(root, argp); + case BTRFS_IOC_SCRUB_PROGRESS: + return btrfs_ioctl_scrub_progress(root, argp); + } + + return -ENOTTY; +} diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 00000000..ad1ea789 --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,251 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __IOCTL_ +#define __IOCTL_ +#include + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_VOL_NAME_MAX 255 + +/* this should be 4k */ +#define BTRFS_PATH_NAME_MAX 4087 +struct btrfs_ioctl_vol_args { + __s64 fd; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + +#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +#define BTRFS_SUBVOL_RDONLY (1ULL << 1) +#define BTRFS_FSID_SIZE 16 +#define BTRFS_UUID_SIZE 16 + +#define BTRFS_SUBVOL_NAME_MAX 4039 +struct btrfs_ioctl_vol_args_v2 { + __s64 fd; + __u64 transid; + __u64 flags; + __u64 unused[4]; + char name[BTRFS_SUBVOL_NAME_MAX + 1]; +}; + +/* + * structure to report errors and progress to userspace, either as a + * result of a finished scrub, a canceled scrub or a progress inquiry + */ +struct btrfs_scrub_progress { + __u64 data_extents_scrubbed; /* # of data extents scrubbed */ + __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */ + __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */ + __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ + __u64 read_errors; /* # of read errors encountered (EIO) */ + __u64 csum_errors; /* # of failed csum checks */ + __u64 verify_errors; /* # of occurences, where the metadata + * of a tree block did not match the + * expected values, like generation or + * logical */ + __u64 no_csum; /* # of 4k data block for which no csum + * is present, probably the result of + * data written with nodatasum */ + __u64 csum_discards; /* # of csum for which no data was found + * in the extent tree. */ + __u64 super_errors; /* # of bad super blocks encountered */ + __u64 malloc_errors; /* # of internal kmalloc errors. These + * will likely cause an incomplete + * scrub */ + __u64 uncorrectable_errors; /* # of errors where either no intact + * copy was found or the writeback + * failed */ + __u64 corrected_errors; /* # of errors corrected */ + __u64 last_physical; /* last physical address scrubbed. In + * case a scrub was aborted, this can + * be used to restart the scrub */ + __u64 unverified_errors; /* # of occurences where a read for a + * full (64k) bio failed, but the re- + * check succeeded for each 4k piece. + * Intermittent error. */ +}; + +#define BTRFS_SCRUB_READONLY 1 +struct btrfs_ioctl_scrub_args { + __u64 devid; /* in */ + __u64 start; /* in */ + __u64 end; /* in */ + __u64 flags; /* in */ + struct btrfs_scrub_progress progress; /* out */ + /* pad to 1k */ + __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; +}; + +#define BTRFS_DEVICE_PATH_NAME_MAX 1024 +struct btrfs_ioctl_dev_info_args { + __u64 devid; /* in/out */ + __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ + __u64 bytes_used; /* out */ + __u64 total_bytes; /* out */ + __u64 unused[379]; /* pad to 4k */ + __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ +}; + +struct btrfs_ioctl_fs_info_args { + __u64 max_id; /* out */ + __u64 num_devices; /* out */ + __u8 fsid[BTRFS_FSID_SIZE]; /* out */ + __u64 reserved[124]; /* pad to 1k */ +}; + +#define BTRFS_INO_LOOKUP_PATH_MAX 4080 +struct btrfs_ioctl_ino_lookup_args { + __u64 treeid; + __u64 objectid; + char name[BTRFS_INO_LOOKUP_PATH_MAX]; +}; + +struct btrfs_ioctl_search_key { + /* which root are we searching. 
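A hedged user-space sketch of how the scrub structures above might be used: BTRFS_IOC_SCRUB (defined further down) appears to return only once btrfs_scrub_dev() has finished, after which the progress counters describe the whole run, while BTRFS_IOC_SCRUB_PROGRESS can poll the same counters from another task. The call needs CAP_SYS_ADMIN; the devid, local header name and mount point below are assumptions.

/* sketch: scrub devid 1 end to end and report the result counters */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "btrfs_ioctl.h"   /* local copy of this header (assumption) */

int main(void)
{
        struct btrfs_ioctl_scrub_args sa;
        int fd = open("/mnt/btrfs", O_RDONLY);   /* mount point: assumption */

        if (fd < 0)
                return 1;

        memset(&sa, 0, sizeof(sa));
        sa.devid = 1;              /* first device: assumption */
        sa.start = 0;
        sa.end = (__u64)-1;        /* whole device */
        sa.flags = 0;              /* or BTRFS_SCRUB_READONLY to only report */

        if (ioctl(fd, BTRFS_IOC_SCRUB, &sa) < 0) {
                perror("scrub");
                return 1;
        }
        printf("scrubbed %llu data bytes, %llu csum errors, %llu corrected\n",
               (unsigned long long)sa.progress.data_bytes_scrubbed,
               (unsigned long long)sa.progress.csum_errors,
               (unsigned long long)sa.progress.corrected_errors);
        return 0;
}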
0 is the tree of tree roots */ + __u64 tree_id; + + /* keys returned will be >= min and <= max */ + __u64 min_objectid; + __u64 max_objectid; + + /* keys returned will be >= min and <= max */ + __u64 min_offset; + __u64 max_offset; + + /* max and min transids to search for */ + __u64 min_transid; + __u64 max_transid; + + /* keys returned will be >= min and <= max */ + __u32 min_type; + __u32 max_type; + + /* + * how many items did userland ask for, and how many are we + * returning + */ + __u32 nr_items; + + /* align to 64 bits */ + __u32 unused; + + /* some extra for later */ + __u64 unused1; + __u64 unused2; + __u64 unused3; + __u64 unused4; +}; + +struct btrfs_ioctl_search_header { + __u64 transid; + __u64 objectid; + __u64 offset; + __u32 type; + __u32 len; +}; + +#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) +/* + * the buf is an array of search headers where + * each header is followed by the actual item + * the type field is expanded to 32 bits for alignment + */ +struct btrfs_ioctl_search_args { + struct btrfs_ioctl_search_key key; + char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; +}; + +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + +/* flags for the defrag range ioctl */ +#define BTRFS_DEFRAG_RANGE_COMPRESS 1 +#define BTRFS_DEFRAG_RANGE_START_IO 2 + +struct btrfs_ioctl_space_info { + __u64 flags; + __u64 total_bytes; + __u64 used_bytes; +}; + +struct btrfs_ioctl_space_args { + __u64 space_slots; + __u64 total_spaces; + struct btrfs_ioctl_space_info spaces[0]; +}; + +#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) +/* trans start and trans end are dangerous, and only for + * use by applications that know how to avoid the + * resulting deadlocks + */ +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) + +#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) + +#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ + struct btrfs_ioctl_defrag_range_args) +#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ + struct btrfs_ioctl_search_args) +#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ + struct btrfs_ioctl_ino_lookup_args) +#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) +#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ + struct btrfs_ioctl_space_args) +#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) +#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) +#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ + struct btrfs_ioctl_vol_args_v2) +#define 
BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) +#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ + struct btrfs_ioctl_scrub_args) +#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28) +#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ + struct btrfs_ioctl_scrub_args) +#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ + struct btrfs_ioctl_dev_info_args) +#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ + struct btrfs_ioctl_fs_info_args) +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 00000000..66fa43dc --- /dev/null +++ b/fs/btrfs/locking.c @@ -0,0 +1,208 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" + +static inline void spin_nested(struct extent_buffer *eb) +{ + spin_lock(&eb->lock); +} + +/* + * Setting a lock to blocking will drop the spinlock and set the + * flag that forces other procs who want the lock to wait. After + * this you can safely schedule with the lock held. + */ +void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + spin_unlock(&eb->lock); + } + /* exit with the spin lock released and the bit set */ +} + +/* + * clearing the blocking flag will take the spinlock again. + * After this you can't safely schedule + */ +void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + spin_nested(eb); + clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + smp_mb__after_clear_bit(); + } + /* exit with the spin lock held */ +} + +/* + * unfortunately, many of the places that currently set a lock to blocking + * don't end up blocking for very long, and often they don't block + * at all. For a dbench 50 run, if we don't spin on the blocking bit + * at all, the context switch rate can jump up to 400,000/sec or more. + * + * So, we're still stuck with this crummy spin on the blocking bit, + * at least until the most common causes of the short blocks + * can be dealt with. + */ +static int btrfs_spin_on_block(struct extent_buffer *eb) +{ + int i; + + for (i = 0; i < 512; i++) { + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + if (need_resched()) + break; + cpu_relax(); + } + return 0; +} + +/* + * This is somewhat different from trylock. It will take the + * spinlock but if it finds the lock is set to blocking, it will + * return without the lock held. 
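+ *
+ * A typical caller pattern (an illustrative sketch, not taken from this
+ * patch) is to try the cheap spinning path first and fall back to the
+ * full blocking lock:
+ *
+ *	if (!btrfs_try_spin_lock(eb))
+ *		btrfs_tree_lock(eb);
+ *	... read or modify the extent buffer ...
+ *	btrfs_tree_unlock(eb);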
+ * + * returns 1 if it was able to take the lock and zero otherwise + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_try_spin_lock(struct extent_buffer *eb) +{ + int i; + + if (btrfs_spin_on_block(eb)) { + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } + /* spin for a bit on the BLOCKING flag */ + for (i = 0; i < 2; i++) { + cpu_relax(); + if (!btrfs_spin_on_block(eb)) + break; + + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } + return 0; +} + +/* + * the autoremove wake function will return 0 if it tried to wake up + * a process that was already awake, which means that process won't + * count as an exclusive wakeup. The waitq code will continue waking + * procs until it finds one that was actually sleeping. + * + * For btrfs, this isn't quite what we want. We want a single proc + * to be notified that the lock is ready for taking. If that proc + * already happen to be awake, great, it will loop around and try for + * the lock. + * + * So, btrfs_wake_function always returns 1, even when the proc that we + * tried to wake up was already awake. + */ +static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + autoremove_wake_function(wait, mode, sync, key); + return 1; +} + +/* + * returns with the extent buffer spinlocked. + * + * This will spin and/or wait as required to take the lock, and then + * return with the spinlock held. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_tree_lock(struct extent_buffer *eb) +{ + DEFINE_WAIT(wait); + wait.func = btrfs_wake_function; + + if (!btrfs_spin_on_block(eb)) + goto sleep; + + while(1) { + spin_nested(eb); + + /* nobody is blocking, exit with the spinlock held */ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 0; + + /* + * we have the spinlock, but the real owner is blocking. + * wait for them + */ + spin_unlock(&eb->lock); + + /* + * spin for a bit, and if the blocking flag goes away, + * loop around + */ + cpu_relax(); + if (btrfs_spin_on_block(eb)) + continue; +sleep: + prepare_to_wait_exclusive(&eb->lock_wq, &wait, + TASK_UNINTERRUPTIBLE); + + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + schedule(); + + finish_wait(&eb->lock_wq, &wait); + } + return 0; +} + +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + /* + * if we were a blocking owner, we don't have the spinlock held + * just clear the bit and look for waiters + */ + if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + smp_mb__after_clear_bit(); + else + spin_unlock(&eb->lock); + + if (waitqueue_active(&eb->lock_wq)) + wake_up(&eb->lock_wq); + return 0; +} + +void btrfs_assert_tree_locked(struct extent_buffer *eb) +{ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + assert_spin_locked(&eb->lock); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 00000000..5c33a560 --- /dev/null +++ b/fs/btrfs/locking.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_LOCKING_ +#define __BTRFS_LOCKING_ + +int btrfs_tree_lock(struct extent_buffer *eb); +int btrfs_tree_unlock(struct extent_buffer *eb); +int btrfs_try_spin_lock(struct extent_buffer *eb); + +void btrfs_set_lock_blocking(struct extent_buffer *eb); +void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_assert_tree_locked(struct extent_buffer *eb); +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c new file mode 100644 index 00000000..a178f5eb --- /dev/null +++ b/fs/btrfs/lzo.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" + +#define LZO_LEN 4 + +struct workspace { + void *mem; + void *buf; /* where compressed data goes */ + void *cbuf; /* where decompressed data goes */ + struct list_head list; +}; + +static void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->buf); + vfree(workspace->cbuf); + vfree(workspace->mem); + kfree(workspace); +} + +static struct list_head *lzo_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +static int lzo_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + char *data_in; + char *cpage_out; + 
int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + size_t in_len; + size_t out_len; + char *buf; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long pg_bytes_left; + unsigned long out_offset; + unsigned long bytes; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + /* + * store the size of all chunks of compressed data in + * the first 4 bytes + */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + out_offset = LZO_LEN; + tot_out = LZO_LEN; + pages[0] = out_page; + nr_pages = 1; + pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + /* compress at most one page of data each time */ + in_len = min(len, PAGE_CACHE_SIZE); + while (tot_in < len) { + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, + &out_len, workspace->mem); + if (ret != LZO_E_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + ret = -1; + goto out; + } + + /* store the size of this chunk of compressed data */ + write_compress_length(cpage_out + out_offset, out_len); + tot_out += LZO_LEN; + out_offset += LZO_LEN; + pg_bytes_left -= LZO_LEN; + + tot_in += in_len; + tot_out += out_len; + + /* copy bytes from the working buffer into the pages */ + buf = workspace->cbuf; + while (out_len) { + bytes = min_t(unsigned long, pg_bytes_left, out_len); + + memcpy(cpage_out + out_offset, buf, bytes); + + out_len -= bytes; + pg_bytes_left -= bytes; + buf += bytes; + out_offset += bytes; + + /* + * we need another page for writing out. + * + * Note if there's less than 4 bytes left, we just + * skip to a new page. 
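+ * The on-disk layout built here is, roughly:
+ *
+ *	page 0:  [total_out : 4][chunk len : 4][compressed chunk ...]
+ *	page N:  [chunk continued ...][chunk len : 4][compressed chunk ...]
+ *
+ * Every chunk is preceded by a 4 byte little-endian length, and a length
+ * header is never split across a page boundary; a tail shorter than
+ * LZO_LEN is zero padded instead.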
+ */ + if ((out_len == 0 && pg_bytes_left < LZO_LEN) || + pg_bytes_left == 0) { + if (pg_bytes_left) { + memset(cpage_out + out_offset, 0, + pg_bytes_left); + tot_out += pg_bytes_left; + } + + /* we're done, don't allocate new page */ + if (out_len == 0 && tot_in >= len) + break; + + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages++] = out_page; + + pg_bytes_left = PAGE_CACHE_SIZE; + out_offset = 0; + } + } + + /* we're making it bigger, give up */ + if (tot_in > 8192 && tot_in < tot_out) + goto out; + + /* we're all done */ + if (tot_in >= len) + break; + + if (tot_out > max_out) + break; + + bytes_left = len - tot_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + in_len = min(bytes_left, PAGE_CACHE_SIZE); + } + + if (tot_out > tot_in) + goto out; + + /* store the size of all chunks of compressed data */ + cpage_out = kmap(pages[0]); + write_compress_length(cpage_out, tot_out); + + kunmap(pages[0]); + + ret = 0; + *total_out = tot_out; + *total_in = tot_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + + return ret; +} + +static int lzo_decompress_biovec(struct list_head *ws, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + char *data_in; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset = 0; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + + size_t in_len; + size_t out_len; + unsigned long in_offset; + unsigned long in_page_bytes_left; + unsigned long tot_in; + unsigned long tot_out; + unsigned long tot_len; + char *buf; + bool may_late_unmap, need_unmap; + + data_in = kmap(pages_in[0]); + tot_len = read_compress_length(data_in); + + tot_in = LZO_LEN; + in_offset = LZO_LEN; + tot_len = min_t(size_t, srclen, tot_len); + in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + tot_out = 0; + pg_offset = 0; + + while (tot_in < tot_len) { + in_len = read_compress_length(data_in + in_offset); + in_page_bytes_left -= LZO_LEN; + in_offset += LZO_LEN; + tot_in += LZO_LEN; + + tot_in += in_len; + working_bytes = in_len; + may_late_unmap = need_unmap = false; + + /* fast path: avoid using the working buffer */ + if (in_page_bytes_left >= in_len) { + buf = data_in + in_offset; + bytes = in_len; + may_late_unmap = true; + goto cont; + } + + /* copy bytes from the pages into the working buffer */ + buf = workspace->cbuf; + buf_offset = 0; + while (working_bytes) { + bytes = min(working_bytes, in_page_bytes_left); + + memcpy(buf + buf_offset, data_in + in_offset, bytes); + buf_offset += bytes; +cont: + working_bytes -= bytes; + in_page_bytes_left -= bytes; + in_offset += bytes; + + /* check if we need to pick another page */ + if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) + || in_page_bytes_left == 0) { + tot_in += in_page_bytes_left; + + if (working_bytes == 0 && tot_in >= tot_len) + break; + + if (page_in_index + 1 >= total_pages_in) { 
+ ret = -1; + goto done; + } + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); + + in_page_bytes_left = PAGE_CACHE_SIZE; + in_offset = 0; + } + } + + out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, + &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed\n"); + ret = -1; + break; + } + + buf_start = tot_out; + tot_out += out_len; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + tot_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) + break; + } +done: + kunmap(pages_in[page_in_index]); + return ret; +} + +static int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t tot_len; + int ret = 0; + char *kaddr; + unsigned long bytes; + + BUG_ON(srclen < LZO_LEN); + + tot_len = read_compress_length(data_in); + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + data_in += LZO_LEN; + + out_len = PAGE_CACHE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed!\n"); + ret = -1; + goto out; + } + + if (out_len < start_byte) { + ret = -1; + goto out; + } + + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr, workspace->buf + start_byte, bytes); + kunmap_atomic(kaddr, KM_USER0); +out: + return ret; +} + +struct btrfs_compress_op btrfs_lzo_compress = { + .alloc_workspace = lzo_alloc_workspace, + .free_workspace = lzo_free_workspace, + .compress_pages = lzo_compress_pages, + .decompress_biovec = lzo_decompress_biovec, + .decompress = lzo_decompress, +}; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 00000000..a1c94042 --- /dev/null +++ b/fs/btrfs/ordered-data.c @@ -0,0 +1,983 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include "ctree.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "extent_io.h" + +static u64 entry_end(struct btrfs_ordered_extent *entry) +{ + if (entry->file_offset + entry->len < entry->file_offset) + return (u64)-1; + return entry->file_offset + entry->len; +} + +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ +static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_ordered_extent *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) + p = &(*p)->rb_left; + else if (file_offset >= entry_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, + struct rb_node **prev_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *test; + struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); + prev = n; + prev_entry = entry; + + if (file_offset < entry->file_offset) + n = n->rb_left; + else if (file_offset >= entry_end(entry)) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + + while (prev && file_offset >= entry_end(prev_entry)) { + test = rb_next(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + if (file_offset < entry_end(prev_entry)) + break; + + prev = test; + } + if (prev) + prev_entry = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + while (prev && file_offset < entry_end(prev_entry)) { + test = rb_prev(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + prev = test; + } + *prev_ret = prev; + return NULL; +} + +/* + * helper to check if a given offset is inside a given entry + */ +static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) +{ + if (file_offset < entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) +{ + if (file_offset + len <= entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ +static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + u64 file_offset) +{ + struct rb_root *root = &tree->tree; + struct rb_node *prev = NULL; + struct rb_node *ret; + struct btrfs_ordered_extent *entry; + + if (tree->last) { + entry = rb_entry(tree->last, struct btrfs_ordered_extent, + rb_node); + if (offset_in_entry(entry, file_offset)) + return tree->last; + } + ret = __tree_search(root, file_offset, &prev); + if (!ret) + ret = prev; + if (ret) + tree->last = ret; + return ret; +} + +/* allocate and add a new ordered_extent into the per-inode tree. 
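+ * The public helpers below (btrfs_add_ordered_extent,
+ * btrfs_add_ordered_extent_dio and btrfs_add_ordered_extent_compress) are
+ * thin wrappers around this function that only differ in the dio and
+ * compress_type arguments they pass.
+ *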
+ * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int dio, int compress_type) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + + tree = &BTRFS_I(inode)->ordered_tree; + entry = kzalloc(sizeof(*entry), GFP_NOFS); + if (!entry) + return -ENOMEM; + + entry->file_offset = file_offset; + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; + entry->bytes_left = len; + entry->inode = inode; + entry->compress_type = compress_type; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); + + if (dio) + set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + + /* one ref for the tree */ + atomic_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->root_extent_list); + + trace_btrfs_ordered_extent_add(inode, entry); + + spin_lock(&tree->lock); + node = tree_insert(&tree->tree, file_offset, + &entry->rb_node); + BUG_ON(node); + spin_unlock(&tree->lock); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + BUG_ON(node); + return 0; +} + +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 1, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type); +} + +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) +{ + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + list_add_tail(&sum->list, &entry->list); + spin_unlock(&tree->lock); + return 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO may span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + * + * file_offset is updated to one byte past the range that is recorded as + * complete. This allows you to walk forward in the file. 
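+ *
+ * Illustrative use (a hypothetical caller, not part of this patch):
+ *
+ *	u64 off = write_start;
+ *	if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
+ *						  &off, bytes_written)) {
+ *		handle_finished_extent(ordered);   (hypothetical helper)
+ *		btrfs_put_ordered_extent(ordered);
+ *	}
+ *
+ * off then points one byte past the accounted range, so the call can be
+ * repeated to walk forward across several ordered extents.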
+ */ +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + u64 dec_end; + u64 dec_start; + u64 to_dec; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, *file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, *file_offset)) { + ret = 1; + goto out; + } + + dec_start = max(*file_offset, entry->file_offset); + dec_end = min(*file_offset + io_size, entry->file_offset + + entry->len); + *file_offset = dec_end; + if (dec_start > dec_end) { + printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", + (unsigned long long)dec_start, + (unsigned long long)dec_end); + } + to_dec = dec_end - dec_start; + if (to_dec > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)to_dec); + } + entry->bytes_left -= to_dec; + if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock(&tree->lock); + return ret == 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ +int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) { + ret = 1; + goto out; + } + + if (io_size > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)io_size); + } + entry->bytes_left -= io_size; + if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock(&tree->lock); + return ret == 0; +} + +/* + * used to drop a reference on an ordered extent. This will free + * the extent if the last reference is dropped + */ +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + trace_btrfs_ordered_extent_put(entry->inode, entry); + + if (atomic_dec_and_test(&entry->refs)) { + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } + kfree(entry); + } + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * and you must wake_up entry->wait. You must hold the tree lock + * while you call this function. 
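+ *
+ * btrfs_remove_ordered_extent() below is the convenience wrapper that
+ * takes tree->lock around this call and then wakes entry->wait.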
+ */ +static int __btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct rb_node *node; + + tree = &BTRFS_I(inode)->ordered_tree; + node = &entry->rb_node; + rb_erase(node, &tree->tree); + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + + trace_btrfs_ordered_extent_remove(inode, entry); + + /* + * we have no more ordered extents for this inode and + * no dirty pages. We can safely remove it from the + * list of ordered extents + */ + if (RB_EMPTY_ROOT(&tree->tree) && + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + list_del_init(&BTRFS_I(inode)->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but any waiters are woken. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + ret = __btrfs_remove_ordered_extent(inode, entry); + spin_unlock(&tree->lock); + wake_up(&entry->wait); + + return ret; +} + +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput) +{ + struct list_head splice; + struct list_head *cur; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + cur = splice.next; + ordered = list_entry(cur, struct btrfs_ordered_extent, + root_extent_list); + if (nocow_only && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + list_move(&ordered->root_extent_list, + &root->fs_info->ordered_extents); + cond_resched_lock(&root->fs_info->ordered_extent_lock); + continue; + } + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + } else { + btrfs_put_ordered_extent(ordered); + } + + spin_lock(&root->fs_info->ordered_extent_lock); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * this is used during transaction commit to write all the inodes + * added to the ordered operation list. These files must be fully on + * disk before the transaction commits. + * + * we have two modes here, one is to just start the IO via filemap_flush + * and the other is to wait for all the io. 
When we wait, we have an + * extra check to make sure the ordered operation list really is empty + * before we return + */ +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +{ + struct btrfs_inode *btrfs_inode; + struct inode *inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); +again: + list_splice_init(&root->fs_info->ordered_operations, &splice); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + inode = &btrfs_inode->vfs_inode; + + list_del_init(&btrfs_inode->ordered_operations); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(inode); + + if (!wait && inode) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + if (wait) + btrfs_wait_ordered_range(inode, 0, (u64)-1); + else + filemap_flush(inode->i_mapping); + btrfs_add_delayed_iput(inode); + } + + cond_resched(); + spin_lock(&root->fs_info->ordered_extent_lock); + } + if (wait && !list_empty(&root->fs_info->ordered_operations)) + goto again; + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) +{ + u64 start = entry->file_offset; + u64 end = start + entry->len - 1; + + trace_btrfs_ordered_extent_start(inode, entry); + + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for pdflush to find them + */ + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->i_mapping, start, end); + if (wait) { + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &entry->flags)); + } +} + +/* + * Used to wait on ordered extents across a large range of bytes. + */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +{ + u64 end; + u64 orig_end; + struct btrfs_ordered_extent *ordered; + int found; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); + } else { + orig_end = start + len - 1; + if (orig_end > INT_LIMIT(loff_t)) + orig_end = INT_LIMIT(loff_t); + } +again: + /* start IO across the range first to instantiate any delalloc + * extents + */ + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + /* The compression code will leave pages locked but return from + * writepage without setting the page writeback. Starting again + * with WB_SYNC_ALL will end up waiting for the IO to actually start. 
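+ *
+ * So the overall sequence is: write the range out twice, wait for the
+ * resulting writeback, then walk the ordered tree backwards from the end
+ * of the range and wait on every ordered extent that overlaps it.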
+ */ + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + filemap_fdatawait_range(inode->i_mapping, start, orig_end); + + end = orig_end; + found = 0; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) + break; + if (ordered->file_offset > orig_end) { + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered->file_offset + ordered->len < start) { + btrfs_put_ordered_extent(ordered); + break; + } + found++; + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); + if (end == 0 || end == start) + break; + end--; + } + if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_DELALLOC, 0, NULL)) { + schedule_timeout(1); + goto again; + } + return 0; +} + +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) + entry = NULL; + if (entry) + atomic_inc(&entry->refs); +out: + spin_unlock(&tree->lock); + return entry; +} + +/* Since the DIO code tries to lock a wide area we need to look for any ordered + * extents that exist in the range, rather than just the start of the range. + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + node = tree_search(tree, file_offset + len); + if (!node) + goto out; + } + + while (1) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + break; + + if (entry->file_offset >= file_offset + len) { + entry = NULL; + break; + } + entry = NULL; + node = rb_next(node); + if (!node) + break; + } +out: + if (entry) + atomic_inc(&entry->refs); + spin_unlock(&tree->lock); + return entry; +} + +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + atomic_inc(&entry->refs); +out: + spin_unlock(&tree->lock); + return entry; +} + +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. 
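+ *
+ * Worked example (illustrative): with i_size at 12k and completed ordered
+ * extents covering [0,4k) and [8k,12k) but not [4k,8k), disk_i_size can
+ * only be raised to 4k; the still-pending extent at [4k,8k) keeps the
+ * rest of the file from being exposed until its IO finishes as well.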
+ */ +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 disk_i_size; + u64 new_i_size; + u64 i_size_test; + u64 i_size = i_size_read(inode); + struct rb_node *node; + struct rb_node *prev = NULL; + struct btrfs_ordered_extent *test; + int ret = 1; + + if (ordered) + offset = entry_end(ordered); + else + offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); + + spin_lock(&tree->lock); + disk_i_size = BTRFS_I(inode)->disk_i_size; + + /* truncate file */ + if (disk_i_size > i_size) { + BTRFS_I(inode)->disk_i_size = i_size; + ret = 0; + goto out; + } + + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ + if (disk_i_size == i_size || offset <= disk_i_size) { + goto out; + } + + /* + * we can't update the disk_isize if there are delalloc bytes + * between disk_i_size and this ordered extent + */ + if (test_range_bit(io_tree, disk_i_size, offset - 1, + EXTENT_DELALLOC, 0, NULL)) { + goto out; + } + /* + * walk backward from this ordered extent to disk_i_size. + * if we find an ordered extent then we can't update disk i_size + * yet + */ + if (ordered) { + node = rb_prev(&ordered->rb_node); + } else { + prev = tree_search(tree, offset); + /* + * we insert file extents without involving ordered struct, + * so there should be no ordered struct cover this offset + */ + if (prev) { + test = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + BUG_ON(offset_in_entry(test, offset)); + } + node = prev; + } + while (node) { + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset + test->len <= disk_i_size) + break; + if (test->file_offset >= i_size) + break; + if (test->file_offset >= disk_i_size) + goto out; + node = rb_prev(node); + } + new_i_size = min_t(u64, offset, i_size); + + /* + * at this point, we know we can safely update i_size to at least + * the offset from this ordered extent. But, we need to + * walk forward and see if ios from higher up in the file have + * finished. + */ + if (ordered) { + node = rb_next(&ordered->rb_node); + } else { + if (prev) + node = rb_next(prev); + else + node = rb_first(&tree->tree); + } + i_size_test = 0; + if (node) { + /* + * do we have an area where IO might have finished + * between our ordered extent and the next one. + */ + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset > offset) + i_size_test = test->file_offset; + } else { + i_size_test = i_size; + } + + /* + * i_size_test is the end of a region after this ordered + * extent where there are no ordered extents. As long as there + * are no delalloc bytes in this area, it is safe to update + * disk_i_size to the end of the region. 
+ */ + if (i_size_test > offset && + !test_range_bit(io_tree, offset, i_size_test - 1, + EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size); + } + BTRFS_I(inode)->disk_i_size = new_i_size; + ret = 0; +out: + /* + * we need to remove the ordered extent with the tree lock held + * so that other people calling this function don't find our fully + * processed ordered entry and skip updating the i_size + */ + if (ordered) + __btrfs_remove_ordered_extent(inode, ordered); + spin_unlock(&tree->lock); + if (ordered) + wake_up(&ordered->wait); + return ret; +} + +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum) +{ + struct btrfs_ordered_sum *ordered_sum; + struct btrfs_sector_sum *sector_sums; + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + unsigned long num_sectors; + unsigned long i; + u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + int ret = 1; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + if (!ordered) + return 1; + + spin_lock(&tree->lock); + list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { + if (disk_bytenr >= ordered_sum->bytenr) { + num_sectors = ordered_sum->len / sectorsize; + sector_sums = ordered_sum->sums; + for (i = 0; i < num_sectors; i++) { + if (sector_sums[i].bytenr == disk_bytenr) { + *sum = sector_sums[i].sum; + ret = 0; + goto out; + } + } + } + } +out: + spin_unlock(&tree->lock); + btrfs_put_ordered_extent(ordered); + return ret; +} + + +/* + * add a given inode to the list of inodes that must be fully on + * disk before a transaction commit finishes. + * + * This basically gives us the ext3 style data=ordered mode, and it is mostly + * used to make sure renamed files are fully on disk. + * + * It is a noop if the inode is already fully on disk. + * + * If trans is not null, we'll do a friendly check for a transaction that + * is already flushing things and force the IO down ourselves. + */ +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + u64 last_mod; + + last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); + + /* + * if this file hasn't been changed since the last transaction + * commit, we can safely return without doing anything + */ + if (last_mod < root->fs_info->last_trans_committed) + return 0; + + /* + * the transaction is already committing. Just start the IO and + * don't bother with all of this list nonsense + */ + if (trans && root->fs_info->running_transaction->blocked) { + btrfs_wait_ordered_range(inode, 0, (u64)-1); + return 0; + } + + spin_lock(&root->fs_info->ordered_extent_lock); + if (list_empty(&BTRFS_I(inode)->ordered_operations)) { + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &root->fs_info->ordered_operations); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h new file mode 100644 index 00000000..ff1f69aa --- /dev/null +++ b/fs/btrfs/ordered-data.h @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ORDERED_DATA__ +#define __BTRFS_ORDERED_DATA__ + +/* one of these per inode */ +struct btrfs_ordered_inode_tree { + spinlock_t lock; + struct rb_root tree; + struct rb_node *last; +}; + +/* + * these are used to collect checksums done just before bios submission. + * They are attached via a list into the ordered extent, and + * checksum items are inserted into the tree after all the blocks in + * the ordered extent are on disk + */ +struct btrfs_sector_sum { + /* bytenr on disk */ + u64 bytenr; + u32 sum; +}; + +struct btrfs_ordered_sum { + /* bytenr is the start of this extent on disk */ + u64 bytenr; + + /* + * this is the length in bytes covered by the sums array below. + */ + unsigned long len; + struct list_head list; + /* last field is a variable length array of btrfs_sector_sums */ + struct btrfs_sector_sum sums[]; +}; + +/* + * bits for the flags field: + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. + * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. 
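+ *
+ * Rough lifecycle (illustrative summary): an ordered extent is created by
+ * btrfs_add_ordered_extent*() with an optional type bit such as NOCOW,
+ * PREALLOC, COMPRESSED or DIRECT, its bytes_left is decremented as IO
+ * completes, IO_DONE is set once bytes_left reaches zero, and COMPLETE is
+ * set when the entry is removed from the tree and waiters are woken.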
+ */ +#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ + +#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ + +#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ + +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ + +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ + +#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ + +struct btrfs_ordered_extent { + /* logical offset in the file */ + u64 file_offset; + + /* disk byte number */ + u64 start; + + /* ram length of the extent in bytes */ + u64 len; + + /* extent length on disk */ + u64 disk_len; + + /* number of bytes that still need writing */ + u64 bytes_left; + + /* flags (described above) */ + unsigned long flags; + + /* compression algorithm */ + int compress_type; + + /* reference count */ + atomic_t refs; + + /* the inode we belong to */ + struct inode *inode; + + /* list of checksums for insertion when the extent io is done */ + struct list_head list; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ + wait_queue_head_t wait; + + /* our friendly rbtree entry */ + struct rb_node rb_node; + + /* a per root list of all the pending ordered extents */ + struct list_head root_extent_list; +}; + + +/* + * calculates the total size you need to allocate for an ordered sum + * structure spanning 'bytes' in the file + */ +static inline int btrfs_ordered_sum_size(struct btrfs_root *root, + unsigned long bytes) +{ + unsigned long num_sectors = (bytes + root->sectorsize - 1) / + root->sectorsize; + num_sectors++; + return sizeof(struct btrfs_ordered_sum) + + num_sectors * sizeof(struct btrfs_sector_sum); +} + +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + spin_lock_init(&t->lock); + t->tree = RB_ROOT; + t->last = NULL; +} + +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry); +int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size); +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size); +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type); +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset); +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, int wait); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len); +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +int btrfs_run_ordered_operations(struct 
btrfs_root *root, int wait); +int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput); +#endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 00000000..f8be2509 --- /dev/null +++ b/fs/btrfs/orphan.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 00000000..fb2605d9 --- /dev/null +++ b/fs/btrfs/print-tree.c @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " + "num_stripes %d\n", + (unsigned long long)btrfs_chunk_length(eb, chunk), + (unsigned long long)btrfs_chunk_owner(eb, chunk), + (unsigned long long)btrfs_chunk_type(eb, chunk), + num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, + (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), + (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + printk(KERN_INFO "\t\tdev item devid %llu " + "total_bytes %llu bytes used %llu\n", + (unsigned long long)btrfs_device_id(eb, dev_item), + (unsigned long long)btrfs_device_total_bytes(eb, dev_item), + (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); +} +static void print_extent_data_ref(struct extent_buffer *eb, + struct btrfs_extent_data_ref *ref) +{ + printk(KERN_INFO "\t\textent data backref root %llu " + "objectid %llu offset %llu count %u\n", + (unsigned long long)btrfs_extent_data_ref_root(eb, ref), + (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), + (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_count(eb, ref)); +} + +static void print_extent_item(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; + int type; + u32 item_size = btrfs_item_size_nr(eb, slot); + u64 flags; + u64 offset; + + if (item_size < sizeof(*ei)) { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); + printk(KERN_INFO "\t\textent refs %u\n", + btrfs_extent_refs_v0(eb, ei0)); + return; +#else + BUG(); +#endif + } + + ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", + (unsigned long long)btrfs_extent_refs(eb, ei), + (unsigned long long)btrfs_extent_generation(eb, ei), + (unsigned long long)flags); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + info = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_tree_block_key(eb, info, &key); + printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " + "level %d\n", + (unsigned long long)btrfs_disk_key_objectid(&key), + key.type, + (unsigned long long)btrfs_disk_key_offset(&key), + btrfs_tree_block_level(eb, info)); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(eb, iref); + offset = btrfs_extent_inline_ref_offset(eb, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref " + "root %llu\n", (unsigned long long)offset); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref " + "parent %llu\n", (unsigned long long)offset); + break; 
+ case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + print_extent_data_ref(eb, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + printk(KERN_INFO "\t\tshared data backref " + "parent %llu count %u\n", + (unsigned long long)offset, + btrfs_shared_data_ref_count(eb, sref)); + break; + default: + BUG(); + } + ptr += btrfs_extent_inline_ref_size(type); + } + WARN_ON(ptr > end); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static void print_extent_ref_v0(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_ref_v0 *ref0; + + ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); + printk("\t\textent back ref root %llu gen %llu " + "owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root_v0(eb, ref0), + (unsigned long long)btrfs_ref_generation_v0(eb, ref0), + (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), + (unsigned long)btrfs_ref_count_v0(eb, ref0)); +} +#endif + +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) +{ + int i; + u32 type; + u32 nr = btrfs_header_nritems(l); + struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; + struct btrfs_block_group_item *bi; + struct btrfs_file_extent_item *fi; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_dev_extent *dev_extent; + struct btrfs_key key; + struct btrfs_key found_key; + + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", + (unsigned long long)btrfs_header_bytenr(l), nr, + btrfs_leaf_free_space(root, l)); + for (i = 0 ; i < nr ; i++) { + item = btrfs_item_nr(l, i); + btrfs_item_key_to_cpu(l, &key, i); + type = btrfs_key_type(&key); + printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + "itemsize %d\n", + i, + (unsigned long long)key.objectid, type, + (unsigned long long)key.offset, + btrfs_item_offset(l, item), btrfs_item_size(l, item)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); + printk(KERN_INFO "\t\tinode generation %llu size %llu " + "mode %o\n", + (unsigned long long) + btrfs_inode_generation(l, ii), + (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); + break; + case BTRFS_DIR_ITEM_KEY: + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); + printk(KERN_INFO "\t\tdir oid %llu type %u\n", + (unsigned long long)found_key.objectid, + btrfs_dir_type(l, di)); + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", + (unsigned long long) + btrfs_disk_root_bytenr(l, ri), + btrfs_disk_root_refs(l, ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + print_extent_item(l, i); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref\n"); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref\n"); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(l, i, + struct btrfs_extent_data_ref); + print_extent_data_ref(l, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(l, i, + struct btrfs_shared_data_ref); + printk(KERN_INFO "\t\tshared data backref count %u\n", + btrfs_shared_data_ref_count(l, sref)); + break; + case BTRFS_EXTENT_DATA_KEY: + fi = btrfs_item_ptr(l, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(l, fi) == + BTRFS_FILE_EXTENT_INLINE) { + 
printk(KERN_INFO "\t\tinline extent data " + "size %u\n", + btrfs_file_extent_inline_len(l, fi)); + break; + } + printk(KERN_INFO "\t\textent data disk bytenr %llu " + "nr %llu\n", + (unsigned long long) + btrfs_file_extent_disk_bytenr(l, fi), + (unsigned long long) + btrfs_file_extent_disk_num_bytes(l, fi)); + printk(KERN_INFO "\t\textent data offset %llu " + "nr %llu ram %llu\n", + (unsigned long long) + btrfs_file_extent_offset(l, fi), + (unsigned long long) + btrfs_file_extent_num_bytes(l, fi), + (unsigned long long) + btrfs_file_extent_ram_bytes(l, fi)); + break; + case BTRFS_EXTENT_REF_V0_KEY: +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + print_extent_ref_v0(l, i); +#else + BUG(); +#endif + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + bi = btrfs_item_ptr(l, i, + struct btrfs_block_group_item); + printk(KERN_INFO "\t\tblock group used %llu\n", + (unsigned long long) + btrfs_disk_block_group_used(l, bi)); + break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, + struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" + "\t\tchunk objectid %llu chunk offset %llu " + "length %llu\n", + (unsigned long long) + btrfs_dev_extent_chunk_tree(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_objectid(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_offset(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_length(l, dev_extent)); + }; + } +} + +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) +{ + int i; u32 nr; + struct btrfs_key key; + int level; + + if (!c) + return; + nr = btrfs_header_nritems(c); + level = btrfs_header_level(c); + if (level == 0) { + btrfs_print_leaf(root, c); + return; + } + printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", + (unsigned long long)btrfs_header_bytenr(c), + level, nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); + printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", + i, + (unsigned long long)key.objectid, + key.type, + (unsigned long long)key.offset, + (unsigned long long)btrfs_node_blockptr(c, i)); + } + for (i = 0; i < nr; i++) { + struct extent_buffer *next = read_tree_block(root, + btrfs_node_blockptr(c, i), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(c, i)); + if (btrfs_is_leaf(next) && + level != 1) + BUG(); + if (btrfs_header_level(next) != + level - 1) + BUG(); + btrfs_print_tree(root, next); + free_extent_buffer(next); + } +} diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 00000000..da75efe5 --- /dev/null +++ b/fs/btrfs/print-tree.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __PRINT_TREE_ +#define __PRINT_TREE_ +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +#endif diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c new file mode 100644 index 00000000..82d569cb --- /dev/null +++ b/fs/btrfs/ref-cache.c @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include "ctree.h" +#include "ref-cache.h" +#include "transaction.h" + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_leaf_ref *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct btrfs_leaf_ref *entry; + + while (n) { + entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); + WARN_ON(!entry->in_tree); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h new file mode 100644 index 00000000..24f7001f --- /dev/null +++ b/fs/btrfs/ref-cache.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#ifndef __REFCACHE__ +#define __REFCACHE__ + +struct btrfs_extent_info { + /* bytenr and num_bytes find the extent in the extent allocation tree */ + u64 bytenr; + u64 num_bytes; + + /* objectid and offset find the back reference for the file */ + u64 objectid; + u64 offset; +}; + +struct btrfs_leaf_ref { + struct rb_node rb_node; + struct btrfs_leaf_ref_tree *tree; + int in_tree; + atomic_t usage; + + u64 root_gen; + u64 bytenr; + u64 owner; + u64 generation; + int nritems; + + struct list_head list; + struct btrfs_extent_info extents[]; +}; + +static inline size_t btrfs_leaf_ref_size(int nr_extents) +{ + return sizeof(struct btrfs_leaf_ref) + + sizeof(struct btrfs_extent_info) * nr_extents; +} +#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c new file mode 100644 index 00000000..5e0a3dc7 --- /dev/null +++ b/fs/btrfs/relocation.c @@ -0,0 +1,4413 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "btrfs_inode.h" +#include "async-thread.h" +#include "free-space-cache.h" +#include "inode-map.h" + +/* + * backref_node, mapping_node and tree_block start with this + */ +struct tree_entry { + struct rb_node rb_node; + u64 bytenr; +}; + +/* + * present a tree block in the backref cache + */ +struct backref_node { + struct rb_node rb_node; + u64 bytenr; + + u64 new_bytenr; + /* objectid of tree block owner, can be not uptodate */ + u64 owner; + /* link to pending, changed or detached list */ + struct list_head list; + /* list of upper level blocks reference this block */ + struct list_head upper; + /* list of child blocks in the cache */ + struct list_head lower; + /* NULL if this node is not tree root */ + struct btrfs_root *root; + /* extent buffer got by COW the block */ + struct extent_buffer *eb; + /* level of tree block */ + unsigned int level:8; + /* is the block in non-reference counted tree */ + unsigned int cowonly:1; + /* 1 if no child node in the cache */ + unsigned int lowest:1; + /* is the extent buffer locked */ + unsigned int locked:1; + /* has the block been processed */ + unsigned int processed:1; + /* have backrefs of this block been checked */ + unsigned int checked:1; + /* + * 1 if corresponding block has been cowed but some upper + * level block pointers may not point to the new location + */ + unsigned int pending:1; + /* + * 1 if the backref node isn't connected to any other + * backref node. 
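+ * An illustrative sketch, added for exposition only and not authoritative:
+ * a node typically becomes detached when build_backref_tree() decides an
+ * upper level block is not needed for relocation, roughly
+ *
+ *	list_add(&upper->list, &cache->detached);
+ *	upper->detached = 1;
+ *
+ * and update_backref_cache() drops every detached node after a
+ * transaction commit, since the extent tree may have changed by then.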
+ */ + unsigned int detached:1; +}; + +/* + * present a block pointer in the backref cache + */ +struct backref_edge { + struct list_head list[2]; + struct backref_node *node[2]; +}; + +#define LOWER 0 +#define UPPER 1 + +struct backref_cache { + /* red black tree of all backref nodes in the cache */ + struct rb_root rb_root; + /* for passing backref nodes to btrfs_reloc_cow_block */ + struct backref_node *path[BTRFS_MAX_LEVEL]; + /* + * list of blocks that have been cowed but some block + * pointers in upper level blocks may not reflect the + * new location + */ + struct list_head pending[BTRFS_MAX_LEVEL]; + /* list of backref nodes with no child node */ + struct list_head leaves; + /* list of blocks that have been cowed in current transaction */ + struct list_head changed; + /* list of detached backref node. */ + struct list_head detached; + + u64 last_trans; + + int nr_nodes; + int nr_edges; +}; + +/* + * map address of tree root to tree + */ +struct mapping_node { + struct rb_node rb_node; + u64 bytenr; + void *data; +}; + +struct mapping_tree { + struct rb_root rb_root; + spinlock_t lock; +}; + +/* + * present a tree block to process + */ +struct tree_block { + struct rb_node rb_node; + u64 bytenr; + struct btrfs_key key; + unsigned int level:8; + unsigned int key_ready:1; +}; + +#define MAX_EXTENTS 128 + +struct file_extent_cluster { + u64 start; + u64 end; + u64 boundary[MAX_EXTENTS]; + unsigned int nr; +}; + +struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; + /* extent tree */ + struct btrfs_root *extent_root; + /* inode for moving data */ + struct inode *data_inode; + + struct btrfs_block_rsv *block_rsv; + + struct backref_cache backref_cache; + + struct file_extent_cluster cluster; + /* tree blocks have been processed */ + struct extent_io_tree processed_blocks; + /* map start of tree root to corresponding reloc tree */ + struct mapping_tree reloc_root_tree; + /* list of reloc trees */ + struct list_head reloc_roots; + /* size of metadata reservation for merging reloc trees */ + u64 merging_rsv_size; + /* size of relocated tree nodes */ + u64 nodes_relocated; + + u64 search_start; + u64 extents_found; + + unsigned int stage:8; + unsigned int create_reloc_tree:1; + unsigned int merge_reloc_tree:1; + unsigned int found_file_extent:1; + unsigned int commit_transaction:1; +}; + +/* stages of data relocation */ +#define MOVE_DATA_EXTENTS 0 +#define UPDATE_DATA_PTRS 1 + +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node); +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node); + +static void mapping_tree_init(struct mapping_tree *tree) +{ + tree->rb_root = RB_ROOT; + spin_lock_init(&tree->lock); +} + +static void backref_cache_init(struct backref_cache *cache) +{ + int i; + cache->rb_root = RB_ROOT; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); +} + +static void backref_cache_cleanup(struct backref_cache *cache) +{ + struct backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct backref_node, lower); + remove_backref_node(cache, node); + } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + 
BUG_ON(!list_empty(&cache->pending[i])); + BUG_ON(!list_empty(&cache->changed)); + BUG_ON(!list_empty(&cache->detached)); + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + BUG_ON(cache->nr_nodes); + BUG_ON(cache->nr_edges); +} + +static struct backref_node *alloc_backref_node(struct backref_cache *cache) +{ + struct backref_node *node; + + node = kzalloc(sizeof(*node), GFP_NOFS); + if (node) { + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + } + return node; +} + +static void free_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + if (node) { + cache->nr_nodes--; + kfree(node); + } +} + +static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) +{ + struct backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; +} + +static void free_backref_edge(struct backref_cache *cache, + struct backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct tree_entry *entry; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +/* + * walk up backref nodes until reach node presents tree root + */ +static struct backref_node *walk_up_backref(struct backref_node *node, + struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + int idx = *index; + + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, + struct backref_edge, list[LOWER]); + edges[idx++] = edge; + node = edge->node[UPPER]; + } + BUG_ON(node->detached); + *index = idx; + return node; +} + +/* + * walk down backref nodes to find start of next reference path + */ +static struct backref_node *walk_down_backref(struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + struct backref_node *lower; + int idx = *index; + + while (idx > 0) { + edge = edges[idx - 1]; + lower = edge->node[LOWER]; + if (list_is_last(&edge->list[LOWER], &lower->upper)) { + idx--; + continue; + } + edge = list_entry(edge->list[LOWER].next, + struct backref_edge, list[LOWER]); + edges[idx - 1] = edge; + *index = idx; + return edge->node[UPPER]; + } + *index = 0; + return NULL; +} + +static void unlock_node_buffer(struct backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + +static void drop_node_buffer(struct backref_node *node) +{ + if (node->eb) { + unlock_node_buffer(node); + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +static void drop_backref_node(struct backref_cache *tree, + struct backref_node *node) +{ + BUG_ON(!list_empty(&node->upper)); + + drop_node_buffer(node); + list_del(&node->list); + list_del(&node->lower); + if 
(!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + free_backref_node(tree, node); +} + +/* + * remove a backref node from the backref cache + */ +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + struct backref_node *upper; + struct backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest && !node->detached); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + free_backref_edge(cache, edge); + + if (RB_EMPTY_NODE(&upper->rb_node)) { + BUG_ON(!list_empty(&node->upper)); + drop_backref_node(cache, node); + node = upper; + node->lowest = 1; + continue; + } + /* + * add the node to leaf node list if no other + * child block cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, &cache->leaves); + upper->lowest = 1; + } + } + + drop_backref_node(cache, node); +} + +static void update_backref_node(struct backref_cache *cache, + struct backref_node *node, u64 bytenr) +{ + struct rb_node *rb_node; + rb_erase(&node->rb_node, &cache->rb_root); + node->bytenr = bytenr; + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + BUG_ON(rb_node); +} + +/* + * update backref cache after a transaction commit + */ +static int update_backref_cache(struct btrfs_trans_handle *trans, + struct backref_cache *cache) +{ + struct backref_node *node; + int level = 0; + + if (cache->last_trans == 0) { + cache->last_trans = trans->transid; + return 0; + } + + if (cache->last_trans == trans->transid) + return 0; + + /* + * detached nodes are used to avoid unnecessary backref + * lookup. transaction commit changes the extent tree. + * so the detached nodes are no longer useful. + */ + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->changed)) { + node = list_entry(cache->changed.next, + struct backref_node, list); + list_del_init(&node->list); + BUG_ON(node->pending); + update_backref_node(cache, node, node->new_bytenr); + } + + /* + * some nodes can be left in the pending list if there were + * errors during processing the pending nodes. + */ + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + list_for_each_entry(node, &cache->pending[level], list) { + BUG_ON(!node->pending); + if (node->bytenr == node->new_bytenr) + continue; + update_backref_node(cache, node, node->new_bytenr); + } + } + + cache->last_trans = 0; + return 1; +} + + +static int should_ignore_root(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + + if (!root->ref_cows) + return 0; + + reloc_root = root->reloc_root; + if (!reloc_root) + return 0; + + if (btrfs_root_last_snapshot(&reloc_root->root_item) == + root->fs_info->running_transaction->transid - 1) + return 0; + /* + * if there is reloc tree and it was created in previous + * transaction backref lookup can find the reloc tree, + * so backref node for the fs tree root is useless for + * relocation. 
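+ * A small usage sketch, added for exposition only: callers in
+ * build_backref_tree() handle an ignored root by parking the node on the
+ * 'useless' list instead of wiring it to the root, i.e.
+ *
+ *	if (should_ignore_root(root))
+ *		list_add(&cur->list, &useless);
+ *	else
+ *		cur->root = root;
+ *
+ * so the node is later detached or freed rather than merged.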
+ */ + return 1; +} +/* + * find reloc tree by address of tree root + */ +static struct btrfs_root *find_reloc_root(struct reloc_control *rc, + u64 bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct btrfs_root *root = NULL; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + root = (struct btrfs_root *)node->data; + } + spin_unlock(&rc->reloc_root_tree.lock); + return root; +} + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_key key; + + key.objectid = root_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(root_objectid)) + key.offset = 0; + else + key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &key); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static noinline_for_stack +struct btrfs_root *find_tree_root(struct reloc_control *rc, + struct extent_buffer *leaf, + struct btrfs_extent_ref_v0 *ref0) +{ + struct btrfs_root *root; + u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); + u64 generation = btrfs_ref_generation_v0(leaf, ref0); + + BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); + + root = read_fs_root(rc->extent_root->fs_info, root_objectid); + BUG_ON(IS_ERR(root)); + + if (root->ref_cows && + generation != btrfs_root_generation(&root->root_item)) + return NULL; + + return root; +} +#endif + +static noinline_for_stack +int find_inline_backref(struct extent_buffer *leaf, int slot, + unsigned long *ptr, unsigned long *end) +{ + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + u32 item_size; + + item_size = btrfs_item_size_nr(leaf, slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + return 1; + } +#endif + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + WARN_ON(!(btrfs_extent_flags(leaf, ei) & + BTRFS_EXTENT_FLAG_TREE_BLOCK)); + + if (item_size <= sizeof(*ei) + sizeof(*bi)) { + WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); + return 1; + } + + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + *end = (unsigned long)ei + item_size; + return 0; +} + +/* + * build backref tree for a given tree block. root of the backref tree + * corresponds the tree block, leaves of the backref tree correspond + * roots of b-trees that reference the tree block. + * + * the basic idea of this function is check backrefs of a given block + * to find upper level blocks that refernece the block, and then check + * bakcrefs of these upper level blocks recursively. the recursion stop + * when tree root is reached or backrefs for the block is cached. + * + * NOTE: if we find backrefs for a block are cached, we know backrefs + * for all upper level blocks that directly/indirectly reference the + * block are also cached. 
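+ * An illustrative outline, added for exposition only and not a
+ * substitute for the code below: the function is roughly a breadth-first
+ * walk over a work list of unchecked upper level blocks,
+ *
+ *	cur = alloc_backref_node(cache);         -- node for 'bytenr'
+ *	do {
+ *		look up the extent item backrefs of cur->bytenr;
+ *		for each tree block / shared block ref:
+ *			create an edge, queue any unchecked upper node;
+ *		cur = next node taken from the pending 'list';
+ *	} while (cur);
+ *	connect the nodes/edges and insert them into cache->rb_root;
+ *
+ * already-cached (checked) upper nodes stop the walk early.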
+ */ +static noinline_for_stack +struct backref_node *build_backref_tree(struct reloc_control *rc, + struct btrfs_key *node_key, + int level, u64 bytenr) +{ + struct backref_cache *cache = &rc->backref_cache; + struct btrfs_path *path1; + struct btrfs_path *path2; + struct extent_buffer *eb; + struct btrfs_root *root; + struct backref_node *cur; + struct backref_node *upper; + struct backref_node *lower; + struct backref_node *node = NULL; + struct backref_node *exist = NULL; + struct backref_edge *edge; + struct rb_node *rb_node; + struct btrfs_key key; + unsigned long end; + unsigned long ptr; + LIST_HEAD(list); + LIST_HEAD(useless); + int cowonly; + int ret; + int err = 0; + + path1 = btrfs_alloc_path(); + path2 = btrfs_alloc_path(); + if (!path1 || !path2) { + err = -ENOMEM; + goto out; + } + path1->reada = 1; + path2->reada = 2; + + node = alloc_backref_node(cache); + if (!node) { + err = -ENOMEM; + goto out; + } + + node->bytenr = bytenr; + node->level = level; + node->lowest = 1; + cur = node; +again: + end = 0; + ptr = 0; + key.objectid = cur->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path1->search_commit_root = 1; + path1->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, + 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(!ret || !path1->slots[0]); + + path1->slots[0]--; + + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * the backref was added previously when processing + * backref of type BTRFS_TREE_BLOCK_REF_KEY + */ + BUG_ON(!list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct backref_edge, + list[LOWER]); + BUG_ON(!list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * add the upper level block to pending list if we need + * check its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &list); + } else { + exist = NULL; + } + + while (1) { + cond_resched(); + eb = path1->nodes[0]; + + if (ptr >= end) { + if (path1->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + eb = path1->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); + if (key.objectid != cur->bytenr) { + WARN_ON(exist); + break; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = find_inline_backref(eb, path1->slots[0], + &ptr, &end); + if (ret) + goto next; + } + } + + if (ptr < end) { + /* update key for inline back ref */ + struct btrfs_extent_inline_ref *iref; + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && + key.type != BTRFS_SHARED_BLOCK_REF_KEY); + } + + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + goto next; + } + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(eb, path1->slots[0], + struct btrfs_extent_ref_v0); + if (key.objectid == key.offset) { + root = find_tree_root(rc, eb, ref0); + if (root && !should_ignore_root(root)) + cur->root = root; + else + list_add(&cur->list, &useless); + break; + } + if (is_cowonly_root(btrfs_ref_root_v0(eb, + ref0))) + 
cur->cowonly = 1; + } +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { +#endif + if (key.objectid == key.offset) { + /* + * only root blocks of reloc trees use + * backref of this type. + */ + root = find_reloc_root(rc, cur->bytenr); + BUG_ON(!root); + cur->root = root; + break; + } + + edge = alloc_backref_edge(cache); + if (!edge) { + err = -ENOMEM; + goto out; + } + rb_node = tree_search(&cache->rb_root, key.offset); + if (!rb_node) { + upper = alloc_backref_node(cache); + if (!upper) { + free_backref_edge(cache, edge); + err = -ENOMEM; + goto out; + } + upper->bytenr = key.offset; + upper->level = cur->level + 1; + /* + * backrefs for the upper level block isn't + * cached, add the block to pending list + */ + list_add_tail(&edge->list[UPPER], &list); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(!upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add_tail(&edge->list[LOWER], &cur->upper); + edge->node[LOWER] = cur; + edge->node[UPPER] = upper; + + goto next; + } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { + goto next; + } + + /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ + root = read_fs_root(rc->extent_root->fs_info, key.offset); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + if (!root->ref_cows) + cur->cowonly = 1; + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* tree root */ + BUG_ON(btrfs_root_bytenr(&root->root_item) != + cur->bytenr); + if (should_ignore_root(root)) + list_add(&cur->list, &useless); + else + cur->root = root; + break; + } + + level = cur->level + 1; + + /* + * searching the tree to find upper level blocks + * reference the block. + */ + path2->search_commit_root = 1; + path2->skip_locking = 1; + path2->lowest_level = level; + ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); + path2->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0 && path2->slots[level] > 0) + path2->slots[level]--; + + eb = path2->nodes[level]; + WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != + cur->bytenr); + + lower = cur; + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path2->nodes[level]) { + BUG_ON(btrfs_root_bytenr(&root->root_item) != + lower->bytenr); + if (should_ignore_root(root)) + list_add(&lower->list, &useless); + else + lower->root = root; + break; + } + + edge = alloc_backref_edge(cache); + if (!edge) { + err = -ENOMEM; + goto out; + } + + eb = path2->nodes[level]; + rb_node = tree_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = alloc_backref_node(cache); + if (!upper) { + free_backref_edge(cache, edge); + err = -ENOMEM; + goto out; + } + upper->bytenr = eb->start; + upper->owner = btrfs_header_owner(eb); + upper->level = lower->level + 1; + if (!root->ref_cows) + upper->cowonly = 1; + + /* + * if we know the block isn't shared + * we can void checking its backrefs. + */ + if (btrfs_block_can_be_shared(root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * add the block to pending list if we + * need check its backrefs. only block + * at 'cur->level + 1' is added to the + * tail of pending list. this guarantees + * we check backrefs from lower level + * blocks to upper level blocks. 
+ */ + if (!upper->checked && + level == cur->level + 1) { + list_add_tail(&edge->list[UPPER], + &list); + } else + INIT_LIST_HEAD(&edge->list[UPPER]); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(!upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); + } + list_add_tail(&edge->list[LOWER], &lower->upper); + edge->node[LOWER] = lower; + edge->node[UPPER] = upper; + + if (rb_node) + break; + lower = upper; + upper = NULL; + } + btrfs_release_path(path2); +next: + if (ptr < end) { + ptr += btrfs_extent_inline_ref_size(key.type); + if (ptr >= end) { + WARN_ON(ptr > end); + ptr = 0; + end = 0; + } + } + if (ptr >= end) + path1->slots[0]++; + } + btrfs_release_path(path1); + + cur->checked = 1; + WARN_ON(exist); + + /* the pending list isn't empty, take the first block to process */ + if (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + cur = edge->node[UPPER]; + goto again; + } + + /* + * everything goes well, connect backref nodes and insert backref nodes + * into the cache. + */ + BUG_ON(!node->checked); + cowonly = node->cowonly; + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, node->bytenr, + &node->rb_node); + BUG_ON(rb_node); + list_add_tail(&node->lower, &cache->leaves); + } + + list_for_each_entry(edge, &node->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + + while (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + if (upper->detached) { + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + continue; + } + + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + BUG_ON(!upper->checked); + BUG_ON(cowonly != upper->cowonly); + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + BUG_ON(rb_node); + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + } + /* + * process useless backref nodes. backref nodes for tree leaves + * are deleted from the cache. backref nodes for upper level + * tree blocks are left in the cache to avoid unnecessary backref + * lookup. 
+ */ + while (!list_empty(&useless)) { + upper = list_entry(useless.next, struct backref_node, list); + list_del_init(&upper->list); + BUG_ON(!list_empty(&upper->upper)); + if (upper == node) + node = NULL; + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + while (!list_empty(&upper->lower)) { + edge = list_entry(upper->lower.next, + struct backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + } + __mark_block_processed(rc, upper); + if (upper->level > 0) { + list_add(&upper->list, &cache->detached); + upper->detached = 1; + } else { + rb_erase(&upper->rb_node, &cache->rb_root); + free_backref_node(cache, upper); + } + } +out: + btrfs_free_path(path1); + btrfs_free_path(path2); + if (err) { + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, upper); + list_del_init(&lower->upper); + } + upper = node; + INIT_LIST_HEAD(&list); + while (upper) { + if (RB_EMPTY_NODE(&upper->rb_node)) { + list_splice_tail(&upper->upper, &list); + free_backref_node(cache, upper); + } + + if (list_empty(&list)) + break; + + edge = list_entry(list.next, struct backref_edge, + list[LOWER]); + list_del(&edge->list[LOWER]); + upper = edge->node[UPPER]; + free_backref_edge(cache, edge); + } + return ERR_PTR(err); + } + BUG_ON(node && node->detached); + return node; +} + +/* + * helper to add backref node for the newly created snapshot. + * the backref node is created by cloning backref node that + * corresponds to root of source tree + */ +static int clone_backref_node(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *src, + struct btrfs_root *dest) +{ + struct btrfs_root *reloc_root = src->reloc_root; + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node = NULL; + struct backref_node *new_node; + struct backref_edge *edge; + struct backref_edge *new_edge; + struct rb_node *rb_node; + + if (cache->last_trans > 0) + update_backref_cache(trans, cache); + + rb_node = tree_search(&cache->rb_root, src->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, rb_node); + if (node->detached) + node = NULL; + else + BUG_ON(node->new_bytenr != reloc_root->node->start); + } + + if (!node) { + rb_node = tree_search(&cache->rb_root, + reloc_root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(node->detached); + } + } + + if (!node) + return 0; + + new_node = alloc_backref_node(cache); + if (!new_node) + return -ENOMEM; + + new_node->bytenr = dest->node->start; + new_node->level = node->level; + new_node->lowest = node->lowest; + new_node->checked = 1; + new_node->root = dest; + + if (!node->lowest) { + list_for_each_entry(edge, &node->lower, list[UPPER]) { + new_edge = alloc_backref_edge(cache); + if (!new_edge) + goto fail; + + new_edge->node[UPPER] = new_node; + new_edge->node[LOWER] = edge->node[LOWER]; + list_add_tail(&new_edge->list[UPPER], + &new_node->lower); + } + } + + rb_node = tree_insert(&cache->rb_root, new_node->bytenr, + &new_node->rb_node); + BUG_ON(rb_node); + + if (!new_node->lowest) { + list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { + list_add_tail(&new_edge->list[LOWER], + &new_edge->node[LOWER]->upper); + } + } + return 0; +fail: + while (!list_empty(&new_node->lower)) { + new_edge = 
list_entry(new_node->lower.next, + struct backref_edge, list[UPPER]); + list_del(&new_edge->list[UPPER]); + free_backref_edge(cache, new_edge); + } + free_backref_node(cache, new_node); + return -ENOMEM; +} + +/* + * helper to add 'address of tree root -> reloc tree' mapping + */ +static int __add_reloc_root(struct btrfs_root *root) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + node = kmalloc(sizeof(*node), GFP_NOFS); + BUG_ON(!node); + + node->bytenr = root->node->start; + node->data = root; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + + list_add_tail(&root->root_list, &rc->reloc_roots); + return 0; +} + +/* + * helper to update/delete the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, int del) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + BUG_ON((struct btrfs_root *)node->data != root); + + if (!del) { + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = root->node->start; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + } else { + list_del_init(&root->root_list); + kfree(node); + } + return 0; +} + +static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = objectid; + + if (root->root_key.objectid == objectid) { + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + btrfs_set_root_last_snapshot(&root->root_item, + trans->transid - 1); + } else { + /* + * called by btrfs_reloc_post_snapshot_hook. + * the source tree is a reloc tree, all tree blocks + * modified after it was created have RELOC flag + * set in their headers. so it's OK to not update + * the 'last_snapshot'. 
+ */ + ret = btrfs_copy_root(trans, root, root->node, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + } + + memcpy(root_item, &root->root_item, sizeof(*root_item)); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + + if (root->root_key.objectid == objectid) { + btrfs_set_root_refs(root_item, 0); + memset(&root_item->drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + } + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(IS_ERR(reloc_root)); + reloc_root->last_trans = trans->transid; + return reloc_root; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + int clear_rsv = 0; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!trans->block_rsv) { + trans->block_rsv = rc->block_rsv; + clear_rsv = 1; + } + reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + if (clear_rsv) + trans->block_rsv = NULL; + + __add_reloc_root(reloc_root); + root->reloc_root = reloc_root; + return 0; +} + +/* + * update root item of reloc tree + */ +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + int del = 0; + int ret; + + if (!root->reloc_root) + goto out; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (root->fs_info->reloc_ctl->merge_reloc_tree && + btrfs_root_refs(root_item) == 0) { + root->reloc_root = NULL; + del = 1; + } + + __update_reloc_root(reloc_root, del); + + if (reloc_root->commit_root != reloc_root->node) { + btrfs_set_root_node(root_item, reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->commit_root = btrfs_root_node(reloc_root); + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, root_item); + BUG_ON(ret); + +out: + return 0; +} + +/* + * helper to find first cached inode with inode number >= objectid + * in a subvolume + */ +static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(&entry->vfs_inode)) + node = node->rb_left; + else if (objectid > btrfs_ino(&entry->vfs_inode)) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(&entry->vfs_inode)) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = igrab(&entry->vfs_inode); + if (inode) { + 
spin_unlock(&root->inode_lock); + return inode; + } + + objectid = btrfs_ino(&entry->vfs_inode) + 1; + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return NULL; +} + +static int in_block_group(u64 bytenr, + struct btrfs_block_group_cache *block_group) +{ + if (bytenr >= block_group->key.objectid && + bytenr < block_group->key.objectid + block_group->key.offset) + return 1; + return 0; +} + +/* + * get new location of data + */ +static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bytenr -= BTRFS_I(reloc_inode)->index_cnt; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode), + bytenr, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + BUG_ON(btrfs_file_extent_offset(leaf, fi) || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)); + + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { + ret = 1; + goto out; + } + + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * update file extent items in the tree leaf to point to + * the new locations. + */ +static noinline_for_stack +int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + u64 parent; + u64 bytenr; + u64 new_bytenr = 0; + u64 num_bytes; + u64 end; + u32 nritems; + u32 i; + int ret; + int first = 1; + int dirty = 0; + + if (rc->stage != UPDATE_DATA_PTRS) + return 0; + + /* reloc trees always use full backref */ + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent = leaf->start; + else + parent = 0; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + if (!in_block_group(bytenr, rc->block_group)) + continue; + + /* + * if we are modifying block in fs tree, wait for readpage + * to complete and drop the extent cache + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (first) { + inode = find_next_inode(root, key.objectid); + first = 0; + } else if (inode && btrfs_ino(inode) < key.objectid) { + btrfs_add_delayed_iput(inode); + inode = find_next_inode(root, key.objectid); + } + if (inode && btrfs_ino(inode) == key.objectid) { + end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + WARN_ON(!IS_ALIGNED(key.offset, + root->sectorsize)); + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, + GFP_NOFS); + if (!ret) + continue; + + btrfs_drop_extent_cache(inode, 
key.offset, end, + 1); + unlock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, GFP_NOFS); + } + } + + ret = get_new_location(rc->data_inode, &new_bytenr, + bytenr, num_bytes); + if (ret > 0) { + WARN_ON(1); + continue; + } + BUG_ON(ret < 0); + + btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); + dirty = 1; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, + num_bytes, parent, + btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + parent, btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + } + if (dirty) + btrfs_mark_buffer_dirty(leaf); + if (inode) + btrfs_add_delayed_iput(inode); + return 0; +} + +static noinline_for_stack +int memcmp_node_keys(struct extent_buffer *eb, int slot, + struct btrfs_path *path, int level) +{ + struct btrfs_disk_key key1; + struct btrfs_disk_key key2; + btrfs_node_key(eb, &key1, slot); + btrfs_node_key(path->nodes[level], &key2, path->slots[level]); + return memcmp(&key1, &key2, sizeof(key1)); +} + +/* + * try to replace tree blocks in fs tree with the new blocks + * in reloc tree. tree blocks haven't been modified since the + * reloc tree was create can be replaced. + * + * if a block was replaced, level of the block + 1 is returned. + * if no block got replaced, 0 is returned. if there are other + * errors, a negative error number is returned. + */ +static noinline_for_stack +int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + int lowest_level, int max_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 old_bytenr; + u64 new_bytenr; + u64 old_ptr_gen; + u64 new_ptr_gen; + u64 last_snapshot; + u32 blocksize; + int cow = 0; + int level; + int ret; + int slot; + + BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + + last_snapshot = btrfs_root_last_snapshot(&src->root_item); +again: + slot = path->slots[lowest_level]; + btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); + + eb = btrfs_lock_root_node(dest); + btrfs_set_lock_blocking(eb); + level = btrfs_header_level(eb); + + if (level < lowest_level) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return 0; + } + + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + } + btrfs_set_lock_blocking(eb); + + if (next_key) { + next_key->objectid = (u64)-1; + next_key->type = (u8)-1; + next_key->offset = (u64)-1; + } + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + BUG_ON(level < lowest_level); + + ret = btrfs_bin_search(parent, &key, level, &slot); + if (ret && slot > 0) + slot--; + + if (next_key && slot + 1 < btrfs_header_nritems(parent)) + btrfs_node_key_to_cpu(parent, next_key, slot + 1); + + old_bytenr = btrfs_node_blockptr(parent, slot); + blocksize = btrfs_level_size(dest, level - 1); + old_ptr_gen = btrfs_node_ptr_generation(parent, slot); + + if (level <= max_level) { + eb = path->nodes[level]; + new_bytenr = btrfs_node_blockptr(eb, + path->slots[level]); + new_ptr_gen = btrfs_node_ptr_generation(eb, + path->slots[level]); + } else { + new_bytenr = 0; + new_ptr_gen = 0; + } + + if (new_bytenr > 0 && new_bytenr == old_bytenr) { + WARN_ON(1); + ret = level; + break; + } + + if (new_bytenr == 0 || old_ptr_gen > last_snapshot || + 
memcmp_node_keys(parent, slot, path, level)) { + if (level <= lowest_level) { + ret = 0; + break; + } + + eb = read_tree_block(dest, old_bytenr, blocksize, + old_ptr_gen); + BUG_ON(!eb); + btrfs_tree_lock(eb); + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); + } + btrfs_set_lock_blocking(eb); + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + + parent = eb; + continue; + } + + if (!cow) { + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + cow = 1; + goto again; + } + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + btrfs_release_path(path); + + path->lowest_level = level; + ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + path->lowest_level = 0; + BUG_ON(ret); + + /* + * swap blocks in fs tree and reloc tree. + */ + btrfs_set_node_blockptr(parent, slot, new_bytenr); + btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); + btrfs_mark_buffer_dirty(parent); + + btrfs_set_node_blockptr(path->nodes[level], + path->slots[level], old_bytenr); + btrfs_set_node_ptr_generation(path->nodes[level], + path->slots[level], old_ptr_gen); + btrfs_mark_buffer_dirty(path->nodes[level]); + + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + btrfs_unlock_up_safe(path, 0); + + ret = level; + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return ret; +} + +/* + * helper to find next relocated block in reloc tree + */ +static noinline_for_stack +int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb; + int i; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = 0; i < *level; i++) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + + for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] + 1 < nritems) { + path->slots[i]++; + if (btrfs_node_ptr_generation(eb, path->slots[i]) <= + last_snapshot) + continue; + + *level = i; + return 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + return 1; +} + +/* + * walk down reloc tree to find relocated block of lowest level + */ +static noinline_for_stack +int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb = NULL; + int i; + u64 bytenr; + u64 ptr_gen = 0; + u64 last_snapshot; + u32 blocksize; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = *level; i > 0; i--) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] < nritems) { + ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); + if (ptr_gen > last_snapshot) + break; + path->slots[i]++; + } + if (path->slots[i] >= nritems) { + if (i == *level) + break; + *level = i + 1; + return 0; + } + if (i == 1) { + *level = i; + return 0; + } + + bytenr = btrfs_node_blockptr(eb, path->slots[i]); + blocksize = 
btrfs_level_size(root, i - 1); + eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + BUG_ON(btrfs_header_level(eb) != i - 1); + path->nodes[i - 1] = eb; + path->slots[i - 1] = 0; + } + return 1; +} + +/* + * invalidate extent cache for file extents whose key in range of + * [min_key, max_key) + */ +static int invalidate_extent_cache(struct btrfs_root *root, + struct btrfs_key *min_key, + struct btrfs_key *max_key) +{ + struct inode *inode = NULL; + u64 objectid; + u64 start, end; + u64 ino; + + objectid = min_key->objectid; + while (1) { + cond_resched(); + iput(inode); + + if (objectid > max_key->objectid) + break; + + inode = find_next_inode(root, objectid); + if (!inode) + break; + ino = btrfs_ino(inode); + + if (ino > max_key->objectid) { + iput(inode); + break; + } + + objectid = ino + 1; + if (!S_ISREG(inode->i_mode)) + continue; + + if (unlikely(min_key->objectid == ino)) { + if (min_key->type > BTRFS_EXTENT_DATA_KEY) + continue; + if (min_key->type < BTRFS_EXTENT_DATA_KEY) + start = 0; + else { + start = min_key->offset; + WARN_ON(!IS_ALIGNED(start, root->sectorsize)); + } + } else { + start = 0; + } + + if (unlikely(max_key->objectid == ino)) { + if (max_key->type < BTRFS_EXTENT_DATA_KEY) + continue; + if (max_key->type > BTRFS_EXTENT_DATA_KEY) { + end = (u64)-1; + } else { + if (max_key->offset == 0) + continue; + end = max_key->offset; + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + } + } else { + end = (u64)-1; + } + + /* the lock_extent waits for readpage to complete */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + btrfs_drop_extent_cache(inode, start, end, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + } + return 0; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + level++; + } + return 1; +} + +/* + * merge the relocated tree blocks in reloc tree with corresponding + * fs tree. 
+ */ +static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, + struct btrfs_root *root) +{ + LIST_HEAD(inode_list); + struct btrfs_key key; + struct btrfs_key next_key; + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long nr; + int level; + int max_level; + int replaced = 0; + int ret; + int err = 0; + u32 min_reserved; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_root_level(root_item); + extent_buffer_get(reloc_root->node); + path->nodes[level] = reloc_root->node; + path->slots[level] = 0; + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + + btrfs_node_key_to_cpu(path->nodes[level], &next_key, + path->slots[level]); + WARN_ON(memcmp(&key, &next_key, sizeof(key))); + + btrfs_unlock_up_safe(path, 0); + } + + min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + memset(&next_key, 0, sizeof(next_key)); + + while (1) { + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = rc->block_rsv; + + ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, + min_reserved, 0); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; + } + + replaced = 0; + max_level = level; + + ret = walk_down_reloc_tree(reloc_root, path, &level); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + + if (!find_next_key(path, level, &key) && + btrfs_comp_cpu_keys(&next_key, &key) >= 0) { + ret = 0; + } else { + ret = replace_path(trans, root, reloc_root, path, + &next_key, level, max_level); + } + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + level = ret; + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + replaced = 1; + } + + ret = walk_up_reloc_tree(reloc_root, path, &level); + if (ret > 0) + break; + + BUG_ON(level == 0); + /* + * save the merging progress in the drop_progress. + * this is OK since root refs == 1 in this case. + */ + btrfs_node_key(path->nodes[level], &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + } + + /* + * handle the case only one block in the fs tree need to be + * relocated and the block is tree root. 
+ */ + leaf = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + if (ret < 0) + err = ret; +out: + btrfs_free_path(path); + + if (err == 0) { + memset(&root_item->drop_progress, 0, + sizeof(root_item->drop_progress)); + root_item->drop_level = 0; + btrfs_set_root_refs(root_item, 0); + btrfs_update_reloc_root(trans, root); + } + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + + return err; +} + +static noinline_for_stack +int prepare_to_merge(struct reloc_control *rc, int err) +{ + struct btrfs_root *root = rc->extent_root; + struct btrfs_root *reloc_root; + struct btrfs_trans_handle *trans; + LIST_HEAD(reloc_roots); + u64 num_bytes = 0; + int ret; + + mutex_lock(&root->fs_info->reloc_mutex); + rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + rc->merging_rsv_size += rc->nodes_relocated * 2; + mutex_unlock(&root->fs_info->reloc_mutex); + +again: + if (!err) { + num_bytes = rc->merging_rsv_size; + ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, + num_bytes); + if (ret) + err = ret; + } + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + if (!err) + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + return PTR_ERR(trans); + } + + if (!err) { + if (num_bytes != rc->merging_rsv_size) { + btrfs_end_transaction(trans, rc->extent_root); + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + goto again; + } + } + + rc->merge_reloc_tree = 1; + + while (!list_empty(&rc->reloc_roots)) { + reloc_root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&reloc_root->root_list); + + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + /* + * set reference count to 1, so btrfs_recover_relocation + * knows it should resumes merging + */ + if (!err) + btrfs_set_root_refs(&reloc_root->root_item, 1); + btrfs_update_reloc_root(trans, root); + + list_add(&reloc_root->root_list, &reloc_roots); + } + + list_splice(&reloc_roots, &rc->reloc_roots); + + if (!err) + btrfs_commit_transaction(trans, rc->extent_root); + else + btrfs_end_transaction(trans, rc->extent_root); + return err; +} + +static noinline_for_stack +int merge_reloc_roots(struct reloc_control *rc) +{ + struct btrfs_root *root; + struct btrfs_root *reloc_root; + LIST_HEAD(reloc_roots); + int found = 0; + int ret; +again: + root = rc->extent_root; + + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + + while (!list_empty(&reloc_roots)) { + found = 1; + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + ret = merge_reloc_root(rc, root); + BUG_ON(ret); + } else { + list_del_init(&reloc_root->root_list); + } + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); + } + + if (found) { + found = 
0; + goto again; + } + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + return 0; +} + +static void free_block_list(struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + while ((rb_node = rb_first(blocks))) { + block = rb_entry(rb_node, struct tree_block, rb_node); + rb_erase(rb_node, blocks); + kfree(block); + } +} + +static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *reloc_root) +{ + struct btrfs_root *root; + + if (reloc_root->last_trans == trans->transid) + return 0; + + root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + return btrfs_record_root_in_trans(trans, root); +} + +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct backref_edge *edges[], int *nr) +{ + struct backref_node *next; + struct btrfs_root *root; + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + BUG_ON(!root->ref_cows); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + record_reloc_root_in_trans(trans, root); + break; + } + + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + + if (next->new_bytenr != root->node->start) { + BUG_ON(next->new_bytenr); + BUG_ON(!list_empty(&next->list)); + next->new_bytenr = root->node->start; + next->root = root; + list_add_tail(&next->list, + &rc->backref_cache.changed); + __mark_block_processed(rc, next); + break; + } + + WARN_ON(1); + root = NULL; + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + if (!root) + return NULL; + + *nr = index; + next = node; + /* setup backref node path for btrfs_reloc_cow_block */ + while (1) { + rc->backref_cache.path[next->level] = next; + if (--index < 0) + break; + next = edges[index]->node[UPPER]; + } + return root; +} + +/* + * select a tree root for relocation. return NULL if the block + * is reference counted. we should use do_relocation() in this + * case. return a tree root pointer if the block isn't reference + * counted. return -ENOENT if the block is root of reloc tree. 
+ */ +static noinline_for_stack +struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node) +{ + struct backref_node *next; + struct btrfs_root *root; + struct btrfs_root *fs_root = NULL; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + + /* no other choice for non-references counted tree */ + if (!root->ref_cows) + return root; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + fs_root = root; + + if (next != node) + return NULL; + + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!fs_root) + return ERR_PTR(-ENOENT); + return fs_root; +} + +static noinline_for_stack +u64 calcu_metadata_size(struct reloc_control *rc, + struct backref_node *node, int reserve) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + u64 num_bytes = 0; + int index = 0; + + BUG_ON(reserve && node->processed); + + while (next) { + cond_resched(); + while (1) { + if (next->processed && (reserve || next != node)) + break; + + num_bytes += btrfs_level_size(rc->extent_root, + next->level); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } + return num_bytes; +} + +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node) +{ + struct btrfs_root *root = rc->extent_root; + u64 num_bytes; + int ret; + + num_bytes = calcu_metadata_size(rc, node, 1) * 2; + + trans->block_rsv = rc->block_rsv; + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); + if (ret) { + if (ret == -EAGAIN) + rc->commit_transaction = 1; + return ret; + } + + return 0; +} + +static void release_metadata_space(struct reloc_control *rc, + struct backref_node *node) +{ + u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); +} + +/* + * relocate a block tree, and then update pointers in upper level + * blocks that reference the block to point to the new location. + * + * if called by link_to_upper, the block has already been relocated. + * in that case this function just updates pointers. 
+ */ +static int do_relocation(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path, int lowest) +{ + struct backref_node *upper; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_root *root; + struct extent_buffer *eb; + u32 blocksize; + u64 bytenr; + u64 generation; + int nr; + int slot; + int ret; + int err = 0; + + BUG_ON(lowest && node->eb); + + path->lowest_level = node->level + 1; + rc->backref_cache.path[node->level] = node; + list_for_each_entry(edge, &node->upper, list[LOWER]) { + cond_resched(); + + upper = edge->node[UPPER]; + root = select_reloc_root(trans, rc, upper, edges, &nr); + BUG_ON(!root); + + if (upper->eb && !upper->locked) { + if (!lowest) { + ret = btrfs_bin_search(upper->eb, key, + upper->level, &slot); + BUG_ON(ret); + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (node->eb->start == bytenr) + goto next; + } + drop_node_buffer(upper); + } + + if (!upper->eb) { + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + + if (!upper->eb) { + upper->eb = path->nodes[upper->level]; + path->nodes[upper->level] = NULL; + } else { + BUG_ON(upper->eb != path->nodes[upper->level]); + } + + upper->locked = 1; + path->locks[upper->level] = 0; + + slot = path->slots[upper->level]; + btrfs_release_path(path); + } else { + ret = btrfs_bin_search(upper->eb, key, upper->level, + &slot); + BUG_ON(ret); + } + + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (lowest) { + BUG_ON(bytenr != node->bytenr); + } else { + if (node->eb->start == bytenr) + goto next; + } + + blocksize = btrfs_level_size(root, node->level); + generation = btrfs_node_ptr_generation(upper->eb, slot); + eb = read_tree_block(root, bytenr, blocksize, generation); + if (!eb) { + err = -EIO; + goto next; + } + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + + if (!node->eb) { + ret = btrfs_cow_block(trans, root, eb, upper->eb, + slot, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + if (ret < 0) { + err = ret; + goto next; + } + BUG_ON(node->eb != eb); + } else { + btrfs_set_node_blockptr(upper->eb, slot, + node->eb->start); + btrfs_set_node_ptr_generation(upper->eb, slot, + trans->transid); + btrfs_mark_buffer_dirty(upper->eb); + + ret = btrfs_inc_extent_ref(trans, root, + node->eb->start, blocksize, + upper->eb->start, + btrfs_header_owner(upper->eb), + node->level, 0); + BUG_ON(ret); + + ret = btrfs_drop_subtree(trans, root, eb, upper->eb); + BUG_ON(ret); + } +next: + if (!upper->pending) + drop_node_buffer(upper); + else + unlock_node_buffer(upper); + if (err) + break; + } + + if (!err && node->pending) { + drop_node_buffer(node); + list_move_tail(&node->list, &rc->backref_cache.changed); + node->pending = 0; + } + + path->lowest_level = 0; + BUG_ON(err == -ENOSPC); + return err; +} + +static int link_to_upper(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_path *path) +{ + struct btrfs_key key; + + btrfs_node_key_to_cpu(node->eb, &key, 0); + return do_relocation(trans, rc, node, &key, path, 0); +} + +static int finish_pending_nodes(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_path *path, int err) +{ + LIST_HEAD(list); + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node; + int level; + int ret; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + while 
(!list_empty(&cache->pending[level])) { + node = list_entry(cache->pending[level].next, + struct backref_node, list); + list_move_tail(&node->list, &list); + BUG_ON(!node->pending); + + if (!err) { + ret = link_to_upper(trans, rc, node, path); + if (ret < 0) + err = ret; + } + } + list_splice_init(&list, &cache->pending[level]); + } + return err; +} + +static void mark_block_processed(struct reloc_control *rc, + u64 bytenr, u32 blocksize) +{ + set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, + EXTENT_DIRTY, GFP_NOFS); +} + +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node) +{ + u32 blocksize; + if (node->level == 0 || + in_block_group(node->bytenr, rc->block_group)) { + blocksize = btrfs_level_size(rc->extent_root, node->level); + mark_block_processed(rc, node->bytenr, blocksize); + } + node->processed = 1; +} + +/* + * mark a block and all blocks directly/indirectly reference the block + * as processed. + */ +static void update_processed_blocks(struct reloc_control *rc, + struct backref_node *node) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + while (next) { + cond_resched(); + while (1) { + if (next->processed) + break; + + __mark_block_processed(rc, next); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } +} + +static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) +{ + if (test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + return 1; + return 0; +} + +static int get_tree_block_key(struct reloc_control *rc, + struct tree_block *block) +{ + struct extent_buffer *eb; + + BUG_ON(block->key_ready); + eb = read_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + BUG_ON(!eb); + WARN_ON(btrfs_header_level(eb) != block->level); + if (block->level == 0) + btrfs_item_key_to_cpu(eb, &block->key, 0); + else + btrfs_node_key_to_cpu(eb, &block->key, 0); + free_extent_buffer(eb); + block->key_ready = 1; + return 0; +} + +static int reada_tree_block(struct reloc_control *rc, + struct tree_block *block) +{ + BUG_ON(block->key_ready); + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + return 0; +} + +/* + * helper function to relocate a tree block + */ +static int relocate_tree_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path) +{ + struct btrfs_root *root; + int release = 0; + int ret = 0; + + if (!node) + return 0; + + BUG_ON(node->processed); + root = select_one_root(trans, node); + if (root == ERR_PTR(-ENOENT)) { + update_processed_blocks(rc, node); + goto out; + } + + if (!root || root->ref_cows) { + ret = reserve_metadata_space(trans, rc, node); + if (ret) + goto out; + release = 1; + } + + if (root) { + if (root->ref_cows) { + BUG_ON(node->new_bytenr); + BUG_ON(!list_empty(&node->list)); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + node->new_bytenr = root->node->start; + node->root = root; + list_add_tail(&node->list, &rc->backref_cache.changed); + } else { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(path); + if (ret > 0) 
+ ret = 0; + } + if (!ret) + update_processed_blocks(rc, node); + } else { + ret = do_relocation(trans, rc, node, key, path, 1); + } +out: + if (ret || node->level == 0 || node->cowonly) { + if (release) + release_metadata_space(rc, node); + remove_backref_node(&rc->backref_cache, node); + } + return ret; +} + +/* + * relocate a list of blocks + */ +static noinline_for_stack +int relocate_tree_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct rb_root *blocks) +{ + struct backref_node *node; + struct btrfs_path *path; + struct tree_block *block; + struct rb_node *rb_node; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) + reada_tree_block(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) + get_tree_block_key(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + + node = build_backref_tree(rc, &block->key, + block->level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, &block->key, + path); + if (ret < 0) { + if (ret != -EAGAIN || rb_node == rb_first(blocks)) + err = ret; + goto out; + } + rb_node = rb_next(rb_node); + } +out: + free_block_list(blocks); + err = finish_pending_nodes(trans, rc, path, err); + + btrfs_free_path(path); + return err; +} + +static noinline_for_stack +int prealloc_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 alloc_hint = 0; + u64 start; + u64 end; + u64 offset = BTRFS_I(inode)->index_cnt; + u64 num_bytes; + int nr = 0; + int ret = 0; + + BUG_ON(cluster->start != cluster->boundary[0]); + mutex_lock(&inode->i_mutex); + + ret = btrfs_check_data_free_space(inode, cluster->end + + 1 - cluster->start); + if (ret) + goto out; + + while (nr < cluster->nr) { + start = cluster->boundary[nr] - offset; + if (nr + 1 < cluster->nr) + end = cluster->boundary[nr + 1] - 1 - offset; + else + end = cluster->end - offset; + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + num_bytes = end + 1 - start; + ret = btrfs_prealloc_file_range(inode, 0, start, + num_bytes, num_bytes, + end + 1, &alloc_hint); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + if (ret) + break; + nr++; + } + btrfs_free_reserved_data_space(inode, cluster->end + + 1 - cluster->start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static noinline_for_stack +int setup_extent_mapping(struct inode *inode, u64 start, u64 end, + u64 block_start) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret = 0; + + em = alloc_extent_map(); + if (!em) + return -ENOMEM; + + em->start = start; + em->len = end + 1 - start; + em->block_len = em->len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, 
start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 page_start; + u64 page_end; + u64 offset = BTRFS_I(inode)->index_cnt; + unsigned long index; + unsigned long last_index; + struct page *page; + struct file_ra_state *ra; + int nr = 0; + int ret = 0; + + if (!cluster->nr) + return 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + ret = prealloc_file_extent_cluster(inode, cluster); + if (ret) + goto out; + + file_ra_state_init(ra, inode->i_mapping); + + ret = setup_extent_mapping(inode, cluster->start - offset, + cluster->end - offset, cluster->start); + if (ret) + goto out; + + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + while (index <= last_index) { + ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + page = find_lock_page(inode->i_mapping, index); + if (!page) { + page_cache_sync_readahead(inode->i_mapping, + ra, NULL, index, + last_index + 1 - index); + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); + ret = -ENOMEM; + goto out; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, + ra, NULL, page, index, + last_index + 1 - index); + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); + ret = -EIO; + goto out; + } + } + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + + lock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); + + set_page_extent_mapped(page); + + if (nr < cluster->nr && + page_start + offset == cluster->boundary[nr]) { + set_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + nr++; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); + set_page_dirty(page); + + unlock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + + index++; + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(BTRFS_I(inode)->root); + } + WARN_ON(nr != cluster->nr); +out: + kfree(ra); + return ret; +} + +static noinline_for_stack +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) +{ + int ret; + + if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + + if (!cluster->nr) + cluster->start = extent_key->objectid; + else + BUG_ON(cluster->nr >= MAX_EXTENTS); + cluster->end = extent_key->objectid + extent_key->offset - 1; + cluster->boundary[cluster->nr] = extent_key->objectid; + cluster->nr++; + + if (cluster->nr >= MAX_EXTENTS) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + return 0; +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int get_ref_objectid_v0(struct reloc_control *rc, + struct btrfs_path *path, + struct btrfs_key *extent_key, + u64 *ref_objectid, int *path_change) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref_v0 *ref0; + int ret; + int 
slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + while (1) { + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (path_change) + *path_change = 1; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != extent_key->objectid) + return -ENOENT; + + if (key.type != BTRFS_EXTENT_REF_V0_KEY) { + slot++; + continue; + } + ref0 = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_ref_v0); + *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + return 0; +} +#endif + +/* + * helper to add a tree block to the list. + * the major work is getting the generation and level of the block + */ +static int add_tree_block(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + struct tree_block *block; + struct rb_node *rb_node; + u32 item_size; + int level = -1; + int generation; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (item_size >= sizeof(*ei) + sizeof(*bi)) { + ei = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_item); + bi = (struct btrfs_tree_block_info *)(ei + 1); + generation = btrfs_extent_generation(eb, ei); + level = btrfs_tree_block_level(eb, bi); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int ret; + + BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, extent_key, + &ref_owner, NULL); + if (ret < 0) + return ret; + BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); + level = (int)ref_owner; + /* FIXME: get real generation */ + generation = 0; +#else + BUG(); +#endif + } + + btrfs_release_path(path); + + BUG_ON(level == -1); + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) + return -ENOMEM; + + block->bytenr = extent_key->objectid; + block->key.objectid = extent_key->offset; + block->key.offset = generation; + block->level = level; + block->key_ready = 0; + + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + + return 0; +} + +/* + * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY + */ +static int __add_tree_block(struct reloc_control *rc, + u64 bytenr, u32 blocksize, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + if (tree_block_processed(bytenr, blocksize, rc)) + return 0; + + if (tree_search(blocks, bytenr)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret); + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ret = add_tree_block(rc, &key, path, blocks); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to check if the block use full backrefs for pointers in it + */ +static int block_use_full_backref(struct reloc_control *rc, + struct extent_buffer *eb) +{ + u64 flags; + int ret; + + if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || + btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) + return 1; + + ret = btrfs_lookup_extent_info(NULL, rc->extent_root, + eb->start, eb->len, NULL, &flags); + BUG_ON(ret); + + if (flags & 
BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = 1; + else + ret = 0; + return ret; +} + +static int delete_block_group_cache(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 ino) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + unsigned long nr; + int ret = 0; + + if (inode) + goto truncate; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { + if (inode && !IS_ERR(inode)) + iput(inode); + return -ENOENT; + } + +truncate: + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_free_path(path); + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + + btrfs_free_path(path); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +out: + iput(inode); + return ret; +} + +/* + * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY + * this function scans fs tree to find blocks reference the data extent + */ +static int find_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct tree_block *block; + struct btrfs_root *root; + struct btrfs_file_extent_item *fi; + struct rb_node *rb_node; + struct btrfs_key key; + u64 ref_root; + u64 ref_objectid; + u64 ref_offset; + u32 ref_count; + u32 nritems; + int err = 0; + int added = 0; + int counted; + int ret; + + ref_root = btrfs_extent_data_ref_root(leaf, ref); + ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); + ref_offset = btrfs_extent_data_ref_offset(leaf, ref); + ref_count = btrfs_extent_data_ref_count(leaf, ref); + + /* + * This is an extent belonging to the free space cache, lets just delete + * it and redo the search. 
+ */ + if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { + ret = delete_block_group_cache(rc->extent_root->fs_info, + NULL, ref_objectid); + if (ret != -ENOENT) + return ret; + ret = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + root = read_fs_root(rc->extent_root->fs_info, ref_root); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + key.objectid = ref_objectid; + key.offset = ref_offset; + key.type = BTRFS_EXTENT_DATA_KEY; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + /* + * the references in tree blocks that use full backrefs + * are not counted in + */ + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + + while (ref_count > 0) { + while (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + WARN_ON(1); + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + added = 0; + + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) { + WARN_ON(1); + break; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + goto next; + + if (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid) + goto next; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + if (key.offset != ref_offset) + goto next; + + if (counted) + ref_count--; + if (added) + goto next; + + if (!tree_block_processed(leaf->start, leaf->len, rc)) { + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = leaf->start; + btrfs_item_key_to_cpu(leaf, &block->key, 0); + block->level = 0; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, + &block->rb_node); + BUG_ON(rb_node); + } + if (counted) + added = 1; + else + path->slots[0] = nritems; +next: + path->slots[0]++; + + } +out: + btrfs_free_path(path); + return err; +} + +/* + * hepler to find all tree blocks that reference a given data extent + */ +static noinline_for_stack +int add_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_data_ref *dref; + struct btrfs_extent_inline_ref *iref; + unsigned long ptr; + unsigned long end; + u32 blocksize = btrfs_level_size(rc->extent_root, 0); + int ret; + int err = 0; + + eb = path->nodes[0]; + ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + end = ptr + btrfs_item_size_nr(eb, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ptr + sizeof(struct btrfs_extent_item_v0) == end) + ptr = end; + else +#endif + ptr += sizeof(struct btrfs_extent_item); + + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + key.offset 
= btrfs_extent_inline_ref_offset(eb, iref); + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + BUG(); + } + ptr += btrfs_extent_inline_ref_size(key.type); + } + WARN_ON(ptr > end); + + while (1) { + cond_resched(); + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) { + err = ret; + break; + } + if (ret > 0) + break; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_key->objectid) + break; + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_DATA_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { +#endif + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + ret = 0; + } + if (ret) { + err = ret; + break; + } + path->slots[0]++; + } + btrfs_release_path(path); + if (err) + free_block_list(blocks); + return err; +} + +/* + * hepler to find next unprocessed extent + */ +static noinline_for_stack +int find_next_extent(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u64 start, end, last; + int ret; + + last = rc->block_group->key.objectid + rc->block_group->key.offset; + while (1) { + cond_resched(); + if (rc->search_start >= last) { + ret = 1; + break; + } + + key.objectid = rc->search_start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, + 0, 0); + if (ret < 0) + break; +next: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid >= last) { + ret = 1; + break; + } + + if (key.type != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= rc->search_start) { + path->slots[0]++; + goto next; + } + + ret = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY); + + if (ret == 0 && start <= key.objectid) { + btrfs_release_path(path); + rc->search_start = end + 1; + } else { + rc->search_start = key.objectid + key.offset; + memcpy(extent_key, &key, sizeof(key)); + return 0; + } + } + btrfs_release_path(path); + return ret; +} + +static void set_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + + mutex_lock(&fs_info->reloc_mutex); + fs_info->reloc_ctl = rc; + mutex_unlock(&fs_info->reloc_mutex); +} + +static void unset_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + + mutex_lock(&fs_info->reloc_mutex); + fs_info->reloc_ctl = NULL; + mutex_unlock(&fs_info->reloc_mutex); +} + +static int check_extent_flags(u64 flags) +{ + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if (!(flags & 
BTRFS_EXTENT_FLAG_DATA) && + !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + return 1; + return 0; +} + +static noinline_for_stack +int prepare_to_relocate(struct reloc_control *rc) +{ + struct btrfs_trans_handle *trans; + int ret; + + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); + if (!rc->block_rsv) + return -ENOMEM; + + /* + * reserve some space for creating reloc trees. + * btrfs_init_reloc_root will use them when there + * is no reservation in transaction handle. + */ + ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + rc->extent_root->nodesize * 256); + if (ret) + return ret; + + rc->block_rsv->refill_used = 1; + btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); + + memset(&rc->cluster, 0, sizeof(rc->cluster)); + rc->search_start = rc->block_group->key.objectid; + rc->extents_found = 0; + rc->nodes_relocated = 0; + rc->merging_rsv_size = 0; + + rc->create_reloc_tree = 1; + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + BUG_ON(IS_ERR(trans)); + btrfs_commit_transaction(trans, rc->extent_root); + return 0; +} + +static noinline_for_stack int relocate_block_group(struct reloc_control *rc) +{ + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + unsigned long nr; + u64 flags; + u32 item_size; + int ret; + int err = 0; + int progress = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + ret = prepare_to_relocate(rc); + if (ret) { + err = ret; + goto out_free; + } + + while (1) { + progress++; + trans = btrfs_start_transaction(rc->extent_root, 0); + BUG_ON(IS_ERR(trans)); +restart: + if (update_backref_cache(trans, &rc->backref_cache)) { + btrfs_end_transaction(trans, rc->extent_root); + continue; + } + + ret = find_next_extent(trans, rc, path, &key); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rc->extents_found++; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + if (item_size >= sizeof(*ei)) { + flags = btrfs_extent_flags(path->nodes[0], ei); + ret = check_extent_flags(flags); + BUG_ON(ret); + + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int path_change = 0; + + BUG_ON(item_size != + sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, + &path_change); + if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) + flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else + flags = BTRFS_EXTENT_FLAG_DATA; + + if (path_change) { + btrfs_release_path(path); + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + } +#else + BUG(); +#endif + } + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = add_tree_block(rc, &key, path, &blocks); + } else if (rc->stage == UPDATE_DATA_PTRS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + ret = add_data_references(rc, &key, path, &blocks); + } else { + btrfs_release_path(path); + ret = 0; + } + if (ret < 0) { + err = ret; + break; + } + + if (!RB_EMPTY_ROOT(&blocks)) { + ret = relocate_tree_blocks(trans, rc, &blocks); + if (ret < 0) { + if (ret != -EAGAIN) { + err = ret; + break; + } + rc->extents_found--; + rc->search_start = key.objectid; + } + } + + ret = 
btrfs_block_rsv_check(trans, rc->extent_root, + rc->block_rsv, 0, 5); + if (ret < 0) { + if (ret != -EAGAIN) { + err = ret; + WARN_ON(1); + break; + } + rc->commit_transaction = 1; + } + + if (rc->commit_transaction) { + rc->commit_transaction = 0; + ret = btrfs_commit_transaction(trans, rc->extent_root); + BUG_ON(ret); + } else { + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + trans = NULL; + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; + ret = relocate_data_extent(rc->data_inode, + &key, &rc->cluster); + if (ret < 0) { + err = ret; + break; + } + } + } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } + + btrfs_release_path(path); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); + + if (trans) { + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + + if (!err) { + ret = relocate_file_extent_cluster(rc->data_inode, + &rc->cluster); + if (ret < 0) + err = ret; + } + + rc->create_reloc_tree = 0; + set_reloc_control(rc); + + backref_cache_cleanup(&rc->backref_cache); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); + + err = prepare_to_merge(rc, err); + + merge_reloc_roots(rc); + + rc->merge_reloc_tree = 0; + unset_reloc_control(rc); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); + + /* get rid of pinned extents */ + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: + btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); + btrfs_free_path(path); + return err; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, 0); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to create inode for data relocation. 
+ * the inode is in data relocation tree and its link count is 0 + */ +static noinline_for_stack +struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key key; + unsigned long nr; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return ERR_CAST(trans); + + err = btrfs_find_free_objectid(root, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid); + BUG_ON(err); + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +static struct reloc_control *alloc_reloc_control(void) +{ + struct reloc_control *rc; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return NULL; + + INIT_LIST_HEAD(&rc->reloc_roots); + backref_cache_init(&rc->backref_cache); + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, NULL); + return rc; +} + +/* + * function to relocate all extents in a block group. + */ +int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct reloc_control *rc; + struct inode *inode; + struct btrfs_path *path; + int ret; + int rw = 0; + int err = 0; + + rc = alloc_reloc_control(); + if (!rc) + return -ENOMEM; + + rc->extent_root = extent_root; + + rc->block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!rc->block_group); + + if (!rc->block_group->ro) { + ret = btrfs_set_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; + } + rw = 1; + } + + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group, + path); + btrfs_free_path(path); + + if (!IS_ERR(inode)) + ret = delete_block_group_cache(fs_info, inode, 0); + else + ret = PTR_ERR(inode); + + if (ret && ret != -ENOENT) { + err = ret; + goto out; + } + + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } + + printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + (unsigned long long)rc->block_group->key.objectid, + (unsigned long long)rc->block_group->flags); + + btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); + + while (1) { + mutex_lock(&fs_info->cleaner_mutex); + + btrfs_clean_old_snapshots(fs_info->tree_root); + ret = relocate_block_group(rc); + + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + err = ret; + goto out; + } + + if (rc->extents_found == 0) + break; + + printk(KERN_INFO "btrfs: found %llu extents\n", + (unsigned long long)rc->extents_found); + + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); + 
invalidate_mapping_pages(rc->data_inode->i_mapping, + 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } + } + + filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, + rc->block_group->key.objectid, + rc->block_group->key.objectid + + rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); +out: + if (err && rw) + btrfs_set_block_group_rw(extent_root, rc->block_group); + iput(rc->data_inode); + btrfs_put_block_group(rc->block_group); + kfree(rc); + return err; +} + +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_start_transaction(root->fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); + + memset(&root->root_item.drop_progress, 0, + sizeof(root->root_item.drop_progress)); + root->root_item.drop_level = 0; + btrfs_set_root_refs(&root->root_item, 0); + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + BUG_ON(ret); + + ret = btrfs_end_transaction(trans, root->fs_info->tree_root); + BUG_ON(ret); + return 0; +} + +/* + * recover relocation interrupted by system crash. + * + * this function resumes merging reloc trees with corresponding fs trees. + * this is important for keeping the sharing of tree blocks + */ +int btrfs_recover_relocation(struct btrfs_root *root) +{ + LIST_HEAD(reloc_roots); + struct btrfs_key key; + struct btrfs_root *fs_root; + struct btrfs_root *reloc_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct reloc_control *rc = NULL; + struct btrfs_trans_handle *trans; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = -1; + + key.objectid = BTRFS_TREE_RELOC_OBJECTID; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, + path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + + if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || + key.type != BTRFS_ROOT_ITEM_KEY) + break; + + reloc_root = btrfs_read_fs_root_no_radix(root, &key); + if (IS_ERR(reloc_root)) { + err = PTR_ERR(reloc_root); + goto out; + } + + list_add(&reloc_root->root_list, &reloc_roots); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + if (ret != -ENOENT) { + err = ret; + goto out; + } + mark_garbage_root(reloc_root); + } + } + + if (key.offset == 0) + break; + + key.offset--; + } + btrfs_release_path(path); + + if (list_empty(&reloc_roots)) + goto out; + + rc = alloc_reloc_control(); + if (!rc) { + err = -ENOMEM; + goto out; + } + + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + err = PTR_ERR(trans); + goto out_free; + } + + rc->merge_reloc_tree = 1; + + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + + if (btrfs_root_refs(&reloc_root->root_item) == 0) { + list_add_tail(&reloc_root->root_list, + &rc->reloc_roots); + continue; + } + + fs_root = 
read_fs_root(root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(fs_root)); + + __add_reloc_root(reloc_root); + fs_root->reloc_root = reloc_root; + } + + btrfs_commit_transaction(trans, rc->extent_root); + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: + kfree(rc); +out: + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); + } + btrfs_free_path(path); + + if (err == 0) { + /* cleanup orphan inode in data relocation tree */ + fs_root = read_fs_root(root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(fs_root)) + err = PTR_ERR(fs_root); + else + err = btrfs_orphan_cleanup(fs_root); + } + return err; +} + +/* + * helper to add ordered checksum for data relocation. + * + * cloning checksum properly handles the nodatasum extents. + * it also saves CPU time to re-calculate the checksum. + */ +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + size_t offset; + int ret; + u64 disk_bytenr; + LIST_HEAD(list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list, 0); + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return ret; +} + +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow) +{ + struct reloc_control *rc; + struct backref_node *node; + int first_cow = 0; + int level; + int ret; + + rc = root->fs_info->reloc_ctl; + if (!rc) + return; + + BUG_ON(rc->stage == UPDATE_DATA_PTRS && + root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + + level = btrfs_header_level(buf); + if (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item)) + first_cow = 1; + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && + rc->create_reloc_tree) { + WARN_ON(!first_cow && level == 0); + + node = rc->backref_cache.path[level]; + BUG_ON(node->bytenr != buf->start && + node->new_bytenr != buf->start); + + drop_node_buffer(node); + extent_buffer_get(cow); + node->eb = cow; + node->new_bytenr = cow->start; + + if (!node->pending) { + list_move_tail(&node->list, + &rc->backref_cache.pending[level]); + node->pending = 1; + } + + if (first_cow) + __mark_block_processed(rc, node); + + if (first_cow && level > 0) + rc->nodes_relocated += buf->len; + } + + if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_file_extents(trans, rc, root, cow); + BUG_ON(ret); + } +} + +/* + * called before 
creating snapshot. it calculates metadata reservation + * requried for relocating tree blocks in the snapshot + */ +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct reloc_control *rc; + + root = pending->root; + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + if (!rc->merge_reloc_tree) + return; + + root = root->reloc_root; + BUG_ON(btrfs_root_refs(&root->root_item) == 0); + /* + * relocation is in the stage of merging trees. the space + * used by merging a reloc tree is twice the size of + * relocated tree nodes in the worst case. half for cowing + * the reloc tree, half for cowing the fs tree. the space + * used by cowing the reloc tree will be freed after the + * tree is dropped. if we create snapshot, cowing the fs + * tree may use more space than it frees. so we need + * reserve extra space. + */ + *bytes_to_reserve += rc->nodes_relocated; +} + +/* + * called after snapshot is created. migrate block reservation + * and create reloc root for the newly created snapshot + */ +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *reloc_root; + struct btrfs_root *new_root; + struct reloc_control *rc; + int ret; + + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + rc->merging_rsv_size += rc->nodes_relocated; + + if (rc->merge_reloc_tree) { + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + rc->block_rsv, + rc->nodes_relocated); + BUG_ON(ret); + } + + new_root = pending->snap; + reloc_root = create_reloc_root(trans, root->reloc_root, + new_root->root_key.objectid); + + __add_reloc_root(reloc_root); + new_root->reloc_root = reloc_root; + + if (rc->create_reloc_tree) { + ret = clone_backref_node(trans, rc, root, reloc_root); + BUG_ON(ret); + } +} diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 00000000..ebe45443 --- /dev/null +++ b/fs/btrfs/root-tree.c @@ -0,0 +1,450 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + +/* + * lookup the root with the highest offset for a given objectid. The key we do + * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 + * on error. 
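+ *
+ * Illustrative usage sketch (not copied from an existing call site; the
+ * caller context and variable names are assumptions made for explanation):
+ *
+ *	struct btrfs_root_item item;
+ *	struct btrfs_key key;
+ *	int ret;
+ *
+ *	ret = btrfs_find_last_root(tree_root, objectid, &item, &key);
+ *	if (ret == 0)
+ *		key.offset now holds the highest offset stored for objectid
+ *
+ * The search below starts from offset (u64)-1 and steps back one slot, so
+ * the first matching item found walking backwards is the highest-offset one.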
+ */ +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + struct btrfs_root_item *item, struct btrfs_key *key) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + struct btrfs_key found_key; + struct extent_buffer *l; + int ret; + int slot; + + search_key.objectid = objectid; + search_key.type = BTRFS_ROOT_ITEM_KEY; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + + BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = 1; + goto out; + } + l = path->nodes[0]; + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != objectid || + found_key.type != BTRFS_ROOT_ITEM_KEY) { + ret = 1; + goto out; + } + if (item) + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + if (key) + memcpy(key, &found_key, sizeof(found_key)); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) +{ + btrfs_set_root_bytenr(item, node->start); + btrfs_set_root_level(item, btrfs_header_level(node)); + btrfs_set_root_generation(item, btrfs_header_generation(node)); + return 0; +} + +/* + * copy the data in 'item' into the btree + */ +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + struct btrfs_path *path; + struct extent_buffer *l; + int ret; + int slot; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + (unsigned long long)key->objectid, key->type, + (unsigned long long)key->offset); + BUG_ON(1); + } + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + write_extent_buffer(l, item, ptr, sizeof(*item)); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + int ret; + ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); + return ret; +} + +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed. This is any root item with an + * offset lower than the latest root. They need to be queued for deletion to + * finish what was happening when we crashed. 
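+ *
+ * Concretely, the loop below walks every ROOT_ITEM with the given objectid;
+ * any item whose on-disk refs count is zero is re-read via
+ * btrfs_read_fs_root_no_radix() and queued with btrfs_add_dead_root() so
+ * the cleaner can finish dropping it.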
+ */ +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) +{ + struct btrfs_root *dead_root; + struct btrfs_root_item *ri; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = 0; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) + goto next; + + if (key.objectid < objectid) + goto next; + + if (key.objectid > objectid) + break; + + ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); + if (btrfs_disk_root_refs(leaf, ri) != 0) + goto next; + + memcpy(&found_key, &key, sizeof(key)); + key.offset++; + btrfs_release_path(path); + dead_root = + btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &found_key); + if (IS_ERR(dead_root)) { + ret = PTR_ERR(dead_root); + goto err; + } + + ret = btrfs_add_dead_root(dead_root); + if (ret) + goto err; + goto again; +next: + slot++; + path->slots[0]++; + } + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_orphan_roots(struct btrfs_root *tree_root) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key root_key; + struct btrfs_root *root; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = 0; + + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(tree_root, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + + if (key.objectid != BTRFS_ORPHAN_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + root_key.objectid = key.offset; + key.offset++; + + root = btrfs_read_fs_root_no_name(tree_root->fs_info, + &root_key); + if (!IS_ERR(root)) + continue; + + ret = PTR_ERR(root); + if (ret != -ENOENT) { + err = ret; + break; + } + + ret = btrfs_find_dead_roots(tree_root, root_key.objectid); + if (ret) { + err = ret; + break; + } + } + + btrfs_free_path(path); + return err; +} + +/* drop the root item for 'key' from 'root' */ +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +{ + struct btrfs_path *path; + int ret; + struct btrfs_root_item *ri; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, key, path, -1, 1); + if (ret < 0) + goto out; + + BUG_ON(ret != 0); + leaf = path->nodes[0]; + ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_del_root_ref(struct btrfs_trans_handle 
*trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len) + +{ + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + BUG_ON(ret < 0); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + + WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); + WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); + ptr = (unsigned long)(ref + 1); + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); + *sequence = btrfs_root_ref_sequence(leaf, ref); + + ret = btrfs_del_item(trans, tree_root, path); + if (ret) { + err = ret; + goto out; + } + } else + err = -ENOENT; + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + +out: + btrfs_free_path(path); + return err; +} + +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id) +{ + struct btrfs_key key; + int ret; + + key.objectid = root_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = ref_id; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + return ret; +} + +/* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. + * + * The dirid, sequence, name and name_len refer to the directory entry + * that is referencing the root. + * + * For a forward ref, the root_id is the id of the tree referencing + * the root and ref_id is the id of the subvol or snapshot. + * + * For a back ref the root_id is the id of the subvol or snapshot and + * ref_id is the id of the tree referencing it. + */ +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root_id; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = ref_id; +again: + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + btrfs_set_root_ref_dirid(leaf, ref, dirid); + btrfs_set_root_ref_sequence(leaf, ref, sequence); + btrfs_set_root_ref_name_len(leaf, ref, name_len); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + + btrfs_free_path(path); + return 0; +} + +/* + * Old btrfs forgets to init root_item->flags and root_item->byte_limit + * for subvolumes. To work around this problem, we steal a bit from + * root_item->inode_item->flags, and use it to indicate if those fields + * have been properly initialized. 
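+ *
+ * The borrowed bit is BTRFS_INODE_ROOT_ITEM_INIT: once it is set, later
+ * code may trust root_item->flags and root_item->byte_limit; until then
+ * they are forced to zero here.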
+ */ +void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) +{ + u64 inode_flags = le64_to_cpu(root_item->inode.flags); + + if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { + inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; + root_item->inode.flags = cpu_to_le64(inode_flags); + root_item->flags = 0; + root_item->byte_limit = 0; + } +} diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c new file mode 100644 index 00000000..a8d03d5e --- /dev/null +++ b/fs/btrfs/scrub.c @@ -0,0 +1,1395 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "ordered-data.h" + +/* + * This is only the first step towards a full-features scrub. It reads all + * extent and super block and verifies the checksums. In case a bad checksum + * is found or the extent cannot be read, good data will be written back if + * any can be found. + * + * Future enhancements: + * - To enhance the performance, better read-ahead strategies for the + * extent-tree can be employed. + * - In case an unrepairable extent is encountered, track which files are + * affected and report them + * - In case of a read error on files with nodatasum, map the file and read + * the extent to trigger a writeback of the good copy + * - track and record media errors, throw out bad devices + * - add a mode to also read unallocated space + * - make the prefetch cancellable + */ + +struct scrub_bio; +struct scrub_page; +struct scrub_dev; +static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_checksum(struct btrfs_work *work); +static int scrub_checksum_data(struct scrub_dev *sdev, + struct scrub_page *spag, void *buffer); +static int scrub_checksum_tree_block(struct scrub_dev *sdev, + struct scrub_page *spag, u64 logical, + void *buffer); +static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); +static int scrub_fixup_check(struct scrub_bio *sbio, int ix); +static void scrub_fixup_end_io(struct bio *bio, int err); +static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, + struct page *page); +static void scrub_fixup(struct scrub_bio *sbio, int ix); + +#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ +#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ + +struct scrub_page { + u64 flags; /* extent flags */ + u64 generation; + u64 mirror_num; + int have_csum; + u8 csum[BTRFS_CSUM_SIZE]; +}; + +struct scrub_bio { + int index; + struct scrub_dev *sdev; + struct bio *bio; + int err; + u64 logical; + u64 physical; + struct scrub_page spag[SCRUB_PAGES_PER_BIO]; + u64 count; + int next_free; + struct btrfs_work work; +}; + +struct scrub_dev { + struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; + struct btrfs_device *dev; + int first_free; + int curr; + atomic_t in_flight; + spinlock_t list_lock; + wait_queue_head_t list_wait; + u16 csum_size; + 
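+ /*
+  * data checksums read ahead for the range being scrubbed (filled by
+  * btrfs_lookup_csums_range() in scrub_stripe()); scrub_find_csum()
+  * consumes and frees the entries as pages are queued
+  */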
struct list_head csum_list; + atomic_t cancel_req; + int readonly; + /* + * statistics + */ + struct btrfs_scrub_progress stat; + spinlock_t stat_lock; +}; + +static void scrub_free_csums(struct scrub_dev *sdev) +{ + while (!list_empty(&sdev->csum_list)) { + struct btrfs_ordered_sum *sum; + sum = list_first_entry(&sdev->csum_list, + struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } +} + +static void scrub_free_bio(struct bio *bio) +{ + int i; + struct page *last_page = NULL; + + if (!bio) + return; + + for (i = 0; i < bio->bi_vcnt; ++i) { + if (bio->bi_io_vec[i].bv_page == last_page) + continue; + last_page = bio->bi_io_vec[i].bv_page; + __free_page(last_page); + } + bio_put(bio); +} + +static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +{ + int i; + + if (!sdev) + return; + + for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { + struct scrub_bio *sbio = sdev->bios[i]; + + if (!sbio) + break; + + scrub_free_bio(sbio->bio); + kfree(sbio); + } + + scrub_free_csums(sdev); + kfree(sdev); +} + +static noinline_for_stack +struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) +{ + struct scrub_dev *sdev; + int i; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + + sdev = kzalloc(sizeof(*sdev), GFP_NOFS); + if (!sdev) + goto nomem; + sdev->dev = dev; + for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { + struct scrub_bio *sbio; + + sbio = kzalloc(sizeof(*sbio), GFP_NOFS); + if (!sbio) + goto nomem; + sdev->bios[i] = sbio; + + sbio->index = i; + sbio->sdev = sdev; + sbio->count = 0; + sbio->work.func = scrub_checksum; + + if (i != SCRUB_BIOS_PER_DEV-1) + sdev->bios[i]->next_free = i + 1; + else + sdev->bios[i]->next_free = -1; + } + sdev->first_free = 0; + sdev->curr = -1; + atomic_set(&sdev->in_flight, 0); + atomic_set(&sdev->cancel_req, 0); + sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + INIT_LIST_HEAD(&sdev->csum_list); + + spin_lock_init(&sdev->list_lock); + spin_lock_init(&sdev->stat_lock); + init_waitqueue_head(&sdev->list_wait); + return sdev; + +nomem: + scrub_free_dev(sdev); + return ERR_PTR(-ENOMEM); +} + +/* + * scrub_recheck_error gets called when either verification of the page + * failed or the bio failed to read, e.g. with EIO. 
In the latter case, + * recheck_error gets called for every page in the bio, even though only + * one may be bad + */ +static void scrub_recheck_error(struct scrub_bio *sbio, int ix) +{ + if (sbio->err) { + if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, + (sbio->physical + ix * PAGE_SIZE) >> 9, + sbio->bio->bi_io_vec[ix].bv_page) == 0) { + if (scrub_fixup_check(sbio, ix) == 0) + return; + } + } + + scrub_fixup(sbio, ix); +} + +static int scrub_fixup_check(struct scrub_bio *sbio, int ix) +{ + int ret = 1; + struct page *page; + void *buffer; + u64 flags = sbio->spag[ix].flags; + + page = sbio->bio->bi_io_vec[ix].bv_page; + buffer = kmap_atomic(page, KM_USER0); + if (flags & BTRFS_EXTENT_FLAG_DATA) { + ret = scrub_checksum_data(sbio->sdev, + sbio->spag + ix, buffer); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = scrub_checksum_tree_block(sbio->sdev, + sbio->spag + ix, + sbio->logical + ix * PAGE_SIZE, + buffer); + } else { + WARN_ON(1); + } + kunmap_atomic(buffer, KM_USER0); + + return ret; +} + +static void scrub_fixup_end_io(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static void scrub_fixup(struct scrub_bio *sbio, int ix) +{ + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_multi_bio *multi = NULL; + u64 logical = sbio->logical + ix * PAGE_SIZE; + u64 length; + int i; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); + + if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && + (sbio->spag[ix].have_csum == 0)) { + /* + * nodatasum, don't try to fix anything + * FIXME: we can do better, open the inode and trigger a + * writeback + */ + goto uncorrectable; + } + + length = PAGE_SIZE; + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, + &multi, 0); + if (ret || !multi || length < PAGE_SIZE) { + printk(KERN_ERR + "scrub_fixup: btrfs_map_block failed us for %llu\n", + (unsigned long long)logical); + WARN_ON(1); + return; + } + + if (multi->num_stripes == 1) + /* there aren't any replicas */ + goto uncorrectable; + + /* + * first find a good copy + */ + for (i = 0; i < multi->num_stripes; ++i) { + if (i == sbio->spag[ix].mirror_num) + continue; + + if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, + multi->stripes[i].physical >> 9, + sbio->bio->bi_io_vec[ix].bv_page)) { + /* I/O-error, this is not a good copy */ + continue; + } + + if (scrub_fixup_check(sbio, ix) == 0) + break; + } + if (i == multi->num_stripes) + goto uncorrectable; + + if (!sdev->readonly) { + /* + * bi_io_vec[ix].bv_page now contains good data, write it back + */ + if (scrub_fixup_io(WRITE, sdev->dev->bdev, + (sbio->physical + ix * PAGE_SIZE) >> 9, + sbio->bio->bi_io_vec[ix].bv_page)) { + /* I/O-error, writeback failed, give up */ + goto uncorrectable; + } + } + + kfree(multi); + spin_lock(&sdev->stat_lock); + ++sdev->stat.corrected_errors; + spin_unlock(&sdev->stat_lock); + + if (printk_ratelimit()) + printk(KERN_ERR "btrfs: fixed up at %llu\n", + (unsigned long long)logical); + return; + +uncorrectable: + kfree(multi); + spin_lock(&sdev->stat_lock); + ++sdev->stat.uncorrectable_errors; + spin_unlock(&sdev->stat_lock); + + if (printk_ratelimit()) + printk(KERN_ERR "btrfs: unable to fixup at %llu\n", + (unsigned long long)logical); +} + +static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, + struct page *page) +{ + struct bio *bio = NULL; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = 
bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, PAGE_SIZE, 0); + bio->bi_end_io = scrub_fixup_end_io; + bio->bi_private = &complete; + submit_bio(rw, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + + ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ret; +} + +static void scrub_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); +} + +static void scrub_checksum(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_dev *sdev = sbio->sdev; + struct page *page; + void *buffer; + int i; + u64 flags; + u64 logical; + int ret; + + if (sbio->err) { + for (i = 0; i < sbio->count; ++i) + scrub_recheck_error(sbio, i); + + sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bio->bi_phys_segments = 0; + sbio->bio->bi_idx = 0; + + for (i = 0; i < sbio->count; i++) { + struct bio_vec *bi; + bi = &sbio->bio->bi_io_vec[i]; + bi->bv_offset = 0; + bi->bv_len = PAGE_SIZE; + } + + spin_lock(&sdev->stat_lock); + ++sdev->stat.read_errors; + spin_unlock(&sdev->stat_lock); + goto out; + } + for (i = 0; i < sbio->count; ++i) { + page = sbio->bio->bi_io_vec[i].bv_page; + buffer = kmap_atomic(page, KM_USER0); + flags = sbio->spag[i].flags; + logical = sbio->logical + i * PAGE_SIZE; + ret = 0; + if (flags & BTRFS_EXTENT_FLAG_DATA) { + ret = scrub_checksum_data(sdev, sbio->spag + i, buffer); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = scrub_checksum_tree_block(sdev, sbio->spag + i, + logical, buffer); + } else if (flags & BTRFS_EXTENT_FLAG_SUPER) { + BUG_ON(i); + (void)scrub_checksum_super(sbio, buffer); + } else { + WARN_ON(1); + } + kunmap_atomic(buffer, KM_USER0); + if (ret) + scrub_recheck_error(sbio, i); + } + +out: + scrub_free_bio(sbio->bio); + sbio->bio = NULL; + spin_lock(&sdev->list_lock); + sbio->next_free = sdev->first_free; + sdev->first_free = sbio->index; + spin_unlock(&sdev->list_lock); + atomic_dec(&sdev->in_flight); + wake_up(&sdev->list_wait); +} + +static int scrub_checksum_data(struct scrub_dev *sdev, + struct scrub_page *spag, void *buffer) +{ + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + struct btrfs_root *root = sdev->dev->dev_root; + + if (!spag->have_csum) + return 0; + + crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, spag->csum, sdev->csum_size)) + fail = 1; + + spin_lock(&sdev->stat_lock); + ++sdev->stat.data_extents_scrubbed; + sdev->stat.data_bytes_scrubbed += PAGE_SIZE; + if (fail) + ++sdev->stat.csum_errors; + spin_unlock(&sdev->stat_lock); + + return fail; +} + +static int scrub_checksum_tree_block(struct scrub_dev *sdev, + struct scrub_page *spag, u64 logical, + void *buffer) +{ + struct btrfs_header *h; + struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + int crc_fail = 0; + + /* + * we don't use the getter functions here, as we + * a) don't have an extent buffer and + * b) the page is already kmapped + */ + h = (struct btrfs_header *)buffer; + + if (logical != le64_to_cpu(h->bytenr)) + ++fail; + + if (spag->generation != 
le64_to_cpu(h->generation)) + ++fail; + + if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + ++fail; + + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) + ++fail; + + crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, + PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, sdev->csum_size)) + ++crc_fail; + + spin_lock(&sdev->stat_lock); + ++sdev->stat.tree_extents_scrubbed; + sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; + if (crc_fail) + ++sdev->stat.csum_errors; + if (fail) + ++sdev->stat.verify_errors; + spin_unlock(&sdev->stat_lock); + + return fail || crc_fail; +} + +static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) +{ + struct btrfs_super_block *s; + u64 logical; + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + + s = (struct btrfs_super_block *)buffer; + logical = sbio->logical; + + if (logical != le64_to_cpu(s->bytenr)) + ++fail; + + if (sbio->spag[0].generation != le64_to_cpu(s->generation)) + ++fail; + + if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + ++fail; + + crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, + PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, s->csum, sbio->sdev->csum_size)) + ++fail; + + if (fail) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sdev->stat_lock); + ++sdev->stat.super_errors; + spin_unlock(&sdev->stat_lock); + } + + return fail; +} + +static int scrub_submit(struct scrub_dev *sdev) +{ + struct scrub_bio *sbio; + struct bio *bio; + int i; + + if (sdev->curr == -1) + return 0; + + sbio = sdev->bios[sdev->curr]; + + bio = bio_alloc(GFP_NOFS, sbio->count); + if (!bio) + goto nomem; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + + for (i = 0; i < sbio->count; ++i) { + struct page *page; + int ret; + + page = alloc_page(GFP_NOFS); + if (!page) + goto nomem; + + ret = bio_add_page(bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + goto nomem; + } + } + + sbio->err = 0; + sdev->curr = -1; + atomic_inc(&sdev->in_flight); + + submit_bio(READ, bio); + + return 0; + +nomem: + scrub_free_bio(bio); + + return -ENOMEM; +} + +static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, u64 mirror_num, + u8 *csum, int force) +{ + struct scrub_bio *sbio; + +again: + /* + * grab a fresh bio or wait for one to become available + */ + while (sdev->curr == -1) { + spin_lock(&sdev->list_lock); + sdev->curr = sdev->first_free; + if (sdev->curr != -1) { + sdev->first_free = sdev->bios[sdev->curr]->next_free; + sdev->bios[sdev->curr]->next_free = -1; + sdev->bios[sdev->curr]->count = 0; + spin_unlock(&sdev->list_lock); + } else { + spin_unlock(&sdev->list_lock); + wait_event(sdev->list_wait, sdev->first_free != -1); + } + } + sbio = sdev->bios[sdev->curr]; + if (sbio->count == 0) { + sbio->physical = physical; + sbio->logical = logical; + } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || + sbio->logical + sbio->count * PAGE_SIZE != logical) { + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; + } + sbio->spag[sbio->count].flags = flags; + sbio->spag[sbio->count].generation = gen; + 
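+ /*
+  * per-page bookkeeping consumed later by scrub_checksum(); the checksum
+  * itself is only copied in below when the caller supplied one
+  */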
sbio->spag[sbio->count].have_csum = 0; + sbio->spag[sbio->count].mirror_num = mirror_num; + if (csum) { + sbio->spag[sbio->count].have_csum = 1; + memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); + } + ++sbio->count; + if (sbio->count == SCRUB_PAGES_PER_BIO || force) { + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; + } + + return 0; +} + +static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, + u8 *csum) +{ + struct btrfs_ordered_sum *sum = NULL; + int ret = 0; + unsigned long i; + unsigned long num_sectors; + u32 sectorsize = sdev->dev->dev_root->sectorsize; + + while (!list_empty(&sdev->csum_list)) { + sum = list_first_entry(&sdev->csum_list, + struct btrfs_ordered_sum, list); + if (sum->bytenr > logical) + return 0; + if (sum->bytenr + sum->len > logical) + break; + + ++sdev->stat.csum_discards; + list_del(&sum->list); + kfree(sum); + sum = NULL; + } + if (!sum) + return 0; + + num_sectors = sum->len / sectorsize; + for (i = 0; i < num_sectors; ++i) { + if (sum->sums[i].bytenr == logical) { + memcpy(csum, &sum->sums[i].sum, sdev->csum_size); + ret = 1; + break; + } + } + if (ret && i == num_sectors - 1) { + list_del(&sum->list); + kfree(sum); + } + return ret; +} + +/* scrub extent tries to collect up to 64 kB for each bio */ +static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, u64 mirror_num) +{ + int ret; + u8 csum[BTRFS_CSUM_SIZE]; + + while (len) { + u64 l = min_t(u64, len, PAGE_SIZE); + int have_csum = 0; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + /* push csums to sbio */ + have_csum = scrub_find_csum(sdev, logical, l, csum); + if (have_csum == 0) + ++sdev->stat.no_csum; + } + ret = scrub_page(sdev, logical, l, physical, flags, gen, + mirror_num, have_csum ? 
csum : NULL, 0); + if (ret) + return ret; + len -= l; + logical += l; + physical += l; + } + return 0; +} + +static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, + struct map_lookup *map, int num, u64 base, u64 length) +{ + struct btrfs_path *path; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_extent_item *extent; + struct blk_plug plug; + u64 flags; + int ret; + int slot; + int i; + u64 nstripes; + int start_stripe; + struct extent_buffer *l; + struct btrfs_key key; + u64 physical; + u64 logical; + u64 generation; + u64 mirror_num; + + u64 increment = map->stripe_len; + u64 offset; + + nstripes = length; + offset = 0; + do_div(nstripes, map->stripe_len); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + offset = map->stripe_len * num; + increment = map->stripe_len * map->num_stripes; + mirror_num = 0; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + offset = map->stripe_len * (num / map->sub_stripes); + increment = map->stripe_len * factor; + mirror_num = num % map->sub_stripes; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + increment = map->stripe_len; + mirror_num = num % map->num_stripes; + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + increment = map->stripe_len; + mirror_num = num % map->num_stripes; + } else { + increment = map->stripe_len; + mirror_num = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; + + /* + * find all extents for each stripe and just read them to get + * them into the page cache + * FIXME: we can do better. build a more intelligent prefetching + */ + logical = base + offset; + physical = map->stripes[num].physical; + ret = 0; + for (i = 0; i < nstripes; ++i) { + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out_noplug; + + /* + * we might miss half an extent here, but that doesn't matter, + * as it's only the prefetch + */ + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out_noplug; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid >= logical + map->stripe_len) + break; + + path->slots[0]++; + } + btrfs_release_path(path); + logical += increment; + physical += map->stripe_len; + cond_resched(); + } + + /* + * collect all data csums for the stripe to avoid seeking during + * the scrub. This might currently (crc32) end up to be about 1MB + */ + start_stripe = 0; + blk_start_plug(&plug); +again: + logical = base + offset + start_stripe * increment; + for (i = start_stripe; i < nstripes; ++i) { + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sdev->csum_list, 1); + if (ret) + goto out; + + logical += increment; + cond_resched(); + } + /* + * now find all extents for each stripe and scrub them + */ + logical = base + offset + start_stripe * increment; + physical = map->stripes[num].physical + start_stripe * map->stripe_len; + ret = 0; + for (i = start_stripe; i < nstripes; ++i) { + /* + * canceled? 
+ */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sdev->cancel_req)) { + ret = -ECANCELED; + goto out; + } + /* + * check to see if we have to pause + */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* push queued extents */ + scrub_submit(sdev); + wait_event(sdev->list_wait, + atomic_read(&sdev->in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); + scrub_free_csums(sdev); + start_stripe = i; + goto again; + } + + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, 0, + BTRFS_EXTENT_ITEM_KEY); + if (ret < 0) + goto out; + if (ret > 0) { + /* there's no smaller item, so stick with the + * larger one */ + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid + key.offset <= logical) + goto next; + + if (key.objectid >= logical + map->stripe_len) + break; + + if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) + goto next; + + extent = btrfs_item_ptr(l, slot, + struct btrfs_extent_item); + flags = btrfs_extent_flags(l, extent); + generation = btrfs_extent_generation(l, extent); + + if (key.objectid < logical && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + printk(KERN_ERR + "btrfs scrub: tree block %llu spanning " + "stripes, ignored. logical=%llu\n", + (unsigned long long)key.objectid, + (unsigned long long)logical); + goto next; + } + + /* + * trim extent to this stripe + */ + if (key.objectid < logical) { + key.offset -= logical - key.objectid; + key.objectid = logical; + } + if (key.objectid + key.offset > + logical + map->stripe_len) { + key.offset = logical + map->stripe_len - + key.objectid; + } + + ret = scrub_extent(sdev, key.objectid, key.offset, + key.objectid - logical + physical, + flags, generation, mirror_num); + if (ret) + goto out; + +next: + path->slots[0]++; + } + btrfs_release_path(path); + logical += increment; + physical += map->stripe_len; + spin_lock(&sdev->stat_lock); + sdev->stat.last_physical = physical; + spin_unlock(&sdev->stat_lock); + } + /* push queued extents */ + scrub_submit(sdev); + +out: + blk_finish_plug(&plug); +out_noplug: + btrfs_free_path(path); + return ret < 0 ? 
ret : 0; +} + +static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, + u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length) +{ + struct btrfs_mapping_tree *map_tree = + &sdev->dev->dev_root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + int i; + int ret = -EINVAL; + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + read_unlock(&map_tree->map_tree.lock); + + if (!em) + return -EINVAL; + + map = (struct map_lookup *)em->bdev; + if (em->start != chunk_offset) + goto out; + + if (em->len < length) + goto out; + + for (i = 0; i < map->num_stripes; ++i) { + if (map->stripes[i].dev == sdev->dev) { + ret = scrub_stripe(sdev, map, i, chunk_offset, length); + if (ret) + goto out; + } + } +out: + free_extent_map(em); + + return ret; +} + +static noinline_for_stack +int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) +{ + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_block_group_cache *cache; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = sdev->dev->devid; + key.offset = 0ull; + key.type = BTRFS_DEV_EXTENT_KEY; + + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + } + + l = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.objectid != sdev->dev->devid) + break; + + if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset >= end) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (found_key.offset + length <= start) { + key.offset = found_key.offset + length; + btrfs_release_path(path); + continue; + } + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + + /* + * get a reference on the corresponding block group to prevent + * the chunk from going away while we scrub it + */ + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!cache) { + ret = -ENOENT; + break; + } + ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, + chunk_offset, length); + btrfs_put_block_group(cache); + if (ret) + break; + + key.offset = found_key.offset + length; + btrfs_release_path(path); + } + + btrfs_free_path(path); + + /* + * ret can still be 1 from search_slot or next_leaf, + * that's not an error + */ + return ret < 0 ? 
ret : 0; +} + +static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) +{ + int i; + u64 bytenr; + u64 gen; + int ret; + struct btrfs_device *device = sdev->dev; + struct btrfs_root *root = device->dev_root; + + gen = root->fs_info->last_trans_committed; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + break; + + ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, + BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + if (ret) + return ret; + } + wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + + return 0; +} + +/* + * get a reference count on fs_info->scrub_workers. start worker if necessary + */ +static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + if (fs_info->scrub_workers_refcnt == 0) { + btrfs_init_workers(&fs_info->scrub_workers, "scrub", + fs_info->thread_pool_size, &fs_info->generic_worker); + fs_info->scrub_workers.idle_thresh = 4; + btrfs_start_workers(&fs_info->scrub_workers, 1); + } + ++fs_info->scrub_workers_refcnt; + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + if (--fs_info->scrub_workers_refcnt == 0) + btrfs_stop_workers(&fs_info->scrub_workers); + WARN_ON(fs_info->scrub_workers_refcnt < 0); + mutex_unlock(&fs_info->scrub_lock); +} + + +int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, + struct btrfs_scrub_progress *progress, int readonly) +{ + struct scrub_dev *sdev; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + struct btrfs_device *dev; + + if (btrfs_fs_closing(root->fs_info)) + return -EINVAL; + + /* + * check some assumptions + */ + if (root->sectorsize != PAGE_SIZE || + root->sectorsize != root->leafsize || + root->sectorsize != root->nodesize) { + printk(KERN_ERR "btrfs_scrub: size assumptions fail\n"); + return -EINVAL; + } + + ret = scrub_workers_get(root); + if (ret) + return ret; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(root, devid, NULL, NULL); + if (!dev || dev->missing) { + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + scrub_workers_put(root); + return -ENODEV; + } + mutex_lock(&fs_info->scrub_lock); + + if (!dev->in_fs_metadata) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + scrub_workers_put(root); + return -ENODEV; + } + + if (dev->scrub_device) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + scrub_workers_put(root); + return -EINPROGRESS; + } + sdev = scrub_setup_dev(dev); + if (IS_ERR(sdev)) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + scrub_workers_put(root); + return PTR_ERR(sdev); + } + sdev->readonly = readonly; + dev->scrub_device = sdev; + + atomic_inc(&fs_info->scrubs_running); + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + down_read(&fs_info->scrub_super_lock); + ret = scrub_supers(sdev); + up_read(&fs_info->scrub_super_lock); + + if (!ret) + ret = scrub_enumerate_chunks(sdev, start, end); + + wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + + atomic_dec(&fs_info->scrubs_running); + 
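+ /*
+  * dropping scrubs_running lets btrfs_scrub_pause() and btrfs_scrub_cancel()
+  * waiters make progress, hence the wake_up that follows
+  */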
wake_up(&fs_info->scrub_pause_wait); + + if (progress) + memcpy(progress, &sdev->stat, sizeof(*progress)); + + mutex_lock(&fs_info->scrub_lock); + dev->scrub_device = NULL; + mutex_unlock(&fs_info->scrub_lock); + + scrub_free_dev(sdev); + scrub_workers_put(root); + + return ret; +} + +int btrfs_scrub_pause(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrub_pause_req); + while (atomic_read(&fs_info->scrubs_paused) != + atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_paused) == + atomic_read(&fs_info->scrubs_running)); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_continue(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + atomic_dec(&fs_info->scrub_pause_req); + wake_up(&fs_info->scrub_pause_wait); + return 0; +} + +int btrfs_scrub_pause_super(struct btrfs_root *root) +{ + down_write(&root->fs_info->scrub_super_lock); + return 0; +} + +int btrfs_scrub_continue_super(struct btrfs_root *root) +{ + up_write(&root->fs_info->scrub_super_lock); + return 0; +} + +int btrfs_scrub_cancel(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->scrub_lock); + if (!atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + + atomic_inc(&fs_info->scrub_cancel_req); + while (atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrubs_running) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrub_cancel_req); + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} + +int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct scrub_dev *sdev; + + mutex_lock(&fs_info->scrub_lock); + sdev = dev->scrub_device; + if (!sdev) { + mutex_unlock(&fs_info->scrub_lock); + return -ENOTCONN; + } + atomic_inc(&sdev->cancel_req); + while (dev->scrub_device) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + dev->scrub_device == NULL); + mutex_lock(&fs_info->scrub_lock); + } + mutex_unlock(&fs_info->scrub_lock); + + return 0; +} +int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device *dev; + int ret; + + /* + * we have to hold the device_list_mutex here so the device + * does not go away in cancel_dev. FIXME: find a better solution + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(root, devid, NULL, NULL); + if (!dev) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return -ENODEV; + } + ret = btrfs_scrub_cancel_dev(root, dev); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + return ret; +} + +int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, + struct btrfs_scrub_progress *progress) +{ + struct btrfs_device *dev; + struct scrub_dev *sdev = NULL; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(root, devid, NULL, NULL); + if (dev) + sdev = dev->scrub_device; + if (sdev) + memcpy(progress, &sdev->stat, sizeof(*progress)); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + return dev ? (sdev ? 
0 : -ENOTCONN) : -ENODEV; +} diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c new file mode 100644 index 00000000..c0f7ecaf --- /dev/null +++ b/fs/btrfs/struct-funcs.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include + +/* this is some deeply nasty code. ctree.h has a different + * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions + * + * This file declares the macros and then #includes ctree.h, which results + * in cpp creating the function here based on the template below. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. + * + * The extent buffer api is used to do all the kmapping and page + * spanning work required to get extent buffers in highmem and have + * a metadata blocksize different from the page size. + * + * The macro starts with a simple function prototype declaration so that + * sparse won't complain about it being static. 
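+ *
+ * As an illustration (the real invocations live in ctree.h; this one is
+ * shown here only as an example), an invocation along the lines of
+ *
+ *	BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64)
+ *
+ * expands into
+ *
+ *	u64 btrfs_root_ref_dirid(struct extent_buffer *eb,
+ *				 struct btrfs_root_ref *s);
+ *	void btrfs_set_root_ref_dirid(struct extent_buffer *eb,
+ *				      struct btrfs_root_ref *s, u64 val);
+ *
+ * i.e. a getter and a setter that read and write the little-endian 'dirid'
+ * member in place inside the extent buffer's pages.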
+ */ + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + return le##bits##_to_cpu(p->member); \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + return res; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + } \ +} + +#include "ctree.h" + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + if (eb->map_token && ptr >= eb->map_start && + ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { + memcpy(disk_key, eb->kaddr + ptr - eb->map_start, + sizeof(*disk_key)); + return; + } else if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, KM_USER1); + eb->map_token = NULL; + } + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 00000000..15634d46 --- /dev/null +++ b/fs/btrfs/super.c @@ -0,0 +1,1305 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "delayed-inode.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "xattr.h" +#include "volumes.h" +#include "version.h" +#include "export.h" +#include "compression.h" + +#define CREATE_TRACE_POINTS +#include + +static const struct super_operations btrfs_super_ops; + +static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + errstr = "Readonly filesystem"; + break; + default: + if (nbuf) { + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +static void __save_error_info(struct btrfs_fs_info *fs_info) +{ + /* + * today we only save the error info into ram. Long term we'll + * also send it down to the disk + */ + fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; +} + +/* NOTE: + * We move write_super stuff at umount in order to avoid deadlock + * for umount hold all lock. + */ +static void save_error_info(struct btrfs_fs_info *fs_info) +{ + __save_error_info(fs_info); +} + +/* btrfs handle error by forcing the filesystem readonly */ +static void btrfs_handle_error(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + if (sb->s_flags & MS_RDONLY) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + sb->s_flags |= MS_RDONLY; + printk(KERN_INFO "btrfs is forced readonly\n"); + } +} + +/* + * __btrfs_std_error decodes expected errors from the caller and + * invokes the approciate error response. + */ +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno) +{ + struct super_block *sb = fs_info->sb; + char nbuf[16]; + const char *errstr; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(fs_info); + + btrfs_handle_error(fs_info); +} + +static void btrfs_put_super(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + sb->s_fs_info = NULL; + + (void)ret; /* FIXME: need to fix VFS to return error? 
*/ +} + +enum { + Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, + Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, + Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, + Opt_compress_type, Opt_compress_force, Opt_compress_force_type, + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, + Opt_inode_cache, Opt_err, +}; + +static match_table_t tokens = { + {Opt_degraded, "degraded"}, + {Opt_subvol, "subvol=%s"}, + {Opt_subvolid, "subvolid=%d"}, + {Opt_device, "device=%s"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, + {Opt_compress_type, "compress=%s"}, + {Opt_compress_force, "compress-force"}, + {Opt_compress_force_type, "compress-force=%s"}, + {Opt_ssd, "ssd"}, + {Opt_ssd_spread, "ssd_spread"}, + {Opt_nossd, "nossd"}, + {Opt_noacl, "noacl"}, + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, + {Opt_space_cache, "space_cache"}, + {Opt_clear_cache, "clear_cache"}, + {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, + {Opt_subvolrootid, "subvolrootid=%d"}, + {Opt_defrag, "autodefrag"}, + {Opt_inode_cache, "inode_cache"}, + {Opt_err, NULL}, +}; + +/* + * Regular mount options parser. Everything that is needed only when + * reading in a new superblock is parsed here. + */ +int btrfs_parse_options(struct btrfs_root *root, char *options) +{ + struct btrfs_fs_info *info = root->fs_info; + substring_t args[MAX_OPT_ARGS]; + char *p, *num, *orig; + int intarg; + int ret = 0; + char *compress_type; + bool compress_force = false; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + options = kstrdup(options, GFP_NOFS); + if (!options) + return -ENOMEM; + + orig = options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_degraded: + printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_set_opt(info->mount_opt, DEGRADED); + break; + case Opt_subvol: + case Opt_subvolid: + case Opt_subvolrootid: + case Opt_device: + /* + * These are parsed by btrfs_parse_early_options + * and can be happily ignored here. 
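+ *
+ * For example (the option values are illustrative), a mount with
+ * "-o subvol=snap,compress=lzo" has subvol= consumed by the early
+ * parser, while compress=lzo is picked up by the Opt_compress_type
+ * case below.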
+ */ + break; + case Opt_nodatasum: + printk(KERN_INFO "btrfs: setting nodatasum\n"); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_nodatacow: + printk(KERN_INFO "btrfs: setting nodatacow\n"); + btrfs_set_opt(info->mount_opt, NODATACOW); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_compress_force: + case Opt_compress_force_type: + compress_force = true; + case Opt_compress: + case Opt_compress_type: + if (token == Opt_compress || + token == Opt_compress_force || + strcmp(args[0].from, "zlib") == 0) { + compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; + } else if (strcmp(args[0].from, "lzo") == 0) { + compress_type = "lzo"; + info->compress_type = BTRFS_COMPRESS_LZO; + } else { + ret = -EINVAL; + goto out; + } + + btrfs_set_opt(info->mount_opt, COMPRESS); + if (compress_force) { + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + pr_info("btrfs: force %s compression\n", + compress_type); + } else + pr_info("btrfs: use %s compression\n", + compress_type); + break; + case Opt_ssd: + printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + break; + case Opt_ssd_spread: + printk(KERN_INFO "btrfs: use spread ssd " + "allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_nossd: + printk(KERN_INFO "btrfs: not using ssd allocation " + "scheme\n"); + btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_nobarrier: + printk(KERN_INFO "btrfs: turning off barriers\n"); + btrfs_set_opt(info->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->thread_pool_size = intarg; + printk(KERN_INFO "btrfs: thread pool %d\n", + info->thread_pool_size); + } + break; + case Opt_max_inline: + num = match_strdup(&args[0]); + if (num) { + info->max_inline = memparse(num, NULL); + kfree(num); + + if (info->max_inline) { + info->max_inline = max_t(u64, + info->max_inline, + root->sectorsize); + } + printk(KERN_INFO "btrfs: max_inline at %llu\n", + (unsigned long long)info->max_inline); + } + break; + case Opt_alloc_start: + num = match_strdup(&args[0]); + if (num) { + info->alloc_start = memparse(num, NULL); + kfree(num); + printk(KERN_INFO + "btrfs: allocations start at %llu\n", + (unsigned long long)info->alloc_start); + } + break; + case Opt_noacl: + root->fs_info->sb->s_flags &= ~MS_POSIXACL; + break; + case Opt_notreelog: + printk(KERN_INFO "btrfs: disabling tree log\n"); + btrfs_set_opt(info->mount_opt, NOTREELOG); + break; + case Opt_flushoncommit: + printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); + btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; + case Opt_space_cache: + printk(KERN_INFO "btrfs: enabling disk space caching\n"); + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + break; + case Opt_inode_cache: + printk(KERN_INFO "btrfs: enabling inode map caching\n"); + btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + break; + case Opt_clear_cache: + printk(KERN_INFO "btrfs: force clearing of disk cache\n"); + btrfs_set_opt(info->mount_opt, CLEAR_CACHE); + break; + case Opt_user_subvol_rm_allowed: 
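+			/* lets unprivileged owners delete subvolumes they own via the subvolume-destroy ioctl */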
+ btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + printk(KERN_INFO "btrfs: enabling auto defrag"); + btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); + break; + case Opt_err: + printk(KERN_INFO "btrfs: unrecognized mount option " + "'%s'\n", p); + ret = -EINVAL; + goto out; + default: + break; + } + } +out: + kfree(orig); + return ret; +} + +/* + * Parse mount options that are required early in the mount process. + * + * All other options will be parsed on much later in the mount process and + * only when we need to allocate a new super block. + */ +static int btrfs_parse_early_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, u64 *subvol_objectid, + u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *orig, *p; + int error = 0; + int intarg; + + if (!options) + goto out; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_subvol: + *subvol_name = match_strdup(&args[0]); + break; + case Opt_subvolid: + intarg = 0; + error = match_int(&args[0], &intarg); + if (!error) { + /* we want the original fs_tree */ + if (!intarg) + *subvol_objectid = + BTRFS_FS_TREE_OBJECTID; + else + *subvol_objectid = intarg; + } + break; + case Opt_subvolrootid: + intarg = 0; + error = match_int(&args[0], &intarg); + if (!error) { + /* we want the original fs_tree */ + if (!intarg) + *subvol_rootid = + BTRFS_FS_TREE_OBJECTID; + else + *subvol_rootid = intarg; + } + break; + case Opt_device: + error = btrfs_scan_one_device(match_strdup(&args[0]), + flags, holder, fs_devices); + if (error) + goto out_free_opts; + break; + default: + break; + } + } + + out_free_opts: + kfree(orig); + out: + /* + * If no subvolume name is specified we use the default one. Allocate + * a copy of the string "." here so that code later in the + * mount path doesn't care if it's the default volume or another one. + */ + if (!*subvol_name) { + *subvol_name = kstrdup(".", GFP_KERNEL); + if (!*subvol_name) + return -ENOMEM; + } + return error; +} + +static struct dentry *get_default_root(struct super_block *sb, + u64 subvol_objectid) +{ + struct btrfs_root *root = sb->s_fs_info; + struct btrfs_root *new_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + struct inode *inode; + struct dentry *dentry; + u64 dir_id; + int new = 0; + + /* + * We have a specific subvol we want to mount, just setup location and + * go look up the root. + */ + if (subvol_objectid) { + location.objectid = subvol_objectid; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + goto find_root; + } + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + path->leave_spinning = 1; + + /* + * Find the "default" dir item which points to the root item that we + * will mount by default if we haven't been given a specific subvolume + * to mount. + */ + dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + if (IS_ERR(di)) { + btrfs_free_path(path); + return ERR_CAST(di); + } + if (!di) { + /* + * Ok the default dir item isn't there. 
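+		 * (that item lives in the root tree and is what
+		 * "btrfs subvolume set-default" rewrites).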
This is weird since + * it's always been there, but don't freak out, just try and + * mount to root most subvolume. + */ + btrfs_free_path(path); + dir_id = BTRFS_FIRST_FREE_OBJECTID; + new_root = root->fs_info->fs_root; + goto setup_root; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + btrfs_free_path(path); + +find_root: + new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + if (IS_ERR(new_root)) + return ERR_CAST(new_root); + + if (btrfs_root_refs(&new_root->root_item) == 0) + return ERR_PTR(-ENOENT); + + dir_id = btrfs_root_dirid(&new_root->root_item); +setup_root: + location.objectid = dir_id; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + inode = btrfs_iget(sb, &location, new_root, &new); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + /* + * If we're just mounting the root most subvol put the inode and return + * a reference to the dentry. We will have already gotten a reference + * to the inode in btrfs_fill_super so we're good to go. + */ + if (!new && sb->s_root->d_inode == inode) { + iput(inode); + return dget(sb->s_root); + } + + if (new) { + const struct qstr name = { .name = "/", .len = 1 }; + + /* + * New inode, we need to make the dentry a sibling of s_root so + * everything gets cleaned up properly on unmount. + */ + dentry = d_alloc(sb->s_root, &name); + if (!dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + d_splice_alias(inode, dentry); + } else { + /* + * We found the inode in cache, just find a dentry for it and + * put the reference to the inode we just got. + */ + dentry = d_find_alias(inode); + iput(inode); + } + + return dentry; +} + +static int btrfs_fill_super(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + void *data, int silent) +{ + struct inode *inode; + struct dentry *root_dentry; + struct btrfs_root *tree_root; + struct btrfs_key key; + int err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_d_op = &btrfs_dentry_operations; + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + + tree_root = open_ctree(sb, fs_devices, (char *)data); + + if (IS_ERR(tree_root)) { + printk("btrfs: open_ctree failed\n"); + return PTR_ERR(tree_root); + } + sb->s_fs_info = tree_root; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail_close; + } + + root_dentry = d_alloc_root(inode); + if (!root_dentry) { + iput(inode); + err = -ENOMEM; + goto fail_close; + } + + sb->s_root = root_dentry; + + save_mount_options(sb, data); + cleancache_init_fs(sb); + return 0; + +fail_close: + close_ctree(tree_root); + return err; +} + +int btrfs_sync_fs(struct super_block *sb, int wait) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + trace_btrfs_sync_fs(wait); + + if (!wait) { + filemap_flush(root->fs_info->btree_inode->i_mapping); + return 0; + } + + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans, root); + return ret; +} + +static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + 
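/*
+	 * Everything printed here should stay in sync with the tokens[]
+	 * table above, so a line copied from /proc/mounts can be passed
+	 * straight back via -o; only options that differ from the
+	 * defaults are emitted.
+	 */
+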
struct btrfs_fs_info *info = root->fs_info; + char *compress_type; + + if (btrfs_test_opt(root, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(root, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(root, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(root, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_inline != 8192 * 1024) + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); + if (info->alloc_start != 0) + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + if (btrfs_test_opt(root, COMPRESS)) { + if (info->compress_type == BTRFS_COMPRESS_ZLIB) + compress_type = "zlib"; + else + compress_type = "lzo"; + if (btrfs_test_opt(root, FORCE_COMPRESS)) + seq_printf(seq, ",compress-force=%s", compress_type); + else + seq_printf(seq, ",compress=%s", compress_type); + } + if (btrfs_test_opt(root, NOSSD)) + seq_puts(seq, ",nossd"); + if (btrfs_test_opt(root, SSD_SPREAD)) + seq_puts(seq, ",ssd_spread"); + else if (btrfs_test_opt(root, SSD)) + seq_puts(seq, ",ssd"); + if (btrfs_test_opt(root, NOTREELOG)) + seq_puts(seq, ",notreelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flushoncommit"); + if (btrfs_test_opt(root, DISCARD)) + seq_puts(seq, ",discard"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + if (btrfs_test_opt(root, SPACE_CACHE)) + seq_puts(seq, ",space_cache"); + if (btrfs_test_opt(root, CLEAR_CACHE)) + seq_puts(seq, ",clear_cache"); + if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + seq_puts(seq, ",user_subvol_rm_allowed"); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) + seq_puts(seq, ",enospc_debug"); + if (btrfs_test_opt(root, AUTO_DEFRAG)) + seq_puts(seq, ",autodefrag"); + if (btrfs_test_opt(root, INODE_MAP_CACHE)) + seq_puts(seq, ",inode_cache"); + return 0; +} + +static int btrfs_test_super(struct super_block *s, void *data) +{ + struct btrfs_root *test_root = data; + struct btrfs_root *root = btrfs_sb(s); + + /* + * If this super block is going away, return false as it + * can't match as an existing super block. + */ + if (!atomic_read(&s->s_active)) + return 0; + return root->fs_info->fs_devices == test_root->fs_info->fs_devices; +} + +static int btrfs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + + return set_anon_super(s, data); +} + + +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on get_sb_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. 
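+ *
+ * Roughly: parse the early (device/subvol) options, scan and open the
+ * member devices, find or allocate a super block with sget(), fill a
+ * new one via btrfs_fill_super(), and finally resolve the dentry of
+ * the requested subvolume (e.g. -o subvol=home) to hand back as the
+ * mount root.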
+ */ +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *device_name, void *data) +{ + struct block_device *bdev = NULL; + struct super_block *s; + struct dentry *root; + struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_root *tree_root = NULL; + struct btrfs_fs_info *fs_info = NULL; + fmode_t mode = FMODE_READ; + char *subvol_name = NULL; + u64 subvol_objectid = 0; + u64 subvol_rootid = 0; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_early_options(data, mode, fs_type, + &subvol_name, &subvol_objectid, + &subvol_rootid, &fs_devices); + if (error) + return ERR_PTR(error); + + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); + if (error) + goto error_free_subvol_name; + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_free_subvol_name; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * it for searching for existing supers, so this lets us do that and + * then open_ctree will properly initialize everything later. + */ + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info || !tree_root) { + error = -ENOMEM; + goto error_close_devices; + } + fs_info->tree_root = tree_root; + fs_info->fs_devices = fs_devices; + tree_root->fs_info = fs_info; + + bdev = fs_devices->latest_bdev; + s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + error = -EBUSY; + goto error_close_devices; + } + + btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags | MS_NOSEC; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + error = btrfs_fill_super(s, fs_devices, data, + flags & MS_SILENT ? 
1 : 0); + if (error) { + deactivate_locked_super(s); + goto error_free_subvol_name; + } + + btrfs_sb(s)->fs_info->bdev_holder = fs_type; + s->s_flags |= MS_ACTIVE; + } + + /* if they gave us a subvolume name bind mount into that */ + if (strcmp(subvol_name, ".")) { + struct dentry *new_root; + + root = get_default_root(s, subvol_rootid); + if (IS_ERR(root)) { + error = PTR_ERR(root); + deactivate_locked_super(s); + goto error_free_subvol_name; + } + + mutex_lock(&root->d_inode->i_mutex); + new_root = lookup_one_len(subvol_name, root, + strlen(subvol_name)); + mutex_unlock(&root->d_inode->i_mutex); + + if (IS_ERR(new_root)) { + dput(root); + deactivate_locked_super(s); + error = PTR_ERR(new_root); + goto error_free_subvol_name; + } + if (!new_root->d_inode) { + dput(root); + dput(new_root); + deactivate_locked_super(s); + error = -ENXIO; + goto error_free_subvol_name; + } + dput(root); + root = new_root; + } else { + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + error = PTR_ERR(root); + deactivate_locked_super(s); + goto error_free_subvol_name; + } + } + + kfree(subvol_name); + return root; + +error_s: + error = PTR_ERR(s); +error_close_devices: + btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); +error_free_subvol_name: + kfree(subvol_name); + return ERR_PTR(error); +} + +static int btrfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = btrfs_parse_options(root, data); + if (ret) + return -EINVAL; + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + + ret = btrfs_commit_super(root); + WARN_ON(ret); + } else { + if (root->fs_info->fs_devices->rw_devices == 0) + return -EACCES; + + if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + return -EINVAL; + + ret = btrfs_cleanup_fs_roots(root->fs_info); + WARN_ON(ret); + + /* recover relocation */ + ret = btrfs_recover_relocation(root); + WARN_ON(ret); + + sb->s_flags &= ~MS_RDONLY; + } + + return 0; +} + +/* Used to sort the devices by max_avail(descending sort) */ +static int btrfs_cmp_device_free_bytes(const void *dev_info1, + const void *dev_info2) +{ + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} + +/* + * sort the devices by max_avail, in which max free extent size of each device + * is stored.(Descending Sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +/* + * The helper to calc the free space on the devices that can be used to store + * file data. 
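+ *
+ * The estimate is RAID-aware.  For example, with the RAID1 data
+ * profile (min_stripes == 2) every chunk consumes the same amount on
+ * two devices, so drives with 40GB and 10GB of stripe-aligned free
+ * space can only promise about 2 * 10GB of raw allocation, not 50GB.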
+ */ +static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 skip_space; + u64 type; + u64 avail_space; + u64 used_space; + u64 min_stripe_size; + int min_stripes = 1; + int i = 0, nr_devices; + int ret; + + nr_devices = fs_info->fs_devices->rw_devices; + BUG_ON(!nr_devices); + + devices_info = kmalloc(sizeof(*devices_info) * nr_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + /* calc min stripe number for data space alloction */ + type = btrfs_get_alloc_profile(root, 1); + if (type & BTRFS_BLOCK_GROUP_RAID0) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + min_stripes = 4; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_stripe_size = 2 * BTRFS_STRIPE_LEN; + else + min_stripe_size = BTRFS_STRIPE_LEN; + + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + if (!device->in_fs_metadata) + continue; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space *= BTRFS_STRIPE_LEN; + + /* + * In order to avoid overwritting the superblock on the drive, + * btrfs starts at an offset of at least 1MB when doing chunk + * allocation. + */ + skip_space = 1024 * 1024; + + /* user can set the offset in fs_info->alloc_start. */ + if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) + skip_space = max(fs_info->alloc_start, skip_space); + + /* + * btrfs can not use the free space in [0, skip_space - 1], + * we must subtract it from the total. In order to implement + * it, we account the used space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + return ret; + } + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + + /* + * we can use the free space in [0, skip_space - 1], subtract + * it from the total. 
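+		 *
+		 * For example, with the default 1MB reservation and 256KB of
+		 * that range already covered by dev extents, only the
+		 * remaining 768KB is taken off avail_space below.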
+ */ + if (avail_space && avail_space >= skip_space) + avail_space -= skip_space; + else + avail_space = 0; + + if (avail_space < min_stripe_size) + continue; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + + nr_devices = i; + + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= min_stripes) { + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * min_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - min_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; + return 0; +} + +static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct list_head *head = &root->fs_info->space_info; + struct btrfs_space_info *found; + u64 total_used = 0; + u64 total_free_data = 0; + int bits = dentry->d_sb->s_blocksize_bits; + __be32 *fsid = (__be32 *)root->fs_info->fsid; + int ret; + + /* holding chunk_muext to avoid allocating new chunks */ + mutex_lock(&root->fs_info->chunk_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + total_free_data += found->disk_total - found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); + } + + total_used += found->disk_used; + } + rcu_read_unlock(); + + buf->f_namelen = BTRFS_NAME_LEN; + buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; + buf->f_bfree = buf->f_blocks - (total_used >> bits); + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bavail = total_free_data; + ret = btrfs_calc_avail_data_space(root, &total_free_data); + if (ret) { + mutex_unlock(&root->fs_info->chunk_mutex); + return ret; + } + buf->f_bavail += total_free_data; + buf->f_bavail = buf->f_bavail >> bits; + mutex_unlock(&root->fs_info->chunk_mutex); + + /* We treat it as constant endianness (it doesn't matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .mount = btrfs_mount, + .kill_sb = kill_anon_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * used by btrfsctl to scan devices when no FS is mounted + */ +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_fs_devices *fs_devices; + int ret = -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); + + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + break; + } + + kfree(vol); + return ret; +} + +static int btrfs_freeze(struct super_block *sb) +{ + struct btrfs_root *root = 
btrfs_sb(sb); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + mutex_lock(&root->fs_info->cleaner_mutex); + return 0; +} + +static int btrfs_unfreeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_unlock(&root->fs_info->cleaner_mutex); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + return 0; +} + +static const struct super_operations btrfs_super_ops = { + .drop_inode = btrfs_drop_inode, + .evict_inode = btrfs_evict_inode, + .put_super = btrfs_put_super, + .sync_fs = btrfs_sync_fs, + .show_options = btrfs_show_options, + .write_inode = btrfs_write_inode, + .dirty_inode = btrfs_dirty_inode, + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_destroy_inode, + .statfs = btrfs_statfs, + .remount_fs = btrfs_remount, + .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, +}; + +static const struct file_operations btrfs_ctl_fops = { + .unlocked_ioctl = btrfs_control_ioctl, + .compat_ioctl = btrfs_control_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice btrfs_misc = { + .minor = BTRFS_MINOR, + .name = "btrfs-control", + .fops = &btrfs_ctl_fops +}; + +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + +static int btrfs_interface_init(void) +{ + return misc_register(&btrfs_misc); +} + +static void btrfs_interface_exit(void) +{ + if (misc_deregister(&btrfs_misc) < 0) + printk(KERN_INFO "misc_deregister failed for control device"); +} + +static int __init init_btrfs_fs(void) +{ + int err; + + err = btrfs_init_sysfs(); + if (err) + return err; + + err = btrfs_init_compress(); + if (err) + goto free_sysfs; + + err = btrfs_init_cachep(); + if (err) + goto free_compress; + + err = extent_io_init(); + if (err) + goto free_cachep; + + err = extent_map_init(); + if (err) + goto free_extent_io; + + err = btrfs_delayed_inode_init(); + if (err) + goto free_extent_map; + + err = btrfs_interface_init(); + if (err) + goto free_delayed_inode; + + err = register_filesystem(&btrfs_fs_type); + if (err) + goto unregister_ioctl; + + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); + return 0; + +unregister_ioctl: + btrfs_interface_exit(); +free_delayed_inode: + btrfs_delayed_inode_exit(); +free_extent_map: + extent_map_exit(); +free_extent_io: + extent_io_exit(); +free_cachep: + btrfs_destroy_cachep(); +free_compress: + btrfs_exit_compress(); +free_sysfs: + btrfs_exit_sysfs(); + return err; +} + +static void __exit exit_btrfs_fs(void) +{ + btrfs_destroy_cachep(); + btrfs_delayed_inode_exit(); + extent_map_exit(); + extent_io_exit(); + btrfs_interface_exit(); + unregister_filesystem(&btrfs_fs_type); + btrfs_exit_sysfs(); + btrfs_cleanup_fs_uuids(); + btrfs_exit_compress(); +} + +module_init(init_btrfs_fs) +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c new file mode 100644 index 00000000..daac9ae6 --- /dev/null +++ b/fs/btrfs/sysfs.c @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +int btrfs_init_sysfs(void) +{ + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); + if (!btrfs_kset) + return -ENOMEM; + return 0; +} + +void btrfs_exit_sysfs(void) +{ + kset_unregister(btrfs_kset); +} + diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 00000000..51dcec86 --- /dev/null +++ b/fs/btrfs/transaction.c @@ -0,0 +1,1463 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "locking.h" +#include "tree-log.h" +#include "inode-map.h" + +#define BTRFS_ROOT_TRANS_TAG 0 + +static noinline void put_transaction(struct btrfs_transaction *transaction) +{ + WARN_ON(atomic_read(&transaction->use_count) == 0); + if (atomic_dec_and_test(&transaction->use_count)) { + BUG_ON(!list_empty(&transaction->list)); + memset(transaction, 0, sizeof(*transaction)); + kmem_cache_free(btrfs_transaction_cachep, transaction); + } +} + +static noinline void switch_commit_root(struct btrfs_root *root) +{ + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); +} + +/* + * either allocate a new transaction or hop into the existing one + */ +static noinline int join_transaction(struct btrfs_root *root, int nofail) +{ + struct btrfs_transaction *cur_trans; + + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->trans_no_join) { + if (!nofail) { + spin_unlock(&root->fs_info->trans_lock); + return -EBUSY; + } + } + + cur_trans = root->fs_info->running_transaction; + if (cur_trans) { + atomic_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; + } + spin_unlock(&root->fs_info->trans_lock); + + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + if (!cur_trans) + return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->running_transaction) { + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + cur_trans = root->fs_info->running_transaction; + atomic_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; + } + atomic_set(&cur_trans->num_writers, 1); + cur_trans->num_joined = 0; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + 
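/*
+	 * We won the allocation race above; initialise the fresh
+	 * transaction (in_commit and blocked both start at 0, see the
+	 * state sequence comment above btrfs_commit_transaction()).
+	 */
+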
cur_trans->in_commit = 0; + cur_trans->blocked = 0; + /* + * One for this trans handle, one so it will live on until we + * commit the transaction. + */ + atomic_set(&cur_trans->use_count, 2); + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.root = RB_ROOT; + cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + spin_lock_init(&cur_trans->commit_lock); + spin_lock_init(&cur_trans->delayed_refs.lock); + + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping); + root->fs_info->generation++; + cur_trans->transid = root->fs_info->generation; + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->trans_lock); + + return 0; +} + +/* + * this does all the record keeping required to make sure that a reference + * counted root is properly recorded in a given transaction. This is required + * to make sure the old root from before we joined the transaction is deleted + * when the transaction commits + */ +static int record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (root->ref_cows && root->last_trans < trans->transid) { + WARN_ON(root == root->fs_info->extent_root); + WARN_ON(root->commit_root != root->node); + + /* + * see below for in_trans_setup usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + root->in_trans_setup = 1; + + /* make sure readers find in_trans_setup before + * they find our root->last_trans update + */ + smp_wmb(); + + spin_lock(&root->fs_info->fs_roots_radix_lock); + if (root->last_trans == trans->transid) { + spin_unlock(&root->fs_info->fs_roots_radix_lock); + return 0; + } + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&root->fs_info->fs_roots_radix_lock); + root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root->in_trans_setup. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. 
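+		 * (That lockless path pairs the smp_rmb() in
+		 * btrfs_record_root_in_trans with the smp_wmb() calls here.)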
smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ + btrfs_init_reloc_root(trans, root); + smp_wmb(); + root->in_trans_setup = 0; + } + return 0; +} + + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; + + /* + * see record_root_in_trans for comments about in_trans_setup usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !root->in_trans_setup) + return 0; + + mutex_lock(&root->fs_info->reloc_mutex); + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->reloc_mutex); + + return 0; +} + +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +static void wait_current_trans(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + + spin_lock(&root->fs_info->trans_lock); + cur_trans = root->fs_info->running_transaction; + if (cur_trans && cur_trans->blocked) { + DEFINE_WAIT(wait); + atomic_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (!cur_trans->blocked) + break; + schedule(); + } + finish_wait(&root->fs_info->transaction_wait, &wait); + put_transaction(cur_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); + } +} + +enum btrfs_trans_type { + TRANS_START, + TRANS_JOIN, + TRANS_USERSPACE, + TRANS_JOIN_NOLOCK, +}; + +static int may_wait_transaction(struct btrfs_root *root, int type) +{ + if (root->fs_info->log_root_recovering) + return 0; + + if (type == TRANS_USERSPACE) + return 1; + + if (type == TRANS_START && + !atomic_read(&root->fs_info->open_ioctl_trans)) + return 1; + + return 0; +} + +static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + u64 num_items, int type) +{ + struct btrfs_trans_handle *h; + struct btrfs_transaction *cur_trans; + int retries = 0; + int ret; + + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + return ERR_PTR(-EROFS); + + if (current->journal_info) { + WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); + h = current->journal_info; + h->use_count++; + h->orig_rsv = h->block_rsv; + h->block_rsv = NULL; + goto got_it; + } +again: + h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + if (!h) + return ERR_PTR(-ENOMEM); + + if (may_wait_transaction(root, type)) + wait_current_trans(root); + + do { + ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); + if (ret == -EBUSY) + wait_current_trans(root); + } while (ret == -EBUSY); + + if (ret < 0) { + kmem_cache_free(btrfs_trans_handle_cachep, h); + return ERR_PTR(ret); + } + + cur_trans = root->fs_info->running_transaction; + + h->transid = cur_trans->transid; + h->transaction = cur_trans; + h->blocks_used = 0; + h->bytes_reserved = 0; + h->delayed_ref_updates = 0; + h->use_count = 1; + h->block_rsv = NULL; + h->orig_rsv = NULL; + + smp_mb(); + if (cur_trans->blocked && may_wait_transaction(root, type)) { + btrfs_commit_transaction(h, root); + goto again; + } + + if (num_items > 0) { + ret = btrfs_trans_reserve_metadata(h, root, num_items); + if (ret == -EAGAIN && !retries) { + retries++; + btrfs_commit_transaction(h, root); + goto again; + } else if (ret == -EAGAIN) { + /* + * We have already retried and got EAGAIN, so really we + * don't have space, so set ret to -ENOSPC. 
+ */ + ret = -ENOSPC; + } + + if (ret < 0) { + btrfs_end_transaction(h, root); + return ERR_PTR(ret); + } + } + +got_it: + btrfs_record_root_in_trans(h, root); + + if (!current->journal_info && type != TRANS_USERSPACE) + current->journal_info = h; + return h; +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_items) +{ + return start_transaction(root, num_items, TRANS_START); +} +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN); +} + +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN_NOLOCK); +} + +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_USERSPACE); +} + +/* wait for a transaction commit to be fully complete */ +static noinline int wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + DEFINE_WAIT(wait); + while (!commit->commit_done) { + prepare_to_wait(&commit->commit_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (commit->commit_done) + break; + schedule(); + } + finish_wait(&commit->commit_wait, &wait); + return 0; +} + +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) +{ + struct btrfs_transaction *cur_trans = NULL, *t; + int ret; + + ret = 0; + if (transid) { + if (transid <= root->fs_info->last_trans_committed) + goto out; + + /* find specified transaction */ + spin_lock(&root->fs_info->trans_lock); + list_for_each_entry(t, &root->fs_info->trans_list, list) { + if (t->transid == transid) { + cur_trans = t; + atomic_inc(&cur_trans->use_count); + break; + } + if (t->transid > transid) + break; + } + spin_unlock(&root->fs_info->trans_lock); + ret = -EINVAL; + if (!cur_trans) + goto out; /* bad transid */ + } else { + /* find newest transaction that is committing | committed */ + spin_lock(&root->fs_info->trans_lock); + list_for_each_entry_reverse(t, &root->fs_info->trans_list, + list) { + if (t->in_commit) { + if (t->commit_done) + break; + cur_trans = t; + atomic_inc(&cur_trans->use_count); + break; + } + } + spin_unlock(&root->fs_info->trans_lock); + if (!cur_trans) + goto out; /* nothing committing|committed */ + } + + wait_for_commit(root, cur_trans); + + put_transaction(cur_trans); + ret = 0; +out: + return ret; +} + +void btrfs_throttle(struct btrfs_root *root) +{ + if (!atomic_read(&root->fs_info->open_ioctl_trans)) + wait_current_trans(root); +} + +static int should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + ret = btrfs_block_rsv_check(trans, root, + &root->fs_info->global_block_rsv, 0, 5); + return ret ? 
1 : 0; +} + +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + int updates; + + smp_mb(); + if (cur_trans->blocked || cur_trans->delayed_refs.flushing) + return 1; + + updates = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (updates) + btrfs_run_delayed_refs(trans, root, updates); + + return should_end_transaction(trans, root); +} + +static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int throttle, int lock) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_fs_info *info = root->fs_info; + int count = 0; + + if (--trans->use_count) { + trans->block_rsv = trans->orig_rsv; + return 0; + } + + while (count < 4) { + unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (cur && + trans->transaction->delayed_refs.num_heads_ready > 64) { + trans->delayed_ref_updates = 0; + + /* + * do a full flush if the transaction is trying + * to close + */ + if (trans->transaction->delayed_refs.flushing) + cur = 0; + btrfs_run_delayed_refs(trans, root, cur); + } else { + break; + } + count++; + } + + btrfs_trans_release_metadata(trans, root); + + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && + should_end_transaction(trans, root)) { + trans->transaction->blocked = 1; + smp_wmb(); + } + + if (lock && cur_trans->blocked && !cur_trans->in_commit) { + if (throttle) + return btrfs_commit_transaction(trans, root); + else + wake_up_process(info->transaction_kthread); + } + + WARN_ON(cur_trans != info->running_transaction); + WARN_ON(atomic_read(&cur_trans->num_writers) < 1); + atomic_dec(&cur_trans->num_writers); + + smp_mb(); + if (waitqueue_active(&cur_trans->writer_wait)) + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + + if (current->journal_info == trans) + current->journal_info = NULL; + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (throttle) + btrfs_run_delayed_iputs(root); + + return 0; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + ret = __btrfs_end_transaction(trans, root, 0, 1); + if (ret) + return ret; + return 0; +} + +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + ret = __btrfs_end_transaction(trans, root, 1, 1); + if (ret) + return ret; + return 0; +} + +int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + ret = __btrfs_end_transaction(trans, root, 0, 0); + if (ret) + return ret; + return 0; +} + +int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 1, 1); +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. 
This is used to make sure all of + * those extents are sent to disk but does not wait on them + */ +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + while (start <= end) { + cond_resched(); + + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + + btree_lock_page_hook(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (PageWriteback(page)) { + if (PageDirty(page)) + wait_on_page_writeback(page); + else { + unlock_page(page); + page_cache_release(page); + continue; + } + } + err = write_one_page(page, 0); + if (err) + werr = err; + page_cache_release(page); + } + } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + if (PageDirty(page)) { + btree_lock_page_hook(page); + wait_on_page_writeback(page); + err = write_one_page(page, 0); + if (err) + werr = err; + } + wait_on_page_writeback(page); + page_cache_release(page); + cond_resched(); + } + } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark) +{ + int ret; + int ret2; + + ret = btrfs_write_marked_extents(root, dirty_pages, mark); + ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); + return ret || ret2; +} + +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans || !trans->transaction) { + struct inode *btree_inode; + btree_inode = root->fs_info->btree_inode; + return filemap_write_and_wait(btree_inode->i_mapping); + } + return btrfs_write_and_wait_marked_extents(root, + &trans->transaction->dirty_pages, + EXTENT_DIRTY); +} + +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. 
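+ * (The loop below keeps rewriting the pointer until neither the root
+ * bytenr nor the used byte count changes across an iteration, i.e.
+ * writing the pointer no longer perturbed the tree being pointed at.)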
+ * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. + */ +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_root_bytenr; + u64 old_root_used; + struct btrfs_root *tree_root = root->fs_info->tree_root; + + old_root_used = btrfs_root_used(&root->root_item); + btrfs_write_dirty_block_groups(trans, root); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start && + old_root_used == btrfs_root_used(&root->root_item)) + break; + + btrfs_set_root_node(&root->root_item, root->node); + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + BUG_ON(ret); + + old_root_used = btrfs_root_used(&root->root_item); + ret = btrfs_write_dirty_block_groups(trans, root); + BUG_ON(ret); + } + + if (root != root->fs_info->extent_root) + switch_commit_root(root); + + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *next; + struct extent_buffer *eb; + int ret; + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + + eb = btrfs_lock_root_node(fs_info->tree_root); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + + while (!list_empty(&fs_info->dirty_cowonly_roots)) { + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + + update_cowonly_root(trans, root); + } + + down_write(&fs_info->extent_commit_sem); + switch_commit_root(fs_info->extent_root); + up_write(&fs_info->extent_commit_sem); + + return 0; +} + +/* + * dead roots are old snapshots that need to be deleted. 
This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ +int btrfs_add_dead_root(struct btrfs_root *root) +{ + spin_lock(&root->fs_info->trans_lock); + list_add(&root->root_list, &root->fs_info->dead_roots); + spin_unlock(&root->fs_info->trans_lock); + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *gang[8]; + struct btrfs_fs_info *fs_info = root->fs_info; + int i; + int ret; + int err = 0; + + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + root = gang[i]; + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); + + btrfs_free_log(trans, root); + btrfs_update_reloc_root(trans, root); + btrfs_orphan_commit_root(trans, root); + + btrfs_save_ino_cache(root, trans); + + if (root->commit_root != root->node) { + mutex_lock(&root->fs_commit_mutex); + switch_commit_root(root); + btrfs_unpin_free_ino(root); + mutex_unlock(&root->fs_commit_mutex); + + btrfs_set_root_node(&root->root_item, + root->node); + } + + err = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + spin_lock(&fs_info->fs_roots_radix_lock); + if (err) + break; + } + } + spin_unlock(&fs_info->fs_roots_radix_lock); + return err; +} + +/* + * defrag a given btree. If cacheonly == 1, this won't read from the disk, + * otherwise every leaf in the btree is read and defragged. + */ +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_trans_handle *trans; + int ret; + unsigned long nr; + + if (xchg(&root->defrag_running, 1)) + return 0; + + while (1) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_defrag_leaves(trans, root, cacheonly); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(info->tree_root, nr); + cond_resched(); + + if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) + break; + } + root->defrag_running = 0; + return ret; +} + +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. 
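+ * (after the delayed items have run and while the reloc mutex is
+ * held, see btrfs_commit_transaction below).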
This does the actual creation + */ +static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_key key; + struct btrfs_root_item *new_root_item; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root = pending->root; + struct btrfs_root *parent_root; + struct inode *parent_inode; + struct dentry *parent; + struct dentry *dentry; + struct extent_buffer *tmp; + struct extent_buffer *old; + int ret; + u64 to_reserve = 0; + u64 index = 0; + u64 objectid; + u64 root_flags; + + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); + if (!new_root_item) { + pending->error = -ENOMEM; + goto fail; + } + + ret = btrfs_find_free_objectid(tree_root, &objectid); + if (ret) { + pending->error = ret; + goto fail; + } + + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); + + if (to_reserve > 0) { + ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, + to_reserve); + if (ret) { + pending->error = ret; + goto fail; + } + } + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; + + trans->block_rsv = &pending->block_rsv; + + dentry = pending->dentry; + parent = dget_parent(dentry); + parent_inode = parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; + record_root_in_trans(trans, parent_root); + + /* + * insert the directory item + */ + ret = btrfs_set_inode_index(parent_inode, &index); + BUG_ON(ret); + ret = btrfs_insert_dir_item(trans, parent_root, + dentry->d_name.name, dentry->d_name.len, + parent_inode, &key, + BTRFS_FT_DIR, index); + BUG_ON(ret); + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + dentry->d_name.len * 2); + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + + record_root_in_trans(trans, root); + btrfs_set_root_last_snapshot(&root->root_item, trans->transid); + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + btrfs_check_and_init_root_item(new_root_item); + + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); + + old = btrfs_lock_root_node(root); + btrfs_cow_block(trans, root, old, NULL, 0, &old); + btrfs_set_lock_blocking(old); + + btrfs_copy_root(trans, root, old, &tmp, objectid); + btrfs_tree_unlock(old); + free_extent_buffer(old); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; + ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); + btrfs_tree_unlock(tmp); + free_extent_buffer(tmp); + BUG_ON(ret); + + /* + * insert root back/forward references + */ + ret = btrfs_add_root_ref(trans, tree_root, objectid, + parent_root->root_key.objectid, + btrfs_ino(parent_inode), index, + dentry->d_name.name, dentry->d_name.len); + BUG_ON(ret); + dput(parent); + + key.offset = (u64)-1; + pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(pending->snap)); + + btrfs_reloc_post_snapshot(trans, pending); + btrfs_orphan_post_snapshot(trans, pending); +fail: + kfree(new_root_item); + btrfs_block_rsv_release(root, 
&pending->block_rsv, (u64)-1); + return 0; +} + +/* + * create all the snapshots we've scheduled for creation + */ +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret; + + list_for_each_entry(pending, head, list) { + ret = create_pending_snapshot(trans, fs_info, pending); + BUG_ON(ret); + } + return 0; +} + +static void update_super_roots(struct btrfs_root *root) +{ + struct btrfs_root_item *root_item; + struct btrfs_super_block *super; + + super = &root->fs_info->super_copy; + + root_item = &root->fs_info->chunk_root->root_item; + super->chunk_root = root_item->bytenr; + super->chunk_root_generation = root_item->generation; + super->chunk_root_level = root_item->level; + + root_item = &root->fs_info->tree_root->root_item; + super->root = root_item->bytenr; + super->generation = root_item->generation; + super->root_level = root_item->level; + if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + super->cache_generation = root_item->generation; +} + +int btrfs_transaction_in_commit(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->trans_lock); + if (info->running_transaction) + ret = info->running_transaction->in_commit; + spin_unlock(&info->trans_lock); + return ret; +} + +int btrfs_transaction_blocked(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->trans_lock); + if (info->running_transaction) + ret = info->running_transaction->blocked; + spin_unlock(&info->trans_lock); + return ret; +} + +/* + * wait for the current transaction commit to start and block subsequent + * transaction joins + */ +static void wait_current_trans_commit_start(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + DEFINE_WAIT(wait); + + if (trans->in_commit) + return; + + while (1) { + prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (trans->in_commit) { + finish_wait(&root->fs_info->transaction_blocked_wait, + &wait); + break; + } + schedule(); + finish_wait(&root->fs_info->transaction_blocked_wait, &wait); + } +} + +/* + * wait for the current transaction to start and then become unblocked. + * caller holds ref. + */ +static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + DEFINE_WAIT(wait); + + if (trans->commit_done || (trans->in_commit && !trans->blocked)) + return; + + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (trans->commit_done || + (trans->in_commit && !trans->blocked)) { + finish_wait(&root->fs_info->transaction_wait, + &wait); + break; + } + schedule(); + finish_wait(&root->fs_info->transaction_wait, + &wait); + } +} + +/* + * commit transactions asynchronously. once btrfs_commit_transaction_async + * returns, any subsequent transaction will not be allowed to join. 
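+ *
+ * The caller's handle is ended here and the real commit is pushed to
+ * a workqueue via do_async_commit(); depending on wait_for_unblock we
+ * return once that commit has started, or once it has started and
+ * unblocked (the async snapshot ioctl is the main user).
+ *
+ * Sketch of the expected calling pattern (illustrative, not a real
+ * in-tree caller):
+ *
+ *	trans = btrfs_start_transaction(root, 0);
+ *	...do the work that must land in this transaction...
+ *	ret = btrfs_commit_transaction_async(trans, root, 1);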
+ */ +struct btrfs_async_commit { + struct btrfs_trans_handle *newtrans; + struct btrfs_root *root; + struct delayed_work work; +}; + +static void do_async_commit(struct work_struct *work) +{ + struct btrfs_async_commit *ac = + container_of(work, struct btrfs_async_commit, work.work); + + btrfs_commit_transaction(ac->newtrans, ac->root); + kfree(ac); +} + +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock) +{ + struct btrfs_async_commit *ac; + struct btrfs_transaction *cur_trans; + + ac = kmalloc(sizeof(*ac), GFP_NOFS); + if (!ac) + return -ENOMEM; + + INIT_DELAYED_WORK(&ac->work, do_async_commit); + ac->root = root; + ac->newtrans = btrfs_join_transaction(root); + if (IS_ERR(ac->newtrans)) { + int err = PTR_ERR(ac->newtrans); + kfree(ac); + return err; + } + + /* take transaction reference */ + cur_trans = trans->transaction; + atomic_inc(&cur_trans->use_count); + + btrfs_end_transaction(trans, root); + schedule_delayed_work(&ac->work, 0); + + /* wait for transaction to start and unblock */ + if (wait_for_unblock) + wait_current_trans_commit_start_and_unblock(root, cur_trans); + else + wait_current_trans_commit_start(root, cur_trans); + + if (current->journal_info == trans) + current->journal_info = NULL; + + put_transaction(cur_trans); + return 0; +} + +/* + * btrfs_transaction state sequence: + * in_commit = 0, blocked = 0 (initial) + * in_commit = 1, blocked = 1 + * blocked = 0 + * commit_done = 1 + */ +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + unsigned long joined = 0; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; + DEFINE_WAIT(wait); + int ret; + int should_grow = 0; + unsigned long now = get_seconds(); + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); + + btrfs_run_ordered_operations(root, 0); + + /* make a pass through all the delayed refs we have so far + * any runnings procs may add more while we are here + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); + + btrfs_trans_release_metadata(trans, root); + + cur_trans = trans->transaction; + /* + * set the flushing flag so procs in this transaction have to + * start sending their work down. 
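+ * Writers that poll btrfs_should_end_transaction() will see the flag and + * wrap up their handles instead of queueing more work; we also run the + * delayed refs gathered so far once more right below.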
+ */ + cur_trans->delayed_refs.flushing = 1; + + ret = btrfs_run_delayed_refs(trans, root, 0); + BUG_ON(ret); + + spin_lock(&cur_trans->commit_lock); + if (cur_trans->in_commit) { + spin_unlock(&cur_trans->commit_lock); + atomic_inc(&cur_trans->use_count); + btrfs_end_transaction(trans, root); + + ret = wait_for_commit(root, cur_trans); + BUG_ON(ret); + + put_transaction(cur_trans); + + return 0; + } + + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + spin_unlock(&cur_trans->commit_lock); + wake_up(&root->fs_info->transaction_blocked_wait); + + spin_lock(&root->fs_info->trans_lock); + if (cur_trans->list.prev != &root->fs_info->trans_list) { + prev_trans = list_entry(cur_trans->list.prev, + struct btrfs_transaction, list); + if (!prev_trans->commit_done) { + atomic_inc(&prev_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + wait_for_commit(root, prev_trans); + + put_transaction(prev_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); + } + } else { + spin_unlock(&root->fs_info->trans_lock); + } + + if (now < cur_trans->start_time || now - cur_trans->start_time < 1) + should_grow = 1; + + do { + int snap_pending = 0; + + joined = cur_trans->num_joined; + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + + WARN_ON(cur_trans != trans->transaction); + + if (flush_on_commit || snap_pending) { + btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); + BUG_ON(ret); + } + + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + + /* + * renames don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and know for sure that nothing new will be added + * to the list + */ + btrfs_run_ordered_operations(root, 1); + + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (atomic_read(&cur_trans->num_writers) > 1) + schedule_timeout(MAX_SCHEDULE_TIMEOUT); + else if (should_grow) + schedule_timeout(1); + + finish_wait(&cur_trans->writer_wait, &wait); + } while (atomic_read(&cur_trans->num_writers) > 1 || + (should_grow && cur_trans->num_joined != joined)); + + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * no_join so make sure to wait for num_writers to == 1 again. + */ + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&root->fs_info->reloc_mutex); + + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + + ret = create_pending_snapshots(trans, root->fs_info); + BUG_ON(ret); + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(root); + + WARN_ON(cur_trans != trans->transaction); + + btrfs_scrub_pause(root); + /* btrfs_commit_tree_roots is responsible for getting the + * various roots consistent with each other. Every pointer + * in the tree of tree roots has to point to the most up to date + * root for every subvolume and other tree.
So, we have to keep + * the tree logging code from jumping in and changing any + * of the trees. + * + * At this point in the commit, there can't be any tree-log + * writers, but a little lower down we drop the trans mutex + * and let new people in. By holding the tree_log_mutex + * from now until after the super is written, we avoid races + * with the tree-log code. + */ + mutex_lock(&root->fs_info->tree_log_mutex); + + ret = commit_fs_roots(trans, root); + BUG_ON(ret); + + /* commit_fs_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, root->fs_info); + + ret = commit_cowonly_roots(trans, root); + BUG_ON(ret); + + btrfs_prepare_extent_commit(trans, root); + + cur_trans = root->fs_info->running_transaction; + + btrfs_set_root_node(&root->fs_info->tree_root->root_item, + root->fs_info->tree_root->node); + switch_commit_root(root->fs_info->tree_root); + + btrfs_set_root_node(&root->fs_info->chunk_root->root_item, + root->fs_info->chunk_root->node); + switch_commit_root(root->fs_info->chunk_root); + + update_super_roots(root); + + if (!root->fs_info->log_root_recovering) { + btrfs_set_super_log_root(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + } + + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + + trans->transaction->blocked = 0; + spin_lock(&root->fs_info->trans_lock); + root->fs_info->running_transaction = NULL; + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); + + wake_up(&root->fs_info->transaction_wait); + + ret = btrfs_write_and_wait_transaction(trans, root); + BUG_ON(ret); + write_ctree_super(trans, root, 0); + + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + + btrfs_finish_extent_commit(trans, root); + + cur_trans->commit_done = 1; + + root->fs_info->last_trans_committed = cur_trans->transid; + + wake_up(&cur_trans->commit_wait); + + spin_lock(&root->fs_info->trans_lock); + list_del_init(&cur_trans->list); + spin_unlock(&root->fs_info->trans_lock); + + put_transaction(cur_trans); + put_transaction(cur_trans); + + trace_btrfs_transaction_commit(root); + + btrfs_scrub_continue(root); + + if (current->journal_info == trans) + current->journal_info = NULL; + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (current != root->fs_info->transaction_kthread) + btrfs_run_delayed_iputs(root); + + return ret; +} + +/* + * interface function to delete all the snapshots we have scheduled for deletion + */ +int btrfs_clean_old_snapshots(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + + spin_lock(&fs_info->trans_lock); + list_splice_init(&fs_info->dead_roots, &list); + spin_unlock(&fs_info->trans_lock); + + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); + list_del(&root->root_list); + + btrfs_kill_all_delayed_nodes(root); + + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + btrfs_drop_snapshot(root, NULL, 0); + else + btrfs_drop_snapshot(root, NULL, 1); + } + return 0; +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 00000000..02564e62 --- /dev/null +++ b/fs/btrfs/transaction.h @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TRANSACTION__ +#define __BTRFS_TRANSACTION__ +#include "btrfs_inode.h" +#include "delayed-ref.h" + +struct btrfs_transaction { + u64 transid; + /* + * total writers in this transaction, it must be zero before the + * transaction can end + */ + atomic_t num_writers; + atomic_t use_count; + + unsigned long num_joined; + + spinlock_t commit_lock; + int in_commit; + int commit_done; + int blocked; + struct list_head list; + struct extent_io_tree dirty_pages; + unsigned long start_time; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; + struct list_head pending_snapshots; + struct btrfs_delayed_ref_root delayed_refs; +}; + +struct btrfs_trans_handle { + u64 transid; + u64 bytes_reserved; + unsigned long use_count; + unsigned long blocks_reserved; + unsigned long blocks_used; + unsigned long delayed_ref_updates; + struct btrfs_transaction *transaction; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *orig_rsv; +}; + +struct btrfs_pending_snapshot { + struct dentry *dentry; + struct btrfs_root *root; + struct btrfs_root *snap; + /* block reservation for the operation */ + struct btrfs_block_rsv block_rsv; + /* extra metadata reservation for relocation */ + int error; + bool readonly; + struct list_head list; +}; + +static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_items); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +int btrfs_add_dead_root(struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root);
+void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, int mark); +int btrfs_transaction_blocked(struct btrfs_fs_info *info); +int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +#endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c new file mode 100644 index 00000000..3b580ee8 --- /dev/null +++ b/fs/btrfs/tree-defrag.c @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" + +/* defrag all the leaves in a given btree. If cache_only == 1, don't read + * things from disk, otherwise read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int is_extent = 0; + int next_key_ret = 0; + u64 last_ret = 0; + u64 min_trans = 0; + + if (cache_only) + goto out; + + if (root->fs_info->extent_root == root) { + /* + * there's recursion here right now in the tree locking, + * we can't defrag the extent root without deadlock + */ + goto out; + } + + if (root->ref_cows == 0 && !is_extent) + goto out; + + if (btrfs_test_opt(root, SSD)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(root_node); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + if (cache_only) + min_trans = root->defrag_trans_start; + + ret = btrfs_search_forward(root, &key, NULL, path, + cache_only, min_trans); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(path); + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + path->slots[1] =
btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + min_trans); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + cache_only, &last_ret, + &root->defrag_progress); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } +out: + if (path) + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) { + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + root->defrag_trans_start = trans->transid; + } + return ret; +} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 00000000..7fa128d3 --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,3339 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "compat.h" +#include "tree-log.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * directory trouble cases + * + * 1) on rename or unlink, if the inode being unlinked isn't in the fsync + * log, we must force a full commit before doing an fsync of the directory + * where the unlink was done. + * ---> record transid of last unlink/rename per directory + * + * mkdir foo/some_dir + * normal commit + * rename foo/some_dir foo2/some_dir + * mkdir foo/some_dir + * fsync foo/some_dir/some_file + * + * The fsync above will unlink the original some_dir without recording + * it in its new location (foo2). After a crash, some_dir will be gone + * unless the fsync of some_file forces a full commit + * + * 2) we must log any new names for any file or dir that is in the fsync + * log. ---> check inode while renaming/linking. + * + * 2a) we must log any new names for any file or dir during rename + * when the directory they are being removed from was logged. + * ---> check inode and old parent dir during rename + * + * 2a is actually the more important variant.
With the extra logging + * a crash might unlink the old name without recreating the new one + * + * 3) after a crash, we must go through any directories with a link count + * of zero and redo the rm -rf + * + * mkdir f1/foo + * normal commit + * rm -rf f1/foo + * fsync(f1) + * + * The directory f1 was fully removed from the FS, but fsync was never + * called on f1, only its parent dir. After a crash the rm -rf must + * be replayed. This must be able to recurse down the entire + * directory tree. The inode link count fixup code takes care of the + * ugly details. + */ + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_ALL 2 + +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +static int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree an 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times, once to pin down all the extents it is + * using in ram and once, once to create all the inodes logged in the tree + * and once to do all the other items. 
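+ * (these three passes correspond to the LOG_WALK_PIN_ONLY, + * LOG_WALK_REPLAY_INODES and LOG_WALK_REPLAY_ALL stages defined above)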
+ */ + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + int err = 0; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + if (!root->log_start_pid) { + root->log_start_pid = current->pid; + root->log_multiple_pids = false; + } else if (root->log_start_pid != current->pid) { + root->log_multiple_pids = true; + } + + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } + root->log_multiple_pids = false; + root->log_start_pid = current->pid; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); + if (ret) + err = ret; + } + if (err == 0 && !root->log_root) { + ret = btrfs_add_log_tree(trans, root); + if (ret) + err = ret; + } + mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return err; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + smp_mb(); + if (!root->log_root) + return -ENOENT; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + ret = 0; + atomic_inc(&root->log_writers); + } + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * This either makes the current running log transaction wait + * until you call btrfs_end_log_trans() or it makes any future + * log transactions wait until you call btrfs_end_log_trans() + */ +int btrfs_pin_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + mutex_lock(&root->log_mutex); + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +int btrfs_end_log_trans(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->log_writers)) { + smp_mb(); + if (waitqueue_active(&root->log_writer_wait)) + wake_up(&root->log_writer_wait); + } + return 0; +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* should we write out the extent buffer? This is used + * while flushing the log tree to disk during a sync + */ + int write; + + /* should we wait for the extent buffer io to finish? Also used + * while flushing the log tree to disk for a sync + */ + int wait; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. 
Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + if (wc->pin) + btrfs_pin_extent(log->fs_info->extent_root, + eb->start, eb->len, 0); + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) + btrfs_write_tree_block(eb); + if (wc->wait) + btrfs_wait_tree_block_writeback(eb); + } + return 0; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + int overwrite_root = 0; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + + item_size = btrfs_item_size_nr(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* look for the key in the destination tree */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + if (!dst_copy || !src_copy) { + btrfs_release_path(path); + kfree(dst_copy); + kfree(src_copy); + return -ENOMEM; + } + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(path); + return 0; + } + + } +insert: + btrfs_release_path(path); + /* try to insert the key into the destination tree */ + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST) { + u32 found_size; + found_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (found_size > item_size) { + btrfs_truncate_item(trans, root, path, item_size, 1); + } else if (found_size < item_size) { + ret = btrfs_extend_item(trans, root, path, + item_size - found_size); + } + } else if (ret) { + return ret; + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't overwrite an 
existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. + * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) + goto no_copy; + + if (overwrite_root && + S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, + u64 objectid) +{ + struct btrfs_key key; + struct inode *inode; + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) { + inode = NULL; + } else if (is_bad_inode(inode)) { + iput(inode); + inode = NULL; + } + return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'. path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet. So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. 
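+ * + * Any checksums stored in the log tree for the extent are copied into the + * fs checksum tree as part of the replay as well.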
+ */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int found_type; + u64 mask = root->sectorsize - 1; + u64 extent_end; + u64 alloc_hint; + u64 start = key->offset; + u64 saved_nbytes; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) + extent_end = start + btrfs_file_extent_num_bytes(eb, item); + else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, item); + extent_end = (start + size + mask) & ~mask; + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. + */ + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), + start, 0); + + if (ret == 0 && + (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(path); + goto out; + } + } + btrfs_release_path(path); + + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, inode, start, extent_end, + &alloc_hint, 1); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 offset; + unsigned long dest_offset; + struct btrfs_key ins; + + ret = btrfs_insert_empty_item(trans, root, path, key, + sizeof(*item)); + BUG_ON(ret); + dest_offset = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, + (unsigned long)item, sizeof(*item)); + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + offset = key->offset - btrfs_file_extent_offset(eb, item); + + if (ins.objectid > 0) { + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + /* + * is this extent already allocated in the extent + * allocation tree? 
If so, just add a reference + */ + ret = btrfs_lookup_extent(root, ins.objectid, + ins.offset); + if (ret == 0) { + ret = btrfs_inc_extent_ref(trans, root, + ins.objectid, ins.offset, + 0, root->root_key.objectid, + key->objectid, offset); + BUG_ON(ret); + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_file_extent(trans, + root, root->root_key.objectid, + key->objectid, offset, &ins); + BUG_ON(ret); + } + btrfs_release_path(path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_range(root->log_root, + csum_start, csum_end - 1, + &ordered_sums, 0); + BUG_ON(ret); + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, + root->fs_info->csum_root, + sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + } else { + btrfs_release_path(path); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + } + + inode_set_bytes(inode, saved_nbytes); + btrfs_update_inode(trans, root, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. + * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *dir, + struct btrfs_dir_item *di) +{ + struct inode *inode; + char *name; + int name_len; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + name_len = btrfs_dir_name_len(leaf, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); + btrfs_release_path(path); + + inode = read_one_inode(root, location.objectid); + if (!inode) { + kfree(name); + return -EIO; + } + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + BUG_ON(ret); + + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + BUG_ON(ret); + kfree(name); + + iput(inode); + return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + const char *name, int name_len) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int match = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + btrfs_release_path(path); + + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + if (di && !IS_ERR(di)) { + 
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + match = 1; +out: + btrfs_release_path(path); + return match; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + char *name, int namelen) +{ + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long ptr_end; + unsigned long name_ptr; + int found_name_len; + int item_size; + int ret; + int match = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret != 0) + goto out; + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + ref = (struct btrfs_inode_ref *)ptr; + found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); + if (found_name_len == namelen) { + name_ptr = (unsigned long)(ref + 1); + ret = memcmp_extent_buffer(path->nodes[0], name, + name_ptr, namelen); + if (ret == 0) { + match = 1; + goto out; + } + } + ptr = (unsigned long)(ref + 1) + found_name_len; + } +out: + btrfs_free_path(path); + return match; +} + + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *dir; + struct inode *inode; + unsigned long ref_ptr; + unsigned long ref_end; + char *name; + int namelen; + int ret; + int search_done = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; + + inode = read_one_inode(root, key->objectid); + if (!inode) { + iput(dir); + return -EIO; + } + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +again: + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + goto out; + } + + /* + * look for a conflicting back reference in the metadata. + * if we find one we have to unlink that name of the file + * before we add our new link. 
Later on, we overwrite any + * existing back reference, and we don't want to create + * dangling pointers in the directory. + */ + + if (search_done) + goto insert; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *victim_name; + int victim_name_len; + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + struct extent_buffer *leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (key->objectid == key->offset) + goto out_nowrite; + + /* check all the names in this back reference to see + * if they are in the log. if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, + victim_ref); + victim_name = kmalloc(victim_name_len, GFP_NOFS); + BUG_ON(!victim_name); + + read_extent_buffer(leaf, victim_name, + (unsigned long)(victim_ref + 1), + victim_name_len); + + if (!backref_in_log(log, key, victim_name, + victim_name_len)) { + btrfs_inc_nlink(inode); + btrfs_release_path(path); + + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + } + BUG_ON(ret); + + /* + * NOTE: we have searched root tree and checked the + * coresponding ref, it does not need to check again. + */ + search_done = 1; + } + btrfs_release_path(path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + + /* look for a conflicing name */ + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + +insert: + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, + btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + +out: + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + if (ref_ptr < ref_end) + goto again; + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + +out_nowrite: + btrfs_release_path(path); + iput(dir); + iput(inode); + return 0; +} + +static int insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + int ret; + ret = btrfs_find_orphan_item(root, offset); + if (ret > 0) + ret = btrfs_insert_orphan_item(trans, root, offset); + return ret; +} + + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. 
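+ * + * If the count drops to zero the inode is also given an orphan item (and, + * for directories, the remaining entries are removed) so cleanup can finish.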
+ */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + u64 nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + u64 ino = btrfs_ino(inode); + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + key.offset--; + btrfs_release_path(path); + } + btrfs_release_path(path); + if (nlink != inode->i_nlink) { + inode->i_nlink = nlink; + btrfs_update_inode(trans, root, inode); + } + BTRFS_I(inode)->index_cnt = (u64)-1; + + if (inode->i_nlink == 0) { + if (S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + ino, 1); + BUG_ON(ret); + } + ret = insert_orphan_item(trans, root, ino); + BUG_ON(ret); + } + btrfs_free_path(path); + + return 0; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + btrfs_release_path(path); + inode = read_one_inode(root, key.offset); + if (!inode) + return -EIO; + + ret = fixup_inode_link_count(trans, root, inode); + BUG_ON(ret); + + iput(inode); + + /* + * fixup on a directory may create new entries, + * make sure we always look for the highset possible + * offset + */ + key.offset = (u64)-1; + } + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. 
The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + if (!inode) + return -EIO; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(path); + if (ret == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(trans, root, inode); + } else if (ret == -EEXIST) { + ret = 0; + } else { + BUG(); + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. This means an fsync on a directory + * does not implicitly fsync all the new files in it + */ +static noinline int insert_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 index, + char *name, int name_len, u8 type, + struct btrfs_key *location) +{ + struct inode *inode; + struct inode *dir; + int ret; + + inode = read_one_inode(root, location->objectid); + if (!inode) + return -ENOENT; + + dir = read_one_inode(root, dirid); + if (!dir) { + iput(inode); + return -EIO; + } + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + + /* FIXME, put inode into FIXUP list */ + + iput(inode); + iput(dir); + return ret; +} + +/* + * take a single entry in a log directory item and replay it into + * the subvolume. + * + * if a conflicting item exists in the subdirectory already, + * the inode it points to is unlinked and put into the link count + * fix up tree. + * + * If a name from the log points to a file or directory that does + * not exist in the FS, it is skipped. fsyncs on directories + * do not force down inodes inside that directory, just changes to the + * names or unlinks in a directory. 
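+ * + * A conflicting entry is only dropped when the inode the logged name points + * to actually exists; otherwise the old entry is left alone.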
+ */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, + struct btrfs_dir_item *di, + struct btrfs_key *key) +{ + char *name; + int name_len; + struct btrfs_dir_item *dst_di; + struct btrfs_key found_key; + struct btrfs_key log_key; + struct inode *dir; + u8 log_type; + int exists; + int ret; + + dir = read_one_inode(root, key->objectid); + if (!dir) + return -EIO; + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + + log_type = btrfs_dir_type(eb, di); + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + + btrfs_dir_item_key_to_cpu(eb, di, &log_key); + exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); + if (exists == 0) + exists = 1; + else + exists = 0; + btrfs_release_path(path); + + if (key->type == BTRFS_DIR_ITEM_KEY) { + dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + } else if (key->type == BTRFS_DIR_INDEX_KEY) { + dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, + key->offset, name, + name_len, 1); + } else { + BUG(); + } + if (IS_ERR_OR_NULL(dst_di)) { + /* we need a sequence number to insert, so we only + * do inserts for the BTRFS_DIR_INDEX_KEY types + */ + if (key->type != BTRFS_DIR_INDEX_KEY) + goto out; + goto insert; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* the existing item matches the logged item */ + if (found_key.objectid == log_key.objectid && + found_key.type == log_key.type && + found_key.offset == log_key.offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + goto out; + } + + /* + * don't drop the conflicting directory entry if the inode + * for the new entry doesn't exist + */ + if (!exists) + goto out; + + ret = drop_one_dir_item(trans, root, path, dir, dst_di); + BUG_ON(ret); + + if (key->type == BTRFS_DIR_INDEX_KEY) + goto insert; +out: + btrfs_release_path(path); + kfree(name); + iput(dir); + return 0; + +insert: + btrfs_release_path(path); + ret = insert_one_name(trans, root, path, key->objectid, key->offset, + name, name_len, log_type, &log_key); + + BUG_ON(ret && ret != -ENOENT); + goto out; +} + +/* + * find all the names in a directory item and reconcile them into + * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than + * one name in a directory item, but the same code gets used for + * both directory index types + */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size = btrfs_item_size_nr(eb, slot); + struct btrfs_dir_item *di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) + return -EIO; + name_len = btrfs_dir_name_len(eb, di); + ret = replay_one_name(trans, root, path, eb, di, key); + BUG_ON(ret); + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + return 0; +} + +/* + * directory replay has two parts. There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for. 
During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. + */ +static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, int key_type, + u64 *start_ret, u64 *end_ret) +{ + struct btrfs_key key; + u64 found_end; + struct btrfs_dir_log_item *item; + int ret; + int nritems; + + if (*start_ret == (u64)-1) + return 1; + + key.objectid = dirid; + key.type = key_type; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto next; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + + if (*start_ret >= key.offset && *start_ret <= found_end) { + ret = 0; + *start_ret = key.offset; + *end_ret = found_end; + goto out; + } + ret = 1; +next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } else { + path->slots[0]++; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto out; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + *start_ret = key.offset; + *end_ret = found_end; + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + +/* + * this looks for a given directory item in the log. 
If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_path *log_path, + struct inode *dir, + struct btrfs_key *dir_key) +{ + int ret; + struct extent_buffer *eb; + int slot; + u32 item_size; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + char *name; + struct inode *inode; + struct btrfs_key location; + +again: + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) { + ret = -EIO; + goto out; + } + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + log_di = NULL; + if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { + log_di = btrfs_lookup_dir_item(trans, log, log_path, + dir_key->objectid, + name, name_len, 0); + } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { + log_di = btrfs_lookup_dir_index_item(trans, log, + log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + } + if (IS_ERR_OR_NULL(log_di)) { + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(path); + btrfs_release_path(log_path); + inode = read_one_inode(root, location.objectid); + if (!inode) { + kfree(name); + return -EIO; + } + + ret = link_to_fixup_dir(trans, root, + path, location.objectid); + BUG_ON(ret); + btrfs_inc_nlink(inode); + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + BUG_ON(ret); + kfree(name); + iput(inode); + + /* there might still be more names under this key + * check and repeat if required + */ + ret = btrfs_search_slot(NULL, root, dir_key, path, + 0, 0); + if (ret == 0) + goto again; + ret = 0; + goto out; + } + btrfs_release_path(log_path); + kfree(name); + + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + ret = 0; +out: + btrfs_release_path(path); + btrfs_release_path(log_path); + return ret; +} + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes. It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. 
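+ * + * Both the DIR_ITEM and DIR_INDEX key spaces are walked. When del_all is + * set every entry is removed; that mode is used while fixing up directories + * whose link count dropped to zero.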
+ */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all) +{ + u64 range_start; + u64 range_end; + int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; + struct btrfs_path *log_path; + struct inode *dir; + + dir_key.objectid = dirid; + dir_key.type = BTRFS_DIR_ITEM_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + dir = read_one_inode(root, dirid); + /* it isn't an error if the inode isn't there, that can happen + * because we replay the deletes before we copy in the inode item + * from the log + */ + if (!dir) { + btrfs_free_path(log_path); + return 0; + } +again: + range_start = 0; + range_end = 0; + while (1) { + if (del_all) + range_end = (u64)-1; + else { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + } + + dir_key.offset = range_start; + while (1) { + int nritems; + ret = btrfs_search_slot(NULL, root, &dir_key, path, + 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || + found_key.type != dir_key.type) + goto next_type; + + if (found_key.offset > range_end) + break; + + ret = check_item_in_log(trans, root, log, path, + log_path, dir, + &found_key); + BUG_ON(ret); + if (found_key.offset == (u64)-1) + break; + dir_key.offset = found_key.offset + 1; + } + btrfs_release_path(path); + if (range_end == (u64)-1) + break; + range_start = range_end + 1; + } + +next_type: + ret = 0; + if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { + key_type = BTRFS_DIR_LOG_INDEX_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; + btrfs_release_path(path); + goto again; + } +out: + btrfs_release_path(path); + btrfs_free_path(log_path); + iput(dir); + return ret; +} + +/* + * the process_func used to replay items from the log tree. This + * gets called in two different stages. The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume. The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). 
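+ *
+ * Concretely, in replay_one_buffer() below: the LOG_WALK_REPLAY_INODES
+ * pass only acts on BTRFS_INODE_ITEM_KEY items (replaying directory
+ * deletes for directories, inserting orphan items for regular files and
+ * adding fixup-dir entries as it goes), while the LOG_WALK_REPLAY_ALL
+ * pass copies xattrs, inode refs, file extent items and directory
+ * items/indexes.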
+ */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + int nritems; + struct btrfs_path *path; + struct btrfs_root *root = wc->replay_dest; + struct btrfs_key key; + int level; + int i; + int ret; + + btrfs_read_buffer(eb, gen); + + level = btrfs_header_level(eb); + + if (level != 0) + return 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + struct btrfs_inode_item *inode_item; + u32 mode; + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); + mode = btrfs_inode_mode(eb, inode_item); + if (S_ISDIR(mode)) { + ret = replay_dir_deletes(wc->trans, + root, log, path, key.objectid, 0); + BUG_ON(ret); + } + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + + /* for regular files, make sure corresponding + * orhpan item exist. extents past the new EOF + * will be truncated later by orphan cleanup. + */ + if (S_ISREG(mode)) { + ret = insert_orphan_item(wc->trans, root, + key.objectid); + BUG_ON(ret); + } + + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + BUG_ON(ret); + } + if (wc->stage < LOG_WALK_REPLAY_ALL) + continue; + + /* these keys are simply copied */ + if (key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_INODE_REF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_DIR_ITEM_KEY || + key.type == BTRFS_DIR_INDEX_KEY) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } + } + btrfs_free_path(path); + return 0; +} + +static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u32 blocksize; + int ret = 0; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + while (*level > 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!next) + return -ENOMEM; + + if (*level == 1) { + wc->process_func(root, next, wc, ptr_gen); + + path->slots[*level]++; + if (wc->free) { + btrfs_read_buffer(next, ptr_gen); + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(root_owner != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(next); + continue; + } + 
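+ /*
+ * *level is greater than 1 here: read the child node and step down into
+ * it.  The loop keeps descending until it reaches a level 1 node, whose
+ * leaf children are handed to wc->process_func in the branch above.
+ */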
btrfs_read_buffer(next, ptr_gen); + + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); + + cond_resched(); + return 0; +} + +static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + int i; + int slot; + int ret; + + for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[*level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + path->nodes[*level]->start, + path->nodes[*level]->len); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(log->node); + orig_level = level; + path->nodes[level] = log->node; + extent_buffer_get(log->node); + path->slots[level] = 0; + + while (1) { + wret = walk_down_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + } + + /* was the root node processed? 
if not, catch it here */ + if (path->nodes[orig_level]) { + wc->process_func(log, path->nodes[orig_level], wc, + btrfs_header_generation(path->nodes[orig_level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[orig_level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, log, next); + btrfs_set_lock_blocking(next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + WARN_ON(log->root_key.objectid != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(log, next->start, + next->len); + BUG_ON(ret); + } + } + + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } + btrfs_free_path(path); + return ret; +} + +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + + if (log->log_transid == 1) { + /* insert root item on the first sync */ + ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } else { + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } + return ret; +} + +static int wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long transid) +{ + DEFINE_WAIT(wait); + int index = transid % 2; + + /* + * we only allow two pending log transactions at a time, + * so we know that if ours is more than 2 older than the + * current transaction, we're done + */ + do { + prepare_to_wait(&root->log_commit_wait[index], + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + + if (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])) + schedule(); + + finish_wait(&root->log_commit_wait[index], &wait); + mutex_lock(&root->log_mutex); + } while (root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])); + return 0; +} + +static int wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + DEFINE_WAIT(wait); + while (atomic_read(&root->log_writers)) { + prepare_to_wait(&root->log_writer_wait, + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + if (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) + schedule(); + mutex_lock(&root->log_mutex); + finish_wait(&root->log_writer_wait, &wait); + } + return 0; +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it. When this call is done, + * you know that any inodes previously logged are safely on disk only + * if it returns 0. + * + * Any other return value means you need to call btrfs_commit_transaction. + * Some of the edge cases for fsyncing directories that have had unlinks + * or renames done in the past mean that sometimes the only safe + * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, + * that has happened. 
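+ *
+ * A caller is therefore expected to handle the return value roughly like
+ * this (sketch only, error handling trimmed):
+ *
+ *	ret = btrfs_sync_log(trans, root);
+ *	if (ret)
+ *		btrfs_commit_transaction(trans, root);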
+ */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int index1; + int index2; + int mark; + int ret; + struct btrfs_root *log = root->log_root; + struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + unsigned long log_transid = 0; + + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; + if (atomic_read(&root->log_commit[index1])) { + wait_log_commit(trans, root, root->log_transid); + mutex_unlock(&root->log_mutex); + return 0; + } + atomic_set(&root->log_commit[index1], 1); + + /* wait for previous tree log sync to complete */ + if (atomic_read(&root->log_commit[(index1 + 1) % 2])) + wait_log_commit(trans, root, root->log_transid - 1); + + while (1) { + unsigned long batch = root->log_batch; + if (root->log_multiple_pids) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } + wait_for_writer(trans, root); + if (batch == root->log_batch) + break; + } + + /* bail out if we need to do a full commit */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + ret = -EAGAIN; + mutex_unlock(&root->log_mutex); + goto out; + } + + log_transid = root->log_transid; + if (log_transid % 2 == 0) + mark = EXTENT_DIRTY; + else + mark = EXTENT_NEW; + + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); + BUG_ON(ret); + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_batch = 0; + root->log_transid++; + log->log_transid = root->log_transid; + root->log_start_pid = 0; + smp_mb(); + /* + * IO has been started, blocks of the log tree have WRITTEN flag set + * in their headers. new modifications of the log will be written to + * new positions. so it's safe to allow log writers to go in. 
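+ *
+ * Note the even/odd scheme above: an even log_transid marks its blocks
+ * EXTENT_DIRTY and an odd one uses EXTENT_NEW, so each of the two
+ * in-flight log commits only waits on the pages it marked itself.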
+ */ + mutex_unlock(&root->log_mutex); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_writers); + mutex_unlock(&log_root_tree->log_mutex); + + ret = update_log_root(trans, log); + + mutex_lock(&log_root_tree->log_mutex); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { + smp_mb(); + if (waitqueue_active(&log_root_tree->log_writer_wait)) + wake_up(&log_root_tree->log_writer_wait); + } + + if (ret) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out; + } + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); + ret = 0; + goto out; + } + atomic_set(&log_root_tree->log_commit[index2], 1); + + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid - 1); + } + + wait_for_writer(trans, log_root_tree); + + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; + } + + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + BUG_ON(ret); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log_root_tree->node->start); + btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_header_level(log_root_tree->node)); + + log_root_tree->log_batch = 0; + log_root_tree->log_transid++; + smp_mb(); + + mutex_unlock(&log_root_tree->log_mutex); + + /* + * nobody else is going to jump in and write the the ctree + * super here because the log_commit atomic below is protecting + * us. We must be called with a transaction handle pinning + * the running transaction open, so a full commit can't hop + * in and cause problems either. 
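+ *
+ * Only the super blocks are written below; the log tree blocks themselves
+ * were already written and waited on via the marked extent ranges above.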
+ */ + btrfs_scrub_pause_super(root); + write_ctree_super(trans, root->fs_info->tree_root, 1); + btrfs_scrub_continue_super(root); + ret = 0; + + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + +out_wake_log_root: + atomic_set(&log_root_tree->log_commit[index2], 0); + smp_mb(); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) + wake_up(&log_root_tree->log_commit_wait[index2]); +out: + atomic_set(&root->log_commit[index1], 0); + smp_mb(); + if (waitqueue_active(&root->log_commit_wait[index1])) + wake_up(&root->log_commit_wait[index1]); + return ret; +} + +static void free_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + u64 start; + u64 end; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; + + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + while (1) { + ret = find_first_extent_bit(&log->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + if (ret) + break; + + clear_extent_bits(&log->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + } + + free_extent_buffer(log->node); + kfree(log); +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + if (root->log_root) { + free_log_tree(trans, root->log_root); + root->log_root = NULL; + } + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + if (fs_info->log_root_tree) { + free_log_tree(trans, fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + } + return 0; +} + +/* + * If both a file and directory are logged, and unlinks or renames are + * mixed in, we have a few interesting corners: + * + * create file X in dir Y + * link file X to X.link in dir Y + * fsync file X + * unlink file X but leave X.link + * fsync dir Y + * + * After a crash we would expect only X.link to exist. But file X + * didn't get fsync'd again so the log has back refs for X and X.link. + * + * We solve this by removing directory entries and inode backrefs from the + * log when a file that was logged in the current transaction is + * unlinked. Any later fsync will include the updated log entries, and + * we'll be able to reconstruct the proper directory items from backrefs. + * + * This optimizations allows us to avoid relogging the entire inode + * or the entire directory. 
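+ *
+ * The unlink and rename paths are expected to call the two helpers below
+ * once a name has been removed from the subvolume, along the lines of
+ * this sketch:
+ *
+ *	btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
+ *	btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ *				   btrfs_ino(dir));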
+ */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index) +{ + struct btrfs_root *log; + struct btrfs_dir_item *di; + struct btrfs_path *path; + int ret; + int err = 0; + int bytes_del = 0; + u64 dir_ino = btrfs_ino(dir); + + if (BTRFS_I(dir)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + + mutex_lock(&BTRFS_I(dir)->log_mutex); + + log = root->log_root; + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out_unlock; + } + + di = btrfs_lookup_dir_item(trans, log, path, dir_ino, + name, name_len, -1); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + btrfs_release_path(path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + + /* update the directory size in the log to reflect the names + * we have removed + */ + if (bytes_del) { + struct btrfs_key key; + + key.objectid = dir_ino; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; + btrfs_release_path(path); + + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (ret == 0) { + struct btrfs_inode_item *item; + u64 i_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + i_size = btrfs_inode_size(path->nodes[0], item); + if (i_size > bytes_del) + i_size -= bytes_del; + else + i_size = 0; + btrfs_set_inode_size(path->nodes[0], item, i_size); + btrfs_mark_buffer_dirty(path->nodes[0]); + } else + ret = 0; + btrfs_release_path(path); + } +fail: + btrfs_free_path(path); +out_unlock: + mutex_unlock(&BTRFS_I(dir)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } + btrfs_end_log_trans(root); + + return err; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + if (BTRFS_I(inode)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + log = root->log_root; + mutex_lock(&BTRFS_I(inode)->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), + dirid, &index); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } + btrfs_end_log_trans(root); + + return ret; +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. 
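+ *
+ * For example, a log key (dirid, BTRFS_DIR_LOG_INDEX_KEY, first_offset)
+ * whose btrfs_dir_log_item has dir_log_end == last_offset means the log
+ * is authoritative for every dir index offset in
+ * [first_offset, last_offset]; replay may delete subvolume entries in
+ * that range that the log no longer contains, but must leave entries
+ * outside it untouched.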
+ */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + int key_type, u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + if (key_type == BTRFS_DIR_ITEM_KEY) + key.type = BTRFS_DIR_LOG_ITEM_KEY; + else + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + if (ret) + return ret; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, int key_type, + u64 min_offset, u64 *last_offset_ret) +{ + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src; + int err = 0; + int ret; + int i; + int nritems; + u64 first_offset = min_offset; + u64 last_offset = (u64)-1; + u64 ino = btrfs_ino(inode); + + log = root->log_root; + max_key.objectid = ino; + max_key.offset = (u64)-1; + max_key.type = key_type; + + min_key.objectid = ino; + min_key.type = key_type; + min_key.offset = min_offset; + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + + /* + * we didn't find anything from this transaction, see if there + * is anything at all + */ + if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { + min_key.objectid = ino; + min_key.type = key_type; + min_key.offset = (u64)-1; + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + ret = btrfs_previous_item(root, path, ino, key_type); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. + * otherwise, there are no items in this directory after + * *min_offset, and we create a range to indicate that. 
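+ *
+ * For example, with min_offset == 3 and the directory's highest key of
+ * this type at offset 9, first_offset becomes max(3, 9) + 1 == 10 and
+ * the range item inserted at the end covers 10 through (u64)-1, i.e. the
+ * log is authoritative for everything past the last pre-existing entry.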
+ */ + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); + if (key_type == tmp.type) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ + ret = btrfs_previous_item(root, path, ino, key_type); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (key_type == tmp.type) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + if (ret) { + err = ret; + goto done; + } + } + } + btrfs_release_path(path); + + /* find the first key from this transaction again */ + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret != 0) { + WARN_ON(1); + goto done; + } + + /* + * we have a block from this transaction, log every item in it + * from our directory + */ + while (1) { + struct btrfs_key tmp; + src = path->nodes[0]; + nritems = btrfs_header_nritems(src); + for (i = path->slots[0]; i < nritems; i++) { + btrfs_item_key_to_cpu(src, &min_key, i); + + if (min_key.objectid != ino || min_key.type != key_type) + goto done; + ret = overwrite_item(trans, log, dst_path, src, i, + &min_key); + if (ret) { + err = ret; + goto done; + } + } + path->slots[0] = nritems; + + /* + * look ahead to the next item and see if it is also + * from this directory and from this transaction + */ + ret = btrfs_next_leaf(root, path); + if (ret == 1) { + last_offset = (u64)-1; + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (tmp.objectid != ino || tmp.type != key_type) { + last_offset = (u64)-1; + goto done; + } + if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + if (ret) + err = ret; + else + last_offset = tmp.offset; + goto done; + } + } +done: + btrfs_release_path(path); + btrfs_release_path(dst_path); + + if (err == 0) { + *last_offset_ret = last_offset; + /* + * insert the log range keys to indicate where the log + * is valid + */ + ret = insert_dir_log_key(trans, log, path, key_type, + ino, first_offset, last_offset); + if (ret) + err = ret; + } + return err; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. + */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + u64 min_key; + u64 max_key; + int ret; + int key_type = BTRFS_DIR_ITEM_KEY; + +again: + min_key = 0; + max_key = 0; + while (1) { + ret = log_dir_items(trans, root, inode, path, + dst_path, key_type, min_key, + &max_key); + if (ret) + return ret; + if (max_key == (u64)-1) + break; + min_key = max_key + 1; + } + + if (key_type == BTRFS_DIR_ITEM_KEY) { + key_type = BTRFS_DIR_INDEX_KEY; + goto again; + } + return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode. 
max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_objectid_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 objectid, int max_key_type) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + + key.objectid = objectid; + key.type = max_key_type; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + BUG_ON(ret == 0); + if (ret < 0) + break; + + if (path->slots[0] == 0) + break; + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + if (found_key.objectid != objectid) + break; + + ret = btrfs_del_item(trans, log, path); + if (ret) + break; + btrfs_release_path(path); + } + btrfs_release_path(path); + return ret; +} + +static noinline int copy_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *dst_path, + struct extent_buffer *src, + int start_slot, int nr, int inode_only) +{ + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_file_extent_item *extent; + struct btrfs_inode_item *inode_item; + int ret; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + char *ins_data; + int i; + struct list_head ordered_sums; + + INIT_LIST_HEAD(&ordered_sums); + + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + + nr * sizeof(u32), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + + for (i = 0; i < nr; i++) { + ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } + ret = btrfs_insert_empty_items(trans, log, dst_path, + ins_keys, ins_sizes, nr); + if (ret) { + kfree(ins_data); + return ret; + } + + for (i = 0; i < nr; i++, dst_path->slots[0]++) { + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], + dst_path->slots[0]); + + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); + + if (inode_only == LOG_INODE_EXISTS && + ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(dst_path->nodes[0], + dst_path->slots[0], + struct btrfs_inode_item); + btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); + + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(dst_path->nodes[0], + inode_item, 0); + } + /* take a reference on file data extents so that truncates + * or deletes of this inode don't have to relog the inode + * again + */ + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + int found_type; + extent = btrfs_item_ptr(src, start_slot + i, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(src, extent) < trans->transid) + continue; + + found_type = btrfs_file_extent_type(src, extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 ds, dl, cs, cl; + ds = btrfs_file_extent_disk_bytenr(src, + extent); + /* ds == 0 is a hole */ + if (ds == 0) + continue; + + dl = btrfs_file_extent_disk_num_bytes(src, + extent); + cs = btrfs_file_extent_offset(src, extent); + cl = btrfs_file_extent_num_bytes(src, + extent); + if (btrfs_file_extent_compression(src, + 
extent)) { + cs = 0; + cl = dl; + } + + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums, 0); + BUG_ON(ret); + } + } + } + + btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_release_path(dst_path); + kfree(ins_data); + + /* + * we have to do this after the loop above to avoid changing the + * log tree while trying to change the log tree. + */ + ret = 0; + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); + list_del(&sums->list); + kfree(sums); + } + return ret; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree. An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. + */ +static int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src = NULL; + int err = 0; + int ret; + int nritems; + int ins_start_slot = 0; + int ins_nr; + u64 ino = btrfs_ino(inode); + + log = root->log_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + dst_path = btrfs_alloc_path(); + if (!dst_path) { + btrfs_free_path(path); + return -ENOMEM; + } + + min_key.objectid = ino; + min_key.type = BTRFS_INODE_ITEM_KEY; + min_key.offset = 0; + + max_key.objectid = ino; + + /* today the code can only do partial logging of directories */ + if (!S_ISDIR(inode->i_mode)) + inode_only = LOG_INODE_ALL; + + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; + } + + mutex_lock(&BTRFS_I(inode)->log_mutex); + + /* + * a brute force approach to making sure we get the most uptodate + * copies of everything. 
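+ *
+ * For directories, everything previously logged for this inode up to the
+ * chosen max key type is dropped and re-copied; for regular files the
+ * inode's items are truncated out of the log first and then re-copied
+ * from the subvolume by the loop below.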
+ */ + if (S_ISDIR(inode->i_mode)) { + int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, ino, max_key_type); + } else { + ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + } + if (ret) { + err = ret; + goto out_unlock; + } + path->keep_locks = 1; + + while (1) { + ins_nr = 0; + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + if (ret != 0) + break; +again: + /* note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key.objectid != ino) + break; + if (min_key.type > max_key.type) + break; + + src = path->nodes[0]; + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ins_nr, inode_only); + if (ret) { + err = ret; + goto out_unlock; + } + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + if (ret) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + } + btrfs_release_path(path); + + if (min_key.offset < (u64)-1) + min_key.offset++; + else if (min_key.type < (u8)-1) + min_key.type++; + else if (min_key.objectid < (u64)-1) + min_key.objectid++; + else + break; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + if (ret) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + } + WARN_ON(ins_nr); + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + ret = log_directory_changes(trans, root, inode, path, dst_path); + if (ret) { + err = ret; + goto out_unlock; + } + } + BTRFS_I(inode)->logged_trans = trans->transid; +out_unlock: + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + btrfs_free_path(path); + btrfs_free_path(dst_path); + return err; +} + +/* + * follow the dentry parent pointers up the chain and see if any + * of the directories in it require a full commit before they can + * be logged. Returns zero if nothing special needs to be done or 1 if + * a full commit is required. + */ +static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + struct inode *inode, + struct dentry *parent, + struct super_block *sb, + u64 last_committed) +{ + int ret = 0; + struct btrfs_root *root; + struct dentry *old_parent = NULL; + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. 
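+ *
+ * For example, a regular file created and fsynced in an earlier,
+ * already-committed transaction and then renamed or unlinked in the
+ * current one has last_unlink_trans newer than last_committed, so the
+ * early exit below is skipped and the parent directories are walked;
+ * any directory in that chain whose last_unlink_trans is newer than the
+ * last commit forces a full transaction commit.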
+ */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto out; + + if (!S_ISDIR(inode->i_mode)) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + goto out; + inode = parent->d_inode; + } + + while (1) { + BTRFS_I(inode)->logged_trans = trans->transid; + smp_mb(); + + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + root = BTRFS_I(inode)->root; + + /* + * make sure any commits to the log are forced + * to be full commits + */ + root->fs_info->last_trans_log_full_commit = + trans->transid; + ret = 1; + break; + } + + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + inode = parent->d_inode; + + } + dput(old_parent); +out: + return ret; +} + +static int inode_in_log(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == trans->transid && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only) +{ + int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; + struct super_block *sb; + struct dentry *old_parent = NULL; + int ret = 0; + u64 last_committed = root->fs_info->last_trans_committed; + + sb = inode->i_sb; + + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + + if (root->fs_info->last_trans_log_full_commit > + root->fs_info->last_trans_committed) { + ret = 1; + goto end_no_trans; + } + + if (root != BTRFS_I(inode)->root || + btrfs_root_refs(&root->root_item) == 0) { + ret = 1; + goto end_no_trans; + } + + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; + + if (inode_in_log(trans, inode)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + + ret = start_log_trans(trans, root); + if (ret) + goto end_trans; + + ret = btrfs_log_inode(trans, root, inode, inode_only); + if (ret) + goto end_trans; + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. 
+ */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) { + ret = 0; + goto end_trans; + } + + inode_only = LOG_INODE_EXISTS; + while (1) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + inode = parent->d_inode; + if (root != BTRFS_I(inode)->root) + break; + + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + if (ret) + goto end_trans; + } + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + } + ret = 0; +end_trans: + dput(old_parent); + if (ret < 0) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 1; + } + btrfs_end_log_trans(root); +end_no_trans: + return ret; +} + +/* + * it is not safe to log dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. + */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + struct dentry *parent = dget_parent(dentry); + int ret; + + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + dput(parent); + + return ret; +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, + }; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + fs_info->log_root_recovering = 1; + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); + + wc.trans = trans; + wc.pin = 1; + + ret = walk_log_tree(trans, log_root_tree, &wc); + BUG_ON(ret); + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + while (1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_fs_root_no_radix(log_root_tree, + &found_key); + BUG_ON(IS_ERR(log)); + + tmp_key.objectid = found_key.offset; + tmp_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_key.offset = (u64)-1; + + wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + BUG_ON(IS_ERR_OR_NULL(wc.replay_dest)); + + wc.replay_dest->log_root = log; + btrfs_record_root_in_trans(trans, wc.replay_dest); + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + if (wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + BUG_ON(ret); + } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; + free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); + kfree(log); + + if (found_key.offset == 0) + break; + } + btrfs_release_path(path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 
0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + free_extent_buffer(log_root_tree->node); + log_root_tree->log_root = NULL; + fs_info->log_root_recovering = 0; + + /* step 4: commit the transaction, which also unpins the blocks */ + btrfs_commit_transaction(trans, fs_info->tree_root); + + kfree(log_root_tree); + return 0; +} + +/* + * there are some corner cases where we want to force a full + * commit instead of allowing a directory to be logged. + * + * They revolve around files there were unlinked from the directory, and + * this function updates the parent directory so that a full commit is + * properly done if it is fsync'd later after the unlinks are done. + */ +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename) +{ + /* + * when we're logging a file, if it hasn't been renamed + * or unlinked, and its inode is fully committed on disk, + * we don't have to worry about walking up the directory chain + * to log its parents. + * + * So, we use the last_unlink_trans field to put this transid + * into the file. When the file is logged we check it and + * don't log the parents if the file is fully on disk. + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this directory was already logged any new + * names for this file/dir will get recorded + */ + smp_mb(); + if (BTRFS_I(dir)->logged_trans == trans->transid) + return; + + /* + * if the inode we're about to unlink was logged, + * the log will be properly updated for any new names + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) + return; + + /* + * when renaming files across directories, if the directory + * there we're unlinking from gets fsync'd later on, there's + * no way to find the destination directory later and fsync it + * properly. So, we have to be conservative and force commits + * so the new name gets discovered. + */ + if (for_rename) + goto record; + + /* we can safely do the unlink without any special recording */ + return; + +record: + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + +/* + * Call this after adding a new name for a file and it will properly + * update the log to reflect the new name. + * + * It will return zero if all goes well, and it will return 1 if a + * full transaction commit is required. + */ +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent) +{ + struct btrfs_root * root = BTRFS_I(inode)->root; + + /* + * this will force the logging code to walk the dentry chain + * up for the file + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ + if (BTRFS_I(inode)->logged_trans <= + root->fs_info->last_trans_committed && + (!old_dir || BTRFS_I(old_dir)->logged_trans <= + root->fs_info->last_trans_committed)) + return 0; + + return btrfs_log_inode_parent(trans, root, inode, parent, 1); +} + diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 00000000..2270ac58 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid); +int btrfs_end_log_trans(struct btrfs_root *root); +int btrfs_pin_log_trans(struct btrfs_root *root); +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only); +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename); +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent); +#endif diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h new file mode 100644 index 00000000..9bf3946d --- /dev/null +++ b/fs/btrfs/version.h @@ -0,0 +1,4 @@ +#ifndef __BTRFS_VERSION_H +#define __BTRFS_VERSION_H +#define BTRFS_BUILD_VERSION "Btrfs" +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 00000000..43baaf0c --- /dev/null +++ b/fs/btrfs/volumes.c @@ -0,0 +1,3718 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" + +static int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); + +static DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); + +static void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + WARN_ON(fs_devices->opened); + while (!list_empty(&fs_devices->devices)) { + device = list_entry(fs_devices->devices.next, + struct btrfs_device, dev_list); + list_del(&device->dev_list); + kfree(device->name); + kfree(device); + } + kfree(fs_devices); +} + +int btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + + while (!list_empty(&fs_uuids)) { + fs_devices = list_entry(fs_uuids.next, + struct btrfs_fs_devices, list); + list_del(&fs_devices->list); + free_fs_devices(fs_devices); + } + return 0; +} + +static noinline struct btrfs_device *__find_device(struct list_head *head, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, head, dev_list) { + if (dev->devid == devid && + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { + return dev; + } + } + return NULL; +} + +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, list) { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + return NULL; +} + +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the schedulers ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. + */ +static noinline int run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run; + unsigned long batch_run = 0; + unsigned long limit; + unsigned long last_waited = 0; + int force_reg = 0; + struct blk_plug plug; + + /* + * this function runs all the bios we've collected for + * a particular device. We don't want to wander off to + * another device without first sending all of these down. 
+ * So, setup a plug here and finish it off before we return + */ + blk_start_plug(&plug); + + bdi = blk_get_backing_dev_info(device->bdev); + fs_info = device->dev_root->fs_info; + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + +loop: + spin_lock(&device->io_lock); + +loop_lock: + num_run = 0; + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + if (!force_reg && device->pending_sync_bios.head) { + pending_bios = &device->pending_sync_bios; + force_reg = 1; + } else { + pending_bios = &device->pending_bios; + force_reg = 0; + } + + pending = pending_bios->head; + tail = pending_bios->tail; + WARN_ON(pending && !tail); + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code. + */ + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { + again = 0; + device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; + } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + + spin_unlock(&device->io_lock); + + while (pending) { + + rmb(); + /* we want to work on both lists, but do more bios on the + * sync list than the regular list + */ + if ((num_run > 32 && + pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head) || + (num_run > 64 && pending_bios == &device->pending_sync_bios && + device->pending_bios.head)) { + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + atomic_dec(&fs_info->nr_async_bios); + + if (atomic_read(&fs_info->nr_async_bios) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + BUG_ON(atomic_read(&cur->bi_cnt) == 0); + + submit_bio(cur->bi_rw, cur); + num_run++; + batch_run++; + if (need_resched()) + cond_resched(); + + /* + * we made progress, there is more work to do and the bdi + * is now congested. Back off and let other work structs + * run instead + */ + if (pending && bdi_write_congested(bdi) && batch_run > 8 && + fs_info->fs_devices->open_devices > 1) { + struct io_context *ioc; + + ioc = current->io_context; + + /* + * the main goal here is that we don't want to + * block if we're going to be able to submit + * more requests without blocking. + * + * This code does two great things, it pokes into + * the elevator code from a filesystem _and_ + * it makes assumptions about how batching works. + */ + if (ioc && ioc->nr_batch_requests > 0 && + time_before(jiffies, ioc->last_waited + HZ/50UL) && + (last_waited == 0 || + ioc->last_waited == last_waited)) { + /* + * we want to go through our batch of + * requests and stop. 
So, we copy out + * the ioc->last_waited time and test + * against it before looping + */ + last_waited = ioc->last_waited; + if (need_resched()) + cond_resched(); + continue; + } + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + device->running_pending = 1; + + spin_unlock(&device->io_lock); + btrfs_requeue_work(&device->work); + goto done; + } + } + + cond_resched(); + if (again) + goto loop; + + spin_lock(&device->io_lock); + if (device->pending_bios.head || device->pending_sync_bios.head) + goto loop_lock; + spin_unlock(&device->io_lock); + +done: + blk_finish_plug(&plug); + return 0; +} + +static void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + +static noinline int device_list_add(const char *path, + struct btrfs_super_block *disk_super, + u64 devid, struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices; + u64 found_transid = btrfs_super_generation(disk_super); + char *name; + + fs_devices = find_fsid(disk_super->fsid); + if (!fs_devices) { + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return -ENOMEM; + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + list_add(&fs_devices->list, &fs_uuids); + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + mutex_init(&fs_devices->device_list_mutex); + device = NULL; + } else { + device = __find_device(&fs_devices->devices, devid, + disk_super->dev_item.uuid); + } + if (!device) { + if (fs_devices->opened) + return -EBUSY; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + return -ENOMEM; + } + device->devid = devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE); + spin_lock_init(&device->io_lock); + device->name = kstrdup(path, GFP_NOFS); + if (!device->name) { + kfree(device); + return -ENOMEM; + } + INIT_LIST_HEAD(&device->dev_alloc_list); + + mutex_lock(&fs_devices->device_list_mutex); + list_add_rcu(&device->dev_list, &fs_devices->devices); + mutex_unlock(&fs_devices->device_list_mutex); + + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } else if (!device->name || strcmp(device->name, path)) { + name = kstrdup(path, GFP_NOFS); + if (!name) + return -ENOMEM; + kfree(device->name); + device->name = name; + if (device->missing) { + fs_devices->missing_devices--; + device->missing = 0; + } + } + + if (found_transid > fs_devices->latest_trans) { + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + } + *fs_devices_ret = fs_devices; + return 0; +} + +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) +{ + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + struct btrfs_device *orig_dev; + + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + INIT_LIST_HEAD(&fs_devices->list); + mutex_init(&fs_devices->device_list_mutex); + fs_devices->latest_devid = orig->latest_devid; + fs_devices->latest_trans = orig->latest_trans; + memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + + /* We have held the volume lock, it is safe to 
get the devices. */ + list_for_each_entry(orig_dev, &orig->devices, dev_list) { + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); + if (!device->name) { + kfree(device); + goto error; + } + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_list); + INIT_LIST_HEAD(&device->dev_alloc_list); + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + return fs_devices; +error: + free_fs_devices(fs_devices); + return ERR_PTR(-ENOMEM); +} + +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device, *next; + + mutex_lock(&uuid_mutex); +again: + /* This is the initialized path, it is safe to release the devices. */ + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (device->in_fs_metadata) + continue; + + if (device->bdev) { + blkdev_put(device->bdev, device->mode); + device->bdev = NULL; + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + device->writeable = 0; + fs_devices->rw_devices--; + } + list_del_init(&device->dev_list); + fs_devices->num_devices--; + kfree(device->name); + kfree(device); + } + + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } + + mutex_unlock(&uuid_mutex); + return 0; +} + +static void __free_device(struct work_struct *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, rcu_work); + + if (device->bdev) + blkdev_put(device->bdev, device->mode); + + kfree(device->name); + kfree(device); +} + +static void free_device(struct rcu_head *head) +{ + struct btrfs_device *device; + + device = container_of(head, struct btrfs_device, rcu); + + INIT_WORK(&device->rcu_work, __free_device); + schedule_work(&device->rcu_work); +} + +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + + if (--fs_devices->opened > 0) + return 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + struct btrfs_device *new_device; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->can_discard) + fs_devices->num_can_discard--; + + new_device = kmalloc(sizeof(*new_device), GFP_NOFS); + BUG_ON(!new_device); + memcpy(new_device, device, sizeof(*new_device)); + new_device->name = kstrdup(device->name, GFP_NOFS); + BUG_ON(device->name && !new_device->name); + new_device->bdev = NULL; + new_device->writeable = 0; + new_device->in_fs_metadata = 0; + new_device->can_discard = 0; + list_replace_rcu(&device->dev_list, &new_device->dev_list); + + call_rcu(&device->rcu, free_device); + } + mutex_unlock(&fs_devices->device_list_mutex); + + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; + fs_devices->seeding = 0; + + return 0; +} + +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_fs_devices *seed_devices = NULL; + int ret; + + mutex_lock(&uuid_mutex); + ret = __btrfs_close_devices(fs_devices); + if (!fs_devices->opened) { + seed_devices = fs_devices->seed; + fs_devices->seed = NULL; + } + mutex_unlock(&uuid_mutex); + + while 
(seed_devices) { + fs_devices = seed_devices; + seed_devices = fs_devices->seed; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } + return ret; +} + +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + struct request_queue *q; + struct block_device *bdev; + struct list_head *head = &fs_devices->devices; + struct btrfs_device *device; + struct block_device *latest_bdev = NULL; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 latest_devid = 0; + u64 latest_transid = 0; + u64 devid; + int seeding = 1; + int ret = 0; + + flags |= FMODE_EXCL; + + list_for_each_entry(device, head, dev_list) { + if (device->bdev) + continue; + if (!device->name) + continue; + + bdev = blkdev_get_by_path(device->name, flags, holder); + if (IS_ERR(bdev)) { + printk(KERN_INFO "open %s failed\n", device->name); + goto error; + } + set_blocksize(bdev, 4096); + + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EINVAL; + goto error_close; + } + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + if (!latest_transid || device->generation > latest_transid) { + latest_devid = devid; + latest_transid = device->generation; + latest_bdev = bdev; + } + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + device->writeable = 0; + } else { + device->writeable = !bdev_read_only(bdev); + seeding = 0; + } + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) { + device->can_discard = 1; + fs_devices->num_can_discard++; + } + + device->bdev = bdev; + device->in_fs_metadata = 0; + device->mode = flags; + + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + fs_devices->rotating = 1; + + fs_devices->open_devices++; + if (device->writeable) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + } + brelse(bh); + continue; + +error_brelse: + brelse(bh); +error_close: + blkdev_put(bdev, flags); +error: + continue; + } + if (fs_devices->open_devices == 0) { + ret = -EIO; + goto out; + } + fs_devices->seeding = seeding; + fs_devices->opened = 1; + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + fs_devices->total_rw_bytes = 0; +out: + return ret; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + int ret; + + mutex_lock(&uuid_mutex); + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; + } else { + ret = __btrfs_open_devices(fs_devices, flags, holder); + } + mutex_unlock(&uuid_mutex); + return ret; +} + +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + struct buffer_head *bh; + int ret; + u64 devid; + u64 transid; + + mutex_lock(&uuid_mutex); + + flags |= FMODE_EXCL; + bdev = blkdev_get_by_path(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto error; + } + + ret = set_blocksize(bdev, 4096); + if (ret) + goto error_close; + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EINVAL; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = 
btrfs_stack_device_id(&disk_super->dev_item); + transid = btrfs_super_generation(disk_super); + if (disk_super->label[0]) + printk(KERN_INFO "device label %s ", disk_super->label); + else + printk(KERN_INFO "device fsid %pU ", disk_super->fsid); + printk(KERN_CONT "devid %llu transid %llu %s\n", + (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + + brelse(bh); +error_close: + blkdev_put(bdev, flags); +error: + mutex_unlock(&uuid_mutex); + return ret; +} + +/* helper to account the used device space in the range */ +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { + *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * find_free_dev_extent - find free space in the specified device + * @trans: transaction handler + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size of the max + * free space if we don't find suitable free space + * + * this uses a pretty simple search, the expectation is that it is + * called very infrequently and that a given device has a small number + * of extents + * + * @start is used to store the start of the free space if we find. But if we + * don't find suitable free space, it will be used to store the start position + * of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. 
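 *
 * Illustrative sketch (editorial addition, not part of the original patch):
 * the loop below is a plain "max hole" scan; the same idea over a
 * hypothetical sorted array of extents instead of a btree walk looks like:
 *
 *	u64 hole = search_start, best_start = search_start, best_len = 0;
 *	for (i = 0; i < nr_extents; i++) {	// ext[] sorted by start
 *		if (ext[i].start > hole && ext[i].start - hole > best_len) {
 *			best_start = hole;
 *			best_len = ext[i].start - hole;
 *			if (best_len >= num_bytes)
 *				break;		// big enough, stop early
 *		}
 *		if (ext[i].start + ext[i].len > hole)
 *			hole = ext[i].start + ext[i].len;
 *	}
 *	// then the tail hole [hole, search_end) is compared the same way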
+ */ +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; + u64 search_end = device->total_bytes; + int ret; + int slot; + struct extent_buffer *l; + + /* FIXME use last free of some kind */ + + /* we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + + max_hole_start = search_start; + max_hole_size = 0; + + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + path->reada = 2; + + key.objectid = device->devid; + key.offset = search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + if (key.offset > search_start) { + hole_size = key.offset - search_start; + + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than which we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. + */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; + } + } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; +next: + path->slots[0]++; + cond_resched(); + } + + hole_size = search_end- search_start; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* See above. 
*/ + if (hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: + btrfs_free_path(path); +error: + *start = max_hole_start; + if (len) + *len = max_hole_size; + return ret; +} + +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + if (ret) + goto out; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } + BUG_ON(ret); + + if (device->bytes_used > 0) + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + + WARN_ON(!device->in_fs_metadata); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), + BTRFS_UUID_SIZE); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_chunk(struct btrfs_root *root, + u64 objectid, u64 *offset) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_key found_key; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); + if (ret) { + *offset = 0; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != objectid) + *offset = 0; + else { + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_chunk); + *offset = found_key.offset + + btrfs_chunk_length(path->nodes[0], chunk); + } + } + ret = 0; +error: + 
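	/*
	 * Editorial note (not part of the original patch): this helper and
	 * find_next_devid() below share the same lookup idiom -- search for
	 * (objectid, type, (u64)-1), which always lands just past the last
	 * matching item, then step back with btrfs_previous_item() to read
	 * the highest existing key and derive the next free offset/devid:
	 *
	 *	key.offset = (u64)-1;
	 *	btrfs_search_slot(NULL, root, &key, path, 0, 0);  // ret > 0
	 *	if (btrfs_previous_item(root, path, objectid, type))
	 *		next = lowest;	// empty tree: 0 for chunks, 1 for devids
	 *	else
	 *		next = found_key.offset + step;	// + chunk length, or + 1
	 */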
btrfs_free_path(path); + return ret; +} + +static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *objectid = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *objectid = found_key.offset + 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct should be fully filled in + */ +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*dev_item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_generation(leaf, dev_item, 0); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); + btrfs_set_device_start_offset(leaf, dev_item, 0); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + ptr = (unsigned long)btrfs_device_fsid(dev_item); + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_rm_dev_item(struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_trans_handle *trans; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + lock_chunks(root); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; +out: + btrfs_free_path(path); + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + return ret; +} + +int btrfs_rm_device(struct btrfs_root 
*root, char *device_path) +{ + struct btrfs_device *device; + struct btrfs_device *next_device; + struct block_device *bdev; + struct buffer_head *bh = NULL; + struct btrfs_super_block *disk_super; + struct btrfs_fs_devices *cur_devices; + u64 all_avail; + u64 devid; + u64 num_devices; + u8 *dev_uuid; + int ret = 0; + bool clear_super = false; + + mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->volume_mutex); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && + root->fs_info->fs_devices->num_devices <= 4) { + printk(KERN_ERR "btrfs: unable to go below four devices " + "on raid10\n"); + ret = -EINVAL; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && + root->fs_info->fs_devices->num_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid1\n"); + ret = -EINVAL; + goto out; + } + + if (strcmp(device_path, "missing") == 0) { + struct list_head *devices; + struct btrfs_device *tmp; + + device = NULL; + devices = &root->fs_info->fs_devices->devices; + /* + * It is safe to read the devices since the volume_mutex + * is held. + */ + list_for_each_entry(tmp, devices, dev_list) { + if (tmp->in_fs_metadata && !tmp->bdev) { + device = tmp; + break; + } + } + bdev = NULL; + bh = NULL; + disk_super = NULL; + if (!device) { + printk(KERN_ERR "btrfs: no missing devices found to " + "remove\n"); + goto out; + } + } else { + bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + set_blocksize(bdev, 4096); + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EINVAL; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_uuid = disk_super->dev_item.uuid; + device = btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + } + + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { + printk(KERN_ERR "btrfs: unable to remove the only writeable " + "device\n"); + ret = -EINVAL; + goto error_brelse; + } + + if (device->writeable) { + lock_chunks(root); + list_del_init(&device->dev_alloc_list); + unlock_chunks(root); + root->fs_info->fs_devices->rw_devices--; + clear_super = true; + } + + ret = btrfs_shrink_device(device, 0); + if (ret) + goto error_undo; + + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + if (ret) + goto error_undo; + + device->in_fs_metadata = 0; + btrfs_scrub_cancel_dev(root, device); + + /* + * the device list mutex makes sure that we don't change + * the device list while someone else is writing out all + * the device supers. 
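 *
 * Editorial sketch (not part of the original patch) of the removal pattern
 * used just below: the writer takes device_list_mutex, unlinks the device
 * with list_del_rcu() and hands it to call_rcu(), so readers walking
 * fs_devices->devices under rcu_read_lock() keep seeing a valid struct
 * until the grace period ends.  free_device() (defined earlier in this
 * file) then defers the final blkdev_put()/kfree() to a workqueue, since
 * both may sleep and RCU callbacks must not:
 *
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	list_del_rcu(&device->dev_list);
 *	call_rcu(&device->rcu, free_device);
 *	mutex_unlock(&fs_devices->device_list_mutex);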
+ */ + + cur_devices = device->fs_devices; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_del_rcu(&device->dev_list); + + device->fs_devices->num_devices--; + + if (device->missing) + root->fs_info->fs_devices->missing_devices--; + + next_device = list_entry(root->fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (device->bdev == root->fs_info->sb->s_bdev) + root->fs_info->sb->s_bdev = next_device->bdev; + if (device->bdev == root->fs_info->fs_devices->latest_bdev) + root->fs_info->fs_devices->latest_bdev = next_device->bdev; + + if (device->bdev) + device->fs_devices->open_devices--; + + call_rcu(&device->rcu, free_device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + + if (cur_devices->open_devices == 0) { + struct btrfs_fs_devices *fs_devices; + fs_devices = root->fs_info->fs_devices; + while (fs_devices) { + if (fs_devices->seed == cur_devices) + break; + fs_devices = fs_devices->seed; + } + fs_devices->seed = cur_devices->seed; + cur_devices->seed = NULL; + lock_chunks(root); + __btrfs_close_devices(cur_devices); + unlock_chunks(root); + free_fs_devices(cur_devices); + } + + /* + * at this point, the device is zero sized. We want to + * remove it from the devices list and zero out the old super + */ + if (clear_super) { + /* make sure this device isn't detected as part of + * the FS anymore + */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } + + ret = 0; + +error_brelse: + brelse(bh); +error_close: + if (bdev) + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); +out: + mutex_unlock(&root->fs_info->volume_mutex); + mutex_unlock(&uuid_mutex); + return ret; +error_undo: + if (device->writeable) { + lock_chunks(root); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + unlock_chunks(root); + root->fs_info->fs_devices->rw_devices++; + } + goto error_brelse; +} + +/* + * does all the dirty work required for changing file system's UUID. 
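 *
 * Editorial summary (not part of the original patch): "sprouting" turns the
 * currently mounted seed filesystem into the seed of a new writable one.
 * The helper below moves every existing device onto a freshly allocated
 * btrfs_fs_devices, chains that copy via fs_devices->seed, generates a new
 * random fsid for the mounted filesystem, and clears
 * BTRFS_SUPER_FLAG_SEEDING in the in-memory super block so it is written
 * back out as a normal, writable filesystem.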
+ */ +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *old_devices; + struct btrfs_fs_devices *seed_devices; + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + BUG_ON(!mutex_is_locked(&uuid_mutex)); + if (!fs_devices->seeding) + return -EINVAL; + + seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!seed_devices) + return -ENOMEM; + + old_devices = clone_fs_devices(fs_devices); + if (IS_ERR(old_devices)) { + kfree(seed_devices); + return PTR_ERR(old_devices); + } + + list_add(&old_devices->list, &fs_uuids); + + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); + seed_devices->opened = 1; + INIT_LIST_HEAD(&seed_devices->devices); + INIT_LIST_HEAD(&seed_devices->alloc_list); + mutex_init(&seed_devices->device_list_mutex); + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, + synchronize_rcu); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); + list_for_each_entry(device, &seed_devices->devices, dev_list) { + device->fs_devices = seed_devices; + } + + fs_devices->seeding = 0; + fs_devices->num_devices = 0; + fs_devices->open_devices = 0; + fs_devices->seed = seed_devices; + + generate_random_uuid(fs_devices->fsid); + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + super_flags = btrfs_super_flags(disk_super) & + ~BTRFS_SUPER_FLAG_SEEDING; + btrfs_set_super_flags(disk_super, super_flags); + + return 0; +} + +/* + * strore the expected generation for seed devices in device items. 
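 *
 * (Editorial note, not part of the original patch: the helper below walks
 * every DEV_ITEM in the chunk tree and, for devices that still belong to a
 * seeding fs_devices, writes the in-memory device->generation into the
 * item, so the expected generation of each seed device ends up on disk.)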
+ */ +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dev_item *dev_item; + struct btrfs_device *device; + struct btrfs_key key; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + u64 devid; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root = root->fs_info->chunk_root; + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = BTRFS_DEV_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto error; + + leaf = path->nodes[0]; +next_slot: + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret > 0) + break; + if (ret < 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(path); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || + key.type != BTRFS_DEV_ITEM_KEY) + break; + + dev_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_item); + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + BUG_ON(!device); + + if (device->fs_devices->seeding) { + btrfs_set_device_generation(leaf, dev_item, + device->generation); + btrfs_mark_buffer_dirty(leaf); + } + + path->slots[0]++; + goto next_slot; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +{ + struct request_queue *q; + struct btrfs_trans_handle *trans; + struct btrfs_device *device; + struct block_device *bdev; + struct list_head *devices; + struct super_block *sb = root->fs_info->sb; + u64 total_bytes; + int seeding_dev = 0; + int ret = 0; + + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (root->fs_info->fs_devices->seeding) { + seeding_dev = 1; + down_write(&sb->s_umount); + mutex_lock(&uuid_mutex); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + mutex_lock(&root->fs_info->volume_mutex); + + devices = &root->fs_info->fs_devices->devices; + /* + * we have the volume lock, so we don't need the extra + * device list mutex while reading the list here. 
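 *
 * Editorial outline of the steps that follow (not part of the original
 * patch): reject a block device that is already on the list, allocate and
 * name the new btrfs_device, pick the next free devid, start a
 * transaction, and publish the device on the RCU-protected device list
 * while bumping the superblock byte/device totals.  On a seeding
 * filesystem the sprout is prepared first (btrfs_prepare_sprout) and
 * finished afterwards (btrfs_finish_sprout) before the transaction
 * commits.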
+ */ + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + ret = -ENOMEM; + goto error; + } + + device->name = kstrdup(device_path, GFP_NOFS); + if (!device->name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + + ret = find_next_devid(root, &device->devid); + if (ret) { + kfree(device->name); + kfree(device); + goto error; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + kfree(device->name); + kfree(device); + ret = PTR_ERR(trans); + goto error; + } + + lock_chunks(root); + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + spin_lock_init(&device->io_lock); + device->generation = trans->transid; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; + device->dev_root = root->fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->mode = FMODE_EXCL; + set_blocksize(device->bdev, 4096); + + if (seeding_dev) { + sb->s_flags &= ~MS_RDONLY; + ret = btrfs_prepare_sprout(trans, root); + BUG_ON(ret); + } + + device->fs_devices = root->fs_info->fs_devices; + + /* + * we don't want write_supers to jump in here with our device + * half setup + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->num_devices++; + root->fs_info->fs_devices->open_devices++; + root->fs_info->fs_devices->rw_devices++; + if (device->can_discard) + root->fs_info->fs_devices->num_can_discard++; + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + root->fs_info->fs_devices->rotating = 1; + + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes + device->total_bytes); + + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); + btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes + 1); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + if (seeding_dev) { + ret = init_first_rw_device(trans, root, device); + BUG_ON(ret); + ret = btrfs_finish_sprout(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_add_device(trans, root, device); + } + + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + + ret = btrfs_relocate_sys_chunks(root); + BUG_ON(ret); + } +out: + mutex_unlock(&root->fs_info->volume_mutex); + return ret; +error: + blkdev_put(bdev, FMODE_EXCL); + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + } + goto out; +} + +static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; 
+ struct btrfs_key key; + + root = device->dev_root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_mark_buffer_dirty(leaf); + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_super_block *super_copy = + &device->dev_root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = new_size - device->total_bytes; + + if (!device->writeable) + return -EACCES; + if (new_size <= device->total_bytes) + return -EINVAL; + + btrfs_set_super_total_bytes(super_copy, old_total + diff); + device->fs_devices->total_rw_bytes += diff; + + device->total_bytes = new_size; + device->disk_total_bytes = new_size; + btrfs_clear_space_info_full(device->dev_root->fs_info); + + return btrfs_update_device(trans, device); +} + +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + int ret; + lock_chunks(device->dev_root); + ret = __btrfs_grow_device(trans, device, new_size); + unlock_chunks(device->dev_root); + return ret; +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + root = root->fs_info->chunk_root; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = chunk_objectid; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, root, path); + + btrfs_free_path(path); + return ret; +} + +static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 + chunk_offset) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == chunk_objectid && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + 
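			/*
			 * Editorial worked example (sizes made up, not part
			 * of the original patch): with three packed
			 * (key, chunk) entries of 48, 80 and 64 bytes and the
			 * 80-byte middle entry matching, cur == 48 and
			 * len == 80, so the memmove above copies the trailing
			 * 64 bytes down to offset 48 and array_size drops
			 * from 192 to 112.  ptr/cur are deliberately not
			 * advanced in this branch, so the next iteration
			 * parses the entry that was just shifted into place.
			 */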
btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + return ret; +} + +static int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct extent_map_tree *em_tree; + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct map_lookup *map; + int ret; + int i; + + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + + ret = btrfs_can_relocate(extent_root, chunk_offset); + if (ret) + return -ENOSPC; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); + + lock_chunks(root); + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, + map->stripes[i].physical); + BUG_ON(ret); + + if (map->stripes[i].dev) { + ret = btrfs_update_device(trans, map->stripes[i].dev); + BUG_ON(ret); + } + } + ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, + chunk_offset); + + BUG_ON(ret); + + trace_btrfs_chunk_free(root, map, chunk_offset, em->len); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); + BUG_ON(ret); + } + + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; + + /* once for the tree */ + free_extent_map(em); + /* once for us */ + free_extent_map(em); + + unlock_chunks(root); + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_relocate_sys_chunks(struct btrfs_root *root) +{ + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + struct btrfs_key key; + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; + bool retried = false; + int failed = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + + ret = btrfs_previous_item(chunk_root, path, key.objectid, + key.type); + if (ret < 0) + goto error; + if (ret > 0) + break; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + chunk = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + btrfs_release_path(path); + + if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); + if (ret == -ENOSPC) + failed++; + else if (ret) + BUG(); + } + + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + ret = 0; + if (failed && !retried) { + failed = 0; + retried 
= true; + goto again; + } else if (failed && retried) { + WARN_ON(1); + ret = -ENOSPC; + } +error: + btrfs_free_path(path); + return ret; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +int btrfs_balance(struct btrfs_root *dev_root) +{ + int ret; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_device *device; + u64 old_size; + u64 size_to_free; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; + struct btrfs_trans_handle *trans; + struct btrfs_key found_key; + + if (dev_root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&dev_root->fs_info->volume_mutex); + dev_root = dev_root->fs_info->dev_root; + + /* step one make some room on all the devices */ + list_for_each_entry(device, devices, dev_list) { + old_size = device->total_bytes; + size_to_free = div_factor(old_size, 1); + size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + if (!device->writeable || + device->total_bytes - device->bytes_used > size_to_free) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); + if (ret == -ENOSPC) + break; + BUG_ON(ret); + + trans = btrfs_start_transaction(dev_root, 0); + BUG_ON(IS_ERR(trans)); + + ret = btrfs_grow_device(trans, device, old_size); + BUG_ON(ret); + + btrfs_end_transaction(trans, dev_root); + } + + /* step two, relocate all the chunks */ + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + break; + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid) + break; + + /* chunk zero is special */ + if (found_key.offset == 0) + break; + + btrfs_release_path(path); + ret = btrfs_relocate_chunk(chunk_root, + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); + if (ret && ret != -ENOSPC) + goto error; + key.offset = found_key.offset - 1; + } + ret = 0; +error: + btrfs_free_path(path); + mutex_unlock(&dev_root->fs_info->volume_mutex); + return ret; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. 
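 *
 * (Editorial note, not part of the original patch: the function first cuts
 * device->total_bytes down to new_size so the allocator stops handing out
 * space there, then walks DEV_EXTENT items from the highest offset down,
 * relocating the owning chunk of every extent that ends past new_size; it
 * retries the whole walk once and restores the old size, returning
 * -ENOSPC, if relocation still fails.)
 *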
+ * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + int failed = 0; + bool retried = false; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 old_size = device->total_bytes; + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + + lock_chunks(root); + + device->total_bytes = new_size; + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + unlock_chunks(root); + +again: + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto done; + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto done; + if (ret) { + ret = 0; + btrfs_release_path(path); + break; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) { + btrfs_release_path(path); + break; + } + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) { + btrfs_release_path(path); + break; + } + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(path); + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); + if (ret && ret != -ENOSPC) + goto done; + if (ret == -ENOSPC) + failed++; + key.offset -= 1; + } + + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + ret = -ENOSPC; + lock_chunks(root); + + device->total_bytes = old_size; + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + unlock_chunks(root); + goto done; + } + + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto done; + } + + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. 
*/ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); +done: + btrfs_free_path(path); + return ret; +} + +static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + return 0; +} + +/* + * sort the devices in descending order by max_avail, total_avail + */ +static int btrfs_cmp_device_info(const void *a, const void *b) +{ + const struct btrfs_device_info *di_a = a; + const struct btrfs_device_info *di_b = b; + + if (di_a->max_avail > di_b->max_avail) + return -1; + if (di_a->max_avail < di_b->max_avail) + return 1; + if (di_a->total_avail > di_b->total_avail) + return -1; + if (di_a->total_avail < di_b->total_avail) + return 1; + return 0; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes_out, u64 *stripe_size_out, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_device_info *devices_info = NULL; + u64 total_avail; + int num_stripes; /* total number of stripes to allocate */ + int sub_stripes; /* sub_stripes info for map */ + int dev_stripes; /* stripes per dev */ + int devs_max; /* max devs to use */ + int devs_min; /* min devs needed */ + int devs_increment; /* ndevs has to be a multiple of this */ + int ncopies; /* how many copies to data has */ + int ret; + u64 max_stripe_size; + u64 max_chunk_size; + u64 stripe_size; + u64 num_bytes; + int ndevs; + int i; + int j; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + sub_stripes = 1; + dev_stripes = 1; + devs_increment = 1; + ncopies = 1; + devs_max = 0; /* 0 == as many as possible */ + devs_min = 1; + + /* + * define the properties of each RAID type. 
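 *
 * For reference, the values the branches below end up with (editorial
 * summary, not part of the original patch; devs_max == 0 means "as many
 * devices as possible"):
 *
 *	profile  dev_stripes  devs_min  devs_max  devs_increment  ncopies  sub_stripes
 *	single        1           1         1           1            1          1
 *	DUP           2           1         1           1            2          1
 *	RAID0         1           2         0           1            1          1
 *	RAID1         1           2         2           2            2          1
 *	RAID10        1           4         0           2            2          2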
+ * FIXME: move this to a global table and use it in all RAID + * calculation code + */ + if (type & (BTRFS_BLOCK_GROUP_DUP)) { + dev_stripes = 2; + ncopies = 2; + devs_max = 1; + } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { + devs_min = 2; + } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { + devs_increment = 2; + ncopies = 2; + devs_max = 2; + devs_min = 2; + } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + sub_stripes = 2; + devs_increment = 2; + ncopies = 2; + devs_min = 4; + } else { + devs_max = 1; + } + + if (type & BTRFS_BLOCK_GROUP_DATA) { + max_stripe_size = 1024 * 1024 * 1024; + max_chunk_size = 10 * max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + max_stripe_size = 256 * 1024 * 1024; + max_chunk_size = max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + max_stripe_size = 8 * 1024 * 1024; + max_chunk_size = 2 * max_stripe_size; + } else { + printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", + type); + BUG_ON(1); + } + + /* we don't want a chunk larger than 10% of writeable space */ + max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + max_chunk_size); + + devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + cur = fs_devices->alloc_list.next; + + /* + * in the first pass through the devices list, we gather information + * about the available holes on each device. + */ + ndevs = 0; + while (cur != &fs_devices->alloc_list) { + struct btrfs_device *device; + u64 max_avail; + u64 dev_offset; + + device = list_entry(cur, struct btrfs_device, dev_alloc_list); + + cur = cur->next; + + if (!device->writeable) { + printk(KERN_ERR + "btrfs: read-only device in alloc_list\n"); + WARN_ON(1); + continue; + } + + if (!device->in_fs_metadata) + continue; + + if (device->total_bytes > device->bytes_used) + total_avail = device->total_bytes - device->bytes_used; + else + total_avail = 0; + /* avail is off by max(alloc_start, 1MB), but that is the same + * for all devices, so it doesn't hurt the sorting later on + */ + + ret = find_free_dev_extent(trans, device, + max_stripe_size * dev_stripes, + &dev_offset, &max_avail); + if (ret && ret != -ENOSPC) + goto error; + + if (ret == 0) + max_avail = max_stripe_size * dev_stripes; + + if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) + continue; + + devices_info[ndevs].dev_offset = dev_offset; + devices_info[ndevs].max_avail = max_avail; + devices_info[ndevs].total_avail = total_avail; + devices_info[ndevs].dev = device; + ++ndevs; + } + + /* + * now sort the devices by hole size / available space + */ + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + + /* round down to number of usable stripes */ + ndevs -= ndevs % devs_increment; + + if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { + ret = -ENOSPC; + goto error; + } + + if (devs_max && ndevs > devs_max) + ndevs = devs_max; + /* + * the primary goal is to maximize the number of stripes, so use as many + * devices as possible, even if the stripes are not maximum sized. 
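 *
 * Editorial worked example (numbers made up, not part of the original
 * patch): a RAID1 data chunk with ndevs == 2, dev_stripes == 1 and
 * ncopies == 2, where both devices report max_avail == 1GiB (the data
 * stripe cap), gives stripe_size = 1GiB and num_stripes = 2.  The
 * max_chunk_size clamp does not fire (2GiB used vs. 20GiB allowed),
 * rounding to BTRFS_STRIPE_LEN leaves 1GiB unchanged, and the result is
 * num_bytes = 1GiB * (2 / 2) = 1GiB of logical chunk space backed by a
 * 1GiB device extent on each of the two devices.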
+ */ + stripe_size = devices_info[ndevs-1].max_avail; + num_stripes = ndevs * dev_stripes; + + if (stripe_size * num_stripes > max_chunk_size * ncopies) { + stripe_size = max_chunk_size * ncopies; + do_div(stripe_size, num_stripes); + } + + do_div(stripe_size, dev_stripes); + do_div(stripe_size, BTRFS_STRIPE_LEN); + stripe_size *= BTRFS_STRIPE_LEN; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; + + for (i = 0; i < ndevs; ++i) { + for (j = 0; j < dev_stripes; ++j) { + int s = i * dev_stripes + j; + map->stripes[s].dev = devices_info[i].dev; + map->stripes[s].physical = devices_info[i].dev_offset + + j * stripe_size; + } + } + map->sector_size = extent_root->sectorsize; + map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; + map->type = type; + map->sub_stripes = sub_stripes; + + *map_ret = map; + num_bytes = stripe_size * (num_stripes / ncopies); + + *stripe_size_out = stripe_size; + *num_bytes_out = num_bytes; + + trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto error; + } + em->bdev = (struct block_device *)map; + em->start = start; + em->len = num_bytes; + em->block_start = 0; + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, num_bytes); + BUG_ON(ret); + + for (i = 0; i < map->num_stripes; ++i) { + struct btrfs_device *device; + u64 dev_offset; + + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + + ret = btrfs_alloc_dev_extent(trans, device, + info->chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, dev_offset, stripe_size); + BUG_ON(ret); + } + + kfree(devices_info); + return 0; + +error: + kfree(map); + kfree(devices_info); + return ret; +} + +static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup *map, u64 chunk_offset, + u64 chunk_size, u64 stripe_size) +{ + u64 dev_offset; + struct btrfs_key key; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_device *device; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + size_t item_size = btrfs_chunk_item_size(map->num_stripes); + int index = 0; + int ret; + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) + return -ENOMEM; + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + device->bytes_used += stripe_size; + ret = btrfs_update_device(trans, device); + BUG_ON(ret); + index++; + } + + index = 0; + stripe = &chunk->stripe; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + index++; + } + + btrfs_set_stack_chunk_length(chunk, chunk_size); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); + 
btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + item_size); + BUG_ON(ret); + } + + kfree(chunk); + return 0; +} + +/* + * Chunk allocation falls into two parts. The first part does works + * that make the new allocated chunk useable, but not do any operation + * that modifies the chunk tree. The second part does the works that + * require modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type) +{ + u64 chunk_offset; + u64 chunk_size; + u64 stripe_size; + struct map_lookup *map; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + int ret; + + ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, + &chunk_offset); + if (ret) + return ret; + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, type); + if (ret) + return ret; + + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + return 0; +} + +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + u64 chunk_offset; + u64 sys_chunk_offset; + u64 chunk_size; + u64 sys_chunk_size; + u64 stripe_size; + u64 sys_stripe_size; + u64 alloc_profile; + struct map_lookup *map; + struct map_lookup *sys_map; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + + ret = find_next_chunk(fs_info->chunk_root, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); + BUG_ON(ret); + + alloc_profile = BTRFS_BLOCK_GROUP_METADATA | + (fs_info->metadata_alloc_profile & + fs_info->avail_metadata_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, alloc_profile); + BUG_ON(ret); + + sys_chunk_offset = chunk_offset + chunk_size; + + alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | + (fs_info->system_alloc_profile & + fs_info->avail_system_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, + &sys_chunk_size, &sys_stripe_size, + sys_chunk_offset, alloc_profile); + BUG_ON(ret); + + ret = btrfs_add_device(trans, fs_info->chunk_root, device); + BUG_ON(ret); + + /* + * Modifying chunk tree needs allocating new blocks from both + * system block group and metadata block group. So we only can + * do operations require modifying the chunk tree after both + * block groups were created. 
+ */ + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + + ret = __finish_chunk_alloc(trans, extent_root, sys_map, + sys_chunk_offset, sys_chunk_size, + sys_stripe_size); + BUG_ON(ret); + return 0; +} + +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int readonly = 0; + int i; + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + read_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + + if (btrfs_test_opt(root, DEGRADED)) { + free_extent_map(em); + return 0; + } + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (!map->stripes[i].dev->writeable) { + readonly = 1; + break; + } + } + free_extent_map(em); + return readonly; +} + +void btrfs_mapping_init(struct btrfs_mapping_tree *tree) +{ + extent_map_tree_init(&tree->map_tree); +} + +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +{ + struct extent_map *em; + + while (1) { + write_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); + write_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + ret = map->num_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID10) + ret = map->sub_stripes; + else + ret = 1; + free_extent_map(em); + return ret; +} + +static int find_live_mirror(struct map_lookup *map, int first, int num, + int optimal) +{ + int i; + if (map->stripes[optimal].dev->bdev) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev) + return i; + } + /* we couldn't find one that doesn't fail. 
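btrfs_num_copies() above boils down to a small decision on the block group type flags: mirrored layouts report one copy per stripe (or per sub-stripe group for RAID10), everything else reports a single copy. The same logic restated on its own (the flag values below are placeholders, not the on-disk bits):

#include <stdint.h>

#define SKETCH_BG_DUP    (1ULL << 0)    /* placeholder flag bits */
#define SKETCH_BG_RAID1  (1ULL << 1)
#define SKETCH_BG_RAID10 (1ULL << 2)

static int sketch_num_copies(uint64_t type, int num_stripes, int sub_stripes)
{
        if (type & (SKETCH_BG_DUP | SKETCH_BG_RAID1))
                return num_stripes;     /* every stripe holds a full copy */
        if (type & SKETCH_BG_RAID10)
                return sub_stripes;     /* copies within each stripe group */
        return 1;                       /* single and RAID0: one copy */
}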
Just return something + * and the io error handling code will clean up eventually + */ + return optimal; +} + +static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, + int mirror_num) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + u64 offset; + u64 stripe_offset; + u64 stripe_end_offset; + u64 stripe_nr; + u64 stripe_nr_orig; + u64 stripe_nr_end; + int stripes_allocated = 8; + int stripes_required = 1; + int stripe_index; + int i; + int num_stripes; + int max_errors = 0; + struct btrfs_multi_bio *multi = NULL; + + if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + stripes_allocated = 1; +again: + if (multi_ret) { + multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + GFP_NOFS); + if (!multi) + return -ENOMEM; + + atomic_set(&multi->error, 0); + } + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); + read_unlock(&em_tree->lock); + + if (!em) { + printk(KERN_CRIT "unable to find logical %llu len %llu\n", + (unsigned long long)logical, + (unsigned long long)*length); + BUG(); + } + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + offset = logical - em->start; + + if (mirror_num > map->num_stripes) + mirror_num = 0; + + /* if our multi bio struct is too small, back off and try again */ + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + stripes_required = map->num_stripes; + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripes_required = map->sub_stripes; + max_errors = 1; + } + } + if (rw & REQ_DISCARD) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) { + stripes_required = map->num_stripes; + } + } + if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + stripes_allocated < stripes_required) { + stripes_allocated = map->num_stripes; + free_extent_map(em); + kfree(multi); + goto again; + } + stripe_nr = offset; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + do_div(stripe_nr, map->stripe_len); + + stripe_offset = stripe_nr * map->stripe_len; + BUG_ON(offset < stripe_offset); + + /* stripe_offset is the offset of this block in its stripe*/ + stripe_offset = offset - stripe_offset; + + if (rw & REQ_DISCARD) + *length = min_t(u64, em->len - offset, *length); + else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + /* we limit the length of each bio to what fits in a stripe */ + *length = min_t(u64, em->len - offset, + map->stripe_len - stripe_offset); + } else { + *length = em->len - offset; + } + + if (!multi_ret) + goto out; + + num_stripes = 1; + stripe_index = 0; + stripe_nr_orig = stripe_nr; + stripe_nr_end = (offset + *length + map->stripe_len - 1) & + (~(map->stripe_len - 1)); + do_div(stripe_nr_end, map->stripe_len); + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + *length); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->num_stripes, + stripe_nr_end - stripe_nr_orig); + stripe_index = do_div(stripe_nr, map->num_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (rw & (REQ_WRITE | REQ_DISCARD)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = 
mirror_num - 1; + else { + stripe_index = find_live_mirror(map, 0, + map->num_stripes, + current->pid % map->num_stripes); + } + + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + if (rw & (REQ_WRITE | REQ_DISCARD)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + + stripe_index = do_div(stripe_nr, factor); + stripe_index *= map->sub_stripes; + + if (rw & REQ_WRITE) + num_stripes = map->sub_stripes; + else if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->sub_stripes * + (stripe_nr_end - stripe_nr_orig), + map->num_stripes); + else if (mirror_num) + stripe_index += mirror_num - 1; + else { + stripe_index = find_live_mirror(map, stripe_index, + map->sub_stripes, stripe_index + + current->pid % map->sub_stripes); + } + } else { + /* + * after this do_div call, stripe_nr is the number of stripes + * on this device we have to walk to find the data, and + * stripe_index is the number of our device in the stripe array + */ + stripe_index = do_div(stripe_nr, map->num_stripes); + } + BUG_ON(stripe_index >= map->num_stripes); + + if (rw & REQ_DISCARD) { + for (i = 0; i < num_stripes; i++) { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + u64 stripes; + u32 last_stripe = 0; + int j; + + div_u64_rem(stripe_nr_end - 1, + map->num_stripes, + &last_stripe); + + for (j = 0; j < map->num_stripes; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + map->num_stripes, &test); + if (test == stripe_index) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, map->num_stripes); + multi->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i == 0) { + multi->stripes[i].length -= + stripe_offset; + stripe_offset = 0; + } + if (stripe_index == last_stripe) + multi->stripes[i].length -= + stripe_end_offset; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u64 stripes; + int j; + int factor = map->num_stripes / + map->sub_stripes; + u32 last_stripe = 0; + + div_u64_rem(stripe_nr_end - 1, + factor, &last_stripe); + last_stripe *= map->sub_stripes; + + for (j = 0; j < factor; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + factor, &test); + + if (test == + stripe_index / map->sub_stripes) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, factor); + multi->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i < map->sub_stripes) { + multi->stripes[i].length -= + stripe_offset; + if (i == map->sub_stripes - 1) + stripe_offset = 0; + } + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + map->sub_stripes - 1)) { + multi->stripes[i].length -= + stripe_end_offset; + } + } else + multi->stripes[i].length = *length; + + stripe_index++; + if (stripe_index == map->num_stripes) { + /* This could only happen for RAID0/10 */ + stripe_index = 0; + stripe_nr++; + } + } + } else { + for (i = 0; i < num_stripes; i++) { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + multi->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; + } + } + if (multi_ret) { + *multi_ret = multi; + multi->num_stripes = num_stripes; + multi->max_errors = max_errors; + } +out: + free_extent_map(em); + return 0; +} + +int 
btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num) +{ + return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + mirror_num); +} + +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map_tree *em_tree = &map_tree->map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 length; + u64 stripe_nr; + int i, j, nr = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); + read_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; + + length = em->len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + do_div(length, map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + do_div(length, map->num_stripes); + + buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + BUG_ON(!buf); + + for (i = 0; i < map->num_stripes; i++) { + if (devid && map->stripes[i].dev->devid != devid) + continue; + if (map->stripes[i].physical > physical || + map->stripes[i].physical + length <= physical) + continue; + + stripe_nr = physical - map->stripes[i].physical; + do_div(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + do_div(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } + bytenr = chunk_start + stripe_nr * map->stripe_len; + WARN_ON(nr >= map->num_stripes); + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) + break; + } + if (j == nr) { + WARN_ON(nr >= map->num_stripes); + buf[nr++] = bytenr; + } + } + + *logical = buf; + *naddrs = nr; + *stripe_len = map->stripe_len; + + free_extent_map(em); + return 0; +} + +static void end_bio_multi_stripe(struct bio *bio, int err) +{ + struct btrfs_multi_bio *multi = bio->bi_private; + int is_orig_bio = 0; + + if (err) + atomic_inc(&multi->error); + + if (bio == multi->orig_bio) + is_orig_bio = 1; + + if (atomic_dec_and_test(&multi->stripes_pending)) { + if (!is_orig_bio) { + bio_put(bio); + bio = multi->orig_bio; + } + bio->bi_private = multi->private; + bio->bi_end_io = multi->end_io; + /* only send an error to the higher layers if it is + * beyond the tolerance of the multi-bio + */ + if (atomic_read(&multi->error) > multi->max_errors) { + err = -EIO; + } else if (err) { + /* + * this bio is actually up to date, we didn't + * go over the max number of errors + */ + set_bit(BIO_UPTODATE, &bio->bi_flags); + err = 0; + } + kfree(multi); + + bio_endio(bio, err); + } else if (!is_orig_bio) { + bio_put(bio); + } +} + +struct async_sched { + struct bio *bio; + int rw; + struct btrfs_fs_info *info; + struct btrfs_work work; +}; + +/* + * see run_scheduled_bios for a description of why bios are collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. 
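The heart of __btrfs_map_block() above is integer stripe arithmetic: the chunk-relative offset is divided into a stripe number, the remainder across the device count picks the stripe index (the device), and what is left over is the offset inside that stripe; do_div() performs the division in place and hands back the remainder. A small userspace example of the RAID0 case with plain division:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_STRIPE_LEN (64 * 1024)

int main(void)
{
        uint64_t offset = 300 * 1024;   /* logical offset inside the chunk */
        int num_stripes = 3;            /* devices in the RAID0 chunk */

        uint64_t stripe_nr = offset / SKETCH_STRIPE_LEN;            /* 4 */
        uint64_t stripe_offset = offset - stripe_nr * SKETCH_STRIPE_LEN;
        int stripe_index = stripe_nr % num_stripes;         /* device 1 */
        uint64_t dev_stripe = stripe_nr / num_stripes;  /* its 2nd stripe */

        printf("device %d, stripe %llu, offset %llu within the stripe\n",
               stripe_index, (unsigned long long)dev_stripe,
               (unsigned long long)stripe_offset);
        return 0;
}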
+ */ +static noinline int schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + struct btrfs_pending_bios *pending_bios; + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & REQ_WRITE)) { + bio_get(bio); + submit_bio(rw, bio); + bio_put(bio); + return 0; + } + + /* + * nr_async_bios allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_bios); + WARN_ON(bio->bi_next); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + if (bio->bi_rw & REQ_SYNC) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; + + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; + + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_worker(&root->fs_info->submit_workers, + &device->work); + return 0; +} + +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit) +{ + struct btrfs_mapping_tree *map_tree; + struct btrfs_device *dev; + struct bio *first_bio = bio; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + struct btrfs_multi_bio *multi = NULL; + int ret; + int dev_nr = 0; + int total_devs = 1; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + mirror_num); + BUG_ON(ret); + + total_devs = multi->num_stripes; + if (map_length < length) { + printk(KERN_CRIT "mapping failed logical %llu bio len %llu " + "len %llu\n", (unsigned long long)logical, + (unsigned long long)length, + (unsigned long long)map_length); + BUG(); + } + multi->end_io = first_bio->bi_end_io; + multi->private = first_bio->bi_private; + multi->orig_bio = first_bio; + atomic_set(&multi->stripes_pending, multi->num_stripes); + + while (dev_nr < total_devs) { + if (total_devs > 1) { + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; + } + bio->bi_private = multi; + bio->bi_end_io = end_bio_multi_stripe; + } + bio->bi_sector = multi->stripes[dev_nr].physical >> 9; + dev = multi->stripes[dev_nr].dev; + if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { + bio->bi_bdev = dev->bdev; + if (async_submit) + schedule_bio(root, dev, rw, bio); + else + submit_bio(rw, bio); + } else { + bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; + bio->bi_sector = logical >> 9; + bio_endio(bio, -EIO); + } + dev_nr++; + } + if (total_devs == 1) + kfree(multi); + return 0; +} + +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *cur_devices; + + cur_devices = root->fs_info->fs_devices; + while (cur_devices) { + if (!fsid || + !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + device = __find_device(&cur_devices->devices, + devid, uuid); + if (device) + return device; + } + cur_devices = cur_devices->seed; + } + return NULL; +} + +static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices 
*fs_devices = root->fs_info->fs_devices; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + return NULL; + list_add(&device->dev_list, + &fs_devices->devices); + device->dev_root = root->fs_info->dev_root; + device->devid = devid; + device->work.func = pending_bios_fn; + device->fs_devices = fs_devices; + device->missing = 1; + fs_devices->num_devices++; + fs_devices->missing_devices++; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_alloc_list); + memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); + read_unlock(&map_tree->map_tree.lock); + + /* already mapped? */ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + return 0; + } else if (em) { + free_extent_map(em); + } + + em = alloc_extent_map(); + if (!em) + return -ENOMEM; + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + em->bdev = (struct block_device *)map; + em->start = logical; + em->len = length; + em->block_start = 0; + em->block_len = em->len; + + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); + map->sector_size = btrfs_chunk_sector_size(leaf, chunk); + map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = btrfs_chunk_type(leaf, chunk); + map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + for (i = 0; i < num_stripes; i++) { + map->stripes[i].physical = + btrfs_stripe_offset_nr(leaf, chunk, i); + devid = btrfs_stripe_devid_nr(leaf, chunk, i); + read_extent_buffer(leaf, uuid, (unsigned long) + btrfs_stripe_dev_uuid_nr(chunk, i), + BTRFS_UUID_SIZE); + map->stripes[i].dev = btrfs_find_device(root, devid, uuid, + NULL); + if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + kfree(map); + free_extent_map(em); + return -EIO; + } + if (!map->stripes[i].dev) { + map->stripes[i].dev = + add_missing_dev(root, devid, uuid); + if (!map->stripes[i].dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + } + map->stripes[i].dev->in_fs_metadata = 1; + } + + write_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); + write_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + + return 0; +} + +static int fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + + device->devid = btrfs_device_id(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + + 
ptr = (unsigned long)btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + + return 0; +} + +static int open_seed_devices(struct btrfs_root *root, u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + int ret; + + mutex_lock(&uuid_mutex); + + fs_devices = root->fs_info->fs_devices->seed; + while (fs_devices) { + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + ret = 0; + goto out; + } + fs_devices = fs_devices->seed; + } + + fs_devices = find_fsid(fsid); + if (!fs_devices) { + ret = -ENOENT; + goto out; + } + + fs_devices = clone_fs_devices(fs_devices); + if (IS_ERR(fs_devices)) { + ret = PTR_ERR(fs_devices); + goto out; + } + + ret = __btrfs_open_devices(fs_devices, FMODE_READ, + root->fs_info->bdev_holder); + if (ret) + goto out; + + if (!fs_devices->seeding) { + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + ret = -EINVAL; + goto out; + } + + fs_devices->seed = root->fs_info->fs_devices->seed; + root->fs_info->fs_devices->seed = fs_devices; +out: + mutex_unlock(&uuid_mutex); + return ret; +} + +static int read_one_dev(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + struct btrfs_device *device; + u64 devid; + int ret; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + + if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { + ret = open_seed_devices(root, fs_uuid); + if (ret && !btrfs_test_opt(root, DEGRADED)) + return ret; + } + + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + if (!device || !device->bdev) { + if (!btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if (!device) { + printk(KERN_WARNING "warning devid %llu missing\n", + (unsigned long long)devid); + device = add_missing_dev(root, devid, dev_uuid); + if (!device) + return -ENOMEM; + } else if (!device->missing) { + /* + * this happens when a device that was properly setup + * in the device info lists suddenly goes bad. 
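btrfs_find_device() and open_seed_devices() above both walk the same singly linked chain of fs_devices structures, one entry per seed filesystem, comparing the 16-byte fsid at each hop. The traversal pattern reduced to a standalone sketch (struct fields simplified):

#include <stdint.h>
#include <string.h>

struct sketch_fs_devices {
        uint8_t fsid[16];
        struct sketch_fs_devices *seed; /* next (older) seed filesystem */
};

/* return the entry whose fsid matches, or NULL if the chain has none */
static struct sketch_fs_devices *
sketch_find_fsid(struct sketch_fs_devices *cur, const uint8_t *fsid)
{
        while (cur) {
                if (!memcmp(cur->fsid, fsid, 16))
                        return cur;
                cur = cur->seed;
        }
        return NULL;
}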
+ * device->bdev is NULL, and so we have to set + * device->missing to one here + */ + root->fs_info->fs_devices->missing_devices++; + device->missing = 1; + } + } + + if (device->fs_devices != root->fs_info->fs_devices) { + BUG_ON(device->writeable); + if (device->generation != + btrfs_device_generation(leaf, dev_item)) + return -EINVAL; + } + + fill_device_from_item(leaf, dev_item, device); + device->dev_root = root->fs_info->dev_root; + device->in_fs_metadata = 1; + if (device->writeable) + device->fs_devices->total_rw_bytes += device->total_bytes; + ret = 0; + return ret; +} + +int btrfs_read_sys_array(struct btrfs_root *root) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct extent_buffer *sb; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + unsigned long sb_ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, + BTRFS_SUPER_INFO_SIZE); + if (!sb) + return -ENOMEM; + btrfs_set_buffer_uptodate(sb); + btrfs_set_buffer_lockdep_class(sb, 0); + + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); ptr += len; + sb_ptr += len; + cur += len; + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)sb_ptr; + ret = read_one_chunk(root, &key, sb, chunk); + if (ret) + break; + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + ptr += len; + sb_ptr += len; + cur += len; + } + free_extent_buffer(sb); + return ret; +} + +int btrfs_read_chunk_tree(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first we search for all of the device items, and then we + * read in all of the chunk items. 
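btrfs_read_sys_array() above parses a packed byte array out of the superblock: each record is a fixed-size disk key immediately followed by a variable-size chunk item, so the cursor advances by the key size and then by a per-record item size. A sketch of that kind of cursor walk over a generic packed buffer (the record layout here is simplified, not the btrfs format):

#include <stddef.h>
#include <stdint.h>

/* records of the form: [fixed header][payload of hdr->len bytes] */
struct sketch_rec_hdr {
        uint32_t type;
        uint32_t len;           /* payload length following the header */
};

static void sketch_walk_packed(const uint8_t *buf, size_t size,
                               void (*cb)(const struct sketch_rec_hdr *hdr,
                                          const uint8_t *payload))
{
        size_t cur = 0;

        while (cur + sizeof(struct sketch_rec_hdr) <= size) {
                const struct sketch_rec_hdr *hdr =
                        (const struct sketch_rec_hdr *)(buf + cur);

                cur += sizeof(*hdr);
                if (cur + hdr->len > size)
                        break;          /* truncated record, stop */
                cb(hdr, buf + cur);
                cur += hdr->len;        /* step over the payload */
        }
}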
This way we can create chunk + * mappings that reference all of the devices that are afound + */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) + break; + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; + } + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(root, &found_key, leaf, chunk); + if (ret) + goto error; + } + path->slots[0]++; + } + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + key.objectid = 0; + btrfs_release_path(path); + goto again; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 00000000..6d866db4 --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ + +#include +#include +#include "async-thread.h" + +#define BTRFS_STRIPE_LEN (64 * 1024) + +struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + +struct btrfs_device { + struct list_head dev_list; + struct list_head dev_alloc_list; + struct btrfs_fs_devices *fs_devices; + struct btrfs_root *dev_root; + + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + + int running_pending; + u64 generation; + + int writeable; + int in_fs_metadata; + int missing; + int can_discard; + + spinlock_t io_lock; + + struct block_device *bdev; + + /* the mode sent to blkdev_get */ + fmode_t mode; + + char *name; + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device */ + u64 total_bytes; + + /* size of the disk */ + u64 disk_total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + + /* minimal io size for this device */ + u32 sector_size; + + /* type and info about this device */ + u64 type; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* per-device scrub information */ + struct scrub_dev *scrub_device; + + struct btrfs_work work; + struct rcu_head rcu; + struct work_struct rcu_work; +}; + +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* the device with this id has the most recent copy of the super */ + u64 latest_devid; + u64 latest_trans; + u64 num_devices; + u64 open_devices; + u64 rw_devices; + u64 missing_devices; + u64 total_rw_bytes; + u64 num_can_discard; + struct block_device *latest_bdev; + + /* all of the devices in the FS, protected by a mutex + * so we can safely walk it to write out the supers without + * worrying about add/remove by the multi-device code + */ + struct mutex device_list_mutex; + struct list_head devices; + + /* devices not currently being allocated */ + struct list_head alloc_list; + struct list_head list; + + struct btrfs_fs_devices *seed; + int seeding; + + int opened; + + /* set when we find or add a device that doesn't have the + * nonrot flag set + */ + int rotating; +}; + +struct btrfs_bio_stripe { + struct btrfs_device *dev; + u64 physical; + u64 length; /* only used for discard mappings */ +}; + +struct btrfs_multi_bio { + atomic_t stripes_pending; + bio_end_io_t *end_io; + struct bio *orig_bio; + void *private; + atomic_t error; + int max_errors; + int num_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; + u64 total_avail; +}; + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length); + +#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, 
+ struct btrfs_multi_bio **multi_ret, int mirror_num); +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder); +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret); +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_balance(struct btrfs_root *dev_root); +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *max_avail); +#endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 00000000..5366fe45 --- /dev/null +++ b/fs/btrfs/xattr.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "xattr.h" +#include "disk-io.h" + + +ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret = 0; + unsigned long data_ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* lookup the xattr by name */ + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, + strlen(name), 0); + if (!di) { + ret = -ENODATA; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + /* if size is 0, that means we want the size of the attr */ + if (!size) { + ret = btrfs_dir_data_len(leaf, di); + goto out; + } + + /* now get the data out of our dir_item */ + if (btrfs_dir_data_len(leaf, di) > size) { + ret = -ERANGE; + goto out; + } + + /* + * The way things are packed into the leaf is like this + * |struct btrfs_dir_item|name|data| + * where name is the xattr name, so security.foo, and data is the + * content of the xattr. data_ptr points to the location in memory + * where the data starts in the in memory leaf + */ + data_ptr = (unsigned long)((char *)(di + 1) + + btrfs_dir_name_len(leaf, di)); + read_extent_buffer(leaf, buffer, data_ptr, + btrfs_dir_data_len(leaf, di)); + ret = btrfs_dir_data_len(leaf, di); + +out: + btrfs_free_path(path); + return ret; +} + +static int do_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + size_t name_len = strlen(name); + int ret = 0; + + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) + return -ENOSPC; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first lets see if we already have this xattr */ + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, + strlen(name), -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + /* ok we already have this xattr, lets remove it */ + if (di) { + /* if we want create only exit */ + if (flags & XATTR_CREATE) { + ret = -EEXIST; + goto out; + } + + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(path); + + /* if we don't have a value then we are removing the xattr */ + if (!value) + goto out; + } else { + btrfs_release_path(path); + + if (flags & XATTR_REPLACE) { + /* we couldn't find the attr to replace */ + ret = -ENODATA; + goto out; + } + } + + /* ok we have to create a completely new xattr */ + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (trans) + return do_setxattr(trans, inode, name, value, size, flags); + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = do_setxattr(trans, inode, name, value, size, flags); + if (ret) + goto out; + + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); +out: + btrfs_end_transaction_throttle(trans, 
root); + return ret; +} + +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct btrfs_key key, found_key; + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + int ret = 0, slot; + size_t total_size = 0, size_left = size; + unsigned long name_ptr; + size_t name_len; + + /* + * ok we want all objects associated with this id. + * NOTE: we set key.offset = 0; because we want to start with the + * first xattr that we find and walk forward + */ + key.objectid = btrfs_ino(inode); + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + /* search for our xattrs */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + /* this is where we start walking through the path */ + if (slot >= btrfs_header_nritems(leaf)) { + /* + * if we've reached the last slot in this leaf we need + * to go to the next leaf and reset everything + */ + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + break; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + if (verify_dir_item(root, leaf, di)) + continue; + + name_len = btrfs_dir_name_len(leaf, di); + total_size += name_len + 1; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + goto next; + + if (!buffer || (name_len + 1) > size_left) { + ret = -ERANGE; + goto err; + } + + name_ptr = (unsigned long)(di + 1); + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; + + size_left -= name_len + 1; + buffer += name_len + 1; +next: + path->slots[0]++; + } + ret = total_size; + +err: + btrfs_free_path(path); + + return ret; +} + +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +const struct xattr_handler *btrfs_xattr_handlers[] = { +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, +#endif + NULL, +}; + +/* + * Check if the attribute is in a supported namespace. + * + * This applied after the check for the synthetic attributes in the system + * namespace. + */ +static bool btrfs_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. 
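btrfs_listxattr() above implements the usual listxattr contract: called with a zero-size buffer it only reports how many bytes the NUL-terminated names would need, otherwise it copies each name and returns -ERANGE when the caller's buffer is too small. The caller's side of that two-pass convention, as a userspace sketch using the standard listxattr(2) call (error handling trimmed):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : ".";

        /* first pass: ask how large the name list is */
        ssize_t len = listxattr(path, NULL, 0);
        if (len <= 0)
                return 0;               /* no xattrs, or not supported */

        /* second pass: fetch the concatenated "name\0name\0..." list */
        char *buf = malloc(len);
        len = listxattr(path, buf, len);
        for (ssize_t off = 0; off < len; off += strlen(buf + off) + 1)
                printf("%s\n", buf + off);

        free(buf);
        return 0;
}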
+ */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_getxattr(dentry->d_inode, name, buffer, size); +} + +int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + + return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + flags); +} + +int btrfs_removexattr(struct dentry *dentry, const char *name) +{ + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + XATTR_REPLACE); +} + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *suffix; + char *name; + + err = security_inode_init_security(inode, dir, qstr, &suffix, &value, + &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, + GFP_NOFS); + if (!name) { + err = -ENOMEM; + } else { + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); + err = __btrfs_setxattr(trans, inode, name, value, len, 0); + kfree(name); + } + + kfree(suffix); + kfree(value); + return err; +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h new file mode 100644 index 00000000..b3cc8039 --- /dev/null +++ b/fs/btrfs/xattr.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __XATTR__ +#define __XATTR__ + +#include + +extern const struct xattr_handler btrfs_xattr_acl_access_handler; +extern const struct xattr_handler btrfs_xattr_acl_default_handler; +extern const struct xattr_handler *btrfs_xattr_handlers[]; + +extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags); +extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +extern int btrfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern int btrfs_removexattr(struct dentry *dentry, const char *name); + +extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr); + +#endif /* __XATTR__ */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 00000000..faccd47c --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,399 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. 
+ * Created by David Woodhouse + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" + +struct workspace { + z_stream inf_strm; + z_stream def_strm; + char *buf; + struct list_head list; +}; + +static void zlib_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zlib_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( + MAX_WBITS, MAX_MEM_LEVEL)); + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->def_strm.workspace || + !workspace->inf_strm.workspace || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static int zlib_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + ret = -1; + goto out; + } + + workspace->def_strm.total_in = 0; + workspace->def_strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->def_strm.next_in = data_in; + workspace->def_strm.next_out = cpage_out; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + + while (workspace->def_strm.total_in < len) { + ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->def_strm); + ret = -1; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->def_strm.total_in > 8192 && + workspace->def_strm.total_in < + workspace->def_strm.total_out) { + ret = -1; + goto out; + } + /* we need another page for writing out. 
Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->def_strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->def_strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->def_strm.avail_in == 0) { + if (workspace->def_strm.total_out > max_out) + break; + + bytes_left = len - workspace->def_strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->def_strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->def_strm.next_in = data_in; + } + } + workspace->def_strm.avail_in = 0; + ret = zlib_deflate(&workspace->def_strm, Z_FINISH); + zlib_deflateEnd(&workspace->def_strm); + + if (ret != Z_STREAM_END) { + ret = -1; + goto out; + } + + if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + ret = -1; + goto out; + } + + ret = 0; + *total_out = workspace->def_strm.total_out; + *total_in = workspace->def_strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + return ret; +} + +static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + int wbits = MAX_WBITS; + char *data_in; + size_t total_out = 0; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long pg_offset; + + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.total_out = 0; + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
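zlib_compress_pages() above drives the deflate state machine one page at a time, refilling avail_in from the next input page and swapping in a fresh output page whenever avail_out reaches zero, and it bails out early if the output starts outgrowing the input. Stripped of the page handling, the calling pattern is the usual zlib sequence, shown here as a one-shot userspace helper (ordinary zlib, not the kernel's zlib_* wrappers):

#include <string.h>
#include <zlib.h>

/* deflate src into dst at level 3; returns the compressed size or -1 */
static long sketch_deflate(const unsigned char *src, size_t src_len,
                           unsigned char *dst, size_t dst_len)
{
        z_stream s;
        long out = -1;

        memset(&s, 0, sizeof(s));
        if (deflateInit(&s, 3) != Z_OK)
                return -1;

        s.next_in = (Bytef *)src;
        s.avail_in = src_len;
        s.next_out = dst;
        s.avail_out = dst_len;

        /* all input is present up front, so one Z_FINISH pass suffices */
        if (deflate(&s, Z_FINISH) == Z_STREAM_END &&
            s.total_out < src_len)      /* refuse growth, as above */
                out = (long)s.total_out;

        deflateEnd(&s);
        return out;
}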
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + return -1; + } + while (workspace->inf_strm.total_in < srclen) { + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + /* we didn't make progress in this inflate call, we're done */ + if (buf_start == total_out) + break; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + total_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) { + ret = 0; + goto done; + } + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->inf_strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + tmp = srclen - workspace->inf_strm.total_in; + workspace->inf_strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) + ret = -1; + else + ret = 0; +done: + zlib_inflateEnd(&workspace->inf_strm); + if (data_in) + kunmap(pages_in[page_in_index]); + return ret; +} + +static int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + int wbits = MAX_WBITS; + unsigned long bytes_left = destlen; + unsigned long total_out = 0; + char *kaddr; + + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = srclen; + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->inf_strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
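The two-byte test above, repeated in zlib_decompress() below, validates a standard zlib header: the low nibble of the first byte must be the deflate method (8), the 0x20 bit of the second byte flags a preset dictionary, and the pair read as a big-endian value must be divisible by 31. When it passes, inflate is initialized with negative windowBits so zlib consumes the raw deflate data and skips the adler32 check. The test in isolation:

#include <stddef.h>
#include <stdint.h>

#define SKETCH_Z_DEFLATED   8           /* compression method: deflate */
#define SKETCH_PRESET_DICT  0x20        /* FDICT bit in the FLG byte */

/*
 * Pick the windowBits argument for inflateInit2(): negative (raw deflate,
 * no checksum) when the zlib header checks out, else 15 (MAX_WBITS) so
 * zlib parses the header itself.
 */
static int sketch_pick_wbits(const uint8_t *data, size_t len)
{
        if (len > 2 && !(data[1] & SKETCH_PRESET_DICT) &&
            (data[0] & 0x0f) == SKETCH_Z_DEFLATED &&
            ((data[0] << 8) + data[1]) % 31 == 0)
                return -((data[0] >> 4) + 8);   /* e.g. 0x78 0x9c -> -15 */
        return 15;
}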
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + return -1; + } + + while (bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long pg_offset = 0; + + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + if (total_out == buf_start) { + ret = -1; + break; + } + + if (total_out <= start_byte) + goto next; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + } + + if (ret != Z_STREAM_END && bytes_left != 0) + ret = -1; + else + ret = 0; + + zlib_inflateEnd(&workspace->inf_strm); + return ret; +} + +struct btrfs_compress_op btrfs_zlib_compress = { + .alloc_workspace = zlib_alloc_workspace, + .free_workspace = zlib_free_workspace, + .compress_pages = zlib_compress_pages, + .decompress_biovec = zlib_decompress_biovec, + .decompress = zlib_decompress, +}; diff --git a/fs/buffer.c b/fs/buffer.c new file mode 100644 index 00000000..330cbce1 --- /dev/null +++ b/fs/buffer.c @@ -0,0 +1,3326 @@ +/* + * linux/fs/buffer.c + * + * Copyright (C) 1991, 1992, 2002 Linus Torvalds + */ + +/* + * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 + * + * Removed a lot of unnecessary code and simplified things now that + * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 + * + * Speed up hash, lru, and free list operations. Use gfp() for allocating + * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM + * + * Added 32k buffer block sizes - these are required older ARM systems. - RMK + * + * async buffer flushing, 1999 Andrea Arcangeli + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); + +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) + +inline void +init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +{ + bh->b_end_io = handler; + bh->b_private = private; +} +EXPORT_SYMBOL(init_buffer); + +static int sleep_on_buffer(void *word) +{ + io_schedule(); + return 0; +} + +void __lock_buffer(struct buffer_head *bh) +{ + wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer, + TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__lock_buffer); + +void unlock_buffer(struct buffer_head *bh) +{ + clear_bit_unlock(BH_Lock, &bh->b_state); + smp_mb__after_clear_bit(); + wake_up_bit(&bh->b_state, BH_Lock); +} +EXPORT_SYMBOL(unlock_buffer); + +/* + * Block until a buffer comes unlocked. 
This doesn't stop it + * from becoming locked again - you have to lock it yourself + * if you want to preserve its state. + */ +void __wait_on_buffer(struct buffer_head * bh) +{ + wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__wait_on_buffer); + +static void +__clear_page_buffers(struct page *page) +{ + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +} + + +static int quiet_error(struct buffer_head *bh) +{ + if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) + return 0; + return 1; +} + + +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +/* + * End-of-IO handler helper function which does not touch the bh after + * unlocking it. + * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but + * a race there is benign: unlock_buffer() only use the bh's address for + * hashing after unlocking the buffer, so it doesn't actually touch the bh + * itself. + */ +static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + /* This happens, due to failed READA attempts. */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); +} + +/* + * Default synchronous end-of-IO handler.. Just mark it up-to-date and + * unlock the buffer. This is what ll_rw_block uses too. + */ +void end_buffer_read_sync(struct buffer_head *bh, int uptodate) +{ + __end_buffer_read_notouch(bh, uptodate); + put_bh(bh); +} +EXPORT_SYMBOL(end_buffer_read_sync); + +void end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!quiet_error(bh)) { + buffer_io_error(bh); + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} +EXPORT_SYMBOL(end_buffer_write_sync); + +/* + * Various filesystems appear to want __find_get_block to be non-blocking. + * But it's the page lock which protects the buffers. To get around this, + * we get exclusion from try_to_free_buffers with the blockdev mapping's + * private_lock. + * + * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * may be quite high. This code could TryLock the page, and if that + * succeeds, there is no need to take private_lock. (But if + * private_lock is contended then so is mapping->tree_lock). + */ +static struct buffer_head * +__find_get_block_slow(struct block_device *bdev, sector_t block) +{ + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct buffer_head *ret = NULL; + pgoff_t index; + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + int all_mapped = 1; + + index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); + page = find_get_page(bd_mapping, index); + if (!page) + goto out; + + spin_lock(&bd_mapping->private_lock); + if (!page_has_buffers(page)) + goto out_unlock; + head = page_buffers(page); + bh = head; + do { + if (!buffer_mapped(bh)) + all_mapped = 0; + else if (bh->b_blocknr == block) { + ret = bh; + get_bh(bh); + goto out_unlock; + } + bh = bh->b_this_page; + } while (bh != head); + + /* we might be here because some of the buffers on this page are + * not mapped. 
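__find_get_block_slow() above depends on the page's buffer_heads forming a circular, singly linked ring through b_this_page, so a do/while loop that stops when it comes back around to the head visits every buffer exactly once. The traversal shape as a standalone sketch with a pared-down node type:

#include <stddef.h>
#include <stdint.h>

struct sketch_bh {
        uint64_t blocknr;
        struct sketch_bh *this_page;    /* next buffer on the page, circular */
};

/* find the buffer for a given block on one page's ring, or NULL */
static struct sketch_bh *sketch_find_on_page(struct sketch_bh *head,
                                             uint64_t block)
{
        struct sketch_bh *bh = head;

        do {
                if (bh->blocknr == block)
                        return bh;
                bh = bh->this_page;
        } while (bh != head);           /* full circle: not on this page */

        return NULL;
}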
This is due to various races between + * file io on the block device and getblk. It gets dealt with + * elsewhere, don't buffer_error if we had some unmapped buffers + */ + if (all_mapped) { + printk("__find_get_block_slow() failed. " + "block=%llu, b_blocknr=%llu\n", + (unsigned long long)block, + (unsigned long long)bh->b_blocknr); + printk("b_state=0x%08lx, b_size=%zu\n", + bh->b_state, bh->b_size); + printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); + } +out_unlock: + spin_unlock(&bd_mapping->private_lock); + page_cache_release(page); +out: + return ret; +} + +/* If invalidate_buffers() will trash dirty buffers, it means some kind + of fs corruption is going on. Trashing dirty data always imply losing + information that was supposed to be just stored on the physical layer + by the user. + + Thus invalidate_buffers in general usage is not allwowed to trash + dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to + be preserved. These buffers are simply skipped. + + We also skip buffers which are still in use. For example this can + happen if a userspace program is reading the block device. + + NOTE: In the case where the user removed a removable-media-disk even if + there's still dirty data not synced on disk (due a bug in the device driver + or due an error of the user), by not destroying the dirty buffers we could + generate corruption also on the next media inserted, thus a parameter is + necessary to handle this case in the most safe way possible (trying + to not corrupt also the new disk inserted with the data belonging to + the old now corrupted disk). Also for the ramdisk the natural thing + to do in order to release the ramdisk memory is to destroy dirty buffers. + + These are two special cases. Normal usage imply the device driver + to issue a sync on the device (without waiting I/O completion) and + then an invalidate_buffers call that doesn't trash dirty buffers. + + For handling cache coherency with the blkdev pagecache the 'update' case + is been introduced. It is needed to re-read from disk any pinned + buffer. NOTE: re-reading from disk is destructive so we can do it only + when we assume nobody is changing the buffercache under our I/O and when + we think the disk contains more recent information than the buffercache. + The update == 1 pass marks the buffers we need to update, the update == 2 + pass does the actual I/O. */ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_flush_inode(mapping); +} +EXPORT_SYMBOL(invalidate_bdev); + +/* + * Kick the writeback threads then try to free up some ZONE_NORMAL memory. + */ +static void free_more_memory(void) +{ + struct zone *zone; + int nid; + + wakeup_flusher_threads(1024); + yield(); + + for_each_online_node(nid) { + (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL, + &zone); + if (zone) + try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, + GFP_NOFS, NULL); + } +} + +/* + * I/O completion handler for block_read_full_page() - pages + * which come unlocked at the end of I/O. 
+ */ +static void end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first; + struct buffer_head *tmp; + struct page *page; + int page_uptodate = 1; + + BUG_ON(!buffer_async_read(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + clear_buffer_uptodate(bh); + if (!quiet_error(bh)) + buffer_io_error(bh); + SetPageError(page); + } + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + */ + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } while (tmp != bh); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + + /* + * If none of the buffers had errors and they are all + * uptodate then we can set the page uptodate. + */ + if (page_uptodate && !PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return; + +still_busy: + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + return; +} + +/* + * Completion handler for block_write_full_page() - pages which are unlocked + * during I/O, and which have PageWriteback cleared upon I/O completion. + */ +void end_buffer_async_write(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + unsigned long flags; + struct buffer_head *first; + struct buffer_head *tmp; + struct page *page; + + BUG_ON(!buffer_async_write(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!quiet_error(bh)) { + buffer_io_error(bh); + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + set_bit(AS_EIO, &page->mapping->flags); + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(page); + } + + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + + clear_buffer_async_write(bh); + unlock_buffer(bh); + tmp = bh->b_this_page; + while (tmp != bh) { + if (buffer_async_write(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + end_page_writeback(page); + return; + +still_busy: + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + return; +} +EXPORT_SYMBOL(end_buffer_async_write); + +/* + * If a page's buffers are under async readin (end_buffer_async_read + * completion) then there is a possibility that another thread of + * control could lock one of the buffers after it has completed + * but while some of the other buffers have not completed. This + * locked buffer would confuse end_buffer_async_read() into not unlocking + * the page. So the absence of BH_Async_Read tells end_buffer_async_read() + * that this buffer is not under async I/O. + * + * The page comes unlocked when it has no locked buffer_async buffers + * left. + * + * PageLocked prevents anyone starting new async I/O reads any of + * the buffers. + * + * PageWriteback is used to prevent simultaneous writeout of the same + * page. 
+ * + * PageLocked prevents anyone from starting writeback of a page which is + * under read I/O (PageWriteback is only ever set against a locked page). + */ +static void mark_buffer_async_read(struct buffer_head *bh) +{ + bh->b_end_io = end_buffer_async_read; + set_buffer_async_read(bh); +} + +static void mark_buffer_async_write_endio(struct buffer_head *bh, + bh_end_io_t *handler) +{ + bh->b_end_io = handler; + set_buffer_async_write(bh); +} + +void mark_buffer_async_write(struct buffer_head *bh) +{ + mark_buffer_async_write_endio(bh, end_buffer_async_write); +} +EXPORT_SYMBOL(mark_buffer_async_write); + + +/* + * fs/buffer.c contains helper functions for buffer-backed address space's + * fsync functions. A common requirement for buffer-based filesystems is + * that certain data from the backing blockdev needs to be written out for + * a successful fsync(). For example, ext2 indirect blocks need to be + * written back and waited upon before fsync() returns. + * + * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * inode_has_buffers() and invalidate_inode_buffers() are provided for the + * management of a list of dependent buffers at ->i_mapping->private_list. + * + * Locking is a little subtle: try_to_free_buffers() will remove buffers + * from their controlling inode's queue when they are being freed. But + * try_to_free_buffers() will be operating against the *blockdev* mapping + * at the time, not against the S_ISREG file which depends on those buffers. + * So the locking for private_list is via the private_lock in the address_space + * which backs the buffers. Which is different from the address_space + * against which the buffers are listed. So for a particular address_space, + * mapping->private_lock does *not* protect mapping->private_list! In fact, + * mapping->private_list will always be protected by the backing blockdev's + * ->private_lock. + * + * Which introduces a requirement: all buffers on an address_space's + * ->private_list must be from the same address_space: the blockdev's. + * + * address_spaces which do not place buffers at ->private_list via these + * utility functions are free to use private_lock and private_list for + * whatever they want. The only requirement is that list_empty(private_list) + * be true at clear_inode() time. + * + * FIXME: clear_inode should not call invalidate_inode_buffers(). The + * filesystems should do that. invalidate_inode_buffers() should just go + * BUG_ON(!list_empty). + * + * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should + * take an address_space, not an inode. And it should be called + * mark_buffer_dirty_fsync() to clearly define why those buffers are being + * queued up. + * + * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the + * list if it is already on a list. Because if the buffer is on a list, + * it *must* already be on the right one. If not, the filesystem is being + * silly. This will save a ton of locking. But first we have to ensure + * that buffers are taken *off* the old inode's list when they are freed + * (presumably in truncate). That requires careful auditing of all + * filesystems (do it inside bforget()). It could also be done by bringing + * b_inode back. 
+ */ + +/* + * The buffer's backing address_space's private_lock must be held + */ +static void __remove_assoc_queue(struct buffer_head *bh) +{ + list_del_init(&bh->b_assoc_buffers); + WARN_ON(!bh->b_assoc_map); + if (buffer_write_io_error(bh)) + set_bit(AS_EIO, &bh->b_assoc_map->flags); + bh->b_assoc_map = NULL; +} + +int inode_has_buffers(struct inode *inode) +{ + return !list_empty(&inode->i_data.private_list); +} + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. + */ +static int osync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head *p; + int err = 0; + + spin_lock(lock); +repeat: + list_for_each_prev(p, list) { + bh = BH_ENTRY(p); + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + goto repeat; + } + } + spin_unlock(lock); + return err; +} + +static void do_thaw_one(struct super_block *sb, void *unused) +{ + char b[BDEVNAME_SIZE]; + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); +} + +static void do_thaw_all(struct work_struct *work) +{ + iterate_supers(do_thaw_one, NULL); + kfree(work); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_thaw_all); + schedule_work(work); + } +} + +/** + * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers + * @mapping: the mapping which wants those buffers written + * + * Starts I/O against the buffers at mapping->private_list, and waits upon + * that I/O. + * + * Basically, this is a convenience function for fsync(). + * @mapping is a file or directory which needs those buffers to be written for + * a successful fsync(). + */ +int sync_mapping_buffers(struct address_space *mapping) +{ + struct address_space *buffer_mapping = mapping->assoc_mapping; + + if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + return 0; + + return fsync_buffers_list(&buffer_mapping->private_lock, + &mapping->private_list); +} +EXPORT_SYMBOL(sync_mapping_buffers); + +/* + * Called when we've recently written block `bblock', and it is known that + * `bblock' was for a buffer_boundary() buffer. This means that the block at + * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's + * dirty, schedule it for IO. So that indirects merge nicely with their data. 
+ */ +void write_boundary_block(struct block_device *bdev, + sector_t bblock, unsigned blocksize) +{ + struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + if (bh) { + if (buffer_dirty(bh)) + ll_rw_block(WRITE, 1, &bh); + put_bh(bh); + } +} + +void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct address_space *buffer_mapping = bh->b_page->mapping; + + mark_buffer_dirty(bh); + if (!mapping->assoc_mapping) { + mapping->assoc_mapping = buffer_mapping; + } else { + BUG_ON(mapping->assoc_mapping != buffer_mapping); + } + if (!bh->b_assoc_map) { + spin_lock(&buffer_mapping->private_lock); + list_move_tail(&bh->b_assoc_buffers, + &mapping->private_list); + bh->b_assoc_map = mapping; + spin_unlock(&buffer_mapping->private_lock); + } +} +EXPORT_SYMBOL(mark_buffer_dirty_inode); + +/* + * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * dirty. + * + * If warn is true, then emit a warning if the page is not uptodate and has + * not been truncated. + */ +static void __set_page_dirty(struct page *page, + struct address_space *mapping, int warn) +{ + spin_lock_irq(&mapping->tree_lock); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&mapping->tree_lock); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); +} + +/* + * Add a page to the dirty page list. + * + * It is a sad fact of life that this function is called from several places + * deeply under spinlocking. It may not sleep. + * + * If the page has buffers, the uptodate buffers are set dirty, to preserve + * dirty-state coherency between the page and the buffers. It the page does + * not have buffers then when they are later attached they will all be set + * dirty. + * + * The buffers are dirtied before the page is dirtied. There's a small race + * window in which a writepage caller may see the page cleanness but not the + * buffer dirtiness. That's fine. If this code were to set the page dirty + * before the buffers, a concurrent writepage caller could clear the page dirty + * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean + * page on the dirty page list. + * + * We use private_lock to lock against try_to_free_buffers while using the + * page's buffer list. Also use this to protect against clean buffers being + * added to the page after it was set dirty. + * + * FIXME: may need to call ->reservepage here as well. That's rather up to the + * address_space though. + */ +int __set_page_dirty_buffers(struct page *page) +{ + int newly_dirty; + struct address_space *mapping = page_mapping(page); + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + set_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + } + newly_dirty = !TestSetPageDirty(page); + spin_unlock(&mapping->private_lock); + + if (newly_dirty) + __set_page_dirty(page, mapping, 1); + return newly_dirty; +} +EXPORT_SYMBOL(__set_page_dirty_buffers); + +/* + * Write out and wait upon a list of buffers. + * + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. 
After all, we don't want fsync to last + * forever if somebody is actively writing to the file. + * + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. + * + * During this second stage, any subsequent updates to the file may end + * up refiling the buffer on the original inode's dirty list again, so + * there is a chance we will end up with a buffer queued for write but + * not yet completed on that list. So, as a final cleanup we go through + * the osync code to catch these locked, dirty buffers without requeuing + * any newly dirty buffers for write. + */ +static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head tmp; + struct address_space *mapping; + int err = 0, err2; + struct blk_plug plug; + + INIT_LIST_HEAD(&tmp); + blk_start_plug(&plug); + + spin_lock(lock); + while (!list_empty(list)) { + bh = BH_ENTRY(list->next); + mapping = bh->b_assoc_map; + __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh) || buffer_locked(bh)) { + list_add(&bh->b_assoc_buffers, &tmp); + bh->b_assoc_map = mapping; + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(lock); + /* + * Ensure any pending I/O completes so that + * write_dirty_buffer() actually writes the + * current contents - it is a noop if I/O is + * still in flight on potentially older + * contents. + */ + write_dirty_buffer(bh, WRITE_SYNC); + + /* + * Kick off IO for the previous mapping. Note + * that we will not run the very last mapping, + * wait_on_buffer() will do that for us + * through sync_buffer(). + */ + brelse(bh); + spin_lock(lock); + } + } + } + + spin_unlock(lock); + blk_finish_plug(&plug); + spin_lock(lock); + + while (!list_empty(&tmp)) { + bh = BH_ENTRY(tmp.prev); + get_bh(bh); + mapping = bh->b_assoc_map; + __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh)) { + list_add(&bh->b_assoc_buffers, + &mapping->private_list); + bh->b_assoc_map = mapping; + } + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + } + + spin_unlock(lock); + err2 = osync_buffers_list(lock, list); + if (err) + return err; + else + return err2; +} + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + * + * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * assumes that all the buffers are against the blockdev. Not true + * for reiserfs. + */ +void invalidate_inode_buffers(struct inode *inode) +{ + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) + __remove_assoc_queue(BH_ENTRY(list->next)); + spin_unlock(&buffer_mapping->private_lock); + } +} +EXPORT_SYMBOL(invalidate_inode_buffers); + +/* + * Remove any clean buffers from the inode's buffer list. This is called + * when we're trying to free the inode itself. Those buffers can pin it. 
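The association helpers above (mark_buffer_dirty_inode(), sync_mapping_buffers(), invalidate_inode_buffers()) are normally used in pairs by a block-based filesystem: a metadata buffer that must reach disk before fsync() returns is queued on the file's private_list when it is dirtied, and the fsync path then flushes that list. A minimal sketch of that pairing, assuming a filesystem-supplied inode and an already-read metadata buffer (the example_ names are illustrative and not part of this file):

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	/* Dirty a metadata block and associate it with the regular file's
	 * mapping so a later fsync() knows to write it out and wait on it. */
	static void example_dirty_metadata(struct inode *inode,
					   struct buffer_head *bh)
	{
		mark_buffer_dirty_inode(bh, inode);
	}

	/* fsync helper: start I/O on, and wait for, everything queued above. */
	static int example_fsync_metadata(struct inode *inode)
	{
		return sync_mapping_buffers(inode->i_mapping);
	}
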
+ * + * Returns true if all buffers were removed. + */ +int remove_inode_buffers(struct inode *inode) +{ + int ret = 1; + + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) { + struct buffer_head *bh = BH_ENTRY(list->next); + if (buffer_dirty(bh)) { + ret = 0; + break; + } + __remove_assoc_queue(bh); + } + spin_unlock(&buffer_mapping->private_lock); + } + return ret; +} + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * + * The retry flag is used to differentiate async IO (paging, swapping) + * which may not fail from ordinary buffer allocations. + */ +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, + int retry) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = alloc_buffer_head(GFP_NOFS); + if (!bh) + goto no_grow; + + bh->b_bdev = NULL; + bh->b_this_page = head; + bh->b_blocknr = -1; + head = bh; + + bh->b_state = 0; + atomic_set(&bh->b_count, 0); + bh->b_size = size; + + /* Link the buffer to its page */ + set_bh_page(bh, page, offset); + + init_buffer(bh, NULL, NULL); + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + do { + bh = head; + head = head->b_this_page; + free_buffer_head(bh); + } while (head); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!retry) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + free_more_memory(); + goto try_again; +} +EXPORT_SYMBOL_GPL(alloc_page_buffers); + +static inline void +link_dev_buffers(struct page *page, struct buffer_head *head) +{ + struct buffer_head *bh, *tail; + + bh = head; + do { + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); +} + +/* + * Initialise the state of a blockdev page's buffers. + */ +static void +init_page_buffers(struct page *page, struct block_device *bdev, + sector_t block, int size) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + int uptodate = PageUptodate(page); + sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); + + do { + if (!buffer_mapped(bh)) { + init_buffer(bh, NULL, NULL); + bh->b_bdev = bdev; + bh->b_blocknr = block; + if (uptodate) + set_buffer_uptodate(bh); + if (block < end_block) + set_buffer_mapped(bh); + } + block++; + bh = bh->b_this_page; + } while (bh != head); +} + +/* + * Create the page-cache page that contains the requested block. + * + * This is user purely for blockdev mappings. 
+ */ +static struct page * +grow_dev_page(struct block_device *bdev, sector_t block, + pgoff_t index, int size) +{ + struct inode *inode = bdev->bd_inode; + struct page *page; + struct buffer_head *bh; + + page = find_or_create_page(inode->i_mapping, index, + (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); + if (!page) + return NULL; + + BUG_ON(!PageLocked(page)); + + if (page_has_buffers(page)) { + bh = page_buffers(page); + if (bh->b_size == size) { + init_page_buffers(page, bdev, block, size); + return page; + } + if (!try_to_free_buffers(page)) + goto failed; + } + + /* + * Allocate some buffers for this page + */ + bh = alloc_page_buffers(page, size, 0); + if (!bh) + goto failed; + + /* + * Link the page to the buffers and initialise them. Take the + * lock to be atomic wrt __find_get_block(), which does not + * run under the page lock. + */ + spin_lock(&inode->i_mapping->private_lock); + link_dev_buffers(page, bh); + init_page_buffers(page, bdev, block, size); + spin_unlock(&inode->i_mapping->private_lock); + return page; + +failed: + BUG(); + unlock_page(page); + page_cache_release(page); + return NULL; +} + +/* + * Create buffers for the specified block device block's page. If + * that page was dirty, the buffers are set dirty also. + */ +static int +grow_buffers(struct block_device *bdev, sector_t block, int size) +{ + struct page *page; + pgoff_t index; + int sizebits; + + sizebits = -1; + do { + sizebits++; + } while ((size << sizebits) < PAGE_SIZE); + + index = block >> sizebits; + + /* + * Check for a block which wants to lie outside our maximum possible + * pagecache index. (this comparison is done using sector_t types). + */ + if (unlikely(index != block >> sizebits)) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR "%s: requested out-of-range block %llu for " + "device %s\n", + __func__, (unsigned long long)block, + bdevname(bdev, b)); + return -EIO; + } + block = index << sizebits; + /* Create a page with the proper size buffers.. */ + page = grow_dev_page(bdev, block, index, size); + if (!page) + return 0; + unlock_page(page); + page_cache_release(page); + return 1; +} + +static struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, int size) +{ + /* Size must be multiple of hard sectorsize */ + if (unlikely(size & (bdev_logical_block_size(bdev)-1) || + (size < 512 || size > PAGE_SIZE))) { + printk(KERN_ERR "getblk(): invalid block size %d requested\n", + size); + printk(KERN_ERR "logical block size: %d\n", + bdev_logical_block_size(bdev)); + + dump_stack(); + return NULL; + } + + for (;;) { + struct buffer_head * bh; + int ret; + + bh = __find_get_block(bdev, block, size); + if (bh) + return bh; + + ret = grow_buffers(bdev, block, size); + if (ret < 0) + return NULL; + if (ret == 0) + free_more_memory(); + } +} + +/* + * The relationship between dirty buffers and dirty pages: + * + * Whenever a page has any dirty buffers, the page's dirty bit is set, and + * the page is tagged dirty in its radix tree. + * + * At all times, the dirtiness of the buffers represents the dirtiness of + * subsections of the page. If the page has buffers, the page dirty bit is + * merely a hint about the true dirty state. + * + * When a page is set dirty in its entirety, all its buffers are marked dirty + * (if the page has buffers). + * + * When a buffer is marked dirty, its page is dirtied, but the page's other + * buffers are not. + * + * Also. When blockdev buffers are explicitly read with bread(), they + * individually become uptodate. 
But their backing page remains not + * uptodate - even if all of its buffers are uptodate. A subsequent + * block_read_full_page() against that page will discover all the uptodate + * buffers, will set the page uptodate and will perform no I/O. + */ + +/** + * mark_buffer_dirty - mark a buffer_head as needing writeout + * @bh: the buffer_head to mark dirty + * + * mark_buffer_dirty() will set the dirty bit against the buffer, then set its + * backing page dirty, then tag the page as dirty in its address_space's radix + * tree and then attach the address_space's inode to its superblock's dirty + * inode list. + * + * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mapping->tree_lock and mapping->host->i_lock. + */ +void mark_buffer_dirty(struct buffer_head *bh) +{ + WARN_ON_ONCE(!buffer_uptodate(bh)); + + /* + * Very *carefully* optimize the it-is-already-dirty case. + * + * Don't let the final "is it dirty" escape to before we + * perhaps modified the buffer. + */ + if (buffer_dirty(bh)) { + smp_mb(); + if (buffer_dirty(bh)) + return; + } + + if (!test_set_buffer_dirty(bh)) { + struct page *page = bh->b_page; + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + if (mapping) + __set_page_dirty(page, mapping, 0); + } + } +} +EXPORT_SYMBOL(mark_buffer_dirty); + +/* + * Decrement a buffer_head's reference count. If all buffers against a page + * have zero reference count, are clean and unlocked, and if the page is clean + * and unlocked then try_to_free_buffers() may strip the buffers from the page + * in preparation for freeing it (sometimes, rarely, buffers are removed from + * a page but it ends up not being freed, and buffers may later be reattached). + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + put_bh(buf); + return; + } + WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); +} +EXPORT_SYMBOL(__brelse); + +/* + * bforget() is like brelse(), except it discards any + * potentially dirty data. + */ +void __bforget(struct buffer_head *bh) +{ + clear_buffer_dirty(bh); + if (bh->b_assoc_map) { + struct address_space *buffer_mapping = bh->b_page->mapping; + + spin_lock(&buffer_mapping->private_lock); + list_del_init(&bh->b_assoc_buffers); + bh->b_assoc_map = NULL; + spin_unlock(&buffer_mapping->private_lock); + } + __brelse(bh); +} +EXPORT_SYMBOL(__bforget); + +static struct buffer_head *__bread_slow(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } else { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + } + brelse(bh); + return NULL; +} + +/* + * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). + * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their + * refcount elevated by one when they're in an LRU. A buffer can only appear + * once in a particular CPU's LRU. A single buffer can be present in multiple + * CPU's LRUs at the same time. + * + * This is a transparent caching front-end to sb_bread(), sb_getblk() and + * sb_find_get_block(). + * + * The LRUs themselves only need locking against invalidate_bh_lrus. We use + * a local interrupt disable for that. 
+ */ + +#define BH_LRU_SIZE 8 + +struct bh_lru { + struct buffer_head *bhs[BH_LRU_SIZE]; +}; + +static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; + +#ifdef CONFIG_SMP +#define bh_lru_lock() local_irq_disable() +#define bh_lru_unlock() local_irq_enable() +#else +#define bh_lru_lock() preempt_disable() +#define bh_lru_unlock() preempt_enable() +#endif + +static inline void check_irqs_on(void) +{ +#ifdef irqs_disabled + BUG_ON(irqs_disabled()); +#endif +} + +/* + * The LRU management algorithm is dopey-but-simple. Sorry. + */ +static void bh_lru_install(struct buffer_head *bh) +{ + struct buffer_head *evictee = NULL; + + check_irqs_on(); + bh_lru_lock(); + if (__this_cpu_read(bh_lrus.bhs[0]) != bh) { + struct buffer_head *bhs[BH_LRU_SIZE]; + int in; + int out = 0; + + get_bh(bh); + bhs[out++] = bh; + for (in = 0; in < BH_LRU_SIZE; in++) { + struct buffer_head *bh2 = + __this_cpu_read(bh_lrus.bhs[in]); + + if (bh2 == bh) { + __brelse(bh2); + } else { + if (out >= BH_LRU_SIZE) { + BUG_ON(evictee != NULL); + evictee = bh2; + } else { + bhs[out++] = bh2; + } + } + } + while (out < BH_LRU_SIZE) + bhs[out++] = NULL; + memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); + } + bh_lru_unlock(); + + if (evictee) + __brelse(evictee); +} + +/* + * Look up the bh in this cpu's LRU. If it's there, move it to the head. + */ +static struct buffer_head * +lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *ret = NULL; + unsigned int i; + + check_irqs_on(); + bh_lru_lock(); + for (i = 0; i < BH_LRU_SIZE; i++) { + struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); + + if (bh && bh->b_bdev == bdev && + bh->b_blocknr == block && bh->b_size == size) { + if (i) { + while (i) { + __this_cpu_write(bh_lrus.bhs[i], + __this_cpu_read(bh_lrus.bhs[i - 1])); + i--; + } + __this_cpu_write(bh_lrus.bhs[0], bh); + } + get_bh(bh); + ret = bh; + break; + } + } + bh_lru_unlock(); + return ret; +} + +/* + * Perform a pagecache lookup for the matching buffer. If it's there, refresh + * it in the LRU and mark it as accessed. If it is not present then return + * NULL + */ +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = lookup_bh_lru(bdev, block, size); + + if (bh == NULL) { + bh = __find_get_block_slow(bdev, block); + if (bh) + bh_lru_install(bh); + } + if (bh) + touch_buffer(bh); + return bh; +} +EXPORT_SYMBOL(__find_get_block); + +/* + * __getblk will locate (and, if necessary, create) the buffer_head + * which corresponds to the passed block_device, block and size. The + * returned buffer has its reference count incremented. + * + * __getblk() cannot fail - it just keeps trying. If you pass it an + * illegal block number, __getblk() will happily return a buffer_head + * which represents the non-existent block. Very weird. + * + * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() + * attempt is failing. FIXME, perhaps? + */ +struct buffer_head * +__getblk(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __find_get_block(bdev, block, size); + + might_sleep(); + if (bh == NULL) + bh = __getblk_slow(bdev, block, size); + return bh; +} +EXPORT_SYMBOL(__getblk); + +/* + * Do async read-ahead on a buffer.. 
+ */ +void __breadahead(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + if (likely(bh)) { + ll_rw_block(READA, 1, &bh); + brelse(bh); + } +} +EXPORT_SYMBOL(__breadahead); + +/** + * __bread() - reads a specified block and returns the bh + * @bdev: the block_device to read from + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that contains it. + * It returns NULL if the block was unreadable. + */ +struct buffer_head * +__bread(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + + if (likely(bh) && !buffer_uptodate(bh)) + bh = __bread_slow(bh); + return bh; +} +EXPORT_SYMBOL(__bread); + +/* + * invalidate_bh_lrus() is called rarely - but not only at unmount. + * This doesn't race because it runs in each cpu either in irq + * or with preempt disabled. + */ +static void invalidate_bh_lru(void *arg) +{ + struct bh_lru *b = &get_cpu_var(bh_lrus); + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + put_cpu_var(bh_lrus); +} + +void invalidate_bh_lrus(void) +{ + on_each_cpu(invalidate_bh_lru, NULL, 1); +} +EXPORT_SYMBOL_GPL(invalidate_bh_lrus); + +void set_bh_page(struct buffer_head *bh, + struct page *page, unsigned long offset) +{ + bh->b_page = page; + BUG_ON(offset >= PAGE_SIZE); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} +EXPORT_SYMBOL(set_bh_page); + +/* + * Called when truncating a buffer on a page completely. + */ +static void discard_buffer(struct buffer_head * bh) +{ + lock_buffer(bh); + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + unlock_buffer(bh); +} + +/** + * block_invalidatepage - invalidate part of all of a buffer-backed page + * + * @page: the page which is affected + * @offset: the index of the truncation point + * + * block_invalidatepage() is called when all or part of the page has become + * invalidatedby a truncate operation. + * + * block_invalidatepage() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point. Because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +void block_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) + discard_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (offset == 0) + try_to_release_page(page, 0); +out: + return; +} +EXPORT_SYMBOL(block_invalidatepage); + +/* + * We attach and possibly dirty the buffers atomically wrt + * __set_page_dirty_buffers() via private_lock. 
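Taken together, __getblk(), __bread(), mark_buffer_dirty() and brelse() form the usual metadata access pattern built on the per-cpu LRU described above. A minimal sketch of that pattern, assuming a valid block device and an in-range block number (the example_ name is illustrative only):

	#include <linux/errno.h>
	#include <linux/string.h>
	#include <linux/buffer_head.h>

	static int example_update_block(struct block_device *bdev,
					sector_t blocknr, unsigned size)
	{
		/* Look the block up via the per-cpu LRU and the page cache,
		 * reading it from disk if it is not already uptodate. */
		struct buffer_head *bh = __bread(bdev, blocknr, size);

		if (!bh)
			return -EIO;

		lock_buffer(bh);			/* serialize against in-flight I/O */
		memset(bh->b_data, 0, bh->b_size);	/* modify the block contents */
		mark_buffer_dirty(bh);			/* queue it for writeback */
		unlock_buffer(bh);

		brelse(bh);				/* drop the reference taken by __bread() */
		return 0;
	}
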
try_to_free_buffers + * is already excluded via the page lock. + */ +void create_empty_buffers(struct page *page, + unsigned long blocksize, unsigned long b_state) +{ + struct buffer_head *bh, *head, *tail; + + head = alloc_page_buffers(page, blocksize, 1); + bh = head; + do { + bh->b_state |= b_state; + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + + spin_lock(&page->mapping->private_lock); + if (PageUptodate(page) || PageDirty(page)) { + bh = head; + do { + if (PageDirty(page)) + set_buffer_dirty(bh); + if (PageUptodate(page)) + set_buffer_uptodate(bh); + bh = bh->b_this_page; + } while (bh != head); + } + attach_page_buffers(page, head); + spin_unlock(&page->mapping->private_lock); +} +EXPORT_SYMBOL(create_empty_buffers); + +/* + * We are taking a block for data and we don't want any output from any + * buffer-cache aliases starting from return from that function and + * until the moment when something will explicitly mark the buffer + * dirty (hopefully that will not happen until we will free that block ;-) + * We don't even need to mark it not-uptodate - nobody can expect + * anything from a newly allocated buffer anyway. We used to used + * unmap_buffer() for such invalidation, but that was wrong. We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can + * be writeout I/O going on against recently-freed buffers. We don't + * wait on that I/O in bforget() - it's more efficient to wait on the I/O + * only if we really need to. That happens here. + */ +void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +{ + struct buffer_head *old_bh; + + might_sleep(); + + old_bh = __find_get_block_slow(bdev, block); + if (old_bh) { + clear_buffer_dirty(old_bh); + wait_on_buffer(old_bh); + clear_buffer_req(old_bh); + __brelse(old_bh); + } +} +EXPORT_SYMBOL(unmap_underlying_metadata); + +/* + * NOTE! All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * While block_write_full_page is writing back the dirty buffers under + * the page lock, whoever dirtied the buffers may decide to clean them + * again at any time. We handle that by only looking at the buffer + * state inside lock_buffer(). + * + * If block_write_full_page() is called for regular writeback + * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a + * locked buffer. This only can happen if someone has written the buffer + * directly, with submit_bh(). At the address_space level PageWriteback + * prevents this contention from occurring. + * + * If block_write_full_page() is called with wbc->sync_mode == + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * causes the writes to be flagged as synchronous writes. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, + get_block_t *get_block, struct writeback_control *wbc, + bh_end_io_t *handler) +{ + int err; + sector_t block; + sector_t last_block; + struct buffer_head *bh, *head; + const unsigned blocksize = 1 << inode->i_blkbits; + int nr_underway = 0; + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 
+ WRITE_SYNC : WRITE); + + BUG_ON(!PageLocked(page)); + + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + + /* + * Be very careful. We have no exclusion from __set_page_dirty_buffers + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * handle that here by just cleaning them. + */ + + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + head = page_buffers(page); + bh = head; + + /* + * Get all the dirty buffers mapped to disk addresses and + * handle any aliases from the underlying blockdev's mapping. + */ + do { + if (block > last_block) { + /* + * mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. + */ + /* + * The buffer was zeroed by block_write_full_page() + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + goto recover; + clear_buffer_delay(bh); + if (buffer_new(bh)) { + /* blockdev mappings never come here */ + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from writeback threads + * and kswapd activity, but those code paths have their own + * higher-level throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write_endio(bh, handler); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + + err = 0; +done: + if (nr_underway == 0) { + /* + * The page was marked dirty, but the buffers were + * clean. Someone wrote them back by hand with + * ll_rw_block/submit_bh. A rare case. + */ + end_page_writeback(page); + + /* + * The page and buffer_heads can be released at any time from + * here on. + */ + } + return err; + +recover: + /* + * ENOSPC, or some other error. We may already have added some + * blocks to the file, so we need to write these out to avoid + * exposing stale data. + * The page is currently locked and not marked for writeback + */ + bh = head; + /* Recovery: lock and submit the mapped buffers */ + do { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { + lock_buffer(bh); + mark_buffer_async_write_endio(bh, handler); + } else { + /* + * The buffer may have been set dirty during + * attachment to a dirty page. 
+ */ + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + mapping_set_error(page->mapping, err); + set_page_writeback(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + goto done; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. + */ +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, size; + + start = max(from, block_start); + size = min(to, block_end) - start; + + zero_user(page, start, size); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} +EXPORT_SYMBOL(page_zero_new_buffers); + +int __block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct inode *inode = page->mapping->host; + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + bbits = inode->i_blkbits; + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + break; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + clear_buffer_new(bh); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + continue; + } + if (block_end > to || block_start < from) + zero_user_segments(page, + to, block_end, + block_start, from); + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. 
+ */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + err = -EIO; + } + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + return err; +} +EXPORT_SYMBOL(__block_write_begin); + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + clear_buffer_new(bh); + } + + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * The filesystem needs to handle block truncation upon failure. + */ +int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, + unsigned flags, struct page **pagep, get_block_t *get_block) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int status; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + status = __block_write_begin(page, pos, len, get_block); + if (unlikely(status)) { + unlock_page(page); + page_cache_release(page); + page = NULL; + } + + *pagep = page; + return status; +} +EXPORT_SYMBOL(block_write_begin); + +int block_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start; + + start = pos & (PAGE_CACHE_SIZE - 1); + + if (unlikely(copied < len)) { + /* + * The buffers that were written will now be uptodate, so we + * don't have to worry about a readpage reading them and + * overwriting a partial write. However if we have encountered + * a short write and only partially written into a buffer, it + * will not be marked uptodate, so a readpage might come in and + * destroy our partial write. + * + * Do the simplest thing, and just treat any short write to a + * non uptodate page as a zero-length write, and force the + * caller to redo the whole thing. + */ + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start+copied, start+len); + } + flush_dcache_page(page); + + /* This could be a short (even 0-length) commit */ + __block_commit_write(inode, page, start, start+copied); + + return copied; +} +EXPORT_SYMBOL(block_write_end); + +int generic_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int i_size_changed = 0; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. 
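block_read_full_page(), block_write_begin() and generic_write_end() are meant to be plugged directly into a filesystem's address_space_operations around its get_block callback. A minimal sketch under the assumption of a trivial identity block mapping, purely for illustration (a real filesystem resolves iblock through its own allocation code; the example_ names are not part of this file):

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	static int example_get_block(struct inode *inode, sector_t iblock,
				     struct buffer_head *bh_result, int create)
	{
		/* Pretend logical block == physical block. */
		map_bh(bh_result, inode->i_sb, iblock);
		return 0;
	}

	static int example_readpage(struct file *file, struct page *page)
	{
		return block_read_full_page(page, example_get_block);
	}

	static int example_write_begin(struct file *file,
				       struct address_space *mapping,
				       loff_t pos, unsigned len, unsigned flags,
				       struct page **pagep, void **fsdata)
	{
		return block_write_begin(mapping, pos, len, flags, pagep,
					 example_get_block);
	}

	static const struct address_space_operations example_aops = {
		.readpage	= example_readpage,
		.write_begin	= example_write_begin,
		.write_end	= generic_write_end,
	};
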
+ * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} +EXPORT_SYMBOL(generic_write_end); + +/* + * block_is_partially_uptodate checks whether buffers within a page are + * uptodate or not. + * + * Returns true if all buffers which correspond to a file portion + * we want to read are uptodate. + */ +int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, + unsigned long from) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, block_end, blocksize; + unsigned to; + struct buffer_head *bh, *head; + int ret = 1; + + if (!page_has_buffers(page)) + return 0; + + blocksize = 1 << inode->i_blkbits; + to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = from + to; + if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) + return 0; + + head = page_buffers(page); + bh = head; + block_start = 0; + do { + block_end = block_start + blocksize; + if (block_end > from && block_start < to) { + if (!buffer_uptodate(bh)) { + ret = 0; + break; + } + if (block_end >= to) + break; + } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} +EXPORT_SYMBOL(block_is_partially_uptodate); + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * set/clear_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. + */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + sector_t iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize; + int nr, i; + int fully_mapped = 1; + + BUG_ON(!PageLocked(page)); + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + int err = 0; + + fully_mapped = 0; + if (iblock < lblock) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, iblock, bh, 0); + if (err) + SetPageError(page); + } + if (!buffer_mapped(bh)) { + zero_user(page, i * blocksize, blocksize); + if (!err) + set_buffer_uptodate(bh); + continue; + } + /* + * get_block() might have updated the buffer + * synchronously + */ + if (buffer_uptodate(bh)) + continue; + } + arr[nr++] = bh; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (fully_mapped) + SetPageMappedToDisk(page); + + if (!nr) { + /* + * All buffers are uptodate - we can set the page uptodate + * as well. But not if get_block() returned an error. 
+ */ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + lock_buffer(bh); + mark_buffer_async_read(bh); + } + + /* + * Stage 3: start the IO. Check for uptodateness + * inside the buffer lock in case another process reading + * the underlying blockdev brought it uptodate (the sct fix). + */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (buffer_uptodate(bh)) + end_buffer_async_read(bh, 1); + else + submit_bh(READ, bh); + } + return 0; +} +EXPORT_SYMBOL(block_read_full_page); + +/* utility function for filesystems that need to do work on expanding + * truncates. Uses filesystem pagecache writes to allow the filesystem to + * deal with the hole. + */ +int generic_cont_expand_simple(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + int err; + + err = inode_newsize_ok(inode, size); + if (err) + goto out; + + err = pagecache_write_begin(NULL, mapping, size, 0, + AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, + &page, &fsdata); + if (err) + goto out; + + err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); + BUG_ON(err > 0); + +out: + return err; +} +EXPORT_SYMBOL(generic_cont_expand_simple); + +static int cont_expand_zero(struct file *file, struct address_space *mapping, + loff_t pos, loff_t *bytes) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + struct page *page; + void *fsdata; + pgoff_t index, curidx; + loff_t curpos; + unsigned zerofrom, offset, len; + int err = 0; + + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + + while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { + zerofrom = curpos & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + len = PAGE_CACHE_SIZE - zerofrom; + + err = pagecache_write_begin(file, mapping, curpos, len, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + goto out; + zero_user(page, zerofrom, len); + err = pagecache_write_end(file, mapping, curpos, len, len, + page, fsdata); + if (err < 0) + goto out; + BUG_ON(err != len); + err = 0; + + balance_dirty_pages_ratelimited(mapping); + } + + /* page covers the boundary, find the boundary offset */ + if (index == curidx) { + zerofrom = curpos & ~PAGE_CACHE_MASK; + /* if we will expand the thing last block will be filled */ + if (offset <= zerofrom) { + goto out; + } + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + len = offset - zerofrom; + + err = pagecache_write_begin(file, mapping, curpos, len, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + goto out; + zero_user(page, zerofrom, len); + err = pagecache_write_end(file, mapping, curpos, len, len, + page, fsdata); + if (err < 0) + goto out; + BUG_ON(err != len); + err = 0; + } +out: + return err; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. 
+ */ +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + unsigned zerofrom; + int err; + + err = cont_expand_zero(file, mapping, pos, bytes); + if (err) + return err; + + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (pos+len > *bytes && zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + return block_write_begin(mapping, pos, len, flags, pagep, get_block); +} +EXPORT_SYMBOL(cont_write_begin); + +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + return 0; +} +EXPORT_SYMBOL(block_commit_write); + +/* + * block_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * truncate writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + * + * Direct callers of this function should call vfs_check_frozen() so that page + * fault does not busyloop until the fs is thawed. + */ +int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + unsigned long end; + loff_t size; + int ret; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + (page_offset(page) > size)) { + /* We overload EFAULT to mean page got truncated */ + ret = -EFAULT; + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) + end = size & ~PAGE_CACHE_MASK; + else + end = PAGE_CACHE_SIZE; + + ret = __block_write_begin(page, 0, end, get_block); + if (!ret) + ret = block_commit_write(page, 0, end); + + if (unlikely(ret < 0)) + goto out_unlock; + /* + * Freezing in progress? We check after the page is marked dirty and + * with page lock held so if the test here fails, we are sure freezing + * code will wait during syncing until the page fault is done - at that + * point page will be dirty and unlocked so freezing code will write it + * and writeprotect it again. + */ + set_page_dirty(page); + if (inode->i_sb->s_frozen != SB_UNFROZEN) { + ret = -EAGAIN; + goto out_unlock; + } + wait_on_page_writeback(page); + return 0; +out_unlock: + unlock_page(page); + return ret; +} +EXPORT_SYMBOL(__block_page_mkwrite); + +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int ret; + struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; + + /* + * This check is racy but catches the common case. The check in + * __block_page_mkwrite() is reliable. 
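The EOF clamping in __block_page_mkwrite() above is worth seeing in isolation: once the page lock rules out a truncate race, the only question is how many bytes of the faulting page lie inside i_size. The following userspace sketch mirrors that decision; it is illustrative only, with an assumed page size and example offsets that are not taken from the patch.

/* Illustrative sketch of the EOF clamp in __block_page_mkwrite();
 * not part of this patch. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12			/* assumed 4KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Bytes of page 'index' to prepare for writing, or 0 if the page now
 * lies beyond i_size (the truncate-race case returning -EFAULT). */
static unsigned long mkwrite_end(uint64_t index, uint64_t i_size)
{
	uint64_t page_offset = index << PAGE_SHIFT;

	if (page_offset > i_size)		/* wholly beyond EOF */
		return 0;
	if (((index + 1) << PAGE_SHIFT) > i_size)
		return i_size & ~PAGE_MASK;	/* page straddles EOF */
	return PAGE_SIZE;			/* page wholly inside EOF */
}

int main(void)
{
	printf("%lu\n", mkwrite_end(0, 10000));	/* 4096: fully inside */
	printf("%lu\n", mkwrite_end(2, 10000));	/* 1808: straddles EOF */
	printf("%lu\n", mkwrite_end(5, 10000));	/* 0: beyond EOF */
	return 0;
}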
+ */ + vfs_check_frozen(sb, SB_FREEZE_WRITE); + ret = __block_page_mkwrite(vma, vmf, get_block); + return block_page_mkwrite_return(ret); +} +EXPORT_SYMBOL(block_page_mkwrite); + +/* + * nobh_write_begin()'s prereads are special: the buffer_heads are freed + * immediately, while under the page lock. So it needs a special end_io + * handler which does not touch the bh after unlocking it. + */ +static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) +{ + __end_buffer_read_notouch(bh, uptodate); +} + +/* + * Attach the singly-linked list of buffers created by nobh_write_begin, to + * the page (converting it to circular linked list and taking care of page + * dirty races). + */ +static void attach_nobh_buffers(struct page *page, struct buffer_head *head) +{ + struct buffer_head *bh; + + BUG_ON(!PageLocked(page)); + + spin_lock(&page->mapping->private_lock); + bh = head; + do { + if (PageDirty(page)) + set_buffer_dirty(bh); + if (!bh->b_this_page) + bh->b_this_page = head; + bh = bh->b_this_page; + } while (bh != head); + attach_page_buffers(page, head); + spin_unlock(&page->mapping->private_lock); +} + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + * The filesystem needs to handle block truncation upon failure. + */ +int nobh_write_begin(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct buffer_head *head, *bh; + struct page *page; + pgoff_t index; + unsigned from, to; + unsigned block_in_page; + unsigned block_start, block_end; + sector_t block_in_file; + int nr_reads = 0; + int ret = 0; + int is_mapped_to_disk = 1; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + *fsdata = NULL; + + if (page_has_buffers(page)) { + ret = __block_write_begin(page, pos, len, get_block); + if (unlikely(ret)) + goto out_release; + return ret; + } + + if (PageMappedToDisk(page)) + return 0; + + /* + * Allocate buffers so that we can keep track of state, and potentially + * attach them to the page if an error occurs. In the common case of + * no error, they will just be freed again without ever being attached + * to the page (which is all OK, because we're under the page lock). + * + * Be careful: the buffer linked list is a NULL terminated one, rather + * than the circular one we're used to. + */ + head = alloc_page_buffers(page, blocksize, 0); + if (!head) { + ret = -ENOMEM; + goto out_release; + } + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + + /* + * We loop across all blocks in the page, whether or not they are + * part of the affected region. This is so we can discover if the + * page is fully mapped-to-disk. 
+ */ + for (block_start = 0, block_in_page = 0, bh = head; + block_start < PAGE_CACHE_SIZE; + block_in_page++, block_start += blocksize, bh = bh->b_this_page) { + int create; + + block_end = block_start + blocksize; + bh->b_state = 0; + create = 1; + if (block_start >= to) + create = 0; + ret = get_block(inode, block_in_file + block_in_page, + bh, create); + if (ret) + goto failed; + if (!buffer_mapped(bh)) + is_mapped_to_disk = 0; + if (buffer_new(bh)) + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + if (PageUptodate(page)) { + set_buffer_uptodate(bh); + continue; + } + if (buffer_new(bh) || !buffer_mapped(bh)) { + zero_user_segments(page, block_start, from, + to, block_end); + continue; + } + if (buffer_uptodate(bh)) + continue; /* reiserfs does this */ + if (block_start < from || block_end > to) { + lock_buffer(bh); + bh->b_end_io = end_buffer_read_nobh; + submit_bh(READ, bh); + nr_reads++; + } + } + + if (nr_reads) { + /* + * The page is locked, so these buffers are protected from + * any VM or truncate activity. Hence we don't need to care + * for the buffer_head refcounts. + */ + for (bh = head; bh; bh = bh->b_this_page) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + } + if (ret) + goto failed; + } + + if (is_mapped_to_disk) + SetPageMappedToDisk(page); + + *fsdata = head; /* to be released by nobh_write_end */ + + return 0; + +failed: + BUG_ON(!ret); + /* + * Error recovery is a bit difficult. We need to zero out blocks that + * were newly allocated, and dirty them to ensure they get written out. + * Buffers need to be attached to the page at this point, otherwise + * the handling of potential IO errors during writeout would be hard + * (could try doing synchronous writeout, but what if that fails too?) + */ + attach_nobh_buffers(page, head); + page_zero_new_buffers(page, from, to); + +out_release: + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + + return ret; +} +EXPORT_SYMBOL(nobh_write_begin); + +int nobh_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *head = fsdata; + struct buffer_head *bh; + BUG_ON(fsdata != NULL && page_has_buffers(page)); + + if (unlikely(copied < len) && head) + attach_nobh_buffers(page, head); + if (page_has_buffers(page)) + return generic_write_end(file, mapping, pos, len, + copied, page, fsdata); + + SetPageUptodate(page); + set_page_dirty(page); + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + mark_inode_dirty(inode); + } + + unlock_page(page); + page_cache_release(page); + + while (head) { + bh = head; + head = head->b_this_page; + free_buffer_head(bh); + } + + return copied; +} +EXPORT_SYMBOL(nobh_write_end); + +/* + * nobh_writepage() - based on block_full_write_page() except + * that it tries to operate without attaching bufferheads to + * the page. + */ +int nobh_writepage(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int ret; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto out; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. 
For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. + */ +#if 0 + /* Not really sure about this - do we need this ? */ + if (page->mapping->a_ops->invalidatepage) + page->mapping->a_ops->invalidatepage(page, offset); +#endif + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); +out: + ret = mpage_writepage(page, get_block, wbc); + if (ret == -EAGAIN) + ret = __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); + return ret; +} +EXPORT_SYMBOL(nobh_writepage); + +int nobh_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + sector_t iblock; + unsigned length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head map_bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (page_has_buffers(page)) { +has_buffers: + unlock_page(page); + page_cache_release(page); + return block_truncate_page(mapping, from, get_block); + } + + /* Find the buffer that contains "offset" */ + pos = blocksize; + while (offset >= pos) { + iblock++; + pos += blocksize; + } + + map_bh.b_size = blocksize; + map_bh.b_state = 0; + err = get_block(inode, iblock, &map_bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(&map_bh)) + goto unlock; + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (!PageUptodate(page)) { + err = mapping->a_ops->readpage(NULL, page); + if (err) { + page_cache_release(page); + goto out; + } + lock_page(page); + if (!PageUptodate(page)) { + err = -EIO; + goto unlock; + } + if (page_has_buffers(page)) + goto has_buffers; + } + zero_user(page, offset, length); + set_page_dirty(page); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} +EXPORT_SYMBOL(nobh_truncate_page); + +int block_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + sector_t iblock; + unsigned length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? 
Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, iblock, bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + zero_user(page, offset, length); + mark_buffer_dirty(bh); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} +EXPORT_SYMBOL(block_truncate_page); + +/* + * The generic ->writepage function for buffer-backed address_spaces + * this form passes in the end_io handler used to finish the IO. + */ +int block_write_full_page_endio(struct page *page, get_block_t *get_block, + struct writeback_control *wbc, bh_end_io_t *handler) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block, wbc, + handler); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. + */ + do_invalidatepage(page, 0); + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
+ */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __block_write_full_page(inode, page, get_block, wbc, handler); +} +EXPORT_SYMBOL(block_write_full_page_endio); + +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + return block_write_full_page_endio(page, get_block, wbc, + end_buffer_async_write); +} +EXPORT_SYMBOL(block_write_full_page); + +sector_t generic_block_bmap(struct address_space *mapping, sector_t block, + get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + tmp.b_size = 1 << inode->i_blkbits; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} +EXPORT_SYMBOL(generic_block_bmap); + +static void end_bio_bh_io_sync(struct bio *bio, int err) +{ + struct buffer_head *bh = bio->bi_private; + + if (err == -EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + } + + if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) + set_bit(BH_Quiet, &bh->b_state); + + bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); + bio_put(bio); +} + +int submit_bh(int rw, struct buffer_head * bh) +{ + struct bio *bio; + int ret = 0; + + BUG_ON(!buffer_locked(bh)); + BUG_ON(!buffer_mapped(bh)); + BUG_ON(!bh->b_end_io); + BUG_ON(buffer_delay(bh)); + BUG_ON(buffer_unwritten(bh)); + + /* + * Only clear out a write error when rewriting + */ + if (test_set_buffer_req(bh) && (rw & WRITE)) + clear_buffer_write_io_error(bh); + + /* + * from here on down, it's all bio -- do the initial mapping, + * submit_bio -> generic_make_request may further map this bio around + */ + bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_io_vec[0].bv_page = bh->b_page; + bio->bi_io_vec[0].bv_len = bh->b_size; + bio->bi_io_vec[0].bv_offset = bh_offset(bh); + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = bh->b_size; + + bio->bi_end_io = end_bio_bh_io_sync; + bio->bi_private = bh; + + bio_get(bio); + submit_bio(rw, bio); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(submit_bh); + +/** + * ll_rw_block: low-level access to block devices (DEPRECATED) + * @rw: whether to %READ or %WRITE or maybe %READA (readahead) + * @nr: number of &struct buffer_heads in the array + * @bhs: array of pointers to &struct buffer_head + * + * ll_rw_block() takes an array of pointers to &struct buffer_heads, and + * requests an I/O operation on them, either a %READ or a %WRITE. The third + * %READA option is described in the documentation for generic_make_request() + * which ll_rw_block() calls. + * + * This function drops any buffer that it cannot get a lock on (with the + * BH_Lock state bit), any buffer that appears to be clean when doing a write + * request, and any buffer that appears to be up-to-date when doing read + * request. Further it marks as clean buffers that are processed for + * writing (the buffer cache won't assume that they are actually clean + * until the buffer gets unlocked). + * + * ll_rw_block sets b_end_io to simple completion handler that marks + * the buffer up-to-date (if approriate), unlocks the buffer and wakes + * any waiters. + * + * All of the buffers must be for the same device, and must also be a + * multiple of the current approved size for the device. 
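submit_bh() above maps a buffer_head onto a single-segment bio, and the only non-obvious line is the sector computation: bi_sector = b_blocknr * (b_size >> 9), i.e. the filesystem block number scaled into 512-byte sectors. A tiny userspace sketch of that conversion follows; the block sizes and numbers are assumed examples.

/* Illustrative sketch of submit_bh()'s block-to-sector scaling;
 * not part of this patch. */
#include <stdio.h>
#include <stdint.h>

static uint64_t blocknr_to_sector(uint64_t blocknr, unsigned int blocksize)
{
	/* bio sectors are always 512 bytes, whatever the fs block size */
	return blocknr * (blocksize >> 9);
}

int main(void)
{
	printf("block 100 @ 1KiB -> sector %llu\n",
	       (unsigned long long)blocknr_to_sector(100, 1024));	/* 200 */
	printf("block 100 @ 4KiB -> sector %llu\n",
	       (unsigned long long)blocknr_to_sector(100, 4096));	/* 800 */
	return 0;
}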
+ */ +void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (!trylock_buffer(bh)) + continue; + if (rw == WRITE) { + if (test_clear_buffer_dirty(bh)) { + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE, bh); + continue; + } + } else { + if (!buffer_uptodate(bh)) { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(rw, bh); + continue; + } + } + unlock_buffer(bh); + } +} +EXPORT_SYMBOL(ll_rw_block); + +void write_dirty_buffer(struct buffer_head *bh, int rw) +{ + lock_buffer(bh); + if (!test_clear_buffer_dirty(bh)) { + unlock_buffer(bh); + return; + } + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(rw, bh); +} +EXPORT_SYMBOL(write_dirty_buffer); + +/* + * For a data-integrity writeout, we need to wait upon any in-progress I/O + * and then start new I/O and then wait upon it. The caller must have a ref on + * the buffer_head. + */ +int __sync_dirty_buffer(struct buffer_head *bh, int rw) +{ + int ret = 0; + + WARN_ON(atomic_read(&bh->b_count) < 1); + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(rw, bh); + wait_on_buffer(bh); + if (!ret && !buffer_uptodate(bh)) + ret = -EIO; + } else { + unlock_buffer(bh); + } + return ret; +} +EXPORT_SYMBOL(__sync_dirty_buffer); + +int sync_dirty_buffer(struct buffer_head *bh) +{ + return __sync_dirty_buffer(bh, WRITE_SYNC); +} +EXPORT_SYMBOL(sync_dirty_buffer); + +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and releases them if so. + * + * Exclusion against try_to_free_buffers may be obtained by either + * locking the page or by holding its mapping's private_lock. + * + * If the page is dirty but all the buffers are clean then we need to + * be sure to mark the page clean as well. This is because the page + * may be against a block device, and a later reattachment of buffers + * to a dirty page will set *all* buffers dirty. Which would corrupt + * filesystem data on the same device. + * + * The same applies to regular filesystem pages: if all the buffers are + * clean then we set the page clean and proceed. To do that, we require + * total exclusion from __set_page_dirty_buffers(). That is obtained with + * private_lock. + * + * try_to_free_buffers() is non-blocking. + */ +static inline int buffer_busy(struct buffer_head *bh) +{ + return atomic_read(&bh->b_count) | + (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); +} + +static int +drop_buffers(struct page *page, struct buffer_head **buffers_to_free) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh; + + bh = head; + do { + if (buffer_write_io_error(bh) && page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + if (buffer_busy(bh)) + goto failed; + bh = bh->b_this_page; + } while (bh != head); + + do { + struct buffer_head *next = bh->b_this_page; + + if (bh->b_assoc_map) + __remove_assoc_queue(bh); + bh = next; + } while (bh != head); + *buffers_to_free = head; + __clear_page_buffers(page); + return 1; +failed: + return 0; +} + +int try_to_free_buffers(struct page *page) +{ + struct address_space * const mapping = page->mapping; + struct buffer_head *buffers_to_free = NULL; + int ret = 0; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping == NULL) { /* can this still happen? 
*/ + ret = drop_buffers(page, &buffers_to_free); + goto out; + } + + spin_lock(&mapping->private_lock); + ret = drop_buffers(page, &buffers_to_free); + + /* + * If the filesystem writes its buffers by hand (eg ext3) + * then we can have clean buffers against a dirty page. We + * clean the page here; otherwise the VM will never notice + * that the filesystem did any IO at all. + * + * Also, during truncate, discard_buffer will have marked all + * the page's buffers clean. We discover that here and clean + * the page also. + * + * private_lock must be held over this entire operation in order + * to synchronise against __set_page_dirty_buffers and prevent the + * dirty bit from being lost. + */ + if (ret) + cancel_dirty_page(page, PAGE_CACHE_SIZE); + spin_unlock(&mapping->private_lock); +out: + if (buffers_to_free) { + struct buffer_head *bh = buffers_to_free; + + do { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; + } while (bh != buffers_to_free); + } + return ret; +} +EXPORT_SYMBOL(try_to_free_buffers); + +/* + * There are no bdflush tunables left. But distributions are + * still running obsolete flush daemons, so we terminate them here. + * + * Use of bdflush() is deprecated and will be removed in a future kernel. + * The `flush-X' kernel threads fully replace bdflush daemons and this call. + */ +SYSCALL_DEFINE2(bdflush, int, func, long, data) +{ + static int msg_count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (msg_count < 5) { + msg_count++; + printk(KERN_INFO + "warning: process `%s' used the obsolete bdflush" + " system call\n", current->comm); + printk(KERN_INFO "Fix your initscripts?\n"); + } + + if (func == 1) + do_exit(0); + return 0; +} + +/* + * Buffer-head allocation + */ +static struct kmem_cache *bh_cachep; + +/* + * Once the number of bh's in the machine exceeds this level, we start + * stripping them in writeback. 
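The whole of try_to_free_buffers()/drop_buffers() above hinges on buffer_busy(): a buffer_head can be freed only if it has no elevated reference count and is neither dirty nor locked. Here is a minimal userspace sketch of that test; the BH_Dirty/BH_Lock bit positions are assumed values standing in for the kernel enum, which is not part of this patch.

/* Illustrative sketch of the buffer_busy() test used by drop_buffers();
 * not part of this patch.  Bit positions are assumed. */
#include <stdio.h>

#define BH_DIRTY	1	/* assumed position of BH_Dirty */
#define BH_LOCK		2	/* assumed position of BH_Lock */

struct fake_bh {
	int b_count;			/* stands in for the atomic b_count */
	unsigned long b_state;
};

static int buffer_busy(const struct fake_bh *bh)
{
	/* busy if referenced, dirty or locked; any of these blocks freeing */
	return bh->b_count ||
	       (bh->b_state & ((1UL << BH_DIRTY) | (1UL << BH_LOCK)));
}

int main(void)
{
	struct fake_bh clean = { 0, 0 };
	struct fake_bh dirty = { 0, 1UL << BH_DIRTY };
	struct fake_bh held  = { 1, 0 };

	printf("clean: %s\n", buffer_busy(&clean) ? "busy" : "freeable");
	printf("dirty: %s\n", buffer_busy(&dirty) ? "busy" : "freeable");
	printf("held : %s\n", buffer_busy(&held)  ? "busy" : "freeable");
	return 0;
}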
+ */ +static int max_buffer_heads; + +int buffer_heads_over_limit; + +struct bh_accounting { + int nr; /* Number of live bh's */ + int ratelimit; /* Limit cacheline bouncing */ +}; + +static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; + +static void recalc_bh_state(void) +{ + int i; + int tot = 0; + + if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096) + return; + __this_cpu_write(bh_accounting.ratelimit, 0); + for_each_online_cpu(i) + tot += per_cpu(bh_accounting, i).nr; + buffer_heads_over_limit = (tot > max_buffer_heads); +} + +struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) +{ + struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); + if (ret) { + INIT_LIST_HEAD(&ret->b_assoc_buffers); + preempt_disable(); + __this_cpu_inc(bh_accounting.nr); + recalc_bh_state(); + preempt_enable(); + } + return ret; +} +EXPORT_SYMBOL(alloc_buffer_head); + +void free_buffer_head(struct buffer_head *bh) +{ + BUG_ON(!list_empty(&bh->b_assoc_buffers)); + kmem_cache_free(bh_cachep, bh); + preempt_disable(); + __this_cpu_dec(bh_accounting.nr); + recalc_bh_state(); + preempt_enable(); +} +EXPORT_SYMBOL(free_buffer_head); + +static void buffer_exit_cpu(int cpu) +{ + int i; + struct bh_lru *b = &per_cpu(bh_lrus, cpu); + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); + per_cpu(bh_accounting, cpu).nr = 0; +} + +static int buffer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + buffer_exit_cpu((unsigned long)hcpu); + return NOTIFY_OK; +} + +/** + * bh_uptodate_or_lock - Test whether the buffer is uptodate + * @bh: struct buffer_head + * + * Return true if the buffer is up-to-date and false, + * with the buffer locked, if not. + */ +int bh_uptodate_or_lock(struct buffer_head *bh) +{ + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + if (!buffer_uptodate(bh)) + return 0; + unlock_buffer(bh); + } + return 1; +} +EXPORT_SYMBOL(bh_uptodate_or_lock); + +/** + * bh_submit_read - Submit a locked buffer for reading + * @bh: struct buffer_head + * + * Returns zero on success and -EIO on error. + */ +int bh_submit_read(struct buffer_head *bh) +{ + BUG_ON(!buffer_locked(bh)); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return 0; + return -EIO; +} +EXPORT_SYMBOL(bh_submit_read); + +void __init buffer_init(void) +{ + int nrpages; + + bh_cachep = kmem_cache_create("buffer_head", + sizeof(struct buffer_head), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), + NULL); + + /* + * Limit the bh occupancy to 10% of ZONE_NORMAL + */ + nrpages = (nr_free_buffer_pages() * 10) / 100; + max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); + hotcpu_notifier(buffer_cpu_notify, 0); +} diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig new file mode 100644 index 00000000..80e9c616 --- /dev/null +++ b/fs/cachefiles/Kconfig @@ -0,0 +1,39 @@ + +config CACHEFILES + tristate "Filesystem caching on files" + depends on FSCACHE && BLOCK + help + This permits use of a mounted filesystem as a cache for other + filesystems - primarily networking filesystems - thus allowing fast + local disk to enhance the speed of slower devices. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. 
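buffer_init() at the end of fs/buffer.c above caps buffer_head usage at roughly 10% of ZONE_NORMAL: it takes 10% of the free buffer pages and multiplies by how many buffer_heads fit in one page. The userspace sketch below reproduces that arithmetic with assumed numbers; on a real system the inputs come from nr_free_buffer_pages() and sizeof(struct buffer_head) at boot.

/* Illustrative sketch of the max_buffer_heads calculation in buffer_init();
 * not part of this patch.  All inputs are assumed example values. */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned long free_pages = 1UL << 18;	/* assume ~1GiB of buffer pages */
	unsigned long bh_size = 104;		/* assumed sizeof(struct buffer_head) */

	unsigned long nrpages = (free_pages * 10) / 100;	/* 10% cap */
	unsigned long max_buffer_heads = nrpages * (page_size / bh_size);

	printf("bh limit: %lu buffer_heads (%lu per page over %lu pages)\n",
	       max_buffer_heads, page_size / bh_size, nrpages);
	return 0;
}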
+ +config CACHEFILES_DEBUG + bool "Debug CacheFiles" + depends on CACHEFILES + help + This permits debugging to be dynamically enabled in the filesystem + caching on files module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/cachefiles/parameter/debug or + by including a debugging specifier in /etc/cachefilesd.conf. + +config CACHEFILES_HISTOGRAM + bool "Gather latency information on CacheFiles" + depends on CACHEFILES && PROC_FS + help + + This option causes latency information to be gathered on CacheFiles + operation and exported through file: + + /proc/fs/cachefiles/histogram + + The generation of this histogram adds a certain amount of overhead to + execution as there are a number of points at which data is gathered, + and on a multi-CPU system these may be on cachelines that keep + bouncing between CPUs. On the other hand, the histogram may be + useful for debugging purposes. Saying 'N' here is recommended. + + See Documentation/filesystems/caching/cachefiles.txt for more + information. diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile new file mode 100644 index 00000000..32cbab0f --- /dev/null +++ b/fs/cachefiles/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for caching in a mounted filesystem +# + +cachefiles-y := \ + bind.o \ + daemon.o \ + interface.o \ + key.o \ + main.o \ + namei.o \ + rdwr.o \ + security.o \ + xattr.o + +cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o + +obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c new file mode 100644 index 00000000..a2603e7c --- /dev/null +++ b/fs/cachefiles/bind.c @@ -0,0 +1,283 @@ +/* Bind and unbind a cache from the filesystem backing it + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches); + +/* + * bind a directory as a cache + */ +int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) +{ + _enter("{%u,%u,%u,%u,%u,%u},%s", + cache->frun_percent, + cache->fcull_percent, + cache->fstop_percent, + cache->brun_percent, + cache->bcull_percent, + cache->bstop_percent, + args); + + /* start by checking things over */ + ASSERT(cache->fstop_percent >= 0 && + cache->fstop_percent < cache->fcull_percent && + cache->fcull_percent < cache->frun_percent && + cache->frun_percent < 100); + + ASSERT(cache->bstop_percent >= 0 && + cache->bstop_percent < cache->bcull_percent && + cache->bcull_percent < cache->brun_percent && + cache->brun_percent < 100); + + if (*args) { + kerror("'bind' command doesn't take an argument"); + return -EINVAL; + } + + if (!cache->rootdirname) { + kerror("No cache directory specified"); + return -EINVAL; + } + + /* don't permit already bound caches to be re-bound */ + if (test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("Cache already bound"); + return -EBUSY; + } + + /* make sure we have copies of the tag and dirname strings */ + if (!cache->tag) { + /* the tag string is released by the fops->release() + * function, so we don't release it on error here */ + cache->tag = kstrdup("CacheFiles", GFP_KERNEL); + if (!cache->tag) + return -ENOMEM; + } + + /* add the cache */ + return cachefiles_daemon_add_cache(cache); +} + +/* + * add a cache + */ +static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) +{ + struct cachefiles_object *fsdef; + struct path path; + struct kstatfs stats; + struct dentry *graveyard, *cachedir, *root; + const struct cred *saved_cred; + int ret; + + _enter(""); + + /* we want to work under the module's security ID */ + ret = cachefiles_get_security_ID(cache); + if (ret < 0) + return ret; + + cachefiles_begin_secure(cache, &saved_cred); + + /* allocate the root index object */ + ret = -ENOMEM; + + fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + if (!fsdef) + goto error_root_object; + + ASSERTCMP(fsdef->backer, ==, NULL); + + atomic_set(&fsdef->usage, 1); + fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; + + _debug("- fsdef %p", fsdef); + + /* look up the directory at the root of the cache */ + ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); + if (ret < 0) + goto error_open_root; + + cache->mnt = path.mnt; + root = path.dentry; + + /* check parameters */ + ret = -EOPNOTSUPP; + if (!root->d_inode || + !root->d_inode->i_op || + !root->d_inode->i_op->lookup || + !root->d_inode->i_op->mkdir || + !root->d_inode->i_op->setxattr || + !root->d_inode->i_op->getxattr || + !root->d_sb || + !root->d_sb->s_op || + !root->d_sb->s_op->statfs || + !root->d_sb->s_op->sync_fs) + goto error_unsupported; + + ret = -EROFS; + if (root->d_sb->s_flags & MS_RDONLY) + goto error_unsupported; + + /* determine the security of the on-disk cache as this governs + * security ID of files we create */ + ret = cachefiles_determine_cache_security(cache, root, &saved_cred); + if (ret < 0) + goto error_unsupported; + + /* get the cache size and blocksize */ + ret = vfs_statfs(&path, &stats); + if (ret < 0) + goto error_unsupported; + + ret = -ERANGE; + if (stats.f_bsize <= 0) + goto error_unsupported; + + ret = -EOPNOTSUPP; + if (stats.f_bsize > PAGE_SIZE) + goto error_unsupported; + + cache->bsize = 
stats.f_bsize; + cache->bshift = 0; + if (stats.f_bsize < PAGE_SIZE) + cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); + + _debug("blksize %u (shift %u)", + cache->bsize, cache->bshift); + + _debug("size %llu, avail %llu", + (unsigned long long) stats.f_blocks, + (unsigned long long) stats.f_bavail); + + /* set up caching limits */ + do_div(stats.f_files, 100); + cache->fstop = stats.f_files * cache->fstop_percent; + cache->fcull = stats.f_files * cache->fcull_percent; + cache->frun = stats.f_files * cache->frun_percent; + + _debug("limits {%llu,%llu,%llu} files", + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop); + + stats.f_blocks >>= cache->bshift; + do_div(stats.f_blocks, 100); + cache->bstop = stats.f_blocks * cache->bstop_percent; + cache->bcull = stats.f_blocks * cache->bcull_percent; + cache->brun = stats.f_blocks * cache->brun_percent; + + _debug("limits {%llu,%llu,%llu} blocks", + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop); + + /* get the cache directory and check its type */ + cachedir = cachefiles_get_directory(cache, root, "cache"); + if (IS_ERR(cachedir)) { + ret = PTR_ERR(cachedir); + goto error_unsupported; + } + + fsdef->dentry = cachedir; + fsdef->fscache.cookie = NULL; + + ret = cachefiles_check_object_type(fsdef); + if (ret < 0) + goto error_unsupported; + + /* get the graveyard directory */ + graveyard = cachefiles_get_directory(cache, root, "graveyard"); + if (IS_ERR(graveyard)) { + ret = PTR_ERR(graveyard); + goto error_unsupported; + } + + cache->graveyard = graveyard; + + /* publish the cache */ + fscache_init_cache(&cache->cache, + &cachefiles_cache_ops, + "%s", + fsdef->dentry->d_sb->s_id); + + fscache_object_init(&fsdef->fscache, NULL, &cache->cache); + + ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); + if (ret < 0) + goto error_add_cache; + + /* done */ + set_bit(CACHEFILES_READY, &cache->flags); + dput(root); + + printk(KERN_INFO "CacheFiles:" + " File cache on %s registered\n", + cache->cache.identifier); + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + cachefiles_end_secure(cache, saved_cred); + return 0; + +error_add_cache: + dput(cache->graveyard); + cache->graveyard = NULL; +error_unsupported: + mntput(cache->mnt); + cache->mnt = NULL; + dput(fsdef->dentry); + fsdef->dentry = NULL; + dput(root); +error_open_root: + kmem_cache_free(cachefiles_object_jar, fsdef); +error_root_object: + cachefiles_end_secure(cache, saved_cred); + kerror("Failed to register: %d", ret); + return ret; +} + +/* + * unbind a cache on fd release + */ +void cachefiles_daemon_unbind(struct cachefiles_cache *cache) +{ + _enter(""); + + if (test_bit(CACHEFILES_READY, &cache->flags)) { + printk(KERN_INFO "CacheFiles:" + " File cache on %s unregistering\n", + cache->cache.identifier); + + fscache_withdraw_cache(&cache->cache); + } + + dput(cache->graveyard); + mntput(cache->mnt); + + kfree(cache->rootdirname); + kfree(cache->secctx); + kfree(cache->tag); + + _leave(""); +} diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c new file mode 100644 index 00000000..0a1467b1 --- /dev/null +++ b/fs/cachefiles/daemon.c @@ -0,0 +1,752 @@ +/* Daemon interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int cachefiles_daemon_open(struct inode *, struct file *); +static int cachefiles_daemon_release(struct inode *, struct file *); +static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t, + loff_t *); +static ssize_t cachefiles_daemon_write(struct file *, const char __user *, + size_t, loff_t *); +static unsigned int cachefiles_daemon_poll(struct file *, + struct poll_table_struct *); +static int cachefiles_daemon_frun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_brun(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *); +static int cachefiles_daemon_cull(struct cachefiles_cache *, char *); +static int cachefiles_daemon_debug(struct cachefiles_cache *, char *); +static int cachefiles_daemon_dir(struct cachefiles_cache *, char *); +static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *); +static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *); +static int cachefiles_daemon_tag(struct cachefiles_cache *, char *); + +static unsigned long cachefiles_open; + +const struct file_operations cachefiles_daemon_fops = { + .owner = THIS_MODULE, + .open = cachefiles_daemon_open, + .release = cachefiles_daemon_release, + .read = cachefiles_daemon_read, + .write = cachefiles_daemon_write, + .poll = cachefiles_daemon_poll, + .llseek = noop_llseek, +}; + +struct cachefiles_daemon_cmd { + char name[8]; + int (*handler)(struct cachefiles_cache *cache, char *args); +}; + +static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { + { "bind", cachefiles_daemon_bind }, + { "brun", cachefiles_daemon_brun }, + { "bcull", cachefiles_daemon_bcull }, + { "bstop", cachefiles_daemon_bstop }, + { "cull", cachefiles_daemon_cull }, + { "debug", cachefiles_daemon_debug }, + { "dir", cachefiles_daemon_dir }, + { "frun", cachefiles_daemon_frun }, + { "fcull", cachefiles_daemon_fcull }, + { "fstop", cachefiles_daemon_fstop }, + { "inuse", cachefiles_daemon_inuse }, + { "secctx", cachefiles_daemon_secctx }, + { "tag", cachefiles_daemon_tag }, + { "", NULL } +}; + + +/* + * do various checks + */ +static int cachefiles_daemon_open(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache; + + _enter(""); + + /* only the superuser may do this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* the cachefiles device may only be open once at a time */ + if (xchg(&cachefiles_open, 1) == 1) + return -EBUSY; + + /* allocate a cache record */ + cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL); + if (!cache) { + cachefiles_open = 0; + return -ENOMEM; + } + + mutex_init(&cache->daemon_mutex); + cache->active_nodes = RB_ROOT; + rwlock_init(&cache->active_lock); + init_waitqueue_head(&cache->daemon_pollwq); + + /* set default caching limits + * - limit at 1% free space and/or 
free files + * - cull below 5% free space and/or free files + * - cease culling above 7% free space and/or free files + */ + cache->frun_percent = 7; + cache->fcull_percent = 5; + cache->fstop_percent = 1; + cache->brun_percent = 7; + cache->bcull_percent = 5; + cache->bstop_percent = 1; + + file->private_data = cache; + cache->cachefilesd = file; + return 0; +} + +/* + * release a cache + */ +static int cachefiles_daemon_release(struct inode *inode, struct file *file) +{ + struct cachefiles_cache *cache = file->private_data; + + _enter(""); + + ASSERT(cache); + + set_bit(CACHEFILES_DEAD, &cache->flags); + + cachefiles_daemon_unbind(cache); + + ASSERT(!cache->active_nodes.rb_node); + + /* clean up the control file interface */ + cache->cachefilesd = NULL; + file->private_data = NULL; + cachefiles_open = 0; + + kfree(cache); + + _leave(""); + return 0; +} + +/* + * read the cache state + */ +static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, + size_t buflen, loff_t *pos) +{ + struct cachefiles_cache *cache = file->private_data; + char buffer[256]; + int n; + + //_enter(",,%zu,", buflen); + + if (!test_bit(CACHEFILES_READY, &cache->flags)) + return 0; + + /* check how much space the cache has */ + cachefiles_has_space(cache, 0, 0); + + /* summarise */ + clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + + n = snprintf(buffer, sizeof(buffer), + "cull=%c" + " frun=%llx" + " fcull=%llx" + " fstop=%llx" + " brun=%llx" + " bcull=%llx" + " bstop=%llx", + test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', + (unsigned long long) cache->frun, + (unsigned long long) cache->fcull, + (unsigned long long) cache->fstop, + (unsigned long long) cache->brun, + (unsigned long long) cache->bcull, + (unsigned long long) cache->bstop + ); + + if (n > buflen) + return -EMSGSIZE; + + if (copy_to_user(_buffer, buffer, n) != 0) + return -EFAULT; + + return n; +} + +/* + * command the cache + */ +static ssize_t cachefiles_daemon_write(struct file *file, + const char __user *_data, + size_t datalen, + loff_t *pos) +{ + const struct cachefiles_daemon_cmd *cmd; + struct cachefiles_cache *cache = file->private_data; + ssize_t ret; + char *data, *args, *cp; + + //_enter(",,%zu,", datalen); + + ASSERT(cache); + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) + return -EIO; + + if (datalen < 0 || datalen > PAGE_SIZE - 1) + return -EOPNOTSUPP; + + /* drag the command string into the kernel so we can parse it */ + data = kmalloc(datalen + 1, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(data, _data, datalen) != 0) + goto error; + + data[datalen] = '\0'; + + ret = -EINVAL; + if (memchr(data, '\0', datalen)) + goto error; + + /* strip any newline */ + cp = memchr(data, '\n', datalen); + if (cp) { + if (cp == data) + goto error; + + *cp = '\0'; + } + + /* parse the command */ + ret = -EOPNOTSUPP; + + for (args = data; *args; args++) + if (isspace(*args)) + break; + if (*args) { + if (args == data) + goto error; + *args = '\0'; + args = skip_spaces(++args); + } + + /* run the appropriate command handler */ + for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++) + if (strcmp(cmd->name, data) == 0) + goto found_command; + +error: + kfree(data); + //_leave(" = %zd", ret); + return ret; + +found_command: + mutex_lock(&cache->daemon_mutex); + + ret = -EIO; + if (!test_bit(CACHEFILES_DEAD, &cache->flags)) + ret = cmd->handler(cache, args); + + mutex_unlock(&cache->daemon_mutex); + + if (ret == 0) + ret = datalen; + goto error; +} + +/* + * poll for culling 
state + * - use POLLOUT to indicate culling state + */ +static unsigned int cachefiles_daemon_poll(struct file *file, + struct poll_table_struct *poll) +{ + struct cachefiles_cache *cache = file->private_data; + unsigned int mask; + + poll_wait(file, &cache->daemon_pollwq, poll); + mask = 0; + + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= POLLIN; + + if (test_bit(CACHEFILES_CULLING, &cache->flags)) + mask |= POLLOUT; + + return mask; +} + +/* + * give a range error for cache space constraints + * - can be tail-called + */ +static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, + char *args) +{ + kerror("Free space limits must be in range" + " 0%%<=stop%" + */ +static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args) +{ + unsigned long frun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + frun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (frun <= cache->fcull_percent || frun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->frun_percent = frun; + return 0; +} + +/* + * set the percentage of files at which to start culling + * - command: "fcull %" + */ +static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long fcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fcull_percent = fcull; + return 0; +} + +/* + * set the percentage of files at which to stop allocating + * - command: "fstop %" + */ +static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long fstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + fstop = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (fstop < 0 || fstop >= cache->fcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->fstop_percent = fstop; + return 0; +} + +/* + * set the percentage of blocks at which to stop culling + * - command: "brun %" + */ +static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args) +{ + unsigned long brun; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + brun = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (brun <= cache->bcull_percent || brun >= 100) + return cachefiles_daemon_range_error(cache, args); + + cache->brun_percent = brun; + return 0; +} + +/* + * set the percentage of blocks at which to start culling + * - command: "bcull %" + */ +static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args) +{ + unsigned long bcull; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bcull = simple_strtoul(args, &args, 10); + if (args[0] != '%' || args[1] != '\0') + return -EINVAL; + + if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bcull_percent = bcull; + return 0; +} + +/* + * set the percentage of blocks at which to stop allocating + * - command: "bstop %" + */ +static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) +{ + unsigned long bstop; + + _enter(",%s", args); + + if (!*args) + return -EINVAL; + + bstop = simple_strtoul(args, &args, 10); + if (args[0] != 
'%' || args[1] != '\0') + return -EINVAL; + + if (bstop < 0 || bstop >= cache->bcull_percent) + return cachefiles_daemon_range_error(cache, args); + + cache->bstop_percent = bstop; + return 0; +} + +/* + * set the cache directory + * - command: "dir " + */ +static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) +{ + char *dir; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty directory specified"); + return -EINVAL; + } + + if (cache->rootdirname) { + kerror("Second cache directory specified"); + return -EEXIST; + } + + dir = kstrdup(args, GFP_KERNEL); + if (!dir) + return -ENOMEM; + + cache->rootdirname = dir; + return 0; +} + +/* + * set the cache security context + * - command: "secctx " + */ +static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) +{ + char *secctx; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty security context specified"); + return -EINVAL; + } + + if (cache->secctx) { + kerror("Second security context specified"); + return -EINVAL; + } + + secctx = kstrdup(args, GFP_KERNEL); + if (!secctx) + return -ENOMEM; + + cache->secctx = secctx; + return 0; +} + +/* + * set the cache tag + * - command: "tag " + */ +static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) +{ + char *tag; + + _enter(",%s", args); + + if (!*args) { + kerror("Empty tag specified"); + return -EINVAL; + } + + if (cache->tag) + return -EEXIST; + + tag = kstrdup(args, GFP_KERNEL); + if (!tag) + return -ENOMEM; + + cache->tag = tag; + return 0; +} + +/* + * request a node in the cache be culled from the current working directory + * - command: "cull " + */ +static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) +{ + struct path path; + const struct cred *saved_cred; + int ret; + + _enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("cull applied to unready cache"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + kerror("cull applied to dead cache"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + get_fs_pwd(current->fs, &path); + + if (!S_ISDIR(path.dentry->d_inode->i_mode)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_cull(cache, path.dentry, args); + cachefiles_end_secure(cache, saved_cred); + + path_put(&path); + _leave(" = %d", ret); + return ret; + +notdir: + path_put(&path); + kerror("cull command requires dirfd to be a directory"); + return -ENOTDIR; + +inval: + kerror("cull command requires dirfd and filename"); + return -EINVAL; +} + +/* + * set debugging mode + * - command: "debug " + */ +static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) +{ + unsigned long mask; + + _enter(",%s", args); + + mask = simple_strtoul(args, &args, 0); + if (args[0] != '\0') + goto inval; + + cachefiles_debug = mask; + _leave(" = 0"); + return 0; + +inval: + kerror("debug command requires mask"); + return -EINVAL; +} + +/* + * find out whether an object in the current working directory is in use or not + * - command: "inuse " + */ +static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) +{ + struct path path; + const struct cred *saved_cred; + int ret; + + //_enter(",%s", args); + + if (strchr(args, '/')) + goto inval; + + if (!test_bit(CACHEFILES_READY, &cache->flags)) { + kerror("inuse applied to unready cache"); + return -EIO; + } + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + 
kerror("inuse applied to dead cache"); + return -EIO; + } + + /* extract the directory dentry from the cwd */ + get_fs_pwd(current->fs, &path); + + if (!S_ISDIR(path.dentry->d_inode->i_mode)) + goto notdir; + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_check_in_use(cache, path.dentry, args); + cachefiles_end_secure(cache, saved_cred); + + path_put(&path); + //_leave(" = %d", ret); + return ret; + +notdir: + path_put(&path); + kerror("inuse command requires dirfd to be a directory"); + return -ENOTDIR; + +inval: + kerror("inuse command requires dirfd and filename"); + return -EINVAL; +} + +/* + * see if we have space for a number of pages and/or a number of files in the + * cache + */ +int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr) +{ + struct kstatfs stats; + struct path path = { + .mnt = cache->mnt, + .dentry = cache->mnt->mnt_root, + }; + int ret; + + //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", + // (unsigned long long) cache->frun, + // (unsigned long long) cache->fcull, + // (unsigned long long) cache->fstop, + // (unsigned long long) cache->brun, + // (unsigned long long) cache->bcull, + // (unsigned long long) cache->bstop, + // fnr, bnr); + + /* find out how many pages of blockdev are available */ + memset(&stats, 0, sizeof(stats)); + + ret = vfs_statfs(&path, &stats); + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error(cache, "statfs failed"); + _leave(" = %d", ret); + return ret; + } + + stats.f_bavail >>= cache->bshift; + + //_debug("avail %llu,%llu", + // (unsigned long long) stats.f_ffree, + // (unsigned long long) stats.f_bavail); + + /* see if there is sufficient space */ + if (stats.f_ffree > fnr) + stats.f_ffree -= fnr; + else + stats.f_ffree = 0; + + if (stats.f_bavail > bnr) + stats.f_bavail -= bnr; + else + stats.f_bavail = 0; + + ret = -ENOBUFS; + if (stats.f_ffree < cache->fstop || + stats.f_bavail < cache->bstop) + goto begin_cull; + + ret = 0; + if (stats.f_ffree < cache->fcull || + stats.f_bavail < cache->bcull) + goto begin_cull; + + if (test_bit(CACHEFILES_CULLING, &cache->flags) && + stats.f_ffree >= cache->frun && + stats.f_bavail >= cache->brun && + test_and_clear_bit(CACHEFILES_CULLING, &cache->flags) + ) { + _debug("cease culling"); + cachefiles_state_changed(cache); + } + + //_leave(" = 0"); + return 0; + +begin_cull: + if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) { + _debug("### CULL CACHE ###"); + cachefiles_state_changed(cache); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c new file mode 100644 index 00000000..1064805e --- /dev/null +++ b/fs/cachefiles/interface.c @@ -0,0 +1,470 @@ +/* FS-Cache interface to CacheFiles + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include "internal.h" + +#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) + +struct cachefiles_lookup_data { + struct cachefiles_xattr *auxdata; /* auxiliary data */ + char *key; /* key path */ +}; + +static int cachefiles_attr_changed(struct fscache_object *_object); + +/* + * allocate an object record for a cookie lookup and prepare the lookup data + */ +static struct fscache_object *cachefiles_alloc_object( + struct fscache_cache *_cache, + struct fscache_cookie *cookie) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct cachefiles_xattr *auxdata; + unsigned keylen, auxlen; + void *buffer; + char *key; + + cache = container_of(_cache, struct cachefiles_cache, cache); + + _enter("{%s},%p,", cache->cache.identifier, cookie); + + lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL); + if (!lookup_data) + goto nomem_lookup_data; + + /* create a new object record and a temporary leaf image */ + object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); + if (!object) + goto nomem_object; + + ASSERTCMP(object->backer, ==, NULL); + + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + atomic_set(&object->usage, 1); + + fscache_object_init(&object->fscache, cookie, &cache->cache); + + object->type = cookie->def->type; + + /* get hold of the raw key + * - stick the length on the front and leave space on the back for the + * encoder + */ + buffer = kmalloc((2 + 512) + 3, GFP_KERNEL); + if (!buffer) + goto nomem_buffer; + + keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512); + ASSERTCMP(keylen, <, 512); + + *(uint16_t *)buffer = keylen; + ((char *)buffer)[keylen + 2] = 0; + ((char *)buffer)[keylen + 3] = 0; + ((char *)buffer)[keylen + 4] = 0; + + /* turn the raw key into something that can work with as a filename */ + key = cachefiles_cook_key(buffer, keylen + 2, object->type); + if (!key) + goto nomem_key; + + /* get hold of the auxiliary data and prepend the object type */ + auxdata = buffer; + auxlen = 0; + if (cookie->def->get_aux) { + auxlen = cookie->def->get_aux(cookie->netfs_data, + auxdata->data, 511); + ASSERTCMP(auxlen, <, 511); + } + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + lookup_data->auxdata = auxdata; + lookup_data->key = key; + object->lookup_data = lookup_data; + + _leave(" = %p [%p]", &object->fscache, lookup_data); + return &object->fscache; + +nomem_key: + kfree(buffer); +nomem_buffer: + BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(&cache->cache); +nomem_object: + kfree(lookup_data); +nomem_lookup_data: + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * attempt to look up the nominated node in this cache + * - return -ETIMEDOUT to be scheduled again + */ +static int cachefiles_lookup_object(struct fscache_object *_object) +{ + struct cachefiles_lookup_data *lookup_data; + struct cachefiles_object *parent, *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("{OBJ%x}", _object->debug_id); + + cache = container_of(_object->cache, struct cachefiles_cache, cache); + parent = container_of(_object->parent, + struct cachefiles_object, fscache); + object = container_of(_object, struct cachefiles_object, fscache); + lookup_data = object->lookup_data; + + ASSERTCMP(lookup_data, !=, NULL); + + /* look up the key, creating any missing bits */ + 
cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_walk_to_object(parent, object, + lookup_data->key, + lookup_data->auxdata); + cachefiles_end_secure(cache, saved_cred); + + /* polish off by setting the attributes of non-index files */ + if (ret == 0 && + object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + cachefiles_attr_changed(&object->fscache); + + if (ret < 0 && ret != -ETIMEDOUT) { + if (ret != -ENOBUFS) + printk(KERN_WARNING + "CacheFiles: Lookup failed error %d\n", ret); + fscache_object_lookup_error(&object->fscache); + } + + _leave(" [%d]", ret); + return ret; +} + +/* + * indication of lookup completion + */ +static void cachefiles_lookup_complete(struct fscache_object *_object) +{ + struct cachefiles_object *object; + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } +} + +/* + * increment the usage count on an inode object (may fail if unmounting) + */ +static +struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) +{ + struct cachefiles_object *object = + container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + atomic_inc(&object->usage); + return &object->fscache; +} + +/* + * update the auxiliary data for an object object on disk + */ +static void cachefiles_update_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_xattr *auxdata; + struct cachefiles_cache *cache; + struct fscache_cookie *cookie; + const struct cred *saved_cred; + unsigned auxlen; + + _enter("{OBJ%x}", _object->debug_id); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, struct cachefiles_cache, + cache); + cookie = object->fscache.cookie; + + if (!cookie->def->get_aux) { + _leave(" [no aux]"); + return; + } + + auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL); + if (!auxdata) { + _leave(" [nomem]"); + return; + } + + auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); + ASSERTCMP(auxlen, <, 511); + + auxdata->len = auxlen + 1; + auxdata->type = cookie->def->type; + + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_update_object_xattr(object, auxdata); + cachefiles_end_secure(cache, saved_cred); + kfree(auxdata); + _leave(""); +} + +/* + * discard the resources pinned by an object and effect retirement if + * requested + */ +static void cachefiles_drop_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + /* delete retired objects */ + if (object->fscache.state == FSCACHE_OBJECT_RECYCLING && + _object != cache->cache.fsdef + ) { + _debug("- retire object OBJ%x", object->fscache.debug_id); + cachefiles_begin_secure(cache, 
&saved_cred); + cachefiles_delete_object(cache, object); + cachefiles_end_secure(cache, saved_cred); + } + + /* close the filesystem stuff attached to the object */ + if (object->backer != object->dentry) + dput(object->backer); + object->backer = NULL; + + /* note that the object is now inactive */ + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + write_lock(&cache->active_lock); + if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE, + &object->flags)) + BUG(); + rb_erase(&object->active_node, &cache->active_nodes); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); + } + + dput(object->dentry); + object->dentry = NULL; + + _leave(""); +} + +/* + * dispose of a reference to an object + */ +static void cachefiles_put_object(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct fscache_cache *cache; + + ASSERT(_object); + + object = container_of(_object, struct cachefiles_object, fscache); + + _enter("{OBJ%x,%d}", + object->fscache.debug_id, atomic_read(&object->usage)); + +#ifdef CACHEFILES_DEBUG_SLAB + ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); +#endif + + ASSERTIFCMP(object->fscache.parent, + object->fscache.parent->n_children, >, 0); + + if (atomic_dec_and_test(&object->usage)) { + _debug("- kill object OBJ%x", object->fscache.debug_id); + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + ASSERTCMP(object->fscache.parent, ==, NULL); + ASSERTCMP(object->backer, ==, NULL); + ASSERTCMP(object->dentry, ==, NULL); + ASSERTCMP(object->fscache.n_ops, ==, 0); + ASSERTCMP(object->fscache.n_children, ==, 0); + + if (object->lookup_data) { + kfree(object->lookup_data->key); + kfree(object->lookup_data->auxdata); + kfree(object->lookup_data); + object->lookup_data = NULL; + } + + cache = object->fscache.cache; + fscache_object_destroy(&object->fscache); + kmem_cache_free(cachefiles_object_jar, object); + fscache_object_destroyed(cache); + } + + _leave(""); +} + +/* + * sync a cache + */ +static void cachefiles_sync_cache(struct fscache_cache *_cache) +{ + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("%p", _cache); + + cache = container_of(_cache, struct cachefiles_cache, cache); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disc */ + cachefiles_begin_secure(cache, &saved_cred); + down_read(&cache->mnt->mnt_sb->s_umount); + ret = sync_filesystem(cache->mnt->mnt_sb); + up_read(&cache->mnt->mnt_sb->s_umount); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) + cachefiles_io_error(cache, + "Attempt to sync backing fs superblock" + " returned error %d", + ret); +} + +/* + * notification the attributes on an object have changed + * - called with reads/writes excluded by FS-Cache + */ +static int cachefiles_attr_changed(struct fscache_object *_object) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + struct iattr newattrs; + uint64_t ni_size; + loff_t oi_size; + int ret; + + _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size); + + _enter("{OBJ%x},[%llu]", + _object->debug_id, (unsigned long long) ni_size); + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + if (ni_size == object->i_size) + return 0; + + if (!object->backer) + return -ENOBUFS; + + ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + + 
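+	/* record the netfs's idea of the object size as the store limit
+	 * before adjusting the backing file's i_size below */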
fscache_set_store_limit(&object->fscache, ni_size); + + oi_size = i_size_read(object->backer->d_inode); + if (oi_size == ni_size) + return 0; + + cachefiles_begin_secure(cache, &saved_cred); + mutex_lock(&object->backer->d_inode->i_mutex); + + /* if there's an extension to a partial page at the end of the backing + * file, we need to discard the partial page so that we pick up new + * data after it */ + if (oi_size & ~PAGE_MASK && ni_size > oi_size) { + _debug("discard tail %llx", oi_size); + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = oi_size & PAGE_MASK; + ret = notify_change(object->backer, &newattrs); + if (ret < 0) + goto truncate_failed; + } + + newattrs.ia_valid = ATTR_SIZE; + newattrs.ia_size = ni_size; + ret = notify_change(object->backer, &newattrs); + +truncate_failed: + mutex_unlock(&object->backer->d_inode->i_mutex); + cachefiles_end_secure(cache, saved_cred); + + if (ret == -EIO) { + fscache_set_store_limit(&object->fscache, 0); + cachefiles_io_error_obj(object, "Size set failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * dissociate a cache from all the pages it was backing + */ +static void cachefiles_dissociate_pages(struct fscache_cache *cache) +{ + _enter(""); +} + +const struct fscache_cache_ops cachefiles_cache_ops = { + .name = "cachefiles", + .alloc_object = cachefiles_alloc_object, + .lookup_object = cachefiles_lookup_object, + .lookup_complete = cachefiles_lookup_complete, + .grab_object = cachefiles_grab_object, + .update_object = cachefiles_update_object, + .drop_object = cachefiles_drop_object, + .put_object = cachefiles_put_object, + .sync_cache = cachefiles_sync_cache, + .attr_changed = cachefiles_attr_changed, + .read_or_alloc_page = cachefiles_read_or_alloc_page, + .read_or_alloc_pages = cachefiles_read_or_alloc_pages, + .allocate_page = cachefiles_allocate_page, + .allocate_pages = cachefiles_allocate_pages, + .write_page = cachefiles_write_page, + .uncache_page = cachefiles_uncache_page, + .dissociate_pages = cachefiles_dissociate_pages, +}; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h new file mode 100644 index 00000000..bd6bc1bd --- /dev/null +++ b/fs/cachefiles/internal.h @@ -0,0 +1,354 @@ +/* General netfs cache on cache files internal defs + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +struct cachefiles_cache; +struct cachefiles_object; + +extern unsigned cachefiles_debug; +#define CACHEFILES_DEBUG_KENTER 1 +#define CACHEFILES_DEBUG_KLEAVE 2 +#define CACHEFILES_DEBUG_KDEBUG 4 + +/* + * node records + */ +struct cachefiles_object { + struct fscache_object fscache; /* fscache handle */ + struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ + struct dentry *dentry; /* the file/dir representing this object */ + struct dentry *backer; /* backing file */ + loff_t i_size; /* object size */ + unsigned long flags; +#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ +#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ + atomic_t usage; /* object usage count */ + uint8_t type; /* object type */ + uint8_t new; /* T if object new */ + spinlock_t work_lock; + struct rb_node active_node; /* link in active tree (dentry is key) */ +}; + +extern struct kmem_cache *cachefiles_object_jar; + +/* + * Cache files cache definition + */ +struct cachefiles_cache { + struct fscache_cache cache; /* FS-Cache record */ + struct vfsmount *mnt; /* mountpoint holding the cache */ + struct dentry *graveyard; /* directory into which dead objects go */ + struct file *cachefilesd; /* manager daemon handle */ + const struct cred *cache_cred; /* security override for accessing cache */ + struct mutex daemon_mutex; /* command serialisation mutex */ + wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */ + struct rb_root active_nodes; /* active nodes (can't be culled) */ + rwlock_t active_lock; /* lock for active_nodes */ + atomic_t gravecounter; /* graveyard uniquifier */ + unsigned frun_percent; /* when to stop culling (% files) */ + unsigned fcull_percent; /* when to start culling (% files) */ + unsigned fstop_percent; /* when to stop allocating (% files) */ + unsigned brun_percent; /* when to stop culling (% blocks) */ + unsigned bcull_percent; /* when to start culling (% blocks) */ + unsigned bstop_percent; /* when to stop allocating (% blocks) */ + unsigned bsize; /* cache's block size */ + unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ + uint64_t frun; /* when to stop culling */ + uint64_t fcull; /* when to start culling */ + uint64_t fstop; /* when to stop allocating */ + sector_t brun; /* when to stop culling */ + sector_t bcull; /* when to start culling */ + sector_t bstop; /* when to stop allocating */ + unsigned long flags; +#define CACHEFILES_READY 0 /* T if cache prepared */ +#define CACHEFILES_DEAD 1 /* T if cache dead */ +#define CACHEFILES_CULLING 2 /* T if cull engaged */ +#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ + char *rootdirname; /* name of cache root directory */ + char *secctx; /* LSM security context */ + char *tag; /* cache binding tag */ +}; + +/* + * backing file read tracking + */ +struct cachefiles_one_read { + wait_queue_t monitor; /* link into monitored waitqueue */ + struct page *back_page; /* backing file page we're waiting for */ + struct page *netfs_page; /* netfs page we're going to fill */ + struct fscache_retrieval *op; /* retrieval op covering this */ + struct list_head op_link; /* link in op's todo list */ +}; + +/* + * backing file write tracking + */ +struct cachefiles_one_write { + struct page *netfs_page; /* netfs page to copy */ + struct cachefiles_object *object; + struct list_head obj_link; /* link in object's lists */ + fscache_rw_complete_t end_io_func; + void *context; +}; + +/* + * auxiliary data xattr buffer 
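+ * - len covers the type byte plus the netfs auxiliary data held in
+ *   data[] (filled in by cachefiles_alloc_object() and
+ *   cachefiles_update_object())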
+ */ +struct cachefiles_xattr { + uint16_t len; + uint8_t type; + uint8_t data[]; +}; + +/* + * note change of state for daemon + */ +static inline void cachefiles_state_changed(struct cachefiles_cache *cache) +{ + set_bit(CACHEFILES_STATE_CHANGED, &cache->flags); + wake_up_all(&cache->daemon_pollwq); +} + +/* + * bind.c + */ +extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); +extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); + +/* + * daemon.c + */ +extern const struct file_operations cachefiles_daemon_fops; + +extern int cachefiles_has_space(struct cachefiles_cache *cache, + unsigned fnr, unsigned bnr); + +/* + * interface.c + */ +extern const struct fscache_cache_ops cachefiles_cache_ops; + +/* + * key.c + */ +extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); + +/* + * namei.c + */ +extern int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object); +extern int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata); +extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *name); + +extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename); + +extern int cachefiles_check_in_use(struct cachefiles_cache *cache, + struct dentry *dir, char *filename); + +/* + * proc.c + */ +#ifdef CONFIG_CACHEFILES_HISTOGRAM +extern atomic_t cachefiles_lookup_histogram[HZ]; +extern atomic_t cachefiles_mkdir_histogram[HZ]; +extern atomic_t cachefiles_create_histogram[HZ]; + +extern int __init cachefiles_proc_init(void); +extern void cachefiles_proc_cleanup(void); +static inline +void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +#else +#define cachefiles_proc_init() (0) +#define cachefiles_proc_cleanup() do {} while (0) +#define cachefiles_hist(hist, start_jif) do {} while (0) +#endif + +/* + * rdwr.c + */ +extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, + struct page *, gfp_t); +extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, + gfp_t); +extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, + gfp_t); +extern int cachefiles_allocate_pages(struct fscache_retrieval *, + struct list_head *, unsigned *, gfp_t); +extern int cachefiles_write_page(struct fscache_storage *, struct page *); +extern void cachefiles_uncache_page(struct fscache_object *, struct page *); + +/* + * security.c + */ +extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); +extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred); + +static inline void cachefiles_begin_secure(struct cachefiles_cache *cache, + const struct cred **_saved_cred) +{ + *_saved_cred = override_creds(cache->cache_cred); +} + +static inline void cachefiles_end_secure(struct cachefiles_cache *cache, + const struct cred *saved_cred) +{ + revert_creds(saved_cred); +} + +/* + * xattr.c + */ +extern int cachefiles_check_object_type(struct cachefiles_object *object); +extern int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_update_object_xattr(struct cachefiles_object *object, 
+ struct cachefiles_xattr *auxdata); +extern int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata); +extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry); + + +/* + * error handling + */ +#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__) + +#define cachefiles_io_error(___cache, FMT, ...) \ +do { \ + kerror("I/O Error: " FMT, ##__VA_ARGS__); \ + fscache_io_error(&(___cache)->cache); \ + set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ +} while (0) + +#define cachefiles_io_error_obj(object, FMT, ...) \ +do { \ + struct cachefiles_cache *___cache; \ + \ + ___cache = container_of((object)->fscache.cache, \ + struct cachefiles_cache, cache); \ + cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \ +} while (0) + + +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_CACHEFILES_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) +#endif + +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c new file mode 100644 index 00000000..81b8b2b3 --- /dev/null +++ b/fs/cachefiles/key.c @@ -0,0 +1,159 @@ +/* Key to pathname encoder + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include "internal.h" + +static const char cachefiles_charmap[64] = + "0123456789" /* 0 - 9 */ + "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */ + "_-" /* 62 - 63 */ + ; + +static const char cachefiles_filecharmap[256] = { + /* we skip space and tab and control chars */ + [33 ... 46] = 1, /* '!' -> '.' */ + /* we skip '/' as it's significant to pathwalk */ + [48 ... 127] = 1, /* '0' -> '~' */ +}; + +/* + * turn the raw key into something cooked + * - the raw key should include the length in the two bytes at the front + * - the key may be up to 514 bytes in length (including the length word) + * - "base64" encode the strange keys, mapping 3 bytes of raw to four of + * cooked + * - need to cut the cooked key into 252 char lengths (189 raw bytes) + */ +char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) +{ + unsigned char csum, ch; + unsigned int acc; + char *key; + int loop, len, max, seg, mark, print; + + _enter(",%d", keylen); + + BUG_ON(keylen < 2 || keylen > 514); + + csum = raw[0] + raw[1]; + print = 1; + for (loop = 2; loop < keylen; loop++) { + ch = raw[loop]; + csum += ch; + print &= cachefiles_filecharmap[ch]; + } + + if (print) { + /* if the path is usable ASCII, then we render it directly */ + max = keylen - 2; + max += 2; /* two base64'd length chars on the front */ + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 251) / 252) = 3 + */ + max += 1; /* NUL on end */ + } else { + /* calculate the maximum length of the cooked key */ + keylen = (keylen + 2) / 3; + + max = keylen * 4; + max += 5; /* @checksum/M */ + max += 3 * 2; /* maximum number of segment dividers (".../M") + * is ((514 + 188) / 189) = 3 + */ + max += 1; /* NUL on end */ + } + + max += 1; /* 2nd NUL on end */ + + _debug("max: %d", max); + + key = kmalloc(max, GFP_KERNEL); + if (!key) + return NULL; + + len = 0; + + /* build the cooked key */ + sprintf(key, "@%02x%c+", (unsigned) csum, 0); + len = 5; + mark = len - 1; + + if (print) { + acc = *(uint16_t *) raw; + raw += 2; + + key[len + 1] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len] = cachefiles_charmap[acc & 63]; + len += 2; + + seg = 250; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + key[len++] = *raw++; + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break; + default: type = 'S'; break; + } + } else { + seg = 252; + for (loop = keylen; loop > 0; loop--) { + if (seg <= 0) { + key[len++] = '\0'; + mark = len; + key[len++] = '+'; + seg = 252; + } + + acc = *raw++; + acc |= *raw++ << 8; + acc |= *raw++ << 16; + + _debug("acc: %06x", acc); + + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[len++] = cachefiles_charmap[acc & 63]; + + ASSERT(len < max); + } + + switch (type) { + case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break; + case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break; + default: type = 'T'; 
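+		/* 'J', 'E' and 'T' correspond to 'I', 'D' and 'S' above,
+		 * but mark a base64-encoded key */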
break; + } + } + + key[mark] = type; + key[len++] = 0; + key[len] = 0; + + _leave(" = %p %d", key, len); + return key; +} diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c new file mode 100644 index 00000000..4bfa8cf4 --- /dev/null +++ b/fs/cachefiles/main.c @@ -0,0 +1,106 @@ +/* Network filesystem caching backend to use cache files on a premounted + * filesystem + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +unsigned cachefiles_debug; +module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask"); + +MODULE_DESCRIPTION("Mounted-filesystem based cache"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +struct kmem_cache *cachefiles_object_jar; + +static struct miscdevice cachefiles_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cachefiles", + .fops = &cachefiles_daemon_fops, +}; + +static void cachefiles_object_init_once(void *_object) +{ + struct cachefiles_object *object = _object; + + memset(object, 0, sizeof(*object)); + spin_lock_init(&object->work_lock); +} + +/* + * initialise the fs caching module + */ +static int __init cachefiles_init(void) +{ + int ret; + + ret = misc_register(&cachefiles_dev); + if (ret < 0) + goto error_dev; + + /* create an object jar */ + ret = -ENOMEM; + cachefiles_object_jar = + kmem_cache_create("cachefiles_object_jar", + sizeof(struct cachefiles_object), + 0, + SLAB_HWCACHE_ALIGN, + cachefiles_object_init_once); + if (!cachefiles_object_jar) { + printk(KERN_NOTICE + "CacheFiles: Failed to allocate an object jar\n"); + goto error_object_jar; + } + + ret = cachefiles_proc_init(); + if (ret < 0) + goto error_proc; + + printk(KERN_INFO "CacheFiles: Loaded\n"); + return 0; + +error_proc: + kmem_cache_destroy(cachefiles_object_jar); +error_object_jar: + misc_deregister(&cachefiles_dev); +error_dev: + kerror("failed to register: %d", ret); + return ret; +} + +fs_initcall(cachefiles_init); + +/* + * clean up on module removal + */ +static void __exit cachefiles_exit(void) +{ + printk(KERN_INFO "CacheFiles: Unloading\n"); + + cachefiles_proc_cleanup(); + kmem_cache_destroy(cachefiles_object_jar); + misc_deregister(&cachefiles_dev); +} + +module_exit(cachefiles_exit); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c new file mode 100644 index 00000000..a0358c21 --- /dev/null +++ b/fs/cachefiles/namei.c @@ -0,0 +1,988 @@ +/* CacheFiles path walking and related routines + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define CACHEFILES_KEYBUF_SIZE 512 + +/* + * dump debugging info about an object + */ +static noinline +void __cachefiles_printk_object(struct cachefiles_object *object, + const char *prefix, + u8 *keybuf) +{ + struct fscache_cookie *cookie; + unsigned keylen, loop; + + printk(KERN_ERR "%sobject: OBJ%x\n", + prefix, object->fscache.debug_id); + printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", + prefix, fscache_object_states[object->fscache.state], + object->fscache.flags, work_busy(&object->fscache.work), + object->fscache.events, + object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); + printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", + prefix, object->fscache.n_ops, object->fscache.n_in_progress, + object->fscache.n_exclusive); + printk(KERN_ERR "%sparent=%p\n", + prefix, object->fscache.parent); + + spin_lock(&object->fscache.lock); + cookie = object->fscache.cookie; + if (cookie) { + printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n", + prefix, + object->fscache.cookie, + object->fscache.cookie->parent, + object->fscache.cookie->netfs_data, + object->fscache.cookie->flags); + if (keybuf) + keylen = cookie->def->get_key(cookie->netfs_data, keybuf, + CACHEFILES_KEYBUF_SIZE); + else + keylen = 0; + } else { + printk(KERN_ERR "%scookie=NULL\n", prefix); + keylen = 0; + } + spin_unlock(&object->fscache.lock); + + if (keylen) { + printk(KERN_ERR "%skey=[%u] '", prefix, keylen); + for (loop = 0; loop < keylen; loop++) + printk("%02x", keybuf[loop]); + printk("'\n"); + } +} + +/* + * dump debugging info about a pair of objects + */ +static noinline void cachefiles_printk_object(struct cachefiles_object *object, + struct cachefiles_object *xobject) +{ + u8 *keybuf; + + keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); + if (object) + __cachefiles_printk_object(object, "", keybuf); + if (xobject) + __cachefiles_printk_object(xobject, "x", keybuf); + kfree(keybuf); +} + +/* + * mark the owner of a dentry, if there is one, to indicate that that dentry + * has been preemptively deleted + * - the caller must hold the i_mutex on the dentry's parent as required to + * call vfs_unlink(), vfs_rmdir() or vfs_rename() + */ +static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + struct cachefiles_object *object; + struct rb_node *p; + + _enter(",'%*.*s'", + dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); + + write_lock(&cache->active_lock); + + p = cache->active_nodes.rb_node; + while (p) { + object = rb_entry(p, struct cachefiles_object, active_node); + if (object->dentry > dentry) + p = p->rb_left; + else if (object->dentry < dentry) + p = p->rb_right; + else + goto found_dentry; + } + + write_unlock(&cache->active_lock); + _leave(" [no owner]"); + return; + + /* found the dentry for */ +found_dentry: + kdebug("preemptive burial: OBJ%x [%s] %p", + object->fscache.debug_id, + fscache_object_states[object->fscache.state], + dentry); + + if (object->fscache.state < FSCACHE_OBJECT_DYING) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error:" + " Can't preemptively bury live object\n"); + cachefiles_printk_object(object, NULL); + } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + printk(KERN_ERR "CacheFiles: Error:" + " Object already preemptively buried\n"); + } + + write_unlock(&cache->active_lock); + _leave(" [owner marked]"); +} + +/* + * record the 
fact that an object is now active + */ +static int cachefiles_mark_object_active(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + struct cachefiles_object *xobject; + struct rb_node **_p, *_parent = NULL; + struct dentry *dentry; + + _enter(",%p", object); + +try_again: + write_lock(&cache->active_lock); + + if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { + printk(KERN_ERR "CacheFiles: Error: Object already active\n"); + cachefiles_printk_object(object, NULL); + BUG(); + } + + dentry = object->dentry; + _p = &cache->active_nodes.rb_node; + while (*_p) { + _parent = *_p; + xobject = rb_entry(_parent, + struct cachefiles_object, active_node); + + ASSERT(xobject != object); + + if (xobject->dentry > dentry) + _p = &(*_p)->rb_left; + else if (xobject->dentry < dentry) + _p = &(*_p)->rb_right; + else + goto wait_for_old_object; + } + + rb_link_node(&object->active_node, _parent, _p); + rb_insert_color(&object->active_node, &cache->active_nodes); + + write_unlock(&cache->active_lock); + _leave(" = 0"); + return 0; + + /* an old object from a previous incarnation is hogging the slot - we + * need to wait for it to be destroyed */ +wait_for_old_object: + if (xobject->fscache.state < FSCACHE_OBJECT_DYING) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error:" + " Unexpected object collision\n"); + cachefiles_printk_object(object, xobject); + BUG(); + } + atomic_inc(&xobject->usage); + write_unlock(&cache->active_lock); + + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + wait_queue_head_t *wq; + + signed long timeout = 60 * HZ; + wait_queue_t wait; + bool requeue; + + /* if the object we're waiting for is queued for processing, + * then just put ourselves on the queue behind it */ + if (work_pending(&xobject->fscache.work)) { + _debug("queue OBJ%x behind OBJ%x immediately", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + /* otherwise we sleep until either the object we're waiting for + * is done, or the fscache_object is congested */ + wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); + init_wait(&wait); + requeue = false; + do { + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) + break; + + requeue = fscache_object_sleep_till_congested(&timeout); + } while (timeout > 0 && !requeue); + finish_wait(wq, &wait); + + if (requeue && + test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) { + _debug("queue OBJ%x behind OBJ%x after wait", + object->fscache.debug_id, + xobject->fscache.debug_id); + goto requeue; + } + + if (timeout <= 0) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error: Overlong" + " wait for old active object to go away\n"); + cachefiles_printk_object(object, xobject); + goto requeue; + } + } + + ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); + + cache->cache.ops->put_object(&xobject->fscache); + goto try_again; + +requeue: + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + cache->cache.ops->put_object(&xobject->fscache); + _leave(" = -ETIMEDOUT"); + return -ETIMEDOUT; +} + +/* + * delete an object representation from the cache + * - file backed objects are unlinked + * - directory backed objects are stuffed into the graveyard for userspace to + * delete + * - unlocks the directory mutex + */ +static int cachefiles_bury_object(struct cachefiles_cache *cache, + struct dentry *dir, + struct dentry *rep, + bool preemptive) +{ + struct dentry *grave, *trap; + struct path path, 
path_to_graveyard; + char nbuffer[8 + 8 + 1]; + int ret; + + _enter(",'%*.*s','%*.*s'", + dir->d_name.len, dir->d_name.len, dir->d_name.name, + rep->d_name.len, rep->d_name.len, rep->d_name.name); + + _debug("remove %p from %p", rep, dir); + + /* non-directories can just be unlinked */ + if (!S_ISDIR(rep->d_inode->i_mode)) { + _debug("unlink stale object"); + + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_unlink(&path, rep); + if (ret < 0) { + cachefiles_io_error(cache, "Unlink security error"); + } else { + ret = vfs_unlink(dir->d_inode, rep); + + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } + + mutex_unlock(&dir->d_inode->i_mutex); + + if (ret == -EIO) + cachefiles_io_error(cache, "Unlink failed"); + + _leave(" = %d", ret); + return ret; + } + + /* directories have to be moved to the graveyard */ + _debug("move stale object to graveyard"); + mutex_unlock(&dir->d_inode->i_mutex); + +try_again: + /* first step is to make up a grave dentry in the graveyard */ + sprintf(nbuffer, "%08x%08x", + (uint32_t) get_seconds(), + (uint32_t) atomic_inc_return(&cache->gravecounter)); + + /* do the multiway lock magic */ + trap = lock_rename(cache->graveyard, dir); + + /* do some checks before getting the grave dentry */ + if (rep->d_parent != dir) { + /* the entry was probably culled when we dropped the parent dir + * lock */ + unlock_rename(cache->graveyard, dir); + _leave(" = 0 [culled?]"); + return 0; + } + + if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Graveyard no longer a directory"); + return -EIO; + } + + if (trap == rep) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + if (d_mountpoint(rep)) { + unlock_rename(cache->graveyard, dir); + cachefiles_io_error(cache, "Mountpoint in cache"); + return -EIO; + } + + grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); + if (IS_ERR(grave)) { + unlock_rename(cache->graveyard, dir); + + if (PTR_ERR(grave) == -ENOMEM) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + cachefiles_io_error(cache, "Lookup error %ld", + PTR_ERR(grave)); + return -EIO; + } + + if (grave->d_inode) { + unlock_rename(cache->graveyard, dir); + dput(grave); + grave = NULL; + cond_resched(); + goto try_again; + } + + if (d_mountpoint(grave)) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "Mountpoint in graveyard"); + return -EIO; + } + + /* target should not be an ancestor of source */ + if (trap == grave) { + unlock_rename(cache->graveyard, dir); + dput(grave); + cachefiles_io_error(cache, "May not make directory loop"); + return -EIO; + } + + /* attempt the rename */ + path.mnt = cache->mnt; + path.dentry = dir; + path_to_graveyard.mnt = cache->mnt; + path_to_graveyard.dentry = cache->graveyard; + ret = security_path_rename(&path, rep, &path_to_graveyard, grave); + if (ret < 0) { + cachefiles_io_error(cache, "Rename security error %d", ret); + } else { + ret = vfs_rename(dir->d_inode, rep, + cache->graveyard->d_inode, grave); + if (ret != 0 && ret != -ENOMEM) + cachefiles_io_error(cache, + "Rename failed with error %d", ret); + + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } + + unlock_rename(cache->graveyard, dir); + dput(grave); + _leave(" = 0"); + return 0; +} + +/* + * delete an object representation from the cache + */ +int cachefiles_delete_object(struct cachefiles_cache *cache, + struct cachefiles_object *object) 
+{ + struct dentry *dir; + int ret; + + _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); + + ASSERT(object->dentry); + ASSERT(object->dentry->d_inode); + ASSERT(object->dentry->d_parent); + + dir = dget_parent(object->dentry); + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + + if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + /* object allocation for the same key preemptively deleted this + * object's file so that it could create its own file */ + _debug("object preemptively buried"); + mutex_unlock(&dir->d_inode->i_mutex); + ret = 0; + } else { + /* we need to check that our parent is _still_ our parent - it + * may have been renamed */ + if (dir == object->dentry->d_parent) { + ret = cachefiles_bury_object(cache, dir, + object->dentry, false); + } else { + /* it got moved, presumably by cachefilesd culling it, + * so it's no longer in the key path and we can ignore + * it */ + mutex_unlock(&dir->d_inode->i_mutex); + ret = 0; + } + } + + dput(dir); + _leave(" = %d", ret); + return ret; +} + +/* + * walk from the parent object to the child object through the backing + * filesystem, creating directories as we go + */ +int cachefiles_walk_to_object(struct cachefiles_object *parent, + struct cachefiles_object *object, + const char *key, + struct cachefiles_xattr *auxdata) +{ + struct cachefiles_cache *cache; + struct dentry *dir, *next = NULL; + struct path path; + unsigned long start; + const char *name; + int ret, nlen; + + _enter("OBJ%x{%p},OBJ%x,%s,", + parent->fscache.debug_id, parent->dentry, + object->fscache.debug_id, key); + + cache = container_of(parent->fscache.cache, + struct cachefiles_cache, cache); + path.mnt = cache->mnt; + + ASSERT(parent->dentry); + ASSERT(parent->dentry->d_inode); + + if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { + // TODO: convert file to dir + _leave("looking up in none directory"); + return -ENOBUFS; + } + + dir = dget(parent->dentry); + +advance: + /* attempt to transit the first directory component */ + name = key; + nlen = strlen(key); + + /* key ends in a double NUL */ + key = key + nlen + 1; + if (!*key) + key = NULL; + +lookup_again: + /* search the current directory for the element name */ + _debug("lookup '%s'", name); + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + + start = jiffies; + next = lookup_one_len(name, dir, nlen); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(next)) + goto lookup_error; + + _debug("next -> %p %s", next, next->d_inode ? 
"positive" : "negative"); + + if (!key) + object->new = !next->d_inode; + + /* if this element of the path doesn't exist, then the lookup phase + * failed, and we can release any readers in the certain knowledge that + * there's nothing for them to actually read */ + if (!next->d_inode) + fscache_object_lookup_negative(&object->fscache); + + /* we need to create the object if it's negative */ + if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { + /* index objects and intervening tree levels must be subdirs */ + if (!next->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + path.dentry = dir; + ret = security_path_mkdir(&path, next, 0); + if (ret < 0) + goto create_error; + start = jiffies; + ret = vfs_mkdir(dir->d_inode, next, 0); + cachefiles_hist(cachefiles_mkdir_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(next->d_inode); + + _debug("mkdir -> %p{%p{ino=%lu}}", + next, next->d_inode, next->d_inode->i_ino); + + } else if (!S_ISDIR(next->d_inode->i_mode)) { + kerror("inode %lu is not a directory", + next->d_inode->i_ino); + ret = -ENOBUFS; + goto error; + } + + } else { + /* non-index objects start out life as files */ + if (!next->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto create_error; + + path.dentry = dir; + ret = security_path_mknod(&path, next, S_IFREG, 0); + if (ret < 0) + goto create_error; + start = jiffies; + ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); + cachefiles_hist(cachefiles_create_histogram, start); + if (ret < 0) + goto create_error; + + ASSERT(next->d_inode); + + _debug("create -> %p{%p{ino=%lu}}", + next, next->d_inode, next->d_inode->i_ino); + + } else if (!S_ISDIR(next->d_inode->i_mode) && + !S_ISREG(next->d_inode->i_mode) + ) { + kerror("inode %lu is not a file or directory", + next->d_inode->i_ino); + ret = -ENOBUFS; + goto error; + } + } + + /* process the next component */ + if (key) { + _debug("advance"); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + dir = next; + next = NULL; + goto advance; + } + + /* we've found the object we were looking for */ + object->dentry = next; + + /* if we've found that the terminal object exists, then we need to + * check its attributes and delete it if it's out of date */ + if (!object->new) { + _debug("validate '%*.*s'", + next->d_name.len, next->d_name.len, next->d_name.name); + + ret = cachefiles_check_object_xattr(object, auxdata); + if (ret == -ESTALE) { + /* delete the object (the deleter drops the directory + * mutex) */ + object->dentry = NULL; + + ret = cachefiles_bury_object(cache, dir, next, true); + dput(next); + next = NULL; + + if (ret < 0) + goto delete_error; + + _debug("redo lookup"); + goto lookup_again; + } + } + + /* note that we're now using this object */ + ret = cachefiles_mark_object_active(cache, object); + + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + dir = NULL; + + if (ret == -ETIMEDOUT) + goto mark_active_timed_out; + + _debug("=== OBTAINED_OBJECT ==="); + + if (object->new) { + /* attach data to a newly constructed terminal object */ + ret = cachefiles_set_object_xattr(object, auxdata); + if (ret < 0) + goto check_error; + } else { + /* always update the atime on an object we've just looked up + * (this is used to keep track of culling, and atimes are only + * updated by read, write and readdir but not lookup or + * open) */ + touch_atime(cache->mnt, next); + } + + /* open a file interface onto a data file */ + if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { + if 
(S_ISREG(object->dentry->d_inode->i_mode)) { + const struct address_space_operations *aops; + + ret = -EPERM; + aops = object->dentry->d_inode->i_mapping->a_ops; + if (!aops->bmap) + goto check_error; + + object->backer = object->dentry; + } else { + BUG(); // TODO: open file in data-class subdir + } + } + + object->new = 0; + fscache_obtained_object(&object->fscache); + + _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); + return 0; + +create_error: + _debug("create error %d", ret); + if (ret == -EIO) + cachefiles_io_error(cache, "Create/mkdir failed"); + goto error; + +mark_active_timed_out: + _debug("mark active timed out"); + goto release_dentry; + +check_error: + _debug("check error %d", ret); + write_lock(&cache->active_lock); + rb_erase(&object->active_node, &cache->active_nodes); + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + write_unlock(&cache->active_lock); +release_dentry: + dput(object->dentry); + object->dentry = NULL; + goto error_out; + +delete_error: + _debug("delete error %d", ret); + goto error_out2; + +lookup_error: + _debug("lookup error %ld", PTR_ERR(next)); + ret = PTR_ERR(next); + if (ret == -EIO) + cachefiles_io_error(cache, "Lookup failed"); + next = NULL; +error: + mutex_unlock(&dir->d_inode->i_mutex); + dput(next); +error_out2: + dput(dir); +error_out: + _leave(" = error %d", -ret); + return ret; +} + +/* + * get a subdirectory + */ +struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, + struct dentry *dir, + const char *dirname) +{ + struct dentry *subdir; + unsigned long start; + struct path path; + int ret; + + _enter(",,%s", dirname); + + /* search the current directory for the element name */ + mutex_lock(&dir->d_inode->i_mutex); + + start = jiffies; + subdir = lookup_one_len(dirname, dir, strlen(dirname)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(subdir)) { + if (PTR_ERR(subdir) == -ENOMEM) + goto nomem_d_alloc; + goto lookup_error; + } + + _debug("subdir -> %p %s", + subdir, subdir->d_inode ? 
"positive" : "negative"); + + /* we need to create the subdir if it doesn't exist yet */ + if (!subdir->d_inode) { + ret = cachefiles_has_space(cache, 1, 0); + if (ret < 0) + goto mkdir_error; + + _debug("attempt mkdir"); + + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_mkdir(&path, subdir, 0700); + if (ret < 0) + goto mkdir_error; + ret = vfs_mkdir(dir->d_inode, subdir, 0700); + if (ret < 0) + goto mkdir_error; + + ASSERT(subdir->d_inode); + + _debug("mkdir -> %p{%p{ino=%lu}}", + subdir, + subdir->d_inode, + subdir->d_inode->i_ino); + } + + mutex_unlock(&dir->d_inode->i_mutex); + + /* we need to make sure the subdir is a directory */ + ASSERT(subdir->d_inode); + + if (!S_ISDIR(subdir->d_inode->i_mode)) { + kerror("%s is not a directory", dirname); + ret = -EIO; + goto check_error; + } + + ret = -EPERM; + if (!subdir->d_inode->i_op || + !subdir->d_inode->i_op->setxattr || + !subdir->d_inode->i_op->getxattr || + !subdir->d_inode->i_op->lookup || + !subdir->d_inode->i_op->mkdir || + !subdir->d_inode->i_op->create || + !subdir->d_inode->i_op->rename || + !subdir->d_inode->i_op->rmdir || + !subdir->d_inode->i_op->unlink) + goto check_error; + + _leave(" = [%lu]", subdir->d_inode->i_ino); + return subdir; + +check_error: + dput(subdir); + _leave(" = %d [check]", ret); + return ERR_PTR(ret); + +mkdir_error: + mutex_unlock(&dir->d_inode->i_mutex); + dput(subdir); + kerror("mkdir %s failed with error %d", dirname, ret); + return ERR_PTR(ret); + +lookup_error: + mutex_unlock(&dir->d_inode->i_mutex); + ret = PTR_ERR(subdir); + kerror("Lookup %s failed with error %d", dirname, ret); + return ERR_PTR(ret); + +nomem_d_alloc: + mutex_unlock(&dir->d_inode->i_mutex); + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); +} + +/* + * find out if an object is in use or not + * - if finds object and it's not in use: + * - returns a pointer to the object and a reference on it + * - returns with the directory locked + */ +static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, + struct dentry *dir, + char *filename) +{ + struct cachefiles_object *object; + struct rb_node *_n; + struct dentry *victim; + unsigned long start; + int ret; + + //_enter(",%*.*s/,%s", + // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + /* look up the victim */ + mutex_lock_nested(&dir->d_inode->i_mutex, 1); + + start = jiffies; + victim = lookup_one_len(filename, dir, strlen(filename)); + cachefiles_hist(cachefiles_lookup_histogram, start); + if (IS_ERR(victim)) + goto lookup_error; + + //_debug("victim -> %p %s", + // victim, victim->d_inode ? 
"positive" : "negative"); + + /* if the object is no longer there then we probably retired the object + * at the netfs's request whilst the cull was in progress + */ + if (!victim->d_inode) { + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + _leave(" = -ENOENT [absent]"); + return ERR_PTR(-ENOENT); + } + + /* check to see if we're using this object */ + read_lock(&cache->active_lock); + + _n = cache->active_nodes.rb_node; + + while (_n) { + object = rb_entry(_n, struct cachefiles_object, active_node); + + if (object->dentry > victim) + _n = _n->rb_left; + else if (object->dentry < victim) + _n = _n->rb_right; + else + goto object_in_use; + } + + read_unlock(&cache->active_lock); + + //_leave(" = %p", victim); + return victim; + +object_in_use: + read_unlock(&cache->active_lock); + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + //_leave(" = -EBUSY [in use]"); + return ERR_PTR(-EBUSY); + +lookup_error: + mutex_unlock(&dir->d_inode->i_mutex); + ret = PTR_ERR(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return ERR_PTR(-ESTALE); + } + + if (ret == -EIO) { + cachefiles_io_error(cache, "Lookup failed"); + } else if (ret != -ENOMEM) { + kerror("Internal error: %d", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * cull an object if it's not in use + * - called only by cache manager daemon + */ +int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + int ret; + + _enter(",%*.*s/,%s", + dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + _debug("victim -> %p %s", + victim, victim->d_inode ? "positive" : "negative"); + + /* okay... the victim is not being used so we can cull it + * - start by marking it as stale + */ + _debug("victim is cullable"); + + ret = cachefiles_remove_object_xattr(cache, victim); + if (ret < 0) + goto error_unlock; + + /* actually remove the victim (drops the dir mutex) */ + _debug("bury"); + + ret = cachefiles_bury_object(cache, dir, victim, false); + if (ret < 0) + goto error; + + dput(victim); + _leave(" = 0"); + return 0; + +error_unlock: + mutex_unlock(&dir->d_inode->i_mutex); +error: + dput(victim); + if (ret == -ENOENT) { + /* file or dir now absent - probably retired by netfs */ + _leave(" = -ESTALE [absent]"); + return -ESTALE; + } + + if (ret != -ENOMEM) { + kerror("Internal error: %d", ret); + ret = -EIO; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * find out if an object is in use or not + * - called only by cache manager daemon + * - returns -EBUSY or 0 to indicate whether an object is in use or not + */ +int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, + char *filename) +{ + struct dentry *victim; + + //_enter(",%*.*s/,%s", + // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + + victim = cachefiles_check_active(cache, dir, filename); + if (IS_ERR(victim)) + return PTR_ERR(victim); + + mutex_unlock(&dir->d_inode->i_mutex); + dput(victim); + //_leave(" = 0"); + return 0; +} diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c new file mode 100644 index 00000000..eccd3394 --- /dev/null +++ b/fs/cachefiles/proc.c @@ -0,0 +1,134 @@ +/* CacheFiles statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +atomic_t cachefiles_lookup_histogram[HZ]; +atomic_t cachefiles_mkdir_histogram[HZ]; +atomic_t cachefiles_create_histogram[HZ]; + +/* + * display the latency histogram + */ +static int cachefiles_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned x, y, z, t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + x = atomic_read(&cachefiles_lookup_histogram[index]); + y = atomic_read(&cachefiles_mkdir_histogram[index]); + z = atomic_read(&cachefiles_create_histogram[index]); + if (x == 0 && y == 0 && z == 0) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void cachefiles_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations cachefiles_histogram_ops = { + .start = cachefiles_histogram_start, + .stop = cachefiles_histogram_stop, + .next = cachefiles_histogram_next, + .show = cachefiles_histogram_show, +}; + +/* + * open "/proc/fs/cachefiles/XXX" which provide statistics summaries + */ +static int cachefiles_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cachefiles_histogram_ops); +} + +static const struct file_operations cachefiles_histogram_fops = { + .owner = THIS_MODULE, + .open = cachefiles_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * initialise the /proc/fs/cachefiles/ directory + */ +int __init cachefiles_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/cachefiles", NULL)) + goto error_dir; + + if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL, + &cachefiles_histogram_fops)) + goto error_histogram; + + _leave(" = 0"); + return 0; + +error_histogram: + remove_proc_entry("fs/cachefiles", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/cachefiles/ directory + */ +void cachefiles_proc_cleanup(void) +{ + remove_proc_entry("fs/cachefiles/histogram", NULL); + remove_proc_entry("fs/cachefiles", NULL); +} diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c new file mode 100644 index 00000000..0e3c0924 --- /dev/null +++ b/fs/cachefiles/rdwr.c @@ -0,0 +1,984 @@ +/* Storage object read/write + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +/* + * detect wake up events generated by the unlocking of pages in which we're + * interested + * - we use this to detect read completion of backing pages + * - the caller holds the waitqueue lock + */ +static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, + int sync, void *_key) +{ + struct cachefiles_one_read *monitor = + container_of(wait, struct cachefiles_one_read, monitor); + struct cachefiles_object *object; + struct wait_bit_key *key = _key; + struct page *page = wait->private; + + ASSERT(key); + + _enter("{%lu},%u,%d,{%p,%u}", + monitor->netfs_page->index, mode, sync, + key->flags, key->bit_nr); + + if (key->flags != &page->flags || + key->bit_nr != PG_locked) + return 0; + + _debug("--- monitor %p %lx ---", page, page->flags); + + if (!PageUptodate(page) && !PageError(page)) { + /* unlocked, not uptodate and not erronous? */ + _debug("page probably truncated"); + } + + /* remove from the waitqueue */ + list_del(&wait->task_list); + + /* move onto the action list and queue for FS-Cache thread pool */ + ASSERT(monitor->op); + + object = container_of(monitor->op->op.object, + struct cachefiles_object, fscache); + + spin_lock(&object->work_lock); + list_add_tail(&monitor->op_link, &monitor->op->to_do); + spin_unlock(&object->work_lock); + + fscache_enqueue_retrieval(monitor->op); + return 0; +} + +/* + * handle a probably truncated page + * - check to see if the page is still relevant and reissue the read if + * possible + * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we + * must wait again and 0 if successful + */ +static int cachefiles_read_reissue(struct cachefiles_object *object, + struct cachefiles_one_read *monitor) +{ + struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct page *backpage = monitor->back_page, *backpage2; + int ret; + + kenter("{ino=%lx},{%lx,%lx}", + object->backer->d_inode->i_ino, + backpage->index, backpage->flags); + + /* skip if the page was truncated away completely */ + if (backpage->mapping != bmapping) { + kleave(" = -ENODATA [mapping]"); + return -ENODATA; + } + + backpage2 = find_get_page(bmapping, backpage->index); + if (!backpage2) { + kleave(" = -ENODATA [gone]"); + return -ENODATA; + } + + if (backpage != backpage2) { + put_page(backpage2); + kleave(" = -ENODATA [different]"); + return -ENODATA; + } + + /* the page is still there and we already have a ref on it, so we don't + * need a second */ + put_page(backpage2); + + INIT_LIST_HEAD(&monitor->op_link); + add_page_wait_queue(backpage, &monitor->monitor); + + if (trylock_page(backpage)) { + ret = -EIO; + if (PageError(backpage)) + goto unlock_discard; + ret = 0; + if (PageUptodate(backpage)) + goto unlock_discard; + + kdebug("reissue read"); + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto unlock_discard; + } + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + /* it'll reappear on the todo list */ 
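+	/* -EINPROGRESS leaves the monitor queued against the backing page;
+	 * cachefiles_read_waiter() will move it back onto the op's to_do
+	 * list and re-enqueue the retrieval when the page is unlocked */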
+ kleave(" = -EINPROGRESS"); + return -EINPROGRESS; + +unlock_discard: + unlock_page(backpage); + spin_lock_irq(&object->work_lock); + list_del(&monitor->op_link); + spin_unlock_irq(&object->work_lock); + kleave(" = %d", ret); + return ret; +} + +/* + * copy data from backing pages to netfs pages to complete a read operation + * - driven by FS-Cache's thread pool + */ +static void cachefiles_read_copier(struct fscache_operation *_op) +{ + struct cachefiles_one_read *monitor; + struct cachefiles_object *object; + struct fscache_retrieval *op; + struct pagevec pagevec; + int error, max; + + op = container_of(_op, struct fscache_retrieval, op); + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("{ino=%lu}", object->backer->d_inode->i_ino); + + pagevec_init(&pagevec, 0); + + max = 8; + spin_lock_irq(&object->work_lock); + + while (!list_empty(&op->to_do)) { + monitor = list_entry(op->to_do.next, + struct cachefiles_one_read, op_link); + list_del(&monitor->op_link); + + spin_unlock_irq(&object->work_lock); + + _debug("- copy {%lu}", monitor->back_page->index); + + recheck: + if (PageUptodate(monitor->back_page)) { + copy_highpage(monitor->netfs_page, monitor->back_page); + + pagevec_add(&pagevec, monitor->netfs_page); + fscache_mark_pages_cached(monitor->op, &pagevec); + error = 0; + } else if (!PageError(monitor->back_page)) { + /* the page has probably been truncated */ + error = cachefiles_read_reissue(object, monitor); + if (error == -EINPROGRESS) + goto next; + goto recheck; + } else { + cachefiles_io_error_obj( + object, + "Readpage failed on backing file %lx", + (unsigned long) monitor->back_page->flags); + error = -EIO; + } + + page_cache_release(monitor->back_page); + + fscache_end_io(op, monitor->netfs_page, error); + page_cache_release(monitor->netfs_page); + fscache_put_retrieval(op); + kfree(monitor); + + next: + /* let the thread pool have some air occasionally */ + max--; + if (max < 0 || need_resched()) { + if (!list_empty(&op->to_do)) + fscache_enqueue_retrieval(op); + _leave(" [maxed out]"); + return; + } + + spin_lock_irq(&object->work_lock); + } + + spin_unlock_irq(&object->work_lock); + _leave(""); +} + +/* + * read the corresponding page to the given set from the backing file + * - an uncertain page is simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file_one(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct page *netpage, + struct pagevec *pagevec) +{ + struct cachefiles_one_read *monitor; + struct address_space *bmapping; + struct page *newpage, *backpage; + int ret; + + _enter(""); + + pagevec_reinit(pagevec); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + if (!monitor) + goto nomem; + + monitor->netfs_page = netpage; + monitor->op = fscache_get_retrieval(op); + + init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); + + /* attempt to get hold of the backing page */ + bmapping = object->backer->d_inode->i_mapping; + newpage = NULL; + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = page_cache_alloc_cold(bmapping); + if (!newpage) + goto nomem_monitor; + } + + ret = add_to_page_cache(newpage, bmapping, + netpage->index, GFP_KERNEL); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem_page; + } + + /* we've installed a new backing 
page, so now we need to add it + * to the LRU list and start it reading */ +installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + + page_cache_get(backpage); + pagevec_add(pagevec, backpage); + __pagevec_lru_add_file(pagevec); + +read_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* set the monitor to transfer the data across */ +monitor_backing_page: + _debug("- monitor add"); + + /* install the monitor */ + page_cache_get(monitor->netfs_page); + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was installed, so + * the monitor may miss the event - so we have to ensure that we do get + * one in such a case */ + if (trylock_page(backpage)) { + _debug("jumpstart %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + goto success; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ +backing_page_already_present: + _debug("- present"); + + if (newpage) { + page_cache_release(newpage); + newpage = NULL; + } + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + if (!trylock_page(backpage)) + goto monitor_backing_page; + _debug("read %p {%lx}", backpage, backpage->flags); + goto read_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ +backing_page_already_uptodate: + _debug("- uptodate"); + + pagevec_add(pagevec, netpage); + fscache_mark_pages_cached(op, pagevec); + + copy_highpage(netpage, backpage); + fscache_end_io(op, netpage, 0); + +success: + _debug("success"); + ret = 0; + +out: + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(monitor->op); + kfree(monitor); + } + _leave(" = %d", ret); + return ret; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) + goto out; +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + ret = -ENOBUFS; + goto out; + +nomem_page: + page_cache_release(newpage); +nomem_monitor: + fscache_put_retrieval(monitor->op); + kfree(monitor); +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - if the page is backed by a block in the cache: + * - a read will be started which will call the callback on completion + * - 0 will be returned + * - else if the page is unbacked: + * - the metadata will be retained + * - -ENODATA will be returned + */ +int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + struct inode *inode; + sector_t block0, block; + unsigned shift; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{%p},{%lx},,,", object, page->index); + + if (!object->backer) + return 
-ENOBUFS; + + inode = object->backer->d_inode; + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + return -ENOBUFS; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_ASYNC; + op->op.processor = cachefiles_read_copier; + + pagevec_init(&pagevec, 0); + + /* we assume the absence or presence of the first block is a good + * enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually good + * enough for this as it doesn't indicate errors, but it's all we've + * got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* submit the apparently valid page to the backing fs to be + * read from disk */ + ret = cachefiles_read_backing_file_one(object, op, page, + &pagevec); + } else if (cachefiles_has_space(cache, 0, 1) == 0) { + /* there's space in the cache we can use */ + pagevec_add(&pagevec, page); + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * read the corresponding pages to the given set from the backing file + * - any uncertain pages are simply discarded, to be tried again another time + */ +static int cachefiles_read_backing_file(struct cachefiles_object *object, + struct fscache_retrieval *op, + struct list_head *list, + struct pagevec *mark_pvec) +{ + struct cachefiles_one_read *monitor = NULL; + struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct pagevec lru_pvec; + struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; + int ret = 0; + + _enter(""); + + pagevec_init(&lru_pvec, 0); + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + + _debug("read back %p{%lu,%d}", + netpage, netpage->index, page_count(netpage)); + + if (!monitor) { + monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); + if (!monitor) + goto nomem; + + monitor->op = fscache_get_retrieval(op); + init_waitqueue_func_entry(&monitor->monitor, + cachefiles_read_waiter); + } + + for (;;) { + backpage = find_get_page(bmapping, netpage->index); + if (backpage) + goto backing_page_already_present; + + if (!newpage) { + newpage = page_cache_alloc_cold(bmapping); + if (!newpage) + goto nomem; + } + + ret = add_to_page_cache(newpage, bmapping, + netpage->index, GFP_KERNEL); + if (ret == 0) + goto installed_new_backing_page; + if (ret != -EEXIST) + goto nomem; + } + + /* we've installed a new backing page, so now we need to add it + * to the LRU list and start it reading */ + installed_new_backing_page: + _debug("- new %p", newpage); + + backpage = newpage; + newpage = NULL; + + page_cache_get(backpage); + if (!pagevec_add(&lru_pvec, backpage)) + __pagevec_lru_add_file(&lru_pvec); + + reread_backing_page: + ret = bmapping->a_ops->readpage(NULL, backpage); + if (ret < 0) + goto read_error; + + /* add the netfs page to the pagecache and LRU, and set the + * monitor to transfer the data across */ + monitor_backing_page: + _debug("- monitor add"); + + ret = add_to_page_cache(netpage, op->mapping, netpage->index, + GFP_KERNEL); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + continue; + } 
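+			/* any other failure to insert the netfs page aborts the
+			 * whole batch as -ENOMEM */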
+ goto nomem; + } + + page_cache_get(netpage); + if (!pagevec_add(&lru_pvec, netpage)) + __pagevec_lru_add_file(&lru_pvec); + + /* install a monitor */ + page_cache_get(netpage); + monitor->netfs_page = netpage; + + page_cache_get(backpage); + monitor->back_page = backpage; + monitor->monitor.private = backpage; + add_page_wait_queue(backpage, &monitor->monitor); + monitor = NULL; + + /* but the page may have been read before the monitor was + * installed, so the monitor may miss the event - so we have to + * ensure that we do get one in such a case */ + if (trylock_page(backpage)) { + _debug("2unlock %p {%lx}", backpage, backpage->flags); + unlock_page(backpage); + } + + page_cache_release(backpage); + backpage = NULL; + + page_cache_release(netpage); + netpage = NULL; + continue; + + /* if the backing page is already present, it can be in one of + * three states: read in progress, read failed or read okay */ + backing_page_already_present: + _debug("- present %p", backpage); + + if (PageError(backpage)) + goto io_error; + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate; + + _debug("- not ready %p{%lx}", backpage, backpage->flags); + + if (!trylock_page(backpage)) + goto monitor_backing_page; + + if (PageError(backpage)) { + _debug("error %lx", backpage->flags); + unlock_page(backpage); + goto io_error; + } + + if (PageUptodate(backpage)) + goto backing_page_already_uptodate_unlock; + + /* we've locked a page that's neither up to date nor erroneous, + * so we need to attempt to read it again */ + goto reread_backing_page; + + /* the backing page is already up to date, attach the netfs + * page to the pagecache and LRU and copy the data across */ + backing_page_already_uptodate_unlock: + _debug("uptodate %lx", backpage->flags); + unlock_page(backpage); + backing_page_already_uptodate: + _debug("- uptodate"); + + ret = add_to_page_cache(netpage, op->mapping, netpage->index, + GFP_KERNEL); + if (ret < 0) { + if (ret == -EEXIST) { + page_cache_release(netpage); + continue; + } + goto nomem; + } + + copy_highpage(netpage, backpage); + + page_cache_release(backpage); + backpage = NULL; + + if (!pagevec_add(mark_pvec, netpage)) + fscache_mark_pages_cached(op, mark_pvec); + + page_cache_get(netpage); + if (!pagevec_add(&lru_pvec, netpage)) + __pagevec_lru_add_file(&lru_pvec); + + fscache_end_io(op, netpage, 0); + page_cache_release(netpage); + netpage = NULL; + continue; + } + + netpage = NULL; + + _debug("out"); + +out: + /* tidy up */ + pagevec_lru_add_file(&lru_pvec); + + if (newpage) + page_cache_release(newpage); + if (netpage) + page_cache_release(netpage); + if (backpage) + page_cache_release(backpage); + if (monitor) { + fscache_put_retrieval(op); + kfree(monitor); + } + + list_for_each_entry_safe(netpage, _n, list, lru) { + list_del(&netpage->lru); + page_cache_release(netpage); + } + + _leave(" = %d", ret); + return ret; + +nomem: + _debug("nomem"); + ret = -ENOMEM; + goto out; + +read_error: + _debug("read error %d", ret); + if (ret == -ENOMEM) + goto out; +io_error: + cachefiles_io_error_obj(object, "Page read error on backing file"); + ret = -ENOBUFS; + goto out; +} + +/* + * read a list of pages from the cache or allocate blocks in which to store + * them + */ +int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct list_head backpages; + struct pagevec pagevec; + struct inode *inode; + struct page *page, *_n; + 
unsigned shift, nrbackpages; + int ret, ret2, space; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("{OBJ%x,%d},,%d,,", + object->fscache.debug_id, atomic_read(&op->op.usage), + *nr_pages); + + if (!object->backer) + return -ENOBUFS; + + space = 1; + if (cachefiles_has_space(cache, 0, *nr_pages) < 0) + space = 0; + + inode = object->backer->d_inode; + ASSERT(S_ISREG(inode->i_mode)); + ASSERT(inode->i_mapping->a_ops->bmap); + ASSERT(inode->i_mapping->a_ops->readpages); + + /* calculate the shift required to use bmap */ + if (inode->i_sb->s_blocksize > PAGE_SIZE) + return -ENOBUFS; + + shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + + pagevec_init(&pagevec, 0); + + op->op.flags &= FSCACHE_OP_KEEP_FLAGS; + op->op.flags |= FSCACHE_OP_ASYNC; + op->op.processor = cachefiles_read_copier; + + INIT_LIST_HEAD(&backpages); + nrbackpages = 0; + + ret = space ? -ENODATA : -ENOBUFS; + list_for_each_entry_safe(page, _n, pages, lru) { + sector_t block0, block; + + /* we assume the absence or presence of the first block is a + * good enough indication for the page as a whole + * - TODO: don't use bmap() for this as it is _not_ actually + * good enough for this as it doesn't indicate errors, but + * it's all we've got for the moment + */ + block0 = page->index; + block0 <<= shift; + + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, + block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + + if (block) { + /* we have data - add it to the list to give to the + * backing fs */ + list_move(&page->lru, &backpages); + (*nr_pages)--; + nrbackpages++; + } else if (space && pagevec_add(&pagevec, page) == 0) { + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + + if (list_empty(pages)) + ret = 0; + + /* submit the apparently valid pages to the backing fs to be read from + * disk */ + if (nrbackpages > 0) { + ret2 = cachefiles_read_backing_file(object, op, &backpages, + &pagevec); + if (ret2 == -ENOMEM || ret2 == -EINTR) + ret = ret2; + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + + _leave(" = %d [nr=%u%s]", + ret, *nr_pages, list_empty(pages) ? 
" empty" : ""); + return ret; +} + +/* + * allocate a block in the cache in which to store a page + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if no buffers can be made available + * - returns -ENOBUFS if page is beyond EOF + * - otherwise: + * - the metadata will be retained + * - 0 will be returned + */ +int cachefiles_allocate_page(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lx},", object, page->index); + + ret = cachefiles_has_space(cache, 0, 1); + if (ret == 0) { + pagevec_init(&pagevec, 0); + pagevec_add(&pagevec, page); + fscache_mark_pages_cached(op, &pagevec); + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * allocate blocks in the cache in which to store a set of pages + * - cache withdrawal is prevented by the caller + * - returns -EINTR if interrupted + * - returns -ENOMEM if ran out of memory + * - returns -ENOBUFS if some buffers couldn't be made available + * - returns -ENOBUFS if some pages are beyond EOF + * - otherwise: + * - -ENODATA will be returned + * - metadata will be retained for any page marked + */ +int cachefiles_allocate_pages(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct pagevec pagevec; + struct page *page; + int ret; + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,,,%d,", object, *nr_pages); + + ret = cachefiles_has_space(cache, 0, *nr_pages); + if (ret == 0) { + pagevec_init(&pagevec, 0); + + list_for_each_entry(page, pages, lru) { + if (pagevec_add(&pagevec, page) == 0) + fscache_mark_pages_cached(op, &pagevec); + } + + if (pagevec_count(&pagevec) > 0) + fscache_mark_pages_cached(op, &pagevec); + ret = -ENODATA; + } else { + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * request a page be stored in the cache + * - cache withdrawal is prevented by the caller + * - this request may be ignored if there's no cache block available, in which + * case -ENOBUFS will be returned + * - if the op is in progress, 0 will be returned + */ +int cachefiles_write_page(struct fscache_storage *op, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + mm_segment_t old_fs; + struct file *file; + loff_t pos, eof; + size_t len; + void *data; + int ret; + + ASSERT(op != NULL); + ASSERT(page != NULL); + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + + _enter("%p,%p{%lx},,,", object, page, page->index); + + if (!object->backer) { + _leave(" = -ENOBUFS"); + return -ENOBUFS; + } + + ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + /* write the page to the backing filesystem and let it store it in its + * own time */ + dget(object->backer); + mntget(cache->mnt); + file = dentry_open(object->backer, cache->mnt, O_RDWR, + cache->cache_cred); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + } else { + ret = -EIO; + 
if (file->f_op->write) { + pos = (loff_t) page->index << PAGE_SHIFT; + + /* we mustn't write more data than we have, so we have + * to beware of a partial page at EOF */ + eof = object->fscache.store_limit_l; + len = PAGE_SIZE; + if (eof & ~PAGE_MASK) { + ASSERTCMP(pos, <, eof); + if (eof - pos < PAGE_SIZE) { + _debug("cut short %llx to %llx", + pos, eof); + len = eof - pos; + ASSERTCMP(pos + len, ==, eof); + } + } + + data = kmap(page); + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = file->f_op->write( + file, (const void __user *) data, len, &pos); + set_fs(old_fs); + kunmap(page); + if (ret != len) + ret = -EIO; + } + fput(file); + } + + if (ret < 0) { + if (ret == -EIO) + cachefiles_io_error_obj( + object, "Write page to backing file failed"); + ret = -ENOBUFS; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * detach a backing block from a page + * - cache withdrawal is prevented by the caller + */ +void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + + object = container_of(_object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + _enter("%p,{%lu}", object, page->index); + + spin_unlock(&object->fscache.cookie->lock); +} diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c new file mode 100644 index 00000000..039b5011 --- /dev/null +++ b/fs/cachefiles/security.c @@ -0,0 +1,120 @@ +/* CacheFiles security management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include "internal.h" + +/* + * determine the security context within which we access the cache from within + * the kernel + */ +int cachefiles_get_security_ID(struct cachefiles_cache *cache) +{ + struct cred *new; + int ret; + + _enter("{%s}", cache->secctx); + + new = prepare_kernel_cred(current); + if (!new) { + ret = -ENOMEM; + goto error; + } + + if (cache->secctx) { + ret = set_security_override_from_ctx(new, cache->secctx); + if (ret < 0) { + put_cred(new); + printk(KERN_ERR "CacheFiles:" + " Security denies permission to nominate" + " security context: error %d\n", + ret); + goto error; + } + } + + cache->cache_cred = new; + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * see if mkdir and create can be performed in the root directory + */ +static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, + struct dentry *root) +{ + int ret; + + ret = security_inode_mkdir(root->d_inode, root, 0); + if (ret < 0) { + printk(KERN_ERR "CacheFiles:" + " Security denies permission to make dirs: error %d", + ret); + return ret; + } + + ret = security_inode_create(root->d_inode, root, 0); + if (ret < 0) + printk(KERN_ERR "CacheFiles:" + " Security denies permission to create files: error %d", + ret); + + return ret; +} + +/* + * check the security details of the on-disk cache + * - must be called with security override in force + * - must return with a security override in force - even in the case of an + * error + */ +int cachefiles_determine_cache_security(struct cachefiles_cache *cache, + struct dentry *root, + const struct cred **_saved_cred) +{ + struct cred *new; + int ret; + + _enter(""); + + /* duplicate the cache creds for COW (the override is currently in + * force, so we can use prepare_creds() to do this) */ + new = prepare_creds(); + if (!new) + return -ENOMEM; + + cachefiles_end_secure(cache, *_saved_cred); + + /* use the cache root dir's security context as the basis with + * which create files */ + ret = set_create_files_as(new, root->d_inode); + if (ret < 0) { + abort_creds(new); + cachefiles_begin_secure(cache, _saved_cred); + _leave(" = %d [cfa]", ret); + return ret; + } + + put_cred(cache->cache_cred); + cache->cache_cred = new; + + cachefiles_begin_secure(cache, _saved_cred); + ret = cachefiles_check_cache_dir(cache, root); + + if (ret == -EOPNOTSUPP) + ret = 0; + _leave(" = %d", ret); + return ret; +} diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c new file mode 100644 index 00000000..e18b183b --- /dev/null +++ b/fs/cachefiles/xattr.c @@ -0,0 +1,292 @@ +/* CacheFiles extended attribute management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const char cachefiles_xattr_cache[] = + XATTR_USER_PREFIX "CacheFiles.cache"; + +/* + * check the type label on an object + * - done using xattrs + */ +int cachefiles_check_object_type(struct cachefiles_object *object) +{ + struct dentry *dentry = object->dentry; + char type[3], xtype[3]; + int ret; + + ASSERT(dentry); + ASSERT(dentry->d_inode); + + if (!object->fscache.cookie) + strcpy(type, "C3"); + else + snprintf(type, 3, "%02x", object->fscache.cookie->def->type); + + _enter("%p{%s}", object, type); + + /* attempt to install a type label directly */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, + XATTR_CREATE); + if (ret == 0) { + _debug("SET"); /* we succeeded */ + goto error; + } + + if (ret != -EEXIST) { + kerror("Can't set xattr on %*.*s [%lu] (err %d)", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + -ret); + goto error; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); + if (ret < 0) { + if (ret == -ERANGE) + goto bad_type_length; + + kerror("Can't read xattr on %*.*s [%lu] (err %d)", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + -ret); + goto error; + } + + /* check the type is what we're expecting */ + if (ret != 2) + goto bad_type_length; + + if (xtype[0] != type[0] || xtype[1] != type[1]) + goto bad_type; + + ret = 0; + +error: + _leave(" = %d", ret); + return ret; + +bad_type_length: + kerror("Cache object %lu type xattr length incorrect", + dentry->d_inode->i_ino); + ret = -EIO; + goto error; + +bad_type: + xtype[2] = 0; + kerror("Cache object %*.*s [%lu] type %s not %s", + dentry->d_name.len, dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + xtype, type); + ret = -EIO; + goto error; +} + +/* + * set the state xattr on a cache file + */ +int cachefiles_set_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(object->fscache.cookie); + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_CREATE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to set xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * update the state xattr on a cache file + */ +int cachefiles_update_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + struct dentry *dentry = object->dentry; + int ret; + + ASSERT(object->fscache.cookie); + ASSERT(dentry); + + _enter("%p,#%d", object, auxdata->len); + + /* attempt to install the cache metadata directly */ + _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0 && ret != -ENOMEM) + cachefiles_io_error_obj( + object, + "Failed to update xattr with error %d", ret); + + _leave(" = %d", ret); + return ret; +} + +/* + * check the state xattr on a cache file + * - return -ESTALE if the object should be deleted + */ +int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ 
+ struct cachefiles_xattr *auxbuf; + struct dentry *dentry = object->dentry; + int ret; + + _enter("%p,#%d", object, auxdata->len); + + ASSERT(dentry); + ASSERT(dentry->d_inode); + + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); + if (!auxbuf) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* read the current type label */ + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, + &auxbuf->type, 512 + 1); + if (ret < 0) { + if (ret == -ENODATA) + goto stale; /* no attribute - power went off + * mid-cull? */ + + if (ret == -ERANGE) + goto bad_type_length; + + cachefiles_io_error_obj(object, + "Can't read xattr on %lu (err %d)", + dentry->d_inode->i_ino, -ret); + goto error; + } + + /* check the on-disk object */ + if (ret < 1) + goto bad_type_length; + + if (auxbuf->type != auxdata->type) + goto stale; + + auxbuf->len = ret; + + /* consult the netfs */ + if (object->fscache.cookie->def->check_aux) { + enum fscache_checkaux result; + unsigned int dlen; + + dlen = auxbuf->len - 1; + + _debug("checkaux %s #%u", + object->fscache.cookie->def->name, dlen); + + result = fscache_check_aux(&object->fscache, + &auxbuf->data, dlen); + + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + goto okay; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + goto stale; + + default: + BUG(); + } + + /* update the current label */ + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_REPLACE); + if (ret < 0) { + cachefiles_io_error_obj(object, + "Can't update xattr on %lu" + " (error %d)", + dentry->d_inode->i_ino, -ret); + goto error; + } + } + +okay: + ret = 0; + +error: + kfree(auxbuf); + _leave(" = %d", ret); + return ret; + +bad_type_length: + kerror("Cache object %lu xattr length incorrect", + dentry->d_inode->i_ino); + ret = -EIO; + goto error; + +stale: + ret = -ESTALE; + goto error; +} + +/* + * remove the object's xattr to mark it stale + */ +int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + int ret; + + ret = vfs_removexattr(dentry, cachefiles_xattr_cache); + if (ret < 0) { + if (ret == -ENOENT || ret == -ENODATA) + ret = 0; + else if (ret != -ENOMEM) + cachefiles_io_error(cache, + "Can't remove xattr from %lu" + " (error %d)", + dentry->d_inode->i_ino, -ret); + } + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig new file mode 100644 index 00000000..9eb134ea --- /dev/null +++ b/fs/ceph/Kconfig @@ -0,0 +1,18 @@ +config CEPH_FS + tristate "Ceph distributed file system (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select CEPH_LIB + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Choose Y or M here to include support for mounting the + experimental Ceph distributed file system. Ceph is an extremely + scalable file system designed to provide high performance, + reliable access to petabytes of storage. + + More information at http://ceph.newdream.net/. + + If unsure, say N. + diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile new file mode 100644 index 00000000..bd352125 --- /dev/null +++ b/fs/ceph/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for CEPH filesystem. 
+# + +obj-$(CONFIG_CEPH_FS) += ceph.o + +ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ + export.o caps.o snap.o xattr.o \ + mds_client.o mdsmap.o strings.o ceph_frag.o \ + debugfs.o + diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c new file mode 100644 index 00000000..5a3953db --- /dev/null +++ b/fs/ceph/addr.c @@ -0,0 +1,1181 @@ +#include + +#include +#include +#include +#include +#include /* generic_writepages */ +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include + +/* + * Ceph address space ops. + * + * There are a few funny things going on here. + * + * The page->private field is used to reference a struct + * ceph_snap_context for _every_ dirty page. This indicates which + * snapshot the page was logically dirtied in, and thus which snap + * context needs to be associated with the osd write during writeback. + * + * Similarly, struct ceph_inode_info maintains a set of counters to + * count dirty pages on the inode. In the absence of snapshots, + * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. + * + * When a snapshot is taken (that is, when the client receives + * notification that a snapshot was taken), each inode with caps and + * with dirty pages (dirty pages implies there is a cap) gets a new + * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending + * order, new snaps go to the tail). The i_wrbuffer_ref_head count is + * moved to capsnap->dirty. (Unless a sync write is currently in + * progress. In that case, the capsnap is said to be "pending", new + * writes cannot start, and the capsnap isn't "finalized" until the + * write completes (or fails) and a final size/mtime for the inode for + * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. + * + * On writeback, we must submit writes to the osd IN SNAP ORDER. So, + * we look for the first capsnap in i_cap_snaps and write out pages in + * that snap context _only_. Then we move on to the next capsnap, + * eventually reaching the "live" or "head" context (i.e., pages that + * are not yet snapped) and are writing the most recently dirtied + * pages. + * + * Invalidate and so forth must take care to ensure the dirty page + * accounting is preserved. + */ + +#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) +#define CONGESTION_OFF_THRESH(congestion_kb) \ + (CONGESTION_ON_THRESH(congestion_kb) - \ + (CONGESTION_ON_THRESH(congestion_kb) >> 2)) + + + +/* + * Dirty a page. Optimistically adjust accounting, on the assumption + * that we won't race with invalidate. If we do, readjust. + */ +static int ceph_set_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct ceph_inode_info *ci; + int undo = 0; + struct ceph_snap_context *snapc; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + if (TestSetPageDirty(page)) { + dout("%p set_page_dirty %p idx %lu -- already dirty\n", + mapping->host, page, page->index); + return 0; + } + + inode = mapping->host; + ci = ceph_inode(inode); + + /* + * Note that we're grabbing a snapc ref here without holding + * any locks! 
+ */ + snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + + /* dirty the head */ + spin_lock(&inode->i_lock); + if (ci->i_head_snapc == NULL) + ci->i_head_snapc = ceph_get_snap_context(snapc); + ++ci->i_wrbuffer_ref_head; + if (ci->i_wrbuffer_ref == 0) + ihold(inode); + ++ci->i_wrbuffer_ref; + dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " + "snapc %p seq %lld (%d snaps)\n", + mapping->host, page, page->index, + ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + snapc, snapc->seq, snapc->num_snaps); + spin_unlock(&inode->i_lock); + + /* now adjust page */ + spin_lock_irq(&mapping->tree_lock); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, page->mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. + */ + page->private = (unsigned long)snapc; + SetPagePrivate(page); + } else { + dout("ANON set_page_dirty %p (raced truncate?)\n", page); + undo = 1; + } + + spin_unlock_irq(&mapping->tree_lock); + + if (undo) + /* whoops, we failed to dirty the page */ + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + BUG_ON(!PageDirty(page)); + return 1; +} + +/* + * If we are truncating the full page (i.e. offset == 0), adjust the + * dirty page counters appropriately. Only called if there is private + * data on the page. + */ +static void ceph_invalidatepage(struct page *page, unsigned long offset) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_snap_context *snapc = (void *)page->private; + + BUG_ON(!PageLocked(page)); + BUG_ON(!page->private); + BUG_ON(!PagePrivate(page)); + BUG_ON(!page->mapping); + + inode = page->mapping->host; + + /* + * We can get non-dirty pages here due to races between + * set_page_dirty and truncate_complete_page; just spit out a + * warning, in case we end up with accounting problems later. + */ + if (!PageDirty(page)) + pr_err("%p invalidatepage %p page not dirty\n", inode, page); + + if (offset == 0) + ClearPageChecked(page); + + ci = ceph_inode(inode); + if (offset == 0) { + dout("%p invalidatepage %p idx %lu full dirty page %lu\n", + inode, page, page->index, offset); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); + } else { + dout("%p invalidatepage %p idx %lu partial dirty page\n", + inode, page, page->index); + } +} + +/* just a sanity check */ +static int ceph_releasepage(struct page *page, gfp_t g) +{ + struct inode *inode = page->mapping ? page->mapping->host : NULL; + dout("%p releasepage %p idx %lu\n", inode, page, page->index); + WARN_ON(PageDirty(page)); + WARN_ON(page->private); + WARN_ON(PagePrivate(page)); + return 0; +} + +/* + * read a single page, without unlocking it. 
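+ * (callers decide when to unlock: ceph_readpage unlocks straight away,
+ * while ceph_update_writeable_page keeps the page locked)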
+ */ +static int readpage_nounlock(struct file *filp, struct page *page) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int err = 0; + u64 len = PAGE_CACHE_SIZE; + + dout("readpage inode %p file %p page %p index %lu\n", + inode, filp, page, page->index); + err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + page->index << PAGE_CACHE_SHIFT, &len, + ci->i_truncate_seq, ci->i_truncate_size, + &page, 1, 0); + if (err == -ENOENT) + err = 0; + if (err < 0) { + SetPageError(page); + goto out; + } else if (err < PAGE_CACHE_SIZE) { + /* zero fill remainder of page */ + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } + SetPageUptodate(page); + +out: + return err < 0 ? err : 0; +} + +static int ceph_readpage(struct file *filp, struct page *page) +{ + int r = readpage_nounlock(filp, page); + unlock_page(page); + return r; +} + +/* + * Build a vector of contiguous pages from the provided page list. + */ +static struct page **page_vector_from_list(struct list_head *page_list, + unsigned *nr_pages) +{ + struct page **pages; + struct page *page; + int next_index, contig_pages = 0; + + /* build page vector */ + pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + BUG_ON(list_empty(page_list)); + next_index = list_entry(page_list->prev, struct page, lru)->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index == next_index) { + dout("readpages page %d %p\n", contig_pages, page); + pages[contig_pages] = page; + contig_pages++; + next_index++; + } else { + break; + } + } + *nr_pages = contig_pages; + return pages; +} + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. + */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int rc = 0; + struct page **pages; + loff_t offset; + u64 len; + + dout("readpages %p file %p nr_pages %d\n", + inode, file, nr_pages); + + pages = page_vector_from_list(page_list, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + /* guess read extent */ + offset = pages[0]->index << PAGE_CACHE_SHIFT; + len = nr_pages << PAGE_CACHE_SHIFT; + rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + offset, &len, + ci->i_truncate_seq, ci->i_truncate_size, + pages, nr_pages, 0); + if (rc == -ENOENT) + rc = 0; + if (rc < 0) + goto out; + + for (; !list_empty(page_list) && len > 0; + rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { + struct page *page = + list_entry(page_list->prev, struct page, lru); + + list_del(&page->lru); + + if (rc < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = rc < 0 ? 
0 : rc; + zero_user_segment(page, s, PAGE_CACHE_SIZE); + } + + if (add_to_page_cache_lru(page, mapping, page->index, + GFP_NOFS)) { + page_cache_release(page); + dout("readpages %p add_to_page_cache failed %p\n", + inode, page); + continue; + } + dout("readpages %p adding %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } + rc = 0; + +out: + kfree(pages); + return rc; +} + +/* + * Get ref for the oldest snapc for an inode with dirty data... that is, the + * only snap context we are allowed to write back. + */ +static struct ceph_snap_context *get_oldest_context(struct inode *inode, + u64 *snap_size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = NULL; + struct ceph_cap_snap *capsnap = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, + capsnap->context, capsnap->dirty_pages); + if (capsnap->dirty_pages) { + snapc = ceph_get_snap_context(capsnap->context); + if (snap_size) + *snap_size = capsnap->size; + break; + } + } + if (!snapc && ci->i_wrbuffer_ref_head) { + snapc = ceph_get_snap_context(ci->i_head_snapc); + dout(" head snapc %p has %d dirty pages\n", + snapc, ci->i_wrbuffer_ref_head); + } + spin_unlock(&inode->i_lock); + return snapc; +} + +/* + * Write a single page, but leave the page locked. + * + * If we get a write error, set the page error bit, but still adjust the + * dirty page accounting (i.e., page is no longer dirty). + */ +static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_fs_client *fsc; + struct ceph_osd_client *osdc; + loff_t page_off = page->index << PAGE_CACHE_SHIFT; + int len = PAGE_CACHE_SIZE; + loff_t i_size; + int err = 0; + struct ceph_snap_context *snapc, *oldest; + u64 snap_size = 0; + long writeback_stat; + + dout("writepage %p idx %lu\n", page, page->index); + + if (!page->mapping || !page->mapping->host) { + dout("writepage %p - no mapping\n", page); + return -EFAULT; + } + inode = page->mapping->host; + ci = ceph_inode(inode); + fsc = ceph_inode_to_client(inode); + osdc = &fsc->client->osdc; + + /* verify this is a writeable snap context */ + snapc = (void *)page->private; + if (snapc == NULL) { + dout("writepage %p page %p not dirty?\n", inode, page); + goto out; + } + oldest = get_oldest_context(inode, &snap_size); + if (snapc->seq > oldest->seq) { + dout("writepage %p page %p snapc %p not writeable - noop\n", + inode, page, (void *)page->private); + /* we should only noop if called by kswapd */ + WARN_ON((current->flags & PF_MEMALLOC) == 0); + ceph_put_snap_context(oldest); + goto out; + } + ceph_put_snap_context(oldest); + + /* is this a partial page at end of file? 
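if so, clamp the write length so only the bytes below i_size (or the snap size) go to the OSD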
*/ + if (snap_size) + i_size = snap_size; + else + i_size = i_size_read(inode); + if (i_size < page_off + len) + len = i_size - page_off; + + dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", + inode, page, page->index, page_off, len, snapc); + + writeback_stat = atomic_long_inc_return(&fsc->writeback_count); + if (writeback_stat > + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) + set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + + set_page_writeback(page); + err = ceph_osdc_writepages(osdc, ceph_vino(inode), + &ci->i_layout, snapc, + page_off, len, + ci->i_truncate_seq, ci->i_truncate_size, + &inode->i_mtime, + &page, 1, 0, 0, true); + if (err < 0) { + dout("writepage setting page/mapping error %d %p\n", err, page); + SetPageError(page); + mapping_set_error(&inode->i_data, err); + if (wbc) + wbc->pages_skipped++; + } else { + dout("writepage cleaned page %p\n", page); + err = 0; /* vfs expects us to return 0 */ + } + page->private = 0; + ClearPagePrivate(page); + end_page_writeback(page); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); /* page's reference */ +out: + return err; +} + +static int ceph_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + struct inode *inode = page->mapping->host; + BUG_ON(!inode); + ihold(inode); + err = writepage_nounlock(page, wbc); + unlock_page(page); + iput(inode); + return err; +} + + +/* + * lame release_pages helper. release_pages() isn't exported to + * modules. + */ +static void ceph_release_pages(struct page **pages, int num) +{ + struct pagevec pvec; + int i; + + pagevec_init(&pvec, 0); + for (i = 0; i < num; i++) { + if (pagevec_add(&pvec, pages[i]) == 0) + pagevec_release(&pvec); + } + pagevec_release(&pvec); +} + + +/* + * async writeback completion handler. + * + * If we get an error, set the mapping error bit, but not the individual + * page error bits. + */ +static void writepages_finish(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + struct inode *inode = req->r_inode; + struct ceph_osd_reply_head *replyhead; + struct ceph_osd_op *op; + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned wrote; + struct page *page; + int i; + struct ceph_snap_context *snapc = req->r_snapc; + struct address_space *mapping = inode->i_mapping; + __s32 rc = -EIO; + u64 bytes = 0; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + long writeback_stat; + unsigned issued = ceph_caps_issued(ci); + + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + op = (void *)(replyhead + 1); + rc = le32_to_cpu(replyhead->result); + bytes = le64_to_cpu(op->extent.length); + + if (rc >= 0) { + /* + * Assume we wrote the pages we originally sent. The + * osd might reply with fewer pages if our writeback + * raced with a truncation and was adjusted at the osd, + * so don't believe the reply. 
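+	 * (the cleanup loop below therefore iterates req->r_num_pages rather
+	 * than trusting the reply's count)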
+ */ + wrote = req->r_num_pages; + } else { + wrote = 0; + mapping_set_error(mapping, rc); + } + dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", + inode, rc, bytes, wrote); + + /* clean all pages */ + for (i = 0; i < req->r_num_pages; i++) { + page = req->r_pages[i]; + BUG_ON(!page); + WARN_ON(!PageUptodate(page)); + + writeback_stat = + atomic_long_dec_return(&fsc->writeback_count); + if (writeback_stat < + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + clear_bdi_congested(&fsc->backing_dev_info, + BLK_RW_ASYNC); + + ceph_put_snap_context((void *)page->private); + page->private = 0; + ClearPagePrivate(page); + dout("unlocking %d %p\n", i, page); + end_page_writeback(page); + + /* + * We lost the cache cap, need to truncate the page before + * it is unlocked, otherwise we'd truncate it later in the + * page truncation thread, possibly losing some data that + * raced its way in + */ + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + generic_error_remove_page(inode->i_mapping, page); + + unlock_page(page); + } + dout("%p wrote+cleaned %d pages\n", inode, wrote); + ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + + ceph_release_pages(req->r_pages, req->r_num_pages); + if (req->r_pages_from_pool) + mempool_free(req->r_pages, + ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); + else + kfree(req->r_pages); + ceph_osdc_put_request(req); +} + +/* + * allocate a page vec, either directly, or if necessary, via a the + * mempool. we avoid the mempool if we can because req->r_num_pages + * may be less than the maximum write size. + */ +static void alloc_page_vec(struct ceph_fs_client *fsc, + struct ceph_osd_request *req) +{ + req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, + GFP_NOFS); + if (!req->r_pages) { + req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); + req->r_pages_from_pool = 1; + WARN_ON(!req->r_pages); + } +} + +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc; + pgoff_t index, start, end; + int range_whole = 0; + int should_loop = 1; + pgoff_t max_pages = 0, max_pages_ever = 0; + struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; + struct pagevec pvec; + int done = 0; + int rc = 0; + unsigned wsize = 1 << inode->i_blkbits; + struct ceph_osd_request *req = NULL; + int do_sync; + u64 snap_size = 0; + + /* + * Include a 'sync' in the OSD request if this is a data + * integrity write (e.g., O_SYNC write or fsync()), or if our + * cap is being revoked. + */ + do_sync = wbc->sync_mode == WB_SYNC_ALL; + if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + do_sync = 1; + dout("writepages_start %p dosync=%d (mode=%s)\n", + inode, do_sync, + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + fsc = ceph_inode_to_client(inode); + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + pr_warning("writepage_start %p on forced umount\n", inode); + return -EIO; /* we're in a forced umount, don't write! */ + } + if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + wsize = fsc->mount_options->wsize; + if (wsize < PAGE_CACHE_SIZE) + wsize = PAGE_CACHE_SIZE; + max_pages_ever = wsize >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + + /* where to start/end? 
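cyclic writeback resumes from mapping->writeback_index, otherwise the wbc range bounds the scan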
*/ + if (wbc->range_cyclic) { + start = mapping->writeback_index; /* Start from prev offset */ + end = -1; + dout(" cyclic, start at %lu\n", start); + } else { + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + should_loop = 0; + dout(" not cyclic, %lu to %lu\n", start, end); + } + index = start; + +retry: + /* find oldest snap context with dirty data */ + ceph_put_snap_context(snapc); + snapc = get_oldest_context(inode, &snap_size); + if (!snapc) { + /* hmm, why does writepages get called when there + is no dirty data? */ + dout(" no snap context with dirty data?\n"); + goto out; + } + dout(" oldest snapc is %p seq %lld (%d snaps)\n", + snapc, snapc->seq, snapc->num_snaps); + if (last_snapc && snapc != last_snapc) { + /* if we switched to a newer snapc, restart our scan at the + * start of the original file range. */ + dout(" snapc differs from last pass, restarting at %lu\n", + index); + index = start; + } + last_snapc = snapc; + + while (!done && index <= end) { + unsigned i; + int first; + pgoff_t next; + int pvec_pages, locked_pages; + struct page *page; + int want; + u64 offset, len; + struct ceph_osd_request_head *reqhead; + struct ceph_osd_op *op; + long writeback_stat; + + next = 0; + locked_pages = 0; + max_pages = max_pages_ever; + +get_more_pages: + first = -1; + want = min(end - index, + min((pgoff_t)PAGEVEC_SIZE, + max_pages - (pgoff_t)locked_pages) - 1) + + 1; + pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + want); + dout("pagevec_lookup_tag got %d\n", pvec_pages); + if (!pvec_pages && !locked_pages) + break; + for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { + page = pvec.pages[i]; + dout("? %p idx %lu\n", page, page->index); + if (locked_pages == 0) + lock_page(page); /* first page */ + else if (!trylock_page(page)) + break; + + /* only dirty pages, or our accounting breaks */ + if (unlikely(!PageDirty(page)) || + unlikely(page->mapping != mapping)) { + dout("!dirty or !mapping %p\n", page); + unlock_page(page); + break; + } + if (!wbc->range_cyclic && page->index > end) { + dout("end of range %p\n", page); + done = 1; + unlock_page(page); + break; + } + if (next && (page->index != next)) { + dout("not consecutive %p\n", page); + unlock_page(page); + break; + } + if (wbc->sync_mode != WB_SYNC_NONE) { + dout("waiting on writeback %p\n", page); + wait_on_page_writeback(page); + } + if ((snap_size && page_offset(page) > snap_size) || + (!snap_size && + page_offset(page) > i_size_read(inode))) { + dout("%p page eof %llu\n", page, snap_size ? 
+ snap_size : i_size_read(inode)); + done = 1; + unlock_page(page); + break; + } + if (PageWriteback(page)) { + dout("%p under writeback\n", page); + unlock_page(page); + break; + } + + /* only if matching snap context */ + pgsnapc = (void *)page->private; + if (pgsnapc->seq > snapc->seq) { + dout("page snapc %p %lld > oldest %p %lld\n", + pgsnapc, pgsnapc->seq, snapc, snapc->seq); + unlock_page(page); + if (!locked_pages) + continue; /* keep looking for snap */ + break; + } + + if (!clear_page_dirty_for_io(page)) { + dout("%p !clear_page_dirty_for_io\n", page); + unlock_page(page); + break; + } + + /* ok */ + if (locked_pages == 0) { + /* prepare async write request */ + offset = (unsigned long long)page->index + << PAGE_CACHE_SHIFT; + len = wsize; + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, + ceph_vino(inode), + offset, &len, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, do_sync, + ci->i_truncate_seq, + ci->i_truncate_size, + &inode->i_mtime, true, 1, 0); + + if (!req) { + rc = -ENOMEM; + unlock_page(page); + break; + } + + max_pages = req->r_num_pages; + + alloc_page_vec(fsc, req); + req->r_callback = writepages_finish; + req->r_inode = inode; + } + + /* note position of first page in pvec */ + if (first < 0) + first = i; + dout("%p will write page %p idx %lu\n", + inode, page, page->index); + + writeback_stat = + atomic_long_inc_return(&fsc->writeback_count); + if (writeback_stat > CONGESTION_ON_THRESH( + fsc->mount_options->congestion_kb)) { + set_bdi_congested(&fsc->backing_dev_info, + BLK_RW_ASYNC); + } + + set_page_writeback(page); + req->r_pages[locked_pages] = page; + locked_pages++; + next = page->index + 1; + } + + /* did we get anything? */ + if (!locked_pages) + goto release_pvec_pages; + if (i) { + int j; + BUG_ON(!locked_pages || first < 0); + + if (pvec_pages && i == pvec_pages && + locked_pages < max_pages) { + dout("reached end pvec, trying for more\n"); + pagevec_reinit(&pvec); + goto get_more_pages; + } + + /* shift unused pages over in the pvec... we + * will need to release them below. */ + for (j = i; j < pvec_pages; j++) { + dout(" pvec leftover page %p\n", + pvec.pages[j]); + pvec.pages[j-i+first] = pvec.pages[j]; + } + pvec.nr -= i-first; + } + + /* submit the write */ + offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; + len = min((snap_size ? snap_size : i_size_read(inode)) - offset, + (u64)locked_pages << PAGE_CACHE_SHIFT); + dout("writepages got %d pages at %llu~%llu\n", + locked_pages, offset, len); + + /* revise final length, page count */ + req->r_num_pages = locked_pages; + reqhead = req->r_request->front.iov_base; + op = (void *)(reqhead + 1); + op->extent.length = cpu_to_le64(len); + op->payload_len = cpu_to_le32(len); + req->r_request->hdr.data_len = cpu_to_le32(len); + + rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); + BUG_ON(rc); + req = NULL; + + /* continue? */ + index = next; + wbc->nr_to_write -= locked_pages; + if (wbc->nr_to_write <= 0) + done = 1; + +release_pvec_pages: + dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, + pvec.nr ? 
pvec.pages[0] : NULL); + pagevec_release(&pvec); + + if (locked_pages && !done) + goto retry; + } + + if (should_loop && !done) { + /* more to do; loop back to beginning of file */ + dout("writepages looping back to beginning of file\n"); + should_loop = 0; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + +out: + if (req) + ceph_osdc_put_request(req); + ceph_put_snap_context(snapc); + dout("writepages done, rc = %d\n", rc); + return rc; +} + + + +/* + * See if a given @snapc is either writeable, or already written. + */ +static int context_is_writeable_or_written(struct inode *inode, + struct ceph_snap_context *snapc) +{ + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); + int ret = !oldest || snapc->seq <= oldest->seq; + + ceph_put_snap_context(oldest); + return ret; +} + +/* + * We are only allowed to write into/dirty the page if the page is + * clean, or already dirty within the same snap context. + * + * called with page locked. + * return success with page locked, + * or any failure (incl -EAGAIN) with page unlocked. + */ +static int ceph_update_writeable_page(struct file *file, + loff_t pos, unsigned len, + struct page *page) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + loff_t page_off = pos & PAGE_CACHE_MASK; + int pos_in_page = pos & ~PAGE_CACHE_MASK; + int end_in_page = pos_in_page + len; + loff_t i_size; + int r; + struct ceph_snap_context *snapc, *oldest; + +retry_locked: + /* writepages currently holds page lock, but if we change that later, */ + wait_on_page_writeback(page); + + /* check snap context */ + BUG_ON(!ci->i_snap_realm); + down_read(&mdsc->snap_rwsem); + BUG_ON(!ci->i_snap_realm->cached_context); + snapc = (void *)page->private; + if (snapc && snapc != ci->i_head_snapc) { + /* + * this page is already dirty in another (older) snap + * context! is it writeable now? + */ + oldest = get_oldest_context(inode, NULL); + up_read(&mdsc->snap_rwsem); + + if (snapc->seq > oldest->seq) { + ceph_put_snap_context(oldest); + dout(" page %p snapc %p not current or oldest\n", + page, snapc); + /* + * queue for writeback, and wait for snapc to + * be writeable or written + */ + snapc = ceph_get_snap_context(snapc); + unlock_page(page); + ceph_queue_writeback(inode); + r = wait_event_interruptible(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + if (r == -ERESTARTSYS) + return r; + return -EAGAIN; + } + ceph_put_snap_context(oldest); + + /* yay, writeable, do it now (without dropping page lock) */ + dout(" page %p snapc %p not current, but oldest\n", + page, snapc); + if (!clear_page_dirty_for_io(page)) + goto retry_locked; + r = writepage_nounlock(page, NULL); + if (r < 0) + goto fail_nosnap; + goto retry_locked; + } + + if (PageUptodate(page)) { + dout(" page %p already uptodate\n", page); + return 0; + } + + /* full page? */ + if (pos_in_page == 0 && len == PAGE_CACHE_SIZE) + return 0; + + /* past end of file? 
*/ + i_size = inode->i_size; /* caller holds i_mutex */ + + if (i_size + len > inode->i_sb->s_maxbytes) { + /* file is too big */ + r = -EINVAL; + goto fail; + } + + if (page_off >= i_size || + (pos_in_page == 0 && (pos+len) >= i_size && + end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { + dout(" zeroing %p 0 - %d and %d - %d\n", + page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE); + zero_user_segments(page, + 0, pos_in_page, + end_in_page, PAGE_CACHE_SIZE); + return 0; + } + + /* we need to read it. */ + up_read(&mdsc->snap_rwsem); + r = readpage_nounlock(file, page); + if (r < 0) + goto fail_nosnap; + goto retry_locked; + +fail: + up_read(&mdsc->snap_rwsem); +fail_nosnap: + unlock_page(page); + return r; +} + +/* + * We are only allowed to write into/dirty the page if the page is + * clean, or already dirty within the same snap context. + */ +static int ceph_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = file->f_dentry->d_inode; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int r; + + do { + /* get a page */ + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) + return -ENOMEM; + *pagep = page; + + dout("write_begin file %p inode %p page %p %d~%d\n", file, + inode, page, (int)pos, (int)len); + + r = ceph_update_writeable_page(file, pos, len, page); + } while (r == -EAGAIN); + + return r; +} + +/* + * we don't do anything in here that simple_write_end doesn't do + * except adjust dirty page accounting and drop read lock on + * mdsc->snap_rwsem. + */ +static int ceph_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int check_cap = 0; + + dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, + inode, page, (int)pos, (int)copied, (int)len); + + /* zero the stale part of the page if we did a short copy */ + if (copied < len) + zero_user_segment(page, from+copied, len); + + /* did file size increase? */ + /* (no need for i_size_read(); we caller holds i_mutex */ + if (pos+copied > inode->i_size) + check_cap = ceph_inode_set_size(inode, pos+copied); + + if (!PageUptodate(page)) + SetPageUptodate(page); + + set_page_dirty(page); + + unlock_page(page); + up_read(&mdsc->snap_rwsem); + page_cache_release(page); + + if (check_cap) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + + return copied; +} + +/* + * we set .direct_IO to indicate direct io is supported, but since we + * intercept O_DIRECT reads and writes early, this function should + * never get called. + */ +static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t pos, unsigned long nr_segs) +{ + WARN_ON(1); + return -EINVAL; +} + +const struct address_space_operations ceph_aops = { + .readpage = ceph_readpage, + .readpages = ceph_readpages, + .writepage = ceph_writepage, + .writepages = ceph_writepages_start, + .write_begin = ceph_write_begin, + .write_end = ceph_write_end, + .set_page_dirty = ceph_set_page_dirty, + .invalidatepage = ceph_invalidatepage, + .releasepage = ceph_releasepage, + .direct_IO = ceph_direct_io, +}; + + +/* + * vm ops + */ + +/* + * Reuse write_begin here for simplicity. 
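+ *
+ * The fault handler below locks the faulting page, checks that it is
+ * still within i_size and still attached to this mapping, and then
+ * calls ceph_update_writeable_page() to do the usual snap context
+ * checks.  On success the page is dirtied and left locked
+ * (VM_FAULT_LOCKED); -ENOMEM becomes VM_FAULT_OOM and any other error
+ * becomes VM_FAULT_SIGBUS.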
+ */ +static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct page *page = vmf->page; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + loff_t off = page->index << PAGE_CACHE_SHIFT; + loff_t size, len; + int ret; + + size = i_size_read(inode); + if (off + PAGE_CACHE_SIZE <= size) + len = PAGE_CACHE_SIZE; + else + len = size & ~PAGE_CACHE_MASK; + + dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, + off, len, page, page->index); + + lock_page(page); + + ret = VM_FAULT_NOPAGE; + if ((off > size) || + (page->mapping != inode->i_mapping)) + goto out; + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret == 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + up_read(&mdsc->snap_rwsem); + ret = VM_FAULT_LOCKED; + } else { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; + } +out: + dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); + if (ret != VM_FAULT_LOCKED) + unlock_page(page); + return ret; +} + +static struct vm_operations_struct ceph_vmops = { + .fault = filemap_fault, + .page_mkwrite = ceph_page_mkwrite, +}; + +int ceph_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ceph_vmops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c new file mode 100644 index 00000000..f605753c --- /dev/null +++ b/fs/ceph/caps.c @@ -0,0 +1,3087 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include +#include + +/* + * Capability management + * + * The Ceph metadata servers control client access to inode metadata + * and file data by issuing capabilities, granting clients permission + * to read and/or write both inode field and file data to OSDs + * (storage nodes). Each capability consists of a set of bits + * indicating which operations are allowed. + * + * If the client holds a *_SHARED cap, the client has a coherent value + * that can be safely read from the cached inode. + * + * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the + * client is allowed to change inode attributes (e.g., file size, + * mtime), note its dirty state in the ceph_cap, and asynchronously + * flush that metadata change to the MDS. + * + * In the event of a conflicting operation (perhaps by another + * client), the MDS will revoke the conflicting client capabilities. + * + * In order for a client to cache an inode, it must hold a capability + * with at least one MDS server. When inodes are released, release + * notifications are batched and periodically sent en masse to the MDS + * cluster to release server state. + */ + + +/* + * Generate readable cap strings for debugging output. 
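+ *
+ * Strings come from a small rotating pool of static buffers.  Each one
+ * encodes 'p' for PIN, then one block per cap group ('A'uth, 'L'ink,
+ * 'X'attr, 'F'ile), each followed by its granular bits: s=shared,
+ * x=excl, c=cache, r=rd, w=wr, b=buffer, l=lazyio.  For example,
+ * FILE_SHARED|FILE_RD renders as "Fsr", and an empty cap word as "-".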
+ */ +#define MAX_CAP_STR 20 +static char cap_str[MAX_CAP_STR][40]; +static DEFINE_SPINLOCK(cap_str_lock); +static int last_cap_str; + +static char *gcap_string(char *s, int c) +{ + if (c & CEPH_CAP_GSHARED) + *s++ = 's'; + if (c & CEPH_CAP_GEXCL) + *s++ = 'x'; + if (c & CEPH_CAP_GCACHE) + *s++ = 'c'; + if (c & CEPH_CAP_GRD) + *s++ = 'r'; + if (c & CEPH_CAP_GWR) + *s++ = 'w'; + if (c & CEPH_CAP_GBUFFER) + *s++ = 'b'; + if (c & CEPH_CAP_GLAZYIO) + *s++ = 'l'; + return s; +} + +const char *ceph_cap_string(int caps) +{ + int i; + char *s; + int c; + + spin_lock(&cap_str_lock); + i = last_cap_str++; + if (last_cap_str == MAX_CAP_STR) + last_cap_str = 0; + spin_unlock(&cap_str_lock); + + s = cap_str[i]; + + if (caps & CEPH_CAP_PIN) + *s++ = 'p'; + + c = (caps >> CEPH_CAP_SAUTH) & 3; + if (c) { + *s++ = 'A'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SLINK) & 3; + if (c) { + *s++ = 'L'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SXATTR) & 3; + if (c) { + *s++ = 'X'; + s = gcap_string(s, c); + } + + c = caps >> CEPH_CAP_SFILE; + if (c) { + *s++ = 'F'; + s = gcap_string(s, c); + } + + if (s == cap_str[i]) + *s++ = '-'; + *s = 0; + return cap_str[i]; +} + +void ceph_caps_init(struct ceph_mds_client *mdsc) +{ + INIT_LIST_HEAD(&mdsc->caps_list); + spin_lock_init(&mdsc->caps_list_lock); +} + +void ceph_caps_finalize(struct ceph_mds_client *mdsc) +{ + struct ceph_cap *cap; + + spin_lock(&mdsc->caps_list_lock); + while (!list_empty(&mdsc->caps_list)) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + mdsc->caps_total_count = 0; + mdsc->caps_avail_count = 0; + mdsc->caps_use_count = 0; + mdsc->caps_reserve_count = 0; + mdsc->caps_min_count = 0; + spin_unlock(&mdsc->caps_list_lock); +} + +void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) +{ + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_min_count += delta; + BUG_ON(mdsc->caps_min_count < 0); + spin_unlock(&mdsc->caps_list_lock); +} + +int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need) +{ + int i; + struct ceph_cap *cap; + int have; + int alloc = 0; + LIST_HEAD(newcaps); + int ret = 0; + + dout("reserve caps ctx=%p need=%d\n", ctx, need); + + /* first reserve any caps that are already allocated */ + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count >= need) + have = need; + else + have = mdsc->caps_avail_count; + mdsc->caps_avail_count -= have; + mdsc->caps_reserve_count += have; + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + + for (i = have; i < need; i++) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (!cap) { + ret = -ENOMEM; + goto out_alloc_count; + } + list_add(&cap->caps_item, &newcaps); + alloc++; + } + BUG_ON(have + alloc != need); + + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_total_count += alloc; + mdsc->caps_reserve_count += alloc; + list_splice(&newcaps, &mdsc->caps_list); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + + ctx->count = need; + dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", + ctx, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + return 0; + +out_alloc_count: + /* we didn't manage to reserve as much as we needed */ + 
pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have); + return ret; +} + +int ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) +{ + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); + if (ctx->count) { + spin_lock(&mdsc->caps_list_lock); + BUG_ON(mdsc->caps_reserve_count < ctx->count); + mdsc->caps_reserve_count -= ctx->count; + mdsc->caps_avail_count += ctx->count; + ctx->count = 0; + dout("unreserve caps %d = %d used + %d resv + %d avail\n", + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + } + return 0; +} + +static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) +{ + struct ceph_cap *cap = NULL; + + /* temporary, until we do something about cap import/export */ + if (!ctx) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (cap) { + mdsc->caps_use_count++; + mdsc->caps_total_count++; + } + return cap; + } + + spin_lock(&mdsc->caps_list_lock); + dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", + ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(!ctx->count); + BUG_ON(ctx->count > mdsc->caps_reserve_count); + BUG_ON(list_empty(&mdsc->caps_list)); + + ctx->count--; + mdsc->caps_reserve_count--; + mdsc->caps_use_count++; + + cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); + list_del(&cap->caps_item); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + return cap; +} + +void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) +{ + spin_lock(&mdsc->caps_list_lock); + dout("put_cap %p %d = %d used + %d resv + %d avail\n", + cap, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + mdsc->caps_use_count--; + /* + * Keep some preallocated caps around (ceph_min_count), to + * avoid lots of free/alloc churn. + */ + if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + + mdsc->caps_min_count) { + mdsc->caps_total_count--; + kmem_cache_free(ceph_cap_cachep, cap); + } else { + mdsc->caps_avail_count++; + list_add(&cap->caps_item, &mdsc->caps_list); + } + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); +} + +void ceph_reservation_status(struct ceph_fs_client *fsc, + int *total, int *avail, int *used, int *reserved, + int *min) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + if (total) + *total = mdsc->caps_total_count; + if (avail) + *avail = mdsc->caps_avail_count; + if (used) + *used = mdsc->caps_use_count; + if (reserved) + *reserved = mdsc->caps_reserve_count; + if (min) + *min = mdsc->caps_min_count; +} + +/* + * Find ceph_cap for given mds, if any. + * + * Called with i_lock held. 
+ */ +static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + struct rb_node *n = ci->i_caps.rb_node; + + while (n) { + cap = rb_entry(n, struct ceph_cap, ci_node); + if (mds < cap->mds) + n = n->rb_left; + else if (mds > cap->mds) + n = n->rb_right; + else + return cap; + } + return NULL; +} + +struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + + spin_lock(&ci->vfs_inode.i_lock); + cap = __get_cap_for_mds(ci, mds); + spin_unlock(&ci->vfs_inode.i_lock); + return cap; +} + +/* + * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. + */ +static int __ceph_get_cap_mds(struct ceph_inode_info *ci) +{ + struct ceph_cap *cap; + int mds = -1; + struct rb_node *p; + + /* prefer mds with WR|BUFFER|EXCL caps */ + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + mds = cap->mds; + if (cap->issued & (CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_BUFFER | + CEPH_CAP_FILE_EXCL)) + break; + } + return mds; +} + +int ceph_get_cap_mds(struct inode *inode) +{ + int mds; + spin_lock(&inode->i_lock); + mds = __ceph_get_cap_mds(ceph_inode(inode)); + spin_unlock(&inode->i_lock); + return mds; +} + +/* + * Called under i_lock. + */ +static void __insert_cap_node(struct ceph_inode_info *ci, + struct ceph_cap *new) +{ + struct rb_node **p = &ci->i_caps.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap *cap = NULL; + + while (*p) { + parent = *p; + cap = rb_entry(parent, struct ceph_cap, ci_node); + if (new->mds < cap->mds) + p = &(*p)->rb_left; + else if (new->mds > cap->mds) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->ci_node, parent, p); + rb_insert_color(&new->ci_node, &ci->i_caps); +} + +/* + * (re)set cap hold timeouts, which control the delayed release + * of unused caps back to the MDS. Should be called on cap use. + */ +static void __cap_set_timeouts(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + struct ceph_mount_options *ma = mdsc->fsc->mount_options; + + ci->i_hold_caps_min = round_jiffies(jiffies + + ma->caps_wanted_delay_min * HZ); + ci->i_hold_caps_max = round_jiffies(jiffies + + ma->caps_wanted_delay_max * HZ); + dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, + ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); +} + +/* + * (Re)queue cap at the end of the delayed cap release list. + * + * If I_FLUSH is set, leave the inode at the front of the list. + * + * Caller holds i_lock + * -> we take mdsc->cap_delay_lock + */ +static void __cap_delay_requeue(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + __cap_set_timeouts(mdsc, ci); + dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, + ci->i_ceph_flags, ci->i_hold_caps_max); + if (!mdsc->stopping) { + spin_lock(&mdsc->cap_delay_lock); + if (!list_empty(&ci->i_cap_delay_list)) { + if (ci->i_ceph_flags & CEPH_I_FLUSH) + goto no_change; + list_del_init(&ci->i_cap_delay_list); + } + list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); +no_change: + spin_unlock(&mdsc->cap_delay_lock); + } +} + +/* + * Queue an inode for immediate writeback. Mark inode with I_FLUSH, + * indicating we should send a cap message to flush dirty metadata + * asap, and move to the front of the delayed cap list. 
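+ *
+ * This is the non-blocking path used by ceph_write_inode() when the
+ * writeback control does not request WB_SYNC_ALL: instead of flushing
+ * synchronously, the dirty inode is moved to the head of the delay list
+ * so the delayed work picks it up promptly.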
+ */ +static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + spin_lock(&mdsc->cap_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Cancel delayed work on cap. + * + * Caller must hold i_lock. + */ +static void __cap_delay_cancel(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + if (list_empty(&ci->i_cap_delay_list)) + return; + spin_lock(&mdsc->cap_delay_lock); + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Common issue checks for add_cap, handle_cap_grant. + */ +static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, + unsigned issued) +{ + unsigned had = __ceph_caps_issued(ci, NULL); + + /* + * Each time we receive FILE_CACHE anew, we increment + * i_rdcache_gen. + */ + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + ci->i_rdcache_gen++; + + /* + * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * don't know what happened to this directory while we didn't + * have the cap. + */ + if ((issued & CEPH_CAP_FILE_SHARED) && + (had & CEPH_CAP_FILE_SHARED) == 0) { + ci->i_shared_gen++; + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + } + } +} + +/* + * Add a capability under the given MDS session. + * + * Caller should hold session snap_rwsem (read) and s_mutex. + * + * @fmode is the open file mode, if we are opening a file, otherwise + * it is < 0. (This is so we can atomically add the cap and add an + * open file reference to it.) + */ +int ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap_reservation *caps_reservation) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *new_cap = NULL; + struct ceph_cap *cap; + int mds = session->s_mds; + int actual_wanted; + + dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, + session->s_mds, cap_id, ceph_cap_string(issued), seq); + + /* + * If we are opening the file, include file mode wanted bits + * in wanted. + */ + if (fmode >= 0) + wanted |= ceph_caps_for_mode(fmode); + +retry: + spin_lock(&inode->i_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (new_cap) { + cap = new_cap; + new_cap = NULL; + } else { + spin_unlock(&inode->i_lock); + new_cap = get_cap(mdsc, caps_reservation); + if (new_cap == NULL) + return -ENOMEM; + goto retry; + } + + cap->issued = 0; + cap->implemented = 0; + cap->mds = mds; + cap->mds_wanted = 0; + + cap->ci = ci; + __insert_cap_node(ci, cap); + + /* clear out old exporting info? (i.e. 
on cap import) */ + if (ci->i_cap_exporting_mds == mds) { + ci->i_cap_exporting_issued = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_mds = -1; + } + + /* add to session cap list */ + cap->session = session; + spin_lock(&session->s_cap_lock); + list_add_tail(&cap->session_caps, &session->s_caps); + session->s_nr_caps++; + spin_unlock(&session->s_cap_lock); + } else if (new_cap) + ceph_put_cap(mdsc, new_cap); + + if (!ci->i_snap_realm) { + /* + * add this inode to the appropriate snap realm + */ + struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, + realmino); + if (realm) { + ceph_get_snap_realm(mdsc, realm); + spin_lock(&realm->inodes_with_caps_lock); + ci->i_snap_realm = realm; + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + spin_unlock(&realm->inodes_with_caps_lock); + } else { + pr_err("ceph_add_cap: couldn't find snap realm %llx\n", + realmino); + WARN_ON(!realm); + } + } + + __check_cap_issue(ci, cap, issued); + + /* + * If we are issued caps we don't want, or the mds' wanted + * value appears to be off, queue a check so we'll release + * later and/or update the mds wanted value. + */ + actual_wanted = __ceph_caps_wanted(ci); + if ((wanted & ~actual_wanted) || + (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { + dout(" issued %s, mds wanted %s, actual %s, queueing\n", + ceph_cap_string(issued), ceph_cap_string(wanted), + ceph_cap_string(actual_wanted)); + __cap_delay_requeue(mdsc, ci); + } + + if (flags & CEPH_CAP_FLAG_AUTH) + ci->i_auth_cap = cap; + else if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(issued), + ceph_cap_string(issued|cap->issued), seq, mds); + cap->cap_id = cap_id; + cap->issued = issued; + cap->implemented |= issued; + cap->mds_wanted |= wanted; + cap->seq = seq; + cap->issue_seq = seq; + cap->mseq = mseq; + cap->cap_gen = session->s_cap_gen; + + if (fmode >= 0) + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + wake_up_all(&ci->i_cap_wq); + return 0; +} + +/* + * Return true if cap has not timed out and belongs to the current + * generation of the MDS session (i.e. has not gone 'stale' due to + * us losing touch with the mds). + */ +static int __cap_is_valid(struct ceph_cap *cap) +{ + unsigned long ttl; + u32 gen; + + spin_lock(&cap->session->s_cap_lock); + gen = cap->session->s_cap_gen; + ttl = cap->session->s_cap_ttl; + spin_unlock(&cap->session->s_cap_lock); + + if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { + dout("__cap_is_valid %p cap %p issued %s " + "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); + return 0; + } + + return 1; +} + +/* + * Return set of valid cap bits issued to us. Note that caps time + * out, and may be invalidated in bulk if the client session times out + * and session->s_cap_gen is bumped. 
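+ *
+ * The result is the union of i_snap_caps, any caps mid-export
+ * (i_cap_exporting_issued) and the issued bits of every cap that is
+ * still valid per __cap_is_valid().  If @implemented is non-NULL it is
+ * filled with the union of the corresponding implemented bits.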
+ */ +int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) +{ + int have = ci->i_snap_caps | ci->i_cap_exporting_issued; + struct ceph_cap *cap; + struct rb_node *p; + + if (implemented) + *implemented = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + dout("__ceph_caps_issued %p cap %p issued %s\n", + &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + have |= cap->issued; + if (implemented) + *implemented |= cap->implemented; + } + return have; +} + +/* + * Get cap bits issued by caps other than @ocap + */ +int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) +{ + int have = ci->i_snap_caps; + struct ceph_cap *cap; + struct rb_node *p; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (cap == ocap) + continue; + if (!__cap_is_valid(cap)) + continue; + have |= cap->issued; + } + return have; +} + +/* + * Move a cap to the end of the LRU (oldest caps at list head, newest + * at list tail). + */ +static void __touch_cap(struct ceph_cap *cap) +{ + struct ceph_mds_session *s = cap->session; + + spin_lock(&s->s_cap_lock); + if (s->s_cap_iterator == NULL) { + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + s->s_mds); + list_move_tail(&cap->session_caps, &s->s_caps); + } else { + dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", + &cap->ci->vfs_inode, cap, s->s_mds); + } + spin_unlock(&s->s_cap_lock); +} + +/* + * Check if we hold the given mask. If so, move the cap(s) to the + * front of their respective LRUs. (This is the preferred way for + * callers to check for caps they want.) + */ +int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) +{ + struct ceph_cap *cap; + struct rb_node *p; + int have = ci->i_snap_caps; + + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p snap issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(have), + ceph_cap_string(mask)); + return 1; + } + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + if ((cap->issued & mask) == mask) { + dout("__ceph_caps_issued_mask %p cap %p issued %s" + " (mask %s)\n", &ci->vfs_inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) + __touch_cap(cap); + return 1; + } + + /* does a combination of caps satisfy mask? */ + have |= cap->issued; + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p combo issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) { + struct rb_node *q; + + /* touch this + preceding caps */ + __touch_cap(cap); + for (q = rb_first(&ci->i_caps); q != p; + q = rb_next(q)) { + cap = rb_entry(q, struct ceph_cap, + ci_node); + if (!__cap_is_valid(cap)) + continue; + __touch_cap(cap); + } + } + return 1; + } + } + + return 0; +} + +/* + * Return true if mask caps are currently being revoked by an MDS. 
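+ *
+ * A bit is "revoking" when some still-valid cap has it implemented but
+ * no longer issued, i.e. the MDS has asked for it back and we have not
+ * yet completed the revocation.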
+ */ +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct rb_node *p; + int ret = 0; + + spin_lock(&inode->i_lock); + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (__cap_is_valid(cap) && + (cap->implemented & ~cap->issued & mask)) { + ret = 1; + break; + } + } + spin_unlock(&inode->i_lock); + dout("ceph_caps_revoking %p %s = %d\n", inode, + ceph_cap_string(mask), ret); + return ret; +} + +int __ceph_caps_used(struct ceph_inode_info *ci) +{ + int used = 0; + if (ci->i_pin_ref) + used |= CEPH_CAP_PIN; + if (ci->i_rd_ref) + used |= CEPH_CAP_FILE_RD; + if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) + used |= CEPH_CAP_FILE_CACHE; + if (ci->i_wr_ref) + used |= CEPH_CAP_FILE_WR; + if (ci->i_wb_ref || ci->i_wrbuffer_ref) + used |= CEPH_CAP_FILE_BUFFER; + return used; +} + +/* + * wanted, by virtue of open file modes + */ +int __ceph_caps_file_wanted(struct ceph_inode_info *ci) +{ + int want = 0; + int mode; + for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) + if (ci->i_nr_by_mode[mode]) + want |= ceph_caps_for_mode(mode); + return want; +} + +/* + * Return caps we have registered with the MDS(s) as 'wanted'. + */ +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) +{ + struct ceph_cap *cap; + struct rb_node *p; + int mds_wanted = 0; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + mds_wanted |= cap->mds_wanted; + } + return mds_wanted; +} + +/* + * called under i_lock + */ +static int __ceph_is_any_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; +} + +/* + * Remove a cap. Take steps to deal with a racing iterate_session_caps. + * + * caller should hold i_lock. + * caller will not hold session s_mutex if called from destroy_inode. + */ +void __ceph_remove_cap(struct ceph_cap *cap) +{ + struct ceph_mds_session *session = cap->session; + struct ceph_inode_info *ci = cap->ci; + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + int removed = 0; + + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + + /* remove from session list */ + spin_lock(&session->s_cap_lock); + if (session->s_cap_iterator == cap) { + /* not yet, we are iterating over this very cap */ + dout("__ceph_remove_cap delaying %p removal from session %p\n", + cap, cap->session); + } else { + list_del_init(&cap->session_caps); + session->s_nr_caps--; + cap->session = NULL; + removed = 1; + } + /* protect backpointer with s_cap_lock: see iterate_session_caps */ + cap->ci = NULL; + spin_unlock(&session->s_cap_lock); + + /* remove from inode list */ + rb_erase(&cap->ci_node, &ci->i_caps); + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + if (removed) + ceph_put_cap(mdsc, cap); + + if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } + if (!__ceph_is_any_real_caps(ci)) + __cap_delay_cancel(mdsc, ci); +} + +/* + * Build and send a cap message to the given MDS. + * + * Caller should be holding s_mutex. 
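+ *
+ * This allocates a CEPH_MSG_CLIENT_CAPS message, fills the
+ * ceph_mds_caps payload (cap id, op, seq numbers, caps/wanted/dirty
+ * masks, size, timestamps and ownership), optionally attaches the
+ * xattr blob as the message "middle", and queues it on the session's
+ * connection.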
+ */ +static int send_cap_msg(struct ceph_mds_session *session, + u64 ino, u64 cid, int op, + int caps, int wanted, int dirty, + u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, + u64 size, u64 max_size, + struct timespec *mtime, struct timespec *atime, + u64 time_warp_seq, + uid_t uid, gid_t gid, mode_t mode, + u64 xattr_version, + struct ceph_buffer *xattrs_buf, + u64 follows) +{ + struct ceph_mds_caps *fc; + struct ceph_msg *msg; + + dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" + " seq %u/%u mseq %u follows %lld size %llu/%llu" + " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), + cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), + ceph_cap_string(dirty), + seq, issue_seq, mseq, follows, size, max_size, + xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + if (!msg) + return -ENOMEM; + + msg->hdr.tid = cpu_to_le64(flush_tid); + + fc = msg->front.iov_base; + memset(fc, 0, sizeof(*fc)); + + fc->cap_id = cpu_to_le64(cid); + fc->op = cpu_to_le32(op); + fc->seq = cpu_to_le32(seq); + fc->issue_seq = cpu_to_le32(issue_seq); + fc->migrate_seq = cpu_to_le32(mseq); + fc->caps = cpu_to_le32(caps); + fc->wanted = cpu_to_le32(wanted); + fc->dirty = cpu_to_le32(dirty); + fc->ino = cpu_to_le64(ino); + fc->snap_follows = cpu_to_le64(follows); + + fc->size = cpu_to_le64(size); + fc->max_size = cpu_to_le64(max_size); + if (mtime) + ceph_encode_timespec(&fc->mtime, mtime); + if (atime) + ceph_encode_timespec(&fc->atime, atime); + fc->time_warp_seq = cpu_to_le32(time_warp_seq); + + fc->uid = cpu_to_le32(uid); + fc->gid = cpu_to_le32(gid); + fc->mode = cpu_to_le32(mode); + + fc->xattr_version = cpu_to_le64(xattr_version); + if (xattrs_buf) { + msg->middle = ceph_buffer_get(xattrs_buf); + fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); + msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); + } + + ceph_con_send(&session->s_con, msg); + return 0; +} + +static void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + + spin_lock(&session->s_cap_lock); + BUG_ON(!session->s_num_cap_releases); + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + + dout(" adding %llx release to mds%d msg %p (%d left)\n", + ino, session->s_mds, msg, session->s_num_cap_releases); + + BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); + head = msg->front.iov_base; + head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(ino); + item->cap_id = cpu_to_le64(cap_id); + item->migrate_seq = cpu_to_le32(migrate_seq); + item->seq = cpu_to_le32(issue_seq); + + session->s_num_cap_releases--; + + msg->front.iov_len += sizeof(*item); + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + dout(" release msg %p full\n", msg); + list_move_tail(&msg->list_head, &session->s_cap_releases_done); + } else { + dout(" release msg %p at %d/%d (%d)\n", msg, + (int)le32_to_cpu(head->num), + (int)CEPH_CAPS_PER_RELEASE, + (int)msg->front.iov_len); + } + spin_unlock(&session->s_cap_lock); +} + +/* + * Queue cap releases when an inode is dropped from our cache. Since + * inode is about to be destroyed, there is no need for i_lock. 
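+ *
+ * Walk the inode's cap rb-tree, append one release item per cap to the
+ * owning session's pending release message via __queue_cap_release(),
+ * then drop the cap itself with __ceph_remove_cap().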
+ */ +void ceph_queue_caps_release(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct rb_node *p; + + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + struct ceph_mds_session *session = cap->session; + + __queue_cap_release(session, ceph_ino(inode), cap->cap_id, + cap->mseq, cap->issue_seq); + p = rb_next(p); + __ceph_remove_cap(cap); + } +} + +/* + * Send a cap msg on the given inode. Update our caps state, then + * drop i_lock and send the message. + * + * Make note of max_size reported/requested from mds, revoked caps + * that have now been implemented. + * + * Make half-hearted attempt ot to invalidate page cache if we are + * dropping RDCACHE. Note that this will leave behind locked pages + * that we'll then need to deal with elsewhere. + * + * Return non-zero if delayed release, or we experienced an error + * such that the caller should requeue + retry later. + * + * called with i_lock, then drops it. + * caller should hold snap_rwsem (read), s_mutex. + */ +static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, + int op, int used, int want, int retain, int flushing, + unsigned *pflush_tid) + __releases(cap->ci->vfs_inode->i_lock) +{ + struct ceph_inode_info *ci = cap->ci; + struct inode *inode = &ci->vfs_inode; + u64 cap_id = cap->cap_id; + int held, revoking, dropping, keep; + u64 seq, issue_seq, mseq, time_warp_seq, follows; + u64 size, max_size; + struct timespec mtime, atime; + int wake = 0; + mode_t mode; + uid_t uid; + gid_t gid; + struct ceph_mds_session *session; + u64 xattr_version = 0; + struct ceph_buffer *xattr_blob = NULL; + int delayed = 0; + u64 flush_tid = 0; + int i; + int ret; + + held = cap->issued | cap->implemented; + revoking = cap->implemented & ~cap->issued; + retain &= ~revoking; + dropping = cap->issued & ~retain; + + dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", + inode, cap, cap->session, + ceph_cap_string(held), ceph_cap_string(held & retain), + ceph_cap_string(revoking)); + BUG_ON((retain & CEPH_CAP_PIN) == 0); + + session = cap->session; + + /* don't release wanted unless we've waited a bit. */ + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && + time_before(jiffies, ci->i_hold_caps_min)) { + dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & retain), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(want)); + want |= cap->mds_wanted; + retain |= cap->issued; + delayed = 1; + } + ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); + + cap->issued &= retain; /* drop bits we don't want */ + if (cap->implemented & ~cap->issued) { + /* + * Wake up any waiters on wanted -> needed transition. + * This is due to the weird transition from buffered + * to sync IO... we need to flush dirty pages _before_ + * allowing sync writes to avoid reordering. + */ + wake = 1; + } + cap->implemented &= cap->issued | used; + cap->mds_wanted = want; + + if (flushing) { + /* + * assign a tid for flush operations so we can avoid + * flush1 -> dirty1 -> flush2 -> flushack1 -> mark + * clean type races. track latest tid for every bit + * so we can handle flush AxFw, flush Fw, and have the + * first ack clean Ax. 
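+ *
+ * Concretely: flushing Ax|Fw stamps the new tid into both per-bit
+ * slots; a later flush of just Fw overwrites only the Fw slot with a
+ * newer tid, so the ack for the first flush can be matched against Ax
+ * without prematurely treating Fw as clean.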
+ */ + flush_tid = ++ci->i_cap_flush_last_tid; + if (pflush_tid) + *pflush_tid = flush_tid; + dout(" cap_flush_tid %d\n", (int)flush_tid); + for (i = 0; i < CEPH_CAP_BITS; i++) + if (flushing & (1 << i)) + ci->i_cap_flush_tid[i] = flush_tid; + + follows = ci->i_head_snapc->seq; + } else { + follows = 0; + } + + keep = cap->implemented; + seq = cap->seq; + issue_seq = cap->issue_seq; + mseq = cap->mseq; + size = inode->i_size; + ci->i_reported_size = size; + max_size = ci->i_wanted_max_size; + ci->i_requested_max_size = max_size; + mtime = inode->i_mtime; + atime = inode->i_atime; + time_warp_seq = ci->i_time_warp_seq; + uid = inode->i_uid; + gid = inode->i_gid; + mode = inode->i_mode; + + if (flushing & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + xattr_blob = ci->i_xattrs.blob; + xattr_version = ci->i_xattrs.version; + } + + spin_unlock(&inode->i_lock); + + ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, + op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, + size, max_size, &mtime, &atime, time_warp_seq, + uid, gid, mode, xattr_version, xattr_blob, + follows); + if (ret < 0) { + dout("error sending cap msg, must requeue %p\n", inode); + delayed = 1; + } + + if (wake) + wake_up_all(&ci->i_cap_wq); + + return delayed; +} + +/* + * When a snapshot is taken, clients accumulate dirty metadata on + * inodes with capabilities in ceph_cap_snaps to describe the file + * state at the time the snapshot was taken. This must be flushed + * asynchronously back to the MDS once sync writes complete and dirty + * data is written out. + * + * Unless @again is true, skip cap_snaps that were already sent to + * the MDS (i.e., during this session). + * + * Called under i_lock. Takes s_mutex as needed. + */ +void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession, + int again) + __releases(ci->vfs_inode->i_lock) + __acquires(ci->vfs_inode->i_lock) +{ + struct inode *inode = &ci->vfs_inode; + int mds; + struct ceph_cap_snap *capsnap; + u32 mseq; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *session = NULL; /* if session != NULL, we hold + session->s_mutex */ + u64 next_follows = 0; /* keep track of how far we've gotten through the + i_cap_snaps list, and skip these entries next time + around to avoid an infinite loop */ + + if (psession) + session = *psession; + + dout("__flush_snaps %p\n", inode); +retry: + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + /* avoid an infiniute loop after retry */ + if (capsnap->follows < next_follows) + continue; + /* + * we need to wait for sync writes to complete and for dirty + * pages to be written out. + */ + if (capsnap->dirty_pages || capsnap->writing) + break; + + /* + * if cap writeback already occurred, we should have dropped + * the capsnap in ceph_put_wrbuffer_cap_refs. 
+ */ + BUG_ON(capsnap->dirty == 0); + + /* pick mds, take s_mutex */ + if (ci->i_auth_cap == NULL) { + dout("no auth cap (migrating?), doing nothing\n"); + goto out; + } + + /* only flush each capsnap once */ + if (!again && !list_empty(&capsnap->flushing_item)) { + dout("already flushed %p, skipping\n", capsnap); + continue; + } + + mds = ci->i_auth_cap->session->s_mds; + mseq = ci->i_auth_cap->mseq; + + if (session && session->s_mds != mds) { + dout("oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + session = NULL; + } + if (!session) { + spin_unlock(&inode->i_lock); + mutex_lock(&mdsc->mutex); + session = __ceph_lookup_mds_session(mdsc, mds); + mutex_unlock(&mdsc->mutex); + if (session) { + dout("inverting session/ino locks on %p\n", + session); + mutex_lock(&session->s_mutex); + } + /* + * if session == NULL, we raced against a cap + * deletion or migration. retry, and we'll + * get a better @mds value next time. + */ + spin_lock(&inode->i_lock); + goto retry; + } + + capsnap->flush_tid = ++ci->i_cap_flush_last_tid; + atomic_inc(&capsnap->nref); + if (!list_empty(&capsnap->flushing_item)) + list_del_init(&capsnap->flushing_item); + list_add_tail(&capsnap->flushing_item, + &session->s_cap_snaps_flushing); + spin_unlock(&inode->i_lock); + + dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", + inode, capsnap, capsnap->follows, capsnap->flush_tid); + send_cap_msg(session, ceph_vino(inode).ino, 0, + CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, + capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, + capsnap->size, 0, + &capsnap->mtime, &capsnap->atime, + capsnap->time_warp_seq, + capsnap->uid, capsnap->gid, capsnap->mode, + capsnap->xattr_version, capsnap->xattr_blob, + capsnap->follows); + + next_follows = capsnap->follows + 1; + ceph_put_cap_snap(capsnap); + + spin_lock(&inode->i_lock); + goto retry; + } + + /* we flushed them all; remove this inode from the queue */ + spin_lock(&mdsc->snap_flush_lock); + list_del_init(&ci->i_snap_flush_item); + spin_unlock(&mdsc->snap_flush_lock); + +out: + if (psession) + *psession = session; + else if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } +} + +static void ceph_flush_snaps(struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + + spin_lock(&inode->i_lock); + __ceph_flush_snaps(ci, NULL, 0); + spin_unlock(&inode->i_lock); +} + +/* + * Mark caps dirty. If inode is newly dirty, return the dirty flags. + * Caller is then responsible for calling __mark_inode_dirty with the + * returned flags value. 
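+ *
+ * On the first dirty bit we pin the head snap context, add the inode to
+ * mdsc->cap_dirty and, if nothing is already flushing, take an inode
+ * reference; the returned I_DIRTY_SYNC / I_DIRTY_DATASYNC flags tell
+ * the caller how to mark the VFS inode dirty.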
+ */ +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +{ + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct inode *inode = &ci->vfs_inode; + int was = ci->i_dirty_caps; + int dirty = 0; + + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, + ceph_cap_string(mask), ceph_cap_string(was), + ceph_cap_string(was | mask)); + ci->i_dirty_caps |= mask; + if (was == 0) { + if (!ci->i_head_snapc) + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, + ci->i_head_snapc); + BUG_ON(!list_empty(&ci->i_dirty_item)); + spin_lock(&mdsc->cap_dirty_lock); + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + spin_unlock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + ihold(inode); + dirty |= I_DIRTY_SYNC; + } + } + BUG_ON(list_empty(&ci->i_dirty_item)); + if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && + (mask & CEPH_CAP_FILE_BUFFER)) + dirty |= I_DIRTY_DATASYNC; + __cap_delay_requeue(mdsc, ci); + return dirty; +} + +/* + * Add dirty inode to the flushing list. Assigned a seq number so we + * can wait for caps to flush without starving. + * + * Called under i_lock. + */ +static int __mark_caps_flushing(struct inode *inode, + struct ceph_mds_session *session) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int flushing; + + BUG_ON(ci->i_dirty_caps == 0); + BUG_ON(list_empty(&ci->i_dirty_item)); + + flushing = ci->i_dirty_caps; + dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", + ceph_cap_string(flushing), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps | flushing)); + ci->i_flushing_caps |= flushing; + ci->i_dirty_caps = 0; + dout(" inode %p now !dirty\n", inode); + + spin_lock(&mdsc->cap_dirty_lock); + list_del_init(&ci->i_dirty_item); + + ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; + if (list_empty(&ci->i_flushing_item)) { + list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); + mdsc->num_cap_flushing++; + dout(" inode %p now flushing seq %lld\n", inode, + ci->i_cap_flush_seq); + } else { + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); + dout(" inode %p now flushing (more) seq %lld\n", inode, + ci->i_cap_flush_seq); + } + spin_unlock(&mdsc->cap_dirty_lock); + + return flushing; +} + +/* + * try to invalidate mapping pages without blocking. + */ +static int try_nonblocking_invalidate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u32 invalidating_gen = ci->i_rdcache_gen; + + spin_unlock(&inode->i_lock); + invalidate_mapping_pages(&inode->i_data, 0, -1); + spin_lock(&inode->i_lock); + + if (inode->i_data.nrpages == 0 && + invalidating_gen == ci->i_rdcache_gen) { + /* success. */ + dout("try_nonblocking_invalidate %p success\n", inode); + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; + return 0; + } + dout("try_nonblocking_invalidate %p failed\n", inode); + return -1; +} + +/* + * Swiss army knife function to examine currently used and wanted + * versus held caps. Release, flush, ack revoked caps to mds as + * appropriate. + * + * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay + * cap release further. + * CHECK_CAPS_AUTHONLY - we should only check the auth cap + * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without + * further delay. 
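+ *
+ * A typical caller is the write path: ceph_write_end() above invokes
+ * ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL) when a size change may
+ * need to be reported to the auth MDS.  @session may be NULL; the
+ * appropriate session mutex is then taken (and the scan retried)
+ * internally.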
+ */ +void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + int file_wanted, used; + int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ + int issued, implemented, want, retain, revoking, flushing = 0; + int mds = -1; /* keep track of how far we've gone through i_caps list + to avoid an infinite loop on retry */ + struct rb_node *p; + int tried_invalidate = 0; + int delayed = 0, sent = 0, force_requeue = 0, num; + int queue_invalidate = 0; + int is_delayed = flags & CHECK_CAPS_NODELAY; + + /* if we are unmounting, flush any unused caps immediately. */ + if (mdsc->stopping) + is_delayed = 1; + + spin_lock(&inode->i_lock); + + if (ci->i_ceph_flags & CEPH_I_FLUSH) + flags |= CHECK_CAPS_FLUSH; + + /* flush snaps first time around only */ + if (!list_empty(&ci->i_cap_snaps)) + __ceph_flush_snaps(ci, &session, 0); + goto retry_locked; +retry: + spin_lock(&inode->i_lock); +retry_locked: + file_wanted = __ceph_caps_file_wanted(ci); + used = __ceph_caps_used(ci); + want = file_wanted | used; + issued = __ceph_caps_issued(ci, &implemented); + revoking = implemented & ~issued; + + retain = want | CEPH_CAP_PIN; + if (!mdsc->stopping && inode->i_nlink > 0) { + if (want) { + retain |= CEPH_CAP_ANY; /* be greedy */ + } else { + retain |= CEPH_CAP_ANY_SHARED; + /* + * keep RD only if we didn't have the file open RW, + * because then the mds would revoke it anyway to + * journal max_size=0. + */ + if (ci->i_max_size == 0) + retain |= CEPH_CAP_ANY_RD; + } + } + + dout("check_caps %p file_want %s used %s dirty %s flushing %s" + " issued %s revoking %s retain %s %s%s%s\n", inode, + ceph_cap_string(file_wanted), + ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(issued), ceph_cap_string(revoking), + ceph_cap_string(retain), + (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", + (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); + + /* + * If we no longer need to hold onto old our caps, and we may + * have cached pages, but don't want them, then try to invalidate. + * If we fail, it's because pages are locked.... try again later. + */ + if ((!is_delayed || mdsc->stopping) && + ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + inode->i_data.nrpages && /* have cached pages */ + (file_wanted == 0 || /* no open files */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ + !tried_invalidate) { + dout("check_caps trying to invalidate on %p\n", inode); + if (try_nonblocking_invalidate(inode) < 0) { + if (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) { + dout("check_caps queuing invalidate\n"); + queue_invalidate = 1; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } else { + dout("check_caps failed to invalidate pages\n"); + /* we failed to invalidate pages. check these + caps again later. 
*/ + force_requeue = 1; + __cap_set_timeouts(mdsc, ci); + } + } + tried_invalidate = 1; + goto retry_locked; + } + + num = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + num++; + + /* avoid looping forever */ + if (mds >= cap->mds || + ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) + continue; + + /* NOTE: no side-effects allowed, until we take s_mutex */ + + revoking = cap->implemented & ~cap->issued; + dout(" mds%d cap %p issued %s implemented %s revoking %s\n", + cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented), + ceph_cap_string(revoking)); + + if (cap == ci->i_auth_cap && + (cap->issued & CEPH_CAP_FILE_WR)) { + /* request larger max_size from MDS? */ + if (ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) { + dout("requesting new max_size\n"); + goto ack; + } + + /* approaching file_max? */ + if ((inode->i_size << 1) >= ci->i_max_size && + (ci->i_reported_size << 1) < ci->i_max_size) { + dout("i_size approaching max_size\n"); + goto ack; + } + } + /* flush anything dirty? */ + if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && + ci->i_dirty_caps) { + dout("flushing dirty caps\n"); + goto ack; + } + + /* completed revocation? going down and there are no caps? */ + if (revoking && (revoking & used) == 0) { + dout("completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* want more caps from mds? */ + if (want & ~(cap->mds_wanted | cap->issued)) + goto ack; + + /* things we might delay */ + if ((cap->issued & ~retain) == 0 && + cap->mds_wanted == want) + continue; /* nope, all good */ + + if (is_delayed) + goto ack; + + /* delay? */ + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && + time_before(jiffies, ci->i_hold_caps_max)) { + dout(" delaying issued %s -> %s, wanted %s -> %s\n", + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & retain), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(want)); + delayed++; + continue; + } + +ack: + if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + dout(" skipping %p I_NOFLUSH set\n", inode); + continue; + } + + if (session && session != cap->session) { + dout("oops, wrong session %p mutex\n", session); + mutex_unlock(&session->s_mutex); + session = NULL; + } + if (!session) { + session = cap->session; + if (mutex_trylock(&session->s_mutex) == 0) { + dout("inverting session/ino locks on %p\n", + session); + spin_unlock(&inode->i_lock); + if (took_snap_rwsem) { + up_read(&mdsc->snap_rwsem); + took_snap_rwsem = 0; + } + mutex_lock(&session->s_mutex); + goto retry; + } + } + /* take snap_rwsem after session mutex */ + if (!took_snap_rwsem) { + if (down_read_trylock(&mdsc->snap_rwsem) == 0) { + dout("inverting snap/in locks on %p\n", + inode); + spin_unlock(&inode->i_lock); + down_read(&mdsc->snap_rwsem); + took_snap_rwsem = 1; + goto retry; + } + took_snap_rwsem = 1; + } + + if (cap == ci->i_auth_cap && ci->i_dirty_caps) + flushing = __mark_caps_flushing(inode, session); + else + flushing = 0; + + mds = cap->mds; /* remember mds, so we don't repeat */ + sent++; + + /* __send_cap drops i_lock */ + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, + retain, flushing, NULL); + goto retry; /* retake i_lock and restart our cap scan. */ + } + + /* + * Reschedule delayed caps release if we delayed anything, + * otherwise cancel. 
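+ *
+ * That is: a delayed-work call that still delayed something forces a
+ * requeue; a non-delayed call that delayed nothing cancels the timer;
+ * any other non-delayed call (or a forced requeue) puts the inode back
+ * on the delay list with fresh timeouts, while a delayed-work call that
+ * delayed nothing leaves things as they are.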
+ */ + if (delayed && is_delayed) + force_requeue = 1; /* __send_cap delayed release; requeue */ + if (!delayed && !is_delayed) + __cap_delay_cancel(mdsc, ci); + else if (!is_delayed || force_requeue) + __cap_delay_requeue(mdsc, ci); + + spin_unlock(&inode->i_lock); + + if (queue_invalidate) + ceph_queue_invalidate(inode); + + if (session) + mutex_unlock(&session->s_mutex); + if (took_snap_rwsem) + up_read(&mdsc->snap_rwsem); +} + +/* + * Try to flush dirty caps back to the auth mds. + */ +static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, + unsigned *flush_tid) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int unlock_session = session ? 0 : 1; + int flushing = 0; + +retry: + spin_lock(&inode->i_lock); + if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); + goto out; + } + if (ci->i_dirty_caps && ci->i_auth_cap) { + struct ceph_cap *cap = ci->i_auth_cap; + int used = __ceph_caps_used(ci); + int want = __ceph_caps_wanted(ci); + int delayed; + + if (!session) { + spin_unlock(&inode->i_lock); + session = cap->session; + mutex_lock(&session->s_mutex); + goto retry; + } + BUG_ON(session != cap->session); + if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) + goto out; + + flushing = __mark_caps_flushing(inode, session); + + /* __send_cap drops i_lock */ + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, + cap->issued | cap->implemented, flushing, + flush_tid); + if (!delayed) + goto out_unlocked; + + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + } +out: + spin_unlock(&inode->i_lock); +out_unlocked: + if (session && unlock_session) + mutex_unlock(&session->s_mutex); + return flushing; +} + +/* + * Return true if we've flushed caps through the given flush_tid. + */ +static int caps_are_flushed(struct inode *inode, unsigned tid) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int i, ret = 1; + + spin_lock(&inode->i_lock); + for (i = 0; i < CEPH_CAP_BITS; i++) + if ((ci->i_flushing_caps & (1 << i)) && + ci->i_cap_flush_tid[i] <= tid) { + /* still flushing this bit */ + ret = 0; + break; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* + * Wait on any unsafe replies for the given inode. First wait on the + * newest request, and make that the upper bound. Then, if there are + * more requests, keep waiting on the oldest as long as it is still older + * than the original request. 
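+ *
+ * In other words: the first wait is on the newest request, whose tid
+ * becomes the bound; after that we keep waiting on whichever request is
+ * oldest, re-checking the (possibly shrinking) list each time, until
+ * the oldest remaining tid is no longer below that bound.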
+ */ +static void sync_write_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_writes; + struct ceph_osd_request *req; + u64 last_tid; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + /* set upper bound as _last_ entry in chain */ + req = list_entry(head->prev, struct ceph_osd_request, + r_unsafe_item); + last_tid = req->r_tid; + + do { + ceph_osdc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + dout("sync_write_wait on tid %llu (until %llu)\n", + req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + spin_lock(&ci->i_unsafe_lock); + ceph_osdc_put_request(req); + + /* + * from here on look at first entry in chain, since we + * only want to wait for anything older than last_tid + */ + if (list_empty(head)) + break; + req = list_entry(head->next, struct ceph_osd_request, + r_unsafe_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); +} + +int ceph_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned flush_tid; + int ret; + int dirty; + + dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); + sync_write_wait(inode); + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret < 0) + return ret; + + dirty = try_flush_caps(inode, NULL, &flush_tid); + dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + + /* + * only wait on non-file metadata writeback (the mds + * can recover size and mtime, so we don't need to + * wait for that) + */ + if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + dout("fsync waiting for flush_tid %u\n", flush_tid); + ret = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } + + dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); + return ret; +} + +/* + * Flush any dirty caps back to the mds. If we aren't asked to wait, + * queue inode for flush but don't do so immediately, because we can + * get by with fewer MDS messages if we wait for data writeback to + * complete first. + */ +int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned flush_tid; + int err = 0; + int dirty; + int wait = wbc->sync_mode == WB_SYNC_ALL; + + dout("write_inode %p wait=%d\n", inode, wait); + if (wait) { + dirty = try_flush_caps(inode, NULL, &flush_tid); + if (dirty) + err = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); + } else { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&inode->i_lock); + if (__ceph_caps_dirty(ci)) + __cap_delay_requeue_front(mdsc, ci); + spin_unlock(&inode->i_lock); + } + return err; +} + +/* + * After a recovering MDS goes active, we need to resend any caps + * we were flushing. + * + * Caller holds session->s_mutex. 
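+ *
+ * Walks session->s_cap_snaps_flushing and re-runs __ceph_flush_snaps()
+ * with again=1 for every inode whose auth cap still belongs to this
+ * session, so capsnaps that were already sent get re-sent to the
+ * recovered MDS.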
+ */ +static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_cap_snap *capsnap; + + dout("kick_flushing_capsnaps mds%d\n", session->s_mds); + list_for_each_entry(capsnap, &session->s_cap_snaps_flushing, + flushing_item) { + struct ceph_inode_info *ci = capsnap->ci; + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + if (cap && cap->session == session) { + dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, + cap, capsnap); + __ceph_flush_snaps(ci, &session, 1); + } else { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + } + spin_unlock(&inode->i_lock); + } +} + +void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + + kick_flushing_capsnaps(mdsc, session); + + dout("kick_flushing_caps mds%d\n", session->s_mds); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + if (cap && cap->session == session) { + dout("kick_flushing_caps %p cap %p %s\n", inode, + cap, ceph_cap_string(ci->i_flushing_caps)); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&inode->i_lock); + } + } else { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&inode->i_lock); + } + } +} + +static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, + ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&inode->i_lock); + } + } else { + spin_unlock(&inode->i_lock); + } +} + + +/* + * Take references to capabilities we hold, so that we don't release + * them to the MDS prematurely. + * + * Protected by i_lock. + */ +static void __take_cap_refs(struct ceph_inode_info *ci, int got) +{ + if (got & CEPH_CAP_PIN) + ci->i_pin_ref++; + if (got & CEPH_CAP_FILE_RD) + ci->i_rd_ref++; + if (got & CEPH_CAP_FILE_CACHE) + ci->i_rdcache_ref++; + if (got & CEPH_CAP_FILE_WR) + ci->i_wr_ref++; + if (got & CEPH_CAP_FILE_BUFFER) { + if (ci->i_wb_ref == 0) + ihold(&ci->vfs_inode); + ci->i_wb_ref++; + dout("__take_cap_refs %p wb %d -> %d (?)\n", + &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); + } +} + +/* + * Try to grab cap references. Specify those refs we @want, and the + * minimal set we @need. Also include the larger offset we are writing + * to (when applicable), and check against max_size here as well. + * Note that caller is responsible for ensuring max_size increases are + * requested from the MDS. 
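+ *
+ * Returns non-zero when there is nothing left to wait for: either the
+ * refs were taken (*got filled in), the file is not open for the needed
+ * access (*err = -EBADF), or a larger max_size must be requested first
+ * (*check_max set).  ceph_get_caps() below wraps this in
+ * wait_event_interruptible() and retries as needed.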
+ */ +static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, + int *got, loff_t endoff, int *check_max, int *err) +{ + struct inode *inode = &ci->vfs_inode; + int ret = 0; + int have, implemented; + int file_wanted; + + dout("get_cap_refs %p need %s want %s\n", inode, + ceph_cap_string(need), ceph_cap_string(want)); + spin_lock(&inode->i_lock); + + /* make sure file is actually open */ + file_wanted = __ceph_caps_file_wanted(ci); + if ((file_wanted & need) == 0) { + dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", + ceph_cap_string(need), ceph_cap_string(file_wanted)); + *err = -EBADF; + ret = 1; + goto out; + } + + if (need & CEPH_CAP_FILE_WR) { + if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { + dout("get_cap_refs %p endoff %llu > maxsize %llu\n", + inode, endoff, ci->i_max_size); + if (endoff > ci->i_wanted_max_size) { + *check_max = 1; + ret = 1; + } + goto out; + } + /* + * If a sync write is in progress, we must wait, so that we + * can get a final snapshot value for size+mtime. + */ + if (__ceph_have_pending_cap_snap(ci)) { + dout("get_cap_refs %p cap_snap_pending\n", inode); + goto out; + } + } + have = __ceph_caps_issued(ci, &implemented); + + /* + * disallow writes while a truncate is pending + */ + if (ci->i_truncate_pending) + have &= ~CEPH_CAP_FILE_WR; + + if ((have & need) == need) { + /* + * Look at (implemented & ~have & not) so that we keep waiting + * on transition from wanted -> needed caps. This is needed + * for WRBUFFER|WR -> WR to avoid a new WR sync write from + * going before a prior buffered writeback happens. + */ + int not = want & ~(have & need); + int revoking = implemented & ~have; + dout("get_cap_refs %p have %s but not %s (revoking %s)\n", + inode, ceph_cap_string(have), ceph_cap_string(not), + ceph_cap_string(revoking)); + if ((revoking & not) == 0) { + *got = need | (have & want); + __take_cap_refs(ci, *got); + ret = 1; + } + } else { + dout("get_cap_refs %p have %s needed %s\n", inode, + ceph_cap_string(have), ceph_cap_string(need)); + } +out: + spin_unlock(&inode->i_lock); + dout("get_cap_refs %p ret %d got %s\n", inode, + ret, ceph_cap_string(*got)); + return ret; +} + +/* + * Check the offset we are writing up to against our current + * max_size. If necessary, tell the MDS we want to write to + * a larger offset. + */ +static void check_max_size(struct inode *inode, loff_t endoff) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int check = 0; + + /* do we need to explicitly request a larger max_size? */ + spin_lock(&inode->i_lock); + if ((endoff >= ci->i_max_size || + endoff > (inode->i_size << 1)) && + endoff > ci->i_wanted_max_size) { + dout("write %p at large endoff %llu, req max_size\n", + inode, endoff); + ci->i_wanted_max_size = endoff; + check = 1; + } + spin_unlock(&inode->i_lock); + if (check) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); +} + +/* + * Wait for caps, and take cap references. If we can't get a WR cap + * due to a small max_size, make sure we check_max_size (and possibly + * ask the mds) so we don't get hung up indefinitely. + */ +int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, + loff_t endoff) +{ + int check_max, ret, err; + +retry: + if (endoff > 0) + check_max_size(&ci->vfs_inode, endoff); + check_max = 0; + err = 0; + ret = wait_event_interruptible(ci->i_cap_wq, + try_get_cap_refs(ci, need, want, + got, endoff, + &check_max, &err)); + if (err) + ret = err; + if (check_max) + goto retry; + return ret; +} + +/* + * Take cap refs. 
Caller must already know we hold at least one ref + * on the caps in question or we don't know this is safe. + */ +void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) +{ + spin_lock(&ci->vfs_inode.i_lock); + __take_cap_refs(ci, caps); + spin_unlock(&ci->vfs_inode.i_lock); +} + +/* + * Release cap refs. + * + * If we released the last ref on any given cap, call ceph_check_caps + * to release (or schedule a release). + * + * If we are releasing a WR cap (from a sync write), finalize any affected + * cap_snap, and wake up any waiters. + */ +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0, put = 0, flushsnaps = 0, wake = 0; + struct ceph_cap_snap *capsnap; + + spin_lock(&inode->i_lock); + if (had & CEPH_CAP_PIN) + --ci->i_pin_ref; + if (had & CEPH_CAP_FILE_RD) + if (--ci->i_rd_ref == 0) + last++; + if (had & CEPH_CAP_FILE_CACHE) + if (--ci->i_rdcache_ref == 0) + last++; + if (had & CEPH_CAP_FILE_BUFFER) { + if (--ci->i_wb_ref == 0) { + last++; + put++; + } + dout("put_cap_refs %p wb %d -> %d (?)\n", + inode, ci->i_wb_ref+1, ci->i_wb_ref); + } + if (had & CEPH_CAP_FILE_WR) + if (--ci->i_wr_ref == 0) { + last++; + if (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + if (capsnap->writing) { + capsnap->writing = 0; + flushsnaps = + __ceph_finish_cap_snap(ci, + capsnap); + wake = 1; + } + } + } + spin_unlock(&inode->i_lock); + + dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), + last ? " last" : "", put ? " put" : ""); + + if (last && !flushsnaps) + ceph_check_caps(ci, 0, NULL); + else if (flushsnaps) + ceph_flush_snaps(ci); + if (wake) + wake_up_all(&ci->i_cap_wq); + if (put) + iput(inode); +} + +/* + * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap + * context. Adjust per-snap dirty page accounting as appropriate. + * Once all dirty data for a cap_snap is flushed, flush snapped file + * metadata back to the MDS. If we dropped the last ref, call + * ceph_check_caps. + */ +void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + struct ceph_snap_context *snapc) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0; + int complete_capsnap = 0; + int drop_capsnap = 0; + int found = 0; + struct ceph_cap_snap *capsnap = NULL; + + spin_lock(&inode->i_lock); + ci->i_wrbuffer_ref -= nr; + last = !ci->i_wrbuffer_ref; + + if (ci->i_head_snapc == snapc) { + ci->i_wrbuffer_ref_head -= nr; + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", + inode, + ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + last ? " LAST" : ""); + } else { + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + found = 1; + break; + } + } + BUG_ON(!found); + capsnap->dirty_pages -= nr; + if (capsnap->dirty_pages == 0) { + complete_capsnap = 1; + if (capsnap->dirty == 0) + /* cap writeback completed before we created + * the cap_snap; no FLUSHSNAP is needed */ + drop_capsnap = 1; + } + dout("put_wrbuffer_cap_refs on %p cap_snap %p " + " snap %lld %d/%d -> %d/%d %s%s%s\n", + inode, capsnap, capsnap->context->seq, + ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, + ci->i_wrbuffer_ref, capsnap->dirty_pages, + last ? 
" (wrbuffer last)" : "", + complete_capsnap ? " (complete capsnap)" : "", + drop_capsnap ? " (drop capsnap)" : ""); + if (drop_capsnap) { + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + } + } + + spin_unlock(&inode->i_lock); + + if (last) { + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + iput(inode); + } else if (complete_capsnap) { + ceph_flush_snaps(ci); + wake_up_all(&ci->i_cap_wq); + } + if (drop_capsnap) + iput(inode); +} + +/* + * Handle a cap GRANT message from the MDS. (Note that a GRANT may + * actually be a revocation if it specifies a smaller cap set.) + * + * caller holds s_mutex and i_lock, we drop both. + * + * return value: + * 0 - ok + * 1 - check_caps on auth cap only (writeback) + * 2 - check_caps (ack revoke) + */ +static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, + struct ceph_mds_session *session, + struct ceph_cap *cap, + struct ceph_buffer *xattr_buf) + __releases(inode->i_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + int seq = le32_to_cpu(grant->seq); + int newcaps = le32_to_cpu(grant->caps); + int issued, implemented, used, wanted, dirty; + u64 size = le64_to_cpu(grant->size); + u64 max_size = le64_to_cpu(grant->max_size); + struct timespec mtime, atime, ctime; + int check_caps = 0; + int wake = 0; + int writeback = 0; + int revoked_rdcache = 0; + int queue_invalidate = 0; + + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); + dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, + inode->i_size); + + /* + * If CACHE is being revoked, and we have no dirty buffers, + * try to invalidate (once). (If there are dirty buffers, we + * will invalidate _after_ writeback.) + */ + if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + !ci->i_wrbuffer_ref) { + if (try_nonblocking_invalidate(inode) == 0) { + revoked_rdcache = 1; + } else { + /* there were locked pages.. invalidate later + in a separate thread. */ + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + queue_invalidate = 1; + ci->i_rdcache_revoking = ci->i_rdcache_gen; + } + } + } + + /* side effects now are allowed */ + + issued = __ceph_caps_issued(ci, &implemented); + issued |= implemented | __ceph_caps_dirty(ci); + + cap->cap_gen = session->s_cap_gen; + + __check_cap_issue(ci, cap, newcaps); + + if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(grant->mode); + inode->i_uid = le32_to_cpu(grant->uid); + inode->i_gid = le32_to_cpu(grant->gid); + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + inode->i_uid, inode->i_gid); + } + + if ((issued & CEPH_CAP_LINK_EXCL) == 0) + inode->i_nlink = le32_to_cpu(grant->nlink); + + if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { + int len = le32_to_cpu(grant->xattr_len); + u64 version = le64_to_cpu(grant->xattr_version); + + if (version > ci->i_xattrs.version) { + dout(" got new xattrs v%llu on %p len %d\n", + version, inode, len); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); + ci->i_xattrs.version = version; + } + } + + /* size/ctime/mtime/atime? 
*/ + ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), size); + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, + &atime); + + /* max size increase? */ + if (max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = 1; + } + + /* check cap bits */ + wanted = __ceph_caps_wanted(ci); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + dout(" my wanted = %s, used = %s, dirty %s\n", + ceph_cap_string(wanted), + ceph_cap_string(used), + ceph_cap_string(dirty)); + if (wanted != le32_to_cpu(grant->wanted)) { + dout("mds wanted %s -> %s\n", + ceph_cap_string(le32_to_cpu(grant->wanted)), + ceph_cap_string(wanted)); + grant->wanted = cpu_to_le32(wanted); + } + + cap->seq = seq; + + /* file layout may have changed */ + ci->i_layout = grant->layout; + + /* revocation, grant, or no-op? */ + if (cap->issued & ~newcaps) { + int revoking = cap->issued & ~newcaps; + + dout("revocation: %s -> %s (revoking %s)\n", + ceph_cap_string(cap->issued), + ceph_cap_string(newcaps), + ceph_cap_string(revoking)); + if (revoking & used & CEPH_CAP_FILE_BUFFER) + writeback = 1; /* initiate writeback; will delay ack */ + else if (revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + queue_invalidate) + ; /* do nothing yet, invalidation will be queued */ + else if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ + cap->issued = newcaps; + cap->implemented |= newcaps; + } else if (cap->issued == newcaps) { + dout("caps unchanged: %s -> %s\n", + ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + } else { + dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); + cap->issued = newcaps; + cap->implemented |= newcaps; /* add bits only, to + * avoid stepping on a + * pending revocation */ + wake = 1; + } + BUG_ON(cap->issued & ~cap->implemented); + + spin_unlock(&inode->i_lock); + if (writeback) + /* + * queue inode for writeback: we can't actually call + * filemap_write_and_wait, etc. from message handler + * context. + */ + ceph_queue_writeback(inode); + if (queue_invalidate) + ceph_queue_invalidate(inode); + if (wake) + wake_up_all(&ci->i_cap_wq); + + if (check_caps == 1) + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, + session); + else if (check_caps == 2) + ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); + else + mutex_unlock(&session->s_mutex); +} + +/* + * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the + * MDS has been safely committed. 
+ */ +static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session, + struct ceph_cap *cap) + __releases(inode->i_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + unsigned seq = le32_to_cpu(m->seq); + int dirty = le32_to_cpu(m->dirty); + int cleaned = 0; + int drop = 0; + int i; + + for (i = 0; i < CEPH_CAP_BITS; i++) + if ((dirty & (1 << i)) && + flush_tid == ci->i_cap_flush_tid[i]) + cleaned |= 1 << i; + + dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," + " flushing %s -> %s\n", + inode, session->s_mds, seq, ceph_cap_string(dirty), + ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps & ~cleaned)); + + if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + goto out; + + ci->i_flushing_caps &= ~cleaned; + + spin_lock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + list_del_init(&ci->i_flushing_item); + if (!list_empty(&session->s_cap_flushing)) + dout(" mds%d still flushing cap on %p\n", + session->s_mds, + &list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item)->vfs_inode); + mdsc->num_cap_flushing--; + wake_up_all(&mdsc->cap_flushing_wq); + dout(" inode %p now !flushing\n", inode); + + if (ci->i_dirty_caps == 0) { + dout(" inode %p now clean\n", inode); + BUG_ON(!list_empty(&ci->i_dirty_item)); + drop = 1; + if (ci->i_wrbuffer_ref_head == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } else { + BUG_ON(list_empty(&ci->i_dirty_item)); + } + } + spin_unlock(&mdsc->cap_dirty_lock); + wake_up_all(&ci->i_cap_wq); + +out: + spin_unlock(&inode->i_lock); + if (drop) + iput(inode); +} + +/* + * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can + * throw away our cap_snap. + * + * Caller hold s_mutex. + */ +static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, + struct ceph_mds_caps *m, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 follows = le64_to_cpu(m->snap_follows); + struct ceph_cap_snap *capsnap; + int drop = 0; + + dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", + inode, ci, session->s_mds, follows); + + spin_lock(&inode->i_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->follows == follows) { + if (capsnap->flush_tid != flush_tid) { + dout(" cap_snap %p follows %lld tid %lld !=" + " %lld\n", capsnap, follows, + flush_tid, capsnap->flush_tid); + break; + } + WARN_ON(capsnap->dirty_pages || capsnap->writing); + dout(" removing %p cap_snap %p follows %lld\n", + inode, capsnap, follows); + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + drop = 1; + break; + } else { + dout(" skipping cap_snap %p follows %lld\n", + capsnap, capsnap->follows); + } + } + spin_unlock(&inode->i_lock); + if (drop) + iput(inode); +} + +/* + * Handle TRUNC from MDS, indicating file truncation. + * + * caller hold s_mutex. 
+ */ +static void handle_cap_trunc(struct inode *inode, + struct ceph_mds_caps *trunc, + struct ceph_mds_session *session) + __releases(inode->i_lock) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + int seq = le32_to_cpu(trunc->seq); + u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); + u64 truncate_size = le64_to_cpu(trunc->truncate_size); + u64 size = le64_to_cpu(trunc->size); + int implemented = 0; + int dirty = __ceph_caps_dirty(ci); + int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); + int queue_trunc = 0; + + issued |= implemented | dirty; + + dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", + inode, mds, seq, truncate_size, truncate_seq); + queue_trunc = ceph_fill_file_size(inode, issued, + truncate_seq, truncate_size, size); + spin_unlock(&inode->i_lock); + + if (queue_trunc) + ceph_queue_vmtruncate(inode); +} + +/* + * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a + * different one. If we are the most recent migration we've seen (as + * indicated by mseq), make note of the migrating cap bits for the + * duration (until we see the corresponding IMPORT). + * + * caller holds s_mutex + */ +static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, + struct ceph_mds_session *session, + int *open_target_sessions) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + unsigned mseq = le32_to_cpu(ex->migrate_seq); + struct ceph_cap *cap = NULL, *t; + struct rb_node *p; + int remember = 1; + + dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", + inode, ci, mds, mseq); + + spin_lock(&inode->i_lock); + + /* make sure we haven't seen a higher mseq */ + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + t = rb_entry(p, struct ceph_cap, ci_node); + if (ceph_seq_cmp(t->mseq, mseq) > 0) { + dout(" higher mseq on cap from mds%d\n", + t->session->s_mds); + remember = 0; + } + if (t->session->s_mds == mds) + cap = t; + } + + if (cap) { + if (remember) { + /* make note */ + ci->i_cap_exporting_mds = mds; + ci->i_cap_exporting_mseq = mseq; + ci->i_cap_exporting_issued = cap->issued; + + /* + * make sure we have open sessions with all possible + * export targets, so that we get the matching IMPORT + */ + *open_target_sessions = 1; + + /* + * we can't flush dirty caps that we've seen the + * EXPORT but no IMPORT for + */ + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", + inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); + } + __ceph_remove_cap(cap); + } + /* else, we already released it */ + + spin_unlock(&inode->i_lock); +} + +/* + * Handle cap IMPORT. If there are temp bits from an older EXPORT, + * clean them up. + * + * caller holds s_mutex. 
+ */ +static void handle_cap_import(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_session *session, + void *snaptrace, int snaptrace_len) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int mds = session->s_mds; + unsigned issued = le32_to_cpu(im->caps); + unsigned wanted = le32_to_cpu(im->wanted); + unsigned seq = le32_to_cpu(im->seq); + unsigned mseq = le32_to_cpu(im->migrate_seq); + u64 realmino = le64_to_cpu(im->realm); + u64 cap_id = le64_to_cpu(im->cap_id); + + if (ci->i_cap_exporting_mds >= 0 && + ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { + dout("handle_cap_import inode %p ci %p mds%d mseq %d" + " - cleared exporting from mds%d\n", + inode, ci, mds, mseq, + ci->i_cap_exporting_mds); + ci->i_cap_exporting_issued = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_mds = -1; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p back to cap_dirty\n", inode); + list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + } + spin_unlock(&mdsc->cap_dirty_lock); + } else { + dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", + inode, ci, mds, mseq); + } + + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, + false); + downgrade_write(&mdsc->snap_rwsem); + ceph_add_cap(inode, session, cap_id, -1, + issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, + NULL /* no caps context */); + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + + /* make sure we re-request max_size, if necessary */ + spin_lock(&inode->i_lock); + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); +} + +/* + * Handle a caps message from the MDS. + * + * Identify the appropriate session, inode, and call the right handler + * based on the cap op. 
+ */ +void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct super_block *sb = mdsc->fsc->sb; + struct inode *inode; + struct ceph_cap *cap; + struct ceph_mds_caps *h; + int mds = session->s_mds; + int op; + u32 seq, mseq; + struct ceph_vino vino; + u64 cap_id; + u64 size, max_size; + u64 tid; + void *snaptrace; + size_t snaptrace_len; + void *flock; + u32 flock_len; + int open_target_sessions = 0; + + dout("handle_caps from mds%d\n", mds); + + /* decode */ + tid = le64_to_cpu(msg->hdr.tid); + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = msg->front.iov_base; + op = le32_to_cpu(h->op); + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + cap_id = le64_to_cpu(h->cap_id); + seq = le32_to_cpu(h->seq); + mseq = le32_to_cpu(h->migrate_seq); + size = le64_to_cpu(h->size); + max_size = le64_to_cpu(h->max_size); + + snaptrace = h + 1; + snaptrace_len = le32_to_cpu(h->snap_trace_len); + + if (le16_to_cpu(msg->hdr.version) >= 2) { + void *p, *end; + + p = snaptrace + snaptrace_len; + end = msg->front.iov_base + msg->front.iov_len; + ceph_decode_32_safe(&p, end, flock_len, bad); + flock = p; + } else { + flock = NULL; + flock_len = 0; + } + + mutex_lock(&session->s_mutex); + session->s_seq++; + dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, + (unsigned)seq); + + /* lookup ino */ + inode = ceph_find_inode(sb, vino); + dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, + vino.snap, inode); + if (!inode) { + dout(" i don't have ino %llx\n", vino.ino); + + if (op == CEPH_CAP_OP_IMPORT) + __queue_cap_release(session, vino.ino, cap_id, + mseq, seq); + goto flush_cap_releases; + } + + /* these will work even if we don't have a cap yet */ + switch (op) { + case CEPH_CAP_OP_FLUSHSNAP_ACK: + handle_cap_flushsnap_ack(inode, tid, h, session); + goto done; + + case CEPH_CAP_OP_EXPORT: + handle_cap_export(inode, h, session, &open_target_sessions); + goto done; + + case CEPH_CAP_OP_IMPORT: + handle_cap_import(mdsc, inode, h, session, + snaptrace, snaptrace_len); + ceph_check_caps(ceph_inode(inode), 0, session); + goto done_unlocked; + } + + /* the rest require a cap */ + spin_lock(&inode->i_lock); + cap = __get_cap_for_mds(ceph_inode(inode), mds); + if (!cap) { + dout(" no cap on %p ino %llx.%llx from mds%d\n", + inode, ceph_ino(inode), ceph_snap(inode), mds); + spin_unlock(&inode->i_lock); + goto flush_cap_releases; + } + + /* note that each of these drops i_lock for us */ + switch (op) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + handle_cap_grant(inode, h, session, cap, msg->middle); + goto done_unlocked; + + case CEPH_CAP_OP_FLUSH_ACK: + handle_cap_flush_ack(inode, tid, h, session, cap); + break; + + case CEPH_CAP_OP_TRUNC: + handle_cap_trunc(inode, h, session); + break; + + default: + spin_unlock(&inode->i_lock); + pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, + ceph_cap_op_name(op)); + } + + goto done; + +flush_cap_releases: + /* + * send any full release message to try to move things + * along for the mds (who clearly thinks we still have this + * cap). 
+ */ + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); + +done: + mutex_unlock(&session->s_mutex); +done_unlocked: + if (inode) + iput(inode); + if (open_target_sessions) + ceph_mdsc_open_export_target_sessions(mdsc, session); + return; + +bad: + pr_err("ceph_handle_caps: corrupt message\n"); + ceph_msg_dump(msg); + return; +} + +/* + * Delayed work handler to process end of delayed cap release LRU list. + */ +void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + int flags = CHECK_CAPS_NODELAY; + + dout("check_delayed_caps\n"); + while (1) { + spin_lock(&mdsc->cap_delay_lock); + if (list_empty(&mdsc->cap_delay_list)) + break; + ci = list_first_entry(&mdsc->cap_delay_list, + struct ceph_inode_info, + i_cap_delay_list); + if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && + time_before(jiffies, ci->i_hold_caps_max)) + break; + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); + dout("check_delayed_caps on %p\n", &ci->vfs_inode); + ceph_check_caps(ci, flags, NULL); + } + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Flush all dirty caps to the mds + */ +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + struct inode *inode; + + dout("flush_dirty_caps\n"); + spin_lock(&mdsc->cap_dirty_lock); + while (!list_empty(&mdsc->cap_dirty)) { + ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = &ci->vfs_inode; + ihold(inode); + dout("flush_dirty_caps %p\n", inode); + spin_unlock(&mdsc->cap_dirty_lock); + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + iput(inode); + spin_lock(&mdsc->cap_dirty_lock); + } + spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); +} + +/* + * Drop open file reference. If we were the last open file, + * we may need to release capabilities to the MDS (or schedule + * their delayed release). + */ +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) +{ + struct inode *inode = &ci->vfs_inode; + int last = 0; + + spin_lock(&inode->i_lock); + dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, + ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); + BUG_ON(ci->i_nr_by_mode[fmode] == 0); + if (--ci->i_nr_by_mode[fmode] == 0) + last++; + spin_unlock(&inode->i_lock); + + if (last && ci->i_vino.snap == CEPH_NOSNAP) + ceph_check_caps(ci, 0, NULL); +} + +/* + * Helpers for embedding cap and dentry lease releases into mds + * requests. + * + * @force is used by dentry_release (below) to force inclusion of a + * record for the directory inode, even when there aren't any caps to + * drop. 
+ */ +int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + struct ceph_mds_request_release *rel = *p; + int used, dirty; + int ret = 0; + + spin_lock(&inode->i_lock); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + + dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", + inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), + ceph_cap_string(unless)); + + /* only drop unused, clean caps */ + drop &= ~(used | dirty); + + cap = __get_cap_for_mds(ci, mds); + if (cap && __cap_is_valid(cap)) { + if (force || + ((cap->issued & drop) && + (cap->issued & unless) == 0)) { + if ((cap->issued & drop) && + (cap->issued & unless) == 0) { + dout("encode_inode_release %p cap %p %s -> " + "%s\n", inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & ~drop)); + cap->issued &= ~drop; + cap->implemented &= ~drop; + if (ci->i_ceph_flags & CEPH_I_NODELAY) { + int wanted = __ceph_caps_wanted(ci); + dout(" wanted %s -> %s (act %s)\n", + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(cap->mds_wanted & + ~wanted), + ceph_cap_string(wanted)); + cap->mds_wanted &= wanted; + } + } else { + dout("encode_inode_release %p cap %p %s" + " (force)\n", inode, cap, + ceph_cap_string(cap->issued)); + } + + rel->ino = cpu_to_le64(ceph_ino(inode)); + rel->cap_id = cpu_to_le64(cap->cap_id); + rel->seq = cpu_to_le32(cap->seq); + rel->issue_seq = cpu_to_le32(cap->issue_seq), + rel->mseq = cpu_to_le32(cap->mseq); + rel->caps = cpu_to_le32(cap->issued); + rel->wanted = cpu_to_le32(cap->mds_wanted); + rel->dname_len = 0; + rel->dname_seq = 0; + *p += sizeof(*rel); + ret = 1; + } else { + dout("encode_inode_release %p cap %p %s\n", + inode, cap, ceph_cap_string(cap->issued)); + } + } + spin_unlock(&inode->i_lock); + return ret; +} + +int ceph_encode_dentry_release(void **p, struct dentry *dentry, + int mds, int drop, int unless) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct ceph_mds_request_release *rel = *p; + struct ceph_dentry_info *di = ceph_dentry(dentry); + int force = 0; + int ret; + + /* + * force an record for the directory caps if we have a dentry lease. + * this is racy (can't take i_lock and d_lock together), but it + * doesn't have to be perfect; the mds will revoke anything we don't + * release. 
+ */ + spin_lock(&dentry->d_lock); + if (di->lease_session && di->lease_session->s_mds == mds) + force = 1; + spin_unlock(&dentry->d_lock); + + ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); + + spin_lock(&dentry->d_lock); + if (ret && di->lease_session && di->lease_session->s_mds == mds) { + dout("encode_dentry_release %p mds%d seq %d\n", + dentry, mds, (int)di->lease_seq); + rel->dname_len = cpu_to_le32(dentry->d_name.len); + memcpy(*p, dentry->d_name.name, dentry->d_name.len); + *p += dentry->d_name.len; + rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); + } + spin_unlock(&dentry->d_lock); + return ret; +} diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c new file mode 100644 index 00000000..bdce8b1f --- /dev/null +++ b/fs/ceph/ceph_frag.c @@ -0,0 +1,22 @@ +/* + * Ceph 'frag' type + */ +#include <linux/module.h> +#include <linux/ceph/types.h> + +int ceph_frag_compare(__u32 a, __u32 b) +{ + unsigned va = ceph_frag_value(a); + unsigned vb = ceph_frag_value(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + va = ceph_frag_bits(a); + vb = ceph_frag_bits(b); + if (va < vb) + return -1; + if (va > vb) + return 1; + return 0; +} diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c new file mode 100644 index 00000000..0dba6915 --- /dev/null +++ b/fs/ceph/debugfs.c @@ -0,0 +1,273 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <linux/ceph/libceph.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + +#include "super.h" + +#ifdef CONFIG_DEBUG_FS + +#include "mds_client.h" + +static int mdsmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_fs_client *fsc = s->private; + + if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); + seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); + seq_printf(s, "session_timeout %d\n", + fsc->mdsc->mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", + fsc->mdsc->mdsmap->m_session_autoclose); + for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { + struct ceph_entity_addr *addr = + &fsc->mdsc->mdsmap->m_info[i].addr; + int state = fsc->mdsc->mdsmap->m_info[i].state; + + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, + ceph_pr_addr(&addr->in_addr), + ceph_mds_state_name(state)); + } + return 0; +} + +/* + * mdsc debugfs + */ +static int mdsc_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct rb_node *rp; + int pathlen; + u64 pathbase; + char *path; + + mutex_lock(&mdsc->mutex); + for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { + req = rb_entry(rp, struct ceph_mds_request, r_node); + + if (req->r_request && req->r_session) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, + req->r_session->s_mds); + else if (!req->r_request) + seq_printf(s, "%lld\t(no request)\t", req->r_tid); + else + seq_printf(s, "%lld\t(no session)\t", req->r_tid); + + seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); + + if (req->r_got_unsafe) + seq_printf(s, "\t(unsafe)"); + else + seq_printf(s, "\t"); + + if (req->r_inode) { + seq_printf(s, " #%llx", ceph_ino(req->r_inode)); + } else if (req->r_dentry) { + path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &pathbase, 0); + if (IS_ERR(path)) + path = NULL; + spin_lock(&req->r_dentry->d_lock); + seq_printf(s, " #%llx/%.*s (%s)", + ceph_ino(req->r_dentry->d_parent->d_inode), + req->r_dentry->d_name.len, + req->r_dentry->d_name.name, + path ? 
path : ""); + spin_unlock(&req->r_dentry->d_lock); + kfree(path); + } else if (req->r_path1) { + seq_printf(s, " #%llx/%s", req->r_ino1.ino, + req->r_path1); + } + + if (req->r_old_dentry) { + path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, + &pathbase, 0); + if (IS_ERR(path)) + path = NULL; + spin_lock(&req->r_old_dentry->d_lock); + seq_printf(s, " #%llx/%.*s (%s)", + ceph_ino(req->r_old_dentry->d_parent->d_inode), + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + path ? path : ""); + spin_unlock(&req->r_old_dentry->d_lock); + kfree(path); + } else if (req->r_path2) { + if (req->r_ino2.ino) + seq_printf(s, " #%llx/%s", req->r_ino2.ino, + req->r_path2); + else + seq_printf(s, " %s", req->r_path2); + } + + seq_printf(s, "\n"); + } + mutex_unlock(&mdsc->mutex); + + return 0; +} + +static int caps_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + int total, avail, used, reserved, min; + + ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); + seq_printf(s, "total\t\t%d\n" + "avail\t\t%d\n" + "used\t\t%d\n" + "reserved\t%d\n" + "min\t%d\n", + total, avail, used, reserved, min); + return 0; +} + +static int dentry_lru_show(struct seq_file *s, void *ptr) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_dentry_info *di; + + spin_lock(&mdsc->dentry_lru_lock); + list_for_each_entry(di, &mdsc->dentry_lru, lru) { + struct dentry *dentry = di->dentry; + seq_printf(s, "%p %p\t%.*s\n", + di, dentry, dentry->d_name.len, dentry->d_name.name); + } + spin_unlock(&mdsc->dentry_lru_lock); + + return 0; +} + +CEPH_DEFINE_SHOW_FUNC(mdsmap_show) +CEPH_DEFINE_SHOW_FUNC(mdsc_show) +CEPH_DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) + + +/* + * debugfs + */ +static int congestion_kb_set(void *data, u64 val) +{ + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + + fsc->mount_options->congestion_kb = (int)val; + return 0; +} + +static int congestion_kb_get(void *data, u64 *val) +{ + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + + *val = (u64)fsc->mount_options->congestion_kb; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, + congestion_kb_set, "%llu\n"); + + +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) +{ + dout("ceph_fs_debugfs_cleanup\n"); + debugfs_remove(fsc->debugfs_bdi); + debugfs_remove(fsc->debugfs_congestion_kb); + debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_mdsc); + debugfs_remove(fsc->debugfs_dentry_lru); +} + +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +{ + char name[100]; + int err = -ENOMEM; + + dout("ceph_fs_debugfs_init\n"); + fsc->debugfs_congestion_kb = + debugfs_create_file("writeback_congestion_kb", + 0600, + fsc->client->debugfs_dir, + fsc, + &congestion_kb_fops); + if (!fsc->debugfs_congestion_kb) + goto out; + + snprintf(name, sizeof(name), "../../bdi/%s", + dev_name(fsc->backing_dev_info.dev)); + fsc->debugfs_bdi = + debugfs_create_symlink("bdi", + fsc->client->debugfs_dir, + name); + if (!fsc->debugfs_bdi) + goto out; + + fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsmap_show_fops); + if (!fsc->debugfs_mdsmap) + goto out; + + fsc->debugfs_mdsc = debugfs_create_file("mdsc", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsc_show_fops); + if (!fsc->debugfs_mdsc) + goto out; + + fsc->debugfs_caps = debugfs_create_file("caps", + 0400, + 
fsc->client->debugfs_dir, + fsc, + &caps_show_fops); + if (!fsc->debugfs_caps) + goto out; + + fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", + 0600, + fsc->client->debugfs_dir, + fsc, + &dentry_lru_show_fops); + if (!fsc->debugfs_dentry_lru) + goto out; + + return 0; + +out: + ceph_fs_debugfs_cleanup(fsc); + return err; +} + + +#else /* CONFIG_DEBUG_FS */ + +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +{ + return 0; +} + +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) +{ +} + +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c new file mode 100644 index 00000000..ef8f08c3 --- /dev/null +++ b/fs/ceph/dir.c @@ -0,0 +1,1275 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/spinlock.h> +#include <linux/fs_struct.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/sched.h> + +#include "super.h" +#include "mds_client.h" + +/* + * Directory operations: readdir, lookup, create, link, unlink, + * rename, etc. + */ + +/* + * Ceph MDS operations are specified in terms of a base ino and + * relative path. Thus, the client can specify an operation on a + * specific inode (e.g., a getattr due to fstat(2)), or as a path + * relative to, say, the root directory. + * + * Normally, we limit ourselves to strict inode ops (no path component) + * or dentry operations (a single path component relative to an ino). The + * exception to this is open_root_dentry(), which will open the mount + * point by name. + */ + +const struct inode_operations ceph_dir_iops; +const struct file_operations ceph_dir_fops; +const struct dentry_operations ceph_dentry_ops; + +/* + * Initialize ceph dentry state. + */ +int ceph_init_dentry(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + if (dentry->d_fsdata) + return 0; + + if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ + ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + d_set_d_op(dentry, &ceph_dentry_ops); + else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + d_set_d_op(dentry, &ceph_snapdir_dentry_ops); + else + d_set_d_op(dentry, &ceph_snap_dentry_ops); + + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); + if (!di) + return -ENOMEM; /* oh well */ + + spin_lock(&dentry->d_lock); + if (dentry->d_fsdata) { + /* lost a race */ + kmem_cache_free(ceph_dentry_cachep, di); + goto out_unlock; + } + di->dentry = dentry; + di->lease_session = NULL; + dentry->d_fsdata = di; + dentry->d_time = jiffies; + ceph_dentry_lru_add(dentry); +out_unlock: + spin_unlock(&dentry->d_lock); + return 0; +} + + + +/* + * for readdir, we encode the directory frag and offset within that + * frag into f_pos. + */ +static unsigned fpos_frag(loff_t p) +{ + return p >> 32; +} +static unsigned fpos_off(loff_t p) +{ + return p & 0xffffffff; +} + +/* + * When possible, we try to satisfy a readdir by peeking at the + * dcache. We make this work by carefully ordering dentries on + * d_u.d_child when we initially get results back from the MDS, and + * falling back to a "normal" sync readdir if any dentries in the dir + * are dropped. + * + * I_COMPLETE indicates we have all dentries in the dir. It is + * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by + * the MDS if/when the directory is modified). 
+ */ +static int __dcache_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + struct ceph_file_info *fi = filp->private_data; + struct dentry *parent = filp->f_dentry; + struct inode *dir = parent->d_inode; + struct list_head *p; + struct dentry *dentry, *last; + struct ceph_dentry_info *di; + int err = 0; + + /* claim ref on last dentry we returned */ + last = fi->dentry; + fi->dentry = NULL; + + dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, + last); + + spin_lock(&parent->d_lock); + + /* start at beginning? */ + if (filp->f_pos == 2 || last == NULL || + filp->f_pos < ceph_dentry(last)->offset) { + if (list_empty(&parent->d_subdirs)) + goto out_unlock; + p = parent->d_subdirs.prev; + dout(" initial p %p/%p\n", p->prev, p->next); + } else { + p = last->d_u.d_child.prev; + } + +more: + dentry = list_entry(p, struct dentry, d_u.d_child); + di = ceph_dentry(dentry); + while (1) { + dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, + d_unhashed(dentry) ? "!hashed" : "hashed", + parent->d_subdirs.prev, parent->d_subdirs.next); + if (p == &parent->d_subdirs) { + fi->at_end = 1; + goto out_unlock; + } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!d_unhashed(dentry) && dentry->d_inode && + ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && + ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && + filp->f_pos <= di->offset) + break; + dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, + dentry->d_name.len, dentry->d_name.name, di->offset, + filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", + !dentry->d_inode ? " null" : ""); + spin_unlock(&dentry->d_lock); + p = p->prev; + dentry = list_entry(p, struct dentry, d_u.d_child); + di = ceph_dentry(dentry); + } + + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); + + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, + dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + filp->f_pos = di->offset; + err = filldir(dirent, dentry->d_name.name, + dentry->d_name.len, di->offset, + ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), + dentry->d_inode->i_mode >> 12); + + if (last) { + if (err < 0) { + /* remember our position */ + fi->dentry = last; + fi->next_offset = di->offset; + } else { + dput(last); + } + } + last = dentry; + + if (err < 0) + goto out; + + filp->f_pos++; + + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { + dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + err = -EAGAIN; + goto out; + } + + spin_lock(&parent->d_lock); + p = p->prev; /* advance to next dentry */ + goto more; + +out_unlock: + spin_unlock(&parent->d_lock); +out: + if (last) + dput(last); + return err; +} + +/* + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. 
+ */ +static int note_last_dentry(struct ceph_file_info *fi, const char *name, + int len) +{ + kfree(fi->last_name); + fi->last_name = kmalloc(len+1, GFP_NOFS); + if (!fi->last_name) + return -ENOMEM; + memcpy(fi->last_name, name, len); + fi->last_name[len] = 0; + dout("note_last_dentry '%s'\n", fi->last_name); + return 0; +} + +static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + unsigned frag = fpos_frag(filp->f_pos); + int off = fpos_off(filp->f_pos); + int err; + u32 ftype; + struct ceph_mds_reply_info_parsed *rinfo; + const int max_entries = fsc->mount_options->max_readdir; + const int max_bytes = fsc->mount_options->max_readdir_bytes; + + dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); + if (fi->at_end) + return 0; + + /* always start with . and .. */ + if (filp->f_pos == 0) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + fi->dir_release_count = ci->i_release_count; + + dout("readdir off 0 -> '.'\n"); + if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), + ceph_translate_ino(inode->i_sb, inode->i_ino), + inode->i_mode >> 12) < 0) + return 0; + filp->f_pos = 1; + off = 1; + } + if (filp->f_pos == 1) { + ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino; + dout("readdir off 1 -> '..'\n"); + if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), + ceph_translate_ino(inode->i_sb, ino), + inode->i_mode >> 12) < 0) + return 0; + filp->f_pos = 2; + off = 2; + } + + /* can we use the dcache? */ + spin_lock(&inode->i_lock); + if ((filp->f_pos == 2 || fi->dentry) && + !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && + (ci->i_ceph_flags & CEPH_I_COMPLETE) && + __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + spin_unlock(&inode->i_lock); + err = __dcache_readdir(filp, dirent, filldir); + if (err != -EAGAIN) + return err; + } else { + spin_unlock(&inode->i_lock); + } + if (fi->dentry) { + err = note_last_dentry(fi, fi->dentry->d_name.name, + fi->dentry->d_name.len); + if (err) + return err; + dput(fi->dentry); + fi->dentry = NULL; + } + + /* proceed with a normal readdir */ + +more: + /* do we have the correct frag content buffered? */ + if (fi->frag != frag || fi->last_readdir == NULL) { + struct ceph_mds_request *req; + int op = ceph_snap(inode) == CEPH_SNAPDIR ? 
+ CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; + + /* discard old result, if any */ + if (fi->last_readdir) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + } + + /* requery frag tree, as the frag topology may have changed */ + frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); + + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", + ceph_vinop(inode), frag, fi->last_name); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(filp->f_dentry); + /* hints to request -> mds selection code */ + req->r_direct_mode = USE_AUTH_MDS; + req->r_direct_hash = ceph_frag_value(frag); + req->r_direct_is_hash = true; + req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + req->r_readdir_offset = fi->next_offset; + req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.max_entries = cpu_to_le32(max_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); + req->r_num_caps = max_entries + 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) { + ceph_mdsc_put_request(req); + return err; + } + dout("readdir got and parsed readdir result=%d" + " on frag %x, end=%d, complete=%d\n", err, frag, + (int)req->r_reply_info.dir_end, + (int)req->r_reply_info.dir_complete); + + if (!req->r_did_prepopulate) { + dout("readdir !did_prepopulate"); + fi->dir_release_count--; /* preclude I_COMPLETE */ + } + + /* note next offset and last dentry name */ + fi->offset = fi->next_offset; + fi->last_readdir = req; + + if (req->r_reply_info.dir_end) { + kfree(fi->last_name); + fi->last_name = NULL; + if (ceph_frag_is_rightmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; + } else { + rinfo = &req->r_reply_info; + err = note_last_dentry(fi, + rinfo->dir_dname[rinfo->dir_nr-1], + rinfo->dir_dname_len[rinfo->dir_nr-1]); + if (err) + return err; + fi->next_offset += rinfo->dir_nr; + } + } + + rinfo = &fi->last_readdir->r_reply_info; + dout("readdir frag %x num %d off %d chunkoff %d\n", frag, + rinfo->dir_nr, off, fi->offset); + while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { + u64 pos = ceph_make_fpos(frag, off); + struct ceph_mds_reply_inode *in = + rinfo->dir_in[off - fi->offset].in; + struct ceph_vino vino; + ino_t ino; + + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", + off, off - fi->offset, rinfo->dir_nr, pos, + rinfo->dir_dname_len[off - fi->offset], + rinfo->dir_dname[off - fi->offset], in); + BUG_ON(!in); + ftype = le32_to_cpu(in->mode) >> 12; + vino.ino = le64_to_cpu(in->ino); + vino.snap = le64_to_cpu(in->snapid); + ino = ceph_vino_to_ino(vino); + if (filldir(dirent, + rinfo->dir_dname[off - fi->offset], + rinfo->dir_dname_len[off - fi->offset], + pos, + ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { + dout("filldir stopping us...\n"); + return 0; + } + off++; + filp->f_pos = pos + 1; + } + + if (fi->last_name) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + goto more; + } + + /* more frags? */ + if (!ceph_frag_is_rightmost(frag)) { + frag = ceph_frag_next(frag); + off = 0; + filp->f_pos = ceph_make_fpos(frag, off); + dout("readdir next frag is %x\n", frag); + goto more; + } + fi->at_end = 1; + + /* + * if dir_release_count still matches the dir, no dentries + * were released during the whole readdir, and we should have + * the complete dir contents in our cache. 
+ */ + spin_lock(&inode->i_lock); + if (ci->i_release_count == fi->dir_release_count) { + dout(" marking %p complete\n", inode); + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ci->i_max_offset = filp->f_pos; + } + spin_unlock(&inode->i_lock); + + dout("readdir %p filp %p done.\n", inode, filp); + return 0; +} + +static void reset_readdir(struct ceph_file_info *fi) +{ + if (fi->last_readdir) { + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + } + kfree(fi->last_name); + fi->last_name = NULL; + fi->next_offset = 2; /* compensate for . and .. */ + if (fi->dentry) { + dput(fi->dentry); + fi->dentry = NULL; + } + fi->at_end = 0; +} + +static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_mapping->host; + loff_t old_offset = offset; + loff_t retval; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += inode->i_size + 2; /* FIXME */ + break; + case SEEK_CUR: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + fi->at_end = 0; + } + retval = offset; + + /* + * discard buffered readdir content on seekdir(0), or + * seek to new frag, or seek prior to current chunk. + */ + if (offset == 0 || + fpos_frag(offset) != fpos_frag(old_offset) || + fpos_off(offset) < fi->offset) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } + + /* bump dir_release_count if we did a forward seek */ + if (offset > old_offset) + fi->dir_release_count--; + } + mutex_unlock(&inode->i_mutex); + return retval; +} + +/* + * Process result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct inode *parent = dentry->d_parent->d_inode; + + /* .snap dir? */ + if (err == -ENOENT && + ceph_snap(parent) == CEPH_NOSNAP && + strcmp(dentry->d_name.name, + fsc->mount_options->snapdir_name) == 0) { + struct inode *inode = ceph_get_snapdir(parent); + dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", + dentry, dentry->d_name.len, dentry->d_name.name, inode); + BUG_ON(!d_unhashed(dentry)); + d_add(dentry, inode); + err = 0; + } + + if (err == -ENOENT) { + /* no trace? */ + err = 0; + if (!req->r_reply_info.head->is_dentry) { + dout("ENOENT and no trace, dentry %p inode %p\n", + dentry, dentry->d_inode); + if (dentry->d_inode) { + d_drop(dentry); + err = -ENOENT; + } else { + d_add(dentry, NULL); + } + } + } + if (err) + dentry = ERR_PTR(err); + else if (dentry != req->r_dentry) + dentry = dget(req->r_dentry); /* we got spliced */ + else + dentry = NULL; + return dentry; +} + +static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +{ + return ceph_ino(inode) == CEPH_INO_ROOT && + strncmp(dentry->d_name.name, ".ceph", 5) == 0; +} + +/* + * Look up a single dir entry. If there is a lookup intent, inform + * the MDS so that it gets our 'caps wanted' value in a single op. 
+ */ +static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int op; + int err; + + dout("lookup %p dentry %p '%.*s'\n", + dir, dentry, dentry->d_name.len, dentry->d_name.name); + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + err = ceph_init_dentry(dentry); + if (err < 0) + return ERR_PTR(err); + + /* open (but not create!) intent? */ + if (nd && + (nd->flags & LOOKUP_OPEN) && + (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */ + !(nd->intent.open.flags & O_CREAT)) { + int mode = nd->intent.open.create_mode & ~current->fs->umask; + return ceph_lookup_open(dir, dentry, nd, mode, 1); + } + + /* can we conclude ENOENT locally? */ + if (dentry->d_inode == NULL) { + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + + spin_lock(&dir->i_lock); + dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); + if (strncmp(dentry->d_name.name, + fsc->mount_options->snapdir_name, + dentry->d_name.len) && + !is_root_ceph_dentry(dir, dentry) && + (ci->i_ceph_flags & CEPH_I_COMPLETE) && + (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { + spin_unlock(&dir->i_lock); + dout(" dir %p complete, -ENOENT\n", dir); + d_add(dentry, NULL); + di->lease_shared_gen = ci->i_shared_gen; + return NULL; + } + spin_unlock(&dir->i_lock); + } + + op = ceph_snap(dir) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + /* we only need inode linkage */ + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + req->r_locked_dir = dir; + err = ceph_mdsc_do_request(mdsc, NULL, req); + dentry = ceph_finish_lookup(req, dentry, err); + ceph_mdsc_put_request(req); /* will dput(dentry) */ + dout("lookup result=%p\n", dentry); + return dentry; +} + +/* + * If we do a create but get no trace back from the MDS, follow up with + * a lookup (the VFS expects us to link up the provided dentry). + */ +int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) +{ + struct dentry *result = ceph_lookup(dir, dentry, NULL); + + if (result && !IS_ERR(result)) { + /* + * We created the item, then did a lookup, and found + * it was already linked to another inode we already + * had in our cache (and thus got spliced). Link our + * dentry to that inode, but don't hash it, just in + * case the VFS wants to dereference it. 
+ */ + BUG_ON(!result->d_inode); + d_instantiate(dentry, result->d_inode); + return 0; + } + return PTR_ERR(result); +} + +static int ceph_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("mknod in dir %p dentry %p mode 0%o rdev %d\n", + dir, dentry, mode, rdev); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_args.mknod.mode = cpu_to_le32(mode); + req->r_args.mknod.rdev = cpu_to_le32(rdev); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); + if (err) + d_drop(dentry); + return err; +} + +static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + dout("create in dir %p dentry %p name '%.*s'\n", + dir, dentry, dentry->d_name.len, dentry->d_name.name); + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + if (nd) { + BUG_ON((nd->flags & LOOKUP_OPEN) == 0); + dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); + /* hrm, what should i do here if we get aliased? */ + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + return 0; + } + + /* fall back to mknod */ + return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); +} + +static int ceph_symlink(struct inode *dir, struct dentry *dentry, + const char *dest) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_path2 = kstrdup(dest, GFP_NOFS); + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); + if (err) + d_drop(dentry); + return err; +} + +static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* mkdir .snap/foo is a MKSNAP */ + op = CEPH_MDS_OP_MKSNAP; + dout("mksnap dir %p snap '%.*s' dn %p\n", dir, + dentry->d_name.len, dentry->d_name.name, dentry); + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode); + op = CEPH_MDS_OP_MKDIR; + } else { + goto out; + } + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_args.mkdir.mode = cpu_to_le32(mode); + req->r_dentry_drop = 
CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + ceph_mdsc_put_request(req); +out: + if (err < 0) + d_drop(dentry); + return err; +} + +static int ceph_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(dir) != CEPH_NOSNAP) + return -EROFS; + + dout("link in dir %p old_dentry %p dentry %p\n", dir, + old_dentry, dentry); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); + if (IS_ERR(req)) { + d_drop(dentry); + return PTR_ERR(req); + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + err = ceph_mdsc_do_request(mdsc, dir, req); + if (err) { + d_drop(dentry); + } else if (!req->r_reply_info.head->is_dentry) { + ihold(old_dentry->d_inode); + d_instantiate(dentry, old_dentry->d_inode); + } + ceph_mdsc_put_request(req); + return err; +} + +/* + * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it + * looks like the link count will hit 0, drop any other caps (other + * than PIN) we don't specifically want (due to the file still being + * open). + */ +static int drop_caps_for_unlink(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + spin_lock(&inode->i_lock); + if (inode->i_nlink == 1) { + drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); + ci->i_ceph_flags |= CEPH_I_NODELAY; + } + spin_unlock(&inode->i_lock); + return drop; +} + +/* + * rmdir and unlink differ only by the metadata op code + */ +static int ceph_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = dentry->d_inode; + struct ceph_mds_request *req; + int err = -EROFS; + int op; + + if (ceph_snap(dir) == CEPH_SNAPDIR) { + /* rmdir .snap/foo is RMSNAP */ + dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, + dentry->d_name.name, dentry); + op = CEPH_MDS_OP_RMSNAP; + } else if (ceph_snap(dir) == CEPH_NOSNAP) { + dout("unlink/rmdir dir %p dn %p inode %p\n", + dir, dentry, inode); + op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
+ CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; + } else + goto out; + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_locked_dir = dir; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_inode_drop = drop_caps_for_unlink(inode); + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + d_delete(dentry); + ceph_mdsc_put_request(req); +out: + return err; +} + +static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(old_dir) != ceph_snap(new_dir)) + return -EXDEV; + if (ceph_snap(old_dir) != CEPH_NOSNAP || + ceph_snap(new_dir) != CEPH_NOSNAP) + return -EROFS; + dout("rename dir %p dentry %p to dir %p dentry %p\n", + old_dir, old_dentry, new_dir, new_dentry); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_dentry = dget(new_dentry); + req->r_num_caps = 2; + req->r_old_dentry = dget(old_dentry); + req->r_locked_dir = new_dir; + req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_RDCACHE on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; + if (new_dentry->d_inode) + req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); + err = ceph_mdsc_do_request(mdsc, old_dir, req); + if (!err && !req->r_reply_info.head->is_dentry) { + /* + * Normally d_move() is done by fill_trace (called by + * do_request, above). If there is no trace, we need + * to do it here. + */ + + /* d_move screws up d_subdirs order */ + ceph_i_clear(new_dir, CEPH_I_COMPLETE); + + d_move(old_dentry, new_dentry); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + ceph_invalidate_dentry_lease(new_dentry); + } + ceph_mdsc_put_request(req); + return err; +} + +/* + * Ensure a dentry lease will no longer revalidate. + */ +void ceph_invalidate_dentry_lease(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; + ceph_dentry(dentry)->lease_shared_gen = 0; + spin_unlock(&dentry->d_lock); +} + +/* + * Check if dentry lease is valid. If not, delete the lease. Try to + * renew if the lease is more than half up.
+ */ +static int dentry_lease_is_valid(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *s; + int valid = 0; + u32 gen; + unsigned long ttl; + struct ceph_mds_session *session = NULL; + struct inode *dir = NULL; + u32 seq = 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (di && di->lease_session) { + s = di->lease_session; + spin_lock(&s->s_cap_lock); + gen = s->s_cap_gen; + ttl = s->s_cap_ttl; + spin_unlock(&s->s_cap_lock); + + if (di->lease_gen == gen && + time_before(jiffies, dentry->d_time) && + time_before(jiffies, ttl)) { + valid = 1; + if (di->lease_renew_after && + time_after(jiffies, di->lease_renew_after)) { + /* we should renew */ + dir = dentry->d_parent->d_inode; + session = ceph_get_mds_session(s); + seq = di->lease_seq; + di->lease_renew_after = 0; + di->lease_renew_from = jiffies; + } + } + } + spin_unlock(&dentry->d_lock); + + if (session) { + ceph_mdsc_lease_send_msg(session, dir, dentry, + CEPH_MDS_LEASE_RENEW, seq); + ceph_put_mds_session(session); + } + dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); + return valid; +} + +/* + * Check if directory-wide content lease/cap is valid. + */ +static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + int valid = 0; + + spin_lock(&dir->i_lock); + if (ci->i_shared_gen == di->lease_shared_gen) + valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); + spin_unlock(&dir->i_lock); + dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", + dir, (unsigned)ci->i_shared_gen, dentry, + (unsigned)di->lease_shared_gen, valid); + return valid; +} + +/* + * Check if cached dentry can be trusted. + */ +static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *dir; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + dir = dentry->d_parent->d_inode; + + dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode, + ceph_dentry(dentry)->offset); + + /* always trust cached snapped dentries, snapdir dentry */ + if (ceph_snap(dir) != CEPH_NOSNAP) { + dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + goto out_touch; + } + if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) + goto out_touch; + + if (dentry_lease_is_valid(dentry) || + dir_lease_is_valid(dir, dentry)) + goto out_touch; + + dout("d_revalidate %p invalid\n", dentry); + d_drop(dentry); + return 0; +out_touch: + ceph_dentry_lru_touch(dentry); + return 1; +} + +/* + * Release our ceph_dentry_info. + */ +static void ceph_d_release(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + dout("d_release %p\n", dentry); + if (di) { + ceph_dentry_lru_del(dentry); + if (di->lease_session) + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); + dentry->d_fsdata = NULL; + } +} + +static int ceph_snapdir_d_revalidate(struct dentry *dentry, + struct nameidata *nd) +{ + /* + * Eventually, we'll want to revalidate snapped metadata + * too... probably... + */ + return 1; +} + + + +/* + * read() on a dir. This weird interface hack only works if mounted + * with '-o dirstat'. 
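+ * + * For example, on a client mounted with '-o dirstat', reading an open + * directory (e.g. 'cat' on a directory under the mount point) returns + * the entries/files/subdirs/rentries/rfiles/rsubdirs/rbytes/rctime + * summary that is formatted below.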
+ */ +static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct ceph_file_info *cf = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + int left; + const int bufsize = 1024; + + if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + return -EISDIR; + + if (!cf->dir_info) { + cf->dir_info = kmalloc(bufsize, GFP_NOFS); + if (!cf->dir_info) + return -ENOMEM; + cf->dir_info_len = + snprintf(cf->dir_info, bufsize, + "entries: %20lld\n" + " files: %20lld\n" + " subdirs: %20lld\n" + "rentries: %20lld\n" + " rfiles: %20lld\n" + " rsubdirs: %20lld\n" + "rbytes: %20lld\n" + "rctime: %10ld.%09ld\n", + ci->i_files + ci->i_subdirs, + ci->i_files, + ci->i_subdirs, + ci->i_rfiles + ci->i_rsubdirs, + ci->i_rfiles, + ci->i_rsubdirs, + ci->i_rbytes, + (long)ci->i_rctime.tv_sec, + (long)ci->i_rctime.tv_nsec); + } + + if (*ppos >= cf->dir_info_len) + return 0; + size = min_t(unsigned, size, cf->dir_info_len-*ppos); + left = copy_to_user(buf, cf->dir_info + *ppos, size); + if (left == size) + return -EFAULT; + *ppos += (size - left); + return size - left; +} + +/* + * an fsync() on a dir will wait for any uncommitted directory + * operations to commit. + */ +static int ceph_dir_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_dirops; + struct ceph_mds_request *req; + u64 last_tid; + int ret = 0; + + dout("dir_fsync %p\n", inode); + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + req = list_entry(head->prev, + struct ceph_mds_request, r_unsafe_dir_item); + last_tid = req->r_tid; + + do { + ceph_mdsc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + dout("dir_fsync %p wait on tid %llu (until %llu)\n", + inode, req->r_tid, last_tid); + if (req->r_timeout) { + ret = wait_for_completion_timeout( + &req->r_safe_completion, req->r_timeout); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -EIO; /* timed out */ + } else { + wait_for_completion(&req->r_safe_completion); + } + spin_lock(&ci->i_unsafe_lock); + ceph_mdsc_put_request(req); + + if (ret || list_empty(head)) + break; + req = list_entry(head->next, + struct ceph_mds_request, r_unsafe_dir_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); + return ret; +} + +/* + * We maintain a private dentry LRU. + * + * FIXME: this needs to be changed to a per-mds lru to be useful. 
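+ * + * (ceph_dentry_lru_touch() is called from ceph_d_revalidate() above when + * a cached dentry is trusted, and ceph_dentry_lru_del() from + * ceph_d_release(); recently used entries therefore migrate to the tail + * of the list.)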
+ */ +void ceph_dentry_lru_add(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_add %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); + if (di) { + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_add_tail(&di->lru, &mdsc->dentry_lru); + mdsc->num_dentry++; + spin_unlock(&mdsc->dentry_lru_lock); + } +} + +void ceph_dentry_lru_touch(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, + dn->d_name.len, dn->d_name.name, di->offset); + if (di) { + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_move_tail(&di->lru, &mdsc->dentry_lru); + spin_unlock(&mdsc->dentry_lru_lock); + } +} + +void ceph_dentry_lru_del(struct dentry *dn) +{ + struct ceph_dentry_info *di = ceph_dentry(dn); + struct ceph_mds_client *mdsc; + + dout("dentry_lru_del %p %p '%.*s'\n", di, dn, + dn->d_name.len, dn->d_name.name); + if (di) { + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_del_init(&di->lru); + mdsc->num_dentry--; + spin_unlock(&mdsc->dentry_lru_lock); + } +} + +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. + */ +unsigned ceph_dentry_hash(struct dentry *dn) +{ + struct inode *dir = dn->d_parent->d_inode; + struct ceph_inode_info *dci = ceph_inode(dir); + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); + } +} + +const struct file_operations ceph_dir_fops = { + .read = ceph_read_dir, + .readdir = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, + .unlocked_ioctl = ceph_ioctl, + .fsync = ceph_dir_fsync, +}; + +const struct inode_operations ceph_dir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .setattr = ceph_setattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, + .mknod = ceph_mknod, + .symlink = ceph_symlink, + .mkdir = ceph_mkdir, + .link = ceph_link, + .unlink = ceph_unlink, + .rmdir = ceph_unlink, + .rename = ceph_rename, + .create = ceph_create, +}; + +const struct dentry_operations ceph_dentry_ops = { + .d_revalidate = ceph_d_revalidate, + .d_release = ceph_d_release, +}; + +const struct dentry_operations ceph_snapdir_dentry_ops = { + .d_revalidate = ceph_snapdir_d_revalidate, + .d_release = ceph_d_release, +}; + +const struct dentry_operations ceph_snap_dentry_ops = { + .d_release = ceph_d_release, +}; diff --git a/fs/ceph/export.c b/fs/ceph/export.c new file mode 100644 index 00000000..f67b6875 --- /dev/null +++ b/fs/ceph/export.c @@ -0,0 +1,249 @@ +#include + +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +/* + * NFS export support + * + * NFS re-export of a ceph mount is, at present, only semireliable. + * The basic issue is that the Ceph architecture doesn't lend itself + * well to generating filehandles that will remain valid forever. + * + * So, we do our best. If you're lucky, your inode will be in the + * client's cache. If it's not, and you have a connectable fh, then + * the MDS server may be able to find it for you.
Otherwise, you get + * ESTALE. + * + * There are ways to make this more reliable, but in the non-connectable fh + * case, we won't ever work perfectly, and in the connectable case, + * some changes are needed on the MDS side to work better. + */ + +/* + * Basic fh + */ +struct ceph_nfs_fh { + u64 ino; +} __attribute__ ((packed)); + +/* + * Larger 'connectable' fh that includes parent ino and name hash. + * Use this whenever possible, as it works more reliably. + */ +struct ceph_nfs_confh { + u64 ino, parent_ino; + u32 parent_name_hash; +} __attribute__ ((packed)); + +static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, + int connectable) +{ + int type; + struct ceph_nfs_fh *fh = (void *)rawfh; + struct ceph_nfs_confh *cfh = (void *)rawfh; + struct dentry *parent = dentry->d_parent; + struct inode *inode = dentry->d_inode; + int connected_handle_length = sizeof(*cfh)/4; + int handle_length = sizeof(*fh)/4; + + /* don't re-export snaps */ + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EINVAL; + + if (*max_len >= connected_handle_length) { + dout("encode_fh %p connectable\n", dentry); + cfh->ino = ceph_ino(dentry->d_inode); + cfh->parent_ino = ceph_ino(parent->d_inode); + cfh->parent_name_hash = ceph_dentry_hash(parent); + *max_len = connected_handle_length; + type = 2; + } else if (*max_len >= handle_length) { + if (connectable) { + *max_len = connected_handle_length; + return 255; + } + dout("encode_fh %p\n", dentry); + fh->ino = ceph_ino(dentry->d_inode); + *max_len = handle_length; + type = 1; + } else { + *max_len = handle_length; + return 255; + } + return type; +} + +/* + * convert regular fh to dentry + * + * FIXME: we should try harder by querying the mds for the ino. + */ +static struct dentry *__fh_to_dentry(struct super_block *sb, + struct ceph_nfs_fh *fh) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct inode *inode; + struct dentry *dentry; + struct ceph_vino vino; + int err; + + dout("__fh_to_dentry %llx\n", fh->ino); + vino.ino = fh->ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ESTALE); + } + + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", + fh->ino, inode); + iput(inode); + return dentry; + } + err = ceph_init_dentry(dentry); + + if (err < 0) { + iput(inode); + return ERR_PTR(err); + } + dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); + return dentry; +} + +/* + * convert connectable fh to dentry + */ +static struct dentry *__cfh_to_dentry(struct super_block *sb, + struct ceph_nfs_confh *cfh) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct inode *inode; + struct dentry *dentry; + struct ceph_vino vino; + int err; + + dout("__cfh_to_dentry %llx (%llx/%x)\n", + cfh->ino, cfh->parent_ino, cfh->parent_name_hash); + + vino.ino = cfh->ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + req->r_ino1 = vino; + req->r_ino2.ino =
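(A connectable fh carries the parent ino and the parent's name hash -- see struct ceph_nfs_confh and __cfh_to_dentry() below -- which gives the MDS something to search with when the inode is not in the client's cache.)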
cfh->parent_ino; + req->r_ino2.snap = CEPH_NOSNAP; + req->r_path2 = kmalloc(16, GFP_NOFS); + snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(err ? err : -ESTALE); + } + + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", + cfh->ino, inode); + iput(inode); + return dentry; + } + err = ceph_init_dentry(dentry); + if (err < 0) { + iput(inode); + return ERR_PTR(err); + } + dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); + return dentry; +} + +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + if (fh_type == 1) + return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); + else + return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); +} + +/* + * get parent, if possible. + * + * FIXME: we could do better by querying the mds to discover the + * parent. + */ +static struct dentry *ceph_fh_to_parent(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_confh *cfh = (void *)fid->raw; + struct ceph_vino vino; + struct inode *inode; + struct dentry *dentry; + int err; + + if (fh_type == 1) + return ERR_PTR(-ESTALE); + + pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, + cfh->parent_name_hash); + + vino.ino = cfh->ino; + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) + return ERR_PTR(-ESTALE); + + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", + cfh->ino, inode); + iput(inode); + return dentry; + } + err = ceph_init_dentry(dentry); + if (err < 0) { + iput(inode); + return ERR_PTR(err); + } + dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); + return dentry; +} + +const struct export_operations ceph_export_ops = { + .encode_fh = ceph_encode_fh, + .fh_to_dentry = ceph_fh_to_dentry, + .fh_to_parent = ceph_fh_to_parent, +}; diff --git a/fs/ceph/file.c b/fs/ceph/file.c new file mode 100644 index 00000000..4698a5c5 --- /dev/null +++ b/fs/ceph/file.c @@ -0,0 +1,828 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +/* + * Ceph file operations + * + * Implement basic open/close functionality, and implement + * read/write. + * + * We implement three modes of file I/O: + * - buffered uses the generic_file_aio_{read,write} helpers + * + * - synchronous is used when there is multi-client read/write + * sharing, avoids the page cache, and synchronously waits for an + * ack from the OSD. + * + * - direct io takes the variant of the sync path that references + * user pages directly. + * + * fsync() flushes and waits on dirty pages, but just queues metadata + * for writeback: since the MDS can recover size and mtime there is no + * need to wait for MDS acknowledgement. + */ + + +/* + * Prepare an open request. Preallocate ceph_cap to avoid an + * inopportune ENOMEM later. + */ +static struct ceph_mds_request * +prepare_open_request(struct super_block *sb, int flags, int create_mode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int want_auth = USE_ANY_MDS; + int op = (flags & O_CREAT) ? 
CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; + + if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) + want_auth = USE_AUTH_MDS; + + req = ceph_mdsc_create_request(mdsc, op, want_auth); + if (IS_ERR(req)) + goto out; + req->r_fmode = ceph_flags_to_mode(flags); + req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.mode = cpu_to_le32(create_mode); + req->r_args.open.preferred = cpu_to_le32(-1); +out: + return req; +} + +/* + * initialize private struct file data. + * if we fail, clean up by dropping fmode reference on the ceph_inode + */ +static int ceph_init_file(struct inode *inode, struct file *file, int fmode) +{ + struct ceph_file_info *cf; + int ret = 0; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + dout("init_file %p %p 0%o (regular)\n", inode, file, + inode->i_mode); + cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); + if (cf == NULL) { + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + return -ENOMEM; + } + cf->fmode = fmode; + cf->next_offset = 2; + file->private_data = cf; + BUG_ON(inode->i_fop->release != ceph_release); + break; + + case S_IFLNK: + dout("init_file %p %p 0%o (symlink)\n", inode, file, + inode->i_mode); + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + break; + + default: + dout("init_file %p %p 0%o (special)\n", inode, file, + inode->i_mode); + /* + * we need to drop the open ref now, since we don't + * have .release set to ceph_release. + */ + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + BUG_ON(inode->i_fop->release == ceph_release); + + /* call the proper open fop */ + ret = inode->i_fop->open(inode, file); + } + return ret; +} + +/* + * If the filp already has private_data, that means the file was + * already opened by intent during lookup, and we do nothing. + * + * If we already have the requisite capabilities, we can satisfy + * the open request locally (no need to request new caps from the + * MDS). We do, however, need to inform the MDS (asynchronously) + * if our wanted caps set expands. + */ +int ceph_open(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + struct ceph_file_info *cf = file->private_data; + struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + int err; + int flags, fmode, wanted; + + if (cf) { + dout("open file %p is already opened\n", file); + return 0; + } + + /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ + flags = file->f_flags & ~(O_CREAT|O_EXCL); + if (S_ISDIR(inode->i_mode)) + flags = O_DIRECTORY; /* mds likes to know */ + + dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, + ceph_vinop(inode), file, flags, file->f_flags); + fmode = ceph_flags_to_mode(flags); + wanted = ceph_caps_for_mode(fmode); + + /* snapped files are read-only */ + if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) + return -EROFS; + + /* trivially open snapdir */ + if (ceph_snap(inode) == CEPH_SNAPDIR) { + spin_lock(&inode->i_lock); + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + return ceph_init_file(inode, file, fmode); + } + + /* + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set + * asynchronously. 
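+ * + * For example, a second open for read on an inode where we already hold + * FILE_CACHE/FILE_RD caps takes this path with no MDS request at all; + * ceph_check_caps() below is invoked only when the caps we want are + * neither fully issued nor already being asked of the MDS.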
+ */ + spin_lock(&inode->i_lock); + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { + int mds_wanted = __ceph_caps_mds_wanted(ci); + int issued = __ceph_caps_issued(ci, NULL); + + dout("open %p fmode %d want %s issued %s using existing\n", + inode, fmode, ceph_cap_string(wanted), + ceph_cap_string(issued)); + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + + /* adjust wanted? */ + if ((issued & wanted) != wanted && + (mds_wanted & wanted) != wanted && + ceph_snap(inode) != CEPH_SNAPDIR) + ceph_check_caps(ci, 0, NULL); + + return ceph_init_file(inode, file, fmode); + } else if (ceph_snap(inode) != CEPH_NOSNAP && + (ci->i_snap_caps & wanted) == wanted) { + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + return ceph_init_file(inode, file, fmode); + } + spin_unlock(&inode->i_lock); + + dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + if (!err) + err = ceph_init_file(inode, file, req->r_fmode); + ceph_mdsc_put_request(req); + dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); +out: + return err; +} + + +/* + * Do a lookup + open with a single request. + * + * If this succeeds, but some subsequent check in the vfs + * may_open() fails, the struct *file gets cleaned up (i.e. + * ceph_release gets called). So fear not! + */ +/* + * flags + * path_lookup_open -> LOOKUP_OPEN + * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE + */ +struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, + struct nameidata *nd, int mode, + int locked_dir) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct file *file = nd->intent.open.file; + struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); + struct ceph_mds_request *req; + int err; + int flags = nd->intent.open.flags - 1; /* silly vfs! 
*/ + + dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", + dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); + + /* do the open */ + req = prepare_open_request(dir->i_sb, flags, mode); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + if (flags & O_CREAT) { + req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + } + req->r_locked_dir = dir; /* caller holds dir->i_mutex */ + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + dentry = ceph_finish_lookup(req, dentry, err); + if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_handle_notrace_create(dir, dentry); + if (!err) + err = ceph_init_file(req->r_dentry->d_inode, file, + req->r_fmode); + ceph_mdsc_put_request(req); + dout("ceph_lookup_open result=%p\n", dentry); + return dentry; +} + +int ceph_release(struct inode *inode, struct file *file) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *cf = file->private_data; + + dout("release inode %p file %p\n", inode, file); + ceph_put_fmode(ci, cf->fmode); + if (cf->last_readdir) + ceph_mdsc_put_request(cf->last_readdir); + kfree(cf->last_name); + kfree(cf->dir_info); + dput(cf->dentry); + kmem_cache_free(ceph_file_cachep, cf); + + /* wake up anyone waiting for caps on this inode */ + wake_up_all(&ci->i_cap_wq); + return 0; +} + +/* + * Read a range of bytes striped over one or more objects. Iterate over + * objects we stripe over. (That's not atomic, but good enough for now.) + * + * If we get a short result from the OSD, check against i_size; we need to + * only return a short read to the caller if we hit EOF. + */ +static int striped_read(struct inode *inode, + u64 off, u64 len, + struct page **pages, int num_pages, + int *checkeof, bool o_direct, + unsigned long buf_align) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + u64 pos, this_len; + int io_align, page_align; + int left, pages_left; + int read; + struct page **page_pos; + int ret; + bool hit_stripe, was_short; + + /* + * we may need to do multiple reads. not atomic, unfortunately. + */ + pos = off; + left = len; + page_pos = pages; + pages_left = num_pages; + read = 0; + io_align = off & ~PAGE_MASK; + +more: + if (o_direct) + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; + this_len = left; + ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, pos, &this_len, + ci->i_truncate_seq, + ci->i_truncate_size, + page_pos, pages_left, page_align); + if (ret == -ENOENT) + ret = 0; + hit_stripe = this_len < left; + was_short = ret >= 0 && ret < this_len; + dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, + ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + + if (ret > 0) { + int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; + + if (read < pos - off) { + dout(" zero gap %llu to %llu\n", off + read, pos); + ceph_zero_page_vector_range(page_align + read, + pos - off - read, pages); + } + pos += ret; + read = pos - off; + left -= ret; + page_pos += didpages; + pages_left -= didpages; + + /* hit stripe? */ + if (left && hit_stripe) + goto more; + } + + if (was_short) { + /* did we bounce off eof? 
*/ + if (pos + left > inode->i_size) + *checkeof = 1; + + /* zero trailing bytes (inside i_size) */ + if (left > 0 && pos < inode->i_size) { + if (pos + left > inode->i_size) + left = inode->i_size - pos; + + dout("zero tail %d\n", left); + ceph_zero_page_vector_range(page_align + read, left, + pages); + read += left; + } + } + + if (ret >= 0) + ret = read; + dout("striped_read returns %d\n", ret); + return ret; +} + +/* + * Completely synchronous read and write methods. Direct from __user + * buffer to osd, or directly to user pages (if O_DIRECT). + * + * If the read spans object boundary, just do multiple reads. + */ +static ssize_t ceph_sync_read(struct file *file, char __user *data, + unsigned len, loff_t *poff, int *checkeof) +{ + struct inode *inode = file->f_dentry->d_inode; + struct page **pages; + u64 off = *poff; + int num_pages, ret; + + dout("sync_read on file %p %llu~%u %s\n", file, off, len, + (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + if (file->f_flags & O_DIRECT) { + num_pages = calc_pages_for((unsigned long)data, len); + pages = ceph_get_direct_page_vector(data, num_pages, true); + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + } + if (IS_ERR(pages)) + return PTR_ERR(pages); + + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ + ret = filemap_write_and_wait(inode->i_mapping); + if (ret < 0) + goto done; + + ret = striped_read(inode, off, len, pages, num_pages, checkeof, + file->f_flags & O_DIRECT, + (unsigned long)data & ~PAGE_MASK); + + if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) + ret = ceph_copy_page_vector_to_user(pages, data, off, ret); + if (ret >= 0) + *poff = off + ret; + +done: + if (file->f_flags & O_DIRECT) + ceph_put_page_vector(pages, num_pages, true); + else + ceph_release_page_vector(pages, num_pages); + dout("sync_read result %d\n", ret); + return ret; +} + +/* + * Write commit callback, called if we requested both an ACK and + * ONDISK commit reply from the OSD. + */ +static void sync_write_commit(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + struct ceph_inode_info *ci = ceph_inode(req->r_inode); + + dout("sync_write_commit %p tid %llu\n", req, req->r_tid); + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); +} + +/* + * Synchronous write, straight from __user pointer or user pages (if + * O_DIRECT). + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t ceph_sync_write(struct file *file, const char __user *data, + size_t left, loff_t *offset) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct page **pages; + int num_pages; + long long unsigned pos; + u64 len; + int written = 0; + int flags; + int do_sync = 0; + int check_caps = 0; + int page_align, io_align; + unsigned long buf_align; + int ret; + struct timespec mtime = CURRENT_TIME; + + if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u %s\n", file, *offset, + (unsigned)left, (file->f_flags & O_DIRECT) ? 
"O_DIRECT" : ""); + + if (file->f_flags & O_APPEND) + pos = i_size_read(inode); + else + pos = *offset; + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + left) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) + flags |= CEPH_OSD_FLAG_ACK; + else + do_sync = 1; + + /* + * we may need to do multiple writes here if we span an object + * boundary. this isn't atomic, unfortunately. :( + */ +more: + io_align = pos & ~PAGE_MASK; + buf_align = (unsigned long)data & ~PAGE_MASK; + len = left; + if (file->f_flags & O_DIRECT) { + /* write from beginning of first page, regardless of + io alignment */ + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + num_pages = calc_pages_for((unsigned long)data, len); + } else { + page_align = pos & ~PAGE_MASK; + num_pages = calc_pages_for(pos, len); + } + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), pos, &len, + CEPH_OSD_OP_WRITE, flags, + ci->i_snap_realm->cached_context, + do_sync, + ci->i_truncate_seq, ci->i_truncate_size, + &mtime, false, 2, page_align); + if (!req) + return -ENOMEM; + + if (file->f_flags & O_DIRECT) { + pages = ceph_get_direct_page_vector(data, num_pages, false); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out; + } + + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE-1)); + } else { + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out; + } + ret = ceph_copy_user_to_page_vector(pages, data, pos, len); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + goto out; + } + + if ((file->f_flags & O_SYNC) == 0) { + /* get a second commit callback */ + req->r_safe_callback = sync_write_commit; + req->r_own_pages = 1; + } + } + req->r_pages = pages; + req->r_num_pages = num_pages; + req->r_inode = inode; + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + if (req->r_safe_callback) { + /* + * Add to inode unsafe list only after we + * start_request so that a tid has been assigned. + */ + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); + spin_unlock(&ci->i_unsafe_lock); + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + } + + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret < 0 && req->r_safe_callback) { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } + } + + if (file->f_flags & O_DIRECT) + ceph_put_page_vector(pages, num_pages, false); + else if (file->f_flags & O_SYNC) + ceph_release_page_vector(pages, num_pages); + +out: + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + left -= len; + data += written; + if (left) + goto more; + + ret = written; + *offset = pos; + if (pos > i_size_read(inode)) + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, + NULL); + } + return ret; +} + +/* + * Wrap generic_file_aio_read with checks for cap bits on the inode. 
+ * Atomically grab references, so that those bits are not released + * back to the MDS mid-read. + * + * Hmm, the sync read case isn't actually async... should it be? + */ +static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + struct ceph_file_info *fi = filp->private_data; + loff_t *ppos = &iocb->ki_pos; + size_t len = iov->iov_len; + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + void __user *base = iov->iov_base; + ssize_t ret; + int want, got = 0; + int checkeof = 0, read = 0; + + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), pos, (unsigned)len, inode); +again: + __ceph_do_pending_vmtruncate(inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + if (ret < 0) + goto out; + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)len, + ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + /* hmm, this isn't really async... */ + ret = ceph_sync_read(filp, base, len, ppos, &checkeof); + else + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + +out: + dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", + inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + ceph_put_cap_refs(ci, got); + + if (checkeof && ret >= 0) { + int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + + /* hit EOF or hole? */ + if (statret == 0 && *ppos < inode->i_size) { + dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); + read += ret; + base += ret; + len -= ret; + checkeof = 0; + goto again; + } + } + if (ret >= 0) + ret += read; + + return ret; +} + +/* + * Take cap references to avoid releasing caps to MDS mid-write. + * + * If we are synchronous, and write with an old snap context, the OSD + * may return EOLDSNAPC. In that case, retry the write.. _after_ + * dropping our cap refs and allowing the pending snap to logically + * complete _before_ this write occurs. + * + * If we are near ENOSPC, write synchronously. + */ +static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; + loff_t endoff = pos + iov->iov_len; + int want, got = 0; + int ret, err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + +retry_snap: + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + return -ENOSPC; + __ceph_do_pending_vmtruncate(inode); + dout("aio_write %p %llx.%llx %llu~%u getting caps. 
i_size %llu\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + inode->i_size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); + if (ret < 0) + goto out; + + dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { + ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, + &iocb->ki_pos); + } else { + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if ((ret >= 0 || ret == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) + || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); + if (err < 0) + ret = err; + } + } + if (ret >= 0) { + int dirty; + spin_lock(&inode->i_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&inode->i_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + +out: + dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + + if (ret == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); + goto retry_snap; + } + + return ret; +} + +/* + * llseek. be sure to verify file size on SEEK_END. + */ +static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + switch (origin) { + case SEEK_END: + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + if (ret < 0) { + offset = ret; + goto out; + } + offset += inode->i_size; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) { + offset = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + } + + if (offset < 0 || offset > inode->i_sb->s_maxbytes) { + offset = -EINVAL; + goto out; + } + + /* Special lock needed here? 
*/ + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +const struct file_operations ceph_file_fops = { + .open = ceph_open, + .release = ceph_release, + .llseek = ceph_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = ceph_aio_read, + .aio_write = ceph_aio_write, + .mmap = ceph_mmap, + .fsync = ceph_fsync, + .lock = ceph_lock, + .flock = ceph_flock, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .unlocked_ioctl = ceph_ioctl, + .compat_ioctl = ceph_ioctl, +}; + diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c new file mode 100644 index 00000000..d8858e96 --- /dev/null +++ b/fs/ceph/inode.c @@ -0,0 +1,1842 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include + +/* + * Ceph inode operations + * + * Implement basic inode helpers (get, alloc) and inode ops (getattr, + * setattr, etc.), xattr helpers, and helpers for assimilating + * metadata returned by the MDS into our cache. + * + * Also define helpers for doing asynchronous writeback, invalidation, + * and truncation for the benefit of those who can't afford to block + * (typically because they are in the message handler path). + */ + +static const struct inode_operations ceph_symlink_iops; + +static void ceph_invalidate_work(struct work_struct *work); +static void ceph_writeback_work(struct work_struct *work); +static void ceph_vmtruncate_work(struct work_struct *work); + +/* + * find or create an inode, given the ceph ino number + */ +static int ceph_set_ino_cb(struct inode *inode, void *data) +{ + ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + return 0; +} + +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +{ + struct inode *inode; + ino_t t = ceph_vino_to_ino(vino); + + inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); + if (inode == NULL) + return ERR_PTR(-ENOMEM); + if (inode->i_state & I_NEW) { + dout("get_inode created new inode %p %llx.%llx ino %llx\n", + inode, ceph_vinop(inode), (u64)inode->i_ino); + unlock_new_inode(inode); + } + + dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino, + vino.snap, inode); + return inode; +} + +/* + * get/constuct snapdir inode for a given directory + */ +struct inode *ceph_get_snapdir(struct inode *parent) +{ + struct ceph_vino vino = { + .ino = ceph_ino(parent), + .snap = CEPH_SNAPDIR, + }; + struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct ceph_inode_info *ci = ceph_inode(inode); + + BUG_ON(!S_ISDIR(parent->i_mode)); + if (IS_ERR(inode)) + return inode; + inode->i_mode = parent->i_mode; + inode->i_uid = parent->i_uid; + inode->i_gid = parent->i_gid; + inode->i_op = &ceph_dir_iops; + inode->i_fop = &ceph_dir_fops; + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ + ci->i_rbytes = 0; + return inode; +} + +const struct inode_operations ceph_file_iops = { + .permission = ceph_permission, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, +}; + + +/* + * We use a 'frag tree' to keep track of the MDS's directory fragments + * for a given inode (usually there is just a single fragment). 
We + * need to know when a child frag is delegated to a new MDS, or when + * it is flagged as replicated, so we can direct our requests + * accordingly. + */ + +/* + * find/create a frag in the tree + */ +static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, + u32 f) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_frag *frag; + int c; + + p = &ci->i_fragtree.rb_node; + while (*p) { + parent = *p; + frag = rb_entry(parent, struct ceph_inode_frag, node); + c = ceph_frag_compare(f, frag->frag); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else + return frag; + } + + frag = kmalloc(sizeof(*frag), GFP_NOFS); + if (!frag) { + pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx " + "frag %x\n", &ci->vfs_inode, + ceph_vinop(&ci->vfs_inode), f); + return ERR_PTR(-ENOMEM); + } + frag->frag = f; + frag->split_by = 0; + frag->mds = -1; + frag->ndist = 0; + + rb_link_node(&frag->node, parent, p); + rb_insert_color(&frag->node, &ci->i_fragtree); + + dout("get_or_create_frag added %llx.%llx frag %x\n", + ceph_vinop(&ci->vfs_inode), f); + return frag; +} + +/* + * find a specific frag @f + */ +struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) +{ + struct rb_node *n = ci->i_fragtree.rb_node; + + while (n) { + struct ceph_inode_frag *frag = + rb_entry(n, struct ceph_inode_frag, node); + int c = ceph_frag_compare(f, frag->frag); + if (c < 0) + n = n->rb_left; + else if (c > 0) + n = n->rb_right; + else + return frag; + } + return NULL; +} + +/* + * Choose frag containing the given value @v. If @pfrag is + * specified, copy the frag delegation info to the caller if + * it is present. + */ +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found) +{ + u32 t = ceph_frag_make(0, 0); + struct ceph_inode_frag *frag; + unsigned nway, i; + u32 n; + + if (found) + *found = 0; + + mutex_lock(&ci->i_fragtree_mutex); + while (1) { + WARN_ON(!ceph_frag_contains_value(t, v)); + frag = __ceph_find_frag(ci, t); + if (!frag) + break; /* t is a leaf */ + if (frag->split_by == 0) { + if (pfrag) + memcpy(pfrag, frag, sizeof(*pfrag)); + if (found) + *found = 1; + break; + } + + /* choose child */ + nway = 1 << frag->split_by; + dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t, + frag->split_by, nway); + for (i = 0; i < nway; i++) { + n = ceph_frag_make_child(t, frag->split_by, i); + if (ceph_frag_contains_value(n, v)) { + t = n; + break; + } + } + BUG_ON(i == nway); + } + dout("choose_frag(%x) = %x\n", v, t); + + mutex_unlock(&ci->i_fragtree_mutex); + return t; +} + +/* + * Process dirfrag (delegation) info from the mds. Include leaf + * fragment in tree ONLY if ndist > 0. Otherwise, only + * branches/splits are included in i_fragtree) + */ +static int ceph_fill_dirfrag(struct inode *inode, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + u32 id = le32_to_cpu(dirinfo->frag); + int mds = le32_to_cpu(dirinfo->auth); + int ndist = le32_to_cpu(dirinfo->ndist); + int i; + int err = 0; + + mutex_lock(&ci->i_fragtree_mutex); + if (ndist == 0) { + /* no delegation info needed. 
*/ + frag = __ceph_find_frag(ci, id); + if (!frag) + goto out; + if (frag->split_by == 0) { + /* tree leaf, remove */ + dout("fill_dirfrag removed %llx.%llx frag %x" + " (no ref)\n", ceph_vinop(inode), id); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } else { + /* tree branch, keep and clear */ + dout("fill_dirfrag cleared %llx.%llx frag %x" + " referral\n", ceph_vinop(inode), id); + frag->mds = -1; + frag->ndist = 0; + } + goto out; + } + + + /* find/add this frag to store mds delegation info */ + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) { + /* this is not the end of the world; we can continue + with bad/inaccurate delegation info */ + pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", + ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); + err = -ENOMEM; + goto out; + } + + frag->mds = mds; + frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); + for (i = 0; i < frag->ndist; i++) + frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); + dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", + ceph_vinop(inode), frag->frag, frag->ndist); + +out: + mutex_unlock(&ci->i_fragtree_mutex); + return err; +} + + +/* + * initialize a newly allocated inode. + */ +struct inode *ceph_alloc_inode(struct super_block *sb) +{ + struct ceph_inode_info *ci; + int i; + + ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + if (!ci) + return NULL; + + dout("alloc_inode %p\n", &ci->vfs_inode); + + ci->i_version = 0; + ci->i_time_warp_seq = 0; + ci->i_ceph_flags = 0; + ci->i_release_count = 0; + ci->i_symlink = NULL; + + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + + ci->i_fragtree = RB_ROOT; + mutex_init(&ci->i_fragtree_mutex); + + ci->i_xattrs.blob = NULL; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + ci->i_xattrs.index = RB_ROOT; + ci->i_xattrs.count = 0; + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.version = 0; + ci->i_xattrs.index_version = 0; + + ci->i_caps = RB_ROOT; + ci->i_auth_cap = NULL; + ci->i_dirty_caps = 0; + ci->i_flushing_caps = 0; + INIT_LIST_HEAD(&ci->i_dirty_item); + INIT_LIST_HEAD(&ci->i_flushing_item); + ci->i_cap_flush_seq = 0; + ci->i_cap_flush_last_tid = 0; + memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); + init_waitqueue_head(&ci->i_cap_wq); + ci->i_hold_caps_min = 0; + ci->i_hold_caps_max = 0; + INIT_LIST_HEAD(&ci->i_cap_delay_list); + ci->i_cap_exporting_mds = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_issued = 0; + INIT_LIST_HEAD(&ci->i_cap_snaps); + ci->i_head_snapc = NULL; + ci->i_snap_caps = 0; + + for (i = 0; i < CEPH_FILE_MODE_NUM; i++) + ci->i_nr_by_mode[i] = 0; + + ci->i_truncate_seq = 0; + ci->i_truncate_size = 0; + ci->i_truncate_pending = 0; + + ci->i_max_size = 0; + ci->i_reported_size = 0; + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + + ci->i_pin_ref = 0; + ci->i_rd_ref = 0; + ci->i_rdcache_ref = 0; + ci->i_wr_ref = 0; + ci->i_wb_ref = 0; + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + ci->i_shared_gen = 0; + ci->i_rdcache_gen = 0; + ci->i_rdcache_revoking = 0; + + INIT_LIST_HEAD(&ci->i_unsafe_writes); + INIT_LIST_HEAD(&ci->i_unsafe_dirops); + spin_lock_init(&ci->i_unsafe_lock); + + ci->i_snap_realm = NULL; + INIT_LIST_HEAD(&ci->i_snap_realm_item); + INIT_LIST_HEAD(&ci->i_snap_flush_item); + + INIT_WORK(&ci->i_wb_work, ceph_writeback_work); + INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work); + + INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); + + return &ci->vfs_inode; +} + +static void ceph_i_callback(struct 
rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ceph_inode_info *ci = ceph_inode(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ceph_inode_cachep, ci); +} + +void ceph_destroy_inode(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + struct rb_node *n; + + dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + + ceph_queue_caps_release(inode); + + /* + * we may still have a snap_realm reference if there are stray + * caps in i_cap_exporting_issued or i_snap_caps. + */ + if (ci->i_snap_realm) { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_snap_realm *realm = ci->i_snap_realm; + + dout(" dropping residual ref to snap realm %p\n", realm); + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } + + kfree(ci->i_symlink); + while ((n = rb_first(&ci->i_fragtree)) != NULL) { + frag = rb_entry(n, struct ceph_inode_frag, node); + rb_erase(n, &ci->i_fragtree); + kfree(frag); + } + + __ceph_destroy_xattrs(ci); + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + + call_rcu(&inode->i_rcu, ceph_i_callback); +} + + +/* + * Helpers to fill in size, ctime, mtime, and atime. We have to be + * careful because either the client or MDS may have more up to date + * info, depending on which capabilities are held, and whether + * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime + * and size are monotonically increasing, except when utimes() or + * truncate() increments the corresponding _seq values.) 
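+ * + * For example, if the MDS reports truncate_seq 5 while we hold 4, its + * size wins even if it is smaller than ours (and a vmtruncate may be + * queued); if the seqs are equal, i_size is only ever moved forward.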
+ */ +int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int queue_trunc = 0; + + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || + (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { + dout("size %lld -> %llu\n", inode->i_size, size); + inode->i_size = size; + inode->i_blocks = (size + (1<<9) - 1) >> 9; + ci->i_reported_size = size; + if (truncate_seq != ci->i_truncate_seq) { + dout("truncate_seq %u -> %u\n", + ci->i_truncate_seq, truncate_seq); + ci->i_truncate_seq = truncate_seq; + /* + * If we hold relevant caps, or in the case where we're + * not the only client referencing this file and we + * don't hold those caps, then we need to check whether + * the file is either opened or mmaped + */ + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| + CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| + CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_LAZYIO)) || + mapping_mapped(inode->i_mapping) || + __ceph_caps_file_wanted(ci)) { + ci->i_truncate_pending++; + queue_trunc = 1; + } + } + } + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && + ci->i_truncate_size != truncate_size) { + dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, + truncate_size); + ci->i_truncate_size = truncate_size; + } + return queue_trunc; +} + +void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec *ctime, + struct timespec *mtime, struct timespec *atime) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int warn = 0; + + if (issued & (CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_WR| + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { + if (timespec_compare(ctime, &inode->i_ctime) > 0) { + dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + ctime->tv_sec, ctime->tv_nsec); + inode->i_ctime = *ctime; + } + if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { + /* the MDS did a utimes() */ + dout("mtime %ld.%09ld -> %ld.%09ld " + "tw %d -> %d\n", + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec, + ci->i_time_warp_seq, (int)time_warp_seq); + + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else if (time_warp_seq == ci->i_time_warp_seq) { + /* nobody did utimes(); take the max */ + if (timespec_compare(mtime, &inode->i_mtime) > 0) { + dout("mtime %ld.%09ld -> %ld.%09ld inc\n", + inode->i_mtime.tv_sec, + inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec); + inode->i_mtime = *mtime; + } + if (timespec_compare(atime, &inode->i_atime) > 0) { + dout("atime %ld.%09ld -> %ld.%09ld inc\n", + inode->i_atime.tv_sec, + inode->i_atime.tv_nsec, + atime->tv_sec, atime->tv_nsec); + inode->i_atime = *atime; + } + } else if (issued & CEPH_CAP_FILE_EXCL) { + /* we did a utimes(); ignore mds values */ + } else { + warn = 1; + } + } else { + /* we have no write|excl caps; whatever the MDS says is true */ + if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { + inode->i_ctime = *ctime; + inode->i_mtime = *mtime; + inode->i_atime = *atime; + ci->i_time_warp_seq = time_warp_seq; + } else { + warn = 1; + } + } + if (warn) /* time_warp_seq shouldn't go backwards */ + dout("%p mds time_warp_seq %llu < %u\n", + inode, time_warp_seq, ci->i_time_warp_seq); +} + +/* + * Populate an inode based on info from mds. May be called on new or + * existing inodes. 
+ */ +static int fill_inode(struct inode *inode, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, + unsigned long ttl_from, int cap_fmode, + struct ceph_cap_reservation *caps_reservation) +{ + struct ceph_mds_reply_inode *info = iinfo->in; + struct ceph_inode_info *ci = ceph_inode(inode); + int i; + int issued, implemented; + struct timespec mtime, atime, ctime; + u32 nsplits; + struct ceph_buffer *xattr_blob = NULL; + int err = 0; + int queue_trunc = 0; + + dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", + inode, ceph_vinop(inode), le64_to_cpu(info->version), + ci->i_version); + + /* + * prealloc xattr data, if it looks like we'll need it. only + * if len > 4 (meaning there are actually xattrs; the first 4 + * bytes are the xattr count). + */ + if (iinfo->xattr_len > 4) { + xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); + if (!xattr_blob) + pr_err("fill_inode ENOMEM xattr blob %d bytes\n", + iinfo->xattr_len); + } + + spin_lock(&inode->i_lock); + + /* + * provided version will be odd if inode value is projected, + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update + */ + if (le64_to_cpu(info->version) > 0 && + (ci->i_version & ~1) >= le64_to_cpu(info->version)) + goto no_change; + + issued = __ceph_caps_issued(ci, &implemented); + issued |= implemented | __ceph_caps_dirty(ci); + + /* update inode */ + ci->i_version = le64_to_cpu(info->version); + inode->i_version++; + inode->i_rdev = le32_to_cpu(info->rdev); + + if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = le32_to_cpu(info->mode); + inode->i_uid = le32_to_cpu(info->uid); + inode->i_gid = le32_to_cpu(info->gid); + dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, + inode->i_uid, inode->i_gid); + } + + if ((issued & CEPH_CAP_LINK_EXCL) == 0) + inode->i_nlink = le32_to_cpu(info->nlink); + + /* be careful with mtime, atime, size */ + ceph_decode_timespec(&atime, &info->atime); + ceph_decode_timespec(&mtime, &info->mtime); + ceph_decode_timespec(&ctime, &info->ctime); + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); + + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + + ci->i_layout = info->layout; + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + + /* xattrs */ + /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. 
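The projected-versus-stable version check near the top of fill_inode() above can be read as a standalone predicate; a small sketch of it, using the same 2/2-skip, 3/2-skip, 3/3-update table as the comment (illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Even versions are stable, odd versions are projected.  Clearing the
 * low bit lets equally-new projected info from the MDS still win over
 * a projected local version. */
static int should_skip_update(uint64_t ours, uint64_t theirs)
{
        return theirs > 0 && (ours & ~1ULL) >= theirs;
}

int main(void)
{
        printf("%d\n", should_skip_update(2, 2)); /* 1: skip   */
        printf("%d\n", should_skip_update(3, 2)); /* 1: skip   */
        printf("%d\n", should_skip_update(3, 3)); /* 0: update */
        return 0;
}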
*/ + if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && + le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = xattr_blob; + if (xattr_blob) + memcpy(ci->i_xattrs.blob->vec.iov_base, + iinfo->xattr_data, iinfo->xattr_len); + ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + xattr_blob = NULL; + } + + inode->i_mapping->a_ops = &ceph_aops; + inode->i_mapping->backing_dev_info = + &ceph_sb_to_client(inode->i_sb)->backing_dev_info; + + switch (inode->i_mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + inode->i_op = &ceph_file_iops; + break; + case S_IFREG: + inode->i_op = &ceph_file_iops; + inode->i_fop = &ceph_file_fops; + break; + case S_IFLNK: + inode->i_op = &ceph_symlink_iops; + if (!ci->i_symlink) { + int symlen = iinfo->symlink_len; + char *sym; + + BUG_ON(symlen != inode->i_size); + spin_unlock(&inode->i_lock); + + err = -ENOMEM; + sym = kmalloc(symlen+1, GFP_NOFS); + if (!sym) + goto out; + memcpy(sym, iinfo->symlink, symlen); + sym[symlen] = 0; + + spin_lock(&inode->i_lock); + if (!ci->i_symlink) + ci->i_symlink = sym; + else + kfree(sym); /* lost a race */ + } + break; + case S_IFDIR: + inode->i_op = &ceph_dir_iops; + inode->i_fop = &ceph_dir_fops; + + ci->i_dir_layout = iinfo->dir_layout; + + ci->i_files = le64_to_cpu(info->files); + ci->i_subdirs = le64_to_cpu(info->subdirs); + ci->i_rbytes = le64_to_cpu(info->rbytes); + ci->i_rfiles = le64_to_cpu(info->rfiles); + ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ceph_decode_timespec(&ci->i_rctime, &info->rctime); + + /* set dir completion flag? */ + if (ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + dout(" marking %p complete (empty)\n", inode); + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ci->i_max_offset = 2; + } + break; + default: + pr_err("fill_inode %llx.%llx BAD mode 0%o\n", + ceph_vinop(inode), inode->i_mode); + } + +no_change: + spin_unlock(&inode->i_lock); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + /* FIXME: move me up, if/when version reflects fragtree changes */ + nsplits = le32_to_cpu(info->fragtree.nsplits); + mutex_lock(&ci->i_fragtree_mutex); + for (i = 0; i < nsplits; i++) { + u32 id = le32_to_cpu(info->fragtree.splits[i].frag); + struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); + + if (IS_ERR(frag)) + continue; + frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); + dout(" frag %x split by %d\n", frag->frag, frag->split_by); + } + mutex_unlock(&ci->i_fragtree_mutex); + + /* were we issued a capability? 
*/ + if (info->cap.caps) { + if (ceph_snap(inode) == CEPH_NOSNAP) { + ceph_add_cap(inode, session, + le64_to_cpu(info->cap.cap_id), + cap_fmode, + le32_to_cpu(info->cap.caps), + le32_to_cpu(info->cap.wanted), + le32_to_cpu(info->cap.seq), + le32_to_cpu(info->cap.mseq), + le64_to_cpu(info->cap.realm), + info->cap.flags, + caps_reservation); + } else { + spin_lock(&inode->i_lock); + dout(" %p got snap_caps %s\n", inode, + ceph_cap_string(le32_to_cpu(info->cap.caps))); + ci->i_snap_caps |= le32_to_cpu(info->cap.caps); + if (cap_fmode >= 0) + __ceph_get_fmode(ci, cap_fmode); + spin_unlock(&inode->i_lock); + } + } else if (cap_fmode >= 0) { + pr_warning("mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); + __ceph_get_fmode(ci, cap_fmode); + } + + /* update delegation info? */ + if (dirinfo) + ceph_fill_dirfrag(inode, dirinfo); + + err = 0; + +out: + if (xattr_blob) + ceph_buffer_put(xattr_blob); + return err; +} + +/* + * caller should hold session s_mutex. + */ +static void update_dentry_lease(struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + long unsigned duration = le32_to_cpu(lease->duration_ms); + long unsigned ttl = from_time + (duration * HZ) / 1000; + long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; + struct inode *dir; + + /* only track leases on regular dentries */ + if (dentry->d_op != &ceph_dentry_ops) + return; + + spin_lock(&dentry->d_lock); + dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n", + dentry, le16_to_cpu(lease->mask), duration, ttl); + + /* make lease_rdcache_gen match directory */ + dir = dentry->d_parent->d_inode; + di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; + + if (lease->mask == 0) + goto out_unlock; + + if (di->lease_gen == session->s_cap_gen && + time_before(ttl, dentry->d_time)) + goto out_unlock; /* we already have a newer lease. */ + + if (di->lease_session && di->lease_session != session) + goto out_unlock; + + ceph_dentry_lru_touch(dentry); + + if (!di->lease_session) + di->lease_session = ceph_get_mds_session(session); + di->lease_gen = session->s_cap_gen; + di->lease_seq = le32_to_cpu(lease->seq); + di->lease_renew_after = half_ttl; + di->lease_renew_from = 0; + dentry->d_time = ttl; +out_unlock: + spin_unlock(&dentry->d_lock); + return; +} + +/* + * Set dentry's directory position based on the current dir's max, and + * order it in d_subdirs, so that dcache_readdir behaves. + */ +static void ceph_set_dentry_offset(struct dentry *dn) +{ + struct dentry *dir = dn->d_parent; + struct inode *inode = dn->d_parent->d_inode; + struct ceph_dentry_info *di; + + BUG_ON(!inode); + + di = ceph_dentry(dn); + + spin_lock(&inode->i_lock); + if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + spin_unlock(&inode->i_lock); + return; + } + di->offset = ceph_inode(inode)->i_max_offset++; + spin_unlock(&inode->i_lock); + + spin_lock(&dir->d_lock); + spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); + list_move(&dn->d_u.d_child, &dir->d_subdirs); + dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, + dn->d_u.d_child.prev, dn->d_u.d_child.next); + spin_unlock(&dn->d_lock); + spin_unlock(&dir->d_lock); +} + +/* + * splice a dentry to an inode. + * caller must hold directory i_mutex for this to be safe. + * + * we will only rehash the resulting dentry if @prehash is + * true; @prehash will be set to false (for the benefit of + * the caller) if we fail. 
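update_dentry_lease() above turns the MDS-supplied duration_ms into jiffies-based ttl and renew-after times; the arithmetic on its own, as a sketch (HZ = 250 is just an assumed tick rate for the demo):

#include <stdio.h>

#define HZ 250                          /* assumed tick rate for the demo */

static unsigned long lease_ttl(unsigned long from_time, unsigned long duration_ms)
{
        return from_time + (duration_ms * HZ) / 1000;
}

static unsigned long lease_renew_after(unsigned long from_time,
                                       unsigned long duration_ms)
{
        /* start renewing once half of the lease has elapsed */
        return from_time + (duration_ms * HZ / 2) / 1000;
}

int main(void)
{
        /* a 30 s lease granted at jiffies = 1000 */
        printf("ttl=%lu renew_after=%lu\n",
               lease_ttl(1000, 30000), lease_renew_after(1000, 30000));
        return 0;
}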
+ */ +static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, + bool *prehash, bool set_offset) +{ + struct dentry *realdn; + + BUG_ON(dn->d_inode); + + /* dn must be unhashed */ + if (!d_unhashed(dn)) + d_drop(dn); + realdn = d_materialise_unique(dn, in); + if (IS_ERR(realdn)) { + pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); + if (prehash) + *prehash = false; /* don't rehash on error */ + dn = realdn; /* note realdn contains the error */ + goto out; + } else if (realdn) { + dout("dn %p (%d) spliced with %p (%d) " + "inode %p ino %llx.%llx\n", + dn, dn->d_count, + realdn, realdn->d_count, + realdn->d_inode, ceph_vinop(realdn->d_inode)); + dput(dn); + dn = realdn; + } else { + BUG_ON(!ceph_dentry(dn)); + dout("dn %p attached to %p ino %llx.%llx\n", + dn, dn->d_inode, ceph_vinop(dn->d_inode)); + } + if ((!prehash || *prehash) && d_unhashed(dn)) + d_rehash(dn); + if (set_offset) + ceph_set_dentry_offset(dn); +out: + return dn; +} + +/* + * Incorporate results into the local cache. This is either just + * one inode, or a directory, dentry, and possibly linked-to inode (e.g., + * after a lookup). + * + * A reply may contain + * a directory inode along with a dentry. + * and/or a target inode + * + * Called with snap_rwsem (read). + */ +int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct inode *in = NULL; + struct ceph_mds_reply_inode *ininfo; + struct ceph_vino vino; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int i = 0; + int err = 0; + + dout("fill_trace %p is_dentry %d is_target %d\n", req, + rinfo->head->is_dentry, rinfo->head->is_target); + +#if 0 + /* + * Debugging hook: + * + * If we resend completed ops to a recovering mds, we get no + * trace. Since that is very rare, pretend this is the case + * to ensure the 'no trace' handlers in the callers behave. + * + * Fill in inodes unconditionally to avoid breaking cap + * invariants. 
+ */ + if (rinfo->head->op & CEPH_MDS_OP_WRITE) { + pr_info("fill_trace faking empty trace on %lld %s\n", + req->r_tid, ceph_mds_op_name(rinfo->head->op)); + if (rinfo->head->is_dentry) { + rinfo->head->is_dentry = 0; + err = fill_inode(req->r_locked_dir, + &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1); + } + if (rinfo->head->is_target) { + rinfo->head->is_target = 0; + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + in = ceph_get_inode(sb, vino); + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + req->r_fmode); + iput(in); + } + } +#endif + + if (!rinfo->head->is_target && !rinfo->head->is_dentry) { + dout("fill_trace reply is empty!\n"); + if (rinfo->head->result == 0 && req->r_locked_dir) + ceph_invalidate_dir_request(req); + return 0; + } + + if (rinfo->head->is_dentry) { + struct inode *dir = req->r_locked_dir; + + err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1, + &req->r_caps_reservation); + if (err < 0) + return err; + } + + /* + * ignore null lease/binding on snapdir ENOENT, or else we + * will have trouble splicing in the virtual snapdir later + */ + if (rinfo->head->is_dentry && !req->r_aborted && + (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, + fsc->mount_options->snapdir_name, + req->r_dentry->d_name.len))) { + /* + * lookup link rename : null -> possibly existing inode + * mknod symlink mkdir : null -> new inode + * unlink : linked -> null + */ + struct inode *dir = req->r_locked_dir; + struct dentry *dn = req->r_dentry; + bool have_dir_cap, have_lease; + + BUG_ON(!dn); + BUG_ON(!dir); + BUG_ON(dn->d_parent->d_inode != dir); + BUG_ON(ceph_ino(dir) != + le64_to_cpu(rinfo->diri.in->ino)); + BUG_ON(ceph_snap(dir) != + le64_to_cpu(rinfo->diri.in->snapid)); + + /* do we have a lease on the whole dir? */ + have_dir_cap = + (le32_to_cpu(rinfo->diri.in->cap.caps) & + CEPH_CAP_FILE_SHARED); + + /* do we have a dn lease? */ + have_lease = have_dir_cap || + (le16_to_cpu(rinfo->dlease->mask) & + CEPH_LOCK_DN); + + if (!have_lease) + dout("fill_trace no dentry lease or dir cap\n"); + + /* rename? */ + if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + dout(" src %p '%.*s' dst %p '%.*s'\n", + req->r_old_dentry, + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + dn, dn->d_name.len, dn->d_name.name); + dout("fill_trace doing d_move %p -> %p\n", + req->r_old_dentry, dn); + + d_move(req->r_old_dentry, dn); + dout(" src %p '%.*s' dst %p '%.*s'\n", + req->r_old_dentry, + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + dn, dn->d_name.len, dn->d_name.name); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + ceph_invalidate_dentry_lease(dn); + + /* + * d_move() puts the renamed dentry at the end of + * d_subdirs. We need to assign it an appropriate + * directory offset so we can behave when holding + * I_COMPLETE. + */ + ceph_set_dentry_offset(req->r_old_dentry); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, + ceph_dentry(req->r_old_dentry)->offset); + + dn = req->r_old_dentry; /* use old_dentry */ + in = dn->d_inode; + } + + /* null dentry? 
*/ + if (!rinfo->head->is_target) { + dout("fill_trace null dentry\n"); + if (dn->d_inode) { + dout("d_delete %p\n", dn); + d_delete(dn); + } else { + dout("d_instantiate %p NULL\n", dn); + d_instantiate(dn, NULL); + if (have_lease && d_unhashed(dn)) + d_rehash(dn); + update_dentry_lease(dn, rinfo->dlease, + session, + req->r_request_started); + } + goto done; + } + + /* attach proper inode */ + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + in = dn->d_inode; + if (!in) { + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + pr_err("fill_trace bad get_inode " + "%llx.%llx\n", vino.ino, vino.snap); + err = PTR_ERR(in); + d_delete(dn); + goto done; + } + dn = splice_dentry(dn, in, &have_lease, true); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + goto done; + } + req->r_dentry = dn; /* may have spliced */ + ihold(in); + } else if (ceph_ino(in) == vino.ino && + ceph_snap(in) == vino.snap) { + ihold(in); + } else { + dout(" %p links to %p %llx.%llx, not %llx.%llx\n", + dn, in, ceph_ino(in), ceph_snap(in), + vino.ino, vino.snap); + have_lease = false; + in = NULL; + } + + if (have_lease) + update_dentry_lease(dn, rinfo->dlease, session, + req->r_request_started); + dout(" final dn %p\n", dn); + i++; + } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP) { + struct dentry *dn = req->r_dentry; + + /* fill out a snapdir LOOKUPSNAP dentry */ + BUG_ON(!dn); + BUG_ON(!req->r_locked_dir); + BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); + ininfo = rinfo->targeti.in; + vino.ino = le64_to_cpu(ininfo->ino); + vino.snap = le64_to_cpu(ininfo->snapid); + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + pr_err("fill_inode get_inode badness %llx.%llx\n", + vino.ino, vino.snap); + err = PTR_ERR(in); + d_delete(dn); + goto done; + } + dout(" linking snapped dir %p to dn %p\n", in, dn); + dn = splice_dentry(dn, in, NULL, true); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + goto done; + } + req->r_dentry = dn; /* may have spliced */ + ihold(in); + rinfo->head->is_dentry = 1; /* fool notrace handlers */ + } + + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + if (in == NULL || ceph_ino(in) != vino.ino || + ceph_snap(in) != vino.snap) { + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + } + req->r_target_inode = in; + + err = fill_inode(in, + &rinfo->targeti, NULL, + session, req->r_request_started, + (le32_to_cpu(rinfo->head->result) == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } + } + +done: + dout("fill_trace done err=%d\n", err); + return err; +} + +/* + * Prepopulate our cache with readdir results, leases, etc. 
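The readdir prepopulate loop further below stores each dentry's directory offset as ceph_make_fpos(frag, index). That helper is defined elsewhere (super.h, not in this hunk); assuming it simply packs the fragment id into the high 32 bits, the encoding looks like this sketch:

#include <stdint.h>
#include <stdio.h>

/* assumed packing: fragment id in the high 32 bits, per-fragment index
 * in the low 32 bits, so entries within one fragment stay ordered */
static uint64_t make_fpos(uint32_t frag, uint32_t off)
{
        return ((uint64_t)frag << 32) | off;
}

int main(void)
{
        /* index 4 within fragment 0x01000000 (values illustrative) */
        printf("%#llx\n", (unsigned long long)make_fpos(0x01000000, 4));
        return 0;
}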
+ */ +int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct dentry *parent = req->r_dentry; + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct qstr dname; + struct dentry *dn; + struct inode *in; + int err = 0, i; + struct inode *snapdir = NULL; + struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; + u64 frag = le32_to_cpu(rhead->args.readdir.frag); + struct ceph_dentry_info *di; + + if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { + snapdir = ceph_get_snapdir(parent->d_inode); + parent = d_find_alias(snapdir); + dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", + rinfo->dir_nr, parent); + } else { + dout("readdir_prepopulate %d items under dn %p\n", + rinfo->dir_nr, parent); + if (rinfo->dir_dir) + ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); + } + + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_vino vino; + + dname.name = rinfo->dir_dname[i]; + dname.len = rinfo->dir_dname_len[i]; + dname.hash = full_name_hash(dname.name, dname.len); + + vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); + vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dout("d_alloc badness\n"); + err = -ENOMEM; + goto out; + } + err = ceph_init_dentry(dn); + if (err < 0) { + dput(dn); + goto out; + } + } else if (dn->d_inode && + (ceph_ino(dn->d_inode) != vino.ino || + ceph_snap(dn->d_inode) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, dn->d_inode); + d_delete(dn); + dput(dn); + goto retry_lookup; + } else { + /* reorder parent's d_subdirs */ + spin_lock(&parent->d_lock); + spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); + list_move(&dn->d_u.d_child, &parent->d_subdirs); + spin_unlock(&dn->d_lock); + spin_unlock(&parent->d_lock); + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + + /* inode */ + if (dn->d_inode) { + in = dn->d_inode; + } else { + in = ceph_get_inode(parent->d_sb, vino); + if (IS_ERR(in)) { + dout("new_inode badness\n"); + d_delete(dn); + dput(dn); + err = PTR_ERR(in); + goto out; + } + dn = splice_dentry(dn, in, NULL, false); + if (IS_ERR(dn)) + dn = NULL; + } + + if (fill_inode(in, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation) < 0) { + pr_err("fill_inode badness on %p\n", in); + goto next_item; + } + if (dn) + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); +next_item: + if (dn) + dput(dn); + } + req->r_did_prepopulate = true; + +out: + if (snapdir) { + iput(snapdir); + dput(parent); + } + dout("readdir_prepopulate done\n"); + return err; +} + +int ceph_inode_set_size(struct inode *inode, loff_t size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret = 0; + + spin_lock(&inode->i_lock); + dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); + inode->i_size = size; + inode->i_blocks = (size + (1 << 9) - 1) >> 9; + + /* tell the MDS if we are approaching max_size */ + if ((size << 1) >= ci->i_max_size && + (ci->i_reported_size << 1) < ci->i_max_size) + ret = 1; + + spin_unlock(&inode->i_lock); + return ret; +} + +/* + * Write back inode data in a worker thread. (This can't be done + * in the message handler context.) 
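ceph_inode_set_size() above asks its caller to report back to the MDS only when the file size first crosses half of i_max_size; the condition in isolation (numbers illustrative):

#include <stdint.h>
#include <stdio.h>

/* report when the new size has reached half of max_size but the last
 * size reported to the MDS had not */
static int should_report(uint64_t size, uint64_t reported, uint64_t max_size)
{
        return (size << 1) >= max_size && (reported << 1) < max_size;
}

int main(void)
{
        printf("%d\n", should_report(600, 100, 1000)); /* 1: just crossed */
        printf("%d\n", should_report(700, 600, 1000)); /* 0: already reported */
        return 0;
}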
+ */ +void ceph_queue_writeback(struct inode *inode) +{ + if (queue_work(ceph_inode_to_client(inode)->wb_wq, + &ceph_inode(inode)->i_wb_work)) { + dout("ceph_queue_writeback %p\n", inode); + ihold(inode); + } else { + dout("ceph_queue_writeback %p failed\n", inode); + } +} + +static void ceph_writeback_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_wb_work); + struct inode *inode = &ci->vfs_inode; + + dout("writeback %p\n", inode); + filemap_fdatawrite(&inode->i_data); + iput(inode); +} + +/* + * queue an async invalidation + */ +void ceph_queue_invalidate(struct inode *inode) +{ + if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, + &ceph_inode(inode)->i_pg_inv_work)) { + dout("ceph_queue_invalidate %p\n", inode); + ihold(inode); + } else { + dout("ceph_queue_invalidate %p failed\n", inode); + } +} + +/* + * invalidate any pages that are not dirty or under writeback. this + * includes pages that are clean and mapped. + */ +static void ceph_invalidate_nondirty_pages(struct address_space *mapping) +{ + struct pagevec pvec; + pgoff_t next = 0; + int i; + + pagevec_init(&pvec, 0); + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t index; + int skip_page = + (PageDirty(page) || PageWriteback(page)); + + if (!skip_page) + skip_page = !trylock_page(page); + + /* + * We really shouldn't be looking at the ->index of an + * unlocked page. But we're not allowed to lock these + * pages. So we rely upon nobody altering the ->index + * of this (pinned-by-us) page. + */ + index = page->index; + if (index > next) + next = index; + next++; + + if (skip_page) + continue; + + generic_error_remove_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Invalidate inode pages in a worker thread. (This can't be done + * in the message handler context.) + */ +static void ceph_invalidate_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_pg_inv_work); + struct inode *inode = &ci->vfs_inode; + u32 orig_gen; + int check = 0; + + spin_lock(&inode->i_lock); + dout("invalidate_pages %p gen %d revoking %d\n", inode, + ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + /* nevermind! */ + spin_unlock(&inode->i_lock); + goto out; + } + orig_gen = ci->i_rdcache_gen; + spin_unlock(&inode->i_lock); + + ceph_invalidate_nondirty_pages(inode->i_mapping); + + spin_lock(&inode->i_lock); + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { + dout("invalidate_pages %p gen %d successful\n", inode, + ci->i_rdcache_gen); + ci->i_rdcache_revoking--; + check = 1; + } else { + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); + } + spin_unlock(&inode->i_lock); + + if (check) + ceph_check_caps(ci, 0, NULL); +out: + iput(inode); +} + + +/* + * called by trunc_wq; take i_mutex ourselves + * + * We also truncate in a separate thread as well. 
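ceph_invalidate_work() above relies on a generation check to detect races with new reads or further cap revokes while the page cache is invalidated without the inode lock held. A toy model of that pattern (struct toy_ci and its field names are illustrative, and the real code also bails out early when i_rdcache_revoking != i_rdcache_gen):

#include <stdio.h>

struct toy_ci { unsigned gen, revoking; };

static int invalidate_pass(struct toy_ci *ci)
{
        unsigned orig = ci->gen;        /* snapshot under the "lock" */

        /* ... drop the lock and invalidate the page cache here ... */

        if (orig == ci->gen && orig == ci->revoking) {
                ci->revoking--;         /* revoke satisfied */
                return 1;
        }
        return 0;                       /* raced; leave the counters alone */
}

int main(void)
{
        struct toy_ci ci = { .gen = 2, .revoking = 2 };

        printf("%d\n", invalidate_pass(&ci)); /* 1: no race */
        ci.gen = 3;                           /* a read bumped the generation */
        printf("%d\n", invalidate_pass(&ci)); /* 0: must not decrement */
        return 0;
}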
+ */ +static void ceph_vmtruncate_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_vmtruncate_work); + struct inode *inode = &ci->vfs_inode; + + dout("vmtruncate_work %p\n", inode); + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + mutex_unlock(&inode->i_mutex); + iput(inode); +} + +/* + * Queue an async vmtruncate. If we fail to queue work, we will handle + * the truncation the next time we call __ceph_do_pending_vmtruncate. + */ +void ceph_queue_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, + &ci->i_vmtruncate_work)) { + dout("ceph_queue_vmtruncate %p\n", inode); + ihold(inode); + } else { + dout("ceph_queue_vmtruncate %p failed, pending=%d\n", + inode, ci->i_truncate_pending); + } +} + +/* + * called with i_mutex held. + * + * Make sure any pending truncation is applied before doing anything + * that may depend on it. + */ +void __ceph_do_pending_vmtruncate(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + u64 to; + int wrbuffer_refs, wake = 0; + +retry: + spin_lock(&inode->i_lock); + if (ci->i_truncate_pending == 0) { + dout("__do_pending_vmtruncate %p none pending\n", inode); + spin_unlock(&inode->i_lock); + return; + } + + /* + * make sure any dirty snapped pages are flushed before we + * possibly truncate them.. so write AND block! + */ + if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { + dout("__do_pending_vmtruncate %p flushing snaps first\n", + inode); + spin_unlock(&inode->i_lock); + filemap_write_and_wait_range(&inode->i_data, 0, + inode->i_sb->s_maxbytes); + goto retry; + } + + to = ci->i_truncate_size; + wrbuffer_refs = ci->i_wrbuffer_ref; + dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, + ci->i_truncate_pending, to); + spin_unlock(&inode->i_lock); + + truncate_inode_pages(inode->i_mapping, to); + + spin_lock(&inode->i_lock); + ci->i_truncate_pending--; + if (ci->i_truncate_pending == 0) + wake = 1; + spin_unlock(&inode->i_lock); + + if (wrbuffer_refs == 0) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + if (wake) + wake_up_all(&ci->i_cap_wq); +} + + +/* + * symlinks + */ +static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ceph_inode_info *ci = ceph_inode(dentry->d_inode); + nd_set_link(nd, ci->i_symlink); + return NULL; +} + +static const struct inode_operations ceph_symlink_iops = { + .readlink = generic_readlink, + .follow_link = ceph_sym_follow_link, +}; + +/* + * setattr + */ +int ceph_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct inode *parent_inode = dentry->d_parent->d_inode; + const unsigned int ia_valid = attr->ia_valid; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + int issued; + int release = 0, dirtied = 0; + int mask = 0; + int err = 0; + int inode_dirty_flags = 0; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + __ceph_do_pending_vmtruncate(inode); + + err = inode_change_ok(inode, attr); + if (err != 0) + return err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + spin_lock(&inode->i_lock); + issued = __ceph_caps_issued(ci, NULL); + dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (ia_valid & ATTR_UID) { + dout("setattr %p 
uid %d -> %d\n", inode, + inode->i_uid, attr->ia_uid); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_uid = attr->ia_uid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_uid != inode->i_uid) { + req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid); + mask |= CEPH_SETATTR_UID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_GID) { + dout("setattr %p gid %d -> %d\n", inode, + inode->i_gid, attr->ia_gid); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_gid = attr->ia_gid; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_gid != inode->i_gid) { + req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid); + mask |= CEPH_SETATTR_GID; + release |= CEPH_CAP_AUTH_SHARED; + } + } + if (ia_valid & ATTR_MODE) { + dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode, + attr->ia_mode); + if (issued & CEPH_CAP_AUTH_EXCL) { + inode->i_mode = attr->ia_mode; + dirtied |= CEPH_CAP_AUTH_EXCL; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + attr->ia_mode != inode->i_mode) { + req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); + mask |= CEPH_SETATTR_MODE; + release |= CEPH_CAP_AUTH_SHARED; + } + } + + if (ia_valid & ATTR_ATIME) { + dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode, + inode->i_atime.tv_sec, inode->i_atime.tv_nsec, + attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec_compare(&inode->i_atime, + &attr->ia_atime) < 0) { + inode->i_atime = attr->ia_atime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec_equal(&inode->i_atime, &attr->ia_atime)) { + ceph_encode_timespec(&req->r_args.setattr.atime, + &attr->ia_atime); + mask |= CEPH_SETATTR_ATIME; + release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_MTIME) { + dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode, + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); + if (issued & CEPH_CAP_FILE_EXCL) { + ci->i_time_warp_seq++; + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_WR) && + timespec_compare(&inode->i_mtime, + &attr->ia_mtime) < 0) { + inode->i_mtime = attr->ia_mtime; + dirtied |= CEPH_CAP_FILE_WR; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) { + ceph_encode_timespec(&req->r_args.setattr.mtime, + &attr->ia_mtime); + mask |= CEPH_SETATTR_MTIME; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; + } + } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if (attr->ia_size > inode->i_sb->s_maxbytes) { + err = -EINVAL; + goto out; + } + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + inode->i_size = attr->ia_size; + inode->i_blocks = + (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_ctime = attr->ia_ctime; + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR; 
+ } + } + + /* these do nothing */ + if (ia_valid & ATTR_CTIME) { + bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| + ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; + dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode, + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, + only ? "ctime only" : "ignored"); + inode->i_ctime = attr->ia_ctime; + if (only) { + /* + * if kernel wants to dirty ctime but nothing else, + * we need to choose a cap to dirty under, or do + * a almost-no-op setattr + */ + if (issued & CEPH_CAP_AUTH_EXCL) + dirtied |= CEPH_CAP_AUTH_EXCL; + else if (issued & CEPH_CAP_FILE_EXCL) + dirtied |= CEPH_CAP_FILE_EXCL; + else if (issued & CEPH_CAP_XATTR_EXCL) + dirtied |= CEPH_CAP_XATTR_EXCL; + else + mask |= CEPH_SETATTR_CTIME; + } + } + if (ia_valid & ATTR_FILE) + dout("setattr %p ATTR_FILE ... hrm!\n", inode); + + if (dirtied) { + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); + inode->i_ctime = CURRENT_TIME; + } + + release &= issued; + spin_unlock(&inode->i_lock); + + if (inode_dirty_flags) + __mark_inode_dirty(inode, inode_dirty_flags); + + if (mask) { + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = release; + req->r_args.setattr.mask = cpu_to_le32(mask); + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + } + dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, + ceph_cap_string(dirtied), mask); + + ceph_mdsc_put_request(req); + __ceph_do_pending_vmtruncate(inode); + return err; +out: + spin_unlock(&inode->i_lock); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Verify that we have a lease on the given mask. If not, + * do a getattr against an mds. + */ +int ceph_do_getattr(struct inode *inode, int mask) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int err; + + if (ceph_snap(inode) == CEPH_SNAPDIR) { + dout("do_getattr inode %p SNAPDIR\n", inode); + return 0; + } + + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); + if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) + return 0; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_args.getattr.mask = cpu_to_le32(mask); + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + dout("do_getattr result=%d\n", err); + return err; +} + + +/* + * Check inode permissions. We verify we have a valid value for + * the AUTH cap, then call the generic handler. + */ +int ceph_permission(struct inode *inode, int mask, unsigned int flags) +{ + int err; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); + + if (!err) + err = generic_permission(inode, mask, flags, NULL); + return err; +} + +/* + * Get all attributes. Hopefully somedata we'll have a statlite() + * and can limit the fields we require to be accurate. 
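ceph_setattr() above applies one pattern per attribute: with the relevant EXCL capability the change is made locally and the cap is marked dirty; otherwise the field is packed into the MDS request. A simplified toy of that dispatch (the bit values, struct toy_req, and the two-way split are illustrative; the real code also skips the request when a SHARED cap shows the value is unchanged):

#include <stdio.h>

#define CAP_AUTH_EXCL  0x1              /* illustrative bit values only */
#define SETATTR_UID    0x1

struct toy_req { unsigned mask; unsigned uid; };

static void set_uid(unsigned issued, unsigned *local_uid, unsigned new_uid,
                    unsigned *dirtied, struct toy_req *req)
{
        if (issued & CAP_AUTH_EXCL) {
                *local_uid = new_uid;           /* we own the metadata */
                *dirtied |= CAP_AUTH_EXCL;      /* flushed to the MDS later */
        } else {
                req->uid = new_uid;             /* ask the MDS to do it */
                req->mask |= SETATTR_UID;
        }
}

int main(void)
{
        unsigned uid = 0, dirtied = 0;
        struct toy_req req = { 0, 0 };

        set_uid(CAP_AUTH_EXCL, &uid, 1000, &dirtied, &req);
        printf("uid=%u dirtied=%#x mask=%#x\n", uid, dirtied, req.mask);
        return 0;
}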
+ */ +int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + int err; + + err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); + if (!err) { + generic_fillattr(inode, stat); + stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); + if (ceph_snap(inode) != CEPH_NOSNAP) + stat->dev = ceph_snap(inode); + else + stat->dev = 0; + if (S_ISDIR(inode->i_mode)) { + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), + RBYTES)) + stat->size = ci->i_rbytes; + else + stat->size = ci->i_files + ci->i_subdirs; + stat->blocks = 0; + stat->blksize = 65536; + } + } + return err; +} diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c new file mode 100644 index 00000000..ef0b5f48 --- /dev/null +++ b/fs/ceph/ioctl.c @@ -0,0 +1,255 @@ +#include + +#include "super.h" +#include "mds_client.h" +#include + +#include "ioctl.h" + + +/* + * ioctls + */ + +/* + * get and set the file layout + */ +static long ceph_ioctl_get_layout(struct file *file, void __user *arg) +{ + struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_ioctl_layout l; + int err; + + err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + if (!err) { + l.stripe_unit = ceph_file_layout_su(ci->i_layout); + l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + l.object_size = ceph_file_layout_object_size(ci->i_layout); + l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + l.preferred_osd = + (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + if (copy_to_user(arg, &l, sizeof(l))) + return -EFAULT; + } + + return err; +} + +static long ceph_ioctl_set_layout(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err, i; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + if ((l.object_size & ~PAGE_MASK) || + (l.stripe_unit & ~PAGE_MASK) || + !l.stripe_unit || + (l.object_size && + (unsigned)l.object_size % (unsigned)l.stripe_unit)) + return -EINVAL; + + /* make sure it's a valid data pool */ + if (l.data_pool > 0) { + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + } + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); + req->r_args.setlayout.layout.fl_pg_preferred = + cpu_to_le32(l.preferred_osd); + + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Set a layout policy on a directory inode. All items in the tree + * rooted at this inode will inherit this layout on creation, + * (It doesn't apply retroactively ) + * unless a subdirectory has its own layout policy. 
+ */ +static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err, i; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + if ((l.object_size & ~PAGE_MASK) || + (l.stripe_unit & ~PAGE_MASK) || + !l.stripe_unit || + (l.object_size && + (unsigned)l.object_size % (unsigned)l.stripe_unit)) + return -EINVAL; + + /* make sure it's a valid data pool */ + if (l.data_pool > 0) { + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + } + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, + USE_AUTH_MDS); + + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = + cpu_to_le32(l.data_pool); + req->r_args.setlayout.layout.fl_pg_preferred = + cpu_to_le32(l.preferred_osd); + + err = ceph_mdsc_do_request(mdsc, inode, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Return object name, size/offset information, and location (OSD + * number, network address) for a given file offset. + */ +static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) +{ + struct ceph_ioctl_dataloc dl; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; + u64 len = 1, olen; + u64 tmp; + struct ceph_object_layout ol; + struct ceph_pg pgid; + + /* copy and validate */ + if (copy_from_user(&dl, arg, sizeof(dl))) + return -EFAULT; + + down_read(&osdc->map_sem); + ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, + &dl.object_no, &dl.object_offset, &olen); + dl.file_offset -= dl.object_offset; + dl.object_size = ceph_file_layout_object_size(ci->i_layout); + dl.block_size = ceph_file_layout_su(ci->i_layout); + + /* block_offset = object_offset % block_size */ + tmp = dl.object_offset; + dl.block_offset = do_div(tmp, dl.block_size); + + snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", + ceph_ino(inode), dl.object_no); + ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, + osdc->osdmap); + + pgid = ol.ol_pgid; + dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + if (dl.osd >= 0) { + struct ceph_entity_addr *a = + ceph_osd_addr(osdc->osdmap, dl.osd); + if (a) + memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr)); + } else { + memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); + } + up_read(&osdc->map_sem); + + /* send result back to user */ + if (copy_to_user(arg, &dl, sizeof(dl))) + return -EFAULT; + + return 0; +} + +static long ceph_ioctl_lazyio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + + if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { + spin_lock(&inode->i_lock); + ci->i_nr_by_mode[fi->fmode]--; + fi->fmode |= CEPH_FILE_MODE_LAZY; + ci->i_nr_by_mode[fi->fmode]++; + 
spin_unlock(&inode->i_lock); + dout("ioctl_layzio: file %p marked lazy\n", file); + + ceph_check_caps(ci, 0, NULL); + } else { + dout("ioctl_layzio: file %p already lazy\n", file); + } + return 0; +} + +long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); + switch (cmd) { + case CEPH_IOC_GET_LAYOUT: + return ceph_ioctl_get_layout(file, (void __user *)arg); + + case CEPH_IOC_SET_LAYOUT: + return ceph_ioctl_set_layout(file, (void __user *)arg); + + case CEPH_IOC_SET_LAYOUT_POLICY: + return ceph_ioctl_set_layout_policy(file, (void __user *)arg); + + case CEPH_IOC_GET_DATALOC: + return ceph_ioctl_get_dataloc(file, (void __user *)arg); + + case CEPH_IOC_LAZYIO: + return ceph_ioctl_lazyio(file); + } + + return -ENOTTY; +} diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h new file mode 100644 index 00000000..52e8fd74 --- /dev/null +++ b/fs/ceph/ioctl.h @@ -0,0 +1,44 @@ +#ifndef FS_CEPH_IOCTL_H +#define FS_CEPH_IOCTL_H + +#include +#include + +#define CEPH_IOCTL_MAGIC 0x97 + +/* just use u64 to align sanely on all archs */ +struct ceph_ioctl_layout { + __u64 stripe_unit, stripe_count, object_size; + __u64 data_pool; + __s64 preferred_osd; +}; + +#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ + struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ + struct ceph_ioctl_layout) + +/* + * Extract identity, address of the OSD and object storing a given + * file offset. + */ +struct ceph_ioctl_dataloc { + __u64 file_offset; /* in+out: file offset */ + __u64 object_offset; /* out: offset in object */ + __u64 object_no; /* out: object # */ + __u64 object_size; /* out: object size */ + char object_name[64]; /* out: object name */ + __u64 block_offset; /* out: offset in block */ + __u64 block_size; /* out: block length */ + __s64 osd; /* out: osd # */ + struct sockaddr_storage osd_addr; /* out: osd address */ +}; + +#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ + struct ceph_ioctl_dataloc) + +#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +#endif diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c new file mode 100644 index 00000000..80576d05 --- /dev/null +++ b/fs/ceph/locks.c @@ -0,0 +1,286 @@ +#include + +#include +#include + +#include "super.h" +#include "mds_client.h" +#include + +/** + * Implement fcntl and flock locking functions. 
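For completeness, a hypothetical userspace sketch that validates a layout the same way the ioctl handlers above do and then issues CEPH_IOC_SET_LAYOUT. The struct is redeclared locally for the demo with plain stdint types, and the mount-point path, PAGE_SIZE of 4096, and the layout numbers are all assumptions.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define PAGE_SIZE 4096                  /* assumed page size */
#define CEPH_IOCTL_MAGIC 0x97

/* mirrors struct ceph_ioctl_layout from fs/ceph/ioctl.h above */
struct ceph_ioctl_layout {
        uint64_t stripe_unit, stripe_count, object_size;
        uint64_t data_pool;
        int64_t preferred_osd;
};

#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, struct ceph_ioctl_layout)

/* same sanity checks the kernel-side handlers perform */
static int layout_valid(const struct ceph_ioctl_layout *l)
{
        if (l->object_size % PAGE_SIZE || l->stripe_unit % PAGE_SIZE)
                return 0;
        if (!l->stripe_unit)
                return 0;
        if (l->object_size && l->object_size % l->stripe_unit)
                return 0;
        return 1;
}

int main(void)
{
        struct ceph_ioctl_layout l = {
                .stripe_unit   = 65536,
                .stripe_count  = 4,
                .object_size   = 4 * 1024 * 1024,
                .data_pool     = 0,
                .preferred_osd = -1,
        };
        int fd = open("/mnt/ceph/somefile", O_RDWR);    /* path illustrative */

        if (fd < 0 || !layout_valid(&l))
                return 1;
        if (ioctl(fd, CEPH_IOC_SET_LAYOUT, &l) < 0)
                perror("CEPH_IOC_SET_LAYOUT");
        close(fd);
        return 0;
}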
+ */ +static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, + int cmd, u8 wait, struct file_lock *fl) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_mds_client *mdsc = + ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + int err; + u64 length = 0; + + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + + /* mds requires start and length rather than start and end */ + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type: %d", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type); + + req->r_args.filelock_change.rule = lock_type; + req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); + /* This should be adjusted, but I'm not sure if + namespaces actually get id numbers*/ + req->r_args.filelock_change.pid_namespace = + cpu_to_le64((u64)(unsigned long)fl->fl_nspid); + req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); + req->r_args.filelock_change.length = cpu_to_le64(length); + req->r_args.filelock_change.wait = wait; + + err = ceph_mdsc_do_request(mdsc, inode, req); + + if ( operation == CEPH_MDS_OP_GETFILELOCK){ + fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_WRLCK; + else + fl->fl_type = F_UNLCK; + + fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); + length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + + le64_to_cpu(req->r_reply_info.filelock_reply->length); + if (length >= 1) + fl->fl_end = length -1; + else + fl->fl_end = 0; + + } + ceph_mdsc_put_request(req); + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type, err); + return err; +} + +/** + * Attempt to set an fcntl lock. + * For now, this just goes away to the server. Later it may be more awesome. + */ +int ceph_lock(struct file *file, int cmd, struct file_lock *fl) +{ + u8 lock_cmd; + int err; + u8 wait = 0; + u16 op = CEPH_MDS_OP_SETFILELOCK; + + fl->fl_nspid = get_pid(task_tgid(current)); + dout("ceph_lock, fl_pid:%d", fl->fl_pid); + + /* set wait bit as appropriate, then make command as Ceph expects it*/ + if (F_SETLKW == cmd) + wait = 1; + if (F_GETLK == cmd) + op = CEPH_MDS_OP_GETFILELOCK; + + if (F_RDLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); + if (!err) { + if ( op != CEPH_MDS_OP_GETFILELOCK ){ + dout("mds locked, locking locally"); + err = posix_lock_file(file, fl, NULL); + if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + /* undo! This should only happen if + * the kernel detects local + * deadlock. 
*/ + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on posix_lock_file, undid lock", + err); + } + } + + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + } + return err; +} + +int ceph_flock(struct file *file, int cmd, struct file_lock *fl) +{ + u8 lock_cmd; + int err; + u8 wait = 1; + + fl->fl_nspid = get_pid(task_tgid(current)); + dout("ceph_flock, fl_pid:%d", fl->fl_pid); + + /* set wait bit, then clear it out of cmd*/ + if (cmd & LOCK_NB) + wait = 0; + cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); + /* set command sequence that Ceph wants to see: + shared lock, exclusive lock, or unlock */ + if (LOCK_SH == cmd) + lock_cmd = CEPH_LOCK_SHARED; + else if (LOCK_EX == cmd) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, + file, lock_cmd, wait, fl); + if (!err) { + err = flock_lock_file_wait(file, fl); + if (err) { + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on flock_lock_file_wait, undid lock", err); + } + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); + } + return err; +} + +/** + * Must be called with BKL already held. Fills in the passed + * counter variables, so you can prepare pagelist metadata before calling + * ceph_encode_locks. + */ +void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) +{ + struct file_lock *lock; + + *fcntl_count = 0; + *flock_count = 0; + + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_POSIX) + ++(*fcntl_count); + else if (lock->fl_flags & FL_FLOCK) + ++(*flock_count); + } + dout("counted %d flock locks and %d fcntl locks", + *flock_count, *fcntl_count); +} + +/** + * Encode the flock and fcntl locks for the given inode into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Must be called with lock_flocks() already held. + * If we encounter more of a specific lock type than expected, + * we return the value 1. 
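ceph_lock_message() above converts between the VFS (start, end) representation and the MDS (start, length) one in both directions; the conversion on its own, as a sketch (fl_end == LLONG_MAX and a wire length of 0 both mean "to end of file"):

#include <limits.h>
#include <stdio.h>

static unsigned long long to_length(long long start, long long end)
{
        return end == LLONG_MAX ? 0 : (unsigned long long)(end - start + 1);
}

static long long to_end(long long start, unsigned long long length)
{
        return length ? start + (long long)length - 1 : 0;
}

int main(void)
{
        printf("%llu\n", to_length(100, 199));      /* 100 bytes */
        printf("%llu\n", to_length(0, LLONG_MAX));  /* 0 = whole file */
        printf("%lld\n", to_end(100, 100));         /* back to 199 */
        return 0;
}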
+ */ +int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + struct file_lock *lock; + struct ceph_filelock cephlock; + int err = 0; + int seen_fcntl = 0; + int seen_flock = 0; + + dout("encoding %d flock and %d fcntl locks", num_flock_locks, + num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32)); + if (err) + goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_POSIX) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &cephlock); + if (err) + goto fail; + err = ceph_pagelist_append(pagelist, &cephlock, + sizeof(struct ceph_filelock)); + } + if (err) + goto fail; + } + + err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32)); + if (err) + goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_FLOCK) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &cephlock); + if (err) + goto fail; + err = ceph_pagelist_append(pagelist, &cephlock, + sizeof(struct ceph_filelock)); + } + if (err) + goto fail; + } +fail: + return err; +} + +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64(lock->fl_pid); + cephlock->pid_namespace = + cpu_to_le64((u64)(unsigned long)lock->fl_nspid); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c new file mode 100644 index 00000000..0c1d9175 --- /dev/null +++ b/fs/ceph/mds_client.c @@ -0,0 +1,3454 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +#include +#include +#include +#include +#include + +/* + * A cluster of MDS (metadata server) daemons is responsible for + * managing the file system namespace (the directory hierarchy and + * inodes) and for coordinating shared access to storage. Metadata is + * partitioning hierarchically across a number of servers, and that + * partition varies over time as the cluster adjusts the distribution + * in order to balance load. + * + * The MDS client is primarily responsible to managing synchronous + * metadata requests for operations like open, unlink, and so forth. + * If there is a MDS failure, we find out about it when we (possibly + * request and) receive a new MDS map, and can resubmit affected + * requests. + * + * For the most part, though, we take advantage of a lossless + * communications channel to the MDS, and do not need to worry about + * timing out or resubmitting requests. + * + * We maintain a stateful "session" with each MDS we interact with. + * Within each session, we sent periodic heartbeat messages to ensure + * any capabilities or leases we have been issues remain valid. 
If + * the session times out and goes stale, our leases and capabilities + * are no longer valid. + */ + +struct ceph_reconnect_state { + struct ceph_pagelist *pagelist; + bool flock; +}; + +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head); + +static const struct ceph_connection_operations mds_con_ops; + + +/* + * mds reply parsing + */ + +/* + * parse individual inode info + */ +static int parse_reply_info_in(void **p, void *end, + struct ceph_mds_reply_info_in *info, + int features) +{ + int err = -EIO; + + info->in = *p; + *p += sizeof(struct ceph_mds_reply_inode) + + sizeof(*info->in->fragtree.splits) * + le32_to_cpu(info->in->fragtree.nsplits); + + ceph_decode_32_safe(p, end, info->symlink_len, bad); + ceph_decode_need(p, end, info->symlink_len, bad); + info->symlink = *p; + *p += info->symlink_len; + + if (features & CEPH_FEATURE_DIRLAYOUTHASH) + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); + else + memset(&info->dir_layout, 0, sizeof(info->dir_layout)); + + ceph_decode_32_safe(p, end, info->xattr_len, bad); + ceph_decode_need(p, end, info->xattr_len, bad); + info->xattr_data = *p; + *p += info->xattr_len; + return 0; +bad: + return err; +} + +/* + * parse a normal reply, which may contain a (dir+)dentry and/or a + * target inode. + */ +static int parse_reply_info_trace(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + int err; + + if (info->head->is_dentry) { + err = parse_reply_info_in(p, end, &info->diri, features); + if (err < 0) + goto out_bad; + + if (unlikely(*p + sizeof(*info->dirfrag) > end)) + goto bad; + info->dirfrag = *p; + *p += sizeof(*info->dirfrag) + + sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); + if (unlikely(*p > end)) + goto bad; + + ceph_decode_32_safe(p, end, info->dname_len, bad); + ceph_decode_need(p, end, info->dname_len, bad); + info->dname = *p; + *p += info->dname_len; + info->dlease = *p; + *p += sizeof(*info->dlease); + } + + if (info->head->is_target) { + err = parse_reply_info_in(p, end, &info->targeti, features); + if (err < 0) + goto out_bad; + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing mds trace %d\n", err); + return err; +} + +/* + * parse readdir results + */ +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + u32 num, i = 0; + int err; + + info->dir_dir = *p; + if (*p + sizeof(*info->dir_dir) > end) + goto bad; + *p += sizeof(*info->dir_dir) + + sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); + if (*p > end) + goto bad; + + ceph_decode_need(p, end, sizeof(num) + 2, bad); + num = ceph_decode_32(p); + info->dir_end = ceph_decode_8(p); + info->dir_complete = ceph_decode_8(p); + if (num == 0) + goto done; + + /* alloc large array */ + info->dir_nr = num; + info->dir_in = kcalloc(num, sizeof(*info->dir_in) + + sizeof(*info->dir_dname) + + sizeof(*info->dir_dname_len) + + sizeof(*info->dir_dlease), + GFP_NOFS); + if (info->dir_in == NULL) { + err = -ENOMEM; + goto out_bad; + } + info->dir_dname = (void *)(info->dir_in + num); + info->dir_dname_len = (void *)(info->dir_dname + num); + info->dir_dlease = (void *)(info->dir_dname_len + num); + + while (num) { + /* dentry */ + ceph_decode_need(p, end, sizeof(u32)*2, bad); + info->dir_dname_len[i] = ceph_decode_32(p); + ceph_decode_need(p, end, info->dir_dname_len[i], bad); + info->dir_dname[i] = *p; + *p += info->dir_dname_len[i]; + dout("parsed dir dname 
'%.*s'\n", info->dir_dname_len[i], + info->dir_dname[i]); + info->dir_dlease[i] = *p; + *p += sizeof(struct ceph_mds_reply_lease); + + /* inode */ + err = parse_reply_info_in(p, end, &info->dir_in[i], features); + if (err < 0) + goto out_bad; + i++; + num--; + } + +done: + if (*p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing dir contents %d\n", err); + return err; +} + +/* + * parse fcntl F_GETLK results + */ +static int parse_reply_info_filelock(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + if (*p + sizeof(*info->filelock_reply) > end) + goto bad; + + info->filelock_reply = *p; + *p += sizeof(*info->filelock_reply); + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* + * parse extra results + */ +static int parse_reply_info_extra(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + return parse_reply_info_filelock(p, end, info, features); + else + return parse_reply_info_dir(p, end, info, features); +} + +/* + * parse entire mds reply + */ +static int parse_reply_info(struct ceph_msg *msg, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + void *p, *end; + u32 len; + int err; + + info->head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); + end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); + + /* trace */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + err = parse_reply_info_trace(&p, p+len, info, features); + if (err < 0) + goto out_bad; + } + + /* extra */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + err = parse_reply_info_extra(&p, p+len, info, features); + if (err < 0) + goto out_bad; + } + + /* snap blob */ + ceph_decode_32_safe(&p, end, len, bad); + info->snapblob_len = len; + info->snapblob = p; + p += len; + + if (p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("mds parse_reply err %d\n", err); + return err; +} + +static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) +{ + kfree(info->dir_in); +} + + +/* + * sessions + */ +static const char *session_state_name(int s) +{ + switch (s) { + case CEPH_MDS_SESSION_NEW: return "new"; + case CEPH_MDS_SESSION_OPENING: return "opening"; + case CEPH_MDS_SESSION_OPEN: return "open"; + case CEPH_MDS_SESSION_HUNG: return "hung"; + case CEPH_MDS_SESSION_CLOSING: return "closing"; + case CEPH_MDS_SESSION_RESTARTING: return "restarting"; + case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; + default: return "???"; + } +} + +static struct ceph_mds_session *get_session(struct ceph_mds_session *s) +{ + if (atomic_inc_not_zero(&s->s_ref)) { + dout("mdsc get_session %p %d -> %d\n", s, + atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); + return s; + } else { + dout("mdsc get_session %p 0 -- FAIL", s); + return NULL; + } +} + +void ceph_put_mds_session(struct ceph_mds_session *s) +{ + dout("mdsc put_session %p %d -> %d\n", s, + atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); + if (atomic_dec_and_test(&s->s_ref)) { + if (s->s_authorizer) + s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_authorizer); + kfree(s); + } +} + +/* + * called under mdsc->mutex + */ +struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *session; + + if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) + 
return NULL; + session = mdsc->sessions[mds]; + dout("lookup_mds_session %p %d\n", session, + atomic_read(&session->s_ref)); + get_session(session); + return session; +} + +static bool __have_session(struct ceph_mds_client *mdsc, int mds) +{ + if (mds >= mdsc->max_sessions) + return false; + return mdsc->sessions[mds]; +} + +static int __verify_registered_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + if (s->s_mds >= mdsc->max_sessions || + mdsc->sessions[s->s_mds] != s) + return -ENOENT; + return 0; +} + +/* + * create+register a new session for given mds. + * called under mdsc->mutex. + */ +static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *s; + + s = kzalloc(sizeof(*s), GFP_NOFS); + if (!s) + return ERR_PTR(-ENOMEM); + s->s_mdsc = mdsc; + s->s_mds = mds; + s->s_state = CEPH_MDS_SESSION_NEW; + s->s_ttl = 0; + s->s_seq = 0; + mutex_init(&s->s_mutex); + + ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); + s->s_con.private = s; + s->s_con.ops = &mds_con_ops; + s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; + s->s_con.peer_name.num = cpu_to_le64(mds); + + spin_lock_init(&s->s_cap_lock); + s->s_cap_gen = 0; + s->s_cap_ttl = 0; + s->s_renew_requested = 0; + s->s_renew_seq = 0; + INIT_LIST_HEAD(&s->s_caps); + s->s_nr_caps = 0; + s->s_trim_caps = 0; + atomic_set(&s->s_ref, 1); + INIT_LIST_HEAD(&s->s_waiting); + INIT_LIST_HEAD(&s->s_unsafe); + s->s_num_cap_releases = 0; + s->s_cap_iterator = NULL; + INIT_LIST_HEAD(&s->s_cap_releases); + INIT_LIST_HEAD(&s->s_cap_releases_done); + INIT_LIST_HEAD(&s->s_cap_flushing); + INIT_LIST_HEAD(&s->s_cap_snaps_flushing); + + dout("register_session mds%d\n", mds); + if (mds >= mdsc->max_sessions) { + int newmax = 1 << get_count_order(mds+1); + struct ceph_mds_session **sa; + + dout("register_session realloc to %d\n", newmax); + sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); + if (sa == NULL) + goto fail_realloc; + if (mdsc->sessions) { + memcpy(sa, mdsc->sessions, + mdsc->max_sessions * sizeof(void *)); + kfree(mdsc->sessions); + } + mdsc->sessions = sa; + mdsc->max_sessions = newmax; + } + mdsc->sessions[mds] = s; + atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + + ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + return s; + +fail_realloc: + kfree(s); + return ERR_PTR(-ENOMEM); +} + +/* + * called under mdsc->mutex + */ +static void __unregister_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + dout("__unregister_session mds%d %p\n", s->s_mds, s); + BUG_ON(mdsc->sessions[s->s_mds] != s); + mdsc->sessions[s->s_mds] = NULL; + ceph_con_close(&s->s_con); + ceph_put_mds_session(s); +} + +/* + * drop session refs in request. 
+ * + * should be last request ref, or hold mdsc->mutex + */ +static void put_request_session(struct ceph_mds_request *req) +{ + if (req->r_session) { + ceph_put_mds_session(req->r_session); + req->r_session = NULL; + } +} + +void ceph_mdsc_release_request(struct kref *kref) +{ + struct ceph_mds_request *req = container_of(kref, + struct ceph_mds_request, + r_kref); + if (req->r_request) + ceph_msg_put(req->r_request); + if (req->r_reply) { + ceph_msg_put(req->r_reply); + destroy_reply_info(&req->r_reply_info); + } + if (req->r_inode) { + ceph_put_cap_refs(ceph_inode(req->r_inode), + CEPH_CAP_PIN); + iput(req->r_inode); + } + if (req->r_locked_dir) + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), + CEPH_CAP_PIN); + if (req->r_target_inode) + iput(req->r_target_inode); + if (req->r_dentry) + dput(req->r_dentry); + if (req->r_old_dentry) { + ceph_put_cap_refs( + ceph_inode(req->r_old_dentry->d_parent->d_inode), + CEPH_CAP_PIN); + dput(req->r_old_dentry); + } + kfree(req->r_path1); + kfree(req->r_path2); + put_request_session(req); + ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); + kfree(req); +} + +/* + * lookup session, bump ref if found. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, + u64 tid) +{ + struct ceph_mds_request *req; + struct rb_node *n = mdsc->request_tree.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_mds_request, r_node); + if (tid < req->r_tid) + n = n->rb_left; + else if (tid > req->r_tid) + n = n->rb_right; + else { + ceph_mdsc_get_request(req); + return req; + } + } + return NULL; +} + +static void __insert_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *new) +{ + struct rb_node **p = &mdsc->request_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_mds_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_mds_request, r_node); + if (new->r_tid < req->r_tid) + p = &(*p)->rb_left; + else if (new->r_tid > req->r_tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, &mdsc->request_tree); +} + +/* + * Register an in-flight request, and assign a tid. Link to directory + * are modifying (if any). + * + * Called under mdsc->mutex. 
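+ * + * If @dir is supplied, a reference is taken on it and the request is + * added to the directory's i_unsafe_dirops list (under i_unsafe_lock), + * so that a later fsync on the directory can wait for the operation + * to commit.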
+ */ +static void __register_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + struct inode *dir) +{ + req->r_tid = ++mdsc->last_tid; + if (req->r_num_caps) + ceph_reserve_caps(mdsc, &req->r_caps_reservation, + req->r_num_caps); + dout("__register_request %p tid %lld\n", req, req->r_tid); + ceph_mdsc_get_request(req); + __insert_request(mdsc, req); + + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + + if (dir) { + struct ceph_inode_info *ci = ceph_inode(dir); + + ihold(dir); + spin_lock(&ci->i_unsafe_lock); + req->r_unsafe_dir = dir; + list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); + spin_unlock(&ci->i_unsafe_lock); + } +} + +static void __unregister_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("__unregister_request %p tid %lld\n", req, req->r_tid); + rb_erase(&req->r_node, &mdsc->request_tree); + RB_CLEAR_NODE(&req->r_node); + + if (req->r_unsafe_dir) { + struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); + + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_dir_item); + spin_unlock(&ci->i_unsafe_lock); + + iput(req->r_unsafe_dir); + req->r_unsafe_dir = NULL; + } + + ceph_mdsc_put_request(req); +} + +/* + * Choose mds to send request to next. If there is a hint set in the + * request (e.g., due to a prior forward hint from the mds), use that. + * Otherwise, consult frag tree and/or caps to identify the + * appropriate mds. If all else fails, choose randomly. + * + * Called under mdsc->mutex. + */ +struct dentry *get_nonsnap_parent(struct dentry *dentry) +{ + while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + dentry = dentry->d_parent; + return dentry; +} + +static int __choose_mds(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + int mode = req->r_direct_mode; + int mds = -1; + u32 hash = req->r_direct_hash; + bool is_hash = req->r_direct_is_hash; + + /* + * is there a specific mds we should try? ignore hint if we have + * no session and the mds is not up (active or recovering). + */ + if (req->r_resend_mds >= 0 && + (__have_session(mdsc, req->r_resend_mds) || + ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { + dout("choose_mds using resend_mds mds%d\n", + req->r_resend_mds); + return req->r_resend_mds; + } + + if (mode == USE_RANDOM_MDS) + goto random; + + inode = NULL; + if (req->r_inode) { + inode = req->r_inode; + } else if (req->r_dentry) { + struct inode *dir = req->r_dentry->d_parent->d_inode; + + if (dir->i_sb != mdsc->fsc->sb) { + /* not this fs! 
*/ + inode = req->r_dentry->d_inode; + } else if (ceph_snap(dir) != CEPH_NOSNAP) { + /* direct snapped/virtual snapdir requests + * based on parent dir inode */ + struct dentry *dn = + get_nonsnap_parent(req->r_dentry->d_parent); + inode = dn->d_inode; + dout("__choose_mds using nonsnap parent %p\n", inode); + } else if (req->r_dentry->d_inode) { + /* dentry target */ + inode = req->r_dentry->d_inode; + } else { + /* dir + name */ + inode = dir; + hash = ceph_dentry_hash(req->r_dentry); + is_hash = true; + } + } + + dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, + (int)hash, mode); + if (!inode) + goto random; + ci = ceph_inode(inode); + + if (is_hash && S_ISDIR(inode->i_mode)) { + struct ceph_inode_frag frag; + int found; + + ceph_choose_frag(ci, hash, &frag, &found); + if (found) { + if (mode == USE_ANY_MDS && frag.ndist > 0) { + u8 r; + + /* choose a random replica */ + get_random_bytes(&r, 1); + r %= frag.ndist; + mds = frag.dist[r]; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (%d/%d)\n", + inode, ceph_vinop(inode), + frag.frag, mds, + (int)r, frag.ndist); + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; + } + + /* since this file/dir wasn't known to be + * replicated, then we want to look for the + * authoritative mds. */ + mode = USE_AUTH_MDS; + if (frag.mds >= 0) { + /* choose auth mds */ + mds = frag.mds; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (auth)\n", + inode, ceph_vinop(inode), frag.frag, mds); + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; + } + } + } + + spin_lock(&inode->i_lock); + cap = NULL; + if (mode == USE_AUTH_MDS) + cap = ci->i_auth_cap; + if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) + cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); + if (!cap) { + spin_unlock(&inode->i_lock); + goto random; + } + mds = cap->session->s_mds; + dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", + inode, ceph_vinop(inode), mds, + cap == ci->i_auth_cap ? "auth " : "", cap); + spin_unlock(&inode->i_lock); + return mds; + +random: + mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); + dout("choose_mds chose random mds%d\n", mds); + return mds; +} + + +/* + * session messages + */ +static struct ceph_msg *create_session_msg(u32 op, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + if (!msg) { + pr_err("create_session_msg ENOMEM creating msg\n"); + return NULL; + } + h = msg->front.iov_base; + h->op = cpu_to_le32(op); + h->seq = cpu_to_le64(seq); + return msg; +} + +/* + * send session open request. + * + * called under mdsc->mutex + */ +static int __open_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int mstate; + int mds = session->s_mds; + + /* wait for mds to go active? 
*/ + mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); + dout("open_session to mds%d (%s)\n", mds, + ceph_mds_state_name(mstate)); + session->s_state = CEPH_MDS_SESSION_OPENING; + session->s_renew_requested = jiffies; + + /* send connect message */ + msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * open sessions for any export targets for the given mds + * + * called under mdsc->mutex + */ +static void __open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_info *mi; + struct ceph_mds_session *ts; + int i, mds = session->s_mds; + int target; + + if (mds >= mdsc->mdsmap->m_max_mds) + return; + mi = &mdsc->mdsmap->m_info[mds]; + dout("open_export_target_sessions for mds%d (%d targets)\n", + session->s_mds, mi->num_export_targets); + + for (i = 0; i < mi->num_export_targets; i++) { + target = mi->export_targets[i]; + ts = __ceph_lookup_mds_session(mdsc, target); + if (!ts) { + ts = register_session(mdsc, target); + if (IS_ERR(ts)) + return; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + else + dout(" mds%d target mds%d %p is %s\n", session->s_mds, + i, ts, session_state_name(ts->s_state)); + ceph_put_mds_session(ts); + } +} + +void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&mdsc->mutex); + __open_export_target_sessions(mdsc, session); + mutex_unlock(&mdsc->mutex); +} + +/* + * session caps + */ + +/* + * Free preallocated cap messages assigned to this session + */ +static void cleanup_cap_releases(struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + spin_unlock(&session->s_cap_lock); +} + +/* + * Helper to safely iterate over all caps associated with a session, with + * special care taken to handle a racing __ceph_remove_cap(). + * + * Caller must hold session s_mutex. 
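+ * + * The session's s_cap_iterator marks the cap currently being visited; + * a racing __ceph_remove_cap() leaves such a cap on the list with + * cap->ci cleared, and we finish the removal here once the callback + * returns. Inode and cap references from the previous iteration are + * dropped only after s_cap_lock has been released.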
+ */ +static int iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, struct ceph_cap *, + void *), void *arg) +{ + struct list_head *p; + struct ceph_cap *cap; + struct inode *inode, *last_inode = NULL; + struct ceph_cap *old_cap = NULL; + int ret; + + dout("iterate_session_caps %p mds%d\n", session, session->s_mds); + spin_lock(&session->s_cap_lock); + p = session->s_caps.next; + while (p != &session->s_caps) { + cap = list_entry(p, struct ceph_cap, session_caps); + inode = igrab(&cap->ci->vfs_inode); + if (!inode) { + p = p->next; + continue; + } + session->s_cap_iterator = cap; + spin_unlock(&session->s_cap_lock); + + if (last_inode) { + iput(last_inode); + last_inode = NULL; + } + if (old_cap) { + ceph_put_cap(session->s_mdsc, old_cap); + old_cap = NULL; + } + + ret = cb(inode, cap, arg); + last_inode = inode; + + spin_lock(&session->s_cap_lock); + p = p->next; + if (cap->ci == NULL) { + dout("iterate_session_caps finishing cap %p removal\n", + cap); + BUG_ON(cap->session != session); + list_del_init(&cap->session_caps); + session->s_nr_caps--; + cap->session = NULL; + old_cap = cap; /* put_cap it w/o locks held */ + } + if (ret < 0) + goto out; + } + ret = 0; +out: + session->s_cap_iterator = NULL; + spin_unlock(&session->s_cap_lock); + + if (last_inode) + iput(last_inode); + if (old_cap) + ceph_put_cap(session->s_mdsc, old_cap); + + return ret; +} + +static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = 0; + + dout("removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->vfs_inode); + spin_lock(&inode->i_lock); + __ceph_remove_cap(cap); + if (!__ceph_is_any_real_caps(ci)) { + struct ceph_mds_client *mdsc = + ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + pr_info(" dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + drop = 1; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_info(" dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + drop = 1; + } + if (drop && ci->i_wrbuffer_ref) { + pr_info(" dropping dirty data for %p %lld\n", + inode, ceph_ino(inode)); + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + drop++; + } + spin_unlock(&mdsc->cap_dirty_lock); + } + spin_unlock(&inode->i_lock); + while (drop--) + iput(inode); + return 0; +} + +/* + * caller must hold session s_mutex + */ +static void remove_session_caps(struct ceph_mds_session *session) +{ + dout("remove_session_caps on %p\n", session); + iterate_session_caps(session, remove_session_caps_cb, NULL); + BUG_ON(session->s_nr_caps > 0); + BUG_ON(!list_empty(&session->s_cap_flushing)); + cleanup_cap_releases(session); +} + +/* + * wake up any threads waiting on this session's caps. if the cap is + * old (didn't get renewed on the client reconnect), remove it now. + * + * caller must hold s_mutex. 
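+ * + * When @arg (reconnect) is set, each inode's wanted/requested max_size + * is also cleared, allowing max size requests to be re-issued to the + * recovered MDS.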
+ */ +static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + wake_up_all(&ci->i_cap_wq); + if (arg) { + spin_lock(&inode->i_lock); + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); + } + return 0; +} + +static void wake_up_session_caps(struct ceph_mds_session *session, + int reconnect) +{ + dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); + iterate_session_caps(session, wake_up_session_cb, + (void *)(unsigned long)reconnect); +} + +/* + * Send periodic message to MDS renewing all currently held caps. The + * ack will reset the expiration for all caps from this session. + * + * caller holds s_mutex + */ +static int send_renew_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int state; + + if (time_after_eq(jiffies, session->s_cap_ttl) && + time_after_eq(session->s_cap_ttl, session->s_renew_requested)) + pr_info("mds%d caps stale\n", session->s_mds); + session->s_renew_requested = jiffies; + + /* do not try to renew caps until a recovering mds has reconnected + * with its clients. */ + state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); + if (state < CEPH_MDS_STATE_RECONNECT) { + dout("send_renew_caps ignoring mds%d (%s)\n", + session->s_mds, ceph_mds_state_name(state)); + return 0; + } + + dout("send_renew_caps to mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); + msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + ++session->s_renew_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * Note new cap ttl, and any transition from stale -> not stale (fresh?). + * + * Called under session->s_mutex + */ +static void renewed_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, int is_renew) +{ + int was_stale; + int wake = 0; + + spin_lock(&session->s_cap_lock); + was_stale = is_renew && (session->s_cap_ttl == 0 || + time_after_eq(jiffies, session->s_cap_ttl)); + + session->s_cap_ttl = session->s_renew_requested + + mdsc->mdsmap->m_session_timeout*HZ; + + if (was_stale) { + if (time_before(jiffies, session->s_cap_ttl)) { + pr_info("mds%d caps renewed\n", session->s_mds); + wake = 1; + } else { + pr_info("mds%d caps still stale\n", session->s_mds); + } + } + dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", + session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", + time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); + spin_unlock(&session->s_cap_lock); + + if (wake) + wake_up_session_caps(session, 0); +} + +/* + * send a session close request + */ +static int request_close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + dout("request_close_session mds%d state %s seq %lld\n", + session->s_mds, session_state_name(session->s_state), + session->s_seq); + msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * Called with s_mutex held. + */ +static int __close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (session->s_state >= CEPH_MDS_SESSION_CLOSING) + return 0; + session->s_state = CEPH_MDS_SESSION_CLOSING; + return request_close_session(mdsc, session); +} + +/* + * Trim old(er) caps. 
+ * + * Because we can't cache an inode without one or more caps, we do + * this indirectly: if a cap is unused, we prune its aliases, at which + * point the inode will hopefully get dropped to. + * + * Yes, this is a bit sloppy. Our only real goal here is to respond to + * memory pressure from the MDS, though, so it needn't be perfect. + */ +static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +{ + struct ceph_mds_session *session = arg; + struct ceph_inode_info *ci = ceph_inode(inode); + int used, oissued, mine; + + if (session->s_trim_caps <= 0) + return -1; + + spin_lock(&inode->i_lock); + mine = cap->issued | cap->implemented; + used = __ceph_caps_used(ci); + oissued = __ceph_caps_issued_other(ci, cap); + + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), + ceph_cap_string(used)); + if (ci->i_dirty_caps) + goto out; /* dirty caps */ + if ((used & ~oissued) & mine) + goto out; /* we need these caps */ + + session->s_trim_caps--; + if (oissued) { + /* we aren't the only cap.. just remove us */ + __ceph_remove_cap(cap); + } else { + /* try to drop referring dentries */ + spin_unlock(&inode->i_lock); + d_prune_aliases(inode); + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, atomic_read(&inode->i_count)); + return 0; + } + +out: + spin_unlock(&inode->i_lock); + return 0; +} + +/* + * Trim session cap count down to some max number. + */ +static int trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps) +{ + int trim_caps = session->s_nr_caps - max_caps; + + dout("trim_caps mds%d start: %d / %d, trim %d\n", + session->s_mds, session->s_nr_caps, max_caps, trim_caps); + if (trim_caps > 0) { + session->s_trim_caps = trim_caps; + iterate_session_caps(session, trim_caps_cb, session); + dout("trim_caps mds%d done: %d / %d, trimmed %d\n", + session->s_mds, session->s_nr_caps, max_caps, + trim_caps - session->s_trim_caps); + session->s_trim_caps = 0; + } + return 0; +} + +/* + * Allocate cap_release messages. If there is a partially full message + * in the queue, try to allocate enough to cover it's remainder, so that + * we can send it immediately. + * + * Called under s_mutex. 
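+ * + * Each message holds up to CEPH_CAPS_PER_RELEASE entries; we keep + * allocating until s_num_cap_releases covers s_nr_caps plus the + * configured cap_release_safety margin.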
+ */ +int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg, *partial = NULL; + struct ceph_mds_cap_release *head; + int err = -ENOMEM; + int extra = mdsc->fsc->mount_options->cap_release_safety; + int num; + + dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, + extra); + + spin_lock(&session->s_cap_lock); + + if (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, + list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + if (num) { + dout(" partial %p with (%d/%d)\n", msg, num, + (int)CEPH_CAPS_PER_RELEASE); + extra += CEPH_CAPS_PER_RELEASE - num; + partial = msg; + } + } + while (session->s_num_cap_releases < session->s_nr_caps + extra) { + spin_unlock(&session->s_cap_lock); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, + GFP_NOFS); + if (!msg) + goto out_unlocked; + dout("add_cap_releases %p msg %p now %d\n", session, msg, + (int)msg->front.iov_len); + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + spin_lock(&session->s_cap_lock); + list_add(&msg->list_head, &session->s_cap_releases); + session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; + } + + if (partial) { + head = partial->front.iov_base; + num = le32_to_cpu(head->num); + dout(" queueing partial %p with %d/%d\n", partial, num, + (int)CEPH_CAPS_PER_RELEASE); + list_move_tail(&partial->list_head, + &session->s_cap_releases_done); + session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; + } + err = 0; + spin_unlock(&session->s_cap_lock); +out_unlocked: + return err; +} + +/* + * flush all dirty inode data to disk. + * + * returns true if we've flushed through want_flush_seq + */ +static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +{ + int mds, ret = 1; + + dout("check_cap_flush want %lld\n", want_flush_seq); + mutex_lock(&mdsc->mutex); + for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = mdsc->sessions[mds]; + + if (!session) + continue; + get_session(session); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (!list_empty(&session->s_cap_flushing)) { + struct ceph_inode_info *ci = + list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&inode->i_lock); + if (ci->i_cap_flush_seq <= want_flush_seq) { + dout("check_cap_flush still flushing %p " + "seq %lld <= %lld to mds%d\n", inode, + ci->i_cap_flush_seq, want_flush_seq, + session->s_mds); + ret = 0; + } + spin_unlock(&inode->i_lock); + } + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + + if (!ret) + return ret; + mutex_lock(&mdsc->mutex); + } + + mutex_unlock(&mdsc->mutex); + dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); + return ret; +} + +/* + * called under s_mutex + */ +void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + dout("send_cap_releases mds%d\n", session->s_mds); + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + spin_unlock(&session->s_cap_lock); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, 
msg); + spin_lock(&session->s_cap_lock); + } + spin_unlock(&session->s_cap_lock); +} + +static void discard_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + unsigned num; + + dout("discard_cap_releases mds%d\n", session->s_mds); + spin_lock(&session->s_cap_lock); + + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); + head->num = cpu_to_le32(0); + session->s_num_cap_releases += num; + + /* requeue completed messages */ + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, + num); + session->s_num_cap_releases += num; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + list_add(&msg->list_head, &session->s_cap_releases); + } + + spin_unlock(&session->s_cap_lock); +} + +/* + * requests + */ + +/* + * Create an mds request. + */ +struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) +{ + struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + + if (!req) + return ERR_PTR(-ENOMEM); + + mutex_init(&req->r_fill_mutex); + req->r_mdsc = mdsc; + req->r_started = jiffies; + req->r_resend_mds = -1; + INIT_LIST_HEAD(&req->r_unsafe_dir_item); + req->r_fmode = -1; + kref_init(&req->r_kref); + INIT_LIST_HEAD(&req->r_wait); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + INIT_LIST_HEAD(&req->r_unsafe_item); + + req->r_op = op; + req->r_direct_mode = mode; + return req; +} + +/* + * return oldest (lowest) request, tid in request tree, 0 if none. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) +{ + if (RB_EMPTY_ROOT(&mdsc->request_tree)) + return NULL; + return rb_entry(rb_first(&mdsc->request_tree), + struct ceph_mds_request, r_node); +} + +static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_request *req = __get_oldest_req(mdsc); + + if (req) + return req->r_tid; + return 0; +} + +/* + * Build a dentry's path. Allocate on heap; caller must kfree. Based + * on build_path_from_dentry in fs/cifs/dir.c. + * + * If @stop_on_nosnap, generate path relative to the first non-snapped + * inode. + * + * Encode hidden .snap dirs as a double /, i.e. 
+ * foo/.snap/bar -> foo//bar + */ +char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int stop_on_nosnap) +{ + struct dentry *temp; + char *path; + int len, pos; + unsigned seq; + + if (dentry == NULL) + return ERR_PTR(-EINVAL); + +retry: + len = 0; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + for (temp = dentry; !IS_ROOT(temp);) { + struct inode *inode = temp->d_inode; + if (inode && ceph_snap(inode) == CEPH_SNAPDIR) + len++; /* slash only */ + else if (stop_on_nosnap && inode && + ceph_snap(inode) == CEPH_NOSNAP) + break; + else + len += 1 + temp->d_name.len; + temp = temp->d_parent; + if (temp == NULL) { + rcu_read_unlock(); + pr_err("build_path corrupt dentry %p\n", dentry); + return ERR_PTR(-EINVAL); + } + } + rcu_read_unlock(); + if (len) + len--; /* no leading '/' */ + + path = kmalloc(len+1, GFP_NOFS); + if (path == NULL) + return ERR_PTR(-ENOMEM); + pos = len; + path[pos] = 0; /* trailing null */ + rcu_read_lock(); + for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { + struct inode *inode; + + spin_lock(&temp->d_lock); + inode = temp->d_inode; + if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { + dout("build_path path+%d: %p SNAPDIR\n", + pos, temp); + } else if (stop_on_nosnap && inode && + ceph_snap(inode) == CEPH_NOSNAP) { + break; + } else { + pos -= temp->d_name.len; + if (pos < 0) { + spin_unlock(&temp->d_lock); + break; + } + strncpy(path + pos, temp->d_name.name, + temp->d_name.len); + } + spin_unlock(&temp->d_lock); + if (pos) + path[--pos] = '/'; + temp = temp->d_parent; + if (temp == NULL) { + rcu_read_unlock(); + pr_err("build_path corrupt dentry\n"); + kfree(path); + return ERR_PTR(-EINVAL); + } + } + rcu_read_unlock(); + if (pos != 0 || read_seqretry(&rename_lock, seq)) { + pr_err("build_path did not end path lookup where " + "expected, namelen is %d, pos is %d\n", len, pos); + /* presumably this is only possible if racing with a + rename of one of the parent directories (we can not + lock the dentries above us to prevent this, but + retrying should be harmless) */ + kfree(path); + goto retry; + } + + *base = ceph_ino(temp->d_inode); + *plen = len; + dout("build_path on %p %d built %llx '%.*s'\n", + dentry, dentry->d_count, *base, len, path); + return path; +} + +static int build_dentry_path(struct dentry *dentry, + const char **ppath, int *ppathlen, u64 *pino, + int *pfreepath) +{ + char *path; + + if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) { + *pino = ceph_ino(dentry->d_parent->d_inode); + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + return 0; + } + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = 1; + return 0; +} + +static int build_inode_path(struct inode *inode, + const char **ppath, int *ppathlen, u64 *pino, + int *pfreepath) +{ + struct dentry *dentry; + char *path; + + if (ceph_snap(inode) == CEPH_NOSNAP) { + *pino = ceph_ino(inode); + *ppathlen = 0; + return 0; + } + dentry = d_find_alias(inode); + path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + dput(dentry); + if (IS_ERR(path)) + return PTR_ERR(path); + *ppath = path; + *pfreepath = 1; + return 0; +} + +/* + * request arguments may be specified via an inode *, a dentry *, or + * an explicit ino+path. 
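+ * + * Precedence is inode, then dentry, then explicit path: an inode is + * normally encoded as its ino with an empty path, a dentry as its + * parent's ino plus the dentry name when the parent is not snapped; + * otherwise a full path is built with ceph_mdsc_build_path().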
+ */ +static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, + const char *rpath, u64 rino, + const char **ppath, int *pathlen, + u64 *ino, int *freepath) +{ + int r = 0; + + if (rinode) { + r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), + ceph_snap(rinode)); + } else if (rdentry) { + r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); + dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, + *ppath); + } else if (rpath) { + *ino = rino; + *ppath = rpath; + *pathlen = strlen(rpath); + dout(" path %.*s\n", *pathlen, rpath); + } + + return r; +} + +/* + * called under mdsc->mutex + */ +static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds) +{ + struct ceph_msg *msg; + struct ceph_mds_request_head *head; + const char *path1 = NULL; + const char *path2 = NULL; + u64 ino1 = 0, ino2 = 0; + int pathlen1 = 0, pathlen2 = 0; + int freepath1 = 0, freepath2 = 0; + int len; + u16 releases; + void *p, *end; + int ret; + + ret = set_request_path_attr(req->r_inode, req->r_dentry, + req->r_path1, req->r_ino1.ino, + &path1, &pathlen1, &ino1, &freepath1); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out; + } + + ret = set_request_path_attr(NULL, req->r_old_dentry, + req->r_path2, req->r_ino2.ino, + &path2, &pathlen2, &ino2, &freepath2); + if (ret < 0) { + msg = ERR_PTR(ret); + goto out_free1; + } + + len = sizeof(*head) + + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); + + /* calculate (max) length for cap releases */ + len += sizeof(struct ceph_mds_request_release) * + (!!req->r_inode_drop + !!req->r_dentry_drop + + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) + len += req->r_dentry->d_name.len; + if (req->r_old_dentry_drop) + len += req->r_old_dentry->d_name.len; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + if (!msg) { + msg = ERR_PTR(-ENOMEM); + goto out_free2; + } + + msg->hdr.tid = cpu_to_le64(req->r_tid); + + head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*head); + end = msg->front.iov_base + msg->front.iov_len; + + head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + head->op = cpu_to_le32(req->r_op); + head->caller_uid = cpu_to_le32(req->r_uid); + head->caller_gid = cpu_to_le32(req->r_gid); + head->args = req->r_args; + + ceph_encode_filepath(&p, end, ino1, path1); + ceph_encode_filepath(&p, end, ino2, path2); + + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + + /* cap releases */ + releases = 0; + if (req->r_inode_drop) + releases += ceph_encode_inode_release(&p, + req->r_inode ? 
req->r_inode : req->r_dentry->d_inode, + mds, req->r_inode_drop, req->r_inode_unless, 0); + if (req->r_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_dentry, + mds, req->r_dentry_drop, req->r_dentry_unless); + if (req->r_old_dentry_drop) + releases += ceph_encode_dentry_release(&p, req->r_old_dentry, + mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + if (req->r_old_inode_drop) + releases += ceph_encode_inode_release(&p, + req->r_old_dentry->d_inode, + mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); + head->num_releases = cpu_to_le16(releases); + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + msg->pages = req->r_pages; + msg->nr_pages = req->r_num_pages; + msg->hdr.data_len = cpu_to_le32(req->r_data_len); + msg->hdr.data_off = cpu_to_le16(0); + +out_free2: + if (freepath2) + kfree((char *)path2); +out_free1: + if (freepath1) + kfree((char *)path1); +out: + return msg; +} + +/* + * called under mdsc->mutex if error, under no mutex if + * success. + */ +static void complete_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + if (req->r_callback) + req->r_callback(mdsc, req); + else + complete_all(&req->r_completion); +} + +/* + * called under mdsc->mutex + */ +static int __prepare_send_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + int mds) +{ + struct ceph_mds_request_head *rhead; + struct ceph_msg *msg; + int flags = 0; + + req->r_attempts++; + if (req->r_inode) { + struct ceph_cap *cap = + ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); + + if (cap) + req->r_sent_on_mseq = cap->mseq; + else + req->r_sent_on_mseq = -1; + } + dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, + req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + + if (req->r_got_unsafe) { + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. + */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); + msg->front.iov_len = req->r_request_release_offset; + return 0; + } + + if (req->r_request) { + ceph_msg_put(req->r_request); + req->r_request = NULL; + } + msg = create_request_message(mdsc, req, mds); + if (IS_ERR(msg)) { + req->r_err = PTR_ERR(msg); + complete_request(mdsc, req); + return PTR_ERR(msg); + } + req->r_request = msg; + + rhead = msg->front.iov_base; + rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + if (req->r_got_unsafe) + flags |= CEPH_MDS_FLAG_REPLAY; + if (req->r_locked_dir) + flags |= CEPH_MDS_FLAG_WANT_DENTRY; + rhead->flags = cpu_to_le32(flags); + rhead->num_fwd = req->r_num_fwd; + rhead->num_retry = req->r_attempts - 1; + rhead->ino = 0; + + dout(" r_locked_dir = %p\n", req->r_locked_dir); + return 0; +} + +/* + * send request, or put it on the appropriate wait list. 
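+ * + * If no suitable mds is up, the request is queued on waiting_for_map; + * if the chosen session is not yet open, it is queued on the session's + * s_waiting list (opening the session if needed) and resubmitted later + * by __wake_requests().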
+ */ +static int __do_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_session *session = NULL; + int mds = -1; + int err = -EAGAIN; + + if (req->r_err || req->r_got_result) + goto out; + + if (req->r_timeout && + time_after_eq(jiffies, req->r_started + req->r_timeout)) { + dout("do_request timed out\n"); + err = -EIO; + goto finish; + } + + put_request_session(req); + + mds = __choose_mds(mdsc, req); + if (mds < 0 || + ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { + dout("do_request no mds or not active, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + goto out; + } + + /* get, open session */ + session = __ceph_lookup_mds_session(mdsc, mds); + if (!session) { + session = register_session(mdsc, mds); + if (IS_ERR(session)) { + err = PTR_ERR(session); + goto finish; + } + } + req->r_session = get_session(session); + + dout("do_request mds%d session %p state %s\n", mds, session, + session_state_name(session->s_state)); + if (session->s_state != CEPH_MDS_SESSION_OPEN && + session->s_state != CEPH_MDS_SESSION_HUNG) { + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + list_add(&req->r_wait, &session->s_waiting); + goto out_session; + } + + /* send request */ + req->r_resend_mds = -1; /* forget any previous mds hint */ + + if (req->r_request_started == 0) /* note request start time */ + req->r_request_started = jiffies; + + err = __prepare_send_request(mdsc, req, mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + +out_session: + ceph_put_mds_session(session); +out: + return err; + +finish: + req->r_err = err; + complete_request(mdsc, req); + goto out; +} + +/* + * called under mdsc->mutex + */ +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head) +{ + struct ceph_mds_request *req, *nreq; + + list_for_each_entry_safe(req, nreq, head, r_wait) { + list_del_init(&req->r_wait); + __do_request(mdsc, req); + } +} + +/* + * Wake up threads with requests pending for @mds, so that they can + * resubmit their requests to a possibly different mds. + */ +static void kick_requests(struct ceph_mds_client *mdsc, int mds) +{ + struct ceph_mds_request *req; + struct rb_node *p; + + dout("kick_requests mds%d\n", mds); + for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mds_request, r_node); + if (req->r_got_unsafe) + continue; + if (req->r_session && + req->r_session->s_mds == mds) { + dout(" kicking tid %llu\n", req->r_tid); + __do_request(mdsc, req); + } + } +} + +void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("submit_request on %p\n", req); + mutex_lock(&mdsc->mutex); + __register_request(mdsc, req, NULL); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); +} + +/* + * Synchrously perform an mds request. Take care of all of the + * session setup, forwarding, retry details. 
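+ * + * CAP_PIN references are taken on r_inode, r_locked_dir and the parent + * of r_old_dentry while the request is outstanding. The wait is + * killable; if it is interrupted before a reply arrives, the request is + * marked aborted under r_fill_mutex so we do not race with + * ceph_fill_trace or ceph_readdir_prepopulate in our caller's context.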
+ */ +int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req) +{ + int err; + + dout("do_request on %p\n", req); + + /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ + if (req->r_inode) + ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); + if (req->r_locked_dir) + ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_old_dentry) + ceph_get_cap_refs( + ceph_inode(req->r_old_dentry->d_parent->d_inode), + CEPH_CAP_PIN); + + /* issue */ + mutex_lock(&mdsc->mutex); + __register_request(mdsc, req, dir); + __do_request(mdsc, req); + + if (req->r_err) { + err = req->r_err; + __unregister_request(mdsc, req); + dout("do_request early error %d\n", err); + goto out; + } + + /* wait */ + mutex_unlock(&mdsc->mutex); + dout("do_request waiting\n"); + if (req->r_timeout) { + err = (long)wait_for_completion_killable_timeout( + &req->r_completion, req->r_timeout); + if (err == 0) + err = -EIO; + } else { + err = wait_for_completion_killable(&req->r_completion); + } + dout("do_request waited, got %d\n", err); + mutex_lock(&mdsc->mutex); + + /* only abort if we didn't race with a real reply */ + if (req->r_got_result) { + err = le32_to_cpu(req->r_reply_info.head->result); + } else if (err < 0) { + dout("aborted request %lld with %d\n", req->r_tid, err); + + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + req->r_aborted = true; + mutex_unlock(&req->r_fill_mutex); + + if (req->r_locked_dir && + (req->r_op & CEPH_MDS_OP_WRITE)) + ceph_invalidate_dir_request(req); + } else { + err = req->r_err; + } + +out: + mutex_unlock(&mdsc->mutex); + dout("do_request %p done, result %d\n", req, err); + return err; +} + +/* + * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * namespace request. + */ +void ceph_invalidate_dir_request(struct ceph_mds_request *req) +{ + struct inode *inode = req->r_locked_dir; + struct ceph_inode_info *ci = ceph_inode(inode); + + dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; + spin_unlock(&inode->i_lock); + + if (req->r_dentry) + ceph_invalidate_dentry_lease(req->r_dentry); + if (req->r_old_dentry) + ceph_invalidate_dentry_lease(req->r_old_dentry); +} + +/* + * Handle mds reply. + * + * We take the session mutex and parse and process the reply immediately. + * This preserves the logical ordering of replies, capabilities, etc., sent + * by the MDS as they are applied to our local cache. + */ +static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_mds_request *req; + struct ceph_mds_reply_head *head = msg->front.iov_base; + struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + u64 tid; + int err, result; + int mds = session->s_mds; + + if (msg->front.iov_len < sizeof(*head)) { + pr_err("mdsc_handle_reply got corrupt (short) reply\n"); + ceph_msg_dump(msg); + return; + } + + /* get request, session */ + tid = le64_to_cpu(msg->hdr.tid); + mutex_lock(&mdsc->mutex); + req = __lookup_request(mdsc, tid); + if (!req) { + dout("handle_reply on unknown tid %llu\n", tid); + mutex_unlock(&mdsc->mutex); + return; + } + dout("handle_reply %p\n", req); + + /* correct session? 
*/ + if (req->r_session != session) { + pr_err("mdsc_handle_reply got %llu on session mds%d" + " not mds%d\n", tid, session->s_mds, + req->r_session ? req->r_session->s_mds : -1); + mutex_unlock(&mdsc->mutex); + goto out; + } + + /* dup? */ + if ((req->r_got_unsafe && !head->safe) || + (req->r_got_safe && head->safe)) { + pr_warning("got a dup %s reply on %llu from mds%d\n", + head->safe ? "safe" : "unsafe", tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } + if (req->r_got_safe && !head->safe) { + pr_warning("got unsafe after safe on %llu from mds%d\n", + tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } + + result = le32_to_cpu(head->result); + + /* + * Handle an ESTALE + * if we're not talking to the authority, send to them + * if the authority has changed while we weren't looking, + * send to new authority + * Otherwise we just have to return an ESTALE + */ + if (result == -ESTALE) { + dout("got ESTALE on request %llu", req->r_tid); + if (!req->r_inode) { + /* do nothing; not an authority problem */ + } else if (req->r_direct_mode != USE_AUTH_MDS) { + dout("not using auth, setting for that now"); + req->r_direct_mode = USE_AUTH_MDS; + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } else { + struct ceph_inode_info *ci = ceph_inode(req->r_inode); + struct ceph_cap *cap = NULL; + + if (req->r_session) + cap = ceph_get_cap_for_mds(ci, + req->r_session->s_mds); + + dout("already using auth"); + if ((!cap || cap != ci->i_auth_cap) || + (cap->mseq != req->r_sent_on_mseq)) { + dout("but cap changed, so resending"); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } + } + dout("have to return ESTALE on request %llu", req->r_tid); + } + + + if (head->safe) { + req->r_got_safe = true; + __unregister_request(mdsc, req); + complete_all(&req->r_safe_completion); + + if (req->r_got_unsafe) { + /* + * We already handled the unsafe response, now do the + * cleanup. No need to examine the response; the MDS + * doesn't include any result info in the safe + * response. And even if it did, there is nothing + * useful we could do with a revised return value. + */ + dout("got safe reply %llu, mds%d\n", tid, mds); + list_del_init(&req->r_unsafe_item); + + /* last unsafe request during umount? 
*/ + if (mdsc->stopping && !__get_oldest_req(mdsc)) + complete_all(&mdsc->safe_umount_waiters); + mutex_unlock(&mdsc->mutex); + goto out; + } + } else { + req->r_got_unsafe = true; + list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); + } + + dout("handle_reply tid %lld result %d\n", tid, result); + rinfo = &req->r_reply_info; + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (err < 0) { + pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); + ceph_msg_dump(msg); + goto out_err; + } + + /* snap trace */ + if (rinfo->snapblob_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, rinfo->snapblob, + rinfo->snapblob + rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } + + /* insert trace into our cache */ + mutex_lock(&req->r_fill_mutex); + err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); + if (err == 0) { + if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && + rinfo->dir_nr) + ceph_readdir_prepopulate(req, req->r_session); + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); + } + mutex_unlock(&req->r_fill_mutex); + + up_read(&mdsc->snap_rwsem); +out_err: + mutex_lock(&mdsc->mutex); + if (!req->r_aborted) { + if (err) { + req->r_err = err; + } else { + req->r_reply = msg; + ceph_msg_get(msg); + req->r_got_result = true; + } + } else { + dout("reply arrived after request %lld was aborted\n", tid); + } + mutex_unlock(&mdsc->mutex); + + ceph_add_cap_releases(mdsc, req->r_session); + mutex_unlock(&session->s_mutex); + + /* kick calling process */ + complete_request(mdsc, req); +out: + ceph_mdsc_put_request(req); + return; +} + + + +/* + * handle mds notification that our request has been forwarded. + */ +static void handle_forward(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_request *req; + u64 tid = le64_to_cpu(msg->hdr.tid); + u32 next_mds; + u32 fwd_seq; + int err = -EINVAL; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + next_mds = ceph_decode_32(&p); + fwd_seq = ceph_decode_32(&p); + + mutex_lock(&mdsc->mutex); + req = __lookup_request(mdsc, tid); + if (!req) { + dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); + goto out; /* dup reply? */ + } + + if (req->r_aborted) { + dout("forward tid %llu aborted, unregistering\n", tid); + __unregister_request(mdsc, req); + } else if (fwd_seq <= req->r_num_fwd) { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", + tid, next_mds, req->r_num_fwd, fwd_seq); + } else { + /* resend. 
forward race not possible; mds would drop */ + dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); + BUG_ON(req->r_err); + BUG_ON(req->r_got_result); + req->r_num_fwd = fwd_seq; + req->r_resend_mds = next_mds; + put_request_session(req); + __do_request(mdsc, req); + } + ceph_mdsc_put_request(req); +out: + mutex_unlock(&mdsc->mutex); + return; + +bad: + pr_err("mdsc_handle_forward decode error err=%d\n", err); +} + +/* + * handle a mds session control message + */ +static void handle_session(struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct ceph_mds_client *mdsc = session->s_mdsc; + u32 op; + u64 seq; + int mds = session->s_mds; + struct ceph_mds_session_head *h = msg->front.iov_base; + int wake = 0; + + /* decode */ + if (msg->front.iov_len != sizeof(*h)) + goto bad; + op = le32_to_cpu(h->op); + seq = le64_to_cpu(h->seq); + + mutex_lock(&mdsc->mutex); + if (op == CEPH_SESSION_CLOSE) + __unregister_session(mdsc, session); + /* FIXME: this ttl calculation is generous */ + session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + + dout("handle_session mds%d %s %p state %s seq %llu\n", + mds, ceph_session_op_name(op), session, + session_state_name(session->s_state), seq); + + if (session->s_state == CEPH_MDS_SESSION_HUNG) { + session->s_state = CEPH_MDS_SESSION_OPEN; + pr_info("mds%d came back\n", session->s_mds); + } + + switch (op) { + case CEPH_SESSION_OPEN: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect success\n", session->s_mds); + session->s_state = CEPH_MDS_SESSION_OPEN; + renewed_caps(mdsc, session, 0); + wake = 1; + if (mdsc->stopping) + __close_session(mdsc, session); + break; + + case CEPH_SESSION_RENEWCAPS: + if (session->s_renew_seq == seq) + renewed_caps(mdsc, session, 1); + break; + + case CEPH_SESSION_CLOSE: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect denied\n", session->s_mds); + remove_session_caps(session); + wake = 1; /* for good measure */ + wake_up_all(&mdsc->session_close_wq); + kick_requests(mdsc, mds); + break; + + case CEPH_SESSION_STALE: + pr_info("mds%d caps went stale, renewing\n", + session->s_mds); + spin_lock(&session->s_cap_lock); + session->s_cap_gen++; + session->s_cap_ttl = 0; + spin_unlock(&session->s_cap_lock); + send_renew_caps(mdsc, session); + break; + + case CEPH_SESSION_RECALL_STATE: + trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); + break; + + default: + pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); + WARN_ON(1); + } + + mutex_unlock(&session->s_mutex); + if (wake) { + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + } + return; + +bad: + pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, + (int)msg->front.iov_len); + ceph_msg_dump(msg); + return; +} + + +/* + * called under session->mutex. + */ +static void replay_unsafe_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req, *nreq; + int err; + + dout("replay_unsafe_requests mds%d\n", session->s_mds); + + mutex_lock(&mdsc->mutex); + list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { + err = __prepare_send_request(mdsc, req, session->s_mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + } + mutex_unlock(&mdsc->mutex); +} + +/* + * Encode information about a cap for a reconnect with the MDS. 
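+ * + * Each cap record consists of the inode number, the path obtained via + * d_find_alias() (if any), and either a v2 ceph_mds_cap_reconnect with + * appended fcntl/flock lock state (when the peer supports + * CEPH_FEATURE_FLOCK) or the older v1 record.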
+ */ +static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + union { + struct ceph_mds_cap_reconnect v2; + struct ceph_mds_cap_reconnect_v1 v1; + } rec; + size_t reclen; + struct ceph_inode_info *ci; + struct ceph_reconnect_state *recon_state = arg; + struct ceph_pagelist *pagelist = recon_state->pagelist; + char *path; + int pathlen, err; + u64 pathbase; + struct dentry *dentry; + + ci = cap->ci; + + dout(" adding %p ino %llx.%llx cap %p %lld %s\n", + inode, ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); + err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); + if (err) + return err; + + dentry = d_find_alias(inode); + if (dentry) { + path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out_dput; + } + } else { + path = NULL; + pathlen = 0; + } + err = ceph_pagelist_encode_string(pagelist, path, pathlen); + if (err) + goto out_free; + + spin_lock(&inode->i_lock); + cap->seq = 0; /* reset cap seq */ + cap->issue_seq = 0; /* and issue_seq */ + + if (recon_state->flock) { + rec.v2.cap_id = cpu_to_le64(cap->cap_id); + rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v2.issued = cpu_to_le32(cap->issued); + rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.flock_len = 0; + reclen = sizeof(rec.v2); + } else { + rec.v1.cap_id = cpu_to_le64(cap->cap_id); + rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v1.issued = cpu_to_le32(cap->issued); + rec.v1.size = cpu_to_le64(inode->i_size); + ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); + ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); + rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v1.pathbase = cpu_to_le64(pathbase); + reclen = sizeof(rec.v1); + } + spin_unlock(&inode->i_lock); + + if (recon_state->flock) { + int num_fcntl_locks, num_flock_locks; + struct ceph_pagelist_cursor trunc_point; + + ceph_pagelist_set_cursor(pagelist, &trunc_point); + do { + lock_flocks(); + ceph_count_locks(inode, &num_fcntl_locks, + &num_flock_locks); + rec.v2.flock_len = (2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + unlock_flocks(); + + /* pre-alloc pagelist */ + ceph_pagelist_truncate(pagelist, &trunc_point); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_pagelist_reserve(pagelist, + rec.v2.flock_len); + + /* encode locks */ + if (!err) { + lock_flocks(); + err = ceph_encode_locks(inode, + pagelist, + num_fcntl_locks, + num_flock_locks); + unlock_flocks(); + } + } while (err == -ENOSPC); + } else { + err = ceph_pagelist_append(pagelist, &rec, reclen); + } + +out_free: + kfree(path); +out_dput: + dput(dentry); + return err; +} + + +/* + * If an MDS fails and recovers, clients need to reconnect in order to + * reestablish shared state. This includes all caps issued through + * this session _and_ the snap_realm hierarchy. Because it's not + * clear which snap realms the mds cares about, we send everything we + * know about.. that ensures we'll then get any new info the + * recovering MDS might have. + * + * This is a relatively heavyweight operation, but it's rare. + * + * called with mdsc->mutex held. 
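+ * + * The reconnect message body is a pagelist: a cap count followed by one + * record per cap (see encode_caps_cb), then one + * ceph_mds_snaprealm_reconnect record for each snap realm we know + * about.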
+ */ +static void send_mds_reconnect(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *reply; + struct rb_node *p; + int mds = session->s_mds; + int err = -ENOMEM; + struct ceph_pagelist *pagelist; + struct ceph_reconnect_state recon_state; + + pr_info("mds%d reconnect start\n", mds); + + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + if (!pagelist) + goto fail_nopagelist; + ceph_pagelist_init(pagelist); + + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + if (!reply) + goto fail_nomsg; + + mutex_lock(&session->s_mutex); + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; + + ceph_con_open(&session->s_con, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); + + down_read(&mdsc->snap_rwsem); + + dout("session %p state %s\n", session, + session_state_name(session->s_state)); + + /* drop old cap expires; we're about to reestablish that state */ + discard_cap_releases(mdsc, session); + + /* traverse this session's caps */ + err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); + if (err) + goto fail; + + recon_state.pagelist = pagelist; + recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; + err = iterate_session_caps(session, encode_caps_cb, &recon_state); + if (err < 0) + goto fail; + + /* + * snaprealms. we provide mds with the ino, seq (version), and + * parent for all of our realms. If the mds has any newer info, + * it will tell us. + */ + for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { + struct ceph_snap_realm *realm = + rb_entry(p, struct ceph_snap_realm, node); + struct ceph_mds_snaprealm_reconnect sr_rec; + + dout(" adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); + sr_rec.ino = cpu_to_le64(realm->ino); + sr_rec.seq = cpu_to_le64(realm->seq); + sr_rec.parent = cpu_to_le64(realm->parent_ino); + err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); + if (err) + goto fail; + } + + reply->pagelist = pagelist; + if (recon_state.flock) + reply->hdr.version = cpu_to_le16(2); + reply->hdr.data_len = cpu_to_le32(pagelist->length); + reply->nr_pages = calc_pages_for(0, pagelist->length); + ceph_con_send(&session->s_con, reply); + + mutex_unlock(&session->s_mutex); + + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + + up_read(&mdsc->snap_rwsem); + return; + +fail: + ceph_msg_put(reply); + up_read(&mdsc->snap_rwsem); + mutex_unlock(&session->s_mutex); +fail_nomsg: + ceph_pagelist_release(pagelist); + kfree(pagelist); +fail_nopagelist: + pr_err("error %d preparing reconnect for mds%d\n", err, mds); + return; +} + + +/* + * compare old and new mdsmaps, kicking requests + * and closing out old connections as necessary + * + * called under mdsc->mutex. + */ +static void check_new_map(struct ceph_mds_client *mdsc, + struct ceph_mdsmap *newmap, + struct ceph_mdsmap *oldmap) +{ + int i; + int oldstate, newstate; + struct ceph_mds_session *s; + + dout("check_new_map new %u old %u\n", + newmap->m_epoch, oldmap->m_epoch); + + for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { + if (mdsc->sessions[i] == NULL) + continue; + s = mdsc->sessions[i]; + oldstate = ceph_mdsmap_get_state(oldmap, i); + newstate = ceph_mdsmap_get_state(newmap, i); + + dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", + i, ceph_mds_state_name(oldstate), + ceph_mdsmap_is_laggy(oldmap, i) ? 
" (laggy)" : "", + ceph_mds_state_name(newstate), + ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", + session_state_name(s->s_state)); + + if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + ceph_mdsmap_get_addr(newmap, i), + sizeof(struct ceph_entity_addr))) { + if (s->s_state == CEPH_MDS_SESSION_OPENING) { + /* the session never opened, just close it + * out now */ + __wake_requests(mdsc, &s->s_waiting); + __unregister_session(mdsc, s); + } else { + /* just close it */ + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); + ceph_con_close(&s->s_con); + mutex_unlock(&s->s_mutex); + s->s_state = CEPH_MDS_SESSION_RESTARTING; + } + + /* kick any requests waiting on the recovering mds */ + kick_requests(mdsc, i); + } else if (oldstate == newstate) { + continue; /* nothing new with this mds */ + } + + /* + * send reconnect? + */ + if (s->s_state == CEPH_MDS_SESSION_RESTARTING && + newstate >= CEPH_MDS_STATE_RECONNECT) { + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + mutex_lock(&mdsc->mutex); + } + + /* + * kick request on any mds that has gone active. + */ + if (oldstate < CEPH_MDS_STATE_ACTIVE && + newstate >= CEPH_MDS_STATE_ACTIVE) { + if (oldstate != CEPH_MDS_STATE_CREATING && + oldstate != CEPH_MDS_STATE_STARTING) + pr_info("mds%d recovery completed\n", s->s_mds); + kick_requests(mdsc, i); + ceph_kick_flushing_caps(mdsc, s); + wake_up_session_caps(s, 1); + } + } + + for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + s = mdsc->sessions[i]; + if (!s) + continue; + if (!ceph_mdsmap_is_laggy(newmap, i)) + continue; + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG || + s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout(" connecting to export targets of laggy mds%d\n", + i); + __open_export_target_sessions(mdsc, s); + } + } +} + + + +/* + * leases + */ + +/* + * caller must hold session s_mutex, dentry->d_lock + */ +void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + ceph_put_mds_session(di->lease_session); + di->lease_session = NULL; +} + +static void handle_lease(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + struct inode *inode; + struct dentry *parent, *dentry; + struct ceph_dentry_info *di; + int mds = session->s_mds; + struct ceph_mds_lease *h = msg->front.iov_base; + u32 seq; + struct ceph_vino vino; + int mask; + struct qstr dname; + int release = 0; + + dout("handle_lease from mds%d\n", mds); + + /* decode */ + if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) + goto bad; + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + mask = le16_to_cpu(h->mask); + seq = le32_to_cpu(h->seq); + dname.name = (void *)h + sizeof(*h) + sizeof(u32); + dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); + if (dname.len != get_unaligned_le32(h+1)) + goto bad; + + mutex_lock(&session->s_mutex); + session->s_seq++; + + /* lookup inode */ + inode = ceph_find_inode(sb, vino); + dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), mask, vino.ino, inode, + dname.len, dname.name); + if (inode == NULL) { + dout("handle_lease no inode %llx\n", vino.ino); + goto release; + } + + /* dentry */ + parent = d_find_alias(inode); + if (!parent) { + dout("no parent dentry on inode %p\n", inode); + WARN_ON(1); + goto release; /* hrm... 
*/ + } + dname.hash = full_name_hash(dname.name, dname.len); + dentry = d_lookup(parent, &dname); + dput(parent); + if (!dentry) + goto release; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + switch (h->action) { + case CEPH_MDS_LEASE_REVOKE: + if (di && di->lease_session == session) { + if (ceph_seq_cmp(di->lease_seq, seq) > 0) + h->seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); + } + release = 1; + break; + + case CEPH_MDS_LEASE_RENEW: + if (di && di->lease_session == session && + di->lease_gen == session->s_cap_gen && + di->lease_renew_from && + di->lease_renew_after == 0) { + unsigned long duration = + le32_to_cpu(h->duration_ms) * HZ / 1000; + + di->lease_seq = seq; + dentry->d_time = di->lease_renew_from + duration; + di->lease_renew_after = di->lease_renew_from + + (duration >> 1); + di->lease_renew_from = 0; + } + break; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + + if (!release) + goto out; + +release: + /* let's just reuse the same message */ + h->action = CEPH_MDS_LEASE_REVOKE_ACK; + ceph_msg_get(msg); + ceph_con_send(&session->s_con, msg); + +out: + iput(inode); + mutex_unlock(&session->s_mutex); + return; + +bad: + pr_err("corrupt lease message\n"); + ceph_msg_dump(msg); +} + +void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_lease *lease; + int len = sizeof(*lease) + sizeof(u32); + int dnamelen = 0; + + dout("lease_send_msg inode %p dentry %p %s to mds%d\n", + inode, dentry, ceph_lease_op_name(action), session->s_mds); + dnamelen = dentry->d_name.len; + len += dnamelen; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + if (!msg) + return; + lease = msg->front.iov_base; + lease->action = action; + lease->mask = cpu_to_le16(1); + lease->ino = cpu_to_le64(ceph_vino(inode).ino); + lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); + lease->seq = cpu_to_le32(seq); + put_unaligned_le32(dnamelen, lease + 1); + memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); + + /* + * if this is a preemptive lease RELEASE, no need to + * flush request stream, since the actual request will + * soon follow. + */ + msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); + + ceph_con_send(&session->s_con, msg); +} + +/* + * Preemptively release a lease we expect to invalidate anyway. + * Pass @inode always, @dentry is optional. + */ +void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, + struct dentry *dentry, int mask) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *session; + u32 seq; + + BUG_ON(inode == NULL); + BUG_ON(dentry == NULL); + BUG_ON(mask == 0); + + /* is dentry lease valid? 
*/ + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (!di || !di->lease_session || + di->lease_session->s_mds < 0 || + di->lease_gen != di->lease_session->s_cap_gen || + !time_before(jiffies, dentry->d_time)) { + dout("lease_release inode %p dentry %p -- " + "no lease on %d\n", + inode, dentry, mask); + spin_unlock(&dentry->d_lock); + return; + } + + /* we do have a lease on this dentry; note mds and seq */ + session = ceph_get_mds_session(di->lease_session); + seq = di->lease_seq; + __ceph_mdsc_drop_dentry_lease(dentry); + spin_unlock(&dentry->d_lock); + + dout("lease_release inode %p dentry %p mask %d to mds%d\n", + inode, dentry, mask, session->s_mds); + ceph_mdsc_lease_send_msg(session, inode, dentry, + CEPH_MDS_LEASE_RELEASE, seq); + ceph_put_mds_session(session); +} + +/* + * drop all leases (and dentry refs) in preparation for umount + */ +static void drop_leases(struct ceph_mds_client *mdsc) +{ + int i; + + dout("drop_leases\n"); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + + + +/* + * delayed work -- periodically trim expired leases, renew caps with mds + */ +static void schedule_delayed(struct ceph_mds_client *mdsc) +{ + int delay = 5; + unsigned hz = round_jiffies_relative(HZ * delay); + schedule_delayed_work(&mdsc->delayed_work, hz); +} + +static void delayed_work(struct work_struct *work) +{ + int i; + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, delayed_work.work); + int renew_interval; + int renew_caps; + + dout("mdsc delayed_work\n"); + ceph_check_delayed_caps(mdsc); + + mutex_lock(&mdsc->mutex); + renew_interval = mdsc->mdsmap->m_session_timeout >> 2; + renew_caps = time_after_eq(jiffies, HZ*renew_interval + + mdsc->last_renew_caps); + if (renew_caps) + mdsc->last_renew_caps = jiffies; + + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (s == NULL) + continue; + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout("resending session close request for mds%d\n", + s->s_mds); + request_close_session(mdsc, s); + ceph_put_mds_session(s); + continue; + } + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { + if (s->s_state == CEPH_MDS_SESSION_OPEN) { + s->s_state = CEPH_MDS_SESSION_HUNG; + pr_info("mds%d hung\n", s->s_mds); + } + } + if (s->s_state < CEPH_MDS_SESSION_OPEN) { + /* this mds is failed or recovering, just wait */ + ceph_put_mds_session(s); + continue; + } + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + if (renew_caps) + send_renew_caps(mdsc, s); + else + ceph_con_keepalive(&s->s_con); + ceph_add_cap_releases(mdsc, s); + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG) + ceph_send_cap_releases(mdsc, s); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + schedule_delayed(mdsc); +} + +int ceph_mdsc_init(struct ceph_fs_client *fsc) + +{ + struct ceph_mds_client *mdsc; + + mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); + if (!mdsc) + return -ENOMEM; + mdsc->fsc = fsc; + fsc->mdsc = mdsc; + mutex_init(&mdsc->mutex); + mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); + if (mdsc->mdsmap == NULL) + return -ENOMEM; + + 
init_completion(&mdsc->safe_umount_waiters); + init_waitqueue_head(&mdsc->session_close_wq); + INIT_LIST_HEAD(&mdsc->waiting_for_map); + mdsc->sessions = NULL; + mdsc->max_sessions = 0; + mdsc->stopping = 0; + init_rwsem(&mdsc->snap_rwsem); + mdsc->snap_realms = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snap_empty); + spin_lock_init(&mdsc->snap_empty_lock); + mdsc->last_tid = 0; + mdsc->request_tree = RB_ROOT; + INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); + mdsc->last_renew_caps = jiffies; + INIT_LIST_HEAD(&mdsc->cap_delay_list); + spin_lock_init(&mdsc->cap_delay_lock); + INIT_LIST_HEAD(&mdsc->snap_flush_list); + spin_lock_init(&mdsc->snap_flush_lock); + mdsc->cap_flush_seq = 0; + INIT_LIST_HEAD(&mdsc->cap_dirty); + INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); + mdsc->num_cap_flushing = 0; + spin_lock_init(&mdsc->cap_dirty_lock); + init_waitqueue_head(&mdsc->cap_flushing_wq); + spin_lock_init(&mdsc->dentry_lru_lock); + INIT_LIST_HEAD(&mdsc->dentry_lru); + + ceph_caps_init(mdsc); + ceph_adjust_min_caps(mdsc, fsc->min_caps); + + return 0; +} + +/* + * Wait for safe replies on open mds requests. If we time out, drop + * all requests from the tree to avoid dangling dentry refs. + */ +static void wait_requests(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_request *req; + struct ceph_fs_client *fsc = mdsc->fsc; + + mutex_lock(&mdsc->mutex); + if (__get_oldest_req(mdsc)) { + mutex_unlock(&mdsc->mutex); + + dout("wait_requests waiting for requests\n"); + wait_for_completion_timeout(&mdsc->safe_umount_waiters, + fsc->client->options->mount_timeout * HZ); + + /* tear down remaining requests */ + mutex_lock(&mdsc->mutex); + while ((req = __get_oldest_req(mdsc))) { + dout("wait_requests timed out on tid %llu\n", + req->r_tid); + __unregister_request(mdsc, req); + } + } + mutex_unlock(&mdsc->mutex); + dout("wait_requests done\n"); +} + +/* + * called before mount is ro, and before dentries are torn down. + * (hmm, does this still race with new lookups?) + */ +void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) +{ + dout("pre_umount\n"); + mdsc->stopping = 1; + + drop_leases(mdsc); + ceph_flush_dirty_caps(mdsc); + wait_requests(mdsc); + + /* + * wait for reply handlers to drop their request refs and + * their inode/dcache refs + */ + ceph_msgr_flush(); +} + +/* + * wait for all write mds requests to flush. + */ +static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +{ + struct ceph_mds_request *req = NULL, *nextreq; + struct rb_node *n; + + mutex_lock(&mdsc->mutex); + dout("wait_unsafe_requests want %lld\n", want_tid); +restart: + req = __get_oldest_req(mdsc); + while (req && req->r_tid <= want_tid) { + /* find next request */ + n = rb_next(&req->r_node); + if (n) + nextreq = rb_entry(n, struct ceph_mds_request, r_node); + else + nextreq = NULL; + if ((req->r_op & CEPH_MDS_OP_WRITE)) { + /* write op */ + ceph_mdsc_get_request(req); + if (nextreq) + ceph_mdsc_get_request(nextreq); + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests wait on %llu (want %llu)\n", + req->r_tid, want_tid); + wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); + ceph_mdsc_put_request(req); + if (!nextreq) + break; /* next dne before, so we're done! 
*/ + if (RB_EMPTY_NODE(&nextreq->r_node)) { + /* next request was removed from tree */ + ceph_mdsc_put_request(nextreq); + goto restart; + } + ceph_mdsc_put_request(nextreq); /* won't go away */ + } + req = nextreq; + } + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests done\n"); +} + +void ceph_mdsc_sync(struct ceph_mds_client *mdsc) +{ + u64 want_tid, want_flush; + + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) + return; + + dout("sync\n"); + mutex_lock(&mdsc->mutex); + want_tid = mdsc->last_tid; + want_flush = mdsc->cap_flush_seq; + mutex_unlock(&mdsc->mutex); + dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); + + ceph_flush_dirty_caps(mdsc); + + wait_unsafe_requests(mdsc, want_tid); + wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); +} + +/* + * true if all sessions are closed, or we force unmount + */ +bool done_closing_sessions(struct ceph_mds_client *mdsc) +{ + int i, n = 0; + + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) + return true; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) + if (mdsc->sessions[i]) + n++; + mutex_unlock(&mdsc->mutex); + return n == 0; +} + +/* + * called after sb is ro. + */ +void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *session; + int i; + struct ceph_fs_client *fsc = mdsc->fsc; + unsigned long timeout = fsc->client->options->mount_timeout * HZ; + + dout("close_sessions\n"); + + /* close sessions */ + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + dout("waiting for sessions to close\n"); + wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), + timeout); + + /* tear down remaining sessions */ + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + if (mdsc->sessions[i]) { + session = get_session(mdsc->sessions[i]); + __unregister_session(mdsc, session); + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + remove_session_caps(session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + } + WARN_ON(!list_empty(&mdsc->cap_delay_list)); + mutex_unlock(&mdsc->mutex); + + ceph_cleanup_empty_realms(mdsc); + + cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + + dout("stopped\n"); +} + +static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) +{ + dout("stop\n"); + cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + if (mdsc->mdsmap) + ceph_mdsmap_destroy(mdsc->mdsmap); + kfree(mdsc->sessions); + ceph_caps_finalize(mdsc); +} + +void ceph_mdsc_destroy(struct ceph_fs_client *fsc) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + dout("mdsc_destroy %p\n", mdsc); + ceph_mdsc_stop(mdsc); + + /* flush out any connection work with references to us */ + ceph_msgr_flush(); + + fsc->mdsc = NULL; + kfree(mdsc); + dout("mdsc_destroy %p done\n", mdsc); +} + + +/* + * handle mds map update. 
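The map handler that follows only installs a map whose epoch is strictly newer than the one currently held, so stale or duplicate MDS maps are simply dropped. A minimal userspace sketch of that epoch gating (illustrative only; accept_map() and current_epoch are invented names, and "no map yet" is modeled as epoch 0):

#include <stdio.h>
#include <stdint.h>

static uint32_t current_epoch;          /* 0 means we hold no map yet */

static int accept_map(uint32_t epoch)
{
        if (current_epoch && epoch <= current_epoch)
                return 0;               /* older or duplicate epoch: ignore it */
        current_epoch = epoch;          /* swap the newer map into place */
        return 1;
}

int main(void)
{
        int a = accept_map(4);          /* first map: accepted */
        int b = accept_map(4);          /* same epoch again: ignored */
        int c = accept_map(5);          /* newer epoch: accepted */

        printf("%d %d %d\n", a, b, c);  /* prints: 1 0 1 */
        return 0;
}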
+ */ +void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +{ + u32 epoch; + u32 maplen; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mdsmap *newmap, *oldmap; + struct ceph_fsid fsid; + int err = -EINVAL; + + ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) + return; + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + dout("handle_map epoch %u len %d\n", epoch, (int)maplen); + + /* do we need it? */ + ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); + mutex_lock(&mdsc->mutex); + if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { + dout("handle_map epoch %u <= our %u\n", + epoch, mdsc->mdsmap->m_epoch); + mutex_unlock(&mdsc->mutex); + return; + } + + newmap = ceph_mdsmap_decode(&p, end); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad_unlock; + } + + /* swap into place */ + if (mdsc->mdsmap) { + oldmap = mdsc->mdsmap; + mdsc->mdsmap = newmap; + check_new_map(mdsc, newmap, oldmap); + ceph_mdsmap_destroy(oldmap); + } else { + mdsc->mdsmap = newmap; /* first mds map */ + } + mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + + __wake_requests(mdsc, &mdsc->waiting_for_map); + + mutex_unlock(&mdsc->mutex); + schedule_delayed(mdsc); + return; + +bad_unlock: + mutex_unlock(&mdsc->mutex); +bad: + pr_err("error decoding mdsmap %d\n", err); + return; +} + +static struct ceph_connection *con_get(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + if (get_session(s)) { + dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); + return con; + } + dout("mdsc con_get %p FAIL\n", s); + return NULL; +} + +static void con_put(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + + dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); + ceph_put_mds_session(s); +} + +/* + * if the client is unresponsive for long enough, the mds will kill + * the session entirely. 
+ */ +static void peer_reset(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + + pr_warning("mds%d closed our session\n", s->s_mds); + send_mds_reconnect(mdsc, s); +} + +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + int type = le16_to_cpu(msg->hdr.type); + + mutex_lock(&mdsc->mutex); + if (__verify_registered_session(mdsc, s) < 0) { + mutex_unlock(&mdsc->mutex); + goto out; + } + mutex_unlock(&mdsc->mutex); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(mdsc, msg); + break; + case CEPH_MSG_CLIENT_SESSION: + handle_session(s, msg); + break; + case CEPH_MSG_CLIENT_REPLY: + handle_reply(s, msg); + break; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: + handle_forward(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_CAPS: + ceph_handle_caps(s, msg); + break; + case CEPH_MSG_CLIENT_SNAP: + ceph_handle_snap(mdsc, s, msg); + break; + case CEPH_MSG_CLIENT_LEASE: + handle_lease(mdsc, s, msg); + break; + + default: + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } +out: + ceph_msg_put(msg); +} + +/* + * authentication + */ +static int get_authorizer(struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + int ret = 0; + + if (force_new && s->s_authorizer) { + ac->ops->destroy_authorizer(ac, s->s_authorizer); + s->s_authorizer = NULL; + } + if (s->s_authorizer == NULL) { + if (ac->ops->create_authorizer) { + ret = ac->ops->create_authorizer( + ac, CEPH_ENTITY_TYPE_MDS, + &s->s_authorizer, + &s->s_authorizer_buf, + &s->s_authorizer_buf_len, + &s->s_authorizer_reply_buf, + &s->s_authorizer_reply_buf_len); + if (ret) + return ret; + } + } + + *proto = ac->protocol; + *buf = s->s_authorizer_buf; + *len = s->s_authorizer_buf_len; + *reply_buf = s->s_authorizer_reply_buf; + *reply_len = s->s_authorizer_reply_buf_len; + return 0; +} + + +static int verify_authorizer_reply(struct ceph_connection *con, int len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); +} + +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + + return ceph_monc_validate_auth(&mdsc->fsc->client->monc); +} + +static const struct ceph_connection_operations mds_con_ops = { + .get = con_get, + .put = con_put, + .dispatch = dispatch, + .get_authorizer = get_authorizer, + .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .peer_reset = peer_reset, +}; + +/* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h new file mode 100644 index 00000000..7d8a0d66 --- /dev/null +++ b/fs/ceph/mds_client.h @@ -0,0 +1,379 @@ +#ifndef _FS_CEPH_MDS_CLIENT_H +#define _FS_CEPH_MDS_CLIENT_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Some lock 
dependencies: + * + * session->s_mutex + * mdsc->mutex + * + * mdsc->snap_rwsem + * + * inode->i_lock + * mdsc->snap_flush_lock + * mdsc->cap_delay_lock + * + */ + +struct ceph_fs_client; +struct ceph_cap; + +/* + * parsed info about a single inode. pointers are into the encoded + * on-wire structures within the mds reply message payload. + */ +struct ceph_mds_reply_info_in { + struct ceph_mds_reply_inode *in; + struct ceph_dir_layout dir_layout; + u32 symlink_len; + char *symlink; + u32 xattr_len; + char *xattr_data; +}; + +/* + * parsed info about an mds reply, including information about + * either: 1) the target inode and/or its parent directory and dentry, + * and directory contents (for readdir results), or + * 2) the file range lock info (for fcntl F_GETLK results). + */ +struct ceph_mds_reply_info_parsed { + struct ceph_mds_reply_head *head; + + /* trace */ + struct ceph_mds_reply_info_in diri, targeti; + struct ceph_mds_reply_dirfrag *dirfrag; + char *dname; + u32 dname_len; + struct ceph_mds_reply_lease *dlease; + + /* extra */ + union { + /* for fcntl F_GETLK results */ + struct ceph_filelock *filelock_reply; + + /* for readdir results */ + struct { + struct ceph_mds_reply_dirfrag *dir_dir; + int dir_nr; + char **dir_dname; + u32 *dir_dname_len; + struct ceph_mds_reply_lease **dir_dlease; + struct ceph_mds_reply_info_in *dir_in; + u8 dir_complete, dir_end; + }; + }; + + /* encoded blob describing snapshot contexts for certain + operations (e.g., open) */ + void *snapblob; + int snapblob_len; +}; + + +/* + * cap releases are batched and sent to the MDS en masse. + */ +#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \ + sizeof(struct ceph_mds_cap_release)) / \ + sizeof(struct ceph_mds_cap_item)) + + +/* + * state associated with each MDS<->client session + */ +enum { + CEPH_MDS_SESSION_NEW = 1, + CEPH_MDS_SESSION_OPENING = 2, + CEPH_MDS_SESSION_OPEN = 3, + CEPH_MDS_SESSION_HUNG = 4, + CEPH_MDS_SESSION_CLOSING = 5, + CEPH_MDS_SESSION_RESTARTING = 6, + CEPH_MDS_SESSION_RECONNECTING = 7, +}; + +struct ceph_mds_session { + struct ceph_mds_client *s_mdsc; + int s_mds; + int s_state; + unsigned long s_ttl; /* time until mds kills us */ + u64 s_seq; /* incoming msg seq # */ + struct mutex s_mutex; /* serialize session messages */ + + struct ceph_connection s_con; + + struct ceph_authorizer *s_authorizer; + void *s_authorizer_buf, *s_authorizer_reply_buf; + size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; + u32 s_cap_gen; /* inc each time we get mds stale msg */ + unsigned long s_cap_ttl; /* when session caps expire */ + struct list_head s_caps; /* all caps issued by this session */ + int s_nr_caps, s_trim_caps; + int s_num_cap_releases; + struct list_head s_cap_releases; /* waiting cap_release messages */ + struct list_head s_cap_releases_done; /* ready to send */ + struct ceph_cap *s_cap_iterator; + + /* protected by mutex */ + struct list_head s_cap_flushing; /* inodes w/ flushing caps */ + struct list_head s_cap_snaps_flushing; + unsigned long s_renew_requested; /* last time we sent a renew req */ + u64 s_renew_seq; + + atomic_t s_ref; + struct list_head s_waiting; /* waiting requests */ + struct list_head s_unsafe; /* unsafe requests */ +}; + +/* + * modes of choosing which MDS to send a request to + */ +enum { + USE_ANY_MDS, + USE_RANDOM_MDS, + USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ +}; + +struct ceph_mds_request; +struct ceph_mds_client; + +/* + * request completion callback + */ 
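The lock ordering listed at the top of this header is why the iteration helpers in mds_client.c (drop_leases(), delayed_work(), the session shutdown paths) drop mdsc->mutex before taking a session's s_mutex and re-take it afterwards. A small pthread sketch of that drop-and-retake pattern (illustrative only; the mutexes and visit_session() are stand-ins, not the kernel locks):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mdsc_mutex = PTHREAD_MUTEX_INITIALIZER;  /* "mdsc->mutex" */
static pthread_mutex_t s_mutex    = PTHREAD_MUTEX_INITIALIZER;  /* "session->s_mutex" */

/* s_mutex ranks above mdsc->mutex, so code walking the session table under
 * mdsc->mutex must drop it before taking a session's s_mutex, then retake it. */
static void visit_session(int mds)
{
        pthread_mutex_lock(&mdsc_mutex);
        /* ... look up session[mds] and take a reference ... */
        pthread_mutex_unlock(&mdsc_mutex);

        pthread_mutex_lock(&s_mutex);
        printf("working on session mds%d\n", mds);
        pthread_mutex_unlock(&s_mutex);

        pthread_mutex_lock(&mdsc_mutex);
        /* ... continue iterating the session table ... */
        pthread_mutex_unlock(&mdsc_mutex);
}

int main(void)
{
        visit_session(0);
        return 0;
}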
+typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); + +/* + * an in-flight mds request + */ +struct ceph_mds_request { + u64 r_tid; /* transaction id */ + struct rb_node r_node; + struct ceph_mds_client *r_mdsc; + + int r_op; /* mds op code */ + + /* operation on what? */ + struct inode *r_inode; /* arg1 */ + struct dentry *r_dentry; /* arg1 */ + struct dentry *r_old_dentry; /* arg2: rename from or link from */ + char *r_path1, *r_path2; + struct ceph_vino r_ino1, r_ino2; + + struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ + struct inode *r_target_inode; /* resulting inode */ + + struct mutex r_fill_mutex; + + union ceph_mds_request_args r_args; + int r_fmode; /* file mode, if expecting cap */ + uid_t r_uid; + gid_t r_gid; + + /* for choosing which mds to send this request to */ + int r_direct_mode; + u32 r_direct_hash; /* choose dir frag based on this dentry hash */ + bool r_direct_is_hash; /* true if r_direct_hash is valid */ + + /* data payload is used for xattr ops */ + struct page **r_pages; + int r_num_pages; + int r_data_len; + + /* what caps shall we drop? */ + int r_inode_drop, r_inode_unless; + int r_dentry_drop, r_dentry_unless; + int r_old_dentry_drop, r_old_dentry_unless; + struct inode *r_old_inode; + int r_old_inode_drop, r_old_inode_unless; + + struct ceph_msg *r_request; /* original request */ + int r_request_release_offset; + struct ceph_msg *r_reply; + struct ceph_mds_reply_info_parsed r_reply_info; + int r_err; + bool r_aborted; + + unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_started; /* start time to measure timeout against */ + unsigned long r_request_started; /* start time for mds request only, + used to measure lease durations */ + + /* link unsafe requests to parent directory, for fsync */ + struct inode *r_unsafe_dir; + struct list_head r_unsafe_dir_item; + + struct ceph_mds_session *r_session; + + int r_attempts; /* resend attempts */ + int r_num_fwd; /* number of forward attempts */ + int r_resend_mds; /* mds to resend to next, if any*/ + u32 r_sent_on_mseq; /* cap mseq request was sent at*/ + + struct kref r_kref; + struct list_head r_wait; + struct completion r_completion; + struct completion r_safe_completion; + ceph_mds_request_callback_t r_callback; + struct list_head r_unsafe_item; /* per-session unsafe list item */ + bool r_got_unsafe, r_got_safe, r_got_result; + + bool r_did_prepopulate; + u32 r_readdir_offset; + + struct ceph_cap_reservation r_caps_reservation; + int r_num_caps; +}; + +/* + * mds client state + */ +struct ceph_mds_client { + struct ceph_fs_client *fsc; + struct mutex mutex; /* all nested structures */ + + struct ceph_mdsmap *mdsmap; + struct completion safe_umount_waiters; + wait_queue_head_t session_close_wq; + struct list_head waiting_for_map; + + struct ceph_mds_session **sessions; /* NULL for mds if no session */ + int max_sessions; /* len of s_mds_sessions */ + int stopping; /* true if shutting down */ + + /* + * snap_rwsem will cover cap linkage into snaprealms, and + * realm snap contexts. (later, we can do per-realm snap + * contexts locks..) the empty list contains realms with no + * references (implying they contain no inodes with caps) that + * should be destroyed. 
+ */ + struct rw_semaphore snap_rwsem; + struct rb_root snap_realms; + struct list_head snap_empty; + spinlock_t snap_empty_lock; /* protect snap_empty */ + + u64 last_tid; /* most recent mds request */ + struct rb_root request_tree; /* pending mds requests */ + struct delayed_work delayed_work; /* delayed work */ + unsigned long last_renew_caps; /* last time we renewed our caps */ + struct list_head cap_delay_list; /* caps with delayed release */ + spinlock_t cap_delay_lock; /* protects cap_delay_list */ + struct list_head snap_flush_list; /* cap_snaps ready to flush */ + spinlock_t snap_flush_lock; + + u64 cap_flush_seq; + struct list_head cap_dirty; /* inodes with dirty caps */ + struct list_head cap_dirty_migrating; /* ...that are migration... */ + int num_cap_flushing; /* # caps we are flushing */ + spinlock_t cap_dirty_lock; /* protects above items */ + wait_queue_head_t cap_flushing_wq; + + /* + * Cap reservations + * + * Maintain a global pool of preallocated struct ceph_caps, referenced + * by struct ceph_caps_reservations. This ensures that we preallocate + * memory needed to successfully process an MDS response. (If an MDS + * sends us cap information and we fail to process it, we will have + * problems due to the client and MDS being out of sync.) + * + * Reservations are 'owned' by a ceph_cap_reservation context. + */ + spinlock_t caps_list_lock; + struct list_head caps_list; /* unused (reserved or + unreserved) */ + int caps_total_count; /* total caps allocated */ + int caps_use_count; /* in use */ + int caps_reserve_count; /* unused, reserved */ + int caps_avail_count; /* unused, unreserved */ + int caps_min_count; /* keep at least this many + (unreserved) */ + spinlock_t dentry_lru_lock; + struct list_head dentry_lru; + int num_dentry; +}; + +extern const char *ceph_mds_op_name(int op); + +extern struct ceph_mds_session * +__ceph_lookup_mds_session(struct ceph_mds_client *, int mds); + +static inline struct ceph_mds_session * +ceph_get_mds_session(struct ceph_mds_session *s) +{ + atomic_inc(&s->s_ref); + return s; +} + +extern void ceph_put_mds_session(struct ceph_mds_session *s); + +extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, + struct ceph_msg *msg, int mds); + +extern int ceph_mdsc_init(struct ceph_fs_client *fsc); +extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); + +extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); + +extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, + struct inode *inode, + struct dentry *dn, int mask); + +extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); + +extern struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); +extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); +extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req); +static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) +{ + kref_get(&req->r_kref); +} +extern void ceph_mdsc_release_request(struct kref *kref); +static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) +{ + kref_put(&req->r_kref, ceph_mdsc_release_request); +} + +extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + +extern void ceph_mdsc_pre_umount(struct 
ceph_mds_client *mdsc); + +extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int stop_on_nosnap); + +extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); +extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq); + +extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); + +extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + +#endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c new file mode 100644 index 00000000..73b7d44e --- /dev/null +++ b/fs/ceph/mdsmap.c @@ -0,0 +1,179 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "super.h" + + +/* + * choose a random mds that is "up" (i.e. has a state > 0), or -1. + */ +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +{ + int n = 0; + int i; + char r; + + /* count */ + for (i = 0; i < m->m_max_mds; i++) + if (m->m_info[i].state > 0) + n++; + if (n == 0) + return -1; + + /* pick */ + get_random_bytes(&r, 1); + n = r % n; + i = 0; + for (i = 0; n > 0; i++, n--) + while (m->m_info[i].state <= 0) + i++; + + return i; +} + +/* + * Decode an MDS map + * + * Ignore any fields we don't care about (there are quite a few of + * them). + */ +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) +{ + struct ceph_mdsmap *m; + const void *start = *p; + int i, j, n; + int err = -EINVAL; + u16 version; + + m = kzalloc(sizeof(*m), GFP_NOFS); + if (m == NULL) + return ERR_PTR(-ENOMEM); + + ceph_decode_16_safe(p, end, version, bad); + + ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); + m->m_epoch = ceph_decode_32(p); + m->m_client_epoch = ceph_decode_32(p); + m->m_last_failure = ceph_decode_32(p); + m->m_root = ceph_decode_32(p); + m->m_session_timeout = ceph_decode_32(p); + m->m_session_autoclose = ceph_decode_32(p); + m->m_max_file_size = ceph_decode_64(p); + m->m_max_mds = ceph_decode_32(p); + + m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); + if (m->m_info == NULL) + goto badmem; + + /* pick out active nodes from mds_info (state > 0) */ + n = ceph_decode_32(p); + for (i = 0; i < n; i++) { + u64 global_id; + u32 namelen; + s32 mds, inc, state; + u64 state_seq; + u8 infoversion; + struct ceph_entity_addr addr; + u32 num_export_targets; + void *pexport_targets = NULL; + struct ceph_timespec laggy_since; + + ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + global_id = ceph_decode_64(p); + infoversion = ceph_decode_8(p); + *p += sizeof(u64); + namelen = ceph_decode_32(p); /* skip mds name */ + *p += namelen; + + ceph_decode_need(p, end, + 4*sizeof(u32) + sizeof(u64) + + sizeof(addr) + sizeof(struct ceph_timespec), + bad); + mds = ceph_decode_32(p); + inc = ceph_decode_32(p); + state = ceph_decode_32(p); + state_seq = ceph_decode_64(p); + ceph_decode_copy(p, &addr, sizeof(addr)); + ceph_decode_addr(&addr); + ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + *p += sizeof(u32); + ceph_decode_32_safe(p, end, namelen, bad); + *p += namelen; + if (infoversion >= 2) { + ceph_decode_32_safe(p, end, num_export_targets, bad); + pexport_targets = *p; + *p += num_export_targets * sizeof(u32); + } else { + num_export_targets = 0; + } + + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + i+1, n, global_id, mds, inc, + ceph_pr_addr(&addr.in_addr), + ceph_mds_state_name(state)); + if (mds >= 0 && mds < m->m_max_mds && state > 0) 
{ + m->m_info[mds].global_id = global_id; + m->m_info[mds].state = state; + m->m_info[mds].addr = addr; + m->m_info[mds].laggy = + (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); + m->m_info[mds].num_export_targets = num_export_targets; + if (num_export_targets) { + m->m_info[mds].export_targets = + kcalloc(num_export_targets, sizeof(u32), + GFP_NOFS); + for (j = 0; j < num_export_targets; j++) + m->m_info[mds].export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + m->m_info[mds].export_targets = NULL; + } + } + } + + /* pg_pools */ + ceph_decode_32_safe(p, end, n, bad); + m->m_num_data_pg_pools = n; + m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); + if (!m->m_data_pg_pools) + goto badmem; + ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); + for (i = 0; i < n; i++) + m->m_data_pg_pools[i] = ceph_decode_32(p); + m->m_cas_pg_pool = ceph_decode_32(p); + + /* ok, we don't care about the rest. */ + dout("mdsmap_decode success epoch %u\n", m->m_epoch); + return m; + +badmem: + err = -ENOMEM; +bad: + pr_err("corrupt mdsmap\n"); + print_hex_dump(KERN_DEBUG, "mdsmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + ceph_mdsmap_destroy(m); + return ERR_PTR(-EINVAL); +} + +void ceph_mdsmap_destroy(struct ceph_mdsmap *m) +{ + int i; + + for (i = 0; i < m->m_max_mds; i++) + kfree(m->m_info[i].export_targets); + kfree(m->m_info); + kfree(m->m_data_pg_pools); + kfree(m); +} diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c new file mode 100644 index 00000000..54b14de2 --- /dev/null +++ b/fs/ceph/snap.c @@ -0,0 +1,916 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/sort.h> +#include <linux/slab.h> + +#include "super.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> + +/* + * Snapshots in ceph are driven in large part by cooperation from the + * client. In contrast to local file systems or file servers that + * implement snapshots at a single point in the system, ceph's + * distributed access to storage requires clients to help decide + * whether a write logically occurs before or after a recently created + * snapshot. + * + * This provides a perfect instantaneous client-wide snapshot. Between + * clients, however, snapshots may appear to be applied at slightly + * different points in time, depending on delays in delivering the + * snapshot notification. + * + * Snapshots are _not_ file system-wide. Instead, each snapshot + * applies to the subdirectory nested beneath some directory. This + * effectively divides the hierarchy into multiple "realms," where all + * of the files contained by each realm share the same set of + * snapshots. An individual realm's snap set contains snapshots + * explicitly created on that realm, as well as any snaps in its + * parent's snap set _after_ the point at which the parent became its + * parent (due to, say, a rename). Similarly, snaps from prior parents, + * taken during the intervals when they were the parent, are included. + * + * The client is spared most of this detail, fortunately... it need only + * maintain a hierarchy of realms reflecting the current parent/child + * realm relationship and, for each realm, an explicit list of snaps + * inherited from prior parents. + * + * A snap_realm struct is maintained for every realm containing an inode + * with an open cap in the system. (The needed snap realm information is + * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq' + * version number is used to ensure that as realm parameters change (new + * snapshot, new parent, etc.) the client's realm hierarchy is updated.
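The rest of this comment describes how the realm hierarchy drives snap-context generation; build_snap_context(), further down in this file, effectively merges the parent's snaps taken since parent_since with the realm's own and prior-parent snaps and sorts them newest-first. A self-contained sketch of that merge (illustrative only; the snap IDs and array sizes are made up, and cmp_u64_rev mirrors the file's cmpu64_rev comparator):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* descending order, like cmpu64_rev() later in this file */
static int cmp_u64_rev(const void *a, const void *b)
{
        uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;
        return (x < y) - (x > y);
}

int main(void)
{
        uint64_t parent_snaps[3] = { 3, 7, 12 };  /* parent realm's snap context */
        uint64_t parent_since   = 5;              /* when this realm joined that parent */
        uint64_t own_snaps[2]   = { 9, 15 };      /* snaps created on this realm */
        uint64_t snaps[8];
        int num = 0, i;

        for (i = 0; i < 3; i++)                   /* only parent snaps taken since then */
                if (parent_snaps[i] >= parent_since)
                        snaps[num++] = parent_snaps[i];
        memcpy(snaps + num, own_snaps, sizeof(own_snaps));
        num += 2;

        qsort(snaps, num, sizeof(snaps[0]), cmp_u64_rev);
        for (i = 0; i < num; i++)
                printf("%llu ", (unsigned long long)snaps[i]);
        printf("\n");                             /* prints: 15 12 9 7 */
        return 0;
}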
+ * + * The realm hierarchy drives the generation of a 'snap context' for each + * realm, which simply lists the resulting set of snaps for the realm. This + * is attached to any writes sent to OSDs. + */ +/* + * Unfortunately error handling is a bit mixed here. If we get a snap + * update, but don't have enough memory to update our realm hierarchy, + * it's not clear what we can do about it (besides complaining to the + * console). + */ + + +/* + * increase ref count for the realm + * + * caller must hold snap_rwsem for write. + */ +void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("get_realm %p %d -> %d\n", realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)+1); + /* + * since we _only_ increment realm refs or empty the empty + * list with snap_rwsem held, adjusting the empty list here is + * safe. we do need to protect against concurrent empty list + * additions, however. + */ + if (atomic_read(&realm->nref) == 0) { + spin_lock(&mdsc->snap_empty_lock); + list_del_init(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + } + + atomic_inc(&realm->nref); +} + +static void __insert_snap_realm(struct rb_root *root, + struct ceph_snap_realm *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_snap_realm *r = NULL; + + while (*p) { + parent = *p; + r = rb_entry(parent, struct ceph_snap_realm, node); + if (new->ino < r->ino) + p = &(*p)->rb_left; + else if (new->ino > r->ino) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); +} + +/* + * create and get the realm rooted at @ino and bump its ref count. + * + * caller must hold snap_rwsem for write. + */ +static struct ceph_snap_realm *ceph_create_snap_realm( + struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *realm; + + realm = kzalloc(sizeof(*realm), GFP_NOFS); + if (!realm) + return ERR_PTR(-ENOMEM); + + atomic_set(&realm->nref, 0); /* tree does not take a ref */ + realm->ino = ino; + INIT_LIST_HEAD(&realm->children); + INIT_LIST_HEAD(&realm->child_item); + INIT_LIST_HEAD(&realm->empty_item); + INIT_LIST_HEAD(&realm->dirty_item); + INIT_LIST_HEAD(&realm->inodes_with_caps); + spin_lock_init(&realm->inodes_with_caps_lock); + __insert_snap_realm(&mdsc->snap_realms, realm); + dout("create_snap_realm %llx %p\n", realm->ino, realm); + return realm; +} + +/* + * lookup the realm rooted at @ino. + * + * caller must hold snap_rwsem for write. 
+ */ +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct rb_node *n = mdsc->snap_realms.rb_node; + struct ceph_snap_realm *r; + + while (n) { + r = rb_entry(n, struct ceph_snap_realm, node); + if (ino < r->ino) + n = n->rb_left; + else if (ino > r->ino) + n = n->rb_right; + else { + dout("lookup_snap_realm %llx %p\n", r->ino, r); + return r; + } + } + return NULL; +} + +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); + +/* + * called with snap_rwsem (write) + */ +static void __destroy_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + + rb_erase(&realm->node, &mdsc->snap_realms); + + if (realm->parent) { + list_del_init(&realm->child_item); + __put_snap_realm(mdsc, realm->parent); + } + + kfree(realm->prior_parent_snaps); + kfree(realm->snaps); + ceph_put_snap_context(realm->cached_context); + kfree(realm); +} + +/* + * caller holds snap_rwsem (write) + */ +static void __put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + if (atomic_dec_and_test(&realm->nref)) + __destroy_snap_realm(mdsc, realm); +} + +/* + * caller needn't hold any locks + */ +void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) +{ + dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, + atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + if (!atomic_dec_and_test(&realm->nref)) + return; + + if (down_write_trylock(&mdsc->snap_rwsem)) { + __destroy_snap_realm(mdsc, realm); + up_write(&mdsc->snap_rwsem); + } else { + spin_lock(&mdsc->snap_empty_lock); + list_add(&realm->empty_item, &mdsc->snap_empty); + spin_unlock(&mdsc->snap_empty_lock); + } +} + +/* + * Clean up any realms whose ref counts have dropped to zero. Note + * that this does not include realms who were created but not yet + * used. + * + * Called under snap_rwsem (write) + */ +static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + struct ceph_snap_realm *realm; + + spin_lock(&mdsc->snap_empty_lock); + while (!list_empty(&mdsc->snap_empty)) { + realm = list_first_entry(&mdsc->snap_empty, + struct ceph_snap_realm, empty_item); + list_del(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); + __destroy_snap_realm(mdsc, realm); + spin_lock(&mdsc->snap_empty_lock); + } + spin_unlock(&mdsc->snap_empty_lock); +} + +void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +{ + down_write(&mdsc->snap_rwsem); + __cleanup_empty_realms(mdsc); + up_write(&mdsc->snap_rwsem); +} + +/* + * adjust the parent realm of a given @realm. adjust child list, and parent + * pointers, and ref counts appropriately. + * + * return true if parent was changed, 0 if unchanged, <0 on error. + * + * caller must hold snap_rwsem for write. 
+ */ +static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm, + u64 parentino) +{ + struct ceph_snap_realm *parent; + + if (realm->parent_ino == parentino) + return 0; + + parent = ceph_lookup_snap_realm(mdsc, parentino); + if (!parent) { + parent = ceph_create_snap_realm(mdsc, parentino); + if (IS_ERR(parent)) + return PTR_ERR(parent); + } + dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", + realm->ino, realm, realm->parent_ino, realm->parent, + parentino, parent); + if (realm->parent) { + list_del_init(&realm->child_item); + ceph_put_snap_realm(mdsc, realm->parent); + } + realm->parent_ino = parentino; + realm->parent = parent; + ceph_get_snap_realm(mdsc, parent); + list_add(&realm->child_item, &parent->children); + return 1; +} + + +static int cmpu64_rev(const void *a, const void *b) +{ + if (*(u64 *)a < *(u64 *)b) + return 1; + if (*(u64 *)a > *(u64 *)b) + return -1; + return 0; +} + +/* + * build the snap context for a given realm. + */ +static int build_snap_context(struct ceph_snap_realm *realm) +{ + struct ceph_snap_realm *parent = realm->parent; + struct ceph_snap_context *snapc; + int err = 0; + int i; + int num = realm->num_prior_parent_snaps + realm->num_snaps; + + /* + * build parent context, if it hasn't been built. + * conservatively estimate that all parent snaps might be + * included by us. + */ + if (parent) { + if (!parent->cached_context) { + err = build_snap_context(parent); + if (err) + goto fail; + } + num += parent->cached_context->num_snaps; + } + + /* do i actually need to update? not if my context seq + matches realm seq, and my parents' does to. (this works + because we rebuild_snap_realms() works _downward_ in + hierarchy after each update.) */ + if (realm->cached_context && + realm->cached_context->seq == realm->seq && + (!parent || + realm->cached_context->seq >= parent->cached_context->seq)) { + dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" + " (unchanged)\n", + realm->ino, realm, realm->cached_context, + realm->cached_context->seq, + realm->cached_context->num_snaps); + return 0; + } + + /* alloc new snap context */ + err = -ENOMEM; + if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) + goto fail; + snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); + if (!snapc) + goto fail; + atomic_set(&snapc->nref, 1); + + /* build (reverse sorted) snap vector */ + num = 0; + snapc->seq = realm->seq; + if (parent) { + /* include any of parent's snaps occurring _after_ my + parent became my parent */ + for (i = 0; i < parent->cached_context->num_snaps; i++) + if (parent->cached_context->snaps[i] >= + realm->parent_since) + snapc->snaps[num++] = + parent->cached_context->snaps[i]; + if (parent->cached_context->seq > snapc->seq) + snapc->seq = parent->cached_context->seq; + } + memcpy(snapc->snaps + num, realm->snaps, + sizeof(u64)*realm->num_snaps); + num += realm->num_snaps; + memcpy(snapc->snaps + num, realm->prior_parent_snaps, + sizeof(u64)*realm->num_prior_parent_snaps); + num += realm->num_prior_parent_snaps; + + sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); + snapc->num_snaps = num; + dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n", + realm->ino, realm, snapc, snapc->seq, snapc->num_snaps); + + if (realm->cached_context) + ceph_put_snap_context(realm->cached_context); + realm->cached_context = snapc; + return 0; + +fail: + /* + * if we fail, clear old (incorrect) cached_context... 
hopefully + * we'll have better luck building it later + */ + if (realm->cached_context) { + ceph_put_snap_context(realm->cached_context); + realm->cached_context = NULL; + } + pr_err("build_snap_context %llx %p fail %d\n", realm->ino, + realm, err); + return err; +} + +/* + * rebuild snap context for the given realm and all of its children. + */ +static void rebuild_snap_realms(struct ceph_snap_realm *realm) +{ + struct ceph_snap_realm *child; + + dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); + build_snap_context(realm); + + list_for_each_entry(child, &realm->children, child_item) + rebuild_snap_realms(child); +} + + +/* + * helper to allocate and decode an array of snapids. free prior + * instance, if any. + */ +static int dup_array(u64 **dst, __le64 *src, int num) +{ + int i; + + kfree(*dst); + if (num) { + *dst = kcalloc(num, sizeof(u64), GFP_NOFS); + if (!*dst) + return -ENOMEM; + for (i = 0; i < num; i++) + (*dst)[i] = get_unaligned_le64(src + i); + } else { + *dst = NULL; + } + return 0; +} + + +/* + * When a snapshot is applied, the size/mtime inode metadata is queued + * in a ceph_cap_snap (one for each snapshot) until writeback + * completes and the metadata can be flushed back to the MDS. + * + * However, if a (sync) write is currently in-progress when we apply + * the snapshot, we have to wait until the write succeeds or fails + * (and a final size/mtime is known). In this case the + * cap_snap->writing = 1, and is said to be "pending." When the write + * finishes, we __ceph_finish_cap_snap(). + * + * Caller must hold snap_rwsem for read (i.e., the realm topology won't + * change). + */ +void ceph_queue_cap_snap(struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap_snap *capsnap; + int used, dirty; + + capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); + if (!capsnap) { + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); + return; + } + + spin_lock(&inode->i_lock); + used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + if (__ceph_have_pending_cap_snap(ci)) { + /* there is no point in queuing multiple "pending" cap_snaps, + as no new writes are allowed to start when pending, so any + writes in progress now were started before the previous + cap_snap. lucky us. */ + dout("queue_cap_snap %p already pending\n", inode); + kfree(capsnap); + } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || + (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { + struct ceph_snap_context *snapc = ci->i_head_snapc; + + dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, + capsnap, snapc); + ihold(inode); + + atomic_set(&capsnap->nref, 1); + capsnap->ci = ci; + INIT_LIST_HEAD(&capsnap->ci_item); + INIT_LIST_HEAD(&capsnap->flushing_item); + + capsnap->follows = snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; + } else { + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; + } + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. 
*/ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = snapc; + ci->i_head_snapc = + ceph_get_snap_context(ci->i_snap_realm->cached_context); + dout(" new snapc is %p\n", ci->i_head_snapc); + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, snapc, snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + } else { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + kfree(capsnap); + } + + spin_unlock(&inode->i_lock); +} + +/* + * Finalize the size, mtime for a cap_snap.. that is, settle on final values + * to be used for the snapshot, to be flushed back to the mds. + * + * If capsnap can now be flushed, add to snap_flush list, and return 1. + * + * Caller must hold i_lock. + */ +int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + BUG_ON(capsnap->writing); + capsnap->size = inode->i_size; + capsnap->mtime = inode->i_mtime; + capsnap->atime = inode->i_atime; + capsnap->ctime = inode->i_ctime; + capsnap->time_warp_seq = ci->i_time_warp_seq; + if (capsnap->dirty_pages) { + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " + "still has %d dirty pages\n", inode, capsnap, + capsnap->context, capsnap->context->seq, + ceph_cap_string(capsnap->dirty), capsnap->size, + capsnap->dirty_pages); + return 0; + } + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", + inode, capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); + + spin_lock(&mdsc->snap_flush_lock); + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + spin_unlock(&mdsc->snap_flush_lock); + return 1; /* caller may want to ceph_flush_snaps */ +} + +/* + * Queue cap_snaps for snap writeback for this realm and its children. + * Called under snap_rwsem, so realm topology won't change. + */ +static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci; + struct inode *lastinode = NULL; + struct ceph_snap_realm *child; + + dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + + spin_lock(&realm->inodes_with_caps_lock); + list_for_each_entry(ci, &realm->inodes_with_caps, + i_snap_realm_item) { + struct inode *inode = igrab(&ci->vfs_inode); + if (!inode) + continue; + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + lastinode = inode; + ceph_queue_cap_snap(ci); + spin_lock(&realm->inodes_with_caps_lock); + } + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + + list_for_each_entry(child, &realm->children, child_item) { + dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", + realm, realm->ino, child, child->ino); + list_del_init(&child->dirty_item); + list_add(&child->dirty_item, &realm->dirty_item); + } + + list_del_init(&realm->dirty_item); + dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); +} + +/* + * Parse and apply a snapblob "snap trace" from the MDS. This specifies + * the snap realm parameters from a given realm and all of its ancestors, + * up to the root. + * + * Caller must hold snap_rwsem for write. 
+ */ +int ceph_update_snap_trace(struct ceph_mds_client *mdsc, + void *p, void *e, bool deletion) +{ + struct ceph_mds_snap_realm *ri; /* encoded */ + __le64 *snaps; /* encoded */ + __le64 *prior_parent_snaps; /* encoded */ + struct ceph_snap_realm *realm; + int invalidate = 0; + int err = -ENOMEM; + LIST_HEAD(dirty_realms); + + dout("update_snap_trace deletion=%d\n", deletion); +more: + ceph_decode_need(&p, e, sizeof(*ri), bad); + ri = p; + p += sizeof(*ri); + ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + + le32_to_cpu(ri->num_prior_parent_snaps)), bad); + snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_snaps); + prior_parent_snaps = p; + p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); + + realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); + if (IS_ERR(realm)) { + err = PTR_ERR(realm); + goto fail; + } + } + + /* ensure the parent is correct */ + err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); + if (err < 0) + goto fail; + invalidate += err; + + if (le64_to_cpu(ri->seq) > realm->seq) { + dout("update_snap_trace updating %llx %p %lld -> %lld\n", + realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); + /* update realm parameters, snap lists */ + realm->seq = le64_to_cpu(ri->seq); + realm->created = le64_to_cpu(ri->created); + realm->parent_since = le64_to_cpu(ri->parent_since); + + realm->num_snaps = le32_to_cpu(ri->num_snaps); + err = dup_array(&realm->snaps, snaps, realm->num_snaps); + if (err < 0) + goto fail; + + realm->num_prior_parent_snaps = + le32_to_cpu(ri->num_prior_parent_snaps); + err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, + realm->num_prior_parent_snaps); + if (err < 0) + goto fail; + + /* queue realm for cap_snap creation */ + list_add(&realm->dirty_item, &dirty_realms); + + invalidate = 1; + } else if (!realm->cached_context) { + dout("update_snap_trace %llx %p seq %lld new\n", + realm->ino, realm, realm->seq); + invalidate = 1; + } else { + dout("update_snap_trace %llx %p seq %lld unchanged\n", + realm->ino, realm, realm->seq); + } + + dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, + realm, invalidate, p, e); + + if (p < e) + goto more; + + /* invalidate when we reach the _end_ (root) of the trace */ + if (invalidate) + rebuild_snap_realms(realm); + + /* + * queue cap snaps _after_ we've built the new snap contexts, + * so that i_head_snapc can be set appropriately. + */ + while (!list_empty(&dirty_realms)) { + realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, + dirty_item); + queue_realm_cap_snaps(realm); + } + + __cleanup_empty_realms(mdsc); + return 0; + +bad: + err = -EINVAL; +fail: + pr_err("update_snap_trace error %d\n", err); + return err; +} + + +/* + * Send any cap_snaps that are queued for flush. Try to carry + * s_mutex across multiple snap flushes to avoid locking overhead. + * + * Caller holds no locks. 
+ */ +static void flush_snaps(struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci; + struct inode *inode; + struct ceph_mds_session *session = NULL; + + dout("flush_snaps\n"); + spin_lock(&mdsc->snap_flush_lock); + while (!list_empty(&mdsc->snap_flush_list)) { + ci = list_first_entry(&mdsc->snap_flush_list, + struct ceph_inode_info, i_snap_flush_item); + inode = &ci->vfs_inode; + ihold(inode); + spin_unlock(&mdsc->snap_flush_lock); + spin_lock(&inode->i_lock); + __ceph_flush_snaps(ci, &session, 0); + spin_unlock(&inode->i_lock); + iput(inode); + spin_lock(&mdsc->snap_flush_lock); + } + spin_unlock(&mdsc->snap_flush_lock); + + if (session) { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } + dout("flush_snaps done\n"); +} + + +/* + * Handle a snap notification from the MDS. + * + * This can take two basic forms: the simplest is just a snap creation + * or deletion notification on an existing realm. This should update the + * realm and its children. + * + * The more difficult case is realm creation, due to snap creation at a + * new point in the file hierarchy, or due to a rename that moves a file or + * directory into another realm. + */ +void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + int mds = session->s_mds; + u64 split; + int op; + int trace_len; + struct ceph_snap_realm *realm = NULL; + void *p = msg->front.iov_base; + void *e = p + msg->front.iov_len; + struct ceph_mds_snap_head *h; + int num_split_inos, num_split_realms; + __le64 *split_inos = NULL, *split_realms = NULL; + int i; + int locked_rwsem = 0; + + /* decode */ + if (msg->front.iov_len < sizeof(*h)) + goto bad; + h = p; + op = le32_to_cpu(h->op); + split = le64_to_cpu(h->split); /* non-zero if we are splitting an + * existing realm */ + num_split_inos = le32_to_cpu(h->num_split_inos); + num_split_realms = le32_to_cpu(h->num_split_realms); + trace_len = le32_to_cpu(h->trace_len); + p += sizeof(*h); + + dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, + ceph_snap_op_name(op), split, trace_len); + + mutex_lock(&session->s_mutex); + session->s_seq++; + mutex_unlock(&session->s_mutex); + + down_write(&mdsc->snap_rwsem); + locked_rwsem = 1; + + if (op == CEPH_SNAP_OP_SPLIT) { + struct ceph_mds_snap_realm *ri; + + /* + * A "split" breaks part of an existing realm off into + * a new realm. The MDS provides a list of inodes + * (with caps) and child realms that belong to the new + * child. + */ + split_inos = p; + p += sizeof(u64) * num_split_inos; + split_realms = p; + p += sizeof(u64) * num_split_realms; + ceph_decode_need(&p, e, sizeof(*ri), bad); + /* we will peek at realm info here, but will _not_ + * advance p, as the realm update will occur below in + * ceph_update_snap_trace. 
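+ *
+ * For reference, a rough sketch of the CEPH_SNAP_OP_SPLIT message body
+ * as it is walked here (illustrative, following the pointer arithmetic
+ * in this function rather than a formal spec):
+ *
+ *   struct ceph_mds_snap_head h;            op, split ino, counts
+ *   __le64 split_inos[num_split_inos];      inodes moving to the new realm
+ *   __le64 split_realms[num_split_realms];  child realms moving over
+ *   snap trace                              handed to ceph_update_snap_trace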
*/ + ri = p; + + realm = ceph_lookup_snap_realm(mdsc, split); + if (!realm) { + realm = ceph_create_snap_realm(mdsc, split); + if (IS_ERR(realm)) + goto out; + } + ceph_get_snap_realm(mdsc, realm); + + dout("splitting snap_realm %llx %p\n", realm->ino, realm); + for (i = 0; i < num_split_inos; i++) { + struct ceph_vino vino = { + .ino = le64_to_cpu(split_inos[i]), + .snap = CEPH_NOSNAP, + }; + struct inode *inode = ceph_find_inode(sb, vino); + struct ceph_inode_info *ci; + struct ceph_snap_realm *oldrealm; + + if (!inode) + continue; + ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + if (!ci->i_snap_realm) + goto skip_inode; + /* + * If this inode belongs to a realm that was + * created after our new realm, we experienced + * a race (due to another split notifications + * arriving from a different MDS). So skip + * this inode. + */ + if (ci->i_snap_realm->created > + le64_to_cpu(ri->created)) { + dout(" leaving %p in newer realm %llx %p\n", + inode, ci->i_snap_realm->ino, + ci->i_snap_realm); + goto skip_inode; + } + dout(" will move %p to split realm %llx %p\n", + inode, realm->ino, realm); + /* + * Move the inode to the new realm + */ + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + oldrealm = ci->i_snap_realm; + ci->i_snap_realm = realm; + spin_unlock(&realm->inodes_with_caps_lock); + spin_unlock(&inode->i_lock); + + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); + + iput(inode); + continue; + +skip_inode: + spin_unlock(&inode->i_lock); + iput(inode); + } + + /* we may have taken some of the old realm's children. */ + for (i = 0; i < num_split_realms; i++) { + struct ceph_snap_realm *child = + ceph_lookup_snap_realm(mdsc, + le64_to_cpu(split_realms[i])); + if (!child) + continue; + adjust_snap_realm_parent(mdsc, child, realm->ino); + } + } + + /* + * update using the provided snap trace. if we are deleting a + * snap, we can avoid queueing cap_snaps. 
+ */ + ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY); + + if (op == CEPH_SNAP_OP_SPLIT) + /* we took a reference when we created the realm, above */ + ceph_put_snap_realm(mdsc, realm); + + __cleanup_empty_realms(mdsc); + + up_write(&mdsc->snap_rwsem); + + flush_snaps(mdsc); + return; + +bad: + pr_err("corrupt snap message from mds%d\n", mds); + ceph_msg_dump(msg); +out: + if (locked_rwsem) + up_write(&mdsc->snap_rwsem); + return; +} + + + diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c new file mode 100644 index 00000000..cd5097d7 --- /dev/null +++ b/fs/ceph/strings.c @@ -0,0 +1,117 @@ +/* + * Ceph fs string constants + */ +#include +#include + + +const char *ceph_mds_state_name(int s) +{ + switch (s) { + /* down and out */ + case CEPH_MDS_STATE_DNE: return "down:dne"; + case CEPH_MDS_STATE_STOPPED: return "down:stopped"; + /* up and out */ + case CEPH_MDS_STATE_BOOT: return "up:boot"; + case CEPH_MDS_STATE_STANDBY: return "up:standby"; + case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_CREATING: return "up:creating"; + case CEPH_MDS_STATE_STARTING: return "up:starting"; + /* up and in */ + case CEPH_MDS_STATE_REPLAY: return "up:replay"; + case CEPH_MDS_STATE_RESOLVE: return "up:resolve"; + case CEPH_MDS_STATE_RECONNECT: return "up:reconnect"; + case CEPH_MDS_STATE_REJOIN: return "up:rejoin"; + case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay"; + case CEPH_MDS_STATE_ACTIVE: return "up:active"; + case CEPH_MDS_STATE_STOPPING: return "up:stopping"; + } + return "???"; +} + +const char *ceph_session_op_name(int op) +{ + switch (op) { + case CEPH_SESSION_REQUEST_OPEN: return "request_open"; + case CEPH_SESSION_OPEN: return "open"; + case CEPH_SESSION_REQUEST_CLOSE: return "request_close"; + case CEPH_SESSION_CLOSE: return "close"; + case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps"; + case CEPH_SESSION_RENEWCAPS: return "renewcaps"; + case CEPH_SESSION_STALE: return "stale"; + case CEPH_SESSION_RECALL_STATE: return "recall_state"; + } + return "???"; +} + +const char *ceph_mds_op_name(int op) +{ + switch (op) { + case CEPH_MDS_OP_LOOKUP: return "lookup"; + case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; + case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; + case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_SETXATTR: return "setxattr"; + case CEPH_MDS_OP_SETATTR: return "setattr"; + case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_READDIR: return "readdir"; + case CEPH_MDS_OP_MKNOD: return "mknod"; + case CEPH_MDS_OP_LINK: return "link"; + case CEPH_MDS_OP_UNLINK: return "unlink"; + case CEPH_MDS_OP_RENAME: return "rename"; + case CEPH_MDS_OP_MKDIR: return "mkdir"; + case CEPH_MDS_OP_RMDIR: return "rmdir"; + case CEPH_MDS_OP_SYMLINK: return "symlink"; + case CEPH_MDS_OP_CREATE: return "create"; + case CEPH_MDS_OP_OPEN: return "open"; + case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap"; + case CEPH_MDS_OP_LSSNAP: return "lssnap"; + case CEPH_MDS_OP_MKSNAP: return "mksnap"; + case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; + case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; + } + return "???"; +} + +const char *ceph_cap_op_name(int op) +{ + switch (op) { + case CEPH_CAP_OP_GRANT: return "grant"; + case CEPH_CAP_OP_REVOKE: return "revoke"; + case CEPH_CAP_OP_TRUNC: return "trunc"; + case CEPH_CAP_OP_EXPORT: return "export"; + case CEPH_CAP_OP_IMPORT: return "import"; + case CEPH_CAP_OP_UPDATE: return "update"; + case CEPH_CAP_OP_DROP: 
return "drop"; + case CEPH_CAP_OP_FLUSH: return "flush"; + case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack"; + case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap"; + case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack"; + case CEPH_CAP_OP_RELEASE: return "release"; + case CEPH_CAP_OP_RENEW: return "renew"; + } + return "???"; +} + +const char *ceph_lease_op_name(int o) +{ + switch (o) { + case CEPH_MDS_LEASE_REVOKE: return "revoke"; + case CEPH_MDS_LEASE_RELEASE: return "release"; + case CEPH_MDS_LEASE_RENEW: return "renew"; + case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack"; + } + return "???"; +} + +const char *ceph_snap_op_name(int o) +{ + switch (o) { + case CEPH_SNAP_OP_UPDATE: return "update"; + case CEPH_SNAP_OP_CREATE: return "create"; + case CEPH_SNAP_OP_DESTROY: return "destroy"; + case CEPH_SNAP_OP_SPLIT: return "split"; + } + return "???"; +} diff --git a/fs/ceph/super.c b/fs/ceph/super.c new file mode 100644 index 00000000..f2f77fd3 --- /dev/null +++ b/fs/ceph/super.c @@ -0,0 +1,921 @@ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +#include +#include +#include +#include + +/* + * Ceph superblock operations + * + * Handle the basics of mounting, unmounting. + */ + +/* + * super ops + */ +static void ceph_put_super(struct super_block *s) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(s); + + dout("put_super\n"); + ceph_mdsc_close_sessions(fsc->mdsc); + + /* + * ensure we release the bdi before put_anon_super releases + * the device name. + */ + if (s->s_bdi == &fsc->backing_dev_info) { + bdi_unregister(&fsc->backing_dev_info); + s->s_bdi = NULL; + } + + return; +} + +static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); + struct ceph_monmap *monmap = fsc->client->monc.monmap; + struct ceph_statfs st; + u64 fsid; + int err; + + dout("statfs\n"); + err = ceph_monc_do_statfs(&fsc->client->monc, &st); + if (err < 0) + return err; + + /* fill in kstatfs */ + buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ + + /* + * express utilization in terms of large blocks to avoid + * overflow on 32-bit machines. 
+ */ + buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> + (CEPH_BLOCK_SHIFT-10); + buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + + buf->f_files = le64_to_cpu(st.num_objects); + buf->f_ffree = -1; + buf->f_namelen = NAME_MAX; + buf->f_frsize = PAGE_CACHE_SIZE; + + /* leave fsid little-endian, regardless of host endianness */ + fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); + buf->f_fsid.val[0] = fsid & 0xffffffff; + buf->f_fsid.val[1] = fsid >> 32; + + return 0; +} + + +static int ceph_sync_fs(struct super_block *sb, int wait) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + if (!wait) { + dout("sync_fs (non-blocking)\n"); + ceph_flush_dirty_caps(fsc->mdsc); + dout("sync_fs (non-blocking) done\n"); + return 0; + } + + dout("sync_fs (blocking)\n"); + ceph_osdc_sync(&fsc->client->osdc); + ceph_mdsc_sync(fsc->mdsc); + dout("sync_fs (blocking) done\n"); + return 0; +} + +/* + * mount options + */ +enum { + Opt_wsize, + Opt_rsize, + Opt_caps_wanted_delay_min, + Opt_caps_wanted_delay_max, + Opt_cap_release_safety, + Opt_readdir_max_entries, + Opt_readdir_max_bytes, + Opt_congestion_kb, + Opt_last_int, + /* int args above */ + Opt_snapdirname, + Opt_last_string, + /* string args above */ + Opt_dirstat, + Opt_nodirstat, + Opt_rbytes, + Opt_norbytes, + Opt_noasyncreaddir, + Opt_ino32, +}; + +static match_table_t fsopt_tokens = { + {Opt_wsize, "wsize=%d"}, + {Opt_rsize, "rsize=%d"}, + {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, + {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_cap_release_safety, "cap_release_safety=%d"}, + {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, + {Opt_congestion_kb, "write_congestion_kb=%d"}, + /* int args above */ + {Opt_snapdirname, "snapdirname=%s"}, + /* string args above */ + {Opt_dirstat, "dirstat"}, + {Opt_nodirstat, "nodirstat"}, + {Opt_rbytes, "rbytes"}, + {Opt_norbytes, "norbytes"}, + {Opt_noasyncreaddir, "noasyncreaddir"}, + {Opt_ino32, "ino32"}, + {-1, NULL} +}; + +static int parse_fsopt_token(char *c, void *private) +{ + struct ceph_mount_options *fsopt = private; + substring_t argstr[MAX_OPT_ARGS]; + int token, intval, ret; + + token = match_token((char *)c, fsopt_tokens, argstr); + if (token < 0) + return -EINVAL; + + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + return ret; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } + + switch (token) { + case Opt_snapdirname: + kfree(fsopt->snapdir_name); + fsopt->snapdir_name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + if (!fsopt->snapdir_name) + return -ENOMEM; + break; + + /* misc */ + case Opt_wsize: + fsopt->wsize = intval; + break; + case Opt_rsize: + fsopt->rsize = intval; + break; + case Opt_caps_wanted_delay_min: + fsopt->caps_wanted_delay_min = intval; + break; + case Opt_caps_wanted_delay_max: + fsopt->caps_wanted_delay_max = intval; + break; + case Opt_readdir_max_entries: + fsopt->max_readdir = intval; + break; + case Opt_readdir_max_bytes: + fsopt->max_readdir_bytes = intval; + break; + case Opt_congestion_kb: + fsopt->congestion_kb = intval; 
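+ /* caps dirty ceph data in flight; the default comes from
+ * default_congestion_kb() in super.h and scales with memory */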
+ break; + case Opt_dirstat: + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_nodirstat: + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_rbytes: + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_norbytes: + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_noasyncreaddir: + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + case Opt_ino32: + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + break; + default: + BUG_ON(token); + } + return 0; +} + +static void destroy_mount_options(struct ceph_mount_options *args) +{ + dout("destroy_mount_options %p\n", args); + kfree(args->snapdir_name); + kfree(args); +} + +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} + +static int compare_mount_options(struct ceph_mount_options *new_fsopt, + struct ceph_options *new_opt, + struct ceph_fs_client *fsc) +{ + struct ceph_mount_options *fsopt1 = new_fsopt; + struct ceph_mount_options *fsopt2 = fsc->mount_options; + int ofs = offsetof(struct ceph_mount_options, snapdir_name); + int ret; + + ret = memcmp(fsopt1, fsopt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); + if (ret) + return ret; + + return ceph_compare_options(new_opt, fsc->client); +} + +static int parse_mount_options(struct ceph_mount_options **pfsopt, + struct ceph_options **popt, + int flags, char *options, + const char *dev_name, + const char **path) +{ + struct ceph_mount_options *fsopt; + const char *dev_name_end; + int err = -ENOMEM; + + fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); + if (!fsopt) + return -ENOMEM; + + dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); + + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } + dev_name_end = *path; + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); + + /* path on server */ + *path += 2; + dout("server path '%s'\n", *path); + + err = ceph_parse_options(popt, options, dev_name, dev_name_end, + parse_fsopt_token, (void *)fsopt); + if (err) + goto out; + + /* success */ + *pfsopt = fsopt; + return 0; + +out: + destroy_mount_options(fsopt); + return err; +} + +/** + * ceph_show_options - Show mount options in /proc/mounts + * @m: seq_file to write to + * @mnt: mount descriptor + */ +static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); + struct ceph_mount_options *fsopt = fsc->mount_options; + struct ceph_options *opt = fsc->client->options; + + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, ",fsid=%pU", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) + seq_puts(m, ",noshare"); + if (opt->flags & CEPH_OPT_NOCRC) + seq_puts(m, ",nocrc"); + + if 
(opt->name) + seq_printf(m, ",name=%s", opt->name); + if (opt->key) + seq_puts(m, ",secret="); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); + if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) + seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + opt->osd_keepalive_timeout); + + if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) + seq_puts(m, ",dirstat"); + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) + seq_puts(m, ",norbytes"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) + seq_puts(m, ",noasyncreaddir"); + + if (fsopt->wsize) + seq_printf(m, ",wsize=%d", fsopt->wsize); + if (fsopt->rsize != CEPH_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + fsopt->caps_wanted_delay_min); + if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + fsopt->caps_wanted_delay_max); + if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + fsopt->cap_release_safety); + if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) + seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + return 0; +} + +/* + * handle any mon messages the standard library doesn't understand. + * return error if we don't either. + */ +static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) +{ + struct ceph_fs_client *fsc = client->private; + int type = le16_to_cpu(msg->hdr.type); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(fsc->mdsc, msg); + return 0; + + default: + return -1; + } +} + +/* + * create a new fs client + */ +struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, + struct ceph_options *opt) +{ + struct ceph_fs_client *fsc; + int err = -ENOMEM; + + fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); + if (!fsc) + return ERR_PTR(-ENOMEM); + + fsc->client = ceph_create_client(opt, fsc); + if (IS_ERR(fsc->client)) { + err = PTR_ERR(fsc->client); + goto fail; + } + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->supported_features |= CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; + fsc->client->monc.want_mdsmap = 1; + + fsc->mount_options = fsopt; + + fsc->sb = NULL; + fsc->mount_state = CEPH_MOUNT_MOUNTING; + + atomic_long_set(&fsc->writeback_count, 0); + + err = bdi_init(&fsc->backing_dev_info); + if (err < 0) + goto fail_client; + + err = -ENOMEM; + /* + * The number of concurrent works can be high but they don't need + * to be processed in parallel, limit concurrency. 
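+ *
+ * (The third argument to alloc_workqueue() below is max_active;
+ * passing 1 caps in-flight work items from each of these queues at
+ * one per CPU, which is how the concurrency limit is applied.)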
+ */ + fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); + if (fsc->wb_wq == NULL) + goto fail_bdi; + fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); + if (fsc->pg_inv_wq == NULL) + goto fail_wb_wq; + fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); + if (fsc->trunc_wq == NULL) + goto fail_pg_inv_wq; + + /* set up mempools */ + err = -ENOMEM; + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, + fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); + if (!fsc->wb_pagevec_pool) + goto fail_trunc_wq; + + /* caps */ + fsc->min_caps = fsopt->max_readdir; + + return fsc; + +fail_trunc_wq: + destroy_workqueue(fsc->trunc_wq); +fail_pg_inv_wq: + destroy_workqueue(fsc->pg_inv_wq); +fail_wb_wq: + destroy_workqueue(fsc->wb_wq); +fail_bdi: + bdi_destroy(&fsc->backing_dev_info); +fail_client: + ceph_destroy_client(fsc->client); +fail: + kfree(fsc); + return ERR_PTR(err); +} + +void destroy_fs_client(struct ceph_fs_client *fsc) +{ + dout("destroy_fs_client %p\n", fsc); + + destroy_workqueue(fsc->wb_wq); + destroy_workqueue(fsc->pg_inv_wq); + destroy_workqueue(fsc->trunc_wq); + + bdi_destroy(&fsc->backing_dev_info); + + mempool_destroy(fsc->wb_pagevec_pool); + + destroy_mount_options(fsc->mount_options); + + ceph_fs_debugfs_cleanup(fsc); + + ceph_destroy_client(fsc->client); + + kfree(fsc); + dout("destroy_fs_client %p done\n", fsc); +} + +/* + * caches + */ +struct kmem_cache *ceph_inode_cachep; +struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_dentry_cachep; +struct kmem_cache *ceph_file_cachep; + +static void ceph_inode_init_once(void *foo) +{ + struct ceph_inode_info *ci = foo; + inode_init_once(&ci->vfs_inode); +} + +static int __init init_caches(void) +{ + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", + sizeof(struct ceph_inode_info), + __alignof__(struct ceph_inode_info), + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + ceph_inode_init_once); + if (ceph_inode_cachep == NULL) + return -ENOMEM; + + ceph_cap_cachep = KMEM_CACHE(ceph_cap, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_cachep == NULL) + goto bad_cap; + + ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_dentry_cachep == NULL) + goto bad_dentry; + + ceph_file_cachep = KMEM_CACHE(ceph_file_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_file_cachep == NULL) + goto bad_file; + + return 0; + +bad_file: + kmem_cache_destroy(ceph_dentry_cachep); +bad_dentry: + kmem_cache_destroy(ceph_cap_cachep); +bad_cap: + kmem_cache_destroy(ceph_inode_cachep); + return -ENOMEM; +} + +static void destroy_caches(void) +{ + kmem_cache_destroy(ceph_inode_cachep); + kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_dentry_cachep); + kmem_cache_destroy(ceph_file_cachep); +} + + +/* + * ceph_umount_begin - initiate forced umount. Tear down down the + * mount, skipping steps that may hang while waiting for server(s). + */ +static void ceph_umount_begin(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + dout("ceph_umount_begin - starting forced umount\n"); + if (!fsc) + return; + fsc->mount_state = CEPH_MOUNT_SHUTDOWN; + return; +} + +static const struct super_operations ceph_super_ops = { + .alloc_inode = ceph_alloc_inode, + .destroy_inode = ceph_destroy_inode, + .write_inode = ceph_write_inode, + .sync_fs = ceph_sync_fs, + .put_super = ceph_put_super, + .show_options = ceph_show_options, + .statfs = ceph_statfs, + .umount_begin = ceph_umount_begin, +}; + +/* + * Bootstrap mount by opening the root directory. 
Note the mount + * @started time from caller, and time out if this takes too long. + */ +static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, + const char *path, + unsigned long started) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req = NULL; + int err; + struct dentry *root; + + /* open dir */ + dout("open_root_inode opening '%s'\n", path); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + req->r_path1 = kstrdup(path, GFP_NOFS); + req->r_ino1.ino = CEPH_INO_ROOT; + req->r_ino1.snap = CEPH_NOSNAP; + req->r_started = started; + req->r_timeout = fsc->client->options->mount_timeout * HZ; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == 0) { + dout("open_root_inode success\n"); + if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && + fsc->sb->s_root == NULL) + root = d_alloc_root(req->r_target_inode); + else + root = d_obtain_alias(req->r_target_inode); + req->r_target_inode = NULL; + dout("open_root_inode success, root dentry is %p\n", root); + } else { + root = ERR_PTR(err); + } + ceph_mdsc_put_request(req); + return root; +} + + + + +/* + * mount: join the ceph cluster, and open root directory. + */ +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, + const char *path) +{ + int err; + unsigned long started = jiffies; /* note the start time */ + struct dentry *root; + int first = 0; /* first vfsmount for this super_block */ + + dout("mount start\n"); + mutex_lock(&fsc->client->mount_mutex); + + err = __ceph_open_session(fsc->client, started); + if (err < 0) + goto out; + + dout("mount opening root\n"); + root = open_root_dentry(fsc, "", started); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + if (fsc->sb->s_root) { + dput(root); + } else { + fsc->sb->s_root = root; + first = 1; + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto fail; + } + + if (path[0] == 0) { + dget(root); + } else { + dout("mount opening base mountpoint\n"); + root = open_root_dentry(fsc, path, started); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto fail; + } + } + + fsc->mount_state = CEPH_MOUNT_MOUNTED; + dout("mount success\n"); + mutex_unlock(&fsc->client->mount_mutex); + return root; + +out: + mutex_unlock(&fsc->client->mount_mutex); + return ERR_PTR(err); + +fail: + if (first) { + dput(fsc->sb->s_root); + fsc->sb->s_root = NULL; + } + goto out; +} + +static int ceph_set_super(struct super_block *s, void *data) +{ + struct ceph_fs_client *fsc = data; + int ret; + + dout("set_super %p data %p\n", s, data); + + s->s_flags = fsc->mount_options->sb_flags; + s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ + + s->s_fs_info = fsc; + fsc->sb = s; + + s->s_op = &ceph_super_ops; + s->s_export_op = &ceph_export_ops; + + s->s_time_gran = 1000; /* 1000 ns == 1 us */ + + ret = set_anon_super(s, NULL); /* what is that second arg for? 
*/ + if (ret != 0) + goto fail; + + return ret; + +fail: + s->s_fs_info = NULL; + fsc->sb = NULL; + return ret; +} + +/* + * share superblock if same fs AND options + */ +static int ceph_compare_super(struct super_block *sb, void *data) +{ + struct ceph_fs_client *new = data; + struct ceph_mount_options *fsopt = new->mount_options; + struct ceph_options *opt = new->client->options; + struct ceph_fs_client *other = ceph_sb_to_client(sb); + + dout("ceph_compare_super %p\n", sb); + + if (compare_mount_options(fsopt, opt, other)) { + dout("monitor(s)/mount options don't match\n"); + return 0; + } + if ((opt->flags & CEPH_OPT_FSID) && + ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + dout("fsid doesn't match\n"); + return 0; + } + if (fsopt->sb_flags != other->mount_options->sb_flags) { + dout("flags differ\n"); + return 0; + } + return 1; +} + +/* + * construct our own bdi so we can control readahead, etc. + */ +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + +static int ceph_register_bdi(struct super_block *sb, + struct ceph_fs_client *fsc) +{ + int err; + + /* set ra_pages based on rsize mount option? */ + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + fsc->backing_dev_info.ra_pages = + (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", + atomic_long_inc_return(&bdi_seq)); + if (!err) + sb->s_bdi = &fsc->backing_dev_info; + return err; +} + +static struct dentry *ceph_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct super_block *sb; + struct ceph_fs_client *fsc; + struct dentry *res; + int err; + int (*compare_super)(struct super_block *, void *) = ceph_compare_super; + const char *path = NULL; + struct ceph_mount_options *fsopt = NULL; + struct ceph_options *opt = NULL; + + dout("ceph_mount\n"); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + if (err < 0) { + res = ERR_PTR(err); + goto out_final; + } + + /* create client (which we may/may not use) */ + fsc = create_fs_client(fsopt, opt); + if (IS_ERR(fsc)) { + res = ERR_CAST(fsc); + kfree(fsopt); + kfree(opt); + goto out_final; + } + + err = ceph_mdsc_init(fsc); + if (err < 0) { + res = ERR_PTR(err); + goto out; + } + + if (ceph_test_opt(fsc->client, NOSHARE)) + compare_super = NULL; + sb = sget(fs_type, compare_super, ceph_set_super, fsc); + if (IS_ERR(sb)) { + res = ERR_CAST(sb); + goto out; + } + + if (ceph_sb_to_client(sb) != fsc) { + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); + fsc = ceph_sb_to_client(sb); + dout("get_sb got existing client %p\n", fsc); + } else { + dout("get_sb using new client %p\n", fsc); + err = ceph_register_bdi(sb, fsc); + if (err < 0) { + res = ERR_PTR(err); + goto out_splat; + } + } + + res = ceph_real_mount(fsc, path); + if (IS_ERR(res)) + goto out_splat; + dout("root %p inode %p ino %llx.%llx\n", res, + res->d_inode, ceph_vinop(res->d_inode)); + return res; + +out_splat: + ceph_mdsc_close_sessions(fsc->mdsc); + deactivate_locked_super(sb); + goto out_final; + +out: + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); +out_final: + dout("ceph_mount fail %ld\n", PTR_ERR(res)); + return res; +} + +static void ceph_kill_sb(struct super_block *s) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(s); + dout("kill_sb %p\n", s); + ceph_mdsc_pre_umount(fsc->mdsc); + kill_anon_super(s); /* will call put_super after sb is r/o */ + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); +} + +static struct file_system_type ceph_fs_type = { + 
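+ /*
+ * Illustrative usage once this type is registered (the device-name
+ * format "ip1[:port1][,ip2[:port2]...]:/subdir/in/fs" comes from
+ * parse_mount_options() above; the address below is made up):
+ *
+ *   mount -t ceph 192.168.0.1:/ /mnt/ceph -o rsize=524288,noasyncreaddir
+ */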
.owner = THIS_MODULE, + .name = "ceph", + .mount = ceph_mount, + .kill_sb = ceph_kill_sb, + .fs_flags = FS_RENAME_DOES_D_MOVE, +}; + +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + +static int __init init_ceph(void) +{ + int ret = init_caches(); + if (ret) + goto out; + + ret = register_filesystem(&ceph_fs_type); + if (ret) + goto out_icache; + + pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); + + return 0; + +out_icache: + destroy_caches(); +out: + return ret; +} + +static void __exit exit_ceph(void) +{ + dout("exit_ceph\n"); + unregister_filesystem(&ceph_fs_type); + destroy_caches(); +} + +module_init(init_ceph); +module_exit(exit_ceph); + +MODULE_AUTHOR("Sage Weil "); +MODULE_AUTHOR("Yehuda Sadeh "); +MODULE_AUTHOR("Patience Warnick "); +MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_LICENSE("GPL"); diff --git a/fs/ceph/super.h b/fs/ceph/super.h new file mode 100644 index 00000000..f5cabefa --- /dev/null +++ b/fs/ceph/super.h @@ -0,0 +1,833 @@ +#ifndef _FS_CEPH_SUPER_H +#define _FS_CEPH_SUPER_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* f_type in struct statfs */ +#define CEPH_SUPER_MAGIC 0x00c36400 + +/* large granularity for statfs utilization stats to facilitate + * large volume sizes on 32-bit machines. */ +#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ +#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) + +#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ +#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ + +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) + +#define ceph_set_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; +#define ceph_test_mount_opt(fsc, opt) \ + (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) + +#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" + +struct ceph_mount_options { + int flags; + int sb_flags; + + int wsize; + int rsize; /* max readahead */ + int congestion_kb; /* max writeback in flight */ + int caps_wanted_delay_min, caps_wanted_delay_max; + int cap_release_safety; + int max_readdir; /* max readdir result (entires) */ + int max_readdir_bytes; /* max readdir result (bytes) */ + + /* + * everything above this point can be memcmp'd; everything below + * is handled in compare_mount_options() + */ + + char *snapdir_name; /* default ".snap" */ +}; + +struct ceph_fs_client { + struct super_block *sb; + + struct ceph_mount_options *mount_options; + struct ceph_client *client; + + unsigned long mount_state; + int min_caps; /* min caps i added */ + + struct ceph_mds_client *mdsc; + + /* writeback */ + mempool_t *wb_pagevec_pool; + struct workqueue_struct *wb_wq; + struct workqueue_struct *pg_inv_wq; + struct workqueue_struct *trunc_wq; + atomic_long_t writeback_count; + + struct backing_dev_info backing_dev_info; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_congestion_kb; + struct dentry *debugfs_bdi; + struct dentry *debugfs_mdsc, *debugfs_mdsmap; +#endif +}; + + +/* + * File i/o capability. This tracks shared state with the metadata + * server that allows us to cache or writeback attributes or to read + * and write data. 
For any given inode, we should have one or more + * capabilities, one issued by each metadata server, and our + * cumulative access is the OR of all issued capabilities. + * + * Each cap is referenced by the inode's i_caps rbtree and by per-mds + * session capability lists. + */ +struct ceph_cap { + struct ceph_inode_info *ci; + struct rb_node ci_node; /* per-ci cap tree */ + struct ceph_mds_session *session; + struct list_head session_caps; /* per-session caplist */ + int mds; + u64 cap_id; /* unique cap id (mds provided) */ + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of issued (for revocation) */ + int mds_wanted; + u32 seq, issue_seq, mseq; + u32 cap_gen; /* active/stale cycle */ + unsigned long last_used; + struct list_head caps_item; +}; + +#define CHECK_CAPS_NODELAY 1 /* do not delay any further */ +#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ +#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ + +/* + * Snapped cap state that is pending flush to mds. When a snapshot occurs, + * we first complete any in-process sync writes and writeback any dirty + * data before flushing the snapped state (tracked here) back to the MDS. + */ +struct ceph_cap_snap { + atomic_t nref; + struct ceph_inode_info *ci; + struct list_head ci_item, flushing_item; + + u64 follows, flush_tid; + int issued, dirty; + struct ceph_snap_context *context; + + mode_t mode; + uid_t uid; + gid_t gid; + + struct ceph_buffer *xattr_blob; + u64 xattr_version; + + u64 size; + struct timespec mtime, atime, ctime; + u64 time_warp_seq; + int writing; /* a sync write is still in progress */ + int dirty_pages; /* dirty pages awaiting writeback */ +}; + +static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (atomic_dec_and_test(&capsnap->nref)) { + if (capsnap->xattr_blob) + ceph_buffer_put(capsnap->xattr_blob); + kfree(capsnap); + } +} + +/* + * The frag tree describes how a directory is fragmented, potentially across + * multiple metadata servers. It is also used to indicate points where + * metadata authority is delegated, and whether/where metadata is replicated. + * + * A _leaf_ frag will be present in the i_fragtree IFF there is + * delegation info. That is, if mds >= 0 || ndist > 0. + */ +#define CEPH_MAX_DIRFRAG_REP 4 + +struct ceph_inode_frag { + struct rb_node node; + + /* fragtree state */ + u32 frag; + int split_by; /* i.e. 2^(split_by) children */ + + /* delegation and replication info */ + int mds; /* -1 if same authority as parent */ + int ndist; /* >0 if replicated */ + int dist[CEPH_MAX_DIRFRAG_REP]; +}; + +/* + * We cache inode xattrs as an encoded blob until they are first used, + * at which point we parse them into an rbtree. + */ +struct ceph_inode_xattr { + struct rb_node node; + + const char *name; + int name_len; + const char *val; + int val_len; + int dirty; + + int should_free_name; + int should_free_val; +}; + +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct ceph_mds_session *lease_session; + u32 lease_gen, lease_shared_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + struct list_head lru; + struct dentry *dentry; + u64 time; + u64 offset; +}; + +struct ceph_inode_xattrs_info { + /* + * (still encoded) xattr blob. we avoid the overhead of parsing + * this until someone actually calls getxattr, etc. + * + * blob->vec.iov_len == 4 implies there are no xattrs; blob == + * NULL means we don't know. 
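+ *
+ * (The 4 bytes are presumably just the encoded 32-bit entry count,
+ * i.e. a count of zero with nothing following it.)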
+ */ + struct ceph_buffer *blob, *prealloc_blob; + + struct rb_root index; + bool dirty; + int count; + int names_size; + int vals_size; + u64 version, index_version; +}; + +/* + * Ceph inode. + */ +struct ceph_inode_info { + struct ceph_vino i_vino; /* ceph ino + snap */ + + u64 i_version; + u32 i_time_warp_seq; + + unsigned i_ceph_flags; + unsigned long i_release_count; + + struct ceph_dir_layout i_dir_layout; + struct ceph_file_layout i_layout; + char *i_symlink; + + /* for dirs */ + struct timespec i_rctime; + u64 i_rbytes, i_rfiles, i_rsubdirs; + u64 i_files, i_subdirs; + u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ + + struct rb_root i_fragtree; + struct mutex i_fragtree_mutex; + + struct ceph_inode_xattrs_info i_xattrs; + + /* capabilities. protected _both_ by i_lock and cap->session's + * s_mutex. */ + struct rb_root i_caps; /* cap list */ + struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ + unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ + struct list_head i_dirty_item, i_flushing_item; + u64 i_cap_flush_seq; + /* we need to track cap writeback on a per-cap-bit basis, to allow + * overlapping, pipelined cap flushes to the mds. we can probably + * reduce the tid to 8 bits if we're concerned about inode size. */ + u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; + wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ + unsigned long i_hold_caps_min; /* jiffies */ + unsigned long i_hold_caps_max; /* jiffies */ + struct list_head i_cap_delay_list; /* for delayed cap release to mds */ + int i_cap_exporting_mds; /* to handle cap migration between */ + unsigned i_cap_exporting_mseq; /* mds's. */ + unsigned i_cap_exporting_issued; + struct ceph_cap_reservation i_cap_migration_resv; + struct list_head i_cap_snaps; /* snapped state pending flush to mds */ + struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or + dirty|flushing caps */ + unsigned i_snap_caps; /* cap bits for snapped files */ + + int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + + u32 i_truncate_seq; /* last truncate to smaller size */ + u64 i_truncate_size; /* and the size we last truncated down to */ + int i_truncate_pending; /* still need to call vmtruncate */ + + u64 i_max_size; /* max file size authorized by mds */ + u64 i_reported_size; /* (max_)size reported to or requested of mds */ + u64 i_wanted_max_size; /* offset we'd like to write too */ + u64 i_requested_max_size; /* max_size we've requested */ + + /* held references to caps */ + int i_pin_ref; + int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; + int i_wrbuffer_ref, i_wrbuffer_ref_head; + u32 i_shared_gen; /* increment each time we get FILE_SHARED */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. 
*/ + u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ + + struct list_head i_unsafe_writes; /* uncommitted sync writes */ + struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ + spinlock_t i_unsafe_lock; + + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + int i_snap_realm_counter; /* snap realm (if caps) */ + struct list_head i_snap_realm_item; + struct list_head i_snap_flush_item; + + struct work_struct i_wb_work; /* writeback work */ + struct work_struct i_pg_inv_work; /* page invalidation work */ + + struct work_struct i_vmtruncate_work; + + struct inode vfs_inode; /* at end */ +}; + +static inline struct ceph_inode_info *ceph_inode(struct inode *inode) +{ + return container_of(inode, struct ceph_inode_info, vfs_inode); +} + +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) +{ + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; +} + +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) +{ + return (struct ceph_fs_client *)sb->s_fs_info; +} + +static inline struct ceph_vino ceph_vino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino; +} + +/* + * ino_t is <64 bits on many architectures, blech. + * + * i_ino (kernel inode) st_ino (userspace) + * i386 32 32 + * x86_64+ino32 64 32 + * x86_64 64 64 + */ +static inline u32 ceph_ino_to_ino32(ino_t ino) +{ + ino ^= ino >> (sizeof(ino) * 8 - 32); + if (!ino) + ino = 1; + return ino; +} + +/* + * kernel i_ino value + */ +static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +{ + ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ +#if BITS_PER_LONG == 32 + ino = ceph_ino_to_ino32(ino); +#endif + return ino; +} + +/* + * user-visible ino (stat, filldir) + */ +#if BITS_PER_LONG == 32 +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ + return ino; +} +#else +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ + if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) + ino = ceph_ino_to_ino32(ino); + return ino; +} +#endif + + +/* for printf-style formatting */ +#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap + +static inline u64 ceph_ino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.ino; +} +static inline u64 ceph_snap(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.snap; +} + +static inline int ceph_ino_compare(struct inode *inode, void *data) +{ + struct ceph_vino *pvino = (struct ceph_vino *)data; + struct ceph_inode_info *ci = ceph_inode(inode); + return ci->i_vino.ino == pvino->ino && + ci->i_vino.snap == pvino->snap; +} + +static inline struct inode *ceph_find_inode(struct super_block *sb, + struct ceph_vino vino) +{ + ino_t t = ceph_vino_to_ino(vino); + return ilookup5(sb, t, ceph_ino_compare, &vino); +} + + +/* + * Ceph inode. 
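+ *
+ * The CEPH_I_* values below are bit masks for i_ceph_flags; they are
+ * set, cleared and tested under i_lock via the ceph_i_set(),
+ * ceph_i_clear() and ceph_i_test() helpers that follow.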
+ */ +#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ + +static inline void ceph_i_clear(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~mask; + spin_unlock(&inode->i_lock); +} + +static inline void ceph_i_set(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&inode->i_lock); + ci->i_ceph_flags |= mask; + spin_unlock(&inode->i_lock); +} + +static inline bool ceph_i_test(struct inode *inode, unsigned mask) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool r; + + spin_lock(&inode->i_lock); + r = (ci->i_ceph_flags & mask) == mask; + spin_unlock(&inode->i_lock); + return r; +} + + +/* find a specific frag @f */ +extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, + u32 f); + +/* + * choose fragment for value @v. copy frag content to pfrag, if leaf + * exists + */ +extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found); + +static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +{ + return (struct ceph_dentry_info *)dentry->d_fsdata; +} + +static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) +{ + return ((loff_t)frag << 32) | (loff_t)off; +} + +/* + * caps helpers + */ +static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); +extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); +extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, + struct ceph_cap *cap); + +static inline int ceph_caps_issued(struct ceph_inode_info *ci) +{ + int issued; + spin_lock(&ci->vfs_inode.i_lock); + issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->vfs_inode.i_lock); + return issued; +} + +static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, + int touch) +{ + int r; + spin_lock(&ci->vfs_inode.i_lock); + r = __ceph_caps_issued_mask(ci, mask, touch); + spin_unlock(&ci->vfs_inode.i_lock); + return r; +} + +static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) +{ + return ci->i_dirty_caps | ci->i_flushing_caps; +} +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); + +extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_used(struct ceph_inode_info *ci); + +extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); + +/* + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) + */ +static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) +{ + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); + if (w & CEPH_CAP_FILE_BUFFER) + w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */ + return w; +} + +/* what the mds thinks we want */ +extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); + +extern void ceph_caps_init(struct ceph_mds_client *mdsc); +extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); +extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); +extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need); +extern int ceph_unreserve_caps(struct 
ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_reservation_status(struct ceph_fs_client *client, + int *total, int *avail, int *used, + int *reserved, int *min); + + + +/* + * we keep buffered readdir results attached to file->private_data + */ +struct ceph_file_info { + int fmode; /* initialized on open */ + + /* readdir: position within the dir */ + u32 frag; + struct ceph_mds_request *last_readdir; + int at_end; + + /* readdir: position within a frag */ + unsigned offset; /* offset of last chunk, adjusted for . and .. */ + u64 next_offset; /* offset of next chunk (last_name's + 1) */ + char *last_name; /* last entry in previous chunk */ + struct dentry *dentry; /* next dentry (for dcache readdir) */ + unsigned long dir_release_count; + + /* used for -o dirstat read() on directory thing */ + char *dir_info; + int dir_info_len; +}; + + + +/* + * A "snap realm" describes a subset of the file hierarchy sharing + * the same set of snapshots that apply to it. The realms themselves + * are organized into a hierarchy, such that children inherit (some of) + * the snapshots of their parents. + * + * All inodes within the realm that have capabilities are linked into a + * per-realm list. + */ +struct ceph_snap_realm { + u64 ino; + atomic_t nref; + struct rb_node node; + + u64 created, seq; + u64 parent_ino; + u64 parent_since; /* snapid when our current parent became so */ + + u64 *prior_parent_snaps; /* snaps inherited from any parents we */ + int num_prior_parent_snaps; /* had prior to parent_since */ + u64 *snaps; /* snaps specific to this realm */ + int num_snaps; + + struct ceph_snap_realm *parent; + struct list_head children; /* list of child realms */ + struct list_head child_item; + + struct list_head empty_item; /* if i have ref==0 */ + + struct list_head dirty_item; /* if realm needs new context */ + + /* the current set of snaps for this realm */ + struct ceph_snap_context *cached_context; + + struct list_head inodes_with_caps; + spinlock_t inodes_with_caps_lock; +}; + +static inline int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} + + + +/* snap.c */ +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino); +extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm); +extern int ceph_update_snap_trace(struct ceph_mds_client *m, + void *p, void *e, bool deletion); +extern void ceph_handle_snap(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg); +extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); +extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap); +extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); + +/* + * a cap_snap is "pending" if it is still awaiting an in-progress + * sync write (that may/may not still update size, mtime, etc.). 
+ */ +static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) +{ + return !list_empty(&ci->i_cap_snaps) && + list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, + ci_item)->writing; +} + +/* inode.c */ +extern const struct inode_operations ceph_file_iops; + +extern struct inode *ceph_alloc_inode(struct super_block *sb); +extern void ceph_destroy_inode(struct inode *inode); + +extern struct inode *ceph_get_inode(struct super_block *sb, + struct ceph_vino vino); +extern struct inode *ceph_get_snapdir(struct inode *parent); +extern int ceph_fill_file_size(struct inode *inode, int issued, + u32 truncate_seq, u64 truncate_size, u64 size); +extern void ceph_fill_file_time(struct inode *inode, int issued, + u64 time_warp_seq, struct timespec *ctime, + struct timespec *mtime, struct timespec *atime); +extern int ceph_fill_trace(struct super_block *sb, + struct ceph_mds_request *req, + struct ceph_mds_session *session); +extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, + struct ceph_mds_session *session); + +extern int ceph_inode_holds_cap(struct inode *inode, int mask); + +extern int ceph_inode_set_size(struct inode *inode, loff_t size); +extern void __ceph_do_pending_vmtruncate(struct inode *inode); +extern void ceph_queue_vmtruncate(struct inode *inode); + +extern void ceph_queue_invalidate(struct inode *inode); +extern void ceph_queue_writeback(struct inode *inode); + +extern int ceph_do_getattr(struct inode *inode, int mask); +extern int ceph_permission(struct inode *inode, int mask, unsigned int flags); +extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); +extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +extern int ceph_setxattr(struct dentry *, const char *, const void *, + size_t, int); +extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); +extern int ceph_removexattr(struct dentry *, const char *); +extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); +extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); + +/* caps.c */ +extern const char *ceph_cap_string(int c); +extern void ceph_handle_caps(struct ceph_mds_session *session, + struct ceph_msg *msg); +extern int ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap_reservation *caps_reservation); +extern void __ceph_remove_cap(struct ceph_cap *cap); +static inline void ceph_remove_cap(struct ceph_cap *cap) +{ + struct inode *inode = &cap->ci->vfs_inode; + spin_lock(&inode->i_lock); + __ceph_remove_cap(cap); + spin_unlock(&inode->i_lock); +} +extern void ceph_put_cap(struct ceph_mds_client *mdsc, + struct ceph_cap *cap); + +extern void ceph_queue_caps_release(struct inode *inode); +extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); +extern int ceph_fsync(struct file *file, int datasync); +extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, + int mds); +extern int ceph_get_cap_mds(struct inode *inode); +extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); +extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); +extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, + 
struct ceph_snap_context *snapc); +extern void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession, + int again); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, + struct ceph_mds_session *session); +extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); +extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); + +extern int ceph_encode_inode_release(void **p, struct inode *inode, + int mds, int drop, int unless, int force); +extern int ceph_encode_dentry_release(void **p, struct dentry *dn, + int mds, int drop, int unless); + +extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, + int *got, loff_t endoff); + +/* for counting open files by mode */ +static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) +{ + ci->i_nr_by_mode[mode]++; +} +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); + +/* addr.c */ +extern const struct address_space_operations ceph_aops; +extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); + +/* file.c */ +extern const struct file_operations ceph_file_fops; +extern const struct address_space_operations ceph_aops; +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); +extern int ceph_open(struct inode *inode, struct file *file); +extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, + struct nameidata *nd, int mode, + int locked_dir); +extern int ceph_release(struct inode *inode, struct file *filp); + +/* dir.c */ +extern const struct file_operations ceph_dir_fops; +extern const struct inode_operations ceph_dir_iops; +extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, + ceph_snapdir_dentry_ops; + +extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err); + +extern void ceph_dentry_lru_add(struct dentry *dn); +extern void ceph_dentry_lru_touch(struct dentry *dn); +extern void ceph_dentry_lru_del(struct dentry *dn); +extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned ceph_dentry_hash(struct dentry *dn); + +/* + * our d_ops vary depending on whether the inode is live, + * snapshotted (read-only), or a virtual ".snap" directory. 
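+ * (Presumably the three cases map, in order, to ceph_dentry_ops,
+ * ceph_snap_dentry_ops and ceph_snapdir_dentry_ops declared above;
+ * ceph_init_dentry() chooses between them.)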
+ */ +int ceph_init_dentry(struct dentry *dentry); + + +/* ioctl.c */ +extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* export.c */ +extern const struct export_operations ceph_export_ops; + +/* locks.c */ +extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); +extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); +extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); +extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, + int p_locks, int f_locks); +extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); + +static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) +{ + if (dentry && dentry->d_parent) + return dentry->d_parent->d_inode; + + return NULL; +} + +/* debugfs.c */ +extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); +extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); + +#endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c new file mode 100644 index 00000000..f42d730f --- /dev/null +++ b/fs/ceph/xattr.c @@ -0,0 +1,854 @@ +#include + +#include "super.h" +#include "mds_client.h" + +#include + +#include +#include + +static bool ceph_is_valid_xattr(const char *name) +{ + return !strncmp(name, "ceph.", 5) || + !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +/* + * These define virtual xattrs exposing the recursive directory + * statistics and layout metadata. + */ +struct ceph_vxattr_cb { + bool readonly; + char *name; + size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, + size_t size); +}; + +/* directories */ + +static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); +} + +static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_files); +} + +static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_subdirs); +} + +static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); +} + +static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rfiles); +} + +static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rsubdirs); +} + +static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%lld", ci->i_rbytes); +} + +static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, + (long)ci->i_rctime.tv_nsec); +} + +static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { + { true, "ceph.dir.entries", ceph_vxattrcb_entries}, + { true, "ceph.dir.files", ceph_vxattrcb_files}, + { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, + { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, + { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, + { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, + { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, + { true, 
"ceph.dir.rctime", ceph_vxattrcb_rctime}, + { true, NULL, NULL } +}; + +/* files */ + +static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int ret; + + ret = snprintf(val, size, + "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + if (ceph_file_layout_pg_preferred(ci->i_layout)) + ret += snprintf(val + ret, size, "preferred_osd=%lld\n", + (unsigned long long)ceph_file_layout_pg_preferred( + ci->i_layout)); + return ret; +} + +static struct ceph_vxattr_cb ceph_file_vxattrs[] = { + { true, "ceph.layout", ceph_vxattrcb_layout}, + { NULL, NULL } +}; + +static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + return ceph_dir_vxattrs; + else if (S_ISREG(inode->i_mode)) + return ceph_file_vxattrs; + return NULL; +} + +static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, + const char *name) +{ + do { + if (strcmp(vxattr->name, name) == 0) + return vxattr; + vxattr++; + } while (vxattr->name); + return NULL; +} + +static int __set_xattr(struct ceph_inode_info *ci, + const char *name, int name_len, + const char *val, int val_len, + int dirty, + int should_free_name, int should_free_val, + struct ceph_inode_xattr **newxattr) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int c; + int new = 0; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, min(name_len, xattr->name_len)); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else { + if (name_len == xattr->name_len) + break; + else if (name_len < xattr->name_len) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + xattr = NULL; + } + + if (!xattr) { + new = 1; + xattr = *newxattr; + xattr->name = name; + xattr->name_len = name_len; + xattr->should_free_name = should_free_name; + + ci->i_xattrs.count++; + dout("__set_xattr count=%d\n", ci->i_xattrs.count); + } else { + kfree(*newxattr); + *newxattr = NULL; + if (xattr->should_free_val) + kfree((void *)xattr->val); + + if (should_free_name) { + kfree((void *)name); + name = xattr->name; + } + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + } + ci->i_xattrs.names_size += name_len; + ci->i_xattrs.vals_size += val_len; + if (val) + xattr->val = val; + else + xattr->val = ""; + + xattr->val_len = val_len; + xattr->dirty = dirty; + xattr->should_free_val = (val && should_free_val); + + if (new) { + rb_link_node(&xattr->node, parent, p); + rb_insert_color(&xattr->node, &ci->i_xattrs.index); + dout("__set_xattr_val p=%p\n", p); + } + + dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n", + ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val); + + return 0; +} + +static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, + const char *name) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct ceph_inode_xattr *xattr = NULL; + int name_len = strlen(name); + int c; + + p = &ci->i_xattrs.index.rb_node; + while (*p) { + parent = *p; + xattr = rb_entry(parent, struct ceph_inode_xattr, node); + c = strncmp(name, xattr->name, xattr->name_len); + if (c == 0 && name_len > xattr->name_len) + c = 1; + if (c < 0) + p = &(*p)->rb_left; + else if 
(c > 0) + p = &(*p)->rb_right; + else { + dout("__get_xattr %s: found %.*s\n", name, + xattr->val_len, xattr->val); + return xattr; + } + } + + dout("__get_xattr %s: not found\n", name); + + return NULL; +} + +static void __free_xattr(struct ceph_inode_xattr *xattr) +{ + BUG_ON(!xattr); + + if (xattr->should_free_name) + kfree((void *)xattr->name); + if (xattr->should_free_val) + kfree((void *)xattr->val); + + kfree(xattr); +} + +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr) +{ + if (!xattr) + return -EOPNOTSUPP; + + rb_erase(&xattr->node, &ci->i_xattrs.index); + + if (xattr->should_free_name) + kfree((void *)xattr->name); + if (xattr->should_free_val) + kfree((void *)xattr->val); + + ci->i_xattrs.names_size -= xattr->name_len; + ci->i_xattrs.vals_size -= xattr->val_len; + ci->i_xattrs.count--; + kfree(xattr); + + return 0; +} + +static int __remove_xattr_by_name(struct ceph_inode_info *ci, + const char *name) +{ + struct rb_node **p; + struct ceph_inode_xattr *xattr; + int err; + + p = &ci->i_xattrs.index.rb_node; + xattr = __get_xattr(ci, name); + err = __remove_xattr(ci, xattr); + return err; +} + +static char *__copy_xattr_names(struct ceph_inode_info *ci, + char *dest) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + memcpy(dest, xattr->name, xattr->name_len); + dest[xattr->name_len] = '\0'; + + dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, + xattr->name_len, ci->i_xattrs.names_size); + + dest += xattr->name_len + 1; + p = rb_next(p); + } + + return dest; +} + +void __ceph_destroy_xattrs(struct ceph_inode_info *ci) +{ + struct rb_node *p, *tmp; + struct ceph_inode_xattr *xattr = NULL; + + p = rb_first(&ci->i_xattrs.index); + + dout("__ceph_destroy_xattrs p=%p\n", p); + + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + tmp = p; + p = rb_next(tmp); + dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, + xattr->name_len, xattr->name); + rb_erase(tmp, &ci->i_xattrs.index); + + __free_xattr(xattr); + } + + ci->i_xattrs.names_size = 0; + ci->i_xattrs.vals_size = 0; + ci->i_xattrs.index_version = 0; + ci->i_xattrs.count = 0; + ci->i_xattrs.index = RB_ROOT; +} + +static int __build_xattrs(struct inode *inode) + __releases(inode->i_lock) + __acquires(inode->i_lock) +{ + u32 namelen; + u32 numattr = 0; + void *p, *end; + u32 len; + const char *name, *val; + struct ceph_inode_info *ci = ceph_inode(inode); + int xattr_version; + struct ceph_inode_xattr **xattrs = NULL; + int err = 0; + int i; + + dout("__build_xattrs() len=%d\n", + ci->i_xattrs.blob ? 
(int)ci->i_xattrs.blob->vec.iov_len : 0); + + if (ci->i_xattrs.index_version >= ci->i_xattrs.version) + return 0; /* already built */ + + __ceph_destroy_xattrs(ci); + +start: + /* updated internal xattr rb tree */ + if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) { + p = ci->i_xattrs.blob->vec.iov_base; + end = p + ci->i_xattrs.blob->vec.iov_len; + ceph_decode_32_safe(&p, end, numattr, bad); + xattr_version = ci->i_xattrs.version; + spin_unlock(&inode->i_lock); + + xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), + GFP_NOFS); + err = -ENOMEM; + if (!xattrs) + goto bad_lock; + memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); + for (i = 0; i < numattr; i++) { + xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), + GFP_NOFS); + if (!xattrs[i]) + goto bad_lock; + } + + spin_lock(&inode->i_lock); + if (ci->i_xattrs.version != xattr_version) { + /* lost a race, retry */ + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + goto start; + } + err = -EIO; + while (numattr--) { + ceph_decode_32_safe(&p, end, len, bad); + namelen = len; + name = p; + p += len; + ceph_decode_32_safe(&p, end, len, bad); + val = p; + p += len; + + err = __set_xattr(ci, name, namelen, val, len, + 0, 0, 0, &xattrs[numattr]); + + if (err < 0) + goto bad; + } + kfree(xattrs); + } + ci->i_xattrs.index_version = ci->i_xattrs.version; + ci->i_xattrs.dirty = false; + + return err; +bad_lock: + spin_lock(&inode->i_lock); +bad: + if (xattrs) { + for (i = 0; i < numattr; i++) + kfree(xattrs[i]); + kfree(xattrs); + } + ci->i_xattrs.names_size = 0; + return err; +} + +static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, + int val_size) +{ + /* + * 4 bytes for the length, and additional 4 bytes per each xattr name, + * 4 bytes per each value + */ + int size = 4 + ci->i_xattrs.count*(4 + 4) + + ci->i_xattrs.names_size + + ci->i_xattrs.vals_size; + dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", + ci->i_xattrs.count, ci->i_xattrs.names_size, + ci->i_xattrs.vals_size); + + if (name_size) + size += 4 + 4 + name_size + val_size; + + return size; +} + +/* + * If there are dirty xattrs, reencode xattrs into the prealloc_blob + * and swap into place. 
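+ *
+ * The encoded layout matches what __build_xattrs() decodes: a 32-bit xattr
+ * count, then for each xattr a 32-bit name length, the name bytes, a 32-bit
+ * value length and the value bytes (hence the sizing in
+ * __get_required_blob_size()).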
+ */ +void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) +{ + struct rb_node *p; + struct ceph_inode_xattr *xattr = NULL; + void *dest; + + dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + if (ci->i_xattrs.dirty) { + int need = __get_required_blob_size(ci, 0, 0); + + BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len); + + p = rb_first(&ci->i_xattrs.index); + dest = ci->i_xattrs.prealloc_blob->vec.iov_base; + + ceph_encode_32(&dest, ci->i_xattrs.count); + while (p) { + xattr = rb_entry(p, struct ceph_inode_xattr, node); + + ceph_encode_32(&dest, xattr->name_len); + memcpy(dest, xattr->name, xattr->name_len); + dest += xattr->name_len; + ceph_encode_32(&dest, xattr->val_len); + memcpy(dest, xattr->val, xattr->val_len); + dest += xattr->val_len; + + p = rb_next(p); + } + + /* adjust buffer len; it may be larger than we need */ + ci->i_xattrs.prealloc_blob->vec.iov_len = + dest - ci->i_xattrs.prealloc_blob->vec.iov_base; + + if (ci->i_xattrs.blob) + ceph_buffer_put(ci->i_xattrs.blob); + ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; + ci->i_xattrs.prealloc_blob = NULL; + ci->i_xattrs.dirty = false; + ci->i_xattrs.version++; + } +} + +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int err; + struct ceph_inode_xattr *xattr; + struct ceph_vxattr_cb *vxattr = NULL; + + if (!ceph_is_valid_xattr(name)) + return -ENODATA; + + /* let's see if a virtual xattr was requested */ + if (vxattrs) + vxattr = ceph_match_vxattr(vxattrs, name); + + spin_lock(&inode->i_lock); + dout("getxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && + (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { + goto get_xattr; + } else { + spin_unlock(&inode->i_lock); + /* get xattrs from mds (if we don't already have them) */ + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + if (err) + return err; + } + + spin_lock(&inode->i_lock); + + if (vxattr && vxattr->readonly) { + err = vxattr->getxattr_cb(ci, value, size); + goto out; + } + + err = __build_xattrs(inode); + if (err < 0) + goto out; + +get_xattr: + err = -ENODATA; /* == ENOATTR */ + xattr = __get_xattr(ci, name); + if (!xattr) { + if (vxattr) + err = vxattr->getxattr_cb(ci, value, size); + goto out; + } + + err = -ERANGE; + if (size && size < xattr->val_len) + goto out; + + err = xattr->val_len; + if (size == 0) + goto out; + + memcpy(value, xattr->val, xattr->val_len); + +out: + spin_unlock(&inode->i_lock); + return err; +} + +ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + u32 vir_namelen = 0; + u32 namelen; + int err; + u32 len; + int i; + + spin_lock(&inode->i_lock); + dout("listxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && + (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { + goto list_xattr; + } else { + spin_unlock(&inode->i_lock); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + if (err) + return err; + } + + spin_lock(&inode->i_lock); + + err = __build_xattrs(inode); + if (err < 0) + goto out; + +list_xattr: + vir_namelen = 0; + /* 
include virtual dir xattrs */ + if (vxattrs) + for (i = 0; vxattrs[i].name; i++) + vir_namelen += strlen(vxattrs[i].name) + 1; + /* adding 1 byte per each variable due to the null termination */ + namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; + err = -ERANGE; + if (size && namelen > size) + goto out; + + err = namelen; + if (size == 0) + goto out; + + names = __copy_xattr_names(ci, names); + + /* virtual xattr names, too */ + if (vxattrs) + for (i = 0; vxattrs[i].name; i++) { + len = sprintf(names, "%s", vxattrs[i].name); + names += len + 1; + } + +out: + spin_unlock(&inode->i_lock); + return err; +} + +static int ceph_sync_setxattr(struct dentry *dentry, const char *name, + const char *value, size_t size, int flags) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct inode *parent_inode = dentry->d_parent->d_inode; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = fsc->mdsc; + int err; + int i, nr_pages; + struct page **pages = NULL; + void *kaddr; + + /* copy value into some pages */ + nr_pages = calc_pages_for(0, size); + if (nr_pages) { + pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS); + if (!pages) + return -ENOMEM; + err = -ENOMEM; + for (i = 0; i < nr_pages; i++) { + pages[i] = __page_cache_alloc(GFP_NOFS); + if (!pages[i]) { + nr_pages = i; + goto out; + } + kaddr = kmap(pages[i]); + memcpy(kaddr, value + i*PAGE_CACHE_SIZE, + min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE)); + } + } + + dout("setxattr value=%.*s\n", (int)size, value); + + /* do request */ + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + req->r_num_caps = 1; + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_path2 = kstrdup(name, GFP_NOFS); + + req->r_pages = pages; + req->r_num_pages = nr_pages; + req->r_data_len = size; + + dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + ceph_mdsc_put_request(req); + dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); + +out: + if (pages) { + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kfree(pages); + } + return err; +} + +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int err; + int name_len = strlen(name); + int val_len = size; + char *newname = NULL; + char *newval = NULL; + struct ceph_inode_xattr *xattr = NULL; + int issued; + int required_blob_size; + int dirty; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (vxattrs) { + struct ceph_vxattr_cb *vxattr = + ceph_match_vxattr(vxattrs, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + } + + /* preallocate memory for xattr name, value, index node */ + err = -ENOMEM; + newname = kmemdup(name, name_len + 1, GFP_NOFS); + if (!newname) + goto out; + + if (val_len) { + newval = kmalloc(val_len + 1, GFP_NOFS); + if (!newval) + goto out; + memcpy(newval, value, val_len); + newval[val_len] = '\0'; + } + + xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); + if (!xattr) + goto out; + + 
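+	/*
+	 * Fast path: while holding the XATTR_EXCL cap we can update the
+	 * cached xattrs locally and mark them dirty; otherwise fall through
+	 * to do_sync and send a synchronous SETXATTR to the MDS.  If the
+	 * preallocated blob is too small, drop i_lock, allocate a larger
+	 * one and retry.
+	 */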
spin_lock(&inode->i_lock); +retry: + issued = __ceph_caps_issued(ci, NULL); + if (!(issued & CEPH_CAP_XATTR_EXCL)) + goto do_sync; + __build_xattrs(inode); + + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob = NULL; + + spin_unlock(&inode->i_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&inode->i_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); + err = __set_xattr(ci, newname, name_len, newval, + val_len, 1, 1, 1, &xattr); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + spin_unlock(&inode->i_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + return err; + +do_sync: + spin_unlock(&inode->i_lock); + err = ceph_sync_setxattr(dentry, name, value, size, flags); +out: + kfree(newname); + kfree(newval); + kfree(xattr); + return err; +} + +static int ceph_send_removexattr(struct dentry *dentry, const char *name) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct inode *inode = dentry->d_inode; + struct inode *parent_inode = dentry->d_parent->d_inode; + struct ceph_mds_request *req; + int err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, + USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + req->r_num_caps = 1; + req->r_path2 = kstrdup(name, GFP_NOFS); + + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + ceph_mdsc_put_request(req); + return err; +} + +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int issued; + int err; + int dirty; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (vxattrs) { + struct ceph_vxattr_cb *vxattr = + ceph_match_vxattr(vxattrs, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + } + + spin_lock(&inode->i_lock); + __build_xattrs(inode); + issued = __ceph_caps_issued(ci, NULL); + dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (!(issued & CEPH_CAP_XATTR_EXCL)) + goto do_sync; + + err = __remove_xattr_by_name(ceph_inode(inode), name); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + + spin_unlock(&inode->i_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + return err; +do_sync: + spin_unlock(&inode->i_lock); + err = ceph_send_removexattr(dentry, name); + return err; +} + diff --git a/fs/char_dev.c b/fs/char_dev.c new file mode 100644 index 00000000..dca9e5e0 --- /dev/null +++ b/fs/char_dev.c @@ -0,0 +1,575 @@ +/* + * linux/fs/char_dev.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * capabilities for /dev/mem, /dev/kmem and similar 
directly mappable character + * devices + * - permits shared-mmap for read, write and/or exec + * - does not permit private mmap in NOMMU mode (can't do COW) + * - no readahead or I/O queue unplugging required + */ +struct backing_dev_info directly_mappable_cdev_bdi = { + .name = "char", + .capabilities = ( +#ifdef CONFIG_MMU + /* permit private copies of the data to be taken */ + BDI_CAP_MAP_COPY | +#endif + /* permit direct mmap, for read, write or exec */ + BDI_CAP_MAP_DIRECT | + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP | + /* no writeback happens */ + BDI_CAP_NO_ACCT_AND_WRITEBACK), +}; + +static struct kobj_map *cdev_map; + +static DEFINE_MUTEX(chrdevs_lock); + +static struct char_device_struct { + struct char_device_struct *next; + unsigned int major; + unsigned int baseminor; + int minorct; + char name[64]; + struct cdev *cdev; /* will die */ +} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; + +/* index in the above */ +static inline int major_to_index(unsigned major) +{ + return major % CHRDEV_MAJOR_HASH_SIZE; +} + +#ifdef CONFIG_PROC_FS + +void chrdev_show(struct seq_file *f, off_t offset) +{ + struct char_device_struct *cd; + + if (offset < CHRDEV_MAJOR_HASH_SIZE) { + mutex_lock(&chrdevs_lock); + for (cd = chrdevs[offset]; cd; cd = cd->next) + seq_printf(f, "%3d %s\n", cd->major, cd->name); + mutex_unlock(&chrdevs_lock); + } +} + +#endif /* CONFIG_PROC_FS */ + +/* + * Register a single major with a specified minor range. + * + * If major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If major > 0 this function will attempt to reserve the passed range of + * minors and will return zero on success. + * + * Returns a -ve errno on failure. + */ +static struct char_device_struct * +__register_chrdev_region(unsigned int major, unsigned int baseminor, + int minorct, const char *name) +{ + struct char_device_struct *cd, **cp; + int ret = 0; + int i; + + cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); + if (cd == NULL) + return ERR_PTR(-ENOMEM); + + mutex_lock(&chrdevs_lock); + + /* temporary */ + if (major == 0) { + for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { + if (chrdevs[i] == NULL) + break; + } + + if (i == 0) { + ret = -EBUSY; + goto out; + } + major = i; + ret = major; + } + + cd->major = major; + cd->baseminor = baseminor; + cd->minorct = minorct; + strlcpy(cd->name, name, sizeof(cd->name)); + + i = major_to_index(major); + + for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) + if ((*cp)->major > major || + ((*cp)->major == major && + (((*cp)->baseminor >= baseminor) || + ((*cp)->baseminor + (*cp)->minorct > baseminor)))) + break; + + /* Check for overlapping minor ranges. */ + if (*cp && (*cp)->major == major) { + int old_min = (*cp)->baseminor; + int old_max = (*cp)->baseminor + (*cp)->minorct - 1; + int new_min = baseminor; + int new_max = baseminor + minorct - 1; + + /* New driver overlaps from the left. */ + if (new_max >= old_min && new_max <= old_max) { + ret = -EBUSY; + goto out; + } + + /* New driver overlaps from the right. 
*/ + if (new_min <= old_max && new_min >= old_min) { + ret = -EBUSY; + goto out; + } + } + + cd->next = *cp; + *cp = cd; + mutex_unlock(&chrdevs_lock); + return cd; +out: + mutex_unlock(&chrdevs_lock); + kfree(cd); + return ERR_PTR(ret); +} + +static struct char_device_struct * +__unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) +{ + struct char_device_struct *cd = NULL, **cp; + int i = major_to_index(major); + + mutex_lock(&chrdevs_lock); + for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) + if ((*cp)->major == major && + (*cp)->baseminor == baseminor && + (*cp)->minorct == minorct) + break; + if (*cp) { + cd = *cp; + *cp = cd->next; + } + mutex_unlock(&chrdevs_lock); + return cd; +} + +/** + * register_chrdev_region() - register a range of device numbers + * @from: the first in the desired range of device numbers; must include + * the major number. + * @count: the number of consecutive device numbers required + * @name: the name of the device or driver. + * + * Return value is zero on success, a negative error code on failure. + */ +int register_chrdev_region(dev_t from, unsigned count, const char *name) +{ + struct char_device_struct *cd; + dev_t to = from + count; + dev_t n, next; + + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + if (next > to) + next = to; + cd = __register_chrdev_region(MAJOR(n), MINOR(n), + next - n, name); + if (IS_ERR(cd)) + goto fail; + } + return 0; +fail: + to = n; + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); + } + return PTR_ERR(cd); +} + +/** + * alloc_chrdev_region() - register a range of char device numbers + * @dev: output parameter for first assigned number + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required + * @name: the name of the associated device or driver + * + * Allocates a range of char device numbers. The major number will be + * chosen dynamically, and returned (along with the first minor number) + * in @dev. Returns zero or a negative error code. + */ +int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, + const char *name) +{ + struct char_device_struct *cd; + cd = __register_chrdev_region(0, baseminor, count, name); + if (IS_ERR(cd)) + return PTR_ERR(cd); + *dev = MKDEV(cd->major, cd->baseminor); + return 0; +} + +/** + * __register_chrdev() - create and register a cdev occupying a range of minors + * @major: major device number or 0 for dynamic allocation + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required + * @name: name of this range of devices + * @fops: file operations associated with this devices + * + * If @major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If @major > 0 this function will attempt to reserve a device with the given + * major number and will return zero on success. + * + * Returns a -ve errno on failure. + * + * The name of this device has nothing to do with the name of the device in + * /dev. It only helps to keep track of the different owners of devices. If + * your module name has only one type of devices it's ok to use e.g. the name + * of the module here. 
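+ *
+ * A minimal usage sketch of the dynamic-major case (the name and fops
+ * below are purely illustrative):
+ *
+ *	int major = __register_chrdev(0, 0, 1, "mydev", &mydev_fops);
+ *	if (major < 0)
+ *		return major;
+ *	...
+ *	__unregister_chrdev(major, 0, 1, "mydev");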
+ */ +int __register_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name, + const struct file_operations *fops) +{ + struct char_device_struct *cd; + struct cdev *cdev; + int err = -ENOMEM; + + cd = __register_chrdev_region(major, baseminor, count, name); + if (IS_ERR(cd)) + return PTR_ERR(cd); + + cdev = cdev_alloc(); + if (!cdev) + goto out2; + + cdev->owner = fops->owner; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, "%s", name); + + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); + if (err) + goto out; + + cd->cdev = cdev; + + return major ? 0 : cd->major; +out: + kobject_put(&cdev->kobj); +out2: + kfree(__unregister_chrdev_region(cd->major, baseminor, count)); + return err; +} + +/** + * unregister_chrdev_region() - return a range of device numbers + * @from: the first in the range of numbers to unregister + * @count: the number of device numbers to unregister + * + * This function will unregister a range of @count device numbers, + * starting with @from. The caller should normally be the one who + * allocated those numbers in the first place... + */ +void unregister_chrdev_region(dev_t from, unsigned count) +{ + dev_t to = from + count; + dev_t n, next; + + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + if (next > to) + next = to; + kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); + } +} + +/** + * __unregister_chrdev - unregister and destroy a cdev + * @major: major device number + * @baseminor: first of the range of minor numbers + * @count: the number of minor numbers this cdev is occupying + * @name: name of this range of devices + * + * Unregister and destroy the cdev occupying the region described by + * @major, @baseminor and @count. This function undoes what + * __register_chrdev() did. + */ +void __unregister_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name) +{ + struct char_device_struct *cd; + + cd = __unregister_chrdev_region(major, baseminor, count); + if (cd && cd->cdev) + cdev_del(cd->cdev); + kfree(cd); +} + +static DEFINE_SPINLOCK(cdev_lock); + +static struct kobject *cdev_get(struct cdev *p) +{ + struct module *owner = p->owner; + struct kobject *kobj; + + if (owner && !try_module_get(owner)) + return NULL; + kobj = kobject_get(&p->kobj); + if (!kobj) + module_put(owner); + return kobj; +} + +void cdev_put(struct cdev *p) +{ + if (p) { + struct module *owner = p->owner; + kobject_put(&p->kobj); + module_put(owner); + } +} + +/* + * Called every time a character special file is opened + */ +static int chrdev_open(struct inode *inode, struct file *filp) +{ + struct cdev *p; + struct cdev *new = NULL; + int ret = 0; + + spin_lock(&cdev_lock); + p = inode->i_cdev; + if (!p) { + struct kobject *kobj; + int idx; + spin_unlock(&cdev_lock); + kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); + if (!kobj) + return -ENXIO; + new = container_of(kobj, struct cdev, kobj); + spin_lock(&cdev_lock); + /* Check i_cdev again in case somebody beat us to it while + we dropped the lock. 
*/ + p = inode->i_cdev; + if (!p) { + inode->i_cdev = p = new; + list_add(&inode->i_devices, &p->list); + new = NULL; + } else if (!cdev_get(p)) + ret = -ENXIO; + } else if (!cdev_get(p)) + ret = -ENXIO; + spin_unlock(&cdev_lock); + cdev_put(new); + if (ret) + return ret; + + ret = -ENXIO; + filp->f_op = fops_get(p->ops); + if (!filp->f_op) + goto out_cdev_put; + + if (filp->f_op->open) { + ret = filp->f_op->open(inode,filp); + if (ret) + goto out_cdev_put; + } + + return 0; + + out_cdev_put: + cdev_put(p); + return ret; +} + +void cd_forget(struct inode *inode) +{ + spin_lock(&cdev_lock); + list_del_init(&inode->i_devices); + inode->i_cdev = NULL; + spin_unlock(&cdev_lock); +} + +static void cdev_purge(struct cdev *cdev) +{ + spin_lock(&cdev_lock); + while (!list_empty(&cdev->list)) { + struct inode *inode; + inode = container_of(cdev->list.next, struct inode, i_devices); + list_del_init(&inode->i_devices); + inode->i_cdev = NULL; + } + spin_unlock(&cdev_lock); +} + +/* + * Dummy default file-operations: the only thing this does + * is contain the open that then fills in the correct operations + * depending on the special file... + */ +const struct file_operations def_chr_fops = { + .open = chrdev_open, + .llseek = noop_llseek, +}; + +static struct kobject *exact_match(dev_t dev, int *part, void *data) +{ + struct cdev *p = data; + return &p->kobj; +} + +static int exact_lock(dev_t dev, void *data) +{ + struct cdev *p = data; + return cdev_get(p) ? 0 : -1; +} + +/** + * cdev_add() - add a char device to the system + * @p: the cdev structure for the device + * @dev: the first device number for which this device is responsible + * @count: the number of consecutive minor numbers corresponding to this + * device + * + * cdev_add() adds the device represented by @p to the system, making it + * live immediately. A negative error code is returned on failure. + */ +int cdev_add(struct cdev *p, dev_t dev, unsigned count) +{ + p->dev = dev; + p->count = count; + return kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); +} + +static void cdev_unmap(dev_t dev, unsigned count) +{ + kobj_unmap(cdev_map, dev, count); +} + +/** + * cdev_del() - remove a cdev from the system + * @p: the cdev structure to be removed + * + * cdev_del() removes @p from the system, possibly freeing the structure + * itself. + */ +void cdev_del(struct cdev *p) +{ + cdev_unmap(p->dev, p->count); + kobject_put(&p->kobj); +} + + +static void cdev_default_release(struct kobject *kobj) +{ + struct cdev *p = container_of(kobj, struct cdev, kobj); + cdev_purge(p); +} + +static void cdev_dynamic_release(struct kobject *kobj) +{ + struct cdev *p = container_of(kobj, struct cdev, kobj); + cdev_purge(p); + kfree(p); +} + +static struct kobj_type ktype_cdev_default = { + .release = cdev_default_release, +}; + +static struct kobj_type ktype_cdev_dynamic = { + .release = cdev_dynamic_release, +}; + +/** + * cdev_alloc() - allocate a cdev structure + * + * Allocates and returns a cdev structure, or NULL on failure. + */ +struct cdev *cdev_alloc(void) +{ + struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); + if (p) { + INIT_LIST_HEAD(&p->list); + kobject_init(&p->kobj, &ktype_cdev_dynamic); + } + return p; +} + +/** + * cdev_init() - initialize a cdev structure + * @cdev: the structure to initialize + * @fops: the file_operations for this device + * + * Initializes @cdev, remembering @fops, making it ready to add to the + * system with cdev_add(). 
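+ *
+ * A typical (illustrative) sequence for a cdev embedded in a driver-private
+ * structure, assuming my_cdev, my_fops and a dev_t devt obtained from
+ * alloc_chrdev_region():
+ *
+ *	cdev_init(&my_cdev, &my_fops);
+ *	my_cdev.owner = THIS_MODULE;
+ *	err = cdev_add(&my_cdev, devt, 1);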
+ */ +void cdev_init(struct cdev *cdev, const struct file_operations *fops) +{ + memset(cdev, 0, sizeof *cdev); + INIT_LIST_HEAD(&cdev->list); + kobject_init(&cdev->kobj, &ktype_cdev_default); + cdev->ops = fops; +} + +static struct kobject *base_probe(dev_t dev, int *part, void *data) +{ + if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("char-major-%d", MAJOR(dev)); + return NULL; +} + +void __init chrdev_init(void) +{ + cdev_map = kobj_map_init(base_probe, &chrdevs_lock); + bdi_init(&directly_mappable_cdev_bdi); +} + + +/* Let modules do char dev stuff */ +EXPORT_SYMBOL(register_chrdev_region); +EXPORT_SYMBOL(unregister_chrdev_region); +EXPORT_SYMBOL(alloc_chrdev_region); +EXPORT_SYMBOL(cdev_init); +EXPORT_SYMBOL(cdev_alloc); +EXPORT_SYMBOL(cdev_del); +EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(__register_chrdev); +EXPORT_SYMBOL(__unregister_chrdev); +EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS new file mode 100644 index 00000000..ea940b1d --- /dev/null +++ b/fs/cifs/AUTHORS @@ -0,0 +1,55 @@ +Original Author +=============== +Steve French (sfrench@samba.org) + +The author wishes to express his appreciation and thanks to: +Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS +improvements. Thanks to IBM for allowing me time and test resources to pursue +this project, to Jim McDonough from IBM (and the Samba Team) for his help, to +the IBM Linux JFS team for explaining many esoteric Linux filesystem features. +Jeremy Allison of the Samba team has done invaluable work in adding the server +side of the original CIFS Unix extensions and reviewing and implementing +portions of the newer CIFS POSIX extensions into the Samba 3 file server. Thank +Dave Boutcher of IBM Rochester (author of the OS/400 smb/cifs filesystem client) +for proving years ago that very good smb/cifs clients could be done on Unix-like +operating systems. Volker Lendecke, Andrew Tridgell, Urban Widmark, John +Newbigin and others for their work on the Linux smbfs module. Thanks to +the other members of the Storage Network Industry Association CIFS Technical +Workgroup for their work specifying this highly complex protocol and finally +thanks to the Samba team for their technical advice and encouragement. + +Patch Contributors +------------------ +Zwane Mwaikambo +Andi Kleen +Amrut Joshi +Shobhit Dayal +Sergey Vlasov +Richard Hughes +Yury Umanets +Mark Hamzy (for some of the early cifs IPv6 work) +Domen Puncer +Jesper Juhl (in particular for lots of whitespace/formatting cleanup) +Vince Negri and Dave Stahl (for finding an important caching bug) +Adrian Bunk (kcalloc cleanups) +Miklos Szeredi +Kazeon team for various fixes especially for 2.4 version. +Asser Ferno (Change Notify support) +Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup +Gunter Kukkukk (testing and suggestions for support of old servers) +Igor Mammedov (DFS support) +Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) + +Test case and Bug Report contributors +------------------------------------- +Thanks to those in the community who have submitted detailed bug reports +and debug of problems they have found: Jochen Dolze, David Blaine, +Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori, +Lars Muller, Urban Widmark, Massimiliano Ferrero, Howard Owen, +Olaf Kirch, Kieron Briggs, Nick Millington and others. 
Also special +mention to the Stanford Checker (SWAT) which pointed out many minor +bugs in error paths. Valuable suggestions also have come from Al Viro +and Dave Miller. + +And thanks to the IBM LTC and Power test teams and SuSE testers for +finding multiple bugs during excellent stress test runs. diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES new file mode 100644 index 00000000..bc0025cd --- /dev/null +++ b/fs/cifs/CHANGES @@ -0,0 +1,1065 @@ +Version 1.62 +------------ +Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened +to more strictly handle corrupt frames. + +Version 1.61 +------------ +Fix append problem to Samba servers (files opened with O_APPEND could +have duplicated data). Fix oops in cifs_lookup. Workaround problem +mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. +Disable use of server inode numbers when server only +partially supports them (e.g. for one server querying inode numbers on +FindFirst fails but QPathInfo queries works). Fix oops with dfs in +cifs_put_smb_ses. Fix mmap to work on directio mounts (needed +for OpenOffice when on forcedirectio mount e.g.) + +Version 1.60 +------------- +Fix memory leak in reconnect. Fix oops in DFS mount error path. +Set s_maxbytes to smaller (the max that vfs can handle) so that +sendfile will now work over cifs mounts again. Add noforcegid +and noforceuid mount parameters. Fix small mem leak when using +ntlmv2. Fix 2nd mount to same server but with different port to +be allowed (rather than reusing the 1st port) - only when the +user explicitly overrides the port on the 2nd mount. + +Version 1.59 +------------ +Client uses server inode numbers (which are persistent) rather than +client generated ones by default (mount option "serverino" turned +on by default if server supports it). Add forceuid and forcegid +mount options (so that when negotiating unix extensions specifying +which uid mounted does not immediately force the server's reported +uids to be overridden). Add support for scope mount parm. Improve +hard link detection to use same inode for both. Do not set +read-only dos attribute on directories (for chmod) since Windows +explorer special cases this attribute bit for directories for +a different purpose. + +Version 1.58 +------------ +Guard against buffer overruns in various UCS-2 to UTF-8 string conversions +when the UTF-8 string is composed of unusually long (more than 4 byte) converted +characters. Add support for mounting root of a share which redirects immediately +to DFS target. Convert string conversion functions from Unicode to more +accurately mark string length before allocating memory (which may help the +rare cases where a UTF-8 string is much larger than the UCS2 string that +we converted from). Fix endianness of the vcnum field used during +session setup to distinguish multiple mounts to same server from different +userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental +flag to be set to 2, and mount must enable krb5 to turn on extended security). +Performance of file create to Samba improved (posix create on lookup +removes 1 of 2 network requests sent on file create) + +Version 1.57 +------------ +Improve support for multiple security contexts to the same server. We +used to use the same "vcnumber" for all connections which could cause +the server to treat subsequent connections, especially those that +are authenticated as guest, as reconnections, invalidating the earlier +user's smb session. 
This fix allows cifs to mount multiple times to the +same server with different userids without risking invalidating earlier +established security contexts. fsync now sends SMB Flush operation +to better ensure that we wait for server to write all of the data to +server disk (not just write it over the network). Add new mount +parameter to allow user to disable sending the (slow) SMB flush on +fsync if desired (fsync still flushes all cached write data to the server). +Posix file open support added (turned off after one attempt if server +fails to support it properly, as with Samba server versions prior to 3.3.2) +Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too +little memory for the "nativeFileSystem" field returned by the server +during mount). Endian convert inode numbers if necessary (makes it easier +to compare inode numbers on network files from big endian systems). + +Version 1.56 +------------ +Add "forcemandatorylock" mount option to allow user to use mandatory +rather than posix (advisory) byte range locks, even though server would +support posix byte range locks. Fix query of root inode when prefixpath +specified and user does not have access to query information about the +top of the share. Fix problem in 2.6.28 resolving DFS paths to +Samba servers (worked to Windows). Fix rmdir so that pending search +(readdir) requests do not get invalid results which include the now +removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable +when using DFS. Add better file create support to servers which support +the CIFS POSIX protocol extensions (this adds support for new flags +on create, and improves semantics for write of locked ranges). + +Version 1.55 +------------ +Various fixes to make delete of open files behavior more predictable +(when delete of an open file fails we mark the file as "delete-on-close" +in a way that more servers accept, but only if we can first rename the +file to a temporary name). Add experimental support for more safely +handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp +sends, and also let tcp autotune the socket send and receive buffers. +This reduces the number of EAGAIN errors returned by TCP/IP in +high stress workloads (and the number of retries on socket writes +when sending large SMBWriteX requests). Fix case in which a portion of +data can in some cases not get written to the file on the server before the +file is closed. Fix DFS parsing to properly handle path consumed field, +and to handle certain codepage conversions better. Fix mount and +umount race that can cause oops in mount or umount or reconnect. + +Version 1.54 +------------ +Fix premature write failure on congested networks (we would give up +on EAGAIN from the socket too quickly on large writes). +Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. +Fix endian problems in acl (mode from/to cifs acl) on bigendian +architectures. Fix problems with preserving timestamps on copying open +files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit +on parent directory when server supports Unix Extensions but not POSIX +create. Update cifs.upcall version to handle new Kerberos sec flags +(this requires update of cifs.upcall program from Samba). Fix memory leak +on dns_upcall (resolving DFS referralls). Fix plain text password +authentication (requires setting SecurityFlags to 0x30030 to enable +lanman and plain text though). 
Fix writes to be at correct offset when +file is open with O_APPEND and file is on a directio (forcediretio) mount. +Fix bug in rewinding readdir directory searches. Add nodfs mount option. + +Version 1.53 +------------ +DFS support added (Microsoft Distributed File System client support needed +for referrals which enable a hierarchical name space among servers). +Disable temporary caching of mode bits to servers which do not support +storing of mode (e.g. Windows servers, when client mounts without cifsacl +mount option) and add new "dynperm" mount option to enable temporary caching +of mode (enable old behavior). Fix hang on mount caused when server crashes +tcp session during negotiate protocol. + +Version 1.52 +------------ +Fix oops on second mount to server when null auth is used. +Enable experimental Kerberos support. Return writebehind errors on flush +and sync so that events like out of disk space get reported properly on +cached files. Fix setxattr failure to certain Samba versions. Fix mount +of second share to disconnected server session (autoreconnect on this). +Add ability to modify cifs acls for handling chmod (when mounted with +cifsacl flag). Fix prefixpath path separator so we can handle mounts +with prefixpaths longer than one directory (one path component) when +mounted to Windows servers. Fix slow file open when cifsacl +enabled. Fix memory leak in FindNext when the SMB call returns -EBADF. + + +Version 1.51 +------------ +Fix memory leak in statfs when mounted to very old servers (e.g. +Windows 9x). Add new feature "POSIX open" which allows servers +which support the current POSIX Extensions to provide better semantics +(e.g. delete for open files opened with posix open). Take into +account umask on posix mkdir not just older style mkdir. Add +ability to mount to IPC$ share (which allows CIFS named pipes to be +opened, read and written as if they were files). When 1st tree +connect fails (e.g. due to signing negotiation failure) fix +leak that causes cifsd not to stop and rmmod to fail to cleanup +cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on +bigendian architectures. Fix possible memory corruption when +EAGAIN returned on kern_recvmsg. Return better error if server +requires packet signing but client has disabled it. When mounted +with cifsacl mount option - mode bits are approximated based +on the contents of the ACL of the file or directory. When cifs +mount helper is missing convert make sure that UNC name +has backslash (not forward slash) between ip address of server +and the share name. + +Version 1.50 +------------ +Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is +done with "serverino" mount option). Add support for POSIX Unlink +(helps with certain sharing violation cases when server such as +Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix" +mount option to allow disabling the CIFS Unix Extensions for just +that mount. Fix hang on spinlock in find_writable_file (race when +reopening file after session crash). Byte range unlock request to +windows server could unlock more bytes (on server copy of file) +than intended if start of unlock request is well before start of +a previous byte range lock that we issued. + +Version 1.49 +------------ +IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6 +address after the "ip=" mount option, at least until mount.cifs is fixed to +handle DNS host to ipv6 name translation). 
Accept override of uid or gid +on mount even when Unix Extensions are negotiated (it used to be ignored +when Unix Extensions were ignored). This allows users to override the +default uid and gid for files when they are certain that the uids or +gids on the server do not match those of the client. Make "sec=none" +mount override username (so that null user connection is attempted) +to match what documentation said. Support for very large reads, over 127K, +available to some newer servers (such as Samba 3.0.26 and later but +note that it also requires setting CIFSMaxBufSize at module install +time to a larger value which may hurt performance in some cases). +Make sign option force signing (or fail if server does not support it). + +Version 1.48 +------------ +Fix mtime bouncing around from local idea of last write times to remote time. +Fix hang (in i_size_read) when simultaneous size update of same remote file +on smp system corrupts sequence number. Do not reread unnecessarily partial page +(which we are about to overwrite anyway) when writing out file opened rw. +When DOS attribute of file on non-Unix server's file changes on the server side +from read-only back to read-write, reflect this change in default file mode +(we had been leaving a file's mode read-only until the inode were reloaded). +Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute +when archive dos attribute not set and we are changing mode back to writeable +on server which does not support the Unix Extensions). Remove read only dos +attribute on chmod when adding any write permission (ie on any of +user/group/other (not all of user/group/other ie 0222) when +mounted to windows. Add support for POSIX MkDir (slight performance +enhancement and eliminates the network race between the mkdir and set +path info of the mode). + + +Version 1.47 +------------ +Fix oops in list_del during mount caused by unaligned string. +Fix file corruption which could occur on some large file +copies caused by writepages page i/o completion bug. +Seek to SEEK_END forces check for update of file size for non-cached +files. Allow file size to be updated on remote extend of locally open, +non-cached file. Fix reconnect to newer Samba servers (or other servers +which support the CIFS Unix/POSIX extensions) so that we again tell the +server the Unix/POSIX cifs capabilities which we support (SetFSInfo). +Add experimental support for new POSIX Open/Mkdir (which returns +stat information on the open, and allows setting the mode). + +Version 1.46 +------------ +Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps. +Allow null user to be specified on mount ("username="). Do not return +EINVAL on readdir when filldir fails due to overwritten blocksize +(fixes FC problem). Return error in rename 2nd attempt retry (ie report +if rename by handle also fails, after rename by path fails, we were +not reporting whether the retry worked or not). Fix NTLMv2 to +work to Windows servers (mount with option "sec=ntlmv2"). + +Version 1.45 +------------ +Do not time out lockw calls when using posix extensions. Do not +time out requests if server still responding reasonably fast +on requests on other threads. Improve POSIX locking emulation, +(lock cancel now works, and unlock of merged range works even +to Windows servers now). Fix oops on mount to lanman servers +(win9x, os/2 etc.) when null password. Do not send listxattr +(SMB to query all EAs) if nouser_xattr specified. 
Fix SE Linux +problem (instantiate inodes/dentries in right order for readdir). + +Version 1.44 +------------ +Rewritten sessionsetup support, including support for legacy SMB +session setup needed for OS/2 and older servers such as Windows 95 and 98. +Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst +so we can do search (ls etc.) to OS/2. Do not send NTCreateX +or recent levels of FindFirst unless server says it supports NT SMBs +(instead use legacy equivalents from LANMAN dialect). Fix to allow +NTLMv2 authentication support (now can use stronger password hashing +on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004). +Allow override of global cifs security flags on mount via "sec=" option(s). + +Version 1.43 +------------ +POSIX locking to servers which support CIFS POSIX Extensions +(disabled by default controlled by proc/fs/cifs/Experimental). +Handle conversion of long share names (especially Asian languages) +to Unicode during mount. Fix memory leak in sess struct on reconnect. +Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on +cifs open which helps rare case when setpathinfo fails or server does +not support it. + +Version 1.42 +------------ +Fix slow oplock break when mounted to different servers at the same time and +the tids match and we try to find matching fid on wrong server. Fix read +looping when signing required by server (2.6.16 kernel only). Fix readdir +vs. rename race which could cause each to hang. Return . and .. even +if server does not. Allow searches to skip first three entries and +begin at any location. Fix oops in find_writeable_file. + +Version 1.41 +------------ +Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can +configure stronger authentication. Fix sfu symlinks so they can +be followed (not just recognized). Fix wraparound of bcc on +read responses when buffer size over 64K and also fix wrap of +max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in +cifs_user_read and cifs_readpages (when EAGAIN on send of smb +on socket is returned over and over). Add POSIX (advisory) byte range +locking support (requires server with newest CIFS UNIX Extensions +to the protocol implemented). Slow down negprot slightly in port 139 +RFC1001 case to give session_init time on buggy servers. + +Version 1.40 +------------ +Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance +of readpages by eliminating one extra memcpy. Allow update of file size +from remote server even if file is open for write as long as mount is +directio. Recognize share mode security and send NTLM encrypted password +on tree connect if share mode negotiated. + +Version 1.39 +------------ +Defer close of a file handle slightly if pending writes depend on that handle +(this reduces the EBADF bad file handle errors that can be logged under heavy +stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 +Fix SFU style symlinks and mknod needed for servers which do not support the +CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative +dentries so files that the client sees as deleted but that later get created +on the server will be recognized. Add client side permission check on setattr. +Timeout stuck requests better (where server has never responded or sent corrupt +responses) + +Version 1.38 +------------ +Fix tcp socket retransmission timeouts (e.g. 
on ENOSPACE from the socket) +to be smaller at first (but increasing) so large write performance performance +over GigE is better. Do not hang thread on illegal byte range lock response +from Windows (Windows can send an RFC1001 size which does not match smb size) by +allowing an SMBs TCP length to be up to a few bytes longer than it should be. +wsize and rsize can now be larger than negotiated buffer size if server +supports large readx/writex, even when directio mount flag not specified. +Write size will in many cases now be 16K instead of 4K which greatly helps +file copy performance on lightly loaded networks. Fix oops in dnotify +when experimental config flag enabled. Make cifsFYI more granular. + +Version 1.37 +------------ +Fix readdir caching when unlink removes file in current search buffer, +and this is followed by a rewind search to just before the deleted entry. +Do not attempt to set ctime unless atime and/or mtime change requested +(most servers throw it away anyway). Fix length check of received smbs +to be more accurate. Fix big endian problem with mapchars mount option, +and with a field returned by statfs. + +Version 1.36 +------------ +Add support for mounting to older pre-CIFS servers such as Windows9x and ME. +For these older servers, add option for passing netbios name of server in +on mount (servernetbiosname). Add suspend support for power management, to +avoid cifsd thread preventing software suspend from working. +Add mount option for disabling the default behavior of sending byte range lock +requests to the server (necessary for certain applications which break with +mandatory lock behavior such as Evolution), and also mount option for +requesting case insensitive matching for path based requests (requesting +case sensitive is the default). + +Version 1.35 +------------ +Add writepage performance improvements. Fix path name conversions +for long filenames on mounts which were done with "mapchars" mount option +specified. Ensure multiplex ids do not collide. Fix case in which +rmmod can oops if done soon after last unmount. Fix truncated +search (readdir) output when resume filename was a long filename. +Fix filename conversion when mapchars mount option was specified and +filename was a long filename. + +Version 1.34 +------------ +Fix error mapping of the TOO_MANY_LINKS (hardlinks) case. +Do not oops if root user kills cifs oplock kernel thread or +kills the cifsd thread (NB: killing the cifs kernel threads is not +recommended, unmount and rmmod cifs will kill them when they are +no longer needed). Fix readdir to ASCII servers (ie older servers +which do not support Unicode) and also require asterisk. +Fix out of memory case in which data could be written one page +off in the page cache. + +Version 1.33 +------------ +Fix caching problem, in which readdir of directory containing a file +which was cached could cause the file's time stamp to be updated +without invalidating the readahead data (so we could get stale +file data on the client for that file even as the server copy changed). +Cleanup response processing so cifsd can not loop when abnormally +terminated. + + +Version 1.32 +------------ +Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one +transact response for an SMB request and search entry split across two frames. +Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server) +as new protocol extensions. 
Do not send Get/Set calls for POSIX ACLs +unless server explicitly claims to support them in CIFS Unix extensions +POSIX ACL capability bit. Fix packet signing when multiuser mounting with +different users from the same client to the same server. Fix oops in +cifs_close. Add mount option for remapping reserved characters in +filenames (also allow recognizing files with created by SFU which have any +of these seven reserved characters, except backslash, to be recognized). +Fix invalid transact2 message (we were sometimes trying to interpret +oplock breaks as SMB responses). Add ioctl for checking that the +current uid matches the uid of the mounter (needed by umount.cifs). +Reduce the number of large buffer allocations in cifs response processing +(significantly reduces memory pressure under heavy stress with multiple +processes accessing the same server at the same time). + +Version 1.31 +------------ +Fix updates of DOS attributes and time fields so that files on NT4 servers +do not get marked delete on close. Display sizes of cifs buffer pools in +cifs stats. Fix oops in unmount when cifsd thread being killed by +shutdown. Add generic readv/writev and aio support. Report inode numbers +consistently in readdir and lookup (when serverino mount option is +specified use the inode number that the server reports - for both lookup +and readdir, otherwise by default the locally generated inode number is used +for inodes created in either path since servers are not always able to +provide unique inode numbers when exporting multiple volumes from under one +sharename). + +Version 1.30 +------------ +Allow new nouser_xattr mount parm to disable xattr support for user namespace. +Do not flag user_xattr mount parm in dmesg. Retry failures setting file time +(mostly affects NT4 servers) by retry with handle based network operation. +Add new POSIX Query FS Info for returning statfs info more accurately. +Handle passwords with multiple commas in them. + +Version 1.29 +------------ +Fix default mode in sysfs of cifs module parms. Remove old readdir routine. +Fix capabilities flags for large readx so as to allow reads larger than 64K. + +Version 1.28 +------------ +Add module init parm for large SMB buffer size (to allow it to be changed +from its default of 16K) which is especially useful for large file copy +when mounting with the directio mount option. Fix oops after +returning from mount when experimental ExtendedSecurity enabled and +SpnegoNegotiated returning invalid error. Fix case to retry better when +peek returns from 1 to 3 bytes on socket which should have more data. +Fixed path based calls (such as cifs lookup) to handle path names +longer than 530 (now can handle PATH_MAX). Fix pass through authentication +from Samba server to DC (Samba required dummy LM password). + +Version 1.27 +------------ +Turn off DNOTIFY (directory change notification support) by default +(unless built with the experimental flag) to fix hang with KDE +file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event +waiting on an SMB response) in SendReceive when session dies but +reconnects quickly from another task. Add module init parms for +minimum number of large and small network buffers in the buffer pools, +and for the maximum number of simultaneous requests. + +Version 1.26 +------------ +Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later +and other POSIX CIFS compliant servers. Fix error mapping for getfacl +to EOPNOTSUPP when server does not support posix acls on the wire. 
Fix +improperly zeroed buffer in CIFS Unix extensions set times call. + +Version 1.25 +------------ +Fix internationalization problem in cifs readdir with filenames that map to +longer UTF-8 strings than the string on the wire was in Unicode. Add workaround +for readdir to netapp servers. Fix search rewind (seek into readdir to return +non-consecutive entries). Do not do readdir when server negotiates +buffer size too small to fit filename. Add support for reading POSIX ACLs from +the server (add also acl and noacl mount options). + +Version 1.24 +------------ +Optionally allow using server side inode numbers, rather than client generated +ones by specifying mount option "serverino" - this is required for some apps +to work which double check hardlinked files and have persistent inode numbers. + +Version 1.23 +------------ +Multiple bigendian fixes. On little endian systems (for reconnect after +network failure) fix tcp session reconnect code so we do not try first +to reconnect on reverse of port 445. Treat reparse points (NTFS junctions) +as directories rather than symlinks because we can do follow link on them. + +Version 1.22 +------------ +Add config option to enable XATTR (extended attribute) support, mapping +xattr names in the "user." namespace to SMB/CIFS EAs. Lots of +minor fixes pointed out by the Stanford SWAT checker (mostly missing +or out of order NULL pointer checks in little used error paths). + +Version 1.21 +------------ +Add new mount parm to control whether mode check (generic_permission) is done +on the client. If Unix extensions are enabled and the uids on the client +and server do not match, client permission checks are meaningless on +server uids that do not exist on the client (this does not affect the +normal ACL check which occurs on the server). Fix default uid +on mknod to match create and mkdir. Add optional mount parm to allow +override of the default uid behavior (in which the server sets the uid +and gid of newly created files). Normally for network filesystem mounts +users want the server to set the uid/gid on newly created files (rather than +using the uid of the client processes as you would in a local filesystem). + +Version 1.20 +------------ +Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps +info into /proc/fs/cifs/DebugData. Fix rare oops in readdir +(in build_wildcard_path_from_dentry). Fix mknod to pass type field +(block/char/fifo) properly. Remove spurious mount warning log entry when +credentials passed as mount argument. Set major/minor device number in +inode for block and char devices when unix extensions enabled. + +Version 1.19 +------------ +Fix /proc/fs/cifs/Stats and DebugData display to handle larger +amounts of return data. Properly limit requests to MAX_REQ (50 +is the usual maximum active multiplex SMB/CIFS requests per server). +Do not kill cifsd (and thus hurt the other SMB session) when more than one +session to the same server (but with different userids) exists and one +of the two user's smb sessions is being removed while leaving the other. +Do not loop reconnecting in cifsd demultiplex thread when admin +kills the thread without going through unmount. + +Version 1.18 +------------ +Do not rename hardlinked files (since that should be a noop). Flush +cached write behind data when reopening a file after session abend, +except when already in write. Grab per socket sem during reconnect +to avoid oops in sendmsg if overlapping with reconnect.
Do not +reset cached inode file size on readdir for files open for write on +client. + + +Version 1.17 +------------ +Update number of blocks in file so du command is happier (in Linux a fake +blocksize of 512 is required for calculating number of blocks in inode). +Fix prepare write of partial pages to read in data from server if possible. +Fix race on tcpStatus field between unmount and reconnection code, causing +cifsd process sometimes to hang around forever. Improve out of memory +checks in cifs_filldir. + +Version 1.16 +------------ +Fix incorrect file size in file handle based setattr on big endian hardware. +Fix oops in build_path_from_dentry when out of memory. Add checks for invalid +and closing file structs in writepage/partialpagewrite. Add statistics +for each mounted share (new menuconfig option). Fix endianness problem in +volume information displayed in /proc/fs/cifs/DebugData (only affects +big endian architectures). Prevent renames while constructing +path names for open, mkdir and rmdir. + +Version 1.15 +------------ +Change to mempools for alloc smb request buffers and multiplex structs +to better handle low memory problems (and potential deadlocks). + +Version 1.14 +------------ +Fix incomplete listings of large directories on Samba servers when Unix +extensions enabled. Fix oops when smb_buffer can not be allocated. Fix +rename deadlock when writing out dirty pages at same time. + +Version 1.13 +------------ +Fix open of files in which O_CREAT can cause the mode to change in +some cases. Fix case in which retry of write overlaps file close. +Fix PPC64 build error. Reduce excessive stack usage in smb password +hashing. Fix overwrite of Linux user's view of file mode to Windows servers. + +Version 1.12 +------------ +Fixes for large file copy, signal handling, socket retry, buffer +allocation and low memory situations. + +Version 1.11 +------------ +Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize) +also now allowing support for specifying client netbiosname. NT4 support added. + +Version 1.10 +------------ +Fix reconnection (and certain failed mounts) to properly wake up the +blocked users thread so it does not seem hung (in some cases was blocked +until the cifs receive timeout expired). Fix spurious error logging +to kernel log when application with open network files killed. + +Version 1.09 +------------ +Fix /proc/fs module unload warning message (that could be logged +to the kernel log). Fix intermittent failure in connectathon +test7 (hardlink count not immediately refreshed in case in which +inode metadata can be incorrectly kept cached when time near zero). + +Version 1.08 +------------ +Allow file_mode and dir_mode (specified at mount time) to be enforced +locally (the server already enforced its own ACLs too) for servers +that do not report the correct mode (do not support the +CIFS Unix Extensions). + +Version 1.07 +------------ +Fix some small memory leaks in some unmount error paths. Fix major leak +of cache pages in readpages causing multiple read oriented stress +testcases (including fsx, and even large file copy) to fail over time. + +Version 1.06 +------------ +Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server. +This allows files that differ only in case and improves performance of file +creation and file open to such servers.
Fix semaphore conflict which causes +slow delete of open file to Samba (which unfortunately can cause an oplock +break to self while vfs_unlink held i_sem) which can hang for 20 seconds. + +Version 1.05 +------------ +fixes to cifs_readpages for fsx test case + +Version 1.04 +------------ +Fix caching data integrity bug when extending file size especially when no +oplock on file. Fix spurious logging of valid already parsed mount options +that are parsed outside of the cifs vfs such as nosuid. + + +Version 1.03 +------------ +Connect to server when port number override not specified, and tcp port +unitialized. Reset search to restart at correct file when kernel routine +filldir returns error during large directory searches (readdir). + +Version 1.02 +------------ +Fix caching problem when files opened by multiple clients in which +page cache could contain stale data, and write through did +not occur often enough while file was still open when read ahead +(read oplock) not allowed. Treat "sep=" when first mount option +as an override of comma as the default separator between mount +options. + +Version 1.01 +------------ +Allow passwords longer than 16 bytes. Allow null password string. + +Version 1.00 +------------ +Gracefully clean up failed mounts when attempting to mount to servers such as +Windows 98 that terminate tcp sessions during protocol negotiation. Handle +embedded commas in mount parsing of passwords. + +Version 0.99 +------------ +Invalidate local inode cached pages on oplock break and when last file +instance is closed so that the client does not continue using stale local +copy rather than later modified server copy of file. Do not reconnect +when server drops the tcp session prematurely before negotiate +protocol response. Fix oops in reopen_file when dentry freed. Allow +the support for CIFS Unix Extensions to be disabled via proc interface. + +Version 0.98 +------------ +Fix hang in commit_write during reconnection of open files under heavy load. +Fix unload_nls oops in a mount failure path. Serialize writes to same socket +which also fixes any possible races when cifs signatures are enabled in SMBs +being sent out of signature sequence number order. + +Version 0.97 +------------ +Fix byte range locking bug (endian problem) causing bad offset and +length. + +Version 0.96 +------------ +Fix oops (in send_sig) caused by CIFS unmount code trying to +wake up the demultiplex thread after it had exited. Do not log +error on harmless oplock release of closed handle. + +Version 0.95 +------------ +Fix unsafe global variable usage and password hash failure on gcc 3.3.1 +Fix problem reconnecting secondary mounts to same server after session +failure. Fix invalid dentry - race in mkdir when directory gets created +by another client between the lookup and mkdir. + +Version 0.94 +------------ +Fix to list processing in reopen_files. Fix reconnection when server hung +but tcpip session still alive. Set proper timeout on socket read. + +Version 0.93 +------------ +Add missing mount options including iocharset. SMP fixes in write and open. +Fix errors in reconnecting after TCP session failure. Fix module unloading +of default nls codepage + +Version 0.92 +------------ +Active smb transactions should never go negative (fix double FreeXid). Fix +list processing in file routines. Check return code on kmalloc in open. +Fix spinlock usage for SMP. + +Version 0.91 +------------ +Fix oops in reopen_files when invalid dentry. drop dentry on server rename +and on revalidate errors. 
Fix cases where pid is now tgid. Fix return code +on create hard link when server does not support them. + +Version 0.90 +------------ +Fix scheduling while atomic error in getting inode info on newly created file. +Fix truncate of existing files opened with O_CREAT but not O_TRUNC set. + +Version 0.89 +------------ +Fix oops on write to dead tcp session. Remove error log write for case when file open +O_CREAT but not O_EXCL + +Version 0.88 +------------ +Fix non-POSIX behavior on rename of open file and delete of open file by taking +advantage of trans2 SetFileInfo rename facility if available on target server. +Retry on ENOSPC and EAGAIN socket errors. + +Version 0.87 +------------ +Fix oops on big endian readdir. Set blksize to be even power of two (2**blkbits) to fix +allocation size miscalculation. After oplock token lost do not read through +cache. + +Version 0.86 +------------ +Fix oops on empty file readahead. Fix for file size handling for locally cached files. + +Version 0.85 +------------ +Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files +during auto reconnection to server after server recovered from failure. + +Version 0.84 +------------ +Finish support for Linux 2.5 open/create changes, which removes the +redundant NTCreate/QPathInfo/close that was sent during file create. +Enable oplock by default. Enable packet signing by default (needed to +access many recent Windows servers) + +Version 0.83 +------------ +Fix oops when mounting to long server names caused by inverted parms to kmalloc. +Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled +we will choose a cifs user session (smb uid) that better matches the local +uid if a) the mount uid does not match the current uid and b) we have another +session to the same server (ip address) for a different mount which +matches the current local uid. + +Version 0.82 +------------ +Add support for mknod of block or character devices. Fix oplock +code (distributed caching) to properly send response to oplock +break from server. + +Version 0.81 +------------ +Finish up CIFS packet digital signing for the default +NTLM security case. This should help Windows 2003 +network interoperability since it is common for +packet signing to be required now. Fix statfs (stat -f) +which recently started returning errors due to +invalid value (-1 instead of 0) being set in the +struct kstatfs f_ffiles field. + +Version 0.80 +----------- +Fix oops on stopping oplock thread when removing cifs when +built as module. + +Version 0.79 +------------ +Fix mount options for ro (readonly), uid, gid and file and directory mode. + +Version 0.78 +------------ +Fix errors displayed on failed mounts to be more understandable. +Fixed various incorrect or misleading smb to posix error code mappings. + +Version 0.77 +------------ +Fix display of NTFS DFS junctions to display as symlinks. +They are the network equivalent. Fix oops in +cifs_partialpagewrite caused by missing spinlock protection +of openfile linked list. Allow writebehind caching errors to +be returned to the application at file close. + +Version 0.76 +------------ +Clean up options displayed in /proc/mounts by show_options to +be more consistent with other filesystems. + +Version 0.75 +------------ +Fix delete of readonly file to Windows servers. Reflect +presence or absence of read only dos attribute in mode +bits for servers that do not support CIFS Unix extensions. 
+Fix shortened results on readdir of large directories to +servers supporting CIFS Unix extensions (caused by +incorrect resume key). + +Version 0.74 +------------ +Fix truncate bug (set file size) that could cause hangs e.g. running fsx + +Version 0.73 +------------ +unload nls if mount fails. + +Version 0.72 +------------ +Add resume key support to search (readdir) code to workaround +Windows bug. Add /proc/fs/cifs/LookupCacheEnable which +allows disabling caching of attribute information for +lookups. + +Version 0.71 +------------ +Add more oplock handling (distributed caching code). Remove +dead code. Remove excessive stack space utilization from +symlink routines. + +Version 0.70 +------------ +Fix oops in get dfs referral (triggered when null path sent in to +mount). Add support for overriding rsize at mount time. + +Version 0.69 +------------ +Fix buffer overrun in readdir which caused intermittent kernel oopses. +Fix writepage code to release kmap on write data. Allow "-ip=" new +mount option to be passed in on parameter distinct from the first part +(server name portion of) the UNC name. Allow override of the +tcp port of the target server via new mount option "-port=" + +Version 0.68 +------------ +Fix search handle leak on rewind. Fix setuid and gid so that they are +reflected in the local inode immediately. Cleanup of whitespace +to make 2.4 and 2.5 versions more consistent. + + +Version 0.67 +------------ +Fix signal sending so that captive thread (cifsd) exits on umount +(which was causing the warning in kmem_cache_free of the request buffers +at rmmod time). This had broken as a sideeffect of the recent global +kernel change to daemonize. Fix memory leak in readdir code which +showed up in "ls -R" (and applications that did search rewinding). + +Version 0.66 +------------ +Reconnect tids and fids after session reconnection (still do not +reconnect byte range locks though). Fix problem caching +lookup information for directory inodes, improving performance, +especially in deep directory trees. Fix various build warnings. + +Version 0.65 +------------ +Finish fixes to commit write for caching/readahead consistency. fsx +now works to Samba servers. Fix oops caused when readahead +was interrupted by a signal. + +Version 0.64 +------------ +Fix data corruption (in partial page after truncate) that caused fsx to +fail to Windows servers. Cleaned up some extraneous error logging in +common error paths. Add generic sendfile support. + +Version 0.63 +------------ +Fix memory leak in AllocMidQEntry. +Finish reconnection logic, so connection with server can be dropped +(or server rebooted) and the cifs client will reconnect. + +Version 0.62 +------------ +Fix temporary socket leak when bad userid or password specified +(or other SMBSessSetup failure). Increase maximum buffer size to slightly +over 16K to allow negotiation of up to Samba and Windows server default read +sizes. Add support for readpages + +Version 0.61 +------------ +Fix oops when username not passed in on mount. Extensive fixes and improvements +to error logging (strip redundant newlines, change debug macros to ensure newline +passed in and to be more consistent). Fix writepage wrong file handle problem, +a readonly file handle could be incorrectly used to attempt to write out +file updates through the page cache to multiply open files. This could cause +the iozone benchmark to fail on the fwrite test. 
Fix bug mounting two different +shares to the same Windows server when using different usernames +(doing this to Samba servers worked but Windows was rejecting it) - now it is +possible to use different userids when connecting to the same server from a +Linux client. Fix oops when treeDisconnect called during unmount on +previously freed socket. + +Version 0.60 +------------ +Fix oops in readpages caused by not setting address space operations in inode in +rare code path. + +Version 0.59 +------------ +Includes support for deleting of open files and renaming over existing files (per POSIX +requirement). Add readlink support for Windows junction points (directory symlinks). + +Version 0.58 +------------ +Changed read and write to go through pagecache. Added additional address space operations. +Memory mapped operations now working. + +Version 0.57 +------------ +Added writepage code for additional memory mapping support. Fixed leak in xids causing +the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on +every stat call. Additional formatting cleanup. + +Version 0.56 +------------ +Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup. + +Version 0.55 +------------ +Fixes from Zwane Mwaikambo for adding missing return code checking in a few places. +Also included a modified version of his fix to protect global list manipulation of +the smb session and tree connection and mid related global variables. + +Version 0.54 +------------ +Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre +changes to superblock layout. Remove wasteful allocation of smb buffers (now the send +buffer is reused for responses). Add more oplock handling. Additional minor cleanup. + +Version 0.53 +------------ +More stylistic updates to better match kernel style. Add additional statistics +for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2 +and CIFS Packet Signing enablement. + +Version 0.52 +------------ +Replace call to sleep_on with safer wait_on_event. +Make stylistic changes to better match kernel style recommendations. +Remove most typedef usage (except for the PDUs themselves). + +Version 0.51 +------------ +Update mount so the -unc mount option is no longer required (the ip address can be specified +in a UNC style device name. Implementation of readpage/writepage started. + +Version 0.50 +------------ +Fix intermittent problem with incorrect smb header checking on badly +fragmented tcp responses + +Version 0.49 +------------ +Fixes to setting of allocation size and file size. + +Version 0.48 +------------ +Various 2.5.38 fixes. Now works on 2.5.38 + +Version 0.47 +------------ +Prepare for 2.5 kernel merge. Remove ifdefs. + +Version 0.46 +------------ +Socket buffer management fixes. Fix dual free. + +Version 0.45 +------------ +Various big endian fixes for hardlinks and symlinks and also for dfs. + +Version 0.44 +------------ +Various big endian fixes for servers with Unix extensions such as Samba + +Version 0.43 +------------ +Various FindNext fixes for incorrect filenames on large directory searches on big endian +clients. basic posix file i/o tests now work on big endian machines, not just le + +Version 0.42 +------------ +SessionSetup and NegotiateProtocol now work from Big Endian machines. +Various Big Endian fixes found during testing on the Linux on 390. 
Various fixes for compatibility with older +versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7). + +Version 0.41 +------------ +Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked +files now return the correct number of links on fstat as they are repeatedly linked and unlinked. + +Version 0.40 +------------ +Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate +session advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP. +Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs +(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos). + +Version 0.38 +------------ +Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable +it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU). + +Version 0.37 +------------ +Rewrote much of connection and mount/unmount logic to handle bugs with +multiple uses to same share, multiple users to same server etc. + +Version 0.36 +------------ +Fixed major problem with dentry corruption (missing call to dput) + +Version 0.35 +------------ +Rewrite of readdir code to fix bug. Various fixes for bigendian machines. +Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs +although corresponding function not fully implemented in the vfs yet + +Version 0.34 +------------ +Fixed dentry caching bug, misc. cleanup + +Version 0.33 +------------ +Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes. Now can build +on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels. +Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added. + +Version 0.32 +------------ +Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented +and tested against Samba 2.2.5 + + +Version 0.31 +------------ +1) Fixed lockrange to be correct (it was one byte too short) + +2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly +show range as locked when there is a conflict with an existing lock. + +3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories +in most cases. Eventually will offer optional ability to query server for the correct perms. + +3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded +but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb +session) + +4) Fixed error logging of valid mount options + +5) Removed logging of password field. + +6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c +and cleaned them up and made them more consistent with other cifs functions. 
+ +7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways +(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed, +nor is the symlink support using the Unix extensions + +8) Started adding the readlink and follow_link code + +Version 0.3 +----------- +Initial drop + diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig new file mode 100644 index 00000000..f66cc162 --- /dev/null +++ b/fs/cifs/Kconfig @@ -0,0 +1,161 @@ +config CIFS + tristate "CIFS support (advanced network filesystem, SMBFS successor)" + depends on INET + select NLS + select CRYPTO + select CRYPTO_MD4 + select CRYPTO_MD5 + select CRYPTO_HMAC + select CRYPTO_ARC4 + select CRYPTO_ECB + select CRYPTO_DES + help + This is the client VFS module for the Common Internet File System + (CIFS) protocol which is the successor to the Server Message Block + (SMB) protocol, the native file sharing mechanism for most early + PC operating systems. The CIFS protocol is fully supported by + file servers such as Windows 2000 (including Windows 2003, NT 4 + and Windows XP) as well by Samba (which provides excellent CIFS + server support for Linux and many other operating systems). Limited + support for OS/2 and Windows ME and similar servers is provided as + well. + + The cifs module provides an advanced network file system + client for mounting to CIFS compliant servers. It includes + support for DFS (hierarchical name space), secure per-user + session establishment via Kerberos or NTLM or NTLMv2, + safe distributed caching (oplock), optional packet + signing, Unicode and other internationalization improvements. + If you need to mount to Samba or Windows from this machine, say Y. + +config CIFS_STATS + bool "CIFS statistics" + depends on CIFS + help + Enabling this option will cause statistics for each server share + mounted by the cifs client to be displayed in /proc/fs/cifs/Stats + +config CIFS_STATS2 + bool "Extended statistics" + depends on CIFS_STATS + help + Enabling this option will allow more detailed statistics on SMB + request timing to be displayed in /proc/fs/cifs/DebugData and also + allow optional logging of slow responses to dmesg (depending on the + value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). + These additional statistics may have a minor effect on performance + and memory utilization. + + Unless you are a developer or are doing network performance analysis + or tuning, say N. + +config CIFS_WEAK_PW_HASH + bool "Support legacy servers which use weaker LANMAN security" + depends on CIFS + help + Modern CIFS servers including Samba and most Windows versions + (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) + security mechanisms. These hash the password more securely + than the mechanisms used in the older LANMAN version of the + SMB protocol but LANMAN based authentication is needed to + establish sessions with some old SMB servers. + + Enabling this option allows the cifs module to mount to older + LANMAN based servers such as OS/2 and Windows 95, but such + mounts may be less secure than mounts using NTLM or more recent + security mechanisms if you are on a public network. Unless you + have a need to access old SMB servers (and are on a private + network) you probably want to say N. Even if this support + is enabled in the kernel build, LANMAN authentication will not be + used automatically. 
At runtime LANMAN mounts are disabled but + can be set to required (or optional) either in + /proc/fs/cifs (see fs/cifs/README for more detail) or via an + option on the mount command. This support is disabled by + default in order to reduce the possibility of a downgrade + attack. + + If unsure, say N. + +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Enables an upcall mechanism for CIFS which accesses userspace helper + utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets + which are needed to mount to certain secure servers (for which more + secure Kerberos authentication is required). If unsure, say N. + +config CIFS_XATTR + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page + for details). CIFS maps the name of + extended attributes beginning with the user namespace prefix + to SMB/CIFS EAs. EAs are stored on Windows servers without the + user namespace prefix, but their names are seen by Linux cifs clients + prefaced by the user namespace prefix. The system namespace + (used by some filesystems to store ACLs) is not supported at + this time. + + If unsure, say N. + +config CIFS_POSIX + bool "CIFS POSIX Extensions" + depends on CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to + negotiate a newer dialect with servers, such as Samba 3.0.5 + or later, that optionally can handle more POSIX like (rather + than Windows like) file behavior. It also enables + support for POSIX ACLs (getfacl and setfacl) to servers + (such as Samba 3.10 and later) which can negotiate + CIFS POSIX ACL support. If unsure, say N. + +config CIFS_DEBUG2 + bool "Enable additional CIFS debugging routines" + depends on CIFS + help + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. + +config CIFS_DFS_UPCALL + bool "DFS feature support" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Distributed File System (DFS) support is used to access shares + transparently in an enterprise name space, even if the share + moves to a different server. This feature also enables + an upcall mechanism for CIFS which contacts userspace helper + utilities to provide server name resolution (host names to + IP addresses) which is needed for implicit mounts of DFS junction + points. If unsure, say N. + +config CIFS_FSCACHE + bool "Provide CIFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + help + Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data + to be cached locally on disk through the general filesystem cache + manager. If unsure, say N. + +config CIFS_ACL + bool "Provide CIFS ACL support (EXPERIMENTAL)" + depends on EXPERIMENTAL && CIFS_XATTR && KEYS + help + Allows fetching the CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller.
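+
+# For reference only (this comment is not part of any option definition above):
+# a .config fragment that builds the cifs module with several of the optional
+# features described in this file might look like the following. Which of
+# these are appropriate depends on your servers and on other kernel options
+# (for example, CIFS_UPCALL and CIFS_DFS_UPCALL also require KEYS).
+#
+#   CONFIG_CIFS=m
+#   CONFIG_CIFS_STATS=y
+#   CONFIG_CIFS_XATTR=y
+#   CONFIG_CIFS_POSIX=y
+#   CONFIG_CIFS_UPCALL=y
+#   CONFIG_CIFS_DFS_UPCALL=y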
+ +config CIFS_NFSD_EXPORT + bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" + depends on CIFS && EXPERIMENTAL && BROKEN + help + Allows NFS server to export a CIFS mounted share (nfsd over cifs) diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile new file mode 100644 index 00000000..005d524c --- /dev/null +++ b/fs/cifs/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for Linux CIFS VFS client +# +obj-$(CONFIG_CIFS) += cifs.o + +cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ + link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ + cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ + readdir.o ioctl.o sess.o export.o + +cifs-$(CONFIG_CIFS_ACL) += cifsacl.o + +cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o + +cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o + +cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o diff --git a/fs/cifs/README b/fs/cifs/README new file mode 100644 index 00000000..c5c2c5e5 --- /dev/null +++ b/fs/cifs/README @@ -0,0 +1,748 @@ +The CIFS VFS support for Linux supports many advanced network filesystem +features such as hierarchical dfs like namespace, hardlinks, locking and more. +It was designed to comply with the SNIA CIFS Technical Reference (which +supersedes the 1992 X/Open SMB Standard) as well as to perform best practice +practical interoperability with Windows 2000, Windows XP, Samba and equivalent +servers. This code was developed in participation with the Protocol Freedom +Information Foundation. + +Please see + http://protocolfreedom.org/ and + http://samba.org/samba/PFIF/ +for more details. + + +For questions or bug reports please contact: + sfrench@samba.org (sfrench@us.ibm.com) + +Build instructions: +================== +For Linux 2.4: +1) Get the kernel source (e.g.from http://www.kernel.org) +and download the cifs vfs source (see the project page +at http://us1.samba.org/samba/Linux_CIFS_client.html) +and change directory into the top of the kernel directory +then patch the kernel (e.g. "patch -p1 < cifs_24.patch") +to add the cifs vfs to your kernel configure options if +it has not already been added (e.g. current SuSE and UL +users do not need to apply the cifs_24.patch since the cifs vfs is +already in the kernel configure menu) and then +mkdir linux/fs/cifs and then copy the current cifs vfs files from +the cifs download to your kernel build directory e.g. + + cp /fs/cifs/* to /fs/cifs + +2) make menuconfig (or make xconfig) +3) select cifs from within the network filesystem choices +4) save and exit +5) make dep +6) make modules (or "make" if CIFS VFS not to be built as a module) + +For Linux 2.6: +1) Download the kernel (e.g. from http://www.kernel.org) +and change directory into the top of the kernel directory tree +(e.g. /usr/src/linux-2.5.73) +2) make menuconfig (or make xconfig) +3) select cifs from within the network filesystem choices +4) save and exit +5) make + + +Installation instructions: +========================= +If you have built the CIFS vfs as module (successfully) simply +type "make modules_install" (or if you prefer, manually copy the file to +the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.o). + +If you have built the CIFS vfs into the kernel itself, follow the instructions +for your distribution on how to install a new kernel (usually you +would simply type "make install"). 
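+
+As a quick check after installation (a sketch only; module and path names can
+vary by distribution and kernel version), load the module and confirm that the
+cifs filesystem type has been registered:
+
+   modprobe cifs
+   grep cifs /proc/filesystems
+
+Once the module is loaded, the /proc/fs/cifs directory referenced elsewhere in
+this document should also be present.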
+ +If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on +the CIFS VFS web site) copy it to the same directory in which mount.smbfs and +similar files reside (usually /sbin). Although the helper software is not +required, mount.cifs is recommended. Eventually the Samba 3.0 utility program +"net" may also be helpful since it may someday provide easier mount syntax for +users who are used to Windows e.g. + net use +Note that running the Winbind pam/nss module (logon service) on all of your +Linux clients is useful in mapping Uids and Gids consistently across the +domain to the proper network user. The mount.cifs mount helper can be +trivially built from Samba 3.0 or later source e.g. by executing: + + gcc samba/source/client/mount.cifs.c -o mount.cifs + +If cifs is built as a module, then the size and number of network buffers +and maximum number of simultaneous requests to one server can be configured. +Changing these from their defaults is not recommended. By executing modinfo + modinfo kernel/fs/cifs/cifs.ko +on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made +at module initialization time (by running insmod cifs.ko) can be seen. + +Allowing User Mounts +==================== +To permit users to mount and unmount over directories they own is possible +with the cifs vfs. A way to enable such mounting is to mark the mount.cifs +utility as suid (e.g. "chmod +s /sbin/mount.cifs"). Enabling users to +umount shares they mount requires +1) mount.cifs version 1.4 or later +2) an entry for the share in /etc/fstab indicating that a user may +unmount it e.g. +//server/usersharename /mnt/username cifs user 0 0 + +Note that when the mount.cifs utility is run suid (allowing user mounts), +in order to reduce risks, the "nosuid" mount flag is passed in on mount to +disallow execution of an suid program mounted on the remote target. +When mount is executed as root, nosuid is not passed in by default, +and execution of suid programs on the remote target would be enabled +by default. This can be changed, as with nfs and other filesystems, +by simply specifying "nosuid" among the mount options. For user mounts +though to be able to pass the suid flag to mount requires rebuilding +mount.cifs with the following flag: + + gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs + +There is a corresponding manual page for cifs mounting in the Samba 3.0 and +later source tree in docs/manpages/mount.cifs.8 + +Allowing User Unmounts +====================== +To permit users to unmount directories that they have user mounted (see above), +the utility umount.cifs may be used. It may be invoked directly, or if +umount.cifs is placed in /sbin, umount can invoke the cifs umount helper +(at least for most versions of the umount utility) for umount of cifs +mounts, unless umount is invoked with -i (which will avoid invoking a umount +helper). As with mount.cifs, to enable user unmounts umount.cifs must be marked +as suid (e.g. "chmod +s /sbin/umount.cifs") or equivalent (some distributions +allow adding entries to the /etc/permissions file to achieve the +equivalent suid effect). For this utility to succeed the target path +must be a cifs mount, and the uid of the current user must match the uid +of the user who mounted the resource.
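+
+Putting the two sections above together, one possible setup for a single
+user mount (the share, path and option values below are examples only) is,
+as root:
+
+   chmod +s /sbin/mount.cifs
+   chmod +s /sbin/umount.cifs
+
+and an /etc/fstab entry such as
+
+   //server/usersharename /mnt/username cifs user 0 0
+
+after which an ordinary user can run "mount /mnt/username", and the same
+user can later run "umount /mnt/username", without root privileges.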
+ +Also note that the customary way of allowing user mounts and unmounts is +(instead of using mount.cifs and umount.cifs as suid) to add a line +to the file /etc/fstab for each //server/share you wish to mount, but +this can become unwieldy when potential mount targets include many +or unpredictable UNC names. + +Samba Considerations +==================== +To get the maximum benefit from the CIFS VFS, we recommend using a server that +supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or +Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers. +Note that uid, gid and file permissions will display default values if you do +not have a server that supports the Unix extensions for CIFS (such as Samba +2.2.5 or later). To enable the Unix CIFS Extensions in the Samba server, add +the line: + + unix extensions = yes + +to your smb.conf file on the server. Note that the following smb.conf settings +are also useful (on the Samba server) when the majority of clients are Unix or +Linux: + + case sensitive = yes + delete readonly = yes + ea support = yes + +Note that server ea support is required for supporting xattrs from the Linux +cifs client, and that EA support is present in later versions of Samba (e.g. +3.0.6 and later). EA support also works in all versions of Windows, at least to +shares on NTFS filesystems. Extended Attribute (xattr) support is an optional +feature of most Linux filesystems which may require enabling via +make menuconfig. Client support for extended attributes (user xattr) can be +disabled on a per-mount basis by specifying "nouser_xattr" on mount. + +The CIFS client can get and set POSIX ACLs (getfacl, setfacl) to Samba servers +version 3.10 and later. Setting POSIX ACLs requires enabling both XATTR and +then POSIX support in the CIFS configuration options when building the cifs +module. POSIX ACL support can be disabled on a per mount basis by specifying +"noacl" on mount. + +Some administrators may want to change Samba's smb.conf "map archive" and +"create mask" parameters from the default. Unless the create mask is changed, +newly created files can end up with an unnecessarily restrictive default mode, +which may not be what you want, although if the CIFS Unix extensions are +enabled on the server and client, subsequent setattr calls (e.g. chmod) can +fix the mode. Note that creating special devices (mknod) remotely +may require specifying a mkdev function to Samba if you are not using +Samba 3.0.6 or later. For more information on these see the manual pages +("man smb.conf") on the Samba server system. Note that the cifs vfs, +unlike the smbfs vfs, does not read the smb.conf on the client system +(the few optional settings are passed in on mount via -o parameters instead). +Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete +open files (required for strict POSIX compliance). Windows Servers already +supported this feature. Samba server does not allow symlinks that refer to files +outside of the share, so in Samba versions prior to 3.0.6, most symlinks to +files with absolute paths (ie beginning with slash) such as: + ln -s /mnt/foo bar +would be forbidden. Samba 3.0.6 server or later includes the ability to create +such symlinks safely by converting unsafe symlinks (ie symlinks to server +files that are outside of the share) to a samba specific format on the server +that is ignored by local server applications and non-cifs clients and that will +not be traversed by the Samba server.
This is opaque to the Linux client +application using the cifs vfs. Absolute symlinks will work to Samba 3.0.5 or +later, but only for remote clients using the CIFS Unix extensions, and will +be invisible to Windows clients and typically will not affect local +applications running on the same server as Samba. + +Use instructions: +================ +Once the CIFS VFS support is built into the kernel or installed as a module +(cifs.o), you can use mount syntax like the following to access Samba or Windows +servers: + + mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword + +Before -o the option -v may be specified to make the mount.cifs +mount helper display the mount steps more verbosely. +After -o the following commonly used cifs vfs specific options +are supported: + + user= + pass= + domain= + +Other cifs mount options are described below. Use of TCP names (in addition to +ip addresses) is available if the mount helper (mount.cifs) is installed. If +you do not trust the server to which you are mounted, or if you do not have +cifs signing enabled (and the physical network is insecure), consider use +of the standard mount options "noexec" and "nosuid" to reduce the risk of +running an altered binary on your local system (downloaded from a hostile server +or altered by a hostile router). + +Although mounting using a format corresponding to the CIFS URL specification is +not possible in mount.cifs yet, it is possible to use an alternate format +for the server and sharename (which is somewhat similar to NFS style mount +syntax) instead of the more widely used UNC format (i.e. \\server\share): + mount -t cifs tcp_name_of_server:share_name /mnt -o user=myname,pass=mypasswd + +When using the mount helper mount.cifs, passwords may be specified via alternate +mechanisms, instead of specifying them after -o using the normal "pass=" syntax +on the command line: +1) By including it in a credential file. Specify credentials=filename as one +of the mount options. Credential files contain two lines: + username=someuser + password=your_password +2) By specifying the password in the PASSWD environment variable (similarly +the user name can be taken from the USER environment variable). +3) By specifying the password in a file by name via PASSWD_FILE +4) By specifying the password in a file by file descriptor via PASSWD_FD + +If no password is provided, mount.cifs will prompt for password entry. + +Restrictions +============ +Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC +1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a +problem as most servers support this. + +Valid filenames differ between Windows and Linux. Windows typically restricts +filenames which contain certain reserved characters (e.g. the character : +which is used to delimit the beginning of a stream name by Windows), while +Linux allows a slightly wider set of valid characters in filenames. Windows +servers can remap such characters when an explicit mapping is specified in +the Server's registry. Samba starting with version 3.10 will allow such +filenames (ie those which contain valid Linux characters, which normally +would be forbidden for Windows/CIFS semantics) as long as the server is +configured for Unix Extensions (and the client has not disabled +/proc/fs/cifs/LinuxExtensionsEnabled). + + +CIFS VFS Mount Options +====================== +A partial list of the supported mount options follows: + user The user name to use when trying to establish + the CIFS session.
+ password The user password. If the mount helper is + installed, the user will be prompted for password + if not supplied. + ip The ip address of the target server + unc The target server Universal Network Name (export) to + mount. + domain Set the SMB/CIFS workgroup name prepended to the + username during CIFS session establishment + forceuid Set the default uid for inodes to the uid + passed in on mount. For mounts to servers + which do support the CIFS Unix extensions, such as a + properly configured Samba server, the server provides + the uid, gid and mode so this parameter should not be + specified unless the server and clients uid and gid + numbering differ. If the server and client are in the + same domain (e.g. running winbind or nss_ldap) and + the server supports the Unix Extensions then the uid + and gid can be retrieved from the server (and uid + and gid would not have to be specifed on the mount. + For servers which do not support the CIFS Unix + extensions, the default uid (and gid) returned on lookup + of existing files will be the uid (gid) of the person + who executed the mount (root, except when mount.cifs + is configured setuid for user mounts) unless the "uid=" + (gid) mount option is specified. Also note that permission + checks (authorization checks) on accesses to a file occur + at the server, but there are cases in which an administrator + may want to restrict at the client as well. For those + servers which do not report a uid/gid owner + (such as Windows), permissions can also be checked at the + client, and a crude form of client side permission checking + can be enabled by specifying file_mode and dir_mode on + the client. (default) + forcegid (similar to above but for the groupid instead of uid) (default) + noforceuid Fill in file owner information (uid) by requesting it from + the server if possible. With this option, the value given in + the uid= option (on mount) will only be used if the server + can not support returning uids on inodes. + noforcegid (similar to above but for the group owner, gid, instead of uid) + uid Set the default uid for inodes, and indicate to the + cifs kernel driver which local user mounted. If the server + supports the unix extensions the default uid is + not used to fill in the owner fields of inodes (files) + unless the "forceuid" parameter is specified. + gid Set the default gid for inodes (similar to above). + file_mode If CIFS Unix extensions are not supported by the server + this overrides the default mode for file inodes. + fsc Enable local disk caching using FS-Cache (off by default). This + option could be useful to improve performance on a slow link, + heavily loaded server and/or network where reading from the + disk is faster than reading from the server (over the network). + This could also impact scalability positively as the + number of calls to the server are reduced. However, local + caching is not suitable for all workloads for e.g. read-once + type workloads. So, you need to consider carefully your + workload/scenario before using this option. Currently, local + disk caching is functional for CIFS files opened as read-only. + dir_mode If CIFS Unix extensions are not supported by the server + this overrides the default mode for directory inodes. + port attempt to contact the server on this tcp port, before + trying the usual ports (port 445, then 139). + iocharset Codepage used to convert local path names to and from + Unicode. Unicode is used by default for network path + names if the server supports it. 
If iocharset is + not specified then the nls_default specified + during the local client kernel build will be used. + If server does not support Unicode, this parameter is + unused. + rsize default read size (usually 16K). The client currently + can not use rsize larger than CIFSMaxBufSize. CIFSMaxBufSize + defaults to 16K and may be changed (from 8K to the maximum + kmalloc size allowed by your kernel) at module install time + for cifs.ko. Setting CIFSMaxBufSize to a very large value + will cause cifs to use more memory and may reduce performance + in some cases. To use rsize greater than 127K (the original + cifs protocol maximum) also requires that the server support + a new Unix Capability flag (for very large read) which some + newer servers (e.g. Samba 3.0.26 or later) do. rsize can be + set from a minimum of 2048 to a maximum of 130048 (127K or + CIFSMaxBufSize, whichever is smaller) + wsize default write size (default 57344) + maximum wsize currently allowed by CIFS is 57344 (fourteen + 4096 byte pages) + actimeo=n attribute cache timeout in seconds (default 1 second). + After this timeout, the cifs client requests fresh attribute + information from the server. This option allows to tune the + attribute cache timeout to suit the workload needs. Shorter + timeouts mean better the cache coherency, but increased number + of calls to the server. Longer timeouts mean reduced number + of calls to the server at the expense of less stricter cache + coherency checks (i.e. incorrect attribute cache for a short + period of time). + rw mount the network share read-write (note that the + server may still consider the share read-only) + ro mount network share read-only + version used to distinguish different versions of the + mount helper utility (not typically needed) + sep if first mount option (after the -o), overrides + the comma as the separator between the mount + parms. e.g. + -o user=myname,password=mypassword,domain=mydom + could be passed instead with period as the separator by + -o sep=.user=myname.password=mypassword.domain=mydom + this might be useful when comma is contained within username + or password or domain. This option is less important + when the cifs mount helper cifs.mount (version 1.1 or later) + is used. + nosuid Do not allow remote executables with the suid bit + program to be executed. This is only meaningful for mounts + to servers such as Samba which support the CIFS Unix Extensions. + If you do not trust the servers in your network (your mount + targets) it is recommended that you specify this option for + greater security. + exec Permit execution of binaries on the mount. + noexec Do not permit execution of binaries on the mount. + dev Recognize block devices on the remote mount. + nodev Do not recognize devices on the remote mount. + suid Allow remote files on this mountpoint with suid enabled to + be executed (default for mounts when executed as root, + nosuid is default for user mounts). + credentials Although ignored by the cifs kernel component, it is used by + the mount helper, mount.cifs. When mount.cifs is installed it + opens and reads the credential file specified in order + to obtain the userid and password arguments which are passed to + the cifs vfs. + guest Although ignored by the kernel component, the mount.cifs + mount helper will not prompt the user for a password + if guest is specified on the mount options. If no + password is specified a null password will be used. 
+ perm Client does permission checks (vfs_permission check of uid + and gid of the file against the mode and desired operation). + Note that this is in addition to the normal ACL check on the + target machine done by the server software. + Client permission checking is enabled by default. + noperm Client does not do permission checks. This can expose + files on this mount to access by other users on the local + client system. It is typically only needed when the server + supports the CIFS Unix Extensions but the UIDs/GIDs on the + client and server system do not match closely enough to allow + access by the user doing the mount, but it may be useful with + non CIFS Unix Extension mounts for cases in which the default + mode is specified on the mount but is not to be enforced on the + client (e.g. perhaps when MultiUserMount is enabled). + Note that this does not affect the normal ACL check on the + target machine done by the server software (of the server + ACL against the user name provided at mount time). + serverino Use server's inode numbers instead of generating automatically + incrementing inode numbers on the client. Although this will + make it easier to spot hardlinked files (as they will have + the same inode numbers) and inode numbers may be persistent, + note that the server does not guarantee that the inode numbers + are unique if multiple server side mounts are exported under a + single share (since inode numbers on the servers might not + be unique if multiple filesystems are mounted under the same + shared higher level directory). Note that some older servers + (e.g. pre-Windows 2000) do not support returning UniqueIDs + or the CIFS Unix Extensions equivalent and for those + this mount option will have no effect. Exporting cifs mounts + under nfsd requires this mount option on the cifs mount. + This is now the default if server supports the + required network operation. + noserverino Client generates inode numbers (rather than using the actual one + from the server). These inode numbers will vary after + unmount or reboot which can confuse some applications, + but not all server filesystems support unique inode + numbers. + setuids If the CIFS Unix extensions are negotiated with the server + the client will attempt to set the effective uid and gid of + the local process on newly created files, directories, and + devices (create, mkdir, mknod). If the CIFS Unix Extensions + are not negotiated, for newly created files and directories + instead of using the default uid and gid specified on + the mount, cache the new file's uid and gid locally which means + that the uid for the file can change when the inode is + reloaded (or the user remounts the share). + nosetuids The client will not attempt to set the uid and gid + on newly created files, directories, and devices (create, + mkdir, mknod) which will result in the server setting the + uid and gid to the default (usually the server uid of the + user who mounted the share). Letting the server (rather than + the client) set the uid and gid is the default. If the CIFS + Unix Extensions are not negotiated then the uid and gid for + new files will appear to be the uid (gid) of the mounter or the + uid (gid) parameter specified on the mount. + netbiosname When mounting to servers via port 139, specifies the RFC1001 + source name to use to represent the client netbios machine + name when doing the RFC1001 netbios session initialize. + direct Do not do inode data caching on files opened on this mount.
+		This precludes mmapping files on this mount. In some cases
+		with fast networks and little or no caching benefits on the
+		client (e.g. when the application is doing large sequential
+		reads bigger than page size without rereading the same data)
+		this can provide better performance than the default
+		behavior which caches reads (readahead) and writes
+		(writebehind) through the local Linux client pagecache
+		if oplock (caching token) is granted and held. Note that
+		direct allows write operations larger than page size
+		to be sent to the server.
+  strictcache	Switch on strict cache mode. In this mode the client reads
+		from the cache as long as it holds a Level II oplock;
+		otherwise it reads from the server. All written data is
+		stored in the cache, but if the client does not hold an
+		Exclusive oplock it also writes the data through to the
+		server.
+  rwpidforward	Forward the pid of the process that opened a file on any
+		read or write operation on that file. This prevents
+		applications such as WINE from failing on reads and writes
+		when mandatory byte range (brlock) locking is in use.
+  acl		Allow setfacl and getfacl to manage posix ACLs if the server
+		supports them. (default)
+  noacl	Do not allow setfacl and getfacl calls on this mount
+  user_xattr	Allow getting and setting user xattrs (those attributes whose
+		name begins with "user." or "os2.") as OS/2 EAs (extended
+		attributes) to the server. This allows support of the
+		setfattr and getfattr utilities. (default)
+  nouser_xattr	Do not allow getfattr/setfattr to get/set/list xattrs
+  mapchars	Translate six of the seven reserved characters (not backslash)
+			*?<>|:
+		to the remap range (above 0xF000), which also
+		allows the CIFS client to recognize files created with
+		such characters by Windows's POSIX emulation. This can
+		also be useful when mounting to most versions of Samba
+		(which also forbids creating and opening files
+		whose names contain any of these seven characters).
+		This has no effect if the server does not support
+		Unicode on the wire.
+  nomapchars	Do not translate any of these seven characters (default).
+  nocase	Request case insensitive path name matching (case
+		sensitive is the default if the server supports it).
+		(mount option "ignorecase" is identical to "nocase")
+  posixpaths	If CIFS Unix extensions are supported, attempt to
+		negotiate posix path name support which allows certain
+		characters forbidden in typical CIFS filenames, without
+		requiring remapping. (default)
+  noposixpaths	If CIFS Unix extensions are supported, do not request
+		posix path name support (this may cause servers to
+		reject creating files with certain reserved characters).
+  nounix	Disable the CIFS Unix Extensions for this mount (tree
+		connection). This is rarely needed, but it may be useful
+		in order to turn off multiple settings all at once (ie
+		posix acls, posix locks, posix paths, symlink support
+		and retrieving uids/gids/mode from the server) or to
+		work around a bug in a server which implements the Unix
+		Extensions.
+  nobrl	Do not send byte range lock requests to the server.
+		This is necessary for certain applications that break
+		with cifs style mandatory byte range locks (and most
+		cifs servers do not yet support requesting advisory
+		byte range locks).
+  forcemandatorylock Even if the server supports posix (advisory) byte range
+		locking, send only mandatory lock requests.
For some
+		(presumably rare) applications, originally coded for
+		DOS/Windows, which require Windows style mandatory byte range
+		locking, this option may be useful, forcing the cifs client
+		to send only mandatory locks even if the cifs server would
+		support posix advisory locks. "forcemand" is accepted as a
+		shorter form of this mount option.
+  nostrictsync	If this mount option is set, when an application does an
+		fsync call then the cifs client does not send an SMB Flush
+		to the server (to force the server to write all dirty data
+		for this file immediately to disk), although cifs still sends
+		all dirty (cached) file data to the server and waits for the
+		server to respond to the write. Since SMB Flush can be
+		very slow, and some servers may be reliable enough (to risk
+		delaying slightly flushing the data to disk on the server),
+		turning on this option may be useful to improve performance for
+		applications that fsync too much, at a small risk of server
+		crash. If this mount option is not set, by default cifs will
+		send an SMB flush request (and wait for a response) on every
+		fsync call.
+  nodfs	Disable DFS (global name space support) even if the
+		server claims to support it. This can help work around
+		a problem with parsing of DFS paths with Samba server
+		versions 3.0.24 and 3.0.25.
+  remount	remount the share (often used to change from ro to rw mounts
+		or vice versa)
+  cifsacl	Report mode bits (e.g. on stat) based on the Windows ACL for
+		the file. (EXPERIMENTAL)
+  servern	Specify the server's netbios name (RFC1001 name) to use
+		when attempting to set up a session to the server.
+		This is needed for mounting to some older servers (such
+		as OS/2 or Windows 98 and Windows ME) since they do not
+		support a default server name. A server name can be up
+		to 15 characters long and is usually uppercased.
+  sfu		When the CIFS Unix Extensions are not negotiated, attempt to
+		create device files and fifos in a format compatible with
+		Services for Unix (SFU). In addition retrieve bits 10-12
+		of the mode via the SETFILEBITS extended attribute (as
+		SFU does). In the future the bottom 9 bits of the
+		mode also will be emulated using queries of the security
+		descriptor (ACL).
+  mfsymlinks	Enable support for Minshall+French symlinks
+		(see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
+		This option is ignored when specified together with the
+		'sfu' option. Minshall+French symlinks are used even if
+		the server supports the CIFS Unix Extensions.
+  sign		Must use packet signing (helps avoid unwanted data modification
+		by intermediate systems in the route). Note that signing
+		does not work with lanman or plaintext authentication.
+  seal		Must seal (encrypt) all data on this mounted share before
+		sending on the network. Requires support for Unix Extensions.
+		Note that this differs from the sign mount option in that it
+		causes encryption of data sent over this mounted share but other
+		shares mounted to the same server are unaffected.
+  locallease	This option is rarely needed. Fcntl F_SETLEASE is
+		used by some applications such as Samba and the NFSv4 server
+		to check whether a file is cacheable. CIFS has no way
+		to explicitly request a lease, but can check whether a file
+		is cacheable (oplocked).
Unfortunately, even if a file
+		is not oplocked, it could still be cacheable (ie the cifs
+		client could grant fcntl leases if no other local processes
+		are using the file) for cases such as when the server does not
+		support oplocks and the user is sure that the only updates to
+		the file will be from this client. Specifying this mount option
+		will allow the cifs client to check for leases (only) locally
+		for files which are not oplocked instead of denying leases
+		in that case. (EXPERIMENTAL)
+  sec		Security mode. Allowed values are:
+			none	attempt to connect as a null user (no name)
+			krb5	Use Kerberos version 5 authentication
+			krb5i	Use Kerberos authentication and packet signing
+			ntlm	Use NTLM password hashing (default)
+			ntlmi	Use NTLM password hashing with signing (if
+				/proc/fs/cifs/PacketSigningEnabled is on, or if
+				the server requires signing; this can also be
+				the default)
+			ntlmv2	Use NTLMv2 password hashing
+			ntlmv2i	Use NTLMv2 password hashing with packet signing
+			lanman	(if configured in kernel config) use older
+				lanman hash
+hard		Retry file operations if the server is not responding
+soft		Limit retries to unresponsive servers (usually only
+		one retry) before returning an error. (default)
+
+The mount.cifs mount helper also accepts a few mount options before -o
+including:
+
+	-S	take password from stdin (equivalent to setting the environment
+		variable "PASSWD_FD=0")
+	-V	print mount.cifs version
+	-?	display simple usage information
+
+With most 2.6 kernel versions of modutils, the version of the cifs kernel
+module can be displayed via modinfo.
+
+Misc /proc/fs/cifs Flags and Debug Info
+=======================================
+Informational pseudo-files:
+DebugData		Displays information about active CIFS sessions and
+			shares, features enabled as well as the cifs.ko
+			version.
+Stats			Lists summary resource usage information as well as per
+			share statistics, if CONFIG_CIFS_STATS is enabled
+			in the kernel configuration.
+
+Configuration pseudo-files:
+MultiuserMount		If set to one, more than one CIFS session to
+			the same server ip address can be established
+			if more than one uid accesses the same mount
+			point and if the uids' user/password mapping
+			information is available. (default is 0)
+PacketSigningEnabled	If set to one, cifs packet signing is enabled
+			and will be used if the server requires
+			it. If set to two, cifs packet signing is
+			required even if the server considers packet
+			signing optional. (default 1)
+SecurityFlags		Flags which control security negotiation and
+			also packet signing. Authentication (may/must)
+			flags (e.g. for NTLM and/or NTLMv2) may be combined with
+			the signing flags. Specifying two different password
+			hashing mechanisms (as "must use") on the other hand
+			does not make much sense. Default flags are
+				0x07007
+			(NTLM, NTLMv2 and packet signing allowed). The maximum
+			allowable flags if you want to allow mounts to servers
+			using weaker password hashes is 0x37037 (lanman,
+			plaintext, ntlm, ntlmv2, signing allowed). Some
+			SecurityFlags require the corresponding menuconfig
+			options to be enabled (lanman and plaintext require
+			CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
+			plaintext authentication currently requires also
+			enabling lanman authentication in the security flags
+			because the cifs module only supports sending
+			plaintext passwords using the older lanman dialect
+			form of the session setup SMB. (e.g. for authentication
+			using plain text passwords, set the SecurityFlags
+			to 0x30030):
+
+	may use packet signing				0x00001
+	must use packet signing			0x01001
+	may use NTLM (most common password hash)	0x00002
+	must use NTLM					0x02002
+	may use NTLMv2					0x00004
+	must use NTLMv2				0x04004
+	may use Kerberos security			0x00008
+	must use Kerberos				0x08008
+	may use lanman (weak) password hash		0x00010
+	must use lanman password hash			0x10010
+	may use plaintext passwords			0x00020
+	must use plaintext passwords			0x20020
+	(reserved for future packet encryption)	0x00040
+
+cifsFYI			If set to a non-zero value, additional debug information
+			will be logged to the system error log. This field
+			contains three flags controlling different classes of
+			debugging entries. The maximum value it can be set
+			to is 7 which enables all debugging points (default 0).
+			Some debugging statements are not compiled into the
+			cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
+			kernel configuration. cifsFYI may be set to one or
+			more of the following flags (7 sets them all):
+
+			log cifs informational messages			0x01
+			log return codes from cifs entry points		0x02
+			log slow responses (ie which take longer than 1 second)
+			  CONFIG_CIFS_STATS2 must be enabled in .config	0x04
+
+
+traceSMB		If set to one, debug information is logged to the
+			system error log with the start of smb requests
+			and responses (default 0)
+LookupCacheEnable	If set to one, inode information is kept cached
+			for one second improving performance of lookups
+			(default 1)
+OplockEnabled		If set to one, safe distributed caching enabled.
+			(default 1)
+LinuxExtensionsEnabled	If set to one then the client will attempt to
+			use the CIFS "UNIX" extensions which are optional
+			protocol enhancements that allow CIFS servers
+			to return accurate UID/GID information as well
+			as support symbolic links. If you use servers
+			such as Samba that support the CIFS Unix
+			extensions but do not want to use symbolic link
+			support and want to map the uid and gid fields
+			to values supplied at mount (rather than the
+			actual values), then set this to zero. (default 1)
+
+These experimental features and tracing can be enabled by changing flags in
+/proc/fs/cifs (after the cifs module has been installed or built into the
+kernel, e.g. insmod cifs). To enable a feature, set it to 1, e.g. to enable
+tracing to the kernel message log, type:
+
+	echo 7 > /proc/fs/cifs/cifsFYI
+
+cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel
+logging of various informational messages. 2 enables logging of non-zero
+SMB return codes while 4 enables logging of requests that take longer
+than one second to complete (except for byte range lock requests).
+Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the
+source code (typically by setting it in the beginning of cifsglob.h),
+and setting it to seven enables all three. Finally, tracing
+the start of smb requests and responses can be enabled via:
+
+	echo 1 > /proc/fs/cifs/traceSMB
+
+Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
+if the kernel was configured with cifs statistics enabled. The statistics
+represent the number of successful (ie zero return code from the server)
+SMB responses to some of the more common commands (open, delete, mkdir etc.).
+Also recorded is the total bytes read and bytes written to the server for
+that share. Note that due to client caching effects this can be less than the
+number of bytes read and written by the application running on the client.
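As an illustration (the reset behavior described here follows the
cifs_stats_proc_write handler added later in this patch, which zeroes the
per-share counters when '1', 'y' or '0' is written), the statistics can be
displayed or cleared from the shell:

	cat /proc/fs/cifs/Stats
	echo 1 > /proc/fs/cifs/Stats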
+The statistics for the number of total SMBs and oplock breaks are different in +that they represent all for that share, not just those for which the server +returned success. + +Also note that "cat /proc/fs/cifs/DebugData" will display information about +the active sessions and the shares that are mounted. + +Enabling Kerberos (extended security) works but requires version 1.2 or later +of the helper program cifs.upcall to be present and to be configured in the +/etc/request-key.conf file. The cifs.upcall helper program is from the Samba +project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not +require this helper. Note that NTLMv2 security (which does not require the +cifs.upcall helper program), instead of using Kerberos, is sufficient for +some use cases. + +DFS support allows transparent redirection to shares in an MS-DFS name space. +In addition, DFS support for target shares which are specified as UNC +names which begin with host names (rather than IP addresses) requires +a user space helper (such as cifs.upcall) to be present in order to +translate host names to ip address, and the user space helper must also +be configured in the file /etc/request-key.conf. Samba, Windows servers and +many NAS appliances support DFS as a way of constructing a global name +space to ease network configuration and improve reliability. + +To use cifs Kerberos and DFS support, the Linux keyutils package should be +installed and something like the following lines should be added to the +/etc/request-key.conf file: + +create cifs.spnego * * /usr/local/sbin/cifs.upcall %k +create dns_resolver * * /usr/local/sbin/cifs.upcall %k + + diff --git a/fs/cifs/TODO b/fs/cifs/TODO new file mode 100644 index 00000000..355abcdc --- /dev/null +++ b/fs/cifs/TODO @@ -0,0 +1,129 @@ +Version 1.53 May 20, 2008 + +A Partial List of Missing Features +================================== + +Contributions are welcome. There are plenty of opportunities +for visible, important contributions to this module. Here +is a partial list of the known problems and missing features: + +a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown +so that these operations can be supported to Windows servers + +b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS +SecurityDescriptors + +c) Better pam/winbind integration (e.g. to handle uid mapping +better) + +d) Cleanup now unneeded SessSetup code in +fs/cifs/connect.c and add back in NTLMSSP code if any servers +need it + +e) fix NTLMv2 signing when two mounts with different users to same +server. + +f) Directory entry caching relies on a 1 second timer, rather than +using FindNotify or equivalent. - (started) + +g) quota support (needs minor kernel change since quota calls +to make it to network filesystems or deviceless filesystems) + +h) investigate sync behavior (including syncpage) and check +for proper behavior of intr/nointr + +i) improve support for very old servers (OS/2 and Win9x for example) +Including support for changing the time remotely (utimes command). + +j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the +extra copy in/out of the socket buffers in some cases. + +k) Better optimize open (and pathbased setfilesize) to reduce the +oplock breaks coming from windows srv. Piggyback identical file +opens on top of each other by incrementing reference count rather +than resending (helps reduce server resource utilization and avoid +spurious oplock breaks). 
+
+l) Improve performance of readpages by sending more than one read
+at a time when 8 pages or more are requested. In conjunction,
+add support for async_cifs_readpages.
+
+m) Add support for storing symlink info to Windows servers
+in the Extended Attribute format their SFU clients would recognize.
+
+n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
+will autorefresh (partially complete by Asser). Needs minor kernel
+vfs change to support removing D_NOTIFY on a file.
+
+o) Add GUI tool to configure /proc/fs/cifs settings and for display of
+the CIFS statistics (started)
+
+p) implement support for security and trusted categories of xattrs
+(requires minor protocol extension) to enable better support for SELINUX
+
+q) Implement O_DIRECT flag on open (already supported on mount)
+
+r) Create UID mapping facility so server UIDs can be mapped on a per
+mount or a per server basis to client UIDs or nobody if no mapping
+exists. This is helpful when Unix extensions are negotiated to
+allow better permission checking when UIDs differ on the server
+and client. Add new protocol request to the CIFS protocol
+standard for asking the server for the corresponding name of a
+particular uid.
+
+s) Add support for CIFS Unix and also the newer POSIX extensions to the
+server side for Samba 4.
+
+t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
+need to add the ability to set the time on the server (utimes command)
+
+u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
+
+v) mount check for unmatched uids
+
+w) Add support for new vfs entry point for fallocate
+
+x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
+processes can proceed better in parallel (on the server)
+
+y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
+restriction of wsize max being 127K)
+
+KNOWN BUGS (updated April 24, 2007)
+====================================
+See http://bugzilla.samba.org - search on product "CifsVFS" for
+current bug list.
+
+1) existing symbolic links (Windows reparse points) are recognized but
+can not be created remotely. They are implemented for Samba and those that
+support the CIFS Unix extensions, although earlier versions of Samba
+overly restrict the pathnames.
+2) follow_link and readdir code does not follow dfs junctions
+but recognizes them
+3) create of new files to FAT partitions on Windows servers can
+succeed but still return access denied (appears to be Windows
+server not cifs client problem) and has not been reproduced recently.
+NTFS partitions do not have this problem.
+4) Unix/POSIX capabilities are reset after reconnection, and affect
+a few fields in the tree connection but we do not know which
+superblocks to apply these changes to. We should probably walk
+the list of superblocks to set these. Also need to check the
+flags on the second mount to the same share, and see if we
+can do the same trick that NFS does to remount duplicate shares.
+
+Misc testing to do
+==================
+1) check out max path names and max path name components against various server
+types. Try nested symlinks (8 deep). Return max path name in stat -f information
+
+2) Modify file portion of ltp so it can run against a mounted network
+share and run it against cifs vfs in automated fashion.
+ +3) Additional performance testing and optimization using iozone and similar - +there are some easy changes that can be done to parallelize sequential writes, +and when signing is disabled to request larger read sizes (larger than +negotiated size) and send larger write sizes to modern servers. + +4) More exhaustively test against less common servers. More testing +against Windows 9x, Windows ME servers. + diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c new file mode 100644 index 00000000..cfd1ce34 --- /dev/null +++ b/fs/cifs/asn1.c @@ -0,0 +1,666 @@ +/* + * The ASB.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in + * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifsproto.h" + +/***************************************************************************** + * + * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* Class */ +#define ASN1_UNI 0 /* Universal */ +#define ASN1_APL 1 /* Application */ +#define ASN1_CTX 2 /* Context */ +#define ASN1_PRV 3 /* Private */ + +/* Tag */ +#define ASN1_EOC 0 /* End Of Contents or N/A */ +#define ASN1_BOL 1 /* Boolean */ +#define ASN1_INT 2 /* Integer */ +#define ASN1_BTS 3 /* Bit String */ +#define ASN1_OTS 4 /* Octet String */ +#define ASN1_NUL 5 /* Null */ +#define ASN1_OJI 6 /* Object Identifier */ +#define ASN1_OJD 7 /* Object Description */ +#define ASN1_EXT 8 /* External */ +#define ASN1_ENUM 10 /* Enumerated */ +#define ASN1_SEQ 16 /* Sequence */ +#define ASN1_SET 17 /* Set */ +#define ASN1_NUMSTR 18 /* Numerical String */ +#define ASN1_PRNSTR 19 /* Printable String */ +#define ASN1_TEXSTR 20 /* Teletext String */ +#define ASN1_VIDSTR 21 /* Video String */ +#define ASN1_IA5STR 22 /* IA5 String */ +#define ASN1_UNITIM 23 /* Universal Time */ +#define ASN1_GENTIM 24 /* General Time */ +#define ASN1_GRASTR 25 /* Graphical String */ +#define ASN1_VISSTR 26 /* Visible String */ +#define ASN1_GENSTR 27 /* General String */ + +/* Primitive / Constructed methods*/ +#define ASN1_PRI 0 /* Primitive */ +#define ASN1_CON 1 /* Constructed */ + +/* + * Error codes. 
+ */ +#define ASN1_ERR_NOERROR 0 +#define ASN1_ERR_DEC_EMPTY 2 +#define ASN1_ERR_DEC_EOC_MISMATCH 3 +#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 +#define ASN1_ERR_DEC_BADVALUE 5 + +#define SPNEGO_OID_LEN 7 +#define NTLMSSP_OID_LEN 10 +#define KRB5_OID_LEN 7 +#define KRB5U2U_OID_LEN 8 +#define MSKRB5_OID_LEN 7 +static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; +static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; +static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; +static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; +static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; + +/* + * ASN.1 context. + */ +struct asn1_ctx { + int error; /* Error condition */ + unsigned char *pointer; /* Octet just to be decoded */ + unsigned char *begin; /* First octet */ + unsigned char *end; /* Octet after last octet */ +}; + +/* + * Octet string (not null terminated) + */ +struct asn1_octstr { + unsigned char *data; + unsigned int len; +}; + +static void +asn1_open(struct asn1_ctx *ctx, unsigned char *buf, unsigned int len) +{ + ctx->begin = buf; + ctx->end = buf + len; + ctx->pointer = buf; + ctx->error = ASN1_ERR_NOERROR; +} + +static unsigned char +asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) +{ + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + *ch = *(ctx->pointer)++; + return 1; +} + +#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */ +static unsigned char +asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val) +{ + unsigned char ch; + + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + + ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */ + if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ + *val = *(++(ctx->pointer)); /* value has enum value */ + else + return 0; + + ctx->pointer++; + return 1; +} +#endif + +static unsigned char +asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) +{ + unsigned char ch; + + *tag = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *tag <<= 7; + *tag |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char +asn1_id_decode(struct asn1_ctx *ctx, + unsigned int *cls, unsigned int *con, unsigned int *tag) +{ + unsigned char ch; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *cls = (ch & 0xC0) >> 6; + *con = (ch & 0x20) >> 5; + *tag = (ch & 0x1F); + + if (*tag == 0x1F) { + if (!asn1_tag_decode(ctx, tag)) + return 0; + } + return 1; +} + +static unsigned char +asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len) +{ + unsigned char ch, cnt; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch == 0x80) + *def = 0; + else { + *def = 1; + + if (ch < 0x80) + *len = ch; + else { + cnt = (unsigned char) (ch & 0x7F); + *len = 0; + + while (cnt > 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *len <<= 8; + *len |= ch; + cnt--; + } + } + } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + + return 1; +} + +static unsigned char +asn1_header_decode(struct asn1_ctx *ctx, + unsigned char **eoc, + unsigned int *cls, unsigned int *con, unsigned int *tag) +{ + unsigned int def = 0; + unsigned int len = 0; + + if (!asn1_id_decode(ctx, cls, con, tag)) + return 0; + + if (!asn1_length_decode(ctx, &def, &len)) + return 0; + + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + + if (def) + 
*eoc = ctx->pointer + len; + else + *eoc = NULL; + return 1; +} + +static unsigned char +asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + unsigned char ch; + + if (eoc == NULL) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + return 1; + } else { + if (ctx->pointer != eoc) { + ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; + return 0; + } + return 1; + } +} + +/* static unsigned char asn1_null_decode(struct asn1_ctx *ctx, + unsigned char *eoc) +{ + ctx->pointer = eoc; + return 1; +} + +static unsigned char asn1_long_decode(struct asn1_ctx *ctx, + unsigned char *eoc, long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = (signed char) ch; + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof(long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned int *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) + len = 0; + else + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof(unsigned int)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) + len = 0; + else + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof(unsigned long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char +asn1_octets_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned char **octets, unsigned int *len) +{ + unsigned char *ptr; + + *len = 0; + + *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); + if (*octets == NULL) { + return 0; + } + + ptr = *octets; + while (ctx->pointer < eoc) { + if (!asn1_octet_decode(ctx, (unsigned char *) ptr++)) { + kfree(*octets); + *octets = NULL; + return 0; + } + (*len)++; + } + return 1; +} */ + +static unsigned char +asn1_subid_decode(struct asn1_ctx *ctx, unsigned long *subid) +{ + unsigned char ch; + + *subid = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *subid <<= 7; + *subid |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static int +asn1_oid_decode(struct asn1_ctx *ctx, + unsigned char *eoc, unsigned long **oid, unsigned int *len) +{ + unsigned long subid; + unsigned int size; + unsigned long *optr; + + size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > UINT_MAX/sizeof(unsigned long)) + return 0; + + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); + if (*oid == NULL) + return 0; + + optr = *oid; + + if (!asn1_subid_decode(ctx, &subid)) { + kfree(*oid); + *oid = NULL; + return 0; + } + + if (subid < 40) { + optr[0] = 0; + optr[1] = subid; + } else if (subid < 80) { + optr[0] = 1; + 
optr[1] = subid - 40; + } else { + optr[0] = 2; + optr[1] = subid - 80; + } + + *len = 2; + optr += 2; + + while (ctx->pointer < eoc) { + if (++(*len) > size) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + kfree(*oid); + *oid = NULL; + return 0; + } + + if (!asn1_subid_decode(ctx, optr++)) { + kfree(*oid); + *oid = NULL; + return 0; + } + } + return 1; +} + +static int +compare_oid(unsigned long *oid1, unsigned int oid1len, + unsigned long *oid2, unsigned int oid2len) +{ + unsigned int i; + + if (oid1len != oid2len) + return 0; + else { + for (i = 0; i < oid1len; i++) { + if (oid1[i] != oid2[i]) + return 0; + } + return 1; + } +} + + /* BB check for endian conversion issues here */ + +int +decode_negTokenInit(unsigned char *security_blob, int length, + struct TCP_Server_Info *server) +{ + struct asn1_ctx ctx; + unsigned char *end; + unsigned char *sequence_end; + unsigned long *oid = NULL; + unsigned int cls, con, tag, oidlen, rc; + + /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ + + asn1_open(&ctx, security_blob, length); + + /* GSSAPI header */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding negTokenInit header"); + return 0; + } else if ((cls != ASN1_APL) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag); + return 0; + } + + /* Check for SPNEGO OID -- remember to free obj->oid */ + rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); + if (rc) { + if ((tag == ASN1_OJI) && (con == ASN1_PRI) && + (cls == ASN1_UNI)) { + rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); + if (rc) { + rc = compare_oid(oid, oidlen, SPNEGO_OID, + SPNEGO_OID_LEN); + kfree(oid); + } + } else + rc = 0; + } + + /* SPNEGO OID not present or garbled -- bail out */ + if (!rc) { + cFYI(1, "Error decoding negTokenInit header"); + return 0; + } + + /* SPNEGO */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding negTokenInit"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0", + cls, con, tag, end, *end); + return 0; + } + + /* negTokenInit */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding negTokenInit"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1", + cls, con, tag, end, *end); + return 0; + } + + /* sequence */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding 2nd part of negTokenInit"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON) + || (tag != ASN1_EOC)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0", + cls, con, tag, end, *end); + return 0; + } + + /* sequence of */ + if (asn1_header_decode + (&ctx, &sequence_end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding 2nd part of negTokenInit"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1", + cls, con, tag, end, *end); + return 0; + } + + /* list of security mechanisms */ + while (!asn1_eoc_decode(&ctx, sequence_end)) { + rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); + if (!rc) { + cFYI(1, "Error decoding negTokenInit hdr exit2"); + return 0; + } + if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { + if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { + + cFYI(1, "OID len = %d oid = 0x%lx 0x%lx " + "0x%lx 
0x%lx", oidlen, *oid, + *(oid + 1), *(oid + 2), *(oid + 3)); + + if (compare_oid(oid, oidlen, MSKRB5_OID, + MSKRB5_OID_LEN)) + server->sec_mskerberos = true; + else if (compare_oid(oid, oidlen, KRB5U2U_OID, + KRB5U2U_OID_LEN)) + server->sec_kerberosu2u = true; + else if (compare_oid(oid, oidlen, KRB5_OID, + KRB5_OID_LEN)) + server->sec_kerberos = true; + else if (compare_oid(oid, oidlen, NTLMSSP_OID, + NTLMSSP_OID_LEN)) + server->sec_ntlmssp = true; + + kfree(oid); + } + } else { + cFYI(1, "Should be an oid what is going on?"); + } + } + + /* mechlistMIC */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + /* Check if we have reached the end of the blob, but with + no mechListMic (e.g. NTLMSSP instead of KRB5) */ + if (ctx.error == ASN1_ERR_DEC_EMPTY) + goto decode_negtoken_exit; + cFYI(1, "Error decoding last part negTokenInit exit3"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { + /* tag = 3 indicating mechListMIC */ + cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end); + return 0; + } + + /* sequence */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding last part negTokenInit exit5"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_CON) + || (tag != ASN1_SEQ)) { + cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end); + } + + /* sequence of */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding last part negTokenInit exit 7"); + return 0; + } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { + cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end); + return 0; + } + + /* general string */ + if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { + cFYI(1, "Error decoding last part negTokenInit exit9"); + return 0; + } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) + || (tag != ASN1_GENSTR)) { + cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)", + cls, con, tag, end, *end); + return 0; + } + cFYI(1, "Need to call asn1_octets_decode() function for %s", + ctx.pointer); /* is this UTF-8 or ASCII? */ +decode_negtoken_exit: + return 1; +} diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c new file mode 100644 index 00000000..545509c3 --- /dev/null +++ b/fs/cifs/cache.c @@ -0,0 +1,333 @@ +/* + * fs/cifs/cache.c - CIFS filesystem cache index structure definitions + * + * Copyright (c) 2010 Novell, Inc. + * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifs_debug.h" + +/* + * CIFS filesystem definition for FS-Cache + */ +struct fscache_netfs cifs_fscache_netfs = { + .name = "cifs", + .version = 0, +}; + +/* + * Register CIFS for caching with FS-Cache + */ +int cifs_fscache_register(void) +{ + return fscache_register_netfs(&cifs_fscache_netfs); +} + +/* + * Unregister CIFS for caching + */ +void cifs_fscache_unregister(void) +{ + fscache_unregister_netfs(&cifs_fscache_netfs); +} + +/* + * Key layout of CIFS server cache index object + */ +struct cifs_server_key { + uint16_t family; /* address family */ + __be16 port; /* IP port */ + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + } addr[0]; +}; + +/* + * Server object keyed by {IPaddress,port,family} tuple + */ +static uint16_t cifs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct TCP_Server_Info *server = cookie_netfs_data; + const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; + const struct sockaddr_in *addr = (struct sockaddr_in *) sa; + const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; + struct cifs_server_key *key = buffer; + uint16_t key_len = sizeof(struct cifs_server_key); + + memset(key, 0, key_len); + + /* + * Should not be a problem as sin_family/sin6_family overlays + * sa_family field + */ + switch (sa->sa_family) { + case AF_INET: + key->family = sa->sa_family; + key->port = addr->sin_port; + key->addr[0].ipv4_addr = addr->sin_addr; + key_len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->family = sa->sa_family; + key->port = addr6->sin6_port; + key->addr[0].ipv6_addr = addr6->sin6_addr; + key_len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + cERROR(1, "Unknown network family '%d'", sa->sa_family); + key_len = 0; + break; + } + + return key_len; +} + +/* + * Server object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_server_index_def = { + .name = "CIFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_server_get_key, +}; + +/* + * Auxiliary data attached to CIFS superblock within the cache + */ +struct cifs_fscache_super_auxdata { + u64 resource_id; /* unique server resource id */ +}; + +static char *extract_sharename(const char *treename) +{ + const char *src; + char *delim, *dst; + int len; + + /* skip double chars at the beginning */ + src = treename + 2; + + /* share name is always preceded by '\\' now */ + delim = strchr(src, '\\'); + if (!delim) + return ERR_PTR(-EINVAL); + delim++; + len = strlen(delim); + + /* caller has to free the memory */ + dst = kstrndup(delim, len, GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + return dst; +} + +/* + * Superblock object currently keyed by share name + */ +static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + const struct cifs_tcon *tcon = cookie_netfs_data; + char *sharename; + uint16_t len; + + sharename = extract_sharename(tcon->treeName); + if (IS_ERR(sharename)) { + cFYI(1, "%s: couldn't extract sharename\n", __func__); + sharename = NULL; + return 0; + } + + len = strlen(sharename); + if (len > maxbuf) + return 0; + + memcpy(buffer, sharename, len); + + kfree(sharename); + + return len; +} + +static uint16_t +cifs_fscache_super_get_aux(const void 
*cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifs_tcon *tcon = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifs_tcon *tcon = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Superblock object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_super_index_def = { + .name = "CIFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_super_get_key, + .get_aux = cifs_fscache_super_get_aux, + .check_aux = cifs_fscache_super_check_aux, +}; + +/* + * Auxiliary data attached to CIFS inode within the cache + */ +struct cifs_fscache_inode_auxdata { + struct timespec last_write_time; + struct timespec last_change_time; + u64 eof; +}; + +static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + uint16_t keylen; + + /* use the UniqueId as the key */ + keylen = sizeof(cifsi->uniqueid); + if (keylen > maxbuf) + keylen = 0; + else + memcpy(buffer, &cifsi->uniqueid, keylen); + + return keylen; +} + +static void +cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + *size = cifsi->vfs_inode.i_size; +} + +static uint16_t +cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_inode_auxdata auxdata; + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_inode_auxdata auxdata; + struct cifsInodeInfo *cifsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct cifsInodeInfo *cifsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi); + + for (;;) { + nr_pages = pagevec_lookup(&pvec, + cifsi->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + 
ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def cifs_fscache_inode_object_def = { + .name = "CIFS.uniqueid", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = cifs_fscache_inode_get_key, + .get_attr = cifs_fscache_inode_get_attr, + .get_aux = cifs_fscache_inode_get_aux, + .check_aux = cifs_fscache_inode_check_aux, + .now_uncached = cifs_fscache_inode_now_uncached, +}; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c new file mode 100644 index 00000000..2fe3cf13 --- /dev/null +++ b/fs/cifs/cifs_debug.c @@ -0,0 +1,782 @@ +/* + * fs/cifs_debug.c + * + * Copyright (C) International Business Machines Corp., 2000,2005 + * + * Modified by Steve French (sfrench@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifsfs.h" + +void +cifs_dump_mem(char *label, void *data, int length) +{ + int i, j; + int *intptr = data; + char *charptr = data; + char buf[10], line[80]; + + printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n", + label, length, data); + for (i = 0; i < length; i += 16) { + line[0] = 0; + for (j = 0; (j < 4) && (i + j * 4 < length); j++) { + sprintf(buf, " %08x", intptr[i / 4 + j]); + strcat(line, buf); + } + buf[0] = ' '; + buf[2] = 0; + for (j = 0; (j < 16) && (i + j < length); j++) { + buf[1] = isprint(charptr[i + j]) ? 
charptr[i + j] : '.'; + strcat(line, buf); + } + printk(KERN_DEBUG "%s\n", line); + } +} + +#ifdef CONFIG_CIFS_DEBUG2 +void cifs_dump_detail(struct smb_hdr *smb) +{ + cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", + smb->Command, smb->Status.CifsError, + smb->Flags, smb->Flags2, smb->Mid, smb->Pid); + cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb)); +} + + +void cifs_dump_mids(struct TCP_Server_Info *server) +{ + struct list_head *tmp; + struct mid_q_entry *mid_entry; + + if (server == NULL) + return; + + cERROR(1, "Dump pending requests:"); + spin_lock(&GlobalMid_Lock); + list_for_each(tmp, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d", + mid_entry->midState, + (int)mid_entry->command, + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); +#ifdef CONFIG_CIFS_STATS2 + cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", + mid_entry->largeBuf, + mid_entry->resp_buf, + mid_entry->when_received, + jiffies); +#endif /* STATS2 */ + cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp, + mid_entry->multiEnd); + if (mid_entry->resp_buf) { + cifs_dump_detail(mid_entry->resp_buf); + cifs_dump_mem("existing buf: ", + mid_entry->resp_buf, 62); + } + } + spin_unlock(&GlobalMid_Lock); +} +#endif /* CONFIG_CIFS_DEBUG2 */ + +#ifdef CONFIG_PROC_FS +static int cifs_debug_data_proc_show(struct seq_file *m, void *v) +{ + struct list_head *tmp1, *tmp2, *tmp3; + struct mid_q_entry *mid_entry; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + int i, j; + __u32 dev_type; + + seq_puts(m, + "Display Internal CIFS Data Structures for Debugging\n" + "---------------------------------------------------\n"); + seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); + seq_printf(m, "Features:"); +#ifdef CONFIG_CIFS_DFS_UPCALL + seq_printf(m, " dfs"); +#endif +#ifdef CONFIG_CIFS_FSCACHE + seq_printf(m, " fscache"); +#endif +#ifdef CONFIG_CIFS_WEAK_PW_HASH + seq_printf(m, " lanman"); +#endif +#ifdef CONFIG_CIFS_POSIX + seq_printf(m, " posix"); +#endif +#ifdef CONFIG_CIFS_UPCALL + seq_printf(m, " spnego"); +#endif +#ifdef CONFIG_CIFS_XATTR + seq_printf(m, " xattr"); +#endif +#ifdef CONFIG_CIFS_ACL + seq_printf(m, " acl"); +#endif + seq_putc(m, '\n'); + seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); + seq_printf(m, "Servers:"); + + i = 0; + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp1, &cifs_tcp_ses_list) { + server = list_entry(tmp1, struct TCP_Server_Info, + tcp_ses_list); + i++; + list_for_each(tmp2, &server->smb_ses_list) { + ses = list_entry(tmp2, struct cifs_ses, + smb_ses_list); + if ((ses->serverDomain == NULL) || + (ses->serverOS == NULL) || + (ses->serverNOS == NULL)) { + seq_printf(m, "\n%d) entry for %s not fully " + "displayed\n\t", i, ses->serverName); + } else { + seq_printf(m, + "\n%d) Name: %s Domain: %s Uses: %d OS:" + " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB" + " session status: %d\t", + i, ses->serverName, ses->serverDomain, + ses->ses_count, ses->serverOS, ses->serverNOS, + ses->capabilities, ses->status); + } + seq_printf(m, "TCP status: %d\n\tLocal Users To " + "Server: %d SecMode: 0x%x Req On Wire: %d", + server->tcpStatus, server->srv_count, + server->sec_mode, + atomic_read(&server->inFlight)); + +#ifdef CONFIG_CIFS_STATS2 + seq_printf(m, " In Send: %d In MaxReq Wait: %d", + atomic_read(&server->inSend), + atomic_read(&server->num_waiters)); +#endif + + seq_puts(m, "\n\tShares:"); + j = 0; + 
list_for_each(tmp3, &ses->tcon_list) { + tcon = list_entry(tmp3, struct cifs_tcon, + tcon_list); + ++j; + dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); + seq_printf(m, "\n\t%d) %s Mounts: %d ", j, + tcon->treeName, tcon->tc_count); + if (tcon->nativeFileSystem) { + seq_printf(m, "Type: %s ", + tcon->nativeFileSystem); + } + seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" + "\nPathComponentMax: %d Status: 0x%d", + le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), + le32_to_cpu(tcon->fsAttrInfo.Attributes), + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), + tcon->tidStatus); + if (dev_type == FILE_DEVICE_DISK) + seq_puts(m, " type: DISK "); + else if (dev_type == FILE_DEVICE_CD_ROM) + seq_puts(m, " type: CDROM "); + else + seq_printf(m, " type: %d ", dev_type); + + if (tcon->need_reconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_putc(m, '\n'); + } + + seq_puts(m, "\n\tMIDs:\n"); + + spin_lock(&GlobalMid_Lock); + list_for_each(tmp3, &server->pending_mid_q) { + mid_entry = list_entry(tmp3, struct mid_q_entry, + qhead); + seq_printf(m, "\tState: %d com: %d pid:" + " %d cbdata: %p mid %d\n", + mid_entry->midState, + (int)mid_entry->command, + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); + } + spin_unlock(&GlobalMid_Lock); + } + } + spin_unlock(&cifs_tcp_ses_lock); + seq_putc(m, '\n'); + + /* BB add code to dump additional info such as TCP session info now */ + return 0; +} + +static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_debug_data_proc_show, NULL); +} + +static const struct file_operations cifs_debug_data_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_debug_data_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_CIFS_STATS +static ssize_t cifs_stats_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + struct list_head *tmp1, *tmp2, *tmp3; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + rc = get_user(c, buffer); + if (rc) + return rc; + + if (c == '1' || c == 'y' || c == 'Y' || c == '0') { +#ifdef CONFIG_CIFS_STATS2 + atomic_set(&totBufAllocCount, 0); + atomic_set(&totSmBufAllocCount, 0); +#endif /* CONFIG_CIFS_STATS2 */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp1, &cifs_tcp_ses_list) { + server = list_entry(tmp1, struct TCP_Server_Info, + tcp_ses_list); + list_for_each(tmp2, &server->smb_ses_list) { + ses = list_entry(tmp2, struct cifs_ses, + smb_ses_list); + list_for_each(tmp3, &ses->tcon_list) { + tcon = list_entry(tmp3, + struct cifs_tcon, + tcon_list); + atomic_set(&tcon->num_smbs_sent, 0); + atomic_set(&tcon->num_writes, 0); + atomic_set(&tcon->num_reads, 0); + atomic_set(&tcon->num_oplock_brks, 0); + atomic_set(&tcon->num_opens, 0); + atomic_set(&tcon->num_posixopens, 0); + atomic_set(&tcon->num_posixmkdirs, 0); + atomic_set(&tcon->num_closes, 0); + atomic_set(&tcon->num_deletes, 0); + atomic_set(&tcon->num_mkdirs, 0); + atomic_set(&tcon->num_rmdirs, 0); + atomic_set(&tcon->num_renames, 0); + atomic_set(&tcon->num_t2renames, 0); + atomic_set(&tcon->num_ffirst, 0); + atomic_set(&tcon->num_fnext, 0); + atomic_set(&tcon->num_fclose, 0); + atomic_set(&tcon->num_hardlinks, 0); + atomic_set(&tcon->num_symlinks, 0); + atomic_set(&tcon->num_locks, 0); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + } + + return count; +} + +static int cifs_stats_proc_show(struct seq_file *m, void *v) +{ + int i; + struct list_head *tmp1, *tmp2, 
*tmp3; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + seq_printf(m, + "Resources in use\nCIFS Session: %d\n", + sesInfoAllocCount.counter); + seq_printf(m, "Share (unique mount targets): %d\n", + tconInfoAllocCount.counter); + seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n", + bufAllocCount.counter, + cifs_min_rcv + tcpSesAllocCount.counter); + seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", + smBufAllocCount.counter, cifs_min_small); +#ifdef CONFIG_CIFS_STATS2 + seq_printf(m, "Total Large %d Small %d Allocations\n", + atomic_read(&totBufAllocCount), + atomic_read(&totSmBufAllocCount)); +#endif /* CONFIG_CIFS_STATS2 */ + + seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount)); + seq_printf(m, + "\n%d session %d share reconnects\n", + tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); + + seq_printf(m, + "Total vfs operations: %d maximum at one time: %d\n", + GlobalCurrentXid, GlobalMaxActiveXid); + + i = 0; + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp1, &cifs_tcp_ses_list) { + server = list_entry(tmp1, struct TCP_Server_Info, + tcp_ses_list); + list_for_each(tmp2, &server->smb_ses_list) { + ses = list_entry(tmp2, struct cifs_ses, + smb_ses_list); + list_for_each(tmp3, &ses->tcon_list) { + tcon = list_entry(tmp3, + struct cifs_tcon, + tcon_list); + i++; + seq_printf(m, "\n%d) %s", i, tcon->treeName); + if (tcon->need_reconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_printf(m, "\nSMBs: %d Oplock Breaks: %d", + atomic_read(&tcon->num_smbs_sent), + atomic_read(&tcon->num_oplock_brks)); + seq_printf(m, "\nReads: %d Bytes: %lld", + atomic_read(&tcon->num_reads), + (long long)(tcon->bytes_read)); + seq_printf(m, "\nWrites: %d Bytes: %lld", + atomic_read(&tcon->num_writes), + (long long)(tcon->bytes_written)); + seq_printf(m, "\nFlushes: %d", + atomic_read(&tcon->num_flushes)); + seq_printf(m, "\nLocks: %d HardLinks: %d " + "Symlinks: %d", + atomic_read(&tcon->num_locks), + atomic_read(&tcon->num_hardlinks), + atomic_read(&tcon->num_symlinks)); + seq_printf(m, "\nOpens: %d Closes: %d " + "Deletes: %d", + atomic_read(&tcon->num_opens), + atomic_read(&tcon->num_closes), + atomic_read(&tcon->num_deletes)); + seq_printf(m, "\nPosix Opens: %d " + "Posix Mkdirs: %d", + atomic_read(&tcon->num_posixopens), + atomic_read(&tcon->num_posixmkdirs)); + seq_printf(m, "\nMkdirs: %d Rmdirs: %d", + atomic_read(&tcon->num_mkdirs), + atomic_read(&tcon->num_rmdirs)); + seq_printf(m, "\nRenames: %d T2 Renames %d", + atomic_read(&tcon->num_renames), + atomic_read(&tcon->num_t2renames)); + seq_printf(m, "\nFindFirst: %d FNext %d " + "FClose %d", + atomic_read(&tcon->num_ffirst), + atomic_read(&tcon->num_fnext), + atomic_read(&tcon->num_fclose)); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + + seq_putc(m, '\n'); + return 0; +} + +static int cifs_stats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_stats_proc_show, NULL); +} + +static const struct file_operations cifs_stats_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_stats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_stats_proc_write, +}; +#endif /* STATS */ + +static struct proc_dir_entry *proc_fs_cifs; +static const struct file_operations cifsFYI_proc_fops; +static const struct file_operations cifs_oplock_proc_fops; +static const struct file_operations cifs_lookup_cache_proc_fops; +static const struct file_operations traceSMB_proc_fops; +static const struct 
file_operations cifs_multiuser_mount_proc_fops; +static const struct file_operations cifs_security_flags_proc_fops; +static const struct file_operations cifs_linux_ext_proc_fops; + +void +cifs_proc_init(void) +{ + proc_fs_cifs = proc_mkdir("fs/cifs", NULL); + if (proc_fs_cifs == NULL) + return; + + proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); + +#ifdef CONFIG_CIFS_STATS + proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops); +#endif /* STATS */ + proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); + proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); + proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, + &cifs_linux_ext_proc_fops); + proc_create("MultiuserMount", 0, proc_fs_cifs, + &cifs_multiuser_mount_proc_fops); + proc_create("SecurityFlags", 0, proc_fs_cifs, + &cifs_security_flags_proc_fops); + proc_create("LookupCacheEnabled", 0, proc_fs_cifs, + &cifs_lookup_cache_proc_fops); +} + +void +cifs_proc_clean(void) +{ + if (proc_fs_cifs == NULL) + return; + + remove_proc_entry("DebugData", proc_fs_cifs); + remove_proc_entry("cifsFYI", proc_fs_cifs); + remove_proc_entry("traceSMB", proc_fs_cifs); +#ifdef CONFIG_CIFS_STATS + remove_proc_entry("Stats", proc_fs_cifs); +#endif + remove_proc_entry("MultiuserMount", proc_fs_cifs); + remove_proc_entry("OplockEnabled", proc_fs_cifs); + remove_proc_entry("SecurityFlags", proc_fs_cifs); + remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); + remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); + remove_proc_entry("fs/cifs", NULL); +} + +static int cifsFYI_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", cifsFYI); + return 0; +} + +static int cifsFYI_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifsFYI_proc_show, NULL); +} + +static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + cifsFYI = 0; + else if (c == '1' || c == 'y' || c == 'Y') + cifsFYI = 1; + else if ((c > '1') && (c <= '9')) + cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */ + + return count; +} + +static const struct file_operations cifsFYI_proc_fops = { + .owner = THIS_MODULE, + .open = cifsFYI_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifsFYI_proc_write, +}; + +static int cifs_oplock_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", oplockEnabled); + return 0; +} + +static int cifs_oplock_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_oplock_proc_show, NULL); +} + +static ssize_t cifs_oplock_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + oplockEnabled = 0; + else if (c == '1' || c == 'y' || c == 'Y') + oplockEnabled = 1; + + return count; +} + +static const struct file_operations cifs_oplock_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_oplock_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_oplock_proc_write, +}; + +static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", linuxExtEnabled); + return 0; +} + +static int cifs_linux_ext_proc_open(struct inode *inode, 
struct file *file) +{ + return single_open(file, cifs_linux_ext_proc_show, NULL); +} + +static ssize_t cifs_linux_ext_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + linuxExtEnabled = 0; + else if (c == '1' || c == 'y' || c == 'Y') + linuxExtEnabled = 1; + + return count; +} + +static const struct file_operations cifs_linux_ext_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_linux_ext_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_linux_ext_proc_write, +}; + +static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", lookupCacheEnabled); + return 0; +} + +static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_lookup_cache_proc_show, NULL); +} + +static ssize_t cifs_lookup_cache_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + lookupCacheEnabled = 0; + else if (c == '1' || c == 'y' || c == 'Y') + lookupCacheEnabled = 1; + + return count; +} + +static const struct file_operations cifs_lookup_cache_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_lookup_cache_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_lookup_cache_proc_write, +}; + +static int traceSMB_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", traceSMB); + return 0; +} + +static int traceSMB_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, traceSMB_proc_show, NULL); +} + +static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + traceSMB = 0; + else if (c == '1' || c == 'y' || c == 'Y') + traceSMB = 1; + + return count; +} + +static const struct file_operations traceSMB_proc_fops = { + .owner = THIS_MODULE, + .open = traceSMB_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = traceSMB_proc_write, +}; + +static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", multiuser_mount); + return 0; +} + +static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh) +{ + return single_open(fh, cifs_multiuser_mount_proc_show, NULL); +} + +static ssize_t cifs_multiuser_mount_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + multiuser_mount = 0; + else if (c == '1' || c == 'y' || c == 'Y') + multiuser_mount = 1; + + return count; +} + +static const struct file_operations cifs_multiuser_mount_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_multiuser_mount_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_multiuser_mount_proc_write, +}; + +static int cifs_security_flags_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x\n", global_secflags); + return 0; +} + +static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_security_flags_proc_show, 
NULL); +} + +static ssize_t cifs_security_flags_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + unsigned int flags; + char flags_string[12]; + char c; + + if ((count < 1) || (count > 11)) + return -EINVAL; + + memset(flags_string, 0, 12); + + if (copy_from_user(flags_string, buffer, count)) + return -EFAULT; + + if (count < 3) { + /* single char or single char followed by null */ + c = flags_string[0]; + if (c == '0' || c == 'n' || c == 'N') { + global_secflags = CIFSSEC_DEF; /* default */ + return count; + } else if (c == '1' || c == 'y' || c == 'Y') { + global_secflags = CIFSSEC_MAX; + return count; + } else if (!isdigit(c)) { + cERROR(1, "invalid flag %c", c); + return -EINVAL; + } + } + /* else we have a number */ + + flags = simple_strtoul(flags_string, NULL, 0); + + cFYI(1, "sec flags 0x%x", flags); + + if (flags <= 0) { + cERROR(1, "invalid security flags %s", flags_string); + return -EINVAL; + } + + if (flags & ~CIFSSEC_MASK) { + cERROR(1, "attempt to set unsupported security flags 0x%x", + flags & ~CIFSSEC_MASK); + return -EINVAL; + } + /* flags look ok - update the global security flags for cifs module */ + global_secflags = flags; + if (global_secflags & CIFSSEC_MUST_SIGN) { + /* requiring signing implies signing is allowed */ + global_secflags |= CIFSSEC_MAY_SIGN; + cFYI(1, "packet signing now required"); + } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) { + cFYI(1, "packet signing disabled"); + } + /* BB should we turn on MAY flags for other MUST options? */ + return count; +} + +static const struct file_operations cifs_security_flags_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_security_flags_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_security_flags_proc_write, +}; +#else +inline void cifs_proc_init(void) +{ +} + +inline void cifs_proc_clean(void) +{ +} +#endif /* PROC_FS */ diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h new file mode 100644 index 00000000..8942b28c --- /dev/null +++ b/fs/cifs/cifs_debug.h @@ -0,0 +1,96 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000,2002 + * Modified by Steve French (sfrench@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ +#define CIFS_DEBUG /* BB temporary */ + +#ifndef _H_CIFS_DEBUG +#define _H_CIFS_DEBUG + +void cifs_dump_mem(char *label, void *data, int length); +#ifdef CONFIG_CIFS_DEBUG2 +#define DBG2 2 +void cifs_dump_detail(struct smb_hdr *); +void cifs_dump_mids(struct TCP_Server_Info *); +#else +#define DBG2 0 +#endif +extern int traceSMB; /* flag which enables the function below */ +void dump_smb(struct smb_hdr *, int); +#define CIFS_INFO 0x01 +#define CIFS_RC 0x02 +#define CIFS_TIMER 0x04 + +/* + * debug ON + * -------- + */ +#ifdef CIFS_DEBUG + +/* information message: e.g., configuration, major event */ +extern int cifsFYI; +#define cifsfyi(fmt, arg...) \ +do { \ + if (cifsFYI & CIFS_INFO) \ + printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \ +} while (0) + +#define cFYI(set, fmt, arg...) \ +do { \ + if (set) \ + cifsfyi(fmt, ##arg); \ +} while (0) + +#define cifswarn(fmt, arg...) \ + printk(KERN_WARNING fmt "\n", ##arg) + +/* debug event message: */ +extern int cifsERROR; + +#define cEVENT(fmt, arg...) \ +do { \ + if (cifsERROR) \ + printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \ +} while (0) + +/* error event message: e.g., i/o error */ +#define cifserror(fmt, arg...) \ +do { \ + if (cifsERROR) \ + printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \ +} while (0) + +#define cERROR(set, fmt, arg...) \ +do { \ + if (set) \ + cifserror(fmt, ##arg); \ +} while (0) + +/* + * debug OFF + * --------- + */ +#else /* _CIFS_DEBUG */ +#define cERROR(set, fmt, arg...) +#define cEVENT(fmt, arg...) +#define cFYI(set, fmt, arg...) +#define cifserror(fmt, arg...) +#endif /* _CIFS_DEBUG */ + +#endif /* _H_CIFS_DEBUG */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c new file mode 100644 index 00000000..8d8f28c9 --- /dev/null +++ b/fs/cifs/cifs_dfs_ref.c @@ -0,0 +1,370 @@ +/* + * Contains the CIFS DFS referral mounting routines used for handling + * traversal via DFS junction point + * + * Copyright (c) 2007 Igor Mammedov + * Copyright (C) International Business Machines Corp., 2008 + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifsfs.h" +#include "dns_resolve.h" +#include "cifs_debug.h" + +static LIST_HEAD(cifs_dfs_automount_list); + +static void cifs_dfs_expire_automounts(struct work_struct *work); +static DECLARE_DELAYED_WORK(cifs_dfs_automount_task, + cifs_dfs_expire_automounts); +static int cifs_dfs_mountpoint_expiry_timeout = 500 * HZ; + +static void cifs_dfs_expire_automounts(struct work_struct *work) +{ + struct list_head *list = &cifs_dfs_automount_list; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); +} + +void cifs_dfs_release_automount_timer(void) +{ + BUG_ON(!list_empty(&cifs_dfs_automount_list)); + cancel_delayed_work_sync(&cifs_dfs_automount_task); +} + +/** + * cifs_get_share_name - extracts share name from UNC + * @node_name: pointer to UNC string + * + * Extracts sharename form full UNC. + * i.e. strips from UNC trailing path that is not part of share + * name and fixup missing '\' in the beginning of DFS node refferal + * if necessary. + * Returns pointer to share name on success or ERR_PTR on error. + * Caller is responsible for freeing returned string. + */ +static char *cifs_get_share_name(const char *node_name) +{ + int len; + char *UNC; + char *pSep; + + len = strlen(node_name); + UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, + GFP_KERNEL); + if (!UNC) + return ERR_PTR(-ENOMEM); + + /* get share name and server name */ + if (node_name[1] != '\\') { + UNC[0] = '\\'; + strncpy(UNC+1, node_name, len); + len++; + UNC[len] = 0; + } else { + strncpy(UNC, node_name, len); + UNC[len] = 0; + } + + /* find server name end */ + pSep = memchr(UNC+2, '\\', len-2); + if (!pSep) { + cERROR(1, "%s: no server name end in node name: %s", + __func__, node_name); + kfree(UNC); + return ERR_PTR(-EINVAL); + } + + /* find sharename end */ + pSep++; + pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC)); + if (pSep) { + /* trim path up to sharename end + * now we have share name in UNC */ + *pSep = 0; + } + + return UNC; +} + + +/** + * cifs_compose_mount_options - creates mount options for refferral + * @sb_mountdata: parent/root DFS mount options (template) + * @fullpath: full path in UNC format + * @ref: server's referral + * @devname: pointer for saving device name + * + * creates mount options for submount based on template options sb_mountdata + * and replacing unc,ip,prefixpath options with ones we've got form ref_unc. + * + * Returns: pointer to new mount options or ERR_PTR. + * Caller is responcible for freeing retunrned value if it is not error. + */ +char *cifs_compose_mount_options(const char *sb_mountdata, + const char *fullpath, + const struct dfs_info3_param *ref, + char **devname) +{ + int rc; + char *mountdata = NULL; + int md_len; + char *tkn_e; + char *srvIP = NULL; + char sep = ','; + int off, noff; + + if (sb_mountdata == NULL) + return ERR_PTR(-EINVAL); + + *devname = cifs_get_share_name(ref->node_name); + if (IS_ERR(*devname)) { + rc = PTR_ERR(*devname); + *devname = NULL; + goto compose_mount_options_err; + } + + rc = dns_resolve_server_name_to_ip(*devname, &srvIP); + if (rc < 0) { + cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", + __func__, *devname, rc); + goto compose_mount_options_err; + } + /* md_len = strlen(...) 
+ 12 for 'sep+prefixpath=' + * assuming that we have 'unc=' and 'ip=' in + * the original sb_mountdata + */ + md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12; + mountdata = kzalloc(md_len+1, GFP_KERNEL); + if (mountdata == NULL) { + rc = -ENOMEM; + goto compose_mount_options_err; + } + + /* copy all options except of unc,ip,prefixpath */ + off = 0; + if (strncmp(sb_mountdata, "sep=", 4) == 0) { + sep = sb_mountdata[4]; + strncpy(mountdata, sb_mountdata, 5); + off += 5; + } + + do { + tkn_e = strchr(sb_mountdata + off, sep); + if (tkn_e == NULL) + noff = strlen(sb_mountdata + off); + else + noff = tkn_e - (sb_mountdata + off) + 1; + + if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) { + off += noff; + continue; + } + if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) { + off += noff; + continue; + } + if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) { + off += noff; + continue; + } + strncat(mountdata, sb_mountdata + off, noff); + off += noff; + } while (tkn_e); + strcat(mountdata, sb_mountdata + off); + mountdata[md_len] = '\0'; + + /* copy new IP and ref share name */ + if (mountdata[strlen(mountdata) - 1] != sep) + strncat(mountdata, &sep, 1); + strcat(mountdata, "ip="); + strcat(mountdata, srvIP); + strncat(mountdata, &sep, 1); + strcat(mountdata, "unc="); + strcat(mountdata, *devname); + + /* find & copy prefixpath */ + tkn_e = strchr(ref->node_name + 2, '\\'); + if (tkn_e == NULL) { + /* invalid unc, missing share name*/ + rc = -EINVAL; + goto compose_mount_options_err; + } + + tkn_e = strchr(tkn_e + 1, '\\'); + if (tkn_e || (strlen(fullpath) - ref->path_consumed)) { + strncat(mountdata, &sep, 1); + strcat(mountdata, "prefixpath="); + if (tkn_e) + strcat(mountdata, tkn_e + 1); + strcat(mountdata, fullpath + ref->path_consumed); + } + + /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/ + /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/ + +compose_mount_options_out: + kfree(srvIP); + return mountdata; + +compose_mount_options_err: + kfree(mountdata); + mountdata = ERR_PTR(rc); + goto compose_mount_options_out; +} + +/** + * cifs_dfs_do_refmount - mounts specified path using provided refferal + * @cifs_sb: parent/root superblock + * @fullpath: full path in UNC format + * @ref: server's referral + */ +static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, + const char *fullpath, const struct dfs_info3_param *ref) +{ + struct vfsmount *mnt; + char *mountdata; + char *devname = NULL; + + /* strip first '\' from fullpath */ + mountdata = cifs_compose_mount_options(cifs_sb->mountdata, + fullpath + 1, ref, &devname); + + if (IS_ERR(mountdata)) + return (struct vfsmount *)mountdata; + + mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + kfree(mountdata); + kfree(devname); + return mnt; + +} + +static void dump_referral(const struct dfs_info3_param *ref) +{ + cFYI(1, "DFS: ref path: %s", ref->path_name); + cFYI(1, "DFS: node path: %s", ref->node_name); + cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type); + cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, + ref->path_consumed); +} + +/* + * Create a vfsmount that we can automount + */ +static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) +{ + struct dfs_info3_param *referrals = NULL; + unsigned int num_referrals = 0; + struct cifs_sb_info *cifs_sb; + struct cifs_ses *ses; + char *full_path; + int xid, i; + int rc; + struct vfsmount *mnt; + struct tcon_link *tlink; + + cFYI(1, "in %s", __func__); + 
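+	/*
+	 * Outline of the automount steps below: build the UNC path for this
+	 * dentry, ask the server for DFS referrals on that path, then try
+	 * each referral in turn via cifs_dfs_do_refmount() until one yields
+	 * a usable vfsmount.  As a purely hypothetical example, a parent
+	 * mount with options "user=guest,unc=\\srvA\dfsroot" that is
+	 * referred to \srvB\share would be submounted with options of
+	 * roughly the form
+	 * "user=guest,ip=<srvB address>,unc=\\srvB\share[,prefixpath=...]",
+	 * as composed by cifs_compose_mount_options() above.
+	 */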
BUG_ON(IS_ROOT(mntpt)); + + /* + * The MSDFS spec states that paths in DFS referral requests and + * responses must be prefixed by a single '\' character instead of + * the double backslashes usually used in the UNC. This function + * gives us the latter, so we must adjust the result. + */ + mnt = ERR_PTR(-ENOMEM); + full_path = build_path_from_dentry(mntpt); + if (full_path == NULL) + goto cdda_exit; + + cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + mnt = ERR_CAST(tlink); + goto free_full_path; + } + ses = tlink_tcon(tlink)->ses; + + xid = GetXid(); + rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, + &num_referrals, &referrals, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + FreeXid(xid); + + cifs_put_tlink(tlink); + + mnt = ERR_PTR(-ENOENT); + for (i = 0; i < num_referrals; i++) { + int len; + dump_referral(referrals + i); + /* connect to a node */ + len = strlen(referrals[i].node_name); + if (len < 2) { + cERROR(1, "%s: Net Address path too short: %s", + __func__, referrals[i].node_name); + mnt = ERR_PTR(-EINVAL); + break; + } + mnt = cifs_dfs_do_refmount(cifs_sb, + full_path, referrals + i); + cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, + referrals[i].node_name, mnt); + if (!IS_ERR(mnt)) + goto success; + } + + /* no valid submounts were found; return error from get_dfs_path() by + * preference */ + if (rc != 0) + mnt = ERR_PTR(rc); + +success: + free_dfs_info_array(referrals, num_referrals); +free_full_path: + kfree(full_path); +cdda_exit: + cFYI(1, "leaving %s" , __func__); + return mnt; +} + +/* + * Attempt to automount the referral + */ +struct vfsmount *cifs_dfs_d_automount(struct path *path) +{ + struct vfsmount *newmnt; + + cFYI(1, "in %s", __func__); + + newmnt = cifs_dfs_do_automount(path->dentry); + if (IS_ERR(newmnt)) { + cFYI(1, "leaving %s [automount failed]" , __func__); + return newmnt; + } + + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &cifs_dfs_automount_list); + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); + cFYI(1, "leaving %s [ok]" , __func__); + return newmnt; +} + +const struct inode_operations cifs_dfs_referral_inode_operations = { +}; diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h new file mode 100644 index 00000000..7260e11e --- /dev/null +++ b/fs/cifs/cifs_fs_sb.h @@ -0,0 +1,65 @@ +/* + * fs/cifs/cifs_fs_sb.h + * + * Copyright (c) International Business Machines Corp., 2002,2004 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + */ +#include + +#ifndef _CIFS_FS_SB_H +#define _CIFS_FS_SB_H + +#include + +#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ +#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. 
*/ +#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ +#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */ +#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */ +#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */ +#define CIFS_MOUNT_POSIX_PATHS 0x40 /* Negotiate posix pathnames if possible*/ +#define CIFS_MOUNT_UNX_EMUL 0x80 /* Network compat with SFUnix emulation */ +#define CIFS_MOUNT_NO_BRL 0x100 /* No sending byte range locks to srv */ +#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */ +#define CIFS_MOUNT_OVERR_UID 0x400 /* override uid returned from server */ +#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */ +#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ +#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ +#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ +#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ +#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */ +#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ +#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ +#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ + +struct cifs_sb_info { + struct rb_root tlink_tree; + spinlock_t tlink_tree_lock; + struct tcon_link *master_tlink; + struct nls_table *local_nls; + unsigned int rsize; + unsigned int wsize; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ + atomic_t active; + uid_t mnt_uid; + gid_t mnt_gid; + mode_t mnt_file_mode; + mode_t mnt_dir_mode; + unsigned int mnt_cifs_flags; + char *mountdata; /* options received at mount time or via DFS refs */ + struct backing_dev_info bdi; + struct delayed_work prune_tlinks; +}; +#endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c new file mode 100644 index 00000000..2272fd5f --- /dev/null +++ b/fs/cifs/cifs_spnego.c @@ -0,0 +1,175 @@ +/* + * fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS + * + * Copyright (c) 2007 Red Hat, Inc. + * Author(s): Jeff Layton (jlayton@redhat.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "cifsglob.h" +#include "cifs_spnego.h" +#include "cifs_debug.h" + +/* create a new cifs key */ +static int +cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen) +{ + char *payload; + int ret; + + ret = -ENOMEM; + payload = kmalloc(datalen, GFP_KERNEL); + if (!payload) + goto error; + + /* attach the data */ + memcpy(payload, data, datalen); + key->payload.data = payload; + ret = 0; + +error: + return ret; +} + +static void +cifs_spnego_key_destroy(struct key *key) +{ + kfree(key->payload.data); +} + + +/* + * keytype for CIFS spnego keys + */ +struct key_type cifs_spnego_key_type = { + .name = "cifs.spnego", + .instantiate = cifs_spnego_key_instantiate, + .match = user_match, + .destroy = cifs_spnego_key_destroy, + .describe = user_describe, +}; + +/* length of longest version string e.g. strlen("ver=0xFF") */ +#define MAX_VER_STR_LEN 8 + +/* length of longest security mechanism name, eg in future could have + * strlen(";sec=ntlmsspi") */ +#define MAX_MECH_STR_LEN 13 + +/* strlen of "host=" */ +#define HOST_KEY_LEN 5 + +/* strlen of ";ip4=" or ";ip6=" */ +#define IP_KEY_LEN 5 + +/* strlen of ";uid=0x" */ +#define UID_KEY_LEN 7 + +/* strlen of ";creduid=0x" */ +#define CREDUID_KEY_LEN 11 + +/* strlen of ";user=" */ +#define USER_KEY_LEN 6 + +/* strlen of ";pid=0x" */ +#define PID_KEY_LEN 7 + +/* get a key struct with a SPNEGO security blob, suitable for session setup */ +struct key * +cifs_get_spnego_key(struct cifs_ses *sesInfo) +{ + struct TCP_Server_Info *server = sesInfo->server; + struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; + char *description, *dp; + size_t desc_len; + struct key *spnego_key; + const char *hostname = server->hostname; + + /* length of fields (with semicolons): ver=0xyz ip4=ipaddress + host=hostname sec=mechanism uid=0xFF user=username */ + desc_len = MAX_VER_STR_LEN + + HOST_KEY_LEN + strlen(hostname) + + IP_KEY_LEN + INET6_ADDRSTRLEN + + MAX_MECH_STR_LEN + + UID_KEY_LEN + (sizeof(uid_t) * 2) + + CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + + USER_KEY_LEN + strlen(sesInfo->user_name) + + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; + + spnego_key = ERR_PTR(-ENOMEM); + description = kzalloc(desc_len, GFP_KERNEL); + if (description == NULL) + goto out; + + dp = description; + /* start with version and hostname portion of UNC string */ + spnego_key = ERR_PTR(-EINVAL); + sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION, + hostname); + dp = description + strlen(description); + + /* add the server address */ + if (server->dstaddr.ss_family == AF_INET) + sprintf(dp, "ip4=%pI4", &sa->sin_addr); + else if (server->dstaddr.ss_family == AF_INET6) + sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); + else + goto out; + + dp = description + strlen(description); + + /* for now, only sec=krb5 and sec=mskrb5 are valid */ + if (server->sec_kerberos) + sprintf(dp, ";sec=krb5"); + else if (server->sec_mskerberos) + sprintf(dp, ";sec=mskrb5"); + else + goto out; + + dp = description + strlen(description); + sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); + + dp = description + strlen(description); + sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); + + dp = description + 
strlen(description); + sprintf(dp, ";user=%s", sesInfo->user_name); + + dp = description + strlen(description); + sprintf(dp, ";pid=0x%x", current->pid); + + cFYI(1, "key description = %s", description); + spnego_key = request_key(&cifs_spnego_key_type, description, ""); + +#ifdef CONFIG_CIFS_DEBUG2 + if (cifsFYI && !IS_ERR(spnego_key)) { + struct cifs_spnego_msg *msg = spnego_key->payload.data; + cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U, + msg->secblob_len + msg->sesskey_len)); + } +#endif /* CONFIG_CIFS_DEBUG2 */ + +out: + kfree(description); + return spnego_key; +} diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h new file mode 100644 index 00000000..31bef9ee --- /dev/null +++ b/fs/cifs/cifs_spnego.h @@ -0,0 +1,47 @@ +/* + * fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS + * + * Copyright (c) 2007 Red Hat, Inc. + * Author(s): Jeff Layton (jlayton@redhat.com) + * Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFS_SPNEGO_H +#define _CIFS_SPNEGO_H + +#define CIFS_SPNEGO_UPCALL_VERSION 2 + +/* + * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. + * The flags field is for future use. The request-key callout should set + * sesskey_len and secblob_len, and then concatenate the SessKey+SecBlob + * and stuff it in the data field. + */ +struct cifs_spnego_msg { + uint32_t version; + uint32_t flags; + uint32_t sesskey_len; + uint32_t secblob_len; + uint8_t data[1]; +}; + +#ifdef __KERNEL__ +extern struct key_type cifs_spnego_key_type; +extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo); +#endif /* KERNEL */ + +#endif /* _CIFS_SPNEGO_H */ diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c new file mode 100644 index 00000000..1b2e180b --- /dev/null +++ b/fs/cifs/cifs_unicode.c @@ -0,0 +1,332 @@ +/* + * fs/cifs/cifs_unicode.c + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * Modified by Steve French (sfrench@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include "cifs_unicode.h" +#include "cifs_uniupr.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" + +/* + * cifs_ucs2_bytes - how long will a string be after conversion? + * @ucs - pointer to input string + * @maxbytes - don't go past this many bytes of input string + * @codepage - destination codepage + * + * Walk a ucs2le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + */ +int +cifs_ucs2_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; + + for (i = 0; i < maxwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + + charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); + if (charlen > 0) + outlen += charlen; + else + outlen++; + } + + return outlen; +} + +/* + * cifs_mapchar - convert a host-endian char to proper char in codepage + * @target - where converted character should be copied + * @src_char - 2 byte host-endian source character + * @cp - codepage to which character should be converted + * @mapchar - should character be mapped according to mapchars mount option? + * + * This function handles the conversion of a single character. It is the + * responsibility of the caller to ensure that the target buffer is large + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). + */ +static int +cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, + bool mapchar) +{ + int len = 1; + + if (!mapchar) + goto cp_convert; + + /* + * BB: Cannot handle remapping UNI_SLASH until all the calls to + * build_path_from_dentry are modified, as they use slash as + * separator. + */ + switch (src_char) { + case UNI_COLON: + *target = ':'; + break; + case UNI_ASTERISK: + *target = '*'; + break; + case UNI_QUESTION: + *target = '?'; + break; + case UNI_PIPE: + *target = '|'; + break; + case UNI_GRTRTHAN: + *target = '>'; + break; + case UNI_LESSTHAN: + *target = '<'; + break; + default: + goto cp_convert; + } + +out: + return len; + +cp_convert: + len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); + if (len <= 0) { + *target = '?'; + len = 1; + } + goto out; +} + +/* + * cifs_from_ucs2 - convert utf16le string to local charset + * @to - destination buffer + * @from - source buffer + * @tolen - destination buffer size (in bytes) + * @fromlen - source buffer size (in bytes) + * @codepage - codepage to which characters should be converted + * @mapchar - should characters be remapped according to the mapchars option? + * + * Convert a little-endian ucs2le string (as sent by the server) to a string + * in the provided codepage. The tolen and fromlen parameters are to ensure + * that the code doesn't walk off of the end of the buffer (which is always + * a danger if the alignment of the source buffer is off). The destination + * string is always properly null terminated and fits in the destination + * buffer. Returns the length of the destination string in bytes (including + * null terminator). 
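+ *
+ * For example, assuming an ASCII-compatible destination codepage and a
+ * sufficiently large destination buffer, the 4-byte UTF-16LE input
+ * {0x61, 0x00, 0x62, 0x00} ("ab") converts to the 3-byte output "ab\0"
+ * and the return value is 3.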
+ * + * Note that some windows versions actually send multiword UTF-16 characters + * instead of straight UCS-2. The linux nls routines however aren't able to + * deal with those characters properly. In the event that we get some of + * those characters, they won't be translated properly. + */ +int +cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *codepage, bool mapchar) +{ + int i, charlen, safelen; + int outlen = 0; + int nullsize = nls_nullsize(codepage); + int fromwords = fromlen / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; + + /* + * because the chars can be of varying widths, we need to take care + * not to overflow the destination buffer when we get close to the + * end of it. Until we get to this offset, we don't need to check + * for overflow however. + */ + safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); + + for (i = 0; i < fromwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + + /* + * check to see if converting this character might make the + * conversion bleed into the null terminator + */ + if (outlen >= safelen) { + charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar); + if ((outlen + charlen) > (tolen - nullsize)) + break; + } + + /* put converted char into 'to' buffer */ + charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); + outlen += charlen; + } + + /* properly null-terminate string */ + for (i = 0; i < nullsize; i++) + to[outlen++] = 0; + + return outlen; +} + +/* + * NAME: cifs_strtoUCS() + * + * FUNCTION: Convert character string to unicode string + * + */ +int +cifs_strtoUCS(__le16 *to, const char *from, int len, + const struct nls_table *codepage) +{ + int charlen; + int i; + wchar_t wchar_to; /* needed to quiet sparse */ + + for (i = 0; len && *from; i++, from += charlen, len -= charlen) { + charlen = codepage->char2uni(from, len, &wchar_to); + if (charlen < 1) { + cERROR(1, "strtoUCS: char2uni of 0x%x returned %d", + *from, charlen); + /* A question mark */ + wchar_to = 0x003f; + charlen = 1; + } + put_unaligned_le16(wchar_to, &to[i]); + } + + put_unaligned_le16(0, &to[i]); + return i; +} + +/* + * cifs_strndup_from_ucs - copy a string from wire format to the local codepage + * @src - source string + * @maxlen - don't walk past this many bytes in the source string + * @is_unicode - is this a unicode string? + * @codepage - destination codepage + * + * Take a string given by the server, convert it to the local codepage and + * put it in a new buffer. Returns a pointer to the new string or NULL on + * error. + */ +char * +cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, + const struct nls_table *codepage) +{ + int len; + char *dst; + + if (is_unicode) { + len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage); + len += nls_nullsize(codepage); + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return NULL; + cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage, + false); + } else { + len = strnlen(src, maxlen); + len++; + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return NULL; + strlcpy(dst, src, len); + } + + return dst; +} + +/* + * Convert 16 bit Unicode pathname to wire format from string in current code + * page. Conversion may involve remapping up the six characters that are + * only legal in POSIX-like OS (if they are present in the string). 
Path + * names are little endian 16 bit Unicode on the wire + */ +int +cifsConvertToUCS(__le16 *target, const char *source, int srclen, + const struct nls_table *cp, int mapChars) +{ + int i, j, charlen; + char src_char; + __le16 dst_char; + wchar_t tmp; + + if (!mapChars) + return cifs_strtoUCS(target, source, PATH_MAX, cp); + + for (i = 0, j = 0; i < srclen; j++) { + src_char = source[i]; + charlen = 1; + switch (src_char) { + case 0: + put_unaligned(0, &target[j]); + goto ctoUCS_out; + case ':': + dst_char = cpu_to_le16(UNI_COLON); + break; + case '*': + dst_char = cpu_to_le16(UNI_ASTERISK); + break; + case '?': + dst_char = cpu_to_le16(UNI_QUESTION); + break; + case '<': + dst_char = cpu_to_le16(UNI_LESSTHAN); + break; + case '>': + dst_char = cpu_to_le16(UNI_GRTRTHAN); + break; + case '|': + dst_char = cpu_to_le16(UNI_PIPE); + break; + /* + * FIXME: We can not handle remapping backslash (UNI_SLASH) + * until all the calls to build_path_from_dentry are modified, + * as they use backslash as separator. + */ + default: + charlen = cp->char2uni(source + i, srclen - i, &tmp); + dst_char = cpu_to_le16(tmp); + + /* + * if no match, use question mark, which at least in + * some cases serves as wild card + */ + if (charlen < 1) { + dst_char = cpu_to_le16(0x003f); + charlen = 1; + } + } + /* + * character may take more than one byte in the source string, + * but will take exactly two bytes in the target string + */ + i += charlen; + put_unaligned(dst_char, &target[j]); + } + +ctoUCS_out: + return i; +} + diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h new file mode 100644 index 00000000..6d02fd56 --- /dev/null +++ b/fs/cifs/cifs_unicode.h @@ -0,0 +1,383 @@ +/* + * cifs_unicode: Unicode kernel case support + * + * Function: + * Convert a unicode character to upper or lower case using + * compressed tables. + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Notes: + * These APIs are based on the C library functions. The semantics + * should match the C functions but with expanded size operands. + * + * The upper/lower functions are based on a table created by mkupr. + * This is a compressed table of upper and lower case conversion. + * + */ +#ifndef _CIFS_UNICODE_H +#define _CIFS_UNICODE_H + +#include +#include +#include + +#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ + +/* + * Windows maps these to the user defined 16 bit Unicode range since they are + * reserved symbols (along with \ and /), otherwise illegal to store + * in filenames in NTFS + */ +#define UNI_ASTERISK (__u16) ('*' + 0xF000) +#define UNI_QUESTION (__u16) ('?' 
+ 0xF000) +#define UNI_COLON (__u16) (':' + 0xF000) +#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) +#define UNI_LESSTHAN (__u16) ('<' + 0xF000) +#define UNI_PIPE (__u16) ('|' + 0xF000) +#define UNI_SLASH (__u16) ('\\' + 0xF000) + +/* Just define what we want from uniupr.h. We don't want to define the tables + * in each source file. + */ +#ifndef UNICASERANGE_DEFINED +struct UniCaseRange { + wchar_t start; + wchar_t end; + signed char *table; +}; +#endif /* UNICASERANGE_DEFINED */ + +#ifndef UNIUPR_NOUPPER +extern signed char CifsUniUpperTable[512]; +extern const struct UniCaseRange CifsUniUpperRange[]; +#endif /* UNIUPR_NOUPPER */ + +#ifndef UNIUPR_NOLOWER +extern signed char CifsUniLowerTable[512]; +extern const struct UniCaseRange CifsUniLowerRange[]; +#endif /* UNIUPR_NOLOWER */ + +#ifdef __KERNEL__ +int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *codepage, bool mapchar); +int cifs_ucs2_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage); +int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *); +char *cifs_strndup_from_ucs(const char *src, const int maxlen, + const bool is_unicode, + const struct nls_table *codepage); +extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, + const struct nls_table *cp, int mapChars); + +#endif + +/* + * UniStrcat: Concatenate the second string to the first + * + * Returns: + * Address of the first string + */ +static inline wchar_t * +UniStrcat(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */ + + while (*ucs1++) ; /* To end of first string */ + ucs1--; /* Return to the null */ + while ((*ucs1++ = *ucs2++)) ; /* copy string 2 over */ + return anchor; +} + +/* + * UniStrchr: Find a character in a string + * + * Returns: + * Address of first occurrence of character in string + * or NULL if the character is not in the string + */ +static inline wchar_t * +UniStrchr(const wchar_t *ucs, wchar_t uc) +{ + while ((*ucs != uc) && *ucs) + ucs++; + + if (*ucs == uc) + return (wchar_t *) ucs; + return NULL; +} + +/* + * UniStrcmp: Compare two strings + * + * Returns: + * < 0: First string is less than second + * = 0: Strings are equal + * > 0: First string is greater than second + */ +static inline int +UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) +{ + while ((*ucs1 == *ucs2) && *ucs1) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) *ucs2; +} + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t * +UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)) ; + return anchor; +} + +/* + * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) + */ +static inline size_t +UniStrlen(const wchar_t *ucs1) +{ + int i = 0; + + while (*ucs1++) + i++; + return i; +} + +/* + * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a + * string (length limited) + */ +static inline size_t +UniStrnlen(const wchar_t *ucs1, int maxlen) +{ + int i = 0; + + while (*ucs1++) { + i++; + if (i >= maxlen) + break; + } + return i; +} + +/* + * UniStrncat: Concatenate length limited string + */ +static inline wchar_t * +UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; /* save pointer to string 1 */ + + while (*ucs1++) ; + ucs1--; /* point to null terminator of s1 */ + while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ + ucs1++; + ucs2++; + } + 
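+	/*
+	 * As with strncat(), at most n characters of ucs2 are appended and
+	 * the result is always terminated below; e.g. UniStrncat(dst, src, 2)
+	 * appends at most two wide characters of src before the terminator
+	 * is written.
+	 */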
*ucs1 = 0; /* Null terminate the result */ + return (anchor); +} + +/* + * UniStrncmp: Compare length limited string + */ +static inline int +UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == *ucs2) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) *ucs2; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int +UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline wchar_t * +UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_le: Copy length limited string with pad to little-endian + */ +static inline wchar_t * +UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrstr: Find a string in a string + * + * Returns: + * Address of first match found + * NULL if no matching string is found + */ +static inline wchar_t * +UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) +{ + const wchar_t *anchor1 = ucs1; + const wchar_t *anchor2 = ucs2; + + while (*ucs1) { + if (*ucs1 == *ucs2) { + /* Partial match found */ + ucs1++; + ucs2++; + } else { + if (!*ucs2) /* Match found */ + return (wchar_t *) anchor1; + ucs1 = ++anchor1; /* No match */ + ucs2 = anchor2; + } + } + + if (!*ucs2) /* Both end together */ + return (wchar_t *) anchor1; /* Match found */ + return NULL; /* No match */ +} + +#ifndef UNIUPR_NOUPPER +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t +UniToupper(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(CifsUniUpperTable)) { + /* Latin characters */ + return uc + CifsUniUpperTable[uc]; /* Use base tables */ + } else { + rp = CifsUniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + +/* + * UniStrupr: Upper case a unicode string + */ +static inline wchar_t * +UniStrupr(register wchar_t *upin) +{ + register wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniToupper(*up); + up++; + } + return upin; /* Return input pointer */ +} +#endif /* UNIUPR_NOUPPER */ + +#ifndef UNIUPR_NOLOWER +/* + * UniTolower: Convert a unicode character to lower case + */ +static inline wchar_t +UniTolower(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(CifsUniLowerTable)) { + /* Latin characters */ + return uc + CifsUniLowerTable[uc]; /* Use base tables */ + } else { + rp = CifsUniLowerRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - 
rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + +/* + * UniStrlwr: Lower case a unicode string + */ +static inline wchar_t * +UniStrlwr(register wchar_t *upin) +{ + register wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniTolower(*up); + up++; + } + return upin; /* Return input pointer */ +} + +#endif + +#endif /* _CIFS_UNICODE_H */ diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h new file mode 100644 index 00000000..0ac7c5a8 --- /dev/null +++ b/fs/cifs/cifs_uniupr.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) International Business Machines Corp., 2000,2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * uniupr.h - Unicode compressed case ranges + * +*/ + +#ifndef UNIUPR_NOUPPER +/* + * Latin upper case + */ +signed char CifsUniUpperTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 060-06f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 0e0-0ef */ + -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, /* 0f0-0ff */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ + 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ + 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ + 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ + -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 
0, -1, 0, /* 1c0-1cf */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -79, 0, -1, /* 1d0-1df */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ + 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ +}; + +/* Upper case range - Greek */ +static signed char UniCaseRangeU03a0[47] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, /* 3a0-3af */ + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 3b0-3bf */ + -32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64, + -63, -63, +}; + +/* Upper case range - Cyrillic */ +static signed char UniCaseRangeU0430[48] = { + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 430-43f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 440-44f */ + 0, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, 0, -80, -80, /* 450-45f */ +}; + +/* Upper case range - Extended cyrillic */ +static signed char UniCaseRangeU0490[61] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ + 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, +}; + +/* Upper case range - Extended latin and greek */ +static signed char UniCaseRangeU1e00[509] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ + 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, -1, 0, -1, /* 1e90-1e9f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ + 74, 74, 86, 86, 86, 86, 100, 100, 0, 0, 112, 112, 126, 126, 0, 0, /* 1f70-1f7f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ + 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ + 8, 8, 0, 0, 0, 7, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Upper case range - Wide latin */ +static signed char UniCaseRangeUff40[27] = { + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* ff40-ff4f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, +}; + +/* + * Upper Case Range + */ +const struct UniCaseRange CifsUniUpperRange[] = { + {0x03a0, 0x03ce, UniCaseRangeU03a0}, + {0x0430, 0x045f, UniCaseRangeU0430}, + {0x0490, 0x04cc, UniCaseRangeU0490}, + {0x1e00, 0x1ffc, UniCaseRangeU1e00}, + {0xff40, 0xff5a, UniCaseRangeUff40}, + {0} +}; +#endif + +#ifndef UNIUPR_NOLOWER +/* + * Latin lower case + */ +signed char CifsUniLowerTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 040-04f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, /* 050-05f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 0c0-0cf */ + 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 0, /* 0d0-0df */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */ + 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */ + 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, 0, /* 170-17f */ + 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, 0, /* 180-18f */ + 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */ + 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */ + 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */ +}; + +/* Lower case range - Greek */ +static signed char UniCaseRangeL0380[44] = { + 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */ + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 390-39f */ + 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, +}; + +/* Lower case range - Cyrillic */ +static signed char UniCaseRangeL0400[48] = { + 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 0, 80, 80, /* 400-40f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 410-41f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 420-42f */ +}; + +/* Lower case range - Extended cyrillic */ +static signed char UniCaseRangeL0490[60] = { + 1, 0, 1, 0, 1, 0, 1, 0, 
1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */ + 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, +}; + +/* Lower case range - Extended latin and greek */ +static signed char UniCaseRangeL1e00[504] = { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */ + 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, 0, 0, /* 1fc0-1fcf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Lower case range - Wide latin */ +static signed char UniCaseRangeLff20[27] = { + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* ff20-ff2f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, +}; + +/* + * Lower Case Range + */ +const struct UniCaseRange CifsUniLowerRange[] = { + {0x0380, 0x03ab, UniCaseRangeL0380}, + {0x0400, 0x042f, UniCaseRangeL0400}, + {0x0490, 0x04cb, UniCaseRangeL0490}, + {0x1e00, 0x1ff7, UniCaseRangeL1e00}, + {0xff20, 0xff3a, UniCaseRangeLff20}, + {0} +}; +#endif diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c new file mode 100644 index 00000000..21de1d6d --- /dev/null +++ b/fs/cifs/cifsacl.c @@ -0,0 +1,1142 @@ +/* + * fs/cifs/cifsacl.c + * + * Copyright (C) International Business Machines Corp., 2007,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Contains the routines for mapping CIFS/NTFS ACLs + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; 
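The range arrays above are consulted as sparse delta tables: each UniCaseRange entry names a span of code points and points at a signed-char table whose value is added to the code point to switch case, with 0 meaning "leave unchanged" (the dense 512-entry CifsUniLowerTable covers U+0000-U+01FF the same way, indexed directly). A minimal userspace sketch of that lookup, with struct range as a simplified stand-in for the kernel's struct UniCaseRange, could look like this:

#include <stddef.h>

struct range {
	unsigned short low;          /* first code point covered     */
	unsigned short high;         /* last code point covered      */
	const signed char *delta;    /* signed offset per code point */
};

/* Return the case-changed code point; a 0 delta (or no matching range)
 * leaves the input untouched. */
static unsigned short apply_case_range(unsigned short uc,
				       const struct range *tbl)
{
	for (; tbl->delta != NULL; tbl++)
		if (uc >= tbl->low && uc <= tbl->high)
			return (unsigned short)(uc + tbl->delta[uc - tbl->low]);
	return uc;
}

For example, Cyrillic U+0430 falls in the {0x0430, 0x045f, UniCaseRangeU0430} row of CifsUniUpperRange and maps to U+0410 via a -32 delta.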
either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsacl.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +/* security id for everyone/world system group */ +static const struct cifs_sid sid_everyone = { + 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; +/* security id for Authenticated Users system group */ +static const struct cifs_sid sid_authusers = { + 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} }; +/* group users */ +static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; + +const struct cred *root_cred; + +static void +shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem, + int *nr_del) +{ + struct rb_node *node; + struct rb_node *tmp; + struct cifs_sid_id *psidid; + + node = rb_first(root); + while (node) { + tmp = node; + node = rb_next(tmp); + psidid = rb_entry(tmp, struct cifs_sid_id, rbnode); + if (nr_to_scan == 0 || *nr_del == nr_to_scan) + ++(*nr_rem); + else { + if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE) + && psidid->refcount == 0) { + rb_erase(tmp, root); + ++(*nr_del); + } else + ++(*nr_rem); + } + } +} + +/* + * Run idmap cache shrinker. + */ +static int +cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + int nr_del = 0; + int nr_rem = 0; + struct rb_root *root; + + root = &uidtree; + spin_lock(&siduidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&siduidlock); + + root = &gidtree; + spin_lock(&sidgidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&sidgidlock); + + return nr_rem; +} + +static struct shrinker cifs_shrinker = { + .shrink = cifs_idmap_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +static int +cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) +{ + char *payload; + + payload = kmalloc(datalen, GFP_KERNEL); + if (!payload) + return -ENOMEM; + + memcpy(payload, data, datalen); + key->payload.data = payload; + return 0; +} + +static inline void +cifs_idmap_key_destroy(struct key *key) +{ + kfree(key->payload.data); +} + +struct key_type cifs_idmap_key_type = { + .name = "cifs.idmap", + .instantiate = cifs_idmap_key_instantiate, + .destroy = cifs_idmap_key_destroy, + .describe = user_describe, + .match = user_match, +}; + +static void +sid_to_str(struct cifs_sid *sidptr, char *sidstr) +{ + int i; + unsigned long saval; + char *strptr; + + strptr = sidstr; + + sprintf(strptr, "%s", "S"); + strptr = sidstr + strlen(sidstr); + + sprintf(strptr, "-%d", sidptr->revision); + strptr = sidstr + strlen(sidstr); + + for (i = 0; i < 6; ++i) { + if (sidptr->authority[i]) { + sprintf(strptr, "-%d", sidptr->authority[i]); + strptr = sidstr + strlen(sidstr); + } + } + + for (i = 0; i < sidptr->num_subauth; ++i) { + saval = le32_to_cpu(sidptr->sub_auth[i]); + sprintf(strptr, "-%ld", saval); + strptr = sidstr + strlen(sidstr); + } +} + +static void +id_rb_insert(struct 
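sid_to_str() above renders a binary SID as text: "S-<revision>" followed by each non-zero authority byte and then the sub-authorities in decimal, with SIDLEN (150) bounding the worst case. A bounds-checked userspace sketch of the same formatting, with struct sid as a simplified stand-in for struct cifs_sid (sub-authorities already in host byte order):

#include <stdio.h>
#include <stdint.h>

struct sid {
	uint8_t  revision;
	uint8_t  num_subauth;
	uint8_t  authority[6];
	uint32_t sub_auth[5];
};

static void sid_to_string(const struct sid *s, char *buf, size_t len)
{
	size_t off;
	int i;

	off = (size_t)snprintf(buf, len, "S-%u", s->revision);
	for (i = 0; i < 6; i++)              /* only non-zero authority bytes */
		if (s->authority[i] && off < len)
			off += (size_t)snprintf(buf + off, len - off, "-%u",
						(unsigned int)s->authority[i]);
	for (i = 0; i < s->num_subauth && off < len; i++)
		off += (size_t)snprintf(buf + off, len - off, "-%u",
					(unsigned int)s->sub_auth[i]);
}

For the sid_authusers constant defined above (revision 1, authority ending in 5, one sub-authority of 11) this yields "S-1-5-11"; prefixed with "os:" or "gs:" by id_rb_insert(), that string becomes the key description handed to request_key() for the cifs.idmap upcall in sid_to_id() below.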
rb_root *root, struct cifs_sid *sidptr, + struct cifs_sid_id **psidid, char *typestr) +{ + int rc; + char *strptr; + struct rb_node *node = root->rb_node; + struct rb_node *parent = NULL; + struct rb_node **linkto = &(root->rb_node); + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + parent = node; + rc = compare_sids(sidptr, &((lsidid)->sid)); + if (rc > 0) { + linkto = &(node->rb_left); + node = node->rb_left; + } else if (rc < 0) { + linkto = &(node->rb_right); + node = node->rb_right; + } + } + + memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid)); + (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); + (*psidid)->refcount = 0; + + sprintf((*psidid)->sidstr, "%s", typestr); + strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); + sid_to_str(&(*psidid)->sid, strptr); + + clear_bit(SID_ID_PENDING, &(*psidid)->state); + clear_bit(SID_ID_MAPPED, &(*psidid)->state); + + rb_link_node(&(*psidid)->rbnode, parent, linkto); + rb_insert_color(&(*psidid)->rbnode, root); +} + +static struct cifs_sid_id * +id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) +{ + int rc; + struct rb_node *node = root->rb_node; + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + rc = compare_sids(sidptr, &((lsidid)->sid)); + if (rc > 0) { + node = node->rb_left; + } else if (rc < 0) { + node = node->rb_right; + } else /* node found */ + return lsidid; + } + + return NULL; +} + +static int +sidid_pending_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +static int +sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, + struct cifs_fattr *fattr, uint sidtype) +{ + int rc; + unsigned long cid; + struct key *idkey; + const struct cred *saved_cred; + struct cifs_sid_id *psidid, *npsidid; + struct rb_root *cidtree; + spinlock_t *cidlock; + + if (sidtype == SIDOWNER) { + cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */ + cidlock = &siduidlock; + cidtree = &uidtree; + } else if (sidtype == SIDGROUP) { + cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */ + cidlock = &sidgidlock; + cidtree = &gidtree; + } else + return -ENOENT; + + spin_lock(cidlock); + psidid = id_rb_search(cidtree, psid); + + if (!psidid) { /* node does not exist, allocate one & attempt adding */ + spin_unlock(cidlock); + npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); + if (!npsidid) + return -ENOMEM; + + npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + if (!npsidid->sidstr) { + kfree(npsidid); + return -ENOMEM; + } + + spin_lock(cidlock); + psidid = id_rb_search(cidtree, psid); + if (psidid) { /* node happened to get inserted meanwhile */ + ++psidid->refcount; + spin_unlock(cidlock); + kfree(npsidid->sidstr); + kfree(npsidid); + } else { + psidid = npsidid; + id_rb_insert(cidtree, psid, &psidid, + sidtype == SIDOWNER ? "os:" : "gs:"); + ++psidid->refcount; + spin_unlock(cidlock); + } + } else { + ++psidid->refcount; + spin_unlock(cidlock); + } + + /* + * If we are here, it is safe to access psidid and its fields + * since a reference was taken earlier while holding the spinlock. + * A reference on the node is put without holding the spinlock + * and it is OK to do so in this case, shrinker will not erase + * this node until all references are put and we do not access + * any fields of the node after a reference is put . 
+ */ + if (test_bit(SID_ID_MAPPED, &psidid->state)) { + cid = psidid->id; + psidid->time = jiffies; /* update ts for accessing */ + goto sid_to_id_out; + } + + if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) + goto sid_to_id_out; + + if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { + saved_cred = override_creds(root_cred); + idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); + if (IS_ERR(idkey)) + cFYI(1, "%s: Can't map SID to an id", __func__); + else { + cid = *(unsigned long *)idkey->payload.value; + psidid->id = cid; + set_bit(SID_ID_MAPPED, &psidid->state); + key_put(idkey); + kfree(psidid->sidstr); + } + revert_creds(saved_cred); + psidid->time = jiffies; /* update ts for accessing */ + clear_bit(SID_ID_PENDING, &psidid->state); + wake_up_bit(&psidid->state, SID_ID_PENDING); + } else { + rc = wait_on_bit(&psidid->state, SID_ID_PENDING, + sidid_pending_wait, TASK_INTERRUPTIBLE); + if (rc) { + cFYI(1, "%s: sidid_pending_wait interrupted %d", + __func__, rc); + --psidid->refcount; /* decremented without spinlock */ + return rc; + } + if (test_bit(SID_ID_MAPPED, &psidid->state)) + cid = psidid->id; + } + +sid_to_id_out: + --psidid->refcount; /* decremented without spinlock */ + if (sidtype == SIDOWNER) + fattr->cf_uid = cid; + else + fattr->cf_gid = cid; + + return 0; +} + +int +init_cifs_idmap(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + cFYI(1, "Registering the %s key type\n", cifs_idmap_key_type.name); + + /* create an override credential set with a special thread keyring in + * which requests are cached + * + * this is used to prevent malicious redirections from being installed + * with add_key(). + */ + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&cifs_idmap_key_type); + if (ret < 0) + goto failed_put_key; + + /* instruct request_key() to use this special keyring as a cache for + * the results it looks up */ + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + root_cred = cred; + + spin_lock_init(&siduidlock); + uidtree = RB_ROOT; + spin_lock_init(&sidgidlock); + gidtree = RB_ROOT; + + register_shrinker(&cifs_shrinker); + + cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void +exit_cifs_idmap(void) +{ + key_revoke(root_cred->thread_keyring); + unregister_key_type(&cifs_idmap_key_type); + put_cred(root_cred); + unregister_shrinker(&cifs_shrinker); + cFYI(1, "Unregistered %s key type\n", cifs_idmap_key_type.name); +} + +void +cifs_destroy_idmaptrees(void) +{ + struct rb_root *root; + struct rb_node *node; + + root = &uidtree; + spin_lock(&siduidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&siduidlock); + + root = &gidtree; + spin_lock(&sidgidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&sidgidlock); +} + +/* if the two SIDs (roughly equivalent to a UUID for a user or group) are + the same returns 1, if they do not match returns 0 */ +int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) +{ + int i; + int 
num_subauth, num_sat, num_saw; + + if ((!ctsid) || (!cwsid)) + return 1; + + /* compare the revision */ + if (ctsid->revision != cwsid->revision) { + if (ctsid->revision > cwsid->revision) + return 1; + else + return -1; + } + + /* compare all of the six auth values */ + for (i = 0; i < 6; ++i) { + if (ctsid->authority[i] != cwsid->authority[i]) { + if (ctsid->authority[i] > cwsid->authority[i]) + return 1; + else + return -1; + } + } + + /* compare all of the subauth values if any */ + num_sat = ctsid->num_subauth; + num_saw = cwsid->num_subauth; + num_subauth = num_sat < num_saw ? num_sat : num_saw; + if (num_subauth) { + for (i = 0; i < num_subauth; ++i) { + if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { + if (le32_to_cpu(ctsid->sub_auth[i]) > + le32_to_cpu(cwsid->sub_auth[i])) + return 1; + else + return -1; + } + } + } + + return 0; /* sids compare/match */ +} + + +/* copy ntsd, owner sid, and group sid from a security descriptor to another */ +static void copy_sec_desc(const struct cifs_ntsd *pntsd, + struct cifs_ntsd *pnntsd, __u32 sidsoffset) +{ + int i; + + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; + + /* copy security descriptor control portion */ + pnntsd->revision = pntsd->revision; + pnntsd->type = pntsd->type; + pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd)); + pnntsd->sacloffset = 0; + pnntsd->osidoffset = cpu_to_le32(sidsoffset); + pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid)); + + /* copy owner sid */ + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); + + nowner_sid_ptr->revision = owner_sid_ptr->revision; + nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth; + for (i = 0; i < 6; i++) + nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i]; + for (i = 0; i < 5; i++) + nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i]; + + /* copy group sid */ + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + + sizeof(struct cifs_sid)); + + ngroup_sid_ptr->revision = group_sid_ptr->revision; + ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth; + for (i = 0; i < 6; i++) + ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; + for (i = 0; i < 5; i++) + ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i]; + + return; +} + + +/* + change posix mode to reflect permissions + pmode is the existing mode (we only want to overwrite part of this + bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007 +*/ +static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, + umode_t *pbits_to_set) +{ + __u32 flags = le32_to_cpu(ace_flags); + /* the order of ACEs is important. 
The canonical order is to begin with + DENY entries followed by ALLOW, otherwise an allow entry could be + encountered first, making the subsequent deny entry like "dead code" + which would be superflous since Windows stops when a match is made + for the operation you are trying to perform for your user */ + + /* For deny ACEs we change the mask so that subsequent allow access + control entries do not turn on the bits we are denying */ + if (type == ACCESS_DENIED) { + if (flags & GENERIC_ALL) + *pbits_to_set &= ~S_IRWXUGO; + + if ((flags & GENERIC_WRITE) || + ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS)) + *pbits_to_set &= ~S_IWUGO; + if ((flags & GENERIC_READ) || + ((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS)) + *pbits_to_set &= ~S_IRUGO; + if ((flags & GENERIC_EXECUTE) || + ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) + *pbits_to_set &= ~S_IXUGO; + return; + } else if (type != ACCESS_ALLOWED) { + cERROR(1, "unknown access control type %d", type); + return; + } + /* else ACCESS_ALLOWED type */ + + if (flags & GENERIC_ALL) { + *pmode |= (S_IRWXUGO & (*pbits_to_set)); + cFYI(DBG2, "all perms"); + return; + } + if ((flags & GENERIC_WRITE) || + ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS)) + *pmode |= (S_IWUGO & (*pbits_to_set)); + if ((flags & GENERIC_READ) || + ((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS)) + *pmode |= (S_IRUGO & (*pbits_to_set)); + if ((flags & GENERIC_EXECUTE) || + ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) + *pmode |= (S_IXUGO & (*pbits_to_set)); + + cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode); + return; +} + +/* + Generate access flags to reflect permissions mode is the existing mode. + This function is called for every ACE in the DACL whose SID matches + with either owner or group or everyone. 
+*/ + +static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, + __u32 *pace_flags) +{ + /* reset access mask */ + *pace_flags = 0x0; + + /* bits to use are either S_IRWXU or S_IRWXG or S_IRWXO */ + mode &= bits_to_use; + + /* check for R/W/X UGO since we do not know whose flags + is this but we have cleared all the bits sans RWX for + either user or group or other as per bits_to_use */ + if (mode & S_IRUGO) + *pace_flags |= SET_FILE_READ_RIGHTS; + if (mode & S_IWUGO) + *pace_flags |= SET_FILE_WRITE_RIGHTS; + if (mode & S_IXUGO) + *pace_flags |= SET_FILE_EXEC_RIGHTS; + + cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags); + return; +} + +static __u16 fill_ace_for_sid(struct cifs_ace *pntace, + const struct cifs_sid *psid, __u64 nmode, umode_t bits) +{ + int i; + __u16 size = 0; + __u32 access_req = 0; + + pntace->type = ACCESS_ALLOWED; + pntace->flags = 0x0; + mode_to_access_flags(nmode, bits, &access_req); + if (!access_req) + access_req = SET_MINIMUM_RIGHTS; + pntace->access_req = cpu_to_le32(access_req); + + pntace->sid.revision = psid->revision; + pntace->sid.num_subauth = psid->num_subauth; + for (i = 0; i < 6; i++) + pntace->sid.authority[i] = psid->authority[i]; + for (i = 0; i < psid->num_subauth; i++) + pntace->sid.sub_auth[i] = psid->sub_auth[i]; + + size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4); + pntace->size = cpu_to_le16(size); + + return size; +} + + +#ifdef CONFIG_CIFS_DEBUG2 +static void dump_ace(struct cifs_ace *pace, char *end_of_acl) +{ + int num_subauth; + + /* validate that we do not go past end of acl */ + + if (le16_to_cpu(pace->size) < 16) { + cERROR(1, "ACE too small %d", le16_to_cpu(pace->size)); + return; + } + + if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { + cERROR(1, "ACL too small to parse ACE"); + return; + } + + num_subauth = pace->sid.num_subauth; + if (num_subauth) { + int i; + cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d", + pace->sid.revision, pace->sid.num_subauth, pace->type, + pace->flags, le16_to_cpu(pace->size)); + for (i = 0; i < num_subauth; ++i) { + cFYI(1, "ACE sub_auth[%d]: 0x%x", i, + le32_to_cpu(pace->sid.sub_auth[i])); + } + + /* BB add length check to make sure that we do not have huge + num auths and therefore go off the end */ + } + + return; +} +#endif + + +static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, + struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, + struct cifs_fattr *fattr) +{ + int i; + int num_aces = 0; + int acl_size; + char *acl_base; + struct cifs_ace **ppace; + + /* BB need to add parm so we can store the SID BB */ + + if (!pdacl) { + /* no DACL in the security descriptor, set + all the permissions for user/group/other */ + fattr->cf_mode |= S_IRWXUGO; + return; + } + + /* validate that we do not go past end of acl */ + if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { + cERROR(1, "ACL too small to parse DACL"); + return; + } + + cFYI(DBG2, "DACL revision %d size %d num aces %d", + le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), + le32_to_cpu(pdacl->num_aces)); + + /* reset rwx permissions for user/group/other. + Also, if num_aces is 0 i.e. 
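The comment before access_flags_to_mode() explains the point of the two masks: DENY entries come first in a canonical DACL, and rather than clearing mode bits directly they shrink the bits_to_set mask so that a later ALLOW entry cannot re-grant what was denied. A self-contained userspace illustration of that evaluation order (the flag values are simplified placeholders, not the GENERIC_*/FILE_*_RIGHTS masks from cifspdu.h):

#include <stdio.h>

#define ACE_ALLOW 0
#define ACE_DENY  1
#define R 04
#define W 02
#define X 01

struct ace { int type; int rwx; };

int main(void)
{
	/* canonical order: DENY entries first, then ALLOW */
	struct ace dacl[] = {
		{ ACE_DENY,  W },         /* deny write            */
		{ ACE_ALLOW, R | W | X }, /* allow everything else */
	};
	int mode = 0, bits_to_set = R | W | X;
	size_t i;

	for (i = 0; i < sizeof(dacl) / sizeof(dacl[0]); i++) {
		if (dacl[i].type == ACE_DENY)
			bits_to_set &= ~dacl[i].rwx;  /* later allows cannot re-grant */
		else
			mode |= dacl[i].rwx & bits_to_set;
	}
	printf("mode = 0%o\n", mode);   /* prints 05: read and execute, no write */
	return 0;
}

parse_dacl() below applies the same logic once per matching SID (owner, group, everyone/authenticated users), each with its own bits_to_set mask.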
DACL has no ACEs, + user/group/other have no permissions */ + fattr->cf_mode &= ~(S_IRWXUGO); + + acl_base = (char *)pdacl; + acl_size = sizeof(struct cifs_acl); + + num_aces = le32_to_cpu(pdacl->num_aces); + if (num_aces > 0) { + umode_t user_mask = S_IRWXU; + umode_t group_mask = S_IRWXG; + umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; + + ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), + GFP_KERNEL); + if (!ppace) { + cERROR(1, "DACL memory allocation error"); + return; + } + + for (i = 0; i < num_aces; ++i) { + ppace[i] = (struct cifs_ace *) (acl_base + acl_size); +#ifdef CONFIG_CIFS_DEBUG2 + dump_ace(ppace[i], end_of_acl); +#endif + if (compare_sids(&(ppace[i]->sid), pownersid) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &user_mask); + if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &group_mask); + if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &other_mask); + if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &other_mask); + + +/* memcpy((void *)(&(cifscred->aces[i])), + (void *)ppace[i], + sizeof(struct cifs_ace)); */ + + acl_base = (char *)ppace[i]; + acl_size = le16_to_cpu(ppace[i]->size); + } + + kfree(ppace); + } + + return; +} + + +static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, + struct cifs_sid *pgrpsid, __u64 nmode) +{ + u16 size = 0; + struct cifs_acl *pnndacl; + + pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl)); + + size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size), + pownersid, nmode, S_IRWXU); + size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), + pgrpsid, nmode, S_IRWXG); + size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), + &sid_everyone, nmode, S_IRWXO); + + pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); + pndacl->num_aces = cpu_to_le32(3); + + return 0; +} + + +static int parse_sid(struct cifs_sid *psid, char *end_of_acl) +{ + /* BB need to add parm so we can store the SID BB */ + + /* validate that we do not go past end of ACL - sid must be at least 8 + bytes long (assuming no sub-auths - e.g. 
the null SID */ + if (end_of_acl < (char *)psid + 8) { + cERROR(1, "ACL too small to parse SID %p", psid); + return -EINVAL; + } + + if (psid->num_subauth) { +#ifdef CONFIG_CIFS_DEBUG2 + int i; + cFYI(1, "SID revision %d num_auth %d", + psid->revision, psid->num_subauth); + + for (i = 0; i < psid->num_subauth; i++) { + cFYI(1, "SID sub_auth[%d]: 0x%x ", i, + le32_to_cpu(psid->sub_auth[i])); + } + + /* BB add length check to make sure that we do not have huge + num auths and therefore go off the end */ + cFYI(1, "RID 0x%x", + le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); +#endif + } + + return 0; +} + + +/* Convert CIFS ACL to POSIX form */ +static int parse_sec_desc(struct cifs_sb_info *cifs_sb, + struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr) +{ + int rc = 0; + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_acl *dacl_ptr; /* no need for SACL ptr */ + char *end_of_acl = ((char *)pntsd) + acl_len; + __u32 dacloffset; + + if (pntsd == NULL) + return -EIO; + + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x " + "sacloffset 0x%x dacloffset 0x%x", + pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), + le32_to_cpu(pntsd->gsidoffset), + le32_to_cpu(pntsd->sacloffset), dacloffset); +/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ + rc = parse_sid(owner_sid_ptr, end_of_acl); + if (rc) { + cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc); + return rc; + } + rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER); + if (rc) { + cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc); + return rc; + } + + rc = parse_sid(group_sid_ptr, end_of_acl); + if (rc) { + cFYI(1, "%s: Error %d mapping Owner SID to gid", __func__, rc); + return rc; + } + rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP); + if (rc) { + cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc); + return rc; + } + + if (dacloffset) + parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, + group_sid_ptr, fattr); + else + cFYI(1, "no ACL"); /* BB grant all or default perms? 
*/ + +/* cifscred->uid = owner_sid_ptr->rid; + cifscred->gid = group_sid_ptr->rid; + memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr, + sizeof(struct cifs_sid)); + memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, + sizeof(struct cifs_sid)); */ + + return rc; +} + + +/* Convert permission bits from mode to equivalent CIFS ACL */ +static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, + struct inode *inode, __u64 nmode) +{ + int rc = 0; + __u32 dacloffset; + __u32 ndacloffset; + __u32 sidsoffset; + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ + struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ + + if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) + return -EIO; + + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = dacl_ptr->revision; + ndacl_ptr->size = 0; + ndacl_ptr->num_aces = 0; + + rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode); + + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + + /* copy security descriptor control portion and owner and group sid */ + copy_sec_desc(pntsd, pnntsd, sidsoffset); + + return rc; +} + +static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, + __u16 fid, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + int xid, rc; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + xid = GetXid(); + rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); + FreeXid(xid); + + cifs_put_tlink(tlink); + + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; +} + +static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, + const char *path, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + int oplock = 0; + int xid, rc; + __u16 fid; + struct cifs_tcon *tcon; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + tcon = tlink_tcon(tlink); + xid = GetXid(); + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, + &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (!rc) { + rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); + CIFSSMBClose(xid, tcon, fid); + } + + cifs_put_tlink(tlink); + FreeXid(xid); + + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; +} + +/* Retrieve an ACL from the server */ +struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, + struct inode *inode, const char *path, + u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + struct cifsFileInfo *open_file = NULL; + + if (inode) + open_file = find_readable_file(CIFS_I(inode), true); + if (!open_file) + return get_cifs_acl_by_path(cifs_sb, path, pacllen); + + pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); + cifsFileInfo_put(open_file); + return pntsd; +} + +static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid, + struct cifs_ntsd *pnntsd, u32 acllen) +{ + int xid, rc; + struct tcon_link *tlink = 
cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + xid = GetXid(); + rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen); + FreeXid(xid); + cifs_put_tlink(tlink); + + cFYI(DBG2, "SetCIFSACL rc = %d", rc); + return rc; +} + +static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, + struct cifs_ntsd *pnntsd, u32 acllen) +{ + int oplock = 0; + int xid, rc; + __u16 fid; + struct cifs_tcon *tcon; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + tcon = tlink_tcon(tlink); + xid = GetXid(); + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0, + &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + cERROR(1, "Unable to open file to set ACL"); + goto out; + } + + rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen); + cFYI(DBG2, "SetCIFSACL rc = %d", rc); + + CIFSSMBClose(xid, tcon, fid); +out: + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +/* Set an ACL on the server */ +int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *open_file; + int rc; + + cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); + + open_file = find_readable_file(CIFS_I(inode), true); + if (!open_file) + return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); + + rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); + cifsFileInfo_put(open_file); + return rc; +} + +/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ +int +cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + struct inode *inode, const char *path, const __u16 *pfid) +{ + struct cifs_ntsd *pntsd = NULL; + u32 acllen = 0; + int rc = 0; + + cFYI(DBG2, "converting ACL to mode for %s", path); + + if (pfid) + pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); + else + pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); + + /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { + rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr); + kfree(pntsd); + if (rc) + cERROR(1, "parse sec desc failed rc = %d", rc); + } + + return rc; +} + +/* Convert mode bits to an ACL so we can update the ACL on the server */ +int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) +{ + int rc = 0; + __u32 secdesclen = 0; + struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ + struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ + + cFYI(DBG2, "set ACL from mode for %s", path); + + /* Get the security descriptor */ + pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); + + /* Add three ACEs for owner, group, everyone getting rid of + other ACEs as chmod disables ACEs and set the security descriptor */ + + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { + /* allocate memory for the smb header, + set security descriptor request security descriptor + parameters, and secuirty descriptor itself */ + + secdesclen = secdesclen < DEFSECDESCLEN ? 
+ DEFSECDESCLEN : secdesclen; + pnntsd = kmalloc(secdesclen, GFP_KERNEL); + if (!pnntsd) { + cERROR(1, "Unable to allocate security descriptor"); + kfree(pntsd); + return -ENOMEM; + } + + rc = build_sec_desc(pntsd, pnntsd, inode, nmode); + + cFYI(DBG2, "build_sec_desc rc: %d", rc); + + if (!rc) { + /* Set the security descriptor */ + rc = set_cifs_acl(pnntsd, secdesclen, inode, path); + cFYI(DBG2, "set_cifs_acl rc: %d", rc); + } + + kfree(pnntsd); + kfree(pntsd); + } + + return rc; +} diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h new file mode 100644 index 00000000..5c902c7c --- /dev/null +++ b/fs/cifs/cifsacl.h @@ -0,0 +1,103 @@ +/* + * fs/cifs/cifsacl.h + * + * Copyright (c) International Business Machines Corp., 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFSACL_H +#define _CIFSACL_H + + +#define NUM_AUTHS 6 /* number of authority fields */ +#define NUM_SUBAUTHS 5 /* number of sub authority fields */ +#define NUM_WK_SIDS 7 /* number of well known sids */ +#define SIDNAMELENGTH 20 /* long enough for the ones we care about */ +#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */ + +#define READ_BIT 0x4 +#define WRITE_BIT 0x2 +#define EXEC_BIT 0x1 + +#define UBITSHIFT 6 +#define GBITSHIFT 3 + +#define ACCESS_ALLOWED 0 +#define ACCESS_DENIED 1 + +#define SIDOWNER 1 +#define SIDGROUP 2 +#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */ + +#define SID_ID_MAPPED 0 +#define SID_ID_PENDING 1 +#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ +#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ + +struct cifs_ntsd { + __le16 revision; /* revision level */ + __le16 type; + __le32 osidoffset; + __le32 gsidoffset; + __le32 sacloffset; + __le32 dacloffset; +} __attribute__((packed)); + +struct cifs_sid { + __u8 revision; /* revision level */ + __u8 num_subauth; + __u8 authority[6]; + __le32 sub_auth[5]; /* sub_auth[num_subauth] */ +} __attribute__((packed)); + +struct cifs_acl { + __le16 revision; /* revision level */ + __le16 size; + __le32 num_aces; +} __attribute__((packed)); + +struct cifs_ace { + __u8 type; + __u8 flags; + __le16 size; + __le32 access_req; + struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ +} __attribute__((packed)); + +struct cifs_wksid { + struct cifs_sid cifssid; + char sidname[SIDNAMELENGTH]; +} __attribute__((packed)); + +struct cifs_sid_id { + unsigned int refcount; /* increment with spinlock, decrement without */ + unsigned long id; + unsigned long time; + unsigned long state; + char *sidstr; + struct rb_node rbnode; + struct cifs_sid sid; +}; + +#ifdef __KERNEL__ +extern struct key_type cifs_idmap_key_type; +extern const struct cred *root_cred; +#endif /* KERNEL */ + +extern int 
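A quick check on DEFSECDESCLEN (192), the default allocation used by mode_to_cifs_acl() above: with the packed structures just defined, the self-relative descriptor built by build_sec_desc() is the cifs_ntsd header, a DACL holding three ACEs, then the owner and group SIDs, and 192 is exactly that layout with every SID carrying the full five sub-authorities. Illustrative arithmetic, not part of the patch:

#include <stdio.h>

int main(void)
{
	int ntsd = 2 + 2 + 4 + 4 + 4 + 4;   /* struct cifs_ntsd: 20 bytes */
	int sid  = 1 + 1 + 6 + 4 * 5;       /* struct cifs_sid:  28 bytes */
	int acl  = 2 + 2 + 4;               /* struct cifs_acl:   8 bytes */
	int ace  = 1 + 1 + 2 + 4 + sid;     /* struct cifs_ace:  36 bytes */

	/* ntsd header + DACL header + 3 ACEs + owner SID + group SID */
	printf("%d\n", ntsd + acl + 3 * ace + 2 * sid);   /* prints 192 */
	return 0;
}

In practice fill_ace_for_sid() emits only 16 + 4 * num_subauth bytes per ACE, so a descriptor granting to sid_everyone (one sub-authority) comes out smaller; DEFSECDESCLEN is the safe upper bound.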
compare_sids(const struct cifs_sid *, const struct cifs_sid *); + +#endif /* _CIFSACL_H */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c new file mode 100644 index 00000000..5a0ee7f2 --- /dev/null +++ b/fs/cifs/cifsencrypt.c @@ -0,0 +1,762 @@ +/* + * fs/cifs/cifsencrypt.c + * + * Copyright (C) International Business Machines Corp., 2005,2006 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifs_unicode.h" +#include "cifsproto.h" +#include "ntlmssp.h" +#include +#include + +/* + * Calculate and return the CIFS signature based on the mac key and SMB PDU. + * The 16 byte signature must be allocated by the caller. Note we only use the + * 1st eight bytes and that the smb header signature field on input contains + * the sequence number before this function is called. Also, this function + * should be called with the server->srv_mutex held. + */ +static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, + struct TCP_Server_Info *server, char *signature) +{ + int rc; + + if (cifs_pdu == NULL || signature == NULL || server == NULL) + return -EINVAL; + + if (!server->secmech.sdescmd5) { + cERROR(1, "%s: Can't generate signature\n", __func__); + return -1; + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Oould not init md5\n", __func__); + return rc; + } + + crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + + crypto_shash_update(&server->secmech.sdescmd5->shash, + cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); + + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + + return 0; +} + +/* must be called with server->srv_mutex held */ +int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + int rc = 0; + char smb_signature[20]; + + if ((cifs_pdu == NULL) || (server == NULL)) + return -EINVAL; + + if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) + return rc; + + cifs_pdu->Signature.Sequence.SequenceNumber = + cpu_to_le32(server->sequence_number); + cifs_pdu->Signature.Sequence.Reserved = 0; + + *pexpected_response_sequence_number = server->sequence_number++; + server->sequence_number++; + + rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); + if (rc) + memset(cifs_pdu->Signature.SecuritySignature, 0, 8); + else + memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); + + return rc; +} + +static int cifs_calc_signature2(const struct kvec *iov, int n_vec, + struct TCP_Server_Info *server, char *signature) +{ + int i; + int rc; + + if (iov == NULL || signature == NULL || server == NULL) + return 
-EINVAL; + + if (!server->secmech.sdescmd5) { + cERROR(1, "%s: Can't generate signature\n", __func__); + return -1; + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Oould not init md5\n", __func__); + return rc; + } + + crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + + for (i = 0; i < n_vec; i++) { + if (iov[i].iov_len == 0) + continue; + if (iov[i].iov_base == NULL) { + cERROR(1, "null iovec entry"); + return -EIO; + } + /* The first entry includes a length field (which does not get + signed that occupies the first 4 bytes before the header */ + if (i == 0) { + if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ + break; /* nothing to sign or corrupt header */ + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); + } else + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base, iov[i].iov_len); + } + + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + + return rc; +} + +/* must be called with server->srv_mutex held */ +int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + int rc = 0; + char smb_signature[20]; + struct smb_hdr *cifs_pdu = iov[0].iov_base; + + if ((cifs_pdu == NULL) || (server == NULL)) + return -EINVAL; + + if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) + return rc; + + cifs_pdu->Signature.Sequence.SequenceNumber = + cpu_to_le32(server->sequence_number); + cifs_pdu->Signature.Sequence.Reserved = 0; + + *pexpected_response_sequence_number = server->sequence_number++; + server->sequence_number++; + + rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); + if (rc) + memset(cifs_pdu->Signature.SecuritySignature, 0, 8); + else + memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); + + return rc; +} + +int cifs_verify_signature(struct smb_hdr *cifs_pdu, + struct TCP_Server_Info *server, + __u32 expected_sequence_number) +{ + unsigned int rc; + char server_response_sig[8]; + char what_we_think_sig_should_be[20]; + + if (cifs_pdu == NULL || server == NULL) + return -EINVAL; + + if (!server->session_estab) + return 0; + + if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) { + struct smb_com_lock_req *pSMB = + (struct smb_com_lock_req *)cifs_pdu; + if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE) + return 0; + } + + /* BB what if signatures are supposed to be on for session but + server does not send one? 
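Taken together, the helpers above implement one scheme: the sender stores its send sequence number in the header's Signature field, computes MD5 over the session key followed by the SMB PDU (everything from the Protocol field onward), and transmits only the first 8 digest bytes; cifs_verify_signature() repeats the computation with the expected sequence number and compares 8 bytes. A compact userspace sketch, where md5() is an assumed one-shot helper standing in for the crypto_shash calls rather than a real API:

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* assumed helper: one-shot MD5 over a flat buffer */
void md5(const void *data, size_t len, uint8_t digest[16]);

static void smb_sign(const uint8_t *session_key, size_t key_len,
		     uint8_t *pdu, size_t pdu_len,
		     uint8_t *sig_field,   /* the 8-byte Signature slot inside pdu */
		     uint32_t send_seq)
{
	uint8_t buf[key_len + pdu_len];    /* VLA, fine for a sketch */
	uint8_t digest[16];

	/* sequence number goes where the signature will later sit
	 * (cpu_to_le32 in the real code), reserved half zeroed */
	memcpy(sig_field, &send_seq, 4);
	memset(sig_field + 4, 0, 4);

	/* digest = MD5(session_key || PDU) */
	memcpy(buf, session_key, key_len);
	memcpy(buf + key_len, pdu, pdu_len);
	md5(buf, key_len + pdu_len, digest);

	memcpy(sig_field, digest, 8);      /* only 8 bytes travel on the wire */
}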
BB */ + + /* Do not need to verify session setups with signature "BSRSPYL " */ + if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) + cFYI(1, "dummy signature received for smb command 0x%x", + cifs_pdu->Command); + + /* save off the origiginal signature so we can modify the smb and check + its signature against what the server sent */ + memcpy(server_response_sig, cifs_pdu->Signature.SecuritySignature, 8); + + cifs_pdu->Signature.Sequence.SequenceNumber = + cpu_to_le32(expected_sequence_number); + cifs_pdu->Signature.Sequence.Reserved = 0; + + mutex_lock(&server->srv_mutex); + rc = cifs_calculate_signature(cifs_pdu, server, + what_we_think_sig_should_be); + mutex_unlock(&server->srv_mutex); + + if (rc) + return rc; + +/* cifs_dump_mem("what we think it should be: ", + what_we_think_sig_should_be, 16); */ + + if (memcmp(server_response_sig, what_we_think_sig_should_be, 8)) + return -EACCES; + else + return 0; + +} + +/* first calculate 24 bytes ntlm response and then 16 byte session key */ +int setup_ntlm_response(struct cifs_ses *ses) +{ + int rc = 0; + unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; + char temp_key[CIFS_SESS_KEY_SIZE]; + + if (!ses) + return -EINVAL; + + ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len); + return -ENOMEM; + } + ses->auth_key.len = temp_len; + + rc = SMBNTencrypt(ses->password, ses->server->cryptkey, + ses->auth_key.response + CIFS_SESS_KEY_SIZE); + if (rc) { + cFYI(1, "%s Can't generate NTLM response, error: %d", + __func__, rc); + return rc; + } + + rc = E_md4hash(ses->password, temp_key); + if (rc) { + cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + return rc; + } + + rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); + if (rc) + cFYI(1, "%s Can't generate NTLM session key, error: %d", + __func__, rc); + + return rc; +} + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, + char *lnm_session_key) +{ + int i; + int rc; + char password_with_pad[CIFS_ENCPWD_SIZE]; + + memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); + if (password) + strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); + + if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { + memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); + memcpy(lnm_session_key, password_with_pad, + CIFS_ENCPWD_SIZE); + return 0; + } + + /* calculate old style session key */ + /* calling toupper is less broken than repeatedly + calling nls_toupper would be since that will never + work for UTF8, but neither handles multibyte code pages + but the only alternative would be converting to UCS-16 (Unicode) + (using a routine something like UniStrupr) then + uppercasing and then converting back from Unicode - which + would only worth doing it if we knew it were utf8. Basically + utf8 and other multibyte codepages each need their own strupper + function since a byte at a time will ont work. */ + + for (i = 0; i < CIFS_ENCPWD_SIZE; i++) + password_with_pad[i] = toupper(password_with_pad[i]); + + rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key); + + return rc; +} +#endif /* CIFS_WEAK_PW_HASH */ + +/* Build a proper attribute value/target info pairs blob. + * Fill in netbios and dns domain name and workstation name + * and client time (total five av pairs and + one end of fields indicator. + * Allocate domain name which gets freed when session struct is deallocated. 
+ */ +static int +build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + unsigned int dlen; + unsigned int wlen; + unsigned int size = 6 * sizeof(struct ntlmssp2_name); + __le64 curtime; + char *defdmname = "WORKGROUP"; + unsigned char *blobptr; + struct ntlmssp2_name *attrptr; + + if (!ses->domainName) { + ses->domainName = kstrdup(defdmname, GFP_KERNEL); + if (!ses->domainName) + return -ENOMEM; + } + + dlen = strlen(ses->domainName); + wlen = strlen(ses->server->hostname); + + /* The length of this blob is a size which is + * six times the size of a structure which holds name/size + + * two times the unicode length of a domain name + + * two times the unicode length of a server name + + * size of a timestamp (which is 8 bytes). + */ + ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; + ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); + if (!ses->auth_key.response) { + ses->auth_key.len = 0; + cERROR(1, "Challenge target info allocation failure"); + return -ENOMEM; + } + + blobptr = ses->auth_key.response; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); + attrptr->length = cpu_to_le16(2 * dlen); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); + + blobptr += 2 * dlen; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME); + attrptr->length = cpu_to_le16(2 * wlen); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp); + + blobptr += 2 * wlen; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME); + attrptr->length = cpu_to_le16(2 * dlen); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); + + blobptr += 2 * dlen; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME); + attrptr->length = cpu_to_le16(2 * wlen); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp); + + blobptr += 2 * wlen; + attrptr = (struct ntlmssp2_name *) blobptr; + + attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP); + attrptr->length = cpu_to_le16(sizeof(__le64)); + blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); + curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + memcpy(blobptr, &curtime, sizeof(__le64)); + + return 0; +} + +/* Server has provided av pairs/target info in the type 2 challenge + * packet and we have plucked it and stored within smb session. + * We parse that blob here to find netbios domain name to be used + * as part of ntlmv2 authentication (in Target String), if not already + * specified on the command line. + * If this function returns without any error but without fetching + * domain name, authentication may fail against some server but + * may not fail against other (those who are not very particular + * about target string i.e. for some, just user name might suffice. 
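build_avpair_blob() above serializes NTLMSSP AV_PAIR entries: a 2-byte little-endian attribute type, a 2-byte little-endian value length, then the value itself (UTF-16LE text for the name attributes, a 64-bit NT timestamp for NTLMSSP_AV_TIMESTAMP), with a type of 0 terminating the list. That is why the allocation is sized as six headers (five attributes plus the end marker) plus two UTF-16 copies of the domain name, two of the hostname, and 8 bytes of timestamp. A minimal userspace sketch of emitting one such pair, not part of the patch:

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* Append one AV_PAIR at buf and return the number of bytes written;
 * the caller supplies the value already encoded (e.g. UTF-16LE text). */
static size_t put_avpair(uint8_t *buf, uint16_t type,
			 const void *val, uint16_t len)
{
	buf[0] = (uint8_t)(type & 0xff);   /* attribute type, little endian */
	buf[1] = (uint8_t)(type >> 8);
	buf[2] = (uint8_t)(len & 0xff);    /* value length in bytes         */
	buf[3] = (uint8_t)(len >> 8);
	memcpy(buf + 4, val, len);
	return 4u + len;
}

find_domain_name() below walks the same layout when parsing the server's challenge, stopping at NTLMSSP_AV_EOL and pulling out the NetBIOS domain name when one was not supplied at mount time.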
+ */ +static int +find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + unsigned int attrsize; + unsigned int type; + unsigned int onesize = sizeof(struct ntlmssp2_name); + unsigned char *blobptr; + unsigned char *blobend; + struct ntlmssp2_name *attrptr; + + if (!ses->auth_key.len || !ses->auth_key.response) + return 0; + + blobptr = ses->auth_key.response; + blobend = blobptr + ses->auth_key.len; + + while (blobptr + onesize < blobend) { + attrptr = (struct ntlmssp2_name *) blobptr; + type = le16_to_cpu(attrptr->type); + if (type == NTLMSSP_AV_EOL) + break; + blobptr += 2; /* advance attr type */ + attrsize = le16_to_cpu(attrptr->length); + blobptr += 2; /* advance attr size */ + if (blobptr + attrsize > blobend) + break; + if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { + if (!attrsize) + break; + if (!ses->domainName) { + ses->domainName = + kmalloc(attrsize + 1, GFP_KERNEL); + if (!ses->domainName) + return -ENOMEM; + cifs_from_ucs2(ses->domainName, + (__le16 *)blobptr, attrsize, attrsize, + nls_cp, false); + break; + } + } + blobptr += attrsize; /* advance attr value */ + } + + return 0; +} + +static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, + const struct nls_table *nls_cp) +{ + int rc = 0; + int len; + char nt_hash[CIFS_NTHASH_SIZE]; + wchar_t *user; + wchar_t *domain; + wchar_t *server; + + if (!ses->server->secmech.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } + + /* calculate md4 hash of password */ + E_md4hash(ses->password, nt_hash); + + crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + CIFS_NTHASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n"); + return rc; + } + + /* convert ses->user_name to unicode and uppercase */ + len = strlen(ses->user_name); + user = kmalloc(2 + (len * 2), GFP_KERNEL); + if (user == NULL) { + cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); + rc = -ENOMEM; + goto calc_exit_2; + } + len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); + UniStrupr(user); + + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)user, 2 * len); + + /* convert ses->domainName to unicode and uppercase */ + if (ses->domainName) { + len = strlen(ses->domainName); + + domain = kmalloc(2 + (len * 2), GFP_KERNEL); + if (domain == NULL) { + cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); + rc = -ENOMEM; + goto calc_exit_1; + } + len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, + nls_cp); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)domain, 2 * len); + kfree(domain); + } else if (ses->serverName) { + len = strlen(ses->serverName); + + server = kmalloc(2 + (len * 2), GFP_KERNEL); + if (server == NULL) { + cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); + rc = -ENOMEM; + goto calc_exit_1; + } + len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, + nls_cp); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)server, 2 * len); + kfree(server); + } + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ntlmv2_hash); + +calc_exit_1: + kfree(user); +calc_exit_2: + return rc; +} + +static int +CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) +{ + int rc; + unsigned int offset = CIFS_SESS_KEY_SIZE + 8; + + if (!ses->server->secmech.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } + + 
crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "CalcNTLMv2_response: could not init hmacmd5"); + return rc; + } + + if (ses->server->secType == RawNTLMSSP) + memcpy(ses->auth_key.response + offset, + ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + else + memcpy(ses->auth_key.response + offset, + ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + offset, ses->auth_key.len - offset); + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + CIFS_SESS_KEY_SIZE); + + return rc; +} + + +int +setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) +{ + int rc; + int baselen; + unsigned int tilen; + struct ntlmv2_resp *buf; + char ntlmv2_hash[16]; + unsigned char *tiblob = NULL; /* target info blob */ + + if (ses->server->secType == RawNTLMSSP) { + if (!ses->domainName) { + rc = find_domain_name(ses, nls_cp); + if (rc) { + cERROR(1, "error %d finding domain name", rc); + goto setup_ntlmv2_rsp_ret; + } + } + } else { + rc = build_avpair_blob(ses, nls_cp); + if (rc) { + cERROR(1, "error %d building av pair blob", rc); + goto setup_ntlmv2_rsp_ret; + } + } + + baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); + tilen = ses->auth_key.len; + tiblob = ses->auth_key.response; + + ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL); + if (!ses->auth_key.response) { + rc = ENOMEM; + ses->auth_key.len = 0; + cERROR(1, "%s: Can't allocate auth blob", __func__); + goto setup_ntlmv2_rsp_ret; + } + ses->auth_key.len += baselen; + + buf = (struct ntlmv2_resp *) + (ses->auth_key.response + CIFS_SESS_KEY_SIZE); + buf->blob_signature = cpu_to_le32(0x00000101); + buf->reserved = 0; + buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); + buf->reserved2 = 0; + + memcpy(ses->auth_key.response + baselen, tiblob, tilen); + + /* calculate ntlmv2_hash */ + rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); + if (rc) { + cERROR(1, "could not get v2 hash rc %d", rc); + goto setup_ntlmv2_rsp_ret; + } + + /* calculate first part of the client response (CR1) */ + rc = CalcNTLMv2_response(ses, ntlmv2_hash); + if (rc) { + cERROR(1, "Could not calculate CR1 rc: %d", rc); + goto setup_ntlmv2_rsp_ret; + } + + /* now calculate the session key for NTLMv2 */ + crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "%s: Could not init hmacmd5\n", __func__); + goto setup_ntlmv2_rsp_ret; + } + + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response); + +setup_ntlmv2_rsp_ret: + kfree(tiblob); + + return rc; +} + +int +calc_seckey(struct cifs_ses *ses) +{ + int rc; + struct crypto_blkcipher *tfm_arc4; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ + + get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); + + tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_arc4)) { + rc = PTR_ERR(tfm_arc4); + cERROR(1, "could not allocate crypto API arc4\n"); + return rc; + 
} + + desc.tfm = tfm_arc4; + + crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + CIFS_SESS_KEY_SIZE); + + sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); + sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); + if (rc) { + cERROR(1, "could not encrypt session key rc: %d\n", rc); + crypto_free_blkcipher(tfm_arc4); + return rc; + } + + /* make secondary_key/nonce as session key */ + memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE); + /* and make len as that of session key only */ + ses->auth_key.len = CIFS_SESS_KEY_SIZE; + + crypto_free_blkcipher(tfm_arc4); + + return 0; +} + +void +cifs_crypto_shash_release(struct TCP_Server_Info *server) +{ + if (server->secmech.md5) + crypto_free_shash(server->secmech.md5); + + if (server->secmech.hmacmd5) + crypto_free_shash(server->secmech.hmacmd5); + + kfree(server->secmech.sdeschmacmd5); + + kfree(server->secmech.sdescmd5); +} + +int +cifs_crypto_shash_allocate(struct TCP_Server_Info *server) +{ + int rc; + unsigned int size; + + server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); + if (IS_ERR(server->secmech.hmacmd5)) { + cERROR(1, "could not allocate crypto hmacmd5\n"); + return PTR_ERR(server->secmech.hmacmd5); + } + + server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(server->secmech.md5)) { + cERROR(1, "could not allocate crypto md5\n"); + rc = PTR_ERR(server->secmech.md5); + goto crypto_allocate_md5_fail; + } + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.hmacmd5); + server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdeschmacmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n"); + rc = -ENOMEM; + goto crypto_allocate_hmacmd5_sdesc_fail; + } + server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5; + server->secmech.sdeschmacmd5->shash.flags = 0x0; + + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.md5); + server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdescmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n"); + rc = -ENOMEM; + goto crypto_allocate_md5_sdesc_fail; + } + server->secmech.sdescmd5->shash.tfm = server->secmech.md5; + server->secmech.sdescmd5->shash.flags = 0x0; + + return 0; + +crypto_allocate_md5_sdesc_fail: + kfree(server->secmech.sdeschmacmd5); + +crypto_allocate_hmacmd5_sdesc_fail: + crypto_free_shash(server->secmech.md5); + +crypto_allocate_md5_fail: + crypto_free_shash(server->secmech.hmacmd5); + + return rc; +} diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c new file mode 100644 index 00000000..53e7d721 --- /dev/null +++ b/fs/cifs/cifsfs.c @@ -0,0 +1,1215 @@ +/* + * fs/cifs/cifsfs.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Common Internet FileSystem (CIFS) client + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Note that BB means BUGBUG (ie something to fix eventually) */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#define DECLARE_GLOBALS_HERE +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include +#include +#include "cifs_spnego.h" +#include "fscache.h" +#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ + +int cifsFYI = 0; +int cifsERROR = 1; +int traceSMB = 0; +unsigned int oplockEnabled = 1; +unsigned int linuxExtEnabled = 1; +unsigned int lookupCacheEnabled = 1; +unsigned int multiuser_mount = 0; +unsigned int global_secflags = CIFSSEC_DEF; +/* unsigned int ntlmv2_support = 0; */ +unsigned int sign_CIFS_PDUs = 1; +static const struct super_operations cifs_super_ops; +unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; +module_param(CIFSMaxBufSize, int, 0); +MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " + "Default: 16384 Range: 8192 to 130048"); +unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; +module_param(cifs_min_rcv, int, 0); +MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " + "1 to 64"); +unsigned int cifs_min_small = 30; +module_param(cifs_min_small, int, 0); +MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " + "Range: 2 to 256"); +unsigned int cifs_max_pending = CIFS_MAX_REQ; +module_param(cifs_max_pending, int, 0); +MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " + "Default: 50 Range: 2 to 256"); +unsigned short echo_retries = 5; +module_param(echo_retries, ushort, 0644); +MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " + "reconnecting server. Default: 5. 
0 means " + "never reconnect."); +extern mempool_t *cifs_sm_req_poolp; +extern mempool_t *cifs_req_poolp; +extern mempool_t *cifs_mid_poolp; + +void +cifs_sb_active(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + +static int +cifs_read_super(struct super_block *sb) +{ + struct inode *inode; + struct cifs_sb_info *cifs_sb; + int rc = 0; + + cifs_sb = CIFS_SB(sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) + sb->s_flags |= MS_POSIXACL; + + if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; + + /* BB FIXME fix time_gran to be larger for LANMAN sessions */ + sb->s_time_gran = 100; + + sb->s_magic = CIFS_MAGIC_NUMBER; + sb->s_op = &cifs_super_ops; + sb->s_bdi = &cifs_sb->bdi; + sb->s_blocksize = CIFS_MAX_MSGSIZE; + sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ + inode = cifs_root_iget(sb); + + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + inode = NULL; + goto out_no_root; + } + + sb->s_root = d_alloc_root(inode); + + if (!sb->s_root) { + rc = -ENOMEM; + goto out_no_root; + } + + /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */ + if (cifs_sb_master_tcon(cifs_sb)->nocase) + sb->s_d_op = &cifs_ci_dentry_ops; + else + sb->s_d_op = &cifs_dentry_ops; + +#ifdef CIFS_NFSD_EXPORT + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cFYI(1, "export ops supported"); + sb->s_export_op = &cifs_export_ops; + } +#endif /* CIFS_NFSD_EXPORT */ + + return 0; + +out_no_root: + cERROR(1, "cifs_read_super: get root inode failed"); + if (inode) + iput(inode); + + return rc; +} + +static void cifs_kill_sb(struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + kill_anon_super(sb); + cifs_umount(cifs_sb); +} + +static int +cifs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + int rc = -EOPNOTSUPP; + int xid; + + xid = GetXid(); + + buf->f_type = CIFS_MAGIC_NUMBER; + + /* + * PATH_MAX may be too long - it would presumably be total path, + * but note that some servers (including Samba 3) have a shorter + * maximum path. + * + * Instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO. + */ + buf->f_namelen = PATH_MAX; + buf->f_files = 0; /* undefined */ + buf->f_ffree = 0; /* unlimited */ + + /* + * We could add a second check for a QFS Unix capability bit + */ + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability))) + rc = CIFSSMBQFSPosixInfo(xid, tcon, buf); + + /* + * Only need to call the old QFSInfo if failed on newer one, + * e.g. by OS/2. 
+ **/ + if (rc && (tcon->ses->capabilities & CAP_NT_SMBS)) + rc = CIFSSMBQFSInfo(xid, tcon, buf); + + /* + * Some old Windows servers also do not support level 103, retry with + * older level one if old server failed the previous call or we + * bypassed it because we detected that this was an older LANMAN sess + */ + if (rc) + rc = SMBOldQFSInfo(xid, tcon, buf); + + FreeXid(xid); + return 0; +} + +static int cifs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct cifs_sb_info *cifs_sb; + + cifs_sb = CIFS_SB(inode->i_sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { + if ((mask & MAY_EXEC) && !execute_ok(inode)) + return -EACCES; + else + return 0; + } else /* file mode might have been restricted at mount time + on the client (above and beyond ACL on servers) for + servers which do not support setting and viewing mode bits, + so allowing client to check permissions is useful */ + return generic_permission(inode, mask, flags, NULL); +} + +static struct kmem_cache *cifs_inode_cachep; +static struct kmem_cache *cifs_req_cachep; +static struct kmem_cache *cifs_mid_cachep; +static struct kmem_cache *cifs_sm_req_cachep; +mempool_t *cifs_sm_req_poolp; +mempool_t *cifs_req_poolp; +mempool_t *cifs_mid_poolp; + +static struct inode * +cifs_alloc_inode(struct super_block *sb) +{ + struct cifsInodeInfo *cifs_inode; + cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL); + if (!cifs_inode) + return NULL; + cifs_inode->cifsAttrs = 0x20; /* default */ + cifs_inode->time = 0; + /* Until the file is open and we have gotten oplock + info back from the server, can not assume caching of + file data or metadata */ + cifs_set_oplock_level(cifs_inode, 0); + cifs_inode->delete_pending = false; + cifs_inode->invalid_mapping = false; + cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ + cifs_inode->server_eof = 0; + cifs_inode->uniqueid = 0; + cifs_inode->createtime = 0; + + /* Can not set i_flags here - they get immediately overwritten + to zero by the VFS */ +/* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;*/ + INIT_LIST_HEAD(&cifs_inode->openFileList); + return &cifs_inode->vfs_inode; +} + +static void cifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); +} + +static void +cifs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, cifs_i_callback); +} + +static void +cifs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + cifs_fscache_release_inode_cookie(inode); +} + +static void +cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) +{ + struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; + + seq_printf(s, ",addr="); + + switch (server->dstaddr.ss_family) { + case AF_INET: + seq_printf(s, "%pI4", &sa->sin_addr.s_addr); + break; + case AF_INET6: + seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr); + if (sa6->sin6_scope_id) + seq_printf(s, "%%%u", sa6->sin6_scope_id); + break; + default: + seq_printf(s, "(unknown)"); + } +} + +static void +cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) +{ + seq_printf(s, ",sec="); + + switch (server->secType) { + case LANMAN: + seq_printf(s, "lanman"); + break; + case NTLMv2: + seq_printf(s, "ntlmv2"); + break; + case NTLM: + seq_printf(s, "ntlm"); + break; + case Kerberos: + 
seq_printf(s, "krb5"); + break; + case RawNTLMSSP: + seq_printf(s, "ntlmssp"); + break; + default: + /* shouldn't ever happen */ + seq_printf(s, "unknown"); + break; + } + + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + seq_printf(s, "i"); +} + +/* + * cifs_show_options() is for displaying mount options in /proc/mounts. + * Not all settable options are displayed but most of the important + * ones are. + */ +static int +cifs_show_options(struct seq_file *s, struct vfsmount *m) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct sockaddr *srcaddr; + srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; + + cifs_show_security(s, tcon->ses->server); + + seq_printf(s, ",unc=%s", tcon->treeName); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) + seq_printf(s, ",multiuser"); + else if (tcon->ses->user_name) + seq_printf(s, ",username=%s", tcon->ses->user_name); + + if (tcon->ses->domainName) + seq_printf(s, ",domain=%s", tcon->ses->domainName); + + if (srcaddr->sa_family != AF_UNSPEC) { + struct sockaddr_in *saddr4; + struct sockaddr_in6 *saddr6; + saddr4 = (struct sockaddr_in *)srcaddr; + saddr6 = (struct sockaddr_in6 *)srcaddr; + if (srcaddr->sa_family == AF_INET6) + seq_printf(s, ",srcaddr=%pI6c", + &saddr6->sin6_addr); + else if (srcaddr->sa_family == AF_INET) + seq_printf(s, ",srcaddr=%pI4", + &saddr4->sin_addr.s_addr); + else + seq_printf(s, ",srcaddr=BAD-AF:%i", + (int)(srcaddr->sa_family)); + } + + seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) + seq_printf(s, ",forceuid"); + else + seq_printf(s, ",noforceuid"); + + seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) + seq_printf(s, ",forcegid"); + else + seq_printf(s, ",noforcegid"); + + cifs_show_address(s, tcon->ses->server); + + if (!tcon->unix_ext) + seq_printf(s, ",file_mode=0%o,dir_mode=0%o", + cifs_sb->mnt_file_mode, + cifs_sb->mnt_dir_mode); + if (tcon->seal) + seq_printf(s, ",seal"); + if (tcon->nocase) + seq_printf(s, ",nocase"); + if (tcon->retry) + seq_printf(s, ",hard"); + if (tcon->unix_ext) + seq_printf(s, ",unix"); + else + seq_printf(s, ",nounix"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + seq_printf(s, ",posixpaths"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) + seq_printf(s, ",setuids"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + seq_printf(s, ",serverino"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + seq_printf(s, ",rwpidforward"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) + seq_printf(s, ",forcemand"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) + seq_printf(s, ",directio"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + seq_printf(s, ",nouser_xattr"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) + seq_printf(s, ",mapchars"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + seq_printf(s, ",sfu"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + seq_printf(s, ",nobrl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) + seq_printf(s, ",cifsacl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + seq_printf(s, ",dynperm"); + if (m->mnt_sb->s_flags & MS_POSIXACL) + seq_printf(s, ",acl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + seq_printf(s, ",mfsymlinks"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + seq_printf(s, ",fsc"); + + seq_printf(s, ",rsize=%d", cifs_sb->rsize); + 
seq_printf(s, ",wsize=%d", cifs_sb->wsize); + /* convert actimeo and display it in seconds */ + seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); + + return 0; +} + +static void cifs_umount_begin(struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon; + + if (cifs_sb == NULL) + return; + + tcon = cifs_sb_master_tcon(cifs_sb); + + spin_lock(&cifs_tcp_ses_lock); + if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { + /* we have other mounts to same share or we have + already tried to force umount this and woken up + all waiting network requests, nothing to do */ + spin_unlock(&cifs_tcp_ses_lock); + return; + } else if (tcon->tc_count == 1) + tcon->tidStatus = CifsExiting; + spin_unlock(&cifs_tcp_ses_lock); + + /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ + /* cancel_notify_requests(tcon); */ + if (tcon->ses && tcon->ses->server) { + cFYI(1, "wake up tasks now - umount begin not complete"); + wake_up_all(&tcon->ses->server->request_q); + wake_up_all(&tcon->ses->server->response_q); + msleep(1); /* yield */ + /* we have to kick the requests once more */ + wake_up_all(&tcon->ses->server->response_q); + msleep(1); + } + + return; +} + +#ifdef CONFIG_CIFS_STATS2 +static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt) +{ + /* BB FIXME */ + return 0; +} +#endif + +static int cifs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NODIRATIME; + return 0; +} + +static int cifs_drop_inode(struct inode *inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + /* no serverino => unconditional eviction */ + return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || + generic_drop_inode(inode); +} + +static const struct super_operations cifs_super_ops = { + .statfs = cifs_statfs, + .alloc_inode = cifs_alloc_inode, + .destroy_inode = cifs_destroy_inode, + .drop_inode = cifs_drop_inode, + .evict_inode = cifs_evict_inode, +/* .delete_inode = cifs_delete_inode, */ /* Do not need above + function unless later we add lazy close of inodes or unless the + kernel forgets to call us with the same number of releases (closes) + as opens */ + .show_options = cifs_show_options, + .umount_begin = cifs_umount_begin, + .remount_fs = cifs_remount, +#ifdef CONFIG_CIFS_STATS2 + .show_stats = cifs_show_stats, +#endif +}; + +/* + * Get root dentry from superblock according to prefix path mount option. + * Return dentry with refcount + 1 on success and NULL otherwise. 
+ */ +static struct dentry * +cifs_get_root(struct smb_vol *vol, struct super_block *sb) +{ + struct dentry *dentry; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + char *full_path = NULL; + char *s, *p; + char sep; + int xid; + + full_path = cifs_build_path_to_root(vol, cifs_sb, + cifs_sb_master_tcon(cifs_sb)); + if (full_path == NULL) + return ERR_PTR(-ENOMEM); + + cFYI(1, "Get root dentry for %s", full_path); + + xid = GetXid(); + sep = CIFS_DIR_SEP(cifs_sb); + dentry = dget(sb->s_root); + p = s = full_path; + + do { + struct inode *dir = dentry->d_inode; + struct dentry *child; + + if (!dir) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + break; + } + + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + p = s++; + /* next separator */ + while (*s && *s != sep) + s++; + + mutex_lock(&dir->i_mutex); + child = lookup_one_len(p, dentry, s - p); + mutex_unlock(&dir->i_mutex); + dput(dentry); + dentry = child; + } while (!IS_ERR(dentry)); + _FreeXid(xid); + kfree(full_path); + return dentry; +} + +static int cifs_set_super(struct super_block *sb, void *data) +{ + struct cifs_mnt_data *mnt_data = data; + sb->s_fs_info = mnt_data->cifs_sb; + return set_anon_super(sb, NULL); +} + +static struct dentry * +cifs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int rc; + struct super_block *sb; + struct cifs_sb_info *cifs_sb; + struct smb_vol *volume_info; + struct cifs_mnt_data mnt_data; + struct dentry *root; + + cFYI(1, "Devname: %s flags: %d ", dev_name, flags); + + volume_info = cifs_get_volume_info((char *)data, dev_name); + if (IS_ERR(volume_info)) + return ERR_CAST(volume_info); + + cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); + if (cifs_sb == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_nls; + } + + cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); + if (cifs_sb->mountdata == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_cifs_sb; + } + + cifs_setup_cifs_sb(volume_info, cifs_sb); + + rc = cifs_mount(cifs_sb, volume_info); + if (rc) { + if (!(flags & MS_SILENT)) + cERROR(1, "cifs_mount failed w/return code = %d", rc); + root = ERR_PTR(rc); + goto out_mountdata; + } + + mnt_data.vol = volume_info; + mnt_data.cifs_sb = cifs_sb; + mnt_data.flags = flags; + + sb = sget(fs_type, cifs_match_super, cifs_set_super, &mnt_data); + if (IS_ERR(sb)) { + root = ERR_CAST(sb); + cifs_umount(cifs_sb); + goto out; + } + + if (sb->s_root) { + cFYI(1, "Use existing superblock"); + cifs_umount(cifs_sb); + } else { + sb->s_flags = flags; + /* BB should we make this contingent on mount parm? 
*/ + sb->s_flags |= MS_NODIRATIME | MS_NOATIME; + + rc = cifs_read_super(sb); + if (rc) { + root = ERR_PTR(rc); + goto out_super; + } + + sb->s_flags |= MS_ACTIVE; + } + + root = cifs_get_root(volume_info, sb); + if (IS_ERR(root)) + goto out_super; + + cFYI(1, "dentry root is: %p", root); + goto out; + +out_super: + deactivate_locked_super(sb); +out: + cifs_cleanup_volume_info(volume_info); + return root; + +out_mountdata: + kfree(cifs_sb->mountdata); +out_cifs_sb: + kfree(cifs_sb); +out_nls: + unload_nls(volume_info->local_nls); + goto out; +} + +static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + ssize_t written; + int rc; + + written = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if (CIFS_I(inode)->clientCanCacheAll) + return written; + + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode); + + return written; +} + +static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) +{ + /* origin == SEEK_END => we must revalidate the cached file length */ + if (origin == SEEK_END) { + int rc; + struct inode *inode = file->f_path.dentry->d_inode; + + /* + * We need to be sure that all dirty pages are written and the + * server has the newest file length. + */ + if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + inode->i_mapping->nrpages != 0) { + rc = filemap_fdatawait(inode->i_mapping); + if (rc) { + mapping_set_error(inode->i_mapping, rc); + return rc; + } + } + /* + * Some applications poll for the file length in this strange + * way so we must seek to end on non-oplocked files by + * setting the revalidate time to zero. + */ + CIFS_I(inode)->time = 0; + + rc = cifs_revalidate_file_attr(file); + if (rc < 0) + return (loff_t)rc; + } + return generic_file_llseek_unlocked(file, offset, origin); +} + +static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) +{ + /* note that this is called by vfs setlease with lock_flocks held + to protect *lease from going away */ + struct inode *inode = file->f_path.dentry->d_inode; + struct cifsFileInfo *cfile = file->private_data; + + if (!(S_ISREG(inode->i_mode))) + return -EINVAL; + + /* check if file is oplocked */ + if (((arg == F_RDLCK) && + (CIFS_I(inode)->clientCanCacheRead)) || + ((arg == F_WRLCK) && + (CIFS_I(inode)->clientCanCacheAll))) + return generic_setlease(file, arg, lease); + else if (tlink_tcon(cfile->tlink)->local_lease && + !CIFS_I(inode)->clientCanCacheRead) + /* If the server claims to support oplock on this + file, then we still need to check oplock even + if the local_lease mount option is set, but there + are servers which do not support oplock for which + this mount option may be useful if the user + knows that the file won't be changed on the server + by anyone else */ + return generic_setlease(file, arg, lease); + else + return -EAGAIN; +} + +struct file_system_type cifs_fs_type = { + .owner = THIS_MODULE, + .name = "cifs", + .mount = cifs_do_mount, + .kill_sb = cifs_kill_sb, + /* .fs_flags */ +}; +const struct inode_operations cifs_dir_inode_ops = { + .create = cifs_create, + .lookup = cifs_lookup, + .getattr = cifs_getattr, + .unlink = cifs_unlink, + .link = cifs_hardlink, + .mkdir = cifs_mkdir, + .rmdir = cifs_rmdir, + .rename = cifs_rename, + .permission = cifs_permission, +/* revalidate:cifs_revalidate, */ + .setattr = cifs_setattr, + .symlink = cifs_symlink, + .mknod = 
cifs_mknod, +#ifdef CONFIG_CIFS_XATTR + .setxattr = cifs_setxattr, + .getxattr = cifs_getxattr, + .listxattr = cifs_listxattr, + .removexattr = cifs_removexattr, +#endif +}; + +const struct inode_operations cifs_file_inode_ops = { +/* revalidate:cifs_revalidate, */ + .setattr = cifs_setattr, + .getattr = cifs_getattr, /* do we need this anymore? */ + .rename = cifs_rename, + .permission = cifs_permission, +#ifdef CONFIG_CIFS_XATTR + .setxattr = cifs_setxattr, + .getxattr = cifs_getxattr, + .listxattr = cifs_listxattr, + .removexattr = cifs_removexattr, +#endif +}; + +const struct inode_operations cifs_symlink_inode_ops = { + .readlink = generic_readlink, + .follow_link = cifs_follow_link, + .put_link = cifs_put_link, + .permission = cifs_permission, + /* BB add the following two eventually */ + /* revalidate: cifs_revalidate, + setattr: cifs_notify_change, *//* BB do we need notify change */ +#ifdef CONFIG_CIFS_XATTR + .setxattr = cifs_setxattr, + .getxattr = cifs_getxattr, + .listxattr = cifs_listxattr, + .removexattr = cifs_removexattr, +#endif +}; + +const struct file_operations cifs_file_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = cifs_file_aio_write, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_strict_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_strict_readv, + .aio_write = cifs_strict_writev, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_direct_ops = { + /* BB reevaluate whether they can be done with directio, no cache */ + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_user_readv, + .aio_write = cifs_user_writev, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .llseek = cifs_llseek, + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_nobrl_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = cifs_file_aio_write, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_strict_nobrl_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_strict_readv, + .aio_write = cifs_strict_writev, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef 
CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_file_direct_nobrl_ops = { + /* BB reevaluate whether they can be done with directio, no cache */ + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_user_readv, + .aio_write = cifs_user_writev, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .llseek = cifs_llseek, + .setlease = cifs_setlease, +}; + +const struct file_operations cifs_dir_ops = { + .readdir = cifs_readdir, + .release = cifs_closedir, + .read = generic_read_dir, + .unlocked_ioctl = cifs_ioctl, + .llseek = generic_file_llseek, +}; + +static void +cifs_init_once(void *inode) +{ + struct cifsInodeInfo *cifsi = inode; + + inode_init_once(&cifsi->vfs_inode); + INIT_LIST_HEAD(&cifsi->lockList); +} + +static int +cifs_init_inodecache(void) +{ + cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", + sizeof(struct cifsInodeInfo), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + cifs_init_once); + if (cifs_inode_cachep == NULL) + return -ENOMEM; + + return 0; +} + +static void +cifs_destroy_inodecache(void) +{ + kmem_cache_destroy(cifs_inode_cachep); +} + +static int +cifs_init_request_bufs(void) +{ + if (CIFSMaxBufSize < 8192) { + /* Buffer size can not be smaller than 2 * PATH_MAX since maximum + Unicode path name has to fit in any SMB/CIFS path based frames */ + CIFSMaxBufSize = 8192; + } else if (CIFSMaxBufSize > 1024*127) { + CIFSMaxBufSize = 1024 * 127; + } else { + CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ + } +/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */ + cifs_req_cachep = kmem_cache_create("cifs_request", + CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (cifs_req_cachep == NULL) + return -ENOMEM; + + if (cifs_min_rcv < 1) + cifs_min_rcv = 1; + else if (cifs_min_rcv > 64) { + cifs_min_rcv = 64; + cERROR(1, "cifs_min_rcv set to maximum (64)"); + } + + cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, + cifs_req_cachep); + + if (cifs_req_poolp == NULL) { + kmem_cache_destroy(cifs_req_cachep); + return -ENOMEM; + } + /* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and + almost all handle based requests (but not write response, nor is it + sufficient for path based requests). 
A smaller size would have + been more efficient (compacting multiple slab items on one 4k page) + for the case in which debug was on, but this larger size allows + more SMBs to use small buffer alloc and is still much more + efficient to alloc 1 per page off the slab compared to 17K (5page) + alloc of large cifs buffers even when page debugging is on */ + cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq", + MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN, + NULL); + if (cifs_sm_req_cachep == NULL) { + mempool_destroy(cifs_req_poolp); + kmem_cache_destroy(cifs_req_cachep); + return -ENOMEM; + } + + if (cifs_min_small < 2) + cifs_min_small = 2; + else if (cifs_min_small > 256) { + cifs_min_small = 256; + cFYI(1, "cifs_min_small set to maximum (256)"); + } + + cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, + cifs_sm_req_cachep); + + if (cifs_sm_req_poolp == NULL) { + mempool_destroy(cifs_req_poolp); + kmem_cache_destroy(cifs_req_cachep); + kmem_cache_destroy(cifs_sm_req_cachep); + return -ENOMEM; + } + + return 0; +} + +static void +cifs_destroy_request_bufs(void) +{ + mempool_destroy(cifs_req_poolp); + kmem_cache_destroy(cifs_req_cachep); + mempool_destroy(cifs_sm_req_poolp); + kmem_cache_destroy(cifs_sm_req_cachep); +} + +static int +cifs_init_mids(void) +{ + cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids", + sizeof(struct mid_q_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (cifs_mid_cachep == NULL) + return -ENOMEM; + + /* 3 is a reasonable minimum number of simultaneous operations */ + cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep); + if (cifs_mid_poolp == NULL) { + kmem_cache_destroy(cifs_mid_cachep); + return -ENOMEM; + } + + return 0; +} + +static void +cifs_destroy_mids(void) +{ + mempool_destroy(cifs_mid_poolp); + kmem_cache_destroy(cifs_mid_cachep); +} + +static int __init +init_cifs(void) +{ + int rc = 0; + cifs_proc_init(); + INIT_LIST_HEAD(&cifs_tcp_ses_list); +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ + INIT_LIST_HEAD(&GlobalDnotifyReqList); + INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ +/* + * Initialize Global counters + */ + atomic_set(&sesInfoAllocCount, 0); + atomic_set(&tconInfoAllocCount, 0); + atomic_set(&tcpSesAllocCount, 0); + atomic_set(&tcpSesReconnectCount, 0); + atomic_set(&tconInfoReconnectCount, 0); + + atomic_set(&bufAllocCount, 0); + atomic_set(&smBufAllocCount, 0); +#ifdef CONFIG_CIFS_STATS2 + atomic_set(&totBufAllocCount, 0); + atomic_set(&totSmBufAllocCount, 0); +#endif /* CONFIG_CIFS_STATS2 */ + + atomic_set(&midCount, 0); + GlobalCurrentXid = 0; + GlobalTotalActiveXid = 0; + GlobalMaxActiveXid = 0; + spin_lock_init(&cifs_tcp_ses_lock); + spin_lock_init(&cifs_file_list_lock); + spin_lock_init(&GlobalMid_Lock); + + if (cifs_max_pending < 2) { + cifs_max_pending = 2; + cFYI(1, "cifs_max_pending set to min of 2"); + } else if (cifs_max_pending > 256) { + cifs_max_pending = 256; + cFYI(1, "cifs_max_pending set to max of 256"); + } + + rc = cifs_fscache_register(); + if (rc) + goto out_clean_proc; + + rc = cifs_init_inodecache(); + if (rc) + goto out_unreg_fscache; + + rc = cifs_init_mids(); + if (rc) + goto out_destroy_inodecache; + + rc = cifs_init_request_bufs(); + if (rc) + goto out_destroy_mids; + +#ifdef CONFIG_CIFS_UPCALL + rc = register_key_type(&cifs_spnego_key_type); + if (rc) + goto out_destroy_request_bufs; +#endif /* CONFIG_CIFS_UPCALL */ + +#ifdef CONFIG_CIFS_ACL + rc = init_cifs_idmap(); + if (rc) + goto 
out_register_key_type; +#endif /* CONFIG_CIFS_ACL */ + + rc = register_filesystem(&cifs_fs_type); + if (rc) + goto out_init_cifs_idmap; + + return 0; + +out_init_cifs_idmap: +#ifdef CONFIG_CIFS_ACL + exit_cifs_idmap(); +out_register_key_type: +#endif +#ifdef CONFIG_CIFS_UPCALL + unregister_key_type(&cifs_spnego_key_type); +out_destroy_request_bufs: +#endif + cifs_destroy_request_bufs(); +out_destroy_mids: + cifs_destroy_mids(); +out_destroy_inodecache: + cifs_destroy_inodecache(); +out_unreg_fscache: + cifs_fscache_unregister(); +out_clean_proc: + cifs_proc_clean(); + return rc; +} + +static void __exit +exit_cifs(void) +{ + cFYI(DBG2, "exit_cifs"); + cifs_proc_clean(); + cifs_fscache_unregister(); +#ifdef CONFIG_CIFS_DFS_UPCALL + cifs_dfs_release_automount_timer(); +#endif +#ifdef CONFIG_CIFS_ACL + cifs_destroy_idmaptrees(); + exit_cifs_idmap(); +#endif +#ifdef CONFIG_CIFS_UPCALL + unregister_key_type(&cifs_spnego_key_type); +#endif + unregister_filesystem(&cifs_fs_type); + cifs_destroy_inodecache(); + cifs_destroy_mids(); + cifs_destroy_request_bufs(); +} + +MODULE_AUTHOR("Steve French "); +MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */ +MODULE_DESCRIPTION + ("VFS to access servers complying with the SNIA CIFS Specification " + "e.g. Samba and Windows"); +MODULE_VERSION(CIFS_VERSION); +module_init(init_cifs) +module_exit(exit_cifs) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h new file mode 100644 index 00000000..036ca83e --- /dev/null +++ b/fs/cifs/cifsfs.h @@ -0,0 +1,133 @@ +/* + * fs/cifs/cifsfs.h + * + * Copyright (c) International Business Machines Corp., 2002, 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFSFS_H +#define _CIFSFS_H + +#define ROOT_I 2 + +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. 
+ */ +static inline ino_t +cifs_uniqueid_to_ino_t(u64 fileid) +{ + ino_t ino = (ino_t) fileid; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8; + return ino; +} + +extern struct file_system_type cifs_fs_type; +extern const struct address_space_operations cifs_addr_ops; +extern const struct address_space_operations cifs_addr_ops_smallbuf; + +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); + +/* Functions related to inodes */ +extern const struct inode_operations cifs_dir_inode_ops; +extern struct inode *cifs_root_iget(struct super_block *); +extern int cifs_create(struct inode *, struct dentry *, int, + struct nameidata *); +extern struct dentry *cifs_lookup(struct inode *, struct dentry *, + struct nameidata *); +extern int cifs_unlink(struct inode *dir, struct dentry *dentry); +extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); +extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); +extern int cifs_mkdir(struct inode *, struct dentry *, int); +extern int cifs_rmdir(struct inode *, struct dentry *); +extern int cifs_rename(struct inode *, struct dentry *, struct inode *, + struct dentry *); +extern int cifs_revalidate_file_attr(struct file *filp); +extern int cifs_revalidate_dentry_attr(struct dentry *); +extern int cifs_revalidate_file(struct file *filp); +extern int cifs_revalidate_dentry(struct dentry *); +extern int cifs_invalidate_mapping(struct inode *inode); +extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int cifs_setattr(struct dentry *, struct iattr *); + +extern const struct inode_operations cifs_file_inode_ops; +extern const struct inode_operations cifs_symlink_inode_ops; +extern const struct inode_operations cifs_dfs_referral_inode_operations; + + +/* Functions related to files and directories */ +extern const struct file_operations cifs_file_ops; +extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ +extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */ +extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */ +extern const struct file_operations cifs_file_direct_nobrl_ops; +extern const struct file_operations cifs_file_strict_nobrl_ops; +extern int cifs_open(struct inode *inode, struct file *file); +extern int cifs_close(struct inode *inode, struct file *file); +extern int cifs_closedir(struct inode *inode, struct file *file); +extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +extern int cifs_lock(struct file *, int, struct file_lock *); +extern int cifs_fsync(struct file *, int); +extern int cifs_strict_fsync(struct file *, int); +extern int cifs_flush(struct file *, fl_owner_t id); +extern int cifs_file_mmap(struct file * , struct vm_area_struct *); +extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); +extern const struct file_operations cifs_dir_ops; +extern int cifs_dir_open(struct inode *inode, struct file *file); +extern int cifs_readdir(struct file 
*file, void *direntry, filldir_t filldir); + +/* Functions related to dir entries */ +extern const struct dentry_operations cifs_dentry_ops; +extern const struct dentry_operations cifs_ci_dentry_ops; + +#ifdef CONFIG_CIFS_DFS_UPCALL +extern struct vfsmount *cifs_dfs_d_automount(struct path *path); +#else +#define cifs_dfs_d_automount NULL +#endif + +/* Functions related to symlinks */ +extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); +extern void cifs_put_link(struct dentry *direntry, + struct nameidata *nd, void *); +extern int cifs_readlink(struct dentry *direntry, char __user *buffer, + int buflen); +extern int cifs_symlink(struct inode *inode, struct dentry *direntry, + const char *symname); +extern int cifs_removexattr(struct dentry *, const char *); +extern int cifs_setxattr(struct dentry *, const char *, const void *, + size_t, int); +extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); +extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); + +#ifdef CIFS_NFSD_EXPORT +extern const struct export_operations cifs_export_ops; +#endif /* CIFS_NFSD_EXPORT */ + +#define CIFS_VERSION "1.74" +#endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h new file mode 100644 index 00000000..7cb9dd22 --- /dev/null +++ b/fs/cifs/cifsglob.h @@ -0,0 +1,951 @@ +/* + * fs/cifs/cifsglob.h + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + */ +#ifndef _CIFS_GLOB_H +#define _CIFS_GLOB_H + +#include +#include +#include +#include +#include "cifs_fs_sb.h" +#include "cifsacl.h" +#include +#include + +/* + * The sizes of various internal tables and strings + */ +#define MAX_UID_INFO 16 +#define MAX_SES_INFO 2 +#define MAX_TCON_INFO 4 + +#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) +#define MAX_SERVER_SIZE 15 +#define MAX_SHARE_SIZE 80 +#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */ +#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ + +#define CIFS_MIN_RCV_POOL 4 + +#define MAX_REOPEN_ATT 5 /* these many maximum attempts to reopen a file */ +/* + * default attribute cache timeout (jiffies) + */ +#define CIFS_DEF_ACTIMEO (1 * HZ) + +/* + * max attribute cache timeout (jiffies) - 2^30 + */ +#define CIFS_MAX_ACTIMEO (1 << 30) + +/* + * MAX_REQ is the maximum number of requests that WE will send + * on one socket concurrently. It also matches the most common + * value of max multiplex returned by servers. We may + * eventually want to use the negotiated value (in case + * future servers can handle more) when we are more confident that + * we will not have problems overloading the socket with pending + * write data. 
+ */ +#define CIFS_MAX_REQ 50 + +#define RFC1001_NAME_LEN 15 +#define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1) + +/* currently length of NIP6_FMT */ +#define SERVER_NAME_LENGTH 40 +#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) + +/* used to define string lengths for reversing unicode strings */ +/* (256+1)*2 = 514 */ +/* (max path length + 1 for null) * 2 for unicode */ +#define MAX_NAME 514 + +#include "cifspdu.h" + +#ifndef XATTR_DOS_ATTRIB +#define XATTR_DOS_ATTRIB "user.DOSATTRIB" +#endif + +/* + * CIFS vfs client Status information (based on what we know.) + */ + +/* associated with each tcp and smb session */ +enum statusEnum { + CifsNew = 0, + CifsGood, + CifsExiting, + CifsNeedReconnect, + CifsNeedNegotiate +}; + +enum securityEnum { + LANMAN = 0, /* Legacy LANMAN auth */ + NTLM, /* Legacy NTLM012 auth with NTLM hash */ + NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ + RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ +/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */ + Kerberos, /* Kerberos via SPNEGO */ +}; + +enum protocolEnum { + TCP = 0, + SCTP + /* Netbios frames protocol not supported at this time */ +}; + +struct session_key { + unsigned int len; + char *response; +}; + +/* crypto security descriptor definition */ +struct sdesc { + struct shash_desc shash; + char ctx[]; +}; + +/* crypto hashing related structure/fields, not specific to a sec mech */ +struct cifs_secmech { + struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ + struct crypto_shash *md5; /* md5 hash function */ + struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ + struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ +}; + +/* per smb session structure/fields */ +struct ntlmssp_auth { + __u32 client_flags; /* sent by client in type 1 ntlmssp exchange */ + __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */ + unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */ +}; + +struct cifs_cred { + int uid; + int gid; + int mode; + int cecount; + struct cifs_sid osid; + struct cifs_sid gsid; + struct cifs_ntace *ntaces; + struct cifs_ace *aces; +}; + +/* + ***************************************************************** + * Except the CIFS PDUs themselves all the + * globally interesting structs should go here + ***************************************************************** + */ + +struct smb_vol { + char *username; + char *password; + char *domainname; + char *UNC; + char *UNCip; + char *iocharset; /* local code page for mapping to and from Unicode */ + char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ + char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ + uid_t cred_uid; + uid_t linux_uid; + gid_t linux_gid; + mode_t file_mode; + mode_t dir_mode; + unsigned secFlg; + bool retry:1; + bool intr:1; + bool setuids:1; + bool override_uid:1; + bool override_gid:1; + bool dynperm:1; + bool noperm:1; + bool no_psx_acl:1; /* set if posix acl support should be disabled */ + bool cifs_acl:1; + bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ + bool server_ino:1; /* use inode numbers from server ie UniqueId */ + bool direct_io:1; + bool strict_io:1; /* strict cache behavior */ + bool remap:1; /* set to remap seven reserved chars in filenames */ + bool posix_paths:1; /* unset to not ask for posix pathnames. 
*/ + bool no_linux_ext:1; + bool sfu_emul:1; + bool nullauth:1; /* attempt to authenticate with null user */ + bool nocase:1; /* request case insensitive filenames */ + bool nobrl:1; /* disable sending byte range locks to srv */ + bool mand_lock:1; /* send mandatory not posix byte range lock reqs */ + bool seal:1; /* request transport encryption on share */ + bool nodfs:1; /* Do not request DFS, even if available */ + bool local_lease:1; /* check leases only on local system, not remote */ + bool noblocksnd:1; + bool noautotune:1; + bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ + bool fsc:1; /* enable fscache */ + bool mfsymlinks:1; /* use Minshall+French Symlinks */ + bool multiuser:1; + bool rwpidforward:1; /* pid forward for read/write operations */ + unsigned int rsize; + unsigned int wsize; + bool sockopt_tcp_nodelay:1; + unsigned short int port; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ + char *prepath; + struct sockaddr_storage srcaddr; /* allow binding to a local IP */ + struct nls_table *local_nls; +}; + +#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ + CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \ + CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \ + CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \ + CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \ + CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ + CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ + CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ + CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) + +#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ + MS_NODEV | MS_SYNCHRONOUS) + +struct cifs_mnt_data { + struct cifs_sb_info *cifs_sb; + struct smb_vol *vol; + int flags; +}; + +struct TCP_Server_Info { + struct list_head tcp_ses_list; + struct list_head smb_ses_list; + int srv_count; /* reference counter */ + /* 15 character server name + 0x20 16th byte indicating type = srv */ + char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; + enum statusEnum tcpStatus; /* what we think the status is */ + char *hostname; /* hostname portion of UNC string */ + struct socket *ssocket; + struct sockaddr_storage dstaddr; + struct sockaddr_storage srcaddr; /* locally bind to this IP */ +#ifdef CONFIG_NET_NS + struct net *net; +#endif + wait_queue_head_t response_q; + wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ + struct list_head pending_mid_q; + bool noblocksnd; /* use blocking sendmsg */ + bool noautotune; /* do not autotune send buf sizes */ + bool tcp_nodelay; + atomic_t inFlight; /* number of requests on the wire to server */ + struct mutex srv_mutex; + struct task_struct *tsk; + char server_GUID[16]; + char sec_mode; + bool session_estab; /* mark when very first sess is established */ + u16 dialect; /* dialect index that server chose */ + enum securityEnum secType; + unsigned int maxReq; /* Clients should submit no more */ + /* than maxReq distinct unanswered SMBs to the server when using */ + /* multiplexed reads or writes */ + unsigned int maxBuf; /* maxBuf specifies the maximum */ + /* message size the server can send or receive for non-raw SMBs */ + /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */ + /* when socket is setup (and during reconnect) before NegProt sent */ + unsigned int max_rw; /* maxRw specifies the maximum */ + /* message size the server can send or receive for */ + /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. 
*/ + unsigned int max_vcs; /* maximum number of smb sessions, at least + those that can be specified uniquely with + vcnumbers */ + int capabilities; /* allow selective disabling of caps by smb sess */ + int timeAdj; /* Adjust for difference in server time zone in sec */ + __u16 CurrentMid; /* multiplex id - rotating counter */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ + /* 16th byte of RFC1001 workstation name is always null */ + char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; + __u32 sequence_number; /* for signing, protected by srv_mutex */ + struct session_key session_key; + unsigned long lstrp; /* when we got last response from this server */ + struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ + /* extended security flavors that server supports */ + bool sec_ntlmssp; /* supports NTLMSSP */ + bool sec_kerberosu2u; /* supports U2U Kerberos */ + bool sec_kerberos; /* supports plain Kerberos */ + bool sec_mskerberos; /* supports legacy MS Kerberos */ + struct delayed_work echo; /* echo ping workqueue job */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; /* client index cache cookie */ +#endif +#ifdef CONFIG_CIFS_STATS2 + atomic_t inSend; /* requests trying to send */ + atomic_t num_waiters; /* blocked waiting to get in sendrecv */ +#endif +}; + +/* + * Macros to allow the TCP_Server_Info->net field and related code to drop out + * when CONFIG_NET_NS isn't set. + */ + +#ifdef CONFIG_NET_NS + +static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv) +{ + return srv->net; +} + +static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) +{ + srv->net = net; +} + +#else + +static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv) +{ + return &init_net; +} + +static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) +{ +} + +#endif + +/* + * Session structure. One of these for each uid session with a particular host + */ +struct cifs_ses { + struct list_head smb_ses_list; + struct list_head tcon_list; + struct mutex session_mutex; + struct TCP_Server_Info *server; /* pointer to server info */ + int ses_count; /* reference counter */ + enum statusEnum status; + unsigned overrideSecFlg; /* if non-zero override global sec flags */ + __u16 ipc_tid; /* special tid for connection to IPC share */ + __u16 flags; + __u16 vcnum; + char *serverOS; /* name of operating system underlying server */ + char *serverNOS; /* name of network operating system of server */ + char *serverDomain; /* security realm of server */ + int Suid; /* remote smb uid */ + uid_t linux_uid; /* overriding owner of files on the mount */ + uid_t cred_uid; /* owner of credentials */ + int capabilities; + char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for + TCP names - will ipv6 and sctp addresses fit? */ + char *user_name; /* must not be null except during init of sess + and after mount option parsing we fill it */ + char *domainName; + char *password; + struct session_key auth_key; + struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ + bool need_reconnect:1; /* connection reset, uid now invalid */ +}; +/* no more than one of the following three session flags may be set */ +#define CIFS_SES_NT4 1 +#define CIFS_SES_OS2 2 +#define CIFS_SES_W9X 4 +/* following flag is set for old servers such as OS2 (and Win95?) 
+ which do not negotiate NTLM or POSIX dialects, but instead + negotiate one of the older LANMAN dialects */ +#define CIFS_SES_LANMAN 8 +/* + * there is one of these for each connection to a resource on a particular + * session + */ +struct cifs_tcon { + struct list_head tcon_list; + int tc_count; + struct list_head openFileList; + struct cifs_ses *ses; /* pointer to session associated with */ + char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ + char *nativeFileSystem; + char *password; /* for share-level security */ + __u16 tid; /* The 2 byte tree id */ + __u16 Flags; /* optional support bits */ + enum statusEnum tidStatus; +#ifdef CONFIG_CIFS_STATS + atomic_t num_smbs_sent; + atomic_t num_writes; + atomic_t num_reads; + atomic_t num_flushes; + atomic_t num_oplock_brks; + atomic_t num_opens; + atomic_t num_closes; + atomic_t num_deletes; + atomic_t num_mkdirs; + atomic_t num_posixopens; + atomic_t num_posixmkdirs; + atomic_t num_rmdirs; + atomic_t num_renames; + atomic_t num_t2renames; + atomic_t num_ffirst; + atomic_t num_fnext; + atomic_t num_fclose; + atomic_t num_hardlinks; + atomic_t num_symlinks; + atomic_t num_locks; + atomic_t num_acl_get; + atomic_t num_acl_set; +#ifdef CONFIG_CIFS_STATS2 + unsigned long long time_writes; + unsigned long long time_reads; + unsigned long long time_opens; + unsigned long long time_deletes; + unsigned long long time_closes; + unsigned long long time_mkdirs; + unsigned long long time_rmdirs; + unsigned long long time_renames; + unsigned long long time_t2renames; + unsigned long long time_ffirst; + unsigned long long time_fnext; + unsigned long long time_fclose; +#endif /* CONFIG_CIFS_STATS2 */ + __u64 bytes_read; + __u64 bytes_written; + spinlock_t stat_lock; +#endif /* CONFIG_CIFS_STATS */ + FILE_SYSTEM_DEVICE_INFO fsDevInfo; + FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ + FILE_SYSTEM_UNIX_INFO fsUnixInfo; + bool ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */ + bool retry:1; + bool nocase:1; + bool seal:1; /* transport encryption for this mounted share */ + bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol + for this mount even if server would support */ + bool local_lease:1; /* check leases (only) on local system not remote */ + bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ + bool need_reconnect:1; /* connection reset, tid now invalid */ +#ifdef CONFIG_CIFS_FSCACHE + u64 resource_id; /* server resource id */ + struct fscache_cookie *fscache; /* cookie for share */ +#endif + /* BB add field for back pointer to sb struct(s)? */ +}; + +/* + * This is a refcounted and timestamped container for a tcon pointer. The + * container holds a tcon reference. It is considered safe to free one of + * these when the tl_count goes to 0. The tl_time is the time of the last + * "get" on the container. 
+ */ +struct tcon_link { + struct rb_node tl_rbnode; + uid_t tl_uid; + unsigned long tl_flags; +#define TCON_LINK_MASTER 0 +#define TCON_LINK_PENDING 1 +#define TCON_LINK_IN_TREE 2 + unsigned long tl_time; + atomic_t tl_count; + struct cifs_tcon *tl_tcon; +}; + +extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb); + +static inline struct cifs_tcon * +tlink_tcon(struct tcon_link *tlink) +{ + return tlink->tl_tcon; +} + +extern void cifs_put_tlink(struct tcon_link *tlink); + +static inline struct tcon_link * +cifs_get_tlink(struct tcon_link *tlink) +{ + if (tlink && !IS_ERR(tlink)) + atomic_inc(&tlink->tl_count); + return tlink; +} + +/* This function is always expected to succeed */ +extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); + +/* + * This info hangs off the cifsFileInfo structure, pointed to by llist. + * This is used to track byte stream locks on the file + */ +struct cifsLockInfo { + struct list_head llist; /* pointer to next cifsLockInfo */ + __u64 offset; + __u64 length; + __u8 type; +}; + +/* + * One of these for each open instance of a file + */ +struct cifs_search_info { + loff_t index_of_last_entry; + __u16 entries_in_buffer; + __u16 info_level; + __u32 resume_key; + char *ntwrk_buf_start; + char *srch_entries_start; + char *last_entry; + char *presume_name; + unsigned int resume_name_len; + bool endOfSearch:1; + bool emptyDir:1; + bool unicode:1; + bool smallBuf:1; /* so we know which buf_release function to call */ +}; + +struct cifsFileInfo { + struct list_head tlist; /* pointer to next fid owned by tcon */ + struct list_head flist; /* next fid (file instance) for this inode */ + unsigned int uid; /* allows finding which FileInfo structure */ + __u32 pid; /* process id who opened file */ + __u16 netfid; /* file id from remote */ + /* BB add lock scope info here if needed */ ; + /* lock scope id (0 if none) */ + struct dentry *dentry; + unsigned int f_flags; + struct tcon_link *tlink; + struct mutex lock_mutex; + struct list_head llist; /* list of byte range locks we have. */ + bool invalidHandle:1; /* file closed via session abend */ + bool oplock_break_cancelled:1; + int count; /* refcount protected by cifs_file_list_lock */ + struct mutex fh_mutex; /* prevents reopen race after dead ses*/ + struct cifs_search_info srch_inf; + struct work_struct oplock_break; /* work for oplock breaks */ +}; + +struct cifs_io_parms { + __u16 netfid; + __u32 pid; + __u64 offset; + unsigned int length; + struct cifs_tcon *tcon; +}; + +/* + * Take a reference on the file private data. Must be called with + * cifs_file_list_lock held. + */ +static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) +{ + ++cifs_file->count; +} + +void cifsFileInfo_put(struct cifsFileInfo *cifs_file); + +/* + * One of these for each file inode + */ + +struct cifsInodeInfo { + struct list_head lockList; + /* BB add in lists for dirty pages i.e. write caching info for oplock */ + struct list_head openFileList; + __u32 cifsAttrs; /* e.g. 
DOS archive bit, sparse, compressed, system */ + bool clientCanCacheRead; /* read oplock */ + bool clientCanCacheAll; /* read and writebehind oplock */ + bool delete_pending; /* DELETE_ON_CLOSE is set */ + bool invalid_mapping; /* pagecache is invalid */ + unsigned long time; /* jiffies of last update of inode */ + u64 server_eof; /* current file size on server */ + u64 uniqueid; /* server inode number */ + u64 createtime; /* creation time on server */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; +#endif + struct inode vfs_inode; +}; + +static inline struct cifsInodeInfo * +CIFS_I(struct inode *inode) +{ + return container_of(inode, struct cifsInodeInfo, vfs_inode); +} + +static inline struct cifs_sb_info * +CIFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + return '/'; + else + return '\\'; +} + +static inline void +convert_delimiter(char *path, char delim) +{ + int i; + char old_delim; + + if (path == NULL) + return; + + if (delim == '/') + old_delim = '\\'; + else + old_delim = '/'; + + for (i = 0; path[i] != '\0'; i++) { + if (path[i] == old_delim) + path[i] = delim; + } +} + +#ifdef CONFIG_CIFS_STATS +#define cifs_stats_inc atomic_inc + +static inline void cifs_stats_bytes_written(struct cifs_tcon *tcon, + unsigned int bytes) +{ + if (bytes) { + spin_lock(&tcon->stat_lock); + tcon->bytes_written += bytes; + spin_unlock(&tcon->stat_lock); + } +} + +static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon, + unsigned int bytes) +{ + spin_lock(&tcon->stat_lock); + tcon->bytes_read += bytes; + spin_unlock(&tcon->stat_lock); +} +#else + +#define cifs_stats_inc(field) do {} while (0) +#define cifs_stats_bytes_written(tcon, bytes) do {} while (0) +#define cifs_stats_bytes_read(tcon, bytes) do {} while (0) + +#endif + +struct mid_q_entry; + +/* + * This is the prototype for the mid callback function. When creating one, + * take special care to avoid deadlocks. 
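A small usage sketch of CIFS_DIR_SEP() and convert_delimiter() may help: the POSIX-paths mount flag picks the separator, and convert_delimiter() rewrites a path copy in place. example_fix_path is a made-up name, and the sketch assumes the usual kstrdup()/GFP_KERNEL slab helpers are available.

static char *example_fix_path(struct cifs_sb_info *cifs_sb, const char *path)
{
	char *copy = kstrdup(path, GFP_KERNEL);

	if (copy)
		convert_delimiter(copy, CIFS_DIR_SEP(cifs_sb));
	return copy;	/* '/'-separated on POSIX-path mounts, '\\' otherwise */
}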
Things to bear in mind: + * + * - it will be called by cifsd, with no locks held + * - the mid will be removed from any lists + */ +typedef void (mid_callback_t)(struct mid_q_entry *mid); + +/* one of these for every pending CIFS request to the server */ +struct mid_q_entry { + struct list_head qhead; /* mids waiting on reply from this server */ + __u16 mid; /* multiplex id */ + __u16 pid; /* process id */ + __u32 sequence_number; /* for CIFS signing */ + unsigned long when_alloc; /* when mid was created */ +#ifdef CONFIG_CIFS_STATS2 + unsigned long when_sent; /* time when smb send finished */ + unsigned long when_received; /* when demux complete (taken off wire) */ +#endif + mid_callback_t *callback; /* call completion callback */ + void *callback_data; /* general purpose pointer for callback */ + struct smb_hdr *resp_buf; /* response buffer */ + int midState; /* wish this were enum but can not pass to wait_event */ + __u8 command; /* smb command code */ + bool largeBuf:1; /* if valid response, is pointer to large buf */ + bool multiRsp:1; /* multiple trans2 responses for one request */ + bool multiEnd:1; /* both received */ +}; + +struct oplock_q_entry { + struct list_head qhead; + struct inode *pinode; + struct cifs_tcon *tcon; + __u16 netfid; +}; + +/* for pending dnotify requests */ +struct dir_notify_req { + struct list_head lhead; + __le16 Pid; + __le16 PidHigh; + __u16 Mid; + __u16 Tid; + __u16 Uid; + __u16 netfid; + __u32 filter; /* CompletionFilter (for multishot) */ + int multishot; + struct file *pfile; +}; + +struct dfs_info3_param { + int flags; /* DFSREF_REFERRAL_SERVER, DFSREF_STORAGE_SERVER*/ + int path_consumed; + int server_type; + int ref_flag; + char *path_name; + char *node_name; +}; + +/* + * common struct for holding inode info when searching for or updating an + * inode with new info + */ + +#define CIFS_FATTR_DFS_REFERRAL 0x1 +#define CIFS_FATTR_DELETE_PENDING 0x2 +#define CIFS_FATTR_NEED_REVAL 0x4 +#define CIFS_FATTR_INO_COLLISION 0x8 + +struct cifs_fattr { + u32 cf_flags; + u32 cf_cifsattrs; + u64 cf_uniqueid; + u64 cf_eof; + u64 cf_bytes; + u64 cf_createtime; + uid_t cf_uid; + gid_t cf_gid; + umode_t cf_mode; + dev_t cf_rdev; + unsigned int cf_nlink; + unsigned int cf_dtype; + struct timespec cf_atime; + struct timespec cf_mtime; + struct timespec cf_ctime; +}; + +static inline void free_dfs_info_param(struct dfs_info3_param *param) +{ + if (param) { + kfree(param->path_name); + kfree(param->node_name); + kfree(param); + } +} + +static inline void free_dfs_info_array(struct dfs_info3_param *param, + int number_of_items) +{ + int i; + if ((number_of_items == 0) || (param == NULL)) + return; + for (i = 0; i < number_of_items; i++) { + kfree(param[i].path_name); + kfree(param[i].node_name); + } + kfree(param); +} + +#define MID_FREE 0 +#define MID_REQUEST_ALLOCATED 1 +#define MID_REQUEST_SUBMITTED 2 +#define MID_RESPONSE_RECEIVED 4 +#define MID_RETRY_NEEDED 8 /* session closed while this request out */ +#define MID_RESPONSE_MALFORMED 0x10 +#define MID_SHUTDOWN 0x20 + +/* Types of response buffer returned from SendReceive2 */ +#define CIFS_NO_BUFFER 0 /* Response buffer not returned */ +#define CIFS_SMALL_BUFFER 1 +#define CIFS_LARGE_BUFFER 2 +#define CIFS_IOVEC 4 /* array of response buffers */ + +/* Type of Request to SendReceive2 */ +#define CIFS_BLOCKING_OP 1 /* operation can block */ +#define CIFS_ASYNC_OP 2 /* do not wait for response */ +#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */ +#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if 
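Since mid_callback_t is only described by the comment above, a minimal conforming callback is sketched here. example_mid_done and the use of a struct completion as callback_data are illustrative assumptions; real submitters wire up their own state.

static void example_mid_done(struct mid_q_entry *mid)
{
	struct completion *done = mid->callback_data;	/* set by the submitter */

	/* runs in cifsd context with no locks held and the mid already
	   unlinked from the pending list, so waking a waiter is safe */
	complete(done);
}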
non-zero */ +#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ +#define CIFS_NO_RESP 0x040 /* no response buffer required */ + +/* Security Flags: indicate type of session setup needed */ +#define CIFSSEC_MAY_SIGN 0x00001 +#define CIFSSEC_MAY_NTLM 0x00002 +#define CIFSSEC_MAY_NTLMV2 0x00004 +#define CIFSSEC_MAY_KRB5 0x00008 +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFSSEC_MAY_LANMAN 0x00010 +#define CIFSSEC_MAY_PLNTXT 0x00020 +#else +#define CIFSSEC_MAY_LANMAN 0 +#define CIFSSEC_MAY_PLNTXT 0 +#endif /* weak passwords */ +#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ +#define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */ + +#define CIFSSEC_MUST_SIGN 0x01001 +/* note that only one of the following can be set so the +result of setting MUST flags more than once will be to +require use of the stronger protocol */ +#define CIFSSEC_MUST_NTLM 0x02002 +#define CIFSSEC_MUST_NTLMV2 0x04004 +#define CIFSSEC_MUST_KRB5 0x08008 +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFSSEC_MUST_LANMAN 0x10010 +#define CIFSSEC_MUST_PLNTXT 0x20020 +#ifdef CONFIG_CIFS_UPCALL +#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */ +#else +#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */ +#endif /* UPCALL */ +#else /* do not allow weak pw hash */ +#ifdef CONFIG_CIFS_UPCALL +#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */ +#else +#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */ +#endif /* UPCALL */ +#endif /* WEAK_PW_HASH */ +#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ +#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ + +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) +#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) +#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) +/* + ***************************************************************** + * All constants go here + ***************************************************************** + */ + +#define UID_HASH (16) + +/* + * Note that ONE module should define _DECLARE_GLOBALS_HERE to cause the + * following to be declared. + */ + +/**************************************************************************** + * Locking notes. All updates to global variables and lists should be + * protected by spinlocks or semaphores. + * + * Spinlocks + * --------- + * GlobalMid_Lock protects: + * list operations on pending_mid_q and oplockQ + * updates to XID counters, multiplex id and SMB sequence numbers + * cifs_file_list_lock protects: + * list operations on tcp and SMB session lists and tCon lists + * f_owner.lock protects certain per file struct operations + * mapping->page_lock protects certain per page operations + * + * Semaphores + * ---------- + * sesSem operations on smb session + * tconSem operations on tree connection + * fh_sem file handle reconnection operations + * + ****************************************************************************/ + +#ifdef DECLARE_GLOBALS_HERE +#define GLOBAL_EXTERN +#else +#define GLOBAL_EXTERN extern +#endif + +/* + * the list of TCP_Server_Info structures, ie each of the sockets + * connecting our client to a distinct server (ip address), is + * chained together by cifs_tcp_ses_list. 
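Note that each MUST flag above embeds its matching MAY bit (CIFSSEC_MUST_NTLMV2 is 0x04004 and so contains CIFSSEC_MAY_NTLMV2, 0x00004), which is why stacking several MUST flags degenerates to requiring the strongest protocol. A hedged one-line illustration (example_ntlmv2_allowed is hypothetical):

static bool example_ntlmv2_allowed(unsigned int secflags)
{
	/* true both for CIFSSEC_MAY_NTLMV2 and for CIFSSEC_MUST_NTLMV2 */
	return (secflags & CIFSSEC_MAY_NTLMV2) != 0;
}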
The list of all our SMB + * sessions (and from that the tree connections) can be found + * by iterating over cifs_tcp_ses_list + */ +GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; + +/* + * This lock protects the cifs_tcp_ses_list, the list of smb sessions per + * tcp session, and the list of tcon's per smb session. It also protects + * the reference counters for the server, smb session, and tcon. Finally, + * changes to the tcon->tidStatus should be done while holding this lock. + */ +GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; + +/* + * This lock protects the cifs_file->llist and cifs_file->flist + * list operations, and updates to some flags (cifs_file->invalidHandle) + * It will be moved to either use the tcon->stat_lock or equivalent later. + * If cifs_tcp_ses_lock and the lock below are both needed to be held, then + * the cifs_tcp_ses_lock must be grabbed first and released last. + */ +GLOBAL_EXTERN spinlock_t cifs_file_list_lock; + +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ +/* Outstanding dir notify requests */ +GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; +/* DirNotify response queue */ +GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ + +/* + * Global transaction id (XID) information + */ +GLOBAL_EXTERN unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ +GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ +GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ + /* on midQ entries */ +/* + * Global counters, updated atomically + */ +GLOBAL_EXTERN atomic_t sesInfoAllocCount; +GLOBAL_EXTERN atomic_t tconInfoAllocCount; +GLOBAL_EXTERN atomic_t tcpSesAllocCount; +GLOBAL_EXTERN atomic_t tcpSesReconnectCount; +GLOBAL_EXTERN atomic_t tconInfoReconnectCount; + +/* Various Debug counters */ +GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ +#ifdef CONFIG_CIFS_STATS2 +GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ +GLOBAL_EXTERN atomic_t totSmBufAllocCount; +#endif +GLOBAL_EXTERN atomic_t smBufAllocCount; +GLOBAL_EXTERN atomic_t midCount; + +/* Misc globals */ +GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions + to be established on existing mount if we + have the uid/password or Kerberos credential + or equivalent for current user */ +GLOBAL_EXTERN unsigned int oplockEnabled; +GLOBAL_EXTERN unsigned int lookupCacheEnabled; +GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent + with more secure ntlmssp2 challenge/resp */ +GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ +GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ +GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ +GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ +GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ + +/* reconnect after this many failed echo attempts */ +GLOBAL_EXTERN unsigned short echo_retries; + +GLOBAL_EXTERN struct rb_root uidtree; +GLOBAL_EXTERN struct rb_root gidtree; +GLOBAL_EXTERN spinlock_t siduidlock; +GLOBAL_EXTERN spinlock_t sidgidlock; + +void cifs_oplock_break(struct work_struct *work); +void cifs_oplock_break_get(struct cifsFileInfo 
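The comment above fixes an ordering between the two global spinlocks; as a sketch of the documented order (outer cifs_tcp_ses_lock taken first, inner cifs_file_list_lock second, released in reverse), with the list walk itself elided:

static void example_walk_open_files(void)
{
	spin_lock(&cifs_tcp_ses_lock);		/* take first ... */
	spin_lock(&cifs_file_list_lock);
	/* ... walk server -> session -> tcon -> openFileList here ... */
	spin_unlock(&cifs_file_list_lock);
	spin_unlock(&cifs_tcp_ses_lock);	/* ... release last */
}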
*cfile); +void cifs_oplock_break_put(struct cifsFileInfo *cfile); + +extern const struct slow_work_ops cifs_oplock_break_ops; + +#endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h new file mode 100644 index 00000000..de3aa285 --- /dev/null +++ b/fs/cifs/cifspdu.h @@ -0,0 +1,2641 @@ +/* + * fs/cifs/cifspdu.h + * + * Copyright (c) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CIFSPDU_H +#define _CIFSPDU_H + +#include +#include +#include "smbfsctl.h" + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define LANMAN_PROT 0 +#define LANMAN2_PROT 1 +#define CIFS_PROT 2 +#else +#define CIFS_PROT 0 +#endif +#define POSIX_PROT (CIFS_PROT+1) +#define BAD_PROT 0xFFFF + +/* SMB command codes: + * Note some commands have minimal (wct=0,bcc=0), or uninteresting, responses + * (ie which include no useful data other than the SMB error code itself). + * This can allow us to avoid response buffer allocations and copy in some cases + */ +#define SMB_COM_CREATE_DIRECTORY 0x00 /* trivial response */ +#define SMB_COM_DELETE_DIRECTORY 0x01 /* trivial response */ +#define SMB_COM_CLOSE 0x04 /* triv req/rsp, timestamp ignored */ +#define SMB_COM_FLUSH 0x05 /* triv req/rsp */ +#define SMB_COM_DELETE 0x06 /* trivial response */ +#define SMB_COM_RENAME 0x07 /* trivial response */ +#define SMB_COM_QUERY_INFORMATION 0x08 /* aka getattr */ +#define SMB_COM_SETATTR 0x09 /* trivial response */ +#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ +#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ +#define SMB_COM_ECHO 0x2B /* echo request */ +#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ +#define SMB_COM_READ_ANDX 0x2E +#define SMB_COM_WRITE_ANDX 0x2F +#define SMB_COM_TRANSACTION2 0x32 +#define SMB_COM_TRANSACTION2_SECONDARY 0x33 +#define SMB_COM_FIND_CLOSE2 0x34 /* trivial response */ +#define SMB_COM_TREE_DISCONNECT 0x71 /* trivial response */ +#define SMB_COM_NEGOTIATE 0x72 +#define SMB_COM_SESSION_SETUP_ANDX 0x73 +#define SMB_COM_LOGOFF_ANDX 0x74 /* trivial response */ +#define SMB_COM_TREE_CONNECT_ANDX 0x75 +#define SMB_COM_NT_TRANSACT 0xA0 +#define SMB_COM_NT_TRANSACT_SECONDARY 0xA1 +#define SMB_COM_NT_CREATE_ANDX 0xA2 +#define SMB_COM_NT_CANCEL 0xA4 /* no response */ +#define SMB_COM_NT_RENAME 0xA5 /* trivial response */ + +/* Transact2 subcommand codes */ +#define TRANS2_OPEN 0x00 +#define TRANS2_FIND_FIRST 0x01 +#define TRANS2_FIND_NEXT 0x02 +#define TRANS2_QUERY_FS_INFORMATION 0x03 +#define TRANS2_SET_FS_INFORMATION 0x04 +#define TRANS2_QUERY_PATH_INFORMATION 0x05 +#define TRANS2_SET_PATH_INFORMATION 0x06 +#define TRANS2_QUERY_FILE_INFORMATION 0x07 +#define TRANS2_SET_FILE_INFORMATION 0x08 +#define TRANS2_GET_DFS_REFERRAL 0x10 +#define 
TRANS2_REPORT_DFS_INCOSISTENCY 0x11 + +/* SMB Transact (Named Pipe) subcommand codes */ +#define TRANS_SET_NMPIPE_STATE 0x0001 +#define TRANS_RAW_READ_NMPIPE 0x0011 +#define TRANS_QUERY_NMPIPE_STATE 0x0021 +#define TRANS_QUERY_NMPIPE_INFO 0x0022 +#define TRANS_PEEK_NMPIPE 0x0023 +#define TRANS_TRANSACT_NMPIPE 0x0026 +#define TRANS_RAW_WRITE_NMPIPE 0x0031 +#define TRANS_READ_NMPIPE 0x0036 +#define TRANS_WRITE_NMPIPE 0x0037 +#define TRANS_WAIT_NMPIPE 0x0053 +#define TRANS_CALL_NMPIPE 0x0054 + +/* NT Transact subcommand codes */ +#define NT_TRANSACT_CREATE 0x01 +#define NT_TRANSACT_IOCTL 0x02 +#define NT_TRANSACT_SET_SECURITY_DESC 0x03 +#define NT_TRANSACT_NOTIFY_CHANGE 0x04 +#define NT_TRANSACT_RENAME 0x05 +#define NT_TRANSACT_QUERY_SECURITY_DESC 0x06 +#define NT_TRANSACT_GET_USER_QUOTA 0x07 +#define NT_TRANSACT_SET_USER_QUOTA 0x08 + +#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ +/* future chained NTCreateXReadX bigger, but for time being NTCreateX biggest */ +/* among the requests (NTCreateX response is bigger with wct of 34) */ +#define MAX_CIFS_HDR_SIZE 0x58 /* 4 len + 32 hdr + (2*24 wct) + 2 bct + 2 pad */ +#define CIFS_SMALL_PATH 120 /* allows for (448-88)/3 */ + +/* internal cifs vfs structures */ +/***************************************************************** + * All constants go here + ***************************************************************** + */ + +/* + * Starting value for maximum SMB size negotiation + */ +#define CIFS_MAX_MSGSIZE (4*4096) + +/* + * Size of encrypted user password in bytes + */ +#define CIFS_ENCPWD_SIZE (16) + +/* + * Size of the crypto key returned on the negotiate SMB in bytes + */ +#define CIFS_CRYPTO_KEY_SIZE (8) + +/* + * Size of the ntlm client response + */ +#define CIFS_AUTH_RESP_SIZE (24) + +/* + * Size of the session key (crypto key encrypted with the password + */ +#define CIFS_SESS_KEY_SIZE (16) + +#define CIFS_CLIENT_CHALLENGE_SIZE (8) +#define CIFS_SERVER_CHALLENGE_SIZE (8) +#define CIFS_HMAC_MD5_HASH_SIZE (16) +#define CIFS_CPHTXT_SIZE (16) +#define CIFS_NTHASH_SIZE (16) + +/* + * Maximum user name length + */ +#define CIFS_UNLEN (20) + +/* + * Flags on SMB open + */ +#define SMBOPEN_WRITE_THROUGH 0x4000 +#define SMBOPEN_DENY_ALL 0x0010 +#define SMBOPEN_DENY_WRITE 0x0020 +#define SMBOPEN_DENY_READ 0x0030 +#define SMBOPEN_DENY_NONE 0x0040 +#define SMBOPEN_READ 0x0000 +#define SMBOPEN_WRITE 0x0001 +#define SMBOPEN_READWRITE 0x0002 +#define SMBOPEN_EXECUTE 0x0003 + +#define SMBOPEN_OCREATE 0x0010 +#define SMBOPEN_OTRUNC 0x0002 +#define SMBOPEN_OAPPEND 0x0001 + +/* + * SMB flag definitions + */ +#define SMBFLG_EXTD_LOCK 0x01 /* server supports lock-read write-unlock smb */ +#define SMBFLG_RCV_POSTED 0x02 /* obsolete */ +#define SMBFLG_RSVD 0x04 +#define SMBFLG_CASELESS 0x08 /* all pathnames treated as caseless (off + implies case sensitive file handling request) */ +#define SMBFLG_CANONICAL_PATH_FORMAT 0x10 /* obsolete */ +#define SMBFLG_OLD_OPLOCK 0x20 /* obsolete */ +#define SMBFLG_OLD_OPLOCK_NOTIFY 0x40 /* obsolete */ +#define SMBFLG_RESPONSE 0x80 /* this PDU is a response from server */ + +/* + * SMB flag2 definitions + */ +#define SMBFLG2_KNOWS_LONG_NAMES cpu_to_le16(1) /* can send long (non-8.3) + path names in response */ +#define SMBFLG2_KNOWS_EAS cpu_to_le16(2) +#define SMBFLG2_SECURITY_SIGNATURE cpu_to_le16(4) +#define SMBFLG2_COMPRESSED (8) +#define SMBFLG2_SECURITY_SIGNATURE_REQUIRED (0x10) +#define SMBFLG2_IS_LONG_NAME cpu_to_le16(0x40) +#define SMBFLG2_REPARSE_PATH (0x400) +#define SMBFLG2_EXT_SEC 
cpu_to_le16(0x800) +#define SMBFLG2_DFS cpu_to_le16(0x1000) +#define SMBFLG2_PAGING_IO cpu_to_le16(0x2000) +#define SMBFLG2_ERR_STATUS cpu_to_le16(0x4000) +#define SMBFLG2_UNICODE cpu_to_le16(0x8000) + +/* + * These are the file access permission bits defined in CIFS for the + * NTCreateAndX as well as the level 0x107 + * TRANS2_QUERY_PATH_INFORMATION API. The level 0x107, SMB_QUERY_FILE_ALL_INFO + * responds with the AccessFlags. + * The AccessFlags specifies the access permissions a caller has to the + * file and can have any suitable combination of the following values: + */ + +#define FILE_READ_DATA 0x00000001 /* Data can be read from the file */ +#define FILE_WRITE_DATA 0x00000002 /* Data can be written to the file */ +#define FILE_APPEND_DATA 0x00000004 /* Data can be appended to the file */ +#define FILE_READ_EA 0x00000008 /* Extended attributes associated */ + /* with the file can be read */ +#define FILE_WRITE_EA 0x00000010 /* Extended attributes associated */ + /* with the file can be written */ +#define FILE_EXECUTE 0x00000020 /*Data can be read into memory from */ + /* the file using system paging I/O */ +#define FILE_DELETE_CHILD 0x00000040 +#define FILE_READ_ATTRIBUTES 0x00000080 /* Attributes associated with the */ + /* file can be read */ +#define FILE_WRITE_ATTRIBUTES 0x00000100 /* Attributes associated with the */ + /* file can be written */ +#define DELETE 0x00010000 /* The file can be deleted */ +#define READ_CONTROL 0x00020000 /* The access control list and */ + /* ownership associated with the */ + /* file can be read */ +#define WRITE_DAC 0x00040000 /* The access control list and */ + /* ownership associated with the */ + /* file can be written. */ +#define WRITE_OWNER 0x00080000 /* Ownership information associated */ + /* with the file can be written */ +#define SYNCHRONIZE 0x00100000 /* The file handle can waited on to */ + /* synchronize with the completion */ + /* of an input/output request */ +#define GENERIC_ALL 0x10000000 +#define GENERIC_EXECUTE 0x20000000 +#define GENERIC_WRITE 0x40000000 +#define GENERIC_READ 0x80000000 + /* In summary - Relevant file */ + /* access flags from CIFS are */ + /* file_read_data, file_write_data */ + /* file_execute, file_read_attributes*/ + /* write_dac, and delete. 
*/ + +#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES) +#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES) +#define FILE_EXEC_RIGHTS (FILE_EXECUTE) + +#define SET_FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_WRITE_EA \ + | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) +#define SET_FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_READ_EA | FILE_WRITE_EA \ + | FILE_DELETE_CHILD | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) +#define SET_FILE_EXEC_RIGHTS (FILE_READ_EA | FILE_WRITE_EA | FILE_EXECUTE \ + | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) + +#define SET_MINIMUM_RIGHTS (FILE_READ_EA | FILE_READ_ATTRIBUTES \ + | READ_CONTROL | SYNCHRONIZE) + + +/* + * Invalid readdir handle + */ +#define CIFS_NO_HANDLE 0xFFFF + +#define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL +#define NO_CHANGE_32 0xFFFFFFFFUL + +/* IPC$ in ASCII */ +#define CIFS_IPC_RESOURCE "\x49\x50\x43\x24" + +/* IPC$ in Unicode */ +#define CIFS_IPC_UNICODE_RESOURCE "\x00\x49\x00\x50\x00\x43\x00\x24\x00\x00" + +/* Unicode Null terminate 2 bytes of 0 */ +#define UNICODE_NULL "\x00\x00" +#define ASCII_NULL 0x00 + +/* + * Server type values (returned on EnumServer API + */ +#define CIFS_SV_TYPE_DC 0x00000008 +#define CIFS_SV_TYPE_BACKDC 0x00000010 + +/* + * Alias type flags (From EnumAlias API call + */ +#define CIFS_ALIAS_TYPE_FILE 0x0001 +#define CIFS_SHARE_TYPE_FILE 0x0000 + +/* + * File Attribute flags + */ +#define ATTR_READONLY 0x0001 +#define ATTR_HIDDEN 0x0002 +#define ATTR_SYSTEM 0x0004 +#define ATTR_VOLUME 0x0008 +#define ATTR_DIRECTORY 0x0010 +#define ATTR_ARCHIVE 0x0020 +#define ATTR_DEVICE 0x0040 +#define ATTR_NORMAL 0x0080 +#define ATTR_TEMPORARY 0x0100 +#define ATTR_SPARSE 0x0200 +#define ATTR_REPARSE 0x0400 +#define ATTR_COMPRESSED 0x0800 +#define ATTR_OFFLINE 0x1000 /* ie file not immediately available - + on offline storage */ +#define ATTR_NOT_CONTENT_INDEXED 0x2000 +#define ATTR_ENCRYPTED 0x4000 +#define ATTR_POSIX_SEMANTICS 0x01000000 +#define ATTR_BACKUP_SEMANTICS 0x02000000 +#define ATTR_DELETE_ON_CLOSE 0x04000000 +#define ATTR_SEQUENTIAL_SCAN 0x08000000 +#define ATTR_RANDOM_ACCESS 0x10000000 +#define ATTR_NO_BUFFERING 0x20000000 +#define ATTR_WRITE_THROUGH 0x80000000 + +/* ShareAccess flags */ +#define FILE_NO_SHARE 0x00000000 +#define FILE_SHARE_READ 0x00000001 +#define FILE_SHARE_WRITE 0x00000002 +#define FILE_SHARE_DELETE 0x00000004 +#define FILE_SHARE_ALL 0x00000007 + +/* CreateDisposition flags, similar to CreateAction as well */ +#define FILE_SUPERSEDE 0x00000000 +#define FILE_OPEN 0x00000001 +#define FILE_CREATE 0x00000002 +#define FILE_OPEN_IF 0x00000003 +#define FILE_OVERWRITE 0x00000004 +#define FILE_OVERWRITE_IF 0x00000005 + +/* CreateOptions */ +#define CREATE_NOT_FILE 0x00000001 /* if set must not be file */ +#define CREATE_WRITE_THROUGH 0x00000002 +#define CREATE_SEQUENTIAL 0x00000004 +#define CREATE_NO_BUFFER 0x00000008 /* should not buffer on srv */ +#define CREATE_SYNC_ALERT 0x00000010 /* MBZ */ +#define CREATE_ASYNC_ALERT 0x00000020 /* MBZ */ +#define CREATE_NOT_DIR 0x00000040 /* if set must not be directory */ +#define CREATE_TREE_CONNECTION 0x00000080 /* should be zero */ +#define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ +#define 
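Because FILE_READ_RIGHTS/FILE_WRITE_RIGHTS are composites of the single access bits above, a caller testing the AccessFlags returned by SMB_QUERY_FILE_ALL_INFO would typically mask and compare, roughly as in this hedged sketch (example_may_write is not a real helper):

static bool example_may_write(__le32 access_flags)
{
	__u32 granted = le32_to_cpu(access_flags);

	return (granted & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS;
}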
CREATE_NO_EA_KNOWLEDGE 0x00000200 +#define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete + "open for recovery" flag should + be zero in any case */ +#define CREATE_OPEN_FOR_RECOVERY 0x00000400 +#define CREATE_RANDOM_ACCESS 0x00000800 +#define CREATE_DELETE_ON_CLOSE 0x00001000 +#define CREATE_OPEN_BY_ID 0x00002000 +#define CREATE_OPEN_BACKUP_INTENT 0x00004000 +#define CREATE_NO_COMPRESSION 0x00008000 +#define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */ +#define OPEN_REPARSE_POINT 0x00200000 +#define OPEN_NO_RECALL 0x00400000 +#define OPEN_FREE_SPACE_QUERY 0x00800000 /* should be zero */ +#define CREATE_OPTIONS_MASK 0x007FFFFF +#define CREATE_OPTION_READONLY 0x10000000 +#define CREATE_OPTION_SPECIAL 0x20000000 /* system. NB not sent over wire */ + +/* ImpersonationLevel flags */ +#define SECURITY_ANONYMOUS 0 +#define SECURITY_IDENTIFICATION 1 +#define SECURITY_IMPERSONATION 2 +#define SECURITY_DELEGATION 3 + +/* SecurityFlags */ +#define SECURITY_CONTEXT_TRACKING 0x01 +#define SECURITY_EFFECTIVE_ONLY 0x02 + +/* + * Default PID value, used in all SMBs where the PID is not important + */ +#define CIFS_DFT_PID 0x1234 + +/* + * We use the same routine for Copy and Move SMBs. This flag is used to + * distinguish + */ +#define CIFS_COPY_OP 1 +#define CIFS_RENAME_OP 2 + +#define GETU16(var) (*((__u16 *)var)) /* BB check for endian issues */ +#define GETU32(var) (*((__u32 *)var)) /* BB check for endian issues */ + +struct smb_hdr { + __be32 smb_buf_length; /* BB length is only two (rarely three) bytes, + with one or two byte "type" preceding it that will be + zero - we could mask the type byte off */ + __u8 Protocol[4]; + __u8 Command; + union { + struct { + __u8 ErrorClass; + __u8 Reserved; + __le16 Error; + } __attribute__((packed)) DosError; + __le32 CifsError; + } __attribute__((packed)) Status; + __u8 Flags; + __le16 Flags2; /* note: le */ + __le16 PidHigh; + union { + struct { + __le32 SequenceNumber; /* le */ + __u32 Reserved; /* zero */ + } __attribute__((packed)) Sequence; + __u8 SecuritySignature[8]; /* le */ + } __attribute__((packed)) Signature; + __u8 pad[2]; + __u16 Tid; + __le16 Pid; + __u16 Uid; + __u16 Mid; + __u8 WordCount; +} __attribute__((packed)); + +/* given a pointer to an smb_hdr, retrieve a void pointer to the ByteCount */ +static inline void * +BCC(struct smb_hdr *smb) +{ + return (void *)smb + sizeof(*smb) + 2 * smb->WordCount; +} + +/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ +#define pByteArea(smb_var) (BCC(smb_var) + 2) + +/* get the unconverted ByteCount for a SMB packet and return it */ +static inline __u16 +get_bcc(struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + return get_unaligned_le16(bc_ptr); +} + +/* set the ByteCount for a SMB packet in little-endian */ +static inline void +put_bcc(__u16 count, struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + put_unaligned_le16(count, bc_ptr); +} + +/* + * Computer Name Length (since Netbios name was length 16 with last byte 0x20) + * No longer as important, now that TCP names are more commonly used to + * resolve hosts. + */ +#define CNLEN 15 + +/* + * Share Name Length (SNLEN) + * Note: This length was limited by the SMB used to get + * the Share info. NetShareEnum only returned 13 + * chars, including the null termination. + * This was removed because it no longer is limiting. 
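The BCC()/pByteArea()/get_bcc()/put_bcc() helpers above all locate the byte-count word that sits after the variable-length word area of an SMB. A hedged sketch of stamping the count and finding the data area (example_finish_smb is hypothetical):

static char *example_finish_smb(struct smb_hdr *hdr, __u16 byte_count)
{
	put_bcc(byte_count, hdr);		/* stored little-endian, unaligned-safe */
	WARN_ON(get_bcc(hdr) != byte_count);
	return pByteArea(hdr);			/* data bytes begin right after bcc */
}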
+ */ + +/* + * Comment Length + */ +#define MAXCOMMENTLEN 40 + +/* + * The OS/2 maximum path name + */ +#define MAX_PATHCONF 256 + +/* + * SMB frame definitions (following must be packed structs) + * See the SNIA CIFS Specification for details. + * + * The Naming convention is the lower case version of the + * smb command code name for the struct and this is typedef to the + * uppercase version of the same name with the prefix SMB_ removed + * for brevity. Although typedefs are not commonly used for + * structure definitions in the Linux kernel, their use in the + * CIFS standards document, which this code is based on, may + * make this one of the cases where typedefs for structures make + * sense to improve readability for readers of the standards doc. + * Typedefs can always be removed later if they are too distracting + * and they are only used for the CIFSs PDUs themselves, not + * internal cifs vfs structures + * + */ + +typedef struct negotiate_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + unsigned char DialectsArray[1]; +} __attribute__((packed)) NEGOTIATE_REQ; + +/* Dialect index is 13 for LANMAN */ + +#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */ + +typedef struct lanman_neg_rsp { + struct smb_hdr hdr; /* wct = 13 */ + __le16 DialectIndex; + __le16 SecurityMode; + __le16 MaxBufSize; + __le16 MaxMpxCount; + __le16 MaxNumberVcs; + __le16 RawMode; + __le32 SessionKey; + struct { + __le16 Time; + __le16 Date; + } __attribute__((packed)) SrvTime; + __le16 ServerTimeZone; + __le16 EncryptionKeyLength; + __le16 Reserved; + __u16 ByteCount; + unsigned char EncryptionKey[1]; +} __attribute__((packed)) LANMAN_NEG_RSP; + +#define READ_RAW_ENABLE 1 +#define WRITE_RAW_ENABLE 2 +#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE) + +typedef struct negotiate_rsp { + struct smb_hdr hdr; /* wct = 17 */ + __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */ + __u8 SecurityMode; + __le16 MaxMpxCount; + __le16 MaxNumberVcs; + __le32 MaxBufferSize; + __le32 MaxRawSize; + __le32 SessionKey; + __le32 Capabilities; /* see below */ + __le32 SystemTimeLow; + __le32 SystemTimeHigh; + __le16 ServerTimeZone; + __u8 EncryptionKeyLength; + __u16 ByteCount; + union { + unsigned char EncryptionKey[1]; /* cap extended security off */ + /* followed by Domain name - if extended security is off */ + /* followed by 16 bytes of server GUID */ + /* then security blob if cap_extended_security negotiated */ + struct { + unsigned char GUID[16]; + unsigned char SecurityBlob[1]; + } __attribute__((packed)) extended_response; + } __attribute__((packed)) u; +} __attribute__((packed)) NEGOTIATE_RSP; + +/* SecurityMode bits */ +#define SECMODE_USER 0x01 /* off indicates share level security */ +#define SECMODE_PW_ENCRYPT 0x02 +#define SECMODE_SIGN_ENABLED 0x04 /* SMB security signatures enabled */ +#define SECMODE_SIGN_REQUIRED 0x08 /* SMB security signatures required */ + +/* Negotiate response Capabilities */ +#define CAP_RAW_MODE 0x00000001 +#define CAP_MPX_MODE 0x00000002 +#define CAP_UNICODE 0x00000004 +#define CAP_LARGE_FILES 0x00000008 +#define CAP_NT_SMBS 0x00000010 /* implies CAP_NT_FIND */ +#define CAP_RPC_REMOTE_APIS 0x00000020 +#define CAP_STATUS32 0x00000040 +#define CAP_LEVEL_II_OPLOCKS 0x00000080 +#define CAP_LOCK_AND_READ 0x00000100 +#define CAP_NT_FIND 0x00000200 +#define CAP_DFS 0x00001000 +#define CAP_INFOLEVEL_PASSTHRU 0x00002000 +#define CAP_LARGE_READ_X 0x00004000 +#define CAP_LARGE_WRITE_X 0x00008000 +#define CAP_LWIO 0x00010000 /* support 
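SecurityMode comes back as a single byte in the wct=17 negotiate response, so the SECMODE_* bits above are tested without any endian conversion; a brief hedged sketch (example_signing_required is illustrative only):

static bool example_signing_required(const NEGOTIATE_RSP *rsp)
{
	return (rsp->SecurityMode & SECMODE_SIGN_REQUIRED) != 0;
}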
fctl_srv_req_resume_key */ +#define CAP_UNIX 0x00800000 +#define CAP_COMPRESSED_DATA 0x02000000 +#define CAP_DYNAMIC_REAUTH 0x20000000 +#define CAP_PERSISTENT_HANDLES 0x40000000 +#define CAP_EXTENDED_SECURITY 0x80000000 + +typedef union smb_com_session_setup_andx { + struct { /* request format */ + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 MaxBufferSize; + __le16 MaxMpxCount; + __le16 VcNumber; + __u32 SessionKey; + __le16 SecurityBlobLength; + __u32 Reserved; + __le32 Capabilities; /* see below */ + __le16 ByteCount; + unsigned char SecurityBlob[1]; /* followed by */ + /* STRING NativeOS */ + /* STRING NativeLanMan */ + } __attribute__((packed)) req; /* NTLM request format (with + extended security */ + + struct { /* request format */ + struct smb_hdr hdr; /* wct = 13 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 MaxBufferSize; + __le16 MaxMpxCount; + __le16 VcNumber; + __u32 SessionKey; + __le16 CaseInsensitivePasswordLength; /* ASCII password len */ + __le16 CaseSensitivePasswordLength; /* Unicode password length*/ + __u32 Reserved; /* see below */ + __le32 Capabilities; + __le16 ByteCount; + unsigned char CaseInsensitivePassword[1]; /* followed by: */ + /* unsigned char * CaseSensitivePassword; */ + /* STRING AccountName */ + /* STRING PrimaryDomain */ + /* STRING NativeOS */ + /* STRING NativeLanMan */ + } __attribute__((packed)) req_no_secext; /* NTLM request format (without + extended security */ + + struct { /* default (NTLM) response format */ + struct smb_hdr hdr; /* wct = 4 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Action; /* see below */ + __le16 SecurityBlobLength; + __u16 ByteCount; + unsigned char SecurityBlob[1]; /* followed by */ +/* unsigned char * NativeOS; */ +/* unsigned char * NativeLanMan; */ +/* unsigned char * PrimaryDomain; */ + } __attribute__((packed)) resp; /* NTLM response + (with or without extended sec) */ + + struct { /* request format */ + struct smb_hdr hdr; /* wct = 10 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 MaxBufferSize; + __le16 MaxMpxCount; + __le16 VcNumber; + __u32 SessionKey; + __le16 PasswordLength; + __u32 Reserved; /* encrypt key len and offset */ + __le16 ByteCount; + unsigned char AccountPassword[1]; /* followed by */ + /* STRING AccountName */ + /* STRING PrimaryDomain */ + /* STRING NativeOS */ + /* STRING NativeLanMan */ + } __attribute__((packed)) old_req; /* pre-NTLM (LANMAN2.1) req format */ + + struct { /* default (NTLM) response format */ + struct smb_hdr hdr; /* wct = 3 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Action; /* see below */ + __u16 ByteCount; + unsigned char NativeOS[1]; /* followed by */ +/* unsigned char * NativeLanMan; */ +/* unsigned char * PrimaryDomain; */ + } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */ +} __attribute__((packed)) SESSION_SETUP_ANDX; + +/* format of NLTMv2 Response ie "case sensitive password" hash when NTLMv2 */ + +#define NTLMSSP_SERVER_TYPE 1 +#define NTLMSSP_DOMAIN_TYPE 2 +#define NTLMSSP_FQ_DOMAIN_TYPE 3 +#define NTLMSSP_DNS_DOMAIN_TYPE 4 +#define NTLMSSP_DNS_PARENT_TYPE 5 + +struct ntlmssp2_name { + __le16 type; + __le16 length; +/* char name[length]; */ +} __attribute__((packed)); + +struct ntlmv2_resp { + char ntlmv2_hash[CIFS_ENCPWD_SIZE]; + __le32 blob_signature; + __u32 reserved; + __le64 time; + __u64 client_chal; /* random */ + __u32 reserved2; + /* array of name entries could 
follow ending in minimum 4 byte struct */ +} __attribute__((packed)); + + +#define CIFS_NETWORK_OPSYS "CIFS VFS Client for Linux" + +/* Capabilities bits (for NTLM SessSetup request) */ +#define CAP_UNICODE 0x00000004 +#define CAP_LARGE_FILES 0x00000008 +#define CAP_NT_SMBS 0x00000010 +#define CAP_STATUS32 0x00000040 +#define CAP_LEVEL_II_OPLOCKS 0x00000080 +#define CAP_NT_FIND 0x00000200 /* reserved should be zero + (because NT_SMBs implies the same thing?) */ +#define CAP_BULK_TRANSFER 0x20000000 +#define CAP_EXTENDED_SECURITY 0x80000000 + +/* Action bits */ +#define GUEST_LOGIN 1 + +typedef struct smb_com_tconx_req { + struct smb_hdr hdr; /* wct = 4 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Flags; /* see below */ + __le16 PasswordLength; + __le16 ByteCount; + unsigned char Password[1]; /* followed by */ +/* STRING Path *//* \\server\share name */ + /* STRING Service */ +} __attribute__((packed)) TCONX_REQ; + +typedef struct smb_com_tconx_rsp { + struct smb_hdr hdr; /* wct = 3 , not extended response */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OptionalSupport; /* see below */ + __u16 ByteCount; + unsigned char Service[1]; /* always ASCII, not Unicode */ + /* STRING NativeFileSystem */ +} __attribute__((packed)) TCONX_RSP; + +typedef struct smb_com_tconx_rsp_ext { + struct smb_hdr hdr; /* wct = 7, extended response */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OptionalSupport; /* see below */ + __le32 MaximalShareAccessRights; + __le32 GuestMaximalShareAccessRights; + __u16 ByteCount; + unsigned char Service[1]; /* always ASCII, not Unicode */ + /* STRING NativeFileSystem */ +} __attribute__((packed)) TCONX_RSP_EXT; + + +/* tree connect Flags */ +#define DISCONNECT_TID 0x0001 +#define TCON_EXTENDED_SIGNATURES 0x0004 +#define TCON_EXTENDED_SECINFO 0x0008 + +/* OptionalSupport bits */ +#define SMB_SUPPORT_SEARCH_BITS 0x0001 /* "must have" directory search bits + (exclusive searches supported) */ +#define SMB_SHARE_IS_IN_DFS 0x0002 +#define SMB_CSC_MASK 0x000C +/* CSC flags defined as follows */ +#define SMB_CSC_CACHE_MANUAL_REINT 0x0000 +#define SMB_CSC_CACHE_AUTO_REINT 0x0004 +#define SMB_CSC_CACHE_VDO 0x0008 +#define SMB_CSC_NO_CACHING 0x000C +#define SMB_UNIQUE_FILE_NAME 0x0010 +#define SMB_EXTENDED_SIGNATURES 0x0020 + +/* services + * + * A: ie disk + * LPT1: ie printer + * IPC ie named pipe + * COMM + * ????? 
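OptionalSupport in the tree connect response carries the offline-caching policy in the SMB_CSC_MASK bits; a hedged decode sketch follows (example_csc_policy is a made-up helper):

static __u16 example_csc_policy(const TCONX_RSP *rsp)
{
	/* one of SMB_CSC_CACHE_MANUAL_REINT, _CACHE_AUTO_REINT,
	   _CACHE_VDO or SMB_CSC_NO_CACHING */
	return le16_to_cpu(rsp->OptionalSupport) & SMB_CSC_MASK;
}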
ie any type + * + */ + +typedef struct smb_com_echo_req { + struct smb_hdr hdr; + __le16 EchoCount; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_REQ; + +typedef struct smb_com_echo_rsp { + struct smb_hdr hdr; + __le16 SequenceNumber; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_RSP; + +typedef struct smb_com_logoff_andx_req { + struct smb_hdr hdr; /* wct = 2 */ + __u8 AndXCommand; + __u8 AndXReserved; + __u16 AndXOffset; + __u16 ByteCount; +} __attribute__((packed)) LOGOFF_ANDX_REQ; + +typedef struct smb_com_logoff_andx_rsp { + struct smb_hdr hdr; /* wct = 2 */ + __u8 AndXCommand; + __u8 AndXReserved; + __u16 AndXOffset; + __u16 ByteCount; +} __attribute__((packed)) LOGOFF_ANDX_RSP; + +typedef union smb_com_tree_disconnect { /* as an altetnative can use flag on + tree_connect PDU to effect disconnect */ + /* tdis is probably simplest SMB PDU */ + struct { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bcc = 0 */ + } __attribute__((packed)) req; + struct { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bcc = 0 */ + } __attribute__((packed)) resp; +} __attribute__((packed)) TREE_DISCONNECT; + +typedef struct smb_com_close_req { + struct smb_hdr hdr; /* wct = 3 */ + __u16 FileID; + __u32 LastWriteTime; /* should be zero or -1 */ + __u16 ByteCount; /* 0 */ +} __attribute__((packed)) CLOSE_REQ; + +typedef struct smb_com_close_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) CLOSE_RSP; + +typedef struct smb_com_flush_req { + struct smb_hdr hdr; /* wct = 1 */ + __u16 FileID; + __u16 ByteCount; /* 0 */ +} __attribute__((packed)) FLUSH_REQ; + +typedef struct smb_com_findclose_req { + struct smb_hdr hdr; /* wct = 1 */ + __u16 FileID; + __u16 ByteCount; /* 0 */ +} __attribute__((packed)) FINDCLOSE_REQ; + +/* OpenFlags */ +#define REQ_MORE_INFO 0x00000001 /* legacy (OPEN_AND_X) only */ +#define REQ_OPLOCK 0x00000002 +#define REQ_BATCHOPLOCK 0x00000004 +#define REQ_OPENDIRONLY 0x00000008 +#define REQ_EXTENDED_INFO 0x00000010 + +/* File type */ +#define DISK_TYPE 0x0000 +#define BYTE_PIPE_TYPE 0x0001 +#define MESSAGE_PIPE_TYPE 0x0002 +#define PRINTER_TYPE 0x0003 +#define COMM_DEV_TYPE 0x0004 +#define UNKNOWN_TYPE 0xFFFF + +/* Device Type or File Status Flags */ +#define NO_EAS 0x0001 +#define NO_SUBSTREAMS 0x0002 +#define NO_REPARSETAG 0x0004 +/* following flags can apply if pipe */ +#define ICOUNT_MASK 0x00FF +#define PIPE_READ_MODE 0x0100 +#define NAMED_PIPE_TYPE 0x0400 +#define PIPE_END_POINT 0x4000 +#define BLOCKING_NAMED_PIPE 0x8000 + +typedef struct smb_com_open_req { /* also handles create */ + struct smb_hdr hdr; /* wct = 24 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 Reserved; /* Must Be Zero */ + __le16 NameLength; + __le32 OpenFlags; + __u32 RootDirectoryFid; + __le32 DesiredAccess; + __le64 AllocationSize; + __le32 FileAttributes; + __le32 ShareAccess; + __le32 CreateDisposition; + __le32 CreateOptions; + __le32 ImpersonationLevel; + __u8 SecurityFlags; + __le16 ByteCount; + char fileName[1]; +} __attribute__((packed)) OPEN_REQ; + +/* open response: oplock levels */ +#define OPLOCK_NONE 0 +#define OPLOCK_EXCLUSIVE 1 +#define OPLOCK_BATCH 2 +#define OPLOCK_READ 3 /* level 2 oplock */ + +/* open response for CreateAction shifted left */ +#define CIFS_CREATE_ACTION 0x20000 /* file created */ + +typedef struct smb_com_open_rsp { + struct smb_hdr hdr; /* wct = 34 BB */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 OplockLevel; 
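The OplockLevel returned in the open response ultimately drives the clientCanCacheRead/clientCanCacheAll booleans kept in cifsInodeInfo earlier in this patch. The mapping below is only a hedged sketch of the usual oplock semantics, not the patch's own code (example_set_oplock is hypothetical): batch and exclusive oplocks permit read and write-behind caching, a level II oplock permits read caching only.

static void example_set_oplock(struct cifsInodeInfo *cinode, __u8 oplock)
{
	cinode->clientCanCacheAll = (oplock == OPLOCK_BATCH ||
				     oplock == OPLOCK_EXCLUSIVE);
	cinode->clientCanCacheRead = cinode->clientCanCacheAll ||
				     (oplock == OPLOCK_READ);
}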
+ __u16 Fid; + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 FileAttributes; + __le64 AllocationSize; + __le64 EndOfFile; + __le16 FileType; + __le16 DeviceState; + __u8 DirectoryFlag; + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) OPEN_RSP; + +typedef struct smb_com_open_rsp_ext { + struct smb_hdr hdr; /* wct = 42 but meaningless due to MS bug? */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 OplockLevel; + __u16 Fid; + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 FileAttributes; + __le64 AllocationSize; + __le64 EndOfFile; + __le16 FileType; + __le16 DeviceState; + __u8 DirectoryFlag; + __u8 VolumeGUID[16]; + __u64 FileId; /* note no endian conversion - is opaque UniqueID */ + __le32 MaximalAccessRights; + __le32 GuestMaximalAccessRights; + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) OPEN_RSP_EXT; + + +/* format of legacy open request */ +typedef struct smb_com_openx_req { + struct smb_hdr hdr; /* wct = 15 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OpenFlags; + __le16 Mode; + __le16 Sattr; /* search attributes */ + __le16 FileAttributes; /* dos attrs */ + __le32 CreateTime; /* os2 format */ + __le16 OpenFunction; + __le32 EndOfFile; + __le32 Timeout; + __le32 Reserved; + __le16 ByteCount; /* file name follows */ + char fileName[1]; +} __attribute__((packed)) OPENX_REQ; + +typedef struct smb_com_openx_rsp { + struct smb_hdr hdr; /* wct = 15 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le16 FileAttributes; + __le32 LastWriteTime; /* os2 format */ + __le32 EndOfFile; + __le16 Access; + __le16 FileType; + __le16 IPCState; + __le16 Action; + __u32 FileId; + __u16 Reserved; + __u16 ByteCount; +} __attribute__((packed)) OPENX_RSP; + +/* For encoding of POSIX Open Request - see trans2 function 0x209 data struct */ + +/* Legacy write request for older servers */ +typedef struct smb_com_writex_req { + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __u32 Reserved; /* Timeout */ + __le16 WriteMode; /* 1 = write through */ + __le16 Remaining; + __le16 Reserved2; + __le16 DataLengthLow; + __le16 DataOffset; + __le16 ByteCount; + __u8 Pad; /* BB check for whether padded to DWORD + boundary and optimum performance here */ + char Data[0]; +} __attribute__((packed)) WRITEX_REQ; + +typedef struct smb_com_write_req { + struct smb_hdr hdr; /* wct = 14 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __u32 Reserved; + __le16 WriteMode; + __le16 Remaining; + __le16 DataLengthHigh; + __le16 DataLengthLow; + __le16 DataOffset; + __le32 OffsetHigh; + __le16 ByteCount; + __u8 Pad; /* BB check for whether padded to DWORD + boundary and optimum performance here */ + char Data[0]; +} __attribute__((packed)) WRITE_REQ; + +typedef struct smb_com_write_rsp { + struct smb_hdr hdr; /* wct = 6 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Count; + __le16 Remaining; + __le16 CountHigh; + __u16 Reserved; + __u16 ByteCount; +} __attribute__((packed)) WRITE_RSP; + +/* legacy read request for older servers */ +typedef struct smb_com_readx_req { + struct smb_hdr hdr; /* wct = 10 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __le16 MaxCount; + 
__le16 MinCount; /* obsolete */ + __le32 Reserved; + __le16 Remaining; + __le16 ByteCount; +} __attribute__((packed)) READX_REQ; + +typedef struct smb_com_read_req { + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __le32 OffsetLow; + __le16 MaxCount; + __le16 MinCount; /* obsolete */ + __le32 MaxCountHigh; + __le16 Remaining; + __le32 OffsetHigh; + __le16 ByteCount; +} __attribute__((packed)) READ_REQ; + +typedef struct smb_com_read_rsp { + struct smb_hdr hdr; /* wct = 12 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 Remaining; + __le16 DataCompactionMode; + __le16 Reserved; + __le16 DataLength; + __le16 DataOffset; + __le16 DataLengthHigh; + __u64 Reserved2; + __u16 ByteCount; + __u8 Pad; /* BB check for whether padded to DWORD + boundary and optimum performance here */ + char Data[1]; +} __attribute__((packed)) READ_RSP; + +typedef struct locking_andx_range { + __le16 Pid; + __le16 Pad; + __le32 OffsetHigh; + __le32 OffsetLow; + __le32 LengthHigh; + __le32 LengthLow; +} __attribute__((packed)) LOCKING_ANDX_RANGE; + +#define LOCKING_ANDX_SHARED_LOCK 0x01 +#define LOCKING_ANDX_OPLOCK_RELEASE 0x02 +#define LOCKING_ANDX_CHANGE_LOCKTYPE 0x04 +#define LOCKING_ANDX_CANCEL_LOCK 0x08 +#define LOCKING_ANDX_LARGE_FILES 0x10 /* always on for us */ + +typedef struct smb_com_lock_req { + struct smb_hdr hdr; /* wct = 8 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 Fid; + __u8 LockType; + __u8 OplockLevel; + __le32 Timeout; + __le16 NumberOfUnlocks; + __le16 NumberOfLocks; + __le16 ByteCount; + LOCKING_ANDX_RANGE Locks[1]; +} __attribute__((packed)) LOCK_REQ; + +/* lock type */ +#define CIFS_RDLCK 0 +#define CIFS_WRLCK 1 +#define CIFS_UNLCK 2 +typedef struct cifs_posix_lock { + __le16 lock_type; /* 0 = Read, 1 = Write, 2 = Unlock */ + __le16 lock_flags; /* 1 = Wait (only valid for setlock) */ + __le32 pid; + __le64 start; + __le64 length; + /* BB what about additional owner info to identify network client */ +} __attribute__((packed)) CIFS_POSIX_LOCK; + +typedef struct smb_com_lock_rsp { + struct smb_hdr hdr; /* wct = 2 */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u16 ByteCount; +} __attribute__((packed)) LOCK_RSP; + +typedef struct smb_com_rename_req { + struct smb_hdr hdr; /* wct = 1 */ + __le16 SearchAttributes; /* target file attributes */ + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII or Unicode */ + unsigned char OldFileName[1]; + /* followed by __u8 BufferFormat2 */ + /* followed by NewFileName */ +} __attribute__((packed)) RENAME_REQ; + + /* copy request flags */ +#define COPY_MUST_BE_FILE 0x0001 +#define COPY_MUST_BE_DIR 0x0002 +#define COPY_TARGET_MODE_ASCII 0x0004 /* if not set, binary */ +#define COPY_SOURCE_MODE_ASCII 0x0008 /* if not set, binary */ +#define COPY_VERIFY_WRITES 0x0010 +#define COPY_TREE 0x0020 + +typedef struct smb_com_copy_req { + struct smb_hdr hdr; /* wct = 3 */ + __u16 Tid2; + __le16 OpenFunction; + __le16 Flags; + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII or Unicode */ + unsigned char OldFileName[1]; + /* followed by __u8 BufferFormat2 */ + /* followed by NewFileName string */ +} __attribute__((packed)) COPY_REQ; + +typedef struct smb_com_copy_rsp { + struct smb_hdr hdr; /* wct = 1 */ + __le16 CopyCount; /* number of files copied */ + __u16 ByteCount; /* may be zero */ + __u8 BufferFormat; /* 0x04 - only present if errored file follows */ + unsigned char ErrorFileName[1]; /* only present if error in copy */ +} 
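Byte-range locks travel as 64-bit offsets split across the high/low halves of LOCKING_ANDX_RANGE (the LOCKING_ANDX_LARGE_FILES form). A hedged encoding sketch, with example_fill_range as a hypothetical helper:

static void example_fill_range(LOCKING_ANDX_RANGE *r, __u16 pid,
			       __u64 offset, __u64 length)
{
	r->Pid        = cpu_to_le16(pid);
	r->Pad        = 0;
	r->OffsetHigh = cpu_to_le32((__u32)(offset >> 32));
	r->OffsetLow  = cpu_to_le32((__u32)offset);
	r->LengthHigh = cpu_to_le32((__u32)(length >> 32));
	r->LengthLow  = cpu_to_le32((__u32)length);
}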
__attribute__((packed)) COPY_RSP; + +#define CREATE_HARD_LINK 0x103 +#define MOVEFILE_COPY_ALLOWED 0x0002 +#define MOVEFILE_REPLACE_EXISTING 0x0001 + +typedef struct smb_com_nt_rename_req { /* A5 - also used for create hardlink */ + struct smb_hdr hdr; /* wct = 4 */ + __le16 SearchAttributes; /* target file attributes */ + __le16 Flags; /* spec says Information Level */ + __le32 ClusterCount; + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII or Unicode */ + unsigned char OldFileName[1]; + /* followed by __u8 BufferFormat2 */ + /* followed by NewFileName */ +} __attribute__((packed)) NT_RENAME_REQ; + +typedef struct smb_com_rename_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) RENAME_RSP; + +typedef struct smb_com_delete_file_req { + struct smb_hdr hdr; /* wct = 1 */ + __le16 SearchAttributes; + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char fileName[1]; +} __attribute__((packed)) DELETE_FILE_REQ; + +typedef struct smb_com_delete_file_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) DELETE_FILE_RSP; + +typedef struct smb_com_delete_directory_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char DirName[1]; +} __attribute__((packed)) DELETE_DIRECTORY_REQ; + +typedef struct smb_com_delete_directory_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) DELETE_DIRECTORY_RSP; + +typedef struct smb_com_create_directory_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char DirName[1]; +} __attribute__((packed)) CREATE_DIRECTORY_REQ; + +typedef struct smb_com_create_directory_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) CREATE_DIRECTORY_RSP; + +typedef struct smb_com_query_information_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; /* 1 + namelen + 1 */ + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char FileName[1]; +} __attribute__((packed)) QUERY_INFORMATION_REQ; + +typedef struct smb_com_query_information_rsp { + struct smb_hdr hdr; /* wct = 10 */ + __le16 attr; + __le32 last_write_time; + __le32 size; + __u16 reserved[5]; + __le16 ByteCount; /* bcc = 0 */ +} __attribute__((packed)) QUERY_INFORMATION_RSP; + +typedef struct smb_com_setattr_req { + struct smb_hdr hdr; /* wct = 8 */ + __le16 attr; + __le16 time_low; + __le16 time_high; + __le16 reserved[5]; /* must be zero */ + __u16 ByteCount; + __u8 BufferFormat; /* 4 = ASCII */ + unsigned char fileName[1]; +} __attribute__((packed)) SETATTR_REQ; + +typedef struct smb_com_setattr_rsp { + struct smb_hdr hdr; /* wct = 0 */ + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) SETATTR_RSP; + +/* empty wct response to setattr */ + +/*******************************************************/ +/* NT Transact structure definitions follow */ +/* Currently only ioctl, acl (get security descriptor) */ +/* and notify are implemented */ +/*******************************************************/ +typedef struct smb_com_ntransact_req { + struct smb_hdr hdr; /* wct >= 19 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec 
incorrectly included spurious pad here */ + __le16 SubCommand; /* 2 = IOCTL/FSCTL */ + /* SetupCount words follow then */ + __le16 ByteCount; + __u8 Pad[3]; + __u8 Parms[0]; +} __attribute__((packed)) NTRANSACT_REQ; + +typedef struct smb_com_ntransact_rsp { + struct smb_hdr hdr; /* wct = 18 */ + __u8 Reserved[3]; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 ParameterDisplacement; + __le32 DataCount; + __le32 DataOffset; + __le32 DataDisplacement; + __u8 SetupCount; /* 0 */ + __u16 ByteCount; + /* __u8 Pad[3]; */ + /* parms and data follow */ +} __attribute__((packed)) NTRANSACT_RSP; + +typedef struct smb_com_transaction_ioctl_req { + struct smb_hdr hdr; /* wct = 23 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 2 = IOCTL/FSCTL */ + __le32 FunctionCode; + __u16 Fid; + __u8 IsFsctl; /* 1 = File System Control 0 = device control (IOCTL) */ + __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */ + __le16 ByteCount; + __u8 Pad[3]; + __u8 Data[1]; +} __attribute__((packed)) TRANSACT_IOCTL_REQ; + +typedef struct smb_com_transaction_ioctl_rsp { + struct smb_hdr hdr; /* wct = 19 */ + __u8 Reserved[3]; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 ParameterDisplacement; + __le32 DataCount; + __le32 DataOffset; + __le32 DataDisplacement; + __u8 SetupCount; /* 1 */ + __le16 ReturnedDataLen; + __u16 ByteCount; +} __attribute__((packed)) TRANSACT_IOCTL_RSP; + +#define CIFS_ACL_OWNER 1 +#define CIFS_ACL_GROUP 2 +#define CIFS_ACL_DACL 4 +#define CIFS_ACL_SACL 8 + +typedef struct smb_com_transaction_qsec_req { + struct smb_hdr hdr; /* wct = 19 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* no setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 6 = QUERY_SECURITY_DESC */ + __le16 ByteCount; /* bcc = 3 + 8 */ + __u8 Pad[3]; + __u16 Fid; + __u16 Reserved2; + __le32 AclFlags; +} __attribute__((packed)) QUERY_SEC_DESC_REQ; + + +typedef struct smb_com_transaction_ssec_req { + struct smb_hdr hdr; /* wct = 19 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; + __le32 DataOffset; + __u8 SetupCount; /* no setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand; /* 3 = SET_SECURITY_DESC */ + __le16 ByteCount; /* bcc = 3 + 8 */ + __u8 Pad[3]; + __u16 Fid; + __u16 Reserved2; + __le32 AclFlags; +} __attribute__((packed)) SET_SEC_DESC_REQ; + +typedef struct smb_com_transaction_change_notify_req { + struct smb_hdr hdr; /* wct = 23 */ + __u8 MaxSetupCount; + __u16 Reserved; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 MaxParameterCount; + __le32 MaxDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 DataCount; 
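The CIFS_ACL_* bits above select which parts of the security descriptor a QUERY_SEC_DESC_REQ asks for; a typical owner+group+DACL request might be filled as in this hedged sketch (example_fill_qsec is hypothetical):

static void example_fill_qsec(QUERY_SEC_DESC_REQ *req, __u16 fid)
{
	req->Fid      = fid;			/* plain __u16 in this struct */
	req->AclFlags = cpu_to_le32(CIFS_ACL_OWNER |
				    CIFS_ACL_GROUP |
				    CIFS_ACL_DACL);
}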
+ __le32 DataOffset; + __u8 SetupCount; /* four setup words follow subcommand */ + /* SNIA spec incorrectly included spurious pad here */ + __le16 SubCommand;/* 4 = Change Notify */ + __le32 CompletionFilter; /* operation to monitor */ + __u16 Fid; + __u8 WatchTree; /* 1 = Monitor subdirectories */ + __u8 Reserved2; + __le16 ByteCount; +/* __u8 Pad[3];*/ +/* __u8 Data[1];*/ +} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; + +/* BB eventually change to use generic ntransact rsp struct + and validation routine */ +typedef struct smb_com_transaction_change_notify_rsp { + struct smb_hdr hdr; /* wct = 18 */ + __u8 Reserved[3]; + __le32 TotalParameterCount; + __le32 TotalDataCount; + __le32 ParameterCount; + __le32 ParameterOffset; + __le32 ParameterDisplacement; + __le32 DataCount; + __le32 DataOffset; + __le32 DataDisplacement; + __u8 SetupCount; /* 0 */ + __u16 ByteCount; + /* __u8 Pad[3]; */ +} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_RSP; +/* Completion Filter flags for Notify */ +#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 +#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 +#define FILE_NOTIFY_CHANGE_NAME 0x00000003 +#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 +#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 +#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 +#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 +#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 +#define FILE_NOTIFY_CHANGE_EA 0x00000080 +#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 +#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 +#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 +#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 + +#define FILE_ACTION_ADDED 0x00000001 +#define FILE_ACTION_REMOVED 0x00000002 +#define FILE_ACTION_MODIFIED 0x00000003 +#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004 +#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005 +#define FILE_ACTION_ADDED_STREAM 0x00000006 +#define FILE_ACTION_REMOVED_STREAM 0x00000007 +#define FILE_ACTION_MODIFIED_STREAM 0x00000008 + +/* response contains array of the following structures */ +struct file_notify_information { + __le32 NextEntryOffset; + __le32 Action; + __le32 FileNameLength; + __u8 FileName[0]; +} __attribute__((packed)); + +struct reparse_data { + __u32 ReparseTag; + __u16 ReparseDataLength; + __u16 Reserved; + __u16 AltNameOffset; + __u16 AltNameLen; + __u16 TargetNameOffset; + __u16 TargetNameLen; + char LinkNamesBuf[1]; +} __attribute__((packed)); + +struct cifs_quota_data { + __u32 rsrvd1; /* 0 */ + __u32 sid_size; + __u64 rsrvd2; /* 0 */ + __u64 space_used; + __u64 soft_limit; + __u64 hard_limit; + char sid[1]; /* variable size? */ +} __attribute__((packed)); + +/* quota sub commands */ +#define QUOTA_LIST_CONTINUE 0 +#define QUOTA_LIST_START 0x100 +#define QUOTA_FOR_SID 0x101 + +struct trans2_req { + /* struct smb_hdr hdr precedes. Set wct = 14+ */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* 1st setup word - SetupCount words follow */ + __le16 ByteCount; +} __attribute__((packed)); + +struct smb_t2_req { + struct smb_hdr hdr; + struct trans2_req t2_req; +} __attribute__((packed)); + +struct trans2_resp { + /* struct smb_hdr hdr precedes. 
Note wct = 10 + setup count */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __u16 Reserved; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 ParameterDisplacement; + __le16 DataCount; + __le16 DataOffset; + __le16 DataDisplacement; + __u8 SetupCount; + __u8 Reserved1; + /* SetupWords[SetupCount]; + __u16 ByteCount; + __u16 Reserved2;*/ + /* data area follows */ +} __attribute__((packed)); + +struct smb_t2_rsp { + struct smb_hdr hdr; + struct trans2_resp t2_rsp; +} __attribute__((packed)); + +/* PathInfo/FileInfo infolevels */ +#define SMB_INFO_STANDARD 1 +#define SMB_SET_FILE_EA 2 +#define SMB_QUERY_FILE_EA_SIZE 2 +#define SMB_INFO_QUERY_EAS_FROM_LIST 3 +#define SMB_INFO_QUERY_ALL_EAS 4 +#define SMB_INFO_IS_NAME_VALID 6 +#define SMB_QUERY_FILE_BASIC_INFO 0x101 +#define SMB_QUERY_FILE_STANDARD_INFO 0x102 +#define SMB_QUERY_FILE_EA_INFO 0x103 +#define SMB_QUERY_FILE_NAME_INFO 0x104 +#define SMB_QUERY_FILE_ALLOCATION_INFO 0x105 +#define SMB_QUERY_FILE_END_OF_FILEINFO 0x106 +#define SMB_QUERY_FILE_ALL_INFO 0x107 +#define SMB_QUERY_ALT_NAME_INFO 0x108 +#define SMB_QUERY_FILE_STREAM_INFO 0x109 +#define SMB_QUERY_FILE_COMPRESSION_INFO 0x10B +#define SMB_QUERY_FILE_UNIX_BASIC 0x200 +#define SMB_QUERY_FILE_UNIX_LINK 0x201 +#define SMB_QUERY_POSIX_ACL 0x204 +#define SMB_QUERY_XATTR 0x205 /* e.g. system EA name space */ +#define SMB_QUERY_ATTR_FLAGS 0x206 /* append,immutable etc. */ +#define SMB_QUERY_POSIX_PERMISSION 0x207 +#define SMB_QUERY_POSIX_LOCK 0x208 +/* #define SMB_POSIX_OPEN 0x209 */ +/* #define SMB_POSIX_UNLINK 0x20a */ +#define SMB_QUERY_FILE__UNIX_INFO2 0x20b +#define SMB_QUERY_FILE_INTERNAL_INFO 0x3ee +#define SMB_QUERY_FILE_ACCESS_INFO 0x3f0 +#define SMB_QUERY_FILE_NAME_INFO2 0x3f1 /* 0x30 bytes */ +#define SMB_QUERY_FILE_POSITION_INFO 0x3f6 +#define SMB_QUERY_FILE_MODE_INFO 0x3f8 +#define SMB_QUERY_FILE_ALGN_INFO 0x3f9 + + +#define SMB_SET_FILE_BASIC_INFO 0x101 +#define SMB_SET_FILE_DISPOSITION_INFO 0x102 +#define SMB_SET_FILE_ALLOCATION_INFO 0x103 +#define SMB_SET_FILE_END_OF_FILE_INFO 0x104 +#define SMB_SET_FILE_UNIX_BASIC 0x200 +#define SMB_SET_FILE_UNIX_LINK 0x201 +#define SMB_SET_FILE_UNIX_HLINK 0x203 +#define SMB_SET_POSIX_ACL 0x204 +#define SMB_SET_XATTR 0x205 +#define SMB_SET_ATTR_FLAGS 0x206 /* append, immutable etc. 
*/ +#define SMB_SET_POSIX_LOCK 0x208 +#define SMB_POSIX_OPEN 0x209 +#define SMB_POSIX_UNLINK 0x20a +#define SMB_SET_FILE_UNIX_INFO2 0x20b +#define SMB_SET_FILE_BASIC_INFO2 0x3ec +#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */ +#define SMB_FILE_ALL_INFO2 0x3fa +#define SMB_SET_FILE_ALLOCATION_INFO2 0x3fb +#define SMB_SET_FILE_END_OF_FILE_INFO2 0x3fc +#define SMB_FILE_MOVE_CLUSTER_INFO 0x407 +#define SMB_FILE_QUOTA_INFO 0x408 +#define SMB_FILE_REPARSEPOINT_INFO 0x409 +#define SMB_FILE_MAXIMUM_INFO 0x40d + +/* Find File infolevels */ +#define SMB_FIND_FILE_INFO_STANDARD 0x001 +#define SMB_FIND_FILE_QUERY_EA_SIZE 0x002 +#define SMB_FIND_FILE_QUERY_EAS_FROM_LIST 0x003 +#define SMB_FIND_FILE_DIRECTORY_INFO 0x101 +#define SMB_FIND_FILE_FULL_DIRECTORY_INFO 0x102 +#define SMB_FIND_FILE_NAMES_INFO 0x103 +#define SMB_FIND_FILE_BOTH_DIRECTORY_INFO 0x104 +#define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105 +#define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106 +#define SMB_FIND_FILE_UNIX 0x202 + +typedef struct smb_com_transaction2_qpi_req { + struct smb_hdr hdr; /* wct = 14+ */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __le16 InformationLevel; + __u32 Reserved4; + char FileName[1]; +} __attribute__((packed)) TRANSACTION2_QPI_REQ; + +typedef struct smb_com_transaction2_qpi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word is present for infolevels > 100 */ +} __attribute__((packed)) TRANSACTION2_QPI_RSP; + +typedef struct smb_com_transaction2_spi_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __u16 Pad1; + __le16 InformationLevel; + __u32 Reserved4; + char FileName[1]; +} __attribute__((packed)) TRANSACTION2_SPI_REQ; + +typedef struct smb_com_transaction2_spi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word is present for infolevels > 100 */ +} __attribute__((packed)) TRANSACTION2_SPI_RSP; + +struct set_file_rename { + __le32 overwrite; /* 1 = overwrite dest */ + __u32 root_fid; /* zero */ + __le32 target_name_len; + char target_name[0]; /* Must be unicode */ +} __attribute__((packed)); + +struct smb_com_transaction2_sfi_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __u16 Pad1; + __u16 Fid; + __le16 InformationLevel; + __u16 Reserved4; +} __attribute__((packed)); + +struct 
smb_com_transaction2_sfi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word reserved - + present for infolevels > 100 */ +} __attribute__((packed)); + +struct smb_t2_qfi_req { + struct smb_hdr hdr; + struct trans2_req t2; + __u8 Pad; + __u16 Fid; + __le16 InformationLevel; +} __attribute__((packed)); + +struct smb_t2_qfi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u16 Reserved2; /* parameter word reserved - + present for infolevels > 100 */ +} __attribute__((packed)); + +/* + * Flags on T2 FINDFIRST and FINDNEXT + */ +#define CIFS_SEARCH_CLOSE_ALWAYS 0x0001 +#define CIFS_SEARCH_CLOSE_AT_END 0x0002 +#define CIFS_SEARCH_RETURN_RESUME 0x0004 +#define CIFS_SEARCH_CONTINUE_FROM_LAST 0x0008 +#define CIFS_SEARCH_BACKUP_SEARCH 0x0010 + +/* + * Size of the resume key on FINDFIRST and FINDNEXT calls + */ +#define CIFS_SMB_RESUME_KEY_SIZE 4 + +typedef struct smb_com_transaction2_ffirst_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_FIND_FIRST */ + __le16 ByteCount; + __u8 Pad; + __le16 SearchAttributes; + __le16 SearchCount; + __le16 SearchFlags; + __le16 InformationLevel; + __le32 SearchStorageType; + char FileName[1]; +} __attribute__((packed)) TRANSACTION2_FFIRST_REQ; + +typedef struct smb_com_transaction2_ffirst_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; +} __attribute__((packed)) TRANSACTION2_FFIRST_RSP; + +typedef struct smb_com_transaction2_ffirst_rsp_parms { + __u16 SearchHandle; + __le16 SearchCount; + __le16 EndofSearch; + __le16 EAErrorOffset; + __le16 LastNameOffset; +} __attribute__((packed)) T2_FFIRST_RSP_PARMS; + +typedef struct smb_com_transaction2_fnext_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_FIND_NEXT */ + __le16 ByteCount; + __u8 Pad; + __u16 SearchHandle; + __le16 SearchCount; + __le16 InformationLevel; + __u32 ResumeKey; + __le16 SearchFlags; + char ResumeFileName[1]; +} __attribute__((packed)) TRANSACTION2_FNEXT_REQ; + +typedef struct smb_com_transaction2_fnext_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; +} __attribute__((packed)) TRANSACTION2_FNEXT_RSP; + +typedef struct smb_com_transaction2_fnext_rsp_parms { + __le16 SearchCount; + __le16 EndofSearch; + __le16 EAErrorOffset; + __le16 LastNameOffset; +} __attribute__((packed)) T2_FNEXT_RSP_PARMS; + +/* QFSInfo Levels */ +#define SMB_INFO_ALLOCATION 1 +#define SMB_INFO_VOLUME 2 +#define SMB_QUERY_FS_VOLUME_INFO 0x102 +#define SMB_QUERY_FS_SIZE_INFO 0x103 +#define SMB_QUERY_FS_DEVICE_INFO 0x104 +#define SMB_QUERY_FS_ATTRIBUTE_INFO 0x105 +#define SMB_QUERY_CIFS_UNIX_INFO 0x200 +#define SMB_QUERY_POSIX_FS_INFO 0x201 +#define SMB_QUERY_POSIX_WHO_AM_I 0x202 +#define 
SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 +#define SMB_QUERY_FS_PROXY 0x204 /* WAFS enabled. Returns structure + FILE_SYSTEM__UNIX_INFO to tell + whether new NTIOCTL available + (0xACE) for WAN friendly SMB + operations to be carried */ +#define SMB_QUERY_LABEL_INFO 0x3ea +#define SMB_QUERY_FS_QUOTA_INFO 0x3ee +#define SMB_QUERY_FS_FULL_SIZE_INFO 0x3ef +#define SMB_QUERY_OBJECTID_INFO 0x3f0 + +typedef struct smb_com_transaction2_qfsi_req { + struct smb_hdr hdr; /* wct = 14+ */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad; + __le16 InformationLevel; +} __attribute__((packed)) TRANSACTION2_QFSI_REQ; + +typedef struct smb_com_transaction_qfsi_rsp { + struct smb_hdr hdr; /* wct = 10 + SetupCount */ + struct trans2_resp t2; + __u16 ByteCount; + __u8 Pad; /* may be three bytes? *//* followed by data area */ +} __attribute__((packed)) TRANSACTION2_QFSI_RSP; + +typedef struct whoami_rsp_data { /* Query level 0x202 */ + __u32 flags; /* 0 = Authenticated user 1 = GUEST */ + __u32 mask; /* which flags bits server understands ie 0x0001 */ + __u64 unix_user_id; + __u64 unix_user_gid; + __u32 number_of_supplementary_gids; /* may be zero */ + __u32 number_of_sids; /* may be zero */ + __u32 length_of_sid_array; /* in bytes - may be zero */ + __u32 pad; /* reserved - MBZ */ + /* __u64 gid_array[0]; */ /* may be empty */ + /* __u8 * psid_list */ /* may be empty */ +} __attribute__((packed)) WHOAMI_RSP_DATA; + +/* SETFSInfo Levels */ +#define SMB_SET_CIFS_UNIX_INFO 0x200 +typedef struct smb_com_transaction2_setfsi_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; /* 4 */ + __le16 ParameterOffset; + __le16 DataCount; /* 12 */ + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ + __le16 ByteCount; + __u8 Pad; + __u16 FileNum; /* Parameters start. */ + __le16 InformationLevel;/* Parameters end. */ + __le16 ClientUnixMajor; /* Data start. */ + __le16 ClientUnixMinor; + __le64 ClientUnixCap; /* Data end */ +} __attribute__((packed)) TRANSACTION2_SETFSI_REQ; + +typedef struct smb_com_transaction2_setfsi_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; +} __attribute__((packed)) TRANSACTION2_SETFSI_RSP; + + +typedef struct smb_com_transaction2_get_dfs_refer_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; + __le16 ParameterOffset; + __le16 DataCount; + __le16 DataOffset; + __u8 SetupCount; + __u8 Reserved3; + __le16 SubCommand; /* one setup word */ + __le16 ByteCount; + __u8 Pad[3]; /* Win2K has sent 0x0F01 (max response length + perhaps?) 
followed by one byte pad - doesn't + seem to matter though */ + __le16 MaxReferralLevel; + char RequestFileName[1]; +} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ; + +#define DFS_VERSION cpu_to_le16(0x0003) + +/* DFS server target type */ +#define DFS_TYPE_LINK 0x0000 /* also for sysvol targets */ +#define DFS_TYPE_ROOT 0x0001 + +/* Referral Entry Flags */ +#define DFS_NAME_LIST_REF 0x0200 /* set for domain or DC referral responses */ +#define DFS_TARGET_SET_BOUNDARY 0x0400 /* only valid with version 4 dfs req */ + +typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */ + __le16 VersionNumber; /* must be 3 or 4 */ + __le16 Size; + __le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */ + __le16 ReferralEntryFlags; + __le32 TimeToLive; + __le16 DfsPathOffset; + __le16 DfsAlternatePathOffset; + __le16 NetworkAddressOffset; /* offset of the link target */ + __u8 ServiceSiteGuid[16]; /* MBZ, ignored */ +} __attribute__((packed)) REFERRAL3; + +typedef struct smb_com_transaction_get_dfs_refer_rsp { + struct smb_hdr hdr; /* wct = 10 */ + struct trans2_resp t2; + __u16 ByteCount; + __u8 Pad; + __le16 PathConsumed; + __le16 NumberOfReferrals; + __le32 DFSFlags; + REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */ + /* followed by the strings pointed to by the referral structures */ +} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP; + +/* DFS Flags */ +#define DFSREF_REFERRAL_SERVER 0x00000001 /* all targets are DFS roots */ +#define DFSREF_STORAGE_SERVER 0x00000002 /* no further ref requests needed */ +#define DFSREF_TARGET_FAILBACK 0x00000004 /* only for DFS referral version 4 */ + +/* + ************************************************************************ + * All structs for everything above the SMB PDUs themselves + * (such as the T2 level specific data) go here + ************************************************************************ + */ + +/* + * Information on a server + */ + +struct serverInfo { + char name[16]; + unsigned char versionMajor; + unsigned char versionMinor; + unsigned long type; + unsigned int commentOffset; +} __attribute__((packed)); + +/* + * The following structure is the format of the data returned on a NetShareEnum + * with level "90" (x5A) + */ + +struct shareInfo { + char shareName[13]; + char pad; + unsigned short type; + unsigned int commentOffset; +} __attribute__((packed)); + +struct aliasInfo { + char aliasName[9]; + char pad; + unsigned int commentOffset; + unsigned char type[2]; +} __attribute__((packed)); + +struct aliasInfo92 { + int aliasNameOffset; + int serverNameOffset; + int shareNameOffset; +} __attribute__((packed)); + +typedef struct { + __le64 TotalAllocationUnits; + __le64 FreeAllocationUnits; + __le32 SectorsPerAllocationUnit; + __le32 BytesPerSector; +} __attribute__((packed)) FILE_SYSTEM_INFO; /* size info, level 0x103 */ + +typedef struct { + __le32 fsid; + __le32 SectorsPerAllocationUnit; + __le32 TotalAllocationUnits; + __le32 FreeAllocationUnits; + __le16 BytesPerSector; +} __attribute__((packed)) FILE_SYSTEM_ALLOC_INFO; + +typedef struct { + __le16 MajorVersionNumber; + __le16 MinorVersionNumber; + __le64 Capability; +} __attribute__((packed)) FILE_SYSTEM_UNIX_INFO; /* Unix extension level 0x200*/ + +/* Version numbers for CIFS UNIX major and minor. 
*/ +#define CIFS_UNIX_MAJOR_VERSION 1 +#define CIFS_UNIX_MINOR_VERSION 0 + +/* Linux/Unix extensions capability flags */ +#define CIFS_UNIX_FCNTL_CAP 0x00000001 /* support for fcntl locks */ +#define CIFS_UNIX_POSIX_ACL_CAP 0x00000002 /* support getfacl/setfacl */ +#define CIFS_UNIX_XATTR_CAP 0x00000004 /* support new namespace */ +#define CIFS_UNIX_EXTATTR_CAP 0x00000008 /* support chattr/chflag */ +#define CIFS_UNIX_POSIX_PATHNAMES_CAP 0x00000010 /* Allow POSIX path chars */ +#define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based + calls including posix open + and posix unlink */ +#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up + to 0xFFFF00 */ +#define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080 +#define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */ +#define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ +#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and + QFS PROXY call */ +#ifdef CONFIG_CIFS_POSIX +/* Can not set pathnames cap yet until we send new posix create SMB since + otherwise server can treat such handles opened with older ntcreatex + (by a new client which knows how to send posix path ops) + as non-posix handles (can affect write behavior with byte range locks. + We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */ +/* #define CIFS_UNIX_CAP_MASK 0x000000fb */ +#define CIFS_UNIX_CAP_MASK 0x000000db +#else +#define CIFS_UNIX_CAP_MASK 0x00000013 +#endif /* CONFIG_CIFS_POSIX */ + + +#define CIFS_POSIX_EXTENSIONS 0x00000010 /* support for new QFSInfo */ + +typedef struct { + /* For undefined recommended transfer size return -1 in that field */ + __le32 OptimalTransferSize; /* bsize on some os, iosize on other os */ + __le32 BlockSize; + /* The next three fields are in terms of the block size. + (above). If block size is unknown, 4096 would be a + reasonable block size for a server to report. + Note that returning the blocks/blocksavail removes need + to make a second call (to QFSInfo level 0x103 to get this info. 
+ UserBlockAvail is typically less than or equal to BlocksAvail, + if no distinction is made return the same value in each */ + __le64 TotalBlocks; + __le64 BlocksAvail; /* bfree */ + __le64 UserBlocksAvail; /* bavail */ + /* For undefined Node fields or FSID return -1 */ + __le64 TotalFileNodes; + __le64 FreeFileNodes; + __le64 FileSysIdentifier; /* fsid */ + /* NB Namelen comes from FILE_SYSTEM_ATTRIBUTE_INFO call */ + /* NB flags can come from FILE_SYSTEM_DEVICE_INFO call */ +} __attribute__((packed)) FILE_SYSTEM_POSIX_INFO; + +/* DeviceType Flags */ +#define FILE_DEVICE_CD_ROM 0x00000002 +#define FILE_DEVICE_CD_ROM_FILE_SYSTEM 0x00000003 +#define FILE_DEVICE_DFS 0x00000006 +#define FILE_DEVICE_DISK 0x00000007 +#define FILE_DEVICE_DISK_FILE_SYSTEM 0x00000008 +#define FILE_DEVICE_FILE_SYSTEM 0x00000009 +#define FILE_DEVICE_NAMED_PIPE 0x00000011 +#define FILE_DEVICE_NETWORK 0x00000012 +#define FILE_DEVICE_NETWORK_FILE_SYSTEM 0x00000014 +#define FILE_DEVICE_NULL 0x00000015 +#define FILE_DEVICE_PARALLEL_PORT 0x00000016 +#define FILE_DEVICE_PRINTER 0x00000018 +#define FILE_DEVICE_SERIAL_PORT 0x0000001b +#define FILE_DEVICE_STREAMS 0x0000001e +#define FILE_DEVICE_TAPE 0x0000001f +#define FILE_DEVICE_TAPE_FILE_SYSTEM 0x00000020 +#define FILE_DEVICE_VIRTUAL_DISK 0x00000024 +#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028 + +typedef struct { + __le32 DeviceType; + __le32 DeviceCharacteristics; +} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */ + +typedef struct { + __le32 Attributes; + __le32 MaxPathNameComponentLength; + __le32 FileSystemNameLen; + char FileSystemName[52]; /* do not have to save this - get subset? */ +} __attribute__((packed)) FILE_SYSTEM_ATTRIBUTE_INFO; + +/******************************************************************************/ +/* QueryFileInfo/QueryPathinfo (also for SetPath/SetFile) data buffer formats */ +/******************************************************************************/ +typedef struct { /* data block encoding of response to level 263 QPathInfo */ + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad1; + __le64 AllocationSize; + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad2; + __u64 IndexNumber; + __le32 EASize; + __le32 AccessFlags; + __u64 IndexNumber1; + __le64 CurrentByteOffset; + __le32 Mode; + __le32 AlignmentRequirement; + __le32 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ + +/* defines for enumerating possible values of the Unix type field below */ +#define UNIX_FILE 0 +#define UNIX_DIR 1 +#define UNIX_SYMLINK 2 +#define UNIX_CHARDEV 3 +#define UNIX_BLOCKDEV 4 +#define UNIX_FIFO 5 +#define UNIX_SOCKET 6 +typedef struct { + __le64 EndOfFile; + __le64 NumOfBytes; + __le64 LastStatusChange; /*SNIA specs DCE time for the 3 time fields */ + __le64 LastAccessTime; + __le64 LastModificationTime; + __le64 Uid; + __le64 Gid; + __le32 Type; + __le64 DevMajor; + __le64 DevMinor; + __le64 UniqueId; + __le64 Permissions; + __le64 Nlinks; +} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ + +typedef struct { + char LinkDest[1]; +} __attribute__((packed)) FILE_UNIX_LINK_INFO; /* level 0x201 QPathInfo */ + +/* The following three structures are needed only for + setting time to NT4 and some older servers via + the primitive DOS time format */ +typedef 
struct { + __u16 Day:5; + __u16 Month:4; + __u16 Year:7; +} __attribute__((packed)) SMB_DATE; + +typedef struct { + __u16 TwoSeconds:5; + __u16 Minutes:6; + __u16 Hours:5; +} __attribute__((packed)) SMB_TIME; + +typedef struct { + __le16 CreationDate; /* SMB Date see above */ + __le16 CreationTime; /* SMB Time */ + __le16 LastAccessDate; + __le16 LastAccessTime; + __le16 LastWriteDate; + __le16 LastWriteTime; + __le32 DataSize; /* File Size (EOF) */ + __le32 AllocationSize; + __le16 Attributes; /* verify not u32 */ + __le32 EASize; +} __attribute__((packed)) FILE_INFO_STANDARD; /* level 1 SetPath/FileInfo */ + +typedef struct { + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad; +} __attribute__((packed)) FILE_BASIC_INFO; /* size info, level 0x101 */ + +struct file_allocation_info { + __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */ +} __attribute__((packed)); /* size used on disk, for level 0x103 for set, + 0x105 for query */ + +struct file_end_of_file_info { + __le64 FileSize; /* offset to end of file */ +} __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */ + +struct file_alt_name_info { + __u8 alt_name[1]; +} __attribute__((packed)); /* level 0x0108 */ + +struct file_stream_info { + __le32 number_of_streams; /* BB check sizes and verify location */ + /* followed by info on streams themselves + u64 size; + u64 allocation_size + stream info */ +}; /* level 0x109 */ + +struct file_compression_info { + __le64 compressed_size; + __le16 format; + __u8 unit_shift; + __u8 ch_shift; + __u8 cl_shift; + __u8 pad[3]; +} __attribute__((packed)); /* level 0x10b */ + +/* POSIX ACL set/query path info structures */ +#define CIFS_ACL_VERSION 1 +struct cifs_posix_ace { /* access control entry (ACE) */ + __u8 cifs_e_tag; + __u8 cifs_e_perm; + __le64 cifs_uid; /* or gid */ +} __attribute__((packed)); + +struct cifs_posix_acl { /* access conrol list (ACL) */ + __le16 version; + __le16 access_entry_count; /* access ACL - count of entries */ + __le16 default_entry_count; /* default ACL - count of entries */ + struct cifs_posix_ace ace_array[0]; + /* followed by + struct cifs_posix_ace default_ace_arraay[] */ +} __attribute__((packed)); /* level 0x204 */ + +/* types of access control entries already defined in posix_acl.h */ +/* #define CIFS_POSIX_ACL_USER_OBJ 0x01 +#define CIFS_POSIX_ACL_USER 0x02 +#define CIFS_POSIX_ACL_GROUP_OBJ 0x04 +#define CIFS_POSIX_ACL_GROUP 0x08 +#define CIFS_POSIX_ACL_MASK 0x10 +#define CIFS_POSIX_ACL_OTHER 0x20 */ + +/* types of perms */ +/* #define CIFS_POSIX_ACL_EXECUTE 0x01 +#define CIFS_POSIX_ACL_WRITE 0x02 +#define CIFS_POSIX_ACL_READ 0x04 */ + +/* end of POSIX ACL definitions */ + +/* POSIX Open Flags */ +#define SMB_O_RDONLY 0x1 +#define SMB_O_WRONLY 0x2 +#define SMB_O_RDWR 0x4 +#define SMB_O_CREAT 0x10 +#define SMB_O_EXCL 0x20 +#define SMB_O_TRUNC 0x40 +#define SMB_O_APPEND 0x80 +#define SMB_O_SYNC 0x100 +#define SMB_O_DIRECTORY 0x200 +#define SMB_O_NOFOLLOW 0x400 +#define SMB_O_DIRECT 0x800 + +typedef struct { + __le32 OpenFlags; /* same as NT CreateX */ + __le32 PosixOpenFlags; + __le64 Permissions; + __le16 Level; /* reply level requested (see QPathInfo levels) */ +} __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */ + +typedef struct { + __le16 OplockFlags; + __u16 Fid; + __le32 CreateAction; + __le16 ReturnedLevel; + __le16 Pad; + /* struct following varies based on requested level */ +} __attribute__((packed)) 
OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */ + +#define SMB_POSIX_UNLINK_FILE_TARGET 0 +#define SMB_POSIX_UNLINK_DIRECTORY_TARGET 1 + +struct unlink_psx_rq { /* level 0x20a SetPathInfo */ + __le16 type; +} __attribute__((packed)); + +struct file_internal_info { + __le64 UniqueId; /* inode number */ +} __attribute__((packed)); /* level 0x3ee */ + +struct file_mode_info { + __le32 Mode; +} __attribute__((packed)); /* level 0x3f8 */ + +struct file_attrib_tag { + __le32 Attribute; + __le32 ReparseTag; +} __attribute__((packed)); /* level 0x40b */ + + +/********************************************************/ +/* FindFirst/FindNext transact2 data buffer formats */ +/********************************************************/ + +typedef struct { + __le32 NextEntryOffset; + __u32 ResumeKey; /* as with FileIndex - no need to convert */ + FILE_UNIX_BASIC_INFO basic; + char FileName[1]; +} __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* length of the xattrs */ + char FileName[1]; +} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* EA size */ + __le32 Reserved; + __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ + char FileName[1]; +} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ + +typedef struct { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* length of the xattrs */ + __u8 ShortNameLength; + __u8 Reserved; + __u8 ShortName[12]; + char FileName[1]; +} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */ + +typedef struct { + __u32 ResumeKey; + __le16 CreationDate; /* SMB Date */ + __le16 CreationTime; /* SMB Time */ + __le16 LastAccessDate; + __le16 LastAccessTime; + __le16 LastWriteDate; + __le16 LastWriteTime; + __le32 DataSize; /* File Size (EOF) */ + __le32 AllocationSize; + __le16 Attributes; /* verify not u32 */ + __u8 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */ + + +struct win_dev { + unsigned char type[8]; /* IntxCHR or IntxBLK */ + __le64 major; + __le64 minor; +} __attribute__((packed)); + +struct gea { + unsigned char name_len; + char name[1]; +} __attribute__((packed)); + +struct gealist { + unsigned long list_len; + struct gea list[1]; +} __attribute__((packed)); + +struct fea { + unsigned char EA_flags; + __u8 name_len; + __le16 value_len; + char name[1]; + /* optionally followed by value */ +} 
__attribute__((packed)); +/* flags for _FEA.fEA */ +#define FEA_NEEDEA 0x80 /* need EA bit */ + +struct fealist { + __le32 list_len; + struct fea list[1]; +} __attribute__((packed)); + +/* used to hold an arbitrary blob of data */ +struct data_blob { + __u8 *data; + size_t length; + void (*free) (struct data_blob *data_blob); +} __attribute__((packed)); + + +#ifdef CONFIG_CIFS_POSIX +/* + For better POSIX semantics from Linux client, (even better + than the existing CIFS Unix Extensions) we need updated PDUs for: + + 1) PosixCreateX - to set and return the mode, inode#, device info and + perhaps add a CreateDevice - to create Pipes and other special .inodes + Also note POSIX open flags + 2) Close - to return the last write time to do cache across close + more safely + 3) FindFirst return unique inode number - what about resume key, two + forms short (matches readdir) and full (enough info to cache inodes) + 4) Mkdir - set mode + + And under consideration: + 5) FindClose2 (return nanosecond timestamp ??) + 6) Use nanosecond timestamps throughout all time fields if + corresponding attribute flag is set + 7) sendfile - handle based copy + + what about fixing 64 bit alignment + + There are also various legacy SMB/CIFS requests used as is + + From existing Lanman and NTLM dialects: + -------------------------------------- + NEGOTIATE + SESSION_SETUP_ANDX (BB which?) + TREE_CONNECT_ANDX (BB which wct?) + TREE_DISCONNECT (BB add volume timestamp on response) + LOGOFF_ANDX + DELETE (note delete open file behavior) + DELETE_DIRECTORY + READ_AND_X + WRITE_AND_X + LOCKING_AND_X (note posix lock semantics) + RENAME (note rename across dirs and open file rename posix behaviors) + NT_RENAME (for hardlinks) Is this good enough for all features? + FIND_CLOSE2 + TRANSACTION2 (18 cases) + SMB_SET_FILE_END_OF_FILE_INFO2 SMB_SET_PATH_END_OF_FILE_INFO2 + (BB verify that never need to set allocation size) + SMB_SET_FILE_BASIC_INFO2 (setting times - BB can it be done via + Unix ext?) + + COPY (note support for copy across directories) - FUTURE, OPTIONAL + setting/getting OS/2 EAs - FUTURE (BB can this handle + setting Linux xattrs perfectly) - OPTIONAL + dnotify - FUTURE, OPTIONAL + quota - FUTURE, OPTIONAL + + Note that various requests implemented for NT interop such as + NT_TRANSACT (IOCTL) QueryReparseInfo + are unneeded to servers compliant with the CIFS POSIX extensions + + From CIFS Unix Extensions: + ------------------------- + T2 SET_PATH_INFO (SMB_SET_FILE_UNIX_LINK) for symlinks + T2 SET_PATH_INFO (SMB_SET_FILE_BASIC_INFO2) + T2 QUERY_PATH_INFO (SMB_QUERY_FILE_UNIX_LINK) + T2 QUERY_PATH_INFO (SMB_QUERY_FILE_UNIX_BASIC) BB check for missing + inode fields + Actually a need QUERY_FILE_UNIX_INFO + since has inode num + BB what about a) blksize/blkbits/blocks + b) i_version + c) i_rdev + d) notify mask? + e) generation + f) size_seqcount + T2 FIND_FIRST/FIND_NEXT FIND_FILE_UNIX + TRANS2_GET_DFS_REFERRAL - OPTIONAL but recommended + T2_QFS_INFO QueryDevice/AttributeInfo - OPTIONAL + */ + +/* xsymlink is a symlink format (used by MacOS) that can be used + to save symlink info in a regular file when + mounted to operating systems that do not + support the cifs Unix extensions or EAs (for xattr + based symlinks). 
For such a file to be recognized + as containing symlink data: + + 1) file size must be 1067, + 2) signature must begin file data, + 3) length field must be set to ASCII representation + of a number which is less than or equal to 1024, + 4) md5 must match that of the path data */ + +struct xsymlink { + /* 1067 bytes */ + char signature[4]; /* XSym */ /* not null terminated */ + char cr0; /* \n */ +/* ASCII representation of length (4 bytes decimal) terminated by \n not null */ + char length[4]; + char cr1; /* \n */ +/* md5 of valid subset of path ie path[0] through path[length-1] */ + __u8 md5[32]; + char cr2; /* \n */ +/* if room left, then end with \n then 0x20s by convention but not required */ + char path[1024]; +} __attribute__((packed)); + +typedef struct file_xattr_info { + /* BB do we need another field for flags? BB */ + __u32 xattr_name_len; + __u32 xattr_value_len; + char xattr_name[0]; + /* followed by xattr_value[xattr_value_len], no pad */ +} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info + level 0x205 */ + + +/* flags for chattr command */ +#define EXT_SECURE_DELETE 0x00000001 /* EXT3_SECRM_FL */ +#define EXT_ENABLE_UNDELETE 0x00000002 /* EXT3_UNRM_FL */ +/* Reserved for compress file 0x4 */ +#define EXT_SYNCHRONOUS 0x00000008 /* EXT3_SYNC_FL */ +#define EXT_IMMUTABLE_FL 0x00000010 /* EXT3_IMMUTABLE_FL */ +#define EXT_OPEN_APPEND_ONLY 0x00000020 /* EXT3_APPEND_FL */ +#define EXT_DO_NOT_BACKUP 0x00000040 /* EXT3_NODUMP_FL */ +#define EXT_NO_UPDATE_ATIME 0x00000080 /* EXT3_NOATIME_FL */ +/* 0x100 through 0x800 reserved for compression flags and are GET-ONLY */ +#define EXT_HASH_TREE_INDEXED_DIR 0x00001000 /* GET-ONLY EXT3_INDEX_FL */ +/* 0x2000 reserved for IMAGIC_FL */ +#define EXT_JOURNAL_THIS_FILE 0x00004000 /* GET-ONLY EXT3_JOURNAL_DATA_FL */ +/* 0x8000 reserved for EXT3_NOTAIL_FL */ +#define EXT_SYNCHRONOUS_DIR 0x00010000 /* EXT3_DIRSYNC_FL */ +#define EXT_TOPDIR 0x00020000 /* EXT3_TOPDIR_FL */ + +#define EXT_SET_MASK 0x000300FF +#define EXT_GET_MASK 0x0003DFFF + +typedef struct file_chattr_info { + __le64 mask; /* list of all possible attribute bits */ + __le64 mode; /* list of actual attribute bits on this inode */ +} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes + (chattr, chflags) level 0x206 */ +#endif /* POSIX */ +#endif /* _CIFSPDU_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h new file mode 100644 index 00000000..8df28e92 --- /dev/null +++ b/fs/cifs/cifsproto.h @@ -0,0 +1,460 @@ +/* + * fs/cifs/cifsproto.h + * + * Copyright (c) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _CIFSPROTO_H +#define _CIFSPROTO_H +#include + +struct statfs; +struct smb_vol; + +/* + ***************************************************************** + * All Prototypes + ***************************************************************** + */ + +extern struct smb_hdr *cifs_buf_get(void); +extern void cifs_buf_release(void *); +extern struct smb_hdr *cifs_small_buf_get(void); +extern void cifs_small_buf_release(void *); +extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, + unsigned int /* length */); +extern unsigned int _GetXid(void); +extern void _FreeXid(unsigned int); +#define GetXid() \ +({ \ + int __xid = (int)_GetXid(); \ + cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d", \ + __func__, __xid, current_fsuid()); \ + __xid; \ +}) + +#define FreeXid(curr_xid) \ +do { \ + _FreeXid(curr_xid); \ + cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \ + __func__, curr_xid, (int)rc); \ +} while (0) +extern int init_cifs_idmap(void); +extern void exit_cifs_idmap(void); +extern void cifs_destroy_idmaptrees(void); +extern char *build_path_from_dentry(struct dentry *); +extern char *cifs_build_path_to_root(struct smb_vol *vol, + struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon); +extern char *build_wildcard_path_from_dentry(struct dentry *direntry); +extern char *cifs_compose_mount_options(const char *sb_mountdata, + const char *fullpath, const struct dfs_info3_param *ref, + char **devname); +/* extern void renew_parental_timestamps(struct dentry *direntry);*/ +extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, + struct TCP_Server_Info *server); +extern void DeleteMidQEntry(struct mid_q_entry *midEntry); +extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, + unsigned int nvec, mid_callback_t *callback, + void *cbdata, bool ignore_pend); +extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, + struct smb_hdr * /* input */ , + struct smb_hdr * /* out */ , + int * /* bytes returned */ , const int long_op); +extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, + struct smb_hdr *in_buf, int flags); +extern int cifs_check_receive(struct mid_q_entry *mid, + struct TCP_Server_Info *server, bool log_error); +extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, + struct kvec *, int /* nvec to send */, + int * /* type of buf returned */ , const int flags); +extern int SendReceiveBlockingLock(const unsigned int xid, + struct cifs_tcon *ptcon, + struct smb_hdr *in_buf , + struct smb_hdr *out_buf, + int *bytes_returned); +extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); +extern bool is_valid_oplock_break(struct smb_hdr *smb, + struct TCP_Server_Info *); +extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); +extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, + unsigned int bytes_written); +extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); +extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); +extern unsigned int smbCalcSize(struct smb_hdr *ptr); +extern int decode_negTokenInit(unsigned char *security_blob, int length, + struct TCP_Server_Info *server); +extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); 
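+
+/*
+ * Illustrative caller sketch (not a declaration): how the xid bookkeeping
+ * macros above are typically paired around a path-based request, using
+ * build_path_from_dentry() and the CIFSSMBQPathInfo() prototype declared
+ * further down.  my_tcon, my_dentry, my_nls and my_remap are placeholders
+ * for state the real callers already hold.  Note that FreeXid() expects an
+ * rc variable in the calling scope for its trace message.
+ *
+ *	int rc = 0;
+ *	int xid = GetXid();
+ *	char *full_path = build_path_from_dentry(my_dentry);
+ *	FILE_ALL_INFO *info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ *
+ *	if (full_path && info)
+ *		rc = CIFSSMBQPathInfo(xid, my_tcon, full_path, info,
+ *				      0, my_nls, my_remap);
+ *	kfree(info);
+ *	kfree(full_path);
+ *	FreeXid(xid);
+ */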
+extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); +extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + const unsigned short int port); +extern int map_smb_to_linux_error(struct smb_hdr *smb, bool logErr); +extern void header_assemble(struct smb_hdr *, char /* command */ , + const struct cifs_tcon *, int /* length of + fixed section (word count) in two byte units */); +extern int small_smb_init_no_tc(const int smb_cmd, const int wct, + struct cifs_ses *ses, + void **request_buf); +extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp); +extern __u16 GetNextMid(struct TCP_Server_Info *server); +extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); +extern u64 cifs_UnixTimeToNT(struct timespec); +extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, + int offset); +extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); + +extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, + struct file *file, struct tcon_link *tlink, + __u32 oplock); +extern int cifs_posix_open(char *full_path, struct inode **pinode, + struct super_block *sb, + int mode, unsigned int f_flags, + __u32 *poplock, __u16 *pnetfid, int xid); +void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr); +extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, + FILE_UNIX_BASIC_INFO *info, + struct cifs_sb_info *cifs_sb); +extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); +extern struct inode *cifs_iget(struct super_block *sb, + struct cifs_fattr *fattr); + +extern int cifs_get_file_info(struct file *filp); +extern int cifs_get_inode_info(struct inode **pinode, + const unsigned char *search_path, + FILE_ALL_INFO *pfile_info, + struct super_block *sb, int xid, const __u16 *pfid); +extern int cifs_get_file_info_unix(struct file *filp); +extern int cifs_get_inode_info_unix(struct inode **pinode, + const unsigned char *search_path, + struct super_block *sb, int xid); +extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, struct inode *inode, + const char *path, const __u16 *pfid); +extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, + const char *, u32 *); +extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, + const char *); + +extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, + struct cifs_sb_info *cifs_sb); +extern int cifs_match_super(struct super_block *, void *); +extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); +extern struct smb_vol *cifs_get_volume_info(char *mount_data, + const char *devname); +extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); +extern void cifs_umount(struct cifs_sb_info *); +extern void cifs_dfs_release_automount_timer(void); +void cifs_proc_init(void); +void cifs_proc_clean(void); + +extern int cifs_negotiate_protocol(unsigned int xid, + struct cifs_ses *ses); +extern int cifs_setup_session(unsigned int xid, struct cifs_ses *ses, + struct nls_table *nls_info); +extern int CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses); + +extern int CIFSTCon(unsigned int xid, struct cifs_ses *ses, + const char *tree, struct cifs_tcon *tcon, + const struct nls_table *); + +extern int CIFSFindFirst(const int xid, struct cifs_tcon *tcon, + const char *searchName, const struct nls_table *nls_codepage, + 
__u16 *searchHandle, struct cifs_search_info *psrch_inf, + int map, const char dirsep); + +extern int CIFSFindNext(const int xid, struct cifs_tcon *tcon, + __u16 searchHandle, struct cifs_search_info *psrch_inf); + +extern int CIFSFindClose(const int, struct cifs_tcon *tcon, + const __u16 search_handle); + +extern int CIFSSMBQFileInfo(const int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_ALL_INFO *pFindData); +extern int CIFSSMBQPathInfo(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_ALL_INFO *findData, + int legacy /* whether to use old info level */, + const struct nls_table *nls_codepage, int remap); +extern int SMBQueryInformation(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_ALL_INFO *findData, + const struct nls_table *nls_codepage, int remap); + +extern int CIFSSMBUnixQFileInfo(const int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_UNIX_BASIC_INFO *pFindData); +extern int CIFSSMBUnixQPathInfo(const int xid, + struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_UNIX_BASIC_INFO *pFindData, + const struct nls_table *nls_codepage, int remap); + +extern int CIFSGetDFSRefer(const int xid, struct cifs_ses *ses, + const unsigned char *searchName, + struct dfs_info3_param **target_nodes, + unsigned int *number_of_nodes_in_array, + const struct nls_table *nls_codepage, int remap); + +extern int get_dfs_path(int xid, struct cifs_ses *pSesInfo, + const char *old_path, + const struct nls_table *nls_codepage, + unsigned int *pnum_referrals, + struct dfs_info3_param **preferrals, + int remap); +extern void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + struct smb_vol *vol); +extern int CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData); +extern int SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData); +extern int CIFSSMBSetFSUnixInfo(const int xid, struct cifs_tcon *tcon, + __u64 cap); + +extern int CIFSSMBQFSAttributeInfo(const int xid, + struct cifs_tcon *tcon); +extern int CIFSSMBQFSDeviceInfo(const int xid, struct cifs_tcon *tcon); +extern int CIFSSMBQFSUnixInfo(const int xid, struct cifs_tcon *tcon); +extern int CIFSSMBQFSPosixInfo(const int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData); + +extern int CIFSSMBSetPathInfo(const int xid, struct cifs_tcon *tcon, + const char *fileName, const FILE_BASIC_INFO *data, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon, + const FILE_BASIC_INFO *data, __u16 fid, + __u32 pid_of_opener); +extern int CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon, + bool delete_file, __u16 fid, __u32 pid_of_opener); +#if 0 +extern int CIFSSMBSetAttrLegacy(int xid, struct cifs_tcon *tcon, + char *fileName, __u16 dos_attributes, + const struct nls_table *nls_codepage); +#endif /* possibly unneeded function */ +extern int CIFSSMBSetEOF(const int xid, struct cifs_tcon *tcon, + const char *fileName, __u64 size, + bool setAllocationSizeFlag, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon, + __u64 size, __u16 fileHandle, __u32 opener_pid, + bool AllocSizeFlag); + +struct cifs_unix_set_info_args { + __u64 ctime; + __u64 atime; + __u64 mtime; + __u64 mode; + __u64 uid; + __u64 gid; + dev_t device; +}; + +extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon, + const struct 
cifs_unix_set_info_args *args, + u16 fid, u32 pid_of_opener); + +extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifs_tcon *pTcon, + char *fileName, + const struct cifs_unix_set_info_args *args, + const struct nls_table *nls_codepage, + int remap_special_chars); + +extern int CIFSSMBMkDir(const int xid, struct cifs_tcon *tcon, + const char *newName, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBRmDir(const int xid, struct cifs_tcon *tcon, + const char *name, const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSPOSIXDelFile(const int xid, struct cifs_tcon *tcon, + const char *name, __u16 type, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBDelFile(const int xid, struct cifs_tcon *tcon, + const char *name, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBRename(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon, + int netfid, const char *target_name, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSCreateHardLink(const int xid, + struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSUnixCreateHardLink(const int xid, + struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSUnixCreateSymLink(const int xid, + struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage); +extern int CIFSSMBUnixQuerySymLink(const int xid, + struct cifs_tcon *tcon, + const unsigned char *searchName, char **syminfo, + const struct nls_table *nls_codepage); +#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL +extern int CIFSSMBQueryReparseLinkInfo(const int xid, + struct cifs_tcon *tcon, + const unsigned char *searchName, + char *symlinkinfo, const int buflen, __u16 fid, + const struct nls_table *nls_codepage); +#endif /* temporarily unused until cifs_symlink fixed */ +extern int CIFSSMBOpen(const int xid, struct cifs_tcon *tcon, + const char *fileName, const int disposition, + const int access_flags, const int omode, + __u16 *netfid, int *pOplock, FILE_ALL_INFO *, + const struct nls_table *nls_codepage, int remap); +extern int SMBLegacyOpen(const int xid, struct cifs_tcon *tcon, + const char *fileName, const int disposition, + const int access_flags, const int omode, + __u16 *netfid, int *pOplock, FILE_ALL_INFO *, + const struct nls_table *nls_codepage, int remap); +extern int CIFSPOSIXCreate(const int xid, struct cifs_tcon *tcon, + u32 posix_flags, __u64 mode, __u16 *netfid, + FILE_UNIX_BASIC_INFO *pRetData, + __u32 *pOplock, const char *name, + const struct nls_table *nls_codepage, int remap); +extern int CIFSSMBClose(const int xid, struct cifs_tcon *tcon, + const int smb_file_id); + +extern int CIFSSMBFlush(const int xid, struct cifs_tcon *tcon, + const int smb_file_id); + +extern int CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, char **buf, + int *return_buf_type); +extern int CIFSSMBWrite(const int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, const char *buf, + const char __user *ubuf, const int long_op); +extern int CIFSSMBWrite2(const int xid, struct cifs_io_parms 
*io_parms, + unsigned int *nbytes, struct kvec *iov, const int nvec, + const int long_op); +extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, __u64 *inode_number, + const struct nls_table *nls_codepage, + int remap_special_chars); + +extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, + const __u16 netfid, const __u64 len, + const __u64 offset, const __u32 numUnlock, + const __u32 numLock, const __u8 lockType, + const bool waitFlag, const __u8 oplock_level); +extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, + const __u16 smb_file_id, const int get_flag, + const __u64 len, struct file_lock *, + const __u16 lock_type, const bool waitFlag); +extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); +extern int CIFSSMBEcho(struct TCP_Server_Info *server); +extern int CIFSSMBLogoff(const int xid, struct cifs_ses *ses); + +extern struct cifs_ses *sesInfoAlloc(void); +extern void sesInfoFree(struct cifs_ses *); +extern struct cifs_tcon *tconInfoAlloc(void); +extern void tconInfoFree(struct cifs_tcon *); + +extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); +extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, + __u32 *); +extern int cifs_verify_signature(struct smb_hdr *, + struct TCP_Server_Info *server, + __u32 expected_sequence_number); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); +extern int setup_ntlm_response(struct cifs_ses *); +extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); +extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); +extern void cifs_crypto_shash_release(struct TCP_Server_Info *); +extern int calc_seckey(struct cifs_ses *); + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +extern int calc_lanman_hash(const char *password, const char *cryptkey, + bool encrypt, char *lnm_session_key); +#endif /* CIFS_WEAK_PW_HASH */ +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ +extern int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, + const int notify_subdirs, const __u16 netfid, + __u32 filter, struct file *file, int multishot, + const struct nls_table *nls_codepage); +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ +extern int CIFSSMBCopy(int xid, + struct cifs_tcon *source_tcon, + const char *fromName, + const __u16 target_tid, + const char *toName, const int flags, + const struct nls_table *nls_codepage, + int remap_special_chars); +extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + const unsigned char *ea_name, char *EAData, + size_t bufsize, const struct nls_table *nls_codepage, + int remap_special_chars); +extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, + const char *fileName, const char *ea_name, + const void *ea_value, const __u16 ea_value_len, + const struct nls_table *nls_codepage, int remap_special_chars); +extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, + __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); +extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, + struct cifs_ntsd *, __u32); +extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + char *acl_inf, const int buflen, const int acl_type, + const struct nls_table *nls_codepage, int remap_special_chars); +extern int CIFSSMBSetPosixACL(const int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const 
char *local_acl, const int buflen, const int acl_type, + const struct nls_table *nls_codepage, int remap_special_chars); +extern int CIFSGetExtAttr(const int xid, struct cifs_tcon *tcon, + const int netfid, __u64 *pExtAttrBits, __u64 *pMask); +extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); +extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); +extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, + const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid); +extern int mdfour(unsigned char *, unsigned char *, int); +extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, + unsigned char *p24); + +/* asynchronous write support */ +struct cifs_writedata { + struct kref refcount; + enum writeback_sync_modes sync_mode; + struct work_struct work; + struct cifsFileInfo *cfile; + __u64 offset; + unsigned int bytes; + int result; + unsigned int nr_pages; + struct page *pages[1]; +}; + +int cifs_async_writev(struct cifs_writedata *wdata); +struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages); +void cifs_writedata_release(struct kref *refcount); + +#endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c new file mode 100644 index 00000000..07132c4e --- /dev/null +++ b/fs/cifs/cifssmb.c @@ -0,0 +1,6088 @@ +/* + * fs/cifs/cifssmb.c + * + * Copyright (C) International Business Machines Corp., 2002,2010 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Contains the routines for constructing the SMB PDUs themselves + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */ + /* These are mostly routines that operate on a pathname, or on a tree id */ + /* (mounted volume), but there are eight handle based routines which must be */ + /* treated slightly differently for reconnection purposes since we never */ + /* want to reuse a stale file handle and only the caller knows the file info */ + +#include +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsacl.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" + +#ifdef CONFIG_CIFS_POSIX +static struct { + int index; + char *name; +} protocols[] = { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + {LANMAN_PROT, "\2LM1.2X002"}, + {LANMAN2_PROT, "\2LANMAN2.1"}, +#endif /* weak password hashing for legacy clients */ + {CIFS_PROT, "\2NT LM 0.12"}, + {POSIX_PROT, "\2POSIX 2"}, + {BAD_PROT, "\2"} +}; +#else +static struct { + int index; + char *name; +} protocols[] = { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + {LANMAN_PROT, "\2LM1.2X002"}, + {LANMAN2_PROT, "\2LANMAN2.1"}, +#endif /* weak password hashing for legacy clients */ + {CIFS_PROT, "\2NT LM 0.12"}, + {BAD_PROT, "\2"} +}; +#endif + +/* define the number of elements in the cifs dialect array */ +#ifdef CONFIG_CIFS_POSIX +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFS_NUM_PROT 4 +#else +#define CIFS_NUM_PROT 2 +#endif /* CIFS_WEAK_PW_HASH */ +#else /* not posix */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFS_NUM_PROT 3 +#else +#define CIFS_NUM_PROT 1 +#endif /* CONFIG_CIFS_WEAK_PW_HASH */ +#endif /* CIFS_POSIX */ + +/* Mark as invalid, all open files on tree connections since they + were closed when session to server was lost */ +static void mark_open_files_invalid(struct cifs_tcon *pTcon) +{ + struct cifsFileInfo *open_file = NULL; + struct list_head *tmp; + struct list_head *tmp1; + +/* list all files open on tree connection and mark them invalid */ + spin_lock(&cifs_file_list_lock); + list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { + open_file = list_entry(tmp, struct cifsFileInfo, tlist); + open_file->invalidHandle = true; + open_file->oplock_break_cancelled = true; + } + spin_unlock(&cifs_file_list_lock); + /* BB Add call to invalidate_inodes(sb) for all superblocks mounted + to this tcon */ +} + +/* reconnect the socket, tcon, and smb session if needed */ +static int +cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) +{ + int rc = 0; + struct cifs_ses *ses; + struct TCP_Server_Info *server; + struct nls_table *nls_codepage; + + /* + * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for + * tcp and smb session status done differently for those three - in the + * calling routine + */ + if (!tcon) + return 0; + + ses = tcon->ses; + server = ses->server; + + /* + * only tree disconnect, open, and write, (and ulogoff which does not + * have tcon) are allowed as we start force umount + */ + if (tcon->tidStatus == CifsExiting) { + if (smb_command != SMB_COM_WRITE_ANDX && + smb_command != SMB_COM_OPEN_ANDX && + smb_command != SMB_COM_TREE_DISCONNECT) { + cFYI(1, "can not send cmd %d while umounting", + smb_command); + return -ENODEV; + } + } + + /* + * Give demultiplex thread up to 10 seconds to reconnect, should be + * greater than cifs socket timeout which 
is 7 seconds + */ + while (server->tcpStatus == CifsNeedReconnect) { + wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), 10 * HZ); + + /* are we still trying to reconnect? */ + if (server->tcpStatus != CifsNeedReconnect) + break; + + /* + * on "soft" mounts we wait once. Hard mounts keep + * retrying until process is killed or server comes + * back on-line + */ + if (!tcon->retry) { + cFYI(1, "gave up waiting on reconnect in smb_init"); + return -EHOSTDOWN; + } + } + + if (!ses->need_reconnect && !tcon->need_reconnect) + return 0; + + nls_codepage = load_nls_default(); + + /* + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(0, ses); + if (rc == 0 && ses->need_reconnect) + rc = cifs_setup_session(0, ses, nls_codepage); + + /* do we need to reconnect tcon? */ + if (rc || !tcon->need_reconnect) { + mutex_unlock(&ses->session_mutex); + goto out; + } + + mark_open_files_invalid(tcon); + rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); + mutex_unlock(&ses->session_mutex); + cFYI(1, "reconnect tcon rc = %d", rc); + + if (rc) + goto out; + + /* + * FIXME: check if wsize needs updated due to negotiated smb buffer + * size shrinking + */ + atomic_inc(&tconInfoReconnectCount); + + /* tell server Unix caps we support */ + if (ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0, tcon, NULL, NULL); + + /* + * Removed call to reopen open files here. It is safer (and faster) to + * reopen files one at a time as needed in read and write. + * + * FIXME: what about file locks? don't we need to reclaim them ASAP? + */ + +out: + /* + * Check if handle based operation so we know whether we can continue + * or not without returning to caller to reset file handle + */ + switch (smb_command) { + case SMB_COM_READ_ANDX: + case SMB_COM_WRITE_ANDX: + case SMB_COM_CLOSE: + case SMB_COM_FIND_CLOSE2: + case SMB_COM_LOCKING_ANDX: + rc = -EAGAIN; + } + + unload_nls(nls_codepage); + return rc; +} + +/* Allocate and return pointer to an SMB request buffer, and set basic + SMB information in the SMB header. If the return code is zero, this + function must have filled in request_buf pointer */ +static int +small_smb_init(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf) +{ + int rc; + + rc = cifs_reconnect_tcon(tcon, smb_command); + if (rc) + return rc; + + *request_buf = cifs_small_buf_get(); + if (*request_buf == NULL) { + /* BB should we add a retry in here if not a writepage? 
*/ + return -ENOMEM; + } + + header_assemble((struct smb_hdr *) *request_buf, smb_command, + tcon, wct); + + if (tcon != NULL) + cifs_stats_inc(&tcon->num_smbs_sent); + + return 0; +} + +int +small_smb_init_no_tc(const int smb_command, const int wct, + struct cifs_ses *ses, void **request_buf) +{ + int rc; + struct smb_hdr *buffer; + + rc = small_smb_init(smb_command, wct, NULL, request_buf); + if (rc) + return rc; + + buffer = (struct smb_hdr *)*request_buf; + buffer->Mid = GetNextMid(ses->server); + if (ses->capabilities & CAP_UNICODE) + buffer->Flags2 |= SMBFLG2_UNICODE; + if (ses->capabilities & CAP_STATUS32) + buffer->Flags2 |= SMBFLG2_ERR_STATUS; + + /* uid, tid can stay at zero as set in header assemble */ + + /* BB add support for turning on the signing when + this function is used after 1st of session setup requests */ + + return rc; +} + +/* If the return code is zero, this function must fill in request_buf pointer */ +static int +__smb_init(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf, void **response_buf) +{ + *request_buf = cifs_buf_get(); + if (*request_buf == NULL) { + /* BB should we add a retry in here if not a writepage? */ + return -ENOMEM; + } + /* Although the original thought was we needed the response buf for */ + /* potential retries of smb operations it turns out we can determine */ + /* from the mid flags when the request buffer can be resent without */ + /* having to use a second distinct buffer for the response */ + if (response_buf) + *response_buf = *request_buf; + + header_assemble((struct smb_hdr *) *request_buf, smb_command, tcon, + wct); + + if (tcon != NULL) + cifs_stats_inc(&tcon->num_smbs_sent); + + return 0; +} + +/* If the return code is zero, this function must fill in request_buf pointer */ +static int +smb_init(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf, void **response_buf) +{ + int rc; + + rc = cifs_reconnect_tcon(tcon, smb_command); + if (rc) + return rc; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); +} + +static int +smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon, + void **request_buf, void **response_buf) +{ + if (tcon->ses->need_reconnect || tcon->need_reconnect) + return -EHOSTDOWN; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); +} + +static int validate_t2(struct smb_t2_rsp *pSMB) +{ + unsigned int total_size; + + /* check for plausible wct */ + if (pSMB->hdr.WordCount < 10) + goto vt2_err; + + /* check for parm and data offset going beyond end of smb */ + if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 || + get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024) + goto vt2_err; + + total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount); + if (total_size >= 512) + goto vt2_err; + + /* check that bcc is at least as big as parms + data, and that it is + * less than negotiated smb buffer + */ + total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount); + if (total_size > get_bcc(&pSMB->hdr) || + total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) + goto vt2_err; + + return 0; +vt2_err: + cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, + sizeof(struct smb_t2_rsp) + 16); + return -EINVAL; +} + +static inline void inc_rfc1001_len(void *pSMB, int count) +{ + struct smb_hdr *hdr = (struct smb_hdr *)pSMB; + + be32_add_cpu(&hdr->smb_buf_length, count); +} + +int +CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) +{ + NEGOTIATE_REQ *pSMB; + NEGOTIATE_RSP *pSMBr; + int rc = 0; + int 
bytes_returned; + int i; + struct TCP_Server_Info *server; + u16 count; + unsigned int secFlags; + + if (ses->server) + server = ses->server; + else { + rc = -EIO; + return rc; + } + rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ , + (void **) &pSMB, (void **) &pSMBr); + if (rc) + return rc; + + /* if any of auth flags (ie not sign or seal) are overriden use them */ + if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) + secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */ + else /* if override flags set only sign/seal OR them with global auth */ + secFlags = global_secflags | ses->overrideSecFlg; + + cFYI(1, "secFlags 0x%x", secFlags); + + pSMB->hdr.Mid = GetNextMid(server); + pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); + + if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { + cFYI(1, "Kerberos only mechanism, enable extended security"); + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { + cFYI(1, "NTLMSSP only mechanism, enable extended security"); + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + } + + count = 0; + for (i = 0; i < CIFS_NUM_PROT; i++) { + strncpy(pSMB->DialectsArray+count, protocols[i].name, 16); + count += strlen(protocols[i].name) + 1; + /* null at end of source and target buffers anyway */ + } + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc != 0) + goto neg_err_exit; + + server->dialect = le16_to_cpu(pSMBr->DialectIndex); + cFYI(1, "Dialect: %d", server->dialect); + /* Check wct = 1 error case */ + if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) { + /* core returns wct = 1, but we do not ask for core - otherwise + small wct just comes when dialect index is -1 indicating we + could not negotiate a common dialect */ + rc = -EOPNOTSUPP; + goto neg_err_exit; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + } else if ((pSMBr->hdr.WordCount == 13) + && ((server->dialect == LANMAN_PROT) + || (server->dialect == LANMAN2_PROT))) { + __s16 tmp; + struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; + + if ((secFlags & CIFSSEC_MAY_LANMAN) || + (secFlags & CIFSSEC_MAY_PLNTXT)) + server->secType = LANMAN; + else { + cERROR(1, "mount failed weak security disabled" + " in /proc/fs/cifs/SecurityFlags"); + rc = -EOPNOTSUPP; + goto neg_err_exit; + } + server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); + server->maxReq = le16_to_cpu(rsp->MaxMpxCount); + server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), + (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); + /* even though we do not use raw we might as well set this + accurately, in case we ever find a need for it */ + if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { + server->max_rw = 0xFF00; + server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; + } else { + server->max_rw = 0;/* do not need to use raw anyway */ + server->capabilities = CAP_MPX_MODE; + } + tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); + if (tmp == -1) { + /* OS/2 often does not set timezone therefore + * we must use server time to calc time zone. + * Could deviate slightly from the right zone. 
+ * Smallest defined timezone difference is 15 minutes + * (i.e. Nepal). Rounding up/down is done to match + * this requirement. + */ + int val, seconds, remain, result; + struct timespec ts, utc; + utc = CURRENT_TIME; + ts = cnvrtDosUnixTm(rsp->SrvTime.Date, + rsp->SrvTime.Time, 0); + cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d", + (int)ts.tv_sec, (int)utc.tv_sec, + (int)(utc.tv_sec - ts.tv_sec)); + val = (int)(utc.tv_sec - ts.tv_sec); + seconds = abs(val); + result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; + remain = seconds % MIN_TZ_ADJ; + if (remain >= (MIN_TZ_ADJ / 2)) + result += MIN_TZ_ADJ; + if (val < 0) + result = -result; + server->timeAdj = result; + } else { + server->timeAdj = (int)tmp; + server->timeAdj *= 60; /* also in seconds */ + } + cFYI(1, "server->timeAdj: %d seconds", server->timeAdj); + + + /* BB get server time for time conversions and add + code to use it and timezone since this is not UTC */ + + if (rsp->EncryptionKeyLength == + cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { + memcpy(ses->server->cryptkey, rsp->EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { + rc = -EIO; /* need cryptkey unless plain text */ + goto neg_err_exit; + } + + cFYI(1, "LANMAN negotiated"); + /* we will not end up setting signing flags - as no signing + was in LANMAN and server did not return the flags on */ + goto signing_check; +#else /* weak security disabled */ + } else if (pSMBr->hdr.WordCount == 13) { + cERROR(1, "mount failed, cifs module not built " + "with CIFS_WEAK_PW_HASH support"); + rc = -EOPNOTSUPP; +#endif /* WEAK_PW_HASH */ + goto neg_err_exit; + } else if (pSMBr->hdr.WordCount != 17) { + /* unknown wct */ + rc = -EOPNOTSUPP; + goto neg_err_exit; + } + /* else wct == 17 NTLM */ + server->sec_mode = pSMBr->SecurityMode; + if ((server->sec_mode & SECMODE_USER) == 0) + cFYI(1, "share mode security"); + + if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0) +#ifdef CONFIG_CIFS_WEAK_PW_HASH + if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) +#endif /* CIFS_WEAK_PW_HASH */ + cERROR(1, "Server requests plain text password" + " but client support disabled"); + + if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) + server->secType = NTLMv2; + else if (secFlags & CIFSSEC_MAY_NTLM) + server->secType = NTLM; + else if (secFlags & CIFSSEC_MAY_NTLMV2) + server->secType = NTLMv2; + else if (secFlags & CIFSSEC_MAY_KRB5) + server->secType = Kerberos; + else if (secFlags & CIFSSEC_MAY_NTLMSSP) + server->secType = RawNTLMSSP; + else if (secFlags & CIFSSEC_MAY_LANMAN) + server->secType = LANMAN; + else { + rc = -EOPNOTSUPP; + cERROR(1, "Invalid security type"); + goto neg_err_exit; + } + /* else ... any others ...? 
*/ + + /* one byte, so no need to convert this or EncryptionKeyLen from + little endian */ + server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); + /* probably no need to store and check maxvcs */ + server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), + (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); + cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); + server->capabilities = le32_to_cpu(pSMBr->Capabilities); + server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); + server->timeAdj *= 60; + if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { + memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || + server->capabilities & CAP_EXTENDED_SECURITY) && + (pSMBr->EncryptionKeyLength == 0)) { + /* decode security blob */ + count = get_bcc(&pSMBr->hdr); + if (count < 16) { + rc = -EIO; + goto neg_err_exit; + } + spin_lock(&cifs_tcp_ses_lock); + if (server->srv_count > 1) { + spin_unlock(&cifs_tcp_ses_lock); + if (memcmp(server->server_GUID, + pSMBr->u.extended_response. + GUID, 16) != 0) { + cFYI(1, "server UID changed"); + memcpy(server->server_GUID, + pSMBr->u.extended_response.GUID, + 16); + } + } else { + spin_unlock(&cifs_tcp_ses_lock); + memcpy(server->server_GUID, + pSMBr->u.extended_response.GUID, 16); + } + + if (count == 16) { + server->secType = RawNTLMSSP; + } else { + rc = decode_negTokenInit(pSMBr->u.extended_response. + SecurityBlob, count - 16, + server); + if (rc == 1) + rc = 0; + else + rc = -EINVAL; + if (server->secType == Kerberos) { + if (!server->sec_kerberos && + !server->sec_mskerberos) + rc = -EOPNOTSUPP; + } else if (server->secType == RawNTLMSSP) { + if (!server->sec_ntlmssp) + rc = -EOPNOTSUPP; + } else + rc = -EOPNOTSUPP; + } + } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { + rc = -EIO; /* no crypt key only if plain text pwd */ + goto neg_err_exit; + } else + server->capabilities &= ~CAP_EXTENDED_SECURITY; + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +signing_check: +#endif + if ((secFlags & CIFSSEC_MAY_SIGN) == 0) { + /* MUST_SIGN already includes the MAY_SIGN FLAG + so if this is zero it means that signing is disabled */ + cFYI(1, "Signing disabled"); + if (server->sec_mode & SECMODE_SIGN_REQUIRED) { + cERROR(1, "Server requires " + "packet signing to be enabled in " + "/proc/fs/cifs/SecurityFlags."); + rc = -EOPNOTSUPP; + } + server->sec_mode &= + ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { + /* signing required */ + cFYI(1, "Must sign - secFlags 0x%x", secFlags); + if ((server->sec_mode & + (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { + cERROR(1, "signing required but server lacks support"); + rc = -EOPNOTSUPP; + } else + server->sec_mode |= SECMODE_SIGN_REQUIRED; + } else { + /* signing optional ie CIFSSEC_MAY_SIGN */ + if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0) + server->sec_mode &= + ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + } + +neg_err_exit: + cifs_buf_release(pSMB); + + cFYI(1, "negprot rc %d", rc); + return rc; +} + +int +CIFSSMBTDis(const int xid, struct cifs_tcon *tcon) +{ + struct smb_hdr *smb_buffer; + int rc = 0; + + cFYI(1, "In tree disconnect"); + + /* BB: do we need to check this? These should never be NULL. */ + if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) + return -EIO; + + /* + * No need to return error on this operation if tid invalidated and + * closed on server already e.g. 
due to tcp session crashing. Also, + * the tcon is no longer on the list, so no need to take lock before + * checking this. + */ + if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) + return 0; + + rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon, + (void **)&smb_buffer); + if (rc) + return rc; + + rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0); + if (rc) + cFYI(1, "Tree disconnect failed %d", rc); + + /* No need to return error on this operation if tid invalidated and + closed on server already e.g. due to tcp session crashing */ + if (rc == -EAGAIN) + rc = 0; + + return rc; +} + +/* + * This is a no-op for now. We're not really interested in the reply, but + * rather in the fact that the server sent one and that server->lstrp + * gets updated. + * + * FIXME: maybe we should consider checking that the reply matches request? + */ +static void +cifs_echo_callback(struct mid_q_entry *mid) +{ + struct TCP_Server_Info *server = mid->callback_data; + + DeleteMidQEntry(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); +} + +int +CIFSSMBEcho(struct TCP_Server_Info *server) +{ + ECHO_REQ *smb; + int rc = 0; + struct kvec iov; + + cFYI(1, "In echo request"); + + rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb); + if (rc) + return rc; + + /* set up echo request */ + smb->hdr.Tid = 0xffff; + smb->hdr.WordCount = 1; + put_unaligned_le16(1, &smb->EchoCount); + put_bcc(1, &smb->hdr); + smb->Data[0] = 'a'; + inc_rfc1001_len(smb, 3); + iov.iov_base = smb; + iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + + rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true); + if (rc) + cFYI(1, "Echo request failed: %d", rc); + + cifs_small_buf_release(smb); + + return rc; +} + +int +CIFSSMBLogoff(const int xid, struct cifs_ses *ses) +{ + LOGOFF_ANDX_REQ *pSMB; + int rc = 0; + + cFYI(1, "In SMBLogoff for session disconnect"); + + /* + * BB: do we need to check validity of ses and server? They should + * always be valid since we have an active reference. 
If not, that + * should probably be a BUG() + */ + if (!ses || !ses->server) + return -EIO; + + mutex_lock(&ses->session_mutex); + if (ses->need_reconnect) + goto session_already_dead; /* no need to send SMBlogoff if uid + already closed due to reconnect */ + rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); + if (rc) { + mutex_unlock(&ses->session_mutex); + return rc; + } + + pSMB->hdr.Mid = GetNextMid(ses->server); + + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + pSMB->hdr.Uid = ses->Suid; + + pSMB->AndXCommand = 0xFF; + rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); +session_already_dead: + mutex_unlock(&ses->session_mutex); + + /* if session dead then we do not need to do ulogoff, + since server closed smb session, no sense reporting + error */ + if (rc == -EAGAIN) + rc = 0; + return rc; +} + +int +CIFSPOSIXDelFile(const int xid, struct cifs_tcon *tcon, const char *fileName, + __u16 type, const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + struct unlink_psx_rq *pRqD; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cFYI(1, "In POSIX delete"); +PsxDelete: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB add path length overrun check */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = 0; /* BB double check this with jra */ + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + + /* Setup pointer to Request Data (inode type) */ + pRqD = (struct unlink_psx_rq *)(((char *)&pSMB->hdr.Protocol) + offset); + pRqD->type = cpu_to_le16(type); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + sizeof(struct unlink_psx_rq); + + pSMB->DataCount = cpu_to_le16(sizeof(struct unlink_psx_rq)); + pSMB->TotalDataCount = cpu_to_le16(sizeof(struct unlink_psx_rq)); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_UNLINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "Posix delete returned %d", rc); + cifs_buf_release(pSMB); + + cifs_stats_inc(&tcon->num_deletes); + + if (rc == -EAGAIN) + goto PsxDelete; + + return rc; +} + +int +CIFSSMBDelFile(const int xid, struct cifs_tcon *tcon, const char *fileName, + const struct nls_table *nls_codepage, int remap) +{ + DELETE_FILE_REQ *pSMB = NULL; + DELETE_FILE_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; 
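+
+	/*
+	 * Path based request: if the send returns -EAGAIN (typically after
+	 * the session was reconnected) the SMB is simply rebuilt and resent
+	 * from the DelFileRetry label below, unlike the handle based calls
+	 * where only the caller can safely retry.
+	 */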
+ +DelFileRetry: + rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->fileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->fileName, fileName, name_len); + } + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM); + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_deletes); + if (rc) + cFYI(1, "Error in RMFile = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto DelFileRetry; + + return rc; +} + +int +CIFSSMBRmDir(const int xid, struct cifs_tcon *tcon, const char *dirName, + const struct nls_table *nls_codepage, int remap) +{ + DELETE_DIRECTORY_REQ *pSMB = NULL; + DELETE_DIRECTORY_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + + cFYI(1, "In CIFSSMBRmDir"); +RmDirRetry: + rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + name_len = strnlen(dirName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->DirName, dirName, name_len); + } + + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_rmdirs); + if (rc) + cFYI(1, "Error in RMDir = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto RmDirRetry; + return rc; +} + +int +CIFSSMBMkDir(const int xid, struct cifs_tcon *tcon, + const char *name, const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + CREATE_DIRECTORY_REQ *pSMB = NULL; + CREATE_DIRECTORY_RSP *pSMBr = NULL; + int bytes_returned; + int name_len; + + cFYI(1, "In CIFSSMBMkDir"); +MkDirRetry: + rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + name_len = strnlen(name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->DirName, name, name_len); + } + + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_mkdirs); + if (rc) + cFYI(1, "Error in Mkdir = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto MkDirRetry; + return rc; +} + +int +CIFSPOSIXCreate(const int xid, struct cifs_tcon *tcon, __u32 posix_flags, + __u64 mode, __u16 *netfid, FILE_UNIX_BASIC_INFO *pRetData, + __u32 *pOplock, const char *name, + const struct 
nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count, count; + OPEN_PSX_REQ *pdata; + OPEN_PSX_RSP *psx_rsp; + + cFYI(1, "In POSIX Create"); +PsxCreat: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, name, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(name, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, name, name_len); + } + + params = 6 + name_len; + count = sizeof(OPEN_PSX_REQ); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* large enough */ + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset); + pdata->Level = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); + pdata->Permissions = cpu_to_le64(mode); + pdata->PosixOpenFlags = cpu_to_le32(posix_flags); + pdata->OpenFlags = cpu_to_le32(*pOplock); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Posix create returned %d", rc); + goto psx_create_err; + } + + cFYI(1, "copying inode info"); + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) { + rc = -EIO; /* bad smb */ + goto psx_create_err; + } + + /* copy return information to pRetData */ + psx_rsp = (OPEN_PSX_RSP *)((char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset)); + + *pOplock = le16_to_cpu(psx_rsp->OplockFlags); + if (netfid) + *netfid = psx_rsp->Fid; /* cifs fid stays in le */ + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? 
*/ + if (cpu_to_le32(FILE_CREATE) == psx_rsp->CreateAction) + *pOplock |= CIFS_CREATE_ACTION; + /* check to make sure response data is there */ + if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) { + pRetData->Type = cpu_to_le32(-1); /* unknown */ + cFYI(DBG2, "unknown type"); + } else { + if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP) + + sizeof(FILE_UNIX_BASIC_INFO)) { + cERROR(1, "Open response data too small"); + pRetData->Type = cpu_to_le32(-1); + goto psx_create_err; + } + memcpy((char *) pRetData, + (char *)psx_rsp + sizeof(OPEN_PSX_RSP), + sizeof(FILE_UNIX_BASIC_INFO)); + } + +psx_create_err: + cifs_buf_release(pSMB); + + if (posix_flags & SMB_O_DIRECTORY) + cifs_stats_inc(&tcon->num_posixmkdirs); + else + cifs_stats_inc(&tcon->num_posixopens); + + if (rc == -EAGAIN) + goto PsxCreat; + + return rc; +} + +static __u16 convert_disposition(int disposition) +{ + __u16 ofun = 0; + + switch (disposition) { + case FILE_SUPERSEDE: + ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; + break; + case FILE_OPEN: + ofun = SMBOPEN_OAPPEND; + break; + case FILE_CREATE: + ofun = SMBOPEN_OCREATE; + break; + case FILE_OPEN_IF: + ofun = SMBOPEN_OCREATE | SMBOPEN_OAPPEND; + break; + case FILE_OVERWRITE: + ofun = SMBOPEN_OTRUNC; + break; + case FILE_OVERWRITE_IF: + ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; + break; + default: + cFYI(1, "unknown disposition %d", disposition); + ofun = SMBOPEN_OAPPEND; /* regular open */ + } + return ofun; +} + +static int +access_flags_to_smbopen_mode(const int access_flags) +{ + int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE); + + if (masked_flags == GENERIC_READ) + return SMBOPEN_READ; + else if (masked_flags == GENERIC_WRITE) + return SMBOPEN_WRITE; + + /* just go for read/write */ + return SMBOPEN_READWRITE; +} + +int +SMBLegacyOpen(const int xid, struct cifs_tcon *tcon, + const char *fileName, const int openDisposition, + const int access_flags, const int create_options, __u16 *netfid, + int *pOplock, FILE_ALL_INFO *pfile_info, + const struct nls_table *nls_codepage, int remap) +{ + int rc = -EACCES; + OPENX_REQ *pSMB = NULL; + OPENX_RSP *pSMBr = NULL; + int bytes_returned; + int name_len; + __u16 count; + +OldOpenRetry: + rc = smb_init(SMB_COM_OPEN_ANDX, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->AndXCommand = 0xFF; /* none */ + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + count = 1; /* account for one byte pad to word boundary */ + name_len = + cifsConvertToUCS((__le16 *) (pSMB->fileName + 1), + fileName, PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve check for buffer overruns BB */ + count = 0; /* no pad */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->fileName, fileName, name_len); + } + if (*pOplock & REQ_OPLOCK) + pSMB->OpenFlags = cpu_to_le16(REQ_OPLOCK); + else if (*pOplock & REQ_BATCHOPLOCK) + pSMB->OpenFlags = cpu_to_le16(REQ_BATCHOPLOCK); + + pSMB->OpenFlags |= cpu_to_le16(REQ_MORE_INFO); + pSMB->Mode = cpu_to_le16(access_flags_to_smbopen_mode(access_flags)); + pSMB->Mode |= cpu_to_le16(0x40); /* deny none */ + /* set file as system file if special file such + as fifo and server expecting SFU style and + no Unix extensions */ + + if (create_options & CREATE_OPTION_SPECIAL) + pSMB->FileAttributes = cpu_to_le16(ATTR_SYSTEM); + else /* BB FIXME BB */ + pSMB->FileAttributes = cpu_to_le16(0/*ATTR_NORMAL*/); + + if (create_options & CREATE_OPTION_READONLY) + pSMB->FileAttributes |= 
cpu_to_le16(ATTR_READONLY); + + /* BB FIXME BB */ +/* pSMB->CreateOptions = cpu_to_le32(create_options & + CREATE_OPTIONS_MASK); */ + /* BB FIXME END BB */ + + pSMB->Sattr = cpu_to_le16(ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY); + pSMB->OpenFunction = cpu_to_le16(convert_disposition(openDisposition)); + count += name_len; + inc_rfc1001_len(pSMB, count); + + pSMB->ByteCount = cpu_to_le16(count); + /* long_op set to 1 to allow for oplock break timeouts */ + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *)pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_opens); + if (rc) { + cFYI(1, "Error in Open = %d", rc); + } else { + /* BB verify if wct == 15 */ + +/* *pOplock = pSMBr->OplockLevel; */ /* BB take from action field*/ + + *netfid = pSMBr->Fid; /* cifs fid stays in le */ + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + /* BB FIXME BB */ +/* if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction) + *pOplock |= CIFS_CREATE_ACTION; */ + /* BB FIXME END */ + + if (pfile_info) { + pfile_info->CreationTime = 0; /* BB convert CreateTime*/ + pfile_info->LastAccessTime = 0; /* BB fixme */ + pfile_info->LastWriteTime = 0; /* BB fixme */ + pfile_info->ChangeTime = 0; /* BB fixme */ + pfile_info->Attributes = + cpu_to_le32(le16_to_cpu(pSMBr->FileAttributes)); + /* the file_info buf is endian converted by caller */ + pfile_info->AllocationSize = + cpu_to_le64(le32_to_cpu(pSMBr->EndOfFile)); + pfile_info->EndOfFile = pfile_info->AllocationSize; + pfile_info->NumberOfLinks = cpu_to_le32(1); + pfile_info->DeletePending = 0; + } + } + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto OldOpenRetry; + return rc; +} + +int +CIFSSMBOpen(const int xid, struct cifs_tcon *tcon, + const char *fileName, const int openDisposition, + const int access_flags, const int create_options, __u16 *netfid, + int *pOplock, FILE_ALL_INFO *pfile_info, + const struct nls_table *nls_codepage, int remap) +{ + int rc = -EACCES; + OPEN_REQ *pSMB = NULL; + OPEN_RSP *pSMBr = NULL; + int bytes_returned; + int name_len; + __u16 count; + +openRetry: + rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->AndXCommand = 0xFF; /* none */ + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + count = 1; /* account for one byte pad to word boundary */ + name_len = + cifsConvertToUCS((__le16 *) (pSMB->fileName + 1), + fileName, PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->NameLength = cpu_to_le16(name_len); + } else { /* BB improve check for buffer overruns BB */ + count = 0; /* no pad */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + pSMB->NameLength = cpu_to_le16(name_len); + strncpy(pSMB->fileName, fileName, name_len); + } + if (*pOplock & REQ_OPLOCK) + pSMB->OpenFlags = cpu_to_le32(REQ_OPLOCK); + else if (*pOplock & REQ_BATCHOPLOCK) + pSMB->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK); + pSMB->DesiredAccess = cpu_to_le32(access_flags); + pSMB->AllocationSize = 0; + /* set file as system file if special file such + as fifo and server expecting SFU style and + no Unix extensions */ + if (create_options & CREATE_OPTION_SPECIAL) + pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM); + else + pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL); + + /* XP does not handle ATTR_POSIX_SEMANTICS */ + /* but it helps speed up case sensitive checks for other + servers such as Samba */ + if (tcon->ses->capabilities & 
CAP_UNIX) + pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); + + if (create_options & CREATE_OPTION_READONLY) + pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY); + + pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); + pSMB->CreateDisposition = cpu_to_le32(openDisposition); + pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); + /* BB Expirement with various impersonation levels and verify */ + pSMB->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION); + pSMB->SecurityFlags = + SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY; + + count += name_len; + inc_rfc1001_len(pSMB, count); + + pSMB->ByteCount = cpu_to_le16(count); + /* long_op set to 1 to allow for oplock break timeouts */ + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *)pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_opens); + if (rc) { + cFYI(1, "Error in Open = %d", rc); + } else { + *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ + *netfid = pSMBr->Fid; /* cifs fid stays in le */ + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction) + *pOplock |= CIFS_CREATE_ACTION; + if (pfile_info) { + memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime, + 36 /* CreationTime to Attributes */); + /* the file_info buf is endian converted by caller */ + pfile_info->AllocationSize = pSMBr->AllocationSize; + pfile_info->EndOfFile = pSMBr->EndOfFile; + pfile_info->NumberOfLinks = cpu_to_le32(1); + pfile_info->DeletePending = 0; + } + } + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto openRetry; + return rc; +} + +int +CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, + char **buf, int *pbuf_type) +{ + int rc = -EACCES; + READ_REQ *pSMB = NULL; + READ_RSP *pSMBr = NULL; + char *pReadData = NULL; + int wct; + int resp_buf_type = 0; + struct kvec iov[1]; + __u32 pid = io_parms->pid; + __u16 netfid = io_parms->netfid; + __u64 offset = io_parms->offset; + struct cifs_tcon *tcon = io_parms->tcon; + unsigned int count = io_parms->length; + + cFYI(1, "Reading %d bytes on fid %d", count, netfid); + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 12; + else { + wct = 10; /* old style read */ + if ((offset >> 32) > 0) { + /* can not handle this big offset for old */ + return -EIO; + } + } + + *nbytes = 0; + rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16)); + + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; + + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; + pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); + if (wct == 12) + pSMB->OffsetHigh = cpu_to_le32(offset >> 32); + + pSMB->Remaining = 0; + pSMB->MaxCount = cpu_to_le16(count & 0xFFFF); + pSMB->MaxCountHigh = cpu_to_le32(count >> 16); + if (wct == 12) + pSMB->ByteCount = 0; /* no need to do le conversion since 0 */ + else { + /* old style read */ + struct smb_com_readx_req *pSMBW = + (struct smb_com_readx_req *)pSMB; + pSMBW->ByteCount = 0; + } + + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; + rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, + &resp_buf_type, CIFS_LOG_ERROR); + cifs_stats_inc(&tcon->num_reads); + pSMBr = (READ_RSP *)iov[0].iov_base; + if (rc) { + 
cERROR(1, "Send error in read = %d", rc); + } else { + int data_length = le16_to_cpu(pSMBr->DataLengthHigh); + data_length = data_length << 16; + data_length += le16_to_cpu(pSMBr->DataLength); + *nbytes = data_length; + + /*check that DataLength would not go beyond end of SMB */ + if ((data_length > CIFSMaxBufSize) + || (data_length > count)) { + cFYI(1, "bad length %d for count %d", + data_length, count); + rc = -EIO; + *nbytes = 0; + } else { + pReadData = (char *) (&pSMBr->hdr.Protocol) + + le16_to_cpu(pSMBr->DataOffset); +/* if (rc = copy_to_user(buf, pReadData, data_length)) { + cERROR(1, "Faulting on read rc = %d",rc); + rc = -EFAULT; + }*/ /* can not use copy_to_user when using page cache*/ + if (*buf) + memcpy(*buf, pReadData, data_length); + } + } + +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + if (*buf) { + if (resp_buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(iov[0].iov_base); + else if (resp_buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); + } else if (resp_buf_type != CIFS_NO_BUFFER) { + /* return buffer to caller to free */ + *buf = iov[0].iov_base; + if (resp_buf_type == CIFS_SMALL_BUFFER) + *pbuf_type = CIFS_SMALL_BUFFER; + else if (resp_buf_type == CIFS_LARGE_BUFFER) + *pbuf_type = CIFS_LARGE_BUFFER; + } /* else no valid buffer on return - leave as null */ + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; +} + + +int +CIFSSMBWrite(const int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, const char *buf, + const char __user *ubuf, const int long_op) +{ + int rc = -EACCES; + WRITE_REQ *pSMB = NULL; + WRITE_RSP *pSMBr = NULL; + int bytes_returned, wct; + __u32 bytes_sent; + __u16 byte_count; + __u32 pid = io_parms->pid; + __u16 netfid = io_parms->netfid; + __u64 offset = io_parms->offset; + struct cifs_tcon *tcon = io_parms->tcon; + unsigned int count = io_parms->length; + + *nbytes = 0; + + /* cFYI(1, "write at %lld %d bytes", offset, count);*/ + if (tcon->ses == NULL) + return -ECONNABORTED; + + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 14; + else { + wct = 12; + if ((offset >> 32) > 0) { + /* can not handle big offset for old srv */ + return -EIO; + } + } + + rc = smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16)); + + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; + + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; + pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); + if (wct == 14) + pSMB->OffsetHigh = cpu_to_le32(offset >> 32); + + pSMB->Reserved = 0xFFFFFFFF; + pSMB->WriteMode = 0; + pSMB->Remaining = 0; + + /* Can increase buffer size if buffer is big enough in some cases ie we + can send more if LARGE_WRITE_X capability returned by the server and if + our buffer is big enough or if we convert to iovecs on socket writes + and eliminate the copy to the CIFS buffer */ + if (tcon->ses->capabilities & CAP_LARGE_WRITE_X) { + bytes_sent = min_t(const unsigned int, CIFSMaxBufSize, count); + } else { + bytes_sent = (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) + & ~0xFF; + } + + if (bytes_sent > count) + bytes_sent = count; + pSMB->DataOffset = + cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); + if (buf) + memcpy(pSMB->Data, buf, bytes_sent); + else if (ubuf) { + if 
(copy_from_user(pSMB->Data, ubuf, bytes_sent)) { + cifs_buf_release(pSMB); + return -EFAULT; + } + } else if (count != 0) { + /* No buffer */ + cifs_buf_release(pSMB); + return -EINVAL; + } /* else setting file size with write of zero bytes */ + if (wct == 14) + byte_count = bytes_sent + 1; /* pad */ + else /* wct == 12 */ + byte_count = bytes_sent + 5; /* bigger pad, smaller smb hdr */ + + pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF); + pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16); + inc_rfc1001_len(pSMB, byte_count); + + if (wct == 14) + pSMB->ByteCount = cpu_to_le16(byte_count); + else { /* old style write has byte count 4 bytes earlier + so 4 bytes pad */ + struct smb_com_writex_req *pSMBW = + (struct smb_com_writex_req *)pSMB; + pSMBW->ByteCount = cpu_to_le16(byte_count); + } + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, long_op); + cifs_stats_inc(&tcon->num_writes); + if (rc) { + cFYI(1, "Send error in write = %d", rc); + } else { + *nbytes = le16_to_cpu(pSMBr->CountHigh); + *nbytes = (*nbytes) << 16; + *nbytes += le16_to_cpu(pSMBr->Count); + + /* + * Mask off high 16 bits when bytes written as returned by the + * server is greater than bytes requested by the client. Some + * OS/2 servers are known to set incorrect CountHigh values. + */ + if (*nbytes > count) + *nbytes &= 0xFFFF; + } + + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +void +cifs_writedata_release(struct kref *refcount) +{ + struct cifs_writedata *wdata = container_of(refcount, + struct cifs_writedata, refcount); + + if (wdata->cfile) + cifsFileInfo_put(wdata->cfile); + + kfree(wdata); +} + +/* + * Write failed with a retryable error. Resend the write request. It's also + * possible that the page was redirtied so re-clean the page. 
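+ * Pages are locked and cleaned for writeback before the request is rebuilt,
+ * and are unlocked again (with SetPageError on failure) once it has been
+ * resubmitted.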
+ */ +static void +cifs_writev_requeue(struct cifs_writedata *wdata) +{ + int i, rc; + struct inode *inode = wdata->cfile->dentry->d_inode; + + for (i = 0; i < wdata->nr_pages; i++) { + lock_page(wdata->pages[i]); + clear_page_dirty_for_io(wdata->pages[i]); + } + + do { + rc = cifs_async_writev(wdata); + } while (rc == -EAGAIN); + + for (i = 0; i < wdata->nr_pages; i++) { + if (rc != 0) + SetPageError(wdata->pages[i]); + unlock_page(wdata->pages[i]); + } + + mapping_set_error(inode->i_mapping, rc); + kref_put(&wdata->refcount, cifs_writedata_release); +} + +static void +cifs_writev_complete(struct work_struct *work) +{ + struct cifs_writedata *wdata = container_of(work, + struct cifs_writedata, work); + struct inode *inode = wdata->cfile->dentry->d_inode; + int i = 0; + + if (wdata->result == 0) { + cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes); + cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink), + wdata->bytes); + } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN) + return cifs_writev_requeue(wdata); + + for (i = 0; i < wdata->nr_pages; i++) { + struct page *page = wdata->pages[i]; + if (wdata->result == -EAGAIN) + __set_page_dirty_nobuffers(page); + else if (wdata->result < 0) + SetPageError(page); + end_page_writeback(page); + page_cache_release(page); + } + if (wdata->result != -EAGAIN) + mapping_set_error(inode->i_mapping, wdata->result); + kref_put(&wdata->refcount, cifs_writedata_release); +} + +struct cifs_writedata * +cifs_writedata_alloc(unsigned int nr_pages) +{ + struct cifs_writedata *wdata; + + /* this would overflow */ + if (nr_pages == 0) { + cERROR(1, "%s: called with nr_pages == 0!", __func__); + return NULL; + } + + /* writedata + number of page pointers */ + wdata = kzalloc(sizeof(*wdata) + + sizeof(struct page *) * (nr_pages - 1), GFP_NOFS); + if (wdata != NULL) { + INIT_WORK(&wdata->work, cifs_writev_complete); + kref_init(&wdata->refcount); + } + return wdata; +} + +/* + * Check the midState and signature on received buffer (if any), and queue the + * workqueue completion task. + */ +static void +cifs_writev_callback(struct mid_q_entry *mid) +{ + struct cifs_writedata *wdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + unsigned int written; + WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; + + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + wdata->result = cifs_check_receive(mid, tcon->ses->server, 0); + if (wdata->result != 0) + break; + + written = le16_to_cpu(smb->CountHigh); + written <<= 16; + written += le16_to_cpu(smb->Count); + /* + * Mask off high 16 bits when bytes written as returned + * by the server is greater than bytes requested by the + * client. OS/2 servers are known to set incorrect + * CountHigh values. 
+ */ + if (written > wdata->bytes) + written &= 0xFFFF; + + if (written < wdata->bytes) + wdata->result = -ENOSPC; + else + wdata->bytes = written; + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + wdata->result = -EAGAIN; + break; + default: + wdata->result = -EIO; + break; + } + + queue_work(system_nrt_wq, &wdata->work); + DeleteMidQEntry(mid); + atomic_dec(&tcon->ses->server->inFlight); + wake_up(&tcon->ses->server->request_q); +} + +/* cifs_async_writev - send an async write, and set up mid to handle result */ +int +cifs_async_writev(struct cifs_writedata *wdata) +{ + int i, rc = -EACCES; + WRITE_REQ *smb = NULL; + int wct; + struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct inode *inode = wdata->cfile->dentry->d_inode; + struct kvec *iov = NULL; + + if (tcon->ses->capabilities & CAP_LARGE_FILES) { + wct = 14; + } else { + wct = 12; + if (wdata->offset >> 32 > 0) { + /* can not handle big offset for old srv */ + return -EIO; + } + } + + rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **)&smb); + if (rc) + goto async_writev_out; + + /* 1 iov per page + 1 for header */ + iov = kzalloc((wdata->nr_pages + 1) * sizeof(*iov), GFP_NOFS); + if (iov == NULL) { + rc = -ENOMEM; + goto async_writev_out; + } + + smb->hdr.Pid = cpu_to_le16((__u16)wdata->cfile->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->cfile->pid >> 16)); + + smb->AndXCommand = 0xFF; /* none */ + smb->Fid = wdata->cfile->netfid; + smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF); + if (wct == 14) + smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32); + smb->Reserved = 0xFFFFFFFF; + smb->WriteMode = 0; + smb->Remaining = 0; + + smb->DataOffset = + cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); + + /* 4 for RFC1001 length + 1 for BCC */ + iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1; + iov[0].iov_base = smb; + + /* marshal up the pages into iov array */ + wdata->bytes = 0; + for (i = 0; i < wdata->nr_pages; i++) { + iov[i + 1].iov_len = min(inode->i_size - + page_offset(wdata->pages[i]), + (loff_t)PAGE_CACHE_SIZE); + iov[i + 1].iov_base = kmap(wdata->pages[i]); + wdata->bytes += iov[i + 1].iov_len; + } + + cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); + + smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF); + smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16); + + if (wct == 14) { + inc_rfc1001_len(&smb->hdr, wdata->bytes + 1); + put_bcc(wdata->bytes + 1, &smb->hdr); + } else { + /* wct == 12 */ + struct smb_com_writex_req *smbw = + (struct smb_com_writex_req *)smb; + inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5); + put_bcc(wdata->bytes + 5, &smbw->hdr); + iov[0].iov_len += 4; /* pad bigger by four bytes */ + } + + kref_get(&wdata->refcount); + rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, + cifs_writev_callback, wdata, false); + + if (rc == 0) + cifs_stats_inc(&tcon->num_writes); + else + kref_put(&wdata->refcount, cifs_writedata_release); + + /* send is done, unmap pages */ + for (i = 0; i < wdata->nr_pages; i++) + kunmap(wdata->pages[i]); + +async_writev_out: + cifs_small_buf_release(smb); + kfree(iov); + return rc; +} + +int +CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, + unsigned int *nbytes, struct kvec *iov, int n_vec, + const int long_op) +{ + int rc = -EACCES; + WRITE_REQ *pSMB = NULL; + int wct; + int smb_hdr_len; + int resp_buf_type = 0; + __u32 pid = io_parms->pid; + __u16 netfid = io_parms->netfid; + __u64 offset = io_parms->offset; + struct cifs_tcon *tcon = 
io_parms->tcon; + unsigned int count = io_parms->length; + + *nbytes = 0; + + cFYI(1, "write2 at %lld %d bytes", (long long)offset, count); + + if (tcon->ses->capabilities & CAP_LARGE_FILES) { + wct = 14; + } else { + wct = 12; + if ((offset >> 32) > 0) { + /* can not handle big offset for old srv */ + return -EIO; + } + } + rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16)); + + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; + + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; + pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); + if (wct == 14) + pSMB->OffsetHigh = cpu_to_le32(offset >> 32); + pSMB->Reserved = 0xFFFFFFFF; + pSMB->WriteMode = 0; + pSMB->Remaining = 0; + + pSMB->DataOffset = + cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); + + pSMB->DataLengthLow = cpu_to_le16(count & 0xFFFF); + pSMB->DataLengthHigh = cpu_to_le16(count >> 16); + /* header + 1 byte pad */ + smb_hdr_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 1; + if (wct == 14) + inc_rfc1001_len(pSMB, count + 1); + else /* wct == 12 */ + inc_rfc1001_len(pSMB, count + 5); /* smb data starts later */ + if (wct == 14) + pSMB->ByteCount = cpu_to_le16(count + 1); + else /* wct == 12 */ /* bigger pad, smaller smb hdr, keep offset ok */ { + struct smb_com_writex_req *pSMBW = + (struct smb_com_writex_req *)pSMB; + pSMBW->ByteCount = cpu_to_le16(count + 5); + } + iov[0].iov_base = pSMB; + if (wct == 14) + iov[0].iov_len = smb_hdr_len + 4; + else /* wct == 12 pad bigger by four bytes */ + iov[0].iov_len = smb_hdr_len + 8; + + + rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, + long_op); + cifs_stats_inc(&tcon->num_writes); + if (rc) { + cFYI(1, "Send error Write2 = %d", rc); + } else if (resp_buf_type == 0) { + /* presumably this can not happen, but best to be safe */ + rc = -EIO; + } else { + WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base; + *nbytes = le16_to_cpu(pSMBr->CountHigh); + *nbytes = (*nbytes) << 16; + *nbytes += le16_to_cpu(pSMBr->Count); + + /* + * Mask off high 16 bits when bytes written as returned by the + * server is greater than bytes requested by the client. OS/2 + * servers are known to set incorrect CountHigh values. 
+ */ + if (*nbytes > count) + *nbytes &= 0xFFFF; + } + +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + if (resp_buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(iov[0].iov_base); + else if (resp_buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + + +int +CIFSSMBLock(const int xid, struct cifs_tcon *tcon, + const __u16 smb_file_id, const __u64 len, + const __u64 offset, const __u32 numUnlock, + const __u32 numLock, const __u8 lockType, + const bool waitFlag, const __u8 oplock_level) +{ + int rc = 0; + LOCK_REQ *pSMB = NULL; +/* LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */ + int bytes_returned; + int timeout = 0; + __u16 count; + + cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock); + rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); + + if (rc) + return rc; + + if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) { + timeout = CIFS_ASYNC_OP; /* no response expected */ + pSMB->Timeout = 0; + } else if (waitFlag) { + timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */ + pSMB->Timeout = cpu_to_le32(-1);/* blocking - do not time out */ + } else { + pSMB->Timeout = 0; + } + + pSMB->NumberOfLocks = cpu_to_le16(numLock); + pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); + pSMB->LockType = lockType; + pSMB->OplockLevel = oplock_level; + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = smb_file_id; /* netfid stays le */ + + if ((numLock != 0) || (numUnlock != 0)) { + pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); + /* BB where to store pid high? */ + pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); + pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); + pSMB->Locks[0].OffsetLow = cpu_to_le32((u32)offset); + pSMB->Locks[0].OffsetHigh = cpu_to_le32((u32)(offset>>32)); + count = sizeof(LOCKING_ANDX_RANGE); + } else { + /* oplock break */ + count = 0; + } + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + if (waitFlag) { + rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMB, &bytes_returned); + cifs_small_buf_release(pSMB); + } else { + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB, + timeout); + /* SMB buffer freed by function above */ + } + cifs_stats_inc(&tcon->num_locks); + if (rc) + cFYI(1, "Send error in Lock = %d", rc); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; +} + +int +CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, + const __u16 smb_file_id, const int get_flag, const __u64 len, + struct file_lock *pLockData, const __u16 lock_type, + const bool waitFlag) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; + struct cifs_posix_lock *parm_data; + int rc = 0; + int timeout = 0; + int bytes_returned = 0; + int resp_buf_type = 0; + __u16 params, param_offset, offset, byte_count, count; + struct kvec iov[1]; + + cFYI(1, "Posix Lock"); + + if (pLockData == NULL) + return -EINVAL; + + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMBr = (struct smb_com_transaction2_sfi_rsp *)pSMB; + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct 
smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + count = sizeof(struct cifs_posix_lock); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */ + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + if (get_flag) + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + else + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + parm_data = (struct cifs_posix_lock *) + (((char *) &pSMB->hdr.Protocol) + offset); + + parm_data->lock_type = cpu_to_le16(lock_type); + if (waitFlag) { + timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */ + parm_data->lock_flags = cpu_to_le16(1); + pSMB->Timeout = cpu_to_le32(-1); + } else + pSMB->Timeout = 0; + + parm_data->pid = cpu_to_le32(current->tgid); + parm_data->start = cpu_to_le64(pLockData->fl_start); + parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ + + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = smb_file_id; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + if (waitFlag) { + rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned); + } else { + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; + rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, + &resp_buf_type, timeout); + pSMB = NULL; /* request buf already freed by SendReceive2. 
Do + not try to free it twice below on exit */ + pSMBr = (struct smb_com_transaction2_sfi_rsp *)iov[0].iov_base; + } + + if (rc) { + cFYI(1, "Send error in Posix Lock = %d", rc); + } else if (get_flag) { + /* lock structure can be returned on get */ + __u16 data_offset; + __u16 data_count; + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(*parm_data)) { + rc = -EIO; /* bad smb */ + goto plk_err_exit; + } + data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + data_count = le16_to_cpu(pSMBr->t2.DataCount); + if (data_count < sizeof(struct cifs_posix_lock)) { + rc = -EIO; + goto plk_err_exit; + } + parm_data = (struct cifs_posix_lock *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) + pLockData->fl_type = F_UNLCK; + else { + if (parm_data->lock_type == + __constant_cpu_to_le16(CIFS_RDLCK)) + pLockData->fl_type = F_RDLCK; + else if (parm_data->lock_type == + __constant_cpu_to_le16(CIFS_WRLCK)) + pLockData->fl_type = F_WRLCK; + + pLockData->fl_start = le64_to_cpu(parm_data->start); + pLockData->fl_end = pLockData->fl_start + + le64_to_cpu(parm_data->length) - 1; + pLockData->fl_pid = le32_to_cpu(parm_data->pid); + } + } + +plk_err_exit: + if (pSMB) + cifs_small_buf_release(pSMB); + + if (resp_buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(iov[0].iov_base); + else if (resp_buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + + +int +CIFSSMBClose(const int xid, struct cifs_tcon *tcon, int smb_file_id) +{ + int rc = 0; + CLOSE_REQ *pSMB = NULL; + cFYI(1, "In CIFSSMBClose"); + +/* do not retry on dead session on close */ + rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB); + if (rc == -EAGAIN) + return 0; + if (rc) + return rc; + + pSMB->FileID = (__u16) smb_file_id; + pSMB->LastWriteTime = 0xFFFFFFFF; + pSMB->ByteCount = 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + cifs_stats_inc(&tcon->num_closes); + if (rc) { + if (rc != -EINTR) { + /* EINTR is expected when user ctl-c to kill app */ + cERROR(1, "Send error in Close = %d", rc); + } + } + + /* Since session is dead, file will be closed on server already */ + if (rc == -EAGAIN) + rc = 0; + + return rc; +} + +int +CIFSSMBFlush(const int xid, struct cifs_tcon *tcon, int smb_file_id) +{ + int rc = 0; + FLUSH_REQ *pSMB = NULL; + cFYI(1, "In CIFSSMBFlush"); + + rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->FileID = (__u16) smb_file_id; + pSMB->ByteCount = 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + cifs_stats_inc(&tcon->num_flushes); + if (rc) + cERROR(1, "Send error in Flush = %d", rc); + + return rc; +} + +int +CIFSSMBRename(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + RENAME_REQ *pSMB = NULL; + RENAME_RSP *pSMBr = NULL; + int bytes_returned; + int name_len, name_len2; + __u16 count; + + cFYI(1, "In CIFSSMBRename"); +renameRetry: + rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->BufferFormat = 0x04; + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | + ATTR_DIRECTORY); + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) 
pSMB->OldFileName, fromName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->OldFileName[name_len] = 0x04; /* pad */ + /* protocol requires ASCII signature byte on Unicode string */ + pSMB->OldFileName[name_len + 1] = 0x00; + name_len2 = + cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], + toName, PATH_MAX, nls_codepage, remap); + name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; + name_len2 *= 2; /* convert to bytes */ + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->OldFileName, fromName, name_len); + name_len2 = strnlen(toName, PATH_MAX); + name_len2++; /* trailing null */ + pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ + strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); + name_len2++; /* trailing null */ + name_len2++; /* signature byte */ + } + + count = 1 /* 1st signature byte */ + name_len + name_len2; + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_renames); + if (rc) + cFYI(1, "Send error in rename = %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto renameRetry; + + return rc; +} + +int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon, + int netfid, const char *target_name, + const struct nls_table *nls_codepage, int remap) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; + struct set_file_rename *rename_info; + char *data_offset; + char dummy_string[30]; + int rc = 0; + int bytes_returned = 0; + int len_of_str; + __u16 params, param_offset, offset, count, byte_count; + + cFYI(1, "Rename to File by handle"); + rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + rename_info = (struct set_file_rename *) data_offset; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */ + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + /* construct random name ".cifs_tmp" */ + rename_info->overwrite = cpu_to_le32(1); + rename_info->root_fid = 0; + /* unicode only call */ + if (target_name == NULL) { + sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid); + len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name, + dummy_string, 24, nls_codepage, remap); + } else { + len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name, + target_name, PATH_MAX, nls_codepage, + remap); + } + rename_info->target_name_len = cpu_to_le32(2 * len_of_str); + count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); + byte_count += count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->Fid = netfid; + pSMB->InformationLevel 
= + cpu_to_le16(SMB_SET_FILE_RENAME_INFORMATION); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&pTcon->num_t2renames); + if (rc) + cFYI(1, "Send error in Rename (by file handle) = %d", rc); + + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBCopy(const int xid, struct cifs_tcon *tcon, const char *fromName, + const __u16 target_tid, const char *toName, const int flags, + const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + COPY_REQ *pSMB = NULL; + COPY_RSP *pSMBr = NULL; + int bytes_returned; + int name_len, name_len2; + __u16 count; + + cFYI(1, "In CIFSSMBCopy"); +copyRetry: + rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->BufferFormat = 0x04; + pSMB->Tid2 = target_tid; + + pSMB->Flags = cpu_to_le16(flags & COPY_TREE); + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName, + fromName, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->OldFileName[name_len] = 0x04; /* pad */ + /* protocol requires ASCII signature byte on Unicode string */ + pSMB->OldFileName[name_len + 1] = 0x00; + name_len2 = + cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], + toName, PATH_MAX, nls_codepage, remap); + name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; + name_len2 *= 2; /* convert to bytes */ + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->OldFileName, fromName, name_len); + name_len2 = strnlen(toName, PATH_MAX); + name_len2++; /* trailing null */ + pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ + strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); + name_len2++; /* trailing null */ + name_len2++; /* signature byte */ + } + + count = 1 /* 1st signature byte */ + name_len + name_len2; + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in copy = %d with %d files copied", + rc, le16_to_cpu(pSMBr->CopyCount)); + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto copyRetry; + + return rc; +} + +int +CIFSUnixCreateSymLink(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + char *data_offset; + int name_len; + int name_len_target; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cFYI(1, "In Symlink Unix style"); +createSymLinkRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX + /* find define for this maxpathcomponent */ + , nls_codepage); + name_len++; /* trailing null */ + name_len *= 2; + + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + 
strncpy(pSMB->FileName, fromName, name_len); + } + params = 6 + name_len; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len_target = + cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX + /* find define for this maxpathcomponent */ + , nls_codepage); + name_len_target++; /* trailing null */ + name_len_target *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len_target = strnlen(toName, PATH_MAX); + name_len_target++; /* trailing null */ + strncpy(data_offset, toName, name_len_target); + } + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max on data count below from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + name_len_target; + pSMB->DataCount = cpu_to_le16(name_len_target); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_LINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_symlinks); + if (rc) + cFYI(1, "Send error in SetPathInfo create symlink = %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto createSymLinkRetry; + + return rc; +} + +int +CIFSUnixCreateHardLink(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + char *data_offset; + int name_len; + int name_len_target; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cFYI(1, "In Create Hard link Unix style"); +createHardLinkRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(toName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, toName, name_len); + } + params = 6 + name_len; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len_target = + cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX, + nls_codepage, remap); + name_len_target++; /* trailing null */ + name_len_target *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len_target = strnlen(fromName, PATH_MAX); + name_len_target++; /* trailing null */ + 
strncpy(data_offset, fromName, name_len_target); + } + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max on data count below from sess*/ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + name_len_target; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->DataCount = cpu_to_le16(name_len_target); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_HLINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_hardlinks); + if (rc) + cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto createHardLinkRetry; + + return rc; +} + +int +CIFSCreateHardLink(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + NT_RENAME_REQ *pSMB = NULL; + RENAME_RSP *pSMBr = NULL; + int bytes_returned; + int name_len, name_len2; + __u16 count; + + cFYI(1, "In CIFSCreateHardLink"); +winCreateHardLinkRetry: + + rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | + ATTR_DIRECTORY); + pSMB->Flags = cpu_to_le16(CREATE_HARD_LINK); + pSMB->ClusterCount = 0; + + pSMB->BufferFormat = 0x04; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + + /* protocol specifies ASCII buffer format (0x04) for unicode */ + pSMB->OldFileName[name_len] = 0x04; + pSMB->OldFileName[name_len + 1] = 0x00; /* pad */ + name_len2 = + cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], + toName, PATH_MAX, nls_codepage, remap); + name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; + name_len2 *= 2; /* convert to bytes */ + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fromName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->OldFileName, fromName, name_len); + name_len2 = strnlen(toName, PATH_MAX); + name_len2++; /* trailing null */ + pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ + strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2); + name_len2++; /* trailing null */ + name_len2++; /* signature byte */ + } + + count = 1 /* string type byte */ + name_len + name_len2; + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_hardlinks); + if (rc) + cFYI(1, "Send error in hard link (NT rename) = %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto winCreateHardLinkRetry; + + return rc; +} + +int +CIFSSMBUnixQuerySymLink(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, char **symlinkinfo, + const struct nls_table *nls_codepage) +{ +/* SMB_QUERY_FILE_UNIX_LINK */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + 
TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + char *data_start; + + cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName); + +querySymLinkRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifs_strtoUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_LINK); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QuerySymLinkInfo = %d", rc); + } else { + /* decode response */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + rc = -EIO; + else { + bool is_unicode; + u16 count = le16_to_cpu(pSMBr->t2.DataCount); + + data_start = ((char *) &pSMBr->hdr.Protocol) + + le16_to_cpu(pSMBr->t2.DataOffset); + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + + /* BB FIXME investigate remapping reserved chars here */ + *symlinkinfo = cifs_strndup_from_ucs(data_start, count, + is_unicode, nls_codepage); + if (!*symlinkinfo) + rc = -ENOMEM; + } + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto querySymLinkRetry; + return rc; +} + +#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL +/* + * Recent Windows versions now create symlinks more frequently + * and they use the "reparse point" mechanism below. We can of course + * do symlinks nicely to Samba and other servers which support the + * CIFS Unix Extensions and we can also do SFU symlinks and "client only" + * "MF" symlinks optionally, but for recent Windows we really need to + * reenable the code below and fix the cifs_symlink callers to handle this. + * In the interim this code has been moved to its own config option so + * it is not compiled in by default until callers fixed up and more tested. 
+ */ +int +CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + char *symlinkinfo, const int buflen, __u16 fid, + const struct nls_table *nls_codepage) +{ + int rc = 0; + int bytes_returned; + struct smb_com_transaction_ioctl_req *pSMB; + struct smb_com_transaction_ioctl_rsp *pSMBr; + + cFYI(1, "In Windows reparse style QueryLink for path %s", searchName); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->TotalParameterCount = 0 ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le32(2); + /* BB find exact data count max from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 4; + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT); + pSMB->IsFsctl = 1; /* FSCTL */ + pSMB->IsRootFlag = 0; + pSMB->Fid = fid; /* file handle always le */ + pSMB->ByteCount = 0; + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc); + } else { /* decode response */ + __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); + __u32 data_count = le32_to_cpu(pSMBr->DataCount); + if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { + /* BB also check enough total bytes returned */ + rc = -EIO; /* bad smb */ + goto qreparse_out; + } + if (data_count && (data_count < 2048)) { + char *end_of_smb = 2 /* sizeof byte count */ + + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; + + struct reparse_data *reparse_buf = + (struct reparse_data *) + ((char *)&pSMBr->hdr.Protocol + + data_offset); + if ((char *)reparse_buf >= end_of_smb) { + rc = -EIO; + goto qreparse_out; + } + if ((reparse_buf->LinkNamesBuf + + reparse_buf->TargetNameOffset + + reparse_buf->TargetNameLen) > end_of_smb) { + cFYI(1, "reparse buf beyond SMB"); + rc = -EIO; + goto qreparse_out; + } + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { + cifs_from_ucs2(symlinkinfo, (__le16 *) + (reparse_buf->LinkNamesBuf + + reparse_buf->TargetNameOffset), + buflen, + reparse_buf->TargetNameLen, + nls_codepage, 0); + } else { /* ASCII names */ + strncpy(symlinkinfo, + reparse_buf->LinkNamesBuf + + reparse_buf->TargetNameOffset, + min_t(const int, buflen, + reparse_buf->TargetNameLen)); + } + } else { + rc = -EIO; + cFYI(1, "Invalid return data count on " + "get reparse info ioctl"); + } + symlinkinfo[buflen] = 0; /* just in case so the caller + does not go off the end of the buffer */ + cFYI(1, "readlink result - %s", symlinkinfo); + } + +qreparse_out: + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} +#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */ + +#ifdef CONFIG_CIFS_POSIX + +/*Convert an Access Control Entry from wire format to local POSIX xattr format*/ +static void cifs_convert_ace(posix_acl_xattr_entry *ace, + struct cifs_posix_ace *cifs_ace) +{ + /* u8 cifs fields do not need le conversion */ + ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); + ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); + ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); + /* 
cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
+
+	return;
+}
+
+/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */
+static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
+		const int acl_type, const int size_of_data_area)
+{
+	int size = 0;
+	int i;
+	__u16 count;
+	struct cifs_posix_ace *pACE;
+	struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src;
+	posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)trgt;
+
+	if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION)
+		return -EOPNOTSUPP;
+
+	if (acl_type & ACL_TYPE_ACCESS) {
+		count = le16_to_cpu(cifs_acl->access_entry_count);
+		pACE = &cifs_acl->ace_array[0];
+		size = sizeof(struct cifs_posix_acl);
+		size += sizeof(struct cifs_posix_ace) * count;
+		/* check if we would go beyond end of SMB */
+		if (size_of_data_area < size) {
+			cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
+				size_of_data_area, size);
+			return -EINVAL;
+		}
+	} else if (acl_type & ACL_TYPE_DEFAULT) {
+		count = le16_to_cpu(cifs_acl->access_entry_count);
+		size = sizeof(struct cifs_posix_acl);
+		size += sizeof(struct cifs_posix_ace) * count;
+/* skip past access ACEs to get to default ACEs */
+		pACE = &cifs_acl->ace_array[count];
+		count = le16_to_cpu(cifs_acl->default_entry_count);
+		size += sizeof(struct cifs_posix_ace) * count;
+		/* check if we would go beyond end of SMB */
+		if (size_of_data_area < size)
+			return -EINVAL;
+	} else {
+		/* illegal type */
+		return -EINVAL;
+	}
+
+	size = posix_acl_xattr_size(count);
+	if ((buflen == 0) || (local_acl == NULL)) {
+		/* used to query ACL EA size */
+	} else if (size > buflen) {
+		return -ERANGE;
+	} else /* buffer big enough */ {
+		local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+		for (i = 0; i < count ; i++) {
+			cifs_convert_ace(&local_acl->a_entries[i], pACE);
+			pACE++;
+		}
+	}
+	return size;
+}
+
+static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
+			const posix_acl_xattr_entry *local_ace)
+{
+	__u16 rc = 0; /* 0 = ACL converted ok */
+
+	cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm);
+	cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag);
+	/* BB is there a better way to handle the large uid? */
+	if (local_ace->e_id == cpu_to_le32(-1)) {
+	/* Probably no need to le convert -1 on any arch but can not hurt */
+		cifs_ace->cifs_uid = cpu_to_le64(-1);
+	} else
+		cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
+	/*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
+	return rc;
+}
+
+/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */
+static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
+			const int buflen, const int acl_type)
+{
+	__u16 rc = 0;
+	struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data;
+	posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)pACL;
+	int count;
+	int i;
+
+	if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL))
+		return 0;
+
+	count = posix_acl_xattr_count((size_t)buflen);
+	cFYI(1, "setting acl with %d entries from buf of length %d and "
+		"version of %d",
+		count, buflen, le32_to_cpu(local_acl->a_version));
+	if (le32_to_cpu(local_acl->a_version) != 2) {
+		cFYI(1, "unknown POSIX ACL version %d",
+			le32_to_cpu(local_acl->a_version));
+		return 0;
+	}
+	cifs_acl->version = cpu_to_le16(1);
+	if (acl_type == ACL_TYPE_ACCESS)
+		cifs_acl->access_entry_count = cpu_to_le16(count);
+	else if (acl_type == ACL_TYPE_DEFAULT)
+		cifs_acl->default_entry_count = cpu_to_le16(count);
+	else {
+		cFYI(1, "unknown ACL type %d", acl_type);
+		return 0;
+	}
+	for (i = 0; i < count; i++) {
+		rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i],
+					&local_acl->a_entries[i]);
+		if (rc != 0) {
+			/* ACE not converted */
+			break;
+		}
+	}
+	if (rc == 0) {
+		rc = (__u16)(count * sizeof(struct cifs_posix_ace));
+		rc += sizeof(struct cifs_posix_acl);
+		/* BB add check to make sure ACL does not overflow SMB */
+	}
+	return rc;
+}
+
+int
+CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon,
+		const unsigned char *searchName,
+		char *acl_inf, const int buflen, const int acl_type,
+		const struct nls_table *nls_codepage, int remap)
+{
+/* SMB_QUERY_POSIX_ACL */
+	TRANSACTION2_QPI_REQ *pSMB = NULL;
+	TRANSACTION2_QPI_RSP *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	int name_len;
+	__u16 params, byte_count;
+
+	cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
+
+queryAclRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		(void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+			cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
+					PATH_MAX, nls_codepage, remap);
+		name_len++; /* trailing null */
+		name_len *= 2;
+		pSMB->FileName[name_len] = 0;
+		pSMB->FileName[name_len+1] = 0;
+	} else { /* BB improve the check for buffer overruns BB */
+		name_len = strnlen(searchName, PATH_MAX);
+		name_len++; /* trailing null */
+		strncpy(pSMB->FileName, searchName, name_len);
+	}
+
+	params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ;
+	pSMB->TotalDataCount = 0;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	pSMB->ParameterOffset = cpu_to_le16(
+		offsetof(struct smb_com_transaction2_qpi_req,
+			InformationLevel) - 4);
+	pSMB->DataCount = 0;
+	pSMB->DataOffset = 0;
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->TotalParameterCount = cpu_to_le16(params);
+	pSMB->ParameterCount =
pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_acl_get); + if (rc) { + cFYI(1, "Send error in Query POSIX ACL = %d", rc); + } else { + /* decode response */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + rc = cifs_copy_posix_acl(acl_inf, + (char *)&pSMBr->hdr.Protocol+data_offset, + buflen, acl_type, count); + } + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto queryAclRetry; + return rc; +} + +int +CIFSSMBSetPosixACL(const int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const char *local_acl, const int buflen, + const int acl_type, + const struct nls_table *nls_codepage, int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + char *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count, data_count, param_offset, offset; + + cFYI(1, "In SetPosixACL (Unix) for path %s", fileName); +setAclRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + params = 6 + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB size from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + parm_data = ((char *) &pSMB->hdr.Protocol) + offset; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + + /* convert to on the wire format for POSIX ACL */ + data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type); + + if (data_count == 0) { + rc = -EOPNOTSUPP; + goto setACLerrorExit; + } + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_ACL); + byte_count = 3 /* pad */ + params + data_count; + pSMB->DataCount = cpu_to_le16(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "Set POSIX ACL returned %d", rc); + +setACLerrorExit: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto setAclRetry; + return rc; +} + +/* BB fix tabs in this function FIXME BB */ +int +CIFSGetExtAttr(const int xid, struct 
cifs_tcon *tcon, + const int netfid, __u64 *pExtAttrBits, __u64 *pMask) +{ + int rc = 0; + struct smb_t2_qfi_req *pSMB = NULL; + struct smb_t2_qfi_rsp *pSMBr = NULL; + int bytes_returned; + __u16 params, byte_count; + + cFYI(1, "In GetExtAttr"); + if (tcon == NULL) + return -ENODEV; + +GetExtAttrRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2 /* level */ + 2 /* fid */; + pSMB->t2.TotalDataCount = 0; + pSMB->t2.MaxParameterCount = cpu_to_le16(4); + /* BB find exact max data count below from sess structure BB */ + pSMB->t2.MaxDataCount = cpu_to_le16(4000); + pSMB->t2.MaxSetupCount = 0; + pSMB->t2.Reserved = 0; + pSMB->t2.Flags = 0; + pSMB->t2.Timeout = 0; + pSMB->t2.Reserved2 = 0; + pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req, + Fid) - 4); + pSMB->t2.DataCount = 0; + pSMB->t2.DataOffset = 0; + pSMB->t2.SetupCount = 1; + pSMB->t2.Reserved3 = 0; + pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->t2.TotalParameterCount = cpu_to_le16(params); + pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_ATTR_FLAGS); + pSMB->Pad = 0; + pSMB->Fid = netfid; + inc_rfc1001_len(pSMB, byte_count); + pSMB->t2.ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "error %d in GetExtAttr", rc); + } else { + /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + /* If rc should we check for EOPNOSUPP and + disable the srvino flag? or in caller? */ + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + struct file_chattr_info *pfinfo; + /* BB Do we need a cast or hash here ? */ + if (count != 16) { + cFYI(1, "Illegal size ret in GetExtAttr"); + rc = -EIO; + goto GetExtAttrOut; + } + pfinfo = (struct file_chattr_info *) + (data_offset + (char *) &pSMBr->hdr.Protocol); + *pExtAttrBits = le64_to_cpu(pfinfo->mode); + *pMask = le64_to_cpu(pfinfo->mask); + } + } +GetExtAttrOut: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto GetExtAttrRetry; + return rc; +} + +#endif /* CONFIG_POSIX */ + +#ifdef CONFIG_CIFS_ACL +/* + * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that + * all NT TRANSACTS that we init here have total parm and data under about 400 + * bytes (to fit in small cifs buffer size), which is the case so far, it + * easily fits. 
NB: Setup words themselves and ByteCount MaxSetupCount (size of + * returned setup area) and MaxParameterCount (returned parms size) must be set + * by caller + */ +static int +smb_init_nttransact(const __u16 sub_command, const int setup_count, + const int parm_len, struct cifs_tcon *tcon, + void **ret_buf) +{ + int rc; + __u32 temp_offset; + struct smb_com_ntransact_req *pSMB; + + rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon, + (void **)&pSMB); + if (rc) + return rc; + *ret_buf = (void *)pSMB; + pSMB->Reserved = 0; + pSMB->TotalParameterCount = cpu_to_le32(parm_len); + pSMB->TotalDataCount = 0; + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->DataCount = pSMB->TotalDataCount; + temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + + (setup_count * 2) - 4 /* for rfc1001 length itself */; + pSMB->ParameterOffset = cpu_to_le32(temp_offset); + pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); + pSMB->SetupCount = setup_count; /* no need to le convert byte fields */ + pSMB->SubCommand = cpu_to_le16(sub_command); + return 0; +} + +static int +validate_ntransact(char *buf, char **ppparm, char **ppdata, + __u32 *pparmlen, __u32 *pdatalen) +{ + char *end_of_smb; + __u32 data_count, data_offset, parm_count, parm_offset; + struct smb_com_ntransact_rsp *pSMBr; + u16 bcc; + + *pdatalen = 0; + *pparmlen = 0; + + if (buf == NULL) + return -EINVAL; + + pSMBr = (struct smb_com_ntransact_rsp *)buf; + + bcc = get_bcc(&pSMBr->hdr); + end_of_smb = 2 /* sizeof byte count */ + bcc + + (char *)&pSMBr->ByteCount; + + data_offset = le32_to_cpu(pSMBr->DataOffset); + data_count = le32_to_cpu(pSMBr->DataCount); + parm_offset = le32_to_cpu(pSMBr->ParameterOffset); + parm_count = le32_to_cpu(pSMBr->ParameterCount); + + *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; + *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; + + /* should we also check that parm and data areas do not overlap? */ + if (*ppparm > end_of_smb) { + cFYI(1, "parms start after end of smb"); + return -EINVAL; + } else if (parm_count + *ppparm > end_of_smb) { + cFYI(1, "parm end after end of smb"); + return -EINVAL; + } else if (*ppdata > end_of_smb) { + cFYI(1, "data starts after end of smb"); + return -EINVAL; + } else if (data_count + *ppdata > end_of_smb) { + cFYI(1, "data %p + count %d (%p) past smb end %p start %p", + *ppdata, data_count, (data_count + *ppdata), + end_of_smb, pSMBr); + return -EINVAL; + } else if (parm_count + data_count > bcc) { + cFYI(1, "parm count and data count larger than SMB"); + return -EINVAL; + } + *pdatalen = data_count; + *pparmlen = parm_count; + return 0; +} + +/* Get Security Descriptor (by handle) from remote server for a file or dir */ +int +CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, + struct cifs_ntsd **acl_inf, __u32 *pbuflen) +{ + int rc = 0; + int buf_type = 0; + QUERY_SEC_DESC_REQ *pSMB; + struct kvec iov[1]; + + cFYI(1, "GetCifsACL"); + + *pbuflen = 0; + *acl_inf = NULL; + + rc = smb_init_nttransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0, + 8 /* parm len */, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->MaxParameterCount = cpu_to_le32(4); + /* BB TEST with big acls that might need to be e.g. 
larger than 16K */ + pSMB->MaxSetupCount = 0; + pSMB->Fid = fid; /* file handle always le */ + pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP | + CIFS_ACL_DACL); + pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */ + inc_rfc1001_len(pSMB, 11); + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; + + rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, + 0); + cifs_stats_inc(&tcon->num_acl_get); + if (rc) { + cFYI(1, "Send error in QuerySecDesc = %d", rc); + } else { /* decode response */ + __le32 *parm; + __u32 parm_len; + __u32 acl_len; + struct smb_com_ntransact_rsp *pSMBr; + char *pdata; + +/* validate_nttransact */ + rc = validate_ntransact(iov[0].iov_base, (char **)&parm, + &pdata, &parm_len, pbuflen); + if (rc) + goto qsec_out; + pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; + + cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf); + + if (le32_to_cpu(pSMBr->ParameterCount) != 4) { + rc = -EIO; /* bad smb */ + *pbuflen = 0; + goto qsec_out; + } + +/* BB check that data area is minimum length and as big as acl_len */ + + acl_len = le32_to_cpu(*parm); + if (acl_len != *pbuflen) { + cERROR(1, "acl length %d does not match %d", + acl_len, *pbuflen); + if (*pbuflen > acl_len) + *pbuflen = acl_len; + } + + /* check if buffer is big enough for the acl + header followed by the smallest SID */ + if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || + (*pbuflen >= 64 * 1024)) { + cERROR(1, "bad acl length %d", *pbuflen); + rc = -EINVAL; + *pbuflen = 0; + } else { + *acl_inf = kmalloc(*pbuflen, GFP_KERNEL); + if (*acl_inf == NULL) { + *pbuflen = 0; + rc = -ENOMEM; + } + memcpy(*acl_inf, pdata, *pbuflen); + } + } +qsec_out: + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(iov[0].iov_base); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ + return rc; +} + +int +CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, + struct cifs_ntsd *pntsd, __u32 acllen) +{ + __u16 byte_count, param_count, data_count, param_offset, data_offset; + int rc = 0; + int bytes_returned = 0; + SET_SEC_DESC_REQ *pSMB = NULL; + NTRANSACT_RSP *pSMBr = NULL; + +setCifsAclRetry: + rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return (rc); + + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + + param_count = 8; + param_offset = offsetof(struct smb_com_transaction_ssec_req, Fid) - 4; + data_count = acllen; + data_offset = param_offset + param_count; + byte_count = 3 /* pad */ + param_count; + + pSMB->DataCount = cpu_to_le32(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->MaxParameterCount = cpu_to_le32(4); + pSMB->MaxDataCount = cpu_to_le32(16384); + pSMB->ParameterCount = cpu_to_le32(param_count); + pSMB->ParameterOffset = cpu_to_le32(param_offset); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->DataOffset = cpu_to_le32(data_offset); + pSMB->SetupCount = 0; + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_SET_SECURITY_DESC); + pSMB->ByteCount = cpu_to_le16(byte_count+data_count); + + pSMB->Fid = fid; /* file handle always le */ + pSMB->Reserved2 = 0; + pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); + + if (pntsd && acllen) { + memcpy((char *) &pSMBr->hdr.Protocol + data_offset, + (char *) pntsd, + acllen); + inc_rfc1001_len(pSMB, byte_count + data_count); + } else + inc_rfc1001_len(pSMB, byte_count); + + rc = SendReceive(xid, 
tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + + cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc); + if (rc) + cFYI(1, "Set CIFS ACL returned %d", rc); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto setCifsAclRetry; + + return (rc); +} + +#endif /* CONFIG_CIFS_ACL */ + +/* Legacy Query Path Information call for lookup to old servers such + as Win9x/WinME */ +int SMBQueryInformation(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_ALL_INFO *pFinfo, + const struct nls_table *nls_codepage, int remap) +{ + QUERY_INFORMATION_REQ *pSMB; + QUERY_INFORMATION_RSP *pSMBr; + int rc = 0; + int bytes_returned; + int name_len; + + cFYI(1, "In SMBQPath path %s", searchName); +QInfRetry: + rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + pSMB->BufferFormat = 0x04; + name_len++; /* account for buffer type byte */ + inc_rfc1001_len(pSMB, (__u16)name_len); + pSMB->ByteCount = cpu_to_le16(name_len); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QueryInfo = %d", rc); + } else if (pFinfo) { + struct timespec ts; + __u32 time = le32_to_cpu(pSMBr->last_write_time); + + /* decode response */ + /* BB FIXME - add time zone adjustment BB */ + memset(pFinfo, 0, sizeof(FILE_ALL_INFO)); + ts.tv_nsec = 0; + ts.tv_sec = time; + /* decode time fields */ + pFinfo->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts)); + pFinfo->LastWriteTime = pFinfo->ChangeTime; + pFinfo->LastAccessTime = 0; + pFinfo->AllocationSize = + cpu_to_le64(le32_to_cpu(pSMBr->size)); + pFinfo->EndOfFile = pFinfo->AllocationSize; + pFinfo->Attributes = + cpu_to_le32(le16_to_cpu(pSMBr->attr)); + } else + rc = -EIO; /* bad buffer passed in */ + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QInfRetry; + + return rc; +} + +int +CIFSSMBQFileInfo(const int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_ALL_INFO *pFindData) +{ + struct smb_t2_qfi_req *pSMB = NULL; + struct smb_t2_qfi_rsp *pSMBr = NULL; + int rc = 0; + int bytes_returned; + __u16 params, byte_count; + +QFileInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2 /* level */ + 2 /* fid */; + pSMB->t2.TotalDataCount = 0; + pSMB->t2.MaxParameterCount = cpu_to_le16(4); + /* BB find exact max data count below from sess structure BB */ + pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->t2.MaxSetupCount = 0; + pSMB->t2.Reserved = 0; + pSMB->t2.Flags = 0; + pSMB->t2.Timeout = 0; + pSMB->t2.Reserved2 = 0; + pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req, + Fid) - 4); + pSMB->t2.DataCount = 0; + pSMB->t2.DataOffset = 0; + pSMB->t2.SetupCount = 1; + pSMB->t2.Reserved3 = 0; + pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->t2.TotalParameterCount = cpu_to_le16(params); + pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO); + pSMB->Pad = 0; + pSMB->Fid = 
netfid; + inc_rfc1001_len(pSMB, byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QPathInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc) /* BB add auto retry on EOPNOTSUPP? */ + rc = -EIO; + else if (get_bcc(&pSMBr->hdr) < 40) + rc = -EIO; /* bad smb */ + else if (pFindData) { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, sizeof(FILE_ALL_INFO)); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QFileInfoRetry; + + return rc; +} + +int +CIFSSMBQPathInfo(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_ALL_INFO *pFindData, + int legacy /* old style infolevel */, + const struct nls_table *nls_codepage, int remap) +{ +/* level 263 SMB_QUERY_FILE_ALL_INFO */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + +/* cFYI(1, "In QPathInfo path %s", searchName); */ +QPathInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + if (legacy) + pSMB->InformationLevel = cpu_to_le16(SMB_INFO_STANDARD); + else + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QPathInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc) /* BB add auto retry on EOPNOTSUPP? */ + rc = -EIO; + else if (!legacy && get_bcc(&pSMBr->hdr) < 40) + rc = -EIO; /* bad smb */ + else if (legacy && get_bcc(&pSMBr->hdr) < 24) + rc = -EIO; /* 24 or 26 expected but we do not read + last field */ + else if (pFindData) { + int size; + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + + /* On legacy responses we do not read the last field, + EAsize, fortunately since it varies by subdialect and + also note it differs on Set vs. 
Get, ie two bytes or 4 + bytes depending but we don't care here */ + if (legacy) + size = sizeof(FILE_INFO_STANDARD); + else + size = sizeof(FILE_ALL_INFO); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, size); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QPathInfoRetry; + + return rc; +} + +int +CIFSSMBUnixQFileInfo(const int xid, struct cifs_tcon *tcon, + u16 netfid, FILE_UNIX_BASIC_INFO *pFindData) +{ + struct smb_t2_qfi_req *pSMB = NULL; + struct smb_t2_qfi_rsp *pSMBr = NULL; + int rc = 0; + int bytes_returned; + __u16 params, byte_count; + +UnixQFileInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2 /* level */ + 2 /* fid */; + pSMB->t2.TotalDataCount = 0; + pSMB->t2.MaxParameterCount = cpu_to_le16(4); + /* BB find exact max data count below from sess structure BB */ + pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->t2.MaxSetupCount = 0; + pSMB->t2.Reserved = 0; + pSMB->t2.Flags = 0; + pSMB->t2.Timeout = 0; + pSMB->t2.Reserved2 = 0; + pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req, + Fid) - 4); + pSMB->t2.DataCount = 0; + pSMB->t2.DataOffset = 0; + pSMB->t2.SetupCount = 1; + pSMB->t2.Reserved3 = 0; + pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->t2.TotalParameterCount = cpu_to_le16(params); + pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); + pSMB->Pad = 0; + pSMB->Fid = netfid; + inc_rfc1001_len(pSMB, byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QPathInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { + cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n" + "Unix Extensions can be disabled on mount " + "by specifying the nosfu mount option."); + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, + sizeof(FILE_UNIX_BASIC_INFO)); + } + } + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto UnixQFileInfoRetry; + + return rc; +} + +int +CIFSSMBUnixQPathInfo(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + FILE_UNIX_BASIC_INFO *pFindData, + const struct nls_table *nls_codepage, int remap) +{ +/* SMB_QUERY_FILE_UNIX_BASIC */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned = 0; + int name_len; + __u16 params, byte_count; + + cFYI(1, "In QPathInfo (Unix) the path %s", searchName); +UnixQPathInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = 
cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QPathInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { + cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n" + "Unix Extensions can be disabled on mount " + "by specifying the nosfu mount option."); + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + memcpy((char *) pFindData, + (char *) &pSMBr->hdr.Protocol + + data_offset, + sizeof(FILE_UNIX_BASIC_INFO)); + } + } + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto UnixQPathInfoRetry; + + return rc; +} + +/* xid, tcon, searchName and codepage are input parms, rest are returned */ +int +CIFSFindFirst(const int xid, struct cifs_tcon *tcon, + const char *searchName, + const struct nls_table *nls_codepage, + __u16 *pnetfid, + struct cifs_search_info *psrch_inf, int remap, const char dirsep) +{ +/* level 257 SMB_ */ + TRANSACTION2_FFIRST_REQ *pSMB = NULL; + TRANSACTION2_FFIRST_RSP *pSMBr = NULL; + T2_FFIRST_RSP_PARMS *parms; + int rc = 0; + int bytes_returned = 0; + int name_len; + __u16 params, byte_count; + + cFYI(1, "In FindFirst for %s", searchName); + +findFirstRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + /* We can not add the asterik earlier in case + it got remapped to 0xF03A as if it were part of the + directory name instead of a wildcard */ + name_len *= 2; + pSMB->FileName[name_len] = dirsep; + pSMB->FileName[name_len+1] = 0; + pSMB->FileName[name_len+2] = '*'; + pSMB->FileName[name_len+3] = 0; + name_len += 4; /* now the trailing null */ + pSMB->FileName[name_len] = 0; /* null terminate just in case */ + pSMB->FileName[name_len+1] = 0; + name_len += 2; + } else { /* BB add check for overrun of SMB buf BB */ + name_len = strnlen(searchName, PATH_MAX); +/* BB fix here and in unicode clause above ie + if (name_len > buffersize-header) + free buffer exit; BB */ + strncpy(pSMB->FileName, searchName, name_len); + pSMB->FileName[name_len] = dirsep; + pSMB->FileName[name_len+1] = '*'; + pSMB->FileName[name_len+2] = 0; + name_len += 3; + } + + params = 12 + name_len /* includes null */ ; + pSMB->TotalDataCount = 0; /* no EAs */ + pSMB->MaxParameterCount = cpu_to_le16(10); + pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; 
+ pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16( + offsetof(struct smb_com_transaction2_ffirst_req, SearchAttributes) + - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; /* one byte, no need to make endian neutral */ + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_FIRST); + pSMB->SearchAttributes = + cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | + ATTR_DIRECTORY); + pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO)); + pSMB->SearchFlags = cpu_to_le16(CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_RETURN_RESUME); + pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level); + + /* BB what should we set StorageType to? Does it matter? BB */ + pSMB->SearchStorageType = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_ffirst); + + if (rc) {/* BB add logic to retry regular search if Unix search + rejected unexpectedly by server */ + /* BB Add code to handle unsupported level rc */ + cFYI(1, "Error in FindFirst = %d", rc); + + cifs_buf_release(pSMB); + + /* BB eventually could optimize out free and realloc of buf */ + /* for this case */ + if (rc == -EAGAIN) + goto findFirstRetry; + } else { /* decode response */ + /* BB remember to free buffer if error BB */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc == 0) { + unsigned int lnoff; + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + psrch_inf->unicode = true; + else + psrch_inf->unicode = false; + + psrch_inf->ntwrk_buf_start = (char *)pSMBr; + psrch_inf->smallBuf = 0; + psrch_inf->srch_entries_start = + (char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset); + parms = (T2_FFIRST_RSP_PARMS *)((char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.ParameterOffset)); + + if (parms->EndofSearch) + psrch_inf->endOfSearch = true; + else + psrch_inf->endOfSearch = false; + + psrch_inf->entries_in_buffer = + le16_to_cpu(parms->SearchCount); + psrch_inf->index_of_last_entry = 2 /* skip . and .. 
*/ + + psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < + lnoff) { + cERROR(1, "ignoring corrupt resume name"); + psrch_inf->last_entry = NULL; + return rc; + } + + psrch_inf->last_entry = psrch_inf->srch_entries_start + + lnoff; + + *pnetfid = parms->SearchHandle; + } else { + cifs_buf_release(pSMB); + } + } + + return rc; +} + +int CIFSFindNext(const int xid, struct cifs_tcon *tcon, + __u16 searchHandle, struct cifs_search_info *psrch_inf) +{ + TRANSACTION2_FNEXT_REQ *pSMB = NULL; + TRANSACTION2_FNEXT_RSP *pSMBr = NULL; + T2_FNEXT_RSP_PARMS *parms; + char *response_data; + int rc = 0; + int bytes_returned; + unsigned int name_len; + __u16 params, byte_count; + + cFYI(1, "In FindNext"); + + if (psrch_inf->endOfSearch) + return -ENOENT; + + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 14; /* includes 2 bytes of null string, converted to LE below*/ + byte_count = 0; + pSMB->TotalDataCount = 0; /* no EAs */ + pSMB->MaxParameterCount = cpu_to_le16(8); + pSMB->MaxDataCount = + cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & + 0xFFFFFF00); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16( + offsetof(struct smb_com_transaction2_fnext_req,SearchHandle) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_NEXT); + pSMB->SearchHandle = searchHandle; /* always kept as le */ + pSMB->SearchCount = + cpu_to_le16(CIFSMaxBufSize / sizeof(FILE_UNIX_INFO)); + pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level); + pSMB->ResumeKey = psrch_inf->resume_key; + pSMB->SearchFlags = + cpu_to_le16(CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME); + + name_len = psrch_inf->resume_name_len; + params += name_len; + if (name_len < PATH_MAX) { + memcpy(pSMB->ResumeFileName, psrch_inf->presume_name, name_len); + byte_count += name_len; + /* 14 byte parm len above enough for 2 byte null terminator */ + pSMB->ResumeFileName[name_len] = 0; + pSMB->ResumeFileName[name_len+1] = 0; + } else { + rc = -EINVAL; + goto FNext2_err_exit; + } + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->num_fnext); + if (rc) { + if (rc == -EBADF) { + psrch_inf->endOfSearch = true; + cifs_buf_release(pSMB); + rc = 0; /* search probably was closed at end of search*/ + } else + cFYI(1, "FindNext returned = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc == 0) { + unsigned int lnoff; + + /* BB fixme add lock for file (srch_info) struct here */ + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + psrch_inf->unicode = true; + else + psrch_inf->unicode = false; + response_data = (char *) &pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.ParameterOffset); + parms = (T2_FNEXT_RSP_PARMS *)response_data; + response_data = (char *)&pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset); + if (psrch_inf->smallBuf) + cifs_small_buf_release( + psrch_inf->ntwrk_buf_start); + else + cifs_buf_release(psrch_inf->ntwrk_buf_start); + 
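+			/* previous response buffer released above; record the buffer now backing the entries and refresh the running search state from the FIND_NEXT parameter block */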
psrch_inf->srch_entries_start = response_data; + psrch_inf->ntwrk_buf_start = (char *)pSMB; + psrch_inf->smallBuf = 0; + if (parms->EndofSearch) + psrch_inf->endOfSearch = true; + else + psrch_inf->endOfSearch = false; + psrch_inf->entries_in_buffer = + le16_to_cpu(parms->SearchCount); + psrch_inf->index_of_last_entry += + psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < + lnoff) { + cERROR(1, "ignoring corrupt resume name"); + psrch_inf->last_entry = NULL; + return rc; + } else + psrch_inf->last_entry = + psrch_inf->srch_entries_start + lnoff; + +/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d", + psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */ + + /* BB fixme add unlock here */ + } + + } + + /* BB On error, should we leave previous search buf (and count and + last entry fields) intact or free the previous one? */ + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ +FNext2_err_exit: + if (rc != 0) + cifs_buf_release(pSMB); + return rc; +} + +int +CIFSFindClose(const int xid, struct cifs_tcon *tcon, + const __u16 searchHandle) +{ + int rc = 0; + FINDCLOSE_REQ *pSMB = NULL; + + cFYI(1, "In CIFSSMBFindClose"); + rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB); + + /* no sense returning error if session restarted + as file handle has been closed */ + if (rc == -EAGAIN) + return 0; + if (rc) + return rc; + + pSMB->FileID = searchHandle; + pSMB->ByteCount = 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cERROR(1, "Send error in FindClose = %d", rc); + + cifs_stats_inc(&tcon->num_fclose); + + /* Since session is dead, search handle closed on server already */ + if (rc == -EAGAIN) + rc = 0; + + return rc; +} + +int +CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + __u64 *inode_number, + const struct nls_table *nls_codepage, int remap) +{ + int rc = 0; + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int name_len, bytes_returned; + __u16 params, byte_count; + + cFYI(1, "In GetSrvInodeNum for %s", searchName); + if (tcon == NULL) + return -ENODEV; + +GetInodeNumberRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, name_len); + } + + params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max data count below from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = 
pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_INTERNAL_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "error %d in QueryInternalInfo", rc); + } else { + /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + /* If rc should we check for EOPNOSUPP and + disable the srvino flag? or in caller? */ + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + struct file_internal_info *pfinfo; + /* BB Do we need a cast or hash here ? */ + if (count < 8) { + cFYI(1, "Illegal size ret in QryIntrnlInf"); + rc = -EIO; + goto GetInodeNumOut; + } + pfinfo = (struct file_internal_info *) + (data_offset + (char *) &pSMBr->hdr.Protocol); + *inode_number = le64_to_cpu(pfinfo->UniqueId); + } + } +GetInodeNumOut: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto GetInodeNumberRetry; + return rc; +} + +/* parses DFS refferal V3 structure + * caller is responsible for freeing target_nodes + * returns: + * on success - 0 + * on failure - errno + */ +static int +parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, + unsigned int *num_of_nodes, + struct dfs_info3_param **target_nodes, + const struct nls_table *nls_codepage, int remap, + const char *searchName) +{ + int i, rc = 0; + char *data_end; + bool is_unicode; + struct dfs_referral_level_3 *ref; + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); + + if (*num_of_nodes < 1) { + cERROR(1, "num_referrals: must be at least > 0," + "but we get num_referrals = %d\n", *num_of_nodes); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + + ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); + if (ref->VersionNumber != cpu_to_le16(3)) { + cERROR(1, "Referrals of V%d version are not supported," + "should be V3", le16_to_cpu(ref->VersionNumber)); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + + /* get the upper boundary of the resp buffer */ + data_end = (char *)(&(pSMBr->PathConsumed)) + + le16_to_cpu(pSMBr->t2.DataCount); + + cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n", + *num_of_nodes, + le32_to_cpu(pSMBr->DFSFlags)); + + *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * + *num_of_nodes, GFP_KERNEL); + if (*target_nodes == NULL) { + cERROR(1, "Failed to allocate buffer for target_nodes\n"); + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + /* collect necessary data from referrals */ + for (i = 0; i < *num_of_nodes; i++) { + char *temp; + int max_len; + struct dfs_info3_param *node = (*target_nodes)+i; + + node->flags = le32_to_cpu(pSMBr->DFSFlags); + if (is_unicode) { + __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, + GFP_KERNEL); + if (tmp == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + cifsConvertToUCS((__le16 *) tmp, searchName, + PATH_MAX, nls_codepage, remap); + node->path_consumed = cifs_ucs2_bytes(tmp, + le16_to_cpu(pSMBr->PathConsumed), + nls_codepage); + kfree(tmp); + } else + node->path_consumed = le16_to_cpu(pSMBr->PathConsumed); + + node->server_type = le16_to_cpu(ref->ServerType); + node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags); + + /* copy DfsPath */ + temp = (char 
*)ref + le16_to_cpu(ref->DfsPathOffset); + max_len = data_end - temp; + node->path_name = cifs_strndup_from_ucs(temp, max_len, + is_unicode, nls_codepage); + if (!node->path_name) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } + + /* copy link target UNC */ + temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); + max_len = data_end - temp; + node->node_name = cifs_strndup_from_ucs(temp, max_len, + is_unicode, nls_codepage); + if (!node->node_name) + rc = -ENOMEM; + } + +parse_DFS_referrals_exit: + if (rc) { + free_dfs_info_array(*target_nodes, *num_of_nodes); + *target_nodes = NULL; + *num_of_nodes = 0; + } + return rc; +} + +int +CIFSGetDFSRefer(const int xid, struct cifs_ses *ses, + const unsigned char *searchName, + struct dfs_info3_param **target_nodes, + unsigned int *num_of_nodes, + const struct nls_table *nls_codepage, int remap) +{ +/* TRANS2_GET_DFS_REFERRAL */ + TRANSACTION2_GET_DFS_REFER_REQ *pSMB = NULL; + TRANSACTION2_GET_DFS_REFER_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + *num_of_nodes = 0; + *target_nodes = NULL; + + cFYI(1, "In GetDFSRefer the path %s", searchName); + if (ses == NULL) + return -ENODEV; +getDFSRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, NULL, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + /* server pointer checked in called function, + but should never be null here anyway */ + pSMB->hdr.Mid = GetNextMid(ses->server); + pSMB->hdr.Tid = ses->ipc_tid; + pSMB->hdr.Uid = ses->Suid; + if (ses->capabilities & CAP_STATUS32) + pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS; + if (ses->capabilities & CAP_DFS) + pSMB->hdr.Flags2 |= SMBFLG2_DFS; + + if (ses->capabilities & CAP_UNICODE) { + pSMB->hdr.Flags2 |= SMBFLG2_UNICODE; + name_len = + cifsConvertToUCS((__le16 *) pSMB->RequestFileName, + searchName, PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(searchName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->RequestFileName, searchName, name_len); + } + + if (ses->server) { + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + } + + pSMB->hdr.Uid = ses->Suid; + + params = 2 /* level */ + name_len /*includes null */ ; + pSMB->TotalDataCount = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->MaxParameterCount = 0; + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_get_dfs_refer_req, MaxReferralLevel) - 4); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_GET_DFS_REFERRAL); + byte_count = params + 3 /* pad */ ; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->MaxReferralLevel = cpu_to_le16(3); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in GetDFSRefer = %d", rc); + goto GetDFSRefExit; + } + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + /* BB Also check if enough total bytes returned? 
*/ + if (rc || get_bcc(&pSMBr->hdr) < 17) { + rc = -EIO; /* bad smb */ + goto GetDFSRefExit; + } + + cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d", + get_bcc(&pSMBr->hdr), + le16_to_cpu(pSMBr->t2.DataOffset)); + + /* parse returned result into more usable form */ + rc = parse_DFS_referrals(pSMBr, num_of_nodes, + target_nodes, nls_codepage, remap, + searchName); + +GetDFSRefExit: + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto getDFSRetry; + + return rc; +} + +/* Query File System Info such as free space to old servers such as Win 9x */ +int +SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData) +{ +/* level 0x01 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_ALLOC_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "OldQFSInfo"); +oldQFSInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QFSInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 18) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + cFYI(1, "qfsinf resp BCC: %d Offset %d", + get_bcc(&pSMBr->hdr), data_offset); + + response_data = (FILE_SYSTEM_ALLOC_INFO *) + (((char *) &pSMBr->hdr.Protocol) + data_offset); + FSData->f_bsize = + le16_to_cpu(response_data->BytesPerSector) * + le32_to_cpu(response_data-> + SectorsPerAllocationUnit); + FSData->f_blocks = + le32_to_cpu(response_data->TotalAllocationUnits); + FSData->f_bfree = FSData->f_bavail = + le32_to_cpu(response_data->FreeAllocationUnits); + cFYI(1, "Blocks: %lld Free: %lld Block size %ld", + (unsigned long long)FSData->f_blocks, + (unsigned long long)FSData->f_bfree, + FSData->f_bsize); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto oldQFSInfoRetry; + + return rc; +} + +int +CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData) +{ +/* level 0x103 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSInfo"); +QFSInfoRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); + 
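+	/* rest of the TRANS2 QUERY_FS_INFORMATION request: the only parameter is the two byte information level (FS size info); the totals come back in the data area as a FILE_SYSTEM_INFO structure */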
pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QFSInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 24) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + + response_data = + (FILE_SYSTEM_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + FSData->f_bsize = + le32_to_cpu(response_data->BytesPerSector) * + le32_to_cpu(response_data-> + SectorsPerAllocationUnit); + FSData->f_blocks = + le64_to_cpu(response_data->TotalAllocationUnits); + FSData->f_bfree = FSData->f_bavail = + le64_to_cpu(response_data->FreeAllocationUnits); + cFYI(1, "Blocks: %lld Free: %lld Block size %ld", + (unsigned long long)FSData->f_blocks, + (unsigned long long)FSData->f_bfree, + FSData->f_bsize); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSInfoRetry; + + return rc; +} + +int +CIFSSMBQFSAttributeInfo(const int xid, struct cifs_tcon *tcon) +{ +/* level 0x105 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_ATTRIBUTE_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSAttributeInfo"); +QFSAttributeRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cERROR(1, "Send error in QFSAttributeInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 13) { + /* BB also check if enough bytes returned */ + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_ATTRIBUTE_INFO + *) (((char *) 
&pSMBr->hdr.Protocol) + + data_offset); + memcpy(&tcon->fsAttrInfo, response_data, + sizeof(FILE_SYSTEM_ATTRIBUTE_INFO)); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSAttributeRetry; + + return rc; +} + +int +CIFSSMBQFSDeviceInfo(const int xid, struct cifs_tcon *tcon) +{ +/* level 0x104 SMB_QUERY_FILE_SYSTEM_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_DEVICE_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSDeviceInfo"); +QFSDeviceRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qfsi_req, InformationLevel) - 4); + + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QFSDeviceInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < + sizeof(FILE_SYSTEM_DEVICE_INFO)) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_DEVICE_INFO *) + (((char *) &pSMBr->hdr.Protocol) + + data_offset); + memcpy(&tcon->fsDevInfo, response_data, + sizeof(FILE_SYSTEM_DEVICE_INFO)); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSDeviceRetry; + + return rc; +} + +int +CIFSSMBQFSUnixInfo(const int xid, struct cifs_tcon *tcon) +{ +/* level 0x200 SMB_QUERY_CIFS_UNIX_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_UNIX_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSUnixInfo"); +QFSUnixRetry: + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof(struct + smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount 
= cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cERROR(1, "Send error in QFSUnixInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 13) { + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_UNIX_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + memcpy(&tcon->fsUnixInfo, response_data, + sizeof(FILE_SYSTEM_UNIX_INFO)); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSUnixRetry; + + + return rc; +} + +int +CIFSSMBSetFSUnixInfo(const int xid, struct cifs_tcon *tcon, __u64 cap) +{ +/* level 0x200 SMB_SET_CIFS_UNIX_INFO */ + TRANSACTION2_SETFSI_REQ *pSMB = NULL; + TRANSACTION2_SETFSI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count; + + cFYI(1, "In SETFSUnixInfo"); +SETFSUnixRetry: + /* BB switch to small buf init to save memory */ + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); + if (rc) + return rc; + + params = 4; /* 2 bytes zero followed by info level. */ + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_setfsi_req, FileNum) + - 4; + offset = param_offset + params; + + pSMB->MaxParameterCount = cpu_to_le16(4); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FS_INFORMATION); + byte_count = 1 /* pad */ + params + 12; + + pSMB->DataCount = cpu_to_le16(12); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + + /* Params. */ + pSMB->FileNum = 0; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_CIFS_UNIX_INFO); + + /* Data. 
*/ + pSMB->ClientUnixMajor = cpu_to_le16(CIFS_UNIX_MAJOR_VERSION); + pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION); + pSMB->ClientUnixCap = cpu_to_le64(cap); + + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cERROR(1, "Send error in SETFSUnixInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc) + rc = -EIO; /* bad smb */ + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SETFSUnixRetry; + + return rc; +} + + + +int +CIFSSMBQFSPosixInfo(const int xid, struct cifs_tcon *tcon, + struct kstatfs *FSData) +{ +/* level 0x201 SMB_QUERY_CIFS_POSIX_INFO */ + TRANSACTION2_QFSI_REQ *pSMB = NULL; + TRANSACTION2_QFSI_RSP *pSMBr = NULL; + FILE_SYSTEM_POSIX_INFO *response_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count; + + cFYI(1, "In QFSPosixInfo"); +QFSPosixRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + params = 2; /* level */ + pSMB->TotalDataCount = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + byte_count = params + 1 /* pad */ ; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(offsetof(struct + smb_com_transaction2_qfsi_req, InformationLevel) - 4); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO); + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QFSUnixInfo = %d", rc); + } else { /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + + if (rc || get_bcc(&pSMBr->hdr) < 13) { + rc = -EIO; /* bad smb */ + } else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + response_data = + (FILE_SYSTEM_POSIX_INFO + *) (((char *) &pSMBr->hdr.Protocol) + + data_offset); + FSData->f_bsize = + le32_to_cpu(response_data->BlockSize); + FSData->f_blocks = + le64_to_cpu(response_data->TotalBlocks); + FSData->f_bfree = + le64_to_cpu(response_data->BlocksAvail); + if (response_data->UserBlocksAvail == cpu_to_le64(-1)) { + FSData->f_bavail = FSData->f_bfree; + } else { + FSData->f_bavail = + le64_to_cpu(response_data->UserBlocksAvail); + } + if (response_data->TotalFileNodes != cpu_to_le64(-1)) + FSData->f_files = + le64_to_cpu(response_data->TotalFileNodes); + if (response_data->FreeFileNodes != cpu_to_le64(-1)) + FSData->f_ffree = + le64_to_cpu(response_data->FreeFileNodes); + } + } + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSPosixRetry; + + return rc; +} + + +/* We can not use write of zero bytes trick to + set file size due to need for large file support. 
Also note that + this SetPathInfo is preferred to SetFileInfo based method in next + routine which is only needed to work around a sharing violation bug + in Samba which this routine can run into */ + +int +CIFSSMBSetEOF(const int xid, struct cifs_tcon *tcon, const char *fileName, + __u64 size, bool SetAllocation, + const struct nls_table *nls_codepage, int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + struct file_end_of_file_info *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count, data_count, param_offset, offset; + + cFYI(1, "In SetEOF"); +SetEOFRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + params = 6 + name_len; + data_count = sizeof(struct file_end_of_file_info); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(4100); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + if (SetAllocation) { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO); + } else /* Set File Size */ { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO); + } + + parm_data = + (struct file_end_of_file_info *) (((char *) &pSMB->hdr.Protocol) + + offset); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + data_count; + pSMB->DataCount = cpu_to_le16(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + parm_data->FileSize = cpu_to_le64(size); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "SetPathInfo (file size) returned %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetEOFRetry; + + return rc; +} + +int +CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon, __u64 size, + __u16 fid, __u32 pid_of_opener, bool SetAllocation) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct file_end_of_file_info *parm_data; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "SetFileSize (via SetFileInfo) %lld", + (long long)size); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = 
cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + count = sizeof(struct file_end_of_file_info); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + parm_data = + (struct file_end_of_file_info *) (((char *) &pSMB->hdr.Protocol) + + offset); + pSMB->DataOffset = cpu_to_le16(offset); + parm_data->FileSize = cpu_to_le64(size); + pSMB->Fid = fid; + if (SetAllocation) { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO); + } else /* Set File Size */ { + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO2); + else + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO); + } + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) { + cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc); + } + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +/* Some legacy servers such as NT4 require that the file times be set on + an open handle, rather than by pathname - this is awkward due to + potential access conflicts on the open, but it is unavoidable for these + old servers since the only other choice is to go from 100 nanosecond DCE + time and resort to the original setpathinfo level which takes the ancient + DOS time format with 2 second granularity */ +int +CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon, + const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + char *data_offset; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "Set Times (via SetFileInfo)"); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + + count = sizeof(FILE_BASIC_INFO); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = 
cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO2); + else + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon, + bool delete_file, __u16 fid, __u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + char *data_offset; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "Set File Disposition (via SetFileInfo)"); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + + count = 1; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + *data_offset = delete_file ? 
1 : 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cFYI(1, "Send error in SetFileDisposition = %d", rc); + + return rc; +} + +int +CIFSSMBSetPathInfo(const int xid, struct cifs_tcon *tcon, + const char *fileName, const FILE_BASIC_INFO *data, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + int name_len; + int rc = 0; + int bytes_returned = 0; + char *data_offset; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "In SetTimes"); + +SetTimesRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + count = sizeof(FILE_BASIC_INFO); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU) + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO2); + else + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "SetPathInfo (times) returned %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetTimesRetry; + + return rc; +} + +/* Can not be used to set time stamps yet (due to old DOS time format) */ +/* Can be used to set attributes */ +#if 0 /* Possibly not needed - since it turns out that strangely NT4 has a bug + handling it anyway and NT4 was what we thought it would be needed for + Do not delete it until we prove whether needed for Win9x though */ +int +CIFSSMBSetAttrLegacy(int xid, struct cifs_tcon *tcon, char *fileName, + __u16 dos_attrs, const struct nls_table *nls_codepage) +{ + SETATTR_REQ *pSMB = NULL; + SETATTR_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + + cFYI(1, "In SetAttrLegacy"); + +SetAttrLgcyRetry: + rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + ConvertToUCS((__le16 *) pSMB->fileName, fileName, + PATH_MAX, nls_codepage); + name_len++; /* trailing null */ + name_len *= 2; + } else 
{ /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->fileName, fileName, name_len); + } + pSMB->attr = cpu_to_le16(dos_attrs); + pSMB->BufferFormat = 0x04; + inc_rfc1001_len(pSMB, name_len + 1); + pSMB->ByteCount = cpu_to_le16(name_len + 1); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "Error in LegacySetAttr = %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetAttrLgcyRetry; + + return rc; +} +#endif /* temporarily unneeded SetAttr legacy function */ + +static void +cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, + const struct cifs_unix_set_info_args *args) +{ + u64 mode = args->mode; + + /* + * Samba server ignores set of file size to zero due to bugs in some + * older clients, but we should be precise - we use SetFileSize to + * set file size and do not want to truncate file size to zero + * accidentally as happened on one Samba server beta by putting + * zero instead of -1 here + */ + data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64); + data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64); + data_offset->LastStatusChange = cpu_to_le64(args->ctime); + data_offset->LastAccessTime = cpu_to_le64(args->atime); + data_offset->LastModificationTime = cpu_to_le64(args->mtime); + data_offset->Uid = cpu_to_le64(args->uid); + data_offset->Gid = cpu_to_le64(args->gid); + /* better to leave device as zero when it is */ + data_offset->DevMajor = cpu_to_le64(MAJOR(args->device)); + data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); + data_offset->Permissions = cpu_to_le64(mode); + + if (S_ISREG(mode)) + data_offset->Type = cpu_to_le32(UNIX_FILE); + else if (S_ISDIR(mode)) + data_offset->Type = cpu_to_le32(UNIX_DIR); + else if (S_ISLNK(mode)) + data_offset->Type = cpu_to_le32(UNIX_SYMLINK); + else if (S_ISCHR(mode)) + data_offset->Type = cpu_to_le32(UNIX_CHARDEV); + else if (S_ISBLK(mode)) + data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV); + else if (S_ISFIFO(mode)) + data_offset->Type = cpu_to_le32(UNIX_FIFO); + else if (S_ISSOCK(mode)) + data_offset->Type = cpu_to_le32(UNIX_SOCKET); +} + +int +CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon, + const struct cifs_unix_set_info_args *args, + u16 fid, u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + FILE_UNIX_BASIC_INFO *data_offset; + int rc = 0; + u16 params, param_offset, offset, byte_count, count; + + cFYI(1, "Set Unix Info (via SetFileInfo)"); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (FILE_UNIX_BASIC_INFO *) + ((char *)(&pSMB->hdr.Protocol) + offset); + count = sizeof(FILE_UNIX_BASIC_INFO); + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = 
pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + cifs_fill_unix_set_info(data_offset, args); + + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBUnixSetPathInfo(const int xid, struct cifs_tcon *tcon, char *fileName, + const struct cifs_unix_set_info_args *args, + const struct nls_table *nls_codepage, int remap) +{ + TRANSACTION2_SPI_REQ *pSMB = NULL; + TRANSACTION2_SPI_RSP *pSMBr = NULL; + int name_len; + int rc = 0; + int bytes_returned = 0; + FILE_UNIX_BASIC_INFO *data_offset; + __u16 params, param_offset, offset, count, byte_count; + + cFYI(1, "In SetUID/GID/Mode"); +setPermsRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + count = sizeof(FILE_UNIX_BASIC_INFO); + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + data_offset = + (FILE_UNIX_BASIC_INFO *) ((char *) &pSMB->hdr.Protocol + + offset); + memset(data_offset, 0, count); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->DataCount = cpu_to_le16(count); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + + cifs_fill_unix_set_info(data_offset, args); + + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "SetPathInfo (perms) returned %d", rc); + + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto setPermsRetry; + return rc; +} + +#ifdef CONFIG_CIFS_XATTR +/* + * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common + * function used by listxattr and getxattr type calls. When ea_name is set, + * it looks for that attribute name and stuffs that value into the EAData + * buffer. When ea_name is NULL, it stuffs a list of attribute names into the + * buffer. 
In both cases, the return value is either the length of the + * resulting data or a negative error code. If EAData is a NULL pointer then + * the data isn't copied to it, but the length is returned. + */ +ssize_t +CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, const unsigned char *ea_name, + char *EAData, size_t buf_size, + const struct nls_table *nls_codepage, int remap) +{ + /* BB assumes one setup word */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int list_len; + struct fealist *ea_response_data; + struct fea *temp_fea; + char *temp_ptr; + char *end_of_smb; + __u16 params, byte_count, data_offset; + + cFYI(1, "In Query All EAs path %s", searchName); +QAllEAsRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + list_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); + list_len++; /* trailing null */ + list_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + list_len = strnlen(searchName, PATH_MAX); + list_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, list_len); + } + + params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16(offsetof( + struct smb_com_transaction2_qpi_req, InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, "Send error in QueryAllEAs = %d", rc); + goto QAllEAsOut; + } + + + /* BB also check enough total bytes returned */ + /* BB we need to improve the validity checking + of these trans2 responses */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc || get_bcc(&pSMBr->hdr) < 4) { + rc = -EIO; /* bad smb */ + goto QAllEAsOut; + } + + /* check that length of list is not more than bcc */ + /* check that each entry does not go beyond length + of list */ + /* check that each element of each entry does not + go beyond end of list */ + /* validate_trans2_offsets() */ + /* BB check if start of smb + data_offset > &bcc+ bcc */ + + data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + ea_response_data = (struct fealist *) + (((char *) &pSMBr->hdr.Protocol) + data_offset); + + list_len = le32_to_cpu(ea_response_data->list_len); + cFYI(1, "ea length %d", list_len); + if (list_len <= 8) { + cFYI(1, "empty EA list returned from server"); + goto QAllEAsOut; + } + + /* make sure list_len doesn't go past end of SMB */ + end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr); + if ((char *)ea_response_data + list_len > end_of_smb) { + cFYI(1, "EA list appears to go 
beyond SMB"); + rc = -EIO; + goto QAllEAsOut; + } + + /* account for ea list len */ + list_len -= 4; + temp_fea = ea_response_data->list; + temp_ptr = (char *)temp_fea; + while (list_len > 0) { + unsigned int name_len; + __u16 value_len; + + list_len -= 4; + temp_ptr += 4; + /* make sure we can read name_len and value_len */ + if (list_len < 0) { + cFYI(1, "EA entry goes beyond length of list"); + rc = -EIO; + goto QAllEAsOut; + } + + name_len = temp_fea->name_len; + value_len = le16_to_cpu(temp_fea->value_len); + list_len -= name_len + 1 + value_len; + if (list_len < 0) { + cFYI(1, "EA entry goes beyond length of list"); + rc = -EIO; + goto QAllEAsOut; + } + + if (ea_name) { + if (strncmp(ea_name, temp_ptr, name_len) == 0) { + temp_ptr += name_len + 1; + rc = value_len; + if (buf_size == 0) + goto QAllEAsOut; + if ((size_t)value_len > buf_size) { + rc = -ERANGE; + goto QAllEAsOut; + } + memcpy(EAData, temp_ptr, value_len); + goto QAllEAsOut; + } + } else { + /* account for prefix user. and trailing null */ + rc += (5 + 1 + name_len); + if (rc < (int) buf_size) { + memcpy(EAData, "user.", 5); + EAData += 5; + memcpy(EAData, temp_ptr, name_len); + EAData += name_len; + /* null terminate name */ + *EAData = 0; + ++EAData; + } else if (buf_size == 0) { + /* skip copy - calc size only */ + } else { + /* stop before overrun buffer */ + rc = -ERANGE; + break; + } + } + temp_ptr += name_len + 1 + value_len; + temp_fea = (struct fea *)temp_ptr; + } + + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; + +QAllEAsOut: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QAllEAsRetry; + + return (ssize_t)rc; +} + +int +CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, const char *fileName, + const char *ea_name, const void *ea_value, + const __u16 ea_value_len, const struct nls_table *nls_codepage, + int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + struct fealist *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, byte_count, offset, count; + + cFYI(1, "In SetEA"); +SetEARetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { /* BB improve the check for buffer overruns BB */ + name_len = strnlen(fileName, PATH_MAX); + name_len++; /* trailing null */ + strncpy(pSMB->FileName, fileName, name_len); + } + + params = 6 + name_len; + + /* done calculating parms using name_len of file name, + now use name_len to calculate length of ea name + we are going to create in the inode xattrs */ + if (ea_name == NULL) + name_len = 0; + else + name_len = strnlen(ea_name, 255); + + count = sizeof(*parm_data) + ea_value_len + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + pSMB->InformationLevel = + cpu_to_le16(SMB_SET_FILE_EA); + + parm_data = + (struct fealist *) (((char *) &pSMB->hdr.Protocol) + + offset); + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = 
cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + parm_data->list_len = cpu_to_le32(count); + parm_data->list[0].EA_flags = 0; + /* we checked above that name len is less than 255 */ + parm_data->list[0].name_len = (__u8)name_len; + /* EA names are always ASCII */ + if (ea_name) + strncpy(parm_data->list[0].name, ea_name, name_len); + parm_data->list[0].name[name_len] = 0; + parm_data->list[0].value_len = cpu_to_le16(ea_value_len); + /* caller ensures that ea_value_len is less than 64K but + we need to ensure that it fits within the smb */ + + /*BB add length check to see if it would fit in + negotiated SMB buffer size BB */ + /* if (ea_value_len > buffer_size - 512 (enough for header)) */ + if (ea_value_len) + memcpy(parm_data->list[0].name+name_len+1, + ea_value, ea_value_len); + + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cFYI(1, "SetPathInfo (EA) returned %d", rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetEARetry; + + return rc; +} +#endif + +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */ +/* + * Years ago the kernel added a "dnotify" function for Samba server, + * to allow network clients (such as Windows) to display updated + * lists of files in directory listings automatically when + * files are added by one user when another user has the + * same directory open on their desktop. The Linux cifs kernel + * client hooked into the kernel side of this interface for + * the same reason, but ironically when the VFS moved from + * "dnotify" to "inotify" it became harder to plug in Linux + * network file system clients (the most obvious use case + * for notify interfaces is when multiple users can update + * the contents of the same directory - exactly what network + * file systems can do) although the server (Samba) could + * still use it. For the short term we leave the worker + * function ifdeffed out (below) until inotify is fixed + * in the VFS to make it easier to plug in network file + * system clients. If inotify turns out to be permanently + * incompatible for network fs clients, we could instead simply + * expose this config flag by adding a future cifs (and smb2) notify ioctl. 
+ */ +int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, + const int notify_subdirs, const __u16 netfid, + __u32 filter, struct file *pfile, int multishot, + const struct nls_table *nls_codepage) +{ + int rc = 0; + struct smb_com_transaction_change_notify_req *pSMB = NULL; + struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL; + struct dir_notify_req *dnotify_req; + int bytes_returned; + + cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->TotalParameterCount = 0 ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le32(2); + /* BB find exact data count max from sess structure BB */ + pSMB->MaxDataCount = 0; /* same in little endian or be */ +/* BB VERIFY verify which is correct for above BB */ + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 4; /* single byte does not need le conversion */ + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE); + pSMB->ParameterCount = pSMB->TotalParameterCount; + if (notify_subdirs) + pSMB->WatchTree = 1; /* one byte - no le conversion needed */ + pSMB->Reserved2 = 0; + pSMB->CompletionFilter = cpu_to_le32(filter); + pSMB->Fid = netfid; /* file handle always le */ + pSMB->ByteCount = 0; + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *)pSMBr, &bytes_returned, + CIFS_ASYNC_OP); + if (rc) { + cFYI(1, "Error in Notify = %d", rc); + } else { + /* Add file to outstanding requests */ + /* BB change to kmem cache alloc */ + dnotify_req = kmalloc( + sizeof(struct dir_notify_req), + GFP_KERNEL); + if (dnotify_req) { + dnotify_req->Pid = pSMB->hdr.Pid; + dnotify_req->PidHigh = pSMB->hdr.PidHigh; + dnotify_req->Mid = pSMB->hdr.Mid; + dnotify_req->Tid = pSMB->hdr.Tid; + dnotify_req->Uid = pSMB->hdr.Uid; + dnotify_req->netfid = netfid; + dnotify_req->pfile = pfile; + dnotify_req->filter = filter; + dnotify_req->multishot = multishot; + spin_lock(&GlobalMid_Lock); + list_add_tail(&dnotify_req->lhead, + &GlobalDnotifyReqList); + spin_unlock(&GlobalMid_Lock); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + return rc; +} +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c new file mode 100644 index 00000000..b7758094 --- /dev/null +++ b/fs/cifs/connect.c @@ -0,0 +1,3719 @@ +/* + * fs/cifs/connect.c + * + * Copyright (C) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "ntlmssp.h" +#include "nterr.h" +#include "rfc1002pdu.h" +#include "fscache.h" + +#define CIFS_PORT 445 +#define RFC1001_PORT 139 + +/* SMB echo "timeout" -- FIXME: tunable? */ +#define SMB_ECHO_INTERVAL (60 * HZ) + +extern mempool_t *cifs_req_poolp; + +/* FIXME: should these be tunable? */ +#define TLINK_ERROR_EXPIRE (1 * HZ) +#define TLINK_IDLE_EXPIRE (600 * HZ) + +static int ip_connect(struct TCP_Server_Info *server); +static int generic_ip_connect(struct TCP_Server_Info *server); +static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); +static void cifs_prune_tlinks(struct work_struct *work); +static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname); + +/* + * cifs tcp session reconnection + * + * mark tcp session as reconnecting so temporarily locked + * mark all smb sessions as reconnecting for tcp session + * reconnect tcp session + * wake up waiters on reconnection? - (not needed currently) + */ +static int +cifs_reconnect(struct TCP_Server_Info *server) +{ + int rc = 0; + struct list_head *tmp, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct mid_q_entry *mid_entry; + struct list_head retry_list; + + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus == CifsExiting) { + /* the demux thread will exit normally + next time through the loop */ + spin_unlock(&GlobalMid_Lock); + return rc; + } else + server->tcpStatus = CifsNeedReconnect; + spin_unlock(&GlobalMid_Lock); + server->maxBuf = 0; + + cFYI(1, "Reconnecting tcp session"); + + /* before reconnecting the tcp session, mark the smb session (uid) + and the tid bad so they are not used until reconnected */ + cFYI(1, "%s: marking sessions and tcons for reconnect", __func__); + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + ses->need_reconnect = true; + ses->ipc_tid = 0; + list_for_each(tmp2, &ses->tcon_list) { + tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); + tcon->need_reconnect = true; + } + } + spin_unlock(&cifs_tcp_ses_lock); + + /* do not want to be sending data on a socket we are freeing */ + cFYI(1, "%s: tearing down socket", __func__); + mutex_lock(&server->srv_mutex); + if (server->ssocket) { + cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, + server->ssocket->flags); + kernel_sock_shutdown(server->ssocket, SHUT_WR); + cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx", + server->ssocket->state, + server->ssocket->flags); + sock_release(server->ssocket); + server->ssocket = NULL; + } + server->sequence_number = 0; + server->session_estab = false; + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; + server->lstrp = jiffies; + mutex_unlock(&server->srv_mutex); + + /* mark submitted MIDs for retry and issue callback */ + INIT_LIST_HEAD(&retry_list); + cFYI(1, "%s: moving mids to private list", __func__); + 
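+	/* under GlobalMid_Lock, mark each in-flight mid for retry and move it to a private list; the callbacks are invoked below, only after the lock has been dropped */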
spin_lock(&GlobalMid_Lock); + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + if (mid_entry->midState == MID_REQUEST_SUBMITTED) + mid_entry->midState = MID_RETRY_NEEDED; + list_move(&mid_entry->qhead, &retry_list); + } + spin_unlock(&GlobalMid_Lock); + + cFYI(1, "%s: issuing mid callbacks", __func__); + list_for_each_safe(tmp, tmp2, &retry_list) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); + } + + do { + try_to_freeze(); + + /* we should try only the port we connected to before */ + rc = generic_ip_connect(server); + if (rc) { + cFYI(1, "reconnect error %d", rc); + msleep(3000); + } else { + atomic_inc(&tcpSesReconnectCount); + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsNeedNegotiate; + spin_unlock(&GlobalMid_Lock); + } + } while (server->tcpStatus == CifsNeedReconnect); + + return rc; +} + +/* + return codes: + 0 not a transact2, or all data present + >0 transact2 with that much data missing + -EINVAL = invalid transact2 + + */ +static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) +{ + struct smb_t2_rsp *pSMBt; + int remaining; + __u16 total_data_size, data_in_this_rsp; + + if (pSMB->Command != SMB_COM_TRANSACTION2) + return 0; + + /* check for plausible wct, bcc and t2 data and parm sizes */ + /* check for parm and data offset going beyond end of smb */ + if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ + cFYI(1, "invalid transact2 word count"); + return -EINVAL; + } + + pSMBt = (struct smb_t2_rsp *)pSMB; + + total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); + data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); + + if (total_data_size == data_in_this_rsp) + return 0; + else if (total_data_size < data_in_this_rsp) { + cFYI(1, "total data %d smaller than data in frame %d", + total_data_size, data_in_this_rsp); + return -EINVAL; + } + + remaining = total_data_size - data_in_this_rsp; + + cFYI(1, "missing %d bytes from transact2, check next response", + remaining); + if (total_data_size > maxBufSize) { + cERROR(1, "TotalDataSize %d is over maximum buffer %d", + total_data_size, maxBufSize); + return -EINVAL; + } + return remaining; +} + +static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) +{ + struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; + struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; + char *data_area_of_target; + char *data_area_of_buf2; + int remaining; + unsigned int byte_count, total_in_buf; + __u16 total_data_size, total_in_buf2; + + total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); + + if (total_data_size != + get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount)) + cFYI(1, "total data size of primary and secondary t2 differ"); + + total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); + + remaining = total_data_size - total_in_buf; + + if (remaining < 0) + return -EPROTO; + + if (remaining == 0) /* nothing to do, ignore */ + return 0; + + total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount); + if (remaining < total_in_buf2) { + cFYI(1, "transact2 2nd response contains too much data"); + } + + /* find end of first SMB data area */ + data_area_of_target = (char *)&pSMBt->hdr.Protocol + + get_unaligned_le16(&pSMBt->t2_rsp.DataOffset); + /* validate target area */ + + data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol + + 
get_unaligned_le16(&pSMB2->t2_rsp.DataOffset); + + data_area_of_target += total_in_buf; + + /* copy second buffer into end of first buffer */ + total_in_buf += total_in_buf2; + /* is the result too big for the field? */ + if (total_in_buf > USHRT_MAX) + return -EPROTO; + put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount); + + /* fix up the BCC */ + byte_count = get_bcc(pTargetSMB); + byte_count += total_in_buf2; + /* is the result too big for the field? */ + if (byte_count > USHRT_MAX) + return -EPROTO; + put_bcc(byte_count, pTargetSMB); + + byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); + byte_count += total_in_buf2; + /* don't allow buffer to overflow */ + if (byte_count > CIFSMaxBufSize) + return -ENOBUFS; + pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); + + memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); + + if (remaining == total_in_buf2) { + cFYI(1, "found the last secondary response"); + return 0; /* we are done */ + } else /* more responses to go */ + return 1; +} + +static void +cifs_echo_request(struct work_struct *work) +{ + int rc; + struct TCP_Server_Info *server = container_of(work, + struct TCP_Server_Info, echo.work); + + /* + * We cannot send an echo until the NEGOTIATE_PROTOCOL request is + * done, which is indicated by maxBuf != 0. Also, no need to ping if + * we got a response recently + */ + if (server->maxBuf == 0 || + time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + goto requeue_echo; + + rc = CIFSSMBEcho(server); + if (rc) + cFYI(1, "Unable to send echo request to server: %s", + server->hostname); + +requeue_echo: + queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); +} + +static int +cifs_demultiplex_thread(struct TCP_Server_Info *server) +{ + int length; + unsigned int pdu_length, total_read; + struct smb_hdr *smb_buffer = NULL; + struct smb_hdr *bigbuf = NULL; + struct smb_hdr *smallbuf = NULL; + struct msghdr smb_msg; + struct kvec iov; + struct socket *csocket = server->ssocket; + struct list_head *tmp, *tmp2; + struct task_struct *task_to_wake = NULL; + struct mid_q_entry *mid_entry; + char temp; + bool isLargeBuf = false; + bool isMultiRsp; + int reconnect; + + current->flags |= PF_MEMALLOC; + cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); + + length = atomic_inc_return(&tcpSesAllocCount); + if (length > 1) + mempool_resize(cifs_req_poolp, length + cifs_min_rcv, + GFP_KERNEL); + + set_freezable(); + while (server->tcpStatus != CifsExiting) { + if (try_to_freeze()) + continue; + if (bigbuf == NULL) { + bigbuf = cifs_buf_get(); + if (!bigbuf) { + cERROR(1, "No memory for large SMB response"); + msleep(3000); + /* retry will check if exiting */ + continue; + } + } else if (isLargeBuf) { + /* we are reusing a dirty large buf, clear its start */ + memset(bigbuf, 0, sizeof(struct smb_hdr)); + } + + if (smallbuf == NULL) { + smallbuf = cifs_small_buf_get(); + if (!smallbuf) { + cERROR(1, "No memory for SMB response"); + msleep(1000); + /* retry will check if exiting */ + continue; + } + /* beginning of smb buffer is cleared in our buf_get */ + } else /* if existing small buf clear beginning */ + memset(smallbuf, 0, sizeof(struct smb_hdr)); + + isLargeBuf = false; + isMultiRsp = false; + smb_buffer = smallbuf; + iov.iov_base = smb_buffer; + iov.iov_len = 4; + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + pdu_length = 4; /* enough to get RFC1001 header */ + +incomplete_rcv: + if (echo_retries > 0 && server->tcpStatus == CifsGood && + time_after(jiffies, server->lstrp + + 
(echo_retries * SMB_ECHO_INTERVAL))) { + cERROR(1, "Server %s has not responded in %d seconds. " + "Reconnecting...", server->hostname, + (echo_retries * SMB_ECHO_INTERVAL / HZ)); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } + + length = + kernel_recvmsg(csocket, &smb_msg, + &iov, 1, pdu_length, 0 /* BB other flags? */); + + if (server->tcpStatus == CifsExiting) { + break; + } else if (server->tcpStatus == CifsNeedReconnect) { + cFYI(1, "Reconnect after server stopped responding"); + cifs_reconnect(server); + cFYI(1, "call to reconnect done"); + csocket = server->ssocket; + continue; + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { + msleep(1); /* minimum sleep to prevent looping + allowing socket to clear and app threads to set + tcpStatus CifsNeedReconnect if server hung */ + if (pdu_length < 4) { + iov.iov_base = (4 - pdu_length) + + (char *)smb_buffer; + iov.iov_len = pdu_length; + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + goto incomplete_rcv; + } else + continue; + } else if (length <= 0) { + cFYI(1, "Reconnect after unexpected peek error %d", + length); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } else if (length < pdu_length) { + cFYI(1, "requested %d bytes but only got %d bytes", + pdu_length, length); + pdu_length -= length; + msleep(1); + goto incomplete_rcv; + } + + /* The right amount was read from socket - 4 bytes */ + /* so we can now interpret the length field */ + + /* the first byte big endian of the length field, + is actually not part of the length but the type + with the most common, zero, as regular data */ + temp = *((char *) smb_buffer); + + /* Note that FC 1001 length is big endian on the wire, + but we convert it here so it is always manipulated + as host byte order */ + pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + + cFYI(1, "rfc1002 length 0x%x", pdu_length+4); + + if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { + continue; + } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { + cFYI(1, "Good RFC 1002 session rsp"); + continue; + } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { + /* we get this from Windows 98 instead of + an error on SMB negprot response */ + cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", + pdu_length); + /* give server a second to clean up */ + msleep(1000); + /* always try 445 first on reconnect since we get NACK + * on some if we ever connected to port 139 (the NACK + * is since we do not begin with RFC1001 session + * initialize frame) + */ + cifs_set_port((struct sockaddr *) + &server->dstaddr, CIFS_PORT); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } else if (temp != (char) 0) { + cERROR(1, "Unknown RFC 1002 frame"); + cifs_dump_mem(" Received Data: ", (char *)smb_buffer, + length); + cifs_reconnect(server); + csocket = server->ssocket; + continue; + } + + /* else we have an SMB response */ + if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || + (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { + cERROR(1, "Invalid size SMB length %d pdu_length %d", + length, pdu_length+4); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } + + /* else length ok */ + reconnect = 0; + + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { + isLargeBuf = true; + memcpy(bigbuf, smallbuf, 4); + smb_buffer = bigbuf; + } + length = 0; + 
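+		/* the 4-byte RFC1002 header has already been read; now receive the remaining pdu_length bytes of the SMB into the buffer just past that header */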
iov.iov_base = 4 + (char *)smb_buffer; + iov.iov_len = pdu_length; + for (total_read = 0; total_read < pdu_length; + total_read += length) { + length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, + pdu_length - total_read, 0); + if (server->tcpStatus == CifsExiting) { + /* then will exit */ + reconnect = 2; + break; + } else if (server->tcpStatus == CifsNeedReconnect) { + cifs_reconnect(server); + csocket = server->ssocket; + /* Reconnect wakes up rspns q */ + /* Now we will reread sock */ + reconnect = 1; + break; + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { + msleep(1); /* minimum sleep to prevent looping, + allowing socket to clear and app + threads to set tcpStatus + CifsNeedReconnect if server hung*/ + length = 0; + continue; + } else if (length <= 0) { + cERROR(1, "Received no data, expecting %d", + pdu_length - total_read); + cifs_reconnect(server); + csocket = server->ssocket; + reconnect = 1; + break; + } + } + if (reconnect == 2) + break; + else if (reconnect == 1) + continue; + + total_read += 4; /* account for rfc1002 hdr */ + + dump_smb(smb_buffer, total_read); + + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. + */ + length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); + if (length != 0) + cifs_dump_mem("Bad SMB: ", smb_buffer, + min_t(unsigned int, total_read, 48)); + + mid_entry = NULL; + server->lstrp = jiffies; + + spin_lock(&GlobalMid_Lock); + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + + if (mid_entry->mid != smb_buffer->Mid || + mid_entry->midState != MID_REQUEST_SUBMITTED || + mid_entry->command != smb_buffer->Command) { + mid_entry = NULL; + continue; + } + + if (length == 0 && + check2ndT2(smb_buffer, server->maxBuf) > 0) { + /* We have a multipart transact2 resp */ + isMultiRsp = true; + if (mid_entry->resp_buf) { + /* merge response - fix up 1st*/ + length = coalesce_t2(smb_buffer, + mid_entry->resp_buf); + if (length > 0) { + length = 0; + mid_entry->multiRsp = true; + break; + } else { + /* all parts received or + * packet is malformed + */ + mid_entry->multiEnd = true; + goto multi_t2_fnd; + } + } else { + if (!isLargeBuf) { + /* + * FIXME: switch to already + * allocated largebuf? + */ + cERROR(1, "1st trans2 resp " + "needs bigbuf"); + } else { + /* Have first buffer */ + mid_entry->resp_buf = + smb_buffer; + mid_entry->largeBuf = true; + bigbuf = NULL; + } + } + break; + } + mid_entry->resp_buf = smb_buffer; + mid_entry->largeBuf = isLargeBuf; +multi_t2_fnd: + if (length == 0) + mid_entry->midState = MID_RESPONSE_RECEIVED; + else + mid_entry->midState = MID_RESPONSE_MALFORMED; +#ifdef CONFIG_CIFS_STATS2 + mid_entry->when_received = jiffies; +#endif + list_del_init(&mid_entry->qhead); + break; + } + spin_unlock(&GlobalMid_Lock); + + if (mid_entry != NULL) { + mid_entry->callback(mid_entry); + /* Was previous buf put in mpx struct for multi-rsp? 
*/ + if (!isMultiRsp) { + /* smb buffer will be freed by user thread */ + if (isLargeBuf) + bigbuf = NULL; + else + smallbuf = NULL; + } + } else if (length != 0) { + /* response sanity checks failed */ + continue; + } else if (!is_valid_oplock_break(smb_buffer, server) && + !isMultiRsp) { + cERROR(1, "No task to wake, unknown frame received! " + "NumMids %d", atomic_read(&midCount)); + cifs_dump_mem("Received Data is: ", (char *)smb_buffer, + sizeof(struct smb_hdr)); +#ifdef CONFIG_CIFS_DEBUG2 + cifs_dump_detail(smb_buffer); + cifs_dump_mids(server); +#endif /* CIFS_DEBUG2 */ + + } + } /* end while !EXITING */ + + /* take it off the list, if it's not already */ + spin_lock(&cifs_tcp_ses_lock); + list_del_init(&server->tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&GlobalMid_Lock); + server->tcpStatus = CifsExiting; + spin_unlock(&GlobalMid_Lock); + wake_up_all(&server->response_q); + + /* check if we have blocked requests that need to free */ + /* Note that cifs_max_pending is normally 50, but + can be set at module install time to as little as two */ + spin_lock(&GlobalMid_Lock); + if (atomic_read(&server->inFlight) >= cifs_max_pending) + atomic_set(&server->inFlight, cifs_max_pending - 1); + /* We do not want to set the max_pending too low or we + could end up with the counter going negative */ + spin_unlock(&GlobalMid_Lock); + /* Although there should not be any requests blocked on + this queue it can not hurt to be paranoid and try to wake up requests + that may haven been blocked when more than 50 at time were on the wire + to the same server - they now will see the session is in exit state + and get out of SendReceive. */ + wake_up_all(&server->request_q); + /* give those requests time to exit */ + msleep(125); + + if (server->ssocket) { + sock_release(csocket); + server->ssocket = NULL; + } + /* buffer usually freed in free_mid - need to free it here on exit */ + cifs_buf_release(bigbuf); + if (smallbuf) /* no sense logging a debug message if NULL */ + cifs_small_buf_release(smallbuf); + + if (!list_empty(&server->pending_mid_q)) { + struct list_head dispose_list; + + INIT_LIST_HEAD(&dispose_list); + spin_lock(&GlobalMid_Lock); + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cFYI(1, "Clearing mid 0x%x", mid_entry->mid); + mid_entry->midState = MID_SHUTDOWN; + list_move(&mid_entry->qhead, &dispose_list); + } + spin_unlock(&GlobalMid_Lock); + + /* now walk dispose list and issue callbacks */ + list_for_each_safe(tmp, tmp2, &dispose_list) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cFYI(1, "Callback mid 0x%x", mid_entry->mid); + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); + } + /* 1/8th of sec is more than enough time for them to exit */ + msleep(125); + } + + if (!list_empty(&server->pending_mid_q)) { + /* mpx threads have not exited yet give them + at least the smb send timeout time for long ops */ + /* due to delays on oplock break requests, we need + to wait at least 45 seconds before giving up + on a request getting a response and going ahead + and killing cifsd */ + cFYI(1, "Wait for exit from demultiplex thread"); + msleep(46000); + /* if threads still have not exited they are probably never + coming home not much else we can do but free the memory */ + } + + kfree(server->hostname); + task_to_wake = xchg(&server->tsk, NULL); + kfree(server); + + length = atomic_dec_return(&tcpSesAllocCount); + if (length > 0) + mempool_resize(cifs_req_poolp, length 
+ cifs_min_rcv, + GFP_KERNEL); + + /* if server->tsk was NULL then wait for a signal before exiting */ + if (!task_to_wake) { + set_current_state(TASK_INTERRUPTIBLE); + while (!signal_pending(current)) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + } + + module_put_and_exit(0); +} + +/* extract the host portion of the UNC string */ +static char * +extract_hostname(const char *unc) +{ + const char *src; + char *dst, *delim; + unsigned int len; + + /* skip double chars at beginning of string */ + /* BB: check validity of these bytes? */ + src = unc + 2; + + /* delimiter between hostname and sharename is always '\\' now */ + delim = strchr(src, '\\'); + if (!delim) + return ERR_PTR(-EINVAL); + + len = delim - src; + dst = kmalloc((len + 1), GFP_KERNEL); + if (dst == NULL) + return ERR_PTR(-ENOMEM); + + memcpy(dst, src, len); + dst[len] = '\0'; + + return dst; +} + +static int +cifs_parse_mount_options(const char *mountdata, const char *devname, + struct smb_vol *vol) +{ + char *value, *data, *end; + char *mountdata_copy = NULL, *options; + unsigned int temp_len, i, j; + char separator[2]; + short int override_uid = -1; + short int override_gid = -1; + bool uid_specified = false; + bool gid_specified = false; + char *nodename = utsname()->nodename; + + separator[0] = ','; + separator[1] = 0; + + /* + * does not have to be perfect mapping since field is + * informational, only used for servers that do not support + * port 445 and it can be overridden at mount time + */ + memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); + for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++) + vol->source_rfc1001_name[i] = toupper(nodename[i]); + + vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0; + /* null target name indicates to use *SMBSERVR default called name + if we end up sending RFC1001 session initialize */ + vol->target_rfc1001_name[0] = 0; + vol->cred_uid = current_uid(); + vol->linux_uid = current_uid(); + vol->linux_gid = current_gid(); + + /* default to only allowing write access to owner of the mount */ + vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; + + /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ + /* default is always to request posix paths. 
*/ + vol->posix_paths = 1; + /* default to using server inode numbers where available */ + vol->server_ino = 1; + + vol->actimeo = CIFS_DEF_ACTIMEO; + + if (!mountdata) + goto cifs_parse_mount_err; + + mountdata_copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL); + if (!mountdata_copy) + goto cifs_parse_mount_err; + + options = mountdata_copy; + end = options + strlen(options); + if (strncmp(options, "sep=", 4) == 0) { + if (options[4] != 0) { + separator[0] = options[4]; + options += 5; + } else { + cFYI(1, "Null separator not allowed"); + } + } + + while ((data = strsep(&options, separator)) != NULL) { + if (!*data) + continue; + if ((value = strchr(data, '=')) != NULL) + *value++ = '\0'; + + /* Have to parse this before we parse for "user" */ + if (strnicmp(data, "user_xattr", 10) == 0) { + vol->no_xattr = 0; + } else if (strnicmp(data, "nouser_xattr", 12) == 0) { + vol->no_xattr = 1; + } else if (strnicmp(data, "user", 4) == 0) { + if (!value) { + printk(KERN_WARNING + "CIFS: invalid or missing username\n"); + goto cifs_parse_mount_err; + } else if (!*value) { + /* null user, ie anonymous, authentication */ + vol->nullauth = 1; + } + if (strnlen(value, MAX_USERNAME_SIZE) < + MAX_USERNAME_SIZE) { + vol->username = kstrdup(value, GFP_KERNEL); + if (!vol->username) { + printk(KERN_WARNING "CIFS: no memory " + "for username\n"); + goto cifs_parse_mount_err; + } + } else { + printk(KERN_WARNING "CIFS: username too long\n"); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "pass", 4) == 0) { + if (!value) { + vol->password = NULL; + continue; + } else if (value[0] == 0) { + /* check if string begins with double comma + since that would mean the password really + does start with a comma, and would not + indicate an empty string */ + if (value[1] != separator[0]) { + vol->password = NULL; + continue; + } + } + temp_len = strlen(value); + /* removed password length check, NTLM passwords + can be arbitrarily long */ + + /* if comma in password, the string will be + prematurely null terminated. Commas in password are + specified across the cifs mount interface by a double + comma ie ,, and a comma used as in other cases ie ',' + as a parameter delimiter/separator is single and due + to the strsep above is temporarily zeroed. */ + + /* NB: password legally can have multiple commas and + the only illegal character in a password is null */ + + if ((value[temp_len] == 0) && + (value + temp_len < end) && + (value[temp_len+1] == separator[0])) { + /* reinsert comma */ + value[temp_len] = separator[0]; + temp_len += 2; /* move after second comma */ + while (value[temp_len] != 0) { + if (value[temp_len] == separator[0]) { + if (value[temp_len+1] == + separator[0]) { + /* skip second comma */ + temp_len++; + } else { + /* single comma indicating start + of next parm */ + break; + } + } + temp_len++; + } + if (value[temp_len] == 0) { + options = NULL; + } else { + value[temp_len] = 0; + /* point option to start of next parm */ + options = value + temp_len + 1; + } + /* go from value to value + temp_len condensing + double commas to singles. 
Note that this ends up + allocating a few bytes too many, which is ok */ + vol->password = kzalloc(temp_len, GFP_KERNEL); + if (vol->password == NULL) { + printk(KERN_WARNING "CIFS: no memory " + "for password\n"); + goto cifs_parse_mount_err; + } + for (i = 0, j = 0; i < temp_len; i++, j++) { + vol->password[j] = value[i]; + if (value[i] == separator[0] + && value[i+1] == separator[0]) { + /* skip second comma */ + i++; + } + } + vol->password[j] = 0; + } else { + vol->password = kzalloc(temp_len+1, GFP_KERNEL); + if (vol->password == NULL) { + printk(KERN_WARNING "CIFS: no memory " + "for password\n"); + goto cifs_parse_mount_err; + } + strcpy(vol->password, value); + } + } else if (!strnicmp(data, "ip", 2) || + !strnicmp(data, "addr", 4)) { + if (!value || !*value) { + vol->UNCip = NULL; + } else if (strnlen(value, INET6_ADDRSTRLEN) < + INET6_ADDRSTRLEN) { + vol->UNCip = kstrdup(value, GFP_KERNEL); + if (!vol->UNCip) { + printk(KERN_WARNING "CIFS: no memory " + "for UNC IP\n"); + goto cifs_parse_mount_err; + } + } else { + printk(KERN_WARNING "CIFS: ip address " + "too long\n"); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "sec", 3) == 0) { + if (!value || !*value) { + cERROR(1, "no security value specified"); + continue; + } else if (strnicmp(value, "krb5i", 5) == 0) { + vol->secFlg |= CIFSSEC_MAY_KRB5 | + CIFSSEC_MUST_SIGN; + } else if (strnicmp(value, "krb5p", 5) == 0) { + /* vol->secFlg |= CIFSSEC_MUST_SEAL | + CIFSSEC_MAY_KRB5; */ + cERROR(1, "Krb5 cifs privacy not supported"); + goto cifs_parse_mount_err; + } else if (strnicmp(value, "krb5", 4) == 0) { + vol->secFlg |= CIFSSEC_MAY_KRB5; + } else if (strnicmp(value, "ntlmsspi", 8) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMSSP | + CIFSSEC_MUST_SIGN; + } else if (strnicmp(value, "ntlmssp", 7) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMSSP; + } else if (strnicmp(value, "ntlmv2i", 7) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMV2 | + CIFSSEC_MUST_SIGN; + } else if (strnicmp(value, "ntlmv2", 6) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMV2; + } else if (strnicmp(value, "ntlmi", 5) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLM | + CIFSSEC_MUST_SIGN; + } else if (strnicmp(value, "ntlm", 4) == 0) { + /* ntlm is default so can be turned off too */ + vol->secFlg |= CIFSSEC_MAY_NTLM; + } else if (strnicmp(value, "nontlm", 6) == 0) { + /* BB is there a better way to do this? 
*/ + vol->secFlg |= CIFSSEC_MAY_NTLMV2; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + } else if (strnicmp(value, "lanman", 6) == 0) { + vol->secFlg |= CIFSSEC_MAY_LANMAN; +#endif + } else if (strnicmp(value, "none", 4) == 0) { + vol->nullauth = 1; + } else { + cERROR(1, "bad security option: %s", value); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "vers", 3) == 0) { + if (!value || !*value) { + cERROR(1, "no protocol version specified" + " after vers= mount option"); + } else if ((strnicmp(value, "cifs", 4) == 0) || + (strnicmp(value, "1", 1) == 0)) { + /* this is the default */ + continue; + } + } else if ((strnicmp(data, "unc", 3) == 0) + || (strnicmp(data, "target", 6) == 0) + || (strnicmp(data, "path", 4) == 0)) { + if (!value || !*value) { + printk(KERN_WARNING "CIFS: invalid path to " + "network resource\n"); + goto cifs_parse_mount_err; + } + if ((temp_len = strnlen(value, 300)) < 300) { + vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); + if (vol->UNC == NULL) + goto cifs_parse_mount_err; + strcpy(vol->UNC, value); + if (strncmp(vol->UNC, "//", 2) == 0) { + vol->UNC[0] = '\\'; + vol->UNC[1] = '\\'; + } else if (strncmp(vol->UNC, "\\\\", 2) != 0) { + printk(KERN_WARNING + "CIFS: UNC Path does not begin " + "with // or \\\\ \n"); + goto cifs_parse_mount_err; + } + } else { + printk(KERN_WARNING "CIFS: UNC name too long\n"); + goto cifs_parse_mount_err; + } + } else if ((strnicmp(data, "domain", 3) == 0) + || (strnicmp(data, "workgroup", 5) == 0)) { + if (!value || !*value) { + printk(KERN_WARNING "CIFS: invalid domain name\n"); + goto cifs_parse_mount_err; + } + /* BB are there cases in which a comma can be valid in + a domain name and need special handling? */ + if (strnlen(value, 256) < 256) { + vol->domainname = kstrdup(value, GFP_KERNEL); + if (!vol->domainname) { + printk(KERN_WARNING "CIFS: no memory " + "for domainname\n"); + goto cifs_parse_mount_err; + } + cFYI(1, "Domain name set"); + } else { + printk(KERN_WARNING "CIFS: domain name too " + "long\n"); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "srcaddr", 7) == 0) { + vol->srcaddr.ss_family = AF_UNSPEC; + + if (!value || !*value) { + printk(KERN_WARNING "CIFS: srcaddr value" + " not specified.\n"); + goto cifs_parse_mount_err; + } + i = cifs_convert_address((struct sockaddr *)&vol->srcaddr, + value, strlen(value)); + if (i == 0) { + printk(KERN_WARNING "CIFS: Could not parse" + " srcaddr: %s\n", + value); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "prefixpath", 10) == 0) { + if (!value || !*value) { + printk(KERN_WARNING + "CIFS: invalid path prefix\n"); + goto cifs_parse_mount_err; + } + if ((temp_len = strnlen(value, 1024)) < 1024) { + if (value[0] != '/') + temp_len++; /* missing leading slash */ + vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); + if (vol->prepath == NULL) + goto cifs_parse_mount_err; + if (value[0] != '/') { + vol->prepath[0] = '/'; + strcpy(vol->prepath+1, value); + } else + strcpy(vol->prepath, value); + cFYI(1, "prefix path %s", vol->prepath); + } else { + printk(KERN_WARNING "CIFS: prefix too long\n"); + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "iocharset", 9) == 0) { + if (!value || !*value) { + printk(KERN_WARNING "CIFS: invalid iocharset " + "specified\n"); + goto cifs_parse_mount_err; + } + if (strnlen(value, 65) < 65) { + if (strnicmp(value, "default", 7)) { + vol->iocharset = kstrdup(value, + GFP_KERNEL); + + if (!vol->iocharset) { + printk(KERN_WARNING "CIFS: no " + "memory for" + "charset\n"); + goto cifs_parse_mount_err; + } + } + /* if 
iocharset not set then load_nls_default + is used by caller */ + cFYI(1, "iocharset set to %s", value); + } else { + printk(KERN_WARNING "CIFS: iocharset name " + "too long.\n"); + goto cifs_parse_mount_err; + } + } else if (!strnicmp(data, "uid", 3) && value && *value) { + vol->linux_uid = simple_strtoul(value, &value, 0); + uid_specified = true; + } else if (!strnicmp(data, "cruid", 5) && value && *value) { + vol->cred_uid = simple_strtoul(value, &value, 0); + } else if (!strnicmp(data, "forceuid", 8)) { + override_uid = 1; + } else if (!strnicmp(data, "noforceuid", 10)) { + override_uid = 0; + } else if (!strnicmp(data, "gid", 3) && value && *value) { + vol->linux_gid = simple_strtoul(value, &value, 0); + gid_specified = true; + } else if (!strnicmp(data, "forcegid", 8)) { + override_gid = 1; + } else if (!strnicmp(data, "noforcegid", 10)) { + override_gid = 0; + } else if (strnicmp(data, "file_mode", 4) == 0) { + if (value && *value) { + vol->file_mode = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "dir_mode", 4) == 0) { + if (value && *value) { + vol->dir_mode = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "dirmode", 4) == 0) { + if (value && *value) { + vol->dir_mode = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "port", 4) == 0) { + if (value && *value) { + vol->port = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "rsize", 5) == 0) { + if (value && *value) { + vol->rsize = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "wsize", 5) == 0) { + if (value && *value) { + vol->wsize = + simple_strtoul(value, &value, 0); + } + } else if (strnicmp(data, "sockopt", 5) == 0) { + if (!value || !*value) { + cERROR(1, "no socket option specified"); + continue; + } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) { + vol->sockopt_tcp_nodelay = 1; + } + } else if (strnicmp(data, "netbiosname", 4) == 0) { + if (!value || !*value || (*value == ' ')) { + cFYI(1, "invalid (empty) netbiosname"); + } else { + memset(vol->source_rfc1001_name, 0x20, + RFC1001_NAME_LEN); + /* + * FIXME: are there cases in which a comma can + * be valid in workstation netbios name (and + * need special handling)? + */ + for (i = 0; i < RFC1001_NAME_LEN; i++) { + /* don't ucase netbiosname for user */ + if (value[i] == 0) + break; + vol->source_rfc1001_name[i] = value[i]; + } + /* The string has 16th byte zero still from + set at top of the function */ + if (i == RFC1001_NAME_LEN && value[i] != 0) + printk(KERN_WARNING "CIFS: netbiosname" + " longer than 15 truncated.\n"); + } + } else if (strnicmp(data, "servern", 7) == 0) { + /* servernetbiosname specified override *SMBSERVER */ + if (!value || !*value || (*value == ' ')) { + cFYI(1, "empty server netbiosname specified"); + } else { + /* last byte, type, is 0x20 for servr type */ + memset(vol->target_rfc1001_name, 0x20, + RFC1001_NAME_LEN_WITH_NULL); + + for (i = 0; i < 15; i++) { + /* BB are there cases in which a comma can be + valid in this workstation netbios name + (and need special handling)? 
*/ + + /* user or mount helper must uppercase + the netbiosname */ + if (value[i] == 0) + break; + else + vol->target_rfc1001_name[i] = + value[i]; + } + /* The string has 16th byte zero still from + set at top of the function */ + if (i == RFC1001_NAME_LEN && value[i] != 0) + printk(KERN_WARNING "CIFS: server net" + "biosname longer than 15 truncated.\n"); + } + } else if (strnicmp(data, "actimeo", 7) == 0) { + if (value && *value) { + vol->actimeo = HZ * simple_strtoul(value, + &value, 0); + if (vol->actimeo > CIFS_MAX_ACTIMEO) { + cERROR(1, "CIFS: attribute cache" + "timeout too large"); + goto cifs_parse_mount_err; + } + } + } else if (strnicmp(data, "credentials", 4) == 0) { + /* ignore */ + } else if (strnicmp(data, "version", 3) == 0) { + /* ignore */ + } else if (strnicmp(data, "guest", 5) == 0) { + /* ignore */ + } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) { + /* ignore */ + } else if (strnicmp(data, "ro", 2) == 0) { + /* ignore */ + } else if (strnicmp(data, "noblocksend", 11) == 0) { + vol->noblocksnd = 1; + } else if (strnicmp(data, "noautotune", 10) == 0) { + vol->noautotune = 1; + } else if ((strnicmp(data, "suid", 4) == 0) || + (strnicmp(data, "nosuid", 6) == 0) || + (strnicmp(data, "exec", 4) == 0) || + (strnicmp(data, "noexec", 6) == 0) || + (strnicmp(data, "nodev", 5) == 0) || + (strnicmp(data, "noauto", 6) == 0) || + (strnicmp(data, "dev", 3) == 0)) { + /* The mount tool or mount.cifs helper (if present) + uses these opts to set flags, and the flags are read + by the kernel vfs layer before we get here (ie + before read super) so there is no point trying to + parse these options again and set anything and it + is ok to just ignore them */ + continue; + } else if (strnicmp(data, "hard", 4) == 0) { + vol->retry = 1; + } else if (strnicmp(data, "soft", 4) == 0) { + vol->retry = 0; + } else if (strnicmp(data, "perm", 4) == 0) { + vol->noperm = 0; + } else if (strnicmp(data, "noperm", 6) == 0) { + vol->noperm = 1; + } else if (strnicmp(data, "mapchars", 8) == 0) { + vol->remap = 1; + } else if (strnicmp(data, "nomapchars", 10) == 0) { + vol->remap = 0; + } else if (strnicmp(data, "sfu", 3) == 0) { + vol->sfu_emul = 1; + } else if (strnicmp(data, "nosfu", 5) == 0) { + vol->sfu_emul = 0; + } else if (strnicmp(data, "nodfs", 5) == 0) { + vol->nodfs = 1; + } else if (strnicmp(data, "posixpaths", 10) == 0) { + vol->posix_paths = 1; + } else if (strnicmp(data, "noposixpaths", 12) == 0) { + vol->posix_paths = 0; + } else if (strnicmp(data, "nounix", 6) == 0) { + vol->no_linux_ext = 1; + } else if (strnicmp(data, "nolinux", 7) == 0) { + vol->no_linux_ext = 1; + } else if ((strnicmp(data, "nocase", 6) == 0) || + (strnicmp(data, "ignorecase", 10) == 0)) { + vol->nocase = 1; + } else if (strnicmp(data, "mand", 4) == 0) { + /* ignore */ + } else if (strnicmp(data, "nomand", 6) == 0) { + /* ignore */ + } else if (strnicmp(data, "_netdev", 7) == 0) { + /* ignore */ + } else if (strnicmp(data, "brl", 3) == 0) { + vol->nobrl = 0; + } else if ((strnicmp(data, "nobrl", 5) == 0) || + (strnicmp(data, "nolock", 6) == 0)) { + vol->nobrl = 1; + /* turn off mandatory locking in mode + if remote locking is turned off since the + local vfs will do advisory */ + if (vol->file_mode == + (S_IALLUGO & ~(S_ISUID | S_IXGRP))) + vol->file_mode = S_IALLUGO; + } else if (strnicmp(data, "forcemandatorylock", 9) == 0) { + /* will take the shorter form "forcemand" as well */ + /* This mount option will force use of mandatory + (DOS/Windows style) byte range locks, instead of + using posix 
advisory byte range locks, even if the + Unix extensions are available and posix locks would + be supported otherwise. If Unix extensions are not + negotiated this has no effect since mandatory locks + would be used (mandatory locks is all that those + those servers support) */ + vol->mand_lock = 1; + } else if (strnicmp(data, "setuids", 7) == 0) { + vol->setuids = 1; + } else if (strnicmp(data, "nosetuids", 9) == 0) { + vol->setuids = 0; + } else if (strnicmp(data, "dynperm", 7) == 0) { + vol->dynperm = true; + } else if (strnicmp(data, "nodynperm", 9) == 0) { + vol->dynperm = false; + } else if (strnicmp(data, "nohard", 6) == 0) { + vol->retry = 0; + } else if (strnicmp(data, "nosoft", 6) == 0) { + vol->retry = 1; + } else if (strnicmp(data, "nointr", 6) == 0) { + vol->intr = 0; + } else if (strnicmp(data, "intr", 4) == 0) { + vol->intr = 1; + } else if (strnicmp(data, "nostrictsync", 12) == 0) { + vol->nostrictsync = 1; + } else if (strnicmp(data, "strictsync", 10) == 0) { + vol->nostrictsync = 0; + } else if (strnicmp(data, "serverino", 7) == 0) { + vol->server_ino = 1; + } else if (strnicmp(data, "noserverino", 9) == 0) { + vol->server_ino = 0; + } else if (strnicmp(data, "rwpidforward", 12) == 0) { + vol->rwpidforward = 1; + } else if (strnicmp(data, "cifsacl", 7) == 0) { + vol->cifs_acl = 1; + } else if (strnicmp(data, "nocifsacl", 9) == 0) { + vol->cifs_acl = 0; + } else if (strnicmp(data, "acl", 3) == 0) { + vol->no_psx_acl = 0; + } else if (strnicmp(data, "noacl", 5) == 0) { + vol->no_psx_acl = 1; + } else if (strnicmp(data, "locallease", 6) == 0) { + vol->local_lease = 1; + } else if (strnicmp(data, "sign", 4) == 0) { + vol->secFlg |= CIFSSEC_MUST_SIGN; + } else if (strnicmp(data, "seal", 4) == 0) { + /* we do not do the following in secFlags because seal + is a per tree connection (mount) not a per socket + or per-smb connection option in the protocol */ + /* vol->secFlg |= CIFSSEC_MUST_SEAL; */ + vol->seal = 1; + } else if (strnicmp(data, "direct", 6) == 0) { + vol->direct_io = 1; + } else if (strnicmp(data, "forcedirectio", 13) == 0) { + vol->direct_io = 1; + } else if (strnicmp(data, "strictcache", 11) == 0) { + vol->strict_io = 1; + } else if (strnicmp(data, "noac", 4) == 0) { + printk(KERN_WARNING "CIFS: Mount option noac not " + "supported. 
Instead set " + "/proc/fs/cifs/LookupCacheEnabled to 0\n"); + } else if (strnicmp(data, "fsc", 3) == 0) { +#ifndef CONFIG_CIFS_FSCACHE + cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE " + "kernel config option set"); + goto cifs_parse_mount_err; +#endif + vol->fsc = true; + } else if (strnicmp(data, "mfsymlinks", 10) == 0) { + vol->mfsymlinks = true; + } else if (strnicmp(data, "multiuser", 8) == 0) { + vol->multiuser = true; + } else + printk(KERN_WARNING "CIFS: Unknown mount option %s\n", + data); + } + if (vol->UNC == NULL) { + if (devname == NULL) { + printk(KERN_WARNING "CIFS: Missing UNC name for mount " + "target\n"); + goto cifs_parse_mount_err; + } + if ((temp_len = strnlen(devname, 300)) < 300) { + vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); + if (vol->UNC == NULL) + goto cifs_parse_mount_err; + strcpy(vol->UNC, devname); + if (strncmp(vol->UNC, "//", 2) == 0) { + vol->UNC[0] = '\\'; + vol->UNC[1] = '\\'; + } else if (strncmp(vol->UNC, "\\\\", 2) != 0) { + printk(KERN_WARNING "CIFS: UNC Path does not " + "begin with // or \\\\ \n"); + goto cifs_parse_mount_err; + } + value = strpbrk(vol->UNC+2, "/\\"); + if (value) + *value = '\\'; + } else { + printk(KERN_WARNING "CIFS: UNC name too long\n"); + goto cifs_parse_mount_err; + } + } + + if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) { + cERROR(1, "Multiuser mounts currently require krb5 " + "authentication!"); + goto cifs_parse_mount_err; + } + + if (vol->UNCip == NULL) + vol->UNCip = &vol->UNC[2]; + + if (uid_specified) + vol->override_uid = override_uid; + else if (override_uid == 1) + printk(KERN_NOTICE "CIFS: ignoring forceuid mount option " + "specified with no uid= option.\n"); + + if (gid_specified) + vol->override_gid = override_gid; + else if (override_gid == 1) + printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " + "specified with no gid= option.\n"); + + kfree(mountdata_copy); + return 0; + +cifs_parse_mount_err: + kfree(mountdata_copy); + return 1; +} + +/** Returns true if srcaddr isn't specified and rhs isn't + * specified, or if srcaddr is specified and + * matches the IP address of the rhs argument. + */ +static bool +srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) +{ + switch (srcaddr->sa_family) { + case AF_UNSPEC: + return (rhs->sa_family == AF_UNSPEC); + case AF_INET: { + struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr; + struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs; + return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr); + } + case AF_INET6: { + struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; + return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); + } + default: + WARN_ON(1); + return false; /* don't expect to be here */ + } +} + +/* + * If no port is specified in addr structure, we try to match with 445 port + * and if it fails - with 139 ports. It should be called only if address + * families of server and addr are equal. 
+ */ +static bool +match_port(struct TCP_Server_Info *server, struct sockaddr *addr) +{ + __be16 port, *sport; + + switch (addr->sa_family) { + case AF_INET: + sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port; + port = ((struct sockaddr_in *) addr)->sin_port; + break; + case AF_INET6: + sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port; + port = ((struct sockaddr_in6 *) addr)->sin6_port; + break; + default: + WARN_ON(1); + return false; + } + + if (!port) { + port = htons(CIFS_PORT); + if (port == *sport) + return true; + + port = htons(RFC1001_PORT); + } + + return port == *sport; +} + +static bool +match_address(struct TCP_Server_Info *server, struct sockaddr *addr, + struct sockaddr *srcaddr) +{ + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + struct sockaddr_in *srv_addr4 = + (struct sockaddr_in *)&server->dstaddr; + + if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr) + return false; + break; + } + case AF_INET6: { + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + struct sockaddr_in6 *srv_addr6 = + (struct sockaddr_in6 *)&server->dstaddr; + + if (!ipv6_addr_equal(&addr6->sin6_addr, + &srv_addr6->sin6_addr)) + return false; + if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id) + return false; + break; + } + default: + WARN_ON(1); + return false; /* don't expect to be here */ + } + + if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr)) + return false; + + return true; +} + +static bool +match_security(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + unsigned int secFlags; + + if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) + secFlags = vol->secFlg; + else + secFlags = global_secflags | vol->secFlg; + + switch (server->secType) { + case LANMAN: + if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT))) + return false; + break; + case NTLMv2: + if (!(secFlags & CIFSSEC_MAY_NTLMV2)) + return false; + break; + case NTLM: + if (!(secFlags & CIFSSEC_MAY_NTLM)) + return false; + break; + case Kerberos: + if (!(secFlags & CIFSSEC_MAY_KRB5)) + return false; + break; + case RawNTLMSSP: + if (!(secFlags & CIFSSEC_MAY_NTLMSSP)) + return false; + break; + default: + /* shouldn't happen */ + return false; + } + + /* now check if signing mode is acceptable */ + if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && + (server->sec_mode & SECMODE_SIGN_REQUIRED)) + return false; + else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) && + (server->sec_mode & + (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0) + return false; + + return true; +} + +static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, + struct smb_vol *vol) +{ + if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns)) + return 0; + + if (!match_address(server, addr, + (struct sockaddr *)&vol->srcaddr)) + return 0; + + if (!match_port(server, addr)) + return 0; + + if (!match_security(server, vol)) + return 0; + + return 1; +} + +static struct TCP_Server_Info * +cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) +{ + struct TCP_Server_Info *server; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + if (!match_server(server, addr, vol)) + continue; + + ++server->srv_count; + spin_unlock(&cifs_tcp_ses_lock); + cFYI(1, "Existing tcp session with server found"); + return server; + } + spin_unlock(&cifs_tcp_ses_lock); + return NULL; +} + +static void +cifs_put_tcp_session(struct TCP_Server_Info *server) +{ + 
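+	/* drop one reference to the TCP server; the last put removes it from cifs_tcp_ses_list, cancels the echo work and signals the demultiplex thread to exit */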
struct task_struct *task; + + spin_lock(&cifs_tcp_ses_lock); + if (--server->srv_count > 0) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + + put_net(cifs_net_ns(server)); + + list_del_init(&server->tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + cancel_delayed_work_sync(&server->echo); + + spin_lock(&GlobalMid_Lock); + server->tcpStatus = CifsExiting; + spin_unlock(&GlobalMid_Lock); + + cifs_crypto_shash_release(server); + cifs_fscache_release_client_cookie(server); + + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; + + task = xchg(&server->tsk, NULL); + if (task) + force_sig(SIGKILL, task); +} + +static struct TCP_Server_Info * +cifs_get_tcp_session(struct smb_vol *volume_info) +{ + struct TCP_Server_Info *tcp_ses = NULL; + struct sockaddr_storage addr; + struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr; + struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr; + int rc; + + memset(&addr, 0, sizeof(struct sockaddr_storage)); + + cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip); + + if (volume_info->UNCip && volume_info->UNC) { + rc = cifs_fill_sockaddr((struct sockaddr *)&addr, + volume_info->UNCip, + strlen(volume_info->UNCip), + volume_info->port); + if (!rc) { + /* we failed translating address */ + rc = -EINVAL; + goto out_err; + } + } else if (volume_info->UNCip) { + /* BB using ip addr as tcp_ses name to connect to the + DFS root below */ + cERROR(1, "Connecting to DFS root not implemented yet"); + rc = -EINVAL; + goto out_err; + } else /* which tcp_sess DFS root would we conect to */ { + cERROR(1, "CIFS mount error: No UNC path (e.g. -o " + "unc=//192.168.1.100/public) specified"); + rc = -EINVAL; + goto out_err; + } + + /* see if we already have a matching tcp_ses */ + tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); + if (tcp_ses) + return tcp_ses; + + tcp_ses = kzalloc(sizeof(struct TCP_Server_Info), GFP_KERNEL); + if (!tcp_ses) { + rc = -ENOMEM; + goto out_err; + } + + rc = cifs_crypto_shash_allocate(tcp_ses); + if (rc) { + cERROR(1, "could not setup hash structures rc %d", rc); + goto out_err; + } + + cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); + tcp_ses->hostname = extract_hostname(volume_info->UNC); + if (IS_ERR(tcp_ses->hostname)) { + rc = PTR_ERR(tcp_ses->hostname); + goto out_err_crypto_release; + } + + tcp_ses->noblocksnd = volume_info->noblocksnd; + tcp_ses->noautotune = volume_info->noautotune; + tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; + atomic_set(&tcp_ses->inFlight, 0); + init_waitqueue_head(&tcp_ses->response_q); + init_waitqueue_head(&tcp_ses->request_q); + INIT_LIST_HEAD(&tcp_ses->pending_mid_q); + mutex_init(&tcp_ses->srv_mutex); + memcpy(tcp_ses->workstation_RFC1001_name, + volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); + memcpy(tcp_ses->server_RFC1001_name, + volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); + tcp_ses->session_estab = false; + tcp_ses->sequence_number = 0; + tcp_ses->lstrp = jiffies; + INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); + INIT_LIST_HEAD(&tcp_ses->smb_ses_list); + INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); + + /* + * at this point we are the only ones with the pointer + * to the struct since the kernel thread not created yet + * no need to spinlock this init of tcpStatus or srv_count + */ + tcp_ses->tcpStatus = CifsNew; + memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, + sizeof(tcp_ses->srcaddr)); + ++tcp_ses->srv_count; + + if 
(addr.ss_family == AF_INET6) { + cFYI(1, "attempting ipv6 connect"); + /* BB should we allow ipv6 on port 139? */ + /* other OS never observed in Wild doing 139 with v6 */ + memcpy(&tcp_ses->dstaddr, sin_server6, + sizeof(struct sockaddr_in6)); + } else + memcpy(&tcp_ses->dstaddr, sin_server, + sizeof(struct sockaddr_in)); + + rc = ip_connect(tcp_ses); + if (rc < 0) { + cERROR(1, "Error connecting to socket. Aborting operation"); + goto out_err_crypto_release; + } + + /* + * since we're in a cifs function already, we know that + * this will succeed. No need for try_module_get(). + */ + __module_get(THIS_MODULE); + tcp_ses->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, + tcp_ses, "cifsd"); + if (IS_ERR(tcp_ses->tsk)) { + rc = PTR_ERR(tcp_ses->tsk); + cERROR(1, "error %d create cifsd thread", rc); + module_put(THIS_MODULE); + goto out_err_crypto_release; + } + tcp_ses->tcpStatus = CifsNeedNegotiate; + + /* thread spawned, put it on the list */ + spin_lock(&cifs_tcp_ses_lock); + list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + cifs_fscache_get_client_cookie(tcp_ses); + + /* queue echo request delayed work */ + queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + + return tcp_ses; + +out_err_crypto_release: + cifs_crypto_shash_release(tcp_ses); + + put_net(cifs_net_ns(tcp_ses)); + +out_err: + if (tcp_ses) { + if (!IS_ERR(tcp_ses->hostname)) + kfree(tcp_ses->hostname); + if (tcp_ses->ssocket) + sock_release(tcp_ses->ssocket); + kfree(tcp_ses); + } + return ERR_PTR(rc); +} + +static int match_session(struct cifs_ses *ses, struct smb_vol *vol) +{ + switch (ses->server->secType) { + case Kerberos: + if (vol->cred_uid != ses->cred_uid) + return 0; + break; + default: + /* anything else takes username/password */ + if (ses->user_name == NULL) + return 0; + if (strncmp(ses->user_name, vol->username, + MAX_USERNAME_SIZE)) + return 0; + if (strlen(vol->username) != 0 && + ses->password != NULL && + strncmp(ses->password, + vol->password ? 
vol->password : "", + MAX_PASSWORD_SIZE)) + return 0; + } + return 1; +} + +static struct cifs_ses * +cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + struct cifs_ses *ses; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (!match_session(ses, vol)) + continue; + ++ses->ses_count; + spin_unlock(&cifs_tcp_ses_lock); + return ses; + } + spin_unlock(&cifs_tcp_ses_lock); + return NULL; +} + +static void +cifs_put_smb_ses(struct cifs_ses *ses) +{ + int xid; + struct TCP_Server_Info *server = ses->server; + + cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count); + spin_lock(&cifs_tcp_ses_lock); + if (--ses->ses_count > 0) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + + list_del_init(&ses->smb_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + if (ses->status == CifsGood) { + xid = GetXid(); + CIFSSMBLogoff(xid, ses); + _FreeXid(xid); + } + sesInfoFree(ses); + cifs_put_tcp_session(server); +} + +static bool warned_on_ntlm; /* globals init to false automatically */ + +static struct cifs_ses * +cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) +{ + int rc = -ENOMEM, xid; + struct cifs_ses *ses; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + + xid = GetXid(); + + ses = cifs_find_smb_ses(server, volume_info); + if (ses) { + cFYI(1, "Existing smb sess found (status=%d)", ses->status); + + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses); + if (rc) { + mutex_unlock(&ses->session_mutex); + /* problem -- put our ses reference */ + cifs_put_smb_ses(ses); + FreeXid(xid); + return ERR_PTR(rc); + } + if (ses->need_reconnect) { + cFYI(1, "Session needs reconnect"); + rc = cifs_setup_session(xid, ses, + volume_info->local_nls); + if (rc) { + mutex_unlock(&ses->session_mutex); + /* problem -- put our reference */ + cifs_put_smb_ses(ses); + FreeXid(xid); + return ERR_PTR(rc); + } + } + mutex_unlock(&ses->session_mutex); + + /* existing SMB ses has a server reference already */ + cifs_put_tcp_session(server); + FreeXid(xid); + return ses; + } + + cFYI(1, "Existing smb sess not found"); + ses = sesInfoAlloc(); + if (ses == NULL) + goto get_ses_fail; + + /* new SMB session uses our server ref */ + ses->server = server; + if (server->dstaddr.ss_family == AF_INET6) + sprintf(ses->serverName, "%pI6", &addr6->sin6_addr); + else + sprintf(ses->serverName, "%pI4", &addr->sin_addr); + + if (volume_info->username) { + ses->user_name = kstrdup(volume_info->username, GFP_KERNEL); + if (!ses->user_name) + goto get_ses_fail; + } + + /* volume_info->password freed at unmount */ + if (volume_info->password) { + ses->password = kstrdup(volume_info->password, GFP_KERNEL); + if (!ses->password) + goto get_ses_fail; + } + if (volume_info->domainname) { + ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL); + if (!ses->domainName) + goto get_ses_fail; + } + ses->cred_uid = volume_info->cred_uid; + ses->linux_uid = volume_info->linux_uid; + + /* ntlmv2 is much stronger than ntlm security, and has been broadly + supported for many years, time to update default security mechanism */ + if ((volume_info->secFlg == 0) && warned_on_ntlm == false) { + warned_on_ntlm = true; + cERROR(1, "default security mechanism requested. 
The default " + "security mechanism will be upgraded from ntlm to " + "ntlmv2 in kernel release 3.1"); + } + ses->overrideSecFlg = volume_info->secFlg; + + mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(xid, ses); + if (!rc) + rc = cifs_setup_session(xid, ses, volume_info->local_nls); + mutex_unlock(&ses->session_mutex); + if (rc) + goto get_ses_fail; + + /* success, put it on the list */ + spin_lock(&cifs_tcp_ses_lock); + list_add(&ses->smb_ses_list, &server->smb_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + FreeXid(xid); + return ses; + +get_ses_fail: + sesInfoFree(ses); + FreeXid(xid); + return ERR_PTR(rc); +} + +static int match_tcon(struct cifs_tcon *tcon, const char *unc) +{ + if (tcon->tidStatus == CifsExiting) + return 0; + if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE)) + return 0; + return 1; +} + +static struct cifs_tcon * +cifs_find_tcon(struct cifs_ses *ses, const char *unc) +{ + struct list_head *tmp; + struct cifs_tcon *tcon; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &ses->tcon_list) { + tcon = list_entry(tmp, struct cifs_tcon, tcon_list); + if (!match_tcon(tcon, unc)) + continue; + ++tcon->tc_count; + spin_unlock(&cifs_tcp_ses_lock); + return tcon; + } + spin_unlock(&cifs_tcp_ses_lock); + return NULL; +} + +static void +cifs_put_tcon(struct cifs_tcon *tcon) +{ + int xid; + struct cifs_ses *ses = tcon->ses; + + cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count); + spin_lock(&cifs_tcp_ses_lock); + if (--tcon->tc_count > 0) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + + list_del_init(&tcon->tcon_list); + spin_unlock(&cifs_tcp_ses_lock); + + xid = GetXid(); + CIFSSMBTDis(xid, tcon); + _FreeXid(xid); + + cifs_fscache_release_super_cookie(tcon); + tconInfoFree(tcon); + cifs_put_smb_ses(ses); +} + +static struct cifs_tcon * +cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) +{ + int rc, xid; + struct cifs_tcon *tcon; + + tcon = cifs_find_tcon(ses, volume_info->UNC); + if (tcon) { + cFYI(1, "Found match on UNC path"); + /* existing tcon already has a reference */ + cifs_put_smb_ses(ses); + if (tcon->seal != volume_info->seal) + cERROR(1, "transport encryption setting " + "conflicts with existing tid"); + return tcon; + } + + tcon = tconInfoAlloc(); + if (tcon == NULL) { + rc = -ENOMEM; + goto out_fail; + } + + tcon->ses = ses; + if (volume_info->password) { + tcon->password = kstrdup(volume_info->password, GFP_KERNEL); + if (!tcon->password) { + rc = -ENOMEM; + goto out_fail; + } + } + + if (strchr(volume_info->UNC + 3, '\\') == NULL + && strchr(volume_info->UNC + 3, '/') == NULL) { + cERROR(1, "Missing share name"); + rc = -ENODEV; + goto out_fail; + } + + /* BB Do we need to wrap session_mutex around + * this TCon call and Unix SetFS as + * we do on SessSetup and reconnect? 
*/ + xid = GetXid(); + rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls); + FreeXid(xid); + cFYI(1, "CIFS Tcon rc = %d", rc); + if (rc) + goto out_fail; + + if (volume_info->nodfs) { + tcon->Flags &= ~SMB_SHARE_IS_IN_DFS; + cFYI(1, "DFS disabled (%d)", tcon->Flags); + } + tcon->seal = volume_info->seal; + /* we can have only one retry value for a connection + to a share so for resources mounted more than once + to the same server share the last value passed in + for the retry flag is used */ + tcon->retry = volume_info->retry; + tcon->nocase = volume_info->nocase; + tcon->local_lease = volume_info->local_lease; + + spin_lock(&cifs_tcp_ses_lock); + list_add(&tcon->tcon_list, &ses->tcon_list); + spin_unlock(&cifs_tcp_ses_lock); + + cifs_fscache_get_super_cookie(tcon); + + return tcon; + +out_fail: + tconInfoFree(tcon); + return ERR_PTR(rc); +} + +void +cifs_put_tlink(struct tcon_link *tlink) +{ + if (!tlink || IS_ERR(tlink)) + return; + + if (!atomic_dec_and_test(&tlink->tl_count) || + test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) { + tlink->tl_time = jiffies; + return; + } + + if (!IS_ERR(tlink_tcon(tlink))) + cifs_put_tcon(tlink_tcon(tlink)); + kfree(tlink); + return; +} + +static inline struct tcon_link * +cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) +{ + return cifs_sb->master_tlink; +} + +static int +compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) +{ + struct cifs_sb_info *old = CIFS_SB(sb); + struct cifs_sb_info *new = mnt_data->cifs_sb; + + if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK)) + return 0; + + if ((old->mnt_cifs_flags & CIFS_MOUNT_MASK) != + (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) + return 0; + + if (old->rsize != new->rsize) + return 0; + + /* + * We want to share sb only if we don't specify wsize or specified wsize + * is greater or equal than existing one. 
+ */ + if (new->wsize && new->wsize < old->wsize) + return 0; + + if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) + return 0; + + if (old->mnt_file_mode != new->mnt_file_mode || + old->mnt_dir_mode != new->mnt_dir_mode) + return 0; + + if (strcmp(old->local_nls->charset, new->local_nls->charset)) + return 0; + + if (old->actimeo != new->actimeo) + return 0; + + return 1; +} + +int +cifs_match_super(struct super_block *sb, void *data) +{ + struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data; + struct smb_vol *volume_info; + struct cifs_sb_info *cifs_sb; + struct TCP_Server_Info *tcp_srv; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct tcon_link *tlink; + struct sockaddr_storage addr; + int rc = 0; + + memset(&addr, 0, sizeof(struct sockaddr_storage)); + + spin_lock(&cifs_tcp_ses_lock); + cifs_sb = CIFS_SB(sb); + tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); + if (IS_ERR(tlink)) { + spin_unlock(&cifs_tcp_ses_lock); + return rc; + } + tcon = tlink_tcon(tlink); + ses = tcon->ses; + tcp_srv = ses->server; + + volume_info = mnt_data->vol; + + if (!volume_info->UNCip || !volume_info->UNC) + goto out; + + rc = cifs_fill_sockaddr((struct sockaddr *)&addr, + volume_info->UNCip, + strlen(volume_info->UNCip), + volume_info->port); + if (!rc) + goto out; + + if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) || + !match_session(ses, volume_info) || + !match_tcon(tcon, volume_info->UNC)) { + rc = 0; + goto out; + } + + rc = compare_mount_options(sb, mnt_data); +out: + spin_unlock(&cifs_tcp_ses_lock); + cifs_put_tlink(tlink); + return rc; +} + +int +get_dfs_path(int xid, struct cifs_ses *pSesInfo, const char *old_path, + const struct nls_table *nls_codepage, unsigned int *pnum_referrals, + struct dfs_info3_param **preferrals, int remap) +{ + char *temp_unc; + int rc = 0; + + *pnum_referrals = 0; + *preferrals = NULL; + + if (pSesInfo->ipc_tid == 0) { + temp_unc = kmalloc(2 /* for slashes */ + + strnlen(pSesInfo->serverName, + SERVER_NAME_LEN_WITH_NULL * 2) + + 1 + 4 /* slash IPC$ */ + 2, + GFP_KERNEL); + if (temp_unc == NULL) + return -ENOMEM; + temp_unc[0] = '\\'; + temp_unc[1] = '\\'; + strcpy(temp_unc + 2, pSesInfo->serverName); + strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$"); + rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage); + cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid); + kfree(temp_unc); + } + if (rc == 0) + rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, preferrals, + pnum_referrals, nls_codepage, remap); + /* BB map targetUNCs to dfs_info3 structures, here or + in CIFSGetDFSRefer BB */ + + return rc; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key cifs_key[2]; +static struct lock_class_key cifs_slock_key[2]; + +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS", + &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]); +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS", + &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]); +} +#else +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ +} +#endif + +/* See RFC1001 section 14 on representation of Netbios names */ +static void rfc1002mangle(char 
*target, char *source, unsigned int length) +{ + unsigned int i, j; + + for (i = 0, j = 0; i < (length); i++) { + /* mask a nibble at a time and encode */ + target[j] = 'A' + (0x0F & (source[i] >> 4)); + target[j+1] = 'A' + (0x0F & source[i]); + j += 2; + } + +} + +static int +bind_socket(struct TCP_Server_Info *server) +{ + int rc = 0; + if (server->srcaddr.ss_family != AF_UNSPEC) { + /* Bind to the specified local IP address */ + struct socket *socket = server->ssocket; + rc = socket->ops->bind(socket, + (struct sockaddr *) &server->srcaddr, + sizeof(server->srcaddr)); + if (rc < 0) { + struct sockaddr_in *saddr4; + struct sockaddr_in6 *saddr6; + saddr4 = (struct sockaddr_in *)&server->srcaddr; + saddr6 = (struct sockaddr_in6 *)&server->srcaddr; + if (saddr6->sin6_family == AF_INET6) + cERROR(1, "cifs: " + "Failed to bind to: %pI6c, error: %d\n", + &saddr6->sin6_addr, rc); + else + cERROR(1, "cifs: " + "Failed to bind to: %pI4, error: %d\n", + &saddr4->sin_addr.s_addr, rc); + } + } + return rc; +} + +static int +ip_rfc1001_connect(struct TCP_Server_Info *server) +{ + int rc = 0; + /* + * some servers require RFC1001 sessinit before sending + * negprot - BB check reconnection in case where second + * sessinit is sent but no second negprot + */ + struct rfc1002_session_packet *ses_init_buf; + struct smb_hdr *smb_buf; + ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), + GFP_KERNEL); + if (ses_init_buf) { + ses_init_buf->trailer.session_req.called_len = 32; + + if (server->server_RFC1001_name && + server->server_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + server->server_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + DEFAULT_CIFS_CALLED_NAME, + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.calling_len = 32; + + /* + * calling name ends in null (byte 16) from old smb + * convention. + */ + if (server->workstation_RFC1001_name && + server->workstation_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.calling_name, + server->workstation_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. + session_req.calling_name, + "LINUX_CIFS_CLNT", + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.scope1 = 0; + ses_init_buf->trailer.session_req.scope2 = 0; + smb_buf = (struct smb_hdr *)ses_init_buf; + + /* sizeof RFC1002_SESSION_REQUEST with no scope */ + smb_buf->smb_buf_length = cpu_to_be32(0x81000044); + rc = smb_send(server, smb_buf, 0x44); + kfree(ses_init_buf); + /* + * RFC1001 layer in at least one server + * requires very short break before negprot + * presumably because not expecting negprot + * to follow so fast. 
This is a simple + * solution that works without + * complicating the code and causes no + * significant slowing down on mount + * for everyone else + */ + usleep_range(1000, 2000); + } + /* + * else the negprot may still work without this + * even though malloc failed + */ + + return rc; +} + +static int +generic_ip_connect(struct TCP_Server_Info *server) +{ + int rc = 0; + __be16 sport; + int slen, sfamily; + struct socket *socket = server->ssocket; + struct sockaddr *saddr; + + saddr = (struct sockaddr *) &server->dstaddr; + + if (server->dstaddr.ss_family == AF_INET6) { + sport = ((struct sockaddr_in6 *) saddr)->sin6_port; + slen = sizeof(struct sockaddr_in6); + sfamily = AF_INET6; + } else { + sport = ((struct sockaddr_in *) saddr)->sin_port; + slen = sizeof(struct sockaddr_in); + sfamily = AF_INET; + } + + if (socket == NULL) { + rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, + IPPROTO_TCP, &socket, 1); + if (rc < 0) { + cERROR(1, "Error %d creating socket", rc); + server->ssocket = NULL; + return rc; + } + + /* BB other socket options to set KEEPALIVE, NODELAY? */ + cFYI(1, "Socket created"); + server->ssocket = socket; + socket->sk->sk_allocation = GFP_NOFS; + if (sfamily == AF_INET6) + cifs_reclassify_socket6(socket); + else + cifs_reclassify_socket4(socket); + } + + rc = bind_socket(server); + if (rc < 0) + return rc; + + /* + * Eventually check for other socket options to change from + * the default. sock_setsockopt not used because it expects + * user space buffer + */ + socket->sk->sk_rcvtimeo = 7 * HZ; + socket->sk->sk_sndtimeo = 5 * HZ; + + /* make the bufsizes depend on wsize/rsize and max requests */ + if (server->noautotune) { + if (socket->sk->sk_sndbuf < (200 * 1024)) + socket->sk->sk_sndbuf = 200 * 1024; + if (socket->sk->sk_rcvbuf < (140 * 1024)) + socket->sk->sk_rcvbuf = 140 * 1024; + } + + if (server->tcp_nodelay) { + int val = 1; + rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + if (rc) + cFYI(1, "set TCP_NODELAY socket option error %d", rc); + } + + cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx", + socket->sk->sk_sndbuf, + socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); + + rc = socket->ops->connect(socket, saddr, slen, 0); + if (rc < 0) { + cFYI(1, "Error %d connecting to server", rc); + sock_release(socket); + server->ssocket = NULL; + return rc; + } + + if (sport == htons(RFC1001_PORT)) + rc = ip_rfc1001_connect(server); + + return rc; +} + +static int +ip_connect(struct TCP_Server_Info *server) +{ + __be16 *sport; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + + if (server->dstaddr.ss_family == AF_INET6) + sport = &addr6->sin6_port; + else + sport = &addr->sin_port; + + if (*sport == 0) { + int rc; + + /* try with 445 port at first */ + *sport = htons(CIFS_PORT); + + rc = generic_ip_connect(server); + if (rc >= 0) + return rc; + + /* if it failed, try with 139 port */ + *sport = htons(RFC1001_PORT); + } + + return generic_ip_connect(server); +} + +void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct smb_vol *vol_info) +{ + /* if we are reconnecting then should we check to see if + * any requested capabilities changed locally e.g. 
via + * remount but we can not do much about it here + * if they have (even if we could detect it by the following) + * Perhaps we could add a backpointer to array of sb from tcon + * or if we change to make all sb to same share the same + * sb as NFS - then we only have one backpointer to sb. + * What if we wanted to mount the server share twice once with + * and once without posixacls or posix paths? */ + __u64 saved_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + + if (vol_info && vol_info->no_linux_ext) { + tcon->fsUnixInfo.Capability = 0; + tcon->unix_ext = 0; /* Unix Extensions disabled */ + cFYI(1, "Linux protocol extensions disabled"); + return; + } else if (vol_info) + tcon->unix_ext = 1; /* Unix Extensions supported */ + + if (tcon->unix_ext == 0) { + cFYI(1, "Unix extensions disabled so not set on reconnect"); + return; + } + + if (!CIFSSMBQFSUnixInfo(xid, tcon)) { + __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + cFYI(1, "unix caps which server supports %lld", cap); + /* check for reconnect case in which we do not + want to change the mount behavior if we can avoid it */ + if (vol_info == NULL) { + /* turn off POSIX ACL and PATHNAMES if not set + originally at mount time */ + if ((saved_cap & CIFS_UNIX_POSIX_ACL_CAP) == 0) + cap &= ~CIFS_UNIX_POSIX_ACL_CAP; + if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { + if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) + cERROR(1, "POSIXPATH support change"); + cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; + } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { + cERROR(1, "possible reconnect error"); + cERROR(1, "server disabled POSIX path support"); + } + } + + if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) + cERROR(1, "per-share encryption not supported yet"); + + cap &= CIFS_UNIX_CAP_MASK; + if (vol_info && vol_info->no_psx_acl) + cap &= ~CIFS_UNIX_POSIX_ACL_CAP; + else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { + cFYI(1, "negotiated posix acl support"); + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIXACL; + } + + if (vol_info && vol_info->posix_paths == 0) + cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; + else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { + cFYI(1, "negotiate posix pathnames"); + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIX_PATHS; + } + + if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { + if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { + cifs_sb->rsize = 127 * 1024; + cFYI(DBG2, "larger reads not supported by srv"); + } + } + + + cFYI(1, "Negotiate caps 0x%x", (int)cap); +#ifdef CONFIG_CIFS_DEBUG2 + if (cap & CIFS_UNIX_FCNTL_CAP) + cFYI(1, "FCNTL cap"); + if (cap & CIFS_UNIX_EXTATTR_CAP) + cFYI(1, "EXTATTR cap"); + if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) + cFYI(1, "POSIX path cap"); + if (cap & CIFS_UNIX_XATTR_CAP) + cFYI(1, "XATTR cap"); + if (cap & CIFS_UNIX_POSIX_ACL_CAP) + cFYI(1, "POSIX ACL cap"); + if (cap & CIFS_UNIX_LARGE_READ_CAP) + cFYI(1, "very large read cap"); + if (cap & CIFS_UNIX_LARGE_WRITE_CAP) + cFYI(1, "very large write cap"); + if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP) + cFYI(1, "transport encryption cap"); + if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) + cFYI(1, "mandatory transport encryption cap"); +#endif /* CIFS_DEBUG2 */ + if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { + if (vol_info == NULL) { + cFYI(1, "resetting capabilities failed"); + } else + cERROR(1, "Negotiating Unix capabilities " + "with the server failed. 
Consider " + "mounting with the Unix Extensions\n" + "disabled, if problems are found, " + "by specifying the nounix mount " + "option."); + + } + } +} + +void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, + struct cifs_sb_info *cifs_sb) +{ + INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); + + spin_lock_init(&cifs_sb->tlink_tree_lock); + cifs_sb->tlink_tree = RB_ROOT; + + if (pvolume_info->rsize > CIFSMaxBufSize) { + cERROR(1, "rsize %d too large, using MaxBufSize", + pvolume_info->rsize); + cifs_sb->rsize = CIFSMaxBufSize; + } else if ((pvolume_info->rsize) && + (pvolume_info->rsize <= CIFSMaxBufSize)) + cifs_sb->rsize = pvolume_info->rsize; + else /* default */ + cifs_sb->rsize = CIFSMaxBufSize; + + if (cifs_sb->rsize < 2048) { + cifs_sb->rsize = 2048; + /* Windows ME may prefer this */ + cFYI(1, "readsize set to minimum: 2048"); + } + + /* + * Temporarily set wsize for matching superblock. If we end up using + * new sb then cifs_negotiate_wsize will later negotiate it downward + * if needed. + */ + cifs_sb->wsize = pvolume_info->wsize; + + cifs_sb->mnt_uid = pvolume_info->linux_uid; + cifs_sb->mnt_gid = pvolume_info->linux_gid; + cifs_sb->mnt_file_mode = pvolume_info->file_mode; + cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; + cFYI(1, "file mode: 0x%x dir mode: 0x%x", + cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + + cifs_sb->actimeo = pvolume_info->actimeo; + cifs_sb->local_nls = pvolume_info->local_nls; + + if (pvolume_info->noperm) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; + if (pvolume_info->setuids) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; + if (pvolume_info->server_ino) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; + if (pvolume_info->remap) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; + if (pvolume_info->no_xattr) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; + if (pvolume_info->sfu_emul) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; + if (pvolume_info->nobrl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; + if (pvolume_info->nostrictsync) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC; + if (pvolume_info->mand_lock) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL; + if (pvolume_info->rwpidforward) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; + if (pvolume_info->cifs_acl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; + if (pvolume_info->override_uid) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; + if (pvolume_info->override_gid) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; + if (pvolume_info->dynperm) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; + if (pvolume_info->fsc) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; + if (pvolume_info->multiuser) + cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | + CIFS_MOUNT_NO_PERM); + if (pvolume_info->strict_io) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; + if (pvolume_info->direct_io) { + cFYI(1, "mounting share using direct i/o"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; + } + if (pvolume_info->mfsymlinks) { + if (pvolume_info->sfu_emul) { + cERROR(1, "mount option mfsymlinks ignored if sfu " + "mount option is used"); + } else { + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; + } + } + + if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) + cERROR(1, "mount option dynperm ignored if cifsacl " + "mount option supported"); +} + +/* + * When the server supports very large writes via POSIX extensions, we can + * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including + * the RFC1001 length. 
+ * + * Note that this might make for "interesting" allocation problems during + * writeback however as we have to allocate an array of pointers for the + * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + */ +#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) + +/* + * When the server doesn't allow large posix writes, only allow a wsize of + * 2^17-1 minus the size of the WRITE_AND_X header. That allows for a write up + * to the maximum size described by RFC1002. + */ +#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) + +/* + * The default wsize is 1M. find_get_pages seems to return a maximum of 256 + * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill + * a single wsize request with a single call. + */ +#define CIFS_DEFAULT_WSIZE (1024 * 1024) + +static unsigned int +cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : + CIFS_DEFAULT_WSIZE; + + /* can server support 24-bit write sizes? (via UNIX extensions) */ + if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) + wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE); + + /* + * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set? + * Limit it to max buffer offered by the server, minus the size of the + * WRITEX header, not including the 4 byte RFC1001 length. + */ + if (!(server->capabilities & CAP_LARGE_WRITE_X) || + (!(server->capabilities & CAP_UNIX) && + (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)))) + wsize = min_t(unsigned int, wsize, + server->maxBuf - sizeof(WRITE_REQ) + 4); + + /* hard limit of CIFS_MAX_WSIZE */ + wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); + + return wsize; +} + +static int +is_path_accessible(int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path) +{ + int rc; + FILE_ALL_INFO *pfile_info; + + pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (pfile_info == NULL) + return -ENOMEM; + + rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info, + 0 /* not legacy */, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc == -EOPNOTSUPP || rc == -EINVAL) + rc = SMBQueryInformation(xid, tcon, full_path, pfile_info, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + kfree(pfile_info); + return rc; +} + +static void +cleanup_volume_info_contents(struct smb_vol *volume_info) +{ + kfree(volume_info->username); + kzfree(volume_info->password); + kfree(volume_info->UNC); + if (volume_info->UNCip != volume_info->UNC + 2) + kfree(volume_info->UNCip); + kfree(volume_info->domainname); + kfree(volume_info->iocharset); + kfree(volume_info->prepath); +} + +void +cifs_cleanup_volume_info(struct smb_vol *volume_info) +{ + if (!volume_info) + return; + cleanup_volume_info_contents(volume_info); + kfree(volume_info); +} + + +#ifdef CONFIG_CIFS_DFS_UPCALL +/* build_path_to_root returns full path to root when + * we do not have an exiting connection (tcon) */ +static char * +build_unc_path_to_root(const struct smb_vol *vol, + const struct cifs_sb_info *cifs_sb) +{ + char *full_path, *pos; + unsigned int pplen = vol->prepath ? 
strlen(vol->prepath) : 0; + unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); + + full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); + if (full_path == NULL) + return ERR_PTR(-ENOMEM); + + strncpy(full_path, vol->UNC, unc_len); + pos = full_path + unc_len; + + if (pplen) { + strncpy(pos, vol->prepath, pplen); + pos += pplen; + } + + *pos = '\0'; /* add trailing null */ + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + cFYI(1, "%s: full_path=%s", __func__, full_path); + return full_path; +} + +/* + * Perform a dfs referral query for a share and (optionally) prefix + * + * If a referral is found, cifs_sb->mountdata will be (re-)allocated + * to a string containing updated options for the submount. Otherwise it + * will be left untouched. + * + * Returns the rc from get_dfs_path to the caller, which can be used to + * determine whether there were referrals. + */ +static int +expand_dfs_referral(int xid, struct cifs_ses *pSesInfo, + struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb, + int check_prefix) +{ + int rc; + unsigned int num_referrals = 0; + struct dfs_info3_param *referrals = NULL; + char *full_path = NULL, *ref_path = NULL, *mdata = NULL; + + full_path = build_unc_path_to_root(volume_info, cifs_sb); + if (IS_ERR(full_path)) + return PTR_ERR(full_path); + + /* For DFS paths, skip the first '\' of the UNC */ + ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1; + + rc = get_dfs_path(xid, pSesInfo , ref_path, cifs_sb->local_nls, + &num_referrals, &referrals, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (!rc && num_referrals > 0) { + char *fake_devname = NULL; + + mdata = cifs_compose_mount_options(cifs_sb->mountdata, + full_path + 1, referrals, + &fake_devname); + + free_dfs_info_array(referrals, num_referrals); + + if (IS_ERR(mdata)) { + rc = PTR_ERR(mdata); + mdata = NULL; + } else { + cleanup_volume_info_contents(volume_info); + memset(volume_info, '\0', sizeof(*volume_info)); + rc = cifs_setup_volume_info(volume_info, mdata, + fake_devname); + } + kfree(fake_devname); + kfree(cifs_sb->mountdata); + cifs_sb->mountdata = mdata; + } + kfree(full_path); + return rc; +} +#endif + +static int +cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname) +{ + int rc = 0; + + if (cifs_parse_mount_options(mount_data, devname, volume_info)) + return -EINVAL; + + if (volume_info->nullauth) { + cFYI(1, "null user"); + volume_info->username = kzalloc(1, GFP_KERNEL); + if (volume_info->username == NULL) + return -ENOMEM; + } else if (volume_info->username) { + /* BB fixme parse for domain name here */ + cFYI(1, "Username: %s", volume_info->username); + } else { + cifserror("No username specified"); + /* In userspace mount helper we can get user name from alternate + locations such as env variables and files on disk */ + return -EINVAL; + } + + /* this is needed for ASCII cp to Unicode converts */ + if (volume_info->iocharset == NULL) { + /* load_nls_default cannot return null */ + volume_info->local_nls = load_nls_default(); + } else { + volume_info->local_nls = load_nls(volume_info->iocharset); + if (volume_info->local_nls == NULL) { + cERROR(1, "CIFS mount error: iocharset %s not found", + volume_info->iocharset); + return -ELIBACC; + } + } + + return rc; +} + +struct smb_vol * +cifs_get_volume_info(char *mount_data, const char *devname) +{ + int rc; + struct smb_vol *volume_info; + + volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); + if (!volume_info) + return ERR_PTR(-ENOMEM); + + rc = 
cifs_setup_volume_info(volume_info, mount_data, devname); + if (rc) { + cifs_cleanup_volume_info(volume_info); + volume_info = ERR_PTR(rc); + } + + return volume_info; +} + +int +cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) +{ + int rc; + int xid; + struct cifs_ses *pSesInfo; + struct cifs_tcon *tcon; + struct TCP_Server_Info *srvTcp; + char *full_path; + struct tcon_link *tlink; +#ifdef CONFIG_CIFS_DFS_UPCALL + int referral_walks_count = 0; +#endif + + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + if (rc) + return rc; + + cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; + +#ifdef CONFIG_CIFS_DFS_UPCALL +try_mount_again: + /* cleanup activities if we're chasing a referral */ + if (referral_walks_count) { + if (tcon) + cifs_put_tcon(tcon); + else if (pSesInfo) + cifs_put_smb_ses(pSesInfo); + + FreeXid(xid); + } +#endif + rc = 0; + tcon = NULL; + pSesInfo = NULL; + srvTcp = NULL; + full_path = NULL; + tlink = NULL; + + xid = GetXid(); + + /* get a reference to a tcp session */ + srvTcp = cifs_get_tcp_session(volume_info); + if (IS_ERR(srvTcp)) { + rc = PTR_ERR(srvTcp); + bdi_destroy(&cifs_sb->bdi); + goto out; + } + + /* get a reference to a SMB session */ + pSesInfo = cifs_get_smb_ses(srvTcp, volume_info); + if (IS_ERR(pSesInfo)) { + rc = PTR_ERR(pSesInfo); + pSesInfo = NULL; + goto mount_fail_check; + } + + /* search for existing tcon to this server share */ + tcon = cifs_get_tcon(pSesInfo, volume_info); + if (IS_ERR(tcon)) { + rc = PTR_ERR(tcon); + tcon = NULL; + goto remote_path_check; + } + + /* tell server which Unix caps we support */ + if (tcon->ses->capabilities & CAP_UNIX) { + /* reset of caps checks mount to see if unix extensions + disabled for just this mount */ + reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info); + if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && + (le64_to_cpu(tcon->fsUnixInfo.Capability) & + CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { + rc = -EACCES; + goto mount_fail_check; + } + } else + tcon->unix_ext = 0; /* server does not support them */ + + /* do not care if following two calls succeed - informational */ + if (!tcon->ipc) { + CIFSSMBQFSDeviceInfo(xid, tcon); + CIFSSMBQFSAttributeInfo(xid, tcon); + } + + if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { + cifs_sb->rsize = 1024 * 127; + cFYI(DBG2, "no very large read support, rsize now 127K"); + } + if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) + cifs_sb->rsize = min(cifs_sb->rsize, + (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); + + cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); + +remote_path_check: +#ifdef CONFIG_CIFS_DFS_UPCALL + /* + * Perform an unconditional check for whether there are DFS + * referrals for this path without prefix, to provide support + * for DFS referrals from w2k8 servers which don't seem to respond + * with PATH_NOT_COVERED to requests that include the prefix. + * Chase the referral if found, otherwise continue normally. 
+ */ + if (referral_walks_count == 0) { + int refrc = expand_dfs_referral(xid, pSesInfo, volume_info, + cifs_sb, false); + if (!refrc) { + referral_walks_count++; + goto try_mount_again; + } + } +#endif + + /* check if a whole path is not remote */ + if (!rc && tcon) { + /* build_path_to_root works only when we have a valid tcon */ + full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon); + if (full_path == NULL) { + rc = -ENOMEM; + goto mount_fail_check; + } + rc = is_path_accessible(xid, tcon, cifs_sb, full_path); + if (rc != 0 && rc != -EREMOTE) { + kfree(full_path); + goto mount_fail_check; + } + kfree(full_path); + } + + /* get referral if needed */ + if (rc == -EREMOTE) { +#ifdef CONFIG_CIFS_DFS_UPCALL + if (referral_walks_count > MAX_NESTED_LINKS) { + /* + * BB: when we implement proper loop detection, + * we will remove this check. But now we need it + * to prevent an indefinite loop if 'DFS tree' is + * misconfigured (i.e. has loops). + */ + rc = -ELOOP; + goto mount_fail_check; + } + + rc = expand_dfs_referral(xid, pSesInfo, volume_info, cifs_sb, + true); + + if (!rc) { + referral_walks_count++; + goto try_mount_again; + } + goto mount_fail_check; +#else /* No DFS support, return error on mount */ + rc = -EOPNOTSUPP; +#endif + } + + if (rc) + goto mount_fail_check; + + /* now, hang the tcon off of the superblock */ + tlink = kzalloc(sizeof *tlink, GFP_KERNEL); + if (tlink == NULL) { + rc = -ENOMEM; + goto mount_fail_check; + } + + tlink->tl_uid = pSesInfo->linux_uid; + tlink->tl_tcon = tcon; + tlink->tl_time = jiffies; + set_bit(TCON_LINK_MASTER, &tlink->tl_flags); + set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + + cifs_sb->master_tlink = tlink; + spin_lock(&cifs_sb->tlink_tree_lock); + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + + queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, + TLINK_IDLE_EXPIRE); + +mount_fail_check: + /* on error free sesinfo and tcon struct if needed */ + if (rc) { + /* If find_unc succeeded then rc == 0 so we can not end */ + /* up accidentally freeing someone elses tcon struct */ + if (tcon) + cifs_put_tcon(tcon); + else if (pSesInfo) + cifs_put_smb_ses(pSesInfo); + else + cifs_put_tcp_session(srvTcp); + bdi_destroy(&cifs_sb->bdi); + goto out; + } + + /* volume_info->password is freed above when existing session found + (in which case it is not needed anymore) but when new sesion is created + the password ptr is put in the new session structure (in which case the + password will be freed at unmount time) */ +out: + /* zero out password before freeing */ + FreeXid(xid); + return rc; +} + +/* + * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon + * pointer may be NULL. 
+ */ +int +CIFSTCon(unsigned int xid, struct cifs_ses *ses, + const char *tree, struct cifs_tcon *tcon, + const struct nls_table *nls_codepage) +{ + struct smb_hdr *smb_buffer; + struct smb_hdr *smb_buffer_response; + TCONX_REQ *pSMB; + TCONX_RSP *pSMBr; + unsigned char *bcc_ptr; + int rc = 0; + int length; + __u16 bytes_left, count; + + if (ses == NULL) + return -EIO; + + smb_buffer = cifs_buf_get(); + if (smb_buffer == NULL) + return -ENOMEM; + + smb_buffer_response = smb_buffer; + + header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, + NULL /*no tid */ , 4 /*wct */ ); + + smb_buffer->Mid = GetNextMid(ses->server); + smb_buffer->Uid = ses->Suid; + pSMB = (TCONX_REQ *) smb_buffer; + pSMBr = (TCONX_RSP *) smb_buffer_response; + + pSMB->AndXCommand = 0xFF; + pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); + bcc_ptr = &pSMB->Password[0]; + if (!tcon || (ses->server->sec_mode & SECMODE_USER)) { + pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ + *bcc_ptr = 0; /* password is null byte */ + bcc_ptr++; /* skip password */ + /* already aligned so no need to do it below */ + } else { + pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + /* BB FIXME add code to fail this if NTLMv2 or Kerberos + specified as required (when that support is added to + the vfs in the future) as only NTLM or the much + weaker LANMAN (which we do not send by default) is accepted + by Samba (not sure whether other servers allow + NTLMv2 password here) */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + if ((global_secflags & CIFSSEC_MAY_LANMAN) && + (ses->server->secType == LANMAN)) + calc_lanman_hash(tcon->password, ses->server->cryptkey, + ses->server->sec_mode & + SECMODE_PW_ENCRYPT ? true : false, + bcc_ptr); + else +#endif /* CIFS_WEAK_PW_HASH */ + rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, + bcc_ptr); + + bcc_ptr += CIFS_AUTH_RESP_SIZE; + if (ses->capabilities & CAP_UNICODE) { + /* must align unicode strings */ + *bcc_ptr = 0; /* null byte password */ + bcc_ptr++; + } + } + + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + if (ses->capabilities & CAP_STATUS32) { + smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS; + } + if (ses->capabilities & CAP_DFS) { + smb_buffer->Flags2 |= SMBFLG2_DFS; + } + if (ses->capabilities & CAP_UNICODE) { + smb_buffer->Flags2 |= SMBFLG2_UNICODE; + length = + cifs_strtoUCS((__le16 *) bcc_ptr, tree, + 6 /* max utf8 char length in bytes */ * + (/* server len*/ + 256 /* share len */), nls_codepage); + bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */ + bcc_ptr += 2; /* skip trailing null */ + } else { /* ASCII */ + strcpy(bcc_ptr, tree); + bcc_ptr += strlen(tree) + 1; + } + strcpy(bcc_ptr, "?????"); + bcc_ptr += strlen("?????"); + bcc_ptr += 1; + count = bcc_ptr - &pSMB->Password[0]; + pSMB->hdr.smb_buf_length = cpu_to_be32(be32_to_cpu( + pSMB->hdr.smb_buf_length) + count); + pSMB->ByteCount = cpu_to_le16(count); + + rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, + 0); + + /* above now done in SendReceive */ + if ((rc == 0) && (tcon != NULL)) { + bool is_unicode; + + tcon->tidStatus = CifsGood; + tcon->need_reconnect = false; + tcon->tid = smb_buffer_response->Tid; + bcc_ptr = pByteArea(smb_buffer_response); + bytes_left = get_bcc(smb_buffer_response); + length = strnlen(bcc_ptr, bytes_left - 2); + if (smb_buffer->Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + + + /* skip service field (NB: this field is always ASCII) */ + if (length 
== 3) { + if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && + (bcc_ptr[2] == 'C')) { + cFYI(1, "IPC connection"); + tcon->ipc = 1; + } + } else if (length == 2) { + if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { + /* the most common case */ + cFYI(1, "disk share connection"); + } + } + bcc_ptr += length + 1; + bytes_left -= (length + 1); + strncpy(tcon->treeName, tree, MAX_TREE_SIZE); + + /* mostly informational -- no need to fail on error here */ + kfree(tcon->nativeFileSystem); + tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr, + bytes_left, is_unicode, + nls_codepage); + + cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem); + + if ((smb_buffer_response->WordCount == 3) || + (smb_buffer_response->WordCount == 7)) + /* field is in same location */ + tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); + else + tcon->Flags = 0; + cFYI(1, "Tcon flags: 0x%x ", tcon->Flags); + } else if ((rc == 0) && tcon == NULL) { + /* all we need to save for IPC$ connection */ + ses->ipc_tid = smb_buffer_response->Tid; + } + + cifs_buf_release(smb_buffer); + return rc; +} + +void +cifs_umount(struct cifs_sb_info *cifs_sb) +{ + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; + + cancel_delayed_work_sync(&cifs_sb->prune_tlinks); + + spin_lock(&cifs_sb->tlink_tree_lock); + while ((node = rb_first(root))) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(node, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); + + bdi_destroy(&cifs_sb->bdi); + kfree(cifs_sb->mountdata); + unload_nls(cifs_sb->local_nls); + kfree(cifs_sb); +} + +int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) +{ + int rc = 0; + struct TCP_Server_Info *server = ses->server; + + /* only send once per connect */ + if (server->maxBuf != 0) + return 0; + + rc = CIFSSMBNegotiate(xid, ses); + if (rc == -EAGAIN) { + /* retry only once on 1st time connection */ + rc = CIFSSMBNegotiate(xid, ses); + if (rc == -EAGAIN) + rc = -EHOSTDOWN; + } + if (rc == 0) { + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus == CifsNeedNegotiate) + server->tcpStatus = CifsGood; + else + rc = -EHOSTDOWN; + spin_unlock(&GlobalMid_Lock); + + } + + return rc; +} + + +int cifs_setup_session(unsigned int xid, struct cifs_ses *ses, + struct nls_table *nls_info) +{ + int rc = 0; + struct TCP_Server_Info *server = ses->server; + + ses->flags = 0; + ses->capabilities = server->capabilities; + if (linuxExtEnabled == 0) + ses->capabilities &= (~CAP_UNIX); + + cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", + server->sec_mode, server->capabilities, server->timeAdj); + + rc = CIFS_SessSetup(xid, ses, nls_info); + if (rc) { + cERROR(1, "Send error in SessSetup = %d", rc); + } else { + mutex_lock(&ses->server->srv_mutex); + if (!server->session_estab) { + server->session_key.response = ses->auth_key.response; + server->session_key.len = ses->auth_key.len; + server->sequence_number = 0x2; + server->session_estab = true; + ses->auth_key.response = NULL; + } + mutex_unlock(&server->srv_mutex); + + cFYI(1, "CIFS Session Established successfully"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + } + + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + kfree(ses->ntlmssp); + 
ses->ntlmssp = NULL; + + return rc; +} + +static struct cifs_tcon * +cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) +{ + struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb); + struct cifs_ses *ses; + struct cifs_tcon *tcon = NULL; + struct smb_vol *vol_info; + char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */ + /* We used to have this as MAX_USERNAME which is */ + /* way too big now (256 instead of 32) */ + + vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL); + if (vol_info == NULL) { + tcon = ERR_PTR(-ENOMEM); + goto out; + } + + snprintf(username, sizeof(username), "krb50x%x", fsuid); + vol_info->username = username; + vol_info->local_nls = cifs_sb->local_nls; + vol_info->linux_uid = fsuid; + vol_info->cred_uid = fsuid; + vol_info->UNC = master_tcon->treeName; + vol_info->retry = master_tcon->retry; + vol_info->nocase = master_tcon->nocase; + vol_info->local_lease = master_tcon->local_lease; + vol_info->no_linux_ext = !master_tcon->unix_ext; + + /* FIXME: allow for other secFlg settings */ + vol_info->secFlg = CIFSSEC_MUST_KRB5; + + /* get a reference for the same TCP session */ + spin_lock(&cifs_tcp_ses_lock); + ++master_tcon->ses->server->srv_count; + spin_unlock(&cifs_tcp_ses_lock); + + ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info); + if (IS_ERR(ses)) { + tcon = (struct cifs_tcon *)ses; + cifs_put_tcp_session(master_tcon->ses->server); + goto out; + } + + tcon = cifs_get_tcon(ses, vol_info); + if (IS_ERR(tcon)) { + cifs_put_smb_ses(ses); + goto out; + } + + if (ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0, tcon, NULL, vol_info); +out: + kfree(vol_info); + + return tcon; +} + +struct cifs_tcon * +cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) +{ + return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); +} + +static int +cifs_sb_tcon_pending_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* find and return a tlink with given uid */ +static struct tcon_link * +tlink_rb_search(struct rb_root *root, uid_t uid) +{ + struct rb_node *node = root->rb_node; + struct tcon_link *tlink; + + while (node) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + + if (tlink->tl_uid > uid) + node = node->rb_left; + else if (tlink->tl_uid < uid) + node = node->rb_right; + else + return tlink; + } + return NULL; +} + +/* insert a tcon_link into the tree */ +static void +tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tcon_link *tlink; + + while (*new) { + tlink = rb_entry(*new, struct tcon_link, tl_rbnode); + parent = *new; + + if (tlink->tl_uid > new_tlink->tl_uid) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_tlink->tl_rbnode, parent, new); + rb_insert_color(&new_tlink->tl_rbnode, root); +} + +/* + * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the + * current task. + * + * If the superblock doesn't refer to a multiuser mount, then just return + * the master tcon for the mount. + * + * First, search the rbtree for an existing tcon for this fsuid. If one + * exists, then check to see if it's pending construction. If it is then wait + * for construction to complete. Once it's no longer pending, check to see if + * it failed and either return an error or retry construction, depending on + * the timeout. + * + * If one doesn't exist then insert a new tcon_link struct into the tree and + * try to construct a new one. 
+ */ +struct tcon_link * +cifs_sb_tlink(struct cifs_sb_info *cifs_sb) +{ + int ret; + uid_t fsuid = current_fsuid(); + struct tcon_link *tlink, *newtlink; + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); + + spin_lock(&cifs_sb->tlink_tree_lock); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); + if (tlink) + cifs_get_tlink(tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + + if (tlink == NULL) { + newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); + if (newtlink == NULL) + return ERR_PTR(-ENOMEM); + newtlink->tl_uid = fsuid; + newtlink->tl_tcon = ERR_PTR(-EACCES); + set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); + set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); + cifs_get_tlink(newtlink); + + spin_lock(&cifs_sb->tlink_tree_lock); + /* was one inserted after previous search? */ + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); + if (tlink) { + cifs_get_tlink(tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + kfree(newtlink); + goto wait_for_construction; + } + tlink = newtlink; + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + } else { +wait_for_construction: + ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, + cifs_sb_tcon_pending_wait, + TASK_INTERRUPTIBLE); + if (ret) { + cifs_put_tlink(tlink); + return ERR_PTR(ret); + } + + /* if it's good, return it */ + if (!IS_ERR(tlink->tl_tcon)) + return tlink; + + /* return error if we tried this already recently */ + if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) { + cifs_put_tlink(tlink); + return ERR_PTR(-EACCES); + } + + if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags)) + goto wait_for_construction; + } + + tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid); + clear_bit(TCON_LINK_PENDING, &tlink->tl_flags); + wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING); + + if (IS_ERR(tlink->tl_tcon)) { + cifs_put_tlink(tlink); + return ERR_PTR(-EACCES); + } + + return tlink; +} + +/* + * periodic workqueue job that scans tcon_tree for a superblock and closes + * out tcons. + */ +static void +cifs_prune_tlinks(struct work_struct *work) +{ + struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, + prune_tlinks.work); + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node = rb_first(root); + struct rb_node *tmp; + struct tcon_link *tlink; + + /* + * Because we drop the spinlock in the loop in order to put the tlink + * it's not guarded against removal of links from the tree. The only + * places that remove entries from the tree are this function and + * umounts. Because this function is non-reentrant and is canceled + * before umount can proceed, this is safe. 
+ */ + spin_lock(&cifs_sb->tlink_tree_lock); + node = rb_first(root); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + tlink = rb_entry(tmp, struct tcon_link, tl_rbnode); + + if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) || + atomic_read(&tlink->tl_count) != 0 || + time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies)) + continue; + + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(tmp, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); + + queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, + TLINK_IDLE_EXPIRE); +} diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c new file mode 100644 index 00000000..ed5c07b0 --- /dev/null +++ b/fs/cifs/dir.c @@ -0,0 +1,742 @@ +/* + * fs/cifs/dir.c + * + * vfs operations that deal with dentries + * + * Copyright (C) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +static void +renew_parental_timestamps(struct dentry *direntry) +{ + /* BB check if there is a way to get the kernel to do this or if we + really need this */ + do { + direntry->d_time = jiffies; + direntry = direntry->d_parent; + } while (!IS_ROOT(direntry)); +} + +/* Note: caller must free return buffer */ +char * +build_path_from_dentry(struct dentry *direntry) +{ + struct dentry *temp; + int namelen; + int dfsplen; + char *full_path; + char dirsep; + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + unsigned seq; + + if (direntry == NULL) + return NULL; /* not much we can do if dentry is freed and + we need to reopen the file after it was closed implicitly + when the server crashed */ + + dirsep = CIFS_DIR_SEP(cifs_sb); + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + else + dfsplen = 0; +cifs_bp_rename_retry: + namelen = dfsplen; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + for (temp = direntry; !IS_ROOT(temp);) { + namelen += (1 + temp->d_name.len); + temp = temp->d_parent; + if (temp == NULL) { + cERROR(1, "corrupt dentry"); + rcu_read_unlock(); + return NULL; + } + } + rcu_read_unlock(); + + full_path = kmalloc(namelen+1, GFP_KERNEL); + if (full_path == NULL) + return full_path; + full_path[namelen] = 0; /* trailing null */ + rcu_read_lock(); + for (temp = direntry; !IS_ROOT(temp);) { + spin_lock(&temp->d_lock); + namelen -= 1 + temp->d_name.len; + if (namelen < 0) { + spin_unlock(&temp->d_lock); + break; + } 
else { + full_path[namelen] = dirsep; + strncpy(full_path + namelen + 1, temp->d_name.name, + temp->d_name.len); + cFYI(0, "name: %s", full_path + namelen); + } + spin_unlock(&temp->d_lock); + temp = temp->d_parent; + if (temp == NULL) { + cERROR(1, "corrupt dentry"); + rcu_read_unlock(); + kfree(full_path); + return NULL; + } + } + rcu_read_unlock(); + if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { + cFYI(1, "did not end path lookup where expected. namelen=%d " + "dfsplen=%d", namelen, dfsplen); + /* presumably this is only possible if racing with a rename + of one of the parent directories (we can not lock the dentries + above us to prevent this, but retrying should be harmless) */ + kfree(full_path); + goto cifs_bp_rename_retry; + } + /* DIR_SEP already set for byte 0 / vs \ but not for + subsequent slashes in prepath which currently must + be entered the right way - not sure if there is an alternative + since the '\' is a valid posix character so we can not switch + those safely to '/' if any are found in the middle of the prepath */ + /* BB test paths to Windows with '/' in the midst of prepath */ + + if (dfsplen) { + strncpy(full_path, tcon->treeName, dfsplen); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { + int i; + for (i = 0; i < dfsplen; i++) { + if (full_path[i] == '\\') + full_path[i] = '/'; + } + } + } + return full_path; +} + +/* Inode operations in similar order to how they appear in Linux file fs.h */ + +int +cifs_create(struct inode *inode, struct dentry *direntry, int mode, + struct nameidata *nd) +{ + int rc = -ENOENT; + int xid; + int create_options = CREATE_NOT_DIR; + __u32 oplock = 0; + int oflags; + /* + * BB below access is probably too much for mknod to request + * but we have to do query and setpathinfo so requesting + * less could fail (unless we want to request getatr and setatr + * permissions (only). At least for POSIX we do not have to + * request so much. + */ + int desiredAccess = GENERIC_READ | GENERIC_WRITE; + __u16 fileHandle; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + char *full_path = NULL; + FILE_ALL_INFO *buf = NULL; + struct inode *newinode = NULL; + int disposition = FILE_OVERWRITE_IF; + + xid = GetXid(); + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + FreeXid(xid); + return PTR_ERR(tlink); + } + tcon = tlink_tcon(tlink); + + if (oplockEnabled) + oplock = REQ_OPLOCK; + + if (nd && (nd->flags & LOOKUP_OPEN)) + oflags = nd->intent.open.file->f_flags; + else + oflags = O_RDONLY | O_CREAT; + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto cifs_create_out; + } + + if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = cifs_posix_open(full_path, &newinode, + inode->i_sb, mode, oflags, &oplock, &fileHandle, xid); + /* EIO could indicate that (posix open) operation is not + supported, despite what server claimed in capability + negotiation. 
EREMOTE indicates DFS junction, which is not + handled in posix open */ + + if (rc == 0) { + if (newinode == NULL) /* query inode info */ + goto cifs_create_get_file_info; + else /* success, no need to query */ + goto cifs_create_set_dentry; + } else if ((rc != -EIO) && (rc != -EREMOTE) && + (rc != -EOPNOTSUPP) && (rc != -EINVAL)) + goto cifs_create_out; + /* else fallthrough to retry, using older open call, this is + case where server does not support this SMB level, and + falsely claims capability (also get here for DFS case + which should be rare for path not covered on files) */ + } + + if (nd && (nd->flags & LOOKUP_OPEN)) { + /* if the file is going to stay open, then we + need to set the desired access properly */ + desiredAccess = 0; + if (OPEN_FMODE(oflags) & FMODE_READ) + desiredAccess |= GENERIC_READ; /* is this too little? */ + if (OPEN_FMODE(oflags) & FMODE_WRITE) + desiredAccess |= GENERIC_WRITE; + + if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + disposition = FILE_CREATE; + else if ((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + disposition = FILE_OVERWRITE_IF; + else if ((oflags & O_CREAT) == O_CREAT) + disposition = FILE_OPEN_IF; + else + cFYI(1, "Create flag not set in create function"); + } + + /* BB add processing to set equivalent of mode - e.g. via CreateX with + ACLs */ + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto cifs_create_out; + } + + /* + * if we're not using unix extensions, see if we need to set + * ATTR_READONLY on the create call + */ + if (!tcon->unix_ext && (mode & S_IWUGO) == 0) + create_options |= CREATE_OPTION_READONLY; + + if (tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBOpen(xid, tcon, full_path, disposition, + desiredAccess, create_options, + &fileHandle, &oplock, buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + else + rc = -EIO; /* no NT SMB support fall into legacy open below */ + + if (rc == -EIO) { + /* old server, retry the open legacy style */ + rc = SMBLegacyOpen(xid, tcon, full_path, disposition, + desiredAccess, create_options, + &fileHandle, &oplock, buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } + if (rc) { + cFYI(1, "cifs_create returned 0x%x", rc); + goto cifs_create_out; + } + + /* If Open reported that we actually created a file + then we now have to set the mode if possible */ + if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { + struct cifs_unix_set_info_args args = { + .mode = mode, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64) current_fsuid(); + if (inode->i_mode & S_ISGID) + args.gid = (__u64) inode->i_gid; + else + args.gid = (__u64) current_fsgid(); + } else { + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; + } + CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle, + current->tgid); + } else { + /* BB implement mode setting via Windows security + descriptors e.g. 
*/ + /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ + + /* Could set r/o dos attribute if mode & 0222 == 0 */ + } + +cifs_create_get_file_info: + /* server might mask mode so we have to query for it */ + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else { + rc = cifs_get_inode_info(&newinode, full_path, buf, + inode->i_sb, xid, &fileHandle); + if (newinode) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; + if ((oplock & CIFS_CREATE_ACTION) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { + newinode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + newinode->i_gid = inode->i_gid; + else + newinode->i_gid = current_fsgid(); + } + } + } + +cifs_create_set_dentry: + if (rc == 0) + d_instantiate(direntry, newinode); + else + cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); + + if (newinode && nd && (nd->flags & LOOKUP_OPEN)) { + struct cifsFileInfo *pfile_info; + struct file *filp; + + filp = lookup_instantiate_filp(nd, direntry, generic_file_open); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + CIFSSMBClose(xid, tcon, fileHandle); + goto cifs_create_out; + } + + pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock); + if (pfile_info == NULL) { + fput(filp); + CIFSSMBClose(xid, tcon, fileHandle); + rc = -ENOMEM; + } + } else { + CIFSSMBClose(xid, tcon, fileHandle); + } + +cifs_create_out: + kfree(buf); + kfree(full_path); + cifs_put_tlink(tlink); + FreeXid(xid); + return rc; +} + +int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, + dev_t device_number) +{ + int rc = -EPERM; + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifs_io_parms io_parms; + char *full_path = NULL; + struct inode *newinode = NULL; + int oplock = 0; + u16 fileHandle; + FILE_ALL_INFO *buf = NULL; + unsigned int bytes_written; + struct win_dev *pdev; + + if (!old_valid_dev(device_number)) + return -EINVAL; + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto mknod_out; + } + + if (pTcon->unix_ext) { + struct cifs_unix_set_info_args args = { + .mode = mode & ~current_umask(), + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = device_number, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64) current_fsuid(); + args.gid = (__u64) current_fsgid(); + } else { + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; + } + rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto mknod_out; + + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + + if (rc == 0) + d_instantiate(direntry, newinode); + goto mknod_out; + } + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto mknod_out; + + + cFYI(1, "sfu compat create special file"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + kfree(full_path); + rc = -ENOMEM; + FreeXid(xid); + return rc; + } + + /* FIXME: would WRITE_OWNER | WRITE_DAC be better? 
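+ (GENERIC_WRITE is enough for what follows: the only data written + into the new file is the struct win_dev record, an 8 byte type tag + such as "IntxCHR" or "IntxBLK" followed by le64 major/minor numbers)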
*/ + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, + GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, + &fileHandle, &oplock, buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto mknod_out; + + /* BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely */ + + pdev = (struct win_dev *)buf; + io_parms.netfid = fileHandle; + io_parms.pid = current->tgid; + io_parms.tcon = pTcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = + cpu_to_le64(MAJOR(device_number)); + pdev->minor = + cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, + &bytes_written, (char *)pdev, + NULL, 0); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = + cpu_to_le64(MAJOR(device_number)); + pdev->minor = + cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, + &bytes_written, (char *)pdev, + NULL, 0); + } /* else if (S_ISFIFO) */ + CIFSSMBClose(xid, pTcon, fileHandle); + d_drop(direntry); + + /* FIXME: add code here to set EAs */ + +mknod_out: + kfree(full_path); + kfree(buf); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +struct dentry * +cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, + struct nameidata *nd) +{ + int xid; + int rc = 0; /* to get around spurious gcc warning, set to zero here */ + __u32 oplock = 0; + __u16 fileHandle = 0; + bool posix_open = false; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifsFileInfo *cfile; + struct inode *newInode = NULL; + char *full_path = NULL; + struct file *filp; + + xid = GetXid(); + + cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p", + parent_dir_inode, direntry->d_name.name, direntry); + + /* check whether path exists */ + + cifs_sb = CIFS_SB(parent_dir_inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + FreeXid(xid); + return (struct dentry *)tlink; + } + pTcon = tlink_tcon(tlink); + + /* + * Don't allow the separator character in a path component. + * The VFS will not allow "/", but "\" is allowed by posix. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { + int i; + for (i = 0; i < direntry->d_name.len; i++) + if (direntry->d_name.name[i] == '\\') { + cFYI(1, "Invalid file name"); + rc = -EINVAL; + goto lookup_out; + } + } + + /* + * O_EXCL: optimize away the lookup, but don't hash the dentry. Let + * the VFS handle the create. + */ + if (nd && (nd->flags & LOOKUP_EXCL)) { + d_instantiate(direntry, NULL); + rc = 0; + goto lookup_out; + } + + /* can not grab the rename sem here since it would + deadlock in the cases (beginning of sys_rename itself) + in which we already have the sb rename sem */ + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto lookup_out; + } + + if (direntry->d_inode != NULL) { + cFYI(1, "non-NULL inode in lookup"); + } else { + cFYI(1, "NULL inode in lookup"); + } + cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode); + + /* Posix open is only called (at lookup time) for file create now. + * For opens (rather than creates), because we do not know if it + * is a file or directory yet, and current Samba no longer allows + * us to do posix open on dirs, we could end up wasting an open call + * on what turns out to be a dir. For file opens, we wait to call posix + * open till cifs_open. 
It could be added here (lookup) in the future + * but the performance tradeoff of the extra network request when EISDIR + * or EACCES is returned would have to be weighed against the 50% + * reduction in network traffic in the other paths. + */ + if (pTcon->unix_ext) { + if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && + (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && + (nd->intent.open.file->f_flags & O_CREAT)) { + rc = cifs_posix_open(full_path, &newInode, + parent_dir_inode->i_sb, + nd->intent.open.create_mode, + nd->intent.open.file->f_flags, &oplock, + &fileHandle, xid); + /* + * The check below works around a bug in POSIX + * open in samba versions 3.3.1 and earlier where + * open could incorrectly fail with invalid parameter. + * If either that or op not supported returned, follow + * the normal lookup. + */ + switch (rc) { + case 0: + /* + * The server may allow us to open things like + * FIFOs, but the client isn't set up to deal + * with that. If it's not a regular file, just + * close it and proceed as if it were a normal + * lookup. + */ + if (newInode && !S_ISREG(newInode->i_mode)) { + CIFSSMBClose(xid, pTcon, fileHandle); + break; + } + case -ENOENT: + posix_open = true; + case -EOPNOTSUPP: + break; + default: + pTcon->broken_posix_open = true; + } + } + if (!posix_open) + rc = cifs_get_inode_info_unix(&newInode, full_path, + parent_dir_inode->i_sb, xid); + } else + rc = cifs_get_inode_info(&newInode, full_path, NULL, + parent_dir_inode->i_sb, xid, NULL); + + if ((rc == 0) && (newInode != NULL)) { + d_add(direntry, newInode); + if (posix_open) { + filp = lookup_instantiate_filp(nd, direntry, + generic_file_open); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + CIFSSMBClose(xid, pTcon, fileHandle); + goto lookup_out; + } + + cfile = cifs_new_fileinfo(fileHandle, filp, tlink, + oplock); + if (cfile == NULL) { + fput(filp); + CIFSSMBClose(xid, pTcon, fileHandle); + rc = -ENOMEM; + goto lookup_out; + } + } + /* since paths are not looked up by component - the parent + directories are presumed to be good here */ + renew_parental_timestamps(direntry); + + } else if (rc == -ENOENT) { + rc = 0; + direntry->d_time = jiffies; + d_add(direntry, NULL); + /* if it was once a directory (but how can we tell?) we could do + shrink_dcache_parent(direntry); */ + } else if (rc != -EACCES) { + cERROR(1, "Unexpected lookup error %d", rc); + /* We special case check for Access Denied - since that + is a common return code */ + } + +lookup_out: + kfree(full_path); + cifs_put_tlink(tlink); + FreeXid(xid); + return ERR_PTR(rc); +} + +static int +cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) +{ + if (nd && (nd->flags & LOOKUP_RCU)) + return -ECHILD; + + if (direntry->d_inode) { + if (cifs_revalidate_dentry(direntry)) + return 0; + else + return 1; + } + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!nd) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. 
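+ * e.g. with the case insensitive dentry ops a cached negative dentry + * named "FOO" could otherwise satisfy a create of "foo", and the new + * file would end up created with the wrong case on the server.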
+ */ + if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + } + + if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) + return 0; + + return 1; +} + +/* static int cifs_d_delete(struct dentry *direntry) +{ + int rc = 0; + + cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name); + + return rc; +} */ + +const struct dentry_operations cifs_dentry_ops = { + .d_revalidate = cifs_d_revalidate, + .d_automount = cifs_dfs_d_automount, +/* d_delete: cifs_d_delete, */ /* not needed except for debugging */ +}; + +static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *q) +{ + struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; + unsigned long hash; + int i; + + hash = init_name_hash(); + for (i = 0; i < q->len; i++) + hash = partial_name_hash(nls_tolower(codepage, q->name[i]), + hash); + q->hash = end_name_hash(hash); + + return 0; +} + +static int cifs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls; + + if ((name->len == len) && + (nls_strnicmp(codepage, name->name, str, len) == 0)) + return 0; + return 1; +} + +const struct dentry_operations cifs_ci_dentry_ops = { + .d_revalidate = cifs_d_revalidate, + .d_hash = cifs_ci_hash, + .d_compare = cifs_ci_compare, + .d_automount = cifs_dfs_d_automount, +}; diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c new file mode 100644 index 00000000..548f0623 --- /dev/null +++ b/fs/cifs/dns_resolve.c @@ -0,0 +1,98 @@ +/* + * fs/cifs/dns_resolve.c + * + * Copyright (c) 2007 Igor Mammedov + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) + * + * Contains the CIFS DFS upcall routines used for hostname to + * IP address translation. + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "dns_resolve.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +/** + * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. + * @unc: UNC path specifying the server + * @ip_addr: Where to return the IP address. + * + * The IP address will be returned in string form, and the caller is + * responsible for freeing it. + * + * Returns length of result on success, -ve on error. 
+ */ +int +dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) +{ + struct sockaddr_storage ss; + const char *hostname, *sep; + char *name; + int len, rc; + + if (!ip_addr || !unc) + return -EINVAL; + + len = strlen(unc); + if (len < 3) { + cFYI(1, "%s: unc is too short: %s", __func__, unc); + return -EINVAL; + } + + /* Discount leading slashes for cifs */ + len -= 2; + hostname = unc + 2; + + /* Search for server name delimiter */ + sep = memchr(hostname, '\\', len); + if (sep) + len = sep - hostname; + else + cFYI(1, "%s: probably server name is whole unc: %s", + __func__, unc); + + /* Try to interpret hostname as an IPv4 or IPv6 address */ + rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len); + if (rc > 0) + goto name_is_IP_address; + + /* Perform the upcall */ + rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); + if (rc < 0) + cERROR(1, "%s: unable to resolve: %*.*s", + __func__, len, len, hostname); + else + cFYI(1, "%s: resolved: %*.*s to %s", + __func__, len, len, hostname, *ip_addr); + return rc; + +name_is_IP_address: + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + memcpy(name, hostname, len); + name[len] = 0; + cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name); + *ip_addr = name; + return 0; +} diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h new file mode 100644 index 00000000..d3f5d27f --- /dev/null +++ b/fs/cifs/dns_resolve.h @@ -0,0 +1,30 @@ +/* + * fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS + * Handles host name to IP address resolution + * + * Copyright (c) International Business Machines Corp., 2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _DNS_RESOLVE_H +#define _DNS_RESOLVE_H + +#ifdef __KERNEL__ +extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); +#endif /* KERNEL */ + +#endif /* _DNS_RESOLVE_H */ diff --git a/fs/cifs/export.c b/fs/cifs/export.c new file mode 100644 index 00000000..55d87ac5 --- /dev/null +++ b/fs/cifs/export.c @@ -0,0 +1,67 @@ +/* + * fs/cifs/export.c + * + * Copyright (C) International Business Machines Corp., 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Common Internet FileSystem (CIFS) client + * + * Operations related to support for exporting files via NFSD + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + /* + * See Documentation/filesystems/nfs/Exporting + * and examples in fs/exportfs + * + * Since cifs is a network file system, an "fsid" must be included for + * any nfs exports file entries which refer to cifs paths. In addition + * the cifs mount must be mounted with the "serverino" option (ie use stable + * server inode numbers instead of locally generated temporary ones). + * Although cifs inodes do not use generation numbers (have generation number + * of zero) - the inode number alone should be good enough for simple cases + * in which users want to export cifs shares with NFS. The decode and encode + * could be improved by using a new routine which expects 64 bit inode numbers + * instead of the default 32 bit routines in fs/exportfs + * + */ + +#include +#include +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifsfs.h" + +#ifdef CIFS_NFSD_EXPORT +static struct dentry *cifs_get_parent(struct dentry *dentry) +{ + /* BB need to add code here eventually to enable export via NFSD */ + cFYI(1, "get parent for %p", dentry); + return ERR_PTR(-EACCES); +} + +const struct export_operations cifs_export_ops = { + .get_parent = cifs_get_parent, +/* Following five export operations are unneeded so far and can default: + .get_dentry = + .get_name = + .find_exported_dentry = + .decode_fh = + .encode_fs = */ +}; + +#endif /* CIFS_NFSD_EXPORT */ + diff --git a/fs/cifs/file.c b/fs/cifs/file.c new file mode 100644 index 00000000..9040cb06 --- /dev/null +++ b/fs/cifs/file.c @@ -0,0 +1,2471 @@ +/* + * fs/cifs/file.c + * + * vfs operations that deal with files + * + * Copyright (C) International Business Machines Corp., 2002,2010 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "fscache.h" + +static inline int cifs_convert_flags(unsigned int flags) +{ + if ((flags & O_ACCMODE) == O_RDONLY) + return GENERIC_READ; + else if ((flags & O_ACCMODE) == O_WRONLY) + return GENERIC_WRITE; + else if ((flags & O_ACCMODE) == O_RDWR) { + /* GENERIC_ALL is too much permission to request + can cause unnecessary access denied on create */ + /* return GENERIC_ALL; */ + return (GENERIC_READ | GENERIC_WRITE); + } + + return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES | + FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA | + FILE_READ_DATA); +} + +static u32 cifs_posix_convert_flags(unsigned int flags) +{ + u32 posix_flags = 0; + + if ((flags & O_ACCMODE) == O_RDONLY) + posix_flags = SMB_O_RDONLY; + else if ((flags & O_ACCMODE) == O_WRONLY) + posix_flags = SMB_O_WRONLY; + else if ((flags & O_ACCMODE) == O_RDWR) + posix_flags = SMB_O_RDWR; + + if (flags & O_CREAT) + posix_flags |= SMB_O_CREAT; + if (flags & O_EXCL) + posix_flags |= SMB_O_EXCL; + if (flags & O_TRUNC) + posix_flags |= SMB_O_TRUNC; + /* be safe and imply O_SYNC for O_DSYNC */ + if (flags & O_DSYNC) + posix_flags |= SMB_O_SYNC; + if (flags & O_DIRECTORY) + posix_flags |= SMB_O_DIRECTORY; + if (flags & O_NOFOLLOW) + posix_flags |= SMB_O_NOFOLLOW; + if (flags & O_DIRECT) + posix_flags |= SMB_O_DIRECT; + + return posix_flags; +} + +static inline int cifs_get_disposition(unsigned int flags) +{ + if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + return FILE_CREATE; + else if ((flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + return FILE_OVERWRITE_IF; + else if ((flags & O_CREAT) == O_CREAT) + return FILE_OPEN_IF; + else if ((flags & O_TRUNC) == O_TRUNC) + return FILE_OVERWRITE; + else + return FILE_OPEN; +} + +int cifs_posix_open(char *full_path, struct inode **pinode, + struct super_block *sb, int mode, unsigned int f_flags, + __u32 *poplock, __u16 *pnetfid, int xid) +{ + int rc; + FILE_UNIX_BASIC_INFO *presp_data; + __u32 posix_flags = 0; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_fattr fattr; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + + cFYI(1, "posix open %s", full_path); + + presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (presp_data == NULL) + return -ENOMEM; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto posix_open_ret; + } + + tcon = tlink_tcon(tlink); + mode &= ~current_umask(); + + posix_flags = cifs_posix_convert_flags(f_flags); + rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data, + poplock, full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + + if (rc) + goto posix_open_ret; + + if (presp_data->Type == cpu_to_le32(-1)) + goto posix_open_ret; /* open ok, caller does qpathinfo */ + + if (!pinode) + goto posix_open_ret; /* caller does not need info */ + + cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb); + + /* get new inode and set it up */ + if (*pinode == NULL) { + cifs_fill_uniqueid(sb, 
&fattr); + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) { + rc = -ENOMEM; + goto posix_open_ret; + } + } else { + cifs_fattr_to_inode(*pinode, &fattr); + } + +posix_open_ret: + kfree(presp_data); + return rc; +} + +static int +cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, unsigned int f_flags, __u32 *poplock, + __u16 *pnetfid, int xid) +{ + int rc; + int desiredAccess; + int disposition; + FILE_ALL_INFO *buf; + + desiredAccess = cifs_convert_flags(f_flags); + +/********************************************************************* + * open flag mapping table: + * + * POSIX Flag CIFS Disposition + * ---------- ---------------- + * O_CREAT FILE_OPEN_IF + * O_CREAT | O_EXCL FILE_CREATE + * O_CREAT | O_TRUNC FILE_OVERWRITE_IF + * O_TRUNC FILE_OVERWRITE + * none of the above FILE_OPEN + * + * Note that there is not a direct match between disposition + * FILE_SUPERSEDE (ie create whether or not file exists although + * O_CREAT | O_TRUNC is similar but truncates the existing + * file rather than creating a new file as FILE_SUPERSEDE does + * (which uses the attributes / metadata passed in on open call) + *? + *? O_SYNC is a reasonable match to CIFS writethrough flag + *? and the read write flags match reasonably. O_LARGEFILE + *? is irrelevant because largefile support is always used + *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY, + * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation + *********************************************************************/ + + disposition = cifs_get_disposition(f_flags); + + /* BB pass O_SYNC flag through on file attributes .. BB */ + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBOpen(xid, tcon, full_path, disposition, + desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags + & CIFS_MOUNT_MAP_SPECIAL_CHR); + else + rc = SMBLegacyOpen(xid, tcon, full_path, disposition, + desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags + & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc) + goto out; + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, + xid); + else + rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, + xid, pnetfid); + +out: + kfree(buf); + return rc; +} + +struct cifsFileInfo * +cifs_new_fileinfo(__u16 fileHandle, struct file *file, + struct tcon_link *tlink, __u32 oplock) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct cifsInodeInfo *pCifsInode = CIFS_I(inode); + struct cifsFileInfo *pCifsFile; + + pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (pCifsFile == NULL) + return pCifsFile; + + pCifsFile->count = 1; + pCifsFile->netfid = fileHandle; + pCifsFile->pid = current->tgid; + pCifsFile->uid = current_fsuid(); + pCifsFile->dentry = dget(dentry); + pCifsFile->f_flags = file->f_flags; + pCifsFile->invalidHandle = false; + pCifsFile->tlink = cifs_get_tlink(tlink); + mutex_init(&pCifsFile->fh_mutex); + mutex_init(&pCifsFile->lock_mutex); + INIT_LIST_HEAD(&pCifsFile->llist); + INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); + + spin_lock(&cifs_file_list_lock); + list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList)); + /* if readable file instance put first in list*/ + if (file->f_mode & FMODE_READ) + list_add(&pCifsFile->flist, &pCifsInode->openFileList); + else + 
list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); + spin_unlock(&cifs_file_list_lock); + + cifs_set_oplock_level(pCifsInode, oplock); + + file->private_data = pCifsFile; + return pCifsFile; +} + +/* + * Release a reference on the file private data. This may involve closing + * the filehandle out on the server. Must be called without holding + * cifs_file_list_lock. + */ +void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + struct inode *inode = cifs_file->dentry->d_inode; + struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsLockInfo *li, *tmp; + + spin_lock(&cifs_file_list_lock); + if (--cifs_file->count > 0) { + spin_unlock(&cifs_file_list_lock); + return; + } + + /* remove it from the lists */ + list_del(&cifs_file->flist); + list_del(&cifs_file->tlist); + + if (list_empty(&cifsi->openFileList)) { + cFYI(1, "closing last open instance for inode %p", + cifs_file->dentry->d_inode); + + /* in strict cache mode we need invalidate mapping on the last + close because it may cause a error when we open this file + again and get at least level II oplock */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + CIFS_I(inode)->invalid_mapping = true; + + cifs_set_oplock_level(cifsi, 0); + } + spin_unlock(&cifs_file_list_lock); + + if (!tcon->need_reconnect && !cifs_file->invalidHandle) { + int xid, rc; + + xid = GetXid(); + rc = CIFSSMBClose(xid, tcon, cifs_file->netfid); + FreeXid(xid); + } + + /* Delete any outstanding lock records. We'll lose them when the file + * is closed anyway. + */ + mutex_lock(&cifs_file->lock_mutex); + list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { + list_del(&li->llist); + kfree(li); + } + mutex_unlock(&cifs_file->lock_mutex); + + cifs_put_tlink(cifs_file->tlink); + dput(cifs_file->dentry); + kfree(cifs_file); +} + +int cifs_open(struct inode *inode, struct file *file) +{ + int rc = -EACCES; + int xid; + __u32 oplock; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct tcon_link *tlink; + struct cifsFileInfo *pCifsFile = NULL; + char *full_path = NULL; + bool posix_open_ok = false; + __u16 netfid; + + xid = GetXid(); + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + FreeXid(xid); + return PTR_ERR(tlink); + } + tcon = tlink_tcon(tlink); + + full_path = build_path_from_dentry(file->f_path.dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + cFYI(1, "inode = 0x%p file flags are 0x%x for %s", + inode, file->f_flags, full_path); + + if (oplockEnabled) + oplock = REQ_OPLOCK; + else + oplock = 0; + + if (!tcon->broken_posix_open && tcon->unix_ext && + (tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + /* can not refresh inode info since size could be stale */ + rc = cifs_posix_open(full_path, &inode, inode->i_sb, + cifs_sb->mnt_file_mode /* ignored */, + file->f_flags, &oplock, &netfid, xid); + if (rc == 0) { + cFYI(1, "posix open succeeded"); + posix_open_ok = true; + } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + if (tcon->ses->serverNOS) + cERROR(1, "server %s of type %s returned" + " unexpected error on SMB posix open" + ", disabling posix open support." 
+ " Check if server update available.", + tcon->ses->serverName, + tcon->ses->serverNOS); + tcon->broken_posix_open = true; + } else if ((rc != -EIO) && (rc != -EREMOTE) && + (rc != -EOPNOTSUPP)) /* path not found or net err */ + goto out; + /* else fallthrough to retry open the old way on network i/o + or DFS errors */ + } + + if (!posix_open_ok) { + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, + file->f_flags, &oplock, &netfid, xid); + if (rc) + goto out; + } + + pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock); + if (pCifsFile == NULL) { + CIFSSMBClose(xid, tcon, netfid); + rc = -ENOMEM; + goto out; + } + + cifs_fscache_set_inode_cookie(inode, file); + + if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { + /* time to set mode which we can not set earlier due to + problems creating new read-only files */ + struct cifs_unix_set_info_args args = { + .mode = inode->i_mode, + .uid = NO_CHANGE_64, + .gid = NO_CHANGE_64, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid, + pCifsFile->pid); + } + +out: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +/* Try to reacquire byte range locks that were released when session */ +/* to server was lost */ +static int cifs_relock_file(struct cifsFileInfo *cifsFile) +{ + int rc = 0; + +/* BB list all locks open on this file and relock */ + + return rc; +} + +static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) +{ + int rc = -EACCES; + int xid; + __u32 oplock; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsInodeInfo *pCifsInode; + struct inode *inode; + char *full_path = NULL; + int desiredAccess; + int disposition = FILE_OPEN; + __u16 netfid; + + xid = GetXid(); + mutex_lock(&pCifsFile->fh_mutex); + if (!pCifsFile->invalidHandle) { + mutex_unlock(&pCifsFile->fh_mutex); + rc = 0; + FreeXid(xid); + return rc; + } + + inode = pCifsFile->dentry->d_inode; + cifs_sb = CIFS_SB(inode->i_sb); + tcon = tlink_tcon(pCifsFile->tlink); + +/* can not grab rename sem here because various ops, including + those that already have the rename sem can end up causing writepage + to get called and if the server was down that means we end up here, + and we can never tell if the caller already has the rename_sem */ + full_path = build_path_from_dentry(pCifsFile->dentry); + if (full_path == NULL) { + rc = -ENOMEM; + mutex_unlock(&pCifsFile->fh_mutex); + FreeXid(xid); + return rc; + } + + cFYI(1, "inode = 0x%p file flags 0x%x for %s", + inode, pCifsFile->f_flags, full_path); + + if (oplockEnabled) + oplock = REQ_OPLOCK; + else + oplock = 0; + + if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + + /* + * O_CREAT, O_EXCL and O_TRUNC already had their effect on the + * original open. Must mask them off for a reopen. 
+ */ + unsigned int oflags = pCifsFile->f_flags & + ~(O_CREAT | O_EXCL | O_TRUNC); + + rc = cifs_posix_open(full_path, NULL, inode->i_sb, + cifs_sb->mnt_file_mode /* ignored */, + oflags, &oplock, &netfid, xid); + if (rc == 0) { + cFYI(1, "posix reopen succeeded"); + goto reopen_success; + } + /* fallthrough to retry open the old way on errors, especially + in the reconnect path it is important to retry hard */ + } + + desiredAccess = cifs_convert_flags(pCifsFile->f_flags); + + /* Can not refresh inode by passing in file_info buf to be returned + by SMBOpen and then calling get_inode_info with returned buf + since file might have write behind data that needs to be flushed + and server version of file size can be stale. If we knew for sure + that inode was not dirty locally we could do this */ + + rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + mutex_unlock(&pCifsFile->fh_mutex); + cFYI(1, "cifs_open returned 0x%x", rc); + cFYI(1, "oplock: %d", oplock); + goto reopen_error_exit; + } + +reopen_success: + pCifsFile->netfid = netfid; + pCifsFile->invalidHandle = false; + mutex_unlock(&pCifsFile->fh_mutex); + pCifsInode = CIFS_I(inode); + + if (can_flush) { + rc = filemap_write_and_wait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, + full_path, inode->i_sb, xid); + else + rc = cifs_get_inode_info(&inode, + full_path, NULL, inode->i_sb, + xid, NULL); + } /* else we are writing out data to server already + and could deadlock if we tried to flush data, and + since we do not know if we have data that would + invalidate the current end of file on the server + we can not go to the server to get the new inod + info */ + + cifs_set_oplock_level(pCifsInode, oplock); + + cifs_relock_file(pCifsFile); + +reopen_error_exit: + kfree(full_path); + FreeXid(xid); + return rc; +} + +int cifs_close(struct inode *inode, struct file *file) +{ + if (file->private_data != NULL) { + cifsFileInfo_put(file->private_data); + file->private_data = NULL; + } + + /* return code from the ->release op is always ignored */ + return 0; +} + +int cifs_closedir(struct inode *inode, struct file *file) +{ + int rc = 0; + int xid; + struct cifsFileInfo *pCFileStruct = file->private_data; + char *ptmp; + + cFYI(1, "Closedir inode = 0x%p", inode); + + xid = GetXid(); + + if (pCFileStruct) { + struct cifs_tcon *pTcon = tlink_tcon(pCFileStruct->tlink); + + cFYI(1, "Freeing private data in close dir"); + spin_lock(&cifs_file_list_lock); + if (!pCFileStruct->srch_inf.endOfSearch && + !pCFileStruct->invalidHandle) { + pCFileStruct->invalidHandle = true; + spin_unlock(&cifs_file_list_lock); + rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); + cFYI(1, "Closing uncompleted readdir with rc %d", + rc); + /* not much we can do if it fails anyway, ignore rc */ + rc = 0; + } else + spin_unlock(&cifs_file_list_lock); + ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; + if (ptmp) { + cFYI(1, "closedir free smb buf in srch struct"); + pCFileStruct->srch_inf.ntwrk_buf_start = NULL; + if (pCFileStruct->srch_inf.smallBuf) + cifs_small_buf_release(ptmp); + else + cifs_buf_release(ptmp); + } + cifs_put_tlink(pCFileStruct->tlink); + kfree(file->private_data); + file->private_data = NULL; + } + /* BB can we lock the filestruct while this is going on? 
*/ + FreeXid(xid); + return rc; +} + +static int store_file_lock(struct cifsFileInfo *fid, __u64 len, + __u64 offset, __u8 lockType) +{ + struct cifsLockInfo *li = + kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); + if (li == NULL) + return -ENOMEM; + li->offset = offset; + li->length = len; + li->type = lockType; + mutex_lock(&fid->lock_mutex); + list_add(&li->llist, &fid->llist); + mutex_unlock(&fid->lock_mutex); + return 0; +} + +int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) +{ + int rc, xid; + __u32 numLock = 0; + __u32 numUnlock = 0; + __u64 length; + bool wait_flag = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + __u16 netfid; + __u8 lockType = LOCKING_ANDX_LARGE_FILES; + bool posix_locking = 0; + + length = 1 + pfLock->fl_end - pfLock->fl_start; + rc = -EACCES; + xid = GetXid(); + + cFYI(1, "Lock parm: 0x%x flockflags: " + "0x%x flocktype: 0x%x start: %lld end: %lld", + cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, + pfLock->fl_end); + + if (pfLock->fl_flags & FL_POSIX) + cFYI(1, "Posix"); + if (pfLock->fl_flags & FL_FLOCK) + cFYI(1, "Flock"); + if (pfLock->fl_flags & FL_SLEEP) { + cFYI(1, "Blocking lock"); + wait_flag = true; + } + if (pfLock->fl_flags & FL_ACCESS) + cFYI(1, "Process suspended by mandatory locking - " + "not implemented yet"); + if (pfLock->fl_flags & FL_LEASE) + cFYI(1, "Lease on file - not implemented yet"); + if (pfLock->fl_flags & + (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) + cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); + + if (pfLock->fl_type == F_WRLCK) { + cFYI(1, "F_WRLCK "); + numLock = 1; + } else if (pfLock->fl_type == F_UNLCK) { + cFYI(1, "F_UNLCK"); + numUnlock = 1; + /* Check if unlock includes more than + one lock range */ + } else if (pfLock->fl_type == F_RDLCK) { + cFYI(1, "F_RDLCK"); + lockType |= LOCKING_ANDX_SHARED_LOCK; + numLock = 1; + } else if (pfLock->fl_type == F_EXLCK) { + cFYI(1, "F_EXLCK"); + numLock = 1; + } else if (pfLock->fl_type == F_SHLCK) { + cFYI(1, "F_SHLCK"); + lockType |= LOCKING_ANDX_SHARED_LOCK; + numLock = 1; + } else + cFYI(1, "Unknown type of lock"); + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); + netfid = ((struct cifsFileInfo *)file->private_data)->netfid; + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_locking = 1; + /* BB add code here to normalize offset and length to + account for negative length which we can not accept over the + wire */ + if (IS_GETLK(cmd)) { + if (posix_locking) { + int posix_lock_type; + if (lockType & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, + length, pfLock, posix_lock_type, + wait_flag); + FreeXid(xid); + return rc; + } + + /* BB we could chain these into one lock request BB */ + rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, + 0, 1, lockType, 0 /* wait flag */, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, length, + pfLock->fl_start, 1 /* numUnlock */ , + 0 /* numLock */ , lockType, + 0 /* wait flag */, 0); + pfLock->fl_type = F_UNLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + rc = 0; + + } else { + /* if rc == ERR_SHARING_VIOLATION ? 
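+ (the probe lock above failing means another process holds a + conflicting lock; for a shared request just report F_WRLCK, for an + exclusive request probe again below with a shared lock to tell + F_RDLCK holders apart from F_WRLCK holders)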
*/ + rc = 0; + + if (lockType & LOCKING_ANDX_SHARED_LOCK) { + pfLock->fl_type = F_WRLCK; + } else { + rc = CIFSSMBLock(xid, tcon, netfid, length, + pfLock->fl_start, 0, 1, + lockType | LOCKING_ANDX_SHARED_LOCK, + 0 /* wait flag */, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, + length, pfLock->fl_start, 1, 0, + lockType | + LOCKING_ANDX_SHARED_LOCK, + 0 /* wait flag */, 0); + pfLock->fl_type = F_RDLCK; + if (rc != 0) + cERROR(1, "Error unlocking " + "previously locked range %d " + "during test of lock", rc); + rc = 0; + } else { + pfLock->fl_type = F_WRLCK; + rc = 0; + } + } + } + + FreeXid(xid); + return rc; + } + + if (!numLock && !numUnlock) { + /* if no lock or unlock then nothing + to do since we do not know what it is */ + FreeXid(xid); + return -EOPNOTSUPP; + } + + if (posix_locking) { + int posix_lock_type; + if (lockType & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + + if (numUnlock == 1) + posix_lock_type = CIFS_UNLCK; + + rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, + length, pfLock, posix_lock_type, + wait_flag); + } else { + struct cifsFileInfo *fid = file->private_data; + + if (numLock) { + rc = CIFSSMBLock(xid, tcon, netfid, length, + pfLock->fl_start, 0, numLock, lockType, + wait_flag, 0); + + if (rc == 0) { + /* For Windows locks we must store them. */ + rc = store_file_lock(fid, length, + pfLock->fl_start, lockType); + } + } else if (numUnlock) { + /* For each stored lock that this unlock overlaps + completely, unlock it. */ + int stored_rc = 0; + struct cifsLockInfo *li, *tmp; + + rc = 0; + mutex_lock(&fid->lock_mutex); + list_for_each_entry_safe(li, tmp, &fid->llist, llist) { + if (pfLock->fl_start <= li->offset && + (pfLock->fl_start + length) >= + (li->offset + li->length)) { + stored_rc = CIFSSMBLock(xid, tcon, + netfid, li->length, + li->offset, 1, 0, + li->type, false, 0); + if (stored_rc) + rc = stored_rc; + else { + list_del(&li->llist); + kfree(li); + } + } + } + mutex_unlock(&fid->lock_mutex); + } + } + + if (pfLock->fl_flags & FL_POSIX) + posix_lock_file_wait(file, pfLock); + FreeXid(xid); + return rc; +} + +/* update the file size (if needed) after a write */ +void +cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, + unsigned int bytes_written) +{ + loff_t end_of_write = offset + bytes_written; + + if (end_of_write > cifsi->server_eof) + cifsi->server_eof = end_of_write; +} + +static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid, + const char *write_data, size_t write_size, + loff_t *poffset) +{ + int rc = 0; + unsigned int bytes_written = 0; + unsigned int total_written; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *pTcon; + int xid; + struct dentry *dentry = open_file->dentry; + struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); + struct cifs_io_parms io_parms; + + cifs_sb = CIFS_SB(dentry->d_sb); + + cFYI(1, "write %zd bytes to offset %lld of %s", write_size, + *poffset, dentry->d_name.name); + + pTcon = tlink_tcon(open_file->tlink); + + xid = GetXid(); + + for (total_written = 0; write_size > total_written; + total_written += bytes_written) { + rc = -EAGAIN; + while (rc == -EAGAIN) { + struct kvec iov[2]; + unsigned int len; + + if (open_file->invalidHandle) { + /* we could deadlock if we called + filemap_fdatawait from here so tell + reopen_file not to flush data to + server now */ + rc = cifs_reopen_file(open_file, false); + if (rc != 0) + break; + } + + len = min((size_t)cifs_sb->wsize, + write_size - total_written); + /* iov[0] is reserved 
for smb header */ + iov[1].iov_base = (char *)write_data + total_written; + iov[1].iov_len = len; + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = *poffset; + io_parms.length = len; + rc = CIFSSMBWrite2(xid, &io_parms, &bytes_written, iov, + 1, 0); + } + if (rc || (bytes_written == 0)) { + if (total_written) + break; + else { + FreeXid(xid); + return rc; + } + } else { + cifs_update_eof(cifsi, *poffset, bytes_written); + *poffset += bytes_written; + } + } + + cifs_stats_bytes_written(pTcon, total_written); + + if (total_written > 0) { + spin_lock(&dentry->d_inode->i_lock); + if (*poffset > dentry->d_inode->i_size) + i_size_write(dentry->d_inode, *poffset); + spin_unlock(&dentry->d_inode->i_lock); + } + mark_inode_dirty_sync(dentry->d_inode); + FreeXid(xid); + return total_written; +} + +struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, + bool fsuid_only) +{ + struct cifsFileInfo *open_file = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + + /* only filter by fsuid on multiuser mounts */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + fsuid_only = false; + + spin_lock(&cifs_file_list_lock); + /* we could simply get the first_list_entry since write-only entries + are always at the end of the list but since the first entry might + have a close pending, we go through the whole list */ + list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { + if (fsuid_only && open_file->uid != current_fsuid()) + continue; + if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { + if (!open_file->invalidHandle) { + /* found a good file */ + /* lock it so it will not be closed on us */ + cifsFileInfo_get(open_file); + spin_unlock(&cifs_file_list_lock); + return open_file; + } /* else might as well continue, and look for + another, or simply have the caller reopen it + again rather than trying to fix this handle */ + } else /* write only file */ + break; /* write only files are last so must be done */ + } + spin_unlock(&cifs_file_list_lock); + return NULL; +} + +struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, + bool fsuid_only) +{ + struct cifsFileInfo *open_file, *inv_file = NULL; + struct cifs_sb_info *cifs_sb; + bool any_available = false; + int rc; + unsigned int refind = 0; + + /* Having a null inode here (because mapping->host was set to zero by + the VFS or MM) should not happen but we had reports of on oops (due to + it being zero) during stress testcases so we need to check for it */ + + if (cifs_inode == NULL) { + cERROR(1, "Null inode passed to cifs_writeable_file"); + dump_stack(); + return NULL; + } + + cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + + /* only filter by fsuid on multiuser mounts */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + fsuid_only = false; + + spin_lock(&cifs_file_list_lock); +refind_writable: + if (refind > MAX_REOPEN_ATT) { + spin_unlock(&cifs_file_list_lock); + return NULL; + } + list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { + if (!any_available && open_file->pid != current->tgid) + continue; + if (fsuid_only && open_file->uid != current_fsuid()) + continue; + if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { + if (!open_file->invalidHandle) { + /* found a good writable file */ + cifsFileInfo_get(open_file); + spin_unlock(&cifs_file_list_lock); + return open_file; + } else { + if (!inv_file) + inv_file = open_file; + } + } + } + /* couldn't find useable FH with same pid, try any 
available */ + if (!any_available) { + any_available = true; + goto refind_writable; + } + + if (inv_file) { + any_available = false; + cifsFileInfo_get(inv_file); + } + + spin_unlock(&cifs_file_list_lock); + + if (inv_file) { + rc = cifs_reopen_file(inv_file, false); + if (!rc) + return inv_file; + else { + spin_lock(&cifs_file_list_lock); + list_move_tail(&inv_file->flist, + &cifs_inode->openFileList); + spin_unlock(&cifs_file_list_lock); + cifsFileInfo_put(inv_file); + spin_lock(&cifs_file_list_lock); + ++refind; + goto refind_writable; + } + } + + return NULL; +} + +static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) +{ + struct address_space *mapping = page->mapping; + loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + char *write_data; + int rc = -EFAULT; + int bytes_written = 0; + struct inode *inode; + struct cifsFileInfo *open_file; + + if (!mapping || !mapping->host) + return -EFAULT; + + inode = page->mapping->host; + + offset += (loff_t)from; + write_data = kmap(page); + write_data += from; + + if ((to > PAGE_CACHE_SIZE) || (from > to)) { + kunmap(page); + return -EIO; + } + + /* racing with truncate? */ + if (offset > mapping->host->i_size) { + kunmap(page); + return 0; /* don't care */ + } + + /* check to make sure that we are not extending the file */ + if (mapping->host->i_size - offset < (loff_t)to) + to = (unsigned)(mapping->host->i_size - offset); + + open_file = find_writable_file(CIFS_I(mapping->host), false); + if (open_file) { + bytes_written = cifs_write(open_file, open_file->pid, + write_data, to - from, &offset); + cifsFileInfo_put(open_file); + /* Does mm or vfs already set times? */ + inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); + if ((bytes_written > 0) && (offset)) + rc = 0; + else if (bytes_written < 0) + rc = bytes_written; + } else { + cFYI(1, "No writeable filehandles for inode"); + rc = -EIO; + } + + kunmap(page); + return rc; +} + +static int cifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + bool done = false, scanned = false, range_whole = false; + pgoff_t end, index; + struct cifs_writedata *wdata; + struct page *page; + int rc = 0; + + /* + * If wsize is smaller than the page cache size, default to writing + * one page at a time via cifs_writepage + */ + if (cifs_sb->wsize < PAGE_CACHE_SIZE) + return generic_writepages(mapping, wbc); + + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = true; + scanned = true; + } +retry: + while (!done && index <= end) { + unsigned int i, nr_pages, found_pages; + pgoff_t next = 0, tofind; + struct page **pages; + + tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, + end - index) + 1; + + wdata = cifs_writedata_alloc((unsigned int)tofind); + if (!wdata) { + rc = -ENOMEM; + break; + } + + /* + * find_get_pages_tag seems to return a max of 256 on each + * iteration, so we must call it several times in order to + * fill the array or the wsize is effectively limited to + * 256 * PAGE_CACHE_SIZE. 
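+ * (with 4KB pages that cap would be 1MB; looping also copes with a + * call that simply returns fewer pages than were asked for)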
+ */ + found_pages = 0; + pages = wdata->pages; + do { + nr_pages = find_get_pages_tag(mapping, &index, + PAGECACHE_TAG_DIRTY, + tofind, pages); + found_pages += nr_pages; + tofind -= nr_pages; + pages += nr_pages; + } while (nr_pages && tofind && index <= end); + + if (found_pages == 0) { + kref_put(&wdata->refcount, cifs_writedata_release); + break; + } + + nr_pages = 0; + for (i = 0; i < found_pages; i++) { + page = wdata->pages[i]; + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + if (nr_pages == 0) + lock_page(page); + else if (!trylock_page(page)) + break; + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + break; + } + + if (!wbc->range_cyclic && page->index > end) { + done = true; + unlock_page(page); + break; + } + + if (next && (page->index != next)) { + /* Not next consecutive page */ + unlock_page(page); + break; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + break; + } + + /* + * This actually clears the dirty bit in the radix tree. + * See cifs_writepage() for more commentary. + */ + set_page_writeback(page); + + if (page_offset(page) >= mapping->host->i_size) { + done = true; + unlock_page(page); + end_page_writeback(page); + break; + } + + wdata->pages[i] = page; + next = page->index + 1; + ++nr_pages; + } + + /* reset index to refind any pages skipped */ + if (nr_pages == 0) + index = wdata->pages[0]->index + 1; + + /* put any pages we aren't going to use */ + for (i = nr_pages; i < found_pages; i++) { + page_cache_release(wdata->pages[i]); + wdata->pages[i] = NULL; + } + + /* nothing to write? 
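+ (every page gathered above was skipped - already under writeback, no + longer dirty, truncated or not contiguous - so release wdata and move + on to the next batch)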
*/ + if (nr_pages == 0) { + kref_put(&wdata->refcount, cifs_writedata_release); + continue; + } + + wdata->sync_mode = wbc->sync_mode; + wdata->nr_pages = nr_pages; + wdata->offset = page_offset(wdata->pages[0]); + + do { + if (wdata->cfile != NULL) + cifsFileInfo_put(wdata->cfile); + wdata->cfile = find_writable_file(CIFS_I(mapping->host), + false); + if (!wdata->cfile) { + cERROR(1, "No writable handles for inode"); + rc = -EBADF; + break; + } + rc = cifs_async_writev(wdata); + } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); + + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); + + /* send failure -- clean up the mess */ + if (rc != 0) { + for (i = 0; i < nr_pages; ++i) { + if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, + wdata->pages[i]); + else + SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + page_cache_release(wdata->pages[i]); + } + if (rc != -EAGAIN) + mapping_set_error(mapping, rc); + } + kref_put(&wdata->refcount, cifs_writedata_release); + + wbc->nr_to_write -= nr_pages; + if (wbc->nr_to_write <= 0) + done = true; + + index = next; + } + + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = true; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + + return rc; +} + +static int +cifs_writepage_locked(struct page *page, struct writeback_control *wbc) +{ + int rc; + int xid; + + xid = GetXid(); +/* BB add check for wbc flags */ + page_cache_get(page); + if (!PageUptodate(page)) + cFYI(1, "ppw - page not up to date"); + + /* + * Set the "writeback" flag, and clear "dirty" in the radix tree. + * + * A writepage() implementation always needs to do either this, + * or re-dirty the page with "redirty_page_for_writepage()" in + * the case of a failure. + * + * Just unlocking the page will cause the radix tree tag-bits + * to fail to update with the state of the page correctly. 
+ */ + set_page_writeback(page); +retry_write: + rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE); + if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL) + goto retry_write; + else if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, page); + else if (rc != 0) + SetPageError(page); + else + SetPageUptodate(page); + end_page_writeback(page); + page_cache_release(page); + FreeXid(xid); + return rc; +} + +static int cifs_writepage(struct page *page, struct writeback_control *wbc) +{ + int rc = cifs_writepage_locked(page, wbc); + unlock_page(page); + return rc; +} + +static int cifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int rc; + struct inode *inode = mapping->host; + struct cifsFileInfo *cfile = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + __u32 pid; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = cfile->pid; + else + pid = current->tgid; + + cFYI(1, "write_end for page %p from pos %lld with %d bytes", + page, pos, copied); + + if (PageChecked(page)) { + if (copied == len) + SetPageUptodate(page); + ClearPageChecked(page); + } else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) + SetPageUptodate(page); + + if (!PageUptodate(page)) { + char *page_data; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int xid; + + xid = GetXid(); + /* this is probably better than directly calling + partialpage_write since in this function the file handle is + known which we might as well leverage */ + /* BB check if anything else missing out of ppw + such as updating last write time */ + page_data = kmap(page); + rc = cifs_write(cfile, pid, page_data + offset, copied, &pos); + /* if (rc < 0) should we set writebehind rc? */ + kunmap(page); + + FreeXid(xid); + } else { + rc = copied; + pos += copied; + set_page_dirty(page); + } + + if (rc > 0) { + spin_lock(&inode->i_lock); + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + unlock_page(page); + page_cache_release(page); + + return rc; +} + +int cifs_strict_fsync(struct file *file, int datasync) +{ + int xid; + int rc = 0; + struct cifs_tcon *tcon; + struct cifsFileInfo *smbfile = file->private_data; + struct inode *inode = file->f_path.dentry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + xid = GetXid(); + + cFYI(1, "Sync file - name: %s datasync: 0x%x", + file->f_path.dentry->d_name.name, datasync); + + if (!CIFS_I(inode)->clientCanCacheRead) { + rc = cifs_invalidate_mapping(inode); + if (rc) { + cFYI(1, "rc: %d during invalidate phase", rc); + rc = 0; /* don't care about it in fsync */ + } + } + + tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); + + FreeXid(xid); + return rc; +} + +int cifs_fsync(struct file *file, int datasync) +{ + int xid; + int rc = 0; + struct cifs_tcon *tcon; + struct cifsFileInfo *smbfile = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + + xid = GetXid(); + + cFYI(1, "Sync file - name: %s datasync: 0x%x", + file->f_path.dentry->d_name.name, datasync); + + tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); + + FreeXid(xid); + return rc; +} + +/* + * As file closes, flush all cached write data for this inode checking + * for write behind errors. 
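+ * For files opened for write, filemap_write_and_wait() below returns any + * write-behind error so that it can be reported at close time.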
+ */ +int cifs_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file->f_path.dentry->d_inode; + int rc = 0; + + if (file->f_mode & FMODE_WRITE) + rc = filemap_write_and_wait(inode->i_mapping); + + cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc); + + return rc; +} + +static int +cifs_write_allocate_pages(struct page **pages, unsigned long num_pages) +{ + int rc = 0; + unsigned long i; + + for (i = 0; i < num_pages; i++) { + pages[i] = alloc_page(__GFP_HIGHMEM); + if (!pages[i]) { + /* + * save number of pages we have already allocated and + * return with ENOMEM error + */ + num_pages = i; + rc = -ENOMEM; + goto error; + } + } + + return rc; + +error: + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + return rc; +} + +static inline +size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) +{ + size_t num_pages; + size_t clen; + + clen = min_t(const size_t, len, wsize); + num_pages = clen / PAGE_CACHE_SIZE; + if (clen % PAGE_CACHE_SIZE) + num_pages++; + + if (cur_len) + *cur_len = clen; + + return num_pages; +} + +static ssize_t +cifs_iovec_write(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *poffset) +{ + unsigned int written; + unsigned long num_pages, npages, i; + size_t copied, len, cur_len; + ssize_t total_written = 0; + struct kvec *to_send; + struct page **pages; + struct iov_iter it; + struct inode *inode; + struct cifsFileInfo *open_file; + struct cifs_tcon *pTcon; + struct cifs_sb_info *cifs_sb; + struct cifs_io_parms io_parms; + int xid, rc; + __u32 pid; + + len = iov_length(iov, nr_segs); + if (!len) + return 0; + + rc = generic_write_checks(file, poffset, &len, 0); + if (rc) + return rc; + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + num_pages = get_numpages(cifs_sb->wsize, len, &cur_len); + + pages = kmalloc(sizeof(struct page *)*num_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL); + if (!to_send) { + kfree(pages); + return -ENOMEM; + } + + rc = cifs_write_allocate_pages(pages, num_pages); + if (rc) { + kfree(pages); + kfree(to_send); + return rc; + } + + xid = GetXid(); + open_file = file->private_data; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + pTcon = tlink_tcon(open_file->tlink); + inode = file->f_path.dentry->d_inode; + + iov_iter_init(&it, iov, nr_segs, len, 0); + npages = num_pages; + + do { + size_t save_len = cur_len; + for (i = 0; i < npages; i++) { + copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE); + copied = iov_iter_copy_from_user(pages[i], &it, 0, + copied); + cur_len -= copied; + iov_iter_advance(&it, copied); + to_send[i+1].iov_base = kmap(pages[i]); + to_send[i+1].iov_len = copied; + } + + cur_len = save_len - cur_len; + + do { + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, false); + if (rc != 0) + break; + } + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = *poffset; + io_parms.length = cur_len; + rc = CIFSSMBWrite2(xid, &io_parms, &written, to_send, + npages, 0); + } while (rc == -EAGAIN); + + for (i = 0; i < npages; i++) + kunmap(pages[i]); + + if (written) { + len -= written; + total_written += written; + cifs_update_eof(CIFS_I(inode), *poffset, written); + *poffset += written; + } else if (rc < 0) { + if (!total_written) + total_written = rc; + break; + } + + /* get length and number of kvecs of the next write */ + npages = 
get_numpages(cifs_sb->wsize, len, &cur_len); + } while (len > 0); + + if (total_written > 0) { + spin_lock(&inode->i_lock); + if (*poffset > inode->i_size) + i_size_write(inode, *poffset); + spin_unlock(&inode->i_lock); + } + + cifs_stats_bytes_written(pTcon, total_written); + mark_inode_dirty_sync(inode); + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + kfree(to_send); + kfree(pages); + FreeXid(xid); + return total_written; +} + +ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t written; + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + /* + * BB - optimize the way when signing is disabled. We can drop this + * extra memory-to-memory copying and use iovec buffers for constructing + * the write request. + */ + + written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos); + if (written > 0) { + CIFS_I(inode)->invalid_mapping = true; + iocb->ki_pos = pos; + } + + return written; +} + +ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + if (CIFS_I(inode)->clientCanCacheAll) + return generic_file_aio_write(iocb, iov, nr_segs, pos); + + /* + * In strict cache mode we need to write the data to the server exactly + * from the pos to pos+len-1 rather than flush all affected pages + * because it may cause an error with mandatory locks on these pages but + * not on the region from pos to pos+len-1. + */ + + return cifs_user_writev(iocb, iov, nr_segs, pos); +} + +static ssize_t +cifs_iovec_read(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *poffset) +{ + int rc; + int xid; + ssize_t total_read; + unsigned int bytes_read = 0; + size_t len, cur_len; + int iov_offset = 0; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *pTcon; + struct cifsFileInfo *open_file; + struct smb_com_read_rsp *pSMBr; + struct cifs_io_parms io_parms; + char *read_data; + __u32 pid; + + if (!nr_segs) + return 0; + + len = iov_length(iov, nr_segs); + if (!len) + return 0; + + xid = GetXid(); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + + open_file = file->private_data; + pTcon = tlink_tcon(open_file->tlink); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cFYI(1, "attempting read on write only file instance"); + + for (total_read = 0; total_read < len; total_read += bytes_read) { + cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + rc = -EAGAIN; + read_data = NULL; + + while (rc == -EAGAIN) { + int buf_type = CIFS_NO_BUFFER; + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc != 0) + break; + } + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = *poffset; + io_parms.length = cur_len; + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, + &read_data, &buf_type); + pSMBr = (struct smb_com_read_rsp *)read_data; + if (read_data) { + char *data_offset = read_data + 4 + + le16_to_cpu(pSMBr->DataOffset); + if (memcpy_toiovecend(iov, data_offset, + iov_offset, bytes_read)) + rc = -EFAULT; + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(read_data); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(read_data); + read_data = NULL; + iov_offset += bytes_read; + } + } + + if (rc || (bytes_read == 0)) { + if (total_read) { + 
break; + } else { + FreeXid(xid); + return rc; + } + } else { + cifs_stats_bytes_read(pTcon, bytes_read); + *poffset += bytes_read; + } + } + + FreeXid(xid); + return total_read; +} + +ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t read; + + read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); + if (read > 0) + iocb->ki_pos = pos; + + return read; +} + +ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + if (CIFS_I(inode)->clientCanCacheRead) + return generic_file_aio_read(iocb, iov, nr_segs, pos); + + /* + * In strict cache mode we need to read from the server all the time + * if we don't have level II oplock because the server can delay mtime + * change - so we can't make a decision about inode invalidating. + * And we can also fail with page reading if there are mandatory locks + * on pages affected by this read but not on the region from pos to + * pos+len-1. + */ + + return cifs_user_readv(iocb, iov, nr_segs, pos); +} + +static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, + loff_t *poffset) +{ + int rc = -EACCES; + unsigned int bytes_read = 0; + unsigned int total_read; + unsigned int current_read_size; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *pTcon; + int xid; + char *current_offset; + struct cifsFileInfo *open_file; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + __u32 pid; + + xid = GetXid(); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + + if (file->private_data == NULL) { + rc = -EBADF; + FreeXid(xid); + return rc; + } + open_file = file->private_data; + pTcon = tlink_tcon(open_file->tlink); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cFYI(1, "attempting read on write only file instance"); + + for (total_read = 0, current_offset = read_data; + read_size > total_read; + total_read += bytes_read, current_offset += bytes_read) { + current_read_size = min_t(const int, read_size - total_read, + cifs_sb->rsize); + /* For Windows ME and 9x we do not want to request more + than it negotiated since it will refuse the read then */ + if ((pTcon->ses) && + !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { + current_read_size = min_t(const int, current_read_size, + pTcon->ses->server->maxBuf - 128); + } + rc = -EAGAIN; + while (rc == -EAGAIN) { + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc != 0) + break; + } + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = *poffset; + io_parms.length = current_read_size; + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, + &current_offset, &buf_type); + } + if (rc || (bytes_read == 0)) { + if (total_read) { + break; + } else { + FreeXid(xid); + return rc; + } + } else { + cifs_stats_bytes_read(pTcon, total_read); + *poffset += bytes_read; + } + } + FreeXid(xid); + return total_read; +} + +/* + * If the page is mmap'ed into a process' page tables, then we need to make + * sure that it doesn't change while being written back. 
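+ * cifs_page_mkwrite() below takes the page lock and returns VM_FAULT_LOCKED, + * i.e. the page is handed back to the fault handler still locked.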
+ */ +static int +cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + + lock_page(page); + return VM_FAULT_LOCKED; +} + +static struct vm_operations_struct cifs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = cifs_page_mkwrite, +}; + +int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc, xid; + struct inode *inode = file->f_path.dentry->d_inode; + + xid = GetXid(); + + if (!CIFS_I(inode)->clientCanCacheRead) { + rc = cifs_invalidate_mapping(inode); + if (rc) + return rc; + } + + rc = generic_file_mmap(file, vma); + if (rc == 0) + vma->vm_ops = &cifs_file_vm_ops; + FreeXid(xid); + return rc; +} + +int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc, xid; + + xid = GetXid(); + rc = cifs_revalidate_file(file); + if (rc) { + cFYI(1, "Validation prior to mmap failed, error=%d", rc); + FreeXid(xid); + return rc; + } + rc = generic_file_mmap(file, vma); + if (rc == 0) + vma->vm_ops = &cifs_file_vm_ops; + FreeXid(xid); + return rc; +} + + +static void cifs_copy_cache_pages(struct address_space *mapping, + struct list_head *pages, int bytes_read, char *data) +{ + struct page *page; + char *target; + + while (bytes_read > 0) { + if (list_empty(pages)) + break; + + page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + + if (add_to_page_cache_lru(page, mapping, page->index, + GFP_KERNEL)) { + page_cache_release(page); + cFYI(1, "Add page cache failed"); + data += PAGE_CACHE_SIZE; + bytes_read -= PAGE_CACHE_SIZE; + continue; + } + page_cache_release(page); + + target = kmap_atomic(page, KM_USER0); + + if (PAGE_CACHE_SIZE > bytes_read) { + memcpy(target, data, bytes_read); + /* zero the tail end of this partial page */ + memset(target + bytes_read, 0, + PAGE_CACHE_SIZE - bytes_read); + bytes_read = 0; + } else { + memcpy(target, data, PAGE_CACHE_SIZE); + bytes_read -= PAGE_CACHE_SIZE; + } + kunmap_atomic(target, KM_USER0); + + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + data += PAGE_CACHE_SIZE; + + /* add page to FS-Cache */ + cifs_readpage_to_fscache(mapping->host, page); + } + return; +} + +static int cifs_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned num_pages) +{ + int rc = -EACCES; + int xid; + loff_t offset; + struct page *page; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *pTcon; + unsigned int bytes_read = 0; + unsigned int read_size, i; + char *smb_read_data = NULL; + struct smb_com_read_rsp *pSMBr; + struct cifsFileInfo *open_file; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + __u32 pid; + + xid = GetXid(); + if (file->private_data == NULL) { + rc = -EBADF; + FreeXid(xid); + return rc; + } + open_file = file->private_data; + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + pTcon = tlink_tcon(open_file->tlink); + + /* + * Reads as many pages as possible from fscache. 
Returns -ENOBUFS + * immediately if the cookie is negative + */ + rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, + &num_pages); + if (rc == 0) + goto read_complete; + + cFYI(DBG2, "rpages: num pages %d", num_pages); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = open_file->pid; + else + pid = current->tgid; + + for (i = 0; i < num_pages; ) { + unsigned contig_pages; + struct page *tmp_page; + unsigned long expected_index; + + if (list_empty(page_list)) + break; + + page = list_entry(page_list->prev, struct page, lru); + offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + + /* count adjacent pages that we will read into */ + contig_pages = 0; + expected_index = + list_entry(page_list->prev, struct page, lru)->index; + list_for_each_entry_reverse(tmp_page, page_list, lru) { + if (tmp_page->index == expected_index) { + contig_pages++; + expected_index++; + } else + break; + } + if (contig_pages + i > num_pages) + contig_pages = num_pages - i; + + /* for reads over a certain size could initiate async + read ahead */ + + read_size = contig_pages * PAGE_CACHE_SIZE; + /* Read size needs to be in multiples of one page */ + read_size = min_t(const unsigned int, read_size, + cifs_sb->rsize & PAGE_CACHE_MASK); + cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", + read_size, contig_pages); + rc = -EAGAIN; + while (rc == -EAGAIN) { + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc != 0) + break; + } + io_parms.netfid = open_file->netfid; + io_parms.pid = pid; + io_parms.tcon = pTcon; + io_parms.offset = offset; + io_parms.length = read_size; + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, + &smb_read_data, &buf_type); + /* BB more RC checks ? */ + if (rc == -EAGAIN) { + if (smb_read_data) { + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(smb_read_data); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(smb_read_data); + smb_read_data = NULL; + } + } + } + if ((rc < 0) || (smb_read_data == NULL)) { + cFYI(1, "Read error in readpages: %d", rc); + break; + } else if (bytes_read > 0) { + task_io_account_read(bytes_read); + pSMBr = (struct smb_com_read_rsp *)smb_read_data; + cifs_copy_cache_pages(mapping, page_list, bytes_read, + smb_read_data + 4 /* RFC1001 hdr */ + + le16_to_cpu(pSMBr->DataOffset)); + + i += bytes_read >> PAGE_CACHE_SHIFT; + cifs_stats_bytes_read(pTcon, bytes_read); + if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) { + i++; /* account for partial page */ + + /* server copy of file can have smaller size + than client */ + /* BB do we need to verify this common case ? + this case is ok - if we are at server EOF + we will hit it on next read */ + + /* break; */ + } + } else { + cFYI(1, "No bytes read (%d) at offset %lld . " + "Cleaning remaining pages from readahead list", + bytes_read, offset); + /* BB turn off caching and do new lookup on + file size at server? 
*/ + break; + } + if (smb_read_data) { + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(smb_read_data); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(smb_read_data); + smb_read_data = NULL; + } + bytes_read = 0; + } + +/* need to free smb_read_data buf before exit */ + if (smb_read_data) { + if (buf_type == CIFS_SMALL_BUFFER) + cifs_small_buf_release(smb_read_data); + else if (buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(smb_read_data); + smb_read_data = NULL; + } + +read_complete: + FreeXid(xid); + return rc; +} + +static int cifs_readpage_worker(struct file *file, struct page *page, + loff_t *poffset) +{ + char *read_data; + int rc; + + /* Is the page cached? */ + rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page); + if (rc == 0) + goto read_complete; + + page_cache_get(page); + read_data = kmap(page); + /* for reads over a certain size could initiate async read ahead */ + + rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset); + + if (rc < 0) + goto io_error; + else + cFYI(1, "Bytes read %d", rc); + + file->f_path.dentry->d_inode->i_atime = + current_fs_time(file->f_path.dentry->d_inode->i_sb); + + if (PAGE_CACHE_SIZE > rc) + memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc); + + flush_dcache_page(page); + SetPageUptodate(page); + + /* send this page to the cache */ + cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page); + + rc = 0; + +io_error: + kunmap(page); + page_cache_release(page); + +read_complete: + return rc; +} + +static int cifs_readpage(struct file *file, struct page *page) +{ + loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + int rc = -EACCES; + int xid; + + xid = GetXid(); + + if (file->private_data == NULL) { + rc = -EBADF; + FreeXid(xid); + return rc; + } + + cFYI(1, "readpage %p at offset %d 0x%x\n", + page, (int)offset, (int)offset); + + rc = cifs_readpage_worker(file, page, &offset); + + unlock_page(page); + + FreeXid(xid); + return rc; +} + +static int is_inode_writable(struct cifsInodeInfo *cifs_inode) +{ + struct cifsFileInfo *open_file; + + spin_lock(&cifs_file_list_lock); + list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { + if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { + spin_unlock(&cifs_file_list_lock); + return 1; + } + } + spin_unlock(&cifs_file_list_lock); + return 0; +} + +/* We do not want to update the file size from server for inodes + open for write - to avoid races with writepage extending + the file - in the future we could consider allowing + refreshing the inode only on increases in the file size + but this is tricky to do without racing with writebehind + page caching in the current Linux kernel design */ +bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) +{ + if (!cifsInode) + return true; + + if (is_inode_writable(cifsInode)) { + /* This inode is open for write at least once */ + struct cifs_sb_info *cifs_sb; + + cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { + /* since no page cache to corrupt on directio + we can change size safely */ + return true; + } + + if (i_size_read(&cifsInode->vfs_inode) < end_of_file) + return true; + + return false; + } else + return true; +} + +static int cifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + loff_t offset = pos & (PAGE_CACHE_SIZE - 1); + loff_t page_start = pos & PAGE_MASK; + loff_t 
i_size; + struct page *page; + int rc = 0; + + cFYI(1, "write_begin from %lld len %d", (long long)pos, len); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + rc = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) + goto out; + + /* + * If we write a full page it will be up to date, no need to read from + * the server. If the write is short, we'll end up doing a sync write + * instead. + */ + if (len == PAGE_CACHE_SIZE) + goto out; + + /* + * optimize away the read when we have an oplock, and we're not + * expecting to use any of the data we'd be reading in. That + * is, when the page lies beyond the EOF, or straddles the EOF + * and the write will cover all of the existing data. + */ + if (CIFS_I(mapping->host)->clientCanCacheRead) { + i_size = i_size_read(mapping->host); + if (page_start >= i_size || + (offset == 0 && (pos + len) >= i_size)) { + zero_user_segments(page, 0, offset, + offset + len, + PAGE_CACHE_SIZE); + /* + * PageChecked means that the parts of the page + * to which we're not writing are considered up + * to date. Once the data is copied to the + * page, it can be set uptodate. + */ + SetPageChecked(page); + goto out; + } + } + + if ((file->f_flags & O_ACCMODE) != O_WRONLY) { + /* + * might as well read a page, it is fast enough. If we get + * an error, we don't need to return it. cifs_write_end will + * do a sync write instead since PG_uptodate isn't set. + */ + cifs_readpage_worker(file, page, &page_start); + } else { + /* we could try using another file handle if there is one - + but how would we lock it to prevent close of that handle + racing with this read? In any case + this will be written out by write_end so is fine */ + } +out: + *pagep = page; + return rc; +} + +static int cifs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + + return cifs_fscache_release_page(page, gfp); +} + +static void cifs_invalidate_page(struct page *page, unsigned long offset) +{ + struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); + + if (offset == 0) + cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); +} + +static int cifs_launder_page(struct page *page) +{ + int rc = 0; + loff_t range_start = page_offset(page); + loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, + .range_end = range_end, + }; + + cFYI(1, "Launder page: %p", page); + + if (clear_page_dirty_for_io(page)) + rc = cifs_writepage_locked(page, &wbc); + + cifs_fscache_invalidate_page(page, page->mapping->host); + return rc; +} + +void cifs_oplock_break(struct work_struct *work) +{ + struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, + oplock_break); + struct inode *inode = cfile->dentry->d_inode; + struct cifsInodeInfo *cinode = CIFS_I(inode); + int rc = 0; + + if (inode && S_ISREG(inode->i_mode)) { + if (cinode->clientCanCacheRead) + break_lease(inode, O_RDONLY); + else + break_lease(inode, O_WRONLY); + rc = filemap_fdatawrite(inode->i_mapping); + if (cinode->clientCanCacheRead == 0) { + rc = filemap_fdatawait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + invalidate_remote_inode(inode); + } + cFYI(1, "Oplock flush inode %p rc %d", inode, rc); + } + + /* + * releasing stale oplock after recent reconnect of smb session using + * a now incorrect file handle is not a data integrity issue but do + * not bother sending an oplock release if session to server still is + * disconnected since 
oplock already released by the server + */ + if (!cfile->oplock_break_cancelled) { + rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, + 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, + cinode->clientCanCacheRead ? 1 : 0); + cFYI(1, "Oplock release rc = %d", rc); + } + + /* + * We might have kicked in before is_valid_oplock_break() + * finished grabbing reference for us. Make sure it's done by + * waiting for cifs_file_list_lock. + */ + spin_lock(&cifs_file_list_lock); + spin_unlock(&cifs_file_list_lock); + + cifs_oplock_break_put(cfile); +} + +/* must be called while holding cifs_file_list_lock */ +void cifs_oplock_break_get(struct cifsFileInfo *cfile) +{ + cifs_sb_active(cfile->dentry->d_sb); + cifsFileInfo_get(cfile); +} + +void cifs_oplock_break_put(struct cifsFileInfo *cfile) +{ + struct super_block *sb = cfile->dentry->d_sb; + + cifsFileInfo_put(cfile); + cifs_sb_deactive(sb); +} + +const struct address_space_operations cifs_addr_ops = { + .readpage = cifs_readpage, + .readpages = cifs_readpages, + .writepage = cifs_writepage, + .writepages = cifs_writepages, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, + .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, + .launder_page = cifs_launder_page, +}; + +/* + * cifs_readpages requires the server to support a buffer large enough to + * contain the header plus one complete page of data. Otherwise, we need + * to leave cifs_readpages out of the address space operations. + */ +const struct address_space_operations cifs_addr_ops_smallbuf = { + .readpage = cifs_readpage, + .writepage = cifs_writepage, + .writepages = cifs_writepages, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, + .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, + .launder_page = cifs_launder_page, +}; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c new file mode 100644 index 00000000..42e5363b --- /dev/null +++ b/fs/cifs/fscache.c @@ -0,0 +1,235 @@ +/* + * fs/cifs/fscache.c - CIFS filesystem cache interface + * + * Copyright (c) 2010 Novell, Inc. + * Author(s): Suresh Jayaraman + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) +{ + server->fscache = + fscache_acquire_cookie(cifs_fscache_netfs.primary_index, + &cifs_fscache_server_index_def, server); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); +} + +void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) +{ + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); + fscache_relinquish_cookie(server->fscache, 0); + server->fscache = NULL; +} + +void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) +{ + struct TCP_Server_Info *server = tcon->ses->server; + + tcon->fscache = + fscache_acquire_cookie(server->fscache, + &cifs_fscache_super_index_def, tcon); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache, + tcon->fscache); +} + +void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) +{ + cFYI(1, "%s: (0x%p)", __func__, tcon->fscache); + fscache_relinquish_cookie(tcon->fscache, 0); + tcon->fscache = NULL; +} + +static void cifs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + if (cifsi->fscache) + return; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { + cifsi->fscache = fscache_acquire_cookie(tcon->fscache, + &cifs_fscache_inode_object_def, cifsi); + cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__, + tcon->fscache, cifsi->fscache); + } +} + +void cifs_fscache_release_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + fscache_relinquish_cookie(cifsi->fscache, 0); + cifsi->fscache = NULL; + } +} + +static void cifs_fscache_disable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + fscache_uncache_all_inode_pages(cifsi->fscache, inode); + fscache_relinquish_cookie(cifsi->fscache, 1); + cifsi->fscache = NULL; + } +} + +void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + cifs_fscache_disable_inode_cookie(inode); + else + cifs_fscache_enable_inode_cookie(inode); +} + +void cifs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct fscache_cookie *old = cifsi->fscache; + + if (cifsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(cifsi->fscache, 1); + + cifsi->fscache = fscache_acquire_cookie( + cifs_sb_master_tcon(cifs_sb)->fscache, + &cifs_fscache_inode_object_def, + cifsi); + cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p", + __func__, cifsi->fscache, old); + } +} + +int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct inode *inode = page->mapping->host; + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, + cifsi->fscache); + if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) + return 0; + } + + return 1; 
+} + +static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, + int error) +{ + cFYI(1, "%s: (0x%p/%d)", __func__, page, error); + if (!error) + SetPageUptodate(page); + unlock_page(page); +} + +/* + * Retrieve a page from FS-Cache + */ +int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__, + CIFS_I(inode)->fscache, page, inode); + ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, + cifs_readpage_from_fscache_complete, + NULL, + GFP_KERNEL); + switch (ret) { + + case 0: /* page found in fscache, read submitted */ + cFYI(1, "%s: submitted", __func__); + return ret; + case -ENOBUFS: /* page won't be cached */ + case -ENODATA: /* page not in cache */ + cFYI(1, "%s: %d", __func__, ret); + return 1; + + default: + cERROR(1, "unknown error ret = %d", ret); + } + return ret; +} + +/* + * Retrieve a set of pages from FS-Cache + */ +int __cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + + cFYI(1, "%s: (0x%p/%u/0x%p)", __func__, + CIFS_I(inode)->fscache, *nr_pages, inode); + ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, + pages, nr_pages, + cifs_readpage_from_fscache_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + cFYI(1, "%s: submitted", __func__); + return ret; + + case -ENOBUFS: /* some pages are not cached and can't be */ + case -ENODATA: /* some pages are not cached */ + cFYI(1, "%s: no page", __func__); + return 1; + + default: + cFYI(1, "unknown error ret = %d", ret); + } + + return ret; +} + +void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__, + CIFS_I(inode)->fscache, page, inode); + ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); + if (ret != 0) + fscache_uncache_page(CIFS_I(inode)->fscache, page); +} + +void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifsi->fscache; + + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie); + fscache_wait_on_page_write(cookie, page); + fscache_uncache_page(cookie, page); +} + diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h new file mode 100644 index 00000000..63539323 --- /dev/null +++ b/fs/cifs/fscache.h @@ -0,0 +1,136 @@ +/* + * fs/cifs/fscache.h - CIFS filesystem cache interface definitions + * + * Copyright (c) 2010 Novell, Inc. + * Author(s): Suresh Jayaraman <sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _CIFS_FSCACHE_H +#define _CIFS_FSCACHE_H + +#include + +#include "cifsglob.h" + +#ifdef CONFIG_CIFS_FSCACHE + +extern struct fscache_netfs cifs_fscache_netfs; +extern const struct fscache_cookie_def cifs_fscache_server_index_def; +extern const struct fscache_cookie_def cifs_fscache_super_index_def; +extern const struct fscache_cookie_def cifs_fscache_inode_object_def; + +extern int cifs_fscache_register(void); +extern void cifs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_get_super_cookie(struct cifs_tcon *); +extern void cifs_fscache_release_super_cookie(struct cifs_tcon *); + +extern void cifs_fscache_release_inode_cookie(struct inode *); +extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void cifs_fscache_reset_inode_cookie(struct inode *); + +extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); +extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); +extern int __cifs_readpage_from_fscache(struct inode *, struct page *); +extern int __cifs_readpages_from_fscache(struct inode *, + struct address_space *, + struct list_head *, + unsigned *); + +extern void __cifs_readpage_to_fscache(struct inode *, struct page *); + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __cifs_fscache_invalidate_page(page, inode); +} + +static inline int cifs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpage_from_fscache(inode, page); + + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpages_from_fscache(inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __cifs_readpage_to_fscache(inode, page); +} + +#else /* CONFIG_CIFS_FSCACHE */ +static inline int cifs_fscache_register(void) { return 0; } +static inline void cifs_fscache_unregister(void) {} + +static inline void +cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {} +static inline void +cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {} +static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {} +static inline void +cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} + +static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* May release page */ +} + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline int +cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode 
*inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) {} + +#endif /* CONFIG_CIFS_FSCACHE */ + +#endif /* _CIFS_FSCACHE_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c new file mode 100644 index 00000000..745e5cdc --- /dev/null +++ b/fs/cifs/inode.c @@ -0,0 +1,2272 @@ +/* + * fs/cifs/inode.c + * + * Copyright (C) International Business Machines Corp., 2002,2010 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "fscache.h" + + +static void cifs_set_ops(struct inode *inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &cifs_file_inode_ops; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_direct_nobrl_ops; + else + inode->i_fop = &cifs_file_direct_ops; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_strict_nobrl_ops; + else + inode->i_fop = &cifs_file_strict_ops; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_nobrl_ops; + else { /* not direct, send byte range locks */ + inode->i_fop = &cifs_file_ops; + } + + /* check if server can support readpages */ + if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf < + PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) + inode->i_data.a_ops = &cifs_addr_ops_smallbuf; + else + inode->i_data.a_ops = &cifs_addr_ops; + break; + case S_IFDIR: +#ifdef CONFIG_CIFS_DFS_UPCALL + if (IS_AUTOMOUNT(inode)) { + inode->i_op = &cifs_dfs_referral_inode_operations; + } else { +#else /* NO DFS support, treat as a directory */ + { +#endif + inode->i_op = &cifs_dir_inode_ops; + inode->i_fop = &cifs_dir_ops; + } + break; + case S_IFLNK: + inode->i_op = &cifs_symlink_inode_ops; + break; + default: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } +} + +/* check inode attributes against fattr. 
If they don't match, tag the + * inode for cache invalidation + */ +static void +cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) +{ + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + + cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid); + + if (inode->i_state & I_NEW) { + cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid); + return; + } + + /* don't bother with revalidation if we have an oplock */ + if (cifs_i->clientCanCacheRead) { + cFYI(1, "%s: inode %llu is oplocked", __func__, + cifs_i->uniqueid); + return; + } + + /* revalidate if mtime or size have changed */ + if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && + cifs_i->server_eof == fattr->cf_eof) { + cFYI(1, "%s: inode %llu is unchanged", __func__, + cifs_i->uniqueid); + return; + } + + cFYI(1, "%s: invalidating inode %llu mapping", __func__, + cifs_i->uniqueid); + cifs_i->invalid_mapping = true; +} + +/* populate an inode with info from a cifs_fattr struct */ +void +cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) +{ + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + unsigned long oldtime = cifs_i->time; + + cifs_revalidate_cache(inode, fattr); + + inode->i_atime = fattr->cf_atime; + inode->i_mtime = fattr->cf_mtime; + inode->i_ctime = fattr->cf_ctime; + inode->i_rdev = fattr->cf_rdev; + inode->i_nlink = fattr->cf_nlink; + inode->i_uid = fattr->cf_uid; + inode->i_gid = fattr->cf_gid; + + /* if dynperm is set, don't clobber existing mode */ + if (inode->i_state & I_NEW || + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) + inode->i_mode = fattr->cf_mode; + + cifs_i->cifsAttrs = fattr->cf_cifsattrs; + + if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) + cifs_i->time = 0; + else + cifs_i->time = jiffies; + + cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode, + oldtime, cifs_i->time); + + cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; + + cifs_i->server_eof = fattr->cf_eof; + /* + * Can't safely change the file size here if the client is writing to + * it due to potential races. + */ + spin_lock(&inode->i_lock); + if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) { + i_size_write(inode, fattr->cf_eof); + + /* + * i_blocks is not related to (i_size / i_blksize), + * but instead 512 byte (2**9) size is required for + * calculating num blocks. + */ + inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; + } + spin_unlock(&inode->i_lock); + + if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) + inode->i_flags |= S_AUTOMOUNT; + cifs_set_ops(inode); +} + +void +cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + return; + + fattr->cf_uniqueid = iunique(sb, ROOT_I); +} + +/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. 
*/ +void +cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_uniqueid = le64_to_cpu(info->UniqueId); + fattr->cf_bytes = le64_to_cpu(info->NumOfBytes); + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime); + fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange); + fattr->cf_mode = le64_to_cpu(info->Permissions); + + /* + * Since we set the inode type below we need to mask off + * to avoid strange results if bits set above. + */ + fattr->cf_mode &= ~S_IFMT; + switch (le32_to_cpu(info->Type)) { + case UNIX_FILE: + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + break; + case UNIX_SYMLINK: + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; + break; + case UNIX_DIR: + fattr->cf_mode |= S_IFDIR; + fattr->cf_dtype = DT_DIR; + break; + case UNIX_CHARDEV: + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor), + le64_to_cpu(info->DevMinor) & MINORMASK); + break; + case UNIX_BLOCKDEV: + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor), + le64_to_cpu(info->DevMinor) & MINORMASK); + break; + case UNIX_FIFO: + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + break; + case UNIX_SOCKET: + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; + break; + default: + /* safest to call it a file if we do not know */ + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + cFYI(1, "unknown type %d", le32_to_cpu(info->Type)); + break; + } + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) + fattr->cf_uid = cifs_sb->mnt_uid; + else + fattr->cf_uid = le64_to_cpu(info->Uid); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) + fattr->cf_gid = cifs_sb->mnt_gid; + else + fattr->cf_gid = le64_to_cpu(info->Gid); + + fattr->cf_nlink = le64_to_cpu(info->Nlinks); +} + +/* + * Fill a cifs_fattr struct with fake inode info. + * + * Needed to setup cifs_fattr data for the directory which is the + * junction to the new submount (ie to setup the fake directory + * which represents a DFS referral). 
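+ * The CIFS_FATTR_DFS_REFERRAL flag set below causes cifs_fattr_to_inode() + * to mark the resulting inode S_AUTOMOUNT.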
+ */ +static void +cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cFYI(1, "creating fake fattr for DFS referral"); + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; + fattr->cf_atime = CURRENT_TIME; + fattr->cf_ctime = CURRENT_TIME; + fattr->cf_mtime = CURRENT_TIME; + fattr->cf_nlink = 2; + fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; +} + +int cifs_get_file_info_unix(struct file *filp) +{ + int rc; + int xid; + FILE_UNIX_BASIC_INFO find_data; + struct cifs_fattr fattr; + struct inode *inode = filp->f_path.dentry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + xid = GetXid(); + rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); + if (!rc) { + cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + } + + cifs_fattr_to_inode(inode, &fattr); + FreeXid(xid); + return rc; +} + +int cifs_get_inode_info_unix(struct inode **pinode, + const unsigned char *full_path, + struct super_block *sb, int xid) +{ + int rc; + FILE_UNIX_BASIC_INFO find_data; + struct cifs_fattr fattr; + struct cifs_tcon *tcon; + struct tcon_link *tlink; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cFYI(1, "Getting info on %s", full_path); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + /* could have done a find first instead but this returns more info */ + rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + + if (!rc) { + cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, sb); + rc = 0; + } else { + return rc; + } + + /* check for Minshall+French symlinks */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + if (tmprc) + cFYI(1, "CIFSCheckMFSymlink: %d", tmprc); + } + + if (*pinode == NULL) { + /* get new inode */ + cifs_fill_uniqueid(sb, &fattr); + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) + rc = -ENOMEM; + } else { + /* we already have inode, update it */ + cifs_fattr_to_inode(*pinode, &fattr); + } + + return rc; +} + +static int +cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid) +{ + int rc; + int oplock = 0; + __u16 netfid; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct cifs_io_parms io_parms; + char buf[24]; + unsigned int bytes_read; + char *pbuf; + + pbuf = buf; + + fattr->cf_mode &= ~S_IFMT; + + if (fattr->cf_eof == 0) { + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + return 0; + } else if (fattr->cf_eof < 8) { + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; + return -EINVAL; /* EOPNOTSUPP? 
*/ + } + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + int buf_type = CIFS_NO_BUFFER; + /* Read header */ + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = 24; + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, + &buf_type); + if ((rc == 0) && (bytes_read >= 8)) { + if (memcmp("IntxBLK", pbuf, 8) == 0) { + cFYI(1, "Block device"); + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } + } else if (memcmp("IntxCHR", pbuf, 8) == 0) { + cFYI(1, "Char device"); + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } + } else if (memcmp("IntxLNK", pbuf, 7) == 0) { + cFYI(1, "Symlink"); + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; + } else { + fattr->cf_mode |= S_IFREG; /* file? */ + fattr->cf_dtype = DT_REG; + rc = -EOPNOTSUPP; + } + } else { + fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_dtype = DT_REG; + rc = -EOPNOTSUPP; /* or some unknown SFU type */ + } + CIFSSMBClose(xid, tcon, netfid); + } + cifs_put_tlink(tlink); + return rc; +} + +#define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID) /* SETFILEBITS valid bits */ + +/* + * Fetch mode bits as provided by SFU. + * + * FIXME: Doesn't this clobber the type bit we got from cifs_sfu_type ? 
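+ * Note: only the SFBITS_MASK bits are replaced below, so the S_IFMT type + * bits set by cifs_sfu_type() survive.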
+ */ +static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid) +{ +#ifdef CONFIG_CIFS_XATTR + ssize_t rc; + char ea_value[4]; + __u32 mode; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS", + ea_value, 4 /* size of buf */, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + if (rc < 0) + return (int)rc; + else if (rc > 3) { + mode = le32_to_cpu(*((__le32 *)ea_value)); + fattr->cf_mode &= ~SFBITS_MASK; + cFYI(1, "special bits 0%o org mode 0%o", mode, + fattr->cf_mode); + fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; + cFYI(1, "special mode bits 0%o", mode); + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ +static void +cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, + struct cifs_sb_info *cifs_sb, bool adjust_tz) +{ + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(info->Attributes); + if (info->DeletePending) + fattr->cf_flags |= CIFS_FATTR_DELETE_PENDING; + + if (info->LastAccessTime) + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + else + fattr->cf_atime = CURRENT_TIME; + + fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + + if (adjust_tz) { + fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj; + fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj; + } + + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); + + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; + fattr->cf_dtype = DT_DIR; + } else { + fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_REG; + + /* clear write bits if ATTR_READONLY is set */ + if (fattr->cf_cifsattrs & ATTR_READONLY) + fattr->cf_mode &= ~(S_IWUGO); + } + + fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; +} + +int cifs_get_file_info(struct file *filp) +{ + int rc; + int xid; + FILE_ALL_INFO find_data; + struct cifs_fattr fattr; + struct inode *inode = filp->f_path.dentry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = filp->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + xid = GetXid(); + rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); + break; + case -EREMOTE: + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + break; + case -EOPNOTSUPP: + case -EINVAL: + /* + * FIXME: legacy server -- fall back to path-based call? + * for now, just skip revalidating and mark inode for + * immediate reval. + */ + rc = 0; + CIFS_I(inode)->time = 0; + default: + goto cgfi_exit; + } + + /* + * don't bother with SFU junk here -- just mark inode as needing + * revalidation. 
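+ * Setting CIFS_FATTR_NEED_REVAL below makes cifs_fattr_to_inode() zero + * cifs_i->time, which forces revalidation on the next lookup.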
+ */ + fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; + cifs_fattr_to_inode(inode, &fattr); +cgfi_exit: + FreeXid(xid); + return rc; +} + +int cifs_get_inode_info(struct inode **pinode, + const unsigned char *full_path, FILE_ALL_INFO *pfindData, + struct super_block *sb, int xid, const __u16 *pfid) +{ + int rc = 0, tmprc; + struct cifs_tcon *pTcon; + struct tcon_link *tlink; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + char *buf = NULL; + bool adjustTZ = false; + struct cifs_fattr fattr; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + cFYI(1, "Getting info on %s", full_path); + + if ((pfindData == NULL) && (*pinode != NULL)) { + if (CIFS_I(*pinode)->clientCanCacheRead) { + cFYI(1, "No need to revalidate cached inode sizes"); + goto cgii_exit; + } + } + + /* if file info not passed in then get it from server */ + if (pfindData == NULL) { + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } + pfindData = (FILE_ALL_INFO *)buf; + + /* could do find first instead but this returns more info */ + rc = CIFSSMBQPathInfo(xid, pTcon, full_path, pfindData, + 0 /* not legacy */, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + /* BB optimize code so we do not make the above call + when server claims no NT SMB support and the above call + failed at least once - set flag in tcon or mount */ + if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { + rc = SMBQueryInformation(xid, pTcon, full_path, + pfindData, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + adjustTZ = true; + } + } + + if (!rc) { + cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *) pfindData, + cifs_sb, adjustTZ); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, sb); + rc = 0; + } else { + goto cgii_exit; + } + + /* + * If an inode wasn't passed in, then get the inode number + * + * Is an i_ino of zero legal? Can we use that to check if the server + * supports returning inode numbers? Are there other sanity checks we + * can use to ensure that the server is really filling in that field? + * + * We can not use the IndexNumber field by default from Windows or + * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA + * CIFS spec claims that this value is unique within the scope of a + * share, and the windows docs hint that it's actually unique + * per-machine. + * + * There may be higher info levels that work but are there Windows + * server or network appliances for which IndexNumber field is not + * guaranteed unique? 
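+ * If CIFSGetSrvInodeNumber() fails or returns zero below, the code falls + * back to iunique() and calls cifs_autodisable_serverino() to stop + * requesting server inode numbers on this mount.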
+ */ + if (*pinode == NULL) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + int rc1 = 0; + + rc1 = CIFSGetSrvInodeNumber(xid, pTcon, + full_path, &fattr.cf_uniqueid, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc1 || !fattr.cf_uniqueid) { + cFYI(1, "GetSrvInodeNum rc %d", rc1); + fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } + } else { + fattr.cf_uniqueid = iunique(sb, ROOT_I); + } + } else { + fattr.cf_uniqueid = CIFS_I(*pinode)->uniqueid; + } + + /* query for SFU type info if supported and needed */ + if (fattr.cf_cifsattrs & ATTR_SYSTEM && + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); + if (tmprc) + cFYI(1, "cifs_sfu_type failed: %d", tmprc); + } + +#ifdef CONFIG_CIFS_ACL + /* fill in 0777 bits from ACL */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, + pfid); + if (rc) { + cFYI(1, "%s: Getting ACL failed with error: %d", + __func__, rc); + goto cgii_exit; + } + } +#endif /* CONFIG_CIFS_ACL */ + + /* fill in remaining high mode bits e.g. SUID, VTX */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + cifs_sfu_mode(&fattr, full_path, cifs_sb, xid); + + /* check for Minshall+French symlinks */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + if (tmprc) + cFYI(1, "CIFSCheckMFSymlink: %d", tmprc); + } + + if (!*pinode) { + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) + rc = -ENOMEM; + } else { + cifs_fattr_to_inode(*pinode, &fattr); + } + +cgii_exit: + kfree(buf); + cifs_put_tlink(tlink); + return rc; +} + +static const struct inode_operations cifs_ipc_inode_ops = { + .lookup = cifs_lookup, +}; + +char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon) +{ + int pplen = vol->prepath ? 
strlen(vol->prepath) : 0; + int dfsplen; + char *full_path = NULL; + + /* if no prefix path, simply set path to the root of share to "" */ + if (pplen == 0) { + full_path = kmalloc(1, GFP_KERNEL); + if (full_path) + full_path[0] = 0; + return full_path; + } + + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + else + dfsplen = 0; + + full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL); + if (full_path == NULL) + return full_path; + + if (dfsplen) + strncpy(full_path, tcon->treeName, dfsplen); + strncpy(full_path + dfsplen, vol->prepath, pplen); + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + full_path[dfsplen + pplen] = 0; /* add trailing null */ + return full_path; +} + +static int +cifs_find_inode(struct inode *inode, void *opaque) +{ + struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + + /* don't match inode with different uniqueid */ + if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) + return 0; + + /* use createtime like an i_generation field */ + if (CIFS_I(inode)->createtime != fattr->cf_createtime) + return 0; + + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) + return 0; + + /* if it's not a directory or has no dentries, then flag it */ + if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) + fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; + + return 1; +} + +static int +cifs_init_inode(struct inode *inode, void *opaque) +{ + struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + + CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; + CIFS_I(inode)->createtime = fattr->cf_createtime; + return 0; +} + +/* + * walk dentry list for an inode and report whether it has aliases that + * are hashed. We use this to determine if a directory inode can actually + * be used. + */ +static bool +inode_has_hashed_dentries(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + if (!d_unhashed(dentry) || IS_ROOT(dentry)) { + spin_unlock(&inode->i_lock); + return true; + } + } + spin_unlock(&inode->i_lock); + return false; +} + +/* Given fattrs, get a corresponding inode */ +struct inode * +cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) +{ + unsigned long hash; + struct inode *inode; + +retry_iget5_locked: + cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid); + + /* hash down to 32-bits on 32-bit arch */ + hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); + + inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); + if (inode) { + /* was there a potentially problematic inode collision? 
*/ + if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) { + fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION; + + if (inode_has_hashed_dentries(inode)) { + cifs_autodisable_serverino(CIFS_SB(sb)); + iput(inode); + fattr->cf_uniqueid = iunique(sb, ROOT_I); + goto retry_iget5_locked; + } + } + + cifs_fattr_to_inode(inode, fattr); + if (sb->s_flags & MS_NOATIME) + inode->i_flags |= S_NOATIME | S_NOCMTIME; + if (inode->i_state & I_NEW) { + inode->i_ino = hash; + if (S_ISREG(inode->i_mode)) + inode->i_data.backing_dev_info = sb->s_bdi; +#ifdef CONFIG_CIFS_FSCACHE + /* initialize per-inode cache cookie pointer */ + CIFS_I(inode)->fscache = NULL; +#endif + unlock_new_inode(inode); + } + } + + return inode; +} + +/* gets root inode */ +struct inode *cifs_root_iget(struct super_block *sb) +{ + int xid; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct inode *inode = NULL; + long rc; + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + xid = GetXid(); + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, "", sb, xid); + else + rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL); + + if (!inode) { + inode = ERR_PTR(rc); + goto out; + } + +#ifdef CONFIG_CIFS_FSCACHE + /* populate tcon->resource_id */ + tcon->resource_id = CIFS_I(inode)->uniqueid; +#endif + + if (rc && tcon->ipc) { + cFYI(1, "ipc connection - fake read inode"); + inode->i_mode |= S_IFDIR; + inode->i_nlink = 2; + inode->i_op = &cifs_ipc_inode_ops; + inode->i_fop = &simple_dir_operations; + inode->i_uid = cifs_sb->mnt_uid; + inode->i_gid = cifs_sb->mnt_gid; + } else if (rc) { + iget_failed(inode); + inode = ERR_PTR(rc); + } + +out: + /* can not call macro FreeXid here since in a void func + * TODO: This is no longer true + */ + _FreeXid(xid); + return inode; +} + +static int +cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid, + char *full_path, __u32 dosattr) +{ + int rc; + int oplock = 0; + __u16 netfid; + __u32 netpid; + bool set_time = false; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *pTcon; + FILE_BASIC_INFO info_buf; + + if (attrs == NULL) + return -EINVAL; + + if (attrs->ia_valid & ATTR_ATIME) { + set_time = true; + info_buf.LastAccessTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); + } else + info_buf.LastAccessTime = 0; + + if (attrs->ia_valid & ATTR_MTIME) { + set_time = true; + info_buf.LastWriteTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); + } else + info_buf.LastWriteTime = 0; + + /* + * Samba throws this field away, but windows may actually use it. + * Do not set ctime unless other time stamps are changed explicitly + * (i.e. by utimes()) since we would then have a mix of client and + * server times. 
+ */ + if (set_time && (attrs->ia_valid & ATTR_CTIME)) { + cFYI(1, "CIFS - CTIME changed"); + info_buf.ChangeTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); + } else + info_buf.ChangeTime = 0; + + info_buf.CreationTime = 0; /* don't change */ + info_buf.Attributes = cpu_to_le32(dosattr); + + /* + * If the file is already open for write, just use that fileid + */ + open_file = find_writable_file(cifsInode, true); + if (open_file) { + netfid = open_file->netfid; + netpid = open_file->pid; + pTcon = tlink_tcon(open_file->tlink); + goto set_via_filehandle; + } + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + pTcon = tlink_tcon(tlink); + + /* + * NT4 apparently returns success on this call, but it doesn't + * really work. + */ + if (!(pTcon->ses->flags & CIFS_SES_NT4)) { + rc = CIFSSMBSetPathInfo(xid, pTcon, full_path, + &info_buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + cifsInode->cifsAttrs = dosattr; + goto out; + } else if (rc != -EOPNOTSUPP && rc != -EINVAL) + goto out; + } + + cFYI(1, "calling SetFileInfo since SetPathInfo for " + "times not supported by this server"); + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, + SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, + CREATE_NOT_DIR, &netfid, &oplock, + NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc != 0) { + if (rc == -EIO) + rc = -EINVAL; + goto out; + } + + netpid = current->tgid; + +set_via_filehandle: + rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid); + if (!rc) + cifsInode->cifsAttrs = dosattr; + + if (open_file == NULL) + CIFSSMBClose(xid, pTcon, netfid); + else + cifsFileInfo_put(open_file); +out: + if (tlink != NULL) + cifs_put_tlink(tlink); + return rc; +} + +/* + * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit + * and rename it to a random name that hopefully won't conflict with + * anything else. 
+ */ +static int +cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid) +{ + int oplock = 0; + int rc; + __u16 netfid; + struct inode *inode = dentry->d_inode; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *tcon; + __u32 dosattr, origattr; + FILE_BASIC_INFO *info_buf = NULL; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, + DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, + &netfid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != 0) + goto out; + + origattr = cifsInode->cifsAttrs; + if (origattr == 0) + origattr |= ATTR_NORMAL; + + dosattr = origattr & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + /* set ATTR_HIDDEN and clear ATTR_READONLY, but only if needed */ + if (dosattr != origattr) { + info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL); + if (info_buf == NULL) { + rc = -ENOMEM; + goto out_close; + } + info_buf->Attributes = cpu_to_le32(dosattr); + rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + current->tgid); + /* although we would like to mark the file hidden + if that fails we will still try to rename it */ + if (rc == 0) + cifsInode->cifsAttrs = dosattr; + else + dosattr = origattr; /* since not able to change them */ + } + + /* rename the file */ + rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != 0) { + rc = -ETXTBSY; + goto undo_setattr; + } + + /* try to set DELETE_ON_CLOSE */ + if (!cifsInode->delete_pending) { + rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, + current->tgid); + /* + * some samba versions return -ENOENT when we try to set the + * file disposition here. Likely a samba bug, but work around + * it for now. This means that some cifsXXX files may hang + * around after they shouldn't. + * + * BB: remove this hack after more servers have the fix + */ + if (rc == -ENOENT) + rc = 0; + else if (rc != 0) { + rc = -ETXTBSY; + goto undo_rename; + } + cifsInode->delete_pending = true; + } + +out_close: + CIFSSMBClose(xid, tcon, netfid); +out: + kfree(info_buf); + cifs_put_tlink(tlink); + return rc; + + /* + * reset everything back to the original state. Don't bother + * dealing with errors here since we can't do anything about + * them anyway. + */ +undo_rename: + CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); +undo_setattr: + if (dosattr != origattr) { + info_buf->Attributes = cpu_to_le32(origattr); + if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + current->tgid)) + cifsInode->cifsAttrs = origattr; + } + + goto out_close; +} + + +/* + * If dentry->d_inode is null (usually meaning the cached dentry + * is a negative dentry) then we would attempt a standard SMB delete, but + * if that fails we can not attempt the fall back mechanisms on EACCESS + * but will return the EACCESS to the caller. Note that the VFS does not call + * unlink on negative dentries currently.
+ */ +int cifs_unlink(struct inode *dir, struct dentry *dentry) +{ + int rc = 0; + int xid; + char *full_path = NULL; + struct inode *inode = dentry->d_inode; + struct cifsInodeInfo *cifs_inode; + struct super_block *sb = dir->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct iattr *attrs = NULL; + __u32 dosattr = 0, origattr = 0; + + cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + xid = GetXid(); + + /* Unlink can be called from rename so we can not take the + * sb->s_vfs_rename_mutex here */ + full_path = build_path_from_dentry(dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto unlink_out; + } + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = CIFSPOSIXDelFile(xid, tcon, full_path, + SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, "posix del rc %d", rc); + if ((rc == 0) || (rc == -ENOENT)) + goto psx_del_no_retry; + } + +retry_std_delete: + rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + +psx_del_no_retry: + if (!rc) { + if (inode) + drop_nlink(inode); + } else if (rc == -ENOENT) { + d_drop(dentry); + } else if (rc == -ETXTBSY) { + rc = cifs_rename_pending_delete(full_path, dentry, xid); + if (rc == 0) + drop_nlink(inode); + } else if ((rc == -EACCES) && (dosattr == 0) && inode) { + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (attrs == NULL) { + rc = -ENOMEM; + goto out_reval; + } + + /* try to reset dos attributes */ + cifs_inode = CIFS_I(inode); + origattr = cifs_inode->cifsAttrs; + if (origattr == 0) + origattr |= ATTR_NORMAL; + dosattr = origattr & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); + if (rc != 0) + goto out_reval; + + goto retry_std_delete; + } + + /* undo the setattr if we errored out and it's needed */ + if (rc != 0 && dosattr != 0) + cifs_set_file_info(inode, attrs, xid, full_path, origattr); + +out_reval: + if (inode) { + cifs_inode = CIFS_I(inode); + cifs_inode->time = 0; /* will force revalidate to get info + when needed */ + inode->i_ctime = current_fs_time(sb); + } + dir->i_ctime = dir->i_mtime = current_fs_time(sb); + cifs_inode = CIFS_I(dir); + CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ +unlink_out: + kfree(full_path); + kfree(attrs); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) +{ + int rc = 0, tmprc; + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + char *full_path = NULL; + struct inode *newinode = NULL; + struct cifs_fattr fattr; + + cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode); + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto mkdir_out; + } + + if ((pTcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(pTcon->fsUnixInfo.Capability))) { + u32 oplock = 0; + FILE_UNIX_BASIC_INFO *pInfo = + 
kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (pInfo == NULL) { + rc = -ENOMEM; + goto mkdir_out; + } + + mode &= ~current_umask(); + rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, + mode, NULL /* netfid */, pInfo, &oplock, + full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == -EOPNOTSUPP) { + kfree(pInfo); + goto mkdir_retry_old; + } else if (rc) { + cFYI(1, "posix mkdir returned 0x%x", rc); + d_drop(direntry); + } else { + if (pInfo->Type == cpu_to_le32(-1)) { + /* no return info, go query for it */ + kfree(pInfo); + goto mkdir_get_info; + } +/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need + to set uid/gid */ + inc_nlink(inode); + + cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); + cifs_fill_uniqueid(inode->i_sb, &fattr); + newinode = cifs_iget(inode->i_sb, &fattr); + if (!newinode) { + kfree(pInfo); + goto mkdir_get_info; + } + + d_instantiate(direntry, newinode); + +#ifdef CONFIG_CIFS_DEBUG2 + cFYI(1, "instantiated dentry %p %s to inode %p", + direntry, direntry->d_name.name, newinode); + + if (newinode->i_nlink != 2) + cFYI(1, "unexpected number of links %d", + newinode->i_nlink); +#endif + } + kfree(pInfo); + goto mkdir_out; + } +mkdir_retry_old: + /* BB add setting the equivalent of mode via CreateX w/ACLs */ + rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + cFYI(1, "cifs_mkdir returned 0x%x", rc); + d_drop(direntry); + } else { +mkdir_get_info: + inc_nlink(inode); + if (pTcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else + rc = cifs_get_inode_info(&newinode, full_path, NULL, + inode->i_sb, xid, NULL); + + d_instantiate(direntry, newinode); + /* setting nlink not necessary except in cases where we + * failed to get it from the server or was set bogus */ + if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) + direntry->d_inode->i_nlink = 2; + + mode &= ~current_umask(); + /* must turn on setgid bit if parent dir has it */ + if (inode->i_mode & S_ISGID) + mode |= S_ISGID; + + if (pTcon->unix_ext) { + struct cifs_unix_set_info_args args = { + .mode = mode, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64)current_fsuid(); + if (inode->i_mode & S_ISGID) + args.gid = (__u64)inode->i_gid; + else + args.gid = (__u64)current_fsgid(); + } else { + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; + } + CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + } else { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && + (mode & S_IWUGO) == 0) { + FILE_BASIC_INFO pInfo; + struct cifsInodeInfo *cifsInode; + u32 dosattrs; + + memset(&pInfo, 0, sizeof(pInfo)); + cifsInode = CIFS_I(newinode); + dosattrs = cifsInode->cifsAttrs|ATTR_READONLY; + pInfo.Attributes = cpu_to_le32(dosattrs); + tmprc = CIFSSMBSetPathInfo(xid, pTcon, + full_path, &pInfo, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tmprc == 0) + cifsInode->cifsAttrs = dosattrs; + } + if (direntry->d_inode) { + if (cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_DYNPERM) + direntry->d_inode->i_mode = + (mode | S_IFDIR); + + if (cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_SET_UID) { + direntry->d_inode->i_uid = + current_fsuid(); + if (inode->i_mode & S_ISGID) + 
direntry->d_inode->i_gid = + inode->i_gid; + else + direntry->d_inode->i_gid = + current_fsgid(); + } + } + } + } +mkdir_out: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +int cifs_rmdir(struct inode *inode, struct dentry *direntry) +{ + int rc = 0; + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + char *full_path = NULL; + struct cifsInodeInfo *cifsInode; + + cFYI(1, "cifs_rmdir, inode = 0x%p", inode); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto rmdir_exit; + } + + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto rmdir_exit; + } + pTcon = tlink_tcon(tlink); + + rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + + if (!rc) { + drop_nlink(inode); + spin_lock(&direntry->d_inode->i_lock); + i_size_write(direntry->d_inode, 0); + clear_nlink(direntry->d_inode); + spin_unlock(&direntry->d_inode->i_lock); + } + + cifsInode = CIFS_I(direntry->d_inode); + cifsInode->time = 0; /* force revalidate to go get info when + needed */ + + cifsInode = CIFS_I(inode); + cifsInode->time = 0; /* force revalidate to get parent dir info + since cached search results now invalid */ + + direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + +rmdir_exit: + kfree(full_path); + FreeXid(xid); + return rc; +} + +static int +cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath, + struct dentry *to_dentry, const char *toPath) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + __u16 srcfid; + int oplock, rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + /* try path-based rename first */ + rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + /* + * don't bother with rename by filehandle unless file is busy and + * source Note that cross directory moves do not work with + * rename by filehandle to various Windows servers. 
+ */ + if (rc == 0 || rc != -ETXTBSY) + goto do_rename_exit; + + /* open-file renames don't work across directories */ + if (to_dentry->d_parent != from_dentry->d_parent) + goto do_rename_exit; + + /* open the file to be renamed -- we need DELETE perms */ + rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, + CREATE_NOT_DIR, &srcfid, &oplock, NULL, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc == 0) { + rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid, + (const char *) to_dentry->d_name.name, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + CIFSSMBClose(xid, pTcon, srcfid); + } +do_rename_exit: + cifs_put_tlink(tlink); + return rc; +} + +int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, + struct inode *target_dir, struct dentry *target_dentry) +{ + char *fromName = NULL; + char *toName = NULL; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + FILE_UNIX_BASIC_INFO *info_buf_source = NULL; + FILE_UNIX_BASIC_INFO *info_buf_target; + int xid, rc, tmprc; + + cifs_sb = CIFS_SB(source_dir->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + xid = GetXid(); + + /* + * we already have the rename sem so we do not need to + * grab it again here to protect the path integrity + */ + fromName = build_path_from_dentry(source_dentry); + if (fromName == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + + toName = build_path_from_dentry(target_dentry); + if (toName == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + + rc = cifs_do_rename(xid, source_dentry, fromName, + target_dentry, toName); + + if (rc == -EEXIST && tcon->unix_ext) { + /* + * Are src and dst hardlinks of same inode? We can + * only tell with unix extensions enabled + */ + info_buf_source = + kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), + GFP_KERNEL); + if (info_buf_source == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + + info_buf_target = info_buf_source + 1; + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, + info_buf_source, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tmprc != 0) + goto unlink_target; + + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName, + info_buf_target, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (tmprc == 0 && (info_buf_source->UniqueId == + info_buf_target->UniqueId)) { + /* same file, POSIX says that this is a noop */ + rc = 0; + goto cifs_rename_exit; + } + } /* else ... 
BB we could add the same check for Windows by + checking the UniqueId via FILE_INTERNAL_INFO */ + +unlink_target: + /* Try unlinking the target dentry if it's not negative */ + if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { + tmprc = cifs_unlink(target_dir, target_dentry); + if (tmprc) + goto cifs_rename_exit; + + rc = cifs_do_rename(xid, source_dentry, fromName, + target_dentry, toName); + } + +cifs_rename_exit: + kfree(info_buf_source); + kfree(fromName); + kfree(toName); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +static bool +cifs_inode_needs_reval(struct inode *inode) +{ + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + if (cifs_i->clientCanCacheRead) + return false; + + if (!lookupCacheEnabled) + return true; + + if (cifs_i->time == 0) + return true; + + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->actimeo)) + return true; + + /* hardlinked files w/ noserverino get "special" treatment */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + S_ISREG(inode->i_mode) && inode->i_nlink != 1) + return true; + + return false; +} + +/* + * Zap the cache. Called when invalid_mapping flag is set. + */ +int +cifs_invalidate_mapping(struct inode *inode) +{ + int rc = 0; + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + + cifs_i->invalid_mapping = false; + + if (inode->i_mapping && inode->i_mapping->nrpages != 0) { + rc = invalidate_inode_pages2(inode->i_mapping); + if (rc) { + cERROR(1, "%s: could not invalidate inode %p", __func__, + inode); + cifs_i->invalid_mapping = true; + } + } + + cifs_fscache_reset_inode_cookie(inode); + return rc; +} + +int cifs_revalidate_file_attr(struct file *filp) +{ + int rc = 0; + struct inode *inode = filp->f_path.dentry->d_inode; + struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; + + if (!cifs_inode_needs_reval(inode)) + return rc; + + if (tlink_tcon(cfile->tlink)->unix_ext) + rc = cifs_get_file_info_unix(filp); + else + rc = cifs_get_file_info(filp); + + return rc; +} + +int cifs_revalidate_dentry_attr(struct dentry *dentry) +{ + int xid; + int rc = 0; + struct inode *inode = dentry->d_inode; + struct super_block *sb = dentry->d_sb; + char *full_path = NULL; + + if (inode == NULL) + return -ENOENT; + + if (!cifs_inode_needs_reval(inode)) + return rc; + + xid = GetXid(); + + /* can not safely grab the rename sem here if rename calls revalidate + since that would deadlock */ + full_path = build_path_from_dentry(dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time " + "%ld jiffies %ld", full_path, inode, inode->i_count.counter, + dentry, dentry->d_time, jiffies); + + if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); + else + rc = cifs_get_inode_info(&inode, full_path, NULL, sb, + xid, NULL); + +out: + kfree(full_path); + FreeXid(xid); + return rc; +} + +int cifs_revalidate_file(struct file *filp) +{ + int rc; + struct inode *inode = filp->f_path.dentry->d_inode; + + rc = cifs_revalidate_file_attr(filp); + if (rc) + return rc; + + if (CIFS_I(inode)->invalid_mapping) + rc = cifs_invalidate_mapping(inode); + return rc; +} + +/* revalidate a dentry's inode attributes */ +int cifs_revalidate_dentry(struct dentry *dentry) +{ + int rc; + struct inode *inode = dentry->d_inode; + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + if 
(CIFS_I(inode)->invalid_mapping) + rc = cifs_invalidate_mapping(inode); + return rc; +} + +int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct inode *inode = dentry->d_inode; + int rc; + + /* + * We need to be sure that all dirty pages are written and the server + * has actual ctime, mtime and file length. + */ + if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + inode->i_mapping->nrpages != 0) { + rc = filemap_fdatawait(inode->i_mapping); + if (rc) { + mapping_set_error(inode->i_mapping, rc); + return rc; + } + } + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + generic_fillattr(inode, stat); + stat->blksize = CIFS_MAX_MSGSIZE; + stat->ino = CIFS_I(inode)->uniqueid; + + /* + * If on a multiuser mount without unix extensions, and the admin hasn't + * overridden them, set the ownership to the fsuid/fsgid of the current + * process. + */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && + !tcon->unix_ext) { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) + stat->uid = current_fsuid(); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) + stat->gid = current_fsgid(); + } + return rc; +} + +static int cifs_truncate_page(struct address_space *mapping, loff_t from) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE - 1); + struct page *page; + int rc = 0; + + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + unlock_page(page); + page_cache_release(page); + return rc; +} + +static void cifs_setsize(struct inode *inode, loff_t offset) +{ + loff_t oldsize; + + spin_lock(&inode->i_lock); + oldsize = inode->i_size; + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + truncate_pagecache(inode, oldsize, offset); +} + +static int +cifs_set_file_size(struct inode *inode, struct iattr *attrs, + int xid, char *full_path) +{ + int rc; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *pTcon = NULL; + struct cifs_io_parms io_parms; + + /* + * To avoid spurious oplock breaks from server, in the case of + * inodes that we already have open, avoid doing path based + * setting of file size if we can do it by handle. 
+ * This keeps our caching token (oplock) and avoids timeouts + * when the local oplock break takes longer to flush + * writebehind data than the SMB timeout for the SetPathInfo + * request would allow + */ + open_file = find_writable_file(cifsInode, true); + if (open_file) { + __u16 nfid = open_file->netfid; + __u32 npid = open_file->pid; + pTcon = tlink_tcon(open_file->tlink); + rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, + npid, false); + cifsFileInfo_put(open_file); + cFYI(1, "SetFSize for attrs rc = %d", rc); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + unsigned int bytes_written; + + io_parms.netfid = nfid; + io_parms.pid = npid; + io_parms.tcon = pTcon; + io_parms.offset = 0; + io_parms.length = attrs->ia_size; + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, + NULL, NULL, 1); + cFYI(1, "Wrt seteof rc %d", rc); + } + } else + rc = -EINVAL; + + if (rc != 0) { + if (pTcon == NULL) { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + } + + /* Set file size by pathname rather than by handle + either because no valid, writeable file handle for + it was found or because there was an error setting + it by handle */ + rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, + false, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, "SetEOF by path (setattrs) rc = %d", rc); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + __u16 netfid; + int oplock = 0; + + rc = SMBLegacyOpen(xid, pTcon, full_path, + FILE_OPEN, GENERIC_WRITE, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + unsigned int bytes_written; + + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = pTcon; + io_parms.offset = 0; + io_parms.length = attrs->ia_size; + rc = CIFSSMBWrite(xid, &io_parms, + &bytes_written, + NULL, NULL, 1); + cFYI(1, "wrt seteof rc %d", rc); + CIFSSMBClose(xid, pTcon, netfid); + } + } + if (tlink) + cifs_put_tlink(tlink); + } + + if (rc == 0) { + cifsInode->server_eof = attrs->ia_size; + cifs_setsize(inode, attrs->ia_size); + cifs_truncate_page(inode->i_mapping, inode->i_size); + } + + return rc; +} + +static int +cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) +{ + int rc; + int xid; + char *full_path = NULL; + struct inode *inode = direntry->d_inode; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifs_unix_set_info_args *args = NULL; + struct cifsFileInfo *open_file; + + cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x", + direntry->d_name.name, attrs->ia_valid); + + xid = GetXid(); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + attrs->ia_valid |= ATTR_FORCE; + + rc = inode_change_ok(inode, attrs); + if (rc < 0) + goto out; + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto out; + } + + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. Here, we take + * the safe way out and just do the flush on all setattr requests. If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? 
Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + rc = 0; + + if (attrs->ia_valid & ATTR_SIZE) { + rc = cifs_set_file_size(inode, attrs, xid, full_path); + if (rc != 0) + goto out; + } + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attrs->ia_valid &= ~ATTR_MODE; + + args = kmalloc(sizeof(*args), GFP_KERNEL); + if (args == NULL) { + rc = -ENOMEM; + goto out; + } + + /* set up the struct */ + if (attrs->ia_valid & ATTR_MODE) + args->mode = attrs->ia_mode; + else + args->mode = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_UID) + args->uid = attrs->ia_uid; + else + args->uid = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_GID) + args->gid = attrs->ia_gid; + else + args->gid = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_ATIME) + args->atime = cifs_UnixTimeToNT(attrs->ia_atime); + else + args->atime = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_MTIME) + args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime); + else + args->mtime = NO_CHANGE_64; + + if (attrs->ia_valid & ATTR_CTIME) + args->ctime = cifs_UnixTimeToNT(attrs->ia_ctime); + else + args->ctime = NO_CHANGE_64; + + args->device = 0; + open_file = find_writable_file(cifsInode, true); + if (open_file) { + u16 nfid = open_file->netfid; + u32 npid = open_file->pid; + pTcon = tlink_tcon(open_file->tlink); + rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); + cifsFileInfo_put(open_file); + } else { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto out; + } + pTcon = tlink_tcon(tlink); + rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_put_tlink(tlink); + } + + if (rc) + goto out; + + if ((attrs->ia_valid & ATTR_SIZE) && + attrs->ia_size != i_size_read(inode)) + truncate_setsize(inode, attrs->ia_size); + + setattr_copy(inode, attrs); + mark_inode_dirty(inode); + + /* force revalidate when any of these times are set since some + of the fs types (eg ext3, fat) do not have fine enough + time granularity to match protocol, and we do not have a + a way (yet) to query the server fs's time granularity (and + whether it rounds times down). + */ + if (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + cifsInode->time = 0; +out: + kfree(args); + kfree(full_path); + FreeXid(xid); + return rc; +} + +static int +cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) +{ + int xid; + struct inode *inode = direntry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + char *full_path = NULL; + int rc = -EACCES; + __u32 dosattr = 0; + __u64 mode = NO_CHANGE_64; + + xid = GetXid(); + + cFYI(1, "setattr on file %s attrs->iavalid 0x%x", + direntry->d_name.name, attrs->ia_valid); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + attrs->ia_valid |= ATTR_FORCE; + + rc = inode_change_ok(inode, attrs); + if (rc < 0) { + FreeXid(xid); + return rc; + } + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + FreeXid(xid); + return rc; + } + + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. Here, we take + * the safe way out and just do the flush on all setattr requests. 
If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); + rc = 0; + + if (attrs->ia_valid & ATTR_SIZE) { + rc = cifs_set_file_size(inode, attrs, xid, full_path); + if (rc != 0) + goto cifs_setattr_exit; + } + + /* + * Without unix extensions we can't send ownership changes to the + * server, so silently ignore them. This is consistent with how + * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With + * CIFSACL support + proper Windows to Unix idmapping, we may be + * able to support this in the future. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) + attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attrs->ia_valid &= ~ATTR_MODE; + + if (attrs->ia_valid & ATTR_MODE) { + cFYI(1, "Mode changed to 0%o", attrs->ia_mode); + mode = attrs->ia_mode; + } + + if (attrs->ia_valid & ATTR_MODE) { + rc = 0; +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = mode_to_cifs_acl(inode, full_path, mode); + if (rc) { + cFYI(1, "%s: Setting ACL failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } else +#endif /* CONFIG_CIFS_ACL */ + if (((mode & S_IWUGO) == 0) && + (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { + + dosattr = cifsInode->cifsAttrs | ATTR_READONLY; + + /* fix up mode if we're not using dynperm */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + attrs->ia_mode = inode->i_mode & ~S_IWUGO; + } else if ((mode & S_IWUGO) && + (cifsInode->cifsAttrs & ATTR_READONLY)) { + + dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY; + /* Attributes of 0 are ignored */ + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + + /* reset local inode permissions to normal */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + attrs->ia_mode &= ~(S_IALLUGO); + if (S_ISDIR(inode->i_mode)) + attrs->ia_mode |= + cifs_sb->mnt_dir_mode; + else + attrs->ia_mode |= + cifs_sb->mnt_file_mode; + } + } else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + /* ignore mode change - ATTR_READONLY hasn't changed */ + attrs->ia_valid &= ~ATTR_MODE; + } + } + + if (attrs->ia_valid & (ATTR_MTIME|ATTR_ATIME|ATTR_CTIME) || + ((attrs->ia_valid & ATTR_MODE) && dosattr)) { + rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); + /* BB: check for rc = -EOPNOTSUPP and switch to legacy mode */ + + /* Even if error on time set, no sense failing the call if + the server would set the time to a reasonable value anyway, + and this check ensures that we are not being called from + sys_utimes in which case we ought to fail the call back to + the user when the server rejects the call */ + if ((rc) && (attrs->ia_valid & + (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE))) + rc = 0; + } + + /* do not need local check to inode_check_ok since the server does + that */ + if (rc) + goto cifs_setattr_exit; + + if ((attrs->ia_valid & ATTR_SIZE) && + attrs->ia_size != i_size_read(inode)) + truncate_setsize(inode, attrs->ia_size); + + setattr_copy(inode, attrs); + mark_inode_dirty(inode); + +cifs_setattr_exit: + kfree(full_path); + FreeXid(xid); + return rc; +} + +int +cifs_setattr(struct dentry *direntry, struct iattr *attrs) +{ + struct inode *inode = direntry->d_inode; + 
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); + + if (pTcon->unix_ext) + return cifs_setattr_unix(direntry, attrs); + + return cifs_setattr_nounix(direntry, attrs); + + /* BB: add cifs_setattr_legacy for really old servers */ +} + +#if 0 +void cifs_delete_inode(struct inode *inode) +{ + cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode); + /* may have to add back in if and when safe distributed caching of + directories added e.g. via FindNotify */ +} +#endif diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c new file mode 100644 index 00000000..4221b5e4 --- /dev/null +++ b/fs/cifs/ioctl.c @@ -0,0 +1,102 @@ +/* + * fs/cifs/ioctl.c + * + * vfs operations that deal with io control + * + * Copyright (C) International Business Machines Corp., 2005,2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifsfs.h" + +#define CIFS_IOC_CHECKUMOUNT _IO(0xCF, 2) + +long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) +{ + struct inode *inode = filep->f_dentry->d_inode; + int rc = -ENOTTY; /* strange error - but the precedent */ + int xid; + struct cifs_sb_info *cifs_sb; +#ifdef CONFIG_CIFS_POSIX + struct cifsFileInfo *pSMBFile = filep->private_data; + struct cifs_tcon *tcon; + __u64 ExtAttrBits = 0; + __u64 ExtAttrMask = 0; + __u64 caps; +#endif /* CONFIG_CIFS_POSIX */ + + xid = GetXid(); + + cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg); + + cifs_sb = CIFS_SB(inode->i_sb); + + switch (command) { + case CIFS_IOC_CHECKUMOUNT: + cFYI(1, "User unmount attempted"); + if (cifs_sb->mnt_uid == current_uid()) + rc = 0; + else { + rc = -EACCES; + cFYI(1, "uids do not match"); + } + break; +#ifdef CONFIG_CIFS_POSIX + case FS_IOC_GETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + if (CIFS_UNIX_EXTATTR_CAP & caps) { + rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, + &ExtAttrBits, &ExtAttrMask); + if (rc == 0) + rc = put_user(ExtAttrBits & + FS_FL_USER_VISIBLE, + (int __user *)arg); + } + break; + + case FS_IOC_SETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + if (CIFS_UNIX_EXTATTR_CAP & caps) { + if (get_user(ExtAttrBits, (int __user *)arg)) { + rc = -EFAULT; + break; + } + /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, + extAttrBits, &ExtAttrMask);*/ + } + cFYI(1, "set flags not implemented yet"); + break; +#endif /* CONFIG_CIFS_POSIX */ + default: + cFYI(1, "unsupported ioctl"); + break; + } + + FreeXid(xid); + return rc; +} diff --git a/fs/cifs/link.c b/fs/cifs/link.c new file mode 100644 
index 00000000..556b1a0b --- /dev/null +++ b/fs/cifs/link.c @@ -0,0 +1,593 @@ +/* + * fs/cifs/link.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/fs.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) +#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) +#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1)) +#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024) +#define CIFS_MF_SYMLINK_FILE_SIZE \ + (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN) + +#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n" +#define CIFS_MF_SYMLINK_MD5_FORMAT \ + "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n" +#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \ + md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \ + md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \ + md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\ + md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] + +static int +symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) +{ + int rc; + unsigned int size; + struct crypto_shash *md5; + struct sdesc *sdescmd5; + + md5 = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(md5)) { + rc = PTR_ERR(md5); + cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc); + return rc; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); + sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!sdescmd5) { + rc = -ENOMEM; + cERROR(1, "%s: Memory allocation failure\n", __func__); + goto symlink_hash_err; + } + sdescmd5->shash.tfm = md5; + sdescmd5->shash.flags = 0x0; + + rc = crypto_shash_init(&sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Could not init md5 shash\n", __func__); + goto symlink_hash_err; + } + crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + +symlink_hash_err: + crypto_free_shash(md5); + kfree(sdescmd5); + + return rc; +} + +static int +CIFSParseMFSymlink(const u8 *buf, + unsigned int buf_len, + unsigned int *_link_len, + char **_link_str) +{ + int rc; + unsigned int link_len; + const char *md5_str1; + const char *link_str; + u8 md5_hash[16]; + char md5_str2[34]; + + if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) + return -EINVAL; + + md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET]; + link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET]; + + rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len); + if (rc != 1) + return -EINVAL; + + rc = symlink_hash(link_len, link_str, md5_hash); + if (rc) { + cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc); + return
rc; + } + + snprintf(md5_str2, sizeof(md5_str2), + CIFS_MF_SYMLINK_MD5_FORMAT, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + + if (strncmp(md5_str1, md5_str2, 17) != 0) + return -EINVAL; + + if (_link_str) { + *_link_str = kstrndup(link_str, link_len, GFP_KERNEL); + if (!*_link_str) + return -ENOMEM; + } + + *_link_len = link_len; + return 0; +} + +static int +CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) +{ + int rc; + unsigned int link_len; + unsigned int ofs; + u8 md5_hash[16]; + + if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) + return -EINVAL; + + link_len = strlen(link_str); + + if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) + return -ENAMETOOLONG; + + rc = symlink_hash(link_len, link_str, md5_hash); + if (rc) { + cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc); + return rc; + } + + snprintf(buf, buf_len, + CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, + link_len, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + + ofs = CIFS_MF_SYMLINK_LINK_OFFSET; + memcpy(buf + ofs, link_str, link_len); + + ofs += link_len; + if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) { + buf[ofs] = '\n'; + ofs++; + } + + while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) { + buf[ofs] = ' '; + ofs++; + } + + return 0; +} + +static int +CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, + const char *fromName, const char *toName, + const struct nls_table *nls_codepage, int remap) +{ + int rc; + int oplock = 0; + __u16 netfid = 0; + u8 *buf; + unsigned int bytes_written = 0; + struct cifs_io_parms io_parms; + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); + if (rc != 0) { + kfree(buf); + return rc; + } + + rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + nls_codepage, remap); + if (rc != 0) { + kfree(buf); + return rc; + } + + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0); + CIFSSMBClose(xid, tcon, netfid); + kfree(buf); + if (rc != 0) + return rc; + + if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE) + return -EIO; + + return 0; +} + +static int +CIFSQueryMFSymLink(const int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, char **symlinkinfo, + const struct nls_table *nls_codepage, int remap) +{ + int rc; + int oplock = 0; + __u16 netfid = 0; + u8 *buf; + char *pbuf; + unsigned int bytes_read = 0; + int buf_type = CIFS_NO_BUFFER; + unsigned int link_len = 0; + struct cifs_io_parms io_parms; + FILE_ALL_INFO file_info; + + rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ, + CREATE_NOT_DIR, &netfid, &oplock, &file_info, + nls_codepage, remap); + if (rc != 0) + return rc; + + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + CIFSSMBClose(xid, tcon, netfid); + /* it's not a symlink */ + return -EINVAL; + } + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + pbuf = buf; + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); + CIFSSMBClose(xid, tcon, netfid); + if (rc != 0) { + kfree(buf); + return rc; + } + + rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo); + kfree(buf); + if (rc != 0) + return rc; + + 
return 0; +} + +bool +CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr) +{ + if (!(fattr->cf_mode & S_IFREG)) + /* it's not a symlink */ + return false; + + if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) + /* it's not a symlink */ + return false; + + return true; +} + +int +CIFSCheckMFSymlink(struct cifs_fattr *fattr, + const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid) +{ + int rc; + int oplock = 0; + __u16 netfid = 0; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifs_io_parms io_parms; + u8 *buf; + char *pbuf; + unsigned int bytes_read = 0; + int buf_type = CIFS_NO_BUFFER; + unsigned int link_len = 0; + FILE_ALL_INFO file_info; + + if (!CIFSCouldBeMFSymlink(fattr)) + /* it's not a symlink */ + return 0; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ, + CREATE_NOT_DIR, &netfid, &oplock, &file_info, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != 0) + goto out; + + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + CIFSSMBClose(xid, pTcon, netfid); + /* it's not a symlink */ + goto out; + } + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; + goto out; + } + pbuf = buf; + io_parms.netfid = netfid; + io_parms.pid = current->tgid; + io_parms.tcon = pTcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); + CIFSSMBClose(xid, pTcon, netfid); + if (rc != 0) { + kfree(buf); + goto out; + } + + rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL); + kfree(buf); + if (rc == -EINVAL) { + /* it's not a symlink */ + rc = 0; + goto out; + } + + if (rc != 0) + goto out; + + /* it is a symlink */ + fattr->cf_eof = link_len; + fattr->cf_mode &= ~S_IFMT; + fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; + fattr->cf_dtype = DT_LNK; +out: + cifs_put_tlink(tlink); + return rc; +} + +int +cifs_hardlink(struct dentry *old_file, struct inode *inode, + struct dentry *direntry) +{ + int rc = -EACCES; + int xid; + char *fromName = NULL; + char *toName = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct cifsInodeInfo *cifsInode; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + fromName = build_path_from_dentry(old_file); + toName = build_path_from_dentry(direntry); + if ((fromName == NULL) || (toName == NULL)) { + rc = -ENOMEM; + goto cifs_hl_exit; + } + + if (pTcon->unix_ext) + rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + else { + rc = CIFSCreateHardLink(xid, pTcon, fromName, toName, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if ((rc == -EIO) || (rc == -EINVAL)) + rc = -EOPNOTSUPP; + } + + d_drop(direntry); /* force new lookup from server of target */ + + /* if source file is cached (oplocked) revalidate will not go to server + until the file is closed or oplock broken so update nlinks locally */ + if (old_file->d_inode) { + cifsInode = CIFS_I(old_file->d_inode); + if (rc == 0) { + old_file->d_inode->i_nlink++; +/* BB should we make this contingent on superblock flag NOATIME? 
*/ +/* old_file->d_inode->i_ctime = CURRENT_TIME;*/ + /* parent dir timestamps will update from srv + within a second, would it really be worth it + to set the parent dir cifs inode time to zero + to force revalidate (faster) for it too? */ + } + /* if not oplocked will force revalidate to get info + on source file from srv */ + cifsInode->time = 0; + + /* Will update parent dir timestamps from srv within a second. + Would it really be worth it to set the parent dir (cifs + inode) time field to zero to force revalidate on parent + directory faster ie + CIFS_I(inode)->time = 0; */ + } + +cifs_hl_exit: + kfree(fromName); + kfree(toName); + FreeXid(xid); + cifs_put_tlink(tlink); + return rc; +} + +void * +cifs_follow_link(struct dentry *direntry, struct nameidata *nd) +{ + struct inode *inode = direntry->d_inode; + int rc = -ENOMEM; + int xid; + char *full_path = NULL; + char *target_path = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *tcon; + + xid = GetXid(); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + tcon = tlink_tcon(tlink); + + /* + * For now, we just handle symlinks with unix extensions enabled. + * Eventually we should handle NTFS reparse points, and MacOS + * symlink support. For instance... + * + * rc = CIFSSMBQueryReparseLinkInfo(...) + * + * For now, just return -EACCES when the server doesn't support posix + * extensions. Note that we still allow querying symlinks when posix + * extensions are manually disabled. We could disable these as well + * but there doesn't seem to be any harm in allowing the client to + * read them. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + && !(tcon->ses->capabilities & CAP_UNIX)) { + rc = -EACCES; + goto out; + } + + full_path = build_path_from_dentry(direntry); + if (!full_path) + goto out; + + cFYI(1, "Full path: %s inode = 0x%p", full_path, inode); + + rc = -EACCES; + /* + * First try Minshall+French Symlinks, if configured + * and fallback to UNIX Extensions Symlinks. + */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX)) + rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, + cifs_sb->local_nls); + + kfree(full_path); +out: + if (rc != 0) { + kfree(target_path); + target_path = ERR_PTR(rc); + } + + FreeXid(xid); + if (tlink) + cifs_put_tlink(tlink); + nd_set_link(nd, target_path); + return NULL; +} + +int +cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) +{ + int rc = -EOPNOTSUPP; + int xid; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + char *full_path = NULL; + struct inode *newinode = NULL; + + xid = GetXid(); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + goto symlink_exit; + } + pTcon = tlink_tcon(tlink); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto symlink_exit; + } + + cFYI(1, "Full path: %s", full_path); + cFYI(1, "symname is %s", symname); + + /* BB what if DFS and this volume is on different share? 
BB */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + else if (pTcon->unix_ext) + rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, + cifs_sb->local_nls); + /* else + rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName, + cifs_sb_target->local_nls); */ + + if (rc == 0) { + if (pTcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else + rc = cifs_get_inode_info(&newinode, full_path, NULL, + inode->i_sb, xid, NULL); + + if (rc != 0) { + cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d", + rc); + } else { + d_instantiate(direntry, newinode); + } + } +symlink_exit: + kfree(full_path); + cifs_put_tlink(tlink); + FreeXid(xid); + return rc; +} + +void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie) +{ + char *p = nd_get_link(nd); + if (!IS_ERR(p)) + kfree(p); +} diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c new file mode 100644 index 00000000..03a1f491 --- /dev/null +++ b/fs/cifs/misc.c @@ -0,0 +1,685 @@ +/* + * fs/cifs/misc.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "smberr.h" +#include "nterr.h" +#include "cifs_unicode.h" + +extern mempool_t *cifs_sm_req_poolp; +extern mempool_t *cifs_req_poolp; + +/* The xid serves as a useful identifier for each incoming vfs request, + in a similar way to the mid which is useful to track each sent smb, + and CurrentXid can also provide a running counter (although it + will eventually wrap past zero) of the total vfs operations handled + since the cifs fs was mounted */ + +unsigned int +_GetXid(void) +{ + unsigned int xid; + + spin_lock(&GlobalMid_Lock); + GlobalTotalActiveXid++; + + /* keep high water mark for number of simultaneous ops in filesystem */ + if (GlobalTotalActiveXid > GlobalMaxActiveXid) + GlobalMaxActiveXid = GlobalTotalActiveXid; + if (GlobalTotalActiveXid > 65000) + cFYI(1, "warning: more than 65000 requests active"); + xid = GlobalCurrentXid++; + spin_unlock(&GlobalMid_Lock); + return xid; +} + +void +_FreeXid(unsigned int xid) +{ + spin_lock(&GlobalMid_Lock); + /* if (GlobalTotalActiveXid == 0) + BUG(); */ + GlobalTotalActiveXid--; + spin_unlock(&GlobalMid_Lock); +} + +struct cifs_ses * +sesInfoAlloc(void) +{ + struct cifs_ses *ret_buf; + + ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL); + if (ret_buf) { + atomic_inc(&sesInfoAllocCount); + ret_buf->status = CifsNew; + ++ret_buf->ses_count; + INIT_LIST_HEAD(&ret_buf->smb_ses_list); + INIT_LIST_HEAD(&ret_buf->tcon_list); + mutex_init(&ret_buf->session_mutex); + } + return ret_buf; +} + +void +sesInfoFree(struct cifs_ses *buf_to_free) +{ + if (buf_to_free == NULL) { + cFYI(1, "Null buffer passed to sesInfoFree"); + return; + } + + atomic_dec(&sesInfoAllocCount); + kfree(buf_to_free->serverOS); + kfree(buf_to_free->serverDomain); + kfree(buf_to_free->serverNOS); + if (buf_to_free->password) { + memset(buf_to_free->password, 0, strlen(buf_to_free->password)); + kfree(buf_to_free->password); + } + kfree(buf_to_free->user_name); + kfree(buf_to_free->domainName); + kfree(buf_to_free); +} + +struct cifs_tcon * +tconInfoAlloc(void) +{ + struct cifs_tcon *ret_buf; + ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); + if (ret_buf) { + atomic_inc(&tconInfoAllocCount); + ret_buf->tidStatus = CifsNew; + ++ret_buf->tc_count; + INIT_LIST_HEAD(&ret_buf->openFileList); + INIT_LIST_HEAD(&ret_buf->tcon_list); +#ifdef CONFIG_CIFS_STATS + spin_lock_init(&ret_buf->stat_lock); +#endif + } + return ret_buf; +} + +void +tconInfoFree(struct cifs_tcon *buf_to_free) +{ + if (buf_to_free == NULL) { + cFYI(1, "Null buffer passed to tconInfoFree"); + return; + } + atomic_dec(&tconInfoAllocCount); + kfree(buf_to_free->nativeFileSystem); + if (buf_to_free->password) { + memset(buf_to_free->password, 0, strlen(buf_to_free->password)); + kfree(buf_to_free->password); + } + kfree(buf_to_free); +} + +struct smb_hdr * +cifs_buf_get(void) +{ + struct smb_hdr *ret_buf = NULL; + +/* We could use negotiated size instead of max_msgsize - + but it may be more efficient to always alloc same size + albeit slightly larger than necessary and maxbuffersize + defaults to this and can not be bigger */ + ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS); + + /* clear the first few header bytes */ + /* for most paths, more is 
cleared in header_assemble */ + if (ret_buf) { + memset(ret_buf, 0, sizeof(struct smb_hdr) + 3); + atomic_inc(&bufAllocCount); +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&totBufAllocCount); +#endif /* CONFIG_CIFS_STATS2 */ + } + + return ret_buf; +} + +void +cifs_buf_release(void *buf_to_free) +{ + if (buf_to_free == NULL) { + /* cFYI(1, "Null buffer passed to cifs_buf_release");*/ + return; + } + mempool_free(buf_to_free, cifs_req_poolp); + + atomic_dec(&bufAllocCount); + return; +} + +struct smb_hdr * +cifs_small_buf_get(void) +{ + struct smb_hdr *ret_buf = NULL; + +/* We could use negotiated size instead of max_msgsize - + but it may be more efficient to always alloc same size + albeit slightly larger than necessary and maxbuffersize + defaults to this and can not be bigger */ + ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); + if (ret_buf) { + /* No need to clear memory here, cleared in header assemble */ + /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ + atomic_inc(&smBufAllocCount); +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&totSmBufAllocCount); +#endif /* CONFIG_CIFS_STATS2 */ + + } + return ret_buf; +} + +void +cifs_small_buf_release(void *buf_to_free) +{ + + if (buf_to_free == NULL) { + cFYI(1, "Null buffer passed to cifs_small_buf_release"); + return; + } + mempool_free(buf_to_free, cifs_sm_req_poolp); + + atomic_dec(&smBufAllocCount); + return; +} + +/* + Find a free multiplex id (SMB mid). Otherwise there could be + mid collisions which might cause problems, demultiplexing the + wrong response to this request. Multiplex ids could collide if + one of a series requests takes much longer than the others, or + if a very large number of long lived requests (byte range + locks or FindNotify requests) are pending. No more than + 64K-1 requests can be outstanding at one time. If no + mids are available, return zero. A future optimization + could make the combination of mids and uid the key we use + to demultiplex on (rather than mid alone). + In addition to the above check, the cifs demultiplex + code already used the command code as a secondary + check of the frame and if signing is negotiated the + response would be discarded if the mid were the same + but the signature was wrong. Since the mid is not put in the + pending queue until later (when it is about to be dispatched) + we do have to limit the number of outstanding requests + to somewhat less than 64K-1 although it is hard to imagine + so many threads being in the vfs at one time. +*/ +__u16 GetNextMid(struct TCP_Server_Info *server) +{ + __u16 mid = 0; + __u16 last_mid; + bool collision; + + spin_lock(&GlobalMid_Lock); + last_mid = server->CurrentMid; /* we do not want to loop forever */ + server->CurrentMid++; + /* This nested loop looks more expensive than it is. 
+ In practice the list of pending requests is short, + fewer than 50, and the mids are likely to be unique + on the first pass through the loop unless some request + takes longer than the 64 thousand requests before it + (and it would also have to have been a request that + did not time out) */ + while (server->CurrentMid != last_mid) { + struct mid_q_entry *mid_entry; + unsigned int num_mids; + + collision = false; + if (server->CurrentMid == 0) + server->CurrentMid++; + + num_mids = 0; + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { + ++num_mids; + if (mid_entry->mid == server->CurrentMid && + mid_entry->midState == MID_REQUEST_SUBMITTED) { + /* This mid is in use, try a different one */ + collision = true; + break; + } + } + + /* + * if we have more than 32k mids in the list, then something + * is very wrong. Possibly a local user is trying to DoS the + * box by issuing long-running calls and SIGKILL'ing them. If + * we get to 2^16 mids then we're in big trouble as this + * function could loop forever. + * + * Go ahead and assign out the mid in this situation, but force + * an eventual reconnect to clean out the pending_mid_q. + */ + if (num_mids > 32768) + server->tcpStatus = CifsNeedReconnect; + + if (!collision) { + mid = server->CurrentMid; + break; + } + server->CurrentMid++; + } + spin_unlock(&GlobalMid_Lock); + return mid; +} + +/* NB: MID can not be set if treeCon not passed in, in that + case it is responsbility of caller to set the mid */ +void +header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , + const struct cifs_tcon *treeCon, int word_count + /* length of fixed section (word count) in two byte units */) +{ + struct list_head *temp_item; + struct cifs_ses *ses; + char *temp = (char *) buffer; + + memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */ + + buffer->smb_buf_length = cpu_to_be32( + (2 * word_count) + sizeof(struct smb_hdr) - + 4 /* RFC 1001 length field does not count */ + + 2 /* for bcc field itself */) ; + + buffer->Protocol[0] = 0xFF; + buffer->Protocol[1] = 'S'; + buffer->Protocol[2] = 'M'; + buffer->Protocol[3] = 'B'; + buffer->Command = smb_command; + buffer->Flags = 0x00; /* case sensitive */ + buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES; + buffer->Pid = cpu_to_le16((__u16)current->tgid); + buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16)); + if (treeCon) { + buffer->Tid = treeCon->tid; + if (treeCon->ses) { + if (treeCon->ses->capabilities & CAP_UNICODE) + buffer->Flags2 |= SMBFLG2_UNICODE; + if (treeCon->ses->capabilities & CAP_STATUS32) + buffer->Flags2 |= SMBFLG2_ERR_STATUS; + + /* Uid is not converted */ + buffer->Uid = treeCon->ses->Suid; + buffer->Mid = GetNextMid(treeCon->ses->server); + if (multiuser_mount != 0) { + /* For the multiuser case, there are few obvious technically */ + /* possible mechanisms to match the local linux user (uid) */ + /* to a valid remote smb user (smb_uid): */ + /* 1) Query Winbind (or other local pam/nss daemon */ + /* for userid/password/logon_domain or credential */ + /* 2) Query Winbind for uid to sid to username mapping */ + /* and see if we have a matching password for existing*/ + /* session for that user perhas getting password by */ + /* adding a new pam_cifs module that stores passwords */ + /* so that the cifs vfs can get at that for all logged*/ + /* on users */ + /* 3) (Which is the mechanism we have chosen) */ + /* Search through sessions to the same server for a */ + /* a match on the uid that was passed in on mount */ + /* with the current processes uid 
(or euid?) and use */ + /* that smb uid. If no existing smb session for */ + /* that uid found, use the default smb session ie */ + /* the smb session for the volume mounted which is */ + /* the same as would be used if the multiuser mount */ + /* flag were disabled. */ + + /* BB Add support for establishing new tCon and SMB Session */ + /* with userid/password pairs found on the smb session */ + /* for other target tcp/ip addresses BB */ + if (current_fsuid() != treeCon->ses->linux_uid) { + cFYI(1, "Multiuser mode and UID " + "did not match tcon uid"); + spin_lock(&cifs_tcp_ses_lock); + list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { + ses = list_entry(temp_item, struct cifs_ses, smb_ses_list); + if (ses->linux_uid == current_fsuid()) { + if (ses->server == treeCon->ses->server) { + cFYI(1, "found matching uid substitute right smb_uid"); + buffer->Uid = ses->Suid; + break; + } else { + /* BB eventually call cifs_setup_session here */ + cFYI(1, "local UID found but no smb sess with this server exists"); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + } + } + } + if (treeCon->Flags & SMB_SHARE_IS_IN_DFS) + buffer->Flags2 |= SMBFLG2_DFS; + if (treeCon->nocase) + buffer->Flags |= SMBFLG_CASELESS; + if ((treeCon->ses) && (treeCon->ses->server)) + if (treeCon->ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + } + +/* endian conversion of flags is now done just before sending */ + buffer->WordCount = (char) word_count; + return; +} + +static int +check_smb_hdr(struct smb_hdr *smb, __u16 mid) +{ + /* does it have the right SMB "signature" ? */ + if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) { + cERROR(1, "Bad protocol string signature header 0x%x", + *(unsigned int *)smb->Protocol); + return 1; + } + + /* Make sure that message ids match */ + if (mid != smb->Mid) { + cERROR(1, "Mids do not match. received=%u expected=%u", + smb->Mid, mid); + return 1; + } + + /* if it's a response then accept */ + if (smb->Flags & SMBFLG_RESPONSE) + return 0; + + /* only one valid case where server sends us request */ + if (smb->Command == SMB_COM_LOCKING_ANDX) + return 0; + + cERROR(1, "Server sent request, not response. mid=%u", smb->Mid); + return 1; +} + +int +checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) +{ + __u32 len = be32_to_cpu(smb->smb_buf_length); + __u32 clc_len; /* calculated length */ + cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); + + if (length < 2 + sizeof(struct smb_hdr)) { + if ((length >= sizeof(struct smb_hdr) - 1) + && (smb->Status.CifsError != 0)) { + smb->WordCount = 0; + /* some error cases do not return wct and bcc */ + return 0; + } else if ((length == sizeof(struct smb_hdr) + 1) && + (smb->WordCount == 0)) { + char *tmp = (char *)smb; + /* Need to work around a bug in two servers here */ + /* First, check if the part of bcc they sent was zero */ + if (tmp[sizeof(struct smb_hdr)] == 0) { + /* some servers return only half of bcc + * on simple responses (wct, bcc both zero) + * in particular have seen this on + * ulogoffX and FindClose. 
This leaves + * one byte of bcc potentially unitialized + */ + /* zero rest of bcc */ + tmp[sizeof(struct smb_hdr)+1] = 0; + return 0; + } + cERROR(1, "rcvd invalid byte count (bcc)"); + } else { + cERROR(1, "Length less than smb header size"); + } + return 1; + } + if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "smb length greater than MaxBufSize, mid=%d", + smb->Mid); + return 1; + } + + if (check_smb_hdr(smb, mid)) + return 1; + clc_len = smbCalcSize(smb); + + if (4 + len != length) { + cERROR(1, "Length read does not match RFC1001 length %d", + len); + return 1; + } + + if (4 + len != clc_len) { + /* check if bcc wrapped around for large read responses */ + if ((len > 64 * 1024) && (len > clc_len)) { + /* check if lengths match mod 64K */ + if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) + return 0; /* bcc wrapped */ + } + cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", + clc_len, 4 + len, smb->Mid); + + if (4 + len < clc_len) { + cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", + len, smb->Mid); + return 1; + } else if (len > clc_len + 512) { + /* + * Some servers (Windows XP in particular) send more + * data than the lengths in the SMB packet would + * indicate on certain calls (byte range locks and + * trans2 find first calls in particular). While the + * client can handle such a frame by ignoring the + * trailing data, we choose limit the amount of extra + * data to 512 bytes. + */ + cERROR(1, "RFC1001 size %u more than 512 bytes larger " + "than SMB for mid=%u", len, smb->Mid); + return 1; + } + } + return 0; +} + +bool +is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) +{ + struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; + struct list_head *tmp, *tmp1, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct cifsInodeInfo *pCifsInode; + struct cifsFileInfo *netfile; + + cFYI(1, "Checking for oplock break or dnotify response"); + if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && + (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { + struct smb_com_transaction_change_notify_rsp *pSMBr = + (struct smb_com_transaction_change_notify_rsp *)buf; + struct file_notify_information *pnotify; + __u32 data_offset = 0; + if (get_bcc(buf) > sizeof(struct file_notify_information)) { + data_offset = le32_to_cpu(pSMBr->DataOffset); + + pnotify = (struct file_notify_information *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + cFYI(1, "dnotify on %s Action: 0x%x", + pnotify->FileName, pnotify->Action); + /* cifs_dump_mem("Rcvd notify Data: ",buf, + sizeof(struct smb_hdr)+60); */ + return true; + } + if (pSMBr->hdr.Status.CifsError) { + cFYI(1, "notify err 0x%d", + pSMBr->hdr.Status.CifsError); + return true; + } + return false; + } + if (pSMB->hdr.Command != SMB_COM_LOCKING_ANDX) + return false; + if (pSMB->hdr.Flags & SMBFLG_RESPONSE) { + /* no sense logging error on invalid handle on oplock + break - harmless race between close request and oplock + break response is expected from time to time writing out + large dirty files cached on the client */ + if ((NT_STATUS_INVALID_HANDLE) == + le32_to_cpu(pSMB->hdr.Status.CifsError)) { + cFYI(1, "invalid handle on oplock break"); + return true; + } else if (ERRbadfid == + le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { + return true; + } else { + return false; /* on valid oplock brk we get "request" */ + } + } + if (pSMB->hdr.WordCount != 8) + return false; + + cFYI(1, "oplock type 0x%d level 0x%d", + pSMB->LockType, pSMB->OplockLevel); + if (!(pSMB->LockType & 
LOCKING_ANDX_OPLOCK_RELEASE)) + return false; + + /* look up tcon based on tid & uid */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &srv->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + if (tcon->tid != buf->Tid) + continue; + + cifs_stats_inc(&tcon->num_oplock_brks); + spin_lock(&cifs_file_list_lock); + list_for_each(tmp2, &tcon->openFileList) { + netfile = list_entry(tmp2, struct cifsFileInfo, + tlist); + if (pSMB->Fid != netfile->netfid) + continue; + + cFYI(1, "file id match, oplock break"); + pCifsInode = CIFS_I(netfile->dentry->d_inode); + + cifs_set_oplock_level(pCifsInode, + pSMB->OplockLevel ? OPLOCK_READ : 0); + /* + * cifs_oplock_break_put() can't be called + * from here. Get reference after queueing + * succeeded. cifs_oplock_break() will + * synchronize using cifs_file_list_lock. + */ + if (queue_work(system_nrt_wq, + &netfile->oplock_break)) + cifs_oplock_break_get(netfile); + netfile->oplock_break_cancelled = false; + + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + cFYI(1, "No matching file for oplock break"); + return true; + } + } + spin_unlock(&cifs_tcp_ses_lock); + cFYI(1, "Can not process oplock break for non-existent connection"); + return true; +} + +void +dump_smb(struct smb_hdr *smb_buf, int smb_buf_length) +{ + int i, j; + char debug_line[17]; + unsigned char *buffer; + + if (traceSMB == 0) + return; + + buffer = (unsigned char *) smb_buf; + for (i = 0, j = 0; i < smb_buf_length; i++, j++) { + if (i % 8 == 0) { + /* have reached the beginning of line */ + printk(KERN_DEBUG "| "); + j = 0; + } + printk("%0#4x ", buffer[i]); + debug_line[2 * j] = ' '; + if (isprint(buffer[i])) + debug_line[1 + (2 * j)] = buffer[i]; + else + debug_line[1 + (2 * j)] = '_'; + + if (i % 8 == 7) { + /* reached end of line, time to print ascii */ + debug_line[16] = 0; + printk(" | %s\n", debug_line); + } + } + for (; j < 8; j++) { + printk(" "); + debug_line[2 * j] = ' '; + debug_line[1 + (2 * j)] = ' '; + } + printk(" | %s\n", debug_line); + return; +} + +void +cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + cERROR(1, "Autodisabling the use of server inode numbers on " + "%s. This server doesn't seem to support them " + "properly. Hardlinks will not be recognized on this " + "mount. 
Consider mounting with the \"noserverino\" " + "option to silence this message.", + cifs_sb_master_tcon(cifs_sb)->treeName); + } +} + +void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) +{ + oplock &= 0xF; + + if (oplock == OPLOCK_EXCLUSIVE) { + cinode->clientCanCacheAll = true; + cinode->clientCanCacheRead = true; + cFYI(1, "Exclusive Oplock granted on inode %p", + &cinode->vfs_inode); + } else if (oplock == OPLOCK_READ) { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = true; + cFYI(1, "Level II Oplock granted on inode %p", + &cinode->vfs_inode); + } else { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = false; + } +} diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c new file mode 100644 index 00000000..73e47e84 --- /dev/null +++ b/fs/cifs/netmisc.c @@ -0,0 +1,1008 @@ +/* + * fs/cifs/netmisc.c + * + * Copyright (c) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * Error mapping routines from Samba libsmb/errormap.c + * Copyright (C) Andrew Tridgell 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "smberr.h" +#include "cifs_debug.h" +#include "nterr.h" + +struct smb_to_posix_error { + __u16 smb_err; + int posix_code; +}; + +static const struct smb_to_posix_error mapping_table_ERRDOS[] = { + {ERRbadfunc, -EINVAL}, + {ERRbadfile, -ENOENT}, + {ERRbadpath, -ENOTDIR}, + {ERRnofids, -EMFILE}, + {ERRnoaccess, -EACCES}, + {ERRbadfid, -EBADF}, + {ERRbadmcb, -EIO}, + {ERRnomem, -ENOMEM}, + {ERRbadmem, -EFAULT}, + {ERRbadenv, -EFAULT}, + {ERRbadformat, -EINVAL}, + {ERRbadaccess, -EACCES}, + {ERRbaddata, -EIO}, + {ERRbaddrive, -ENXIO}, + {ERRremcd, -EACCES}, + {ERRdiffdevice, -EXDEV}, + {ERRnofiles, -ENOENT}, + {ERRwriteprot, -EROFS}, + {ERRbadshare, -ETXTBSY}, + {ERRlock, -EACCES}, + {ERRunsup, -EINVAL}, + {ERRnosuchshare, -ENXIO}, + {ERRfilexists, -EEXIST}, + {ERRinvparm, -EINVAL}, + {ERRdiskfull, -ENOSPC}, + {ERRinvname, -ENOENT}, + {ERRinvlevel, -EOPNOTSUPP}, + {ERRdirnotempty, -ENOTEMPTY}, + {ERRnotlocked, -ENOLCK}, + {ERRcancelviolation, -ENOLCK}, + {ERRalreadyexists, -EEXIST}, + {ERRmoredata, -EOVERFLOW}, + {ERReasnotsupported, -EOPNOTSUPP}, + {ErrQuota, -EDQUOT}, + {ErrNotALink, -ENOLINK}, + {ERRnetlogonNotStarted, -ENOPROTOOPT}, + {ERRsymlink, -EOPNOTSUPP}, + {ErrTooManyLinks, -EMLINK}, + {0, 0} +}; + +static const struct smb_to_posix_error mapping_table_ERRSRV[] = { + {ERRerror, -EIO}, + {ERRbadpw, -EACCES}, /* was EPERM */ + {ERRbadtype, -EREMOTE}, + {ERRaccess, -EACCES}, + {ERRinvtid, -ENXIO}, + {ERRinvnetname, -ENXIO}, + {ERRinvdevice, -ENXIO}, + {ERRqfull, -ENOSPC}, + {ERRqtoobig, -ENOSPC}, + {ERRqeof, -EIO}, + 
{ERRinvpfid, -EBADF}, + {ERRsmbcmd, -EBADRQC}, + {ERRsrverror, -EIO}, + {ERRbadBID, -EIO}, + {ERRfilespecs, -EINVAL}, + {ERRbadLink, -EIO}, + {ERRbadpermits, -EINVAL}, + {ERRbadPID, -ESRCH}, + {ERRsetattrmode, -EINVAL}, + {ERRpaused, -EHOSTDOWN}, + {ERRmsgoff, -EHOSTDOWN}, + {ERRnoroom, -ENOSPC}, + {ERRrmuns, -EUSERS}, + {ERRtimeout, -ETIME}, + {ERRnoresource, -ENOBUFS}, + {ERRtoomanyuids, -EUSERS}, + {ERRbaduid, -EACCES}, + {ERRusempx, -EIO}, + {ERRusestd, -EIO}, + {ERR_NOTIFY_ENUM_DIR, -ENOBUFS}, + {ERRnoSuchUser, -EACCES}, +/* {ERRaccountexpired, -EACCES}, + {ERRbadclient, -EACCES}, + {ERRbadLogonTime, -EACCES}, + {ERRpasswordExpired, -EACCES},*/ + {ERRaccountexpired, -EKEYEXPIRED}, + {ERRbadclient, -EACCES}, + {ERRbadLogonTime, -EACCES}, + {ERRpasswordExpired, -EKEYEXPIRED}, + + {ERRnosupport, -EINVAL}, + {0, 0} +}; + +static const struct smb_to_posix_error mapping_table_ERRHRD[] = { + {0, 0} +}; + +/* + * Convert a string containing text IPv4 or IPv6 address to binary form. + * + * Returns 0 on failure. + */ +static int +cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) +{ + int ret = 0; + + /* calculate length by finding first slash or NULL */ + if (address_family == AF_INET) + ret = in4_pton(cp, len, dst, '\\', NULL); + else if (address_family == AF_INET6) + ret = in6_pton(cp, len, dst , '\\', NULL); + + cFYI(DBG2, "address conversion returned %d for %*.*s", + ret, len, len, cp); + if (ret > 0) + ret = 1; + return ret; +} + +/* + * Try to convert a string to an IPv4 address and then attempt to convert + * it to an IPv6 address if that fails. Set the family field if either + * succeeds. If it's an IPv6 address and it has a '%' sign in it, try to + * treat the part following it as a numeric sin6_scope_id. + * + * Returns 0 on failure. + */ +int +cifs_convert_address(struct sockaddr *dst, const char *src, int len) +{ + int rc, alen, slen; + const char *pct; + char scope_id[13]; + struct sockaddr_in *s4 = (struct sockaddr_in *) dst; + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; + + /* IPv4 address */ + if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) { + s4->sin_family = AF_INET; + return 1; + } + + /* attempt to exclude the scope ID from the address part */ + pct = memchr(src, '%', len); + alen = pct ? pct - src : len; + + rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr); + if (!rc) + return rc; + + s6->sin6_family = AF_INET6; + if (pct) { + /* grab the scope ID */ + slen = len - (alen + 1); + if (slen <= 0 || slen > 12) + return 0; + memcpy(scope_id, pct + 1, slen); + scope_id[slen] = '\0'; + + rc = strict_strtoul(scope_id, 0, + (unsigned long *)&s6->sin6_scope_id); + rc = (rc == 0) ? 
1 : 0; + } + + return rc; +} + +int +cifs_set_port(struct sockaddr *addr, const unsigned short int port) +{ + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *)addr)->sin_port = htons(port); + break; + case AF_INET6: + ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); + break; + default: + return 0; + } + return 1; +} + +int +cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + const unsigned short int port) +{ + if (!cifs_convert_address(dst, src, len)) + return 0; + return cifs_set_port(dst, port); +} + +/***************************************************************************** +convert a NT status code to a dos class/code + *****************************************************************************/ +/* NT status -> dos error map */ +static const struct { + __u8 dos_class; + __u16 dos_code; + __u32 ntstatus; +} ntstatus_to_dos_map[] = { + { + ERRDOS, ERRgeneral, NT_STATUS_UNSUCCESSFUL}, { + ERRDOS, ERRbadfunc, NT_STATUS_NOT_IMPLEMENTED}, { + ERRDOS, ERRinvlevel, NT_STATUS_INVALID_INFO_CLASS}, { + ERRDOS, 24, NT_STATUS_INFO_LENGTH_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_ACCESS_VIOLATION}, { + ERRHRD, ERRgeneral, NT_STATUS_IN_PAGE_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA}, { + ERRDOS, ERRbadfid, NT_STATUS_INVALID_HANDLE}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_INITIAL_STACK}, { + ERRDOS, 193, NT_STATUS_BAD_INITIAL_PC}, { + ERRDOS, 87, NT_STATUS_INVALID_CID}, { + ERRHRD, ERRgeneral, NT_STATUS_TIMER_NOT_CANCELED}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER}, { + ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_DEVICE}, { + ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_FILE}, { + ERRDOS, ERRbadfunc, NT_STATUS_INVALID_DEVICE_REQUEST}, { + ERRDOS, 38, NT_STATUS_END_OF_FILE}, { + ERRDOS, 34, NT_STATUS_WRONG_VOLUME}, { + ERRDOS, 21, NT_STATUS_NO_MEDIA_IN_DEVICE}, { + ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_MEDIA}, { + ERRDOS, 27, NT_STATUS_NONEXISTENT_SECTOR}, +/* { This NT error code was 'sqashed' + from NT_STATUS_MORE_PROCESSING_REQUIRED to NT_STATUS_OK + during the session setup } */ + { + ERRDOS, ERRnomem, NT_STATUS_NO_MEMORY}, { + ERRDOS, 487, NT_STATUS_CONFLICTING_ADDRESSES}, { + ERRDOS, 487, NT_STATUS_NOT_MAPPED_VIEW}, { + ERRDOS, 87, NT_STATUS_UNABLE_TO_FREE_VM}, { + ERRDOS, 87, NT_STATUS_UNABLE_TO_DELETE_SECTION}, { + ERRDOS, 2142, NT_STATUS_INVALID_SYSTEM_SERVICE}, { + ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_INSTRUCTION}, { + ERRDOS, ERRnoaccess, NT_STATUS_INVALID_LOCK_SEQUENCE}, { + ERRDOS, ERRnoaccess, NT_STATUS_INVALID_VIEW_SIZE}, { + ERRDOS, 193, NT_STATUS_INVALID_FILE_FOR_SECTION}, { + ERRDOS, ERRnoaccess, NT_STATUS_ALREADY_COMMITTED}, +/* { This NT error code was 'sqashed' + from NT_STATUS_ACCESS_DENIED to NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE + during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_ACCESS_DENIED}, { + ERRDOS, 111, NT_STATUS_BUFFER_TOO_SMALL}, { + ERRDOS, ERRbadfid, NT_STATUS_OBJECT_TYPE_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_NONCONTINUABLE_EXCEPTION}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_DISPOSITION}, { + ERRHRD, ERRgeneral, NT_STATUS_UNWIND}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_STACK}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_UNWIND_TARGET}, { + ERRDOS, 158, NT_STATUS_NOT_LOCKED}, { + ERRHRD, ERRgeneral, NT_STATUS_PARITY_ERROR}, { + ERRDOS, 487, NT_STATUS_UNABLE_TO_DECOMMIT_VM}, { + ERRDOS, 487, NT_STATUS_NOT_COMMITTED}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_PORT_ATTRIBUTES}, { + ERRHRD, ERRgeneral, NT_STATUS_PORT_MESSAGE_TOO_LONG}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_MIX}, 
{ + ERRHRD, ERRgeneral, NT_STATUS_INVALID_QUOTA_LOWER}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_CORRUPT_ERROR}, { + /* mapping changed since shell does lookup on * expects FileNotFound */ + ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_INVALID}, { + ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_NOT_FOUND}, { + ERRDOS, ERRalreadyexists, NT_STATUS_OBJECT_NAME_COLLISION}, { + ERRHRD, ERRgeneral, NT_STATUS_HANDLE_NOT_WAITABLE}, { + ERRDOS, ERRbadfid, NT_STATUS_PORT_DISCONNECTED}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_ALREADY_ATTACHED}, { + ERRDOS, 161, NT_STATUS_OBJECT_PATH_INVALID}, { + ERRDOS, ERRbadpath, NT_STATUS_OBJECT_PATH_NOT_FOUND}, { + ERRDOS, 161, NT_STATUS_OBJECT_PATH_SYNTAX_BAD}, { + ERRHRD, ERRgeneral, NT_STATUS_DATA_OVERRUN}, { + ERRHRD, ERRgeneral, NT_STATUS_DATA_LATE_ERROR}, { + ERRDOS, 23, NT_STATUS_DATA_ERROR}, { + ERRDOS, 23, NT_STATUS_CRC_ERROR}, { + ERRDOS, ERRnomem, NT_STATUS_SECTION_TOO_BIG}, { + ERRDOS, ERRnoaccess, NT_STATUS_PORT_CONNECTION_REFUSED}, { + ERRDOS, ERRbadfid, NT_STATUS_INVALID_PORT_HANDLE}, { + ERRDOS, ERRbadshare, NT_STATUS_SHARING_VIOLATION}, { + ERRHRD, ERRgeneral, NT_STATUS_QUOTA_EXCEEDED}, { + ERRDOS, 87, NT_STATUS_INVALID_PAGE_PROTECTION}, { + ERRDOS, 288, NT_STATUS_MUTANT_NOT_OWNED}, { + ERRDOS, 298, NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED}, { + ERRDOS, 87, NT_STATUS_PORT_ALREADY_SET}, { + ERRDOS, 87, NT_STATUS_SECTION_NOT_IMAGE}, { + ERRDOS, 156, NT_STATUS_SUSPEND_COUNT_EXCEEDED}, { + ERRDOS, ERRnoaccess, NT_STATUS_THREAD_IS_TERMINATING}, { + ERRDOS, 87, NT_STATUS_BAD_WORKING_SET_LIMIT}, { + ERRDOS, 87, NT_STATUS_INCOMPATIBLE_FILE_MAP}, { + ERRDOS, 87, NT_STATUS_SECTION_PROTECTION}, { + ERRDOS, ERReasnotsupported, NT_STATUS_EAS_NOT_SUPPORTED}, { + ERRDOS, 255, NT_STATUS_EA_TOO_LARGE}, { + ERRHRD, ERRgeneral, NT_STATUS_NONEXISTENT_EA_ENTRY}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_EAS_ON_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_EA_CORRUPT_ERROR}, { + ERRDOS, ERRlock, NT_STATUS_FILE_LOCK_CONFLICT}, { + ERRDOS, ERRlock, NT_STATUS_LOCK_NOT_GRANTED}, { + ERRDOS, ERRbadfile, NT_STATUS_DELETE_PENDING}, { + ERRDOS, ERRunsup, NT_STATUS_CTL_FILE_NOT_SUPPORTED}, { + ERRHRD, ERRgeneral, NT_STATUS_UNKNOWN_REVISION}, { + ERRHRD, ERRgeneral, NT_STATUS_REVISION_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_OWNER}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_PRIMARY_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_IMPERSONATION_TOKEN}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_DISABLE_MANDATORY}, { + ERRDOS, 2215, NT_STATUS_NO_LOGON_SERVERS}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, { + ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, { + ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS}, +/* { This NT error code was 'sqashed' + from NT_STATUS_NO_SUCH_USER to NT_STATUS_LOGON_FAILURE + during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_NO_SUCH_USER}, { /* could map to 2238 */ + ERRHRD, ERRgeneral, NT_STATUS_GROUP_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_LAST_ADMIN}, +/* { This NT error code was 'sqashed' + from NT_STATUS_WRONG_PASSWORD to NT_STATUS_LOGON_FAILURE + during the session setup } */ + { + ERRSRV, ERRbadpw, NT_STATUS_WRONG_PASSWORD}, { + ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_PASSWORD}, { + ERRHRD, ERRgeneral, NT_STATUS_PASSWORD_RESTRICTION}, { + ERRDOS, ERRnoaccess, 
NT_STATUS_LOGON_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, { + ERRSRV, ERRbadLogonTime, NT_STATUS_INVALID_LOGON_HOURS}, { + ERRSRV, ERRbadclient, NT_STATUS_INVALID_WORKSTATION}, { + ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_EXPIRED}, { + ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_DISABLED}, { + ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, { + ERRHRD, ERRgeneral, NT_STATUS_LUIDS_EXHAUSTED}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SUB_AUTHORITY}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACL}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SID}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SECURITY_DESCR}, { + ERRDOS, 127, NT_STATUS_PROCEDURE_NOT_FOUND}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_FORMAT}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_TOKEN}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_INHERITANCE_ACL}, { + ERRDOS, 158, NT_STATUS_RANGE_NOT_LOCKED}, { + ERRDOS, 112, NT_STATUS_DISK_FULL}, { + ERRHRD, ERRgeneral, NT_STATUS_SERVER_DISABLED}, { + ERRHRD, ERRgeneral, NT_STATUS_SERVER_NOT_DISABLED}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_GUIDS_REQUESTED}, { + ERRDOS, 259, NT_STATUS_GUIDS_EXHAUSTED}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ID_AUTHORITY}, { + ERRDOS, 259, NT_STATUS_AGENTS_EXHAUSTED}, { + ERRDOS, 154, NT_STATUS_INVALID_VOLUME_LABEL}, { + ERRDOS, 14, NT_STATUS_SECTION_NOT_EXTENDED}, { + ERRDOS, 487, NT_STATUS_NOT_MAPPED_DATA}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_DATA_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_TYPE_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_NAME_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_ARRAY_BOUNDS_EXCEEDED}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DENORMAL_OPERAND}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DIVIDE_BY_ZERO}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INEXACT_RESULT}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INVALID_OPERATION}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_STACK_CHECK}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOAT_UNDERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_INTEGER_DIVIDE_BY_ZERO}, { + ERRDOS, 534, NT_STATUS_INTEGER_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_PRIVILEGED_INSTRUCTION}, { + ERRDOS, ERRnomem, NT_STATUS_TOO_MANY_PAGING_FILES}, { + ERRHRD, ERRgeneral, NT_STATUS_FILE_INVALID}, { + ERRHRD, ERRgeneral, NT_STATUS_ALLOTTED_SPACE_EXCEEDED}, +/* { This NT error code was 'sqashed' + from NT_STATUS_INSUFFICIENT_RESOURCES to + NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */ + { + ERRDOS, ERRnomem, NT_STATUS_INSUFFICIENT_RESOURCES}, { + ERRDOS, ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, { + ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, { + ERRDOS, 21, NT_STATUS_DEVICE_POWER_FAILURE}, { + ERRDOS, 487, NT_STATUS_FREE_VM_NOT_AT_BASE}, { + ERRDOS, 487, NT_STATUS_MEMORY_NOT_ALLOCATED}, { + ERRHRD, ERRgeneral, NT_STATUS_WORKING_SET_QUOTA}, { + ERRDOS, 19, NT_STATUS_MEDIA_WRITE_PROTECTED}, { + ERRDOS, 21, NT_STATUS_DEVICE_NOT_READY}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_GROUP_ATTRIBUTES}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_IMPERSONATION_LEVEL}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_OPEN_ANONYMOUS}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_VALIDATION_CLASS}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_TOKEN_TYPE}, { + ERRDOS, 87, NT_STATUS_BAD_MASTER_BOOT_RECORD}, { + ERRHRD, ERRgeneral, NT_STATUS_INSTRUCTION_MISALIGNMENT}, { + ERRDOS, ERRpipebusy, NT_STATUS_INSTANCE_NOT_AVAILABLE}, { + ERRDOS, ERRpipebusy, 
NT_STATUS_PIPE_NOT_AVAILABLE}, { + ERRDOS, ERRbadpipe, NT_STATUS_INVALID_PIPE_STATE}, { + ERRDOS, ERRpipebusy, NT_STATUS_PIPE_BUSY}, { + ERRDOS, ERRbadfunc, NT_STATUS_ILLEGAL_FUNCTION}, { + ERRDOS, ERRnotconnected, NT_STATUS_PIPE_DISCONNECTED}, { + ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_CLOSING}, { + ERRHRD, ERRgeneral, NT_STATUS_PIPE_CONNECTED}, { + ERRHRD, ERRgeneral, NT_STATUS_PIPE_LISTENING}, { + ERRDOS, ERRbadpipe, NT_STATUS_INVALID_READ_MODE}, { + ERRDOS, 121, NT_STATUS_IO_TIMEOUT}, { + ERRDOS, 38, NT_STATUS_FILE_FORCED_CLOSED}, { + ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STARTED}, { + ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STOPPED}, { + ERRHRD, ERRgeneral, NT_STATUS_COULD_NOT_INTERPRET}, { + ERRDOS, ERRnoaccess, NT_STATUS_FILE_IS_A_DIRECTORY}, { + ERRDOS, ERRunsup, NT_STATUS_NOT_SUPPORTED}, { + ERRDOS, 51, NT_STATUS_REMOTE_NOT_LISTENING}, { + ERRDOS, 52, NT_STATUS_DUPLICATE_NAME}, { + ERRDOS, 53, NT_STATUS_BAD_NETWORK_PATH}, { + ERRDOS, 54, NT_STATUS_NETWORK_BUSY}, { + ERRDOS, 55, NT_STATUS_DEVICE_DOES_NOT_EXIST}, { + ERRDOS, 56, NT_STATUS_TOO_MANY_COMMANDS}, { + ERRDOS, 57, NT_STATUS_ADAPTER_HARDWARE_ERROR}, { + ERRDOS, 58, NT_STATUS_INVALID_NETWORK_RESPONSE}, { + ERRDOS, 59, NT_STATUS_UNEXPECTED_NETWORK_ERROR}, { + ERRDOS, 60, NT_STATUS_BAD_REMOTE_ADAPTER}, { + ERRDOS, 61, NT_STATUS_PRINT_QUEUE_FULL}, { + ERRDOS, 62, NT_STATUS_NO_SPOOL_SPACE}, { + ERRDOS, 63, NT_STATUS_PRINT_CANCELLED}, { + ERRDOS, 64, NT_STATUS_NETWORK_NAME_DELETED}, { + ERRDOS, 65, NT_STATUS_NETWORK_ACCESS_DENIED}, { + ERRDOS, 66, NT_STATUS_BAD_DEVICE_TYPE}, { + ERRDOS, ERRnosuchshare, NT_STATUS_BAD_NETWORK_NAME}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_NAMES}, { + ERRDOS, 69, NT_STATUS_TOO_MANY_SESSIONS}, { + ERRDOS, 70, NT_STATUS_SHARING_PAUSED}, { + ERRDOS, 71, NT_STATUS_REQUEST_NOT_ACCEPTED}, { + ERRDOS, 72, NT_STATUS_REDIRECTOR_PAUSED}, { + ERRDOS, 88, NT_STATUS_NET_WRITE_FAULT}, { + ERRHRD, ERRgeneral, NT_STATUS_PROFILING_AT_LIMIT}, { + ERRDOS, ERRdiffdevice, NT_STATUS_NOT_SAME_DEVICE}, { + ERRDOS, ERRnoaccess, NT_STATUS_FILE_RENAMED}, { + ERRDOS, 240, NT_STATUS_VIRTUAL_CIRCUIT_CLOSED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SECURITY_ON_OBJECT}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_WAIT}, { + ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_EMPTY}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_ACCESS_DOMAIN_INFO}, { + ERRHRD, ERRgeneral, NT_STATUS_CANT_TERMINATE_SELF}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_SERVER_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_ROLE}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_DOMAIN}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_LIMIT_EXCEEDED}, { + ERRDOS, 300, NT_STATUS_OPLOCK_NOT_GRANTED}, { + ERRDOS, 301, NT_STATUS_INVALID_OPLOCK_PROTOCOL}, { + ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_CORRUPTION}, { + ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_GENERIC_NOT_MAPPED}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_DESCRIPTOR_FORMAT}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_USER_BUFFER}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_IO_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_CREATE_ERR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_MAP_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_EXTEND_ERR}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_LOGON_PROCESS}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_EXISTS}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_1}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_2}, { + ERRDOS, 87, 
NT_STATUS_INVALID_PARAMETER_3}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_4}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_5}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_6}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_7}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_8}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_9}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_10}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_11}, { + ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_12}, { + ERRDOS, ERRbadpath, NT_STATUS_REDIRECTOR_NOT_STARTED}, { + ERRHRD, ERRgeneral, NT_STATUS_REDIRECTOR_STARTED}, { + ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PACKAGE}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_FUNCTION_TABLE}, { + ERRDOS, 203, 0xc0000100}, { + ERRDOS, 145, NT_STATUS_DIRECTORY_NOT_EMPTY}, { + ERRHRD, ERRgeneral, NT_STATUS_FILE_CORRUPT_ERROR}, { + ERRDOS, 267, NT_STATUS_NOT_A_DIRECTORY}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_LOGON_SESSION_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_COLLISION}, { + ERRDOS, 206, NT_STATUS_NAME_TOO_LONG}, { + ERRDOS, 2401, NT_STATUS_FILES_OPEN}, { + ERRDOS, 2404, NT_STATUS_CONNECTION_IN_USE}, { + ERRHRD, ERRgeneral, NT_STATUS_MESSAGE_NOT_FOUND}, { + ERRDOS, ERRnoaccess, NT_STATUS_PROCESS_IS_TERMINATING}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LOGON_TYPE}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_GUID_TRANSLATION}, { + ERRHRD, ERRgeneral, NT_STATUS_CANNOT_IMPERSONATE}, { + ERRHRD, ERRgeneral, NT_STATUS_IMAGE_ALREADY_LOADED}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_PRESENT}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_NOT_EXIST}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_ALREADY_OWNED}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_LID_OWNER}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_COMMAND}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_LID}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_SELECTOR}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_LDT}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_SIZE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_OFFSET}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_DESCRIPTOR}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NE_FORMAT}, { + ERRHRD, ERRgeneral, NT_STATUS_RXACT_INVALID_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_RXACT_COMMIT_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_MAPPED_FILE_SIZE_ZERO}, { + ERRDOS, ERRnofids, NT_STATUS_TOO_MANY_OPENED_FILES}, { + ERRHRD, ERRgeneral, NT_STATUS_CANCELLED}, { + ERRDOS, ERRnoaccess, NT_STATUS_CANNOT_DELETE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_COMPUTER_NAME}, { + ERRDOS, ERRnoaccess, NT_STATUS_FILE_DELETED}, { + ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_ACCOUNT}, { + ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_GROUP}, { + ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_USER}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBERS_PRIMARY_GROUP}, { + ERRDOS, ERRbadfid, NT_STATUS_FILE_CLOSED}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_THREADS}, { + ERRHRD, ERRgeneral, NT_STATUS_THREAD_NOT_IN_PROCESS}, { + ERRHRD, ERRgeneral, NT_STATUS_TOKEN_ALREADY_IN_USE}, { + ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA_EXCEEDED}, { + ERRHRD, ERRgeneral, NT_STATUS_COMMITMENT_LIMIT}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_LE_FORMAT}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NOT_MZ}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_PROTECT}, { + ERRDOS, 193, NT_STATUS_INVALID_IMAGE_WIN_16}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_SERVER_CONFLICT}, { + ERRHRD, ERRgeneral, NT_STATUS_TIME_DIFFERENCE_AT_DC}, { + ERRHRD, ERRgeneral, 
NT_STATUS_SYNCHRONIZATION_REQUIRED}, { + ERRDOS, 126, NT_STATUS_DLL_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_OPEN_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_IO_PRIVILEGE_FAILED}, { + ERRDOS, 182, NT_STATUS_ORDINAL_NOT_FOUND}, { + ERRDOS, 127, NT_STATUS_ENTRYPOINT_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_CONTROL_C_EXIT}, { + ERRDOS, 64, NT_STATUS_LOCAL_DISCONNECT}, { + ERRDOS, 64, NT_STATUS_REMOTE_DISCONNECT}, { + ERRDOS, 51, NT_STATUS_REMOTE_RESOURCES}, { + ERRDOS, 59, NT_STATUS_LINK_FAILED}, { + ERRDOS, 59, NT_STATUS_LINK_TIMEOUT}, { + ERRDOS, 59, NT_STATUS_INVALID_CONNECTION}, { + ERRDOS, 59, NT_STATUS_INVALID_ADDRESS}, { + ERRHRD, ERRgeneral, NT_STATUS_DLL_INIT_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_MISSING_SYSTEMFILE}, { + ERRHRD, ERRgeneral, NT_STATUS_UNHANDLED_EXCEPTION}, { + ERRHRD, ERRgeneral, NT_STATUS_APP_INIT_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_CREATE_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_PAGEFILE}, { + ERRDOS, 124, NT_STATUS_INVALID_LEVEL}, { + ERRDOS, 86, NT_STATUS_WRONG_PASSWORD_CORE}, { + ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_FLOAT_CONTEXT}, { + ERRDOS, 109, NT_STATUS_PIPE_BROKEN}, { + ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_CORRUPT}, { + ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_IO_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_EVENT_PAIR}, { + ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_VOLUME}, { + ERRHRD, ERRgeneral, NT_STATUS_SERIAL_NO_DEVICE_INITED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_ALIAS}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_ALIAS}, { + ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_ALIAS}, { + ERRHRD, ERRgeneral, NT_STATUS_ALIAS_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGON_NOT_GRANTED}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SECRETS}, { + ERRHRD, ERRgeneral, NT_STATUS_SECRET_TOO_LONG}, { + ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_FULLSCREEN_MODE}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_CONTEXT_IDS}, { + ERRDOS, ERRnoaccess, NT_STATUS_LOGON_TYPE_NOT_GRANTED}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_REGISTRY_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_FT_MISSING_MEMBER}, { + ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_SERVICE_ENTRY}, { + ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_CHARACTER}, { + ERRHRD, ERRgeneral, NT_STATUS_UNMAPPABLE_CHARACTER}, { + ERRHRD, ERRgeneral, NT_STATUS_UNDEFINED_CHARACTER}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_VOLUME}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_WRONG_CYLINDER}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_UNKNOWN_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_BAD_REGISTERS}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_RECALIBRATE_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_OPERATION_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_DISK_RESET_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_SHARED_IRQ_BUSY}, { + ERRHRD, ERRgeneral, NT_STATUS_FT_ORPHANING}, { + ERRHRD, ERRgeneral, 0xc000016e}, { + ERRHRD, ERRgeneral, 0xc000016f}, { + ERRHRD, ERRgeneral, 0xc0000170}, { + ERRHRD, ERRgeneral, 0xc0000171}, { + ERRHRD, ERRgeneral, NT_STATUS_PARTITION_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_BLOCK_LENGTH}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_PARTITIONED}, { + ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_LOCK_MEDIA}, { + ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_UNLOAD_MEDIA}, { + ERRHRD, ERRgeneral, NT_STATUS_EOM_OVERFLOW}, { + ERRHRD, ERRgeneral, 
NT_STATUS_NO_MEDIA}, { + ERRHRD, ERRgeneral, 0xc0000179}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_MEMBER}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_MEMBER}, { + ERRHRD, ERRgeneral, NT_STATUS_KEY_DELETED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_LOG_SPACE}, { + ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SIDS}, { + ERRHRD, ERRgeneral, NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_KEY_HAS_CHILDREN}, { + ERRHRD, ERRgeneral, NT_STATUS_CHILD_MUST_BE_VOLATILE}, { + ERRDOS, 87, NT_STATUS_DEVICE_CONFIGURATION_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DRIVER_INTERNAL_ERROR}, { + ERRDOS, 22, NT_STATUS_INVALID_DEVICE_STATE}, { + ERRHRD, ERRgeneral, NT_STATUS_IO_DEVICE_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DEVICE_PROTOCOL_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_BACKUP_CONTROLLER}, { + ERRHRD, ERRgeneral, NT_STATUS_LOG_FILE_FULL}, { + ERRDOS, 19, NT_STATUS_TOO_LATE}, { + ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_LSA_SECRET}, +/* { This NT error code was 'sqashed' + from NT_STATUS_NO_TRUST_SAM_ACCOUNT to + NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_SAM_ACCOUNT}, { + ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_DOMAIN_FAILURE}, { + ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CORRUPT}, { + ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_CANT_START}, { + ERRDOS, ERRnoaccess, NT_STATUS_TRUST_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_MUTANT_LIMIT_EXCEEDED}, { + ERRDOS, ERRnetlogonNotStarted, NT_STATUS_NETLOGON_NOT_STARTED}, { + ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_EXPIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_POSSIBLE_DEADLOCK}, { + ERRHRD, ERRgeneral, NT_STATUS_NETWORK_CREDENTIAL_CONFLICT}, { + ERRHRD, ERRgeneral, NT_STATUS_REMOTE_SESSION_LIMIT}, { + ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CHANGED}, { + ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT}, { + ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT}, { + ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT}, +/* { This NT error code was 'sqashed' + from NT_STATUS_DOMAIN_TRUST_INCONSISTENT to NT_STATUS_LOGON_FAILURE + during the session setup } */ + { + ERRDOS, ERRnoaccess, NT_STATUS_DOMAIN_TRUST_INCONSISTENT}, { + ERRHRD, ERRgeneral, NT_STATUS_FS_DRIVER_REQUIRED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, { + ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, { + ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, { + ERRDOS, ERRnomem, NT_STATUS_INSUFF_SERVER_RESOURCES}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_ADDRESSES}, { + ERRDOS, 52, NT_STATUS_ADDRESS_ALREADY_EXISTS}, { + ERRDOS, 64, NT_STATUS_ADDRESS_CLOSED}, { + ERRDOS, 64, NT_STATUS_CONNECTION_DISCONNECTED}, { + ERRDOS, 64, NT_STATUS_CONNECTION_RESET}, { + ERRDOS, 68, NT_STATUS_TOO_MANY_NODES}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_ABORTED}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_TIMED_OUT}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_NO_RELEASE}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_NO_MATCH}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_RESPONDED}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_ID}, { + ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_TYPE}, { + ERRDOS, ERRunsup, NT_STATUS_NOT_SERVER_SESSION}, { + ERRDOS, ERRunsup, NT_STATUS_NOT_CLIENT_SESSION}, { + ERRHRD, ERRgeneral, NT_STATUS_CANNOT_LOAD_REGISTRY_FILE}, { + 
ERRHRD, ERRgeneral, NT_STATUS_DEBUG_ATTACH_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_SYSTEM_PROCESS_TERMINATED}, { + ERRHRD, ERRgeneral, NT_STATUS_DATA_NOT_ACCEPTED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_BROWSER_SERVERS_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_VDM_HARD_ERROR}, { + ERRHRD, ERRgeneral, NT_STATUS_DRIVER_CANCEL_TIMEOUT}, { + ERRHRD, ERRgeneral, NT_STATUS_REPLY_MESSAGE_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_MAPPED_ALIGNMENT}, { + ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, { + ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, { + ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, { + ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_MUST_CHANGE}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, { + ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, { + ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW_READ}, { + ERRHRD, ERRgeneral, NT_STATUS_FAIL_CHECK}, { + ERRHRD, ERRgeneral, NT_STATUS_DUPLICATE_OBJECTID}, { + ERRHRD, ERRgeneral, NT_STATUS_OBJECTID_EXISTS}, { + ERRHRD, ERRgeneral, NT_STATUS_CONVERT_TO_LARGE}, { + ERRHRD, ERRgeneral, NT_STATUS_RETRY}, { + ERRHRD, ERRgeneral, NT_STATUS_FOUND_OUT_OF_SCOPE}, { + ERRHRD, ERRgeneral, NT_STATUS_ALLOCATE_BUCKET}, { + ERRHRD, ERRgeneral, NT_STATUS_PROPSET_NOT_FOUND}, { + ERRHRD, ERRgeneral, NT_STATUS_MARSHALL_OVERFLOW}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_VARIANT}, { + ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND}, { + ERRDOS, ERRnoaccess, NT_STATUS_ACCOUNT_LOCKED_OUT}, { + ERRDOS, ERRbadfid, NT_STATUS_HANDLE_NOT_CLOSABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_REFUSED}, { + ERRHRD, ERRgeneral, NT_STATUS_GRACEFUL_DISCONNECT}, { + ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_ALREADY_ASSOCIATED}, { + ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_NOT_ASSOCIATED}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_INVALID}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ACTIVE}, { + ERRHRD, ERRgeneral, NT_STATUS_NETWORK_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_HOST_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_PROTOCOL_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_PORT_UNREACHABLE}, { + ERRHRD, ERRgeneral, NT_STATUS_REQUEST_ABORTED}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ABORTED}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_COMPRESSION_BUFFER}, { + ERRHRD, ERRgeneral, NT_STATUS_USER_MAPPED_FILE}, { + ERRHRD, ERRgeneral, NT_STATUS_AUDIT_FAILED}, { + ERRHRD, ERRgeneral, NT_STATUS_TIMER_RESOLUTION_NOT_SET}, { + ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_COUNT_LIMIT}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGIN_TIME_RESTRICTION}, { + ERRHRD, ERRgeneral, NT_STATUS_LOGIN_WKSTA_RESTRICTION}, { + ERRDOS, 193, NT_STATUS_IMAGE_MP_UP_MISMATCH}, { + ERRHRD, ERRgeneral, 0xc000024a}, { + ERRHRD, ERRgeneral, 0xc000024b}, { + ERRHRD, ERRgeneral, 0xc000024c}, { + ERRHRD, ERRgeneral, 0xc000024d}, { + ERRHRD, ERRgeneral, 0xc000024e}, { + ERRHRD, ERRgeneral, 0xc000024f}, { + ERRHRD, ERRgeneral, NT_STATUS_INSUFFICIENT_LOGON_INFO}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_DLL_ENTRYPOINT}, { + ERRHRD, ERRgeneral, NT_STATUS_BAD_SERVICE_ENTRYPOINT}, { + ERRHRD, ERRgeneral, NT_STATUS_LPC_REPLY_LOST}, { + ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT1}, { + ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT2}, { + ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_QUOTA_LIMIT}, { + ERRSRV, 3, NT_STATUS_PATH_NOT_COVERED}, { + ERRHRD, ERRgeneral, NT_STATUS_NO_CALLBACK_ACTIVE}, { + ERRHRD, ERRgeneral, NT_STATUS_LICENSE_QUOTA_EXCEEDED}, { + ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_SHORT}, { + ERRHRD, 
ERRgeneral, NT_STATUS_PWD_TOO_RECENT}, { + ERRHRD, ERRgeneral, NT_STATUS_PWD_HISTORY_CONFLICT}, { + ERRHRD, ERRgeneral, 0xc000025d}, { + ERRHRD, ERRgeneral, NT_STATUS_PLUGPLAY_NO_DEVICE}, { + ERRHRD, ERRgeneral, NT_STATUS_UNSUPPORTED_COMPRESSION}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_HW_PROFILE}, { + ERRHRD, ERRgeneral, NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH}, { + ERRDOS, 182, NT_STATUS_DRIVER_ORDINAL_NOT_FOUND}, { + ERRDOS, 127, NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND}, { + ERRDOS, 288, NT_STATUS_RESOURCE_NOT_OWNED}, { + ERRDOS, ErrTooManyLinks, NT_STATUS_TOO_MANY_LINKS}, { + ERRHRD, ERRgeneral, NT_STATUS_QUOTA_LIST_INCONSISTENT}, { + ERRHRD, ERRgeneral, NT_STATUS_FILE_IS_OFFLINE}, { + ERRDOS, 21, 0xc000026e}, { + ERRDOS, 161, 0xc0000281}, { + ERRDOS, ERRnoaccess, 0xc000028a}, { + ERRDOS, ERRnoaccess, 0xc000028b}, { + ERRHRD, ERRgeneral, 0xc000028c}, { + ERRDOS, ERRnoaccess, 0xc000028d}, { + ERRDOS, ERRnoaccess, 0xc000028e}, { + ERRDOS, ERRnoaccess, 0xc000028f}, { + ERRDOS, ERRnoaccess, 0xc0000290}, { + ERRDOS, ERRbadfunc, 0xc000029c}, { + ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, { + ERRDOS, ERRinvlevel, 0x007c0001}, }; + +/***************************************************************************** + Print an error message from the status code + *****************************************************************************/ +static void +cifs_print_status(__u32 status_code) +{ + int idx = 0; + + while (nt_errs[idx].nt_errstr != NULL) { + if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) == + (status_code & 0xFFFFFF)) { + printk(KERN_NOTICE "Status code returned 0x%08x %s\n", + status_code, nt_errs[idx].nt_errstr); + } + idx++; + } + return; +} + + +static void +ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode) +{ + int i; + if (ntstatus == 0) { + *eclass = 0; + *ecode = 0; + return; + } + for (i = 0; ntstatus_to_dos_map[i].ntstatus; i++) { + if (ntstatus == ntstatus_to_dos_map[i].ntstatus) { + *eclass = ntstatus_to_dos_map[i].dos_class; + *ecode = ntstatus_to_dos_map[i].dos_code; + return; + } + } + *eclass = ERRHRD; + *ecode = ERRgeneral; +} + +int +map_smb_to_linux_error(struct smb_hdr *smb, bool logErr) +{ + unsigned int i; + int rc = -EIO; /* if transport error smb error may not be set */ + __u8 smberrclass; + __u16 smberrcode; + + /* BB if NT Status codes - map NT BB */ + + /* old style smb error codes */ + if (smb->Status.CifsError == 0) + return 0; + + if (smb->Flags2 & SMBFLG2_ERR_STATUS) { + /* translate the newer STATUS codes to old style SMB errors + * and then to POSIX errors */ + __u32 err = le32_to_cpu(smb->Status.CifsError); + if (logErr && (err != (NT_STATUS_MORE_PROCESSING_REQUIRED))) + cifs_print_status(err); + else if (cifsFYI & CIFS_RC) + cifs_print_status(err); + ntstatus_to_dos(err, &smberrclass, &smberrcode); + } else { + smberrclass = smb->Status.DosError.ErrorClass; + smberrcode = le16_to_cpu(smb->Status.DosError.Error); + } + + /* old style errors */ + + /* DOS class smb error codes - map DOS */ + if (smberrclass == ERRDOS) { + /* 1 byte field no need to byte reverse */ + for (i = 0; + i < + sizeof(mapping_table_ERRDOS) / + sizeof(struct smb_to_posix_error); i++) { + if (mapping_table_ERRDOS[i].smb_err == 0) + break; + else if (mapping_table_ERRDOS[i].smb_err == + smberrcode) { + rc = mapping_table_ERRDOS[i].posix_code; + break; + } + /* else try next error mapping one to see if match */ + } + } else if (smberrclass == ERRSRV) { + /* server class of error codes */ + for (i = 0; + i < + sizeof(mapping_table_ERRSRV) / + sizeof(struct 
smb_to_posix_error); i++) { + if (mapping_table_ERRSRV[i].smb_err == 0) + break; + else if (mapping_table_ERRSRV[i].smb_err == + smberrcode) { + rc = mapping_table_ERRSRV[i].posix_code; + break; + } + /* else try next error mapping to see if match */ + } + } + /* else ERRHRD class errors or junk - return EIO */ + + cFYI(1, "Mapping smb error code 0x%x to POSIX err %d", + le32_to_cpu(smb->Status.CifsError), rc); + + /* generic corrective action e.g. reconnect SMB session on + * ERRbaduid could be added */ + + return rc; +} + +/* + * calculate the size of the SMB message based on the fixed header + * portion, the number of word parameters and the data portion of the message + */ +unsigned int +smbCalcSize(struct smb_hdr *ptr) +{ + return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + + 2 /* size of the bcc field */ + get_bcc(ptr)); +} + +/* The following are taken from fs/ntfs/util.c */ + +#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) + +/* + * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) + * into Unix UTC (based 1970-01-01, in seconds). + */ +struct timespec +cifs_NTtimeToUnix(__le64 ntutc) +{ + struct timespec ts; + /* BB what about the timezone? BB */ + + /* Subtract the NTFS time offset, then convert to 1s intervals. */ + u64 t; + + t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; + ts.tv_nsec = do_div(t, 10000000) * 100; + ts.tv_sec = t; + return ts; +} + +/* Convert the Unix UTC into NT UTC. */ +u64 +cifs_UnixTimeToNT(struct timespec t) +{ + /* Convert to 100ns intervals and then add the NTFS time offset. */ + return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET; +} + +static int total_days_of_prev_months[] = +{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; + +struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) +{ + struct timespec ts; + int sec, min, days, month, year; + u16 date = le16_to_cpu(le_date); + u16 time = le16_to_cpu(le_time); + SMB_TIME *st = (SMB_TIME *)&time; + SMB_DATE *sd = (SMB_DATE *)&date; + + cFYI(1, "date %d time %d", date, time); + + sec = 2 * st->TwoSeconds; + min = st->Minutes; + if ((sec > 59) || (min > 59)) + cERROR(1, "illegal time min %d sec %d", min, sec); + sec += (min * 60); + sec += 60 * 60 * st->Hours; + if (st->Hours > 24) + cERROR(1, "illegal hours %d", st->Hours); + days = sd->Day; + month = sd->Month; + if ((days > 31) || (month > 12)) { + cERROR(1, "illegal date, month %d day: %d", month, days); + if (month > 12) + month = 12; + } + month -= 1; + days += total_days_of_prev_months[month]; + days += 3652; /* account for difference in days between 1980 and 1970 */ + year = sd->Year; + days += year * 365; + days += (year/4); /* leap year */ + /* generalized leap year calculation is more complex, ie no leap year + for years/100 except for years/400, but since the maximum number for DOS + year is 2**7, the last year is 1980+127, which means we need only + consider 2 special case years, ie the years 2000 and 2100, and only + adjust for the lack of leap year for the year 2100, as 2000 was a + leap year (divisible by 400) */ + if (year >= 120) /* the year 2100 */ + days = days - 1; /* do not count leap year for the year 2100 */ + + /* adjust for leap year where we are still before leap day */ + if (year != 120) + days -= ((year & 0x03) == 0) && (month < 2 ? 
1 : 0); + sec += 24 * 60 * 60 * days; + + ts.tv_sec = sec + offset; + + /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */ + + ts.tv_nsec = 0; + return ts; +} diff --git a/fs/cifs/nterr.c b/fs/cifs/nterr.c new file mode 100644 index 00000000..819fd994 --- /dev/null +++ b/fs/cifs/nterr.c @@ -0,0 +1,687 @@ +/* + * Unix SMB/Netbios implementation. + * Version 1.9. + * RPC Pipe client / server routines + * Copyright (C) Luke Kenneth Casson Leighton 1997-2001. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* NT error codes - see nterr.h */ +#include +#include +#include "nterr.h" + +const struct nt_err_code_struct nt_errs[] = { + {"NT_STATUS_OK", NT_STATUS_OK}, + {"NT_STATUS_UNSUCCESSFUL", NT_STATUS_UNSUCCESSFUL}, + {"NT_STATUS_NOT_IMPLEMENTED", NT_STATUS_NOT_IMPLEMENTED}, + {"NT_STATUS_INVALID_INFO_CLASS", NT_STATUS_INVALID_INFO_CLASS}, + {"NT_STATUS_INFO_LENGTH_MISMATCH", NT_STATUS_INFO_LENGTH_MISMATCH}, + {"NT_STATUS_ACCESS_VIOLATION", NT_STATUS_ACCESS_VIOLATION}, + {"STATUS_BUFFER_OVERFLOW", STATUS_BUFFER_OVERFLOW}, + {"NT_STATUS_IN_PAGE_ERROR", NT_STATUS_IN_PAGE_ERROR}, + {"NT_STATUS_PAGEFILE_QUOTA", NT_STATUS_PAGEFILE_QUOTA}, + {"NT_STATUS_INVALID_HANDLE", NT_STATUS_INVALID_HANDLE}, + {"NT_STATUS_BAD_INITIAL_STACK", NT_STATUS_BAD_INITIAL_STACK}, + {"NT_STATUS_BAD_INITIAL_PC", NT_STATUS_BAD_INITIAL_PC}, + {"NT_STATUS_INVALID_CID", NT_STATUS_INVALID_CID}, + {"NT_STATUS_TIMER_NOT_CANCELED", NT_STATUS_TIMER_NOT_CANCELED}, + {"NT_STATUS_INVALID_PARAMETER", NT_STATUS_INVALID_PARAMETER}, + {"NT_STATUS_NO_SUCH_DEVICE", NT_STATUS_NO_SUCH_DEVICE}, + {"NT_STATUS_NO_SUCH_FILE", NT_STATUS_NO_SUCH_FILE}, + {"NT_STATUS_INVALID_DEVICE_REQUEST", + NT_STATUS_INVALID_DEVICE_REQUEST}, + {"NT_STATUS_END_OF_FILE", NT_STATUS_END_OF_FILE}, + {"NT_STATUS_WRONG_VOLUME", NT_STATUS_WRONG_VOLUME}, + {"NT_STATUS_NO_MEDIA_IN_DEVICE", NT_STATUS_NO_MEDIA_IN_DEVICE}, + {"NT_STATUS_UNRECOGNIZED_MEDIA", NT_STATUS_UNRECOGNIZED_MEDIA}, + {"NT_STATUS_NONEXISTENT_SECTOR", NT_STATUS_NONEXISTENT_SECTOR}, + {"NT_STATUS_MORE_PROCESSING_REQUIRED", + NT_STATUS_MORE_PROCESSING_REQUIRED}, + {"NT_STATUS_NO_MEMORY", NT_STATUS_NO_MEMORY}, + {"NT_STATUS_CONFLICTING_ADDRESSES", + NT_STATUS_CONFLICTING_ADDRESSES}, + {"NT_STATUS_NOT_MAPPED_VIEW", NT_STATUS_NOT_MAPPED_VIEW}, + {"NT_STATUS_UNABLE_TO_FREE_VM", NT_STATUS_UNABLE_TO_FREE_VM}, + {"NT_STATUS_UNABLE_TO_DELETE_SECTION", + NT_STATUS_UNABLE_TO_DELETE_SECTION}, + {"NT_STATUS_INVALID_SYSTEM_SERVICE", + NT_STATUS_INVALID_SYSTEM_SERVICE}, + {"NT_STATUS_ILLEGAL_INSTRUCTION", NT_STATUS_ILLEGAL_INSTRUCTION}, + {"NT_STATUS_INVALID_LOCK_SEQUENCE", + NT_STATUS_INVALID_LOCK_SEQUENCE}, + {"NT_STATUS_INVALID_VIEW_SIZE", NT_STATUS_INVALID_VIEW_SIZE}, + {"NT_STATUS_INVALID_FILE_FOR_SECTION", + NT_STATUS_INVALID_FILE_FOR_SECTION}, + {"NT_STATUS_ALREADY_COMMITTED", NT_STATUS_ALREADY_COMMITTED}, + {"NT_STATUS_ACCESS_DENIED", 
NT_STATUS_ACCESS_DENIED}, + {"NT_STATUS_BUFFER_TOO_SMALL", NT_STATUS_BUFFER_TOO_SMALL}, + {"NT_STATUS_OBJECT_TYPE_MISMATCH", NT_STATUS_OBJECT_TYPE_MISMATCH}, + {"NT_STATUS_NONCONTINUABLE_EXCEPTION", + NT_STATUS_NONCONTINUABLE_EXCEPTION}, + {"NT_STATUS_INVALID_DISPOSITION", NT_STATUS_INVALID_DISPOSITION}, + {"NT_STATUS_UNWIND", NT_STATUS_UNWIND}, + {"NT_STATUS_BAD_STACK", NT_STATUS_BAD_STACK}, + {"NT_STATUS_INVALID_UNWIND_TARGET", + NT_STATUS_INVALID_UNWIND_TARGET}, + {"NT_STATUS_NOT_LOCKED", NT_STATUS_NOT_LOCKED}, + {"NT_STATUS_PARITY_ERROR", NT_STATUS_PARITY_ERROR}, + {"NT_STATUS_UNABLE_TO_DECOMMIT_VM", + NT_STATUS_UNABLE_TO_DECOMMIT_VM}, + {"NT_STATUS_NOT_COMMITTED", NT_STATUS_NOT_COMMITTED}, + {"NT_STATUS_INVALID_PORT_ATTRIBUTES", + NT_STATUS_INVALID_PORT_ATTRIBUTES}, + {"NT_STATUS_PORT_MESSAGE_TOO_LONG", + NT_STATUS_PORT_MESSAGE_TOO_LONG}, + {"NT_STATUS_INVALID_PARAMETER_MIX", + NT_STATUS_INVALID_PARAMETER_MIX}, + {"NT_STATUS_INVALID_QUOTA_LOWER", NT_STATUS_INVALID_QUOTA_LOWER}, + {"NT_STATUS_DISK_CORRUPT_ERROR", NT_STATUS_DISK_CORRUPT_ERROR}, + {"NT_STATUS_OBJECT_NAME_INVALID", NT_STATUS_OBJECT_NAME_INVALID}, + {"NT_STATUS_OBJECT_NAME_NOT_FOUND", + NT_STATUS_OBJECT_NAME_NOT_FOUND}, + {"NT_STATUS_OBJECT_NAME_COLLISION", + NT_STATUS_OBJECT_NAME_COLLISION}, + {"NT_STATUS_HANDLE_NOT_WAITABLE", NT_STATUS_HANDLE_NOT_WAITABLE}, + {"NT_STATUS_PORT_DISCONNECTED", NT_STATUS_PORT_DISCONNECTED}, + {"NT_STATUS_DEVICE_ALREADY_ATTACHED", + NT_STATUS_DEVICE_ALREADY_ATTACHED}, + {"NT_STATUS_OBJECT_PATH_INVALID", NT_STATUS_OBJECT_PATH_INVALID}, + {"NT_STATUS_OBJECT_PATH_NOT_FOUND", + NT_STATUS_OBJECT_PATH_NOT_FOUND}, + {"NT_STATUS_OBJECT_PATH_SYNTAX_BAD", + NT_STATUS_OBJECT_PATH_SYNTAX_BAD}, + {"NT_STATUS_DATA_OVERRUN", NT_STATUS_DATA_OVERRUN}, + {"NT_STATUS_DATA_LATE_ERROR", NT_STATUS_DATA_LATE_ERROR}, + {"NT_STATUS_DATA_ERROR", NT_STATUS_DATA_ERROR}, + {"NT_STATUS_CRC_ERROR", NT_STATUS_CRC_ERROR}, + {"NT_STATUS_SECTION_TOO_BIG", NT_STATUS_SECTION_TOO_BIG}, + {"NT_STATUS_PORT_CONNECTION_REFUSED", + NT_STATUS_PORT_CONNECTION_REFUSED}, + {"NT_STATUS_INVALID_PORT_HANDLE", NT_STATUS_INVALID_PORT_HANDLE}, + {"NT_STATUS_SHARING_VIOLATION", NT_STATUS_SHARING_VIOLATION}, + {"NT_STATUS_QUOTA_EXCEEDED", NT_STATUS_QUOTA_EXCEEDED}, + {"NT_STATUS_INVALID_PAGE_PROTECTION", + NT_STATUS_INVALID_PAGE_PROTECTION}, + {"NT_STATUS_MUTANT_NOT_OWNED", NT_STATUS_MUTANT_NOT_OWNED}, + {"NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED", + NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED}, + {"NT_STATUS_PORT_ALREADY_SET", NT_STATUS_PORT_ALREADY_SET}, + {"NT_STATUS_SECTION_NOT_IMAGE", NT_STATUS_SECTION_NOT_IMAGE}, + {"NT_STATUS_SUSPEND_COUNT_EXCEEDED", + NT_STATUS_SUSPEND_COUNT_EXCEEDED}, + {"NT_STATUS_THREAD_IS_TERMINATING", + NT_STATUS_THREAD_IS_TERMINATING}, + {"NT_STATUS_BAD_WORKING_SET_LIMIT", + NT_STATUS_BAD_WORKING_SET_LIMIT}, + {"NT_STATUS_INCOMPATIBLE_FILE_MAP", + NT_STATUS_INCOMPATIBLE_FILE_MAP}, + {"NT_STATUS_SECTION_PROTECTION", NT_STATUS_SECTION_PROTECTION}, + {"NT_STATUS_EAS_NOT_SUPPORTED", NT_STATUS_EAS_NOT_SUPPORTED}, + {"NT_STATUS_EA_TOO_LARGE", NT_STATUS_EA_TOO_LARGE}, + {"NT_STATUS_NONEXISTENT_EA_ENTRY", NT_STATUS_NONEXISTENT_EA_ENTRY}, + {"NT_STATUS_NO_EAS_ON_FILE", NT_STATUS_NO_EAS_ON_FILE}, + {"NT_STATUS_EA_CORRUPT_ERROR", NT_STATUS_EA_CORRUPT_ERROR}, + {"NT_STATUS_FILE_LOCK_CONFLICT", NT_STATUS_FILE_LOCK_CONFLICT}, + {"NT_STATUS_LOCK_NOT_GRANTED", NT_STATUS_LOCK_NOT_GRANTED}, + {"NT_STATUS_DELETE_PENDING", NT_STATUS_DELETE_PENDING}, + {"NT_STATUS_CTL_FILE_NOT_SUPPORTED", + NT_STATUS_CTL_FILE_NOT_SUPPORTED}, + 
{"NT_STATUS_UNKNOWN_REVISION", NT_STATUS_UNKNOWN_REVISION}, + {"NT_STATUS_REVISION_MISMATCH", NT_STATUS_REVISION_MISMATCH}, + {"NT_STATUS_INVALID_OWNER", NT_STATUS_INVALID_OWNER}, + {"NT_STATUS_INVALID_PRIMARY_GROUP", + NT_STATUS_INVALID_PRIMARY_GROUP}, + {"NT_STATUS_NO_IMPERSONATION_TOKEN", + NT_STATUS_NO_IMPERSONATION_TOKEN}, + {"NT_STATUS_CANT_DISABLE_MANDATORY", + NT_STATUS_CANT_DISABLE_MANDATORY}, + {"NT_STATUS_NO_LOGON_SERVERS", NT_STATUS_NO_LOGON_SERVERS}, + {"NT_STATUS_NO_SUCH_LOGON_SESSION", + NT_STATUS_NO_SUCH_LOGON_SESSION}, + {"NT_STATUS_NO_SUCH_PRIVILEGE", NT_STATUS_NO_SUCH_PRIVILEGE}, + {"NT_STATUS_PRIVILEGE_NOT_HELD", NT_STATUS_PRIVILEGE_NOT_HELD}, + {"NT_STATUS_INVALID_ACCOUNT_NAME", NT_STATUS_INVALID_ACCOUNT_NAME}, + {"NT_STATUS_USER_EXISTS", NT_STATUS_USER_EXISTS}, + {"NT_STATUS_NO_SUCH_USER", NT_STATUS_NO_SUCH_USER}, + {"NT_STATUS_GROUP_EXISTS", NT_STATUS_GROUP_EXISTS}, + {"NT_STATUS_NO_SUCH_GROUP", NT_STATUS_NO_SUCH_GROUP}, + {"NT_STATUS_MEMBER_IN_GROUP", NT_STATUS_MEMBER_IN_GROUP}, + {"NT_STATUS_MEMBER_NOT_IN_GROUP", NT_STATUS_MEMBER_NOT_IN_GROUP}, + {"NT_STATUS_LAST_ADMIN", NT_STATUS_LAST_ADMIN}, + {"NT_STATUS_WRONG_PASSWORD", NT_STATUS_WRONG_PASSWORD}, + {"NT_STATUS_ILL_FORMED_PASSWORD", NT_STATUS_ILL_FORMED_PASSWORD}, + {"NT_STATUS_PASSWORD_RESTRICTION", NT_STATUS_PASSWORD_RESTRICTION}, + {"NT_STATUS_LOGON_FAILURE", NT_STATUS_LOGON_FAILURE}, + {"NT_STATUS_ACCOUNT_RESTRICTION", NT_STATUS_ACCOUNT_RESTRICTION}, + {"NT_STATUS_INVALID_LOGON_HOURS", NT_STATUS_INVALID_LOGON_HOURS}, + {"NT_STATUS_INVALID_WORKSTATION", NT_STATUS_INVALID_WORKSTATION}, + {"NT_STATUS_PASSWORD_EXPIRED", NT_STATUS_PASSWORD_EXPIRED}, + {"NT_STATUS_ACCOUNT_DISABLED", NT_STATUS_ACCOUNT_DISABLED}, + {"NT_STATUS_NONE_MAPPED", NT_STATUS_NONE_MAPPED}, + {"NT_STATUS_TOO_MANY_LUIDS_REQUESTED", + NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, + {"NT_STATUS_LUIDS_EXHAUSTED", NT_STATUS_LUIDS_EXHAUSTED}, + {"NT_STATUS_INVALID_SUB_AUTHORITY", + NT_STATUS_INVALID_SUB_AUTHORITY}, + {"NT_STATUS_INVALID_ACL", NT_STATUS_INVALID_ACL}, + {"NT_STATUS_INVALID_SID", NT_STATUS_INVALID_SID}, + {"NT_STATUS_INVALID_SECURITY_DESCR", + NT_STATUS_INVALID_SECURITY_DESCR}, + {"NT_STATUS_PROCEDURE_NOT_FOUND", NT_STATUS_PROCEDURE_NOT_FOUND}, + {"NT_STATUS_INVALID_IMAGE_FORMAT", NT_STATUS_INVALID_IMAGE_FORMAT}, + {"NT_STATUS_NO_TOKEN", NT_STATUS_NO_TOKEN}, + {"NT_STATUS_BAD_INHERITANCE_ACL", NT_STATUS_BAD_INHERITANCE_ACL}, + {"NT_STATUS_RANGE_NOT_LOCKED", NT_STATUS_RANGE_NOT_LOCKED}, + {"NT_STATUS_DISK_FULL", NT_STATUS_DISK_FULL}, + {"NT_STATUS_SERVER_DISABLED", NT_STATUS_SERVER_DISABLED}, + {"NT_STATUS_SERVER_NOT_DISABLED", NT_STATUS_SERVER_NOT_DISABLED}, + {"NT_STATUS_TOO_MANY_GUIDS_REQUESTED", + NT_STATUS_TOO_MANY_GUIDS_REQUESTED}, + {"NT_STATUS_GUIDS_EXHAUSTED", NT_STATUS_GUIDS_EXHAUSTED}, + {"NT_STATUS_INVALID_ID_AUTHORITY", NT_STATUS_INVALID_ID_AUTHORITY}, + {"NT_STATUS_AGENTS_EXHAUSTED", NT_STATUS_AGENTS_EXHAUSTED}, + {"NT_STATUS_INVALID_VOLUME_LABEL", NT_STATUS_INVALID_VOLUME_LABEL}, + {"NT_STATUS_SECTION_NOT_EXTENDED", NT_STATUS_SECTION_NOT_EXTENDED}, + {"NT_STATUS_NOT_MAPPED_DATA", NT_STATUS_NOT_MAPPED_DATA}, + {"NT_STATUS_RESOURCE_DATA_NOT_FOUND", + NT_STATUS_RESOURCE_DATA_NOT_FOUND}, + {"NT_STATUS_RESOURCE_TYPE_NOT_FOUND", + NT_STATUS_RESOURCE_TYPE_NOT_FOUND}, + {"NT_STATUS_RESOURCE_NAME_NOT_FOUND", + NT_STATUS_RESOURCE_NAME_NOT_FOUND}, + {"NT_STATUS_ARRAY_BOUNDS_EXCEEDED", + NT_STATUS_ARRAY_BOUNDS_EXCEEDED}, + {"NT_STATUS_FLOAT_DENORMAL_OPERAND", + NT_STATUS_FLOAT_DENORMAL_OPERAND}, + {"NT_STATUS_FLOAT_DIVIDE_BY_ZERO", 
NT_STATUS_FLOAT_DIVIDE_BY_ZERO}, + {"NT_STATUS_FLOAT_INEXACT_RESULT", NT_STATUS_FLOAT_INEXACT_RESULT}, + {"NT_STATUS_FLOAT_INVALID_OPERATION", + NT_STATUS_FLOAT_INVALID_OPERATION}, + {"NT_STATUS_FLOAT_OVERFLOW", NT_STATUS_FLOAT_OVERFLOW}, + {"NT_STATUS_FLOAT_STACK_CHECK", NT_STATUS_FLOAT_STACK_CHECK}, + {"NT_STATUS_FLOAT_UNDERFLOW", NT_STATUS_FLOAT_UNDERFLOW}, + {"NT_STATUS_INTEGER_DIVIDE_BY_ZERO", + NT_STATUS_INTEGER_DIVIDE_BY_ZERO}, + {"NT_STATUS_INTEGER_OVERFLOW", NT_STATUS_INTEGER_OVERFLOW}, + {"NT_STATUS_PRIVILEGED_INSTRUCTION", + NT_STATUS_PRIVILEGED_INSTRUCTION}, + {"NT_STATUS_TOO_MANY_PAGING_FILES", + NT_STATUS_TOO_MANY_PAGING_FILES}, + {"NT_STATUS_FILE_INVALID", NT_STATUS_FILE_INVALID}, + {"NT_STATUS_ALLOTTED_SPACE_EXCEEDED", + NT_STATUS_ALLOTTED_SPACE_EXCEEDED}, + {"NT_STATUS_INSUFFICIENT_RESOURCES", + NT_STATUS_INSUFFICIENT_RESOURCES}, + {"NT_STATUS_DFS_EXIT_PATH_FOUND", NT_STATUS_DFS_EXIT_PATH_FOUND}, + {"NT_STATUS_DEVICE_DATA_ERROR", NT_STATUS_DEVICE_DATA_ERROR}, + {"NT_STATUS_DEVICE_NOT_CONNECTED", NT_STATUS_DEVICE_NOT_CONNECTED}, + {"NT_STATUS_DEVICE_POWER_FAILURE", NT_STATUS_DEVICE_POWER_FAILURE}, + {"NT_STATUS_FREE_VM_NOT_AT_BASE", NT_STATUS_FREE_VM_NOT_AT_BASE}, + {"NT_STATUS_MEMORY_NOT_ALLOCATED", NT_STATUS_MEMORY_NOT_ALLOCATED}, + {"NT_STATUS_WORKING_SET_QUOTA", NT_STATUS_WORKING_SET_QUOTA}, + {"NT_STATUS_MEDIA_WRITE_PROTECTED", + NT_STATUS_MEDIA_WRITE_PROTECTED}, + {"NT_STATUS_DEVICE_NOT_READY", NT_STATUS_DEVICE_NOT_READY}, + {"NT_STATUS_INVALID_GROUP_ATTRIBUTES", + NT_STATUS_INVALID_GROUP_ATTRIBUTES}, + {"NT_STATUS_BAD_IMPERSONATION_LEVEL", + NT_STATUS_BAD_IMPERSONATION_LEVEL}, + {"NT_STATUS_CANT_OPEN_ANONYMOUS", NT_STATUS_CANT_OPEN_ANONYMOUS}, + {"NT_STATUS_BAD_VALIDATION_CLASS", NT_STATUS_BAD_VALIDATION_CLASS}, + {"NT_STATUS_BAD_TOKEN_TYPE", NT_STATUS_BAD_TOKEN_TYPE}, + {"NT_STATUS_BAD_MASTER_BOOT_RECORD", + NT_STATUS_BAD_MASTER_BOOT_RECORD}, + {"NT_STATUS_INSTRUCTION_MISALIGNMENT", + NT_STATUS_INSTRUCTION_MISALIGNMENT}, + {"NT_STATUS_INSTANCE_NOT_AVAILABLE", + NT_STATUS_INSTANCE_NOT_AVAILABLE}, + {"NT_STATUS_PIPE_NOT_AVAILABLE", NT_STATUS_PIPE_NOT_AVAILABLE}, + {"NT_STATUS_INVALID_PIPE_STATE", NT_STATUS_INVALID_PIPE_STATE}, + {"NT_STATUS_PIPE_BUSY", NT_STATUS_PIPE_BUSY}, + {"NT_STATUS_ILLEGAL_FUNCTION", NT_STATUS_ILLEGAL_FUNCTION}, + {"NT_STATUS_PIPE_DISCONNECTED", NT_STATUS_PIPE_DISCONNECTED}, + {"NT_STATUS_PIPE_CLOSING", NT_STATUS_PIPE_CLOSING}, + {"NT_STATUS_PIPE_CONNECTED", NT_STATUS_PIPE_CONNECTED}, + {"NT_STATUS_PIPE_LISTENING", NT_STATUS_PIPE_LISTENING}, + {"NT_STATUS_INVALID_READ_MODE", NT_STATUS_INVALID_READ_MODE}, + {"NT_STATUS_IO_TIMEOUT", NT_STATUS_IO_TIMEOUT}, + {"NT_STATUS_FILE_FORCED_CLOSED", NT_STATUS_FILE_FORCED_CLOSED}, + {"NT_STATUS_PROFILING_NOT_STARTED", + NT_STATUS_PROFILING_NOT_STARTED}, + {"NT_STATUS_PROFILING_NOT_STOPPED", + NT_STATUS_PROFILING_NOT_STOPPED}, + {"NT_STATUS_COULD_NOT_INTERPRET", NT_STATUS_COULD_NOT_INTERPRET}, + {"NT_STATUS_FILE_IS_A_DIRECTORY", NT_STATUS_FILE_IS_A_DIRECTORY}, + {"NT_STATUS_NOT_SUPPORTED", NT_STATUS_NOT_SUPPORTED}, + {"NT_STATUS_REMOTE_NOT_LISTENING", NT_STATUS_REMOTE_NOT_LISTENING}, + {"NT_STATUS_DUPLICATE_NAME", NT_STATUS_DUPLICATE_NAME}, + {"NT_STATUS_BAD_NETWORK_PATH", NT_STATUS_BAD_NETWORK_PATH}, + {"NT_STATUS_NETWORK_BUSY", NT_STATUS_NETWORK_BUSY}, + {"NT_STATUS_DEVICE_DOES_NOT_EXIST", + NT_STATUS_DEVICE_DOES_NOT_EXIST}, + {"NT_STATUS_TOO_MANY_COMMANDS", NT_STATUS_TOO_MANY_COMMANDS}, + {"NT_STATUS_ADAPTER_HARDWARE_ERROR", + NT_STATUS_ADAPTER_HARDWARE_ERROR}, + {"NT_STATUS_INVALID_NETWORK_RESPONSE", + 
NT_STATUS_INVALID_NETWORK_RESPONSE}, + {"NT_STATUS_UNEXPECTED_NETWORK_ERROR", + NT_STATUS_UNEXPECTED_NETWORK_ERROR}, + {"NT_STATUS_BAD_REMOTE_ADAPTER", NT_STATUS_BAD_REMOTE_ADAPTER}, + {"NT_STATUS_PRINT_QUEUE_FULL", NT_STATUS_PRINT_QUEUE_FULL}, + {"NT_STATUS_NO_SPOOL_SPACE", NT_STATUS_NO_SPOOL_SPACE}, + {"NT_STATUS_PRINT_CANCELLED", NT_STATUS_PRINT_CANCELLED}, + {"NT_STATUS_NETWORK_NAME_DELETED", NT_STATUS_NETWORK_NAME_DELETED}, + {"NT_STATUS_NETWORK_ACCESS_DENIED", + NT_STATUS_NETWORK_ACCESS_DENIED}, + {"NT_STATUS_BAD_DEVICE_TYPE", NT_STATUS_BAD_DEVICE_TYPE}, + {"NT_STATUS_BAD_NETWORK_NAME", NT_STATUS_BAD_NETWORK_NAME}, + {"NT_STATUS_TOO_MANY_NAMES", NT_STATUS_TOO_MANY_NAMES}, + {"NT_STATUS_TOO_MANY_SESSIONS", NT_STATUS_TOO_MANY_SESSIONS}, + {"NT_STATUS_SHARING_PAUSED", NT_STATUS_SHARING_PAUSED}, + {"NT_STATUS_REQUEST_NOT_ACCEPTED", NT_STATUS_REQUEST_NOT_ACCEPTED}, + {"NT_STATUS_REDIRECTOR_PAUSED", NT_STATUS_REDIRECTOR_PAUSED}, + {"NT_STATUS_NET_WRITE_FAULT", NT_STATUS_NET_WRITE_FAULT}, + {"NT_STATUS_PROFILING_AT_LIMIT", NT_STATUS_PROFILING_AT_LIMIT}, + {"NT_STATUS_NOT_SAME_DEVICE", NT_STATUS_NOT_SAME_DEVICE}, + {"NT_STATUS_FILE_RENAMED", NT_STATUS_FILE_RENAMED}, + {"NT_STATUS_VIRTUAL_CIRCUIT_CLOSED", + NT_STATUS_VIRTUAL_CIRCUIT_CLOSED}, + {"NT_STATUS_NO_SECURITY_ON_OBJECT", + NT_STATUS_NO_SECURITY_ON_OBJECT}, + {"NT_STATUS_CANT_WAIT", NT_STATUS_CANT_WAIT}, + {"NT_STATUS_PIPE_EMPTY", NT_STATUS_PIPE_EMPTY}, + {"NT_STATUS_CANT_ACCESS_DOMAIN_INFO", + NT_STATUS_CANT_ACCESS_DOMAIN_INFO}, + {"NT_STATUS_CANT_TERMINATE_SELF", NT_STATUS_CANT_TERMINATE_SELF}, + {"NT_STATUS_INVALID_SERVER_STATE", NT_STATUS_INVALID_SERVER_STATE}, + {"NT_STATUS_INVALID_DOMAIN_STATE", NT_STATUS_INVALID_DOMAIN_STATE}, + {"NT_STATUS_INVALID_DOMAIN_ROLE", NT_STATUS_INVALID_DOMAIN_ROLE}, + {"NT_STATUS_NO_SUCH_DOMAIN", NT_STATUS_NO_SUCH_DOMAIN}, + {"NT_STATUS_DOMAIN_EXISTS", NT_STATUS_DOMAIN_EXISTS}, + {"NT_STATUS_DOMAIN_LIMIT_EXCEEDED", + NT_STATUS_DOMAIN_LIMIT_EXCEEDED}, + {"NT_STATUS_OPLOCK_NOT_GRANTED", NT_STATUS_OPLOCK_NOT_GRANTED}, + {"NT_STATUS_INVALID_OPLOCK_PROTOCOL", + NT_STATUS_INVALID_OPLOCK_PROTOCOL}, + {"NT_STATUS_INTERNAL_DB_CORRUPTION", + NT_STATUS_INTERNAL_DB_CORRUPTION}, + {"NT_STATUS_INTERNAL_ERROR", NT_STATUS_INTERNAL_ERROR}, + {"NT_STATUS_GENERIC_NOT_MAPPED", NT_STATUS_GENERIC_NOT_MAPPED}, + {"NT_STATUS_BAD_DESCRIPTOR_FORMAT", + NT_STATUS_BAD_DESCRIPTOR_FORMAT}, + {"NT_STATUS_INVALID_USER_BUFFER", NT_STATUS_INVALID_USER_BUFFER}, + {"NT_STATUS_UNEXPECTED_IO_ERROR", NT_STATUS_UNEXPECTED_IO_ERROR}, + {"NT_STATUS_UNEXPECTED_MM_CREATE_ERR", + NT_STATUS_UNEXPECTED_MM_CREATE_ERR}, + {"NT_STATUS_UNEXPECTED_MM_MAP_ERROR", + NT_STATUS_UNEXPECTED_MM_MAP_ERROR}, + {"NT_STATUS_UNEXPECTED_MM_EXTEND_ERR", + NT_STATUS_UNEXPECTED_MM_EXTEND_ERR}, + {"NT_STATUS_NOT_LOGON_PROCESS", NT_STATUS_NOT_LOGON_PROCESS}, + {"NT_STATUS_LOGON_SESSION_EXISTS", NT_STATUS_LOGON_SESSION_EXISTS}, + {"NT_STATUS_INVALID_PARAMETER_1", NT_STATUS_INVALID_PARAMETER_1}, + {"NT_STATUS_INVALID_PARAMETER_2", NT_STATUS_INVALID_PARAMETER_2}, + {"NT_STATUS_INVALID_PARAMETER_3", NT_STATUS_INVALID_PARAMETER_3}, + {"NT_STATUS_INVALID_PARAMETER_4", NT_STATUS_INVALID_PARAMETER_4}, + {"NT_STATUS_INVALID_PARAMETER_5", NT_STATUS_INVALID_PARAMETER_5}, + {"NT_STATUS_INVALID_PARAMETER_6", NT_STATUS_INVALID_PARAMETER_6}, + {"NT_STATUS_INVALID_PARAMETER_7", NT_STATUS_INVALID_PARAMETER_7}, + {"NT_STATUS_INVALID_PARAMETER_8", NT_STATUS_INVALID_PARAMETER_8}, + {"NT_STATUS_INVALID_PARAMETER_9", NT_STATUS_INVALID_PARAMETER_9}, + {"NT_STATUS_INVALID_PARAMETER_10", 
NT_STATUS_INVALID_PARAMETER_10}, + {"NT_STATUS_INVALID_PARAMETER_11", NT_STATUS_INVALID_PARAMETER_11}, + {"NT_STATUS_INVALID_PARAMETER_12", NT_STATUS_INVALID_PARAMETER_12}, + {"NT_STATUS_REDIRECTOR_NOT_STARTED", + NT_STATUS_REDIRECTOR_NOT_STARTED}, + {"NT_STATUS_REDIRECTOR_STARTED", NT_STATUS_REDIRECTOR_STARTED}, + {"NT_STATUS_STACK_OVERFLOW", NT_STATUS_STACK_OVERFLOW}, + {"NT_STATUS_NO_SUCH_PACKAGE", NT_STATUS_NO_SUCH_PACKAGE}, + {"NT_STATUS_BAD_FUNCTION_TABLE", NT_STATUS_BAD_FUNCTION_TABLE}, + {"NT_STATUS_DIRECTORY_NOT_EMPTY", NT_STATUS_DIRECTORY_NOT_EMPTY}, + {"NT_STATUS_FILE_CORRUPT_ERROR", NT_STATUS_FILE_CORRUPT_ERROR}, + {"NT_STATUS_NOT_A_DIRECTORY", NT_STATUS_NOT_A_DIRECTORY}, + {"NT_STATUS_BAD_LOGON_SESSION_STATE", + NT_STATUS_BAD_LOGON_SESSION_STATE}, + {"NT_STATUS_LOGON_SESSION_COLLISION", + NT_STATUS_LOGON_SESSION_COLLISION}, + {"NT_STATUS_NAME_TOO_LONG", NT_STATUS_NAME_TOO_LONG}, + {"NT_STATUS_FILES_OPEN", NT_STATUS_FILES_OPEN}, + {"NT_STATUS_CONNECTION_IN_USE", NT_STATUS_CONNECTION_IN_USE}, + {"NT_STATUS_MESSAGE_NOT_FOUND", NT_STATUS_MESSAGE_NOT_FOUND}, + {"NT_STATUS_PROCESS_IS_TERMINATING", + NT_STATUS_PROCESS_IS_TERMINATING}, + {"NT_STATUS_INVALID_LOGON_TYPE", NT_STATUS_INVALID_LOGON_TYPE}, + {"NT_STATUS_NO_GUID_TRANSLATION", NT_STATUS_NO_GUID_TRANSLATION}, + {"NT_STATUS_CANNOT_IMPERSONATE", NT_STATUS_CANNOT_IMPERSONATE}, + {"NT_STATUS_IMAGE_ALREADY_LOADED", NT_STATUS_IMAGE_ALREADY_LOADED}, + {"NT_STATUS_ABIOS_NOT_PRESENT", NT_STATUS_ABIOS_NOT_PRESENT}, + {"NT_STATUS_ABIOS_LID_NOT_EXIST", NT_STATUS_ABIOS_LID_NOT_EXIST}, + {"NT_STATUS_ABIOS_LID_ALREADY_OWNED", + NT_STATUS_ABIOS_LID_ALREADY_OWNED}, + {"NT_STATUS_ABIOS_NOT_LID_OWNER", NT_STATUS_ABIOS_NOT_LID_OWNER}, + {"NT_STATUS_ABIOS_INVALID_COMMAND", + NT_STATUS_ABIOS_INVALID_COMMAND}, + {"NT_STATUS_ABIOS_INVALID_LID", NT_STATUS_ABIOS_INVALID_LID}, + {"NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE", + NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE}, + {"NT_STATUS_ABIOS_INVALID_SELECTOR", + NT_STATUS_ABIOS_INVALID_SELECTOR}, + {"NT_STATUS_NO_LDT", NT_STATUS_NO_LDT}, + {"NT_STATUS_INVALID_LDT_SIZE", NT_STATUS_INVALID_LDT_SIZE}, + {"NT_STATUS_INVALID_LDT_OFFSET", NT_STATUS_INVALID_LDT_OFFSET}, + {"NT_STATUS_INVALID_LDT_DESCRIPTOR", + NT_STATUS_INVALID_LDT_DESCRIPTOR}, + {"NT_STATUS_INVALID_IMAGE_NE_FORMAT", + NT_STATUS_INVALID_IMAGE_NE_FORMAT}, + {"NT_STATUS_RXACT_INVALID_STATE", NT_STATUS_RXACT_INVALID_STATE}, + {"NT_STATUS_RXACT_COMMIT_FAILURE", NT_STATUS_RXACT_COMMIT_FAILURE}, + {"NT_STATUS_MAPPED_FILE_SIZE_ZERO", + NT_STATUS_MAPPED_FILE_SIZE_ZERO}, + {"NT_STATUS_TOO_MANY_OPENED_FILES", + NT_STATUS_TOO_MANY_OPENED_FILES}, + {"NT_STATUS_CANCELLED", NT_STATUS_CANCELLED}, + {"NT_STATUS_CANNOT_DELETE", NT_STATUS_CANNOT_DELETE}, + {"NT_STATUS_INVALID_COMPUTER_NAME", + NT_STATUS_INVALID_COMPUTER_NAME}, + {"NT_STATUS_FILE_DELETED", NT_STATUS_FILE_DELETED}, + {"NT_STATUS_SPECIAL_ACCOUNT", NT_STATUS_SPECIAL_ACCOUNT}, + {"NT_STATUS_SPECIAL_GROUP", NT_STATUS_SPECIAL_GROUP}, + {"NT_STATUS_SPECIAL_USER", NT_STATUS_SPECIAL_USER}, + {"NT_STATUS_MEMBERS_PRIMARY_GROUP", + NT_STATUS_MEMBERS_PRIMARY_GROUP}, + {"NT_STATUS_FILE_CLOSED", NT_STATUS_FILE_CLOSED}, + {"NT_STATUS_TOO_MANY_THREADS", NT_STATUS_TOO_MANY_THREADS}, + {"NT_STATUS_THREAD_NOT_IN_PROCESS", + NT_STATUS_THREAD_NOT_IN_PROCESS}, + {"NT_STATUS_TOKEN_ALREADY_IN_USE", NT_STATUS_TOKEN_ALREADY_IN_USE}, + {"NT_STATUS_PAGEFILE_QUOTA_EXCEEDED", + NT_STATUS_PAGEFILE_QUOTA_EXCEEDED}, + {"NT_STATUS_COMMITMENT_LIMIT", NT_STATUS_COMMITMENT_LIMIT}, + {"NT_STATUS_INVALID_IMAGE_LE_FORMAT", + 
NT_STATUS_INVALID_IMAGE_LE_FORMAT}, + {"NT_STATUS_INVALID_IMAGE_NOT_MZ", NT_STATUS_INVALID_IMAGE_NOT_MZ}, + {"NT_STATUS_INVALID_IMAGE_PROTECT", + NT_STATUS_INVALID_IMAGE_PROTECT}, + {"NT_STATUS_INVALID_IMAGE_WIN_16", NT_STATUS_INVALID_IMAGE_WIN_16}, + {"NT_STATUS_LOGON_SERVER_CONFLICT", + NT_STATUS_LOGON_SERVER_CONFLICT}, + {"NT_STATUS_TIME_DIFFERENCE_AT_DC", + NT_STATUS_TIME_DIFFERENCE_AT_DC}, + {"NT_STATUS_SYNCHRONIZATION_REQUIRED", + NT_STATUS_SYNCHRONIZATION_REQUIRED}, + {"NT_STATUS_DLL_NOT_FOUND", NT_STATUS_DLL_NOT_FOUND}, + {"NT_STATUS_OPEN_FAILED", NT_STATUS_OPEN_FAILED}, + {"NT_STATUS_IO_PRIVILEGE_FAILED", NT_STATUS_IO_PRIVILEGE_FAILED}, + {"NT_STATUS_ORDINAL_NOT_FOUND", NT_STATUS_ORDINAL_NOT_FOUND}, + {"NT_STATUS_ENTRYPOINT_NOT_FOUND", NT_STATUS_ENTRYPOINT_NOT_FOUND}, + {"NT_STATUS_CONTROL_C_EXIT", NT_STATUS_CONTROL_C_EXIT}, + {"NT_STATUS_LOCAL_DISCONNECT", NT_STATUS_LOCAL_DISCONNECT}, + {"NT_STATUS_REMOTE_DISCONNECT", NT_STATUS_REMOTE_DISCONNECT}, + {"NT_STATUS_REMOTE_RESOURCES", NT_STATUS_REMOTE_RESOURCES}, + {"NT_STATUS_LINK_FAILED", NT_STATUS_LINK_FAILED}, + {"NT_STATUS_LINK_TIMEOUT", NT_STATUS_LINK_TIMEOUT}, + {"NT_STATUS_INVALID_CONNECTION", NT_STATUS_INVALID_CONNECTION}, + {"NT_STATUS_INVALID_ADDRESS", NT_STATUS_INVALID_ADDRESS}, + {"NT_STATUS_DLL_INIT_FAILED", NT_STATUS_DLL_INIT_FAILED}, + {"NT_STATUS_MISSING_SYSTEMFILE", NT_STATUS_MISSING_SYSTEMFILE}, + {"NT_STATUS_UNHANDLED_EXCEPTION", NT_STATUS_UNHANDLED_EXCEPTION}, + {"NT_STATUS_APP_INIT_FAILURE", NT_STATUS_APP_INIT_FAILURE}, + {"NT_STATUS_PAGEFILE_CREATE_FAILED", + NT_STATUS_PAGEFILE_CREATE_FAILED}, + {"NT_STATUS_NO_PAGEFILE", NT_STATUS_NO_PAGEFILE}, + {"NT_STATUS_INVALID_LEVEL", NT_STATUS_INVALID_LEVEL}, + {"NT_STATUS_WRONG_PASSWORD_CORE", NT_STATUS_WRONG_PASSWORD_CORE}, + {"NT_STATUS_ILLEGAL_FLOAT_CONTEXT", + NT_STATUS_ILLEGAL_FLOAT_CONTEXT}, + {"NT_STATUS_PIPE_BROKEN", NT_STATUS_PIPE_BROKEN}, + {"NT_STATUS_REGISTRY_CORRUPT", NT_STATUS_REGISTRY_CORRUPT}, + {"NT_STATUS_REGISTRY_IO_FAILED", NT_STATUS_REGISTRY_IO_FAILED}, + {"NT_STATUS_NO_EVENT_PAIR", NT_STATUS_NO_EVENT_PAIR}, + {"NT_STATUS_UNRECOGNIZED_VOLUME", NT_STATUS_UNRECOGNIZED_VOLUME}, + {"NT_STATUS_SERIAL_NO_DEVICE_INITED", + NT_STATUS_SERIAL_NO_DEVICE_INITED}, + {"NT_STATUS_NO_SUCH_ALIAS", NT_STATUS_NO_SUCH_ALIAS}, + {"NT_STATUS_MEMBER_NOT_IN_ALIAS", NT_STATUS_MEMBER_NOT_IN_ALIAS}, + {"NT_STATUS_MEMBER_IN_ALIAS", NT_STATUS_MEMBER_IN_ALIAS}, + {"NT_STATUS_ALIAS_EXISTS", NT_STATUS_ALIAS_EXISTS}, + {"NT_STATUS_LOGON_NOT_GRANTED", NT_STATUS_LOGON_NOT_GRANTED}, + {"NT_STATUS_TOO_MANY_SECRETS", NT_STATUS_TOO_MANY_SECRETS}, + {"NT_STATUS_SECRET_TOO_LONG", NT_STATUS_SECRET_TOO_LONG}, + {"NT_STATUS_INTERNAL_DB_ERROR", NT_STATUS_INTERNAL_DB_ERROR}, + {"NT_STATUS_FULLSCREEN_MODE", NT_STATUS_FULLSCREEN_MODE}, + {"NT_STATUS_TOO_MANY_CONTEXT_IDS", NT_STATUS_TOO_MANY_CONTEXT_IDS}, + {"NT_STATUS_LOGON_TYPE_NOT_GRANTED", + NT_STATUS_LOGON_TYPE_NOT_GRANTED}, + {"NT_STATUS_NOT_REGISTRY_FILE", NT_STATUS_NOT_REGISTRY_FILE}, + {"NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED", + NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED}, + {"NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR", + NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR}, + {"NT_STATUS_FT_MISSING_MEMBER", NT_STATUS_FT_MISSING_MEMBER}, + {"NT_STATUS_ILL_FORMED_SERVICE_ENTRY", + NT_STATUS_ILL_FORMED_SERVICE_ENTRY}, + {"NT_STATUS_ILLEGAL_CHARACTER", NT_STATUS_ILLEGAL_CHARACTER}, + {"NT_STATUS_UNMAPPABLE_CHARACTER", NT_STATUS_UNMAPPABLE_CHARACTER}, + {"NT_STATUS_UNDEFINED_CHARACTER", NT_STATUS_UNDEFINED_CHARACTER}, + {"NT_STATUS_FLOPPY_VOLUME", 
NT_STATUS_FLOPPY_VOLUME}, + {"NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND", + NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND}, + {"NT_STATUS_FLOPPY_WRONG_CYLINDER", + NT_STATUS_FLOPPY_WRONG_CYLINDER}, + {"NT_STATUS_FLOPPY_UNKNOWN_ERROR", NT_STATUS_FLOPPY_UNKNOWN_ERROR}, + {"NT_STATUS_FLOPPY_BAD_REGISTERS", NT_STATUS_FLOPPY_BAD_REGISTERS}, + {"NT_STATUS_DISK_RECALIBRATE_FAILED", + NT_STATUS_DISK_RECALIBRATE_FAILED}, + {"NT_STATUS_DISK_OPERATION_FAILED", + NT_STATUS_DISK_OPERATION_FAILED}, + {"NT_STATUS_DISK_RESET_FAILED", NT_STATUS_DISK_RESET_FAILED}, + {"NT_STATUS_SHARED_IRQ_BUSY", NT_STATUS_SHARED_IRQ_BUSY}, + {"NT_STATUS_FT_ORPHANING", NT_STATUS_FT_ORPHANING}, + {"NT_STATUS_PARTITION_FAILURE", NT_STATUS_PARTITION_FAILURE}, + {"NT_STATUS_INVALID_BLOCK_LENGTH", NT_STATUS_INVALID_BLOCK_LENGTH}, + {"NT_STATUS_DEVICE_NOT_PARTITIONED", + NT_STATUS_DEVICE_NOT_PARTITIONED}, + {"NT_STATUS_UNABLE_TO_LOCK_MEDIA", NT_STATUS_UNABLE_TO_LOCK_MEDIA}, + {"NT_STATUS_UNABLE_TO_UNLOAD_MEDIA", + NT_STATUS_UNABLE_TO_UNLOAD_MEDIA}, + {"NT_STATUS_EOM_OVERFLOW", NT_STATUS_EOM_OVERFLOW}, + {"NT_STATUS_NO_MEDIA", NT_STATUS_NO_MEDIA}, + {"NT_STATUS_NO_SUCH_MEMBER", NT_STATUS_NO_SUCH_MEMBER}, + {"NT_STATUS_INVALID_MEMBER", NT_STATUS_INVALID_MEMBER}, + {"NT_STATUS_KEY_DELETED", NT_STATUS_KEY_DELETED}, + {"NT_STATUS_NO_LOG_SPACE", NT_STATUS_NO_LOG_SPACE}, + {"NT_STATUS_TOO_MANY_SIDS", NT_STATUS_TOO_MANY_SIDS}, + {"NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED", + NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED}, + {"NT_STATUS_KEY_HAS_CHILDREN", NT_STATUS_KEY_HAS_CHILDREN}, + {"NT_STATUS_CHILD_MUST_BE_VOLATILE", + NT_STATUS_CHILD_MUST_BE_VOLATILE}, + {"NT_STATUS_DEVICE_CONFIGURATION_ERROR", + NT_STATUS_DEVICE_CONFIGURATION_ERROR}, + {"NT_STATUS_DRIVER_INTERNAL_ERROR", + NT_STATUS_DRIVER_INTERNAL_ERROR}, + {"NT_STATUS_INVALID_DEVICE_STATE", NT_STATUS_INVALID_DEVICE_STATE}, + {"NT_STATUS_IO_DEVICE_ERROR", NT_STATUS_IO_DEVICE_ERROR}, + {"NT_STATUS_DEVICE_PROTOCOL_ERROR", + NT_STATUS_DEVICE_PROTOCOL_ERROR}, + {"NT_STATUS_BACKUP_CONTROLLER", NT_STATUS_BACKUP_CONTROLLER}, + {"NT_STATUS_LOG_FILE_FULL", NT_STATUS_LOG_FILE_FULL}, + {"NT_STATUS_TOO_LATE", NT_STATUS_TOO_LATE}, + {"NT_STATUS_NO_TRUST_LSA_SECRET", NT_STATUS_NO_TRUST_LSA_SECRET}, + {"NT_STATUS_NO_TRUST_SAM_ACCOUNT", NT_STATUS_NO_TRUST_SAM_ACCOUNT}, + {"NT_STATUS_TRUSTED_DOMAIN_FAILURE", + NT_STATUS_TRUSTED_DOMAIN_FAILURE}, + {"NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE", + NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE}, + {"NT_STATUS_EVENTLOG_FILE_CORRUPT", + NT_STATUS_EVENTLOG_FILE_CORRUPT}, + {"NT_STATUS_EVENTLOG_CANT_START", NT_STATUS_EVENTLOG_CANT_START}, + {"NT_STATUS_TRUST_FAILURE", NT_STATUS_TRUST_FAILURE}, + {"NT_STATUS_MUTANT_LIMIT_EXCEEDED", + NT_STATUS_MUTANT_LIMIT_EXCEEDED}, + {"NT_STATUS_NETLOGON_NOT_STARTED", NT_STATUS_NETLOGON_NOT_STARTED}, + {"NT_STATUS_ACCOUNT_EXPIRED", NT_STATUS_ACCOUNT_EXPIRED}, + {"NT_STATUS_POSSIBLE_DEADLOCK", NT_STATUS_POSSIBLE_DEADLOCK}, + {"NT_STATUS_NETWORK_CREDENTIAL_CONFLICT", + NT_STATUS_NETWORK_CREDENTIAL_CONFLICT}, + {"NT_STATUS_REMOTE_SESSION_LIMIT", NT_STATUS_REMOTE_SESSION_LIMIT}, + {"NT_STATUS_EVENTLOG_FILE_CHANGED", + NT_STATUS_EVENTLOG_FILE_CHANGED}, + {"NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT", + NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT}, + {"NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT", + NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT}, + {"NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT", + NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT}, + {"NT_STATUS_DOMAIN_TRUST_INCONSISTENT", + NT_STATUS_DOMAIN_TRUST_INCONSISTENT}, + {"NT_STATUS_FS_DRIVER_REQUIRED", 
NT_STATUS_FS_DRIVER_REQUIRED}, + {"NT_STATUS_NO_USER_SESSION_KEY", NT_STATUS_NO_USER_SESSION_KEY}, + {"NT_STATUS_USER_SESSION_DELETED", NT_STATUS_USER_SESSION_DELETED}, + {"NT_STATUS_RESOURCE_LANG_NOT_FOUND", + NT_STATUS_RESOURCE_LANG_NOT_FOUND}, + {"NT_STATUS_INSUFF_SERVER_RESOURCES", + NT_STATUS_INSUFF_SERVER_RESOURCES}, + {"NT_STATUS_INVALID_BUFFER_SIZE", NT_STATUS_INVALID_BUFFER_SIZE}, + {"NT_STATUS_INVALID_ADDRESS_COMPONENT", + NT_STATUS_INVALID_ADDRESS_COMPONENT}, + {"NT_STATUS_INVALID_ADDRESS_WILDCARD", + NT_STATUS_INVALID_ADDRESS_WILDCARD}, + {"NT_STATUS_TOO_MANY_ADDRESSES", NT_STATUS_TOO_MANY_ADDRESSES}, + {"NT_STATUS_ADDRESS_ALREADY_EXISTS", + NT_STATUS_ADDRESS_ALREADY_EXISTS}, + {"NT_STATUS_ADDRESS_CLOSED", NT_STATUS_ADDRESS_CLOSED}, + {"NT_STATUS_CONNECTION_DISCONNECTED", + NT_STATUS_CONNECTION_DISCONNECTED}, + {"NT_STATUS_CONNECTION_RESET", NT_STATUS_CONNECTION_RESET}, + {"NT_STATUS_TOO_MANY_NODES", NT_STATUS_TOO_MANY_NODES}, + {"NT_STATUS_TRANSACTION_ABORTED", NT_STATUS_TRANSACTION_ABORTED}, + {"NT_STATUS_TRANSACTION_TIMED_OUT", + NT_STATUS_TRANSACTION_TIMED_OUT}, + {"NT_STATUS_TRANSACTION_NO_RELEASE", + NT_STATUS_TRANSACTION_NO_RELEASE}, + {"NT_STATUS_TRANSACTION_NO_MATCH", NT_STATUS_TRANSACTION_NO_MATCH}, + {"NT_STATUS_TRANSACTION_RESPONDED", + NT_STATUS_TRANSACTION_RESPONDED}, + {"NT_STATUS_TRANSACTION_INVALID_ID", + NT_STATUS_TRANSACTION_INVALID_ID}, + {"NT_STATUS_TRANSACTION_INVALID_TYPE", + NT_STATUS_TRANSACTION_INVALID_TYPE}, + {"NT_STATUS_NOT_SERVER_SESSION", NT_STATUS_NOT_SERVER_SESSION}, + {"NT_STATUS_NOT_CLIENT_SESSION", NT_STATUS_NOT_CLIENT_SESSION}, + {"NT_STATUS_CANNOT_LOAD_REGISTRY_FILE", + NT_STATUS_CANNOT_LOAD_REGISTRY_FILE}, + {"NT_STATUS_DEBUG_ATTACH_FAILED", NT_STATUS_DEBUG_ATTACH_FAILED}, + {"NT_STATUS_SYSTEM_PROCESS_TERMINATED", + NT_STATUS_SYSTEM_PROCESS_TERMINATED}, + {"NT_STATUS_DATA_NOT_ACCEPTED", NT_STATUS_DATA_NOT_ACCEPTED}, + {"NT_STATUS_NO_BROWSER_SERVERS_FOUND", + NT_STATUS_NO_BROWSER_SERVERS_FOUND}, + {"NT_STATUS_VDM_HARD_ERROR", NT_STATUS_VDM_HARD_ERROR}, + {"NT_STATUS_DRIVER_CANCEL_TIMEOUT", + NT_STATUS_DRIVER_CANCEL_TIMEOUT}, + {"NT_STATUS_REPLY_MESSAGE_MISMATCH", + NT_STATUS_REPLY_MESSAGE_MISMATCH}, + {"NT_STATUS_MAPPED_ALIGNMENT", NT_STATUS_MAPPED_ALIGNMENT}, + {"NT_STATUS_IMAGE_CHECKSUM_MISMATCH", + NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, + {"NT_STATUS_LOST_WRITEBEHIND_DATA", + NT_STATUS_LOST_WRITEBEHIND_DATA}, + {"NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID", + NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, + {"NT_STATUS_PASSWORD_MUST_CHANGE", NT_STATUS_PASSWORD_MUST_CHANGE}, + {"NT_STATUS_NOT_FOUND", NT_STATUS_NOT_FOUND}, + {"NT_STATUS_NOT_TINY_STREAM", NT_STATUS_NOT_TINY_STREAM}, + {"NT_STATUS_RECOVERY_FAILURE", NT_STATUS_RECOVERY_FAILURE}, + {"NT_STATUS_STACK_OVERFLOW_READ", NT_STATUS_STACK_OVERFLOW_READ}, + {"NT_STATUS_FAIL_CHECK", NT_STATUS_FAIL_CHECK}, + {"NT_STATUS_DUPLICATE_OBJECTID", NT_STATUS_DUPLICATE_OBJECTID}, + {"NT_STATUS_OBJECTID_EXISTS", NT_STATUS_OBJECTID_EXISTS}, + {"NT_STATUS_CONVERT_TO_LARGE", NT_STATUS_CONVERT_TO_LARGE}, + {"NT_STATUS_RETRY", NT_STATUS_RETRY}, + {"NT_STATUS_FOUND_OUT_OF_SCOPE", NT_STATUS_FOUND_OUT_OF_SCOPE}, + {"NT_STATUS_ALLOCATE_BUCKET", NT_STATUS_ALLOCATE_BUCKET}, + {"NT_STATUS_PROPSET_NOT_FOUND", NT_STATUS_PROPSET_NOT_FOUND}, + {"NT_STATUS_MARSHALL_OVERFLOW", NT_STATUS_MARSHALL_OVERFLOW}, + {"NT_STATUS_INVALID_VARIANT", NT_STATUS_INVALID_VARIANT}, + {"NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND", + NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND}, + {"NT_STATUS_ACCOUNT_LOCKED_OUT", NT_STATUS_ACCOUNT_LOCKED_OUT}, + 
{"NT_STATUS_HANDLE_NOT_CLOSABLE", NT_STATUS_HANDLE_NOT_CLOSABLE}, + {"NT_STATUS_CONNECTION_REFUSED", NT_STATUS_CONNECTION_REFUSED}, + {"NT_STATUS_GRACEFUL_DISCONNECT", NT_STATUS_GRACEFUL_DISCONNECT}, + {"NT_STATUS_ADDRESS_ALREADY_ASSOCIATED", + NT_STATUS_ADDRESS_ALREADY_ASSOCIATED}, + {"NT_STATUS_ADDRESS_NOT_ASSOCIATED", + NT_STATUS_ADDRESS_NOT_ASSOCIATED}, + {"NT_STATUS_CONNECTION_INVALID", NT_STATUS_CONNECTION_INVALID}, + {"NT_STATUS_CONNECTION_ACTIVE", NT_STATUS_CONNECTION_ACTIVE}, + {"NT_STATUS_NETWORK_UNREACHABLE", NT_STATUS_NETWORK_UNREACHABLE}, + {"NT_STATUS_HOST_UNREACHABLE", NT_STATUS_HOST_UNREACHABLE}, + {"NT_STATUS_PROTOCOL_UNREACHABLE", NT_STATUS_PROTOCOL_UNREACHABLE}, + {"NT_STATUS_PORT_UNREACHABLE", NT_STATUS_PORT_UNREACHABLE}, + {"NT_STATUS_REQUEST_ABORTED", NT_STATUS_REQUEST_ABORTED}, + {"NT_STATUS_CONNECTION_ABORTED", NT_STATUS_CONNECTION_ABORTED}, + {"NT_STATUS_BAD_COMPRESSION_BUFFER", + NT_STATUS_BAD_COMPRESSION_BUFFER}, + {"NT_STATUS_USER_MAPPED_FILE", NT_STATUS_USER_MAPPED_FILE}, + {"NT_STATUS_AUDIT_FAILED", NT_STATUS_AUDIT_FAILED}, + {"NT_STATUS_TIMER_RESOLUTION_NOT_SET", + NT_STATUS_TIMER_RESOLUTION_NOT_SET}, + {"NT_STATUS_CONNECTION_COUNT_LIMIT", + NT_STATUS_CONNECTION_COUNT_LIMIT}, + {"NT_STATUS_LOGIN_TIME_RESTRICTION", + NT_STATUS_LOGIN_TIME_RESTRICTION}, + {"NT_STATUS_LOGIN_WKSTA_RESTRICTION", + NT_STATUS_LOGIN_WKSTA_RESTRICTION}, + {"NT_STATUS_IMAGE_MP_UP_MISMATCH", NT_STATUS_IMAGE_MP_UP_MISMATCH}, + {"NT_STATUS_INSUFFICIENT_LOGON_INFO", + NT_STATUS_INSUFFICIENT_LOGON_INFO}, + {"NT_STATUS_BAD_DLL_ENTRYPOINT", NT_STATUS_BAD_DLL_ENTRYPOINT}, + {"NT_STATUS_BAD_SERVICE_ENTRYPOINT", + NT_STATUS_BAD_SERVICE_ENTRYPOINT}, + {"NT_STATUS_LPC_REPLY_LOST", NT_STATUS_LPC_REPLY_LOST}, + {"NT_STATUS_IP_ADDRESS_CONFLICT1", NT_STATUS_IP_ADDRESS_CONFLICT1}, + {"NT_STATUS_IP_ADDRESS_CONFLICT2", NT_STATUS_IP_ADDRESS_CONFLICT2}, + {"NT_STATUS_REGISTRY_QUOTA_LIMIT", NT_STATUS_REGISTRY_QUOTA_LIMIT}, + {"NT_STATUS_PATH_NOT_COVERED", NT_STATUS_PATH_NOT_COVERED}, + {"NT_STATUS_NO_CALLBACK_ACTIVE", NT_STATUS_NO_CALLBACK_ACTIVE}, + {"NT_STATUS_LICENSE_QUOTA_EXCEEDED", + NT_STATUS_LICENSE_QUOTA_EXCEEDED}, + {"NT_STATUS_PWD_TOO_SHORT", NT_STATUS_PWD_TOO_SHORT}, + {"NT_STATUS_PWD_TOO_RECENT", NT_STATUS_PWD_TOO_RECENT}, + {"NT_STATUS_PWD_HISTORY_CONFLICT", NT_STATUS_PWD_HISTORY_CONFLICT}, + {"NT_STATUS_PLUGPLAY_NO_DEVICE", NT_STATUS_PLUGPLAY_NO_DEVICE}, + {"NT_STATUS_UNSUPPORTED_COMPRESSION", + NT_STATUS_UNSUPPORTED_COMPRESSION}, + {"NT_STATUS_INVALID_HW_PROFILE", NT_STATUS_INVALID_HW_PROFILE}, + {"NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH", + NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH}, + {"NT_STATUS_DRIVER_ORDINAL_NOT_FOUND", + NT_STATUS_DRIVER_ORDINAL_NOT_FOUND}, + {"NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND", + NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND}, + {"NT_STATUS_RESOURCE_NOT_OWNED", NT_STATUS_RESOURCE_NOT_OWNED}, + {"NT_STATUS_TOO_MANY_LINKS", NT_STATUS_TOO_MANY_LINKS}, + {"NT_STATUS_QUOTA_LIST_INCONSISTENT", + NT_STATUS_QUOTA_LIST_INCONSISTENT}, + {"NT_STATUS_FILE_IS_OFFLINE", NT_STATUS_FILE_IS_OFFLINE}, + {"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES}, + {"STATUS_MORE_ENTRIES", STATUS_MORE_ENTRIES}, + {"STATUS_SOME_UNMAPPED", STATUS_SOME_UNMAPPED}, + {NULL, 0} +}; diff --git a/fs/cifs/nterr.h b/fs/cifs/nterr.h new file mode 100644 index 00000000..25726736 --- /dev/null +++ b/fs/cifs/nterr.h @@ -0,0 +1,561 @@ +/* + Unix SMB/Netbios implementation. + Version 1.9. 
+ NT error code constants + Copyright (C) Andrew Tridgell 1992-2000 + Copyright (C) John H Terpstra 1996-2000 + Copyright (C) Luke Kenneth Casson Leighton 1996-2000 + Copyright (C) Paul Ashton 1998-2000 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + + + +#ifndef _NTERR_H +#define _NTERR_H + +struct nt_err_code_struct { + char *nt_errstr; + __u32 nt_errcode; +}; + +extern const struct nt_err_code_struct nt_errs[]; + +/* Win32 Status codes. */ +#define STATUS_MORE_ENTRIES 0x0105 +#define ERROR_INVALID_PARAMETER 0x0057 +#define ERROR_INSUFFICIENT_BUFFER 0x007a +#define STATUS_1804 0x070c +#define STATUS_NOTIFY_ENUM_DIR 0x010c + +/* Win32 Error codes extracted using a loop in smbclient then printing a + netmon sniff to a file. */ + +#define NT_STATUS_OK 0x0000 +#define STATUS_SOME_UNMAPPED 0x0107 +#define STATUS_BUFFER_OVERFLOW 0x80000005 +#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a +#define NT_STATUS_MEDIA_CHANGED 0x8000001c +#define NT_STATUS_END_OF_MEDIA 0x8000001e +#define NT_STATUS_MEDIA_CHECK 0x80000020 +#define NT_STATUS_NO_DATA_DETECTED 0x8000001c +#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d +#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288 +#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000288 +#define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001 +#define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002 +#define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003 +#define NT_STATUS_INFO_LENGTH_MISMATCH 0xC0000000 | 0x0004 +#define NT_STATUS_ACCESS_VIOLATION 0xC0000000 | 0x0005 +#define NT_STATUS_IN_PAGE_ERROR 0xC0000000 | 0x0006 +#define NT_STATUS_PAGEFILE_QUOTA 0xC0000000 | 0x0007 +#define NT_STATUS_INVALID_HANDLE 0xC0000000 | 0x0008 +#define NT_STATUS_BAD_INITIAL_STACK 0xC0000000 | 0x0009 +#define NT_STATUS_BAD_INITIAL_PC 0xC0000000 | 0x000a +#define NT_STATUS_INVALID_CID 0xC0000000 | 0x000b +#define NT_STATUS_TIMER_NOT_CANCELED 0xC0000000 | 0x000c +#define NT_STATUS_INVALID_PARAMETER 0xC0000000 | 0x000d +#define NT_STATUS_NO_SUCH_DEVICE 0xC0000000 | 0x000e +#define NT_STATUS_NO_SUCH_FILE 0xC0000000 | 0x000f +#define NT_STATUS_INVALID_DEVICE_REQUEST 0xC0000000 | 0x0010 +#define NT_STATUS_END_OF_FILE 0xC0000000 | 0x0011 +#define NT_STATUS_WRONG_VOLUME 0xC0000000 | 0x0012 +#define NT_STATUS_NO_MEDIA_IN_DEVICE 0xC0000000 | 0x0013 +#define NT_STATUS_UNRECOGNIZED_MEDIA 0xC0000000 | 0x0014 +#define NT_STATUS_NONEXISTENT_SECTOR 0xC0000000 | 0x0015 +#define NT_STATUS_MORE_PROCESSING_REQUIRED 0xC0000000 | 0x0016 +#define NT_STATUS_NO_MEMORY 0xC0000000 | 0x0017 +#define NT_STATUS_CONFLICTING_ADDRESSES 0xC0000000 | 0x0018 +#define NT_STATUS_NOT_MAPPED_VIEW 0xC0000000 | 0x0019 +#define NT_STATUS_UNABLE_TO_FREE_VM 0x80000000 | 0x001a +#define NT_STATUS_UNABLE_TO_DELETE_SECTION 0xC0000000 | 0x001b +#define NT_STATUS_INVALID_SYSTEM_SERVICE 0xC0000000 | 0x001c +#define NT_STATUS_ILLEGAL_INSTRUCTION 0xC0000000 | 0x001d +#define NT_STATUS_INVALID_LOCK_SEQUENCE 0xC0000000 | 
0x001e +#define NT_STATUS_INVALID_VIEW_SIZE 0xC0000000 | 0x001f +#define NT_STATUS_INVALID_FILE_FOR_SECTION 0xC0000000 | 0x0020 +#define NT_STATUS_ALREADY_COMMITTED 0xC0000000 | 0x0021 +#define NT_STATUS_ACCESS_DENIED 0xC0000000 | 0x0022 +#define NT_STATUS_BUFFER_TOO_SMALL 0xC0000000 | 0x0023 +#define NT_STATUS_OBJECT_TYPE_MISMATCH 0xC0000000 | 0x0024 +#define NT_STATUS_NONCONTINUABLE_EXCEPTION 0xC0000000 | 0x0025 +#define NT_STATUS_INVALID_DISPOSITION 0xC0000000 | 0x0026 +#define NT_STATUS_UNWIND 0xC0000000 | 0x0027 +#define NT_STATUS_BAD_STACK 0xC0000000 | 0x0028 +#define NT_STATUS_INVALID_UNWIND_TARGET 0xC0000000 | 0x0029 +#define NT_STATUS_NOT_LOCKED 0xC0000000 | 0x002a +#define NT_STATUS_PARITY_ERROR 0xC0000000 | 0x002b +#define NT_STATUS_UNABLE_TO_DECOMMIT_VM 0xC0000000 | 0x002c +#define NT_STATUS_NOT_COMMITTED 0xC0000000 | 0x002d +#define NT_STATUS_INVALID_PORT_ATTRIBUTES 0xC0000000 | 0x002e +#define NT_STATUS_PORT_MESSAGE_TOO_LONG 0xC0000000 | 0x002f +#define NT_STATUS_INVALID_PARAMETER_MIX 0xC0000000 | 0x0030 +#define NT_STATUS_INVALID_QUOTA_LOWER 0xC0000000 | 0x0031 +#define NT_STATUS_DISK_CORRUPT_ERROR 0xC0000000 | 0x0032 +#define NT_STATUS_OBJECT_NAME_INVALID 0xC0000000 | 0x0033 +#define NT_STATUS_OBJECT_NAME_NOT_FOUND 0xC0000000 | 0x0034 +#define NT_STATUS_OBJECT_NAME_COLLISION 0xC0000000 | 0x0035 +#define NT_STATUS_HANDLE_NOT_WAITABLE 0xC0000000 | 0x0036 +#define NT_STATUS_PORT_DISCONNECTED 0xC0000000 | 0x0037 +#define NT_STATUS_DEVICE_ALREADY_ATTACHED 0xC0000000 | 0x0038 +#define NT_STATUS_OBJECT_PATH_INVALID 0xC0000000 | 0x0039 +#define NT_STATUS_OBJECT_PATH_NOT_FOUND 0xC0000000 | 0x003a +#define NT_STATUS_OBJECT_PATH_SYNTAX_BAD 0xC0000000 | 0x003b +#define NT_STATUS_DATA_OVERRUN 0xC0000000 | 0x003c +#define NT_STATUS_DATA_LATE_ERROR 0xC0000000 | 0x003d +#define NT_STATUS_DATA_ERROR 0xC0000000 | 0x003e +#define NT_STATUS_CRC_ERROR 0xC0000000 | 0x003f +#define NT_STATUS_SECTION_TOO_BIG 0xC0000000 | 0x0040 +#define NT_STATUS_PORT_CONNECTION_REFUSED 0xC0000000 | 0x0041 +#define NT_STATUS_INVALID_PORT_HANDLE 0xC0000000 | 0x0042 +#define NT_STATUS_SHARING_VIOLATION 0xC0000000 | 0x0043 +#define NT_STATUS_QUOTA_EXCEEDED 0xC0000000 | 0x0044 +#define NT_STATUS_INVALID_PAGE_PROTECTION 0xC0000000 | 0x0045 +#define NT_STATUS_MUTANT_NOT_OWNED 0xC0000000 | 0x0046 +#define NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED 0xC0000000 | 0x0047 +#define NT_STATUS_PORT_ALREADY_SET 0xC0000000 | 0x0048 +#define NT_STATUS_SECTION_NOT_IMAGE 0xC0000000 | 0x0049 +#define NT_STATUS_SUSPEND_COUNT_EXCEEDED 0xC0000000 | 0x004a +#define NT_STATUS_THREAD_IS_TERMINATING 0xC0000000 | 0x004b +#define NT_STATUS_BAD_WORKING_SET_LIMIT 0xC0000000 | 0x004c +#define NT_STATUS_INCOMPATIBLE_FILE_MAP 0xC0000000 | 0x004d +#define NT_STATUS_SECTION_PROTECTION 0xC0000000 | 0x004e +#define NT_STATUS_EAS_NOT_SUPPORTED 0xC0000000 | 0x004f +#define NT_STATUS_EA_TOO_LARGE 0xC0000000 | 0x0050 +#define NT_STATUS_NONEXISTENT_EA_ENTRY 0xC0000000 | 0x0051 +#define NT_STATUS_NO_EAS_ON_FILE 0xC0000000 | 0x0052 +#define NT_STATUS_EA_CORRUPT_ERROR 0xC0000000 | 0x0053 +#define NT_STATUS_FILE_LOCK_CONFLICT 0xC0000000 | 0x0054 +#define NT_STATUS_LOCK_NOT_GRANTED 0xC0000000 | 0x0055 +#define NT_STATUS_DELETE_PENDING 0xC0000000 | 0x0056 +#define NT_STATUS_CTL_FILE_NOT_SUPPORTED 0xC0000000 | 0x0057 +#define NT_STATUS_UNKNOWN_REVISION 0xC0000000 | 0x0058 +#define NT_STATUS_REVISION_MISMATCH 0xC0000000 | 0x0059 +#define NT_STATUS_INVALID_OWNER 0xC0000000 | 0x005a +#define NT_STATUS_INVALID_PRIMARY_GROUP 0xC0000000 | 0x005b +#define 
NT_STATUS_NO_IMPERSONATION_TOKEN 0xC0000000 | 0x005c +#define NT_STATUS_CANT_DISABLE_MANDATORY 0xC0000000 | 0x005d +#define NT_STATUS_NO_LOGON_SERVERS 0xC0000000 | 0x005e +#define NT_STATUS_NO_SUCH_LOGON_SESSION 0xC0000000 | 0x005f +#define NT_STATUS_NO_SUCH_PRIVILEGE 0xC0000000 | 0x0060 +#define NT_STATUS_PRIVILEGE_NOT_HELD 0xC0000000 | 0x0061 +#define NT_STATUS_INVALID_ACCOUNT_NAME 0xC0000000 | 0x0062 +#define NT_STATUS_USER_EXISTS 0xC0000000 | 0x0063 +#define NT_STATUS_NO_SUCH_USER 0xC0000000 | 0x0064 +#define NT_STATUS_GROUP_EXISTS 0xC0000000 | 0x0065 +#define NT_STATUS_NO_SUCH_GROUP 0xC0000000 | 0x0066 +#define NT_STATUS_MEMBER_IN_GROUP 0xC0000000 | 0x0067 +#define NT_STATUS_MEMBER_NOT_IN_GROUP 0xC0000000 | 0x0068 +#define NT_STATUS_LAST_ADMIN 0xC0000000 | 0x0069 +#define NT_STATUS_WRONG_PASSWORD 0xC0000000 | 0x006a +#define NT_STATUS_ILL_FORMED_PASSWORD 0xC0000000 | 0x006b +#define NT_STATUS_PASSWORD_RESTRICTION 0xC0000000 | 0x006c +#define NT_STATUS_LOGON_FAILURE 0xC0000000 | 0x006d +#define NT_STATUS_ACCOUNT_RESTRICTION 0xC0000000 | 0x006e +#define NT_STATUS_INVALID_LOGON_HOURS 0xC0000000 | 0x006f +#define NT_STATUS_INVALID_WORKSTATION 0xC0000000 | 0x0070 +#define NT_STATUS_PASSWORD_EXPIRED 0xC0000000 | 0x0071 +#define NT_STATUS_ACCOUNT_DISABLED 0xC0000000 | 0x0072 +#define NT_STATUS_NONE_MAPPED 0xC0000000 | 0x0073 +#define NT_STATUS_TOO_MANY_LUIDS_REQUESTED 0xC0000000 | 0x0074 +#define NT_STATUS_LUIDS_EXHAUSTED 0xC0000000 | 0x0075 +#define NT_STATUS_INVALID_SUB_AUTHORITY 0xC0000000 | 0x0076 +#define NT_STATUS_INVALID_ACL 0xC0000000 | 0x0077 +#define NT_STATUS_INVALID_SID 0xC0000000 | 0x0078 +#define NT_STATUS_INVALID_SECURITY_DESCR 0xC0000000 | 0x0079 +#define NT_STATUS_PROCEDURE_NOT_FOUND 0xC0000000 | 0x007a +#define NT_STATUS_INVALID_IMAGE_FORMAT 0xC0000000 | 0x007b +#define NT_STATUS_NO_TOKEN 0xC0000000 | 0x007c +#define NT_STATUS_BAD_INHERITANCE_ACL 0xC0000000 | 0x007d +#define NT_STATUS_RANGE_NOT_LOCKED 0xC0000000 | 0x007e +#define NT_STATUS_DISK_FULL 0xC0000000 | 0x007f +#define NT_STATUS_SERVER_DISABLED 0xC0000000 | 0x0080 +#define NT_STATUS_SERVER_NOT_DISABLED 0xC0000000 | 0x0081 +#define NT_STATUS_TOO_MANY_GUIDS_REQUESTED 0xC0000000 | 0x0082 +#define NT_STATUS_GUIDS_EXHAUSTED 0xC0000000 | 0x0083 +#define NT_STATUS_INVALID_ID_AUTHORITY 0xC0000000 | 0x0084 +#define NT_STATUS_AGENTS_EXHAUSTED 0xC0000000 | 0x0085 +#define NT_STATUS_INVALID_VOLUME_LABEL 0xC0000000 | 0x0086 +#define NT_STATUS_SECTION_NOT_EXTENDED 0xC0000000 | 0x0087 +#define NT_STATUS_NOT_MAPPED_DATA 0xC0000000 | 0x0088 +#define NT_STATUS_RESOURCE_DATA_NOT_FOUND 0xC0000000 | 0x0089 +#define NT_STATUS_RESOURCE_TYPE_NOT_FOUND 0xC0000000 | 0x008a +#define NT_STATUS_RESOURCE_NAME_NOT_FOUND 0xC0000000 | 0x008b +#define NT_STATUS_ARRAY_BOUNDS_EXCEEDED 0xC0000000 | 0x008c +#define NT_STATUS_FLOAT_DENORMAL_OPERAND 0xC0000000 | 0x008d +#define NT_STATUS_FLOAT_DIVIDE_BY_ZERO 0xC0000000 | 0x008e +#define NT_STATUS_FLOAT_INEXACT_RESULT 0xC0000000 | 0x008f +#define NT_STATUS_FLOAT_INVALID_OPERATION 0xC0000000 | 0x0090 +#define NT_STATUS_FLOAT_OVERFLOW 0xC0000000 | 0x0091 +#define NT_STATUS_FLOAT_STACK_CHECK 0xC0000000 | 0x0092 +#define NT_STATUS_FLOAT_UNDERFLOW 0xC0000000 | 0x0093 +#define NT_STATUS_INTEGER_DIVIDE_BY_ZERO 0xC0000000 | 0x0094 +#define NT_STATUS_INTEGER_OVERFLOW 0xC0000000 | 0x0095 +#define NT_STATUS_PRIVILEGED_INSTRUCTION 0xC0000000 | 0x0096 +#define NT_STATUS_TOO_MANY_PAGING_FILES 0xC0000000 | 0x0097 +#define NT_STATUS_FILE_INVALID 0xC0000000 | 0x0098 +#define NT_STATUS_ALLOTTED_SPACE_EXCEEDED 0xC0000000 | 
0x0099 +#define NT_STATUS_INSUFFICIENT_RESOURCES 0xC0000000 | 0x009a +#define NT_STATUS_DFS_EXIT_PATH_FOUND 0xC0000000 | 0x009b +#define NT_STATUS_DEVICE_DATA_ERROR 0xC0000000 | 0x009c +#define NT_STATUS_DEVICE_NOT_CONNECTED 0xC0000000 | 0x009d +#define NT_STATUS_DEVICE_POWER_FAILURE 0xC0000000 | 0x009e +#define NT_STATUS_FREE_VM_NOT_AT_BASE 0xC0000000 | 0x009f +#define NT_STATUS_MEMORY_NOT_ALLOCATED 0xC0000000 | 0x00a0 +#define NT_STATUS_WORKING_SET_QUOTA 0xC0000000 | 0x00a1 +#define NT_STATUS_MEDIA_WRITE_PROTECTED 0xC0000000 | 0x00a2 +#define NT_STATUS_DEVICE_NOT_READY 0xC0000000 | 0x00a3 +#define NT_STATUS_INVALID_GROUP_ATTRIBUTES 0xC0000000 | 0x00a4 +#define NT_STATUS_BAD_IMPERSONATION_LEVEL 0xC0000000 | 0x00a5 +#define NT_STATUS_CANT_OPEN_ANONYMOUS 0xC0000000 | 0x00a6 +#define NT_STATUS_BAD_VALIDATION_CLASS 0xC0000000 | 0x00a7 +#define NT_STATUS_BAD_TOKEN_TYPE 0xC0000000 | 0x00a8 +#define NT_STATUS_BAD_MASTER_BOOT_RECORD 0xC0000000 | 0x00a9 +#define NT_STATUS_INSTRUCTION_MISALIGNMENT 0xC0000000 | 0x00aa +#define NT_STATUS_INSTANCE_NOT_AVAILABLE 0xC0000000 | 0x00ab +#define NT_STATUS_PIPE_NOT_AVAILABLE 0xC0000000 | 0x00ac +#define NT_STATUS_INVALID_PIPE_STATE 0xC0000000 | 0x00ad +#define NT_STATUS_PIPE_BUSY 0xC0000000 | 0x00ae +#define NT_STATUS_ILLEGAL_FUNCTION 0xC0000000 | 0x00af +#define NT_STATUS_PIPE_DISCONNECTED 0xC0000000 | 0x00b0 +#define NT_STATUS_PIPE_CLOSING 0xC0000000 | 0x00b1 +#define NT_STATUS_PIPE_CONNECTED 0xC0000000 | 0x00b2 +#define NT_STATUS_PIPE_LISTENING 0xC0000000 | 0x00b3 +#define NT_STATUS_INVALID_READ_MODE 0xC0000000 | 0x00b4 +#define NT_STATUS_IO_TIMEOUT 0xC0000000 | 0x00b5 +#define NT_STATUS_FILE_FORCED_CLOSED 0xC0000000 | 0x00b6 +#define NT_STATUS_PROFILING_NOT_STARTED 0xC0000000 | 0x00b7 +#define NT_STATUS_PROFILING_NOT_STOPPED 0xC0000000 | 0x00b8 +#define NT_STATUS_COULD_NOT_INTERPRET 0xC0000000 | 0x00b9 +#define NT_STATUS_FILE_IS_A_DIRECTORY 0xC0000000 | 0x00ba +#define NT_STATUS_NOT_SUPPORTED 0xC0000000 | 0x00bb +#define NT_STATUS_REMOTE_NOT_LISTENING 0xC0000000 | 0x00bc +#define NT_STATUS_DUPLICATE_NAME 0xC0000000 | 0x00bd +#define NT_STATUS_BAD_NETWORK_PATH 0xC0000000 | 0x00be +#define NT_STATUS_NETWORK_BUSY 0xC0000000 | 0x00bf +#define NT_STATUS_DEVICE_DOES_NOT_EXIST 0xC0000000 | 0x00c0 +#define NT_STATUS_TOO_MANY_COMMANDS 0xC0000000 | 0x00c1 +#define NT_STATUS_ADAPTER_HARDWARE_ERROR 0xC0000000 | 0x00c2 +#define NT_STATUS_INVALID_NETWORK_RESPONSE 0xC0000000 | 0x00c3 +#define NT_STATUS_UNEXPECTED_NETWORK_ERROR 0xC0000000 | 0x00c4 +#define NT_STATUS_BAD_REMOTE_ADAPTER 0xC0000000 | 0x00c5 +#define NT_STATUS_PRINT_QUEUE_FULL 0xC0000000 | 0x00c6 +#define NT_STATUS_NO_SPOOL_SPACE 0xC0000000 | 0x00c7 +#define NT_STATUS_PRINT_CANCELLED 0xC0000000 | 0x00c8 +#define NT_STATUS_NETWORK_NAME_DELETED 0xC0000000 | 0x00c9 +#define NT_STATUS_NETWORK_ACCESS_DENIED 0xC0000000 | 0x00ca +#define NT_STATUS_BAD_DEVICE_TYPE 0xC0000000 | 0x00cb +#define NT_STATUS_BAD_NETWORK_NAME 0xC0000000 | 0x00cc +#define NT_STATUS_TOO_MANY_NAMES 0xC0000000 | 0x00cd +#define NT_STATUS_TOO_MANY_SESSIONS 0xC0000000 | 0x00ce +#define NT_STATUS_SHARING_PAUSED 0xC0000000 | 0x00cf +#define NT_STATUS_REQUEST_NOT_ACCEPTED 0xC0000000 | 0x00d0 +#define NT_STATUS_REDIRECTOR_PAUSED 0xC0000000 | 0x00d1 +#define NT_STATUS_NET_WRITE_FAULT 0xC0000000 | 0x00d2 +#define NT_STATUS_PROFILING_AT_LIMIT 0xC0000000 | 0x00d3 +#define NT_STATUS_NOT_SAME_DEVICE 0xC0000000 | 0x00d4 +#define NT_STATUS_FILE_RENAMED 0xC0000000 | 0x00d5 +#define NT_STATUS_VIRTUAL_CIRCUIT_CLOSED 0xC0000000 | 0x00d6 +#define 
NT_STATUS_NO_SECURITY_ON_OBJECT 0xC0000000 | 0x00d7 +#define NT_STATUS_CANT_WAIT 0xC0000000 | 0x00d8 +#define NT_STATUS_PIPE_EMPTY 0xC0000000 | 0x00d9 +#define NT_STATUS_CANT_ACCESS_DOMAIN_INFO 0xC0000000 | 0x00da +#define NT_STATUS_CANT_TERMINATE_SELF 0xC0000000 | 0x00db +#define NT_STATUS_INVALID_SERVER_STATE 0xC0000000 | 0x00dc +#define NT_STATUS_INVALID_DOMAIN_STATE 0xC0000000 | 0x00dd +#define NT_STATUS_INVALID_DOMAIN_ROLE 0xC0000000 | 0x00de +#define NT_STATUS_NO_SUCH_DOMAIN 0xC0000000 | 0x00df +#define NT_STATUS_DOMAIN_EXISTS 0xC0000000 | 0x00e0 +#define NT_STATUS_DOMAIN_LIMIT_EXCEEDED 0xC0000000 | 0x00e1 +#define NT_STATUS_OPLOCK_NOT_GRANTED 0xC0000000 | 0x00e2 +#define NT_STATUS_INVALID_OPLOCK_PROTOCOL 0xC0000000 | 0x00e3 +#define NT_STATUS_INTERNAL_DB_CORRUPTION 0xC0000000 | 0x00e4 +#define NT_STATUS_INTERNAL_ERROR 0xC0000000 | 0x00e5 +#define NT_STATUS_GENERIC_NOT_MAPPED 0xC0000000 | 0x00e6 +#define NT_STATUS_BAD_DESCRIPTOR_FORMAT 0xC0000000 | 0x00e7 +#define NT_STATUS_INVALID_USER_BUFFER 0xC0000000 | 0x00e8 +#define NT_STATUS_UNEXPECTED_IO_ERROR 0xC0000000 | 0x00e9 +#define NT_STATUS_UNEXPECTED_MM_CREATE_ERR 0xC0000000 | 0x00ea +#define NT_STATUS_UNEXPECTED_MM_MAP_ERROR 0xC0000000 | 0x00eb +#define NT_STATUS_UNEXPECTED_MM_EXTEND_ERR 0xC0000000 | 0x00ec +#define NT_STATUS_NOT_LOGON_PROCESS 0xC0000000 | 0x00ed +#define NT_STATUS_LOGON_SESSION_EXISTS 0xC0000000 | 0x00ee +#define NT_STATUS_INVALID_PARAMETER_1 0xC0000000 | 0x00ef +#define NT_STATUS_INVALID_PARAMETER_2 0xC0000000 | 0x00f0 +#define NT_STATUS_INVALID_PARAMETER_3 0xC0000000 | 0x00f1 +#define NT_STATUS_INVALID_PARAMETER_4 0xC0000000 | 0x00f2 +#define NT_STATUS_INVALID_PARAMETER_5 0xC0000000 | 0x00f3 +#define NT_STATUS_INVALID_PARAMETER_6 0xC0000000 | 0x00f4 +#define NT_STATUS_INVALID_PARAMETER_7 0xC0000000 | 0x00f5 +#define NT_STATUS_INVALID_PARAMETER_8 0xC0000000 | 0x00f6 +#define NT_STATUS_INVALID_PARAMETER_9 0xC0000000 | 0x00f7 +#define NT_STATUS_INVALID_PARAMETER_10 0xC0000000 | 0x00f8 +#define NT_STATUS_INVALID_PARAMETER_11 0xC0000000 | 0x00f9 +#define NT_STATUS_INVALID_PARAMETER_12 0xC0000000 | 0x00fa +#define NT_STATUS_REDIRECTOR_NOT_STARTED 0xC0000000 | 0x00fb +#define NT_STATUS_REDIRECTOR_STARTED 0xC0000000 | 0x00fc +#define NT_STATUS_STACK_OVERFLOW 0xC0000000 | 0x00fd +#define NT_STATUS_NO_SUCH_PACKAGE 0xC0000000 | 0x00fe +#define NT_STATUS_BAD_FUNCTION_TABLE 0xC0000000 | 0x00ff +#define NT_STATUS_DIRECTORY_NOT_EMPTY 0xC0000000 | 0x0101 +#define NT_STATUS_FILE_CORRUPT_ERROR 0xC0000000 | 0x0102 +#define NT_STATUS_NOT_A_DIRECTORY 0xC0000000 | 0x0103 +#define NT_STATUS_BAD_LOGON_SESSION_STATE 0xC0000000 | 0x0104 +#define NT_STATUS_LOGON_SESSION_COLLISION 0xC0000000 | 0x0105 +#define NT_STATUS_NAME_TOO_LONG 0xC0000000 | 0x0106 +#define NT_STATUS_FILES_OPEN 0xC0000000 | 0x0107 +#define NT_STATUS_CONNECTION_IN_USE 0xC0000000 | 0x0108 +#define NT_STATUS_MESSAGE_NOT_FOUND 0xC0000000 | 0x0109 +#define NT_STATUS_PROCESS_IS_TERMINATING 0xC0000000 | 0x010a +#define NT_STATUS_INVALID_LOGON_TYPE 0xC0000000 | 0x010b +#define NT_STATUS_NO_GUID_TRANSLATION 0xC0000000 | 0x010c +#define NT_STATUS_CANNOT_IMPERSONATE 0xC0000000 | 0x010d +#define NT_STATUS_IMAGE_ALREADY_LOADED 0xC0000000 | 0x010e +#define NT_STATUS_ABIOS_NOT_PRESENT 0xC0000000 | 0x010f +#define NT_STATUS_ABIOS_LID_NOT_EXIST 0xC0000000 | 0x0110 +#define NT_STATUS_ABIOS_LID_ALREADY_OWNED 0xC0000000 | 0x0111 +#define NT_STATUS_ABIOS_NOT_LID_OWNER 0xC0000000 | 0x0112 +#define NT_STATUS_ABIOS_INVALID_COMMAND 0xC0000000 | 0x0113 +#define NT_STATUS_ABIOS_INVALID_LID 
0xC0000000 | 0x0114 +#define NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE 0xC0000000 | 0x0115 +#define NT_STATUS_ABIOS_INVALID_SELECTOR 0xC0000000 | 0x0116 +#define NT_STATUS_NO_LDT 0xC0000000 | 0x0117 +#define NT_STATUS_INVALID_LDT_SIZE 0xC0000000 | 0x0118 +#define NT_STATUS_INVALID_LDT_OFFSET 0xC0000000 | 0x0119 +#define NT_STATUS_INVALID_LDT_DESCRIPTOR 0xC0000000 | 0x011a +#define NT_STATUS_INVALID_IMAGE_NE_FORMAT 0xC0000000 | 0x011b +#define NT_STATUS_RXACT_INVALID_STATE 0xC0000000 | 0x011c +#define NT_STATUS_RXACT_COMMIT_FAILURE 0xC0000000 | 0x011d +#define NT_STATUS_MAPPED_FILE_SIZE_ZERO 0xC0000000 | 0x011e +#define NT_STATUS_TOO_MANY_OPENED_FILES 0xC0000000 | 0x011f +#define NT_STATUS_CANCELLED 0xC0000000 | 0x0120 +#define NT_STATUS_CANNOT_DELETE 0xC0000000 | 0x0121 +#define NT_STATUS_INVALID_COMPUTER_NAME 0xC0000000 | 0x0122 +#define NT_STATUS_FILE_DELETED 0xC0000000 | 0x0123 +#define NT_STATUS_SPECIAL_ACCOUNT 0xC0000000 | 0x0124 +#define NT_STATUS_SPECIAL_GROUP 0xC0000000 | 0x0125 +#define NT_STATUS_SPECIAL_USER 0xC0000000 | 0x0126 +#define NT_STATUS_MEMBERS_PRIMARY_GROUP 0xC0000000 | 0x0127 +#define NT_STATUS_FILE_CLOSED 0xC0000000 | 0x0128 +#define NT_STATUS_TOO_MANY_THREADS 0xC0000000 | 0x0129 +#define NT_STATUS_THREAD_NOT_IN_PROCESS 0xC0000000 | 0x012a +#define NT_STATUS_TOKEN_ALREADY_IN_USE 0xC0000000 | 0x012b +#define NT_STATUS_PAGEFILE_QUOTA_EXCEEDED 0xC0000000 | 0x012c +#define NT_STATUS_COMMITMENT_LIMIT 0xC0000000 | 0x012d +#define NT_STATUS_INVALID_IMAGE_LE_FORMAT 0xC0000000 | 0x012e +#define NT_STATUS_INVALID_IMAGE_NOT_MZ 0xC0000000 | 0x012f +#define NT_STATUS_INVALID_IMAGE_PROTECT 0xC0000000 | 0x0130 +#define NT_STATUS_INVALID_IMAGE_WIN_16 0xC0000000 | 0x0131 +#define NT_STATUS_LOGON_SERVER_CONFLICT 0xC0000000 | 0x0132 +#define NT_STATUS_TIME_DIFFERENCE_AT_DC 0xC0000000 | 0x0133 +#define NT_STATUS_SYNCHRONIZATION_REQUIRED 0xC0000000 | 0x0134 +#define NT_STATUS_DLL_NOT_FOUND 0xC0000000 | 0x0135 +#define NT_STATUS_OPEN_FAILED 0xC0000000 | 0x0136 +#define NT_STATUS_IO_PRIVILEGE_FAILED 0xC0000000 | 0x0137 +#define NT_STATUS_ORDINAL_NOT_FOUND 0xC0000000 | 0x0138 +#define NT_STATUS_ENTRYPOINT_NOT_FOUND 0xC0000000 | 0x0139 +#define NT_STATUS_CONTROL_C_EXIT 0xC0000000 | 0x013a +#define NT_STATUS_LOCAL_DISCONNECT 0xC0000000 | 0x013b +#define NT_STATUS_REMOTE_DISCONNECT 0xC0000000 | 0x013c +#define NT_STATUS_REMOTE_RESOURCES 0xC0000000 | 0x013d +#define NT_STATUS_LINK_FAILED 0xC0000000 | 0x013e +#define NT_STATUS_LINK_TIMEOUT 0xC0000000 | 0x013f +#define NT_STATUS_INVALID_CONNECTION 0xC0000000 | 0x0140 +#define NT_STATUS_INVALID_ADDRESS 0xC0000000 | 0x0141 +#define NT_STATUS_DLL_INIT_FAILED 0xC0000000 | 0x0142 +#define NT_STATUS_MISSING_SYSTEMFILE 0xC0000000 | 0x0143 +#define NT_STATUS_UNHANDLED_EXCEPTION 0xC0000000 | 0x0144 +#define NT_STATUS_APP_INIT_FAILURE 0xC0000000 | 0x0145 +#define NT_STATUS_PAGEFILE_CREATE_FAILED 0xC0000000 | 0x0146 +#define NT_STATUS_NO_PAGEFILE 0xC0000000 | 0x0147 +#define NT_STATUS_INVALID_LEVEL 0xC0000000 | 0x0148 +#define NT_STATUS_WRONG_PASSWORD_CORE 0xC0000000 | 0x0149 +#define NT_STATUS_ILLEGAL_FLOAT_CONTEXT 0xC0000000 | 0x014a +#define NT_STATUS_PIPE_BROKEN 0xC0000000 | 0x014b +#define NT_STATUS_REGISTRY_CORRUPT 0xC0000000 | 0x014c +#define NT_STATUS_REGISTRY_IO_FAILED 0xC0000000 | 0x014d +#define NT_STATUS_NO_EVENT_PAIR 0xC0000000 | 0x014e +#define NT_STATUS_UNRECOGNIZED_VOLUME 0xC0000000 | 0x014f +#define NT_STATUS_SERIAL_NO_DEVICE_INITED 0xC0000000 | 0x0150 +#define NT_STATUS_NO_SUCH_ALIAS 0xC0000000 | 0x0151 +#define NT_STATUS_MEMBER_NOT_IN_ALIAS 
0xC0000000 | 0x0152 +#define NT_STATUS_MEMBER_IN_ALIAS 0xC0000000 | 0x0153 +#define NT_STATUS_ALIAS_EXISTS 0xC0000000 | 0x0154 +#define NT_STATUS_LOGON_NOT_GRANTED 0xC0000000 | 0x0155 +#define NT_STATUS_TOO_MANY_SECRETS 0xC0000000 | 0x0156 +#define NT_STATUS_SECRET_TOO_LONG 0xC0000000 | 0x0157 +#define NT_STATUS_INTERNAL_DB_ERROR 0xC0000000 | 0x0158 +#define NT_STATUS_FULLSCREEN_MODE 0xC0000000 | 0x0159 +#define NT_STATUS_TOO_MANY_CONTEXT_IDS 0xC0000000 | 0x015a +#define NT_STATUS_LOGON_TYPE_NOT_GRANTED 0xC0000000 | 0x015b +#define NT_STATUS_NOT_REGISTRY_FILE 0xC0000000 | 0x015c +#define NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED 0xC0000000 | 0x015d +#define NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR 0xC0000000 | 0x015e +#define NT_STATUS_FT_MISSING_MEMBER 0xC0000000 | 0x015f +#define NT_STATUS_ILL_FORMED_SERVICE_ENTRY 0xC0000000 | 0x0160 +#define NT_STATUS_ILLEGAL_CHARACTER 0xC0000000 | 0x0161 +#define NT_STATUS_UNMAPPABLE_CHARACTER 0xC0000000 | 0x0162 +#define NT_STATUS_UNDEFINED_CHARACTER 0xC0000000 | 0x0163 +#define NT_STATUS_FLOPPY_VOLUME 0xC0000000 | 0x0164 +#define NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND 0xC0000000 | 0x0165 +#define NT_STATUS_FLOPPY_WRONG_CYLINDER 0xC0000000 | 0x0166 +#define NT_STATUS_FLOPPY_UNKNOWN_ERROR 0xC0000000 | 0x0167 +#define NT_STATUS_FLOPPY_BAD_REGISTERS 0xC0000000 | 0x0168 +#define NT_STATUS_DISK_RECALIBRATE_FAILED 0xC0000000 | 0x0169 +#define NT_STATUS_DISK_OPERATION_FAILED 0xC0000000 | 0x016a +#define NT_STATUS_DISK_RESET_FAILED 0xC0000000 | 0x016b +#define NT_STATUS_SHARED_IRQ_BUSY 0xC0000000 | 0x016c +#define NT_STATUS_FT_ORPHANING 0xC0000000 | 0x016d +#define NT_STATUS_PARTITION_FAILURE 0xC0000000 | 0x0172 +#define NT_STATUS_INVALID_BLOCK_LENGTH 0xC0000000 | 0x0173 +#define NT_STATUS_DEVICE_NOT_PARTITIONED 0xC0000000 | 0x0174 +#define NT_STATUS_UNABLE_TO_LOCK_MEDIA 0xC0000000 | 0x0175 +#define NT_STATUS_UNABLE_TO_UNLOAD_MEDIA 0xC0000000 | 0x0176 +#define NT_STATUS_EOM_OVERFLOW 0xC0000000 | 0x0177 +#define NT_STATUS_NO_MEDIA 0xC0000000 | 0x0178 +#define NT_STATUS_NO_SUCH_MEMBER 0xC0000000 | 0x017a +#define NT_STATUS_INVALID_MEMBER 0xC0000000 | 0x017b +#define NT_STATUS_KEY_DELETED 0xC0000000 | 0x017c +#define NT_STATUS_NO_LOG_SPACE 0xC0000000 | 0x017d +#define NT_STATUS_TOO_MANY_SIDS 0xC0000000 | 0x017e +#define NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED 0xC0000000 | 0x017f +#define NT_STATUS_KEY_HAS_CHILDREN 0xC0000000 | 0x0180 +#define NT_STATUS_CHILD_MUST_BE_VOLATILE 0xC0000000 | 0x0181 +#define NT_STATUS_DEVICE_CONFIGURATION_ERROR 0xC0000000 | 0x0182 +#define NT_STATUS_DRIVER_INTERNAL_ERROR 0xC0000000 | 0x0183 +#define NT_STATUS_INVALID_DEVICE_STATE 0xC0000000 | 0x0184 +#define NT_STATUS_IO_DEVICE_ERROR 0xC0000000 | 0x0185 +#define NT_STATUS_DEVICE_PROTOCOL_ERROR 0xC0000000 | 0x0186 +#define NT_STATUS_BACKUP_CONTROLLER 0xC0000000 | 0x0187 +#define NT_STATUS_LOG_FILE_FULL 0xC0000000 | 0x0188 +#define NT_STATUS_TOO_LATE 0xC0000000 | 0x0189 +#define NT_STATUS_NO_TRUST_LSA_SECRET 0xC0000000 | 0x018a +#define NT_STATUS_NO_TRUST_SAM_ACCOUNT 0xC0000000 | 0x018b +#define NT_STATUS_TRUSTED_DOMAIN_FAILURE 0xC0000000 | 0x018c +#define NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE 0xC0000000 | 0x018d +#define NT_STATUS_EVENTLOG_FILE_CORRUPT 0xC0000000 | 0x018e +#define NT_STATUS_EVENTLOG_CANT_START 0xC0000000 | 0x018f +#define NT_STATUS_TRUST_FAILURE 0xC0000000 | 0x0190 +#define NT_STATUS_MUTANT_LIMIT_EXCEEDED 0xC0000000 | 0x0191 +#define NT_STATUS_NETLOGON_NOT_STARTED 0xC0000000 | 0x0192 +#define NT_STATUS_ACCOUNT_EXPIRED 0xC0000000 | 0x0193 +#define NT_STATUS_POSSIBLE_DEADLOCK 
0xC0000000 | 0x0194 +#define NT_STATUS_NETWORK_CREDENTIAL_CONFLICT 0xC0000000 | 0x0195 +#define NT_STATUS_REMOTE_SESSION_LIMIT 0xC0000000 | 0x0196 +#define NT_STATUS_EVENTLOG_FILE_CHANGED 0xC0000000 | 0x0197 +#define NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT 0xC0000000 | 0x0198 +#define NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT 0xC0000000 | 0x0199 +#define NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT 0xC0000000 | 0x019a +#define NT_STATUS_DOMAIN_TRUST_INCONSISTENT 0xC0000000 | 0x019b +#define NT_STATUS_FS_DRIVER_REQUIRED 0xC0000000 | 0x019c +#define NT_STATUS_NO_USER_SESSION_KEY 0xC0000000 | 0x0202 +#define NT_STATUS_USER_SESSION_DELETED 0xC0000000 | 0x0203 +#define NT_STATUS_RESOURCE_LANG_NOT_FOUND 0xC0000000 | 0x0204 +#define NT_STATUS_INSUFF_SERVER_RESOURCES 0xC0000000 | 0x0205 +#define NT_STATUS_INVALID_BUFFER_SIZE 0xC0000000 | 0x0206 +#define NT_STATUS_INVALID_ADDRESS_COMPONENT 0xC0000000 | 0x0207 +#define NT_STATUS_INVALID_ADDRESS_WILDCARD 0xC0000000 | 0x0208 +#define NT_STATUS_TOO_MANY_ADDRESSES 0xC0000000 | 0x0209 +#define NT_STATUS_ADDRESS_ALREADY_EXISTS 0xC0000000 | 0x020a +#define NT_STATUS_ADDRESS_CLOSED 0xC0000000 | 0x020b +#define NT_STATUS_CONNECTION_DISCONNECTED 0xC0000000 | 0x020c +#define NT_STATUS_CONNECTION_RESET 0xC0000000 | 0x020d +#define NT_STATUS_TOO_MANY_NODES 0xC0000000 | 0x020e +#define NT_STATUS_TRANSACTION_ABORTED 0xC0000000 | 0x020f +#define NT_STATUS_TRANSACTION_TIMED_OUT 0xC0000000 | 0x0210 +#define NT_STATUS_TRANSACTION_NO_RELEASE 0xC0000000 | 0x0211 +#define NT_STATUS_TRANSACTION_NO_MATCH 0xC0000000 | 0x0212 +#define NT_STATUS_TRANSACTION_RESPONDED 0xC0000000 | 0x0213 +#define NT_STATUS_TRANSACTION_INVALID_ID 0xC0000000 | 0x0214 +#define NT_STATUS_TRANSACTION_INVALID_TYPE 0xC0000000 | 0x0215 +#define NT_STATUS_NOT_SERVER_SESSION 0xC0000000 | 0x0216 +#define NT_STATUS_NOT_CLIENT_SESSION 0xC0000000 | 0x0217 +#define NT_STATUS_CANNOT_LOAD_REGISTRY_FILE 0xC0000000 | 0x0218 +#define NT_STATUS_DEBUG_ATTACH_FAILED 0xC0000000 | 0x0219 +#define NT_STATUS_SYSTEM_PROCESS_TERMINATED 0xC0000000 | 0x021a +#define NT_STATUS_DATA_NOT_ACCEPTED 0xC0000000 | 0x021b +#define NT_STATUS_NO_BROWSER_SERVERS_FOUND 0xC0000000 | 0x021c +#define NT_STATUS_VDM_HARD_ERROR 0xC0000000 | 0x021d +#define NT_STATUS_DRIVER_CANCEL_TIMEOUT 0xC0000000 | 0x021e +#define NT_STATUS_REPLY_MESSAGE_MISMATCH 0xC0000000 | 0x021f +#define NT_STATUS_MAPPED_ALIGNMENT 0xC0000000 | 0x0220 +#define NT_STATUS_IMAGE_CHECKSUM_MISMATCH 0xC0000000 | 0x0221 +#define NT_STATUS_LOST_WRITEBEHIND_DATA 0xC0000000 | 0x0222 +#define NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID 0xC0000000 | 0x0223 +#define NT_STATUS_PASSWORD_MUST_CHANGE 0xC0000000 | 0x0224 +#define NT_STATUS_NOT_FOUND 0xC0000000 | 0x0225 +#define NT_STATUS_NOT_TINY_STREAM 0xC0000000 | 0x0226 +#define NT_STATUS_RECOVERY_FAILURE 0xC0000000 | 0x0227 +#define NT_STATUS_STACK_OVERFLOW_READ 0xC0000000 | 0x0228 +#define NT_STATUS_FAIL_CHECK 0xC0000000 | 0x0229 +#define NT_STATUS_DUPLICATE_OBJECTID 0xC0000000 | 0x022a +#define NT_STATUS_OBJECTID_EXISTS 0xC0000000 | 0x022b +#define NT_STATUS_CONVERT_TO_LARGE 0xC0000000 | 0x022c +#define NT_STATUS_RETRY 0xC0000000 | 0x022d +#define NT_STATUS_FOUND_OUT_OF_SCOPE 0xC0000000 | 0x022e +#define NT_STATUS_ALLOCATE_BUCKET 0xC0000000 | 0x022f +#define NT_STATUS_PROPSET_NOT_FOUND 0xC0000000 | 0x0230 +#define NT_STATUS_MARSHALL_OVERFLOW 0xC0000000 | 0x0231 +#define NT_STATUS_INVALID_VARIANT 0xC0000000 | 0x0232 +#define NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND 0xC0000000 | 0x0233 +#define NT_STATUS_ACCOUNT_LOCKED_OUT 0xC0000000 | 
0x0234 +#define NT_STATUS_HANDLE_NOT_CLOSABLE 0xC0000000 | 0x0235 +#define NT_STATUS_CONNECTION_REFUSED 0xC0000000 | 0x0236 +#define NT_STATUS_GRACEFUL_DISCONNECT 0xC0000000 | 0x0237 +#define NT_STATUS_ADDRESS_ALREADY_ASSOCIATED 0xC0000000 | 0x0238 +#define NT_STATUS_ADDRESS_NOT_ASSOCIATED 0xC0000000 | 0x0239 +#define NT_STATUS_CONNECTION_INVALID 0xC0000000 | 0x023a +#define NT_STATUS_CONNECTION_ACTIVE 0xC0000000 | 0x023b +#define NT_STATUS_NETWORK_UNREACHABLE 0xC0000000 | 0x023c +#define NT_STATUS_HOST_UNREACHABLE 0xC0000000 | 0x023d +#define NT_STATUS_PROTOCOL_UNREACHABLE 0xC0000000 | 0x023e +#define NT_STATUS_PORT_UNREACHABLE 0xC0000000 | 0x023f +#define NT_STATUS_REQUEST_ABORTED 0xC0000000 | 0x0240 +#define NT_STATUS_CONNECTION_ABORTED 0xC0000000 | 0x0241 +#define NT_STATUS_BAD_COMPRESSION_BUFFER 0xC0000000 | 0x0242 +#define NT_STATUS_USER_MAPPED_FILE 0xC0000000 | 0x0243 +#define NT_STATUS_AUDIT_FAILED 0xC0000000 | 0x0244 +#define NT_STATUS_TIMER_RESOLUTION_NOT_SET 0xC0000000 | 0x0245 +#define NT_STATUS_CONNECTION_COUNT_LIMIT 0xC0000000 | 0x0246 +#define NT_STATUS_LOGIN_TIME_RESTRICTION 0xC0000000 | 0x0247 +#define NT_STATUS_LOGIN_WKSTA_RESTRICTION 0xC0000000 | 0x0248 +#define NT_STATUS_IMAGE_MP_UP_MISMATCH 0xC0000000 | 0x0249 +#define NT_STATUS_INSUFFICIENT_LOGON_INFO 0xC0000000 | 0x0250 +#define NT_STATUS_BAD_DLL_ENTRYPOINT 0xC0000000 | 0x0251 +#define NT_STATUS_BAD_SERVICE_ENTRYPOINT 0xC0000000 | 0x0252 +#define NT_STATUS_LPC_REPLY_LOST 0xC0000000 | 0x0253 +#define NT_STATUS_IP_ADDRESS_CONFLICT1 0xC0000000 | 0x0254 +#define NT_STATUS_IP_ADDRESS_CONFLICT2 0xC0000000 | 0x0255 +#define NT_STATUS_REGISTRY_QUOTA_LIMIT 0xC0000000 | 0x0256 +#define NT_STATUS_PATH_NOT_COVERED 0xC0000000 | 0x0257 +#define NT_STATUS_NO_CALLBACK_ACTIVE 0xC0000000 | 0x0258 +#define NT_STATUS_LICENSE_QUOTA_EXCEEDED 0xC0000000 | 0x0259 +#define NT_STATUS_PWD_TOO_SHORT 0xC0000000 | 0x025a +#define NT_STATUS_PWD_TOO_RECENT 0xC0000000 | 0x025b +#define NT_STATUS_PWD_HISTORY_CONFLICT 0xC0000000 | 0x025c +#define NT_STATUS_PLUGPLAY_NO_DEVICE 0xC0000000 | 0x025e +#define NT_STATUS_UNSUPPORTED_COMPRESSION 0xC0000000 | 0x025f +#define NT_STATUS_INVALID_HW_PROFILE 0xC0000000 | 0x0260 +#define NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH 0xC0000000 | 0x0261 +#define NT_STATUS_DRIVER_ORDINAL_NOT_FOUND 0xC0000000 | 0x0262 +#define NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND 0xC0000000 | 0x0263 +#define NT_STATUS_RESOURCE_NOT_OWNED 0xC0000000 | 0x0264 +#define NT_STATUS_TOO_MANY_LINKS 0xC0000000 | 0x0265 +#define NT_STATUS_QUOTA_LIST_INCONSISTENT 0xC0000000 | 0x0266 +#define NT_STATUS_FILE_IS_OFFLINE 0xC0000000 | 0x0267 +#define NT_STATUS_NO_SUCH_JOB 0xC0000000 | 0xEDE /* scheduler */ + +#endif /* _NTERR_H */ diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h new file mode 100644 index 00000000..5d52e4a3 --- /dev/null +++ b/fs/cifs/ntlmssp.h @@ -0,0 +1,128 @@ +/* + * fs/cifs/ntlmssp.h + * + * Copyright (c) International Business Machines Corp., 2002,2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define NTLMSSP_SIGNATURE "NTLMSSP" +/* Message Types */ +#define NtLmNegotiate cpu_to_le32(1) +#define NtLmChallenge cpu_to_le32(2) +#define NtLmAuthenticate cpu_to_le32(3) +#define UnknownMessage cpu_to_le32(8) + +/* Negotiate Flags */ +#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are unicode */ +#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */ +#define NTLMSSP_REQUEST_TARGET 0x04 /* Srv returns its auth realm */ +/* define reserved9 0x08 */ +#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signing capability */ +#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */ +#define NTLMSSP_NEGOTIATE_DGRAM 0x0040 +#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Use LM session key */ +/* defined reserved 8 0x0100 */ +#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */ +#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400 /* Lanman not allowed */ +#define NTLMSSP_ANONYMOUS 0x0800 +#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */ +#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000 +#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server same machine */ +#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign. All security levels */ +#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000 +#define NTLMSSP_TARGET_TYPE_SERVER 0x20000 +#define NTLMSSP_TARGET_TYPE_SHARE 0x40000 +#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/ +/* #define NTLMSSP_REQUEST_INIT_RESP 0x100000 */ +#define NTLMSSP_NEGOTIATE_IDENTIFY 0x100000 +#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 /* reserved5 */ +#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000 +#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 +/* #define reserved4 0x1000000 */ +#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */ +/* #define reserved3 0x4000000 */ +/* #define reserved2 0x8000000 */ +/* #define reserved1 0x10000000 */ +#define NTLMSSP_NEGOTIATE_128 0x20000000 +#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 +#define NTLMSSP_NEGOTIATE_56 0x80000000 + +/* Define AV Pair Field IDs */ +enum av_field_type { + NTLMSSP_AV_EOL = 0, + NTLMSSP_AV_NB_COMPUTER_NAME, + NTLMSSP_AV_NB_DOMAIN_NAME, + NTLMSSP_AV_DNS_COMPUTER_NAME, + NTLMSSP_AV_DNS_DOMAIN_NAME, + NTLMSSP_AV_DNS_TREE_NAME, + NTLMSSP_AV_FLAGS, + NTLMSSP_AV_TIMESTAMP, + NTLMSSP_AV_RESTRICTION, + NTLMSSP_AV_TARGET_NAME, + NTLMSSP_AV_CHANNEL_BINDINGS +}; + +/* Although typedefs are not commonly used for structure definitions */ +/* in the Linux kernel, in this particular case they are useful */ +/* to more closely match the standards document for NTLMSSP from */ +/* OpenGroup and to make the code more closely match the standard in */ +/* appearance */ + +typedef struct _SECURITY_BUFFER { + __le16 Length; + __le16 MaximumLength; + __le32 BufferOffset; /* offset to buffer */ +} __attribute__((packed)) SECURITY_BUFFER; + +typedef struct _NEGOTIATE_MESSAGE { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmNegotiate = 1 */ + __le32 NegotiateFlags; + SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ + SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ + char DomainString[0]; + /* followed by WorkstationString */ +} __attribute__((packed)) NEGOTIATE_MESSAGE, 
*PNEGOTIATE_MESSAGE; + +typedef struct _CHALLENGE_MESSAGE { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmChallenge = 2 */ + SECURITY_BUFFER TargetName; + __le32 NegotiateFlags; + __u8 Challenge[CIFS_CRYPTO_KEY_SIZE]; + __u8 Reserved[8]; + SECURITY_BUFFER TargetInfoArray; + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ +} __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE; + +typedef struct _AUTHENTICATE_MESSAGE { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmsAuthenticate = 3 */ + SECURITY_BUFFER LmChallengeResponse; + SECURITY_BUFFER NtChallengeResponse; + SECURITY_BUFFER DomainName; + SECURITY_BUFFER UserName; + SECURITY_BUFFER WorkstationName; + SECURITY_BUFFER SessionKey; + __le32 NegotiateFlags; + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ + char UserString[0]; +} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c new file mode 100644 index 00000000..6751e745 --- /dev/null +++ b/fs/cifs/readdir.c @@ -0,0 +1,887 @@ +/* + * fs/cifs/readdir.c + * + * Directory search handling + * + * Copyright (C) International Business Machines Corp., 2004, 2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifsfs.h" + +/* + * To be safe - for UCS to UTF-8 with strings loaded with the rare long + * characters alloc more to account for such multibyte target UTF-8 + * characters. + */ +#define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2) + +#ifdef CONFIG_CIFS_DEBUG2 +static void dump_cifs_file_struct(struct file *file, char *label) +{ + struct cifsFileInfo *cf; + + if (file) { + cf = file->private_data; + if (cf == NULL) { + cFYI(1, "empty cifs private file data"); + return; + } + if (cf->invalidHandle) + cFYI(1, "invalid handle"); + if (cf->srch_inf.endOfSearch) + cFYI(1, "end of search"); + if (cf->srch_inf.emptyDir) + cFYI(1, "empty dir"); + } +} +#else +static inline void dump_cifs_file_struct(struct file *file, char *label) +{ +} +#endif /* DEBUG2 */ + +/* + * Find the dentry that matches "name". If there isn't one, create one. If it's + * a negative dentry or the uniqueid changed, then drop it and recreate it. 
+ */ +static struct dentry * +cifs_readdir_lookup(struct dentry *parent, struct qstr *name, + struct cifs_fattr *fattr) +{ + struct dentry *dentry, *alias; + struct inode *inode; + struct super_block *sb = parent->d_inode->i_sb; + + cFYI(1, "For %s", name->name); + + if (parent->d_op && parent->d_op->d_hash) + parent->d_op->d_hash(parent, parent->d_inode, name); + else + name->hash = full_name_hash(name->name, name->len); + + dentry = d_lookup(parent, name); + if (dentry) { + /* FIXME: check for inode number changes? */ + if (dentry->d_inode != NULL) + return dentry; + d_drop(dentry); + dput(dentry); + } + + dentry = d_alloc(parent, name); + if (dentry == NULL) + return NULL; + + inode = cifs_iget(sb, fattr); + if (!inode) { + dput(dentry); + return NULL; + } + + alias = d_materialise_unique(dentry, inode); + if (alias != NULL) { + dput(dentry); + if (IS_ERR(alias)) + return NULL; + dentry = alias; + } + + return dentry; +} + +static void +cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) +{ + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; + + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; + fattr->cf_dtype = DT_DIR; + } else { + fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_REG; + } + + if (fattr->cf_cifsattrs & ATTR_READONLY) + fattr->cf_mode &= ~S_IWUGO; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && + fattr->cf_cifsattrs & ATTR_SYSTEM) { + if (fattr->cf_eof == 0) { + fattr->cf_mode &= ~S_IFMT; + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; + } else { + /* + * trying to get the type and mode via SFU can be slow, + * so just call those regular files for now, and mark + * for reval + */ + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + } + } +} + +static void +cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + + cifs_fill_common_info(fattr, cifs_sb); +} + +static void +cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj; + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate, + info->LastAccessTime, offset); + fattr->cf_ctime = cnvrtDosUnixTm(info->LastWriteDate, + info->LastWriteTime, offset); + fattr->cf_mtime = cnvrtDosUnixTm(info->LastWriteDate, + info->LastWriteTime, offset); + + fattr->cf_cifsattrs = le16_to_cpu(info->Attributes); + fattr->cf_bytes = le32_to_cpu(info->AllocationSize); + fattr->cf_eof = le32_to_cpu(info->DataSize); + + cifs_fill_common_info(fattr, cifs_sb); +} + +/* BB eventually need to add the following helper function to + resolve NT_STATUS_STOPPED_ON_SYMLINK return code when + we try to do FindFirst on (NTFS) directory symlinks */ +/* +int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, + int xid) +{ + __u16 fid; + int len; + int oplock = 0; + int rc; + struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb); + char *tmpbuffer; + + rc = 
CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, + OPEN_REPARSE_POINT, &fid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (!rc) { + tmpbuffer = kmalloc(maxpath); + rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path, + tmpbuffer, + maxpath -1, + fid, + cifs_sb->local_nls); + if (CIFSSMBClose(xid, ptcon, fid)) { + cFYI(1, "Error closing temporary reparsepoint open"); + } + } +} + */ + +static int initiate_cifs_search(const int xid, struct file *file) +{ + int rc = 0; + char *full_path = NULL; + struct cifsFileInfo *cifsFile; + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + struct tcon_link *tlink = NULL; + struct cifs_tcon *pTcon; + + if (file->private_data == NULL) { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (cifsFile == NULL) { + rc = -ENOMEM; + goto error_exit; + } + file->private_data = cifsFile; + cifsFile->tlink = cifs_get_tlink(tlink); + pTcon = tlink_tcon(tlink); + } else { + cifsFile = file->private_data; + pTcon = tlink_tcon(cifsFile->tlink); + } + + cifsFile->invalidHandle = true; + cifsFile->srch_inf.endOfSearch = false; + + full_path = build_path_from_dentry(file->f_path.dentry); + if (full_path == NULL) { + rc = -ENOMEM; + goto error_exit; + } + + cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos); + +ffirst_retry: + /* test for Unix extensions */ + /* but now check for them on the share/mount not on the SMB session */ +/* if (pTcon->ses->capabilities & CAP_UNIX) { */ + if (pTcon->unix_ext) + cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX; + else if ((pTcon->ses->capabilities & + (CAP_NT_SMBS | CAP_NT_FIND)) == 0) { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + } else /* not srvinos - BB fixme add check for backlevel? 
*/ { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO; + } + + rc = CIFSFindFirst(xid, pTcon, full_path, cifs_sb->local_nls, + &cifsFile->netfid, &cifsFile->srch_inf, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb)); + if (rc == 0) + cifsFile->invalidHandle = false; + /* BB add following call to handle readdir on new NTFS symlink errors + else if STATUS_STOPPED_ON_SYMLINK + call get_symlink_reparse_path and retry with new path */ + else if ((rc == -EOPNOTSUPP) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + goto ffirst_retry; + } +error_exit: + kfree(full_path); + cifs_put_tlink(tlink); + return rc; +} + +/* return length of unicode string in bytes */ +static int cifs_unicode_bytelen(char *str) +{ + int len; + __le16 *ustr = (__le16 *)str; + + for (len = 0; len <= PATH_MAX; len++) { + if (ustr[len] == 0) + return len << 1; + } + cFYI(1, "Unicode string longer than PATH_MAX found"); + return len << 1; +} + +static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) +{ + char *new_entry; + FILE_DIRECTORY_INFO *pDirInfo = (FILE_DIRECTORY_INFO *)old_entry; + + if (level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pfData; + pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo; + + new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + + pfData->FileNameLength; + } else + new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); + cFYI(1, "new entry %p old entry %p", new_entry, old_entry); + /* validate that new_entry is not past end of SMB */ + if (new_entry >= end_of_smb) { + cERROR(1, "search entry %p began after end of SMB %p old entry %p", + new_entry, end_of_smb, old_entry); + return NULL; + } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && + (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) + || ((level != SMB_FIND_FILE_INFO_STANDARD) && + (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { + cERROR(1, "search entry %p extends after end of SMB %p", + new_entry, end_of_smb); + return NULL; + } else + return new_entry; + +} + +#define UNICODE_DOT cpu_to_le16(0x2e) + +/* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */ +static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) +{ + int rc = 0; + char *filename = NULL; + int len = 0; + + if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) { + FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + filename = &pFindData->FileName[0]; + if (cfile->srch_inf.unicode) { + len = cifs_unicode_bytelen(filename); + } else { + /* BB should we make this strnlen of PATH_MAX? 
*/ + len = strnlen(filename, 5); + } + } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) { + FILE_DIRECTORY_INFO *pFindData = + (FILE_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (cfile->srch_inf.info_level == + SMB_FIND_FILE_FULL_DIRECTORY_INFO) { + FILE_FULL_DIRECTORY_INFO *pFindData = + (FILE_FULL_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (cfile->srch_inf.info_level == + SMB_FIND_FILE_ID_FULL_DIR_INFO) { + SEARCH_ID_FULL_DIR_INFO *pFindData = + (SEARCH_ID_FULL_DIR_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (cfile->srch_inf.info_level == + SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { + FILE_BOTH_DIRECTORY_INFO *pFindData = + (FILE_BOTH_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = pFindData->FileNameLength; + } else { + cFYI(1, "Unknown findfirst level %d", + cfile->srch_inf.info_level); + } + + if (filename) { + if (cfile->srch_inf.unicode) { + __le16 *ufilename = (__le16 *)filename; + if (len == 2) { + /* check for . */ + if (ufilename[0] == UNICODE_DOT) + rc = 1; + } else if (len == 4) { + /* check for .. */ + if ((ufilename[0] == UNICODE_DOT) + && (ufilename[1] == UNICODE_DOT)) + rc = 2; + } + } else /* ASCII */ { + if (len == 1) { + if (filename[0] == '.') + rc = 1; + } else if (len == 2) { + if ((filename[0] == '.') && (filename[1] == '.')) + rc = 2; + } + } + } + + return rc; +} + +/* Check if directory that we are searching has changed so we can decide + whether we can use the cached search results from the previous search */ +static int is_dir_changed(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct cifsInodeInfo *cifsInfo = CIFS_I(inode); + + if (cifsInfo->time == 0) + return 1; /* directory was changed, perhaps due to unlink */ + else + return 0; + +} + +static int cifs_save_resume_key(const char *current_entry, + struct cifsFileInfo *cifsFile) +{ + int rc = 0; + unsigned int len = 0; + __u16 level; + char *filename; + + if ((cifsFile == NULL) || (current_entry == NULL)) + return -EINVAL; + + level = cifsFile->srch_inf.info_level; + + if (level == SMB_FIND_FILE_UNIX) { + FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + + filename = &pFindData->FileName[0]; + if (cifsFile->srch_inf.unicode) { + len = cifs_unicode_bytelen(filename); + } else { + /* BB should we make this strnlen of PATH_MAX? 
*/ + len = strnlen(filename, PATH_MAX); + } + cifsFile->srch_inf.resume_key = pFindData->ResumeKey; + } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { + FILE_DIRECTORY_INFO *pFindData = + (FILE_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { + FILE_FULL_DIRECTORY_INFO *pFindData = + (FILE_FULL_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { + SEARCH_ID_FULL_DIR_INFO *pFindData = + (SEARCH_ID_FULL_DIR_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { + FILE_BOTH_DIRECTORY_INFO *pFindData = + (FILE_BOTH_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + /* one byte length, no name conversion */ + len = (unsigned int)pFindData->FileNameLength; + cifsFile->srch_inf.resume_key = pFindData->ResumeKey; + } else { + cFYI(1, "Unknown findfirst level %d", level); + return -EINVAL; + } + cifsFile->srch_inf.resume_name_len = len; + cifsFile->srch_inf.presume_name = filename; + return rc; +} + +/* find the corresponding entry in the search */ +/* Note that the SMB server returns search entries for . and .. which + complicates logic here if we choose to parse for them and we do not + assume that they are located in the findfirst return buffer.*/ +/* We start counting in the buffer with entry 2 and increment for every + entry (do not increment for . or .. entry) */ +static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, + struct file *file, char **ppCurrentEntry, int *num_to_ret) +{ + int rc = 0; + int pos_in_buf = 0; + loff_t first_entry_in_buffer; + loff_t index_to_find = file->f_pos; + struct cifsFileInfo *cifsFile = file->private_data; + /* check if index in the buffer */ + + if ((cifsFile == NULL) || (ppCurrentEntry == NULL) || + (num_to_ret == NULL)) + return -ENOENT; + + *ppCurrentEntry = NULL; + first_entry_in_buffer = + cifsFile->srch_inf.index_of_last_entry - + cifsFile->srch_inf.entries_in_buffer; + + /* if first entry in buf is zero then is first buffer + in search response data which means it is likely . and .. + will be in this buffer, although some servers do not return + . and .. 
for the root of a drive and for those we need + to start two entries earlier */ + + dump_cifs_file_struct(file, "In fce "); + if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) && + is_dir_changed(file)) || + (index_to_find < first_entry_in_buffer)) { + /* close and restart search */ + cFYI(1, "search backing up - close and restart search"); + spin_lock(&cifs_file_list_lock); + if (!cifsFile->srch_inf.endOfSearch && + !cifsFile->invalidHandle) { + cifsFile->invalidHandle = true; + spin_unlock(&cifs_file_list_lock); + CIFSFindClose(xid, pTcon, cifsFile->netfid); + } else + spin_unlock(&cifs_file_list_lock); + if (cifsFile->srch_inf.ntwrk_buf_start) { + cFYI(1, "freeing SMB ff cache buf on search rewind"); + if (cifsFile->srch_inf.smallBuf) + cifs_small_buf_release(cifsFile->srch_inf. + ntwrk_buf_start); + else + cifs_buf_release(cifsFile->srch_inf. + ntwrk_buf_start); + cifsFile->srch_inf.ntwrk_buf_start = NULL; + } + rc = initiate_cifs_search(xid, file); + if (rc) { + cFYI(1, "error %d reinitiating a search on rewind", + rc); + return rc; + } + cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + } + + while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && + (rc == 0) && !cifsFile->srch_inf.endOfSearch) { + cFYI(1, "calling findnext2"); + rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, + &cifsFile->srch_inf); + cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + if (rc) + return -ENOENT; + } + if (index_to_find < cifsFile->srch_inf.index_of_last_entry) { + /* we found the buffer that contains the entry */ + /* scan and find it */ + int i; + char *current_entry; + char *end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + + smbCalcSize((struct smb_hdr *) + cifsFile->srch_inf.ntwrk_buf_start); + + current_entry = cifsFile->srch_inf.srch_entries_start; + first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry + - cifsFile->srch_inf.entries_in_buffer; + pos_in_buf = index_to_find - first_entry_in_buffer; + cFYI(1, "found entry - pos_in_buf %d", pos_in_buf); + + for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) { + /* go entry by entry figuring out which is first */ + current_entry = nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); + } + if ((current_entry == NULL) && (i < pos_in_buf)) { + /* BB fixme - check if we should flag this error */ + cERROR(1, "reached end of buf searching for pos in buf" + " %d index to find %lld rc %d", + pos_in_buf, index_to_find, rc); + } + rc = 0; + *ppCurrentEntry = current_entry; + } else { + cFYI(1, "index not in buffer - could not findnext into it"); + return 0; + } + + if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) { + cFYI(1, "can not return entries pos_in_buf beyond last"); + *num_to_ret = 0; + } else + *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf; + + return rc; +} + +/* inode num, inode type and filename returned */ +static int cifs_get_name_from_search_buf(struct qstr *pqst, + char *current_entry, __u16 level, unsigned int unicode, + struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum) +{ + int rc = 0; + unsigned int len = 0; + char *filename; + struct nls_table *nlt = cifs_sb->local_nls; + + *pinum = 0; + + if (level == SMB_FIND_FILE_UNIX) { + FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + + filename = &pFindData->FileName[0]; + if (unicode) { + len = cifs_unicode_bytelen(filename); + } else { + /* BB should we make this strnlen of PATH_MAX? 
*/ + len = strnlen(filename, PATH_MAX); + } + + *pinum = le64_to_cpu(pFindData->basic.UniqueId); + } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { + FILE_DIRECTORY_INFO *pFindData = + (FILE_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { + FILE_FULL_DIRECTORY_INFO *pFindData = + (FILE_FULL_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { + SEARCH_ID_FULL_DIR_INFO *pFindData = + (SEARCH_ID_FULL_DIR_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + *pinum = le64_to_cpu(pFindData->UniqueId); + } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { + FILE_BOTH_DIRECTORY_INFO *pFindData = + (FILE_BOTH_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + } else if (level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + /* one byte length, no name conversion */ + len = (unsigned int)pFindData->FileNameLength; + } else { + cFYI(1, "Unknown findfirst level %d", level); + return -EINVAL; + } + + if (len > max_len) { + cERROR(1, "bad search response length %d past smb end", len); + return -EINVAL; + } + + if (unicode) { + pqst->len = cifs_from_ucs2((char *) pqst->name, + (__le16 *) filename, + UNICODE_NAME_MAX, + min(len, max_len), nlt, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + pqst->len -= nls_nullsize(nlt); + } else { + pqst->name = filename; + pqst->len = len; + } + return rc; +} + +static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, + void *direntry, char *scratch_buf, unsigned int max_len) +{ + int rc = 0; + struct qstr qstring; + struct cifsFileInfo *pCifsF; + u64 inum; + ino_t ino; + struct super_block *sb; + struct cifs_sb_info *cifs_sb; + struct dentry *tmp_dentry; + struct cifs_fattr fattr; + + /* get filename and len into qstring */ + /* get dentry */ + /* decide whether to create and populate ionde */ + if ((direntry == NULL) || (file == NULL)) + return -EINVAL; + + pCifsF = file->private_data; + + if ((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL)) + return -ENOENT; + + rc = cifs_entry_is_dot(pfindEntry, pCifsF); + /* skip . and .. 
since we added them first */ + if (rc != 0) + return 0; + + sb = file->f_path.dentry->d_sb; + cifs_sb = CIFS_SB(sb); + + qstring.name = scratch_buf; + rc = cifs_get_name_from_search_buf(&qstring, pfindEntry, + pCifsF->srch_inf.info_level, + pCifsF->srch_inf.unicode, cifs_sb, + max_len, &inum /* returned */); + + if (rc) + return rc; + + if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) + cifs_unix_basic_to_fattr(&fattr, + &((FILE_UNIX_INFO *) pfindEntry)->basic, + cifs_sb); + else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) + cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *) + pfindEntry, cifs_sb); + else + cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) + pfindEntry, cifs_sb); + + if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + fattr.cf_uniqueid = inum; + } else { + fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } + + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && + CIFSCouldBeMFSymlink(&fattr)) + /* + * trying to get the type and mode can be slow, + * so just call those regular files for now, and mark + * for reval + */ + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; + + ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); + tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); + + rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, + ino, fattr.cf_dtype); + + dput(tmp_dentry); + return rc; +} + + +int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) +{ + int rc = 0; + int xid, i; + struct cifs_tcon *pTcon; + struct cifsFileInfo *cifsFile = NULL; + char *current_entry; + int num_to_fill = 0; + char *tmp_buf = NULL; + char *end_of_smb; + unsigned int max_len; + + xid = GetXid(); + + /* + * Ensure FindFirst doesn't fail before doing filldir() for '.' and + * '..'. Otherwise we won't be able to notify VFS in case of failure. + */ + if (file->private_data == NULL) { + rc = initiate_cifs_search(xid, file); + cFYI(1, "initiate cifs search rc %d", rc); + if (rc) + goto rddir2_exit; + } + + switch ((int) file->f_pos) { + case 0: + if (filldir(direntry, ".", 1, file->f_pos, + file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) { + cERROR(1, "Filldir for current dir failed"); + rc = -ENOMEM; + break; + } + file->f_pos++; + case 1: + if (filldir(direntry, "..", 2, file->f_pos, + file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { + cERROR(1, "Filldir for parent dir failed"); + rc = -ENOMEM; + break; + } + file->f_pos++; + default: + /* 1) If search is active, + is in current search buffer? 
+ if it before then restart search + if after then keep searching till find it */ + + if (file->private_data == NULL) { + rc = -EINVAL; + FreeXid(xid); + return rc; + } + cifsFile = file->private_data; + if (cifsFile->srch_inf.endOfSearch) { + if (cifsFile->srch_inf.emptyDir) { + cFYI(1, "End of search, empty dir"); + rc = 0; + break; + } + } /* else { + cifsFile->invalidHandle = true; + CIFSFindClose(xid, pTcon, cifsFile->netfid); + } */ + + pTcon = tlink_tcon(cifsFile->tlink); + rc = find_cifs_entry(xid, pTcon, file, + ¤t_entry, &num_to_fill); + if (rc) { + cFYI(1, "fce error %d", rc); + goto rddir2_exit; + } else if (current_entry != NULL) { + cFYI(1, "entry %lld found", file->f_pos); + } else { + cFYI(1, "could not find entry"); + goto rddir2_exit; + } + cFYI(1, "loop through %d times filling dir for net buf %p", + num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); + max_len = smbCalcSize((struct smb_hdr *) + cifsFile->srch_inf.ntwrk_buf_start); + end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; + + tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); + if (tmp_buf == NULL) { + rc = -ENOMEM; + break; + } + + for (i = 0; (i < num_to_fill) && (rc == 0); i++) { + if (current_entry == NULL) { + /* evaluate whether this case is an error */ + cERROR(1, "past SMB end, num to fill %d i %d", + num_to_fill, i); + break; + } + /* if buggy server returns . and .. late do + we want to check for that here? */ + rc = cifs_filldir(current_entry, file, + filldir, direntry, tmp_buf, max_len); + if (rc == -EOVERFLOW) { + rc = 0; + break; + } + + file->f_pos++; + if (file->f_pos == + cifsFile->srch_inf.index_of_last_entry) { + cFYI(1, "last entry in buf at pos %lld %s", + file->f_pos, tmp_buf); + cifs_save_resume_key(current_entry, cifsFile); + break; + } else + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); + } + kfree(tmp_buf); + break; + } /* end switch */ + +rddir2_exit: + FreeXid(xid); + return rc; +} diff --git a/fs/cifs/rfc1002pdu.h b/fs/cifs/rfc1002pdu.h new file mode 100644 index 00000000..8b69fcce --- /dev/null +++ b/fs/cifs/rfc1002pdu.h @@ -0,0 +1,74 @@ +/* + * fs/cifs/rfc1002pdu.h + * + * Protocol Data Unit definitions for RFC 1001/1002 support + * + * Copyright (c) International Business Machines Corp., 2004 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* NB: unlike smb/cifs packets, the RFC1002 structures are big endian */ + + /* RFC 1002 session packet types */ +#define RFC1002_SESSION_MESSAGE 0x00 +#define RFC1002_SESSION_REQUEST 0x81 +#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82 +#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83 +#define RFC1002_RETARGET_SESSION_RESPONSE 0x84 +#define RFC1002_SESSION_KEEP_ALIVE 0x85 + + /* RFC 1002 flags (only one defined */ +#define RFC1002_LENGTH_EXTEND 0x80 /* high order bit of length (ie +64K) */ + +struct rfc1002_session_packet { + __u8 type; + __u8 flags; + __u16 length; + union { + struct { + __u8 called_len; + __u8 called_name[32]; + __u8 scope1; /* null */ + __u8 calling_len; + __u8 calling_name[32]; + __u8 scope2; /* null */ + } __attribute__((packed)) session_req; + struct { + __u32 retarget_ip_addr; + __u16 port; + } __attribute__((packed)) retarget_resp; + __u8 neg_ses_resp_error_code; + /* POSITIVE_SESSION_RESPONSE packet does not include trailer. + SESSION_KEEP_ALIVE packet also does not include a trailer. + Trailer for the SESSION_MESSAGE packet is SMB/CIFS header */ + } __attribute__((packed)) trailer; +} __attribute__((packed)); + +/* Negative Session Response error codes */ +#define RFC1002_NOT_LISTENING_CALLED 0x80 /* not listening on called name */ +#define RFC1002_NOT_LISTENING_CALLING 0x81 /* not listening on calling name */ +#define RFC1002_NOT_PRESENT 0x82 /* called name not present */ +#define RFC1002_INSUFFICIENT_RESOURCE 0x83 +#define RFC1002_UNSPECIFIED_ERROR 0x8F + +/* RFC 1002 Datagram service packets are not defined here as they +are not needed for the network filesystem client unless we plan on +implementing broadcast resolution of the server ip address (from +server netbios name). Currently server names are resolved only via DNS +(tcp name) or ip address or an /etc/hosts equivalent mapping to ip address.*/ + +#define DEFAULT_CIFS_CALLED_NAME "*SMBSERVER " diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c new file mode 100644 index 00000000..0cfae191 --- /dev/null +++ b/fs/cifs/sess.c @@ -0,0 +1,952 @@ +/* + * fs/cifs/sess.c + * + * SMB/CIFS session setup handling routines + * + * Copyright (c) International Business Machines Corp., 2006, 2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "ntlmssp.h" +#include "nterr.h" +#include +#include +#include "cifs_spnego.h" + +/* + * Checks if this is the first smb session to be reconnected after + * the socket has been reestablished (so we know whether to use vc 0). + * Called while holding the cifs_tcp_ses_lock, so do not block + */ +static bool is_first_ses_reconnect(struct cifs_ses *ses) +{ + struct list_head *tmp; + struct cifs_ses *tmp_ses; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifs_ses, + smb_ses_list); + if (tmp_ses->need_reconnect == false) + return false; + } + /* could not find a session that was already connected, + this must be the first one we are reconnecting */ + return true; +} + +/* + * vc number 0 is treated specially by some servers, and should be the + * first one we request. After that we can use vcnumbers up to maxvcs, + * one for each smb session (some Windows versions set maxvcs incorrectly + * so maxvc=1 can be ignored). If we have too many vcs, we can reuse + * any vc but zero (some servers reset the connection on vcnum zero) + * + */ +static __le16 get_next_vcnum(struct cifs_ses *ses) +{ + __u16 vcnum = 0; + struct list_head *tmp; + struct cifs_ses *tmp_ses; + __u16 max_vcs = ses->server->max_vcs; + __u16 i; + int free_vc_found = 0; + + /* Quoting the MS-SMB specification: "Windows-based SMB servers set this + field to one but do not enforce this limit, which allows an SMB client + to establish more virtual circuits than allowed by this value ... but + other server implementations can enforce this limit." */ + if (max_vcs < 2) + max_vcs = 0xFFFF; + + spin_lock(&cifs_tcp_ses_lock); + if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) + goto get_vc_num_exit; /* vcnum will be zero */ + for (i = ses->server->srv_count - 1; i < max_vcs; i++) { + if (i == 0) /* this is the only connection, use vc 0 */ + break; + + free_vc_found = 1; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifs_ses, + smb_ses_list); + if (tmp_ses->vcnum == i) { + free_vc_found = 0; + break; /* found duplicate, try next vcnum */ + } + } + if (free_vc_found) + break; /* we found a vcnumber that will work - use it */ + } + + if (i == 0) + vcnum = 0; /* for most common case, ie if one smb session, use + vc zero. Also for case when no free vcnum, zero + is safest to send (some clients only send zero) */ + else if (free_vc_found == 0) + vcnum = 1; /* we can not reuse vc=0 safely, since some servers + reset all uids on that, but 1 is ok. 
*/ + else + vcnum = i; + ses->vcnum = vcnum; +get_vc_num_exit: + spin_unlock(&cifs_tcp_ses_lock); + + return cpu_to_le16(vcnum); +} + +static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) +{ + __u32 capabilities = 0; + + /* init fields common to all four types of SessSetup */ + /* Note that offsets for first seven fields in req struct are same */ + /* in CIFS Specs so does not matter which of 3 forms of struct */ + /* that we use in next few lines */ + /* Note that header is initialized to zero in header_assemble */ + pSMB->req.AndXCommand = 0xFF; + pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); + pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + pSMB->req.VcNumber = get_next_vcnum(ses); + + /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ + + /* BB verify whether signing required on neg or just on auth frame + (and NTLM case) */ + + capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | + CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; + + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + if (ses->capabilities & CAP_UNICODE) { + pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE; + capabilities |= CAP_UNICODE; + } + if (ses->capabilities & CAP_STATUS32) { + pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS; + capabilities |= CAP_STATUS32; + } + if (ses->capabilities & CAP_DFS) { + pSMB->req.hdr.Flags2 |= SMBFLG2_DFS; + capabilities |= CAP_DFS; + } + if (ses->capabilities & CAP_UNIX) + capabilities |= CAP_UNIX; + + return capabilities; +} + +static void +unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* Copy OS version */ + bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32, + nls_cp); + bcc_ptr += 2 * bytes_ret; + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release, + 32, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* trailing null */ + + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, + 32, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* trailing null */ + + *pbcc_area = bcc_ptr; +} + +static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* copy domain */ + if (ses->domainName == NULL) { + /* Sending null domain better than using a bogus domain name (as + we did briefly in 2.6.18) since server will use its default */ + *bcc_ptr = 0; + *(bcc_ptr+1) = 0; + bytes_ret = 0; + } else + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName, + 256, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* account for null terminator */ + + *pbcc_area = bcc_ptr; +} + + +static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* BB FIXME add check that strings total less + than 335 or will need to send them as arrays */ + + /* unicode strings, must be word aligned before the call */ +/* if ((long) bcc_ptr % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } */ + /* copy user */ + if (ses->user_name == NULL) { + /* null user mount */ + *bcc_ptr = 0; + *(bcc_ptr+1) = 0; + } else { + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name, + MAX_USERNAME_SIZE, nls_cp); + } + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* account for null termination */ + + unicode_domain_string(&bcc_ptr, ses, nls_cp); 
+ unicode_oslm_strings(&bcc_ptr, nls_cp); + + *pbcc_area = bcc_ptr; +} + +static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + + /* copy user */ + /* BB what about null user mounts - check that we do this BB */ + /* copy user */ + if (ses->user_name != NULL) { + strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); + bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); + } + /* else null user mount */ + *bcc_ptr = 0; + bcc_ptr++; /* account for null termination */ + + /* copy domain */ + if (ses->domainName != NULL) { + strncpy(bcc_ptr, ses->domainName, 256); + bcc_ptr += strnlen(ses->domainName, 256); + } /* else we will send a null domain name + so the server will default to its own domain */ + *bcc_ptr = 0; + bcc_ptr++; + + /* BB check for overflow here */ + + strcpy(bcc_ptr, "Linux version "); + bcc_ptr += strlen("Linux version "); + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; + + strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); + bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + + *pbcc_area = bcc_ptr; +} + +static void +decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int len; + char *data = *pbcc_area; + + cFYI(1, "bleft %d", bleft); + + kfree(ses->serverOS); + ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + cFYI(1, "serverOS=%s", ses->serverOS); + len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; + data += len; + bleft -= len; + if (bleft <= 0) + return; + + kfree(ses->serverNOS); + ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + cFYI(1, "serverNOS=%s", ses->serverNOS); + len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; + data += len; + bleft -= len; + if (bleft <= 0) + return; + + kfree(ses->serverDomain); + ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + cFYI(1, "serverDomain=%s", ses->serverDomain); + + return; +} + +static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, + struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + int len; + char *bcc_ptr = *pbcc_area; + + cFYI(1, "decode sessetup ascii. bleft %d", bleft); + + len = strnlen(bcc_ptr, bleft); + if (len >= bleft) + return rc; + + kfree(ses->serverOS); + + ses->serverOS = kzalloc(len + 1, GFP_KERNEL); + if (ses->serverOS) + strncpy(ses->serverOS, bcc_ptr, len); + if (strncmp(ses->serverOS, "OS/2", 4) == 0) { + cFYI(1, "OS/2 server"); + ses->flags |= CIFS_SES_OS2; + } + + bcc_ptr += len + 1; + bleft -= len + 1; + + len = strnlen(bcc_ptr, bleft); + if (len >= bleft) + return rc; + + kfree(ses->serverNOS); + + ses->serverNOS = kzalloc(len + 1, GFP_KERNEL); + if (ses->serverNOS) + strncpy(ses->serverNOS, bcc_ptr, len); + + bcc_ptr += len + 1; + bleft -= len + 1; + + len = strnlen(bcc_ptr, bleft); + if (len > bleft) + return rc; + + /* No domain field in LANMAN case. 
Domain is + returned by old servers in the SMB negprot response */ + /* BB For newer servers which do not support Unicode, + but thus do return domain here we could add parsing + for it later, but it is not very important */ + cFYI(1, "ascii: bytes left %d", bleft); + + return rc; +} + +static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, + struct cifs_ses *ses) +{ + unsigned int tioffset; /* challenge message target info area */ + unsigned int tilen; /* challenge message target info area length */ + + CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; + + if (blob_len < sizeof(CHALLENGE_MESSAGE)) { + cERROR(1, "challenge blob len %d too small", blob_len); + return -EINVAL; + } + + if (memcmp(pblob->Signature, "NTLMSSP", 8)) { + cERROR(1, "blob signature incorrect %s", pblob->Signature); + return -EINVAL; + } + if (pblob->MessageType != NtLmChallenge) { + cERROR(1, "Incorrect message type %d", pblob->MessageType); + return -EINVAL; + } + + memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); + /* BB we could decode pblob->NegotiateFlags; some may be useful */ + /* In particular we can examine sign flags */ + /* BB spec says that if AvId field of MsvAvTimestamp is populated then + we must set the MIC field of the AUTHENTICATE_MESSAGE */ + ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); + tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); + tilen = le16_to_cpu(pblob->TargetInfoArray.Length); + if (tilen) { + ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "Challenge target info allocation failure"); + return -ENOMEM; + } + memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen); + ses->auth_key.len = tilen; + } + + return 0; +} + +/* BB Move to ntlmssp.c eventually */ + +/* We do not malloc the blob, it is passed in pbuffer, because + it is fixed size, and small, making this approach cleaner */ +static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, + struct cifs_ses *ses) +{ + NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; + __u32 flags; + + memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE)); + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmNegotiate; + + /* BB is NTLMV2 session security format easier to use here? */ + flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + flags |= NTLMSSP_NEGOTIATE_SIGN; + if (!ses->server->session_estab) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + } + + sec_blob->NegotiateFlags = cpu_to_le32(flags); + + sec_blob->WorkstationName.BufferOffset = 0; + sec_blob->WorkstationName.Length = 0; + sec_blob->WorkstationName.MaximumLength = 0; + + /* Domain name is sent on the Challenge not Negotiate NTLMSSP request */ + sec_blob->DomainName.BufferOffset = 0; + sec_blob->DomainName.Length = 0; + sec_blob->DomainName.MaximumLength = 0; +} + +/* We do not malloc the blob, it is passed in pbuffer, because its + maximum possible size is fixed and small, making this approach cleaner. 
+ This function returns the length of the data in the blob */ +static int build_ntlmssp_auth_blob(unsigned char *pbuffer, + u16 *buflen, + struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc; + AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; + __u32 flags; + unsigned char *tmp; + + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmAuthenticate; + + flags = NTLMSSP_NEGOTIATE_56 | + NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; + if (ses->server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + flags |= NTLMSSP_NEGOTIATE_SIGN; + if (!ses->server->session_estab) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + } + + tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); + sec_blob->NegotiateFlags = cpu_to_le32(flags); + + sec_blob->LmChallengeResponse.BufferOffset = + cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); + sec_blob->LmChallengeResponse.Length = 0; + sec_blob->LmChallengeResponse.MaximumLength = 0; + + sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); + rc = setup_ntlmv2_rsp(ses, nls_cp); + if (rc) { + cERROR(1, "Error %d during NTLMSSP authentication", rc); + goto setup_ntlmv2_ret; + } + memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + sec_blob->NtChallengeResponse.Length = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + sec_blob->NtChallengeResponse.MaximumLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->domainName == NULL) { + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.Length = 0; + sec_blob->DomainName.MaximumLength = 0; + tmp += 2; + } else { + int len; + len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, + MAX_USERNAME_SIZE, nls_cp); + len *= 2; /* unicode is 2 bytes each */ + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.Length = cpu_to_le16(len); + sec_blob->DomainName.MaximumLength = cpu_to_le16(len); + tmp += len; + } + + if (ses->user_name == NULL) { + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.Length = 0; + sec_blob->UserName.MaximumLength = 0; + tmp += 2; + } else { + int len; + len = cifs_strtoUCS((__le16 *)tmp, ses->user_name, + MAX_USERNAME_SIZE, nls_cp); + len *= 2; /* unicode is 2 bytes each */ + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.Length = cpu_to_le16(len); + sec_blob->UserName.MaximumLength = cpu_to_le16(len); + tmp += len; + } + + sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->WorkstationName.Length = 0; + sec_blob->WorkstationName.MaximumLength = 0; + tmp += 2; + + if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) || + (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) + && !calc_seckey(ses)) { + memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.MaximumLength = + cpu_to_le16(CIFS_CPHTXT_SIZE); + tmp += CIFS_CPHTXT_SIZE; + } else { + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = 0; + sec_blob->SessionKey.MaximumLength = 0; + } + +setup_ntlmv2_ret: + *buflen = tmp - pbuffer; + return rc; +} + 
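+/*
+ * Rough sketch of the RawNTLMSSP session setup flow that CIFS_SessSetup()
+ * below drives using the blob builders above (server side summarized here on
+ * the assumption that it follows the usual NTLMSSP exchange):
+ *
+ *   1. send SESSION_SETUP_ANDX carrying the NEGOTIATE_MESSAGE built by
+ *      build_ntlmssp_negotiate_blob()
+ *   2. the server answers NT_STATUS_MORE_PROCESSING_REQUIRED with a
+ *      CHALLENGE_MESSAGE; decode_ntlmssp_challenge() stashes the cryptkey,
+ *      negotiate flags and target info in ses->ntlmssp and ses->auth_key
+ *   3. send a second SESSION_SETUP_ANDX carrying the AUTHENTICATE_MESSAGE
+ *      from build_ntlmssp_auth_blob(), which embeds the NTLMv2 response
+ *      computed by setup_ntlmv2_rsp(); this request echoes the Uid the
+ *      server handed back on the challenge response
+ *   4. a success response completes the session setup
+ */
+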
+int +CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + int wct; + struct smb_hdr *smb_buf; + char *bcc_ptr; + char *str_area; + SESSION_SETUP_ANDX *pSMB; + __u32 capabilities; + __u16 count; + int resp_buf_type; + struct kvec iov[3]; + enum securityEnum type; + __u16 action, bytes_remaining; + struct key *spnego_key = NULL; + __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ + u16 blob_len; + char *ntlmsspblob = NULL; + + if (ses == NULL) + return -EINVAL; + + type = ses->server->secType; + cFYI(1, "sess setup type %d", type); + if (type == RawNTLMSSP) { + /* if memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) + return -ENOMEM; + } + +ssetup_ntlmssp_authenticate: + if (phase == NtLmChallenge) + phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ + + if (type == LANMAN) { +#ifndef CONFIG_CIFS_WEAK_PW_HASH + /* LANMAN and plaintext are less secure and off by default. + So we make this explicitly be turned on in kconfig (in the + build) and turned on at runtime (changed from the default) + in proc/fs/cifs or via mount parm. Unfortunately this is + needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ + return -EOPNOTSUPP; +#endif + wct = 10; /* lanman 2 style sessionsetup */ + } else if ((type == NTLM) || (type == NTLMv2)) { + /* For NTLMv2 failures eventually may need to retry NTLM */ + wct = 13; /* old style NTLM sessionsetup */ + } else /* same size: negotiate or auth, NTLMSSP or extended security */ + wct = 12; + + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf); + if (rc) + return rc; + + pSMB = (SESSION_SETUP_ANDX *)smb_buf; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + + /* we will send the SMB in three pieces: + a fixed length beginning part, an optional + SPNEGO blob (which can be zero length), and a + last part which will include the strings + and rest of bcc area. This allows us to avoid + a large buffer 17K allocation */ + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; + + /* setting this here allows the code at the end of the function + to free the request buffer if there's an error */ + resp_buf_type = CIFS_SMALL_BUFFER; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + str_area = kmalloc(2000, GFP_KERNEL); + if (str_area == NULL) { + rc = -ENOMEM; + goto ssetup_exit; + } + bcc_ptr = str_area; + + ses->flags &= ~CIFS_SES_LANMAN; + + iov[1].iov_base = NULL; + iov[1].iov_len = 0; + + if (type == LANMAN) { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + + /* no capabilities flags in old lanman negotiation */ + + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key); + + ses->flags |= CIFS_SES_LANMAN; + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + /* can not sign if LANMAN negotiated so no need + to calculate signing key? 
but what if server + changed to do higher than lanman dialect and + we reconnected would we ever calc signing_key? */ + + cFYI(1, "Negotiating LANMAN setting up strings"); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); +#endif + } else if (type == NTLM) { + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses); + if (rc) { + cERROR(1, "Error %d during NTLM authentication", rc); + goto ssetup_exit; + } + + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if (iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else if (type == NTLMv2) { + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + + /* LM2 password would be here if we supported it */ + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, nls_cp); + if (rc) { + cERROR(1, "Error %d during NTLMv2 authentication", rc); + goto ssetup_exit; + } + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. 
+ */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->capabilities & CAP_UNICODE) { + if (iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else if (type == Kerberos) { +#ifdef CONFIG_CIFS_UPCALL + struct cifs_spnego_msg *msg; + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto ssetup_exit; + } + + msg = spnego_key->payload.data; + /* check version field to make sure that cifs.upcall is + sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cERROR(1, "incorrect version of cifs.upcall (expected" + " %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto ssetup_exit; + } + + ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto ssetup_exit; + } + memcpy(ses->auth_key.response, msg->data, msg->sesskey_len); + ses->auth_key.len = msg->sesskey_len; + + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + iov[1].iov_base = msg->data + msg->sesskey_len; + iov[1].iov_len = msg->secblob_len; + pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len); + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if ((iov[0].iov_len + iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, nls_cp); + unicode_domain_string(&bcc_ptr, ses, nls_cp); + } else + /* BB: is this right? */ + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); +#else /* ! CONFIG_CIFS_UPCALL */ + cERROR(1, "Kerberos negotiated but upcall support disabled!"); + rc = -ENOSYS; + goto ssetup_exit; +#endif /* CONFIG_CIFS_UPCALL */ + } else if (type == RawNTLMSSP) { + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cERROR(1, "NTLMSSP requires Unicode support"); + rc = -ENOSYS; + goto ssetup_exit; + } + + cFYI(1, "ntlmssp session setup phase %d", phase); + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + switch(phase) { + case NtLmNegotiate: + build_ntlmssp_negotiate_blob( + pSMB->req.SecurityBlob, ses); + iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = + cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + break; + case NtLmAuthenticate: + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av paris, + * domain, user, workstation names, flags, etc. 
+ */ + ntlmsspblob = kzalloc( + 5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + cERROR(1, "Can't allocate NTLMSSP blob"); + rc = -ENOMEM; + goto ssetup_exit; + } + + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, nls_cp); + if (rc) + goto ssetup_exit; + iov[1].iov_len = blob_len; + iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + break; + default: + cERROR(1, "invalid phase %d", phase); + rc = -ENOSYS; + goto ssetup_exit; + } + /* unicode strings must be word aligned */ + if ((iov[0].iov_len + iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, nls_cp); + } else { + cERROR(1, "secType %d not supported!", type); + rc = -ENOSYS; + goto ssetup_exit; + } + + iov[2].iov_base = str_area; + iov[2].iov_len = (long) bcc_ptr - (long) str_area; + + count = iov[1].iov_len + iov[2].iov_len; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + + put_bcc(count, smb_buf); + + rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, + CIFS_LOG_ERROR); + /* SMB request buf freed in SendReceive2 */ + + pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; + smb_buf = (struct smb_hdr *)iov[0].iov_base; + + if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError == + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { + if (phase != NtLmNegotiate) { + cERROR(1, "Unexpected more processing error"); + goto ssetup_exit; + } + /* NTLMSSP Negotiate sent now processing challenge (response) */ + phase = NtLmChallenge; /* process ntlmssp challenge */ + rc = 0; /* MORE_PROC rc is not an error here, but expected */ + } + if (rc) + goto ssetup_exit; + + if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { + rc = -EIO; + cERROR(1, "bad word count %d", smb_buf->WordCount); + goto ssetup_exit; + } + action = le16_to_cpu(pSMB->resp.Action); + if (action & GUEST_LOGIN) + cFYI(1, "Guest login"); /* BB mark SesInfo struct? 
*/ + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cFYI(1, "UID = %d ", ses->Suid); + /* response can have either 3 or 4 word count - Samba sends 3 */ + /* and lanman response is 3 */ + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + if (smb_buf->WordCount == 4) { + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cERROR(1, "bad security blob length %d", blob_len); + rc = -EINVAL; + goto ssetup_exit; + } + if (phase == NtLmChallenge) { + rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); + /* now goto beginning for ntlmssp authenticate phase */ + if (rc) + goto ssetup_exit; + } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; + } + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + } else { + rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, + ses, nls_cp); + } + +ssetup_exit: + if (spnego_key) { + key_revoke(spnego_key); + key_put(spnego_key); + } + kfree(str_area); + kfree(ntlmsspblob); + ntlmsspblob = NULL; + if (resp_buf_type == CIFS_SMALL_BUFFER) { + cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base); + cifs_small_buf_release(iov[0].iov_base); + } else if (resp_buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); + + /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ + if ((phase == NtLmChallenge) && (rc == 0)) + goto ssetup_ntlmssp_authenticate; + + return rc; +} diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c new file mode 100644 index 00000000..1c5b770c --- /dev/null +++ b/fs/cifs/smbencrypt.c @@ -0,0 +1,401 @@ +/* + Unix SMB/Netbios implementation. + Version 1.9. + SMB parameters and setup + Copyright (C) Andrew Tridgell 1992-2000 + Copyright (C) Luke Kenneth Casson Leighton 1996-2000 + Modified by Jeremy Allison 1995. + Copyright (C) Andrew Bartlett 2002-2003 + Modified by Steve French (sfrench@us.ibm.com) 2002-2003 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#include +#include +#include +#include +#include +#include +#include "cifs_unicode.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifsproto.h" + +#ifndef false +#define false 0 +#endif +#ifndef true +#define true 1 +#endif + +/* following came from the other byteorder.h to avoid include conflicts */ +#define CVAL(buf,pos) (((unsigned char *)(buf))[pos]) +#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) +#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) + +static void +str_to_key(unsigned char *str, unsigned char *key) +{ + int i; + + key[0] = str[0] >> 1; + key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2); + key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3); + key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4); + key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5); + key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6); + key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7); + key[7] = str[6] & 0x7F; + for (i = 0; i < 8; i++) + key[i] = (key[i] << 1); +} + +static int +smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) +{ + int rc; + unsigned char key2[8]; + struct crypto_blkcipher *tfm_des; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + + str_to_key(key, key2); + + tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_des)) { + rc = PTR_ERR(tfm_des); + cERROR(1, "could not allocate des crypto API\n"); + goto smbhash_err; + } + + desc.tfm = tfm_des; + + crypto_blkcipher_setkey(tfm_des, key2, 8); + + sg_init_one(&sgin, in, 8); + sg_init_one(&sgout, out, 8); + + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); + if (rc) + cERROR(1, "could not encrypt crypt key rc: %d\n", rc); + + crypto_free_blkcipher(tfm_des); +smbhash_err: + return rc; +} + +static int +E_P16(unsigned char *p14, unsigned char *p16) +{ + int rc; + unsigned char sp8[8] = + { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 }; + + rc = smbhash(p16, sp8, p14); + if (rc) + return rc; + rc = smbhash(p16 + 8, sp8, p14 + 7); + return rc; +} + +static int +E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24) +{ + int rc; + + rc = smbhash(p24, c8, p21); + if (rc) + return rc; + rc = smbhash(p24 + 8, c8, p21 + 7); + if (rc) + return rc; + rc = smbhash(p24 + 16, c8, p21 + 14); + return rc; +} + +/* produce a md4 message digest from data of length n bytes */ +int +mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) +{ + int rc; + unsigned int size; + struct crypto_shash *md4; + struct sdesc *sdescmd4; + + md4 = crypto_alloc_shash("md4", 0, 0); + if (IS_ERR(md4)) { + rc = PTR_ERR(md4); + cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc); + return rc; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); + sdescmd4 = kmalloc(size, GFP_KERNEL); + if (!sdescmd4) { + rc = -ENOMEM; + cERROR(1, "%s: Memory allocation failure\n", __func__); + goto mdfour_err; + } + sdescmd4->shash.tfm = md4; + sdescmd4->shash.flags = 0x0; + + rc = crypto_shash_init(&sdescmd4->shash); + if (rc) { + cERROR(1, "%s: Could not init md4 shash\n", __func__); + goto mdfour_err; + } + crypto_shash_update(&sdescmd4->shash, link_str, link_len); + rc = crypto_shash_final(&sdescmd4->shash, md4_hash); + +mdfour_err: + crypto_free_shash(md4); + kfree(sdescmd4); + + return rc; +} + +/* + This implements the X/Open SMB password encryption + It takes a password, a 8 byte "crypt key" and puts 24 bytes of + encrypted password into p24 */ +/* Note that password must be 
uppercased and null terminated */ +int +SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) +{ + int rc; + unsigned char p14[14], p16[16], p21[21]; + + memset(p14, '\0', 14); + memset(p16, '\0', 16); + memset(p21, '\0', 21); + + memcpy(p14, passwd, 14); + rc = E_P16(p14, p16); + if (rc) + return rc; + + memcpy(p21, p16, 16); + rc = E_P24(p21, c8, p24); + + return rc; +} + +/* Routines for Windows NT MD4 Hash functions. */ +static int +_my_wcslen(__u16 *str) +{ + int len = 0; + while (*str++ != 0) + len++; + return len; +} + +/* + * Convert a string into an NT UNICODE string. + * Note that regardless of processor type + * this must be in intel (little-endian) + * format. + */ + +static int +_my_mbstowcs(__u16 *dst, const unsigned char *src, int len) +{ /* BB not a very good conversion routine - change/fix */ + int i; + __u16 val; + + for (i = 0; i < len; i++) { + val = *src; + SSVAL(dst, 0, val); + dst++; + src++; + if (val == 0) + break; + } + return i; +} + +/* + * Creates the MD4 Hash of the users password in NT UNICODE. + */ + +int +E_md4hash(const unsigned char *passwd, unsigned char *p16) +{ + int rc; + int len; + __u16 wpwd[129]; + + /* Password cannot be longer than 128 characters */ + if (passwd) { + len = strlen((char *) passwd); + if (len > 128) + len = 128; + + /* Password must be converted to NT unicode */ + _my_mbstowcs(wpwd, passwd, len); + } else + len = 0; + + wpwd[len] = 0; /* Ensure string is null terminated */ + /* Calculate length in bytes */ + len = _my_wcslen(wpwd) * sizeof(__u16); + + rc = mdfour(p16, (unsigned char *) wpwd, len); + memset(wpwd, 0, 129 * 2); + + return rc; +} + +#if 0 /* currently unused */ +/* Does both the NT and LM owfs of a user's password */ +static void +nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]) +{ + char passwd[514]; + + memset(passwd, '\0', 514); + if (strlen(pwd) < 513) + strcpy(passwd, pwd); + else + memcpy(passwd, pwd, 512); + /* Calculate the MD4 hash (NT compatible) of the password */ + memset(nt_p16, '\0', 16); + E_md4hash(passwd, nt_p16); + + /* Mangle the passwords into Lanman format */ + passwd[14] = '\0'; +/* strupper(passwd); */ + + /* Calculate the SMB (lanman) hash functions of the password */ + + memset(p16, '\0', 16); + E_P16((unsigned char *) passwd, (unsigned char *) p16); + + /* clear out local copy of user's password (just being paranoid). 
*/ + memset(passwd, '\0', sizeof(passwd)); +} +#endif + +/* Does the NTLMv2 owfs of a user's password */ +#if 0 /* function not needed yet - but will be soon */ +static void +ntv2_owf_gen(const unsigned char owf[16], const char *user_n, + const char *domain_n, unsigned char kr_buf[16], + const struct nls_table *nls_codepage) +{ + wchar_t *user_u; + wchar_t *dom_u; + int user_l, domain_l; + struct HMACMD5Context ctx; + + /* might as well do one alloc to hold both (user_u and dom_u) */ + user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL); + if (user_u == NULL) + return; + dom_u = user_u + 1024; + + /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, + STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); + push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, + STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */ + + /* BB user and domain may need to be uppercased */ + user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage); + domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage); + + user_l++; /* trailing null */ + domain_l++; + + hmac_md5_init_limK_to_64(owf, 16, &ctx); + hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx); + hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx); + hmac_md5_final(kr_buf, &ctx); + + kfree(user_u); +} +#endif + +/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ +#if 0 /* currently unused */ +static void +NTLMSSPOWFencrypt(unsigned char passwd[8], + unsigned char *ntlmchalresp, unsigned char p24[24]) +{ + unsigned char p21[21]; + + memset(p21, '\0', 21); + memcpy(p21, passwd, 8); + memset(p21 + 8, 0xbd, 8); + + E_P24(p21, ntlmchalresp, p24); +} +#endif + +/* Does the NT MD4 hash then des encryption. */ +int +SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) +{ + int rc; + unsigned char p16[16], p21[21]; + + memset(p16, '\0', 16); + memset(p21, '\0', 21); + + rc = E_md4hash(passwd, p16); + if (rc) { + cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + return rc; + } + memcpy(p21, p16, 16); + rc = E_P24(p21, c8, p24); + return rc; +} + + +/* Does the md5 encryption from the NT hash for NTLMv2. 
*/ +/* These routines will be needed later */ +#if 0 +static void +SMBOWFencrypt_ntv2(const unsigned char kr[16], + const struct data_blob *srv_chal, + const struct data_blob *cli_chal, unsigned char resp_buf[16]) +{ + struct HMACMD5Context ctx; + + hmac_md5_init_limK_to_64(kr, 16, &ctx); + hmac_md5_update(srv_chal->data, srv_chal->length, &ctx); + hmac_md5_update(cli_chal->data, cli_chal->length, &ctx); + hmac_md5_final(resp_buf, &ctx); +} + +static void +SMBsesskeygen_ntv2(const unsigned char kr[16], + const unsigned char *nt_resp, __u8 sess_key[16]) +{ + struct HMACMD5Context ctx; + + hmac_md5_init_limK_to_64(kr, 16, &ctx); + hmac_md5_update(nt_resp, 16, &ctx); + hmac_md5_final((unsigned char *) sess_key, &ctx); +} + +static void +SMBsesskeygen_ntv1(const unsigned char kr[16], + const unsigned char *nt_resp, __u8 sess_key[16]) +{ + mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16); +} +#endif diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h new file mode 100644 index 00000000..7f16cb82 --- /dev/null +++ b/fs/cifs/smberr.h @@ -0,0 +1,184 @@ +/* + * fs/cifs/smberr.h + * + * Copyright (c) International Business Machines Corp., 2002,2004 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * See Error Codes section of the SNIA CIFS Specification + * for more information + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define SUCCESS 0x00 /* The request was successful. */ +#define ERRDOS 0x01 /* Error is from the core DOS operating system set */ +#define ERRSRV 0x02 /* Error is generated by the file server daemon */ +#define ERRHRD 0x03 /* Error is a hardware error. */ +#define ERRCMD 0xFF /* Command was not in the "SMB" format. */ + +/* The following error codes may be generated with the SUCCESS error class.*/ + +/*#define SUCCESS 0 The request was successful. */ + +/* The following error codes may be generated with the ERRDOS error class.*/ + +#define ERRbadfunc 1 /* Invalid function. The server did not + recognize or could not perform a + system call generated by the server, + e.g., set the DIRECTORY attribute on + a data file, invalid seek mode. */ +#define ERRbadfile 2 /* File not found. The last component + of a file's pathname could not be + found. */ +#define ERRbadpath 3 /* Directory invalid. A directory + component in a pathname could not be + found. */ +#define ERRnofids 4 /* Too many open files. The server has + no file handles available. */ +#define ERRnoaccess 5 /* Access denied, the client's context + does not permit the requested + function. This includes the + following conditions: invalid rename + command, write to Fid open for read + only, read on Fid open for write + only, attempt to delete a non-empty + directory */ +#define ERRbadfid 6 /* Invalid file handle. The file handle + specified was not recognized by the + server. 
*/ +#define ERRbadmcb 7 /* Memory control blocks destroyed. */ +#define ERRnomem 8 /* Insufficient server memory to + perform the requested function. */ +#define ERRbadmem 9 /* Invalid memory block address. */ +#define ERRbadenv 10 /* Invalid environment. */ +#define ERRbadformat 11 /* Invalid format. */ +#define ERRbadaccess 12 /* Invalid open mode. */ +#define ERRbaddata 13 /* Invalid data (generated only by + IOCTL calls within the server). */ +#define ERRbaddrive 15 /* Invalid drive specified. */ +#define ERRremcd 16 /* A Delete Directory request attempted + to remove the server's current + directory. */ +#define ERRdiffdevice 17 /* Not same device (e.g., a cross + volume rename was attempted */ +#define ERRnofiles 18 /* A File Search command can find no + more files matching the specified + criteria. */ +#define ERRwriteprot 19 /* media is write protected */ +#define ERRgeneral 31 +#define ERRbadshare 32 /* The sharing mode specified for an + Open conflicts with existing FIDs on + the file. */ +#define ERRlock 33 /* A Lock request conflicted with an + existing lock or specified an + invalid mode, or an Unlock requested + attempted to remove a lock held by + another process. */ +#define ERRunsup 50 +#define ERRnosuchshare 67 +#define ERRfilexists 80 /* The file named in the request + already exists. */ +#define ERRinvparm 87 +#define ERRdiskfull 112 +#define ERRinvname 123 +#define ERRinvlevel 124 +#define ERRdirnotempty 145 +#define ERRnotlocked 158 +#define ERRcancelviolation 173 +#define ERRalreadyexists 183 +#define ERRbadpipe 230 +#define ERRpipebusy 231 +#define ERRpipeclosing 232 +#define ERRnotconnected 233 +#define ERRmoredata 234 +#define ERReasnotsupported 282 +#define ErrQuota 0x200 /* The operation would cause a quota + limit to be exceeded. */ +#define ErrNotALink 0x201 /* A link operation was performed on a + pathname that was not a link. */ + +/* Below errors are used internally (do not come over the wire) for passthrough + from STATUS codes to POSIX only */ +#define ERRsymlink 0xFFFD +#define ErrTooManyLinks 0xFFFE + +/* Following error codes may be generated with the ERRSRV error class.*/ + +#define ERRerror 1 /* Non-specific error code. It is + returned under the following + conditions: resource other than disk + space exhausted (e.g. TIDs), first + SMB command was not negotiate, + multiple negotiates attempted, and + internal server error. */ +#define ERRbadpw 2 /* Bad password - name/password pair in + a TreeConnect or Session Setup are + invalid. */ +#define ERRbadtype 3 /* used for indicating DFS referral + needed */ +#define ERRaccess 4 /* The client does not have the + necessary access rights within the + specified context for requested + function. */ +#define ERRinvtid 5 /* The Tid specified in a command was + invalid. */ +#define ERRinvnetname 6 /* Invalid network name in tree + connect. */ +#define ERRinvdevice 7 /* Invalid device - printer request + made to non-printer connection or + non-printer request made to printer + connection. */ +#define ERRqfull 49 /* Print queue full (files) -- returned + by open print file. */ +#define ERRqtoobig 50 /* Print queue full -- no space. */ +#define ERRqeof 51 /* EOF on print queue dump */ +#define ERRinvpfid 52 /* Invalid print file FID. */ +#define ERRsmbcmd 64 /* The server did not recognize the + command received. */ +#define ERRsrverror 65 /* The server encountered an internal + error, e.g., system file + unavailable. 
*/ +#define ERRbadBID 66 /* (obsolete) */ +#define ERRfilespecs 67 /* The Fid and pathname parameters + contained an invalid combination of + values. */ +#define ERRbadLink 68 /* (obsolete) */ +#define ERRbadpermits 69 /* The access permissions specified for + a file or directory are not a valid + combination. */ +#define ERRbadPID 70 +#define ERRsetattrmode 71 /* attribute (mode) is invalid */ +#define ERRpaused 81 /* Server is paused */ +#define ERRmsgoff 82 /* reserved - messaging off */ +#define ERRnoroom 83 /* reserved - no room for message */ +#define ERRrmuns 87 /* reserved - too many remote names */ +#define ERRtimeout 88 /* operation timed out */ +#define ERRnoresource 89 /* No resources available for request + */ +#define ERRtoomanyuids 90 /* Too many UIDs active on this session + */ +#define ERRbaduid 91 /* The UID is not known as a valid user + */ +#define ERRusempx 250 /* temporarily unable to use raw */ +#define ERRusestd 251 /* temporarily unable to use either raw + or mpx */ +#define ERR_NOTIFY_ENUM_DIR 1024 +#define ERRnoSuchUser 2238 /* user account does not exist */ +#define ERRaccountexpired 2239 +#define ERRbadclient 2240 /* can not logon from this client */ +#define ERRbadLogonTime 2241 /* logon hours do not allow this */ +#define ERRpasswordExpired 2242 +#define ERRnetlogonNotStarted 2455 +#define ERRnosupport 0xFFFF diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h new file mode 100644 index 00000000..7056b891 --- /dev/null +++ b/fs/cifs/smbfsctl.h @@ -0,0 +1,84 @@ +/* + * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions + * + * Copyright (c) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* IOCTL information */ +/* + * List of ioctl/fsctl function codes that are or could be useful in the + * future to remote clients like cifs or SMB2 client. There is probably + * a slightly larger set of fsctls that NTFS local filesystem could handle, + * including the seven below that we do not have struct definitions for. + * Even with protocol definitions for most of these now available, we still + * need to do some experimentation to identify which are practical to do + * remotely. 
Some of the following, such as the encryption/compression ones + * could be invoked from tools via a specialized hook into the VFS rather + * than via the standard vfs entry points + */ +#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000 +#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004 +#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008 +#define FSCTL_LOCK_VOLUME 0x00090018 +#define FSCTL_UNLOCK_VOLUME 0x0009001C +#define FSCTL_IS_PATHNAME_VALID 0x0009002C /* BB add struct */ +#define FSCTL_GET_COMPRESSION 0x0009003C /* BB add struct */ +#define FSCTL_SET_COMPRESSION 0x0009C040 /* BB add struct */ +#define FSCTL_QUERY_FAT_BPB 0x00090058 /* BB add struct */ +/* Verify the next FSCTL number, we had it as 0x00090090 before */ +#define FSCTL_FILESYSTEM_GET_STATS 0x00090060 /* BB add struct */ +#define FSCTL_GET_NTFS_VOLUME_DATA 0x00090064 /* BB add struct */ +#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */ +#define FSCTL_IS_VOLUME_DIRTY 0x00090078 /* BB add struct */ +#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */ +#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C +#define FSCTL_FIND_FILES_BY_SID 0x0009008F /* BB add struct */ +#define FSCTL_SET_OBJECT_ID 0x00090098 /* BB add struct */ +#define FSCTL_GET_OBJECT_ID 0x0009009C /* BB add struct */ +#define FSCTL_DELETE_OBJECT_ID 0x000900A0 /* BB add struct */ +#define FSCTL_SET_REPARSE_POINT 0x000900A4 /* BB add struct */ +#define FSCTL_GET_REPARSE_POINT 0x000900A8 /* BB add struct */ +#define FSCTL_DELETE_REPARSE_POINT 0x000900AC /* BB add struct */ +#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ +#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ +#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ +#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */ +#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ +#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ +#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ +#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */ +#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */ +#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */ +#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */ +#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */ +#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ +#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ +#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ +#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ +#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ +#define FSCTL_SIS_LINK_FILES 0x0009C104 +#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ +#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ +/* strange that the number for this op is not sequential with previous op */ +#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */ +#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ +#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ + +#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 +#define IO_REPARSE_TAG_HSM 0xC0000004 +#define IO_REPARSE_TAG_SIS 0x80000007 diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c new file mode 100644 index 00000000..c1b9c4b1 --- /dev/null +++ b/fs/cifs/transport.c @@ -0,0 +1,931 @@ +/* + * fs/cifs/transport.c + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) 
2006. + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +extern mempool_t *cifs_mid_poolp; + +static void +wake_up_task(struct mid_q_entry *mid) +{ + wake_up_process(mid->callback_data); +} + +struct mid_q_entry * +AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) +{ + struct mid_q_entry *temp; + + if (server == NULL) { + cERROR(1, "Null TCP session in AllocMidQEntry"); + return NULL; + } + + temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); + if (temp == NULL) + return temp; + else { + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = smb_buffer->Mid; /* always LE */ + temp->pid = current->pid; + temp->command = smb_buffer->Command; + cFYI(1, "For smb_command %d", temp->command); + /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ + /* when mid allocated can be before when sent */ + temp->when_alloc = jiffies; + + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. 
+ */ + temp->callback = wake_up_task; + temp->callback_data = current; + } + + atomic_inc(&midCount); + temp->midState = MID_REQUEST_ALLOCATED; + return temp; +} + +void +DeleteMidQEntry(struct mid_q_entry *midEntry) +{ +#ifdef CONFIG_CIFS_STATS2 + unsigned long now; +#endif + midEntry->midState = MID_FREE; + atomic_dec(&midCount); + if (midEntry->largeBuf) + cifs_buf_release(midEntry->resp_buf); + else + cifs_small_buf_release(midEntry->resp_buf); +#ifdef CONFIG_CIFS_STATS2 + now = jiffies; + /* commands taking longer than one second are indications that + something is wrong, unless it is quite a slow link or server */ + if ((now - midEntry->when_alloc) > HZ) { + if ((cifsFYI & CIFS_TIMER) && + (midEntry->command != SMB_COM_LOCKING_ANDX)) { + printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %d", + midEntry->command, midEntry->mid); + printk(" A: 0x%lx S: 0x%lx R: 0x%lx\n", + now - midEntry->when_alloc, + now - midEntry->when_sent, + now - midEntry->when_received); + } + } +#endif + mempool_free(midEntry, cifs_mid_poolp); +} + +static void +delete_mid(struct mid_q_entry *mid) +{ + spin_lock(&GlobalMid_Lock); + list_del(&mid->qhead); + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); +} + +static int +smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) +{ + int rc = 0; + int i = 0; + struct msghdr smb_msg; + struct smb_hdr *smb_buffer = iov[0].iov_base; + unsigned int len = iov[0].iov_len; + unsigned int total_len; + int first_vec = 0; + unsigned int smb_buf_length = be32_to_cpu(smb_buffer->smb_buf_length); + struct socket *ssocket = server->ssocket; + + if (ssocket == NULL) + return -ENOTSOCK; /* BB eventually add reconnect code here */ + + smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; + smb_msg.msg_namelen = sizeof(struct sockaddr); + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + if (server->noblocksnd) + smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; + else + smb_msg.msg_flags = MSG_NOSIGNAL; + + total_len = 0; + for (i = 0; i < n_vec; i++) + total_len += iov[i].iov_len; + + cFYI(1, "Sending smb: total_len %d", total_len); + dump_smb(smb_buffer, len); + + i = 0; + while (total_len) { + rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], + n_vec - first_vec, total_len); + if ((rc == -ENOSPC) || (rc == -EAGAIN)) { + i++; + /* if blocking send we try 3 times, since each can block + for 5 seconds. For nonblocking we have to try more + but wait increasing amounts of time allowing time for + socket to clear. The overall time we wait in either + case to send on the socket is about 15 seconds. + Similarly we wait for 15 seconds for + a response from the server in SendReceive[2] + for the server to send a response back for + most types of requests (except SMB Write + past end of file which can be slow, and + blocking lock operations). NFS waits slightly longer + than CIFS, but this can make it take longer for + nonresponsive servers to be detected and 15 seconds + is more than enough time for modern networks to + send a packet. In most cases if we fail to send + after the retries we will kill the socket and + reconnect which may clear the network problem. 
+ */ + if ((i >= 14) || (!server->noblocksnd && (i > 2))) { + cERROR(1, "sends on sock %p stuck for 15 seconds", + ssocket); + rc = -EAGAIN; + break; + } + msleep(1 << i); + continue; + } + if (rc < 0) + break; + + if (rc == total_len) { + total_len = 0; + break; + } else if (rc > total_len) { + cERROR(1, "sent %d requested %d", rc, total_len); + break; + } + if (rc == 0) { + /* should never happen, letting socket clear before + retrying is our only obvious option here */ + cERROR(1, "tcp sent no data"); + msleep(500); + continue; + } + total_len -= rc; + /* the line below resets i */ + for (i = first_vec; i < n_vec; i++) { + if (iov[i].iov_len) { + if (rc > iov[i].iov_len) { + rc -= iov[i].iov_len; + iov[i].iov_len = 0; + } else { + iov[i].iov_base += rc; + iov[i].iov_len -= rc; + first_vec = i; + break; + } + } + } + i = 0; /* in case we get ENOSPC on the next send */ + } + + if ((total_len > 0) && (total_len != smb_buf_length + 4)) { + cFYI(1, "partial send (%d remaining), terminating session", + total_len); + /* If we have only sent part of an SMB then the next SMB + could be taken as the remainder of this one. We need + to kill the socket so the server throws away the partial + SMB */ + server->tcpStatus = CifsNeedReconnect; + } + + if (rc < 0 && rc != -EINTR) + cERROR(1, "Error %d sending data on socket to server", rc); + else + rc = 0; + + /* Don't want to modify the buffer as a + side effect of this call. */ + smb_buffer->smb_buf_length = cpu_to_be32(smb_buf_length); + + return rc; +} + +int +smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, + unsigned int smb_buf_length) +{ + struct kvec iov; + + iov.iov_base = smb_buffer; + iov.iov_len = smb_buf_length + 4; + + return smb_sendv(server, &iov, 1); +} + +static int wait_for_free_request(struct TCP_Server_Info *server, + const int long_op) +{ + if (long_op == CIFS_ASYNC_OP) { + /* oplock breaks must not be held up */ + atomic_inc(&server->inFlight); + return 0; + } + + spin_lock(&GlobalMid_Lock); + while (1) { + if (atomic_read(&server->inFlight) >= cifs_max_pending) { + spin_unlock(&GlobalMid_Lock); +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&server->num_waiters); +#endif + wait_event(server->request_q, + atomic_read(&server->inFlight) + < cifs_max_pending); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&server->num_waiters); +#endif + spin_lock(&GlobalMid_Lock); + } else { + if (server->tcpStatus == CifsExiting) { + spin_unlock(&GlobalMid_Lock); + return -ENOENT; + } + + /* can not count locking commands against total + as they are allowed to block on server */ + + /* update # of requests on the wire to server */ + if (long_op != CIFS_BLOCKING_OP) + atomic_inc(&server->inFlight); + spin_unlock(&GlobalMid_Lock); + break; + } + } + return 0; +} + +static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, + struct mid_q_entry **ppmidQ) +{ + if (ses->server->tcpStatus == CifsExiting) { + return -ENOENT; + } + + if (ses->server->tcpStatus == CifsNeedReconnect) { + cFYI(1, "tcp session dead - return to caller to retry"); + return -EAGAIN; + } + + if (ses->status != CifsGood) { + /* check if SMB session is bad because we are setting it up */ + if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && + (in_buf->Command != SMB_COM_NEGOTIATE)) + return -EAGAIN; + /* else ok - we are setting up session */ + } + *ppmidQ = AllocMidQEntry(in_buf, ses->server); + if (*ppmidQ == NULL) + return -ENOMEM; + spin_lock(&GlobalMid_Lock); + list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); 
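+ /* the new mid now sits on pending_mid_q; when the matching response
+  * arrives, mid->callback (wake_up_task by default) is invoked and any
+  * waiter blocked in wait_for_response() below is released */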
+ return 0; +} + +static int +wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) +{ + int error; + + error = wait_event_killable(server->response_q, + midQ->midState != MID_REQUEST_SUBMITTED); + if (error < 0) + return -ERESTARTSYS; + + return 0; +} + + +/* + * Send a SMB request and set the callback function in the mid to handle + * the result. Caller is responsible for dealing with timeouts. + */ +int +cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, + unsigned int nvec, mid_callback_t *callback, void *cbdata, + bool ignore_pend) +{ + int rc; + struct mid_q_entry *mid; + struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base; + + rc = wait_for_free_request(server, ignore_pend ? CIFS_ASYNC_OP : 0); + if (rc) + return rc; + + /* enable signing if server requires it */ + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + mutex_lock(&server->srv_mutex); + mid = AllocMidQEntry(hdr, server); + if (mid == NULL) { + mutex_unlock(&server->srv_mutex); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); + return -ENOMEM; + } + + /* put it on the pending_mid_q */ + spin_lock(&GlobalMid_Lock); + list_add_tail(&mid->qhead, &server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + + rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number); + if (rc) { + mutex_unlock(&server->srv_mutex); + goto out_err; + } + + mid->callback = callback; + mid->callback_data = cbdata; + mid->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&server->inSend); +#endif + rc = smb_sendv(server, iov, nvec); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&server->inSend); + mid->when_sent = jiffies; +#endif + mutex_unlock(&server->srv_mutex); + if (rc) + goto out_err; + + return rc; +out_err: + delete_mid(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); + return rc; +} + +/* + * + * Send an SMB Request. No response info (other than return code) + * needs to be parsed. 
+ * + * flags indicate the type of request buffer and how long to wait + * and whether to log NT STATUS code (error) before mapping it to POSIX error + * + */ +int +SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, + struct smb_hdr *in_buf, int flags) +{ + int rc; + struct kvec iov[1]; + int resp_buf_type; + + iov[0].iov_base = (char *)in_buf; + iov[0].iov_len = be32_to_cpu(in_buf->smb_buf_length) + 4; + flags |= CIFS_NO_RESP; + rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); + cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc); + + return rc; +} + +static int +cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) +{ + int rc = 0; + + cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command, + mid->mid, mid->midState); + + spin_lock(&GlobalMid_Lock); + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + spin_unlock(&GlobalMid_Lock); + return rc; + case MID_RETRY_NEEDED: + rc = -EAGAIN; + break; + case MID_RESPONSE_MALFORMED: + rc = -EIO; + break; + case MID_SHUTDOWN: + rc = -EHOSTDOWN; + break; + default: + list_del_init(&mid->qhead); + cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, + mid->mid, mid->midState); + rc = -EIO; + } + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); + return rc; +} + +/* + * An NT cancel request header looks just like the original request except: + * + * The Command is SMB_COM_NT_CANCEL + * The WordCount is zeroed out + * The ByteCount is zeroed out + * + * This function mangles an existing request buffer into a + * SMB_COM_NT_CANCEL request and then sends it. + */ +static int +send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf, + struct mid_q_entry *mid) +{ + int rc = 0; + + /* -4 for RFC1001 length and +2 for BCC field */ + in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2); + in_buf->Command = SMB_COM_NT_CANCEL; + in_buf->WordCount = 0; + put_bcc(0, in_buf); + + mutex_lock(&server->srv_mutex); + rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); + if (rc) { + mutex_unlock(&server->srv_mutex); + return rc; + } + rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); + mutex_unlock(&server->srv_mutex); + + cFYI(1, "issued NT_CANCEL for mid %u, rc = %d", + in_buf->Mid, rc); + + return rc; +} + +int +cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, + bool log_error) +{ + dump_smb(mid->resp_buf, + min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length))); + + /* convert the length into a more usable form */ + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + /* FIXME: add code to kill session */ + if (cifs_verify_signature(mid->resp_buf, server, + mid->sequence_number + 1) != 0) + cERROR(1, "Unexpected SMB signature"); + } + + /* BB special case reconnect tid and uid here? 
*/ + return map_smb_to_linux_error(mid->resp_buf, log_error); +} + +int +SendReceive2(const unsigned int xid, struct cifs_ses *ses, + struct kvec *iov, int n_vec, int *pRespBufType /* ret */, + const int flags) +{ + int rc = 0; + int long_op; + struct mid_q_entry *midQ; + struct smb_hdr *in_buf = iov[0].iov_base; + + long_op = flags & CIFS_TIMEOUT_MASK; + + *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */ + + if ((ses == NULL) || (ses->server == NULL)) { + cifs_small_buf_release(in_buf); + cERROR(1, "Null session"); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) { + cifs_small_buf_release(in_buf); + return -ENOENT; + } + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + rc = wait_for_free_request(ses->server, long_op); + if (rc) { + cifs_small_buf_release(in_buf); + return rc; + } + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + mutex_lock(&ses->server->srv_mutex); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + cifs_small_buf_release(in_buf); + /* Update # of requests on wire to server */ + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + cifs_small_buf_release(in_buf); + goto out; + } + + midQ->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&ses->server->inSend); +#endif + rc = smb_sendv(ses->server, iov, n_vec); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&ses->server->inSend); + midQ->when_sent = jiffies; +#endif + + mutex_unlock(&ses->server->srv_mutex); + + if (rc < 0) { + cifs_small_buf_release(in_buf); + goto out; + } + + if (long_op == CIFS_ASYNC_OP) { + cifs_small_buf_release(in_buf); + goto out; + } + + rc = wait_for_response(ses->server, midQ); + if (rc != 0) { + send_nt_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + cifs_small_buf_release(in_buf); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + cifs_small_buf_release(in_buf); + + rc = cifs_sync_mid_result(midQ, ses->server); + if (rc != 0) { + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + + if (!midQ->resp_buf || midQ->midState != MID_RESPONSE_RECEIVED) { + rc = -EIO; + cFYI(1, "Bad MID state?"); + goto out; + } + + iov[0].iov_base = (char *)midQ->resp_buf; + iov[0].iov_len = be32_to_cpu(midQ->resp_buf->smb_buf_length) + 4; + if (midQ->largeBuf) + *pRespBufType = CIFS_LARGE_BUFFER; + else + *pRespBufType = CIFS_SMALL_BUFFER; + + rc = cifs_check_receive(midQ, ses->server, flags & CIFS_LOG_ERROR); + + /* mark it so buf will not be freed by delete_mid */ + if ((flags & CIFS_NO_RESP) == 0) + midQ->resp_buf = NULL; +out: + delete_mid(midQ); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + + return rc; +} + +int +SendReceive(const unsigned int xid, struct cifs_ses *ses, + struct smb_hdr *in_buf, struct smb_hdr *out_buf, + int *pbytes_returned, const int long_op) +{ + int rc = 0; + struct mid_q_entry *midQ; + + if (ses == NULL) { + cERROR(1, "Null smb session"); + 
return -EIO; + } + if (ses->server == NULL) { + cERROR(1, "Null tcp session"); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) + return -ENOENT; + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "Illegal length, greater than maximum frame, %d", + be32_to_cpu(in_buf->smb_buf_length)); + return -EIO; + } + + rc = wait_for_free_request(ses->server, long_op); + if (rc) + return rc; + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + mutex_lock(&ses->server->srv_mutex); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + /* Update # of requests on wire to server */ + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + + rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + goto out; + } + + midQ->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&ses->server->inSend); +#endif + rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&ses->server->inSend); + midQ->when_sent = jiffies; +#endif + mutex_unlock(&ses->server->srv_mutex); + + if (rc < 0) + goto out; + + if (long_op == CIFS_ASYNC_OP) + goto out; + + rc = wait_for_response(ses->server, midQ); + if (rc != 0) { + send_nt_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + rc = cifs_sync_mid_result(midQ, ses->server); + if (rc != 0) { + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + + if (!midQ->resp_buf || !out_buf || + midQ->midState != MID_RESPONSE_RECEIVED) { + rc = -EIO; + cERROR(1, "Bad MID state?"); + goto out; + } + + *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length); + memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); + rc = cifs_check_receive(midQ, ses->server, 0); +out: + delete_mid(midQ); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + + return rc; +} + +/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows + blocking lock to return. */ + +static int +send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon, + struct smb_hdr *in_buf, + struct smb_hdr *out_buf) +{ + int bytes_returned; + struct cifs_ses *ses = tcon->ses; + LOCK_REQ *pSMB = (LOCK_REQ *)in_buf; + + /* We just modify the current in_buf to change + the type of lock from LOCKING_ANDX_SHARED_LOCK + or LOCKING_ANDX_EXCLUSIVE_LOCK to + LOCKING_ANDX_CANCEL_LOCK. 
*/ + + pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES; + pSMB->Timeout = 0; + pSMB->hdr.Mid = GetNextMid(ses->server); + + return SendReceive(xid, ses, in_buf, out_buf, + &bytes_returned, 0); +} + +int +SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, + struct smb_hdr *in_buf, struct smb_hdr *out_buf, + int *pbytes_returned) +{ + int rc = 0; + int rstart = 0; + struct mid_q_entry *midQ; + struct cifs_ses *ses; + + if (tcon == NULL || tcon->ses == NULL) { + cERROR(1, "Null smb session"); + return -EIO; + } + ses = tcon->ses; + + if (ses->server == NULL) { + cERROR(1, "Null tcp session"); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) + return -ENOENT; + + /* Ensure that we do not send more than 50 overlapping requests + to the same server. We may make this configurable later or + use ses->maxReq */ + + if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "Illegal length, greater than maximum frame, %d", + be32_to_cpu(in_buf->smb_buf_length)); + return -EIO; + } + + rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP); + if (rc) + return rc; + + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + + mutex_lock(&ses->server->srv_mutex); + + rc = allocate_mid(ses, in_buf, &midQ); + if (rc) { + mutex_unlock(&ses->server->srv_mutex); + return rc; + } + + rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); + if (rc) { + delete_mid(midQ); + mutex_unlock(&ses->server->srv_mutex); + return rc; + } + + midQ->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&ses->server->inSend); +#endif + rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&ses->server->inSend); + midQ->when_sent = jiffies; +#endif + mutex_unlock(&ses->server->srv_mutex); + + if (rc < 0) { + delete_mid(midQ); + return rc; + } + + /* Wait for a reply - allow signals to interrupt. */ + rc = wait_event_interruptible(ses->server->response_q, + (!(midQ->midState == MID_REQUEST_SUBMITTED)) || + ((ses->server->tcpStatus != CifsGood) && + (ses->server->tcpStatus != CifsNew))); + + /* Were we interrupted by a signal ? */ + if ((rc == -ERESTARTSYS) && + (midQ->midState == MID_REQUEST_SUBMITTED) && + ((ses->server->tcpStatus == CifsGood) || + (ses->server->tcpStatus == CifsNew))) { + + if (in_buf->Command == SMB_COM_TRANSACTION2) { + /* POSIX lock. We send a NT_CANCEL SMB to cause the + blocking lock to return. */ + rc = send_nt_cancel(ses->server, in_buf, midQ); + if (rc) { + delete_mid(midQ); + return rc; + } + } else { + /* Windows lock. We send a LOCKINGX_CANCEL_LOCK + to cause the blocking lock to return. */ + + rc = send_lock_cancel(xid, tcon, in_buf, out_buf); + + /* If we get -ENOLCK back the lock may have + already been removed. Don't exit in this case. */ + if (rc && rc != -ENOLCK) { + delete_mid(midQ); + return rc; + } + } + + rc = wait_for_response(ses->server, midQ); + if (rc) { + send_nt_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + /* We got the response - restart system call. 
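 * Remember that we had to cancel so that, if the cancelled lock
 * request completes with -EACCES, we can return -ERESTARTSYS below
 * and let the VFS restart the blocking lock.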
*/ + rstart = 1; + } + + rc = cifs_sync_mid_result(midQ, ses->server); + if (rc != 0) + return rc; + + /* rcvd frame is ok */ + if (out_buf == NULL || midQ->midState != MID_RESPONSE_RECEIVED) { + rc = -EIO; + cERROR(1, "Bad MID state?"); + goto out; + } + + *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length); + memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); + rc = cifs_check_receive(midQ, ses->server, 0); +out: + delete_mid(midQ); + if (rstart && rc == -EACCES) + return -ERESTARTSYS; + return rc; +} diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c new file mode 100644 index 00000000..2a22fb29 --- /dev/null +++ b/fs/cifs/xattr.c @@ -0,0 +1,420 @@ +/* + * fs/cifs/xattr.c + * + * Copyright (c) International Business Machines Corp., 2003, 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "cifsfs.h" +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_debug.h" + +#define MAX_EA_VALUE_SIZE 65535 +#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" +#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" +#define CIFS_XATTR_USER_PREFIX "user." +#define CIFS_XATTR_SYSTEM_PREFIX "system." +#define CIFS_XATTR_OS2_PREFIX "os2." +#define CIFS_XATTR_SECURITY_PREFIX "security." +#define CIFS_XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_TRUSTED_PREFIX_LEN 8 +#define XATTR_SECURITY_PREFIX_LEN 9 +/* BB need to add server (Samba e.g) support for security and trusted prefix */ + + + +int cifs_removexattr(struct dentry *direntry, const char *ea_name) +{ + int rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path = NULL; + + if (direntry == NULL) + return -EIO; + if (direntry->d_inode == NULL) + return -EIO; + sb = direntry->d_inode->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto remove_ea_exit; + } + if (ea_name == NULL) { + cFYI(1, "Null xattr names not supported"); + } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) + && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) { + cFYI(1, + "illegal xattr request %s (only user namespace supported)", + ea_name); + /* BB what if no namespace prefix? */ + /* Should we just pass them to server, except for + system and perhaps security prefixes? */ + } else { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto remove_ea_exit; + + ea_name += 5; /* skip past user. 
prefix */ + rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, + (__u16)0, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } +remove_ea_exit: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} + +int cifs_setxattr(struct dentry *direntry, const char *ea_name, + const void *ea_value, size_t value_size, int flags) +{ + int rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path; + struct cifs_ntsd *pacl; + + if (direntry == NULL) + return -EIO; + if (direntry->d_inode == NULL) + return -EIO; + sb = direntry->d_inode->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto set_ea_exit; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + if (value_size > MAX_EA_VALUE_SIZE) { + cFYI(1, "size of EA value too large"); + rc = -EOPNOTSUPP; + goto set_ea_exit; + } + + if (ea_name == NULL) { + cFYI(1, "Null xattr names not supported"); + } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto set_ea_exit; + if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) + cFYI(1, "attempt to set cifs inode metadata"); + + ea_name += 5; /* skip past user. prefix */ + rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, + (__u16)value_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto set_ea_exit; + + ea_name += 4; /* skip past os2. 
prefix */ + rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, + (__u16)value_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { + pacl = kmalloc(value_size, GFP_KERNEL); + if (!pacl) { + cFYI(1, "%s: Can't allocate memory for ACL", + __func__); + rc = -ENOMEM; + } else { +#ifdef CONFIG_CIFS_ACL + memcpy(pacl, ea_value, value_size); + rc = set_cifs_acl(pacl, value_size, + direntry->d_inode, full_path); + if (rc == 0) /* force revalidate of the inode */ + CIFS_I(direntry->d_inode)->time = 0; + kfree(pacl); +#else + cFYI(1, "Set CIFS ACL not supported yet"); +#endif /* CONFIG_CIFS_ACL */ + } + } else { + int temp; + temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)); + if (temp == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, + ea_value, (const int)value_size, + ACL_TYPE_ACCESS, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, "set POSIX ACL rc %d", rc); +#else + cFYI(1, "set POSIX ACL not supported"); +#endif + } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, + ea_value, (const int)value_size, + ACL_TYPE_DEFAULT, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, "set POSIX default ACL rc %d", rc); +#else + cFYI(1, "set default POSIX ACL not supported"); +#endif + } else { + cFYI(1, "illegal xattr request %s (only user namespace" + " supported)", ea_name); + /* BB what if no namespace prefix? */ + /* Should we just pass them to server, except for + system and perhaps security prefixes? */ + } + } + +set_ea_exit: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} + +ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, + void *ea_value, size_t buf_size) +{ + ssize_t rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path; + + if (direntry == NULL) + return -EIO; + if (direntry->d_inode == NULL) + return -EIO; + sb = direntry->d_inode->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto get_ea_exit; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + if (ea_name == NULL) { + cFYI(1, "Null xattr names not supported"); + } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto get_ea_exit; + + if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) { + cFYI(1, "attempt to query cifs inode metadata"); + /* revalidate/getattr then populate from inode */ + } /* BB add else when above is implemented */ + ea_name += 5; /* skip past user. 
prefix */ + rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, + buf_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + goto get_ea_exit; + + ea_name += 4; /* skip past os2. prefix */ + rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, + buf_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, + ea_value, buf_size, ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); +#else + cFYI(1, "Query POSIX ACL not supported yet"); +#endif /* CONFIG_CIFS_POSIX */ + } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { +#ifdef CONFIG_CIFS_POSIX + if (sb->s_flags & MS_POSIXACL) + rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, + ea_value, buf_size, ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); +#else + cFYI(1, "Query POSIX default ACL not supported yet"); +#endif /* CONFIG_CIFS_POSIX */ + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { +#ifdef CONFIG_CIFS_ACL + u32 acllen; + struct cifs_ntsd *pacl; + + pacl = get_cifs_acl(cifs_sb, direntry->d_inode, + full_path, &acllen); + if (IS_ERR(pacl)) { + rc = PTR_ERR(pacl); + cERROR(1, "%s: error %zd getting sec desc", + __func__, rc); + } else { + if (ea_value) { + if (acllen > buf_size) + acllen = -ERANGE; + else + memcpy(ea_value, pacl, acllen); + } + rc = acllen; + kfree(pacl); + } +#else + cFYI(1, "Query CIFS ACL not supported yet"); +#endif /* CONFIG_CIFS_ACL */ + } else if (strncmp(ea_name, + CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { + cFYI(1, "Trusted xattr namespace not supported yet"); + } else if (strncmp(ea_name, + CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { + cFYI(1, "Security xattr namespace not supported yet"); + } else + cFYI(1, + "illegal xattr request %s (only user namespace supported)", + ea_name); + + /* We could add an additional check for streams ie + if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + + if (rc == -EINVAL) + rc = -EOPNOTSUPP; + +get_ea_exit: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} + +ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) +{ + ssize_t rc = -EOPNOTSUPP; +#ifdef CONFIG_CIFS_XATTR + int xid; + struct cifs_sb_info *cifs_sb; + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + struct super_block *sb; + char *full_path; + + if (direntry == NULL) + return -EIO; + if (direntry->d_inode == NULL) + return -EIO; + sb = direntry->d_inode->i_sb; + if (sb == NULL) + return -EIO; + + cifs_sb = CIFS_SB(sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + return -EOPNOTSUPP; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = GetXid(); + + full_path = build_path_from_dentry(direntry); + if (full_path == NULL) { + rc = -ENOMEM; + goto list_ea_exit; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server 
for EAs or streams to + returns as xattrs */ + rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data, + buf_size, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + +list_ea_exit: + kfree(full_path); + FreeXid(xid); + cifs_put_tlink(tlink); +#endif + return rc; +} diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig new file mode 100644 index 00000000..c0e5a7fa --- /dev/null +++ b/fs/coda/Kconfig @@ -0,0 +1,21 @@ +config CODA_FS + tristate "Coda file system support (advanced network fs)" + depends on INET + help + Coda is an advanced network file system, similar to NFS in that it + enables you to mount file systems of a remote server and access them + with regular Unix commands as if they were sitting on your hard + disk. Coda has several advantages over NFS: support for + disconnected operation (e.g. for laptops), read/write server + replication, security model for authentication and encryption, + persistent client caches and write back caching. + + If you say Y here, your Linux box will be able to act as a Coda + *client*. You will need user level code as well, both for the + client and server. Servers are currently user level, i.e. they need + no kernel support. Please read + and check out the Coda + home page . + + To compile the coda client support as a module, choose M here: the + module will be called coda. diff --git a/fs/coda/Makefile b/fs/coda/Makefile new file mode 100644 index 00000000..1bab69a0 --- /dev/null +++ b/fs/coda/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the Linux Coda filesystem routines. +# + +obj-$(CONFIG_CODA_FS) += coda.o + +coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \ + coda_linux.o symlink.o pioctl.o sysctl.o + +# If you want debugging output, please uncomment the following line. + +# ccflags-y := -DDEBUG -DDEBUG_SMB_MALLOC=1 diff --git a/fs/coda/cache.c b/fs/coda/cache.c new file mode 100644 index 00000000..69015787 --- /dev/null +++ b/fs/coda/cache.c @@ -0,0 +1,122 @@ +/* + * Cache operations for Coda. + * For Linux 2.1: (C) 1997 Carnegie Mellon University + * For Linux 2.3: (C) 2000 Carnegie Mellon University + * + * Carnegie Mellon encourages users of this code to contribute improvements + * to the Coda project http://www.coda.cs.cmu.edu/ . 
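 *
 * This file implements a small per-inode cache of permission checks:
 * an entry is only valid for the fsuid that created it and only while
 * the global permission_epoch below is unchanged, so every cached
 * entry can be invalidated at once by bumping the epoch.  The typical
 * caller pattern (see coda_permission() in dir.c) is roughly:
 *
 *	if (coda_cache_check(inode, mask))
 *		return 0;
 *	error = venus_access(inode->i_sb, coda_i2f(inode), mask);
 *	if (!error)
 *		coda_cache_enter(inode, mask);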
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +static atomic_t permission_epoch = ATOMIC_INIT(0); + +/* replace or extend an acl cache hit */ +void coda_cache_enter(struct inode *inode, int mask) +{ + struct coda_inode_info *cii = ITOC(inode); + + spin_lock(&cii->c_lock); + cii->c_cached_epoch = atomic_read(&permission_epoch); + if (cii->c_uid != current_fsuid()) { + cii->c_uid = current_fsuid(); + cii->c_cached_perm = mask; + } else + cii->c_cached_perm |= mask; + spin_unlock(&cii->c_lock); +} + +/* remove cached acl from an inode */ +void coda_cache_clear_inode(struct inode *inode) +{ + struct coda_inode_info *cii = ITOC(inode); + spin_lock(&cii->c_lock); + cii->c_cached_epoch = atomic_read(&permission_epoch) - 1; + spin_unlock(&cii->c_lock); +} + +/* remove all acl caches */ +void coda_cache_clear_all(struct super_block *sb) +{ + atomic_inc(&permission_epoch); +} + + +/* check if the mask has been matched against the acl already */ +int coda_cache_check(struct inode *inode, int mask) +{ + struct coda_inode_info *cii = ITOC(inode); + int hit; + + spin_lock(&cii->c_lock); + hit = (mask & cii->c_cached_perm) == mask && + cii->c_uid == current_fsuid() && + cii->c_cached_epoch == atomic_read(&permission_epoch); + spin_unlock(&cii->c_lock); + + return hit; +} + + +/* Purging dentries and children */ +/* The following routines drop dentries which are not + in use and flag dentries which are in use to be + zapped later. + + The flags are detected by: + - coda_dentry_revalidate (for lookups) if the flag is C_PURGE + - coda_dentry_delete: to remove dentry from the cache when d_count + falls to zero + - an inode method coda_revalidate (for attributes) if the + flag is C_VATTR +*/ + +/* this won't do any harm: just flag all children */ +static void coda_flag_children(struct dentry *parent, int flag) +{ + struct list_head *child; + struct dentry *de; + + spin_lock(&parent->d_lock); + list_for_each(child, &parent->d_subdirs) + { + de = list_entry(child, struct dentry, d_u.d_child); + /* don't know what to do with negative dentries */ + if ( ! 
de->d_inode ) + continue; + coda_flag_inode(de->d_inode, flag); + } + spin_unlock(&parent->d_lock); + return; +} + +void coda_flag_inode_children(struct inode *inode, int flag) +{ + struct dentry *alias_de; + + if ( !inode || !S_ISDIR(inode->i_mode)) + return; + + alias_de = d_find_alias(inode); + if (!alias_de) + return; + coda_flag_children(alias_de, flag); + shrink_dcache_parent(alias_de); + dput(alias_de); +} + diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c new file mode 100644 index 00000000..6475877b --- /dev/null +++ b/fs/coda/cnode.c @@ -0,0 +1,174 @@ +/* cnode related routines for the coda kernel code + (C) 1996 Peter Braam + */ + +#include +#include +#include + +#include +#include +#include "coda_linux.h" + +static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) +{ + return memcmp(fid1, fid2, sizeof(*fid1)) == 0; +} + +static const struct inode_operations coda_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = coda_setattr, +}; + +/* cnode.c */ +static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr) +{ + coda_vattr_to_iattr(inode, attr); + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &coda_file_inode_operations; + inode->i_fop = &coda_file_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &coda_dir_inode_operations; + inode->i_fop = &coda_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &coda_symlink_inode_operations; + inode->i_data.a_ops = &coda_symlink_aops; + inode->i_mapping = &inode->i_data; + } else + init_special_inode(inode, inode->i_mode, huge_decode_dev(attr->va_rdev)); +} + +static int coda_test_inode(struct inode *inode, void *data) +{ + struct CodaFid *fid = (struct CodaFid *)data; + struct coda_inode_info *cii = ITOC(inode); + return coda_fideq(&cii->c_fid, fid); +} + +static int coda_set_inode(struct inode *inode, void *data) +{ + struct CodaFid *fid = (struct CodaFid *)data; + struct coda_inode_info *cii = ITOC(inode); + cii->c_fid = *fid; + return 0; +} + +struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid, + struct coda_vattr * attr) +{ + struct inode *inode; + struct coda_inode_info *cii; + unsigned long hash = coda_f2i(fid); + + inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + cii = ITOC(inode); + /* we still need to set i_ino for things like stat(2) */ + inode->i_ino = hash; + /* inode is locked and unique, no need to grab cii->c_lock */ + cii->c_mapcount = 0; + unlock_new_inode(inode); + } + + /* always replace the attributes, type might have changed */ + coda_fill_inode(inode, attr); + return inode; +} + +/* this is effectively coda_iget: + - get attributes (might be cached) + - get the inode for the fid using vfs iget + - link the two up if this is needed + - fill in the attributes +*/ +int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb) +{ + struct coda_vattr attr; + int error; + + /* We get inode numbers from Venus -- see venus source */ + error = venus_getattr(sb, fid, &attr); + if ( error ) { + *inode = NULL; + return error; + } + + *inode = coda_iget(sb, fid, &attr); + if ( IS_ERR(*inode) ) { + printk("coda_cnode_make: coda_iget failed\n"); + return PTR_ERR(*inode); + } + return 0; +} + + +/* Although we treat Coda file identifiers as immutable, there is one + * special case for files created during a disconnection where they may 
+ * not be globally unique. When an identifier collision is detected we + * first try to flush the cached inode from the kernel and finally + * resort to renaming/rehashing in-place. Userspace remembers both old + * and new values of the identifier to handle any in-flight upcalls. + * The real solution is to use globally unique UUIDs as identifiers, but + * retrofitting the existing userspace code for this is non-trivial. */ +void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, + struct CodaFid *newfid) +{ + struct coda_inode_info *cii = ITOC(inode); + unsigned long hash = coda_f2i(newfid); + + BUG_ON(!coda_fideq(&cii->c_fid, oldfid)); + + /* replace fid and rehash inode */ + /* XXX we probably need to hold some lock here! */ + remove_inode_hash(inode); + cii->c_fid = *newfid; + inode->i_ino = hash; + __insert_inode_hash(inode, hash); +} + +/* convert a fid to an inode. */ +struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) +{ + struct inode *inode; + unsigned long hash = coda_f2i(fid); + + if ( !sb ) { + printk("coda_fid_to_inode: no sb!\n"); + return NULL; + } + + inode = ilookup5(sb, hash, coda_test_inode, fid); + if ( !inode ) + return NULL; + + /* we should never see newly created inodes because we intentionally + * fail in the initialization callback */ + BUG_ON(inode->i_state & I_NEW); + + return inode; +} + +/* the CONTROL inode is made without asking attributes from Venus */ +int coda_cnode_makectl(struct inode **inode, struct super_block *sb) +{ + int error = -ENOMEM; + + *inode = new_inode(sb); + if (*inode) { + (*inode)->i_ino = CTL_INO; + (*inode)->i_op = &coda_ioctl_inode_operations; + (*inode)->i_fop = &coda_ioctl_operations; + (*inode)->i_mode = 0444; + error = 0; + } + + return error; +} + diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h new file mode 100644 index 00000000..c910b5eb --- /dev/null +++ b/fs/coda/coda_cache.h @@ -0,0 +1,22 @@ +/* Coda filesystem -- Linux Minicache + * + * Copyright (C) 1989 - 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. Contact Peter Braam + * + */ + +#ifndef _CFSNC_HEADER_ +#define _CFSNC_HEADER_ + +/* credential cache */ +void coda_cache_enter(struct inode *inode, int mask); +void coda_cache_clear_inode(struct inode *); +void coda_cache_clear_all(struct super_block *sb); +int coda_cache_check(struct inode *inode, int mask); + +/* for downcalls and attributes and lookups */ +void coda_flag_inode_children(struct inode *inode, int flag); + +#endif /* _CFSNC_HEADER_ */ diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h new file mode 100644 index 00000000..e35071b1 --- /dev/null +++ b/fs/coda/coda_fs_i.h @@ -0,0 +1,58 @@ +/* + * coda_fs_i.h + * + * Copyright (C) 1998 Carnegie Mellon University + * + */ + +#ifndef _LINUX_CODA_FS_I +#define _LINUX_CODA_FS_I + +#include +#include +#include +#include + +/* + * coda fs inode data + * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and + * c_cached_perm. + * vfs_inode is set only when the inode is created and never changes. + * c_fid is set when the inode is created and should be considered immutable. 
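 * The one exception is coda_replace_fid(), which rewrites c_fid in
 * place when a fid created during disconnected operation turns out to
 * collide with another one.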
+ */ +struct coda_inode_info { + struct CodaFid c_fid; /* Coda identifier */ + u_short c_flags; /* flags (see below) */ + unsigned int c_mapcount; /* nr of times this inode is mapped */ + unsigned int c_cached_epoch; /* epoch for cached permissions */ + vuid_t c_uid; /* fsuid for cached permissions */ + unsigned int c_cached_perm; /* cached access permissions */ + spinlock_t c_lock; + struct inode vfs_inode; +}; + +/* + * coda fs file private data + */ +#define CODA_MAGIC 0xC0DAC0DA +struct coda_file_info { + int cfi_magic; /* magic number */ + struct file *cfi_container; /* container file for this cnode */ + unsigned int cfi_mapcount; /* nr of times this file is mapped */ +}; + +#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data)) + +/* flags */ +#define C_VATTR 0x1 /* Validity of vattr in inode */ +#define C_FLUSH 0x2 /* used after a flush */ +#define C_DYING 0x4 /* from venus (which died) */ +#define C_PURGE 0x8 + +int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); +struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); +int coda_cnode_makectl(struct inode **inode, struct super_block *sb); +struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); +void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); + +#endif diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h new file mode 100644 index 00000000..6b443ff4 --- /dev/null +++ b/fs/coda/coda_int.h @@ -0,0 +1,20 @@ +#ifndef _CODA_INT_ +#define _CODA_INT_ + +struct dentry; +struct file; + +extern struct file_system_type coda_fs_type; +extern unsigned long coda_timeout; +extern int coda_hard; +extern int coda_fake_statfs; + +void coda_destroy_inodecache(void); +int coda_init_inodecache(void); +int coda_fsync(struct file *coda_file, int datasync); +void coda_sysctl_init(void); +void coda_sysctl_clean(void); + +#endif /* _CODA_INT_ */ + + diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c new file mode 100644 index 00000000..2bdbcc11 --- /dev/null +++ b/fs/coda/coda_linux.c @@ -0,0 +1,192 @@ +/* + * Inode operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). 
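 *
 * This file collects small helpers used throughout the module: fid
 * formatting, open(2) flag conversion and attribute translation
 * between the Linux and Coda representations.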
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" + +/* initialize the debugging variables */ +int coda_fake_statfs; + +/* print a fid */ +char * coda_f2s(struct CodaFid *f) +{ + static char s[60]; + + sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]); + + return s; +} + +/* recognize special .CONTROL name */ +int coda_iscontrol(const char *name, size_t length) +{ + return ((CODA_CONTROLLEN == length) && + (strncmp(name, CODA_CONTROL, CODA_CONTROLLEN) == 0)); +} + +/* recognize /coda inode */ +int coda_isroot(struct inode *i) +{ + return ( i->i_sb->s_root->d_inode == i ); +} + +unsigned short coda_flags_to_cflags(unsigned short flags) +{ + unsigned short coda_flags = 0; + + if ((flags & O_ACCMODE) == O_RDONLY) + coda_flags |= C_O_READ; + + if ((flags & O_ACCMODE) == O_RDWR) + coda_flags |= C_O_READ | C_O_WRITE; + + if ((flags & O_ACCMODE) == O_WRONLY) + coda_flags |= C_O_WRITE; + + if (flags & O_TRUNC) + coda_flags |= C_O_TRUNC; + + if (flags & O_CREAT) + coda_flags |= C_O_CREAT; + + if (flags & O_EXCL) + coda_flags |= C_O_EXCL; + + return coda_flags; +} + + +/* utility functions below */ +void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) +{ + int inode_type; + /* inode's i_flags, i_ino are set by iget + XXX: is this all we need ?? + */ + switch (attr->va_type) { + case C_VNON: + inode_type = 0; + break; + case C_VREG: + inode_type = S_IFREG; + break; + case C_VDIR: + inode_type = S_IFDIR; + break; + case C_VLNK: + inode_type = S_IFLNK; + break; + default: + inode_type = 0; + } + inode->i_mode |= inode_type; + + if (attr->va_mode != (u_short) -1) + inode->i_mode = attr->va_mode | inode_type; + if (attr->va_uid != -1) + inode->i_uid = (uid_t) attr->va_uid; + if (attr->va_gid != -1) + inode->i_gid = (gid_t) attr->va_gid; + if (attr->va_nlink != -1) + inode->i_nlink = attr->va_nlink; + if (attr->va_size != -1) + inode->i_size = attr->va_size; + if (attr->va_size != -1) + inode->i_blocks = (attr->va_size + 511) >> 9; + if (attr->va_atime.tv_sec != -1) + inode->i_atime = attr->va_atime; + if (attr->va_mtime.tv_sec != -1) + inode->i_mtime = attr->va_mtime; + if (attr->va_ctime.tv_sec != -1) + inode->i_ctime = attr->va_ctime; +} + + +/* + * BSD sets attributes that need not be modified to -1. + * Linux uses the valid field to indicate what should be + * looked at. The BSD type field needs to be deduced from linux + * mode. + * So we have to do some translations here. 
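 *
 * For example, when ia_valid contains only ATTR_MODE, just va_mode is
 * filled in below; every other field keeps its -1 "unchanged" marker
 * and Venus leaves the corresponding attribute alone.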
+ */ + +void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) +{ + unsigned int valid; + + /* clean out */ + vattr->va_mode = -1; + vattr->va_uid = (vuid_t) -1; + vattr->va_gid = (vgid_t) -1; + vattr->va_size = (off_t) -1; + vattr->va_atime.tv_sec = (time_t) -1; + vattr->va_atime.tv_nsec = (time_t) -1; + vattr->va_mtime.tv_sec = (time_t) -1; + vattr->va_mtime.tv_nsec = (time_t) -1; + vattr->va_ctime.tv_sec = (time_t) -1; + vattr->va_ctime.tv_nsec = (time_t) -1; + vattr->va_type = C_VNON; + vattr->va_fileid = -1; + vattr->va_gen = -1; + vattr->va_bytes = -1; + vattr->va_nlink = -1; + vattr->va_blocksize = -1; + vattr->va_rdev = -1; + vattr->va_flags = 0; + + /* determine the type */ +#if 0 + mode = iattr->ia_mode; + if ( S_ISDIR(mode) ) { + vattr->va_type = C_VDIR; + } else if ( S_ISREG(mode) ) { + vattr->va_type = C_VREG; + } else if ( S_ISLNK(mode) ) { + vattr->va_type = C_VLNK; + } else { + /* don't do others */ + vattr->va_type = C_VNON; + } +#endif + + /* set those vattrs that need change */ + valid = iattr->ia_valid; + if ( valid & ATTR_MODE ) { + vattr->va_mode = iattr->ia_mode; + } + if ( valid & ATTR_UID ) { + vattr->va_uid = (vuid_t) iattr->ia_uid; + } + if ( valid & ATTR_GID ) { + vattr->va_gid = (vgid_t) iattr->ia_gid; + } + if ( valid & ATTR_SIZE ) { + vattr->va_size = iattr->ia_size; + } + if ( valid & ATTR_ATIME ) { + vattr->va_atime = iattr->ia_atime; + } + if ( valid & ATTR_MTIME ) { + vattr->va_mtime = iattr->ia_mtime; + } + if ( valid & ATTR_CTIME ) { + vattr->va_ctime = iattr->ia_ctime; + } +} + diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h new file mode 100644 index 00000000..9b0c5323 --- /dev/null +++ b/fs/coda/coda_linux.h @@ -0,0 +1,101 @@ +/* + * Coda File System, Linux Kernel module + * + * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University + * Linux modifications (C) 1996, Peter J. Braam + * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. 
+ */ + +#ifndef _LINUX_CODA_FS +#define _LINUX_CODA_FS + +#include +#include +#include +#include +#include +#include +#include +#include +#include "coda_fs_i.h" + +/* operations */ +extern const struct inode_operations coda_dir_inode_operations; +extern const struct inode_operations coda_file_inode_operations; +extern const struct inode_operations coda_ioctl_inode_operations; + +extern const struct dentry_operations coda_dentry_operations; + +extern const struct address_space_operations coda_file_aops; +extern const struct address_space_operations coda_symlink_aops; + +extern const struct file_operations coda_dir_operations; +extern const struct file_operations coda_file_operations; +extern const struct file_operations coda_ioctl_operations; + +/* operations shared over more than one file */ +int coda_open(struct inode *i, struct file *f); +int coda_release(struct inode *i, struct file *f); +int coda_permission(struct inode *inode, int mask, unsigned int flags); +int coda_revalidate_inode(struct dentry *); +int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int coda_setattr(struct dentry *, struct iattr *); + +/* this file: heloers */ +char *coda_f2s(struct CodaFid *f); +int coda_isroot(struct inode *i); +int coda_iscontrol(const char *name, size_t length); + +void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); +void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); +unsigned short coda_flags_to_cflags(unsigned short); + +/* sysctl.h */ +void coda_sysctl_init(void); +void coda_sysctl_clean(void); + +#define CODA_ALLOC(ptr, cast, size) do { \ + if (size < PAGE_SIZE) \ + ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ + else \ + ptr = (cast)vmalloc((unsigned long) size); \ + if (!ptr) \ + printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ + else memset( ptr, 0, size ); \ +} while (0) + + +#define CODA_FREE(ptr,size) \ + do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) + +/* inode to cnode access functions */ + +static inline struct coda_inode_info *ITOC(struct inode *inode) +{ + return list_entry(inode, struct coda_inode_info, vfs_inode); +} + +static __inline__ struct CodaFid *coda_i2f(struct inode *inode) +{ + return &(ITOC(inode)->c_fid); +} + +static __inline__ char *coda_i2s(struct inode *inode) +{ + return coda_f2s(&(ITOC(inode)->c_fid)); +} + +/* this will not zap the inode away */ +static __inline__ void coda_flag_inode(struct inode *inode, int flag) +{ + struct coda_inode_info *cii = ITOC(inode); + + spin_lock(&cii->c_lock); + cii->c_flags |= flag; + spin_unlock(&cii->c_lock); +} + +#endif diff --git a/fs/coda/dir.c b/fs/coda/dir.c new file mode 100644 index 00000000..2b8dae4d --- /dev/null +++ b/fs/coda/dir.c @@ -0,0 +1,650 @@ + +/* + * Directory operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +#include "coda_int.h" + +/* dir inode-ops */ +static int coda_create(struct inode *dir, struct dentry *new, int mode, struct nameidata *nd); +static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd); +static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, + struct dentry *entry); +static int coda_unlink(struct inode *dir_inode, struct dentry *entry); +static int coda_symlink(struct inode *dir_inode, struct dentry *entry, + const char *symname); +static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, int mode); +static int coda_rmdir(struct inode *dir_inode, struct dentry *entry); +static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, + struct inode *new_inode, struct dentry *new_dentry); + +/* dir file-ops */ +static int coda_readdir(struct file *file, void *buf, filldir_t filldir); + +/* dentry ops */ +static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); +static int coda_dentry_delete(const struct dentry *); + +/* support routines */ +static int coda_venus_readdir(struct file *coda_file, void *buf, + filldir_t filldir); + +/* same as fs/bad_inode.c */ +static int coda_return_EIO(void) +{ + return -EIO; +} +#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) + +const struct dentry_operations coda_dentry_operations = +{ + .d_revalidate = coda_dentry_revalidate, + .d_delete = coda_dentry_delete, +}; + +const struct inode_operations coda_dir_inode_operations = +{ + .create = coda_create, + .lookup = coda_lookup, + .link = coda_link, + .unlink = coda_unlink, + .symlink = coda_symlink, + .mkdir = coda_mkdir, + .rmdir = coda_rmdir, + .mknod = CODA_EIO_ERROR, + .rename = coda_rename, + .permission = coda_permission, + .getattr = coda_getattr, + .setattr = coda_setattr, +}; + +const struct file_operations coda_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = coda_readdir, + .open = coda_open, + .release = coda_release, + .fsync = coda_fsync, +}; + + +/* inode operations for directories */ +/* access routines: lookup, readlink, permission */ +static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd) +{ + struct inode *inode = NULL; + struct CodaFid resfid = { { 0, } }; + int type = 0; + int error = 0; + const char *name = entry->d_name.name; + size_t length = entry->d_name.len; + + if (length > CODA_MAXNAMLEN) { + printk(KERN_ERR "name too long: lookup, %s (%*s)\n", + coda_i2s(dir), (int)length, name); + return ERR_PTR(-ENAMETOOLONG); + } + + /* control object, create inode on the fly */ + if (coda_isroot(dir) && coda_iscontrol(name, length)) { + error = coda_cnode_makectl(&inode, dir->i_sb); + type = CODA_NOCACHE; + goto exit; + } + + error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, + &type, &resfid); + if (!error) + error = coda_cnode_make(&inode, &resfid, dir->i_sb); + + if (error && error != -ENOENT) + return ERR_PTR(error); + +exit: + if (inode && (type & CODA_NOCACHE)) + coda_flag_inode(inode, C_VATTR | C_PURGE); + + return d_splice_alias(inode, entry); +} + + +int coda_permission(struct inode *inode, int mask, unsigned int flags) +{ + int error; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + + if (!mask) + return 0; + + if ((mask & 
MAY_EXEC) && !execute_ok(inode)) + return -EACCES; + + if (coda_cache_check(inode, mask)) + return 0; + + error = venus_access(inode->i_sb, coda_i2f(inode), mask); + + if (!error) + coda_cache_enter(inode, mask); + + return error; +} + + +static inline void coda_dir_update_mtime(struct inode *dir) +{ +#ifdef REQUERY_VENUS_FOR_MTIME + /* invalidate the directory cnode's attributes so we refetch the + * attributes from venus next time the inode is referenced */ + coda_flag_inode(dir, C_VATTR); +#else + /* optimistically we can also act as if our nose bleeds. The + * granularity of the mtime is coarse anyways so we might actually be + * right most of the time. Note: we only do this for directories. */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; +#endif +} + +/* we have to wrap inc_nlink/drop_nlink because sometimes userspace uses a + * trick to fool GNU find's optimizations. If we can't be sure of the link + * (because of volume mount points) we set i_nlink to 1 which forces find + * to consider every child as a possible directory. We should also never + * see an increment or decrement for deleted directories where i_nlink == 0 */ +static inline void coda_dir_inc_nlink(struct inode *dir) +{ + if (dir->i_nlink >= 2) + inc_nlink(dir); +} + +static inline void coda_dir_drop_nlink(struct inode *dir) +{ + if (dir->i_nlink > 2) + drop_nlink(dir); +} + +/* creation routines: create, mknod, mkdir, link, symlink */ +static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) +{ + int error; + const char *name=de->d_name.name; + int length=de->d_name.len; + struct inode *inode; + struct CodaFid newfid; + struct coda_vattr attrs; + + if (coda_isroot(dir) && coda_iscontrol(name, length)) + return -EPERM; + + error = venus_create(dir->i_sb, coda_i2f(dir), name, length, + 0, mode, &newfid, &attrs); + if (error) + goto err_out; + + inode = coda_iget(dir->i_sb, &newfid, &attrs); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; + } + + /* invalidate the directory cnode's attributes */ + coda_dir_update_mtime(dir); + d_instantiate(de, inode); + return 0; +err_out: + d_drop(de); + return error; +} + +static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) +{ + struct inode *inode; + struct coda_vattr attrs; + const char *name = de->d_name.name; + int len = de->d_name.len; + int error; + struct CodaFid newfid; + + if (coda_isroot(dir) && coda_iscontrol(name, len)) + return -EPERM; + + attrs.va_mode = mode; + error = venus_mkdir(dir->i_sb, coda_i2f(dir), + name, len, &newfid, &attrs); + if (error) + goto err_out; + + inode = coda_iget(dir->i_sb, &newfid, &attrs); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; + } + + /* invalidate the directory cnode's attributes */ + coda_dir_inc_nlink(dir); + coda_dir_update_mtime(dir); + d_instantiate(de, inode); + return 0; +err_out: + d_drop(de); + return error; +} + +/* try to make de an entry in dir_inodde linked to source_de */ +static int coda_link(struct dentry *source_de, struct inode *dir_inode, + struct dentry *de) +{ + struct inode *inode = source_de->d_inode; + const char * name = de->d_name.name; + int len = de->d_name.len; + int error; + + if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) + return -EPERM; + + error = venus_link(dir_inode->i_sb, coda_i2f(inode), + coda_i2f(dir_inode), (const char *)name, len); + if (error) { + d_drop(de); + return error; + } + + coda_dir_update_mtime(dir_inode); + ihold(inode); + d_instantiate(de, inode); + inc_nlink(inode); + return 0; 
+} + + +static int coda_symlink(struct inode *dir_inode, struct dentry *de, + const char *symname) +{ + const char *name = de->d_name.name; + int len = de->d_name.len; + int symlen; + int error; + + if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) + return -EPERM; + + symlen = strlen(symname); + if (symlen > CODA_MAXPATHLEN) + return -ENAMETOOLONG; + + /* + * This entry is now negative. Since we do not create + * an inode for the entry we have to drop it. + */ + d_drop(de); + error = venus_symlink(dir_inode->i_sb, coda_i2f(dir_inode), name, len, + symname, symlen); + + /* mtime is no good anymore */ + if (!error) + coda_dir_update_mtime(dir_inode); + + return error; +} + +/* destruction routines: unlink, rmdir */ +static int coda_unlink(struct inode *dir, struct dentry *de) +{ + int error; + const char *name = de->d_name.name; + int len = de->d_name.len; + + error = venus_remove(dir->i_sb, coda_i2f(dir), name, len); + if (error) + return error; + + coda_dir_update_mtime(dir); + drop_nlink(de->d_inode); + return 0; +} + +static int coda_rmdir(struct inode *dir, struct dentry *de) +{ + const char *name = de->d_name.name; + int len = de->d_name.len; + int error; + + error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); + if (!error) { + /* VFS may delete the child */ + if (de->d_inode) + de->d_inode->i_nlink = 0; + + /* fix the link count of the parent */ + coda_dir_drop_nlink(dir); + coda_dir_update_mtime(dir); + } + return error; +} + +/* rename */ +static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + const char *old_name = old_dentry->d_name.name; + const char *new_name = new_dentry->d_name.name; + int old_length = old_dentry->d_name.len; + int new_length = new_dentry->d_name.len; + int error; + + error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), + coda_i2f(new_dir), old_length, new_length, + (const char *) old_name, (const char *)new_name); + if (!error) { + if (new_dentry->d_inode) { + if (S_ISDIR(new_dentry->d_inode->i_mode)) { + coda_dir_drop_nlink(old_dir); + coda_dir_inc_nlink(new_dir); + } + coda_dir_update_mtime(old_dir); + coda_dir_update_mtime(new_dir); + coda_flag_inode(new_dentry->d_inode, C_VATTR); + } else { + coda_flag_inode(old_dir, C_VATTR); + coda_flag_inode(new_dir, C_VATTR); + } + } + return error; +} + + +/* file operations for directories */ +static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) +{ + struct coda_file_info *cfi; + struct file *host_file; + int ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (!host_file->f_op) + return -ENOTDIR; + + if (host_file->f_op->readdir) + { + /* potemkin case: we were handed a directory inode. + * We can't use vfs_readdir because we have to keep the file + * position in sync between the coda_file and the host_file. + * and as such we need grab the inode mutex. 
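 * When the container file has no readdir operation we take the else
 * branch below and parse Venus's own dirent records straight out of
 * it instead.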
*/ + struct inode *host_inode = host_file->f_path.dentry->d_inode; + + mutex_lock(&host_inode->i_mutex); + host_file->f_pos = coda_file->f_pos; + + ret = -ENOENT; + if (!IS_DEADDIR(host_inode)) { + ret = host_file->f_op->readdir(host_file, buf, filldir); + file_accessed(host_file); + } + + coda_file->f_pos = host_file->f_pos; + mutex_unlock(&host_inode->i_mutex); + } + else /* Venus: we must read Venus dirents from a file */ + ret = coda_venus_readdir(coda_file, buf, filldir); + + return ret; +} + +static inline unsigned int CDT2DT(unsigned char cdt) +{ + unsigned int dt; + + switch(cdt) { + case CDT_UNKNOWN: dt = DT_UNKNOWN; break; + case CDT_FIFO: dt = DT_FIFO; break; + case CDT_CHR: dt = DT_CHR; break; + case CDT_DIR: dt = DT_DIR; break; + case CDT_BLK: dt = DT_BLK; break; + case CDT_REG: dt = DT_REG; break; + case CDT_LNK: dt = DT_LNK; break; + case CDT_SOCK: dt = DT_SOCK; break; + case CDT_WHT: dt = DT_WHT; break; + default: dt = DT_UNKNOWN; break; + } + return dt; +} + +/* support routines */ +static int coda_venus_readdir(struct file *coda_file, void *buf, + filldir_t filldir) +{ + int result = 0; /* # of entries returned */ + struct coda_file_info *cfi; + struct coda_inode_info *cii; + struct file *host_file; + struct dentry *de; + struct venus_dirent *vdir; + unsigned long vdir_size = + (unsigned long)(&((struct venus_dirent *)0)->d_name); + unsigned int type; + struct qstr name; + ino_t ino; + int ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + de = coda_file->f_path.dentry; + cii = ITOC(de->d_inode); + + vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); + if (!vdir) return -ENOMEM; + + if (coda_file->f_pos == 0) { + ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR); + if (ret < 0) + goto out; + result++; + coda_file->f_pos++; + } + if (coda_file->f_pos == 1) { + ret = filldir(buf, "..", 2, 1, de->d_parent->d_inode->i_ino, DT_DIR); + if (ret < 0) + goto out; + result++; + coda_file->f_pos++; + } + while (1) { + /* read entries from the directory file */ + ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir, + sizeof(*vdir)); + if (ret < 0) { + printk(KERN_ERR "coda readdir: read dir %s failed %d\n", + coda_f2s(&cii->c_fid), ret); + break; + } + if (ret == 0) break; /* end of directory file reached */ + + /* catch truncated reads */ + if (ret < vdir_size || ret < vdir_size + vdir->d_namlen) { + printk(KERN_ERR "coda readdir: short read on %s\n", + coda_f2s(&cii->c_fid)); + ret = -EBADF; + break; + } + /* validate whether the directory file actually makes sense */ + if (vdir->d_reclen < vdir_size + vdir->d_namlen) { + printk(KERN_ERR "coda readdir: invalid dir %s\n", + coda_f2s(&cii->c_fid)); + ret = -EBADF; + break; + } + + name.len = vdir->d_namlen; + name.name = vdir->d_name; + + /* Make sure we skip '.' and '..', we already got those */ + if (name.name[0] == '.' && (name.len == 1 || + (vdir->d_name[1] == '.' && name.len == 2))) + vdir->d_fileno = name.len = 0; + + /* skip null entries */ + if (vdir->d_fileno && name.len) { + /* try to look up this entry in the dcache, that way + * userspace doesn't have to worry about breaking + * getcwd by having mismatched inode numbers for + * internal volume mountpoints. 
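 * If the name is not in the dcache we simply fall back to the file
 * number Venus stored in the dirent.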
*/ + ino = find_inode_number(de, &name); + if (!ino) ino = vdir->d_fileno; + + type = CDT2DT(vdir->d_type); + ret = filldir(buf, name.name, name.len, + coda_file->f_pos, ino, type); + /* failure means no space for filling in this round */ + if (ret < 0) break; + result++; + } + /* we'll always have progress because d_reclen is unsigned and + * we've already established it is non-zero. */ + coda_file->f_pos += vdir->d_reclen; + } +out: + kfree(vdir); + return result ? result : ret; +} + +/* called when a cache lookup succeeds */ +static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) +{ + struct inode *inode; + struct coda_inode_info *cii; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = de->d_inode; + if (!inode || coda_isroot(inode)) + goto out; + if (is_bad_inode(inode)) + goto bad; + + cii = ITOC(de->d_inode); + if (!(cii->c_flags & (C_PURGE | C_FLUSH))) + goto out; + + shrink_dcache_parent(de); + + /* propagate for a flush */ + if (cii->c_flags & C_FLUSH) + coda_flag_inode_children(inode, C_FLUSH); + + if (de->d_count > 1) + /* pretend it's valid, but don't change the flags */ + goto out; + + /* clear the flags. */ + spin_lock(&cii->c_lock); + cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); + spin_unlock(&cii->c_lock); +bad: + return 0; +out: + return 1; +} + +/* + * This is the callback from dput() when d_count is going to 0. + * We use this to unhash dentries with bad inodes. + */ +static int coda_dentry_delete(const struct dentry * dentry) +{ + int flags; + + if (!dentry->d_inode) + return 0; + + flags = (ITOC(dentry->d_inode)->c_flags) & C_PURGE; + if (is_bad_inode(dentry->d_inode) || flags) { + return 1; + } + return 0; +} + + + +/* + * This is called when we want to check if the inode has + * changed on the server. Coda makes this easy since the + * cache manager Venus issues a downcall to the kernel when this + * happens + */ +int coda_revalidate_inode(struct dentry *dentry) +{ + struct coda_vattr attr; + int error; + int old_mode; + ino_t old_ino; + struct inode *inode = dentry->d_inode; + struct coda_inode_info *cii = ITOC(inode); + + if (!cii->c_flags) + return 0; + + if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) { + error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr); + if (error) + return -EIO; + + /* this inode may be lost if: + - it's ino changed + - type changes must be permitted for repair and + missing mount points. + */ + old_mode = inode->i_mode; + old_ino = inode->i_ino; + coda_vattr_to_iattr(inode, &attr); + + if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) { + printk("Coda: inode %ld, fid %s changed type!\n", + inode->i_ino, coda_f2s(&(cii->c_fid))); + } + + /* the following can happen when a local fid is replaced + with a global one, here we lose and declare the inode bad */ + if (inode->i_ino != old_ino) + return -EIO; + + coda_flag_inode_children(inode, C_FLUSH); + + spin_lock(&cii->c_lock); + cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); + spin_unlock(&cii->c_lock); + } + return 0; +} diff --git a/fs/coda/file.c b/fs/coda/file.c new file mode 100644 index 00000000..0433057b --- /dev/null +++ b/fs/coda/file.c @@ -0,0 +1,234 @@ +/* + * File operations for Coda. + * Original version: (C) 1996 Peter Braam + * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users of this code to contribute improvements + * to the Coda project. Contact Peter Braam . 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" +#include "coda_int.h" + +static ssize_t +coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *ppos) +{ + struct coda_file_info *cfi; + struct file *host_file; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (!host_file->f_op || !host_file->f_op->read) + return -EINVAL; + + return host_file->f_op->read(host_file, buf, count, ppos); +} + +static ssize_t +coda_file_splice_read(struct file *coda_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); + struct coda_file_info *cfi; + struct file *host_file; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + splice_read = host_file->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; + + return splice_read(host_file, ppos, pipe, count, flags); +} + +static ssize_t +coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct inode *host_inode, *coda_inode = coda_file->f_path.dentry->d_inode; + struct coda_file_info *cfi; + struct file *host_file; + ssize_t ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (!host_file->f_op || !host_file->f_op->write) + return -EINVAL; + + host_inode = host_file->f_path.dentry->d_inode; + mutex_lock(&coda_inode->i_mutex); + + ret = host_file->f_op->write(host_file, buf, count, ppos); + + coda_inode->i_size = host_inode->i_size; + coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; + coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; + mutex_unlock(&coda_inode->i_mutex); + + return ret; +} + +static int +coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) +{ + struct coda_file_info *cfi; + struct coda_inode_info *cii; + struct file *host_file; + struct inode *coda_inode, *host_inode; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (!host_file->f_op || !host_file->f_op->mmap) + return -ENODEV; + + coda_inode = coda_file->f_path.dentry->d_inode; + host_inode = host_file->f_path.dentry->d_inode; + + cii = ITOC(coda_inode); + spin_lock(&cii->c_lock); + coda_file->f_mapping = host_file->f_mapping; + if (coda_inode->i_mapping == &coda_inode->i_data) + coda_inode->i_mapping = host_inode->i_mapping; + + /* only allow additional mmaps as long as userspace isn't changing + * the container file on us! 
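 * If this inode is already borrowing a mapping from a different
 * container file, refuse with -EBUSY rather than mixing page caches.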
*/ + else if (coda_inode->i_mapping != host_inode->i_mapping) { + spin_unlock(&cii->c_lock); + return -EBUSY; + } + + /* keep track of how often the coda_inode/host_file has been mmapped */ + cii->c_mapcount++; + cfi->cfi_mapcount++; + spin_unlock(&cii->c_lock); + + return host_file->f_op->mmap(host_file, vma); +} + +int coda_open(struct inode *coda_inode, struct file *coda_file) +{ + struct file *host_file = NULL; + int error; + unsigned short flags = coda_file->f_flags & (~O_EXCL); + unsigned short coda_flags = coda_flags_to_cflags(flags); + struct coda_file_info *cfi; + + cfi = kmalloc(sizeof(struct coda_file_info), GFP_KERNEL); + if (!cfi) + return -ENOMEM; + + error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, + &host_file); + if (!host_file) + error = -EIO; + + if (error) { + kfree(cfi); + return error; + } + + host_file->f_flags |= coda_file->f_flags & (O_APPEND | O_SYNC); + + cfi->cfi_magic = CODA_MAGIC; + cfi->cfi_mapcount = 0; + cfi->cfi_container = host_file; + + BUG_ON(coda_file->private_data != NULL); + coda_file->private_data = cfi; + return 0; +} + +int coda_release(struct inode *coda_inode, struct file *coda_file) +{ + unsigned short flags = (coda_file->f_flags) & (~O_EXCL); + unsigned short coda_flags = coda_flags_to_cflags(flags); + struct coda_file_info *cfi; + struct coda_inode_info *cii; + struct inode *host_inode; + int err; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + + err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode), + coda_flags, coda_file->f_cred->fsuid); + + host_inode = cfi->cfi_container->f_path.dentry->d_inode; + cii = ITOC(coda_inode); + + /* did we mmap this file? */ + spin_lock(&cii->c_lock); + if (coda_inode->i_mapping == &host_inode->i_data) { + cii->c_mapcount -= cfi->cfi_mapcount; + if (!cii->c_mapcount) + coda_inode->i_mapping = &coda_inode->i_data; + } + spin_unlock(&cii->c_lock); + + fput(cfi->cfi_container); + kfree(coda_file->private_data); + coda_file->private_data = NULL; + + /* VFS fput ignores the return value from file_operations->release, so + * there is no use returning an error here */ + return 0; +} + +int coda_fsync(struct file *coda_file, int datasync) +{ + struct file *host_file; + struct inode *coda_inode = coda_file->f_path.dentry->d_inode; + struct coda_file_info *cfi; + int err; + + if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) || + S_ISLNK(coda_inode->i_mode))) + return -EINVAL; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + err = vfs_fsync(host_file, datasync); + if (!err && !datasync) + err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); + + return err; +} + +const struct file_operations coda_file_operations = { + .llseek = generic_file_llseek, + .read = coda_file_read, + .write = coda_file_write, + .mmap = coda_file_mmap, + .open = coda_open, + .release = coda_release, + .fsync = coda_fsync, + .splice_read = coda_file_splice_read, +}; + diff --git a/fs/coda/inode.c b/fs/coda/inode.c new file mode 100644 index 00000000..871b2771 --- /dev/null +++ b/fs/coda/inode.c @@ -0,0 +1,329 @@ +/* + * Super block/filesystem wide operations + * + * Copyright (C) 1996 Peter J. Braam and + * Michael Callahan + * + * Rewritten for Linux 2.1. 
Peter Braam + * Copyright (C) Carnegie Mellon University + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +#include "coda_int.h" + +/* VFS super_block ops */ +static void coda_evict_inode(struct inode *); +static void coda_put_super(struct super_block *); +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf); + +static struct kmem_cache * coda_inode_cachep; + +static struct inode *coda_alloc_inode(struct super_block *sb) +{ + struct coda_inode_info *ei; + ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + memset(&ei->c_fid, 0, sizeof(struct CodaFid)); + ei->c_flags = 0; + ei->c_uid = 0; + ei->c_cached_perm = 0; + spin_lock_init(&ei->c_lock); + return &ei->vfs_inode; +} + +static void coda_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(coda_inode_cachep, ITOC(inode)); +} + +static void coda_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, coda_i_callback); +} + +static void init_once(void *foo) +{ + struct coda_inode_info *ei = (struct coda_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +int coda_init_inodecache(void) +{ + coda_inode_cachep = kmem_cache_create("coda_inode_cache", + sizeof(struct coda_inode_info), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (coda_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +void coda_destroy_inodecache(void) +{ + kmem_cache_destroy(coda_inode_cachep); +} + +static int coda_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NOATIME; + return 0; +} + +/* exported operations */ +static const struct super_operations coda_super_operations = +{ + .alloc_inode = coda_alloc_inode, + .destroy_inode = coda_destroy_inode, + .evict_inode = coda_evict_inode, + .put_super = coda_put_super, + .statfs = coda_statfs, + .remount_fs = coda_remount, +}; + +static int get_device_index(struct coda_mount_data *data) +{ + struct file *file; + struct inode *inode; + int idx; + + if(data == NULL) { + printk("coda_read_super: Bad mount data\n"); + return -1; + } + + if(data->version != CODA_MOUNT_VERSION) { + printk("coda_read_super: Bad mount version\n"); + return -1; + } + + file = fget(data->fd); + inode = NULL; + if(file) + inode = file->f_path.dentry->d_inode; + + if(!inode || !S_ISCHR(inode->i_mode) || + imajor(inode) != CODA_PSDEV_MAJOR) { + if(file) + fput(file); + + printk("coda_read_super: Bad file\n"); + return -1; + } + + idx = iminor(inode); + fput(file); + + if(idx < 0 || idx >= MAX_CODADEVS) { + printk("coda_read_super: Bad minor number\n"); + return -1; + } + + return idx; +} + +static int coda_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *root = NULL; + struct venus_comm *vc; + struct CodaFid fid; + int error; + int idx; + + idx = get_device_index((struct coda_mount_data *) data); + + /* Ignore errors in data, for backward compatibility */ + if(idx == -1) + idx = 0; + + printk(KERN_INFO "coda_read_super: device index: %i\n", idx); + + vc = &coda_comms[idx]; + mutex_lock(&vc->vc_mutex); + + if (!vc->vc_inuse) { + printk("coda_read_super: No pseudo device\n"); + error = -EINVAL; + goto unlock_out; + } + + if (vc->vc_sb) { + printk("coda_read_super: Device already mounted\n"); + error = -EBUSY; + goto unlock_out; + 
} + + error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); + if (error) + goto unlock_out; + + vc->vc_sb = sb; + mutex_unlock(&vc->vc_mutex); + + sb->s_fs_info = vc; + sb->s_flags |= MS_NOATIME; + sb->s_blocksize = 4096; /* XXXXX what do we put here?? */ + sb->s_blocksize_bits = 12; + sb->s_magic = CODA_SUPER_MAGIC; + sb->s_op = &coda_super_operations; + sb->s_d_op = &coda_dentry_operations; + sb->s_bdi = &vc->bdi; + + /* get root fid from Venus: this needs the root inode */ + error = venus_rootfid(sb, &fid); + if ( error ) { + printk("coda_read_super: coda_get_rootfid failed with %d\n", + error); + goto error; + } + printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid)); + + /* make root inode */ + error = coda_cnode_make(&root, &fid, sb); + if ( error || !root ) { + printk("Failure of coda_cnode_make for root: error %d\n", error); + goto error; + } + + printk("coda_read_super: rootinode is %ld dev %s\n", + root->i_ino, root->i_sb->s_id); + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + error = -EINVAL; + goto error; + } + return 0; + +error: + if (root) + iput(root); + + mutex_lock(&vc->vc_mutex); + bdi_destroy(&vc->bdi); + vc->vc_sb = NULL; + sb->s_fs_info = NULL; +unlock_out: + mutex_unlock(&vc->vc_mutex); + return error; +} + +static void coda_put_super(struct super_block *sb) +{ + struct venus_comm *vcp = coda_vcp(sb); + mutex_lock(&vcp->vc_mutex); + bdi_destroy(&vcp->bdi); + vcp->vc_sb = NULL; + sb->s_fs_info = NULL; + mutex_unlock(&vcp->vc_mutex); + + printk("Coda: Bye bye.\n"); +} + +static void coda_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + coda_cache_clear_inode(inode); +} + +int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + int err = coda_revalidate_inode(dentry); + if (!err) + generic_fillattr(dentry->d_inode, stat); + return err; +} + +int coda_setattr(struct dentry *de, struct iattr *iattr) +{ + struct inode *inode = de->d_inode; + struct coda_vattr vattr; + int error; + + memset(&vattr, 0, sizeof(vattr)); + + inode->i_ctime = CURRENT_TIME_SEC; + coda_iattr_to_vattr(iattr, &vattr); + vattr.va_type = C_VNON; /* cannot set type */ + + /* Venus is responsible for truncating the container-file!!! 
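+ * No truncation is performed by the kernel here; the requested
+ * attributes are simply forwarded to Venus and, on success, copied
+ * back into the cached inode attributes below.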
*/ + error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr); + + if (!error) { + coda_vattr_to_iattr(inode, &vattr); + coda_cache_clear_inode(inode); + } + return error; +} + +const struct inode_operations coda_file_inode_operations = { + .permission = coda_permission, + .getattr = coda_getattr, + .setattr = coda_setattr, +}; + +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int error; + + error = venus_statfs(dentry, buf); + + if (error) { + /* fake something like AFS does */ + buf->f_blocks = 9000000; + buf->f_bfree = 9000000; + buf->f_bavail = 9000000; + buf->f_files = 9000000; + buf->f_ffree = 9000000; + } + + /* and fill in the rest */ + buf->f_type = CODA_SUPER_MAGIC; + buf->f_bsize = 4096; + buf->f_namelen = CODA_MAXNAMLEN; + + return 0; +} + +/* init_coda: used by filesystems.c to register coda */ + +static struct dentry *coda_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, coda_fill_super); +} + +struct file_system_type coda_fs_type = { + .owner = THIS_MODULE, + .name = "coda", + .mount = coda_mount, + .kill_sb = kill_anon_super, + .fs_flags = FS_BINARY_MOUNTDATA, +}; + diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c new file mode 100644 index 00000000..cb140ef2 --- /dev/null +++ b/fs/coda/pioctl.c @@ -0,0 +1,90 @@ +/* + * Pioctl operations for Coda. + * Original version: (C) 1996 Peter Braam + * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users of this code to contribute improvements + * to the Coda project. Contact Peter Braam . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" + +/* pioctl ops */ +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data); + +/* exported from this file */ +const struct inode_operations coda_ioctl_inode_operations = { + .permission = coda_ioctl_permission, + .setattr = coda_setattr, +}; + +const struct file_operations coda_ioctl_operations = { + .owner = THIS_MODULE, + .unlocked_ioctl = coda_pioctl, + .llseek = noop_llseek, +}; + +/* the coda pioctl inode ops */ +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) +{ + return (mask & MAY_EXEC) ? -EACCES : 0; +} + +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data) +{ + struct path path; + int error; + struct PioctlData data; + struct inode *inode = filp->f_dentry->d_inode; + struct inode *target_inode = NULL; + struct coda_inode_info *cnp; + + /* get the Pioctl data arguments from user space */ + if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) + return -EINVAL; + + /* + * Look up the pathname. 
Note that the pathname is in + * user memory, and namei takes care of this + */ + if (data.follow) + error = user_path(data.path, &path); + else + error = user_lpath(data.path, &path); + + if (error) + return error; + + target_inode = path.dentry->d_inode; + + /* return if it is not a Coda inode */ + if (target_inode->i_sb != inode->i_sb) { + error = -EINVAL; + goto out; + } + + /* now proceed to make the upcall */ + cnp = ITOC(target_inode); + + error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); +out: + path_put(&path); + return error; +} diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c new file mode 100644 index 00000000..8f616e0e --- /dev/null +++ b/fs/coda/psdev.c @@ -0,0 +1,430 @@ +/* + * An implementation of a loadable kernel mode driver providing + * multiple kernel/user space bidirectional communications links. + * + * Author: Alan Cox + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Adapted to become the Linux 2.0 Coda pseudo device + * Peter Braam + * Michael Callahan + * + * Changes for Linux 2.1 + * Copyright (c) 1997 Carnegie-Mellon University + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" + +#include "coda_int.h" + +/* statistics */ +int coda_hard; /* allows signals during upcalls */ +unsigned long coda_timeout = 30; /* .. secs, then signals will dequeue */ + + +struct venus_comm coda_comms[MAX_CODADEVS]; +static struct class *coda_psdev_class; + +/* + * Device operations + */ + +static unsigned int coda_psdev_poll(struct file *file, poll_table * wait) +{ + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + unsigned int mask = POLLOUT | POLLWRNORM; + + poll_wait(file, &vcp->vc_waitq, wait); + mutex_lock(&vcp->vc_mutex); + if (!list_empty(&vcp->vc_pending)) + mask |= POLLIN | POLLRDNORM; + mutex_unlock(&vcp->vc_mutex); + + return mask; +} + +static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg) +{ + unsigned int data; + + switch(cmd) { + case CIOC_KERNEL_VERSION: + data = CODA_KERNEL_VERSION; + return put_user(data, (int __user *) arg); + default: + return -ENOTTY; + } + + return 0; +} + +/* + * Receive a message written by Venus to the psdev + */ + +static ssize_t coda_psdev_write(struct file *file, const char __user *buf, + size_t nbytes, loff_t *off) +{ + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + struct upc_req *req = NULL; + struct upc_req *tmp; + struct list_head *lh; + struct coda_in_hdr hdr; + ssize_t retval = 0, count = 0; + int error; + + /* Peek at the opcode, uniquefier */ + if (copy_from_user(&hdr, buf, 2 * sizeof(u_long))) + return -EFAULT; + + if (DOWNCALL(hdr.opcode)) { + union outputArgs *dcbuf; + int size = sizeof(*dcbuf); + + if ( nbytes < sizeof(struct coda_out_hdr) ) { + printk("coda_downcall opc %d uniq %d, not enough!\n", + hdr.opcode, hdr.unique); + count = nbytes; + goto out; + } + if ( nbytes > size ) { + printk("Coda: downcall opc %d, uniq %d, too much!", + hdr.opcode, hdr.unique); + nbytes = size; + } + CODA_ALLOC(dcbuf, union outputArgs *, nbytes); + if (copy_from_user(dcbuf, buf, nbytes)) { + 
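+ /* partial copy of the downcall from Venus: free the buffer
+ * and fail the write with -EFAULT */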
CODA_FREE(dcbuf, nbytes); + retval = -EFAULT; + goto out; + } + + /* what downcall errors does Venus handle ? */ + error = coda_downcall(vcp, hdr.opcode, dcbuf); + + CODA_FREE(dcbuf, nbytes); + if (error) { + printk("psdev_write: coda_downcall error: %d\n", error); + retval = error; + goto out; + } + count = nbytes; + goto out; + } + + /* Look for the message on the processing queue. */ + mutex_lock(&vcp->vc_mutex); + list_for_each(lh, &vcp->vc_processing) { + tmp = list_entry(lh, struct upc_req , uc_chain); + if (tmp->uc_unique == hdr.unique) { + req = tmp; + list_del(&req->uc_chain); + break; + } + } + mutex_unlock(&vcp->vc_mutex); + + if (!req) { + printk("psdev_write: msg (%d, %d) not found\n", + hdr.opcode, hdr.unique); + retval = -ESRCH; + goto out; + } + + /* move data into response buffer. */ + if (req->uc_outSize < nbytes) { + printk("psdev_write: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n", + req->uc_outSize, (long)nbytes, hdr.opcode, hdr.unique); + nbytes = req->uc_outSize; /* don't have more space! */ + } + if (copy_from_user(req->uc_data, buf, nbytes)) { + req->uc_flags |= CODA_REQ_ABORT; + wake_up(&req->uc_sleep); + retval = -EFAULT; + goto out; + } + + /* adjust outsize. is this useful ?? */ + req->uc_outSize = nbytes; + req->uc_flags |= CODA_REQ_WRITE; + count = nbytes; + + /* Convert filedescriptor into a file handle */ + if (req->uc_opcode == CODA_OPEN_BY_FD) { + struct coda_open_by_fd_out *outp = + (struct coda_open_by_fd_out *)req->uc_data; + if (!outp->oh.result) + outp->fh = fget(outp->fd); + } + + wake_up(&req->uc_sleep); +out: + return(count ? count : retval); +} + +/* + * Read a message from the kernel to Venus + */ + +static ssize_t coda_psdev_read(struct file * file, char __user * buf, + size_t nbytes, loff_t *off) +{ + DECLARE_WAITQUEUE(wait, current); + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + struct upc_req *req; + ssize_t retval = 0, count = 0; + + if (nbytes == 0) + return 0; + + mutex_lock(&vcp->vc_mutex); + + add_wait_queue(&vcp->vc_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + while (list_empty(&vcp->vc_pending)) { + if (file->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + break; + } + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + mutex_unlock(&vcp->vc_mutex); + schedule(); + mutex_lock(&vcp->vc_mutex); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&vcp->vc_waitq, &wait); + + if (retval) + goto out; + + req = list_entry(vcp->vc_pending.next, struct upc_req,uc_chain); + list_del(&req->uc_chain); + + /* Move the input args into userspace */ + count = req->uc_inSize; + if (nbytes < req->uc_inSize) { + printk ("psdev_read: Venus read %ld bytes of %d in message\n", + (long)nbytes, req->uc_inSize); + count = nbytes; + } + + if (copy_to_user(buf, req->uc_data, count)) + retval = -EFAULT; + + /* If request was not a signal, enqueue and don't free */ + if (!(req->uc_flags & CODA_REQ_ASYNC)) { + req->uc_flags |= CODA_REQ_READ; + list_add_tail(&(req->uc_chain), &vcp->vc_processing); + goto out; + } + + CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); + kfree(req); +out: + mutex_unlock(&vcp->vc_mutex); + return (count ? 
count : retval); +} + +static int coda_psdev_open(struct inode * inode, struct file * file) +{ + struct venus_comm *vcp; + int idx, err; + + idx = iminor(inode); + if (idx < 0 || idx >= MAX_CODADEVS) + return -ENODEV; + + err = -EBUSY; + vcp = &coda_comms[idx]; + mutex_lock(&vcp->vc_mutex); + + if (!vcp->vc_inuse) { + vcp->vc_inuse++; + + INIT_LIST_HEAD(&vcp->vc_pending); + INIT_LIST_HEAD(&vcp->vc_processing); + init_waitqueue_head(&vcp->vc_waitq); + vcp->vc_sb = NULL; + vcp->vc_seq = 0; + + file->private_data = vcp; + err = 0; + } + + mutex_unlock(&vcp->vc_mutex); + return err; +} + + +static int coda_psdev_release(struct inode * inode, struct file * file) +{ + struct venus_comm *vcp = (struct venus_comm *) file->private_data; + struct upc_req *req, *tmp; + + if (!vcp || !vcp->vc_inuse ) { + printk("psdev_release: Not open.\n"); + return -1; + } + + mutex_lock(&vcp->vc_mutex); + + /* Wakeup clients so they can return. */ + list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) { + list_del(&req->uc_chain); + + /* Async requests need to be freed here */ + if (req->uc_flags & CODA_REQ_ASYNC) { + CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); + kfree(req); + continue; + } + req->uc_flags |= CODA_REQ_ABORT; + wake_up(&req->uc_sleep); + } + + list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) { + list_del(&req->uc_chain); + + req->uc_flags |= CODA_REQ_ABORT; + wake_up(&req->uc_sleep); + } + + file->private_data = NULL; + vcp->vc_inuse--; + mutex_unlock(&vcp->vc_mutex); + return 0; +} + + +static const struct file_operations coda_psdev_fops = { + .owner = THIS_MODULE, + .read = coda_psdev_read, + .write = coda_psdev_write, + .poll = coda_psdev_poll, + .unlocked_ioctl = coda_psdev_ioctl, + .open = coda_psdev_open, + .release = coda_psdev_release, + .llseek = noop_llseek, +}; + +static int init_coda_psdev(void) +{ + int i, err = 0; + if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) { + printk(KERN_ERR "coda_psdev: unable to get major %d\n", + CODA_PSDEV_MAJOR); + return -EIO; + } + coda_psdev_class = class_create(THIS_MODULE, "coda"); + if (IS_ERR(coda_psdev_class)) { + err = PTR_ERR(coda_psdev_class); + goto out_chrdev; + } + for (i = 0; i < MAX_CODADEVS; i++) { + mutex_init(&(&coda_comms[i])->vc_mutex); + device_create(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); + } + coda_sysctl_init(); + goto out; + +out_chrdev: + unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); +out: + return err; +} + +MODULE_AUTHOR("Jan Harkes, Peter J. 
Braam"); +MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); +MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); +MODULE_LICENSE("GPL"); +MODULE_VERSION("6.6"); + +static int __init init_coda(void) +{ + int status; + int i; + + status = coda_init_inodecache(); + if (status) + goto out2; + status = init_coda_psdev(); + if ( status ) { + printk("Problem (%d) in init_coda_psdev\n", status); + goto out1; + } + + status = register_filesystem(&coda_fs_type); + if (status) { + printk("coda: failed to register filesystem!\n"); + goto out; + } + return 0; +out: + for (i = 0; i < MAX_CODADEVS; i++) + device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); + class_destroy(coda_psdev_class); + unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); + coda_sysctl_clean(); +out1: + coda_destroy_inodecache(); +out2: + return status; +} + +static void __exit exit_coda(void) +{ + int err, i; + + err = unregister_filesystem(&coda_fs_type); + if ( err != 0 ) { + printk("coda: failed to unregister filesystem\n"); + } + for (i = 0; i < MAX_CODADEVS; i++) + device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); + class_destroy(coda_psdev_class); + unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); + coda_sysctl_clean(); + coda_destroy_inodecache(); +} + +module_init(init_coda); +module_exit(exit_coda); + diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c new file mode 100644 index 00000000..ab94ef63 --- /dev/null +++ b/fs/coda/symlink.c @@ -0,0 +1,50 @@ +/* + * Symlink inode operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "coda_linux.h" + +static int coda_symlink_filler(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int error; + struct coda_inode_info *cii; + unsigned int len = PAGE_SIZE; + char *p = kmap(page); + + cii = ITOC(inode); + + error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); + if (error) + goto fail; + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return error; +} + +const struct address_space_operations coda_symlink_aops = { + .readpage = coda_symlink_filler, +}; diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c new file mode 100644 index 00000000..af56ad56 --- /dev/null +++ b/fs/coda/sysctl.c @@ -0,0 +1,73 @@ +/* + * Sysctl operations for Coda filesystem + * Original version: (C) 1996 P. Braam and M. Callahan + * Rewritten for Linux 2.1. (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon encourages users to contribute improvements to + * the Coda project. Contact Peter Braam (coda@cs.cmu.edu). 
+ */ + +#include + +#include "coda_int.h" + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fs_table_header; + +static ctl_table coda_table[] = { + { + .procname = "timeout", + .data = &coda_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "hard", + .data = &coda_hard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "fake_statfs", + .data = &coda_fake_statfs, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec + }, + {} +}; + +static ctl_table fs_table[] = { + { + .procname = "coda", + .mode = 0555, + .child = coda_table + }, + {} +}; + +void coda_sysctl_init(void) +{ + if ( !fs_table_header ) + fs_table_header = register_sysctl_table(fs_table); +} + +void coda_sysctl_clean(void) +{ + if ( fs_table_header ) { + unregister_sysctl_table(fs_table_header); + fs_table_header = NULL; + } +} + +#else +void coda_sysctl_init(void) +{ +} + +void coda_sysctl_clean(void) +{ +} +#endif diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c new file mode 100644 index 00000000..9727e0c5 --- /dev/null +++ b/fs/coda/upcall.c @@ -0,0 +1,883 @@ +/* + * Mostly platform independent upcall operations to Venus: + * -- upcalls + * -- upcall routines + * + * Linux 2.0 version + * Copyright (C) 1996 Peter J. Braam , + * Michael Callahan + * + * Redone for Linux 2.1 + * Copyright (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this code to contribute + * improvements to the Coda project. Contact Peter Braam . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "coda_linux.h" +#include "coda_cache.h" + +#include "coda_int.h" + +static int coda_upcall(struct venus_comm *vc, int inSize, int *outSize, + union inputArgs *buffer); + +static void *alloc_upcall(int opcode, int size) +{ + union inputArgs *inp; + + CODA_ALLOC(inp, union inputArgs *, size); + if (!inp) + return ERR_PTR(-ENOMEM); + + inp->ih.opcode = opcode; + inp->ih.pid = current->pid; + inp->ih.pgid = task_pgrp_nr(current); + inp->ih.uid = current_fsuid(); + + return (void*)inp; +} + +#define UPARG(op)\ +do {\ + inp = (union inputArgs *)alloc_upcall(op, insize); \ + if (IS_ERR(inp)) { return PTR_ERR(inp); }\ + outp = (union outputArgs *)(inp); \ + outsize = insize; \ +} while (0) + +#define INSIZE(tag) sizeof(struct coda_ ## tag ## _in) +#define OUTSIZE(tag) sizeof(struct coda_ ## tag ## _out) +#define SIZE(tag) max_t(unsigned int, INSIZE(tag), OUTSIZE(tag)) + + +/* the upcalls */ +int venus_rootfid(struct super_block *sb, struct CodaFid *fidp) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(root); + UPARG(CODA_ROOT); + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) + *fidp = outp->coda_root.VFid; + + CODA_FREE(inp, insize); + return error; +} + +int venus_getattr(struct super_block *sb, struct CodaFid *fid, + struct coda_vattr *attr) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(getattr); + UPARG(CODA_GETATTR); + inp->coda_getattr.VFid = *fid; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) + *attr = outp->coda_getattr.attr; + + CODA_FREE(inp, insize); + return error; +} + +int venus_setattr(struct super_block *sb, struct CodaFid *fid, + struct coda_vattr *vattr) +{ + union 
inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(setattr); + UPARG(CODA_SETATTR); + + inp->coda_setattr.VFid = *fid; + inp->coda_setattr.attr = *vattr; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_lookup(struct super_block *sb, struct CodaFid *fid, + const char *name, int length, int * type, + struct CodaFid *resfid) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(lookup); + insize = max_t(unsigned int, offset + length +1, OUTSIZE(lookup)); + UPARG(CODA_LOOKUP); + + inp->coda_lookup.VFid = *fid; + inp->coda_lookup.name = offset; + inp->coda_lookup.flags = CLU_CASE_SENSITIVE; + /* send Venus a null terminated string */ + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + *resfid = outp->coda_lookup.VFid; + *type = outp->coda_lookup.vtype; + } + + CODA_FREE(inp, insize); + return error; +} + +int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, + vuid_t uid) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(release); + UPARG(CODA_CLOSE); + + inp->ih.uid = uid; + inp->coda_close.VFid = *fid; + inp->coda_close.flags = flags; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_open(struct super_block *sb, struct CodaFid *fid, + int flags, struct file **fh) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(open_by_fd); + UPARG(CODA_OPEN_BY_FD); + + inp->coda_open_by_fd.VFid = *fid; + inp->coda_open_by_fd.flags = flags; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) + *fh = outp->coda_open_by_fd.fh; + + CODA_FREE(inp, insize); + return error; +} + +int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, + struct CodaFid *newfid, struct coda_vattr *attrs) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(mkdir); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(mkdir)); + UPARG(CODA_MKDIR); + + inp->coda_mkdir.VFid = *dirfid; + inp->coda_mkdir.attr = *attrs; + inp->coda_mkdir.name = offset; + /* Venus must get null terminated string */ + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + *attrs = outp->coda_mkdir.attr; + *newfid = outp->coda_mkdir.VFid; + } + + CODA_FREE(inp, insize); + return error; +} + + +int venus_rename(struct super_block *sb, struct CodaFid *old_fid, + struct CodaFid *new_fid, size_t old_length, + size_t new_length, const char *old_name, + const char *new_name) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset, s; + + offset = INSIZE(rename); + insize = max_t(unsigned int, offset + new_length + old_length + 8, + OUTSIZE(rename)); + UPARG(CODA_RENAME); + + inp->coda_rename.sourceFid = *old_fid; + inp->coda_rename.destFid = *new_fid; + inp->coda_rename.srcname = offset; + + /* Venus must receive an null terminated string */ + s = ( old_length & ~0x3) +4; /* round up to word boundary */ + memcpy((char *)(inp) + offset, old_name, old_length); + *((char *)inp + offset + old_length) = '\0'; 
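+ /* s rounds old_length plus the terminating NUL up to the next 4-byte
+ * boundary (e.g. old_length == 5 gives s == 8); offset advances by s
+ * below so the destination name starts word aligned. */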
+ + /* another null terminated string for Venus */ + offset += s; + inp->coda_rename.destname = offset; + s = ( new_length & ~0x3) +4; /* round up to word boundary */ + memcpy((char *)(inp) + offset, new_name, new_length); + *((char *)inp + offset + new_length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_create(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, int excl, int mode, + struct CodaFid *newfid, struct coda_vattr *attrs) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(create); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(create)); + UPARG(CODA_CREATE); + + inp->coda_create.VFid = *dirfid; + inp->coda_create.attr.va_mode = mode; + inp->coda_create.excl = excl; + inp->coda_create.mode = mode; + inp->coda_create.name = offset; + + /* Venus must get null terminated string */ + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + *attrs = outp->coda_create.attr; + *newfid = outp->coda_create.VFid; + } + + CODA_FREE(inp, insize); + return error; +} + +int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(rmdir); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(rmdir)); + UPARG(CODA_RMDIR); + + inp->coda_rmdir.VFid = *dirfid; + inp->coda_rmdir.name = offset; + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_remove(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length) +{ + union inputArgs *inp; + union outputArgs *outp; + int error=0, insize, outsize, offset; + + offset = INSIZE(remove); + insize = max_t(unsigned int, offset + length + 1, OUTSIZE(remove)); + UPARG(CODA_REMOVE); + + inp->coda_remove.VFid = *dirfid; + inp->coda_remove.name = offset; + memcpy((char *)(inp) + offset, name, length); + *((char *)inp + offset + length) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_readlink(struct super_block *sb, struct CodaFid *fid, + char *buffer, int *length) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int retlen; + char *result; + + insize = max_t(unsigned int, + INSIZE(readlink), OUTSIZE(readlink)+ *length + 1); + UPARG(CODA_READLINK); + + inp->coda_readlink.VFid = *fid; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + if (!error) { + retlen = outp->coda_readlink.count; + if ( retlen > *length ) + retlen = *length; + *length = retlen; + result = (char *)outp + (long)outp->coda_readlink.data; + memcpy(buffer, result, retlen); + *(buffer + retlen) = '\0'; + } + + CODA_FREE(inp, insize); + return error; +} + + + +int venus_link(struct super_block *sb, struct CodaFid *fid, + struct CodaFid *dirfid, const char *name, int len ) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset; + + offset = INSIZE(link); + insize = max_t(unsigned int, offset + len + 1, OUTSIZE(link)); + UPARG(CODA_LINK); + + inp->coda_link.sourceFid = *fid; + inp->coda_link.destFid = 
*dirfid; + inp->coda_link.tname = offset; + + /* make sure strings are null terminated */ + memcpy((char *)(inp) + offset, name, len); + *((char *)inp + offset + len) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_symlink(struct super_block *sb, struct CodaFid *fid, + const char *name, int len, + const char *symname, int symlen) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int offset, s; + + offset = INSIZE(symlink); + insize = max_t(unsigned int, offset + len + symlen + 8, OUTSIZE(symlink)); + UPARG(CODA_SYMLINK); + + /* inp->coda_symlink.attr = *tva; XXXXXX */ + inp->coda_symlink.VFid = *fid; + + /* Round up to word boundary and null terminate */ + inp->coda_symlink.srcname = offset; + s = ( symlen & ~0x3 ) + 4; + memcpy((char *)(inp) + offset, symname, symlen); + *((char *)inp + offset + symlen) = '\0'; + + /* Round up to word boundary and null terminate */ + offset += s; + inp->coda_symlink.tname = offset; + s = (len & ~0x3) + 4; + memcpy((char *)(inp) + offset, name, len); + *((char *)inp + offset + len) = '\0'; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_fsync(struct super_block *sb, struct CodaFid *fid) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize=SIZE(fsync); + UPARG(CODA_FSYNC); + + inp->coda_fsync.VFid = *fid; + error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs), + &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + +int venus_access(struct super_block *sb, struct CodaFid *fid, int mask) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = SIZE(access); + UPARG(CODA_ACCESS); + + inp->coda_access.VFid = *fid; + inp->coda_access.flags = mask; + + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); + + CODA_FREE(inp, insize); + return error; +} + + +int venus_pioctl(struct super_block *sb, struct CodaFid *fid, + unsigned int cmd, struct PioctlData *data) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + int iocsize; + + insize = VC_MAXMSGSIZE; + UPARG(CODA_IOCTL); + + /* build packet for Venus */ + if (data->vi.in_size > VC_MAXDATASIZE) { + error = -EINVAL; + goto exit; + } + + if (data->vi.out_size > VC_MAXDATASIZE) { + error = -EINVAL; + goto exit; + } + + inp->coda_ioctl.VFid = *fid; + + /* the cmd field was mutated by increasing its size field to + * reflect the path and follow args. We need to subtract that + * out before sending the command to Venus. */ + inp->coda_ioctl.cmd = (cmd & ~(PIOCPARM_MASK << 16)); + iocsize = ((cmd >> 16) & PIOCPARM_MASK) - sizeof(char *) - sizeof(int); + inp->coda_ioctl.cmd |= (iocsize & PIOCPARM_MASK) << 16; + + /* in->coda_ioctl.rwflag = flag; */ + inp->coda_ioctl.len = data->vi.in_size; + inp->coda_ioctl.data = (char *)(INSIZE(ioctl)); + + /* get the data out of user space */ + if ( copy_from_user((char*)inp + (long)inp->coda_ioctl.data, + data->vi.in, data->vi.in_size) ) { + error = -EINVAL; + goto exit; + } + + error = coda_upcall(coda_vcp(sb), SIZE(ioctl) + data->vi.in_size, + &outsize, inp); + + if (error) { + printk("coda_pioctl: Venus returns: %d for %s\n", + error, coda_f2s(fid)); + goto exit; + } + + if (outsize < (long)outp->coda_ioctl.data + outp->coda_ioctl.len) { + error = -EINVAL; + goto exit; + } + + /* Copy out the OUT buffer. 
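+ * Refuse the request if Venus returned more data than the caller's
+ * out buffer can hold, otherwise copy the reply back to user space.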
*/ + if (outp->coda_ioctl.len > data->vi.out_size) { + error = -EINVAL; + goto exit; + } + + /* Copy out the OUT buffer. */ + if (copy_to_user(data->vi.out, + (char *)outp + (long)outp->coda_ioctl.data, + outp->coda_ioctl.len)) { + error = -EFAULT; + goto exit; + } + + exit: + CODA_FREE(inp, insize); + return error; +} + +int venus_statfs(struct dentry *dentry, struct kstatfs *sfs) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + + insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs)); + UPARG(CODA_STATFS); + + error = coda_upcall(coda_vcp(dentry->d_sb), insize, &outsize, inp); + if (!error) { + sfs->f_blocks = outp->coda_statfs.stat.f_blocks; + sfs->f_bfree = outp->coda_statfs.stat.f_bfree; + sfs->f_bavail = outp->coda_statfs.stat.f_bavail; + sfs->f_files = outp->coda_statfs.stat.f_files; + sfs->f_ffree = outp->coda_statfs.stat.f_ffree; + } + + CODA_FREE(inp, insize); + return error; +} + +/* + * coda_upcall and coda_downcall routines. + */ +static void coda_block_signals(sigset_t *old) +{ + spin_lock_irq(¤t->sighand->siglock); + *old = current->blocked; + + sigfillset(¤t->blocked); + sigdelset(¤t->blocked, SIGKILL); + sigdelset(¤t->blocked, SIGSTOP); + sigdelset(¤t->blocked, SIGINT); + + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + +static void coda_unblock_signals(sigset_t *old) +{ + spin_lock_irq(¤t->sighand->siglock); + current->blocked = *old; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + +/* Don't allow signals to interrupt the following upcalls before venus + * has seen them, + * - CODA_CLOSE or CODA_RELEASE upcall (to avoid reference count problems) + * - CODA_STORE (to avoid data loss) + */ +#define CODA_INTERRUPTIBLE(r) (!coda_hard && \ + (((r)->uc_opcode != CODA_CLOSE && \ + (r)->uc_opcode != CODA_STORE && \ + (r)->uc_opcode != CODA_RELEASE) || \ + (r)->uc_flags & CODA_REQ_READ)) + +static inline void coda_waitfor_upcall(struct venus_comm *vcp, + struct upc_req *req) +{ + DECLARE_WAITQUEUE(wait, current); + unsigned long timeout = jiffies + coda_timeout * HZ; + sigset_t old; + int blocked; + + coda_block_signals(&old); + blocked = 1; + + add_wait_queue(&req->uc_sleep, &wait); + for (;;) { + if (CODA_INTERRUPTIBLE(req)) + set_current_state(TASK_INTERRUPTIBLE); + else + set_current_state(TASK_UNINTERRUPTIBLE); + + /* got a reply */ + if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT)) + break; + + if (blocked && time_after(jiffies, timeout) && + CODA_INTERRUPTIBLE(req)) + { + coda_unblock_signals(&old); + blocked = 0; + } + + if (signal_pending(current)) { + list_del(&req->uc_chain); + break; + } + + mutex_unlock(&vcp->vc_mutex); + if (blocked) + schedule_timeout(HZ); + else + schedule(); + mutex_lock(&vcp->vc_mutex); + } + if (blocked) + coda_unblock_signals(&old); + + remove_wait_queue(&req->uc_sleep, &wait); + set_current_state(TASK_RUNNING); +} + + +/* + * coda_upcall will return an error in the case of + * failed communication with Venus _or_ will peek at Venus + * reply and return Venus' error. + * + * As venus has 2 types of errors, normal errors (positive) and internal + * errors (negative), normal errors are negated, while internal errors + * are all mapped to -EINTR, while showing a nice warning message. 
(jh) + */ +static int coda_upcall(struct venus_comm *vcp, + int inSize, int *outSize, + union inputArgs *buffer) +{ + union outputArgs *out; + union inputArgs *sig_inputArgs; + struct upc_req *req = NULL, *sig_req; + int error; + + mutex_lock(&vcp->vc_mutex); + + if (!vcp->vc_inuse) { + printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n"); + error = -ENXIO; + goto exit; + } + + /* Format the request message. */ + req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); + if (!req) { + error = -ENOMEM; + goto exit; + } + + req->uc_data = (void *)buffer; + req->uc_flags = 0; + req->uc_inSize = inSize; + req->uc_outSize = *outSize ? *outSize : inSize; + req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode; + req->uc_unique = ++vcp->vc_seq; + init_waitqueue_head(&req->uc_sleep); + + /* Fill in the common input args. */ + ((union inputArgs *)buffer)->ih.unique = req->uc_unique; + + /* Append msg to pending queue and poke Venus. */ + list_add_tail(&req->uc_chain, &vcp->vc_pending); + + wake_up_interruptible(&vcp->vc_waitq); + /* We can be interrupted while we wait for Venus to process + * our request. If the interrupt occurs before Venus has read + * the request, we dequeue and return. If it occurs after the + * read but before the reply, we dequeue, send a signal + * message, and return. If it occurs after the reply we ignore + * it. In no case do we want to restart the syscall. If it + * was interrupted by a venus shutdown (psdev_close), return + * ENODEV. */ + + /* Go to sleep. Wake up on signals only after the timeout. */ + coda_waitfor_upcall(vcp, req); + + /* Op went through, interrupt or not... */ + if (req->uc_flags & CODA_REQ_WRITE) { + out = (union outputArgs *)req->uc_data; + /* here we map positive Venus errors to kernel errors */ + error = -out->oh.result; + *outSize = req->uc_outSize; + goto exit; + } + + error = -EINTR; + if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) { + printk(KERN_WARNING "coda: Unexpected interruption.\n"); + goto exit; + } + + /* Interrupted before venus read it. */ + if (!(req->uc_flags & CODA_REQ_READ)) + goto exit; + + /* Venus saw the upcall, make sure we can send interrupt signal */ + if (!vcp->vc_inuse) { + printk(KERN_INFO "coda: Venus dead, not sending signal.\n"); + goto exit; + } + + error = -ENOMEM; + sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); + if (!sig_req) goto exit; + + CODA_ALLOC((sig_req->uc_data), char *, sizeof(struct coda_in_hdr)); + if (!sig_req->uc_data) { + kfree(sig_req); + goto exit; + } + + error = -EINTR; + sig_inputArgs = (union inputArgs *)sig_req->uc_data; + sig_inputArgs->ih.opcode = CODA_SIGNAL; + sig_inputArgs->ih.unique = req->uc_unique; + + sig_req->uc_flags = CODA_REQ_ASYNC; + sig_req->uc_opcode = sig_inputArgs->ih.opcode; + sig_req->uc_unique = sig_inputArgs->ih.unique; + sig_req->uc_inSize = sizeof(struct coda_in_hdr); + sig_req->uc_outSize = sizeof(struct coda_in_hdr); + + /* insert at head of queue! */ + list_add(&(sig_req->uc_chain), &vcp->vc_pending); + wake_up_interruptible(&vcp->vc_waitq); + +exit: + kfree(req); + mutex_unlock(&vcp->vc_mutex); + return error; +} + +/* + The statements below are part of the Coda opportunistic + programming -- taken from the Mach/BSD kernel code for Coda. + You don't get correct semantics by stating what needs to be + done without guaranteeing the invariants needed for it to happen. + When will be have time to find out what exactly is going on? (pjb) +*/ + + +/* + * There are 7 cases where cache invalidations occur. 
The semantics + * of each is listed here: + * + * CODA_FLUSH -- flush all entries from the name cache and the cnode cache. + * CODA_PURGEUSER -- flush all entries from the name cache for a specific user + * This call is a result of token expiration. + * + * The next arise as the result of callbacks on a file or directory. + * CODA_ZAPFILE -- flush the cached attributes for a file. + + * CODA_ZAPDIR -- flush the attributes for the dir and + * force a new lookup for all the children + of this dir. + + * + * The next is a result of Venus detecting an inconsistent file. + * CODA_PURGEFID -- flush the attribute for the file + * purge it and its children from the dcache + * + * The last allows Venus to replace local fids with global ones + * during reintegration. + * + * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ + +int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out) +{ + struct inode *inode = NULL; + struct CodaFid *fid = NULL, *newfid; + struct super_block *sb; + + /* Handle invalidation requests. */ + mutex_lock(&vcp->vc_mutex); + sb = vcp->vc_sb; + if (!sb || !sb->s_root) + goto unlock_out; + + switch (opcode) { + case CODA_FLUSH: + coda_cache_clear_all(sb); + shrink_dcache_sb(sb); + if (sb->s_root->d_inode) + coda_flag_inode(sb->s_root->d_inode, C_FLUSH); + break; + + case CODA_PURGEUSER: + coda_cache_clear_all(sb); + break; + + case CODA_ZAPDIR: + fid = &out->coda_zapdir.CodaFid; + break; + + case CODA_ZAPFILE: + fid = &out->coda_zapfile.CodaFid; + break; + + case CODA_PURGEFID: + fid = &out->coda_purgefid.CodaFid; + break; + + case CODA_REPLACE: + fid = &out->coda_replace.OldFid; + break; + } + if (fid) + inode = coda_fid_to_inode(fid, sb); + +unlock_out: + mutex_unlock(&vcp->vc_mutex); + + if (!inode) + return 0; + + switch (opcode) { + case CODA_ZAPDIR: + coda_flag_inode_children(inode, C_PURGE); + coda_flag_inode(inode, C_VATTR); + break; + + case CODA_ZAPFILE: + coda_flag_inode(inode, C_VATTR); + break; + + case CODA_PURGEFID: + coda_flag_inode_children(inode, C_PURGE); + + /* catch the dentries later if some are still busy */ + coda_flag_inode(inode, C_PURGE); + d_prune_aliases(inode); + break; + + case CODA_REPLACE: + newfid = &out->coda_replace.NewFid; + coda_replace_fid(inode, fid, newfid); + break; + } + iput(inode); + return 0; +} + diff --git a/fs/compat.c b/fs/compat.c new file mode 100644 index 00000000..0ea00832 --- /dev/null +++ b/fs/compat.c @@ -0,0 +1,2061 @@ +/* + * linux/fs/compat.c + * + * Kernel compatibililty routines for e.g. 32 bit syscall support + * on 64 bit kernels. + * + * Copyright (C) 2002 Stephen Rothwell, IBM Corporation + * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +int compat_log = 1; + +int compat_printk(const char *fmt, ...) 
+{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = vprintk(fmt, ap); + va_end(ap); + return ret; +} + +#include "read_write.h" + +/* + * Not all architectures have sys_utime, so implement this in terms + * of sys_utimes. + */ +asmlinkage long compat_sys_utime(const char __user *filename, + struct compat_utimbuf __user *t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t->actime) || + get_user(tv[1].tv_sec, &t->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); +} + +asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags) +{ + struct timespec tv[2]; + + if (t) { + if (get_compat_timespec(&tv[0], &t[0]) || + get_compat_timespec(&tv[1], &t[1])) + return -EFAULT; + + if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) + return 0; + } + return do_utimes(dfd, filename, t ? tv : NULL, flags); +} + +asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t[0].tv_sec) || + get_user(tv[0].tv_nsec, &t[0].tv_usec) || + get_user(tv[1].tv_sec, &t[1].tv_sec) || + get_user(tv[1].tv_nsec, &t[1].tv_usec)) + return -EFAULT; + if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || + tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) + return -EINVAL; + tv[0].tv_nsec *= 1000; + tv[1].tv_nsec *= 1000; + } + return do_utimes(dfd, filename, t ? tv : NULL, 0); +} + +asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t) +{ + return compat_sys_futimesat(AT_FDCWD, filename, t); +} + +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + compat_ino_t ino = stat->ino; + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + int err; + + SET_UID(uid, stat->uid); + SET_GID(gid, stat->gid); + + if ((u64) stat->size > MAX_NON_LFS || + !old_valid_dev(stat->dev) || + !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) + return -EOVERFLOW; + + if (clear_user(ubuf, sizeof(*ubuf))) + return -EFAULT; + + err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); + err |= __put_user(ino, &ubuf->st_ino); + err |= __put_user(stat->mode, &ubuf->st_mode); + err |= __put_user(stat->nlink, &ubuf->st_nlink); + err |= __put_user(uid, &ubuf->st_uid); + err |= __put_user(gid, &ubuf->st_gid); + err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); + err |= __put_user(stat->size, &ubuf->st_size); + err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); + err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); + err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); + err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); + err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); + err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); + err |= __put_user(stat->blksize, &ubuf->st_blksize); + err |= __put_user(stat->blocks, &ubuf->st_blocks); + return err; +} + +asmlinkage long compat_sys_newstat(const char __user * filename, + struct compat_stat __user *statbuf) +{ + struct kstat stat; + int error; + + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +asmlinkage long compat_sys_newlstat(const char __user * filename, + struct compat_stat __user *statbuf) +{ + struct kstat 
stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +#ifndef __ARCH_WANT_STAT64 +asmlinkage long compat_sys_newfstatat(unsigned int dfd, + const char __user *filename, + struct compat_stat __user *statbuf, int flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} +#endif + +asmlinkage long compat_sys_newfstat(unsigned int fd, + struct compat_stat __user * statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_compat_stat(&stat, statbuf); + return error; +} + +static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) +{ + + if (sizeof ubuf->f_blocks == 4) { + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(0, &ubuf->f_spare[0]) || + __put_user(0, &ubuf->f_spare[1]) || + __put_user(0, &ubuf->f_spare[2]) || + __put_user(0, &ubuf->f_spare[3]) || + __put_user(0, &ubuf->f_spare[4])) + return -EFAULT; + return 0; +} + +/* + * The following statfs calls are copies of code from fs/statfs.c and + * should be checked against those from time to time + */ +asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) +{ + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) +{ + struct kstatfs tmp; + int error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) +{ + if (sizeof ubuf->f_blocks == 4) { + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, 
&ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +/* + * This is a copy of sys_ustat, just dealing with a structure layout. + * Given how simple this syscall is that apporach is more maintainable + * than the various conversion hacks. + */ +asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u) +{ + struct super_block *sb; + struct compat_ustat tmp; + struct kstatfs sbuf; + int err; + + sb = user_get_super(new_decode_dev(dev)); + if (!sb) + return -EINVAL; + err = statfs_by_dentry(sb->s_root, &sbuf); + drop_super(sb); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct compat_ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) + return -EFAULT; + return 0; +} + +static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 +static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 +static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, + unsigned long 
arg) +{ + mm_segment_t old_fs; + struct flock f; + long ret; + + switch (cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + ret = get_compat_flock(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_fcntl(fd, cmd, (unsigned long)&f); + set_fs(old_fs); + if (cmd == F_GETLK && ret == 0) { + /* GETLK was successful and we need to return the data... + * but it needs to fit in the compat structure. + * l_start shouldn't be too big, unless the original + * start + end is greater than COMPAT_OFF_T_MAX, in which + * case the app was asking for trouble, so we return + * -EOVERFLOW in that case. + * l_len could be too big, in which case we just truncate it, + * and only allow the app to see that part of the conflicting + * lock that might make sense to it anyway + */ + + if (f.l_start > COMPAT_OFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_OFF_T_MAX) + f.l_len = COMPAT_OFF_T_MAX; + if (ret == 0) + ret = put_compat_flock(&f, compat_ptr(arg)); + } + break; + + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + ret = get_compat_flock64(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK : + ((cmd == F_SETLK64) ? F_SETLK : F_SETLKW), + (unsigned long)&f); + set_fs(old_fs); + if (cmd == F_GETLK64 && ret == 0) { + /* need to return lock information - see above for commentary */ + if (f.l_start > COMPAT_LOFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_LOFF_T_MAX) + f.l_len = COMPAT_LOFF_T_MAX; + if (ret == 0) + ret = put_compat_flock64(&f, compat_ptr(arg)); + } + break; + + default: + ret = sys_fcntl(fd, cmd, arg); + break; + } + return ret; +} + +asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, + unsigned long arg) +{ + if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64)) + return -EINVAL; + return compat_sys_fcntl64(fd, cmd, arg); +} + +asmlinkage long +compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p) +{ + long ret; + aio_context_t ctx64; + + mm_segment_t oldfs = get_fs(); + if (unlikely(get_user(ctx64, ctx32p))) + return -EFAULT; + + set_fs(KERNEL_DS); + /* The __user pointer cast is valid because of the set_fs() */ + ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64); + set_fs(oldfs); + /* truncating is ok because it's a user address */ + if (!ret) + ret = put_user((u32) ctx64, ctx32p); + return ret; +} + +asmlinkage long +compat_sys_io_getevents(aio_context_t ctx_id, + unsigned long min_nr, + unsigned long nr, + struct io_event __user *events, + struct compat_timespec __user *timeout) +{ + long ret; + struct timespec t; + struct timespec __user *ut = NULL; + + ret = -EFAULT; + if (unlikely(!access_ok(VERIFY_WRITE, events, + nr * sizeof(struct io_event)))) + goto out; + if (timeout) { + if (get_compat_timespec(&t, timeout)) + goto out; + + ut = compat_alloc_user_space(sizeof(*ut)); + if (copy_to_user(ut, &t, sizeof(t)) ) + goto out; + } + ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut); +out: + return ret; +} + +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? 
VERIFY_WRITE : VERIFY_READ) + +ssize_t compat_rw_copy_check_uvector(int type, + const struct compat_iovec __user *uvector, unsigned long nr_segs, + unsigned long fast_segs, struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + compat_ssize_t tot_len; + struct iovec *iov = *ret_pointer = fast_pointer; + ssize_t ret = 0; + int seg; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) + goto out; + + ret = -EINVAL; + if (nr_segs > UIO_MAXIOV || nr_segs < 0) + goto out; + if (nr_segs > fast_segs) { + ret = -ENOMEM; + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) + goto out; + } + *ret_pointer = iov; + + /* + * Single unix specification: + * We should -EINVAL if an element length is not >= 0 and fitting an + * ssize_t. + * + * In Linux, the total length is limited to MAX_RW_COUNT, there is + * no overflow possibility. + */ + tot_len = 0; + ret = -EINVAL; + for (seg = 0; seg < nr_segs; seg++) { + compat_uptr_t buf; + compat_ssize_t len; + + if (__get_user(len, &uvector->iov_len) || + __get_user(buf, &uvector->iov_base)) { + ret = -EFAULT; + goto out; + } + if (len < 0) /* size_t not fitting in compat_ssize_t .. */ + goto out; + if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + ret = -EFAULT; + goto out; + } + if (len > MAX_RW_COUNT - tot_len) + len = MAX_RW_COUNT - tot_len; + tot_len += len; + iov->iov_base = compat_ptr(buf); + iov->iov_len = (compat_size_t) len; + uvector++; + iov++; + } + ret = tot_len; + +out: + return ret; +} + +static inline long +copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +{ + compat_uptr_t uptr; + int i; + + for (i = 0; i < nr; ++i) { + if (get_user(uptr, ptr32 + i)) + return -EFAULT; + if (put_user(compat_ptr(uptr), ptr64 + i)) + return -EFAULT; + } + return 0; +} + +#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + +asmlinkage long +compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb) +{ + struct iocb __user * __user *iocb64; + long ret; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (nr > MAX_AIO_SUBMITS) + nr = MAX_AIO_SUBMITS; + + iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); + ret = copy_iocb(nr, iocb, iocb64); + if (!ret) + ret = do_io_submit(ctx_id, nr, iocb64, 1); + return ret; +} + +struct compat_ncp_mount_data { + compat_int_t version; + compat_uint_t ncp_fd; + __compat_uid_t mounted_uid; + compat_pid_t wdog_pid; + unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; + compat_uint_t time_out; + compat_uint_t retry_count; + compat_uint_t flags; + __compat_uid_t uid; + __compat_gid_t gid; + compat_mode_t file_mode; + compat_mode_t dir_mode; +}; + +struct compat_ncp_mount_data_v4 { + compat_int_t version; + compat_ulong_t flags; + compat_ulong_t mounted_uid; + compat_long_t wdog_pid; + compat_uint_t ncp_fd; + compat_uint_t time_out; + compat_uint_t retry_count; + compat_ulong_t uid; + compat_ulong_t gid; + compat_ulong_t file_mode; + compat_ulong_t dir_mode; +}; + +static void *do_ncp_super_data_conv(void *raw_data) +{ + int version = *(unsigned int *)raw_data; + + if (version == 3) { + struct compat_ncp_mount_data *c_n = raw_data; + struct ncp_mount_data *n = raw_data; + + n->dir_mode = c_n->dir_mode; + n->file_mode = c_n->file_mode; + n->gid = c_n->gid; + n->uid = c_n->uid; + memmove (n->mounted_vol, c_n->mounted_vol, (sizeof (c_n->mounted_vol) + 3 * sizeof (unsigned int))); + 
n->wdog_pid = c_n->wdog_pid; + n->mounted_uid = c_n->mounted_uid; + } else if (version == 4) { + struct compat_ncp_mount_data_v4 *c_n = raw_data; + struct ncp_mount_data_v4 *n = raw_data; + + n->dir_mode = c_n->dir_mode; + n->file_mode = c_n->file_mode; + n->gid = c_n->gid; + n->uid = c_n->uid; + n->retry_count = c_n->retry_count; + n->time_out = c_n->time_out; + n->ncp_fd = c_n->ncp_fd; + n->wdog_pid = c_n->wdog_pid; + n->mounted_uid = c_n->mounted_uid; + n->flags = c_n->flags; + } else if (version != 5) { + return NULL; + } + + return raw_data; +} + + +struct compat_nfs_string { + compat_uint_t len; + compat_uptr_t data; +}; + +static inline void compat_nfs_string(struct nfs_string *dst, + struct compat_nfs_string *src) +{ + dst->data = compat_ptr(src->data); + dst->len = src->len; +} + +struct compat_nfs4_mount_data_v1 { + compat_int_t version; + compat_int_t flags; + compat_int_t rsize; + compat_int_t wsize; + compat_int_t timeo; + compat_int_t retrans; + compat_int_t acregmin; + compat_int_t acregmax; + compat_int_t acdirmin; + compat_int_t acdirmax; + struct compat_nfs_string client_addr; + struct compat_nfs_string mnt_path; + struct compat_nfs_string hostname; + compat_uint_t host_addrlen; + compat_uptr_t host_addr; + compat_int_t proto; + compat_int_t auth_flavourlen; + compat_uptr_t auth_flavours; +}; + +static int do_nfs4_super_data_conv(void *raw_data) +{ + int version = *(compat_uint_t *) raw_data; + + if (version == 1) { + struct compat_nfs4_mount_data_v1 *raw = raw_data; + struct nfs4_mount_data *real = raw_data; + + /* copy the fields backwards */ + real->auth_flavours = compat_ptr(raw->auth_flavours); + real->auth_flavourlen = raw->auth_flavourlen; + real->proto = raw->proto; + real->host_addr = compat_ptr(raw->host_addr); + real->host_addrlen = raw->host_addrlen; + compat_nfs_string(&real->hostname, &raw->hostname); + compat_nfs_string(&real->mnt_path, &raw->mnt_path); + compat_nfs_string(&real->client_addr, &raw->client_addr); + real->acdirmax = raw->acdirmax; + real->acdirmin = raw->acdirmin; + real->acregmax = raw->acregmax; + real->acregmin = raw->acregmin; + real->retrans = raw->retrans; + real->timeo = raw->timeo; + real->wsize = raw->wsize; + real->rsize = raw->rsize; + real->flags = raw->flags; + real->version = raw->version; + } + + return 0; +} + +#define NCPFS_NAME "ncpfs" +#define NFS4_NAME "nfs4" + +asmlinkage long compat_sys_mount(const char __user * dev_name, + const char __user * dir_name, + const char __user * type, unsigned long flags, + const void __user * data) +{ + char *kernel_type; + unsigned long data_page; + char *kernel_dev; + char *dir_page; + int retval; + + retval = copy_mount_string(type, &kernel_type); + if (retval < 0) + goto out; + + dir_page = getname(dir_name); + retval = PTR_ERR(dir_page); + if (IS_ERR(dir_page)) + goto out1; + + retval = copy_mount_string(dev_name, &kernel_dev); + if (retval < 0) + goto out2; + + retval = copy_mount_options(data, &data_page); + if (retval < 0) + goto out3; + + retval = -EINVAL; + + if (kernel_type && data_page) { + if (!strcmp(kernel_type, NCPFS_NAME)) { + do_ncp_super_data_conv((void *)data_page); + } else if (!strcmp(kernel_type, NFS4_NAME)) { + if (do_nfs4_super_data_conv((void *) data_page)) + goto out4; + } + } + + retval = do_mount(kernel_dev, dir_page, kernel_type, + flags, (void*)data_page); + + out4: + free_page(data_page); + out3: + kfree(kernel_dev); + out2: + putname(dir_page); + out1: + kfree(kernel_type); + out: + return retval; +} + +struct compat_old_linux_dirent { + compat_ulong_t 
d_ino; + compat_ulong_t d_offset; + unsigned short d_namlen; + char d_name[1]; +}; + +struct compat_readdir_callback { + struct compat_old_linux_dirent __user *dirent; + int result; +}; + +static int compat_fillonedir(void *__buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct compat_readdir_callback *buf = __buf; + struct compat_old_linux_dirent __user *dirent; + compat_ulong_t d_ino; + + if (buf->result) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, + (unsigned long)(dirent->d_name + namlen + 1) - + (unsigned long)dirent)) + goto efault; + if ( __put_user(d_ino, &dirent->d_ino) || + __put_user(offset, &dirent->d_offset) || + __put_user(namlen, &dirent->d_namlen) || + __copy_to_user(dirent->d_name, name, namlen) || + __put_user(0, dirent->d_name + namlen)) + goto efault; + return 0; +efault: + buf->result = -EFAULT; + return -EFAULT; +} + +asmlinkage long compat_sys_old_readdir(unsigned int fd, + struct compat_old_linux_dirent __user *dirent, unsigned int count) +{ + int error; + struct file *file; + struct compat_readdir_callback buf; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.result = 0; + buf.dirent = dirent; + + error = vfs_readdir(file, compat_fillonedir, &buf); + if (buf.result) + error = buf.result; + + fput(file); +out: + return error; +} + +struct compat_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_off; + unsigned short d_reclen; + char d_name[1]; +}; + +struct compat_getdents_callback { + struct compat_linux_dirent __user *current_dir; + struct compat_linux_dirent __user *previous; + int count; + int error; +}; + +static int compat_filldir(void *__buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct compat_linux_dirent __user * dirent; + struct compat_getdents_callback *buf = __buf; + compat_ulong_t d_ino; + int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + + namlen + 2, sizeof(compat_long_t)); + + buf->error = -EINVAL; /* only used if we fail.. 
*/ + if (reclen > buf->count) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(d_ino, &dirent->d_ino)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +asmlinkage long compat_sys_getdents(unsigned int fd, + struct compat_linux_dirent __user *dirent, unsigned int count) +{ + struct file * file; + struct compat_linux_dirent __user * lastdirent; + struct compat_getdents_callback buf; + int error; + + error = -EFAULT; + if (!access_ok(VERIFY_WRITE, dirent, count)) + goto out; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.current_dir = dirent; + buf.previous = NULL; + buf.count = count; + buf.error = 0; + + error = vfs_readdir(file, compat_filldir, &buf); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + if (put_user(file->f_pos, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fput(file); +out: + return error; +} + +#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 + +struct compat_getdents_callback64 { + struct linux_dirent64 __user *current_dir; + struct linux_dirent64 __user *previous; + int count; + int error; +}; + +static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct linux_dirent64 __user *dirent; + struct compat_getdents_callback64 *buf = __buf; + int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, + sizeof(u64)); + u64 off; + + buf->error = -EINVAL; /* only used if we fail.. 
*/ + if (reclen > buf->count) + return -EINVAL; + dirent = buf->previous; + + if (dirent) { + if (__put_user_unaligned(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user_unaligned(ino, &dirent->d_ino)) + goto efault; + off = 0; + if (__put_user_unaligned(off, &dirent->d_off)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (__put_user(d_type, &dirent->d_type)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +asmlinkage long compat_sys_getdents64(unsigned int fd, + struct linux_dirent64 __user * dirent, unsigned int count) +{ + struct file * file; + struct linux_dirent64 __user * lastdirent; + struct compat_getdents_callback64 buf; + int error; + + error = -EFAULT; + if (!access_ok(VERIFY_WRITE, dirent, count)) + goto out; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.current_dir = dirent; + buf.previous = NULL; + buf.count = count; + buf.error = 0; + + error = vfs_readdir(file, compat_filldir64, &buf); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + typeof(lastdirent->d_off) d_off = file->f_pos; + if (__put_user_unaligned(d_off, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fput(file); +out: + return error; +} +#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ + +static ssize_t compat_do_readv_writev(int type, struct file *file, + const struct compat_iovec __user *uvector, + unsigned long nr_segs, loff_t *pos) +{ + compat_ssize_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + ssize_t ret; + io_fn_t fn; + iov_fn_t fnv; + + ret = -EINVAL; + if (!file->f_op) + goto out; + + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, + UIO_FASTIOV, iovstack, &iov); + if (tot_len == 0) { + ret = 0; + goto out; + } + + ret = rw_verify_area(type, file, pos, tot_len); + if (ret < 0) + goto out; + + fnv = NULL; + if (type == READ) { + fn = file->f_op->read; + fnv = file->f_op->aio_read; + } else { + fn = (io_fn_t)file->f_op->write; + fnv = file->f_op->aio_write; + } + + if (fnv) + ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, + pos, fnv); + else + ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + +out: + if (iov != iovstack) + kfree(iov); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file); + else + fsnotify_modify(file); + } + return ret; +} + +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_READ)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) + goto out; + + ret = compat_do_readv_writev(READ, file, vec, vlen, pos); + +out: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +asmlinkage ssize_t +compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen) +{ + struct file *file; + int fput_needed; + ssize_t ret; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_readv(file, vec, vlen, 
&file->f_pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage ssize_t +compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_low, u32 pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + int fput_needed; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = compat_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + return ret; +} + +static size_t compat_writev(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_WRITE)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) + goto out; + + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); + +out: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +asmlinkage ssize_t +compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen) +{ + struct file *file; + int fput_needed; + ssize_t ret; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = compat_writev(file, vec, vlen, &file->f_pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage ssize_t +compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_low, u32 pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + struct file *file; + int fput_needed; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = compat_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); + return ret; +} + +asmlinkage long +compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, + unsigned int nr_segs, unsigned int flags) +{ + unsigned i; + struct iovec __user *iov; + if (nr_segs > UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} + +/* + * Exactly like fs/open.c:sys_open(), except that it doesn't set the + * O_LARGEFILE flag. + */ +asmlinkage long +compat_sys_open(const char __user *filename, int flags, int mode) +{ + return do_sys_open(AT_FDCWD, filename, flags, mode); +} + +/* + * Exactly like fs/open.c:sys_openat(), except that it doesn't set the + * O_LARGEFILE flag. 
+ */ +asmlinkage long +compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode) +{ + return do_sys_open(dfd, filename, flags, mode); +} + +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) + +static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec ts; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&ts); + ts = timespec_sub(*end_time, ts); + if (ts.tv_sec < 0) + ts.tv_sec = ts.tv_nsec = 0; + + if (timeval) { + struct compat_timeval rtv; + + rtv.tv_sec = ts.tv_sec; + rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } else { + struct compat_timespec rts; + + rts.tv_sec = ts.tv_sec; + rts.tv_nsec = ts.tv_nsec; + + if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + } + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +/* + * Ooo, nasty. We need here to frob 32-bit unsigned longs to + * 64-bit unsigned longs. + */ +static +int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + if (ufdset) { + unsigned long odd; + + if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) + return -EFAULT; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + *fdset++ = h << 32 | l; + nr -= 2; + } + if (odd && __get_user(*fdset, ufdset)) + return -EFAULT; + } else { + /* Tricky, must clear full unsigned long in the + * kernel fdset at the end, this makes sure that + * actually happens. + */ + memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + } + return 0; +} + +static +int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + unsigned long odd; + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + + if (!ufdset) + return 0; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + l = *fdset++; + h = l >> 32; + if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + nr -= 2; + } + if (odd && __put_user(*fdset, ufdset)) + return -EFAULT; + return 0; +} + + +/* + * This is a virtual copy of sys_select from fs/select.c and probably + * should be compared to it from time to time + */ + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. 
+ */ +int compat_core_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int size, max_fds, ret = -EINVAL; + struct fdtable *fdt; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + bits = kmalloc(6 * size, GFP_KERNEL); + ret = -ENOMEM; + if (!bits) + goto out_nofds; + } + fds.in = (unsigned long *) bits; + fds.out = (unsigned long *) (bits + size); + fds.ex = (unsigned long *) (bits + 2*size); + fds.res_in = (unsigned long *) (bits + 3*size); + fds.res_out = (unsigned long *) (bits + 4*size); + fds.res_ex = (unsigned long *) (bits + 5*size); + + if ((ret = compat_get_fd_set(n, inp, fds.in)) || + (ret = compat_get_fd_set(n, outp, fds.out)) || + (ret = compat_get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (compat_set_fd_set(n, inp, fds.res_in) || + compat_set_fd_set(n, outp, fds.res_out) || + compat_set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timeval __user *tvp) +{ + struct timespec end_time, *to = NULL; + struct compat_timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +struct compat_sel_arg_struct { + compat_ulong_t n; + compat_uptr_t inp; + compat_uptr_t outp; + compat_uptr_t exp; + compat_uptr_t tvp; +}; + +asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg) +{ + struct compat_sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + +#ifdef HAVE_SET_RESTORE_SIGMASK +static long do_compat_pselect(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, 
sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, void __user *sig) +{ + compat_size_t sigsetsize = 0; + compat_uptr_t up = 0; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, + sizeof(compat_uptr_t)+sizeof(compat_size_t)) || + __get_user(up, (compat_uptr_t __user *)sig) || + __get_user(sigsetsize, + (compat_size_t __user *)(sig+sizeof(up)))) + return -EFAULT; + } + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), + sigsetsize); +} + +asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, + unsigned int nfds, struct compat_timespec __user *tsp, + const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} +#endif /* HAVE_SET_RESTORE_SIGMASK */ + +#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED) +/* Stuff for NFS server syscalls...
*/ +struct compat_nfsctl_svc { + u16 svc32_port; + s32 svc32_nthreads; +}; + +struct compat_nfsctl_client { + s8 cl32_ident[NFSCLNT_IDMAX+1]; + s32 cl32_naddr; + struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; + s32 cl32_fhkeytype; + s32 cl32_fhkeylen; + u8 cl32_fhkey[NFSCLNT_KEYMAX]; +}; + +struct compat_nfsctl_export { + char ex32_client[NFSCLNT_IDMAX+1]; + char ex32_path[NFS_MAXPATHLEN+1]; + compat_dev_t ex32_dev; + compat_ino_t ex32_ino; + compat_int_t ex32_flags; + __compat_uid_t ex32_anon_uid; + __compat_gid_t ex32_anon_gid; +}; + +struct compat_nfsctl_fdparm { + struct sockaddr gd32_addr; + s8 gd32_path[NFS_MAXPATHLEN+1]; + compat_int_t gd32_version; +}; + +struct compat_nfsctl_fsparm { + struct sockaddr gd32_addr; + s8 gd32_path[NFS_MAXPATHLEN+1]; + compat_int_t gd32_maxlen; +}; + +struct compat_nfsctl_arg { + compat_int_t ca32_version; /* safeguard */ + union { + struct compat_nfsctl_svc u32_svc; + struct compat_nfsctl_client u32_client; + struct compat_nfsctl_export u32_export; + struct compat_nfsctl_fdparm u32_getfd; + struct compat_nfsctl_fsparm u32_getfs; + } u; +#define ca32_svc u.u32_svc +#define ca32_client u.u32_client +#define ca32_export u.u32_export +#define ca32_getfd u.u32_getfd +#define ca32_getfs u.u32_getfs +}; + +union compat_nfsctl_res { + __u8 cr32_getfh[NFS_FHSIZE]; + struct knfsd_fh cr32_getfs; +}; + +static int compat_nfs_svc_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)) || + get_user(karg->ca_version, &arg->ca32_version) || + __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port) || + __get_user(karg->ca_svc.svc_nthreads, + &arg->ca32_svc.svc32_nthreads)) + return -EFAULT; + return 0; +} + +static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ, &arg->ca32_client, + sizeof(arg->ca32_client)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_client.cl_ident[0], + &arg->ca32_client.cl32_ident[0], + NFSCLNT_IDMAX) || + __get_user(karg->ca_client.cl_naddr, + &arg->ca32_client.cl32_naddr) || + __copy_from_user(&karg->ca_client.cl_addrlist[0], + &arg->ca32_client.cl32_addrlist[0], + (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)) || + __get_user(karg->ca_client.cl_fhkeytype, + &arg->ca32_client.cl32_fhkeytype) || + __get_user(karg->ca_client.cl_fhkeylen, + &arg->ca32_client.cl32_fhkeylen) || + __copy_from_user(&karg->ca_client.cl_fhkey[0], + &arg->ca32_client.cl32_fhkey[0], + NFSCLNT_KEYMAX)) + return -EFAULT; + + return 0; +} + +static int compat_nfs_exp_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ, &arg->ca32_export, + sizeof(arg->ca32_export)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_export.ex_client[0], + &arg->ca32_export.ex32_client[0], + NFSCLNT_IDMAX) || + __copy_from_user(&karg->ca_export.ex_path[0], + &arg->ca32_export.ex32_path[0], + NFS_MAXPATHLEN) || + __get_user(karg->ca_export.ex_dev, + &arg->ca32_export.ex32_dev) || + __get_user(karg->ca_export.ex_ino, + &arg->ca32_export.ex32_ino) || + __get_user(karg->ca_export.ex_flags, + &arg->ca32_export.ex32_flags) || + __get_user(karg->ca_export.ex_anon_uid, + &arg->ca32_export.ex32_anon_uid) || + __get_user(karg->ca_export.ex_anon_gid, + &arg->ca32_export.ex32_anon_gid)) + return -EFAULT; + SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid); + SET_GID(karg->ca_export.ex_anon_gid, 
karg->ca_export.ex_anon_gid); + + return 0; +} + +static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ, &arg->ca32_getfd, + sizeof(arg->ca32_getfd)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_getfd.gd_addr, + &arg->ca32_getfd.gd32_addr, + (sizeof(struct sockaddr))) || + __copy_from_user(&karg->ca_getfd.gd_path, + &arg->ca32_getfd.gd32_path, + (NFS_MAXPATHLEN+1)) || + __get_user(karg->ca_getfd.gd_version, + &arg->ca32_getfd.gd32_version)) + return -EFAULT; + + return 0; +} + +static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) +{ + if (!access_ok(VERIFY_READ,&arg->ca32_getfs,sizeof(arg->ca32_getfs)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_getfs.gd_addr, + &arg->ca32_getfs.gd32_addr, + (sizeof(struct sockaddr))) || + __copy_from_user(&karg->ca_getfs.gd_path, + &arg->ca32_getfs.gd32_path, + (NFS_MAXPATHLEN+1)) || + __get_user(karg->ca_getfs.gd_maxlen, + &arg->ca32_getfs.gd32_maxlen)) + return -EFAULT; + + return 0; +} + +/* This really doesn't need translations, we are only passing + * back a union which contains opaque nfs file handle data. + */ +static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, + union compat_nfsctl_res __user *res) +{ + int err; + + err = copy_to_user(res, kres, sizeof(*res)); + + return (err) ? -EFAULT : 0; +} + +asmlinkage long compat_sys_nfsservctl(int cmd, + struct compat_nfsctl_arg __user *arg, + union compat_nfsctl_res __user *res) +{ + struct nfsctl_arg *karg; + union nfsctl_res *kres; + mm_segment_t oldfs; + int err; + + karg = kmalloc(sizeof(*karg), GFP_USER); + kres = kmalloc(sizeof(*kres), GFP_USER); + if(!karg || !kres) { + err = -ENOMEM; + goto done; + } + + switch(cmd) { + case NFSCTL_SVC: + err = compat_nfs_svc_trans(karg, arg); + break; + + case NFSCTL_ADDCLIENT: + err = compat_nfs_clnt_trans(karg, arg); + break; + + case NFSCTL_DELCLIENT: + err = compat_nfs_clnt_trans(karg, arg); + break; + + case NFSCTL_EXPORT: + case NFSCTL_UNEXPORT: + err = compat_nfs_exp_trans(karg, arg); + break; + + case NFSCTL_GETFD: + err = compat_nfs_getfd_trans(karg, arg); + break; + + case NFSCTL_GETFS: + err = compat_nfs_getfs_trans(karg, arg); + break; + + default: + err = -EINVAL; + break; + } + + if (err) + goto done; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + /* The __user pointer casts are valid because of the set_fs() */ + err = sys_nfsservctl(cmd, (void __user *) karg, (void __user *) kres); + set_fs(oldfs); + + if (err) + goto done; + + if((cmd == NFSCTL_GETFD) || + (cmd == NFSCTL_GETFS)) + err = compat_nfs_getfh_res_trans(kres, res); + +done: + kfree(karg); + kfree(kres); + return err; +} +#else /* !NFSD */ +long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) +{ + return sys_ni_syscall(); +} +#endif + +#ifdef CONFIG_EPOLL + +#ifdef HAVE_SET_RESTORE_SIGMASK +asmlinkage long compat_sys_epoll_pwait(int epfd, + struct compat_epoll_event __user *events, + int maxevents, int timeout, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + long err; + compat_sigset_t csigmask; + sigset_t ksigmask, sigsaved; + + /* + * If the caller wants a certain signal mask to be set during the wait, + * we apply it here. 
+ */ + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + return -EFAULT; + sigset_from_compat(&ksigmask, &csigmask); + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + err = sys_epoll_wait(epfd, events, maxevents, timeout); + + /* + * If we changed the signal mask, we need to restore the original one. + * In case we've got a signal while waiting, we do not restore the + * signal mask yet, and we allow do_signal() to deliver the signal on + * the way back to userspace, before the signal mask is restored. + */ + if (sigmask) { + if (err == -EINTR) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + } + + return err; +} +#endif /* HAVE_SET_RESTORE_SIGMASK */ + +#endif /* CONFIG_EPOLL */ + +#ifdef CONFIG_SIGNALFD + +asmlinkage long compat_sys_signalfd4(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, int flags) +{ + compat_sigset_t ss32; + sigset_t tmp; + sigset_t __user *ksigmask; + + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&tmp, &ss32); + ksigmask = compat_alloc_user_space(sizeof(sigset_t)); + if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) + return -EFAULT; + + return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); +} + +asmlinkage long compat_sys_signalfd(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); +} +#endif /* CONFIG_SIGNALFD */ + +#ifdef CONFIG_TIMERFD + +asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, + const struct compat_itimerspec __user *utmr, + struct compat_itimerspec __user *otmr) +{ + int error; + struct itimerspec t; + struct itimerspec __user *ut; + + if (get_compat_itimerspec(&t, utmr)) + return -EFAULT; + ut = compat_alloc_user_space(2 * sizeof(struct itimerspec)); + if (copy_to_user(&ut[0], &t, sizeof(t))) + return -EFAULT; + error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]); + if (!error && otmr) + error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) || + put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; + + return error; +} + +asmlinkage long compat_sys_timerfd_gettime(int ufd, + struct compat_itimerspec __user *otmr) +{ + int error; + struct itimerspec t; + struct itimerspec __user *ut; + + ut = compat_alloc_user_space(sizeof(struct itimerspec)); + error = sys_timerfd_gettime(ufd, ut); + if (!error) + error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) || + put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; + + return error; +} + +#endif /* CONFIG_TIMERFD */ + +#ifdef CONFIG_FHANDLE +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +asmlinkage long +compat_sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, int flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c new file mode 100644 index 00000000..112e45a1 --- /dev/null +++ b/fs/compat_binfmt_elf.c @@ -0,0 +1,133 @@ +/* + * 32-bit compatibility support for ELF format executables and core dumps. + * + * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * Red Hat Author: Roland McGrath. + * + * This file is used in a 64-bit kernel that wants to support 32-bit ELF. + * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros + * used below, with definitions appropriate for 32-bit ABI compatibility. + * + * We use macros to rename the ABI types and machine-dependent + * functions used in binfmt_elf.c to compat versions. + */ + +#include +#include + +/* + * Rename the basic ELF layout types to refer to the 32-bit class of files. + */ +#undef ELF_CLASS +#define ELF_CLASS ELFCLASS32 + +#undef elfhdr +#undef elf_phdr +#undef elf_shdr +#undef elf_note +#undef elf_addr_t +#define elfhdr elf32_hdr +#define elf_phdr elf32_phdr +#define elf_shdr elf32_shdr +#define elf_note elf32_note +#define elf_addr_t Elf32_Addr + +/* + * The machine-dependent core note format types are defined in elfcore-compat.h, + * which requires asm/elf.h to define compat_elf_gregset_t et al. + */ +#define elf_prstatus compat_elf_prstatus +#define elf_prpsinfo compat_elf_prpsinfo + +/* + * Compat version of cputime_to_compat_timeval, perhaps this + * should be an inline in . + */ +static void cputime_to_compat_timeval(const cputime_t cputime, + struct compat_timeval *value) +{ + struct timeval tv; + cputime_to_timeval(cputime, &tv); + value->tv_sec = tv.tv_sec; + value->tv_usec = tv.tv_usec; +} + +#undef cputime_to_timeval +#define cputime_to_timeval cputime_to_compat_timeval + + +/* + * To use this file, asm/elf.h must define compat_elf_check_arch. + * The other following macros can be defined if the compat versions + * differ from the native ones, or omitted when they match. + */ + +#undef ELF_ARCH +#undef elf_check_arch +#define elf_check_arch compat_elf_check_arch + +#ifdef COMPAT_ELF_PLATFORM +#undef ELF_PLATFORM +#define ELF_PLATFORM COMPAT_ELF_PLATFORM +#endif + +#ifdef COMPAT_ELF_HWCAP +#undef ELF_HWCAP +#define ELF_HWCAP COMPAT_ELF_HWCAP +#endif + +#ifdef COMPAT_ARCH_DLINFO +#undef ARCH_DLINFO +#define ARCH_DLINFO COMPAT_ARCH_DLINFO +#endif + +#ifdef COMPAT_ELF_ET_DYN_BASE +#undef ELF_ET_DYN_BASE +#define ELF_ET_DYN_BASE COMPAT_ELF_ET_DYN_BASE +#endif + +#ifdef COMPAT_ELF_EXEC_PAGESIZE +#undef ELF_EXEC_PAGESIZE +#define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE +#endif + +#ifdef COMPAT_ELF_PLAT_INIT +#undef ELF_PLAT_INIT +#define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT +#endif + +#ifdef COMPAT_SET_PERSONALITY +#undef SET_PERSONALITY +#define SET_PERSONALITY COMPAT_SET_PERSONALITY +#endif + +#ifdef compat_start_thread +#undef start_thread +#define start_thread compat_start_thread +#endif + +#ifdef compat_arch_setup_additional_pages +#undef ARCH_HAS_SETUP_ADDITIONAL_PAGES +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 +#undef arch_setup_additional_pages +#define arch_setup_additional_pages compat_arch_setup_additional_pages +#endif + +/* + * Rename a few of the symbols that binfmt_elf.c will define. + * These are all local so the names don't really matter, but it + * might make some debugging less confusing not to duplicate them. + */ +#define elf_format compat_elf_format +#define init_elf_binfmt init_compat_elf_binfmt +#define exit_elf_binfmt exit_compat_elf_binfmt + +/* + * We share all the actual code with the native (64-bit) version. 
+ */ +#include "binfmt_elf.c" diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c new file mode 100644 index 00000000..61abb638 --- /dev/null +++ b/fs/compat_ioctl.c @@ -0,0 +1,1656 @@ +/* + * ioctl32.c: Conversion between 32bit and 64bit native ioctls. + * + * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) + * + * These routines maintain argument size conversion between 32bit and 64bit + * ioctls. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#ifdef CONFIG_BLOCK +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SPARC +#include +#endif + +static int w_long(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) +{ + mm_segment_t old_fs = get_fs(); + int err; + unsigned long val; + + set_fs (KERNEL_DS); + err = sys_ioctl(fd, cmd, (unsigned long)&val); + set_fs (old_fs); + if (!err && put_user(val, argp)) + return -EFAULT; + return err; +} + +struct compat_video_event { + int32_t type; + compat_time_t timestamp; + union { + video_size_t size; + unsigned int frame_rate; + } u; +}; + +static int do_video_get_event(unsigned int fd, unsigned int cmd, + struct compat_video_event __user *up) +{ + struct video_event kevent; + mm_segment_t old_fs = get_fs(); + int err; + + set_fs(KERNEL_DS); + err = sys_ioctl(fd, cmd, (unsigned long) &kevent); + set_fs(old_fs); + + if (!err) { + err = put_user(kevent.type, &up->type); + err |= put_user(kevent.timestamp, &up->timestamp); + err |= put_user(kevent.u.size.w, &up->u.size.w); + err |= put_user(kevent.u.size.h, &up->u.size.h); + err |= put_user(kevent.u.size.aspect_ratio, + &up->u.size.aspect_ratio); + if (err) + err = -EFAULT; + } + + return err; +} + +struct compat_video_still_picture { + compat_uptr_t iFrame; + int32_t size; +}; + +static int do_video_stillpicture(unsigned int fd, unsigned int cmd, + struct compat_video_still_picture __user *up) +{ + struct video_still_picture __user *up_native; + compat_uptr_t fp; + int32_t size; + int err; + + err = get_user(fp, &up->iFrame); + err |= get_user(size, &up->size); + if (err) + return -EFAULT; + + up_native = + compat_alloc_user_space(sizeof(struct video_still_picture)); + + err = put_user(compat_ptr(fp), &up_native->iFrame); + err |= put_user(size, &up_native->size); + if (err) + return -EFAULT; + + err = sys_ioctl(fd, cmd, (unsigned long) up_native); + + return err; +} + +struct compat_video_spu_palette { + int length; + compat_uptr_t palette; +}; + +static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, + struct compat_video_spu_palette __user *up) +{ + struct video_spu_palette __user *up_native; + compat_uptr_t 
palp; + int length, err; + + err = get_user(palp, &up->palette); + err |= get_user(length, &up->length); + + up_native = compat_alloc_user_space(sizeof(struct video_spu_palette)); + err = put_user(compat_ptr(palp), &up_native->palette); + err |= put_user(length, &up_native->length); + if (err) + return -EFAULT; + + err = sys_ioctl(fd, cmd, (unsigned long) up_native); + + return err; +} + +#ifdef CONFIG_BLOCK +typedef struct sg_io_hdr32 { + compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ + compat_int_t dxfer_direction; /* [i] data transfer direction */ + unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ + unsigned char mx_sb_len; /* [i] max length to write to sbp */ + unsigned short iovec_count; /* [i] 0 implies no scatter gather */ + compat_uint_t dxfer_len; /* [i] byte count of data transfer */ + compat_uint_t dxferp; /* [i], [*io] points to data transfer memory + or scatter gather list */ + compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ + compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ + compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ + compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */ + compat_int_t pack_id; /* [i->o] unused internally (normally) */ + compat_uptr_t usr_ptr; /* [i->o] unused internally */ + unsigned char status; /* [o] scsi status */ + unsigned char masked_status; /* [o] shifted, masked scsi status */ + unsigned char msg_status; /* [o] messaging level data (optional) */ + unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ + unsigned short host_status; /* [o] errors from host adapter */ + unsigned short driver_status; /* [o] errors from software driver */ + compat_int_t resid; /* [o] dxfer_len - actual_transferred */ + compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ + compat_uint_t info; /* [o] auxiliary information */ +} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ + +typedef struct sg_iovec32 { + compat_uint_t iov_base; + compat_uint_t iov_len; +} sg_iovec32_t; + +static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) +{ + sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); + sg_iovec32_t __user *iov32 = dxferp; + int i; + + for (i = 0; i < iovec_count; i++) { + u32 base, len; + + if (get_user(base, &iov32[i].iov_base) || + get_user(len, &iov32[i].iov_len) || + put_user(compat_ptr(base), &iov[i].iov_base) || + put_user(len, &iov[i].iov_len)) + return -EFAULT; + } + + if (put_user(iov, &sgio->dxferp)) + return -EFAULT; + return 0; +} + +static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, + sg_io_hdr32_t __user *sgio32) +{ + sg_io_hdr_t __user *sgio; + u16 iovec_count; + u32 data; + void __user *dxferp; + int err; + int interface_id; + + if (get_user(interface_id, &sgio32->interface_id)) + return -EFAULT; + if (interface_id != 'S') + return sys_ioctl(fd, cmd, (unsigned long)sgio32); + + if (get_user(iovec_count, &sgio32->iovec_count)) + return -EFAULT; + + { + void __user *top = compat_alloc_user_space(0); + void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + + (iovec_count * sizeof(sg_iovec_t))); + if (new > top) + return -EINVAL; + + sgio = new; + } + + /* Ok, now construct. 
*/ + if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, + (2 * sizeof(int)) + + (2 * sizeof(unsigned char)) + + (1 * sizeof(unsigned short)) + + (1 * sizeof(unsigned int)))) + return -EFAULT; + + if (get_user(data, &sgio32->dxferp)) + return -EFAULT; + dxferp = compat_ptr(data); + if (iovec_count) { + if (sg_build_iovec(sgio, dxferp, iovec_count)) + return -EFAULT; + } else { + if (put_user(dxferp, &sgio->dxferp)) + return -EFAULT; + } + + { + unsigned char __user *cmdp; + unsigned char __user *sbp; + + if (get_user(data, &sgio32->cmdp)) + return -EFAULT; + cmdp = compat_ptr(data); + + if (get_user(data, &sgio32->sbp)) + return -EFAULT; + sbp = compat_ptr(data); + + if (put_user(cmdp, &sgio->cmdp) || + put_user(sbp, &sgio->sbp)) + return -EFAULT; + } + + if (copy_in_user(&sgio->timeout, &sgio32->timeout, + 3 * sizeof(int))) + return -EFAULT; + + if (get_user(data, &sgio32->usr_ptr)) + return -EFAULT; + if (put_user(compat_ptr(data), &sgio->usr_ptr)) + return -EFAULT; + + err = sys_ioctl(fd, cmd, (unsigned long) sgio); + + if (err >= 0) { + void __user *datap; + + if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, + sizeof(int)) || + get_user(datap, &sgio->usr_ptr) || + put_user((u32)(unsigned long)datap, + &sgio32->usr_ptr) || + copy_in_user(&sgio32->status, &sgio->status, + (4 * sizeof(unsigned char)) + + (2 * sizeof(unsigned short)) + + (3 * sizeof(int)))) + err = -EFAULT; + } + + return err; +} + +struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ + char req_state; + char orphan; + char sg_io_owned; + char problem; + int pack_id; + compat_uptr_t usr_ptr; + unsigned int duration; + int unused; +}; + +static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct + compat_sg_req_info __user *o) +{ + int err, i; + sg_req_info_t __user *r; + r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); + err = sys_ioctl(fd,cmd,(unsigned long)r); + if (err < 0) + return err; + for (i = 0; i < SG_MAX_QUEUE; i++) { + void __user *ptr; + int d; + + if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) || + get_user(ptr, &r[i].usr_ptr) || + get_user(d, &r[i].duration) || + put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) || + put_user(d, &o[i].duration)) + return -EFAULT; + } + return err; +} +#endif /* CONFIG_BLOCK */ + +struct sock_fprog32 { + unsigned short len; + compat_caddr_t filter; +}; + +#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) +#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) + +static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, + struct sock_fprog32 __user *u_fprog32) +{ + struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); + void __user *fptr64; + u32 fptr32; + u16 flen; + + if (get_user(flen, &u_fprog32->len) || + get_user(fptr32, &u_fprog32->filter)) + return -EFAULT; + + fptr64 = compat_ptr(fptr32); + + if (put_user(flen, &u_fprog64->len) || + put_user(fptr64, &u_fprog64->filter)) + return -EFAULT; + + if (cmd == PPPIOCSPASS32) + cmd = PPPIOCSPASS; + else + cmd = PPPIOCSACTIVE; + + return sys_ioctl(fd, cmd, (unsigned long) u_fprog64); +} + +struct ppp_option_data32 { + compat_caddr_t ptr; + u32 length; + compat_int_t transmit; +}; +#define PPPIOCSCOMPRESS32 _IOW('t', 77, struct ppp_option_data32) + +struct ppp_idle32 { + compat_time_t xmit_idle; + compat_time_t recv_idle; +}; +#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) + +static int ppp_gidle(unsigned int fd, unsigned int cmd, + struct ppp_idle32 __user *idle32) +{ + struct ppp_idle 
__user *idle; + __kernel_time_t xmit, recv; + int err; + + idle = compat_alloc_user_space(sizeof(*idle)); + + err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); + + if (!err) { + if (get_user(xmit, &idle->xmit_idle) || + get_user(recv, &idle->recv_idle) || + put_user(xmit, &idle32->xmit_idle) || + put_user(recv, &idle32->recv_idle)) + err = -EFAULT; + } + return err; +} + +static int ppp_scompress(unsigned int fd, unsigned int cmd, + struct ppp_option_data32 __user *odata32) +{ + struct ppp_option_data __user *odata; + __u32 data; + void __user *datap; + + odata = compat_alloc_user_space(sizeof(*odata)); + + if (get_user(data, &odata32->ptr)) + return -EFAULT; + + datap = compat_ptr(data); + if (put_user(datap, &odata->ptr)) + return -EFAULT; + + if (copy_in_user(&odata->length, &odata32->length, + sizeof(__u32) + sizeof(int))) + return -EFAULT; + + return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); +} + +#ifdef CONFIG_BLOCK +struct mtget32 { + compat_long_t mt_type; + compat_long_t mt_resid; + compat_long_t mt_dsreg; + compat_long_t mt_gstat; + compat_long_t mt_erreg; + compat_daddr_t mt_fileno; + compat_daddr_t mt_blkno; +}; +#define MTIOCGET32 _IOR('m', 2, struct mtget32) + +struct mtpos32 { + compat_long_t mt_blkno; +}; +#define MTIOCPOS32 _IOR('m', 3, struct mtpos32) + +static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) +{ + mm_segment_t old_fs = get_fs(); + struct mtget get; + struct mtget32 __user *umget32; + struct mtpos pos; + struct mtpos32 __user *upos32; + unsigned long kcmd; + void *karg; + int err = 0; + + switch(cmd) { + case MTIOCPOS32: + kcmd = MTIOCPOS; + karg = &pos; + break; + default: /* MTIOCGET32 */ + kcmd = MTIOCGET; + karg = &get; + break; + } + set_fs (KERNEL_DS); + err = sys_ioctl (fd, kcmd, (unsigned long)karg); + set_fs (old_fs); + if (err) + return err; + switch (cmd) { + case MTIOCPOS32: + upos32 = argp; + err = __put_user(pos.mt_blkno, &upos32->mt_blkno); + break; + case MTIOCGET32: + umget32 = argp; + err = __put_user(get.mt_type, &umget32->mt_type); + err |= __put_user(get.mt_resid, &umget32->mt_resid); + err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); + err |= __put_user(get.mt_gstat, &umget32->mt_gstat); + err |= __put_user(get.mt_erreg, &umget32->mt_erreg); + err |= __put_user(get.mt_fileno, &umget32->mt_fileno); + err |= __put_user(get.mt_blkno, &umget32->mt_blkno); + break; + } + return err ? 
-EFAULT: 0; +} + +#endif /* CONFIG_BLOCK */ + +/* Bluetooth ioctls */ +#define HCIUARTSETPROTO _IOW('U', 200, int) +#define HCIUARTGETPROTO _IOR('U', 201, int) +#define HCIUARTGETDEVICE _IOR('U', 202, int) +#define HCIUARTSETFLAGS _IOW('U', 203, int) +#define HCIUARTGETFLAGS _IOR('U', 204, int) + +#define BNEPCONNADD _IOW('B', 200, int) +#define BNEPCONNDEL _IOW('B', 201, int) +#define BNEPGETCONNLIST _IOR('B', 210, int) +#define BNEPGETCONNINFO _IOR('B', 211, int) + +#define CMTPCONNADD _IOW('C', 200, int) +#define CMTPCONNDEL _IOW('C', 201, int) +#define CMTPGETCONNLIST _IOR('C', 210, int) +#define CMTPGETCONNINFO _IOR('C', 211, int) + +#define HIDPCONNADD _IOW('H', 200, int) +#define HIDPCONNDEL _IOW('H', 201, int) +#define HIDPGETCONNLIST _IOR('H', 210, int) +#define HIDPGETCONNINFO _IOR('H', 211, int) + + +struct serial_struct32 { + compat_int_t type; + compat_int_t line; + compat_uint_t port; + compat_int_t irq; + compat_int_t flags; + compat_int_t xmit_fifo_size; + compat_int_t custom_divisor; + compat_int_t baud_base; + unsigned short close_delay; + char io_type; + char reserved_char[1]; + compat_int_t hub6; + unsigned short closing_wait; /* time to wait before closing */ + unsigned short closing_wait2; /* no longer used... */ + compat_uint_t iomem_base; + unsigned short iomem_reg_shift; + unsigned int port_high; + /* compat_ulong_t iomap_base FIXME */ + compat_int_t reserved[1]; +}; + +static int serial_struct_ioctl(unsigned fd, unsigned cmd, + struct serial_struct32 __user *ss32) +{ + typedef struct serial_struct SS; + typedef struct serial_struct32 SS32; + int err; + struct serial_struct ss; + mm_segment_t oldseg = get_fs(); + __u32 udata; + unsigned int base; + + if (cmd == TIOCSSERIAL) { + if (!access_ok(VERIFY_READ, ss32, sizeof(SS32))) + return -EFAULT; + if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base))) + return -EFAULT; + if (__get_user(udata, &ss32->iomem_base)) + return -EFAULT; + ss.iomem_base = compat_ptr(udata); + if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || + __get_user(ss.port_high, &ss32->port_high)) + return -EFAULT; + ss.iomap_base = 0UL; + } + set_fs(KERNEL_DS); + err = sys_ioctl(fd,cmd,(unsigned long)(&ss)); + set_fs(oldseg); + if (cmd == TIOCGSERIAL && err >= 0) { + if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) + return -EFAULT; + if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base))) + return -EFAULT; + base = (unsigned long)ss.iomem_base >> 32 ? 
+ 0xffffffff : (unsigned)(unsigned long)ss.iomem_base; + if (__put_user(base, &ss32->iomem_base) || + __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || + __put_user(ss.port_high, &ss32->port_high)) + return -EFAULT; + } + return err; +} + +/* + * I2C layer ioctls + */ + +struct i2c_msg32 { + u16 addr; + u16 flags; + u16 len; + compat_caddr_t buf; +}; + +struct i2c_rdwr_ioctl_data32 { + compat_caddr_t msgs; /* struct i2c_msg __user *msgs */ + u32 nmsgs; +}; + +struct i2c_smbus_ioctl_data32 { + u8 read_write; + u8 command; + u32 size; + compat_caddr_t data; /* union i2c_smbus_data *data */ +}; + +struct i2c_rdwr_aligned { + struct i2c_rdwr_ioctl_data cmd; + struct i2c_msg msgs[0]; +}; + +static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_rdwr_ioctl_data32 __user *udata) +{ + struct i2c_rdwr_aligned __user *tdata; + struct i2c_msg __user *tmsgs; + struct i2c_msg32 __user *umsgs; + compat_caddr_t datap; + int nmsgs, i; + + if (get_user(nmsgs, &udata->nmsgs)) + return -EFAULT; + if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS) + return -EINVAL; + + if (get_user(datap, &udata->msgs)) + return -EFAULT; + umsgs = compat_ptr(datap); + + tdata = compat_alloc_user_space(sizeof(*tdata) + + nmsgs * sizeof(struct i2c_msg)); + tmsgs = &tdata->msgs[0]; + + if (put_user(nmsgs, &tdata->cmd.nmsgs) || + put_user(tmsgs, &tdata->cmd.msgs)) + return -EFAULT; + + for (i = 0; i < nmsgs; i++) { + if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16))) + return -EFAULT; + if (get_user(datap, &umsgs[i].buf) || + put_user(compat_ptr(datap), &tmsgs[i].buf)) + return -EFAULT; + } + return sys_ioctl(fd, cmd, (unsigned long)tdata); +} + +static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_smbus_ioctl_data32 __user *udata) +{ + struct i2c_smbus_ioctl_data __user *tdata; + compat_caddr_t datap; + + tdata = compat_alloc_user_space(sizeof(*tdata)); + if (tdata == NULL) + return -ENOMEM; + if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) + return -EFAULT; + + if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) + return -EFAULT; + + if (__copy_in_user(&tdata->read_write, &udata->read_write, 2 * sizeof(u8))) + return -EFAULT; + if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32))) + return -EFAULT; + if (__get_user(datap, &udata->data) || + __put_user(compat_ptr(datap), &tdata->data)) + return -EFAULT; + + return sys_ioctl(fd, cmd, (unsigned long)tdata); +} + +#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) +#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) +#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) +#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) + +static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) +{ + mm_segment_t oldfs = get_fs(); + compat_ulong_t val32; + unsigned long kval; + int ret; + + switch (cmd) { + case RTC_IRQP_READ32: + case RTC_EPOCH_READ32: + set_fs(KERNEL_DS); + ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? 
+ RTC_IRQP_READ : RTC_EPOCH_READ, + (unsigned long)&kval); + set_fs(oldfs); + if (ret) + return ret; + val32 = kval; + return put_user(val32, (unsigned int __user *)argp); + case RTC_IRQP_SET32: + return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); + case RTC_EPOCH_SET32: + return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); + } + + return -ENOIOCTLCMD; +} + +/* on ia32 l_start is on a 32-bit boundary */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +struct space_resv_32 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +}; + +#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32) +#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) + +/* just account for different alignment */ +static int compat_ioctl_preallocate(struct file *file, + struct space_resv_32 __user *p32) +{ + struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); + + if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || + copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || + copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || + copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || + copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || + copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || + copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) + return -EFAULT; + + return ioctl_preallocate(file, p); +} +#endif + +/* + * simple reversible transform to make our table more evenly + * distributed after sorting. + */ +#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) + +#define COMPATIBLE_IOCTL(cmd) XFORM(cmd), +/* ioctl should not be warned about even if it's not implemented. + Valid reasons to use this: + - It is implemented with ->compat_ioctl on some device, but programs + call it on others too. + - The ioctl is not implemented in the native kernel, but programs + call it commonly anyways. + Most other reasons are not valid. 
*/ +#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) + +static unsigned int ioctl_pointer[] = { +/* compatible ioctls first */ +COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ +COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ + +/* Big T */ +COMPATIBLE_IOCTL(TCGETA) +COMPATIBLE_IOCTL(TCSETA) +COMPATIBLE_IOCTL(TCSETAW) +COMPATIBLE_IOCTL(TCSETAF) +COMPATIBLE_IOCTL(TCSBRK) +COMPATIBLE_IOCTL(TCXONC) +COMPATIBLE_IOCTL(TCFLSH) +COMPATIBLE_IOCTL(TCGETS) +COMPATIBLE_IOCTL(TCSETS) +COMPATIBLE_IOCTL(TCSETSW) +COMPATIBLE_IOCTL(TCSETSF) +COMPATIBLE_IOCTL(TIOCLINUX) +COMPATIBLE_IOCTL(TIOCSBRK) +COMPATIBLE_IOCTL(TIOCGDEV) +COMPATIBLE_IOCTL(TIOCCBRK) +COMPATIBLE_IOCTL(TIOCGSID) +COMPATIBLE_IOCTL(TIOCGICOUNT) +/* Little t */ +COMPATIBLE_IOCTL(TIOCGETD) +COMPATIBLE_IOCTL(TIOCSETD) +COMPATIBLE_IOCTL(TIOCEXCL) +COMPATIBLE_IOCTL(TIOCNXCL) +COMPATIBLE_IOCTL(TIOCCONS) +COMPATIBLE_IOCTL(TIOCGSOFTCAR) +COMPATIBLE_IOCTL(TIOCSSOFTCAR) +COMPATIBLE_IOCTL(TIOCSWINSZ) +COMPATIBLE_IOCTL(TIOCGWINSZ) +COMPATIBLE_IOCTL(TIOCMGET) +COMPATIBLE_IOCTL(TIOCMBIC) +COMPATIBLE_IOCTL(TIOCMBIS) +COMPATIBLE_IOCTL(TIOCMSET) +COMPATIBLE_IOCTL(TIOCPKT) +COMPATIBLE_IOCTL(TIOCNOTTY) +COMPATIBLE_IOCTL(TIOCSTI) +COMPATIBLE_IOCTL(TIOCOUTQ) +COMPATIBLE_IOCTL(TIOCSPGRP) +COMPATIBLE_IOCTL(TIOCGPGRP) +COMPATIBLE_IOCTL(TIOCGPTN) +COMPATIBLE_IOCTL(TIOCSPTLCK) +COMPATIBLE_IOCTL(TIOCSERGETLSR) +COMPATIBLE_IOCTL(TIOCSIG) +#ifdef TCGETS2 +COMPATIBLE_IOCTL(TCGETS2) +COMPATIBLE_IOCTL(TCSETS2) +COMPATIBLE_IOCTL(TCSETSW2) +COMPATIBLE_IOCTL(TCSETSF2) +#endif +/* Little f */ +COMPATIBLE_IOCTL(FIOCLEX) +COMPATIBLE_IOCTL(FIONCLEX) +COMPATIBLE_IOCTL(FIOASYNC) +COMPATIBLE_IOCTL(FIONBIO) +COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ +COMPATIBLE_IOCTL(FS_IOC_FIEMAP) +/* 0x00 */ +COMPATIBLE_IOCTL(FIBMAP) +COMPATIBLE_IOCTL(FIGETBSZ) +/* 'X' - originally XFS but some now in the VFS */ +COMPATIBLE_IOCTL(FIFREEZE) +COMPATIBLE_IOCTL(FITHAW) +COMPATIBLE_IOCTL(KDGETKEYCODE) +COMPATIBLE_IOCTL(KDSETKEYCODE) +COMPATIBLE_IOCTL(KDGKBTYPE) +COMPATIBLE_IOCTL(KDGETMODE) +COMPATIBLE_IOCTL(KDGKBMODE) +COMPATIBLE_IOCTL(KDGKBMETA) +COMPATIBLE_IOCTL(KDGKBENT) +COMPATIBLE_IOCTL(KDSKBENT) +COMPATIBLE_IOCTL(KDGKBSENT) +COMPATIBLE_IOCTL(KDSKBSENT) +COMPATIBLE_IOCTL(KDGKBDIACR) +COMPATIBLE_IOCTL(KDSKBDIACR) +COMPATIBLE_IOCTL(KDKBDREP) +COMPATIBLE_IOCTL(KDGKBLED) +COMPATIBLE_IOCTL(KDGETLED) +#ifdef CONFIG_BLOCK +/* Big S */ +COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) +COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) +COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK) +COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY) +COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) +COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) +COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) +COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) +#endif +/* Big V (don't complain on serial console) */ +IGNORE_IOCTL(VT_OPENQRY) +IGNORE_IOCTL(VT_GETMODE) +/* Little p (/dev/rtc, /dev/envctrl, etc.) */ +COMPATIBLE_IOCTL(RTC_AIE_ON) +COMPATIBLE_IOCTL(RTC_AIE_OFF) +COMPATIBLE_IOCTL(RTC_UIE_ON) +COMPATIBLE_IOCTL(RTC_UIE_OFF) +COMPATIBLE_IOCTL(RTC_PIE_ON) +COMPATIBLE_IOCTL(RTC_PIE_OFF) +COMPATIBLE_IOCTL(RTC_WIE_ON) +COMPATIBLE_IOCTL(RTC_WIE_OFF) +COMPATIBLE_IOCTL(RTC_ALM_SET) +COMPATIBLE_IOCTL(RTC_ALM_READ) +COMPATIBLE_IOCTL(RTC_RD_TIME) +COMPATIBLE_IOCTL(RTC_SET_TIME) +COMPATIBLE_IOCTL(RTC_WKALM_SET) +COMPATIBLE_IOCTL(RTC_WKALM_RD) +/* + * These two are only for the sbus rtc driver, but + * hwclock tries them on every rtc device first when + * running on sparc. 
On other architectures the entries + * are useless but harmless. + */ +COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */ +COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */ +/* Little m */ +COMPATIBLE_IOCTL(MTIOCTOP) +/* Socket level stuff */ +COMPATIBLE_IOCTL(FIOQSIZE) +#ifdef CONFIG_BLOCK +/* loop */ +IGNORE_IOCTL(LOOP_CLR_FD) +/* md calls this on random blockdevs */ +IGNORE_IOCTL(RAID_VERSION) +/* SG stuff */ +COMPATIBLE_IOCTL(SG_SET_TIMEOUT) +COMPATIBLE_IOCTL(SG_GET_TIMEOUT) +COMPATIBLE_IOCTL(SG_EMULATED_HOST) +COMPATIBLE_IOCTL(SG_GET_TRANSFORM) +COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) +COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) +COMPATIBLE_IOCTL(SG_GET_SCSI_ID) +COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA) +COMPATIBLE_IOCTL(SG_GET_LOW_DMA) +COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID) +COMPATIBLE_IOCTL(SG_GET_PACK_ID) +COMPATIBLE_IOCTL(SG_GET_NUM_WAITING) +COMPATIBLE_IOCTL(SG_SET_DEBUG) +COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE) +COMPATIBLE_IOCTL(SG_GET_COMMAND_Q) +COMPATIBLE_IOCTL(SG_SET_COMMAND_Q) +COMPATIBLE_IOCTL(SG_GET_VERSION_NUM) +COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN) +COMPATIBLE_IOCTL(SG_SCSI_RESET) +COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) +COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) +COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) +#endif +/* PPP stuff */ +COMPATIBLE_IOCTL(PPPIOCGFLAGS) +COMPATIBLE_IOCTL(PPPIOCSFLAGS) +COMPATIBLE_IOCTL(PPPIOCGASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCSASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCGUNIT) +COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCGMRU) +COMPATIBLE_IOCTL(PPPIOCSMRU) +COMPATIBLE_IOCTL(PPPIOCSMAXCID) +COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP) +COMPATIBLE_IOCTL(PPPIOCXFERUNIT) +/* PPPIOCSCOMPRESS is translated */ +COMPATIBLE_IOCTL(PPPIOCGNPMODE) +COMPATIBLE_IOCTL(PPPIOCSNPMODE) +COMPATIBLE_IOCTL(PPPIOCGDEBUG) +COMPATIBLE_IOCTL(PPPIOCSDEBUG) +/* PPPIOCSPASS is translated */ +/* PPPIOCSACTIVE is translated */ +/* PPPIOCGIDLE is translated */ +COMPATIBLE_IOCTL(PPPIOCNEWUNIT) +COMPATIBLE_IOCTL(PPPIOCATTACH) +COMPATIBLE_IOCTL(PPPIOCDETACH) +COMPATIBLE_IOCTL(PPPIOCSMRRU) +COMPATIBLE_IOCTL(PPPIOCCONNECT) +COMPATIBLE_IOCTL(PPPIOCDISCONN) +COMPATIBLE_IOCTL(PPPIOCATTCHAN) +COMPATIBLE_IOCTL(PPPIOCGCHAN) +/* PPPOX */ +COMPATIBLE_IOCTL(PPPOEIOCSFWD) +COMPATIBLE_IOCTL(PPPOEIOCDFWD) +/* ppdev */ +COMPATIBLE_IOCTL(PPSETMODE) +COMPATIBLE_IOCTL(PPRSTATUS) +COMPATIBLE_IOCTL(PPRCONTROL) +COMPATIBLE_IOCTL(PPWCONTROL) +COMPATIBLE_IOCTL(PPFCONTROL) +COMPATIBLE_IOCTL(PPRDATA) +COMPATIBLE_IOCTL(PPWDATA) +COMPATIBLE_IOCTL(PPCLAIM) +COMPATIBLE_IOCTL(PPRELEASE) +COMPATIBLE_IOCTL(PPYIELD) +COMPATIBLE_IOCTL(PPEXCL) +COMPATIBLE_IOCTL(PPDATADIR) +COMPATIBLE_IOCTL(PPNEGOT) +COMPATIBLE_IOCTL(PPWCTLONIRQ) +COMPATIBLE_IOCTL(PPCLRIRQ) +COMPATIBLE_IOCTL(PPSETPHASE) +COMPATIBLE_IOCTL(PPGETMODES) +COMPATIBLE_IOCTL(PPGETMODE) +COMPATIBLE_IOCTL(PPGETPHASE) +COMPATIBLE_IOCTL(PPGETFLAGS) +COMPATIBLE_IOCTL(PPSETFLAGS) +/* Big A */ +/* sparc only */ +/* Big Q for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET) +COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO) +COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE) +COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT) +COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT) +COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE) +COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR) +COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI) +COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES) +COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS) +COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS) +COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO) +COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL) 
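The entries collected in this table are commands whose argument is either a plain integer or a structure laid out identically for 32-bit and 64-bit callers, so the native handler can be reused unchanged. Anything that embeds a real pointer, such as struct i2c_msg handled by do_i2c_rdwr_ioctl() above, changes size and field offsets between the two ABIs and therefore needs an explicit translation helper instead of an entry here. A minimal user-space sketch of that layout difference, using hypothetical struct names modelled on i2c_msg/i2c_msg32:

#include <stdint.h>
#include <stdio.h>

/* native form: carries a real pointer, so its layout follows the ABI */
struct msg_native {
	uint16_t addr;
	uint16_t flags;
	uint16_t len;
	void    *buf;	/* 4 bytes in a 32-bit build, 8 bytes in a 64-bit build */
};

/* compat form: the pointer travels as a fixed 32-bit value (compat_caddr_t) */
struct msg_compat {
	uint16_t addr;
	uint16_t flags;
	uint16_t len;
	uint32_t buf;	/* same size and offsets regardless of the ABI */
};

int main(void)
{
	/* on a 64-bit build this typically prints 16 vs 12; on 32-bit, 12 vs 12 */
	printf("native=%zu compat=%zu\n",
	       sizeof(struct msg_native), sizeof(struct msg_compat));
	return 0;
}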
+COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE) +COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC) +COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND) +COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL) +COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE) +/* Big T for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE) +COMPATIBLE_IOCTL(SNDCTL_TMR_START) +COMPATIBLE_IOCTL(SNDCTL_TMR_STOP) +COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE) +COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO) +COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE) +COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME) +COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT) +/* Little m for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME) +COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE) +COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD) +/* Big P for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_DSP_RESET) +COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC) +COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED) +COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE) +COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS) +COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER) +COMPATIBLE_IOCTL(SNDCTL_DSP_POST) +COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE) +COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR) +/* SNDCTL_DSP_MAPINBUF, XXX needs translation */ +/* SNDCTL_DSP_MAPOUTBUF, XXX needs translation */ +COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO) +COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX) +COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY) +COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE) +COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE) +COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS) +COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS) +COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER) +/* Big C for sound/OSS */ +COMPATIBLE_IOCTL(SNDCTL_COPR_RESET) +COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD) +COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA) +COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE) +COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA) +COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE) +COMPATIBLE_IOCTL(SNDCTL_COPR_RUN) +COMPATIBLE_IOCTL(SNDCTL_COPR_HALT) +COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG) +COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG) +/* Big M for sound/OSS */ +COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR)) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE) +/* 
SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */ +/* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */ +COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS) +COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR)) +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE) +/* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */ +/* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */ +COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC) +COMPATIBLE_IOCTL(SOUND_MIXER_INFO) +COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO) +COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS) +COMPATIBLE_IOCTL(SOUND_MIXER_AGC) +COMPATIBLE_IOCTL(SOUND_MIXER_3DSE) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4) +COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5) +COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) +COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) +COMPATIBLE_IOCTL(OSS_GETVERSION) +/* Raw devices */ +COMPATIBLE_IOCTL(RAW_SETBIND) +COMPATIBLE_IOCTL(RAW_GETBIND) +/* Watchdog */ +COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) +COMPATIBLE_IOCTL(WDIOC_GETSTATUS) +COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) +COMPATIBLE_IOCTL(WDIOC_GETTEMP) +COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) +COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) +COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) +COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) +/* Big R */ +COMPATIBLE_IOCTL(RNDGETENTCNT) +COMPATIBLE_IOCTL(RNDADDTOENTCNT) +COMPATIBLE_IOCTL(RNDGETPOOL) +COMPATIBLE_IOCTL(RNDADDENTROPY) +COMPATIBLE_IOCTL(RNDZAPENTCNT) +COMPATIBLE_IOCTL(RNDCLEARPOOL) +/* Bluetooth */ +COMPATIBLE_IOCTL(HCIDEVUP) +COMPATIBLE_IOCTL(HCIDEVDOWN) +COMPATIBLE_IOCTL(HCIDEVRESET) +COMPATIBLE_IOCTL(HCIDEVRESTAT) +COMPATIBLE_IOCTL(HCIGETDEVLIST) +COMPATIBLE_IOCTL(HCIGETDEVINFO) +COMPATIBLE_IOCTL(HCIGETCONNLIST) +COMPATIBLE_IOCTL(HCIGETCONNINFO) +COMPATIBLE_IOCTL(HCIGETAUTHINFO) +COMPATIBLE_IOCTL(HCISETRAW) +COMPATIBLE_IOCTL(HCISETSCAN) +COMPATIBLE_IOCTL(HCISETAUTH) +COMPATIBLE_IOCTL(HCISETENCRYPT) +COMPATIBLE_IOCTL(HCISETPTYPE) +COMPATIBLE_IOCTL(HCISETLINKPOL) +COMPATIBLE_IOCTL(HCISETLINKMODE) +COMPATIBLE_IOCTL(HCISETACLMTU) +COMPATIBLE_IOCTL(HCISETSCOMTU) +COMPATIBLE_IOCTL(HCIBLOCKADDR) +COMPATIBLE_IOCTL(HCIUNBLOCKADDR) +COMPATIBLE_IOCTL(HCIINQUIRY) +COMPATIBLE_IOCTL(HCIUARTSETPROTO) +COMPATIBLE_IOCTL(HCIUARTGETPROTO) +COMPATIBLE_IOCTL(RFCOMMCREATEDEV) 
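The HCIUART*, BNEP*, CMTP* and HIDP* commands listed below are the ones #defined near the top of this file with an int argument, so nothing needs translating. A small user-space sketch of how such a command number packs direction, type, number and size, using the generic _IOC_* accessors (the exact bit layout varies by architecture, which is why the accessors are used rather than hard-coded shifts):

#include <linux/ioctl.h>
#include <stdio.h>

#define HCIUARTSETPROTO _IOW('U', 200, int)	/* mirrors the define above */

int main(void)
{
	unsigned int cmd = HCIUARTSETPROTO;

	/* dir=write, type='U', nr=200, size=sizeof(int) on common layouts */
	printf("cmd=%#x dir=%u type='%c' nr=%u size=%u\n",
	       cmd, _IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
	return 0;
}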
+COMPATIBLE_IOCTL(RFCOMMRELEASEDEV) +COMPATIBLE_IOCTL(RFCOMMGETDEVLIST) +COMPATIBLE_IOCTL(RFCOMMGETDEVINFO) +COMPATIBLE_IOCTL(RFCOMMSTEALDLC) +COMPATIBLE_IOCTL(BNEPCONNADD) +COMPATIBLE_IOCTL(BNEPCONNDEL) +COMPATIBLE_IOCTL(BNEPGETCONNLIST) +COMPATIBLE_IOCTL(BNEPGETCONNINFO) +COMPATIBLE_IOCTL(CMTPCONNADD) +COMPATIBLE_IOCTL(CMTPCONNDEL) +COMPATIBLE_IOCTL(CMTPGETCONNLIST) +COMPATIBLE_IOCTL(CMTPGETCONNINFO) +COMPATIBLE_IOCTL(HIDPCONNADD) +COMPATIBLE_IOCTL(HIDPCONNDEL) +COMPATIBLE_IOCTL(HIDPGETCONNLIST) +COMPATIBLE_IOCTL(HIDPGETCONNINFO) +/* CAPI */ +COMPATIBLE_IOCTL(CAPI_REGISTER) +COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER) +COMPATIBLE_IOCTL(CAPI_GET_VERSION) +COMPATIBLE_IOCTL(CAPI_GET_SERIAL) +COMPATIBLE_IOCTL(CAPI_GET_PROFILE) +COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD) +COMPATIBLE_IOCTL(CAPI_GET_ERRCODE) +COMPATIBLE_IOCTL(CAPI_INSTALLED) +COMPATIBLE_IOCTL(CAPI_GET_FLAGS) +COMPATIBLE_IOCTL(CAPI_SET_FLAGS) +COMPATIBLE_IOCTL(CAPI_CLR_FLAGS) +COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT) +COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT) +/* Siemens Gigaset */ +COMPATIBLE_IOCTL(GIGASET_REDIR) +COMPATIBLE_IOCTL(GIGASET_CONFIG) +COMPATIBLE_IOCTL(GIGASET_BRKCHARS) +COMPATIBLE_IOCTL(GIGASET_VERSION) +/* Misc. */ +COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */ +COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */ +COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) +COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) +COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) +COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) +/* NBD */ +COMPATIBLE_IOCTL(NBD_DO_IT) +COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) +COMPATIBLE_IOCTL(NBD_CLEAR_QUE) +COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) +COMPATIBLE_IOCTL(NBD_DISCONNECT) +/* i2c */ +COMPATIBLE_IOCTL(I2C_SLAVE) +COMPATIBLE_IOCTL(I2C_SLAVE_FORCE) +COMPATIBLE_IOCTL(I2C_TENBIT) +COMPATIBLE_IOCTL(I2C_PEC) +COMPATIBLE_IOCTL(I2C_RETRIES) +COMPATIBLE_IOCTL(I2C_TIMEOUT) +/* hiddev */ +COMPATIBLE_IOCTL(HIDIOCGVERSION) +COMPATIBLE_IOCTL(HIDIOCAPPLICATION) +COMPATIBLE_IOCTL(HIDIOCGDEVINFO) +COMPATIBLE_IOCTL(HIDIOCGSTRING) +COMPATIBLE_IOCTL(HIDIOCINITREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORT) +COMPATIBLE_IOCTL(HIDIOCSREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) +COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) +COMPATIBLE_IOCTL(HIDIOCGUSAGE) +COMPATIBLE_IOCTL(HIDIOCSUSAGE) +COMPATIBLE_IOCTL(HIDIOCGUCODE) +COMPATIBLE_IOCTL(HIDIOCGFLAG) +COMPATIBLE_IOCTL(HIDIOCSFLAG) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) +/* dvb */ +COMPATIBLE_IOCTL(AUDIO_STOP) +COMPATIBLE_IOCTL(AUDIO_PLAY) +COMPATIBLE_IOCTL(AUDIO_PAUSE) +COMPATIBLE_IOCTL(AUDIO_CONTINUE) +COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE) +COMPATIBLE_IOCTL(AUDIO_SET_MUTE) +COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC) +COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE) +COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT) +COMPATIBLE_IOCTL(AUDIO_GET_STATUS) +COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES) +COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER) +COMPATIBLE_IOCTL(AUDIO_SET_ID) +COMPATIBLE_IOCTL(AUDIO_SET_MIXER) +COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE) +COMPATIBLE_IOCTL(AUDIO_SET_EXT_ID) +COMPATIBLE_IOCTL(AUDIO_SET_ATTRIBUTES) +COMPATIBLE_IOCTL(AUDIO_SET_KARAOKE) +COMPATIBLE_IOCTL(DMX_START) +COMPATIBLE_IOCTL(DMX_STOP) +COMPATIBLE_IOCTL(DMX_SET_FILTER) +COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) +COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) +COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) +COMPATIBLE_IOCTL(DMX_GET_CAPS) +COMPATIBLE_IOCTL(DMX_SET_SOURCE) +COMPATIBLE_IOCTL(DMX_GET_STC) +COMPATIBLE_IOCTL(FE_GET_INFO) +COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD) +COMPATIBLE_IOCTL(FE_DISEQC_SEND_MASTER_CMD) +COMPATIBLE_IOCTL(FE_DISEQC_RECV_SLAVE_REPLY) +COMPATIBLE_IOCTL(FE_DISEQC_SEND_BURST) 
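For reference, this is how the entries accumulated in this table are consulted at runtime: XFORM() spreads the command values, init_sys32_ioctl() sorts the table once at boot, and compat_ioctl_check_table() (further down) guesses a start index proportional to the transformed value and then searches linearly. A standalone sketch of that mechanism on a toy table; the command values here are arbitrary placeholders, not real ioctls:

#include <stdio.h>
#include <stdlib.h>

#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)

static int cmp_uint(const void *p, const void *q)
{
	unsigned int a = *(const unsigned int *)p;
	unsigned int b = *(const unsigned int *)q;

	return (a > b) - (a < b);
}

/* same probe as compat_ioctl_check_table(): proportional guess, then linear */
static int check_table(unsigned int xcmd, const unsigned int *tbl, int n)
{
	const int max = n - 1;
	int i = ((xcmd >> 16) * max) >> 16;

	while (tbl[i] < xcmd && i < max)
		i++;
	while (tbl[i] > xcmd && i > 0)
		i--;
	return tbl[i] == xcmd;
}

int main(void)
{
	unsigned int cmds[] = { 0x00005401u, 0x40045532u, 0x80086601u, 0xc0189436u };
	unsigned int tbl[4];
	int i, n = 4;

	for (i = 0; i < n; i++)
		tbl[i] = XFORM(cmds[i]);
	qsort(tbl, n, sizeof(tbl[0]), cmp_uint);

	printf("known:   %d\n", check_table(XFORM(cmds[2]), tbl, n));   /* 1 */
	printf("unknown: %d\n", check_table(XFORM(0x12345678u), tbl, n)); /* 0 */
	return 0;
}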
+COMPATIBLE_IOCTL(FE_SET_TONE) +COMPATIBLE_IOCTL(FE_SET_VOLTAGE) +COMPATIBLE_IOCTL(FE_ENABLE_HIGH_LNB_VOLTAGE) +COMPATIBLE_IOCTL(FE_READ_STATUS) +COMPATIBLE_IOCTL(FE_READ_BER) +COMPATIBLE_IOCTL(FE_READ_SIGNAL_STRENGTH) +COMPATIBLE_IOCTL(FE_READ_SNR) +COMPATIBLE_IOCTL(FE_READ_UNCORRECTED_BLOCKS) +COMPATIBLE_IOCTL(FE_SET_FRONTEND) +COMPATIBLE_IOCTL(FE_GET_FRONTEND) +COMPATIBLE_IOCTL(FE_GET_EVENT) +COMPATIBLE_IOCTL(FE_DISHNETWORK_SEND_LEGACY_CMD) +COMPATIBLE_IOCTL(VIDEO_STOP) +COMPATIBLE_IOCTL(VIDEO_PLAY) +COMPATIBLE_IOCTL(VIDEO_FREEZE) +COMPATIBLE_IOCTL(VIDEO_CONTINUE) +COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE) +COMPATIBLE_IOCTL(VIDEO_SET_BLANK) +COMPATIBLE_IOCTL(VIDEO_GET_STATUS) +COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT) +COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD) +COMPATIBLE_IOCTL(VIDEO_SLOWMOTION) +COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES) +COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER) +COMPATIBLE_IOCTL(VIDEO_SET_ID) +COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE) +COMPATIBLE_IOCTL(VIDEO_SET_FORMAT) +COMPATIBLE_IOCTL(VIDEO_SET_SYSTEM) +COMPATIBLE_IOCTL(VIDEO_SET_HIGHLIGHT) +COMPATIBLE_IOCTL(VIDEO_SET_SPU) +COMPATIBLE_IOCTL(VIDEO_GET_NAVI) +COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES) +COMPATIBLE_IOCTL(VIDEO_GET_SIZE) +COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE) + +/* joystick */ +COMPATIBLE_IOCTL(JSIOCGVERSION) +COMPATIBLE_IOCTL(JSIOCGAXES) +COMPATIBLE_IOCTL(JSIOCGBUTTONS) +COMPATIBLE_IOCTL(JSIOCGNAME(0)) + +#ifdef TIOCGLTC +COMPATIBLE_IOCTL(TIOCGLTC) +COMPATIBLE_IOCTL(TIOCSLTC) +#endif +#ifdef TIOCSTART +/* + * For these two we have definitions in ioctls.h and/or termios.h on + * some architectures but no actual implemention. Some applications + * like bash call them if they are defined in the headers, so we provide + * entries here to avoid syslog message spew. + */ +COMPATIBLE_IOCTL(TIOCSTART) +COMPATIBLE_IOCTL(TIOCSTOP) +#endif + +/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, + but we don't want warnings on other file systems. So declare + them as compatible here. */ +#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) +#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) + +IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) +IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) + +#ifdef CONFIG_SPARC +/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ +IGNORE_IOCTL(FBIOGTYPE) +IGNORE_IOCTL(FBIOSATTR) +IGNORE_IOCTL(FBIOGATTR) +IGNORE_IOCTL(FBIOSVIDEO) +IGNORE_IOCTL(FBIOGVIDEO) +IGNORE_IOCTL(FBIOSCURPOS) +IGNORE_IOCTL(FBIOGCURPOS) +IGNORE_IOCTL(FBIOGCURMAX) +IGNORE_IOCTL(FBIOPUTCMAP32) +IGNORE_IOCTL(FBIOGETCMAP32) +IGNORE_IOCTL(FBIOSCURSOR32) +IGNORE_IOCTL(FBIOGCURSOR32) +#endif +}; + +/* + * Convert common ioctl arguments based on their command number + * + * Please do not add any code in here. Instead, implement + * a compat_ioctl operation in the place that handleѕ the + * ioctl for the native case. 
+ */ +static long do_ioctl_trans(int fd, unsigned int cmd, + unsigned long arg, struct file *file) +{ + void __user *argp = compat_ptr(arg); + + switch (cmd) { + case PPPIOCGIDLE32: + return ppp_gidle(fd, cmd, argp); + case PPPIOCSCOMPRESS32: + return ppp_scompress(fd, cmd, argp); + case PPPIOCSPASS32: + case PPPIOCSACTIVE32: + return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); +#ifdef CONFIG_BLOCK + case SG_IO: + return sg_ioctl_trans(fd, cmd, argp); + case SG_GET_REQUEST_TABLE: + return sg_grt_trans(fd, cmd, argp); + case MTIOCGET32: + case MTIOCPOS32: + return mt_ioctl_trans(fd, cmd, argp); +#endif + /* Serial */ + case TIOCGSERIAL: + case TIOCSSERIAL: + return serial_struct_ioctl(fd, cmd, argp); + /* i2c */ + case I2C_FUNCS: + return w_long(fd, cmd, argp); + case I2C_RDWR: + return do_i2c_rdwr_ioctl(fd, cmd, argp); + case I2C_SMBUS: + return do_i2c_smbus_ioctl(fd, cmd, argp); + /* Not implemented in the native kernel */ + case RTC_IRQP_READ32: + case RTC_IRQP_SET32: + case RTC_EPOCH_READ32: + case RTC_EPOCH_SET32: + return rtc_ioctl(fd, cmd, argp); + + /* dvb */ + case VIDEO_GET_EVENT: + return do_video_get_event(fd, cmd, argp); + case VIDEO_STILLPICTURE: + return do_video_stillpicture(fd, cmd, argp); + case VIDEO_SET_SPU_PALETTE: + return do_video_set_spu_palette(fd, cmd, argp); + } + + /* + * These take an integer instead of a pointer as 'arg', + * so we must not do a compat_ptr() translation. + */ + switch (cmd) { + /* Big T */ + case TCSBRKP: + case TIOCMIWAIT: + case TIOCSCTTY: + /* RAID */ + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* Big K */ + case KDSIGACCEPT: + case KIOCSOUND: + case KDMKTONE: + case KDSETMODE: + case KDSKBMODE: + case KDSKBMETA: + case KDSKBLED: + case KDSETLED: + /* NBD */ + case NBD_SET_SOCK: + case NBD_SET_BLKSIZE: + case NBD_SET_SIZE: + case NBD_SET_SIZE_BLOCKS: + return do_vfs_ioctl(file, fd, cmd, arg); + } + + return -ENOIOCTLCMD; +} + +static void compat_ioctl_error(struct file *filp, unsigned int fd, + unsigned int cmd, unsigned long arg) +{ + char buf[10]; + char *fn = "?"; + char *path; + + /* find the name of the device. */ + path = (char *)__get_free_page(GFP_KERNEL); + if (path) { + fn = d_path(&filp->f_path, path, PAGE_SIZE); + if (IS_ERR(fn)) + fn = "?"; + } + + sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK); + if (!isprint(buf[1])) + sprintf(buf, "%02x", buf[1]); + compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " + "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n", + current->comm, current->pid, + (int)fd, (unsigned int)cmd, buf, + (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK, + (unsigned int)arg, fn); + + if (path) + free_page((unsigned long)path); +} + +static int compat_ioctl_check_table(unsigned int xcmd) +{ + int i; + const int max = ARRAY_SIZE(ioctl_pointer) - 1; + + BUILD_BUG_ON(max >= (1 << 16)); + + /* guess initial offset into table, assuming a + normalized distribution */ + i = ((xcmd >> 16) * max) >> 16; + + /* do linear search up first, until greater or equal */ + while (ioctl_pointer[i] < xcmd && i < max) + i++; + + /* then do linear search down */ + while (ioctl_pointer[i] > xcmd && i > 0) + i--; + + return ioctl_pointer[i] == xcmd; +} + +asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, + unsigned long arg) +{ + struct file *filp; + int error = -EBADF; + int fput_needed; + + filp = fget_light(fd, &fput_needed); + if (!filp) + goto out; + + /* RED-PEN how should LSM module know it's handling 32bit? 
*/ + error = security_file_ioctl(filp, cmd, arg); + if (error) + goto out_fput; + + /* + * To allow the compat_ioctl handlers to be self contained + * we need to check the common ioctls here first. + * Just handle them with the standard handlers below. + */ + switch (cmd) { + case FIOCLEX: + case FIONCLEX: + case FIONBIO: + case FIOASYNC: + case FIOQSIZE: + break; + +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) + case FS_IOC_RESVSP_32: + case FS_IOC_RESVSP64_32: + error = compat_ioctl_preallocate(filp, compat_ptr(arg)); + goto out_fput; +#else + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + error = ioctl_preallocate(filp, compat_ptr(arg)); + goto out_fput; +#endif + + case FIBMAP: + case FIGETBSZ: + case FIONREAD: + if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) + break; + /*FALL THROUGH*/ + + default: + if (filp->f_op && filp->f_op->compat_ioctl) { + error = filp->f_op->compat_ioctl(filp, cmd, arg); + if (error != -ENOIOCTLCMD) + goto out_fput; + } + + if (!filp->f_op || !filp->f_op->unlocked_ioctl) + goto do_ioctl; + break; + } + + if (compat_ioctl_check_table(XFORM(cmd))) + goto found_handler; + + error = do_ioctl_trans(fd, cmd, arg, filp); + if (error == -ENOIOCTLCMD) { + static int count; + + if (++count <= 50) + compat_ioctl_error(filp, fd, cmd, arg); + error = -EINVAL; + } + + goto out_fput; + + found_handler: + arg = (unsigned long)compat_ptr(arg); + do_ioctl: + error = do_vfs_ioctl(filp, fd, cmd, arg); + out_fput: + fput_light(filp, fput_needed); + out: + return error; +} + +static int __init init_sys32_ioctl_cmp(const void *p, const void *q) +{ + unsigned int a, b; + a = *(unsigned int *)p; + b = *(unsigned int *)q; + if (a > b) + return 1; + if (a < b) + return -1; + return 0; +} + +static int __init init_sys32_ioctl(void) +{ + sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), + init_sys32_ioctl_cmp, NULL); + return 0; +} +__initcall(init_sys32_ioctl); diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig new file mode 100644 index 00000000..9febcdef --- /dev/null +++ b/fs/configfs/Kconfig @@ -0,0 +1,11 @@ +config CONFIGFS_FS + tristate "Userspace-driven configuration filesystem" + select SYSFS + help + configfs is a RAM-based filesystem that provides the converse + of sysfs's functionality. Where sysfs is a filesystem-based + view of kernel objects, configfs is a filesystem-based manager + of kernel objects, or config_items. + + Both sysfs and configfs can and should exist together on the + same system. One is not a replacement for the other. diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile new file mode 100644 index 00000000..00ffb278 --- /dev/null +++ b/fs/configfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the configfs virtual filesystem +# + +obj-$(CONFIG_CONFIGFS_FS) += configfs.o + +configfs-objs := inode.o file.o dir.o symlink.o mount.o item.o diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h new file mode 100644 index 00000000..82bda8fd --- /dev/null +++ b/fs/configfs/configfs_internal.h @@ -0,0 +1,161 @@ +/* -*- mode: c; c-basic-offset:8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * configfs_internal.h - Internal stuff for configfs + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include + +struct configfs_dirent { + atomic_t s_count; + int s_dependent_count; + struct list_head s_sibling; + struct list_head s_children; + struct list_head s_links; + void * s_element; + int s_type; + umode_t s_mode; + struct dentry * s_dentry; + struct iattr * s_iattr; +#ifdef CONFIG_LOCKDEP + int s_depth; +#endif +}; + +#define CONFIGFS_ROOT 0x0001 +#define CONFIGFS_DIR 0x0002 +#define CONFIGFS_ITEM_ATTR 0x0004 +#define CONFIGFS_ITEM_LINK 0x0020 +#define CONFIGFS_USET_DIR 0x0040 +#define CONFIGFS_USET_DEFAULT 0x0080 +#define CONFIGFS_USET_DROPPING 0x0100 +#define CONFIGFS_USET_IN_MKDIR 0x0200 +#define CONFIGFS_USET_CREATING 0x0400 +#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) + +extern struct mutex configfs_symlink_mutex; +extern spinlock_t configfs_dirent_lock; + +extern struct vfsmount * configfs_mount; +extern struct kmem_cache *configfs_dir_cachep; + +extern int configfs_is_root(struct config_item *item); + +extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *); +extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); +extern int configfs_inode_init(void); +extern void configfs_inode_exit(void); + +extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); +extern int configfs_make_dirent(struct configfs_dirent *, + struct dentry *, void *, umode_t, int); +extern int configfs_dirent_is_ready(struct configfs_dirent *); + +extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); +extern void configfs_hash_and_remove(struct dentry * dir, const char * name); + +extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); +extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); +extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); + +extern int configfs_pin_fs(void); +extern void configfs_release_fs(void); + +extern struct rw_semaphore configfs_rename_sem; +extern struct super_block * configfs_sb; +extern const struct file_operations configfs_dir_operations; +extern const struct file_operations configfs_file_operations; +extern const struct file_operations bin_fops; +extern const struct inode_operations configfs_dir_inode_operations; +extern const struct inode_operations configfs_symlink_inode_operations; +extern const struct dentry_operations configfs_dentry_ops; + +extern int configfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname); +extern int configfs_unlink(struct inode *dir, struct dentry *dentry); + +struct configfs_symlink { + struct list_head sl_list; + struct config_item *sl_target; +}; + +extern int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry); + +static inline struct config_item * to_item(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return 
((struct config_item *) sd->s_element); +} + +static inline struct configfs_attribute * to_attr(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return ((struct configfs_attribute *) sd->s_element); +} + +static inline struct config_item *configfs_get_config_item(struct dentry *dentry) +{ + struct config_item * item = NULL; + + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) { + struct configfs_dirent * sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_ITEM_LINK) { + struct configfs_symlink * sl = sd->s_element; + item = config_item_get(sl->sl_target); + } else + item = config_item_get(sd->s_element); + } + spin_unlock(&dentry->d_lock); + + return item; +} + +static inline void release_configfs_dirent(struct configfs_dirent * sd) +{ + if (!(sd->s_type & CONFIGFS_ROOT)) { + kfree(sd->s_iattr); + kmem_cache_free(configfs_dir_cachep, sd); + } +} + +static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) +{ + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } + return sd; +} + +static inline void configfs_put(struct configfs_dirent * sd) +{ + WARN_ON(!atomic_read(&sd->s_count)); + if (atomic_dec_and_test(&sd->s_count)) + release_configfs_dirent(sd); +} + diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c new file mode 100644 index 00000000..9a37a9b6 --- /dev/null +++ b/fs/configfs/dir.c @@ -0,0 +1,1766 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c - Operations for configfs directories. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +DECLARE_RWSEM(configfs_rename_sem); +/* + * Protects mutations of configfs_dirent linkage together with proper i_mutex + * Also protects mutations of symlinks linkage to target configfs_dirent + * Mutators of configfs_dirent linkage must *both* have the proper inode locked + * and configfs_dirent_lock locked, in that order. + * This allows one to safely traverse configfs_dirent trees and symlinks without + * having to lock inodes. 
+ * + * Protects setting of CONFIGFS_USET_DROPPING: checking the flag + * unlocked is not reliable unless in detach_groups() called from + * rmdir()/unregister() and from configfs_attach_group() + */ +DEFINE_SPINLOCK(configfs_dirent_lock); + +static void configfs_d_iput(struct dentry * dentry, + struct inode * inode) +{ + struct configfs_dirent *sd = dentry->d_fsdata; + + if (sd) { + BUG_ON(sd->s_dentry != dentry); + /* Coordinate with configfs_readdir */ + spin_lock(&configfs_dirent_lock); + sd->s_dentry = NULL; + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + iput(inode); +} + +/* + * We _must_ delete our dentries on last dput, as the chain-to-parent + * behavior is required to clear the parents of default_groups. + */ +static int configfs_d_delete(const struct dentry *dentry) +{ + return 1; +} + +const struct dentry_operations configfs_dentry_ops = { + .d_iput = configfs_d_iput, + /* simple_delete_dentry() isn't exported */ + .d_delete = configfs_d_delete, +}; + +#ifdef CONFIG_LOCKDEP + +/* + * Helpers to make lockdep happy with our recursive locking of default groups' + * inodes (see configfs_attach_group() and configfs_detach_group()). + * We put default groups i_mutexes in separate classes according to their depth + * from the youngest non-default group ancestor. + * + * For a non-default group A having default groups A/B, A/C, and A/C/D, default + * groups A/B and A/C will have their inode's mutex in class + * default_group_class[0], and default group A/C/D will be in + * default_group_class[1]. + * + * The lock classes are declared and assigned in inode.c, according to the + * s_depth value. + * The s_depth value is initialized to -1, adjusted to >= 0 when attaching + * default groups, and reset to -1 when all default groups are attached. During + * attachment, if configfs_create() sees s_depth > 0, the lock class of the new + * inode's mutex is set to default_group_class[s_depth - 1]. + */ + +static void configfs_init_dirent_depth(struct configfs_dirent *sd) +{ + sd->s_depth = -1; +} + +static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, + struct configfs_dirent *sd) +{ + int parent_depth = parent_sd->s_depth; + + if (parent_depth >= 0) + sd->s_depth = parent_depth + 1; +} + +static void +configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) +{ + /* + * item's i_mutex class is already setup, so s_depth is now only + * used to set new sub-directories s_depth, which is always done + * with item's i_mutex locked. + */ + /* + * sd->s_depth == -1 iff we are a non default group. + * else (we are a default group) sd->s_depth > 0 (see + * create_dir()). + */ + if (sd->s_depth == -1) + /* + * We are a non default group and we are going to create + * default groups. + */ + sd->s_depth = 0; +} + +static void +configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) +{ + /* We will not create default groups anymore. 
*/ + sd->s_depth = -1; +} + +#else /* CONFIG_LOCKDEP */ + +static void configfs_init_dirent_depth(struct configfs_dirent *sd) +{ +} + +static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, + struct configfs_dirent *sd) +{ +} + +static void +configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) +{ +} + +static void +configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) +{ +} + +#endif /* CONFIG_LOCKDEP */ + +/* + * Allocates a new configfs_dirent and links it to the parent configfs_dirent + */ +static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, + void *element, int type) +{ + struct configfs_dirent * sd; + + sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); + if (!sd) + return ERR_PTR(-ENOMEM); + + atomic_set(&sd->s_count, 1); + INIT_LIST_HEAD(&sd->s_links); + INIT_LIST_HEAD(&sd->s_children); + sd->s_element = element; + sd->s_type = type; + configfs_init_dirent_depth(sd); + spin_lock(&configfs_dirent_lock); + if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + kmem_cache_free(configfs_dir_cachep, sd); + return ERR_PTR(-ENOENT); + } + list_add(&sd->s_sibling, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); + + return sd; +} + +/* + * + * Return -EEXIST if there is already a configfs element with the same + * name for the same parent. + * + * called with parent inode's i_mutex held + */ +static int configfs_dirent_exists(struct configfs_dirent *parent_sd, + const unsigned char *new) +{ + struct configfs_dirent * sd; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_element) { + const unsigned char *existing = configfs_get_name(sd); + if (strcmp(existing, new)) + continue; + else + return -EEXIST; + } + } + + return 0; +} + + +int configfs_make_dirent(struct configfs_dirent * parent_sd, + struct dentry * dentry, void * element, + umode_t mode, int type) +{ + struct configfs_dirent * sd; + + sd = configfs_new_dirent(parent_sd, element, type); + if (IS_ERR(sd)) + return PTR_ERR(sd); + + sd->s_mode = mode; + sd->s_dentry = dentry; + if (dentry) + dentry->d_fsdata = configfs_get(sd); + + return 0; +} + +static int init_dir(struct inode * inode) +{ + inode->i_op = &configfs_dir_inode_operations; + inode->i_fop = &configfs_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + return 0; +} + +static int configfs_init_file(struct inode * inode) +{ + inode->i_size = PAGE_SIZE; + inode->i_fop = &configfs_file_operations; + return 0; +} + +static int init_symlink(struct inode * inode) +{ + inode->i_op = &configfs_symlink_inode_operations; + return 0; +} + +static int create_dir(struct config_item * k, struct dentry * p, + struct dentry * d) +{ + int error; + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + + error = configfs_dirent_exists(p->d_fsdata, d->d_name.name); + if (!error) + error = configfs_make_dirent(p->d_fsdata, d, k, mode, + CONFIGFS_DIR | CONFIGFS_USET_CREATING); + if (!error) { + configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata); + error = configfs_create(d, mode, init_dir); + if (!error) { + inc_nlink(p->d_inode); + } else { + struct configfs_dirent *sd = d->d_fsdata; + if (sd) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + } + } + return error; +} + + +/** + * configfs_create_dir - create a directory for an config_item. 
+ * @item: config_itemwe're creating directory for. + * @dentry: config_item's dentry. + * + * Note: user-created entries won't be allowed under this new directory + * until it is validated by configfs_dir_set_ready() + */ + +static int configfs_create_dir(struct config_item * item, struct dentry *dentry) +{ + struct dentry * parent; + int error = 0; + + BUG_ON(!item); + + if (item->ci_parent) + parent = item->ci_parent->ci_dentry; + else if (configfs_mount && configfs_mount->mnt_sb) + parent = configfs_mount->mnt_sb->s_root; + else + return -EFAULT; + + error = create_dir(item,parent,dentry); + if (!error) + item->ci_dentry = dentry; + return error; +} + +/* + * Allow userspace to create new entries under a new directory created with + * configfs_create_dir(), and under all of its chidlren directories recursively. + * @sd configfs_dirent of the new directory to validate + * + * Caller must hold configfs_dirent_lock. + */ +static void configfs_dir_set_ready(struct configfs_dirent *sd) +{ + struct configfs_dirent *child_sd; + + sd->s_type &= ~CONFIGFS_USET_CREATING; + list_for_each_entry(child_sd, &sd->s_children, s_sibling) + if (child_sd->s_type & CONFIGFS_USET_CREATING) + configfs_dir_set_ready(child_sd); +} + +/* + * Check that a directory does not belong to a directory hierarchy being + * attached and not validated yet. + * @sd configfs_dirent of the directory to check + * + * @return non-zero iff the directory was validated + * + * Note: takes configfs_dirent_lock, so the result may change from false to true + * in two consecutive calls, but never from true to false. + */ +int configfs_dirent_is_ready(struct configfs_dirent *sd) +{ + int ret; + + spin_lock(&configfs_dirent_lock); + ret = !(sd->s_type & CONFIGFS_USET_CREATING); + spin_unlock(&configfs_dirent_lock); + + return ret; +} + +int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry) +{ + int err = 0; + umode_t mode = S_IFLNK | S_IRWXUGO; + + err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, + CONFIGFS_ITEM_LINK); + if (!err) { + err = configfs_create(dentry, mode, init_symlink); + if (err) { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + } + } + return err; +} + +static void remove_dir(struct dentry * d) +{ + struct dentry * parent = dget(d->d_parent); + struct configfs_dirent * sd; + + sd = d->d_fsdata; + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + if (d->d_inode) + simple_rmdir(parent->d_inode,d); + + pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); + + dput(parent); +} + +/** + * configfs_remove_dir - remove an config_item's directory. + * @item: config_item we're removing. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be configfs_rmdir() below, instead of calling separately. + * + * Caller holds the mutex of the item's inode + */ + +static void configfs_remove_dir(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + + if (!dentry) + return; + + remove_dir(dentry); + /** + * Drop reference from dget() on entrance. 
+ */ + dput(dentry); +} + + +/* attaches attribute's configfs_dirent to the dentry corresponding to the + * attribute file + */ +static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) +{ + struct configfs_attribute * attr = sd->s_element; + int error; + + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; + error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, + configfs_init_file); + if (error) { + configfs_put(sd); + return error; + } + + d_rehash(dentry); + + return 0; +} + +static struct dentry * configfs_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; + struct configfs_dirent * sd; + int found = 0; + int err; + + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + * + * This forbids userspace to read/write attributes of items which may + * not complete their initialization, since the dentries of the + * attributes won't be instantiated. + */ + err = -ENOENT; + if (!configfs_dirent_is_ready(parent_sd)) + goto out; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_type & CONFIGFS_NOT_PINNED) { + const unsigned char * name = configfs_get_name(sd); + + if (strcmp(name, dentry->d_name.name)) + continue; + + found = 1; + err = configfs_attach_attr(sd, dentry); + break; + } + } + + if (!found) { + /* + * If it doesn't exist and it isn't a NOT_PINNED item, + * it must be negative. + */ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; + } + +out: + return ERR_PTR(err); +} + +/* + * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are + * attributes and are removed by rmdir(). We recurse, setting + * CONFIGFS_USET_DROPPING on all children that are candidates for + * default detach. + * If there is an error, the caller will reset the flags via + * configfs_detach_rollback(). + */ +static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + int ret; + + /* Mark that we're trying to drop the group */ + parent_sd->s_type |= CONFIGFS_USET_DROPPING; + + ret = -EBUSY; + if (!list_empty(&parent_sd->s_links)) + goto out; + + ret = 0; + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || + (sd->s_type & CONFIGFS_NOT_PINNED)) + continue; + if (sd->s_type & CONFIGFS_USET_DEFAULT) { + /* Abort if racing with mkdir() */ + if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { + if (wait_mutex) + *wait_mutex = &sd->s_dentry->d_inode->i_mutex; + return -EAGAIN; + } + + /* + * Yup, recursive. If there's a problem, blame + * deep nesting of default_groups + */ + ret = configfs_detach_prep(sd->s_dentry, wait_mutex); + if (!ret) + continue; + } else + ret = -ENOTEMPTY; + + break; + } + +out: + return ret; +} + +/* + * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was + * set. 
+ */ +static void configfs_detach_rollback(struct dentry *dentry) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + + parent_sd->s_type &= ~CONFIGFS_USET_DROPPING; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) + if (sd->s_type & CONFIGFS_USET_DEFAULT) + configfs_detach_rollback(sd->s_dentry); +} + +static void detach_attrs(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + struct configfs_dirent * parent_sd; + struct configfs_dirent * sd, * tmp; + + if (!dentry) + return; + + pr_debug("configfs %s: dropping attrs for dir\n", + dentry->d_name.name); + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) + continue; + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_drop_dentry(sd, dentry); + configfs_put(sd); + } + + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + +static int populate_attrs(struct config_item *item) +{ + struct config_item_type *t = item->ci_type; + struct configfs_attribute *attr; + int error = 0; + int i; + + if (!t) + return -EINVAL; + if (t->ct_attrs) { + for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) { + if ((error = configfs_create_file(item, attr))) + break; + } + } + + if (error) + detach_attrs(item); + + return error; +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry); +static void configfs_detach_group(struct config_item *item); + +static void detach_groups(struct config_group *group) +{ + struct dentry * dentry = dget(group->cg_item.ci_dentry); + struct dentry *child; + struct configfs_dirent *parent_sd; + struct configfs_dirent *sd, *tmp; + + if (!dentry) + return; + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || + !(sd->s_type & CONFIGFS_USET_DEFAULT)) + continue; + + child = sd->s_dentry; + + mutex_lock(&child->d_inode->i_mutex); + + configfs_detach_group(sd->s_element); + child->d_inode->i_flags |= S_DEAD; + dont_mount(child); + + mutex_unlock(&child->d_inode->i_mutex); + + d_delete(child); + dput(child); + } + + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + +/* + * This fakes mkdir(2) on a default_groups[] entry. It + * creates a dentry, attachs it, and then does fixup + * on the sd->s_type. + * + * We could, perhaps, tweak our parent's ->mkdir for a minute and + * try using vfs_mkdir. Just a thought. 
+ */ +static int create_default_group(struct config_group *parent_group, + struct config_group *group) +{ + int ret; + struct qstr name; + struct configfs_dirent *sd; + /* We trust the caller holds a reference to parent */ + struct dentry *child, *parent = parent_group->cg_item.ci_dentry; + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + name.name = group->cg_item.ci_name; + name.len = strlen(name.name); + name.hash = full_name_hash(name.name, name.len); + + ret = -ENOMEM; + child = d_alloc(parent, &name); + if (child) { + d_add(child, NULL); + + ret = configfs_attach_group(&parent_group->cg_item, + &group->cg_item, child); + if (!ret) { + sd = child->d_fsdata; + sd->s_type |= CONFIGFS_USET_DEFAULT; + } else { + BUG_ON(child->d_inode); + d_drop(child); + dput(child); + } + } + + return ret; +} + +static int populate_groups(struct config_group *group) +{ + struct config_group *new_group; + int ret = 0; + int i; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + + ret = create_default_group(group, new_group); + if (ret) { + detach_groups(group); + break; + } + } + } + + return ret; +} + +/* + * All of link_obj/unlink_obj/link_group/unlink_group require that + * subsys->su_mutex is held. + */ + +static void unlink_obj(struct config_item *item) +{ + struct config_group *group; + + group = item->ci_group; + if (group) { + list_del_init(&item->ci_entry); + + item->ci_group = NULL; + item->ci_parent = NULL; + + /* Drop the reference for ci_entry */ + config_item_put(item); + + /* Drop the reference for ci_parent */ + config_group_put(group); + } +} + +static void link_obj(struct config_item *parent_item, struct config_item *item) +{ + /* + * Parent seems redundant with group, but it makes certain + * traversals much nicer. + */ + item->ci_parent = parent_item; + + /* + * We hold a reference on the parent for the child's ci_parent + * link. + */ + item->ci_group = config_group_get(to_config_group(parent_item)); + list_add_tail(&item->ci_entry, &item->ci_group->cg_children); + + /* + * We hold a reference on the child for ci_entry on the parent's + * cg_children + */ + config_item_get(item); +} + +static void unlink_group(struct config_group *group) +{ + int i; + struct config_group *new_group; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + unlink_group(new_group); + } + } + + group->cg_subsys = NULL; + unlink_obj(&group->cg_item); +} + +static void link_group(struct config_group *parent_group, struct config_group *group) +{ + int i; + struct config_group *new_group; + struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ + + link_obj(&parent_group->cg_item, &group->cg_item); + + if (parent_group->cg_subsys) + subsys = parent_group->cg_subsys; + else if (configfs_is_root(&parent_group->cg_item)) + subsys = to_configfs_subsystem(group); + else + BUG(); + group->cg_subsys = subsys; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + link_group(group, new_group); + } + } +} + +/* + * The goal is that configfs_attach_item() (and + * configfs_attach_group()) can be called from either the VFS or this + * module. That is, they assume that the items have been created, + * the dentry allocated, and the dcache is all ready to go. + * + * If they fail, they must clean up after themselves as if they + * had never been called. 
The caller (VFS or local function) will + * handle cleaning up the dcache bits. + * + * configfs_detach_group() and configfs_detach_item() behave similarly on + * the way out. They assume that the proper semaphores are held, they + * clean up the configfs items, and they expect their callers will + * handle the dcache bits. + */ +static int configfs_attach_item(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + + ret = configfs_create_dir(item, dentry); + if (!ret) { + ret = populate_attrs(item); + if (ret) { + /* + * We are going to remove an inode and its dentry but + * the VFS may already have hit and used them. Thus, + * we must lock them as rmdir() would. + */ + mutex_lock(&dentry->d_inode->i_mutex); + configfs_remove_dir(item); + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + mutex_unlock(&dentry->d_inode->i_mutex); + d_delete(dentry); + } + } + + return ret; +} + +/* Caller holds the mutex of the item's inode */ +static void configfs_detach_item(struct config_item *item) +{ + detach_attrs(item); + configfs_remove_dir(item); +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + struct configfs_dirent *sd; + + ret = configfs_attach_item(parent_item, item, dentry); + if (!ret) { + sd = dentry->d_fsdata; + sd->s_type |= CONFIGFS_USET_DIR; + + /* + * FYI, we're faking mkdir in populate_groups() + * We must lock the group's inode to avoid races with the VFS + * which can already hit the inode and try to add/remove entries + * under it. + * + * We must also lock the inode to remove it safely in case of + * error, as rmdir() would. + */ + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + configfs_adjust_dir_dirent_depth_before_populate(sd); + ret = populate_groups(to_config_group(item)); + if (ret) { + configfs_detach_item(item); + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + } + configfs_adjust_dir_dirent_depth_after_populate(sd); + mutex_unlock(&dentry->d_inode->i_mutex); + if (ret) + d_delete(dentry); + } + + return ret; +} + +/* Caller holds the mutex of the group's inode */ +static void configfs_detach_group(struct config_item *item) +{ + detach_groups(to_config_group(item)); + configfs_detach_item(item); +} + +/* + * After the item has been detached from the filesystem view, we are + * ready to tear it out of the hierarchy. Notify the client before + * we do that so they can perform any cleanup that requires + * navigating the hierarchy. A client does not need to provide this + * callback. The subsystem semaphore MUST be held by the caller, and + * references must be valid for both items. It also assumes the + * caller has validated ci_type. + */ +static void client_disconnect_notify(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + if (type->ct_group_ops && type->ct_group_ops->disconnect_notify) + type->ct_group_ops->disconnect_notify(to_config_group(parent_item), + item); +} + +/* + * Drop the initial reference from make_item()/make_group() + * This function assumes that reference is held on item + * and that item holds a valid reference to the parent. Also, it + * assumes the caller has validated ci_type. 
+ */ +static void client_drop_item(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + /* + * If ->drop_item() exists, it is responsible for the + * config_item_put(). + */ + if (type->ct_group_ops && type->ct_group_ops->drop_item) + type->ct_group_ops->drop_item(to_config_group(parent_item), + item); + else + config_item_put(item); +} + +#ifdef DEBUG +static void configfs_dump_one(struct configfs_dirent *sd, int level) +{ + printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); + +#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); + type_print(CONFIGFS_ROOT); + type_print(CONFIGFS_DIR); + type_print(CONFIGFS_ITEM_ATTR); + type_print(CONFIGFS_ITEM_LINK); + type_print(CONFIGFS_USET_DIR); + type_print(CONFIGFS_USET_DEFAULT); + type_print(CONFIGFS_USET_DROPPING); +#undef type_print +} + +static int configfs_dump(struct configfs_dirent *sd, int level) +{ + struct configfs_dirent *child_sd; + int ret = 0; + + configfs_dump_one(sd, level); + + if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT))) + return 0; + + list_for_each_entry(child_sd, &sd->s_children, s_sibling) { + ret = configfs_dump(child_sd, level + 2); + if (ret) + break; + } + + return ret; +} +#endif + + +/* + * configfs_depend_item() and configfs_undepend_item() + * + * WARNING: Do not call these from a configfs callback! + * + * This describes these functions and their helpers. + * + * Allow another kernel system to depend on a config_item. If this + * happens, the item cannot go away until the dependent can live without + * it. The idea is to give client modules as simple an interface as + * possible. When a system asks them to depend on an item, they just + * call configfs_depend_item(). If the item is live and the client + * driver is in good shape, we'll happily do the work for them. + * + * Why is the locking complex? Because configfs uses the VFS to handle + * all locking, but this function is called outside the normal + * VFS->configfs path. So it must take VFS locks to prevent the + * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is + * why you can't call these functions underneath configfs callbacks. + * + * Note, btw, that this can be called at *any* time, even when a configfs + * subsystem isn't registered, or when configfs is loading or unloading. + * Just like configfs_register_subsystem(). So we take the same + * precautions. We pin the filesystem. We lock configfs_dirent_lock. + * If we can find the target item in the + * configfs tree, it must be part of the subsystem tree as well, so we + * do not need the subsystem semaphore. Holding configfs_dirent_lock helps + * locking out mkdir() and rmdir(), who might be racing us. + */ + +/* + * configfs_depend_prep() + * + * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are + * attributes. This is similar but not the same to configfs_detach_prep(). + * Note that configfs_detach_prep() expects the parent to be locked when it + * is called, but we lock the parent *inside* configfs_depend_prep(). We + * do that so we can unlock it if we find nothing. + * + * Here we do a depth-first search of the dentry hierarchy looking for + * our object. + * We deliberately ignore items tagged as dropping since they are virtually + * dead, as well as items in the middle of attachment since they virtually + * do not exist yet. This completes the locking out of racing mkdir() and + * rmdir(). 
+ * Note: subdirectories in the middle of attachment start with s_type = + * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When + * CONFIGFS_USET_CREATING is set, we ignore the item. The actual set of + * s_type is in configfs_new_dirent(), which has configfs_dirent_lock. + * + * If the target is not found, -ENOENT is bubbled up. + * + * This adds a requirement that all config_items be unique! + * + * This is recursive. There isn't + * much on the stack, though, so folks that need this function - be careful + * about your stack! Patches will be accepted to make it iterative. + */ +static int configfs_depend_prep(struct dentry *origin, + struct config_item *target) +{ + struct configfs_dirent *child_sd, *sd = origin->d_fsdata; + int ret = 0; + + BUG_ON(!origin || !sd); + + if (sd->s_element == target) /* Boo-yah */ + goto out; + + list_for_each_entry(child_sd, &sd->s_children, s_sibling) { + if ((child_sd->s_type & CONFIGFS_DIR) && + !(child_sd->s_type & CONFIGFS_USET_DROPPING) && + !(child_sd->s_type & CONFIGFS_USET_CREATING)) { + ret = configfs_depend_prep(child_sd->s_dentry, + target); + if (!ret) + goto out; /* Child path boo-yah */ + } + } + + /* We looped all our children and didn't find target */ + ret = -ENOENT; + +out: + return ret; +} + +int configfs_depend_item(struct configfs_subsystem *subsys, + struct config_item *target) +{ + int ret; + struct configfs_dirent *p, *root_sd, *subsys_sd = NULL; + struct config_item *s_item = &subsys->su_group.cg_item; + + /* + * Pin the configfs filesystem. This means we can safely access + * the root of the configfs filesystem. + */ + ret = configfs_pin_fs(); + if (ret) + return ret; + + /* + * Next, lock the root directory. We're going to check that the + * subsystem is really registered, and so we need to lock out + * configfs_[un]register_subsystem(). + */ + mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); + + root_sd = configfs_sb->s_root->d_fsdata; + + list_for_each_entry(p, &root_sd->s_children, s_sibling) { + if (p->s_type & CONFIGFS_DIR) { + if (p->s_element == s_item) { + subsys_sd = p; + break; + } + } + } + + if (!subsys_sd) { + ret = -ENOENT; + goto out_unlock_fs; + } + + /* Ok, now we can trust subsys/s_item */ + + spin_lock(&configfs_dirent_lock); + /* Scan the tree, return 0 if found */ + ret = configfs_depend_prep(subsys_sd->s_dentry, target); + if (ret) + goto out_unlock_dirent_lock; + + /* + * We are sure that the item is not about to be removed by rmdir(), and + * not in the middle of attachment by mkdir(). + */ + p = target->ci_dentry->d_fsdata; + p->s_dependent_count += 1; + +out_unlock_dirent_lock: + spin_unlock(&configfs_dirent_lock); +out_unlock_fs: + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); + + /* + * If we succeeded, the fs is pinned via other methods. If not, + * we're done with it anyway. So release_fs() is always right. + */ + configfs_release_fs(); + + return ret; +} +EXPORT_SYMBOL(configfs_depend_item); + +/* + * Release the dependent linkage. This is much simpler than + * configfs_depend_item() because we know that that the client driver is + * pinned, thus the subsystem is pinned, and therefore configfs is pinned. + */ +void configfs_undepend_item(struct configfs_subsystem *subsys, + struct config_item *target) +{ + struct configfs_dirent *sd; + + /* + * Since we can trust everything is pinned, we just need + * configfs_dirent_lock. 
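+ *
+ * For reference, a dependent kernel system pairs the two calls roughly
+ * like this (an illustrative sketch only, with error handling and the
+ * caller's own locking elided):
+ *
+ *	ret = configfs_depend_item(subsys, item);
+ *	if (ret)
+ *		return ret;
+ *	...use the item; rmdir(2) on it now fails with -EBUSY...
+ *	configfs_undepend_item(subsys, item);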
+ */ + spin_lock(&configfs_dirent_lock); + + sd = target->ci_dentry->d_fsdata; + BUG_ON(sd->s_dependent_count < 1); + + sd->s_dependent_count -= 1; + + /* + * After this unlock, we cannot trust the item to stay alive! + * DO NOT REFERENCE item after this unlock. + */ + spin_unlock(&configfs_dirent_lock); +} +EXPORT_SYMBOL(configfs_undepend_item); + +static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int ret = 0; + int module_got = 0; + struct config_group *group = NULL; + struct config_item *item = NULL; + struct config_item *parent_item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct config_item_type *type; + struct module *subsys_owner = NULL, *new_item_owner = NULL; + char *name; + + if (dentry->d_parent == configfs_sb->s_root) { + ret = -EPERM; + goto out; + } + + sd = dentry->d_parent->d_fsdata; + + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + if (!configfs_dirent_is_ready(sd)) { + ret = -ENOENT; + goto out; + } + + if (!(sd->s_type & CONFIGFS_USET_DIR)) { + ret = -EPERM; + goto out; + } + + /* Get a working ref for the duration of this function */ + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!type || !type->ct_group_ops || + (!type->ct_group_ops->make_group && + !type->ct_group_ops->make_item)) { + ret = -EPERM; /* Lack-of-mkdir returns -EPERM */ + goto out_put; + } + + /* + * The subsystem may belong to a different module than the item + * being created. We don't want to safely pin the new item but + * fail to pin the subsystem it sits under. + */ + if (!subsys->su_group.cg_item.ci_type) { + ret = -EINVAL; + goto out_put; + } + subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; + if (!try_module_get(subsys_owner)) { + ret = -EINVAL; + goto out_put; + } + + name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); + if (!name) { + ret = -ENOMEM; + goto out_subsys_put; + } + + snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); + + mutex_lock(&subsys->su_mutex); + if (type->ct_group_ops->make_group) { + group = type->ct_group_ops->make_group(to_config_group(parent_item), name); + if (!group) + group = ERR_PTR(-ENOMEM); + if (!IS_ERR(group)) { + link_group(to_config_group(parent_item), group); + item = &group->cg_item; + } else + ret = PTR_ERR(group); + } else { + item = type->ct_group_ops->make_item(to_config_group(parent_item), name); + if (!item) + item = ERR_PTR(-ENOMEM); + if (!IS_ERR(item)) + link_obj(parent_item, item); + else + ret = PTR_ERR(item); + } + mutex_unlock(&subsys->su_mutex); + + kfree(name); + if (ret) { + /* + * If ret != 0, then link_obj() was never called. + * There are no extra references to clean up. + */ + goto out_subsys_put; + } + + /* + * link_obj() has been called (via link_group() for groups). + * From here on out, errors must clean that up. + */ + + type = item->ci_type; + if (!type) { + ret = -EINVAL; + goto out_unlink; + } + + new_item_owner = type->ct_owner; + if (!try_module_get(new_item_owner)) { + ret = -EINVAL; + goto out_unlink; + } + + /* + * I hate doing it this way, but if there is + * an error, module_put() probably should + * happen after any cleanup. 
+ */ + module_got = 1; + + /* + * Make racing rmdir() fail if it did not tag parent with + * CONFIGFS_USET_DROPPING + * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will + * fail and let rmdir() terminate correctly + */ + spin_lock(&configfs_dirent_lock); + /* This will make configfs_detach_prep() fail */ + sd->s_type |= CONFIGFS_USET_IN_MKDIR; + spin_unlock(&configfs_dirent_lock); + + if (group) + ret = configfs_attach_group(parent_item, item, dentry); + else + ret = configfs_attach_item(parent_item, item, dentry); + + spin_lock(&configfs_dirent_lock); + sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; + if (!ret) + configfs_dir_set_ready(dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + +out_unlink: + if (ret) { + /* Tear down everything we built up */ + mutex_lock(&subsys->su_mutex); + + client_disconnect_notify(parent_item, item); + if (group) + unlink_group(group); + else + unlink_obj(item); + client_drop_item(parent_item, item); + + mutex_unlock(&subsys->su_mutex); + + if (module_got) + module_put(new_item_owner); + } + +out_subsys_put: + if (ret) + module_put(subsys_owner); + +out_put: + /* + * link_obj()/link_group() took a reference from child->parent, + * so the parent is safely pinned. We can drop our working + * reference. + */ + config_item_put(parent_item); + +out: + return ret; +} + +static int configfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct config_item *parent_item; + struct config_item *item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct module *subsys_owner = NULL, *dead_item_owner = NULL; + int ret; + + if (dentry->d_parent == configfs_sb->s_root) + return -EPERM; + + sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_USET_DEFAULT) + return -EPERM; + + /* Get a working ref until we have the child */ + parent_item = configfs_get_config_item(dentry->d_parent); + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!parent_item->ci_type) { + config_item_put(parent_item); + return -EINVAL; + } + + /* configfs_mkdir() shouldn't have allowed this */ + BUG_ON(!subsys->su_group.cg_item.ci_type); + subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; + + /* + * Ensure that no racing symlink() will make detach_prep() fail while + * the new link is temporarily attached + */ + do { + struct mutex *wait_mutex; + + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); + /* + * Here's where we check for dependents. We're protected by + * configfs_dirent_lock. + * If no dependent, atomically tag the item as dropping. + */ + ret = sd->s_dependent_count ? -EBUSY : 0; + if (!ret) { + ret = configfs_detach_prep(dentry, &wait_mutex); + if (ret) + configfs_detach_rollback(dentry); + } + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + + if (ret) { + if (ret != -EAGAIN) { + config_item_put(parent_item); + return ret; + } + + /* Wait until the racing operation terminates */ + mutex_lock(wait_mutex); + mutex_unlock(wait_mutex); + } + } while (ret == -EAGAIN); + + /* Get a working ref for the duration of this function */ + item = configfs_get_config_item(dentry); + + /* Drop reference from above, item already holds one. 
*/ + config_item_put(parent_item); + + if (item->ci_type) + dead_item_owner = item->ci_type->ct_owner; + + if (sd->s_type & CONFIGFS_USET_DIR) { + configfs_detach_group(item); + + mutex_lock(&subsys->su_mutex); + client_disconnect_notify(parent_item, item); + unlink_group(to_config_group(item)); + } else { + configfs_detach_item(item); + + mutex_lock(&subsys->su_mutex); + client_disconnect_notify(parent_item, item); + unlink_obj(item); + } + + client_drop_item(parent_item, item); + mutex_unlock(&subsys->su_mutex); + + /* Drop our reference from above */ + config_item_put(item); + + module_put(dead_item_owner); + module_put(subsys_owner); + + return 0; +} + +const struct inode_operations configfs_dir_inode_operations = { + .mkdir = configfs_mkdir, + .rmdir = configfs_rmdir, + .symlink = configfs_symlink, + .unlink = configfs_unlink, + .lookup = configfs_lookup, + .setattr = configfs_setattr, +}; + +#if 0 +int configfs_rename_dir(struct config_item * item, const char *new_name) +{ + int error = 0; + struct dentry * new_dentry, * parent; + + if (!strcmp(config_item_name(item), new_name)) + return -EINVAL; + + if (!item->parent) + return -EINVAL; + + down_write(&configfs_rename_sem); + parent = item->parent->dentry; + + mutex_lock(&parent->d_inode->i_mutex); + + new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); + if (!IS_ERR(new_dentry)) { + if (!new_dentry->d_inode) { + error = config_item_set_name(item, "%s", new_name); + if (!error) { + d_add(new_dentry, NULL); + d_move(item->dentry, new_dentry); + } + else + d_delete(new_dentry); + } else + error = -EEXIST; + dput(new_dentry); + } + mutex_unlock(&parent->d_inode->i_mutex); + up_write(&configfs_rename_sem); + + return error; +} +#endif + +static int configfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_path.dentry; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + int err; + + mutex_lock(&dentry->d_inode->i_mutex); + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + err = -ENOENT; + if (configfs_dirent_is_ready(parent_sd)) { + file->private_data = configfs_new_dirent(parent_sd, NULL, 0); + if (IS_ERR(file->private_data)) + err = PTR_ERR(file->private_data); + else + err = 0; + } + mutex_unlock(&dentry->d_inode->i_mutex); + + return err; +} + +static int configfs_dir_close(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_path.dentry; + struct configfs_dirent * cursor = file->private_data; + + mutex_lock(&dentry->d_inode->i_mutex); + spin_lock(&configfs_dirent_lock); + list_del_init(&cursor->s_sibling); + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&dentry->d_inode->i_mutex); + + release_configfs_dirent(cursor); + + return 0; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct configfs_dirent *sd) +{ + return (sd->s_mode >> 12) & 15; +} + +static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + struct configfs_dirent *cursor = filp->private_data; + struct list_head *p, *q = &cursor->s_sibling; + ino_t ino = 0; + int i = filp->f_pos; + + switch (i) { + case 0: + ino = dentry->d_inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; 
+ i++; + /* fallthrough */ + default: + if (filp->f_pos == 2) { + spin_lock(&configfs_dirent_lock); + list_move(q, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); + } + for (p=q->next; p!= &parent_sd->s_children; p=p->next) { + struct configfs_dirent *next; + const char * name; + int len; + struct inode *inode = NULL; + + next = list_entry(p, struct configfs_dirent, + s_sibling); + if (!next->s_element) + continue; + + name = configfs_get_name(next); + len = strlen(name); + + /* + * We'll have a dentry and an inode for + * PINNED items and for open attribute + * files. We lock here to prevent a race + * with configfs_d_iput() clearing + * s_dentry before calling iput(). + * + * Why do we go to the trouble? If + * someone has an attribute file open, + * the inode number should match until + * they close it. Beyond that, we don't + * care. + */ + spin_lock(&configfs_dirent_lock); + dentry = next->s_dentry; + if (dentry) + inode = dentry->d_inode; + if (inode) + ino = inode->i_ino; + spin_unlock(&configfs_dirent_lock); + if (!inode) + ino = iunique(configfs_sb, 2); + + if (filldir(dirent, name, len, filp->f_pos, ino, + dt_type(next)) < 0) + return 0; + + spin_lock(&configfs_dirent_lock); + list_move(q, p); + spin_unlock(&configfs_dirent_lock); + p = q; + filp->f_pos++; + } + } + return 0; +} + +static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) +{ + struct dentry * dentry = file->f_path.dentry; + + mutex_lock(&dentry->d_inode->i_mutex); + switch (origin) { + case 1: + offset += file->f_pos; + case 0: + if (offset >= 0) + break; + default: + mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + if (file->f_pos >= 2) { + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_dirent *cursor = file->private_data; + struct list_head *p; + loff_t n = file->f_pos - 2; + + spin_lock(&configfs_dirent_lock); + list_del(&cursor->s_sibling); + p = sd->s_children.next; + while (n && p != &sd->s_children) { + struct configfs_dirent *next; + next = list_entry(p, struct configfs_dirent, + s_sibling); + if (next->s_element) + n--; + p = p->next; + } + list_add_tail(&cursor->s_sibling, p); + spin_unlock(&configfs_dirent_lock); + } + } + mutex_unlock(&dentry->d_inode->i_mutex); + return offset; +} + +const struct file_operations configfs_dir_operations = { + .open = configfs_dir_open, + .release = configfs_dir_close, + .llseek = configfs_dir_lseek, + .read = generic_read_dir, + .readdir = configfs_readdir, +}; + +int configfs_register_subsystem(struct configfs_subsystem *subsys) +{ + int err; + struct config_group *group = &subsys->su_group; + struct qstr name; + struct dentry *dentry; + struct configfs_dirent *sd; + + err = configfs_pin_fs(); + if (err) + return err; + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + + sd = configfs_sb->s_root->d_fsdata; + link_group(to_config_group(sd->s_element), group); + + mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, + I_MUTEX_PARENT); + + name.name = group->cg_item.ci_name; + name.len = strlen(name.name); + name.hash = full_name_hash(name.name, name.len); + + err = -ENOMEM; + dentry = d_alloc(configfs_sb->s_root, &name); + if (dentry) { + d_add(dentry, NULL); + + err = configfs_attach_group(sd->s_element, &group->cg_item, + dentry); + if (err) { + BUG_ON(dentry->d_inode); + d_drop(dentry); + dput(dentry); + } else { + spin_lock(&configfs_dirent_lock); + 
configfs_dir_set_ready(dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + } + } + + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); + + if (err) { + unlink_group(group); + configfs_release_fs(); + } + + return err; +} + +void configfs_unregister_subsystem(struct configfs_subsystem *subsys) +{ + struct config_group *group = &subsys->su_group; + struct dentry *dentry = group->cg_item.ci_dentry; + + if (dentry->d_parent != configfs_sb->s_root) { + printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); + return; + } + + mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, + I_MUTEX_PARENT); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); + if (configfs_detach_prep(dentry, NULL)) { + printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); + } + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + configfs_detach_group(&group->cg_item); + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + mutex_unlock(&dentry->d_inode->i_mutex); + + d_delete(dentry); + + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); + + dput(dentry); + + unlink_group(group); + configfs_release_fs(); +} + +EXPORT_SYMBOL(configfs_register_subsystem); +EXPORT_SYMBOL(configfs_unregister_subsystem); diff --git a/fs/configfs/file.c b/fs/configfs/file.c new file mode 100644 index 00000000..2b6cb23d --- /dev/null +++ b/fs/configfs/file.c @@ -0,0 +1,344 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c - operations for regular (text) files. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* + * A simple attribute can only be 4096 characters. Why 4k? Because the + * original code limited it to PAGE_SIZE. That's a bad idea, though, + * because an attribute of 16k on ia64 won't work on x86. So we limit to + * 4k, our minimum common page size. + */ +#define SIMPLE_ATTR_SIZE 4096 + +struct configfs_buffer { + size_t count; + loff_t pos; + char * page; + struct configfs_item_operations * ops; + struct mutex mutex; + int needs_read_fill; +}; + + +/** + * fill_read_buffer - allocate and fill buffer from item. + * @dentry: dentry pointer. + * @buffer: data buffer for file. + * + * Allocate @buffer->page, if it hasn't been already, then call the + * config_item's show() method to fill the buffer with this attribute's + * data. + * This is called only once, on the file's first read. 
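+ *
+ * The show() method in question is ct_item_ops->show_attribute() of the
+ * owning item's type.  A minimal implementation (sketch only; foo_item
+ * and its "value" member are invented names) just formats its state
+ * into the page and returns the length:
+ *
+ *	static ssize_t foo_show(struct config_item *item,
+ *				struct configfs_attribute *attr, char *page)
+ *	{
+ *		return sprintf(page, "%d\n", to_foo_item(item)->value);
+ *	}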
+ */ +static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + int ret = 0; + ssize_t count; + + if (!buffer->page) + buffer->page = (char *) get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + count = ops->show_attribute(item,attr,buffer->page); + buffer->needs_read_fill = 0; + BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); + if (count >= 0) + buffer->count = count; + else + ret = count; + return ret; +} + +/** + * configfs_read_file - read an attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. + * + * Userspace wants to read an attribute file. The attribute descriptor + * is in the file's ->d_fsdata. The target item is in the directory's + * ->d_fsdata. + * + * We call fill_read_buffer() to allocate and fill the buffer from the + * item's show() method exactly once (if the read is happening from + * the beginning of the file). That should fill the entire buffer with + * all the data the item has to offer for that attribute. + * We then call flush_read_buffer() to copy the buffer to userspace + * in the increments specified. + */ + +static ssize_t +configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + ssize_t retval = 0; + + mutex_lock(&buffer->mutex); + if (buffer->needs_read_fill) { + if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) + goto out; + } + pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", + __func__, count, *ppos, buffer->page); + retval = simple_read_from_buffer(buf, count, ppos, buffer->page, + buffer->count); +out: + mutex_unlock(&buffer->mutex); + return retval; +} + + +/** + * fill_write_buffer - copy buffer from userspace. + * @buffer: data buffer for file. + * @buf: data from user. + * @count: number of bytes in @userbuf. + * + * Allocate @buffer->page if it hasn't been already, then + * copy the user-supplied buffer into it. + */ + +static int +fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count) +{ + int error; + + if (!buffer->page) + buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0); + if (!buffer->page) + return -ENOMEM; + + if (count >= SIMPLE_ATTR_SIZE) + count = SIMPLE_ATTR_SIZE - 1; + error = copy_from_user(buffer->page,buf,count); + buffer->needs_read_fill = 1; + /* if buf is assumed to contain a string, terminate it by \0, + * so e.g. sscanf() can scan the string easily */ + buffer->page[count] = 0; + return error ? -EFAULT : count; +} + + +/** + * flush_write_buffer - push buffer to config_item. + * @dentry: dentry to the attribute + * @buffer: data buffer for file. + * @count: number of bytes + * + * Get the correct pointers for the config_item and the attribute we're + * dealing with, then call the store() method for the attribute, + * passing the buffer that we acquired in fill_write_buffer(). + */ + +static int +flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + + return ops->store_attribute(item,attr,buffer->page,count); +} + + +/** + * configfs_write_file - write an attribute. 
+ * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Similar to configfs_read_file(), though working in the opposite direction. + * We allocate and fill the data from the user in fill_write_buffer(), + * then push it to the config_item in flush_write_buffer(). + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come + * on the first write. + * Hint: if you're writing a value, first read the file, modify only the + * the value you're changing, then write entire buffer back. + */ + +static ssize_t +configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + ssize_t len; + + mutex_lock(&buffer->mutex); + len = fill_write_buffer(buffer, buf, count); + if (len > 0) + len = flush_write_buffer(file->f_path.dentry, buffer, count); + if (len > 0) + *ppos += len; + mutex_unlock(&buffer->mutex); + return len; +} + +static int check_perm(struct inode * inode, struct file * file) +{ + struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); + struct configfs_attribute * attr = to_attr(file->f_path.dentry); + struct configfs_buffer * buffer; + struct configfs_item_operations * ops = NULL; + int error = 0; + + if (!item || !attr) + goto Einval; + + /* Grab the module reference for this attribute if we have one */ + if (!try_module_get(attr->ca_owner)) { + error = -ENODEV; + goto Done; + } + + if (item->ci_type) + ops = item->ci_type->ct_item_ops; + else + goto Eaccess; + + /* File needs write support. + * The inode's perms must say it's ok, + * and we must have a store method. + */ + if (file->f_mode & FMODE_WRITE) { + + if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute) + goto Eaccess; + + } + + /* File needs read support. + * The inode's perms must say it's ok, and we there + * must be a show method for it. + */ + if (file->f_mode & FMODE_READ) { + if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute) + goto Eaccess; + } + + /* No error? Great, allocate a buffer for the file, and store it + * it in file->private_data for easy access. + */ + buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL); + if (!buffer) { + error = -ENOMEM; + goto Enomem; + } + mutex_init(&buffer->mutex); + buffer->needs_read_fill = 1; + buffer->ops = ops; + file->private_data = buffer; + goto Done; + + Einval: + error = -EINVAL; + goto Done; + Eaccess: + error = -EACCES; + Enomem: + module_put(attr->ca_owner); + Done: + if (error && item) + config_item_put(item); + return error; +} + +static int configfs_open_file(struct inode * inode, struct file * filp) +{ + return check_perm(inode,filp); +} + +static int configfs_release(struct inode * inode, struct file * filp) +{ + struct config_item * item = to_item(filp->f_path.dentry->d_parent); + struct configfs_attribute * attr = to_attr(filp->f_path.dentry); + struct module * owner = attr->ca_owner; + struct configfs_buffer * buffer = filp->private_data; + + if (item) + config_item_put(item); + /* After this point, attr should not be accessed. 
*/ + module_put(owner); + + if (buffer) { + if (buffer->page) + free_page((unsigned long)buffer->page); + mutex_destroy(&buffer->mutex); + kfree(buffer); + } + return 0; +} + +const struct file_operations configfs_file_operations = { + .read = configfs_read_file, + .write = configfs_write_file, + .llseek = generic_file_llseek, + .open = configfs_open_file, + .release = configfs_release, +}; + + +int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type) +{ + struct configfs_dirent * parent_sd = dir->d_fsdata; + umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; + int error = 0; + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); + mutex_unlock(&dir->d_inode->i_mutex); + + return error; +} + + +/** + * configfs_create_file - create an attribute file for an item. + * @item: item we're creating for. + * @attr: atrribute descriptor. + */ + +int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr) +{ + BUG_ON(!item || !item->ci_dentry || !attr); + + return configfs_add_file(item->ci_dentry, attr, + CONFIGFS_ITEM_ATTR); +} + diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c new file mode 100644 index 00000000..c83f4768 --- /dev/null +++ b/fs/configfs/inode.c @@ -0,0 +1,297 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c - basic inode and dentry operations. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Please see Documentation/filesystems/configfs.txt for more information. 
+ */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key default_group_class[MAX_LOCK_DEPTH]; +#endif + +extern struct super_block * configfs_sb; + +static const struct address_space_operations configfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static struct backing_dev_info configfs_backing_dev_info = { + .name = "configfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static const struct inode_operations configfs_inode_operations ={ + .setattr = configfs_setattr, +}; + +int configfs_setattr(struct dentry * dentry, struct iattr * iattr) +{ + struct inode * inode = dentry->d_inode; + struct configfs_dirent * sd = dentry->d_fsdata; + struct iattr * sd_iattr; + unsigned int ia_valid = iattr->ia_valid; + int error; + + if (!sd) + return -EINVAL; + + sd_iattr = sd->s_iattr; + if (!sd_iattr) { + /* setting attributes for the first time, allocate now */ + sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); + if (!sd_iattr) + return -ENOMEM; + /* assign default attributes */ + sd_iattr->ia_mode = sd->s_mode; + sd_iattr->ia_uid = 0; + sd_iattr->ia_gid = 0; + sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; + sd->s_iattr = sd_iattr; + } + /* attributes were changed atleast once in past */ + + error = simple_setattr(dentry, iattr); + if (error) + return error; + + if (ia_valid & ATTR_UID) + sd_iattr->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + sd_iattr->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + sd_iattr->ia_mode = sd->s_mode = mode; + } + + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_mode = iattr->ia_mode; + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) +{ + struct inode * inode = new_inode(configfs_sb); + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mapping->a_ops = &configfs_aops; + inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + inode->i_op = &configfs_inode_operations; + + if (sd->s_iattr) { + /* sysfs_dirent has non-default attributes + * get them for the new inode from persistent copy + * in sysfs_dirent + */ + set_inode_attr(inode, sd->s_iattr); + } else + set_default_inode_attr(inode, mode); + } + return inode; +} + +#ifdef CONFIG_LOCKDEP + +static void configfs_set_inode_lock_class(struct configfs_dirent *sd, + struct inode *inode) +{ + int depth = sd->s_depth; + + if (depth > 0) { + if (depth <= ARRAY_SIZE(default_group_class)) { + lockdep_set_class(&inode->i_mutex, + 
&default_group_class[depth - 1]); + } else { + /* + * In practice the maximum level of locking depth is + * already reached. Just inform about possible reasons. + */ + printk(KERN_INFO "configfs: Too many levels of inodes" + " for the locking correctness validator.\n"); + printk(KERN_INFO "Spurious warnings may appear.\n"); + } + } +} + +#else /* CONFIG_LOCKDEP */ + +static void configfs_set_inode_lock_class(struct configfs_dirent *sd, + struct inode *inode) +{ +} + +#endif /* CONFIG_LOCKDEP */ + +int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) +{ + int error = 0; + struct inode * inode = NULL; + if (dentry) { + if (!dentry->d_inode) { + struct configfs_dirent *sd = dentry->d_fsdata; + if ((inode = configfs_new_inode(mode, sd))) { + if (dentry->d_parent && dentry->d_parent->d_inode) { + struct inode *p_inode = dentry->d_parent->d_inode; + p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; + } + configfs_set_inode_lock_class(sd, inode); + goto Proceed; + } + else + error = -ENOMEM; + } else + error = -EEXIST; + } else + error = -ENOENT; + goto Done; + + Proceed: + if (init) + error = init(inode); + if (!error) { + d_instantiate(dentry, inode); + if (S_ISDIR(mode) || S_ISLNK(mode)) + dget(dentry); /* pin link and directory dentries in core */ + } else + iput(inode); + Done: + return error; +} + +/* + * Get the name for corresponding element represented by the given configfs_dirent + */ +const unsigned char * configfs_get_name(struct configfs_dirent *sd) +{ + struct configfs_attribute *attr; + + BUG_ON(!sd || !sd->s_element); + + /* These always have a dentry, so use that */ + if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) + return sd->s_dentry->d_name.name; + + if (sd->s_type & CONFIGFS_ITEM_ATTR) { + attr = sd->s_element; + return attr->ca_name; + } + return NULL; +} + + +/* + * Unhashes the dentry corresponding to given configfs_dirent + * Called with parent inode's i_mutex held. 
+ */ +void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) +{ + struct dentry * dentry = sd->s_dentry; + + if (dentry) { + spin_lock(&dentry->d_lock); + if (!(d_unhashed(dentry) && dentry->d_inode)) { + dget_dlock(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + simple_unlink(parent->d_inode, dentry); + } else + spin_unlock(&dentry->d_lock); + } +} + +void configfs_hash_and_remove(struct dentry * dir, const char * name) +{ + struct configfs_dirent * sd; + struct configfs_dirent * parent_sd = dir->d_fsdata; + + if (dir->d_inode == NULL) + /* no inode means this hasn't been made visible yet */ + return; + + mutex_lock(&dir->d_inode->i_mutex); + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (!sd->s_element) + continue; + if (!strcmp(configfs_get_name(sd), name)) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_drop_dentry(sd, dir); + configfs_put(sd); + break; + } + } + mutex_unlock(&dir->d_inode->i_mutex); +} + +int __init configfs_inode_init(void) +{ + return bdi_init(&configfs_backing_dev_info); +} + +void __exit configfs_inode_exit(void) +{ + bdi_destroy(&configfs_backing_dev_info); +} diff --git a/fs/configfs/item.c b/fs/configfs/item.c new file mode 100644 index 00000000..76dc4c3e --- /dev/null +++ b/fs/configfs/item.c @@ -0,0 +1,216 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * item.c - library routines for handling generic config items + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on kobject: + * kobject is Copyright (c) 2002-2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Please see the file Documentation/filesystems/configfs.txt for + * critical information about using the config_item interface. + */ + +#include +#include +#include +#include + +#include + + +static inline struct config_item * to_item(struct list_head * entry) +{ + return container_of(entry,struct config_item,ci_entry); +} + +/* Evil kernel */ +static void config_item_release(struct kref *kref); + +/** + * config_item_init - initialize item. + * @item: item in question. + */ +void config_item_init(struct config_item * item) +{ + kref_init(&item->ci_kref); + INIT_LIST_HEAD(&item->ci_entry); +} + +/** + * config_item_set_name - Set the name of an item + * @item: item. + * @name: name. + * + * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a + * dynamically allocated string that @item->ci_name points to. + * Otherwise, use the static @item->ci_namebuf array. + */ +int config_item_set_name(struct config_item * item, const char * fmt, ...) 
+{ + int error = 0; + int limit = CONFIGFS_ITEM_NAME_LEN; + int need; + va_list args; + char * name; + + /* + * First, try the static array + */ + va_start(args,fmt); + need = vsnprintf(item->ci_namebuf,limit,fmt,args); + va_end(args); + if (need < limit) + name = item->ci_namebuf; + else { + /* + * Need more space? Allocate it and try again + */ + limit = need + 1; + name = kmalloc(limit,GFP_KERNEL); + if (!name) { + error = -ENOMEM; + goto Done; + } + va_start(args,fmt); + need = vsnprintf(name,limit,fmt,args); + va_end(args); + + /* Still? Give up. */ + if (need >= limit) { + kfree(name); + error = -EFAULT; + goto Done; + } + } + + /* Free the old name, if necessary. */ + if (item->ci_name && item->ci_name != item->ci_namebuf) + kfree(item->ci_name); + + /* Now, set the new name */ + item->ci_name = name; + Done: + return error; +} + +EXPORT_SYMBOL(config_item_set_name); + +void config_item_init_type_name(struct config_item *item, + const char *name, + struct config_item_type *type) +{ + config_item_set_name(item, name); + item->ci_type = type; + config_item_init(item); +} +EXPORT_SYMBOL(config_item_init_type_name); + +void config_group_init_type_name(struct config_group *group, const char *name, + struct config_item_type *type) +{ + config_item_set_name(&group->cg_item, name); + group->cg_item.ci_type = type; + config_group_init(group); +} +EXPORT_SYMBOL(config_group_init_type_name); + +struct config_item * config_item_get(struct config_item * item) +{ + if (item) + kref_get(&item->ci_kref); + return item; +} + +static void config_item_cleanup(struct config_item * item) +{ + struct config_item_type * t = item->ci_type; + struct config_group * s = item->ci_group; + struct config_item * parent = item->ci_parent; + + pr_debug("config_item %s: cleaning up\n",config_item_name(item)); + if (item->ci_name != item->ci_namebuf) + kfree(item->ci_name); + item->ci_name = NULL; + if (t && t->ct_item_ops && t->ct_item_ops->release) + t->ct_item_ops->release(item); + if (s) + config_group_put(s); + if (parent) + config_item_put(parent); +} + +static void config_item_release(struct kref *kref) +{ + config_item_cleanup(container_of(kref, struct config_item, ci_kref)); +} + +/** + * config_item_put - decrement refcount for item. + * @item: item. + * + * Decrement the refcount, and if 0, call config_item_cleanup(). + */ +void config_item_put(struct config_item * item) +{ + if (item) + kref_put(&item->ci_kref, config_item_release); +} + +/** + * config_group_init - initialize a group for use + * @k: group + */ +void config_group_init(struct config_group *group) +{ + config_item_init(&group->cg_item); + INIT_LIST_HEAD(&group->cg_children); +} + +/** + * config_group_find_item - search for item in group. + * @group: group we're looking in. + * @name: item's name. + * + * Iterate over @group->cg_list, looking for a matching config_item. + * If matching item is found take a reference and return the item. + * Caller must have locked group via @group->cg_subsys->su_mtx. 
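+ *
+ * A typical caller (sketch only) takes the subsystem mutex around the
+ * lookup and drops the returned reference with config_item_put() when
+ * it is done with the item:
+ *
+ *	mutex_lock(&group->cg_subsys->su_mutex);
+ *	item = config_group_find_item(group, "name");
+ *	mutex_unlock(&group->cg_subsys->su_mutex);
+ *	if (item) {
+ *		...use the item...
+ *		config_item_put(item);
+ *	}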
+ */ +struct config_item *config_group_find_item(struct config_group *group, + const char *name) +{ + struct list_head * entry; + struct config_item * ret = NULL; + + list_for_each(entry,&group->cg_children) { + struct config_item * item = to_item(entry); + if (config_item_name(item) && + !strcmp(config_item_name(item), name)) { + ret = config_item_get(item); + break; + } + } + return ret; +} + +EXPORT_SYMBOL(config_item_init); +EXPORT_SYMBOL(config_group_init); +EXPORT_SYMBOL(config_item_get); +EXPORT_SYMBOL(config_item_put); +EXPORT_SYMBOL(config_group_find_item); diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c new file mode 100644 index 00000000..ecc62178 --- /dev/null +++ b/fs/configfs/mount.c @@ -0,0 +1,187 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mount.c - operations for initializing and mounting configfs. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* Random magic number */ +#define CONFIGFS_MAGIC 0x62656570 + +struct vfsmount * configfs_mount = NULL; +struct super_block * configfs_sb = NULL; +struct kmem_cache *configfs_dir_cachep; +static int configfs_mnt_count = 0; + +static const struct super_operations configfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, +}; + +static struct config_group configfs_root_group = { + .cg_item = { + .ci_namebuf = "root", + .ci_name = configfs_root_group.cg_item.ci_namebuf, + }, +}; + +int configfs_is_root(struct config_item *item) +{ + return item == &configfs_root_group.cg_item; +} + +static struct configfs_dirent configfs_root = { + .s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling), + .s_children = LIST_HEAD_INIT(configfs_root.s_children), + .s_element = &configfs_root_group.cg_item, + .s_type = CONFIGFS_ROOT, + .s_iattr = NULL, +}; + +static int configfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = CONFIGFS_MAGIC; + sb->s_op = &configfs_ops; + sb->s_time_gran = 1; + configfs_sb = sb; + + inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &configfs_root); + if (inode) { + inode->i_op = &configfs_dir_inode_operations; + inode->i_fop = &configfs_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ + inc_nlink(inode); + } else { + pr_debug("configfs: could not get root inode\n"); + return -ENOMEM; + } + + root = d_alloc_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n",__func__); + iput(inode); + return -ENOMEM; + } + config_group_init(&configfs_root_group); + configfs_root_group.cg_item.ci_dentry = root; + root->d_fsdata = &configfs_root; + sb->s_root = root; + sb->s_d_op = &configfs_dentry_ops; /* the rest get that */ + return 0; +} + +static struct dentry *configfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, configfs_fill_super); +} + +static struct file_system_type configfs_fs_type = { + .owner = THIS_MODULE, + .name = "configfs", + .mount = configfs_do_mount, + .kill_sb = kill_litter_super, +}; + +int configfs_pin_fs(void) +{ + return simple_pin_fs(&configfs_fs_type, &configfs_mount, + &configfs_mnt_count); +} + +void configfs_release_fs(void) +{ + simple_release_fs(&configfs_mount, &configfs_mnt_count); +} + + +static struct kobject *config_kobj; + +static int __init configfs_init(void) +{ + int err = -ENOMEM; + + configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", + sizeof(struct configfs_dirent), + 0, 0, NULL); + if (!configfs_dir_cachep) + goto out; + + config_kobj = kobject_create_and_add("config", kernel_kobj); + if (!config_kobj) { + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + goto out; + } + + err = register_filesystem(&configfs_fs_type); + if (err) { + printk(KERN_ERR "configfs: Unable to register filesystem!\n"); + kobject_put(config_kobj); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + goto out; + } + + err = configfs_inode_init(); + if (err) { + unregister_filesystem(&configfs_fs_type); + kobject_put(config_kobj); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + } +out: + return err; +} + +static void __exit configfs_exit(void) +{ + unregister_filesystem(&configfs_fs_type); + kobject_put(config_kobj); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + configfs_inode_exit(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.0.2"); +MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); + +module_init(configfs_init); +module_exit(configfs_exit); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c new file mode 100644 index 00000000..0f3eb41d --- /dev/null +++ b/fs/configfs/symlink.c @@ -0,0 +1,320 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.c - operations for configfs symlinks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* Protects attachments of new symlinks */ +DEFINE_MUTEX(configfs_symlink_mutex); + +static int item_depth(struct config_item * item) +{ + struct config_item * p = item; + int depth = 0; + do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p)); + return depth; +} + +static int item_path_length(struct config_item * item) +{ + struct config_item * p = item; + int length = 1; + do { + length += strlen(config_item_name(p)) + 1; + p = p->ci_parent; + } while (p && !configfs_is_root(p)); + return length; +} + +static void fill_item_path(struct config_item * item, char * buffer, int length) +{ + struct config_item * p; + + --length; + for (p = item; p && !configfs_is_root(p); p = p->ci_parent) { + int cur = strlen(config_item_name(p)); + + /* back up enough to print this bus id with '/' */ + length -= cur; + strncpy(buffer + length,config_item_name(p),cur); + *(buffer + --length) = '/'; + } +} + +static int create_link(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata; + struct configfs_symlink *sl; + int ret; + + ret = -ENOENT; + if (!configfs_dirent_is_ready(target_sd)) + goto out; + ret = -ENOMEM; + sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); + if (sl) { + sl->sl_target = config_item_get(item); + spin_lock(&configfs_dirent_lock); + if (target_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + config_item_put(item); + kfree(sl); + return -ENOENT; + } + list_add(&sl->sl_list, &target_sd->s_links); + spin_unlock(&configfs_dirent_lock); + ret = configfs_create_link(sl, parent_item->ci_dentry, + dentry); + if (ret) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); + config_item_put(item); + kfree(sl); + } + } + +out: + return ret; +} + + +static int get_target(const char *symname, struct path *path, + struct config_item **target) +{ + int ret; + + ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path); + if (!ret) { + if (path->dentry->d_sb == configfs_sb) { + *target = configfs_get_config_item(path->dentry); + if (!*target) { + ret = -ENOENT; + path_put(path); + } + } else { + ret = -EPERM; + path_put(path); + } + } + + return ret; +} + + +int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + int ret; + struct path path; + struct configfs_dirent *sd; + struct config_item *parent_item; + struct config_item *target_item = NULL; + struct config_item_type *type; + + ret = -EPERM; /* What lack-of-symlink returns */ + if (dentry->d_parent == configfs_sb->s_root) + goto out; + + sd = dentry->d_parent->d_fsdata; + /* + * Fake invisibility if dir belongs to a group/default groups hierarchy + * being attached + */ + ret = -ENOENT; + if (!configfs_dirent_is_ready(sd)) + goto out; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + + ret = -EPERM; + if (!type || !type->ct_item_ops || + !type->ct_item_ops->allow_link) + goto out_put; + + ret = get_target(symname, &path, &target_item); + if (ret) + goto out_put; + + ret = type->ct_item_ops->allow_link(parent_item, target_item); + if (!ret) { + mutex_lock(&configfs_symlink_mutex); + ret = create_link(parent_item, target_item, 
dentry); + mutex_unlock(&configfs_symlink_mutex); + if (ret && type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + target_item); + } + + config_item_put(target_item); + path_put(&path); + +out_put: + config_item_put(parent_item); + +out: + return ret; +} + +int configfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_symlink *sl; + struct config_item *parent_item; + struct config_item_type *type; + int ret; + + ret = -EPERM; /* What lack-of-symlink returns */ + if (!(sd->s_type & CONFIGFS_ITEM_LINK)) + goto out; + + BUG_ON(dentry->d_parent == configfs_sb->s_root); + + sl = sd->s_element; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_drop_dentry(sd, dentry->d_parent); + dput(dentry); + configfs_put(sd); + + /* + * drop_link() must be called before + * list_del_init(&sl->sl_list), so that the order of + * drop_link(this, target) and drop_item(target) is preserved. + */ + if (type && type->ct_item_ops && + type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + sl->sl_target); + + spin_lock(&configfs_dirent_lock); + list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); + + /* Put reference from create_link() */ + config_item_put(sl->sl_target); + kfree(sl); + + config_item_put(parent_item); + + ret = 0; + +out: + return ret; +} + +static int configfs_get_target_path(struct config_item * item, struct config_item * target, + char *path) +{ + char * s; + int depth, size; + + depth = item_depth(item); + size = item_path_length(target) + depth * 3 - 1; + if (size > PATH_MAX) + return -ENAMETOOLONG; + + pr_debug("%s: depth = %d, size = %d\n", __func__, depth, size); + + for (s = path; depth--; s += 3) + strcpy(s,"../"); + + fill_item_path(target, path, size); + pr_debug("%s: path = '%s'\n", __func__, path); + + return 0; +} + +static int configfs_getlink(struct dentry *dentry, char * path) +{ + struct config_item *item, *target_item; + int error = 0; + + item = configfs_get_config_item(dentry->d_parent); + if (!item) + return -EINVAL; + + target_item = configfs_get_config_item(dentry); + if (!target_item) { + config_item_put(item); + return -EINVAL; + } + + down_read(&configfs_rename_sem); + error = configfs_get_target_path(item, target_item, path); + up_read(&configfs_rename_sem); + + config_item_put(item); + config_item_put(target_item); + return error; + +} + +static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + + if (page) { + error = configfs_getlink(dentry, (char *)page); + if (!error) { + nd_set_link(nd, (char *)page); + return (void *)page; + } + } + + nd_set_link(nd, ERR_PTR(error)); + return NULL; +} + +static void configfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + if (cookie) { + unsigned long page = (unsigned long)cookie; + free_page(page); + } +} + +const struct inode_operations configfs_symlink_inode_operations = { + .follow_link = configfs_follow_link, + .readlink = generic_readlink, + .put_link = configfs_put_link, + .setattr = configfs_setattr, +}; + diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig new file mode 100644 index 00000000..cd06466f --- /dev/null +++ b/fs/cramfs/Kconfig @@ -0,0 +1,19 @@ +config CRAMFS + tristate "Compressed ROM file system 
support (cramfs)" + depends on BLOCK + select ZLIB_INFLATE + help + Saying Y here includes support for CramFs (Compressed ROM File + System). CramFs is designed to be a simple, small, and compressed + file system for ROM based embedded systems. CramFs is read-only, + limited to 256MB file systems (with 16MB files), and doesn't support + 16/32 bits uid/gid, hard links and timestamps. + + See and + for further information. + + To compile this as a module, choose M here: the module will be called + cramfs. Note that the root file system (the one containing the + directory /) cannot be compiled as a module. + + If unsure, say N. diff --git a/fs/cramfs/Makefile b/fs/cramfs/Makefile new file mode 100644 index 00000000..92ebb464 --- /dev/null +++ b/fs/cramfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux cramfs routines. +# + +obj-$(CONFIG_CRAMFS) += cramfs.o + +cramfs-objs := inode.o uncompress.o diff --git a/fs/cramfs/README b/fs/cramfs/README new file mode 100644 index 00000000..445d1c2d --- /dev/null +++ b/fs/cramfs/README @@ -0,0 +1,168 @@ +Notes on Filesystem Layout +-------------------------- + +These notes describe what mkcramfs generates. Kernel requirements are +a bit looser, e.g. it doesn't care if the items are +swapped around (though it does care that directory entries (inodes) in +a given directory are contiguous, as this is used by readdir). + +All data is currently in host-endian format; neither mkcramfs nor the +kernel ever do swabbing. (See section `Block Size' below.) + +: + + + + +: struct cramfs_super (see cramfs_fs.h). + +: + For each file: + struct cramfs_inode (see cramfs_fs.h). + Filename. Not generally null-terminated, but it is + null-padded to a multiple of 4 bytes. + +The order of inode traversal is described as "width-first" (not to be +confused with breadth-first); i.e. like depth-first but listing all of +a directory's entries before recursing down its subdirectories: the +same order as `ls -AUR' (but without the /^\..*:$/ directory header +lines); put another way, the same order as `find -type d -exec +ls -AU1 {} \;'. + +Beginning in 2.4.7, directory entries are sorted. This optimization +allows cramfs_lookup to return more quickly when a filename does not +exist, speeds up user-space directory sorts, etc. + +: + One for each file that's either a symlink or a + regular file of non-zero st_size. + +: + nblocks * + (where nblocks = (st_size - 1) / blksize + 1) + nblocks * + padding to multiple of 4 bytes + +The i'th for a file stores the byte offset of the +*end* of the i'th (i.e. one past the last byte, which is the +same as the start of the (i+1)'th if there is one). The first + immediately follows the last for the file. +s are each 32 bits long. + +The order of 's is a depth-first descent of the directory +tree, i.e. the same order as `find -size +0 \( -type f -o -type l \) +-print'. + + +: The i'th is the output of zlib's compress function +applied to the i'th blksize-sized chunk of the input data. +(For the last of the file, the input may of course be smaller.) +Each may be a different size. (See above.) +s are merely byte-aligned, not generally u32-aligned. + + +Holes +----- + +This kernel supports cramfs holes (i.e. [efficient representation of] +blocks in uncompressed data consisting entirely of NUL bytes), but by +default mkcramfs doesn't test for & create holes, since cramfs in +kernels up to at least 2.3.39 didn't support holes. Run mkcramfs +with -z if you want it to create files that can have holes in them. 
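+
+As a rough illustration of the layout described above (this helper is
+not part of mkcramfs or the kernel; the name block_extent() and its
+arguments are invented for the example, and the kernel's u32 type is
+assumed), the extent of the i'th compressed block can be recovered
+from the block pointer table like this, since each pointer stores the
+end offset of its block and the first block starts immediately after
+the last pointer:
+
+	/*
+	 * blkptr[]     - the file's nblocks 32-bit end offsets
+	 * blkptr_start - byte offset of blkptr[0] within the image
+	 * A zero-length extent (*end == *start) is a hole of NUL bytes.
+	 */
+	static void block_extent(const u32 *blkptr, u32 blkptr_start,
+				 u32 nblocks, u32 i, u32 *start, u32 *end)
+	{
+		*start = i ? blkptr[i - 1]
+			   : blkptr_start + nblocks * sizeof(u32);
+		*end = blkptr[i];
+	}
+
+This is essentially the start_offset/compr_len computation that
+cramfs_readpage() performs for each page.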
+ + +Tools +----- + +The cramfs user-space tools, including mkcramfs and cramfsck, are +located at . + + +Future Development +================== + +Block Size +---------- + +(Block size in cramfs refers to the size of input data that is +compressed at a time. It's intended to be somewhere around +PAGE_CACHE_SIZE for cramfs_readpage's convenience.) + +The superblock ought to indicate the block size that the fs was +written for, since comments in indicate that +PAGE_CACHE_SIZE may grow in future (if I interpret the comment +correctly). + +Currently, mkcramfs #define's PAGE_CACHE_SIZE as 4096 and uses that +for blksize, whereas Linux-2.3.39 uses its PAGE_CACHE_SIZE, which in +turn is defined as PAGE_SIZE (which can be as large as 32KB on arm). +This discrepancy is a bug, though it's not clear which should be +changed. + +One option is to change mkcramfs to take its PAGE_CACHE_SIZE from +. Personally I don't like this option, but it does +require the least amount of change: just change `#define +PAGE_CACHE_SIZE (4096)' to `#include '. The disadvantage +is that the generated cramfs cannot always be shared between different +kernels, not even necessarily kernels of the same architecture if +PAGE_CACHE_SIZE is subject to change between kernel versions +(currently possible with arm and ia64). + +The remaining options try to make cramfs more sharable. + +One part of that is addressing endianness. The two options here are +`always use little-endian' (like ext2fs) or `writer chooses +endianness; kernel adapts at runtime'. Little-endian wins because of +code simplicity and little CPU overhead even on big-endian machines. + +The cost of swabbing is changing the code to use the le32_to_cpu +etc. macros as used by ext2fs. We don't need to swab the compressed +data, only the superblock, inodes and block pointers. + + +The other part of making cramfs more sharable is choosing a block +size. The options are: + + 1. Always 4096 bytes. + + 2. Writer chooses blocksize; kernel adapts but rejects blocksize > + PAGE_CACHE_SIZE. + + 3. Writer chooses blocksize; kernel adapts even to blocksize > + PAGE_CACHE_SIZE. + +It's easy enough to change the kernel to use a smaller value than +PAGE_CACHE_SIZE: just make cramfs_readpage read multiple blocks. + +The cost of option 1 is that kernels with a larger PAGE_CACHE_SIZE +value don't get as good compression as they can. + +The cost of option 2 relative to option 1 is that the code uses +variables instead of #define'd constants. The gain is that people +with kernels having larger PAGE_CACHE_SIZE can make use of that if +they don't mind their cramfs being inaccessible to kernels with +smaller PAGE_CACHE_SIZE values. + +Option 3 is easy to implement if we don't mind being CPU-inefficient: +e.g. get readpage to decompress to a buffer of size MAX_BLKSIZE (which +must be no larger than 32KB) and discard what it doesn't need. +Getting readpage to read into all the covered pages is harder. + +The main advantage of option 3 over 1, 2, is better compression. The +cost is greater complexity. Probably not worth it, but I hope someone +will disagree. (If it is implemented, then I'll re-use that code in +e2compr.) + + +Another cost of 2 and 3 over 1 is making mkcramfs use a different +block size, but that just means adding and parsing a -b option. + + +Inode Size +---------- + +Given that cramfs will probably be used for CDs etc. as well as just +silicon ROMs, it might make sense to expand the inode a little from +its current 12 bytes. 
Inodes other than the root inode are followed +by filename, so the expansion doesn't even have to be a multiple of 4 +bytes. diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c new file mode 100644 index 00000000..739fb59b --- /dev/null +++ b/fs/cramfs/inode.c @@ -0,0 +1,601 @@ +/* + * Compressed rom filesystem for Linux. + * + * Copyright (C) 1999 Linus Torvalds. + * + * This file is released under the GPL. + */ + +/* + * These are the VFS interfaces to the compressed rom filesystem. + * The actual compression is based on zlib, see the other files. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const struct super_operations cramfs_ops; +static const struct inode_operations cramfs_dir_inode_operations; +static const struct file_operations cramfs_directory_operations; +static const struct address_space_operations cramfs_aops; + +static DEFINE_MUTEX(read_mutex); + + +/* These macros may change in future, to provide better st_ino semantics. */ +#define OFFSET(x) ((x)->i_ino) + +static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset) +{ + if (!cino->offset) + return offset + 1; + if (!cino->size) + return offset + 1; + + /* + * The file mode test fixes buggy mkcramfs implementations where + * cramfs_inode->offset is set to a non zero value for entries + * which did not contain data, like devices node and fifos. + */ + switch (cino->mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + return cino->offset << 2; + default: + break; + } + return offset + 1; +} + +static struct inode *get_cramfs_inode(struct super_block *sb, + const struct cramfs_inode *cramfs_inode, unsigned int offset) +{ + struct inode *inode; + static struct timespec zerotime; + + inode = iget_locked(sb, cramino(cramfs_inode, offset)); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + switch (cramfs_inode->mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + break; + case S_IFDIR: + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + break; + default: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); + } + + inode->i_mode = cramfs_inode->mode; + inode->i_uid = cramfs_inode->uid; + inode->i_gid = cramfs_inode->gid; + + /* if the lower 2 bits are zero, the inode contains data */ + if (!(inode->i_ino & 3)) { + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + } + + /* Struct copy intentional */ + inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; + /* inode->i_nlink is left 1 - arguably wrong for directories, + but it's the best we can do without reading the directory + contents. 1 yields the right result in GNU find, even + without -noleaf option. */ + + unlock_new_inode(inode); + + return inode; +} + +/* + * We have our own block cache: don't fill up the buffer cache + * with the rom-image, because the way the filesystem is set + * up the accesses should be fairly regular and cached in the + * page cache and dentry tree anyway.. + * + * This also acts as a way to guarantee contiguous areas of up to + * BLKS_PER_BUF*PAGE_CACHE_SIZE, so that the caller doesn't need to + * worry about end-of-buffer issues even when decompressing a full + * page cache. 
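+ *
+ * (With READ_BUFFERS = 2 and BLKS_PER_BUF_SHIFT = 2, defined below,
+ * each of the two buffers spans four page-cache pages, leaving room
+ * for a block that grew slightly under compression.)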
+ */ +#define READ_BUFFERS (2) +/* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */ +#define NEXT_BUFFER(_ix) ((_ix) ^ 1) + +/* + * BLKS_PER_BUF_SHIFT should be at least 2 to allow for "compressed" + * data that takes up more space than the original and with unlucky + * alignment. + */ +#define BLKS_PER_BUF_SHIFT (2) +#define BLKS_PER_BUF (1 << BLKS_PER_BUF_SHIFT) +#define BUFFER_SIZE (BLKS_PER_BUF*PAGE_CACHE_SIZE) + +static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; +static unsigned buffer_blocknr[READ_BUFFERS]; +static struct super_block * buffer_dev[READ_BUFFERS]; +static int next_buffer; + +/* + * Returns a pointer to a buffer containing at least LEN bytes of + * filesystem starting at byte offset OFFSET into the filesystem. + */ +static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len) +{ + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct page *pages[BLKS_PER_BUF]; + unsigned i, blocknr, buffer; + unsigned long devsize; + char *data; + + if (!len) + return NULL; + blocknr = offset >> PAGE_CACHE_SHIFT; + offset &= PAGE_CACHE_SIZE - 1; + + /* Check if an existing buffer already has the data.. */ + for (i = 0; i < READ_BUFFERS; i++) { + unsigned int blk_offset; + + if (buffer_dev[i] != sb) + continue; + if (blocknr < buffer_blocknr[i]) + continue; + blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_CACHE_SHIFT; + blk_offset += offset; + if (blk_offset + len > BUFFER_SIZE) + continue; + return read_buffers[i] + blk_offset; + } + + devsize = mapping->host->i_size >> PAGE_CACHE_SHIFT; + + /* Ok, read in BLKS_PER_BUF pages completely first. */ + for (i = 0; i < BLKS_PER_BUF; i++) { + struct page *page = NULL; + + if (blocknr + i < devsize) { + page = read_mapping_page_async(mapping, blocknr + i, + NULL); + /* synchronous error? */ + if (IS_ERR(page)) + page = NULL; + } + pages[i] = page; + } + + for (i = 0; i < BLKS_PER_BUF; i++) { + struct page *page = pages[i]; + if (page) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + /* asynchronous error */ + page_cache_release(page); + pages[i] = NULL; + } + } + } + + buffer = next_buffer; + next_buffer = NEXT_BUFFER(buffer); + buffer_blocknr[buffer] = blocknr; + buffer_dev[buffer] = sb; + + data = read_buffers[buffer]; + for (i = 0; i < BLKS_PER_BUF; i++) { + struct page *page = pages[i]; + if (page) { + memcpy(data, kmap(page), PAGE_CACHE_SIZE); + kunmap(page); + page_cache_release(page); + } else + memset(data, 0, PAGE_CACHE_SIZE); + data += PAGE_CACHE_SIZE; + } + return read_buffers[buffer] + offset; +} + +static void cramfs_put_super(struct super_block *sb) +{ + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +static int cramfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static int cramfs_fill_super(struct super_block *sb, void *data, int silent) +{ + int i; + struct cramfs_super super; + unsigned long root_offset; + struct cramfs_sb_info *sbi; + struct inode *root; + + sb->s_flags |= MS_RDONLY; + + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* Invalidate the read buffers on mount: think disk change.. 
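+ * (Setting buffer_blocknr[] to -1 guarantees that cramfs_read() will
+ * not match a stale block cached before a possible media change.)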
*/ + mutex_lock(&read_mutex); + for (i = 0; i < READ_BUFFERS; i++) + buffer_blocknr[i] = -1; + + /* Read the first block and get the superblock from it */ + memcpy(&super, cramfs_read(sb, 0, sizeof(super)), sizeof(super)); + mutex_unlock(&read_mutex); + + /* Do sanity checks on the superblock */ + if (super.magic != CRAMFS_MAGIC) { + /* check for wrong endianess */ + if (super.magic == CRAMFS_MAGIC_WEND) { + if (!silent) + printk(KERN_ERR "cramfs: wrong endianess\n"); + goto out; + } + + /* check at 512 byte offset */ + mutex_lock(&read_mutex); + memcpy(&super, cramfs_read(sb, 512, sizeof(super)), sizeof(super)); + mutex_unlock(&read_mutex); + if (super.magic != CRAMFS_MAGIC) { + if (super.magic == CRAMFS_MAGIC_WEND && !silent) + printk(KERN_ERR "cramfs: wrong endianess\n"); + else if (!silent) + printk(KERN_ERR "cramfs: wrong magic\n"); + goto out; + } + } + + /* get feature flags first */ + if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { + printk(KERN_ERR "cramfs: unsupported filesystem features\n"); + goto out; + } + + /* Check that the root inode is in a sane state */ + if (!S_ISDIR(super.root.mode)) { + printk(KERN_ERR "cramfs: root is not a directory\n"); + goto out; + } + /* correct strange, hard-coded permissions of mkcramfs */ + super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + + root_offset = super.root.offset << 2; + if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { + sbi->size=super.size; + sbi->blocks=super.fsid.blocks; + sbi->files=super.fsid.files; + } else { + sbi->size=1<<28; + sbi->blocks=0; + sbi->files=0; + } + sbi->magic=super.magic; + sbi->flags=super.flags; + if (root_offset == 0) + printk(KERN_INFO "cramfs: empty filesystem"); + else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && + ((root_offset != sizeof(struct cramfs_super)) && + (root_offset != 512 + sizeof(struct cramfs_super)))) + { + printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); + goto out; + } + + /* Set it all up.. */ + sb->s_op = &cramfs_ops; + root = get_cramfs_inode(sb, &super.root, 0); + if (IS_ERR(root)) + goto out; + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + goto out; + } + return 0; +out: + kfree(sbi); + sb->s_fs_info = NULL; + return -EINVAL; +} + +static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = CRAMFS_MAGIC; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_blocks = CRAMFS_SB(sb)->blocks; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = CRAMFS_SB(sb)->files; + buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = CRAMFS_MAXPATHLEN; + return 0; +} + +/* + * Read a cramfs directory entry. + */ +static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + char *buf; + unsigned int offset; + int copied; + + /* Offset within the thing. 
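+ * (a byte offset into the directory's packed entries; it doubles as
+ * the cookie handed to filldir() below)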
*/ + offset = filp->f_pos; + if (offset >= inode->i_size) + return 0; + /* Directory entries are always 4-byte aligned */ + if (offset & 3) + return -EINVAL; + + buf = kmalloc(CRAMFS_MAXPATHLEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + copied = 0; + while (offset < inode->i_size) { + struct cramfs_inode *de; + unsigned long nextoffset; + char *name; + ino_t ino; + mode_t mode; + int namelen, error; + + mutex_lock(&read_mutex); + de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); + name = (char *)(de+1); + + /* + * Namelengths on disk are shifted by two + * and the name padded out to 4-byte boundaries + * with zeroes. + */ + namelen = de->namelen << 2; + memcpy(buf, name, namelen); + ino = cramino(de, OFFSET(inode) + offset); + mode = de->mode; + mutex_unlock(&read_mutex); + nextoffset = offset + sizeof(*de) + namelen; + for (;;) { + if (!namelen) { + kfree(buf); + return -EIO; + } + if (buf[namelen-1]) + break; + namelen--; + } + error = filldir(dirent, buf, namelen, offset, ino, mode >> 12); + if (error) + break; + + offset = nextoffset; + filp->f_pos = offset; + copied++; + } + kfree(buf); + return 0; +} + +/* + * Lookup and fill in the inode data.. + */ +static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + unsigned int offset = 0; + struct inode *inode = NULL; + int sorted; + + mutex_lock(&read_mutex); + sorted = CRAMFS_SB(dir->i_sb)->flags & CRAMFS_FLAG_SORTED_DIRS; + while (offset < dir->i_size) { + struct cramfs_inode *de; + char *name; + int namelen, retval; + int dir_off = OFFSET(dir) + offset; + + de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN); + name = (char *)(de+1); + + /* Try to take advantage of sorted directories */ + if (sorted && (dentry->d_name.name[0] < name[0])) + break; + + namelen = de->namelen << 2; + offset += sizeof(*de) + namelen; + + /* Quick check that the name is roughly the right length */ + if (((dentry->d_name.len + 3) & ~3) != namelen) + continue; + + for (;;) { + if (!namelen) { + inode = ERR_PTR(-EIO); + goto out; + } + if (name[namelen-1]) + break; + namelen--; + } + if (namelen != dentry->d_name.len) + continue; + retval = memcmp(dentry->d_name.name, name, namelen); + if (retval > 0) + continue; + if (!retval) { + inode = get_cramfs_inode(dir->i_sb, de, dir_off); + break; + } + /* else (retval < 0) */ + if (sorted) + break; + } +out: + mutex_unlock(&read_mutex); + if (IS_ERR(inode)) + return ERR_CAST(inode); + d_add(dentry, inode); + return NULL; +} + +static int cramfs_readpage(struct file *file, struct page * page) +{ + struct inode *inode = page->mapping->host; + u32 maxblock; + int bytes_filled; + void *pgdata; + + maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + bytes_filled = 0; + pgdata = kmap(page); + + if (page->index < maxblock) { + struct super_block *sb = inode->i_sb; + u32 blkptr_offset = OFFSET(inode) + page->index*4; + u32 start_offset, compr_len; + + start_offset = OFFSET(inode) + maxblock*4; + mutex_lock(&read_mutex); + if (page->index) + start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, + 4); + compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - + start_offset); + mutex_unlock(&read_mutex); + + if (compr_len == 0) + ; /* hole */ + else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { + pr_err("cramfs: bad compressed blocksize %u\n", + compr_len); + goto err; + } else { + mutex_lock(&read_mutex); + bytes_filled = cramfs_uncompress_block(pgdata, + PAGE_CACHE_SIZE, + cramfs_read(sb, start_offset, 
compr_len), + compr_len); + mutex_unlock(&read_mutex); + if (unlikely(bytes_filled < 0)) + goto err; + } + } + + memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + return 0; + +err: + kunmap(page); + ClearPageUptodate(page); + SetPageError(page); + unlock_page(page); + return 0; +} + +static const struct address_space_operations cramfs_aops = { + .readpage = cramfs_readpage +}; + +/* + * Our operations: + */ + +/* + * A directory can only readdir + */ +static const struct file_operations cramfs_directory_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = cramfs_readdir, +}; + +static const struct inode_operations cramfs_dir_inode_operations = { + .lookup = cramfs_lookup, +}; + +static const struct super_operations cramfs_ops = { + .put_super = cramfs_put_super, + .remount_fs = cramfs_remount, + .statfs = cramfs_statfs, +}; + +static struct dentry *cramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); +} + +static struct file_system_type cramfs_fs_type = { + .owner = THIS_MODULE, + .name = "cramfs", + .mount = cramfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_cramfs_fs(void) +{ + int rv; + + rv = cramfs_uncompress_init(); + if (rv < 0) + return rv; + rv = register_filesystem(&cramfs_fs_type); + if (rv < 0) + cramfs_uncompress_exit(); + return rv; +} + +static void __exit exit_cramfs_fs(void) +{ + cramfs_uncompress_exit(); + unregister_filesystem(&cramfs_fs_type); +} + +module_init(init_cramfs_fs) +module_exit(exit_cramfs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c new file mode 100644 index 00000000..02332980 --- /dev/null +++ b/fs/cramfs/uncompress.c @@ -0,0 +1,77 @@ +/* + * uncompress.c + * + * (C) Copyright 1999 Linus Torvalds + * + * cramfs interfaces to the uncompression library. There's really just + * three entrypoints: + * + * - cramfs_uncompress_init() - called to initialize the thing. + * - cramfs_uncompress_exit() - tell me when you're done + * - cramfs_uncompress_block() - uncompress a block. + * + * NOTE NOTE NOTE! The uncompression is entirely single-threaded. We + * only have one stream, and we'll initialize it only once even if it + * then is used by multiple filesystems. + */ + +#include +#include +#include +#include +#include + +static z_stream stream; +static int initialized; + +/* Returns length of decompressed data. 
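+ * (or -EIO if zlib_inflate() fails to reach Z_STREAM_END).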
*/ +int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) +{ + int err; + + stream.next_in = src; + stream.avail_in = srclen; + + stream.next_out = dst; + stream.avail_out = dstlen; + + err = zlib_inflateReset(&stream); + if (err != Z_OK) { + printk("zlib_inflateReset error %d\n", err); + zlib_inflateEnd(&stream); + zlib_inflateInit(&stream); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto err; + return stream.total_out; + +err: + printk("Error %d while decompressing!\n", err); + printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); + return -EIO; +} + +int cramfs_uncompress_init(void) +{ + if (!initialized++) { + stream.workspace = vmalloc(zlib_inflate_workspacesize()); + if ( !stream.workspace ) { + initialized = 0; + return -ENOMEM; + } + stream.next_in = NULL; + stream.avail_in = 0; + zlib_inflateInit(&stream); + } + return 0; +} + +void cramfs_uncompress_exit(void) +{ + if (!--initialized) { + zlib_inflateEnd(&stream); + vfree(stream.workspace); + } +} diff --git a/fs/dcache.c b/fs/dcache.c new file mode 100644 index 00000000..0b51cfc9 --- /dev/null +++ b/fs/dcache.c @@ -0,0 +1,3124 @@ +/* + * fs/dcache.c + * + * Complete reimplementation + * (C) 1997 Thomas Schoebel-Theuer, + * with heavy changes by Linus Torvalds + */ + +/* + * Notes on the allocation strategy: + * + * The dcache is a master of the icache - whenever a dcache entry + * exists, the inode will always exist. "iput()" is done either when + * the dcache entry is deleted or garbage collected. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Usage: + * dcache->d_inode->i_lock protects: + * - i_dentry, d_alias, d_inode of aliases + * dcache_hash_bucket lock protects: + * - the dcache hash table + * s_anon bl list spinlock protects: + * - the s_anon list (see __d_drop) + * dcache_lru_lock protects: + * - the dcache lru lists and counters + * d_lock protects: + * - d_flags + * - d_name + * - d_lru + * - d_count + * - d_unhashed() + * - d_parent and d_subdirs + * - childrens' d_child and d_parent + * - d_alias, d_inode + * + * Ordering: + * dentry->d_inode->i_lock + * dentry->d_lock + * dcache_lru_lock + * dcache_hash_bucket lock + * s_anon lock + * + * If there is an ancestor relationship: + * dentry->d_parent->...->d_parent->d_lock + * ... + * dentry->d_parent->d_lock + * dentry->d_lock + * + * If no ancestor relationship: + * if (dentry1 < dentry2) + * dentry1->d_lock + * dentry2->d_lock + */ +int sysctl_vfs_cache_pressure __read_mostly = 100; +EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); +__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); + +EXPORT_SYMBOL(rename_lock); + +static struct kmem_cache *dentry_cache __read_mostly; + +/* + * This is the single most critical data structure when it comes + * to the dcache: the hashtable for lookups. Somebody should try + * to make this good - I've just made it work. + * + * This hash-function tries to avoid losing too many bits of hash + * information, yet avoid using a prime hash-size or similar. 
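+ *
+ * (d_hash() below also folds the parent dentry pointer into the name
+ * hash, so identical names under different directories spread across
+ * different buckets.)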
+ */ +#define D_HASHBITS d_hash_shift +#define D_HASHMASK d_hash_mask + +static unsigned int d_hash_mask __read_mostly; +static unsigned int d_hash_shift __read_mostly; + +static struct hlist_bl_head *dentry_hashtable __read_mostly; + +static inline struct hlist_bl_head *d_hash(struct dentry *parent, + unsigned long hash) +{ + hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; + hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); + return dentry_hashtable + (hash & D_HASHMASK); +} + +/* Statistics gathering. */ +struct dentry_stat_t dentry_stat = { + .age_limit = 45, +}; + +static DEFINE_PER_CPU(unsigned int, nr_dentry); + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +static int get_nr_dentry(void) +{ + int i; + int sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry, i); + return sum < 0 ? 0 : sum; +} + +int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + dentry_stat.nr_dentry = get_nr_dentry(); + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#endif + +static void __d_free(struct rcu_head *head) +{ + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + + WARN_ON(!list_empty(&dentry->d_alias)); + if (dname_external(dentry)) + kfree(dentry->d_name.name); + kmem_cache_free(dentry_cache, dentry); +} + +/* + * no locks, please. + */ +static void d_free(struct dentry *dentry) +{ + BUG_ON(dentry->d_count); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + /* if dentry was never visible to RCU, immediate free is OK */ + if (!(dentry->d_flags & DCACHE_RCUACCESS)) + __d_free(&dentry->d_u.d_rcu); + else + call_rcu(&dentry->d_u.d_rcu, __d_free); +} + +/** + * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * @dentry: the target dentry + * After this call, in-progress rcu-walk path lookup will fail. This + * should be called after unhashing, and after changing d_inode (if + * the dentry has not already been unhashed). + */ +static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +{ + assert_spin_locked(&dentry->d_lock); + /* Go through a barrier */ + write_seqcount_barrier(&dentry->d_seq); +} + +/* + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. Dentry has no refcount + * and is unhashed. + */ +static void dentry_iput(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dentry->d_inode->i_lock) +{ + struct inode *inode = dentry->d_inode; + if (inode) { + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); + } else { + spin_unlock(&dentry->d_lock); + } +} + +/* + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. dentry remains in-use. 
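+ * Unlike dentry_iput() above, the caller keeps its reference to the
+ * dentry; only the inode association is severed.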
+ */ +static void dentry_unlink_inode(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dentry->d_inode->i_lock) +{ + struct inode *inode = dentry->d_inode; + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); +} + +/* + * dentry_lru_(add|del|move_tail) must be called with d_lock held. + */ +static void dentry_lru_add(struct dentry *dentry) +{ + if (list_empty(&dentry->d_lru)) { + spin_lock(&dcache_lru_lock); + list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; + spin_unlock(&dcache_lru_lock); + } +} + +static void __dentry_lru_del(struct dentry *dentry) +{ + list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; +} + +static void dentry_lru_del(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + spin_lock(&dcache_lru_lock); + __dentry_lru_del(dentry); + spin_unlock(&dcache_lru_lock); + } +} + +static void dentry_lru_move_tail(struct dentry *dentry) +{ + spin_lock(&dcache_lru_lock); + if (list_empty(&dentry->d_lru)) { + list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; + } else { + list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + } + spin_unlock(&dcache_lru_lock); +} + +/** + * d_kill - kill dentry and return parent + * @dentry: dentry to kill + * @parent: parent dentry + * + * The dentry must already be unhashed and removed from the LRU. + * + * If this is the root of the dentry tree, return NULL. + * + * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by + * d_kill. + */ +static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) + __releases(dentry->d_lock) + __releases(parent->d_lock) + __releases(dentry->d_inode->i_lock) +{ + list_del(&dentry->d_u.d_child); + /* + * Inform try_to_ascend() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DISCONNECTED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + d_free(dentry); + return parent; +} + +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent dentry hashes, so that it won't + * be found through a VFS lookup any more. Note that this is different from + * deleting the dentry - d_delete will try to mark the dentry negative if + * possible, giving a successful _negative_ lookup, while d_drop will + * just make the cache lookup fail. + * + * d_drop() is used mainly for stuff that wants to invalidate a dentry for some + * reason (NFS timeouts or autofs deletes). + * + * __d_drop requires dentry->d_lock. 
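+ * d_drop() below is the convenience wrapper that takes and releases
+ * that lock around __d_drop().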
+ */ +void __d_drop(struct dentry *dentry) +{ + if (!d_unhashed(dentry)) { + struct hlist_bl_head *b; + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + b = &dentry->d_sb->s_anon; + else + b = d_hash(dentry->d_parent, dentry->d_name.hash); + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + dentry->d_hash.pprev = NULL; + hlist_bl_unlock(b); + + dentry_rcuwalk_barrier(dentry); + } +} +EXPORT_SYMBOL(__d_drop); + +void d_drop(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_drop); + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * If ref is non-zero, then decrement the refcount too. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static inline struct dentry *dentry_kill(struct dentry *dentry, int ref) + __releases(dentry->d_lock) +{ + struct inode *inode; + struct dentry *parent; + + inode = dentry->d_inode; + if (inode && !spin_trylock(&inode->i_lock)) { +relock: + spin_unlock(&dentry->d_lock); + cpu_relax(); + return dentry; /* try again with same dentry */ + } + if (IS_ROOT(dentry)) + parent = NULL; + else + parent = dentry->d_parent; + if (parent && !spin_trylock(&parent->d_lock)) { + if (inode) + spin_unlock(&inode->i_lock); + goto relock; + } + + if (ref) + dentry->d_count--; + /* if dentry was on the d_lru list delete it from there */ + dentry_lru_del(dentry); + /* if it was on the hash then remove it */ + __d_drop(dentry); + return d_kill(dentry, parent); +} + +/* + * This is dput + * + * This is complicated by the fact that we do not want to put + * dentries that are no longer on any hash chain on the unused + * list: we'd much rather just get rid of them immediately. + * + * However, that implies that we have to traverse the dentry + * tree upwards to the parents which might _also_ now be + * scheduled for deletion (it may have been only waiting for + * its last child to go away). + * + * This tail recursion is done by hand as we don't want to depend + * on the compiler to always get this right (gcc generally doesn't). + * Real recursion would eat up our stack space. + */ + +/* + * dput - release a dentry + * @dentry: dentry to release + * + * Release a dentry. This will drop the usage count and if appropriate + * call the dentry unlink method as well as removing it from the queues and + * releasing its resources. If the parent dentries were scheduled for release + * they too may now get deleted. + */ +void dput(struct dentry *dentry) +{ + if (!dentry) + return; + +repeat: + if (dentry->d_count == 1) + might_sleep(); + spin_lock(&dentry->d_lock); + BUG_ON(!dentry->d_count); + if (dentry->d_count > 1) { + dentry->d_count--; + spin_unlock(&dentry->d_lock); + return; + } + + if (dentry->d_flags & DCACHE_OP_DELETE) { + if (dentry->d_op->d_delete(dentry)) + goto kill_it; + } + + /* Unreachable? Get rid of it */ + if (d_unhashed(dentry)) + goto kill_it; + + /* Otherwise leave it cached and ensure it's on the LRU */ + dentry->d_flags |= DCACHE_REFERENCED; + dentry_lru_add(dentry); + + dentry->d_count--; + spin_unlock(&dentry->d_lock); + return; + +kill_it: + dentry = dentry_kill(dentry, 1); + if (dentry) + goto repeat; +} +EXPORT_SYMBOL(dput); + +/** + * d_invalidate - invalidate a dentry + * @dentry: dentry to invalidate + * + * Try to invalidate the dentry if it turns out to be + * possible. If there are other dentries that can be + * reached through this one we can't delete it and we + * return -EBUSY. 
On success we return 0. + * + * no dcache lock. + */ + +int d_invalidate(struct dentry * dentry) +{ + /* + * If it's already been dropped, return OK. + */ + spin_lock(&dentry->d_lock); + if (d_unhashed(dentry)) { + spin_unlock(&dentry->d_lock); + return 0; + } + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. + */ + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + shrink_dcache_parent(dentry); + spin_lock(&dentry->d_lock); + } + + /* + * Somebody else still using it? + * + * If it's a directory, we can't drop it + * for fear of somebody re-populating it + * with children (even though dropping it + * would make it unreachable from the root, + * we might still populate it if it was a + * working directory or similar). + */ + if (dentry->d_count > 1) { + if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { + spin_unlock(&dentry->d_lock); + return -EBUSY; + } + } + + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + return 0; +} +EXPORT_SYMBOL(d_invalidate); + +/* This must be called with d_lock held */ +static inline void __dget_dlock(struct dentry *dentry) +{ + dentry->d_count++; +} + +static inline void __dget(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __dget_dlock(dentry); + spin_unlock(&dentry->d_lock); +} + +struct dentry *dget_parent(struct dentry *dentry) +{ + struct dentry *ret; + +repeat: + /* + * Don't need rcu_dereference because we re-check it was correct under + * the lock. + */ + rcu_read_lock(); + ret = dentry->d_parent; + if (!ret) { + rcu_read_unlock(); + goto out; + } + spin_lock(&ret->d_lock); + if (unlikely(ret != dentry->d_parent)) { + spin_unlock(&ret->d_lock); + rcu_read_unlock(); + goto repeat; + } + rcu_read_unlock(); + BUG_ON(!ret->d_count); + ret->d_count++; + spin_unlock(&ret->d_lock); +out: + return ret; +} +EXPORT_SYMBOL(dget_parent); + +/** + * d_find_alias - grab a hashed alias of inode + * @inode: inode in question + * @want_discon: flag, used by d_splice_alias, to request + * that only a DISCONNECTED alias be returned. + * + * If inode has a hashed alias, or is a directory and has any alias, + * acquire the reference to alias and return it. Otherwise return NULL. + * Notice that if inode is a directory there can be only one alias and + * it can be unhashed only if it has no children, or if it is the root + * of a filesystem. + * + * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer + * any other hashed alias over that one unless @want_discon is set, + * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. 
+ */ +static struct dentry *__d_find_alias(struct inode *inode, int want_discon) +{ + struct dentry *alias, *discon_alias; + +again: + discon_alias = NULL; + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + spin_lock(&alias->d_lock); + if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { + if (IS_ROOT(alias) && + (alias->d_flags & DCACHE_DISCONNECTED)) { + discon_alias = alias; + } else if (!want_discon) { + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; + } + } + spin_unlock(&alias->d_lock); + } + if (discon_alias) { + alias = discon_alias; + spin_lock(&alias->d_lock); + if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { + if (IS_ROOT(alias) && + (alias->d_flags & DCACHE_DISCONNECTED)) { + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; + } + } + spin_unlock(&alias->d_lock); + goto again; + } + return NULL; +} + +struct dentry *d_find_alias(struct inode *inode) +{ + struct dentry *de = NULL; + + if (!list_empty(&inode->i_dentry)) { + spin_lock(&inode->i_lock); + de = __d_find_alias(inode, 0); + spin_unlock(&inode->i_lock); + } + return de; +} +EXPORT_SYMBOL(d_find_alias); + +/* + * Try to kill dentries associated with this inode. + * WARNING: you must own a reference to inode. + */ +void d_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; +restart: + spin_lock(&inode->i_lock); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + spin_lock(&dentry->d_lock); + if (!dentry->d_count) { + __dget_dlock(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + dput(dentry); + goto restart; + } + spin_unlock(&dentry->d_lock); + } + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_prune_aliases); + +/* + * Try to throw away a dentry - free the inode, dput the parent. + * Requires dentry->d_lock is held, and dentry->d_count == 0. + * Releases dentry->d_lock. + * + * This may fail if locks cannot be acquired no problem, just try again. + */ +static void try_prune_one_dentry(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct dentry *parent; + + parent = dentry_kill(dentry, 0); + /* + * If dentry_kill returns NULL, we have nothing more to do. + * if it returns the same dentry, trylocks failed. In either + * case, just loop again. + * + * Otherwise, we need to prune ancestors too. This is necessary + * to prevent quadratic behavior of shrink_dcache_parent(), but + * is also expected to be beneficial in reducing dentry cache + * fragmentation. + */ + if (!parent) + return; + if (parent == dentry) + return; + + /* Prune ancestors. */ + dentry = parent; + while (dentry) { + spin_lock(&dentry->d_lock); + if (dentry->d_count > 1) { + dentry->d_count--; + spin_unlock(&dentry->d_lock); + return; + } + dentry = dentry_kill(dentry, 1); + } +} + +static void shrink_dentry_list(struct list_head *list) +{ + struct dentry *dentry; + + rcu_read_lock(); + for (;;) { + dentry = list_entry_rcu(list->prev, struct dentry, d_lru); + if (&dentry->d_lru == list) + break; /* empty */ + spin_lock(&dentry->d_lock); + if (dentry != list_entry(list->prev, struct dentry, d_lru)) { + spin_unlock(&dentry->d_lock); + continue; + } + + /* + * We found an inuse dentry which was not removed from + * the LRU because of laziness during lookup. Do not free + * it - just keep it off the LRU list. 
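+ * (dentry_lru_del() below takes it off the LRU without freeing it.)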
+ */ + if (dentry->d_count) { + dentry_lru_del(dentry); + spin_unlock(&dentry->d_lock); + continue; + } + + rcu_read_unlock(); + + try_prune_one_dentry(dentry); + + rcu_read_lock(); + } + rcu_read_unlock(); +} + +/** + * __shrink_dcache_sb - shrink the dentry LRU on a given superblock + * @sb: superblock to shrink dentry LRU. + * @count: number of entries to prune + * @flags: flags to control the dentry processing + * + * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. + */ +static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) +{ + /* called from prune_dcache() and shrink_dcache_parent() */ + struct dentry *dentry; + LIST_HEAD(referenced); + LIST_HEAD(tmp); + int cnt = *count; + +relock: + spin_lock(&dcache_lru_lock); + while (!list_empty(&sb->s_dentry_lru)) { + dentry = list_entry(sb->s_dentry_lru.prev, + struct dentry, d_lru); + BUG_ON(dentry->d_sb != sb); + + if (!spin_trylock(&dentry->d_lock)) { + spin_unlock(&dcache_lru_lock); + cpu_relax(); + goto relock; + } + + /* + * If we are honouring the DCACHE_REFERENCED flag and the + * dentry has this flag set, don't free it. Clear the flag + * and put it back on the LRU. + */ + if (flags & DCACHE_REFERENCED && + dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_move(&dentry->d_lru, &referenced); + spin_unlock(&dentry->d_lock); + } else { + list_move_tail(&dentry->d_lru, &tmp); + dentry->d_flags |= DCACHE_SHRINK_LIST; + spin_unlock(&dentry->d_lock); + if (!--cnt) + break; + } + cond_resched_lock(&dcache_lru_lock); + } + if (!list_empty(&referenced)) + list_splice(&referenced, &sb->s_dentry_lru); + spin_unlock(&dcache_lru_lock); + + shrink_dentry_list(&tmp); + + *count = cnt; +} + +/** + * prune_dcache - shrink the dcache + * @count: number of entries to try to free + * + * Shrink the dcache. This is done when we need more memory, or simply when we + * need to unmount something (at which point we need to unuse all dentries). + * + * This function may fail to free any resources if all the dentries are in use. + */ +static void prune_dcache(int count) +{ + struct super_block *sb, *p = NULL; + int w_count; + int unused = dentry_stat.nr_unused; + int prune_ratio; + int pruned; + + if (unused == 0 || count == 0) + return; + if (count >= unused) + prune_ratio = 1; + else + prune_ratio = unused / count; + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_nr_dentry_unused == 0) + continue; + sb->s_count++; + /* Now, we reclaim unused dentrins with fairness. + * We reclaim them same percentage from each superblock. + * We calculate number of dentries to scan on this sb + * as follows, but the implementation is arranged to avoid + * overflows: + * number of dentries to scan on this sb = + * count * (number of dentries on this sb / + * number of dentries in the machine) + */ + spin_unlock(&sb_lock); + if (prune_ratio != 1) + w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1; + else + w_count = sb->s_nr_dentry_unused; + pruned = w_count; + /* + * We need to be sure this filesystem isn't being unmounted, + * otherwise we could race with generic_shutdown_super(), and + * end up holding a reference to an inode while the filesystem + * is unmounted. So we try to get s_umount, and make sure + * s_root isn't NULL. 
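+ * (generic_shutdown_super() clears s_root while holding s_umount for
+ * write, so a non-NULL s_root seen under the read lock means the
+ * dentry tree has not been torn down yet.)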
+ */ + if (down_read_trylock(&sb->s_umount)) { + if ((sb->s_root != NULL) && + (!list_empty(&sb->s_dentry_lru))) { + __shrink_dcache_sb(sb, &w_count, + DCACHE_REFERENCED); + pruned -= w_count; + } + up_read(&sb->s_umount); + } + spin_lock(&sb_lock); + if (p) + __put_super(p); + count -= pruned; + p = sb; + /* more work left to do? */ + if (count <= 0) + break; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +/** + * shrink_dcache_sb - shrink dcache for a superblock + * @sb: superblock + * + * Shrink the dcache for the specified super block. This is used to free + * the dcache before unmounting a file system. + */ +void shrink_dcache_sb(struct super_block *sb) +{ + LIST_HEAD(tmp); + + spin_lock(&dcache_lru_lock); + while (!list_empty(&sb->s_dentry_lru)) { + list_splice_init(&sb->s_dentry_lru, &tmp); + spin_unlock(&dcache_lru_lock); + shrink_dentry_list(&tmp); + spin_lock(&dcache_lru_lock); + } + spin_unlock(&dcache_lru_lock); +} +EXPORT_SYMBOL(shrink_dcache_sb); + +/* + * destroy a single subtree of dentries for unmount + * - see the comments on shrink_dcache_for_umount() for a description of the + * locking + */ +static void shrink_dcache_for_umount_subtree(struct dentry *dentry) +{ + struct dentry *parent; + unsigned detached = 0; + + BUG_ON(!IS_ROOT(dentry)); + + /* detach this root from the system */ + spin_lock(&dentry->d_lock); + dentry_lru_del(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + + for (;;) { + /* descend to the first leaf in the current subtree */ + while (!list_empty(&dentry->d_subdirs)) { + struct dentry *loop; + + /* this is a branch with children - detach all of them + * from the system in one go */ + spin_lock(&dentry->d_lock); + list_for_each_entry(loop, &dentry->d_subdirs, + d_u.d_child) { + spin_lock_nested(&loop->d_lock, + DENTRY_D_LOCK_NESTED); + dentry_lru_del(loop); + __d_drop(loop); + spin_unlock(&loop->d_lock); + } + spin_unlock(&dentry->d_lock); + + /* move to the first child */ + dentry = list_entry(dentry->d_subdirs.next, + struct dentry, d_u.d_child); + } + + /* consume the dentries from this leaf up through its parents + * until we find one with children or run out altogether */ + do { + struct inode *inode; + + if (dentry->d_count != 0) { + printk(KERN_ERR + "BUG: Dentry %p{i=%lx,n=%s}" + " still in use (%d)" + " [unmount of %s %s]\n", + dentry, + dentry->d_inode ? 
+ dentry->d_inode->i_ino : 0UL, + dentry->d_name.name, + dentry->d_count, + dentry->d_sb->s_type->name, + dentry->d_sb->s_id); + BUG(); + } + + if (IS_ROOT(dentry)) { + parent = NULL; + list_del(&dentry->d_u.d_child); + } else { + parent = dentry->d_parent; + spin_lock(&parent->d_lock); + parent->d_count--; + list_del(&dentry->d_u.d_child); + spin_unlock(&parent->d_lock); + } + + detached++; + + inode = dentry->d_inode; + if (inode) { + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); + } + + d_free(dentry); + + /* finished when we fall off the top of the tree, + * otherwise we ascend to the parent and move to the + * next sibling if there is one */ + if (!parent) + return; + dentry = parent; + } while (list_empty(&dentry->d_subdirs)); + + dentry = list_entry(dentry->d_subdirs.next, + struct dentry, d_u.d_child); + } +} + +/* + * destroy the dentries attached to a superblock on unmounting + * - we don't need to use dentry->d_lock because: + * - the superblock is detached from all mountings and open files, so the + * dentry trees will not be rearranged by the VFS + * - s_umount is write-locked, so the memory pressure shrinker will ignore + * any dentries belonging to this superblock that it comes across + * - the filesystem itself is no longer permitted to rearrange the dentries + * in this superblock + */ +void shrink_dcache_for_umount(struct super_block *sb) +{ + struct dentry *dentry; + + if (down_read_trylock(&sb->s_umount)) + BUG(); + + dentry = sb->s_root; + sb->s_root = NULL; + spin_lock(&dentry->d_lock); + dentry->d_count--; + spin_unlock(&dentry->d_lock); + shrink_dcache_for_umount_subtree(dentry); + + while (!hlist_bl_empty(&sb->s_anon)) { + dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); + shrink_dcache_for_umount_subtree(dentry); + } +} + +/* + * This tries to ascend one level of parenthood, but + * we can race with renaming, so we need to re-check + * the parenthood after dropping the lock and check + * that the sequence number still matches. + */ +static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +{ + struct dentry *new = old->d_parent; + + rcu_read_lock(); + spin_unlock(&old->d_lock); + spin_lock(&new->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (new != old->d_parent || + (old->d_flags & DCACHE_DISCONNECTED) || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&new->d_lock); + new = NULL; + } + rcu_read_unlock(); + return new; +} + + +/* + * Search for at least 1 mount point in the dentry's subdirs. + * We descend to the next level whenever the d_subdirs + * list is non-empty and continue searching. + */ + +/** + * have_submounts - check for mounts over a dentry + * @parent: dentry to check. 
+ * + * Return true if the parent or its subdirectories contain + * a mount point + */ +int have_submounts(struct dentry *parent) +{ + struct dentry *this_parent; + struct list_head *next; + unsigned seq; + int locked = 0; + + seq = read_seqbegin(&rename_lock); +again: + this_parent = parent; + + if (d_mountpoint(parent)) + goto positive; + spin_lock(&this_parent->d_lock); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + /* Have we found a mount point ? */ + if (d_mountpoint(dentry)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&this_parent->d_lock); + goto positive; + } + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); + goto repeat; + } + spin_unlock(&dentry->d_lock); + } + /* + * All done at this level ... ascend and resume the search. + */ + if (this_parent != parent) { + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) + goto rename_retry; + next = child->d_u.d_child.next; + goto resume; + } + spin_unlock(&this_parent->d_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); + return 0; /* No mount points found in tree */ +positive: + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); + return 1; + +rename_retry: + locked = 1; + write_seqlock(&rename_lock); + goto again; +} +EXPORT_SYMBOL(have_submounts); + +/* + * Search the dentry child list for the specified parent, + * and move any unused dentries to the end of the unused + * list for prune_dcache(). We descend to the next level + * whenever the d_subdirs list is non-empty and continue + * searching. + * + * It returns zero iff there are no unused children, + * otherwise it returns the number of children moved to + * the end of the unused list. This may not be the total + * number of unused children, because select_parent can + * drop the lock and return early due to latency + * constraints. + */ +static int select_parent(struct dentry * parent) +{ + struct dentry *this_parent; + struct list_head *next; + unsigned seq; + int found = 0; + int locked = 0; + + seq = read_seqbegin(&rename_lock); +again: + this_parent = parent; + spin_lock(&this_parent->d_lock); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + /* + * move only zero ref count dentries to the end + * of the unused list for prune_dcache + * + * Those which are presently on the shrink list, being processed + * by shrink_dentry_list(), shouldn't be moved. Otherwise the + * loop in shrink_dcache_parent() might not make any progress + * and loop forever. + */ + if (dentry->d_count) { + dentry_lru_del(dentry); + } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { + dentry_lru_move_tail(dentry); + found++; + } + /* + * We can return to the caller if we have found some (this + * ensures forward progress). 
We'll be coming back to find + * the rest. + */ + if (found && need_resched()) { + spin_unlock(&dentry->d_lock); + goto out; + } + + /* + * Descend a level if the d_subdirs list is non-empty. + */ + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); + goto repeat; + } + + spin_unlock(&dentry->d_lock); + } + /* + * All done at this level ... ascend and resume the search. + */ + if (this_parent != parent) { + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) + goto rename_retry; + next = child->d_u.d_child.next; + goto resume; + } +out: + spin_unlock(&this_parent->d_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); + return found; + +rename_retry: + if (found) + return found; + locked = 1; + write_seqlock(&rename_lock); + goto again; +} + +/** + * shrink_dcache_parent - prune dcache + * @parent: parent of entries to prune + * + * Prune the dcache to remove unused children of the parent dentry. + */ + +void shrink_dcache_parent(struct dentry * parent) +{ + struct super_block *sb = parent->d_sb; + int found; + + while ((found = select_parent(parent)) != 0) + __shrink_dcache_sb(sb, &found, 0); +} +EXPORT_SYMBOL(shrink_dcache_parent); + +/* + * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain. + * + * We need to avoid reentering the filesystem if the caller is performing a + * GFP_NOFS allocation attempt. One example deadlock is: + * + * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache-> + * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode-> + * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK. + * + * In this case we return -1 to tell the caller that we baled. + */ +static int shrink_dcache_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if (nr) { + if (!(gfp_mask & __GFP_FS)) + return -1; + prune_dcache(nr); + } + + return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; +} + +static struct shrinker dcache_shrinker = { + .shrink = shrink_dcache_memory, + .seeks = DEFAULT_SEEKS, +}; + +/** + * d_alloc - allocate a dcache entry + * @parent: parent of entry to allocate + * @name: qstr of the name + * + * Allocates a dentry. It returns %NULL if there is insufficient memory + * available. On a success the dentry is returned. The name passed in is + * copied and the copy passed in may be reused after this call. 
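+ * Short names are copied into the embedded d_iname array; longer
+ * names get a separate kmalloc()ed buffer.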
+ */ + +struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) +{ + struct dentry *dentry; + char *dname; + + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + if (!dentry) + return NULL; + + if (name->len > DNAME_INLINE_LEN-1) { + dname = kmalloc(name->len + 1, GFP_KERNEL); + if (!dname) { + kmem_cache_free(dentry_cache, dentry); + return NULL; + } + } else { + dname = dentry->d_iname; + } + dentry->d_name.name = dname; + + dentry->d_name.len = name->len; + dentry->d_name.hash = name->hash; + memcpy(dname, name->name, name->len); + dname[name->len] = 0; + + dentry->d_count = 1; + dentry->d_flags = 0; + spin_lock_init(&dentry->d_lock); + seqcount_init(&dentry->d_seq); + dentry->d_inode = NULL; + dentry->d_parent = NULL; + dentry->d_sb = NULL; + dentry->d_op = NULL; + dentry->d_fsdata = NULL; + INIT_HLIST_BL_NODE(&dentry->d_hash); + INIT_LIST_HEAD(&dentry->d_lru); + INIT_LIST_HEAD(&dentry->d_subdirs); + INIT_LIST_HEAD(&dentry->d_alias); + INIT_LIST_HEAD(&dentry->d_u.d_child); + + if (parent) { + spin_lock(&parent->d_lock); + /* + * don't need child lock because it is not subject + * to concurrency here + */ + __dget_dlock(parent); + dentry->d_parent = parent; + dentry->d_sb = parent->d_sb; + d_set_d_op(dentry, dentry->d_sb->s_d_op); + list_add(&dentry->d_u.d_child, &parent->d_subdirs); + spin_unlock(&parent->d_lock); + } + + this_cpu_inc(nr_dentry); + + return dentry; +} +EXPORT_SYMBOL(d_alloc); + +struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) +{ + struct dentry *dentry = d_alloc(NULL, name); + if (dentry) { + dentry->d_sb = sb; + d_set_d_op(dentry, dentry->d_sb->s_d_op); + dentry->d_parent = dentry; + dentry->d_flags |= DCACHE_DISCONNECTED; + } + return dentry; +} +EXPORT_SYMBOL(d_alloc_pseudo); + +struct dentry *d_alloc_name(struct dentry *parent, const char *name) +{ + struct qstr q; + + q.name = name; + q.len = strlen(name); + q.hash = full_name_hash(q.name, q.len); + return d_alloc(parent, &q); +} +EXPORT_SYMBOL(d_alloc_name); + +void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) +{ + WARN_ON_ONCE(dentry->d_op); + WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | + DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | + DCACHE_OP_DELETE )); + dentry->d_op = op; + if (!op) + return; + if (op->d_hash) + dentry->d_flags |= DCACHE_OP_HASH; + if (op->d_compare) + dentry->d_flags |= DCACHE_OP_COMPARE; + if (op->d_revalidate) + dentry->d_flags |= DCACHE_OP_REVALIDATE; + if (op->d_delete) + dentry->d_flags |= DCACHE_OP_DELETE; + +} +EXPORT_SYMBOL(d_set_d_op); + +static void __d_instantiate(struct dentry *dentry, struct inode *inode) +{ + spin_lock(&dentry->d_lock); + if (inode) { + if (unlikely(IS_AUTOMOUNT(inode))) + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; + list_add(&dentry->d_alias, &inode->i_dentry); + } + dentry->d_inode = inode; + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); + fsnotify_d_instantiate(dentry, inode); +} + +/** + * d_instantiate - fill in inode information for a dentry + * @entry: dentry to complete + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. + * + * This turns negative dentries into productive full members + * of society. + * + * NOTE! This assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache. 
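+ * Passing a %NULL @inode leaves @entry as a negative dentry.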
+ */ + +void d_instantiate(struct dentry *entry, struct inode * inode) +{ + BUG_ON(!list_empty(&entry->d_alias)); + if (inode) + spin_lock(&inode->i_lock); + __d_instantiate(entry, inode); + if (inode) + spin_unlock(&inode->i_lock); + security_d_instantiate(entry, inode); +} +EXPORT_SYMBOL(d_instantiate); + +/** + * d_instantiate_unique - instantiate a non-aliased dentry + * @entry: dentry to instantiate + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. On success, it returns NULL. + * If an unhashed alias of "entry" already exists, then we return the + * aliased dentry instead and drop one reference to inode. + * + * Note that in order to avoid conflicts with rename() etc, the caller + * had better be holding the parent directory semaphore. + * + * This also assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache. + */ +static struct dentry *__d_instantiate_unique(struct dentry *entry, + struct inode *inode) +{ + struct dentry *alias; + int len = entry->d_name.len; + const char *name = entry->d_name.name; + unsigned int hash = entry->d_name.hash; + + if (!inode) { + __d_instantiate(entry, NULL); + return NULL; + } + + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + struct qstr *qstr = &alias->d_name; + + /* + * Don't need alias->d_lock here, because aliases with + * d_parent == entry->d_parent are not subject to name or + * parent changes, because the parent inode i_mutex is held. + */ + if (qstr->hash != hash) + continue; + if (alias->d_parent != entry->d_parent) + continue; + if (dentry_cmp(qstr->name, qstr->len, name, len)) + continue; + __dget(alias); + return alias; + } + + __d_instantiate(entry, inode); + return NULL; +} + +struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) +{ + struct dentry *result; + + BUG_ON(!list_empty(&entry->d_alias)); + + if (inode) + spin_lock(&inode->i_lock); + result = __d_instantiate_unique(entry, inode); + if (inode) + spin_unlock(&inode->i_lock); + + if (!result) { + security_d_instantiate(entry, inode); + return NULL; + } + + BUG_ON(!d_unhashed(result)); + iput(inode); + return result; +} + +EXPORT_SYMBOL(d_instantiate_unique); + +/** + * d_alloc_root - allocate root dentry + * @root_inode: inode to allocate the root for + * + * Allocate a root ("/") dentry for the inode given. The inode is + * instantiated and returned. %NULL is returned if there is insufficient + * memory or the inode passed is %NULL. 
+ */ + +struct dentry * d_alloc_root(struct inode * root_inode) +{ + struct dentry *res = NULL; + + if (root_inode) { + static const struct qstr name = { .name = "/", .len = 1 }; + + res = d_alloc(NULL, &name); + if (res) { + res->d_sb = root_inode->i_sb; + d_set_d_op(res, res->d_sb->s_d_op); + res->d_parent = res; + d_instantiate(res, root_inode); + } + } + return res; +} +EXPORT_SYMBOL(d_alloc_root); + +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + __dget(alias); + return alias; +} + +static struct dentry * d_find_any_alias(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} + + +/** + * d_obtain_alias - find or allocate a dentry for a given inode + * @inode: inode to allocate the dentry for + * + * Obtain a dentry for an inode resulting from NFS filehandle conversion or + * similar open by handle operations. The returned dentry may be anonymous, + * or may have a full name (if the inode was already in the cache). + * + * When called on a directory inode, we must ensure that the inode only ever + * has one dentry. If a dentry is found, that is returned instead of + * allocating a new one. + * + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is released. + * To make it easier to use in export operations a %NULL or IS_ERR inode may + * be passed in and will be the error will be propagate to the return value, + * with a %NULL @inode replaced by ERR_PTR(-ESTALE). + */ +struct dentry *d_obtain_alias(struct inode *inode) +{ + static const struct qstr anonstring = { .name = "" }; + struct dentry *tmp; + struct dentry *res; + + if (!inode) + return ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + res = d_find_any_alias(inode); + if (res) + goto out_iput; + + tmp = d_alloc(NULL, &anonstring); + if (!tmp) { + res = ERR_PTR(-ENOMEM); + goto out_iput; + } + tmp->d_parent = tmp; /* make sure dput doesn't croak */ + + + spin_lock(&inode->i_lock); + res = __d_find_any_alias(inode); + if (res) { + spin_unlock(&inode->i_lock); + dput(tmp); + goto out_iput; + } + + /* attach a disconnected dentry */ + spin_lock(&tmp->d_lock); + tmp->d_sb = inode->i_sb; + d_set_d_op(tmp, tmp->d_sb->s_d_op); + tmp->d_inode = inode; + tmp->d_flags |= DCACHE_DISCONNECTED; + list_add(&tmp->d_alias, &inode->i_dentry); + hlist_bl_lock(&tmp->d_sb->s_anon); + hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); + hlist_bl_unlock(&tmp->d_sb->s_anon); + spin_unlock(&tmp->d_lock); + spin_unlock(&inode->i_lock); + security_d_instantiate(tmp, inode); + + return tmp; + + out_iput: + if (res && !IS_ERR(res)) + security_d_instantiate(res, inode); + iput(inode); + return res; +} +EXPORT_SYMBOL(d_obtain_alias); + +/** + * d_splice_alias - splice a disconnected dentry into the tree if one exists + * @inode: the inode which may have a disconnected dentry + * @dentry: a negative dentry which we want to point to the inode. + * + * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and + * DCACHE_DISCONNECTED), then d_move that in place of the given dentry + * and return it, else simply d_add the inode to the dentry and return NULL. 
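[Editor's aside] d_obtain_alias() above reports failure through its return pointer: a NULL inode becomes ERR_PTR(-ESTALE), an IS_ERR() inode is passed straight through, and allocation failure becomes ERR_PTR(-ENOMEM). A minimal userspace sketch of that pointer-encoded error convention (simplified stand-ins for the helpers in linux/err.h, not the kernel's own definitions):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

/* Encode a small negative errno in an (invalid) pointer value. */
static inline void *err_ptr(long error)     { return (void *)error; }
static inline long ptr_err(const void *ptr) { return (long)ptr; }
static inline int is_err(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *lookup_thing(int exists)
{
    static int thing = 42;

    if (!exists)
        return err_ptr(-ENOENT);      /* the error travels inside the pointer */
    return &thing;
}

int main(void)
{
    void *p = lookup_thing(0);

    if (is_err(p))
        printf("lookup failed: %ld\n", ptr_err(p));
    else
        printf("value: %d\n", *(int *)p);
    return 0;
}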
+ * + * This is needed in the lookup routine of any filesystem that is exportable + * (via knfsd) so that we can build dcache paths to directories effectively. + * + * If a dentry was found and moved, then it is returned. Otherwise NULL + * is returned. This matches the expected return value of ->lookup. + * + */ +struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *new = NULL; + + if (inode && S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); + new = __d_find_alias(inode, 1); + if (new) { + BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); + spin_unlock(&inode->i_lock); + security_d_instantiate(new, inode); + d_move(new, dentry); + iput(inode); + } else { + /* already taking inode->i_lock, so d_add() by hand */ + __d_instantiate(dentry, inode); + spin_unlock(&inode->i_lock); + security_d_instantiate(dentry, inode); + d_rehash(dentry); + } + } else + d_add(dentry, inode); + return new; +} +EXPORT_SYMBOL(d_splice_alias); + +/** + * d_add_ci - lookup or allocate new dentry with case-exact name + * @inode: the inode case-insensitive lookup has found + * @dentry: the negative dentry that was passed to the parent's lookup func + * @name: the case-exact name to be associated with the returned dentry + * + * This is to avoid filling the dcache with case-insensitive names to the + * same inode, only the actual correct case is stored in the dcache for + * case-insensitive filesystems. + * + * For a case-insensitive lookup match and if the the case-exact dentry + * already exists in in the dcache, use it and return it. + * + * If no entry exists with the exact case name, allocate new dentry with + * the exact case, and return the spliced entry. + */ +struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, + struct qstr *name) +{ + int error; + struct dentry *found; + struct dentry *new; + + /* + * First check if a dentry matching the name already exists, + * if not go ahead and create it now. + */ + found = d_hash_and_lookup(dentry->d_parent, name); + if (!found) { + new = d_alloc(dentry->d_parent, name); + if (!new) { + error = -ENOMEM; + goto err_out; + } + + found = d_splice_alias(inode, new); + if (found) { + dput(new); + return found; + } + return new; + } + + /* + * If a matching dentry exists, and it's not negative use it. + * + * Decrement the reference count to balance the iget() done + * earlier on. + */ + if (found->d_inode) { + if (unlikely(found->d_inode != inode)) { + /* This can't happen because bad inodes are unhashed. */ + BUG_ON(!is_bad_inode(inode)); + BUG_ON(!is_bad_inode(found->d_inode)); + } + iput(inode); + return found; + } + + /* + * Negative dentry: instantiate it unless the inode is a directory and + * already has a dentry. + */ + spin_lock(&inode->i_lock); + if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { + __d_instantiate(found, inode); + spin_unlock(&inode->i_lock); + security_d_instantiate(found, inode); + return found; + } + + /* + * In case a directory already has a (disconnected) entry grab a + * reference to it, move it in place and use it. 
+ */ + new = list_entry(inode->i_dentry.next, struct dentry, d_alias); + __dget(new); + spin_unlock(&inode->i_lock); + security_d_instantiate(found, inode); + d_move(new, found); + iput(inode); + dput(found); + return new; + +err_out: + iput(inode); + return ERR_PTR(error); +} +EXPORT_SYMBOL(d_add_ci); + +/** + * __d_lookup_rcu - search for a dentry (racy, store-free) + * @parent: parent dentry + * @name: qstr of name we wish to find + * @seq: returns d_seq value at the point where the dentry was found + * @inode: returns dentry->d_inode when the inode was found valid. + * Returns: dentry, or NULL + * + * __d_lookup_rcu is the dcache lookup function for rcu-walk name + * resolution (store-free path walking) design described in + * Documentation/filesystems/path-lookup.txt. + * + * This is not to be used outside core vfs. + * + * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock + * held, and rcu_read_lock held. The returned dentry must not be stored into + * without taking d_lock and checking d_seq sequence count against @seq + * returned here. + * + * A refcount may be taken on the found dentry with the __d_rcu_to_refcount + * function. + * + * Alternatively, __d_lookup_rcu may be called again to look up the child of + * the returned dentry, so long as its parent's seqlock is checked after the + * child is looked up. Thus, an interlocking stepping of sequence lock checks + * is formed, giving integrity down the path walk. + */ +struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, + unsigned *seq, struct inode **inode) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_bl_head *b = d_hash(parent, hash); + struct hlist_bl_node *node; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Carefully use d_seq when comparing a candidate dentry, to avoid + * races with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/filesystems/path-lookup.txt for more details. + */ + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + struct inode *i; + const char *tname; + int tlen; + + if (dentry->d_name.hash != hash) + continue; + +seqretry: + *seq = read_seqcount_begin(&dentry->d_seq); + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + i = dentry->d_inode; + prefetch(tname); + /* + * This seqcount check is required to ensure name and + * len are loaded atomically, so as not to walk off the + * edge of memory when walking. If we could load this + * atomically some other way, we could drop this check. + */ + if (read_seqcount_retry(&dentry->d_seq, *seq)) + goto seqretry; + if (parent->d_flags & DCACHE_OP_COMPARE) { + if (parent->d_op->d_compare(parent, *inode, + dentry, i, + tlen, tname, name)) + continue; + } else { + if (dentry_cmp(tname, tlen, str, len)) + continue; + } + /* + * No extra seqcount check is required after the name + * compare. 
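[Editor's aside] __d_lookup_rcu() above (like __d_lookup() further down) walks a single hash chain and rejects candidates cheaply by comparing the precomputed hash before doing the byte-wise name compare. A self-contained sketch of that hash-then-compare chain walk; the hash function is a stand-in, not the kernel's full_name_hash():

#include <stdio.h>
#include <string.h>

#define NBUCKETS 8                    /* power of two so the hash can be masked */

struct dent {
    struct dent *next;                /* hash chain link */
    unsigned int hash;                /* precomputed name hash */
    const char *name;
};

static struct dent *buckets[NBUCKETS];

static unsigned int name_hash(const char *s)
{
    unsigned int h = 0;

    while (*s)
        h = h * 31 + (unsigned char)*s++;   /* stand-in hash */
    return h;
}

static void insert(struct dent *d, const char *name)
{
    unsigned int b;

    d->name = name;
    d->hash = name_hash(name);
    b = d->hash & (NBUCKETS - 1);
    d->next = buckets[b];
    buckets[b] = d;
}

static struct dent *lookup(const char *name)
{
    unsigned int hash = name_hash(name);
    struct dent *d;

    for (d = buckets[hash & (NBUCKETS - 1)]; d; d = d->next) {
        if (d->hash != hash)          /* cheap rejection first */
            continue;
        if (strcmp(d->name, name))    /* full compare only on a hash match */
            continue;
        return d;
    }
    return NULL;
}

int main(void)
{
    struct dent a, b;

    insert(&a, "passwd");
    insert(&b, "hosts");
    printf("found: %s\n", lookup("hosts") ? lookup("hosts")->name : "(none)");
    return 0;
}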
The caller must perform a seqcount check in + * order to do anything useful with the returned dentry + * anyway. + */ + *inode = i; + return dentry; + } + return NULL; +} + +/** + * d_lookup - search for a dentry + * @parent: parent dentry + * @name: qstr of name we wish to find + * Returns: dentry, or NULL + * + * d_lookup searches the children of the parent dentry for the name in + * question. If the dentry is found its reference count is incremented and the + * dentry is returned. The caller must use dput to free the entry when it has + * finished using it. %NULL is returned if the dentry does not exist. + */ +struct dentry *d_lookup(struct dentry *parent, struct qstr *name) +{ + struct dentry *dentry; + unsigned seq; + + do { + seq = read_seqbegin(&rename_lock); + dentry = __d_lookup(parent, name); + if (dentry) + break; + } while (read_seqretry(&rename_lock, seq)); + return dentry; +} +EXPORT_SYMBOL(d_lookup); + +/** + * __d_lookup - search for a dentry (racy) + * @parent: parent dentry + * @name: qstr of name we wish to find + * Returns: dentry, or NULL + * + * __d_lookup is like d_lookup, however it may (rarely) return a + * false-negative result due to unrelated rename activity. + * + * __d_lookup is slightly faster by avoiding rename_lock read seqlock, + * however it must be used carefully, eg. with a following d_lookup in + * the case of failure. + * + * __d_lookup callers must be commented. + */ +struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_bl_head *b = d_hash(parent, hash); + struct hlist_bl_node *node; + struct dentry *found = NULL; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Take d_lock when comparing a candidate dentry, to avoid races + * with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/filesystems/path-lookup.txt for more details. + */ + rcu_read_lock(); + + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + const char *tname; + int tlen; + + if (dentry->d_name.hash != hash) + continue; + + spin_lock(&dentry->d_lock); + if (dentry->d_parent != parent) + goto next; + if (d_unhashed(dentry)) + goto next; + + /* + * It is safe to compare names since d_move() cannot + * change the qstr (protected by d_lock). + */ + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + if (parent->d_flags & DCACHE_OP_COMPARE) { + if (parent->d_op->d_compare(parent, parent->d_inode, + dentry, dentry->d_inode, + tlen, tname, name)) + goto next; + } else { + if (dentry_cmp(tname, tlen, str, len)) + goto next; + } + + dentry->d_count++; + found = dentry; + spin_unlock(&dentry->d_lock); + break; +next: + spin_unlock(&dentry->d_lock); + } + rcu_read_unlock(); + + return found; +} + +/** + * d_hash_and_lookup - hash the qstr then search for a dentry + * @dir: Directory to search in + * @name: qstr of name we wish to find + * + * On hash failure or on lookup failure NULL is returned. 
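[Editor's aside] Both __d_lookup_rcu() above (via d_seq) and d_lookup() (via rename_lock) use the same read pattern: sample a sequence counter, copy what is needed, then re-check the counter and retry if a writer ran in between. A minimal C11 sketch of that seqcount-style read retry, simplified from the kernel's seqcount/seqlock primitives:

#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

struct record {
    atomic_uint seq;                  /* even = stable, odd = write in progress */
    char name[32];
};

/* Writer side: make seq odd, update the payload, make it even again. */
static void record_set(struct record *r, const char *name)
{
    atomic_fetch_add(&r->seq, 1);
    strncpy(r->name, name, sizeof(r->name) - 1);
    r->name[sizeof(r->name) - 1] = '\0';
    atomic_fetch_add(&r->seq, 1);
}

/* Reader side: retry until an undisturbed snapshot has been copied. */
static void record_get(struct record *r, char *out, size_t outlen)
{
    unsigned int start;

    do {
        start = atomic_load(&r->seq);
        memcpy(out, r->name, outlen);
    } while ((start & 1) || start != atomic_load(&r->seq));
}

int main(void)
{
    struct record r = { .name = "alpha" };
    char buf[sizeof(r.name)];

    record_set(&r, "beta");
    record_get(&r, buf, sizeof(buf));
    printf("%s\n", buf);
    return 0;
}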
+ */ +struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) +{ + struct dentry *dentry = NULL; + + /* + * Check for a fs-specific hash function. Note that we must + * calculate the standard hash first, as the d_op->d_hash() + * routine may choose to leave the hash value unchanged. + */ + name->hash = full_name_hash(name->name, name->len); + if (dir->d_flags & DCACHE_OP_HASH) { + if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0) + goto out; + } + dentry = d_lookup(dir, name); +out: + return dentry; +} + +/** + * d_validate - verify dentry provided from insecure source (deprecated) + * @dentry: The dentry alleged to be valid child of @dparent + * @dparent: The parent dentry (known to be valid) + * + * An insecure source has sent us a dentry, here we verify it and dget() it. + * This is used by ncpfs in its readdir implementation. + * Zero is returned in the dentry is invalid. + * + * This function is slow for big directories, and deprecated, do not use it. + */ +int d_validate(struct dentry *dentry, struct dentry *dparent) +{ + struct dentry *child; + + spin_lock(&dparent->d_lock); + list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) { + if (dentry == child) { + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + __dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&dparent->d_lock); + return 1; + } + } + spin_unlock(&dparent->d_lock); + + return 0; +} +EXPORT_SYMBOL(d_validate); + +/* + * When a file is deleted, we have two options: + * - turn this dentry into a negative dentry + * - unhash this dentry and free it. + * + * Usually, we want to just turn this into + * a negative dentry, but if anybody else is + * currently using the dentry or the inode + * we can't do that and we fall back on removing + * it from the hash queues and waiting for + * it to be deleted later when it has no users + */ + +/** + * d_delete - delete a dentry + * @dentry: The dentry to delete + * + * Turn the dentry into a negative dentry if possible, otherwise + * remove it from the hash queues so it can be deleted later + */ + +void d_delete(struct dentry * dentry) +{ + struct inode *inode; + int isdir = 0; + /* + * Are we the only user? + */ +again: + spin_lock(&dentry->d_lock); + inode = dentry->d_inode; + isdir = S_ISDIR(inode->i_mode); + if (dentry->d_count == 1) { + if (inode && !spin_trylock(&inode->i_lock)) { + spin_unlock(&dentry->d_lock); + cpu_relax(); + goto again; + } + dentry->d_flags &= ~DCACHE_CANT_MOUNT; + dentry_unlink_inode(dentry); + fsnotify_nameremove(dentry, isdir); + return; + } + + if (!d_unhashed(dentry)) + __d_drop(dentry); + + spin_unlock(&dentry->d_lock); + + fsnotify_nameremove(dentry, isdir); +} +EXPORT_SYMBOL(d_delete); + +static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) +{ + BUG_ON(!d_unhashed(entry)); + hlist_bl_lock(b); + entry->d_flags |= DCACHE_RCUACCESS; + hlist_bl_add_head_rcu(&entry->d_hash, b); + hlist_bl_unlock(b); +} + +static void _d_rehash(struct dentry * entry) +{ + __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash)); +} + +/** + * d_rehash - add an entry back to the hash + * @entry: dentry to add to the hash + * + * Adds a dentry to the hash according to its name. 
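[Editor's aside] d_delete() above takes d_lock first but also needs i_lock; rather than impose an ordering it uses spin_trylock() and, if the inode lock is contended, drops d_lock, calls cpu_relax() and starts over. A small pthreads sketch of that trylock-and-retry idiom (lock names are hypothetical):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/*
 * Take lock_a then lock_b without imposing a global ordering: if lock_b is
 * contended, back off completely and retry, so a thread that holds lock_b
 * and is waiting for lock_a can make progress.
 */
static void lock_both(void)
{
    for (;;) {
        pthread_mutex_lock(&lock_a);
        if (pthread_mutex_trylock(&lock_b) == 0)
            return;                   /* got both */
        pthread_mutex_unlock(&lock_a);
        sched_yield();                /* analogous to cpu_relax() */
    }
}

int main(void)
{
    lock_both();
    puts("holding both locks");
    pthread_mutex_unlock(&lock_b);
    pthread_mutex_unlock(&lock_a);
    return 0;
}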
+ */ + +void d_rehash(struct dentry * entry) +{ + spin_lock(&entry->d_lock); + _d_rehash(entry); + spin_unlock(&entry->d_lock); +} +EXPORT_SYMBOL(d_rehash); + +/** + * dentry_update_name_case - update case insensitive dentry with a new name + * @dentry: dentry to be updated + * @name: new name + * + * Update a case insensitive dentry with new case of name. + * + * dentry must have been returned by d_lookup with name @name. Old and new + * name lengths must match (ie. no d_compare which allows mismatched name + * lengths). + * + * Parent inode i_mutex must be held over d_lookup and into this call (to + * keep renames and concurrent inserts, and readdir(2) away). + */ +void dentry_update_name_case(struct dentry *dentry, struct qstr *name) +{ + BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); + BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ + + spin_lock(&dentry->d_lock); + write_seqcount_begin(&dentry->d_seq); + memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); + write_seqcount_end(&dentry->d_seq); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(dentry_update_name_case); + +static void switch_names(struct dentry *dentry, struct dentry *target) +{ + if (dname_external(target)) { + if (dname_external(dentry)) { + /* + * Both external: swap the pointers + */ + swap(target->d_name.name, dentry->d_name.name); + } else { + /* + * dentry:internal, target:external. Steal target's + * storage and make target internal. + */ + memcpy(target->d_iname, dentry->d_name.name, + dentry->d_name.len + 1); + dentry->d_name.name = target->d_name.name; + target->d_name.name = target->d_iname; + } + } else { + if (dname_external(dentry)) { + /* + * dentry:external, target:internal. Give dentry's + * storage to target and make dentry internal + */ + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + target->d_name.name = dentry->d_name.name; + dentry->d_name.name = dentry->d_iname; + } else { + /* + * Both are internal. Just copy target to dentry + */ + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + dentry->d_name.len = target->d_name.len; + return; + } + } + swap(dentry->d_name.len, target->d_name.len); +} + +static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) +{ + /* + * XXXX: do we really need to take target->d_lock? + */ + if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent) + spin_lock(&target->d_parent->d_lock); + else { + if (d_ancestor(dentry->d_parent, target->d_parent)) { + spin_lock(&dentry->d_parent->d_lock); + spin_lock_nested(&target->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } else { + spin_lock(&target->d_parent->d_lock); + spin_lock_nested(&dentry->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } + } + if (target < dentry) { + spin_lock_nested(&target->d_lock, 2); + spin_lock_nested(&dentry->d_lock, 3); + } else { + spin_lock_nested(&dentry->d_lock, 2); + spin_lock_nested(&target->d_lock, 3); + } +} + +static void dentry_unlock_parents_for_move(struct dentry *dentry, + struct dentry *target) +{ + if (target->d_parent != dentry->d_parent) + spin_unlock(&dentry->d_parent->d_lock); + if (target->d_parent != target) + spin_unlock(&target->d_parent->d_lock); +} + +/* + * When switching names, the actual string doesn't strictly have to + * be preserved in the target - because we're dropping the target + * anyway. As such, we can just do a simple memcpy() to copy over + * the new name before we switch. 
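[Editor's aside] dentry_lock_for_move() above has to lock two dentries with no natural hierarchy between them, so it picks an arbitrary but globally consistent order: the lower pointer is locked first. A brief pthreads sketch of address-ordered locking, reduced to two plain mutexes:

#include <pthread.h>
#include <stdio.h>

struct obj {
    pthread_mutex_t lock;
    int value;
};

/* Lock two objects in a globally consistent (address) order, as the kernel does. */
static void lock_pair(struct obj *a, struct obj *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
        return;
    }
    if (a > b) {                      /* normalize so a is the lower address */
        struct obj *tmp = a;
        a = b;
        b = tmp;
    }
    pthread_mutex_lock(&a->lock);
    pthread_mutex_lock(&b->lock);
}

int main(void)
{
    struct obj x = { PTHREAD_MUTEX_INITIALIZER, 1 };
    struct obj y = { PTHREAD_MUTEX_INITIALIZER, 2 };

    lock_pair(&y, &x);                /* the same order is taken regardless of argument order */
    printf("%d %d\n", x.value, y.value);
    pthread_mutex_unlock(&x.lock);
    pthread_mutex_unlock(&y.lock);
    return 0;
}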
+ * + * Note that we have to be a lot more careful about getting the hash + * switched - we have to switch the hash value properly even if it + * then no longer matches the actual (corrupted) string of the target. + * The hash value has to match the hash queue that the dentry is on.. + */ +/* + * __d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. Caller hold + * rename_lock. + */ +static void __d_move(struct dentry * dentry, struct dentry * target) +{ + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + + BUG_ON(d_ancestor(dentry, target)); + BUG_ON(d_ancestor(target, dentry)); + + dentry_lock_for_move(dentry, target); + + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&target->d_seq); + + /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ + + /* + * Move the dentry to the target hash queue. Don't bother checking + * for the same hash queue because of how unlikely it is. + */ + __d_drop(dentry); + __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); + + /* Unhash the target: dput() will then get rid of it */ + __d_drop(target); + + list_del(&dentry->d_u.d_child); + list_del(&target->d_u.d_child); + + /* Switch the names.. */ + switch_names(dentry, target); + swap(dentry->d_name.hash, target->d_name.hash); + + /* ... and switch the parents */ + if (IS_ROOT(dentry)) { + dentry->d_parent = target->d_parent; + target->d_parent = target; + INIT_LIST_HEAD(&target->d_u.d_child); + } else { + swap(dentry->d_parent, target->d_parent); + + /* And add them back to the (new) parent lists */ + list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); + } + + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + + write_seqcount_end(&target->d_seq); + write_seqcount_end(&dentry->d_seq); + + dentry_unlock_parents_for_move(dentry, target); + spin_unlock(&target->d_lock); + fsnotify_d_move(dentry); + spin_unlock(&dentry->d_lock); +} + +/* + * d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. + */ +void d_move(struct dentry *dentry, struct dentry *target) +{ + write_seqlock(&rename_lock); + __d_move(dentry, target); + write_sequnlock(&rename_lock); +} +EXPORT_SYMBOL(d_move); + +/** + * d_ancestor - search for an ancestor + * @p1: ancestor dentry + * @p2: child dentry + * + * Returns the ancestor dentry of p2 which is a child of p1, if p1 is + * an ancestor of p2, else NULL. + */ +struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) +{ + struct dentry *p; + + for (p = p2; !IS_ROOT(p); p = p->d_parent) { + if (p->d_parent == p1) + return p; + } + return NULL; +} + +/* + * This helper attempts to cope with remotely renamed directories + * + * It assumes that the caller is already holding + * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock + * + * Note: If ever the locking in lock_rename() changes, then please + * remember to update this too... 
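[Editor's aside] d_ancestor() above answers "is p1 an ancestor of p2, and through which child" by walking p2's d_parent chain until it hits a root (a dentry that is its own parent). The same walk over a toy parent-linked tree:

#include <stdio.h>
#include <stddef.h>

struct node {
    struct node *parent;              /* a root points to itself, like IS_ROOT() */
    const char *name;
};

/* Return the child of @anc on the path down to @n, or NULL if @anc is not an ancestor. */
static struct node *ancestor_link(struct node *anc, struct node *n)
{
    struct node *p;

    for (p = n; p->parent != p; p = p->parent) {
        if (p->parent == anc)
            return p;
    }
    return NULL;
}

int main(void)
{
    struct node root = { &root, "/" };
    struct node usr  = { &root, "usr" };
    struct node bin  = { &usr,  "bin" };
    struct node *link = ancestor_link(&root, &bin);

    printf("child of / on the way to bin: %s\n", link ? link->name : "(none)");
    return 0;
}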
+ */ +static struct dentry *__d_unalias(struct inode *inode, + struct dentry *dentry, struct dentry *alias) +{ + struct mutex *m1 = NULL, *m2 = NULL; + struct dentry *ret; + + /* If alias and dentry share a parent, then no extra locks required */ + if (alias->d_parent == dentry->d_parent) + goto out_unalias; + + /* See lock_rename() */ + ret = ERR_PTR(-EBUSY); + if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) + goto out_err; + m1 = &dentry->d_sb->s_vfs_rename_mutex; + if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + goto out_err; + m2 = &alias->d_parent->d_inode->i_mutex; +out_unalias: + __d_move(alias, dentry); + ret = alias; +out_err: + spin_unlock(&inode->i_lock); + if (m2) + mutex_unlock(m2); + if (m1) + mutex_unlock(m1); + return ret; +} + +/* + * Prepare an anonymous dentry for life in the superblock's dentry tree as a + * named dentry in place of the dentry to be replaced. + * returns with anon->d_lock held! + */ +static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) +{ + struct dentry *dparent, *aparent; + + dentry_lock_for_move(anon, dentry); + + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&anon->d_seq); + + dparent = dentry->d_parent; + aparent = anon->d_parent; + + switch_names(dentry, anon); + swap(dentry->d_name.hash, anon->d_name.hash); + + dentry->d_parent = (aparent == anon) ? dentry : aparent; + list_del(&dentry->d_u.d_child); + if (!IS_ROOT(dentry)) + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + else + INIT_LIST_HEAD(&dentry->d_u.d_child); + + anon->d_parent = (dparent == dentry) ? anon : dparent; + list_del(&anon->d_u.d_child); + if (!IS_ROOT(anon)) + list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs); + else + INIT_LIST_HEAD(&anon->d_u.d_child); + + write_seqcount_end(&dentry->d_seq); + write_seqcount_end(&anon->d_seq); + + dentry_unlock_parents_for_move(anon, dentry); + spin_unlock(&dentry->d_lock); + + /* anon->d_lock still locked, returns locked */ + anon->d_flags &= ~DCACHE_DISCONNECTED; +} + +/** + * d_materialise_unique - introduce an inode into the tree + * @dentry: candidate dentry + * @inode: inode to bind to the dentry, to which aliases may be attached + * + * Introduces an dentry into the tree, substituting an extant disconnected + * root directory alias in its place if there is one + */ +struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) +{ + struct dentry *actual; + + BUG_ON(!d_unhashed(dentry)); + + if (!inode) { + actual = dentry; + __d_instantiate(dentry, NULL); + d_rehash(actual); + goto out_nolock; + } + + spin_lock(&inode->i_lock); + + if (S_ISDIR(inode->i_mode)) { + struct dentry *alias; + + /* Does an aliased dentry already exist? */ + alias = __d_find_alias(inode, 0); + if (alias) { + actual = alias; + write_seqlock(&rename_lock); + + if (d_ancestor(alias, dentry)) { + /* Check for loops */ + actual = ERR_PTR(-ELOOP); + spin_unlock(&inode->i_lock); + } else if (IS_ROOT(alias)) { + /* Is this an anonymous mountpoint that we + * could splice into our tree? */ + __d_materialise_dentry(dentry, alias); + write_sequnlock(&rename_lock); + __d_drop(alias); + goto found; + } else { + /* Nope, but we must(!) avoid directory + * aliasing. 
This drops inode->i_lock */ + actual = __d_unalias(inode, dentry, alias); + } + write_sequnlock(&rename_lock); + if (IS_ERR(actual)) + dput(alias); + goto out_nolock; + } + } + + /* Add a unique reference */ + actual = __d_instantiate_unique(dentry, inode); + if (!actual) + actual = dentry; + else + BUG_ON(!d_unhashed(actual)); + + spin_lock(&actual->d_lock); +found: + _d_rehash(actual); + spin_unlock(&actual->d_lock); + spin_unlock(&inode->i_lock); +out_nolock: + if (actual == dentry) { + security_d_instantiate(dentry, inode); + return NULL; + } + + iput(inode); + return actual; +} +EXPORT_SYMBOL_GPL(d_materialise_unique); + +static int prepend(char **buffer, int *buflen, const char *str, int namelen) +{ + *buflen -= namelen; + if (*buflen < 0) + return -ENAMETOOLONG; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + +static int prepend_name(char **buffer, int *buflen, struct qstr *name) +{ + return prepend(buffer, buflen, name->name, name->len); +} + +/** + * prepend_path - Prepend path string to a buffer + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry + * @buffer: pointer to the end of the buffer + * @buflen: pointer to buffer length + * + * Caller holds the rename_lock. + */ +static int prepend_path(const struct path *path, + const struct path *root, + char **buffer, int *buflen) +{ + struct dentry *dentry = path->dentry; + struct vfsmount *vfsmnt = path->mnt; + bool slash = false; + int error = 0; + + br_read_lock(vfsmount_lock); + while (dentry != root->dentry || vfsmnt != root->mnt) { + struct dentry * parent; + + if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { + /* Global root? */ + if (vfsmnt->mnt_parent == vfsmnt) { + goto global_root; + } + dentry = vfsmnt->mnt_mountpoint; + vfsmnt = vfsmnt->mnt_parent; + continue; + } + parent = dentry->d_parent; + prefetch(parent); + spin_lock(&dentry->d_lock); + error = prepend_name(buffer, buflen, &dentry->d_name); + spin_unlock(&dentry->d_lock); + if (!error) + error = prepend(buffer, buflen, "/", 1); + if (error) + break; + + slash = true; + dentry = parent; + } + + if (!error && !slash) + error = prepend(buffer, buflen, "/", 1); + +out: + br_read_unlock(vfsmount_lock); + return error; + +global_root: + /* + * Filesystems needing to implement special "root names" + * should do so with ->d_dname() + */ + if (IS_ROOT(dentry) && + (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) { + WARN(1, "Root dentry has weird name <%.*s>\n", + (int) dentry->d_name.len, dentry->d_name.name); + } + if (!slash) + error = prepend(buffer, buflen, "/", 1); + if (!error) + error = vfsmnt->mnt_ns ? 1 : 2; + goto out; +} + +/** + * __d_path - return the path of a dentry + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry + * @buf: buffer to return value in + * @buflen: buffer length + * + * Convert a dentry into an ASCII path name. + * + * Returns a pointer into the buffer or an error code if the + * path was too long. + * + * "buflen" should be positive. + * + * If the path is not reachable from the supplied root, return %NULL. 
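[Editor's aside] prepend_path() above builds the name back to front: the caller passes a pointer just past the end of the buffer, and each component plus a '/' is prepended while walking toward the root, so the finished string ends up right-aligned in the buffer. A minimal userspace sketch of that backwards construction (mount crossings and the rename_lock retry are omitted):

#include <stdio.h>
#include <string.h>

struct node {
    struct node *parent;              /* a root points to itself */
    const char *name;
};

static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
    *buflen -= namelen;
    if (*buflen < 0)
        return -1;                    /* would not fit: -ENAMETOOLONG in the kernel */
    *buffer -= namelen;
    memcpy(*buffer, str, namelen);
    return 0;
}

/* Build "/a/b/c" into the tail of @buf and return a pointer into it. */
static char *node_path(struct node *n, char *buf, int buflen)
{
    char *end = buf + buflen;
    int has_slash = 0;

    if (prepend(&end, &buflen, "", 1))            /* terminating NUL */
        return NULL;
    while (n->parent != n) {                      /* walk up until the root */
        if (prepend(&end, &buflen, n->name, strlen(n->name)) ||
            prepend(&end, &buflen, "/", 1))
            return NULL;
        has_slash = 1;
        n = n->parent;
    }
    if (!has_slash && prepend(&end, &buflen, "/", 1))   /* the root itself */
        return NULL;
    return end;                                   /* note: not the start of buf */
}

int main(void)
{
    struct node root = { &root, "" };
    struct node etc  = { &root, "etc" };
    struct node host = { &etc,  "hosts" };
    char buf[64];

    printf("%s\n", node_path(&host, buf, sizeof(buf)));   /* prints /etc/hosts */
    return 0;
}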
+ */ +char *__d_path(const struct path *path, + const struct path *root, + char *buf, int buflen) +{ + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error > 1) + error = -EINVAL; + if (error < 0) + return ERR_PTR(error); + return res; +} + +/* + * same as __d_path but appends "(deleted)" for unlinked files. + */ +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) +{ + prepend(buf, buflen, "\0", 1); + if (d_unlinked(path->dentry)) { + int error = prepend(buf, buflen, " (deleted)", 10); + if (error) + return error; + } + + return prepend_path(path, root, buf, buflen); +} + +static int prepend_unreachable(char **buffer, int *buflen) +{ + return prepend(buffer, buflen, "(unreachable)", 13); +} + +/** + * d_path - return the path of a dentry + * @path: path to report + * @buf: buffer to return value in + * @buflen: buffer length + * + * Convert a dentry into an ASCII path name. If the entry has been deleted + * the string " (deleted)" is appended. Note that this is ambiguous. + * + * Returns a pointer into the buffer or an error code if the path was + * too long. Note: Callers should use the returned pointer, not the passed + * in buffer, to use the name! The implementation often starts at an offset + * into the buffer, and may leave 0 bytes at the start. + * + * "buflen" should be positive. + */ +char *d_path(const struct path *path, char *buf, int buflen) +{ + char *res = buf + buflen; + struct path root; + int error; + + /* + * We have various synthetic filesystems that never get mounted. On + * these filesystems dentries are never used for lookup purposes, and + * thus don't need to be hashed. They also don't need a name until a + * user wants to identify the object in /proc/pid/fd/. The little hack + * below allows us to generate a name for these objects on demand: + */ + if (path->dentry->d_op && path->dentry->d_op->d_dname) + return path->dentry->d_op->d_dname(path->dentry, buf, buflen); + + get_fs_root(current->fs, &root); + write_seqlock(&rename_lock); + error = path_with_deleted(path, &root, &res, &buflen); + if (error < 0) + res = ERR_PTR(error); + write_sequnlock(&rename_lock); + path_put(&root); + return res; +} +EXPORT_SYMBOL(d_path); + +/** + * d_path_with_unreachable - return the path of a dentry + * @path: path to report + * @buf: buffer to return value in + * @buflen: buffer length + * + * The difference from d_path() is that this prepends "(unreachable)" + * to paths which are unreachable from the current process' root. 
+ */ +char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) +{ + char *res = buf + buflen; + struct path root; + int error; + + if (path->dentry->d_op && path->dentry->d_op->d_dname) + return path->dentry->d_op->d_dname(path->dentry, buf, buflen); + + get_fs_root(current->fs, &root); + write_seqlock(&rename_lock); + error = path_with_deleted(path, &root, &res, &buflen); + if (error > 0) + error = prepend_unreachable(&res, &buflen); + write_sequnlock(&rename_lock); + path_put(&root); + if (error) + res = ERR_PTR(error); + + return res; +} + +/* + * Helper function for dentry_operations.d_dname() members + */ +char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, + const char *fmt, ...) +{ + va_list args; + char temp[64]; + int sz; + + va_start(args, fmt); + sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; + va_end(args); + + if (sz > sizeof(temp) || sz > buflen) + return ERR_PTR(-ENAMETOOLONG); + + buffer += buflen - sz; + return memcpy(buffer, temp, sz); +} + +/* + * Write full pathname from the root of the filesystem into the buffer. + */ +static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *end = buf + buflen; + char *retval; + + prepend(&end, &buflen, "\0", 1); + if (buflen < 1) + goto Elong; + /* Get '/' right */ + retval = end-1; + *retval = '/'; + + while (!IS_ROOT(dentry)) { + struct dentry *parent = dentry->d_parent; + int error; + + prefetch(parent); + spin_lock(&dentry->d_lock); + error = prepend_name(&end, &buflen, &dentry->d_name); + spin_unlock(&dentry->d_lock); + if (error != 0 || prepend(&end, &buflen, "/", 1) != 0) + goto Elong; + + retval = end; + dentry = parent; + } + return retval; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) +{ + char *retval; + + write_seqlock(&rename_lock); + retval = __dentry_path(dentry, buf, buflen); + write_sequnlock(&rename_lock); + + return retval; +} +EXPORT_SYMBOL(dentry_path_raw); + +char *dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *p = NULL; + char *retval; + + write_seqlock(&rename_lock); + if (d_unlinked(dentry)) { + p = buf + buflen; + if (prepend(&p, &buflen, "//deleted", 10) != 0) + goto Elong; + buflen++; + } + retval = __dentry_path(dentry, buf, buflen); + write_sequnlock(&rename_lock); + if (!IS_ERR(retval) && p) + *p = '/'; /* restore '/' overriden with '\0' */ + return retval; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +/* + * NOTE! The user-level library version returns a + * character pointer. The kernel system call just + * returns the length of the buffer filled (which + * includes the ending '\0' character), or a negative + * error value. 
So libc would do something like + * + * char *getcwd(char * buf, size_t size) + * { + * int retval; + * + * retval = sys_getcwd(buf, size); + * if (retval >= 0) + * return buf; + * errno = -retval; + * return NULL; + * } + */ +SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) +{ + int error; + struct path pwd, root; + char *page = (char *) __get_free_page(GFP_USER); + + if (!page) + return -ENOMEM; + + get_fs_root_and_pwd(current->fs, &root, &pwd); + + error = -ENOENT; + write_seqlock(&rename_lock); + if (!d_unlinked(pwd.dentry)) { + unsigned long len; + char *cwd = page + PAGE_SIZE; + int buflen = PAGE_SIZE; + + prepend(&cwd, &buflen, "\0", 1); + error = prepend_path(&pwd, &root, &cwd, &buflen); + write_sequnlock(&rename_lock); + + if (error < 0) + goto out; + + /* Unreachable from current root */ + if (error > 0) { + error = prepend_unreachable(&cwd, &buflen); + if (error) + goto out; + } + + error = -ERANGE; + len = PAGE_SIZE + page - cwd; + if (len <= size) { + error = len; + if (copy_to_user(buf, cwd, len)) + error = -EFAULT; + } + } else { + write_sequnlock(&rename_lock); + } + +out: + path_put(&pwd); + path_put(&root); + free_page((unsigned long) page); + return error; +} + +/* + * Test whether new_dentry is a subdirectory of old_dentry. + * + * Trivially implemented using the dcache structure + */ + +/** + * is_subdir - is new dentry a subdirectory of old_dentry + * @new_dentry: new dentry + * @old_dentry: old dentry + * + * Returns 1 if new_dentry is a subdirectory of the parent (at any depth). + * Returns 0 otherwise. + * Caller must ensure that "new_dentry" is pinned before calling is_subdir() + */ + +int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) +{ + int result; + unsigned seq; + + if (new_dentry == old_dentry) + return 1; + + do { + /* for restarting inner loop in case of seq retry */ + seq = read_seqbegin(&rename_lock); + /* + * Need rcu_readlock to protect against the d_parent trashing + * due to d_move + */ + rcu_read_lock(); + if (d_ancestor(old_dentry, new_dentry)) + result = 1; + else + result = 0; + rcu_read_unlock(); + } while (read_seqretry(&rename_lock, seq)); + + return result; +} + +int path_is_under(struct path *path1, struct path *path2) +{ + struct vfsmount *mnt = path1->mnt; + struct dentry *dentry = path1->dentry; + int res; + + br_read_lock(vfsmount_lock); + if (mnt != path2->mnt) { + for (;;) { + if (mnt->mnt_parent == mnt) { + br_read_unlock(vfsmount_lock); + return 0; + } + if (mnt->mnt_parent == path2->mnt) + break; + mnt = mnt->mnt_parent; + } + dentry = mnt->mnt_mountpoint; + } + res = is_subdir(dentry, path2->dentry); + br_read_unlock(vfsmount_lock); + return res; +} +EXPORT_SYMBOL(path_is_under); + +void d_genocide(struct dentry *root) +{ + struct dentry *this_parent; + struct list_head *next; + unsigned seq; + int locked = 0; + + seq = read_seqbegin(&rename_lock); +again: + this_parent = root; + spin_lock(&this_parent->d_lock); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (d_unhashed(dentry) || !dentry->d_inode) { + spin_unlock(&dentry->d_lock); + continue; + } + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); 
+ goto repeat; + } + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + dentry->d_count--; + } + spin_unlock(&dentry->d_lock); + } + if (this_parent != root) { + struct dentry *child = this_parent; + if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { + this_parent->d_flags |= DCACHE_GENOCIDE; + this_parent->d_count--; + } + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) + goto rename_retry; + next = child->d_u.d_child.next; + goto resume; + } + spin_unlock(&this_parent->d_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); + return; + +rename_retry: + locked = 1; + write_seqlock(&rename_lock); + goto again; +} + +/** + * find_inode_number - check for dentry with name + * @dir: directory to check + * @name: Name to find. + * + * Check whether a dentry already exists for the given name, + * and return the inode number if it has an inode. Otherwise + * 0 is returned. + * + * This routine is used to post-process directory listings for + * filesystems using synthetic inode numbers, and is necessary + * to keep getcwd() working. + */ + +ino_t find_inode_number(struct dentry *dir, struct qstr *name) +{ + struct dentry * dentry; + ino_t ino = 0; + + dentry = d_hash_and_lookup(dir, name); + if (dentry) { + if (dentry->d_inode) + ino = dentry->d_inode->i_ino; + dput(dentry); + } + return ino; +} +EXPORT_SYMBOL(find_inode_number); + +static __initdata unsigned long dhash_entries; +static int __init set_dhash_entries(char *str) +{ + if (!str) + return 0; + dhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("dhash_entries=", set_dhash_entries); + +static void __init dcache_init_early(void) +{ + int loop; + + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_bl_head), + dhash_entries, + 13, + HASH_EARLY, + &d_hash_shift, + &d_hash_mask, + 0); + + for (loop = 0; loop < (1 << d_hash_shift); loop++) + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); +} + +static void __init dcache_init(void) +{ + int loop; + + /* + * A constructor could be added for stable state like the lists, + * but it is probably not worth it because of the cache nature + * of the dcache. 
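[Editor's aside] dcache_init_early()/dcache_init() above size the dentry hash with alloc_large_system_hash(), which hands back a power-of-two table described by d_hash_shift and d_hash_mask so lookups can mask rather than divide. A simplified sketch of deriving such a shift/mask pair from an entry-count hint (the real helper also scales the table with available memory):

#include <stdio.h>

/* Round an entry hint up to a power of two and report the shift and mask. */
static unsigned long hash_table_size(unsigned long hint, unsigned int *shift,
                                     unsigned long *mask)
{
    unsigned int s = 0;

    if (hint < 2)
        hint = 2;
    while ((1UL << s) < hint)
        s++;
    *shift = s;
    *mask = (1UL << s) - 1;
    return 1UL << s;
}

int main(void)
{
    unsigned int shift;
    unsigned long mask;
    unsigned long n = hash_table_size(5000, &shift, &mask);

    printf("%lu buckets, shift %u, mask %#lx\n", n, shift, mask);
    return 0;
}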
+ */ + dentry_cache = KMEM_CACHE(dentry, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + + register_shrinker(&dcache_shrinker); + + /* Hash may have been set up in dcache_init_early */ + if (!hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_bl_head), + dhash_entries, + 13, + 0, + &d_hash_shift, + &d_hash_mask, + 0); + + for (loop = 0; loop < (1 << d_hash_shift); loop++) + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); +} + +/* SLAB cache for __getname() consumers */ +struct kmem_cache *names_cachep __read_mostly; +EXPORT_SYMBOL(names_cachep); + +EXPORT_SYMBOL(d_genocide); + +void __init vfs_caches_init_early(void) +{ + dcache_init_early(); + inode_init_early(); +} + +void __init vfs_caches_init(unsigned long mempages) +{ + unsigned long reserve; + + /* Base hash sizes on available memory, with a reserve equal to + 150% of current kernel size */ + + reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); + mempages -= reserve; + + names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + dcache_init(); + inode_init(); + files_init(mempages); + mnt_init(); + bdev_cache_init(); + chrdev_init(); +} diff --git a/fs/dcookies.c b/fs/dcookies.c new file mode 100644 index 00000000..dda0dc70 --- /dev/null +++ b/fs/dcookies.c @@ -0,0 +1,345 @@ +/* + * dcookies.c + * + * Copyright 2002 John Levon + * + * Persistent cookie-path mappings. These are used by + * profilers to convert a per-task EIP value into something + * non-transitory that can be processed at a later date. + * This is done by locking the dentry/vfsmnt pair in the + * kernel until released by the tasks needing the persistent + * objects. The tag is simply an unsigned long that refers + * to the pair and can be looked up from userspace. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The dcookies are allocated from a kmem_cache and + * hashed onto a small number of lists. 
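[Editor's aside] The dcookies code beginning above hands userspace an opaque unsigned long that is simply the pinned dentry's address, remembered in a small hash table so it can later be validated and converted back into a path. A minimal userspace sketch of that pointer-as-cookie idea (bucket count and shift are hypothetical stand-ins for the values used in the code below):

#include <stdio.h>

#define COOKIE_BUCKETS 64             /* power of two, so we can mask */
#define ADDR_SHIFT 6                  /* stand-in for L1_CACHE_SHIFT */

struct cookie_entry {
    struct cookie_entry *next;
    void *object;                     /* the pinned object the cookie refers to */
};

static struct cookie_entry *table[COOKIE_BUCKETS];

static unsigned long make_cookie(void *object)
{
    return (unsigned long)object;     /* the address itself is the cookie */
}

static unsigned int cookie_hash(unsigned long cookie)
{
    return (cookie >> ADDR_SHIFT) & (COOKIE_BUCKETS - 1);
}

static void remember(struct cookie_entry *e, void *object)
{
    unsigned int h = cookie_hash(make_cookie(object));

    e->object = object;
    e->next = table[h];
    table[h] = e;
}

/* Validate a cookie from an untrusted source before dereferencing it. */
static void *lookup_cookie(unsigned long cookie)
{
    struct cookie_entry *e;

    for (e = table[cookie_hash(cookie)]; e; e = e->next)
        if (make_cookie(e->object) == cookie)
            return e->object;
    return NULL;
}

int main(void)
{
    static char payload[] = "/usr/bin/true";
    struct cookie_entry e;
    unsigned long cookie;

    remember(&e, payload);
    cookie = make_cookie(payload);
    printf("%s\n", (char *)lookup_cookie(cookie));
    return 0;
}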
None of the + * code here is particularly performance critical + */ +struct dcookie_struct { + struct path path; + struct list_head hash_list; +}; + +static LIST_HEAD(dcookie_users); +static DEFINE_MUTEX(dcookie_mutex); +static struct kmem_cache *dcookie_cache __read_mostly; +static struct list_head *dcookie_hashtable __read_mostly; +static size_t hash_size __read_mostly; + +static inline int is_live(void) +{ + return !(list_empty(&dcookie_users)); +} + + +/* The dentry is locked, its address will do for the cookie */ +static inline unsigned long dcookie_value(struct dcookie_struct * dcs) +{ + return (unsigned long)dcs->path.dentry; +} + + +static size_t dcookie_hash(unsigned long dcookie) +{ + return (dcookie >> L1_CACHE_SHIFT) & (hash_size - 1); +} + + +static struct dcookie_struct * find_dcookie(unsigned long dcookie) +{ + struct dcookie_struct *found = NULL; + struct dcookie_struct * dcs; + struct list_head * pos; + struct list_head * list; + + list = dcookie_hashtable + dcookie_hash(dcookie); + + list_for_each(pos, list) { + dcs = list_entry(pos, struct dcookie_struct, hash_list); + if (dcookie_value(dcs) == dcookie) { + found = dcs; + break; + } + } + + return found; +} + + +static void hash_dcookie(struct dcookie_struct * dcs) +{ + struct list_head * list = dcookie_hashtable + dcookie_hash(dcookie_value(dcs)); + list_add(&dcs->hash_list, list); +} + + +static struct dcookie_struct *alloc_dcookie(struct path *path) +{ + struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, + GFP_KERNEL); + struct dentry *d; + if (!dcs) + return NULL; + + d = path->dentry; + spin_lock(&d->d_lock); + d->d_flags |= DCACHE_COOKIE; + spin_unlock(&d->d_lock); + + dcs->path = *path; + path_get(path); + hash_dcookie(dcs); + return dcs; +} + + +/* This is the main kernel-side routine that retrieves the cookie + * value for a dentry/vfsmnt pair. + */ +int get_dcookie(struct path *path, unsigned long *cookie) +{ + int err = 0; + struct dcookie_struct * dcs; + + mutex_lock(&dcookie_mutex); + + if (!is_live()) { + err = -EINVAL; + goto out; + } + + if (path->dentry->d_flags & DCACHE_COOKIE) { + dcs = find_dcookie((unsigned long)path->dentry); + } else { + dcs = alloc_dcookie(path); + if (!dcs) { + err = -ENOMEM; + goto out; + } + } + + *cookie = dcookie_value(dcs); + +out: + mutex_unlock(&dcookie_mutex); + return err; +} + + +/* And here is where the userspace process can look up the cookie value + * to retrieve the path. + */ +SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) +{ + unsigned long cookie = (unsigned long)cookie64; + int err = -EINVAL; + char * kbuf; + char * path; + size_t pathlen; + struct dcookie_struct * dcs; + + /* we could leak path information to users + * without dir read permission without this + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&dcookie_mutex); + + if (!is_live()) { + err = -EINVAL; + goto out; + } + + if (!(dcs = find_dcookie(cookie))) + goto out; + + err = -ENOMEM; + kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kbuf) + goto out; + + /* FIXME: (deleted) ? 
*/ + path = d_path(&dcs->path, kbuf, PAGE_SIZE); + + mutex_unlock(&dcookie_mutex); + + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out_free; + } + + err = -ERANGE; + + pathlen = kbuf + PAGE_SIZE - path; + if (pathlen <= len) { + err = pathlen; + if (copy_to_user(buf, path, pathlen)) + err = -EFAULT; + } + +out_free: + kfree(kbuf); + return err; +out: + mutex_unlock(&dcookie_mutex); + return err; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len) +{ + return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len); +} +SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie); +#endif + +static int dcookie_init(void) +{ + struct list_head * d; + unsigned int i, hash_bits; + int err = -ENOMEM; + + dcookie_cache = kmem_cache_create("dcookie_cache", + sizeof(struct dcookie_struct), + 0, 0, NULL); + + if (!dcookie_cache) + goto out; + + dcookie_hashtable = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!dcookie_hashtable) + goto out_kmem; + + err = 0; + + /* + * Find the power-of-two list-heads that can fit into the allocation.. + * We don't guarantee that "sizeof(struct list_head)" is necessarily + * a power-of-two. + */ + hash_size = PAGE_SIZE / sizeof(struct list_head); + hash_bits = 0; + do { + hash_bits++; + } while ((hash_size >> hash_bits) != 0); + hash_bits--; + + /* + * Re-calculate the actual number of entries and the mask + * from the number of bits we can fit. + */ + hash_size = 1UL << hash_bits; + + /* And initialize the newly allocated array */ + d = dcookie_hashtable; + i = hash_size; + do { + INIT_LIST_HEAD(d); + d++; + i--; + } while (i); + +out: + return err; +out_kmem: + kmem_cache_destroy(dcookie_cache); + goto out; +} + + +static void free_dcookie(struct dcookie_struct * dcs) +{ + struct dentry *d = dcs->path.dentry; + + spin_lock(&d->d_lock); + d->d_flags &= ~DCACHE_COOKIE; + spin_unlock(&d->d_lock); + + path_put(&dcs->path); + kmem_cache_free(dcookie_cache, dcs); +} + + +static void dcookie_exit(void) +{ + struct list_head * list; + struct list_head * pos; + struct list_head * pos2; + struct dcookie_struct * dcs; + size_t i; + + for (i = 0; i < hash_size; ++i) { + list = dcookie_hashtable + i; + list_for_each_safe(pos, pos2, list) { + dcs = list_entry(pos, struct dcookie_struct, hash_list); + list_del(&dcs->hash_list); + free_dcookie(dcs); + } + } + + kfree(dcookie_hashtable); + kmem_cache_destroy(dcookie_cache); +} + + +struct dcookie_user { + struct list_head next; +}; + +struct dcookie_user * dcookie_register(void) +{ + struct dcookie_user * user; + + mutex_lock(&dcookie_mutex); + + user = kmalloc(sizeof(struct dcookie_user), GFP_KERNEL); + if (!user) + goto out; + + if (!is_live() && dcookie_init()) + goto out_free; + + list_add(&user->next, &dcookie_users); + +out: + mutex_unlock(&dcookie_mutex); + return user; +out_free: + kfree(user); + user = NULL; + goto out; +} + + +void dcookie_unregister(struct dcookie_user * user) +{ + mutex_lock(&dcookie_mutex); + + list_del(&user->next); + kfree(user); + + if (!is_live()) + dcookie_exit(); + + mutex_unlock(&dcookie_mutex); +} + +EXPORT_SYMBOL_GPL(dcookie_register); +EXPORT_SYMBOL_GPL(dcookie_unregister); +EXPORT_SYMBOL_GPL(get_dcookie); diff --git a/fs/debugfs/Makefile b/fs/debugfs/Makefile new file mode 100644 index 00000000..840c4569 --- /dev/null +++ b/fs/debugfs/Makefile @@ -0,0 +1,4 @@ +debugfs-objs := inode.o file.o + +obj-$(CONFIG_DEBUG_FS) += debugfs.o + diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c new file mode 100644 index 00000000..90f76575 --- 
/dev/null +++ b/fs/debugfs/file.c @@ -0,0 +1,527 @@ +/* + * file.c - part of debugfs, a tiny little debug file system + * + * Copyright (C) 2004 Greg Kroah-Hartman + * Copyright (C) 2004 IBM Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * debugfs is for people to use instead of /proc or /sys. + * See Documentation/DocBook/filesystems for more details. + * + */ + +#include +#include +#include +#include +#include + +static ssize_t default_read_file(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t default_write_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return count; +} + +static int default_open(struct inode *inode, struct file *file) +{ + if (inode->i_private) + file->private_data = inode->i_private; + + return 0; +} + +const struct file_operations debugfs_file_operations = { + .read = default_read_file, + .write = default_write_file, + .open = default_open, + .llseek = noop_llseek, +}; + +static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, dentry->d_inode->i_private); + return NULL; +} + +const struct inode_operations debugfs_link_operations = { + .readlink = generic_readlink, + .follow_link = debugfs_follow_link, +}; + +static int debugfs_u8_set(void *data, u64 val) +{ + *(u8 *)data = val; + return 0; +} +static int debugfs_u8_get(void *data, u64 *val) +{ + *val = *(u8 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); + +/** + * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. 
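[Editor's aside] debugfs_create_u8() documented above (and its u16/u32/u64 siblings below) is normally called from a driver's init path, keeping only the returned dentries for later removal. A hedged usage sketch as a minimal module; the directory name and exported variable are hypothetical, and errors are handled with the NULL check the comment recommends:

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/stat.h>

static struct dentry *demo_dir;
static u8 demo_threshold = 16;        /* value exposed read/write via debugfs */

static int __init demo_init(void)
{
    demo_dir = debugfs_create_dir("demo", NULL);   /* lands in the debugfs root */
    if (!demo_dir)
        return -ENODEV;

    /* <debugfs>/demo/threshold, readable by all, writable by root */
    if (!debugfs_create_u8("threshold", S_IRUGO | S_IWUSR, demo_dir,
                           &demo_threshold)) {
        debugfs_remove_recursive(demo_dir);
        return -ENODEV;
    }
    return 0;
}

static void __exit demo_exit(void)
{
    debugfs_remove_recursive(demo_dir);   /* no automatic cleanup, as the comment warns */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");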
+ */ +struct dentry *debugfs_create_u8(const char *name, mode_t mode, + struct dentry *parent, u8 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u8); +} +EXPORT_SYMBOL_GPL(debugfs_create_u8); + +static int debugfs_u16_set(void *data, u64 val) +{ + *(u16 *)data = val; + return 0; +} +static int debugfs_u16_get(void *data, u64 *val) +{ + *val = *(u16 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); + +/** + * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_u16(const char *name, mode_t mode, + struct dentry *parent, u16 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u16); +} +EXPORT_SYMBOL_GPL(debugfs_create_u16); + +static int debugfs_u32_set(void *data, u64 val) +{ + *(u32 *)data = val; + return 0; +} +static int debugfs_u32_get(void *data, u64 *val) +{ + *val = *(u32 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); + +/** + * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. 
If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_u32(const char *name, mode_t mode, + struct dentry *parent, u32 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u32); +} +EXPORT_SYMBOL_GPL(debugfs_create_u32); + +static int debugfs_u64_set(void *data, u64 val) +{ + *(u64 *)data = val; + return 0; +} + +static int debugfs_u64_get(void *data, u64 *val) +{ + *val = *(u64 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); + +/** + * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. 
+ */ +struct dentry *debugfs_create_u64(const char *name, mode_t mode, + struct dentry *parent, u64 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_u64); +} +EXPORT_SYMBOL_GPL(debugfs_create_u64); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); + +DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); + +/* + * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value + * + * These functions are exactly the same as the above functions (but use a hex + * output for the decimal challenged). For details look at the above unsigned + * decimal functions. + */ + +/** + * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x8(const char *name, mode_t mode, + struct dentry *parent, u8 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_x8); +} +EXPORT_SYMBOL_GPL(debugfs_create_x8); + +/** + * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. 
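/*
 * Editorial usage sketch, not part of the original patch: exporting a
 * hypothetical hardware status word in hex with the helpers above.
 * "mydrv_dir" and "mydrv_status" are made-up names.
 */
static u32 mydrv_status;

static void mydrv_debugfs_init(struct dentry *mydrv_dir)
{
	/* 0444 has no write bits set, so the read-only fops variant is used */
	debugfs_create_x32("status", 0444, mydrv_dir, &mydrv_status);
}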
+ */ +struct dentry *debugfs_create_x16(const char *name, mode_t mode, + struct dentry *parent, u16 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_x16); +} +EXPORT_SYMBOL_GPL(debugfs_create_x16); + +/** + * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x32(const char *name, mode_t mode, + struct dentry *parent, u32 *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_x32); +} +EXPORT_SYMBOL_GPL(debugfs_create_x32); + +/** + * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x64(const char *name, mode_t mode, + struct dentry *parent, u64 *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_x64); +} +EXPORT_SYMBOL_GPL(debugfs_create_x64); + + +static int debugfs_size_t_set(void *data, u64 val) +{ + *(size_t *)data = val; + return 0; +} +static int debugfs_size_t_get(void *data, u64 *val) +{ + *val = *(size_t *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, + "%llu\n"); /* %llu and %zu are more or less the same */ + +/** + * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. 
+ */ +struct dentry *debugfs_create_size_t(const char *name, mode_t mode, + struct dentry *parent, size_t *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_size_t); +} +EXPORT_SYMBOL_GPL(debugfs_create_size_t); + + +static ssize_t read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[3]; + u32 *val = file->private_data; + + if (*val) + buf[0] = 'Y'; + else + buf[0] = 'N'; + buf[1] = '\n'; + buf[2] = 0x00; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + size_t buf_size; + bool bv; + u32 *val = file->private_data; + + buf_size = min(count, (sizeof(buf)-1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + if (strtobool(buf, &bv) == 0) + *val = bv; + + return count; +} + +static const struct file_operations fops_bool = { + .read = read_file_bool, + .write = write_file_bool, + .open = default_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_bool(const char *name, mode_t mode, + struct dentry *parent, u32 *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_bool); +} +EXPORT_SYMBOL_GPL(debugfs_create_bool); + +static ssize_t read_file_blob(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct debugfs_blob_wrapper *blob = file->private_data; + return simple_read_from_buffer(user_buf, count, ppos, blob->data, + blob->size); +} + +static const struct file_operations fops_blob = { + .read = read_file_blob, + .open = default_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_blob - create a debugfs file that is used to read a binary blob + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer + * to the blob data and the size of the data. 
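/*
 * Editorial usage sketch, not part of the original patch: exposing a
 * version string as a read-only blob via struct debugfs_blob_wrapper.
 * All names below are hypothetical.
 */
static char example_fw_version[] = "fw-1.2.3";
static struct debugfs_blob_wrapper example_fw_blob = {
	.data = example_fw_version,
	.size = sizeof(example_fw_version) - 1,
};
/* then: debugfs_create_blob("fw_version", 0444, parent, &example_fw_blob); */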
+ * + * This function creates a file in debugfs with the given name that exports + * @blob->data as a binary blob. If the @mode variable is so set it can be + * read from. Writing is not supported. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_blob(const char *name, mode_t mode, + struct dentry *parent, + struct debugfs_blob_wrapper *blob) +{ + return debugfs_create_file(name, mode, parent, blob, &fops_blob); +} +EXPORT_SYMBOL_GPL(debugfs_create_blob); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c new file mode 100644 index 00000000..e7a7a2f0 --- /dev/null +++ b/fs/debugfs/inode.c @@ -0,0 +1,544 @@ +/* + * file.c - part of debugfs, a tiny little debug file system + * + * Copyright (C) 2004 Greg Kroah-Hartman + * Copyright (C) 2004 IBM Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * debugfs is for people to use instead of /proc or /sys. + * See Documentation/DocBook/kernel-api for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct vfsmount *debugfs_mount; +static int debugfs_mount_count; +static bool debugfs_registered; + +static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev, + void *data, const struct file_operations *fops) + +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_fop = fops ? fops : &debugfs_file_operations; + inode->i_private = data; + break; + case S_IFLNK: + inode->i_op = &debugfs_link_operations; + inode->i_fop = fops; + inode->i_private = data; + break; + case S_IFDIR: + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = fops ? fops : &simple_dir_operations; + inode->i_private = data; + + /* directory inodes start off with i_nlink == 2 + * (for "." 
entry) */ + inc_nlink(inode); + break; + } + } + return inode; +} + +/* SMP-safe */ +static int debugfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t dev, void *data, + const struct file_operations *fops) +{ + struct inode *inode; + int error = -EPERM; + + if (dentry->d_inode) + return -EEXIST; + + inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops); + if (inode) { + d_instantiate(dentry, inode); + dget(dentry); + error = 0; + } + return error; +} + +static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) +{ + int res; + + mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); + if (!res) { + inc_nlink(dir); + fsnotify_mkdir(dir, dentry); + } + return res; +} + +static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) +{ + mode = (mode & S_IALLUGO) | S_IFLNK; + return debugfs_mknod(dir, dentry, mode, 0, data, fops); +} + +static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) +{ + int res; + + mode = (mode & S_IALLUGO) | S_IFREG; + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); + if (!res) + fsnotify_create(dir, dentry); + return res; +} + +static inline int debugfs_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +static int debug_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr debug_files[] = {{""}}; + + return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); +} + +static struct dentry *debug_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_single(fs_type, flags, data, debug_fill_super); +} + +static struct file_system_type debug_fs_type = { + .owner = THIS_MODULE, + .name = "debugfs", + .mount = debug_mount, + .kill_sb = kill_litter_super, +}; + +static int debugfs_create_by_name(const char *name, mode_t mode, + struct dentry *parent, + struct dentry **dentry, + void *data, + const struct file_operations *fops) +{ + int error = 0; + + /* If the parent is not specified, we create it in the root. + * We need the root dentry to do this, which is in the super + * block. A pointer to that is in the struct vfsmount that we + * have around. + */ + if (!parent) + parent = debugfs_mount->mnt_sb->s_root; + + *dentry = NULL; + mutex_lock(&parent->d_inode->i_mutex); + *dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(*dentry)) { + switch (mode & S_IFMT) { + case S_IFDIR: + error = debugfs_mkdir(parent->d_inode, *dentry, mode, + data, fops); + break; + case S_IFLNK: + error = debugfs_link(parent->d_inode, *dentry, mode, + data, fops); + break; + default: + error = debugfs_create(parent->d_inode, *dentry, mode, + data, fops); + break; + } + dput(*dentry); + } else + error = PTR_ERR(*dentry); + mutex_unlock(&parent->d_inode->i_mutex); + + return error; +} + +/** + * debugfs_create_file - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this paramater is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. 
The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * + * This is the basic "create a file" function for debugfs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the debugfs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_file(const char *name, mode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) +{ + struct dentry *dentry = NULL; + int error; + + pr_debug("debugfs: creating file '%s'\n",name); + + error = simple_pin_fs(&debug_fs_type, &debugfs_mount, + &debugfs_mount_count); + if (error) + goto exit; + + error = debugfs_create_by_name(name, mode, parent, &dentry, + data, fops); + if (error) { + dentry = NULL; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + goto exit; + } +exit: + return dentry; +} +EXPORT_SYMBOL_GPL(debugfs_create_file); + +/** + * debugfs_create_dir - create a directory in the debugfs filesystem + * @name: a pointer to a string containing the name of the directory to + * create. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this paramater is NULL, then the + * directory will be created in the root of the debugfs filesystem. + * + * This function creates a directory in debugfs with the given name. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) +{ + return debugfs_create_file(name, + S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + parent, NULL, NULL); +} +EXPORT_SYMBOL_GPL(debugfs_create_dir); + +/** + * debugfs_create_symlink- create a symbolic link in the debugfs filesystem + * @name: a pointer to a string containing the name of the symbolic link to + * create. + * @parent: a pointer to the parent dentry for this symbolic link. This + * should be a directory dentry if set. If this paramater is NULL, + * then the symbolic link will be created in the root of the debugfs + * filesystem. + * @target: a pointer to a string containing the path to the target of the + * symbolic link. + * + * This function creates a symbolic link with the given name in debugfs that + * links to the given target path. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the symbolic + * link is to be removed (no automatic cleanup happens if your module is + * unloaded, you are responsible here.) If an error occurs, %NULL will be + * returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. 
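/*
 * Editorial usage sketch, not part of the original patch: a caller-supplied
 * file created with debugfs_create_file(), using the seq_file single_open()
 * helpers (assumes <linux/seq_file.h> and <linux/module.h>).  All names
 * below are hypothetical.
 */
static int example_show(struct seq_file *s, void *unused)
{
	seq_printf(s, "hello from debugfs\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	/* inode->i_private carries the @data pointer given at create time */
	return single_open(file, example_show, inode->i_private);
}

static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
/* then: debugfs_create_file("example", 0444, parent, NULL, &example_fops); */
/* and on teardown: debugfs_remove() (or debugfs_remove_recursive() on the
 * parent directory) -- there is no automatic cleanup at module unload */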
+ */ +struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, + const char *target) +{ + struct dentry *result; + char *link; + + link = kstrdup(target, GFP_KERNEL); + if (!link) + return NULL; + + result = debugfs_create_file(name, S_IFLNK | S_IRWXUGO, parent, link, + NULL); + if (!result) + kfree(link); + return result; +} +EXPORT_SYMBOL_GPL(debugfs_create_symlink); + +static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) +{ + int ret = 0; + + if (debugfs_positive(dentry)) { + if (dentry->d_inode) { + dget(dentry); + switch (dentry->d_inode->i_mode & S_IFMT) { + case S_IFDIR: + ret = simple_rmdir(parent->d_inode, dentry); + break; + case S_IFLNK: + kfree(dentry->d_inode->i_private); + /* fall through */ + default: + simple_unlink(parent->d_inode, dentry); + break; + } + if (!ret) + d_delete(dentry); + dput(dentry); + } + } + return ret; +} + +/** + * debugfs_remove - removes a file or directory from the debugfs filesystem + * @dentry: a pointer to a the dentry of the file or directory to be + * removed. + * + * This function removes a file or directory in debugfs that was previously + * created with a call to another debugfs function (like + * debugfs_create_file() or variants thereof.) + * + * This function is required to be called in order for the file to be + * removed, no automatic cleanup of files will happen when a module is + * removed, you are responsible here. + */ +void debugfs_remove(struct dentry *dentry) +{ + struct dentry *parent; + int ret; + + if (!dentry) + return; + + parent = dentry->d_parent; + if (!parent || !parent->d_inode) + return; + + mutex_lock(&parent->d_inode->i_mutex); + ret = __debugfs_remove(dentry, parent); + mutex_unlock(&parent->d_inode->i_mutex); + if (!ret) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); +} +EXPORT_SYMBOL_GPL(debugfs_remove); + +/** + * debugfs_remove_recursive - recursively removes a directory + * @dentry: a pointer to a the dentry of the directory to be removed. + * + * This function recursively removes a directory tree in debugfs that + * was previously created with a call to another debugfs function + * (like debugfs_create_file() or variants thereof.) + * + * This function is required to be called in order for the file to be + * removed, no automatic cleanup of files will happen when a module is + * removed, you are responsible here. + */ +void debugfs_remove_recursive(struct dentry *dentry) +{ + struct dentry *child; + struct dentry *parent; + + if (!dentry) + return; + + parent = dentry->d_parent; + if (!parent || !parent->d_inode) + return; + + parent = dentry; + mutex_lock(&parent->d_inode->i_mutex); + + while (1) { + /* + * When all dentries under "parent" has been removed, + * walk up the tree until we reach our starting point. + */ + if (list_empty(&parent->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + if (parent == dentry) + break; + parent = parent->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + } + child = list_entry(parent->d_subdirs.next, struct dentry, + d_u.d_child); + next_sibling: + + /* + * If "child" isn't empty, walk down the tree and + * remove all its descendants first. + */ + if (!list_empty(&child->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + parent = child; + mutex_lock(&parent->d_inode->i_mutex); + continue; + } + __debugfs_remove(child, parent); + if (parent->d_subdirs.next == &child->d_u.d_child) { + /* + * Try the next sibling. 
+ */ + if (child->d_u.d_child.next != &parent->d_subdirs) { + child = list_entry(child->d_u.d_child.next, + struct dentry, + d_u.d_child); + goto next_sibling; + } + + /* + * Avoid infinite loop if we fail to remove + * one dentry. + */ + mutex_unlock(&parent->d_inode->i_mutex); + break; + } + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + } + + parent = dentry->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + __debugfs_remove(dentry, parent); + mutex_unlock(&parent->d_inode->i_mutex); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); +} +EXPORT_SYMBOL_GPL(debugfs_remove_recursive); + +/** + * debugfs_rename - rename a file/directory in the debugfs filesystem + * @old_dir: a pointer to the parent dentry for the renamed object. This + * should be a directory dentry. + * @old_dentry: dentry of an object to be renamed. + * @new_dir: a pointer to the parent dentry where the object should be + * moved. This should be a directory dentry. + * @new_name: a pointer to a string containing the target name. + * + * This function renames a file/directory in debugfs. The target must not + * exist for rename to succeed. + * + * This function will return a pointer to old_dentry (which is updated to + * reflect renaming) if it succeeds. If an error occurs, %NULL will be + * returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, + struct dentry *new_dir, const char *new_name) +{ + int error; + struct dentry *dentry = NULL, *trap; + const char *old_name; + + trap = lock_rename(new_dir, old_dir); + /* Source or destination directories don't exist? */ + if (!old_dir->d_inode || !new_dir->d_inode) + goto exit; + /* Source does not exist, cyclic rename, or mountpoint? */ + if (!old_dentry->d_inode || old_dentry == trap || + d_mountpoint(old_dentry)) + goto exit; + dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); + /* Lookup failed, cyclic rename or target exists? */ + if (IS_ERR(dentry) || dentry == trap || dentry->d_inode) + goto exit; + + old_name = fsnotify_oldname_init(old_dentry->d_name.name); + + error = simple_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, + dentry); + if (error) { + fsnotify_oldname_free(old_name); + goto exit; + } + d_move(old_dentry, dentry); + fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, + S_ISDIR(old_dentry->d_inode->i_mode), + NULL, old_dentry); + fsnotify_oldname_free(old_name); + unlock_rename(new_dir, old_dir); + dput(dentry); + return old_dentry; +exit: + if (dentry && !IS_ERR(dentry)) + dput(dentry); + unlock_rename(new_dir, old_dir); + return NULL; +} +EXPORT_SYMBOL_GPL(debugfs_rename); + +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + + +static struct kobject *debug_kobj; + +static int __init debugfs_init(void) +{ + int retval; + + debug_kobj = kobject_create_and_add("debug", kernel_kobj); + if (!debug_kobj) + return -EINVAL; + + retval = register_filesystem(&debug_fs_type); + if (retval) + kobject_put(debug_kobj); + else + debugfs_registered = true; + + return retval; +} +core_initcall(debugfs_init); + diff --git a/fs/devpts/Makefile b/fs/devpts/Makefile new file mode 100644 index 00000000..236696ef --- /dev/null +++ b/fs/devpts/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux /dev/pts virtual filesystem. 
+# + +obj-$(CONFIG_UNIX98_PTYS) += devpts.o + +devpts-$(CONFIG_UNIX98_PTYS) := inode.o diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c new file mode 100644 index 00000000..2f27e578 --- /dev/null +++ b/fs/devpts/inode.c @@ -0,0 +1,572 @@ +/* -*- linux-c -*- --------------------------------------------------------- * + * + * linux/fs/devpts/inode.c + * + * Copyright 1998-2004 H. Peter Anvin -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEVPTS_DEFAULT_MODE 0600 +/* + * ptmx is a new node in /dev/pts and will be unused in legacy (single- + * instance) mode. To prevent surprises in user space, set permissions of + * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful + * permissions. + */ +#define DEVPTS_DEFAULT_PTMX_MODE 0000 +#define PTMX_MINOR 2 + +extern int pty_limit; /* Config limit on Unix98 ptys */ +static DEFINE_MUTEX(allocated_ptys_lock); + +static struct vfsmount *devpts_mnt; + +struct pts_mount_opts { + int setuid; + int setgid; + uid_t uid; + gid_t gid; + umode_t mode; + umode_t ptmxmode; + int newinstance; +}; + +enum { + Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%o"}, +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + {Opt_ptmxmode, "ptmxmode=%o"}, + {Opt_newinstance, "newinstance"}, +#endif + {Opt_err, NULL} +}; + +struct pts_fs_info { + struct ida allocated_ptys; + struct pts_mount_opts mount_opts; + struct dentry *ptmx_dentry; +}; + +static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct super_block *pts_sb_from_inode(struct inode *inode) +{ +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) + return inode->i_sb; +#endif + return devpts_mnt->mnt_sb; +} + +#define PARSE_MOUNT 0 +#define PARSE_REMOUNT 1 + +/* + * parse_mount_options(): + * Set @opts to mount options specified in @data. If an option is not + * specified in @data, set it to its default value. The exception is + * 'newinstance' option which can only be set/cleared on a mount (i.e. + * cannot be changed during remount). + * + * Note: @data may be NULL (in which case all options are set to default). 
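/*
 * Editorial example, not part of the original patch: a mount such as
 *
 *	mount -t devpts -o newinstance,mode=620,ptmxmode=666 devpts /dev/pts
 *
 * reaches parse_mount_options() as data = "newinstance,mode=620,ptmxmode=666"
 * and is parsed, roughly, into:
 *
 *	opts->mode        = 0620 & S_IALLUGO;
 *	opts->ptmxmode    = 0666 & S_IALLUGO;	(multiple-instance builds only)
 *	opts->newinstance = 1;			(honoured only when op == PARSE_MOUNT)
 *
 * with uid/gid left at their defaults because no uid=/gid= option was given.
 */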
+ */ +static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) +{ + char *p; + + opts->setuid = 0; + opts->setgid = 0; + opts->uid = 0; + opts->gid = 0; + opts->mode = DEVPTS_DEFAULT_MODE; + opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + + /* newinstance makes sense only on initial mount */ + if (op == PARSE_MOUNT) + opts->newinstance = 0; + + while ((p = strsep(&data, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + int option; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return -EINVAL; + opts->uid = option; + opts->setuid = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return -EINVAL; + opts->gid = option; + opts->setgid = 1; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + case Opt_ptmxmode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->ptmxmode = option & S_IALLUGO; + break; + case Opt_newinstance: + /* newinstance makes sense only on initial mount */ + if (op == PARSE_MOUNT) + opts->newinstance = 1; + break; +#endif + default: + printk(KERN_ERR "devpts: called with bogus options\n"); + return -EINVAL; + } + } + + return 0; +} + +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES +static int mknod_ptmx(struct super_block *sb) +{ + int mode; + int rc = -ENOMEM; + struct dentry *dentry; + struct inode *inode; + struct dentry *root = sb->s_root; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + mutex_lock(&root->d_inode->i_mutex); + + /* If we have already created ptmx node, return */ + if (fsi->ptmx_dentry) { + rc = 0; + goto out; + } + + dentry = d_alloc_name(root, "ptmx"); + if (!dentry) { + printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n"); + goto out; + } + + /* + * Create a new 'ptmx' node in this mount of devpts. + */ + inode = new_inode(sb); + if (!inode) { + printk(KERN_ERR "Unable to alloc inode for ptmx node\n"); + dput(dentry); + goto out; + } + + inode->i_ino = 2; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + mode = S_IFCHR|opts->ptmxmode; + init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); + + d_add(dentry, inode); + + fsi->ptmx_dentry = dentry; + rc = 0; +out: + mutex_unlock(&root->d_inode->i_mutex); + return rc; +} + +static void update_ptmx_mode(struct pts_fs_info *fsi) +{ + struct inode *inode; + if (fsi->ptmx_dentry) { + inode = fsi->ptmx_dentry->d_inode; + inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; + } +} +#else +static inline void update_ptmx_mode(struct pts_fs_info *fsi) +{ + return; +} +#endif + +static int devpts_remount(struct super_block *sb, int *flags, char *data) +{ + int err; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + err = parse_mount_options(data, PARSE_REMOUNT, opts); + + /* + * parse_mount_options() restores options to default values + * before parsing and may have changed ptmxmode. So, update the + * mode in the inode too. Bogus options don't fail the remount, + * so do this even on error return. 
+ */ + update_ptmx_mode(fsi); + + return err; +} + +static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + if (opts->setuid) + seq_printf(seq, ",uid=%u", opts->uid); + if (opts->setgid) + seq_printf(seq, ",gid=%u", opts->gid); + seq_printf(seq, ",mode=%03o", opts->mode); +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); +#endif + + return 0; +} + +static const struct super_operations devpts_sops = { + .statfs = simple_statfs, + .remount_fs = devpts_remount, + .show_options = devpts_show_options, +}; + +static void *new_pts_fs_info(void) +{ + struct pts_fs_info *fsi; + + fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); + if (!fsi) + return NULL; + + ida_init(&fsi->allocated_ptys); + fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; + fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + + return fsi; +} + +static int +devpts_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *inode; + + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = DEVPTS_SUPER_MAGIC; + s->s_op = &devpts_sops; + s->s_time_gran = 1; + + s->s_fs_info = new_pts_fs_info(); + if (!s->s_fs_info) + goto fail; + + inode = new_inode(s); + if (!inode) + goto free_fsi; + inode->i_ino = 1; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_nlink = 2; + + s->s_root = d_alloc_root(inode); + if (s->s_root) + return 0; + + printk(KERN_ERR "devpts: get root dentry failed\n"); + iput(inode); + +free_fsi: + kfree(s->s_fs_info); +fail: + return -ENOMEM; +} + +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES +static int compare_init_pts_sb(struct super_block *s, void *p) +{ + if (devpts_mnt) + return devpts_mnt->mnt_sb == s; + return 0; +} + +/* + * devpts_mount() + * + * If the '-o newinstance' mount option was specified, mount a new + * (private) instance of devpts. PTYs created in this instance are + * independent of the PTYs in other devpts instances. + * + * If the '-o newinstance' option was not specified, mount/remount the + * initial kernel mount of devpts. This type of mount gives the + * legacy, single-instance semantics. + * + * The 'newinstance' option is needed to support multiple namespace + * semantics in devpts while preserving backward compatibility of the + * current 'single-namespace' semantics. i.e all mounts of devpts + * without the 'newinstance' mount option should bind to the initial + * kernel mount, like mount_single(). + * + * Mounts with 'newinstance' option create a new, private namespace. + * + * NOTE: + * + * For single-mount semantics, devpts cannot use mount_single(), + * because mount_single()/sget() find and use the super-block from + * the most recent mount of devpts. But that recent mount may be a + * 'newinstance' mount and mount_single() would pick the newinstance + * super-block instead of the initial super-block. 
+ */ +static struct dentry *devpts_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int error; + struct pts_mount_opts opts; + struct super_block *s; + + error = parse_mount_options(data, PARSE_MOUNT, &opts); + if (error) + return ERR_PTR(error); + + if (opts.newinstance) + s = sget(fs_type, NULL, set_anon_super, NULL); + else + s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); + + if (IS_ERR(s)) + return ERR_CAST(s); + + if (!s->s_root) { + s->s_flags = flags; + error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) + goto out_undo_sget; + s->s_flags |= MS_ACTIVE; + } + + memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); + + error = mknod_ptmx(s); + if (error) + goto out_undo_sget; + + return dget(s->s_root); + +out_undo_sget: + deactivate_locked_super(s); + return ERR_PTR(error); +} + +#else +/* + * This supports only the legacy single-instance semantics (no + * multiple-instance semantics) + */ +static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, devpts_fill_super); +} +#endif + +static void devpts_kill_sb(struct super_block *sb) +{ + struct pts_fs_info *fsi = DEVPTS_SB(sb); + + kfree(fsi); + kill_litter_super(sb); +} + +static struct file_system_type devpts_fs_type = { + .name = "devpts", + .mount = devpts_mount, + .kill_sb = devpts_kill_sb, +}; + +/* + * The normal naming convention is simply /dev/pts/; this conforms + * to the System V naming convention + */ + +int devpts_new_index(struct inode *ptmx_inode) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct pts_fs_info *fsi = DEVPTS_SB(sb); + int index; + int ida_ret; + +retry: + if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) + return -ENOMEM; + + mutex_lock(&allocated_ptys_lock); + ida_ret = ida_get_new(&fsi->allocated_ptys, &index); + if (ida_ret < 0) { + mutex_unlock(&allocated_ptys_lock); + if (ida_ret == -EAGAIN) + goto retry; + return -EIO; + } + + if (index >= pty_limit) { + ida_remove(&fsi->allocated_ptys, index); + mutex_unlock(&allocated_ptys_lock); + return -EIO; + } + mutex_unlock(&allocated_ptys_lock); + return index; +} + +void devpts_kill_index(struct inode *ptmx_inode, int idx) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct pts_fs_info *fsi = DEVPTS_SB(sb); + + mutex_lock(&allocated_ptys_lock); + ida_remove(&fsi->allocated_ptys, idx); + mutex_unlock(&allocated_ptys_lock); +} + +int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) +{ + /* tty layer puts index from devpts_new_index() in here */ + int number = tty->index; + struct tty_driver *driver = tty->driver; + dev_t device = MKDEV(driver->major, driver->minor_start+number); + struct dentry *dentry; + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct inode *inode = new_inode(sb); + struct dentry *root = sb->s_root; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + int ret = 0; + char s[12]; + + /* We're supposed to be given the slave end of a pty */ + BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY); + BUG_ON(driver->subtype != PTY_TYPE_SLAVE); + + if (!inode) + return -ENOMEM; + + inode->i_ino = number + 3; + inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); + inode->i_gid = opts->setgid ? 
opts->gid : current_fsgid(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + init_special_inode(inode, S_IFCHR|opts->mode, device); + inode->i_private = tty; + tty->driver_data = inode; + + sprintf(s, "%d", number); + + mutex_lock(&root->d_inode->i_mutex); + + dentry = d_alloc_name(root, s); + if (dentry) { + d_add(dentry, inode); + fsnotify_create(root->d_inode, dentry); + } else { + iput(inode); + ret = -ENOMEM; + } + + mutex_unlock(&root->d_inode->i_mutex); + + return ret; +} + +struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) +{ + struct dentry *dentry; + struct tty_struct *tty; + + BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + + /* Ensure dentry has not been deleted by devpts_pty_kill() */ + dentry = d_find_alias(pts_inode); + if (!dentry) + return NULL; + + tty = NULL; + if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) + tty = (struct tty_struct *)pts_inode->i_private; + + dput(dentry); + + return tty; +} + +void devpts_pty_kill(struct tty_struct *tty) +{ + struct inode *inode = tty->driver_data; + struct super_block *sb = pts_sb_from_inode(inode); + struct dentry *root = sb->s_root; + struct dentry *dentry; + + BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + + mutex_lock(&root->d_inode->i_mutex); + + dentry = d_find_alias(inode); + + inode->i_nlink--; + d_delete(dentry); + dput(dentry); /* d_alloc_name() in devpts_pty_new() */ + dput(dentry); /* d_find_alias above */ + + mutex_unlock(&root->d_inode->i_mutex); +} + +static int __init init_devpts_fs(void) +{ + int err = register_filesystem(&devpts_fs_type); + if (!err) { + devpts_mnt = kern_mount(&devpts_fs_type); + if (IS_ERR(devpts_mnt)) { + err = PTR_ERR(devpts_mnt); + unregister_filesystem(&devpts_fs_type); + } + } + return err; +} +module_init(init_devpts_fs) diff --git a/fs/direct-io.c b/fs/direct-io.c new file mode 100644 index 00000000..ac5f1641 --- /dev/null +++ b/fs/direct-io.c @@ -0,0 +1,1256 @@ +/* + * fs/direct-io.c + * + * Copyright (C) 2002, Linus Torvalds. + * + * O_DIRECT + * + * 04Jul2002 Andrew Morton + * Initial version + * 11Sep2002 janetinc@us.ibm.com + * added readv/writev support. + * 29Oct2002 Andrew Morton + * rewrote bio_add_page() support. + * 30Oct2002 pbadari@us.ibm.com + * added support for non-aligned IO. + * 06Nov2002 pbadari@us.ibm.com + * added asynchronous IO support. + * 21Jul2003 nathans@sgi.com + * added IO completion notifier. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * How many user pages to map in one call to get_user_pages(). This determines + * the size of a structure on the stack. + */ +#define DIO_PAGES 64 + +/* + * This code generally works in units of "dio_blocks". A dio_block is + * somewhere between the hard sector size and the filesystem block size. it + * is determined on a per-invocation basis. When talking to the filesystem + * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity + * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted + * to bio_block quantities by shifting left by blkfactor. + * + * If blkfactor is zero then the user's request was aligned to the filesystem's + * blocksize. 
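/*
 * Editorial worked example, not part of the original patch: with 512-byte
 * dio blocks (blkbits = 9) on a filesystem using 4096-byte blocks
 * (i_blkbits = 12), blkfactor is roughly i_blkbits - blkbits = 3, and the
 * conversions described above are plain shifts:
 *
 *	fs_block  = dio_block >> blkfactor;	// dio_blocks -> fs blocks
 *	dio_block = fs_block  << blkfactor;	// fs blocks  -> dio_blocks
 *
 * blkfactor == 0 means the request was already aligned to the filesystem
 * block size and no scaling is needed.
 */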
+ */ + +struct dio { + /* BIO submission state */ + struct bio *bio; /* bio under assembly */ + struct inode *inode; + int rw; + loff_t i_size; /* i_size when submitted */ + int flags; /* doesn't change */ + unsigned blkbits; /* doesn't change */ + unsigned blkfactor; /* When we're using an alignment which + is finer than the filesystem's soft + blocksize, this specifies how much + finer. blkfactor=2 means 1/4-block + alignment. Does not change */ + unsigned start_zero_done; /* flag: sub-blocksize zeroing has + been performed at the start of a + write */ + int pages_in_io; /* approximate total IO pages */ + size_t size; /* total request size (doesn't change)*/ + sector_t block_in_file; /* Current offset into the underlying + file in dio_block units. */ + unsigned blocks_available; /* At block_in_file. changes */ + sector_t final_block_in_request;/* doesn't change */ + unsigned first_block_in_page; /* doesn't change, Used only once */ + int boundary; /* prev block is at a boundary */ + int reap_counter; /* rate limit reaping */ + get_block_t *get_block; /* block mapping function */ + dio_iodone_t *end_io; /* IO completion function */ + dio_submit_t *submit_io; /* IO submition function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ + sector_t final_block_in_bio; /* current final block in bio + 1 */ + sector_t next_block_for_io; /* next block to be put under IO, + in dio_blocks units */ + struct buffer_head map_bh; /* last get_block() result */ + + /* + * Deferred addition of a page to the dio. These variables are + * private to dio_send_cur_page(), submit_page_section() and + * dio_bio_add_page(). + */ + struct page *cur_page; /* The page */ + unsigned cur_page_offset; /* Offset into it, in bytes */ + unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ + sector_t cur_page_block; /* Where it starts */ + loff_t cur_page_fs_offset; /* Offset in file */ + + /* BIO completion state */ + spinlock_t bio_lock; /* protects BIO fields below */ + unsigned long refcount; /* direct_io_worker() and bios */ + struct bio *bio_list; /* singly linked via bi_private */ + struct task_struct *waiter; /* waiting task (NULL if none) */ + + /* AIO related stuff */ + struct kiocb *iocb; /* kiocb */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ + ssize_t result; /* IO result */ + + /* + * Page fetching state. These variables belong to dio_refill_pages(). + */ + int curr_page; /* changes */ + int total_pages; /* doesn't change */ + unsigned long curr_user_address;/* changes */ + + /* + * Page queue. These variables belong to dio_refill_pages() and + * dio_get_page(). + */ + unsigned head; /* next page to process */ + unsigned tail; /* last valid page + 1 */ + int page_errors; /* errno from get_user_pages() */ + + /* + * pages[] (and any fields placed after it) are not zeroed out at + * allocation time. Don't add new fields after pages[] unless you + * wish that they not be zeroed. + */ + struct page *pages[DIO_PAGES]; /* page buffer */ +}; + +/* + * How many pages are in the queue? + */ +static inline unsigned dio_pages_present(struct dio *dio) +{ + return dio->tail - dio->head; +} + +/* + * Go grab and pin some userspace pages. Typically we'll get 64 at a time. + */ +static int dio_refill_pages(struct dio *dio) +{ + int ret; + int nr_pages; + + nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); + ret = get_user_pages_fast( + dio->curr_user_address, /* Where from? */ + nr_pages, /* How many pages? 
*/ + dio->rw == READ, /* Write to memory? */ + &dio->pages[0]); /* Put results here */ + + if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { + struct page *page = ZERO_PAGE(0); + /* + * A memory fault, but the filesystem has some outstanding + * mapped blocks. We need to use those blocks up to avoid + * leaking stale data in the file. + */ + if (dio->page_errors == 0) + dio->page_errors = ret; + page_cache_get(page); + dio->pages[0] = page; + dio->head = 0; + dio->tail = 1; + ret = 0; + goto out; + } + + if (ret >= 0) { + dio->curr_user_address += ret * PAGE_SIZE; + dio->curr_page += ret; + dio->head = 0; + dio->tail = ret; + ret = 0; + } +out: + return ret; +} + +/* + * Get another userspace page. Returns an ERR_PTR on error. Pages are + * buffered inside the dio so that we can call get_user_pages() against a + * decent number of pages, less frequently. To provide nicer use of the + * L1 cache. + */ +static struct page *dio_get_page(struct dio *dio) +{ + if (dio_pages_present(dio) == 0) { + int ret; + + ret = dio_refill_pages(dio); + if (ret) + return ERR_PTR(ret); + BUG_ON(dio_pages_present(dio) == 0); + } + return dio->pages[dio->head++]; +} + +/** + * dio_complete() - called when all DIO BIO I/O has been completed + * @offset: the byte offset in the file of the completed operation + * + * This releases locks as dictated by the locking type, lets interested parties + * know that a DIO operation has completed, and calculates the resulting return + * code for the operation. + * + * It lets the filesystem know if it registered an interest earlier via + * get_block. Pass the private field of the map buffer_head so that + * filesystems can use it to hold additional state between get_block calls and + * dio_complete. + */ +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) +{ + ssize_t transferred = 0; + + /* + * AIO submission can race with bio completion to get here while + * expecting to have the last io completed by bio completion. + * In that case -EIOCBQUEUED is in fact not an error we want + * to preserve through this call. + */ + if (ret == -EIOCBQUEUED) + ret = 0; + + if (dio->result) { + transferred = dio->result; + + /* Check for short read case */ + if ((dio->rw == READ) && ((offset + transferred) > dio->i_size)) + transferred = dio->i_size - offset; + } + + if (ret == 0) + ret = dio->page_errors; + if (ret == 0) + ret = dio->io_error; + if (ret == 0) + ret = transferred; + + if (dio->end_io && dio->result) { + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->flags & DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + + return ret; +} + +static int dio_bio_complete(struct dio *dio, struct bio *bio); +/* + * Asynchronous IO callback. + */ +static void dio_bio_end_aio(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + unsigned long remaining; + unsigned long flags; + + /* cleanup the bio */ + dio_bio_complete(dio, bio); + + spin_lock_irqsave(&dio->bio_lock, flags); + remaining = --dio->refcount; + if (remaining == 1 && dio->waiter) + wake_up_process(dio->waiter); + spin_unlock_irqrestore(&dio->bio_lock, flags); + + if (remaining == 0) { + dio_complete(dio, dio->iocb->ki_pos, 0, true); + kfree(dio); + } +} + +/* + * The BIO completion handler simply queues the BIO up for the process-context + * handler. + * + * During I/O bi_private points at the dio. 
After I/O, bi_private is used to + * implement a singly-linked list of completed BIOs, at dio->bio_list. + */ +static void dio_bio_end_io(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + unsigned long flags; + + spin_lock_irqsave(&dio->bio_lock, flags); + bio->bi_private = dio->bio_list; + dio->bio_list = bio; + if (--dio->refcount == 1 && dio->waiter) + wake_up_process(dio->waiter); + spin_unlock_irqrestore(&dio->bio_lock, flags); +} + +/** + * dio_end_io - handle the end io action for the given bio + * @bio: The direct io bio thats being completed + * @error: Error if there was one + * + * This is meant to be called by any filesystem that uses their own dio_submit_t + * so that the DIO specific endio actions are dealt with after the filesystem + * has done it's completion work. + */ +void dio_end_io(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + + if (dio->is_async) + dio_bio_end_aio(bio, error); + else + dio_bio_end_io(bio, error); +} +EXPORT_SYMBOL_GPL(dio_end_io); + +static void +dio_bio_alloc(struct dio *dio, struct block_device *bdev, + sector_t first_sector, int nr_vecs) +{ + struct bio *bio; + + /* + * bio_alloc() is guaranteed to return a bio when called with + * __GFP_WAIT and we request a valid number of vectors. + */ + bio = bio_alloc(GFP_KERNEL, nr_vecs); + + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + if (dio->is_async) + bio->bi_end_io = dio_bio_end_aio; + else + bio->bi_end_io = dio_bio_end_io; + + dio->bio = bio; + dio->logical_offset_in_bio = dio->cur_page_fs_offset; +} + +/* + * In the AIO read case we speculatively dirty the pages before starting IO. + * During IO completion, any of these pages which happen to have been written + * back will be redirtied by bio_check_pages_dirty(). + * + * bios hold a dio reference between submit_bio and ->end_io. + */ +static void dio_bio_submit(struct dio *dio) +{ + struct bio *bio = dio->bio; + unsigned long flags; + + bio->bi_private = dio; + + spin_lock_irqsave(&dio->bio_lock, flags); + dio->refcount++; + spin_unlock_irqrestore(&dio->bio_lock, flags); + + if (dio->is_async && dio->rw == READ) + bio_set_pages_dirty(bio); + + if (dio->submit_io) + dio->submit_io(dio->rw, bio, dio->inode, + dio->logical_offset_in_bio); + else + submit_bio(dio->rw, bio); + + dio->bio = NULL; + dio->boundary = 0; + dio->logical_offset_in_bio = 0; +} + +/* + * Release any resources in case of a failure + */ +static void dio_cleanup(struct dio *dio) +{ + while (dio_pages_present(dio)) + page_cache_release(dio_get_page(dio)); +} + +/* + * Wait for the next BIO to complete. Remove it and return it. NULL is + * returned once all BIOs have been completed. This must only be called once + * all bios have been issued so that dio->refcount can only decrease. This + * requires that that the caller hold a reference on the dio. + */ +static struct bio *dio_await_one(struct dio *dio) +{ + unsigned long flags; + struct bio *bio = NULL; + + spin_lock_irqsave(&dio->bio_lock, flags); + + /* + * Wait as long as the list is empty and there are bios in flight. bio + * completion drops the count, maybe adds to the list, and wakes while + * holding the bio_lock so we don't need set_current_state()'s barrier + * and can call it after testing our condition. 
+ */ + while (dio->refcount > 1 && dio->bio_list == NULL) { + __set_current_state(TASK_UNINTERRUPTIBLE); + dio->waiter = current; + spin_unlock_irqrestore(&dio->bio_lock, flags); + io_schedule(); + /* wake up sets us TASK_RUNNING */ + spin_lock_irqsave(&dio->bio_lock, flags); + dio->waiter = NULL; + } + if (dio->bio_list) { + bio = dio->bio_list; + dio->bio_list = bio->bi_private; + } + spin_unlock_irqrestore(&dio->bio_lock, flags); + return bio; +} + +/* + * Process one completed BIO. No locks are held. + */ +static int dio_bio_complete(struct dio *dio, struct bio *bio) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec; + int page_no; + + if (!uptodate) + dio->io_error = -EIO; + + if (dio->is_async && dio->rw == READ) { + bio_check_pages_dirty(bio); /* transfers ownership */ + } else { + for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { + struct page *page = bvec[page_no].bv_page; + + if (dio->rw == READ && !PageCompound(page)) + set_page_dirty_lock(page); + page_cache_release(page); + } + bio_put(bio); + } + return uptodate ? 0 : -EIO; +} + +/* + * Wait on and process all in-flight BIOs. This must only be called once + * all bios have been issued so that the refcount can only decrease. + * This just waits for all bios to make it through dio_bio_complete. IO + * errors are propagated through dio->io_error and should be propagated via + * dio_complete(). + */ +static void dio_await_completion(struct dio *dio) +{ + struct bio *bio; + do { + bio = dio_await_one(dio); + if (bio) + dio_bio_complete(dio, bio); + } while (bio); +} + +/* + * A really large O_DIRECT read or write can generate a lot of BIOs. So + * to keep the memory consumption sane we periodically reap any completed BIOs + * during the BIO generation phase. + * + * This also helps to limit the peak amount of pinned userspace memory. + */ +static int dio_bio_reap(struct dio *dio) +{ + int ret = 0; + + if (dio->reap_counter++ >= 64) { + while (dio->bio_list) { + unsigned long flags; + struct bio *bio; + int ret2; + + spin_lock_irqsave(&dio->bio_lock, flags); + bio = dio->bio_list; + dio->bio_list = bio->bi_private; + spin_unlock_irqrestore(&dio->bio_lock, flags); + ret2 = dio_bio_complete(dio, bio); + if (ret == 0) + ret = ret2; + } + dio->reap_counter = 0; + } + return ret; +} + +/* + * Call into the fs to map some more disk blocks. We record the current number + * of available blocks at dio->blocks_available. These are in units of the + * fs blocksize, (1 << inode->i_blkbits). + * + * The fs is allowed to map lots of blocks at once. If it wants to do that, + * it uses the passed inode-relative block number as the file offset, as usual. + * + * get_block() is passed the number of i_blkbits-sized blocks which direct_io + * has remaining to do. The fs should not map more than this number of blocks. + * + * If the fs has mapped a lot of blocks, it should populate bh->b_size to + * indicate how much contiguous disk space has been made available at + * bh->b_blocknr. + * + * If *any* of the mapped blocks are new, then the fs must set buffer_new(). + * This isn't very efficient... + * + * In the case of filesystem holes: the fs may return an arbitrarily-large + * hole by returning an appropriate value in b_size and by clearing + * buffer_mapped(). However the direct-io code will only process holes one + * block at a time - it will repeatedly call get_block() as it walks the hole. 
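/*
 * Editorial worked example, not part of the original patch: if a request
 * still has 20 dio_blocks left to map and blkfactor = 3 (512-byte dio
 * blocks on a 4k filesystem), the filesystem is asked for
 *
 *	fs_count = 20 >> 3 = 2, rounded up to 3 because 20 is not a
 *	multiple of 8 (the partial fs block at the end still needs mapping)
 *
 * and it reports how much contiguous space it actually mapped back
 * through map_bh->b_size, as described above.
 */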
+ */ +static int get_more_blocks(struct dio *dio) +{ + int ret; + struct buffer_head *map_bh = &dio->map_bh; + sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + unsigned long fs_count; /* Number of filesystem-sized blocks */ + unsigned long dio_count;/* Number of dio_block-sized blocks */ + unsigned long blkmask; + int create; + + /* + * If there was a memory error and we've overwritten all the + * mapped blocks then we can now return that memory error + */ + ret = dio->page_errors; + if (ret == 0) { + BUG_ON(dio->block_in_file >= dio->final_block_in_request); + fs_startblk = dio->block_in_file >> dio->blkfactor; + dio_count = dio->final_block_in_request - dio->block_in_file; + fs_count = dio_count >> dio->blkfactor; + blkmask = (1 << dio->blkfactor) - 1; + if (dio_count & blkmask) + fs_count++; + + map_bh->b_state = 0; + map_bh->b_size = fs_count << dio->inode->i_blkbits; + + /* + * For writes inside i_size on a DIO_SKIP_HOLES filesystem we + * forbid block creations: only overwrites are permitted. + * We will return early to the caller once we see an + * unmapped buffer head returned, and the caller will fall + * back to buffered I/O. + * + * Otherwise the decision is left to the get_blocks method, + * which may decide to handle it or also return an unmapped + * buffer head. + */ + create = dio->rw & WRITE; + if (dio->flags & DIO_SKIP_HOLES) { + if (dio->block_in_file < (i_size_read(dio->inode) >> + dio->blkbits)) + create = 0; + } + + ret = (*dio->get_block)(dio->inode, fs_startblk, + map_bh, create); + } + return ret; +} + +/* + * There is no bio. Make one now. + */ +static int dio_new_bio(struct dio *dio, sector_t start_sector) +{ + sector_t sector; + int ret, nr_pages; + + ret = dio_bio_reap(dio); + if (ret) + goto out; + sector = start_sector << (dio->blkbits - 9); + nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + nr_pages = min(nr_pages, BIO_MAX_PAGES); + BUG_ON(nr_pages <= 0); + dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); + dio->boundary = 0; +out: + return ret; +} + +/* + * Attempt to put the current chunk of 'cur_page' into the current BIO. If + * that was successful then update final_block_in_bio and take a ref against + * the just-added page. + * + * Return zero on success. Non-zero means the caller needs to start a new BIO. + */ +static int dio_bio_add_page(struct dio *dio) +{ + int ret; + + ret = bio_add_page(dio->bio, dio->cur_page, + dio->cur_page_len, dio->cur_page_offset); + if (ret == dio->cur_page_len) { + /* + * Decrement count only, if we are done with this page + */ + if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) + dio->pages_in_io--; + page_cache_get(dio->cur_page); + dio->final_block_in_bio = dio->cur_page_block + + (dio->cur_page_len >> dio->blkbits); + ret = 0; + } else { + ret = 1; + } + return ret; +} + +/* + * Put cur_page under IO. The section of cur_page which is described by + * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page + * starts on-disk at cur_page_block. + * + * We take a ref against the page here (on behalf of its presence in the bio). + * + * The caller of this function is responsible for removing cur_page from the + * dio, and for dropping the refcount which came from that presence. 
+ */ +static int dio_send_cur_page(struct dio *dio) +{ + int ret = 0; + + if (dio->bio) { + loff_t cur_offset = dio->cur_page_fs_offset; + loff_t bio_next_offset = dio->logical_offset_in_bio + + dio->bio->bi_size; + + /* + * See whether this new request is contiguous with the old. + * + * Btrfs cannot handle having logically non-contiguous requests + * submitted. For example if you have + * + * Logical: [0-4095][HOLE][8192-12287] + * Physical: [0-4095] [4096-8191] + * + * We cannot submit those pages together as one BIO. So if our + * current logical offset in the file does not equal what would + * be the next logical offset in the bio, submit the bio we + * have. + */ + if (dio->final_block_in_bio != dio->cur_page_block || + cur_offset != bio_next_offset) + dio_bio_submit(dio); + /* + * Submit now if the underlying fs is about to perform a + * metadata read + */ + else if (dio->boundary) + dio_bio_submit(dio); + } + + if (dio->bio == NULL) { + ret = dio_new_bio(dio, dio->cur_page_block); + if (ret) + goto out; + } + + if (dio_bio_add_page(dio) != 0) { + dio_bio_submit(dio); + ret = dio_new_bio(dio, dio->cur_page_block); + if (ret == 0) { + ret = dio_bio_add_page(dio); + BUG_ON(ret != 0); + } + } +out: + return ret; +} + +/* + * An autonomous function to put a chunk of a page under deferred IO. + * + * The caller doesn't actually know (or care) whether this piece of page is in + * a BIO, or is under IO or whatever. We just take care of all possible + * situations here. The separation between the logic of do_direct_IO() and + * that of submit_page_section() is important for clarity. Please don't break. + * + * The chunk of page starts on-disk at blocknr. + * + * We perform deferred IO, by recording the last-submitted page inside our + * private part of the dio structure. If possible, we just expand the IO + * across that page here. + * + * If that doesn't work out then we put the old page into the bio and add this + * page to the dio instead. + */ +static int +submit_page_section(struct dio *dio, struct page *page, + unsigned offset, unsigned len, sector_t blocknr) +{ + int ret = 0; + + if (dio->rw & WRITE) { + /* + * Read accounting is performed in submit_bio() + */ + task_io_account_write(len); + } + + /* + * Can we just grow the current page's presence in the dio? + */ + if ( (dio->cur_page == page) && + (dio->cur_page_offset + dio->cur_page_len == offset) && + (dio->cur_page_block + + (dio->cur_page_len >> dio->blkbits) == blocknr)) { + dio->cur_page_len += len; + + /* + * If dio->boundary then we want to schedule the IO now to + * avoid metadata seeks. + */ + if (dio->boundary) { + ret = dio_send_cur_page(dio); + page_cache_release(dio->cur_page); + dio->cur_page = NULL; + } + goto out; + } + + /* + * If there's a deferred page already there then send it. + */ + if (dio->cur_page) { + ret = dio_send_cur_page(dio); + page_cache_release(dio->cur_page); + dio->cur_page = NULL; + if (ret) + goto out; + } + + page_cache_get(page); /* It is in dio */ + dio->cur_page = page; + dio->cur_page_offset = offset; + dio->cur_page_len = len; + dio->cur_page_block = blocknr; + dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; +out: + return ret; +} + +/* + * Clean any dirty buffers in the blockdev mapping which alias newly-created + * file blocks. 
Only called for S_ISREG files - blockdevs do not set + * buffer_new + */ +static void clean_blockdev_aliases(struct dio *dio) +{ + unsigned i; + unsigned nblocks; + + nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; + + for (i = 0; i < nblocks; i++) { + unmap_underlying_metadata(dio->map_bh.b_bdev, + dio->map_bh.b_blocknr + i); + } +} + +/* + * If we are not writing the entire block and get_block() allocated + * the block for us, we need to fill-in the unused portion of the + * block with zeros. This happens only if user-buffer, fileoffset or + * io length is not filesystem block-size multiple. + * + * `end' is zero if we're doing the start of the IO, 1 at the end of the + * IO. + */ +static void dio_zero_block(struct dio *dio, int end) +{ + unsigned dio_blocks_per_fs_block; + unsigned this_chunk_blocks; /* In dio_blocks */ + unsigned this_chunk_bytes; + struct page *page; + + dio->start_zero_done = 1; + if (!dio->blkfactor || !buffer_new(&dio->map_bh)) + return; + + dio_blocks_per_fs_block = 1 << dio->blkfactor; + this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); + + if (!this_chunk_blocks) + return; + + /* + * We need to zero out part of an fs block. It is either at the + * beginning or the end of the fs block. + */ + if (end) + this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; + + this_chunk_bytes = this_chunk_blocks << dio->blkbits; + + page = ZERO_PAGE(0); + if (submit_page_section(dio, page, 0, this_chunk_bytes, + dio->next_block_for_io)) + return; + + dio->next_block_for_io += this_chunk_blocks; +} + +/* + * Walk the user pages, and the file, mapping blocks to disk and generating + * a sequence of (page,offset,len,block) mappings. These mappings are injected + * into submit_page_section(), which takes care of the next stage of submission + * + * Direct IO against a blockdev is different from a file. Because we can + * happily perform page-sized but 512-byte aligned IOs. It is important that + * blockdev IO be able to have fine alignment and large sizes. + * + * So what we do is to permit the ->get_block function to populate bh.b_size + * with the size of IO which is permitted at this offset and this i_blkbits. + * + * For best results, the blockdev should be set up with 512-byte i_blkbits and + * it should set b_size to PAGE_SIZE or more inside get_block(). This gives + * fine alignment but still allows this function to work in PAGE_SIZE units. 
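 *
 * Editor's note (illustrative sketch, not part of the original patch): for
 * the blockdev case described above, get_block() only has to report that the
 * range up to the end of the device is mapped 1:1, capping b_size at what
 * was asked for. example_blkdev_get_block() is an invented name, not the
 * helper this patch actually uses for block devices; a request entirely past
 * the end of the device is left unmapped for reads and fails with -EIO for
 * writes.
 *
 *	static int example_blkdev_get_block(struct inode *inode, sector_t iblock,
 *					    struct buffer_head *bh, int create)
 *	{
 *		sector_t end_block = i_size_read(inode) >> inode->i_blkbits;
 *		unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
 *
 *		if (iblock >= end_block)
 *			return create ? -EIO : 0;
 *		if (max_blocks > end_block - iblock)
 *			max_blocks = end_block - iblock;
 *
 *		bh->b_bdev = I_BDEV(inode);
 *		bh->b_blocknr = iblock;
 *		bh->b_size = max_blocks << inode->i_blkbits;
 *		set_buffer_mapped(bh);
 *		return 0;
 *	}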
+ */ +static int do_direct_IO(struct dio *dio) +{ + const unsigned blkbits = dio->blkbits; + const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + struct page *page; + unsigned block_in_page; + struct buffer_head *map_bh = &dio->map_bh; + int ret = 0; + + /* The I/O can start at any block offset within the first page */ + block_in_page = dio->first_block_in_page; + + while (dio->block_in_file < dio->final_block_in_request) { + page = dio_get_page(dio); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + + while (block_in_page < blocks_per_page) { + unsigned offset_in_page = block_in_page << blkbits; + unsigned this_chunk_bytes; /* # of bytes mapped */ + unsigned this_chunk_blocks; /* # of blocks */ + unsigned u; + + if (dio->blocks_available == 0) { + /* + * Need to go and map some more disk + */ + unsigned long blkmask; + unsigned long dio_remainder; + + ret = get_more_blocks(dio); + if (ret) { + page_cache_release(page); + goto out; + } + if (!buffer_mapped(map_bh)) + goto do_holes; + + dio->blocks_available = + map_bh->b_size >> dio->blkbits; + dio->next_block_for_io = + map_bh->b_blocknr << dio->blkfactor; + if (buffer_new(map_bh)) + clean_blockdev_aliases(dio); + + if (!dio->blkfactor) + goto do_holes; + + blkmask = (1 << dio->blkfactor) - 1; + dio_remainder = (dio->block_in_file & blkmask); + + /* + * If we are at the start of IO and that IO + * starts partway into a fs-block, + * dio_remainder will be non-zero. If the IO + * is a read then we can simply advance the IO + * cursor to the first block which is to be + * read. But if the IO is a write and the + * block was newly allocated we cannot do that; + * the start of the fs block must be zeroed out + * on-disk + */ + if (!buffer_new(map_bh)) + dio->next_block_for_io += dio_remainder; + dio->blocks_available -= dio_remainder; + } +do_holes: + /* Handle holes */ + if (!buffer_mapped(map_bh)) { + loff_t i_size_aligned; + + /* AKPM: eargh, -ENOTBLK is a hack */ + if (dio->rw & WRITE) { + page_cache_release(page); + return -ENOTBLK; + } + + /* + * Be sure to account for a partial block as the + * last block in the file + */ + i_size_aligned = ALIGN(i_size_read(dio->inode), + 1 << blkbits); + if (dio->block_in_file >= + i_size_aligned >> blkbits) { + /* We hit eof */ + page_cache_release(page); + goto out; + } + zero_user(page, block_in_page << blkbits, + 1 << blkbits); + dio->block_in_file++; + block_in_page++; + goto next_block; + } + + /* + * If we're performing IO which has an alignment which + * is finer than the underlying fs, go check to see if + * we must zero out the start of this block. 
+ */ + if (unlikely(dio->blkfactor && !dio->start_zero_done)) + dio_zero_block(dio, 0); + + /* + * Work out, in this_chunk_blocks, how much disk we + * can add to this page + */ + this_chunk_blocks = dio->blocks_available; + u = (PAGE_SIZE - offset_in_page) >> blkbits; + if (this_chunk_blocks > u) + this_chunk_blocks = u; + u = dio->final_block_in_request - dio->block_in_file; + if (this_chunk_blocks > u) + this_chunk_blocks = u; + this_chunk_bytes = this_chunk_blocks << blkbits; + BUG_ON(this_chunk_bytes == 0); + + dio->boundary = buffer_boundary(map_bh); + ret = submit_page_section(dio, page, offset_in_page, + this_chunk_bytes, dio->next_block_for_io); + if (ret) { + page_cache_release(page); + goto out; + } + dio->next_block_for_io += this_chunk_blocks; + + dio->block_in_file += this_chunk_blocks; + block_in_page += this_chunk_blocks; + dio->blocks_available -= this_chunk_blocks; +next_block: + BUG_ON(dio->block_in_file > dio->final_block_in_request); + if (dio->block_in_file == dio->final_block_in_request) + break; + } + + /* Drop the ref which was taken in get_user_pages() */ + page_cache_release(page); + block_in_page = 0; + } +out: + return ret; +} + +/* + * Releases both i_mutex and i_alloc_sem + */ +static ssize_t +direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, + unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, struct dio *dio) +{ + unsigned long user_addr; + unsigned long flags; + int seg; + ssize_t ret = 0; + ssize_t ret2; + size_t bytes; + + dio->inode = inode; + dio->rw = rw; + dio->blkbits = blkbits; + dio->blkfactor = inode->i_blkbits - blkbits; + dio->block_in_file = offset >> blkbits; + + dio->get_block = get_block; + dio->end_io = end_io; + dio->submit_io = submit_io; + dio->final_block_in_bio = -1; + dio->next_block_for_io = -1; + + dio->iocb = iocb; + dio->i_size = i_size_read(inode); + + spin_lock_init(&dio->bio_lock); + dio->refcount = 1; + + /* + * In case of non-aligned buffers, we may need 2 more + * pages since we need to zero out first and last block. + */ + if (unlikely(dio->blkfactor)) + dio->pages_in_io = 2; + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + dio->pages_in_io += + ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE + - user_addr/PAGE_SIZE); + } + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + dio->size += bytes = iov[seg].iov_len; + + /* Index into the first page of the first block */ + dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + dio->final_block_in_request = dio->block_in_file + + (bytes >> blkbits); + /* Page fetching state */ + dio->head = 0; + dio->tail = 0; + dio->curr_page = 0; + + dio->total_pages = 0; + if (user_addr & (PAGE_SIZE-1)) { + dio->total_pages++; + bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + } + dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + dio->curr_user_address = user_addr; + + ret = do_direct_IO(dio); + + dio->result += iov[seg].iov_len - + ((dio->final_block_in_request - dio->block_in_file) << + blkbits); + + if (ret) { + dio_cleanup(dio); + break; + } + } /* end iovec loop */ + + if (ret == -ENOTBLK) { + /* + * The remaining part of the request will be + * be handled by buffered I/O when we return + */ + ret = 0; + } + /* + * There may be some unwritten disk at the end of a part-written + * fs-block-sized block. Go zero that now. 
+ */ + dio_zero_block(dio, 1); + + if (dio->cur_page) { + ret2 = dio_send_cur_page(dio); + if (ret == 0) + ret = ret2; + page_cache_release(dio->cur_page); + dio->cur_page = NULL; + } + if (dio->bio) + dio_bio_submit(dio); + + /* + * It is possible that we return short IO due to end of file. + * In that case, we need to release all the pages we got hold on. + */ + dio_cleanup(dio); + + /* + * All block lookups have been performed. For READ requests + * we can let i_mutex go now that it has achieved its purpose + * of protecting us from looking up uninitialized blocks. + */ + if (rw == READ && (dio->flags & DIO_LOCKING)) + mutex_unlock(&dio->inode->i_mutex); + + /* + * The only time we want to leave bios in flight is when a successful + * partial aio read or full aio write has been set up. In that case + * bio completion will call aio_complete. The only time it's safe to + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. + * This had *better* be the only place that raises -EIOCBQUEUED. + */ + BUG_ON(ret == -EIOCBQUEUED); + if (dio->is_async && ret == 0 && dio->result && + ((rw & READ) || (dio->result == dio->size))) + ret = -EIOCBQUEUED; + + if (ret != -EIOCBQUEUED) + dio_await_completion(dio); + + /* + * Sync will always be dropping the final ref and completing the + * operation. AIO can do so if it was a broken operation as described + * above, or in fact if all the bios race to complete before we get + * here. In that case dio_complete() translates the EIOCBQUEUED into + * the proper return code that the caller will hand to aio_complete(). + * + * This is managed by the bio_lock instead of being an atomic_t so that + * completion paths can drop their ref and use the remaining count to + * decide to wake the submission path atomically. + */ + spin_lock_irqsave(&dio->bio_lock, flags); + ret2 = --dio->refcount; + spin_unlock_irqrestore(&dio->bio_lock, flags); + + if (ret2 == 0) { + ret = dio_complete(dio, offset, ret, false); + kfree(dio); + } else + BUG_ON(ret != -EIOCBQUEUED); + + return ret; +} + +/* + * This is a library function for use by filesystem drivers. + * + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. + * For writes this function is called under i_mutex and returns with + * i_mutex held; for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. + * For reads and writes i_alloc_sem is taken in shared mode and released + * on I/O completion (which may happen asynchronously after returning to + * the caller). + * + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * For reads and writes both i_mutex and i_alloc_sem are not held on + * entry and are never taken.
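 *
 * Editor's note (illustrative sketch, not part of the original patch): a
 * filesystem's ->direct_IO address_space operation typically reaches this
 * function through a one-line wrapper along the lines of the hypothetical
 * example below; example_direct_IO() and example_get_block() are invented
 * names, and DIO_LOCKING | DIO_SKIP_HOLES selects the "dumb filesystem"
 * locking scheme described above (compare the blockdev_direct_IO() helper
 * in linux/fs.h, which wraps this call in much the same way).
 *
 *	static ssize_t example_direct_IO(int rw, struct kiocb *iocb,
 *					 const struct iovec *iov, loff_t offset,
 *					 unsigned long nr_segs)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_mapping->host;
 *
 *		return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
 *					    iov, offset, nr_segs,
 *					    example_get_block, NULL, NULL,
 *					    DIO_LOCKING | DIO_SKIP_HOLES);
 *	}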
+ */ +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + int seg; + size_t size; + unsigned long addr; + unsigned blkbits = inode->i_blkbits; + unsigned bdev_blkbits = 0; + unsigned blocksize_mask = (1 << blkbits) - 1; + ssize_t retval = -EINVAL; + loff_t end = offset; + struct dio *dio; + + if (rw & WRITE) + rw = WRITE_ODIRECT; + + if (bdev) + bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + + if (offset & blocksize_mask) { + if (bdev) + blkbits = bdev_blkbits; + blocksize_mask = (1 << blkbits) - 1; + if (offset & blocksize_mask) + goto out; + } + + /* Check the memory alignment. Blocks cannot straddle pages */ + for (seg = 0; seg < nr_segs; seg++) { + addr = (unsigned long)iov[seg].iov_base; + size = iov[seg].iov_len; + end += size; + if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (bdev) + blkbits = bdev_blkbits; + blocksize_mask = (1 << blkbits) - 1; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + goto out; + } + } + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + retval = -ENOMEM; + if (!dio) + goto out; + /* + * Believe it or not, zeroing out the page array caused a .5% + * performance regression in a database benchmark. So, we take + * care to only zero out what's needed. + */ + memset(dio, 0, offsetof(struct dio, pages)); + + dio->flags = flags; + if (dio->flags & DIO_LOCKING) { + /* watch out for a 0 len io from a tricksy fs */ + if (rw == READ && end > offset) { + struct address_space *mapping = + iocb->ki_filp->f_mapping; + + /* will be released by direct_io_worker */ + mutex_lock(&inode->i_mutex); + + retval = filemap_write_and_wait_range(mapping, offset, + end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + kfree(dio); + goto out; + } + } + + /* + * Will be released at I/O completion, possibly in a + * different thread. + */ + down_read_non_owner(&inode->i_alloc_sem); + } + + /* + * For file extending writes updating i_size before data + * writeouts complete can expose uninitialized blocks. So + * even for AIO, we need to wait for i/o to complete before + * returning in this case. + */ + dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && + (end > i_size_read(inode))); + + retval = direct_io_worker(rw, iocb, inode, iov, offset, + nr_segs, blkbits, get_block, end_io, + submit_io, dio); + +out: + return retval; +} +EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig new file mode 100644 index 00000000..1897eb1b --- /dev/null +++ b/fs/dlm/Kconfig @@ -0,0 +1,16 @@ +menuconfig DLM + tristate "Distributed Lock Manager (DLM)" + depends on EXPERIMENTAL && INET + depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) + select IP_SCTP + help + A general purpose distributed lock manager for kernel or userspace + applications. + +config DLM_DEBUG + bool "DLM debugging" + depends on DLM + help + Under the debugfs mount point, the name of each lockspace will + appear as a file in the "dlm" directory. The output is the + list of resource and locks the local node knows about. 
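Editor's note (illustration only, not part of the original patch): for the in-kernel use mentioned in the DLM help text above, a client creates a lockspace and requests locks roughly as sketched below. The dlm_new_lockspace() and dlm_lock() prototypes are assumed to be those declared in include/linux/dlm.h of this vintage and should be checked against that header; example_take_lock(), example_ast() and the "demo" names are invented for the example.

	#include <linux/dlm.h>

	static struct dlm_lksb example_lksb;

	/* completion AST: runs from the DLM callback context when the request finishes */
	static void example_ast(void *astarg)
	{
		pr_info("dlm demo: lock status %d\n", example_lksb.sb_status);
	}

	static int example_take_lock(void)
	{
		dlm_lockspace_t *ls;
		int error;

		/* create (or join) a lockspace named "demo" with a 64-byte LVB */
		error = dlm_new_lockspace("demo", strlen("demo"), &ls, 0, 64);
		if (error)
			return error;

		/* request an exclusive lock on the resource "demo-resource" */
		return dlm_lock(ls, DLM_LOCK_EX, &example_lksb, 0,
				"demo-resource", strlen("demo-resource"), 0,
				example_ast, NULL, NULL);
	}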
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile new file mode 100644 index 00000000..ca1c9124 --- /dev/null +++ b/fs/dlm/Makefile @@ -0,0 +1,21 @@ +obj-$(CONFIG_DLM) += dlm.o +dlm-y := ast.o \ + config.o \ + dir.o \ + lock.o \ + lockspace.o \ + main.o \ + member.o \ + memory.o \ + midcomms.o \ + netlink.o \ + lowcomms.o \ + plock.o \ + rcom.o \ + recover.o \ + recoverd.o \ + requestqueue.o \ + user.o \ + util.o +dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o + diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c new file mode 100644 index 00000000..abc49f29 --- /dev/null +++ b/fs/dlm/ast.c @@ -0,0 +1,346 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lock.h" +#include "user.h" +#include "ast.h" + +#define WAKE_ASTS 0 + +static uint64_t ast_seq_count; +static struct list_head ast_queue; +static spinlock_t ast_queue_lock; +static struct task_struct * astd_task; +static unsigned long astd_wakeflags; +static struct mutex astd_running; + + +static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) +{ + int i; + + log_print("last_bast %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_last_bast.seq, + lkb->lkb_last_bast.flags, + lkb->lkb_last_bast.mode, + lkb->lkb_last_bast.sb_status, + lkb->lkb_last_bast.sb_flags); + + log_print("last_cast %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_last_cast.seq, + lkb->lkb_last_cast.flags, + lkb->lkb_last_cast.mode, + lkb->lkb_last_cast.sb_status, + lkb->lkb_last_cast.sb_flags); + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + log_print("cb %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, + (unsigned long long)lkb->lkb_callbacks[i].seq, + lkb->lkb_callbacks[i].flags, + lkb->lkb_callbacks[i].mode, + lkb->lkb_callbacks[i].sb_status, + lkb->lkb_callbacks[i].sb_flags); + } +} + +void dlm_del_ast(struct dlm_lkb *lkb) +{ + spin_lock(&ast_queue_lock); + if (!list_empty(&lkb->lkb_astqueue)) + list_del_init(&lkb->lkb_astqueue); + spin_unlock(&ast_queue_lock); +} + +int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + uint64_t prev_seq; + int prev_mode; + int i; + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (lkb->lkb_callbacks[i].seq) + continue; + + /* + * Suppress some redundant basts here, do more on removal. + * Don't even add a bast if the callback just before it + * is a bast for the same mode or a more restrictive mode. 
+ * (the addional > PR check is needed for PR/CW inversion) + */ + + if ((i > 0) && (flags & DLM_CB_BAST) && + (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) { + + prev_seq = lkb->lkb_callbacks[i-1].seq; + prev_mode = lkb->lkb_callbacks[i-1].mode; + + if ((prev_mode == mode) || + (prev_mode > mode && prev_mode > DLM_LOCK_PR)) { + + log_debug(ls, "skip %x add bast %llu mode %d " + "for bast %llu mode %d", + lkb->lkb_id, + (unsigned long long)seq, + mode, + (unsigned long long)prev_seq, + prev_mode); + return 0; + } + } + + lkb->lkb_callbacks[i].seq = seq; + lkb->lkb_callbacks[i].flags = flags; + lkb->lkb_callbacks[i].mode = mode; + lkb->lkb_callbacks[i].sb_status = status; + lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); + break; + } + + if (i == DLM_CALLBACKS_SIZE) { + log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x", + lkb->lkb_id, (unsigned long long)seq, + flags, mode, status, sbflags); + dlm_dump_lkb_callbacks(lkb); + return -1; + } + + return 0; +} + +int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_callback *cb, int *resid) +{ + int i; + + *resid = 0; + + if (!lkb->lkb_callbacks[0].seq) + return -ENOENT; + + /* oldest undelivered cb is callbacks[0] */ + + memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback)); + memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback)); + + /* shift others down */ + + for (i = 1; i < DLM_CALLBACKS_SIZE; i++) { + if (!lkb->lkb_callbacks[i].seq) + break; + memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i], + sizeof(struct dlm_callback)); + memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback)); + (*resid)++; + } + + /* if cb is a bast, it should be skipped if the blocking mode is + compatible with the last granted mode */ + + if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) { + if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) { + cb->flags |= DLM_CB_SKIP; + + log_debug(ls, "skip %x bast %llu mode %d " + "for cast %llu mode %d", + lkb->lkb_id, + (unsigned long long)cb->seq, + cb->mode, + (unsigned long long)lkb->lkb_last_cast.seq, + lkb->lkb_last_cast.mode); + return 0; + } + } + + if (cb->flags & DLM_CB_CAST) { + memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); + lkb->lkb_last_cast_time = ktime_get(); + } + + if (cb->flags & DLM_CB_BAST) { + memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); + lkb->lkb_last_bast_time = ktime_get(); + } + + return 0; +} + +void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags) +{ + uint64_t seq; + int rv; + + spin_lock(&ast_queue_lock); + + seq = ++ast_seq_count; + + if (lkb->lkb_flags & DLM_IFL_USER) { + spin_unlock(&ast_queue_lock); + dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq); + return; + } + + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); + if (rv < 0) { + spin_unlock(&ast_queue_lock); + return; + } + + if (list_empty(&lkb->lkb_astqueue)) { + kref_get(&lkb->lkb_ref); + list_add_tail(&lkb->lkb_astqueue, &ast_queue); + } + spin_unlock(&ast_queue_lock); + + set_bit(WAKE_ASTS, &astd_wakeflags); + wake_up_process(astd_task); +} + +static void process_asts(void) +{ + struct dlm_ls *ls = NULL; + struct dlm_rsb *r = NULL; + struct dlm_lkb *lkb; + void (*castfn) (void *astparam); + void (*bastfn) (void *astparam, int mode); + struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; + int i, rv, resid; + +repeat: + spin_lock(&ast_queue_lock); + list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { + r = lkb->lkb_resource; + ls = r->res_ls; 
+ + if (dlm_locking_stopped(ls)) + continue; + + /* we remove from astqueue list and remove everything in + lkb_callbacks before releasing the spinlock so empty + lkb_astqueue is always consistent with empty lkb_callbacks */ + + list_del_init(&lkb->lkb_astqueue); + + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; + + memset(&callbacks, 0, sizeof(callbacks)); + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); + if (rv < 0) + break; + } + spin_unlock(&ast_queue_lock); + + if (resid) { + /* shouldn't happen, for loop should have removed all */ + log_error(ls, "callback resid %d lkb %x", + resid, lkb->lkb_id); + } + + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (!callbacks[i].seq) + break; + if (callbacks[i].flags & DLM_CB_SKIP) { + continue; + } else if (callbacks[i].flags & DLM_CB_BAST) { + bastfn(lkb->lkb_astparam, callbacks[i].mode); + } else if (callbacks[i].flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = callbacks[i].sb_status; + lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + castfn(lkb->lkb_astparam); + } + } + + /* removes ref for ast_queue, may cause lkb to be freed */ + dlm_put_lkb(lkb); + + cond_resched(); + goto repeat; + } + spin_unlock(&ast_queue_lock); +} + +static inline int no_asts(void) +{ + int ret; + + spin_lock(&ast_queue_lock); + ret = list_empty(&ast_queue); + spin_unlock(&ast_queue_lock); + return ret; +} + +static int dlm_astd(void *data) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!test_bit(WAKE_ASTS, &astd_wakeflags)) + schedule(); + set_current_state(TASK_RUNNING); + + mutex_lock(&astd_running); + if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags)) + process_asts(); + mutex_unlock(&astd_running); + } + return 0; +} + +void dlm_astd_wake(void) +{ + if (!no_asts()) { + set_bit(WAKE_ASTS, &astd_wakeflags); + wake_up_process(astd_task); + } +} + +int dlm_astd_start(void) +{ + struct task_struct *p; + int error = 0; + + INIT_LIST_HEAD(&ast_queue); + spin_lock_init(&ast_queue_lock); + mutex_init(&astd_running); + + p = kthread_run(dlm_astd, NULL, "dlm_astd"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + astd_task = p; + return error; +} + +void dlm_astd_stop(void) +{ + kthread_stop(astd_task); +} + +void dlm_astd_suspend(void) +{ + mutex_lock(&astd_running); +} + +void dlm_astd_resume(void) +{ + mutex_unlock(&astd_running); +} + diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h new file mode 100644 index 00000000..8aa89c9b --- /dev/null +++ b/fs/dlm/ast.h @@ -0,0 +1,31 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __ASTD_DOT_H__ +#define __ASTD_DOT_H__ + +void dlm_del_ast(struct dlm_lkb *lkb); +int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq); +int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_callback *cb, int *resid); +void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags); + +void dlm_astd_wake(void); +int dlm_astd_start(void); +void dlm_astd_stop(void); +void dlm_astd_suspend(void); +void dlm_astd_resume(void); + +#endif + diff --git a/fs/dlm/config.c b/fs/dlm/config.c new file mode 100644 index 00000000..9b026ea8 --- /dev/null +++ b/fs/dlm/config.c @@ -0,0 +1,1010 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "lowcomms.h" + +/* + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight + * /config/dlm/<cluster>/comms/<comm>/nodeid + * /config/dlm/<cluster>/comms/<comm>/local + * /config/dlm/<cluster>/comms/<comm>/addr + * The <cluster> level is useless, but I haven't figured out how to avoid it.
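 *
 * Editor's note (worked example, not part of the original patch): with a
 * cluster called "alpha", one lockspace "ls1" and two nodes, the tree that
 * userspace (normally the cluster's dlm control daemon) writes under
 * configfs would look like:
 *
 *	/config/dlm/alpha/spaces/ls1/nodes/1/nodeid	<- "1"
 *	/config/dlm/alpha/spaces/ls1/nodes/1/weight	<- "1" (the default)
 *	/config/dlm/alpha/spaces/ls1/nodes/2/nodeid	<- "2"
 *	/config/dlm/alpha/comms/1/nodeid		<- "1"
 *	/config/dlm/alpha/comms/1/local			<- "1" on the local node only
 *	/config/dlm/alpha/comms/1/addr			<- binary struct sockaddr_storage
 *
 * The names "alpha", "ls1" and the node numbers are made up for the example.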
+ */ + +static struct config_group *space_list; +static struct config_group *comm_list; +static struct dlm_comm *local_comm; + +struct dlm_clusters; +struct dlm_cluster; +struct dlm_spaces; +struct dlm_space; +struct dlm_comms; +struct dlm_comm; +struct dlm_nodes; +struct dlm_node; + +static struct config_group *make_cluster(struct config_group *, const char *); +static void drop_cluster(struct config_group *, struct config_item *); +static void release_cluster(struct config_item *); +static struct config_group *make_space(struct config_group *, const char *); +static void drop_space(struct config_group *, struct config_item *); +static void release_space(struct config_item *); +static struct config_item *make_comm(struct config_group *, const char *); +static void drop_comm(struct config_group *, struct config_item *); +static void release_comm(struct config_item *); +static struct config_item *make_node(struct config_group *, const char *); +static void drop_node(struct config_group *, struct config_item *); +static void release_node(struct config_item *); + +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len); +static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); +static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); + +static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf); +static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t comm_local_read(struct dlm_comm *cm, char *buf); +static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, + size_t len); +static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf); +static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, + size_t len); +static ssize_t node_weight_read(struct dlm_node *nd, char *buf); +static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, + size_t len); + +struct dlm_cluster { + struct config_group group; + unsigned int cl_tcp_port; + unsigned int cl_buffer_size; + unsigned int cl_rsbtbl_size; + unsigned int cl_lkbtbl_size; + unsigned int cl_dirtbl_size; + unsigned int cl_recover_timer; + unsigned int cl_toss_secs; + unsigned int cl_scan_secs; + unsigned int cl_log_debug; + unsigned int cl_protocol; + unsigned int cl_timewarn_cs; + unsigned int cl_waitwarn_us; +}; + +enum { + CLUSTER_ATTR_TCP_PORT = 0, + CLUSTER_ATTR_BUFFER_SIZE, + CLUSTER_ATTR_RSBTBL_SIZE, + CLUSTER_ATTR_LKBTBL_SIZE, + CLUSTER_ATTR_DIRTBL_SIZE, + CLUSTER_ATTR_RECOVER_TIMER, + CLUSTER_ATTR_TOSS_SECS, + CLUSTER_ATTR_SCAN_SECS, + CLUSTER_ATTR_LOG_DEBUG, + CLUSTER_ATTR_PROTOCOL, + CLUSTER_ATTR_TIMEWARN_CS, + CLUSTER_ATTR_WAITWARN_US, +}; + +struct cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct dlm_cluster *, char *); + ssize_t (*store)(struct dlm_cluster *, const char *, size_t); +}; + +static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, + int *info_field, int check_zero, + const char *buf, size_t len) +{ + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return 
-EACCES; + + x = simple_strtoul(buf, NULL, 0); + + if (check_zero && !x) + return -EINVAL; + + *cl_field = x; + *info_field = x; + + return len; +} + +#define CLUSTER_ATTR(name, check_zero) \ +static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \ +{ \ + return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ + check_zero, buf, len); \ +} \ +static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ +} \ +static struct cluster_attribute cluster_attr_##name = \ +__CONFIGFS_ATTR(name, 0644, name##_read, name##_write) + +CLUSTER_ATTR(tcp_port, 1); +CLUSTER_ATTR(buffer_size, 1); +CLUSTER_ATTR(rsbtbl_size, 1); +CLUSTER_ATTR(lkbtbl_size, 1); +CLUSTER_ATTR(dirtbl_size, 1); +CLUSTER_ATTR(recover_timer, 1); +CLUSTER_ATTR(toss_secs, 1); +CLUSTER_ATTR(scan_secs, 1); +CLUSTER_ATTR(log_debug, 0); +CLUSTER_ATTR(protocol, 0); +CLUSTER_ATTR(timewarn_cs, 1); +CLUSTER_ATTR(waitwarn_us, 0); + +static struct configfs_attribute *cluster_attrs[] = { + [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, + [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, + [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, + [CLUSTER_ATTR_LKBTBL_SIZE] = &cluster_attr_lkbtbl_size.attr, + [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr, + [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr, + [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, + [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, + [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, + [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, + [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, + [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, + NULL, +}; + +enum { + COMM_ATTR_NODEID = 0, + COMM_ATTR_LOCAL, + COMM_ATTR_ADDR, +}; + +struct comm_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct dlm_comm *, char *); + ssize_t (*store)(struct dlm_comm *, const char *, size_t); +}; + +static struct comm_attribute comm_attr_nodeid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "nodeid", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = comm_nodeid_read, + .store = comm_nodeid_write, +}; + +static struct comm_attribute comm_attr_local = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "local", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = comm_local_read, + .store = comm_local_write, +}; + +static struct comm_attribute comm_attr_addr = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "addr", + .ca_mode = S_IRUGO | S_IWUSR }, + .store = comm_addr_write, +}; + +static struct configfs_attribute *comm_attrs[] = { + [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr, + [COMM_ATTR_LOCAL] = &comm_attr_local.attr, + [COMM_ATTR_ADDR] = &comm_attr_addr.attr, + NULL, +}; + +enum { + NODE_ATTR_NODEID = 0, + NODE_ATTR_WEIGHT, +}; + +struct node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct dlm_node *, char *); + ssize_t (*store)(struct dlm_node *, const char *, size_t); +}; + +static struct node_attribute node_attr_nodeid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "nodeid", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = node_nodeid_read, + .store = node_nodeid_write, +}; + +static struct node_attribute node_attr_weight = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "weight", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = node_weight_read, + .store = node_weight_write, +}; + +static struct configfs_attribute *node_attrs[] = { + 
[NODE_ATTR_NODEID] = &node_attr_nodeid.attr, + [NODE_ATTR_WEIGHT] = &node_attr_weight.attr, + NULL, +}; + +struct dlm_clusters { + struct configfs_subsystem subsys; +}; + +struct dlm_spaces { + struct config_group ss_group; +}; + +struct dlm_space { + struct config_group group; + struct list_head members; + struct mutex members_lock; + int members_count; +}; + +struct dlm_comms { + struct config_group cs_group; +}; + +struct dlm_comm { + struct config_item item; + int nodeid; + int local; + int addr_count; + struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; +}; + +struct dlm_nodes { + struct config_group ns_group; +}; + +struct dlm_node { + struct config_item item; + struct list_head list; /* space->members */ + int nodeid; + int weight; + int new; +}; + +static struct configfs_group_operations clusters_ops = { + .make_group = make_cluster, + .drop_item = drop_cluster, +}; + +static struct configfs_item_operations cluster_ops = { + .release = release_cluster, + .show_attribute = show_cluster, + .store_attribute = store_cluster, +}; + +static struct configfs_group_operations spaces_ops = { + .make_group = make_space, + .drop_item = drop_space, +}; + +static struct configfs_item_operations space_ops = { + .release = release_space, +}; + +static struct configfs_group_operations comms_ops = { + .make_item = make_comm, + .drop_item = drop_comm, +}; + +static struct configfs_item_operations comm_ops = { + .release = release_comm, + .show_attribute = show_comm, + .store_attribute = store_comm, +}; + +static struct configfs_group_operations nodes_ops = { + .make_item = make_node, + .drop_item = drop_node, +}; + +static struct configfs_item_operations node_ops = { + .release = release_node, + .show_attribute = show_node, + .store_attribute = store_node, +}; + +static struct config_item_type clusters_type = { + .ct_group_ops = &clusters_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type cluster_type = { + .ct_item_ops = &cluster_ops, + .ct_attrs = cluster_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type spaces_type = { + .ct_group_ops = &spaces_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type space_type = { + .ct_item_ops = &space_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type comms_type = { + .ct_group_ops = &comms_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type comm_type = { + .ct_item_ops = &comm_ops, + .ct_attrs = comm_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type nodes_type = { + .ct_group_ops = &nodes_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type node_type = { + .ct_item_ops = &node_ops, + .ct_attrs = node_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct dlm_cluster *config_item_to_cluster(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct dlm_cluster, group) : + NULL; +} + +static struct dlm_space *config_item_to_space(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct dlm_space, group) : + NULL; +} + +static struct dlm_comm *config_item_to_comm(struct config_item *i) +{ + return i ? container_of(i, struct dlm_comm, item) : NULL; +} + +static struct dlm_node *config_item_to_node(struct config_item *i) +{ + return i ? 
container_of(i, struct dlm_node, item) : NULL; +} + +static struct config_group *make_cluster(struct config_group *g, + const char *name) +{ + struct dlm_cluster *cl = NULL; + struct dlm_spaces *sps = NULL; + struct dlm_comms *cms = NULL; + void *gps = NULL; + + cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); + gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS); + sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); + cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); + + if (!cl || !gps || !sps || !cms) + goto fail; + + config_group_init_type_name(&cl->group, name, &cluster_type); + config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); + config_group_init_type_name(&cms->cs_group, "comms", &comms_type); + + cl->group.default_groups = gps; + cl->group.default_groups[0] = &sps->ss_group; + cl->group.default_groups[1] = &cms->cs_group; + cl->group.default_groups[2] = NULL; + + cl->cl_tcp_port = dlm_config.ci_tcp_port; + cl->cl_buffer_size = dlm_config.ci_buffer_size; + cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; + cl->cl_lkbtbl_size = dlm_config.ci_lkbtbl_size; + cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size; + cl->cl_recover_timer = dlm_config.ci_recover_timer; + cl->cl_toss_secs = dlm_config.ci_toss_secs; + cl->cl_scan_secs = dlm_config.ci_scan_secs; + cl->cl_log_debug = dlm_config.ci_log_debug; + cl->cl_protocol = dlm_config.ci_protocol; + cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; + cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; + + space_list = &sps->ss_group; + comm_list = &cms->cs_group; + return &cl->group; + + fail: + kfree(cl); + kfree(gps); + kfree(sps); + kfree(cms); + return ERR_PTR(-ENOMEM); +} + +static void drop_cluster(struct config_group *g, struct config_item *i) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + struct config_item *tmp; + int j; + + for (j = 0; cl->group.default_groups[j]; j++) { + tmp = &cl->group.default_groups[j]->cg_item; + cl->group.default_groups[j] = NULL; + config_item_put(tmp); + } + + space_list = NULL; + comm_list = NULL; + + config_item_put(i); +} + +static void release_cluster(struct config_item *i) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + kfree(cl->group.default_groups); + kfree(cl); +} + +static struct config_group *make_space(struct config_group *g, const char *name) +{ + struct dlm_space *sp = NULL; + struct dlm_nodes *nds = NULL; + void *gps = NULL; + + sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); + gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS); + nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); + + if (!sp || !gps || !nds) + goto fail; + + config_group_init_type_name(&sp->group, name, &space_type); + config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); + + sp->group.default_groups = gps; + sp->group.default_groups[0] = &nds->ns_group; + sp->group.default_groups[1] = NULL; + + INIT_LIST_HEAD(&sp->members); + mutex_init(&sp->members_lock); + sp->members_count = 0; + return &sp->group; + + fail: + kfree(sp); + kfree(gps); + kfree(nds); + return ERR_PTR(-ENOMEM); +} + +static void drop_space(struct config_group *g, struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(i); + struct config_item *tmp; + int j; + + /* assert list_empty(&sp->members) */ + + for (j = 0; sp->group.default_groups[j]; j++) { + tmp = &sp->group.default_groups[j]->cg_item; + sp->group.default_groups[j] = NULL; + config_item_put(tmp); + } + + config_item_put(i); +} + +static void release_space(struct config_item *i) +{ + struct dlm_space *sp = 
config_item_to_space(i); + kfree(sp->group.default_groups); + kfree(sp); +} + +static struct config_item *make_comm(struct config_group *g, const char *name) +{ + struct dlm_comm *cm; + + cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); + if (!cm) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&cm->item, name, &comm_type); + cm->nodeid = -1; + cm->local = 0; + cm->addr_count = 0; + return &cm->item; +} + +static void drop_comm(struct config_group *g, struct config_item *i) +{ + struct dlm_comm *cm = config_item_to_comm(i); + if (local_comm == cm) + local_comm = NULL; + dlm_lowcomms_close(cm->nodeid); + while (cm->addr_count--) + kfree(cm->addr[cm->addr_count]); + config_item_put(i); +} + +static void release_comm(struct config_item *i) +{ + struct dlm_comm *cm = config_item_to_comm(i); + kfree(cm); +} + +static struct config_item *make_node(struct config_group *g, const char *name) +{ + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd; + + nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); + if (!nd) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&nd->item, name, &node_type); + nd->nodeid = -1; + nd->weight = 1; /* default weight of 1 if none is set */ + nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */ + + mutex_lock(&sp->members_lock); + list_add(&nd->list, &sp->members); + sp->members_count++; + mutex_unlock(&sp->members_lock); + + return &nd->item; +} + +static void drop_node(struct config_group *g, struct config_item *i) +{ + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd = config_item_to_node(i); + + mutex_lock(&sp->members_lock); + list_del(&nd->list); + sp->members_count--; + mutex_unlock(&sp->members_lock); + + config_item_put(i); +} + +static void release_node(struct config_item *i) +{ + struct dlm_node *nd = config_item_to_node(i); + kfree(nd); +} + +static struct dlm_clusters clusters_root = { + .subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "dlm", + .ci_type = &clusters_type, + }, + }, + }, +}; + +int __init dlm_config_init(void) +{ + config_group_init(&clusters_root.subsys.su_group); + mutex_init(&clusters_root.subsys.su_mutex); + return configfs_register_subsystem(&clusters_root.subsys); +} + +void dlm_config_exit(void) +{ + configfs_unregister_subsystem(&clusters_root.subsys); +} + +/* + * Functions for user space to read/write attributes + */ + +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->show ? cla->show(cl, buf) : 0; +} + +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct dlm_cluster *cl = config_item_to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->store ? cla->store(cl, buf, len) : -EINVAL; +} + +static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct dlm_comm *cm = config_item_to_comm(i); + struct comm_attribute *cma = + container_of(a, struct comm_attribute, attr); + return cma->show ? 
cma->show(cm, buf) : 0; +} + +static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct dlm_comm *cm = config_item_to_comm(i); + struct comm_attribute *cma = + container_of(a, struct comm_attribute, attr); + return cma->store ? cma->store(cm, buf, len) : -EINVAL; +} + +static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf) +{ + return sprintf(buf, "%d\n", cm->nodeid); +} + +static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, + size_t len) +{ + cm->nodeid = simple_strtol(buf, NULL, 0); + return len; +} + +static ssize_t comm_local_read(struct dlm_comm *cm, char *buf) +{ + return sprintf(buf, "%d\n", cm->local); +} + +static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, + size_t len) +{ + cm->local= simple_strtol(buf, NULL, 0); + if (cm->local && !local_comm) + local_comm = cm; + return len; +} + +static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) +{ + struct sockaddr_storage *addr; + + if (len != sizeof(struct sockaddr_storage)) + return -EINVAL; + + if (cm->addr_count >= DLM_MAX_ADDR_COUNT) + return -ENOSPC; + + addr = kzalloc(sizeof(*addr), GFP_NOFS); + if (!addr) + return -ENOMEM; + + memcpy(addr, buf, len); + cm->addr[cm->addr_count++] = addr; + return len; +} + +static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct dlm_node *nd = config_item_to_node(i); + struct node_attribute *nda = + container_of(a, struct node_attribute, attr); + return nda->show ? nda->show(nd, buf) : 0; +} + +static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct dlm_node *nd = config_item_to_node(i); + struct node_attribute *nda = + container_of(a, struct node_attribute, attr); + return nda->store ? 
nda->store(nd, buf, len) : -EINVAL; +} + +static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf) +{ + return sprintf(buf, "%d\n", nd->nodeid); +} + +static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, + size_t len) +{ + nd->nodeid = simple_strtol(buf, NULL, 0); + return len; +} + +static ssize_t node_weight_read(struct dlm_node *nd, char *buf) +{ + return sprintf(buf, "%d\n", nd->weight); +} + +static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, + size_t len) +{ + nd->weight = simple_strtol(buf, NULL, 0); + return len; +} + +/* + * Functions for the dlm to get the info that's been configured + */ + +static struct dlm_space *get_space(char *name) +{ + struct config_item *i; + + if (!space_list) + return NULL; + + mutex_lock(&space_list->cg_subsys->su_mutex); + i = config_group_find_item(space_list, name); + mutex_unlock(&space_list->cg_subsys->su_mutex); + + return config_item_to_space(i); +} + +static void put_space(struct dlm_space *sp) +{ + config_item_put(&sp->group.cg_item); +} + +static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) +{ + switch (x->ss_family) { + case AF_INET: { + struct sockaddr_in *sinx = (struct sockaddr_in *)x; + struct sockaddr_in *siny = (struct sockaddr_in *)y; + if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) + return 0; + if (sinx->sin_port != siny->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; + struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; + if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) + return 0; + if (sinx->sin6_port != siny->sin6_port) + return 0; + break; + } + default: + return 0; + } + return 1; +} + +static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) +{ + struct config_item *i; + struct dlm_comm *cm = NULL; + int found = 0; + + if (!comm_list) + return NULL; + + mutex_lock(&clusters_root.subsys.su_mutex); + + list_for_each_entry(i, &comm_list->cg_children, ci_entry) { + cm = config_item_to_comm(i); + + if (nodeid) { + if (cm->nodeid != nodeid) + continue; + found = 1; + config_item_get(i); + break; + } else { + if (!cm->addr_count || !addr_compare(cm->addr[0], addr)) + continue; + found = 1; + config_item_get(i); + break; + } + } + mutex_unlock(&clusters_root.subsys.su_mutex); + + if (!found) + cm = NULL; + return cm; +} + +static void put_comm(struct dlm_comm *cm) +{ + config_item_put(&cm->item); +} + +/* caller must free mem */ +int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, + int **new_out, int *new_count_out) +{ + struct dlm_space *sp; + struct dlm_node *nd; + int i = 0, rv = 0, ids_count = 0, new_count = 0; + int *ids, *new; + + sp = get_space(lsname); + if (!sp) + return -EEXIST; + + mutex_lock(&sp->members_lock); + if (!sp->members_count) { + rv = -EINVAL; + printk(KERN_ERR "dlm: zero members_count\n"); + goto out; + } + + ids_count = sp->members_count; + + ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); + if (!ids) { + rv = -ENOMEM; + goto out; + } + + list_for_each_entry(nd, &sp->members, list) { + ids[i++] = nd->nodeid; + if (nd->new) + new_count++; + } + + if (ids_count != i) + printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i); + + if (!new_count) + goto out_ids; + + new = kcalloc(new_count, sizeof(int), GFP_NOFS); + if (!new) { + kfree(ids); + rv = -ENOMEM; + goto out; + } + + i = 0; + list_for_each_entry(nd, &sp->members, list) { + if (nd->new) { + new[i++] = nd->nodeid; + nd->new = 0; + } + } + *new_count_out = 
new_count; + *new_out = new; + + out_ids: + *ids_count_out = ids_count; + *ids_out = ids; + out: + mutex_unlock(&sp->members_lock); + put_space(sp); + return rv; +} + +int dlm_node_weight(char *lsname, int nodeid) +{ + struct dlm_space *sp; + struct dlm_node *nd; + int w = -EEXIST; + + sp = get_space(lsname); + if (!sp) + goto out; + + mutex_lock(&sp->members_lock); + list_for_each_entry(nd, &sp->members, list) { + if (nd->nodeid != nodeid) + continue; + w = nd->weight; + break; + } + mutex_unlock(&sp->members_lock); + put_space(sp); + out: + return w; +} + +int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) +{ + struct dlm_comm *cm = get_comm(nodeid, NULL); + if (!cm) + return -EEXIST; + if (!cm->addr_count) + return -ENOENT; + memcpy(addr, cm->addr[0], sizeof(*addr)); + put_comm(cm); + return 0; +} + +int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) +{ + struct dlm_comm *cm = get_comm(0, addr); + if (!cm) + return -EEXIST; + *nodeid = cm->nodeid; + put_comm(cm); + return 0; +} + +int dlm_our_nodeid(void) +{ + return local_comm ? local_comm->nodeid : 0; +} + +/* num 0 is first addr, num 1 is second addr */ +int dlm_our_addr(struct sockaddr_storage *addr, int num) +{ + if (!local_comm) + return -1; + if (num + 1 > local_comm->addr_count) + return -1; + memcpy(addr, local_comm->addr[num], sizeof(*addr)); + return 0; +} + +/* Config file defaults */ +#define DEFAULT_TCP_PORT 21064 +#define DEFAULT_BUFFER_SIZE 4096 +#define DEFAULT_RSBTBL_SIZE 1024 +#define DEFAULT_LKBTBL_SIZE 1024 +#define DEFAULT_DIRTBL_SIZE 1024 +#define DEFAULT_RECOVER_TIMER 5 +#define DEFAULT_TOSS_SECS 10 +#define DEFAULT_SCAN_SECS 5 +#define DEFAULT_LOG_DEBUG 0 +#define DEFAULT_PROTOCOL 0 +#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ +#define DEFAULT_WAITWARN_US 0 + +struct dlm_config_info dlm_config = { + .ci_tcp_port = DEFAULT_TCP_PORT, + .ci_buffer_size = DEFAULT_BUFFER_SIZE, + .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, + .ci_lkbtbl_size = DEFAULT_LKBTBL_SIZE, + .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE, + .ci_recover_timer = DEFAULT_RECOVER_TIMER, + .ci_toss_secs = DEFAULT_TOSS_SECS, + .ci_scan_secs = DEFAULT_SCAN_SECS, + .ci_log_debug = DEFAULT_LOG_DEBUG, + .ci_protocol = DEFAULT_PROTOCOL, + .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, + .ci_waitwarn_us = DEFAULT_WAITWARN_US +}; + diff --git a/fs/dlm/config.h b/fs/dlm/config.h new file mode 100644 index 00000000..dd0ce24d --- /dev/null +++ b/fs/dlm/config.h @@ -0,0 +1,47 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __CONFIG_DOT_H__ +#define __CONFIG_DOT_H__ + +#define DLM_MAX_ADDR_COUNT 3 + +struct dlm_config_info { + int ci_tcp_port; + int ci_buffer_size; + int ci_rsbtbl_size; + int ci_lkbtbl_size; + int ci_dirtbl_size; + int ci_recover_timer; + int ci_toss_secs; + int ci_scan_secs; + int ci_log_debug; + int ci_protocol; + int ci_timewarn_cs; + int ci_waitwarn_us; +}; + +extern struct dlm_config_info dlm_config; + +int dlm_config_init(void); +void dlm_config_exit(void); +int dlm_node_weight(char *lsname, int nodeid); +int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, + int **new_out, int *new_count_out); +int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); +int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); +int dlm_our_nodeid(void); +int dlm_our_addr(struct sockaddr_storage *addr, int num); + +#endif /* __CONFIG_DOT_H__ */ + diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c new file mode 100644 index 00000000..59779237 --- /dev/null +++ b/fs/dlm/debug_fs.c @@ -0,0 +1,731 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lock.h" + +#define DLM_DEBUG_BUF_LEN 4096 +static char debug_buf[DLM_DEBUG_BUF_LEN]; +static struct mutex debug_buf_lock; + +static struct dentry *dlm_root; + +static char *print_lockmode(int mode) +{ + switch (mode) { + case DLM_LOCK_IV: + return "--"; + case DLM_LOCK_NL: + return "NL"; + case DLM_LOCK_CR: + return "CR"; + case DLM_LOCK_CW: + return "CW"; + case DLM_LOCK_PR: + return "PR"; + case DLM_LOCK_PW: + return "PW"; + case DLM_LOCK_EX: + return "EX"; + default: + return "??"; + } +} + +static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) +{ + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); + + if (lkb->lkb_status == DLM_LKSTS_CONVERT || + lkb->lkb_status == DLM_LKSTS_WAITING) + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); + + if (lkb->lkb_nodeid) { + if (lkb->lkb_nodeid != res->res_nodeid) + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid, + lkb->lkb_remid); + else + seq_printf(s, " Master: %08x", lkb->lkb_remid); + } + + if (lkb->lkb_wait_type) + seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); + + return seq_printf(s, "\n"); +} + +static int print_format1(struct dlm_rsb *res, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; + int rv; + + lock_rsb(res); + + rv = seq_printf(s, "\nResource %p Name (len=%d) \"", + res, res->res_length); + if (rv) + goto out; + + for (i = 0; i < res->res_length; i++) { + if (isprint(res->res_name[i])) + seq_printf(s, "%c", res->res_name[i]); + else + seq_printf(s, "%c", '.'); + } + + if (res->res_nodeid > 0) + rv = seq_printf(s, "\" \nLocal 
Copy, Master is node %d\n", + res->res_nodeid); + else if (res->res_nodeid == 0) + rv = seq_printf(s, "\" \nMaster Copy\n"); + else if (res->res_nodeid == -1) + rv = seq_printf(s, "\" \nLooking up master (lkid %x)\n", + res->res_first_lkid); + else + rv = seq_printf(s, "\" \nInvalid master %d\n", + res->res_nodeid); + if (rv) + goto out; + + /* Print the LVB: */ + if (res->res_lvbptr) { + seq_printf(s, "LVB: "); + for (i = 0; i < lvblen; i++) { + if (i == lvblen / 2) + seq_printf(s, "\n "); + seq_printf(s, "%02x ", + (unsigned char) res->res_lvbptr[i]); + } + if (rsb_flag(res, RSB_VALNOTVALID)) + seq_printf(s, " (INVALID)"); + rv = seq_printf(s, "\n"); + if (rv) + goto out; + } + + root_list = !list_empty(&res->res_root_list); + recover_list = !list_empty(&res->res_recover_list); + + if (root_list || recover_list) { + rv = seq_printf(s, "Recovery: root %d recover %d flags %lx " + "count %d\n", root_list, recover_list, + res->res_flags, res->res_recover_locks_count); + if (rv) + goto out; + } + + /* Print the locks attached to this resource */ + seq_printf(s, "Granted Queue\n"); + list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) { + rv = print_format1_lock(s, lkb, res); + if (rv) + goto out; + } + + seq_printf(s, "Conversion Queue\n"); + list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) { + rv = print_format1_lock(s, lkb, res); + if (rv) + goto out; + } + + seq_printf(s, "Waiting Queue\n"); + list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) { + rv = print_format1_lock(s, lkb, res); + if (rv) + goto out; + } + + if (list_empty(&res->res_lookup)) + goto out; + + seq_printf(s, "Lookup Queue\n"); + list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { + rv = seq_printf(s, "%08x %s", lkb->lkb_id, + print_lockmode(lkb->lkb_rqmode)); + if (lkb->lkb_wait_type) + seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); + rv = seq_printf(s, "\n"); + } + out: + unlock_rsb(res); + return rv; +} + +static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *r) +{ + u64 xid = 0; + u64 us; + int rv; + + if (lkb->lkb_flags & DLM_IFL_USER) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + /* microseconds since lkb was added to current queue */ + us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp)); + + /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us + r_nodeid r_len r_name */ + + rv = seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + (unsigned long long)us, + r->res_nodeid, + r->res_length, + r->res_name); + return rv; +} + +static int print_format2(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int rv = 0; + + lock_rsb(r); + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + rv = print_format2_lock(s, lkb, r); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + rv = print_format2_lock(s, lkb, r); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + rv = print_format2_lock(s, lkb, r); + if (rv) + goto out; + } + out: + unlock_rsb(r); + return rv; +} + +static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, + int rsb_lookup) +{ + u64 xid = 0; + int rv; + + if (lkb->lkb_flags & DLM_IFL_USER) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + rv = 
seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + lkb->lkb_last_bast.mode, + rsb_lookup, + lkb->lkb_wait_type, + lkb->lkb_lvbseq, + (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), + (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time)); + return rv; +} + +static int print_format3(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = r->res_ls->ls_lvblen; + int print_name = 1; + int rv; + + lock_rsb(r); + + rv = seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", + r, + r->res_nodeid, + r->res_first_lkid, + r->res_flags, + !list_empty(&r->res_root_list), + !list_empty(&r->res_recover_list), + r->res_recover_locks_count, + r->res_length); + if (rv) + goto out; + + for (i = 0; i < r->res_length; i++) { + if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) + print_name = 0; + } + + seq_printf(s, "%s", print_name ? "str " : "hex"); + + for (i = 0; i < r->res_length; i++) { + if (print_name) + seq_printf(s, "%c", r->res_name[i]); + else + seq_printf(s, " %02x", (unsigned char)r->res_name[i]); + } + rv = seq_printf(s, "\n"); + if (rv) + goto out; + + if (!r->res_lvbptr) + goto do_locks; + + seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen); + + for (i = 0; i < lvblen; i++) + seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); + rv = seq_printf(s, "\n"); + if (rv) + goto out; + + do_locks: + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + rv = print_format3_lock(s, lkb, 0); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + rv = print_format3_lock(s, lkb, 0); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + rv = print_format3_lock(s, lkb, 0); + if (rv) + goto out; + } + + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) { + rv = print_format3_lock(s, lkb, 1); + if (rv) + goto out; + } + out: + unlock_rsb(r); + return rv; +} + +struct rsbtbl_iter { + struct dlm_rsb *rsb; + unsigned bucket; + int format; + int header; +}; + +/* seq_printf returns -1 if the buffer is full, and 0 otherwise. + If the buffer is full, seq_printf can be called again, but it + does nothing and just returns -1. So, the these printing routines + periodically check the return value to avoid wasting too much time + trying to print to a full buffer. 
*/ + +static int table_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct rsbtbl_iter *ri = iter_ptr; + int rv = 0; + + switch (ri->format) { + case 1: + rv = print_format1(ri->rsb, seq); + break; + case 2: + if (ri->header) { + seq_printf(seq, "id nodeid remid pid xid exflags " + "flags sts grmode rqmode time_ms " + "r_nodeid r_len r_name\n"); + ri->header = 0; + } + rv = print_format2(ri->rsb, seq); + break; + case 3: + if (ri->header) { + seq_printf(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); + ri->header = 0; + } + rv = print_format3(ri->rsb, seq); + break; + } + + return rv; +} + +static const struct seq_operations format1_seq_ops; +static const struct seq_operations format2_seq_ops; +static const struct seq_operations format3_seq_ops; + +static void *table_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct dlm_ls *ls = seq->private; + struct rsbtbl_iter *ri; + struct dlm_rsb *r; + loff_t n = *pos; + unsigned bucket, entry; + + bucket = n >> 32; + entry = n & ((1LL << 32) - 1); + + if (bucket >= ls->ls_rsbtbl_size) + return NULL; + + ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS); + if (!ri) + return NULL; + if (n == 0) + ri->header = 1; + if (seq->op == &format1_seq_ops) + ri->format = 1; + if (seq->op == &format2_seq_ops) + ri->format = 2; + if (seq->op == &format3_seq_ops) + ri->format = 3; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { + list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, + res_hashchain) { + if (!entry--) { + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return ri; + } + } + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + + /* + * move to the first rsb in the next non-empty bucket + */ + + /* zero the entry */ + n &= ~((1LL << 32) - 1); + + while (1) { + bucket++; + n += 1LL << 32; + + if (bucket >= ls->ls_rsbtbl_size) { + kfree(ri); + return NULL; + } + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { + r = list_first_entry(&ls->ls_rsbtbl[bucket].list, + struct dlm_rsb, res_hashchain); + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + *pos = n; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + } +} + +static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) +{ + struct dlm_ls *ls = seq->private; + struct rsbtbl_iter *ri = iter_ptr; + struct list_head *next; + struct dlm_rsb *r, *rp; + loff_t n = *pos; + unsigned bucket; + + bucket = n >> 32; + + /* + * move to the next rsb in the same bucket + */ + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + rp = ri->rsb; + next = rp->res_hashchain.next; + + if (next != &ls->ls_rsbtbl[bucket].list) { + r = list_entry(next, struct dlm_rsb, res_hashchain); + dlm_hold_rsb(r); + ri->rsb = r; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_put_rsb(rp); + ++*pos; + return ri; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_put_rsb(rp); + + /* + * move to the first rsb in the next non-empty bucket + */ + + /* zero the entry */ + n &= ~((1LL << 32) - 1); + + while (1) { + bucket++; + n += 1LL << 32; + + if (bucket >= ls->ls_rsbtbl_size) { + kfree(ri); + return NULL; + } + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { + r = list_first_entry(&ls->ls_rsbtbl[bucket].list, + struct dlm_rsb, res_hashchain); + dlm_hold_rsb(r); + ri->rsb = r; + ri->bucket = bucket; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + *pos = n; + return ri; + } + 
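A note on the iterators above: the seq_file position is a single loff_t that packs the rsb table bucket into the upper 32 bits and the entry index within that bucket into the lower 32 bits, which is why the bucket-advance loops add 1LL << 32 and clear the low half. A tiny sketch of that encoding; the helper names are illustrative only and do not appear in the patch:

/* Illustrative helpers showing how table_seq_start()/table_seq_next()
 * interpret *pos; not part of the patch. */
static inline loff_t rsbtbl_pos(unsigned bucket, unsigned entry)
{
        return ((loff_t)bucket << 32) | entry;
}

static inline void rsbtbl_pos_decode(loff_t pos, unsigned *bucket, unsigned *entry)
{
        *bucket = pos >> 32;
        *entry = pos & ((1LL << 32) - 1);
}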
spin_unlock(&ls->ls_rsbtbl[bucket].lock); + } +} + +static void table_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct rsbtbl_iter *ri = iter_ptr; + + if (ri) { + dlm_put_rsb(ri->rsb); + kfree(ri); + } +} + +static const struct seq_operations format1_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format2_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct seq_operations format3_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + +static const struct file_operations format1_fops; +static const struct file_operations format2_fops; +static const struct file_operations format3_fops; + +static int table_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret = -1; + + if (file->f_op == &format1_fops) + ret = seq_open(file, &format1_seq_ops); + else if (file->f_op == &format2_fops) + ret = seq_open(file, &format2_seq_ops); + else if (file->f_op == &format3_fops) + ret = seq_open(file, &format3_seq_ops); + + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static const struct file_operations format1_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format2_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static const struct file_operations format3_fops = { + .owner = THIS_MODULE, + .open = table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * dump lkb's on the ls_waiters list + */ + +static int waiters_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static ssize_t waiters_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct dlm_ls *ls = file->private_data; + struct dlm_lkb *lkb; + size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv; + + mutex_lock(&debug_buf_lock); + mutex_lock(&ls->ls_waiters_mutex); + memset(debug_buf, 0, sizeof(debug_buf)); + + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n", + lkb->lkb_id, lkb->lkb_wait_type, + lkb->lkb_nodeid, lkb->lkb_resource->res_name); + if (ret >= len - pos) + break; + pos += ret; + } + mutex_unlock(&ls->ls_waiters_mutex); + + rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos); + mutex_unlock(&debug_buf_lock); + return rv; +} + +static const struct file_operations waiters_fops = { + .owner = THIS_MODULE, + .open = waiters_open, + .read = waiters_read, + .llseek = default_llseek, +}; + +void dlm_delete_debug_file(struct dlm_ls *ls) +{ + if (ls->ls_debug_rsb_dentry) + debugfs_remove(ls->ls_debug_rsb_dentry); + if (ls->ls_debug_waiters_dentry) + debugfs_remove(ls->ls_debug_waiters_dentry); + if (ls->ls_debug_locks_dentry) + debugfs_remove(ls->ls_debug_locks_dentry); + if (ls->ls_debug_all_dentry) + debugfs_remove(ls->ls_debug_all_dentry); +} + +int dlm_create_debug_file(struct dlm_ls *ls) +{ + char name[DLM_LOCKSPACE_LEN+8]; + + /* format 1 */ + + ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format1_fops); + if 
(!ls->ls_debug_rsb_dentry) + goto fail; + + /* format 2 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); + + ls->ls_debug_locks_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format2_fops); + if (!ls->ls_debug_locks_dentry) + goto fail; + + /* format 3 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name); + + ls->ls_debug_all_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &format3_fops); + if (!ls->ls_debug_all_dentry) + goto fail; + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); + + ls->ls_debug_waiters_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &waiters_fops); + if (!ls->ls_debug_waiters_dentry) + goto fail; + + return 0; + + fail: + dlm_delete_debug_file(ls); + return -ENOMEM; +} + +int __init dlm_register_debugfs(void) +{ + mutex_init(&debug_buf_lock); + dlm_root = debugfs_create_dir("dlm", NULL); + return dlm_root ? 0 : -ENOMEM; +} + +void dlm_unregister_debugfs(void) +{ + debugfs_remove(dlm_root); +} + diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c new file mode 100644 index 00000000..7b84c1db --- /dev/null +++ b/fs/dlm/dir.c @@ -0,0 +1,441 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "lowcomms.h" +#include "rcom.h" +#include "config.h" +#include "memory.h" +#include "recover.h" +#include "util.h" +#include "lock.h" +#include "dir.h" + + +static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de) +{ + spin_lock(&ls->ls_recover_list_lock); + list_add(&de->list, &ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); +} + +static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) +{ + int found = 0; + struct dlm_direntry *de; + + spin_lock(&ls->ls_recover_list_lock); + list_for_each_entry(de, &ls->ls_recover_list, list) { + if (de->length == len) { + list_del(&de->list); + de->master_nodeid = 0; + memset(de->name, 0, len); + found = 1; + break; + } + } + spin_unlock(&ls->ls_recover_list_lock); + + if (!found) + de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS); + return de; +} + +void dlm_clear_free_entries(struct dlm_ls *ls) +{ + struct dlm_direntry *de; + + spin_lock(&ls->ls_recover_list_lock); + while (!list_empty(&ls->ls_recover_list)) { + de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, + list); + list_del(&de->list); + kfree(de); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +/* + * We use the upper 16 bits of the hash value to select the directory node. + * Low bits are used for distribution of rsb's among hash buckets on each node. + * + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of + * num_nodes to the hash value. 
This value in the desired range is used as an + * offset into the sorted list of nodeid's to give the particular nodeid. + */ + +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash) +{ + struct list_head *tmp; + struct dlm_member *memb = NULL; + uint32_t node, n = 0; + int nodeid; + + if (ls->ls_num_nodes == 1) { + nodeid = dlm_our_nodeid(); + goto out; + } + + if (ls->ls_node_array) { + node = (hash >> 16) % ls->ls_total_weight; + nodeid = ls->ls_node_array[node]; + goto out; + } + + /* make_member_array() failed to kmalloc ls_node_array... */ + + node = (hash >> 16) % ls->ls_num_nodes; + + list_for_each(tmp, &ls->ls_nodes) { + if (n++ != node) + continue; + memb = list_entry(tmp, struct dlm_member, list); + break; + } + + DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n", + ls->ls_num_nodes, n, node);); + nodeid = memb->nodeid; + out: + return nodeid; +} + +int dlm_dir_nodeid(struct dlm_rsb *r) +{ + return dlm_hash2nodeid(r->res_ls, r->res_hash); +} + +static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len) +{ + uint32_t val; + + val = jhash(name, len, 0); + val &= (ls->ls_dirtbl_size - 1); + + return val; +} + +static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de) +{ + uint32_t bucket; + + bucket = dir_hash(ls, de->name, de->length); + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); +} + +static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name, + int namelen, uint32_t bucket) +{ + struct dlm_direntry *de; + + list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) { + if (de->length == namelen && !memcmp(name, de->name, namelen)) + goto out; + } + de = NULL; + out: + return de; +} + +void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen) +{ + struct dlm_direntry *de; + uint32_t bucket; + + bucket = dir_hash(ls, name, namelen); + + spin_lock(&ls->ls_dirtbl[bucket].lock); + + de = search_bucket(ls, name, namelen, bucket); + + if (!de) { + log_error(ls, "remove fr %u none", nodeid); + goto out; + } + + if (de->master_nodeid != nodeid) { + log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid); + goto out; + } + + list_del(&de->list); + kfree(de); + out: + spin_unlock(&ls->ls_dirtbl[bucket].lock); +} + +void dlm_dir_clear(struct dlm_ls *ls) +{ + struct list_head *head; + struct dlm_direntry *de; + int i; + + DLM_ASSERT(list_empty(&ls->ls_recover_list), ); + + for (i = 0; i < ls->ls_dirtbl_size; i++) { + spin_lock(&ls->ls_dirtbl[i].lock); + head = &ls->ls_dirtbl[i].list; + while (!list_empty(head)) { + de = list_entry(head->next, struct dlm_direntry, list); + list_del(&de->list); + put_free_de(ls, de); + } + spin_unlock(&ls->ls_dirtbl[i].lock); + } +} + +int dlm_recover_directory(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_direntry *de; + char *b, *last_name = NULL; + int error = -ENOMEM, last_len, count = 0; + uint16_t namelen; + + log_debug(ls, "dlm_recover_directory"); + + if (dlm_no_directory(ls)) + goto out_status; + + dlm_dir_clear(ls); + + last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS); + if (!last_name) + goto out; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + memset(last_name, 0, DLM_RESNAME_MAXLEN); + last_len = 0; + + for (;;) { + int left; + error = dlm_recovery_stopped(ls); + if (error) + goto out_free; + + error = dlm_rcom_names(ls, memb->nodeid, + last_name, last_len); + if (error) + goto out_free; + + schedule(); + + /* + * pick namelen/name pairs out of received buffer + */ + + b = ls->ls_recover_buf->rc_buf; + left = 
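To make the directory-node selection in dlm_hash2nodeid() concrete, here is the arithmetic worked through once. The hash value and member count are made-up numbers, and the example follows the fallback path that walks ls_nodes when the weighted ls_node_array is unavailable:

/*
 * Example (illustrative values only):
 *   res_hash        = 0xABCD1234
 *   res_hash >> 16  = 0xABCD = 43981
 *   ls_num_nodes    = 3
 *   43981 % 3       = 1
 * so the rsb's directory node is the member at offset 1 in the sorted
 * list of nodeids, i.e. the second-lowest nodeid in the lockspace.
 */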
ls->ls_recover_buf->rc_header.h_length; + left -= sizeof(struct dlm_rcom); + + for (;;) { + __be16 v; + + error = -EINVAL; + if (left < sizeof(__be16)) + goto out_free; + + memcpy(&v, b, sizeof(__be16)); + namelen = be16_to_cpu(v); + b += sizeof(__be16); + left -= sizeof(__be16); + + /* namelen of 0xFFFFF marks end of names for + this node; namelen of 0 marks end of the + buffer */ + + if (namelen == 0xFFFF) + goto done; + if (!namelen) + break; + + if (namelen > left) + goto out_free; + + if (namelen > DLM_RESNAME_MAXLEN) + goto out_free; + + error = -ENOMEM; + de = get_free_de(ls, namelen); + if (!de) + goto out_free; + + de->master_nodeid = memb->nodeid; + de->length = namelen; + last_len = namelen; + memcpy(de->name, b, namelen); + memcpy(last_name, b, namelen); + b += namelen; + left -= namelen; + + add_entry_to_hash(ls, de); + count++; + } + } + done: + ; + } + + out_status: + error = 0; + dlm_set_recover_status(ls, DLM_RS_DIR); + log_debug(ls, "dlm_recover_directory %d entries", count); + out_free: + kfree(last_name); + out: + dlm_clear_free_entries(ls); + return error; +} + +static int get_entry(struct dlm_ls *ls, int nodeid, char *name, + int namelen, int *r_nodeid) +{ + struct dlm_direntry *de, *tmp; + uint32_t bucket; + + bucket = dir_hash(ls, name, namelen); + + spin_lock(&ls->ls_dirtbl[bucket].lock); + de = search_bucket(ls, name, namelen, bucket); + if (de) { + *r_nodeid = de->master_nodeid; + spin_unlock(&ls->ls_dirtbl[bucket].lock); + if (*r_nodeid == nodeid) + return -EEXIST; + return 0; + } + + spin_unlock(&ls->ls_dirtbl[bucket].lock); + + if (namelen > DLM_RESNAME_MAXLEN) + return -EINVAL; + + de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS); + if (!de) + return -ENOMEM; + + de->master_nodeid = nodeid; + de->length = namelen; + memcpy(de->name, name, namelen); + + spin_lock(&ls->ls_dirtbl[bucket].lock); + tmp = search_bucket(ls, name, namelen, bucket); + if (tmp) { + kfree(de); + de = tmp; + } else { + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); + } + *r_nodeid = de->master_nodeid; + spin_unlock(&ls->ls_dirtbl[bucket].lock); + return 0; +} + +int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, + int *r_nodeid) +{ + return get_entry(ls, nodeid, name, namelen, r_nodeid); +} + +static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) +{ + struct dlm_rsb *r; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (len == r->res_length && !memcmp(name, r->res_name, len)) { + up_read(&ls->ls_root_sem); + return r; + } + } + up_read(&ls->ls_root_sem); + return NULL; +} + +/* Find the rsb where we left off (or start again), then send rsb names + for rsb's we're master of and whose directory node matches the requesting + node. 
inbuf is the rsb name last sent, inlen is the name's length */ + +void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid) +{ + struct list_head *list; + struct dlm_rsb *r; + int offset = 0, dir_nodeid; + __be16 be_namelen; + + down_read(&ls->ls_root_sem); + + if (inlen > 1) { + r = find_rsb_root(ls, inbuf, inlen); + if (!r) { + inbuf[inlen - 1] = '\0'; + log_error(ls, "copy_master_names from %d start %d %s", + nodeid, inlen, inbuf); + goto out; + } + list = r->res_root_list.next; + } else { + list = ls->ls_root_list.next; + } + + for (offset = 0; list != &ls->ls_root_list; list = list->next) { + r = list_entry(list, struct dlm_rsb, res_root_list); + if (r->res_nodeid) + continue; + + dir_nodeid = dlm_dir_nodeid(r); + if (dir_nodeid != nodeid) + continue; + + /* + * The block ends when we can't fit the following in the + * remaining buffer space: + * namelen (uint16_t) + + * name (r->res_length) + + * end-of-block record 0x0000 (uint16_t) + */ + + if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { + /* Write end-of-block record */ + be_namelen = cpu_to_be16(0); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + goto out; + } + + be_namelen = cpu_to_be16(r->res_length); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + memcpy(outbuf + offset, r->res_name, r->res_length); + offset += r->res_length; + } + + /* + * If we've reached the end of the list (and there's room) write a + * terminating record. + */ + + if ((list == &ls->ls_root_list) && + (offset + sizeof(uint16_t) <= outlen)) { + be_namelen = cpu_to_be16(0xFFFF); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); + } + + out: + up_read(&ls->ls_root_sem); +} + diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h new file mode 100644 index 00000000..0b0eb126 --- /dev/null +++ b/fs/dlm/dir.h @@ -0,0 +1,30 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
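dlm_copy_master_names() above and the parsing loop in dlm_recover_directory() agree on a simple wire format: each record is a big-endian 16-bit length followed by that many bytes of resource name, a zero length ends the current buffer (the requester then asks for the next chunk), and a length of 0xFFFF ends the whole listing for that node. A compact sketch of a reader for that stream under those rules; example_walk_names is an illustrative name, not code from the patch:

/* Illustrative reader for the namelen/name stream built by
 * dlm_copy_master_names(): 0x0000 = end of this buffer, 0xFFFF = end of
 * all names for the node. */
static int example_walk_names(char *buf, int left)
{
        __be16 v;
        uint16_t namelen;
        int count = 0;

        while (left >= (int)sizeof(__be16)) {
                memcpy(&v, buf, sizeof(__be16));
                namelen = be16_to_cpu(v);
                buf += sizeof(__be16);
                left -= sizeof(__be16);

                if (namelen == 0xFFFF)          /* listing complete */
                        break;
                if (!namelen)                   /* buffer exhausted, ask again */
                        break;
                if (namelen > left || namelen > DLM_RESNAME_MAXLEN)
                        return -EINVAL;

                /* namelen bytes of resource name follow */
                buf += namelen;
                left -= namelen;
                count++;
        }
        return count;
}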
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + + +int dlm_dir_nodeid(struct dlm_rsb *rsb); +int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); +void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len); +void dlm_dir_clear(struct dlm_ls *ls); +void dlm_clear_free_entries(struct dlm_ls *ls); +int dlm_recover_directory(struct dlm_ls *ls); +int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, + int *r_nodeid); +void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid); + +#endif /* __DIR_DOT_H__ */ + diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h new file mode 100644 index 00000000..0262451e --- /dev/null +++ b/fs/dlm/dlm_internal.h @@ -0,0 +1,616 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __DLM_INTERNAL_DOT_H__ +#define __DLM_INTERNAL_DOT_H__ + +/* + * This is the main header file to be included in each DLM source file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "config.h" + +/* Size of the temp buffer midcomms allocates on the stack. + We try to make this large enough so most messages fit. + FIXME: should sctp make this unnecessary? */ + +#define DLM_INBUF_LEN 148 + +struct dlm_ls; +struct dlm_lkb; +struct dlm_rsb; +struct dlm_member; +struct dlm_lkbtable; +struct dlm_rsbtable; +struct dlm_dirtable; +struct dlm_direntry; +struct dlm_recover; +struct dlm_header; +struct dlm_message; +struct dlm_rcom; +struct dlm_mhandle; + +#define log_print(fmt, args...) \ + printk(KERN_ERR "dlm: "fmt"\n" , ##args) +#define log_error(ls, fmt, args...) \ + printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) + +#define log_debug(ls, fmt, args...) 
\ +do { \ + if (dlm_config.ci_log_debug) \ + printk(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) + +#define DLM_ASSERT(x, do) \ +{ \ + if (!(x)) \ + { \ + printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \ + "DLM: assertion: \"%s\"\n" \ + "DLM: time = %lu\n", \ + __LINE__, __FILE__, #x, jiffies); \ + {do} \ + printk("\n"); \ + BUG(); \ + panic("DLM: Record message above and reboot.\n"); \ + } \ +} + + +struct dlm_direntry { + struct list_head list; + uint32_t master_nodeid; + uint16_t length; + char name[1]; +}; + +struct dlm_dirtable { + struct list_head list; + spinlock_t lock; +}; + +struct dlm_rsbtable { + struct list_head list; + struct list_head toss; + spinlock_t lock; +}; + +struct dlm_lkbtable { + struct list_head list; + rwlock_t lock; + uint16_t counter; +}; + +/* + * Lockspace member (per node in a ls) + */ + +struct dlm_member { + struct list_head list; + int nodeid; + int weight; +}; + +/* + * Save and manage recovery state for a lockspace. + */ + +struct dlm_recover { + struct list_head list; + int *nodeids; /* nodeids of all members */ + int node_count; + int *new; /* nodeids of new members */ + int new_count; + uint64_t seq; +}; + +/* + * Pass input args to second stage locking function. + */ + +struct dlm_args { + uint32_t flags; + void (*astfn) (void *astparam); + void *astparam; + void (*bastfn) (void *astparam, int mode); + int mode; + struct dlm_lksb *lksb; + unsigned long timeout; +}; + + +/* + * Lock block + * + * A lock can be one of three types: + * + * local copy lock is mastered locally + * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set) + * process copy lock is mastered on a remote node + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set) + * master copy master node's copy of a lock owned by remote node + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set) + * + * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or + * dlm_unlock. The dlm does not modify these or use any private flags in + * this field; it only contains DLM_LKF_ flags from dlm.h. These flags + * are sent as-is to the remote master when the lock is remote. + * + * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h. + * Some internal flags are shared between the master and process nodes; + * these shared flags are kept in the lower two bytes. One of these + * flags set on the master copy will be propagated to the process copy + * and v.v. Other internal flags are private to the master or process + * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes. + * + * lkb_sbflags: status block flags. These flags are copied directly into + * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion + * ast. All defined in dlm.h with DLM_SBF_ prefix. + * + * lkb_status: the lock status indicates which rsb queue the lock is + * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT + * + * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a + * reply is needed. Only set when the lkb is on the lockspace waiters + * list awaiting a reply from a remote node. + * + * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb + * is a master copy, nodeid specifies the remote lock holder, when the + * lkb is a process copy, the nodeid specifies the lock master. 
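Put more compactly, the three lkb types above are distinguished by just two fields: a local copy has lkb_nodeid == 0 with the MSTCPY flag clear, a process copy has a non-zero lkb_nodeid with MSTCPY clear, and a master copy has a non-zero lkb_nodeid with MSTCPY set; is_process_copy() and is_master_copy() in lock.c below encode exactly these tests using DLM_IFL_MSTCPY.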
+ */ + +/* lkb_status */ + +#define DLM_LKSTS_WAITING 1 +#define DLM_LKSTS_GRANTED 2 +#define DLM_LKSTS_CONVERT 3 + +/* lkb_flags */ + +#define DLM_IFL_MSTCPY 0x00010000 +#define DLM_IFL_RESEND 0x00020000 +#define DLM_IFL_DEAD 0x00040000 +#define DLM_IFL_OVERLAP_UNLOCK 0x00080000 +#define DLM_IFL_OVERLAP_CANCEL 0x00100000 +#define DLM_IFL_ENDOFLIFE 0x00200000 +#define DLM_IFL_WATCH_TIMEWARN 0x00400000 +#define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#define DLM_IFL_DEADLOCK_CANCEL 0x01000000 +#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +#define DLM_IFL_USER 0x00000001 +#define DLM_IFL_ORPHAN 0x00000002 + +#define DLM_CALLBACKS_SIZE 6 + +#define DLM_CB_CAST 0x00000001 +#define DLM_CB_BAST 0x00000002 +#define DLM_CB_SKIP 0x00000004 + +struct dlm_callback { + uint64_t seq; + uint32_t flags; /* DLM_CBF_ */ + int sb_status; /* copy to lksb status */ + uint8_t sb_flags; /* copy to lksb flags */ + int8_t mode; /* rq mode of bast, gr mode of cast */ +}; + +struct dlm_lkb { + struct dlm_rsb *lkb_resource; /* the rsb */ + struct kref lkb_ref; + int lkb_nodeid; /* copied from rsb */ + int lkb_ownpid; /* pid of lock owner */ + uint32_t lkb_id; /* our lock ID */ + uint32_t lkb_remid; /* lock ID on remote partner */ + uint32_t lkb_exflags; /* external flags from caller */ + uint32_t lkb_sbflags; /* lksb flags */ + uint32_t lkb_flags; /* internal flags */ + uint32_t lkb_lvbseq; /* lvb sequence number */ + + int8_t lkb_status; /* granted, waiting, convert */ + int8_t lkb_rqmode; /* requested lock mode */ + int8_t lkb_grmode; /* granted lock mode */ + int8_t lkb_highbast; /* highest mode bast sent for */ + + int8_t lkb_wait_type; /* type of reply waiting for */ + int8_t lkb_wait_count; + int lkb_wait_nodeid; /* for debugging */ + + struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ + struct list_head lkb_statequeue; /* rsb g/c/w list */ + struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ + struct list_head lkb_wait_reply; /* waiting for remote reply */ + struct list_head lkb_astqueue; /* need ast to be sent */ + struct list_head lkb_ownqueue; /* list of locks for a process */ + struct list_head lkb_time_list; + ktime_t lkb_timestamp; + ktime_t lkb_wait_time; + unsigned long lkb_timeout_cs; + + struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; + struct dlm_callback lkb_last_cast; + struct dlm_callback lkb_last_bast; + ktime_t lkb_last_cast_time; /* for debugging */ + ktime_t lkb_last_bast_time; /* for debugging */ + + char *lkb_lvbptr; + struct dlm_lksb *lkb_lksb; /* caller's status block */ + void (*lkb_astfn) (void *astparam); + void (*lkb_bastfn) (void *astparam, int mode); + union { + void *lkb_astparam; /* caller's ast arg */ + struct dlm_user_args *lkb_ua; + }; +}; + + +struct dlm_rsb { + struct dlm_ls *res_ls; /* the lockspace */ + struct kref res_ref; + struct mutex res_mutex; + unsigned long res_flags; + int res_length; /* length of rsb name */ + int res_nodeid; + uint32_t res_lvbseq; + uint32_t res_hash; + uint32_t res_bucket; /* rsbtbl */ + unsigned long res_toss_time; + uint32_t res_first_lkid; + struct list_head res_lookup; /* lkbs waiting on first */ + struct list_head res_hashchain; /* rsbtbl */ + struct list_head res_grantqueue; + struct list_head res_convertqueue; + struct list_head res_waitqueue; + + struct list_head res_root_list; /* used for recovery */ + struct list_head res_recover_list; /* used for recovery */ + int res_recover_locks_count; + + char *res_lvbptr; + char res_name[1]; +}; + +/* find_rsb() flags */ + +#define R_MASTER 1 /* only 
return rsb if it's a master */ +#define R_CREATE 2 /* create/add rsb if not found */ + +/* rsb_flags */ + +enum rsb_flags { + RSB_MASTER_UNCERTAIN, + RSB_VALNOTVALID, + RSB_VALNOTVALID_PREV, + RSB_NEW_MASTER, + RSB_NEW_MASTER2, + RSB_RECOVER_CONVERT, + RSB_LOCKS_PURGED, +}; + +static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + __set_bit(flag, &r->res_flags); +} + +static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + __clear_bit(flag, &r->res_flags); +} + +static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) +{ + return test_bit(flag, &r->res_flags); +} + + +/* dlm_header is first element of all structs sent between nodes */ + +#define DLM_HEADER_MAJOR 0x00030000 +#define DLM_HEADER_MINOR 0x00000000 + +#define DLM_MSG 1 +#define DLM_RCOM 2 + +struct dlm_header { + uint32_t h_version; + uint32_t h_lockspace; + uint32_t h_nodeid; /* nodeid of sender */ + uint16_t h_length; + uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */ + uint8_t h_pad; +}; + + +#define DLM_MSG_REQUEST 1 +#define DLM_MSG_CONVERT 2 +#define DLM_MSG_UNLOCK 3 +#define DLM_MSG_CANCEL 4 +#define DLM_MSG_REQUEST_REPLY 5 +#define DLM_MSG_CONVERT_REPLY 6 +#define DLM_MSG_UNLOCK_REPLY 7 +#define DLM_MSG_CANCEL_REPLY 8 +#define DLM_MSG_GRANT 9 +#define DLM_MSG_BAST 10 +#define DLM_MSG_LOOKUP 11 +#define DLM_MSG_REMOVE 12 +#define DLM_MSG_LOOKUP_REPLY 13 +#define DLM_MSG_PURGE 14 + +struct dlm_message { + struct dlm_header m_header; + uint32_t m_type; /* DLM_MSG_ */ + uint32_t m_nodeid; + uint32_t m_pid; + uint32_t m_lkid; /* lkid on sender */ + uint32_t m_remid; /* lkid on receiver */ + uint32_t m_parent_lkid; + uint32_t m_parent_remid; + uint32_t m_exflags; + uint32_t m_sbflags; + uint32_t m_flags; + uint32_t m_lvbseq; + uint32_t m_hash; + int m_status; + int m_grmode; + int m_rqmode; + int m_bastmode; + int m_asts; + int m_result; /* 0 or -EXXX */ + char m_extra[0]; /* name or lvb */ +}; + + +#define DLM_RS_NODES 0x00000001 +#define DLM_RS_NODES_ALL 0x00000002 +#define DLM_RS_DIR 0x00000004 +#define DLM_RS_DIR_ALL 0x00000008 +#define DLM_RS_LOCKS 0x00000010 +#define DLM_RS_LOCKS_ALL 0x00000020 +#define DLM_RS_DONE 0x00000040 +#define DLM_RS_DONE_ALL 0x00000080 + +#define DLM_RCOM_STATUS 1 +#define DLM_RCOM_NAMES 2 +#define DLM_RCOM_LOOKUP 3 +#define DLM_RCOM_LOCK 4 +#define DLM_RCOM_STATUS_REPLY 5 +#define DLM_RCOM_NAMES_REPLY 6 +#define DLM_RCOM_LOOKUP_REPLY 7 +#define DLM_RCOM_LOCK_REPLY 8 + +struct dlm_rcom { + struct dlm_header rc_header; + uint32_t rc_type; /* DLM_RCOM_ */ + int rc_result; /* multi-purpose */ + uint64_t rc_id; /* match reply with request */ + uint64_t rc_seq; /* sender's ls_recover_seq */ + uint64_t rc_seq_reply; /* remote ls_recover_seq */ + char rc_buf[0]; +}; + +union dlm_packet { + struct dlm_header header; /* common to other two */ + struct dlm_message message; + struct dlm_rcom rcom; +}; + +struct rcom_config { + __le32 rf_lvblen; + __le32 rf_lsflags; + __le64 rf_unused; +}; + +struct rcom_lock { + __le32 rl_ownpid; + __le32 rl_lkid; + __le32 rl_remid; + __le32 rl_parent_lkid; + __le32 rl_parent_remid; + __le32 rl_exflags; + __le32 rl_flags; + __le32 rl_lvbseq; + __le32 rl_result; + int8_t rl_rqmode; + int8_t rl_grmode; + int8_t rl_status; + int8_t rl_asts; + __le16 rl_wait_type; + __le16 rl_namelen; + char rl_name[DLM_RESNAME_MAXLEN]; + char rl_lvb[0]; +}; + +struct dlm_ls { + struct list_head ls_list; /* list of lockspaces */ + dlm_lockspace_t *ls_local_handle; + uint32_t ls_global_id; /* global unique lockspace ID */ + uint32_t ls_exflags; + int 
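Because every structure exchanged between nodes begins with struct dlm_header, a receiver can inspect h_cmd and then view the same buffer through union dlm_packet as either a locking message or a recovery message. A minimal dispatch sketch, assuming the buffer has already been received and length-checked elsewhere; example_dispatch is a hypothetical name, and a real receiver would also validate h_version and h_length before trusting the payload:

/* Illustrative h_cmd dispatch over union dlm_packet; not the actual
 * receive path. */
static void example_dispatch(union dlm_packet *p)
{
        switch (p->header.h_cmd) {
        case DLM_MSG:
                /* p->message is the valid view: a normal locking message */
                printk(KERN_DEBUG "dlm: msg type %u from nodeid %u\n",
                       p->message.m_type, p->header.h_nodeid);
                break;
        case DLM_RCOM:
                /* p->rcom is the valid view: a recovery message */
                printk(KERN_DEBUG "dlm: rcom type %u from nodeid %u\n",
                       p->rcom.rc_type, p->header.h_nodeid);
                break;
        }
}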
ls_lvblen; + int ls_count; /* refcount of processes in + the dlm using this ls */ + int ls_create_count; /* create/release refcount */ + unsigned long ls_flags; /* LSFL_ */ + unsigned long ls_scan_time; + struct kobject ls_kobj; + + struct dlm_rsbtable *ls_rsbtbl; + uint32_t ls_rsbtbl_size; + + struct dlm_lkbtable *ls_lkbtbl; + uint32_t ls_lkbtbl_size; + + struct dlm_dirtable *ls_dirtbl; + uint32_t ls_dirtbl_size; + + struct mutex ls_waiters_mutex; + struct list_head ls_waiters; /* lkbs needing a reply */ + + struct mutex ls_orphans_mutex; + struct list_head ls_orphans; + + struct mutex ls_timeout_mutex; + struct list_head ls_timeout; + + struct list_head ls_nodes; /* current nodes in ls */ + struct list_head ls_nodes_gone; /* dead node list, recovery */ + int ls_num_nodes; /* number of nodes in ls */ + int ls_low_nodeid; + int ls_total_weight; + int *ls_node_array; + + struct dlm_rsb ls_stub_rsb; /* for returning errors */ + struct dlm_lkb ls_stub_lkb; /* for returning errors */ + struct dlm_message ls_stub_ms; /* for faking a reply */ + + struct dentry *ls_debug_rsb_dentry; /* debugfs */ + struct dentry *ls_debug_waiters_dentry; /* debugfs */ + struct dentry *ls_debug_locks_dentry; /* debugfs */ + struct dentry *ls_debug_all_dentry; /* debugfs */ + + wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ + int ls_uevent_result; + struct completion ls_members_done; + int ls_members_result; + + struct miscdevice ls_device; + + /* recovery related */ + + struct timer_list ls_timer; + struct task_struct *ls_recoverd_task; + struct mutex ls_recoverd_active; + spinlock_t ls_recover_lock; + unsigned long ls_recover_begin; /* jiffies timestamp */ + uint32_t ls_recover_status; /* DLM_RS_ */ + uint64_t ls_recover_seq; + struct dlm_recover *ls_recover_args; + struct rw_semaphore ls_in_recovery; /* block local requests */ + struct rw_semaphore ls_recv_active; /* block dlm_recv */ + struct list_head ls_requestqueue;/* queue remote requests */ + struct mutex ls_requestqueue_mutex; + struct dlm_rcom *ls_recover_buf; + int ls_recover_nodeid; /* for debugging */ + uint64_t ls_rcom_seq; + spinlock_t ls_rcom_spin; + struct list_head ls_recover_list; + spinlock_t ls_recover_list_lock; + int ls_recover_list_count; + wait_queue_head_t ls_wait_general; + struct mutex ls_clear_proc_locks; + + struct list_head ls_root_list; /* root resources */ + struct rw_semaphore ls_root_sem; /* protect root_list */ + + int ls_namelen; + char ls_name[1]; +}; + +#define LSFL_WORK 0 +#define LSFL_RUNNING 1 +#define LSFL_RECOVERY_STOP 2 +#define LSFL_RCOM_READY 3 +#define LSFL_RCOM_WAIT 4 +#define LSFL_UEVENT_WAIT 5 +#define LSFL_TIMEWARN 6 + +/* much of this is just saving user space pointers associated with the + lock that we pass back to the user lib with an ast */ + +struct dlm_user_args { + struct dlm_user_proc *proc; /* each process that opens the lockspace + device has private data + (dlm_user_proc) on the struct file, + the process's locks point back to it*/ + struct dlm_lksb lksb; + struct dlm_lksb __user *user_lksb; + void __user *castparam; + void __user *castaddr; + void __user *bastparam; + void __user *bastaddr; + uint64_t xid; +}; + +#define DLM_PROC_FLAGS_CLOSING 1 +#define DLM_PROC_FLAGS_COMPAT 2 + +/* locks list is kept so we can remove all a process's locks when it + exits (or orphan those that are persistent) */ + +struct dlm_user_proc { + dlm_lockspace_t *lockspace; + unsigned long flags; /* DLM_PROC_FLAGS */ + struct list_head asts; + spinlock_t asts_spin; + struct list_head locks; + spinlock_t 
locks_spin; + struct list_head unlocking; + wait_queue_head_t wait; +}; + +static inline int dlm_locking_stopped(struct dlm_ls *ls) +{ + return !test_bit(LSFL_RUNNING, &ls->ls_flags); +} + +static inline int dlm_recovery_stopped(struct dlm_ls *ls) +{ + return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); +} + +static inline int dlm_no_directory(struct dlm_ls *ls) +{ + return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; +} + +int dlm_netlink_init(void); +void dlm_netlink_exit(void); +void dlm_timeout_warn(struct dlm_lkb *lkb); +int dlm_plock_init(void); +void dlm_plock_exit(void); + +#ifdef CONFIG_DLM_DEBUG +int dlm_register_debugfs(void); +void dlm_unregister_debugfs(void); +int dlm_create_debug_file(struct dlm_ls *ls); +void dlm_delete_debug_file(struct dlm_ls *ls); +#else +static inline int dlm_register_debugfs(void) { return 0; } +static inline void dlm_unregister_debugfs(void) { } +static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } +static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } +#endif + +#endif /* __DLM_INTERNAL_DOT_H__ */ + diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c new file mode 100644 index 00000000..f71d0b5a --- /dev/null +++ b/fs/dlm/lock.c @@ -0,0 +1,5115 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* Central locking logic has four stages: + + dlm_lock() + dlm_unlock() + + request_lock(ls, lkb) + convert_lock(ls, lkb) + unlock_lock(ls, lkb) + cancel_lock(ls, lkb) + + _request_lock(r, lkb) + _convert_lock(r, lkb) + _unlock_lock(r, lkb) + _cancel_lock(r, lkb) + + do_request(r, lkb) + do_convert(r, lkb) + do_unlock(r, lkb) + do_cancel(r, lkb) + + Stage 1 (lock, unlock) is mainly about checking input args and + splitting into one of the four main operations: + + dlm_lock = request_lock + dlm_lock+CONVERT = convert_lock + dlm_unlock = unlock_lock + dlm_unlock+CANCEL = cancel_lock + + Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is + provided to the next stage. + + Stage 3, _xxxx_lock(), determines if the operation is local or remote. + When remote, it calls send_xxxx(), when local it calls do_xxxx(). + + Stage 4, do_xxxx(), is the guts of the operation. It manipulates the + given rsb and lkb and queues callbacks. + + For remote operations, send_xxxx() results in the corresponding do_xxxx() + function being executed on the remote node. 
The connecting send/receive + calls on local (L) and remote (R) nodes: + + L: send_xxxx() -> R: receive_xxxx() + R: do_xxxx() + L: receive_xxxx_reply() <- R: send_xxxx_reply() +*/ +#include +#include +#include "dlm_internal.h" +#include +#include "memory.h" +#include "lowcomms.h" +#include "requestqueue.h" +#include "util.h" +#include "dir.h" +#include "member.h" +#include "lockspace.h" +#include "ast.h" +#include "lock.h" +#include "rcom.h" +#include "recover.h" +#include "lvb_table.h" +#include "user.h" +#include "config.h" + +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode); +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int send_remove(struct dlm_rsb *r); +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms); +static int receive_extralen(struct dlm_message *ms); +static void do_purge(struct dlm_ls *ls, int nodeid, int pid); +static void del_timeout(struct dlm_lkb *lkb); + +/* + * Lock compatibilty matrix - thanks Steve + * UN = Unlocked state. Not really a state, used as a flag + * PD = Padding. Used to make the matrix a nice power of two in size + * Other states are the same as the VMS DLM. + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same) + */ + +static const int __dlm_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */ + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */ + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */ + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */ + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +/* + * This defines the direction of transfer of LVB data. + * Granted mode is the row; requested mode is the column. + * Usage: matrix[grmode+1][rqmode+1] + * 1 = LVB is returned to the caller + * 0 = LVB is written to the resource + * -1 = nothing happens to the LVB + */ + +const int dlm_lvb_operations[8][8] = { + /* UN NL CR CW PR PW EX PD*/ + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */ + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */ + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */ + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */ + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */ + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */ + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */ + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */ +}; + +#define modes_compat(gr, rq) \ + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1] + +int dlm_modes_compat(int mode1, int mode2) +{ + return __dlm_compat_matrix[mode1 + 1][mode2 + 1]; +} + +/* + * Compatibility matrix for conversions with QUECVT set. + * Granted mode is the row; requested mode is the column. 
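Following the four-stage description above, the full round trip for one remote operation looks like this trace of the naming convention with xxxx = request; the layout is editorial, the function names follow the stages listed above:

  node L (caller)                           node R (master)

  dlm_lock()
    request_lock()
      _request_lock()        rsb is mastered remotely, so:
        send_request()    ------------------->  receive_request()
                                                  do_request()
  receive_request_reply() <-----------------  send_request_reply()

The result then reaches the caller through its completion ast (queue_cast() below).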
+ * Usage: matrix[grmode+1][rqmode+1] + */ + +static const int __quecvt_compat_matrix[8][8] = { + /* UN NL CR CW PR PW EX PD */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */ + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */ + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */ + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */ + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */ + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */ + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */ + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ +}; + +void dlm_print_lkb(struct dlm_lkb *lkb) +{ + printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" + " status %d rqmode %d grmode %d wait_type %d\n", + lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, + lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, + lkb->lkb_grmode, lkb->lkb_wait_type); +} + +static void dlm_print_rsb(struct dlm_rsb *r) +{ + printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n", + r->res_nodeid, r->res_flags, r->res_first_lkid, + r->res_recover_locks_count, r->res_name); +} + +void dlm_dump_rsb(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb; + + dlm_print_rsb(r); + + printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n", + list_empty(&r->res_root_list), list_empty(&r->res_recover_list)); + printk(KERN_ERR "rsb lookup list\n"); + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb grant queue:\n"); + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb convert queue:\n"); + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) + dlm_print_lkb(lkb); + printk(KERN_ERR "rsb wait queue:\n"); + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) + dlm_print_lkb(lkb); +} + +/* Threads cannot use the lockspace while it's being recovered */ + +static inline void dlm_lock_recovery(struct dlm_ls *ls) +{ + down_read(&ls->ls_in_recovery); +} + +void dlm_unlock_recovery(struct dlm_ls *ls) +{ + up_read(&ls->ls_in_recovery); +} + +int dlm_lock_recovery_try(struct dlm_ls *ls) +{ + return down_read_trylock(&ls->ls_in_recovery); +} + +static inline int can_be_queued(struct dlm_lkb *lkb) +{ + return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE); +} + +static inline int force_blocking_asts(struct dlm_lkb *lkb) +{ + return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST); +} + +static inline int is_demoted(struct dlm_lkb *lkb) +{ + return (lkb->lkb_sbflags & DLM_SBF_DEMOTED); +} + +static inline int is_altmode(struct dlm_lkb *lkb) +{ + return (lkb->lkb_sbflags & DLM_SBF_ALTMODE); +} + +static inline int is_granted(struct dlm_lkb *lkb) +{ + return (lkb->lkb_status == DLM_LKSTS_GRANTED); +} + +static inline int is_remote(struct dlm_rsb *r) +{ + DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r);); + return !!r->res_nodeid; +} + +static inline int is_process_copy(struct dlm_lkb *lkb) +{ + return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY)); +} + +static inline int is_master_copy(struct dlm_lkb *lkb) +{ + if (lkb->lkb_flags & DLM_IFL_MSTCPY) + DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb);); + return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 
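A few concrete lookups in __dlm_compat_matrix defined above may help when reading the grant logic later; the +1 offsets exist because the unlocked/invalid mode sits at -1 and maps onto the UN row and column. Values read directly off the table:

  dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PR) == 1   /* concurrent read locks coexist */
  dlm_modes_compat(DLM_LOCK_PW, DLM_LOCK_PR) == 0   /* PW row, PR column */
  dlm_modes_compat(DLM_LOCK_CR, DLM_LOCK_EX) == 0   /* CR row, EX column */
  dlm_modes_compat(DLM_LOCK_NL, DLM_LOCK_EX) == 1   /* NL is compatible with every granted mode */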
1 : 0; +} + +static inline int middle_conversion(struct dlm_lkb *lkb) +{ + if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) || + (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW)) + return 1; + return 0; +} + +static inline int down_conversion(struct dlm_lkb *lkb) +{ + return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode); +} + +static inline int is_overlap_unlock(struct dlm_lkb *lkb) +{ + return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK; +} + +static inline int is_overlap_cancel(struct dlm_lkb *lkb) +{ + return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL; +} + +static inline int is_overlap(struct dlm_lkb *lkb) +{ + return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK | + DLM_IFL_OVERLAP_CANCEL)); +} + +static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + if (is_master_copy(lkb)) + return; + + del_timeout(lkb); + + DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); + + /* if the operation was a cancel, then return -DLM_ECANCEL, if a + timeout caused the cancel then return -ETIMEDOUT */ + if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { + lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; + rv = -ETIMEDOUT; + } + + if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { + lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; + rv = -EDEADLK; + } + + dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags); +} + +static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + queue_cast(r, lkb, + is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL); +} + +static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) +{ + if (is_master_copy(lkb)) { + send_bast(r, lkb, rqmode); + } else { + dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0); + } +} + +/* + * Basic operations on rsb's and lkb's + */ + +static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) +{ + struct dlm_rsb *r; + + r = dlm_allocate_rsb(ls, len); + if (!r) + return NULL; + + r->res_ls = ls; + r->res_length = len; + memcpy(r->res_name, name, len); + mutex_init(&r->res_mutex); + + INIT_LIST_HEAD(&r->res_lookup); + INIT_LIST_HEAD(&r->res_grantqueue); + INIT_LIST_HEAD(&r->res_convertqueue); + INIT_LIST_HEAD(&r->res_waitqueue); + INIT_LIST_HEAD(&r->res_root_list); + INIT_LIST_HEAD(&r->res_recover_list); + + return r; +} + +static int search_rsb_list(struct list_head *head, char *name, int len, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r; + int error = 0; + + list_for_each_entry(r, head, res_hashchain) { + if (len == r->res_length && !memcmp(name, r->res_name, len)) + goto found; + } + *r_ret = NULL; + return -EBADR; + + found: + if (r->res_nodeid && (flags & R_MASTER)) + error = -ENOTBLK; + *r_ret = r; + return error; +} + +static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r; + int error; + + error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r); + if (!error) { + kref_get(&r->res_ref); + goto out; + } + error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); + if (error) + goto out; + + list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list); + + if (dlm_no_directory(ls)) + goto out; + + if (r->res_nodeid == -1) { + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = 0; + } else if (r->res_nodeid > 0) { + rsb_set_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = 0; + } else { + DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r);); + 
DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),); + } + out: + *r_ret = r; + return error; +} + +static int search_rsb(struct dlm_ls *ls, char *name, int len, int b, + unsigned int flags, struct dlm_rsb **r_ret) +{ + int error; + spin_lock(&ls->ls_rsbtbl[b].lock); + error = _search_rsb(ls, name, len, b, flags, r_ret); + spin_unlock(&ls->ls_rsbtbl[b].lock); + return error; +} + +/* + * Find rsb in rsbtbl and potentially create/add one + * + * Delaying the release of rsb's has a similar benefit to applications keeping + * NL locks on an rsb, but without the guarantee that the cached master value + * will still be valid when the rsb is reused. Apps aren't always smart enough + * to keep NL locks on an rsb that they may lock again shortly; this can lead + * to excessive master lookups and removals if we don't delay the release. + * + * Searching for an rsb means looking through both the normal list and toss + * list. When found on the toss list the rsb is moved to the normal list with + * ref count of 1; when found on normal list the ref count is incremented. + */ + +static int find_rsb(struct dlm_ls *ls, char *name, int namelen, + unsigned int flags, struct dlm_rsb **r_ret) +{ + struct dlm_rsb *r = NULL, *tmp; + uint32_t hash, bucket; + int error = -EINVAL; + + if (namelen > DLM_RESNAME_MAXLEN) + goto out; + + if (dlm_no_directory(ls)) + flags |= R_CREATE; + + error = 0; + hash = jhash(name, namelen, 0); + bucket = hash & (ls->ls_rsbtbl_size - 1); + + error = search_rsb(ls, name, namelen, bucket, flags, &r); + if (!error) + goto out; + + if (error == -EBADR && !(flags & R_CREATE)) + goto out; + + /* the rsb was found but wasn't a master copy */ + if (error == -ENOTBLK) + goto out; + + error = -ENOMEM; + r = create_rsb(ls, name, namelen); + if (!r) + goto out; + + r->res_hash = hash; + r->res_bucket = bucket; + r->res_nodeid = -1; + kref_init(&r->res_ref); + + /* With no directory, the master can be set immediately */ + if (dlm_no_directory(ls)) { + int nodeid = dlm_dir_nodeid(r); + if (nodeid == dlm_our_nodeid()) + nodeid = 0; + r->res_nodeid = nodeid; + } + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); + if (!error) { + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + dlm_free_rsb(r); + r = tmp; + goto out; + } + list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + error = 0; + out: + *r_ret = r; + return error; +} + +/* This is only called to add a reference when the code already holds + a valid reference to the rsb, so there's no need for locking. */ + +static inline void hold_rsb(struct dlm_rsb *r) +{ + kref_get(&r->res_ref); +} + +void dlm_hold_rsb(struct dlm_rsb *r) +{ + hold_rsb(r); +} + +static void toss_rsb(struct kref *kref) +{ + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); + struct dlm_ls *ls = r->res_ls; + + DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); + kref_init(&r->res_ref); + list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); + r->res_toss_time = jiffies; + if (r->res_lvbptr) { + dlm_free_lvb(r->res_lvbptr); + r->res_lvbptr = NULL; + } +} + +/* When all references to the rsb are gone it's transferred to + the tossed list for later disposal. 
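find_rsb() above derives the hash-table bucket by masking the Jenkins hash of the resource name, which relies on ls_rsbtbl_size being a power of two. A minimal userspace sketch of that bucket selection, with a stand-in hash value instead of the kernel's jhash():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t hash = 0x9e3779b9;        /* stand-in for jhash(name, namelen, 0) */
	uint32_t rsbtbl_size = 1024;       /* assumed power of two, as the mask requires */
	uint32_t bucket = hash & (rsbtbl_size - 1);

	printf("bucket %u of %u\n", bucket, rsbtbl_size);
	return 0;
}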
*/ + +static void put_rsb(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + uint32_t bucket = r->res_bucket; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + kref_put(&r->res_ref, toss_rsb); + spin_unlock(&ls->ls_rsbtbl[bucket].lock); +} + +void dlm_put_rsb(struct dlm_rsb *r) +{ + put_rsb(r); +} + +/* See comment for unhold_lkb */ + +static void unhold_rsb(struct dlm_rsb *r) +{ + int rv; + rv = kref_put(&r->res_ref, toss_rsb); + DLM_ASSERT(!rv, dlm_dump_rsb(r);); +} + +static void kill_rsb(struct kref *kref) +{ + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref); + + /* All work is done after the return from kref_put() so we + can release the write_lock before the remove and free. */ + + DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r);); + DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r);); +} + +/* Attaching/detaching lkb's from rsb's is for rsb reference counting. + The rsb must exist as long as any lkb's for it do. */ + +static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + hold_rsb(r); + lkb->lkb_resource = r; +} + +static void detach_lkb(struct dlm_lkb *lkb) +{ + if (lkb->lkb_resource) { + put_rsb(lkb->lkb_resource); + lkb->lkb_resource = NULL; + } +} + +static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +{ + struct dlm_lkb *lkb, *tmp; + uint32_t lkid = 0; + uint16_t bucket; + + lkb = dlm_allocate_lkb(ls); + if (!lkb) + return -ENOMEM; + + lkb->lkb_nodeid = -1; + lkb->lkb_grmode = DLM_LOCK_IV; + kref_init(&lkb->lkb_ref); + INIT_LIST_HEAD(&lkb->lkb_ownqueue); + INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); + INIT_LIST_HEAD(&lkb->lkb_time_list); + INIT_LIST_HEAD(&lkb->lkb_astqueue); + + get_random_bytes(&bucket, sizeof(bucket)); + bucket &= (ls->ls_lkbtbl_size - 1); + + write_lock(&ls->ls_lkbtbl[bucket].lock); + + /* counter can roll over so we must verify lkid is not in use */ + + while (lkid == 0) { + lkid = (bucket << 16) | ls->ls_lkbtbl[bucket].counter++; + + list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list, + lkb_idtbl_list) { + if (tmp->lkb_id != lkid) + continue; + lkid = 0; + break; + } + } + + lkb->lkb_id = lkid; + list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list); + write_unlock(&ls->ls_lkbtbl[bucket].lock); + + *lkb_ret = lkb; + return 0; +} + +static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid) +{ + struct dlm_lkb *lkb; + uint16_t bucket = (lkid >> 16); + + list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) { + if (lkb->lkb_id == lkid) + return lkb; + } + return NULL; +} + +static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) +{ + struct dlm_lkb *lkb; + uint16_t bucket = (lkid >> 16); + + if (bucket >= ls->ls_lkbtbl_size) + return -EBADSLT; + + read_lock(&ls->ls_lkbtbl[bucket].lock); + lkb = __find_lkb(ls, lkid); + if (lkb) + kref_get(&lkb->lkb_ref); + read_unlock(&ls->ls_lkbtbl[bucket].lock); + + *lkb_ret = lkb; + return lkb ? 
0 : -ENOENT; +} + +static void kill_lkb(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + /* All work is done after the return from kref_put() so we + can release the write_lock before the detach_lkb */ + + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); +} + +/* __put_lkb() is used when an lkb may not have an rsb attached to + it so we need to provide the lockspace explicitly */ + +static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + uint16_t bucket = (lkb->lkb_id >> 16); + + write_lock(&ls->ls_lkbtbl[bucket].lock); + if (kref_put(&lkb->lkb_ref, kill_lkb)) { + list_del(&lkb->lkb_idtbl_list); + write_unlock(&ls->ls_lkbtbl[bucket].lock); + + detach_lkb(lkb); + + /* for local/process lkbs, lvbptr points to caller's lksb */ + if (lkb->lkb_lvbptr && is_master_copy(lkb)) + dlm_free_lvb(lkb->lkb_lvbptr); + dlm_free_lkb(lkb); + return 1; + } else { + write_unlock(&ls->ls_lkbtbl[bucket].lock); + return 0; + } +} + +int dlm_put_lkb(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls; + + DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb);); + DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb);); + + ls = lkb->lkb_resource->res_ls; + return __put_lkb(ls, lkb); +} + +/* This is only called to add a reference when the code already holds + a valid reference to the lkb, so there's no need for locking. */ + +static inline void hold_lkb(struct dlm_lkb *lkb) +{ + kref_get(&lkb->lkb_ref); +} + +/* This is called when we need to remove a reference and are certain + it's not the last ref. e.g. del_lkb is always called between a + find_lkb/put_lkb and is always the inverse of a previous add_lkb. + put_lkb would work fine, but would involve unnecessary locking */ + +static inline void unhold_lkb(struct dlm_lkb *lkb) +{ + int rv; + rv = kref_put(&lkb->lkb_ref, kill_lkb); + DLM_ASSERT(!rv, dlm_print_lkb(lkb);); +} + +static void lkb_add_ordered(struct list_head *new, struct list_head *head, + int mode) +{ + struct dlm_lkb *lkb = NULL; + + list_for_each_entry(lkb, head, lkb_statequeue) + if (lkb->lkb_rqmode < mode) + break; + + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); +} + +/* add/remove lkb to rsb's grant/convert/wait queue */ + +static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status) +{ + kref_get(&lkb->lkb_ref); + + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); + + lkb->lkb_timestamp = ktime_get(); + + lkb->lkb_status = status; + + switch (status) { + case DLM_LKSTS_WAITING: + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_waitqueue); + else + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue); + break; + case DLM_LKSTS_GRANTED: + /* convention says granted locks kept in order of grmode */ + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue, + lkb->lkb_grmode); + break; + case DLM_LKSTS_CONVERT: + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) + list_add(&lkb->lkb_statequeue, &r->res_convertqueue); + else + list_add_tail(&lkb->lkb_statequeue, + &r->res_convertqueue); + break; + default: + DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status);); + } +} + +static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + lkb->lkb_status = 0; + list_del(&lkb->lkb_statequeue); + unhold_lkb(lkb); +} + +static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts) +{ + hold_lkb(lkb); + del_lkb(r, lkb); + add_lkb(r, lkb, sts); + unhold_lkb(lkb); +} + +static int msg_reply_type(int mstype) +{ + switch (mstype) { + case DLM_MSG_REQUEST: + return DLM_MSG_REQUEST_REPLY; 
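The lock id handed out by create_lkb() above packs the lkbtbl bucket into the high 16 bits and the per-bucket counter into the low 16 bits, which is why find_lkb() and __put_lkb() recover the bucket with a plain right shift. A small standalone sketch of that layout; the helper names are illustrative, not part of the DLM API:

#include <stdint.h>
#include <stdio.h>

/* mirrors lkid = (bucket << 16) | counter from create_lkb() */
static uint32_t lkid_make(uint16_t bucket, uint16_t counter)
{
	return ((uint32_t)bucket << 16) | counter;
}

/* mirrors bucket = (lkid >> 16) from find_lkb()/__put_lkb() */
static uint16_t lkid_bucket(uint32_t lkid)
{
	return (uint16_t)(lkid >> 16);
}

int main(void)
{
	uint32_t lkid = lkid_make(0x002a, 0x0001);

	/* prints: lkid 002a0001 bucket 42 */
	printf("lkid %08x bucket %u\n", lkid, (unsigned)lkid_bucket(lkid));
	return 0;
}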
+	case DLM_MSG_CONVERT:
+		return DLM_MSG_CONVERT_REPLY;
+	case DLM_MSG_UNLOCK:
+		return DLM_MSG_UNLOCK_REPLY;
+	case DLM_MSG_CANCEL:
+		return DLM_MSG_CANCEL_REPLY;
+	case DLM_MSG_LOOKUP:
+		return DLM_MSG_LOOKUP_REPLY;
+	}
+	return -1;
+}
+
+static int nodeid_warned(int nodeid, int num_nodes, int *warned)
+{
+	int i;
+
+	for (i = 0; i < num_nodes; i++) {
+		if (!warned[i]) {
+			warned[i] = nodeid;
+			return 0;
+		}
+		if (warned[i] == nodeid)
+			return 1;
+	}
+	return 0;
+}
+
+void dlm_scan_waiters(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+	ktime_t zero = ktime_set(0, 0);
+	s64 us;
+	s64 debug_maxus = 0;
+	u32 debug_scanned = 0;
+	u32 debug_expired = 0;
+	int num_nodes = 0;
+	int *warned = NULL;
+
+	if (!dlm_config.ci_waitwarn_us)
+		return;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		if (ktime_equal(lkb->lkb_wait_time, zero))
+			continue;
+
+		debug_scanned++;
+
+		us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
+
+		if (us < dlm_config.ci_waitwarn_us)
+			continue;
+
+		lkb->lkb_wait_time = zero;
+
+		debug_expired++;
+		if (us > debug_maxus)
+			debug_maxus = us;
+
+		if (!num_nodes) {
+			num_nodes = ls->ls_num_nodes;
+			/* kmalloc takes the size first, then the gfp flags */
+			warned = kmalloc(num_nodes * sizeof(int), GFP_KERNEL);
+			if (warned)
+				memset(warned, 0, num_nodes * sizeof(int));
+		}
+		if (!warned)
+			continue;
+		if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
+			continue;
+
+		log_error(ls, "waitwarn %x %lld %d us check connection to "
+			  "node %d", lkb->lkb_id, (long long)us,
+			  dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	if (warned)
+		kfree(warned);
+
+	if (debug_expired)
+		log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
+			  debug_scanned, debug_expired,
+			  dlm_config.ci_waitwarn_us, (long long)debug_maxus);
+}
+
+/* add/remove lkb from global waiters list of lkb's waiting for
+   a reply from a remote node */
+
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error = 0;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+
+	if (is_overlap_unlock(lkb) ||
+	    (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
+		switch (mstype) {
+		case DLM_MSG_UNLOCK:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			break;
+		case DLM_MSG_CANCEL:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			break;
+		default:
+			error = -EBUSY;
+			goto out;
+		}
+		lkb->lkb_wait_count++;
+		hold_lkb(lkb);
+
+		log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
+			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
+			  lkb->lkb_wait_count, lkb->lkb_flags);
+		goto out;
+	}
+
+	DLM_ASSERT(!lkb->lkb_wait_count,
+		   dlm_print_lkb(lkb);
+		   printk("wait_count %d\n", lkb->lkb_wait_count););
+
+	lkb->lkb_wait_count++;
+	lkb->lkb_wait_type = mstype;
+	lkb->lkb_wait_time = ktime_get();
+	lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
+	hold_lkb(lkb);
+	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+ out:
+	if (error)
+		log_error(ls, "addwait error %x %d flags %x %d %d %s",
+			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
+			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
+	mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
+}
+
+/* We clear the RESEND flag because we might be taking an lkb off the waiters
+   list as part of process_requestqueue (e.g.
a lookup that has an optimized + request reply on the requestqueue) between dlm_recover_waiters_pre() which + set RESEND and dlm_recover_waiters_post() */ + +static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, + struct dlm_message *ms) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int overlap_done = 0; + + if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) { + log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + overlap_done = 1; + goto out_del; + } + + if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) { + log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + overlap_done = 1; + goto out_del; + } + + /* Cancel state was preemptively cleared by a successful convert, + see next comment, nothing to do. */ + + if ((mstype == DLM_MSG_CANCEL_REPLY) && + (lkb->lkb_wait_type != DLM_MSG_CANCEL)) { + log_debug(ls, "remwait %x cancel_reply wait_type %d", + lkb->lkb_id, lkb->lkb_wait_type); + return -1; + } + + /* Remove for the convert reply, and premptively remove for the + cancel reply. A convert has been granted while there's still + an outstanding cancel on it (the cancel is moot and the result + in the cancel reply should be 0). We preempt the cancel reply + because the app gets the convert result and then can follow up + with another op, like convert. This subsequent op would see the + lingering state of the cancel and fail with -EBUSY. */ + + if ((mstype == DLM_MSG_CONVERT_REPLY) && + (lkb->lkb_wait_type == DLM_MSG_CONVERT) && + is_overlap_cancel(lkb) && ms && !ms->m_result) { + log_debug(ls, "remwait %x convert_reply zap overlap_cancel", + lkb->lkb_id); + lkb->lkb_wait_type = 0; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + lkb->lkb_wait_count--; + goto out_del; + } + + /* N.B. type of reply may not always correspond to type of original + msg due to lookup->request optimization, verify others? */ + + if (lkb->lkb_wait_type) { + lkb->lkb_wait_type = 0; + goto out_del; + } + + log_error(ls, "remwait error %x reply %d flags %x no wait_type", + lkb->lkb_id, mstype, lkb->lkb_flags); + return -1; + + out_del: + /* the force-unlock/cancel has completed and we haven't recvd a reply + to the op that was in progress prior to the unlock/cancel; we + give up on any reply to the earlier op. FIXME: not sure when/how + this would happen */ + + if (overlap_done && lkb->lkb_wait_type) { + log_error(ls, "remwait error %x reply %d wait_type %d overlap", + lkb->lkb_id, mstype, lkb->lkb_wait_type); + lkb->lkb_wait_count--; + lkb->lkb_wait_type = 0; + } + + DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb);); + + lkb->lkb_flags &= ~DLM_IFL_RESEND; + lkb->lkb_wait_count--; + if (!lkb->lkb_wait_count) + list_del_init(&lkb->lkb_wait_reply); + unhold_lkb(lkb); + return 0; +} + +static int remove_from_waiters(struct dlm_lkb *lkb, int mstype) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error; + + mutex_lock(&ls->ls_waiters_mutex); + error = _remove_from_waiters(lkb, mstype, NULL); + mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +/* Handles situations where we might be processing a "fake" or "stub" reply in + which we can't try to take waiters_mutex again. 
*/ + +static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int error; + + if (ms->m_flags != DLM_IFL_STUB_MS) + mutex_lock(&ls->ls_waiters_mutex); + error = _remove_from_waiters(lkb, ms->m_type, ms); + if (ms->m_flags != DLM_IFL_STUB_MS) + mutex_unlock(&ls->ls_waiters_mutex); + return error; +} + +static void dir_remove(struct dlm_rsb *r) +{ + int to_nodeid; + + if (dlm_no_directory(r->res_ls)) + return; + + to_nodeid = dlm_dir_nodeid(r); + if (to_nodeid != dlm_our_nodeid()) + send_remove(r); + else + dlm_dir_remove_entry(r->res_ls, to_nodeid, + r->res_name, r->res_length); +} + +/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is + found since they are in order of newest to oldest? */ + +static int shrink_bucket(struct dlm_ls *ls, int b) +{ + struct dlm_rsb *r; + int count = 0, found; + + for (;;) { + found = 0; + spin_lock(&ls->ls_rsbtbl[b].lock); + list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, + res_hashchain) { + if (!time_after_eq(jiffies, r->res_toss_time + + dlm_config.ci_toss_secs * HZ)) + continue; + found = 1; + break; + } + + if (!found) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + break; + } + + if (kref_put(&r->res_ref, kill_rsb)) { + list_del(&r->res_hashchain); + spin_unlock(&ls->ls_rsbtbl[b].lock); + + if (is_master(r)) + dir_remove(r); + dlm_free_rsb(r); + count++; + } else { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "tossed rsb in use %s", r->res_name); + } + } + + return count; +} + +void dlm_scan_rsbs(struct dlm_ls *ls) +{ + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + shrink_bucket(ls, i); + if (dlm_locking_stopped(ls)) + break; + cond_resched(); + } +} + +static void add_timeout(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + + if (is_master_copy(lkb)) + return; + + if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && + !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { + lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN; + goto add_it; + } + if (lkb->lkb_exflags & DLM_LKF_TIMEOUT) + goto add_it; + return; + + add_it: + DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); + mutex_lock(&ls->ls_timeout_mutex); + hold_lkb(lkb); + list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); + mutex_unlock(&ls->ls_timeout_mutex); +} + +static void del_timeout(struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + + mutex_lock(&ls->ls_timeout_mutex); + if (!list_empty(&lkb->lkb_time_list)) { + list_del_init(&lkb->lkb_time_list); + unhold_lkb(lkb); + } + mutex_unlock(&ls->ls_timeout_mutex); +} + +/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and + lkb_lksb_timeout without lock_rsb? Note: we can't lock timeout_mutex + and then lock rsb because of lock ordering in add_timeout. We may need + to specify some special timeout-related bits in the lkb that are just to + be accessed under the timeout_mutex. 
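The timeout bookkeeping in this file is kept in centiseconds (lkb_timeout_cs, ci_timewarn_cs), while dlm_scan_timeout() below compares elapsed time in microseconds, hence the factor of 10000. A trivial standalone check of that unit conversion:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int64_t timeout_cs = 500;                  /* e.g. a 5 second timeout */
	int64_t timeout_us = timeout_cs * 10000;   /* same scaling as dlm_scan_timeout() */

	assert(timeout_us == 5000000);             /* 5 s == 5,000,000 us */
	return 0;
}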
*/ + +void dlm_scan_timeout(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + struct dlm_lkb *lkb; + int do_cancel, do_warn; + s64 wait_us; + + for (;;) { + if (dlm_locking_stopped(ls)) + break; + + do_cancel = 0; + do_warn = 0; + mutex_lock(&ls->ls_timeout_mutex); + list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { + + wait_us = ktime_to_us(ktime_sub(ktime_get(), + lkb->lkb_timestamp)); + + if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && + wait_us >= (lkb->lkb_timeout_cs * 10000)) + do_cancel = 1; + + if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && + wait_us >= dlm_config.ci_timewarn_cs * 10000) + do_warn = 1; + + if (!do_cancel && !do_warn) + continue; + hold_lkb(lkb); + break; + } + mutex_unlock(&ls->ls_timeout_mutex); + + if (!do_cancel && !do_warn) + break; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + if (do_warn) { + /* clear flag so we only warn once */ + lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; + if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT)) + del_timeout(lkb); + dlm_timeout_warn(lkb); + } + + if (do_cancel) { + log_debug(ls, "timeout cancel %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; + lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL; + del_timeout(lkb); + _cancel_lock(r, lkb); + } + + unlock_rsb(r); + unhold_rsb(r); + dlm_put_lkb(lkb); + } +} + +/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping + dlm_recoverd before checking/setting ls_recover_begin. */ + +void dlm_adjust_timeouts(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin); + + ls->ls_recover_begin = 0; + mutex_lock(&ls->ls_timeout_mutex); + list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) + lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); + mutex_unlock(&ls->ls_timeout_mutex); + + if (!dlm_config.ci_waitwarn_us) + return; + + mutex_lock(&ls->ls_waiters_mutex); + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (ktime_to_us(lkb->lkb_wait_time)) + lkb->lkb_wait_time = ktime_get(); + } + mutex_unlock(&ls->ls_waiters_mutex); +} + +/* lkb is master or local copy */ + +static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int b, len = r->res_ls->ls_lvblen; + + /* b=1 lvb returned to caller + b=0 lvb written to rsb or invalidated + b=-1 do nothing */ + + b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + + if (b == 1) { + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + return; + + memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len); + lkb->lkb_lvbseq = r->res_lvbseq; + + } else if (b == 0) { + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) { + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + + if (!r->res_lvbptr) + return; + + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len); + r->res_lvbseq++; + lkb->lkb_lvbseq = r->res_lvbseq; + rsb_clear_flag(r, RSB_VALNOTVALID); + } + + if (rsb_flag(r, RSB_VALNOTVALID)) + lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID; +} + +static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + if (lkb->lkb_grmode < DLM_LOCK_PW) + return; + + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) { + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + if (!r->res_lvbptr) + r->res_lvbptr = 
dlm_allocate_lvb(r->res_ls); + + if (!r->res_lvbptr) + return; + + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); + r->res_lvbseq++; + rsb_clear_flag(r, RSB_VALNOTVALID); +} + +/* lkb is process copy (pc) */ + +static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + int b; + + if (!lkb->lkb_lvbptr) + return; + + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + return; + + b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; + if (b == 1) { + int len = receive_extralen(ms); + if (len > DLM_RESNAME_MAXLEN) + len = DLM_RESNAME_MAXLEN; + memcpy(lkb->lkb_lvbptr, ms->m_extra, len); + lkb->lkb_lvbseq = ms->m_lvbseq; + } +} + +/* Manipulate lkb's on rsb's convert/granted/waiting queues + remove_lock -- used for unlock, removes lkb from granted + revert_lock -- used for cancel, moves lkb from convert to granted + grant_lock -- used for request and convert, adds lkb to granted or + moves lkb from convert or waiting to granted + + Each of these is used for master or local copy lkb's. There is + also a _pc() variation used to make the corresponding change on + a process copy (pc) lkb. */ + +static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + del_lkb(r, lkb); + lkb->lkb_grmode = DLM_LOCK_IV; + /* this unhold undoes the original ref from create_lkb() + so this leads to the lkb being freed */ + unhold_lkb(lkb); +} + +static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + set_lvb_unlock(r, lkb); + _remove_lock(r, lkb); +} + +static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + _remove_lock(r, lkb); +} + +/* returns: 0 did nothing + 1 moved lock to granted + -1 removed lock */ + +static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int rv = 0; + + lkb->lkb_rqmode = DLM_LOCK_IV; + + switch (lkb->lkb_status) { + case DLM_LKSTS_GRANTED: + break; + case DLM_LKSTS_CONVERT: + move_lkb(r, lkb, DLM_LKSTS_GRANTED); + rv = 1; + break; + case DLM_LKSTS_WAITING: + del_lkb(r, lkb); + lkb->lkb_grmode = DLM_LOCK_IV; + /* this unhold undoes the original ref from create_lkb() + so this leads to the lkb being freed */ + unhold_lkb(lkb); + rv = -1; + break; + default: + log_print("invalid status for revert %d", lkb->lkb_status); + } + return rv; +} + +static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return revert_lock(r, lkb); +} + +static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + if (lkb->lkb_grmode != lkb->lkb_rqmode) { + lkb->lkb_grmode = lkb->lkb_rqmode; + if (lkb->lkb_status) + move_lkb(r, lkb, DLM_LKSTS_GRANTED); + else + add_lkb(r, lkb, DLM_LKSTS_GRANTED); + } + + lkb->lkb_rqmode = DLM_LOCK_IV; +} + +static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + set_lvb_lock(r, lkb); + _grant_lock(r, lkb); + lkb->lkb_highbast = 0; +} + +static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + set_lvb_lock_pc(r, lkb, ms); + _grant_lock(r, lkb); +} + +/* called by grant_pending_locks() which means an async grant message must + be sent to the requesting node in addition to granting the lock if the + lkb belongs to a remote node. */ + +static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + grant_lock(r, lkb); + if (is_master_copy(lkb)) + send_grant(r, lkb); + else + queue_cast(r, lkb, 0); +} + +/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to + change the granted/requested modes. We're munging things accordingly in + the process copy. 
+ CONVDEADLK: our grmode may have been forced down to NL to resolve a + conversion deadlock + ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become + compatible with other granted locks */ + +static void munge_demoted(struct dlm_lkb *lkb) +{ + if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { + log_print("munge_demoted %x invalid modes gr %d rq %d", + lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); + return; + } + + lkb->lkb_grmode = DLM_LOCK_NL; +} + +static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + if (ms->m_type != DLM_MSG_REQUEST_REPLY && + ms->m_type != DLM_MSG_GRANT) { + log_print("munge_altmode %x invalid reply type %d", + lkb->lkb_id, ms->m_type); + return; + } + + if (lkb->lkb_exflags & DLM_LKF_ALTPR) + lkb->lkb_rqmode = DLM_LOCK_PR; + else if (lkb->lkb_exflags & DLM_LKF_ALTCW) + lkb->lkb_rqmode = DLM_LOCK_CW; + else { + log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags); + dlm_print_lkb(lkb); + } +} + +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head) +{ + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, + lkb_statequeue); + if (lkb->lkb_id == first->lkb_id) + return 1; + + return 0; +} + +/* Check if the given lkb conflicts with another lkb on the queue. */ + +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb) +{ + struct dlm_lkb *this; + + list_for_each_entry(this, head, lkb_statequeue) { + if (this == lkb) + continue; + if (!modes_compat(this, lkb)) + return 1; + } + return 0; +} + +/* + * "A conversion deadlock arises with a pair of lock requests in the converting + * queue for one resource. The granted mode of each lock blocks the requested + * mode of the other lock." + * + * Part 2: if the granted mode of lkb is preventing an earlier lkb in the + * convert queue from being granted, then deadlk/demote lkb. + * + * Example: + * Granted Queue: empty + * Convert Queue: NL->EX (first lock) + * PR->EX (second lock) + * + * The first lock can't be granted because of the granted mode of the second + * lock and the second lock can't be granted because it's not first in the + * list. We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we + * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK + * flag set and return DEMOTED in the lksb flags. + * + * Originally, this function detected conv-deadlk in a more limited scope: + * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or + * - if lkb1 was the first entry in the queue (not just earlier), and was + * blocked by the granted mode of lkb2, and there was nothing on the + * granted queue preventing lkb1 from being granted immediately, i.e. + * lkb2 was the only thing preventing lkb1 from being granted. + * + * That second condition meant we'd only say there was conv-deadlk if + * resolving it (by demotion) would lead to the first lock on the convert + * queue being granted right away. It allowed conversion deadlocks to exist + * between locks on the convert queue while they couldn't be granted anyway. + * + * Now, we detect and take action on conversion deadlocks immediately when + * they're created, even if they may not be immediately consequential. If + * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted + * mode that would prevent lkb1's conversion from being granted, we do a + * deadlk/demote on lkb2 right away and don't let it onto the convert queue. 
+ * I think this means that the lkb_is_ahead condition below should always + * be zero, i.e. there will never be conv-deadlk between two locks that are + * both already on the convert queue. + */ + +static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2) +{ + struct dlm_lkb *lkb1; + int lkb_is_ahead = 0; + + list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) { + if (lkb1 == lkb2) { + lkb_is_ahead = 1; + continue; + } + + if (!lkb_is_ahead) { + if (!modes_compat(lkb2, lkb1)) + return 1; + } else { + if (!modes_compat(lkb2, lkb1) && + !modes_compat(lkb1, lkb2)) + return 1; + } + } + return 0; +} + +/* + * Return 1 if the lock can be granted, 0 otherwise. + * Also detect and resolve conversion deadlocks. + * + * lkb is the lock to be granted + * + * now is 1 if the function is being called in the context of the + * immediate request, it is 0 if called later, after the lock has been + * queued. + * + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis + */ + +static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) +{ + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV); + + /* + * 6-10: Version 5.4 introduced an option to address the phenomenon of + * a new request for a NL mode lock being blocked. + * + * 6-11: If the optional EXPEDITE flag is used with the new NL mode + * request, then it would be granted. In essence, the use of this flag + * tells the Lock Manager to expedite theis request by not considering + * what may be in the CONVERTING or WAITING queues... As of this + * writing, the EXPEDITE flag can be used only with new requests for NL + * mode locks. This flag is not valid for conversion requests. + * + * A shortcut. Earlier checks return an error if EXPEDITE is used in a + * conversion or used with a non-NL requested mode. We also know an + * EXPEDITE request is always granted immediately, so now must always + * be 1. The full condition to grant an expedite request: (now && + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can + * therefore be shortened to just checking the flag. + */ + + if (lkb->lkb_exflags & DLM_LKF_EXPEDITE) + return 1; + + /* + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be + * added to the remaining conditions. + */ + + if (queue_conflict(&r->res_grantqueue, lkb)) + goto out; + + /* + * 6-3: By default, a conversion request is immediately granted if the + * requested mode is compatible with the modes of all other granted + * locks + */ + + if (queue_conflict(&r->res_convertqueue, lkb)) + goto out; + + /* + * 6-5: But the default algorithm for deciding whether to grant or + * queue conversion requests does not by itself guarantee that such + * requests are serviced on a "first come first serve" basis. This, in + * turn, can lead to a phenomenon known as "indefinate postponement". + * + * 6-7: This issue is dealt with by using the optional QUECVT flag with + * the system service employed to request a lock conversion. This flag + * forces certain conversion requests to be queued, even if they are + * compatible with the granted modes of other locks on the same + * resource. Thus, the use of this flag results in conversion requests + * being ordered on a "first come first servce" basis. + * + * DCT: This condition is all about new conversions being able to occur + * "in place" while the lock remains on the granted queue (assuming + * nothing else conflicts.) 
IOW if QUECVT isn't set, a conversion + * doesn't _have_ to go onto the convert queue where it's processed in + * order. The "now" variable is necessary to distinguish converts + * being received and processed for the first time now, because once a + * convert is moved to the conversion queue the condition below applies + * requiring fifo granting. + */ + + if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT)) + return 1; + + /* + * The NOORDER flag is set to avoid the standard vms rules on grant + * order. + */ + + if (lkb->lkb_exflags & DLM_LKF_NOORDER) + return 1; + + /* + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be + * granted until all other conversion requests ahead of it are granted + * and/or canceled. + */ + + if (!now && conv && first_in_list(lkb, &r->res_convertqueue)) + return 1; + + /* + * 6-4: By default, a new request is immediately granted only if all + * three of the following conditions are satisfied when the request is + * issued: + * - The queue of ungranted conversion requests for the resource is + * empty. + * - The queue of ungranted new requests for the resource is empty. + * - The mode of the new request is compatible with the most + * restrictive mode of all granted locks on the resource. + */ + + if (now && !conv && list_empty(&r->res_convertqueue) && + list_empty(&r->res_waitqueue)) + return 1; + + /* + * 6-4: Once a lock request is in the queue of ungranted new requests, + * it cannot be granted until the queue of ungranted conversion + * requests is empty, all ungranted new requests ahead of it are + * granted and/or canceled, and it is compatible with the granted mode + * of the most restrictive lock granted on the resource. + */ + + if (!now && !conv && list_empty(&r->res_convertqueue) && + first_in_list(lkb, &r->res_waitqueue)) + return 1; + out: + return 0; +} + +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, + int *err) +{ + int rv; + int8_t alt = 0, rqmode = lkb->lkb_rqmode; + int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV); + + if (err) + *err = 0; + + rv = _can_be_granted(r, lkb, now); + if (rv) + goto out; + + /* + * The CONVDEADLK flag is non-standard and tells the dlm to resolve + * conversion deadlocks by demoting grmode to NL, otherwise the dlm + * cancels one of the locks. + */ + + if (is_convert && can_be_queued(lkb) && + conversion_deadlock_detect(r, lkb)) { + if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) { + lkb->lkb_grmode = DLM_LOCK_NL; + lkb->lkb_sbflags |= DLM_SBF_DEMOTED; + } else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { + if (err) + *err = -EDEADLK; + else { + log_print("can_be_granted deadlock %x now %d", + lkb->lkb_id, now); + dlm_dump_rsb(r); + } + } + goto out; + } + + /* + * The ALTPR and ALTCW flags are non-standard and tell the dlm to try + * to grant a request in a mode other than the normal rqmode. It's a + * simple way to provide a big optimization to applications that can + * use them. + */ + + if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR)) + alt = DLM_LOCK_PR; + else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW)) + alt = DLM_LOCK_CW; + + if (alt) { + lkb->lkb_rqmode = alt; + rv = _can_be_granted(r, lkb, now); + if (rv) + lkb->lkb_sbflags |= DLM_SBF_ALTMODE; + else + lkb->lkb_rqmode = rqmode; + } + out: + return rv; +} + +/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock + for locks pending on the convert list. 
Once verified (watch for these + log_prints), we should be able to just call _can_be_granted() and not + bother with the demote/deadlk cases here (and there's no easy way to deal + with a deadlk here, we'd have to generate something like grant_lock with + the deadlk error.) */ + +/* Returns the highest requested mode of all blocked conversions; sets + cw if there's a blocked conversion to DLM_LOCK_CW. */ + +static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw) +{ + struct dlm_lkb *lkb, *s; + int hi, demoted, quit, grant_restart, demote_restart; + int deadlk; + + quit = 0; + restart: + grant_restart = 0; + demote_restart = 0; + hi = DLM_LOCK_IV; + + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) { + demoted = is_demoted(lkb); + deadlk = 0; + + if (can_be_granted(r, lkb, 0, &deadlk)) { + grant_lock_pending(r, lkb); + grant_restart = 1; + continue; + } + + if (!demoted && is_demoted(lkb)) { + log_print("WARN: pending demoted %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + demote_restart = 1; + continue; + } + + if (deadlk) { + log_print("WARN: pending deadlock %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, r->res_name); + dlm_dump_rsb(r); + continue; + } + + hi = max_t(int, lkb->lkb_rqmode, hi); + + if (cw && lkb->lkb_rqmode == DLM_LOCK_CW) + *cw = 1; + } + + if (grant_restart) + goto restart; + if (demote_restart && !quit) { + quit = 1; + goto restart; + } + + return max_t(int, high, hi); +} + +static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw) +{ + struct dlm_lkb *lkb, *s; + + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { + if (can_be_granted(r, lkb, 0, NULL)) + grant_lock_pending(r, lkb); + else { + high = max_t(int, lkb->lkb_rqmode, high); + if (lkb->lkb_rqmode == DLM_LOCK_CW) + *cw = 1; + } + } + + return high; +} + +/* cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked + on either the convert or waiting queue. + high is the largest rqmode of all locks blocked on the convert or + waiting queue. */ + +static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw) +{ + if (gr->lkb_grmode == DLM_LOCK_PR && cw) { + if (gr->lkb_highbast < DLM_LOCK_EX) + return 1; + return 0; + } + + if (gr->lkb_highbast < high && + !__dlm_compat_matrix[gr->lkb_grmode+1][high+1]) + return 1; + return 0; +} + +static void grant_pending_locks(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *s; + int high = DLM_LOCK_IV; + int cw = 0; + + DLM_ASSERT(is_master(r), dlm_dump_rsb(r);); + + high = grant_pending_convert(r, high, &cw); + high = grant_pending_wait(r, high, &cw); + + if (high == DLM_LOCK_IV) + return; + + /* + * If there are locks left on the wait/convert queue then send blocking + * ASTs to granted locks based on the largest requested mode (high) + * found above. 
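The "+1" in lock_requires_bast() above (and in the __quecvt_compat_matrix usage note earlier) exists because DLM_LOCK_IV is -1, so adding one maps "no mode" to row and column 0 and the named modes NL..EX to indices 1..6. A standalone sketch of that indexing, using an illustrative compatibility table rather than the kernel's __dlm_compat_matrix:

#include <stdio.h>

/* mode numbering as in the DLM headers: IV is -1, so "+1" indexing
   puts it in row/column 0 */
enum { LOCK_IV = -1, LOCK_NL = 0, LOCK_CR, LOCK_CW, LOCK_PR, LOCK_PW, LOCK_EX };

/* illustrative compatibility table (textbook VMS DLM rules), indexed
   [grmode + 1][rqmode + 1]; row/col 0 is the "no mode" entry */
static const int compat[7][7] = {
	/*        UN NL CR CW PR PW EX */
	/* UN */ { 1, 1, 1, 1, 1, 1, 1 },
	/* NL */ { 1, 1, 1, 1, 1, 1, 1 },
	/* CR */ { 1, 1, 1, 1, 1, 1, 0 },
	/* CW */ { 1, 1, 1, 1, 0, 0, 0 },
	/* PR */ { 1, 1, 1, 0, 1, 0, 0 },
	/* PW */ { 1, 1, 1, 0, 0, 0, 0 },
	/* EX */ { 1, 1, 0, 0, 0, 0, 0 },
};

int main(void)
{
	int grmode = LOCK_PR, rqmode = LOCK_CW;

	printf("PR vs CW compatible: %d\n", compat[grmode + 1][rqmode + 1]); /* 0 */
	return 0;
}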
+ */ + + list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) { + if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) { + if (cw && high == DLM_LOCK_PR && + lkb->lkb_grmode == DLM_LOCK_PR) + queue_bast(r, lkb, DLM_LOCK_CW); + else + queue_bast(r, lkb, high); + lkb->lkb_highbast = high; + } + } +} + +static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq) +{ + if ((gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) || + (gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR)) { + if (gr->lkb_highbast < DLM_LOCK_EX) + return 1; + return 0; + } + + if (gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq)) + return 1; + return 0; +} + +static void send_bast_queue(struct dlm_rsb *r, struct list_head *head, + struct dlm_lkb *lkb) +{ + struct dlm_lkb *gr; + + list_for_each_entry(gr, head, lkb_statequeue) { + /* skip self when sending basts to convertqueue */ + if (gr == lkb) + continue; + if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) { + queue_bast(r, gr, lkb->lkb_rqmode); + gr->lkb_highbast = lkb->lkb_rqmode; + } + } +} + +static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + send_bast_queue(r, &r->res_grantqueue, lkb); +} + +static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + send_bast_queue(r, &r->res_grantqueue, lkb); + send_bast_queue(r, &r->res_convertqueue, lkb); +} + +/* set_master(r, lkb) -- set the master nodeid of a resource + + The purpose of this function is to set the nodeid field in the given + lkb using the nodeid field in the given rsb. If the rsb's nodeid is + known, it can just be copied to the lkb and the function will return + 0. If the rsb's nodeid is _not_ known, it needs to be looked up + before it can be copied to the lkb. + + When the rsb nodeid is being looked up remotely, the initial lkb + causing the lookup is kept on the ls_waiters list waiting for the + lookup reply. Other lkb's waiting for the same rsb lookup are kept + on the rsb's res_lookup list until the master is verified. + + Return values: + 0: nodeid is set in rsb/lkb and the caller should go ahead and use it + 1: the rsb master is not available and the lkb has been placed on + a wait queue +*/ + +static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = r->res_ls; + int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); + + if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); + r->res_first_lkid = lkb->lkb_id; + lkb->lkb_nodeid = r->res_nodeid; + return 0; + } + + if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) { + list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup); + return 1; + } + + if (r->res_nodeid == 0) { + lkb->lkb_nodeid = 0; + return 0; + } + + if (r->res_nodeid > 0) { + lkb->lkb_nodeid = r->res_nodeid; + return 0; + } + + DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r);); + + dir_nodeid = dlm_dir_nodeid(r); + + if (dir_nodeid != our_nodeid) { + r->res_first_lkid = lkb->lkb_id; + send_lookup(r, lkb); + return 1; + } + + for (i = 0; i < 2; i++) { + /* It's possible for dlm_scand to remove an old rsb for + this same resource from the toss list, us to create + a new one, look up the master locally, and find it + already exists just before dlm_scand does the + dir_remove() on the previous rsb. 
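A convention worth keeping in mind while reading set_master(): res_nodeid is -1 while the master is unknown, 0 when this node is the master, and a positive nodeid when the master is remote, which is also what is_remote() earlier keys off. A tiny illustrative helper (not part of the DLM code) that spells that out:

#include <stdio.h>

/* mirrors the res_nodeid convention used by set_master()/is_remote():
   -1 = unknown, 0 = locally mastered, >0 = mastered on that nodeid */
static const char *master_state(int res_nodeid)
{
	if (res_nodeid < 0)
		return "unknown";
	return res_nodeid ? "remote" : "local";
}

int main(void)
{
	printf("%s %s %s\n", master_state(-1), master_state(0), master_state(7));
	return 0;
}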
*/ + + error = dlm_dir_lookup(ls, our_nodeid, r->res_name, + r->res_length, &ret_nodeid); + if (!error) + break; + log_debug(ls, "dir_lookup error %d %s", error, r->res_name); + schedule(); + } + if (error && error != -EEXIST) + return error; + + if (ret_nodeid == our_nodeid) { + r->res_first_lkid = 0; + r->res_nodeid = 0; + lkb->lkb_nodeid = 0; + } else { + r->res_first_lkid = lkb->lkb_id; + r->res_nodeid = ret_nodeid; + lkb->lkb_nodeid = ret_nodeid; + } + return 0; +} + +static void process_lookup_list(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) { + list_del_init(&lkb->lkb_rsb_lookup); + _request_lock(r, lkb); + schedule(); + } +} + +/* confirm_master -- confirm (or deny) an rsb's master nodeid */ + +static void confirm_master(struct dlm_rsb *r, int error) +{ + struct dlm_lkb *lkb; + + if (!r->res_first_lkid) + return; + + switch (error) { + case 0: + case -EINPROGRESS: + r->res_first_lkid = 0; + process_lookup_list(r); + break; + + case -EAGAIN: + case -EBADR: + case -ENOTBLK: + /* the remote request failed and won't be retried (it was + a NOQUEUE, or has been canceled/unlocked); make a waiting + lkb the first_lkid */ + + r->res_first_lkid = 0; + + if (!list_empty(&r->res_lookup)) { + lkb = list_entry(r->res_lookup.next, struct dlm_lkb, + lkb_rsb_lookup); + list_del_init(&lkb->lkb_rsb_lookup); + r->res_first_lkid = lkb->lkb_id; + _request_lock(r, lkb); + } + break; + + default: + log_error(r->res_ls, "confirm_master unknown error %d", error); + } +} + +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, unsigned long timeout_cs, + void (*ast) (void *astparam), + void *astparam, + void (*bast) (void *astparam, int mode), + struct dlm_args *args) +{ + int rv = -EINVAL; + + /* check for invalid arg usage */ + + if (mode < 0 || mode > DLM_LOCK_EX) + goto out; + + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN)) + goto out; + + if (flags & DLM_LKF_CANCEL) + goto out; + + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT)) + goto out; + + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT) + goto out; + + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE) + goto out; + + if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL) + goto out; + + if (!ast || !lksb) + goto out; + + if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr) + goto out; + + if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid) + goto out; + + /* these args will be copied to the lkb in validate_lock_args, + it cannot be done now because when converting locks, fields in + an active lkb cannot be modified before locking the rsb */ + + args->flags = flags; + args->astfn = ast; + args->astparam = astparam; + args->bastfn = bast; + args->timeout = timeout_cs; + args->mode = mode; + args->lksb = lksb; + rv = 0; + out: + return rv; +} + +static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args) +{ + if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK | + DLM_LKF_FORCEUNLOCK)) + return -EINVAL; + + if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK) + return -EINVAL; + + args->flags = flags; + args->astparam = astarg; + return 0; +} + +static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + 
int rv = -EINVAL; + + if (args->flags & DLM_LKF_CONVERT) { + if (lkb->lkb_flags & DLM_IFL_MSTCPY) + goto out; + + if (args->flags & DLM_LKF_QUECVT && + !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) + goto out; + + rv = -EBUSY; + if (lkb->lkb_status != DLM_LKSTS_GRANTED) + goto out; + + if (lkb->lkb_wait_type) + goto out; + + if (is_overlap(lkb)) + goto out; + } + + lkb->lkb_exflags = args->flags; + lkb->lkb_sbflags = 0; + lkb->lkb_astfn = args->astfn; + lkb->lkb_astparam = args->astparam; + lkb->lkb_bastfn = args->bastfn; + lkb->lkb_rqmode = args->mode; + lkb->lkb_lksb = args->lksb; + lkb->lkb_lvbptr = args->lksb->sb_lvbptr; + lkb->lkb_ownpid = (int) current->pid; + lkb->lkb_timeout_cs = args->timeout; + rv = 0; + out: + if (rv) + log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s", + rv, lkb->lkb_id, lkb->lkb_flags, args->flags, + lkb->lkb_status, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + return rv; +} + +/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0 + for success */ + +/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here + because there may be a lookup in progress and it's valid to do + cancel/unlockf on it */ + +static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) +{ + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + int rv = -EINVAL; + + if (lkb->lkb_flags & DLM_IFL_MSTCPY) { + log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); + dlm_print_lkb(lkb); + goto out; + } + + /* an lkb may still exist even though the lock is EOL'ed due to a + cancel, unlock or failed noqueue request; an app can't use these + locks; return same error as if the lkid had not been found at all */ + + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); + rv = -ENOENT; + goto out; + } + + /* an lkb may be waiting for an rsb lookup to complete where the + lookup was initiated by another lock */ + + if (!list_empty(&lkb->lkb_rsb_lookup)) { + if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { + log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); + list_del_init(&lkb->lkb_rsb_lookup); + queue_cast(lkb->lkb_resource, lkb, + args->flags & DLM_LKF_CANCEL ? + -DLM_ECANCEL : -DLM_EUNLOCK); + unhold_lkb(lkb); /* undoes create_lkb() */ + } + /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ + rv = -EBUSY; + goto out; + } + + /* cancel not allowed with another cancel/unlock in progress */ + + if (args->flags & DLM_LKF_CANCEL) { + if (lkb->lkb_exflags & DLM_LKF_CANCEL) + goto out; + + if (is_overlap(lkb)) + goto out; + + /* don't let scand try to do a cancel */ + del_timeout(lkb); + + if (lkb->lkb_flags & DLM_IFL_RESEND) { + lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + rv = -EBUSY; + goto out; + } + + /* there's nothing to cancel */ + if (lkb->lkb_status == DLM_LKSTS_GRANTED && + !lkb->lkb_wait_type) { + rv = -EBUSY; + goto out; + } + + switch (lkb->lkb_wait_type) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + rv = -EBUSY; + goto out; + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + goto out; + } + /* add_to_waiters() will set OVERLAP_CANCEL */ + goto out_ok; + } + + /* do we need to allow a force-unlock if there's a normal unlock + already in progress? in what conditions could the normal unlock + fail such that we'd want to send a force-unlock to be sure? 
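For unlock and cancel, set_unlock_args() above accepts only CANCEL, VALBLK, IVVALBLK and FORCEUNLOCK, and rejects CANCEL combined with FORCEUNLOCK, so a caller must pick one way of aborting an in-flight operation. A standalone restatement of that check; the flag values below are illustrative placeholders, not the real DLM_LKF_* constants:

#include <stdint.h>
#include <stdio.h>

/* illustrative stand-ins for the DLM_LKF_* unlock flags */
#define LKF_CANCEL      0x1
#define LKF_VALBLK      0x2
#define LKF_IVVALBLK    0x4
#define LKF_FORCEUNLOCK 0x8

/* same rule as set_unlock_args(): unknown flags or CANCEL+FORCEUNLOCK are rejected */
static int check_unlock_flags(uint32_t flags)
{
	if (flags & ~(LKF_CANCEL | LKF_VALBLK | LKF_IVVALBLK | LKF_FORCEUNLOCK))
		return -1;
	if ((flags & LKF_CANCEL) && (flags & LKF_FORCEUNLOCK))
		return -1;
	return 0;
}

int main(void)
{
	printf("%d\n", check_unlock_flags(LKF_CANCEL));                    /*  0 */
	printf("%d\n", check_unlock_flags(LKF_CANCEL | LKF_FORCEUNLOCK));  /* -1 */
	return 0;
}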
*/ + + if (args->flags & DLM_LKF_FORCEUNLOCK) { + if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK) + goto out; + + if (is_overlap_unlock(lkb)) + goto out; + + /* don't let scand try to do a cancel */ + del_timeout(lkb); + + if (lkb->lkb_flags & DLM_IFL_RESEND) { + lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + rv = -EBUSY; + goto out; + } + + switch (lkb->lkb_wait_type) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + rv = -EBUSY; + goto out; + case DLM_MSG_UNLOCK: + goto out; + } + /* add_to_waiters() will set OVERLAP_UNLOCK */ + goto out_ok; + } + + /* normal unlock not allowed if there's any op in progress */ + rv = -EBUSY; + if (lkb->lkb_wait_type || lkb->lkb_wait_count) + goto out; + + out_ok: + /* an overlapping op shouldn't blow away exflags from other op */ + lkb->lkb_exflags |= args->flags; + lkb->lkb_sbflags = 0; + lkb->lkb_astparam = args->astparam; + rv = 0; + out: + if (rv) + log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv, + lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, + args->flags, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + return rv; +} + +/* + * Four stage 4 varieties: + * do_request(), do_convert(), do_unlock(), do_cancel() + * These are called on the master node for the given lock and + * from the central locking logic. + */ + +static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error = 0; + + if (can_be_granted(r, lkb, 1, NULL)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + + if (can_be_queued(lkb)) { + error = -EINPROGRESS; + add_lkb(r, lkb, DLM_LKSTS_WAITING); + add_timeout(lkb); + goto out; + } + + error = -EAGAIN; + queue_cast(r, lkb, -EAGAIN); + out: + return error; +} + +static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + +static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error = 0; + int deadlk = 0; + + /* changing an existing lock may allow others to be granted */ + + if (can_be_granted(r, lkb, 1, &deadlk)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + + /* can_be_granted() detected that this lock would block in a conversion + deadlock, so we leave it on the granted queue and return EDEADLK in + the ast for the convert. */ + + if (deadlk) { + /* it's left on the granted queue */ + log_debug(r->res_ls, "deadlock %x node %d sts%d g%d r%d %s", + lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_status, + lkb->lkb_grmode, lkb->lkb_rqmode, r->res_name); + revert_lock(r, lkb); + queue_cast(r, lkb, -EDEADLK); + error = -EDEADLK; + goto out; + } + + /* is_demoted() means the can_be_granted() above set the grmode + to NL, and left us on the granted queue. This auto-demotion + (due to CONVDEADLK) might mean other locks, and/or this lock, are + now grantable. We have to try to grant other converting locks + before we try again to grant this one. 
*/ + + if (is_demoted(lkb)) { + grant_pending_convert(r, DLM_LOCK_IV, NULL); + if (_can_be_granted(r, lkb, 1)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); + goto out; + } + /* else fall through and move to convert queue */ + } + + if (can_be_queued(lkb)) { + error = -EINPROGRESS; + del_lkb(r, lkb); + add_lkb(r, lkb, DLM_LKSTS_CONVERT); + add_timeout(lkb); + goto out; + } + + error = -EAGAIN; + queue_cast(r, lkb, -EAGAIN); + out: + return error; +} + +static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case 0: + grant_pending_locks(r); + /* grant_pending_locks also sends basts */ + break; + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + +static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + remove_lock(r, lkb); + queue_cast(r, lkb, -DLM_EUNLOCK); + return -DLM_EUNLOCK; +} + +static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + grant_pending_locks(r); +} + +/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ + +static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + error = revert_lock(r, lkb); + if (error) { + queue_cast(r, lkb, -DLM_ECANCEL); + return -DLM_ECANCEL; + } + return 0; +} + +static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + if (error) + grant_pending_locks(r); +} + +/* + * Four stage 3 varieties: + * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() + */ + +/* add a new lkb to a possibly new rsb, called by requesting process */ + +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + /* set_master: sets lkb nodeid from r */ + + error = set_master(r, lkb); + if (error < 0) + goto out; + if (error) { + error = 0; + goto out; + } + + if (is_remote(r)) { + /* receive_request() calls do_request() on remote node */ + error = send_request(r, lkb); + } else { + error = do_request(r, lkb); + /* for remote locks the request_reply is sent + between do_request and do_request_effects */ + do_request_effects(r, lkb, error); + } + out: + return error; +} + +/* change some property of an existing lkb, e.g. 
mode */ + +static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_convert() calls do_convert() on remote node */ + error = send_convert(r, lkb); + } else { + error = do_convert(r, lkb); + /* for remote locks the convert_reply is sent + between do_convert and do_convert_effects */ + do_convert_effects(r, lkb, error); + } + + return error; +} + +/* remove an existing lkb from the granted queue */ + +static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_unlock() calls do_unlock() on remote node */ + error = send_unlock(r, lkb); + } else { + error = do_unlock(r, lkb); + /* for remote locks the unlock_reply is sent + between do_unlock and do_unlock_effects */ + do_unlock_effects(r, lkb, error); + } + + return error; +} + +/* remove an existing lkb from the convert or wait queue */ + +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + if (is_remote(r)) { + /* receive_cancel() calls do_cancel() on remote node */ + error = send_cancel(r, lkb); + } else { + error = do_cancel(r, lkb); + /* for remote locks the cancel_reply is sent + between do_cancel and do_cancel_effects */ + do_cancel_effects(r, lkb, error); + } + + return error; +} + +/* + * Four stage 2 varieties: + * request_lock(), convert_lock(), unlock_lock(), cancel_lock() + */ + +static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name, + int len, struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + error = validate_lock_args(ls, lkb, args); + if (error) + goto out; + + error = find_rsb(ls, name, len, R_CREATE, &r); + if (error) + goto out; + + lock_rsb(r); + + attach_lkb(r, lkb); + lkb->lkb_lksb->sb_lkid = lkb->lkb_id; + + error = _request_lock(r, lkb); + + unlock_rsb(r); + put_rsb(r); + + out: + return error; +} + +static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_lock_args(ls, lkb, args); + if (error) + goto out; + + error = _convert_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, args); + if (error) + goto out; + + error = _unlock_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_args *args) +{ + struct dlm_rsb *r; + int error; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, args); + if (error) + goto out; + + error = _cancel_lock(r, lkb); + out: + unlock_rsb(r); + put_rsb(r); + return error; +} + +/* + * Two stage 1 varieties: dlm_lock() and dlm_unlock() + */ + +int dlm_lock(dlm_lockspace_t *lockspace, + int mode, + struct dlm_lksb *lksb, + uint32_t flags, + void *name, + unsigned int namelen, + uint32_t parent_lkid, + void (*ast) (void *astarg), + void *astarg, + void (*bast) (void *astarg, int mode)) +{ + struct dlm_ls *ls; + struct dlm_lkb *lkb; + struct dlm_args args; + int error, convert = flags & DLM_LKF_CONVERT; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + dlm_lock_recovery(ls); + + if (convert) + error = find_lkb(ls, lksb->sb_lkid, &lkb); + else + error = 
create_lkb(ls, &lkb); + + if (error) + goto out; + + error = set_lock_args(mode, lksb, flags, namelen, 0, ast, + astarg, bast, &args); + if (error) + goto out_put; + + if (convert) + error = convert_lock(ls, lkb, &args); + else + error = request_lock(ls, lkb, name, namelen, &args); + + if (error == -EINPROGRESS) + error = 0; + out_put: + if (convert || error) + __put_lkb(ls, lkb); + if (error == -EAGAIN || error == -EDEADLK) + error = 0; + out: + dlm_unlock_recovery(ls); + dlm_put_lockspace(ls); + return error; +} + +int dlm_unlock(dlm_lockspace_t *lockspace, + uint32_t lkid, + uint32_t flags, + struct dlm_lksb *lksb, + void *astarg) +{ + struct dlm_ls *ls; + struct dlm_lkb *lkb; + struct dlm_args args; + int error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + error = set_unlock_args(flags, astarg, &args); + if (error) + goto out_put; + + if (flags & DLM_LKF_CANCEL) + error = cancel_lock(ls, lkb, &args); + else + error = unlock_lock(ls, lkb, &args); + + if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL) + error = 0; + if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK))) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + dlm_put_lockspace(ls); + return error; +} + +/* + * send/receive routines for remote operations and replies + * + * send_args + * send_common + * send_request receive_request + * send_convert receive_convert + * send_unlock receive_unlock + * send_cancel receive_cancel + * send_grant receive_grant + * send_bast receive_bast + * send_lookup receive_lookup + * send_remove receive_remove + * + * send_common_reply + * receive_request_reply send_request_reply + * receive_convert_reply send_convert_reply + * receive_unlock_reply send_unlock_reply + * receive_cancel_reply send_cancel_reply + * receive_lookup_reply send_lookup_reply + */ + +static int _create_message(struct dlm_ls *ls, int mb_len, + int to_nodeid, int mstype, + struct dlm_message **ms_ret, + struct dlm_mhandle **mh_ret) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + char *mb; + + /* get_buffer gives us a message handle (mh) that we need to + pass into lowcomms_commit and a message buffer (mb) that we + write our data into */ + + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) + return -ENOBUFS; + + memset(mb, 0, mb_len); + + ms = (struct dlm_message *) mb; + + ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + ms->m_header.h_lockspace = ls->ls_global_id; + ms->m_header.h_nodeid = dlm_our_nodeid(); + ms->m_header.h_length = mb_len; + ms->m_header.h_cmd = DLM_MSG; + + ms->m_type = mstype; + + *mh_ret = mh; + *ms_ret = ms; + return 0; +} + +static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, + int to_nodeid, int mstype, + struct dlm_message **ms_ret, + struct dlm_mhandle **mh_ret) +{ + int mb_len = sizeof(struct dlm_message); + + switch (mstype) { + case DLM_MSG_REQUEST: + case DLM_MSG_LOOKUP: + case DLM_MSG_REMOVE: + mb_len += r->res_length; + break; + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_GRANT: + if (lkb && lkb->lkb_lvbptr) + mb_len += r->res_ls->ls_lvblen; + break; + } + + return _create_message(r->res_ls, mb_len, to_nodeid, mstype, + ms_ret, mh_ret); +} + +/* further lowcomms enhancements or alternate implementations may make + the return value from this function useful at some point */ + +static 
int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) +{ + dlm_message_out(ms); + dlm_lowcomms_commit_buffer(mh); + return 0; +} + +static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + ms->m_nodeid = lkb->lkb_nodeid; + ms->m_pid = lkb->lkb_ownpid; + ms->m_lkid = lkb->lkb_id; + ms->m_remid = lkb->lkb_remid; + ms->m_exflags = lkb->lkb_exflags; + ms->m_sbflags = lkb->lkb_sbflags; + ms->m_flags = lkb->lkb_flags; + ms->m_lvbseq = lkb->lkb_lvbseq; + ms->m_status = lkb->lkb_status; + ms->m_grmode = lkb->lkb_grmode; + ms->m_rqmode = lkb->lkb_rqmode; + ms->m_hash = r->res_hash; + + /* m_result and m_bastmode are set from function args, + not from lkb fields */ + + if (lkb->lkb_bastfn) + ms->m_asts |= DLM_CB_BAST; + if (lkb->lkb_astfn) + ms->m_asts |= DLM_CB_CAST; + + /* compare with switch in create_message; send_remove() doesn't + use send_args() */ + + switch (ms->m_type) { + case DLM_MSG_REQUEST: + case DLM_MSG_LOOKUP: + memcpy(ms->m_extra, r->res_name, r->res_length); + break; + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_GRANT: + if (!lkb->lkb_lvbptr) + break; + memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); + break; + } +} + +static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = r->res_nodeid; + + error = add_to_waiters(lkb, mstype, to_nodeid); + if (error) + return error; + + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + if (error) + goto fail; + + send_args(r, lkb, ms); + + error = send_message(mh, ms); + if (error) + goto fail; + return 0; + + fail: + remove_from_waiters(lkb, msg_reply_type(mstype)); + return error; +} + +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_REQUEST); +} + +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + int error; + + error = send_common(r, lkb, DLM_MSG_CONVERT); + + /* down conversions go without a reply from the master */ + if (!error && down_conversion(lkb)) { + remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); + r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS; + r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; + r->res_ls->ls_stub_ms.m_result = 0; + __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); + } + + return error; +} + +/* FIXME: if this lkb is the only lock we hold on the rsb, then set + MASTER_UNCERTAIN to force the next request on the rsb to confirm + that the master is still correct. 
*/ + +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_UNLOCK); +} + +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + return send_common(r, lkb, DLM_MSG_CANCEL); +} + +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_result = 0; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_bastmode = mode; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = dlm_dir_nodeid(r); + + error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); + if (error) + return error; + + error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); + if (error) + goto fail; + + send_args(r, lkb, ms); + + error = send_message(mh, ms); + if (error) + goto fail; + return 0; + + fail: + remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY); + return error; +} + +static int send_remove(struct dlm_rsb *r) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = dlm_dir_nodeid(r); + + error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh); + if (error) + goto out; + + memcpy(ms->m_extra, r->res_name, r->res_length); + ms->m_hash = r->res_hash; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + int mstype, int rv) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int to_nodeid, error; + + to_nodeid = lkb->lkb_nodeid; + + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + if (error) + goto out; + + send_args(r, lkb, ms); + + ms->m_result = rv; + + error = send_message(mh, ms); + out: + return error; +} + +static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv); +} + +static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv); +} + +static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv); +} + +static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) +{ + return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv); +} + +static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, + int ret_nodeid, int rv) +{ + struct dlm_rsb *r = &ls->ls_stub_rsb; + struct dlm_message *ms; + struct dlm_mhandle *mh; + int error, nodeid = ms_in->m_header.h_nodeid; + + error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh); + if (error) + goto out; + + ms->m_lkid = ms_in->m_lkid; + ms->m_result = rv; + ms->m_nodeid = ret_nodeid; + + error = send_message(mh, ms); + out: + return error; +} + +/* which args we save from a received message depends heavily on the type + of message, 
unlike the send side where we can safely send everything about + the lkb for any type of message */ + +static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + lkb->lkb_exflags = ms->m_exflags; + lkb->lkb_sbflags = ms->m_sbflags; + lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | + (ms->m_flags & 0x0000FFFF); +} + +static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + if (ms->m_flags == DLM_IFL_STUB_MS) + return; + + lkb->lkb_sbflags = ms->m_sbflags; + lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | + (ms->m_flags & 0x0000FFFF); +} + +static int receive_extralen(struct dlm_message *ms) +{ + return (ms->m_header.h_length - sizeof(struct dlm_message)); +} + +static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + int len; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + if (!lkb->lkb_lvbptr) + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + len = receive_extralen(ms); + if (len > DLM_RESNAME_MAXLEN) + len = DLM_RESNAME_MAXLEN; + memcpy(lkb->lkb_lvbptr, ms->m_extra, len); + } + return 0; +} + +static void fake_bastfn(void *astparam, int mode) +{ + log_print("fake_bastfn should not be called"); +} + +static void fake_astfn(void *astparam) +{ + log_print("fake_astfn should not be called"); +} + +static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + lkb->lkb_nodeid = ms->m_header.h_nodeid; + lkb->lkb_ownpid = ms->m_pid; + lkb->lkb_remid = ms->m_lkid; + lkb->lkb_grmode = DLM_LOCK_IV; + lkb->lkb_rqmode = ms->m_rqmode; + + lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + /* lkb was just created so there won't be an lvb yet */ + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + } + + return 0; +} + +static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + if (lkb->lkb_status != DLM_LKSTS_GRANTED) + return -EBUSY; + + if (receive_lvb(ls, lkb, ms)) + return -ENOMEM; + + lkb->lkb_rqmode = ms->m_rqmode; + lkb->lkb_lvbseq = ms->m_lvbseq; + + return 0; +} + +static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + if (receive_lvb(ls, lkb, ms)) + return -ENOMEM; + return 0; +} + +/* We fill in the stub-lkb fields with the info that send_xxxx_reply() + uses to send a reply and that the remote end uses to process the reply. */ + +static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb = &ls->ls_stub_lkb; + lkb->lkb_nodeid = ms->m_header.h_nodeid; + lkb->lkb_remid = ms->m_lkid; +} + +/* This is called after the rsb is locked so that we can safely inspect + fields in the lkb. 
*/ + +static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + int from = ms->m_header.h_nodeid; + int error = 0; + + switch (ms->m_type) { + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + if (!is_master_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_UNLOCK_REPLY: + case DLM_MSG_CANCEL_REPLY: + case DLM_MSG_GRANT: + case DLM_MSG_BAST: + if (!is_process_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case DLM_MSG_REQUEST_REPLY: + if (!is_process_copy(lkb)) + error = -EINVAL; + else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + default: + error = -EINVAL; + } + + if (error) + log_error(lkb->lkb_resource->res_ls, + "ignore invalid message %d from %d %x %x %x %d", + ms->m_type, from, lkb->lkb_id, lkb->lkb_remid, + lkb->lkb_flags, lkb->lkb_nodeid); + return error; +} + +static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, namelen; + + error = create_lkb(ls, &lkb); + if (error) + goto fail; + + receive_flags(lkb, ms); + lkb->lkb_flags |= DLM_IFL_MSTCPY; + error = receive_request_args(ls, lkb, ms); + if (error) { + __put_lkb(ls, lkb); + goto fail; + } + + namelen = receive_extralen(ms); + + error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r); + if (error) { + __put_lkb(ls, lkb); + goto fail; + } + + lock_rsb(r); + + attach_lkb(r, lkb); + error = do_request(r, lkb); + send_request_reply(r, lkb, error); + do_request_effects(r, lkb, error); + + unlock_rsb(r); + put_rsb(r); + + if (error == -EINPROGRESS) + error = 0; + if (error) + dlm_put_lkb(lkb); + return; + + fail: + setup_stub_lkb(ls, ms); + send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); +} + +static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, reply = 1; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + goto fail; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags(lkb, ms); + + error = receive_convert_args(ls, lkb, ms); + if (error) { + send_convert_reply(r, lkb, error); + goto out; + } + + reply = !down_conversion(lkb); + + error = do_convert(r, lkb); + if (reply) + send_convert_reply(r, lkb, error); + do_convert_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return; + + fail: + setup_stub_lkb(ls, ms); + send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); +} + +static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) + goto fail; + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags(lkb, ms); + + error = receive_unlock_args(ls, lkb, ms); + if (error) { + send_unlock_reply(r, lkb, error); + goto out; + } + + error = do_unlock(r, lkb); + send_unlock_reply(r, lkb, error); + do_unlock_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return; + + fail: + setup_stub_lkb(ls, ms); + send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); +} + +static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, 
&lkb); + if (error) + goto fail; + + receive_flags(lkb, ms); + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + error = do_cancel(r, lkb); + send_cancel_reply(r, lkb, error); + do_cancel_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return; + + fail: + setup_stub_lkb(ls, ms); + send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); +} + +static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_grant from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + receive_flags_reply(lkb, ms); + if (is_altmode(lkb)) + munge_altmode(lkb, ms); + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_bast from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + r = lkb->lkb_resource; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + queue_bast(r, lkb, ms->m_bastmode); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) +{ + int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid; + + from_nodeid = ms->m_header.h_nodeid; + our_nodeid = dlm_our_nodeid(); + + len = receive_extralen(ms); + + dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash); + if (dir_nodeid != our_nodeid) { + log_error(ls, "lookup dir_nodeid %d from %d", + dir_nodeid, from_nodeid); + error = -EINVAL; + ret_nodeid = -1; + goto out; + } + + error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid); + + /* Optimization: we're master so treat lookup as a request */ + if (!error && ret_nodeid == our_nodeid) { + receive_request(ls, ms); + return; + } + out: + send_lookup_reply(ls, ms, ret_nodeid, error); +} + +static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) +{ + int len, dir_nodeid, from_nodeid; + + from_nodeid = ms->m_header.h_nodeid; + + len = receive_extralen(ms); + + dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash); + if (dir_nodeid != dlm_our_nodeid()) { + log_error(ls, "remove dir entry dir_nodeid %d from %d", + dir_nodeid, from_nodeid); + return; + } + + dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len); +} + +static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) +{ + do_purge(ls, ms->m_nodeid, ms->m_pid); +} + +static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, mstype, result; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_request_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + mstype = lkb->lkb_wait_type; + error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); + if (error) + goto out; + + /* Optimization: the dir node was also the master, so it took our + lookup as a 
request and sent request reply instead of lookup reply */ + if (mstype == DLM_MSG_LOOKUP) { + r->res_nodeid = ms->m_header.h_nodeid; + lkb->lkb_nodeid = r->res_nodeid; + } + + /* this is the value returned from do_request() on the master */ + result = ms->m_result; + + switch (result) { + case -EAGAIN: + /* request would block (be queued) on remote master */ + queue_cast(r, lkb, -EAGAIN); + confirm_master(r, -EAGAIN); + unhold_lkb(lkb); /* undoes create_lkb() */ + break; + + case -EINPROGRESS: + case 0: + /* request was queued or granted on remote master */ + receive_flags_reply(lkb, ms); + lkb->lkb_remid = ms->m_lkid; + if (is_altmode(lkb)) + munge_altmode(lkb, ms); + if (result) { + add_lkb(r, lkb, DLM_LKSTS_WAITING); + add_timeout(lkb); + } else { + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + } + confirm_master(r, result); + break; + + case -EBADR: + case -ENOTBLK: + /* find_rsb failed to find rsb or rsb wasn't master */ + log_debug(ls, "receive_request_reply %x %x master diff %d %d", + lkb->lkb_id, lkb->lkb_flags, r->res_nodeid, result); + r->res_nodeid = -1; + lkb->lkb_nodeid = -1; + + if (is_overlap(lkb)) { + /* we'll ignore error in cancel/unlock reply */ + queue_cast_overlap(r, lkb); + confirm_master(r, result); + unhold_lkb(lkb); /* undoes create_lkb() */ + } else + _request_lock(r, lkb); + break; + + default: + log_error(ls, "receive_request_reply %x error %d", + lkb->lkb_id, result); + } + + if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) { + log_debug(ls, "receive_request_reply %x result %d unlock", + lkb->lkb_id, result); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + send_unlock(r, lkb); + } else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) { + log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id); + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + send_cancel(r, lkb); + } else { + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + } + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct dlm_message *ms) +{ + /* this is the value returned from do_convert() on the master */ + switch (ms->m_result) { + case -EAGAIN: + /* convert would block (be queued) on remote master */ + queue_cast(r, lkb, -EAGAIN); + break; + + case -EDEADLK: + receive_flags_reply(lkb, ms); + revert_lock_pc(r, lkb); + queue_cast(r, lkb, -EDEADLK); + break; + + case -EINPROGRESS: + /* convert was queued on remote master */ + receive_flags_reply(lkb, ms); + if (is_demoted(lkb)) + munge_demoted(lkb); + del_lkb(r, lkb); + add_lkb(r, lkb, DLM_LKSTS_CONVERT); + add_timeout(lkb); + break; + + case 0: + /* convert was granted on remote master */ + receive_flags_reply(lkb, ms); + if (is_demoted(lkb)) + munge_demoted(lkb); + grant_lock_pc(r, lkb, ms); + queue_cast(r, lkb, 0); + break; + + default: + log_error(r->res_ls, "receive_convert_reply %x error %d", + lkb->lkb_id, ms->m_result); + } +} + +static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* stub reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms); + if (error) + goto out; + + __receive_convert_reply(r, lkb, ms); + out: + unlock_rsb(r); + put_rsb(r); +} + +static void 
receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_convert_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + _receive_convert_reply(lkb, ms); + dlm_put_lkb(lkb); +} + +static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* stub reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms); + if (error) + goto out; + + /* this is the value returned from do_unlock() on the master */ + + switch (ms->m_result) { + case -DLM_EUNLOCK: + receive_flags_reply(lkb, ms); + remove_lock_pc(r, lkb); + queue_cast(r, lkb, -DLM_EUNLOCK); + break; + case -ENOENT: + break; + default: + log_error(r->res_ls, "receive_unlock_reply %x error %d", + lkb->lkb_id, ms->m_result); + } + out: + unlock_rsb(r); + put_rsb(r); +} + +static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_unlock_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + _receive_unlock_reply(lkb, ms); + dlm_put_lkb(lkb); +} + +static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + struct dlm_rsb *r = lkb->lkb_resource; + int error; + + hold_rsb(r); + lock_rsb(r); + + error = validate_message(lkb, ms); + if (error) + goto out; + + /* stub reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms); + if (error) + goto out; + + /* this is the value returned from do_cancel() on the master */ + + switch (ms->m_result) { + case -DLM_ECANCEL: + receive_flags_reply(lkb, ms); + revert_lock_pc(r, lkb); + queue_cast(r, lkb, -DLM_ECANCEL); + break; + case 0: + break; + default: + log_error(r->res_ls, "receive_cancel_reply %x error %d", + lkb->lkb_id, ms->m_result); + } + out: + unlock_rsb(r); + put_rsb(r); +} + +static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, ms->m_remid, &lkb); + if (error) { + log_debug(ls, "receive_cancel_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); + return; + } + + _receive_cancel_reply(lkb, ms); + dlm_put_lkb(lkb); +} + +static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error, ret_nodeid; + + error = find_lkb(ls, ms->m_lkid, &lkb); + if (error) { + log_error(ls, "receive_lookup_reply no lkb"); + return; + } + + /* ms->m_result is the value returned by dlm_dir_lookup on dir node + FIXME: will a non-zero error ever be returned? 
*/ + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY); + if (error) + goto out; + + ret_nodeid = ms->m_nodeid; + if (ret_nodeid == dlm_our_nodeid()) { + r->res_nodeid = 0; + ret_nodeid = 0; + r->res_first_lkid = 0; + } else { + /* set_master() will copy res_nodeid to lkb_nodeid */ + r->res_nodeid = ret_nodeid; + } + + if (is_overlap(lkb)) { + log_debug(ls, "receive_lookup_reply %x unlock %x", + lkb->lkb_id, lkb->lkb_flags); + queue_cast_overlap(r, lkb); + unhold_lkb(lkb); /* undoes create_lkb() */ + goto out_list; + } + + _request_lock(r, lkb); + + out_list: + if (!ret_nodeid) + process_lookup_list(r); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); +} + +static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) +{ + if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { + log_debug(ls, "ignore non-member message %d from %d %x %x %d", + ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, + ms->m_remid, ms->m_result); + return; + } + + switch (ms->m_type) { + + /* messages sent to a master node */ + + case DLM_MSG_REQUEST: + receive_request(ls, ms); + break; + + case DLM_MSG_CONVERT: + receive_convert(ls, ms); + break; + + case DLM_MSG_UNLOCK: + receive_unlock(ls, ms); + break; + + case DLM_MSG_CANCEL: + receive_cancel(ls, ms); + break; + + /* messages sent from a master node (replies to above) */ + + case DLM_MSG_REQUEST_REPLY: + receive_request_reply(ls, ms); + break; + + case DLM_MSG_CONVERT_REPLY: + receive_convert_reply(ls, ms); + break; + + case DLM_MSG_UNLOCK_REPLY: + receive_unlock_reply(ls, ms); + break; + + case DLM_MSG_CANCEL_REPLY: + receive_cancel_reply(ls, ms); + break; + + /* messages sent from a master node (only two types of async msg) */ + + case DLM_MSG_GRANT: + receive_grant(ls, ms); + break; + + case DLM_MSG_BAST: + receive_bast(ls, ms); + break; + + /* messages sent to a dir node */ + + case DLM_MSG_LOOKUP: + receive_lookup(ls, ms); + break; + + case DLM_MSG_REMOVE: + receive_remove(ls, ms); + break; + + /* messages sent from a dir node (remove has no reply) */ + + case DLM_MSG_LOOKUP_REPLY: + receive_lookup_reply(ls, ms); + break; + + /* other messages */ + + case DLM_MSG_PURGE: + receive_purge(ls, ms); + break; + + default: + log_error(ls, "unknown message type %d", ms->m_type); + } + + dlm_astd_wake(); +} + +/* If the lockspace is in recovery mode (locking stopped), then normal + messages are saved on the requestqueue for processing after recovery is + done. When not in recovery mode, we wait for dlm_recoverd to drain saved + messages off the requestqueue before we process new ones. This occurs right + after recovery completes when we transition from saving all messages on + requestqueue, to processing all the saved messages, to processing new + messages as they arrive. */ + +static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, + int nodeid) +{ + if (dlm_locking_stopped(ls)) { + dlm_add_requestqueue(ls, nodeid, ms); + } else { + dlm_wait_requestqueue(ls); + _receive_message(ls, ms); + } +} + +/* This is called by dlm_recoverd to process messages that were saved on + the requestqueue. */ + +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms) +{ + _receive_message(ls, ms); +} + +/* This is called by the midcomms layer when something is received for + the lockspace. It could be either a MSG (normal message sent as part of + standard locking activity) or an RCOM (recovery message sent as part of + lockspace recovery). 
*/ + +void dlm_receive_buffer(union dlm_packet *p, int nodeid) +{ + struct dlm_header *hd = &p->header; + struct dlm_ls *ls; + int type = 0; + + switch (hd->h_cmd) { + case DLM_MSG: + dlm_message_in(&p->message); + type = p->message.m_type; + break; + case DLM_RCOM: + dlm_rcom_in(&p->rcom); + type = p->rcom.rc_type; + break; + default: + log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid); + return; + } + + if (hd->h_nodeid != nodeid) { + log_print("invalid h_nodeid %d from %d lockspace %x", + hd->h_nodeid, nodeid, hd->h_lockspace); + return; + } + + ls = dlm_find_lockspace_global(hd->h_lockspace); + if (!ls) { + if (dlm_config.ci_log_debug) + log_print("invalid lockspace %x from %d cmd %d type %d", + hd->h_lockspace, nodeid, hd->h_cmd, type); + + if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) + dlm_send_ls_not_ready(nodeid, &p->rcom); + return; + } + + /* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to + be inactive (in this ls) before transitioning to recovery mode */ + + down_read(&ls->ls_recv_active); + if (hd->h_cmd == DLM_MSG) + dlm_receive_message(ls, &p->message, nodeid); + else + dlm_receive_rcom(ls, &p->rcom, nodeid); + up_read(&ls->ls_recv_active); + + dlm_put_lockspace(ls); +} + +static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms_stub) +{ + if (middle_conversion(lkb)) { + hold_lkb(lkb); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CONVERT_REPLY; + ms_stub->m_result = -EINPROGRESS; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_convert_reply(lkb, ms_stub); + + /* Same special case as in receive_rcom_lock_args() */ + lkb->lkb_grmode = DLM_LOCK_IV; + rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT); + unhold_lkb(lkb); + + } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) { + lkb->lkb_flags |= DLM_IFL_RESEND; + } + + /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down + conversions are async; there's no reply from the remote master */ +} + +/* A waiting lkb needs recovery if the master node has failed, or + the master node is changing (only when no directory is used) */ + +static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + if (dlm_is_removed(ls, lkb->lkb_nodeid)) + return 1; + + if (!dlm_no_directory(ls)) + return 0; + + if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid) + return 1; + + return 0; +} + +/* Recovery for locks that are waiting for replies from nodes that are now + gone. We can just complete unlocks and cancels by faking a reply from the + dead node. Requests and up-conversions we flag to be resent after + recovery. Down-conversions can just be completed with a fake reply like + unlocks. Conversions between PR and CW need special attention. 
*/ + +void dlm_recover_waiters_pre(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb, *safe; + struct dlm_message *ms_stub; + int wait_type, stub_unlock_result, stub_cancel_result; + + ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); + if (!ms_stub) { + log_error(ls, "dlm_recover_waiters_pre no mem"); + return; + } + + mutex_lock(&ls->ls_waiters_mutex); + + list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { + + /* exclude debug messages about unlocks because there can be so + many and they aren't very interesting */ + + if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { + log_debug(ls, "recover_waiter %x nodeid %d " + "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_wait_type, lkb->lkb_wait_nodeid); + } + + /* all outstanding lookups, regardless of destination will be + resent after recovery is done */ + + if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) { + lkb->lkb_flags |= DLM_IFL_RESEND; + continue; + } + + if (!waiter_needs_recovery(ls, lkb)) + continue; + + wait_type = lkb->lkb_wait_type; + stub_unlock_result = -DLM_EUNLOCK; + stub_cancel_result = -DLM_ECANCEL; + + /* Main reply may have been received leaving a zero wait_type, + but a reply for the overlapping op may not have been + received. In that case we need to fake the appropriate + reply for the overlap op. */ + + if (!wait_type) { + if (is_overlap_cancel(lkb)) { + wait_type = DLM_MSG_CANCEL; + if (lkb->lkb_grmode == DLM_LOCK_IV) + stub_cancel_result = 0; + } + if (is_overlap_unlock(lkb)) { + wait_type = DLM_MSG_UNLOCK; + if (lkb->lkb_grmode == DLM_LOCK_IV) + stub_unlock_result = -ENOENT; + } + + log_debug(ls, "rwpre overlap %x %x %d %d %d", + lkb->lkb_id, lkb->lkb_flags, wait_type, + stub_cancel_result, stub_unlock_result); + } + + switch (wait_type) { + + case DLM_MSG_REQUEST: + lkb->lkb_flags |= DLM_IFL_RESEND; + break; + + case DLM_MSG_CONVERT: + recover_convert_waiter(ls, lkb, ms_stub); + break; + + case DLM_MSG_UNLOCK: + hold_lkb(lkb); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_UNLOCK_REPLY; + ms_stub->m_result = stub_unlock_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_unlock_reply(lkb, ms_stub); + dlm_put_lkb(lkb); + break; + + case DLM_MSG_CANCEL: + hold_lkb(lkb); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CANCEL_REPLY; + ms_stub->m_result = stub_cancel_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_cancel_reply(lkb, ms_stub); + dlm_put_lkb(lkb); + break; + + default: + log_error(ls, "invalid lkb wait_type %d %d", + lkb->lkb_wait_type, wait_type); + } + schedule(); + } + mutex_unlock(&ls->ls_waiters_mutex); + kfree(ms_stub); +} + +static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + int found = 0; + + mutex_lock(&ls->ls_waiters_mutex); + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (lkb->lkb_flags & DLM_IFL_RESEND) { + hold_lkb(lkb); + found = 1; + break; + } + } + mutex_unlock(&ls->ls_waiters_mutex); + + if (!found) + lkb = NULL; + return lkb; +} + +/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the + master or dir-node for r. Processing the lkb may result in it being placed + back on waiters. */ + +/* We do this after normal locking has been enabled and any saved messages + (in requestqueue) have been processed. We should be confident that at + this point we won't get or process a reply to any of these waiting + operations.
But, new ops may be coming in on the rsbs/locks here from + userspace or remotely. */ + +/* there may have been an overlap unlock/cancel prior to recovery or after + recovery. if before, the lkb may still have a pos wait_count; if after, the + overlap flag would just have been set and nothing new sent. we can be + confident here than any replies to either the initial op or overlap ops + prior to recovery have been received. */ + +int dlm_recover_waiters_post(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error = 0, mstype, err, oc, ou; + + while (1) { + if (dlm_locking_stopped(ls)) { + log_debug(ls, "recover_waiters_post aborted"); + error = -EINTR; + break; + } + + lkb = find_resend_waiter(ls); + if (!lkb) + break; + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + mstype = lkb->lkb_wait_type; + oc = is_overlap_cancel(lkb); + ou = is_overlap_unlock(lkb); + err = 0; + + log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d", + lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid); + + /* At this point we assume that we won't get a reply to any + previous op or overlap op on this lock. First, do a big + remove_from_waiters() for all previous ops. */ + + lkb->lkb_flags &= ~DLM_IFL_RESEND; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + lkb->lkb_wait_type = 0; + lkb->lkb_wait_count = 0; + mutex_lock(&ls->ls_waiters_mutex); + list_del_init(&lkb->lkb_wait_reply); + mutex_unlock(&ls->ls_waiters_mutex); + unhold_lkb(lkb); /* for waiters list */ + + if (oc || ou) { + /* do an unlock or cancel instead of resending */ + switch (mstype) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + queue_cast(r, lkb, ou ? -DLM_EUNLOCK : + -DLM_ECANCEL); + unhold_lkb(lkb); /* undoes create_lkb() */ + break; + case DLM_MSG_CONVERT: + if (oc) { + queue_cast(r, lkb, -DLM_ECANCEL); + } else { + lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK; + _unlock_lock(r, lkb); + } + break; + default: + err = 1; + } + } else { + switch (mstype) { + case DLM_MSG_LOOKUP: + case DLM_MSG_REQUEST: + _request_lock(r, lkb); + if (is_master(r)) + confirm_master(r, 0); + break; + case DLM_MSG_CONVERT: + _convert_lock(r, lkb); + break; + default: + err = 1; + } + } + + if (err) + log_error(ls, "recover_waiters_post %x %d %x %d %d", + lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou); + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + } + + return error; +} + +static void purge_queue(struct dlm_rsb *r, struct list_head *queue, + int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb)) +{ + struct dlm_ls *ls = r->res_ls; + struct dlm_lkb *lkb, *safe; + + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) { + if (test(ls, lkb)) { + rsb_set_flag(r, RSB_LOCKS_PURGED); + del_lkb(r, lkb); + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged lkb not released"); + } + } +} + +static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid)); +} + +static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + return is_master_copy(lkb); +} + +static void purge_dead_locks(struct dlm_rsb *r) +{ + purge_queue(r, &r->res_grantqueue, &purge_dead_test); + purge_queue(r, &r->res_convertqueue, &purge_dead_test); + purge_queue(r, &r->res_waitqueue, &purge_dead_test); +} + +void dlm_purge_mstcpy_locks(struct dlm_rsb *r) +{ + purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test); + purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test); + purge_queue(r, 
&r->res_waitqueue, &purge_mstcpy_test); +} + +/* Get rid of locks held by nodes that are gone. */ + +int dlm_purge_locks(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + + log_debug(ls, "dlm_purge_locks"); + + down_write(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + hold_rsb(r); + lock_rsb(r); + if (is_master(r)) + purge_dead_locks(r); + unlock_rsb(r); + unhold_rsb(r); + + schedule(); + } + up_write(&ls->ls_root_sem); + + return 0; +} + +static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) +{ + struct dlm_rsb *r, *r_ret = NULL; + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) { + if (!rsb_flag(r, RSB_LOCKS_PURGED)) + continue; + hold_rsb(r); + rsb_clear_flag(r, RSB_LOCKS_PURGED); + r_ret = r; + break; + } + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return r_ret; +} + +void dlm_grant_after_purge(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int bucket = 0; + + while (1) { + r = find_purged_rsb(ls, bucket); + if (!r) { + if (bucket == ls->ls_rsbtbl_size - 1) + break; + bucket++; + continue; + } + lock_rsb(r); + if (is_master(r)) { + grant_pending_locks(r); + confirm_master(r, 0); + } + unlock_rsb(r); + put_rsb(r); + schedule(); + } +} + +static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid, + uint32_t remid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, head, lkb_statequeue) { + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) + return lkb; + } + return NULL; +} + +static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid, + uint32_t remid) +{ + struct dlm_lkb *lkb; + + lkb = search_remid_list(&r->res_grantqueue, nodeid, remid); + if (lkb) + return lkb; + lkb = search_remid_list(&r->res_convertqueue, nodeid, remid); + if (lkb) + return lkb; + lkb = search_remid_list(&r->res_waitqueue, nodeid, remid); + if (lkb) + return lkb; + return NULL; +} + +/* needs at least dlm_rcom + rcom_lock */ +static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_rsb *r, struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + + lkb->lkb_nodeid = rc->rc_header.h_nodeid; + lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid); + lkb->lkb_remid = le32_to_cpu(rl->rl_lkid); + lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags); + lkb->lkb_flags = le32_to_cpu(rl->rl_flags) & 0x0000FFFF; + lkb->lkb_flags |= DLM_IFL_MSTCPY; + lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq); + lkb->lkb_rqmode = rl->rl_rqmode; + lkb->lkb_grmode = rl->rl_grmode; + /* don't set lkb_status because add_lkb wants to itself */ + + lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL; + + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - + sizeof(struct rcom_lock); + if (lvblen > ls->ls_lvblen) + return -EINVAL; + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen); + } + + /* Conversions between PR and CW (middle modes) need special handling. 
+ The real granted mode of these converting locks cannot be determined + until all locks have been rebuilt on the rsb (recover_conversion) */ + + if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) && + middle_conversion(lkb)) { + rl->rl_status = DLM_LKSTS_CONVERT; + lkb->lkb_grmode = DLM_LOCK_IV; + rsb_set_flag(r, RSB_RECOVER_CONVERT); + } + + return 0; +} + +/* This lkb may have been recovered in a previous aborted recovery so we need + to check if the rsb already has an lkb with the given remote nodeid/lkid. + If so we just send back a standard reply. If not, we create a new lkb with + the given values and send back our lkid. We send back our lkid by sending + back the rcom_lock struct we got but with the remid field filled in. */ + +/* needs at least dlm_rcom + rcom_lock */ +int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + struct dlm_rsb *r; + struct dlm_lkb *lkb; + int error; + + if (rl->rl_parent_lkid) { + error = -EOPNOTSUPP; + goto out; + } + + error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), + R_MASTER, &r); + if (error) + goto out; + + lock_rsb(r); + + lkb = search_remid(r, rc->rc_header.h_nodeid, le32_to_cpu(rl->rl_lkid)); + if (lkb) { + error = -EEXIST; + goto out_remid; + } + + error = create_lkb(ls, &lkb); + if (error) + goto out_unlock; + + error = receive_rcom_lock_args(ls, lkb, r, rc); + if (error) { + __put_lkb(ls, lkb); + goto out_unlock; + } + + attach_lkb(r, lkb); + add_lkb(r, lkb, rl->rl_status); + error = 0; + + out_remid: + /* this is the new value returned to the lock holder for + saving in its process-copy lkb */ + rl->rl_remid = cpu_to_le32(lkb->lkb_id); + + out_unlock: + unlock_rsb(r); + put_rsb(r); + out: + if (error) + log_debug(ls, "recover_master_copy %d %x", error, + le32_to_cpu(rl->rl_lkid)); + rl->rl_result = cpu_to_le32(error); + return error; +} + +/* needs at least dlm_rcom + rcom_lock */ +int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; + struct dlm_rsb *r; + struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, le32_to_cpu(rl->rl_lkid), &lkb); + if (error) { + log_error(ls, "recover_process_copy no lkid %x", + le32_to_cpu(rl->rl_lkid)); + return error; + } + + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); + + error = le32_to_cpu(rl->rl_result); + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + switch (error) { + case -EBADR: + /* There's a chance the new master received our lock before + dlm_recover_master_reply(), this wouldn't happen if we did + a barrier between recover_masters and recover_locks. 
*/ + log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id, + (unsigned long)r, r->res_name); + dlm_send_rcom_lock(r, lkb); + goto out; + case -EEXIST: + log_debug(ls, "master copy exists %x", lkb->lkb_id); + /* fall through */ + case 0: + lkb->lkb_remid = le32_to_cpu(rl->rl_remid); + break; + default: + log_error(ls, "dlm_recover_process_copy unknown error %d %x", + error, lkb->lkb_id); + } + + /* an ack for dlm_recover_locks() which waits for replies from + all the locks it sends to new masters */ + dlm_recovered_lock(r); + out: + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + + return 0; +} + +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + int error; + + dlm_lock_recovery(ls); + + error = create_lkb(ls, &lkb); + if (error) { + kfree(ua); + goto out; + } + + if (flags & DLM_LKF_VALBLK) { + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + kfree(ua); + __put_lkb(ls, lkb); + error = -ENOMEM; + goto out; + } + } + + /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + When DLM_IFL_USER is set, the dlm knows that this is a userspace + lock and that lkb_astparam is the dlm_user_args structure. */ + + error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, + fake_astfn, ua, fake_bastfn, &args); + lkb->lkb_flags |= DLM_IFL_USER; + + if (error) { + __put_lkb(ls, lkb); + goto out; + } + + error = request_lock(ls, lkb, name, namelen, &args); + + switch (error) { + case 0: + break; + case -EINPROGRESS: + error = 0; + break; + case -EAGAIN: + error = 0; + /* fall through */ + default: + __put_lkb(ls, lkb); + goto out; + } + + /* add this new lkb to the per-process list of locks */ + spin_lock(&ua->proc->locks_spin); + hold_lkb(lkb); + list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); + spin_unlock(&ua->proc->locks_spin); + out: + dlm_unlock_recovery(ls); + return error; +} + +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in, + unsigned long timeout_cs) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + /* user can change the params on its lock when it converts it, or + add an lvb that didn't exist before */ + + ua = lkb->lkb_ua; + + if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + error = -ENOMEM; + goto out_put; + } + } + if (lvb_in && ua->lksb.sb_lvbptr) + memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); + + ua->xid = ua_tmp->xid; + ua->castparam = ua_tmp->castparam; + ua->castaddr = ua_tmp->castaddr; + ua->bastparam = ua_tmp->bastparam; + ua->bastaddr = ua_tmp->bastaddr; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, + fake_astfn, ua, fake_bastfn, &args); + if (error) + goto out_put; + + error = convert_lock(ls, lkb, &args); + + if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid, char *lvb_in) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args 
*ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = lkb->lkb_ua; + + if (lvb_in && ua->lksb.sb_lvbptr) + memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); + if (ua_tmp->castparam) + ua->castparam = ua_tmp->castparam; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + error = unlock_lock(ls, lkb, &args); + + if (error == -DLM_EUNLOCK) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK)) + error = 0; + if (error) + goto out_put; + + spin_lock(&ua->proc->locks_spin); + /* dlm_user_add_ast() may have already taken lkb off the proc list */ + if (!list_empty(&lkb->lkb_ownqueue)) + list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); + spin_unlock(&ua->proc->locks_spin); + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = lkb->lkb_ua; + if (ua_tmp->castparam) + ua->castparam = ua_tmp->castparam; + ua->user_lksb = ua_tmp->user_lksb; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + error = cancel_lock(ls, lkb, &args); + + if (error == -DLM_ECANCEL) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + kfree(ua_tmp); + return error; +} + +int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) +{ + struct dlm_lkb *lkb; + struct dlm_args args; + struct dlm_user_args *ua; + struct dlm_rsb *r; + int error; + + dlm_lock_recovery(ls); + + error = find_lkb(ls, lkid, &lkb); + if (error) + goto out; + + ua = lkb->lkb_ua; + + error = set_unlock_args(flags, ua, &args); + if (error) + goto out_put; + + /* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */ + + r = lkb->lkb_resource; + hold_rsb(r); + lock_rsb(r); + + error = validate_unlock_args(lkb, &args); + if (error) + goto out_r; + lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL; + + error = _cancel_lock(r, lkb); + out_r: + unlock_rsb(r); + put_rsb(r); + + if (error == -DLM_ECANCEL) + error = 0; + /* from validate_unlock_args() */ + if (error == -EBUSY) + error = 0; + out_put: + dlm_put_lkb(lkb); + out: + dlm_unlock_recovery(ls); + return error; +} + +/* lkb's that are removed from the waiters list by revert are just left on the + orphans list with the granted orphan locks, to be freed by purge */ + +static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_args args; + int error; + + hold_lkb(lkb); + mutex_lock(&ls->ls_orphans_mutex); + list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans); + mutex_unlock(&ls->ls_orphans_mutex); + + set_unlock_args(0, lkb->lkb_ua, &args); + + error = cancel_lock(ls, lkb, &args); + if (error == -DLM_ECANCEL) + error = 0; + return error; +} + +/* The force flag allows the unlock to go ahead even if the lkb isn't granted. + Regardless of what rsb queue the lock is on, it's removed and freed. 
*/ + +static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) +{ + struct dlm_args args; + int error; + + set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args); + + error = unlock_lock(ls, lkb, &args); + if (error == -DLM_EUNLOCK) + error = 0; + return error; +} + +/* We have to release clear_proc_locks mutex before calling unlock_proc_lock() + (which does lock_rsb) due to deadlock with receiving a message that does + lock_rsb followed by dlm_user_add_ast() */ + +static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, + struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb = NULL; + + mutex_lock(&ls->ls_clear_proc_locks); + if (list_empty(&proc->locks)) + goto out; + + lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue); + list_del_init(&lkb->lkb_ownqueue); + + if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) + lkb->lkb_flags |= DLM_IFL_ORPHAN; + else + lkb->lkb_flags |= DLM_IFL_DEAD; + out: + mutex_unlock(&ls->ls_clear_proc_locks); + return lkb; +} + +/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which + 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts, + which we clear here. */ + +/* proc CLOSING flag is set so no more device_reads should look at proc->asts + list, and no more device_writes should add lkb's to proc->locks list; so we + shouldn't need to take asts_spin or locks_spin here. this assumes that + device reads/writes/closes are serialized -- FIXME: we may need to serialize + them ourself. */ + +void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb, *safe; + + dlm_lock_recovery(ls); + + while (1) { + lkb = del_proc_lock(ls, proc); + if (!lkb) + break; + del_timeout(lkb); + if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) + orphan_proc_lock(ls, lkb); + else + unlock_proc_lock(ls, lkb); + + /* this removes the reference for the proc->locks list + added by dlm_user_request, it may result in the lkb + being freed */ + + dlm_put_lkb(lkb); + } + + mutex_lock(&ls->ls_clear_proc_locks); + + /* in-progress unlocks */ + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags |= DLM_IFL_DEAD; + dlm_put_lkb(lkb); + } + + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + memset(&lkb->lkb_callbacks, 0, + sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + list_del_init(&lkb->lkb_astqueue); + dlm_put_lkb(lkb); + } + + mutex_unlock(&ls->ls_clear_proc_locks); + dlm_unlock_recovery(ls); +} + +static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) +{ + struct dlm_lkb *lkb, *safe; + + while (1) { + lkb = NULL; + spin_lock(&proc->locks_spin); + if (!list_empty(&proc->locks)) { + lkb = list_entry(proc->locks.next, struct dlm_lkb, + lkb_ownqueue); + list_del_init(&lkb->lkb_ownqueue); + } + spin_unlock(&proc->locks_spin); + + if (!lkb) + break; + + lkb->lkb_flags |= DLM_IFL_DEAD; + unlock_proc_lock(ls, lkb); + dlm_put_lkb(lkb); /* ref from proc->locks list */ + } + + spin_lock(&proc->locks_spin); + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags |= DLM_IFL_DEAD; + dlm_put_lkb(lkb); + } + spin_unlock(&proc->locks_spin); + + spin_lock(&proc->asts_spin); + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + memset(&lkb->lkb_callbacks, 0, + sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + list_del_init(&lkb->lkb_astqueue); + dlm_put_lkb(lkb); + } + spin_unlock(&proc->asts_spin); +} + +/* pid of 0 means 
purge all orphans */ + +static void do_purge(struct dlm_ls *ls, int nodeid, int pid) +{ + struct dlm_lkb *lkb, *safe; + + mutex_lock(&ls->ls_orphans_mutex); + list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) { + if (pid && lkb->lkb_ownpid != pid) + continue; + unlock_proc_lock(ls, lkb); + list_del_init(&lkb->lkb_ownqueue); + dlm_put_lkb(lkb); + } + mutex_unlock(&ls->ls_orphans_mutex); +} + +static int send_purge(struct dlm_ls *ls, int nodeid, int pid) +{ + struct dlm_message *ms; + struct dlm_mhandle *mh; + int error; + + error = _create_message(ls, sizeof(struct dlm_message), nodeid, + DLM_MSG_PURGE, &ms, &mh); + if (error) + return error; + ms->m_nodeid = nodeid; + ms->m_pid = pid; + + return send_message(mh, ms); +} + +int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, + int nodeid, int pid) +{ + int error = 0; + + if (nodeid != dlm_our_nodeid()) { + error = send_purge(ls, nodeid, pid); + } else { + dlm_lock_recovery(ls); + if (pid == current->pid) + purge_proc_locks(ls, proc); + else + do_purge(ls, nodeid, pid); + dlm_unlock_recovery(ls); + } + return error; +} + diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h new file mode 100644 index 00000000..265017a7 --- /dev/null +++ b/fs/dlm/lock.h @@ -0,0 +1,70 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCK_DOT_H__ +#define __LOCK_DOT_H__ + +void dlm_dump_rsb(struct dlm_rsb *r); +void dlm_print_lkb(struct dlm_lkb *lkb); +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); +void dlm_receive_buffer(union dlm_packet *p, int nodeid); +int dlm_modes_compat(int mode1, int mode2); +void dlm_put_rsb(struct dlm_rsb *r); +void dlm_hold_rsb(struct dlm_rsb *r); +int dlm_put_lkb(struct dlm_lkb *lkb); +void dlm_scan_rsbs(struct dlm_ls *ls); +int dlm_lock_recovery_try(struct dlm_ls *ls); +void dlm_unlock_recovery(struct dlm_ls *ls); +void dlm_scan_waiters(struct dlm_ls *ls); +void dlm_scan_timeout(struct dlm_ls *ls); +void dlm_adjust_timeouts(struct dlm_ls *ls); + +int dlm_purge_locks(struct dlm_ls *ls); +void dlm_purge_mstcpy_locks(struct dlm_rsb *r); +void dlm_grant_after_purge(struct dlm_ls *ls); +int dlm_recover_waiters_post(struct dlm_ls *ls); +void dlm_recover_waiters_pre(struct dlm_ls *ls); +int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); + +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in, + unsigned long timeout_cs); +int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid, char *lvb_in); +int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + uint32_t flags, uint32_t lkid); +int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, + int nodeid, int pid); 
+int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid); +void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); + +static inline int is_master(struct dlm_rsb *r) +{ + return !r->res_nodeid; +} + +static inline void lock_rsb(struct dlm_rsb *r) +{ + mutex_lock(&r->res_mutex); +} + +static inline void unlock_rsb(struct dlm_rsb *r) +{ + mutex_unlock(&r->res_mutex); +} + +#endif + diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c new file mode 100644 index 00000000..14cbf409 --- /dev/null +++ b/fs/dlm/lockspace.c @@ -0,0 +1,844 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "recoverd.h" +#include "ast.h" +#include "dir.h" +#include "lowcomms.h" +#include "config.h" +#include "memory.h" +#include "lock.h" +#include "recover.h" +#include "requestqueue.h" +#include "user.h" + +static int ls_count; +static struct mutex ls_lock; +static struct list_head lslist; +static spinlock_t lslist_lock; +static struct task_struct * scand_task; + + +static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ssize_t ret = len; + int n = simple_strtol(buf, NULL, 0); + + ls = dlm_find_lockspace_local(ls->ls_local_handle); + if (!ls) + return -EINVAL; + + switch (n) { + case 0: + dlm_ls_stop(ls); + break; + case 1: + dlm_ls_start(ls); + break; + default: + ret = -EINVAL; + } + dlm_put_lockspace(ls); + return ret; +} + +static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ls->ls_uevent_result = simple_strtol(buf, NULL, 0); + set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags); + wake_up(&ls->ls_uevent_wait); + return len; +} + +static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id); +} + +static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + ls->ls_global_id = simple_strtoul(buf, NULL, 0); + return len; +} + +static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf) +{ + uint32_t status = dlm_recover_status(ls); + return snprintf(buf, PAGE_SIZE, "%x\n", status); +} + +static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid); +} + +struct dlm_attr { + struct attribute attr; + ssize_t (*show)(struct dlm_ls *, char *); + ssize_t (*store)(struct dlm_ls *, const char *, size_t); +}; + +static struct dlm_attr dlm_attr_control = { + .attr = {.name = "control", .mode = S_IWUSR}, + .store = dlm_control_store +}; + +static struct dlm_attr dlm_attr_event = { + .attr = {.name = "event_done", .mode = S_IWUSR}, + .store = dlm_event_store +}; + +static struct dlm_attr dlm_attr_id = { + .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR}, + .show = dlm_id_show, + .store = dlm_id_store +}; + +static struct dlm_attr dlm_attr_recover_status = { + .attr = {.name = "recover_status", 
.mode = S_IRUGO}, + .show = dlm_recover_status_show +}; + +static struct dlm_attr dlm_attr_recover_nodeid = { + .attr = {.name = "recover_nodeid", .mode = S_IRUGO}, + .show = dlm_recover_nodeid_show +}; + +static struct attribute *dlm_attrs[] = { + &dlm_attr_control.attr, + &dlm_attr_event.attr, + &dlm_attr_id.attr, + &dlm_attr_recover_status.attr, + &dlm_attr_recover_nodeid.attr, + NULL, +}; + +static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a->show ? a->show(ls, buf) : 0; +} + +static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); + return a->store ? a->store(ls, buf, len) : len; +} + +static void lockspace_kobj_release(struct kobject *k) +{ + struct dlm_ls *ls = container_of(k, struct dlm_ls, ls_kobj); + kfree(ls); +} + +static const struct sysfs_ops dlm_attr_ops = { + .show = dlm_attr_show, + .store = dlm_attr_store, +}; + +static struct kobj_type dlm_ktype = { + .default_attrs = dlm_attrs, + .sysfs_ops = &dlm_attr_ops, + .release = lockspace_kobj_release, +}; + +static struct kset *dlm_kset; + +static int do_uevent(struct dlm_ls *ls, int in) +{ + int error; + + if (in) + kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE); + else + kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); + + log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving"); + + /* dlm_controld will see the uevent, do the necessary group management + and then write to sysfs to wake us */ + + error = wait_event_interruptible(ls->ls_uevent_wait, + test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); + + log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result); + + if (error) + goto out; + + error = ls->ls_uevent_result; + out: + if (error) + log_error(ls, "group %s failed %d %d", in ? 
"join" : "leave", + error, ls->ls_uevent_result); + return error; +} + +static int dlm_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + + add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name); + return 0; +} + +static struct kset_uevent_ops dlm_uevent_ops = { + .uevent = dlm_uevent, +}; + +int __init dlm_lockspace_init(void) +{ + ls_count = 0; + mutex_init(&ls_lock); + INIT_LIST_HEAD(&lslist); + spin_lock_init(&lslist_lock); + + dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj); + if (!dlm_kset) { + printk(KERN_WARNING "%s: can not create kset\n", __func__); + return -ENOMEM; + } + return 0; +} + +void dlm_lockspace_exit(void) +{ + kset_unregister(dlm_kset); +} + +static struct dlm_ls *find_ls_to_scan(void) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (time_after_eq(jiffies, ls->ls_scan_time + + dlm_config.ci_scan_secs * HZ)) { + spin_unlock(&lslist_lock); + return ls; + } + } + spin_unlock(&lslist_lock); + return NULL; +} + +static int dlm_scand(void *data) +{ + struct dlm_ls *ls; + + while (!kthread_should_stop()) { + ls = find_ls_to_scan(); + if (ls) { + if (dlm_lock_recovery_try(ls)) { + ls->ls_scan_time = jiffies; + dlm_scan_rsbs(ls); + dlm_scan_timeout(ls); + dlm_scan_waiters(ls); + dlm_unlock_recovery(ls); + } else { + ls->ls_scan_time += HZ; + } + continue; + } + schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); + } + return 0; +} + +static int dlm_scand_start(void) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(dlm_scand, NULL, "dlm_scand"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + scand_task = p; + return error; +} + +static void dlm_scand_stop(void) +{ + kthread_stop(scand_task); +} + +struct dlm_ls *dlm_find_lockspace_global(uint32_t id) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_global_id == id) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_local_handle == lockspace) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +struct dlm_ls *dlm_find_lockspace_device(int minor) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (ls->ls_device.minor == minor) { + ls->ls_count++; + goto out; + } + } + ls = NULL; + out: + spin_unlock(&lslist_lock); + return ls; +} + +void dlm_put_lockspace(struct dlm_ls *ls) +{ + spin_lock(&lslist_lock); + ls->ls_count--; + spin_unlock(&lslist_lock); +} + +static void remove_lockspace(struct dlm_ls *ls) +{ + for (;;) { + spin_lock(&lslist_lock); + if (ls->ls_count == 0) { + WARN_ON(ls->ls_create_count != 0); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); + return; + } + spin_unlock(&lslist_lock); + ssleep(1); + } +} + +static int threads_start(void) +{ + int error; + + /* Thread which process lock requests for all lockspace's */ + error = dlm_astd_start(); + if (error) { + log_print("cannot start dlm_astd thread %d", error); + goto fail; + } + + error = dlm_scand_start(); + if (error) { + log_print("cannot start dlm_scand thread %d", error); + goto astd_fail; + } + + /* Thread for sending/receiving messages for all lockspace's 
*/ + error = dlm_lowcomms_start(); + if (error) { + log_print("cannot start dlm lowcomms %d", error); + goto scand_fail; + } + + return 0; + + scand_fail: + dlm_scand_stop(); + astd_fail: + dlm_astd_stop(); + fail: + return error; +} + +static void threads_stop(void) +{ + dlm_scand_stop(); + dlm_lowcomms_stop(); + dlm_astd_stop(); +} + +static int new_lockspace(const char *name, int namelen, void **lockspace, + uint32_t flags, int lvblen) +{ + struct dlm_ls *ls; + int i, size, error; + int do_unreg = 0; + + if (namelen > DLM_LOCKSPACE_LEN) + return -EINVAL; + + if (!lvblen || (lvblen % 8)) + return -EINVAL; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + if (!dlm_user_daemon_available()) { + module_put(THIS_MODULE); + return -EUNATCH; + } + + error = 0; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + WARN_ON(ls->ls_create_count <= 0); + if (ls->ls_namelen != namelen) + continue; + if (memcmp(ls->ls_name, name, namelen)) + continue; + if (flags & DLM_LSFL_NEWEXCL) { + error = -EEXIST; + break; + } + ls->ls_create_count++; + *lockspace = ls; + error = 1; + break; + } + spin_unlock(&lslist_lock); + + if (error) + goto out; + + error = -ENOMEM; + + ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); + if (!ls) + goto out; + memcpy(ls->ls_name, name, namelen); + ls->ls_namelen = namelen; + ls->ls_lvblen = lvblen; + ls->ls_count = 0; + ls->ls_flags = 0; + ls->ls_scan_time = jiffies; + + if (flags & DLM_LSFL_TIMEWARN) + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + + /* ls_exflags are forced to match among nodes, and we don't + need to require all nodes to have some flags set */ + ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | + DLM_LSFL_NEWEXCL)); + + size = dlm_config.ci_rsbtbl_size; + ls->ls_rsbtbl_size = size; + + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS); + if (!ls->ls_rsbtbl) + goto out_lsfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); + spin_lock_init(&ls->ls_rsbtbl[i].lock); + } + + size = dlm_config.ci_lkbtbl_size; + ls->ls_lkbtbl_size = size; + + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS); + if (!ls->ls_lkbtbl) + goto out_rsbfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list); + rwlock_init(&ls->ls_lkbtbl[i].lock); + ls->ls_lkbtbl[i].counter = 1; + } + + size = dlm_config.ci_dirtbl_size; + ls->ls_dirtbl_size = size; + + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS); + if (!ls->ls_dirtbl) + goto out_lkbfree; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&ls->ls_dirtbl[i].list); + spin_lock_init(&ls->ls_dirtbl[i].lock); + } + + INIT_LIST_HEAD(&ls->ls_waiters); + mutex_init(&ls->ls_waiters_mutex); + INIT_LIST_HEAD(&ls->ls_orphans); + mutex_init(&ls->ls_orphans_mutex); + INIT_LIST_HEAD(&ls->ls_timeout); + mutex_init(&ls->ls_timeout_mutex); + + INIT_LIST_HEAD(&ls->ls_nodes); + INIT_LIST_HEAD(&ls->ls_nodes_gone); + ls->ls_num_nodes = 0; + ls->ls_low_nodeid = 0; + ls->ls_total_weight = 0; + ls->ls_node_array = NULL; + + memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb)); + ls->ls_stub_rsb.res_ls = ls; + + ls->ls_debug_rsb_dentry = NULL; + ls->ls_debug_waiters_dentry = NULL; + + init_waitqueue_head(&ls->ls_uevent_wait); + ls->ls_uevent_result = 0; + init_completion(&ls->ls_members_done); + ls->ls_members_result = -1; + + ls->ls_recoverd_task = NULL; + mutex_init(&ls->ls_recoverd_active); + spin_lock_init(&ls->ls_recover_lock); + 
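The resource, lock and directory tables initialised above (ls_rsbtbl, ls_lkbtbl, ls_dirtbl) share one layout: an array of buckets sized from dlm_config, each bucket owning its own list head and lock so lookups in different buckets never contend. A minimal user-space sketch of that layout, with pthreads standing in for the kernel locks and a made-up bucket count:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned int key;
	struct node *next;
};

struct bucket {
	struct node *head;		/* chained entries, like ls_rsbtbl[i].list */
	pthread_mutex_t lock;		/* per-bucket lock, like ls_rsbtbl[i].lock */
};

static struct bucket *table;
static unsigned int table_size;		/* stands in for dlm_config.ci_rsbtbl_size */

static int table_init(unsigned int size)
{
	unsigned int i;

	table = calloc(size, sizeof(*table));
	if (!table)
		return -1;
	table_size = size;
	for (i = 0; i < size; i++)
		pthread_mutex_init(&table[i].lock, NULL);
	return 0;
}

static void table_insert(struct node *n)
{
	struct bucket *b = &table[n->key % table_size];

	/* only this bucket is held; other buckets stay usable */
	pthread_mutex_lock(&b->lock);
	n->next = b->head;
	b->head = n;
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct node n = { .key = 42 };

	if (table_init(256))
		return 1;
	table_insert(&n);
	printf("key %u landed in bucket %u\n", n.key, n.key % table_size);
	return 0;
}

Keeping the lock inside the bucket, as the kernel tables do, lets contention scale down with the configured table size instead of serialising on one global lock.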
spin_lock_init(&ls->ls_rcom_spin); + get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); + ls->ls_recover_status = 0; + ls->ls_recover_seq = 0; + ls->ls_recover_args = NULL; + init_rwsem(&ls->ls_in_recovery); + init_rwsem(&ls->ls_recv_active); + INIT_LIST_HEAD(&ls->ls_requestqueue); + mutex_init(&ls->ls_requestqueue_mutex); + mutex_init(&ls->ls_clear_proc_locks); + + ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); + if (!ls->ls_recover_buf) + goto out_dirfree; + + INIT_LIST_HEAD(&ls->ls_recover_list); + spin_lock_init(&ls->ls_recover_list_lock); + ls->ls_recover_list_count = 0; + ls->ls_local_handle = ls; + init_waitqueue_head(&ls->ls_wait_general); + INIT_LIST_HEAD(&ls->ls_root_list); + init_rwsem(&ls->ls_root_sem); + + down_write(&ls->ls_in_recovery); + + spin_lock(&lslist_lock); + ls->ls_create_count = 1; + list_add(&ls->ls_list, &lslist); + spin_unlock(&lslist_lock); + + /* needs to find ls in lslist */ + error = dlm_recoverd_start(ls); + if (error) { + log_error(ls, "can't start dlm_recoverd %d", error); + goto out_delist; + } + + ls->ls_kobj.kset = dlm_kset; + error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, + "%s", ls->ls_name); + if (error) + goto out_stop; + kobject_uevent(&ls->ls_kobj, KOBJ_ADD); + + /* let kobject handle freeing of ls if there's an error */ + do_unreg = 1; + + /* This uevent triggers dlm_controld in userspace to add us to the + group of nodes that are members of this lockspace (managed by the + cluster infrastructure.) Once it's done that, it tells us who the + current lockspace members are (via configfs) and then tells the + lockspace to start running (via sysfs) in dlm_ls_start(). */ + + error = do_uevent(ls, 1); + if (error) + goto out_stop; + + wait_for_completion(&ls->ls_members_done); + error = ls->ls_members_result; + if (error) + goto out_members; + + dlm_create_debug_file(ls); + + log_debug(ls, "join complete"); + *lockspace = ls; + return 0; + + out_members: + do_uevent(ls, 0); + dlm_clear_members(ls); + kfree(ls->ls_node_array); + out_stop: + dlm_recoverd_stop(ls); + out_delist: + spin_lock(&lslist_lock); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); + kfree(ls->ls_recover_buf); + out_dirfree: + kfree(ls->ls_dirtbl); + out_lkbfree: + kfree(ls->ls_lkbtbl); + out_rsbfree: + kfree(ls->ls_rsbtbl); + out_lsfree: + if (do_unreg) + kobject_put(&ls->ls_kobj); + else + kfree(ls); + out: + module_put(THIS_MODULE); + return error; +} + +int dlm_new_lockspace(const char *name, int namelen, void **lockspace, + uint32_t flags, int lvblen) +{ + int error = 0; + + mutex_lock(&ls_lock); + if (!ls_count) + error = threads_start(); + if (error) + goto out; + + error = new_lockspace(name, namelen, lockspace, flags, lvblen); + if (!error) + ls_count++; + if (error > 0) + error = 0; + if (!ls_count) + threads_stop(); + out: + mutex_unlock(&ls_lock); + return error; +} + +/* Return 1 if the lockspace still has active remote locks, + * 2 if the lockspace still has active local locks. + */ +static int lockspace_busy(struct dlm_ls *ls) +{ + int i, lkb_found = 0; + struct dlm_lkb *lkb; + + /* NOTE: We check the lockidtbl here rather than the resource table. 
+ This is because there may be LKBs queued as ASTs that have been + unlinked from their RSBs and are pending deletion once the AST has + been delivered */ + + for (i = 0; i < ls->ls_lkbtbl_size; i++) { + read_lock(&ls->ls_lkbtbl[i].lock); + if (!list_empty(&ls->ls_lkbtbl[i].list)) { + lkb_found = 1; + list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list, + lkb_idtbl_list) { + if (!lkb->lkb_nodeid) { + read_unlock(&ls->ls_lkbtbl[i].lock); + return 2; + } + } + } + read_unlock(&ls->ls_lkbtbl[i].lock); + } + return lkb_found; +} + +static int release_lockspace(struct dlm_ls *ls, int force) +{ + struct dlm_lkb *lkb; + struct dlm_rsb *rsb; + struct list_head *head; + int i, busy, rv; + + busy = lockspace_busy(ls); + + spin_lock(&lslist_lock); + if (ls->ls_create_count == 1) { + if (busy > force) + rv = -EBUSY; + else { + /* remove_lockspace takes ls off lslist */ + ls->ls_create_count = 0; + rv = 0; + } + } else if (ls->ls_create_count > 1) { + rv = --ls->ls_create_count; + } else { + rv = -EINVAL; + } + spin_unlock(&lslist_lock); + + if (rv) { + log_debug(ls, "release_lockspace no remove %d", rv); + return rv; + } + + dlm_device_deregister(ls); + + if (force < 3 && dlm_user_daemon_available()) + do_uevent(ls, 0); + + dlm_recoverd_stop(ls); + + remove_lockspace(ls); + + dlm_delete_debug_file(ls); + + dlm_astd_suspend(); + + kfree(ls->ls_recover_buf); + + /* + * Free direntry structs. + */ + + dlm_dir_clear(ls); + kfree(ls->ls_dirtbl); + + /* + * Free all lkb's on lkbtbl[] lists. + */ + + for (i = 0; i < ls->ls_lkbtbl_size; i++) { + head = &ls->ls_lkbtbl[i].list; + while (!list_empty(head)) { + lkb = list_entry(head->next, struct dlm_lkb, + lkb_idtbl_list); + + list_del(&lkb->lkb_idtbl_list); + + dlm_del_ast(lkb); + + if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) + dlm_free_lvb(lkb->lkb_lvbptr); + + dlm_free_lkb(lkb); + } + } + dlm_astd_resume(); + + kfree(ls->ls_lkbtbl); + + /* + * Free all rsb's on rsbtbl[] lists + */ + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + head = &ls->ls_rsbtbl[i].list; + while (!list_empty(head)) { + rsb = list_entry(head->next, struct dlm_rsb, + res_hashchain); + + list_del(&rsb->res_hashchain); + dlm_free_rsb(rsb); + } + + head = &ls->ls_rsbtbl[i].toss; + while (!list_empty(head)) { + rsb = list_entry(head->next, struct dlm_rsb, + res_hashchain); + list_del(&rsb->res_hashchain); + dlm_free_rsb(rsb); + } + } + + kfree(ls->ls_rsbtbl); + + /* + * Free structures on any other lists + */ + + dlm_purge_requestqueue(ls); + kfree(ls->ls_recover_args); + dlm_clear_free_entries(ls); + dlm_clear_members(ls); + dlm_clear_members_gone(ls); + kfree(ls->ls_node_array); + log_debug(ls, "release_lockspace final free"); + kobject_put(&ls->ls_kobj); + /* The ls structure will be freed when the kobject is done with */ + + module_put(THIS_MODULE); + return 0; +} + +/* + * Called when a system has released all its locks and is not going to use the + * lockspace any longer. We free everything we're managing for this lockspace. + * Remaining nodes will go through the recovery process as if we'd died. The + * lockspace must continue to function as usual, participating in recoveries, + * until this returns. 
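+ *
+ * Whether removal is allowed is decided against lockspace_busy() above: it
+ * returns 0 when no LKBs remain, 1 when only remote LKBs remain and 2 when
+ * local LKBs remain, and release_lockspace() refuses with -EBUSY whenever
+ * busy > force.  So, for example, dlm_release_lockspace(ls, 2) proceeds even
+ * while local LKBs are still held.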
+ * + * Force has 4 possible values: + * 0 - don't destroy locksapce if it has any LKBs + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs + * 2 - destroy lockspace regardless of LKBs + * 3 - destroy lockspace as part of a forced shutdown + */ + +int dlm_release_lockspace(void *lockspace, int force) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + dlm_put_lockspace(ls); + + mutex_lock(&ls_lock); + error = release_lockspace(ls, force); + if (!error) + ls_count--; + if (!ls_count) + threads_stop(); + mutex_unlock(&ls_lock); + + return error; +} + +void dlm_stop_lockspaces(void) +{ + struct dlm_ls *ls; + + restart: + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) + continue; + spin_unlock(&lslist_lock); + log_error(ls, "no userland control daemon, stopping lockspace"); + dlm_ls_stop(ls); + goto restart; + } + spin_unlock(&lslist_lock); +} + diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h new file mode 100644 index 00000000..f879f879 --- /dev/null +++ b/fs/dlm/lockspace.h @@ -0,0 +1,26 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOCKSPACE_DOT_H__ +#define __LOCKSPACE_DOT_H__ + +int dlm_lockspace_init(void); +void dlm_lockspace_exit(void); +struct dlm_ls *dlm_find_lockspace_global(uint32_t id); +struct dlm_ls *dlm_find_lockspace_local(void *id); +struct dlm_ls *dlm_find_lockspace_device(int minor); +void dlm_put_lockspace(struct dlm_ls *ls); +void dlm_stop_lockspaces(void); + +#endif /* __LOCKSPACE_DOT_H__ */ + diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c new file mode 100644 index 00000000..5e2c71f0 --- /dev/null +++ b/fs/dlm/lowcomms.c @@ -0,0 +1,1572 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lowcomms.c + * + * This is the "low-level" comms layer. + * + * It is responsible for sending/receiving messages + * from other nodes in the cluster. + * + * Cluster nodes are referred to by their nodeids. nodeids are + * simply 32 bit numbers to the locking module - if they need to + * be expanded for the cluster infrastructure then that is its + * responsibility. It is this layer's + * responsibility to resolve these into IP address or + * whatever it needs for inter-node communication. 
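+ *
+ * The nodeid <-> address mapping itself is supplied from user space:
+ * dlm_controld writes each node's id and address into configfs, and this
+ * layer looks them up through dlm_nodeid_to_addr() and dlm_addr_to_nodeid()
+ * (see nodeid_to_addr() below).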
+ * + * The comms level is two kernel threads that deal mainly with + * the receiving of messages from other nodes and passing them + * up to the mid-level comms layer (which understands the + * message format) for execution by the locking core, and + * a send thread which does all the setting up of connections + * to remote nodes and the sending of data. Threads are not allowed + * to send their own data because it may cause them to wait in times + * of high load. Also, this way, the sending thread can collect together + * messages bound for one node and send them in one block. + * + * lowcomms will choose to use either TCP or SCTP as its transport layer + * depending on the configuration variable 'protocol'. This should be set + * to 0 (default) for TCP or 1 for SCTP. It should be configured using a + * cluster-wide mechanism as it must be the same on all nodes of the cluster + * for the DLM to function. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "config.h" + +#define NEEDED_RMEM (4*1024*1024) +#define CONN_HASH_SIZE 32 + +/* Number of messages to send before rescheduling */ +#define MAX_SEND_MSG_COUNT 25 + +struct cbuf { + unsigned int base; + unsigned int len; + unsigned int mask; +}; + +static void cbuf_add(struct cbuf *cb, int n) +{ + cb->len += n; +} + +static int cbuf_data(struct cbuf *cb) +{ + return ((cb->base + cb->len) & cb->mask); +} + +static void cbuf_init(struct cbuf *cb, int size) +{ + cb->base = cb->len = 0; + cb->mask = size-1; +} + +static void cbuf_eat(struct cbuf *cb, int n) +{ + cb->len -= n; + cb->base += n; + cb->base &= cb->mask; +} + +static bool cbuf_empty(struct cbuf *cb) +{ + return cb->len == 0; +} + +struct connection { + struct socket *sock; /* NULL if not connected */ + uint32_t nodeid; /* So we know who we are in the list */ + struct mutex sock_mutex; + unsigned long flags; +#define CF_READ_PENDING 1 +#define CF_WRITE_PENDING 2 +#define CF_CONNECT_PENDING 3 +#define CF_INIT_PENDING 4 +#define CF_IS_OTHERCON 5 +#define CF_CLOSE 6 +#define CF_APP_LIMITED 7 + struct list_head writequeue; /* List of outgoing writequeue_entries */ + spinlock_t writequeue_lock; + int (*rx_action) (struct connection *); /* What to do when active */ + void (*connect_action) (struct connection *); /* What to do to connect */ + struct page *rx_page; + struct cbuf cb; + int retries; +#define MAX_CONNECT_RETRIES 3 + int sctp_assoc; + struct hlist_node list; + struct connection *othercon; + struct work_struct rwork; /* Receive workqueue */ + struct work_struct swork; /* Send workqueue */ +}; +#define sock2con(x) ((struct connection *)(x)->sk_user_data) + +/* An entry waiting to be sent */ +struct writequeue_entry { + struct list_head list; + struct page *page; + int offset; + int len; + int end; + int users; + struct connection *con; +}; + +static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; +static int dlm_local_count; + +/* Work queues */ +static struct workqueue_struct *recv_workqueue; +static struct workqueue_struct *send_workqueue; + +static struct hlist_head connection_hash[CONN_HASH_SIZE]; +static DEFINE_MUTEX(connections_lock); +static struct kmem_cache *con_cache; + +static void process_recv_sockets(struct work_struct *work); +static void process_send_sockets(struct work_struct *work); + + +/* This is deliberately very simple because most clusters have simple + sequential nodeids, so we should be able to go 
straight to a connection + struct in the array */ +static inline int nodeid_hash(int nodeid) +{ + return nodeid & (CONN_HASH_SIZE-1); +} + +static struct connection *__find_con(int nodeid) +{ + int r; + struct hlist_node *h; + struct connection *con; + + r = nodeid_hash(nodeid); + + hlist_for_each_entry(con, h, &connection_hash[r], list) { + if (con->nodeid == nodeid) + return con; + } + return NULL; +} + +/* + * If 'allocation' is zero then we don't attempt to create a new + * connection structure for this node. + */ +static struct connection *__nodeid2con(int nodeid, gfp_t alloc) +{ + struct connection *con = NULL; + int r; + + con = __find_con(nodeid); + if (con || !alloc) + return con; + + con = kmem_cache_zalloc(con_cache, alloc); + if (!con) + return NULL; + + r = nodeid_hash(nodeid); + hlist_add_head(&con->list, &connection_hash[r]); + + con->nodeid = nodeid; + mutex_init(&con->sock_mutex); + INIT_LIST_HEAD(&con->writequeue); + spin_lock_init(&con->writequeue_lock); + INIT_WORK(&con->swork, process_send_sockets); + INIT_WORK(&con->rwork, process_recv_sockets); + + /* Setup action pointers for child sockets */ + if (con->nodeid) { + struct connection *zerocon = __find_con(0); + + con->connect_action = zerocon->connect_action; + if (!con->rx_action) + con->rx_action = zerocon->rx_action; + } + + return con; +} + +/* Loop round all connections */ +static void foreach_conn(void (*conn_func)(struct connection *c)) +{ + int i; + struct hlist_node *h, *n; + struct connection *con; + + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){ + conn_func(con); + } + } +} + +static struct connection *nodeid2con(int nodeid, gfp_t allocation) +{ + struct connection *con; + + mutex_lock(&connections_lock); + con = __nodeid2con(nodeid, allocation); + mutex_unlock(&connections_lock); + + return con; +} + +/* This is a bit drastic, but only called when things go wrong */ +static struct connection *assoc2con(int assoc_id) +{ + int i; + struct hlist_node *h; + struct connection *con; + + mutex_lock(&connections_lock); + + for (i = 0 ; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry(con, h, &connection_hash[i], list) { + if (con->sctp_assoc == assoc_id) { + mutex_unlock(&connections_lock); + return con; + } + } + } + mutex_unlock(&connections_lock); + return NULL; +} + +static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) +{ + struct sockaddr_storage addr; + int error; + + if (!dlm_local_count) + return -1; + + error = dlm_nodeid_to_addr(nodeid, &addr); + if (error) + return error; + + if (dlm_local_addr[0]->ss_family == AF_INET) { + struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; + struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr; + ret4->sin_addr.s_addr = in4->sin_addr.s_addr; + } else { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; + ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr); + } + + return 0; +} + +/* Data available on socket or listen socket received a connect */ +static void lowcomms_data_ready(struct sock *sk, int count_unused) +{ + struct connection *con = sock2con(sk); + if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); +} + +static void lowcomms_write_space(struct sock *sk) +{ + struct connection *con = sock2con(sk); + + if (!con) + return; + + clear_bit(SOCK_NOSPACE, &con->sock->flags); + + if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { + 
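+		/* send_to_sock() set CF_APP_LIMITED and bumped sk_write_pending
+		   when a send hit EAGAIN while the socket reported no space;
+		   space is back, so undo both before queueing more work. */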
con->sock->sk->sk_write_pending--; + clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags); + } + + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); +} + +static inline void lowcomms_connect_sock(struct connection *con) +{ + if (test_bit(CF_CLOSE, &con->flags)) + return; + if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); +} + +static void lowcomms_state_change(struct sock *sk) +{ + if (sk->sk_state == TCP_ESTABLISHED) + lowcomms_write_space(sk); +} + +int dlm_lowcomms_connect_node(int nodeid) +{ + struct connection *con; + + /* with sctp there's no connecting without sending */ + if (dlm_config.ci_protocol != 0) + return 0; + + if (nodeid == dlm_our_nodeid()) + return 0; + + con = nodeid2con(nodeid, GFP_NOFS); + if (!con) + return -ENOMEM; + lowcomms_connect_sock(con); + return 0; +} + +/* Make a socket active */ +static int add_sock(struct socket *sock, struct connection *con) +{ + con->sock = sock; + + /* Install a data_ready callback */ + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->sock->sk->sk_write_space = lowcomms_write_space; + con->sock->sk->sk_state_change = lowcomms_state_change; + con->sock->sk->sk_user_data = con; + con->sock->sk->sk_allocation = GFP_NOFS; + return 0; +} + +/* Add the port number to an IPv6 or 4 sockaddr and return the address + length */ +static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, + int *addr_len) +{ + saddr->ss_family = dlm_local_addr[0]->ss_family; + if (saddr->ss_family == AF_INET) { + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; + in4_addr->sin_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in); + memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); + } else { + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; + in6_addr->sin6_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in6); + } + memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); +} + +/* Close a remote connection and tidy up */ +static void close_connection(struct connection *con, bool and_other) +{ + mutex_lock(&con->sock_mutex); + + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + if (con->othercon && and_other) { + /* Will only re-enter once. 
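+	   The nested call passes and_other = false, so it stops there.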
*/ + close_connection(con->othercon, false); + } + if (con->rx_page) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + + con->retries = 0; + mutex_unlock(&con->sock_mutex); +} + +/* We only send shutdown messages to nodes that are not part of the cluster */ +static void sctp_send_shutdown(sctp_assoc_t associd) +{ + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct msghdr outmessage; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + int ret; + struct connection *con; + + con = nodeid2con(0,0); + BUG_ON(con == NULL); + + outmessage.msg_name = NULL; + outmessage.msg_namelen = 0; + outmessage.msg_control = outcmsg; + outmessage.msg_controllen = sizeof(outcmsg); + outmessage.msg_flags = MSG_EOR; + + cmsg = CMSG_FIRSTHDR(&outmessage); + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + outmessage.msg_controllen = cmsg->cmsg_len; + sinfo = CMSG_DATA(cmsg); + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); + + sinfo->sinfo_flags |= MSG_EOF; + sinfo->sinfo_assoc_id = associd; + + ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0); + + if (ret != 0) + log_print("send EOF to node failed: %d", ret); +} + +static void sctp_init_failed_foreach(struct connection *con) +{ + con->sctp_assoc = 0; + if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); + } +} + +/* INIT failed but we don't know which node... + restart INIT on all pending nodes */ +static void sctp_init_failed(void) +{ + mutex_lock(&connections_lock); + + foreach_conn(sctp_init_failed_foreach); + + mutex_unlock(&connections_lock); +} + +/* Something happened to an association */ +static void process_sctp_notification(struct connection *con, + struct msghdr *msg, char *buf) +{ + union sctp_notification *sn = (union sctp_notification *)buf; + + if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { + switch (sn->sn_assoc_change.sac_state) { + + case SCTP_COMM_UP: + case SCTP_RESTART: + { + /* Check that the new node is in the lockspace */ + struct sctp_prim prim; + int nodeid; + int prim_len, ret; + int addr_len; + struct connection *new_con; + sctp_peeloff_arg_t parg; + int parglen = sizeof(parg); + int err; + + /* + * We get this before any data for an association. + * We verify that the node is in the cluster and + * then peel off a socket for it. 
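+			 * The association id arrives in the SCTP_ASSOC_CHANGE
+			 * notification; the peer's primary address is then fetched
+			 * with kernel_getsockopt(SCTP_PRIMARY_ADDR) and mapped back
+			 * to a nodeid with dlm_addr_to_nodeid() before peeling.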
+ */ + if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { + log_print("COMM_UP for invalid assoc ID %d", + (int)sn->sn_assoc_change.sac_assoc_id); + sctp_init_failed(); + return; + } + memset(&prim, 0, sizeof(struct sctp_prim)); + prim_len = sizeof(struct sctp_prim); + prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id; + + ret = kernel_getsockopt(con->sock, + IPPROTO_SCTP, + SCTP_PRIMARY_ADDR, + (char*)&prim, + &prim_len); + if (ret < 0) { + log_print("getsockopt/sctp_primary_addr on " + "new assoc %d failed : %d", + (int)sn->sn_assoc_change.sac_assoc_id, + ret); + + /* Retry INIT later */ + new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id); + if (new_con) + clear_bit(CF_CONNECT_PENDING, &con->flags); + return; + } + make_sockaddr(&prim.ssp_addr, 0, &addr_len); + if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + int i; + unsigned char *b=(unsigned char *)&prim.ssp_addr; + log_print("reject connect from unknown addr"); + for (i=0; isn_assoc_change.sac_assoc_id; + ret = kernel_getsockopt(con->sock, IPPROTO_SCTP, + SCTP_SOCKOPT_PEELOFF, + (void *)&parg, &parglen); + if (ret < 0) { + log_print("Can't peel off a socket for " + "connection %d to node %d: err=%d", + parg.associd, nodeid, ret); + return; + } + new_con->sock = sockfd_lookup(parg.sd, &err); + if (!new_con->sock) { + log_print("sockfd_lookup error %d", err); + return; + } + add_sock(new_con->sock, new_con); + sockfd_put(new_con->sock); + + log_print("connecting to %d sctp association %d", + nodeid, (int)sn->sn_assoc_change.sac_assoc_id); + + /* Send any pending writes */ + clear_bit(CF_CONNECT_PENDING, &new_con->flags); + clear_bit(CF_INIT_PENDING, &con->flags); + if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { + queue_work(send_workqueue, &new_con->swork); + } + if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags)) + queue_work(recv_workqueue, &new_con->rwork); + } + break; + + case SCTP_COMM_LOST: + case SCTP_SHUTDOWN_COMP: + { + con = assoc2con(sn->sn_assoc_change.sac_assoc_id); + if (con) { + con->sctp_assoc = 0; + } + } + break; + + /* We don't know which INIT failed, so clear the PENDING flags + * on them all. if assoc_id is zero then it will then try + * again */ + + case SCTP_CANT_STR_ASSOC: + { + log_print("Can't start SCTP association - retrying"); + sctp_init_failed(); + } + break; + + default: + log_print("unexpected SCTP assoc change id=%d state=%d", + (int)sn->sn_assoc_change.sac_assoc_id, + sn->sn_assoc_change.sac_state); + } + } +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con) +{ + int ret = 0; + struct msghdr msg = {}; + struct kvec iov[2]; + unsigned len; + int r; + int call_again_soon = 0; + int nvec; + char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + + mutex_lock(&con->sock_mutex); + + if (con->sock == NULL) { + ret = -EAGAIN; + goto out_close; + } + + if (con->rx_page == NULL) { + /* + * This doesn't need to be atomic, but I think it should + * improve performance if it is. + */ + con->rx_page = alloc_page(GFP_ATOMIC); + if (con->rx_page == NULL) + goto out_resched; + cbuf_init(&con->cb, PAGE_CACHE_SIZE); + } + + /* Only SCTP needs these really */ + memset(&incmsg, 0, sizeof(incmsg)); + msg.msg_control = incmsg; + msg.msg_controllen = sizeof(incmsg); + + /* + * iov[0] is the bit of the circular buffer between the current end + * point (cb.base + cb.len) and the end of the buffer. 
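+	 *
+	 * For example, with a 4096-byte page (mask 4095): base 3000, len 500
+	 * puts cbuf_data() at 3500, so iov[0] covers 3500..4095 and iov[1]
+	 * (set up below) covers 0..2999; with base 3000, len 1500, cbuf_data()
+	 * wraps to 404 and the single segment spans offsets 404..2999.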
+ */ + iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); + iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); + iov[1].iov_len = 0; + nvec = 1; + + /* + * iov[1] is the bit of the circular buffer between the start of the + * buffer and the start of the currently used section (cb.base) + */ + if (cbuf_data(&con->cb) >= con->cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb); + iov[1].iov_len = con->cb.base; + iov[1].iov_base = page_address(con->rx_page); + nvec = 2; + } + len = iov[0].iov_len + iov[1].iov_len; + + r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len, + MSG_DONTWAIT | MSG_NOSIGNAL); + if (ret <= 0) + goto out_close; + + /* Process SCTP notifications */ + if (msg.msg_flags & MSG_NOTIFICATION) { + msg.msg_control = incmsg; + msg.msg_controllen = sizeof(incmsg); + + process_sctp_notification(con, &msg, + page_address(con->rx_page) + con->cb.base); + mutex_unlock(&con->sock_mutex); + return 0; + } + BUG_ON(con->nodeid == 0); + + if (ret == len) + call_again_soon = 1; + cbuf_add(&con->cb, ret); + ret = dlm_process_incoming_buffer(con->nodeid, + page_address(con->rx_page), + con->cb.base, con->cb.len, + PAGE_CACHE_SIZE); + if (ret == -EBADMSG) { + log_print("lowcomms: addr=%p, base=%u, len=%u, " + "iov_len=%u, iov_base[0]=%p, read=%d", + page_address(con->rx_page), con->cb.base, con->cb.len, + len, iov[0].iov_base, r); + } + if (ret < 0) + goto out_close; + cbuf_eat(&con->cb, ret); + + if (cbuf_empty(&con->cb) && !call_again_soon) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + + if (call_again_soon) + goto out_resched; + mutex_unlock(&con->sock_mutex); + return 0; + +out_resched: + if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); + mutex_unlock(&con->sock_mutex); + return -EAGAIN; + +out_close: + mutex_unlock(&con->sock_mutex); + if (ret != -EAGAIN) { + close_connection(con, false); + /* Reconnect when there is something to send */ + } + /* Don't return success if we really got EOF */ + if (ret == 0) + ret = -EAGAIN; + + return ret; +} + +/* Listening socket is busy, accept a connection */ +static int tcp_accept_from_sock(struct connection *con) +{ + int result; + struct sockaddr_storage peeraddr; + struct socket *newsock; + int len; + int nodeid; + struct connection *newcon; + struct connection *addcon; + + memset(&peeraddr, 0, sizeof(peeraddr)); + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, + IPPROTO_TCP, &newsock); + if (result < 0) + return -ENOMEM; + + mutex_lock_nested(&con->sock_mutex, 0); + + result = -ENOTCONN; + if (con->sock == NULL) + goto accept_err; + + newsock->type = con->sock->type; + newsock->ops = con->sock->ops; + + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); + if (result < 0) + goto accept_err; + + /* Get the connected socket's peer */ + memset(&peeraddr, 0, sizeof(peeraddr)); + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, + &len, 2)) { + result = -ECONNABORTED; + goto accept_err; + } + + /* Get the new node's NODEID */ + make_sockaddr(&peeraddr, 0, &len); + if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { + log_print("connect from non cluster node"); + sock_release(newsock); + mutex_unlock(&con->sock_mutex); + return -1; + } + + log_print("got connection from %d", nodeid); + + /* Check to see if we already have a connection to this node. This + * could happen if the two nodes initiate a connection at roughly + * the same time and the connections cross on the wire. 
+ * In this case we store the incoming one in "othercon" + */ + newcon = nodeid2con(nodeid, GFP_NOFS); + if (!newcon) { + result = -ENOMEM; + goto accept_err; + } + mutex_lock_nested(&newcon->sock_mutex, 1); + if (newcon->sock) { + struct connection *othercon = newcon->othercon; + + if (!othercon) { + othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + if (!othercon) { + log_print("failed to allocate incoming socket"); + mutex_unlock(&newcon->sock_mutex); + result = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; + othercon->rx_action = receive_from_sock; + mutex_init(&othercon->sock_mutex); + INIT_WORK(&othercon->swork, process_send_sockets); + INIT_WORK(&othercon->rwork, process_recv_sockets); + set_bit(CF_IS_OTHERCON, &othercon->flags); + } + if (!othercon->sock) { + newcon->othercon = othercon; + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + addcon = othercon; + } + else { + printk("Extra connection from node %d attempted\n", nodeid); + result = -EAGAIN; + mutex_unlock(&newcon->sock_mutex); + goto accept_err; + } + } + else { + newsock->sk->sk_user_data = newcon; + newcon->rx_action = receive_from_sock; + add_sock(newsock, newcon); + addcon = newcon; + } + + mutex_unlock(&newcon->sock_mutex); + + /* + * Add it to the active queue in case we got data + * between processing the accept adding the socket + * to the read_sockets list + */ + if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) + queue_work(recv_workqueue, &addcon->rwork); + mutex_unlock(&con->sock_mutex); + + return 0; + +accept_err: + mutex_unlock(&con->sock_mutex); + sock_release(newsock); + + if (result != -EAGAIN) + log_print("error accepting connection from node: %d", result); + return result; +} + +static void free_entry(struct writequeue_entry *e) +{ + __free_page(e->page); + kfree(e); +} + +/* Initiate an SCTP association. + This is a special case of send_to_sock() in that we don't yet have a + peeled-off socket for this association, so we use the listening socket + and add the primary IP address of the remote node. 
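+   Only the first writequeue entry is sent here, tagged with our own nodeid
+   in sinfo_ppid; once SCTP_COMM_UP is processed above and the socket is
+   peeled off, send_to_sock() drains the rest of the queue as usual.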
+ */ +static void sctp_init_assoc(struct connection *con) +{ + struct sockaddr_storage rem_addr; + char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct msghdr outmessage; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + struct connection *base_con; + struct writequeue_entry *e; + int len, offset; + int ret; + int addrlen; + struct kvec iov[1]; + + if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) + return; + + if (con->retries++ > MAX_CONNECT_RETRIES) + return; + + if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) { + log_print("no address for nodeid %d", con->nodeid); + return; + } + base_con = nodeid2con(0, 0); + BUG_ON(base_con == NULL); + + make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen); + + outmessage.msg_name = &rem_addr; + outmessage.msg_namelen = addrlen; + outmessage.msg_control = outcmsg; + outmessage.msg_controllen = sizeof(outcmsg); + outmessage.msg_flags = MSG_EOR; + + spin_lock(&con->writequeue_lock); + + if (list_empty(&con->writequeue)) { + spin_unlock(&con->writequeue_lock); + log_print("writequeue empty for nodeid %d", con->nodeid); + return; + } + + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); + len = e->len; + offset = e->offset; + spin_unlock(&con->writequeue_lock); + + /* Send the first block off the write queue */ + iov[0].iov_base = page_address(e->page)+offset; + iov[0].iov_len = len; + + cmsg = CMSG_FIRSTHDR(&outmessage); + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + sinfo = CMSG_DATA(cmsg); + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); + sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); + outmessage.msg_controllen = cmsg->cmsg_len; + + ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); + if (ret < 0) { + log_print("Send first packet to node %d failed: %d", + con->nodeid, ret); + + /* Try again later */ + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_INIT_PENDING, &con->flags); + } + else { + spin_lock(&con->writequeue_lock); + e->offset += ret; + e->len -= ret; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + } + spin_unlock(&con->writequeue_lock); + } +} + +/* Connect a new socket to its peer */ +static void tcp_connect_to_sock(struct connection *con) +{ + int result = -EHOSTUNREACH; + struct sockaddr_storage saddr, src_addr; + int addr_len; + struct socket *sock = NULL; + int one = 1; + + if (con->nodeid == 0) { + log_print("attempt to connect sock 0 foiled"); + return; + } + + mutex_lock(&con->sock_mutex); + if (con->retries++ > MAX_CONNECT_RETRIES) + goto out; + + /* Some odd races can cause double-connects, ignore them */ + if (con->sock) { + result = 0; + goto out; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (result < 0) + goto out_err; + + memset(&saddr, 0, sizeof(saddr)); + if (dlm_nodeid_to_addr(con->nodeid, &saddr)) + goto out_err; + + sock->sk->sk_user_data = con; + con->rx_action = receive_from_sock; + con->connect_action = tcp_connect_to_sock; + add_sock(sock, con); + + /* Bind to our cluster-known address connecting to avoid + routing problems */ + memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + make_sockaddr(&src_addr, 0, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, + addr_len); + if (result < 0) { + log_print("could not bind for connect: %d", result); + /* This *may* not 
indicate a critical error */ + } + + make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); + + log_print("connecting to %d", con->nodeid); + + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + + result = + sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, + O_NONBLOCK); + if (result == -EINPROGRESS) + result = 0; + if (result == 0) + goto out; + +out_err: + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } else if (sock) { + sock_release(sock); + } + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. + */ + if (result != -EHOSTUNREACH && result != -ENETUNREACH && + result != -ENETDOWN && result != -EINVAL + && result != -EPROTONOSUPPORT) { + lowcomms_connect_sock(con); + result = 0; + } +out: + mutex_unlock(&con->sock_mutex); + return; +} + +static struct socket *tcp_create_listen_sock(struct connection *con, + struct sockaddr_storage *saddr) +{ + struct socket *sock = NULL; + int result = 0; + int one = 1; + int addr_len; + + if (dlm_local_addr[0]->ss_family == AF_INET) + addr_len = sizeof(struct sockaddr_in); + else + addr_len = sizeof(struct sockaddr_in6); + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (result < 0) { + log_print("Can't create listening comms socket"); + goto create_out; + } + + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + + result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&one, sizeof(one)); + + if (result < 0) { + log_print("Failed to set SO_REUSEADDR on socket: %d", result); + } + sock->sk->sk_user_data = con; + con->rx_action = tcp_accept_from_sock; + con->connect_action = tcp_connect_to_sock; + con->sock = sock; + + /* Bind to our port */ + make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); + if (result < 0) { + log_print("Can't bind to port %d", dlm_config.ci_tcp_port); + sock_release(sock); + sock = NULL; + con->sock = NULL; + goto create_out; + } + result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&one, sizeof(one)); + if (result < 0) { + log_print("Set keepalive failed: %d", result); + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + log_print("Can't listen on port %d", dlm_config.ci_tcp_port); + sock_release(sock); + sock = NULL; + goto create_out; + } + +create_out: + return sock; +} + +/* Get local addresses */ +static void init_local(void) +{ + struct sockaddr_storage sas, *addr; + int i; + + dlm_local_count = 0; + for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) { + if (dlm_our_addr(&sas, i)) + break; + + addr = kmalloc(sizeof(*addr), GFP_NOFS); + if (!addr) + break; + memcpy(addr, &sas, sizeof(*addr)); + dlm_local_addr[dlm_local_count++] = addr; + } +} + +/* Bind to an IP address. 
SCTP allows multiple address so it can do + multi-homing */ +static int add_sctp_bind_addr(struct connection *sctp_con, + struct sockaddr_storage *addr, + int addr_len, int num) +{ + int result = 0; + + if (num == 1) + result = kernel_bind(sctp_con->sock, + (struct sockaddr *) addr, + addr_len); + else + result = kernel_setsockopt(sctp_con->sock, SOL_SCTP, + SCTP_SOCKOPT_BINDX_ADD, + (char *)addr, addr_len); + + if (result < 0) + log_print("Can't bind to port %d addr number %d", + dlm_config.ci_tcp_port, num); + + return result; +} + +/* Initialise SCTP socket and bind to all interfaces */ +static int sctp_listen_for_all(void) +{ + struct socket *sock = NULL; + struct sockaddr_storage localaddr; + struct sctp_event_subscribe subscribe; + int result = -EINVAL, num = 1, i, addr_len; + struct connection *con = nodeid2con(0, GFP_NOFS); + int bufsize = NEEDED_RMEM; + + if (!con) + return -ENOMEM; + + log_print("Using SCTP for communications"); + + result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET, + IPPROTO_SCTP, &sock); + if (result < 0) { + log_print("Can't create comms socket, check SCTP is loaded"); + goto out; + } + + /* Listen for events */ + memset(&subscribe, 0, sizeof(subscribe)); + subscribe.sctp_data_io_event = 1; + subscribe.sctp_association_event = 1; + subscribe.sctp_send_failure_event = 1; + subscribe.sctp_shutdown_event = 1; + subscribe.sctp_partial_delivery_event = 1; + + result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, + (char *)&bufsize, sizeof(bufsize)); + if (result) + log_print("Error increasing buffer space on socket %d", result); + + result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS, + (char *)&subscribe, sizeof(subscribe)); + if (result < 0) { + log_print("Failed to set SCTP_EVENTS on socket: result=%d", + result); + goto create_delsock; + } + + /* Init con struct */ + sock->sk->sk_user_data = con; + con->sock = sock; + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->rx_action = receive_from_sock; + con->connect_action = sctp_init_assoc; + + /* Bind to all interfaces. 
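+   The first local address is attached with kernel_bind(); each further
+   address is added to the same socket with SCTP_SOCKOPT_BINDX_ADD (see
+   add_sctp_bind_addr() above), which is what provides the multi-homing.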
*/ + for (i = 0; i < dlm_local_count; i++) { + memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); + make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len); + + result = add_sctp_bind_addr(con, &localaddr, addr_len, num); + if (result) + goto create_delsock; + ++num; + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + log_print("Can't set socket listening"); + goto create_delsock; + } + + return 0; + +create_delsock: + sock_release(sock); + con->sock = NULL; +out: + return result; +} + +static int tcp_listen_for_all(void) +{ + struct socket *sock = NULL; + struct connection *con = nodeid2con(0, GFP_NOFS); + int result = -EINVAL; + + if (!con) + return -ENOMEM; + + /* We don't support multi-homed hosts */ + if (dlm_local_addr[1] != NULL) { + log_print("TCP protocol can't handle multi-homed hosts, " + "try SCTP"); + return -EINVAL; + } + + log_print("Using TCP for communications"); + + sock = tcp_create_listen_sock(con, dlm_local_addr[0]); + if (sock) { + add_sock(sock, con); + result = 0; + } + else { + result = -EADDRINUSE; + } + + return result; +} + + + +static struct writequeue_entry *new_writequeue_entry(struct connection *con, + gfp_t allocation) +{ + struct writequeue_entry *entry; + + entry = kmalloc(sizeof(struct writequeue_entry), allocation); + if (!entry) + return NULL; + + entry->page = alloc_page(allocation); + if (!entry->page) { + kfree(entry); + return NULL; + } + + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->users = 0; + entry->con = con; + + return entry; +} + +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +{ + struct connection *con; + struct writequeue_entry *e; + int offset = 0; + int users = 0; + + con = nodeid2con(nodeid, allocation); + if (!con) + return NULL; + + spin_lock(&con->writequeue_lock); + e = list_entry(con->writequeue.prev, struct writequeue_entry, list); + if ((&e->list == &con->writequeue) || + (PAGE_CACHE_SIZE - e->end < len)) { + e = NULL; + } else { + offset = e->end; + e->end += len; + users = e->users++; + } + spin_unlock(&con->writequeue_lock); + + if (e) { + got_one: + *ppc = page_address(e->page) + offset; + return e; + } + + e = new_writequeue_entry(con, allocation); + if (e) { + spin_lock(&con->writequeue_lock); + offset = e->end; + e->end += len; + users = e->users++; + list_add_tail(&e->list, &con->writequeue); + spin_unlock(&con->writequeue_lock); + goto got_one; + } + return NULL; +} + +void dlm_lowcomms_commit_buffer(void *mh) +{ + struct writequeue_entry *e = (struct writequeue_entry *)mh; + struct connection *con = e->con; + int users; + + spin_lock(&con->writequeue_lock); + users = --e->users; + if (users) + goto out; + e->len = e->end - e->offset; + spin_unlock(&con->writequeue_lock); + + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { + queue_work(send_workqueue, &con->swork); + } + return; + +out: + spin_unlock(&con->writequeue_lock); + return; +} + +/* Send a message */ +static void send_to_sock(struct connection *con) +{ + int ret = 0; + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + struct writequeue_entry *e; + int len, offset; + int count = 0; + + mutex_lock(&con->sock_mutex); + if (con->sock == NULL) + goto out_connect; + + spin_lock(&con->writequeue_lock); + for (;;) { + e = list_entry(con->writequeue.next, struct writequeue_entry, + list); + if ((struct list_head *) e == &con->writequeue) + break; + + len = e->len; + offset = e->offset; + BUG_ON(len == 0 && e->users == 0); + spin_unlock(&con->writequeue_lock); + + ret = 0; + if 
(len) { + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) { + if (ret == -EAGAIN && + test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. + */ + set_bit(SOCK_NOSPACE, &con->sock->flags); + con->sock->sk->sk_write_pending++; + } + cond_resched(); + goto out; + } + if (ret <= 0) + goto send_error; + } + + /* Don't starve people filling buffers */ + if (++count >= MAX_SEND_MSG_COUNT) { + cond_resched(); + count = 0; + } + + spin_lock(&con->writequeue_lock); + e->offset += ret; + e->len -= ret; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + continue; + } + } + spin_unlock(&con->writequeue_lock); +out: + mutex_unlock(&con->sock_mutex); + return; + +send_error: + mutex_unlock(&con->sock_mutex); + close_connection(con, false); + lowcomms_connect_sock(con); + return; + +out_connect: + mutex_unlock(&con->sock_mutex); + if (!test_bit(CF_INIT_PENDING, &con->flags)) + lowcomms_connect_sock(con); + return; +} + +static void clean_one_writequeue(struct connection *con) +{ + struct writequeue_entry *e, *safe; + + spin_lock(&con->writequeue_lock); + list_for_each_entry_safe(e, safe, &con->writequeue, list) { + list_del(&e->list); + free_entry(e); + } + spin_unlock(&con->writequeue_lock); +} + +/* Called from recovery when it knows that a node has + left the cluster */ +int dlm_lowcomms_close(int nodeid) +{ + struct connection *con; + + log_print("closing connection to node %d", nodeid); + con = nodeid2con(nodeid, 0); + if (con) { + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_WRITE_PENDING, &con->flags); + set_bit(CF_CLOSE, &con->flags); + if (cancel_work_sync(&con->swork)) + log_print("canceled swork for node %d", nodeid); + if (cancel_work_sync(&con->rwork)) + log_print("canceled rwork for node %d", nodeid); + clean_one_writequeue(con); + close_connection(con, true); + } + return 0; +} + +/* Receive workqueue function */ +static void process_recv_sockets(struct work_struct *work) +{ + struct connection *con = container_of(work, struct connection, rwork); + int err; + + clear_bit(CF_READ_PENDING, &con->flags); + do { + err = con->rx_action(con); + } while (!err); +} + +/* Send workqueue function */ +static void process_send_sockets(struct work_struct *work) +{ + struct connection *con = container_of(work, struct connection, swork); + + if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + con->connect_action(con); + set_bit(CF_WRITE_PENDING, &con->flags); + } + if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) + send_to_sock(con); +} + + +/* Discard all entries on the write queues */ +static void clean_writequeues(void) +{ + foreach_conn(clean_one_writequeue); +} + +static void work_stop(void) +{ + destroy_workqueue(recv_workqueue); + destroy_workqueue(send_workqueue); +} + +static int work_start(void) +{ + recv_workqueue = alloc_workqueue("dlm_recv", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + if (!recv_workqueue) { + log_print("can't start dlm_recv"); + return -ENOMEM; + } + + send_workqueue = alloc_workqueue("dlm_send", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + if (!send_workqueue) { + log_print("can't start dlm_send"); + destroy_workqueue(recv_workqueue); + return -ENOMEM; + } + + return 0; +} + +static void stop_conn(struct connection *con) +{ + con->flags |= 0x0F; + if (con->sock && con->sock->sk) + con->sock->sk->sk_user_data = NULL; +} + +static void free_conn(struct 
connection *con) +{ + close_connection(con, true); + if (con->othercon) + kmem_cache_free(con_cache, con->othercon); + hlist_del(&con->list); + kmem_cache_free(con_cache, con); +} + +void dlm_lowcomms_stop(void) +{ + /* Set all the flags to prevent any + socket activity. + */ + mutex_lock(&connections_lock); + foreach_conn(stop_conn); + mutex_unlock(&connections_lock); + + work_stop(); + + mutex_lock(&connections_lock); + clean_writequeues(); + + foreach_conn(free_conn); + + mutex_unlock(&connections_lock); + kmem_cache_destroy(con_cache); +} + +int dlm_lowcomms_start(void) +{ + int error = -EINVAL; + struct connection *con; + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&connection_hash[i]); + + init_local(); + if (!dlm_local_count) { + error = -ENOTCONN; + log_print("no local IP address has been set"); + goto out; + } + + error = -ENOMEM; + con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), + __alignof__(struct connection), 0, + NULL); + if (!con_cache) + goto out; + + /* Start listening */ + if (dlm_config.ci_protocol == 0) + error = tcp_listen_for_all(); + else + error = sctp_listen_for_all(); + if (error) + goto fail_unlisten; + + error = work_start(); + if (error) + goto fail_unlisten; + + return 0; + +fail_unlisten: + con = nodeid2con(0,0); + if (con) { + close_connection(con, false); + kmem_cache_free(con_cache, con); + } + kmem_cache_destroy(con_cache); + +out: + return error; +} diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h new file mode 100644 index 00000000..1311e642 --- /dev/null +++ b/fs/dlm/lowcomms.h @@ -0,0 +1,25 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __LOWCOMMS_DOT_H__ +#define __LOWCOMMS_DOT_H__ + +int dlm_lowcomms_start(void); +void dlm_lowcomms_stop(void); +int dlm_lowcomms_close(int nodeid); +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); +void dlm_lowcomms_commit_buffer(void *mh); +int dlm_lowcomms_connect_node(int nodeid); + +#endif /* __LOWCOMMS_DOT_H__ */ + diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h new file mode 100644 index 00000000..cc3e92f3 --- /dev/null +++ b/fs/dlm/lvb_table.h @@ -0,0 +1,18 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __LVB_TABLE_DOT_H__ +#define __LVB_TABLE_DOT_H__ + +extern const int dlm_lvb_operations[8][8]; + +#endif diff --git a/fs/dlm/main.c b/fs/dlm/main.c new file mode 100644 index 00000000..5a59efa0 --- /dev/null +++ b/fs/dlm/main.c @@ -0,0 +1,95 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lock.h" +#include "user.h" +#include "memory.h" +#include "config.h" + +static int __init init_dlm(void) +{ + int error; + + error = dlm_memory_init(); + if (error) + goto out; + + error = dlm_lockspace_init(); + if (error) + goto out_mem; + + error = dlm_config_init(); + if (error) + goto out_lockspace; + + error = dlm_register_debugfs(); + if (error) + goto out_config; + + error = dlm_user_init(); + if (error) + goto out_debug; + + error = dlm_netlink_init(); + if (error) + goto out_user; + + error = dlm_plock_init(); + if (error) + goto out_netlink; + + printk("DLM installed\n"); + + return 0; + + out_netlink: + dlm_netlink_exit(); + out_user: + dlm_user_exit(); + out_debug: + dlm_unregister_debugfs(); + out_config: + dlm_config_exit(); + out_lockspace: + dlm_lockspace_exit(); + out_mem: + dlm_memory_exit(); + out: + return error; +} + +static void __exit exit_dlm(void) +{ + dlm_plock_exit(); + dlm_netlink_exit(); + dlm_user_exit(); + dlm_config_exit(); + dlm_memory_exit(); + dlm_lockspace_exit(); + dlm_unregister_debugfs(); +} + +module_init(init_dlm); +module_exit(exit_dlm); + +MODULE_DESCRIPTION("Distributed Lock Manager"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL_GPL(dlm_new_lockspace); +EXPORT_SYMBOL_GPL(dlm_release_lockspace); +EXPORT_SYMBOL_GPL(dlm_lock); +EXPORT_SYMBOL_GPL(dlm_unlock); + diff --git a/fs/dlm/member.c b/fs/dlm/member.c new file mode 100644 index 00000000..b12532e5 --- /dev/null +++ b/fs/dlm/member.c @@ -0,0 +1,390 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "recoverd.h" +#include "recover.h" +#include "rcom.h" +#include "config.h" +#include "lowcomms.h" + +static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) +{ + struct dlm_member *memb = NULL; + struct list_head *tmp; + struct list_head *newlist = &new->list; + struct list_head *head = &ls->ls_nodes; + + list_for_each(tmp, head) { + memb = list_entry(tmp, struct dlm_member, list); + if (new->nodeid < memb->nodeid) + break; + } + + if (!memb) + list_add_tail(newlist, head); + else { + /* FIXME: can use list macro here */ + newlist->prev = tmp->prev; + newlist->next = tmp; + tmp->prev->next = newlist; + tmp->prev = newlist; + } +} + +static int dlm_add_member(struct dlm_ls *ls, int nodeid) +{ + struct dlm_member *memb; + int w, error; + + memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + if (!memb) + return -ENOMEM; + + w = dlm_node_weight(ls->ls_name, nodeid); + if (w < 0) { + kfree(memb); + return w; + } + + error = dlm_lowcomms_connect_node(nodeid); + if (error < 0) { + kfree(memb); + return error; + } + + memb->nodeid = nodeid; + memb->weight = w; + add_ordered_member(ls, memb); + ls->ls_num_nodes++; + return 0; +} + +static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) +{ + list_move(&memb->list, &ls->ls_nodes_gone); + ls->ls_num_nodes--; +} + +int dlm_is_member(struct dlm_ls *ls, int nodeid) +{ + struct dlm_member *memb; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->nodeid == nodeid) + return 1; + } + return 0; +} + +int dlm_is_removed(struct dlm_ls *ls, int nodeid) +{ + struct dlm_member *memb; + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + if (memb->nodeid == nodeid) + return 1; + } + return 0; +} + +static void clear_memb_list(struct list_head *head) +{ + struct dlm_member *memb; + + while (!list_empty(head)) { + memb = list_entry(head->next, struct dlm_member, list); + list_del(&memb->list); + kfree(memb); + } +} + +void dlm_clear_members(struct dlm_ls *ls) +{ + clear_memb_list(&ls->ls_nodes); + ls->ls_num_nodes = 0; +} + +void dlm_clear_members_gone(struct dlm_ls *ls) +{ + clear_memb_list(&ls->ls_nodes_gone); +} + +static void make_member_array(struct dlm_ls *ls) +{ + struct dlm_member *memb; + int i, w, x = 0, total = 0, all_zero = 0, *array; + + kfree(ls->ls_node_array); + ls->ls_node_array = NULL; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->weight) + total += memb->weight; + } + + /* all nodes revert to weight of 1 if all have weight 0 */ + + if (!total) { + total = ls->ls_num_nodes; + all_zero = 1; + } + + ls->ls_total_weight = total; + + array = kmalloc(sizeof(int) * total, GFP_NOFS); + if (!array) + return; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (!all_zero && !memb->weight) + continue; + + if (all_zero) + w = 1; + else + w = memb->weight; + + DLM_ASSERT(x < total, printk("total %d x %d\n", total, x);); + + for (i = 0; i < w; i++) + array[x++] = memb->nodeid; + } + + ls->ls_node_array = array; +} + +/* send a status request to all members just to establish comms connections */ + +static int ping_members(struct dlm_ls *ls) +{ + struct dlm_member *memb; + int error = 0; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + error = dlm_recovery_stopped(ls); + if (error) + break; + error = dlm_rcom_status(ls, 
memb->nodeid); + if (error) + break; + } + if (error) + log_debug(ls, "ping_members aborted %d last nodeid %d", + error, ls->ls_recover_nodeid); + return error; +} + +int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) +{ + struct dlm_member *memb, *safe; + int i, error, found, pos = 0, neg = 0, low = -1; + + /* previously removed members that we've not finished removing need to + count as a negative change so the "neg" recovery steps will happen */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + log_debug(ls, "prev removed member %d", memb->nodeid); + neg++; + } + + /* move departed members from ls_nodes to ls_nodes_gone */ + + list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { + found = 0; + for (i = 0; i < rv->node_count; i++) { + if (memb->nodeid == rv->nodeids[i]) { + found = 1; + break; + } + } + + if (!found) { + neg++; + dlm_remove_member(ls, memb); + log_debug(ls, "remove member %d", memb->nodeid); + } + } + + /* Add an entry to ls_nodes_gone for members that were removed and + then added again, so that previous state for these nodes will be + cleared during recovery. */ + + for (i = 0; i < rv->new_count; i++) { + if (!dlm_is_member(ls, rv->new[i])) + continue; + log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); + + memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + if (!memb) + return -ENOMEM; + memb->nodeid = rv->new[i]; + list_add_tail(&memb->list, &ls->ls_nodes_gone); + neg++; + } + + /* add new members to ls_nodes */ + + for (i = 0; i < rv->node_count; i++) { + if (dlm_is_member(ls, rv->nodeids[i])) + continue; + dlm_add_member(ls, rv->nodeids[i]); + pos++; + log_debug(ls, "add member %d", rv->nodeids[i]); + } + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (low == -1 || memb->nodeid < low) + low = memb->nodeid; + } + ls->ls_low_nodeid = low; + + make_member_array(ls); + dlm_set_recover_status(ls, DLM_RS_NODES); + *neg_out = neg; + + error = ping_members(ls); + if (!error || error == -EPROTO) { + /* new_lockspace() may be waiting to know if the config + is good or bad */ + ls->ls_members_result = error; + complete(&ls->ls_members_done); + } + if (error) + goto out; + + error = dlm_recover_members_wait(ls); + out: + log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error); + return error; +} + +/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before + dlm_ls_start() is called on any of them to start the new recovery. */ + +int dlm_ls_stop(struct dlm_ls *ls) +{ + int new; + + /* + * Prevent dlm_recv from being in the middle of something when we do + * the stop. This includes ensuring dlm_recv isn't processing a + * recovery message (rcom), while dlm_recoverd is aborting and + * resetting things from an in-progress recovery. i.e. we want + * dlm_recoverd to abort its recovery without worrying about dlm_recv + * processing an rcom at the same time. Stopping dlm_recv also makes + * it easy for dlm_receive_message() to check locking stopped and add a + * message to the requestqueue without races. + */ + + down_write(&ls->ls_recv_active); + + /* + * Abort any recovery that's in progress (see RECOVERY_STOP, + * dlm_recovery_stopped()) and tell any other threads running in the + * dlm to quit any processing (see RUNNING, dlm_locking_stopped()). 
+ */ + + spin_lock(&ls->ls_recover_lock); + set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags); + ls->ls_recover_seq++; + spin_unlock(&ls->ls_recover_lock); + + /* + * Let dlm_recv run again, now any normal messages will be saved on the + * requestqueue for later. + */ + + up_write(&ls->ls_recv_active); + + /* + * This in_recovery lock does two things: + * 1) Keeps this function from returning until all threads are out + * of locking routines and locking is truly stopped. + * 2) Keeps any new requests from being processed until it's unlocked + * when recovery is complete. + */ + + if (new) + down_write(&ls->ls_in_recovery); + + /* + * The recoverd suspend/resume makes sure that dlm_recoverd (if + * running) has noticed RECOVERY_STOP above and quit processing the + * previous recovery. + */ + + dlm_recoverd_suspend(ls); + ls->ls_recover_status = 0; + dlm_recoverd_resume(ls); + + if (!ls->ls_recover_begin) + ls->ls_recover_begin = jiffies; + return 0; +} + +int dlm_ls_start(struct dlm_ls *ls) +{ + struct dlm_recover *rv = NULL, *rv_old; + int *ids = NULL, *new = NULL; + int error, ids_count = 0, new_count = 0; + + rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); + if (!rv) + return -ENOMEM; + + error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count, + &new, &new_count); + if (error < 0) + goto fail; + + spin_lock(&ls->ls_recover_lock); + + /* the lockspace needs to be stopped before it can be started */ + + if (!dlm_locking_stopped(ls)) { + spin_unlock(&ls->ls_recover_lock); + log_error(ls, "start ignored: lockspace running"); + error = -EINVAL; + goto fail; + } + + rv->nodeids = ids; + rv->node_count = ids_count; + rv->new = new; + rv->new_count = new_count; + rv->seq = ++ls->ls_recover_seq; + rv_old = ls->ls_recover_args; + ls->ls_recover_args = rv; + spin_unlock(&ls->ls_recover_lock); + + if (rv_old) { + log_error(ls, "unused recovery %llx %d", + (unsigned long long)rv_old->seq, rv_old->node_count); + kfree(rv_old->nodeids); + kfree(rv_old->new); + kfree(rv_old); + } + + dlm_recoverd_kick(ls); + return 0; + + fail: + kfree(rv); + kfree(ids); + kfree(new); + return error; +} + diff --git a/fs/dlm/member.h b/fs/dlm/member.h new file mode 100644 index 00000000..7a26fca1 --- /dev/null +++ b/fs/dlm/member.h @@ -0,0 +1,25 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMBER_DOT_H__ +#define __MEMBER_DOT_H__ + +int dlm_ls_stop(struct dlm_ls *ls); +int dlm_ls_start(struct dlm_ls *ls); +void dlm_clear_members(struct dlm_ls *ls); +void dlm_clear_members_gone(struct dlm_ls *ls); +int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); +int dlm_is_removed(struct dlm_ls *ls, int nodeid); +int dlm_is_member(struct dlm_ls *ls, int nodeid); + +#endif /* __MEMBER_DOT_H__ */ + diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c new file mode 100644 index 00000000..8e0d00db --- /dev/null +++ b/fs/dlm/memory.c @@ -0,0 +1,92 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "config.h" +#include "memory.h" + +static struct kmem_cache *lkb_cache; + + +int __init dlm_memory_init(void) +{ + int ret = 0; + + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), + __alignof__(struct dlm_lkb), 0, NULL); + if (!lkb_cache) + ret = -ENOMEM; + return ret; +} + +void dlm_memory_exit(void) +{ + if (lkb_cache) + kmem_cache_destroy(lkb_cache); +} + +char *dlm_allocate_lvb(struct dlm_ls *ls) +{ + char *p; + + p = kzalloc(ls->ls_lvblen, GFP_NOFS); + return p; +} + +void dlm_free_lvb(char *p) +{ + kfree(p); +} + +/* FIXME: have some minimal space built-in to rsb for the name and + kmalloc a separate name if needed, like dentries are done */ + +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) +{ + struct dlm_rsb *r; + + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); + + r = kzalloc(sizeof(*r) + namelen, GFP_NOFS); + return r; +} + +void dlm_free_rsb(struct dlm_rsb *r) +{ + if (r->res_lvbptr) + dlm_free_lvb(r->res_lvbptr); + kfree(r); +} + +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + + lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS); + return lkb; +} + +void dlm_free_lkb(struct dlm_lkb *lkb) +{ + if (lkb->lkb_flags & DLM_IFL_USER) { + struct dlm_user_args *ua; + ua = lkb->lkb_ua; + if (ua) { + if (ua->lksb.sb_lvbptr) + kfree(ua->lksb.sb_lvbptr); + kfree(ua); + } + } + kmem_cache_free(lkb_cache, lkb); +} + diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h new file mode 100644 index 00000000..485fb291 --- /dev/null +++ b/fs/dlm/memory.h @@ -0,0 +1,27 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __MEMORY_DOT_H__ +#define __MEMORY_DOT_H__ + +int dlm_memory_init(void); +void dlm_memory_exit(void); +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen); +void dlm_free_rsb(struct dlm_rsb *r); +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); +void dlm_free_lkb(struct dlm_lkb *l); +char *dlm_allocate_lvb(struct dlm_ls *ls); +void dlm_free_lvb(char *l); + +#endif /* __MEMORY_DOT_H__ */ + diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c new file mode 100644 index 00000000..f3396c62 --- /dev/null +++ b/fs/dlm/midcomms.c @@ -0,0 +1,137 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * midcomms.c + * + * This is the appallingly named "mid-level" comms layer. + * + * Its purpose is to take packets from the "real" comms layer, + * split them up into packets and pass them to the interested + * part of the locking mechanism. + * + * It also takes messages from the locking layer, formats them + * into packets and sends them to the comms layer. + */ + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "config.h" +#include "lock.h" +#include "midcomms.h" + + +static void copy_from_cb(void *dst, const void *base, unsigned offset, + unsigned len, unsigned limit) +{ + unsigned copy = len; + + if ((copy + offset) > limit) + copy = limit - offset; + memcpy(dst, base + offset, copy); + len -= copy; + if (len) + memcpy(dst + copy, base, len); +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. + * + * Only complete messages are processed here, any "spare" bytes from + * the end of a buffer are saved and tacked onto the front of the next + * message that comes in. I doubt this will happen very often but we + * need to be able to cope with it and I don't want the task to be waiting + * for packets to come in when there is useful work to be done. + */ + +int dlm_process_incoming_buffer(int nodeid, const void *base, + unsigned offset, unsigned len, unsigned limit) +{ + union { + unsigned char __buf[DLM_INBUF_LEN]; + /* this is to force proper alignment on some arches */ + union dlm_packet p; + } __tmp; + union dlm_packet *p = &__tmp.p; + int ret = 0; + int err = 0; + uint16_t msglen; + uint32_t lockspace; + + while (len > sizeof(struct dlm_header)) { + + /* Copy just the header to check the total length. The + message may wrap around the end of the buffer back to the + start, so we need to use a temp buffer and copy_from_cb. 
*/ + + copy_from_cb(p, base, offset, sizeof(struct dlm_header), + limit); + + msglen = le16_to_cpu(p->header.h_length); + lockspace = p->header.h_lockspace; + + err = -EINVAL; + if (msglen < sizeof(struct dlm_header)) + break; + if (p->header.h_cmd == DLM_MSG) { + if (msglen < sizeof(struct dlm_message)) + break; + } else { + if (msglen < sizeof(struct dlm_rcom)) + break; + } + err = -E2BIG; + if (msglen > dlm_config.ci_buffer_size) { + log_print("message size %d from %d too big, buf len %d", + msglen, nodeid, len); + break; + } + err = 0; + + /* If only part of the full message is contained in this + buffer, then do nothing and wait for lowcomms to call + us again later with more data. We return 0 meaning + we've consumed none of the input buffer. */ + + if (msglen > len) + break; + + /* Allocate a larger temp buffer if the full message won't fit + in the buffer on the stack (which should work for most + ordinary messages). */ + + if (msglen > sizeof(__tmp) && p == &__tmp.p) { + p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); + if (p == NULL) + return ret; + } + + copy_from_cb(p, base, offset, msglen, limit); + + BUG_ON(lockspace != p->header.h_lockspace); + + ret += msglen; + offset += msglen; + offset &= (limit - 1); + len -= msglen; + + dlm_receive_buffer(p, nodeid); + } + + if (p != &__tmp.p) + kfree(p); + + return err ? err : ret; +} + diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h new file mode 100644 index 00000000..95852a5f --- /dev/null +++ b/fs/dlm/midcomms.h @@ -0,0 +1,21 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __MIDCOMMS_DOT_H__ +#define __MIDCOMMS_DOT_H__ + +int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, + unsigned len, unsigned limit); + +#endif /* __MIDCOMMS_DOT_H__ */ + diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c new file mode 100644 index 00000000..ef17e016 --- /dev/null +++ b/fs/dlm/netlink.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ */ + +#include <net/genetlink.h> +#include <linux/dlm.h> +#include <linux/dlm_netlink.h> +#include <linux/gfp.h> + +#include "dlm_internal.h" + +static uint32_t dlm_nl_seqnum; +static uint32_t listener_nlpid; + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = DLM_GENL_NAME, + .version = DLM_GENL_VERSION, +}; + +static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) +{ + struct sk_buff *skb; + void *data; + + skb = genlmsg_new(size, GFP_NOFS); + if (!skb) + return -ENOMEM; + + /* add the message headers */ + data = genlmsg_put(skb, 0, dlm_nl_seqnum++, &family, 0, cmd); + if (!data) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + return 0; +} + +static struct dlm_lock_data *mk_data(struct sk_buff *skb) +{ + struct nlattr *ret; + + ret = nla_reserve(skb, DLM_TYPE_LOCK, sizeof(struct dlm_lock_data)); + if (!ret) + return NULL; + return nla_data(ret); +} + +static int send_data(struct sk_buff *skb) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *data = genlmsg_data(genlhdr); + int rv; + + rv = genlmsg_end(skb, data); + if (rv < 0) { + nlmsg_free(skb); + return rv; + } + + return genlmsg_unicast(&init_net, skb, listener_nlpid); +} + +static int user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + listener_nlpid = info->snd_pid; + printk("user_cmd nlpid %u\n", listener_nlpid); + return 0; +} + +static struct genl_ops dlm_nl_ops = { + .cmd = DLM_CMD_HELLO, + .doit = user_cmd, +}; + +int __init dlm_netlink_init(void) +{ + return genl_register_family_with_ops(&family, &dlm_nl_ops, 1); +} + +void dlm_netlink_exit(void) +{ + genl_unregister_family(&family); +} + +static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb) +{ + struct dlm_rsb *r = lkb->lkb_resource; + + memset(data, 0, sizeof(struct dlm_lock_data)); + + data->version = DLM_LOCK_DATA_VERSION; + data->nodeid = lkb->lkb_nodeid; + data->ownpid = lkb->lkb_ownpid; + data->id = lkb->lkb_id; + data->remid = lkb->lkb_remid; + data->status = lkb->lkb_status; + data->grmode = lkb->lkb_grmode; + data->rqmode = lkb->lkb_rqmode; + if (lkb->lkb_ua) + data->xid = lkb->lkb_ua->xid; + if (r) { + data->lockspace_id = r->res_ls->ls_global_id; + data->resource_namelen = r->res_length; + memcpy(data->resource_name, r->res_name, r->res_length); + } +} + +void dlm_timeout_warn(struct dlm_lkb *lkb) +{ + struct sk_buff *uninitialized_var(send_skb); + struct dlm_lock_data *data; + size_t size; + int rv; + + size = nla_total_size(sizeof(struct dlm_lock_data)) + + nla_total_size(0); /* why this? */ + + rv = prepare_data(DLM_CMD_TIMEOUT, &send_skb, size); + if (rv < 0) + return; + + data = mk_data(send_skb); + if (!data) { + nlmsg_free(send_skb); + return; + } + + fill_data(data, lkb); + + send_data(send_skb); +} + diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c new file mode 100644 index 00000000..e2b87800 --- /dev/null +++ b/fs/dlm/plock.c @@ -0,0 +1,503 @@ +/* + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/poll.h> +#include <linux/dlm.h> +#include <linux/dlm_plock.h> +#include <linux/slab.h> + +#include "dlm_internal.h" +#include "lockspace.h" + +static spinlock_t ops_lock; +static struct list_head send_list; +static struct list_head recv_list; +static wait_queue_head_t send_wq; +static wait_queue_head_t recv_wq; + +struct plock_op { + struct list_head list; + int done; + struct dlm_plock_info info; +}; + +struct plock_xop { + struct plock_op xop; + void *callback; + void *fl; + void *file; + struct file_lock flc; +}; + + +static inline void set_version(struct dlm_plock_info *info) +{ + info->version[0] = DLM_PLOCK_VERSION_MAJOR; + info->version[1] = DLM_PLOCK_VERSION_MINOR; + info->version[2] = DLM_PLOCK_VERSION_PATCH; +} + +static int check_version(struct dlm_plock_info *info) +{ + if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || + (DLM_PLOCK_VERSION_MINOR < info->version[1])) { + log_print("plock device version mismatch: " + "kernel (%u.%u.%u), user (%u.%u.%u)", + DLM_PLOCK_VERSION_MAJOR, + DLM_PLOCK_VERSION_MINOR, + DLM_PLOCK_VERSION_PATCH, + info->version[0], + info->version[1], + info->version[2]); + return -EINVAL; + } + return 0; +} + +static void send_op(struct plock_op *op) +{ + set_version(&op->info); + INIT_LIST_HEAD(&op->list); + spin_lock(&ops_lock); + list_add_tail(&op->list, &send_list); + spin_unlock(&ops_lock); + wake_up(&send_wq); +} + +/* If a process was killed while waiting for the only plock on a file, + locks_remove_posix will not see any lock on the file so it won't + send an unlock-close to us to pass on to userspace to clean up the + abandoned waiter. So, we have to insert the unlock-close when the + lock call is interrupted. */ + +static void do_unlock_close(struct dlm_ls *ls, u64 number, + struct file *file, struct file_lock *fl) +{ + struct plock_op *op; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) + return; + + op->info.optype = DLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = 0; + op->info.end = OFFSET_MAX; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); +} + +int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + int cmd, struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + struct plock_xop *xop; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + xop = kzalloc(sizeof(*xop), GFP_NOFS); + if (!xop) { + rv = -ENOMEM; + goto out; + } + + op = &xop->xop; + op->info.optype = DLM_PLOCK_OP_LOCK; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.wait = IS_SETLKW(cmd); + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + /* fl_owner is lockd which doesn't distinguish + processes on the nfs client */ + op->info.owner = (__u64) fl->fl_pid; + xop->callback = fl->fl_lmops->fl_grant; + locks_init_lock(&xop->flc); + locks_copy_lock(&xop->flc, fl); + xop->fl = fl; + xop->file = file; + } else { + op->info.owner = (__u64)(long) fl->fl_owner; + xop->callback = NULL; + } + + send_op(op); + + if (xop->callback == NULL) { + rv = wait_event_killable(recv_wq, (op->done != 0)); + if (rv == -ERESTARTSYS) { + log_debug(ls, "dlm_posix_lock: wait killed %llx", + (unsigned long long)number); + spin_lock(&ops_lock); + list_del(&op->list); 
+ spin_unlock(&ops_lock); + kfree(xop); + do_unlock_close(ls, number, file, fl); + goto out; + } + } else { + rv = FILE_LOCK_DEFERRED; + goto out; + } + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_error(ls, "dlm_posix_lock: op on list %llx", + (unsigned long long)number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (!rv) { + if (posix_lock_file_wait(file, fl) < 0) + log_error(ls, "dlm_posix_lock: vfs lock error %llx", + (unsigned long long)number); + } + + kfree(xop); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_lock); + +/* Returns failure iff a successful lock operation should be canceled */ +static int dlm_plock_callback(struct plock_op *op) +{ + struct file *file; + struct file_lock *fl; + struct file_lock *flc; + int (*notify)(void *, void *, int) = NULL; + struct plock_xop *xop = (struct plock_xop *)op; + int rv = 0; + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_print("dlm_plock_callback: op on list %llx", + (unsigned long long)op->info.number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + /* check if the following 2 are still valid or make a copy */ + file = xop->file; + flc = &xop->flc; + fl = xop->fl; + notify = xop->callback; + + if (op->info.rv) { + notify(fl, NULL, op->info.rv); + goto out; + } + + /* got fs lock; bookkeep locally as well: */ + flc->fl_flags &= ~FL_SLEEP; + if (posix_lock_file(file, flc, NULL)) { + /* + * This can only happen in the case of kmalloc() failure. + * The filesystem's own lock is the authoritative lock, + * so a failure to get the lock locally is not a disaster. + * As long as the fs cannot reliably cancel locks (especially + * in a low-memory situation), we're better off ignoring + * this failure than trying to recover. 
+ */ + log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p", + (unsigned long long)op->info.number, file, fl); + } + + rv = notify(fl, NULL, 0); + if (rv) { + /* XXX: We need to cancel the fs lock here: */ + log_print("dlm_plock_callback: lock granted after lock request " + "failed; dangling lock!\n"); + goto out; + } + +out: + kfree(xop); + return rv; +} + +int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; + } + + if (posix_lock_file_wait(file, fl) < 0) + log_error(ls, "dlm_posix_unlock: vfs unlock error %llx", + (unsigned long long)number); + + op->info.optype = DLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + if (fl->fl_flags & FL_CLOSE) { + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); + rv = 0; + goto out; + } + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_error(ls, "dlm_posix_unlock: op on list %llx", + (unsigned long long)number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (rv == -ENOENT) + rv = 0; + + kfree(op); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_unlock); + +int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_ls *ls; + struct plock_op *op; + int rv; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; + } + + op->info.optype = DLM_PLOCK_OP_GET; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + log_error(ls, "dlm_posix_get: op on list %llx", + (unsigned long long)number); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + /* info.rv from userspace is 1 for conflict, 0 for no-conflict, + -ENOENT if there are no locks on the file */ + + rv = op->info.rv; + + fl->fl_type = F_UNLCK; + if (rv == -ENOENT) + rv = 0; + else if (rv > 0) { + locks_init_lock(fl); + fl->fl_type = (op->info.ex) ? 
F_WRLCK : F_RDLCK; + fl->fl_flags = FL_POSIX; + fl->fl_pid = op->info.pid; + fl->fl_start = op->info.start; + fl->fl_end = op->info.end; + rv = 0; + } + + kfree(op); +out: + dlm_put_lockspace(ls); + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_get); + +/* a read copies out one plock request from the send list */ +static ssize_t dev_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) +{ + struct dlm_plock_info info; + struct plock_op *op = NULL; + + if (count < sizeof(info)) + return -EINVAL; + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) { + op = list_entry(send_list.next, struct plock_op, list); + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + list_del(&op->list); + else + list_move(&op->list, &recv_list); + memcpy(&info, &op->info, sizeof(info)); + } + spin_unlock(&ops_lock); + + if (!op) + return -EAGAIN; + + /* there is no need to get a reply from userspace for unlocks + that were generated by the vfs cleaning up for a close + (the process did not make an unlock call). */ + + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + kfree(op); + + if (copy_to_user(u, &info, sizeof(info))) + return -EFAULT; + return sizeof(info); +} + +/* a write copies in one plock result that should match a plock_op + on the recv list */ +static ssize_t dev_write(struct file *file, const char __user *u, size_t count, + loff_t *ppos) +{ + struct dlm_plock_info info; + struct plock_op *op; + int found = 0, do_callback = 0; + + if (count != sizeof(info)) + return -EINVAL; + + if (copy_from_user(&info, u, sizeof(info))) + return -EFAULT; + + if (check_version(&info)) + return -EINVAL; + + spin_lock(&ops_lock); + list_for_each_entry(op, &recv_list, list) { + if (op->info.fsid == info.fsid && + op->info.number == info.number && + op->info.owner == info.owner) { + struct plock_xop *xop = (struct plock_xop *)op; + list_del_init(&op->list); + memcpy(&op->info, &info, sizeof(info)); + if (xop->callback) + do_callback = 1; + else + op->done = 1; + found = 1; + break; + } + } + spin_unlock(&ops_lock); + + if (found) { + if (do_callback) + dlm_plock_callback(op); + else + wake_up(&recv_wq); + } else + log_print("dev_write no op %x %llx", info.fsid, + (unsigned long long)info.number); + return count; +} + +static unsigned int dev_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = 0; + + poll_wait(file, &send_wq, wait); + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) + mask = POLLIN | POLLRDNORM; + spin_unlock(&ops_lock); + + return mask; +} + +static const struct file_operations dev_fops = { + .read = dev_read, + .write = dev_write, + .poll = dev_poll, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice plock_dev_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DLM_PLOCK_MISC_NAME, + .fops = &dev_fops +}; + +int dlm_plock_init(void) +{ + int rv; + + spin_lock_init(&ops_lock); + INIT_LIST_HEAD(&send_list); + INIT_LIST_HEAD(&recv_list); + init_waitqueue_head(&send_wq); + init_waitqueue_head(&recv_wq); + + rv = misc_register(&plock_dev_misc); + if (rv) + log_print("dlm_plock_init: misc_register failed %d", rv); + return rv; +} + +void dlm_plock_exit(void) +{ + if (misc_deregister(&plock_dev_misc) < 0) + log_print("dlm_plock_exit: misc_deregister failed"); +} + diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c new file mode 100644 index 00000000..f10a50f2 --- /dev/null +++ b/fs/dlm/rcom.c @@ -0,0 +1,511 @@ +/****************************************************************************** +******************************************************************************* 
+** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "rcom.h" +#include "recover.h" +#include "dir.h" +#include "config.h" +#include "memory.h" +#include "lock.h" +#include "util.h" + + +static int rcom_response(struct dlm_ls *ls) +{ + return test_bit(LSFL_RCOM_READY, &ls->ls_flags); +} + +static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + char *mb; + int mb_len = sizeof(struct dlm_rcom) + len; + + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) { + log_print("create_rcom to %d type %d len %d ENOBUFS", + to_nodeid, type, len); + return -ENOBUFS; + } + memset(mb, 0, mb_len); + + rc = (struct dlm_rcom *) mb; + + rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.h_lockspace = ls->ls_global_id; + rc->rc_header.h_nodeid = dlm_our_nodeid(); + rc->rc_header.h_length = mb_len; + rc->rc_header.h_cmd = DLM_RCOM; + + rc->rc_type = type; + + spin_lock(&ls->ls_recover_lock); + rc->rc_seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + + *mh_ret = mh; + *rc_ret = rc; + return 0; +} + +static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, + struct dlm_rcom *rc) +{ + dlm_rcom_out(rc); + dlm_lowcomms_commit_buffer(mh); +} + +/* When replying to a status request, a node also sends back its + configuration values. The requesting node then checks that the remote + node is configured the same way as itself. 
*/ + +static void make_config(struct dlm_ls *ls, struct rcom_config *rf) +{ + rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); + rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); +} + +static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +{ + struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; + size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); + + if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { + log_error(ls, "version mismatch: %x nodeid %d: %x", + DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid, + rc->rc_header.h_version); + return -EPROTO; + } + + if (rc->rc_header.h_length < conf_size) { + log_error(ls, "config too short: %d nodeid %d", + rc->rc_header.h_length, nodeid); + return -EPROTO; + } + + if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || + le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { + log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", + ls->ls_lvblen, ls->ls_exflags, nodeid, + le32_to_cpu(rf->rf_lvblen), + le32_to_cpu(rf->rf_lsflags)); + return -EPROTO; + } + return 0; +} + +static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq) +{ + spin_lock(&ls->ls_rcom_spin); + *new_seq = ++ls->ls_rcom_seq; + set_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +static void disallow_sync_reply(struct dlm_ls *ls) +{ + spin_lock(&ls->ls_rcom_spin); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error = 0; + + ls->ls_recover_nodeid = nodeid; + + if (nodeid == dlm_our_nodeid()) { + rc = ls->ls_recover_buf; + rc->rc_result = dlm_recover_status(ls); + goto out; + } + + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); + if (error) + goto out; + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); + + send_rcom(ls, mh, rc); + + error = dlm_wait_function(ls, &rcom_response); + disallow_sync_reply(ls); + if (error) + goto out; + + rc = ls->ls_recover_buf; + + if (rc->rc_result == -ESRCH) { + /* we pretend the remote lockspace exists with 0 status */ + log_debug(ls, "remote node %d not ready", nodeid); + rc->rc_result = 0; + } else + error = check_config(ls, rc, nodeid); + /* the caller looks at rc_result for the remote recovery status */ + out: + return error; +} + +static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, nodeid = rc_in->rc_header.h_nodeid; + + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, + sizeof(struct rcom_config), &rc, &mh); + if (error) + return; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + rc->rc_result = dlm_recover_status(ls); + make_config(ls, (struct rcom_config *) rc->rc_buf); + + send_rcom(ls, mh, rc); +} + +static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + spin_lock(&ls->ls_rcom_spin); + if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || + rc_in->rc_id != ls->ls_rcom_seq) { + log_debug(ls, "reject reply %d from %d seq %llx expect %llx", + rc_in->rc_type, rc_in->rc_header.h_nodeid, + (unsigned long long)rc_in->rc_id, + (unsigned long long)ls->ls_rcom_seq); + goto out; + } + memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length); + set_bit(LSFL_RCOM_READY, &ls->ls_flags); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + wake_up(&ls->ls_wait_general); + out: + 
spin_unlock(&ls->ls_rcom_spin); +} + +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error = 0; + int max_size = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom); + + ls->ls_recover_nodeid = nodeid; + + if (nodeid == dlm_our_nodeid()) { + ls->ls_recover_buf->rc_header.h_length = + dlm_config.ci_buffer_size; + dlm_copy_master_names(ls, last_name, last_len, + ls->ls_recover_buf->rc_buf, + max_size, nodeid); + goto out; + } + + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); + if (error) + goto out; + memcpy(rc->rc_buf, last_name, last_len); + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); + + send_rcom(ls, mh, rc); + + error = dlm_wait_function(ls, &rcom_response); + disallow_sync_reply(ls); + out: + return error; +} + +static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, inlen, outlen, nodeid; + + nodeid = rc_in->rc_header.h_nodeid; + inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); + outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom); + + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); + if (error) + return; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, + nodeid); + send_rcom(ls, mh, rc); +} + +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct dlm_ls *ls = r->res_ls; + int error; + + error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length, + &rc, &mh); + if (error) + goto out; + memcpy(rc->rc_buf, r->res_name, r->res_length); + rc->rc_id = (unsigned long) r; + + send_rcom(ls, mh, rc); + out: + return error; +} + +static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid; + int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); + + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); + if (error) + return; + + error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid); + if (error) + ret_nodeid = error; + rc->rc_result = ret_nodeid; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + send_rcom(ls, mh, rc); +} + +static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + dlm_recover_master_reply(ls, rc_in); +} + +static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, + struct rcom_lock *rl) +{ + memset(rl, 0, sizeof(*rl)); + + rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid); + rl->rl_lkid = cpu_to_le32(lkb->lkb_id); + rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags); + rl->rl_flags = cpu_to_le32(lkb->lkb_flags); + rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq); + rl->rl_rqmode = lkb->lkb_rqmode; + rl->rl_grmode = lkb->lkb_grmode; + rl->rl_status = lkb->lkb_status; + rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type); + + if (lkb->lkb_bastfn) + rl->rl_asts |= DLM_CB_BAST; + if (lkb->lkb_astfn) + rl->rl_asts |= DLM_CB_CAST; + + rl->rl_namelen = cpu_to_le16(r->res_length); + memcpy(rl->rl_name, r->res_name, r->res_length); + + /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ? + If so, receive_rcom_lock_args() won't take this copy. 
*/ + + if (lkb->lkb_lvbptr) + memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); +} + +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +{ + struct dlm_ls *ls = r->res_ls; + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + struct rcom_lock *rl; + int error, len = sizeof(struct rcom_lock); + + if (lkb->lkb_lvbptr) + len += ls->ls_lvblen; + + error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh); + if (error) + goto out; + + rl = (struct rcom_lock *) rc->rc_buf; + pack_rcom_lock(r, lkb, rl); + rc->rc_id = (unsigned long) r; + + send_rcom(ls, mh, rc); + out: + return error; +} + +/* needs at least dlm_rcom + rcom_lock */ +static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct dlm_mhandle *mh; + int error, nodeid = rc_in->rc_header.h_nodeid; + + dlm_recover_master_copy(ls, rc_in); + + error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY, + sizeof(struct rcom_lock), &rc, &mh); + if (error) + return; + + /* We send back the same rcom_lock struct we received, but + dlm_recover_master_copy() has filled in rl_remid and rl_result */ + + memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + + send_rcom(ls, mh, rc); +} + +/* If the lockspace doesn't exist then still send a status message + back; it's possible that it just doesn't have its global_id yet. */ + +int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) +{ + struct dlm_rcom *rc; + struct rcom_config *rf; + struct dlm_mhandle *mh; + char *mb; + int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); + + mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) + return -ENOBUFS; + memset(mb, 0, mb_len); + + rc = (struct dlm_rcom *) mb; + + rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace; + rc->rc_header.h_nodeid = dlm_our_nodeid(); + rc->rc_header.h_length = mb_len; + rc->rc_header.h_cmd = DLM_RCOM; + + rc->rc_type = DLM_RCOM_STATUS_REPLY; + rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; + rc->rc_result = -ESRCH; + + rf = (struct rcom_config *) rc->rc_buf; + rf->rf_lvblen = cpu_to_le32(~0U); + + dlm_rcom_out(rc); + dlm_lowcomms_commit_buffer(mh); + + return 0; +} + +static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + uint64_t seq; + int rv = 0; + + switch (rc->rc_type) { + case DLM_RCOM_STATUS_REPLY: + case DLM_RCOM_NAMES_REPLY: + case DLM_RCOM_LOOKUP_REPLY: + case DLM_RCOM_LOCK_REPLY: + spin_lock(&ls->ls_recover_lock); + seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + if (rc->rc_seq_reply != seq) { + log_debug(ls, "ignoring old reply %x from %d " + "seq_reply %llx expect %llx", + rc->rc_type, rc->rc_header.h_nodeid, + (unsigned long long)rc->rc_seq_reply, + (unsigned long long)seq); + rv = 1; + } + } + return rv; +} + +/* Called by dlm_recv; corresponds to dlm_receive_message() but special + recovery-only comms are sent through here. 
*/ + +void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +{ + int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); + + if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { + log_debug(ls, "ignoring recovery message %x from %d", + rc->rc_type, nodeid); + goto out; + } + + if (is_old_reply(ls, rc)) + goto out; + + switch (rc->rc_type) { + case DLM_RCOM_STATUS: + receive_rcom_status(ls, rc); + break; + + case DLM_RCOM_NAMES: + receive_rcom_names(ls, rc); + break; + + case DLM_RCOM_LOOKUP: + receive_rcom_lookup(ls, rc); + break; + + case DLM_RCOM_LOCK: + if (rc->rc_header.h_length < lock_size) + goto Eshort; + receive_rcom_lock(ls, rc); + break; + + case DLM_RCOM_STATUS_REPLY: + receive_sync_reply(ls, rc); + break; + + case DLM_RCOM_NAMES_REPLY: + receive_sync_reply(ls, rc); + break; + + case DLM_RCOM_LOOKUP_REPLY: + receive_rcom_lookup_reply(ls, rc); + break; + + case DLM_RCOM_LOCK_REPLY: + if (rc->rc_header.h_length < lock_size) + goto Eshort; + dlm_recover_process_copy(ls, rc); + break; + + default: + log_error(ls, "receive_rcom bad type %d", rc->rc_type); + } +out: + return; +Eshort: + log_error(ls, "recovery message %x from %d is too short", + rc->rc_type, nodeid); +} + diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h new file mode 100644 index 00000000..b09abd29 --- /dev/null +++ b/fs/dlm/rcom.h @@ -0,0 +1,25 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __RCOM_DOT_H__ +#define __RCOM_DOT_H__ + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid); +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); +void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); +int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in); + +#endif + diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c new file mode 100644 index 00000000..14638235 --- /dev/null +++ b/fs/dlm/recover.c @@ -0,0 +1,789 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "dir.h" +#include "config.h" +#include "ast.h" +#include "memory.h" +#include "rcom.h" +#include "lock.h" +#include "lowcomms.h" +#include "member.h" +#include "recover.h" + + +/* + * Recovery waiting routines: these functions wait for a particular reply from + * a remote node, or for the remote node to report a certain status. They need + * to abort if the lockspace is stopped indicating a node has failed (perhaps + * the one being waited for). + */ + +/* + * Wait until given function returns non-zero or lockspace is stopped + * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another + * function thinks it could have completed the waited-on task, they should wake + * up ls_wait_general to get an immediate response rather than waiting for the + * timer to detect the result. A timer wakes us up periodically while waiting + * to see if we should abort due to a node failure. This should only be called + * by the dlm_recoverd thread. + */ + +static void dlm_wait_timer_fn(unsigned long data) +{ + struct dlm_ls *ls = (struct dlm_ls *) data; + mod_timer(&ls->ls_timer, jiffies + (dlm_config.ci_recover_timer * HZ)); + wake_up(&ls->ls_wait_general); +} + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) +{ + int error = 0; + + init_timer(&ls->ls_timer); + ls->ls_timer.function = dlm_wait_timer_fn; + ls->ls_timer.data = (long) ls; + ls->ls_timer.expires = jiffies + (dlm_config.ci_recover_timer * HZ); + add_timer(&ls->ls_timer); + + wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls)); + del_timer_sync(&ls->ls_timer); + + if (dlm_recovery_stopped(ls)) { + log_debug(ls, "dlm_wait_function aborted"); + error = -EINTR; + } + return error; +} + +/* + * An efficient way for all nodes to wait for all others to have a certain + * status. The node with the lowest nodeid polls all the others for their + * status (wait_status_all) and all the others poll the node with the low id + * for its accumulated result (wait_status_low). When all nodes have set + * status flag X, then status flag X_ALL will be set on the low nodeid. 
+ */ + +uint32_t dlm_recover_status(struct dlm_ls *ls) +{ + uint32_t status; + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + spin_unlock(&ls->ls_recover_lock); + return status; +} + +void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) +{ + spin_lock(&ls->ls_recover_lock); + ls->ls_recover_status |= status; + spin_unlock(&ls->ls_recover_lock); +} + +static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) +{ + struct dlm_rcom *rc = ls->ls_recover_buf; + struct dlm_member *memb; + int error = 0, delay; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + delay = 0; + for (;;) { + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto out; + } + + error = dlm_rcom_status(ls, memb->nodeid); + if (error) + goto out; + + if (rc->rc_result & wait_status) + break; + if (delay < 1000) + delay += 20; + msleep(delay); + } + } + out: + return error; +} + +static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) +{ + struct dlm_rcom *rc = ls->ls_recover_buf; + int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; + + for (;;) { + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + goto out; + } + + error = dlm_rcom_status(ls, nodeid); + if (error) + break; + + if (rc->rc_result & wait_status) + break; + if (delay < 1000) + delay += 20; + msleep(delay); + } + out: + return error; +} + +static int wait_status(struct dlm_ls *ls, uint32_t status) +{ + uint32_t status_all = status << 1; + int error; + + if (ls->ls_low_nodeid == dlm_our_nodeid()) { + error = wait_status_all(ls, status); + if (!error) + dlm_set_recover_status(ls, status_all); + } else + error = wait_status_low(ls, status_all); + + return error; +} + +int dlm_recover_members_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_NODES); +} + +int dlm_recover_directory_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_DIR); +} + +int dlm_recover_locks_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_LOCKS); +} + +int dlm_recover_done_wait(struct dlm_ls *ls) +{ + return wait_status(ls, DLM_RS_DONE); +} + +/* + * The recover_list contains all the rsb's for which we've requested the new + * master nodeid. As replies are returned from the resource directories the + * rsb's are removed from the list. When the list is empty we're done. + * + * The recover_list is later similarly used for all rsb's for which we've sent + * new lkb's and need to receive new corresponding lkid's. + * + * We use the address of the rsb struct as a simple local identifier for the + * rsb so we can match an rcom reply with the rsb it was sent for. 
+ */ + +static int recover_list_empty(struct dlm_ls *ls) +{ + int empty; + + spin_lock(&ls->ls_recover_list_lock); + empty = list_empty(&ls->ls_recover_list); + spin_unlock(&ls->ls_recover_list_lock); + + return empty; +} + +static void recover_list_add(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + if (list_empty(&r->res_recover_list)) { + list_add_tail(&r->res_recover_list, &ls->ls_recover_list); + ls->ls_recover_list_count++; + dlm_hold_rsb(r); + } + spin_unlock(&ls->ls_recover_list_lock); +} + +static void recover_list_del(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + + spin_lock(&ls->ls_recover_list_lock); + list_del_init(&r->res_recover_list); + ls->ls_recover_list_count--; + spin_unlock(&ls->ls_recover_list_lock); + + dlm_put_rsb(r); +} + +static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id) +{ + struct dlm_rsb *r = NULL; + + spin_lock(&ls->ls_recover_list_lock); + + list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) { + if (id == (unsigned long) r) + goto out; + } + r = NULL; + out: + spin_unlock(&ls->ls_recover_list_lock); + return r; +} + +static void recover_list_clear(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *s; + + spin_lock(&ls->ls_recover_list_lock); + list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) { + list_del_init(&r->res_recover_list); + r->res_recover_locks_count = 0; + dlm_put_rsb(r); + ls->ls_recover_list_count--; + } + + if (ls->ls_recover_list_count != 0) { + log_error(ls, "warning: recover_list_count %d", + ls->ls_recover_list_count); + ls->ls_recover_list_count = 0; + } + spin_unlock(&ls->ls_recover_list_lock); +} + + +/* Master recovery: find new master node for rsb's that were + mastered on nodes that have been removed. + + dlm_recover_masters + recover_master + dlm_send_rcom_lookup -> receive_rcom_lookup + dlm_dir_lookup + receive_rcom_lookup_reply <- + dlm_recover_master_reply + set_new_master + set_master_lkbs + set_lock_master +*/ + +/* + * Set the lock master for all LKBs in a lock queue + * If we are the new master of the rsb, we may have received new + * MSTCPY locks from other nodes already which we need to ignore + * when setting the new nodeid. + */ + +static void set_lock_master(struct list_head *queue, int nodeid) +{ + struct dlm_lkb *lkb; + + list_for_each_entry(lkb, queue, lkb_statequeue) + if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) + lkb->lkb_nodeid = nodeid; +} + +static void set_master_lkbs(struct dlm_rsb *r) +{ + set_lock_master(&r->res_grantqueue, r->res_nodeid); + set_lock_master(&r->res_convertqueue, r->res_nodeid); + set_lock_master(&r->res_waitqueue, r->res_nodeid); +} + +/* + * Propagate the new master nodeid to locks + * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. + * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which + * rsb's to consider. + */ + +static void set_new_master(struct dlm_rsb *r, int nodeid) +{ + lock_rsb(r); + r->res_nodeid = nodeid; + set_master_lkbs(r); + rsb_set_flag(r, RSB_NEW_MASTER); + rsb_set_flag(r, RSB_NEW_MASTER2); + unlock_rsb(r); +} + +/* + * We do async lookups on rsb's that need new masters. The rsb's + * waiting for a lookup reply are kept on the recover_list. 
+ */ + +static int recover_master(struct dlm_rsb *r) +{ + struct dlm_ls *ls = r->res_ls; + int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); + + dir_nodeid = dlm_dir_nodeid(r); + + if (dir_nodeid == our_nodeid) { + error = dlm_dir_lookup(ls, our_nodeid, r->res_name, + r->res_length, &ret_nodeid); + if (error) + log_error(ls, "recover dir lookup error %d", error); + + if (ret_nodeid == our_nodeid) + ret_nodeid = 0; + set_new_master(r, ret_nodeid); + } else { + recover_list_add(r); + error = dlm_send_rcom_lookup(r, dir_nodeid); + } + + return error; +} + +/* + * When not using a directory, most resource names will hash to a new static + * master nodeid and the resource will need to be remastered. + */ + +static int recover_master_static(struct dlm_rsb *r) +{ + int master = dlm_dir_nodeid(r); + + if (master == dlm_our_nodeid()) + master = 0; + + if (r->res_nodeid != master) { + if (is_master(r)) + dlm_purge_mstcpy_locks(r); + set_new_master(r, master); + return 1; + } + return 0; +} + +/* + * Go through local root resources and for each rsb which has a master which + * has departed, get the new master nodeid from the directory. The dir will + * assign mastery to the first node to look up the new master. That means + * we'll discover in this lookup if we're the new master of any rsb's. + * + * We fire off all the dir lookup requests individually and asynchronously to + * the correct dir node. + */ + +int dlm_recover_masters(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int error = 0, count = 0; + + log_debug(ls, "dlm_recover_masters"); + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (dlm_recovery_stopped(ls)) { + up_read(&ls->ls_root_sem); + error = -EINTR; + goto out; + } + + if (dlm_no_directory(ls)) + count += recover_master_static(r); + else if (!is_master(r) && + (dlm_is_removed(ls, r->res_nodeid) || + rsb_flag(r, RSB_NEW_MASTER))) { + recover_master(r); + count++; + } + + schedule(); + } + up_read(&ls->ls_root_sem); + + log_debug(ls, "dlm_recover_masters %d resources", count); + + error = dlm_wait_function(ls, &recover_list_empty); + out: + if (error) + recover_list_clear(ls); + return error; +} + +int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct dlm_rsb *r; + int nodeid; + + r = recover_list_find(ls, rc->rc_id); + if (!r) { + log_error(ls, "dlm_recover_master_reply no id %llx", + (unsigned long long)rc->rc_id); + goto out; + } + + nodeid = rc->rc_result; + if (nodeid == dlm_our_nodeid()) + nodeid = 0; + + set_new_master(r, nodeid); + recover_list_del(r); + + if (recover_list_empty(ls)) + wake_up(&ls->ls_wait_general); + out: + return 0; +} + + +/* Lock recovery: rebuild the process-copy locks we hold on a + remastered rsb on the new rsb master. 
+ + dlm_recover_locks + recover_locks + recover_locks_queue + dlm_send_rcom_lock -> receive_rcom_lock + dlm_recover_master_copy + receive_rcom_lock_reply <- + dlm_recover_process_copy +*/ + + +/* + * keep a count of the number of lkb's we send to the new master; when we get + * an equal number of replies then recovery for the rsb is done + */ + +static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head) +{ + struct dlm_lkb *lkb; + int error = 0; + + list_for_each_entry(lkb, head, lkb_statequeue) { + error = dlm_send_rcom_lock(r, lkb); + if (error) + break; + r->res_recover_locks_count++; + } + + return error; +} + +static int recover_locks(struct dlm_rsb *r) +{ + int error = 0; + + lock_rsb(r); + + DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r);); + + error = recover_locks_queue(r, &r->res_grantqueue); + if (error) + goto out; + error = recover_locks_queue(r, &r->res_convertqueue); + if (error) + goto out; + error = recover_locks_queue(r, &r->res_waitqueue); + if (error) + goto out; + + if (r->res_recover_locks_count) + recover_list_add(r); + else + rsb_clear_flag(r, RSB_NEW_MASTER); + out: + unlock_rsb(r); + return error; +} + +int dlm_recover_locks(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int error, count = 0; + + log_debug(ls, "dlm_recover_locks"); + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (is_master(r)) { + rsb_clear_flag(r, RSB_NEW_MASTER); + continue; + } + + if (!rsb_flag(r, RSB_NEW_MASTER)) + continue; + + if (dlm_recovery_stopped(ls)) { + error = -EINTR; + up_read(&ls->ls_root_sem); + goto out; + } + + error = recover_locks(r); + if (error) { + up_read(&ls->ls_root_sem); + goto out; + } + + count += r->res_recover_locks_count; + } + up_read(&ls->ls_root_sem); + + log_debug(ls, "dlm_recover_locks %d locks", count); + + error = dlm_wait_function(ls, &recover_list_empty); + out: + if (error) + recover_list_clear(ls); + else + dlm_set_recover_status(ls, DLM_RS_LOCKS); + return error; +} + +void dlm_recovered_lock(struct dlm_rsb *r) +{ + DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r);); + + r->res_recover_locks_count--; + if (!r->res_recover_locks_count) { + rsb_clear_flag(r, RSB_NEW_MASTER); + recover_list_del(r); + } + + if (recover_list_empty(r->res_ls)) + wake_up(&r->res_ls->ls_wait_general); +} + +/* + * The lvb needs to be recovered on all master rsb's. This includes setting + * the VALNOTVALID flag if necessary, and determining the correct lvb contents + * based on the lvb's of the locks held on the rsb. + * + * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it + * was already set prior to recovery, it's not cleared, regardless of locks. + * + * The LVB contents are only considered for changing when this is a new master + * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with + * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken + * from the lkb with the largest lvb sequence number. 
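Editor's note: recover_lvb() below picks the lvb source by lock mode first and, failing that, by the highest lvb sequence number, using a subtraction-based comparison so that a wrapped 32-bit counter still orders correctly. A standalone sketch of that comparison (written in the cleaner unsigned-subtraction form; illustrative only):

#include <assert.h>
#include <stdint.h>

/* "a is at least as new as b" for a free-running 32-bit sequence counter.
 * The signed difference handles wraparound as long as the two values are
 * within 2^31 of each other. */
static int seq_newer_or_equal(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	assert(seq_newer_or_equal(5, 5));
	assert(seq_newer_or_equal(6, 5));
	assert(!seq_newer_or_equal(5, 6));
	/* wraparound: 0x00000002 was generated after 0xfffffffe */
	assert(seq_newer_or_equal(0x00000002u, 0xfffffffeu));
	return 0;
}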
+ */ + +static void recover_lvb(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb, *high_lkb = NULL; + uint32_t high_seq = 0; + int lock_lvb_exists = 0; + int big_lock_exists = 0; + int lvblen = r->res_ls->ls_lvblen; + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + continue; + + lock_lvb_exists = 1; + + if (lkb->lkb_grmode > DLM_LOCK_CR) { + big_lock_exists = 1; + goto setflag; + } + + if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = lkb; + high_seq = lkb->lkb_lvbseq; + } + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + continue; + + lock_lvb_exists = 1; + + if (lkb->lkb_grmode > DLM_LOCK_CR) { + big_lock_exists = 1; + goto setflag; + } + + if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = lkb; + high_seq = lkb->lkb_lvbseq; + } + } + + setflag: + if (!lock_lvb_exists) + goto out; + + if (!big_lock_exists) + rsb_set_flag(r, RSB_VALNOTVALID); + + /* don't mess with the lvb unless we're the new master */ + if (!rsb_flag(r, RSB_NEW_MASTER2)) + goto out; + + if (!r->res_lvbptr) { + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + if (!r->res_lvbptr) + goto out; + } + + if (big_lock_exists) { + r->res_lvbseq = lkb->lkb_lvbseq; + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen); + } else if (high_lkb) { + r->res_lvbseq = high_lkb->lkb_lvbseq; + memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen); + } else { + r->res_lvbseq = 0; + memset(r->res_lvbptr, 0, lvblen); + } + out: + return; +} + +/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks + converting PR->CW or CW->PR need to have their lkb_grmode set. */ + +static void recover_conversion(struct dlm_rsb *r) +{ + struct dlm_lkb *lkb; + int grmode = -1; + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + if (lkb->lkb_grmode == DLM_LOCK_PR || + lkb->lkb_grmode == DLM_LOCK_CW) { + grmode = lkb->lkb_grmode; + break; + } + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + if (lkb->lkb_grmode != DLM_LOCK_IV) + continue; + if (grmode == -1) + lkb->lkb_grmode = lkb->lkb_rqmode; + else + lkb->lkb_grmode = grmode; + } +} + +/* We've become the new master for this rsb and waiting/converting locks may + need to be granted in dlm_grant_after_purge() due to locks that may have + existed from a removed node. 
*/ + +static void set_locks_purged(struct dlm_rsb *r) +{ + if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) + rsb_set_flag(r, RSB_LOCKS_PURGED); +} + +void dlm_recover_rsbs(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int count = 0; + + log_debug(ls, "dlm_recover_rsbs"); + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + lock_rsb(r); + if (is_master(r)) { + if (rsb_flag(r, RSB_RECOVER_CONVERT)) + recover_conversion(r); + if (rsb_flag(r, RSB_NEW_MASTER2)) + set_locks_purged(r); + recover_lvb(r); + count++; + } + rsb_clear_flag(r, RSB_RECOVER_CONVERT); + rsb_clear_flag(r, RSB_NEW_MASTER2); + unlock_rsb(r); + } + up_read(&ls->ls_root_sem); + + log_debug(ls, "dlm_recover_rsbs %d rsbs", count); +} + +/* Create a single list of all root rsb's to be used during recovery */ + +int dlm_create_root_list(struct dlm_ls *ls) +{ + struct dlm_rsb *r; + int i, error = 0; + + down_write(&ls->ls_root_sem); + if (!list_empty(&ls->ls_root_list)) { + log_error(ls, "root list not empty"); + error = -EINVAL; + goto out; + } + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { + list_add(&r->res_root_list, &ls->ls_root_list); + dlm_hold_rsb(r); + } + + /* If we're using a directory, add tossed rsbs to the root + list; they'll have entries created in the new directory, + but no other recovery steps should do anything with them. */ + + if (dlm_no_directory(ls)) { + spin_unlock(&ls->ls_rsbtbl[i].lock); + continue; + } + + list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { + list_add(&r->res_root_list, &ls->ls_root_list); + dlm_hold_rsb(r); + } + spin_unlock(&ls->ls_rsbtbl[i].lock); + } + out: + up_write(&ls->ls_root_sem); + return error; +} + +void dlm_release_root_list(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *safe; + + down_write(&ls->ls_root_sem); + list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) { + list_del_init(&r->res_root_list); + dlm_put_rsb(r); + } + up_write(&ls->ls_root_sem); +} + +/* If not using a directory, clear the entire toss list, there's no benefit to + caching the master value since it's fixed. If we are using a dir, keep the + rsb's we're the master of. Recovery will add them to the root list and from + there they'll be entered in the rebuilt directory. */ + +void dlm_clear_toss_list(struct dlm_ls *ls) +{ + struct dlm_rsb *r, *safe; + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, + res_hashchain) { + if (dlm_no_directory(ls) || !is_master(r)) { + list_del(&r->res_hashchain); + dlm_free_rsb(r); + } + } + spin_unlock(&ls->ls_rsbtbl[i].lock); + } +} + diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h new file mode 100644 index 00000000..ebd0363f --- /dev/null +++ b/fs/dlm/recover.h @@ -0,0 +1,34 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVER_DOT_H__ +#define __RECOVER_DOT_H__ + +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)); +uint32_t dlm_recover_status(struct dlm_ls *ls); +void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status); +int dlm_recover_members_wait(struct dlm_ls *ls); +int dlm_recover_directory_wait(struct dlm_ls *ls); +int dlm_recover_locks_wait(struct dlm_ls *ls); +int dlm_recover_done_wait(struct dlm_ls *ls); +int dlm_recover_masters(struct dlm_ls *ls); +int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_recover_locks(struct dlm_ls *ls); +void dlm_recovered_lock(struct dlm_rsb *r); +int dlm_create_root_list(struct dlm_ls *ls); +void dlm_release_root_list(struct dlm_ls *ls); +void dlm_clear_toss_list(struct dlm_ls *ls); +void dlm_recover_rsbs(struct dlm_ls *ls); + +#endif /* __RECOVER_DOT_H__ */ + diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c new file mode 100644 index 00000000..fd677c8c --- /dev/null +++ b/fs/dlm/recoverd.c @@ -0,0 +1,323 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lockspace.h" +#include "member.h" +#include "dir.h" +#include "ast.h" +#include "recover.h" +#include "lowcomms.h" +#include "lock.h" +#include "requestqueue.h" +#include "recoverd.h" + + +/* If the start for which we're re-enabling locking (seq) has been superseded + by a newer stop (ls_recover_seq), we need to leave locking disabled. + + We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees + locking stopped and b) adds a message to the requestqueue, but dlm_recoverd + enables locking and clears the requestqueue between a and b. */ + +static int enable_locking(struct dlm_ls *ls, uint64_t seq) +{ + int error = -EINTR; + + down_write(&ls->ls_recv_active); + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_recover_seq == seq) { + set_bit(LSFL_RUNNING, &ls->ls_flags); + /* unblocks processes waiting to enter the dlm */ + up_write(&ls->ls_in_recovery); + error = 0; + } + spin_unlock(&ls->ls_recover_lock); + + up_write(&ls->ls_recv_active); + return error; +} + +static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) +{ + unsigned long start; + int error, neg = 0; + + log_debug(ls, "recover %llx", (unsigned long long)rv->seq); + + mutex_lock(&ls->ls_recoverd_active); + + /* + * Suspending and resuming dlm_astd ensures that no lkb's from this ls + * will be processed by dlm_astd during recovery. + */ + + dlm_astd_suspend(); + dlm_astd_resume(); + + /* + * Free non-master tossed rsb's. Master rsb's are kept on toss + * list and put on root list to be included in resdir recovery. + */ + + dlm_clear_toss_list(ls); + + /* + * This list of root rsb's will be the basis of most of the recovery + * routines. 
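Editor's note: enable_locking() above only re-enables locking if the start it is finishing (rv->seq) has not been superseded by a newer stop, and it makes that test and the flag set atomically under ls_recover_lock so a concurrent dlm_ls_stop() cannot slip in between them. A userspace sketch of the same guard, with a pthread mutex standing in for the spinlock and all names invented:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t recover_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t recover_seq;     /* bumped by every "stop" event   */
static int running;              /* 1 once locking is re-enabled   */

/* A stop event supersedes any recovery that started earlier. */
static void stop_event(void)
{
	pthread_mutex_lock(&recover_lock);
	recover_seq++;
	running = 0;
	pthread_mutex_unlock(&recover_lock);
}

/* Re-enable locking only if no newer stop arrived while we recovered. */
static int enable_if_current(uint64_t seq_we_recovered)
{
	int ok = 0;

	pthread_mutex_lock(&recover_lock);
	if (recover_seq == seq_we_recovered) {
		running = 1;
		ok = 1;
	}
	pthread_mutex_unlock(&recover_lock);
	return ok;
}

int main(void)
{
	uint64_t seq;

	pthread_mutex_lock(&recover_lock);
	seq = ++recover_seq;                   /* the "start" we work on        */
	pthread_mutex_unlock(&recover_lock);

	stop_event();                          /* a newer stop supersedes it    */
	printf("enabled: %d (expect 0)\n", enable_if_current(seq));
	return 0;
}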
+ */ + + dlm_create_root_list(ls); + + /* + * Add or remove nodes from the lockspace's ls_nodes list. + * Also waits for all nodes to complete dlm_recover_members. + */ + + error = dlm_recover_members(ls, rv, &neg); + if (error) { + log_debug(ls, "recover_members failed %d", error); + goto fail; + } + start = jiffies; + + /* + * Rebuild our own share of the directory by collecting from all other + * nodes their master rsb names that hash to us. + */ + + error = dlm_recover_directory(ls); + if (error) { + log_debug(ls, "recover_directory failed %d", error); + goto fail; + } + + /* + * Wait for all nodes to complete directory rebuild. + */ + + error = dlm_recover_directory_wait(ls); + if (error) { + log_debug(ls, "recover_directory_wait failed %d", error); + goto fail; + } + + /* + * We may have outstanding operations that are waiting for a reply from + * a failed node. Mark these to be resent after recovery. Unlock and + * cancel ops can just be completed. + */ + + dlm_recover_waiters_pre(ls); + + error = dlm_recovery_stopped(ls); + if (error) + goto fail; + + if (neg || dlm_no_directory(ls)) { + /* + * Clear lkb's for departed nodes. + */ + + dlm_purge_locks(ls); + + /* + * Get new master nodeid's for rsb's that were mastered on + * departed nodes. + */ + + error = dlm_recover_masters(ls); + if (error) { + log_debug(ls, "recover_masters failed %d", error); + goto fail; + } + + /* + * Send our locks on remastered rsb's to the new masters. + */ + + error = dlm_recover_locks(ls); + if (error) { + log_debug(ls, "recover_locks failed %d", error); + goto fail; + } + + error = dlm_recover_locks_wait(ls); + if (error) { + log_debug(ls, "recover_locks_wait failed %d", error); + goto fail; + } + + /* + * Finalize state in master rsb's now that all locks can be + * checked. This includes conversion resolution and lvb + * settings. + */ + + dlm_recover_rsbs(ls); + } else { + /* + * Other lockspace members may be going through the "neg" steps + * while also adding us to the lockspace, in which case they'll + * be doing the recover_locks (RS_LOCKS) barrier. + */ + dlm_set_recover_status(ls, DLM_RS_LOCKS); + + error = dlm_recover_locks_wait(ls); + if (error) { + log_debug(ls, "recover_locks_wait failed %d", error); + goto fail; + } + } + + dlm_release_root_list(ls); + + /* + * Purge directory-related requests that are saved in requestqueue. + * All dir requests from before recovery are invalid now due to the dir + * rebuild and will be resent by the requesting nodes. 
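Editor's note: dlm_purge_requestqueue(), called just below and implemented later in this patch, walks the saved-message queue and drops every entry that a predicate declares stale. The sketch below shows that unlink-while-walking pattern in standalone form; the entry layout, message type value, and predicate are illustrative, not the kernel structures.

#include <stdlib.h>

struct saved_msg {
	struct saved_msg *next;
	int nodeid;
	int type;
};

/* Walk a singly linked list and drop every entry the predicate rejects. */
static void purge_queue(struct saved_msg **head,
			int (*should_purge)(const struct saved_msg *))
{
	struct saved_msg **pp = head;

	while (*pp) {
		if (should_purge(*pp)) {
			struct saved_msg *dead = *pp;
			*pp = dead->next;     /* unlink, then free */
			free(dead);
		} else {
			pp = &(*pp)->next;
		}
	}
}

/* Example predicate: drop directory lookups, which will be resent once
 * the directory is rebuilt. */
static int is_stale_dir_request(const struct saved_msg *m)
{
	return m->type == 42;   /* hypothetical "lookup" message type */
}

int main(void)
{
	struct saved_msg *a = malloc(sizeof(*a));
	struct saved_msg *q;

	if (!a)
		return 1;
	a->next = NULL;
	a->nodeid = 2;
	a->type = 42;
	q = a;

	purge_queue(&q, is_stale_dir_request);
	return q == NULL ? 0 : 1;       /* the stale lookup was dropped */
}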
+ */ + + dlm_purge_requestqueue(ls); + + dlm_set_recover_status(ls, DLM_RS_DONE); + error = dlm_recover_done_wait(ls); + if (error) { + log_debug(ls, "recover_done_wait failed %d", error); + goto fail; + } + + dlm_clear_members_gone(ls); + + dlm_adjust_timeouts(ls); + + error = enable_locking(ls, rv->seq); + if (error) { + log_debug(ls, "enable_locking failed %d", error); + goto fail; + } + + error = dlm_process_requestqueue(ls); + if (error) { + log_debug(ls, "process_requestqueue failed %d", error); + goto fail; + } + + error = dlm_recover_waiters_post(ls); + if (error) { + log_debug(ls, "recover_waiters_post failed %d", error); + goto fail; + } + + dlm_grant_after_purge(ls); + + dlm_astd_wake(); + + log_debug(ls, "recover %llx done: %u ms", + (unsigned long long)rv->seq, + jiffies_to_msecs(jiffies - start)); + mutex_unlock(&ls->ls_recoverd_active); + + return 0; + + fail: + dlm_release_root_list(ls); + log_debug(ls, "recover %llx error %d", + (unsigned long long)rv->seq, error); + mutex_unlock(&ls->ls_recoverd_active); + return error; +} + +/* The dlm_ls_start() that created the rv we take here may already have been + stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP + flag set. */ + +static void do_ls_recovery(struct dlm_ls *ls) +{ + struct dlm_recover *rv = NULL; + + spin_lock(&ls->ls_recover_lock); + rv = ls->ls_recover_args; + ls->ls_recover_args = NULL; + if (rv && ls->ls_recover_seq == rv->seq) + clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + spin_unlock(&ls->ls_recover_lock); + + if (rv) { + ls_recover(ls, rv); + kfree(rv->nodeids); + kfree(rv->new); + kfree(rv); + } +} + +static int dlm_recoverd(void *arg) +{ + struct dlm_ls *ls; + + ls = dlm_find_lockspace_local(arg); + if (!ls) { + log_print("dlm_recoverd: no lockspace %p", arg); + return -1; + } + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!test_bit(LSFL_WORK, &ls->ls_flags)) + schedule(); + set_current_state(TASK_RUNNING); + + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) + do_ls_recovery(ls); + } + + dlm_put_lockspace(ls); + return 0; +} + +void dlm_recoverd_kick(struct dlm_ls *ls) +{ + set_bit(LSFL_WORK, &ls->ls_flags); + wake_up_process(ls->ls_recoverd_task); +} + +int dlm_recoverd_start(struct dlm_ls *ls) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(dlm_recoverd, ls, "dlm_recoverd"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + ls->ls_recoverd_task = p; + return error; +} + +void dlm_recoverd_stop(struct dlm_ls *ls) +{ + kthread_stop(ls->ls_recoverd_task); +} + +void dlm_recoverd_suspend(struct dlm_ls *ls) +{ + wake_up(&ls->ls_wait_general); + mutex_lock(&ls->ls_recoverd_active); +} + +void dlm_recoverd_resume(struct dlm_ls *ls) +{ + mutex_unlock(&ls->ls_recoverd_active); +} + diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h new file mode 100644 index 00000000..866657c5 --- /dev/null +++ b/fs/dlm/recoverd.h @@ -0,0 +1,24 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __RECOVERD_DOT_H__ +#define __RECOVERD_DOT_H__ + +void dlm_recoverd_kick(struct dlm_ls *ls); +void dlm_recoverd_stop(struct dlm_ls *ls); +int dlm_recoverd_start(struct dlm_ls *ls); +void dlm_recoverd_suspend(struct dlm_ls *ls); +void dlm_recoverd_resume(struct dlm_ls *ls); + +#endif /* __RECOVERD_DOT_H__ */ + diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c new file mode 100644 index 00000000..a44fa228 --- /dev/null +++ b/fs/dlm/requestqueue.c @@ -0,0 +1,188 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "member.h" +#include "lock.h" +#include "dir.h" +#include "config.h" +#include "requestqueue.h" + +struct rq_entry { + struct list_head list; + int nodeid; + struct dlm_message request; +}; + +/* + * Requests received while the lockspace is in recovery get added to the + * request queue and processed when recovery is complete. This happens when + * the lockspace is suspended on some nodes before it is on others, or the + * lockspace is enabled on some while still suspended on others. + */ + +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) +{ + struct rq_entry *e; + int length = ms->m_header.h_length - sizeof(struct dlm_message); + + e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS); + if (!e) { + log_print("dlm_add_requestqueue: out of memory len %d", length); + return; + } + + e->nodeid = nodeid; + memcpy(&e->request, ms, ms->m_header.h_length); + + mutex_lock(&ls->ls_requestqueue_mutex); + list_add_tail(&e->list, &ls->ls_requestqueue); + mutex_unlock(&ls->ls_requestqueue_mutex); +} + +/* + * Called by dlm_recoverd to process normal messages saved while recovery was + * happening. Normal locking has been enabled before this is called. dlm_recv + * upon receiving a message, will wait for all saved messages to be drained + * here before processing the message it got. If a new dlm_ls_stop() arrives + * while we're processing these saved messages, it may block trying to suspend + * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue. In that + * case, we don't abort since locking_stopped is still 0. If dlm_recv is not + * waiting for us, then this processing may be aborted due to locking_stopped. 
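Editor's note: dlm_add_requestqueue() above copies the whole wire message, including the variable-length tail beyond struct dlm_message, into a single allocation sized sizeof(entry) + extra. The same idiom in standalone form, using a C99 flexible array member instead of relying on the embedded struct's trailing data; names are invented for the example.

#include <stdlib.h>
#include <string.h>

/* Queue entry that owns a private copy of a variable-length message. */
struct queued_msg {
	struct queued_msg *next;
	int nodeid;
	size_t len;
	unsigned char data[];    /* flexible array member: the message copy */
};

static struct queued_msg *save_message(int nodeid, const void *msg, size_t len)
{
	struct queued_msg *e = malloc(sizeof(*e) + len);

	if (!e)
		return NULL;
	e->next = NULL;
	e->nodeid = nodeid;
	e->len = len;
	memcpy(e->data, msg, len);   /* copy survives after the caller's
					receive buffer is reused */
	return e;
}

int main(void)
{
	const char wire[] = "header+body";
	struct queued_msg *e = save_message(7, wire, sizeof(wire));

	free(e);
	return 0;
}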
+ */ + +int dlm_process_requestqueue(struct dlm_ls *ls) +{ + struct rq_entry *e; + int error = 0; + + mutex_lock(&ls->ls_requestqueue_mutex); + + for (;;) { + if (list_empty(&ls->ls_requestqueue)) { + mutex_unlock(&ls->ls_requestqueue_mutex); + error = 0; + break; + } + e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); + mutex_unlock(&ls->ls_requestqueue_mutex); + + dlm_receive_message_saved(ls, &e->request); + + mutex_lock(&ls->ls_requestqueue_mutex); + list_del(&e->list); + kfree(e); + + if (dlm_locking_stopped(ls)) { + log_debug(ls, "process_requestqueue abort running"); + mutex_unlock(&ls->ls_requestqueue_mutex); + error = -EINTR; + break; + } + schedule(); + } + + return error; +} + +/* + * After recovery is done, locking is resumed and dlm_recoverd takes all the + * saved requests and processes them as they would have been by dlm_recv. At + * the same time, dlm_recv will start receiving new requests from remote nodes. + * We want to delay dlm_recv processing new requests until dlm_recoverd has + * finished processing the old saved requests. We don't check for locking + * stopped here because dlm_ls_stop won't stop locking until it's suspended us + * (dlm_recv). + */ + +void dlm_wait_requestqueue(struct dlm_ls *ls) +{ + for (;;) { + mutex_lock(&ls->ls_requestqueue_mutex); + if (list_empty(&ls->ls_requestqueue)) + break; + mutex_unlock(&ls->ls_requestqueue_mutex); + schedule(); + } + mutex_unlock(&ls->ls_requestqueue_mutex); +} + +static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) +{ + uint32_t type = ms->m_type; + + /* the ls is being cleaned up and freed by release_lockspace */ + if (!ls->ls_count) + return 1; + + if (dlm_is_removed(ls, nodeid)) + return 1; + + /* directory operations are always purged because the directory is + always rebuilt during recovery and the lookups resent */ + + if (type == DLM_MSG_REMOVE || + type == DLM_MSG_LOOKUP || + type == DLM_MSG_LOOKUP_REPLY) + return 1; + + if (!dlm_no_directory(ls)) + return 0; + + /* with no directory, the master is likely to change as a part of + recovery; requests to/from the defunct master need to be purged */ + + switch (type) { + case DLM_MSG_REQUEST: + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + /* we're no longer the master of this resource, the sender + will resend to the new master (see waiter_needs_recovery) */ + + if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid()) + return 1; + break; + + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_UNLOCK_REPLY: + case DLM_MSG_CANCEL_REPLY: + case DLM_MSG_GRANT: + /* this reply is from the former master of the resource, + we'll resend to the new master if needed */ + + if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid) + return 1; + break; + } + + return 0; +} + +void dlm_purge_requestqueue(struct dlm_ls *ls) +{ + struct dlm_message *ms; + struct rq_entry *e, *safe; + + mutex_lock(&ls->ls_requestqueue_mutex); + list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) { + ms = &e->request; + + if (purge_request(ls, ms, e->nodeid)) { + list_del(&e->list); + kfree(e); + } + } + mutex_unlock(&ls->ls_requestqueue_mutex); +} + diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h new file mode 100644 index 00000000..10ce449b --- /dev/null +++ b/fs/dlm/requestqueue.h @@ -0,0 +1,22 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2007 Red Hat, 
Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __REQUESTQUEUE_DOT_H__ +#define __REQUESTQUEUE_DOT_H__ + +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms); +int dlm_process_requestqueue(struct dlm_ls *ls); +void dlm_wait_requestqueue(struct dlm_ls *ls); +void dlm_purge_requestqueue(struct dlm_ls *ls); + +#endif + diff --git a/fs/dlm/user.c b/fs/dlm/user.c new file mode 100644 index 00000000..e96bf3e9 --- /dev/null +++ b/fs/dlm/user.c @@ -0,0 +1,1011 @@ +/* + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dlm_internal.h" +#include "lockspace.h" +#include "lock.h" +#include "lvb_table.h" +#include "user.h" +#include "ast.h" + +static const char name_prefix[] = "dlm"; +static const struct file_operations device_fops; +static atomic_t dlm_monitor_opened; +static int dlm_monitor_unused = 1; + +#ifdef CONFIG_COMPAT + +struct dlm_lock_params32 { + __u8 mode; + __u8 namelen; + __u16 unused; + __u32 flags; + __u32 lkid; + __u32 parent; + __u64 xid; + __u64 timeout; + __u32 castparam; + __u32 castaddr; + __u32 bastparam; + __u32 bastaddr; + __u32 lksb; + char lvb[DLM_USER_LVB_LEN]; + char name[0]; +}; + +struct dlm_write_request32 { + __u32 version[3]; + __u8 cmd; + __u8 is64bit; + __u8 unused[2]; + + union { + struct dlm_lock_params32 lock; + struct dlm_lspace_params lspace; + struct dlm_purge_params purge; + } i; +}; + +struct dlm_lksb32 { + __u32 sb_status; + __u32 sb_lkid; + __u8 sb_flags; + __u32 sb_lvbptr; +}; + +struct dlm_lock_result32 { + __u32 version[3]; + __u32 length; + __u32 user_astaddr; + __u32 user_astparam; + __u32 user_lksb; + struct dlm_lksb32 lksb; + __u8 bast_mode; + __u8 unused[3]; + /* Offsets may be zero if no data is present */ + __u32 lvb_offset; +}; + +static void compat_input(struct dlm_write_request *kb, + struct dlm_write_request32 *kb32, + int namelen) +{ + kb->version[0] = kb32->version[0]; + kb->version[1] = kb32->version[1]; + kb->version[2] = kb32->version[2]; + + kb->cmd = kb32->cmd; + kb->is64bit = kb32->is64bit; + if (kb->cmd == DLM_USER_CREATE_LOCKSPACE || + kb->cmd == DLM_USER_REMOVE_LOCKSPACE) { + kb->i.lspace.flags = kb32->i.lspace.flags; + kb->i.lspace.minor = kb32->i.lspace.minor; + memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen); + } else if (kb->cmd == DLM_USER_PURGE) { + kb->i.purge.nodeid = kb32->i.purge.nodeid; + kb->i.purge.pid = kb32->i.purge.pid; + } else { + kb->i.lock.mode = kb32->i.lock.mode; + kb->i.lock.namelen = kb32->i.lock.namelen; + kb->i.lock.flags = kb32->i.lock.flags; + kb->i.lock.lkid = kb32->i.lock.lkid; + kb->i.lock.parent = kb32->i.lock.parent; + kb->i.lock.xid = kb32->i.lock.xid; + kb->i.lock.timeout = kb32->i.lock.timeout; + kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam; + kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr; + kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam; 
+ kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; + kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; + memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); + memcpy(kb->i.lock.name, kb32->i.lock.name, namelen); + } +} + +static void compat_output(struct dlm_lock_result *res, + struct dlm_lock_result32 *res32) +{ + res32->version[0] = res->version[0]; + res32->version[1] = res->version[1]; + res32->version[2] = res->version[2]; + + res32->user_astaddr = (__u32)(long)res->user_astaddr; + res32->user_astparam = (__u32)(long)res->user_astparam; + res32->user_lksb = (__u32)(long)res->user_lksb; + res32->bast_mode = res->bast_mode; + + res32->lvb_offset = res->lvb_offset; + res32->length = res->length; + + res32->lksb.sb_status = res->lksb.sb_status; + res32->lksb.sb_flags = res->lksb.sb_flags; + res32->lksb.sb_lkid = res->lksb.sb_lkid; + res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr; +} +#endif + +/* Figure out if this lock is at the end of its life and no longer + available for the application to use. The lkb still exists until + the final ast is read. A lock becomes EOL in three situations: + 1. a noqueue request fails with EAGAIN + 2. an unlock completes with EUNLOCK + 3. a cancel of a waiting request completes with ECANCEL/EDEADLK + An EOL lock needs to be removed from the process's list of locks. + And we can't allow any new operation on an EOL lock. This is + not related to the lifetime of the lkb struct which is managed + entirely by refcount. */ + +static int lkb_is_endoflife(int mode, int status) +{ + switch (status) { + case -DLM_EUNLOCK: + return 1; + case -DLM_ECANCEL: + case -ETIMEDOUT: + case -EDEADLK: + case -EAGAIN: + if (mode == DLM_LOCK_IV) + return 1; + break; + } + return 0; +} + +/* we could possibly check if the cancel of an orphan has resulted in the lkb + being removed and then remove that lkb from the orphans list and free it */ + +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + struct dlm_user_proc *proc; + int rv; + + if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) + return; + + ls = lkb->lkb_resource->res_ls; + mutex_lock(&ls->ls_clear_proc_locks); + + /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast + can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed + lkb->ua so we can't try to use it. This second check is necessary + for cases where a completion ast is received for an operation that + began before clear_proc_locks did its cancel/unlock. */ + + if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) + goto out; + + DLM_ASSERT(lkb->lkb_ua, dlm_print_lkb(lkb);); + ua = lkb->lkb_ua; + proc = ua->proc; + + if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL) + goto out; + + if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status)) + lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; + + spin_lock(&proc->asts_spin); + + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); + if (rv < 0) { + spin_unlock(&proc->asts_spin); + goto out; + } + + if (list_empty(&lkb->lkb_astqueue)) { + kref_get(&lkb->lkb_ref); + list_add_tail(&lkb->lkb_astqueue, &proc->asts); + wake_up_interruptible(&proc->wait); + } + spin_unlock(&proc->asts_spin); + + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + /* N.B. 
spin_lock locks_spin, not asts_spin */ + spin_lock(&proc->locks_spin); + if (!list_empty(&lkb->lkb_ownqueue)) { + list_del_init(&lkb->lkb_ownqueue); + dlm_put_lkb(lkb); + } + spin_unlock(&proc->locks_spin); + } + out: + mutex_unlock(&ls->ls_clear_proc_locks); +} + +static int device_user_lock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + int error = -ENOMEM; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + if (!params->castaddr || !params->lksb) { + error = -EINVAL; + goto out; + } + + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; + ua->user_lksb = params->lksb; + ua->castparam = params->castparam; + ua->castaddr = params->castaddr; + ua->bastparam = params->bastparam; + ua->bastaddr = params->bastaddr; + ua->xid = params->xid; + + if (params->flags & DLM_LKF_CONVERT) + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb, + (unsigned long) params->timeout); + else { + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen, + (unsigned long) params->timeout); + if (!error) + error = ua->lksb.sb_lkid; + } + out: + dlm_put_lockspace(ls); + return error; +} + +static int device_user_unlock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + struct dlm_user_args *ua; + int error = -ENOMEM; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; + ua->user_lksb = params->lksb; + ua->castparam = params->castparam; + ua->castaddr = params->castaddr; + + if (params->flags & DLM_LKF_CANCEL) + error = dlm_user_cancel(ls, ua, params->flags, params->lkid); + else + error = dlm_user_unlock(ls, ua, params->flags, params->lkid, + params->lvb); + out: + dlm_put_lockspace(ls); + return error; +} + +static int device_user_deadlock(struct dlm_user_proc *proc, + struct dlm_lock_params *params) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + error = dlm_user_deadlock(ls, params->flags, params->lkid); + + dlm_put_lockspace(ls); + return error; +} + +static int dlm_device_register(struct dlm_ls *ls, char *name) +{ + int error, len; + + /* The device is already registered. This happens when the + lockspace is created multiple times from userspace. */ + if (ls->ls_device.name) + return 0; + + error = -ENOMEM; + len = strlen(name) + strlen(name_prefix) + 2; + ls->ls_device.name = kzalloc(len, GFP_NOFS); + if (!ls->ls_device.name) + goto fail; + + snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix, + name); + ls->ls_device.fops = &device_fops; + ls->ls_device.minor = MISC_DYNAMIC_MINOR; + + error = misc_register(&ls->ls_device); + if (error) { + kfree(ls->ls_device.name); + } +fail: + return error; +} + +int dlm_device_deregister(struct dlm_ls *ls) +{ + int error; + + /* The device is not registered. This happens when the lockspace + was never used from userspace, or when device_create_lockspace() + calls dlm_release_lockspace() after the register fails. 
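Editor's note: dlm_device_register() above sizes the misc device name as strlen(prefix) + strlen(name) + 2, the extra two bytes covering the '_' separator and the terminating NUL. A quick standalone check of that arithmetic (illustrative only):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build "<prefix>_<name>"; the +2 covers the '_' and the trailing NUL. */
static char *build_device_name(const char *prefix, const char *name)
{
	size_t len = strlen(prefix) + strlen(name) + 2;
	char *buf = calloc(1, len);

	if (buf)
		snprintf(buf, len, "%s_%s", prefix, name);
	return buf;
}

int main(void)
{
	char *n = build_device_name("dlm", "mylockspace");

	assert(n && strcmp(n, "dlm_mylockspace") == 0);
	printf("%s\n", n);
	free(n);
	return 0;
}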
*/ + if (!ls->ls_device.name) + return 0; + + error = misc_deregister(&ls->ls_device); + if (!error) + kfree(ls->ls_device.name); + return error; +} + +static int device_user_purge(struct dlm_user_proc *proc, + struct dlm_purge_params *params) +{ + struct dlm_ls *ls; + int error; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + error = dlm_user_purge(ls, proc, params->nodeid, params->pid); + + dlm_put_lockspace(ls); + return error; +} + +static int device_create_lockspace(struct dlm_lspace_params *params) +{ + dlm_lockspace_t *lockspace; + struct dlm_ls *ls; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = dlm_new_lockspace(params->name, strlen(params->name), + &lockspace, params->flags, DLM_USER_LVB_LEN); + if (error) + return error; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -ENOENT; + + error = dlm_device_register(ls, params->name); + dlm_put_lockspace(ls); + + if (error) + dlm_release_lockspace(lockspace, 0); + else + error = ls->ls_device.minor; + + return error; +} + +static int device_remove_lockspace(struct dlm_lspace_params *params) +{ + dlm_lockspace_t *lockspace; + struct dlm_ls *ls; + int error, force = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ls = dlm_find_lockspace_device(params->minor); + if (!ls) + return -ENOENT; + + if (params->flags & DLM_USER_LSFLG_FORCEFREE) + force = 2; + + lockspace = ls->ls_local_handle; + dlm_put_lockspace(ls); + + /* The final dlm_release_lockspace waits for references to go to + zero, so all processes will need to close their device for the + ls before the release will proceed. release also calls the + device_deregister above. Converting a positive return value + from release to zero means that userspace won't know when its + release was the final one, but it shouldn't need to know. 
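Editor's note: check_version() further down accepts a request only when the caller's major version matches the kernel's exactly and the caller's minor version is not newer than the kernel's; the patch level plays no part in the decision. A compact sketch of that rule, with placeholder constants rather than the real DLM_DEVICE_VERSION_* values:

#include <assert.h>

#define KERNEL_MAJOR 6   /* placeholder values for the sketch */
#define KERNEL_MINOR 0

/* Accept: same major, and the caller's minor is not newer than ours. */
static int version_ok(unsigned major, unsigned minor)
{
	if (major != KERNEL_MAJOR)
		return 0;
	if (minor > KERNEL_MINOR)
		return 0;
	return 1;
}

int main(void)
{
	assert(version_ok(6, 0));
	assert(!version_ok(5, 0));   /* different major: rejected            */
	assert(!version_ok(6, 1));   /* minor newer than the kernel: rejected */
	return 0;
}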
*/ + + error = dlm_release_lockspace(lockspace, force); + if (error > 0) + error = 0; + return error; +} + +/* Check the user's version matches ours */ +static int check_version(struct dlm_write_request *req) +{ + if (req->version[0] != DLM_DEVICE_VERSION_MAJOR || + (req->version[0] == DLM_DEVICE_VERSION_MAJOR && + req->version[1] > DLM_DEVICE_VERSION_MINOR)) { + + printk(KERN_DEBUG "dlm: process %s (%d) version mismatch " + "user (%d.%d.%d) kernel (%d.%d.%d)\n", + current->comm, + task_pid_nr(current), + req->version[0], + req->version[1], + req->version[2], + DLM_DEVICE_VERSION_MAJOR, + DLM_DEVICE_VERSION_MINOR, + DLM_DEVICE_VERSION_PATCH); + return -EINVAL; + } + return 0; +} + +/* + * device_write + * + * device_user_lock + * dlm_user_request -> request_lock + * dlm_user_convert -> convert_lock + * + * device_user_unlock + * dlm_user_unlock -> unlock_lock + * dlm_user_cancel -> cancel_lock + * + * device_create_lockspace + * dlm_new_lockspace + * + * device_remove_lockspace + * dlm_release_lockspace + */ + +/* a write to a lockspace device is a lock or unlock request, a write + to the control device is to create/remove a lockspace */ + +static ssize_t device_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_write_request *kbuf; + sigset_t tmpsig, allsigs; + int error; + +#ifdef CONFIG_COMPAT + if (count < sizeof(struct dlm_write_request32)) +#else + if (count < sizeof(struct dlm_write_request)) +#endif + return -EINVAL; + + kbuf = kzalloc(count + 1, GFP_NOFS); + if (!kbuf) + return -ENOMEM; + + if (copy_from_user(kbuf, buf, count)) { + error = -EFAULT; + goto out_free; + } + + if (check_version(kbuf)) { + error = -EBADE; + goto out_free; + } + +#ifdef CONFIG_COMPAT + if (!kbuf->is64bit) { + struct dlm_write_request32 *k32buf; + int namelen = 0; + + if (count > sizeof(struct dlm_write_request32)) + namelen = count - sizeof(struct dlm_write_request32); + + k32buf = (struct dlm_write_request32 *)kbuf; + + /* add 1 after namelen so that the name string is terminated */ + kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, + GFP_NOFS); + if (!kbuf) { + kfree(k32buf); + return -ENOMEM; + } + + if (proc) + set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); + + compat_input(kbuf, k32buf, namelen); + kfree(k32buf); + } +#endif + + /* do we really need this? can a write happen after a close? 
*/ + if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) && + (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) { + error = -EINVAL; + goto out_free; + } + + sigfillset(&allsigs); + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); + + error = -EINVAL; + + switch (kbuf->cmd) + { + case DLM_USER_LOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_sig; + } + error = device_user_lock(proc, &kbuf->i.lock); + break; + + case DLM_USER_UNLOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_sig; + } + error = device_user_unlock(proc, &kbuf->i.lock); + break; + + case DLM_USER_DEADLOCK: + if (!proc) { + log_print("no locking on control device"); + goto out_sig; + } + error = device_user_deadlock(proc, &kbuf->i.lock); + break; + + case DLM_USER_CREATE_LOCKSPACE: + if (proc) { + log_print("create/remove only on control device"); + goto out_sig; + } + error = device_create_lockspace(&kbuf->i.lspace); + break; + + case DLM_USER_REMOVE_LOCKSPACE: + if (proc) { + log_print("create/remove only on control device"); + goto out_sig; + } + error = device_remove_lockspace(&kbuf->i.lspace); + break; + + case DLM_USER_PURGE: + if (!proc) { + log_print("no locking on control device"); + goto out_sig; + } + error = device_user_purge(proc, &kbuf->i.purge); + break; + + default: + log_print("Unknown command passed to DLM device : %d\n", + kbuf->cmd); + } + + out_sig: + sigprocmask(SIG_SETMASK, &tmpsig, NULL); + out_free: + kfree(kbuf); + return error; +} + +/* Every process that opens the lockspace device has its own "proc" structure + hanging off the open file that's used to keep track of locks owned by the + process and asts that need to be delivered to the process. */ + +static int device_open(struct inode *inode, struct file *file) +{ + struct dlm_user_proc *proc; + struct dlm_ls *ls; + + ls = dlm_find_lockspace_device(iminor(inode)); + if (!ls) + return -ENOENT; + + proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS); + if (!proc) { + dlm_put_lockspace(ls); + return -ENOMEM; + } + + proc->lockspace = ls->ls_local_handle; + INIT_LIST_HEAD(&proc->asts); + INIT_LIST_HEAD(&proc->locks); + INIT_LIST_HEAD(&proc->unlocking); + spin_lock_init(&proc->asts_spin); + spin_lock_init(&proc->locks_spin); + init_waitqueue_head(&proc->wait); + file->private_data = proc; + + return 0; +} + +static int device_close(struct inode *inode, struct file *file) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_ls *ls; + sigset_t tmpsig, allsigs; + + ls = dlm_find_lockspace_local(proc->lockspace); + if (!ls) + return -ENOENT; + + sigfillset(&allsigs); + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); + + set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags); + + dlm_clear_proc_locks(ls, proc); + + /* at this point no more lkb's should exist for this lockspace, + so there's no chance of dlm_user_add_ast() being called and + looking for lkb->ua->proc */ + + kfree(proc); + file->private_data = NULL; + + dlm_put_lockspace(ls); + dlm_put_lockspace(ls); /* for the find in device_open() */ + + /* FIXME: AUTOFREE: if this ls is no longer used do + device_remove_lockspace() */ + + sigprocmask(SIG_SETMASK, &tmpsig, NULL); + recalc_sigpending(); + + return 0; +} + +static int copy_result_to_user(struct dlm_user_args *ua, int compat, + uint32_t flags, int mode, int copy_lvb, + char __user *buf, size_t count) +{ +#ifdef CONFIG_COMPAT + struct dlm_lock_result32 result32; +#endif + struct dlm_lock_result result; + void *resultptr; + int error=0; + int len; + int struct_len; + + 
memset(&result, 0, sizeof(struct dlm_lock_result)); + result.version[0] = DLM_DEVICE_VERSION_MAJOR; + result.version[1] = DLM_DEVICE_VERSION_MINOR; + result.version[2] = DLM_DEVICE_VERSION_PATCH; + memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb)); + result.user_lksb = ua->user_lksb; + + /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated + in a conversion unless the conversion is successful. See code + in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though, + notes that a new blocking AST address and parameter are set even if + the conversion fails, so maybe we should just do that. */ + + if (flags & DLM_CB_BAST) { + result.user_astaddr = ua->bastaddr; + result.user_astparam = ua->bastparam; + result.bast_mode = mode; + } else { + result.user_astaddr = ua->castaddr; + result.user_astparam = ua->castparam; + } + +#ifdef CONFIG_COMPAT + if (compat) + len = sizeof(struct dlm_lock_result32); + else +#endif + len = sizeof(struct dlm_lock_result); + struct_len = len; + + /* copy lvb to userspace if there is one, it's been updated, and + the user buffer has space for it */ + + if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) { + if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, + DLM_USER_LVB_LEN)) { + error = -EFAULT; + goto out; + } + + result.lvb_offset = len; + len += DLM_USER_LVB_LEN; + } + + result.length = len; + resultptr = &result; +#ifdef CONFIG_COMPAT + if (compat) { + compat_output(&result, &result32); + resultptr = &result32; + } +#endif + + if (copy_to_user(buf, resultptr, struct_len)) + error = -EFAULT; + else + error = len; + out: + return error; +} + +static int copy_version_to_user(char __user *buf, size_t count) +{ + struct dlm_device_version ver; + + memset(&ver, 0, sizeof(struct dlm_device_version)); + ver.version[0] = DLM_DEVICE_VERSION_MAJOR; + ver.version[1] = DLM_DEVICE_VERSION_MINOR; + ver.version[2] = DLM_DEVICE_VERSION_PATCH; + + if (copy_to_user(buf, &ver, sizeof(struct dlm_device_version))) + return -EFAULT; + return sizeof(struct dlm_device_version); +} + +/* a read returns a single ast described in a struct dlm_lock_result */ + +static ssize_t device_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct dlm_user_proc *proc = file->private_data; + struct dlm_lkb *lkb; + DECLARE_WAITQUEUE(wait, current); + struct dlm_callback cb; + int rv, resid, copy_lvb = 0; + + if (count == sizeof(struct dlm_device_version)) { + rv = copy_version_to_user(buf, count); + return rv; + } + + if (!proc) { + log_print("non-version read from control device %zu", count); + return -EINVAL; + } + +#ifdef CONFIG_COMPAT + if (count < sizeof(struct dlm_lock_result32)) +#else + if (count < sizeof(struct dlm_lock_result)) +#endif + return -EINVAL; + + try_another: + + /* do we really need this? can a read happen after a close? 
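Editor's note: copy_result_to_user() above lays the reply out as a fixed header followed, optionally, by the LVB; lvb_offset records where the LVB starts and length covers the whole buffer. The sketch below packs and unpacks that kind of layout in userspace, with the struct trimmed to the two fields that matter and all names and sizes chosen for the example.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define LVB_LEN 32   /* placeholder for DLM_USER_LVB_LEN */

struct reply_hdr {
	uint32_t length;       /* header plus any trailing lvb */
	uint32_t lvb_offset;   /* 0 when no lvb is appended    */
};

/* Pack the header, then append the lvb (if any) and record its offset. */
static size_t pack_reply(unsigned char *buf, size_t bufsize, const char *lvb)
{
	struct reply_hdr hdr = { .length = sizeof(hdr), .lvb_offset = 0 };

	if (lvb && bufsize >= sizeof(hdr) + LVB_LEN) {
		memcpy(buf + sizeof(hdr), lvb, LVB_LEN);
		hdr.lvb_offset = sizeof(hdr);
		hdr.length += LVB_LEN;
	}
	memcpy(buf, &hdr, sizeof(hdr));
	return hdr.length;
}

int main(void)
{
	unsigned char buf[256];
	char lvb[LVB_LEN] = "value block";
	struct reply_hdr hdr;
	size_t n;

	n = pack_reply(buf, sizeof(buf), lvb);
	memcpy(&hdr, buf, sizeof(hdr));
	assert(n == hdr.length && hdr.lvb_offset == sizeof(hdr));
	assert(memcmp(buf + hdr.lvb_offset, lvb, LVB_LEN) == 0);
	return 0;
}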
*/ + if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) + return -EINVAL; + + spin_lock(&proc->asts_spin); + if (list_empty(&proc->asts)) { + if (file->f_flags & O_NONBLOCK) { + spin_unlock(&proc->asts_spin); + return -EAGAIN; + } + + add_wait_queue(&proc->wait, &wait); + + repeat: + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&proc->asts) && !signal_pending(current)) { + spin_unlock(&proc->asts_spin); + schedule(); + spin_lock(&proc->asts_spin); + goto repeat; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&proc->wait, &wait); + + if (signal_pending(current)) { + spin_unlock(&proc->asts_spin); + return -ERESTARTSYS; + } + } + + /* if we empty lkb_callbacks, we don't want to unlock the spinlock + without removing lkb_astqueue; so empty lkb_astqueue is always + consistent with empty lkb_callbacks */ + + lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); + + rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); + if (rv < 0) { + /* this shouldn't happen; lkb should have been removed from + list when resid was zero */ + log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); + list_del_init(&lkb->lkb_astqueue); + spin_unlock(&proc->asts_spin); + /* removes ref for proc->asts, may cause lkb to be freed */ + dlm_put_lkb(lkb); + goto try_another; + } + if (!resid) + list_del_init(&lkb->lkb_astqueue); + spin_unlock(&proc->asts_spin); + + if (cb.flags & DLM_CB_SKIP) { + /* removes ref for proc->asts, may cause lkb to be freed */ + if (!resid) + dlm_put_lkb(lkb); + goto try_another; + } + + if (cb.flags & DLM_CB_CAST) { + int old_mode, new_mode; + + old_mode = lkb->lkb_last_cast.mode; + new_mode = cb.mode; + + if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && + dlm_lvb_operations[old_mode + 1][new_mode + 1]) + copy_lvb = 1; + + lkb->lkb_lksb->sb_status = cb.sb_status; + lkb->lkb_lksb->sb_flags = cb.sb_flags; + } + + rv = copy_result_to_user(lkb->lkb_ua, + test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), + cb.flags, cb.mode, copy_lvb, buf, count); + + /* removes ref for proc->asts, may cause lkb to be freed */ + if (!resid) + dlm_put_lkb(lkb); + + return rv; +} + +static unsigned int device_poll(struct file *file, poll_table *wait) +{ + struct dlm_user_proc *proc = file->private_data; + + poll_wait(file, &proc->wait, wait); + + spin_lock(&proc->asts_spin); + if (!list_empty(&proc->asts)) { + spin_unlock(&proc->asts_spin); + return POLLIN | POLLRDNORM; + } + spin_unlock(&proc->asts_spin); + return 0; +} + +int dlm_user_daemon_available(void) +{ + /* dlm_controld hasn't started (or, has started, but not + properly populated configfs) */ + + if (!dlm_our_nodeid()) + return 0; + + /* This is to deal with versions of dlm_controld that don't + know about the monitor device. We assume that if the + dlm_controld was started (above), but the monitor device + was never opened, that it's an old version. dlm_controld + should open the monitor device before populating configfs. */ + + if (dlm_monitor_unused) + return 1; + + return atomic_read(&dlm_monitor_opened) ? 
1 : 0; +} + +static int ctl_device_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return 0; +} + +static int ctl_device_close(struct inode *inode, struct file *file) +{ + return 0; +} + +static int monitor_device_open(struct inode *inode, struct file *file) +{ + atomic_inc(&dlm_monitor_opened); + dlm_monitor_unused = 0; + return 0; +} + +static int monitor_device_close(struct inode *inode, struct file *file) +{ + if (atomic_dec_and_test(&dlm_monitor_opened)) + dlm_stop_lockspaces(); + return 0; +} + +static const struct file_operations device_fops = { + .open = device_open, + .release = device_close, + .read = device_read, + .write = device_write, + .poll = device_poll, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static const struct file_operations ctl_device_fops = { + .open = ctl_device_open, + .release = ctl_device_close, + .read = device_read, + .write = device_write, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice ctl_device = { + .name = "dlm-control", + .fops = &ctl_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + +static const struct file_operations monitor_device_fops = { + .open = monitor_device_open, + .release = monitor_device_close, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice monitor_device = { + .name = "dlm-monitor", + .fops = &monitor_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + +int __init dlm_user_init(void) +{ + int error; + + atomic_set(&dlm_monitor_opened, 0); + + error = misc_register(&ctl_device); + if (error) { + log_print("misc_register failed for control device"); + goto out; + } + + error = misc_register(&monitor_device); + if (error) { + log_print("misc_register failed for monitor device"); + misc_deregister(&ctl_device); + } + out: + return error; +} + +void dlm_user_exit(void) +{ + misc_deregister(&ctl_device); + misc_deregister(&monitor_device); +} + diff --git a/fs/dlm/user.h b/fs/dlm/user.h new file mode 100644 index 00000000..00499ab8 --- /dev/null +++ b/fs/dlm/user.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __USER_DOT_H__ +#define __USER_DOT_H__ + +void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags, uint64_t seq); +int dlm_user_init(void); +void dlm_user_exit(void); +int dlm_device_deregister(struct dlm_ls *ls); +int dlm_user_daemon_available(void); + +#endif diff --git a/fs/dlm/util.c b/fs/dlm/util.c new file mode 100644 index 00000000..e36520af --- /dev/null +++ b/fs/dlm/util.c @@ -0,0 +1,154 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
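To make the monitor handshake above concrete, here is a minimal userspace sketch of what a control daemon does with it: open the monitor device and keep the descriptor open for its lifetime, so the final close (including an unexpected exit) drops the last reference and the kernel stops all lockspaces. The node path /dev/dlm-monitor is an assumption; the actual path depends on how udev exposes misc devices on the system.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/dlm-monitor", O_RDONLY);   /* path is illustrative */

        if (fd < 0) {
                perror("open dlm-monitor");
                return 1;
        }
        /* ... normal daemon work; the open descriptor is what tells the
         * kernel the daemon is alive.  Closing it (or crashing) releases
         * the last reference and lockspaces are stopped. */
        pause();
        close(fd);
        return 0;
}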
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "rcom.h" +#include "util.h" + +#define DLM_ERRNO_EDEADLK 35 +#define DLM_ERRNO_EBADR 53 +#define DLM_ERRNO_EBADSLT 57 +#define DLM_ERRNO_EPROTO 71 +#define DLM_ERRNO_EOPNOTSUPP 95 +#define DLM_ERRNO_ETIMEDOUT 110 +#define DLM_ERRNO_EINPROGRESS 115 + +static void header_out(struct dlm_header *hd) +{ + hd->h_version = cpu_to_le32(hd->h_version); + hd->h_lockspace = cpu_to_le32(hd->h_lockspace); + hd->h_nodeid = cpu_to_le32(hd->h_nodeid); + hd->h_length = cpu_to_le16(hd->h_length); +} + +static void header_in(struct dlm_header *hd) +{ + hd->h_version = le32_to_cpu(hd->h_version); + hd->h_lockspace = le32_to_cpu(hd->h_lockspace); + hd->h_nodeid = le32_to_cpu(hd->h_nodeid); + hd->h_length = le16_to_cpu(hd->h_length); +} + +/* higher errno values are inconsistent across architectures, so select + one set of values for on the wire */ + +static int to_dlm_errno(int err) +{ + switch (err) { + case -EDEADLK: + return -DLM_ERRNO_EDEADLK; + case -EBADR: + return -DLM_ERRNO_EBADR; + case -EBADSLT: + return -DLM_ERRNO_EBADSLT; + case -EPROTO: + return -DLM_ERRNO_EPROTO; + case -EOPNOTSUPP: + return -DLM_ERRNO_EOPNOTSUPP; + case -ETIMEDOUT: + return -DLM_ERRNO_ETIMEDOUT; + case -EINPROGRESS: + return -DLM_ERRNO_EINPROGRESS; + } + return err; +} + +static int from_dlm_errno(int err) +{ + switch (err) { + case -DLM_ERRNO_EDEADLK: + return -EDEADLK; + case -DLM_ERRNO_EBADR: + return -EBADR; + case -DLM_ERRNO_EBADSLT: + return -EBADSLT; + case -DLM_ERRNO_EPROTO: + return -EPROTO; + case -DLM_ERRNO_EOPNOTSUPP: + return -EOPNOTSUPP; + case -DLM_ERRNO_ETIMEDOUT: + return -ETIMEDOUT; + case -DLM_ERRNO_EINPROGRESS: + return -EINPROGRESS; + } + return err; +} + +void dlm_message_out(struct dlm_message *ms) +{ + header_out(&ms->m_header); + + ms->m_type = cpu_to_le32(ms->m_type); + ms->m_nodeid = cpu_to_le32(ms->m_nodeid); + ms->m_pid = cpu_to_le32(ms->m_pid); + ms->m_lkid = cpu_to_le32(ms->m_lkid); + ms->m_remid = cpu_to_le32(ms->m_remid); + ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid); + ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid); + ms->m_exflags = cpu_to_le32(ms->m_exflags); + ms->m_sbflags = cpu_to_le32(ms->m_sbflags); + ms->m_flags = cpu_to_le32(ms->m_flags); + ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq); + ms->m_hash = cpu_to_le32(ms->m_hash); + ms->m_status = cpu_to_le32(ms->m_status); + ms->m_grmode = cpu_to_le32(ms->m_grmode); + ms->m_rqmode = cpu_to_le32(ms->m_rqmode); + ms->m_bastmode = cpu_to_le32(ms->m_bastmode); + ms->m_asts = cpu_to_le32(ms->m_asts); + ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result)); +} + +void dlm_message_in(struct dlm_message *ms) +{ + header_in(&ms->m_header); + + ms->m_type = le32_to_cpu(ms->m_type); + ms->m_nodeid = le32_to_cpu(ms->m_nodeid); + ms->m_pid = le32_to_cpu(ms->m_pid); + ms->m_lkid = le32_to_cpu(ms->m_lkid); + ms->m_remid = le32_to_cpu(ms->m_remid); + ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid); + ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid); + ms->m_exflags = le32_to_cpu(ms->m_exflags); + ms->m_sbflags = le32_to_cpu(ms->m_sbflags); + ms->m_flags = le32_to_cpu(ms->m_flags); + ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq); + ms->m_hash = le32_to_cpu(ms->m_hash); + ms->m_status = le32_to_cpu(ms->m_status); + ms->m_grmode = le32_to_cpu(ms->m_grmode); + ms->m_rqmode = le32_to_cpu(ms->m_rqmode); + ms->m_bastmode = le32_to_cpu(ms->m_bastmode); 
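The to_dlm_errno()/from_dlm_errno() pair above exists because the higher errno values differ between architectures, so one fixed set is chosen for the wire. Below is an abbreviated userspace sketch of the same idea (only three of the mapped values are reproduced); the property being illustrated is that the translation round-trips, so a receiver recovers the sender's errno regardless of either side's native numbering.

#include <assert.h>
#include <errno.h>
#include <stdio.h>

#define DLM_ERRNO_EDEADLK      35
#define DLM_ERRNO_ETIMEDOUT   110
#define DLM_ERRNO_EINPROGRESS 115

static int to_dlm_errno(int err)
{
        switch (err) {
        case -EDEADLK:     return -DLM_ERRNO_EDEADLK;
        case -ETIMEDOUT:   return -DLM_ERRNO_ETIMEDOUT;
        case -EINPROGRESS: return -DLM_ERRNO_EINPROGRESS;
        }
        return err;
}

static int from_dlm_errno(int err)
{
        switch (err) {
        case -DLM_ERRNO_EDEADLK:     return -EDEADLK;
        case -DLM_ERRNO_ETIMEDOUT:   return -ETIMEDOUT;
        case -DLM_ERRNO_EINPROGRESS: return -EINPROGRESS;
        }
        return err;
}

int main(void)
{
        int errs[] = { -EDEADLK, -ETIMEDOUT, -EINPROGRESS, -ENOMEM };

        for (unsigned i = 0; i < sizeof(errs) / sizeof(errs[0]); i++)
                assert(from_dlm_errno(to_dlm_errno(errs[i])) == errs[i]);
        puts("wire errno mapping round-trips");
        return 0;
}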
+ ms->m_asts = le32_to_cpu(ms->m_asts); + ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result)); +} + +void dlm_rcom_out(struct dlm_rcom *rc) +{ + header_out(&rc->rc_header); + + rc->rc_type = cpu_to_le32(rc->rc_type); + rc->rc_result = cpu_to_le32(rc->rc_result); + rc->rc_id = cpu_to_le64(rc->rc_id); + rc->rc_seq = cpu_to_le64(rc->rc_seq); + rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); +} + +void dlm_rcom_in(struct dlm_rcom *rc) +{ + header_in(&rc->rc_header); + + rc->rc_type = le32_to_cpu(rc->rc_type); + rc->rc_result = le32_to_cpu(rc->rc_result); + rc->rc_id = le64_to_cpu(rc->rc_id); + rc->rc_seq = le64_to_cpu(rc->rc_seq); + rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); +} diff --git a/fs/dlm/util.h b/fs/dlm/util.h new file mode 100644 index 00000000..2b259151 --- /dev/null +++ b/fs/dlm/util.h @@ -0,0 +1,22 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +void dlm_message_out(struct dlm_message *ms); +void dlm_message_in(struct dlm_message *ms); +void dlm_rcom_out(struct dlm_rcom *rc); +void dlm_rcom_in(struct dlm_rcom *rc); + +#endif + diff --git a/fs/drop_caches.c b/fs/drop_caches.c new file mode 100644 index 00000000..c00e055b --- /dev/null +++ b/fs/drop_caches.c @@ -0,0 +1,67 @@ +/* + * Implement the manual drop-all-pagecache function + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* A global variable is a bit ugly, but it keeps the code simple */ +int sysctl_drop_caches; + +static void drop_pagecache_sb(struct super_block *sb, void *unused) +{ + struct inode *inode, *toput_inode = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + invalidate_mapping_pages(inode->i_mapping, 0, -1); + iput(toput_inode); + toput_inode = inode; + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(toput_inode); +} + +static void drop_slab(void) +{ + int nr_objects; + struct shrink_control shrink = { + .gfp_mask = GFP_KERNEL, + }; + + do { + nr_objects = shrink_slab(&shrink, 1000, 1000); + } while (nr_objects > 10); +} + +int drop_caches_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret) + return ret; + if (write) { + if (sysctl_drop_caches & 1) + iterate_supers(drop_pagecache_sb, NULL); + if (sysctl_drop_caches & 2) + drop_slab(); + } + return 0; +} diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig new file mode 100644 index 00000000..1cd6d9d3 --- /dev/null +++ b/fs/ecryptfs/Kconfig @@ -0,0 +1,14 @@ +config ECRYPT_FS + tristate "eCrypt filesystem layer support (EXPERIMENTAL)" + depends on 
EXPERIMENTAL && KEYS && CRYPTO + select CRYPTO_ECB + select CRYPTO_CBC + select CRYPTO_MD5 + help + Encrypted filesystem that operates on the VFS layer. See + to learn more about + eCryptfs. Userspace components are required and can be + obtained from . + + To compile this file system support as a module, choose M here: the + module will be called ecryptfs. diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile new file mode 100644 index 00000000..2cc9ee4a --- /dev/null +++ b/fs/ecryptfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux 2.6 eCryptfs +# + +obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o + +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c new file mode 100644 index 00000000..c6602d24 --- /dev/null +++ b/fs/ecryptfs/crypto.c @@ -0,0 +1,2238 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static int +ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv); +static int +ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv); + +/** + * ecryptfs_to_hex + * @dst: Buffer to take hex character representation of contents of + * src; must be at least of size (src_size * 2) + * @src: Buffer to be converted to a hex string respresentation + * @src_size: number of bytes to convert + */ +void ecryptfs_to_hex(char *dst, char *src, size_t src_size) +{ + int x; + + for (x = 0; x < src_size; x++) + sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]); +} + +/** + * ecryptfs_from_hex + * @dst: Buffer to take the bytes from src hex; must be at least of + * size (src_size / 2) + * @src: Buffer to be converted from a hex string respresentation to raw value + * @dst_size: size of dst buffer, or number of hex characters pairs to convert + */ +void ecryptfs_from_hex(char *dst, char *src, int dst_size) +{ + int x; + char tmp[3] = { 0, }; + + for (x = 0; x < dst_size; x++) { + tmp[0] = src[x * 2]; + tmp[1] = src[x * 2 + 1]; + dst[x] = (unsigned char)simple_strtol(tmp, NULL, 16); + } +} + +/** + * ecryptfs_calculate_md5 - calculates the md5 of @src + * @dst: Pointer to 16 bytes of allocated memory + * @crypt_stat: Pointer to crypt_stat struct for the current inode + * @src: Data to be md5'd + * @len: Length of @src + * + * Uses the allocated crypto context that crypt_stat references to + * generate the MD5 sum of the contents of src. 
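ecryptfs_to_hex() and ecryptfs_from_hex() above are simple enough to exercise outside the kernel. The following sketch is a lightly adapted userspace copy (simple_strtol() replaced by strtol(), minor signature cleanups) that round-trips a few bytes through their hex form.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void to_hex(char *dst, const char *src, size_t src_size)
{
        for (size_t x = 0; x < src_size; x++)
                sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
}

static void from_hex(char *dst, const char *src, int dst_size)
{
        char tmp[3] = { 0, };

        for (int x = 0; x < dst_size; x++) {
                tmp[0] = src[x * 2];
                tmp[1] = src[x * 2 + 1];
                dst[x] = (unsigned char)strtol(tmp, NULL, 16);
        }
}

int main(void)
{
        const char key[] = { 0x00, 0x1f, (char)0xab, (char)0xff };
        char hex[2 * sizeof(key) + 1];
        char back[sizeof(key)];

        to_hex(hex, key, sizeof(key));
        from_hex(back, hex, sizeof(back));
        printf("%s -> round-trips: %d\n", hex, !memcmp(key, back, sizeof(key)));
        return 0;
}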
+ */ +static int ecryptfs_calculate_md5(char *dst, + struct ecryptfs_crypt_stat *crypt_stat, + char *src, int len) +{ + struct scatterlist sg; + struct hash_desc desc = { + .tfm = crypt_stat->hash_tfm, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + mutex_lock(&crypt_stat->cs_hash_tfm_mutex); + sg_init_one(&sg, (u8 *)src, len); + if (!desc.tfm) { + desc.tfm = crypto_alloc_hash(ECRYPTFS_DEFAULT_HASH, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) { + rc = PTR_ERR(desc.tfm); + ecryptfs_printk(KERN_ERR, "Error attempting to " + "allocate crypto context; rc = [%d]\n", + rc); + goto out; + } + crypt_stat->hash_tfm = desc.tfm; + } + rc = crypto_hash_init(&desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; rc = [%d]\n", + __func__, rc); + goto out; + } + rc = crypto_hash_update(&desc, &sg, len); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; rc = [%d]\n", + __func__, rc); + goto out; + } + rc = crypto_hash_final(&desc, dst); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; rc = [%d]\n", + __func__, rc); + goto out; + } +out: + mutex_unlock(&crypt_stat->cs_hash_tfm_mutex); + return rc; +} + +static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name, + char *cipher_name, + char *chaining_modifier) +{ + int cipher_name_len = strlen(cipher_name); + int chaining_modifier_len = strlen(chaining_modifier); + int algified_name_len; + int rc; + + algified_name_len = (chaining_modifier_len + cipher_name_len + 3); + (*algified_name) = kmalloc(algified_name_len, GFP_KERNEL); + if (!(*algified_name)) { + rc = -ENOMEM; + goto out; + } + snprintf((*algified_name), algified_name_len, "%s(%s)", + chaining_modifier, cipher_name); + rc = 0; +out: + return rc; +} + +/** + * ecryptfs_derive_iv + * @iv: destination for the derived iv vale + * @crypt_stat: Pointer to crypt_stat struct for the current inode + * @offset: Offset of the extent whose IV we are to derive + * + * Generate the initialization vector from the given root IV and page + * offset. + * + * Returns zero on success; non-zero on error. + */ +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) +{ + int rc = 0; + char dst[MD5_DIGEST_SIZE]; + char src[ECRYPTFS_MAX_IV_BYTES + 16]; + + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "root iv:\n"); + ecryptfs_dump_hex(crypt_stat->root_iv, crypt_stat->iv_bytes); + } + /* TODO: It is probably secure to just cast the least + * significant bits of the root IV into an unsigned long and + * add the offset to that rather than go through all this + * hashing business. -Halcrow */ + memcpy(src, crypt_stat->root_iv, crypt_stat->iv_bytes); + memset((src + crypt_stat->iv_bytes), 0, 16); + snprintf((src + crypt_stat->iv_bytes), 16, "%lld", offset); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "source:\n"); + ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16)); + } + rc = ecryptfs_calculate_md5(dst, crypt_stat, src, + (crypt_stat->iv_bytes + 16)); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to compute " + "MD5 while generating IV for a page\n"); + goto out; + } + memcpy(iv, dst, crypt_stat->iv_bytes); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "derived iv:\n"); + ecryptfs_dump_hex(iv, crypt_stat->iv_bytes); + } +out: + return rc; +} + +/** + * ecryptfs_init_crypt_stat + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * + * Initialize the crypt_stat structure. 
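ecryptfs_derive_iv() above hashes the root IV concatenated with a 16-byte, NUL-padded decimal rendering of the extent offset, then takes the leading iv_bytes of the MD5 digest as the extent IV. The sketch below only builds that hash source buffer (the MD5 step is left out so the example stays dependency-free); the layout mirrors the memcpy/memset/snprintf sequence in the function, and the 16-byte IV length is an illustrative stand-in for crypt_stat->iv_bytes.

#include <stdio.h>
#include <string.h>

#define IV_BYTES 16     /* illustrative; crypt_stat->iv_bytes in the driver */

static void build_iv_source(char *src, const char *root_iv, long long offset)
{
        memcpy(src, root_iv, IV_BYTES);
        memset(src + IV_BYTES, 0, 16);
        snprintf(src + IV_BYTES, 16, "%lld", offset);   /* decimal, NUL padded */
}

int main(void)
{
        char root_iv[IV_BYTES];
        char src[IV_BYTES + 16];

        memset(root_iv, 'R', sizeof(root_iv));          /* stand-in root IV */
        build_iv_source(src, root_iv, 42);              /* extent offset 42 */
        for (size_t i = 0; i < sizeof(src); i++)
                printf("%02x%c", (unsigned char)src[i],
                       (i + 1) % 16 ? ' ' : '\n');
        return 0;
}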
+ */ +void +ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +{ + memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); + INIT_LIST_HEAD(&crypt_stat->keysig_list); + mutex_init(&crypt_stat->keysig_list_mutex); + mutex_init(&crypt_stat->cs_mutex); + mutex_init(&crypt_stat->cs_tfm_mutex); + mutex_init(&crypt_stat->cs_hash_tfm_mutex); + crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED; +} + +/** + * ecryptfs_destroy_crypt_stat + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * + * Releases all memory associated with a crypt_stat struct. + */ +void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +{ + struct ecryptfs_key_sig *key_sig, *key_sig_tmp; + + if (crypt_stat->tfm) + crypto_free_blkcipher(crypt_stat->tfm); + if (crypt_stat->hash_tfm) + crypto_free_hash(crypt_stat->hash_tfm); + list_for_each_entry_safe(key_sig, key_sig_tmp, + &crypt_stat->keysig_list, crypt_stat_list) { + list_del(&key_sig->crypt_stat_list); + kmem_cache_free(ecryptfs_key_sig_cache, key_sig); + } + memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); +} + +void ecryptfs_destroy_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct ecryptfs_global_auth_tok *auth_tok, *auth_tok_tmp; + + if (!(mount_crypt_stat->flags & ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED)) + return; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry_safe(auth_tok, auth_tok_tmp, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + list_del(&auth_tok->mount_crypt_stat_list); + if (auth_tok->global_auth_tok_key + && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) + key_put(auth_tok->global_auth_tok_key); + kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok); + } + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + memset(mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat)); +} + +/** + * virt_to_scatterlist + * @addr: Virtual address + * @size: Size of data; should be an even multiple of the block size + * @sg: Pointer to scatterlist array; set to NULL to obtain only + * the number of scatterlist structs required in array + * @sg_size: Max array size + * + * Fills in a scatterlist array with page references for a passed + * virtual address. + * + * Returns the number of scatterlist structs in array used + */ +int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, + int sg_size) +{ + int i = 0; + struct page *pg; + int offset; + int remainder_of_page; + + sg_init_table(sg, sg_size); + + while (size > 0 && i < sg_size) { + pg = virt_to_page(addr); + offset = offset_in_page(addr); + if (sg) + sg_set_page(&sg[i], pg, 0, offset); + remainder_of_page = PAGE_CACHE_SIZE - offset; + if (size >= remainder_of_page) { + if (sg) + sg[i].length = remainder_of_page; + addr += remainder_of_page; + size -= remainder_of_page; + } else { + if (sg) + sg[i].length = size; + addr += size; + size = 0; + } + i++; + } + if (size > 0) + return -ENOMEM; + return i; +} + +/** + * encrypt_scatterlist + * @crypt_stat: Pointer to the crypt_stat struct to initialize. 
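virt_to_scatterlist() above walks a virtual range and emits one scatterlist entry per page-bounded segment, so no entry ever crosses a page boundary and each one can be described by (page, offset, length). A standalone sketch of that walk; the page size and addresses are illustrative.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        uintptr_t addr = 0x1000 + 3000;   /* hypothetical virtual address */
        long size = 6000;                 /* bytes to describe */
        int i = 0;

        while (size > 0) {
                unsigned long offset = addr & (PAGE_SIZE - 1);
                unsigned long remainder_of_page = PAGE_SIZE - offset;
                unsigned long len = size >= (long)remainder_of_page ?
                                    remainder_of_page : (unsigned long)size;

                printf("sg[%d]: page %#lx offset %lu length %lu\n",
                       i++, (unsigned long)(addr & ~(PAGE_SIZE - 1)), offset, len);
                addr += len;
                size -= len;
        }
        return 0;
}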
+ * @dest_sg: Destination of encrypted data + * @src_sg: Data to be encrypted + * @size: Length of data to be encrypted + * @iv: iv to use during encryption + * + * Returns the number of bytes encrypted; negative value on error + */ +static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, + struct scatterlist *dest_sg, + struct scatterlist *src_sg, int size, + unsigned char *iv) +{ + struct blkcipher_desc desc = { + .tfm = crypt_stat->tfm, + .info = iv, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + BUG_ON(!crypt_stat || !crypt_stat->tfm + || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", + crypt_stat->key_size); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } + /* Consider doing this once, when the file is opened */ + mutex_lock(&crypt_stat->cs_tfm_mutex); + if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { + rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key, + crypt_stat->key_size); + crypt_stat->flags |= ECRYPTFS_KEY_SET; + } + if (rc) { + ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n", + rc); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + rc = -EINVAL; + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size); + crypto_blkcipher_encrypt_iv(&desc, dest_sg, src_sg, size); + mutex_unlock(&crypt_stat->cs_tfm_mutex); +out: + return rc; +} + +/** + * ecryptfs_lower_offset_for_extent + * + * Convert an eCryptfs page index into a lower byte offset + */ +static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, + struct ecryptfs_crypt_stat *crypt_stat) +{ + (*offset) = ecryptfs_lower_header_size(crypt_stat) + + (crypt_stat->extent_size * extent_num); +} + +/** + * ecryptfs_encrypt_extent + * @enc_extent_page: Allocated page into which to encrypt the data in + * @page + * @crypt_stat: crypt_stat containing cryptographic context for the + * encryption operation + * @page: Page containing plaintext data extent to encrypt + * @extent_offset: Page extent offset for use in generating IV + * + * Encrypts one extent of data. + * + * Return zero on success; non-zero otherwise + */ +static int ecryptfs_encrypt_extent(struct page *enc_extent_page, + struct ecryptfs_crypt_stat *crypt_stat, + struct page *page, + unsigned long extent_offset) +{ + loff_t extent_base; + char extent_iv[ECRYPTFS_MAX_IV_BYTES]; + int rc; + + extent_base = (((loff_t)page->index) + * (PAGE_CACHE_SIZE / crypt_stat->extent_size)); + rc = ecryptfs_derive_iv(extent_iv, crypt_stat, + (extent_base + extent_offset)); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); + goto out; + } + rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, + page, (extent_offset + * crypt_stat->extent_size), + crypt_stat->extent_size, extent_iv); + if (rc < 0) { + printk(KERN_ERR "%s: Error attempting to encrypt page with " + "page->index = [%ld], extent_offset = [%ld]; " + "rc = [%d]\n", __func__, page->index, extent_offset, + rc); + goto out; + } + rc = 0; +out: + return rc; +} + +/** + * ecryptfs_encrypt_page + * @page: Page mapped from the eCryptfs inode for the file; contains + * decrypted content that needs to be encrypted (to a temporary + * page; not in place) and written out to the lower file + * + * Encrypt an eCryptfs page. This is done on a per-extent basis. 
Note + * that eCryptfs pages may straddle the lower pages -- for instance, + * if the file was created on a machine with an 8K page size + * (resulting in an 8K header), and then the file is copied onto a + * host with a 32K page size, then when reading page 0 of the eCryptfs + * file, 24K of page 0 of the lower file will be read and decrypted, + * and then 8K of page 1 of the lower file will be read and decrypted. + * + * Returns zero on success; negative on error + */ +int ecryptfs_encrypt_page(struct page *page) +{ + struct inode *ecryptfs_inode; + struct ecryptfs_crypt_stat *crypt_stat; + char *enc_extent_virt; + struct page *enc_extent_page = NULL; + loff_t extent_offset; + int rc = 0; + + ecryptfs_inode = page->mapping->host; + crypt_stat = + &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); + enc_extent_page = alloc_page(GFP_USER); + if (!enc_extent_page) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error allocating memory for " + "encrypted extent\n"); + goto out; + } + enc_extent_virt = kmap(enc_extent_page); + for (extent_offset = 0; + extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); + extent_offset++) { + loff_t offset; + + rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page, + extent_offset); + if (rc) { + printk(KERN_ERR "%s: Error encrypting extent; " + "rc = [%d]\n", __func__, rc); + goto out; + } + ecryptfs_lower_offset_for_extent( + &offset, ((((loff_t)page->index) + * (PAGE_CACHE_SIZE + / crypt_stat->extent_size)) + + extent_offset), crypt_stat); + rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, + offset, crypt_stat->extent_size); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, "Error attempting " + "to write lower page; rc = [%d]" + "\n", rc); + goto out; + } + } + rc = 0; +out: + if (enc_extent_page) { + kunmap(enc_extent_page); + __free_page(enc_extent_page); + } + return rc; +} + +static int ecryptfs_decrypt_extent(struct page *page, + struct ecryptfs_crypt_stat *crypt_stat, + struct page *enc_extent_page, + unsigned long extent_offset) +{ + loff_t extent_base; + char extent_iv[ECRYPTFS_MAX_IV_BYTES]; + int rc; + + extent_base = (((loff_t)page->index) + * (PAGE_CACHE_SIZE / crypt_stat->extent_size)); + rc = ecryptfs_derive_iv(extent_iv, crypt_stat, + (extent_base + extent_offset)); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); + goto out; + } + rc = ecryptfs_decrypt_page_offset(crypt_stat, page, + (extent_offset + * crypt_stat->extent_size), + enc_extent_page, 0, + crypt_stat->extent_size, extent_iv); + if (rc < 0) { + printk(KERN_ERR "%s: Error attempting to decrypt to page with " + "page->index = [%ld], extent_offset = [%ld]; " + "rc = [%d]\n", __func__, page->index, extent_offset, + rc); + goto out; + } + rc = 0; +out: + return rc; +} + +/** + * ecryptfs_decrypt_page + * @page: Page mapped from the eCryptfs inode for the file; data read + * and decrypted from the lower file will be written into this + * page + * + * Decrypt an eCryptfs page. This is done on a per-extent basis. 
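The per-extent loop above maps an eCryptfs page to lower-file byte offsets as header size plus extent_size times a global extent number, where that number is page->index * (PAGE_CACHE_SIZE / extent_size) + extent_offset. A small worked example of the arithmetic; the 8192-byte page, 4096-byte extent and 8192-byte header are illustrative values chosen to match the straddling scenario described in the comments.

#include <stdio.h>

int main(void)
{
        const long long extent_size = 4096;
        const long long header_size = 8192;   /* ecryptfs_lower_header_size() */
        const long long page_size = 8192;     /* host with larger pages */
        long long page_index = 1;

        long long extents_per_page = page_size / extent_size;
        for (long long eo = 0; eo < extents_per_page; eo++) {
                long long extent_num = page_index * extents_per_page + eo;
                long long lower_off = header_size + extent_size * extent_num;

                printf("page %lld, extent %lld -> lower offset %lld\n",
                       page_index, extent_num, lower_off);
        }
        return 0;
}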
Note + * that eCryptfs pages may straddle the lower pages -- for instance, + * if the file was created on a machine with an 8K page size + * (resulting in an 8K header), and then the file is copied onto a + * host with a 32K page size, then when reading page 0 of the eCryptfs + * file, 24K of page 0 of the lower file will be read and decrypted, + * and then 8K of page 1 of the lower file will be read and decrypted. + * + * Returns zero on success; negative on error + */ +int ecryptfs_decrypt_page(struct page *page) +{ + struct inode *ecryptfs_inode; + struct ecryptfs_crypt_stat *crypt_stat; + char *enc_extent_virt; + struct page *enc_extent_page = NULL; + unsigned long extent_offset; + int rc = 0; + + ecryptfs_inode = page->mapping->host; + crypt_stat = + &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); + enc_extent_page = alloc_page(GFP_USER); + if (!enc_extent_page) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error allocating memory for " + "encrypted extent\n"); + goto out; + } + enc_extent_virt = kmap(enc_extent_page); + for (extent_offset = 0; + extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); + extent_offset++) { + loff_t offset; + + ecryptfs_lower_offset_for_extent( + &offset, ((page->index * (PAGE_CACHE_SIZE + / crypt_stat->extent_size)) + + extent_offset), crypt_stat); + rc = ecryptfs_read_lower(enc_extent_virt, offset, + crypt_stat->extent_size, + ecryptfs_inode); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, "Error attempting " + "to read lower page; rc = [%d]" + "\n", rc); + goto out; + } + rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page, + extent_offset); + if (rc) { + printk(KERN_ERR "%s: Error encrypting extent; " + "rc = [%d]\n", __func__, rc); + goto out; + } + } +out: + if (enc_extent_page) { + kunmap(enc_extent_page); + __free_page(enc_extent_page); + } + return rc; +} + +/** + * decrypt_scatterlist + * @crypt_stat: Cryptographic context + * @dest_sg: The destination scatterlist to decrypt into + * @src_sg: The source scatterlist to decrypt from + * @size: The number of bytes to decrypt + * @iv: The initialization vector to use for the decryption + * + * Returns the number of bytes decrypted; negative value on error + */ +static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, + struct scatterlist *dest_sg, + struct scatterlist *src_sg, int size, + unsigned char *iv) +{ + struct blkcipher_desc desc = { + .tfm = crypt_stat->tfm, + .info = iv, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + /* Consider doing this once, when the file is opened */ + mutex_lock(&crypt_stat->cs_tfm_mutex); + rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n", + rc); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + rc = -EINVAL; + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size); + rc = crypto_blkcipher_decrypt_iv(&desc, dest_sg, src_sg, size); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n", + rc); + goto out; + } + rc = size; +out: + return rc; +} + +/** + * ecryptfs_encrypt_page_offset + * @crypt_stat: The cryptographic context + * @dst_page: The page to encrypt into + * @dst_offset: The offset in the page to encrypt into + * @src_page: The page to encrypt from + * @src_offset: The offset in the page to encrypt from + * @size: The number of bytes to encrypt + * @iv: The 
initialization vector to use for the encryption + * + * Returns the number of bytes encrypted + */ +static int +ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv) +{ + struct scatterlist src_sg, dst_sg; + + sg_init_table(&src_sg, 1); + sg_init_table(&dst_sg, 1); + + sg_set_page(&src_sg, src_page, size, src_offset); + sg_set_page(&dst_sg, dst_page, size, dst_offset); + return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); +} + +/** + * ecryptfs_decrypt_page_offset + * @crypt_stat: The cryptographic context + * @dst_page: The page to decrypt into + * @dst_offset: The offset in the page to decrypt into + * @src_page: The page to decrypt from + * @src_offset: The offset in the page to decrypt from + * @size: The number of bytes to decrypt + * @iv: The initialization vector to use for the decryption + * + * Returns the number of bytes decrypted + */ +static int +ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv) +{ + struct scatterlist src_sg, dst_sg; + + sg_init_table(&src_sg, 1); + sg_set_page(&src_sg, src_page, size, src_offset); + + sg_init_table(&dst_sg, 1); + sg_set_page(&dst_sg, dst_page, size, dst_offset); + + return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); +} + +#define ECRYPTFS_MAX_SCATTERLIST_LEN 4 + +/** + * ecryptfs_init_crypt_ctx + * @crypt_stat: Uninitialized crypt stats structure + * + * Initialize the crypto context. + * + * TODO: Performance: Keep a cache of initialized cipher contexts; + * only init if needed + */ +int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) +{ + char *full_alg_name; + int rc = -EINVAL; + + if (!crypt_stat->cipher) { + ecryptfs_printk(KERN_ERR, "No cipher specified\n"); + goto out; + } + ecryptfs_printk(KERN_DEBUG, + "Initializing cipher [%s]; strlen = [%d]; " + "key_size_bits = [%zd]\n", + crypt_stat->cipher, (int)strlen(crypt_stat->cipher), + crypt_stat->key_size << 3); + if (crypt_stat->tfm) { + rc = 0; + goto out; + } + mutex_lock(&crypt_stat->cs_tfm_mutex); + rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, + crypt_stat->cipher, "cbc"); + if (rc) + goto out_unlock; + crypt_stat->tfm = crypto_alloc_blkcipher(full_alg_name, 0, + CRYPTO_ALG_ASYNC); + kfree(full_alg_name); + if (IS_ERR(crypt_stat->tfm)) { + rc = PTR_ERR(crypt_stat->tfm); + crypt_stat->tfm = NULL; + ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " + "Error initializing cipher [%s]\n", + crypt_stat->cipher); + goto out_unlock; + } + crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); + rc = 0; +out_unlock: + mutex_unlock(&crypt_stat->cs_tfm_mutex); +out: + return rc; +} + +static void set_extent_mask_and_shift(struct ecryptfs_crypt_stat *crypt_stat) +{ + int extent_size_tmp; + + crypt_stat->extent_mask = 0xFFFFFFFF; + crypt_stat->extent_shift = 0; + if (crypt_stat->extent_size == 0) + return; + extent_size_tmp = crypt_stat->extent_size; + while ((extent_size_tmp & 0x01) == 0) { + extent_size_tmp >>= 1; + crypt_stat->extent_mask <<= 1; + crypt_stat->extent_shift++; + } +} + +void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) +{ + /* Default values; may be overwritten as we are parsing the + * packets. 
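Two of the helpers above are easy to demonstrate on their own: ecryptfs_crypto_api_algify_cipher_name() composes the crypto API template name as "chaining(cipher)" (for example "cbc(aes)"), and set_extent_mask_and_shift() derives a mask and shift from a power-of-two extent size. A standalone sketch of both:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *algify(const char *chaining, const char *cipher)
{
        size_t len = strlen(chaining) + strlen(cipher) + 3;  /* "()" + NUL */
        char *name = malloc(len);

        if (name)
                snprintf(name, len, "%s(%s)", chaining, cipher);
        return name;
}

int main(void)
{
        char *name = algify("cbc", "aes");
        unsigned int extent_size = 4096, tmp = extent_size;
        unsigned int mask = 0xFFFFFFFF, shift = 0;

        while ((tmp & 0x01) == 0) {     /* same walk as set_extent_mask_and_shift() */
                tmp >>= 1;
                mask <<= 1;
                shift++;
        }
        printf("alg=%s extent_size=%u mask=%#x shift=%u\n",
               name ? name : "(oom)", extent_size, mask, shift);
        free(name);
        return 0;
}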
*/ + crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; + set_extent_mask_and_shift(crypt_stat); + crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + else { + if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) + crypt_stat->metadata_size = + ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + else + crypt_stat->metadata_size = PAGE_CACHE_SIZE; + } +} + +/** + * ecryptfs_compute_root_iv + * @crypt_stats + * + * On error, sets the root IV to all 0's. + */ +int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) +{ + int rc = 0; + char dst[MD5_DIGEST_SIZE]; + + BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE); + BUG_ON(crypt_stat->iv_bytes <= 0); + if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, "Session key not valid; " + "cannot generate root IV\n"); + goto out; + } + rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to compute " + "MD5 while generating root IV\n"); + goto out; + } + memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); +out: + if (rc) { + memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes); + crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING; + } + return rc; +} + +static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) +{ + get_random_bytes(crypt_stat->key, crypt_stat->key_size); + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + ecryptfs_compute_root_iv(crypt_stat); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n"); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +} + +/** + * ecryptfs_copy_mount_wide_flags_to_inode_flags + * @crypt_stat: The inode's cryptographic context + * @mount_crypt_stat: The mount point's cryptographic context + * + * This function propagates the mount-wide flags to individual inode + * flags. 
+ */ +static void ecryptfs_copy_mount_wide_flags_to_inode_flags( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + if (mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES; + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK; + else if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_FEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK; + } +} + +static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct ecryptfs_global_auth_tok *global_auth_tok; + int rc = 0; + + mutex_lock(&crypt_stat->keysig_list_mutex); + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + + list_for_each_entry(global_auth_tok, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK) + continue; + rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); + if (rc) { + printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); + goto out; + } + } + +out: + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + mutex_unlock(&crypt_stat->keysig_list_mutex); + return rc; +} + +/** + * ecryptfs_set_default_crypt_stat_vals + * @crypt_stat: The inode's cryptographic context + * @mount_crypt_stat: The mount point's cryptographic context + * + * Default values in the event that policy does not override them. + */ +static void ecryptfs_set_default_crypt_stat_vals( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); + ecryptfs_set_default_sizes(crypt_stat); + strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER); + crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES; + crypt_stat->flags &= ~(ECRYPTFS_KEY_VALID); + crypt_stat->file_version = ECRYPTFS_FILE_VERSION; + crypt_stat->mount_crypt_stat = mount_crypt_stat; +} + +/** + * ecryptfs_new_file_context + * @ecryptfs_dentry: The eCryptfs dentry + * + * If the crypto context for the file has not yet been established, + * this is where we do that. Establishing a new crypto context + * involves the following decisions: + * - What cipher to use? + * - What set of authentication tokens to use? + * Here we just worry about getting enough information into the + * authentication tokens so that we know that they are available. + * We associate the available authentication tokens with the new file + * via the set of signatures in the crypt_stat struct. Later, when + * the headers are actually written out, we may again defer to + * userspace to perform the encryption of the session key; for the + * foreseeable future, this will be the case with public key packets. 
+ * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + int cipher_name_len; + int rc = 0; + + ecryptfs_set_default_crypt_stat_vals(crypt_stat, mount_crypt_stat); + crypt_stat->flags |= (ECRYPTFS_ENCRYPTED | ECRYPTFS_KEY_VALID); + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); + rc = ecryptfs_copy_mount_wide_sigs_to_inode_sigs(crypt_stat, + mount_crypt_stat); + if (rc) { + printk(KERN_ERR "Error attempting to copy mount-wide key sigs " + "to the inode key sigs; rc = [%d]\n", rc); + goto out; + } + cipher_name_len = + strlen(mount_crypt_stat->global_default_cipher_name); + memcpy(crypt_stat->cipher, + mount_crypt_stat->global_default_cipher_name, + cipher_name_len); + crypt_stat->cipher[cipher_name_len] = '\0'; + crypt_stat->key_size = + mount_crypt_stat->global_default_cipher_key_size; + ecryptfs_generate_new_key(crypt_stat); + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) + ecryptfs_printk(KERN_ERR, "Error initializing cryptographic " + "context for cipher [%s]: rc = [%d]\n", + crypt_stat->cipher, rc); +out: + return rc; +} + +/** + * ecryptfs_validate_marker - check for the ecryptfs marker + * @data: The data block in which to check + * + * Returns zero if marker found; -EINVAL if not found + */ +static int ecryptfs_validate_marker(char *data) +{ + u32 m_1, m_2; + + m_1 = get_unaligned_be32(data); + m_2 = get_unaligned_be32(data + 4); + if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) + return 0; + ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " + "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2, + MAGIC_ECRYPTFS_MARKER); + ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = " + "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER)); + return -EINVAL; +} + +struct ecryptfs_flag_map_elem { + u32 file_flag; + u32 local_flag; +}; + +/* Add support for additional flags by adding elements here. 
*/ +static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { + {0x00000001, ECRYPTFS_ENABLE_HMAC}, + {0x00000002, ECRYPTFS_ENCRYPTED}, + {0x00000004, ECRYPTFS_METADATA_IN_XATTR}, + {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES} +}; + +/** + * ecryptfs_process_flags + * @crypt_stat: The cryptographic context + * @page_virt: Source data to be parsed + * @bytes_read: Updated with the number of bytes read + * + * Returns zero on success; non-zero if the flag set is invalid + */ +static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, + char *page_virt, int *bytes_read) +{ + int rc = 0; + int i; + u32 flags; + + flags = get_unaligned_be32(page_virt); + for (i = 0; i < ((sizeof(ecryptfs_flag_map) + / sizeof(struct ecryptfs_flag_map_elem))); i++) + if (flags & ecryptfs_flag_map[i].file_flag) { + crypt_stat->flags |= ecryptfs_flag_map[i].local_flag; + } else + crypt_stat->flags &= ~(ecryptfs_flag_map[i].local_flag); + /* Version is in top 8 bits of the 32-bit flag vector */ + crypt_stat->file_version = ((flags >> 24) & 0xFF); + (*bytes_read) = 4; + return rc; +} + +/** + * write_ecryptfs_marker + * @page_virt: The pointer to in a page to begin writing the marker + * @written: Number of bytes written + * + * Marker = 0x3c81b7f5 + */ +static void write_ecryptfs_marker(char *page_virt, size_t *written) +{ + u32 m_1, m_2; + + get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); + m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER); + put_unaligned_be32(m_1, page_virt); + page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2); + put_unaligned_be32(m_2, page_virt); + (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; +} + +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) +{ + u32 flags = 0; + int i; + + for (i = 0; i < ((sizeof(ecryptfs_flag_map) + / sizeof(struct ecryptfs_flag_map_elem))); i++) + if (crypt_stat->flags & ecryptfs_flag_map[i].local_flag) + flags |= ecryptfs_flag_map[i].file_flag; + /* Version is in top 8 bits of the 32-bit flag vector */ + flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000); + put_unaligned_be32(flags, page_virt); + (*written) = 4; +} + +struct ecryptfs_cipher_code_str_map_elem { + char cipher_str[16]; + u8 cipher_code; +}; + +/* Add support for additional ciphers by adding elements here. The + * cipher_code is whatever OpenPGP applicatoins use to identify the + * ciphers. List in order of probability. 
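The marker and flags handling above can be shown in a few lines: the on-disk marker is a random 32-bit value m_1 followed by m_1 ^ MAGIC_ECRYPTFS_MARKER, and the 32-bit flags word keeps the file format version in its top 8 bits alongside per-file flag bits such as 0x00000002 for "encrypted". The version value and the use of rand() below are purely illustrative; the driver uses get_random_bytes().

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5u

int main(void)
{
        uint32_t m_1 = (uint32_t)rand();                /* driver: get_random_bytes() */
        uint32_t m_2 = m_1 ^ MAGIC_ECRYPTFS_MARKER;
        uint32_t flags = 0x00000002;                    /* "encrypted" file flag */
        uint8_t version = 3;                            /* illustrative version */

        flags |= ((uint32_t)version << 24) & 0xFF000000;

        printf("marker valid: %d\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2);
        printf("flags=%#010x -> version=%u encrypted=%u\n",
               (unsigned)flags, (unsigned)((flags >> 24) & 0xFF),
               (flags & 0x00000002) ? 1 : 0);
        return 0;
}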
*/ +static struct ecryptfs_cipher_code_str_map_elem +ecryptfs_cipher_code_str_map[] = { + {"aes",RFC2440_CIPHER_AES_128 }, + {"blowfish", RFC2440_CIPHER_BLOWFISH}, + {"des3_ede", RFC2440_CIPHER_DES3_EDE}, + {"cast5", RFC2440_CIPHER_CAST_5}, + {"twofish", RFC2440_CIPHER_TWOFISH}, + {"cast6", RFC2440_CIPHER_CAST_6}, + {"aes", RFC2440_CIPHER_AES_192}, + {"aes", RFC2440_CIPHER_AES_256} +}; + +/** + * ecryptfs_code_for_cipher_string + * @cipher_name: The string alias for the cipher + * @key_bytes: Length of key in bytes; used for AES code selection + * + * Returns zero on no match, or the cipher code on match + */ +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes) +{ + int i; + u8 code = 0; + struct ecryptfs_cipher_code_str_map_elem *map = + ecryptfs_cipher_code_str_map; + + if (strcmp(cipher_name, "aes") == 0) { + switch (key_bytes) { + case 16: + code = RFC2440_CIPHER_AES_128; + break; + case 24: + code = RFC2440_CIPHER_AES_192; + break; + case 32: + code = RFC2440_CIPHER_AES_256; + } + } else { + for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) + if (strcmp(cipher_name, map[i].cipher_str) == 0) { + code = map[i].cipher_code; + break; + } + } + return code; +} + +/** + * ecryptfs_cipher_code_to_string + * @str: Destination to write out the cipher name + * @cipher_code: The code to convert to cipher name string + * + * Returns zero on success + */ +int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code) +{ + int rc = 0; + int i; + + str[0] = '\0'; + for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) + if (cipher_code == ecryptfs_cipher_code_str_map[i].cipher_code) + strcpy(str, ecryptfs_cipher_code_str_map[i].cipher_str); + if (str[0] == '\0') { + ecryptfs_printk(KERN_WARNING, "Cipher code not recognized: " + "[%d]\n", cipher_code); + rc = -EINVAL; + } + return rc; +} + +int ecryptfs_read_and_validate_header_region(struct inode *inode) +{ + u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES]; + u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES; + int rc; + + rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES, + inode); + if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return rc >= 0 ? -EINVAL : rc; + rc = ecryptfs_validate_marker(marker); + if (!rc) + ecryptfs_i_size_init(file_size, inode); + return rc; +} + +void +ecryptfs_write_header_metadata(char *virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) +{ + u32 header_extent_size; + u16 num_header_extents_at_front; + + header_extent_size = (u32)crypt_stat->extent_size; + num_header_extents_at_front = + (u16)(crypt_stat->metadata_size / crypt_stat->extent_size); + put_unaligned_be32(header_extent_size, virt); + virt += 4; + put_unaligned_be16(num_header_extents_at_front, virt); + (*written) = 6; +} + +struct kmem_cache *ecryptfs_header_cache; + +/** + * ecryptfs_write_headers_virt + * @page_virt: The virtual address to write the headers to + * @max: The size of memory allocated at page_virt + * @size: Set to the number of bytes written by this function + * @crypt_stat: The cryptographic context + * @ecryptfs_dentry: The eCryptfs dentry + * + * Format version: 1 + * + * Header Extent: + * Octets 0-7: Unencrypted file size (big-endian) + * Octets 8-15: eCryptfs special marker + * Octets 16-19: Flags + * Octet 16: File format version number (between 0 and 255) + * Octets 17-18: Reserved + * Octet 19: Bit 1 (lsb): Reserved + * Bit 2: Encrypted? 
+ * Bits 3-8: Reserved + * Octets 20-23: Header extent size (big-endian) + * Octets 24-25: Number of header extents at front of file + * (big-endian) + * Octet 26: Begin RFC 2440 authentication token packet set + * Data Extent 0: + * Lower data (CBC encrypted) + * Data Extent 1: + * Lower data (CBC encrypted) + * ... + * + * Returns zero on success + */ +static int ecryptfs_write_headers_virt(char *page_virt, size_t max, + size_t *size, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry) +{ + int rc; + size_t written; + size_t offset; + + offset = ECRYPTFS_FILE_SIZE_BYTES; + write_ecryptfs_marker((page_virt + offset), &written); + offset += written; + ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat, + &written); + offset += written; + ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, + &written); + offset += written; + rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat, + ecryptfs_dentry, &written, + max - offset); + if (rc) + ecryptfs_printk(KERN_WARNING, "Error generating key packet " + "set; rc = [%d]\n", rc); + if (size) { + offset += written; + *size = offset; + } + return rc; +} + +static int +ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, + char *virt, size_t virt_len) +{ + int rc; + + rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, + 0, virt_len); + if (rc < 0) + printk(KERN_ERR "%s: Error attempting to write header " + "information to lower file; rc = [%d]\n", __func__, rc); + else + rc = 0; + return rc; +} + +static int +ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, + char *page_virt, size_t size) +{ + int rc; + + rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt, + size, 0); + return rc; +} + +static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (page) + return (unsigned long) page_address(page); + return 0; +} + +/** + * ecryptfs_write_metadata + * @ecryptfs_dentry: The eCryptfs dentry + * + * Write the file headers out. This will likely involve a userspace + * callout, in which the session key is encrypted with one or more + * public keys and/or the passphrase necessary to do the encryption is + * retrieved via a prompt. Exactly what happens at this point should + * be policy-dependent. 
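ecryptfs_write_header_metadata() above emits six bytes: a big-endian u32 header extent size followed by a big-endian u16 count of header extents at the front of the file; parse_header_metadata() later multiplies the two to obtain metadata_size. A standalone sketch of that encode/decode (the two-extent, 8K header matches the example used in the comments above):

#include <stdint.h>
#include <stdio.h>

static void put_be32(unsigned char *p, uint32_t v)
{
        p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
}

static void put_be16(unsigned char *p, uint16_t v)
{
        p[0] = v >> 8; p[1] = v;
}

int main(void)
{
        unsigned char virt[6];
        uint32_t header_extent_size = 4096;
        uint16_t num_header_extents_at_front = 2;       /* an 8K header */

        put_be32(virt, header_extent_size);
        put_be16(virt + 4, num_header_extents_at_front);

        uint32_t sz = ((uint32_t)virt[0] << 24) | ((uint32_t)virt[1] << 16) |
                      ((uint32_t)virt[2] << 8) | virt[3];
        uint16_t n = (uint16_t)((virt[4] << 8) | virt[5]);

        printf("metadata_size = %u bytes\n", (unsigned)(sz * n));
        return 0;
}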
+ * + * Returns zero on success; non-zero on error + */ +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + unsigned int order; + char *virt; + size_t virt_len; + size_t size = 0; + int rc = 0; + + if (likely(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + printk(KERN_ERR "Key is invalid; bailing out\n"); + rc = -EINVAL; + goto out; + } + } else { + printk(KERN_WARNING "%s: Encrypted flag not set\n", + __func__); + rc = -EINVAL; + goto out; + } + virt_len = crypt_stat->metadata_size; + order = get_order(virt_len); + /* Released in this function */ + virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); + if (!virt) { + printk(KERN_ERR "%s: Out of memory\n", __func__); + rc = -ENOMEM; + goto out; + } + /* Zeroed page ensures the in-header unencrypted i_size is set to 0 */ + rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, + ecryptfs_dentry); + if (unlikely(rc)) { + printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", + __func__, rc); + goto out_free; + } + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, + size); + else + rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + virt_len); + if (rc) { + printk(KERN_ERR "%s: Error writing metadata out to lower file; " + "rc = [%d]\n", __func__, rc); + goto out_free; + } +out_free: + free_pages((unsigned long)virt, order); +out: + return rc; +} + +#define ECRYPTFS_DONT_VALIDATE_HEADER_SIZE 0 +#define ECRYPTFS_VALIDATE_HEADER_SIZE 1 +static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, + char *virt, int *bytes_read, + int validate_header_size) +{ + int rc = 0; + u32 header_extent_size; + u16 num_header_extents_at_front; + + header_extent_size = get_unaligned_be32(virt); + virt += sizeof(__be32); + num_header_extents_at_front = get_unaligned_be16(virt); + crypt_stat->metadata_size = (((size_t)num_header_extents_at_front + * (size_t)header_extent_size)); + (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); + if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) + && (crypt_stat->metadata_size + < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { + rc = -EINVAL; + printk(KERN_WARNING "Invalid header size: [%zd]\n", + crypt_stat->metadata_size); + } + return rc; +} + +/** + * set_default_header_data + * @crypt_stat: The cryptographic context + * + * For version 0 file format; this function is only for backwards + * compatibility for files created with the prior versions of + * eCryptfs. 
+ */ +static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) +{ + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; +} + +void ecryptfs_i_size_init(const char *page_virt, struct inode *inode) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; + u64 file_size; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mount_crypt_stat = + &ecryptfs_superblock_to_private(inode->i_sb)->mount_crypt_stat; + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { + file_size = i_size_read(ecryptfs_inode_to_lower(inode)); + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + file_size += crypt_stat->metadata_size; + } else + file_size = get_unaligned_be64(page_virt); + i_size_write(inode, (loff_t)file_size); + crypt_stat->flags |= ECRYPTFS_I_SIZE_INITIALIZED; +} + +/** + * ecryptfs_read_headers_virt + * @page_virt: The virtual address into which to read the headers + * @crypt_stat: The cryptographic context + * @ecryptfs_dentry: The eCryptfs dentry + * @validate_header_size: Whether to validate the header size while reading + * + * Read/parse the header data. The header format is detailed in the + * comment block for the ecryptfs_write_headers_virt() function. + * + * Returns zero on success + */ +static int ecryptfs_read_headers_virt(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, + int validate_header_size) +{ + int rc = 0; + int offset; + int bytes_read; + + ecryptfs_set_default_sizes(crypt_stat); + crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + offset = ECRYPTFS_FILE_SIZE_BYTES; + rc = ecryptfs_validate_marker(page_virt + offset); + if (rc) + goto out; + if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) + ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); + offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; + rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset), + &bytes_read); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error processing flags\n"); + goto out; + } + if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) { + ecryptfs_printk(KERN_WARNING, "File version is [%d]; only " + "file version [%d] is supported by this " + "version of eCryptfs\n", + crypt_stat->file_version, + ECRYPTFS_SUPPORTED_FILE_VERSION); + rc = -EINVAL; + goto out; + } + offset += bytes_read; + if (crypt_stat->file_version >= 1) { + rc = parse_header_metadata(crypt_stat, (page_virt + offset), + &bytes_read, validate_header_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error reading header " + "metadata; rc = [%d]\n", rc); + } + offset += bytes_read; + } else + set_default_header_data(crypt_stat); + rc = ecryptfs_parse_packet_set(crypt_stat, (page_virt + offset), + ecryptfs_dentry); +out: + return rc; +} + +/** + * ecryptfs_read_xattr_region + * @page_virt: The vitual address into which to read the xattr data + * @ecryptfs_inode: The eCryptfs inode + * + * Attempts to read the crypto metadata from the extended attribute + * region of the lower file. 
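ecryptfs_i_size_init() above recovers the plaintext file size from the first eight header octets with get_unaligned_be64(). A dependency-free sketch of that decode, using a hypothetical header whose recorded size is 65536 bytes:

#include <stdint.h>
#include <stdio.h>

static uint64_t get_be64(const unsigned char *p)
{
        uint64_t v = 0;

        for (int i = 0; i < 8; i++)
                v = (v << 8) | p[i];
        return v;
}

int main(void)
{
        /* Octets 0-7 of a hypothetical header: plaintext size 65536. */
        unsigned char page_virt[8] = { 0, 0, 0, 0, 0, 1, 0, 0 };

        printf("i_size = %llu\n", (unsigned long long)get_be64(page_virt));
        return 0;
}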
+ * + * Returns zero on success; non-zero on error + */ +int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode) +{ + struct dentry *lower_dentry = + ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; + ssize_t size; + int rc = 0; + + size = ecryptfs_getxattr_lower(lower_dentry, ECRYPTFS_XATTR_NAME, + page_virt, ECRYPTFS_DEFAULT_EXTENT_SIZE); + if (size < 0) { + if (unlikely(ecryptfs_verbosity > 0)) + printk(KERN_INFO "Error attempting to read the [%s] " + "xattr from the lower file; return value = " + "[%zd]\n", ECRYPTFS_XATTR_NAME, size); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, + struct inode *inode) +{ + u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES]; + u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES; + int rc; + + rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), + ECRYPTFS_XATTR_NAME, file_size, + ECRYPTFS_SIZE_AND_MARKER_BYTES); + if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return rc >= 0 ? -EINVAL : rc; + rc = ecryptfs_validate_marker(marker); + if (!rc) + ecryptfs_i_size_init(file_size, inode); + return rc; +} + +/** + * ecryptfs_read_metadata + * + * Common entry point for reading file metadata. From here, we could + * retrieve the header information from the header region of the file, + * the xattr region of the file, or some other repostory that is + * stored separately from the file itself. The current implementation + * supports retrieving the metadata information from the file contents + * and from the xattr region. + * + * Returns zero if valid headers found and parsed; non-zero otherwise + */ +int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) +{ + int rc = 0; + char *page_virt = NULL; + struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); + /* Read the first page from the underlying file */ + page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER); + if (!page_virt) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Unable to allocate page_virt\n", + __func__); + goto out; + } + rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, + ecryptfs_inode); + if (rc >= 0) + rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, + ecryptfs_dentry, + ECRYPTFS_VALIDATE_HEADER_SIZE); + if (rc) { + memset(page_virt, 0, PAGE_CACHE_SIZE); + rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); + if (rc) { + printk(KERN_DEBUG "Valid eCryptfs headers not found in " + "file header region or xattr region, inode %lu\n", + ecryptfs_inode->i_ino); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, + ecryptfs_dentry, + ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); + if (rc) { + printk(KERN_DEBUG "Valid eCryptfs headers not found in " + "file xattr region either, inode %lu\n", + ecryptfs_inode->i_ino); + rc = -EINVAL; + } + if (crypt_stat->mount_crypt_stat->flags + & ECRYPTFS_XATTR_METADATA_ENABLED) { + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } else { + printk(KERN_WARNING "Attempt to access file with " + "crypto metadata only in the extended attribute " + "region, but eCryptfs was mounted without " + "xattr support enabled. 
eCryptfs will not treat " + "this like an encrypted file, inode %lu\n", + ecryptfs_inode->i_ino); + rc = -EINVAL; + } + } +out: + if (page_virt) { + memset(page_virt, 0, PAGE_CACHE_SIZE); + kmem_cache_free(ecryptfs_header_cache, page_virt); + } + return rc; +} + +/** + * ecryptfs_encrypt_filename - encrypt filename + * + * CBC-encrypts the filename. We do not want to encrypt the same + * filename with the same key and IV, which may happen with hard + * links, so we prepend random bits to each filename. + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + int rc = 0; + + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + size_t packet_size; + size_t remaining_bytes; + + rc = ecryptfs_write_tag_70_packet( + NULL, NULL, + &filename->encrypted_filename_size, + mount_crypt_stat, NULL, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to get packet " + "size for tag 72; rc = [%d]\n", __func__, + rc); + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename = + kmalloc(filename->encrypted_filename_size, GFP_KERNEL); + if (!filename->encrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + filename->encrypted_filename_size); + rc = -ENOMEM; + goto out; + } + remaining_bytes = filename->encrypted_filename_size; + rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename, + &remaining_bytes, + &packet_size, + mount_crypt_stat, + filename->filename, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to generate " + "tag 70 packet; rc = [%d]\n", __func__, + rc); + kfree(filename->encrypted_filename); + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename_size = packet_size; + } else { + printk(KERN_ERR "%s: No support for requested filename " + "encryption method in this release\n", __func__); + rc = -EOPNOTSUPP; + goto out; + } +out: + return rc; +} + +static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, + const char *name, size_t name_size) +{ + int rc = 0; + + (*copied_name) = kmalloc((name_size + 1), GFP_KERNEL); + if (!(*copied_name)) { + rc = -ENOMEM; + goto out; + } + memcpy((void *)(*copied_name), (void *)name, name_size); + (*copied_name)[(name_size)] = '\0'; /* Only for convenience + * in printing out the + * string in debug + * messages */ + (*copied_name_size) = name_size; +out: + return rc; +} + +/** + * ecryptfs_process_key_cipher - Perform key cipher initialization. + * @key_tfm: Crypto context for key material, set by this function + * @cipher_name: Name of the cipher + * @key_size: Size of the key in bytes + * + * Returns zero on success. Any crypto_tfm structs allocated here + * should be released by other functions, such as on a superblock put + * event, regardless of whether this function succeeds for fails. 
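When the header region does not validate, ecryptfs_read_metadata() above falls back to the metadata copy kept in the lower file's ECRYPTFS_XATTR_NAME ("user.ecryptfs") extended attribute. That copy is an ordinary xattr on the lower filesystem, so it can also be inspected from userspace. The sketch below is an illustrative aside rather than part of this patch, and it assumes the lower file's path is known to the caller.

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

/* Report how much crypto metadata sits in the lower file's "user.ecryptfs"
 * xattr -- the same data ecryptfs_read_xattr_region() reads in-kernel. */
static void sketch_show_xattr_metadata(const char *lower_path)
{
	char buf[4096];	/* one ECRYPTFS_DEFAULT_EXTENT_SIZE worth */
	ssize_t n;

	n = getxattr(lower_path, "user.ecryptfs", buf, sizeof(buf));
	if (n < 0)
		perror("getxattr");
	else
		printf("user.ecryptfs holds %zd bytes of metadata\n", n);
}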
+ */ +static int +ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, + char *cipher_name, size_t *key_size) +{ + char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; + char *full_alg_name = NULL; + int rc; + + *key_tfm = NULL; + if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { + rc = -EINVAL; + printk(KERN_ERR "Requested key size is [%zd] bytes; maximum " + "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); + goto out; + } + rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, cipher_name, + "ecb"); + if (rc) + goto out; + *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*key_tfm)) { + rc = PTR_ERR(*key_tfm); + printk(KERN_ERR "Unable to allocate crypto cipher with name " + "[%s]; rc = [%d]\n", full_alg_name, rc); + goto out; + } + crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); + if (*key_size == 0) { + struct blkcipher_alg *alg = crypto_blkcipher_alg(*key_tfm); + + *key_size = alg->max_keysize; + } + get_random_bytes(dummy_key, *key_size); + rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); + if (rc) { + printk(KERN_ERR "Error attempting to set key of size [%zd] for " + "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name, + rc); + rc = -EINVAL; + goto out; + } +out: + kfree(full_alg_name); + return rc; +} + +struct kmem_cache *ecryptfs_key_tfm_cache; +static struct list_head key_tfm_list; +struct mutex key_tfm_list_mutex; + +int __init ecryptfs_init_crypto(void) +{ + mutex_init(&key_tfm_list_mutex); + INIT_LIST_HEAD(&key_tfm_list); + return 0; +} + +/** + * ecryptfs_destroy_crypto - free all cached key_tfms on key_tfm_list + * + * Called only at module unload time + */ +int ecryptfs_destroy_crypto(void) +{ + struct ecryptfs_key_tfm *key_tfm, *key_tfm_tmp; + + mutex_lock(&key_tfm_list_mutex); + list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list, + key_tfm_list) { + list_del(&key_tfm->key_tfm_list); + if (key_tfm->key_tfm) + crypto_free_blkcipher(key_tfm->key_tfm); + kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm); + } + mutex_unlock(&key_tfm_list_mutex); + return 0; +} + +int +ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, + size_t key_size) +{ + struct ecryptfs_key_tfm *tmp_tfm; + int rc = 0; + + BUG_ON(!mutex_is_locked(&key_tfm_list_mutex)); + + tmp_tfm = kmem_cache_alloc(ecryptfs_key_tfm_cache, GFP_KERNEL); + if (key_tfm != NULL) + (*key_tfm) = tmp_tfm; + if (!tmp_tfm) { + rc = -ENOMEM; + printk(KERN_ERR "Error attempting to allocate from " + "ecryptfs_key_tfm_cache\n"); + goto out; + } + mutex_init(&tmp_tfm->key_tfm_mutex); + strncpy(tmp_tfm->cipher_name, cipher_name, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + tmp_tfm->cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + tmp_tfm->key_size = key_size; + rc = ecryptfs_process_key_cipher(&tmp_tfm->key_tfm, + tmp_tfm->cipher_name, + &tmp_tfm->key_size); + if (rc) { + printk(KERN_ERR "Error attempting to initialize key TFM " + "cipher with name = [%s]; rc = [%d]\n", + tmp_tfm->cipher_name, rc); + kmem_cache_free(ecryptfs_key_tfm_cache, tmp_tfm); + if (key_tfm != NULL) + (*key_tfm) = NULL; + goto out; + } + list_add(&tmp_tfm->key_tfm_list, &key_tfm_list); +out: + return rc; +} + +/** + * ecryptfs_tfm_exists - Search for existing tfm for cipher_name. 
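The list maintained by ecryptfs_add_new_key_tfm() above, and searched by ecryptfs_tfm_exists() and ecryptfs_get_tfm_and_mutex_for_cipher_name() just below, lets callers reuse one crypto context per cipher name instead of allocating a fresh one each time. The following sketch shows the expected consumer side; the surrounding function is hypothetical and assumes the usual ecryptfs_kernel.h, linux/crypto.h and linux/mutex.h includes, but the helpers it calls are the ones defined in this file.

/* Fetch (or lazily create) the shared per-cipher TFM, then serialize use of
 * it with the mutex that travels alongside it on the key_tfm list. */
static int sketch_use_cached_tfm(char *cipher_name, unsigned char *key,
				 size_t key_size)
{
	struct crypto_blkcipher *tfm;
	struct mutex *tfm_mutex;
	int rc;

	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
							cipher_name);
	if (rc)
		return rc;
	mutex_lock(tfm_mutex);
	rc = crypto_blkcipher_setkey(tfm, key, key_size);
	mutex_unlock(tfm_mutex);
	return rc;
}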
+ * @cipher_name: the name of the cipher to search for + * @key_tfm: set to corresponding tfm if found + * + * Searches for cached key_tfm matching @cipher_name + * Must be called with &key_tfm_list_mutex held + * Returns 1 if found, with @key_tfm set + * Returns 0 if not found, with @key_tfm set to NULL + */ +int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm) +{ + struct ecryptfs_key_tfm *tmp_key_tfm; + + BUG_ON(!mutex_is_locked(&key_tfm_list_mutex)); + + list_for_each_entry(tmp_key_tfm, &key_tfm_list, key_tfm_list) { + if (strcmp(tmp_key_tfm->cipher_name, cipher_name) == 0) { + if (key_tfm) + (*key_tfm) = tmp_key_tfm; + return 1; + } + } + if (key_tfm) + (*key_tfm) = NULL; + return 0; +} + +/** + * ecryptfs_get_tfm_and_mutex_for_cipher_name + * + * @tfm: set to cached tfm found, or new tfm created + * @tfm_mutex: set to mutex for cached tfm found, or new tfm created + * @cipher_name: the name of the cipher to search for and/or add + * + * Sets pointers to @tfm & @tfm_mutex matching @cipher_name. + * Searches for cached item first, and creates new if not found. + * Returns 0 on success, non-zero if adding new cipher failed + */ +int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, + struct mutex **tfm_mutex, + char *cipher_name) +{ + struct ecryptfs_key_tfm *key_tfm; + int rc = 0; + + (*tfm) = NULL; + (*tfm_mutex) = NULL; + + mutex_lock(&key_tfm_list_mutex); + if (!ecryptfs_tfm_exists(cipher_name, &key_tfm)) { + rc = ecryptfs_add_new_key_tfm(&key_tfm, cipher_name, 0); + if (rc) { + printk(KERN_ERR "Error adding new key_tfm to list; " + "rc = [%d]\n", rc); + goto out; + } + } + (*tfm) = key_tfm->key_tfm; + (*tfm_mutex) = &key_tfm->key_tfm_mutex; +out: + mutex_unlock(&key_tfm_list_mutex); + return rc; +} + +/* 64 characters forming a 6-bit target field */ +static unsigned char *portable_filename_chars = ("-.0123456789ABCD" + "EFGHIJKLMNOPQRST" + "UVWXYZabcdefghij" + "klmnopqrstuvwxyz"); + +/* We could either offset on every reverse map or just pad some 0x00's + * at the front here */ +static const unsigned char filename_rev_map[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */ + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */ + 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */ + 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */ + 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */ + 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */ + 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */ + 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ + 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ + 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ + 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ +}; + +/** + * ecryptfs_encode_for_filename + * @dst: Destination location for encoded filename + * @dst_size: Size of the encoded filename in bytes + * @src: Source location for the filename to encode + * @src_size: Size of the source in bytes + */ +void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, + unsigned char *src, size_t src_size) +{ + size_t num_blocks; + size_t block_num = 0; + size_t dst_offset = 0; + unsigned char last_block[3]; + + if (src_size == 0) 
{ + (*dst_size) = 0; + goto out; + } + num_blocks = (src_size / 3); + if ((src_size % 3) == 0) { + memcpy(last_block, (&src[src_size - 3]), 3); + } else { + num_blocks++; + last_block[2] = 0x00; + switch (src_size % 3) { + case 1: + last_block[0] = src[src_size - 1]; + last_block[1] = 0x00; + break; + case 2: + last_block[0] = src[src_size - 2]; + last_block[1] = src[src_size - 1]; + } + } + (*dst_size) = (num_blocks * 4); + if (!dst) + goto out; + while (block_num < num_blocks) { + unsigned char *src_block; + unsigned char dst_block[4]; + + if (block_num == (num_blocks - 1)) + src_block = last_block; + else + src_block = &src[block_num * 3]; + dst_block[0] = ((src_block[0] >> 2) & 0x3F); + dst_block[1] = (((src_block[0] << 4) & 0x30) + | ((src_block[1] >> 4) & 0x0F)); + dst_block[2] = (((src_block[1] << 2) & 0x3C) + | ((src_block[2] >> 6) & 0x03)); + dst_block[3] = (src_block[2] & 0x3F); + dst[dst_offset++] = portable_filename_chars[dst_block[0]]; + dst[dst_offset++] = portable_filename_chars[dst_block[1]]; + dst[dst_offset++] = portable_filename_chars[dst_block[2]]; + dst[dst_offset++] = portable_filename_chars[dst_block[3]]; + block_num++; + } +out: + return; +} + +/** + * ecryptfs_decode_from_filename + * @dst: If NULL, this function only sets @dst_size and returns. If + * non-NULL, this function decodes the encoded octets in @src + * into the memory that @dst points to. + * @dst_size: Set to the size of the decoded string. + * @src: The encoded set of octets to decode. + * @src_size: The size of the encoded set of octets to decode. + */ +static void +ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, + const unsigned char *src, size_t src_size) +{ + u8 current_bit_offset = 0; + size_t src_byte_offset = 0; + size_t dst_byte_offset = 0; + + if (dst == NULL) { + /* Not exact; conservatively long. Every block of 4 + * encoded characters decodes into a block of 3 + * decoded characters. This segment of code provides + * the caller with the maximum amount of allocated + * space that @dst will need to point to in a + * subsequent call. */ + (*dst_size) = (((src_size + 1) * 3) / 4); + goto out; + } + while (src_byte_offset < src_size) { + unsigned char src_byte = + filename_rev_map[(int)src[src_byte_offset]]; + + switch (current_bit_offset) { + case 0: + dst[dst_byte_offset] = (src_byte << 2); + current_bit_offset = 6; + break; + case 6: + dst[dst_byte_offset++] |= (src_byte >> 4); + dst[dst_byte_offset] = ((src_byte & 0xF) + << 4); + current_bit_offset = 4; + break; + case 4: + dst[dst_byte_offset++] |= (src_byte >> 2); + dst[dst_byte_offset] = (src_byte << 6); + current_bit_offset = 2; + break; + case 2: + dst[dst_byte_offset++] |= (src_byte); + dst[dst_byte_offset] = 0; + current_bit_offset = 0; + break; + } + src_byte_offset++; + } + (*dst_size) = dst_byte_offset; +out: + return; +} + +/** + * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text + * @crypt_stat: The crypt_stat struct associated with the file anem to encode + * @name: The plaintext name + * @length: The length of the plaintext + * @encoded_name: The encypted name + * + * Encrypts and encodes a filename into something that constitutes a + * valid filename for a filesystem, with printable characters. + * + * We assume that we have a properly initialized crypto context, + * pointed to by crypt_stat->tfm. 
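The encoder above packs every 3 input bytes into 4 output characters, each drawn from the 64-character set "-.0123456789A-Za-z", and filename_rev_map undoes that mapping on decode. Restated below as a stand-alone helper for a single block (illustrative only, not part of this patch): for example, the padded input bytes {0x01, 0x02, 0x00} yield the 6-bit indices 0, 16, 8, 0 and therefore the characters "-E6-".

static const char sketch_chars[] =
	"-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

/* Pack one 3-byte block into four 6-bit indices, exactly as the dst_block[]
 * arithmetic in ecryptfs_encode_for_filename() does. */
static void sketch_encode_block(const unsigned char src[3], char dst[4])
{
	dst[0] = sketch_chars[(src[0] >> 2) & 0x3F];
	dst[1] = sketch_chars[((src[0] << 4) & 0x30) | ((src[1] >> 4) & 0x0F)];
	dst[2] = sketch_chars[((src[1] << 2) & 0x3C) | ((src[2] >> 6) & 0x03)];
	dst[3] = sketch_chars[src[2] & 0x3F];
}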
+ * + * Returns zero on success; non-zero on otherwise + */ +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, + size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size) +{ + size_t encoded_name_no_prefix_size; + int rc = 0; + + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) { + struct ecryptfs_filename *filename; + + filename = kzalloc(sizeof(*filename), GFP_KERNEL); + if (!filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + sizeof(*filename)); + rc = -ENOMEM; + goto out; + } + filename->filename = (char *)name; + filename->filename_size = name_size; + rc = ecryptfs_encrypt_filename(filename, crypt_stat, + mount_crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt " + "filename; rc = [%d]\n", __func__, rc); + kfree(filename); + goto out; + } + ecryptfs_encode_for_filename( + NULL, &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + else + (*encoded_name_size) = + (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); + if (!(*encoded_name)) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + (*encoded_name_size)); + rc = -ENOMEM; + kfree(filename->encrypted_filename); + kfree(filename); + goto out; + } + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + memcpy((*encoded_name), + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); + ecryptfs_encode_for_filename( + ((*encoded_name) + + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE), + &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name)[(*encoded_name_size)] = '\0'; + } else { + rc = -EOPNOTSUPP; + } + if (rc) { + printk(KERN_ERR "%s: Error attempting to encode " + "encrypted filename; rc = [%d]\n", __func__, + rc); + kfree((*encoded_name)); + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + } + kfree(filename->encrypted_filename); + kfree(filename); + } else { + rc = ecryptfs_copy_filename(encoded_name, + encoded_name_size, + name, name_size); + } +out: + return rc; +} + +/** + * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext + * @plaintext_name: The plaintext name + * @plaintext_name_size: The plaintext name size + * @ecryptfs_dir_dentry: eCryptfs directory dentry + * @name: The filename in cipher text + * @name_size: The cipher text name size + * + * Decrypts and decodes the filename. 
+ * + * Returns zero on error; non-zero otherwise + */ +int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, + size_t *plaintext_name_size, + struct dentry *ecryptfs_dir_dentry, + const char *name, size_t name_size) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; + char *decoded_name; + size_t decoded_name_size; + size_t packet_size; + int rc = 0; + + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) + && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { + const char *orig_name = name; + size_t orig_name_size = name_size; + + name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + ecryptfs_decode_from_filename(NULL, &decoded_name_size, + name, name_size); + decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); + if (!decoded_name) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + decoded_name_size); + rc = -ENOMEM; + goto out; + } + ecryptfs_decode_from_filename(decoded_name, &decoded_name_size, + name, name_size); + rc = ecryptfs_parse_tag_70_packet(plaintext_name, + plaintext_name_size, + &packet_size, + mount_crypt_stat, + decoded_name, + decoded_name_size); + if (rc) { + printk(KERN_INFO "%s: Could not parse tag 70 packet " + "from filename; copying through filename " + "as-is\n", __func__); + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + orig_name, orig_name_size); + goto out_free; + } + } else { + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + name, name_size); + goto out; + } +out_free: + kfree(decoded_name); +out: + return rc; +} diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c new file mode 100644 index 00000000..3d2bdf54 --- /dev/null +++ b/fs/ecryptfs/debug.c @@ -0,0 +1,121 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * Functions only useful for debugging. + * + * Copyright (C) 2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_dump_auth_tok - debug function to print auth toks + * + * This function will print the contents of an ecryptfs authentication + * token. 
+ */ +void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok) +{ + char salt[ECRYPTFS_SALT_SIZE * 2 + 1]; + char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; + + ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n", + auth_tok); + if (auth_tok->flags & ECRYPTFS_PRIVATE_KEY) { + ecryptfs_printk(KERN_DEBUG, " * private key type\n"); + } else { + ecryptfs_printk(KERN_DEBUG, " * passphrase type\n"); + ecryptfs_to_hex(salt, auth_tok->token.password.salt, + ECRYPTFS_SALT_SIZE); + salt[ECRYPTFS_SALT_SIZE * 2] = '\0'; + ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt); + if (auth_tok->token.password.flags & + ECRYPTFS_PERSISTENT_PASSWORD) { + ecryptfs_printk(KERN_DEBUG, " * persistent\n"); + } + memcpy(sig, auth_tok->token.password.signature, + ECRYPTFS_SIG_SIZE_HEX); + sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + ecryptfs_printk(KERN_DEBUG, " * signature = [%s]\n", sig); + } + ecryptfs_printk(KERN_DEBUG, " * session_key.flags = [0x%x]\n", + auth_tok->session_key.flags); + if (auth_tok->session_key.flags + & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT) + ecryptfs_printk(KERN_DEBUG, + " * Userspace decrypt request set\n"); + if (auth_tok->session_key.flags + & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT) + ecryptfs_printk(KERN_DEBUG, + " * Userspace encrypt request set\n"); + if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_DECRYPTED_KEY) { + ecryptfs_printk(KERN_DEBUG, " * Contains decrypted key\n"); + ecryptfs_printk(KERN_DEBUG, + " * session_key.decrypted_key_size = [0x%x]\n", + auth_tok->session_key.decrypted_key_size); + ecryptfs_printk(KERN_DEBUG, " * Decrypted session key " + "dump:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(auth_tok->session_key.decrypted_key, + ECRYPTFS_DEFAULT_KEY_BYTES); + } + if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_ENCRYPTED_KEY) { + ecryptfs_printk(KERN_DEBUG, " * Contains encrypted key\n"); + ecryptfs_printk(KERN_DEBUG, + " * session_key.encrypted_key_size = [0x%x]\n", + auth_tok->session_key.encrypted_key_size); + ecryptfs_printk(KERN_DEBUG, " * Encrypted session key " + "dump:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(auth_tok->session_key.encrypted_key, + auth_tok->session_key. + encrypted_key_size); + } +} + +/** + * ecryptfs_dump_hex - debug hex printer + * @data: string of bytes to be printed + * @bytes: number of bytes to print + * + * Dump hexadecimal representation of char array + */ +void ecryptfs_dump_hex(char *data, int bytes) +{ + int i = 0; + int add_newline = 1; + + if (ecryptfs_verbosity < 1) + return; + if (bytes != 0) { + printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]); + i++; + } + while (i < bytes) { + printk("0x%.2x.", (unsigned char)data[i]); + i++; + if (i % 16 == 0) { + printk("\n"); + add_newline = 0; + } else + add_newline = 1; + } + if (add_newline) + printk("\n"); +} + diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c new file mode 100644 index 00000000..534c1d46 --- /dev/null +++ b/fs/ecryptfs/dentry.c @@ -0,0 +1,105 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_d_revalidate - revalidate an ecryptfs dentry + * @dentry: The ecryptfs dentry + * @nd: The associated nameidata + * + * Called when the VFS needs to revalidate a dentry. This + * is called whenever a name lookup finds a dentry in the + * dcache. Most filesystems leave this as NULL, because all their + * dentries in the dcache are valid. + * + * Returns 1 if valid, 0 otherwise. + * + */ +static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; + struct dentry *dentry_save = NULL; + struct vfsmount *vfsmount_save = NULL; + int rc = 1; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) + goto out; + if (nd) { + dentry_save = nd->path.dentry; + vfsmount_save = nd->path.mnt; + nd->path.dentry = lower_dentry; + nd->path.mnt = lower_mnt; + } + rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); + if (nd) { + nd->path.dentry = dentry_save; + nd->path.mnt = vfsmount_save; + } + if (dentry->d_inode) { + struct inode *lower_inode = + ecryptfs_inode_to_lower(dentry->d_inode); + + fsstack_copy_attr_all(dentry->d_inode, lower_inode); + } +out: + return rc; +} + +struct kmem_cache *ecryptfs_dentry_info_cache; + +/** + * ecryptfs_d_release + * @dentry: The ecryptfs dentry + * + * Called when a dentry is really deallocated. + */ +static void ecryptfs_d_release(struct dentry *dentry) +{ + if (ecryptfs_dentry_to_private(dentry)) { + if (ecryptfs_dentry_to_lower(dentry)) { + dput(ecryptfs_dentry_to_lower(dentry)); + mntput(ecryptfs_dentry_to_lower_mnt(dentry)); + } + kmem_cache_free(ecryptfs_dentry_info_cache, + ecryptfs_dentry_to_private(dentry)); + } + return; +} + +const struct dentry_operations ecryptfs_dops = { + .d_revalidate = ecryptfs_d_revalidate, + .d_release = ecryptfs_d_release, +}; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h new file mode 100644 index 00000000..43c7c43b --- /dev/null +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -0,0 +1,771 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * Kernel declarations. + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Trevor S. Highland + * Tyler Hicks + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#ifndef ECRYPTFS_KERNEL_H +#define ECRYPTFS_KERNEL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Version verification for shared data structures w/ userspace */ +#define ECRYPTFS_VERSION_MAJOR 0x00 +#define ECRYPTFS_VERSION_MINOR 0x04 +#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03 +/* These flags indicate which features are supported by the kernel + * module; userspace tools such as the mount helper read + * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine + * how to behave. */ +#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 +#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 +#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 +#define ECRYPTFS_VERSIONING_POLICY 0x00000008 +#define ECRYPTFS_VERSIONING_XATTR 0x00000010 +#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 +#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 +#define ECRYPTFS_VERSIONING_HMAC 0x00000080 +#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 +#define ECRYPTFS_VERSIONING_GCM 0x00000200 +#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ + | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ + | ECRYPTFS_VERSIONING_PUBKEY \ + | ECRYPTFS_VERSIONING_XATTR \ + | ECRYPTFS_VERSIONING_MULTKEY \ + | ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) +#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 +#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH +#define ECRYPTFS_SALT_SIZE 8 +#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2) +/* The original signature size is only for what is stored on disk; all + * in-memory representations are expanded hex, so it better adapted to + * be passed around or referenced on the command line */ +#define ECRYPTFS_SIG_SIZE 8 +#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2) +#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX +#define ECRYPTFS_MAX_KEY_BYTES 64 +#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512 +#define ECRYPTFS_DEFAULT_IV_BYTES 16 +#define ECRYPTFS_FILE_VERSION 0x03 +#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 +#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 +#define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 +#define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ +#define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) +#define ECRYPTFS_MAX_PKI_NAME_BYTES 16 +#define ECRYPTFS_DEFAULT_NUM_USERS 4 +#define ECRYPTFS_MAX_NUM_USERS 32768 +#define ECRYPTFS_XATTR_NAME "user.ecryptfs" + +#define RFC2440_CIPHER_DES3_EDE 0x02 +#define RFC2440_CIPHER_CAST_5 0x03 +#define RFC2440_CIPHER_BLOWFISH 0x04 +#define RFC2440_CIPHER_AES_128 0x07 +#define RFC2440_CIPHER_AES_192 0x08 +#define RFC2440_CIPHER_AES_256 0x09 +#define RFC2440_CIPHER_TWOFISH 0x0a +#define RFC2440_CIPHER_CAST_6 0x0b + +#define RFC2440_CIPHER_RSA 0x01 + +/** + * For convenience, we may need to pass around the encrypted session + * key between kernel and userspace because the authentication token + * may not be extractable. For example, the TPM may not release the + * private key, instead requiring the encrypted data and returning the + * decrypted data. 
+ */ +struct ecryptfs_session_key { +#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001 +#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002 +#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004 +#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008 + u32 flags; + u32 encrypted_key_size; + u32 decrypted_key_size; + u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; + u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES]; +}; + +struct ecryptfs_password { + u32 password_bytes; + s32 hash_algo; + u32 hash_iterations; + u32 session_key_encryption_key_bytes; +#define ECRYPTFS_PERSISTENT_PASSWORD 0x01 +#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02 + u32 flags; + /* Iterated-hash concatenation of salt and passphrase */ + u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; + u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; + /* Always in expanded hex */ + u8 salt[ECRYPTFS_SALT_SIZE]; +}; + +enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY}; + +struct ecryptfs_private_key { + u32 key_size; + u32 data_len; + u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; + char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1]; + u8 data[]; +}; + +/* May be a password or a private key */ +struct ecryptfs_auth_tok { + u16 version; /* 8-bit major and 8-bit minor */ + u16 token_type; +#define ECRYPTFS_ENCRYPT_ONLY 0x00000001 + u32 flags; + struct ecryptfs_session_key session_key; + u8 reserved[32]; + union { + struct ecryptfs_password password; + struct ecryptfs_private_key private_key; + } token; +} __attribute__ ((packed)); + +void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); +extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); +extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); + +struct ecryptfs_key_record { + unsigned char type; + size_t enc_key_size; + unsigned char sig[ECRYPTFS_SIG_SIZE]; + unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; +}; + +struct ecryptfs_auth_tok_list { + struct ecryptfs_auth_tok *auth_tok; + struct list_head list; +}; + +struct ecryptfs_crypt_stat; +struct ecryptfs_mount_crypt_stat; + +struct ecryptfs_page_crypt_context { + struct page *page; +#define ECRYPTFS_PREPARE_COMMIT_MODE 0 +#define ECRYPTFS_WRITEPAGE_MODE 1 + unsigned int mode; + union { + struct file *lower_file; + struct writeback_control *wbc; + } param; +}; + +static inline struct ecryptfs_auth_tok * +ecryptfs_get_key_payload_data(struct key *key) +{ + return (struct ecryptfs_auth_tok *) + (((struct user_key_payload*)key->payload.data)->data); +} + +#define ECRYPTFS_MAX_KEYSET_SIZE 1024 +#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 +#define ECRYPTFS_MAX_NUM_ENC_KEYS 64 +#define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */ +#define ECRYPTFS_SALT_BYTES 2 +#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5 +#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */ +#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64)) +#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \ + + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES) +#define ECRYPTFS_DEFAULT_CIPHER "aes" +#define ECRYPTFS_DEFAULT_KEY_BYTES 16 +#define ECRYPTFS_DEFAULT_HASH "md5" +#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH +#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 +#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C +#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED +#define ECRYPTFS_TAG_64_PACKET_TYPE 0x40 +#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 +#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 +#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 +#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename + * as dentry name */ +#define 
ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in + * metadata */ +#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as + * dentry name */ +#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as + * metadata */ +/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= + * ECRYPTFS_MAX_IV_BYTES */ +#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 +#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ +#define MD5_DIGEST_SIZE 16 +#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 +#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) + +struct ecryptfs_key_sig { + struct list_head crypt_stat_list; + char keysig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +struct ecryptfs_filename { + struct list_head crypt_stat_list; +#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 + u32 flags; + u32 seq_no; + char *filename; + char *encrypted_filename; + size_t filename_size; + size_t encrypted_filename_size; + char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; + char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; +}; + +/** + * This is the primary struct associated with each encrypted file. + * + * TODO: cache align/pack? + */ +struct ecryptfs_crypt_stat { +#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 +#define ECRYPTFS_POLICY_APPLIED 0x00000002 +#define ECRYPTFS_ENCRYPTED 0x00000004 +#define ECRYPTFS_SECURITY_WARNING 0x00000008 +#define ECRYPTFS_ENABLE_HMAC 0x00000010 +#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000020 +#define ECRYPTFS_KEY_VALID 0x00000040 +#define ECRYPTFS_METADATA_IN_XATTR 0x00000080 +#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000100 +#define ECRYPTFS_KEY_SET 0x00000200 +#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000400 +#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800 +#define ECRYPTFS_ENCFN_USE_FEK 0x00001000 +#define ECRYPTFS_UNLINK_SIGS 0x00002000 +#define ECRYPTFS_I_SIZE_INITIALIZED 0x00004000 + u32 flags; + unsigned int file_version; + size_t iv_bytes; + size_t metadata_size; + size_t extent_size; /* Data extent size; default is 4096 */ + size_t key_size; + size_t extent_shift; + unsigned int extent_mask; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct crypto_blkcipher *tfm; + struct crypto_hash *hash_tfm; /* Crypto context for generating + * the initialization vectors */ + unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; + unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; + struct list_head keysig_list; + struct mutex keysig_list_mutex; + struct mutex cs_tfm_mutex; + struct mutex cs_hash_tfm_mutex; + struct mutex cs_mutex; +}; + +/* inode private data. */ +struct ecryptfs_inode_info { + struct inode vfs_inode; + struct inode *wii_inode; + struct mutex lower_file_mutex; + atomic_t lower_file_count; + struct file *lower_file; + struct ecryptfs_crypt_stat crypt_stat; +}; + +/* dentry private data. Each dentry must keep track of a lower + * vfsmount too. */ +struct ecryptfs_dentry_info { + struct path lower_path; + struct ecryptfs_crypt_stat *crypt_stat; +}; + +/** + * ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint + * @flags: Status flags + * @mount_crypt_stat_list: These auth_toks hang off the mount-wide + * cryptographic context. 
Every time a new + * inode comes into existence, eCryptfs copies + * the auth_toks on that list to the set of + * auth_toks on the inode's crypt_stat + * @global_auth_tok_key: The key from the user's keyring for the sig + * @global_auth_tok: The key contents + * @sig: The key identifier + * + * ecryptfs_global_auth_tok structs refer to authentication token keys + * in the user keyring that apply to newly created files. A list of + * these objects hangs off of the mount_crypt_stat struct for any + * given eCryptfs mount. This struct maintains a reference to both the + * key contents and the key itself so that the key can be put on + * unmount. + */ +struct ecryptfs_global_auth_tok { +#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 +#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002 + u32 flags; + struct list_head mount_crypt_stat_list; + struct key *global_auth_tok_key; + unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +/** + * ecryptfs_key_tfm - Persistent key tfm + * @key_tfm: crypto API handle to the key + * @key_size: Key size in bytes + * @key_tfm_mutex: Mutex to ensure only one operation in eCryptfs is + * using the persistent TFM at any point in time + * @key_tfm_list: Handle to hang this off the module-wide TFM list + * @cipher_name: String name for the cipher for this TFM + * + * Typically, eCryptfs will use the same ciphers repeatedly throughout + * the course of its operations. In order to avoid unnecessarily + * destroying and initializing the same cipher repeatedly, eCryptfs + * keeps a list of crypto API contexts around to use when needed. + */ +struct ecryptfs_key_tfm { + struct crypto_blkcipher *key_tfm; + size_t key_size; + struct mutex key_tfm_mutex; + struct list_head key_tfm_list; + unsigned char cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; +}; + +extern struct mutex key_tfm_list_mutex; + +/** + * This struct is to enable a mount-wide passphrase/salt combo. This + * is more or less a stopgap to provide similar functionality to other + * crypto filesystems like EncFS or CFS until full policy support is + * implemented in eCryptfs. + */ +struct ecryptfs_mount_crypt_stat { + /* Pointers to memory we do not own, do not free these */ +#define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001 +#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 +#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 +#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 +#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 +#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 +#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 +#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080 + u32 flags; + struct list_head global_auth_tok_list; + struct mutex global_auth_tok_list_mutex; + size_t global_default_cipher_key_size; + size_t global_default_fn_cipher_key_bytes; + unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + + 1]; + unsigned char global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +/* superblock private data. */ +struct ecryptfs_sb_info { + struct super_block *wsi_sb; + struct ecryptfs_mount_crypt_stat mount_crypt_stat; + struct backing_dev_info bdi; +}; + +/* file private data. 
*/ +struct ecryptfs_file_info { + struct file *wfi_file; + struct ecryptfs_crypt_stat *crypt_stat; +}; + +/* auth_tok <=> encrypted_session_key mappings */ +struct ecryptfs_auth_tok_list_item { + unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES]; + struct list_head list; + struct ecryptfs_auth_tok auth_tok; +}; + +struct ecryptfs_message { + /* Can never be greater than ecryptfs_message_buf_len */ + /* Used to find the parent msg_ctx */ + /* Inherits from msg_ctx->index */ + u32 index; + u32 data_len; + u8 data[]; +}; + +struct ecryptfs_msg_ctx { +#define ECRYPTFS_MSG_CTX_STATE_FREE 0x01 +#define ECRYPTFS_MSG_CTX_STATE_PENDING 0x02 +#define ECRYPTFS_MSG_CTX_STATE_DONE 0x03 +#define ECRYPTFS_MSG_CTX_STATE_NO_REPLY 0x04 + u8 state; +#define ECRYPTFS_MSG_HELO 100 +#define ECRYPTFS_MSG_QUIT 101 +#define ECRYPTFS_MSG_REQUEST 102 +#define ECRYPTFS_MSG_RESPONSE 103 + u8 type; + u32 index; + /* Counter converts to a sequence number. Each message sent + * out for which we expect a response has an associated + * sequence number. The response must have the same sequence + * number as the counter for the msg_stc for the message to be + * valid. */ + u32 counter; + size_t msg_size; + struct ecryptfs_message *msg; + struct task_struct *task; + struct list_head node; + struct list_head daemon_out_list; + struct mutex mux; +}; + +struct ecryptfs_daemon; + +struct ecryptfs_daemon { +#define ECRYPTFS_DAEMON_IN_READ 0x00000001 +#define ECRYPTFS_DAEMON_IN_POLL 0x00000002 +#define ECRYPTFS_DAEMON_ZOMBIE 0x00000004 +#define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008 + u32 flags; + u32 num_queued_msg_ctx; + struct pid *pid; + uid_t euid; + struct user_namespace *user_ns; + struct task_struct *task; + struct mutex mux; + struct list_head msg_ctx_out_queue; + wait_queue_head_t wait; + struct hlist_node euid_chain; +}; + +extern struct mutex ecryptfs_daemon_hash_mux; + +static inline size_t +ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + return 0; + return crypt_stat->metadata_size; +} + +static inline struct ecryptfs_file_info * +ecryptfs_file_to_private(struct file *file) +{ + return file->private_data; +} + +static inline void +ecryptfs_set_file_private(struct file *file, + struct ecryptfs_file_info *file_info) +{ + file->private_data = file_info; +} + +static inline struct file *ecryptfs_file_to_lower(struct file *file) +{ + return ((struct ecryptfs_file_info *)file->private_data)->wfi_file; +} + +static inline void +ecryptfs_set_file_lower(struct file *file, struct file *lower_file) +{ + ((struct ecryptfs_file_info *)file->private_data)->wfi_file = + lower_file; +} + +static inline struct ecryptfs_inode_info * +ecryptfs_inode_to_private(struct inode *inode) +{ + return container_of(inode, struct ecryptfs_inode_info, vfs_inode); +} + +static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode) +{ + return ecryptfs_inode_to_private(inode)->wii_inode; +} + +static inline void +ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode) +{ + ecryptfs_inode_to_private(inode)->wii_inode = lower_inode; +} + +static inline struct ecryptfs_sb_info * +ecryptfs_superblock_to_private(struct super_block *sb) +{ + return (struct ecryptfs_sb_info *)sb->s_fs_info; +} + +static inline void +ecryptfs_set_superblock_private(struct super_block *sb, + struct ecryptfs_sb_info *sb_info) +{ + sb->s_fs_info = sb_info; +} + +static inline struct super_block * +ecryptfs_superblock_to_lower(struct super_block *sb) +{ + return 
((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb; +} + +static inline void +ecryptfs_set_superblock_lower(struct super_block *sb, + struct super_block *lower_sb) +{ + ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb; +} + +static inline struct ecryptfs_dentry_info * +ecryptfs_dentry_to_private(struct dentry *dentry) +{ + return (struct ecryptfs_dentry_info *)dentry->d_fsdata; +} + +static inline void +ecryptfs_set_dentry_private(struct dentry *dentry, + struct ecryptfs_dentry_info *dentry_info) +{ + dentry->d_fsdata = dentry_info; +} + +static inline struct dentry * +ecryptfs_dentry_to_lower(struct dentry *dentry) +{ + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; +} + +static inline void +ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry) +{ + ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry = + lower_dentry; +} + +static inline struct vfsmount * +ecryptfs_dentry_to_lower_mnt(struct dentry *dentry) +{ + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt; +} + +static inline void +ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) +{ + ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt = + lower_mnt; +} + +#define ecryptfs_printk(type, fmt, arg...) \ + __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); +__attribute__ ((format(printf, 1, 2))) +void __ecryptfs_printk(const char *fmt, ...); + +extern const struct file_operations ecryptfs_main_fops; +extern const struct file_operations ecryptfs_dir_fops; +extern const struct inode_operations ecryptfs_main_iops; +extern const struct inode_operations ecryptfs_dir_iops; +extern const struct inode_operations ecryptfs_symlink_iops; +extern const struct super_operations ecryptfs_sops; +extern const struct dentry_operations ecryptfs_dops; +extern const struct address_space_operations ecryptfs_aops; +extern int ecryptfs_verbosity; +extern unsigned int ecryptfs_message_buf_len; +extern signed long ecryptfs_message_wait_timeout; +extern unsigned int ecryptfs_number_of_users; + +extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; +extern struct kmem_cache *ecryptfs_file_info_cache; +extern struct kmem_cache *ecryptfs_dentry_info_cache; +extern struct kmem_cache *ecryptfs_inode_info_cache; +extern struct kmem_cache *ecryptfs_sb_info_cache; +extern struct kmem_cache *ecryptfs_header_cache; +extern struct kmem_cache *ecryptfs_xattr_cache; +extern struct kmem_cache *ecryptfs_key_record_cache; +extern struct kmem_cache *ecryptfs_key_sig_cache; +extern struct kmem_cache *ecryptfs_global_auth_tok_cache; +extern struct kmem_cache *ecryptfs_key_tfm_cache; +extern struct kmem_cache *ecryptfs_open_req_cache; + +struct ecryptfs_open_req { +#define ECRYPTFS_REQ_PROCESSED 0x00000001 +#define ECRYPTFS_REQ_DROPPED 0x00000002 +#define ECRYPTFS_REQ_ZOMBIE 0x00000004 + u32 flags; + struct file **lower_file; + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; + wait_queue_head_t wait; + struct mutex mux; + struct list_head kthread_ctl_list; +}; + +struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb); +void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); +int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, + size_t *decrypted_name_size, + struct dentry *ecryptfs_dentry, + const char *name, size_t name_size); +int ecryptfs_fill_zeros(struct file *file, loff_t new_length); +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, 
+ size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size); +struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); +void ecryptfs_dump_hex(char *data, int bytes); +int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, + int sg_size); +int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_rotate_iv(unsigned char *iv); +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_destroy_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat); +int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); +int ecryptfs_encrypt_page(struct page *page); +int ecryptfs_decrypt_page(struct page *page); +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written); +int ecryptfs_read_and_validate_header_region(struct inode *inode); +int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, + struct inode *inode); +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); +int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); +void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_generate_key_packet_set(char *dest_base, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, + size_t *len, size_t max); +int +ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *src, struct dentry *ecryptfs_dentry); +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length); +ssize_t +ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, + void *value, size_t size); +int +ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); +int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, + struct pid *pid); +int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid); +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, + struct user_namespace *user_ns, struct pid *pid, + u32 seq); +int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx); +int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **emsg); +int ecryptfs_init_messaging(void); +void ecryptfs_release_messaging(void); + +void +ecryptfs_write_header_metadata(char *virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written); +int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); +int +ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig, u32 global_auth_tok_flags); +int ecryptfs_get_global_auth_tok_for_sig( + struct ecryptfs_global_auth_tok **global_auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); +int +ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, + size_t key_size); +int ecryptfs_init_crypto(void); +int ecryptfs_destroy_crypto(void); +int 
ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm); +int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, + struct mutex **tfm_mutex, + char *cipher_name); +int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + char *sig); +int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, + loff_t offset, size_t size); +int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, + struct page *page_for_lower, + size_t offset_in_page, size_t size); +int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size); +int ecryptfs_read_lower(char *data, loff_t offset, size_t size, + struct inode *ecryptfs_inode); +int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, + pgoff_t page_index, + size_t offset_in_page, size_t size, + struct inode *ecryptfs_inode); +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns); +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size); +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length); +int ecryptfs_init_ecryptfs_miscdev(void); +void ecryptfs_destroy_ecryptfs_miscdev(void); +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon); +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_init_kthread(void); +void ecryptfs_destroy_kthread(void); +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt, + const struct cred *cred); +int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode); +void ecryptfs_put_lower_file(struct inode *inode); +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size); +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size); +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); + +#endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c new file mode 100644 index 00000000..0c1a6527 --- /dev/null +++ b/fs/ecryptfs/file.c @@ -0,0 +1,379 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_read_update_atime + * + * generic_file_read updates the atime of upper layer inode. But, it + * doesn't give us a chance to update the atime of the lower layer + * inode. This function is a wrapper to generic_file_read. It + * updates the atime of the lower level inode if generic_file_read + * returns without any errors. This is to be used only for file reads. + * The function to be used for directory reads is ecryptfs_read. + */ +static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t rc; + struct dentry *lower_dentry; + struct vfsmount *lower_vfsmount; + struct file *file = iocb->ki_filp; + + rc = generic_file_aio_read(iocb, iov, nr_segs, pos); + /* + * Even though this is a async interface, we need to wait + * for IO to finish to update atime + */ + if (-EIOCBQUEUED == rc) + rc = wait_on_sync_kiocb(iocb); + if (rc >= 0) { + lower_dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); + lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); + touch_atime(lower_vfsmount, lower_dentry); + } + return rc; +} + +struct ecryptfs_getdents_callback { + void *dirent; + struct dentry *dentry; + filldir_t filldir; + int filldir_called; + int entries_written; +}; + +/* Inspired by generic filldir in fs/readdir.c */ +static int +ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ecryptfs_getdents_callback *buf = + (struct ecryptfs_getdents_callback *)dirent; + size_t name_size; + char *name; + int rc; + + buf->filldir_called++; + rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, + buf->dentry, lower_name, + lower_namelen); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decode and decrypt " + "filename [%s]; rc = [%d]\n", __func__, lower_name, + rc); + goto out; + } + rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); + kfree(name); + if (rc >= 0) + buf->entries_written++; +out: + return rc; +} + +/** + * ecryptfs_readdir + * @file: The eCryptfs directory file + * @dirent: Directory entry handle + * @filldir: The filldir callback function + */ +static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int rc; + struct file *lower_file; + struct inode *inode; + struct ecryptfs_getdents_callback buf; + + lower_file = ecryptfs_file_to_lower(file); + lower_file->f_pos = file->f_pos; + inode = file->f_path.dentry->d_inode; + memset(&buf, 0, sizeof(buf)); + buf.dirent = dirent; + buf.dentry = file->f_path.dentry; + buf.filldir = filldir; + buf.filldir_called = 0; + buf.entries_written = 0; + rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); + file->f_pos = lower_file->f_pos; + if (rc < 0) + goto out; + if (buf.filldir_called && !buf.entries_written) + goto out; + if (rc >= 0) + fsstack_copy_attr_atime(inode, + lower_file->f_path.dentry->d_inode); +out: + return rc; +} + +static void ecryptfs_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +static const struct vm_operations_struct ecryptfs_file_vm_ops 
= { + .close = ecryptfs_vma_close, + .fault = filemap_fault, +}; + +static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc; + + rc = generic_file_mmap(file, vma); + if (!rc) + vma->vm_ops = &ecryptfs_file_vm_ops; + + return rc; +} + +struct kmem_cache *ecryptfs_file_info_cache; + +/** + * ecryptfs_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_open(struct inode *inode, struct file *file) +{ + int rc = 0; + struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct dentry *lower_dentry; + struct ecryptfs_file_info *file_info; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if ((mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && ((file->f_flags & O_WRONLY) || (file->f_flags & O_RDWR) + || (file->f_flags & O_CREAT) || (file->f_flags & O_TRUNC) + || (file->f_flags & O_APPEND))) { + printk(KERN_WARNING "Mount has encrypted view enabled; " + "files may only be read\n"); + rc = -EPERM; + goto out; + } + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (!file_info) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mutex_lock(&crypt_stat->cs_mutex); + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) { + ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n"); + /* Policy code enabled in future release */ + crypt_stat->flags |= (ECRYPTFS_POLICY_APPLIED + | ECRYPTFS_ENCRYPTED); + } + mutex_unlock(&crypt_stat->cs_mutex); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out_free; + } + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) + == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out_put; + } + ecryptfs_set_file_lower( + file, ecryptfs_inode_to_private(inode)->lower_file); + if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + mutex_lock(&crypt_stat->cs_mutex); + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + mutex_unlock(&crypt_stat->cs_mutex); + rc = 0; + goto out; + } + mutex_lock(&crypt_stat->cs_mutex); + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) + || !(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + rc = ecryptfs_read_metadata(ecryptfs_dentry); + if (rc) { + ecryptfs_printk(KERN_DEBUG, + "Valid headers not found\n"); + if (!(mount_crypt_stat->flags + & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { + rc = -EIO; + printk(KERN_WARNING "Either the lower file " + "is not in a valid eCryptfs format, " + "or the key could not be retrieved. 
" + "Plaintext passthrough mode is not " + "enabled; returning -EIO\n"); + mutex_unlock(&crypt_stat->cs_mutex); + goto out_put; + } + rc = 0; + crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED + | ECRYPTFS_ENCRYPTED); + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + } + mutex_unlock(&crypt_stat->cs_mutex); + ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " + "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, + (unsigned long long)i_size_read(inode)); + goto out; +out_put: + ecryptfs_put_lower_file(inode); +out_free: + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); +out: + return rc; +} + +static int ecryptfs_flush(struct file *file, fl_owner_t td) +{ + return file->f_mode & FMODE_WRITE + ? filemap_write_and_wait(file->f_mapping) : 0; +} + +static int ecryptfs_release(struct inode *inode, struct file *file) +{ + ecryptfs_put_lower_file(inode); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static int +ecryptfs_fsync(struct file *file, int datasync) +{ + int rc = 0; + + rc = generic_file_fsync(file, datasync); + if (rc) + goto out; + rc = vfs_fsync(ecryptfs_file_to_lower(file), datasync); +out: + return rc; +} + +static int ecryptfs_fasync(int fd, struct file *file, int flag) +{ + int rc = 0; + struct file *lower_file = NULL; + + lower_file = ecryptfs_file_to_lower(file); + if (lower_file->f_op && lower_file->f_op->fasync) + rc = lower_file->f_op->fasync(fd, lower_file, flag); + return rc; +} + +static long +ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOTTY; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl) + rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + return rc; +} + +#ifdef CONFIG_COMPAT +static long +ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOIOCTLCMD; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl) + rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + return rc; +} +#endif + +const struct file_operations ecryptfs_dir_fops = { + .readdir = ecryptfs_readdir, + .read = generic_read_dir, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif + .open = ecryptfs_open, + .flush = ecryptfs_flush, + .release = ecryptfs_release, + .fsync = ecryptfs_fsync, + .fasync = ecryptfs_fasync, + .splice_read = generic_file_splice_read, + .llseek = default_llseek, +}; + +const struct file_operations ecryptfs_main_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = ecryptfs_read_update_atime, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .readdir = ecryptfs_readdir, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif + .mmap = ecryptfs_file_mmap, + .open = ecryptfs_open, + .flush = ecryptfs_flush, + .release = ecryptfs_release, + .fsync = ecryptfs_fsync, + .fasync = ecryptfs_fasync, + .splice_read = generic_file_splice_read, +}; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c new file mode 100644 index 00000000..27173293 --- /dev/null +++ b/fs/ecryptfs/inode.c @@ -0,0 +1,1228 @@ +/** + * eCryptfs: Linux 
filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompsion + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *dir; + + dir = dget_parent(dentry); + mutex_lock_nested(&(dir->d_inode->i_mutex), I_MUTEX_PARENT); + return dir; +} + +static void unlock_dir(struct dentry *dir) +{ + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); +} + +static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) +{ + if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode) + return 1; + return 0; +} + +static int ecryptfs_inode_set(struct inode *inode, void *opaque) +{ + struct inode *lower_inode = opaque; + + ecryptfs_set_inode_lower(inode, lower_inode); + fsstack_copy_attr_all(inode, lower_inode); + /* i_size will be overwritten for encrypted regular files */ + fsstack_copy_inode_size(inode, lower_inode); + inode->i_ino = lower_inode->i_ino; + inode->i_version++; + inode->i_mapping->a_ops = &ecryptfs_aops; + inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; + + if (S_ISLNK(inode->i_mode)) + inode->i_op = &ecryptfs_symlink_iops; + else if (S_ISDIR(inode->i_mode)) + inode->i_op = &ecryptfs_dir_iops; + else + inode->i_op = &ecryptfs_main_iops; + + if (S_ISDIR(inode->i_mode)) + inode->i_fop = &ecryptfs_dir_fops; + else if (special_file(inode->i_mode)) + init_special_inode(inode, inode->i_mode, inode->i_rdev); + else + inode->i_fop = &ecryptfs_main_fops; + + return 0; +} + +static struct inode *__ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) +{ + struct inode *inode; + + if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) + return ERR_PTR(-EXDEV); + if (!igrab(lower_inode)) + return ERR_PTR(-ESTALE); + inode = iget5_locked(sb, (unsigned long)lower_inode, + ecryptfs_inode_test, ecryptfs_inode_set, + lower_inode); + if (!inode) { + iput(lower_inode); + return ERR_PTR(-EACCES); + } + if (!(inode->i_state & I_NEW)) + iput(lower_inode); + + return inode; +} + +struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) +{ + struct inode *inode = __ecryptfs_get_inode(lower_inode, sb); + + if (!IS_ERR(inode) && (inode->i_state & I_NEW)) + unlock_new_inode(inode); + + return inode; +} + +/** + * ecryptfs_interpose + * @lower_dentry: Existing dentry in the lower filesystem + * @dentry: ecryptfs' dentry + * @sb: ecryptfs's super_block + * + * Interposes upper and lower dentries. 
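+ * "Interposing" means binding an eCryptfs inode to the eCryptfs
+ * dentry: the inode is obtained (or created) from the lower inode
+ * via ecryptfs_get_inode() and attached with d_instantiate(), so
+ * that VFS operations on the upper dentry are translated into
+ * operations on the lower dentry and inode.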
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_interpose(struct dentry *lower_dentry, + struct dentry *dentry, struct super_block *sb) +{ + struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + + return 0; +} + +/** + * ecryptfs_create_underlying_file + * @lower_dir_inode: inode of the parent in the lower fs of the new file + * @dentry: New file's dentry + * @mode: The mode of the new file + * @nd: nameidata of ecryptfs' parent's dentry & vfsmount + * + * Creates the file in the lower file system. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_create_underlying_file(struct inode *lower_dir_inode, + struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + struct dentry *dentry_save; + struct vfsmount *vfsmount_save; + unsigned int flags_save; + int rc; + + if (nd) { + dentry_save = nd->path.dentry; + vfsmount_save = nd->path.mnt; + flags_save = nd->flags; + nd->path.dentry = lower_dentry; + nd->path.mnt = lower_mnt; + nd->flags &= ~LOOKUP_OPEN; + } + rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); + if (nd) { + nd->path.dentry = dentry_save; + nd->path.mnt = vfsmount_save; + nd->flags = flags_save; + } + return rc; +} + +/** + * ecryptfs_do_create + * @directory_inode: inode of the new file's dentry's parent in ecryptfs + * @ecryptfs_dentry: New file's dentry in ecryptfs + * @mode: The mode of the new file + * @nd: nameidata of ecryptfs' parent's dentry & vfsmount + * + * Creates the underlying file and the eCryptfs inode which will link to + * it. It will also update the eCryptfs directory inode to mimic the + * stat of the lower directory inode. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_do_create(struct inode *directory_inode, + struct dentry *ecryptfs_dentry, int mode, + struct nameidata *nd) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + if (IS_ERR(lower_dir_dentry)) { + ecryptfs_printk(KERN_ERR, "Error locking directory of " + "dentry\n"); + rc = PTR_ERR(lower_dir_dentry); + goto out; + } + rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, + ecryptfs_dentry, mode, nd); + if (rc) { + printk(KERN_ERR "%s: Failure to create dentry in lower fs; " + "rc = [%d]\n", __func__, rc); + goto out_lock; + } + rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, + directory_inode->i_sb); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); + goto out_lock; + } + fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); +out_lock: + unlock_dir(lower_dir_dentry); +out: + return rc; +} + +/** + * ecryptfs_initialize_file + * + * Cause the file to be changed from a basic empty file to an ecryptfs + * file with a header and first data page. 
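+ * Concretely: a new crypto context is generated for the inode, the
+ * lower file is grabbed, and the eCryptfs metadata header (crypto
+ * parameters plus the encrypted file encryption key) is written out
+ * via ecryptfs_write_metadata(). Directories are skipped and simply
+ * marked as not encrypted.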
+ * + * Returns zero on success + */ +static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + int rc = 0; + + if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); + rc = ecryptfs_new_file_context(ecryptfs_dentry); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error creating new file " + "context; rc = [%d]\n", rc); + goto out; + } + rc = ecryptfs_get_lower_file(ecryptfs_dentry, + ecryptfs_dentry->d_inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; + } + rc = ecryptfs_write_metadata(ecryptfs_dentry); + if (rc) + printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); + ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); +out: + return rc; +} + +/** + * ecryptfs_create + * @dir: The inode of the directory in which to create the file. + * @dentry: The eCryptfs dentry + * @mode: The mode of the new file. + * @nd: nameidata + * + * Creates a new file. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, + int mode, struct nameidata *nd) +{ + int rc; + + /* ecryptfs_do_create() calls ecryptfs_interpose() */ + rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); + if (unlikely(rc)) { + ecryptfs_printk(KERN_WARNING, "Failed to create file in" + "lower filesystem\n"); + goto out; + } + /* At this point, a file exists on "disk"; we need to make sure + * that this on disk file is prepared to be an ecryptfs file */ + rc = ecryptfs_initialize_file(ecryptfs_dentry); +out: + return rc; +} + +static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) +{ + struct ecryptfs_crypt_stat *crypt_stat; + int rc; + + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + dentry->d_name.name, rc); + return rc; + } + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); + + rc = ecryptfs_read_and_validate_header_region(inode); + ecryptfs_put_lower_file(inode); + if (rc) { + rc = ecryptfs_read_and_validate_xattr_region(dentry, inode); + if (!rc) + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } + + /* Must return 0 to allow non-eCryptfs files to be looked up, too */ + return 0; +} + +/** + * ecryptfs_lookup_interpose - Dentry interposition for a lookup + */ +static int ecryptfs_lookup_interpose(struct dentry *dentry, + struct dentry *lower_dentry, + struct inode *dir_inode) +{ + struct inode *inode, *lower_inode = lower_dentry->d_inode; + struct ecryptfs_dentry_info *dentry_info; + struct vfsmount *lower_mnt; + int rc = 0; + + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); + fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); + BUG_ON(!lower_dentry->d_count); + + dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + ecryptfs_set_dentry_private(dentry, dentry_info); + if (!dentry_info) { + 
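+ /*
+  * Allocation failed: log it, drop the dentry and mount references
+  * taken above, unhash the eCryptfs dentry, and bail out with
+  * -ENOMEM.
+  */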
printk(KERN_ERR "%s: Out of memory whilst attempting " + "to allocate ecryptfs_dentry_info struct\n", + __func__); + dput(lower_dentry); + mntput(lower_mnt); + d_drop(dentry); + return -ENOMEM; + } + ecryptfs_set_dentry_lower(dentry, lower_dentry); + ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); + + if (!lower_dentry->d_inode) { + /* We want to add because we couldn't find in lower */ + d_add(dentry, NULL); + return 0; + } + inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb); + if (IS_ERR(inode)) { + printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n", + __func__, PTR_ERR(inode)); + return PTR_ERR(inode); + } + if (S_ISREG(inode->i_mode)) { + rc = ecryptfs_i_size_read(dentry, inode); + if (rc) { + make_bad_inode(inode); + return rc; + } + } + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + d_add(dentry, inode); + + return rc; +} + +/** + * ecryptfs_lookup + * @ecryptfs_dir_inode: The eCryptfs directory inode + * @ecryptfs_dentry: The eCryptfs dentry that we are looking up + * @ecryptfs_nd: nameidata; may be NULL + * + * Find a file on disk. If the file does not exist, then we'll add it to the + * dentry cache and continue on to read it from the disk. + */ +static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, + struct dentry *ecryptfs_dentry, + struct nameidata *ecryptfs_nd) +{ + char *encrypted_and_encoded_name = NULL; + size_t encrypted_and_encoded_name_size; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + struct dentry *lower_dir_dentry, *lower_dentry; + int rc = 0; + + if ((ecryptfs_dentry->d_name.len == 1 + && !strcmp(ecryptfs_dentry->d_name.name, ".")) + || (ecryptfs_dentry->d_name.len == 2 + && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { + goto out_d_drop; + } + lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, + lower_dir_dentry, + ecryptfs_dentry->d_name.len); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); + goto out_d_drop; + } + if (lower_dentry->d_inode) + goto interpose; + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(mount_crypt_stat + && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + goto interpose; + dput(lower_dentry); + rc = ecryptfs_encrypt_and_encode_filename( + &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, + NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, + ecryptfs_dentry->d_name.len); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt and encode " + "filename; rc = [%d]\n", __func__, rc); + goto out_d_drop; + } + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + lower_dentry = lookup_one_len(encrypted_and_encoded_name, + lower_dir_dentry, + encrypted_and_encoded_name_size); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); + goto out_d_drop; + } +interpose: + rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, + ecryptfs_dir_inode); + goto out; +out_d_drop: + d_drop(ecryptfs_dentry); +out: + kfree(encrypted_and_encoded_name); + return 
ERR_PTR(rc); +} + +static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct dentry *lower_old_dentry; + struct dentry *lower_new_dentry; + struct dentry *lower_dir_dentry; + u64 file_size_save; + int rc; + + file_size_save = i_size_read(old_dentry->d_inode); + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); + lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); + dget(lower_old_dentry); + dget(lower_new_dentry); + lower_dir_dentry = lock_parent(lower_new_dentry); + rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, + lower_new_dentry); + if (rc || !lower_new_dentry->d_inode) + goto out_lock; + rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); + if (rc) + goto out_lock; + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + old_dentry->d_inode->i_nlink = + ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; + i_size_write(new_dentry->d_inode, file_size_save); +out_lock: + unlock_dir(lower_dir_dentry); + dput(lower_new_dentry); + dput(lower_old_dentry); + return rc; +} + +static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int rc = 0; + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); + struct dentry *lower_dir_dentry; + + dget(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_unlink(lower_dir_inode, lower_dentry); + if (rc) { + printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); + goto out_unlock; + } + fsstack_copy_attr_times(dir, lower_dir_inode); + dentry->d_inode->i_nlink = + ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; + dentry->d_inode->i_ctime = dir->i_ctime; + d_drop(dentry); +out_unlock: + unlock_dir(lower_dir_dentry); + dput(lower_dentry); + return rc; +} + +static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + char *encoded_symname; + size_t encoded_symlen; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + dget(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + mount_crypt_stat = &ecryptfs_superblock_to_private( + dir->i_sb)->mount_crypt_stat; + rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, + &encoded_symlen, + NULL, + mount_crypt_stat, symname, + strlen(symname)); + if (rc) + goto out_lock; + rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, + encoded_symname); + kfree(encoded_symname); + if (rc || !lower_dentry->d_inode) + goto out_lock; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); + if (rc) + goto out_lock; + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); +out_lock: + unlock_dir(lower_dir_dentry); + dput(lower_dentry); + if (!dentry->d_inode) + d_drop(dentry); + return rc; +} + +static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); + if (rc || !lower_dentry->d_inode) + goto out; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); + if (rc) + goto out; + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + 
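+ /*
+  * Likewise keep the eCryptfs directory's size and link count in
+  * sync with the lower directory (its timestamps were copied just
+  * above), since vfs_mkdir() has changed all three.
+  */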
fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; +out: + unlock_dir(lower_dir_dentry); + if (!dentry->d_inode) + d_drop(dentry); + return rc; +} + +static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + int rc; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + dget(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + dget(lower_dentry); + rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); + dput(lower_dentry); + if (!rc && dentry->d_inode) + clear_nlink(dentry->d_inode); + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + unlock_dir(lower_dir_dentry); + if (!rc) + d_drop(dentry); + dput(dentry); + return rc; +} + +static int +ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); + if (rc || !lower_dentry->d_inode) + goto out; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); + if (rc) + goto out; + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); +out: + unlock_dir(lower_dir_dentry); + if (!dentry->d_inode) + d_drop(dentry); + return rc; +} + +static int +ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int rc; + struct dentry *lower_old_dentry; + struct dentry *lower_new_dentry; + struct dentry *lower_old_dir_dentry; + struct dentry *lower_new_dir_dentry; + struct dentry *trap = NULL; + + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); + lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); + dget(lower_old_dentry); + dget(lower_new_dentry); + lower_old_dir_dentry = dget_parent(lower_old_dentry); + lower_new_dir_dentry = dget_parent(lower_new_dentry); + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + /* source should not be ancestor of target */ + if (trap == lower_old_dentry) { + rc = -EINVAL; + goto out_lock; + } + /* target should not be ancestor of source */ + if (trap == lower_new_dentry) { + rc = -ENOTEMPTY; + goto out_lock; + } + rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, + lower_new_dir_dentry->d_inode, lower_new_dentry); + if (rc) + goto out_lock; + fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); + if (new_dir != old_dir) + fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); +out_lock: + unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + dput(lower_new_dir_dentry); + dput(lower_old_dir_dentry); + dput(lower_new_dentry); + dput(lower_old_dentry); + return rc; +} + +static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, + size_t *bufsiz) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + char *lower_buf; + size_t lower_bufsiz = PATH_MAX; + mm_segment_t old_fs; + int rc; + + lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); + if (!lower_buf) { + rc = -ENOMEM; + goto out; + } + old_fs = get_fs(); + set_fs(get_ds()); + rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, + (char __user *)lower_buf, + lower_bufsiz); + set_fs(old_fs); + if (rc < 0) + goto out; + lower_bufsiz = rc; + rc = ecryptfs_decode_and_decrypt_filename(buf, 
bufsiz, dentry, + lower_buf, lower_bufsiz); +out: + kfree(lower_buf); + return rc; +} + +static int +ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + char *kbuf; + size_t kbufsiz, copied; + int rc; + + rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz); + if (rc) + goto out; + copied = min_t(size_t, bufsiz, kbufsiz); + rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied; + kfree(kbuf); + fsstack_copy_attr_atime(dentry->d_inode, + ecryptfs_dentry_to_lower(dentry)->d_inode); +out: + return rc; +} + +static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *buf; + int len = PAGE_SIZE, rc; + mm_segment_t old_fs; + + /* Released in ecryptfs_put_link(); only release here on error */ + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + buf = ERR_PTR(-ENOMEM); + goto out; + } + old_fs = get_fs(); + set_fs(get_ds()); + rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); + set_fs(old_fs); + if (rc < 0) { + kfree(buf); + buf = ERR_PTR(rc); + } else + buf[rc] = '\0'; +out: + nd_set_link(nd, buf); + return NULL; +} + +static void +ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) +{ + char *buf = nd_get_link(nd); + if (!IS_ERR(buf)) { + /* Free the char* */ + kfree(buf); + } +} + +/** + * upper_size_to_lower_size + * @crypt_stat: Crypt_stat associated with file + * @upper_size: Size of the upper file + * + * Calculate the required size of the lower file based on the + * specified size of the upper file. This calculation is based on the + * number of headers in the underlying file and the extent size. + * + * Returns Calculated size of the lower file. + */ +static loff_t +upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat, + loff_t upper_size) +{ + loff_t lower_size; + + lower_size = ecryptfs_lower_header_size(crypt_stat); + if (upper_size != 0) { + loff_t num_extents; + + num_extents = upper_size >> crypt_stat->extent_shift; + if (upper_size & ~crypt_stat->extent_mask) + num_extents++; + lower_size += (num_extents * crypt_stat->extent_size); + } + return lower_size; +} + +/** + * truncate_upper + * @dentry: The ecryptfs layer dentry + * @ia: Address of the ecryptfs inode's attributes + * @lower_ia: Address of the lower inode's attributes + * + * Function to handle truncations modifying the size of the file. Note + * that the file sizes are interpolated. When expanding, we are simply + * writing strings of 0's out. When truncating, we truncate the upper + * inode and update the lower_ia according to the page index + * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return, + * the caller must use lower_ia in a call to notify_change() to perform + * the truncation of the lower inode. 
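+ * In short: when growing, a single zero byte is written at the new
+ * last position, which makes the write path fill in everything
+ * between the old and new end of file; when shrinking, the tail of
+ * the last remaining page is zeroed, the upper i_size is cut with
+ * truncate_setsize(), the new size is written into the metadata, and
+ * lower_ia gets ATTR_SIZE only if the lower file itself must shrink.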
+ * + * Returns zero on success; non-zero otherwise + */ +static int truncate_upper(struct dentry *dentry, struct iattr *ia, + struct iattr *lower_ia) +{ + int rc = 0; + struct inode *inode = dentry->d_inode; + struct ecryptfs_crypt_stat *crypt_stat; + loff_t i_size = i_size_read(inode); + loff_t lower_size_before_truncate; + loff_t lower_size_after_truncate; + + if (unlikely((ia->ia_size == i_size))) { + lower_ia->ia_valid &= ~ATTR_SIZE; + return 0; + } + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) + return rc; + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + /* Switch on growing or shrinking file */ + if (ia->ia_size > i_size) { + char zero[] = { 0x00 }; + + lower_ia->ia_valid &= ~ATTR_SIZE; + /* Write a single 0 at the last position of the file; + * this triggers code that will fill in 0's throughout + * the intermediate portion of the previous end of the + * file and the new and of the file */ + rc = ecryptfs_write(inode, zero, + (ia->ia_size - 1), 1); + } else { /* ia->ia_size < i_size_read(inode) */ + /* We're chopping off all the pages down to the page + * in which ia->ia_size is located. Fill in the end of + * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to + * PAGE_CACHE_SIZE with zeros. */ + size_t num_zeros = (PAGE_CACHE_SIZE + - (ia->ia_size & ~PAGE_CACHE_MASK)); + + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + truncate_setsize(inode, ia->ia_size); + lower_ia->ia_size = ia->ia_size; + lower_ia->ia_valid |= ATTR_SIZE; + goto out; + } + if (num_zeros) { + char *zeros_virt; + + zeros_virt = kzalloc(num_zeros, GFP_KERNEL); + if (!zeros_virt) { + rc = -ENOMEM; + goto out; + } + rc = ecryptfs_write(inode, zeros_virt, + ia->ia_size, num_zeros); + kfree(zeros_virt); + if (rc) { + printk(KERN_ERR "Error attempting to zero out " + "the remainder of the end page on " + "reducing truncate; rc = [%d]\n", rc); + goto out; + } + } + truncate_setsize(inode, ia->ia_size); + rc = ecryptfs_write_inode_size_to_metadata(inode); + if (rc) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc); + goto out; + } + /* We are reducing the size of the ecryptfs file, and need to + * know if we need to reduce the size of the lower file. */ + lower_size_before_truncate = + upper_size_to_lower_size(crypt_stat, i_size); + lower_size_after_truncate = + upper_size_to_lower_size(crypt_stat, ia->ia_size); + if (lower_size_after_truncate < lower_size_before_truncate) { + lower_ia->ia_size = lower_size_after_truncate; + lower_ia->ia_valid |= ATTR_SIZE; + } else + lower_ia->ia_valid &= ~ATTR_SIZE; + } +out: + ecryptfs_put_lower_file(inode); + return rc; +} + +static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset) +{ + struct ecryptfs_crypt_stat *crypt_stat; + loff_t lower_oldsize, lower_newsize; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + lower_oldsize = upper_size_to_lower_size(crypt_stat, + i_size_read(inode)); + lower_newsize = upper_size_to_lower_size(crypt_stat, offset); + if (lower_newsize > lower_oldsize) { + /* + * The eCryptfs inode and the new *lower* size are mixed here + * because we may not have the lower i_mutex held and/or it may + * not be appropriate to call inode_newsize_ok() with inodes + * from other filesystems. 
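+ * The check is still meaningful: inode_newsize_ok() enforces the
+ * caller's RLIMIT_FSIZE and the superblock's s_maxbytes, and the
+ * lower size (upper size plus header and extent rounding) is at
+ * least as large as the upper size, so passing it is the
+ * conservative test.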
+ */ + return inode_newsize_ok(inode, lower_newsize); + } + + return 0; +} + +/** + * ecryptfs_truncate + * @dentry: The ecryptfs layer dentry + * @new_length: The length to expand the file to + * + * Simple function that handles the truncation of an eCryptfs inode and + * its corresponding lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +{ + struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length }; + struct iattr lower_ia = { .ia_valid = 0 }; + int rc; + + rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length); + if (rc) + return rc; + + rc = truncate_upper(dentry, &ia, &lower_ia); + if (!rc && lower_ia.ia_valid & ATTR_SIZE) { + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = notify_change(lower_dentry, &lower_ia); + mutex_unlock(&lower_dentry->d_inode->i_mutex); + } + return rc; +} + +static int +ecryptfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + return inode_permission(ecryptfs_inode_to_lower(inode), mask); +} + +/** + * ecryptfs_setattr + * @dentry: dentry handle to the inode to modify + * @ia: Structure with flags of what to change and values + * + * Updates the metadata of an inode. If the update is to the size + * i.e. truncation, then ecryptfs_truncate will handle the size modification + * of both the ecryptfs inode and the lower inode. + * + * All other metadata changes will be passed right to the lower filesystem, + * and we will just update our inode to look like the lower. + */ +static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) +{ + int rc = 0; + struct dentry *lower_dentry; + struct iattr lower_ia; + struct inode *inode; + struct inode *lower_inode; + struct ecryptfs_crypt_stat *crypt_stat; + + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) + ecryptfs_init_crypt_stat(crypt_stat); + inode = dentry->d_inode; + lower_inode = ecryptfs_inode_to_lower(inode); + lower_dentry = ecryptfs_dentry_to_lower(dentry); + mutex_lock(&crypt_stat->cs_mutex); + if (S_ISDIR(dentry->d_inode->i_mode)) + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + else if (S_ISREG(dentry->d_inode->i_mode) + && (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) + || !(crypt_stat->flags & ECRYPTFS_KEY_VALID))) { + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) { + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + rc = ecryptfs_read_metadata(dentry); + ecryptfs_put_lower_file(inode); + if (rc) { + if (!(mount_crypt_stat->flags + & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { + rc = -EIO; + printk(KERN_WARNING "Either the lower file " + "is not in a valid eCryptfs format, " + "or the key could not be retrieved. 
" + "Plaintext passthrough mode is not " + "enabled; returning -EIO\n"); + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + rc = 0; + crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED + | ECRYPTFS_ENCRYPTED); + } + } + mutex_unlock(&crypt_stat->cs_mutex); + + rc = inode_change_ok(inode, ia); + if (rc) + goto out; + if (ia->ia_valid & ATTR_SIZE) { + rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size); + if (rc) + goto out; + } + + if (S_ISREG(inode->i_mode)) { + rc = filemap_write_and_wait(inode->i_mapping); + if (rc) + goto out; + fsstack_copy_attr_all(inode, lower_inode); + } + memcpy(&lower_ia, ia, sizeof(lower_ia)); + if (ia->ia_valid & ATTR_FILE) + lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); + if (ia->ia_valid & ATTR_SIZE) { + rc = truncate_upper(dentry, ia, &lower_ia); + if (rc < 0) + goto out; + } + + /* + * mode change is for clearing setuid/setgid bits. Allow lower fs + * to interpret this in its own way. + */ + if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + lower_ia.ia_valid &= ~ATTR_MODE; + + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = notify_change(lower_dentry, &lower_ia); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + fsstack_copy_attr_all(inode, lower_inode); + return rc; +} + +int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + int rc = 0; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + generic_fillattr(dentry->d_inode, stat); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + char *target; + size_t targetsiz; + + rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); + if (!rc) { + kfree(target); + stat->size = targetsiz; + } + } + return rc; +} + +int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kstat lower_stat; + int rc; + + rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), + ecryptfs_dentry_to_lower(dentry), &lower_stat); + if (!rc) { + fsstack_copy_attr_all(dentry->d_inode, + ecryptfs_inode_to_lower(dentry->d_inode)); + generic_fillattr(dentry->d_inode, stat); + stat->blocks = lower_stat.blocks; + } + return rc; +} + +int +ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->setxattr) { + rc = -EOPNOTSUPP; + goto out; + } + + rc = vfs_setxattr(lower_dentry, name, value, size, flags); + if (!rc) + fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); +out: + return rc; +} + +ssize_t +ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, + void *value, size_t size) +{ + int rc = 0; + + if (!lower_dentry->d_inode->i_op->getxattr) { + rc = -EOPNOTSUPP; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value, + size); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +static ssize_t +ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + return ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), name, + value, size); +} + +static ssize_t +ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->listxattr) { + 
rc = -EOPNOTSUPP; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +static int ecryptfs_removexattr(struct dentry *dentry, const char *name) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->removexattr) { + rc = -EOPNOTSUPP; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +const struct inode_operations ecryptfs_symlink_iops = { + .readlink = ecryptfs_readlink, + .follow_link = ecryptfs_follow_link, + .put_link = ecryptfs_put_link, + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr_link, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; + +const struct inode_operations ecryptfs_dir_iops = { + .create = ecryptfs_create, + .lookup = ecryptfs_lookup, + .link = ecryptfs_link, + .unlink = ecryptfs_unlink, + .symlink = ecryptfs_symlink, + .mkdir = ecryptfs_mkdir, + .rmdir = ecryptfs_rmdir, + .mknod = ecryptfs_mknod, + .rename = ecryptfs_rename, + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; + +const struct inode_operations ecryptfs_main_iops = { + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c new file mode 100644 index 00000000..89dc18e7 --- /dev/null +++ b/fs/ecryptfs/keystore.c @@ -0,0 +1,2530 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * In-kernel key management code. Includes functions to parse and + * write authentication token-related packets with the underlying + * file. + * + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * Trevor S. Highland + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * request_key returned an error instead of a valid key address; + * determine the type of error, make appropriate log entries, and + * return an error code. 
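+ * The mapping used below is: -ENOKEY becomes -ENOENT, -EKEYEXPIRED
+ * becomes -ETIME, and -EKEYREVOKED or anything unrecognized becomes
+ * -EINVAL.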
+ */ +static int process_request_key_err(long err_code) +{ + int rc = 0; + + switch (err_code) { + case -ENOKEY: + ecryptfs_printk(KERN_WARNING, "No key\n"); + rc = -ENOENT; + break; + case -EKEYEXPIRED: + ecryptfs_printk(KERN_WARNING, "Key expired\n"); + rc = -ETIME; + break; + case -EKEYREVOKED: + ecryptfs_printk(KERN_WARNING, "Key revoked\n"); + rc = -EINVAL; + break; + default: + ecryptfs_printk(KERN_WARNING, "Unknown error code: " + "[0x%.16lx]\n", err_code); + rc = -EINVAL; + } + return rc; +} + +static int process_find_global_auth_tok_for_sig_err(int err_code) +{ + int rc = err_code; + + switch (err_code) { + case -ENOENT: + ecryptfs_printk(KERN_WARNING, "Missing auth tok\n"); + break; + case -EINVAL: + ecryptfs_printk(KERN_WARNING, "Invalid auth tok\n"); + break; + default: + rc = process_request_key_err(err_code); + break; + } + return rc; +} + +/** + * ecryptfs_parse_packet_length + * @data: Pointer to memory containing length at offset + * @size: This function writes the decoded size to this memory + * address; zero on error + * @length_size: The number of bytes occupied by the encoded length + * + * Returns zero on success; non-zero on error + */ +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size) +{ + int rc = 0; + + (*length_size) = 0; + (*size) = 0; + if (data[0] < 192) { + /* One-byte length */ + (*size) = (unsigned char)data[0]; + (*length_size) = 1; + } else if (data[0] < 224) { + /* Two-byte length */ + (*size) = (((unsigned char)(data[0]) - 192) * 256); + (*size) += ((unsigned char)(data[1]) + 192); + (*length_size) = 2; + } else if (data[0] == 255) { + /* Five-byte length; we're not supposed to see this */ + ecryptfs_printk(KERN_ERR, "Five-byte packet length not " + "supported\n"); + rc = -EINVAL; + goto out; + } else { + ecryptfs_printk(KERN_ERR, "Error parsing packet length\n"); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_write_packet_length + * @dest: The byte array target into which to write the length. Must + * have at least 5 bytes allocated. + * @size: The length to write. + * @packet_size_length: The number of bytes used to encode the packet + * length is written to this address. + * + * Returns zero on success; non-zero on error. 
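+ * The encoding mirrors ecryptfs_parse_packet_length() above: a size
+ * below 192 occupies one byte; a size from 192 to 65535 occupies two
+ * bytes, dest[0] = ((size - 192) / 256) + 192 and
+ * dest[1] = (size - 192) % 256. For example, a size of 1000 is
+ * written as the bytes 195 and 40, and the parser recovers it as
+ * (195 - 192) * 256 + (40 + 192) = 1000.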
+ */ +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length) +{ + int rc = 0; + + if (size < 192) { + dest[0] = size; + (*packet_size_length) = 1; + } else if (size < 65536) { + dest[0] = (((size - 192) / 256) + 192); + dest[1] = ((size - 192) % 256); + (*packet_size_length) = 2; + } else { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, + "Unsupported packet size: [%zd]\n", size); + } + return rc; +} + +static int +write_tag_64_packet(char *signature, struct ecryptfs_session_key *session_key, + char **packet, size_t *packet_len) +{ + size_t i = 0; + size_t data_len; + size_t packet_size_len; + char *message; + int rc; + + /* + * ***** TAG 64 Packet Format ***** + * | Content Type | 1 byte | + * | Key Identifier Size | 1 or 2 bytes | + * | Key Identifier | arbitrary | + * | Encrypted File Encryption Key Size | 1 or 2 bytes | + * | Encrypted File Encryption Key | arbitrary | + */ + data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + + session_key->encrypted_key_size); + *packet = kmalloc(data_len, GFP_KERNEL); + message = *packet; + if (!message) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + message[i++] = ECRYPTFS_TAG_64_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); + i += ECRYPTFS_SIG_SIZE_HEX; + rc = ecryptfs_write_packet_length(&message[i], + session_key->encrypted_key_size, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], session_key->encrypted_key, + session_key->encrypted_key_size); + i += session_key->encrypted_key_size; + *packet_len = i; +out: + return rc; +} + +static int +parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code, + struct ecryptfs_message *msg) +{ + size_t i = 0; + char *data; + size_t data_len; + size_t m_size; + size_t message_len; + u16 checksum = 0; + u16 expected_checksum = 0; + int rc; + + /* + * ***** TAG 65 Packet Format ***** + * | Content Type | 1 byte | + * | Status Indicator | 1 byte | + * | File Encryption Key Size | 1 or 2 bytes | + * | File Encryption Key | arbitrary | + */ + message_len = msg->data_len; + data = msg->data; + if (message_len < 4) { + rc = -EIO; + goto out; + } + if (data[i++] != ECRYPTFS_TAG_65_PACKET_TYPE) { + ecryptfs_printk(KERN_ERR, "Type should be ECRYPTFS_TAG_65\n"); + rc = -EIO; + goto out; + } + if (data[i++]) { + ecryptfs_printk(KERN_ERR, "Status indicator has non-zero value " + "[%d]\n", data[i-1]); + rc = -EIO; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[i], &m_size, &data_len); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out; + } + i += data_len; + if (message_len < (i + m_size)) { + ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd " + "is shorter than expected\n"); + rc = -EIO; + goto out; + } + if (m_size < 3) { + ecryptfs_printk(KERN_ERR, + "The decrypted key is not long enough to " + "include a cipher code and checksum\n"); + rc = -EIO; + goto out; + } + *cipher_code = data[i++]; + /* The decrypted key includes 1 byte cipher code and 2 byte checksum */ + session_key->decrypted_key_size = m_size - 3; + if 
(session_key->decrypted_key_size > ECRYPTFS_MAX_KEY_BYTES) { + ecryptfs_printk(KERN_ERR, "key_size [%d] larger than " + "the maximum key size [%d]\n", + session_key->decrypted_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); + rc = -EIO; + goto out; + } + memcpy(session_key->decrypted_key, &data[i], + session_key->decrypted_key_size); + i += session_key->decrypted_key_size; + expected_checksum += (unsigned char)(data[i++]) << 8; + expected_checksum += (unsigned char)(data[i++]); + for (i = 0; i < session_key->decrypted_key_size; i++) + checksum += session_key->decrypted_key[i]; + if (expected_checksum != checksum) { + ecryptfs_printk(KERN_ERR, "Invalid checksum for file " + "encryption key; expected [%x]; calculated " + "[%x]\n", expected_checksum, checksum); + rc = -EIO; + } +out: + return rc; +} + + +static int +write_tag_66_packet(char *signature, u8 cipher_code, + struct ecryptfs_crypt_stat *crypt_stat, char **packet, + size_t *packet_len) +{ + size_t i = 0; + size_t j; + size_t data_len; + size_t checksum = 0; + size_t packet_size_len; + char *message; + int rc; + + /* + * ***** TAG 66 Packet Format ***** + * | Content Type | 1 byte | + * | Key Identifier Size | 1 or 2 bytes | + * | Key Identifier | arbitrary | + * | File Encryption Key Size | 1 or 2 bytes | + * | File Encryption Key | arbitrary | + */ + data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + crypt_stat->key_size); + *packet = kmalloc(data_len, GFP_KERNEL); + message = *packet; + if (!message) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + message[i++] = ECRYPTFS_TAG_66_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); + i += ECRYPTFS_SIG_SIZE_HEX; + /* The encrypted key includes 1 byte cipher code and 2 byte checksum */ + rc = ecryptfs_write_packet_length(&message[i], crypt_stat->key_size + 3, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + message[i++] = cipher_code; + memcpy(&message[i], crypt_stat->key, crypt_stat->key_size); + i += crypt_stat->key_size; + for (j = 0; j < crypt_stat->key_size; j++) + checksum += crypt_stat->key[j]; + message[i++] = (checksum / 256) % 256; + message[i++] = (checksum % 256); + *packet_len = i; +out: + return rc; +} + +static int +parse_tag_67_packet(struct ecryptfs_key_record *key_rec, + struct ecryptfs_message *msg) +{ + size_t i = 0; + char *data; + size_t data_len; + size_t message_len; + int rc; + + /* + * ***** TAG 65 Packet Format ***** + * | Content Type | 1 byte | + * | Status Indicator | 1 byte | + * | Encrypted File Encryption Key Size | 1 or 2 bytes | + * | Encrypted File Encryption Key | arbitrary | + */ + message_len = msg->data_len; + data = msg->data; + /* verify that everything through the encrypted FEK size is present */ + if (message_len < 4) { + rc = -EIO; + printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable " + "message length is [%d]\n", __func__, message_len, 4); + goto out; + } + if (data[i++] != ECRYPTFS_TAG_67_PACKET_TYPE) { + rc = -EIO; + printk(KERN_ERR "%s: Type should be ECRYPTFS_TAG_67\n", + __func__); + goto out; + } + if (data[i++]) { + rc = -EIO; + printk(KERN_ERR "%s: Status indicator has non zero " + "value 
[%d]\n", __func__, data[i-1]); + + goto out; + } + rc = ecryptfs_parse_packet_length(&data[i], &key_rec->enc_key_size, + &data_len); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out; + } + i += data_len; + if (message_len < (i + key_rec->enc_key_size)) { + rc = -EIO; + printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n", + __func__, message_len, (i + key_rec->enc_key_size)); + goto out; + } + if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + rc = -EIO; + printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than " + "the maximum key size [%d]\n", __func__, + key_rec->enc_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); + goto out; + } + memcpy(key_rec->enc_key, &data[i], key_rec->enc_key_size); +out: + return rc; +} + +/** + * ecryptfs_verify_version + * @version: The version number to confirm + * + * Returns zero on good version; non-zero otherwise + */ +static int ecryptfs_verify_version(u16 version) +{ + int rc = 0; + unsigned char major; + unsigned char minor; + + major = ((version >> 8) & 0xFF); + minor = (version & 0xFF); + if (major != ECRYPTFS_VERSION_MAJOR) { + ecryptfs_printk(KERN_ERR, "Major version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MAJOR, major); + rc = -EINVAL; + goto out; + } + if (minor != ECRYPTFS_VERSION_MINOR) { + ecryptfs_printk(KERN_ERR, "Minor version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MINOR, minor); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_verify_auth_tok_from_key + * @auth_tok_key: key containing the authentication token + * @auth_tok: authentication token + * + * Returns zero on valid auth tok; -EINVAL otherwise + */ +static int +ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok **auth_tok) +{ + int rc = 0; + + (*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key); + if (ecryptfs_verify_version((*auth_tok)->version)) { + printk(KERN_ERR "Data structure version mismatch. 
Userspace " + "tools must match eCryptfs kernel module with major " + "version [%d] and minor version [%d]\n", + ECRYPTFS_VERSION_MAJOR, ECRYPTFS_VERSION_MINOR); + rc = -EINVAL; + goto out; + } + if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD + && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { + printk(KERN_ERR "Invalid auth_tok structure " + "returned from key query\n"); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +static int +ecryptfs_find_global_auth_tok_for_sig( + struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) +{ + struct ecryptfs_global_auth_tok *walker; + int rc = 0; + + (*auth_tok_key) = NULL; + (*auth_tok) = NULL; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(walker, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX)) + continue; + + if (walker->flags & ECRYPTFS_AUTH_TOK_INVALID) { + rc = -EINVAL; + goto out; + } + + rc = key_validate(walker->global_auth_tok_key); + if (rc) { + if (rc == -EKEYEXPIRED) + goto out; + goto out_invalid_auth_tok; + } + + down_write(&(walker->global_auth_tok_key->sem)); + rc = ecryptfs_verify_auth_tok_from_key( + walker->global_auth_tok_key, auth_tok); + if (rc) + goto out_invalid_auth_tok_unlock; + + (*auth_tok_key) = walker->global_auth_tok_key; + key_get(*auth_tok_key); + goto out; + } + rc = -ENOENT; + goto out; +out_invalid_auth_tok_unlock: + up_write(&(walker->global_auth_tok_key->sem)); +out_invalid_auth_tok: + printk(KERN_WARNING "Invalidating auth tok with sig = [%s]\n", sig); + walker->flags |= ECRYPTFS_AUTH_TOK_INVALID; + key_put(walker->global_auth_tok_key); + walker->global_auth_tok_key = NULL; +out: + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + return rc; +} + +/** + * ecryptfs_find_auth_tok_for_sig + * @auth_tok: Set to the matching auth_tok; NULL if not found + * @crypt_stat: inode crypt_stat crypto context + * @sig: Sig of auth_tok to find + * + * For now, this function simply looks at the registered auth_tok's + * linked off the mount_crypt_stat, so all the auth_toks that can be + * used must be registered at mount time. This function could + * potentially try a lot harder to find auth_tok's (e.g., by calling + * out to ecryptfsd to dynamically retrieve an auth_tok object) so + * that static registration of auth_tok's will no longer be necessary. + * + * Returns zero on no error; non-zero on error + */ +static int +ecryptfs_find_auth_tok_for_sig( + struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig) +{ + int rc = 0; + + rc = ecryptfs_find_global_auth_tok_for_sig(auth_tok_key, auth_tok, + mount_crypt_stat, sig); + if (rc == -ENOENT) { + /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the + * mount_crypt_stat structure, we prevent to use auth toks that + * are not inserted through the ecryptfs_add_global_auth_tok + * function. + */ + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + return -EINVAL; + + rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok, + sig); + } + return rc; +} + +/** + * write_tag_70_packet can gobble a lot of stack space. We stuff most + * of the function's parameters in a kmalloc'd struct to help reduce + * eCryptfs' overall stack usage. 
+ */ +struct ecryptfs_write_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + size_t j; + size_t num_rand_bytes; + struct mutex *tfm_mutex; + char *block_aligned_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg[2]; + struct scatterlist dst_sg[2]; + struct blkcipher_desc desc; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + struct hash_desc hash_desc; + struct scatterlist hash_sg; +}; + +/** + * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK + * @filename: NULL-terminated filename string + * + * This is the simplest mechanism for achieving filename encryption in + * eCryptfs. It encrypts the given filename with the mount-wide + * filename encryption key (FNEK) and stores it in a packet to @dest, + * which the callee will encode and write directly into the dentry + * name. + */ +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size) +{ + struct ecryptfs_write_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; + int rc = 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + (*packet_size) = 0; + rc = ecryptfs_find_auth_tok_for_sig( + &auth_tok_key, + &s->auth_tok, mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, + mount_crypt_stat->global_default_fnek_sig, rc); + goto out; + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( + &s->desc.tfm, + &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); + /* Plus one for the \0 separator between the random prefix + * and the plaintext filename */ + s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); + s->block_aligned_filename_size = (s->num_rand_bytes + filename_size); + if ((s->block_aligned_filename_size % s->block_size) != 0) { + s->num_rand_bytes += (s->block_size + - (s->block_aligned_filename_size + % s->block_size)); + s->block_aligned_filename_size = (s->num_rand_bytes + + filename_size); + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random characters, a \0 + * separator, and then the filename */ + s->max_packet_size = (1 /* Tag 70 identifier */ + + 3 /* Max Tag 70 packet size */ + + ECRYPTFS_SIG_SIZE /* FNEK sig */ + + 1 /* Cipher identifier */ + + s->block_aligned_filename_size); + if (dest == NULL) { + (*packet_size) = s->max_packet_size; + goto out_unlock; + } + if (s->max_packet_size > (*remaining_bytes)) { + 
printk(KERN_WARNING "%s: Require [%zd] bytes to write; only " + "[%zd] available\n", __func__, s->max_packet_size, + (*remaining_bytes)); + rc = -EINVAL; + goto out_unlock; + } + s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->block_aligned_filename) { + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "kzalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + s->i = 0; + dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[s->i], + (ECRYPTFS_SIG_SIZE + + 1 /* Cipher code */ + + s->block_aligned_filename_size), + &s->packet_size_len); + if (rc) { + printk(KERN_ERR "%s: Error generating tag 70 packet " + "header; cannot generate packet length; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + s->i += s->packet_size_len; + ecryptfs_from_hex(&dest[s->i], + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_SIG_SIZE); + s->i += ECRYPTFS_SIG_SIZE; + s->cipher_code = ecryptfs_code_for_cipher_string( + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (s->cipher_code == 0) { + printk(KERN_WARNING "%s: Unable to generate code for " + "cipher [%s] with key bytes [%zd]\n", __func__, + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + rc = -EINVAL; + goto out_free_unlock; + } + dest[s->i++] = s->cipher_code; + /* TODO: Support other key modules than passphrase for + * filename encryption */ + if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { + rc = -EOPNOTSUPP; + printk(KERN_INFO "%s: Filename encryption only supports " + "password tokens\n", __func__); + goto out_free_unlock; + } + sg_init_one( + &s->hash_sg, + (u8 *)s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes); + s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(s->hash_desc.tfm)) { + rc = PTR_ERR(s->hash_desc.tfm); + printk(KERN_ERR "%s: Error attempting to " + "allocate hash crypto context; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update( + &s->hash_desc, &s->hash_sg, + s->auth_tok->token.password.session_key_encryption_key_bytes); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { + s->block_aligned_filename[s->j] = + s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; + if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) + == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { + sg_init_one(&s->hash_sg, (u8 *)s->hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update(&s->hash_desc, &s->hash_sg, + ECRYPTFS_TAG_70_DIGEST_SIZE); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; " + "rc = [%d]\n", __func__, 
rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->tmp_hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + memcpy(s->hash, s->tmp_hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + } + if (s->block_aligned_filename[s->j] == '\0') + s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; + } + memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename, + filename_size); + rc = virt_to_scatterlist(s->block_aligned_filename, + s->block_aligned_filename_size, s->src_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert filename memory to scatterlist; rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_release_free_unlock; + } + rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, + s->dst_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); + goto out_release_free_unlock; + } + /* The characters in the first block effectively do the job + * of the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_release_free_unlock; + } + rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + s->i += s->block_aligned_filename_size; + (*packet_size) = s->i; + (*remaining_bytes) -= (*packet_size); +out_release_free_unlock: + crypto_free_hash(s->hash_desc.tfm); +out_free_unlock: + kzfree(s->block_aligned_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + } + kfree(s); + return rc; +} + +struct ecryptfs_parse_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t parsed_tag_70_packet_size; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + struct mutex *tfm_mutex; + char *decrypted_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg[2]; + struct scatterlist dst_sg[2]; + struct blkcipher_desc desc; + char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; +}; + +/** + * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet + * @filename: This function kmalloc's the memory for the filename + * @filename_size: This function sets this to the amount of memory + * kmalloc'd for the filename + * @packet_size: This function sets this to the the 
number of octets + * in the packet parsed + * @mount_crypt_stat: The mount-wide cryptographic context + * @data: The memory location containing the start of the tag 70 + * packet + * @max_packet_size: The maximum legal size of the packet to be parsed + * from @data + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size) +{ + struct ecryptfs_parse_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; + int rc = 0; + + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " + "at least [%d]\n", __func__, max_packet_size, + (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); + rc = -EINVAL; + goto out; + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random numbers, a \0 + * separator, and then the filename */ + if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) { + printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be " + "tag [0x%.2x]\n", __func__, + data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], + &s->parsed_tag_70_packet_size, + &s->packet_size_len); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + goto out; + } + s->block_aligned_filename_size = (s->parsed_tag_70_packet_size + - ECRYPTFS_SIG_SIZE - 1); + if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size) + > max_packet_size) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet " + "size is [%zd]\n", __func__, max_packet_size, + (1 + s->packet_size_len + 1 + + s->block_aligned_filename_size)); + rc = -EINVAL; + goto out; + } + (*packet_size) += s->packet_size_len; + ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)], + ECRYPTFS_SIG_SIZE); + s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + (*packet_size) += ECRYPTFS_SIG_SIZE; + s->cipher_code = data[(*packet_size)++]; + rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code); + if (rc) { + printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n", + __func__, s->cipher_code); + goto out; + } + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &s->auth_tok, mount_crypt_stat, + s->fnek_sig_hex); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, + rc); + goto out; + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, + &s->tfm_mutex, + s->cipher_string); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + s->cipher_string, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + rc = virt_to_scatterlist(&data[(*packet_size)], + s->block_aligned_filename_size, 
s->src_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); + goto out_unlock; + } + (*packet_size) += s->block_aligned_filename_size; + s->decrypted_filename = kmalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->decrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + rc = virt_to_scatterlist(s->decrypted_filename, + s->block_aligned_filename_size, s->dst_sg, 2); + if (rc < 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert decrypted filename memory to scatterlist; " + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); + goto out_free_unlock; + } + /* The characters in the first block effectively do the job of + * the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + /* TODO: Support other key modules than passphrase for + * filename encryption */ + if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { + rc = -EOPNOTSUPP; + printk(KERN_INFO "%s: Filename encryption only supports " + "password tokens\n", __func__); + goto out_free_unlock; + } + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. 
s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_free_unlock; + } + rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_free_unlock; + } + s->i = 0; + while (s->decrypted_filename[s->i] != '\0' + && s->i < s->block_aligned_filename_size) + s->i++; + if (s->i == s->block_aligned_filename_size) { + printk(KERN_WARNING "%s: Invalid tag 70 packet; could not " + "find valid separator between random characters and " + "the filename\n", __func__); + rc = -EINVAL; + goto out_free_unlock; + } + s->i++; + (*filename_size) = (s->block_aligned_filename_size - s->i); + if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) { + printk(KERN_WARNING "%s: Filename size is [%zd], which is " + "invalid\n", __func__, (*filename_size)); + rc = -EINVAL; + goto out_free_unlock; + } + (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); + if (!(*filename)) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + ((*filename_size) + 1)); + rc = -ENOMEM; + goto out_free_unlock; + } + memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size)); + (*filename)[(*filename_size)] = '\0'; +out_free_unlock: + kfree(s->decrypted_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + if (rc) { + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + } + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + } + kfree(s); + return rc; +} + +static int +ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) +{ + int rc = 0; + + (*sig) = NULL; + switch (auth_tok->token_type) { + case ECRYPTFS_PASSWORD: + (*sig) = auth_tok->token.password.signature; + break; + case ECRYPTFS_PRIVATE_KEY: + (*sig) = auth_tok->token.private_key.signature; + break; + default: + printk(KERN_ERR "Cannot get sig for auth_tok of type [%d]\n", + auth_tok->token_type); + rc = -EINVAL; + } + return rc; +} + +/** + * decrypt_pki_encrypted_session_key - Decrypt the session key with the given auth_tok. + * @auth_tok: The key authentication token used to decrypt the session key + * @crypt_stat: The cryptographic context + * + * Returns zero on success; non-zero error otherwise. 
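+ *
+ * Decryption of the encrypted session key is delegated to userspace:
+ * this routine writes a tag 64 packet naming the key signature, sends
+ * it to the daemon via ecryptfs_send_message(), and parses the tag 65
+ * response that carries the decrypted session key back into the
+ * auth_tok.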
+ */ +static int +decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat) +{ + u8 cipher_code = 0; + struct ecryptfs_msg_ctx *msg_ctx; + struct ecryptfs_message *msg = NULL; + char *auth_tok_sig; + char *payload; + size_t payload_len; + int rc; + + rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); + if (rc) { + printk(KERN_ERR "Unrecognized auth tok type: [%d]\n", + auth_tok->token_type); + goto out; + } + rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key), + &payload, &payload_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n"); + goto out; + } + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); + goto out; + } + rc = ecryptfs_wait_for_response(msg_ctx, &msg); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to receive tag 65 packet " + "from the user space daemon\n"); + rc = -EIO; + goto out; + } + rc = parse_tag_65_packet(&(auth_tok->session_key), + &cipher_code, msg); + if (rc) { + printk(KERN_ERR "Failed to parse tag 65 packet; rc = [%d]\n", + rc); + goto out; + } + auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY; + memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size); + crypt_stat->key_size = auth_tok->session_key.decrypted_key_size; + rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code); + if (rc) { + ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n", + cipher_code); + goto out; + } + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n"); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +out: + if (msg) + kfree(msg); + return rc; +} + +static void wipe_auth_tok_list(struct list_head *auth_tok_list_head) +{ + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; + + list_for_each_entry_safe(auth_tok_list_item, auth_tok_list_item_tmp, + auth_tok_list_head, list) { + list_del(&auth_tok_list_item->list); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); + } +} + +struct kmem_cache *ecryptfs_auth_tok_list_item_cache; + +/** + * parse_tag_1_packet + * @crypt_stat: The cryptographic context to modify based on packet contents + * @data: The raw bytes of the packet. + * @auth_tok_list: eCryptfs parses packets into authentication tokens; + * a new authentication token will be placed at the + * end of this list for this packet. + * @new_auth_tok: Pointer to a pointer to memory that this function + * allocates; sets the memory address of the pointer to + * NULL on error. This object is added to the + * auth_tok_list. + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error. + * @max_packet_size: The maximum allowable packet size + * + * Returns zero on success; non-zero on error.
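+ *
+ * A tag 1 packet carries a session key encrypted with a public key;
+ * the resulting token is typed ECRYPTFS_PRIVATE_KEY so that the key is
+ * later decrypted through the userspace daemon (see
+ * decrypt_pki_encrypted_session_key()).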
+ */ +static int +parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *data, struct list_head *auth_tok_list, + struct ecryptfs_auth_tok **new_auth_tok, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*new_auth_tok) = NULL; + /** + * This format is inspired by OpenPGP; see RFC 2440 + * packet tag 1 + * + * Tag 1 identifier (1 byte) + * Max Tag 1 packet size (max 3 bytes) + * Version (1 byte) + * Key identifier (8 bytes; ECRYPTFS_SIG_SIZE) + * Cipher identifier (1 byte) + * Encrypted key size (arbitrary) + * + * 12 bytes minimum packet size + */ + if (unlikely(max_packet_size < 12)) { + printk(KERN_ERR "Invalid max packet size; must be >=12\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != ECRYPTFS_TAG_1_PACKET_TYPE) { + printk(KERN_ERR "Enter w/ first byte != 0x%.2x\n", + ECRYPTFS_TAG_1_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or + * at end of function upon failure */ + auth_tok_list_item = + kmem_cache_zalloc(ecryptfs_auth_tok_list_item_cache, + GFP_KERNEL); + if (!auth_tok_list_item) { + printk(KERN_ERR "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + (*new_auth_tok) = &auth_tok_list_item->auth_tok; + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + printk(KERN_WARNING "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out_free; + } + if (unlikely(body_size < (ECRYPTFS_SIG_SIZE + 2))) { + printk(KERN_WARNING "Invalid body size ([%td])\n", body_size); + rc = -EINVAL; + goto out_free; + } + (*packet_size) += length_size; + if (unlikely((*packet_size) + body_size > max_packet_size)) { + printk(KERN_WARNING "Packet size exceeds max\n"); + rc = -EINVAL; + goto out_free; + } + if (unlikely(data[(*packet_size)++] != 0x03)) { + printk(KERN_WARNING "Unknown version number [%d]\n", + data[(*packet_size) - 1]); + rc = -EINVAL; + goto out_free; + } + ecryptfs_to_hex((*new_auth_tok)->token.private_key.signature, + &data[(*packet_size)], ECRYPTFS_SIG_SIZE); + *packet_size += ECRYPTFS_SIG_SIZE; + /* This byte is skipped because the kernel does not need to + * know which public key encryption algorithm was used */ + (*packet_size)++; + (*new_auth_tok)->session_key.encrypted_key_size = + body_size - (ECRYPTFS_SIG_SIZE + 2); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 1 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES"); + rc = -EINVAL; + goto out; + } + memcpy((*new_auth_tok)->session_key.encrypted_key, + &data[(*packet_size)], (body_size - (ECRYPTFS_SIG_SIZE + 2))); + (*packet_size) += (*new_auth_tok)->session_key.encrypted_key_size; + (*new_auth_tok)->session_key.flags &= + ~ECRYPTFS_CONTAINS_DECRYPTED_KEY; + (*new_auth_tok)->session_key.flags |= + ECRYPTFS_CONTAINS_ENCRYPTED_KEY; + (*new_auth_tok)->token_type = ECRYPTFS_PRIVATE_KEY; + (*new_auth_tok)->flags = 0; + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); + list_add(&auth_tok_list_item->list, auth_tok_list); + goto out; +out_free: + (*new_auth_tok) = NULL; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + 
auth_tok_list_item); +out: + if (rc) + (*packet_size) = 0; + return rc; +} + +/** + * parse_tag_3_packet + * @crypt_stat: The cryptographic context to modify based on packet + * contents. + * @data: The raw bytes of the packet. + * @auth_tok_list: eCryptfs parses packets into authentication tokens; + * a new authentication token will be placed at the end + * of this list for this packet. + * @new_auth_tok: Pointer to a pointer to memory that this function + * allocates; sets the memory address of the pointer to + * NULL on error. This object is added to the + * auth_tok_list. + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error. + * @max_packet_size: maximum number of bytes to parse + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *data, struct list_head *auth_tok_list, + struct ecryptfs_auth_tok **new_auth_tok, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*new_auth_tok) = NULL; + /** + *This format is inspired by OpenPGP; see RFC 2440 + * packet tag 3 + * + * Tag 3 identifier (1 byte) + * Max Tag 3 packet size (max 3 bytes) + * Version (1 byte) + * Cipher code (1 byte) + * S2K specifier (1 byte) + * Hash identifier (1 byte) + * Salt (ECRYPTFS_SALT_SIZE) + * Hash iterations (1 byte) + * Encrypted key (arbitrary) + * + * (ECRYPTFS_SALT_SIZE + 7) minimum packet size + */ + if (max_packet_size < (ECRYPTFS_SALT_SIZE + 7)) { + printk(KERN_ERR "Max packet size too large\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != ECRYPTFS_TAG_3_PACKET_TYPE) { + printk(KERN_ERR "First byte != 0x%.2x; invalid packet\n", + ECRYPTFS_TAG_3_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or + * at end of function upon failure */ + auth_tok_list_item = + kmem_cache_zalloc(ecryptfs_auth_tok_list_item_cache, GFP_KERNEL); + if (!auth_tok_list_item) { + printk(KERN_ERR "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + (*new_auth_tok) = &auth_tok_list_item->auth_tok; + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + printk(KERN_WARNING "Error parsing packet length; rc = [%d]\n", + rc); + goto out_free; + } + if (unlikely(body_size < (ECRYPTFS_SALT_SIZE + 5))) { + printk(KERN_WARNING "Invalid body size ([%td])\n", body_size); + rc = -EINVAL; + goto out_free; + } + (*packet_size) += length_size; + if (unlikely((*packet_size) + body_size > max_packet_size)) { + printk(KERN_ERR "Packet size exceeds max\n"); + rc = -EINVAL; + goto out_free; + } + (*new_auth_tok)->session_key.encrypted_key_size = + (body_size - (ECRYPTFS_SALT_SIZE + 5)); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 3 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); + rc = -EINVAL; + goto out_free; + } + if (unlikely(data[(*packet_size)++] != 0x04)) { + printk(KERN_WARNING "Unknown version number [%d]\n", + data[(*packet_size) - 1]); + rc = -EINVAL; + goto out_free; + } + rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, + (u16)data[(*packet_size)]); + if (rc) + goto out_free; + /* A little extra work to differentiate among the AES key + * sizes; see RFC2440 */ + switch(data[(*packet_size)++]) { + case 
RFC2440_CIPHER_AES_192: + crypt_stat->key_size = 24; + break; + default: + crypt_stat->key_size = + (*new_auth_tok)->session_key.encrypted_key_size; + } + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) + goto out_free; + if (unlikely(data[(*packet_size)++] != 0x03)) { + printk(KERN_WARNING "Only S2K ID 3 is currently supported\n"); + rc = -ENOSYS; + goto out_free; + } + /* TODO: finish the hash mapping */ + switch (data[(*packet_size)++]) { + case 0x01: /* See RFC2440 for these numbers and their mappings */ + /* Choose MD5 */ + memcpy((*new_auth_tok)->token.password.salt, + &data[(*packet_size)], ECRYPTFS_SALT_SIZE); + (*packet_size) += ECRYPTFS_SALT_SIZE; + /* This conversion was taken straight from RFC2440 */ + (*new_auth_tok)->token.password.hash_iterations = + ((u32) 16 + (data[(*packet_size)] & 15)) + << ((data[(*packet_size)] >> 4) + 6); + (*packet_size)++; + /* Friendly reminder: + * (*new_auth_tok)->session_key.encrypted_key_size = + * (body_size - (ECRYPTFS_SALT_SIZE + 5)); */ + memcpy((*new_auth_tok)->session_key.encrypted_key, + &data[(*packet_size)], + (*new_auth_tok)->session_key.encrypted_key_size); + (*packet_size) += + (*new_auth_tok)->session_key.encrypted_key_size; + (*new_auth_tok)->session_key.flags &= + ~ECRYPTFS_CONTAINS_DECRYPTED_KEY; + (*new_auth_tok)->session_key.flags |= + ECRYPTFS_CONTAINS_ENCRYPTED_KEY; + (*new_auth_tok)->token.password.hash_algo = 0x01; /* MD5 */ + break; + default: + ecryptfs_printk(KERN_ERR, "Unsupported hash algorithm: " + "[%d]\n", data[(*packet_size) - 1]); + rc = -ENOSYS; + goto out_free; + } + (*new_auth_tok)->token_type = ECRYPTFS_PASSWORD; + /* TODO: Parametarize; we might actually want userspace to + * decrypt the session key. */ + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); + list_add(&auth_tok_list_item->list, auth_tok_list); + goto out; +out_free: + (*new_auth_tok) = NULL; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); +out: + if (rc) + (*packet_size) = 0; + return rc; +} + +/** + * parse_tag_11_packet + * @data: The raw bytes of the packet + * @contents: This function writes the data contents of the literal + * packet into this memory location + * @max_contents_bytes: The maximum number of bytes that this function + * is allowed to write into contents + * @tag_11_contents_size: This function writes the size of the parsed + * contents into this memory location; zero on + * error + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error + * @max_packet_size: maximum number of bytes to parse + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_11_packet(unsigned char *data, unsigned char *contents, + size_t max_contents_bytes, size_t *tag_11_contents_size, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*tag_11_contents_size) = 0; + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 11 + * + * Tag 11 identifier (1 byte) + * Max Tag 11 packet size (max 3 bytes) + * Binary format specifier (1 byte) + * Filename length (1 byte) + * Filename ("_CONSOLE") (8 bytes) + * Modification date (4 bytes) + * Literal data (arbitrary) + * + * We need at least 16 bytes of data for the packet to even be + * valid. 
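+ * (1 tag byte + 1 length byte + 1 format byte + 1 filename-length byte
+ * + 8 filename bytes + 4 date bytes = 16, assuming the smallest
+ * possible length field.)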
+ */ + if (max_packet_size < 16) { + printk(KERN_ERR "Maximum packet size too small\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != ECRYPTFS_TAG_11_PACKET_TYPE) { + printk(KERN_WARNING "Invalid tag 11 packet format\n"); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + printk(KERN_WARNING "Invalid tag 11 packet format\n"); + goto out; + } + if (body_size < 14) { + printk(KERN_WARNING "Invalid body size ([%td])\n", body_size); + rc = -EINVAL; + goto out; + } + (*packet_size) += length_size; + (*tag_11_contents_size) = (body_size - 14); + if (unlikely((*packet_size) + body_size + 1 > max_packet_size)) { + printk(KERN_ERR "Packet size exceeds max\n"); + rc = -EINVAL; + goto out; + } + if (unlikely((*tag_11_contents_size) > max_contents_bytes)) { + printk(KERN_ERR "Literal data section in tag 11 packet exceeds " + "expected size\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != 0x62) { + printk(KERN_WARNING "Unrecognizable packet\n"); + rc = -EINVAL; + goto out; + } + if (data[(*packet_size)++] != 0x08) { + printk(KERN_WARNING "Unrecognizable packet\n"); + rc = -EINVAL; + goto out; + } + (*packet_size) += 12; /* Ignore filename and modification date */ + memcpy(contents, &data[(*packet_size)], (*tag_11_contents_size)); + (*packet_size) += (*tag_11_contents_size); +out: + if (rc) { + (*packet_size) = 0; + (*tag_11_contents_size) = 0; + } + return rc; +} + +int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, + char *sig) +{ + int rc = 0; + + (*auth_tok_key) = request_key(&key_type_user, sig, NULL); + if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + printk(KERN_ERR "Could not find key with description: [%s]\n", + sig); + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); + (*auth_tok_key) = NULL; + goto out; + } + down_write(&(*auth_tok_key)->sem); + rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); + if (rc) { + up_write(&(*auth_tok_key)->sem); + key_put(*auth_tok_key); + (*auth_tok_key) = NULL; + goto out; + } +out: + return rc; +} + +/** + * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. 
+ * @auth_tok: The passphrase authentication token to use to encrypt the FEK + * @crypt_stat: The cryptographic context + * + * Returns zero on success; non-zero error otherwise + */ +static int +decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat) +{ + struct scatterlist dst_sg[2]; + struct scatterlist src_sg[2]; + struct mutex *tfm_mutex; + struct blkcipher_desc desc = { + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk( + KERN_DEBUG, "Session key encryption key (size [%d]):\n", + auth_tok->token.password.session_key_encryption_key_bytes); + ecryptfs_dump_hex( + auth_tok->token.password.session_key_encryption_key, + auth_tok->token.password.session_key_encryption_key_bytes); + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + crypt_stat->cipher); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + goto out; + } + rc = virt_to_scatterlist(auth_tok->session_key.encrypted_key, + auth_tok->session_key.encrypted_key_size, + src_sg, 2); + if (rc < 1 || rc > 2) { + printk(KERN_ERR "Internal error whilst attempting to convert " + "auth_tok->session_key.encrypted_key to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "auth_tok->session_key.encrypted_key_size = [%d]\n", rc, + auth_tok->session_key.encrypted_key_size); + goto out; + } + auth_tok->session_key.decrypted_key_size = + auth_tok->session_key.encrypted_key_size; + rc = virt_to_scatterlist(auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size, + dst_sg, 2); + if (rc < 1 || rc > 2) { + printk(KERN_ERR "Internal error whilst attempting to convert " + "auth_tok->session_key.decrypted_key to scatterlist; " + "expected rc = 1; got rc = [%d]\n", rc); + goto out; + } + mutex_lock(tfm_mutex); + rc = crypto_blkcipher_setkey( + desc.tfm, auth_tok->token.password.session_key_encryption_key, + crypt_stat->key_size); + if (unlikely(rc < 0)) { + mutex_unlock(tfm_mutex); + printk(KERN_ERR "Error setting key for crypto context\n"); + rc = -EINVAL; + goto out; + } + rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg, + auth_tok->session_key.encrypted_key_size); + mutex_unlock(tfm_mutex); + if (unlikely(rc)) { + printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc); + goto out; + } + auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY; + memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size); + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n", + crypt_stat->key_size); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +out: + return rc; +} + +/** + * ecryptfs_parse_packet_set + * @crypt_stat: The cryptographic context + * @src: Virtual address of region of memory containing the packets + * @ecryptfs_dentry: The eCryptfs dentry associated with the packet set + * + * Get crypt_stat to have the file's session key if the requisite key + * is available to decrypt the session key. + * + * Returns Zero if a valid authentication token was retrieved and + * processed; negative value for file not encrypted or for error + * conditions. 
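+ *
+ * Each tag 3 (passphrase) or tag 1 (public key) packet found in the
+ * header becomes a candidate authentication token; the candidates are
+ * then tried in turn until one of them successfully decrypts the
+ * session key.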
+ */ +int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *src, + struct dentry *ecryptfs_dentry) +{ + size_t i = 0; + size_t found_auth_tok; + size_t next_packet_is_auth_tok_packet; + struct list_head auth_tok_list; + struct ecryptfs_auth_tok *matching_auth_tok; + struct ecryptfs_auth_tok *candidate_auth_tok; + char *candidate_auth_tok_sig; + size_t packet_size; + struct ecryptfs_auth_tok *new_auth_tok; + unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE]; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t tag_11_contents_size; + size_t tag_11_packet_size; + struct key *auth_tok_key = NULL; + int rc = 0; + + INIT_LIST_HEAD(&auth_tok_list); + /* Parse the header to find as many packets as we can; these will be + * added the our &auth_tok_list */ + next_packet_is_auth_tok_packet = 1; + while (next_packet_is_auth_tok_packet) { + size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i); + + switch (src[i]) { + case ECRYPTFS_TAG_3_PACKET_TYPE: + rc = parse_tag_3_packet(crypt_stat, + (unsigned char *)&src[i], + &auth_tok_list, &new_auth_tok, + &packet_size, max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error parsing " + "tag 3 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += packet_size; + rc = parse_tag_11_packet((unsigned char *)&src[i], + sig_tmp_space, + ECRYPTFS_SIG_SIZE, + &tag_11_contents_size, + &tag_11_packet_size, + max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "No valid " + "(ecryptfs-specific) literal " + "packet containing " + "authentication token " + "signature found after " + "tag 3 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += tag_11_packet_size; + if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { + ecryptfs_printk(KERN_ERR, "Expected " + "signature of size [%d]; " + "read size [%zd]\n", + ECRYPTFS_SIG_SIZE, + tag_11_contents_size); + rc = -EIO; + goto out_wipe_list; + } + ecryptfs_to_hex(new_auth_tok->token.password.signature, + sig_tmp_space, tag_11_contents_size); + new_auth_tok->token.password.signature[ + ECRYPTFS_PASSWORD_SIG_SIZE] = '\0'; + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; + break; + case ECRYPTFS_TAG_1_PACKET_TYPE: + rc = parse_tag_1_packet(crypt_stat, + (unsigned char *)&src[i], + &auth_tok_list, &new_auth_tok, + &packet_size, max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error parsing " + "tag 1 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += packet_size; + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; + break; + case ECRYPTFS_TAG_11_PACKET_TYPE: + ecryptfs_printk(KERN_WARNING, "Invalid packet set " + "(Tag 11 not allowed by itself)\n"); + rc = -EIO; + goto out_wipe_list; + break; + default: + ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " + "of the file header; hex value of " + "character is [0x%.2x]\n", i, src[i]); + next_packet_is_auth_tok_packet = 0; + } + } + if (list_empty(&auth_tok_list)) { + printk(KERN_ERR "The lower file appears to be a non-encrypted " + "eCryptfs file; this is not supported in this version " + "of the eCryptfs kernel module\n"); + rc = -EINVAL; + goto out; + } + /* auth_tok_list contains the set of authentication tokens + * parsed from the metadata. We need to find a matching + * authentication token that has the secret component(s) + * necessary to decrypt the EFEK in the auth_tok parsed from + * the metadata. There may be several potential matches, but + * just one will be sufficient to decrypt to get the FEK. 
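+ * If decryption with one candidate fails, that candidate is removed
+ * from the list and the search below restarts with the next match.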
*/ +find_next_matching_auth_tok: + found_auth_tok = 0; + list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { + candidate_auth_tok = &auth_tok_list_item->auth_tok; + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, + "Considering cadidate auth tok:\n"); + ecryptfs_dump_auth_tok(candidate_auth_tok); + } + rc = ecryptfs_get_auth_tok_sig(&candidate_auth_tok_sig, + candidate_auth_tok); + if (rc) { + printk(KERN_ERR + "Unrecognized candidate auth tok type: [%d]\n", + candidate_auth_tok->token_type); + rc = -EINVAL; + goto out_wipe_list; + } + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &matching_auth_tok, + crypt_stat->mount_crypt_stat, + candidate_auth_tok_sig); + if (!rc) { + found_auth_tok = 1; + goto found_matching_auth_tok; + } + } + if (!found_auth_tok) { + ecryptfs_printk(KERN_ERR, "Could not find a usable " + "authentication token\n"); + rc = -EIO; + goto out_wipe_list; + } +found_matching_auth_tok: + if (candidate_auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { + memcpy(&(candidate_auth_tok->token.private_key), + &(matching_auth_tok->token.private_key), + sizeof(struct ecryptfs_private_key)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = decrypt_pki_encrypted_session_key(candidate_auth_tok, + crypt_stat); + } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) { + memcpy(&(candidate_auth_tok->token.password), + &(matching_auth_tok->token.password), + sizeof(struct ecryptfs_password)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = decrypt_passphrase_encrypted_session_key( + candidate_auth_tok, crypt_stat); + } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = -EINVAL; + } + if (rc) { + struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; + + ecryptfs_printk(KERN_WARNING, "Error decrypting the " + "session key for authentication token with sig " + "[%.*s]; rc = [%d]. 
Removing auth tok " + "candidate from the list and searching for " + "the next match.\n", ECRYPTFS_SIG_SIZE_HEX, + candidate_auth_tok_sig, rc); + list_for_each_entry_safe(auth_tok_list_item, + auth_tok_list_item_tmp, + &auth_tok_list, list) { + if (candidate_auth_tok + == &auth_tok_list_item->auth_tok) { + list_del(&auth_tok_list_item->list); + kmem_cache_free( + ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); + goto find_next_matching_auth_tok; + } + } + BUG(); + } + rc = ecryptfs_compute_root_iv(crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error computing " + "the root IV\n"); + goto out_wipe_list; + } + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error initializing crypto " + "context for cipher [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + } +out_wipe_list: + wipe_auth_tok_list(&auth_tok_list); +out: + return rc; +} + +static int +pki_encrypt_session_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec) +{ + struct ecryptfs_msg_ctx *msg_ctx = NULL; + char *payload = NULL; + size_t payload_len; + struct ecryptfs_message *msg; + int rc; + + rc = write_tag_66_packet(auth_tok->token.private_key.signature, + ecryptfs_code_for_cipher_string( + crypt_stat->cipher, + crypt_stat->key_size), + crypt_stat, &payload, &payload_len); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); + goto out; + } + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); + goto out; + } + rc = ecryptfs_wait_for_response(msg_ctx, &msg); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to receive tag 67 packet " + "from the user space daemon\n"); + rc = -EIO; + goto out; + } + rc = parse_tag_67_packet(key_rec, msg); + if (rc) + ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n"); + kfree(msg); +out: + kfree(payload); + return rc; +} +/** + * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet + * @dest: Buffer into which to write the packet + * @remaining_bytes: Maximum number of bytes that can be writtn + * @auth_tok_key: The authentication token key to unlock and put when done with + * @auth_tok + * @auth_tok: The authentication token used for generating the tag 1 packet + * @crypt_stat: The cryptographic context + * @key_rec: The key record struct for the tag 1 packet + * @packet_size: This function will write the number of bytes that end + * up constituting the packet; set to zero on error + * + * Returns zero on success; non-zero on error. 
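+ *
+ * If the session key has not already been encrypted, it is handed to
+ * the userspace key module via pki_encrypt_session_key() before the
+ * packet is assembled.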
+ */ +static int +write_tag_1_packet(char *dest, size_t *remaining_bytes, + struct key *auth_tok_key, struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec, size_t *packet_size) +{ + size_t i; + size_t encrypted_session_key_valid = 0; + size_t packet_size_length; + size_t max_packet_size; + int rc = 0; + + (*packet_size) = 0; + ecryptfs_from_hex(key_rec->sig, auth_tok->token.private_key.signature, + ECRYPTFS_SIG_SIZE); + encrypted_session_key_valid = 0; + for (i = 0; i < crypt_stat->key_size; i++) + encrypted_session_key_valid |= + auth_tok->session_key.encrypted_key[i]; + if (encrypted_session_key_valid) { + memcpy(key_rec->enc_key, + auth_tok->session_key.encrypted_key, + auth_tok->session_key.encrypted_key_size); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + goto encrypted_session_key_set; + } + if (auth_tok->session_key.encrypted_key_size == 0) + auth_tok->session_key.encrypted_key_size = + auth_tok->token.private_key.key_size; + rc = pki_encrypt_session_key(auth_tok_key, auth_tok, crypt_stat, + key_rec); + if (rc) { + printk(KERN_ERR "Failed to encrypt session key via a key " + "module; rc = [%d]\n", rc); + goto out; + } + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "Encrypted key:\n"); + ecryptfs_dump_hex(key_rec->enc_key, key_rec->enc_key_size); + } +encrypted_session_key_set: + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 1 */ + max_packet_size = (1 /* Tag 1 identifier */ + + 3 /* Max Tag 1 packet size */ + + 1 /* Version */ + + ECRYPTFS_SIG_SIZE /* Key identifier */ + + 1 /* Cipher identifier */ + + key_rec->enc_key_size); /* Encrypted key size */ + if (max_packet_size > (*remaining_bytes)) { + printk(KERN_ERR "Packet length larger than maximum allowable; " + "need up to [%td] bytes, but there are only [%td] " + "available\n", max_packet_size, (*remaining_bytes)); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = ECRYPTFS_TAG_1_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 1 packet " + "header; cannot generate packet length\n"); + goto out; + } + (*packet_size) += packet_size_length; + dest[(*packet_size)++] = 0x03; /* version 3 */ + memcpy(&dest[(*packet_size)], key_rec->sig, ECRYPTFS_SIG_SIZE); + (*packet_size) += ECRYPTFS_SIG_SIZE; + dest[(*packet_size)++] = RFC2440_CIPHER_RSA; + memcpy(&dest[(*packet_size)], key_rec->enc_key, + key_rec->enc_key_size); + (*packet_size) += key_rec->enc_key_size; +out: + if (rc) + (*packet_size) = 0; + else + (*remaining_bytes) -= (*packet_size); + return rc; +} + +/** + * write_tag_11_packet + * @dest: Target into which Tag 11 packet is to be written + * @remaining_bytes: Maximum packet length + * @contents: Byte array of contents to copy in + * @contents_length: Number of bytes in contents + * @packet_length: Length of the Tag 11 packet written; zero on error + * + * Returns zero on success; non-zero on error. 
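+ *
+ * Tag 11 is an OpenPGP literal data packet; eCryptfs uses it to carry
+ * the signature of the authentication token that follows a tag 3
+ * packet, with "_CONSOLE" as the filename and a zeroed modification
+ * date.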
+ */ +static int +write_tag_11_packet(char *dest, size_t *remaining_bytes, char *contents, + size_t contents_length, size_t *packet_length) +{ + size_t packet_size_length; + size_t max_packet_size; + int rc = 0; + + (*packet_length) = 0; + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 11 */ + max_packet_size = (1 /* Tag 11 identifier */ + + 3 /* Max Tag 11 packet size */ + + 1 /* Binary format specifier */ + + 1 /* Filename length */ + + 8 /* Filename ("_CONSOLE") */ + + 4 /* Modification date */ + + contents_length); /* Literal data */ + if (max_packet_size > (*remaining_bytes)) { + printk(KERN_ERR "Packet length larger than maximum allowable; " + "need up to [%td] bytes, but there are only [%td] " + "available\n", max_packet_size, (*remaining_bytes)); + rc = -EINVAL; + goto out; + } + dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[(*packet_length)], + (max_packet_size - 4), + &packet_size_length); + if (rc) { + printk(KERN_ERR "Error generating tag 11 packet header; cannot " + "generate packet length. rc = [%d]\n", rc); + goto out; + } + (*packet_length) += packet_size_length; + dest[(*packet_length)++] = 0x62; /* binary data format specifier */ + dest[(*packet_length)++] = 8; + memcpy(&dest[(*packet_length)], "_CONSOLE", 8); + (*packet_length) += 8; + memset(&dest[(*packet_length)], 0x00, 4); + (*packet_length) += 4; + memcpy(&dest[(*packet_length)], contents, contents_length); + (*packet_length) += contents_length; + out: + if (rc) + (*packet_length) = 0; + else + (*remaining_bytes) -= (*packet_length); + return rc; +} + +/** + * write_tag_3_packet + * @dest: Buffer into which to write the packet + * @remaining_bytes: Maximum number of bytes that can be written + * @auth_tok: Authentication token + * @crypt_stat: The cryptographic context + * @key_rec: encrypted key + * @packet_size: This function will write the number of bytes that end + * up constituting the packet; set to zero on error + * + * Returns zero on success; non-zero on error. 
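+ *
+ * The FEK held in crypt_stat->key is encrypted with the session key
+ * encryption key derived from the passphrase, and the result is
+ * written out along with the cipher code, S2K specifier, hash
+ * identifier, salt, and hash-iteration byte.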
+ */ +static int +write_tag_3_packet(char *dest, size_t *remaining_bytes, + struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec, size_t *packet_size) +{ + size_t i; + size_t encrypted_session_key_valid = 0; + char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; + struct scatterlist dst_sg[2]; + struct scatterlist src_sg[2]; + struct mutex *tfm_mutex = NULL; + u8 cipher_code; + size_t packet_size_length; + size_t max_packet_size; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + crypt_stat->mount_crypt_stat; + struct blkcipher_desc desc = { + .tfm = NULL, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP + }; + int rc = 0; + + (*packet_size) = 0; + ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature, + ECRYPTFS_SIG_SIZE); + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + crypt_stat->cipher); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + goto out; + } + if (mount_crypt_stat->global_default_cipher_key_size == 0) { + struct blkcipher_alg *alg = crypto_blkcipher_alg(desc.tfm); + + printk(KERN_WARNING "No key size specified at mount; " + "defaulting to [%d]\n", alg->max_keysize); + mount_crypt_stat->global_default_cipher_key_size = + alg->max_keysize; + } + if (crypt_stat->key_size == 0) + crypt_stat->key_size = + mount_crypt_stat->global_default_cipher_key_size; + if (auth_tok->session_key.encrypted_key_size == 0) + auth_tok->session_key.encrypted_key_size = + crypt_stat->key_size; + if (crypt_stat->key_size == 24 + && strcmp("aes", crypt_stat->cipher) == 0) { + memset((crypt_stat->key + 24), 0, 8); + auth_tok->session_key.encrypted_key_size = 32; + } else + auth_tok->session_key.encrypted_key_size = crypt_stat->key_size; + key_rec->enc_key_size = + auth_tok->session_key.encrypted_key_size; + encrypted_session_key_valid = 0; + for (i = 0; i < auth_tok->session_key.encrypted_key_size; i++) + encrypted_session_key_valid |= + auth_tok->session_key.encrypted_key[i]; + if (encrypted_session_key_valid) { + ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; " + "using auth_tok->session_key.encrypted_key, " + "where key_rec->enc_key_size = [%zd]\n", + key_rec->enc_key_size); + memcpy(key_rec->enc_key, + auth_tok->session_key.encrypted_key, + key_rec->enc_key_size); + goto encrypted_session_key_set; + } + if (auth_tok->token.password.flags & + ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET) { + ecryptfs_printk(KERN_DEBUG, "Using previously generated " + "session key encryption key of size [%d]\n", + auth_tok->token.password. + session_key_encryption_key_bytes); + memcpy(session_key_encryption_key, + auth_tok->token.password.session_key_encryption_key, + crypt_stat->key_size); + ecryptfs_printk(KERN_DEBUG, + "Cached session key " "encryption key: \n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(session_key_encryption_key, 16); + } + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Session key encryption key:\n"); + ecryptfs_dump_hex(session_key_encryption_key, 16); + } + rc = virt_to_scatterlist(crypt_stat->key, key_rec->enc_key_size, + src_sg, 2); + if (rc < 1 || rc > 2) { + ecryptfs_printk(KERN_ERR, "Error generating scatterlist " + "for crypt_stat session key; expected rc = 1; " + "got rc = [%d]. 
key_rec->enc_key_size = [%zd]\n", + rc, key_rec->enc_key_size); + rc = -ENOMEM; + goto out; + } + rc = virt_to_scatterlist(key_rec->enc_key, key_rec->enc_key_size, + dst_sg, 2); + if (rc < 1 || rc > 2) { + ecryptfs_printk(KERN_ERR, "Error generating scatterlist " + "for crypt_stat encrypted session key; " + "expected rc = 1; got rc = [%d]. " + "key_rec->enc_key_size = [%zd]\n", rc, + key_rec->enc_key_size); + rc = -ENOMEM; + goto out; + } + mutex_lock(tfm_mutex); + rc = crypto_blkcipher_setkey(desc.tfm, session_key_encryption_key, + crypt_stat->key_size); + if (rc < 0) { + mutex_unlock(tfm_mutex); + ecryptfs_printk(KERN_ERR, "Error setting key for crypto " + "context; rc = [%d]\n", rc); + goto out; + } + rc = 0; + ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n", + crypt_stat->key_size); + rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, + (*key_rec).enc_key_size); + mutex_unlock(tfm_mutex); + if (rc) { + printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc); + goto out; + } + ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n", + key_rec->enc_key_size); + ecryptfs_dump_hex(key_rec->enc_key, + key_rec->enc_key_size); + } +encrypted_session_key_set: + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 3 */ + max_packet_size = (1 /* Tag 3 identifier */ + + 3 /* Max Tag 3 packet size */ + + 1 /* Version */ + + 1 /* Cipher code */ + + 1 /* S2K specifier */ + + 1 /* Hash identifier */ + + ECRYPTFS_SALT_SIZE /* Salt */ + + 1 /* Hash iterations */ + + key_rec->enc_key_size); /* Encrypted key size */ + if (max_packet_size > (*remaining_bytes)) { + printk(KERN_ERR "Packet too large; need up to [%td] bytes, but " + "there are only [%td] available\n", max_packet_size, + (*remaining_bytes)); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE; + /* Chop off the Tag 3 identifier(1) and Tag 3 packet size(3) + * to get the number of octets in the actual Tag 3 packet */ + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); + if (rc) { + printk(KERN_ERR "Error generating tag 3 packet header; cannot " + "generate packet length. 
rc = [%d]\n", rc); + goto out; + } + (*packet_size) += packet_size_length; + dest[(*packet_size)++] = 0x04; /* version 4 */ + /* TODO: Break from RFC2440 so that arbitrary ciphers can be + * specified with strings */ + cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher, + crypt_stat->key_size); + if (cipher_code == 0) { + ecryptfs_printk(KERN_WARNING, "Unable to generate code for " + "cipher [%s]\n", crypt_stat->cipher); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = cipher_code; + dest[(*packet_size)++] = 0x03; /* S2K */ + dest[(*packet_size)++] = 0x01; /* MD5 (TODO: parameterize) */ + memcpy(&dest[(*packet_size)], auth_tok->token.password.salt, + ECRYPTFS_SALT_SIZE); + (*packet_size) += ECRYPTFS_SALT_SIZE; /* salt */ + dest[(*packet_size)++] = 0x60; /* hash iterations (65536) */ + memcpy(&dest[(*packet_size)], key_rec->enc_key, + key_rec->enc_key_size); + (*packet_size) += key_rec->enc_key_size; +out: + if (rc) + (*packet_size) = 0; + else + (*remaining_bytes) -= (*packet_size); + return rc; +} + +struct kmem_cache *ecryptfs_key_record_cache; + +/** + * ecryptfs_generate_key_packet_set + * @dest_base: Virtual address from which to write the key record set + * @crypt_stat: The cryptographic context from which the + * authentication tokens will be retrieved + * @ecryptfs_dentry: The dentry, used to retrieve the mount crypt stat + * for the global parameters + * @len: The amount written + * @max: The maximum amount of data allowed to be written + * + * Generates a key packet set and writes it to the virtual address + * passed in. + * + * Returns zero on success; non-zero on error. + */ +int +ecryptfs_generate_key_packet_set(char *dest_base, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, size_t *len, + size_t max) +{ + struct ecryptfs_auth_tok *auth_tok; + struct key *auth_tok_key = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + size_t written; + struct ecryptfs_key_record *key_rec; + struct ecryptfs_key_sig *key_sig; + int rc = 0; + + (*len) = 0; + mutex_lock(&crypt_stat->keysig_list_mutex); + key_rec = kmem_cache_alloc(ecryptfs_key_record_cache, GFP_KERNEL); + if (!key_rec) { + rc = -ENOMEM; + goto out; + } + list_for_each_entry(key_sig, &crypt_stat->keysig_list, + crypt_stat_list) { + memset(key_rec, 0, sizeof(*key_rec)); + rc = ecryptfs_find_global_auth_tok_for_sig(&auth_tok_key, + &auth_tok, + mount_crypt_stat, + key_sig->keysig); + if (rc) { + printk(KERN_WARNING "Unable to retrieve auth tok with " + "sig = [%s]\n", key_sig->keysig); + rc = process_find_global_auth_tok_for_sig_err(rc); + goto out_free; + } + if (auth_tok->token_type == ECRYPTFS_PASSWORD) { + rc = write_tag_3_packet((dest_base + (*len)), + &max, auth_tok, + crypt_stat, key_rec, + &written); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error " + "writing tag 3 packet\n"); + goto out_free; + } + (*len) += written; + /* Write auth tok signature packet */ + rc = write_tag_11_packet((dest_base + (*len)), &max, + key_rec->sig, + ECRYPTFS_SIG_SIZE, &written); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error writing " + "auth tok signature packet\n"); + goto out_free; + } + (*len) += written; + } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { + rc = write_tag_1_packet(dest_base + (*len), &max, + auth_tok_key, auth_tok, + crypt_stat, key_rec, &written); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error " + "writing 
tag 1 packet\n"); + goto out_free; + } + (*len) += written; + } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + ecryptfs_printk(KERN_WARNING, "Unsupported " + "authentication token type\n"); + rc = -EINVAL; + goto out_free; + } + } + if (likely(max > 0)) { + dest_base[(*len)] = 0x00; + } else { + ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n"); + rc = -EIO; + } +out_free: + kmem_cache_free(ecryptfs_key_record_cache, key_rec); +out: + if (rc) + (*len) = 0; + mutex_unlock(&crypt_stat->keysig_list_mutex); + return rc; +} + +struct kmem_cache *ecryptfs_key_sig_cache; + +int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) +{ + struct ecryptfs_key_sig *new_key_sig; + + new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL); + if (!new_key_sig) { + printk(KERN_ERR + "Error allocating from ecryptfs_key_sig_cache\n"); + return -ENOMEM; + } + memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + /* Caller must hold keysig_list_mutex */ + list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list); + + return 0; +} + +struct kmem_cache *ecryptfs_global_auth_tok_cache; + +int +ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig, u32 global_auth_tok_flags) +{ + struct ecryptfs_global_auth_tok *new_auth_tok; + int rc = 0; + + new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache, + GFP_KERNEL); + if (!new_auth_tok) { + rc = -ENOMEM; + printk(KERN_ERR "Error allocating from " + "ecryptfs_global_auth_tok_cache\n"); + goto out; + } + memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_auth_tok->flags = global_auth_tok_flags; + new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_add(&new_auth_tok->mount_crypt_stat_list, + &mount_crypt_stat->global_auth_tok_list); + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); +out: + return rc; +} + diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c new file mode 100644 index 00000000..69f994a7 --- /dev/null +++ b/fs/ecryptfs/kthread.c @@ -0,0 +1,197 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_open_req_cache; + +static struct ecryptfs_kthread_ctl { +#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001 + u32 flags; + struct mutex mux; + struct list_head req_list; + wait_queue_head_t wait; +} ecryptfs_kthread_ctl; + +static struct task_struct *ecryptfs_kthread; + +/** + * ecryptfs_threadfn + * @ignored: ignored + * + * The eCryptfs kernel thread that has the responsibility of getting + * the lower file with RW permissions. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_threadfn(void *ignored) +{ + set_freezable(); + while (1) { + struct ecryptfs_open_req *req; + + wait_event_freezable( + ecryptfs_kthread_ctl.wait, + (!list_empty(&ecryptfs_kthread_ctl.req_list) + || kthread_should_stop())); + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + mutex_unlock(&ecryptfs_kthread_ctl.mux); + goto out; + } + while (!list_empty(&ecryptfs_kthread_ctl.req_list)) { + req = list_first_entry(&ecryptfs_kthread_ctl.req_list, + struct ecryptfs_open_req, + kthread_ctl_list); + mutex_lock(&req->mux); + list_del(&req->kthread_ctl_list); + if (!(req->flags & ECRYPTFS_REQ_ZOMBIE)) { + dget(req->lower_dentry); + mntget(req->lower_mnt); + (*req->lower_file) = dentry_open( + req->lower_dentry, req->lower_mnt, + (O_RDWR | O_LARGEFILE), current_cred()); + req->flags |= ECRYPTFS_REQ_PROCESSED; + } + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + } +out: + return 0; +} + +int __init ecryptfs_init_kthread(void) +{ + int rc = 0; + + mutex_init(&ecryptfs_kthread_ctl.mux); + init_waitqueue_head(&ecryptfs_kthread_ctl.wait); + INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list); + ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL, + "ecryptfs-kthread"); + if (IS_ERR(ecryptfs_kthread)) { + rc = PTR_ERR(ecryptfs_kthread); + printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]" + "\n", __func__, rc); + } + return rc; +} + +void ecryptfs_destroy_kthread(void) +{ + struct ecryptfs_open_req *req; + + mutex_lock(&ecryptfs_kthread_ctl.mux); + ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; + list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, + kthread_ctl_list) { + mutex_lock(&req->mux); + req->flags |= ECRYPTFS_REQ_ZOMBIE; + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + kthread_stop(ecryptfs_kthread); + wake_up(&ecryptfs_kthread_ctl.wait); +} + +/** + * ecryptfs_privileged_open + * @lower_file: Result of dentry_open by root on lower dentry + * @lower_dentry: Lower dentry for file to open + * @lower_mnt: Lower vfsmount for file to open + * + * This function gets a r/w file opened againt the lower dentry. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt, + const struct cred *cred) +{ + struct ecryptfs_open_req *req; + int flags = O_LARGEFILE; + int rc = 0; + + /* Corresponding dput() and mntput() are done when the + * lower file is fput() when all eCryptfs files for the inode are + * released. */ + dget(lower_dentry); + mntget(lower_mnt); + flags |= IS_RDONLY(lower_dentry->d_inode) ? 
O_RDONLY : O_RDWR; + (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred); + if (!IS_ERR(*lower_file)) + goto out; + if (flags & O_RDONLY) { + rc = PTR_ERR((*lower_file)); + goto out; + } + req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); + if (!req) { + rc = -ENOMEM; + goto out; + } + mutex_init(&req->mux); + req->lower_file = lower_file; + req->lower_dentry = lower_dentry; + req->lower_mnt = lower_mnt; + init_waitqueue_head(&req->wait); + req->flags = 0; + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + rc = -EIO; + mutex_unlock(&ecryptfs_kthread_ctl.mux); + printk(KERN_ERR "%s: We are in the middle of shutting down; " + "aborting privileged request to open lower file\n", + __func__); + goto out_free; + } + list_add_tail(&req->kthread_ctl_list, &ecryptfs_kthread_ctl.req_list); + mutex_unlock(&ecryptfs_kthread_ctl.mux); + wake_up(&ecryptfs_kthread_ctl.wait); + wait_event(req->wait, (req->flags != 0)); + mutex_lock(&req->mux); + BUG_ON(req->flags == 0); + if (req->flags & ECRYPTFS_REQ_DROPPED + || req->flags & ECRYPTFS_REQ_ZOMBIE) { + rc = -EIO; + printk(KERN_WARNING "%s: Privileged open request dropped\n", + __func__); + goto out_unlock; + } + if (IS_ERR(*req->lower_file)) + rc = PTR_ERR(*req->lower_file); +out_unlock: + mutex_unlock(&req->mux); +out_free: + kmem_cache_free(ecryptfs_open_req_cache, req); +out: + return rc; +} diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c new file mode 100644 index 00000000..b4a6befb --- /dev/null +++ b/fs/ecryptfs/main.c @@ -0,0 +1,867 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * Tyler Hicks + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * Module parameter that defines the ecryptfs_verbosity level. + */ +int ecryptfs_verbosity = 0; + +module_param(ecryptfs_verbosity, int, 0); +MODULE_PARM_DESC(ecryptfs_verbosity, + "Initial verbosity level (0 or 1; defaults to " + "0, which is Quiet)"); + +/** + * Module parameter that defines the number of message buffer elements + */ +unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; + +module_param(ecryptfs_message_buf_len, uint, 0); +MODULE_PARM_DESC(ecryptfs_message_buf_len, + "Number of message buffer elements"); + +/** + * Module parameter that defines the maximum guaranteed amount of time to wait + * for a response from ecryptfsd. 
The actual sleep time will be, more than + * likely, a small amount greater than this specified value, but only less if + * the message successfully arrives. + */ +signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; + +module_param(ecryptfs_message_wait_timeout, long, 0); +MODULE_PARM_DESC(ecryptfs_message_wait_timeout, + "Maximum number of seconds that an operation will " + "sleep while waiting for a message response from " + "userspace"); + +/** + * Module parameter that is an estimate of the maximum number of users + * that will be concurrently using eCryptfs. Set this to the right + * value to balance performance and memory use. + */ +unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS; + +module_param(ecryptfs_number_of_users, uint, 0); +MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " + "concurrent users of eCryptfs"); + +void __ecryptfs_printk(const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + if (fmt[1] == '7') { /* KERN_DEBUG */ + if (ecryptfs_verbosity >= 1) + vprintk(fmt, args); + } else + vprintk(fmt, args); + va_end(args); +} + +/** + * ecryptfs_init_lower_file + * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with + * the lower dentry and the lower mount set + * + * eCryptfs only ever keeps a single open file for every lower + * inode. All I/O operations to the lower inode occur through that + * file. When the first eCryptfs dentry that interposes with the first + * lower dentry for that inode is created, this function creates the + * lower file struct and associates it with the eCryptfs + * inode. When all eCryptfs files associated with the inode are released, the + * file is closed. + * + * The lower file will be opened with read/write permissions, if + * possible. Otherwise, it is opened read-only. + * + * This function does nothing if a lower file is already + * associated with the eCryptfs inode. 
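+ * The open itself goes through ecryptfs_privileged_open(), so if the
+ * caller's credentials cannot open the lower file read/write, the
+ * request is handed off to the eCryptfs kernel thread (kthread.c
+ * above), which retries the open with the kernel thread's credentials.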
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_init_lower_file(struct dentry *dentry, + struct file **lower_file) +{ + const struct cred *cred = current_cred(); + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + int rc; + + rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt, + cred); + if (rc) { + printk(KERN_ERR "Error opening lower file " + "for lower_dentry [0x%p] and lower_mnt [0x%p]; " + "rc = [%d]\n", lower_dentry, lower_mnt, rc); + (*lower_file) = NULL; + } + return rc; +} + +int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + int count, rc = 0; + + inode_info = ecryptfs_inode_to_private(inode); + mutex_lock(&inode_info->lower_file_mutex); + count = atomic_inc_return(&inode_info->lower_file_count); + if (WARN_ON_ONCE(count < 1)) + rc = -EINVAL; + else if (count == 1) { + rc = ecryptfs_init_lower_file(dentry, + &inode_info->lower_file); + if (rc) + atomic_set(&inode_info->lower_file_count, 0); + } + mutex_unlock(&inode_info->lower_file_mutex); + return rc; +} + +void ecryptfs_put_lower_file(struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + + inode_info = ecryptfs_inode_to_private(inode); + if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, + &inode_info->lower_file_mutex)) { + fput(inode_info->lower_file); + inode_info->lower_file = NULL; + mutex_unlock(&inode_info->lower_file_mutex); + } +} + +enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, + ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, + ecryptfs_opt_ecryptfs_key_bytes, + ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, + ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, + ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, + ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, + ecryptfs_opt_check_dev_ruid, + ecryptfs_opt_err }; + +static const match_table_t tokens = { + {ecryptfs_opt_sig, "sig=%s"}, + {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, + {ecryptfs_opt_cipher, "cipher=%s"}, + {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, + {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, + {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, + {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, + {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, + {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, + {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, + {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, + {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, + {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, + {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, + {ecryptfs_opt_err, NULL} +}; + +static int ecryptfs_init_global_auth_toks( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct ecryptfs_global_auth_tok *global_auth_tok; + struct ecryptfs_auth_tok *auth_tok; + int rc = 0; + + list_for_each_entry(global_auth_tok, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + rc = ecryptfs_keyring_auth_tok_for_sig( + &global_auth_tok->global_auth_tok_key, &auth_tok, + global_auth_tok->sig); + if (rc) { + printk(KERN_ERR "Could not find valid key in user " + "session keyring for sig specified in mount " + "option: [%s]\n", global_auth_tok->sig); + global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; + goto out; + } else { + global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; + 
up_write(&(global_auth_tok->global_auth_tok_key)->sem); + } + } +out: + return rc; +} + +static void ecryptfs_init_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + memset((void *)mount_crypt_stat, 0, + sizeof(struct ecryptfs_mount_crypt_stat)); + INIT_LIST_HEAD(&mount_crypt_stat->global_auth_tok_list); + mutex_init(&mount_crypt_stat->global_auth_tok_list_mutex); + mount_crypt_stat->flags |= ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED; +} + +/** + * ecryptfs_parse_options + * @sb: The ecryptfs super block + * @options: The options passed to the kernel + * @check_ruid: set to 1 if device uid should be checked against the ruid + * + * Parse mount options: + * debug=N - ecryptfs_verbosity level for debug output + * sig=XXX - description(signature) of the key to use + * + * Returns the dentry object of the lower-level (lower/interposed) + * directory; We want to mount our stackable file system on top of + * that lower directory. + * + * The signature of the key to use must be the description of a key + * already in the keyring. Mounting will fail if the key can not be + * found. + * + * Returns zero on success; non-zero on error + */ +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, + uid_t *check_ruid) +{ + char *p; + int rc = 0; + int sig_set = 0; + int cipher_name_set = 0; + int fn_cipher_name_set = 0; + int cipher_key_bytes; + int cipher_key_bytes_set = 0; + int fn_cipher_key_bytes; + int fn_cipher_key_bytes_set = 0; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &sbi->mount_crypt_stat; + substring_t args[MAX_OPT_ARGS]; + int token; + char *sig_src; + char *cipher_name_dst; + char *cipher_name_src; + char *fn_cipher_name_dst; + char *fn_cipher_name_src; + char *fnek_dst; + char *fnek_src; + char *cipher_key_bytes_src; + char *fn_cipher_key_bytes_src; + + *check_ruid = 0; + + if (!options) { + rc = -EINVAL; + goto out; + } + ecryptfs_init_mount_crypt_stat(mount_crypt_stat); + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + token = match_token(p, tokens, args); + switch (token) { + case ecryptfs_opt_sig: + case ecryptfs_opt_ecryptfs_sig: + sig_src = args[0].from; + rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, + sig_src, 0); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global sig; rc = [%d]\n", rc); + goto out; + } + sig_set = 1; + break; + case ecryptfs_opt_cipher: + case ecryptfs_opt_ecryptfs_cipher: + cipher_name_src = args[0].from; + cipher_name_dst = + mount_crypt_stat-> + global_default_cipher_name; + strncpy(cipher_name_dst, cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + cipher_name_set = 1; + break; + case ecryptfs_opt_ecryptfs_key_bytes: + cipher_key_bytes_src = args[0].from; + cipher_key_bytes = + (int)simple_strtol(cipher_key_bytes_src, + &cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_cipher_key_size = + cipher_key_bytes; + cipher_key_bytes_set = 1; + break; + case ecryptfs_opt_passthrough: + mount_crypt_stat->flags |= + ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; + break; + case ecryptfs_opt_xattr_metadata: + mount_crypt_stat->flags |= + ECRYPTFS_XATTR_METADATA_ENABLED; + break; + case ecryptfs_opt_encrypted_view: + mount_crypt_stat->flags |= + ECRYPTFS_XATTR_METADATA_ENABLED; + mount_crypt_stat->flags |= + ECRYPTFS_ENCRYPTED_VIEW_ENABLED; + break; + case ecryptfs_opt_fnek_sig: + fnek_src = args[0].from; + fnek_dst = + mount_crypt_stat->global_default_fnek_sig; + strncpy(fnek_dst, fnek_src, 
ECRYPTFS_SIG_SIZE_HEX); + mount_crypt_stat->global_default_fnek_sig[ + ECRYPTFS_SIG_SIZE_HEX] = '\0'; + rc = ecryptfs_add_global_auth_tok( + mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_AUTH_TOK_FNEK); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global fnek sig [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fnek_sig, + rc); + goto out; + } + mount_crypt_stat->flags |= + (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES + | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); + break; + case ecryptfs_opt_fn_cipher: + fn_cipher_name_src = args[0].from; + fn_cipher_name_dst = + mount_crypt_stat->global_default_fn_cipher_name; + strncpy(fn_cipher_name_dst, fn_cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + mount_crypt_stat->global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + fn_cipher_name_set = 1; + break; + case ecryptfs_opt_fn_cipher_key_bytes: + fn_cipher_key_bytes_src = args[0].from; + fn_cipher_key_bytes = + (int)simple_strtol(fn_cipher_key_bytes_src, + &fn_cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_fn_cipher_key_bytes = + fn_cipher_key_bytes; + fn_cipher_key_bytes_set = 1; + break; + case ecryptfs_opt_unlink_sigs: + mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; + break; + case ecryptfs_opt_mount_auth_tok_only: + mount_crypt_stat->flags |= + ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; + break; + case ecryptfs_opt_check_dev_ruid: + *check_ruid = 1; + break; + case ecryptfs_opt_err: + default: + printk(KERN_WARNING + "%s: eCryptfs: unrecognized option [%s]\n", + __func__, p); + } + } + if (!sig_set) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "You must supply at least one valid " + "auth tok signature as a mount " + "parameter; see the eCryptfs README\n"); + goto out; + } + if (!cipher_name_set) { + int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); + + BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); + strcpy(mount_crypt_stat->global_default_cipher_name, + ECRYPTFS_DEFAULT_CIPHER); + } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_name_set) + strcpy(mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_cipher_name); + if (!cipher_key_bytes_set) + mount_crypt_stat->global_default_cipher_key_size = 0; + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_key_bytes_set) + mount_crypt_stat->global_default_fn_cipher_key_bytes = + mount_crypt_stat->global_default_cipher_key_size; + mutex_lock(&key_tfm_list_mutex); + if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, + NULL)) { + rc = ecryptfs_add_new_key_tfm( + NULL, mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size); + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } + } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !ecryptfs_tfm_exists( + mount_crypt_stat->global_default_fn_cipher_name, NULL)) { + rc = ecryptfs_add_new_key_tfm( + NULL, mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, 
+ mount_crypt_stat->global_default_fn_cipher_key_bytes, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } + } + mutex_unlock(&key_tfm_list_mutex); + rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); + if (rc) + printk(KERN_WARNING "One or more global auth toks could not " + "properly register; rc = [%d]\n", rc); +out: + return rc; +} + +struct kmem_cache *ecryptfs_sb_info_cache; +static struct file_system_type ecryptfs_fs_type; + +/** + * ecryptfs_get_sb + * @fs_type + * @flags + * @dev_name: The path to mount over + * @raw_data: The options passed into the kernel + */ +static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct super_block *s; + struct ecryptfs_sb_info *sbi; + struct ecryptfs_dentry_info *root_info; + const char *err = "Getting sb failed"; + struct inode *inode; + struct path path; + uid_t check_ruid; + int rc; + + sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); + if (!sbi) { + rc = -ENOMEM; + goto out; + } + + rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); + if (rc) { + err = "Error parsing options"; + goto out; + } + + s = sget(fs_type, NULL, set_anon_super, NULL); + if (IS_ERR(s)) { + rc = PTR_ERR(s); + goto out; + } + + s->s_flags = flags; + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); + if (rc) + goto out1; + + ecryptfs_set_superblock_private(s, sbi); + s->s_bdi = &sbi->bdi; + + /* ->kill_sb() will take care of sbi after that point */ + sbi = NULL; + s->s_op = &ecryptfs_sops; + s->s_d_op = &ecryptfs_dops; + + err = "Reading sb failed"; + rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); + if (rc) { + ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); + goto out1; + } + if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { + rc = -EINVAL; + printk(KERN_ERR "Mount on filesystem of type " + "eCryptfs explicitly disallowed due to " + "known incompatibilities\n"); + goto out_free; + } + + if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) { + rc = -EPERM; + printk(KERN_ERR "Mount of device (uid: %d) not owned by " + "requested user (uid: %d)\n", + path.dentry->d_inode->i_uid, current_uid()); + goto out_free; + } + + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); + s->s_maxbytes = path.dentry->d_sb->s_maxbytes; + s->s_blocksize = path.dentry->d_sb->s_blocksize; + s->s_magic = ECRYPTFS_SUPER_MAGIC; + + inode = ecryptfs_get_inode(path.dentry->d_inode, s); + rc = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_free; + + s->s_root = d_alloc_root(inode); + if (!s->s_root) { + iput(inode); + rc = -ENOMEM; + goto out_free; + } + + rc = -ENOMEM; + root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + if (!root_info) + goto out_free; + + /* ->kill_sb() will take care of root_info */ + ecryptfs_set_dentry_private(s->s_root, root_info); + ecryptfs_set_dentry_lower(s->s_root, path.dentry); + ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt); + + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); + +out_free: + path_put(&path); +out1: + deactivate_locked_super(s); +out: + if (sbi) { + ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); + kmem_cache_free(ecryptfs_sb_info_cache, sbi); + } + printk(KERN_ERR "%s; rc = [%d]\n", err, rc); + return ERR_PTR(rc); +} + +/** + * ecryptfs_kill_block_super + * @sb: The ecryptfs super block + * + * Used to bring the superblock down and free the private data. 
+ */ +static void ecryptfs_kill_block_super(struct super_block *sb) +{ + struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); + kill_anon_super(sb); + if (!sb_info) + return; + ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); + bdi_destroy(&sb_info->bdi); + kmem_cache_free(ecryptfs_sb_info_cache, sb_info); +} + +static struct file_system_type ecryptfs_fs_type = { + .owner = THIS_MODULE, + .name = "ecryptfs", + .mount = ecryptfs_mount, + .kill_sb = ecryptfs_kill_block_super, + .fs_flags = 0 +}; + +/** + * inode_info_init_once + * + * Initializes the ecryptfs_inode_info_cache when it is created + */ +static void +inode_info_init_once(void *vptr) +{ + struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; + + inode_init_once(&ei->vfs_inode); +} + +static struct ecryptfs_cache_info { + struct kmem_cache **cache; + const char *name; + size_t size; + void (*ctor)(void *obj); +} ecryptfs_cache_infos[] = { + { + .cache = &ecryptfs_auth_tok_list_item_cache, + .name = "ecryptfs_auth_tok_list_item", + .size = sizeof(struct ecryptfs_auth_tok_list_item), + }, + { + .cache = &ecryptfs_file_info_cache, + .name = "ecryptfs_file_cache", + .size = sizeof(struct ecryptfs_file_info), + }, + { + .cache = &ecryptfs_dentry_info_cache, + .name = "ecryptfs_dentry_info_cache", + .size = sizeof(struct ecryptfs_dentry_info), + }, + { + .cache = &ecryptfs_inode_info_cache, + .name = "ecryptfs_inode_cache", + .size = sizeof(struct ecryptfs_inode_info), + .ctor = inode_info_init_once, + }, + { + .cache = &ecryptfs_sb_info_cache, + .name = "ecryptfs_sb_cache", + .size = sizeof(struct ecryptfs_sb_info), + }, + { + .cache = &ecryptfs_header_cache, + .name = "ecryptfs_headers", + .size = PAGE_CACHE_SIZE, + }, + { + .cache = &ecryptfs_xattr_cache, + .name = "ecryptfs_xattr_cache", + .size = PAGE_CACHE_SIZE, + }, + { + .cache = &ecryptfs_key_record_cache, + .name = "ecryptfs_key_record_cache", + .size = sizeof(struct ecryptfs_key_record), + }, + { + .cache = &ecryptfs_key_sig_cache, + .name = "ecryptfs_key_sig_cache", + .size = sizeof(struct ecryptfs_key_sig), + }, + { + .cache = &ecryptfs_global_auth_tok_cache, + .name = "ecryptfs_global_auth_tok_cache", + .size = sizeof(struct ecryptfs_global_auth_tok), + }, + { + .cache = &ecryptfs_key_tfm_cache, + .name = "ecryptfs_key_tfm_cache", + .size = sizeof(struct ecryptfs_key_tfm), + }, + { + .cache = &ecryptfs_open_req_cache, + .name = "ecryptfs_open_req_cache", + .size = sizeof(struct ecryptfs_open_req), + }, +}; + +static void ecryptfs_free_kmem_caches(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { + struct ecryptfs_cache_info *info; + + info = &ecryptfs_cache_infos[i]; + if (*(info->cache)) + kmem_cache_destroy(*(info->cache)); + } +} + +/** + * ecryptfs_init_kmem_caches + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_init_kmem_caches(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { + struct ecryptfs_cache_info *info; + + info = &ecryptfs_cache_infos[i]; + *(info->cache) = kmem_cache_create(info->name, info->size, + 0, SLAB_HWCACHE_ALIGN, info->ctor); + if (!*(info->cache)) { + ecryptfs_free_kmem_caches(); + ecryptfs_printk(KERN_WARNING, "%s: " + "kmem_cache_create failed\n", + info->name); + return -ENOMEM; + } + } + return 0; +} + +static struct kobject *ecryptfs_kobj; + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) +{ + return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); +} + 
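version_show() above, together with the version_attr and the "ecryptfs" kobject that do_sysfs_registration() below creates under fs_kobj, exposes ECRYPTFS_VERSIONING_MASK to userspace; the attribute should therefore appear as /sys/fs/ecryptfs/version. The following is only an illustrative userspace sketch, not something this patch adds: the path and the plain-decimal format are inferred from version_show() and kobject_create_and_add("ecryptfs", fs_kobj), and the meaning of the individual bits is defined by the versioning flags behind ECRYPTFS_VERSIONING_MASK in ecryptfs_kernel.h.

    /* Sketch: read the eCryptfs capability mask exposed via sysfs.
     * Assumes sysfs is mounted at /sys, so fs_kobj maps to /sys/fs. */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/fs/ecryptfs/version", "r");
            int mask;

            if (!f) {
                    perror("fopen /sys/fs/ecryptfs/version");
                    return 1;
            }
            /* version_show() emits the mask as a plain decimal integer */
            if (fscanf(f, "%d", &mask) != 1) {
                    fclose(f);
                    return 1;
            }
            fclose(f);
            printf("eCryptfs versioning mask: 0x%08x\n", (unsigned int)mask);
            return 0;
    }
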
+static struct kobj_attribute version_attr = __ATTR_RO(version); + +static struct attribute *attributes[] = { + &version_attr.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attributes, +}; + +static int do_sysfs_registration(void) +{ + int rc; + + ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj); + if (!ecryptfs_kobj) { + printk(KERN_ERR "Unable to create ecryptfs kset\n"); + rc = -ENOMEM; + goto out; + } + rc = sysfs_create_group(ecryptfs_kobj, &attr_group); + if (rc) { + printk(KERN_ERR + "Unable to create ecryptfs version attributes\n"); + kobject_put(ecryptfs_kobj); + } +out: + return rc; +} + +static void do_sysfs_unregistration(void) +{ + sysfs_remove_group(ecryptfs_kobj, &attr_group); + kobject_put(ecryptfs_kobj); +} + +static int __init ecryptfs_init(void) +{ + int rc; + + if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " + "larger than the host's page size, and so " + "eCryptfs cannot run on this system. The " + "default eCryptfs extent size is [%u] bytes; " + "the page size is [%lu] bytes.\n", + ECRYPTFS_DEFAULT_EXTENT_SIZE, + (unsigned long)PAGE_CACHE_SIZE); + goto out; + } + rc = ecryptfs_init_kmem_caches(); + if (rc) { + printk(KERN_ERR + "Failed to allocate one or more kmem_cache objects\n"); + goto out; + } + rc = register_filesystem(&ecryptfs_fs_type); + if (rc) { + printk(KERN_ERR "Failed to register filesystem\n"); + goto out_free_kmem_caches; + } + rc = do_sysfs_registration(); + if (rc) { + printk(KERN_ERR "sysfs registration failed\n"); + goto out_unregister_filesystem; + } + rc = ecryptfs_init_kthread(); + if (rc) { + printk(KERN_ERR "%s: kthread initialization failed; " + "rc = [%d]\n", __func__, rc); + goto out_do_sysfs_unregistration; + } + rc = ecryptfs_init_messaging(); + if (rc) { + printk(KERN_ERR "Failure occurred while attempting to " + "initialize the communications channel to " + "ecryptfsd\n"); + goto out_destroy_kthread; + } + rc = ecryptfs_init_crypto(); + if (rc) { + printk(KERN_ERR "Failure whilst attempting to init crypto; " + "rc = [%d]\n", rc); + goto out_release_messaging; + } + if (ecryptfs_verbosity > 0) + printk(KERN_CRIT "eCryptfs verbosity set to %d. Secret values " + "will be written to the syslog!\n", ecryptfs_verbosity); + + goto out; +out_release_messaging: + ecryptfs_release_messaging(); +out_destroy_kthread: + ecryptfs_destroy_kthread(); +out_do_sysfs_unregistration: + do_sysfs_unregistration(); +out_unregister_filesystem: + unregister_filesystem(&ecryptfs_fs_type); +out_free_kmem_caches: + ecryptfs_free_kmem_caches(); +out: + return rc; +} + +static void __exit ecryptfs_exit(void) +{ + int rc; + + rc = ecryptfs_destroy_crypto(); + if (rc) + printk(KERN_ERR "Failure whilst attempting to destroy crypto; " + "rc = [%d]\n", rc); + ecryptfs_release_messaging(); + ecryptfs_destroy_kthread(); + do_sysfs_unregistration(); + unregister_filesystem(&ecryptfs_fs_type); + ecryptfs_free_kmem_caches(); +} + +MODULE_AUTHOR("Michael A. Halcrow "); +MODULE_DESCRIPTION("eCryptfs"); + +MODULE_LICENSE("GPL"); + +module_init(ecryptfs_init) +module_exit(ecryptfs_exit) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c new file mode 100644 index 00000000..ab224809 --- /dev/null +++ b/fs/ecryptfs/messaging.c @@ -0,0 +1,578 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2004-2008 International Business Machines Corp. + * Author(s): Michael A. 
Halcrow + * Tyler Hicks + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static LIST_HEAD(ecryptfs_msg_ctx_free_list); +static LIST_HEAD(ecryptfs_msg_ctx_alloc_list); +static struct mutex ecryptfs_msg_ctx_lists_mux; + +static struct hlist_head *ecryptfs_daemon_hash; +struct mutex ecryptfs_daemon_hash_mux; +static int ecryptfs_hash_bits; +#define ecryptfs_uid_hash(uid) \ + hash_long((unsigned long)uid, ecryptfs_hash_bits) + +static u32 ecryptfs_msg_counter; +static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; + +/** + * ecryptfs_acquire_free_msg_ctx + * @msg_ctx: The context that was acquired from the free list + * + * Acquires a context element from the free list and locks the mutex + * on the context. Sets the msg_ctx task to current. Returns zero on + * success; non-zero on error or upon failure to acquire a free + * context element. Must be called with ecryptfs_msg_ctx_lists_mux + * held. + */ +static int ecryptfs_acquire_free_msg_ctx(struct ecryptfs_msg_ctx **msg_ctx) +{ + struct list_head *p; + int rc; + + if (list_empty(&ecryptfs_msg_ctx_free_list)) { + printk(KERN_WARNING "%s: The eCryptfs free " + "context list is empty. It may be helpful to " + "specify the ecryptfs_message_buf_len " + "parameter to be greater than the current " + "value of [%d]\n", __func__, ecryptfs_message_buf_len); + rc = -ENOMEM; + goto out; + } + list_for_each(p, &ecryptfs_msg_ctx_free_list) { + *msg_ctx = list_entry(p, struct ecryptfs_msg_ctx, node); + if (mutex_trylock(&(*msg_ctx)->mux)) { + (*msg_ctx)->task = current; + rc = 0; + goto out; + } + } + rc = -ENOMEM; +out: + return rc; +} + +/** + * ecryptfs_msg_ctx_free_to_alloc + * @msg_ctx: The context to move from the free list to the alloc list + * + * Must be called with ecryptfs_msg_ctx_lists_mux held. + */ +static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) +{ + list_move(&msg_ctx->node, &ecryptfs_msg_ctx_alloc_list); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_PENDING; + msg_ctx->counter = ++ecryptfs_msg_counter; +} + +/** + * ecryptfs_msg_ctx_alloc_to_free + * @msg_ctx: The context to move from the alloc list to the free list + * + * Must be called with ecryptfs_msg_ctx_lists_mux held. + */ +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) +{ + list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); + if (msg_ctx->msg) + kfree(msg_ctx->msg); + msg_ctx->msg = NULL; + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE; +} + +/** + * ecryptfs_find_daemon_by_euid + * @euid: The effective user id which maps to the desired daemon id + * @user_ns: The namespace in which @euid applies + * @daemon: If return value is zero, points to the desired daemon pointer + * + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Search the hash list for the given user id. 
+ * + * Returns zero if the user id exists in the list; non-zero otherwise. + */ +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns) +{ + struct hlist_node *elem; + int rc; + + hlist_for_each_entry(*daemon, elem, + &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)], + euid_chain) { + if ((*daemon)->euid == euid && (*daemon)->user_ns == user_ns) { + rc = 0; + goto out; + } + } + rc = -EINVAL; +out: + return rc; +} + +/** + * ecryptfs_spawn_daemon - Create and initialize a new daemon struct + * @daemon: Pointer to set to newly allocated daemon struct + * @euid: Effective user id for the daemon + * @user_ns: The namespace in which @euid applies + * @pid: Process id for the daemon + * + * Must be called ceremoniously while in possession of + * ecryptfs_sacred_daemon_hash_mux + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns, struct pid *pid) +{ + int rc = 0; + + (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); + if (!(*daemon)) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " + "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); + goto out; + } + (*daemon)->euid = euid; + (*daemon)->user_ns = get_user_ns(user_ns); + (*daemon)->pid = get_pid(pid); + (*daemon)->task = current; + mutex_init(&(*daemon)->mux); + INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue); + init_waitqueue_head(&(*daemon)->wait); + (*daemon)->num_queued_msg_ctx = 0; + hlist_add_head(&(*daemon)->euid_chain, + &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)]); +out: + return rc; +} + +/** + * ecryptfs_exorcise_daemon - Destroy the daemon struct + * + * Must be called ceremoniously while in possession of + * ecryptfs_daemon_hash_mux and the daemon's own mux. + */ +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon) +{ + struct ecryptfs_msg_ctx *msg_ctx, *msg_ctx_tmp; + int rc = 0; + + mutex_lock(&daemon->mux); + if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ) + || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) { + rc = -EBUSY; + printk(KERN_WARNING "%s: Attempt to destroy daemon with pid " + "[0x%p], but it is in the midst of a read or a poll\n", + __func__, daemon->pid); + mutex_unlock(&daemon->mux); + goto out; + } + list_for_each_entry_safe(msg_ctx, msg_ctx_tmp, + &daemon->msg_ctx_out_queue, daemon_out_list) { + list_del(&msg_ctx->daemon_out_list); + daemon->num_queued_msg_ctx--; + printk(KERN_WARNING "%s: Warning: dropping message that is in " + "the out queue of a dying daemon\n", __func__); + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); + } + hlist_del(&daemon->euid_chain); + if (daemon->task) + wake_up_process(daemon->task); + if (daemon->pid) + put_pid(daemon->pid); + if (daemon->user_ns) + put_user_ns(daemon->user_ns); + mutex_unlock(&daemon->mux); + kzfree(daemon); +out: + return rc; +} + +/** + * ecryptfs_process_quit + * @euid: The user ID owner of the message + * @user_ns: The namespace in which @euid applies + * @pid: The process ID for the userspace program that sent the + * message + * + * Deletes the corresponding daemon for the given euid and pid, if + * it is the registered that is requesting the deletion. Returns zero + * after deleting the desired daemon; non-zero otherwise. 
+ */ +int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) +{ + struct ecryptfs_daemon *daemon; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, user_ns); + if (rc || !daemon) { + rc = -EINVAL; + printk(KERN_ERR "Received request from user [%d] to " + "unregister unrecognized daemon [0x%p]\n", euid, pid); + goto out_unlock; + } + rc = ecryptfs_exorcise_daemon(daemon); +out_unlock: + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_process_reponse + * @msg: The ecryptfs message received; the caller should sanity check + * msg->data_len and free the memory + * @pid: The process ID of the userspace application that sent the + * message + * @seq: The sequence number of the message; must match the sequence + * number for the existing message context waiting for this + * response + * + * Processes a response message after sending an operation request to + * userspace. Some other process is awaiting this response. Before + * sending out its first communications, the other process allocated a + * msg_ctx from the ecryptfs_msg_ctx_arr at a particular index. The + * response message contains this index so that we can copy over the + * response message into the msg_ctx that the process holds a + * reference to. The other process is going to wake up, check to see + * that msg_ctx->state == ECRYPTFS_MSG_CTX_STATE_DONE, and then + * proceed to read off and process the response message. Returns zero + * upon delivery to desired context element; non-zero upon delivery + * failure or error. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, + struct user_namespace *user_ns, struct pid *pid, + u32 seq) +{ + struct ecryptfs_daemon *uninitialized_var(daemon); + struct ecryptfs_msg_ctx *msg_ctx; + size_t msg_size; + struct nsproxy *nsproxy; + struct user_namespace *tsk_user_ns; + uid_t ctx_euid; + int rc; + + if (msg->index >= ecryptfs_message_buf_len) { + rc = -EINVAL; + printk(KERN_ERR "%s: Attempt to reference " + "context buffer at index [%d]; maximum " + "allowable is [%d]\n", __func__, msg->index, + (ecryptfs_message_buf_len - 1)); + goto out; + } + msg_ctx = &ecryptfs_msg_ctx_arr[msg->index]; + mutex_lock(&msg_ctx->mux); + mutex_lock(&ecryptfs_daemon_hash_mux); + rcu_read_lock(); + nsproxy = task_nsproxy(msg_ctx->task); + if (nsproxy == NULL) { + rc = -EBADMSG; + printk(KERN_ERR "%s: Receiving process is a zombie. 
Dropping " + "message.\n", __func__); + rcu_read_unlock(); + mutex_unlock(&ecryptfs_daemon_hash_mux); + goto wake_up; + } + tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns; + ctx_euid = task_euid(msg_ctx->task); + rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns); + rcu_read_unlock(); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (rc) { + rc = -EBADMSG; + printk(KERN_WARNING "%s: User [%d] received a " + "message response from process [0x%p] but does " + "not have a registered daemon\n", __func__, + ctx_euid, pid); + goto wake_up; + } + if (ctx_euid != euid) { + rc = -EBADMSG; + printk(KERN_WARNING "%s: Received message from user " + "[%d]; expected message from user [%d]\n", __func__, + euid, ctx_euid); + goto unlock; + } + if (tsk_user_ns != user_ns) { + rc = -EBADMSG; + printk(KERN_WARNING "%s: Received message from user_ns " + "[0x%p]; expected message from user_ns [0x%p]\n", + __func__, user_ns, tsk_user_ns); + goto unlock; + } + if (daemon->pid != pid) { + rc = -EBADMSG; + printk(KERN_ERR "%s: User [%d] sent a message response " + "from an unrecognized process [0x%p]\n", + __func__, ctx_euid, pid); + goto unlock; + } + if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) { + rc = -EINVAL; + printk(KERN_WARNING "%s: Desired context element is not " + "pending a response\n", __func__); + goto unlock; + } else if (msg_ctx->counter != seq) { + rc = -EINVAL; + printk(KERN_WARNING "%s: Invalid message sequence; " + "expected [%d]; received [%d]\n", __func__, + msg_ctx->counter, seq); + goto unlock; + } + msg_size = (sizeof(*msg) + msg->data_len); + msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " + "GFP_KERNEL memory\n", __func__, msg_size); + goto unlock; + } + memcpy(msg_ctx->msg, msg, msg_size); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; + rc = 0; +wake_up: + wake_up_process(msg_ctx->task); +unlock: + mutex_unlock(&msg_ctx->mux); +out: + return rc; +} + +/** + * ecryptfs_send_message_locked + * @data: The data to send + * @data_len: The length of data + * @msg_ctx: The message context allocated for the send + * + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, + struct ecryptfs_msg_ctx **msg_ctx) +{ + struct ecryptfs_daemon *daemon; + uid_t euid = current_euid(); + int rc; + + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + if (rc || !daemon) { + rc = -ENOTCONN; + printk(KERN_ERR "%s: User [%d] does not have a daemon " + "registered\n", __func__, euid); + goto out; + } + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_acquire_free_msg_ctx(msg_ctx); + if (rc) { + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + printk(KERN_WARNING "%s: Could not claim a free " + "context element\n", __func__); + goto out; + } + ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); + mutex_unlock(&(*msg_ctx)->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, 0, + daemon); + if (rc) + printk(KERN_ERR "%s: Error attempting to send message to " + "userspace daemon; rc = [%d]\n", __func__, rc); +out: + return rc; +} + +/** + * ecryptfs_send_message + * @data: The data to send + * @data_len: The length of data + * @msg_ctx: The message context allocated for the send + * + * Grabs ecryptfs_daemon_hash_mux. 
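+ * Internally this is a locking wrapper around
+ * ecryptfs_send_message_locked(), which claims a free message context
+ * and hands the request to the registered daemon over the miscdev
+ * channel.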
+ * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST, + msg_ctx); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_wait_for_response + * @msg_ctx: The context that was assigned when sending a message + * @msg: The incoming message from userspace; not set if rc != 0 + * + * Sleeps until awaken by ecryptfs_receive_message or until the amount + * of time exceeds ecryptfs_message_wait_timeout. If zero is + * returned, msg will point to a valid message from userspace; a + * non-zero value is returned upon failure to receive a message or an + * error occurs. Callee must free @msg on success. + */ +int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **msg) +{ + signed long timeout = ecryptfs_message_wait_timeout * HZ; + int rc = 0; + +sleep: + timeout = schedule_timeout_interruptible(timeout); + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + mutex_lock(&msg_ctx->mux); + if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_DONE) { + if (timeout) { + mutex_unlock(&msg_ctx->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + goto sleep; + } + rc = -ENOMSG; + } else { + *msg = msg_ctx->msg; + msg_ctx->msg = NULL; + } + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); + mutex_unlock(&msg_ctx->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + return rc; +} + +int __init ecryptfs_init_messaging(void) +{ + int i; + int rc = 0; + + if (ecryptfs_number_of_users > ECRYPTFS_MAX_NUM_USERS) { + ecryptfs_number_of_users = ECRYPTFS_MAX_NUM_USERS; + printk(KERN_WARNING "%s: Specified number of users is " + "too large, defaulting to [%d] users\n", __func__, + ecryptfs_number_of_users); + } + mutex_init(&ecryptfs_daemon_hash_mux); + mutex_lock(&ecryptfs_daemon_hash_mux); + ecryptfs_hash_bits = 1; + while (ecryptfs_number_of_users >> ecryptfs_hash_bits) + ecryptfs_hash_bits++; + ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) + * (1 << ecryptfs_hash_bits)), + GFP_KERNEL); + if (!ecryptfs_daemon_hash) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); + mutex_unlock(&ecryptfs_daemon_hash_mux); + goto out; + } + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) + INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); + mutex_unlock(&ecryptfs_daemon_hash_mux); + ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) + * ecryptfs_message_buf_len), + GFP_KERNEL); + if (!ecryptfs_msg_ctx_arr) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); + goto out; + } + mutex_init(&ecryptfs_msg_ctx_lists_mux); + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + ecryptfs_msg_counter = 0; + for (i = 0; i < ecryptfs_message_buf_len; i++) { + INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].node); + INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].daemon_out_list); + mutex_init(&ecryptfs_msg_ctx_arr[i].mux); + mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); + ecryptfs_msg_ctx_arr[i].index = i; + ecryptfs_msg_ctx_arr[i].state = ECRYPTFS_MSG_CTX_STATE_FREE; + ecryptfs_msg_ctx_arr[i].counter = 0; + ecryptfs_msg_ctx_arr[i].task = NULL; + ecryptfs_msg_ctx_arr[i].msg = NULL; + list_add_tail(&ecryptfs_msg_ctx_arr[i].node, + &ecryptfs_msg_ctx_free_list); + mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); + } + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_init_ecryptfs_miscdev(); + if (rc) + ecryptfs_release_messaging(); +out: + 
return rc; +} + +void ecryptfs_release_messaging(void) +{ + if (ecryptfs_msg_ctx_arr) { + int i; + + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + for (i = 0; i < ecryptfs_message_buf_len; i++) { + mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); + if (ecryptfs_msg_ctx_arr[i].msg) + kfree(ecryptfs_msg_ctx_arr[i].msg); + mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); + } + kfree(ecryptfs_msg_ctx_arr); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + } + if (ecryptfs_daemon_hash) { + struct hlist_node *elem; + struct ecryptfs_daemon *daemon; + int i; + + mutex_lock(&ecryptfs_daemon_hash_mux); + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { + int rc; + + hlist_for_each_entry(daemon, elem, + &ecryptfs_daemon_hash[i], + euid_chain) { + rc = ecryptfs_exorcise_daemon(daemon); + if (rc) + printk(KERN_ERR "%s: Error whilst " + "attempting to destroy daemon; " + "rc = [%d]. Dazed and confused, " + "but trying to continue.\n", + __func__, rc); + } + } + kfree(ecryptfs_daemon_hash); + mutex_unlock(&ecryptfs_daemon_hash_mux); + } + ecryptfs_destroy_ecryptfs_miscdev(); + return; +} diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c new file mode 100644 index 00000000..0dc5a3d5 --- /dev/null +++ b/fs/ecryptfs/miscdev.c @@ -0,0 +1,547 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +static atomic_t ecryptfs_num_miscdev_opens; + +/** + * ecryptfs_miscdev_poll + * @file: dev file (ignored) + * @pt: dev poll table (ignored) + * + * Returns the poll mask + */ +static unsigned int +ecryptfs_miscdev_poll(struct file *file, poll_table *pt) +{ + struct ecryptfs_daemon *daemon; + unsigned int mask = 0; + uid_t euid = current_euid(); + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? 
*/ + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + printk(KERN_WARNING "%s: Attempt to poll on zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) + goto out_unlock_daemon; + if (daemon->flags & ECRYPTFS_DAEMON_IN_POLL) + goto out_unlock_daemon; + daemon->flags |= ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + poll_wait(file, &daemon->wait, pt); + mutex_lock(&daemon->mux); + if (!list_empty(&daemon->msg_ctx_out_queue)) + mask |= POLLIN | POLLRDNORM; +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + return mask; +} + +/** + * ecryptfs_miscdev_open + * @inode: inode of miscdev handle (ignored) + * @file: file for miscdev handle (ignored) + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_open(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = NULL; + uid_t euid = current_euid(); + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = try_module_get(THIS_MODULE); + if (rc == 0) { + rc = -EIO; + printk(KERN_ERR "%s: Error attempting to increment module use " + "count; rc = [%d]\n", __func__, rc); + goto out_unlock_daemon_list; + } + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + if (rc || !daemon) { + rc = ecryptfs_spawn_daemon(&daemon, euid, current_user_ns(), + task_pid(current)); + if (rc) { + printk(KERN_ERR "%s: Error attempting to spawn daemon; " + "rc = [%d]\n", __func__, rc); + goto out_module_put_unlock_daemon_list; + } + } + mutex_lock(&daemon->mux); + if (daemon->pid != task_pid(current)) { + rc = -EINVAL; + printk(KERN_ERR "%s: pid [0x%p] has registered with euid [%d], " + "but pid [0x%p] has attempted to open the handle " + "instead\n", __func__, daemon->pid, daemon->euid, + task_pid(current)); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { + rc = -EBUSY; + printk(KERN_ERR "%s: Miscellaneous device handle may only be " + "opened once per daemon; pid [0x%p] already has this " + "handle open\n", __func__, daemon->pid); + goto out_unlock_daemon; + } + daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_inc(&ecryptfs_num_miscdev_opens); +out_unlock_daemon: + mutex_unlock(&daemon->mux); +out_module_put_unlock_daemon_list: + if (rc) + module_put(THIS_MODULE); +out_unlock_daemon_list: + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_miscdev_release + * @inode: inode of fs/ecryptfs/euid handle (ignored) + * @file: file for fs/ecryptfs/euid handle (ignored) + * + * This keeps the daemon registered until the daemon sends another + * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters. 
+ * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_release(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = NULL; + uid_t euid = current_euid(); + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + BUG_ON(daemon->pid != task_pid(current)); + BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN)); + daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_dec(&ecryptfs_num_miscdev_opens); + mutex_unlock(&daemon->mux); + rc = ecryptfs_exorcise_daemon(daemon); + if (rc) { + printk(KERN_CRIT "%s: Fatal error whilst attempting to " + "shut down daemon; rc = [%d]. Please report this " + "bug.\n", __func__, rc); + BUG(); + } + module_put(THIS_MODULE); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_send_miscdev + * @data: Data to send to daemon; may be NULL + * @data_size: Amount of data to send to daemon + * @msg_ctx: Message context, which is used to handle the reply. If + * this is NULL, then we do not expect a reply. + * @msg_type: Type of message + * @msg_flags: Flags for message + * @daemon: eCryptfs daemon object + * + * Add msg_ctx to queue and then, if it exists, notify the blocked + * miscdevess about the data being available. Must be called with + * ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon) +{ + int rc = 0; + + mutex_lock(&msg_ctx->mux); + msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), + GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, + (sizeof(*msg_ctx->msg) + data_size)); + goto out_unlock; + } + msg_ctx->msg->index = msg_ctx->index; + msg_ctx->msg->data_len = data_size; + msg_ctx->type = msg_type; + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); + mutex_lock(&daemon->mux); + list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); + daemon->num_queued_msg_ctx++; + wake_up_interruptible(&daemon->wait); + mutex_unlock(&daemon->mux); +out_unlock: + mutex_unlock(&msg_ctx->mux); + return rc; +} + +/** + * ecryptfs_miscdev_read - format and send message from queue + * @file: fs/ecryptfs/euid miscdevfs handle (ignored) + * @buf: User buffer into which to copy the next message on the daemon queue + * @count: Amount of space available in @buf + * @ppos: Offset in file (ignored) + * + * Pulls the most recent message from the daemon queue, formats it for + * being sent via a miscdevfs handle, and copies it into @buf + * + * Returns the number of bytes copied into the user buffer + */ +static ssize_t +ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct ecryptfs_daemon *daemon; + struct ecryptfs_msg_ctx *msg_ctx; + size_t packet_length_size; + char packet_length[3]; + size_t i; + size_t total_length; + uid_t euid = current_euid(); + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? 
*/ + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + mutex_unlock(&ecryptfs_daemon_hash_mux); + printk(KERN_WARNING "%s: Attempt to read from zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) { + rc = 0; + mutex_unlock(&ecryptfs_daemon_hash_mux); + goto out_unlock_daemon; + } + /* This daemon will not go away so long as this flag is set */ + daemon->flags |= ECRYPTFS_DAEMON_IN_READ; + mutex_unlock(&ecryptfs_daemon_hash_mux); +check_list: + if (list_empty(&daemon->msg_ctx_out_queue)) { + mutex_unlock(&daemon->mux); + rc = wait_event_interruptible( + daemon->wait, !list_empty(&daemon->msg_ctx_out_queue)); + mutex_lock(&daemon->mux); + if (rc < 0) { + rc = 0; + goto out_unlock_daemon; + } + } + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + goto out_unlock_daemon; + } + if (list_empty(&daemon->msg_ctx_out_queue)) { + /* Something else jumped in since the + * wait_event_interruptable() and removed the + * message from the queue; try again */ + goto check_list; + } + BUG_ON(euid != daemon->euid); + BUG_ON(current_user_ns() != daemon->user_ns); + BUG_ON(task_pid(current) != daemon->pid); + msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, + struct ecryptfs_msg_ctx, daemon_out_list); + BUG_ON(!msg_ctx); + mutex_lock(&msg_ctx->mux); + if (msg_ctx->msg) { + rc = ecryptfs_write_packet_length(packet_length, + msg_ctx->msg_size, + &packet_length_size); + if (rc) { + rc = 0; + printk(KERN_WARNING "%s: Error writing packet length; " + "rc = [%d]\n", __func__, rc); + goto out_unlock_msg_ctx; + } + } else { + packet_length_size = 0; + msg_ctx->msg_size = 0; + } + /* miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Octets 5-N1 not written if the packet type does not + * include a message */ + total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size); + if (count < total_length) { + rc = 0; + printk(KERN_WARNING "%s: Only given user buffer of " + "size [%zd], but we need [%zd] to read the " + "pending message\n", __func__, count, total_length); + goto out_unlock_msg_ctx; + } + rc = -EFAULT; + if (put_user(msg_ctx->type, buf)) + goto out_unlock_msg_ctx; + if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1))) + goto out_unlock_msg_ctx; + i = 5; + if (msg_ctx->msg) { + if (copy_to_user(&buf[i], packet_length, packet_length_size)) + goto out_unlock_msg_ctx; + i += packet_length_size; + if (copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size)) + goto out_unlock_msg_ctx; + i += msg_ctx->msg_size; + } + rc = i; + list_del(&msg_ctx->daemon_out_list); + kfree(msg_ctx->msg); + msg_ctx->msg = NULL; + /* We do not expect a reply from the userspace daemon for any + * message type other than ECRYPTFS_MSG_REQUEST */ + if (msg_ctx->type != ECRYPTFS_MSG_REQUEST) + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); +out_unlock_msg_ctx: + mutex_unlock(&msg_ctx->mux); +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_READ; + mutex_unlock(&daemon->mux); + return rc; +} + +/** + * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon + * @data: Bytes comprising struct ecryptfs_message + * @data_size: sizeof(struct ecryptfs_message) + data len + * @euid: Effective user id of miscdevess sending 
the miscdev response + * @user_ns: The namespace in which @euid applies + * @pid: Miscdevess id of miscdevess sending the miscdev response + * @seq: Sequence number for miscdev response packet + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_miscdev_response(char *data, size_t data_size, + uid_t euid, struct user_namespace *user_ns, + struct pid *pid, u32 seq) +{ + struct ecryptfs_message *msg = (struct ecryptfs_message *)data; + int rc; + + if ((sizeof(*msg) + msg->data_len) != data_size) { + printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " + "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__, + (sizeof(*msg) + msg->data_len), data_size); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_process_response(msg, euid, user_ns, pid, seq); + if (rc) + printk(KERN_ERR + "Error processing response message; rc = [%d]\n", rc); +out: + return rc; +} + +/** + * ecryptfs_miscdev_write - handle write to daemon miscdev handle + * @file: File for misc dev handle (ignored) + * @buf: Buffer containing user data + * @count: Amount of data in @buf + * @ppos: Pointer to offset in file (ignored) + * + * miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter (0's for non-response) + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Returns the number of bytes read from @buf + */ +static ssize_t +ecryptfs_miscdev_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + __be32 counter_nbo; + u32 seq; + size_t packet_size, packet_size_length, i; + ssize_t sz = 0; + char *data; + uid_t euid = current_euid(); + unsigned char packet_size_peek[3]; + int rc; + + if (count == 0) { + goto out; + } else if (count == (1 + 4)) { + /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */ + goto memdup; + } else if (count < (1 + 4 + 1) + || count > (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4 + + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)) { + printk(KERN_WARNING "%s: Acceptable packet size range is " + "[%d-%lu], but amount of data written is [%zu].", + __func__, (1 + 4 + 1), + (1 + 4 + 2 + sizeof(struct ecryptfs_message) + 4 + + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES), count); + return -EINVAL; + } + + if (copy_from_user(packet_size_peek, (buf + 1 + 4), + sizeof(packet_size_peek))) { + printk(KERN_WARNING "%s: Error while inspecting packet size\n", + __func__); + return -EFAULT; + } + + rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size, + &packet_size_length); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + return rc; + } + + if ((1 + 4 + packet_size_length + packet_size) != count) { + printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__, + packet_size); + return -EINVAL; + } + +memdup: + data = memdup_user(buf, count); + if (IS_ERR(data)) { + printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", + __func__, PTR_ERR(data)); + goto out; + } + sz = count; + i = 0; + switch (data[i++]) { + case ECRYPTFS_MSG_RESPONSE: + if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { + printk(KERN_WARNING "%s: Minimum acceptable packet " + "size is [%zd], but amount of data written is " + "only [%zd]. 
Discarding response packet.\n", + __func__, + (1 + 4 + 1 + sizeof(struct ecryptfs_message)), + count); + goto out_free; + } + memcpy(&counter_nbo, &data[i], 4); + seq = be32_to_cpu(counter_nbo); + i += 4 + packet_size_length; + rc = ecryptfs_miscdev_response(&data[i], packet_size, + euid, current_user_ns(), + task_pid(current), seq); + if (rc) + printk(KERN_WARNING "%s: Failed to deliver miscdev " + "response to requesting operation; rc = [%d]\n", + __func__, rc); + break; + case ECRYPTFS_MSG_HELO: + case ECRYPTFS_MSG_QUIT: + break; + default: + ecryptfs_printk(KERN_WARNING, "Dropping miscdev " + "message of unrecognized type [%d]\n", + data[0]); + break; + } +out_free: + kfree(data); +out: + return sz; +} + + +static const struct file_operations ecryptfs_miscdev_fops = { + .open = ecryptfs_miscdev_open, + .poll = ecryptfs_miscdev_poll, + .read = ecryptfs_miscdev_read, + .write = ecryptfs_miscdev_write, + .release = ecryptfs_miscdev_release, + .llseek = noop_llseek, +}; + +static struct miscdevice ecryptfs_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ecryptfs", + .fops = &ecryptfs_miscdev_fops +}; + +/** + * ecryptfs_init_ecryptfs_miscdev + * + * Messages sent to the userspace daemon from the kernel are placed on + * a queue associated with the daemon. The next read against the + * miscdev handle by that daemon will return the oldest message placed + * on the message queue for the daemon. + * + * Returns zero on success; non-zero otherwise + */ +int __init ecryptfs_init_ecryptfs_miscdev(void) +{ + int rc; + + atomic_set(&ecryptfs_num_miscdev_opens, 0); + rc = misc_register(&ecryptfs_miscdev); + if (rc) + printk(KERN_ERR "%s: Failed to register miscellaneous device " + "for communications with userspace daemons; rc = [%d]\n", + __func__, rc); + return rc; +} + +/** + * ecryptfs_destroy_ecryptfs_miscdev + * + * All of the daemons must be exorcised prior to calling this + * function. + */ +void ecryptfs_destroy_ecryptfs_miscdev(void) +{ + BUG_ON(atomic_read(&ecryptfs_num_miscdev_opens) != 0); + misc_deregister(&ecryptfs_miscdev); +} diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c new file mode 100644 index 00000000..6a44148c --- /dev/null +++ b/fs/ecryptfs/mmap.c @@ -0,0 +1,566 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * This is where eCryptfs coordinates the symmetric encryption and + * decryption of the file data as it passes between the lower + * encrypted file and the upper decrypted file. + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_get_locked_page + * + * Get one page from cache or lower f/s, return error otherwise. + * + * Returns locked and up-to-date page (if ok), with increased + * refcnt. + */ +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) +{ + struct page *page = read_mapping_page(inode->i_mapping, index, NULL); + if (!IS_ERR(page)) + lock_page(page); + return page; +} + +/** + * ecryptfs_writepage + * @page: Page that is locked before this call is made + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int rc; + + /* + * Refuse to write the page out if we are called from reclaim context + * since our writepage() path may potentially allocate memory when + * calling into the lower fs vfs_write() which may in turn invoke + * us again. + */ + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + rc = 0; + goto out; + } + + rc = ecryptfs_encrypt_page(page); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error encrypting " + "page (upper index [0x%.16lx])\n", page->index); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); +out: + unlock_page(page); + return rc; +} + +static void strip_xattr_flag(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + size_t written; + + crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR; + ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat, + &written); + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } +} + +/** + * Header Extent: + * Octets 0-7: Unencrypted file size (big-endian) + * Octets 8-15: eCryptfs special marker + * Octets 16-19: Flags + * Octet 16: File format version number (between 0 and 255) + * Octets 17-18: Reserved + * Octet 19: Bit 1 (lsb): Reserved + * Bit 2: Encrypted? + * Bits 3-8: Reserved + * Octets 20-23: Header extent size (big-endian) + * Octets 24-25: Number of header extents at front of file + * (big-endian) + * Octet 26: Begin RFC 2440 authentication token packet set + */ + +/** + * ecryptfs_copy_up_encrypted_with_header + * @page: Sort of a ``virtual'' representation of the encrypted lower + * file. The actual lower file does not have the metadata in + * the header. This is locked. + * @crypt_stat: The eCryptfs inode's cryptographic context + * + * The ``view'' is the version of the file that userspace winds up + * seeing, with the header information inserted. 
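+ * + * A worked example (illustrative numbers only): assuming 4096-byte pages and 4096-byte extents with crypt_stat->metadata_size == 8192, view extents 0 and 1 are synthesized header extents, while view extent 2 maps to lower offset (2 * 4096) - 8192 == 0, i.e. the first encrypted data extent of the lower file.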
+ */ +static int +ecryptfs_copy_up_encrypted_with_header(struct page *page, + struct ecryptfs_crypt_stat *crypt_stat) +{ + loff_t extent_num_in_page = 0; + loff_t num_extents_per_page = (PAGE_CACHE_SIZE + / crypt_stat->extent_size); + int rc = 0; + + while (extent_num_in_page < num_extents_per_page) { + loff_t view_extent_num = ((((loff_t)page->index) + * num_extents_per_page) + + extent_num_in_page); + size_t num_header_extents_at_front = + (crypt_stat->metadata_size / crypt_stat->extent_size); + + if (view_extent_num < num_header_extents_at_front) { + /* This is a header extent */ + char *page_virt; + + page_virt = kmap_atomic(page, KM_USER0); + memset(page_virt, 0, PAGE_CACHE_SIZE); + /* TODO: Support more than one header extent */ + if (view_extent_num == 0) { + size_t written; + + rc = ecryptfs_read_xattr_region( + page_virt, page->mapping->host); + strip_xattr_flag(page_virt + 16, crypt_stat); + ecryptfs_write_header_metadata(page_virt + 20, + crypt_stat, + &written); + } + kunmap_atomic(page_virt, KM_USER0); + flush_dcache_page(page); + if (rc) { + printk(KERN_ERR "%s: Error reading xattr " + "region; rc = [%d]\n", __func__, rc); + goto out; + } + } else { + /* This is an encrypted data extent */ + loff_t lower_offset = + ((view_extent_num * crypt_stat->extent_size) + - crypt_stat->metadata_size); + + rc = ecryptfs_read_lower_page_segment( + page, (lower_offset >> PAGE_CACHE_SHIFT), + (lower_offset & ~PAGE_CACHE_MASK), + crypt_stat->extent_size, page->mapping->host); + if (rc) { + printk(KERN_ERR "%s: Error attempting to read " + "extent at offset [%lld] in the lower " + "file; rc = [%d]\n", __func__, + lower_offset, rc); + goto out; + } + } + extent_num_in_page++; + } +out: + return rc; +} + +/** + * ecryptfs_readpage + * @file: An eCryptfs file + * @page: Page from eCryptfs inode mapping into which to stick the read data + * + * Read in a page, decrypting if necessary. + * + * Returns zero on success; non-zero on error. + */ +static int ecryptfs_readpage(struct file *file, struct page *page) +{ + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; + int rc = 0; + + if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_read_lower_page_segment(page, page->index, 0, + PAGE_CACHE_SIZE, + page->mapping->host); + } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + rc = ecryptfs_copy_up_encrypted_with_header(page, + crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting to copy " + "the encrypted content from the lower " + "file whilst inserting the metadata " + "from the xattr into the header; rc = " + "[%d]\n", __func__, rc); + goto out; + } + + } else { + rc = ecryptfs_read_lower_page_segment( + page, page->index, 0, PAGE_CACHE_SIZE, + page->mapping->host); + if (rc) { + printk(KERN_ERR "Error reading page; rc = " + "[%d]\n", rc); + goto out; + } + } + } else { + rc = ecryptfs_decrypt_page(page); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error decrypting page; " + "rc = [%d]\n", rc); + goto out; + } + } +out: + if (rc) + ClearPageUptodate(page); + else + SetPageUptodate(page); + ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n", + page->index); + unlock_page(page); + return rc; +} + +/** + * Called with lower inode mutex held. 
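+ * + * For example (illustrative values, assuming PAGE_CACHE_SIZE == 4096): with i_size == 5000 and page->index == 1, end_byte_in_page is 5000 % 4096 == 904; a write ending at to == 600 zeroes bytes 904..4095 of the page, while to == 2000 zeroes bytes 2000..4095 instead.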
+ */ +static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) +{ + struct inode *inode = page->mapping->host; + int end_byte_in_page; + + if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index) + goto out; + end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; + if (to > end_byte_in_page) + end_byte_in_page = to; + zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE); +out: + return 0; +} + +/** + * ecryptfs_write_begin + * @file: The eCryptfs file + * @mapping: The eCryptfs object + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused) + * + * This function must zero any hole we create + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + loff_t prev_page_end_size; + int rc = 0; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); + if (!PageUptodate(page)) { + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(mapping->host)->crypt_stat; + + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_read_lower_page_segment( + page, index, 0, PAGE_CACHE_SIZE, mapping->host); + if (rc) { + printk(KERN_ERR "%s: Error attemping to read " + "lower page segment; rc = [%d]\n", + __func__, rc); + ClearPageUptodate(page); + goto out; + } else + SetPageUptodate(page); + } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + rc = ecryptfs_copy_up_encrypted_with_header( + page, crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting " + "to copy the encrypted content " + "from the lower file whilst " + "inserting the metadata from " + "the xattr into the header; rc " + "= [%d]\n", __func__, rc); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); + } else { + rc = ecryptfs_read_lower_page_segment( + page, index, 0, PAGE_CACHE_SIZE, + mapping->host); + if (rc) { + printk(KERN_ERR "%s: Error reading " + "page; rc = [%d]\n", + __func__, rc); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); + } + } else { + if (prev_page_end_size + >= i_size_read(page->mapping->host)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + } else { + rc = ecryptfs_decrypt_page(page); + if (rc) { + printk(KERN_ERR "%s: Error decrypting " + "page at index [%ld]; " + "rc = [%d]\n", + __func__, page->index, rc); + ClearPageUptodate(page); + goto out; + } + } + SetPageUptodate(page); + } + } + /* If creating a page or more of holes, zero them out via truncate. + * Note, this will increase i_size. */ + if (index != 0) { + if (prev_page_end_size > i_size_read(page->mapping->host)) { + rc = ecryptfs_truncate(file->f_path.dentry, + prev_page_end_size); + if (rc) { + printk(KERN_ERR "%s: Error on attempt to " + "truncate to (higher) offset [%lld];" + " rc = [%d]\n", __func__, + prev_page_end_size, rc); + goto out; + } + } + } + /* Writing to a new page, and creating a small hole from start + * of page? Zero it out. 
*/ + if ((i_size_read(mapping->host) == prev_page_end_size) + && (pos != 0)) + zero_user(page, 0, PAGE_CACHE_SIZE); +out: + if (unlikely(rc)) { + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + } + return rc; +} + +/** + * ecryptfs_write_inode_size_to_header + * + * Writes the lower file size to the first 8 bytes of the header. + * + * Returns zero on success; non-zero on error. + */ +static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) +{ + char *file_size_virt; + int rc; + + file_size_virt = kmalloc(sizeof(u64), GFP_KERNEL); + if (!file_size_virt) { + rc = -ENOMEM; + goto out; + } + put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt); + rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, + sizeof(u64)); + kfree(file_size_virt); + if (rc < 0) + printk(KERN_ERR "%s: Error writing file size to header; " + "rc = [%d]\n", __func__, rc); + else + rc = 0; +out: + return rc; +} + +struct kmem_cache *ecryptfs_xattr_cache; + +static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) +{ + ssize_t size; + void *xattr_virt; + struct dentry *lower_dentry = + ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; + struct inode *lower_inode = lower_dentry->d_inode; + int rc; + + if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { + printk(KERN_WARNING + "No support for setting xattr in lower filesystem\n"); + rc = -ENOSYS; + goto out; + } + xattr_virt = kmem_cache_alloc(ecryptfs_xattr_cache, GFP_KERNEL); + if (!xattr_virt) { + printk(KERN_ERR "Out of memory whilst attempting to write " + "inode size to xattr\n"); + rc = -ENOMEM; + goto out; + } + mutex_lock(&lower_inode->i_mutex); + size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + xattr_virt, PAGE_CACHE_SIZE); + if (size < 0) + size = 8; + put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); + rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + xattr_virt, size, 0); + mutex_unlock(&lower_inode->i_mutex); + if (rc) + printk(KERN_ERR "Error whilst attempting to write inode size " + "to lower file xattr; rc = [%d]\n", rc); + kmem_cache_free(ecryptfs_xattr_cache, xattr_virt); +out: + return rc; +} + +int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) +{ + struct ecryptfs_crypt_stat *crypt_stat; + + crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode); + else + return ecryptfs_write_inode_size_to_header(ecryptfs_inode); +} + +/** + * ecryptfs_write_end + * @file: The eCryptfs file object + * @mapping: The eCryptfs object + * @pos: The file position + * @len: The length of the data (unused) + * @copied: The amount of data copied + * @page: The eCryptfs page + * @fsdata: The fsdata (unused) + * + * This is where we encrypt the data and pass the encrypted data to + * the lower filesystem. In OpenPGP-compatible mode, we operate on + * entire underlying packets. 
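+ * + * In rough outline (a sketch of the flow, not a contract): zeros are filled to the end of the page where needed, the page is marked dirty so that a later ->writepage() encrypts it into the lower file, i_size is grown when the write extends the file, and the new size is pushed into the header or xattr metadata; on success the number of bytes accepted (@copied) is returned, matching the usual ->write_end convention.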
+ */ +static int ecryptfs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + copied; + struct inode *ecryptfs_inode = mapping->host; + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + int rc; + int need_unlock_page = 1; + + ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" + "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, + to); + if (!rc) { + rc = copied; + fsstack_copy_inode_size(ecryptfs_inode, + ecryptfs_inode_to_lower(ecryptfs_inode)); + } + goto out; + } + /* Fills in zeros if 'to' goes beyond inode size */ + rc = fill_zeros_to_end_of_page(page, to); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to fill " + "zeros in page with index = [0x%.16lx]\n", index); + goto out; + } + set_page_dirty(page); + unlock_page(page); + need_unlock_page = 0; + if (pos + copied > i_size_read(ecryptfs_inode)) { + i_size_write(ecryptfs_inode, pos + copied); + ecryptfs_printk(KERN_DEBUG, "Expanded file size to " + "[0x%.16llx]\n", + (unsigned long long)i_size_read(ecryptfs_inode)); + balance_dirty_pages_ratelimited(mapping); + rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); + if (rc) { + printk(KERN_ERR "Error writing inode size to metadata; " + "rc = [%d]\n", rc); + goto out; + } + } + rc = copied; +out: + if (need_unlock_page) + unlock_page(page); + page_cache_release(page); + return rc; +} + +static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) +{ + int rc = 0; + struct inode *inode; + struct inode *lower_inode; + + inode = (struct inode *)mapping->host; + lower_inode = ecryptfs_inode_to_lower(inode); + if (lower_inode->i_mapping->a_ops->bmap) + rc = lower_inode->i_mapping->a_ops->bmap(lower_inode->i_mapping, + block); + return rc; +} + +const struct address_space_operations ecryptfs_aops = { + .writepage = ecryptfs_writepage, + .readpage = ecryptfs_readpage, + .write_begin = ecryptfs_write_begin, + .write_end = ecryptfs_write_end, + .bmap = ecryptfs_bmap, +}; diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c new file mode 100644 index 00000000..608c1c3f --- /dev/null +++ b/fs/ecryptfs/read_write.c @@ -0,0 +1,357 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2007 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. 
+ */ + +#include +#include +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_write_lower + * @ecryptfs_inode: The eCryptfs inode + * @data: Data to write + * @offset: Byte offset in the lower file to which to write the data + * @size: Number of bytes from @data to write at @offset in the lower + * file + * + * Write data to the lower file. + * + * Returns bytes written on success; less than zero on error + */ +int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, + loff_t offset, size_t size) +{ + struct file *lower_file; + mm_segment_t fs_save; + ssize_t rc; + + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; + fs_save = get_fs(); + set_fs(get_ds()); + rc = vfs_write(lower_file, data, size, &offset); + set_fs(fs_save); + mark_inode_dirty_sync(ecryptfs_inode); + return rc; +} + +/** + * ecryptfs_write_lower_page_segment + * @ecryptfs_inode: The eCryptfs inode + * @page_for_lower: The page containing the data to be written to the + * lower file + * @offset_in_page: The offset in the @page_for_lower from which to + * start writing the data + * @size: The amount of data from @page_for_lower to write to the + * lower file + * + * Determines the byte offset in the file for the given page and + * offset within the page, maps the page, and makes the call to write + * the contents of @page_for_lower to the lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, + struct page *page_for_lower, + size_t offset_in_page, size_t size) +{ + char *virt; + loff_t offset; + int rc; + + offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT) + + offset_in_page); + virt = kmap(page_for_lower); + rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); + if (rc > 0) + rc = 0; + kunmap(page_for_lower); + return rc; +} + +/** + * ecryptfs_write + * @ecryptfs_inode: The eCryptfs file into which to write + * @data: Virtual address where data to write is located + * @offset: Offset in the eCryptfs file at which to begin writing the + * data from @data + * @size: The number of bytes to write from @data + * + * Write an arbitrary amount of data to an arbitrary location in the + * eCryptfs inode page cache. This is done on a page-by-page, and then + * by an extent-by-extent, basis; individual extents are encrypted and + * written to the lower page cache (via VFS writes). This function + * takes care of all the address translation to locations in the lower + * filesystem; it also handles truncate events, writing out zeros + * where necessary. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, + size_t size) +{ + struct page *ecryptfs_page; + struct ecryptfs_crypt_stat *crypt_stat; + char *ecryptfs_page_virt; + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); + loff_t data_offset = 0; + loff_t pos; + int rc = 0; + + crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + /* + * if we are writing beyond current size, then start pos + * at the current size - we'll fill in zeros from there. 
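+ * For example (illustrative numbers): with i_size == 4000 and a request of offset == 10000, size == 100, pos starts at 4000, bytes 4000..9999 are zero-filled page by page, the 100 data bytes are then copied in at offset 10000, and i_size is finally updated to 10100.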
+ */ + if (offset > ecryptfs_file_size) + pos = ecryptfs_file_size; + else + pos = offset; + while (pos < (offset + size)) { + pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); + size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); + size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); + loff_t total_remaining_bytes = ((offset + size) - pos); + + if (fatal_signal_pending(current)) { + rc = -EINTR; + break; + } + + if (num_bytes > total_remaining_bytes) + num_bytes = total_remaining_bytes; + if (pos < offset) { + /* remaining zeros to write, up to destination offset */ + loff_t total_remaining_zeros = (offset - pos); + + if (num_bytes > total_remaining_zeros) + num_bytes = total_remaining_zeros; + } + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, + ecryptfs_page_idx); + if (IS_ERR(ecryptfs_page)) { + rc = PTR_ERR(ecryptfs_page); + printk(KERN_ERR "%s: Error getting page at " + "index [%ld] from eCryptfs inode " + "mapping; rc = [%d]\n", __func__, + ecryptfs_page_idx, rc); + goto out; + } + ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); + + /* + * pos: where we're now writing, offset: where the request was + * If current pos is before request, we are filling zeros + * If we are at or beyond request, we are writing the *data* + * If we're in a fresh page beyond eof, zero it in either case + */ + if (pos < offset || !start_offset_in_page) { + /* We are extending past the previous end of the file. + * Fill in zero values to the end of the page */ + memset(((char *)ecryptfs_page_virt + + start_offset_in_page), 0, + PAGE_CACHE_SIZE - start_offset_in_page); + } + + /* pos >= offset, we are now writing the data request */ + if (pos >= offset) { + memcpy(((char *)ecryptfs_page_virt + + start_offset_in_page), + (data + data_offset), num_bytes); + data_offset += num_bytes; + } + kunmap_atomic(ecryptfs_page_virt, KM_USER0); + flush_dcache_page(ecryptfs_page); + SetPageUptodate(ecryptfs_page); + unlock_page(ecryptfs_page); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) + rc = ecryptfs_encrypt_page(ecryptfs_page); + else + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, + ecryptfs_page, + start_offset_in_page, + data_offset); + page_cache_release(ecryptfs_page); + if (rc) { + printk(KERN_ERR "%s: Error encrypting " + "page; rc = [%d]\n", __func__, rc); + goto out; + } + pos += num_bytes; + } + if (pos > ecryptfs_file_size) { + i_size_write(ecryptfs_inode, pos); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { + int rc2; + + rc2 = ecryptfs_write_inode_size_to_metadata( + ecryptfs_inode); + if (rc2) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc2); + if (!rc) + rc = rc2; + goto out; + } + } + } +out: + return rc; +} + +/** + * ecryptfs_read_lower + * @data: The read data is stored here by this function + * @offset: Byte offset in the lower file from which to read the data + * @size: Number of bytes to read from @offset of the lower file and + * store into @data + * @ecryptfs_inode: The eCryptfs inode + * + * Read @size bytes of data at byte offset @offset from the lower + * inode into memory location @data. 
+ * + * Returns bytes read on success; 0 on EOF; less than zero on error + */ +int ecryptfs_read_lower(char *data, loff_t offset, size_t size, + struct inode *ecryptfs_inode) +{ + struct file *lower_file; + mm_segment_t fs_save; + ssize_t rc; + + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; + fs_save = get_fs(); + set_fs(get_ds()); + rc = vfs_read(lower_file, data, size, &offset); + set_fs(fs_save); + return rc; +} + +/** + * ecryptfs_read_lower_page_segment + * @page_for_ecryptfs: The page into which data for eCryptfs will be + * written + * @offset_in_page: Offset in @page_for_ecryptfs from which to start + * writing + * @size: The number of bytes to write into @page_for_ecryptfs + * @ecryptfs_inode: The eCryptfs inode + * + * Determines the byte offset in the file for the given page and + * offset within the page, maps the page, and makes the call to read + * the contents of @page_for_ecryptfs from the lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, + pgoff_t page_index, + size_t offset_in_page, size_t size, + struct inode *ecryptfs_inode) +{ + char *virt; + loff_t offset; + int rc; + + offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page); + virt = kmap(page_for_ecryptfs); + rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); + if (rc > 0) + rc = 0; + kunmap(page_for_ecryptfs); + flush_dcache_page(page_for_ecryptfs); + return rc; +} + +#if 0 +/** + * ecryptfs_read + * @data: The virtual address into which to write the data read (and + * possibly decrypted) from the lower file + * @offset: The offset in the decrypted view of the file from which to + * read into @data + * @size: The number of bytes to read into @data + * @ecryptfs_file: The eCryptfs file from which to read + * + * Read an arbitrary amount of data from an arbitrary location in the + * eCryptfs page cache. This is done on an extent-by-extent basis; + * individual extents are decrypted and read from the lower page + * cache (via VFS reads). This function takes care of all the + * address translation to locations in the lower filesystem. 
+ * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_read(char *data, loff_t offset, size_t size, + struct file *ecryptfs_file) +{ + struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; + struct page *ecryptfs_page; + char *ecryptfs_page_virt; + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); + loff_t data_offset = 0; + loff_t pos; + int rc = 0; + + if ((offset + size) > ecryptfs_file_size) { + rc = -EINVAL; + printk(KERN_ERR "%s: Attempt to read data past the end of the " + "file; offset = [%lld]; size = [%td]; " + "ecryptfs_file_size = [%lld]\n", + __func__, offset, size, ecryptfs_file_size); + goto out; + } + pos = offset; + while (pos < (offset + size)) { + pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); + size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); + size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); + size_t total_remaining_bytes = ((offset + size) - pos); + + if (num_bytes > total_remaining_bytes) + num_bytes = total_remaining_bytes; + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, + ecryptfs_page_idx); + if (IS_ERR(ecryptfs_page)) { + rc = PTR_ERR(ecryptfs_page); + printk(KERN_ERR "%s: Error getting page at " + "index [%ld] from eCryptfs inode " + "mapping; rc = [%d]\n", __func__, + ecryptfs_page_idx, rc); + goto out; + } + ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); + memcpy((data + data_offset), + ((char *)ecryptfs_page_virt + start_offset_in_page), + num_bytes); + kunmap_atomic(ecryptfs_page_virt, KM_USER0); + flush_dcache_page(ecryptfs_page); + SetPageUptodate(ecryptfs_page); + unlock_page(ecryptfs_page); + page_cache_release(ecryptfs_page); + pos += num_bytes; + data_offset += num_bytes; + } +out: + return rc; +} +#endif /* 0 */ diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c new file mode 100644 index 00000000..dbd52d40 --- /dev/null +++ b/fs/ecryptfs/super.c @@ -0,0 +1,181 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow + * Michael C. Thompson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_inode_info_cache; + +/** + * ecryptfs_alloc_inode - allocate an ecryptfs inode + * @sb: Pointer to the ecryptfs super block + * + * Called to bring an inode into existence. + * + * Only handle allocation, setting up structures should be done in + * ecryptfs_read_inode. This is because the kernel, between now and + * then, will 0 out the private data pointer. 
+ * + * Returns a pointer to a newly allocated inode, NULL otherwise + */ +static struct inode *ecryptfs_alloc_inode(struct super_block *sb) +{ + struct ecryptfs_inode_info *inode_info; + struct inode *inode = NULL; + + inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL); + if (unlikely(!inode_info)) + goto out; + ecryptfs_init_crypt_stat(&inode_info->crypt_stat); + mutex_init(&inode_info->lower_file_mutex); + atomic_set(&inode_info->lower_file_count, 0); + inode_info->lower_file = NULL; + inode = &inode_info->vfs_inode; +out: + return inode; +} + +static void ecryptfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ecryptfs_inode_info *inode_info; + inode_info = ecryptfs_inode_to_private(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ecryptfs_inode_info_cache, inode_info); +} + +/** + * ecryptfs_destroy_inode + * @inode: The ecryptfs inode + * + * This is used during the final destruction of the inode. All + * allocation of memory related to the inode, including allocated + * memory in the crypt_stat struct, will be released here. + * There should be no chance that this deallocation will be missed. + */ +static void ecryptfs_destroy_inode(struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + + inode_info = ecryptfs_inode_to_private(inode); + BUG_ON(inode_info->lower_file); + ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); + call_rcu(&inode->i_rcu, ecryptfs_i_callback); +} + +/** + * ecryptfs_statfs + * @dentry: The ecryptfs dentry + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. Currently, we let this pass right through + * to the lower filesystem and take no action ourselves. + */ +static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + + if (!lower_dentry->d_sb->s_op->statfs) + return -ENOSYS; + return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf); +} + +/** + * ecryptfs_evict_inode + * @inode: The ecryptfs inode + * + * Called by iput() when the inode reference count reached zero + * and the inode is not hashed anywhere. Used to clear anything + * that needs to be, before the inode is completely destroyed and put + * on the inode free list. We use this to drop our reference to the + * lower inode. + */ +static void ecryptfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + iput(ecryptfs_inode_to_lower(inode)); +} + +/** + * ecryptfs_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail.
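+ * + * The emitted options look, for example (hypothetical signature and defaults), like ",ecryptfs_sig=0123456789abcdef,ecryptfs_cipher=aes,ecryptfs_key_bytes=16", with the flag-style options (",ecryptfs_xattr_metadata" and friends) appended only when the corresponding mount flags are set.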
+ */ +static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct super_block *sb = mnt->mnt_sb; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; + struct ecryptfs_global_auth_tok *walker; + + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(walker, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK) + seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig); + else + seq_printf(m, ",ecryptfs_sig=%s", walker->sig); + } + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + + seq_printf(m, ",ecryptfs_cipher=%s", + mount_crypt_stat->global_default_cipher_name); + + if (mount_crypt_stat->global_default_cipher_key_size) + seq_printf(m, ",ecryptfs_key_bytes=%zd", + mount_crypt_stat->global_default_cipher_key_size); + if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED) + seq_printf(m, ",ecryptfs_passthrough"); + if (mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) + seq_printf(m, ",ecryptfs_xattr_metadata"); + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + seq_printf(m, ",ecryptfs_encrypted_view"); + if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) + seq_printf(m, ",ecryptfs_unlink_sigs"); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + seq_printf(m, ",ecryptfs_mount_auth_tok_only"); + + return 0; +} + +const struct super_operations ecryptfs_sops = { + .alloc_inode = ecryptfs_alloc_inode, + .destroy_inode = ecryptfs_destroy_inode, + .drop_inode = generic_drop_inode, + .statfs = ecryptfs_statfs, + .remount_fs = NULL, + .evict_inode = ecryptfs_evict_inode, + .show_options = ecryptfs_show_options +}; diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig new file mode 100644 index 00000000..6ebfc1c2 --- /dev/null +++ b/fs/efs/Kconfig @@ -0,0 +1,14 @@ +config EFS_FS + tristate "EFS file system support (read only) (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + EFS is an older file system used for non-ISO9660 CD-ROMs and hard + disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer + uses the XFS file system for hard disk partitions however). + + This implementation only offers read-only access. If you don't know + what all this is about, it's safe to say N. For more information + about EFS see its home page at . + + To compile the EFS file system support as a module, choose M here: the + module will be called efs. diff --git a/fs/efs/Makefile b/fs/efs/Makefile new file mode 100644 index 00000000..963543d4 --- /dev/null +++ b/fs/efs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux efs-filesystem routines. 
+# + +obj-$(CONFIG_EFS_FS) += efs.o + +efs-objs := super.o inode.o namei.o dir.o file.o symlink.o diff --git a/fs/efs/dir.c b/fs/efs/dir.c new file mode 100644 index 00000000..7ee6f7e3 --- /dev/null +++ b/fs/efs/dir.c @@ -0,0 +1,110 @@ +/* + * dir.c + * + * Copyright (c) 1999 Al Smith + */ + +#include +#include "efs.h" + +static int efs_readdir(struct file *, void *, filldir_t); + +const struct file_operations efs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = efs_readdir, +}; + +const struct inode_operations efs_dir_inode_operations = { + .lookup = efs_lookup, +}; + +static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { + struct inode *inode = filp->f_path.dentry->d_inode; + struct buffer_head *bh; + + struct efs_dir *dirblock; + struct efs_dentry *dirslot; + efs_ino_t inodenum; + efs_block_t block; + int slot, namelen; + char *nameptr; + + if (inode->i_size & (EFS_DIRBSIZE-1)) + printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); + + /* work out where this entry can be found */ + block = filp->f_pos >> EFS_DIRBSIZE_BITS; + + /* each block contains at most 256 slots */ + slot = filp->f_pos & 0xff; + + /* look at all blocks */ + while (block < inode->i_blocks) { + /* read the dir block */ + bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); + + if (!bh) { + printk(KERN_ERR "EFS: readdir(): failed to read dir block %d\n", block); + break; + } + + dirblock = (struct efs_dir *) bh->b_data; + + if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { + printk(KERN_ERR "EFS: readdir(): invalid directory block\n"); + brelse(bh); + break; + } + + while (slot < dirblock->slots) { + if (dirblock->space[slot] == 0) { + slot++; + continue; + } + + dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); + + inodenum = be32_to_cpu(dirslot->inode); + namelen = dirslot->namelen; + nameptr = dirslot->name; + +#ifdef DEBUG + printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); +#endif + if (namelen > 0) { + /* found the next entry */ + filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; + + /* copy filename and data in dirslot */ + filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN); + + /* sanity check */ + if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { + printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); + slot++; + continue; + } + + /* store position of next slot */ + if (++slot == dirblock->slots) { + slot = 0; + block++; + } + brelse(bh); + filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; + goto out; + } + slot++; + } + brelse(bh); + + slot = 0; + block++; + } + + filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; +out: + return 0; +} + diff --git a/fs/efs/efs.h b/fs/efs/efs.h new file mode 100644 index 00000000..d8305b58 --- /dev/null +++ b/fs/efs/efs.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. 
+ * Portions derived from IRIX header files (c) 1988 Silicon Graphics + */ +#ifndef _EFS_EFS_H_ +#define _EFS_EFS_H_ + +#include +#include + +#define EFS_VERSION "1.0a" + +static const char cprt[] = "EFS: "EFS_VERSION" - (c) 1999 Al Smith "; + + +/* 1 block is 512 bytes */ +#define EFS_BLOCKSIZE_BITS 9 +#define EFS_BLOCKSIZE (1 << EFS_BLOCKSIZE_BITS) + +typedef int32_t efs_block_t; +typedef uint32_t efs_ino_t; + +#define EFS_DIRECTEXTENTS 12 + +/* + * layout of an extent, in memory and on disk. 8 bytes exactly. + */ +typedef union extent_u { + unsigned char raw[8]; + struct extent_s { + unsigned int ex_magic:8; /* magic # (zero) */ + unsigned int ex_bn:24; /* basic block */ + unsigned int ex_length:8; /* numblocks in this extent */ + unsigned int ex_offset:24; /* logical offset into file */ + } cooked; +} efs_extent; + +typedef struct edevs { + __be16 odev; + __be32 ndev; +} efs_devs; + +/* + * extent based filesystem inode as it appears on disk. The efs inode + * is exactly 128 bytes long. + */ +struct efs_dinode { + __be16 di_mode; /* mode and type of file */ + __be16 di_nlink; /* number of links to file */ + __be16 di_uid; /* owner's user id */ + __be16 di_gid; /* owner's group id */ + __be32 di_size; /* number of bytes in file */ + __be32 di_atime; /* time last accessed */ + __be32 di_mtime; /* time last modified */ + __be32 di_ctime; /* time created */ + __be32 di_gen; /* generation number */ + __be16 di_numextents; /* # of extents */ + u_char di_version; /* version of inode */ + u_char di_spare; /* spare - used by AFS */ + union di_addr { + efs_extent di_extents[EFS_DIRECTEXTENTS]; + efs_devs di_dev; /* device for IFCHR/IFBLK */ + } di_u; +}; + +/* efs inode storage in memory */ +struct efs_inode_info { + int numextents; + int lastextent; + + efs_extent extents[EFS_DIRECTEXTENTS]; + struct inode vfs_inode; +}; + +#include + +#define EFS_DIRBSIZE_BITS EFS_BLOCKSIZE_BITS +#define EFS_DIRBSIZE (1 << EFS_DIRBSIZE_BITS) + +struct efs_dentry { + __be32 inode; + unsigned char namelen; + char name[3]; +}; + +#define EFS_DENTSIZE (sizeof(struct efs_dentry) - 3 + 1) +#define EFS_MAXNAMELEN ((1 << (sizeof(char) * 8)) - 1) + +#define EFS_DIRBLK_HEADERSIZE 4 +#define EFS_DIRBLK_MAGIC 0xbeef /* moo */ + +struct efs_dir { + __be16 magic; + unsigned char firstused; + unsigned char slots; + + unsigned char space[EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE]; +}; + +#define EFS_MAXENTS \ + ((EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE) / \ + (EFS_DENTSIZE + sizeof(char))) + +#define EFS_SLOTAT(dir, slot) EFS_REALOFF((dir)->space[slot]) + +#define EFS_REALOFF(offset) ((offset << 1)) + + +static inline struct efs_inode_info *INODE_INFO(struct inode *inode) +{ + return container_of(inode, struct efs_inode_info, vfs_inode); +} + +static inline struct efs_sb_info *SUPER_INFO(struct super_block *sb) +{ + return sb->s_fs_info; +} + +struct statfs; +struct fid; + +extern const struct inode_operations efs_dir_inode_operations; +extern const struct file_operations efs_dir_operations; +extern const struct address_space_operations efs_symlink_aops; + +extern struct inode *efs_iget(struct super_block *, unsigned long); +extern efs_block_t efs_map_block(struct inode *, efs_block_t); +extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int); + +extern struct dentry *efs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid 
*fid, + int fh_len, int fh_type); +extern struct dentry *efs_get_parent(struct dentry *); +extern int efs_bmap(struct inode *, int); + +#endif /* _EFS_EFS_H_ */ diff --git a/fs/efs/file.c b/fs/efs/file.c new file mode 100644 index 00000000..1ccb364f --- /dev/null +++ b/fs/efs/file.c @@ -0,0 +1,60 @@ +/* + * file.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include "efs.h" + +int efs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int error = -EROFS; + long phys; + + if (create) + return error; + if (iblock >= inode->i_blocks) { +#ifdef DEBUG + /* + * i have no idea why this happens as often as it does + */ + printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", + block, + inode->i_blocks, + inode->i_size); +#endif + return 0; + } + phys = efs_map_block(inode, iblock); + if (phys) + map_bh(bh_result, inode->i_sb, phys); + return 0; +} + +int efs_bmap(struct inode *inode, efs_block_t block) { + + if (block < 0) { + printk(KERN_WARNING "EFS: bmap(): block < 0\n"); + return 0; + } + + /* are we about to read past the end of a file ? */ + if (!(block < inode->i_blocks)) { +#ifdef DEBUG + /* + * i have no idea why this happens as often as it does + */ + printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", + block, + inode->i_blocks, + inode->i_size); +#endif + return 0; + } + + return efs_map_block(inode, block); +} diff --git a/fs/efs/inode.c b/fs/efs/inode.c new file mode 100644 index 00000000..9c13412e --- /dev/null +++ b/fs/efs/inode.c @@ -0,0 +1,313 @@ +/* + * inode.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang, + * and from work (c) 1998 Mike Shaver. + */ + +#include +#include +#include +#include "efs.h" +#include + +static int efs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,efs_get_block); +} +static sector_t _efs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,efs_get_block); +} +static const struct address_space_operations efs_aops = { + .readpage = efs_readpage, + .bmap = _efs_bmap +}; + +static inline void extent_copy(efs_extent *src, efs_extent *dst) { + /* + * this is slightly evil. it doesn't just copy + * efs_extent from src to dst, it also mangles + * the bits so that dst ends up in cpu byte-order. 
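+ * + * the raw on-disk layout being unpacked is (see efs_extent above): byte 0 = ex_magic, bytes 1-3 = ex_bn (24 bits, most significant byte first), byte 4 = ex_length, bytes 5-7 = ex_offset (24 bits, most significant byte first). for example, raw[] = { 0x00, 0x00, 0x01, 0x02, 0x08, 0x00, 0x00, 0x10 } decodes to ex_magic 0, ex_bn 0x102, ex_length 8 and ex_offset 0x10.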
+ */ + + dst->cooked.ex_magic = (unsigned int) src->raw[0]; + dst->cooked.ex_bn = ((unsigned int) src->raw[1] << 16) | + ((unsigned int) src->raw[2] << 8) | + ((unsigned int) src->raw[3] << 0); + dst->cooked.ex_length = (unsigned int) src->raw[4]; + dst->cooked.ex_offset = ((unsigned int) src->raw[5] << 16) | + ((unsigned int) src->raw[6] << 8) | + ((unsigned int) src->raw[7] << 0); + return; +} + +struct inode *efs_iget(struct super_block *super, unsigned long ino) +{ + int i, inode_index; + dev_t device; + u32 rdev; + struct buffer_head *bh; + struct efs_sb_info *sb = SUPER_INFO(super); + struct efs_inode_info *in; + efs_block_t block, offset; + struct efs_dinode *efs_inode; + struct inode *inode; + + inode = iget_locked(super, ino); + if (IS_ERR(inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + in = INODE_INFO(inode); + + /* + ** EFS layout: + ** + ** | cylinder group | cylinder group | cylinder group ..etc + ** |inodes|data |inodes|data |inodes|data ..etc + ** + ** work out the inode block index, (considering initially that the + ** inodes are stored as consecutive blocks). then work out the block + ** number of that inode given the above layout, and finally the + ** offset of the inode within that block. + */ + + inode_index = inode->i_ino / + (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); + + block = sb->fs_start + sb->first_block + + (sb->group_size * (inode_index / sb->inode_blocks)) + + (inode_index % sb->inode_blocks); + + offset = (inode->i_ino % + (EFS_BLOCKSIZE / sizeof(struct efs_dinode))) * + sizeof(struct efs_dinode); + + bh = sb_bread(inode->i_sb, block); + if (!bh) { + printk(KERN_WARNING "EFS: bread() failed at block %d\n", block); + goto read_inode_error; + } + + efs_inode = (struct efs_dinode *) (bh->b_data + offset); + + inode->i_mode = be16_to_cpu(efs_inode->di_mode); + inode->i_nlink = be16_to_cpu(efs_inode->di_nlink); + inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); + inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); + inode->i_size = be32_to_cpu(efs_inode->di_size); + inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime); + inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime); + inode->i_ctime.tv_sec = be32_to_cpu(efs_inode->di_ctime); + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + + /* this is the number of blocks in the file */ + if (inode->i_size == 0) { + inode->i_blocks = 0; + } else { + inode->i_blocks = ((inode->i_size - 1) >> EFS_BLOCKSIZE_BITS) + 1; + } + + rdev = be16_to_cpu(efs_inode->di_u.di_dev.odev); + if (rdev == 0xffff) { + rdev = be32_to_cpu(efs_inode->di_u.di_dev.ndev); + if (sysv_major(rdev) > 0xfff) + device = 0; + else + device = MKDEV(sysv_major(rdev), sysv_minor(rdev)); + } else + device = old_decode_dev(rdev); + + /* get the number of extents for this object */ + in->numextents = be16_to_cpu(efs_inode->di_numextents); + in->lastextent = 0; + + /* copy the extents contained within the inode to memory */ + for(i = 0; i < EFS_DIRECTEXTENTS; i++) { + extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); + if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { + printk(KERN_WARNING "EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino); + brelse(bh); + goto read_inode_error; + } + } + + brelse(bh); + +#ifdef DEBUG + printk(KERN_DEBUG "EFS: efs_iget(): inode %lu, extents %d, mode %o\n", + inode->i_ino, in->numextents, inode->i_mode); +#endif + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = 
&efs_dir_inode_operations; + inode->i_fop = &efs_dir_operations; + break; + case S_IFREG: + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &efs_aops; + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &efs_symlink_aops; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + init_special_inode(inode, inode->i_mode, device); + break; + default: + printk(KERN_WARNING "EFS: unsupported inode mode %o\n", inode->i_mode); + goto read_inode_error; + break; + } + + unlock_new_inode(inode); + return inode; + +read_inode_error: + printk(KERN_WARNING "EFS: failed to read inode %lu\n", inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static inline efs_block_t +efs_extent_check(efs_extent *ptr, efs_block_t block, struct efs_sb_info *sb) { + efs_block_t start; + efs_block_t length; + efs_block_t offset; + + /* + * given an extent and a logical block within a file, + * can this block be found within this extent ? + */ + start = ptr->cooked.ex_bn; + length = ptr->cooked.ex_length; + offset = ptr->cooked.ex_offset; + + if ((block >= offset) && (block < offset+length)) { + return(sb->fs_start + start + block - offset); + } else { + return 0; + } +} + +efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { + struct efs_sb_info *sb = SUPER_INFO(inode->i_sb); + struct efs_inode_info *in = INODE_INFO(inode); + struct buffer_head *bh = NULL; + + int cur, last, first = 1; + int ibase, ioffset, dirext, direxts, indext, indexts; + efs_block_t iblock, result = 0, lastblock = 0; + efs_extent ext, *exts; + + last = in->lastextent; + + if (in->numextents <= EFS_DIRECTEXTENTS) { + /* first check the last extent we returned */ + if ((result = efs_extent_check(&in->extents[last], block, sb))) + return result; + + /* if we only have one extent then nothing can be found */ + if (in->numextents == 1) { + printk(KERN_ERR "EFS: map_block() failed to map (1 extent)\n"); + return 0; + } + + direxts = in->numextents; + + /* + * check the stored extents in the inode + * start with next extent and check forwards + */ + for(dirext = 1; dirext < direxts; dirext++) { + cur = (last + dirext) % in->numextents; + if ((result = efs_extent_check(&in->extents[cur], block, sb))) { + in->lastextent = cur; + return result; + } + } + + printk(KERN_ERR "EFS: map_block() failed to map block %u (dir)\n", block); + return 0; + } + +#ifdef DEBUG + printk(KERN_DEBUG "EFS: map_block(): indirect search for logical block %u\n", block); +#endif + direxts = in->extents[0].cooked.ex_offset; + indexts = in->numextents; + + for(indext = 0; indext < indexts; indext++) { + cur = (last + indext) % indexts; + + /* + * work out which direct extent contains `cur'. + * + * also compute ibase: i.e. the number of the first + * indirect extent contained within direct extent `cur'. 
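+ * + * for example (using the sizes from efs.h: EFS_BLOCKSIZE == 512 and 8-byte extent records, so each block holds 64 of them): a direct extent with ex_length == 2 covers indirect extent records 0..127, so for an indirect extent `cur' of 130 the matching direct extent is the next one and its ibase is 128.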
+ * + */ + ibase = 0; + for(dirext = 0; cur < ibase && dirext < direxts; dirext++) { + ibase += in->extents[dirext].cooked.ex_length * + (EFS_BLOCKSIZE / sizeof(efs_extent)); + } + + if (dirext == direxts) { + /* should never happen */ + printk(KERN_ERR "EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block); + if (bh) brelse(bh); + return 0; + } + + /* work out block number and offset of this indirect extent */ + iblock = sb->fs_start + in->extents[dirext].cooked.ex_bn + + (cur - ibase) / + (EFS_BLOCKSIZE / sizeof(efs_extent)); + ioffset = (cur - ibase) % + (EFS_BLOCKSIZE / sizeof(efs_extent)); + + if (first || lastblock != iblock) { + if (bh) brelse(bh); + + bh = sb_bread(inode->i_sb, iblock); + if (!bh) { + printk(KERN_ERR "EFS: bread() failed at block %d\n", iblock); + return 0; + } +#ifdef DEBUG + printk(KERN_DEBUG "EFS: map_block(): read indirect extent block %d\n", iblock); +#endif + first = 0; + lastblock = iblock; + } + + exts = (efs_extent *) bh->b_data; + + extent_copy(&(exts[ioffset]), &ext); + + if (ext.cooked.ex_magic != 0) { + printk(KERN_ERR "EFS: extent %d has bad magic number in block %d\n", cur, iblock); + if (bh) brelse(bh); + return 0; + } + + if ((result = efs_extent_check(&ext, block, sb))) { + if (bh) brelse(bh); + in->lastextent = cur; + return result; + } + } + if (bh) brelse(bh); + printk(KERN_ERR "EFS: map_block() failed to map block %u (indir)\n", block); + return 0; +} + +MODULE_LICENSE("GPL"); diff --git a/fs/efs/namei.c b/fs/efs/namei.c new file mode 100644 index 00000000..1511bf9e --- /dev/null +++ b/fs/efs/namei.c @@ -0,0 +1,118 @@ +/* + * namei.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include +#include +#include "efs.h" + + +static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) { + struct buffer_head *bh; + + int slot, namelen; + char *nameptr; + struct efs_dir *dirblock; + struct efs_dentry *dirslot; + efs_ino_t inodenum; + efs_block_t block; + + if (inode->i_size & (EFS_DIRBSIZE-1)) + printk(KERN_WARNING "EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n"); + + for(block = 0; block < inode->i_blocks; block++) { + + bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); + if (!bh) { + printk(KERN_ERR "EFS: find_entry(): failed to read dir block %d\n", block); + return 0; + } + + dirblock = (struct efs_dir *) bh->b_data; + + if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { + printk(KERN_ERR "EFS: find_entry(): invalid directory block\n"); + brelse(bh); + return(0); + } + + for(slot = 0; slot < dirblock->slots; slot++) { + dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); + + namelen = dirslot->namelen; + nameptr = dirslot->name; + + if ((namelen == len) && (!memcmp(name, nameptr, len))) { + inodenum = be32_to_cpu(dirslot->inode); + brelse(bh); + return(inodenum); + } + } + brelse(bh); + } + return(0); +} + +struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { + efs_ino_t inodenum; + struct inode * inode = NULL; + + inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); + if (inodenum) { + inode = efs_iget(dir->i_sb, inodenum); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + + return d_splice_alias(inode, dentry); +} + +static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, + u32 generation) +{ + struct inode *inode; + + if (ino == 0) + return ERR_PTR(-ESTALE); + inode = 
efs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + efs_nfs_get_inode); +} + +struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + efs_nfs_get_inode); +} + +struct dentry *efs_get_parent(struct dentry *child) +{ + struct dentry *parent = ERR_PTR(-ENOENT); + efs_ino_t ino; + + ino = efs_find_entry(child->d_inode, "..", 2); + if (ino) + parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino)); + + return parent; +} diff --git a/fs/efs/super.c b/fs/efs/super.c new file mode 100644 index 00000000..0f31acb0 --- /dev/null +++ b/fs/efs/super.c @@ -0,0 +1,359 @@ +/* + * super.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include +#include +#include +#include +#include + +#include "efs.h" +#include +#include + +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); +static int efs_fill_super(struct super_block *s, void *d, int silent); + +static struct dentry *efs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); +} + +static struct file_system_type efs_fs_type = { + .owner = THIS_MODULE, + .name = "efs", + .mount = efs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static struct pt_types sgi_pt_types[] = { + {0x00, "SGI vh"}, + {0x01, "SGI trkrepl"}, + {0x02, "SGI secrepl"}, + {0x03, "SGI raw"}, + {0x04, "SGI bsd"}, + {SGI_SYSV, "SGI sysv"}, + {0x06, "SGI vol"}, + {SGI_EFS, "SGI efs"}, + {0x08, "SGI lv"}, + {0x09, "SGI rlv"}, + {0x0A, "SGI xfs"}, + {0x0B, "SGI xfslog"}, + {0x0C, "SGI xlv"}, + {0x82, "Linux swap"}, + {0x83, "Linux native"}, + {0, NULL} +}; + + +static struct kmem_cache * efs_inode_cachep; + +static struct inode *efs_alloc_inode(struct super_block *sb) +{ + struct efs_inode_info *ei; + ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void efs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); +} + +static void efs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, efs_i_callback); +} + +static void init_once(void *foo) +{ + struct efs_inode_info *ei = (struct efs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + efs_inode_cachep = kmem_cache_create("efs_inode_cache", + sizeof(struct efs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (efs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(efs_inode_cachep); +} + +static void efs_put_super(struct super_block *s) +{ + kfree(s->s_fs_info); + s->s_fs_info = NULL; +} + +static int efs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations efs_superblock_operations = { + .alloc_inode = efs_alloc_inode, + .destroy_inode = 
efs_destroy_inode, + .put_super = efs_put_super, + .statfs = efs_statfs, + .remount_fs = efs_remount, +}; + +static const struct export_operations efs_export_ops = { + .fh_to_dentry = efs_fh_to_dentry, + .fh_to_parent = efs_fh_to_parent, + .get_parent = efs_get_parent, +}; + +static int __init init_efs_fs(void) { + int err; + printk("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&efs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_efs_fs(void) { + unregister_filesystem(&efs_fs_type); + destroy_inodecache(); +} + +module_init(init_efs_fs) +module_exit(exit_efs_fs) + +static efs_block_t efs_validate_vh(struct volume_header *vh) { + int i; + __be32 cs, *ui; + int csum; + efs_block_t sblock = 0; /* shuts up gcc */ + struct pt_types *pt_entry; + int pt_type, slice = -1; + + if (be32_to_cpu(vh->vh_magic) != VHMAGIC) { + /* + * assume that we're dealing with a partition and allow + * read_super() to try and detect a valid superblock + * on the next block. + */ + return 0; + } + + ui = ((__be32 *) (vh + 1)) - 1; + for(csum = 0; ui >= ((__be32 *) vh);) { + cs = *ui--; + csum += be32_to_cpu(cs); + } + if (csum) { + printk(KERN_INFO "EFS: SGI disklabel: checksum bad, label corrupted\n"); + return 0; + } + +#ifdef DEBUG + printk(KERN_DEBUG "EFS: bf: \"%16s\"\n", vh->vh_bootfile); + + for(i = 0; i < NVDIR; i++) { + int j; + char name[VDNAMESIZE+1]; + + for(j = 0; j < VDNAMESIZE; j++) { + name[j] = vh->vh_vd[i].vd_name[j]; + } + name[j] = (char) 0; + + if (name[0]) { + printk(KERN_DEBUG "EFS: vh: %8s block: 0x%08x size: 0x%08x\n", + name, + (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), + (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); + } + } +#endif + + for(i = 0; i < NPARTAB; i++) { + pt_type = (int) be32_to_cpu(vh->vh_pt[i].pt_type); + for(pt_entry = sgi_pt_types; pt_entry->pt_name; pt_entry++) { + if (pt_type == pt_entry->pt_type) break; + } +#ifdef DEBUG + if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { + printk(KERN_DEBUG "EFS: pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", + i, + (int) be32_to_cpu(vh->vh_pt[i].pt_firstlbn), + (int) be32_to_cpu(vh->vh_pt[i].pt_nblks), + pt_type, + (pt_entry->pt_name) ? pt_entry->pt_name : "unknown"); + } +#endif + if (IS_EFS(pt_type)) { + sblock = be32_to_cpu(vh->vh_pt[i].pt_firstlbn); + slice = i; + } + } + + if (slice == -1) { + printk(KERN_NOTICE "EFS: partition table contained no EFS partitions\n"); +#ifdef DEBUG + } else { + printk(KERN_INFO "EFS: using slice %d (type %s, offset 0x%x)\n", + slice, + (pt_entry->pt_name) ? 
pt_entry->pt_name : "unknown", + sblock); +#endif + } + return sblock; +} + +static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { + + if (!IS_EFS_MAGIC(be32_to_cpu(super->fs_magic))) + return -1; + + sb->fs_magic = be32_to_cpu(super->fs_magic); + sb->total_blocks = be32_to_cpu(super->fs_size); + sb->first_block = be32_to_cpu(super->fs_firstcg); + sb->group_size = be32_to_cpu(super->fs_cgfsize); + sb->data_free = be32_to_cpu(super->fs_tfree); + sb->inode_free = be32_to_cpu(super->fs_tinode); + sb->inode_blocks = be16_to_cpu(super->fs_cgisize); + sb->total_groups = be16_to_cpu(super->fs_ncg); + + return 0; +} + +static int efs_fill_super(struct super_block *s, void *d, int silent) +{ + struct efs_sb_info *sb; + struct buffer_head *bh; + struct inode *root; + int ret = -EINVAL; + + sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); + if (!sb) + return -ENOMEM; + s->s_fs_info = sb; + + s->s_magic = EFS_SUPER_MAGIC; + if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { + printk(KERN_ERR "EFS: device does not support %d byte blocks\n", + EFS_BLOCKSIZE); + goto out_no_fs_ul; + } + + /* read the vh (volume header) block */ + bh = sb_bread(s, 0); + + if (!bh) { + printk(KERN_ERR "EFS: cannot read volume header\n"); + goto out_no_fs_ul; + } + + /* + * if this returns zero then we didn't find any partition table. + * this isn't (yet) an error - just assume for the moment that + * the device is valid and go on to search for a superblock. + */ + sb->fs_start = efs_validate_vh((struct volume_header *) bh->b_data); + brelse(bh); + + if (sb->fs_start == -1) { + goto out_no_fs_ul; + } + + bh = sb_bread(s, sb->fs_start + EFS_SUPER); + if (!bh) { + printk(KERN_ERR "EFS: cannot read superblock\n"); + goto out_no_fs_ul; + } + + if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { +#ifdef DEBUG + printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); +#endif + brelse(bh); + goto out_no_fs_ul; + } + brelse(bh); + + if (!(s->s_flags & MS_RDONLY)) { +#ifdef DEBUG + printk(KERN_INFO "EFS: forcing read-only mode\n"); +#endif + s->s_flags |= MS_RDONLY; + } + s->s_op = &efs_superblock_operations; + s->s_export_op = &efs_export_ops; + root = efs_iget(s, EFS_ROOTINODE); + if (IS_ERR(root)) { + printk(KERN_ERR "EFS: get root inode failed\n"); + ret = PTR_ERR(root); + goto out_no_fs; + } + + s->s_root = d_alloc_root(root); + if (!(s->s_root)) { + printk(KERN_ERR "EFS: get root dentry failed\n"); + iput(root); + ret = -ENOMEM; + goto out_no_fs; + } + + return 0; + +out_no_fs_ul: +out_no_fs: + s->s_fs_info = NULL; + kfree(sb); + return ret; +} + +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + struct efs_sb_info *sbi = SUPER_INFO(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ + buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ + buf->f_blocks = sbi->total_groups * /* total data blocks */ + (sbi->group_size - sbi->inode_blocks); + buf->f_bfree = sbi->data_free; /* free data blocks */ + buf->f_bavail = sbi->data_free; /* free blocks for non-root */ + buf->f_files = sbi->total_groups * /* total inodes */ + sbi->inode_blocks * + (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); + buf->f_ffree = sbi->inode_free; /* free inodes */ + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ + + return 0; +} + diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c new file mode 100644 
index 00000000..75117d0d --- /dev/null +++ b/fs/efs/symlink.c @@ -0,0 +1,54 @@ +/* + * symlink.c + * + * Copyright (c) 1999 Al Smith + * + * Portions derived from work (c) 1995,1996 Christian Vogelgsang. + */ + +#include +#include +#include +#include "efs.h" + +static int efs_symlink_readpage(struct file *file, struct page *page) +{ + char *link = kmap(page); + struct buffer_head * bh; + struct inode * inode = page->mapping->host; + efs_block_t size = inode->i_size; + int err; + + err = -ENAMETOOLONG; + if (size > 2 * EFS_BLOCKSIZE) + goto fail; + + /* read first 512 bytes of link target */ + err = -EIO; + bh = sb_bread(inode->i_sb, efs_bmap(inode, 0)); + if (!bh) + goto fail; + memcpy(link, bh->b_data, (size > EFS_BLOCKSIZE) ? EFS_BLOCKSIZE : size); + brelse(bh); + if (size > EFS_BLOCKSIZE) { + bh = sb_bread(inode->i_sb, efs_bmap(inode, 1)); + if (!bh) + goto fail; + memcpy(link + EFS_BLOCKSIZE, bh->b_data, size - EFS_BLOCKSIZE); + brelse(bh); + } + link[size] = '\0'; + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +const struct address_space_operations efs_symlink_aops = { + .readpage = efs_symlink_readpage +}; diff --git a/fs/eventfd.c b/fs/eventfd.c new file mode 100644 index 00000000..d9a59177 --- /dev/null +++ b/fs/eventfd.c @@ -0,0 +1,439 @@ +/* + * fs/eventfd.c + * + * Copyright (C) 2007 Davide Libenzi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct eventfd_ctx { + struct kref kref; + wait_queue_head_t wqh; + /* + * Every time that a write(2) is performed on an eventfd, the + * value of the __u64 being written is added to "count" and a + * wakeup is performed on "wqh". A read(2) will return the "count" + * value to userspace, and will reset "count" to zero. The kernel + * side eventfd_signal() also, adds to the "count" counter and + * issue a wakeup. + */ + __u64 count; + unsigned int flags; +}; + +/** + * eventfd_signal - Adds @n to the eventfd counter. + * @ctx: [in] Pointer to the eventfd context. + * @n: [in] Value of the counter to be added to the eventfd internal counter. + * The value cannot be negative. + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returining a POLLERR + * to poll(2). + * + * Returns @n in case of success, a non-negative number lower than @n in case + * of overflow, or the following error codes: + * + * -EINVAL : The value of @n is negative. + */ +int eventfd_signal(struct eventfd_ctx *ctx, int n) +{ + unsigned long flags; + + if (n < 0) + return -EINVAL; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ULLONG_MAX - ctx->count < n) + n = (int) (ULLONG_MAX - ctx->count); + ctx->count += n; + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLIN); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return n; +} +EXPORT_SYMBOL_GPL(eventfd_signal); + +static void eventfd_free_ctx(struct eventfd_ctx *ctx) +{ + kfree(ctx); +} + +static void eventfd_free(struct kref *kref) +{ + struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); + + eventfd_free_ctx(ctx); +} + +/** + * eventfd_ctx_get - Acquires a reference to the internal eventfd context. + * @ctx: [in] Pointer to the eventfd context. 
+ * + * Returns: In case of success, returns a pointer to the eventfd context. + */ +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx) +{ + kref_get(&ctx->kref); + return ctx; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_get); + +/** + * eventfd_ctx_put - Releases a reference to the internal eventfd context. + * @ctx: [in] Pointer to eventfd context. + * + * The eventfd context reference must have been previously acquired either + * with eventfd_ctx_get() or eventfd_ctx_fdget(). + */ +void eventfd_ctx_put(struct eventfd_ctx *ctx) +{ + kref_put(&ctx->kref, eventfd_free); +} +EXPORT_SYMBOL_GPL(eventfd_ctx_put); + +static int eventfd_release(struct inode *inode, struct file *file) +{ + struct eventfd_ctx *ctx = file->private_data; + + wake_up_poll(&ctx->wqh, POLLHUP); + eventfd_ctx_put(ctx); + return 0; +} + +static unsigned int eventfd_poll(struct file *file, poll_table *wait) +{ + struct eventfd_ctx *ctx = file->private_data; + unsigned int events = 0; + unsigned long flags; + + poll_wait(file, &ctx->wqh, wait); + + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->count > 0) + events |= POLLIN; + if (ctx->count == ULLONG_MAX) + events |= POLLERR; + if (ULLONG_MAX - 1 > ctx->count) + events |= POLLOUT; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return events; +} + +static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= *cnt; +} + +/** + * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. + * @ctx: [in] Pointer to eventfd context. + * @wait: [in] Wait queue to be removed. + * @cnt: [out] Pointer to the 64-bit counter value. + * + * Returns %0 if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked. + * + * This is used to atomically remove a wait queue entry from the eventfd wait + * queue head, and read/reset the counter value. + */ +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + eventfd_ctx_do_read(ctx, cnt); + __remove_wait_queue(&ctx->wqh, wait); + if (*cnt != 0 && waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return *cnt != 0 ? 0 : -EAGAIN; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); + +/** + * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. + * @ctx: [in] Pointer to eventfd context. + * @no_wait: [in] Different from zero if the operation should not block. + * @cnt: [out] Pointer to the 64-bit counter value. + * + * Returns %0 if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked but @no_wait was non-zero. + * -ERESTARTSYS : A signal interrupted the wait operation. + * + * If @no_wait is zero, the function might sleep until the eventfd internal + * counter becomes greater than zero. 
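+ *
+ * As a userspace analogy of the counter semantics (illustrative only,
+ * using the eventfd(2) syscall rather than this kernel-internal helper):
+ *
+ *   int efd = eventfd(0, 0);
+ *   uint64_t v = 3;
+ *   write(efd, &v, sizeof(v));      counter is now 3
+ *   write(efd, &v, sizeof(v));      counter is now 6
+ *   read(efd, &v, sizeof(v));       v == 6, counter resets to 0
+ *
+ * With EFD_SEMAPHORE set, each read() would instead return 1 and only
+ * decrement the counter by one, mirroring eventfd_ctx_do_read() above.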
+ */ +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) +{ + ssize_t res; + DECLARE_WAITQUEUE(wait, current); + + spin_lock_irq(&ctx->wqh.lock); + *cnt = 0; + res = -EAGAIN; + if (ctx->count > 0) + res = 0; + else if (!no_wait) { + __add_wait_queue(&ctx->wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (ctx->count > 0) { + res = 0; + break; + } + if (signal_pending(current)) { + res = -ERESTARTSYS; + break; + } + spin_unlock_irq(&ctx->wqh.lock); + schedule(); + spin_lock_irq(&ctx->wqh.lock); + } + __remove_wait_queue(&ctx->wqh, &wait); + __set_current_state(TASK_RUNNING); + } + if (likely(res == 0)) { + eventfd_ctx_do_read(ctx, cnt); + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + } + spin_unlock_irq(&ctx->wqh.lock); + + return res; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_read); + +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 cnt; + + if (count < sizeof(cnt)) + return -EINVAL; + res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); + if (res < 0) + return res; + + return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt); +} + +static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 ucnt; + DECLARE_WAITQUEUE(wait, current); + + if (count < sizeof(ucnt)) + return -EINVAL; + if (copy_from_user(&ucnt, buf, sizeof(ucnt))) + return -EFAULT; + if (ucnt == ULLONG_MAX) + return -EINVAL; + spin_lock_irq(&ctx->wqh.lock); + res = -EAGAIN; + if (ULLONG_MAX - ctx->count > ucnt) + res = sizeof(ucnt); + else if (!(file->f_flags & O_NONBLOCK)) { + __add_wait_queue(&ctx->wqh, &wait); + for (res = 0;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (ULLONG_MAX - ctx->count > ucnt) { + res = sizeof(ucnt); + break; + } + if (signal_pending(current)) { + res = -ERESTARTSYS; + break; + } + spin_unlock_irq(&ctx->wqh.lock); + schedule(); + spin_lock_irq(&ctx->wqh.lock); + } + __remove_wait_queue(&ctx->wqh, &wait); + __set_current_state(TASK_RUNNING); + } + if (likely(res > 0)) { + ctx->count += ucnt; + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLIN); + } + spin_unlock_irq(&ctx->wqh.lock); + + return res; +} + +static const struct file_operations eventfd_fops = { + .release = eventfd_release, + .poll = eventfd_poll, + .read = eventfd_read, + .write = eventfd_write, + .llseek = noop_llseek, +}; + +/** + * eventfd_fget - Acquire a reference of an eventfd file descriptor. + * @fd: [in] Eventfd file descriptor. + * + * Returns a pointer to the eventfd file structure in case of success, or the + * following error pointer: + * + * -EBADF : Invalid @fd file descriptor. + * -EINVAL : The @fd file descriptor is not an eventfd file. + */ +struct file *eventfd_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + if (file->f_op != &eventfd_fops) { + fput(file); + return ERR_PTR(-EINVAL); + } + + return file; +} +EXPORT_SYMBOL_GPL(eventfd_fget); + +/** + * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context. + * @fd: [in] Eventfd file descriptor. 
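+ *
+ * Typical in-kernel usage (a minimal sketch, not taken from any
+ * particular driver): resolve a userspace-supplied fd once, keep the
+ * context, and signal it later from a path that must not sleep:
+ *
+ *   struct eventfd_ctx *trigger = eventfd_ctx_fdget(fd);
+ *   if (IS_ERR(trigger))
+ *           return PTR_ERR(trigger);
+ *   ...
+ *   eventfd_signal(trigger, 1);
+ *   ...
+ *   eventfd_ctx_put(trigger);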
+ * + * Returns a pointer to the internal eventfd context, otherwise the error + * pointers returned by the following functions: + * + * eventfd_fget + */ +struct eventfd_ctx *eventfd_ctx_fdget(int fd) +{ + struct file *file; + struct eventfd_ctx *ctx; + + file = eventfd_fget(fd); + if (IS_ERR(file)) + return (struct eventfd_ctx *) file; + ctx = eventfd_ctx_get(file->private_data); + fput(file); + + return ctx; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); + +/** + * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context. + * @file: [in] Eventfd file pointer. + * + * Returns a pointer to the internal eventfd context, otherwise the error + * pointer: + * + * -EINVAL : The @fd file descriptor is not an eventfd file. + */ +struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) +{ + if (file->f_op != &eventfd_fops) + return ERR_PTR(-EINVAL); + + return eventfd_ctx_get(file->private_data); +} +EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); + +/** + * eventfd_file_create - Creates an eventfd file pointer. + * @count: Initial eventfd counter value. + * @flags: Flags for the eventfd file. + * + * This function creates an eventfd file pointer, w/out installing it into + * the fd table. This is useful when the eventfd file is used during the + * initialization of data structures that require extra setup after the eventfd + * creation. So the eventfd creation is split into the file pointer creation + * phase, and the file descriptor installation phase. + * In this way races with userspace closing the newly installed file descriptor + * can be avoided. + * Returns an eventfd file pointer, or a proper error pointer. + */ +struct file *eventfd_file_create(unsigned int count, int flags) +{ + struct file *file; + struct eventfd_ctx *ctx; + + /* Check the EFD_* constants for consistency. */ + BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~EFD_FLAGS_SET) + return ERR_PTR(-EINVAL); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + kref_init(&ctx->kref); + init_waitqueue_head(&ctx->wqh); + ctx->count = count; + ctx->flags = flags; + + file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, + O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); + if (IS_ERR(file)) + eventfd_free_ctx(ctx); + + return file; +} + +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) +{ + int fd, error; + struct file *file; + + error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS); + if (error < 0) + return error; + fd = error; + + file = eventfd_file_create(count, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + + return error; +} + +SYSCALL_DEFINE1(eventfd, unsigned int, count) +{ + return sys_eventfd2(count, 0); +} + diff --git a/fs/eventpoll.c b/fs/eventpoll.c new file mode 100644 index 00000000..35a852a2 --- /dev/null +++ b/fs/eventpoll.c @@ -0,0 +1,1818 @@ +/* + * fs/eventpoll.c (Efficient event retrieval implementation) + * Copyright (C) 2001,...,2009 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Davide Libenzi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * LOCKING: + * There are three level of locking required by epoll : + * + * 1) epmutex (mutex) + * 2) ep->mtx (mutex) + * 3) ep->lock (spinlock) + * + * The acquire order is the one listed above, from 1 to 3. + * We need a spinlock (ep->lock) because we manipulate objects + * from inside the poll callback, that might be triggered from + * a wake_up() that in turn might be called from IRQ context. + * So we can't sleep inside the poll callback and hence we need + * a spinlock. During the event transfer loop (from kernel to + * user space) we could end up sleeping due a copy_to_user(), so + * we need a lock that will allow us to sleep. This lock is a + * mutex (ep->mtx). It is acquired during the event transfer loop, + * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). + * Then we also need a global mutex to serialize eventpoll_release_file() + * and ep_free(). + * This mutex is acquired by ep_free() during the epoll file + * cleanup path and it is also acquired by eventpoll_release_file() + * if a file has been pushed inside an epoll set and it is then + * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). + * It is also acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. We need a global mutex to prevent two + * simultaneous inserts (A into B and B into A) from racing and + * constructing a cycle without either insert observing that it is + * going to. + * It is necessary to acquire multiple "ep->mtx"es at once in the + * case when one epoll fd is added to another. In this case, we + * always acquire the locks in the order of nesting (i.e. after + * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired + * before e2->mtx). Since we disallow cycles of epoll file + * descriptors, this ensures that the mutexes are well-ordered. In + * order to communicate this nesting to lockdep, when walking a tree + * of epoll file descriptors, we use the current recursion depth as + * the lockdep subkey. + * It is possible to drop the "ep->mtx" and to use the global + * mutex "epmutex" (together with "ep->lock") to have it working, + * but having "ep->mtx" will make the interface more scalable. + * Events that require holding "epmutex" are very rare, while for + * normal operations the epoll private "ep->mtx" will guarantee + * a better scalability. + */ + +/* Epoll private bits inside the event mask */ +#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) + +/* Maximum number of nesting allowed inside epoll sets */ +#define EP_MAX_NESTS 4 + +#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) + +#define EP_UNACTIVE_PTR ((void *) -1L) + +#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) + +struct epoll_filefd { + struct file *file; + int fd; +}; + +/* + * Structure used to track possible nested calls, for too deep recursions + * and loop cycles. + */ +struct nested_call_node { + struct list_head llink; + void *cookie; + void *ctx; +}; + +/* + * This structure is used as collector for nested calls, to check for + * maximum recursion dept and loop cycles. 
+ */ +struct nested_calls { + struct list_head tasks_call_list; + spinlock_t lock; +}; + +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the "rbr" RB tree. + */ +struct epitem { + /* RB tree node used to link this structure to the eventpoll RB tree */ + struct rb_node rbn; + + /* List header used to link this structure to the eventpoll ready list */ + struct list_head rdllink; + + /* + * Works together "struct eventpoll"->ovflist in keeping the + * single linked chain of items. + */ + struct epitem *next; + + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + + /* Number of active wait queue attached to poll operations */ + int nwait; + + /* List containing poll wait queues */ + struct list_head pwqlist; + + /* The "container" of this item */ + struct eventpoll *ep; + + /* List header used to link this item to the "struct file" items list */ + struct list_head fllink; + + /* The structure that describe the interested events and the source fd */ + struct epoll_event event; +}; + +/* + * This structure is stored inside the "private_data" member of the file + * structure and represents the main data structure for the eventpoll + * interface. + */ +struct eventpoll { + /* Protect the access to this structure */ + spinlock_t lock; + + /* + * This mutex is used to ensure that files are not removed + * while epoll is using them. This is held during the event + * collection loop, the file cleanup path, the epoll file exit + * code and the ctl operations. + */ + struct mutex mtx; + + /* Wait queue used by sys_epoll_wait() */ + wait_queue_head_t wq; + + /* Wait queue used by file->poll() */ + wait_queue_head_t poll_wait; + + /* List of ready file descriptors */ + struct list_head rdllist; + + /* RB tree root used to store monitored fd structs */ + struct rb_root rbr; + + /* + * This is a single linked list that chains all the "struct epitem" that + * happened while transferring ready events to userspace w/out + * holding ->lock. + */ + struct epitem *ovflist; + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; +}; + +/* Wait structure used by the poll hooks */ +struct eppoll_entry { + /* List header used to link this structure to the "struct epitem" */ + struct list_head llink; + + /* The "base" pointer is set to the container "struct epitem" */ + struct epitem *base; + + /* + * Wait queue item that will be linked to the target file wait + * queue head. + */ + wait_queue_t wait; + + /* The wait queue head that linked the "wait" wait queue item */ + wait_queue_head_t *whead; +}; + +/* Wrapper struct used by poll queueing */ +struct ep_pqueue { + poll_table pt; + struct epitem *epi; +}; + +/* Used by the ep_send_events() function as callback private data */ +struct ep_send_events_data { + int maxevents; + struct epoll_event __user *events; +}; + +/* + * Configuration options available inside /proc/sys/fs/epoll/ + */ +/* Maximum number of epoll watched descriptors, per user */ +static long max_user_watches __read_mostly; + +/* + * This mutex is used to serialize ep_free() and eventpoll_release_file(). 
+ */ +static DEFINE_MUTEX(epmutex); + +/* Used to check for epoll file descriptor inclusion loops */ +static struct nested_calls poll_loop_ncalls; + +/* Used for safe wake up implementation */ +static struct nested_calls poll_safewake_ncalls; + +/* Used to call file's f_op->poll() under the nested calls boundaries */ +static struct nested_calls poll_readywalk_ncalls; + +/* Slab cache used to allocate "struct epitem" */ +static struct kmem_cache *epi_cache __read_mostly; + +/* Slab cache used to allocate "struct eppoll_entry" */ +static struct kmem_cache *pwq_cache __read_mostly; + +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. + */ +static LIST_HEAD(tfile_check_list); + +#ifdef CONFIG_SYSCTL + +#include + +static long zero; +static long long_max = LONG_MAX; + +ctl_table epoll_table[] = { + { + .procname = "max_user_watches", + .data = &max_user_watches, + .maxlen = sizeof(max_user_watches), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero, + .extra2 = &long_max, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} + +/* Setup the structure that is used as key for the RB tree */ +static inline void ep_set_ffd(struct epoll_filefd *ffd, + struct file *file, int fd) +{ + ffd->file = file; + ffd->fd = fd; +} + +/* Compare RB tree keys */ +static inline int ep_cmp_ffd(struct epoll_filefd *p1, + struct epoll_filefd *p2) +{ + return (p1->file > p2->file ? +1: + (p1->file < p2->file ? -1 : p1->fd - p2->fd)); +} + +/* Tells us if the item is currently linked */ +static inline int ep_is_linked(struct list_head *p) +{ + return !list_empty(p); +} + +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait); +} + +/* Get the "struct epitem" from a wait queue pointer */ +static inline struct epitem *ep_item_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait)->base; +} + +/* Get the "struct epitem" from an epoll queue wrapper */ +static inline struct epitem *ep_item_from_epqueue(poll_table *p) +{ + return container_of(p, struct ep_pqueue, pt)->epi; +} + +/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ +static inline int ep_op_has_event(int op) +{ + return op != EPOLL_CTL_DEL; +} + +/* Initialize the poll safe wake up structure */ +static void ep_nested_calls_init(struct nested_calls *ncalls) +{ + INIT_LIST_HEAD(&ncalls->tasks_call_list); + spin_lock_init(&ncalls->lock); +} + +/** + * ep_events_available - Checks if ready events might be available. + * + * @ep: Pointer to the eventpoll context. + * + * Returns: Returns a value different than zero if ready events are available, + * or zero otherwise. + */ +static inline int ep_events_available(struct eventpoll *ep) +{ + return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; +} + +/** + * ep_call_nested - Perform a bound (possibly) nested call, by checking + * that the recursion limit is not exceeded, and that + * the same nested call (by the meaning of same cookie) is + * no re-entered. + * + * @ncalls: Pointer to the nested_calls structure to be used for this call. + * @max_nests: Maximum number of allowed nesting calls. 
+ * @nproc: Nested call core function pointer. + * @priv: Opaque data to be passed to the @nproc callback. + * @cookie: Cookie to be used to identify this nested call. + * @ctx: This instance context. + * + * Returns: Returns the code returned by the @nproc callback, or -1 if + * the maximum recursion limit has been exceeded. + */ +static int ep_call_nested(struct nested_calls *ncalls, int max_nests, + int (*nproc)(void *, void *, int), void *priv, + void *cookie, void *ctx) +{ + int error, call_nests = 0; + unsigned long flags; + struct list_head *lsthead = &ncalls->tasks_call_list; + struct nested_call_node *tncur; + struct nested_call_node tnode; + + spin_lock_irqsave(&ncalls->lock, flags); + + /* + * Try to see if the current task is already inside this wakeup call. + * We use a list here, since the population inside this set is always + * very much limited. + */ + list_for_each_entry(tncur, lsthead, llink) { + if (tncur->ctx == ctx && + (tncur->cookie == cookie || ++call_nests > max_nests)) { + /* + * Ops ... loop detected or maximum nest level reached. + * We abort this wake by breaking the cycle itself. + */ + error = -1; + goto out_unlock; + } + } + + /* Add the current task and cookie to the list */ + tnode.ctx = ctx; + tnode.cookie = cookie; + list_add(&tnode.llink, lsthead); + + spin_unlock_irqrestore(&ncalls->lock, flags); + + /* Call the nested function */ + error = (*nproc)(priv, cookie, call_nests); + + /* Remove the current task from the list */ + spin_lock_irqsave(&ncalls->lock, flags); + list_del(&tnode.llink); +out_unlock: + spin_unlock_irqrestore(&ncalls->lock, flags); + + return error; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + unsigned long flags; + + spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); + wake_up_locked_poll(wqueue, events); + spin_unlock_irqrestore(&wqueue->lock, flags); +} +#else +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + wake_up_poll(wqueue, events); +} +#endif + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) +{ + ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, + 1 + call_nests); + return 0; +} + +/* + * Perform a safe wake up of the poll wait list. The problem is that + * with the new callback'd wake up system, it is possible that the + * poll callback is reentered from inside the call to wake_up() done + * on the poll wait queue head. The rule is that we cannot reenter the + * wake up code from the same task more than EP_MAX_NESTS times, + * and we cannot reenter the same wait queue head at all. This will + * enable to have a hierarchy of epoll file descriptor of no more than + * EP_MAX_NESTS deep. + */ +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + int this_cpu = get_cpu(); + + ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, + ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); + + put_cpu(); +} + +static void ep_remove_wait_queue(struct eppoll_entry *pwq) +{ + wait_queue_head_t *whead; + + rcu_read_lock(); + /* If it is cleared by POLLFREE, it should be rcu-safe */ + whead = rcu_dereference(pwq->whead); + if (whead) + remove_wait_queue(whead, &pwq->wait); + rcu_read_unlock(); +} + +/* + * This function unregisters poll callbacks from the associated file + * descriptor. Must be called with "mtx" held (or "epmutex" if called from + * ep_free). 
+ */ +static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) +{ + struct list_head *lsthead = &epi->pwqlist; + struct eppoll_entry *pwq; + + while (!list_empty(lsthead)) { + pwq = list_first_entry(lsthead, struct eppoll_entry, llink); + + list_del(&pwq->llink); + ep_remove_wait_queue(pwq); + kmem_cache_free(pwq_cache, pwq); + } +} + +/** + * ep_scan_ready_list - Scans the ready list in a way that makes possible for + * the scan code, to call f_op->poll(). Also allows for + * O(NumReady) performance. + * + * @ep: Pointer to the epoll private data structure. + * @sproc: Pointer to the scan callback. + * @priv: Private opaque data passed to the @sproc callback. + * @depth: The current depth of recursive f_op->poll calls. + * + * Returns: The same integer error code returned by the @sproc callback. + */ +static int ep_scan_ready_list(struct eventpoll *ep, + int (*sproc)(struct eventpoll *, + struct list_head *, void *), + void *priv, + int depth) +{ + int error, pwake = 0; + unsigned long flags; + struct epitem *epi, *nepi; + LIST_HEAD(txlist); + + /* + * We need to lock this because we could be hit by + * eventpoll_release_file() and epoll_ctl(). + */ + mutex_lock_nested(&ep->mtx, depth); + + /* + * Steal the ready list, and re-init the original one to the + * empty list. Also, set ep->ovflist to NULL so that events + * happening while looping w/out locks, are not lost. We cannot + * have the poll callback to queue directly on ep->rdllist, + * because we want the "sproc" callback to be able to do it + * in a lockless way. + */ + spin_lock_irqsave(&ep->lock, flags); + list_splice_init(&ep->rdllist, &txlist); + ep->ovflist = NULL; + spin_unlock_irqrestore(&ep->lock, flags); + + /* + * Now call the callback function. + */ + error = (*sproc)(ep, &txlist, priv); + + spin_lock_irqsave(&ep->lock, flags); + /* + * During the time we spent inside the "sproc" callback, some + * other events might have been queued by the poll callback. + * We re-insert them inside the main ready-list here. + */ + for (nepi = ep->ovflist; (epi = nepi) != NULL; + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { + /* + * We need to check if the item is already in the list. + * During the "sproc" callback execution time, items are + * queued into ->ovflist but the "txlist" might already + * contain them, and the list_splice() below takes care of them. + */ + if (!ep_is_linked(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); + } + /* + * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after + * releasing the lock, events will be queued in the normal way inside + * ep->rdllist. + */ + ep->ovflist = EP_UNACTIVE_PTR; + + /* + * Quickly re-inject items left on "txlist". + */ + list_splice(&txlist, &ep->rdllist); + + if (!list_empty(&ep->rdllist)) { + /* + * Wake up (if active) both the eventpoll wait list and + * the ->poll() wait list (delayed after we release the lock). + */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + spin_unlock_irqrestore(&ep->lock, flags); + + mutex_unlock(&ep->mtx); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return error; +} + +/* + * Removes a "struct epitem" from the eventpoll RB tree and deallocates + * all the associated resources. Must be called with "mtx" held. 
+ */ +static int ep_remove(struct eventpoll *ep, struct epitem *epi) +{ + unsigned long flags; + struct file *file = epi->ffd.file; + + /* + * Removes poll wait queue hooks. We _have_ to do this without holding + * the "ep->lock" otherwise a deadlock might occur. This because of the + * sequence of the lock acquisition. Here we do "ep->lock" then the wait + * queue head lock when unregistering the wait queue. The wakeup callback + * will run by holding the wait queue head lock and will call our callback + * that will try to get "ep->lock". + */ + ep_unregister_pollwait(ep, epi); + + /* Remove the current item from the list of epoll hooks */ + spin_lock(&file->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&file->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + + spin_lock_irqsave(&ep->lock, flags); + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + spin_unlock_irqrestore(&ep->lock, flags); + + /* At this point it is safe to free the eventpoll item */ + kmem_cache_free(epi_cache, epi); + + atomic_long_dec(&ep->user->epoll_watches); + + return 0; +} + +static void ep_free(struct eventpoll *ep) +{ + struct rb_node *rbp; + struct epitem *epi; + + /* We need to release all tasks waiting for these file */ + if (waitqueue_active(&ep->poll_wait)) + ep_poll_safewake(&ep->poll_wait); + + /* + * We need to lock this because we could be hit by + * eventpoll_release_file() while we're freeing the "struct eventpoll". + * We do not need to hold "ep->mtx" here because the epoll file + * is on the way to be removed and no one has references to it + * anymore. The only hit might come from eventpoll_release_file() but + * holding "epmutex" is sufficient here. + */ + mutex_lock(&epmutex); + + /* + * Walks through the whole tree by unregistering poll callbacks. + */ + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + + ep_unregister_pollwait(ep, epi); + } + + /* + * Walks through the whole tree by freeing each "struct epitem". At this + * point we are sure no poll callbacks will be lingering around, and also by + * holding "epmutex" we can be sure that no file cleanup code will hit + * us during this operation. So we can avoid the lock on "ep->lock". + */ + while ((rbp = rb_first(&ep->rbr)) != NULL) { + epi = rb_entry(rbp, struct epitem, rbn); + ep_remove(ep, epi); + } + + mutex_unlock(&epmutex); + mutex_destroy(&ep->mtx); + free_uid(ep->user); + kfree(ep); +} + +static int ep_eventpoll_release(struct inode *inode, struct file *file) +{ + struct eventpoll *ep = file->private_data; + + if (ep) + ep_free(ep); + + return 0; +} + +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) +{ + struct epitem *epi, *tmp; + + list_for_each_entry_safe(epi, tmp, head, rdllink) { + if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events) + return POLLIN | POLLRDNORM; + else { + /* + * Item has been dropped into the ready list by the poll + * callback, but it's not actually ready, as far as + * caller requested events goes. We can remove it here. 
+ */ + list_del_init(&epi->rdllink); + } + } + + return 0; +} + +static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) +{ + return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1); +} + +static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) +{ + int pollflags; + struct eventpoll *ep = file->private_data; + + /* Insert inside our poll wait queue */ + poll_wait(file, &ep->poll_wait, wait); + + /* + * Proceed to find out if wanted events are really available inside + * the ready list. This need to be done under ep_call_nested() + * supervision, since the call to f_op->poll() done on listed files + * could re-enter here. + */ + pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, + ep_poll_readyevents_proc, ep, ep, current); + + return pollflags != -1 ? pollflags : 0; +} + +/* File callbacks that implement the eventpoll file behaviour */ +static const struct file_operations eventpoll_fops = { + .release = ep_eventpoll_release, + .poll = ep_eventpoll_poll, + .llseek = noop_llseek, +}; + +/* + * This is called from eventpoll_release() to unlink files from the eventpoll + * interface. We need to have this facility to cleanup correctly files that are + * closed without being removed from the eventpoll interface. + */ +void eventpoll_release_file(struct file *file) +{ + struct list_head *lsthead = &file->f_ep_links; + struct eventpoll *ep; + struct epitem *epi; + + /* + * We don't want to get "file->f_lock" because it is not + * necessary. It is not necessary because we're in the "struct file" + * cleanup path, and this means that no one is using this file anymore. + * So, for example, epoll_ctl() cannot hit here since if we reach this + * point, the file counter already went to zero and fget() would fail. + * The only hit might come from ep_free() but by holding the mutex + * will correctly serialize the operation. We do need to acquire + * "ep->mtx" after "epmutex" because ep_remove() requires it when called + * from anywhere but ep_free(). + * + * Besides, ep_remove() acquires the lock, so we can't hold it here. + */ + mutex_lock(&epmutex); + + while (!list_empty(lsthead)) { + epi = list_first_entry(lsthead, struct epitem, fllink); + + ep = epi->ep; + list_del_init(&epi->fllink); + mutex_lock_nested(&ep->mtx, 0); + ep_remove(ep, epi); + mutex_unlock(&ep->mtx); + } + + mutex_unlock(&epmutex); +} + +static int ep_alloc(struct eventpoll **pep) +{ + int error; + struct user_struct *user; + struct eventpoll *ep; + + user = get_current_user(); + error = -ENOMEM; + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (unlikely(!ep)) + goto free_uid; + + spin_lock_init(&ep->lock); + mutex_init(&ep->mtx); + init_waitqueue_head(&ep->wq); + init_waitqueue_head(&ep->poll_wait); + INIT_LIST_HEAD(&ep->rdllist); + ep->rbr = RB_ROOT; + ep->ovflist = EP_UNACTIVE_PTR; + ep->user = user; + + *pep = ep; + + return 0; + +free_uid: + free_uid(user); + return error; +} + +/* + * Search the file inside the eventpoll tree. The RB tree operations + * are protected by the "mtx" mutex, and ep_find() must be called with + * "mtx" held. 
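+ *
+ * The tree is keyed by ep_cmp_ffd(): items are ordered first by the
+ * struct file pointer value and then by the file descriptor number,
+ * so a lookup has to use the same (file, fd) pair that was stored by
+ * ep_rbtree_insert() when the descriptor was added.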
+ */ +static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) +{ + int kcmp; + struct rb_node *rbp; + struct epitem *epi, *epir = NULL; + struct epoll_filefd ffd; + + ep_set_ffd(&ffd, file, fd); + for (rbp = ep->rbr.rb_node; rbp; ) { + epi = rb_entry(rbp, struct epitem, rbn); + kcmp = ep_cmp_ffd(&ffd, &epi->ffd); + if (kcmp > 0) + rbp = rbp->rb_right; + else if (kcmp < 0) + rbp = rbp->rb_left; + else { + epir = epi; + break; + } + } + + return epir; +} + +/* + * This is the callback that is passed to the wait queue wakeup + * mechanism. It is called by the stored file descriptors when they + * have events to report. + */ +static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int pwake = 0; + unsigned long flags; + struct epitem *epi = ep_item_from_wait(wait); + struct eventpoll *ep = epi->ep; + + if ((unsigned long)key & POLLFREE) { + ep_pwq_from_wait(wait)->whead = NULL; + /* + * whead = NULL above can race with ep_remove_wait_queue() + * which can do another remove_wait_queue() after us, so we + * can't use __remove_wait_queue(). whead->lock is held by + * the caller. + */ + list_del_init(&wait->task_list); + } + + spin_lock_irqsave(&ep->lock, flags); + + /* + * If the event mask does not contain any poll(2) event, we consider the + * descriptor to be disabled. This condition is likely the effect of the + * EPOLLONESHOT bit that disables the descriptor when an event is received, + * until the next EPOLL_CTL_MOD will be issued. + */ + if (!(epi->event.events & ~EP_PRIVATE_BITS)) + goto out_unlock; + + /* + * Check the events coming with the callback. At this stage, not + * every device reports the events in the "key" parameter of the + * callback. We need to be able to handle both cases here, hence the + * test for "key" != NULL before the event match test. + */ + if (key && !((unsigned long) key & epi->event.events)) + goto out_unlock; + + /* + * If we are transferring events to userspace, we can hold no locks + * (because we're accessing user memory, and because of linux f_op->poll() + * semantics). All the events that happen during that period of time are + * chained in ep->ovflist and requeued later on. + */ + if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { + if (epi->next == EP_UNACTIVE_PTR) { + epi->next = ep->ovflist; + ep->ovflist = epi; + } + goto out_unlock; + } + + /* If this file is already in the ready list we exit soon */ + if (!ep_is_linked(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); + + /* + * Wake up ( if active ) both the eventpoll wait list and the ->poll() + * wait list. + */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + +out_unlock: + spin_unlock_irqrestore(&ep->lock, flags); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return 1; +} + +/* + * This is the callback that is used to add our wait queue to the + * target file wakeup lists. 
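+ *
+ * The path that leads here is, roughly: ep_insert() initializes a
+ * struct ep_pqueue with init_poll_funcptr(&epq.pt, ep_ptable_queue_proc)
+ * and calls tfile->f_op->poll(tfile, &epq.pt); the target file's poll
+ * method then calls poll_wait() on its own wait queue head, which
+ * dispatches to this callback so that ep_poll_callback() can be hooked
+ * onto that wait queue.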
+ */ +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt) +{ + struct epitem *epi = ep_item_from_epqueue(pt); + struct eppoll_entry *pwq; + + if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { + init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); + pwq->whead = whead; + pwq->base = epi; + add_wait_queue(whead, &pwq->wait); + list_add_tail(&pwq->llink, &epi->pwqlist); + epi->nwait++; + } else { + /* We have to signal that an error occurred */ + epi->nwait = -1; + } +} + +static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) +{ + int kcmp; + struct rb_node **p = &ep->rbr.rb_node, *parent = NULL; + struct epitem *epic; + + while (*p) { + parent = *p; + epic = rb_entry(parent, struct epitem, rbn); + kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); + if (kcmp > 0) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&epi->rbn, parent, p); + rb_insert_color(&epi->rbn, &ep->rbr); +} + + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. + */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + /* Allow an arbitrary number of depth 1 paths */ + if (nests == 0) + return 0; + + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + list_for_each_entry(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. 
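+ *
+ * In other words: for each file on tfile_check_list, the check walks
+ * backwards through every epoll file that watches it, and recursively
+ * through epoll files watching those, up to EP_MAX_NESTS levels,
+ * counting one entry in path_count[] per route found; the pending
+ * EPOLL_CTL_ADD is rejected as soon as path_count_inc() reports that a
+ * depth has exceeded its limit in path_limits[].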
+ */ +static int reverse_path_check(void) +{ + int length = 0; + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + length++; + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + +/* + * Must be called with "mtx" held. + */ +static int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd) +{ + int error, revents, pwake = 0; + unsigned long flags; + long user_watches; + struct epitem *epi; + struct ep_pqueue epq; + + user_watches = atomic_long_read(&ep->user->epoll_watches); + if (unlikely(user_watches >= max_user_watches)) + return -ENOSPC; + if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) + return -ENOMEM; + + /* Item initialization follow here ... */ + INIT_LIST_HEAD(&epi->rdllink); + INIT_LIST_HEAD(&epi->fllink); + INIT_LIST_HEAD(&epi->pwqlist); + epi->ep = ep; + ep_set_ffd(&epi->ffd, tfile, fd); + epi->event = *event; + epi->nwait = 0; + epi->next = EP_UNACTIVE_PTR; + + /* Initialize the poll table using the queue callback */ + epq.epi = epi; + init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); + + /* + * Attach the item to the poll hooks and get current event bits. + * We can safely use the file* here because its usage count has + * been increased by the caller of this function. Note that after + * this operation completes, the poll callback can start hitting + * the new item. + */ + revents = tfile->f_op->poll(tfile, &epq.pt); + + /* + * We have to check if something went wrong during the poll wait queue + * install process. Namely an allocation for a wait queue failed due + * high memory pressure. + */ + error = -ENOMEM; + if (epi->nwait < 0) + goto error_unregister; + + /* Add the current item to the list of active epoll hook for this file */ + spin_lock(&tfile->f_lock); + list_add_tail(&epi->fllink, &tfile->f_ep_links); + spin_unlock(&tfile->f_lock); + + /* + * Add the current item to the RB tree. All RB tree operations are + * protected by "mtx", and ep_insert() is called with "mtx" held. + */ + ep_rbtree_insert(ep, epi); + + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (reverse_path_check()) + goto error_remove_epi; + + /* We have to drop the new item inside our item list to keep track of it */ + spin_lock_irqsave(&ep->lock, flags); + + /* If the file is already "ready" we drop it inside the ready list */ + if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + + /* Notify waiting tasks that events are available */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + + spin_unlock_irqrestore(&ep->lock, flags); + + atomic_long_inc(&ep->user->epoll_watches); + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return 0; + +error_remove_epi: + spin_lock(&tfile->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + +error_unregister: + ep_unregister_pollwait(ep, epi); + + /* + * We need to do this because an event could have been arrived on some + * allocated wait queue. Note that we don't care about the ep->ovflist + * list, since that is used/cleaned only inside a section bound by "mtx". 
+ * And ep_insert() is called with "mtx" held. + */ + spin_lock_irqsave(&ep->lock, flags); + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + spin_unlock_irqrestore(&ep->lock, flags); + + kmem_cache_free(epi_cache, epi); + + return error; +} + +/* + * Modify the interest event mask by dropping an event if the new mask + * has a match in the current file status. Must be called with "mtx" held. + */ +static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event) +{ + int pwake = 0; + unsigned int revents; + + /* + * Set the new event interest mask before calling f_op->poll(); + * otherwise we might miss an event that happens between the + * f_op->poll() call and the new event set registering. + */ + epi->event.events = event->events; + epi->event.data = event->data; /* protected by mtx */ + + /* + * Get current event bits. We can safely use the file* here because + * its usage count has been increased by the caller of this function. + */ + revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); + + /* + * If the item is "hot" and it is not registered inside the ready + * list, push it inside. + */ + if (revents & event->events) { + spin_lock_irq(&ep->lock); + if (!ep_is_linked(&epi->rdllink)) { + list_add_tail(&epi->rdllink, &ep->rdllist); + + /* Notify waiting tasks that events are available */ + if (waitqueue_active(&ep->wq)) + wake_up_locked(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + pwake++; + } + spin_unlock_irq(&ep->lock); + } + + /* We have to call this outside the lock */ + if (pwake) + ep_poll_safewake(&ep->poll_wait); + + return 0; +} + +static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) +{ + struct ep_send_events_data *esed = priv; + int eventcnt; + unsigned int revents; + struct epitem *epi; + struct epoll_event __user *uevent; + + /* + * We can loop without lock because we are passed a task private list. + * Items cannot vanish during the loop because ep_scan_ready_list() is + * holding "mtx" during this call. + */ + for (eventcnt = 0, uevent = esed->events; + !list_empty(head) && eventcnt < esed->maxevents;) { + epi = list_first_entry(head, struct epitem, rdllink); + + list_del_init(&epi->rdllink); + + revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events; + + /* + * If the event mask intersect the caller-requested one, + * deliver the event to userspace. Again, ep_scan_ready_list() + * is holding "mtx", so no operations coming from userspace + * can change the item. + */ + if (revents) { + if (__put_user(revents, &uevent->events) || + __put_user(epi->event.data, &uevent->data)) { + list_add(&epi->rdllink, head); + return eventcnt ? eventcnt : -EFAULT; + } + eventcnt++; + uevent++; + if (epi->event.events & EPOLLONESHOT) + epi->event.events &= EP_PRIVATE_BITS; + else if (!(epi->event.events & EPOLLET)) { + /* + * If this file has been added with Level + * Trigger mode, we need to insert back inside + * the ready list, so that the next call to + * epoll_wait() will check again the events + * availability. At this point, no one can insert + * into ep->rdllist besides us. The epoll_ctl() + * callers are locked out by + * ep_scan_ready_list() holding "mtx" and the + * poll callback will queue them in ep->ovflist. 
+ */ + list_add_tail(&epi->rdllink, &ep->rdllist); + } + } + } + + return eventcnt; +} + +static int ep_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) +{ + struct ep_send_events_data esed; + + esed.maxevents = maxevents; + esed.events = events; + + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0); +} + +static inline struct timespec ep_set_mstimeout(long ms) +{ + struct timespec now, ts = { + .tv_sec = ms / MSEC_PER_SEC, + .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), + }; + + ktime_get_ts(&now); + return timespec_add_safe(now, ts); +} + +/** + * ep_poll - Retrieves ready events, and delivers them to the caller supplied + * event buffer. + * + * @ep: Pointer to the eventpoll context. + * @events: Pointer to the userspace buffer where the ready events should be + * stored. + * @maxevents: Size (in terms of number of events) of the caller event buffer. + * @timeout: Maximum timeout for the ready events fetch operation, in + * milliseconds. If the @timeout is zero, the function will not block, + * while if the @timeout is less than zero, the function will block + * until at least one event has been retrieved (or an error + * occurred). + * + * Returns: Returns the number of ready events which have been fetched, or an + * error code, in case of error. + */ +static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, + int maxevents, long timeout) +{ + int res = 0, eavail, timed_out = 0; + unsigned long flags; + long slack = 0; + wait_queue_t wait; + ktime_t expires, *to = NULL; + + if (timeout > 0) { + struct timespec end_time = ep_set_mstimeout(timeout); + + slack = select_estimate_accuracy(&end_time); + to = &expires; + *to = timespec_to_ktime(end_time); + } else if (timeout == 0) { + /* + * Avoid the unnecessary trip to the wait queue loop, if the + * caller specified a non blocking operation. + */ + timed_out = 1; + spin_lock_irqsave(&ep->lock, flags); + goto check_events; + } + +fetch_events: + spin_lock_irqsave(&ep->lock, flags); + + if (!ep_events_available(ep)) { + /* + * We don't have any available event to return to the caller. + * We need to sleep here, and we will be wake up by + * ep_poll_callback() when events will become available. + */ + init_waitqueue_entry(&wait, current); + __add_wait_queue_exclusive(&ep->wq, &wait); + + for (;;) { + /* + * We don't want to sleep if the ep_poll_callback() sends us + * a wakeup in between. That's why we set the task state + * to TASK_INTERRUPTIBLE before doing the checks. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (ep_events_available(ep) || timed_out) + break; + if (signal_pending(current)) { + res = -EINTR; + break; + } + + spin_unlock_irqrestore(&ep->lock, flags); + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + timed_out = 1; + + spin_lock_irqsave(&ep->lock, flags); + } + __remove_wait_queue(&ep->wq, &wait); + + set_current_state(TASK_RUNNING); + } +check_events: + /* Is it worth to try to dig for events ? */ + eavail = ep_events_available(ep); + + spin_unlock_irqrestore(&ep->lock, flags); + + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of + * more luck. 
+ */ + if (!res && eavail && + !(res = ep_send_events(ep, events, maxevents)) && !timed_out) + goto fetch_events; + + return res; +} + +/** + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() + * API, to verify that adding an epoll file inside another + * epoll structure, does not violate the constraints, in + * terms of closed loops, or too deep chains (which can + * result in excessive stack usage). + * + * @priv: Pointer to the epoll file to be currently checked. + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll + * data structure pointer. + * @call_nests: Current dept of the @ep_call_nested() call stack. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; + struct rb_node *rbp; + struct epitem *epi; + + mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); + if (error != 0) + break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). + */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); + } + } + mutex_unlock(&ep->mtx); + + return error; +} + +/** + * ep_loop_check - Performs a check to verify that adding an epoll file (@file) + * another epoll file (represented by @ep) does not create + * closed loops or too deep chains. + * + * @ep: Pointer to the epoll private data structure. + * @file: Pointer to the epoll file to be checked. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check(struct eventpoll *ep, struct file *file) +{ + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); +} + +/* + * Open an eventpoll file descriptor. + */ +SYSCALL_DEFINE1(epoll_create1, int, flags) +{ + int error, fd; + struct eventpoll *ep = NULL; + struct file *file; + + /* Check the EPOLL_* constant for consistency. 
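+	 *
+	 * For context, a minimal sketch of how this object is driven from
+	 * userspace (illustrative only; "sock", the "events" array and all
+	 * error handling are assumed):
+	 *
+	 *	int epfd = epoll_create1(EPOLL_CLOEXEC);
+	 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = sock };
+	 *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev);
+	 *	int n = epoll_wait(epfd, events, 64, -1);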
*/ + BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); + + if (flags & ~EPOLL_CLOEXEC) + return -EINVAL; + /* + * Create the internal data structure ("struct eventpoll"). + */ + error = ep_alloc(&ep); + if (error < 0) + return error; + /* + * Creates all the items needed to setup an eventpoll file. That is, + * a file structure and a free file descriptor. + */ + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, + O_RDWR | (flags & O_CLOEXEC)); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + fd_install(fd, file); + ep->file = file; + return fd; + +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); + return error; +} + +SYSCALL_DEFINE1(epoll_create, int, size) +{ + if (size <= 0) + return -EINVAL; + + return sys_epoll_create1(0); +} + +/* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + int error; + int did_lock_epmutex = 0; + struct file *file, *tfile; + struct eventpoll *ep; + struct epitem *epi; + struct epoll_event epds; + + error = -EFAULT; + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + goto error_return; + + /* Get the "struct file *" for the eventpoll file */ + error = -EBADF; + file = fget(epfd); + if (!file) + goto error_return; + + /* Get the "struct file *" for the target file */ + tfile = fget(fd); + if (!tfile) + goto error_fput; + + /* The target file descriptor must support poll */ + error = -EPERM; + if (!tfile->f_op || !tfile->f_op->poll) + goto error_tgt_fput; + + /* + * We have to check that the file structure underneath the file descriptor + * the user passed to us _is_ an eventpoll file. And also we do not permit + * adding an epoll file descriptor inside itself. + */ + error = -EINVAL; + if (file == tfile || !is_file_epoll(file)) + goto error_tgt_fput; + + /* + * At this point it is safe to assume that the "private_data" contains + * our own data structure. + */ + ep = file->private_data; + + /* + * When we insert an epoll file descriptor, inside another epoll file + * descriptor, there is the change of creating closed loops, which are + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. + * + * We need to hold the epmutex across both ep_insert and ep_remove + * b/c we want to make sure we are looking at a coherent view of + * epoll network. + */ + if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { + mutex_lock(&epmutex); + did_lock_epmutex = 1; + } + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } + + mutex_lock_nested(&ep->mtx, 0); + + /* + * Try to lookup the file inside our RB tree, Since we grabbed "mtx" + * above, we can be sure to be able to use the item looked up by + * ep_find() till we release the mutex. 
+ */
+	epi = ep_find(ep, tfile, fd);
+
+	error = -EINVAL;
+	switch (op) {
+	case EPOLL_CTL_ADD:
+		if (!epi) {
+			epds.events |= POLLERR | POLLHUP;
+			error = ep_insert(ep, &epds, tfile, fd);
+		} else
+			error = -EEXIST;
+		clear_tfile_check_list();
+		break;
+	case EPOLL_CTL_DEL:
+		if (epi)
+			error = ep_remove(ep, epi);
+		else
+			error = -ENOENT;
+		break;
+	case EPOLL_CTL_MOD:
+		if (epi) {
+			epds.events |= POLLERR | POLLHUP;
+			error = ep_modify(ep, epi, &epds);
+		} else
+			error = -ENOENT;
+		break;
+	}
+	mutex_unlock(&ep->mtx);
+
+error_tgt_fput:
+	if (did_lock_epmutex)
+		mutex_unlock(&epmutex);
+
+	fput(tfile);
+error_fput:
+	fput(file);
+error_return:
+
+	return error;
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_wait(2).
+ */
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
+{
+	int error;
+	struct file *file;
+	struct eventpoll *ep;
+
+	/* The maximum number of events must be greater than zero */
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+		return -EINVAL;
+
+	/* Verify that the area passed by the user is writeable */
+	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
+		error = -EFAULT;
+		goto error_return;
+	}
+
+	/* Get the "struct file *" for the eventpoll file */
+	error = -EBADF;
+	file = fget(epfd);
+	if (!file)
+		goto error_return;
+
+	/*
+	 * We have to check that the file structure underneath the fd
+	 * the user passed to us _is_ an eventpoll file.
+	 */
+	error = -EINVAL;
+	if (!is_file_epoll(file))
+		goto error_fput;
+
+	/*
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
+	 */
+	ep = file->private_data;
+
+	/* Time to fish for events ... */
+	error = ep_poll(ep, events, maxevents, timeout);
+
+error_fput:
+	fput(file);
+error_return:
+
+	return error;
+}
+
+#ifdef HAVE_SET_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
+{
+	int error;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (error == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+			       sizeof(sigsaved));
+			set_restore_sigmask();
+		} else
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	}
+
+	return error;
+}
+
+#endif /* HAVE_SET_RESTORE_SIGMASK */
+
+static int __init eventpoll_init(void)
+{
+	struct sysinfo si;
+
+	si_meminfo(&si);
+	/*
+	 * Allows top 4% of lowmem to be allocated for epoll watches (per user).
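+	 *
+	 * As a rough worked example (the exact figure depends on
+	 * EP_ITEM_COST, roughly the size of an epitem plus its wait queue
+	 * entry): with 1 GiB of lowmem, 1 GiB / 25 is about 43 MB, and at
+	 * ~200 bytes per watch that allows on the order of 200,000 watches
+	 * per user.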
+ */ + max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / + EP_ITEM_COST; + BUG_ON(max_user_watches < 0); + + /* + * Initialize the structure used to perform epoll file descriptor + * inclusion loops checks. + */ + ep_nested_calls_init(&poll_loop_ncalls); + + /* Initialize the structure used to perform safe poll wait head wake ups */ + ep_nested_calls_init(&poll_safewake_ncalls); + + /* Initialize the structure used to perform file's f_op->poll() calls */ + ep_nested_calls_init(&poll_readywalk_ncalls); + + /* Allocates slab cache used to allocate "struct epitem" items */ + epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + /* Allocates slab cache used to allocate "struct eppoll_entry" */ + pwq_cache = kmem_cache_create("eventpoll_pwq", + sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + + return 0; +} +fs_initcall(eventpoll_init); diff --git a/fs/exec.c b/fs/exec.c new file mode 100644 index 00000000..044c13ff --- /dev/null +++ b/fs/exec.c @@ -0,0 +1,2247 @@ +/* + * linux/fs/exec.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * #!-checking implemented by tytso. + */ +/* + * Demand-loading implemented 01.12.91 - no need to read anything but + * the header into memory. The inode of the executable is put into + * "current->executable", and page faults do the actual loading. Clean. + * + * Once more I can proudly say that linux stood up to being changed: it + * was less than 2 hours work to get demand-loading completely implemented. + * + * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, + * current->executable is only used by the procfs. This allows a dispatch + * table to check for several different types of binary formats. We keep + * trying until we recognize the file or we run out of supported binary + * formats. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +int core_uses_pid; +char core_pattern[CORENAME_MAX_SIZE] = "core"; +unsigned int core_pipe_limit; +int suid_dumpable = 0; + +struct core_name { + char *corename; + int used, size; +}; +static atomic_t call_count = ATOMIC_INIT(1); + +/* The maximal length of core_pattern is also specified in sysctl.c */ + +static LIST_HEAD(formats); +static DEFINE_RWLOCK(binfmt_lock); + +int __register_binfmt(struct linux_binfmt * fmt, int insert) +{ + if (!fmt) + return -EINVAL; + write_lock(&binfmt_lock); + insert ? list_add(&fmt->lh, &formats) : + list_add_tail(&fmt->lh, &formats); + write_unlock(&binfmt_lock); + return 0; +} + +EXPORT_SYMBOL(__register_binfmt); + +void unregister_binfmt(struct linux_binfmt * fmt) +{ + write_lock(&binfmt_lock); + list_del(&fmt->lh); + write_unlock(&binfmt_lock); +} + +EXPORT_SYMBOL(unregister_binfmt); + +static inline void put_binfmt(struct linux_binfmt * fmt) +{ + module_put(fmt->module); +} + +/* + * Note that a shared library must be both readable and executable due to + * security reasons. + * + * Also note that we take the address to load from from the file itself. 
+ */ +SYSCALL_DEFINE1(uselib, const char __user *, library) +{ + struct file *file; + char *tmp = getname(library); + int error = PTR_ERR(tmp); + static const struct open_flags uselib_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; + + if (IS_ERR(tmp)) + goto out; + + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); + putname(tmp); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + + error = -EINVAL; + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) + goto exit; + + error = -EACCES; + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + fsnotify_open(file); + + error = -ENOEXEC; + if(file->f_op) { + struct linux_binfmt * fmt; + + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!fmt->load_shlib) + continue; + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + error = fmt->load_shlib(file); + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (error != -ENOEXEC) + break; + } + read_unlock(&binfmt_lock); + } +exit: + fput(file); +out: + return error; +} + +#ifdef CONFIG_MMU +/* + * The nascent bprm->mm is not visible until exec_mmap() but it can + * use a lot of memory, account these pages in current->mm temporary + * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we + * change the counter back via acct_arg_size(0). + */ +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + +#ifdef SPLIT_RSS_COUNTING + add_mm_counter(mm, MM_ANONPAGES, diff); +#else + spin_lock(&mm->page_table_lock); + add_mm_counter(mm, MM_ANONPAGES, diff); + spin_unlock(&mm->page_table_lock); +#endif +} + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + int ret; + +#ifdef CONFIG_STACK_GROWSUP + if (write) { + ret = expand_downwards(bprm->vma, pos); + if (ret < 0) + return NULL; + } +#endif + ret = get_user_pages(current, bprm->mm, pos, + 1, write, 1, &page, NULL); + if (ret <= 0) + return NULL; + + if (write) { + unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; + struct rlimit *rlim; + + acct_arg_size(bprm, size / PAGE_SIZE); + + /* + * We've historically supported up to 32 pages (ARG_MAX) + * of argument strings even with small stacks + */ + if (size <= ARG_MAX) + return page; + + /* + * Limit to 1/4-th the stack size for the argv+env strings. + * This ensures that: + * - the remaining binfmt code will not run out of stack space, + * - the program will have a reasonable amount of stack left + * to work from. 
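+		 *
+		 * As a concrete example: with a typical RLIMIT_STACK soft
+		 * limit of 8 MiB, the argv+env strings are capped at
+		 * roughly 2 MiB by the check below.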
+ */ + rlim = current->signal->rlim; + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { + put_page(page); + return NULL; + } + } + + return page; +} + +static void put_arg_page(struct page *page) +{ + put_page(page); +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ + flush_cache_page(bprm->vma, pos, page_to_pfn(page)); +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct vm_area_struct *vma = NULL; + struct mm_struct *mm = bprm->mm; + + bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + return -ENOMEM; + + down_write(&mm->mmap_sem); + vma->vm_mm = mm; + + /* + * Place the stack at the largest stack address the architecture + * supports. Later, we'll move this to an appropriate place. We don't + * use STACK_TOP because that can depend on attributes which aren't + * configured yet. + */ + BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + vma->vm_end = STACK_TOP_MAX; + vma->vm_start = vma->vm_end - PAGE_SIZE; + vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + INIT_LIST_HEAD(&vma->anon_vma_chain); + + err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); + if (err) + goto err; + + err = insert_vm_struct(mm, vma); + if (err) + goto err; + + mm->stack_vm = mm->total_vm = 1; + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end - sizeof(void *); + return 0; +err: + up_write(&mm->mmap_sem); + bprm->vma = NULL; + kmem_cache_free(vm_area_cachep, vma); + return err; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= MAX_ARG_STRLEN; +} + +#else + +static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + + page = bprm->page[pos / PAGE_SIZE]; + if (!page && write) { + page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); + if (!page) + return NULL; + bprm->page[pos / PAGE_SIZE] = page; + } + + return page; +} + +static void put_arg_page(struct page *page) +{ +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ + if (bprm->page[i]) { + __free_page(bprm->page[i]); + bprm->page[i] = NULL; + } +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ + int i; + + for (i = 0; i < MAX_ARG_PAGES; i++) + free_arg_page(bprm, i); +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); + return 0; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= bprm->p; +} + +#endif /* CONFIG_MMU */ + +/* + * Create a new mm_struct and populate it with a temporary stack + * vm_area_struct. We don't have enough context at this point to set the stack + * flags, permissions, and offset, so we use temporary values. We'll update + * them later in setup_arg_pages(). 
+ */ +int bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct mm_struct *mm = NULL; + + bprm->mm = mm = mm_alloc(); + err = -ENOMEM; + if (!mm) + goto err; + + err = init_new_context(current, mm); + if (err) + goto err; + + err = __bprm_mm_init(bprm); + if (err) + goto err; + + return 0; + +err: + if (mm) { + bprm->mm = NULL; + mmdrop(mm); + } + + return err; +} + +struct user_arg_ptr { +#ifdef CONFIG_COMPAT + bool is_compat; +#endif + union { + const char __user *const __user *native; +#ifdef CONFIG_COMPAT + compat_uptr_t __user *compat; +#endif + } ptr; +}; + +static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) +{ + const char __user *native; + +#ifdef CONFIG_COMPAT + if (unlikely(argv.is_compat)) { + compat_uptr_t compat; + + if (get_user(compat, argv.ptr.compat + nr)) + return ERR_PTR(-EFAULT); + + return compat_ptr(compat); + } +#endif + + if (get_user(native, argv.ptr.native + nr)) + return ERR_PTR(-EFAULT); + + return native; +} + +/* + * count() counts the number of strings in array ARGV. + */ +static int count(struct user_arg_ptr argv, int max) +{ + int i = 0; + + if (argv.ptr.native != NULL) { + for (;;) { + const char __user *p = get_user_arg_ptr(argv, i); + + if (!p) + break; + + if (IS_ERR(p)) + return -EFAULT; + + if (i++ >= max) + return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); + } + } + return i; +} + +/* + * 'copy_strings()' copies argument/environment strings from the old + * processes's memory to the new process's stack. The call to get_user_pages() + * ensures the destination page is created and not swapped out. + */ +static int copy_strings(int argc, struct user_arg_ptr argv, + struct linux_binprm *bprm) +{ + struct page *kmapped_page = NULL; + char *kaddr = NULL; + unsigned long kpos = 0; + int ret; + + while (argc-- > 0) { + const char __user *str; + int len; + unsigned long pos; + + ret = -EFAULT; + str = get_user_arg_ptr(argv, argc); + if (IS_ERR(str)) + goto out; + + len = strnlen_user(str, MAX_ARG_STRLEN); + if (!len) + goto out; + + ret = -E2BIG; + if (!valid_arg_len(bprm, len)) + goto out; + + /* We're going to work our way backwords. */ + pos = bprm->p; + str += len; + bprm->p -= len; + + while (len > 0) { + int offset, bytes_to_copy; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + + offset = pos % PAGE_SIZE; + if (offset == 0) + offset = PAGE_SIZE; + + bytes_to_copy = offset; + if (bytes_to_copy > len) + bytes_to_copy = len; + + offset -= bytes_to_copy; + pos -= bytes_to_copy; + str -= bytes_to_copy; + len -= bytes_to_copy; + + if (!kmapped_page || kpos != (pos & PAGE_MASK)) { + struct page *page; + + page = get_arg_page(bprm, pos, 1); + if (!page) { + ret = -E2BIG; + goto out; + } + + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); + kunmap(kmapped_page); + put_arg_page(kmapped_page); + } + kmapped_page = page; + kaddr = kmap(kmapped_page); + kpos = pos & PAGE_MASK; + flush_arg_page(bprm, kpos, kmapped_page); + } + if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { + ret = -EFAULT; + goto out; + } + } + } + ret = 0; +out: + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); + kunmap(kmapped_page); + put_arg_page(kmapped_page); + } + return ret; +} + +/* + * Like copy_strings, but get argv and its values from kernel memory. 
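+ *
+ * For example, do_execve_common() below uses it to copy the executable's
+ * filename onto the new stack:
+ *
+ *	copy_strings_kernel(1, &bprm->filename, bprm);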
+ */ +int copy_strings_kernel(int argc, const char *const *__argv, + struct linux_binprm *bprm) +{ + int r; + mm_segment_t oldfs = get_fs(); + struct user_arg_ptr argv = { + .ptr.native = (const char __user *const __user *)__argv, + }; + + set_fs(KERNEL_DS); + r = copy_strings(argc, argv, bprm); + set_fs(oldfs); + + return r; +} +EXPORT_SYMBOL(copy_strings_kernel); + +#ifdef CONFIG_MMU + +/* + * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once + * the binfmt code determines where the new stack should reside, we shift it to + * its final location. The process proceeds as follows: + * + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. + * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. + */ +static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + struct mmu_gather tlb; + + BUG_ON(new_start > new_end); + + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != find_vma(mm, new_start)) + return -EFAULT; + + /* + * cover the whole range: [new_start, old_end) + */ + if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + return -ENOMEM; + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + if (length != move_page_tables(vma, old_start, + vma, new_start, length)) + return -ENOMEM; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, 0); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : 0); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : 0); + } + tlb_finish_mmu(&tlb, new_end, old_end); + + /* + * Shrink the vma to just the new range. Always succeeds. + */ + vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); + + return 0; +} + +/* + * Finalizes the stack vm_area_struct. The flags and permissions are updated, + * the stack is optionally relocated, and some extra space is added. + */ +int setup_arg_pages(struct linux_binprm *bprm, + unsigned long stack_top, + int executable_stack) +{ + unsigned long ret; + unsigned long stack_shift; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = bprm->vma; + struct vm_area_struct *prev = NULL; + unsigned long vm_flags; + unsigned long stack_base; + unsigned long stack_size; + unsigned long stack_expand; + unsigned long rlim_stack; + +#ifdef CONFIG_STACK_GROWSUP + /* Limit stack size to 1GB */ + stack_base = rlimit_max(RLIMIT_STACK); + if (stack_base > (1 << 30)) + stack_base = 1 << 30; + + /* Make sure we didn't let the argument array grow too large. 
*/ + if (vma->vm_end - vma->vm_start > stack_base) + return -ENOMEM; + + stack_base = PAGE_ALIGN(stack_top - stack_base); + + stack_shift = vma->vm_start - stack_base; + mm->arg_start = bprm->p - stack_shift; + bprm->p = vma->vm_end - stack_shift; +#else + stack_top = arch_align_stack(stack_top); + stack_top = PAGE_ALIGN(stack_top); + + if (unlikely(stack_top < mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) + return -ENOMEM; + + stack_shift = vma->vm_end - stack_top; + + bprm->p -= stack_shift; + mm->arg_start = bprm->p; +#endif + + if (bprm->loader) + bprm->loader -= stack_shift; + bprm->exec -= stack_shift; + + down_write(&mm->mmap_sem); + vm_flags = VM_STACK_FLAGS; + + /* + * Adjust stack execute permissions; explicitly enable for + * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone + * (arch default) otherwise. + */ + if (unlikely(executable_stack == EXSTACK_ENABLE_X)) + vm_flags |= VM_EXEC; + else if (executable_stack == EXSTACK_DISABLE_X) + vm_flags &= ~VM_EXEC; + vm_flags |= mm->def_flags; + vm_flags |= VM_STACK_INCOMPLETE_SETUP; + + ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + vm_flags); + if (ret) + goto out_unlock; + BUG_ON(prev != vma); + + /* Move stack pages down in memory. */ + if (stack_shift) { + ret = shift_arg_pages(vma, stack_shift); + if (ret) + goto out_unlock; + } + + /* mprotect_fixup is overkill to remove the temporary stack flags */ + vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ + stack_size = vma->vm_end - vma->vm_start; + /* + * Align this down to a page boundary as expand_stack + * will align it up. + */ + rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; +#ifdef CONFIG_STACK_GROWSUP + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_start + rlim_stack; + else + stack_base = vma->vm_end + stack_expand; +#else + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_end - rlim_stack; + else + stack_base = vma->vm_start - stack_expand; +#endif + current->mm->start_stack = bprm->p; + ret = expand_stack(vma, stack_base); + if (ret) + ret = -EFAULT; + +out_unlock: + up_write(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(setup_arg_pages); + +#endif /* CONFIG_MMU */ + +struct file *open_exec(const char *name) +{ + struct file *file; + int err; + static const struct open_flags open_exec_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; + + file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); + if (IS_ERR(file)) + goto out; + + err = -EACCES; + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) + goto exit; + + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + fsnotify_open(file); + + err = deny_write_access(file); + if (err) + goto exit; + +out: + return file; + +exit: + fput(file); + return ERR_PTR(err); +} +EXPORT_SYMBOL(open_exec); + +int kernel_read(struct file *file, loff_t offset, + char *addr, unsigned long count) +{ + mm_segment_t old_fs; + loff_t pos = offset; + int result; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + result = vfs_read(file, (void __user *)addr, count, &pos); + set_fs(old_fs); + return result; +} + +EXPORT_SYMBOL(kernel_read); + +static int exec_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk; + struct mm_struct * old_mm, *active_mm; + + /* Notify parent that we're no longer interested in the old VM 
*/ + tsk = current; + old_mm = current->mm; + sync_mm_rss(tsk, old_mm); + mm_release(tsk, old_mm); + + if (old_mm) { + /* + * Make sure that if there is a core dump in progress + * for the old mm, we get out and die instead of going + * through with the exec. We must hold mmap_sem around + * checking core_state and changing tsk->mm. + */ + down_read(&old_mm->mmap_sem); + if (unlikely(old_mm->core_state)) { + up_read(&old_mm->mmap_sem); + return -EINTR; + } + } + task_lock(tsk); + active_mm = tsk->active_mm; + tsk->mm = mm; + tsk->active_mm = mm; + activate_mm(active_mm, mm); + if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&old_mm->oom_disable_count); + atomic_inc(&tsk->mm->oom_disable_count); + } + task_unlock(tsk); + arch_pick_mmap_layout(mm); + if (old_mm) { + up_read(&old_mm->mmap_sem); + BUG_ON(active_mm != old_mm); + mm_update_next_owner(old_mm); + mmput(old_mm); + return 0; + } + mmdrop(active_mm); + return 0; +} + +/* + * This function makes sure the current process has its own signal table, + * so that flush_signal_handlers can later reset the handlers without + * disturbing other processes. (Other processes might share the signal + * table via the CLONE_SIGHAND option to clone().) + */ +static int de_thread(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct sighand_struct *oldsighand = tsk->sighand; + spinlock_t *lock = &oldsighand->siglock; + + if (thread_group_empty(tsk)) + goto no_thread_group; + + /* + * Kill all other threads in the thread group. + */ + spin_lock_irq(lock); + if (signal_group_exit(sig)) { + /* + * Another group action in progress, just + * return so that the signal is processed. + */ + spin_unlock_irq(lock); + return -EAGAIN; + } + + sig->group_exit_task = tsk; + sig->notify_count = zap_other_threads(tsk); + if (!thread_group_leader(tsk)) + sig->notify_count--; + + while (sig->notify_count) { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(lock); + schedule(); + spin_lock_irq(lock); + } + spin_unlock_irq(lock); + + /* + * At this point all other threads have exited, all we have to + * do is to wait for the thread group leader to become inactive, + * and to assume its PID: + */ + if (!thread_group_leader(tsk)) { + struct task_struct *leader = tsk->group_leader; + + sig->notify_count = -1; /* for exit_notify() */ + for (;;) { + write_lock_irq(&tasklist_lock); + if (likely(leader->exit_state)) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + write_unlock_irq(&tasklist_lock); + schedule(); + } + + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). + */ + tsk->start_time = leader->start_time; + + BUG_ON(!same_thread_group(leader, tsk)); + BUG_ON(has_group_leader_pid(tsk)); + /* + * An exec() starts a new thread group with the + * TGID of the previous thread group. Rehash the + * two threads with a switched PID, and release + * the former thread group leader: + */ + + /* Become a process group leader with the old leader's pid. + * The old leader becomes a thread of the this thread group. + * Note: The old leader also uses this pid until release_task + * is called. 
Odd but simple and correct. + */ + detach_pid(tsk, PIDTYPE_PID); + tsk->pid = leader->pid; + attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); + transfer_pid(leader, tsk, PIDTYPE_PGID); + transfer_pid(leader, tsk, PIDTYPE_SID); + + list_replace_rcu(&leader->tasks, &tsk->tasks); + list_replace_init(&leader->sibling, &tsk->sibling); + + tsk->group_leader = tsk; + leader->group_leader = tsk; + + tsk->exit_signal = SIGCHLD; + + BUG_ON(leader->exit_state != EXIT_ZOMBIE); + leader->exit_state = EXIT_DEAD; + write_unlock_irq(&tasklist_lock); + + release_task(leader); + } + + sig->group_exit_task = NULL; + sig->notify_count = 0; + +no_thread_group: + if (current->mm) + setmax_mm_hiwater_rss(&sig->maxrss, current->mm); + + exit_itimers(sig); + flush_itimer_signals(); + + if (atomic_read(&oldsighand->count) != 1) { + struct sighand_struct *newsighand; + /* + * This ->sighand is shared with the CLONE_SIGHAND + * but not CLONE_THREAD task, switch to the new one. + */ + newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); + if (!newsighand) + return -ENOMEM; + + atomic_set(&newsighand->count, 1); + memcpy(newsighand->action, oldsighand->action, + sizeof(newsighand->action)); + + write_lock_irq(&tasklist_lock); + spin_lock(&oldsighand->siglock); + rcu_assign_pointer(tsk->sighand, newsighand); + spin_unlock(&oldsighand->siglock); + write_unlock_irq(&tasklist_lock); + + __cleanup_sighand(oldsighand); + } + + BUG_ON(!thread_group_leader(tsk)); + return 0; +} + +/* + * These functions flushes out all traces of the currently running executable + * so that a new one can be started + */ +static void flush_old_files(struct files_struct * files) +{ + long j = -1; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + for (;;) { + unsigned long set, i; + + j++; + i = j * __NFDBITS; + fdt = files_fdtable(files); + if (i >= fdt->max_fds) + break; + set = fdt->close_on_exec->fds_bits[j]; + if (!set) + continue; + fdt->close_on_exec->fds_bits[j] = 0; + spin_unlock(&files->file_lock); + for ( ; set ; i++,set >>= 1) { + if (set & 1) { + sys_close(i); + } + } + spin_lock(&files->file_lock); + + } + spin_unlock(&files->file_lock); +} + +char *get_task_comm(char *buf, struct task_struct *tsk) +{ + /* buf must be at least sizeof(tsk->comm) in size */ + task_lock(tsk); + strncpy(buf, tsk->comm, sizeof(tsk->comm)); + task_unlock(tsk); + return buf; +} +EXPORT_SYMBOL_GPL(get_task_comm); + +void set_task_comm(struct task_struct *tsk, char *buf) +{ + task_lock(tsk); + + /* + * Threads may access current->comm without holding + * the task lock, so write the string carefully. + * Readers without a lock may see incomplete new + * names but are safe from non-terminating string reads. + */ + memset(tsk->comm, 0, TASK_COMM_LEN); + wmb(); + strlcpy(tsk->comm, buf, sizeof(tsk->comm)); + task_unlock(tsk); + perf_event_comm(tsk); +} + +int flush_old_exec(struct linux_binprm * bprm) +{ + int retval; + + /* + * Make sure we have a private signal table and that + * we are unassociated from the previous thread group. 
+ */
+	retval = de_thread(current);
+	if (retval)
+		goto out;
+
+	set_mm_exe_file(bprm->mm, bprm->file);
+
+	/*
+	 * Release all of the old mmap stuff
+	 */
+	acct_arg_size(bprm, 0);
+	retval = exec_mmap(bprm->mm);
+	if (retval)
+		goto out;
+
+	bprm->mm = NULL;		/* We're using it now */
+
+	set_fs(USER_DS);
+	current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
+	flush_thread();
+	current->personality &= ~bprm->per_clear;
+
+	return 0;
+
+out:
+	return retval;
+}
+EXPORT_SYMBOL(flush_old_exec);
+
+void setup_new_exec(struct linux_binprm * bprm)
+{
+	int i, ch;
+	const char *name;
+	char tcomm[sizeof(current->comm)];
+
+	arch_pick_mmap_layout(current->mm);
+
+	/* This is the point of no return */
+	current->sas_ss_sp = current->sas_ss_size = 0;
+
+	if (current_euid() == current_uid() && current_egid() == current_gid())
+		set_dumpable(current->mm, 1);
+	else
+		set_dumpable(current->mm, suid_dumpable);
+
+	name = bprm->filename;
+
+	/* Copies the binary name from after last slash */
+	for (i=0; (ch = *(name++)) != '\0';) {
+		if (ch == '/')
+			i = 0; /* overwrite what we wrote */
+		else
+			if (i < (sizeof(tcomm) - 1))
+				tcomm[i++] = ch;
+	}
+	tcomm[i] = '\0';
+	set_task_comm(current, tcomm);
+
+	/* Set the new mm task size. We have to do that late because it may
+	 * depend on TIF_32BIT which is only updated in flush_thread() on
+	 * some architectures like powerpc
+	 */
+	current->mm->task_size = TASK_SIZE;
+
+	/* install the new credentials */
+	if (bprm->cred->uid != current_euid() ||
+	    bprm->cred->gid != current_egid()) {
+		current->pdeath_signal = 0;
+	} else if (file_permission(bprm->file, MAY_READ) ||
+		   bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
+		set_dumpable(current->mm, suid_dumpable);
+	}
+
+	/*
+	 * Flush performance counters when crossing a
+	 * security domain:
+	 */
+	if (!get_dumpable(current->mm))
+		perf_event_exit_task(current);
+
+	/* An exec changes our domain. We are no longer part of the thread
+	   group */
+
+	current->self_exec_id++;
+
+	flush_signal_handlers(current, 0);
+	flush_old_files(current->files);
+}
+EXPORT_SYMBOL(setup_new_exec);
+
+/*
+ * Prepare credentials and lock ->cred_guard_mutex.
+ * install_exec_creds() commits the new creds and drops the lock.
+ * Or, if exec fails before, free_bprm() should release ->cred and
+ * unlock.
+ */
+int prepare_bprm_creds(struct linux_binprm *bprm)
+{
+	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
+		return -ERESTARTNOINTR;
+
+	bprm->cred = prepare_exec_creds();
+	if (likely(bprm->cred))
+		return 0;
+
+	mutex_unlock(&current->signal->cred_guard_mutex);
+	return -ENOMEM;
+}
+
+void free_bprm(struct linux_binprm *bprm)
+{
+	free_arg_pages(bprm);
+	if (bprm->cred) {
+		mutex_unlock(&current->signal->cred_guard_mutex);
+		abort_creds(bprm->cred);
+	}
+	kfree(bprm);
+}
+
+/*
+ * install the new credentials for this executable
+ */
+void install_exec_creds(struct linux_binprm *bprm)
+{
+	security_bprm_committing_creds(bprm);
+
+	commit_creds(bprm->cred);
+	bprm->cred = NULL;
+	/*
+	 * cred_guard_mutex must be held at least to this point to prevent
+	 * ptrace_attach() from altering our determination of the task's
+	 * credentials; any time after this it may be unlocked.
+ */
+	security_bprm_committed_creds(bprm);
+	mutex_unlock(&current->signal->cred_guard_mutex);
+}
+EXPORT_SYMBOL(install_exec_creds);
+
+/*
+ * determine how safe it is to execute the proposed program
+ * - the caller must hold ->cred_guard_mutex to protect against
+ *   PTRACE_ATTACH
+ */
+int check_unsafe_exec(struct linux_binprm *bprm)
+{
+	struct task_struct *p = current, *t;
+	unsigned n_fs;
+	int res = 0;
+
+	bprm->unsafe = tracehook_unsafe_exec(p);
+
+	n_fs = 1;
+	spin_lock(&p->fs->lock);
+	rcu_read_lock();
+	for (t = next_thread(p); t != p; t = next_thread(t)) {
+		if (t->fs == p->fs)
+			n_fs++;
+	}
+	rcu_read_unlock();
+
+	if (p->fs->users > n_fs) {
+		bprm->unsafe |= LSM_UNSAFE_SHARE;
+	} else {
+		res = -EAGAIN;
+		if (!p->fs->in_exec) {
+			p->fs->in_exec = 1;
+			res = 1;
+		}
+	}
+	spin_unlock(&p->fs->lock);
+
+	return res;
+}
+
+/*
+ * Fill the binprm structure from the inode.
+ * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ *
+ * This may be called multiple times for binary chains (scripts for example).
+ */
+int prepare_binprm(struct linux_binprm *bprm)
+{
+	umode_t mode;
+	struct inode * inode = bprm->file->f_path.dentry->d_inode;
+	int retval;
+
+	mode = inode->i_mode;
+	if (bprm->file->f_op == NULL)
+		return -EACCES;
+
+	/* clear any previous set[ug]id data from a previous binary */
+	bprm->cred->euid = current_euid();
+	bprm->cred->egid = current_egid();
+
+	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
+		/* Set-uid? */
+		if (mode & S_ISUID) {
+			bprm->per_clear |= PER_CLEAR_ON_SETID;
+			bprm->cred->euid = inode->i_uid;
+		}
+
+		/* Set-gid? */
+		/*
+		 * If setgid is set but no group execute bit then this
+		 * is a candidate for mandatory locking, not a setgid
+		 * executable.
+		 */
+		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+			bprm->per_clear |= PER_CLEAR_ON_SETID;
+			bprm->cred->egid = inode->i_gid;
+		}
+	}
+
+	/* fill in binprm security blob */
+	retval = security_bprm_set_creds(bprm);
+	if (retval)
+		return retval;
+	bprm->cred_prepared = 1;
+
+	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
+	return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
+}
+
+EXPORT_SYMBOL(prepare_binprm);
+
+/*
+ * Arguments are '\0' separated strings found at the location bprm->p
+ * points to; chop off the first by relocating bprm->p to right after
+ * the first '\0' encountered.
+ */ +int remove_arg_zero(struct linux_binprm *bprm) +{ + int ret = 0; + unsigned long offset; + char *kaddr; + struct page *page; + + if (!bprm->argc) + return 0; + + do { + offset = bprm->p & ~PAGE_MASK; + page = get_arg_page(bprm, bprm->p, 0); + if (!page) { + ret = -EFAULT; + goto out; + } + kaddr = kmap_atomic(page, KM_USER0); + + for (; offset < PAGE_SIZE && kaddr[offset]; + offset++, bprm->p++) + ; + + kunmap_atomic(kaddr, KM_USER0); + put_arg_page(page); + + if (offset == PAGE_SIZE) + free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1); + } while (offset == PAGE_SIZE); + + bprm->p++; + bprm->argc--; + ret = 0; + +out: + return ret; +} +EXPORT_SYMBOL(remove_arg_zero); + +/* + * cycle the list of binary formats handler, until one recognizes the image + */ +int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) +{ + unsigned int depth = bprm->recursion_depth; + int try,retval; + struct linux_binfmt *fmt; + + retval = security_bprm_check(bprm); + if (retval) + return retval; + + retval = audit_bprm(bprm); + if (retval) + return retval; + + retval = -ENOENT; + for (try=0; try<2; try++) { + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; + if (!fn) + continue; + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + retval = fn(bprm, regs); + /* + * Restore the depth counter to its starting value + * in this call, so we don't have to rely on every + * load_binary function to restore it on return. + */ + bprm->recursion_depth = depth; + if (retval >= 0) { + if (depth == 0) + tracehook_report_exec(fmt, bprm, regs); + put_binfmt(fmt); + allow_write_access(bprm->file); + if (bprm->file) + fput(bprm->file); + bprm->file = NULL; + current->did_exec = 1; + proc_exec_connector(current); + return retval; + } + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (retval != -ENOEXEC || bprm->mm == NULL) + break; + if (!bprm->file) { + read_unlock(&binfmt_lock); + return retval; + } + } + read_unlock(&binfmt_lock); + if (retval != -ENOEXEC || bprm->mm == NULL) { + break; +#ifdef CONFIG_MODULES + } else { +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) + if (printable(bprm->buf[0]) && + printable(bprm->buf[1]) && + printable(bprm->buf[2]) && + printable(bprm->buf[3])) + break; /* -ENOEXEC */ + if (try) + break; /* -ENOEXEC */ + request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); +#endif + } + } + return retval; +} + +EXPORT_SYMBOL(search_binary_handler); + +/* + * sys_execve() executes a new program. 
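+ *
+ * Reached from userspace via execve(2); a minimal sketch of a caller
+ * (illustrative only, the path and strings are placeholders):
+ *
+ *	char *argv[] = { "/bin/ls", "-l", NULL };
+ *	char *envp[] = { "HOME=/", NULL };
+ *	execve("/bin/ls", argv, envp);
+ *
+ * execve() returns to its caller only if it fails.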
+ */ +static int do_execve_common(const char *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + struct pt_regs *regs) +{ + struct linux_binprm *bprm; + struct file *file; + struct files_struct *displaced; + bool clear_in_exec; + int retval; + + retval = unshare_files(&displaced); + if (retval) + goto out_ret; + + retval = -ENOMEM; + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + goto out_files; + + retval = prepare_bprm_creds(bprm); + if (retval) + goto out_free; + + retval = check_unsafe_exec(bprm); + if (retval < 0) + goto out_free; + clear_in_exec = retval; + current->in_execve = 1; + + file = open_exec(filename); + retval = PTR_ERR(file); + if (IS_ERR(file)) + goto out_unmark; + + sched_exec(); + + bprm->file = file; + bprm->filename = filename; + bprm->interp = filename; + + retval = bprm_mm_init(bprm); + if (retval) + goto out_file; + + bprm->argc = count(argv, MAX_ARG_STRINGS); + if ((retval = bprm->argc) < 0) + goto out; + + bprm->envc = count(envp, MAX_ARG_STRINGS); + if ((retval = bprm->envc) < 0) + goto out; + + retval = prepare_binprm(bprm); + if (retval < 0) + goto out; + + retval = copy_strings_kernel(1, &bprm->filename, bprm); + if (retval < 0) + goto out; + + bprm->exec = bprm->p; + retval = copy_strings(bprm->envc, envp, bprm); + if (retval < 0) + goto out; + + retval = copy_strings(bprm->argc, argv, bprm); + if (retval < 0) + goto out; + + retval = search_binary_handler(bprm,regs); + if (retval < 0) + goto out; + + /* execve succeeded */ + current->fs->in_exec = 0; + current->in_execve = 0; + acct_update_integrals(current); + free_bprm(bprm); + if (displaced) + put_files_struct(displaced); + return retval; + +out: + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } + +out_file: + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + } + +out_unmark: + if (clear_in_exec) + current->fs->in_exec = 0; + current->in_execve = 0; + +out_free: + free_bprm(bprm); + +out_files: + if (displaced) + reset_files_struct(displaced); +out_ret: + return retval; +} + +int do_execve(const char *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + struct pt_regs *regs) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + return do_execve_common(filename, argv, envp, regs); +} + +#ifdef CONFIG_COMPAT +int compat_do_execve(char *filename, + compat_uptr_t __user *__argv, + compat_uptr_t __user *__envp, + struct pt_regs *regs) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execve_common(filename, argv, envp, regs); +} +#endif + +void set_binfmt(struct linux_binfmt *new) +{ + struct mm_struct *mm = current->mm; + + if (mm->binfmt) + module_put(mm->binfmt->module); + + mm->binfmt = new; + if (new) + __module_get(new->module); +} + +EXPORT_SYMBOL(set_binfmt); + +static int expand_corename(struct core_name *cn) +{ + char *old_corename = cn->corename; + + cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); + cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); + + if (!cn->corename) { + kfree(old_corename); + return -ENOMEM; + } + + return 0; +} + +static int cn_printf(struct core_name *cn, const char *fmt, ...) 
+{ + char *cur; + int need; + int ret; + va_list arg; + + va_start(arg, fmt); + need = vsnprintf(NULL, 0, fmt, arg); + va_end(arg); + + if (likely(need < cn->size - cn->used - 1)) + goto out_printf; + + ret = expand_corename(cn); + if (ret) + goto expand_fail; + +out_printf: + cur = cn->corename + cn->used; + va_start(arg, fmt); + vsnprintf(cur, need + 1, fmt, arg); + va_end(arg); + cn->used += need; + return 0; + +expand_fail: + return ret; +} + +static int cn_print_exe_file(struct core_name *cn) +{ + struct file *exe_file; + char *pathbuf, *path, *p; + int ret; + + exe_file = get_mm_exe_file(current->mm); + if (!exe_file) + return cn_printf(cn, "(unknown)"); + + pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); + if (!pathbuf) { + ret = -ENOMEM; + goto put_exe_file; + } + + path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + goto free_buf; + } + + for (p = path; *p; p++) + if (*p == '/') + *p = '!'; + + ret = cn_printf(cn, "%s", path); + +free_buf: + kfree(pathbuf); +put_exe_file: + fput(exe_file); + return ret; +} + +/* format_corename will inspect the pattern parameter, and output a + * name into corename, which must have space for at least + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. + */ +static int format_corename(struct core_name *cn, long signr) +{ + const struct cred *cred = current_cred(); + const char *pat_ptr = core_pattern; + int ispipe = (*pat_ptr == '|'); + int pid_in_pattern = 0; + int err = 0; + + cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); + cn->corename = kmalloc(cn->size, GFP_KERNEL); + cn->used = 0; + + if (!cn->corename) + return -ENOMEM; + + /* Repeat as long as we have more pattern to process and more output + space */ + while (*pat_ptr) { + if (*pat_ptr != '%') { + if (*pat_ptr == 0) + goto out; + err = cn_printf(cn, "%c", *pat_ptr++); + } else { + switch (*++pat_ptr) { + /* single % at the end, drop that */ + case 0: + goto out; + /* Double percent, output one percent */ + case '%': + err = cn_printf(cn, "%c", '%'); + break; + /* pid */ + case 'p': + pid_in_pattern = 1; + err = cn_printf(cn, "%d", + task_tgid_vnr(current)); + break; + /* uid */ + case 'u': + err = cn_printf(cn, "%d", cred->uid); + break; + /* gid */ + case 'g': + err = cn_printf(cn, "%d", cred->gid); + break; + /* signal that caused the coredump */ + case 's': + err = cn_printf(cn, "%ld", signr); + break; + /* UNIX time of coredump */ + case 't': { + struct timeval tv; + do_gettimeofday(&tv); + err = cn_printf(cn, "%lu", tv.tv_sec); + break; + } + /* hostname */ + case 'h': + down_read(&uts_sem); + err = cn_printf(cn, "%s", + utsname()->nodename); + up_read(&uts_sem); + break; + /* executable */ + case 'e': + err = cn_printf(cn, "%s", current->comm); + break; + case 'E': + err = cn_print_exe_file(cn); + break; + /* core limit size */ + case 'c': + err = cn_printf(cn, "%lu", + rlimit(RLIMIT_CORE)); + break; + default: + break; + } + ++pat_ptr; + } + + if (err) + return err; + } + + /* Backward compatibility with core_uses_pid: + * + * If core_pattern does not include a %p (as is the default) + * and core_uses_pid is set, then .%pid will be appended to + * the filename. Do not do this for piped commands. 
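+	 *
+	 * For example, a pattern such as "core.%e.%p.%t" (typically written
+	 * to /proc/sys/kernel/core_pattern) is expanded by the code above to
+	 * something like "core.myprog.1234.1400000000", while a leading '|'
+	 * as in "|/usr/bin/coredump-helper %p" marks a piped command instead.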
*/ + if (!ispipe && !pid_in_pattern && core_uses_pid) { + err = cn_printf(cn, ".%d", task_tgid_vnr(current)); + if (err) + return err; + } +out: + return ispipe; +} + +static int zap_process(struct task_struct *start, int exit_code) +{ + struct task_struct *t; + int nr = 0; + + start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_exit_code = exit_code; + start->signal->group_stop_count = 0; + + t = start; + do { + task_clear_group_stop_pending(t); + if (t != current && t->mm) { + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); + nr++; + } + } while_each_thread(start, t); + + return nr; +} + +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + struct core_state *core_state, int exit_code) +{ + struct task_struct *g, *p; + unsigned long flags; + int nr = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!signal_group_exit(tsk->signal)) { + mm->core_state = core_state; + nr = zap_process(tsk, exit_code); + } + spin_unlock_irq(&tsk->sighand->siglock); + if (unlikely(nr < 0)) + return nr; + + if (atomic_read(&mm->mm_users) == nr + 1) + goto done; + /* + * We should find and kill all tasks which use this mm, and we should + * count them correctly into ->nr_threads. We don't take tasklist + * lock, but this is safe wrt: + * + * fork: + * None of sub-threads can fork after zap_process(leader). All + * processes which were created before this point should be + * visible to zap_threads() because copy_process() adds the new + * process to the tail of init_task.tasks list, and lock/unlock + * of ->siglock provides a memory barrier. + * + * do_exit: + * The caller holds mm->mmap_sem. This means that the task which + * uses this mm can't pass exit_mm(), so it can't exit or clear + * its ->mm. + * + * de_thread: + * It does list_replace_rcu(&leader->tasks, ¤t->tasks), + * we must see either old or new leader, this does not matter. + * However, it can change p->sighand, so lock_task_sighand(p) + * must be used. Since p->mm != NULL and we hold ->mmap_sem + * it can't fail. + * + * Note also that "g" can be the old leader with ->mm == NULL + * and already unhashed and thus removed from ->thread_group. + * This is OK, __unhash_process()->list_del_rcu() does not + * clear the ->next pointer, we will find the new leader via + * next_thread(). 
+ */ + rcu_read_lock(); + for_each_process(g) { + if (g == tsk->group_leader) + continue; + if (g->flags & PF_KTHREAD) + continue; + p = g; + do { + if (p->mm) { + if (unlikely(p->mm == mm)) { + lock_task_sighand(p, &flags); + nr += zap_process(p, exit_code); + unlock_task_sighand(p, &flags); + } + break; + } + } while_each_thread(g, p); + } + rcu_read_unlock(); +done: + atomic_set(&core_state->nr_threads, nr); + return nr; +} + +static int coredump_wait(int exit_code, struct core_state *core_state) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + struct completion *vfork_done; + int core_waiters = -EBUSY; + + init_completion(&core_state->startup); + core_state->dumper.task = tsk; + core_state->dumper.next = NULL; + + down_write(&mm->mmap_sem); + if (!mm->core_state) + core_waiters = zap_threads(tsk, mm, core_state, exit_code); + up_write(&mm->mmap_sem); + + if (unlikely(core_waiters < 0)) + goto fail; + + /* + * Make sure nobody is waiting for us to release the VM, + * otherwise we can deadlock when we wait on each other + */ + vfork_done = tsk->vfork_done; + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + + if (core_waiters) + wait_for_completion(&core_state->startup); +fail: + return core_waiters; +} + +static void coredump_finish(struct mm_struct *mm) +{ + struct core_thread *curr, *next; + struct task_struct *task; + + next = mm->core_state->dumper.next; + while ((curr = next) != NULL) { + next = curr->next; + task = curr->task; + /* + * see exit_mm(), curr->task must not see + * ->task == NULL before we read ->next. + */ + smp_mb(); + curr->task = NULL; + wake_up_process(task); + } + + mm->core_state = NULL; +} + +/* + * set_dumpable converts traditional three-value dumpable to two flags and + * stores them into mm->flags. It modifies lower two bits of mm->flags, but + * these bits are not changed atomically. So get_dumpable can observe the + * intermediate state. To avoid doing unexpected behavior, get get_dumpable + * return either old dumpable or new one by paying attention to the order of + * modifying the bits. + * + * dumpable | mm->flags (binary) + * old new | initial interim final + * ---------+----------------------- + * 0 1 | 00 01 01 + * 0 2 | 00 10(*) 11 + * 1 0 | 01 00 00 + * 1 2 | 01 11 11 + * 2 0 | 11 10(*) 00 + * 2 1 | 11 11 01 + * + * (*) get_dumpable regards interim value of 10 as 11. + */ +void set_dumpable(struct mm_struct *mm, int value) +{ + switch (value) { + case 0: + clear_bit(MMF_DUMPABLE, &mm->flags); + smp_wmb(); + clear_bit(MMF_DUMP_SECURELY, &mm->flags); + break; + case 1: + set_bit(MMF_DUMPABLE, &mm->flags); + smp_wmb(); + clear_bit(MMF_DUMP_SECURELY, &mm->flags); + break; + case 2: + set_bit(MMF_DUMP_SECURELY, &mm->flags); + smp_wmb(); + set_bit(MMF_DUMPABLE, &mm->flags); + break; + } +} + +static int __get_dumpable(unsigned long mm_flags) +{ + int ret; + + ret = mm_flags & MMF_DUMPABLE_MASK; + return (ret >= 2) ? 
2 : ret; +} + +int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + +static void wait_for_dump_helpers(struct file *file) +{ + struct pipe_inode_info *pipe; + + pipe = file->f_path.dentry->d_inode->i_pipe; + + pipe_lock(pipe); + pipe->readers++; + pipe->writers--; + + while ((pipe->readers > 1) && (!signal_pending(current))) { + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_wait(pipe); + } + + pipe->readers--; + pipe->writers++; + pipe_unlock(pipe); + +} + + +/* + * umh_pipe_setup + * helper function to customize the process used + * to collect the core in userspace. Specifically + * it sets up a pipe and installs it as fd 0 (stdin) + * for the process. Returns 0 on success, or + * PTR_ERR on failure. + * Note that it also sets the core limit to 1. This + * is a special value that we use to trap recursive + * core dumps + */ +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +{ + struct file *rp, *wp; + struct fdtable *fdt; + struct coredump_params *cp = (struct coredump_params *)info->data; + struct files_struct *cf = current->files; + + wp = create_write_pipe(0); + if (IS_ERR(wp)) + return PTR_ERR(wp); + + rp = create_read_pipe(wp, 0); + if (IS_ERR(rp)) { + free_write_pipe(wp); + return PTR_ERR(rp); + } + + cp->file = wp; + + sys_close(0); + fd_install(0, rp); + spin_lock(&cf->file_lock); + fdt = files_fdtable(cf); + FD_SET(0, fdt->open_fds); + FD_CLR(0, fdt->close_on_exec); + spin_unlock(&cf->file_lock); + + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; + + return 0; +} + +void do_coredump(long signr, int exit_code, struct pt_regs *regs) +{ + struct core_state core_state; + struct core_name cn; + struct mm_struct *mm = current->mm; + struct linux_binfmt * binfmt; + const struct cred *old_cred; + struct cred *cred; + int retval = 0; + int flag = 0; + int ispipe; + static atomic_t core_dump_count = ATOMIC_INIT(0); + struct coredump_params cprm = { + .signr = signr, + .regs = regs, + .limit = rlimit(RLIMIT_CORE), + /* + * We must use the same mm->flags while dumping core to avoid + * inconsistency of bit flags, since this flag is not protected + * by any locks. + */ + .mm_flags = mm->flags, + }; + + audit_core_dumps(signr); + + binfmt = mm->binfmt; + if (!binfmt || !binfmt->core_dump) + goto fail; + if (!__get_dumpable(cprm.mm_flags)) + goto fail; + + cred = prepare_creds(); + if (!cred) + goto fail; + /* + * We cannot trust fsuid as being the "true" uid of the + * process nor do we know its entire history. We only know it + * was tainted so we dump it as root in mode 2. + */ + if (__get_dumpable(cprm.mm_flags) == 2) { + /* Setuid core dump mode */ + flag = O_EXCL; /* Stop rewrite attacks */ + cred->fsuid = 0; /* Dump root private */ + } + + retval = coredump_wait(exit_code, &core_state); + if (retval < 0) + goto fail_creds; + + old_cred = override_creds(cred); + + /* + * Clear any false indication of pending signals that might + * be seen by the filesystem code called to write the core file. 
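The transition table above boils down to a two-bit encoding that __get_dumpable() reads back. A stand-alone sketch of that encoding, assuming MMF_DUMPABLE is bit 0 and MMF_DUMP_SECURELY is bit 1 (the layout this kernel's mm flags use):

/* Stand-alone sketch of the set_dumpable()/__get_dumpable() encoding. */
#include <stdio.h>

#define MMF_DUMPABLE_MASK 3   /* assumed: bit 0 = MMF_DUMPABLE, bit 1 = MMF_DUMP_SECURELY */

static int sketch_get_dumpable(unsigned long flags)
{
    int ret = flags & MMF_DUMPABLE_MASK;

    return (ret >= 2) ? 2 : ret;   /* the interim value 10 is read as "suid-safe" (2) */
}

int main(void)
{
    /* value 0 -> bits 00, value 1 -> bits 01, value 2 -> bits 11 */
    printf("%d %d %d %d\n",
           sketch_get_dumpable(0),   /* 0 */
           sketch_get_dumpable(1),   /* 1 */
           sketch_get_dumpable(2),   /* 2: interim 10 state */
           sketch_get_dumpable(3));  /* 2 */
    return 0;
}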
+ */
+ clear_thread_flag(TIF_SIGPENDING);
+
+ ispipe = format_corename(&cn, signr);
+
+ if (ispipe == -ENOMEM) {
+ printk(KERN_WARNING "format_corename failed\n");
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_corename;
+ }
+
+ if (ispipe) {
+ int dump_count;
+ char **helper_argv;
+
+ if (cprm.limit == 1) {
+ /*
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+ * cprm.limit of 1 here as a special value. Any
+ * non-1 limit gets set to RLIM_INFINITY below, but
+ * a limit of 0 skips the dump. This is a consistent
+ * way to catch recursive crashes. We can still crash
+ * if the core_pattern binary sets RLIM_CORE = !1
+ * but it runs as root, and can do lots of stupid things.
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
+ */
+ printk(KERN_WARNING
+ "Process %d(%s) has RLIMIT_CORE set to 1\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_unlock;
+ }
+ cprm.limit = RLIM_INFINITY;
+
+ dump_count = atomic_inc_return(&core_dump_count);
+ if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+ printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Skipping core dump\n");
+ goto fail_dropcount;
+ }
+
+ helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+ if (!helper_argv) {
+ printk(KERN_WARNING "%s failed to allocate memory\n",
+ __func__);
+ goto fail_dropcount;
+ }
+
+ retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+ NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+ NULL, &cprm);
+ argv_free(helper_argv);
+ if (retval) {
+ printk(KERN_INFO "Core dump to %s pipe failed\n",
+ cn.corename);
+ goto close_fail;
+ }
+ } else {
+ struct inode *inode;
+
+ if (cprm.limit < binfmt->min_coredump)
+ goto fail_unlock;
+
+ cprm.file = filp_open(cn.corename,
+ O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+ 0600);
+ if (IS_ERR(cprm.file))
+ goto fail_unlock;
+
+ inode = cprm.file->f_path.dentry->d_inode;
+ if (inode->i_nlink > 1)
+ goto close_fail;
+ if (d_unhashed(cprm.file->f_path.dentry))
+ goto close_fail;
+ /*
+ * AK: actually I see no reason to not allow this for named
+ * pipes etc, but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ goto close_fail;
+ /*
+ * Don't allow local users to get cute and trick others into
+ * dumping core into their pre-created files.
+ */
+ if (inode->i_uid != current_fsuid())
+ goto close_fail;
+ if (!cprm.file->f_op || !cprm.file->f_op->write)
+ goto close_fail;
+ if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ goto close_fail;
+ }
+
+ retval = binfmt->core_dump(&cprm);
+ if (retval)
+ current->signal->group_exit_code |= 0x80;
+
+ if (ispipe && core_pipe_limit)
+ wait_for_dump_helpers(cprm.file);
+close_fail:
+ if (cprm.file)
+ filp_close(cprm.file, NULL);
+fail_dropcount:
+ if (ispipe)
+ atomic_dec(&core_dump_count);
+fail_unlock:
+ kfree(cn.corename);
+fail_corename:
+ coredump_finish(mm);
+ revert_creds(old_cred);
+fail_creds:
+ put_cred(cred);
+fail:
+ return;
+}
+
+/*
+ * Core dumping helper functions. These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
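Since umh_pipe_setup() above installs the read end of the pipe as fd 0 of the usermode helper, a core_pattern such as "|/usr/local/bin/corecatch %p %e" hands the helper the core image on stdin and the expanded specifiers as arguments. A hypothetical helper (path and naming scheme invented for this example) could look like:

/* Hypothetical user-space core_pattern pipe helper: argv[1] = %p, argv[2] = %e. */
#include <stdio.h>

int main(int argc, char **argv)
{
    char path[256], buf[8192];
    size_t n;
    FILE *out;

    if (argc < 3)
        return 1;
    snprintf(path, sizeof(path), "/var/cores/core.%s.%s", argv[2], argv[1]);
    out = fopen(path, "w");
    if (!out)
        return 1;
    while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0)   /* the core arrives on stdin */
        fwrite(buf, 1, n, out);
    fclose(out);
    return 0;
}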
+ */ +int dump_write(struct file *file, const void *addr, int nr) +{ + return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} +EXPORT_SYMBOL(dump_write); + +int dump_seek(struct file *file, loff_t off) +{ + int ret = 1; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + return 0; + } else { + char *buf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!buf) + return 0; + while (off > 0) { + unsigned long n = off; + + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (!dump_write(file, buf, n)) { + ret = 0; + break; + } + off -= n; + } + free_page((unsigned long)buf); + } + return ret; +} +EXPORT_SYMBOL(dump_seek); diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS new file mode 100644 index 00000000..1b2d4c63 --- /dev/null +++ b/fs/exofs/BUGS @@ -0,0 +1,3 @@ +- Out-of-space may cause a severe problem if the object (and directory entry) + were written, but the inode attributes failed. Then if the filesystem was + unmounted and mounted the kernel can get into an endless loop doing a readdir. diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild new file mode 100644 index 00000000..2d0f757f --- /dev/null +++ b/fs/exofs/Kbuild @@ -0,0 +1,16 @@ +# +# Kbuild for the EXOFS module +# +# Copyright (C) 2008 Panasas Inc. All rights reserved. +# +# Authors: +# Boaz Harrosh +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 +# +# Kbuild - Gets included from the Kernels Makefile and build system +# + +exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o +obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig new file mode 100644 index 00000000..86194b2f --- /dev/null +++ b/fs/exofs/Kconfig @@ -0,0 +1,13 @@ +config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. + +# Debugging-related stuff +config EXOFS_DEBUG + bool "Enable debugging" + depends on EXOFS_FS + help + This option enables EXOFS debug prints. diff --git a/fs/exofs/common.h b/fs/exofs/common.h new file mode 100644 index 00000000..3bbd4695 --- /dev/null +++ b/fs/exofs/common.h @@ -0,0 +1,262 @@ +/* + * common.h - Common definitions for both Kernel and user-mode utilities + * + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef __EXOFS_COM_H__
+#define __EXOFS_COM_H__
+
+#include
+
+#include
+#include
+#include
+
+/****************************************************************************
+ * Object ID related defines
+ * NOTE: inode# = object ID - EXOFS_OBJ_OFF
+ ****************************************************************************/
+#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
+#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
+#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
+#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
+#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
+
+/* exofs Application specific page/attribute */
+/* Inode attrs */
+# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
+# define EXOFS_ATTR_INODE_DATA 1
+# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
+# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
+/* Partition attrs */
+# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
+# define EXOFS_ATTR_SB_STATS 1
+
+/*
+ * The maximum number of files we can have is limited by the size of the
+ * inode number. This is the largest object ID that the file system supports.
+ * Object IDs 0, 1, and 2 are always in use (see above defines).
+ */
+enum {
+ EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
+ (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
+ EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
+};
+
+/****************************************************************************
+ * Misc.
+ ****************************************************************************/
+#define EXOFS_BLKSHIFT 12
+#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
+
+/****************************************************************************
+ * superblock-related things
+ ****************************************************************************/
+#define EXOFS_SUPER_MAGIC 0x5DF5
+
+/*
+ * The file system control block - stored in object EXOFS_SUPER_ID's data.
+ * This is where the in-memory superblock is stored on disk.
+ */
+enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
+struct exofs_fscb {
+ __le64 s_nextid; /* Only used after mkfs */
+ __le64 s_numfiles; /* Only used after mkfs */
+ __le32 s_version; /* == EXOFS_FSCB_VER */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_newfs; /* Non-zero if this is a new fs */
+
+ /* From here on it's a static part, only written by mkexofs */
+ __le64 s_dev_table_oid; /* Reserved, not used */
+ __le64 s_dev_table_count; /* == 0 means no dev_table */
+} __packed;
+
+/*
+ * This struct is set on the FS partition's attributes.
+ * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
+ * with the create command, to atomically persist the sb writeable information.
+ */
+struct exofs_sb_stats {
+ __le64 s_nextid; /* Highest object ID used */
+ __le64 s_numfiles; /* Number of files on fs */
+} __packed;
+
+/*
+ * Describes the raid used in the FS. It is part of the device table.
+ * This here is taken from the pNFS-objects definition. In exofs we
+ * use one raid policy throughout the filesystem. (NOTE: the funny
+ * alignment at beginning. We take care of it at exofs_device_table.)
+ */
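Putting the NOTE above in concrete terms: the root directory lives in object EXOFS_ROOT_ID (0x10002), so its inode number is 0x10002 - EXOFS_OBJ_OFF = 2, and exofs_oi_objno() in exofs.h performs the reverse mapping. A tiny stand-alone sketch of the arithmetic:

/* Sketch of the inode <-> object-ID mapping implied by the defines above. */
#include <stdio.h>

#define EXOFS_OBJ_OFF 0x10000ULL
#define EXOFS_ROOT_ID 0x10002ULL

int main(void)
{
    unsigned long long ino = EXOFS_ROOT_ID - EXOFS_OBJ_OFF;  /* root inode number */
    unsigned long long oid = ino + EXOFS_OBJ_OFF;            /* back to the OSD object ID */

    printf("root: ino=%llu oid=0x%llx\n", ino, oid);          /* ino=2 oid=0x10002 */
    return 0;
}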
+struct exofs_dt_data_map {
+ __le32 cb_num_comps;
+ __le64 cb_stripe_unit;
+ __le32 cb_group_width;
+ __le32 cb_group_depth;
+ __le32 cb_mirror_cnt;
+ __le32 cb_raid_algorithm;
+} __packed;
+
+/*
+ * This is an osd device information descriptor. It is a single entry in
+ * the exofs device table. It describes an osd target lun which
+ * contains data belonging to this FS. (Same partition_id on all devices)
+ */
+struct exofs_dt_device_info {
+ __le32 systemid_len;
+ u8 systemid[OSD_SYSTEMID_LEN];
+ __le64 long_name_offset; /* If !0 then offset-in-file */
+ __le32 osdname_len; /* */
+ u8 osdname[44]; /* Embedded, usually an ASCII uuid */
+} __packed;
+
+/*
+ * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
+ * It contains the raid used for this multi-device FS and an array of
+ * participating devices.
+ */
+struct exofs_device_table {
+ __le32 dt_version; /* == EXOFS_DT_VER */
+ struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
+
+ /* Reserved space for future use. Total including this:
+ * (8 * sizeof(le64))
+ */
+ __le64 __Resurved[4];
+
+ __le64 dt_num_devices; /* Array size */
+ struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
+} __packed;
+
+/****************************************************************************
+ * inode-related things
+ ****************************************************************************/
+#define EXOFS_IDATA 5
+
+/*
+ * The file control block - stored in an object's attributes. This is where
+ * the in-memory inode is stored on disk.
+ */
+struct exofs_fcb {
+ __le64 i_size; /* Size of the file */
+ __le16 i_mode; /* File mode */
+ __le16 i_links_count; /* Links count */
+ __le32 i_uid; /* Owner Uid */
+ __le32 i_gid; /* Group Id */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Creation time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_flags; /* File flags (unused for now)*/
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
+};
+
+#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
+
+/* This is the Attribute the fcb is stored in */
+static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
+ EXOFS_APAGE_FS_DATA,
+ EXOFS_ATTR_INODE_DATA,
+ EXOFS_INO_ATTR_SIZE);
+
+/****************************************************************************
+ * dentry-related things
+ ****************************************************************************/
+#define EXOFS_NAME_LEN 255
+
+/*
+ * The on-disk directory entry
+ */
+struct exofs_dir_entry {
+ __le64 inode_no; /* inode number */
+ __le16 rec_len; /* directory entry length */
+ u8 name_len; /* name length */
+ u8 file_type; /* umm...file type */
+ char name[EXOFS_NAME_LEN]; /* file name */
+};
+
+enum {
+ EXOFS_FT_UNKNOWN,
+ EXOFS_FT_REG_FILE,
+ EXOFS_FT_DIR,
+ EXOFS_FT_CHRDEV,
+ EXOFS_FT_BLKDEV,
+ EXOFS_FT_FIFO,
+ EXOFS_FT_SOCK,
+ EXOFS_FT_SYMLINK,
+ EXOFS_FT_MAX
+};
+
+#define EXOFS_DIR_PAD 4
+#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
+#define EXOFS_DIR_REC_LEN(name_len) \
+ (((name_len) + offsetof(struct exofs_dir_entry, name) + \
+ EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
+
+/*
+ * The on-disk (optional) layout structure.
+ * It sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
+ * attribute, attached to any inode, usually to a directory.
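As a worked example of EXOFS_DIR_REC_LEN() above: the fixed part of exofs_dir_entry before name[] is 8 + 2 + 1 + 1 = 12 bytes, so a 5-character name needs 17 bytes, padded up to the next multiple of 4, i.e. 20. A stand-alone sketch with the header size hard-coded under that assumption:

/* Worked example of the EXOFS_DIR_REC_LEN() rounding. */
#include <stdio.h>

#define HDR 12   /* assumed: offsetof(struct exofs_dir_entry, name) */
#define ROUND 3  /* EXOFS_DIR_PAD - 1 */
#define REC_LEN(name_len) (((name_len) + HDR + ROUND) & ~ROUND)

int main(void)
{
    printf("%d %d %d\n", REC_LEN(1), REC_LEN(5), REC_LEN(255));  /* 16 20 268 */
    return 0;
}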
+ */ + +enum exofs_inode_layout_gen_functions { + LAYOUT_MOVING_WINDOW = 0, + LAYOUT_IMPLICT = 1, +}; + +struct exofs_on_disk_inode_layout { + __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */ + __le16 pad; + union { + /* gen_func == LAYOUT_MOVING_WINDOW (default) */ + struct exofs_layout_sliding_window { + __le32 num_devices; /* first n devices in global-table*/ + } sliding_window __packed; + + /* gen_func == LAYOUT_IMPLICT */ + struct exofs_layout_implict_list { + struct exofs_dt_data_map data_map; + /* Variable array of size data_map.cb_num_comps. These + * are device indexes of the devices in the global table + */ + __le32 dev_indexes[]; + } implict __packed; + }; +} __packed; + +static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs) +{ + return sizeof(struct exofs_on_disk_inode_layout) + + max_devs * sizeof(__le32); +} + +#endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c new file mode 100644 index 00000000..d0941c6a --- /dev/null +++ b/fs/exofs/dir.c @@ -0,0 +1,673 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline unsigned exofs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void exofs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* Accesses dir's inode->i_size must be called under inode lock */ +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) +{ + loff_t last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + + if (!PageUptodate(page)) + SetPageUptodate(page); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + set_page_dirty(page); + + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + + return err; +} + +static void exofs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + unsigned chunk_size = exofs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct exofs_dir_entry *p; + char *error; + + /* if the page is the last one in the directory */ + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct exofs_dir_entry *)(kaddr + offs); + rec_len = le16_to_cpu(p->rec_len); + + if (rec_len < EXOFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < EXOFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + +Ebadsize: + EXOFS_ERR("ERROR [exofs_check_page]: " + "size of directory(0x%lx) is not a multiple of chunk size\n", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +bad_entry: + EXOFS_ERR( + "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " + "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n", + dir->i_ino, error, (page->index<inode_no)), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct exofs_dir_entry *)(kaddr + offs); + EXOFS_ERR("ERROR [exofs_check_page]: " + "entry in directory(0x%lx) spans the page boundary" + "offset=%lu, inode=0x%llx\n", + dir->i_ino, (page->index<inode_no))); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *exofs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if 
(!PageChecked(page)) + exofs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + exofs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline int exofs_match(int len, const unsigned char *name, + struct exofs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode_no) + return 0; + return !memcmp(name, de->name, len); +} + +static inline +struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p) +{ + return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); +} + +static inline unsigned +exofs_validate_entry(char *base, unsigned offset, unsigned mask) +{ + struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset); + struct exofs_dir_entry *p = + (struct exofs_dir_entry *)(base + (offset&mask)); + while ((char *)p < (char *)de) { + if (p->rec_len == 0) + break; + p = exofs_next_entry(p); + } + return (char *)p - base; +} + +static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = { + [EXOFS_FT_UNKNOWN] = DT_UNKNOWN, + [EXOFS_FT_REG_FILE] = DT_REG, + [EXOFS_FT_DIR] = DT_DIR, + [EXOFS_FT_CHRDEV] = DT_CHR, + [EXOFS_FT_BLKDEV] = DT_BLK, + [EXOFS_FT_FIFO] = DT_FIFO, + [EXOFS_FT_SOCK] = DT_SOCK, + [EXOFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK, +}; + +static inline +void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +} + +static int +exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); + unsigned char *types = NULL; + int need_revalidate = (filp->f_version != inode->i_version); + + if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) + return 0; + + types = exofs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct exofs_dir_entry *de; + struct page *page = exofs_get_page(inode, n); + + if (IS_ERR(page)) { + EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + return PTR_ERR(page); + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = exofs_validate_entry(kaddr, offset, + chunk_mask); + filp->f_pos = (n<f_version = inode->i_version; + need_revalidate = 0; + } + de = (struct exofs_dir_entry *)(kaddr + offset); + limit = kaddr + exofs_last_byte(inode, n) - + EXOFS_DIR_REC_LEN(1); + for (; (char *)de <= limit; de = exofs_next_entry(de)) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: " + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); + exofs_put_page(page); + return -EIO; + } + if (de->inode_no) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < EXOFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<inode_no), + d_type); + if (over) { + exofs_put_page(page); + return 0; + } + } + filp->f_pos += 
le16_to_cpu(de->rec_len); + } + exofs_put_page(page); + } + + return 0; +} + +struct exofs_dir_entry *exofs_find_entry(struct inode *dir, + struct dentry *dentry, struct page **res_page) +{ + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct exofs_i_info *oi = exofs_i(dir); + struct exofs_dir_entry *de; + + if (npages == 0) + goto out; + + *res_page = NULL; + + start = oi->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = exofs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct exofs_dir_entry *) kaddr; + kaddr += exofs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: zero-length entry in " + "directory(0x%lx)\n", + dir->i_ino); + exofs_put_page(page); + goto out; + } + if (exofs_match(namelen, name, de)) + goto found; + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + if (++n >= npages) + n = 0; + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + oi->i_dir_start_lookup = n; + return de; +} + +struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = exofs_get_page(dir, 0); + struct exofs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = exofs_next_entry( + (struct exofs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t exofs_parent_ino(struct dentry *child) +{ + struct page *page; + struct exofs_dir_entry *de; + ino_t ino; + + de = exofs_dotdot(child->d_inode, &page); + if (!de) + return 0; + + ino = le64_to_cpu(de->inode_no); + exofs_put_page(page); + return ino; +} + +ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct exofs_dir_entry *de; + struct page *page; + + de = exofs_find_entry(dir, dentry, &page); + if (de) { + res = le64_to_cpu(de->inode_no); + exofs_put_page(page); + } + return res; +} + +int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, + struct page *page, struct inode *inode) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = le16_to_cpu(de->rec_len); + int err; + + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, len, + AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + if (err) + EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", + err); + + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + if (likely(!err)) + err = exofs_commit_chunk(page, pos, len); + exofs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + return err; +} + +int exofs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = exofs_chunk_size(dir); + unsigned reclen = EXOFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + struct exofs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = exofs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + 
dir_end = kaddr + exofs_last_byte(dir, n); + de = (struct exofs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + name_len = 0; + rec_len = chunk_size; + de->rec_len = cpu_to_le16(chunk_size); + de->inode_no = 0; + goto got_it; + } + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_add_link: " + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (exofs_match(namelen, name, de)) + goto out_unlock; + name_len = EXOFS_DIR_REC_LEN(de->name_len); + rec_len = le16_to_cpu(de->rec_len); + if (!de->inode_no && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct exofs_dir_entry *) ((char *) de + rec_len); + } + unlock_page(page); + exofs_put_page(page); + } + + EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n", + dentry, inode->i_ino); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char *)de - (char *)page_address(page); + err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0, + &page, NULL); + if (err) + goto out_unlock; + if (de->inode_no) { + struct exofs_dir_entry *de1 = + (struct exofs_dir_entry *)((char *)de + name_len); + de1->rec_len = cpu_to_le16(rec_len - name_len); + de->rec_len = cpu_to_le16(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + err = exofs_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + sbi->s_numfiles++; + +out_put: + exofs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1); + unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + loff_t pos; + struct exofs_dir_entry *pde = NULL; + struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from); + int err; + + while (de < dir) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_delete_entry:" + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); + err = -EIO; + goto out; + } + pde = de; + de = exofs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + pos = page_offset(page) + from; + lock_page(page); + err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, + &page, NULL); + if (err) + EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n", + err); + if (pde) + pde->rec_len = cpu_to_le16(to - from); + dir->inode_no = 0; + if (likely(!err)) + err = exofs_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty(inode); + sbi->s_numfiles--; +out: + exofs_put_page(page); + return err; +} + +/* kept aligned on 4 bytes */ +#define THIS_DIR ".\0\0" +#define PARENT_DIR "..\0" + +int exofs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = exofs_chunk_size(inode); + struct exofs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0, + &page, NULL); + 
if (err) { + unlock_page(page); + goto fail; + } + + kaddr = kmap_atomic(page, KM_USER0); + de = (struct exofs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); + memcpy(de->name, THIS_DIR, sizeof(THIS_DIR)); + de->inode_no = cpu_to_le64(inode->i_ino); + exofs_set_de_type(de, inode); + + de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1)); + de->inode_no = cpu_to_le64(parent->i_ino); + memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); + exofs_set_de_type(de, inode); + kunmap_atomic(kaddr, KM_USER0); + err = exofs_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +int exofs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct exofs_dir_entry *de; + page = exofs_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct exofs_dir_entry *)kaddr; + kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + EXOFS_ERR("ERROR: exofs_empty_dir: " + "zero-length directory entry" + "kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->inode_no != 0) { + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (le64_to_cpu(de->inode_no) != + inode->i_ino) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = exofs_next_entry(de); + } + exofs_put_page(page); + } + return 1; + +not_empty: + exofs_put_page(page); + return 0; +} + +const struct file_operations exofs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = exofs_readdir, +}; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h new file mode 100644 index 00000000..c965806c --- /dev/null +++ b/fs/exofs/exofs.h @@ -0,0 +1,308 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __EXOFS_H__ +#define __EXOFS_H__ + +#include +#include +#include +#include "common.h" + +/* FIXME: Remove once pnfs hits mainline + * #include + */ +#include "pnfs.h" + +#define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define EXOFS_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define EXOFS_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +struct exofs_layout { + osd_id s_pid; /* partition ID of file system*/ + + /* Our way of looking at the data_map */ + unsigned stripe_unit; + unsigned mirrors_p1; + + unsigned group_width; + u64 group_depth; + unsigned group_count; + + enum exofs_inode_layout_gen_functions lay_func; + + unsigned s_numdevs; /* Num of devices in array */ + struct osd_dev *s_ods[0]; /* Variable length */ +}; + +/* + * our extension to the in-memory superblock + */ +struct exofs_sb_info { + struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ + int s_timeout; /* timeout for OSD operations */ + uint64_t s_nextid; /* highest object ID used */ + uint32_t s_numfiles; /* number of files on fs */ + spinlock_t s_next_gen_lock; /* spinlock for gen # update */ + u32 s_next_generation; /* next gen # to use */ + atomic_t s_curr_pending; /* number of pending commands */ + uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ + struct backing_dev_info bdi; /* register our bdi with VFS */ + + struct pnfs_osd_data_map data_map; /* Default raid to use + * FIXME: Needed ? + */ +/* struct exofs_layout dir_layout;*/ /* Default dir layout */ + struct exofs_layout layout; /* Default files layout, + * contains the variable osd_dev + * array. Keep last */ + struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ +}; + +/* + * our extension to the in-memory inode + */ +struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ + uint64_t i_commit_size; /* the object's written length */ + uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ +}; + +static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) +{ + return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; +} + +struct exofs_io_state; +typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private); + +struct exofs_io_state { + struct kref kref; + + void *private; + exofs_io_done_fn done; + + struct exofs_layout *layout; + struct osd_obj_id obj; + u8 *cred; + + /* Global read/write IO*/ + loff_t offset; + unsigned long length; + void *kern_buff; + + struct page **pages; + unsigned nr_pages; + unsigned pgbase; + unsigned pages_consumed; + + /* Attributes */ + unsigned in_attr_len; + struct osd_attr *in_attr; + unsigned out_attr_len; + struct osd_attr *out_attr; + + /* Variable array of size numdevs */ + unsigned numdevs; + struct exofs_per_dev_state { + struct osd_request *or; + struct bio *bio; + loff_t offset; + unsigned length; + unsigned dev; + } per_dev[]; +}; + +static inline unsigned exofs_io_state_size(unsigned numdevs) +{ + return sizeof(struct exofs_io_state) + + sizeof(struct exofs_per_dev_state) * numdevs; +} + +/* + * our inode flags + */ +#define OBJ_2BCREATED 0 /* object will be created soon*/ +#define OBJ_CREATED 1 /* object has been created on the osd*/ + +static inline int obj_2bcreated(struct exofs_i_info *oi) +{ + return test_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline void 
set_obj_2bcreated(struct exofs_i_info *oi) +{ + set_bit(OBJ_2BCREATED, &oi->i_flags); +} + +static inline int obj_created(struct exofs_i_info *oi) +{ + return test_bit(OBJ_CREATED, &oi->i_flags); +} + +static inline void set_obj_created(struct exofs_i_info *oi) +{ + set_bit(OBJ_CREATED, &oi->i_flags); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi); +static inline int wait_obj_created(struct exofs_i_info *oi) +{ + if (likely(obj_created(oi))) + return 0; + + return __exofs_wait_obj_created(oi); +} + +/* + * get to our inode from the vfs inode + */ +static inline struct exofs_i_info *exofs_i(struct inode *inode) +{ + return container_of(inode, struct exofs_i_info, vfs_inode); +} + +/* + * Given a layout, object_number and stripe_index return the associated global + * dev_index + */ +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index); +/* + * Maximum count of links to a file + */ +#define EXOFS_LINK_MAX 32000 + +/************************* + * function declarations * + *************************/ + +/* ios.c */ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length); + +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **ios); +void exofs_put_io_state(struct exofs_io_state *ios); + +int exofs_check_io(struct exofs_io_state *ios, u64 *resid); + +int exofs_sbi_create(struct exofs_io_state *ios); +int exofs_sbi_remove(struct exofs_io_state *ios); +int exofs_sbi_write(struct exofs_io_state *ios); +int exofs_sbi_read(struct exofs_io_state *ios); + +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr); + +int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len); +static inline int exofs_oi_write(struct exofs_i_info *oi, + struct exofs_io_state *ios) +{ + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + return exofs_sbi_write(ios); +} + +static inline int exofs_oi_read(struct exofs_i_info *oi, + struct exofs_io_state *ios) +{ + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + return exofs_sbi_read(ios); +} + +/* inode.c */ +unsigned exofs_max_io_pages(struct exofs_layout *layout, + unsigned expected_pages); +int exofs_setattr(struct dentry *, struct iattr *); +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern struct inode *exofs_iget(struct super_block *, unsigned long); +struct inode *exofs_new_inode(struct inode *, int); +extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); +extern void exofs_evict_inode(struct inode *); + +/* dir.c: */ +int exofs_add_link(struct dentry *, struct inode *); +ino_t exofs_inode_by_name(struct inode *, struct dentry *); +int exofs_delete_entry(struct exofs_dir_entry *, struct page *); +int exofs_make_empty(struct inode *, struct inode *); +struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *, + struct page **); +int exofs_empty_dir(struct inode *); +struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **); +ino_t exofs_parent_ino(struct dentry *child); +int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, + struct inode *); + +/* super.c */ +int exofs_sbi_write_stats(struct exofs_sb_info *sbi); + +/********************* + * operation vectors * + *********************/ +/* dir.c: */ +extern const struct 
file_operations exofs_dir_operations; + +/* file.c */ +extern const struct inode_operations exofs_file_inode_operations; +extern const struct file_operations exofs_file_operations; + +/* inode.c */ +extern const struct address_space_operations exofs_aops; +extern const struct osd_attr g_attr_logical_length; + +/* namei.c */ +extern const struct inode_operations exofs_dir_inode_operations; +extern const struct inode_operations exofs_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations exofs_symlink_inode_operations; +extern const struct inode_operations exofs_fast_symlink_inode_operations; + +#endif diff --git a/fs/exofs/file.c b/fs/exofs/file.c new file mode 100644 index 00000000..45ca323d --- /dev/null +++ b/fs/exofs/file.c @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "exofs.h" + +static int exofs_release_file(struct inode *inode, struct file *filp) +{ + return 0; +} + +/* exofs_file_fsync - flush the inode to disk + * + * Note, in exofs all metadata is written as part of inode, regardless. 
+ * The writeout is synchronous + */ +static int exofs_file_fsync(struct file *filp, int datasync) +{ + int ret; + + ret = sync_inode_metadata(filp->f_mapping->host, 1); + return ret; +} + +static int exofs_flush(struct file *file, fl_owner_t id) +{ + int ret = vfs_fsync(file, 0); + /* TODO: Flush the OSD target */ + return ret; +} + +const struct file_operations exofs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .open = generic_file_open, + .release = exofs_release_file, + .fsync = exofs_file_fsync, + .flush = exofs_flush, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +const struct inode_operations exofs_file_inode_operations = { + .setattr = exofs_setattr, +}; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c new file mode 100644 index 00000000..8472c098 --- /dev/null +++ b/fs/exofs/inode.c @@ -0,0 +1,1374 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "exofs.h" + +#define EXOFS_DBGMSG2(M...) do {} while (0) + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), + MAX_PAGES_KMALLOC = + PAGE_SIZE / sizeof(struct page *), +}; + +unsigned exofs_max_io_pages(struct exofs_layout *layout, + unsigned expected_pages) +{ + unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); + + /* TODO: easily support bio chaining */ + pages = min_t(unsigned, pages, + layout->group_width * BIO_MAX_PAGES_KMALLOC); + return pages; +} + +struct page_collect { + struct exofs_sb_info *sbi; + struct inode *inode; + unsigned expected_pages; + struct exofs_io_state *ios; + + struct page **pages; + unsigned alloc_pages; + unsigned nr_pages; + unsigned long length; + loff_t pg_first; /* keep 64bit also in 32-arches */ + bool read_4_write; /* This means two things: that the read is sync + * And the pages should not be unlocked. 
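For a sense of scale behind the two limits above: with 4K pages and 8-byte pointers, MAX_PAGES_KMALLOC is 4096 / 8 = 512, i.e. the pages[] array of one page_collect fits in a single kmalloc'd page, while BIO_MAX_PAGES_KMALLOC is however many bio_vecs fit in the rest of a page after struct bio. A stand-alone sketch of the capping that exofs_max_io_pages() performs (the per-bio figure below is made up for illustration):

/* Sketch of the page-count capping; 512 and 251 are assumed/illustrative values. */
#include <stdio.h>

#define MAX_PAGES_KMALLOC 512   /* assumed: PAGE_SIZE / sizeof(struct page *) */

static unsigned max_io_pages(unsigned expected, unsigned group_width,
                             unsigned bio_max_pages_kmalloc)
{
    unsigned pages = expected < MAX_PAGES_KMALLOC ? expected : MAX_PAGES_KMALLOC;
    unsigned cap = group_width * bio_max_pages_kmalloc;

    return pages < cap ? pages : cap;
}

int main(void)
{
    /* e.g. readahead asks for 1024 pages on a single-device layout */
    printf("%u\n", max_io_pages(1024, 1, 251));   /* 251 */
    return 0;
}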
+ */ +}; + +static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, + struct inode *inode) +{ + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + + pcol->sbi = sbi; + pcol->inode = inode; + pcol->expected_pages = expected_pages; + + pcol->ios = NULL; + pcol->pages = NULL; + pcol->alloc_pages = 0; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + pcol->read_4_write = false; +} + +static void _pcol_reset(struct page_collect *pcol) +{ + pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); + + pcol->pages = NULL; + pcol->alloc_pages = 0; + pcol->nr_pages = 0; + pcol->length = 0; + pcol->pg_first = -1; + pcol->ios = NULL; + + /* this is probably the end of the loop but in writes + * it might not end here. don't be left with nothing + */ + if (!pcol->expected_pages) + pcol->expected_pages = MAX_PAGES_KMALLOC; +} + +static int pcol_try_alloc(struct page_collect *pcol) +{ + unsigned pages; + + if (!pcol->ios) { /* First time allocate io_state */ + int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); + + if (ret) + return ret; + } + + /* TODO: easily support bio chaining */ + pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); + + for (; pages; pages >>= 1) { + pcol->pages = kmalloc(pages * sizeof(struct page *), + GFP_KERNEL); + if (likely(pcol->pages)) { + pcol->alloc_pages = pages; + return 0; + } + } + + EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", + pcol->expected_pages); + return -ENOMEM; +} + +static void pcol_free(struct page_collect *pcol) +{ + kfree(pcol->pages); + pcol->pages = NULL; + + if (pcol->ios) { + exofs_put_io_state(pcol->ios); + pcol->ios = NULL; + } +} + +static int pcol_add_page(struct page_collect *pcol, struct page *page, + unsigned len) +{ + if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) + return -ENOMEM; + + pcol->pages[pcol->nr_pages++] = page; + pcol->length += len; + return 0; +} + +static int update_read_page(struct page *page, int ret) +{ + if (ret == 0) { + /* Everything is OK */ + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } else if (ret == -EFAULT) { + /* In this case we were trying to read something that wasn't on + * disk yet - return a page full of zeroes. This should be OK, + * because the object should be empty (if there was a write + * before this read, the read would be waiting with the page + * locked */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + ret = 0; /* recovered error */ + EXOFS_DBGMSG("recovered read error\n"); + } else /* Error */ + SetPageError(page); + + return ret; +} + +static void update_write_page(struct page *page, int ret) +{ + if (ret) { + mapping_set_error(page->mapping, ret); + SetPageError(page); + } + end_page_writeback(page); +} + +/* Called at the end of reads, to optionally unlock pages and update their + * status. 
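update_read_page() above treats -EFAULT from the OSD as "this range was never written": the page is zero-filled and still reported as uptodate, and only other errors mark the page bad. A stand-alone sketch of that policy (EFAULT hard-coded as -14 for the example):

/* Sketch of the per-page status decision made by update_read_page(). */
#include <stdio.h>

static int page_status(int osd_err)
{
    if (osd_err == 0)
        return 0;          /* uptodate */
    if (osd_err == -14)
        return 0;          /* hole in the object: zero-fill, still uptodate */
    return osd_err;        /* real I/O error: page is marked bad */
}

int main(void)
{
    printf("%d %d %d\n", page_status(0), page_status(-14), page_status(-5));  /* 0 0 -5 */
    return 0;
}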
+ */ +static int __readpages_done(struct page_collect *pcol) +{ + int i; + u64 resid; + u64 good_bytes; + u64 length = 0; + int ret = exofs_check_io(pcol->ios, &resid); + + if (likely(!ret)) + good_bytes = pcol->length; + else + good_bytes = pcol->length - resid; + + EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages at end */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", + inode->i_ino, page->index, + page_stat ? "bad_bytes" : "good_bytes"); + + ret = update_read_page(page, page_stat); + if (!pcol->read_4_write) + unlock_page(page); + length += PAGE_SIZE; + } + + pcol_free(pcol); + EXOFS_DBGMSG2("readpages_done END\n"); + return ret; +} + +/* callback of async reads */ +static void readpages_done(struct exofs_io_state *ios, void *p) +{ + struct page_collect *pcol = p; + + __readpages_done(pcol); + atomic_dec(&pcol->sbi->s_curr_pending); + kfree(pcol); +} + +static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) +{ + int i; + + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; + + if (rw == READ) + update_read_page(page, ret); + else + update_write_page(page, ret); + + unlock_page(page); + } +} + +static int read_exec(struct page_collect *pcol) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct exofs_io_state *ios = pcol->ios; + struct page_collect *pcol_copy = NULL; + int ret; + + if (!pcol->pages) + return 0; + + ios->pages = pcol->pages; + ios->nr_pages = pcol->nr_pages; + ios->length = pcol->length; + ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; + + if (pcol->read_4_write) { + exofs_oi_read(oi, pcol->ios); + return __readpages_done(pcol); + } + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + ios->done = readpages_done; + ios->private = pcol_copy; + ret = exofs_oi_read(oi, ios); + if (unlikely(ret)) + goto err; + + atomic_inc(&pcol->sbi->s_curr_pending); + + EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", + ios->obj.id, _LLU(ios->offset), pcol->length); + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + return 0; + +err: + if (!pcol->read_4_write) + _unlock_pcol_pages(pcol, ret, READ); + + pcol_free(pcol); + + kfree(pcol_copy); + return ret; +} + +/* readpage_strip is called either directly from readpage() or by the VFS from + * within read_cache_pages(), to add one more page to be read. It will try to + * collect as many contiguous pages as posible. If a discontinuity is + * encountered, or it runs out of resources, it will submit the previous segment + * and will start a new collection. Eventually caller must submit the last + * segment if present. 
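+ * (The callers in this file do exactly that: exofs_readpages() and _readpage() both finish with a final read_exec(&pcol).)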
+ */ +static int readpage_strip(void *data, struct page *page) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + /* FIXME: Just for debugging, will be removed */ + if (PageUptodate(page)) + EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, + page->index); + + if (page->index < end_index) + len = PAGE_CACHE_SIZE; + else if (page->index == end_index) + len = i_size & ~PAGE_CACHE_MASK; + else + len = 0; + + if (!len || !obj_created(oi)) { + /* this will be out of bounds, or doesn't exist yet. + * Current page is cleared and the request is split + */ + clear_highpage(page); + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + + if (!pcol->read_4_write) + unlock_page(page); + EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx " + "read_4_write=%d index=0x%lx end_index=0x%lx " + "splitting\n", inode->i_ino, len, + pcol->read_4_write, page->index, end_index); + + return read_exec(pcol); + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = read_exec(pcol); + if (unlikely(ret)) + goto fail; + goto try_again; + } + + if (!pcol->pages) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + if (len != PAGE_CACHE_SIZE) + zero_user(page, len, PAGE_CACHE_SIZE - len); + + EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (ret) { + EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " + "this_len=0x%zx nr_pages=%u length=0x%lx\n", + page, len, pcol->nr_pages, pcol->length); + + /* split the request, and start again with current page */ + ret = read_exec(pcol); + if (unlikely(ret)) + goto fail; + + goto try_again; + } + + return 0; + +fail: + /* SetPageError(page); ??? */ + unlock_page(page); + return ret; +} + +static int exofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, nr_pages, mapping->host); + + ret = read_cache_pages(mapping, pages, readpage_strip, &pcol); + if (ret) { + EXOFS_ERR("read_cache_pages => %d\n", ret); + return ret; + } + + return read_exec(&pcol); +} + +static int _readpage(struct page *page, bool read_4_write) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + pcol.read_4_write = read_4_write; + ret = readpage_strip(&pcol, page); + if (ret) { + EXOFS_ERR("_readpage => %d\n", ret); + return ret; + } + + return read_exec(&pcol); +} + +/* + * We don't need the file + */ +static int exofs_readpage(struct file *file, struct page *page) +{ + return _readpage(page, false); +} + +/* Callback for osd_write. 
All writes are asynchronous */ +static void writepages_done(struct exofs_io_state *ios, void *p) +{ + struct page_collect *pcol = p; + int i; + u64 resid; + u64 good_bytes; + u64 length = 0; + int ret = exofs_check_io(ios, &resid); + + atomic_dec(&pcol->sbi->s_curr_pending); + + if (likely(!ret)) + good_bytes = pcol->length; + else + good_bytes = pcol->length - resid; + + EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" + " length=0x%lx nr_pages=%u\n", + pcol->inode->i_ino, _LLU(good_bytes), pcol->length, + pcol->nr_pages); + + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; + struct inode *inode = page->mapping->host; + int page_stat; + + if (inode != pcol->inode) + continue; /* osd might add more pages to a bio */ + + if (likely(length < good_bytes)) + page_stat = 0; + else + page_stat = ret; + + update_write_page(page, page_stat); + unlock_page(page); + EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", + inode->i_ino, page->index, page_stat); + + length += PAGE_SIZE; + } + + pcol_free(pcol); + kfree(pcol); + EXOFS_DBGMSG2("writepages_done END\n"); +} + +static int write_exec(struct page_collect *pcol) +{ + struct exofs_i_info *oi = exofs_i(pcol->inode); + struct exofs_io_state *ios = pcol->ios; + struct page_collect *pcol_copy = NULL; + int ret; + + if (!pcol->pages) + return 0; + + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); + if (!pcol_copy) { + EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); + ret = -ENOMEM; + goto err; + } + + *pcol_copy = *pcol; + + ios->pages = pcol_copy->pages; + ios->nr_pages = pcol_copy->nr_pages; + ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; + ios->length = pcol_copy->length; + ios->done = writepages_done; + ios->private = pcol_copy; + + ret = exofs_oi_write(oi, ios); + if (unlikely(ret)) { + EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); + goto err; + } + + atomic_inc(&pcol->sbi->s_curr_pending); + EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", + pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), + pcol->length); + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + return 0; + +err: + _unlock_pcol_pages(pcol, ret, WRITE); + pcol_free(pcol); + kfree(pcol_copy); + + return ret; +} + +/* writepage_strip is called either directly from writepage() or by the VFS from + * within write_cache_pages(), to add one more page to be written to storage. + * It will try to collect as many contiguous pages as possible. If a + * discontinuity is encountered or it runs out of resources it will submit the + * previous segment and will start a new collection. + * Eventually caller must submit the last segment if present. 
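+ * (Here exofs_writepages() and exofs_writepage() submit that last segment with a final write_exec(&pcol).)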
+ */ +static int writepage_strip(struct page *page, + struct writeback_control *wbc_unused, void *data) +{ + struct page_collect *pcol = data; + struct inode *inode = pcol->inode; + struct exofs_i_info *oi = exofs_i(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t len; + int ret; + + BUG_ON(!PageLocked(page)); + + ret = wait_obj_created(oi); + if (unlikely(ret)) + goto fail; + + if (page->index < end_index) + /* in this case, the page is within the limits of the file */ + len = PAGE_CACHE_SIZE; + else { + len = i_size & ~PAGE_CACHE_MASK; + + if (page->index > end_index || !len) { + /* in this case, the page is outside the limits + * (truncate in progress) + */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + if (PageError(page)) + ClearPageError(page); + unlock_page(page); + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " + "outside the limits\n", + inode->i_ino, page->index); + return 0; + } + } + +try_again: + + if (unlikely(pcol->pg_first == -1)) { + pcol->pg_first = page->index; + } else if (unlikely((pcol->pg_first + pcol->nr_pages) != + page->index)) { + /* Discontinuity detected, split the request */ + ret = write_exec(pcol); + if (unlikely(ret)) + goto fail; + + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", + inode->i_ino, page->index); + goto try_again; + } + + if (!pcol->pages) { + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + goto fail; + } + + EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", + inode->i_ino, page->index, len); + + ret = pcol_add_page(pcol, page, len); + if (unlikely(ret)) { + EXOFS_DBGMSG2("Failed pcol_add_page " + "nr_pages=%u total_length=0x%lx\n", + pcol->nr_pages, pcol->length); + + /* split the request, next loop will start again */ + ret = write_exec(pcol); + if (unlikely(ret)) { + EXOFS_DBGMSG("write_exec failed => %d", ret); + goto fail; + } + + goto try_again; + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + return 0; + +fail: + EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", + inode->i_ino, page->index, ret); + set_bit(AS_EIO, &page->mapping->flags); + unlock_page(page); + return ret; +} + +static int exofs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct page_collect pcol; + long start, end, expected_pages; + int ret; + + start = wbc->range_start >> PAGE_CACHE_SHIFT; + end = (wbc->range_end == LLONG_MAX) ? 
+ start + mapping->nrpages : + wbc->range_end >> PAGE_CACHE_SHIFT; + + if (start || end) + expected_pages = end - start + 1; + else + expected_pages = mapping->nrpages; + + if (expected_pages < 32L) + expected_pages = 32L; + + EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " + "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", + mapping->host->i_ino, wbc->range_start, wbc->range_end, + mapping->nrpages, start, end, expected_pages); + + _pcol_init(&pcol, expected_pages, mapping->host); + + ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); + if (ret) { + EXOFS_ERR("write_cache_pages => %d\n", ret); + return ret; + } + + return write_exec(&pcol); +} + +static int exofs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct page_collect pcol; + int ret; + + _pcol_init(&pcol, 1, page->mapping->host); + + ret = writepage_strip(page, NULL, &pcol); + if (ret) { + EXOFS_ERR("exofs_writepage => %d\n", ret); + return ret; + } + + return write_exec(&pcol); +} + +/* i_mutex held using inode->i_size directly */ +static void _write_failed(struct inode *inode, loff_t to) +{ + if (to > inode->i_size) + truncate_pagecache(inode, to, inode->i_size); +} + +int exofs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret = 0; + struct page *page; + + page = *pagep; + if (page == NULL) { + ret = simple_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); + if (ret) { + EXOFS_DBGMSG("simple_write_begin failed\n"); + goto out; + } + + page = *pagep; + } + + /* read modify write */ + if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + loff_t i_size = i_size_read(mapping->host); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t rlen; + + if (page->index < end_index) + rlen = PAGE_CACHE_SIZE; + else if (page->index == end_index) + rlen = i_size & ~PAGE_CACHE_MASK; + else + rlen = 0; + + if (!rlen) { + clear_highpage(page); + SetPageUptodate(page); + goto out; + } + + ret = _readpage(page, true); + if (ret) { + /*SetPageError was done by _readpage. 
Is it ok?*/ + unlock_page(page); + EXOFS_DBGMSG("__readpage failed\n"); + } + } +out: + if (unlikely(ret)) + _write_failed(mapping->host, pos + len); + + return ret; +} + +static int exofs_write_begin_export(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + + return exofs_write_begin(file, mapping, pos, len, flags, pagep, + fsdata); +} + +static int exofs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + /* According to comment in simple_write_end i_mutex is held */ + loff_t i_size = inode->i_size; + int ret; + + ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); + if (unlikely(ret)) + _write_failed(inode, pos + len); + + /* TODO: once simple_write_end marks inode dirty remove */ + if (i_size != inode->i_size) + mark_inode_dirty(inode); + return ret; +} + +static int exofs_releasepage(struct page *page, gfp_t gfp) +{ + EXOFS_DBGMSG("page 0x%lx\n", page->index); + WARN_ON(1); + return 0; +} + +static void exofs_invalidatepage(struct page *page, unsigned long offset) +{ + EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); + WARN_ON(1); +} + +const struct address_space_operations exofs_aops = { + .readpage = exofs_readpage, + .readpages = exofs_readpages, + .writepage = exofs_writepage, + .writepages = exofs_writepages, + .write_begin = exofs_write_begin_export, + .write_end = exofs_write_end, + .releasepage = exofs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .invalidatepage = exofs_invalidatepage, + + /* Not implemented Yet */ + .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ + .direct_IO = NULL, /* TODO: Should be trivial to do */ + + /* With these NULL has special meaning or default is not exported */ + .get_xip_mem = NULL, + .migratepage = NULL, + .launder_page = NULL, + .is_partially_uptodate = NULL, + .error_remove_page = NULL, +}; + +/****************************************************************************** + * INODE OPERATIONS + *****************************************************************************/ + +/* + * Test whether an inode is a fast symlink. + */ +static inline int exofs_inode_is_fast_symlink(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + + return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); +} + +const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); + +static int _do_truncate(struct inode *inode, loff_t newsize) +{ + struct exofs_i_info *oi = exofs_i(inode); + int ret; + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ret = exofs_oi_truncate(oi, (u64)newsize); + if (likely(!ret)) + truncate_setsize(inode, newsize); + + EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", + inode->i_ino, newsize, ret); + return ret; +} + +/* + * Set inode attributes - update size attribute on OSD if needed, + * otherwise just call generic functions. 
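+ * A size change is pushed to the OSD via _do_truncate()/exofs_oi_truncate(); all other attributes are applied in-core with setattr_copy() and mark_inode_dirty().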
+ */ +int exofs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + /* if we are about to modify an object, and it hasn't been + * created yet, wait + */ + error = wait_obj_created(exofs_i(inode)); + if (unlikely(error)) + return error; + + error = inode_change_ok(inode, iattr); + if (unlikely(error)) + return error; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + error = _do_truncate(inode, iattr->ia_size); + if (unlikely(error)) + return error; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + return 0; +} + +static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_FILE_LAYOUT, + 0); +static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_DIR_LAYOUT, + 0); + +/* + * Read the Linux inode info from the OSD, and return it as is. In exofs the + * inode info is in an application specific page/attribute of the osd-object. + */ +static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, + struct exofs_fcb *inode) +{ + struct exofs_sb_info *sbi = sb->s_fs_info; + struct osd_attr attrs[] = { + [0] = g_attr_inode_data, + [1] = g_attr_inode_file_layout, + [2] = g_attr_inode_dir_layout, + }; + struct exofs_io_state *ios; + struct exofs_on_disk_inode_layout *layout; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; + } + + ios->obj.id = exofs_oi_objno(oi); + exofs_make_credential(oi->i_cred, &ios->obj); + ios->cred = oi->i_cred; + + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_read(ios); + if (unlikely(ret)) { + EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", + _LLU(ios->obj.id), ret); + memset(inode, 0, sizeof(*inode)); + inode->i_mode = 0040000 | (0777 & ~022); + /* If object is lost on target we might as well enable it's + * delete. 
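+ * (-ENOENT and -EINVAL are therefore cleared below, so the zeroed placeholder inode can still be instantiated and later unlinked.)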
+ */ + if ((ret == -ENOENT) || (ret == -EINVAL)) + ret = 0; + goto out; + } + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); + memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); + + ret = extract_attr_from_ios(ios, &attrs[1]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[1].len) { + layout = attrs[1].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported files layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + + ret = extract_attr_from_ios(ios, &attrs[2]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[2].len) { + layout = attrs[2].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported meta-data layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + +out: + exofs_put_io_state(ios); + return ret; +} + +static void __oi_init(struct exofs_i_info *oi) +{ + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; +} +/* + * Fill in an inode read from the OSD and set it up for use + */ +struct inode *exofs_iget(struct super_block *sb, unsigned long ino) +{ + struct exofs_i_info *oi; + struct exofs_fcb fcb; + struct inode *inode; + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + oi = exofs_i(inode); + __oi_init(oi); + + /* read the inode from the osd */ + ret = exofs_get_inode(sb, oi, &fcb); + if (ret) + goto bad_inode; + + set_obj_created(oi); + + /* copy stuff from on-disk struct to in-memory struct */ + inode->i_mode = le16_to_cpu(fcb.i_mode); + inode->i_uid = le32_to_cpu(fcb.i_uid); + inode->i_gid = le32_to_cpu(fcb.i_gid); + inode->i_nlink = le16_to_cpu(fcb.i_links_count); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); + inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); + inode->i_ctime.tv_nsec = + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; + oi->i_commit_size = le64_to_cpu(fcb.i_size); + i_size_write(inode, oi->i_commit_size); + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_generation = le32_to_cpu(fcb.i_generation); + + oi->i_dir_start_lookup = 0; + + if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (fcb.i_data[0]) + inode->i_rdev = + old_decode_dev(le32_to_cpu(fcb.i_data[0])); + else + inode->i_rdev = + new_decode_dev(le32_to_cpu(fcb.i_data[1])); + } else { + memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); + } + + inode->i_mapping->backing_dev_info = sb->s_bdi; + if (S_ISREG(inode->i_mode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (exofs_inode_is_fast_symlink(inode)) + inode->i_op = &exofs_fast_symlink_inode_operations; + else { + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + } + } else { + inode->i_op = &exofs_special_inode_operations; + if (fcb.i_data[0]) 
+ init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(fcb.i_data[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(fcb.i_data[1]))); + } + + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +int __exofs_wait_obj_created(struct exofs_i_info *oi) +{ + if (!obj_created(oi)) { + EXOFS_DBGMSG("!obj_created\n"); + BUG_ON(!obj_2bcreated(oi)); + wait_event(oi->i_wq, obj_created(oi)); + EXOFS_DBGMSG("wait_event done\n"); + } + return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; +} + +/* + * Callback function from exofs_new_inode(). The important thing is that we + * set the obj_created flag so that other methods know that the object exists on + * the OSD. + */ +static void create_done(struct exofs_io_state *ios, void *p) +{ + struct inode *inode = p; + struct exofs_i_info *oi = exofs_i(inode); + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; + int ret; + + ret = exofs_check_io(ios, NULL); + exofs_put_io_state(ios); + + atomic_dec(&sbi->s_curr_pending); + + if (unlikely(ret)) { + EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", + _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); + /*TODO: When FS is corrupted creation can fail, object already + * exist. Get rid of this asynchronous creation, if exist + * increment the obj counter and try the next object. Until we + * succeed. All these dangling objects will be made into lost + * files by chkfs.exofs + */ + } + + set_obj_created(oi); + + wake_up(&oi->i_wq); +} + +/* + * Set up a new inode and create an object for it on the OSD + */ +struct inode *exofs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb; + struct inode *inode; + struct exofs_i_info *oi; + struct exofs_sb_info *sbi; + struct exofs_io_state *ios; + int ret; + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + oi = exofs_i(inode); + __oi_init(oi); + + set_obj_2bcreated(oi); + + sbi = sb->s_fs_info; + + inode->i_mapping->backing_dev_info = sb->s_bdi; + inode_init_owner(inode, dir, mode); + inode->i_ino = sbi->s_nextid++; + inode->i_blkbits = EXOFS_BLKSHIFT; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + oi->i_commit_size = inode->i_size = 0; + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + insert_inode_hash(inode); + + exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ + + mark_inode_dirty(inode); + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); + return ERR_PTR(ret); + } + + ios->obj.id = exofs_oi_objno(oi); + exofs_make_credential(oi->i_cred, &ios->obj); + + ios->done = create_done; + ios->private = inode; + ios->cred = oi->i_cred; + ret = exofs_sbi_create(ios); + if (ret) { + exofs_put_io_state(ios); + return ERR_PTR(ret); + } + atomic_inc(&sbi->s_curr_pending); + + return inode; +} + +/* + * struct to pass two arguments to update_inode's callback + */ +struct updatei_args { + struct exofs_sb_info *sbi; + struct exofs_fcb fcb; +}; + +/* + * Callback function from exofs_update_inode(). + */ +static void updatei_done(struct exofs_io_state *ios, void *p) +{ + struct updatei_args *args = p; + + exofs_put_io_state(ios); + + atomic_dec(&args->sbi->s_curr_pending); + + kfree(args); +} + +/* + * Write the inode to the OSD. 
Just fill up the struct, and set the attribute + * synchronously or asynchronously depending on the do_sync flag. + */ +static int exofs_update_inode(struct inode *inode, int do_sync) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct exofs_io_state *ios; + struct osd_attr attr; + struct exofs_fcb *fcb; + struct updatei_args *args; + int ret; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) { + EXOFS_DBGMSG("Failed kzalloc of args\n"); + return -ENOMEM; + } + + fcb = &args->fcb; + + fcb->i_mode = cpu_to_le16(inode->i_mode); + fcb->i_uid = cpu_to_le32(inode->i_uid); + fcb->i_gid = cpu_to_le32(inode->i_gid); + fcb->i_links_count = cpu_to_le16(inode->i_nlink); + fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + oi->i_commit_size = i_size_read(inode); + fcb->i_size = cpu_to_le64(oi->i_commit_size); + fcb->i_generation = cpu_to_le32(inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + fcb->i_data[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + fcb->i_data[1] = 0; + } else { + fcb->i_data[0] = 0; + fcb->i_data[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + fcb->i_data[2] = 0; + } + } else + memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + goto free_args; + } + + attr = g_attr_inode_data; + attr.val_ptr = fcb; + ios->out_attr_len = 1; + ios->out_attr = &attr; + + wait_obj_created(oi); + + if (!do_sync) { + args->sbi = sbi; + ios->done = updatei_done; + ios->private = args; + } + + ret = exofs_oi_write(oi, ios); + if (!do_sync && !ret) { + atomic_inc(&sbi->s_curr_pending); + goto out; /* deallocation in updatei_done */ + } + + exofs_put_io_state(ios); +free_args: + kfree(args); +out: + EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", + inode->i_ino, do_sync, ret); + return ret; +} + +int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */ + return exofs_update_inode(inode, 1); +} + +/* + * Callback function from exofs_delete_inode() - don't have much cleaning up to + * do. + */ +static void delete_done(struct exofs_io_state *ios, void *p) +{ + struct exofs_sb_info *sbi = p; + + exofs_put_io_state(ios); + + atomic_dec(&sbi->s_curr_pending); +} + +/* + * Called when the refcount of an inode reaches zero. We remove the object + * from the OSD here. We make sure the object was created before we try and + * delete it. + */ +void exofs_evict_inode(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + struct super_block *sb = inode->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct exofs_io_state *ios; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + + /* TODO: should do better here */ + if (inode->i_nlink || is_bad_inode(inode)) + goto no_delete; + + inode->i_size = 0; + end_writeback(inode); + + /* if we are deleting an obj that hasn't been created yet, wait. + * This also makes sure that create_done cannot be called with an + * already evicted inode. 
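+ * The asynchronous remove queued below bumps sbi->s_curr_pending; delete_done() drops it again when the OSD completes.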
+ */ + wait_obj_created(oi); + /* ignore the error, attempt a remove anyway */ + + /* Now Remove the OSD objects */ + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); + return; + } + + ios->obj.id = exofs_oi_objno(oi); + ios->done = delete_done; + ios->private = sbi; + ios->cred = oi->i_cred; + ret = exofs_sbi_remove(ios); + if (ret) { + EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); + exofs_put_io_state(ios); + return; + } + atomic_inc(&sbi->s_curr_pending); + + return; + +no_delete: + end_writeback(inode); +} diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c new file mode 100644 index 00000000..f74a2ec0 --- /dev/null +++ b/fs/exofs/ios.c @@ -0,0 +1,803 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include + +#include "exofs.h" + +#define EXOFS_DBGMSG2(M...) 
do {} while (0) +/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */ + +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length) +{ + struct osd_request *or = osd_start_request(od, GFP_KERNEL); +/* struct osd_sense_info osi = {.key = 0};*/ + int ret; + + if (unlikely(!or)) { + EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); + return -ENOMEM; + } + ret = osd_req_read_kern(or, obj, offset, p, length); + if (unlikely(ret)) { + EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); + goto out; + } + + ret = osd_finalize_request(or, 0, cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); + goto out; + } + + ret = osd_execute_request(or); + if (unlikely(ret)) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + +out: + osd_end_request(or); + return ret; +} + +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **pios) +{ + struct exofs_io_state *ios; + + /*TODO: Maybe use kmem_cach per sbi of size + * exofs_io_state_size(layout->s_numdevs) + */ + ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); + if (unlikely(!ios)) { + EXOFS_DBGMSG("Failed kzalloc bytes=%d\n", + exofs_io_state_size(layout->s_numdevs)); + *pios = NULL; + return -ENOMEM; + } + + ios->layout = layout; + ios->obj.partition = layout->s_pid; + *pios = ios; + return 0; +} + +void exofs_put_io_state(struct exofs_io_state *ios) +{ + if (ios) { + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; + + if (per_dev->or) + osd_end_request(per_dev->or); + if (per_dev->bio) + bio_put(per_dev->bio); + } + + kfree(ios); + } +} + +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index) +{ +/* switch (layout->lay_func) { + case LAYOUT_MOVING_WINDOW: + {*/ + unsigned dev_mod = obj_no; + + return (layout_index + dev_mod * layout->mirrors_p1) % + layout->s_numdevs; +/* } + case LAYOUT_FUNC_IMPLICT: + return layout->devs[layout_index]; + }*/ +} + +static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, + unsigned layout_index) +{ + return ios->layout->s_ods[ + exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; +} + +static void _sync_done(struct exofs_io_state *ios, void *p) +{ + struct completion *waiting = p; + + complete(waiting); +} + +static void _last_io(struct kref *kref) +{ + struct exofs_io_state *ios = container_of( + kref, struct exofs_io_state, kref); + + ios->done(ios, ios->private); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct exofs_io_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +static int exofs_io_execute(struct exofs_io_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + bool sync = (ios->done == NULL); + int i, ret; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + ret = osd_finalize_request(or, 0, ios->cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", + ret); + return ret; + } + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + 
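+ /* one reference per request submitted below; each _done_io() drops one, and the final put runs ios->done via _last_io() */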
kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + ret = 0; + + if (sync) { + wait_for_completion(&wait); + ret = exofs_check_io(ios, NULL); + } + return ret; +} + +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + __bio_for_each_segment(bv, bio, i, 0) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + +int exofs_check_io(struct exofs_io_state *ios, u64 *resid) +{ + enum osd_err_priority acumulated_osd_err = 0; + int acumulated_lin_err = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + struct osd_request *or = ios->per_dev[i].or; + int ret; + + if (unlikely(!or)) + continue; + + ret = osd_req_decode_sense(or, &osi); + if (likely(!ret)) + continue; + + if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { + /* start read offset passed endof file */ + _clear_bio(ios->per_dev[i].bio); + EXOFS_DBGMSG("start read offset passed end of file " + "offset=0x%llx, length=0x%llx\n", + _LLU(ios->per_dev[i].offset), + _LLU(ios->per_dev[i].length)); + + continue; /* we recovered */ + } + + if (osi.osd_err_pri >= acumulated_osd_err) { + acumulated_osd_err = osi.osd_err_pri; + acumulated_lin_err = ret; + } + } + + /* TODO: raid specific residual calculations */ + if (resid) { + if (likely(!acumulated_lin_err)) + *resid = 0; + else + *resid = ios->length; + } + + return acumulated_lin_err; +} + +/* + * L - logical offset into the file + * + * U - The number of bytes in a stripe within a group + * + * U = stripe_unit * group_width + * + * T - The number of bytes striped within a group of component objects + * (before advancing to the next group) + * + * T = stripe_unit * group_width * group_depth + * + * S - The number of bytes striped across all component objects + * before the pattern repeats + * + * S = stripe_unit * group_width * group_depth * group_count + * + * M - The "major" (i.e., across all components) stripe number + * + * M = L / S + * + * G - Counts the groups from the beginning of the major stripe + * + * G = (L - (M * S)) / T [or (L % S) / T] + * + * H - The byte offset within the group + * + * H = (L - (M * S)) % T [or (L % S) % T] + * + * N - The "minor" (i.e., across the group) stripe number + * + * N = H / U + * + * C - The component index coresponding to L + * + * C = (H - (N * U)) / stripe_unit + G * group_width + * [or (L % U) / stripe_unit + G * group_width] + * + * O - The component offset coresponding to L + * + * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + */ +struct _striping_info { + u64 obj_offset; + u64 group_length; + unsigned dev; + unsigned unit_off; +}; + +static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, + struct _striping_info *si) +{ + u32 stripe_unit = ios->layout->stripe_unit; + u32 group_width = ios->layout->group_width; + u64 group_depth = ios->layout->group_depth; + + u32 U = stripe_unit * group_width; + u64 T = U * group_depth; + u64 S = T * ios->layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodS = file_offset - M * S; + u32 G = div64_u64(LmodS, T); + u64 H = LmodS - G * T; + + u32 N = div_u64(H, U); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; + si->dev *= ios->layout->mirrors_p1; + + div_u64_rem(file_offset, 
stripe_unit, &si->unit_off); + + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + si->group_length = T - H; +} + +static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct exofs_per_dev_state *per_dev, + int cur_len) +{ + unsigned pg = *cur_pg; + struct request_queue *q = + osd_request_queue(exofs_ios_od(ios, per_dev->dev)); + + per_dev->length += cur_len; + + if (per_dev->bio == NULL) { + unsigned pages_in_stripe = ios->layout->group_width * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned bio_size = (ios->nr_pages + pages_in_stripe) / + ios->layout->group_width; + + per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); + if (unlikely(!per_dev->bio)) { + EXOFS_DBGMSG("Failed to allocate BIO size=%u\n", + bio_size); + return -ENOMEM; + } + } + + while (cur_len > 0) { + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; + + BUG_ON(ios->nr_pages <= pg); + cur_len -= pglen; + + added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + pglen, pgbase); + if (unlikely(pglen != added_len)) + return -ENOMEM; + pgbase = 0; + ++pg; + } + BUG_ON(cur_len); + + *cur_pg = pg; + return 0; +} + +static int _prepare_one_group(struct exofs_io_state *ios, u64 length, + struct _striping_info *si) +{ + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned dev = si->dev; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned cur_pg = ios->pages_consumed; + int ret = 0; + + while (length) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; + unsigned cur_len, page_off = 0; + + if (!per_dev->length) { + per_dev->dev = dev; + if (dev < si->dev) { + per_dev->offset = si->obj_offset + stripe_unit - + si->unit_off; + cur_len = stripe_unit; + } else if (dev == si->dev) { + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && (page_off != ios->pgbase)); + } else { /* dev > si->dev */ + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } + + if (max_comp < dev) + max_comp = dev; + } else { + cur_len = stripe_unit; + } + if (cur_len >= length) + cur_len = length; + + ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, + cur_len); + if (unlikely(ret)) + goto out; + + dev += mirrors_p1; + dev = (dev % devs_in_group) + first_dev; + + length -= cur_len; + } +out: + ios->numdevs = max_comp + mirrors_p1; + ios->pages_consumed = cur_pg; + return ret; +} + +static int _prepare_for_striping(struct exofs_io_state *ios) +{ + u64 length = ios->length; + u64 offset = ios->offset; + struct _striping_info si; + int ret = 0; + + if (!ios->pages) { + if (ios->kern_buff) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + + _calc_stripe_info(ios, ios->offset, &si); + per_dev->offset = si.obj_offset; + per_dev->dev = si.dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (si.unit_off + ios->length > + ios->layout->stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + while (length) { + _calc_stripe_info(ios, offset, &si); + + if (length < si.group_length) + si.group_length = length; + + ret = _prepare_one_group(ios, si.group_length, &si); + if (unlikely(ret)) + goto out; + + offset += si.group_length; + length -= 
si.group_length; + } + +out: + return ret; +} + +int exofs_sbi_create(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->layout->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_create_object(or, &ios->obj); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int exofs_sbi_remove(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->layout->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_remove_object(or, &ios->obj); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) +{ + struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret = 0; + + if (ios->pages && !master_dev->length) + return 0; /* Just an empty slot */ + + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + per_dev->or = or; + per_dev->offset = master_dev->offset; + + if (ios->pages) { + struct bio *bio; + + if (per_dev != master_dev) { + bio = bio_kmalloc(GFP_KERNEL, + master_dev->bio->bi_max_vecs); + if (unlikely(!bio)) { + EXOFS_DBGMSG( + "Failed to allocate BIO size=%u\n", + master_dev->bio->bi_max_vecs); + ret = -ENOMEM; + goto out; + } + + __bio_clone(bio, master_dev->bio); + bio->bi_bdev = NULL; + bio->bi_next = NULL; + per_dev->length = master_dev->length; + per_dev->bio = bio; + per_dev->dev = dev; + } else { + bio = master_dev->bio; + /* FIXME: bio_set_dir() */ + bio->bi_rw |= REQ_WRITE; + } + + osd_req_write(or, &ios->obj, per_dev->offset, bio, + per_dev->length); + EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(per_dev->length), dev); + } else if (ios->kern_buff) { + ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, + ios->kern_buff, ios->length); + if (unlikely(ret)) + goto out; + EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(ios->length), dev); + } else { + osd_req_set_attributes(or, &ios->obj); + EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->out_attr_len, dev); + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, + ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, + ios->in_attr_len); + } + +out: + return ret; +} + +int exofs_sbi_write(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_write_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; +} + +static int _sbi_read_mirror(struct exofs_io_state *ios, 
unsigned cur_comp) +{ + struct osd_request *or; + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + unsigned first_dev = (unsigned)ios->obj.id; + + if (ios->pages && !per_dev->length) + return 0; /* Just an empty slot */ + + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; + or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + if (ios->pages) { + osd_req_read(or, &ios->obj, per_dev->offset, + per_dev->bio, per_dev->length); + EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d\n", _LLU(ios->obj.id), + _LLU(per_dev->offset), _LLU(per_dev->length), + first_dev); + } else if (ios->kern_buff) { + int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, + ios->kern_buff, ios->length); + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d ret=>%d\n", + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(ios->length), first_dev, ret); + if (unlikely(ret)) + return ret; + } else { + osd_req_get_attributes(or, &ios->obj); + EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->in_attr_len, first_dev); + } + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); + + return 0; +} + +int exofs_sbi_read(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_read_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; +} + +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(ios->per_dev[0].or, + &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} + +static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, + struct osd_attr *attr) +{ + int last_comp = cur_comp + ios->layout->mirrors_p1; + + for (; cur_comp < last_comp; ++cur_comp) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + osd_req_set_attributes(or, &ios->obj); + osd_req_add_set_attr_list(or, attr, 1); + } + + return 0; +} + +int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) +{ + struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; + struct exofs_io_state *ios; + struct exofs_trunc_attr { + struct osd_attr attr; + __be64 newsize; + } *size_attrs; + struct _striping_info si; + int i, ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) + return ret; + + size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), + GFP_KERNEL); + if (unlikely(!size_attrs)) { + ret = -ENOMEM; + goto out; + } + + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + + ios->numdevs = ios->layout->s_numdevs; + _calc_stripe_info(ios, size, &si); + + for (i = 0; i < 
ios->layout->group_width; ++i) { + struct exofs_trunc_attr *size_attr = &size_attrs[i]; + u64 obj_size; + + if (i < si.dev) + obj_size = si.obj_offset + + ios->layout->stripe_unit - si.unit_off; + else if (i == si.dev) + obj_size = si.obj_offset; + else /* i > si.dev */ + obj_size = si.obj_offset - si.unit_off; + + size_attr->newsize = cpu_to_be64(obj_size); + size_attr->attr = g_attr_logical_length; + size_attr->attr.val_ptr = &size_attr->newsize; + + ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, + &size_attr->attr); + if (unlikely(ret)) + goto out; + } + ret = exofs_io_execute(ios); + +out: + kfree(size_attrs); + exofs_put_io_state(ios); + return ret; +} diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c new file mode 100644 index 00000000..4d70db11 --- /dev/null +++ b/fs/exofs/namei.c @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "exofs.h" + +static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = exofs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > EXOFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = exofs_inode_by_name(dir, dentry); + inode = NULL; + if (ino) { + inode = exofs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode = exofs_new_inode(dir, mode); + int err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &exofs_file_inode_operations; + inode->i_fop = &exofs_file_operations; + inode->i_mapping->a_ops = &exofs_aops; + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + inode = exofs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + mark_inode_dirty(inode); + err = exofs_add_nondir(dentry, inode); + } + return err; +} + +static int exofs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct super_block *sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode *inode; + struct exofs_i_info *oi; + + if (l > sb->s_blocksize) + goto out; + + inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + oi = exofs_i(inode); + if (l > sizeof(oi->i_data)) { + /* slow symlink */ + inode->i_op = &exofs_symlink_inode_operations; + inode->i_mapping->a_ops = &exofs_aops; + memset(oi->i_data, 0, sizeof(oi->i_data)); + + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &exofs_fast_symlink_inode_operations; + memcpy(oi->i_data, symname, l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = exofs_add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int exofs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= EXOFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + ihold(inode); + + return exofs_add_nondir(dentry, inode); +} + +static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + int err = -EMLINK; + + if (dir->i_nlink >= EXOFS_LINK_MAX) + goto out; + + inode_inc_link_count(dir); + + inode = exofs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &exofs_dir_inode_operations; + inode->i_fop = &exofs_dir_operations; + inode->i_mapping->a_ops = &exofs_aops; + + inode_inc_link_count(inode); + + 
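+ /* the new directory needs nlink == 2 (its name plus '.'); the parent was already bumped above for the '..' it will contain */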
err = exofs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = exofs_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int exofs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct exofs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + de = exofs_find_entry(dir, dentry, &page); + if (!de) + goto out; + + err = exofs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int exofs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (exofs_empty_dir(inode)) { + err = exofs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct exofs_dir_entry *dir_de = NULL; + struct page *old_page; + struct exofs_dir_entry *old_de; + int err = -ENOENT; + + old_de = exofs_find_entry(old_dir, old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = exofs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct exofs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !exofs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = exofs_find_entry(new_dir, new_dentry, &new_page); + if (!new_de) + goto out_dir; + err = exofs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + if (err) + goto out_dir; + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= EXOFS_LINK_MAX) + goto out_dir; + } + err = exofs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + old_inode->i_ctime = CURRENT_TIME; + + exofs_delete_entry(old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + if (err) + goto out_dir; + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations exofs_dir_inode_operations = { + .create = exofs_create, + .lookup = exofs_lookup, + .link = exofs_link, + .unlink = exofs_unlink, + .symlink = exofs_symlink, + .mkdir = exofs_mkdir, + .rmdir = exofs_rmdir, + .mknod = exofs_mknod, + .rename = exofs_rename, + .setattr = exofs_setattr, +}; + +const struct inode_operations exofs_special_inode_operations = { + .setattr = exofs_setattr, +}; diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h new file mode 100644 index 00000000..c52e9888 --- /dev/null +++ b/fs/exofs/pnfs.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * This file is part of exofs. 
+ *
+ * exofs is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License version 2 as published by the Free
+ * Software Foundation.
+ *
+ */
+
+/* FIXME: Remove this file once pnfs hits mainline */
+
+#ifndef __EXOFS_PNFS_H__
+#define __EXOFS_PNFS_H__
+
+#if ! defined(__PNFS_OSD_XDR_H__)
+
+enum pnfs_iomode {
+	IOMODE_READ = 1,
+	IOMODE_RW = 2,
+	IOMODE_ANY = 3,
+};
+
+/* Layout Structure */
+enum pnfs_osd_raid_algorithm4 {
+	PNFS_OSD_RAID_0 = 1,
+	PNFS_OSD_RAID_4 = 2,
+	PNFS_OSD_RAID_5 = 3,
+	PNFS_OSD_RAID_PQ = 4	/* Reed-Solomon P+Q */
+};
+
+struct pnfs_osd_data_map {
+	u32 odm_num_comps;
+	u64 odm_stripe_unit;
+	u32 odm_group_width;
+	u32 odm_group_depth;
+	u32 odm_mirror_cnt;
+	u32 odm_raid_algorithm;
+};
+
+#endif /* ! defined(__PNFS_OSD_XDR_H__) */
+
+#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 00000000..c57beddc
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,1001 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/exportfs.h>
+#include <linux/slab.h>
+
+#include "exofs.h"
+
+/******************************************************************************
+ * MOUNT OPTIONS
+ *****************************************************************************/
+
+/*
+ * struct to hold what we get from mount options
+ */
+struct exofs_mountopt {
+	bool is_osdname;
+	const char *dev_name;
+	uint64_t pid;
+	int timeout;
+};
+
+/*
+ * exofs-specific mount-time options.
+ */
+enum { Opt_name, Opt_pid, Opt_to, Opt_err };
+
+/*
+ * Our mount-time options.  These should ideally be 64-bit unsigned, but the
+ * kernel's parsing functions do not currently support that.  32-bit should be
+ * sufficient for most applications now.
+ */
+static match_table_t tokens = {
+	{Opt_name, "osdname=%s"},
+	{Opt_pid, "pid=%u"},
+	{Opt_to, "to=%u"},
+	{Opt_err, NULL}
+};
+
+/*
+ * The main option parsing method.  Also makes sure that all of the mandatory
+ * mount options were set.
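+ *
+ * For illustration only (the device path and option values below are
+ * hypothetical, not taken from this code), a mount using these options
+ * could look like:
+ *   mount -t exofs -o pid=65536,to=20 /dev/osd0 /mnt/exofs
+ * where "osdname=<name>" may be given instead of an OSD device path.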
+ */ +static int parse_options(char *options, struct exofs_mountopt *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + bool s_pid = false; + + EXOFS_DBGMSG("parse_options %s\n", options); + /* defaults */ + memset(opts, 0, sizeof(*opts)); + opts->timeout = BLK_DEFAULT_SG_TIMEOUT; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + char str[32]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_name: + opts->dev_name = match_strdup(&args[0]); + if (unlikely(!opts->dev_name)) { + EXOFS_ERR("Error allocating dev_name"); + return -ENOMEM; + } + opts->is_osdname = true; + break; + case Opt_pid: + if (0 == match_strlcpy(str, &args[0], sizeof(str))) + return -EINVAL; + opts->pid = simple_strtoull(str, NULL, 0); + if (opts->pid < EXOFS_MIN_PID) { + EXOFS_ERR("Partition ID must be >= %u", + EXOFS_MIN_PID); + return -EINVAL; + } + s_pid = 1; + break; + case Opt_to: + if (match_int(&args[0], &option)) + return -EINVAL; + if (option <= 0) { + EXOFS_ERR("Timout must be > 0"); + return -EINVAL; + } + opts->timeout = option * HZ; + break; + } + } + + if (!s_pid) { + EXOFS_ERR("Need to specify the following options:\n"); + EXOFS_ERR(" -o pid=pid_no_to_use\n"); + return -EINVAL; + } + + return 0; +} + +/****************************************************************************** + * INODE CACHE + *****************************************************************************/ + +/* + * Our inode cache. Isn't it pretty? + */ +static struct kmem_cache *exofs_inode_cachep; + +/* + * Allocate an inode in the cache + */ +static struct inode *exofs_alloc_inode(struct super_block *sb) +{ + struct exofs_i_info *oi; + + oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL); + if (!oi) + return NULL; + + oi->vfs_inode.i_version = 1; + return &oi->vfs_inode; +} + +static void exofs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); +} + +/* + * Remove an inode from the cache + */ +static void exofs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, exofs_i_callback); +} + +/* + * Initialize the inode + */ +static void exofs_init_once(void *foo) +{ + struct exofs_i_info *oi = foo; + + inode_init_once(&oi->vfs_inode); +} + +/* + * Create and initialize the inode cache + */ +static int init_inodecache(void) +{ + exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", + sizeof(struct exofs_i_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + exofs_init_once); + if (exofs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +/* + * Destroy the inode cache + */ +static void destroy_inodecache(void) +{ + kmem_cache_destroy(exofs_inode_cachep); +} + +/****************************************************************************** + * SUPERBLOCK FUNCTIONS + *****************************************************************************/ +static const struct super_operations exofs_sops; +static const struct export_operations exofs_export_ops; + +static const struct osd_attr g_attr_sb_stats = ATTR_DEF( + EXOFS_APAGE_SB_DATA, + EXOFS_ATTR_SB_STATS, + sizeof(struct exofs_sb_stats)); + +static int __sbi_read_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct exofs_io_state *ios; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; + } + 
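+	/* read back the single sb-stats attribute using the superblock credentials */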
+ ios->cred = sbi->s_cred; + + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_read(ios); + if (unlikely(ret)) { + EXOFS_ERR("Error reading super_block stats => %d\n", ret); + goto out; + } + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (ret) { + EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__); + goto out; + } + if (attrs[0].len) { + struct exofs_sb_stats *ess; + + if (unlikely(attrs[0].len != sizeof(*ess))) { + EXOFS_ERR("%s: Wrong version of exofs_sb_stats " + "size(%d) != expected(%zd)\n", + __func__, attrs[0].len, sizeof(*ess)); + goto out; + } + + ess = attrs[0].val_ptr; + sbi->s_nextid = le64_to_cpu(ess->s_nextid); + sbi->s_numfiles = le32_to_cpu(ess->s_numfiles); + } + +out: + exofs_put_io_state(ios); + return ret; +} + +static void stats_done(struct exofs_io_state *ios, void *p) +{ + exofs_put_io_state(ios); + /* Good thanks nothing to do anymore */ +} + +/* Asynchronously write the stats attribute */ +int exofs_sbi_write_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct exofs_io_state *ios; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; + } + + sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid); + sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); + attrs[0].val_ptr = &sbi->s_ess; + + ios->cred = sbi->s_cred; + ios->done = stats_done; + ios->private = sbi; + ios->out_attr = attrs; + ios->out_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_write(ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); + exofs_put_io_state(ios); + } + + return ret; +} + +/* + * Write the superblock to the OSD + */ +int exofs_sync_fs(struct super_block *sb, int wait) +{ + struct exofs_sb_info *sbi; + struct exofs_fscb *fscb; + struct exofs_io_state *ios; + int ret = -ENOMEM; + + fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); + if (unlikely(!fscb)) + return -ENOMEM; + + sbi = sb->s_fs_info; + + /* NOTE: We no longer dirty the super_block anywhere in exofs. The + * reason we write the fscb here on unmount is so we can stay backwards + * compatible with fscb->s_version == 1. (What we are not compatible + * with is if a new version FS crashed and then we try to mount an old + * version). Otherwise the exofs_fscb is read-only from mkfs time. All + * the writeable info is set in exofs_sbi_write_stats() above. 
+ */ + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) + goto out; + + lock_super(sb); + + ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); + memset(fscb, 0, ios->length); + fscb->s_nextid = cpu_to_le64(sbi->s_nextid); + fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); + fscb->s_magic = cpu_to_le16(sb->s_magic); + fscb->s_newfs = 0; + fscb->s_version = EXOFS_FSCB_VER; + + ios->obj.id = EXOFS_SUPER_ID; + ios->offset = 0; + ios->kern_buff = fscb; + ios->cred = sbi->s_cred; + + ret = exofs_sbi_write(ios); + if (unlikely(ret)) + EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); + else + sb->s_dirt = 0; + + + unlock_super(sb); +out: + EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); + exofs_put_io_state(ios); + kfree(fscb); + return ret; +} + +static void exofs_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + exofs_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static void _exofs_print_device(const char *msg, const char *dev_path, + struct osd_dev *od, u64 pid) +{ + const struct osd_dev_info *odi = osduld_device_info(od); + + printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n", + msg, dev_path ?: "", odi->osdname, _LLU(pid)); +} + +void exofs_free_sbi(struct exofs_sb_info *sbi) +{ + while (sbi->layout.s_numdevs) { + int i = --sbi->layout.s_numdevs; + struct osd_dev *od = sbi->layout.s_ods[i]; + + if (od) { + sbi->layout.s_ods[i] = NULL; + osduld_put_device(od); + } + } + kfree(sbi); +} + +/* + * This function is called when the vfs is freeing the superblock. We just + * need to free our own part. + */ +static void exofs_put_super(struct super_block *sb) +{ + int num_pend; + struct exofs_sb_info *sbi = sb->s_fs_info; + + /* make sure there are no pending commands */ + for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; + num_pend = atomic_read(&sbi->s_curr_pending)) { + wait_queue_head_t wq; + + printk(KERN_NOTICE "%s: !!Pending operations in flight. " + "This is a BUG. please report to osd-dev@open-osd.org\n", + __func__); + init_waitqueue_head(&wq); + wait_event_timeout(wq, + (atomic_read(&sbi->s_curr_pending) == 0), + msecs_to_jiffies(100)); + } + + _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], + sbi->layout.s_pid); + + bdi_destroy(&sbi->bdi); + exofs_free_sbi(sbi); + sb->s_fs_info = NULL; +} + +static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_device_table *dt) +{ + u64 stripe_length; + + sbi->data_map.odm_num_comps = + le32_to_cpu(dt->dt_data_map.cb_num_comps); + sbi->data_map.odm_stripe_unit = + le64_to_cpu(dt->dt_data_map.cb_stripe_unit); + sbi->data_map.odm_group_width = + le32_to_cpu(dt->dt_data_map.cb_group_width); + sbi->data_map.odm_group_depth = + le32_to_cpu(dt->dt_data_map.cb_group_depth); + sbi->data_map.odm_mirror_cnt = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); + sbi->data_map.odm_raid_algorithm = + le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); + +/* FIXME: Only raid0 for now. 
if not so, do not mount */ + if (sbi->data_map.odm_num_comps != numdevs) { + EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", + sbi->data_map.odm_num_comps, numdevs); + return -EINVAL; + } + if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { + EXOFS_ERR("Only RAID_0 for now\n"); + return -EINVAL; + } + if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { + EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", + numdevs, sbi->data_map.odm_mirror_cnt); + return -EINVAL; + } + + if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { + EXOFS_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); + return -EINVAL; + } + + sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; + sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; + + if (sbi->data_map.odm_group_width) { + sbi->layout.group_width = sbi->data_map.odm_group_width; + sbi->layout.group_depth = sbi->data_map.odm_group_depth; + if (!sbi->layout.group_depth) { + EXOFS_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + sbi->layout.group_count = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1 / + sbi->data_map.odm_group_width; + } else { + if (sbi->data_map.odm_group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %d\n", + sbi->data_map.odm_group_depth); + sbi->data_map.odm_group_depth = 0; + } + sbi->layout.group_width = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; + } + + stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; + if (stripe_length >= (1ULL << 32)) { + EXOFS_ERR("Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -EINVAL; + } + + return 0; +} + +static unsigned __ra_pages(struct exofs_layout *layout) +{ + const unsigned _MIN_RA = 32; /* min 128K read-ahead */ + unsigned ra_pages = layout->group_width * layout->stripe_unit / + PAGE_SIZE; + unsigned max_io_pages = exofs_max_io_pages(layout, ~0); + + ra_pages *= 2; /* two stripes */ + if (ra_pages < _MIN_RA) + ra_pages = roundup(_MIN_RA, ra_pages / 2); + + if (ra_pages > max_io_pages) + ra_pages = max_io_pages; + + return ra_pages; +} + +/* @odi is valid only as long as @fscb_dev is valid */ +static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, + struct osd_dev_info *odi) +{ + odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); + memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len); + + odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); + odi->osdname = dt_dev->osdname; + + /* FIXME support long names. Will need a _put function */ + if (dt_dev->long_name_offset) + return -EINVAL; + + /* Make sure osdname is printable! + * mkexofs should give us space for a null-terminator else the + * device-table is invalid. 
+ */ + if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname))) + odi->osdname_len = sizeof(dt_dev->osdname) - 1; + dt_dev->osdname[odi->osdname_len] = 0; + + /* If it's all zeros something is bad we read past end-of-obj */ + return !(odi->systemid_len || odi->osdname_len); +} + +static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, + unsigned table_count) +{ + struct exofs_sb_info *sbi = *psbi; + struct osd_dev *fscb_od; + struct osd_obj_id obj = {.partition = sbi->layout.s_pid, + .id = EXOFS_DEVTABLE_ID}; + struct exofs_device_table *dt; + unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + + sizeof(*dt); + unsigned numdevs, i; + int ret; + + dt = kmalloc(table_bytes, GFP_KERNEL); + if (unlikely(!dt)) { + EXOFS_ERR("ERROR: allocating %x bytes for device table\n", + table_bytes); + return -ENOMEM; + } + + fscb_od = sbi->layout.s_ods[0]; + sbi->layout.s_ods[0] = NULL; + sbi->layout.s_numdevs = 0; + ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: reading device table\n"); + goto out; + } + + numdevs = le64_to_cpu(dt->dt_num_devices); + if (unlikely(!numdevs)) { + ret = -EINVAL; + goto out; + } + WARN_ON(table_count != numdevs); + + ret = _read_and_match_data_map(sbi, numdevs, dt); + if (unlikely(ret)) + goto out; + + if (likely(numdevs > 1)) { + unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); + + sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); + if (unlikely(!sbi)) { + ret = -ENOMEM; + goto out; + } + memset(&sbi->layout.s_ods[1], 0, + size - sizeof(sbi->layout.s_ods[0])); + *psbi = sbi; + } + + for (i = 0; i < numdevs; i++) { + struct exofs_fscb fscb; + struct osd_dev_info odi; + struct osd_dev *od; + + if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) { + EXOFS_ERR("ERROR: Read all-zeros device entry\n"); + ret = -EINVAL; + goto out; + } + + printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", + i, odi.osdname); + + /* On all devices the device table is identical. The user can + * specify any one of the participating devices on the command + * line. We always keep them in device-table order. + */ + if (fscb_od && osduld_device_same(fscb_od, &odi)) { + sbi->layout.s_ods[i] = fscb_od; + ++sbi->layout.s_numdevs; + fscb_od = NULL; + continue; + } + + od = osduld_info_lookup(&odi); + if (IS_ERR(od)) { + ret = PTR_ERR(od); + EXOFS_ERR("ERROR: device requested is not found " + "osd_name-%s =>%d\n", odi.osdname, ret); + goto out; + } + + sbi->layout.s_ods[i] = od; + ++sbi->layout.s_numdevs; + + /* Read the fscb of the other devices to make sure the FS + * partition is there. + */ + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, + sizeof(fscb)); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: Malformed participating device " + "error reading fscb osd_name-%s\n", + odi.osdname); + goto out; + } + + /* TODO: verify other information is correct and FS-uuid + * matches. Benny what did you say about device table + * generation and old devices? 
+ */ + } + +out: + kfree(dt); + if (unlikely(!ret && fscb_od)) { + EXOFS_ERR( + "ERROR: Bad device-table container device not present\n"); + osduld_put_device(fscb_od); + ret = -EINVAL; + } + + return ret; +} + +/* + * Read the superblock from the OSD and fill in the fields + */ +static int exofs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *root; + struct exofs_mountopt *opts = data; + struct exofs_sb_info *sbi; /*extended info */ + struct osd_dev *od; /* Master device */ + struct exofs_fscb fscb; /*on-disk superblock info */ + struct osd_obj_id obj; + unsigned table_count; + int ret; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + if (ret) + goto free_bdi; + + /* use mount options to fill superblock */ + if (opts->is_osdname) { + struct osd_dev_info odi = {.systemid_len = 0}; + + odi.osdname_len = strlen(opts->dev_name); + odi.osdname = (u8 *)opts->dev_name; + od = osduld_info_lookup(&odi); + } else { + od = osduld_path_lookup(opts->dev_name); + } + if (IS_ERR(od)) { + ret = -EINVAL; + goto free_sbi; + } + + /* Default layout in case we do not have a device-table */ + sbi->layout.stripe_unit = PAGE_SIZE; + sbi->layout.mirrors_p1 = 1; + sbi->layout.group_width = 1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; + sbi->layout.s_ods[0] = od; + sbi->layout.s_numdevs = 1; + sbi->layout.s_pid = opts->pid; + sbi->s_timeout = opts->timeout; + + /* fill in some other data by hand */ + memset(sb->s_id, 0, sizeof(sb->s_id)); + strcpy(sb->s_id, "exofs"); + sb->s_blocksize = EXOFS_BLKSIZE; + sb->s_blocksize_bits = EXOFS_BLKSHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + atomic_set(&sbi->s_curr_pending, 0); + sb->s_bdev = NULL; + sb->s_dev = 0; + + obj.partition = sbi->layout.s_pid; + obj.id = EXOFS_SUPER_ID; + exofs_make_credential(sbi->s_cred, &obj); + + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); + if (unlikely(ret)) + goto free_sbi; + + sb->s_magic = le16_to_cpu(fscb.s_magic); + /* NOTE: we read below to be backward compatible with old versions */ + sbi->s_nextid = le64_to_cpu(fscb.s_nextid); + sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); + + /* make sure what we read from the object store is correct */ + if (sb->s_magic != EXOFS_SUPER_MAGIC) { + if (!silent) + EXOFS_ERR("ERROR: Bad magic value\n"); + ret = -EINVAL; + goto free_sbi; + } + if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) { + EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", + EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); + ret = -EINVAL; + goto free_sbi; + } + + /* start generation numbers from a random point */ + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + table_count = le64_to_cpu(fscb.s_dev_table_count); + if (table_count) { + ret = exofs_read_lookup_dev_table(&sbi, table_count); + if (unlikely(ret)) + goto free_sbi; + } + + __sbi_read_stats(sbi); + + /* set up operation vectors */ + sbi->bdi.ra_pages = __ra_pages(&sbi->layout); + sb->s_bdi = &sbi->bdi; + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); + ret = PTR_ERR(root); + goto free_sbi; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + EXOFS_ERR("ERROR: get root inode failed\n"); + ret = -ENOMEM; + goto free_sbi; + } + + if (!S_ISDIR(root->i_mode)) { + 
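+		/* the root object must be a directory; anything else means the FS is corrupt */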
dput(sb->s_root); + sb->s_root = NULL; + EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n", + root->i_mode); + ret = -EINVAL; + goto free_sbi; + } + + _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], + sbi->layout.s_pid); + if (opts->is_osdname) + kfree(opts->dev_name); + return 0; + +free_sbi: + bdi_destroy(&sbi->bdi); +free_bdi: + EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", + opts->dev_name, sbi->layout.s_pid, ret); + exofs_free_sbi(sbi); + if (opts->is_osdname) + kfree(opts->dev_name); + return ret; +} + +/* + * Set up the superblock (calls exofs_fill_super eventually) + */ +static struct dentry *exofs_mount(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + struct exofs_mountopt opts; + int ret; + + ret = parse_options(data, &opts); + if (ret) + return ERR_PTR(ret); + + if (!opts.dev_name) + opts.dev_name = dev_name; + return mount_nodev(type, flags, &opts, exofs_fill_super); +} + +/* + * Return information about the file system state in the buffer. This is used + * by the 'df' command, for example. + */ +static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; + struct exofs_io_state *ios; + struct osd_attr attrs[] = { + ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, + OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), + ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION, + OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)), + }; + uint64_t capacity = ULLONG_MAX; + uint64_t used = ULLONG_MAX; + uint8_t cred_a[OSD_CAP_LEN]; + int ret; + + ret = exofs_get_io_state(&sbi->layout, &ios); + if (ret) { + EXOFS_DBGMSG("exofs_get_io_state failed.\n"); + return ret; + } + + exofs_make_credential(cred_a, &ios->obj); + ios->cred = sbi->s_cred; + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_read(ios); + if (unlikely(ret)) + goto out; + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (likely(!ret)) { + capacity = get_unaligned_be64(attrs[0].val_ptr); + if (unlikely(!capacity)) + capacity = ULLONG_MAX; + } else + EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); + + ret = extract_attr_from_ios(ios, &attrs[1]); + if (likely(!ret)) + used = get_unaligned_be64(attrs[1].val_ptr); + else + EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n"); + + /* fill in the stats buffer */ + buf->f_type = EXOFS_SUPER_MAGIC; + buf->f_bsize = EXOFS_BLKSIZE; + buf->f_blocks = capacity >> 9; + buf->f_bfree = (capacity - used) >> 9; + buf->f_bavail = buf->f_bfree; + buf->f_files = sbi->s_numfiles; + buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; + buf->f_namelen = EXOFS_NAME_LEN; + +out: + exofs_put_io_state(ios); + return ret; +} + +static const struct super_operations exofs_sops = { + .alloc_inode = exofs_alloc_inode, + .destroy_inode = exofs_destroy_inode, + .write_inode = exofs_write_inode, + .evict_inode = exofs_evict_inode, + .put_super = exofs_put_super, + .write_super = exofs_write_super, + .sync_fs = exofs_sync_fs, + .statfs = exofs_statfs, +}; + +/****************************************************************************** + * EXPORT OPERATIONS + *****************************************************************************/ + +struct dentry *exofs_get_parent(struct dentry *child) +{ + unsigned long ino = exofs_parent_ino(child); + + if (!ino) + return ERR_PTR(-ESTALE); + + return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); +} + +static struct inode *exofs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 
generation) +{ + struct inode *inode; + + inode = exofs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *exofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static struct dentry *exofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + exofs_nfs_get_inode); +} + +static const struct export_operations exofs_export_ops = { + .fh_to_dentry = exofs_fh_to_dentry, + .fh_to_parent = exofs_fh_to_parent, + .get_parent = exofs_get_parent, +}; + +/****************************************************************************** + * INSMOD/RMMOD + *****************************************************************************/ + +/* + * struct that describes this file system + */ +static struct file_system_type exofs_type = { + .owner = THIS_MODULE, + .name = "exofs", + .mount = exofs_mount, + .kill_sb = generic_shutdown_super, +}; + +static int __init init_exofs(void) +{ + int err; + + err = init_inodecache(); + if (err) + goto out; + + err = register_filesystem(&exofs_type); + if (err) + goto out_d; + + return 0; +out_d: + destroy_inodecache(); +out: + return err; +} + +static void __exit exit_exofs(void) +{ + unregister_filesystem(&exofs_type); + destroy_inodecache(); +} + +MODULE_AUTHOR("Avishay Traeger "); +MODULE_DESCRIPTION("exofs"); +MODULE_LICENSE("GPL"); + +module_init(init_exofs) +module_exit(exit_exofs) diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c new file mode 100644 index 00000000..4dd687c3 --- /dev/null +++ b/fs/exofs/symlink.c @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh + * + * Copyrights for code taken from ext2: + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/namei.h>
+
+#include "exofs.h"
+
+static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct exofs_i_info *oi = exofs_i(dentry->d_inode);
+
+	nd_set_link(nd, (char *)oi->i_data);
+	return NULL;
+}
+
+const struct inode_operations exofs_symlink_inode_operations = {
+	.readlink = generic_readlink,
+	.follow_link = page_follow_link_light,
+	.put_link = page_put_link,
+};
+
+const struct inode_operations exofs_fast_symlink_inode_operations = {
+	.readlink = generic_readlink,
+	.follow_link = exofs_follow_link,
+};
diff --git a/fs/exportfs/Makefile b/fs/exportfs/Makefile
new file mode 100644
index 00000000..d7c5d4dd
--- /dev/null
+++ b/fs/exportfs/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the filesystem export support routines.
+
+obj-$(CONFIG_EXPORTFS) += exportfs.o
+
+exportfs-objs := expfs.o
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
new file mode 100644
index 00000000..b05acb79
--- /dev/null
+++ b/fs/exportfs/expfs.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) Neil Brown 2002
+ * Copyright (C) Christoph Hellwig 2007
+ *
+ * This file contains the code mapping from inodes to NFS file handles,
+ * and for mapping back from file handles to dentries.
+ *
+ * For details on why we do all the strange and hairy things in here
+ * take a look at Documentation/filesystems/nfs/Exporting.
+ */
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#define dprintk(fmt, args...) do{}while(0)
+
+
+static int get_name(struct vfsmount *mnt, struct dentry *dentry, char *name,
+		struct dentry *child);
+
+
+static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir,
+		char *name, struct dentry *child)
+{
+	const struct export_operations *nop = dir->d_sb->s_export_op;
+
+	if (nop->get_name)
+		return nop->get_name(dir, name, child);
+	else
+		return get_name(mnt, dir, name, child);
+}
+
+/*
+ * Check if the dentry or any of its aliases is acceptable.
+ */
+static struct dentry *
+find_acceptable_alias(struct dentry *result,
+		int (*acceptable)(void *context, struct dentry *dentry),
+		void *context)
+{
+	struct dentry *dentry, *toput = NULL;
+	struct inode *inode;
+
+	if (acceptable(context, result))
+		return result;
+
+	inode = result->d_inode;
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+		dget(dentry);
+		spin_unlock(&inode->i_lock);
+		if (toput)
+			dput(toput);
+		if (dentry != result && acceptable(context, dentry)) {
+			dput(result);
+			return dentry;
+		}
+		spin_lock(&inode->i_lock);
+		toput = dentry;
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (toput)
+		dput(toput);
+	return NULL;
+}
+
+/*
+ * Find root of a disconnected subtree and return a reference to it.
+ */
+static struct dentry *
+find_disconnected_root(struct dentry *dentry)
+{
+	dget(dentry);
+	while (!IS_ROOT(dentry)) {
+		struct dentry *parent = dget_parent(dentry);
+
+		if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
+			dput(parent);
+			break;
+		}
+
+		dput(dentry);
+		dentry = parent;
+	}
+	return dentry;
+}
+
+/*
+ * Make sure target_dir is fully connected to the dentry tree.
+ *
+ * It may already be, as the flag isn't always updated when connection happens.
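+ *
+ * In outline, the loop below repeatedly takes the root of the still
+ * disconnected subtree, asks the filesystem for its parent (->get_parent)
+ * and for the child's name within that parent (exportfs_get_name), then
+ * looks that name up under the parent's i_mutex to splice the subtree into
+ * the dcache, until target_dir no longer has DCACHE_DISCONNECTED set.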
+ */ +static int +reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) +{ + int noprogress = 0; + int err = -ESTALE; + + /* + * It is possible that a confused file system might not let us complete + * the path to the root. For example, if get_parent returns a directory + * in which we cannot find a name for the child. While this implies a + * very sick filesystem we don't want it to cause knfsd to spin. Hence + * the noprogress counter. If we go through the loop 10 times (2 is + * probably enough) without getting anywhere, we just give up + */ + while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) { + struct dentry *pd = find_disconnected_root(target_dir); + + if (!IS_ROOT(pd)) { + /* must have found a connected parent - great */ + spin_lock(&pd->d_lock); + pd->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&pd->d_lock); + noprogress = 0; + } else if (pd == mnt->mnt_sb->s_root) { + printk(KERN_ERR "export: Eeek filesystem root is not connected, impossible\n"); + spin_lock(&pd->d_lock); + pd->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&pd->d_lock); + noprogress = 0; + } else { + /* + * We have hit the top of a disconnected path, try to + * find parent and connect. + * + * Racing with some other process renaming a directory + * isn't much of a problem here. If someone renames + * the directory, it will end up properly connected, + * which is what we want + * + * Getting the parent can't be supported generically, + * the locking is too icky. + * + * Instead we just return EACCES. If server reboots + * or inodes get flushed, you lose + */ + struct dentry *ppd = ERR_PTR(-EACCES); + struct dentry *npd; + + mutex_lock(&pd->d_inode->i_mutex); + if (mnt->mnt_sb->s_export_op->get_parent) + ppd = mnt->mnt_sb->s_export_op->get_parent(pd); + mutex_unlock(&pd->d_inode->i_mutex); + + if (IS_ERR(ppd)) { + err = PTR_ERR(ppd); + dprintk("%s: get_parent of %ld failed, err %d\n", + __func__, pd->d_inode->i_ino, err); + dput(pd); + break; + } + + dprintk("%s: find name of %lu in %lu\n", __func__, + pd->d_inode->i_ino, ppd->d_inode->i_ino); + err = exportfs_get_name(mnt, ppd, nbuf, pd); + if (err) { + dput(ppd); + dput(pd); + if (err == -ENOENT) + /* some race between get_parent and + * get_name? just try again + */ + continue; + break; + } + dprintk("%s: found name: %s\n", __func__, nbuf); + mutex_lock(&ppd->d_inode->i_mutex); + npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); + mutex_unlock(&ppd->d_inode->i_mutex); + if (IS_ERR(npd)) { + err = PTR_ERR(npd); + dprintk("%s: lookup failed: %d\n", + __func__, err); + dput(ppd); + dput(pd); + break; + } + /* we didn't really want npd, we really wanted + * a side-effect of the lookup. + * hopefully, npd == pd, though it isn't really + * a problem if it isn't + */ + if (npd == pd) + noprogress = 0; + else + printk("%s: npd != pd\n", __func__); + dput(npd); + dput(ppd); + if (IS_ROOT(pd)) { + /* something went wrong, we have to give up */ + dput(pd); + break; + } + } + dput(pd); + } + + if (target_dir->d_flags & DCACHE_DISCONNECTED) { + /* something went wrong - oh-well */ + if (!err) + err = -ESTALE; + return err; + } + + return 0; +} + +struct getdents_callback { + char *name; /* name that was found. It already points to a + buffer NAME_MAX+1 is size */ + unsigned long ino; /* the inum we are looking for */ + int found; /* inode matched? */ + int sequence; /* sequence counter */ +}; + +/* + * A rather strange filldir function to capture + * the name matching the specified inode number. 
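+ *
+ * It returns a non-zero value once the matching name has been copied so
+ * that vfs_readdir() stops the walk early; callers then check buf->found
+ * rather than the return value.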
+ */ +static int filldir_one(void * __buf, const char * name, int len, + loff_t pos, u64 ino, unsigned int d_type) +{ + struct getdents_callback *buf = __buf; + int result = 0; + + buf->sequence++; + if (buf->ino == ino) { + memcpy(buf->name, name, len); + buf->name[len] = '\0'; + buf->found = 1; + result = -1; + } + return result; +} + +/** + * get_name - default export_operations->get_name function + * @dentry: the directory in which to find a name + * @name: a pointer to a %NAME_MAX+1 char buffer to store the name + * @child: the dentry for the child directory. + * + * calls readdir on the parent until it finds an entry with + * the same inode number as the child, and returns that. + */ +static int get_name(struct vfsmount *mnt, struct dentry *dentry, + char *name, struct dentry *child) +{ + const struct cred *cred = current_cred(); + struct inode *dir = dentry->d_inode; + int error; + struct file *file; + struct getdents_callback buffer; + + error = -ENOTDIR; + if (!dir || !S_ISDIR(dir->i_mode)) + goto out; + error = -EINVAL; + if (!dir->i_fop) + goto out; + /* + * Open the directory ... + */ + file = dentry_open(dget(dentry), mntget(mnt), O_RDONLY, cred); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + + error = -EINVAL; + if (!file->f_op->readdir) + goto out_close; + + buffer.name = name; + buffer.ino = child->d_inode->i_ino; + buffer.found = 0; + buffer.sequence = 0; + while (1) { + int old_seq = buffer.sequence; + + error = vfs_readdir(file, filldir_one, &buffer); + if (buffer.found) { + error = 0; + break; + } + + if (error < 0) + break; + + error = -ENOENT; + if (old_seq == buffer.sequence) + break; + } + +out_close: + fput(file); +out: + return error; +} + +/** + * export_encode_fh - default export_operations->encode_fh function + * @dentry: the dentry to encode + * @fh: where to store the file handle fragment + * @max_len: maximum length to store there + * @connectable: whether to store parent information + * + * This default encode_fh function assumes that the 32 inode number + * is suitable for locating an inode, and that the generation number + * can be used to check that it is still valid. It places them in the + * filehandle fragment where export_decode_fh expects to find them. 
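+ *
+ * The resulting fragment is two 32-bit words (ino, gen) for FILEID_INO32_GEN,
+ * or four words (ino, gen, parent_ino, parent_gen) for
+ * FILEID_INO32_GEN_PARENT when a connectable handle is requested for a
+ * non-directory.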
+ */ +static int export_encode_fh(struct dentry *dentry, struct fid *fid, + int *max_len, int connectable) +{ + struct inode * inode = dentry->d_inode; + int len = *max_len; + int type = FILEID_INO32_GEN; + + if (connectable && (len < 4)) { + *max_len = 4; + return 255; + } else if (len < 2) { + *max_len = 2; + return 255; + } + + len = 2; + fid->i32.ino = inode->i_ino; + fid->i32.gen = inode->i_generation; + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + fid->i32.parent_ino = parent->i_ino; + fid->i32.parent_gen = parent->i_generation; + spin_unlock(&dentry->d_lock); + len = 4; + type = FILEID_INO32_GEN_PARENT; + } + *max_len = len; + return type; +} + +int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, + int connectable) +{ + const struct export_operations *nop = dentry->d_sb->s_export_op; + int error; + + if (nop->encode_fh) + error = nop->encode_fh(dentry, fid->raw, max_len, connectable); + else + error = export_encode_fh(dentry, fid, max_len, connectable); + + return error; +} +EXPORT_SYMBOL_GPL(exportfs_encode_fh); + +struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, + int fh_len, int fileid_type, + int (*acceptable)(void *, struct dentry *), void *context) +{ + const struct export_operations *nop = mnt->mnt_sb->s_export_op; + struct dentry *result, *alias; + char nbuf[NAME_MAX+1]; + int err; + + /* + * Try to get any dentry for the given file handle from the filesystem. + */ + if (!nop || !nop->fh_to_dentry) + return ERR_PTR(-ESTALE); + result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); + if (!result) + result = ERR_PTR(-ESTALE); + if (IS_ERR(result)) + return result; + + if (S_ISDIR(result->d_inode->i_mode)) { + /* + * This request is for a directory. + * + * On the positive side there is only one dentry for each + * directory inode. On the negative side this implies that we + * to ensure our dentry is connected all the way up to the + * filesystem root. + */ + if (result->d_flags & DCACHE_DISCONNECTED) { + err = reconnect_path(mnt, result, nbuf); + if (err) + goto err_result; + } + + if (!acceptable(context, result)) { + err = -EACCES; + goto err_result; + } + + return result; + } else { + /* + * It's not a directory. Life is a little more complicated. + */ + struct dentry *target_dir, *nresult; + + /* + * See if either the dentry we just got from the filesystem + * or any alias for it is acceptable. This is always true + * if this filesystem is exported without the subtreecheck + * option. If the filesystem is exported with the subtree + * check option there's a fair chance we need to look at + * the parent directory in the file handle and make sure + * it's connected to the filesystem root. + */ + alias = find_acceptable_alias(result, acceptable, context); + if (alias) + return alias; + + /* + * Try to extract a dentry for the parent directory from the + * file handle. If this fails we'll have to give up. + */ + err = -ESTALE; + if (!nop->fh_to_parent) + goto err_result; + + target_dir = nop->fh_to_parent(mnt->mnt_sb, fid, + fh_len, fileid_type); + if (!target_dir) + goto err_result; + err = PTR_ERR(target_dir); + if (IS_ERR(target_dir)) + goto err_result; + + /* + * And as usual we need to make sure the parent directory is + * connected to the filesystem root. The VFS really doesn't + * like disconnected directories.. 
+ */ + err = reconnect_path(mnt, target_dir, nbuf); + if (err) { + dput(target_dir); + goto err_result; + } + + /* + * Now that we've got both a well-connected parent and a + * dentry for the inode we're after, make sure that our + * inode is actually connected to the parent. + */ + err = exportfs_get_name(mnt, target_dir, nbuf, result); + if (!err) { + mutex_lock(&target_dir->d_inode->i_mutex); + nresult = lookup_one_len(nbuf, target_dir, + strlen(nbuf)); + mutex_unlock(&target_dir->d_inode->i_mutex); + if (!IS_ERR(nresult)) { + if (nresult->d_inode) { + dput(result); + result = nresult; + } else + dput(nresult); + } + } + + /* + * At this point we are done with the parent, but it's pinned + * by the child dentry anyway. + */ + dput(target_dir); + + /* + * And finally make sure the dentry is actually acceptable + * to NFSD. + */ + alias = find_acceptable_alias(result, acceptable, context); + if (!alias) { + err = -EACCES; + goto err_result; + } + + return alias; + } + + err_result: + dput(result); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(exportfs_decode_fh); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig new file mode 100644 index 00000000..14a6780f --- /dev/null +++ b/fs/ext2/Kconfig @@ -0,0 +1,55 @@ +config EXT2_FS + tristate "Second extended fs support" + help + Ext2 is a standard Linux file system for hard disks. + + To compile this file system support as a module, choose M here: the + module will be called ext2. + + If unsure, say Y. + +config EXT2_FS_XATTR + bool "Ext2 extended attributes" + depends on EXT2_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config EXT2_FS_POSIX_ACL + bool "Ext2 POSIX Access Control Lists" + depends on EXT2_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT2_FS_SECURITY + bool "Ext2 Security Labels" + depends on EXT2_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config EXT2_FS_XIP + bool "Ext2 execute in place support" + depends on EXT2_FS && MMU + help + Execute in place can be used on memory-backed block devices. If you + enable this option, you can select to mount block devices which are + capable of this feature without using the page cache. + + If you do not use a block device that is capable of using this, + or if unsure, say N. diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile new file mode 100644 index 00000000..f42af45c --- /dev/null +++ b/fs/ext2/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the linux ext2-filesystem routines. 
+# + +obj-$(CONFIG_EXT2_FS) += ext2.o + +ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o + +ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o +ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o +ext2-$(CONFIG_EXT2_FS_XIP) += xip.o diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c new file mode 100644 index 00000000..abea5a17 --- /dev/null +++ b/fs/ext2/acl.c @@ -0,0 +1,445 @@ +/* + * linux/fs/ext2/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext2_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext2_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext2_acl_header *)value)->a_version != + cpu_to_le32(EXT2_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext2_acl_header); + count = ext2_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + ext2_acl_entry *entry = + (ext2_acl_entry *)value; + if ((char *)value + sizeof(ext2_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext2_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext2_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
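+ *
+ * The on-disk format written here is an ext2_acl_header (version word)
+ * followed by one entry per ACL entry; ACL_USER and ACL_GROUP entries carry
+ * an explicit e_id, while the other tag types use the short form without it.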
+ */ +static void * +ext2_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext2_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext2_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext2_acl_header) + acl->a_count * + sizeof(ext2_acl_entry), GFP_KERNEL); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext2_acl_header); + for (n=0; n < acl->a_count; n++) { + ext2_acl_entry *entry = (ext2_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = + cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext2_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext2_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * inode->i_mutex: don't care + */ +static struct posix_acl * +ext2_get_acl(struct inode *inode, int type) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + retval = ext2_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext2_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext2_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * inode->i_mutex: down + */ +static int +ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + + switch(type) { + case ACL_TYPE_ACCESS: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext2_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext2_xattr_set(inode, name_index, "", value, size, 0); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + return error; +} + +int +ext2_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +/* + * Initialize the ACLs of a new inode. Called from ext2_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext2_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(dir->i_sb, POSIX_ACL)) { + acl = ext2_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current_umask(); + } + if (test_opt(inode->i_sb, POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext2_set_acl(inode, + ACL_TYPE_ACCESS, clone); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return error; +} + +/* + * Does chmod for an inode that may have an Access Control List. The + * inode->i_mode field must be updated to the desired value by the caller + * before calling this function. + * Returns 0 on success, or a negative error number. + * + * We change the ACL rather than storing some ACL entries in the file + * mode permission bits (which would be more efficient), because that + * would break once additional permissions (like ACL_APPEND, ACL_DELETE + * for directories) are added. There are no more bits available in the + * file mode. 
+ * + * inode->i_mutex: down + */ +int +ext2_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) + error = ext2_set_acl(inode, ACL_TYPE_ACCESS, clone); + posix_acl_release(clone); + return error; +} + +/* + * Extended attribut handlers + */ +static size_t +ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t +ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int +ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ext2_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(dentry->d_inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + + error = ext2_set_acl(dentry->d_inode, type, acl); + +release_and_out: + posix_acl_release(acl); + return error; +} + +const struct xattr_handler ext2_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = ext2_xattr_list_acl_access, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, +}; + +const struct xattr_handler ext2_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = ext2_xattr_list_acl_default, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, +}; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h new file mode 100644 index 00000000..c939b7b1 --- /dev/null +++ b/fs/ext2/acl.h @@ -0,0 +1,78 @@ +/* + File: fs/ext2/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT2_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext2_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext2_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext2_acl_header; + +static inline size_t ext2_acl_size(int count) +{ + if (count <= 4) { + return 
sizeof(ext2_acl_header) +
+		       count * sizeof(ext2_acl_entry_short);
+	} else {
+		return sizeof(ext2_acl_header) +
+		       4 * sizeof(ext2_acl_entry_short) +
+		       (count - 4) * sizeof(ext2_acl_entry);
+	}
+}
+
+static inline int ext2_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(ext2_acl_header);
+	s = size - 4 * sizeof(ext2_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(ext2_acl_entry_short))
+			return -1;
+		return size / sizeof(ext2_acl_entry_short);
+	} else {
+		if (s % sizeof(ext2_acl_entry))
+			return -1;
+		return s / sizeof(ext2_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+
+/* acl.c */
+extern int ext2_check_acl (struct inode *, int, unsigned int);
+extern int ext2_acl_chmod (struct inode *);
+extern int ext2_init_acl (struct inode *, struct inode *);
+
+#else
+#include <linux/posix_acl.h>
+#define ext2_check_acl NULL
+#define ext2_get_acl NULL
+#define ext2_set_acl NULL
+
+static inline int
+ext2_acl_chmod (struct inode *inode)
+{
+	return 0;
+}
+
+static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif
+
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
new file mode 100644
index 00000000..8f44cef1
--- /dev/null
+++ b/fs/ext2/balloc.c
@@ -0,0 +1,1555 @@
+/*
+ * linux/fs/ext2/balloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ *       David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include "ext2.h"
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/buffer_head.h>
+#include <linux/capability.h>
+
+/*
+ * balloc.c contains the blocks allocation and deallocation routines
+ */
+
+/*
+ * The free blocks are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.  The descriptors are loaded in memory
+ * when a file system is mounted (see ext2_fill_super).
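+ *
+ * Worked example (hypothetical 4 KiB block size, not a value taken from this
+ * file): 4096 / sizeof(struct ext2_group_desc) = 128 descriptors fit in one
+ * block, so block group 300 is found in descriptor block 300 >> 7 = 2 at
+ * offset 300 & 127 = 44, exactly the computation done in
+ * ext2_get_group_desc() below.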
+ */ + + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) +{ + unsigned long group_desc; + unsigned long offset; + struct ext2_group_desc * desc; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if (block_group >= sbi->s_groups_count) { + ext2_error (sb, "ext2_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + + return NULL; + } + + group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext2_error (sb, "ext2_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext2_group_desc *) sbi->s_group_desc[group_desc]->b_data; + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc + offset; +} + +static int ext2_valid_block_bitmap(struct super_block *sb, + struct ext2_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + ext2_grpblk_t offset; + ext2_grpblk_t next_zero_bit; + ext2_fsblk_t bitmap_blk; + ext2_fsblk_t group_first_block; + + group_first_block = ext2_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext2_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext2_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode table block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_table); + offset = bitmap_blk - group_first_block; + next_zero_bit = ext2_find_next_zero_bit(bh->b_data, + offset + EXT2_SB(sb)->s_itb_per_group, + offset); + if (next_zero_bit >= offset + EXT2_SB(sb)->s_itb_per_group) + /* good bitmap for inode tables */ + return 1; + +err_out: + ext2_error(sb, __func__, + "Invalid block bitmap - " + "block_group = %d, block = %lu", + block_group, bitmap_blk); + return 0; +} + +/* + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +static struct buffer_head * +read_block_bitmap(struct super_block *sb, unsigned int block_group) +{ + struct ext2_group_desc * desc; + struct buffer_head * bh = NULL; + ext2_fsblk_t bitmap_blk; + + desc = ext2_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext2_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + if (likely(bh_uptodate_or_lock(bh))) + return bh; + + if (bh_submit_read(bh) < 0) { + brelse(bh); + ext2_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + + ext2_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, continue with corrupt + * bitmap + */ + return bh; +} + +static void release_blocks(struct super_block *sb, int count) +{ + if (count) { + struct ext2_sb_info *sbi = EXT2_SB(sb); + + percpu_counter_add(&sbi->s_freeblocks_counter, count); + sb->s_dirt = 1; + } +} + +static void group_adjust_blocks(struct super_block *sb, int group_no, + struct ext2_group_desc *desc, struct buffer_head *bh, int count) +{ + if (count) { + struct ext2_sb_info *sbi = EXT2_SB(sb); + unsigned free_blocks; + + spin_lock(sb_bgl_lock(sbi, group_no)); + free_blocks = le16_to_cpu(desc->bg_free_blocks_count); + desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count); + spin_unlock(sb_bgl_lock(sbi, group_no)); + sb->s_dirt = 1; + mark_buffer_dirty(bh); + } +} + +/* + * The reservation window structure operations + * -------------------------------------------- + * Operations include: + * dump, find, add, remove, is_empty, find_next_reservable_window, etc. + * + * We use a red-black tree to represent per-filesystem reservation + * windows. + * + */ + +/** + * __rsv_window_dump() -- Dump the filesystem block allocation reservation map + * @rb_root: root of per-filesystem reservation rb tree + * @verbose: verbose mode + * @fn: function which wishes to dump the reservation map + * + * If verbose is turned on, it will print the whole block reservation + * windows(start, end). Otherwise, it will only print out the "bad" windows, + * those windows that overlap with their immediate neighbors. 
+ */ +#if 1 +static void __rsv_window_dump(struct rb_root *root, int verbose, + const char *fn) +{ + struct rb_node *n; + struct ext2_reserve_window_node *rsv, *prev; + int bad; + +restart: + n = rb_first(root); + bad = 0; + prev = NULL; + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + while (n) { + rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node); + if (verbose) + printk("reservation window 0x%p " + "start: %lu, end: %lu\n", + rsv, rsv->rsv_start, rsv->rsv_end); + if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { + printk("Bad reservation %p (start >= end)\n", + rsv); + bad = 1; + } + if (prev && prev->rsv_end >= rsv->rsv_start) { + printk("Bad reservation %p (prev->end >= start)\n", + rsv); + bad = 1; + } + if (bad) { + if (!verbose) { + printk("Restarting reservation walk in verbose mode\n"); + verbose = 1; + goto restart; + } + } + n = rb_next(n); + prev = rsv; + } + printk("Window map complete.\n"); + BUG_ON(bad); +} +#define rsv_window_dump(root, verbose) \ + __rsv_window_dump((root), (verbose), __func__) +#else +#define rsv_window_dump(root, verbose) do {} while (0) +#endif + +/** + * goal_in_my_reservation() + * @rsv: inode's reservation window + * @grp_goal: given goal block relative to the allocation block group + * @group: the current allocation block group + * @sb: filesystem super block + * + * Test if the given goal block (group relative) is within the file's + * own block reservation window range. + * + * If the reservation window is outside the goal allocation group, return 0; + * grp_goal (given goal block) could be -1, which means no specific + * goal block. In this case, always return 1. + * If the goal block is within the reservation window, return 1; + * otherwise, return 0; + */ +static int +goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal, + unsigned int group, struct super_block * sb) +{ + ext2_fsblk_t group_first_block, group_last_block; + + group_first_block = ext2_group_first_block_no(sb, group); + group_last_block = group_first_block + EXT2_BLOCKS_PER_GROUP(sb) - 1; + + if ((rsv->_rsv_start > group_last_block) || + (rsv->_rsv_end < group_first_block)) + return 0; + if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) + || (grp_goal + group_first_block > rsv->_rsv_end))) + return 0; + return 1; +} + +/** + * search_reserve_window() + * @rb_root: root of reservation tree + * @goal: target allocation block + * + * Find the reserved window which includes the goal, or the previous one + * if the goal is not in any window. + * Returns NULL if there are no windows or if all windows start after the goal. + */ +static struct ext2_reserve_window_node * +search_reserve_window(struct rb_root *root, ext2_fsblk_t goal) +{ + struct rb_node *n = root->rb_node; + struct ext2_reserve_window_node *rsv; + + if (!n) + return NULL; + + do { + rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node); + + if (goal < rsv->rsv_start) + n = n->rb_left; + else if (goal > rsv->rsv_end) + n = n->rb_right; + else + return rsv; + } while (n); + /* + * We've fallen off the end of the tree: the goal wasn't inside + * any particular node. OK, the previous node must be to one + * side of the interval containing the goal. If it's the RHS, + * we need to back up one. + */ + if (rsv->rsv_start > goal) { + n = rb_prev(&rsv->rsv_node); + rsv = rb_entry(n, struct ext2_reserve_window_node, rsv_node); + } + return rsv; +} + +/* + * ext2_rsv_window_add() -- Insert a window to the block reservation rb tree. 
+ * @sb: super block + * @rsv: reservation window to add + * + * Must be called with rsv_lock held. + */ +void ext2_rsv_window_add(struct super_block *sb, + struct ext2_reserve_window_node *rsv) +{ + struct rb_root *root = &EXT2_SB(sb)->s_rsv_window_root; + struct rb_node *node = &rsv->rsv_node; + ext2_fsblk_t start = rsv->rsv_start; + + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct ext2_reserve_window_node *this; + + while (*p) + { + parent = *p; + this = rb_entry(parent, struct ext2_reserve_window_node, rsv_node); + + if (start < this->rsv_start) + p = &(*p)->rb_left; + else if (start > this->rsv_end) + p = &(*p)->rb_right; + else { + rsv_window_dump(root, 1); + BUG(); + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); +} + +/** + * rsv_window_remove() -- unlink a window from the reservation rb tree + * @sb: super block + * @rsv: reservation window to remove + * + * Mark the block reservation window as not allocated, and unlink it + * from the filesystem reservation window rb tree. Must be called with + * rsv_lock held. + */ +static void rsv_window_remove(struct super_block *sb, + struct ext2_reserve_window_node *rsv) +{ + rsv->rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_alloc_hit = 0; + rb_erase(&rsv->rsv_node, &EXT2_SB(sb)->s_rsv_window_root); +} + +/* + * rsv_is_empty() -- Check if the reservation window is allocated. + * @rsv: given reservation window to check + * + * returns 1 if the end block is EXT2_RESERVE_WINDOW_NOT_ALLOCATED. + */ +static inline int rsv_is_empty(struct ext2_reserve_window *rsv) +{ + /* a valid reservation end block could not be 0 */ + return (rsv->_rsv_end == EXT2_RESERVE_WINDOW_NOT_ALLOCATED); +} + +/** + * ext2_init_block_alloc_info() + * @inode: file inode structure + * + * Allocate and initialize the reservation window structure, and + * link the window to the ext2 inode structure at last + * + * The reservation window structure is only dynamically allocated + * and linked to ext2 inode the first time the open file + * needs a new block. So, before every ext2_new_block(s) call, for + * regular files, we should check whether the reservation window + * structure exists or not. In the latter case, this function is called. + * Fail to do so will result in block reservation being turned off for that + * open file. + * + * This function is called from ext2_get_blocks_handle(), also called + * when setting the reservation window size through ioctl before the file + * is open for write (needs block allocation). + * + * Needs truncate_mutex protection prior to calling this function. 
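/*
 * A userspace sketch of the lookup rule implemented by
 * search_reserve_window() above: return the window that contains the goal
 * block, or the closest window that starts before it.  A sorted array
 * stands in for the per-filesystem red-black tree, and the window values
 * are invented example data.
 */
#include <stdio.h>

struct win { unsigned long start, end; };

static const struct win *find_window(const struct win *w, int n,
				     unsigned long goal)
{
	const struct win *best = NULL;

	for (int i = 0; i < n; i++) {
		if (w[i].start <= goal && goal <= w[i].end)
			return &w[i];		/* goal inside this window */
		if (w[i].start < goal)
			best = &w[i];		/* best window entirely before the goal */
	}
	return best;				/* NULL: no window at or before the goal */
}

int main(void)
{
	struct win w[] = { {100, 107}, {200, 231}, {512, 600} };
	unsigned long goals[] = { 50, 105, 199, 300 };

	for (int i = 0; i < 4; i++) {
		const struct win *r = find_window(w, 3, goals[i]);
		if (r)
			printf("goal %lu -> window [%lu,%lu]\n",
			       goals[i], r->start, r->end);
		else
			printf("goal %lu -> no window at or before goal\n", goals[i]);
	}
	return 0;
}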
+ */ +void ext2_init_block_alloc_info(struct inode *inode) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct super_block *sb = inode->i_sb; + + block_i = kmalloc(sizeof(*block_i), GFP_NOFS); + if (block_i) { + struct ext2_reserve_window_node *rsv = &block_i->rsv_window_node; + + rsv->rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + + /* + * if filesystem is mounted with NORESERVATION, the goal + * reservation window size is set to zero to indicate + * block reservation is off + */ + if (!test_opt(sb, RESERVATION)) + rsv->rsv_goal_size = 0; + else + rsv->rsv_goal_size = EXT2_DEFAULT_RESERVE_BLOCKS; + rsv->rsv_alloc_hit = 0; + block_i->last_alloc_logical_block = 0; + block_i->last_alloc_physical_block = 0; + } + ei->i_block_alloc_info = block_i; +} + +/** + * ext2_discard_reservation() + * @inode: inode + * + * Discard(free) block reservation window on last file close, or truncate + * or at last iput(). + * + * It is being called in three cases: + * ext2_release_file(): last writer closes the file + * ext2_clear_inode(): last iput(), when nobody links to this file. + * ext2_truncate(): when the block indirect map is about to change. + */ +void ext2_discard_reservation(struct inode *inode) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext2_reserve_window_node *rsv; + spinlock_t *rsv_lock = &EXT2_SB(inode->i_sb)->s_rsv_window_lock; + + if (!block_i) + return; + + rsv = &block_i->rsv_window_node; + if (!rsv_is_empty(&rsv->rsv_window)) { + spin_lock(rsv_lock); + if (!rsv_is_empty(&rsv->rsv_window)) + rsv_window_remove(inode->i_sb, rsv); + spin_unlock(rsv_lock); + } +} + +/** + * ext2_free_blocks_sb() -- Free given blocks and update quota and i_blocks + * @inode: inode + * @block: start physcial block to free + * @count: number of blocks to free + */ +void ext2_free_blocks (struct inode * inode, unsigned long block, + unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head * bh2; + unsigned long block_group; + unsigned long bit; + unsigned long i; + unsigned long overflow; + struct super_block * sb = inode->i_sb; + struct ext2_sb_info * sbi = EXT2_SB(sb); + struct ext2_group_desc * desc; + struct ext2_super_block * es = sbi->s_es; + unsigned freed = 0, group_freed; + + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext2_error (sb, "ext2_free_blocks", + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext2_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT2_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT2_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. 
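/*
 * A standalone sketch of the arithmetic at the top of ext2_free_blocks():
 * a filesystem-wide block number is split into (group, bit-in-bitmap), and
 * a range that crosses a group boundary is trimmed so the remainder is
 * handled in a second pass.  first_data_block = 1 and 8192 blocks per
 * group are assumed example values (typical for 1 KiB blocks).
 */
#include <stdio.h>

int main(void)
{
	unsigned long first_data_block = 1;	/* assumed */
	unsigned long blocks_per_group = 8192;	/* assumed */
	unsigned long block = 8000, count = 400;/* range crossing a group boundary */

	while (count) {
		unsigned long group = (block - first_data_block) / blocks_per_group;
		unsigned long bit   = (block - first_data_block) % blocks_per_group;
		unsigned long overflow = 0;

		/* same boundary test as in ext2_free_blocks() */
		if (bit + count > blocks_per_group)
			overflow = bit + count - blocks_per_group;

		printf("free %lu block(s) in group %lu starting at bit %lu\n",
		       count - overflow, group, bit);

		block += count - overflow;	/* continue with the spill-over, if any */
		count = overflow;
	}
	return 0;
}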
+ */ + if (bit + count > EXT2_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT2_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + desc = ext2_get_group_desc (sb, block_group, &bh2); + if (!desc) + goto error_return; + + if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group)) { + ext2_error (sb, "ext2_free_blocks", + "Freeing blocks in system zones - " + "Block = %lu, count = %lu", + block, count); + goto error_return; + } + + for (i = 0, group_freed = 0; i < count; i++) { + if (!ext2_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit + i, bitmap_bh->b_data)) { + ext2_error(sb, __func__, + "bit already cleared for block %lu", block + i); + } else { + group_freed++; + } + } + + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + + group_adjust_blocks(sb, block_group, desc, bh2, group_freed); + freed += group_freed; + + if (overflow) { + block += count; + count = overflow; + goto do_more; + } +error_return: + brelse(bitmap_bh); + release_blocks(sb, freed); + dquot_free_block_nodirty(inode, freed); +} + +/** + * bitmap_search_next_usable_block() + * @start: the starting block (group relative) of the search + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) of the reservation + * + * The bitmap search --- search forward through the actual bitmap on disk until + * we find a bit free. + */ +static ext2_grpblk_t +bitmap_search_next_usable_block(ext2_grpblk_t start, struct buffer_head *bh, + ext2_grpblk_t maxblocks) +{ + ext2_grpblk_t next; + + next = ext2_find_next_zero_bit(bh->b_data, maxblocks, start); + if (next >= maxblocks) + return -1; + return next; +} + +/** + * find_next_usable_block() + * @start: the starting block (group relative) to find next + * allocatable block in bitmap. + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) for the search + * + * Find an allocatable block in a bitmap. We perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; + * then for any free bit in the bitmap. + */ +static ext2_grpblk_t +find_next_usable_block(int start, struct buffer_head *bh, int maxblocks) +{ + ext2_grpblk_t here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block within the next XX blocks. + * + * end_goal is more or less random, but it has to be + * less than EXT2_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple.. 
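/*
 * A simplified userspace sketch of the search order described above for
 * find_next_usable_block(): look near the goal first (up to the next
 * 64-bit boundary), then scan for a wholly free byte, then fall back to
 * any free bit.  The bitmap contents below are invented; 1 = in use.
 */
#include <stdio.h>
#include <string.h>

static int test_bit(const unsigned char *map, int nr)
{
	return (map[nr >> 3] >> (nr & 7)) & 1;
}

static int find_free(const unsigned char *map, int start, int maxblocks)
{
	/* 1: near the goal, up to the next 64-bit boundary */
	int end_goal = (start + 63) & ~63;
	if (end_goal > maxblocks)
		end_goal = maxblocks;
	for (int i = start; i < end_goal; i++)
		if (!test_bit(map, i))
			return i;

	/* 2: first completely free byte (8 free blocks in a row) */
	for (int byte = start >> 3; byte < (maxblocks + 7) >> 3; byte++)
		if (map[byte] == 0)
			return byte << 3;

	/* 3: any free bit at all */
	for (int i = start; i < maxblocks; i++)
		if (!test_bit(map, i))
			return i;
	return -1;
}

int main(void)
{
	unsigned char map[16];

	memset(map, 0xff, sizeof(map));	/* everything allocated ... */
	map[10] = 0;			/* ... except blocks 80-87 */

	printf("first usable block from goal 5: %d\n", find_free(map, 5, 128));
	return 0;
}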
+ */ + ext2_grpblk_t end_goal = (start + 63) & ~63; + if (end_goal > maxblocks) + end_goal = maxblocks; + here = ext2_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal) + return here; + ext2_debug("Bit not found near goal\n"); + } + + here = start; + if (here < 0) + here = 0; + + p = ((char *)bh->b_data) + (here >> 3); + r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); + next = (r - ((char *)bh->b_data)) << 3; + + if (next < maxblocks && next >= here) + return next; + + here = bitmap_search_next_usable_block(here, bh, maxblocks); + return here; +} + +/** + * ext2_try_to_allocate() + * @sb: superblock + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * Attempt to allocate blocks within a give range. Set the range of allocation + * first, then find the first free bit(s) from the bitmap (within the range), + * and at last, allocate the blocks by claiming the found free bit as allocated. + * + * To set the range of this allocation: + * if there is a reservation window, only try to allocate block(s) + * from the file's own reservation window; + * Otherwise, the allocation range starts from the give goal block, + * ends at the block group's last block. + * + * If we failed to allocate the desired block then we may end up crossing to a + * new bitmap. + */ +static int +ext2_try_to_allocate(struct super_block *sb, int group, + struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal, + unsigned long *count, + struct ext2_reserve_window *my_rsv) +{ + ext2_fsblk_t group_first_block; + ext2_grpblk_t start, end; + unsigned long num = 0; + + /* we do allocation within the reservation window if we have a window */ + if (my_rsv) { + group_first_block = ext2_group_first_block_no(sb, group); + if (my_rsv->_rsv_start >= group_first_block) + start = my_rsv->_rsv_start - group_first_block; + else + /* reservation window cross group boundary */ + start = 0; + end = my_rsv->_rsv_end - group_first_block + 1; + if (end > EXT2_BLOCKS_PER_GROUP(sb)) + /* reservation window crosses group boundary */ + end = EXT2_BLOCKS_PER_GROUP(sb); + if ((start <= grp_goal) && (grp_goal < end)) + start = grp_goal; + else + grp_goal = -1; + } else { + if (grp_goal > 0) + start = grp_goal; + else + start = 0; + end = EXT2_BLOCKS_PER_GROUP(sb); + } + + BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb)); + +repeat: + if (grp_goal < 0) { + grp_goal = find_next_usable_block(start, bitmap_bh, end); + if (grp_goal < 0) + goto fail_access; + if (!my_rsv) { + int i; + + for (i = 0; i < 7 && grp_goal > start && + !ext2_test_bit(grp_goal - 1, + bitmap_bh->b_data); + i++, grp_goal--) + ; + } + } + start = grp_goal; + + if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal, + bitmap_bh->b_data)) { + /* + * The block was allocated by another thread, or it was + * allocated and then freed by another thread + */ + start++; + grp_goal++; + if (start >= end) + goto fail_access; + goto repeat; + } + num++; + grp_goal++; + while (num < *count && grp_goal < end + && !ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), + grp_goal, bitmap_bh->b_data)) { + num++; + grp_goal++; + } + *count = num; + return grp_goal - num; +fail_access: + *count = num; + return -1; +} + +/** + * find_next_reservable_window(): + * find a reservable space within the given range. 
+ * It does not allocate the reservation window for now: + * alloc_new_reservation() will do the work later. + * + * @search_head: the head of the searching list; + * This is not necessarily the list head of the whole filesystem + * + * We have both head and start_block to assist the search + * for the reservable space. The list starts from head, + * but we will shift to the place where start_block is, + * then start from there, when looking for a reservable space. + * + * @size: the target new reservation window size + * + * @group_first_block: the first block we consider to start + * the real search from + * + * @last_block: + * the maximum block number that our goal reservable space + * could start from. This is normally the last block in this + * group. The search will end when we found the start of next + * possible reservable space is out of this boundary. + * This could handle the cross boundary reservation window + * request. + * + * basically we search from the given range, rather than the whole + * reservation double linked list, (start_block, last_block) + * to find a free region that is of my size and has not + * been reserved. + * + */ +static int find_next_reservable_window( + struct ext2_reserve_window_node *search_head, + struct ext2_reserve_window_node *my_rsv, + struct super_block * sb, + ext2_fsblk_t start_block, + ext2_fsblk_t last_block) +{ + struct rb_node *next; + struct ext2_reserve_window_node *rsv, *prev; + ext2_fsblk_t cur; + int size = my_rsv->rsv_goal_size; + + /* TODO: make the start of the reservation window byte-aligned */ + /* cur = *start_block & ~7;*/ + cur = start_block; + rsv = search_head; + if (!rsv) + return -1; + + while (1) { + if (cur <= rsv->rsv_end) + cur = rsv->rsv_end + 1; + + /* TODO? + * in the case we could not find a reservable space + * that is what is expected, during the re-search, we could + * remember what's the largest reservable space we could have + * and return that one. + * + * For now it will fail if we could not find the reservable + * space with expected-size (or more)... + */ + if (cur > last_block) + return -1; /* fail */ + + prev = rsv; + next = rb_next(&rsv->rsv_node); + rsv = rb_entry(next,struct ext2_reserve_window_node,rsv_node); + + /* + * Reached the last reservation, we can just append to the + * previous one. + */ + if (!next) + break; + + if (cur + size <= rsv->rsv_start) { + /* + * Found a reserveable space big enough. We could + * have a reservation across the group boundary here + */ + break; + } + } + /* + * we come here either : + * when we reach the end of the whole list, + * and there is empty reservable space after last entry in the list. + * append it to the end of the list. + * + * or we found one reservable space in the middle of the list, + * return the reservation window that we could append to. + * succeed. + */ + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* + * Let's book the whole available window for now. We will check the + * disk bitmap later and then, if there are free blocks then we adjust + * the window size if it's larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window. 
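/*
 * A compact sketch of the gap search performed by
 * find_next_reservable_window() above: starting from a goal block, walk
 * the existing (sorted, non-overlapping) windows and return the first
 * point where a new window of the requested size fits.  The window list
 * and block numbers are invented example data.
 */
#include <stdio.h>

struct win { unsigned long start, end; };

static long find_gap(const struct win *w, int n, unsigned long cur,
		     unsigned long size, unsigned long last_block)
{
	for (int i = 0; i < n; i++) {
		if (cur <= w[i].end)
			cur = w[i].end + 1;	/* step past an overlapping window */
		if (cur > last_block)
			return -1;		/* ran off the end of the group */
		if (i + 1 == n || cur + size <= w[i + 1].start)
			return (long)cur;	/* gap after w[i] is big enough */
	}
	return cur > last_block ? -1 : (long)cur;	/* no windows at all */
}

int main(void)
{
	struct win w[] = { {100, 131}, {140, 147}, {400, 463} };

	/* ask for an 8-block window starting no earlier than block 100 */
	printf("new window could start at block %ld\n",
	       find_gap(w, 3, 100, 8, 8191));
	return 0;
}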
+ */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext2_rsv_window_add(sb, my_rsv); + + return 0; +} + +/** + * alloc_new_reservation()--allocate a new reservation window + * + * To make a new reservation, we search part of the filesystem + * reservation list (the list that inside the group). We try to + * allocate a new reservation window near the allocation goal, + * or the beginning of the group, if there is no goal. + * + * We first find a reservable space after the goal, then from + * there, we check the bitmap for the first free block after + * it. If there is no free block until the end of group, then the + * whole group is full, we failed. Otherwise, check if the free + * block is inside the expected reservable space, if so, we + * succeed. + * If the first free block is outside the reservable space, then + * start from the first free block, we search for next available + * space, and go on. + * + * on succeed, a new reservation will be found and inserted into the list + * It contains at least one free block, and it does not overlap with other + * reservation windows. + * + * failed: we failed to find a reservation window in this group + * + * @rsv: the reservation + * + * @grp_goal: The goal (group-relative). It is where the search for a + * free reservable space should start from. + * if we have a goal(goal >0 ), then start from there, + * no goal(goal = -1), we start from the first block + * of the group. + * + * @sb: the super block + * @group: the group we are trying to allocate in + * @bitmap_bh: the block group block bitmap + * + */ +static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv, + ext2_grpblk_t grp_goal, struct super_block *sb, + unsigned int group, struct buffer_head *bitmap_bh) +{ + struct ext2_reserve_window_node *search_head; + ext2_fsblk_t group_first_block, group_end_block, start_block; + ext2_grpblk_t first_free_block; + struct rb_root *fs_rsv_root = &EXT2_SB(sb)->s_rsv_window_root; + unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock; + + group_first_block = ext2_group_first_block_no(sb, group); + group_end_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + + if (grp_goal < 0) + start_block = group_first_block; + else + start_block = grp_goal + group_first_block; + + size = my_rsv->rsv_goal_size; + + if (!rsv_is_empty(&my_rsv->rsv_window)) { + /* + * if the old reservation is cross group boundary + * and if the goal is inside the old reservation window, + * we will come here when we just failed to allocate from + * the first part of the window. We still have another part + * that belongs to the next group. In this case, there is no + * point to discard our window and try to allocate a new one + * in this group(which will fail). we should + * keep the reservation window, just simply move on. + * + * Maybe we could shift the start block of the reservation + * window to the first block of next group. 
+ */ + + if ((my_rsv->rsv_start <= group_end_block) && + (my_rsv->rsv_end > group_end_block) && + (start_block >= my_rsv->rsv_start)) + return -1; + + if ((my_rsv->rsv_alloc_hit > + (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { + /* + * if the previously allocation hit ratio is + * greater than 1/2, then we double the size of + * the reservation window the next time, + * otherwise we keep the same size window + */ + size = size * 2; + if (size > EXT2_MAX_RESERVE_BLOCKS) + size = EXT2_MAX_RESERVE_BLOCKS; + my_rsv->rsv_goal_size= size; + } + } + + spin_lock(rsv_lock); + /* + * shift the search start to the window near the goal block + */ + search_head = search_reserve_window(fs_rsv_root, start_block); + + /* + * find_next_reservable_window() simply finds a reservable window + * inside the given range(start_block, group_end_block). + * + * To make sure the reservation window has a free bit inside it, we + * need to check the bitmap after we found a reservable window. + */ +retry: + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + + /* + * On success, find_next_reservable_window() returns the + * reservation window where there is a reservable space after it. + * Before we reserve this reservable space, we need + * to make sure there is at least a free block inside this region. + * + * Search the first free bit on the block bitmap. Search starts from + * the start block of the reservable space we just found. + */ + spin_unlock(rsv_lock); + first_free_block = bitmap_search_next_usable_block( + my_rsv->rsv_start - group_first_block, + bitmap_bh, group_end_block - group_first_block + 1); + + if (first_free_block < 0) { + /* + * no free block left on the bitmap, no point + * to reserve the space. return failed. + */ + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ + } + + start_block = first_free_block + group_first_block; + /* + * check if the first free block is within the + * free space we just reserved + */ + if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + return 0; /* success */ + /* + * if the first free bit we found is out of the reservable space + * continue search for next reservable space, + * start from where the free block is, + * we also shift the list head to where we stopped last time + */ + search_head = my_rsv; + spin_lock(rsv_lock); + goto retry; +} + +/** + * try_to_extend_reservation() + * @my_rsv: given reservation window + * @sb: super block + * @size: the delta to extend + * + * Attempt to expand the reservation window large enough to have + * required number of free blocks + * + * Since ext2_try_to_allocate() will always allocate blocks within + * the reservation window range, if the window size is too small, + * multiple blocks allocation has to stop at the end of the reservation + * window. To make this more efficient, given the total number of + * blocks needed and the current size of the window, we try to + * expand the reservation window size if necessary on a best-effort + * basis before ext2_new_blocks() tries to allocate blocks. 
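/*
 * A toy illustration of the sizing heuristic quoted above from
 * alloc_new_reservation(): when more than half of the previous window was
 * actually used, the next window size is doubled, up to a cap.  The cap
 * value used here (1027) is an assumption standing in for
 * EXT2_MAX_RESERVE_BLOCKS.
 */
#include <stdio.h>

#define MAX_RESERVE_BLOCKS 1027	/* assumed cap for the example */

static unsigned next_window_size(unsigned cur_size, unsigned window_len,
				 unsigned alloc_hit)
{
	if (alloc_hit > window_len / 2) {	/* used more than half: grow */
		cur_size *= 2;
		if (cur_size > MAX_RESERVE_BLOCKS)
			cur_size = MAX_RESERVE_BLOCKS;
	}
	return cur_size;
}

int main(void)
{
	/* 8-block goal size, 8-block old window, 6 blocks allocated from it */
	printf("next goal size: %u\n", next_window_size(8, 8, 6));
	/* same window, only 2 blocks used: size stays the same */
	printf("next goal size: %u\n", next_window_size(8, 8, 2));
	return 0;
}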
+ */ +static void try_to_extend_reservation(struct ext2_reserve_window_node *my_rsv, + struct super_block *sb, int size) +{ + struct ext2_reserve_window_node *next_rsv; + struct rb_node *next; + spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock; + + if (!spin_trylock(rsv_lock)) + return; + + next = rb_next(&my_rsv->rsv_node); + + if (!next) + my_rsv->rsv_end += size; + else { + next_rsv = rb_entry(next, struct ext2_reserve_window_node, rsv_node); + + if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) + my_rsv->rsv_end += size; + else + my_rsv->rsv_end = next_rsv->rsv_start - 1; + } + spin_unlock(rsv_lock); +} + +/** + * ext2_try_to_allocate_with_rsv() + * @sb: superblock + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * This is the main function used to allocate a new block and its reservation + * window. + * + * Each time when a new block allocation is need, first try to allocate from + * its own reservation. If it does not have a reservation window, instead of + * looking for a free bit on bitmap first, then look up the reservation list to + * see if it is inside somebody else's reservation window, we try to allocate a + * reservation window for it starting from the goal first. Then do the block + * allocation within the reservation window. + * + * This will avoid keeping on searching the reservation list again and + * again when somebody is looking for a free block (without + * reservation), and there are lots of free blocks, but they are all + * being reserved. + * + * We use a red-black tree for the per-filesystem reservation list. + */ +static ext2_grpblk_t +ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group, + struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal, + struct ext2_reserve_window_node * my_rsv, + unsigned long *count) +{ + ext2_fsblk_t group_first_block, group_last_block; + ext2_grpblk_t ret = 0; + unsigned long num = *count; + + /* + * we don't deal with reservation when + * filesystem is mounted without reservation + * or the file is not a regular file + * or last attempt to allocate a block with reservation turned on failed + */ + if (my_rsv == NULL) { + return ext2_try_to_allocate(sb, group, bitmap_bh, + grp_goal, count, NULL); + } + /* + * grp_goal is a group relative block number (if there is a goal) + * 0 <= grp_goal < EXT2_BLOCKS_PER_GROUP(sb) + * first block is a filesystem wide block number + * first block is the block number of the first block in this group + */ + group_first_block = ext2_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + + /* + * Basically we will allocate a new block from inode's reservation + * window. + * + * We need to allocate a new reservation window, if: + * a) inode does not have a reservation window; or + * b) last attempt to allocate a block from existing reservation + * failed; or + * c) we come here with a goal and with a reservation window + * + * We do not need to allocate a new reservation window if we come here + * at the beginning with a goal and the goal is inside the window, or + * we don't have a goal but already have a reservation window. + * then we could go to allocate from the reservation window directly. 
+ */ + while (1) { + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) { + if (my_rsv->rsv_goal_size < *count) + my_rsv->rsv_goal_size = *count; + ret = alloc_new_reservation(my_rsv, grp_goal, sb, + group, bitmap_bh); + if (ret < 0) + break; /* failed */ + + if (!goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) + grp_goal = -1; + } else if (grp_goal >= 0) { + int curr = my_rsv->rsv_end - + (grp_goal + group_first_block) + 1; + + if (curr < *count) + try_to_extend_reservation(my_rsv, sb, + *count - curr); + } + + if ((my_rsv->rsv_start > group_last_block) || + (my_rsv->rsv_end < group_first_block)) { + rsv_window_dump(&EXT2_SB(sb)->s_rsv_window_root, 1); + BUG(); + } + ret = ext2_try_to_allocate(sb, group, bitmap_bh, grp_goal, + &num, &my_rsv->rsv_window); + if (ret >= 0) { + my_rsv->rsv_alloc_hit += num; + *count = num; + break; /* succeed */ + } + num = *count; + } + return ret; +} + +/** + * ext2_has_free_blocks() + * @sbi: in-core super block structure. + * + * Check if filesystem has at least 1 free block available for allocation. + */ +static int ext2_has_free_blocks(struct ext2_sb_info *sbi) +{ + ext2_fsblk_t free_blocks, root_blocks; + + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current_fsuid() && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + return 0; + } + return 1; +} + +/* + * ext2_new_blocks() -- core block(s) allocation function + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: target number of blocks to allocate + * @errp: error code + * + * ext2_new_blocks uses a goal block to assist allocation. If the goal is + * free, or there is a free block within 32 blocks of the goal, that block + * is allocated. Otherwise a forward search is made for a free block; within + * each block group the search first looks for an entire free byte in the block + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, + unsigned long *count, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gdp_bh; + int group_no; + int goal_group; + ext2_grpblk_t grp_target_blk; /* blockgroup relative goal block */ + ext2_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ + ext2_fsblk_t ret_block; /* filesyetem-wide allocated block */ + int bgi; /* blockgroup iteration index */ + int performed_allocation = 0; + ext2_grpblk_t free_blocks; /* number of free blocks in a group */ + struct super_block *sb; + struct ext2_group_desc *gdp; + struct ext2_super_block *es; + struct ext2_sb_info *sbi; + struct ext2_reserve_window_node *my_rsv = NULL; + struct ext2_block_alloc_info *block_i; + unsigned short windowsz = 0; + unsigned long ngroups; + unsigned long num = *count; + int ret; + + *errp = -ENOSPC; + sb = inode->i_sb; + if (!sb) { + printk("ext2_new_blocks: nonexistent device"); + return 0; + } + + /* + * Check quota for allocation of this block. 
+ */ + ret = dquot_alloc_block(inode, num); + if (ret) { + *errp = ret; + return 0; + } + + sbi = EXT2_SB(sb); + es = EXT2_SB(sb)->s_es; + ext2_debug("goal=%lu.\n", goal); + /* + * Allocate a block from reservation only when + * filesystem is mounted with reservation(default,-o reservation), and + * it's a regular file, and + * the desired window size is greater than 0 (One could use ioctl + * command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn off + * reservation on that particular file) + */ + block_i = EXT2_I(inode)->i_block_alloc_info; + if (block_i) { + windowsz = block_i->rsv_window_node.rsv_goal_size; + if (windowsz > 0) + my_rsv = &block_i->rsv_window_node; + } + + if (!ext2_has_free_blocks(sbi)) { + *errp = -ENOSPC; + goto out; + } + + /* + * First, test whether the goal block is free. + */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group_no = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT2_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: + gdp = ext2_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * if there is not enough free blocks to make a new resevation + * turn off reservation for this allocation + */ + if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) + && (rsv_is_empty(&my_rsv->rsv_window))) + my_rsv = NULL; + + if (free_blocks > 0) { + grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT2_BLOCKS_PER_GROUP(sb)); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no, + bitmap_bh, grp_target_blk, + my_rsv, &num); + if (grp_alloc_blk >= 0) + goto allocated; + } + + ngroups = EXT2_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Now search the rest of the groups. We assume that + * group_no and gdp correctly point to the last group visited. + */ + for (bgi = 0; bgi < ngroups; bgi++) { + group_no++; + if (group_no >= ngroups) + group_no = 0; + gdp = ext2_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * skip this group (and avoid loading bitmap) if there + * are no free blocks + */ + if (!free_blocks) + continue; + /* + * skip this group if the number of + * free blocks is less than half of the reservation + * window size. + */ + if (my_rsv && (free_blocks <= (windowsz/2))) + continue; + + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + /* + * try to allocate block(s) from this group, without a goal(-1). + */ + grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no, + bitmap_bh, -1, my_rsv, &num); + if (grp_alloc_blk >= 0) + goto allocated; + } + /* + * We may end up a bogus earlier ENOSPC error due to + * filesystem is "full" of reservations, but + * there maybe indeed free blocks available on disk + * In this case, we just forget about the reservations + * just do block allocation as without reservations. 
+ */ + if (my_rsv) { + my_rsv = NULL; + windowsz = 0; + group_no = goal_group; + goto retry_alloc; + } + /* No space left on the device */ + *errp = -ENOSPC; + goto out; + +allocated: + + ext2_debug("using block group %d(%d)\n", + group_no, gdp->bg_free_blocks_count); + + ret_block = grp_alloc_blk + ext2_group_first_block_no(sb, group_no); + + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || + in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || + in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), + EXT2_SB(sb)->s_itb_per_group) || + in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), + EXT2_SB(sb)->s_itb_per_group)) { + ext2_error(sb, "ext2_new_blocks", + "Allocating block in system zone - " + "blocks from "E2FSBLK", length %lu", + ret_block, num); + /* + * ext2_try_to_allocate marked the blocks we allocated as in + * use. So we may want to selectively mark some of the blocks + * as free + */ + goto retry_alloc; + } + + performed_allocation = 1; + + if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { + ext2_error(sb, "ext2_new_blocks", + "block("E2FSBLK") >= blocks count(%d) - " + "block_group = %d, es == %p ", ret_block, + le32_to_cpu(es->s_blocks_count), group_no, es); + goto out; + } + + group_adjust_blocks(sb, group_no, gdp, gdp_bh, -num); + percpu_counter_sub(&sbi->s_freeblocks_counter, num); + + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + + *errp = 0; + brelse(bitmap_bh); + dquot_free_block_nodirty(inode, *count-num); + mark_inode_dirty(inode); + *count = num; + return ret_block; + +io_error: + *errp = -EIO; +out: + /* + * Undo the block allocation + */ + if (!performed_allocation) { + dquot_free_block_nodirty(inode, *count); + mark_inode_dirty(inode); + } + brelse(bitmap_bh); + return 0; +} + +ext2_fsblk_t ext2_new_block(struct inode *inode, unsigned long goal, int *errp) +{ + unsigned long count = 1; + + return ext2_new_blocks(inode, goal, &count, errp); +} + +#ifdef EXT2FS_DEBUG + +static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) +{ + unsigned int i; + unsigned long sum = 0; + + if (!map) + return (0); + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return (sum); +} + +#endif /* EXT2FS_DEBUG */ + +unsigned long ext2_count_free_blocks (struct super_block * sb) +{ + struct ext2_group_desc * desc; + unsigned long desc_count = 0; + int i; +#ifdef EXT2FS_DEBUG + unsigned long bitmap_count, x; + struct ext2_super_block *es; + + es = EXT2_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + desc = NULL; + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + struct buffer_head *bitmap_bh; + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_blocks_count); + bitmap_bh = read_block_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext2_count_free(bitmap_bh, sb->s_blocksize); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(desc->bg_free_blocks_count), x); + bitmap_count += x; + brelse(bitmap_bh); + } + printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n", + (long)le32_to_cpu(es->s_free_blocks_count), + desc_count, bitmap_count); + return bitmap_count; +#else + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += 
le16_to_cpu(desc->bg_free_blocks_count); + } + return desc_count; +#endif +} + +static inline int test_root(int a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext2_group_sparse(int group) +{ + if (group <= 1) + return 1; + return (test_root(group, 3) || test_root(group, 5) || + test_root(group, 7)); +} + +/** + * ext2_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext2_bg_has_super(struct super_block *sb, int group) +{ + if (EXT2_HAS_RO_COMPAT_FEATURE(sb,EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER)&& + !ext2_group_sparse(group)) + return 0; + return 1; +} + +/** + * ext2_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext2_bg_num_gdb(struct super_block *sb, int group) +{ + return ext2_bg_has_super(sb, group) ? EXT2_SB(sb)->s_gdb_count : 0; +} + diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c new file mode 100644 index 00000000..47cda410 --- /dev/null +++ b/fs/ext2/dir.c @@ -0,0 +1,733 @@ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include "ext2.h" +#include +#include +#include + +typedef struct ext2_dir_entry_2 ext2_dirent; + +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ +static inline unsigned ext2_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT2_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 ext2_rec_len_to_disk(unsigned len) +{ +#if (PAGE_CACHE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(EXT2_MAX_REC_LEN); + else + BUG_ON(len > (1 << 16)); +#endif + return cpu_to_le16(len); +} + +/* + * ext2 uses block-sized chunks. Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned ext2_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void ext2_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. 
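/*
 * A standalone sketch of the sparse-superblock rule implemented by
 * test_root()/ext2_group_sparse()/ext2_bg_has_super() above: with the
 * SPARSE_SUPER feature, only groups 0, 1 and powers of 3, 5 and 7 carry a
 * superblock backup.
 */
#include <stdio.h>

static int is_power_of(int a, int b)
{
	int num = b;

	while (a > num)
		num *= b;
	return num == a;
}

static int group_has_super(int group)
{
	if (group <= 1)
		return 1;
	return is_power_of(group, 3) || is_power_of(group, 5) ||
	       is_power_of(group, 7);
}

int main(void)
{
	printf("groups with a superblock backup, 0..50:");
	for (int g = 0; g <= 50; g++)
		if (group_has_super(g))
			printf(" %d", g);
	printf("\n");	/* expect 0 1 3 5 7 9 25 27 49 */
	return 0;
}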
+ */ +static unsigned +ext2_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + block_write_end(NULL, mapping, pos, len, len, page, NULL); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + + if (IS_DIRSYNC(dir)) { + err = write_one_page(page, 1); + if (!err) + err = sync_inode_metadata(dir, 1); + } else { + unlock_page(page); + } + + return err; +} + +static void ext2_check_page(struct page *page, int quiet) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = ext2_chunk_size(dir); + char *kaddr = page_address(page); + u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + ext2_dirent *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) { + p = (ext2_dirent *)(kaddr + offs); + rec_len = ext2_rec_len_from_disk(p->rec_len); + + if (unlikely(rec_len < EXT2_DIR_REC_LEN(1))) + goto Eshort; + if (unlikely(rec_len & 3)) + goto Ealign; + if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len))) + goto Enamelen; + if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))) + goto Espan; + if (unlikely(le32_to_cpu(p->inode) > max_inumber)) + goto Einumber; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + if (!quiet) + ext2_error(sb, __func__, + "size of directory #%lu is not a multiple " + "of chunk size", dir->i_ino); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "inode out of bounds"; +bad_entry: + if (!quiet) + ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<inode), + rec_len, p->name_len); + goto fail; +Eend: + if (!quiet) { + p = (ext2_dirent *)(kaddr + offs); + ext2_error(sb, "ext2_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<inode)); + } +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page * ext2_get_page(struct inode *dir, unsigned long n, + int quiet) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + ext2_check_page(page, quiet); + if (PageError(page)) + goto fail; + } + return page; + +fail: + ext2_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, ext2_match returns 1 for success, 0 for failure. + * + * len <= EXT2_NAME_LEN and de != NULL are guaranteed by caller. 
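/*
 * A small userspace sketch of the rec_len encoding handled by
 * ext2_rec_len_from_disk()/ext2_rec_len_to_disk() above: a directory
 * chunk of 65536 bytes cannot be stored in the 16-bit rec_len field, so a
 * sentinel value stands in for it on disk when 64 KiB blocks are possible.
 * The sentinel used here (65535) is an assumption mirroring
 * EXT2_MAX_REC_LEN; byte-order conversion is omitted.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_REC_LEN 65535u	/* assumed sentinel */

static uint16_t rec_len_to_disk(unsigned len)
{
	if (len == (1u << 16))
		return MAX_REC_LEN;
	return (uint16_t)len;	/* caller guarantees len < 65536 otherwise */
}

static unsigned rec_len_from_disk(uint16_t dlen)
{
	if (dlen == MAX_REC_LEN)
		return 1u << 16;
	return dlen;
}

int main(void)
{
	unsigned lens[] = { 12, 4096, 1u << 16 };

	for (int i = 0; i < 3; i++)
		printf("%u -> on disk %u -> back to %u\n", lens[i],
		       rec_len_to_disk(lens[i]),
		       rec_len_from_disk(rec_len_to_disk(lens[i])));
	return 0;
}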
+ */ +static inline int ext2_match (int len, const char * const name, + struct ext2_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static inline ext2_dirent *ext2_next_entry(ext2_dirent *p) +{ + return (ext2_dirent *)((char *)p + + ext2_rec_len_from_disk(p->rec_len)); +} + +static inline unsigned +ext2_validate_entry(char *base, unsigned offset, unsigned mask) +{ + ext2_dirent *de = (ext2_dirent*)(base + offset); + ext2_dirent *p = (ext2_dirent*)(base + (offset&mask)); + while ((char*)p < (char*)de) { + if (p->rec_len == 0) + break; + p = ext2_next_entry(p); + } + return (char *)p - base; +} + +static unsigned char ext2_filetype_table[EXT2_FT_MAX] = { + [EXT2_FT_UNKNOWN] = DT_UNKNOWN, + [EXT2_FT_REG_FILE] = DT_REG, + [EXT2_FT_DIR] = DT_DIR, + [EXT2_FT_CHRDEV] = DT_CHR, + [EXT2_FT_BLKDEV] = DT_BLK, + [EXT2_FT_FIFO] = DT_FIFO, + [EXT2_FT_SOCK] = DT_SOCK, + [EXT2_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK, +}; + +static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + else + de->file_type = 0; +} + +static int +ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); + unsigned char *types = NULL; + int need_revalidate = filp->f_version != inode->i_version; + + if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) + return 0; + + if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) + types = ext2_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + ext2_dirent *de; + struct page *page = ext2_get_page(inode, n, 0); + + if (IS_ERR(page)) { + ext2_error(sb, __func__, + "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + return PTR_ERR(page); + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ext2_validate_entry(kaddr, offset, chunk_mask); + filp->f_pos = (n<f_version = inode->i_version; + need_revalidate = 0; + } + de = (ext2_dirent *)(kaddr+offset); + limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1); + for ( ;(char*)de <= limit; de = ext2_next_entry(de)) { + if (de->rec_len == 0) { + ext2_error(sb, __func__, + "zero-length directory entry"); + ext2_put_page(page); + return -EIO; + } + if (de->inode) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < EXT2_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<inode), d_type); + if (over) { + ext2_put_page(page); + return 0; + } + } + filp->f_pos += ext2_rec_len_from_disk(de->rec_len); + } + ext2_put_page(page); + } + 
return 0; +} + +/* + * ext2_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, + struct qstr *child, struct page ** res_page) +{ + const char *name = child->name; + int namelen = child->len; + unsigned reclen = EXT2_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct ext2_inode_info *ei = EXT2_I(dir); + ext2_dirent * de; + int dir_has_error = 0; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = ext2_get_page(dir, n, dir_has_error); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (ext2_dirent *) kaddr; + kaddr += ext2_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + ext2_error(dir->i_sb, __func__, + "zero-length directory entry"); + ext2_put_page(page); + goto out; + } + if (ext2_match (namelen, name, de)) + goto found; + de = ext2_next_entry(de); + } + ext2_put_page(page); + } else + dir_has_error = 1; + + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + ext2_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block count %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) +{ + struct page *page = ext2_get_page(dir, 0, 0); + ext2_dirent *de = NULL; + + if (!IS_ERR(page)) { + de = ext2_next_entry((ext2_dirent *) page_address(page)); + *p = page; + } + return de; +} + +ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child) +{ + ino_t res = 0; + struct ext2_dir_entry_2 *de; + struct page *page; + + de = ext2_find_entry (dir, child, &page); + if (de) { + res = le32_to_cpu(de->inode); + ext2_put_page(page); + } + return res; +} + +static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, ext2_get_block); +} + +/* Releases the page */ +void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, + struct page *page, struct inode *inode, int update_times) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = ext2_rec_len_from_disk(de->rec_len); + int err; + + lock_page(page); + err = ext2_prepare_chunk(page, pos, len); + BUG_ON(err); + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type(de, inode); + err = ext2_commit_chunk(page, pos, len); + ext2_put_page(page); + if (update_times) + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(dir); +} + +/* + * Parent is locked. 
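/*
 * A toy sketch of the slot-finding rule in ext2_add_link() below: an
 * existing entry can host a new one if it is unused and big enough, or if
 * its rec_len has enough slack beyond the space its own name needs.  The
 * 8-byte header and 4-byte alignment mirror EXT2_DIR_REC_LEN(), but the
 * inode numbers and lengths are illustrative only.
 */
#include <stdio.h>

#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3u)	/* header + name, 4-byte aligned */

static int entry_can_host(unsigned inode, unsigned rec_len,
			  unsigned name_len, unsigned new_name_len)
{
	unsigned needed = REC_LEN(new_name_len);
	unsigned used = REC_LEN(name_len);

	if (!inode && rec_len >= needed)
		return 1;			/* unused slot, big enough */
	return rec_len >= used + needed;	/* enough slack after the live name */
}

int main(void)
{
	/* live 4-char name padded out to a 1024-byte rec_len: plenty of slack */
	printf("can host a 10-char name: %d\n", entry_can_host(12, 1024, 4, 10));
	/* live 4-char name tightly packed into 12 bytes: no room */
	printf("can host a 10-char name: %d\n", entry_can_host(13, 12, 4, 10));
	return 0;
}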
+ */ +int ext2_add_link (struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = ext2_chunk_size(dir); + unsigned reclen = EXT2_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + ext2_dirent * de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = ext2_get_page(dir, n, 0); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + ext2_last_byte(dir, n); + de = (ext2_dirent *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = ext2_rec_len_to_disk(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + ext2_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (ext2_match (namelen, name, de)) + goto out_unlock; + name_len = EXT2_DIR_REC_LEN(de->name_len); + rec_len = ext2_rec_len_from_disk(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (ext2_dirent *) ((char *) de + rec_len); + } + unlock_page(page); + ext2_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char*)de - (char*)page_address(page); + err = ext2_prepare_chunk(page, pos, rec_len); + if (err) + goto out_unlock; + if (de->inode) { + ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len); + de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len); + de->rec_len = ext2_rec_len_to_disk(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type (de, inode); + err = ext2_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + ext2_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * ext2_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. Releases the page. 
+ */ +int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) +{ + struct inode *inode = page->mapping->host; + char *kaddr = page_address(page); + unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); + unsigned to = ((char *)dir - kaddr) + + ext2_rec_len_from_disk(dir->rec_len); + loff_t pos; + ext2_dirent * pde = NULL; + ext2_dirent * de = (ext2_dirent *) (kaddr + from); + int err; + + while ((char*)de < (char*)dir) { + if (de->rec_len == 0) { + ext2_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = ext2_next_entry(de); + } + if (pde) + from = (char*)pde - (char*)page_address(page); + pos = page_offset(page) + from; + lock_page(page); + err = ext2_prepare_chunk(page, pos, to - from); + BUG_ON(err); + if (pde) + pde->rec_len = ext2_rec_len_to_disk(to - from); + dir->inode = 0; + err = ext2_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(inode); +out: + ext2_put_page(page); + return err; +} + +/* + * Set the first fragment of directory. + */ +int ext2_make_empty(struct inode *inode, struct inode *parent) +{ + struct page *page = grab_cache_page(inode->i_mapping, 0); + unsigned chunk_size = ext2_chunk_size(inode); + struct ext2_dir_entry_2 * de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = ext2_prepare_chunk(page, 0, chunk_size); + if (err) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, chunk_size); + de = (struct ext2_dir_entry_2 *)kaddr; + de->name_len = 1; + de->rec_len = ext2_rec_len_to_disk(EXT2_DIR_REC_LEN(1)); + memcpy (de->name, ".\0\0", 4); + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type (de, inode); + + de = (struct ext2_dir_entry_2 *)(kaddr + EXT2_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = ext2_rec_len_to_disk(chunk_size - EXT2_DIR_REC_LEN(1)); + de->inode = cpu_to_le32(parent->i_ino); + memcpy (de->name, "..\0", 4); + ext2_set_de_type (de, inode); + kunmap_atomic(kaddr, KM_USER0); + err = ext2_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ext2_empty_dir (struct inode * inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + int dir_has_error = 0; + + for (i = 0; i < npages; i++) { + char *kaddr; + ext2_dirent * de; + page = ext2_get_page(inode, i, dir_has_error); + + if (IS_ERR(page)) { + dir_has_error = 1; + continue; + } + + kaddr = page_address(page); + de = (ext2_dirent *)kaddr; + kaddr += ext2_last_byte(inode, i) - EXT2_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + ext2_error(inode->i_sb, __func__, + "zero-length directory entry"); + printk("kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. 
*/ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le32(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = ext2_next_entry(de); + } + ext2_put_page(page); + } + return 1; + +not_empty: + ext2_put_page(page); + return 0; +} + +const struct file_operations ext2_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ext2_readdir, + .unlocked_ioctl = ext2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext2_compat_ioctl, +#endif + .fsync = ext2_fsync, +}; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h new file mode 100644 index 00000000..645be9e7 --- /dev/null +++ b/fs/ext2/ext2.h @@ -0,0 +1,182 @@ +#include +#include + +/* + * ext2 mount options + */ +struct ext2_mount_options { + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; +}; + +/* + * second extended file system inode data in memory + */ +struct ext2_inode_info { + __le32 i_data[15]; + __u32 i_flags; + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; + __u16 i_state; + __u32 i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + __u32 i_block_group; + + /* block reservation info */ + struct ext2_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; +#ifdef CONFIG_EXT2_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + rwlock_t i_meta_lock; + + /* + * truncate_mutex is for serialising ext2_truncate() against + * ext2_getblock(). It also protects the internals of the inode's + * reservation data structures: ext2_reserve_window and + * ext2_reserve_window_node. + */ + struct mutex truncate_mutex; + struct inode vfs_inode; + struct list_head i_orphan; /* unlinked but open inodes */ +}; + +/* + * Inode dynamic state flags + */ +#define EXT2_STATE_NEW 0x00000001 /* inode is newly created */ + + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext2 source programs needs to include it so they are duplicated here. 
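The EXT2_I() helper just below recovers the ext2-private inode from the VFS inode embedded in struct ext2_inode_info above. A small self-contained sketch of that container_of() idiom; the demo_* types are stand-ins, not the kernel structures.

#include <stddef.h>
#include <stdio.h>

#define demo_container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_vfs_inode { unsigned long i_ino; };

struct demo_ext2_inode {
        unsigned int i_flags;
        struct demo_vfs_inode vfs_inode;   /* embedded generic part */
};

static struct demo_ext2_inode *DEMO_EXT2_I(struct demo_vfs_inode *inode)
{
        return demo_container_of(inode, struct demo_ext2_inode, vfs_inode);
}

int main(void)
{
        struct demo_ext2_inode ei = { .i_flags = 0x1, .vfs_inode = { 42 } };
        struct demo_vfs_inode *inode = &ei.vfs_inode;

        /* Pointer arithmetic with offsetof() recovers the container. */
        printf("ino %lu has flags %#x\n",
               inode->i_ino, DEMO_EXT2_I(inode)->i_flags);
        return 0;
}

Embedding the VFS inode inside the filesystem-private structure means a single allocation backs both objects, so every EXT2_I() call is just constant pointer arithmetic.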
+ */ + +static inline struct ext2_inode_info *EXT2_I(struct inode *inode) +{ + return container_of(inode, struct ext2_inode_info, vfs_inode); +} + +/* balloc.c */ +extern int ext2_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group); +extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *); +extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long, + unsigned long *, int *); +extern void ext2_free_blocks (struct inode *, unsigned long, + unsigned long); +extern unsigned long ext2_count_free_blocks (struct super_block *); +extern unsigned long ext2_count_dirs (struct super_block *); +extern void ext2_check_blocks_bitmap (struct super_block *); +extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); +extern void ext2_discard_reservation (struct inode *); +extern int ext2_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext2_init_block_alloc_info(struct inode *); +extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv); + +/* dir.c */ +extern int ext2_add_link (struct dentry *, struct inode *); +extern ino_t ext2_inode_by_name(struct inode *, struct qstr *); +extern int ext2_make_empty(struct inode *, struct inode *); +extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **); +extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *); +extern int ext2_empty_dir (struct inode *); +extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); +extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); + +/* ialloc.c */ +extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *); +extern void ext2_free_inode (struct inode *); +extern unsigned long ext2_count_free_inodes (struct super_block *); +extern void ext2_check_inodes_bitmap (struct super_block *); +extern unsigned long ext2_count_free (struct buffer_head *, unsigned); + +/* inode.c */ +extern struct inode *ext2_iget (struct super_block *, unsigned long); +extern int ext2_write_inode (struct inode *, struct writeback_control *); +extern void ext2_evict_inode(struct inode *); +extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int ext2_setattr (struct dentry *, struct iattr *); +extern void ext2_set_inode_flags(struct inode *inode); +extern void ext2_get_inode_flags(struct ext2_inode_info *); +extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); + +/* ioctl.c */ +extern long ext2_ioctl(struct file *, unsigned int, unsigned long); +extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* namei.c */ +struct dentry *ext2_get_parent(struct dentry *child); + +/* super.c */ +extern void ext2_error (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext2_msg(struct super_block *, const char *, const char *, ...) 
+ __attribute__ ((format (printf, 3, 4))); +extern void ext2_update_dynamic_rev (struct super_block *sb); +extern void ext2_write_super (struct super_block *); + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext2_dir_operations; + +/* file.c */ +extern int ext2_fsync(struct file *file, int datasync); +extern const struct inode_operations ext2_file_inode_operations; +extern const struct file_operations ext2_file_operations; +extern const struct file_operations ext2_xip_file_operations; + +/* inode.c */ +extern const struct address_space_operations ext2_aops; +extern const struct address_space_operations ext2_aops_xip; +extern const struct address_space_operations ext2_nobh_aops; + +/* namei.c */ +extern const struct inode_operations ext2_dir_inode_operations; +extern const struct inode_operations ext2_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext2_fast_symlink_inode_operations; +extern const struct inode_operations ext2_symlink_inode_operations; + +static inline ext2_fsblk_t +ext2_group_first_block_no(struct super_block *sb, unsigned long group_no) +{ + return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block); +} + +#define ext2_set_bit __test_and_set_bit_le +#define ext2_clear_bit __test_and_clear_bit_le +#define ext2_test_bit test_bit_le +#define ext2_find_first_zero_bit find_first_zero_bit_le +#define ext2_find_next_zero_bit find_next_zero_bit_le diff --git a/fs/ext2/file.c b/fs/ext2/file.c new file mode 100644 index 00000000..49eec945 --- /dev/null +++ b/fs/ext2/file.c @@ -0,0 +1,107 @@ +/* + * linux/fs/ext2/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Called when filp is released. This happens when all file descriptors + * for a single struct file are closed. Note that different open() calls + * for the same file yield different struct file structures. + */ +static int ext2_release_file (struct inode * inode, struct file * filp) +{ + if (filp->f_mode & FMODE_WRITE) { + mutex_lock(&EXT2_I(inode)->truncate_mutex); + ext2_discard_reservation(inode); + mutex_unlock(&EXT2_I(inode)->truncate_mutex); + } + return 0; +} + +int ext2_fsync(struct file *file, int datasync) +{ + int ret; + struct super_block *sb = file->f_mapping->host->i_sb; + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + + ret = generic_file_fsync(file, datasync); + if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { + /* We don't really know where the IO error happened... */ + ext2_error(sb, __func__, + "detected IO error when writing metadata buffers"); + ret = -EIO; + } + return ret; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the ext2 filesystem. 
+ */ +const struct file_operations ext2_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = ext2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext2_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = dquot_file_open, + .release = ext2_release_file, + .fsync = ext2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +#ifdef CONFIG_EXT2_FS_XIP +const struct file_operations ext2_xip_file_operations = { + .llseek = generic_file_llseek, + .read = xip_file_read, + .write = xip_file_write, + .unlocked_ioctl = ext2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext2_compat_ioctl, +#endif + .mmap = xip_file_mmap, + .open = dquot_file_open, + .release = ext2_release_file, + .fsync = ext2_fsync, +}; +#endif + +const struct inode_operations ext2_file_inode_operations = { +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif + .setattr = ext2_setattr, + .check_acl = ext2_check_acl, + .fiemap = ext2_fiemap, +}; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c new file mode 100644 index 00000000..ee9ed319 --- /dev/null +++ b/fs/ext2/ialloc.c @@ -0,0 +1,674 @@ +/* + * linux/fs/ext2/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. 
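The allocation and free paths below (ext2_free_inode(), ext2_new_inode()) convert between an inode number and a (block group, bitmap bit) pair with the same (ino - 1) / per-group and (ino - 1) % per-group split. A worked example with an invented inodes-per-group value:

#include <stdio.h>

int main(void)
{
        unsigned long inodes_per_group = 2048;   /* assumed s_inodes_per_group */
        unsigned long ino = 5000;                /* inode numbers start at 1   */

        unsigned long group = (ino - 1) / inodes_per_group;   /* group 2 */
        unsigned long bit   = (ino - 1) % inodes_per_group;   /* bit 903 */

        printf("inode %lu -> group %lu, bitmap bit %lu\n", ino, group, bit);

        /* reverse mapping, used once a free bit has been found */
        printf("group %lu, bit %lu -> inode %lu\n",
               group, bit, group * inodes_per_group + bit + 1);
        return 0;
}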
+ */ +static struct buffer_head * +read_inode_bitmap(struct super_block * sb, unsigned long block_group) +{ + struct ext2_group_desc *desc; + struct buffer_head *bh = NULL; + + desc = ext2_get_group_desc(sb, block_group, NULL); + if (!desc) + goto error_out; + + bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); + if (!bh) + ext2_error(sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %u", + block_group, le32_to_cpu(desc->bg_inode_bitmap)); +error_out: + return bh; +} + +static void ext2_release_inode(struct super_block *sb, int group, int dir) +{ + struct ext2_group_desc * desc; + struct buffer_head *bh; + + desc = ext2_get_group_desc(sb, group, &bh); + if (!desc) { + ext2_error(sb, "ext2_release_inode", + "can't get descriptor for group %d", group); + return; + } + + spin_lock(sb_bgl_lock(EXT2_SB(sb), group)); + le16_add_cpu(&desc->bg_free_inodes_count, 1); + if (dir) + le16_add_cpu(&desc->bg_used_dirs_count, -1); + spin_unlock(sb_bgl_lock(EXT2_SB(sb), group)); + if (dir) + percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter); + sb->s_dirt = 1; + mark_buffer_dirty(bh); +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext2_free_inode (struct inode * inode) +{ + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh; + unsigned long block_group; + unsigned long bit; + struct ext2_super_block * es; + + ino = inode->i_ino; + ext2_debug ("freeing inode %lu\n", ino); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + /* Quota is already initialized in iput() */ + ext2_xattr_delete_inode(inode); + dquot_free_inode(inode); + dquot_drop(inode); + + es = EXT2_SB(sb)->s_es; + is_directory = S_ISDIR(inode->i_mode); + + if (ino < EXT2_FIRST_INO(sb) || + ino > le32_to_cpu(es->s_inodes_count)) { + ext2_error (sb, "ext2_free_inode", + "reserved or nonexistent inode %lu", ino); + return; + } + block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group), + bit, (void *) bitmap_bh->b_data)) + ext2_error (sb, "ext2_free_inode", + "bit already cleared for inode %lu", ino); + else + ext2_release_inode(sb, block_group, is_directory); + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + + brelse(bitmap_bh); +} + +/* + * We perform asynchronous prereading of the new inode's inode block when + * we create the inode, in the expectation that the inode will be written + * back soon. 
There are two reasons: + * + * - When creating a large number of files, the async prereads will be + * nicely merged into large reads + * - When writing out a large number of inodes, we don't need to keep on + * stalling the writes while we read the inode block. + * + * FIXME: ext2_get_group_desc() needs to be simplified. + */ +static void ext2_preread_inode(struct inode *inode) +{ + unsigned long block_group; + unsigned long offset; + unsigned long block; + struct ext2_group_desc * gdp; + struct backing_dev_info *bdi; + + bdi = inode->i_mapping->backing_dev_info; + if (bdi_read_congested(bdi)) + return; + if (bdi_write_congested(bdi)) + return; + + block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); + if (gdp == NULL) + return; + + /* + * Figure out the offset within the block group inode table + */ + offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) * + EXT2_INODE_SIZE(inode->i_sb); + block = le32_to_cpu(gdp->bg_inode_table) + + (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb)); + sb_breadahead(inode->i_sb, block); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory\'s block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent) +{ + int ngroups = EXT2_SB(sb)->s_groups_count; + int avefreei = ext2_count_free_inodes(sb) / ngroups; + struct ext2_group_desc *desc, *best_desc = NULL; + int group, best_group = -1; + + for (group = 0; group < ngroups; group++) { + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (!best_desc || + (le16_to_cpu(desc->bg_free_blocks_count) > + le16_to_cpu(best_desc->bg_free_blocks_count))) { + best_group = group; + best_desc = desc; + } + } + if (!best_desc) + return -1; + + return best_group; +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * it's already running too large debt (max_debt). + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. 
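A sketch of the threshold arithmetic that find_group_orlov() below derives from the per-filesystem averages before accepting a group for a new directory. All input numbers are invented example values; only the formulas mirror the code.

#include <stdio.h>

int main(void)
{
        /* invented filesystem-wide numbers */
        int ngroups = 8, ndirs = 120;
        int inodes_per_group = 2048, blocks_per_group = 8192;
        int free_inodes = 9000, free_blocks = 40000, blocks_count = 65536;
        const int inode_cost = 64, block_cost = 256;   /* INODE_COST/BLOCK_COST */

        int avefreei = free_inodes / ngroups;
        int avefreeb = free_blocks / ngroups;
        int blocks_per_dir = (blocks_count - free_blocks) / ndirs;

        int max_dirs   = ndirs / ngroups + inodes_per_group / 16;
        int min_inodes = avefreei - inodes_per_group / 4;
        int min_blocks = avefreeb - blocks_per_group / 4;

        int max_debt = blocks_per_group /
                       (blocks_per_dir > block_cost ? blocks_per_dir : block_cost);
        if (max_debt * inode_cost > inodes_per_group)
                max_debt = inodes_per_group / inode_cost;
        if (max_debt > 255)
                max_debt = 255;
        if (max_debt == 0)
                max_debt = 1;

        printf("max_dirs=%d min_inodes=%d min_blocks=%d max_debt=%d\n",
               max_dirs, min_inodes, min_blocks, max_debt);
        return 0;
}

A group is rejected when it already holds max_dirs directories, has fewer than min_inodes free inodes or min_blocks free blocks, or has accumulated max_debt of allocation debt.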
+ */ + +#define INODE_COST 64 +#define BLOCK_COST 256 + +static int find_group_orlov(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT2_I(parent)->i_block_group; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + int ngroups = sbi->s_groups_count; + int inodes_per_group = EXT2_INODES_PER_GROUP(sb); + int freei; + int avefreei; + int free_blocks; + int avefreeb; + int blocks_per_dir; + int ndirs; + int max_debt, max_dirs, min_blocks, min_inodes; + int group = -1, i; + struct ext2_group_desc *desc; + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = free_blocks / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if ((parent == sb->s_root->d_inode) || + (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) { + struct ext2_group_desc *best_desc = NULL; + int best_ndir = inodes_per_group; + int best_group = -1; + + get_random_bytes(&group, sizeof(group)); + parent_group = (unsigned)group % ngroups; + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + continue; + best_group = group; + best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + best_desc = desc; + } + if (best_group >= 0) { + desc = best_desc; + group = best_group; + goto found; + } + goto fallback; + } + + if (ndirs == 0) + ndirs = 1; /* percpu_counters are approximate... 
*/ + + blocks_per_dir = (le32_to_cpu(es->s_blocks_count)-free_blocks) / ndirs; + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT2_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT2_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; + + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (sbi->s_debts[group] >= max_debt) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + continue; + goto found; + } + +fallback: + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + goto found; + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback; + } + + return -1; + +found: + return group; +} + +static int find_group_other(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT2_I(parent)->i_block_group; + int ngroups = EXT2_SB(sb)->s_groups_count; + struct ext2_group_desc *desc; + int group, i; + + /* + * Try to place the inode in its parent directory + */ + group = parent_group; + desc = ext2_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + goto found; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some + * free blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext2_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + goto found; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. 
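A sketch of the two probing passes in find_group_other() here: after the parent's own group is rejected, the search takes quadratic steps of 1, 2, 4, ... and then falls back to the plain linear scan that follows. Group count and starting group are arbitrary example values, and the i_ino-based offset of the starting point in the real code is omitted.

#include <stdio.h>

int main(void)
{
        int ngroups = 16, parent_group = 5;
        int group, i;

        /* quadratic hash pass */
        group = parent_group;
        for (i = 1; i < ngroups; i <<= 1) {
                group += i;
                if (group >= ngroups)
                        group -= ngroups;
                printf("quadratic probe: group %d\n", group);
        }

        /* linear fallback pass */
        group = parent_group;
        for (i = 0; i < ngroups; i++) {
                if (++group >= ngroups)
                        group = 0;
                printf("linear probe: group %d\n", group);
        }
        return 0;
}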
+ */ + group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext2_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + goto found; + } + + return -1; + +found: + return group; +} + +struct inode *ext2_new_inode(struct inode *dir, int mode, + const struct qstr *qstr) +{ + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + int group, i; + ino_t ino = 0; + struct inode * inode; + struct ext2_group_desc *gdp; + struct ext2_super_block *es; + struct ext2_inode_info *ei; + struct ext2_sb_info *sbi; + int err; + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + ei = EXT2_I(inode); + sbi = EXT2_SB(sb); + es = sbi->s_es; + if (S_ISDIR(mode)) { + if (test_opt(sb, OLDALLOC)) + group = find_group_dir(sb, dir); + else + group = find_group_orlov(sb, dir); + } else + group = find_group_other(sb, dir); + + if (group == -1) { + err = -ENOSPC; + goto fail; + } + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext2_get_group_desc(sb, group, &bh2); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto fail; + } + ino = 0; + +repeat_in_this_group: + ino = ext2_find_next_zero_bit((unsigned long *)bitmap_bh->b_data, + EXT2_INODES_PER_GROUP(sb), ino); + if (ino >= EXT2_INODES_PER_GROUP(sb)) { + /* + * Rare race: find_group_xx() decided that there were + * free inodes in this group, but by the time we tried + * to allocate one, they're all gone. This can also + * occur because the counters which find_group_orlov() + * uses are approximate. So just go and search the + * next block group. + */ + if (++group == sbi->s_groups_count) + group = 0; + continue; + } + if (ext2_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) { + /* we lost this inode */ + if (++ino >= EXT2_INODES_PER_GROUP(sb)) { + /* this group is exhausted, try next group */ + if (++group == sbi->s_groups_count) + group = 0; + continue; + } + /* try to find free inode in the same group */ + goto repeat_in_this_group; + } + goto got; + } + + /* + * Scanned all blockgroups. 
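A user-space stand-in for the bitmap scan inside the allocation loop above, where ext2_find_next_zero_bit() looks for a free inode bit starting at a given position. The kernel helpers operate on little-endian long words; this demo uses plain byte addressing and invented data.

#include <stdio.h>

static long demo_find_next_zero_bit(const unsigned char *bitmap,
                                    long size, long start)
{
        long i;

        for (i = start; i < size; i++)
                if (!(bitmap[i / 8] & (1u << (i % 8))))
                        return i;
        return size;             /* "nothing found" is reported as size */
}

int main(void)
{
        unsigned char bitmap[4] = { 0xff, 0xff, 0x7f, 0x00 };  /* bit 23 free */
        long bit = demo_find_next_zero_bit(bitmap, 32, 0);

        if (bit < 32)
                printf("first free bit: %ld\n", bit);
        else
                printf("group is full\n");
        return 0;
}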
+ */ + err = -ENOSPC; + goto fail; +got: + mark_buffer_dirty(bitmap_bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bitmap_bh); + brelse(bitmap_bh); + + ino += group * EXT2_INODES_PER_GROUP(sb) + 1; + if (ino < EXT2_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext2_error (sb, "ext2_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d,inode=%lu", group, + (unsigned long) ino); + err = -EIO; + goto fail; + } + + percpu_counter_add(&sbi->s_freeinodes_counter, -1); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_inodes_count, -1); + if (S_ISDIR(mode)) { + if (sbi->s_debts[group] < 255) + sbi->s_debts[group]++; + le16_add_cpu(&gdp->bg_used_dirs_count, 1); + } else { + if (sbi->s_debts[group]) + sbi->s_debts[group]--; + } + spin_unlock(sb_bgl_lock(sbi, group)); + + sb->s_dirt = 1; + mark_buffer_dirty(bh2); + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_flags = + ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); + ei->i_faddr = 0; + ei->i_frag_no = 0; + ei->i_frag_size = 0; + ei->i_file_acl = 0; + ei->i_dir_acl = 0; + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; + ei->i_dir_start_lookup = 0; + ei->i_state = EXT2_STATE_NEW; + ext2_set_inode_flags(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + if (insert_inode_locked(inode) < 0) { + err = -EINVAL; + goto fail_drop; + } + + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext2_init_acl(inode, dir); + if (err) + goto fail_free_drop; + + err = ext2_init_security(inode, dir, qstr); + if (err) + goto fail_free_drop; + + mark_inode_dirty(inode); + ext2_debug("allocating inode %lu\n", inode->i_ino); + ext2_preread_inode(inode); + return inode; + +fail_free_drop: + dquot_free_inode(inode); + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); + +fail: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +unsigned long ext2_count_free_inodes (struct super_block * sb) +{ + struct ext2_group_desc *desc; + unsigned long desc_count = 0; + int i; + +#ifdef EXT2FS_DEBUG + struct ext2_super_block *es; + unsigned long bitmap_count = 0; + struct buffer_head *bitmap_bh = NULL; + + es = EXT2_SB(sb)->s_es; + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + unsigned x; + + desc = ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext2_count_free(bitmap_bh, EXT2_INODES_PER_GROUP(sb) / 8); + printk("group %d: stored = %d, counted = %u\n", + i, le16_to_cpu(desc->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n", + percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter), + desc_count, bitmap_count); + return desc_count; +#else + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = 
ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_inodes_count); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext2_count_dirs (struct super_block * sb) +{ + unsigned long count = 0; + int i; + + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + struct ext2_group_desc *gdp = ext2_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + count += le16_to_cpu(gdp->bg_used_dirs_count); + } + return count; +} + diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c new file mode 100644 index 00000000..788e09a0 --- /dev/null +++ b/fs/ext2/inode.c @@ -0,0 +1,1552 @@ +/* + * linux/fs/ext2/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@dcs.ed.ac.uk), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "acl.h" +#include "xip.h" + +MODULE_AUTHOR("Remy Card and others"); +MODULE_DESCRIPTION("Second Extended Filesystem"); +MODULE_LICENSE("GPL"); + +static int __ext2_write_inode(struct inode *inode, int do_sync); + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ext2_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT2_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && + inode->i_blocks - ea_blocks == 0); +} + +static void ext2_truncate_blocks(struct inode *inode, loff_t offset); + +static void ext2_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + ext2_truncate_blocks(inode, inode->i_size); + } +} + +/* + * Called at the last iput() if i_nlink is zero. 
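ext2_inode_is_fast_symlink() above distinguishes "fast" symlinks, whose target is stored inline in the inode's i_data[] area rather than in a data block. A hedged sketch of the size test, assuming the usual 60 bytes of inline space (15 * sizeof(__le32)); the demo names and limit handling are illustrative, not kernel code.

#include <stdio.h>
#include <string.h>

#define DEMO_INLINE_MAX (15 * 4)       /* assumed bytes available in i_data[] */

static int demo_symlink_is_fast(const char *target)
{
        /* target plus its terminating NUL must fit in the inline area */
        return strlen(target) + 1 <= DEMO_INLINE_MAX;
}

int main(void)
{
        const char *short_t = "/etc/passwd";
        const char *long_t =
                "/a/very/long/path/that/definitely/exceeds/sixty/characters/total";

        printf("%s -> %s\n", short_t,
               demo_symlink_is_fast(short_t) ? "fast (inline)" : "slow (block)");
        printf("%s -> %s\n", long_t,
               demo_symlink_is_fast(long_t) ? "fast (inline)" : "slow (block)");
        return 0;
}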
+ */ +void ext2_evict_inode(struct inode * inode) +{ + struct ext2_block_alloc_info *rsv; + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) { + want_delete = 1; + dquot_initialize(inode); + } else { + dquot_drop(inode); + } + + truncate_inode_pages(&inode->i_data, 0); + + if (want_delete) { + /* set dtime */ + EXT2_I(inode)->i_dtime = get_seconds(); + mark_inode_dirty(inode); + __ext2_write_inode(inode, inode_needs_sync(inode)); + /* truncate to 0 */ + inode->i_size = 0; + if (inode->i_blocks) + ext2_truncate_blocks(inode, 0); + } + + invalidate_inode_buffers(inode); + end_writeback(inode); + + ext2_discard_reservation(inode); + rsv = EXT2_I(inode)->i_block_alloc_info; + EXT2_I(inode)->i_block_alloc_info = NULL; + if (unlikely(rsv)) + kfree(rsv); + + if (want_delete) + ext2_free_inode(inode); +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext2_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * To store the locations of file's data ext2 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
+ */ + +static int ext2_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT2_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block < 0", __func__); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT2_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT2_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT2_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block is too big", __func__); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + + return n; +} + +/** + * ext2_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). 
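A user-space mirror of the mapping ext2_block_to_path() above performs: split a logical block number into offsets through the direct slots and the single, double, and triple indirect trees. The 12 direct slots match ext2's layout; 256 addresses per indirect block corresponds to an assumed 1 KiB block size, and the demo uses division instead of the kernel's shift arithmetic.

#include <stdio.h>

static int demo_block_to_path(long i_block, int offsets[4])
{
        const long direct = 12;          /* EXT2_NDIR_BLOCKS             */
        const long ptrs = 256;           /* addresses per indirect block */
        int n = 0;

        if (i_block < 0) {
                /* out of range: leave depth 0 */
        } else if (i_block < direct) {
                offsets[n++] = (int)i_block;
        } else if ((i_block -= direct) < ptrs) {
                offsets[n++] = 12;                        /* EXT2_IND_BLOCK  */
                offsets[n++] = (int)i_block;
        } else if ((i_block -= ptrs) < ptrs * ptrs) {
                offsets[n++] = 13;                        /* EXT2_DIND_BLOCK */
                offsets[n++] = (int)(i_block / ptrs);
                offsets[n++] = (int)(i_block % ptrs);
        } else if ((i_block -= ptrs * ptrs) < ptrs * ptrs * ptrs) {
                offsets[n++] = 14;                        /* EXT2_TIND_BLOCK */
                offsets[n++] = (int)(i_block / (ptrs * ptrs));
                offsets[n++] = (int)((i_block / ptrs) % ptrs);
                offsets[n++] = (int)(i_block % ptrs);
        }
        return n;                        /* path depth, 0 if out of range */
}

int main(void)
{
        int off[4], depth, i;

        depth = demo_block_to_path(50000, off);   /* lands in the double indirect tree */
        printf("depth %d:", depth);
        for (i = 0; i < depth; i++)
                printf(" %d", off[i]);
        printf("\n");
        return 0;
}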
+ */ +static Indirect *ext2_get_branch(struct inode *inode, + int depth, + int *offsets, + Indirect chain[4], + int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT2_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + read_lock(&EXT2_I(inode)->i_meta_lock); + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + read_unlock(&EXT2_I(inode)->i_meta_lock); + if (!p->key) + goto no_block; + } + return NULL; + +changed: + read_unlock(&EXT2_I(inode)->i_meta_lock); + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext2_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ + +static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + ext2_fsblk_t bg_start; + ext2_fsblk_t colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) + if (*p) + return le32_to_cpu(*p); + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred from inode itself? OK, just put it into + * the same cylinder group then. + */ + bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group); + colour = (current->pid % 16) * + (EXT2_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext2_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Returns preferred place for a block (the goal). + */ + +static inline ext2_fsblk_t ext2_find_goal(struct inode *inode, long block, + Indirect *partial) +{ + struct ext2_block_alloc_info *block_i; + + block_i = EXT2_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + return block_i->last_alloc_physical_block + 1; + } + + return ext2_find_near(inode, partial); +} + +/** + * ext2_blks_to_allocate: Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. 
+ * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int +ext2_blks_to_allocate(Indirect * branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now don't hanel cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary + && le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext2_alloc_blocks: multiple allocate blocks needed for a branch + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @blks: on return it will store the total number of allocated + * direct blocks + */ +static int ext2_alloc_blocks(struct inode *inode, + ext2_fsblk_t goal, int indirect_blks, int blks, + ext2_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext2_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext2_new_blocks(inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; + } + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i key). Upon the exit we have the same + * picture as after the successful ext2_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ + +static int ext2_alloc_branch(struct inode *inode, + int indirect_blks, int *blks, ext2_fsblk_t goal, + int *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext2_fsblk_t new_blocks[4]; + ext2_fsblk_t current_block; + + num = ext2_alloc_blocks(inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. 
+ */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; + lock_buffer(bh); + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + /* We used to sync bh here if IS_SYNC(inode). + * But we now rely upon generic_write_sync() + * and b_inode_buffers. But not for directories. + */ + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + sync_dirty_buffer(bh); + } + *blks = num; + return err; +} + +/** + * ext2_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static void ext2_splice_branch(struct inode *inode, + long block, Indirect *where, int num, int blks) +{ + int i; + struct ext2_block_alloc_info *block_i; + ext2_fsblk_t current_block; + + block_i = EXT2_I(inode)->i_block_alloc_info; + + /* XXX LOCKING probably should have i_meta_lock ?*/ + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + /* had we spliced it onto indirect block? */ + if (where->bh) + mark_buffer_dirty_inode(where->bh, inode); + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. 
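A toy illustration of the return convention documented above, and of how ext2_get_block() further below turns a multi-block result into a buffer_head mapping: positive means "this many contiguous blocks mapped", zero means a hole, negative is an error. demo_get_blocks() and its fake extent are invented stand-ins, not the real mapping code.

#include <stdio.h>

/* pretend mapper: logical blocks 0..7 sit contiguously on disk at 1000.. */
static int demo_get_blocks(long iblock, long maxblocks, unsigned long *phys)
{
        long n;

        if (iblock < 0)
                return -5;               /* error, in the style of -EIO */
        if (iblock >= 8)
                return 0;                /* hole: nothing mapped        */
        *phys = 1000 + (unsigned long)iblock;
        n = 8 - iblock;                  /* how far the extent continues */
        if (n > maxblocks)
                n = maxblocks;
        return (int)n;
}

int main(void)
{
        unsigned long phys = 0;
        unsigned blkbits = 10;           /* assumed 1 KiB block size */
        int ret = demo_get_blocks(2, 16, &phys);

        if (ret > 0)                     /* ext2_get_block() does the same scaling */
                printf("mapped %d blocks at %lu, b_size = %u bytes\n",
                       ret, phys, (unsigned)ret << blkbits);
        else if (ret == 0)
                printf("hole\n");
        else
                printf("error %d\n", ret);
        return 0;
}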
+ */ +static int ext2_get_blocks(struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext2_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + struct ext2_inode_info *ei = EXT2_I(inode); + int count = 0; + ext2_fsblk_t first_block = 0; + + depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); + + if (depth == 0) + return (err); + + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + clear_buffer_new(bh_result); /* What's this do? */ + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext2_fsblk_t blk; + + if (!verify_chain(chain, chain + depth - 1)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now, go to reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) + goto cleanup; + + mutex_lock(&ei->truncate_mutex); + /* + * If the indirect block is missing while we are reading + * the chain(ext2_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } + + /* + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) + ext2_init_block_alloc_info(inode); + + goal = ext2_find_goal(inode, iblock, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext2_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + /* + * XXX ???? 
Block out ext2_truncate while we alter the tree + */ + err = ext2_alloc_branch(inode, indirect_blks, &count, goal, + offsets + (partial - chain), partial); + + if (err) { + mutex_unlock(&ei->truncate_mutex); + goto cleanup; + } + + if (ext2_use_xip(inode->i_sb)) { + /* + * we need to clear the block + */ + err = ext2_clear_xip_target (inode, + le32_to_cpu(chain[depth-1].key)); + if (err) { + mutex_unlock(&ei->truncate_mutex); + goto cleanup; + } + } + + ext2_splice_branch(inode, iblock, partial, indirect_blks, count); + mutex_unlock(&ei->truncate_mutex); + set_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + return err; +} + +int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +{ + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int ret = ext2_get_blocks(inode, iblock, max_blocks, + bh_result, create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; + +} + +int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext2_get_block); +} + +static int ext2_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, ext2_get_block, wbc); +} + +static int ext2_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, ext2_get_block); +} + +static int +ext2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); +} + +static int +ext2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + ext2_get_block); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int ext2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int +ext2_nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, + ext2_get_block); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int ext2_nobh_writepage(struct page *page, + struct writeback_control *wbc) +{ + return nobh_writepage(page, ext2_get_block, wbc); +} + +static sector_t ext2_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,ext2_get_block); +} + +static ssize_t +ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, 
nr_segs, ext2_get_block, NULL); + if (ret < 0 && (rw & WRITE)) + ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); + return ret; +} + +static int +ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, ext2_get_block); +} + +const struct address_space_operations ext2_aops = { + .readpage = ext2_readpage, + .readpages = ext2_readpages, + .writepage = ext2_writepage, + .write_begin = ext2_write_begin, + .write_end = ext2_write_end, + .bmap = ext2_bmap, + .direct_IO = ext2_direct_IO, + .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +const struct address_space_operations ext2_aops_xip = { + .bmap = ext2_bmap, + .get_xip_mem = ext2_get_xip_mem, +}; + +const struct address_space_operations ext2_nobh_aops = { + .readpage = ext2_readpage, + .readpages = ext2_readpages, + .writepage = ext2_nobh_writepage, + .write_begin = ext2_nobh_write_begin, + .write_end = nobh_write_end, + .bmap = ext2_bmap, + .direct_IO = ext2_direct_IO, + .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, + .error_remove_page = generic_error_remove_page, +}; + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext2_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext2_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext2_truncate(). + * + * When we do truncate() we may have to clean the ends of several indirect + * blocks but leave the blocks themselves alive. Block is partially + * truncated if some data below the new i_size is referred from it (and + * it is on the path to the first completely truncated data block, indeed). + * We have to free the top of that path along with everything to the right + * of the path. Since no allocation past the truncation point is possible + * until ext2_truncate() finishes, we may safely do the latter, but top + * of branch may require special attention - pageout below the truncation + * point might try to populate it. + * + * We atomically detach the top of branch from the tree, store the block + * number of its root in *@top, pointers to buffer_heads of partially + * truncated blocks - in @chain[].bh and pointers to their last elements + * that should not be removed - in @chain[].p. Return value is the pointer + * to last filled element of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0].p + * (no partially truncated stuff there). 
+ */ + +static Indirect *ext2_find_shared(struct inode *inode, + int depth, + int offsets[4], + Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext2_get_branch(inode, k, offsets, chain, &err); + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + write_lock(&EXT2_I(inode)->i_meta_lock); + if (!partial->key && *partial->p) { + write_unlock(&EXT2_I(inode)->i_meta_lock); + goto no_top; + } + for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + *p->p = 0; + } + write_unlock(&EXT2_I(inode)->i_meta_lock); + + while(partial > p) + { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/** + * ext2_free_data - free a list of data blocks + * @inode: inode we are dealing with + * @p: array of block numbers + * @q: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q) +{ + unsigned long block_to_free = 0, count = 0; + unsigned long nr; + + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); + if (nr) { + *p = 0; + /* accumulate blocks to free if they're contiguous */ + if (count == 0) + goto free_this; + else if (block_to_free == nr - count) + count++; + else { + ext2_free_blocks (inode, block_to_free, count); + mark_inode_dirty(inode); + free_this: + block_to_free = nr; + count = 1; + } + } + } + if (count > 0) { + ext2_free_blocks (inode, block_to_free, count); + mark_inode_dirty(inode); + } +} + +/** + * ext2_free_branches - free an array of branches + * @inode: inode we are dealing with + * @p: array of block numbers + * @q: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth) +{ + struct buffer_head * bh; + unsigned long nr; + + if (depth--) { + int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); + if (!nr) + continue; + *p = 0; + bh = sb_bread(inode->i_sb, nr); + /* + * A read failure? Report error and clear slot + * (should be rare). 
+ */ + if (!bh) { + ext2_error(inode->i_sb, "ext2_free_branches", + "Read failure, inode=%ld, block=%ld", + inode->i_ino, nr); + continue; + } + ext2_free_branches(inode, + (__le32*)bh->b_data, + (__le32*)bh->b_data + addr_per_block, + depth); + bforget(bh); + ext2_free_blocks(inode, nr, 1); + mark_inode_dirty(inode); + } + } else + ext2_free_data(inode, p, q); +} + +static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) +{ + __le32 *i_data = EXT2_I(inode)->i_data; + struct ext2_inode_info *ei = EXT2_I(inode); + int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n; + long iblock; + unsigned blocksize; + blocksize = inode->i_sb->s_blocksize; + iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); + + n = ext2_block_to_path(inode, iblock, offsets, NULL); + if (n == 0) + return; + + /* + * From here we block out all ext2_get_block() callers who want to + * modify the block allocation tree. + */ + mutex_lock(&ei->truncate_mutex); + + if (n == 1) { + ext2_free_data(inode, i_data+offsets[0], + i_data + EXT2_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext2_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (already detached) */ + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + mark_buffer_dirty_inode(partial->bh, inode); + ext2_free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext2_free_branches(inode, + partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + mark_buffer_dirty_inode(partial->bh, inode); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT2_IND_BLOCK]; + if (nr) { + i_data[EXT2_IND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 1); + } + case EXT2_IND_BLOCK: + nr = i_data[EXT2_DIND_BLOCK]; + if (nr) { + i_data[EXT2_DIND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 2); + } + case EXT2_DIND_BLOCK: + nr = i_data[EXT2_TIND_BLOCK]; + if (nr) { + i_data[EXT2_TIND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 3); + } + case EXT2_TIND_BLOCK: + ; + } + + ext2_discard_reservation(inode); + + mutex_unlock(&ei->truncate_mutex); +} + +static void ext2_truncate_blocks(struct inode *inode, loff_t offset) +{ + /* + * XXX: it seems like a bug here that we don't allow + * IS_APPEND inode to have blocks-past-i_size trimmed off. + * review and fix this. + * + * Also would be nice to be able to handle IO errors and such, + * but that's probably too much to ask. 
+ */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (ext2_inode_is_fast_symlink(inode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + __ext2_truncate_blocks(inode, offset); +} + +static int ext2_setsize(struct inode *inode, loff_t newsize) +{ + int error; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (ext2_inode_is_fast_symlink(inode)) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + if (mapping_is_xip(inode->i_mapping)) + error = xip_truncate_page(inode->i_mapping, newsize); + else if (test_opt(inode->i_sb, NOBH)) + error = nobh_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + else + error = block_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + if (error) + return error; + + truncate_setsize(inode, newsize); + __ext2_truncate_blocks(inode, newsize); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + if (inode_needs_sync(inode)) { + sync_mapping_buffers(inode->i_mapping); + sync_inode_metadata(inode, 1); + } else { + mark_inode_dirty(inode); + } + + return 0; +} + +static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, + struct buffer_head **p) +{ + struct buffer_head * bh; + unsigned long block_group; + unsigned long block; + unsigned long offset; + struct ext2_group_desc * gdp; + + *p = NULL; + if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) || + ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) + goto Einval; + + block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); + gdp = ext2_get_group_desc(sb, block_group, NULL); + if (!gdp) + goto Egdp; + /* + * Figure out the offset within the block group inode table + */ + offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb); + block = le32_to_cpu(gdp->bg_inode_table) + + (offset >> EXT2_BLOCK_SIZE_BITS(sb)); + if (!(bh = sb_bread(sb, block))) + goto Eio; + + *p = bh; + offset &= (EXT2_BLOCK_SIZE(sb) - 1); + return (struct ext2_inode *) (bh->b_data + offset); + +Einval: + ext2_error(sb, "ext2_get_inode", "bad inode number: %lu", + (unsigned long) ino); + return ERR_PTR(-EINVAL); +Eio: + ext2_error(sb, "ext2_get_inode", + "unable to read inode block - inode=%lu, block=%lu", + (unsigned long) ino, block); +Egdp: + return ERR_PTR(-EIO); +} + +void ext2_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT2_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT2_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT2_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT2_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT2_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ +void ext2_get_inode_flags(struct ext2_inode_info *ei) +{ + unsigned int flags = ei->vfs_inode.i_flags; + + ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| + EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT2_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT2_APPEND_FL; + if (flags & S_IMMUTABLE) + ei->i_flags |= EXT2_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT2_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT2_DIRSYNC_FL; +} + +struct inode *ext2_iget (struct super_block *sb, unsigned long ino) +{ + struct ext2_inode_info *ei; + struct buffer_head 
* bh; + struct ext2_inode *raw_inode; + struct inode *inode; + long ret = -EIO; + int n; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT2_I(inode); + ei->i_block_alloc_info = NULL; + + raw_inode = ext2_get_inode(inode->i_sb, ino, &bh); + if (IS_ERR(raw_inode)) { + ret = PTR_ERR(raw_inode); + goto bad_inode; + } + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) { + /* this inode is deleted */ + brelse (bh); + ret = -ESTALE; + goto bad_inode; + } + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + ei->i_flags = le32_to_cpu(raw_inode->i_flags); + ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + ei->i_frag_no = raw_inode->i_frag; + ei->i_frag_size = raw_inode->i_fsize; + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ei->i_dir_acl = 0; + if (S_ISREG(inode->i_mode)) + inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + else + ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + ei->i_dtime = 0; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_state = 0; + ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + ei->i_dir_start_lookup = 0; + + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! 
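+ * (The raw __le32 values are copied as-is; a block number is only + * converted with le32_to_cpu() at the point of use, e.g. in + * ext2_get_branch() and ext2_free_data().)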
+ */ + for (n = 0; n < EXT2_N_BLOCKS; n++) + ei->i_data[n] = raw_inode->i_block[n]; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext2_dir_inode_operations; + inode->i_fop = &ext2_dir_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (ext2_inode_is_fast_symlink(inode)) { + inode->i_op = &ext2_fast_symlink_inode_operations; + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { + inode->i_op = &ext2_symlink_inode_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + } + } else { + inode->i_op = &ext2_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } + brelse (bh); + ext2_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +static int __ext2_write_inode(struct inode *inode, int do_sync) +{ + struct ext2_inode_info *ei = EXT2_I(inode); + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; + uid_t uid = inode->i_uid; + gid_t gid = inode->i_gid; + struct buffer_head * bh; + struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); + int n; + int err = 0; + + if (IS_ERR(raw_inode)) + return -EIO; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ei->i_state & EXT2_STATE_NEW) + memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); + + ext2_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if (!(test_opt(sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid)); +/* + * Fix up interoperability with old kernels. 
Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid)); + raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(inode->i_size); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); + raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + raw_inode->i_frag = ei->i_frag_no; + raw_inode->i_fsize = ei->i_frag_size; + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + else { + raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32); + if (inode->i_size > 0x7fffffffULL) { + if (!EXT2_HAS_RO_COMPAT_FEATURE(sb, + EXT2_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT2_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT2_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + spin_lock(&EXT2_SB(sb)->s_lock); + ext2_update_dynamic_rev(sb); + EXT2_SET_RO_COMPAT_FEATURE(sb, + EXT2_FEATURE_RO_COMPAT_LARGE_FILE); + spin_unlock(&EXT2_SB(sb)->s_lock); + ext2_write_super(sb); + } + } + } + + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else for (n = 0; n < EXT2_N_BLOCKS; n++) + raw_inode->i_block[n] = ei->i_data[n]; + mark_buffer_dirty(bh); + if (do_sync) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk ("IO error syncing ext2 inode [%s:%08lx]\n", + sb->s_id, (unsigned long) ino); + err = -EIO; + } + } + ei->i_state &= ~EXT2_STATE_NEW; + brelse (bh); + return err; +} + +int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int ext2_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + error = dquot_transfer(inode, iattr); + if (error) + return error; + } + if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) { + error = ext2_setsize(inode, iattr->ia_size); + if (error) + return error; + } + setattr_copy(inode, iattr); + if (iattr->ia_valid & ATTR_MODE) + error = ext2_acl_chmod(inode); + mark_inode_dirty(inode); + + return error; +} diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c new file mode 100644 index 00000000..f81e250a --- 
/dev/null +++ b/fs/ext2/ioctl.c @@ -0,0 +1,178 @@ +/* + * linux/fs/ext2/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include "ext2.h" +#include +#include +#include +#include +#include +#include +#include + + +long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ext2_inode_info *ei = EXT2_I(inode); + unsigned int flags; + unsigned short rsv_window_size; + int ret; + + ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT2_IOC_GETFLAGS: + ext2_get_inode_flags(ei); + flags = ei->i_flags & EXT2_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT2_IOC_SETFLAGS: { + unsigned int oldflags; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto setflags_out; + } + + if (get_user(flags, (int __user *) arg)) { + ret = -EFAULT; + goto setflags_out; + } + + flags = ext2_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; + goto setflags_out; + } + oldflags = ei->i_flags; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; + goto setflags_out; + } + } + + flags = flags & EXT2_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; + ei->i_flags = flags; + mutex_unlock(&inode->i_mutex); + + ext2_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return ret; + } + case EXT2_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *) arg); + case EXT2_IOC_SETVERSION: + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + if (get_user(inode->i_generation, (int __user *) arg)) { + ret = -EFAULT; + } else { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + mnt_drop_write(filp->f_path.mnt); + return ret; + case EXT2_IOC_GETRSVSZ: + if (test_opt(inode->i_sb, RESERVATION) + && S_ISREG(inode->i_mode) + && ei->i_block_alloc_info) { + rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; + return put_user(rsv_window_size, (int __user *)arg); + } + return -ENOTTY; + case EXT2_IOC_SETRSVSZ: { + + if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) + return -ENOTTY; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(rsv_window_size, (int __user *)arg)) + return -EFAULT; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS) + rsv_window_size = EXT2_MAX_RESERVE_BLOCKS; + + /* + * need to allocate reservation structure for this inode + * before set the window size + */ + /* + * XXX What lock should protect the rsv_goal_size? + * Accessed in ext2_get_block only. ext3 uses i_truncate. 
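+ * Here rsv_goal_size is updated under truncate_mutex (taken just + * below), the same mutex ext2_get_block() holds across block + * allocation, so the reader and this writer cannot race.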
+ */ + mutex_lock(&ei->truncate_mutex); + if (!ei->i_block_alloc_info) + ext2_init_block_alloc_info(inode); + + if (ei->i_block_alloc_info){ + struct ext2_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; + rsv->rsv_goal_size = rsv_window_size; + } + mutex_unlock(&ei->truncate_mutex); + mnt_drop_write(filp->f_path.mnt); + return 0; + } + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext2_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT2_IOC32_GETFLAGS: + cmd = EXT2_IOC_GETFLAGS; + break; + case EXT2_IOC32_SETFLAGS: + cmd = EXT2_IOC_SETFLAGS; + break; + case EXT2_IOC32_GETVERSION: + cmd = EXT2_IOC_GETVERSION; + break; + case EXT2_IOC32_SETVERSION: + cmd = EXT2_IOC_SETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + return ext2_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c new file mode 100644 index 00000000..ed5c5d49 --- /dev/null +++ b/fs/ext2/namei.c @@ -0,0 +1,427 @@ +/* + * linux/fs/ext2/namei.c + * + * Rewrite to pagecache. Almost all code had been changed, so blame me + * if the things go wrong. Please, send bug reports to + * viro@parcelfarce.linux.theplanet.co.uk + * + * Stuff here is basically a glue between the VFS and generic UNIXish + * filesystem that keeps everything in pagecache. All knowledge of the + * directory layout is in fs/ext2/dir.c - it turned out to be easily separatable + * and it's easier to debug that way. In principle we might want to + * generalize that a bit and turn it into a library. Or not. + * + * The only non-static object here is ext2_dir_inode_operations. + * + * TODO: get rid of kmap() use, add readahead. + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" +#include "xip.h" + +static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = ext2_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + } + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. 
+ */ + +static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode * inode; + ino_t ino; + + if (dentry->d_name.len > EXT2_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = ext2_inode_by_name(dir, &dentry->d_name); + inode = NULL; + if (ino) { + inode = ext2_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ESTALE) { + ext2_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + (unsigned long) ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } + } + return d_splice_alias(inode, dentry); +} + +struct dentry *ext2_get_parent(struct dentry *child) +{ + struct qstr dotdot = {.name = "..", .len = 2}; + unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino)); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) +{ + struct inode *inode; + + dquot_initialize(dir); + + inode = ext2_new_inode(dir, mode, &dentry->d_name); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } + mark_inode_dirty(inode); + return ext2_add_nondir(dentry, inode); +} + +static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode * inode; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + + inode = ext2_new_inode (dir, mode, &dentry->d_name); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT2_FS_XATTR + inode->i_op = &ext2_special_inode_operations; +#endif + mark_inode_dirty(inode); + err = ext2_add_nondir(dentry, inode); + } + return err; +} + +static int ext2_symlink (struct inode * dir, struct dentry * dentry, + const char * symname) +{ + struct super_block * sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode * inode; + + if (l > sb->s_blocksize) + goto out; + + dquot_initialize(dir); + + inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + if (l > sizeof (EXT2_I(inode)->i_data)) { + /* slow symlink */ + inode->i_op = &ext2_symlink_inode_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &ext2_fast_symlink_inode_operations; + memcpy((char*)(EXT2_I(inode)->i_data),symname,l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = ext2_add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput (inode); + goto out; +} + +static int ext2_link (struct dentry * old_dentry, struct inode * 
dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + int err; + + if (inode->i_nlink >= EXT2_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + + err = ext2_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + struct inode * inode; + int err = -EMLINK; + + if (dir->i_nlink >= EXT2_LINK_MAX) + goto out; + + dquot_initialize(dir); + + inode_inc_link_count(dir); + + inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &ext2_dir_inode_operations; + inode->i_fop = &ext2_dir_operations; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; + + inode_inc_link_count(inode); + + err = ext2_make_empty(inode, dir); + if (err) + goto out_fail; + + err = ext2_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); + unlock_new_inode(inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + unlock_new_inode(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int ext2_unlink(struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + struct ext2_dir_entry_2 * de; + struct page * page; + int err = -ENOENT; + + dquot_initialize(dir); + + de = ext2_find_entry (dir, &dentry->d_name, &page); + if (!de) + goto out; + + err = ext2_delete_entry (de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int ext2_rmdir (struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (ext2_empty_dir(inode)) { + err = ext2_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, + struct inode * new_dir, struct dentry * new_dentry ) +{ + struct inode * old_inode = old_dentry->d_inode; + struct inode * new_inode = new_dentry->d_inode; + struct page * dir_page = NULL; + struct ext2_dir_entry_2 * dir_de = NULL; + struct page * old_page; + struct ext2_dir_entry_2 * old_de; + int err = -ENOENT; + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = ext2_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct ext2_dir_entry_2 *new_de; + + err = -ENOTEMPTY; + if (dir_de && !ext2_empty_dir (new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); + if (!new_de) + goto out_dir; + ext2_set_link(new_dir, new_de, new_page, old_inode, 1); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= EXT2_LINK_MAX) + goto out_dir; + } + err = ext2_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + 
inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); + + ext2_delete_entry (old_de, old_page); + + if (dir_de) { + if (old_dir != new_dir) + ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } + inode_dec_link_count(old_dir); + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations ext2_dir_inode_operations = { + .create = ext2_create, + .lookup = ext2_lookup, + .link = ext2_link, + .unlink = ext2_unlink, + .symlink = ext2_symlink, + .mkdir = ext2_mkdir, + .rmdir = ext2_rmdir, + .mknod = ext2_mknod, + .rename = ext2_rename, +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif + .setattr = ext2_setattr, + .check_acl = ext2_check_acl, +}; + +const struct inode_operations ext2_special_inode_operations = { +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif + .setattr = ext2_setattr, + .check_acl = ext2_check_acl, +}; diff --git a/fs/ext2/super.c b/fs/ext2/super.c new file mode 100644 index 00000000..1dd62ed3 --- /dev/null +++ b/fs/ext2/super.c @@ -0,0 +1,1525 @@ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" +#include "xip.h" + +static void ext2_sync_super(struct super_block *sb, + struct ext2_super_block *es, int wait); +static int ext2_remount (struct super_block * sb, int * flags, char * data); +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); +static int ext2_sync_fs(struct super_block *sb, int wait); + +void ext2_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + + if (!(sb->s_flags & MS_RDONLY)) { + spin_lock(&sbi->s_lock); + sbi->s_mount_state |= EXT2_ERROR_FS; + es->s_state |= cpu_to_le16(EXT2_ERROR_FS); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, 1); + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT2-fs: panic from previous error\n"); + if (test_opt(sb, ERRORS_RO)) { + ext2_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } +} + +void ext2_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + + va_end(args); +} + +/* + * This must be called with sbi->s_lock held. + */ +void ext2_update_dynamic_rev(struct super_block *sb) +{ + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV) + return; + + ext2_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", + EXT2_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT2_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT2_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +static void ext2_put_super (struct super_block * sb) +{ + int db_count; + int i; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + if (sb->s_dirt) + ext2_write_super(sb); + + ext2_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { + struct ext2_super_block *es = sbi->s_es; + + spin_lock(&sbi->s_lock); + es->s_state = cpu_to_le16(sbi->s_mount_state); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, 1); + } + db_count = sbi->s_gdb_count; + for (i = 0; i < db_count; i++) + if (sbi->s_group_desc[i]) + brelse (sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + kfree(sbi->s_debts); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + brelse (sbi->s_sbh); + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +} + +static struct kmem_cache * ext2_inode_cachep; + +static struct inode *ext2_alloc_inode(struct super_block *sb) +{ + struct ext2_inode_info *ei; + ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void ext2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); +} + +static void ext2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ext2_i_callback); +} + +static void init_once(void *foo) +{ + struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; + + rwlock_init(&ei->i_meta_lock); +#ifdef CONFIG_EXT2_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + mutex_init(&ei->truncate_mutex); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", + sizeof(struct ext2_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext2_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ext2_inode_cachep); +} + +static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + unsigned long def_mount_opts; + + spin_lock(&sbi->s_lock); + def_mount_opts = 
le32_to_cpu(es->s_default_mount_opts); + + if (sbi->s_sb_block != 1) + seq_printf(seq, ",sb=%lu", sbi->s_sb_block); + if (test_opt(sb, MINIX_DF)) + seq_puts(seq, ",minixdf"); + if (test_opt(sb, GRPID)) + seq_puts(seq, ",grpid"); + if (!test_opt(sb, GRPID) && (def_mount_opts & EXT2_DEFM_BSDGROUPS)) + seq_puts(seq, ",nogrpid"); + if (sbi->s_resuid != EXT2_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT2_DEF_RESUID) { + seq_printf(seq, ",resuid=%u", sbi->s_resuid); + } + if (sbi->s_resgid != EXT2_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT2_DEF_RESGID) { + seq_printf(seq, ",resgid=%u", sbi->s_resgid); + } + if (test_opt(sb, ERRORS_RO)) { + int def_errors = le16_to_cpu(es->s_errors); + + if (def_errors == EXT2_ERRORS_PANIC || + def_errors == EXT2_ERRORS_CONTINUE) { + seq_puts(seq, ",errors=remount-ro"); + } + } + if (test_opt(sb, ERRORS_CONT)) + seq_puts(seq, ",errors=continue"); + if (test_opt(sb, ERRORS_PANIC)) + seq_puts(seq, ",errors=panic"); + if (test_opt(sb, NO_UID32)) + seq_puts(seq, ",nouid32"); + if (test_opt(sb, DEBUG)) + seq_puts(seq, ",debug"); + if (test_opt(sb, OLDALLOC)) + seq_puts(seq, ",oldalloc"); + +#ifdef CONFIG_EXT2_FS_XATTR + if (test_opt(sb, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + if (!test_opt(sb, XATTR_USER) && + (def_mount_opts & EXT2_DEFM_XATTR_USER)) { + seq_puts(seq, ",nouser_xattr"); + } +#endif + +#ifdef CONFIG_EXT2_FS_POSIX_ACL + if (test_opt(sb, POSIX_ACL)) + seq_puts(seq, ",acl"); + if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT2_DEFM_ACL)) + seq_puts(seq, ",noacl"); +#endif + + if (test_opt(sb, NOBH)) + seq_puts(seq, ",nobh"); + +#if defined(CONFIG_QUOTA) + if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA) + seq_puts(seq, ",usrquota"); + + if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA) + seq_puts(seq, ",grpquota"); +#endif + +#if defined(CONFIG_EXT2_FS_XIP) + if (sbi->s_mount_opt & EXT2_MOUNT_XIP) + seq_puts(seq, ",xip"); +#endif + + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ",noreservation"); + + spin_unlock(&sbi->s_lock); + return 0; +} + +#ifdef CONFIG_QUOTA +static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); +static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +#endif + +static const struct super_operations ext2_sops = { + .alloc_inode = ext2_alloc_inode, + .destroy_inode = ext2_destroy_inode, + .write_inode = ext2_write_inode, + .evict_inode = ext2_evict_inode, + .put_super = ext2_put_super, + .write_super = ext2_write_super, + .sync_fs = ext2_sync_fs, + .statfs = ext2_statfs, + .remount_fs = ext2_remount, + .show_options = ext2_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext2_quota_read, + .quota_write = ext2_quota_write, +#endif +}; + +static struct inode *ext2_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT2_FIRST_INO(sb) && ino != EXT2_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * ext2_read_inode currently does appropriate checks, but + * it might be "neater" to call ext2_get_inode first and check + * if the inode is valid..... + */ + inode = ext2_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. 
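+ * the on-disk inode slot has been re-used since the file handle + * was generated, so treat the handle as stale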
*/ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *ext2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext2_nfs_get_inode); +} + +static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext2_nfs_get_inode); +} + +/* Yes, most of these are left as NULL!! + * A NULL value implies the default, which works with ext2-like file + * systems, but can be improved upon. + * Currently only get_parent is required. + */ +static const struct export_operations ext2_export_ops = { + .fh_to_dentry = ext2_fh_to_dentry, + .fh_to_parent = ext2_fh_to_parent, + .get_parent = ext2_get_parent, +}; + +static unsigned long get_sb_block(void **data) +{ + unsigned long sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + options += 3; + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk("EXT2-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + return sb_block; +} + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, + Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, + Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, + Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_nocheck, "check=none"}, + {Opt_nocheck, "nocheck"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_nobh, "nobh"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_xip, "xip"}, + {Opt_grpquota, "grpquota"}, + {Opt_ignore, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb) +{ + char *p; + struct ext2_sb_info *sbi = EXT2_SB(sb); + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep (&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + clear_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_minix_df: + set_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_grpid: + set_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_nogrpid: + clear_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resuid = option; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resgid = option; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; 
+ case Opt_err_panic: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_RO); + set_opt (sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; + case Opt_debug: + set_opt (sbi->s_mount_opt, DEBUG); + break; + case Opt_oldalloc: + set_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_orlov: + clear_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_nobh: + set_opt (sbi->s_mount_opt, NOBH); + break; +#ifdef CONFIG_EXT2_FS_XATTR + case Opt_user_xattr: + set_opt (sbi->s_mount_opt, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt (sbi->s_mount_opt, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + ext2_msg(sb, KERN_INFO, "(no)user_xattr options" + "not supported"); + break; +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi->s_mount_opt, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi->s_mount_opt, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + ext2_msg(sb, KERN_INFO, + "(no)acl options not supported"); + break; +#endif + case Opt_xip: +#ifdef CONFIG_EXT2_FS_XIP + set_opt (sbi->s_mount_opt, XIP); +#else + ext2_msg(sb, KERN_INFO, "xip option not supported"); +#endif + break; + +#if defined(CONFIG_QUOTA) + case Opt_quota: + case Opt_usrquota: + set_opt(sbi->s_mount_opt, USRQUOTA); + break; + + case Opt_grpquota: + set_opt(sbi->s_mount_opt, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext2_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif + + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + ext2_msg(sb, KERN_INFO, "reservations ON"); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + ext2_msg(sb, KERN_INFO, "reservations OFF"); + break; + case Opt_ignore: + break; + default: + return 0; + } + } + return 1; +} + +static int ext2_setup_super (struct super_block * sb, + struct ext2_super_block * es, + int read_only) +{ + int res = 0; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) { + ext2_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT2_VALID_FS)) + ext2_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if ((sbi->s_mount_state & EXT2_ERROR_FS)) + ext2_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + ext2_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext2_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); + if (!le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = 
cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + if (test_opt (sb, DEBUG)) + ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", + EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize, + sbi->s_frag_size, + sbi->s_groups_count, + EXT2_BLOCKS_PER_GROUP(sb), + EXT2_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + return res; +} + +static int ext2_check_descriptors(struct super_block *sb) +{ + int i; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + ext2_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL); + ext2_fsblk_t first_block = ext2_group_first_block_no(sb, i); + ext2_fsblk_t last_block; + + if (i == sbi->s_groups_count - 1) + last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + last_block = first_block + + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + + if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || + le32_to_cpu(gdp->bg_block_bitmap) > last_block) + { + ext2_error (sb, "ext2_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || + le32_to_cpu(gdp->bg_inode_bitmap) > last_block) + { + ext2_error (sb, "ext2_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < first_block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 > + last_block) + { + ext2_error (sb, "ext2_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + } + return 1; +} + +/* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. 
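+ * For example: with 1 KiB blocks the indirect-tree geometry is the + * binding limit and files top out around 16 GiB, while with 4 KiB + * blocks the 2^32-sector i_blocks limit dominates and caps files at + * roughly 2 TiB.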
+ */ +static loff_t ext2_max_size(int bits) +{ + loff_t res = EXT2_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + + /* This is calculated to be the largest file size for a + * dense file such that the total number of + * sectors in the file, including data and all indirect blocks, + * does not exceed 2^32 - 1 (the __u32 i_blocks field + * represents the total number of + * 512-byte blocks of the file) + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* triple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static unsigned long descriptor_loc(struct super_block *sb, + unsigned long logic_sb_block, + int nr) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + unsigned long bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return (logic_sb_block + nr + 1); + bg = sbi->s_desc_per_block * nr; + if (ext2_bg_has_super(sb, bg)) + has_super = 1; + + return ext2_group_first_block_no(sb, bg) + has_super; +} + +static int ext2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head * bh; + struct ext2_sb_info * sbi; + struct ext2_super_block * es; + struct inode *root; + unsigned long block; + unsigned long sb_block = get_sb_block(&data); + unsigned long logic_sb_block; + unsigned long offset = 0; + unsigned long def_mount_opts; + long ret = -EINVAL; + int blocksize = BLOCK_SIZE; + int db_count; + int i, j; + __le32 features; + int err; + + err = -ENOMEM; + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto failed_unlock; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + goto failed_unlock; + } + sb->s_fs_info = sbi; + sbi->s_sb_block = sb_block; + + spin_lock_init(&sbi->s_lock); + + /* + * See what the current blocksize for the device is, and + * use that as the blocksize. Otherwise (or if the blocksize + * is smaller than the default) use the default. + * This is important for devices that have a hardware + * sectorsize that is larger than the default. + */ + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + ext2_msg(sb, KERN_ERR, "error: unable to set blocksize"); + goto failed_sbi; + } + + /* + * If the superblock doesn't start on a hardware sector boundary, + * calculate the offset.
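+ * (e.g. if the device forces a 4 KiB block size, the superblock that + * nominally lives at byte 1024 is read from logical block 0 at + * offset 1024 within it)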
+ */ + if (blocksize != BLOCK_SIZE) { + logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize; + offset = (sb_block*BLOCK_SIZE) % blocksize; + } else { + logic_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logic_sb_block))) { + ext2_msg(sb, KERN_ERR, "error: unable to read superblock"); + goto failed_sbi; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext2 macro-instructions depend on its value + */ + es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + + if (sb->s_magic != EXT2_SUPER_MAGIC) + goto cantfind_ext2; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + if (def_mount_opts & EXT2_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + if (def_mount_opts & EXT2_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + if (def_mount_opts & EXT2_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT2_FS_XATTR + if (def_mount_opts & EXT2_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL + if (def_mount_opts & EXT2_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE) + set_opt(sbi->s_mount_opt, ERRORS_CONT); + else + set_opt(sbi->s_mount_opt, ERRORS_RO); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + + set_opt(sbi->s_mount_opt, RESERVATION); + + if (!parse_options((char *) data, sb)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); + + ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset + EXT2_MOUNT_XIP if not */ + + if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && + (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext2_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP); + if (features) { + ext2_msg(sb, KERN_ERR, "error: couldn't mount because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); + goto failed_mount; + } + if (!(sb->s_flags & MS_RDONLY) && + (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ + ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); + goto failed_mount; + } + + blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + + if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { + if (!silent) + ext2_msg(sb, KERN_ERR, + "error: unsupported blocksize for xip"); + goto failed_mount; + } + + /* If the blocksize doesn't match, re-read the thing.. 
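+ * the real block size from s_log_block_size can be larger than the + * minimal size probed above, so switch the device to it and re-read + * the superblock from its recomputed logical block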
*/ + if (sb->s_blocksize != blocksize) { + brelse(bh); + + if (!sb_set_blocksize(sb, blocksize)) { + ext2_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); + goto failed_sbi; + } + + logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize; + offset = (sb_block*BLOCK_SIZE) % blocksize; + bh = sb_bread(sb, logic_sb_block); + if(!bh) { + ext2_msg(sb, KERN_ERR, "error: couldn't read" + "superblock on 2nd try"); + goto failed_sbi; + } + es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) { + ext2_msg(sb, KERN_ERR, "error: magic mismatch"); + goto failed_mount; + } + } + + sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits); + + if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) { + sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT2_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) || + !is_power_of_2(sbi->s_inode_size) || + (sbi->s_inode_size > blocksize)) { + ext2_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + } + + sbi->s_frag_size = EXT2_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (sbi->s_frag_size == 0) + goto cantfind_ext2; + sbi->s_frags_per_block = sb->s_blocksize / sbi->s_frag_size; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + + if (EXT2_INODE_SIZE(sb) == 0) + goto cantfind_ext2; + sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0) + goto cantfind_ext2; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = sb->s_blocksize / + sizeof (struct ext2_group_desc); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = + ilog2 (EXT2_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = + ilog2 (EXT2_DESC_PER_BLOCK(sb)); + + if (sb->s_magic != EXT2_SUPER_MAGIC) + goto cantfind_ext2; + + if (sb->s_blocksize != bh->b_size) { + if (!silent) + ext2_msg(sb, KERN_ERR, "error: unsupported blocksize"); + goto failed_mount; + } + + if (sb->s_blocksize != sbi->s_frag_size) { + ext2_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %lu" + "(not supported yet)", + sbi->s_frag_size, sb->s_blocksize); + goto failed_mount; + } + + if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { + ext2_msg(sb, KERN_ERR, + "error: #blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > sb->s_blocksize * 8) { + ext2_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { + ext2_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + if (EXT2_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext2; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / + EXT2_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + 
ext2_msg(sb, KERN_ERR, "error: not enough memory"); + goto failed_mount; + } + bgl_lock_init(sbi->s_blockgroup_lock); + sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); + if (!sbi->s_debts) { + ext2_msg(sb, KERN_ERR, "error: not enough memory"); + goto failed_mount_group_desc; + } + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logic_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + for (j = 0; j < i; j++) + brelse (sbi->s_group_desc[j]); + ext2_msg(sb, KERN_ERR, + "error: unable to read group descriptors"); + goto failed_mount_group_desc; + } + } + if (!ext2_check_descriptors (sb)) { + ext2_msg(sb, KERN_ERR, "group descriptors corrupted"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + /* per fileystem reservation list head & lock */ + spin_lock_init(&sbi->s_rsv_window_lock); + sbi->s_rsv_window_root = RB_ROOT; + /* + * Add a single, static dummy reservation to the start of the + * reservation window list --- it gives us a placeholder for + * append-at-start-of-list which makes the allocation logic + * _much_ simpler. + */ + sbi->s_rsv_window_head.rsv_start = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_end = EXT2_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_alloc_hit = 0; + sbi->s_rsv_window_head.rsv_goal_size = 0; + ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); + + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext2_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext2_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext2_count_dirs(sb)); + } + if (err) { + ext2_msg(sb, KERN_ERR, "error: insufficient memory"); + goto failed_mount3; + } + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext2_sops; + sb->s_export_op = &ext2_export_ops; + sb->s_xattr = ext2_xattr_handlers; + +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; +#endif + + root = ext2_iget(sb, EXT2_ROOT_INO); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto failed_mount3; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); + goto failed_mount3; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + ext2_msg(sb, KERN_ERR, "error: get root inode failed"); + ret = -ENOMEM; + goto failed_mount3; + } + if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) + ext2_msg(sb, KERN_WARNING, + "warning: mounting ext3 filesystem as ext2"); + if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + ext2_write_super(sb); + return 0; + +cantfind_ext2: + if (!silent) + ext2_msg(sb, KERN_ERR, + "error: can't find an ext2 filesystem on dev %s.", + sb->s_id); + goto failed_mount; +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +failed_mount_group_desc: + kfree(sbi->s_group_desc); + kfree(sbi->s_debts); +failed_mount: + brelse(bh); +failed_sbi: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +failed_unlock: + return ret; +} + +static void 
ext2_clear_super_error(struct super_block *sb) +{ + struct buffer_head *sbh = EXT2_SB(sb)->s_sbh; + + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext2_msg(sb, KERN_ERR, + "previous I/O error to superblock detected\n"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } +} + +static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait) +{ + ext2_clear_super_error(sb); + spin_lock(&EXT2_SB(sb)->s_lock); + es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); + es->s_wtime = cpu_to_le32(get_seconds()); + /* unlock before we do IO */ + spin_unlock(&EXT2_SB(sb)->s_lock); + mark_buffer_dirty(EXT2_SB(sb)->s_sbh); + if (wait) + sync_dirty_buffer(EXT2_SB(sb)->s_sbh); + sb->s_dirt = 0; +} + +/* + * In the second extended file system, it is not necessary to + * write the super block since we use a mapping of the + * disk super block in a buffer. + * + * However, this function is still used to set the fs valid + * flags to 0. We need to set this flag to 0 since the fs + * may have been checked while mounted and e2fsck may have + * set s_state to EXT2_VALID_FS after some corrections. + */ +static int ext2_sync_fs(struct super_block *sb, int wait) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + + spin_lock(&sbi->s_lock); + if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { + ext2_debug("setting valid to 0\n"); + es->s_state &= cpu_to_le16(~EXT2_VALID_FS); + } + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, es, wait); + return 0; +} + + +void ext2_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + ext2_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static int ext2_remount (struct super_block * sb, int * flags, char * data) +{ + struct ext2_sb_info * sbi = EXT2_SB(sb); + struct ext2_super_block * es; + unsigned long old_mount_opt = sbi->s_mount_opt; + struct ext2_mount_options old_opts; + unsigned long old_sb_flags; + int err; + + spin_lock(&sbi->s_lock); + + /* Store the old options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb)) { + err = -EINVAL; + goto restore_opts; + } + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); + + ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset + EXT2_MOUNT_XIP if not */ + + if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { + ext2_msg(sb, KERN_WARNING, + "warning: unsupported blocksize for xip"); + err = -EINVAL; + goto restore_opts; + } + + es = sbi->s_es; + if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { + ext2_msg(sb, KERN_WARNING, "warning: refusing change of " + "xip flag with busy inodes while remounting"); + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + } + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + spin_unlock(&sbi->s_lock); + return 0; + } + if (*flags & MS_RDONLY) { + if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || + !(sbi->s_mount_state & EXT2_VALID_FS)) { + spin_unlock(&sbi->s_lock); + return 0; + } + + /* + * OK, we are remounting a valid rw partition rdonly, so set + * the rdonly flag and then mark the partition as valid again. + */ + es->s_state = cpu_to_le16(sbi->s_mount_state); + es->s_mtime = cpu_to_le32(get_seconds()); + spin_unlock(&sbi->s_lock); + + err = dquot_suspend(sb, -1); + if (err < 0) { + spin_lock(&sbi->s_lock); + goto restore_opts; + } + + ext2_sync_super(sb, es, 1); + } else { + __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, + ~EXT2_FEATURE_RO_COMPAT_SUPP); + if (ret) { + ext2_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR because of " + "unsupported optional features (%x).", + le32_to_cpu(ret)); + err = -EROFS; + goto restore_opts; + } + /* + * Mounting a RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by e2fsck since we originally mounted the partition.) + */ + sbi->s_mount_state = le16_to_cpu(es->s_state); + if (!ext2_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + spin_unlock(&sbi->s_lock); + + ext2_write_super(sb); + + dquot_resume(sb, -1); + } + + return 0; +restore_opts: + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sb->s_flags = old_sb_flags; + spin_unlock(&sbi->s_lock); + return err; +} + +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext2_sb_info *sbi = EXT2_SB(sb); + struct ext2_super_block *es = sbi->s_es; + u64 fsid; + + spin_lock(&sbi->s_lock); + + if (test_opt (sb, MINIX_DF)) + sbi->s_overhead_last = 0; + else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { + unsigned long i, overhead = 0; + smp_rmb(); + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < sbi->s_groups_count; i++) + overhead += ext2_bg_has_super(sb, i) + + ext2_bg_num_gdb(sb, i); + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. 
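The overhead sum being assembled here reduces to a few additions. A user-space sketch with made-up numbers, where bg_has_super() and bg_num_gdb() are toy stand-ins for ext2_bg_has_super() and ext2_bg_num_gdb():

#include <stdio.h>

/* Toy stand-ins: pretend only the first two groups carry a superblock copy. */
static int bg_has_super(unsigned long group) { return group == 0 || group == 1; }
static int bg_num_gdb(unsigned long group)   { return bg_has_super(group) ? 1 : 0; }

int main(void)
{
	unsigned long first_data_block = 0;   /* 4 KiB blocks: data starts at block 0 */
	unsigned long groups = 4, itb_per_group = 512;   /* assumed example values */
	unsigned long i, overhead = first_data_block;

	/* superblock backups and group descriptor blocks, where present */
	for (i = 0; i < groups; i++)
		overhead += bg_has_super(i) + bg_num_gdb(i);

	/* every group also carries two bitmaps and its inode table */
	overhead += groups * (2 + itb_per_group);

	printf("overhead = %lu blocks\n", overhead);
	return 0;
}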
+ */ + overhead += (sbi->s_groups_count * + (2 + sbi->s_itb_per_group)); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); + } + + buf->f_type = EXT2_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; + buf->f_bfree = ext2_count_free_blocks(sb); + es->s_free_blocks_count = cpu_to_le32(buf->f_bfree); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = ext2_count_free_inodes(sb); + es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); + buf->f_namelen = EXT2_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + spin_unlock(&sbi->s_lock); + return 0; +} + +static struct dentry *ext2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super); +} + +#ifdef CONFIG_QUOTA + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT2_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head tmp_bh; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + + tmp_bh.b_state = 0; + tmp_bh.b_size = sb->s_blocksize; + err = ext2_get_block(inode, blk, &tmp_bh, 0); + if (err < 0) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t ext2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT2_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t towrite = len; + struct buffer_head tmp_bh; + struct buffer_head *bh; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? 
+ sb->s_blocksize - offset : towrite; + + tmp_bh.b_state = 0; + err = ext2_get_block(inode, blk, &tmp_bh, 1); + if (err < 0) + goto out; + if (offset || tocopy != EXT2_BLOCK_SIZE(sb)) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext2_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_ext2_fs(void) +{ + int err = init_ext2_xattr(); + if (err) + return err; + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ext2_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + exit_ext2_xattr(); + return err; +} + +static void __exit exit_ext2_fs(void) +{ + unregister_filesystem(&ext2_fs_type); + destroy_inodecache(); + exit_ext2_xattr(); +} + +module_init(init_ext2_fs) +module_exit(exit_ext2_fs) diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c new file mode 100644 index 00000000..565cf817 --- /dev/null +++ b/fs/ext2/symlink.c @@ -0,0 +1,54 @@ +/* + * linux/fs/ext2/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 symlink handling code + */ + +#include "ext2.h" +#include "xattr.h" +#include + +static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext2_inode_info *ei = EXT2_I(dentry->d_inode); + nd_set_link(nd, (char *)ei->i_data); + return NULL; +} + +const struct inode_operations ext2_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ext2_setattr, +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext2_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext2_follow_link, + .setattr = ext2_setattr, +#ifdef CONFIG_EXT2_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext2_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c new file mode 100644 index 00000000..52997061 --- /dev/null +++ b/fs/ext2/xattr.c @@ -0,0 +1,1031 @@ +/* + * linux/fs/ext2/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher + * + * Fix by Harrison Xing . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. 
+ * + */ + +/* + * Extended attributes are stored on disk blocks allocated outside of + * any inode. The i_file_acl field is then made to point to this allocated + * block. If all extended attributes of an inode are identical, these + * inodes may share the same extended attribute block. Such situations + * are automatically detected by keeping a cache of recent attribute block + * numbers and hashes over the block's contents in memory. + * + * + * Extended attribute block layout: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The block header is followed by multiple entry descriptors. These entry + * descriptors are variable in size, and aligned to EXT2_XATTR_PAD + * byte boundaries. The entry descriptors are sorted by attribute name, + * so that two extended attribute blocks can be compared efficiently. + * + * Attribute values are aligned to the end of the block, stored in + * no specific order. They are also padded to EXT2_XATTR_PAD byte + * boundaries. No additional gaps are left between them. + * + * Locking strategy + * ---------------- + * EXT2_I(inode)->i_file_acl is protected by EXT2_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count will change. Multiple writers to an EA block are synchronized + * by the bh lock. No more than a single bh lock is held at any time + * to avoid deadlocks. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xattr.h" +#include "acl.h" + +#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr)) +#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#ifdef EXT2_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%ld: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) 
+#endif + +static int ext2_xattr_set2(struct inode *, struct buffer_head *, + struct ext2_xattr_header *); + +static int ext2_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext2_xattr_cache_find(struct inode *, + struct ext2_xattr_header *); +static void ext2_xattr_rehash(struct ext2_xattr_header *, + struct ext2_xattr_entry *); + +static struct mb_cache *ext2_xattr_cache; + +static const struct xattr_handler *ext2_xattr_handler_map[] = { + [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, +#ifdef CONFIG_EXT2_FS_POSIX_ACL + [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler, +#endif + [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, +#ifdef CONFIG_EXT2_FS_SECURITY + [EXT2_XATTR_INDEX_SECURITY] = &ext2_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext2_xattr_handlers[] = { + &ext2_xattr_user_handler, + &ext2_xattr_trusted_handler, +#ifdef CONFIG_EXT2_FS_POSIX_ACL + &ext2_xattr_acl_access_handler, + &ext2_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT2_FS_SECURITY + &ext2_xattr_security_handler, +#endif + NULL +}; + +static inline const struct xattr_handler * +ext2_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) + handler = ext2_xattr_handler_map[name_index]; + return handler; +} + +/* + * ext2_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext2_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext2_xattr_entry *entry; + size_t name_len, size; + char *end; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + if (name == NULL) + return -EINVAL; + down_read(&EXT2_I(inode)->xattr_sem); + error = -ENODATA; + if (!EXT2_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); + end = bh->b_data + bh->b_size; + if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + HDR(bh)->h_blocks != cpu_to_le32(1)) { +bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* find named attribute */ + name_len = strlen(name); + + error = -ERANGE; + if (name_len > 255) + goto cleanup; + entry = FIRST_ENTRY(bh); + while (!IS_LAST_ENTRY(entry)) { + struct ext2_xattr_entry *next = + EXT2_XATTR_NEXT(entry); + if ((char *)next >= end) + goto bad_block; + if (name_index == entry->e_name_index && + name_len == entry->e_name_len && + memcmp(name, entry->e_name, name_len) == 0) + goto found; + entry = next; + } + if (ext2_xattr_cache_insert(bh)) + ea_idebug(inode, "cache insert failed"); + error = -ENODATA; + goto cleanup; +found: + /* check the buffer size */ + if (entry->e_value_block != 0) + goto bad_block; + size = le32_to_cpu(entry->e_value_size); + if (size > inode->i_sb->s_blocksize || + 
le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) + goto bad_block; + + if (ext2_xattr_cache_insert(bh)) + ea_idebug(inode, "cache insert failed"); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + /* return value of attribute */ + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + up_read(&EXT2_I(inode)->xattr_sem); + + return error; +} + +/* + * ext2_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +static int +ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct buffer_head *bh = NULL; + struct ext2_xattr_entry *entry; + char *end; + size_t rest = buffer_size; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + down_read(&EXT2_I(inode)->xattr_sem); + error = 0; + if (!EXT2_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); + end = bh->b_data + bh->b_size; + if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + HDR(bh)->h_blocks != cpu_to_le32(1)) { +bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + + /* check the on-disk data structure */ + entry = FIRST_ENTRY(bh); + while (!IS_LAST_ENTRY(entry)) { + struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(entry); + + if ((char *)next >= end) + goto bad_block; + entry = next; + } + if (ext2_xattr_cache_insert(bh)) + ea_idebug(inode, "cache insert failed"); + + /* list the attribute names */ + for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); + entry = EXT2_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext2_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) { + error = -ERANGE; + goto cleanup; + } + buffer += size; + } + rest -= size; + } + } + error = buffer_size - rest; /* total size */ + +cleanup: + brelse(bh); + up_read(&EXT2_I(inode)->xattr_sem); + + return error; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_mutex: don't care + */ +ssize_t +ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext2_xattr_list(dentry, buffer, size); +} + +/* + * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext2_xattr_update_super_block(struct super_block *sb) +{ + if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) + return; + + spin_lock(&EXT2_SB(sb)->s_lock); + EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); + spin_unlock(&EXT2_SB(sb)->s_lock); + sb->s_dirt = 1; + mark_buffer_dirty(EXT2_SB(sb)->s_sbh); +} + +/* + * ext2_xattr_set() + * + * Create, replace or remove an extended attribute for this inode. 
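The "buffer == NULL computes the required size" convention documented for ext2_xattr_list() earlier in this file is what listxattr(2) exposes to user space. A typical two-pass caller looks like the sketch below; the path is an assumed example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/ext2/file";   /* assumed example path */
	ssize_t size, i;
	char *names;

	/* First pass: size == 0 asks only for the buffer size required. */
	size = listxattr(path, NULL, 0);
	if (size < 0)
		return 1;

	names = malloc(size);
	if (!names)
		return 1;

	/* Second pass: fill the buffer with NUL-separated attribute names. */
	size = listxattr(path, names, size);
	for (i = 0; i < size; i += strlen(names + i) + 1)
		printf("%s\n", names + i);

	free(names);
	return 0;
}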
Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext2_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL; + struct ext2_xattr_header *header = NULL; + struct ext2_xattr_entry *here, *last; + size_t name_len, free, min_offs = sb->s_blocksize; + int not_found = 1, error; + char *end; + + /* + * header -- Points either into bh, or to a temporarily + * allocated buffer. + * here -- The named entry found, or the place for inserting, within + * the block pointed to by header. + * last -- Points right after the last named entry within the block + * pointed to by header. + * min_offs -- The offset of the first value (values are aligned + * towards the end of the block). + * end -- Points right after the block pointed to by header. + */ + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + name_index, name, value, (long)value_len); + + if (value == NULL) + value_len = 0; + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + if (name_len > 255 || value_len > sb->s_blocksize) + return -ERANGE; + down_write(&EXT2_I(inode)->xattr_sem); + if (EXT2_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bh = sb_bread(sb, EXT2_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), + le32_to_cpu(HDR(bh)->h_refcount)); + header = HDR(bh); + end = bh->b_data + bh->b_size; + if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + header->h_blocks != cpu_to_le32(1)) { +bad_block: ext2_error(sb, "ext2_xattr_set", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. */ + here = FIRST_ENTRY(bh); + while (!IS_LAST_ENTRY(here)) { + struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here); + if ((char *)next >= end) + goto bad_block; + if (!here->e_value_block && here->e_value_size) { + size_t offs = le16_to_cpu(here->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + not_found = name_index - here->e_name_index; + if (!not_found) + not_found = name_len - here->e_name_len; + if (!not_found) + not_found = memcmp(name, here->e_name,name_len); + if (not_found <= 0) + break; + here = next; + } + last = here; + /* We still need to compute min_offs and last. */ + while (!IS_LAST_ENTRY(last)) { + struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last); + if ((char *)next >= end) + goto bad_block; + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + last = next; + } + + /* Check whether we have enough space left. */ + free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); + } else { + /* We will use a new extended attribute block. */ + free = sb->s_blocksize - + sizeof(struct ext2_xattr_header) - sizeof(__u32); + here = last = NULL; /* avoid gcc uninitialized warning. */ + } + + if (not_found) { + /* Request to remove a nonexistent attribute? 
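From user space, the create/replace distinction documented above for ext2_xattr_set() is selected through the flags argument of setxattr(2). The path and attribute value below are made-up examples:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/ext2/file";   /* assumed example path */
	const char *val = "example";

	/* Fails with EEXIST if user.comment is already present. */
	if (setxattr(path, "user.comment", val, strlen(val), XATTR_CREATE) != 0)
		perror("XATTR_CREATE");

	/* Fails with ENODATA if user.comment does not exist yet. */
	if (setxattr(path, "user.comment", val, strlen(val), XATTR_REPLACE) != 0)
		perror("XATTR_REPLACE");

	return 0;
}

With flags == 0 the attribute is created if missing and replaced otherwise, which is the default path through the handler.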
*/ + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (value == NULL) + goto cleanup; + } else { + /* Request to create an existing attribute? */ + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + if (!here->e_value_block && here->e_value_size) { + size_t size = le32_to_cpu(here->e_value_size); + + if (le16_to_cpu(here->e_value_offs) + size > + sb->s_blocksize || size > sb->s_blocksize) + goto bad_block; + free += EXT2_XATTR_SIZE(size); + } + free += EXT2_XATTR_LEN(name_len); + } + error = -ENOSPC; + if (free < EXT2_XATTR_LEN(name_len) + EXT2_XATTR_SIZE(value_len)) + goto cleanup; + + /* Here we know that we can set the new attribute. */ + + if (header) { + struct mb_cache_entry *ce; + + /* assert(header == HDR(bh)); */ + ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, + bh->b_blocknr); + lock_buffer(bh); + if (header->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "modifying in-place"); + if (ce) + mb_cache_entry_free(ce); + /* keep the buffer locked while modifying it. */ + } else { + int offset; + + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); + ea_bdebug(bh, "cloning"); + header = kmalloc(bh->b_size, GFP_KERNEL); + error = -ENOMEM; + if (header == NULL) + goto cleanup; + memcpy(header, HDR(bh), bh->b_size); + header->h_refcount = cpu_to_le32(1); + + offset = (char *)here - bh->b_data; + here = ENTRY((char *)header + offset); + offset = (char *)last - bh->b_data; + last = ENTRY((char *)header + offset); + } + } else { + /* Allocate a buffer where we construct the new block. */ + header = kzalloc(sb->s_blocksize, GFP_KERNEL); + error = -ENOMEM; + if (header == NULL) + goto cleanup; + end = (char *)header + sb->s_blocksize; + header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); + header->h_blocks = header->h_refcount = cpu_to_le32(1); + last = here = ENTRY(header+1); + } + + /* Iff we are modifying the block in-place, bh is locked here. */ + + if (not_found) { + /* Insert the new name. */ + size_t size = EXT2_XATTR_LEN(name_len); + size_t rest = (char *)last - (char *)here; + memmove((char *)here + size, here, rest); + memset(here, 0, size); + here->e_name_index = name_index; + here->e_name_len = name_len; + memcpy(here->e_name, name, name_len); + } else { + if (!here->e_value_block && here->e_value_size) { + char *first_val = (char *)header + min_offs; + size_t offs = le16_to_cpu(here->e_value_offs); + char *val = (char *)header + offs; + size_t size = EXT2_XATTR_SIZE( + le32_to_cpu(here->e_value_size)); + + if (size == EXT2_XATTR_SIZE(value_len)) { + /* The old and the new value have the same + size. Just replace. */ + here->e_value_size = cpu_to_le32(value_len); + memset(val + size - EXT2_XATTR_PAD, 0, + EXT2_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, value, value_len); + goto skip_replace; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = ENTRY(header+1); + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT2_XATTR_NEXT(last); + } + } + if (value == NULL) { + /* Remove the old name. */ + size_t size = EXT2_XATTR_LEN(name_len); + last = ENTRY((char *)last - size); + memmove(here, (char*)here + size, + (char*)last - (char*)here); + memset(last, 0, size); + } + } + + if (value != NULL) { + /* Insert the new value. 
*/ + here->e_value_size = cpu_to_le32(value_len); + if (value_len) { + size_t size = EXT2_XATTR_SIZE(value_len); + char *val = (char *)header + min_offs - size; + here->e_value_offs = + cpu_to_le16((char *)val - (char *)header); + memset(val + size - EXT2_XATTR_PAD, 0, + EXT2_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, value, value_len); + } + } + +skip_replace: + if (IS_LAST_ENTRY(ENTRY(header+1))) { + /* This block is now empty. */ + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ + error = ext2_xattr_set2(inode, bh, NULL); + } else { + ext2_xattr_rehash(header, here); + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ + error = ext2_xattr_set2(inode, bh, header); + } + +cleanup: + brelse(bh); + if (!(bh && header == HDR(bh))) + kfree(header); + up_write(&EXT2_I(inode)->xattr_sem); + + return error; +} + +/* + * Second half of ext2_xattr_set(): Update the file system. + */ +static int +ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + struct ext2_xattr_header *header) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + int error; + + if (header) { + new_bh = ext2_xattr_cache_find(inode, header); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == old_bh) { + ea_bdebug(new_bh, "keeping this block"); + } else { + /* The old block is released after updating + the inode. */ + ea_bdebug(new_bh, "reusing block"); + + error = dquot_alloc_block(inode, 1); + if (error) { + unlock_buffer(new_bh); + goto cleanup; + } + le32_add_cpu(&HDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "refcount now=%d", + le32_to_cpu(HDR(new_bh)->h_refcount)); + } + unlock_buffer(new_bh); + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. No need to lock the block as we + don't need to change the reference count. */ + new_bh = old_bh; + get_bh(new_bh); + ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + ext2_fsblk_t goal = ext2_group_first_block_no(sb, + EXT2_I(inode)->i_block_group); + int block = ext2_new_block(inode, goal, &error); + if (error) + goto cleanup; + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { + ext2_free_blocks(inode, block, 1); + mark_inode_dirty(inode); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + memcpy(new_bh->b_data, header, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } + mark_buffer_dirty(new_bh); + if (IS_SYNC(inode)) { + sync_dirty_buffer(new_bh); + error = -EIO; + if (buffer_req(new_bh) && !buffer_uptodate(new_bh)) + goto cleanup; + } + } + + /* Update the inode. */ + EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) { + error = sync_inode_metadata(inode, 1); + /* In case sync failed due to ENOSPC the inode was actually + * written (only some dirty data were not) so we just proceed + * as if nothing happened and cleanup the unused block */ + if (error && error != -ENOSPC) { + if (new_bh && new_bh != old_bh) { + dquot_free_block_nodirty(inode, 1); + mark_inode_dirty(inode); + } + goto cleanup; + } + } else + mark_inode_dirty(inode); + + error = 0; + if (old_bh && old_bh != new_bh) { + struct mb_cache_entry *ce; + + /* + * If there was an old block and we are no longer using it, + * release the old block. 
+ */ + ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev, + old_bh->b_blocknr); + lock_buffer(old_bh); + if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { + /* Free the old block. */ + if (ce) + mb_cache_entry_free(ce); + ea_bdebug(old_bh, "freeing"); + ext2_free_blocks(inode, old_bh->b_blocknr, 1); + mark_inode_dirty(inode); + /* We let our caller release old_bh, so we + * need to duplicate the buffer before. */ + get_bh(old_bh); + bforget(old_bh); + } else { + /* Decrement the refcount only. */ + le32_add_cpu(&HDR(old_bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + dquot_free_block_nodirty(inode, 1); + mark_inode_dirty(inode); + mark_buffer_dirty(old_bh); + ea_bdebug(old_bh, "refcount now=%d", + le32_to_cpu(HDR(old_bh)->h_refcount)); + } + unlock_buffer(old_bh); + } + +cleanup: + brelse(new_bh); + + return error; +} + +/* + * ext2_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. + */ +void +ext2_xattr_delete_inode(struct inode *inode) +{ + struct buffer_head *bh = NULL; + struct mb_cache_entry *ce; + + down_write(&EXT2_I(inode)->xattr_sem); + if (!EXT2_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + if (!bh) { + ext2_error(inode->i_sb, "ext2_xattr_delete_inode", + "inode %ld: block %d read error", inode->i_ino, + EXT2_I(inode)->i_file_acl); + goto cleanup; + } + ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); + if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || + HDR(bh)->h_blocks != cpu_to_le32(1)) { + ext2_error(inode->i_sb, "ext2_xattr_delete_inode", + "inode %ld: bad block %d", inode->i_ino, + EXT2_I(inode)->i_file_acl); + goto cleanup; + } + ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr); + lock_buffer(bh); + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { + if (ce) + mb_cache_entry_free(ce); + ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); + get_bh(bh); + bforget(bh); + unlock_buffer(bh); + } else { + le32_add_cpu(&HDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + ea_bdebug(bh, "refcount now=%d", + le32_to_cpu(HDR(bh)->h_refcount)); + unlock_buffer(bh); + mark_buffer_dirty(bh); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); + dquot_free_block_nodirty(inode, 1); + } + EXT2_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); + up_write(&EXT2_I(inode)->xattr_sem); +} + +/* + * ext2_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext2_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + + +/* + * ext2_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. 
+ */ +static int +ext2_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); + if (!ce) + return -ENOMEM; + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache (%d cache entries)", + atomic_read(&ext2_xattr_cache->c_entry_count)); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, + atomic_read(&ext2_xattr_cache->c_entry_count)); + mb_cache_entry_release(ce); + } + return error; +} + +/* + * ext2_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. + */ +static int +ext2_xattr_cmp(struct ext2_xattr_header *header1, + struct ext2_xattr_header *header2) +{ + struct ext2_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT2_XATTR_NEXT(entry1); + entry2 = EXT2_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext2_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a locked buffer head to the block found, or NULL if such + * a block was not found or an error occurred. + */ +static struct buffer_head * +ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext2_error(inode->i_sb, "ext2_xattr_cache_find", + "inode %ld: block %ld read error", + inode->i_ino, (unsigned long) ce->e_block); + } else { + lock_buffer(bh); + if (le32_to_cpu(HDR(bh)->h_refcount) > + EXT2_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %ld refcount %d>%d", + (unsigned long) ce->e_block, + le32_to_cpu(HDR(bh)->h_refcount), + EXT2_XATTR_REFCOUNT_MAX); + } else if (!ext2_xattr_cmp(header, HDR(bh))) { + ea_bdebug(bh, "b_count=%d", + atomic_read(&(bh->b_count))); + mb_cache_entry_release(ce); + return bh; + } + unlock_buffer(bh); + brelse(bh); + } + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext2_xattr_hash_entry() + * + * Compute the hash of an extended attribute. 
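The name half of the hash computed by the function that follows is a 5-bit rotate-and-xor over the name bytes. A user-space sketch of just that name part, for reference (the attribute name is an assumed example, and the kernel code additionally folds in the value words):

#include <stdio.h>

#define NAME_HASH_SHIFT 5

/* Rotate left by 5 and xor in each name byte, mirroring the name loop below. */
static unsigned int name_hash_sketch(const char *name, int len)
{
	unsigned int hash = 0;
	int n;

	for (n = 0; n < len; n++)
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
		       (unsigned char)name[n];
	return hash;
}

int main(void)
{
	const char *name = "comment";   /* e_name of a user.comment attribute */
	printf("name hash = %#x\n", name_hash_sketch(name, 7));
	return 0;
}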
+ */ +static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header, + struct ext2_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n=0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext2_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext2_xattr_rehash(struct ext2_xattr_header *header, + struct ext2_xattr_entry *entry) +{ + struct ext2_xattr_entry *here; + __u32 hash = 0; + + ext2_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT2_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +init_ext2_xattr(void) +{ + ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); + if (!ext2_xattr_cache) + return -ENOMEM; + return 0; +} + +void +exit_ext2_xattr(void) +{ + mb_cache_destroy(ext2_xattr_cache); +} diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h new file mode 100644 index 00000000..5e41cccf --- /dev/null +++ b/fs/ext2/xattr.h @@ -0,0 +1,127 @@ +/* + File: linux/ext2_xattr.h + + On-disk format of extended attributes for the ext2 filesystem. 
+ + (C) 2001 Andreas Gruenbacher, +*/ + +#include +#include + +/* Magic value in attribute blocks */ +#define EXT2_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT2_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT2_XATTR_INDEX_USER 1 +#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT2_XATTR_INDEX_TRUSTED 4 +#define EXT2_XATTR_INDEX_LUSTRE 5 +#define EXT2_XATTR_INDEX_SECURITY 6 + +struct ext2_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext2_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT2_XATTR_PAD_BITS 2 +#define EXT2_XATTR_PAD (1<e_name_len)) ) +#define EXT2_XATTR_SIZE(size) \ + (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) + +# ifdef CONFIG_EXT2_FS_XATTR + +extern const struct xattr_handler ext2_xattr_user_handler; +extern const struct xattr_handler ext2_xattr_trusted_handler; +extern const struct xattr_handler ext2_xattr_acl_access_handler; +extern const struct xattr_handler ext2_xattr_acl_default_handler; +extern const struct xattr_handler ext2_xattr_security_handler; + +extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); + +extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); + +extern void ext2_xattr_delete_inode(struct inode *); +extern void ext2_xattr_put_super(struct super_block *); + +extern int init_ext2_xattr(void); +extern void exit_ext2_xattr(void); + +extern const struct xattr_handler *ext2_xattr_handlers[]; + +# else /* CONFIG_EXT2_FS_XATTR */ + +static inline int +ext2_xattr_get(struct inode *inode, int name_index, + const char *name, void *buffer, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline int +ext2_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext2_xattr_delete_inode(struct inode *inode) +{ +} + +static inline void +ext2_xattr_put_super(struct super_block *sb) +{ +} + +static inline int +init_ext2_xattr(void) +{ + return 0; +} + +static inline void +exit_ext2_xattr(void) +{ +} + +#define ext2_xattr_handlers NULL + +# endif /* CONFIG_EXT2_FS_XATTR */ + +#ifdef CONFIG_EXT2_FS_SECURITY +extern int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); +#else +static inline int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c new file mode 100644 index 00000000..5d979b43 --- /dev/null +++ b/fs/ext2/xattr_security.c @@ -0,0 +1,76 @@ +/* + * linux/fs/ext2/xattr_security.c + * Handler for storing security labels as extended attributes. 
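The EXT2_XATTR_PAD rounding used by the entry and size macros in xattr.h above works out to 4-byte alignment. The sketch below is illustrative rather than a copy of those macros: it assumes the usual 16-byte fixed part of struct ext2_xattr_entry and shows the on-disk cost of a short attribute.

#include <stdio.h>

#define EXT2_XATTR_PAD_BITS 2
#define EXT2_XATTR_PAD      (1 << EXT2_XATTR_PAD_BITS)   /* 4-byte alignment */
#define EXT2_XATTR_ROUND    (EXT2_XATTR_PAD - 1)

/* Assumed fixed size of struct ext2_xattr_entry before the name bytes. */
#define ENTRY_HEADER_SIZE   16

/* Entry descriptor: header plus name, rounded up to a 4-byte boundary. */
#define XATTR_LEN(name_len) \
	(((name_len) + ENTRY_HEADER_SIZE + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)

/* Value storage: the value itself, rounded up to a 4-byte boundary. */
#define XATTR_SIZE(size) \
	(((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)

int main(void)
{
	/* e.g. user.comment: name "comment" (7 bytes), value "hello" (5 bytes) */
	printf("entry descriptor: %d bytes\n", XATTR_LEN(7));   /* 24 */
	printf("value storage:    %d bytes\n", XATTR_SIZE(5));  /* 8  */
	return 0;
}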
+ */ + +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const int prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + buffer, size); +} + +static int +ext2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + value, size, flags); +} + +int +ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, + name, value, len, 0); + kfree(name); + kfree(value); + return err; +} + +const struct xattr_handler ext2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext2_xattr_security_list, + .get = ext2_xattr_security_get, + .set = ext2_xattr_security_set, +}; diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c new file mode 100644 index 00000000..667e46a8 --- /dev/null +++ b/fs/ext2/xattr_trusted.c @@ -0,0 +1,58 @@ +/* + * linux/fs/ext2/xattr_trusted.c + * Handler for trusted extended attributes. 
+ * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int +ext2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + value, size, flags); +} + +const struct xattr_handler ext2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext2_xattr_trusted_list, + .get = ext2_xattr_trusted_get, + .set = ext2_xattr_trusted_set, +}; diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c new file mode 100644 index 00000000..099d20f4 --- /dev/null +++ b/fs/ext2/xattr_user.c @@ -0,0 +1,62 @@ +/* + * linux/fs/ext2/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include "ext2.h" +#include "xattr.h" + +static size_t +ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext2_xattr_user_list, + .get = ext2_xattr_user_get, + .set = ext2_xattr_user_set, +}; diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c new file mode 100644 index 00000000..322a56b2 --- /dev/null +++ b/fs/ext2/xip.c @@ -0,0 +1,92 @@ +/* + * linux/fs/ext2/xip.c + * + * Copyright (C) 2005 IBM Corporation + * Author: Carsten Otte (cotte@de.ibm.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ext2.h" +#include "xip.h" + +static inline int +__inode_direct_access(struct inode *inode, sector_t block, + void **kaddr, 
unsigned long *pfn) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + const struct block_device_operations *ops = bdev->bd_disk->fops; + sector_t sector; + + sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ + + BUG_ON(!ops->direct_access); + return ops->direct_access(bdev, sector, kaddr, pfn); +} + +static inline int +__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, + sector_t *result) +{ + struct buffer_head tmp; + int rc; + + memset(&tmp, 0, sizeof(struct buffer_head)); + rc = ext2_get_block(inode, pgoff, &tmp, create); + *result = tmp.b_blocknr; + + /* did we get a sparse block (hole in the file)? */ + if (!tmp.b_blocknr && !rc) { + BUG_ON(create); + rc = -ENODATA; + } + + return rc; +} + +int +ext2_clear_xip_target(struct inode *inode, sector_t block) +{ + void *kaddr; + unsigned long pfn; + int rc; + + rc = __inode_direct_access(inode, block, &kaddr, &pfn); + if (!rc) + clear_page(kaddr); + return rc; +} + +void ext2_xip_verify_sb(struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && + !sb->s_bdev->bd_disk->fops->direct_access) { + sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); + ext2_msg(sb, KERN_WARNING, + "warning: ignoring xip option - " + "not supported by bdev"); + } +} + +int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, + void **kmem, unsigned long *pfn) +{ + int rc; + sector_t block; + + /* first, retrieve the sector number */ + rc = __ext2_get_block(mapping->host, pgoff, create, &block); + if (rc) + return rc; + + /* retrieve address of the target data */ + rc = __inode_direct_access(mapping->host, block, kmem, pfn); + return rc; +} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h new file mode 100644 index 00000000..18b34d2f --- /dev/null +++ b/fs/ext2/xip.h @@ -0,0 +1,26 @@ +/* + * linux/fs/ext2/xip.h + * + * Copyright (C) 2005 IBM Corporation + * Author: Carsten Otte (cotte@de.ibm.com) + */ + +#ifdef CONFIG_EXT2_FS_XIP +extern void ext2_xip_verify_sb (struct super_block *); +extern int ext2_clear_xip_target (struct inode *, sector_t); + +static inline int ext2_use_xip (struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + return (sbi->s_mount_opt & EXT2_MOUNT_XIP); +} +int ext2_get_xip_mem(struct address_space *, pgoff_t, int, + void **, unsigned long *); +#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) +#else +#define mapping_is_xip(map) 0 +#define ext2_xip_verify_sb(sb) do { } while (0) +#define ext2_use_xip(sb) 0 +#define ext2_clear_xip_target(inode, chain) 0 +#define ext2_get_xip_mem NULL +#endif diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig new file mode 100644 index 00000000..e8c6ba0e --- /dev/null +++ b/fs/ext3/Kconfig @@ -0,0 +1,89 @@ +config EXT3_FS + tristate "Ext3 journalling file system support" + select JBD + help + This is the journalling version of the Second extended file system + (often called ext3), the de facto standard Linux file system + (method to organize files on a storage device) for hard disks. + + The journalling code included in this driver means you do not have + to run e2fsck (file system checker) on your file systems after a + crash. The journal keeps track of any changes that were being made + at the time the system crashed, and can ensure that your file system + is consistent without the need for a lengthy check. + + Other than adding the journal to the file system, the on-disk format + of ext3 is identical to ext2. 
It is possible to freely switch + between using the ext3 driver and the ext2 driver, as long as the + file system has been cleanly unmounted, or e2fsck is run on the file + system. + + To add a journal on an existing ext2 file system or change the + behavior of ext3 file systems, you can use the tune2fs utility ("man + tune2fs"). To modify attributes of files and directories on ext3 + file systems, use chattr ("man chattr"). You need to be using + e2fsprogs version 1.20 or later in order to create ext3 journals + (available at ). + + To compile this file system support as a module, choose M here: the + module will be called ext3. + +config EXT3_DEFAULTS_TO_ORDERED + bool "Default to 'data=ordered' in ext3" + depends on EXT3_FS + default y + help + The journal mode options for ext3 have different tradeoffs + between when data is guaranteed to be on disk and + performance. The use of "data=writeback" can cause + unwritten data to appear in files after an system crash or + power failure, which can be a security issue. However, + "data=ordered" mode can also result in major performance + problems, including seconds-long delays before an fsync() + call returns. For details, see: + + http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs + + If you have been historically happy with ext3's performance, + data=ordered mode will be a safe choice and you should + answer 'y' here. If you understand the reliability and data + privacy issues of data=writeback and are willing to make + that trade off, answer 'n'. + +config EXT3_FS_XATTR + bool "Ext3 extended attributes" + depends on EXT3_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext3. + +config EXT3_FS_POSIX_ACL + bool "Ext3 POSIX Access Control Lists" + depends on EXT3_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT3_FS_SECURITY + bool "Ext3 Security Labels" + depends on EXT3_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext3 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile new file mode 100644 index 00000000..e77766a8 --- /dev/null +++ b/fs/ext3/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the linux ext3-filesystem routines. 
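+# The conditional objects below mirror the EXT3_FS_XATTR, EXT3_FS_POSIX_ACL
+# and EXT3_FS_SECURITY Kconfig options: the xattr, ACL and security-label
+# handlers are only linked into ext3.o when the matching option is enabled.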
+# + +obj-$(CONFIG_EXT3_FS) += ext3.o + +ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o + +ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c new file mode 100644 index 00000000..9d021c0d --- /dev/null +++ b/fs/ext3/acl.c @@ -0,0 +1,481 @@ +/* + * linux/fs/ext3/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext3_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext3_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext3_acl_header *)value)->a_version != + cpu_to_le32(EXT3_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext3_acl_header); + count = ext3_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + ext3_acl_entry *entry = + (ext3_acl_entry *)value; + if ((char *)value + sizeof(ext3_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext3_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext3_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext3_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext3_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count * + sizeof(ext3_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext3_acl_header); + for (n=0; n < acl->a_count; n++) { + ext3_acl_entry *entry = (ext3_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = + cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext3_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext3_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). 
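+ *
+ * Reads the raw ACL from the inode's POSIX ACL extended attribute, converts
+ * it from the on-disk layout with ext3_acl_from_disk() and caches the result
+ * on the inode, so later permission checks can avoid re-reading the xattr.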
+ * + * inode->i_mutex: don't care + */ +static struct posix_acl * +ext3_get_acl(struct inode *inode, int type) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + + retval = ext3_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext3_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext3_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_mutex: down unless called from ext3_new_inode + */ +static int +ext3_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch(type) { + case ACL_TYPE_ACCESS: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext3_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext3_xattr_set_handle(handle, inode, name_index, "", + value, size, 0); + + kfree(value); + + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +int +ext3_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +/* + * Initialize the ACLs of a new inode. Called from ext3_new_inode. 
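+ *
+ * If the parent directory has a default ACL, the new inode inherits it: a new
+ * directory gets it copied as its own default ACL, and the access ACL is
+ * derived from it via posix_acl_create_masq() instead of applying the umask.
+ * Without a default ACL, the usual mode &= ~umask behaviour applies.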
+ * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(dir->i_sb, POSIX_ACL)) { + acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current_umask(); + } + if (test_opt(inode->i_sb, POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + error = ext3_set_acl(handle, inode, + ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_NOFS); + error = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext3_set_acl(handle, inode, + ACL_TYPE_ACCESS, clone); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return error; +} + +/* + * Does chmod for an inode that may have an Access Control List. The + * inode->i_mode field must be updated to the desired value by the caller + * before calling this function. + * Returns 0 on success, or a negative error number. + * + * We change the ACL rather than storing some ACL entries in the file + * mode permission bits (which would be more efficient), because that + * would break once additional permissions (like ACL_APPEND, ACL_DELETE + * for directories) are added. There are no more bits available in the + * file mode. + * + * inode->i_mutex: down + */ +int +ext3_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + handle_t *handle; + int retries = 0; + + retry: + handle = ext3_journal_start(inode, + EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext3_std_error(inode->i_sb, error); + goto out; + } + error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); + ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + } +out: + posix_acl_release(clone); + return error; +} + +/* + * Extended attribute handlers + */ +static size_t +ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t +ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int +ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + 
if (!test_opt(dentry->d_sb, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ext3_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + handle_t *handle; + struct posix_acl *acl; + int error, retries = 0; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + +retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = ext3_set_acl(handle, inode, type, acl); + ext3_journal_stop(handle); + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + +release_and_out: + posix_acl_release(acl); + return error; +} + +const struct xattr_handler ext3_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = ext3_xattr_list_acl_access, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, +}; + +const struct xattr_handler ext3_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = ext3_xattr_list_acl_default, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, +}; diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h new file mode 100644 index 00000000..5faf8048 --- /dev/null +++ b/fs/ext3/acl.h @@ -0,0 +1,77 @@ +/* + File: fs/ext3/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT3_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext3_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext3_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext3_acl_header; + +static inline size_t ext3_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext3_acl_header) + + count * sizeof(ext3_acl_entry_short); + } else { + return sizeof(ext3_acl_header) + + 4 * sizeof(ext3_acl_entry_short) + + (count - 4) * sizeof(ext3_acl_entry); + } +} + +static inline int ext3_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext3_acl_header); + s = size - 4 * sizeof(ext3_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext3_acl_entry_short)) + return -1; + return size / sizeof(ext3_acl_entry_short); + } else { + if (s % sizeof(ext3_acl_entry)) + return -1; + return s / sizeof(ext3_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT3_FS_POSIX_ACL + +/* acl.c */ +extern int ext3_check_acl (struct inode *, int, unsigned int); +extern int ext3_acl_chmod (struct inode *); +extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT3_FS_POSIX_ACL */ +#include +#define ext3_check_acl NULL + +static inline int +ext3_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT3_FS_POSIX_ACL */ + diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c new file mode 100644 index 00000000..fe52297e --- /dev/null +++ b/fs/ext3/balloc.c @@ -0,0 +1,2156 @@ +/* 
+ * linux/fs/ext3/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext3_fill_super). + */ + + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* + * Calculate the block group number and offset, given a block number + */ +static void ext3_get_group_no_and_offset(struct super_block *sb, + ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + if (offsetp) + *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); + if (blockgrpp) + *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb); +} + +/** + * ext3_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) +{ + unsigned long group_desc; + unsigned long offset; + struct ext3_group_desc * desc; + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (block_group >= sbi->s_groups_count) { + ext3_error (sb, "ext3_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + + return NULL; + } + smp_rmb(); + + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext3_error (sb, "ext3_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data; + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc + offset; +} + +static int ext3_valid_block_bitmap(struct super_block *sb, + struct ext3_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + ext3_grpblk_t offset; + ext3_grpblk_t next_zero_bit; + ext3_fsblk_t bitmap_blk; + ext3_fsblk_t group_first_block; + + group_first_block = ext3_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + offset = bitmap_blk - group_first_block; + if (!ext3_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode bitmap block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap); + offset = bitmap_blk - 
group_first_block; + if (!ext3_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode table block number is set */ + bitmap_blk = le32_to_cpu(desc->bg_inode_table); + offset = bitmap_blk - group_first_block; + next_zero_bit = ext3_find_next_zero_bit(bh->b_data, + offset + EXT3_SB(sb)->s_itb_per_group, + offset); + if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group) + /* good bitmap for inode tables */ + return 1; + +err_out: + ext3_error(sb, __func__, + "Invalid block bitmap - " + "block_group = %d, block = %lu", + block_group, bitmap_blk); + return 0; +} + +/** + * read_block_bitmap() + * @sb: super block + * @block_group: given block group + * + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. + */ +static struct buffer_head * +read_block_bitmap(struct super_block *sb, unsigned int block_group) +{ + struct ext3_group_desc * desc; + struct buffer_head * bh = NULL; + ext3_fsblk_t bitmap_blk; + + desc = ext3_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext3_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + if (likely(bh_uptodate_or_lock(bh))) + return bh; + + if (bh_submit_read(bh) < 0) { + brelse(bh); + ext3_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); + return NULL; + } + ext3_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, continue with corrupt + * bitmap + */ + return bh; +} +/* + * The reservation window structure operations + * -------------------------------------------- + * Operations include: + * dump, find, add, remove, is_empty, find_next_reservable_window, etc. + * + * We use a red-black tree to represent per-filesystem reservation + * windows. + * + */ + +/** + * __rsv_window_dump() -- Dump the filesystem block allocation reservation map + * @rb_root: root of per-filesystem reservation rb tree + * @verbose: verbose mode + * @fn: function which wishes to dump the reservation map + * + * If verbose is turned on, it will print the whole block reservation + * windows(start, end). Otherwise, it will only print out the "bad" windows, + * those windows that overlap with their immediate neighbors. 
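+ *
+ * A window is reported as bad when its start is at or past its end, or when
+ * it overlaps the previous window in the tree; if any bad window is seen the
+ * walk is restarted in verbose mode and the function ends in BUG_ON().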
+ */ +#if 1 +static void __rsv_window_dump(struct rb_root *root, int verbose, + const char *fn) +{ + struct rb_node *n; + struct ext3_reserve_window_node *rsv, *prev; + int bad; + +restart: + n = rb_first(root); + bad = 0; + prev = NULL; + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + while (n) { + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + if (verbose) + printk("reservation window 0x%p " + "start: %lu, end: %lu\n", + rsv, rsv->rsv_start, rsv->rsv_end); + if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { + printk("Bad reservation %p (start >= end)\n", + rsv); + bad = 1; + } + if (prev && prev->rsv_end >= rsv->rsv_start) { + printk("Bad reservation %p (prev->end >= start)\n", + rsv); + bad = 1; + } + if (bad) { + if (!verbose) { + printk("Restarting reservation walk in verbose mode\n"); + verbose = 1; + goto restart; + } + } + n = rb_next(n); + prev = rsv; + } + printk("Window map complete.\n"); + BUG_ON(bad); +} +#define rsv_window_dump(root, verbose) \ + __rsv_window_dump((root), (verbose), __func__) +#else +#define rsv_window_dump(root, verbose) do {} while (0) +#endif + +/** + * goal_in_my_reservation() + * @rsv: inode's reservation window + * @grp_goal: given goal block relative to the allocation block group + * @group: the current allocation block group + * @sb: filesystem super block + * + * Test if the given goal block (group relative) is within the file's + * own block reservation window range. + * + * If the reservation window is outside the goal allocation group, return 0; + * grp_goal (given goal block) could be -1, which means no specific + * goal block. In this case, always return 1. + * If the goal block is within the reservation window, return 1; + * otherwise, return 0; + */ +static int +goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, + unsigned int group, struct super_block * sb) +{ + ext3_fsblk_t group_first_block, group_last_block; + + group_first_block = ext3_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + if ((rsv->_rsv_start > group_last_block) || + (rsv->_rsv_end < group_first_block)) + return 0; + if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) + || (grp_goal + group_first_block > rsv->_rsv_end))) + return 0; + return 1; +} + +/** + * search_reserve_window() + * @rb_root: root of reservation tree + * @goal: target allocation block + * + * Find the reserved window which includes the goal, or the previous one + * if the goal is not in any window. + * Returns NULL if there are no windows or if all windows start after the goal. + */ +static struct ext3_reserve_window_node * +search_reserve_window(struct rb_root *root, ext3_fsblk_t goal) +{ + struct rb_node *n = root->rb_node; + struct ext3_reserve_window_node *rsv; + + if (!n) + return NULL; + + do { + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + + if (goal < rsv->rsv_start) + n = n->rb_left; + else if (goal > rsv->rsv_end) + n = n->rb_right; + else + return rsv; + } while (n); + /* + * We've fallen off the end of the tree: the goal wasn't inside + * any particular node. OK, the previous node must be to one + * side of the interval containing the goal. If it's the RHS, + * we need to back up one. + */ + if (rsv->rsv_start > goal) { + n = rb_prev(&rsv->rsv_node); + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + } + return rsv; +} + +/** + * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree. 
+ * @sb: super block + * @rsv: reservation window to add + * + * Must be called with rsv_lock hold. + */ +void ext3_rsv_window_add(struct super_block *sb, + struct ext3_reserve_window_node *rsv) +{ + struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root; + struct rb_node *node = &rsv->rsv_node; + ext3_fsblk_t start = rsv->rsv_start; + + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct ext3_reserve_window_node *this; + + while (*p) + { + parent = *p; + this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node); + + if (start < this->rsv_start) + p = &(*p)->rb_left; + else if (start > this->rsv_end) + p = &(*p)->rb_right; + else { + rsv_window_dump(root, 1); + BUG(); + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); +} + +/** + * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree + * @sb: super block + * @rsv: reservation window to remove + * + * Mark the block reservation window as not allocated, and unlink it + * from the filesystem reservation window rb tree. Must be called with + * rsv_lock hold. + */ +static void rsv_window_remove(struct super_block *sb, + struct ext3_reserve_window_node *rsv) +{ + rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_alloc_hit = 0; + rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); +} + +/* + * rsv_is_empty() -- Check if the reservation window is allocated. + * @rsv: given reservation window to check + * + * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED. + */ +static inline int rsv_is_empty(struct ext3_reserve_window *rsv) +{ + /* a valid reservation end block could not be 0 */ + return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED; +} + +/** + * ext3_init_block_alloc_info() + * @inode: file inode structure + * + * Allocate and initialize the reservation window structure, and + * link the window to the ext3 inode structure at last + * + * The reservation window structure is only dynamically allocated + * and linked to ext3 inode the first time the open file + * needs a new block. So, before every ext3_new_block(s) call, for + * regular files, we should check whether the reservation window + * structure exists or not. In the latter case, this function is called. + * Fail to do so will result in block reservation being turned off for that + * open file. + * + * This function is called from ext3_get_blocks_handle(), also called + * when setting the reservation window size through ioctl before the file + * is open for write (needs block allocation). + * + * Needs truncate_mutex protection prior to call this function. 
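+ *
+ * The initial goal window size is EXT3_DEFAULT_RESERVE_BLOCKS, or 0 when the
+ * filesystem is mounted with reservations disabled; it can later be grown by
+ * alloc_new_reservation() or changed through the EXT3_IOC_SETRSVSZ ioctl.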
+ */ +void ext3_init_block_alloc_info(struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct super_block *sb = inode->i_sb; + + block_i = kmalloc(sizeof(*block_i), GFP_NOFS); + if (block_i) { + struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node; + + rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + + /* + * if filesystem is mounted with NORESERVATION, the goal + * reservation window size is set to zero to indicate + * block reservation is off + */ + if (!test_opt(sb, RESERVATION)) + rsv->rsv_goal_size = 0; + else + rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS; + rsv->rsv_alloc_hit = 0; + block_i->last_alloc_logical_block = 0; + block_i->last_alloc_physical_block = 0; + } + ei->i_block_alloc_info = block_i; +} + +/** + * ext3_discard_reservation() + * @inode: inode + * + * Discard(free) block reservation window on last file close, or truncate + * or at last iput(). + * + * It is being called in three cases: + * ext3_release_file(): last writer close the file + * ext3_clear_inode(): last iput(), when nobody link to this file. + * ext3_truncate(): when the block indirect map is about to change. + * + */ +void ext3_discard_reservation(struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_reserve_window_node *rsv; + spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; + + if (!block_i) + return; + + rsv = &block_i->rsv_window_node; + if (!rsv_is_empty(&rsv->rsv_window)) { + spin_lock(rsv_lock); + if (!rsv_is_empty(&rsv->rsv_window)) + rsv_window_remove(inode->i_sb, rsv); + spin_unlock(rsv_lock); + } +} + +/** + * ext3_free_blocks_sb() -- Free given blocks and update quota + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to free + * @count: number of blocks to free + * @pdquot_freed_blocks: pointer to quota + */ +void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + unsigned long block_group; + ext3_grpblk_t bit; + unsigned long i; + unsigned long overflow; + struct ext3_group_desc * desc; + struct ext3_super_block * es; + struct ext3_sb_info *sbi; + int err = 0, ret; + ext3_grpblk_t group_freed; + + *pdquot_freed_blocks = 0; + sbi = EXT3_SB(sb); + es = sbi->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks not in datazone - " + "block = "E3FSBLK", count = %lu", block, count); + goto error_return; + } + + ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. 
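+ *
+ * Example (assuming 32768 blocks per group, the 4KiB-block case): freeing
+ * 100 blocks starting at group-relative bit 32700 frees 68 blocks in this
+ * group, then jumps back to do_more to free the remaining 32 blocks at the
+ * start of the following group.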
+ */ + if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks in system zones - " + "Block = "E3FSBLK", count = %lu", + block, count); + goto error_return; + } + + /* + * We are about to start releasing blocks in the bitmap, + * so we need undo access. + */ + /* @@@ check errors */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + jbd_lock_bh_state(bitmap_bh); + + for (i = 0, group_freed = 0; i < count; i++) { + /* + * An HJ special. This is expensive... + */ +#ifdef CONFIG_JBD_DEBUG + jbd_unlock_bh_state(bitmap_bh); + { + struct buffer_head *debug_bh; + debug_bh = sb_find_get_block(sb, block + i); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "Deleted!"); + if (!bh2jh(bitmap_bh)->b_committed_data) + BUFFER_TRACE(debug_bh, + "No committed data in bitmap"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); + __brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); +#endif + if (need_resched()) { + jbd_unlock_bh_state(bitmap_bh); + cond_resched(); + jbd_lock_bh_state(bitmap_bh); + } + /* @@@ This prevents newly-allocated data from being + * freed and then reallocated within the same + * transaction. + * + * Ideally we would want to allow that to happen, but to + * do so requires making journal_forget() capable of + * revoking the queued write of a data block, which + * implies blocking on the journal lock. *forget() + * cannot block due to truncate races. + * + * Eventually we can fix this by making journal_forget() + * return a status indicating whether or not it was able + * to revoke the buffer. On successful revoke, it is + * safe not to set the allocation bit in the committed + * bitmap, because we know that there is no outstanding + * activity on the buffer any more and so it is safe to + * reallocate it. + */ + BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); + J_ASSERT_BH(bitmap_bh, + bh2jh(bitmap_bh)->b_committed_data != NULL); + ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, + bh2jh(bitmap_bh)->b_committed_data); + + /* + * We clear the bit in the bitmap after setting the committed + * data bit, because this is the reverse order to that which + * the allocator uses. 
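+ * (claim_block() works in the opposite order: it sets the bit in the live
+ * bitmap first and only then looks at b_committed_data, so an allocation
+ * racing with this free either sees the bit still set here or sees the
+ * committed-data bit we set above, and backs off in both cases.)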
+ */ + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit + i, bitmap_bh->b_data)) { + jbd_unlock_bh_state(bitmap_bh); + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + block + i); + jbd_lock_bh_state(bitmap_bh); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + group_freed++; + } + } + jbd_unlock_bh_state(bitmap_bh); + + spin_lock(sb_bgl_lock(sbi, block_group)); + le16_add_cpu(&desc->bg_free_blocks_count, group_freed); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_add(&sbi->s_freeblocks_counter, count); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + *pdquot_freed_blocks += group_freed; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, err); + return; +} + +/** + * ext3_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + */ +void ext3_free_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t block, unsigned long count) +{ + struct super_block * sb; + unsigned long dquot_freed_blocks; + + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); + return; + } + ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); + if (dquot_freed_blocks) + dquot_free_block(inode, dquot_freed_blocks); + return; +} + +/** + * ext3_test_allocatable() + * @nr: given allocation block group + * @bh: bufferhead contains the bitmap of the given block group + * + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes. + */ +static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh) +{ + int ret; + struct journal_head *jh = bh2jh(bh); + + if (ext3_test_bit(nr, bh->b_data)) + return 0; + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) + ret = 1; + else + ret = !ext3_test_bit(nr, jh->b_committed_data); + jbd_unlock_bh_state(bh); + return ret; +} + +/** + * bitmap_search_next_usable_block() + * @start: the starting block (group relative) of the search + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) of the reservation + * + * The bitmap search --- search forward alternately through the actual + * bitmap on disk and the last-committed copy in journal, until we find a + * bit free in both bitmaps. 
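+ *
+ * Returns the group-relative bit number of a block that is free in both
+ * copies, or -1 if no such block exists before @maxblocks.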
+ */ +static ext3_grpblk_t +bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, + ext3_grpblk_t maxblocks) +{ + ext3_grpblk_t next; + struct journal_head *jh = bh2jh(bh); + + while (start < maxblocks) { + next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start); + if (next >= maxblocks) + return -1; + if (ext3_test_allocatable(next, bh)) + return next; + jbd_lock_bh_state(bh); + if (jh->b_committed_data) + start = ext3_find_next_zero_bit(jh->b_committed_data, + maxblocks, next); + jbd_unlock_bh_state(bh); + } + return -1; +} + +/** + * find_next_usable_block() + * @start: the starting block (group relative) to find next + * allocatable block in bitmap. + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) for the search + * + * Find an allocatable block in a bitmap. We honor both the bitmap and + * its last-committed copy (if that exists), and perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; then + * for any free bit in the bitmap. + */ +static ext3_grpblk_t +find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, + ext3_grpblk_t maxblocks) +{ + ext3_grpblk_t here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block within the next XX blocks. + * + * end_goal is more or less random, but it has to be + * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple.. + */ + ext3_grpblk_t end_goal = (start + 63) & ~63; + if (end_goal > maxblocks) + end_goal = maxblocks; + here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal && ext3_test_allocatable(here, bh)) + return here; + ext3_debug("Bit not found near goal\n"); + } + + here = start; + if (here < 0) + here = 0; + + p = bh->b_data + (here >> 3); + r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); + next = (r - bh->b_data) << 3; + + if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) + return next; + + /* + * The bitmap search --- search forward alternately through the actual + * bitmap and the last-committed copy until we find a bit free in + * both + */ + here = bitmap_search_next_usable_block(here, bh, maxblocks); + return here; +} + +/** + * claim_block() + * @lock: the spin lock for this block group + * @block: the free block (group relative) to allocate + * @bh: the buffer_head contains the block group bitmap + * + * We think we can allocate this block in this bitmap. Try to set the bit. + * If that succeeds then check that nobody has allocated and then freed the + * block since we saw that is was not marked in b_committed_data. If it _was_ + * allocated and freed then clear the bit in the bitmap again and return + * zero (failure). 
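+ *
+ * The claim itself is an atomic test-and-set on the bitmap buffer; only the
+ * check against the last-committed copy needs the bh_state lock.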
+ */ +static inline int +claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + int ret; + + if (ext3_set_bit_atomic(lock, block, bh->b_data)) + return 0; + jbd_lock_bh_state(bh); + if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) { + ext3_clear_bit_atomic(lock, block, bh->b_data); + ret = 0; + } else { + ret = 1; + } + jbd_unlock_bh_state(bh); + return ret; +} + +/** + * ext3_try_to_allocate() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * Attempt to allocate blocks within a give range. Set the range of allocation + * first, then find the first free bit(s) from the bitmap (within the range), + * and at last, allocate the blocks by claiming the found free bit as allocated. + * + * To set the range of this allocation: + * if there is a reservation window, only try to allocate block(s) from the + * file's own reservation window; + * Otherwise, the allocation range starts from the give goal block, ends at + * the block group's last block. + * + * If we failed to allocate the desired block then we may end up crossing to a + * new bitmap. In that case we must release write access to the old one via + * ext3_journal_release_buffer(), else we'll run out of credits. + */ +static ext3_grpblk_t +ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, + struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal, + unsigned long *count, struct ext3_reserve_window *my_rsv) +{ + ext3_fsblk_t group_first_block; + ext3_grpblk_t start, end; + unsigned long num = 0; + + /* we do allocation within the reservation window if we have a window */ + if (my_rsv) { + group_first_block = ext3_group_first_block_no(sb, group); + if (my_rsv->_rsv_start >= group_first_block) + start = my_rsv->_rsv_start - group_first_block; + else + /* reservation window cross group boundary */ + start = 0; + end = my_rsv->_rsv_end - group_first_block + 1; + if (end > EXT3_BLOCKS_PER_GROUP(sb)) + /* reservation window crosses group boundary */ + end = EXT3_BLOCKS_PER_GROUP(sb); + if ((start <= grp_goal) && (grp_goal < end)) + start = grp_goal; + else + grp_goal = -1; + } else { + if (grp_goal > 0) + start = grp_goal; + else + start = 0; + end = EXT3_BLOCKS_PER_GROUP(sb); + } + + BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb)); + +repeat: + if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) { + grp_goal = find_next_usable_block(start, bitmap_bh, end); + if (grp_goal < 0) + goto fail_access; + if (!my_rsv) { + int i; + + for (i = 0; i < 7 && grp_goal > start && + ext3_test_allocatable(grp_goal - 1, + bitmap_bh); + i++, grp_goal--) + ; + } + } + start = grp_goal; + + if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), + grp_goal, bitmap_bh)) { + /* + * The block was allocated by another thread, or it was + * allocated and then freed by another thread + */ + start++; + grp_goal++; + if (start >= end) + goto fail_access; + goto repeat; + } + num++; + grp_goal++; + while (num < *count && grp_goal < end + && ext3_test_allocatable(grp_goal, bitmap_bh) + && claim_block(sb_bgl_lock(EXT3_SB(sb), group), + grp_goal, bitmap_bh)) { + num++; + grp_goal++; + } + *count = num; + return grp_goal - num; +fail_access: + *count = num; + return -1; +} + +/** + * find_next_reservable_window(): + * find a reservable 
space within the given range. + * It does not allocate the reservation window for now: + * alloc_new_reservation() will do the work later. + * + * @search_head: the head of the searching list; + * This is not necessarily the list head of the whole filesystem + * + * We have both head and start_block to assist the search + * for the reservable space. The list starts from head, + * but we will shift to the place where start_block is, + * then start from there, when looking for a reservable space. + * + * @my_rsv: the reservation window + * + * @sb: the super block + * + * @start_block: the first block we consider to start + * the real search from + * + * @last_block: + * the maximum block number that our goal reservable space + * could start from. This is normally the last block in this + * group. The search will end when we found the start of next + * possible reservable space is out of this boundary. + * This could handle the cross boundary reservation window + * request. + * + * basically we search from the given range, rather than the whole + * reservation double linked list, (start_block, last_block) + * to find a free region that is of my size and has not + * been reserved. + * + */ +static int find_next_reservable_window( + struct ext3_reserve_window_node *search_head, + struct ext3_reserve_window_node *my_rsv, + struct super_block * sb, + ext3_fsblk_t start_block, + ext3_fsblk_t last_block) +{ + struct rb_node *next; + struct ext3_reserve_window_node *rsv, *prev; + ext3_fsblk_t cur; + int size = my_rsv->rsv_goal_size; + + /* TODO: make the start of the reservation window byte-aligned */ + /* cur = *start_block & ~7;*/ + cur = start_block; + rsv = search_head; + if (!rsv) + return -1; + + while (1) { + if (cur <= rsv->rsv_end) + cur = rsv->rsv_end + 1; + + /* TODO? + * in the case we could not find a reservable space + * that is what is expected, during the re-search, we could + * remember what's the largest reservable space we could have + * and return that one. + * + * For now it will fail if we could not find the reservable + * space with expected-size (or more)... + */ + if (cur > last_block) + return -1; /* fail */ + + prev = rsv; + next = rb_next(&rsv->rsv_node); + rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node); + + /* + * Reached the last reservation, we can just append to the + * previous one. + */ + if (!next) + break; + + if (cur + size <= rsv->rsv_start) { + /* + * Found a reserveable space big enough. We could + * have a reservation across the group boundary here + */ + break; + } + } + /* + * we come here either : + * when we reach the end of the whole list, + * and there is empty reservable space after last entry in the list. + * append it to the end of the list. + * + * or we found one reservable space in the middle of the list, + * return the reservation window that we could append to. + * succeed. + */ + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* + * Let's book the whole available window for now. We will check the + * disk bitmap later and then, if there are free blocks then we adjust + * the window size if it's larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window. 
+ */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext3_rsv_window_add(sb, my_rsv); + + return 0; +} + +/** + * alloc_new_reservation()--allocate a new reservation window + * + * To make a new reservation, we search part of the filesystem + * reservation list (the list that inside the group). We try to + * allocate a new reservation window near the allocation goal, + * or the beginning of the group, if there is no goal. + * + * We first find a reservable space after the goal, then from + * there, we check the bitmap for the first free block after + * it. If there is no free block until the end of group, then the + * whole group is full, we failed. Otherwise, check if the free + * block is inside the expected reservable space, if so, we + * succeed. + * If the first free block is outside the reservable space, then + * start from the first free block, we search for next available + * space, and go on. + * + * on succeed, a new reservation will be found and inserted into the list + * It contains at least one free block, and it does not overlap with other + * reservation windows. + * + * failed: we failed to find a reservation window in this group + * + * @my_rsv: the reservation window + * + * @grp_goal: The goal (group-relative). It is where the search for a + * free reservable space should start from. + * if we have a grp_goal(grp_goal >0 ), then start from there, + * no grp_goal(grp_goal = -1), we start from the first block + * of the group. + * + * @sb: the super block + * @group: the group we are trying to allocate in + * @bitmap_bh: the block group block bitmap + * + */ +static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, + ext3_grpblk_t grp_goal, struct super_block *sb, + unsigned int group, struct buffer_head *bitmap_bh) +{ + struct ext3_reserve_window_node *search_head; + ext3_fsblk_t group_first_block, group_end_block, start_block; + ext3_grpblk_t first_free_block; + struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; + unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; + + group_first_block = ext3_group_first_block_no(sb, group); + group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + if (grp_goal < 0) + start_block = group_first_block; + else + start_block = grp_goal + group_first_block; + + size = my_rsv->rsv_goal_size; + + if (!rsv_is_empty(&my_rsv->rsv_window)) { + /* + * if the old reservation is cross group boundary + * and if the goal is inside the old reservation window, + * we will come here when we just failed to allocate from + * the first part of the window. We still have another part + * that belongs to the next group. In this case, there is no + * point to discard our window and try to allocate a new one + * in this group(which will fail). we should + * keep the reservation window, just simply move on. + * + * Maybe we could shift the start block of the reservation + * window to the first block of next group. 
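+ *
+ * Returning -1 in that case is not an out-of-space signal by itself: the
+ * caller keeps the existing window and simply retries in the next group.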
+ */ + + if ((my_rsv->rsv_start <= group_end_block) && + (my_rsv->rsv_end > group_end_block) && + (start_block >= my_rsv->rsv_start)) + return -1; + + if ((my_rsv->rsv_alloc_hit > + (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { + /* + * if the previously allocation hit ratio is + * greater than 1/2, then we double the size of + * the reservation window the next time, + * otherwise we keep the same size window + */ + size = size * 2; + if (size > EXT3_MAX_RESERVE_BLOCKS) + size = EXT3_MAX_RESERVE_BLOCKS; + my_rsv->rsv_goal_size= size; + } + } + + spin_lock(rsv_lock); + /* + * shift the search start to the window near the goal block + */ + search_head = search_reserve_window(fs_rsv_root, start_block); + + /* + * find_next_reservable_window() simply finds a reservable window + * inside the given range(start_block, group_end_block). + * + * To make sure the reservation window has a free bit inside it, we + * need to check the bitmap after we found a reservable window. + */ +retry: + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + + /* + * On success, find_next_reservable_window() returns the + * reservation window where there is a reservable space after it. + * Before we reserve this reservable space, we need + * to make sure there is at least a free block inside this region. + * + * searching the first free bit on the block bitmap and copy of + * last committed bitmap alternatively, until we found a allocatable + * block. Search start from the start block of the reservable space + * we just found. + */ + spin_unlock(rsv_lock); + first_free_block = bitmap_search_next_usable_block( + my_rsv->rsv_start - group_first_block, + bitmap_bh, group_end_block - group_first_block + 1); + + if (first_free_block < 0) { + /* + * no free block left on the bitmap, no point + * to reserve the space. return failed. + */ + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ + } + + start_block = first_free_block + group_first_block; + /* + * check if the first free block is within the + * free space we just reserved + */ + if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + return 0; /* success */ + /* + * if the first free bit we found is out of the reservable space + * continue search for next reservable space, + * start from where the free block is, + * we also shift the list head to where we stopped last time + */ + search_head = my_rsv; + spin_lock(rsv_lock); + goto retry; +} + +/** + * try_to_extend_reservation() + * @my_rsv: given reservation window + * @sb: super block + * @size: the delta to extend + * + * Attempt to expand the reservation window large enough to have + * required number of free blocks + * + * Since ext3_try_to_allocate() will always allocate blocks within + * the reservation window range, if the window size is too small, + * multiple blocks allocation has to stop at the end of the reservation + * window. 
To make this more efficient, given the total number of + * blocks needed and the current size of the window, we try to + * expand the reservation window size if necessary on a best-effort + * basis before ext3_new_blocks() tries to allocate blocks, + */ +static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, + struct super_block *sb, int size) +{ + struct ext3_reserve_window_node *next_rsv; + struct rb_node *next; + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; + + if (!spin_trylock(rsv_lock)) + return; + + next = rb_next(&my_rsv->rsv_node); + + if (!next) + my_rsv->rsv_end += size; + else { + next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node); + + if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) + my_rsv->rsv_end += size; + else + my_rsv->rsv_end = next_rsv->rsv_start - 1; + } + spin_unlock(rsv_lock); +} + +/** + * ext3_try_to_allocate_with_rsv() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @my_rsv: reservation window + * @count: target number of blocks to allocate + * @errp: pointer to store the error code + * + * This is the main function used to allocate a new block and its reservation + * window. + * + * Each time when a new block allocation is need, first try to allocate from + * its own reservation. If it does not have a reservation window, instead of + * looking for a free bit on bitmap first, then look up the reservation list to + * see if it is inside somebody else's reservation window, we try to allocate a + * reservation window for it starting from the goal first. Then do the block + * allocation within the reservation window. + * + * This will avoid keeping on searching the reservation list again and + * again when somebody is looking for a free block (without + * reservation), and there are lots of free blocks, but they are all + * being reserved. + * + * We use a red-black tree for the per-filesystem reservation list. + * + */ +static ext3_grpblk_t +ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, + unsigned int group, struct buffer_head *bitmap_bh, + ext3_grpblk_t grp_goal, + struct ext3_reserve_window_node * my_rsv, + unsigned long *count, int *errp) +{ + ext3_fsblk_t group_first_block, group_last_block; + ext3_grpblk_t ret = 0; + int fatal; + unsigned long num = *count; + + *errp = 0; + + /* + * Make sure we use undo access for the bitmap, because it is critical + * that we do the frozen_data COW on bitmap buffers in all cases even + * if the buffer is in BJ_Forget state in the committing transaction. 
+ */ + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + fatal = ext3_journal_get_undo_access(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + + /* + * we don't deal with reservation when + * filesystem is mounted without reservation + * or the file is not a regular file + * or last attempt to allocate a block with reservation turned on failed + */ + if (my_rsv == NULL ) { + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, + grp_goal, count, NULL); + goto out; + } + /* + * grp_goal is a group relative block number (if there is a goal) + * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb) + * first block is a filesystem wide block number + * first block is the block number of the first block in this group + */ + group_first_block = ext3_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + /* + * Basically we will allocate a new block from inode's reservation + * window. + * + * We need to allocate a new reservation window, if: + * a) inode does not have a reservation window; or + * b) last attempt to allocate a block from existing reservation + * failed; or + * c) we come here with a goal and with a reservation window + * + * We do not need to allocate a new reservation window if we come here + * at the beginning with a goal and the goal is inside the window, or + * we don't have a goal but already have a reservation window. + * then we could go to allocate from the reservation window directly. + */ + while (1) { + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) { + if (my_rsv->rsv_goal_size < *count) + my_rsv->rsv_goal_size = *count; + ret = alloc_new_reservation(my_rsv, grp_goal, sb, + group, bitmap_bh); + if (ret < 0) + break; /* failed */ + + if (!goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) + grp_goal = -1; + } else if (grp_goal >= 0) { + int curr = my_rsv->rsv_end - + (grp_goal + group_first_block) + 1; + + if (curr < *count) + try_to_extend_reservation(my_rsv, sb, + *count - curr); + } + + if ((my_rsv->rsv_start > group_last_block) || + (my_rsv->rsv_end < group_first_block)) { + rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1); + BUG(); + } + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, + grp_goal, &num, &my_rsv->rsv_window); + if (ret >= 0) { + my_rsv->rsv_alloc_hit += num; + *count = num; + break; /* succeed */ + } + num = *count; + } +out: + if (ret >= 0) { + BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " + "bitmap block"); + fatal = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + return ret; + } + + BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); + ext3_journal_release_buffer(handle, bitmap_bh); + return ret; +} + +/** + * ext3_has_free_blocks() + * @sbi: in-core super block structure. + * + * Check if filesystem has at least 1 free block available for allocation. 
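+ *
+ * Blocks reserved for the superuser (s_r_blocks_count) only count as free
+ * for CAP_SYS_RESOURCE tasks or for the uid/gid given by the resuid/resgid
+ * mount options, so ordinary users see an almost-full filesystem as full.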
+ */ +static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +{ + ext3_fsblk_t free_blocks, root_blocks; + + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current_fsuid() && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + return 0; + } + return 1; +} + +/** + * ext3_should_retry_alloc() + * @sb: super block + * @retries: number of attempts that have been made + * + * ext3_should_retry_alloc() is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or committing transaction to complete, and then + * return TRUE. + * + * If the total number of retries exceeds three, return FALSE. + */ +int ext3_should_retry_alloc(struct super_block *sb, int *retries) +{ + if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + return journal_force_commit_nested(EXT3_SB(sb)->s_journal); +} + +/** + * ext3_new_blocks() -- core block(s) allocation function + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block (filesystem wide) + * @count: target number of blocks to allocate + * @errp: error code + * + * ext3_new_blocks uses a goal block to assist allocation. It tries to + * allocate block(s) from the block group that contains the goal block first. + * If that fails, it will try to allocate block(s) from other block groups + * without any specific goal block. + * + */ +ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gdp_bh; + int group_no; + int goal_group; + ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */ + ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block */ + ext3_fsblk_t ret_block; /* filesystem-wide allocated block */ + int bgi; /* blockgroup iteration index */ + int fatal = 0, err; + int performed_allocation = 0; + ext3_grpblk_t free_blocks; /* number of free blocks in a group */ + struct super_block *sb; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + struct ext3_sb_info *sbi; + struct ext3_reserve_window_node *my_rsv = NULL; + struct ext3_block_alloc_info *block_i; + unsigned short windowsz = 0; +#ifdef EXT3FS_DEBUG + static int goal_hits, goal_attempts; +#endif + unsigned long ngroups; + unsigned long num = *count; + + *errp = -ENOSPC; + sb = inode->i_sb; + if (!sb) { + printk("ext3_new_block: nonexistent device"); + return 0; + } + + /* + * Check quota for allocation of this block.
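+ * The full request of *count blocks is charged against the quota up front; + * whatever ends up not being allocated is handed back to the quota with + * dquot_free_block() before returning.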
+ */ + err = dquot_alloc_block(inode, num); + if (err) { + *errp = err; + return 0; + } + + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + ext3_debug("goal=%lu.\n", goal); + /* + * Allocate a block from reservation only when + * filesystem is mounted with reservation (default, -o reservation), and + * it's a regular file, and + * the desired window size is greater than 0 (one can use the ioctl + * command EXT3_IOC_SETRSVSZ to set the window size to 0 and turn off + * reservation on that particular file) + */ + block_i = EXT3_I(inode)->i_block_alloc_info; + if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) + my_rsv = &block_i->rsv_window_node; + + if (!ext3_has_free_blocks(sbi)) { + *errp = -ENOSPC; + goto out; + } + + /* + * First, test whether the goal block is free. + */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group_no = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: + gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * if there are not enough free blocks to make a new reservation, + * turn off reservation for this allocation + */ + if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) + && (rsv_is_empty(&my_rsv->rsv_window))) + my_rsv = NULL; + + if (free_blocks > 0) { + grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, grp_target_blk, + my_rsv, &num, &fatal); + if (fatal) + goto out; + if (grp_alloc_blk >= 0) + goto allocated; + } + + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Now search the rest of the groups. We assume that + * group_no and gdp correctly point to the last group visited. + */ + for (bgi = 0; bgi < ngroups; bgi++) { + group_no++; + if (group_no >= ngroups) + group_no = 0; + gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * skip this group (and avoid loading bitmap) if there + * are no free blocks + */ + if (!free_blocks) + continue; + /* + * skip this group if the number of + * free blocks is less than half of the reservation + * window size. + */ + if (my_rsv && (free_blocks <= (windowsz/2))) + continue; + + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + /* + * try to allocate block(s) from this group, without a goal (-1). + */ + grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, -1, my_rsv, + &num, &fatal); + if (fatal) + goto out; + if (grp_alloc_blk >= 0) + goto allocated; + } + /* + * We may end up with a bogus earlier ENOSPC error due to the + * filesystem being "full" of reservations, while there may in fact + * be free blocks available on disk. In that case we just forget + * about the reservations and do the block allocation as if there + * were no reservations.
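+ * The retry below drops the window (my_rsv = NULL, windowsz = 0) and + * rescans every group starting from goal_group again, this time taking + * free bits straight from the bitmaps.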
+ */ + if (my_rsv) { + my_rsv = NULL; + windowsz = 0; + group_no = goal_group; + goto retry_alloc; + } + /* No space left on the device */ + *errp = -ENOSPC; + goto out; + +allocated: + + ext3_debug("using block group %d(%d)\n", + group_no, gdp->bg_free_blocks_count); + + BUFFER_TRACE(gdp_bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, gdp_bh); + if (fatal) + goto out; + + ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no); + + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || + in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || + in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group) || + in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) { + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "blocks from "E3FSBLK", length %lu", + ret_block, num); + /* + * claim_block() marked the blocks we allocated as in use. So we + * may want to selectively mark some of the blocks as free. + */ + goto retry_alloc; + } + + performed_allocation = 1; + +#ifdef CONFIG_JBD_DEBUG + { + struct buffer_head *debug_bh; + + /* Record bitmap buffer state in the newly allocated block */ + debug_bh = sb_find_get_block(sb, ret_block); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "state when allocated"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); + brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); + spin_lock(sb_bgl_lock(sbi, group_no)); + if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { + int i; + + for (i = 0; i < num; i++) { + if (ext3_test_bit(grp_alloc_blk+i, + bh2jh(bitmap_bh)->b_committed_data)) { + printk("%s: block was unexpectedly set in " + "b_committed_data\n", __func__); + } + } + } + ext3_debug("found bit %d\n", grp_alloc_blk); + spin_unlock(sb_bgl_lock(sbi, group_no)); + jbd_unlock_bh_state(bitmap_bh); +#endif + + if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { + ext3_error(sb, "ext3_new_block", + "block("E3FSBLK") >= blocks count(%d) - " + "block_group = %d, es == %p ", ret_block, + le32_to_cpu(es->s_blocks_count), group_no, es); + goto out; + } + + /* + * It is up to the caller to add the new buffer to a journal + * list of some description. We don't know in advance whether + * the caller wants to use it as metadata or data. + */ + ext3_debug("allocating block %lu. Goal hits %d of %d.\n", + ret_block, goal_hits, goal_attempts); + + spin_lock(sb_bgl_lock(sbi, group_no)); + le16_add_cpu(&gdp->bg_free_blocks_count, -num); + spin_unlock(sb_bgl_lock(sbi, group_no)); + percpu_counter_sub(&sbi->s_freeblocks_counter, num); + + BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); + err = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!fatal) + fatal = err; + + if (fatal) + goto out; + + *errp = 0; + brelse(bitmap_bh); + dquot_free_block(inode, *count-num); + *count = num; + return ret_block; + +io_error: + *errp = -EIO; +out: + if (fatal) { + *errp = fatal; + ext3_std_error(sb, fatal); + } + /* + * Undo the block allocation + */ + if (!performed_allocation) + dquot_free_block(inode, *count); + brelse(bitmap_bh); + return 0; +} + +ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp) +{ + unsigned long count = 1; + + return ext3_new_blocks(handle, inode, goal, &count, errp); +} + +/** + * ext3_count_free_blocks() -- count filesystem free blocks + * @sb: superblock + * + * Adds up the number of free blocks from each block group. 
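+ * + * With EXT3FS_DEBUG the count is also recomputed from the block bitmaps, + * both totals are printed for comparison and the bitmap count is returned; + * otherwise the group descriptor counts are simply summed up.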
+ */ +ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) +{ + ext3_fsblk_t desc_count; + struct ext3_group_desc *gdp; + int i; + unsigned long ngroups = EXT3_SB(sb)->s_groups_count; +#ifdef EXT3FS_DEBUG + struct ext3_super_block *es; + ext3_fsblk_t bitmap_count; + unsigned long x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext3_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + x = ext3_count_free(bitmap_bh, sb->s_blocksize); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3_count_free_blocks: stored = "E3FSBLK + ", computed = "E3FSBLK", "E3FSBLK"\n", + le32_to_cpu(es->s_free_blocks_count), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext3_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + } + + return desc_count; +#endif +} + +static inline int test_root(int a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext3_group_sparse(int group) +{ + if (group <= 1) + return 1; + if (!(group & 1)) + return 0; + return (test_root(group, 7) || test_root(group, 5) || + test_root(group, 3)); +} + +/** + * ext3_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext3_bg_has_super(struct super_block *sb, int group) +{ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) && + !ext3_group_sparse(group)) + return 0; + return 1; +} + +static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group) +{ + unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); + unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb); + unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group) +{ + return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0; +} + +/** + * ext3_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) || + metagroup < first_meta_bg) + return ext3_bg_num_gdb_nometa(sb,group); + + return ext3_bg_num_gdb_meta(sb,group); + +} + +/** + * ext3_trim_all_free -- function to trim all free space in alloc. 
group + * @sb: super block for file system + * @group: allocation group to trim + * @start: first group block to examine + * @max: last group block to examine + * @gdp: allocation group description structure + * @minblocks: minimum extent block count + * + * ext3_trim_all_free walks through group's block bitmap searching for free + * blocks. When the free block is found, it tries to allocate this block and + * consequent free block to get the biggest free extent possible, until it + * reaches any used block. Then issue a TRIM command on this extent and free + * the extent in the block bitmap. This is done until whole group is scanned. + */ +ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) +{ + handle_t *handle; + ext3_grpblk_t next, free_blocks, bit, freed, count = 0; + ext3_fsblk_t discard_block; + struct ext3_sb_info *sbi; + struct buffer_head *gdp_bh, *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + int err = 0, ret = 0; + + /* + * We will update one block bitmap, and one group descriptor + */ + handle = ext3_journal_start_sb(sb, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + bitmap_bh = read_block_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto err_out; + + gdp = ext3_get_group_desc(sb, group, &gdp_bh); + if (!gdp) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto err_out; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + sbi = EXT3_SB(sb); + + /* Walk through the whole group */ + while (start < max) { + start = bitmap_search_next_usable_block(start, bitmap_bh, max); + if (start < 0) + break; + next = start; + + /* + * Allocate contiguous free extents by setting bits in the + * block bitmap + */ + while (next < max + && claim_block(sb_bgl_lock(sbi, group), + next, bitmap_bh)) { + next++; + } + + /* We did not claim any blocks */ + if (next == start) + continue; + + discard_block = (ext3_fsblk_t)start + + ext3_group_first_block_no(sb, group); + + /* Update counters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, start - next); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); + + free_blocks -= next - start; + /* Do not issue a TRIM on extents smaller than minblocks */ + if ((next - start) < minblocks) + goto free_extent; + + /* Send the TRIM command down to the device */ + err = sb_issue_discard(sb, discard_block, next - start, + GFP_NOFS, 0); + count += (next - start); +free_extent: + freed = 0; + + /* + * Clear bits in the bitmap + */ + for (bit = start; bit < next; bit++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group), + bit, bitmap_bh->b_data)) { + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + (unsigned long)bit); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + freed++; + } + } + + /* Update couters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, freed); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + + start = next; + if (err < 0) { + if (err != -EOPNOTSUPP) + ext3_warning(sb, __func__, "Discard command " + "returned error %d\n", err); + break; + } + + 
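/* Give up if the caller was interrupted; otherwise yield the CPU + * before scanning for the next free extent. + */ +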
if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + cond_resched(); + + /* No more suitable extents */ + if (free_blocks < minblocks) + break; + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + ret = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = ret; + + /* And the group descriptor block */ + BUFFER_TRACE(gdp_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!err) + err = ret; + + ext3_debug("trimmed %d blocks in the group %d\n", + count, group); + +err_out: + if (err) + count = err; + ext3_journal_stop(handle); + brelse(bitmap_bh); + + return count; +} + +/** + * ext3_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @start: First Byte to trim + * @len: number of Bytes to trim from start + * @minlen: minimum extent length in Bytes + * + * ext3_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext3_trim_all_free function + * is invoked to trim all free space. + */ +int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + ext3_grpblk_t last_block, first_block, free_blocks; + unsigned long first_group, last_group; + unsigned long group, ngroups; + struct ext3_group_desc *gdp; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + uint64_t start, len, minlen, trimmed; + ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); + int ret = 0; + + start = (range->start >> sb->s_blocksize_bits) + + le32_to_cpu(es->s_first_data_block); + len = range->len >> sb->s_blocksize_bits; + minlen = range->minlen >> sb->s_blocksize_bits; + trimmed = 0; + + if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) + return -EINVAL; + if (start >= max_blks) + goto out; + if (start + len > max_blks) + len = max_blks - start; + + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + + /* Determine first and last group to examine based on start and len */ + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, + &first_group, &first_block); + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len), + &last_group, &last_block); + last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; + last_block = EXT3_BLOCKS_PER_GROUP(sb); + + if (first_group > last_group) + return -EINVAL; + + for (group = first_group; group <= last_group; group++) { + gdp = ext3_get_group_desc(sb, group, NULL); + if (!gdp) + break; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + if (free_blocks < minlen) + continue; + + /* + * For all the groups except the last one, last block will + * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to + * change it for the last group in which case first_block + + * len < EXT3_BLOCKS_PER_GROUP(sb). 
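+ * + * For example, assuming 32768 blocks per group and block 0 as the first + * data block: a request to trim blocks 40960 through 73727 (after the + * byte range has been converted to blocks) gives first_group 1 with + * first_block 8192 and last_group 2; group 1 is then trimmed up to the + * end of the group and the remaining 8192 blocks are trimmed from the + * start of group 2.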
+ */ + if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb)) + last_block = first_block + len; + len -= last_block - first_block; + + ret = ext3_trim_all_free(sb, group, first_block, + last_block, minlen); + if (ret < 0) + break; + + trimmed += ret; + first_block = 0; + } + + if (ret >= 0) + ret = 0; + +out: + range->len = trimmed * sb->s_blocksize; + + return ret; +} diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c new file mode 100644 index 00000000..6afc39d8 --- /dev/null +++ b/fs/ext3/bitmap.c @@ -0,0 +1,32 @@ +/* + * linux/fs/ext3/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include + +#ifdef EXT3FS_DEBUG + +static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) +{ + unsigned int i; + unsigned long sum = 0; + + if (!map) + return (0); + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return (sum); +} + +#endif /* EXT3FS_DEBUG */ + diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c new file mode 100644 index 00000000..34f0a072 --- /dev/null +++ b/fs/ext3/dir.c @@ -0,0 +1,519 @@ +/* + * linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include +#include +#include +#include +#include +#include + +static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ext3_readdir(struct file *, void *, filldir_t); +static int ext3_dx_readdir(struct file * filp, + void * dirent, filldir_t filldir); +static int ext3_release_dir (struct inode * inode, + struct file * filp); + +const struct file_operations ext3_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ext3_readdir, /* we take BKL. 
needed?*/ + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .fsync = ext3_sync_file, /* BKL held */ + .release = ext3_release_dir, +}; + + +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT3_FT_MAX)) + return DT_UNKNOWN; + + return (ext3_filetype_table[filetype]); +} + + +int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char * error_msg = NULL; + const int rlen = ext3_rec_len_from_disk(de->rec_len); + + if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) + error_msg = "directory entry across blocks"; + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) + error_msg = "inode out of bounds"; + + if (unlikely(error_msg != NULL)) + ext3_error (dir->i_sb, function, + "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, offset, + (unsigned long) le32_to_cpu(de->inode), + rlen, de->name_len); + + return error_msg == NULL ? 1 : 0; +} + +static int ext3_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + int error = 0; + unsigned long offset; + int i, stored; + struct ext3_dir_entry_2 *de; + struct super_block *sb; + int err; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; + int dir_has_error = 0; + + sb = inode->i_sb; + + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + err = ext3_dx_readdir(filp, dirent, filldir); + if (err != ERR_BAD_DX_DIR) { + ret = err; + goto out; + } + /* + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. + */ + EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; + } + stored = 0; + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < inode->i_size) { + unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); + struct buffer_head map_bh; + struct buffer_head *bh = NULL; + + map_bh.b_state = 0; + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); + if (err > 0) { + pgoff_t index = map_bh.b_blocknr >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (!ra_has_index(&filp->f_ra, index)) + page_cache_sync_readahead( + sb->s_bdev->bd_inode->i_mapping, + &filp->f_ra, filp, + index, 1); + filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + bh = ext3_bread(NULL, inode, blk, 0, &err); + } + + /* + * We ignore I/O errors on directories so users have a chance + * of recovering data when there's a bad sector + */ + if (!bh) { + if (!dir_has_error) { + ext3_error(sb, __func__, "directory #%lu " + "contains a hole at offset %lld", + inode->i_ino, filp->f_pos); + dir_has_error = 1; + } + /* corrupt size? 
Maybe no more blocks to read */ + if (filp->f_pos > inode->i_blocks << 9) + break; + filp->f_pos += sb->s_blocksize - offset; + continue; + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext3_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext3_rec_len_from_disk(de->rec_len) < + EXT3_DIR_REC_LEN(1)) + break; + i += ext3_rec_len_from_disk(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); + if (!ext3_check_dir_entry ("ext3_readdir", inode, de, + bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse (bh); + ret = stored; + goto out; + } + offset += ext3_rec_len_from_disk(de->rec_len); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored ++; + } + filp->f_pos += ext3_rec_len_from_disk(de->rec_len); + } + offset = 0; + brelse (bh); + } +out: + return ret; +} + +/* + * These functions convert from the major/minor hash to an f_pos + * value. + * + * Currently we only use major hash numer. This is unfortunate, but + * on 32-bit machines, the same VFS interface is used for lseek and + * llseek, so if we use the 64 bit offset, then the 32-bit versions of + * lseek/telldir/seekdir will blow out spectacularly, and from within + * the ext2 low-level routine, we don't know if we're being called by + * a 64-bit version of the system call or the 32-bit version of the + * system call. Worse yet, NFSv2 only allows for a 32-bit readdir + * cookie. Sigh. + */ +#define hash2pos(major, minor) (major >> 1) +#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) +#define pos2min_hash(pos) (0) + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[0]; +}; + +/* + * This functoin implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct rb_node *parent; + struct fname *fname; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. 
Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + fname = rb_entry(n, struct fname, rb_hash); + while (fname) { + struct fname * old = fname; + fname = fname->next; + kfree (old); + } + if (!parent) + *root = RB_ROOT; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } +} + + +static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) +{ + struct dir_private_info *p; + + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; + p->curr_hash = pos2maj_hash(pos); + p->curr_minor_hash = pos2min_hash(pos); + return p; +} + +void ext3_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + */ +int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent) +{ + struct rb_node **p, *parent = NULL; + struct fname * fname, *new_fn; + struct dir_private_info *info; + int len; + + info = (struct dir_private_info *) dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + dirent->name_len + 1; + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, dirent->name, dirent->name_len); + new_fn->name[dirent->name_len] = 0; + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... + */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext3_dx_readdir. It calls filldir + * for all entres on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) 
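+ * + * All entries on the chain share the same major/minor hash and thus the + * same f_pos, so if filldir rejects one of them the rest of the chain is + * remembered in info->extra_fname and emitted first on the next call.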
+ */ +static int call_filldir(struct file * filp, void * dirent, + filldir_t filldir, struct fname *fname) +{ + struct dir_private_info *info = filp->private_data; + loff_t curr_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block * sb; + int error; + + sb = inode->i_sb; + + if (!fname) { + printk("call_filldir: called with null fname?!?\n"); + return 0; + } + curr_pos = hash2pos(fname->hash, fname->minor_hash); + while (fname) { + error = filldir(dirent, fname->name, + fname->name_len, curr_pos, + fname->inode, + get_dtype(sb, fname->file_type)); + if (error) { + filp->f_pos = curr_pos; + info->extra_fname = fname; + return error; + } + fname = fname->next; + } + return 0; +} + +static int ext3_dx_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + struct dir_private_info *info = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + struct fname *fname; + int ret; + + if (!info) { + info = ext3_htree_create_dir_info(filp->f_pos); + if (!info) + return -ENOMEM; + filp->private_data = info; + } + + if (filp->f_pos == EXT3_HTREE_EOF) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != filp->f_pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp->f_pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. 
+ */ + if ((!info->curr_node) || + (filp->f_version != inode->i_version)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + filp->f_version = inode->i_version; + ret = ext3_htree_fill_tree(filp, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + return ret; + if (ret == 0) { + filp->f_pos = EXT3_HTREE_EOF; + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(filp, dirent, filldir, fname)) + break; + next_node: + info->curr_node = rb_next(info->curr_node); + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { + if (info->next_hash == ~0) { + filp->f_pos = EXT3_HTREE_EOF; + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = filp->f_pos; + return 0; +} + +static int ext3_release_dir (struct inode * inode, struct file * filp) +{ + if (filp->private_data) + ext3_htree_free_dir_info(filp->private_data); + + return 0; +} diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c new file mode 100644 index 00000000..d401f148 --- /dev/null +++ b/fs/ext3/ext3_jbd.c @@ -0,0 +1,59 @@ +/* + * Interface between ext3 and JBD + */ + +#include + +int __ext3_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_get_undo_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_get_write_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_forget(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh) +{ + int err = journal_revoke(handle, blocknr, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = journal_get_create_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} + +int __ext3_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_metadata(handle, bh); + if (err) + ext3_journal_abort_handle(where, __func__, bh, handle,err); + return err; +} diff --git a/fs/ext3/file.c b/fs/ext3/file.c new file mode 100644 index 00000000..f55df0e6 --- /dev/null +++ b/fs/ext3/file.c @@ -0,0 +1,85 @@ +/* + * linux/fs/ext3/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include +#include +#include +#include 
"xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext3_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext3_release_file (struct inode * inode, struct file * filp) +{ + if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { + filemap_flush(inode->i_mapping); + ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) + { + mutex_lock(&EXT3_I(inode)->truncate_mutex); + ext3_discard_reservation(inode); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + } + if (is_dx(inode) && filp->private_data) + ext3_htree_free_dir_info(filp->private_data); + + return 0; +} + +const struct file_operations ext3_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = dquot_file_open, + .release = ext3_release_file, + .fsync = ext3_sync_file, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +const struct inode_operations ext3_file_inode_operations = { + .truncate = ext3_truncate, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext3_check_acl, + .fiemap = ext3_fiemap, +}; + diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c new file mode 100644 index 00000000..09b13bb3 --- /dev/null +++ b/fs/ext3/fsync.c @@ -0,0 +1,95 @@ +/* + * linux/fs/ext3/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * akpm: A new design for ext3_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. 
+ */ + +int ext3_sync_file(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + int ret, needs_barrier = 0; + tid_t commit_tid; + + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + + J_ASSERT(ext3_journal_current_handle() == NULL); + + /* + * data=writeback,ordered: + * The caller's filemap_fdatawrite()/wait will sync the data. + * Metadata is in the journal, we wait for a proper transaction + * to commit here. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext3_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext3_should_journal_data(inode)) + return ext3_force_commit(inode->i_sb); + + if (datasync) + commit_tid = atomic_read(&ei->i_datasync_tid); + else + commit_tid = atomic_read(&ei->i_sync_tid); + + if (test_opt(inode->i_sb, BARRIER) && + !journal_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = 1; + log_start_commit(journal, commit_tid); + ret = log_wait_commit(journal, commit_tid); + + /* + * In case we didn't commit a transaction, we have to flush + * disk caches manually so that data really is on persistent + * storage + */ + if (needs_barrier) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + return ret; +} diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c new file mode 100644 index 00000000..7d215b4d --- /dev/null +++ b/fs/ext3/hash.c @@ -0,0 +1,208 @@ +/* + * linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. 
+ */ + +#include +#include +#include +#include + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while(--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash_unsigned(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i=0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. 
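+ * + * Note the deliberate fall-throughs in the switch below: the + * DX_HASH_HALF_MD4_UNSIGNED and DX_HASH_TEA_UNSIGNED cases only switch + * the string-to-buffer helper to str2hashbuf_unsigned and then share the + * hashing loop of their signed counterparts.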
+ */ +int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i=0; i < 4; i++) { + if (hinfo->seed[i]) + break; + } + if (i < 4) + memcpy(buf, hinfo->seed, sizeof(buf)); + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_TEA: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT3_HTREE_EOF << 1)) + hash = (EXT3_HTREE_EOF-1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c new file mode 100644 index 00000000..0b3da7cc --- /dev/null +++ b/fs/ext3/ialloc.c @@ -0,0 +1,768 @@ +/* + * linux/fs/ext3/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "xattr.h" +#include "acl.h" + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +read_inode_bitmap(struct super_block * sb, unsigned long block_group) +{ + struct ext3_group_desc *desc; + struct buffer_head *bh = NULL; + + desc = ext3_get_group_desc(sb, block_group, NULL); + if (!desc) + goto error_out; + + bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); + if (!bh) + ext3_error(sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %u", + block_group, le32_to_cpu(desc->bg_inode_bitmap)); +error_out: + return bh; +} + +/* + * NOTE! 
When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext3_free_inode (handle_t *handle, struct inode * inode) +{ + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + unsigned long block_group; + unsigned long bit; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + struct ext3_sb_info *sbi; + int fatal = 0, err; + + if (atomic_read(&inode->i_count) > 1) { + printk ("ext3_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk ("ext3_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk("ext3_free_inode: inode on nonexistent device\n"); + return; + } + sbi = EXT3_SB(sb); + + ino = inode->i_ino; + ext3_debug ("freeing inode %lu\n", ino); + + is_directory = S_ISDIR(inode->i_mode); + + es = EXT3_SB(sb)->s_es; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_free_inode", + "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit, bitmap_bh->b_data)) + ext3_error (sb, "ext3_free_inode", + "bit already cleared for inode %lu", ino); + else { + gdp = ext3_get_group_desc (sb, block_group, &bh2); + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + + if (gdp) { + spin_lock(sb_bgl_lock(sbi, block_group)); + le16_add_cpu(&gdp->bg_free_inodes_count, 1); + if (is_directory) + le16_add_cpu(&gdp->bg_used_dirs_count, -1); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (is_directory) + percpu_counter_dec(&sbi->s_dirs_counter); + + } + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!fatal) + fatal = err; + +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, fatal); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. 
+ * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent) +{ + int ngroups = EXT3_SB(sb)->s_groups_count; + unsigned int freei, avefreei; + struct ext3_group_desc *desc, *best_desc = NULL; + int group, best_group = -1; + + freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); + avefreei = freei / ngroups; + + for (group = 0; group < ngroups; group++) { + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (!best_desc || + (le16_to_cpu(desc->bg_free_blocks_count) > + le16_to_cpu(best_desc->bg_free_blocks_count))) { + best_group = group; + best_desc = desc; + } + } + return best_group; +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are block groups with both free inode and free block counts + * no worse than average, we return the one with the smallest directory count. + * Otherwise we simply return a random group. + * + * The remaining rules are as follows: + * + * It's OK to put a directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * it's already running too large a debt (max_debt). + * The parent's group is preferred; if it doesn't satisfy these + * conditions, we search cyclically through the rest. If none + * of the groups looks good, we just look for a group with more + * free inodes than average (starting at the parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255.
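+ * + * As a rough illustration, assuming 32768 blocks and 16384 inodes per + * group with directories averaging about 100 allocated blocks each: + * max_debt = 32768 / max(100, BLOCK_COST) = 128, which already satisfies + * the 128 * INODE_COST <= 16384 test and the [1, 255] clamp. Note that + * in this version max_debt is computed but the selection loop only + * checks max_dirs, min_inodes and min_blocks.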
+ */ + +#define INODE_COST 64 +#define BLOCK_COST 256 + +static int find_group_orlov(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3_I(parent)->i_block_group; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int ngroups = sbi->s_groups_count; + int inodes_per_group = EXT3_INODES_PER_GROUP(sb); + unsigned int freei, avefreei; + ext3_fsblk_t freeb, avefreeb; + ext3_fsblk_t blocks_per_dir; + unsigned int ndirs; + int max_debt, max_dirs, min_inodes; + ext3_grpblk_t min_blocks; + int group = -1, i; + struct ext3_group_desc *desc; + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if ((parent == sb->s_root->d_inode) || + (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) { + int best_ndir = inodes_per_group; + int best_group = -1; + + get_random_bytes(&group, sizeof(group)); + parent_group = (unsigned)group % ngroups; + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + continue; + best_group = group; + best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + } + if (best_group >= 0) + return best_group; + goto fallback; + } + + blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; + + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + continue; + return group; + } + +fallback: + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + return group; + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3_I(parent)->i_block_group; + int ngroups = EXT3_SB(sb)->s_groups_count; + struct ext3_group_desc *desc; + int group, i; + + /* + * Try to place the inode in its parent directory + */ + group = parent_group; + desc = ext3_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + + /* + * We're going to place this inode in a different 
blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext3_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext3_get_group_desc (sb, group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + return group; + } + + return -1; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, + const struct qstr *qstr, int mode) +{ + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + int group; + unsigned long ino = 0; + struct inode * inode; + struct ext3_group_desc * gdp = NULL; + struct ext3_super_block * es; + struct ext3_inode_info *ei; + struct ext3_sb_info *sbi; + int err = 0; + struct inode *ret; + int i; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT3_I(inode); + + sbi = EXT3_SB(sb); + es = sbi->s_es; + if (S_ISDIR(mode)) { + if (test_opt (sb, OLDALLOC)) + group = find_group_dir(sb, dir); + else + group = find_group_orlov(sb, dir); + } else + group = find_group_other(sb, dir); + + err = -ENOSPC; + if (group == -1) + goto out; + + for (i = 0; i < sbi->s_groups_count; i++) { + err = -EIO; + + gdp = ext3_get_group_desc(sb, group, &bh2); + if (!gdp) + goto fail; + + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) + goto fail; + + ino = 0; + +repeat_in_this_group: + ino = ext3_find_next_zero_bit((unsigned long *) + bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino); + if (ino < EXT3_INODES_PER_GROUP(sb)) { + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) + goto fail; + + if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) { + /* we won it */ + BUFFER_TRACE(bitmap_bh, + "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, + bitmap_bh); + if (err) + goto fail; + goto got; + } + /* we lost it */ + journal_release_buffer(handle, bitmap_bh); + + if (++ino < EXT3_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + } + + /* + * This case is possible in concurrent environment. It is very + * rare. 
We cannot repeat the find_group_xxx() call because + * that will simply return the same blockgroup, because the + * group descriptor metadata has not yet been updated. + * So we just go onto the next blockgroup. + */ + if (++group == sbi->s_groups_count) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + ino += group * EXT3_INODES_PER_GROUP(sb) + 1; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d, inode=%lu", group, ino); + err = -EIO; + goto fail; + } + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh2); + if (err) goto fail; + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_inodes_count, -1); + if (S_ISDIR(mode)) { + le16_add_cpu(&gdp->bg_used_dirs_count, 1); + } + spin_unlock(sb_bgl_lock(sbi, group)); + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + + + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino; + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + ei->i_flags = + ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); +#ifdef EXT3_FRAGMENTS + ei->i_faddr = 0; + ei->i_frag_no = 0; + ei->i_frag_size = 0; +#endif + ei->i_file_acl = 0; + ei->i_dir_acl = 0; + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; + + ext3_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + handle->h_sync = 1; + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. 
+ */ + err = -EIO; + goto fail; + } + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ei->i_state_flags = 0; + ext3_set_inode_state(inode, EXT3_STATE_NEW); + + /* See comment in ext3_iget for explanation */ + if (ino >= EXT3_FIRST_INO(sb) + 1 && + EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = + sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE; + } else { + ei->i_extra_isize = 0; + } + + ret = inode; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext3_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext3_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); + goto fail_free_drop; + } + + ext3_debug("allocating inode %lu\n", inode->i_ino); + goto really_out; +fail: + ext3_std_error(sb, err); +out: + iput(inode); + ret = ERR_PTR(err); +really_out: + brelse(bitmap_bh); + return ret; + +fail_free_drop: + dquot_free_inode(inode); + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + unlock_new_inode(inode); + iput(inode); + brelse(bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); + unsigned long block_group; + int bit; + struct buffer_head *bitmap_bh; + struct inode *inode = NULL; + long err = -EIO; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext3_warning(sb, __func__, + "bad orphan ino %lu! e2fsck was run?", ino); + goto error; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext3_warning(sb, __func__, + "inode bitmap error for orphan %lu", ino); + goto error; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext3_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext3_iget(sb, ino); + if (IS_ERR(inode)) + goto iget_failed; + + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext3_can_truncate(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +iget_failed: + err = PTR_ERR(inode); + inode = NULL; +bad_orphan: + ext3_warning(sb, __func__, + "bad orphan inode %lu! 
e2fsck was run?", ino); + printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext3_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); +error: + return ERR_PTR(err); +} + +unsigned long ext3_count_free_inodes (struct super_block * sb) +{ + unsigned long desc_count; + struct ext3_group_desc *gdp; + int i; +#ifdef EXT3FS_DEBUG + struct ext3_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext3_count_dirs (struct super_block * sb) +{ + unsigned long count = 0; + int i; + + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + count += le16_to_cpu(gdp->bg_used_dirs_count); + } + return count; +} + diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c new file mode 100644 index 00000000..db9ba1a3 --- /dev/null +++ b/fs/ext3/inode.c @@ -0,0 +1,3516 @@ +/* + * linux/fs/ext3/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +static int ext3_writepage_trans_blocks(struct inode *inode); + +/* + * Test whether an inode is a fast symlink. + */ +static int ext3_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT3_I(inode)->i_file_acl ? 
+ (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ +int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %lx\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext3_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call journal_forget"); + return ext3_journal_forget(handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext3_journal_revoke"); + err = ext3_journal_revoke(handle, blocknr, bh); + if (err) + ext3_abort(inode->i_sb, __func__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static unsigned long blocks_for_truncate(struct inode *inode) +{ + unsigned long needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext3 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + + return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext3_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext3_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. 
+ */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) + return 0; + if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +static int truncate_restart_transaction(handle_t *handle, struct inode *inode) +{ + int ret; + + jbd_debug(2, "restarting handle %p\n", handle); + /* + * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle + * At this moment, get_block can be called only for blocks inside + * i_size since page cache has been already dropped and writes are + * blocked by i_mutex. So we can safely drop the truncate_mutex. + */ + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); + mutex_lock(&EXT3_I(inode)->truncate_mutex); + return ret; +} + +/* + * Called at inode eviction from icache + */ +void ext3_evict_inode (struct inode *inode) +{ + struct ext3_block_alloc_info *rsv; + handle_t *handle; + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) { + dquot_initialize(inode); + want_delete = 1; + } + + truncate_inode_pages(&inode->i_data, 0); + + ext3_discard_reservation(inode); + rsv = EXT3_I(inode)->i_block_alloc_info; + EXT3_I(inode)->i_block_alloc_info = NULL; + if (unlikely(rsv)) + kfree(rsv); + + if (!want_delete) + goto no_delete; + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext3_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + if (inode->i_blocks) + ext3_truncate(inode); + /* + * Kill off the orphan record which ext3_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext3_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext3_orphan_del(handle, inode); + EXT3_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. 
+ */ + if (ext3_mark_inode_dirty(handle, inode)) { + /* If that failed, just dquot_drop() and be done with that */ + dquot_drop(inode); + end_writeback(inode); + } else { + ext3_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); + end_writeback(inode); + ext3_free_inode(handle, inode); + } + ext3_journal_stop(handle); + return; +no_delete: + end_writeback(inode); + dquot_drop(inode); +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext3_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext3 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
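+ *
+ * A worked example, assuming a 1KB block size (12 direct slots in the
+ * inode and 256 pointers per indirect block): logical block 5 is direct,
+ * so the path is simply {5}; block 200 maps to {EXT3_IND_BLOCK, 188}
+ * because 200 - 12 = 188 fits within one indirect block; block 70000
+ * needs the triple indirect tree and maps to {EXT3_TIND_BLOCK, 0, 16, 100}
+ * because 70000 - 12 - 256 - 65536 = 4196 = 16*256 + 100.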
+ */ + +static int ext3_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT3_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT3_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT3_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT3_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big"); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext3_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). 
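+ *
+ * As an illustration of the layout described above: a block reached
+ * through a double indirect block comes back as a chain of depth 3.
+ * chain[0].p points into the inode's i_data[] (so chain[0].bh is NULL),
+ * chain[1] and chain[2] describe the two indirect blocks read from disk
+ * (their bh fields hold the corresponding buffer_heads), and chain[2].key
+ * holds the little-endian number of the data block itself.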
+ */ +static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext3_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *p; + ext3_fsblk_t bg_start; + ext3_grpblk_t colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext3_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + */ + +static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, + Indirect *partial) +{ + struct ext3_block_alloc_info *block_i; + + block_i = EXT3_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + return block_i->last_alloc_physical_block + 1; + } + + return ext3_find_near(inode, partial); +} + +/** + * ext3_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. 
+ * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext3_alloc_blocks - multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: owner + * @goal: preferred place for allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of blocks need to allocated for direct blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: here we store the error value + * + * return the number of direct blocks allocated + */ +static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int indirect_blks, int blks, + ext3_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext3_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext3_new_blocks(handle,inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; + } + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i key). Upon the exit we have the same + * picture as after the successful ext3_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. 
+ */ +static int ext3_alloc_branch(handle_t *handle, struct inode *inode, + int indirect_blks, int *blks, ext3_fsblk_t goal, + int *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext3_fsblk_t new_blocks[4]; + ext3_fsblk_t current_block; + + num = ext3_alloc_blocks(handle, inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + for (i = 1; i <= n ; i++) { + BUFFER_TRACE(branch[i].bh, "call journal_forget"); + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext3_splice_branch(handle_t *handle, struct inode *inode, + long block, Indirect *where, int num, int blks) +{ + int i; + int err = 0; + struct ext3_block_alloc_info *block_i; + ext3_fsblk_t current_block; + struct ext3_inode_info *ei = EXT3_I(inode); + + block_i = ei->i_block_alloc_info; + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. 
+ */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + /* ext3_mark_inode_dirty already updated i_sync_tid */ + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + BUFFER_TRACE(where[i].bh, "call journal_forget"); + ext3_journal_forget(handle, where[i].bh); + ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); + } + ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); + + return err; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * The BKL may not be held on entry here. Be sure to take it early. + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. 
+ */ +int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext3_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + struct ext3_inode_info *ei = EXT3_I(inode); + int count = 0; + ext3_fsblk_t first_block = 0; + + + J_ASSERT(handle != NULL || create == 0); + depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + clear_buffer_new(bh_result); + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext3_fsblk_t blk; + + if (!verify_chain(chain, chain + depth - 1)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now. Flag the err as EAGAIN, so it + * will reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) + goto cleanup; + + mutex_lock(&ei->truncate_mutex); + + /* + * If the indirect block is missing while we are reading + * the chain(ext3_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } + + /* + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) + ext3_init_block_alloc_info(inode); + + goal = ext3_find_goal(inode, iblock, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext3_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + /* + * Block out ext3_truncate while we alter the tree + */ + err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext3_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. 
--sct + */ + if (!err) + err = ext3_splice_branch(handle, inode, iblock, + partial, indirect_blks, count); + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + + set_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); +out: + return err; +} + +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 +/* + * Number of credits we need for writing DIO_MAX_BLOCKS: + * We need sb + group descriptor + bitmap + inode -> 4 + * For B blocks with A block pointers per block we need: + * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). + * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. + */ +#define DIO_CREDITS 25 + +static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = ext3_journal_current_handle(); + int ret = 0, started = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + if (create && !handle) { /* Direct IO write... */ + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + handle = ext3_journal_start(inode, DIO_CREDITS + + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + started = 1; + } + + ret = ext3_get_blocks_handle(handle, inode, iblock, + max_blocks, bh_result, create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + if (started) + ext3_journal_stop(handle); +out: + return ret; +} + +int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext3_get_block); +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, + long block, int create, int *errp) +{ + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + err = ext3_get_blocks_handle(handle, inode, block, 1, + &dummy, create); + /* + * ext3_get_blocks_handle() returns number of blocks + * mapped. 0 in case of a HOLE. + */ + if (err > 0) { + if (err > 1) + WARN_ON(1); + err = 0; + } + *errp = err; + if (!err && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (!bh) { + *errp = -EIO; + goto err; + } + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext3_get_block instead, so it's not a + * problem. 
+ */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext3_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data,0,inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } +err: + return NULL; +} + +struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, + int block, int create, int *err) +{ + struct buffer_head * bh; + + bh = ext3_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext3_get_block() + * and the commit_write(). So doing the journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext3_writepage() -> + * block_write_full_page(). In that case, we *know* that ext3_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext3 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + int dirty = buffer_dirty(bh); + int ret; + + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. 
+ */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext3_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext3_journal_dirty_metadata(handle, bh); + return ret; +} + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext3_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext3_truncate(inode); +} + +static int ext3_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret; + handle_t *handle; + int retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + /* Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason */ + int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + handle = ext3_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + unlock_page(page); + page_cache_release(page); + ret = PTR_ERR(handle); + goto out; + } + ret = __block_write_begin(page, pos, len, ext3_get_block); + if (ret) + goto write_begin_failed; + + if (ext3_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } +write_begin_failed: + if (ret) { + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before truncate + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + } + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + + +int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_data(handle, bh); + if (err) + ext3_journal_abort_handle(__func__, __func__, + bh, handle, err); + return err; +} + +/* For ordered writepage and write_end functions */ +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + /* + * Write could have mapped the buffer but it didn't copy the data in + * yet. So avoid filing such buffer into a transaction. + */ + if (buffer_mapped(bh) && buffer_uptodate(bh)) + return ext3_journal_dirty_data(handle, bh); + return 0; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext3_journal_dirty_metadata(handle, bh); +} + +/* + * This is nasty and subtle: ext3_write_begin() could have allocated blocks + * for the whole page but later we failed to copy the data in. Update inode + * size according to what we managed to copy. The rest is going to be + * truncated in write_end function. 
+ */ +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) +{ + /* What matters to us is i_disksize. We don't write i_size anywhere */ + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + if (pos + copied > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = pos + copied; + mark_inode_dirty(inode); + } +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext3 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext3_ordered_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = file->f_mapping->host; + unsigned from, to; + int ret = 0, ret2; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + copied; + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, journal_dirty_data_fn); + + if (ret == 0) + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ret2 = ext3_journal_stop(handle); + if (!ret) + ret = ret2; + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; +} + +static int ext3_writeback_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = file->f_mapping->host; + int ret; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + update_file_sizes(inode, pos, copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. + */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ret = ext3_journal_stop(handle); + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; +} + +static int ext3_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int partial = 0; + unsigned from, to; + + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from + copied, to); + to = from + copied; + } + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + /* + * There may be allocated blocks outside of i_size because + * we failed to copy some data. Prepare for truncate. 
+ */ + if (pos + len > inode->i_size && ext3_can_truncate(inode)) + ext3_orphan_add(handle, inode); + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + if (inode->i_size > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = inode->i_size; + ret2 = ext3_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + + ret2 = ext3_journal_stop(handle); + if (!ret) + ret = ret2; + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + ext3_truncate_failed_write(inode); + return ret ? ret : copied; +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext3 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext3_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT3_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. + */ + + ext3_clear_inode_state(inode, EXT3_STATE_JDATA); + journal = EXT3_JOURNAL(inode); + journal_lock_updates(journal); + err = journal_flush(journal); + journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping,block,ext3_get_block); +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +/* + * Note that we always start a transaction even if we're not journalling + * data. This is to preserve ordering: any hole instantiation within + * __block_write_full_page -> ext3_get_block() should be journalled + * along with the data so we don't crash and then get metadata which + * refers to old data. + * + * In all journalling modes block_write_full_page() will start the I/O. + * + * Problem: + * + * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext3_writepage() + * + * Similar for: + * + * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... + * + * Same applies to ext3_get_block(). We will deadlock on various things like + * lock_journal and i_truncate_mutex. + * + * Setting PF_MEMALLOC here doesn't work - too many internal memory + * allocations fail. 
+ * + * 16May01: If we're reentered then journal_current_handle() will be + * non-zero. We simply *return*. + * + * 1 July 2001: @@@ FIXME: + * In journalled data mode, a data buffer may be metadata against the + * current transaction. But the same file is part of a shared mapping + * and someone does a writepage() on it. + * + * We will move the buffer onto the async_data list, but *after* it has + * been dirtied. So there's a small window where we have dirty data on + * BJ_Metadata. + * + * Note that this only applies to the last partial page in the file. The + * bit which block_write_full_page() uses prepare/commit for. (That's + * broken code anyway: it's wrong for msync()). + * + * It's a rare case: affects the final partial page, for journalled data + * where the file is subject to bith write() and writepage() in the same + * transction. To fix it we'll need a custom block_write_full_page(). + * We'll probably need that anyway for journalling writepage() output. + * + * We don't honour synchronous mounts for writepage(). That would be + * disastrous. Any write() or metadata operation will sync the fs for + * us. + * + * AKPM2: if all the page's buffers are mapped to disk and !data=journal, + * we don't need to open a transaction here. + */ +static int ext3_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + + /* + * We give up here if we're reentered, because it might be for a + * different filesystem. + */ + if (ext3_journal_current_handle()) + goto out_fail; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bget_one); + + ret = block_write_full_page(page, ext3_get_block, wbc); + + /* + * The page can become unlocked at any point now, and + * truncate can then come in and change things. So we + * can't touch *page from now on. But *page_bufs is + * safe due to elevated refcount. + */ + + /* + * And attach them to the current transaction. But only if + * block_write_full_page() succeeded. Otherwise they are unmapped, + * and generally junk. 
+ */ + if (ret == 0) { + err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, journal_dirty_data_fn); + if (!ret) + ret = err; + } + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext3_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + + if (ext3_journal_current_handle()) + goto out_fail; + + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + ret = block_write_full_page(page, ext3_get_block, wbc); + + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext3_journalled_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); + + if (ext3_journal_current_handle()) + goto no_write; + + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } + + if (!page_has_buffers(page) || PageChecked(page)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, + ext3_get_block); + if (ret != 0) { + ext3_journal_stop(handle); + goto out_unlock; + } + ret = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + + err = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); + unlock_page(page); + } else { + /* + * It may be a page full of checkpoint-mode buffers. We don't + * really know unless we go poke around in the buffer_heads. + * But block_write_full_page will do the right thing. 
+ */ + ret = block_write_full_page(page, ext3_get_block, wbc); + } + err = ext3_journal_stop(handle); + if (!ret) + ret = err; +out: + return ret; + +no_write: + redirty_page_for_writepage(wbc, page); +out_unlock: + unlock_page(page); + goto out; +} + +static int ext3_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, ext3_get_block); +} + +static int +ext3_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); +} + +static void ext3_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0) + ClearPageChecked(page); + + journal_invalidatepage(journal, page, offset); +} + +static int ext3_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + return journal_try_to_free_buffers(journal, page, wait); +} + +/* + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext3_orphan_add(handle, inode); + if (ret) { + ext3_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext3_journal_stop(handle); + } + } + +retry: + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext3_get_block, NULL); + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Truncate allocated blocks + * and pretend the write failed... 
*/ + ext3_truncate(inode); + ret = PTR_ERR(handle); + goto out; + } + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext3_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext3_mark_inode_dirty(handle, inode); + } + } + err = ext3_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Pages can be marked dirty completely asynchronously from ext3's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do + * much here because ->set_page_dirty is called under VFS locks. The page is + * not necessarily locked. + * + * We cannot just dirty the page and leave attached buffers clean, because the + * buffers' dirty state is "definitive". We cannot just set the buffers dirty + * or jbddirty because all the journalling code will explode. + * + * So what we do is to mark the page "pending dirty" and next time writepage + * is called, propagate that into the buffers appropriately. + */ +static int ext3_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations ext3_ordered_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_ordered_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_ordered_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext3_writeback_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_writeback_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_writeback_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext3_journalled_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_journalled_writepage, + .write_begin = ext3_write_begin, + .write_end = ext3_journalled_write_end, + .set_page_dirty = ext3_journalled_set_page_dirty, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void ext3_set_aops(struct inode *inode) +{ + if (ext3_should_order_data(inode)) + inode->i_mapping->a_ops = &ext3_ordered_aops; + else if (ext3_should_writeback_data(inode)) + inode->i_mapping->a_ops = &ext3_writeback_aops; + else + inode->i_mapping->a_ops = &ext3_journalled_aops; +} + +/* + * ext3_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. 
+ */ +static int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) +{ + ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct buffer_head *bh; + int err = 0; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext3_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (ext3_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + + zero_user(page, offset, length); + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext3_should_journal_data(inode)) { + err = ext3_journal_dirty_metadata(handle, bh); + } else { + if (ext3_should_order_data(inode)) + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext3_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext3_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext3_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext3_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. 
Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext3_find_shared(struct inode *inode, int depth, + int offsets[4], Indirect chain[4], __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext3_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext3. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while(partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + */ +static void ext3_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t block_to_free, + unsigned long count, __le32 *first, __le32 *last) +{ + __le32 *p; + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + if (ext3_journal_dirty_metadata(handle, bh)) + return; + } + ext3_mark_inode_dirty(handle, inode); + truncate_restart_transaction(handle, inode); + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + if (ext3_journal_get_write_access(handle, bh)) + return; + } + } + + /* + * Any buffers which are on the journal will be in memory. We find + * them on the hash table so journal_revoke() will run journal_forget() + * on them. We've already detached each block from the file, so + * bforget() in journal_forget() should be safe. + * + * AKPM: turn on bforget in journal_forget()!!! 
+ */ + for (p = first; p < last; p++) { + u32 nr = le32_to_cpu(*p); + if (nr) { + struct buffer_head *bh; + + *p = 0; + bh = sb_find_get_block(inode->i_sb, nr); + ext3_forget(handle, 0, inode, bh, nr); + } + } + + ext3_free_blocks(handle, inode, block_to_free, count); +} + +/** + * ext3_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext3_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext3_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + ext3_clear_blocks(handle, inode, this_bh, + block_to_free, + count, block_to_free_p, p); + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (count > 0) + ext3_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext3_journal_dirty_metadata(handle, this_bh); + else + ext3_error(inode->i_sb, "ext3_free_data", + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)this_bh->b_blocknr); + } +} + +/** + * ext3_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. 
+ */ +static void ext3_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext3_fsblk_t nr; + __le32 *p; + + if (is_handle_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext3_error(inode->i_sb, "ext3_free_branches", + "Read failure, inode=%lu, block="E3FSBLK, + inode->i_ino, nr); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext3_free_branches(handle, inode, bh, + (__le32*)bh->b_data, + (__le32*)bh->b_data + addr_per_block, + depth); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (is_handle_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext3_mark_inode_dirty(handle, inode); + truncate_restart_transaction(handle, inode); + } + + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. Thus + * we don't allow a block to be reallocated until + * a transaction freeing it has fully committed. + * + * We also have to make sure journal replay after a + * crash does not overwrite non-journaled data blocks + * with old metadata when the block got reallocated for + * data. Thus we have to store a revoke record for a + * block in the same transaction in which we free the + * block. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + + ext3_free_blocks(handle, inode, nr, 1); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext3_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. 
*/ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext3_free_data(handle, inode, parent_bh, first, last); + } +} + +int ext3_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext3_inode_is_fast_symlink(inode); + return 0; +} + +/* + * ext3_truncate() + * + * We block out ext3_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext3_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commmit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext3_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext3 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext3_truncate() run will find them and release them. + */ +void ext3_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext3_inode_info *ei = EXT3_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n; + long last_block; + unsigned blocksize = inode->i_sb->s_blocksize; + struct page *page; + + if (!ext3_can_truncate(inode)) + goto out_notrans; + + if (inode->i_size == 0 && ext3_should_writeback_data(inode)) + ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); + + /* + * We have to lock the EOF page here, because lock_page() nests + * outside journal_start(). + */ + if ((inode->i_size & (blocksize - 1)) == 0) { + /* Block boundary? Nothing to do */ + page = NULL; + } else { + page = grab_cache_page(mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + goto out_notrans; + } + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { + clear_highpage(page); + flush_dcache_page(page); + unlock_page(page); + page_cache_release(page); + } + goto out_notrans; + } + + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); + + if (page) + ext3_block_truncate_page(handle, page, mapping, inode->i_size); + + n = ext3_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. 
+ * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext3 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. + */ + mutex_lock(&ei->truncate_mutex); + + if (n == 1) { /* direct blocks */ + ext3_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT3_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext3_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + ext3_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext3_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT3_IND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT3_IND_BLOCK] = 0; + } + case EXT3_IND_BLOCK: + nr = i_data[EXT3_DIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT3_DIND_BLOCK] = 0; + } + case EXT3_DIND_BLOCK: + nr = i_data[EXT3_TIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT3_TIND_BLOCK] = 0; + } + case EXT3_TIND_BLOCK: + ; + } + + ext3_discard_reservation(inode); + + mutex_unlock(&ei->truncate_mutex); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + handle->h_sync = 1; +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_evict_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + ext3_journal_stop(handle); + return; +out_notrans: + /* + * Delete the inode from orphan list so that it doesn't stay there + * forever and trigger assertion on umount. 
+ */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); +} + +static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, + unsigned long ino, struct ext3_iloc *iloc) +{ + unsigned long block_group; + unsigned long offset; + ext3_fsblk_t block; + struct ext3_group_desc *gdp; + + if (!ext3_valid_inum(sb, ino)) { + /* + * This error is already checked for in namei.c unless we are + * looking at an NFS filehandle, in which case no error + * report is needed + */ + return 0; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + gdp = ext3_get_group_desc(sb, block_group, NULL); + if (!gdp) + return 0; + /* + * Figure out the offset within the block group inode table + */ + offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * + EXT3_INODE_SIZE(sb); + block = le32_to_cpu(gdp->bg_inode_table) + + (offset >> EXT3_BLOCK_SIZE_BITS(sb)); + + iloc->block_group = block_group; + iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); + return block; +} + +/* + * ext3_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. + */ +static int __ext3_get_inode_loc(struct inode *inode, + struct ext3_iloc *iloc, int in_mem) +{ + ext3_fsblk_t block; + struct buffer_head *bh; + + block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); + if (!block) + return -EIO; + + bh = sb_getblk(inode->i_sb, block); + if (!bh) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); + return -EIO; + } + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + + if (buffer_uptodate(bh)) { + /* someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + struct ext3_group_desc *desc; + int inodes_per_buffer; + int inode_offset, i; + int block_group; + int start; + + block_group = (inode->i_ino - 1) / + EXT3_INODES_PER_GROUP(inode->i_sb); + inodes_per_buffer = bh->b_size / + EXT3_INODE_SIZE(inode->i_sb); + inode_offset = ((inode->i_ino - 1) % + EXT3_INODES_PER_GROUP(inode->i_sb)); + start = inode_offset & ~(inodes_per_buffer - 1); + + /* Is the inode bitmap in cache? */ + desc = ext3_get_group_desc(inode->i_sb, + block_group, NULL); + if (!desc) + goto make_io; + + bitmap_bh = sb_getblk(inode->i_sb, + le32_to_cpu(desc->bg_inode_bitmap)); + if (!bitmap_bh) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. 
+ */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_buffer; i++) { + if (i == inode_offset) + continue; + if (ext3_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_buffer) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ_META, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext3_error(inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); + brelse(bh); + return -EIO; + } + } +has_buffer: + iloc->bh = bh; + return 0; +} + +int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) +{ + /* We have all inode data except xattrs in memory here. */ + return __ext3_get_inode_loc(inode, iloc, + !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); +} + +void ext3_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT3_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT3_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT3_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT3_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ +void ext3_get_inode_flags(struct ext3_inode_info *ei) +{ + unsigned int flags = ei->vfs_inode.i_flags; + + ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| + EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT3_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT3_APPEND_FL; + if (flags & S_IMMUTABLE) + ei->i_flags |= EXT3_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT3_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT3_DIRSYNC_FL; +} + +struct inode *ext3_iget(struct super_block *sb, unsigned long ino) +{ + struct ext3_iloc iloc; + struct ext3_inode *raw_inode; + struct ext3_inode_info *ei; + struct buffer_head *bh; + struct inode *inode; + journal_t *journal = EXT3_SB(sb)->s_journal; + transaction_t *transaction; + long ret; + int block; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT3_I(inode); + ei->i_block_alloc_info = NULL; + + ret = __ext3_get_inode_loc(inode, &iloc, 0); + if (ret < 0) + goto bad_inode; + bh = iloc.bh; + raw_inode = ext3_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); + inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); + 
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; + + ei->i_state_flags = 0; + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + ei->i_flags = le32_to_cpu(raw_inode->i_flags); +#ifdef EXT3_FRAGMENTS + ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + ei->i_frag_no = raw_inode->i_frag; + ei->i_frag_size = raw_inode->i_fsize; +#endif + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } + ei->i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT3_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + atomic_set(&ei->i_sync_tid, tid); + atomic_set(&ei->i_datasync_tid, tid); + } + + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && + EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { + /* + * When mke2fs creates big inodes it does not zero out + * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, + * so ignore those first few inodes. + */ + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT3_INODE_SIZE(inode->i_sb)) { + brelse (bh); + ret = -EIO; + goto bad_inode; + } + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. 
*/ + ei->i_extra_isize = sizeof(struct ext3_inode) - + EXT3_GOOD_OLD_INODE_SIZE; + } else { + __le32 *magic = (void *)raw_inode + + EXT3_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) + ext3_set_inode_state(inode, EXT3_STATE_XATTR); + } + } else + ei->i_extra_isize = 0; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (ext3_inode_is_fast_symlink(inode)) { + inode->i_op = &ext3_fast_symlink_inode_operations; + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { + inode->i_op = &ext3_symlink_inode_operations; + ext3_set_aops(inode); + } + } else { + inode->i_op = &ext3_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } + brelse (iloc.bh); + ext3_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext3_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) +{ + struct ext3_inode *raw_inode = ext3_raw_inode(iloc); + struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + +again: + /* we can't allow multiple procs in here at once, its a bit racey */ + lock_buffer(bh); + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) + memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); + + ext3_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if(!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +/* + * Fix up interoperability with old kernels. 
Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(ei->i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); +#ifdef EXT3_FRAGMENTS + raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + raw_inode->i_frag = ei->i_frag_no; + raw_inode->i_fsize = ei->i_frag_size; +#endif + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + } else { + raw_inode->i_size_high = + cpu_to_le32(ei->i_disksize >> 32); + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT3_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT3_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + unlock_buffer(bh); + err = ext3_journal_get_write_access(handle, + EXT3_SB(sb)->s_sbh); + if (err) + goto out_brelse; + + ext3_update_dynamic_rev(sb); + EXT3_SET_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE); + handle->h_sync = 1; + err = ext3_journal_dirty_metadata(handle, + EXT3_SB(sb)->s_sbh); + /* get our lock and start over */ + goto again; + } + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + + if (ei->i_extra_isize) + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + unlock_buffer(bh); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); + + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); +out_brelse: + brelse (bh); + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * trasnaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if tol to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. 
+ * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ +int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + if (current->flags & PF_MEMALLOC) + return 0; + + if (ext3_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + if (wbc->sync_mode != WB_SYNC_ALL) + return 0; + + return ext3_force_commit(inode->i_sb); +} + +/* + * ext3_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Called with inode->sem down. + */ +int ext3_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error, rc = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? 
- but truncate inode update has it) */ + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = dquot_transfer(inode, attr); + if (error) { + ext3_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle); + } + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + handle_t *handle; + + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + + error = ext3_orphan_add(handle, inode); + EXT3_I(inode)->i_disksize = attr->ia_size; + rc = ext3_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext3_journal_stop(handle); + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + rc = vmtruncate(inode, attr->ia_size); + if (rc) + goto err_out; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + if (ia_valid & ATTR_MODE) + rc = ext3_acl_chmod(inode); + +err_out: + ext3_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + + +/* + * How many blocks doth make a writepage()? + * + * With N blocks per page, it may be: + * N data blocks + * 2 indirect block + * 2 dindirect + * 1 tindirect + * N+5 bitmap blocks (from the above) + * N+5 group descriptor summary blocks + * 1 inode block + * 1 superblock. + * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files + * + * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS + * + * With ordered or writeback data it's the same, less the N data blocks. + * + * If the inode's direct blocks can hold an integral number of pages then a + * page cannot straddle two indirect blocks, and we can only touch one indirect + * and dindirect block, and the "5" above becomes "3". + * + * This still overestimates under most circumstances. If we were to pass the + * start and end offsets in here as well we could do block_to_path() on each + * block and work out the exact number of indirects which are touched. Pah. + */ + +static int ext3_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else + ret = 2 * (bpp + indirects) + indirects + 2; + +#ifdef CONFIG_QUOTA + /* We know that structure was already allocated during dquot_initialize so + * we will be updating only the data blocks + inodes */ + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); +#endif + + return ret; +} + +/* + * The caller must have previously called ext3_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext3_iloc *iloc) +{ + int err = 0; + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext3_do_update_inode() does journal_dirty_metadata */ + err = ext3_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. 
+ */ + +int +ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc) +{ + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + } + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ +int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + int err; + + might_sleep(); + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (!err) + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * ext3_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, dquot_alloc_space() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ +void ext3_dirty_inode(struct inode *inode, int flags) +{ + handle_t *current_handle = ext3_journal_current_handle(); + handle_t *handle; + + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + if (current_handle && + current_handle->h_transaction != handle->h_transaction) { + /* This task has a transaction open against a different fs */ + printk(KERN_EMERG "%s: transactions do not match!\n", + __func__); + } else { + jbd_debug(5, "marking dirty. outer handle=%p\n", + current_handle); + ext3_mark_inode_dirty(handle, inode); + } + ext3_journal_stop(handle); +out: + return; +} + +#if 0 +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext3_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. 
+ */ +static int ext3_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext3_journal_dirty_metadata(handle, + iloc.bh); + brelse(iloc.bh); + } + } + ext3_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext3_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT3_JOURNAL(inode); + if (is_journal_aborted(journal)) + return -EROFS; + + journal_lock_updates(journal); + journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; + else + EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; + ext3_set_aops(inode); + + journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext3_mark_inode_dirty(handle, inode); + handle->h_sync = 1; + ext3_journal_stop(handle); + ext3_std_error(inode->i_sb, err); + + return err; +} diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c new file mode 100644 index 00000000..f4090bd2 --- /dev/null +++ b/fs/ext3/ioctl.c @@ -0,0 +1,352 @@ +/* + * linux/fs/ext3/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned int flags; + unsigned short rsv_window_size; + + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT3_IOC_GETFLAGS: + ext3_get_inode_flags(ei); + flags = ei->i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT3_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err; + struct ext3_iloc iloc; + unsigned int oldflags; + unsigned int jflag; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + flags = ext3_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + + /* Is it quota file? 
Do not allow user to mess with it */ + err = -EPERM; + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + flags = flags & EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; + ei->i_flags = flags; + + ext3_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME_SEC; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext3_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) + err = ext3_change_inode_journal_flag(inode, jflag); +flags_out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT3_IOC_SETVERSION: + case EXT3_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext3_iloc iloc; + __u32 generation; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto setversion_out; + } + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + } + ext3_journal_stop(handle); +setversion_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } +#ifdef CONFIG_JBD_DEBUG + case EXT3_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. 
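+ *
+ * The block below parks the caller on ro_wait_queue and only sleeps
+ * while turn_ro_timer is still pending.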
+ */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); + if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); + return ret; + } +#endif + case EXT3_IOC_GETRSVSZ: + if (test_opt(inode->i_sb, RESERVATION) + && S_ISREG(inode->i_mode) + && ei->i_block_alloc_info) { + rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; + return put_user(rsv_window_size, (int __user *)arg); + } + return -ENOTTY; + case EXT3_IOC_SETRSVSZ: { + int err; + + if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) + return -ENOTTY; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto setrsvsz_out; + } + + if (get_user(rsv_window_size, (int __user *)arg)) { + err = -EFAULT; + goto setrsvsz_out; + } + + if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) + rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; + + /* + * need to allocate reservation structure for this inode + * before set the window size + */ + mutex_lock(&ei->truncate_mutex); + if (!ei->i_block_alloc_info) + ext3_init_block_alloc_info(inode); + + if (ei->i_block_alloc_info){ + struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; + rsv->rsv_goal_size = rsv_window_size; + } + mutex_unlock(&ei->truncate_mutex); +setrsvsz_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT3_IOC_GROUP_EXTEND: { + ext3_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err, err2; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } + err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); + journal_lock_updates(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err == 0) + err = err2; +group_extend_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT3_IOC_GROUP_ADD: { + struct ext3_new_group_data input; + struct super_block *sb = inode->i_sb; + int err, err2; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } + + err = ext3_group_add(sb, &input); + journal_lock_updates(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err == 0) + err = err2; +group_add_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } + case FITRIM: { + + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext3_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + 
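+ /*
+ * Each EXT3_IOC32_* command is renumbered to its native counterpart
+ * below and forwarded to ext3_ioctl(), with compat_ptr() widening the
+ * 32-bit user pointer. EXT3_IOC_GROUP_ADD needs no renumbering and is
+ * passed through unchanged; anything else is rejected with
+ * -ENOIOCTLCMD.
+ */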
switch (cmd) { + case EXT3_IOC32_GETFLAGS: + cmd = EXT3_IOC_GETFLAGS; + break; + case EXT3_IOC32_SETFLAGS: + cmd = EXT3_IOC_SETFLAGS; + break; + case EXT3_IOC32_GETVERSION: + cmd = EXT3_IOC_GETVERSION; + break; + case EXT3_IOC32_SETVERSION: + cmd = EXT3_IOC_SETVERSION; + break; + case EXT3_IOC32_GROUP_EXTEND: + cmd = EXT3_IOC_GROUP_EXTEND; + break; + case EXT3_IOC32_GETVERSION_OLD: + cmd = EXT3_IOC_GETVERSION_OLD; + break; + case EXT3_IOC32_SETVERSION_OLD: + cmd = EXT3_IOC_SETVERSION_OLD; + break; +#ifdef CONFIG_JBD_DEBUG + case EXT3_IOC32_WAIT_FOR_READONLY: + cmd = EXT3_IOC_WAIT_FOR_READONLY; + break; +#endif + case EXT3_IOC32_GETRSVSZ: + cmd = EXT3_IOC_GETRSVSZ; + break; + case EXT3_IOC32_SETRSVSZ: + cmd = EXT3_IOC_SETRSVSZ; + break; + case EXT3_IOC_GROUP_ADD: + break; + default: + return -ENOIOCTLCMD; + } + return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c new file mode 100644 index 00000000..e5a71111 --- /dev/null +++ b/fs/ext3/namei.c @@ -0,0 +1,2550 @@ +/* + * linux/fs/ext3/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "namei.h" +#include "xattr.h" +#include "acl.h" + +/* + * define how far ahead to read directories while searching them. + */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) +{ + struct buffer_head *bh; + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext3_bread(handle, inode, *block, 1, err); + if (bh) { + inode->i_size += inode->i_sb->s_blocksize; + EXT3_I(inode)->i_disksize = inode->i_size; + *err = ext3_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
+ */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u16 offs; + u16 size; +}; + +static inline unsigned dx_get_block (struct dx_entry *entry); +static void dx_set_block (struct dx_entry *entry, unsigned value); +static inline unsigned dx_get_hash (struct dx_entry *entry); +static void dx_set_hash (struct dx_entry *entry, unsigned value); +static unsigned dx_get_count (struct dx_entry *entries); +static unsigned dx_get_limit (struct dx_entry *entries); +static void dx_set_count (struct dx_entry *entries, unsigned value); +static void dx_set_limit (struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit (struct inode *dir, unsigned infosize); +static unsigned dx_node_limit (struct inode *dir); +static struct dx_frame *dx_probe(struct qstr *entry, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, + int *err); +static void dx_release (struct dx_frame *frames); +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize); +static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); +static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err); +static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext3_dir_entry_2 * +ext3_next_entry(struct ext3_dir_entry_2 *p) +{ + return (struct ext3_dir_entry_2 *)((char *)p + + ext3_rec_len_from_disk(p->rec_len)); +} + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. 
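+ *
+ * dx_get_block() below therefore returns only the low 24 bits of the
+ * on-disk block field.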
+ */ + +static inline unsigned dx_get_block (struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x00ffffff; +} + +static inline void dx_set_block (struct dx_entry *entry, unsigned value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash (struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - + EXT3_DIR_REC_LEN(2) - infosize; + return entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit (struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); + return entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index (char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk("%s index ", label); + for (i = 0; i < n; i++) + { + printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i)); + } + printk("\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { + int len = de->name_len; + char *name = de->name; + while (len--) printk("%c", *name++); + ext3fs_dirhash(de->name, de->name_len, &h); + printk(":%x.%u ", h.hash, + ((char *) de - base)); + } + space += EXT3_DIR_REC_LEN(de->name_len); + names++; + } + de = ext3_next_entry(de); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count (entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + int err; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; + u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; + stats = levels? 
+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse (bh); + } + if (bcount) + printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", + names, space/bcount,(space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(struct qstr *entry, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + + frame->bh = NULL; + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3_warning(dir->i_sb, __func__, + "Unrecognised inode hash code %d", + root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; + if (entry) + ext3fs_dirhash(entry->name, entry->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext3_warning(dir->i_sb, __func__, + "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext3_warning(dir->i_sb, __func__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, + root->info.info_length)) { + ext3_warning(dir->i_sb, __func__, + "dx entry: limit != root limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + dxtrace (printk("Look up %x", hash)); + while (1) + { + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { + ext3_warning(dir->i_sb, __func__, + "dx entry: no count or count > limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) // linear search cross check + { + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; + if (!indirect--) return frame; + if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { + ext3_warning(dir->i_sb, __func__, + "dx entry: limit != node limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + frame++; + frame->bh = NULL; + } +fail2: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } +fail: + if (*err == ERR_BAD_DX_DIR) + ext3_warning(dir->i_sb, __func__, + "Corrupt dir inode %ld, running e2fsck is " + "recommended.", dir->i_ino); + return NULL; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + brelse(frames[1].bh); + brelse(frames[0].bh); +} + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. 
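+ *
+ * Entries that hash before the requested (start_hash, start_minor_hash)
+ * position, and deleted entries (inode == 0), are not added to the tree.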
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+				   struct inode *dir, int block,
+				   struct dx_hash_info *hinfo,
+				   __u32 start_hash, __u32 start_minor_hash)
+{
+	struct buffer_head *bh;
+	struct ext3_dir_entry_2 *de, *top;
+	int err, count = 0;
+
+	dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
+	if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
+		return err;
+
+	de = (struct ext3_dir_entry_2 *) bh->b_data;
+	top = (struct ext3_dir_entry_2 *) ((char *) de +
+					   dir->i_sb->s_blocksize -
+					   EXT3_DIR_REC_LEN(0));
+	for (; de < top; de = ext3_next_entry(de)) {
+		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
+					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+						+((char *)de - bh->b_data))) {
+			/* On error, skip the f_pos to the next block. */
+			dir_file->f_pos = (dir_file->f_pos |
+					(dir->i_sb->s_blocksize - 1)) + 1;
+			brelse (bh);
+			return count;
+		}
+		ext3fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		if ((err = ext3_htree_store_dirent(dir_file,
+			   hinfo->hash, hinfo->minor_hash, de)) != 0) {
+			brelse(bh);
+			return err;
+		}
+		count++;
+	}
+	brelse(bh);
+	return count;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory. We start scanning the directory in hash order, starting
+ * at start_hash and start_minor_hash.
+ *
+ * This function returns the number of entries inserted into the tree,
+ * or a negative error code.
+ */
+int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+			 __u32 start_minor_hash, __u32 *next_hash)
+{
+	struct dx_hash_info hinfo;
+	struct ext3_dir_entry_2 *de;
+	struct dx_frame frames[2], *frame;
+	struct inode *dir;
+	int block, err;
+	int count = 0;
+	int ret;
+	__u32 hashval;
+
+	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+		       start_minor_hash));
+	dir = dir_file->f_path.dentry->d_inode;
+	if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
+		hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT3_SB(dir->i_sb)->s_hash_unsigned;
+		hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
+					       start_hash, start_minor_hash);
+		*next_hash = ~0;
+		return count;
+	}
+	hinfo.hash = start_hash;
+	hinfo.minor_hash = 0;
+	frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+	if (!frame)
+		return err;
+
+	/* Add '.' and '..'
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; + de = ext3_next_entry(de); + if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + + +/* + * Directory block splitting, compacting + */ + +/* + * Create map of hash values, offsets, and sizes, stored at end of block. + * Returns number of entries mapped. + */ +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + blocksize) + { + if (de->name_len && de->inode) { + ext3fs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = (u16) ((char *) de - base); + map_tail->size = le16_to_cpu(de->rec_len); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = ext3_next_entry(de); + } + return count; +} + +/* Sort map by hash value */ +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) + { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) + { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} + +static void ext3_update_dx_flag(struct inode *inode) +{ + if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX)) + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; +} + +/* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. + * + * `len <= EXT3_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. 
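+ *
+ * An entry whose inode field is zero is treated as deleted and never
+ * matches.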
+ */ +static inline int ext3_match (int len, const char * const name, + struct ext3_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int search_dirblock(struct buffer_head * bh, + struct inode *dir, + struct qstr *child, + unsigned long offset, + struct ext3_dir_entry_2 ** res_dir) +{ + struct ext3_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = child->name; + int namelen = child->len; + + de = (struct ext3_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext3_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ext3_check_dir_entry("ext3_find_entry", + dir, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = ext3_rec_len_from_disk(de->rec_len); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + + +/* + * ext3_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +static struct buffer_head *ext3_find_entry(struct inode *dir, + struct qstr *entry, + struct ext3_dir_entry_2 **res_dir) +{ + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; + struct buffer_head * bh, *ret = NULL; + unsigned long start, block, b; + const u8 *name = entry->name; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + int namelen; + + *res_dir = NULL; + sb = dir->i_sb; + namelen = entry->len; + if (namelen > EXT3_NAME_LEN) + return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == 0)) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } + if (is_dx(dir)) { + bh = ext3_dx_find_entry(dir, entry, res_dir, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); + } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. 
+ */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext3_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ_META, 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + ext3_error(sb, __func__, "reading directory #%lu " + "offset %lu", dir->i_ino, block); + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT3_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); + return ret; +} + +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err) +{ + struct super_block *sb = dir->i_sb; + struct dx_hash_info hinfo; + struct dx_frame frames[2], *frame; + struct buffer_head *bh; + unsigned long block; + int retval; + + if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) + return NULL; + do { + block = dx_get_block(frame->at); + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + + retval = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { + dx_release(frames); + return bh; + } + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + + /* Check to see if we should continue to search */ + retval = ext3_htree_next_block(dir, hinfo.hash, frame, + frames, NULL); + if (retval < 0) { + ext3_warning(sb, __func__, + "error reading index page in directory #%lu", + dir->i_ino); + *err = retval; + goto errout; + } + } while (retval == 1); + + *err = -ENOENT; +errout: + dxtrace(printk("%s not found\n", name)); + dx_release (frames); + return NULL; +} + +static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext3_find_entry(dir, &dentry->d_name, &de); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); + brelse (bh); + if (!ext3_valid_inum(dir->i_sb, ino)) { + ext3_error(dir->i_sb, "ext3_lookup", + "bad inode number: %lu", ino); + return ERR_PTR(-EIO); + } + inode = ext3_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ESTALE) { + ext3_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } + } + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext3_get_parent(struct dentry *child) +{ + unsigned long ino; + struct qstr dotdot = {.name = "..", .len = 2}; + struct ext3_dir_entry_2 * de; + struct buffer_head *bh; + + bh = ext3_find_entry(child->d_inode, &dotdot, &de); + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if 
(!ext3_valid_inum(child->d_inode->i_sb, ino)) { + ext3_error(child->d_inode->i_sb, "ext3_get_parent", + "bad inode number: %lu", ino); + return ERR_PTR(-EIO); + } + + return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino)); +} + +#define S_SHIFT 12 +static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK, +}; + +static inline void ext3_set_de_type(struct super_block *sb, + struct ext3_dir_entry_2 *de, + umode_t mode) { + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* + * Move count entries from end of map between two memory locations. + * Returns pointer to last entry moved. + */ +static struct ext3_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = + ext3_rec_len_to_disk(rec_len); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext3_dir_entry_2 *) (to - rec_len); +} + +/* + * Compact each dir entry in the range to the minimal rec_len. + * Returns pointer to last entry in range. + */ +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize) +{ + struct ext3_dir_entry_2 *next, *to, *prev; + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base; + unsigned rec_len = 0; + + prev = to = de; + while ((char *)de < base + blocksize) { + next = ext3_next_entry(de); + if (de->inode && de->name_len) { + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext3_rec_len_to_disk(rec_len); + prev = to; + to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +/* + * Split a full leaf block to make room for a new dir entry. + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. 
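+ * On failure *bh is released and set to NULL, *error is set to a
+ * negative errno and NULL is returned.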
+ */ +static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + u32 newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext3_dir_entry_2 *de = NULL, *de2; + int err = 0, i; + + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) { + brelse(*bh); + *bh = NULL; + goto errout; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, *bh); + if (err) + goto journal_error; + + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map ((struct ext3_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + dx_sort_map (map, count); + /* Split the existing block in the middle, size-wise */ + size = 0; + move = 0; + for (i = count-1; i >= 0; i--) { + /* is more than half of this entry in 2nd half of the block? */ + if (size + map[i].size/2 > blocksize/2) + break; + size += map[i].size; + move++; + } + /* map index at which we will split */ + split = count - move; + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", + dx_get_block(frame->at), hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); + de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2); + dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); + + /* Which block gets the new entry? */ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block (frame, hash2 + continued, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; + err = ext3_journal_dirty_metadata (handle, frame->bh); + if (err) + goto journal_error; + brelse (bh2); + dxtrace(dx_show_index ("frame", frame->entries)); + return de; + +journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext3_std_error(dir->i_sb, err); +errout: + *error = err; + return NULL; +} + + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + * + * NOTE! bh is NOT released in the case where ENOSPC is returned. In + * all other cases bh is released. 
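+ * On success the directory block is journalled, the parent's mtime,
+ * ctime and i_version are updated, and 0 is returned.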
+ */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext3_dir_entry_2 *de, + struct buffer_head * bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset = 0; + unsigned short reclen; + int nlen, rlen, err; + char *top; + + reclen = EXT3_DIR_REC_LEN(namelen); + if (!de) { + de = (struct ext3_dir_entry_2 *)bh->b_data; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; + while ((char *) de <= top) { + if (!ext3_check_dir_entry("ext3_add_entry", dir, de, + bh, offset)) { + brelse (bh); + return -EIO; + } + if (ext3_match (namelen, name, de)) { + brelse (bh); + return -EEXIST; + } + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = ext3_rec_len_from_disk(de->rec_len); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext3_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) { + ext3_std_error(dir->i_sb, err); + brelse(bh); + return err; + } + + /* By now the buffer is marked for journaling */ + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = ext3_rec_len_from_disk(de->rec_len); + if (de->inode) { + struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext3_rec_len_to_disk(rlen - nlen); + de->rec_len = ext3_rec_len_to_disk(nlen); + de = de1; + } + de->file_type = EXT3_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext3_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy (de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext3_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + dir->i_version++; + ext3_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + ext3_std_error(dir->i_sb, err); + brelse(bh); + return 0; +} + +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. 
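+ * Block 0 is rewritten as the dx_root (keeping only the "." and ".."
+ * entries), the remaining entries are copied into a newly appended
+ * leaf block, and do_split() then allocates the third block.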
+ */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + u32 block; + struct fake_dirent *fde; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + retval = ext3_journal_get_write_access(handle, bh); + if (retval) { + ext3_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)fde + + ext3_rec_len_from_disk(fde->rec_len)); + if ((char *) de >= (((char *) root) + blocksize)) { + ext3_error(dir->i_sb, __func__, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + + bh2 = ext3_append (handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); + return retval; + } + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + data1 = bh2->b_data; + + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2 = ext3_next_entry(de)) < top) + de = de2; + de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); + /* Initialize the root; the dot dirents already exist */ + de = (struct ext3_dir_entry_2 *) (&root->dotdot); + de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2)); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block (entries, 1); + dx_set_count (entries, 1); + dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + /* + * Mark buffers dirty here so that if do_split() fails we write a + * consistent set of buffers to disk. + */ + ext3_journal_dirty_metadata(handle, frame->bh); + ext3_journal_dirty_metadata(handle, bh); + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + if (!de) { + ext3_mark_inode_dirty(handle, dir); + dx_release(frames); + return retval; + } + dx_release(frames); + + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +/* + * ext3_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext3_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. 
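+ *
+ * If the directory is indexed but ext3_dx_add_entry() fails with
+ * ERR_BAD_DX_DIR, the EXT3_INDEX_FL flag is cleared and the plain
+ * linear scan below is used as a fallback.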
+ */ +static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 *de; + struct super_block * sb; + int retval; + int dx_fallback=0; + unsigned blocksize; + u32 block, blocks; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; + EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; + dx_fallback++; + ext3_mark_inode_dirty(handle, dir); + } + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); + if(!bh) + return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) + return retval; + + if (blocks == 1 && !dx_fallback && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext3_rec_len_to_disk(blocksize); + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +/* + * Returns 0 for success, or a negative error value + */ +static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head * bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; + + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; + at = frame->at; + + if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) { + bh = NULL; + goto cleanup; + } + + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? 
*/ + if (dx_get_count(entries) == dx_get_limit(entries)) { + u32 newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __func__, + "Directory index full!"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block (frames + 0, hash2, newblock); + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; + brelse (bh2); + } else { + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; + } + err = ext3_journal_dirty_metadata(handle, frames[0].bh); + if (err) + goto journal_error; + } + de = do_split(handle, dir, &bh, frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + bh = NULL; + goto cleanup; + +journal_error: + ext3_std_error(dir->i_sb, err); +cleanup: + if (bh) + brelse(bh); + dx_release(frames); + return err; +} + +/* + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ext3_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3_dir_entry_2 * de_del, + struct buffer_head * bh) +{ + struct ext3_dir_entry_2 * de, * pde; + int i; + + i = 0; + pde = NULL; + de = (struct ext3_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) + return -EIO; + if (de == de_del) { + int err; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + if (pde) + pde->rec_len = ext3_rec_len_to_disk( + 
ext3_rec_len_from_disk(pde->rec_len) + + ext3_rec_len_from_disk(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) { +journal_error: + ext3_std_error(dir->i_sb, err); + return err; + } + return 0; + } + i += ext3_rec_len_from_disk(de->rec_len); + pde = de; + de = ext3_next_entry(de); + } + return -ENOENT; +} + +static int ext3_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) +{ + int err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + } + drop_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) +{ + handle_t *handle; + struct inode * inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3_mknod (struct inode * dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT3_FS_XATTR + inode->i_op = &ext3_special_inode_operations; +#endif + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + handle_t *handle; + struct inode * inode; + struct buffer_head * dir_block = NULL; + struct ext3_dir_entry_2 * de; + int err, retries = 0; + + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = 
&ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) + goto out_clear_inode; + + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext3_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + + de = (struct ext3_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len)); + strcpy (de->name, "."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + de = ext3_next_entry(de); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize - + EXT3_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy (de->name, ".."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, dir_block); + if (err) + goto out_clear_inode; + + err = ext3_mark_inode_dirty(handle, inode); + if (!err) + err = ext3_add_entry (handle, dentry, inode); + + if (err) { +out_clear_inode: + inode->i_nlink = 0; + unlock_new_inode(inode); + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + inc_nlink(dir); + ext3_update_dx_flag(dir); + err = ext3_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + + d_instantiate(dentry, inode); + unlock_new_inode(inode); +out_stop: + brelse(dir_block); + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +static int empty_dir (struct inode * inode) +{ + unsigned long offset; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de, * de1; + struct super_block * sb; + int err = 0; + + sb = inode->i_sb; + if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || + !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { + if (err) + ext3_error(inode->i_sb, __func__, + "error %d reading directory #%lu offset 0", + err, inode->i_ino); + else + ext3_warning(inode->i_sb, __func__, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de1 = ext3_next_entry(de); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3_warning (inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' 
or `..'", + inode->i_ino); + brelse (bh); + return 1; + } + offset = ext3_rec_len_from_disk(de->rec_len) + + ext3_rec_len_from_disk(de1->rec_len); + de = ext3_next_entry(de1); + while (offset < inode->i_size ) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + err = 0; + brelse (bh); + bh = ext3_bread (NULL, inode, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); + if (!bh) { + if (err) + ext3_error(sb, __func__, + "error %d reading directory" + " #%lu offset %lu", + err, inode->i_ino, offset); + offset += sb->s_blocksize; + continue; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + } + if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) { + de = (struct ext3_dir_entry_2 *)(bh->b_data + + sb->s_blocksize); + offset = (offset | (sb->s_blocksize - 1)) + 1; + continue; + } + if (le32_to_cpu(de->inode)) { + brelse (bh); + return 0; + } + offset += ext3_rec_len_from_disk(de->rec_len); + de = ext3_next_entry(de); + } + brelse (bh); + return 1; +} + +/* ext3_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext3_orphan_cleanup(). + */ +int ext3_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext3_iloc iloc; + int err = 0, rc; + + mutex_lock(&EXT3_SB(sb)->s_orphan_lock); + if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * here (on s_orphan_lock), so race with ext3_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? + */ + J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto out_unlock; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); + EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + rc = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. 
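+ *
+ * As an illustration (inode numbers made up): the superblock's
+ * s_last_orphan is the head of the chain and each orphan's
+ * NEXT_ORPHAN slot points at the next one, so adding inode 12 to a
+ * list that already held inode 7 leaves
+ *
+ *	s_last_orphan = 12, NEXT_ORPHAN(12) = 7, NEXT_ORPHAN(7) = 0
+ *
+ * and ext3_orphan_cleanup() just walks the chain until it hits zero.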
*/ + if (!err) + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out_unlock: + mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext3_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_sb_info *sbi; + unsigned long ino_next; + struct ext3_iloc iloc; + int err = 0; + + mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. */ + if (!handle) + goto out; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %lu\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext3_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %lu\n", + i_prev->i_ino, ino_next); + err = ext3_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + +out_err: + ext3_std_error(inode->i_sb, err); +out: + mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +static int ext3_rmdir (struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext3_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_rmdir; + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir (inode)) + goto end_rmdir; + + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + ext3_warning (inode->i_sb, "ext3_rmdir", + "empty directory has nlink!=2 (%d)", + inode->i_nlink); + inode->i_version++; + clear_nlink(inode); + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * 
recovery. */ + inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + drop_nlink(dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + +end_rmdir: + ext3_journal_stop(handle); + brelse (bh); + return retval; +} + +static int ext3_unlink(struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + retval = -ENOENT; + bh = ext3_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + drop_nlink(inode); + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime; + ext3_mark_inode_dirty(handle, inode); + retval = 0; + +end_unlink: + ext3_journal_stop(handle); + brelse (bh); + return retval; +} + +static int ext3_symlink (struct inode * dir, + struct dentry *dentry, const char * symname) +{ + handle_t *handle; + struct inode * inode; + int l, err, retries = 0; + int credits; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + dquot_initialize(dir); + + if (l > EXT3_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. + */ + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT3_XATTR_TRANS_BLOCKS; + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } +retry: + handle = ext3_journal_start(dir, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > EXT3_N_BLOCKS * 4) { + inode->i_op = &ext3_symlink_inode_operations; + ext3_set_aops(inode); + /* + * We cannot call page_symlink() with transaction started + * because it calls into ext3_write_begin() which acquires page + * lock which ranks below transaction start (and it can also + * wait for journal commit if we are running out of space). So + * we have to stop transaction now and restart it when symlink + * contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. 
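+ *
+ * So the sequence below is: drop the link count and put the inode on
+ * the orphan list, stop the handle, write the symlink body with
+ * __page_symlink(), then start a fresh handle, take the inode back
+ * off the orphan list and finally add the directory entry. A crash
+ * between the two transactions leaves an orphaned, zero-link inode
+ * that orphan cleanup will simply delete.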
+ */ + drop_nlink(inode); + err = ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + if (err) + goto err_drop_inode; + err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS + * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext3_journal_start(dir, + EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + inc_nlink(inode); + err = ext3_orphan_del(handle, inode); + if (err) { + ext3_journal_stop(handle); + drop_nlink(inode); + goto err_drop_inode; + } + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; + memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_size = l-1; + } + EXT3_I(inode)->i_disksize = inode->i_size; + err = ext3_add_nondir(handle, dentry, inode); +out_stop: + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; +} + +static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + + if (inode->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode->i_ctime = CURRENT_TIME_SEC; + inc_nlink(inode); + ihold(inode); + + err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +#define PARENT_INO(buffer) \ + (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode) + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir,struct dentry *new_dentry) +{ + handle_t *handle; + struct inode * old_inode, * new_inode; + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval, flush_file = 0; + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_bh = new_bh = dir_bh = NULL; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + if (new_dentry->d_inode) + dquot_initialize(new_dentry->d_inode); + handle = ext3_journal_start(old_dir, 2 * + EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + handle->h_sync = 1; + + old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. 
Goodbye sticky bit ;-< + */ + old_inode = old_dentry->d_inode; + retval = -ENOENT; + if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + goto end_rename; + + new_inode = new_dentry->d_inode; + new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); + new_bh = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir (new_inode)) + goto end_rename; + } + retval = -EIO; + dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir!=old_dir && + new_dir->i_nlink >= EXT3_LINK_MAX) + goto end_rename; + } + if (!new_bh) { + retval = ext3_add_entry (handle, new_dentry, old_inode); + if (retval) + goto end_rename; + } else { + BUFFER_TRACE(new_bh, "get write access"); + retval = ext3_journal_get_write_access(handle, new_bh); + if (retval) + goto journal_error; + new_de->inode = cpu_to_le32(old_inode->i_ino); + if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, + EXT3_FEATURE_INCOMPAT_FILETYPE)) + new_de->file_type = old_de->file_type; + new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, new_dir); + BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); + retval = ext3_journal_dirty_metadata(handle, new_bh); + if (retval) + goto journal_error; + brelse(new_bh); + new_bh = NULL; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, old_inode); + + /* + * ok, that's it + */ + if (le32_to_cpu(old_de->inode) != old_inode->i_ino || + old_de->name_len != old_dentry->d_name.len || + strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || + (retval = ext3_delete_entry(handle, old_dir, + old_de, old_bh)) == -ENOENT) { + /* old_de could have moved from under us during htree split, so + * make sure that we are deleting the right entry. We might + * also be pointing to a stale entry in the unused part of + * old_bh so just checking inum and the name isn't enough. 
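+ * Either the cached old_de no longer matches the dentry, or deleting
+ * it returned -ENOENT; in both cases we redo the lookup below and
+ * delete whatever entry ext3_find_entry() hands back now.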
*/ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + + old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name, + &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); + brelse(old_bh2); + } + } + if (retval) { + ext3_warning(old_dir->i_sb, "ext3_rename", + "Deleting old file (%lu), %d, error=%d", + old_dir->i_ino, old_dir->i_nlink, retval); + } + + if (new_inode) { + drop_nlink(new_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + retval = ext3_journal_get_write_access(handle, dir_bh); + if (retval) + goto journal_error; + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); + retval = ext3_journal_dirty_metadata(handle, dir_bh); + if (retval) { +journal_error: + ext3_std_error(new_dir->i_sb, retval); + goto end_rename; + } + drop_nlink(old_dir); + if (new_inode) { + drop_nlink(new_inode); + } else { + inc_nlink(new_dir); + ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + } + ext3_mark_inode_dirty(handle, old_dir); + if (new_inode) { + ext3_mark_inode_dirty(handle, new_inode); + if (!new_inode->i_nlink) + ext3_orphan_add(handle, new_inode); + if (ext3_should_writeback_data(new_inode)) + flush_file = 1; + } + retval = 0; + +end_rename: + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); + ext3_journal_stop(handle); + if (retval == 0 && flush_file) + filemap_flush(old_inode->i_mapping); + return retval; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations ext3_dir_inode_operations = { + .create = ext3_create, + .lookup = ext3_lookup, + .link = ext3_link, + .unlink = ext3_unlink, + .symlink = ext3_symlink, + .mkdir = ext3_mkdir, + .rmdir = ext3_rmdir, + .mknod = ext3_mknod, + .rename = ext3_rename, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext3_check_acl, +}; + +const struct inode_operations ext3_special_inode_operations = { + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext3_check_acl, +}; diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h new file mode 100644 index 00000000..f2ce2b00 --- /dev/null +++ b/fs/ext3/namei.h @@ -0,0 +1,8 @@ +/* linux/fs/ext3/namei.h + * + * Copyright (C) 2005 Simtec Electronics + * Ben Dooks + * +*/ + +extern struct dentry *ext3_get_parent(struct dentry *child); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c new file mode 100644 index 00000000..7916e4ce --- /dev/null +++ b/fs/ext3/resize.c @@ -0,0 +1,1120 @@ +/* + * linux/fs/ext3/resize.c + * + * Support for resizing an ext3 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. 
+ */ + + +#define EXT3FS_DEBUG + +#include + +#include +#include + + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count); + ext3_fsblk_t end = start + input->blocks_count; + unsigned group = input->group; + ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead = ext3_bg_has_super(sb, group) ? + (1 + ext3_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + ext3_fsblk_t metaend = start + overhead; + struct buffer_head *bh = NULL; + ext3_grpblk_t free_blocks_count; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext3_bg_has_super(sb, input->group) ? "normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + if (group != sbi->s_groups_count) + ext3_warning(sb, __func__, + "Cannot add at group %u (only %lu groups)", + input->group, sbi->s_groups_count); + else if ((start - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)) + ext3_warning(sb, __func__, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext3_warning(sb, __func__, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext3_warning(sb, __func__, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext3_warning(sb, __func__, + "Cannot read last block ("E3FSBLK")", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext3_warning(sb, __func__, + "Block bitmap not in group (block %u)", + input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext3_warning(sb, __func__, + "Inode bitmap not in group (block %u)", + input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext3_warning(sb, __func__, + "Inode table not in group (blocks %u-"E3FSBLK")", + input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext3_warning(sb, __func__, + "Block bitmap same as inode bitmap (%u)", + input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext3_warning(sb, __func__, + "Block bitmap (%u) in inode table (%u-"E3FSBLK")", + input->block_bitmap, input->inode_table, itend-1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext3_warning(sb, __func__, + "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", + input->inode_bitmap, input->inode_table, itend-1); + else if (inside(input->block_bitmap, start, metaend)) + ext3_warning(sb, __func__, + "Block bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", + input->block_bitmap, start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext3_warning(sb, __func__, + "Inode bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", + input->inode_bitmap, start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext3_warning(sb, __func__, + "Inode table (%u-"E3FSBLK") overlaps" + "GDT table ("E3FSBLK"-"E3FSBLK")", + 
input->inode_table, itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + ext3_fsblk_t blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if (!bh) + return ERR_PTR(-EIO); + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + } + + return bh; +} + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext3_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* + * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for block_bitmap modifications. + */ +static int extend_or_restart_transaction(handle_t *handle, int thresh, + struct buffer_head *bh) +{ + int err; + + if (handle->h_buffer_credits >= thresh) + return 0; + + err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA); + if (err < 0) + return err; + if (err) { + err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA); + if (err) + return err; + err = ext3_journal_get_write_access(handle, bh); + if (err) + return err; + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + */ +static int setup_new_group_blocks(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group); + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? 
+ le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; + unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group); + struct buffer_head *bh; + handle_t *handle; + ext3_fsblk_t block; + ext3_grpblk_t bit; + int i; + int err = 0, err2; + + /* This transaction may be extended/restarted along the way */ + handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + err = -EBUSY; + goto exit_journal; + } + + if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext3_bg_has_super(sb, input->group)) { + ext3_debug("mark backup superblock %#04lx (+0)\n", start); + ext3_set_bit(0, bh->b_data); + } + + /* Copy all of the GDT blocks into the backup in this group */ + for (i = 0, bit = 1, block = start + 1; + i < gdblocks; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("update backup group %#04lx (+%d)\n", block, bit); + + err = extend_or_restart_transaction(handle, 1, bh); + if (err) + goto exit_bh; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto exit_bh; + } + if ((err = ext3_journal_get_write_access(handle, gdb))) { + brelse(gdb); + goto exit_bh; + } + lock_buffer(gdb); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); + set_buffer_uptodate(gdb); + unlock_buffer(gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor table blocks */ + for (i = 0, bit = gdblocks + 1, block = start + bit; + i < reserved_gdb; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit); + + err = extend_or_restart_transaction(handle, 1, bh); + if (err) + goto exit_bh; + + if (IS_ERR(gdb = bclean(handle, sb, block))) { + err = PTR_ERR(gdb); + goto exit_bh; + } + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, + input->block_bitmap - start); + ext3_set_bit(input->block_bitmap - start, bh->b_data); + ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, + input->inode_bitmap - start); + ext3_set_bit(input->inode_bitmap - start, bh->b_data); + + /* Zero out all of the inode table blocks */ + for (i = 0, block = input->inode_table, bit = block - start; + i < sbi->s_itb_per_group; i++, bit++, block++) { + struct buffer_head *it; + + ext3_debug("clear inode block %#04lx (+%d)\n", block, bit); + + err = extend_or_restart_transaction(handle, 1, bh); + if (err) + goto exit_bh; + + if (IS_ERR(it = bclean(handle, sb, block))) { + err = PTR_ERR(it); + goto exit_bh; + } + err = ext3_journal_dirty_metadata(handle, it); + if (err) { + brelse(it); + goto exit_bh; + } + brelse(it); + ext3_set_bit(bit, bh->b_data); + } + + err = extend_or_restart_transaction(handle, 2, bh); + if (err) + goto exit_bh; + + mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto exit_bh; + brelse(bh); + + /* Mark unused entries in inode bitmap used */ + ext3_debug("clear inode bitmap %#04x (+%ld)\n", + input->inode_bitmap, input->inode_bitmap - start); + if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { + err = PTR_ERR(bh); + 
goto exit_journal; + } + + mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + err = ext3_journal_dirty_metadata(handle, bh); +exit_bh: + brelse(bh); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext3_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + struct buffer_head *primary) +{ + const ext3_fsblk_t blk = primary->b_blocknr; + const unsigned long end = EXT3_SB(sb)->s_groups_count; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __le32 *p = (__le32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ + ext3_warning(sb, __func__, + "reserved GDT "E3FSBLK + " missing grp %d ("E3FSBLK")", + blk, grp, + grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); + return -EINVAL; + } + if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. + * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. 
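+ *
+ * The block being activated is simply the next slot after the GDT
+ * blocks already in use: for example, with 4 KB blocks (128
+ * descriptors per block), adding group 256 gives gdb_num = 2, and
+ * with the superblock in block 0 and GDT blocks in blocks 1 and 2
+ * the reserved block brought into use is block 3.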
+ */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input, + struct buffer_head **primary) +{ + struct super_block *sb = inode->i_sb; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + int gdbackups; + struct ext3_iloc iloc; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT3_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { + ext3_warning(sb, __func__, + "won't resize using backup superblock at %llu", + (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + *primary = sb_bread(sb, gdblock); + if (!*primary) + return -EIO; + + if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext3_warning(sb, __func__, + "new group %u GDT block "E3FSBLK" not reserved", + input->group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh))) + goto exit_dind; + + if ((err = ext3_journal_get_write_access(handle, *primary))) + goto exit_sbh; + + if ((err = ext3_journal_get_write_access(handle, dind))) + goto exit_primary; + + /* ext3_reserve_inode_write() gets a reference on the iloc */ + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_dindj; + + n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext3_warning (sb, __func__, + "not enough memory for %lu groups", gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). 
+ */ + data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; + err = ext3_journal_dirty_metadata(handle, dind); + if (err) + goto exit_group_desc; + brelse(dind); + dind = NULL; + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto exit_group_desc; + memset((*primary)->b_data, 0, sb->s_blocksize); + err = ext3_journal_dirty_metadata(handle, *primary); + if (err) + goto exit_group_desc; + + o_group_desc = EXT3_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = *primary; + EXT3_SB(sb)->s_group_desc = n_group_desc; + EXT3_SB(sb)->s_gdb_count++; + kfree(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto exit_inode; + + return 0; + +exit_group_desc: + kfree(n_group_desc); +exit_inode: + //ext3_journal_release_buffer(handle, iloc.bh); + brelse(iloc.bh); +exit_dindj: + //ext3_journal_release_buffer(handle, dind); +exit_primary: + //ext3_journal_release_buffer(handle, *primary); +exit_sbh: + //ext3_journal_release_buffer(handle, *primary); +exit_dind: + brelse(dind); +exit_bh: + brelse(*primary); + + ext3_debug("leaving with error %d\n", err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. 
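+ *
+ * Concretely, for the reserved primary GDT block at block P the code
+ * below records the new group's copy of it, which lies at the same
+ * relative offset inside the new group, i.e. at block
+ * input->group * EXT3_BLOCKS_PER_GROUP(sb) + P.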
+ */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext3_iloc iloc; + ext3_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + if (!primary) + return -ENOMEM; + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count % + EXT3_ADDR_PER_BLOCK(sb)); + end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext3_warning(sb, __func__, + "reserved block "E3FSBLK + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext3_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext3_journal_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = input->group * EXT3_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext3_journal_dirty_metadata(handle, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext3_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext3 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. 
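+ *
+ * Each backup is written at the same offset into its group as the
+ * primary copy has in group 0, i.e. at block
+ * group * EXT3_BLOCKS_PER_GROUP(sb) + blk_off; with 4 KB blocks
+ * (32768 blocks per group), for instance, group 3's superblock
+ * backup lands in block 98304.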
+ */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + const unsigned long last = sbi->s_groups_count; + const int bpg = EXT3_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (handle->h_buffer_credits == 0 && + ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) && + (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + if (!bh) { + err = -EIO; + break; + } + ext3_debug("update metadata backup %#04lx\n", + (unsigned long)bh->b_blocknr); + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); + break; + } + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + err = ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + if (err) + break; + } + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext3_warning(sb, __func__, + "can't update backup for group %d (err %d), " + "forcing fsck on next reboot", group, err); + sbi->s_mount_state &= ~EXT3_VALID_FS; + sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? 
+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct buffer_head *primary = NULL; + struct ext3_group_desc *gdp; + struct inode *inode = NULL; + handle_t *handle; + int gdb_off, gdb_num; + int err, err2; + + gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext3_warning(sb, __func__, + "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < + le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __func__, "blocks_count overflow\n"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext3_warning(sb, __func__, "inodes_count overflow\n"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT3_HAS_COMPAT_FEATURE(sb, + EXT3_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext3_warning(sb, __func__, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext3_iget(sb, EXT3_RESIZE_INO); + if (IS_ERR(inode)) { + ext3_warning(sb, __func__, + "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + if ((err = verify_group_input(sb, input))) + goto exit_put; + + if ((err = setup_new_group_blocks(sb, input))) + goto exit_put; + + /* + * We will always be modifying at least the superblock and a GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + handle = ext3_journal_start_sb(sb, + ext3_bg_has_super(sb, input->group) ? + 3 + reserved_gdb : 4); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit_put; + } + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + ext3_warning(sb, __func__, + "multiple resizers run on filesystem!"); + err = -EBUSY; + goto exit_journal; + } + + if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh))) + goto exit_journal; + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + primary = sbi->s_group_desc[gdb_num]; + if ((err = ext3_journal_get_write_access(handle, primary))) + goto exit_journal; + + if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) && + (err = reserve_backup_gdb(handle, inode, input))) + goto exit_journal; + } else if ((err = add_new_gdb(handle, inode, input, &primary))) + goto exit_journal; + + /* + * OK, now we've set up the new group. Time to make it active. + * + * We do not lock all allocations via s_resize_lock + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is sbi->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. 
+ * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. + */ + + /* Update group descriptor block for new group */ + gdp = (struct ext3_group_desc *)primary->b_data + gdb_off; + + gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); + gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); + gdp->bg_inode_table = cpu_to_le32(input->inode_table); + gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); + gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); + + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + le32_add_cpu(&es->s_blocks_count, input->blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb)); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers of s_groups_count *must* hold s_resize_lock + * AND + * * Writers must perform a smp_wmb() after updating all dependent + * data and before modifying the groups count + * + * * Readers must hold s_resize_lock over the access + * OR + * * Readers must perform an smp_rmb() after reading the groups count + * and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count++; + + err = ext3_journal_dirty_metadata(handle, primary); + if (err) + goto exit_journal; + + /* Update the reserved block counts only once the new group is + * active. */ + le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeblocks_counter, + input->free_blocks_count); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT3_INODES_PER_GROUP(sb)); + + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + if (!err) { + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); + update_backups(sb, primary->b_blocknr, primary->b_data, + primary->b_size); + } +exit_put: + iput(inode); + return err; +} /* ext3_group_add */ + +/* Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext3_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. 
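+ *
+ * In practice that means either
+ *
+ *	mount -o remount,resize=<new block count> <mountpoint>
+ *
+ * or the EXT3_IOC_GROUP_EXTEND ioctl that online resizers such as
+ * resize2fs issue; in both cases only the tail of the last group is
+ * filled in, and whole new groups go through ext3_group_add().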
+ */ +int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, + ext3_fsblk_t n_blocks_count) +{ + ext3_fsblk_t o_blocks_count; + ext3_grpblk_t last; + ext3_grpblk_t add; + struct buffer_head * bh; + handle_t *handle; + int err; + unsigned long freed_blocks; + + /* We don't need to worry about locking wrt other resizers just + * yet: we're going to revalidate es->s_blocks_count after + * taking the s_resize_lock below. */ + o_blocks_count = le32_to_cpu(es->s_blocks_count); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK + " up to "E3FSBLK" blocks\n", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT3-fs: filesystem on %s:" + " too large to resize to "E3FSBLK" blocks safely\n", + sb->s_id, n_blocks_count); + if (sizeof(sector_t) < 8) + ext3_warning(sb, __func__, + "CONFIG_LBDAF not enabled\n"); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __func__, + "can't shrink FS - resize aborted"); + return -EBUSY; + } + + /* Handle the remaining blocks in the last group only. */ + last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + + if (last == 0) { + ext3_warning(sb, __func__, + "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT3_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext3_warning(sb, __func__, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext3_warning(sb, __func__, + "will only finish group ("E3FSBLK + " blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add -1); + if (!bh) { + ext3_warning(sb, __func__, + "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext3_free_blocks(). 
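+ * That is one journal credit for each of those three buffers, hence
+ * the 3 passed to ext3_journal_start_sb() below.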
+ */ + handle = ext3_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext3_warning(sb, __func__, "error %d on journal start",err); + goto exit_put; + } + + mutex_lock(&EXT3_SB(sb)->s_resize_lock); + if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __func__, + "multiple resizers run on filesystem!"); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + ext3_journal_stop(handle); + err = -EBUSY; + goto exit_put; + } + + if ((err = ext3_journal_get_write_access(handle, + EXT3_SB(sb)->s_sbh))) { + ext3_warning(sb, __func__, + "error %d on journal write access", err); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + ext3_journal_stop(handle); + goto exit_put; + } + es->s_blocks_count = cpu_to_le32(o_blocks_count + add); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + if (err) { + ext3_warning(sb, __func__, + "error %d on journal dirty metadata", err); + ext3_journal_stop(handle); + goto exit_put; + } + ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); + ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); + ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", + o_blocks_count, o_blocks_count + add); + if ((err = ext3_journal_stop(handle))) + goto exit_put; + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n", + le32_to_cpu(es->s_blocks_count)); + update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); +exit_put: + return err; +} /* ext3_group_extend */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c new file mode 100644 index 00000000..aad153ef --- /dev/null +++ b/fs/ext3/super.c @@ -0,0 +1,3084 @@ +/* + * linux/fs/ext3/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "xattr.h" +#include "acl.h" +#include "namei.h" + +#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA +#else + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA +#endif + +static int ext3_load_journal(struct super_block *, struct ext3_super_block *, + unsigned long journal_devnum); +static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + unsigned int); +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, + int sync); +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es); +static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es); +static int ext3_sync_fs(struct super_block *sb, int wait); +static const char *ext3_decode_error(struct super_block * sb, int errno, + char nbuf[16]); +static int ext3_remount (struct super_block * sb, int * flags, char * data); +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); +static int ext3_unfreeze(struct super_block *sb); +static int ext3_freeze(struct super_block *sb); + +/* + * Wrappers for journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) +{ + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + /* Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. */ + journal = EXT3_SB(sb)->s_journal; + if (is_journal_aborted(journal)) { + ext3_abort(sb, __func__, + "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + + return journal_start(journal, nblocks); +} + +/* + * The only special thing we need to do here is to make sure that all + * journal_stop calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +int __ext3_journal_stop(const char *where, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext3_std_error(sb, where, err); + return err; +} + +void ext3_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext3_decode_error(NULL, err, nbuf); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n", + caller, errstr, err_fn); + + journal_abort_handle(handle); +} + +void ext3_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + + va_end(args); +} + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext3, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will complain about + * that error until we've noted it down and cleared it. + */ + +static void ext3_handle_error(struct super_block *sb) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + + if (sb->s_flags & MS_RDONLY) + return; + + if (!test_opt (sb, ERRORS_CONT)) { + journal_t *journal = EXT3_SB(sb)->s_journal; + + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + if (journal) + journal_abort(journal, -EIO); + } + if (test_opt (sb, ERRORS_RO)) { + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } + ext3_commit_super(sb, es, 1); + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3-fs (%s): panic forced after error\n", + sb->s_id); +} + +void ext3_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + ext3_handle_error(sb); +} + +static const char *ext3_decode_error(struct super_block * sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext3_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext3_std_error (struct super_block * sb, const char * function, + int errno) +{ + char nbuf[16]; + const char *errstr; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && + (sb->s_flags & MS_RDONLY)) + return; + + errstr = ext3_decode_error(sb, errno, nbuf); + ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr); + + ext3_handle_error(sb); +} + +/* + * ext3_abort is a much stronger failure handler than ext3_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. 
+ * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + +void ext3_abort(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3-fs: panic from previous error\n"); + + if (sb->s_flags & MS_RDONLY) + return; + + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + sb->s_flags |= MS_RDONLY; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + if (EXT3_SB(sb)->s_journal) + journal_abort(EXT3_SB(sb)->s_journal, -EIO); +} + +void ext3_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); +} + +void ext3_update_dynamic_rev(struct super_block *sb) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) + return; + + ext3_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", + EXT3_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. 
+ */ +} + +/* + * Open the external journal device + */ +static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + ext3_msg(sb, "error: failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + + return NULL; +} + +/* + * Release the journal device + */ +static int ext3_blkdev_put(struct block_device *bdev) +{ + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +static int ext3_blkdev_remove(struct ext3_sb_info *sbi) +{ + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext3_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } + return ret; +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) +{ + struct list_head *l; + + ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + ext3_msg(sb, KERN_ERR, "sb_info orphan list:"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + ext3_msg(sb, KERN_ERR, " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +static void ext3_put_super (struct super_block * sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int i, err; + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + ext3_xattr_put_super(sb); + err = journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext3_abort(sb, __func__, "Couldn't clean up the journal"); + + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + BUFFER_TRACE(sbi->s_sbh, "marking dirty"); + mark_buffer_dirty(sbi->s_sbh); + ext3_commit_super(sb, es, 1); + } + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. 
+ */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev); + ext3_blkdev_remove(sbi); + } + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +} + +static struct kmem_cache *ext3_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext3_alloc_inode(struct super_block *sb) +{ + struct ext3_inode_info *ei; + + ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; + atomic_set(&ei->i_datasync_tid, 0); + atomic_set(&ei->i_sync_tid, 0); + return &ei->vfs_inode; +} + +static void ext3_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + +static void ext3_destroy_inode(struct inode *inode) +{ + if (!list_empty(&(EXT3_I(inode)->i_orphan))) { + printk("EXT3 Inode %p: orphan list check failed!\n", + EXT3_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT3_I(inode), sizeof(struct ext3_inode_info), + false); + dump_stack(); + } + call_rcu(&inode->i_rcu, ext3_i_callback); +} + +static void init_once(void *foo) +{ + struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; + + INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT3_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + mutex_init(&ei->truncate_mutex); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", + sizeof(struct ext3_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext3_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ext3_inode_cachep); +} + +static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (test_opt(sb, USRQUOTA)) + seq_puts(seq, ",usrquota"); + + if (test_opt(sb, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif +} + +static char *data_mode_string(unsigned long mode) +{ + switch (mode) { + case EXT3_MOUNT_JOURNAL_DATA: + return "journal"; + case EXT3_MOUNT_ORDERED_DATA: + return "ordered"; + case EXT3_MOUNT_WRITEBACK_DATA: + return "writeback"; + } + return "unknown"; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + unsigned long def_mount_opts; + + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + + if (sbi->s_sb_block != 1) + seq_printf(seq, ",sb=%lu", sbi->s_sb_block); + if (test_opt(sb, MINIX_DF)) + seq_puts(seq, ",minixdf"); + if (test_opt(sb, GRPID)) + seq_puts(seq, ",grpid"); + if (!test_opt(sb, GRPID) && (def_mount_opts & 
EXT3_DEFM_BSDGROUPS)) + seq_puts(seq, ",nogrpid"); + if (sbi->s_resuid != EXT3_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) { + seq_printf(seq, ",resuid=%u", sbi->s_resuid); + } + if (sbi->s_resgid != EXT3_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) { + seq_printf(seq, ",resgid=%u", sbi->s_resgid); + } + if (test_opt(sb, ERRORS_RO)) { + int def_errors = le16_to_cpu(es->s_errors); + + if (def_errors == EXT3_ERRORS_PANIC || + def_errors == EXT3_ERRORS_CONTINUE) { + seq_puts(seq, ",errors=remount-ro"); + } + } + if (test_opt(sb, ERRORS_CONT)) + seq_puts(seq, ",errors=continue"); + if (test_opt(sb, ERRORS_PANIC)) + seq_puts(seq, ",errors=panic"); + if (test_opt(sb, NO_UID32)) + seq_puts(seq, ",nouid32"); + if (test_opt(sb, DEBUG)) + seq_puts(seq, ",debug"); + if (test_opt(sb, OLDALLOC)) + seq_puts(seq, ",oldalloc"); +#ifdef CONFIG_EXT3_FS_XATTR + if (test_opt(sb, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + if (!test_opt(sb, XATTR_USER) && + (def_mount_opts & EXT3_DEFM_XATTR_USER)) { + seq_puts(seq, ",nouser_xattr"); + } +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + if (test_opt(sb, POSIX_ACL)) + seq_puts(seq, ",acl"); + if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL)) + seq_puts(seq, ",noacl"); +#endif + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ",noreservation"); + if (sbi->s_commit_interval) { + seq_printf(seq, ",commit=%u", + (unsigned) (sbi->s_commit_interval / HZ)); + } + + /* + * Always display barrier state so it's clear what the status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); + seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + + ext3_show_quota_options(seq, sb); + + return 0; +} + + +static struct inode *ext3_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * + * ext3_read_inode will return a bad_inode if the inode had been + * deleted, so we should be safe. + * + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ + inode = ext3_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext3_nfs_get_inode); +} + +static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext3_nfs_get_inode); +} + +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd layer's try_to_free_buffers() function to release them. 
+ */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") +#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) + +static int ext3_write_dquot(struct dquot *dquot); +static int ext3_acquire_dquot(struct dquot *dquot); +static int ext3_release_dquot(struct dquot *dquot); +static int ext3_mark_dquot_dirty(struct dquot *dquot); +static int ext3_write_info(struct super_block *sb, int type); +static int ext3_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); +static int ext3_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext3_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + +static const struct dquot_operations ext3_quota_operations = { + .write_dquot = ext3_write_dquot, + .acquire_dquot = ext3_acquire_dquot, + .release_dquot = ext3_release_dquot, + .mark_dirty = ext3_mark_dquot_dirty, + .write_info = ext3_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops ext3_qctl_operations = { + .quota_on = ext3_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +#endif + +static const struct super_operations ext3_sops = { + .alloc_inode = ext3_alloc_inode, + .destroy_inode = ext3_destroy_inode, + .write_inode = ext3_write_inode, + .dirty_inode = ext3_dirty_inode, + .evict_inode = ext3_evict_inode, + .put_super = ext3_put_super, + .sync_fs = ext3_sync_fs, + .freeze_fs = ext3_freeze, + .unfreeze_fs = ext3_unfreeze, + .statfs = ext3_statfs, + .remount_fs = ext3_remount, + .show_options = ext3_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext3_quota_read, + .quota_write = ext3_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + +static const struct export_operations ext3_export_ops = { + .fh_to_dentry = ext3_fh_to_dentry, + .fh_to_parent = ext3_fh_to_parent, + .get_parent = ext3_get_parent, +}; + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, + Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, 
"resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_nocheck, "nocheck"}, + {Opt_nocheck, "check=none"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, + {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_grpquota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_resize, "resize"}, + {Opt_err, NULL}, +}; + +static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) +{ + ext3_fsblk_t sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + options += 3; + /*todo: use simple_strtoll with >32bit ext3 */ + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + ext3_msg(sb, "error: invalid sb specification: %s", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + return sb_block; +} + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext3_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext3_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) { + + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + +static int parse_options (char *options, struct super_block *sb, + unsigned int *inum, unsigned long *journal_devnum, + ext3_fsblk_t *n_blocks_count, 
int is_remount) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; + substring_t args[MAX_OPT_ARGS]; + int data_opt = 0; + int option; +#ifdef CONFIG_QUOTA + int qfmt; +#endif + + if (!options) + return 1; + + while ((p = strsep (&options, ",")) != NULL) { + int token; + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + clear_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_minix_df: + set_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_grpid: + set_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_nogrpid: + clear_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resuid = option; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resgid = option; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; + case Opt_err_panic: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_RO); + set_opt (sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; + case Opt_debug: + set_opt (sbi->s_mount_opt, DEBUG); + break; + case Opt_oldalloc: + set_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_orlov: + clear_opt (sbi->s_mount_opt, OLDALLOC); + break; +#ifdef CONFIG_EXT3_FS_XATTR + case Opt_user_xattr: + set_opt (sbi->s_mount_opt, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt (sbi->s_mount_opt, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + ext3_msg(sb, KERN_INFO, + "(no)user_xattr options not supported"); + break; +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi->s_mount_opt, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi->s_mount_opt, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + ext3_msg(sb, KERN_INFO, + "(no)acl options not supported"); + break; +#endif + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_journal_update: + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. 
*/ + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); + break; + case Opt_journal_inum: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *inum = option; + break; + case Opt_journal_dev: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; + case Opt_commit: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = JBD_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * option; + break; + case Opt_data_journal: + data_opt = EXT3_MOUNT_JOURNAL_DATA; + goto datacheck; + case Opt_data_ordered: + data_opt = EXT3_MOUNT_ORDERED_DATA; + goto datacheck; + case Opt_data_writeback: + data_opt = EXT3_MOUNT_WRITEBACK_DATA; + datacheck: + if (is_remount) { + if (test_opt(sb, DATA_FLAGS) == data_opt) + break; + ext3_msg(sb, KERN_ERR, + "error: cannot change " + "data mode on remount. The filesystem " + "is mounted in data=%s mode and you " + "try to remount it in data=%s mode.", + data_mode_string(test_opt(sb, + DATA_FLAGS)), + data_mode_string(data_opt)); + return 0; + } else { + clear_opt(sbi->s_mount_opt, DATA_FLAGS); + sbi->s_mount_opt |= data_opt; + } + break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; +#ifdef CONFIG_QUOTA + case Opt_usrjquota: + if (!set_qf_name(sb, USRQUOTA, &args[0])) + return 0; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) + return 0; + break; + case Opt_offusrjquota: + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; + case Opt_offgrpjquota: + if (!clear_qf_name(sb, GRPQUOTA)) + return 0; + break; + case Opt_jqfmt_vfsold: + qfmt = QFMT_VFS_OLD; + goto set_qf_format; + case Opt_jqfmt_vfsv0: + qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; +set_qf_format: + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != qfmt) { + ext3_msg(sb, KERN_ERR, "error: cannot change " + "journaled quota options when " + "quota turned on."); + return 0; + } + sbi->s_jquota_fmt = qfmt; + break; + case Opt_quota: + case Opt_usrquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, GRPQUOTA); + break; + case Opt_noquota: + if (sb_any_quota_loaded(sb)) { + ext3_msg(sb, KERN_ERR, "error: cannot change " + "quota options when quota turned on."); + return 0; + } + clear_opt(sbi->s_mount_opt, QUOTA); + clear_opt(sbi->s_mount_opt, USRQUOTA); + clear_opt(sbi->s_mount_opt, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext3_msg(sb, KERN_ERR, + "error: quota options not supported."); + break; + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + ext3_msg(sb, KERN_ERR, + "error: journaled quota options not " + "supported."); + break; + case Opt_noquota: + break; +#endif + case Opt_abort: + set_opt(sbi->s_mount_opt, ABORT); + break; + case Opt_nobarrier: + 
clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_barrier: + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_ignore: + break; + case Opt_resize: + if (!is_remount) { + ext3_msg(sb, KERN_ERR, + "error: resize option only available " + "for remount"); + return 0; + } + if (match_int(&args[0], &option) != 0) + return 0; + *n_blocks_count = option; + break; + case Opt_nobh: + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated nobh option"); + break; + case Opt_bh: + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated bh option"); + break; + default: + ext3_msg(sb, KERN_ERR, + "error: unrecognized mount option \"%s\" " + "or missing value", p); + return 0; + } + } +#ifdef CONFIG_QUOTA + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi->s_mount_opt, USRQUOTA); + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi->s_mount_opt, GRPQUOTA); + + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { + ext3_msg(sb, KERN_ERR, "error: old and new quota " + "format mixing."); + return 0; + } + + if (!sbi->s_jquota_fmt) { + ext3_msg(sb, KERN_ERR, "error: journaled quota format " + "not specified."); + return 0; + } + } else { + if (sbi->s_jquota_fmt) { + ext3_msg(sb, KERN_ERR, "error: journaled quota format " + "specified with no journaling " + "enabled."); + return 0; + } + } +#endif + return 1; +} + +static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, + int read_only) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { + ext3_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT3_VALID_FS)) + ext3_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if ((sbi->s_mount_state & EXT3_ERROR_FS)) + ext3_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && + le16_to_cpu(es->s_mnt_count) >= + le16_to_cpu(es->s_max_mnt_count)) + ext3_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext3_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); +#if 0 + /* @@@ We _will_ want to clear the valid bit if we find + inconsistencies, to force a fsck at reboot. But for + a plain journaled filesystem we can keep it set as + valid forever! 
:) */ + es->s_state &= cpu_to_le16(~EXT3_VALID_FS); +#endif + if (!le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + + ext3_commit_super(sb, es, 1); + if (test_opt(sb, DEBUG)) + ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", + sb->s_blocksize, + sbi->s_groups_count, + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { + char b[BDEVNAME_SIZE]; + ext3_msg(sb, KERN_INFO, "using external journal on %s", + bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); + } else { + ext3_msg(sb, KERN_INFO, "using internal journal"); + } + cleancache_init_fs(sb); + return res; +} + +/* Called at mount-time, super-block is locked */ +static int ext3_check_descriptors(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int i; + + ext3_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL); + ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i); + ext3_fsblk_t last_block; + + if (i == sbi->s_groups_count - 1) + last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + last_block = first_block + + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + + if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || + le32_to_cpu(gdp->bg_block_bitmap) > last_block) + { + ext3_error (sb, "ext3_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || + le32_to_cpu(gdp->bg_inode_bitmap) > last_block) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < first_block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 > + last_block) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + } + + sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb)); + sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb)); + return 1; +} + + +/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext3_free_inode(). 
The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext3_orphan_cleanup (struct super_block * sb, + struct ext3_super_block * es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, skipping orphan cleanup."); + return; + } + + /* Check if feature set allows readwrite operations */ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { + ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (EXT3_SB(sb)->s_qf_names[i]) { + int ret = ext3_quota_on_mount(sb, i); + if (ret < 0) + ext3_msg(sb, KERN_ERR, + "error: cannot turn on journaled " + "quota: %d", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + dquot_initialize(inode); + if (inode->i_nlink) { + printk(KERN_DEBUG + "%s: truncating inode %lu to %Ld bytes\n", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %Ld bytes\n", + inode->i_ino, inode->i_size); + ext3_truncate(inode); + nr_truncates++; + } else { + printk(KERN_DEBUG + "%s: deleting unreferenced inode %lu\n", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x)==1) ? "" : "s" + + if (nr_orphans) + ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + +/* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. 
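[Illustrative aside, not part of the patch] The arithmetic in ext3_max_size() below can be reproduced in a few lines of user-space C to make the limits concrete. This is only a sketch: EXT3_NDIR_BLOCKS is taken as 12 direct blocks and the kernel's final MAX_LFS_FILESIZE clamp is omitted. It shows that with 1 KiB blocks the indirect-block tree is the limiting factor (about 16 GiB), while with 4 KiB blocks the 2^32 sector limit in i_blocks dominates (about 2 TiB).

#include <stdio.h>

#define NDIR_BLOCKS 12                       /* direct blocks in the inode */

static long long max_size(int bits)          /* bits = log2(block size) */
{
        long long res = NDIR_BLOCKS;
        long long upper = (1LL << 32) - 1;   /* i_blocks: 2^32-1 512-byte sectors */
        long long meta;

        upper >>= (bits - 9);                /* sectors -> filesystem blocks */
        meta  = 1;                                                   /* indirect */
        meta += 1 + (1LL << (bits - 2));                             /* double indirect */
        meta += 1 + (1LL << (bits - 2)) + (1LL << (2 * (bits - 2))); /* triple indirect */
        upper = (upper - meta) << bits;      /* data bytes allowed by the sector limit */

        res += 1LL << (bits - 2);            /* blocks reachable via single indirection */
        res += 1LL << (2 * (bits - 2));      /* ... double indirection */
        res += 1LL << (3 * (bits - 2));      /* ... triple indirection */
        res <<= bits;                        /* blocks -> bytes */
        return res < upper ? res : upper;    /* whichever limit is hit first */
}

int main(void)
{
        int bits;

        for (bits = 10; bits <= 12; bits++)  /* 1 KiB, 2 KiB, 4 KiB blocks */
                printf("block size %d: max file size %lld bytes\n",
                       1 << bits, max_size(bits));
        return 0;
}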
+ */ +static loff_t ext3_max_size(int bits) +{ + loff_t res = EXT3_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + + /* This is calculated to be the largest file size for a + * dense, file such that the total number of + * sectors in the file, including data and all indirect blocks, + * does not exceed 2^32 -1 + * __u32 i_blocks representing the total number of + * 512 bytes blocks of the file + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* tripple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static ext3_fsblk_t descriptor_loc(struct super_block *sb, + ext3_fsblk_t logic_sb_block, + int nr) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return (logic_sb_block + nr + 1); + bg = sbi->s_desc_per_block * nr; + if (ext3_bg_has_super(sb, bg)) + has_super = 1; + return (has_super + ext3_group_first_block_no(sb, bg)); +} + + +static int ext3_fill_super (struct super_block *sb, void *data, int silent) +{ + struct buffer_head * bh; + struct ext3_super_block *es = NULL; + struct ext3_sb_info *sbi; + ext3_fsblk_t block; + ext3_fsblk_t sb_block = get_sb_block(&data, sb); + ext3_fsblk_t logic_sb_block; + unsigned long offset = 0; + unsigned int journal_inum = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; + int hblock; + int db_count; + int i; + int needs_recovery; + int ret = -EINVAL; + __le32 features; + int err; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } + sb->s_fs_info = sbi; + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; + sbi->s_sb_block = sb_block; + + blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); + if (!blocksize) { + ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); + goto out_fail; + } + + /* + * The ext3 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. 
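[Illustrative aside, not part of the patch] A quick numeric check of the offset calculation that follows, assuming the default superblock location sb_block = 1 and a 4 KiB filesystem block size: the superblock always starts at byte 1024 * sb_block on disk (EXT3_MIN_BLOCK_SIZE is 1024), so here it lands in logical block 0 at byte offset 1024 within that block.

#include <stdio.h>

int main(void)
{
        unsigned long sb_block = 1;                 /* default superblock location */
        unsigned long blocksize = 4096;             /* example: 4 KiB blocks */
        unsigned long logic_sb_block = (sb_block * 1024) / blocksize;
        unsigned long offset = (sb_block * 1024) % blocksize;

        /* prints "superblock in block 0 at offset 1024" */
        printf("superblock in block %lu at offset %lu\n", logic_sb_block, offset);
        return 0;
}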
+ */ + if (blocksize != EXT3_MIN_BLOCK_SIZE) { + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + } else { + logic_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logic_sb_block))) { + ext3_msg(sb, KERN_ERR, "error: unable to read superblock"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext3 macro-instructions depend on its value + */ + es = (struct ext3_super_block *) (bh->b_data + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT3_SUPER_MAGIC) + goto cantfind_ext3; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + if (def_mount_opts & EXT3_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + if (def_mount_opts & EXT3_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + if (def_mount_opts & EXT3_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT3_FS_XATTR + if (def_mount_opts & EXT3_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + if (def_mount_opts & EXT3_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) + set_opt(sbi->s_mount_opt, ORDERED_DATA); + else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE) + set_opt(sbi->s_mount_opt, ERRORS_CONT); + else + set_opt(sbi->s_mount_opt, ERRORS_RO); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + + set_opt(sbi->s_mount_opt, RESERVATION); + + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && + (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext3_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. 
+ */ + features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); + if (features) { + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "optional features (%x)", le32_to_cpu(features)); + goto failed_mount; + } + features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); + if (!(sb->s_flags & MS_RDONLY) && features) { + ext3_msg(sb, KERN_ERR, + "error: couldn't mount RDWR because of unsupported " + "optional features (%x)", le32_to_cpu(features)); + goto failed_mount; + } + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize < EXT3_MIN_BLOCK_SIZE || + blocksize > EXT3_MAX_BLOCK_SIZE) { + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "filesystem blocksize %d", blocksize); + goto failed_mount; + } + + hblock = bdev_logical_block_size(sb->s_bdev); + if (sb->s_blocksize != blocksize) { + /* + * Make sure the blocksize for the filesystem is larger + * than the hardware sectorsize for the machine. + */ + if (blocksize < hblock) { + ext3_msg(sb, KERN_ERR, + "error: fsblocksize %d too small for " + "hardware sectorsize %d", blocksize, hblock); + goto failed_mount; + } + + brelse (bh); + if (!sb_set_blocksize(sb, blocksize)) { + ext3_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); + goto out_fail; + } + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + bh = sb_bread(sb, logic_sb_block); + if (!bh) { + ext3_msg(sb, KERN_ERR, + "error: can't read superblock on 2nd try"); + goto failed_mount; + } + es = (struct ext3_super_block *)(bh->b_data + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { + ext3_msg(sb, KERN_ERR, + "error: magic mismatch"); + goto failed_mount; + } + } + + sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { + sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext3_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + } + sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (blocksize != sbi->s_frag_size) { + ext3_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %u (unsupported)", + sbi->s_frag_size, blocksize); + goto failed_mount; + } + sbi->s_frags_per_block = 1; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext3; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb)); + for (i=0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + i = 
le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + + if (sbi->s_blocks_per_group > blocksize * 8) { + ext3_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > blocksize * 8) { + ext3_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + ext3_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + err = generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count)); + if (err) { + ext3_msg(sb, KERN_ERR, + "error: filesystem is too large to mount safely"); + if (sizeof(sector_t) < 8) + ext3_msg(sb, KERN_ERR, + "error: CONFIG_LBDAF not enabled"); + ret = err; + goto failed_mount; + } + + if (EXT3_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT3_BLOCKS_PER_GROUP(sb)) + 1; + db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb)); + sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + ext3_msg(sb, KERN_ERR, + "error: not enough memory"); + ret = -ENOMEM; + goto failed_mount; + } + + bgl_lock_init(sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logic_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + ext3_msg(sb, KERN_ERR, + "error: can't read group descriptor %d", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext3_check_descriptors (sb)) { + ext3_msg(sb, KERN_ERR, + "error: group descriptors corrupted"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + /* per fileystem reservation list head & lock */ + spin_lock_init(&sbi->s_rsv_window_lock); + sbi->s_rsv_window_root = RB_ROOT; + /* Add a single, static dummy reservation to the start of the + * reservation window list --- it gives us a placeholder for + * append-at-start-of-list which makes the allocation logic + * _much_ simpler. */ + sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_alloc_hit = 0; + sbi->s_rsv_window_head.rsv_goal_size = 0; + ext3_rsv_window_add(sb, &sbi->s_rsv_window_head); + + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext3_sops; + sb->s_export_op = &ext3_export_ops; + sb->s_xattr = ext3_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->s_qcop = &ext3_qctl_operations; + sb->dq_op = &ext3_quota_operations; +#endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_RECOVER)); + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! 
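[Illustrative aside, not part of the patch] The branch taken immediately below decides where the journal comes from. A minimal user-space sketch of that decision, with made-up names, reads as follows: prefer an existing journal unless "noload" was given, fall back to creating one when a journal inode number was passed as a mount option, and otherwise fail because the filesystem is probably plain ext2.

#include <stdio.h>

enum journal_source { JOURNAL_NONE, JOURNAL_LOAD, JOURNAL_CREATE };

/* Hypothetical helper mirroring the if/else chain below. */
static enum journal_source pick_journal(int has_journal_feature, int noload,
                                        unsigned int journal_inum_opt)
{
        if (!noload && has_journal_feature)
                return JOURNAL_LOAD;     /* replay the existing journal */
        if (journal_inum_opt)
                return JOURNAL_CREATE;   /* journal=<inum> given at mount time */
        return JOURNAL_NONE;             /* no journal: refuse the ext3 mount */
}

int main(void)
{
        /* e.g. an ext2 image mounted with journal=8: prints 2 (JOURNAL_CREATE) */
        printf("%d\n", pick_journal(0, 0, 8));
        return 0;
}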
+ */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext3_load_journal(sb, es, journal_devnum)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) + goto failed_mount2; + } else { + if (!silent) + ext3_msg(sb, KERN_ERR, + "error: no journal found. " + "mounting ext3 over ext2?"); + goto failed_mount2; + } + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext3_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext3_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + } + if (err) { + ext3_msg(sb, KERN_ERR, "error: insufficient memory"); + ret = err; + goto failed_mount3; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + capabilities: ORDERED_DATA if the journal can + cope, else JOURNAL_DATA */ + if (journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) + set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE); + else + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + break; + + case EXT3_MOUNT_ORDERED_DATA: + case EXT3_MOUNT_WRITEBACK_DATA: + if (!journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { + ext3_msg(sb, KERN_ERR, + "error: journal does not support " + "requested data journaling mode"); + goto failed_mount3; + } + default: + break; + } + + /* + * The journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. + */ + + root = ext3_iget(sb, EXT3_ROOT_INO); + if (IS_ERR(root)) { + ext3_msg(sb, KERN_ERR, "error: get root inode failed"); + ret = PTR_ERR(root); + goto failed_mount3; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); + goto failed_mount3; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); + iput(root); + ret = -ENOMEM; + goto failed_mount3; + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); + + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; + ext3_orphan_cleanup(sb, es); + EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; + if (needs_recovery) + ext3_msg(sb, KERN_INFO, "recovery complete"); + ext3_mark_recovery_complete(sb, es); + ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + + return 0; + +cantfind_ext3: + if (!silent) + ext3_msg(sb, KERN_INFO, + "error: can't find ext3 filesystem on dev %s.", + sb->s_id); + goto failed_mount; + +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + journal_destroy(sbi->s_journal); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); +failed_mount: +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext3_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); + return ret; +} + +/* + * Setup any per-fs journal parameters now. 
We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_commit_interval) + journal->j_commit_interval = sbi->s_commit_interval; + /* We could also set up an ext3-specific default for the commit + * interval here, but for now we'll just fall back to the jbd + * default. */ + + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; + spin_unlock(&journal->j_state_lock); +} + +static journal_t *ext3_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. */ + + journal_inode = ext3_iget(sb, journal_inum); + if (IS_ERR(journal_inode)) { + ext3_msg(sb, KERN_ERR, "error: no journal found"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + ext3_msg(sb, KERN_ERR, "error: journal inode is deleted"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + journal_inode, journal_inode->i_size); + if (!S_ISREG(journal_inode->i_mode)) { + ext3_msg(sb, KERN_ERR, "error: invalid journal inode"); + iput(journal_inode); + return NULL; + } + + journal = journal_init_inode(journal_inode); + if (!journal) { + ext3_msg(sb, KERN_ERR, "error: could not load journal inode"); + iput(journal_inode); + return NULL; + } + journal->j_private = sb; + ext3_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext3_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head * bh; + journal_t *journal; + ext3_fsblk_t start; + ext3_fsblk_t len; + int hblock, blocksize; + ext3_fsblk_t sb_block; + unsigned long offset; + struct ext3_super_block * es; + struct block_device *bdev; + + bdev = ext3_blkdev_get(j_dev, sb); + if (bdev == NULL) + return NULL; + + blocksize = sb->s_blocksize; + hblock = bdev_logical_block_size(bdev); + if (blocksize < hblock) { + ext3_msg(sb, KERN_ERR, + "error: blocksize too small for journal device"); + goto out_bdev; + } + + sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; + offset = EXT3_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of " + "external journal"); + goto out_bdev; + } + + es = (struct ext3_super_block *) (bh->b_data + offset); + if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { + ext3_msg(sb, KERN_ERR, "error: external journal has " + "bad superblock"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + ext3_msg(sb, KERN_ERR, "error: journal UUID does not match"); + brelse(bh); + goto out_bdev; + } + + len = le32_to_cpu(es->s_blocks_count); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + 
ext3_msg(sb, KERN_ERR, + "error: failed to create device journal"); + goto out_bdev; + } + journal->j_private = sb; + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + ext3_msg(sb, KERN_ERR, + "error: external journal has more than one " + "user (unsupported) - %d", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT3_SB(sb)->journal_bdev = bdev; + ext3_init_journal_params(sb, journal); + return journal; +out_journal: + journal_destroy(journal); +out_bdev: + ext3_blkdev_put(bdev); + return NULL; +} + +static int ext3_load_journal(struct super_block *sb, + struct ext3_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + ext3_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + ext3_msg(sb, KERN_INFO, + "recovery required on readonly filesystem"); + if (really_read_only) { + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, cannot proceed"); + return -EROFS; + } + ext3_msg(sb, KERN_INFO, + "write access will be enabled during recovery"); + } + } + + if (journal_inum && journal_dev) { + ext3_msg(sb, KERN_ERR, "error: filesystem has both journal " + "and inode journals"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext3_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext3_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (!(journal->j_flags & JFS_BARRIER)) + printk(KERN_INFO "EXT3-fs: barriers not enabled\n"); + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = journal_update_format(journal); + if (err) { + ext3_msg(sb, KERN_ERR, "error updating journal"); + journal_destroy(journal); + return err; + } + } + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) + err = journal_wipe(journal, !really_read_only); + if (!err) + err = journal_load(journal); + + if (err) { + ext3_msg(sb, KERN_ERR, "error loading journal"); + journal_destroy(journal); + return err; + } + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); + + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + + /* Make sure we flush the recovery flag to disk. 
*/ + ext3_commit_super(sb, es, 1); + } + + return 0; +} + +static int ext3_create_journal(struct super_block *sb, + struct ext3_super_block *es, + unsigned int journal_inum) +{ + journal_t *journal; + int err; + + if (sb->s_flags & MS_RDONLY) { + ext3_msg(sb, KERN_ERR, + "error: readonly filesystem when trying to " + "create journal"); + return -EROFS; + } + + journal = ext3_get_journal(sb, journal_inum); + if (!journal) + return -EINVAL; + + ext3_msg(sb, KERN_INFO, "creating new journal on inode %u", + journal_inum); + + err = journal_create(journal); + if (err) { + ext3_msg(sb, KERN_ERR, "error creating journal"); + journal_destroy(journal); + return -EIO; + } + + EXT3_SB(sb)->s_journal = journal; + + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); + + es->s_journal_inum = cpu_to_le32(journal_inum); + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + + return 0; +} + +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, + int sync) +{ + struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; + int error = 0; + + if (!sbh) + return error; + + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext3_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); + es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); + BUFFER_TRACE(sbh, "marking dirty"); + mark_buffer_dirty(sbh); + if (sync) { + error = sync_dirty_buffer(sbh); + if (buffer_write_io_error(sbh)) { + ext3_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } + return error; +} + + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. 
+ */ +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + journal_lock_updates(journal); + if (journal_flush(journal) < 0) + goto out; + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, es, 1); + } + +out: + journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ +static void ext3_clear_journal_err(struct super_block *sb, + struct ext3_super_block *es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + journal = EXT3_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext3_error() or ext3_abort() + */ + + j_errno = journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext3_decode_error(sb, j_errno, nbuf); + ext3_warning(sb, __func__, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext3_warning(sb, __func__, "Marking fs in need of " + "filesystem check."); + + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + ext3_commit_super (sb, es, 1); + + journal_clear_err(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ +int ext3_force_commit(struct super_block *sb) +{ + journal_t *journal; + int ret; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT3_SB(sb)->s_journal; + ret = ext3_journal_force_commit(journal); + return ret; +} + +static int ext3_sync_fs(struct super_block *sb, int wait) +{ + tid_t target; + + if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { + if (wait) + log_wait_commit(EXT3_SB(sb)->s_journal, target); + } + return 0; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + */ +static int ext3_freeze(struct super_block *sb) +{ + int error = 0; + journal_t *journal; + + if (!(sb->s_flags & MS_RDONLY)) { + journal = EXT3_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + journal_lock_updates(journal); + + /* + * We don't want to clear needs_recovery flag when we failed + * to flush the journal. + */ + error = journal_flush(journal); + if (error < 0) + goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + if (error) + goto out; + } + return 0; + +out: + journal_unlock_updates(journal); + return error; +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static int ext3_unfreeze(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) { + lock_super(sb); + /* Reser the needs_recovery flag before the fs is unlocked. 
*/ + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + unlock_super(sb); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + } + return 0; +} + +static int ext3_remount (struct super_block * sb, int * flags, char * data) +{ + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); + ext3_fsblk_t n_blocks_count = 0; + unsigned long old_sb_flags; + struct ext3_mount_options old_opts; + int enable_quota = 0; + int err; +#ifdef CONFIG_QUOTA + int i; +#endif + + /* Store the original options */ + lock_super(sb); + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) + old_opts.s_qf_names[i] = sbi->s_qf_names[i]; +#endif + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + err = -EINVAL; + goto restore_opts; + } + + if (test_opt(sb, ABORT)) + ext3_abort(sb, __func__, "Abort forced by user"); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + + ext3_init_journal_params(sb, sbi->s_journal); + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || + n_blocks_count > le32_to_cpu(es->s_blocks_count)) { + if (test_opt(sb, ABORT)) { + err = -EROFS; + goto restore_opts; + } + + if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && + (sbi->s_mount_state & EXT3_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + ext3_mark_recovery_complete(sb, es); + } else { + __le32 ret; + if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, + ~EXT3_FEATURE_RO_COMPAT_SUPP))) { + ext3_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR " + "because of unsupported optional " + "features (%x)", le32_to_cpu(ret)); + err = -EROFS; + goto restore_opts; + } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount/remount for now. + */ + if (es->s_last_orphan) { + ext3_msg(sb, KERN_WARNING, "warning: couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount/remount instead."); + err = -EINVAL; + goto restore_opts; + } + + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) 
+ */ + ext3_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if ((err = ext3_group_extend(sb, es, n_blocks_count))) + goto restore_opts; + if (!ext3_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + enable_quota = 1; + } + } +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + if (old_opts.s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(old_opts.s_qf_names[i]); +#endif + unlock_super(sb); + + if (enable_quota) + dquot_resume(sb, -1); + return 0; +restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + } +#endif + unlock_super(sb); + return err; +} + +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + u64 fsid; + + if (test_opt(sb, MINIX_DF)) { + sbi->s_overhead_last = 0; + } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { + unsigned long ngroups = sbi->s_groups_count, i; + ext3_fsblk_t overhead = 0; + smp_rmb(); + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < ngroups; i++) { + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); + cond_resched(); + } + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += ngroups * (2 + sbi->s_itb_per_group); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); + } + + buf->f_type = EXT3_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); + buf->f_namelen = EXT3_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction before quota file + * is locked for write. 
Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext3_create() quota_sync() + * journal_start() write_dquot() + * dquot_initialize() down(dqio_mutex) + * down(dqio_mutex) journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +} + +static int ext3_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext3_journal_start(inode, + EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3_journal_start(dquot_to_inode(dquot), + EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3_journal_start(dquot_to_inode(dquot), + EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + return PTR_ERR(handle); + } + ret = dquot_release(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? */ + if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext3_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext3_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext3_journal_start(sb->s_root->d_inode, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext3_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], + EXT3_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int ext3_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->mnt->mnt_sb != sb) + return -EXDEV; + /* Journaling quota? */ + if (EXT3_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext3_msg(sb, KERN_WARNING, + "warning: Quota file not on filesystem root. " + "Journaled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext3_should_journal_data(path->dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... 
+ */ + journal_lock_updates(EXT3_SB(sb)->s_journal); + err = journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err) + return err; + } + + return dquot_quota_on(sb, type, format_id, path); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + bh = ext3_bread(NULL, inode, blk, 0, &err); + if (err) + return err; + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext3_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + if (!handle) { + ext3_msg(sb, KERN_WARNING, + "warning: quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started.", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. 
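+ * As a concrete numerical illustration (the values are hypothetical): with
+ * a 1024-byte block size, a write of len = 8 at off = 1020 yields
+ * offset = 1020, so sb->s_blocksize - offset = 4 < len and the request
+ * would straddle two blocks; it is rejected with -EIO below. A write of
+ * len = 4 at the same offset still fits within the block and proceeds.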
+ */ + if (sb->s_blocksize - offset < len) { + ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + bh = ext3_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; + } + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); +out: + if (err) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off + len) { + i_size_write(inode, off + len); + EXT3_I(inode)->i_disksize = inode->i_size; + } + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + mutex_unlock(&inode->i_mutex); + return len; +} + +#endif + +static struct dentry *ext3_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super); +} + +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .mount = ext3_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_ext3_fs(void) +{ + int err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ext3_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + exit_ext3_xattr(); + return err; +} + +static void __exit exit_ext3_fs(void) +{ + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +MODULE_LICENSE("GPL"); +module_init(init_ext3_fs) +module_exit(exit_ext3_fs) diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c new file mode 100644 index 00000000..7c489820 --- /dev/null +++ b/fs/ext3/symlink.c @@ -0,0 +1,56 @@ +/* + * linux/fs/ext3/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 symlink handling code + */ + +#include +#include +#include +#include +#include "xattr.h" + +static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); + nd_set_link(nd, (char*)ei->i_data); + return NULL; +} + +const struct inode_operations ext3_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext3_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext3_follow_link, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c new file mode 100644 index 00000000..d565759d --- /dev/null +++ b/fs/ext3/xattr.c @@ -0,0 +1,1336 @@ +/* + * linux/fs/ext3/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * Fix by Harrison Xing . + * Ext3 code with a lot of help from Eric Jarman . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * ea-in-inode support by Alex Tomas aka bzzz + * and Andreas Gruenbacher . + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. 
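+ *
+ * Rough worked example of the space accounting (the 4-character name and
+ * 10-byte value are purely illustrative): an entry descriptor occupies
+ * sizeof(struct ext3_xattr_entry) = 16 bytes plus the name, rounded up to
+ * a 4-byte boundary, so EXT3_XATTR_LEN(4) = (4 + 3 + 16) & ~3 = 20 bytes
+ * near the top of the block, while the value takes
+ * EXT3_XATTR_SIZE(10) = (10 + 3) & ~3 = 12 bytes packed at the end.
+ * ext3_xattr_set_entry() treats the gap between the last descriptor and
+ * the lowest value offset (minus the four terminating null bytes) as the
+ * free space available for new attributes.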
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define IHDR(inode, raw_inode) \ + ((struct ext3_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT3_GOOD_OLD_INODE_SIZE + \ + EXT3_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1)) + +#ifdef EXT3_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%lu: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) +#endif + +static void ext3_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext3_xattr_cache_find(struct inode *, + struct ext3_xattr_header *, + struct mb_cache_entry **); +static void ext3_xattr_rehash(struct ext3_xattr_header *, + struct ext3_xattr_entry *); +static int ext3_xattr_list(struct dentry *dentry, char *buffer, + size_t buffer_size); + +static struct mb_cache *ext3_xattr_cache; + +static const struct xattr_handler *ext3_xattr_handler_map[] = { + [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, +#ifdef CONFIG_EXT3_FS_POSIX_ACL + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler, +#endif + [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, +#ifdef CONFIG_EXT3_FS_SECURITY + [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext3_xattr_handlers[] = { + &ext3_xattr_user_handler, + &ext3_xattr_trusted_handler, +#ifdef CONFIG_EXT3_FS_POSIX_ACL + &ext3_xattr_acl_access_handler, + &ext3_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT3_FS_SECURITY + &ext3_xattr_security_handler, +#endif + NULL +}; + +static inline const struct xattr_handler * +ext3_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) + handler = ext3_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_mutex: don't care + */ +ssize_t +ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext3_xattr_list(dentry, buffer, size); +} + +static int +ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end) +{ + while (!IS_LAST_ENTRY(entry)) { + struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry); + if ((void *)next >= end) + return -EIO; + entry = next; + } + return 0; +} + +static inline int +ext3_xattr_check_block(struct buffer_head *bh) +{ + int error; + + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + return error; +} + +static inline int +ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 
0; +} + +static int +ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext3_xattr_entry *entry; + size_t name_len; + int cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; + if (!cmp && ext3_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +static int +ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; + size_t size; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3_xattr_check_block(bh)) { +bad_block: ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3_xattr_cache_insert(bh); + entry = BFIRST(bh); + error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +static int +ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_xattr_entry *entry; + struct ext3_inode *raw_inode; + struct ext3_iloc iloc; + size_t size; + void *end; + int error; + + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) + return -ENODATA; + error = ext3_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + error = ext3_xattr_check_names(entry, end); + if (error) + goto cleanup; + error = ext3_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. 
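+ *
+ * A minimal caller-side sketch of that convention (the attribute name
+ * "foo" is purely illustrative; error handling is omitted):
+ *
+ *	len = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, "foo", NULL, 0);
+ *	if (len > 0) {
+ *		buf = kmalloc(len, GFP_NOFS);
+ *		if (buf)
+ *			len = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER,
+ *					     "foo", buf, len);
+ *	}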
+ */ +int +ext3_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + down_read(&EXT3_I(inode)->xattr_sem); + error = ext3_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext3_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT3_I(inode)->xattr_sem); + return error; +} + +static int +ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext3_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +static int +ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3_xattr_check_block(bh)) { + ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3_xattr_cache_insert(bh); + error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct ext3_xattr_ibody_header *header; + struct ext3_inode *raw_inode; + struct ext3_iloc iloc; + void *end; + int error; + + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) + return 0; + error = ext3_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + error = ext3_xattr_check_names(IFIRST(header), end); + if (error) + goto cleanup; + error = ext3_xattr_list_entries(dentry, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. 
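+ *
+ * The filled buffer is a concatenation of null-terminated full attribute
+ * names, e.g. "user.foo\0security.selinux\0" (names illustrative only);
+ * as with ext3_xattr_get(), passing a NULL buffer simply returns the
+ * number of bytes such a list would require.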
+ */ +static int +ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int i_error, b_error; + + down_read(&EXT3_I(dentry->d_inode)->xattr_sem); + i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); + if (i_error < 0) { + b_error = 0; + } else { + if (buffer) { + buffer += i_error; + buffer_size -= i_error; + } + b_error = ext3_xattr_block_list(dentry, buffer, buffer_size); + if (b_error < 0) + i_error = 0; + } + up_read(&EXT3_I(dentry->d_inode)->xattr_sem); + return i_error + b_error; +} + +/* + * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext3_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) + return; + + if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement + * it; otherwise free the block. + */ +static void +ext3_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + int error = 0; + + ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr); + error = ext3_journal_get_write_access(handle, bh); + if (error) + goto out; + + lock_buffer(bh); + + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + ext3_free_blocks(handle, inode, bh->b_blocknr, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); + error = ext3_journal_dirty_metadata(handle, bh); + if (IS_SYNC(inode)) + handle->h_sync = 1; + dquot_free_block(inode, 1); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + if (ce) + mb_cache_entry_release(ce); + } + unlock_buffer(bh); +out: + ext3_std_error(inode->i_sb, error); + return; +} + +struct ext3_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext3_xattr_search { + struct ext3_xattr_entry *first; + void *base; + void *end; + struct ext3_xattr_entry *here; + int not_found; +}; + +static int +ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s) +{ + struct ext3_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT3_XATTR_SIZE(size); + } + free += EXT3_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT3_XATTR_SIZE(i->value_len) || + free < EXT3_XATTR_LEN(name_len) + + EXT3_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. 
*/ + size_t size = EXT3_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT3_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + memset(val + size - EXT3_XATTR_PAD, 0, + EXT3_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT3_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT3_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT3_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + memset(val + size - EXT3_XATTR_PAD, 0, + EXT3_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); + } + } + return 0; +} + +struct ext3_xattr_block_find { + struct ext3_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, + struct ext3_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT3_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext3_xattr_check_block(bs->bh)) { + ext3_error(sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. 
*/ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext3_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext3_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext3_xattr_info *i, + struct ext3_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext3_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error = 0; + +#define header(x) ((struct ext3_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + error = ext3_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext3_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext3_xattr_rehash(header(s->base), + s->here); + ext3_xattr_cache_insert(bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext3_journal_dirty_metadata(handle, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); + journal_release_buffer(handle, bs->bh); + + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext3_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext3_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. 
*/ + error = dquot_alloc_block(inode, 1); + if (error) + goto cleanup; + error = ext3_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext3_journal_dirty_metadata(handle, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext3_fsblk_t goal = ext3_group_first_block_no(sb, + EXT3_I(inode)->i_block_group); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + if (error) + goto cleanup; + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { +getblk_failed: + ext3_free_blocks(handle, inode, block, 1); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + error = ext3_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext3_xattr_cache_insert(new_bh); + error = ext3_journal_dirty_metadata(handle, new_bh); + if (error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext3_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + dquot_free_block(inode, 1); + goto cleanup; + +bad_block: + ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +struct ext3_xattr_ibody_find { + struct ext3_xattr_search s; + struct ext3_iloc iloc; +}; + +static int +ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, + struct ext3_xattr_ibody_find *is) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_inode *raw_inode; + int error; + + if (EXT3_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext3_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { + error = ext3_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; + /* Find the named attribute. 
*/ + error = ext3_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +static int +ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext3_xattr_info *i, + struct ext3_xattr_ibody_find *is) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_xattr_search *s = &is->s; + int error; + + if (EXT3_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext3_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext3_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); + ext3_set_inode_state(inode, EXT3_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext3_clear_inode_state(inode, EXT3_STATE_XATTR); + } + return 0; +} + +/* + * ext3_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext3_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext3_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext3_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT3_I(inode)->xattr_sem); + error = ext3_get_inode_loc(inode, &is.iloc); + if (error) + goto cleanup; + + error = ext3_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { + struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); + ext3_clear_inode_state(inode, EXT3_STATE_NEW); + } + + error = ext3_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext3_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + if (!value) { + if (!is.s.not_found) + error = ext3_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext3_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext3_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext3_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + if (EXT3_I(inode)->i_file_acl && !bs.s.base) { + error = ext3_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } + error = ext3_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = ext3_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext3_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = 
CURRENT_TIME_SEC; + error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext3_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + handle->h_sync = 1; + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + up_write(&EXT3_I(inode)->xattr_sem); + return error; +} + +/* + * ext3_xattr_set() + * + * Like ext3_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext3_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * ext3_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. + */ +void +ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + if (!bh) { + ext3_error(inode->i_sb, __func__, + "inode %lu: block "E3FSBLK" read error", inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + ext3_error(inode->i_sb, __func__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + } + ext3_xattr_release_block(handle, inode, bh); + EXT3_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext3_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext3_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + +/* + * ext3_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext3_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext3_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. 
+ */ +static int +ext3_xattr_cmp(struct ext3_xattr_header *header1, + struct ext3_xattr_header *header2) +{ + struct ext3_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT3_XATTR_NEXT(entry1); + entry2 = EXT3_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext3_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. + */ +static struct buffer_head * +ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext3_error(inode->i_sb, __func__, + "inode %lu: block %lu read error", + inode->i_ino, (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT3_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %lu refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT3_XATTR_REFCOUNT_MAX); + } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext3_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, + struct ext3_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n=0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext3_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. 
+ */ +static void ext3_xattr_rehash(struct ext3_xattr_header *header, + struct ext3_xattr_entry *entry) +{ + struct ext3_xattr_entry *here; + __u32 hash = 0; + + ext3_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT3_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +init_ext3_xattr(void) +{ + ext3_xattr_cache = mb_cache_create("ext3_xattr", 6); + if (!ext3_xattr_cache) + return -ENOMEM; + return 0; +} + +void +exit_ext3_xattr(void) +{ + if (ext3_xattr_cache) + mb_cache_destroy(ext3_xattr_cache); + ext3_xattr_cache = NULL; +} diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h new file mode 100644 index 00000000..2be4f69b --- /dev/null +++ b/fs/ext3/xattr.h @@ -0,0 +1,138 @@ +/* + File: fs/ext3/xattr.h + + On-disk format of extended attributes for the ext3 filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +/* Magic value in attribute blocks */ +#define EXT3_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT3_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT3_XATTR_INDEX_USER 1 +#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT3_XATTR_INDEX_TRUSTED 4 +#define EXT3_XATTR_INDEX_LUSTRE 5 +#define EXT3_XATTR_INDEX_SECURITY 6 + +struct ext3_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext3_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext3_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT3_XATTR_PAD_BITS 2 +#define EXT3_XATTR_PAD (1<e_name_len)) ) +#define EXT3_XATTR_SIZE(size) \ + (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) + +# ifdef CONFIG_EXT3_FS_XATTR + +extern const struct xattr_handler ext3_xattr_user_handler; +extern const struct xattr_handler ext3_xattr_trusted_handler; +extern const struct xattr_handler ext3_xattr_acl_access_handler; +extern const struct xattr_handler ext3_xattr_acl_default_handler; +extern const struct xattr_handler ext3_xattr_security_handler; + +extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); + +extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext3_xattr_delete_inode(handle_t *, struct inode *); +extern void ext3_xattr_put_super(struct super_block *); + +extern int init_ext3_xattr(void); +extern void exit_ext3_xattr(void); + +extern const struct xattr_handler *ext3_xattr_handlers[]; + +# else /* CONFIG_EXT3_FS_XATTR */ + +static inline int 
+ext3_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext3_xattr_put_super(struct super_block *sb) +{ +} + +static inline int +init_ext3_xattr(void) +{ + return 0; +} + +static inline void +exit_ext3_xattr(void) +{ +} + +#define ext3_xattr_handlers NULL + +# endif /* CONFIG_EXT3_FS_XATTR */ + +#ifdef CONFIG_EXT3_FS_SECURITY +extern int ext3_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr); +#else +static inline int ext3_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c new file mode 100644 index 00000000..b8d9f83a --- /dev/null +++ b/fs/ext3/xattr_security.c @@ -0,0 +1,78 @@ +/* + * linux/fs/ext3/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int +ext3_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, + name, value, len, 0); + kfree(name); + kfree(value); + return err; +} + +const struct xattr_handler ext3_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext3_xattr_security_list, + .get = ext3_xattr_security_get, + .set = ext3_xattr_security_set, +}; diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c new file mode 100644 index 00000000..dc8edda9 --- /dev/null +++ b/fs/ext3/xattr_trusted.c @@ -0,0 +1,59 @@ +/* + * linux/fs/ext3/xattr_trusted.c + * Handler for trusted extended attributes. 
+ * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int +ext3_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name, + value, size, flags); +} + +const struct xattr_handler ext3_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext3_xattr_trusted_list, + .get = ext3_xattr_trusted_get, + .set = ext3_xattr_trusted_set, +}; diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c new file mode 100644 index 00000000..7a321974 --- /dev/null +++ b/fs/ext3/xattr_user.c @@ -0,0 +1,62 @@ +/* + * linux/fs/ext3/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext3_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext3_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext3_xattr_user_list, + .get = ext3_xattr_user_get, + .set = ext3_xattr_user_set, +}; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig new file mode 100644 index 00000000..9ed1bb1f --- /dev/null +++ b/fs/ext4/Kconfig @@ -0,0 +1,85 @@ +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" + select JBD2 + select CRC16 + help + This is the next generation of the ext3 filesystem. 
+ + Unlike the change from ext2 filesystem to ext3 filesystem, + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. + + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formatting a new filesystem as an ext4 + filesystem initially. + + To compile this file system support as a module, choose M here. The + module will be called ext4. + + If unsure, say N. + +config EXT4_USE_FOR_EXT23 + bool "Use ext4 for ext2/ext3 file systems" + depends on EXT4_FS + depends on EXT3_FS=n || EXT2_FS=n + default y + help + Allow the ext4 file system driver code to be used for ext2 or + ext3 file system mounts. This allows users to reduce their + compiled kernel size by using one file system driver for + ext2, ext3, and ext4 file systems. + +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext4. + +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext4 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config EXT4_DEBUG + bool "EXT4 debugging support" + depends on EXT4_FS + help + Enables run-time debugging support for the ext4 filesystem. + + If you select Y here, then you will be able to turn on debugging + with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile new file mode 100644 index 00000000..04109460 --- /dev/null +++ b/fs/ext4/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for the linux ext4-filesystem routines. 
+# + +obj-$(CONFIG_EXT4_FS) += ext4.o + +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o + +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c new file mode 100644 index 00000000..21eacd7b --- /dev/null +++ b/fs/ext4/acl.c @@ -0,0 +1,479 @@ +/* + * linux/fs/ext4/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext4_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext4_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext4_acl_header *)value)->a_version != + cpu_to_le32(EXT4_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext4_acl_header); + count = ext4_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + ext4_acl_entry *entry = + (ext4_acl_entry *)value; + if ((char *)value + sizeof(ext4_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext4_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext4_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext4_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * + sizeof(ext4_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext4_acl_header); + for (n = 0; n < acl->a_count; n++) { + ext4_acl_entry *entry = (ext4_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext4_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext4_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). 
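A small user-space sketch of the size arithmetic behind ext4_acl_to_disk() above (and ext4_acl_size() in acl.h): the owner/group/mask/other entries are stored in the 4-byte short form, while named ACL_USER/ACL_GROUP entries carry an extra 4-byte e_id. The entry counts below are made up for illustration.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
    size_t header        = 4;  /* ext4_acl_header: one __le32 a_version   */
    size_t short_entry   = 4;  /* __le16 e_tag + __le16 e_perm            */
    size_t full_entry    = 8;  /* short entry plus __le32 e_id            */
    size_t base_entries  = 4;  /* USER_OBJ, GROUP_OBJ, MASK, OTHER        */
    size_t named_entries = 2;  /* e.g. one ACL_USER and one ACL_GROUP     */

    printf("on-disk ACL xattr value: %zu bytes\n",
           header + base_entries * short_entry + named_entries * full_entry);
    return 0;
}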
+ * + * inode->i_mutex: don't care + */ +static struct posix_acl * +ext4_get_acl(struct inode *inode, int type) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + retval = ext4_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext4_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext4_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_mutex: down unless called from ext4_new_inode + */ +static int +ext4_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext4_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext4_xattr_set_handle(handle, inode, name_index, "", + value, size, 0); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +int +ext4_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +/* + * Initialize the ACLs of a new inode. Called from ext4_new_inode. 
+ * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(dir->i_sb, POSIX_ACL)) { + acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current_umask(); + } + if (test_opt(inode->i_sb, POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + error = ext4_set_acl(handle, inode, + ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_NOFS); + error = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext4_set_acl(handle, inode, + ACL_TYPE_ACCESS, clone); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return error; +} + +/* + * Does chmod for an inode that may have an Access Control List. The + * inode->i_mode field must be updated to the desired value by the caller + * before calling this function. + * Returns 0 on success, or a negative error number. + * + * We change the ACL rather than storing some ACL entries in the file + * mode permission bits (which would be more efficient), because that + * would break once additional permissions (like ACL_APPEND, ACL_DELETE + * for directories) are added. There are no more bits available in the + * file mode. + * + * inode->i_mutex: down + */ +int +ext4_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + handle_t *handle; + int retries = 0; + + retry: + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext4_std_error(inode->i_sb, error); + goto out; + } + error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); + ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + } +out: + posix_acl_release(clone); + return error; +} + +/* + * Extended attribute handlers + */ +static size_t +ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t +ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!test_opt(dentry->d_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int +ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + 
if (!test_opt(dentry->d_sb, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ext4_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + handle_t *handle; + struct posix_acl *acl; + int error, retries = 0; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + +retry: + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = ext4_set_acl(handle, inode, type, acl); + ext4_journal_stop(handle); + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + +release_and_out: + posix_acl_release(acl); + return error; +} + +const struct xattr_handler ext4_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = ext4_xattr_list_acl_access, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, +}; + +const struct xattr_handler ext4_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = ext4_xattr_list_acl_default, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, +}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h new file mode 100644 index 00000000..dec82116 --- /dev/null +++ b/fs/ext4/acl.h @@ -0,0 +1,77 @@ +/* + File: fs/ext4/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT4_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext4_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext4_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext4_acl_header; + +static inline size_t ext4_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext4_acl_header) + + count * sizeof(ext4_acl_entry_short); + } else { + return sizeof(ext4_acl_header) + + 4 * sizeof(ext4_acl_entry_short) + + (count - 4) * sizeof(ext4_acl_entry); + } +} + +static inline int ext4_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext4_acl_header); + s = size - 4 * sizeof(ext4_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext4_acl_entry_short)) + return -1; + return size / sizeof(ext4_acl_entry_short); + } else { + if (s % sizeof(ext4_acl_entry)) + return -1; + return s / sizeof(ext4_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT4_FS_POSIX_ACL + +/* acl.c */ +extern int ext4_check_acl(struct inode *, int, unsigned int); +extern int ext4_acl_chmod(struct inode *); +extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT4_FS_POSIX_ACL */ +#include +#define ext4_check_acl NULL + +static inline int +ext4_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT4_FS_POSIX_ACL */ + diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c new file mode 100644 index 00000000..264f6949 --- /dev/null +++ b/fs/ext4/balloc.c @@ -0,0 +1,622 @@ +/* + * 
linux/fs/ext4/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" +#include "mballoc.h" + +#include + +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * Calculate the block group number and offset, given a block number + */ +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_grpblk_t offset; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); + if (offsetp) + *offsetp = offset; + if (blockgrpp) + *blockgrpp = blocknr; + +} + +static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, + ext4_group_t block_group) +{ + ext4_group_t actual_group; + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); + if (actual_group == block_group) + return 1; + return 0; +} + +static int ext4_group_used_meta_blocks(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + ext4_fsblk_t tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + /* block bitmap, inode bitmap, and inode table blocks */ + int used_blocks = sbi->s_itb_per_group + 2; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), + block_group)) + used_blocks--; + + if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), + block_group)) + used_blocks--; + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!ext4_block_in_group(sb, tmp, block_group)) + used_blocks -= 1; + } + } + return used_blocks; +} + +/* Initializes an uninitialized block bitmap if given, and returns the + * number of blocks free in the group. */ +unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, + ext4_group_t block_group, struct ext4_group_desc *gdp) +{ + int bit, bit_max; + ext4_group_t ngroups = ext4_get_groups_count(sb); + unsigned free_blocks, group_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (bh) { + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag. 
*/ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", + block_group); + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); + memset(bh->b_data, 0xff, sb->s_blocksize); + return 0; + } + memset(bh->b_data, 0, sb->s_blocksize); + } + + /* Check for superblock and gdt backups in this group */ + bit_max = ext4_bg_has_super(sb, block_group); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (bit_max) { + bit_max += ext4_bg_num_gdb(sb, block_group); + bit_max += + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + bit_max += ext4_bg_num_gdb(sb, block_group); + } + + if (block_group == ngroups - 1) { + /* + * Even though mke2fs always initialize first and last group + * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need + * to make sure we calculate the right free blocks + */ + group_blocks = ext4_blocks_count(sbi->s_es) - + ext4_group_first_block_no(sb, ngroups - 1); + } else { + group_blocks = EXT4_BLOCKS_PER_GROUP(sb); + } + + free_blocks = group_blocks - bit_max; + + if (bh) { + ext4_fsblk_t start, tmp; + int flex_bg = 0; + + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); + + start = ext4_group_first_block_no(sb, block_group); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; + + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!flex_bg || + ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + } + /* + * Also if the number of blocks within the group is + * less than the blocksize * 8 ( which is the size + * of bitmap ), set rest of the block bitmap to 1 + */ + ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, + bh->b_data); + } + return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); +} + + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext4_fill_super). 
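A user-space sketch of the block-group split described above and performed by ext4_get_group_no_and_offset(): a filesystem-wide block number minus s_first_data_block is divided into a group index and an in-group offset. The 4 KiB block size (hence 32768 blocks per group) and first_data_block of 0 are assumptions for the example.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t blocknr          = 1000000;  /* arbitrary example block        */
    uint64_t first_data_block = 0;        /* s_first_data_block on 4 KiB fs */
    uint64_t blocks_per_group = 32768;    /* 8 bits per byte * 4096 bytes   */
    uint64_t rel = blocknr - first_data_block;

    printf("block %llu -> group %llu, offset %llu\n",
           (unsigned long long)blocknr,
           (unsigned long long)(rel / blocks_per_group),
           (unsigned long long)(rel % blocks_per_group));
    return 0;
}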
+ */ + +/** + * ext4_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head **bh) +{ + unsigned int group_desc; + unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (block_group >= ngroups) { + ext4_error(sb, "block_group >= groups_count - block_group = %u," + " groups_count = %u", block_group, ngroups); + + return NULL; + } + + group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext4_error(sb, "Group descriptor not loaded - " + "block_group = %u, group_desc = %u, desc = %u", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext4_group_desc *)( + (__u8 *)sbi->s_group_desc[group_desc]->b_data + + offset * EXT4_DESC_SIZE(sb)); + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc; +} + +static int ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + ext4_grpblk_t offset; + ext4_grpblk_t next_zero_bit; + ext4_fsblk_t bitmap_blk; + ext4_fsblk_t group_first_block; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + /* with FLEX_BG, the inode/block bitmaps and itable + * blocks may not be in the group at all + * so the bitmap validation will be skipped for those groups + * or it has to also read the block group where the bitmaps + * are located to verify they are set. + */ + return 1; + } + group_first_block = ext4_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + bitmap_blk = ext4_block_bitmap(sb, desc); + offset = bitmap_blk - group_first_block; + if (!ext4_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode bitmap block number is set */ + bitmap_blk = ext4_inode_bitmap(sb, desc); + offset = bitmap_blk - group_first_block; + if (!ext4_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode table block number is set */ + bitmap_blk = ext4_inode_table(sb, desc); + offset = bitmap_blk - group_first_block; + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, + offset + EXT4_SB(sb)->s_itb_per_group, + offset); + if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group) + /* good bitmap for inode tables */ + return 1; + +err_out: + ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu", + block_group, bitmap_blk); + return 0; +} +/** + * ext4_read_block_bitmap() + * @sb: super block + * @block_group: given block group + * + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = ext4_block_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + + if (bitmap_uptodate(bh)) + return bh; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } + ext4_lock_group(sb, block_group); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + ext4_init_block_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + trace_ext4_read_block_bitmap_load(sb, block_group); + set_bitmap_uptodate(bh); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + ext4_error(sb, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + ext4_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, + * continue with corrupt bitmap + */ + return bh; +} + +/** + * ext4_has_free_blocks() + * @sbi: in-core super block structure. + * @nblocks: number of needed blocks + * + * Check if filesystem has nblocks free & available for allocation. + * On success return 1, return 0 on failure. + */ +static int ext4_has_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags) +{ + s64 free_blocks, dirty_blocks, root_blocks; + struct percpu_counter *fbc = &sbi->s_freeblocks_counter; + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; + + free_blocks = percpu_counter_read_positive(fbc); + dirty_blocks = percpu_counter_read_positive(dbc); + root_blocks = ext4_r_blocks_count(sbi->s_es); + + if (free_blocks - (nblocks + root_blocks + dirty_blocks) < + EXT4_FREEBLOCKS_WATERMARK) { + free_blocks = percpu_counter_sum_positive(fbc); + dirty_blocks = percpu_counter_sum_positive(dbc); + } + /* Check whether we have space after + * accounting for current dirty blocks & root reserved blocks. + */ + if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks)) + return 1; + + /* Hm, nope. Are (enough) root reserved blocks available? 
*/ + if (sbi->s_resuid == current_fsuid() || + ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || + capable(CAP_SYS_RESOURCE) || + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + + if (free_blocks >= (nblocks + dirty_blocks)) + return 1; + } + + return 0; +} + +int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags) +{ + if (ext4_has_free_blocks(sbi, nblocks, flags)) { + percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); + return 0; + } else + return -ENOSPC; +} + +/** + * ext4_should_retry_alloc() + * @sb: super block + * @retries number of attemps has been made + * + * ext4_should_retry_alloc() is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or committing transaction to complete, and then + * return TRUE. + * + * if the total number of retries exceed three times, return FALSE. + */ +int ext4_should_retry_alloc(struct super_block *sb, int *retries) +{ + if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || + (*retries)++ > 3 || + !EXT4_SB(sb)->s_journal) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); +} + +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: pointer to total number of blocks needed + * @errp: error code + * + * Return 1st allocated block number on success, *count stores total account + * error stores in errp pointer + */ +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned int flags, + unsigned long *count, int *errp) +{ + struct ext4_allocation_request ar; + ext4_fsblk_t ret; + + memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; + ar.goal = goal; + ar.len = count ? *count : 1; + ar.flags = flags; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + if (count) + *count = ar.len; + /* + * Account for the allocated meta blocks. We will never + * fail EDQUOT for metdata, but we do account for it. + */ + if (!(*errp) && + ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + EXT4_I(inode)->i_allocated_meta_blocks += ar.len; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + dquot_alloc_block_nofail(inode, ar.len); + } + return ret; +} + +/** + * ext4_count_free_blocks() -- count filesystem free blocks + * @sb: superblock + * + * Adds up the number of free blocks from each block group. 
+ */ +ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) +{ + ext4_fsblk_t desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i; + ext4_group_t ngroups = ext4_get_groups_count(sb); +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + ext4_fsblk_t bitmap_count; + unsigned int x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_blks_count(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + x = ext4_count_free(bitmap_bh, sb->s_blocksize); + printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", + i, ext4_free_blks_count(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" + ", computed = %llu, %llu\n", ext4_free_blocks_count(es), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_blks_count(sb, gdp); + } + + return desc_count; +#endif +} + +static inline int test_root(ext4_group_t a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext4_group_sparse(ext4_group_t group) +{ + if (group <= 1) + return 1; + if (!(group & 1)) + return 0; + return (test_root(group, 7) || test_root(group, 5) || + test_root(group, 3)); +} + +/** + * ext4_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) +{ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && + !ext4_group_sparse(group)) + return 0; + return 1; +} + +static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, + ext4_group_t group) +{ + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); + ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, + ext4_group_t group) +{ + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; +} + +/** + * ext4_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. 
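A user-space sketch of the sparse_super placement rule implemented by ext4_group_sparse() above: besides groups 0 and 1, only groups whose number is a power of 3, 5 or 7 carry superblock/descriptor backups. Printing the first few groups makes the rule concrete.

#include <stdio.h>

static int test_root(unsigned a, unsigned b)
{
    unsigned num = b;

    while (a > num)
        num *= b;
    return num == a;
}

static int group_sparse(unsigned group)
{
    if (group <= 1)
        return 1;
    if (!(group & 1))
        return 0;
    return test_root(group, 7) || test_root(group, 5) ||
           test_root(group, 3);
}

int main(void)
{
    unsigned g;

    printf("groups with a backup superblock (first 64): ");
    for (g = 0; g < 64; g++)
        if (group_sparse(g))
            printf("%u ", g);
    printf("\n");   /* prints 0 1 3 5 7 9 25 27 49 */
    return 0;
}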
+ */ +unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || + metagroup < first_meta_bg) + return ext4_bg_num_gdb_nometa(sb, group); + + return ext4_bg_num_gdb_meta(sb,group); + +} + diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c new file mode 100644 index 00000000..fa3af81a --- /dev/null +++ b/fs/ext4/bitmap.c @@ -0,0 +1,31 @@ +/* + * linux/fs/ext4/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include "ext4.h" + +#ifdef EXT4FS_DEBUG + +static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) +{ + unsigned int i, sum = 0; + + if (!map) + return 0; + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return sum; +} + +#endif /* EXT4FS_DEBUG */ + diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c new file mode 100644 index 00000000..fac90f3f --- /dev/null +++ b/fs/ext4/block_validity.c @@ -0,0 +1,248 @@ +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 + * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init ext4_init_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_system_zone(void) +{ + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk) + return 1; + return 0; +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. 
+ */ +static int add_system_zone(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *new_entry = NULL, *entry; + struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node *parent = NULL, *new_node = NULL; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + if (start_blk + count > (entry->start_blk + + entry->count)) + entry->count = (start_blk + count - + entry->start_blk); + new_node = *n; + new_entry = rb_entry(new_node, struct ext4_system_zone, + node); + break; + } + } + + if (!new_entry) { + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &sbi->system_blks); + } + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? */ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + int first = 1; + + printk(KERN_INFO "System zones: "); + node = rb_first(&sbi->system_blks); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk("%s%llu-%llu", first ? 
"" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + printk("\n"); +} + +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_group_t i; + int flex_size = ext4_flex_bg_size(sbi); + int ret; + + if (!test_opt(sb, BLOCK_VALIDITY)) { + if (EXT4_SB(sb)->system_blks.rb_node) + ext4_release_system_zone(sb); + return 0; + } + if (EXT4_SB(sb)->system_blks.rb_node) + return 0; + + for (i=0; i < ngroups; i++) { + if (ext4_bg_has_super(sb, i) && + ((i < 5) || ((i % flex_size) == 0))) + add_system_zone(sbi, ext4_group_first_block_no(sb, i), + ext4_bg_num_gdb(sb, i) + 1); + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + sbi->s_itb_per_group); + if (ret) + return ret; + } + + if (test_opt(sb, DEBUG)) + debug_print_tree(EXT4_SB(sb)); + return 0; +} + +/* Called when the filesystem is unmounted */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; + struct rb_node *parent; + struct ext4_system_zone *entry; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + entry = rb_entry(n, struct ext4_system_zone, node); + kmem_cache_free(ext4_system_zone_cachep, entry); + if (!parent) + EXT4_SB(sb)->system_blks = RB_ROOT; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } + EXT4_SB(sb)->system_blks = RB_ROOT; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. + */ +int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n = sbi->system_blks.rb_node; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + } + return 1; +} + diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c new file mode 100644 index 00000000..164c5609 --- /dev/null +++ b/fs/ext4/dir.c @@ -0,0 +1,542 @@ +/* + * linux/fs/ext4/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include +#include +#include +#include +#include +#include "ext4.h" + +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ext4_readdir(struct file *, void *, filldir_t); +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir); +static int ext4_release_dir(struct inode *inode, + struct file *filp); + +const struct file_operations ext4_dir_operations = { + .llseek = ext4_llseek, + .read = generic_read_dir, + .readdir = ext4_readdir, /* we take BKL. needed?*/ + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; + + +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return (ext4_filetype_table[filetype]); +} + +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... + */ +int __ext4_check_dir_entry(const char *function, unsigned int line, + struct inode *dir, struct file *filp, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, + unsigned int offset) +{ + const char *error_msg = NULL; + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + + if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely(((char *) de - bh->b_data) + rlen > + dir->i_sb->s_blocksize)) + error_msg = "directory entry across blocks"; + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) + error_msg = "inode out of bounds"; + else + return 0; + + if (filp) + ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + else + ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + + return 1; +} + +static int ext4_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + int error = 0; + unsigned int offset; + int i, stored; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + int err; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; + int dir_has_error = 0; + + sb = inode->i_sb; + + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX) && + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + err = ext4_dx_readdir(filp, dirent, filldir); + if (err != ERR_BAD_DX_DIR) { + ret = err; + goto out; + } + /* + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. 
+ */ + ext4_clear_inode_flag(filp->f_path.dentry->d_inode, + EXT4_INODE_INDEX); + } + stored = 0; + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < inode->i_size) { + struct ext4_map_blocks map; + struct buffer_head *bh = NULL; + + map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_len = 1; + err = ext4_map_blocks(NULL, inode, &map, 0); + if (err > 0) { + pgoff_t index = map.m_pblk >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (!ra_has_index(&filp->f_ra, index)) + page_cache_sync_readahead( + sb->s_bdev->bd_inode->i_mapping, + &filp->f_ra, filp, + index, 1); + filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); + } + + /* + * We ignore I/O errors on directories so users have a chance + * of recovering data when there's a bad sector + */ + if (!bh) { + if (!dir_has_error) { + EXT4_ERROR_FILE(filp, 0, + "directory contains a " + "hole at offset %llu", + (unsigned long long) filp->f_pos); + dir_has_error = 1; + } + /* corrupt size? Maybe no more blocks to read */ + if (filp->f_pos > inode->i_blocks << 9) + break; + filp->f_pos += sb->s_blocksize - offset; + continue; + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext4_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); + if (ext4_check_dir_entry(inode, filp, de, + bh, offset)) { + /* + * On error, skip the f_pos to the next block + */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse(bh); + ret = stored; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored++; + } + filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); + } + offset = 0; + brelse(bh); + } +out: + return ret; +} + +/* + * These functions convert from the major/minor hash to an f_pos + * value. + * + * Currently we only use major hash numer. 
This is unfortunate, but + * on 32-bit machines, the same VFS interface is used for lseek and + * llseek, so if we use the 64 bit offset, then the 32-bit versions of + * lseek/telldir/seekdir will blow out spectacularly, and from within + * the ext2 low-level routine, we don't know if we're being called by + * a 64-bit version of the system call or the 32-bit version of the + * system call. Worse yet, NFSv2 only allows for a 32-bit readdir + * cookie. Sigh. + */ +#define hash2pos(major, minor) (major >> 1) +#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) +#define pos2min_hash(pos) (0) + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[0]; +}; + +/* + * This functoin implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct rb_node *parent; + struct fname *fname; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + fname = rb_entry(n, struct fname, rb_hash); + while (fname) { + struct fname *old = fname; + fname = fname->next; + kfree(old); + } + if (!parent) + *root = RB_ROOT; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } +} + + +static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) +{ + struct dir_private_info *p; + + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; + p->curr_hash = pos2maj_hash(pos); + p->curr_minor_hash = pos2min_hash(pos); + return p; +} + +void ext4_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + */ +int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent) +{ + struct rb_node **p, *parent = NULL; + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + dirent->name_len + 1; + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, dirent->name, dirent->name_len); + new_fn->name[dirent->name_len] = 0; + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... 
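 *
 * (Colliding entries are chained through fname->next and are later
 * emitted back to back by call_filldir(), all under the same f_pos
 * cookie, since the cookie encodes only the hash value.)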
+ */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext4_dx_readdir. It calls filldir + * for all entres on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) + */ +static int call_filldir(struct file *filp, void *dirent, + filldir_t filldir, struct fname *fname) +{ + struct dir_private_info *info = filp->private_data; + loff_t curr_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb; + int error; + + sb = inode->i_sb; + + if (!fname) { + printk(KERN_ERR "EXT4-fs: call_filldir: called with " + "null fname?!?\n"); + return 0; + } + curr_pos = hash2pos(fname->hash, fname->minor_hash); + while (fname) { + error = filldir(dirent, fname->name, + fname->name_len, curr_pos, + fname->inode, + get_dtype(sb, fname->file_type)); + if (error) { + filp->f_pos = curr_pos; + info->extra_fname = fname; + return error; + } + fname = fname->next; + } + return 0; +} + +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + struct dir_private_info *info = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + struct fname *fname; + int ret; + + if (!info) { + info = ext4_htree_create_dir_info(filp->f_pos); + if (!info) + return -ENOMEM; + filp->private_data = info; + } + + if (filp->f_pos == EXT4_HTREE_EOF) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != filp->f_pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp->f_pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. 
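 *
 * ext4_htree_fill_tree() repopulates the tree starting from
 * info->curr_hash; a return value of 0 means there is nothing left
 * to read and is turned into EXT4_HTREE_EOF below.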
+ */ + if ((!info->curr_node) || + (filp->f_version != inode->i_version)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + filp->f_version = inode->i_version; + ret = ext4_htree_fill_tree(filp, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + return ret; + if (ret == 0) { + filp->f_pos = EXT4_HTREE_EOF; + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(filp, dirent, filldir, fname)) + break; + next_node: + info->curr_node = rb_next(info->curr_node); + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { + if (info->next_hash == ~0) { + filp->f_pos = EXT4_HTREE_EOF; + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = filp->f_pos; + return 0; +} + +static int ext4_release_dir(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h new file mode 100644 index 00000000..1a34c1c8 --- /dev/null +++ b/fs/ext4/ext4.h @@ -0,0 +1,2236 @@ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(f, a...) do {} while (0) +#endif + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. 
length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. 
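 *
 * A minimal lookup (sketch only, as in ext4_readdir()): fill in the
 * request fields, then read back the result fields:
 *
 *	struct ext4_map_blocks map;
 *
 *	map.m_lblk = lblk;
 *	map.m_len = 1;
 *	err = ext4_map_blocks(NULL, inode, &map, 0);
 *	if (err > 0)
 *		map.m_pblk and map.m_flags describe the mapping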
+ */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_UNINIT (1 << BH_Uninit) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_UNINIT) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * For delayed allocation tracking + */ +struct mpage_da_data { + struct inode *inode; + sector_t b_blocknr; /* start block number of extent */ + size_t b_size; /* size of extent */ + unsigned long b_state; /* state of the extent */ + unsigned long first_page, next_page; /* extent of pages */ + struct writeback_control *wbc; + int io_done; + int pages_written; + int retval; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_ERROR 0x0002 +#define EXT4_IO_END_QUEUED 0x0004 + +struct ext4_io_page { + struct page *p_page; + atomic_t p_count; +}; + +#define MAX_IO_PAGES 128 + +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + struct inode *inode; /* file being written to */ + unsigned int flag; /* unwritten or not */ + struct page *page; /* page struct for buffer write */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ + struct work_struct work; /* data work queue */ + struct kiocb *iocb; /* iocb struct for AIO */ + int result; /* error value for AIO */ + int num_io_pages; + struct ext4_io_page *pages[MAX_IO_PAGES]; +} ext4_io_end_t; + +struct ext4_io_submit { + int io_op; + struct bio *io_bio; + ext4_io_end_t *io_end; + struct ext4_io_page *io_page; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __u32 bg_reserved2[3]; +}; + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic_t free_inodes; + atomic_t free_blocks; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... 
*/ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... 
*/ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ECOMPR = 11, /* Compression error */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ + printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ + EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } + +/* + * Since it's pretty easy to mix up bit numbers and hex values, and we + * can't do a compile-time test for ENUM values, we use a run-time + * test to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop + * out so it won't cost any extra space in the compiled kernel image. + * But it's important that these values are the same, since we are + * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL + * must be consistent with the values of FS_XXX_FL defined in + * include/linux/fs.h and the on-disk values found in ext2, ext3, and + * ext4 filesystems, and of course the values defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unitialized + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unitialized extent */ +#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path, + so set the magic i_delalloc_reserve_flag after taking the + inode allocation semaphore for */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unitialized extents if not allocated, split the uninitialized + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Punch out blocks of an extent */ +#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 
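/*
 * Hypothetical user-space sketch (not built as part of the kernel) of
 * how the generic commands aliased above are usually driven.  It
 * assumes FS_IOC_GETFLAGS and FS_IOC_GETVERSION come from <linux/fs.h>
 * and that the kernel fills in an int, as e2fsprogs does.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

static int dump_attrs(const char *path)
{
	int fd = open(path, O_RDONLY);
	int flags = 0, generation = 0;

	if (fd < 0)
		return -1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)	/* EXT4_IOC_GETFLAGS */
		printf("flags: 0x%x\n", flags);
	if (ioctl(fd, FS_IOC_GETVERSION, &generation) == 0) /* EXT4_IOC_GETVERSION_OLD */
		printf("generation: %d\n", generation);
	close(fd);
	return 0;
}
#endif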
+#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file 
descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + return cpu_to_le32((sizeof(time->tv_sec) > 4 ? + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (sizeof(time->tv_sec) > 4) + time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) + << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define 
i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +/* + * storage for cached extent + * If ec_len == 0, then the cache is invalid. + * If ec_start == 0, then the cache represents a gap (null mapping) + */ +struct ext4_ext_cache { + ext4_fsblk_t ec_start; + ext4_lblk_t ec_block; + __u32 ec_len; /* must be 32bit to return holes */ +}; + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + +#ifdef CONFIG_EXT4_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. 
+ */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* on-disk additional length */ + __u16 i_extra_isize; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* completed IOs that might need unwritten extents handling */ + struct list_head i_completed_io_list; + spinlock_t i_completed_io_lock; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ + /* current io_end structure for async DIO write*/ + ext4_io_end_t *cur_aio_dio; + atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* 
Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_set_bit __test_and_set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_first_zero_bit find_first_zero_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... 
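 *
 * The read-only compatible feature set sits in between: a filesystem
 * with ro_compat bits the kernel does not recognize may still be
 * mounted, but only read-only.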
+ */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad; + __le16 s_reserved_pad; + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32]; 
/* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_reserved[112]; /* Padding to the end of the block */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyblocks_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD2_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + /* ext4 extent cache stats */ + unsigned long extent_cache_hits; + unsigned long extent_cache_misses; + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; 
+ spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? 
+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +EXT4_INODE_BIT_FNS(flag, flags, 0) +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. 
*/ +#define EXT4_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) +#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + 
EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_MMP) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... 
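+ * + * (Illustrative example, not part of the original comment: with a 64 KiB + * block, a directory entry spanning the whole block cannot store 65536 in a + * 16-bit field, so it is written as EXT4_MAX_REC_LEN (65535) and decoded + * back to the block size; for hypothetical still-larger blocks the code + * below folds bits 16-17 of the length into the two low bits, which are + * otherwise always zero because rec_len is a multiple of 4.)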
+ */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_CACHE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT4_FEATURE_COMPAT_DIR_INDEX) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + +#define EXT4_HTREE_EOF 0x7fffffff + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); + +/* + * Timeout and state flag for lazy initialization inode thread. 
+ */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[227]; +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext4 source programs needs to include it so they are duplicated here. 
+ */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(struct buffer_head *, unsigned); + +/* balloc.c */ +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +#define ext4_free_blocks_after_init(sb, group, desc) \ + ext4_init_block_bitmap(sb, NULL, group, desc) + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, int); +extern int ext4_flush_completed_IO(struct inode *); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, + const struct qstr *qstr, __u32 goal); +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); + +/* mballoc.c */ +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *, int); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_add_groupinfo(struct 
super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + +/* inode.c */ +struct buffer_head *ext4_getblk(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +struct buffer_head *ext4_bread(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); + +extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_can_truncate(struct inode *inode); +extern void ext4_truncate(struct inode *); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from); +extern int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); + +/* namei.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ + __LINE__, ## message) +extern void ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, const char *, ...) + __attribute__ ((format (printf, 5, 6))); +extern void ext4_error_file(struct file *, const char *, unsigned int, + ext4_fsblk_t, const char *, ...) 
+ __attribute__ ((format (printf, 5, 6))); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ + __LINE__, ## message) +extern void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ + __LINE__, ## message) +extern void ext4_msg(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ + __LINE__, msg) +extern void __ext4_grp_locked_error(const char *, unsigned int, \ + struct super_block *, ext4_group_t, \ + unsigned long, ext4_fsblk_t, \ + const char *, ...) + __attribute__ ((format (printf, 7, 8))); +#define ext4_grp_locked_error(sb, grp, message...) \ + __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) +extern void ext4_update_dynamic_rev(struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + 
le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +{ + if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + else + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info ***grp_info; + long indexv, indexh; + grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + return grp_info[indexv][indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch blocks in their local + * counters. So we need to make sure we have free blocks more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. 
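+ * + * (Worked example with assumed values, not from the original: with a + * percpu_counter batch of 32 and nr_cpu_ids = 8, the watermark below is + * 4 * 32 * 8 = 1024 free blocks.)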
+ */ +#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREEBLOCKS_WATERMARK 0 +#endif + +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); + return ; +} + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
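+ * The counter is capped at EXT4_MAX_CONTENTION here and decremented + * again whenever the trylock above succeeds, so ext4_fs_is_busy() reads + * it as a rough, self-decaying measure of recent contention.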
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +static inline void ext4_mark_super_dirty(struct super_block *sb) +{ + if (EXT4_SB(sb)->s_journal == NULL) + sb->s_dirt = 1; +} + +/* + * Inode and file operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); + +/* symlink.c */ +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); + +/* extents.c */ +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern void ext4_ext_truncate(struct inode *); +extern int ext4_ext_punch_hole(struct file *file, loff_t offset, + loff_t length); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +/* move_extent.c */ +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); +extern void ext4_free_io_end(ext4_io_end_t *io); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern int ext4_end_io_nolock(ext4_io_end_t *io); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +enum ext4_state_bits { + BH_Uninit /* blocks are allocated but uninitialized on disk */ + = BH_JBDPrivateStart, +}; + +BUFFER_FNS(Uninit, uninit) +TAS_BUFFER_FNS(Uninit, uninit) + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized.
With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +#endif /* __KERNEL__ */ + +#endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h new file mode 100644 index 00000000..095c36f3 --- /dev/null +++ b/fs/ext4/ext4_extents.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +#ifndef _EXT4_EXTENTS +#define _EXT4_EXTENTS + +#include "ext4.h" + +/* + * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks + * becomes very small, so index split, in-depth growing and + * other hard changes happen much more often. + * This is for debug purposes only. + */ +#define AGGRESSIVE_TEST_ + +/* + * With EXTENTS_STATS defined, the number of blocks and extents + * are collected in the truncate path. They'll be shown at + * umount time. + */ +#define EXTENTS_STATS__ + +/* + * If CHECK_BINSEARCH is defined, then the results of the binary search + * will also be checked by linear search. + */ +#define CHECK_BINSEARCH__ + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(a...) printk(a) +#else +#define ext_debug(a...) +#endif + +/* + * If EXT_STATS is defined then stats numbers are collected. + * These number will be displayed at umount time. + */ +#define EXT_STATS_ + + +/* + * ext4_inode has i_block array (60 bytes total). + * The first 12 bytes store ext4_extent_header; + * the remainder stores an array of ext4_extent. + */ + +/* + * This is the extent on-disk structure. + * It's used at the bottom of the tree. + */ +struct ext4_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start_lo; /* low 32 bits of physical block */ +}; + +/* + * This is index on-disk structure. + * It's used at all the levels except the bottom. 
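+ * Like struct ext4_extent above it is 12 bytes on disk, so index and leaf + * blocks hold the same number of entries for a given block size.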
+ */ +struct ext4_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf_lo; /* pointer to the physical block of the next * + * level. leaf or next index could be there */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * Each block (leaves and indexes), even inode-stored has header. + */ +struct ext4_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlying blocks? */ + __le32 eh_generation; /* generation of the tree */ +}; + +#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) + +/* + * Array of ext4_ext_path contains path to some extent. + * Creation/lookup routines use it for traversal/splitting/etc. + * Truncate uses it to simulate recursive walking. + */ +struct ext4_ext_path { + ext4_fsblk_t p_block; + __u16 p_depth; + struct ext4_extent *p_ext; + struct ext4_extent_idx *p_idx; + struct ext4_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * structure for external API + */ + +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff + +/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of ee_len field in the extent datastructure to signify if this + * particular extent is an initialized extent or an uninitialized (i.e. + * preallocated). + * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an + * uninitialized extent. + * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an + * uninitialized one. In other words, if MSB of ee_len is set, it is an + * uninitialized extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an uninitialized extent of zero length and + * thus we make it as a special case of initialized extent with 0x8000 length. + * This way we get better extent-to-group alignment for initialized extents. + * Hence, the maximum number of blocks we can have in an *initialized* + * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). 
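+ * + * (Illustrative example: an uninitialized extent of 300 blocks is stored + * with ee_len = 0x8000 + 300 = 0x812c; ext4_ext_is_uninitialized() sees a + * value above 0x8000, and ext4_ext_get_actual_len() recovers 300. A stored + * ee_len of exactly 0x8000 is the special case described above: an + * initialized extent of 32768 blocks.)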
+ */ +#define EXT_INIT_MAX_LEN (1UL << 15) +#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + +static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext4_extent_header *) EXT4_I(inode)->i_data; +} + +static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext4_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void +ext4_ext_invalidate_cache(struct inode *inode) +{ + EXT4_I(inode)->i_cached_extent.ec_len = 0; +} + +static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +{ + /* We can not have an uninitialized extent of zero length! */ + BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); + ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +{ + /* Extent with ee_len of 0x8000 is treated as an initialized extent */ + return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) +{ + return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? 
+ le16_to_cpu(ext->ee_len) : + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); +} + +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) +{ + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); +} + +/* + * ext4_ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ex->ee_start_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* + * ext4_idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* + * ext4_ext_store_pblock: + * stores a large physical block number into an extent struct, + * breaking it into parts + */ +static inline void ext4_ext_store_pblock(struct ext4_extent *ex, + ext4_fsblk_t pb) +{ + ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts + */ +static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, + ext4_fsblk_t pb) +{ + ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +#endif /* _EXT4_EXTENTS */ + diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c new file mode 100644 index 00000000..f5240aa1 --- /dev/null +++ b/fs/ext4/ext4_jbd2.c @@ -0,0 +1,152 @@ +/* + * Interface between ext4 and JBD + */ + +#include "ext4_jbd2.h" + +#include + +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_write_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + } + return err; +} + +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + * + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. 
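+ * (Note: in the code below the no-journal case is in fact handled with a + * plain bforget() and an early return, since there is nothing to revoke.)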
+ */ +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* In the no journal case, we can just do a bforget and return */ + if (!ext4_handle_valid(handle)) { + bforget(bh); + return 0; + } + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + return err; + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + __ext4_abort(inode->i_sb, where, line, + "error %d when attempting revoke", err); + } + BUFFER_TRACE(bh, "exit"); + return err; +} + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_create_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } + return err; +} + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else { + if (inode) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + struct ext4_super_block *es; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_block = + cpu_to_le64(bh->b_blocknr); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "IO error syncing itable block"); + err = -EIO; + } + } + } + return err; +} + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb) +{ + struct buffer_head *bh = EXT4_SB(sb)->s_sbh; + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else + sb->s_dirt = 1; + return err; +} diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h new file mode 100644 index 00000000..95af6f87 --- /dev/null +++ b/fs/ext4/ext4_jbd2.h @@ -0,0 +1,327 @@ +/* + * ext4_jbd2.h + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. 
+ * + * Ext4-specific journaling extensions. + */ + +#ifndef _EXT4_JBD2_H +#define _EXT4_JBD2_H + +#include +#include +#include "ext4.h" + +#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify up to + * 5 levels of tree + root which are stored in the inode. */ + +#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT4_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT4_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT4_RESERVE_TRANS_BLOCKS 12U + +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only data block */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) + +#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? 
(DQUOT_DEL_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) + +int +ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc); + +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext4 calls into JBD. + */ +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh); + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb); + +#define ext4_journal_get_write_access(handle, bh) \ + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ + (bh), (block_nr)) +#define ext4_journal_get_create_access(handle, bh) \ + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) +#define ext4_handle_dirty_metadata(handle, inode, bh) \ + __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ + (bh)) +#define ext4_handle_dirty_super(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) + +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); + +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) + +/* Note: Do not use this for NULL handles. This is only to determine if + * a properly allocated handle is using a journal or not. 
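+ * In no-journal mode the handle value is effectively a small per-task + * reference count rather than a real pointer (hence + * EXT4_NOJOURNAL_MAX_REF_COUNT above), so anything below that threshold is + * treated as not using a journal.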
*/ +static inline int ext4_handle_valid(handle_t *handle) +{ + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) + return 0; + return 1; +} + +static inline void ext4_handle_sync(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + handle->h_sync = 1; +} + +static inline void ext4_handle_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + +static inline int ext4_handle_is_aborted(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + return is_handle_aborted(handle); + return 0; +} + +static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +{ + if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) + return 0; + return 1; +} + +static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +{ + return ext4_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext4_journal_stop(handle) \ + __ext4_journal_stop(__func__, __LINE__, (handle)) + +static inline handle_t *ext4_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext4_journal_extend(handle_t *handle, int nblocks) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_extend(handle, nblocks); + return 0; +} + +static inline int ext4_journal_restart(handle_t *handle, int nblocks) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_restart(handle, nblocks); + return 0; +} + +static inline int ext4_journal_blocks_per_page(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) != NULL) + return jbd2_journal_blocks_per_page(inode); + return 0; +} + +static inline int ext4_journal_force_commit(journal_t *journal) +{ + if (journal) + return jbd2_journal_force_commit(journal); + return 0; +} + +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); + return 0; +} + +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + +/* super.c */ +int ext4_force_commit(struct super_block *sb); + +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; +} + +static 
inline int ext4_should_order_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; +} + +static inline int ext4_should_writeback_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; +} + +/* + * This function controls whether or not we should try to go down the + * dioread_nolock code paths, which makes it safe to avoid taking + * i_mutex for direct I/O reads. This only works for extent-based + * files, and it doesn't work if data journaling is enabled, since the + * dioread_nolock code uses b_private to pass information back to the + * I/O completion handler, and this conflicts with the jbd's use of + * b_private. + */ +static inline int ext4_should_dioread_nolock(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + return 1; +} + +#endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c new file mode 100644 index 00000000..611647b2 --- /dev/null +++ b/fs/ext4/extents.c @@ -0,0 +1,4364 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * Architecture independence: + * Copyright (c) 2005, Bull S.A. + * Written by Pierre Peiffer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +/* + * Extents support for EXT4 + * + * TODO: + * - ext4*_error() should be used in some situations + * - analyze all BUG()/BUG_ON(), use -EIO where appropriate + * - smart tree reduction + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" + +#include + +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags); + +static int ext4_ext_truncate_extend_restart(handle_t *handle, + struct inode *inode, + int needed) +{ + int err; + + if (!ext4_handle_valid(handle)) + return 0; + if (handle->h_buffer_credits > needed) + return 0; + err = ext4_journal_extend(handle, needed); + if (err <= 0) + return err; + err = ext4_truncate_restart_trans(handle, inode, needed); + if (err == 0) + err = -EAGAIN; + + return err; +} + +/* + * could return: + * - EROFS + * - ENOMEM + */ +static int ext4_ext_get_access(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + if (path->p_bh) { + /* path points to block */ + return ext4_journal_get_write_access(handle, path->p_bh); + } + /* path points to leaf/index in inode body */ + /* we use in-core data, no need to protect them */ + return 0; +} + +/* + * could return: + * - EROFS + * - ENOMEM + * - EIO + */ +static int ext4_ext_dirty(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int err; + if (path->p_bh) { + /* path points to block */ + err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); + } else { + /* path points to leaf/index in inode body */ + err = ext4_mark_inode_dirty(handle, inode); + } + return err; +} + +static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t block) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + int depth; + + if (path) { + struct ext4_extent *ex; + depth = path->p_depth; + + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * an ELF object sections out-of-order but in a way + * the eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some hueristics or some way to allow + * userspace to pass a hint to file system, + * especially if the latter case turns out to be + * common. + */ + ex = path[depth].p_ext; + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } + + /* it looks like index is empty; + * try to find starting block from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. 
use inode's group */ + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour + block; +} + +/* + * Allocation for a meta data block + */ +static ext4_fsblk_t +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex, int *err, unsigned int flags) +{ + ext4_fsblk_t goal, newblock; + + goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, err); + return newblock; +} + +static inline int ext4_ext_space_block(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent); + if (!check) { +#ifdef AGGRESSIVE_TEST + if (size > 6) + size = 6; +#endif + } + return size; +} + +static inline int ext4_ext_space_block_idx(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx); + if (!check) { +#ifdef AGGRESSIVE_TEST + if (size > 5) + size = 5; +#endif + } + return size; +} + +static inline int ext4_ext_space_root(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent); + if (!check) { +#ifdef AGGRESSIVE_TEST + if (size > 3) + size = 3; +#endif + } + return size; +} + +static inline int ext4_ext_space_root_idx(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent_idx); + if (!check) { +#ifdef AGGRESSIVE_TEST + if (size > 4) + size = 4; +#endif + } + return size; +} + +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + int idxs, num = 0; + + idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx)); + + /* + * If the new delayed allocation block is contiguous with the + * previous da block, it can share index blocks with the + * previous block, so we only need to allocate a new index + * block every idxs leaf blocks. At ldxs**2 blocks, we need + * an additional index block, and at ldxs**3 blocks, yet + * another index blocks. 
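As a rough illustration of the counting above (an editorial sketch, not part of the patch): with an assumed 4 KiB block size, an index block holds idxs = (4096 - 12) / 12 = 340 entries, since the extent header and each ext4_extent_idx are 12 bytes. The stand-alone program below, with a made-up helper name, mirrors the modulo cascade applied to i_da_metadata_calc_len:

    #include <stdio.h>

    /*
     * Illustrative only: count the extra index blocks charged when a
     * contiguous delayed-allocation run reaches a length that is a
     * multiple of idxs, idxs^2 or idxs^3, mirroring the checks above.
     */
    static int extra_index_blocks(unsigned long len, unsigned long idxs)
    {
        int num = 0;

        if (len % idxs == 0)                    /* another leaf-level index block */
            num++;
        if (len % (idxs * idxs) == 0)           /* another second-level index block */
            num++;
        if (len % (idxs * idxs * idxs) == 0)    /* another third-level index block */
            num++;
        return num;
    }

    int main(void)
    {
        unsigned long idxs = (4096 - 12) / 12;  /* 340 entries per index block */

        printf("at block %lu: +%d metadata blocks\n",
               idxs, extra_index_blocks(idxs, idxs));
        printf("at block %lu: +%d metadata blocks\n",
               idxs * idxs, extra_index_blocks(idxs * idxs, idxs));
        return 0;
    }

So in this configuration roughly one extra metadata block is reserved per 340 contiguous data blocks, with higher-level index blocks charged only every 340^2 and 340^3 blocks.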
+ */ + if (ei->i_da_metadata_calc_len && + ei->i_da_metadata_calc_last_lblock+1 == lblock) { + if ((ei->i_da_metadata_calc_len % idxs) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { + num++; + ei->i_da_metadata_calc_len = 0; + } else + ei->i_da_metadata_calc_len++; + ei->i_da_metadata_calc_last_lblock++; + return num; + } + + /* + * In the worst case we need a new set of index blocks at + * every level of the inode's extent tree. + */ + ei->i_da_metadata_calc_len = 1; + ei->i_da_metadata_calc_last_lblock = lblock; + return ext_depth(inode) + 1; +} + +static int +ext4_ext_max_entries(struct inode *inode, int depth) +{ + int max; + + if (depth == ext_depth(inode)) { + if (depth == 0) + max = ext4_ext_space_root(inode, 1); + else + max = ext4_ext_space_root_idx(inode, 1); + } else { + if (depth == 0) + max = ext4_ext_space_block(inode, 1); + else + max = ext4_ext_space_block_idx(inode, 1); + } + + return max; +} + +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext4_ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + + if (len == 0) + return 0; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = ext4_idx_pblock(ext_idx); + + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + struct ext4_extent *ext; + struct ext4_extent_idx *ext_idx; + unsigned short entries; + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + ext = EXT_FIRST_EXTENT(eh); + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + ext++; + entries--; + } + } else { + ext_idx = EXT_FIRST_INDEX(eh); + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 0; + ext_idx++; + entries--; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_header *eh, + int depth) +{ + const char *error_msg; + int max = 0; + + if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { + error_msg = "invalid magic"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { + error_msg = "unexpected eh_depth"; + goto corrupted; + } + if (unlikely(eh->eh_max == 0)) { + error_msg = "invalid eh_max"; + goto corrupted; + } + max = ext4_ext_max_entries(inode, depth); + if (unlikely(le16_to_cpu(eh->eh_max) > max)) { + error_msg = "too large eh_max"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { + error_msg = "invalid eh_entries"; + goto corrupted; + } + if (!ext4_valid_extent_entries(inode, eh, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } + return 0; + +corrupted: + ext4_error_inode(inode, function, line, 0, + "bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + error_msg, le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); + + return -EIO; +} + +#define ext4_ext_check(inode, eh, depth) \ + __ext4_ext_check(__func__, __LINE__, inode, eh, depth) + +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); +} + +#ifdef 
EXT_DEBUG +static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) +{ + int k, l = path->p_depth; + + ext_debug("path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + } else if (path->p_ext) { + ext_debug(" %d:[%d]%d:%llu ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_get_actual_len(path->p_ext), + ext4_ext_pblock(path->p_ext)); + } else + ext_debug(" []"); + } + ext_debug("\n"); +} + +static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) +{ + int depth = ext_depth(inode); + struct ext4_extent_header *eh; + struct ext4_extent *ex; + int i; + + if (!path) + return; + + eh = path[depth].p_hdr; + ex = EXT_FIRST_EXTENT(eh); + + ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { + ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); + } + ext_debug("\n"); +} + +static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, + ext4_fsblk_t newblock, int level) +{ + int depth = ext_depth(inode); + struct ext4_extent *ex; + + if (depth != level) { + struct ext4_extent_idx *idx; + idx = path[level].p_idx; + while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", level, + le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), + newblock); + idx++; + } + + return; + } + + ex = path[depth].p_ext; + while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(ex->ee_block), + ext4_ext_pblock(ex), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + newblock); + ex++; + } +} + +#else +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) +#define ext4_ext_show_move(inode, path, newblock, level) +#endif + +void ext4_ext_drop_refs(struct ext4_ext_path *path) +{ + int depth = path->p_depth; + int i; + + for (i = 0; i <= depth; i++, path++) + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +/* + * ext4_ext_binsearch_idx: + * binary search for the closest index of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch_idx(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent_idx *r, *l, *m; + + + ext_debug("binsearch for %u(idx): ", block); + + l = EXT_FIRST_INDEX(eh) + 1; + r = EXT_LAST_INDEX(eh); + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ei_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), + m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); + } + + path->p_idx = l - 1; + ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent_idx *chix, *ix; + int k; + + chix = ix = EXT_FIRST_INDEX(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { + if (k != 0 && + le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", + le32_to_cpu(ix->ei_block), + le32_to_cpu(ix[-1].ei_block)); + } + BUG_ON(k && le32_to_cpu(ix->ei_block) 
+ <= le32_to_cpu(ix[-1].ei_block)); + if (block < le32_to_cpu(ix->ei_block)) + break; + chix = ix; + } + BUG_ON(chix != path->p_idx); + } +#endif + +} + +/* + * ext4_ext_binsearch: + * binary search for closest extent of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent *r, *l, *m; + + if (eh->eh_entries == 0) { + /* + * this leaf is empty: + * we get such a leaf in split/add case + */ + return; + } + + ext_debug("binsearch for %u: ", block); + + l = EXT_FIRST_EXTENT(eh) + 1; + r = EXT_LAST_EXTENT(eh); + + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ee_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), + m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); + } + + path->p_ext = l - 1; + ext_debug(" -> %d:%llu:[%d]%d ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_pblock(path->p_ext), + ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_get_actual_len(path->p_ext)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent *chex, *ex; + int k; + + chex = ex = EXT_FIRST_EXTENT(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { + BUG_ON(k && le32_to_cpu(ex->ee_block) + <= le32_to_cpu(ex[-1].ee_block)); + if (block < le32_to_cpu(ex->ee_block)) + break; + chex = ex; + } + BUG_ON(chex != path->p_ext); + } +#endif + +} + +int ext4_ext_tree_init(handle_t *handle, struct inode *inode) +{ + struct ext4_extent_header *eh; + + eh = ext_inode_hdr(inode); + eh->eh_depth = 0; + eh->eh_entries = 0; + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); + ext4_mark_inode_dirty(handle, inode); + ext4_ext_invalidate_cache(inode); + return 0; +} + +struct ext4_ext_path * +ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + struct buffer_head *bh; + short int depth, i, ppos = 0, alloc = 0; + + eh = ext_inode_hdr(inode); + depth = ext_depth(inode); + + /* account possible depth increase */ + if (!path) { + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), + GFP_NOFS); + if (!path) + return ERR_PTR(-ENOMEM); + alloc = 1; + } + path[0].p_hdr = eh; + path[0].p_bh = NULL; + + i = depth; + /* walk through the tree */ + while (i) { + int need_to_validate = 0; + + ext_debug("depth %d: num %d, max %d\n", + ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + + ext4_ext_binsearch_idx(inode, path + ppos, block); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = sb_getblk(inode->i_sb, path[ppos].p_block); + if (unlikely(!bh)) + goto err; + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, block, + path[ppos].p_block); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto err; + } + /* validate the extent entries */ + need_to_validate = 1; + } + eh = ext_block_hdr(bh); + ppos++; + if (unlikely(ppos > depth)) { + put_bh(bh); + EXT4_ERROR_INODE(inode, + "ppos %d > depth %d", ppos, depth); + goto err; + } + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; + + if (need_to_validate && ext4_ext_check(inode, eh, i)) + goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + path[ppos].p_idx = NULL; + + /* find extent */ + ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if 
(path[ppos].p_ext) + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + + ext4_ext_show_path(inode, path); + + return path; + +err: + ext4_ext_drop_refs(path); + if (alloc) + kfree(path); + return ERR_PTR(-EIO); +} + +/* + * ext4_ext_insert_index: + * insert new index [@logical;@ptr] into the block at @curp; + * check where to insert: before @curp or after @curp + */ +static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) +{ + struct ext4_extent_idx *ix; + int len, err; + + err = ext4_ext_get_access(handle, inode, curp); + if (err) + return err; + + if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { + /* insert after */ + if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { + len = (len - 1) * sizeof(struct ext4_extent_idx); + len = len < 0 ? 0 : len; + ext_debug("insert new index %d after: %llu. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + (curp->p_idx + 1), (curp->p_idx + 2)); + memmove(curp->p_idx + 2, curp->p_idx + 1, len); + } + ix = curp->p_idx + 1; + } else { + /* insert before */ + len = len * sizeof(struct ext4_extent_idx); + len = len < 0 ? 0 : len; + ext_debug("insert new index %d before: %llu. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + curp->p_idx, (curp->p_idx + 1)); + memmove(curp->p_idx + 1, curp->p_idx, len); + ix = curp->p_idx; + } + + ix->ei_block = cpu_to_le32(logical); + ext4_idx_store_pblock(ix, ptr); + le16_add_cpu(&curp->p_hdr->eh_entries, 1); + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + > le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); + return -EIO; + } + + err = ext4_ext_dirty(handle, inode, curp); + ext4_std_error(inode->i_sb, err); + + return err; +} + +/* + * ext4_ext_split: + * inserts new subtree into the path, using free index entry + * at depth @at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extents and index entries (right to the split point) + * into the newly allocated blocks + * - initializes subtree + */ +static int ext4_ext_split(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = ext_depth(inode); + struct ext4_extent_header *neh; + struct ext4_extent_idx *fidx; + int i = at, k, m, a; + ext4_fsblk_t newblock, oldblock; + __le32 border; + ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + + /* make decision: where to split? */ + /* FIXME: now decision is simplest: at current extent */ + + /* if current leaf will be split, then we should use + * border from split point */ + if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); + return -EIO; + } + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug("leaf will be split." 
+ " next leaf starts at %d\n", + le32_to_cpu(border)); + } else { + border = newext->ee_block; + ext_debug("leaf will be added." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } + + /* + * If error occurs, then we break processing + * and mark filesystem read-only. index won't + * be inserted and tree will be in consistent + * state. Next mount will repair buffers too. + */ + + /* + * Get array to track all allocated blocks. + * We need this to handle errors and free blocks + * upon them. + */ + ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; + + /* allocate all needed blocks */ + ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); + for (a = 0; a < depth - at; a++) { + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + if (unlikely(newblock == 0)) { + EXT4_ERROR_INODE(inode, "newblock == 0!"); + err = -EIO; + goto cleanup; + } + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = 0; + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_depth = 0; + + /* move remainder of path[depth] to the new leaf */ + if (unlikely(path[depth].p_hdr->eh_entries != + path[depth].p_hdr->eh_max)) { + EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", + path[depth].p_hdr->eh_entries, + path[depth].p_hdr->eh_max); + err = -EIO; + goto cleanup; + } + /* start copy from next extent */ + m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; + ext4_ext_show_move(inode, path, newblock, depth); + if (m) { + struct ext4_extent *ex; + ex = EXT_FIRST_EXTENT(neh); + memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); + le16_add_cpu(&neh->eh_entries, m); + } + + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + if (unlikely(k < 0)) { + EXT4_ERROR_INODE(inode, "k %d < 0!", k); + err = -EIO; + goto cleanup; + } + if (k) + ext_debug("create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = cpu_to_le16(1); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + neh->eh_depth = cpu_to_le16(depth - i); + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + ext4_idx_store_pblock(fidx, oldblock); + + ext_debug("int.index at %d (block %llu): %u -> %llu\n", + i, newblock, le32_to_cpu(border), oldblock); + + /* move remainder of path[i] to the new index block */ + if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 
+ EXT_LAST_INDEX(path[i].p_hdr))) { + EXT4_ERROR_INODE(inode, + "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", + le32_to_cpu(path[i].p_ext->ee_block)); + err = -EIO; + goto cleanup; + } + /* start copy indexes */ + m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + ext4_ext_show_move(inode, path, newblock, i); + if (m) { + memmove(++fidx, path[i].p_idx, + sizeof(struct ext4_extent_idx) * m); + le16_add_cpu(&neh->eh_entries, m); + } + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old index */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + i); + if (err) + goto cleanup; + le16_add_cpu(&path[i].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + err = ext4_ext_insert_index(handle, inode, path + at, + le32_to_cpu(border), newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; + ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); + } + } + kfree(ablocks); + + return err; +} + +/* + * ext4_ext_grow_indepth: + * implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initializes new top-level, creating index that points to the + * just created block + */ +static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) +{ + struct ext4_ext_path *curp = path; + struct ext4_extent_header *neh; + struct buffer_head *bh; + ext4_fsblk_t newblock; + int err = 0; + + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); + if (newblock == 0) + return err; + + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + ext4_std_error(inode->i_sb, err); + return err; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + goto out; + } + + /* move top-level index/leaf into new block */ + memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); + + /* set size of new block */ + neh = ext_block_hdr(bh); + /* old root could have indexes or leaves + * so calculate e_max right way */ + if (ext_depth(inode)) + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + else + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto out; + + /* create index in new top-level index: num,max,pointer */ + err = ext4_ext_get_access(handle, inode, curp); + if (err) + goto out; + + curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; + curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + curp->p_hdr->eh_entries = cpu_to_le16(1); + curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); + + if (path[0].p_hdr->eh_depth) + curp->p_idx->ei_block = + EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; + else + curp->p_idx->ei_block = + EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; + ext4_idx_store_pblock(curp->p_idx, newblock); + + neh = ext_inode_hdr(inode); + ext_debug("new root: num %d(%d), lblock %d, 
ptr %llu\n", + le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), + le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), + ext4_idx_pblock(EXT_FIRST_INDEX(neh))); + + neh->eh_depth = cpu_to_le16(path->p_depth + 1); + err = ext4_ext_dirty(handle, inode, curp); +out: + brelse(bh); + + return err; +} + +/* + * ext4_ext_create_new_leaf: + * finds empty index and adds new leaf. + * if no free index is found, then it requests in-depth growing. + */ +static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) +{ + struct ext4_ext_path *curp; + int depth, i, err = 0; + +repeat: + i = depth = ext_depth(inode); + + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block, + * so subsequent data blocks should be contiguous */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext4_ext_split(handle, inode, flags, path, newext, i); + if (err) + goto out; + + /* refill path */ + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) + err = PTR_ERR(path); + } else { + /* tree is full, time to grow in depth */ + err = ext4_ext_grow_indepth(handle, inode, flags, + path, newext); + if (err) + goto out; + + /* refill path */ + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + + /* + * only first (depth 0 -> 1) produces free space; + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need to split */ + goto repeat; + } + } + +out: + return err; +} + +/* + * search the closest allocated block to the left for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +static int ext4_ext_search_left(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) +{ + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + int depth, ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", + *logical, le32_to_cpu(ex->ee_block)); + return -EIO; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", + ix != NULL ? ix->ei_block : 0, + EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 
+ EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, + depth); + return -EIO; + } + } + return 0; + } + + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } + + *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; + *phys = ext4_ext_pblock(ex) + ee_len - 1; + return 0; +} + +/* + * search the closest allocated block to the right for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +static int ext4_ext_search_right(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) +{ + struct buffer_head *bh = NULL; + struct ext4_extent_header *eh; + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + ext4_fsblk_t block; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "first_extent(path[%d].p_hdr) != ex", + depth); + return -EIO; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix != EXT_FIRST_INDEX *logical %d!", + *logical); + return -EIO; + } + } + *logical = le32_to_cpu(ex->ee_block); + *phys = ext4_ext_pblock(ex); + return 0; + } + + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } + + if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { + /* next allocated block in this leaf */ + ex++; + *logical = le32_to_cpu(ex->ee_block); + *phys = ext4_ext_pblock(ex); + return 0; + } + + /* go up and search for index to the right */ + while (--depth >= 0) { + ix = path[depth].p_idx; + if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) + goto got_index; + } + + /* we've gone up to the root and found no index to the right */ + return 0; + +got_index: + /* we've found index to the right, let's + * follow it and find the closest allocated + * block to the right */ + ix++; + block = ext4_idx_pblock(ix); + while (++depth < path->p_depth) { + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + /* subtract from p_depth to get proper eh_depth */ + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + put_bh(bh); + return -EIO; + } + ix = EXT_FIRST_INDEX(eh); + block = ext4_idx_pblock(ix); + put_bh(bh); + } + + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + put_bh(bh); + return -EIO; + } + ex = EXT_FIRST_EXTENT(eh); + *logical = le32_to_cpu(ex->ee_block); + *phys = ext4_ext_pblock(ex); + put_bh(bh); + return 0; +} + +/* + * ext4_ext_next_allocated_block: + * returns allocated block in subsequent extent or 
EXT_MAX_BLOCKS. + * NOTE: it considers block number from index entry as + * allocated block. Thus, index entries have to be consistent + * with leaves. + */ +static ext4_lblk_t +ext4_ext_next_allocated_block(struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + if (depth == 0 && path->p_ext == NULL) + return EXT_MAX_BLOCKS; + + while (depth >= 0) { + if (depth == path->p_depth) { + /* leaf */ + if (path[depth].p_ext != + EXT_LAST_EXTENT(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_ext[1].ee_block); + } else { + /* index */ + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_idx[1].ei_block); + } + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_next_leaf_block: + * returns first allocated block from next leaf or EXT_MAX_BLOCKS + */ +static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, + struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + /* zero-tree has no leaf blocks at all */ + if (depth == 0) + return EXT_MAX_BLOCKS; + + /* go to index block */ + depth--; + + while (depth >= 0) { + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return (ext4_lblk_t) + le32_to_cpu(path[depth].p_idx[1].ei_block); + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_correct_indexes: + * if leaf gets modified and modified extent is first in the leaf, + * then we have to correct all indexes above. + * TODO: do we need to correct tree in all cases? + */ +static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + int depth = ext_depth(inode); + struct ext4_extent *ex; + __le32 border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + if (unlikely(ex == NULL || eh == NULL)) { + EXT4_ERROR_INODE(inode, + "ex %p == NULL or eh %p == NULL", ex, eh); + return -EIO; + } + + if (depth == 0) { + /* there is no tree at all */ + return 0; + } + + if (ex != EXT_FIRST_EXTENT(eh)) { + /* we correct tree if first leaf got modified only */ + return 0; + } + + /* + * TODO: we need correction if border is smaller than current one + */ + k = depth - 1; + border = path[depth].p_ext->ee_block; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + return err; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + return err; + + while (k--) { + /* change all left-side indexes */ + if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + break; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + break; + } + + return err; +} + +int +ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, + struct ext4_extent *ex2) +{ + unsigned short ext1_ee_len, ext2_ee_len, max_len; + + /* + * Make sure that either both extents are uninitialized, or + * both are _not_. 
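In other words, two extents are mergeable only when they share the same initialized/uninitialized state, are logically and physically adjacent, and their combined length still fits in ee_len. A minimal user-space sketch of the same conditions (editorial illustration, not part of the patch; the struct ext and can_merge names are invented, and the fields are plain host-endian values rather than the on-disk little-endian types):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct ext {
        uint32_t lblock;   /* first logical block   */
        uint64_t pblock;   /* first physical block  */
        uint16_t len;      /* number of blocks      */
        bool     uninit;   /* uninitialized extent? */
    };

    /* Same state, logically adjacent, combined length in range,
     * physically adjacent: only then can two extents be merged. */
    static bool can_merge(const struct ext *a, const struct ext *b)
    {
        unsigned max_len = a->uninit ? 32767 : 32768;

        if (a->uninit != b->uninit)
            return false;
        if (a->lblock + a->len != b->lblock)
            return false;
        if ((unsigned)a->len + b->len > max_len)
            return false;
        return a->pblock + a->len == b->pblock;
    }

    int main(void)
    {
        struct ext a = { 0,  1000, 8, false };
        struct ext b = { 8,  1008, 4, false };   /* adjacent: merges        */
        struct ext c = { 12, 2000, 4, false };   /* physical gap: no merge  */

        printf("a+b: %d, b+c: %d\n", can_merge(&a, &b), can_merge(&b, &c));
        return 0;
    }

The 32768/32767 limits correspond to EXT_INIT_MAX_LEN and EXT_UNINIT_MAX_LEN in ext4_extents.h.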
+ */ + if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + return 0; + + if (ext4_ext_is_uninitialized(ex1)) + max_len = EXT_UNINIT_MAX_LEN; + else + max_len = EXT_INIT_MAX_LEN; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != + le32_to_cpu(ex2->ee_block)) + return 0; + + /* + * To allow future support for preallocated extents to be added + * as an RO_COMPAT feature, refuse to merge to extents if + * this can result in the top bit of ee_len being set. + */ + if (ext1_ee_len + ext2_ee_len > max_len) + return 0; +#ifdef AGGRESSIVE_TEST + if (ext1_ee_len >= 4) + return 0; +#endif + + if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) + return 1; + return 0; +} + +/* + * This function tries to merge the "ex" extent to the next extent in the tree. + * It always tries to merge towards right. If you want to merge towards + * left, pass "ex - 1" as argument instead of "ex". + * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns + * 1 if they got merged. + */ +static int ext4_ext_try_to_merge_right(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) +{ + struct ext4_extent_header *eh; + unsigned int depth, len; + int merge_done = 0; + int uninitialized = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + while (ex < EXT_LAST_EXTENT(eh)) { + if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) + break; + /* merge with next extent! */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(ex + 1)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + + if (ex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - ex - 1) + * sizeof(struct ext4_extent); + memmove(ex + 1, ex + 2, len); + } + le16_add_cpu(&eh->eh_entries, -1); + merge_done = 1; + WARN_ON(eh->eh_entries == 0); + if (!eh->eh_entries) + EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); + } + + return merge_done; +} + +/* + * This function tries to merge the @ex extent to neighbours in the tree. + * return 1 if merge left else 0. + */ +static int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { + struct ext4_extent_header *eh; + unsigned int depth; + int merge_done = 0; + int ret = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + if (ex > EXT_FIRST_EXTENT(eh)) + merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); + + if (!merge_done) + ret = ext4_ext_try_to_merge_right(inode, path, ex); + + return ret; +} + +/* + * check if a portion of the "newext" extent overlaps with an + * existing extent. + * + * If there is an overlap discovered, it updates the length of the newext + * such that there will be no overlap, and then returns 1. + * If there is no overlap found, it returns 0. 
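For example, if newext covers logical blocks 100..109 (b1 = 100, len1 = 10) and the leaf already holds an extent starting at block 105 (b2 = 105), then b1 + len1 = 110 > b2, so newext->ee_len is trimmed to b2 - b1 = 5 and the function returns 1, leaving only the non-overlapping blocks 100..104 in newext.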
+ */ +static unsigned int ext4_ext_check_overlap(struct inode *inode, + struct ext4_extent *newext, + struct ext4_ext_path *path) +{ + ext4_lblk_t b1, b2; + unsigned int depth, len1; + unsigned int ret = 0; + + b1 = le32_to_cpu(newext->ee_block); + len1 = ext4_ext_get_actual_len(newext); + depth = ext_depth(inode); + if (!path[depth].p_ext) + goto out; + b2 = le32_to_cpu(path[depth].p_ext->ee_block); + + /* + * get the next allocated block if the extent in the path + * is before the requested block(s) + */ + if (b2 < b1) { + b2 = ext4_ext_next_allocated_block(path); + if (b2 == EXT_MAX_BLOCKS) + goto out; + } + + /* check for wrap through zero on extent logical start block*/ + if (b1 + len1 < b1) { + len1 = EXT_MAX_BLOCKS - b1; + newext->ee_len = cpu_to_le16(len1); + ret = 1; + } + + /* check for overlap */ + if (b1 + len1 > b2) { + newext->ee_len = cpu_to_le16(b2 - b1); + ret = 1; + } +out: + return ret; +} + +/* + * ext4_ext_insert_extent: + * tries to merge requsted extent into the existing extent or + * inserts requested extent as new one into the tree, + * creating new leaf in the no-space case. + */ +int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int flag) +{ + struct ext4_extent_header *eh; + struct ext4_extent *ex, *fex; + struct ext4_extent *nearex; /* nearest extent */ + struct ext4_ext_path *npath = NULL; + int depth, len, err; + ext4_lblk_t next; + unsigned uninitialized = 0; + int flags = 0; + + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + return -EIO; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } + + /* try to insert block into found extent and return */ + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) + && ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked that either + * both extents are uninitialized, or both aren't. Thus we + * need to check only one of them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +repeat: + depth = ext_depth(inode); + eh = path[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) + goto has_space; + + /* probably next leaf has space for us? 
*/ + fex = EXT_LAST_EXTENT(eh); + next = ext4_ext_next_leaf_block(inode, path); + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) + && next != EXT_MAX_BLOCKS) { + ext_debug("next leaf block - %d\n", next); + BUG_ON(npath != NULL); + npath = ext4_ext_find_extent(inode, next, NULL); + if (IS_ERR(npath)) + return PTR_ERR(npath); + BUG_ON(npath->p_depth != path->p_depth); + eh = npath[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { + ext_debug("next leaf isn't full(%d)\n", + le16_to_cpu(eh->eh_entries)); + path = npath; + goto repeat; + } + ext_debug("next leaf has no free space(%d,%d)\n", + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + } + + /* + * There is no free space in the found leaf. + * We're gonna add a new leaf in the tree. + */ + if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) + flags = EXT4_MB_USE_ROOT_BLOCKS; + err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + if (err) + goto cleanup; + depth = ext_depth(inode); + eh = path[depth].p_hdr; + +has_space: + nearex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext)); + path[depth].p_ext = EXT_FIRST_EXTENT(eh); + } else if (le32_to_cpu(newext->ee_block) + > le32_to_cpu(nearex->ee_block)) { +/* BUG_ON(newext->ee_block == nearex->ee_block); */ + if (nearex != EXT_LAST_EXTENT(eh)) { + len = EXT_MAX_EXTENT(eh) - nearex; + len = (len - 1) * sizeof(struct ext4_extent); + len = len < 0 ? 0 : len; + ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 2, nearex + 1, len); + } + path[depth].p_ext = nearex + 1; + } else { + BUG_ON(newext->ee_block == nearex->ee_block); + len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); + len = len < 0 ? 
0 : len; + ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 1, nearex, len); + path[depth].p_ext = nearex; + } + + le16_add_cpu(&eh->eh_entries, 1); + nearex = path[depth].p_ext; + nearex->ee_block = newext->ee_block; + ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); + nearex->ee_len = newext->ee_len; + +merge: + /* try to merge extents to the right */ + if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(inode, path, nearex); + + /* try to merge extents to the left */ + + /* time to correct all indexes above */ + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto cleanup; + + err = ext4_ext_dirty(handle, inode, path + depth); + +cleanup: + if (npath) { + ext4_ext_drop_refs(npath); + kfree(npath); + } + ext4_ext_invalidate_cache(inode); + return err; +} + +static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +{ + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; + struct ext4_extent *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_MAX_BLOCKS) { + num = last - block; + /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); + path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + err = -EIO; + break; + } + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext4_ext_pblock(ex); + } + + if (unlikely(cbex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); + err = -EIO; + break; + } + err = func(inode, next, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) + break; + + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. 
we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; +} + +static void +ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, + __u32 len, ext4_fsblk_t start) +{ + struct ext4_ext_cache *cex; + BUG_ON(len == 0); + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + cex = &EXT4_I(inode)->i_cached_extent; + cex->ec_block = block; + cex->ec_len = len; + cex->ec_start = start; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +/* + * ext4_ext_put_gap_in_cache: + * calculate boundaries of the gap that the requested block fits into + * and cache this gap + */ +static void +ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, + ext4_lblk_t block) +{ + int depth = ext_depth(inode); + unsigned long len; + ext4_lblk_t lblock; + struct ext4_extent *ex; + + ex = path[depth].p_ext; + if (ex == NULL) { + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCKS; + ext_debug("cache gap(whole file):"); + } else if (block < le32_to_cpu(ex->ee_block)) { + lblock = block; + len = le32_to_cpu(ex->ee_block) - block; + ext_debug("cache gap(before): %u [%u:%u]", + block, + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + ext4_lblk_t next; + lblock = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + + next = ext4_ext_next_allocated_block(path); + ext_debug("cache gap(after): [%u:%u] %u", + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + block); + BUG_ON(next == lblock); + len = next - lblock; + } else { + lblock = len = 0; + BUG(); + } + + ext_debug(" -> %u:%lu\n", lblock, len); + ext4_ext_put_in_cache(inode, lblock, len, 0); +} + +/* + * ext4_ext_in_cache() + * Checks to see if the given block is in the cache. + * If it is, the cached extent is stored in the given + * cache extent pointer. If the cached extent is a hole, + * this routine should be used instead of + * ext4_ext_in_cache if the calling function needs to + * know the size of the hole. + * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * + * Return 0 if cache is invalid; 1 if the cache is valid + */ +static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_cache *ex){ + struct ext4_ext_cache *cex; + struct ext4_sb_info *sbi; + int ret = 0; + + /* + * We borrow i_block_reservation_lock to protect i_cached_extent + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + cex = &EXT4_I(inode)->i_cached_extent; + sbi = EXT4_SB(inode->i_sb); + + /* has cache valid data? */ + if (cex->ec_len == 0) + goto errout; + + if (in_range(block, cex->ec_block, cex->ec_len)) { + memcpy(ex, cex, sizeof(struct ext4_ext_cache)); + ext_debug("%u cached by %u:%u:%llu\n", + block, + cex->ec_block, cex->ec_len, cex->ec_start); + ret = 1; + } +errout: + if (!ret) + sbi->extent_cache_misses++; + else + sbi->extent_cache_hits++; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return ret; +} + +/* + * ext4_ext_in_cache() + * Checks to see if the given block is in the cache. + * If it is, the cached extent is stored in the given + * extent pointer. 
+ * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * + * Return 0 if cache is invalid; 1 if the cache is valid + */ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ + struct ext4_ext_cache cex; + int ret = 0; + + if (ext4_ext_check_cache(inode, block, &cex)) { + ex->ee_block = cpu_to_le32(cex.ec_block); + ext4_ext_store_pblock(ex, cex.ec_start); + ex->ee_len = cpu_to_le16(cex.ec_len); + ret = 1; + } + + return ret; +} + + +/* + * ext4_ext_rm_idx: + * removes index from the index block. + * It's used in truncate case only, thus all requests are for + * last index in the block only. + */ +static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int err; + ext4_fsblk_t leaf; + + /* free index block */ + path--; + leaf = ext4_idx_pblock(path->p_idx); + if (unlikely(path->p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + return -EIO; + } + err = ext4_ext_get_access(handle, inode, path); + if (err) + return err; + le16_add_cpu(&path->p_hdr->eh_entries, -1); + err = ext4_ext_dirty(handle, inode, path); + if (err) + return err; + ext_debug("index is empty, remove it, free block %llu\n", leaf); + ext4_free_blocks(handle, inode, NULL, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + return err; +} + +/* + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. + */ +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, + struct ext4_ext_path *path) +{ + if (path) { + int depth = ext_depth(inode); + int ret = 0; + + /* probably there is space in leaf? */ + if (le16_to_cpu(path[depth].p_hdr->eh_entries) + < le16_to_cpu(path[depth].p_hdr->eh_max)) { + + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadat blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + return ret; + } + } + + return ext4_chunk_trans_blocks(inode, nrblocks); +} + +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. 
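For example, with a tree of depth 2 this charges 2 * 2 = 4 index/leaf blocks when nrblocks form a single contiguous chunk, and 2 * 3 = 6 when they are discontiguous, the extra block per level covering the additional splits a discontiguous allocation can trigger.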
+ */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int index; + int depth = ext_depth(inode); + + if (chunk) + index = depth * 2; + else + index = depth * 3; + + return index; +} + +static int ext4_remove_blocks(handle_t *handle, struct inode *inode, + struct ext4_extent *ex, + ext4_lblk_t from, ext4_lblk_t to) +{ + unsigned short ee_len = ext4_ext_get_actual_len(ex); + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; +#ifdef EXTENTS_STATS + { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); + } +#endif + if (from >= le32_to_cpu(ex->ee_block) + && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { + /* tail removal */ + ext4_lblk_t num; + ext4_fsblk_t start; + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + start = ext4_ext_pblock(ex) + ee_len - num; + ext_debug("free last %u blocks starting %llu\n", num, start); + ext4_free_blocks(handle, inode, NULL, start, num, flags); + } else if (from == le32_to_cpu(ex->ee_block) + && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { + /* head removal */ + ext4_lblk_t num; + ext4_fsblk_t start; + + num = to - from; + start = ext4_ext_pblock(ex); + + ext_debug("free first %u blocks starting %llu\n", num, start); + ext4_free_blocks(handle, inode, 0, start, num, flags); + + } else { + printk(KERN_INFO "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); + } + return 0; +} + + +/* + * ext4_ext_rm_leaf() Removes the extents associated with the + * blocks appearing between "start" and "end", and splits the extents + * if "start" and "end" appear in the same extent + * + * @handle: The journal handle + * @inode: The files inode + * @path: The path to the leaf + * @start: The first block to remove + * @end: The last block to remove + */ +static int +ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t start, + ext4_lblk_t end) +{ + int err = 0, correct_index = 0; + int depth = ext_depth(inode), credits; + struct ext4_extent_header *eh; + ext4_lblk_t a, b, block; + unsigned num; + ext4_lblk_t ex_ee_block; + unsigned short ex_ee_len; + unsigned uninitialized = 0; + struct ext4_extent *ex; + struct ext4_map_blocks map; + + /* the header must be checked already in ext4_ext_remove_space() */ + ext_debug("truncate since %u in leaf\n", start); + if (!path[depth].p_hdr) + path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); + eh = path[depth].p_hdr; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } + /* find where to start removing */ + ex = EXT_LAST_EXTENT(eh); + + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + + while (ex >= EXT_FIRST_EXTENT(eh) && + ex_ee_block + ex_ee_len > start) { + + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + else + uninitialized = 0; + + ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + uninitialized, ex_ee_len); + path[depth].p_ext = ex; + + a = ex_ee_block > start ? ex_ee_block : start; + b = ex_ee_block+ex_ee_len - 1 < end ? 
+ ex_ee_block+ex_ee_len - 1 : end; + + ext_debug(" border %u:%u\n", a, b); + + /* If this extent is beyond the end of the hole, skip it */ + if (end <= ex_ee_block) { + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + continue; + } else if (a != ex_ee_block && + b != ex_ee_block + ex_ee_len - 1) { + /* + * If this is a truncate, then this condition should + * never happen because at least one of the end points + * needs to be on the edge of the extent. + */ + if (end == EXT_MAX_BLOCKS - 1) { + ext_debug(" bad truncate %u:%u\n", + start, end); + block = 0; + num = 0; + err = -EIO; + goto out; + } + /* + * else this is a hole punch, so the extent needs to + * be split since neither edge of the hole is on the + * extent edge + */ + else{ + map.m_pblk = ext4_ext_pblock(ex); + map.m_lblk = ex_ee_block; + map.m_len = b - ex_ee_block; + + err = ext4_split_extent(handle, + inode, path, &map, 0, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT | + EXT4_GET_BLOCKS_PRE_IO); + + if (err < 0) + goto out; + + ex_ee_len = ext4_ext_get_actual_len(ex); + + b = ex_ee_block+ex_ee_len - 1 < end ? + ex_ee_block+ex_ee_len - 1 : end; + + /* Then remove tail of this extent */ + block = ex_ee_block; + num = a - block; + } + } else if (a != ex_ee_block) { + /* remove tail of the extent */ + block = ex_ee_block; + num = a - block; + } else if (b != ex_ee_block + ex_ee_len - 1) { + /* remove head of the extent */ + block = b; + num = ex_ee_block + ex_ee_len - b; + + /* + * If this is a truncate, this condition + * should never happen + */ + if (end == EXT_MAX_BLOCKS - 1) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } + } else { + /* remove whole extent: excellent! */ + block = ex_ee_block; + num = 0; + if (a != ex_ee_block) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } + + if (b != ex_ee_block + ex_ee_len - 1) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } + } + + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); + if (ex == EXT_FIRST_EXTENT(eh)) { + correct_index = 1; + credits += (ext_depth(inode)) + 1; + } + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + if (err) + goto out; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + err = ext4_remove_blocks(handle, inode, ex, a, b); + if (err) + goto out; + + if (num == 0) { + /* this extent is removed; mark slot entirely unused */ + ext4_ext_store_pblock(ex, 0); + } else if (block != ex_ee_block) { + /* + * If this was a head removal, then we need to update + * the physical block since it is now at a different + * location + */ + ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); + } + + ex->ee_block = cpu_to_le32(block); + ex->ee_len = cpu_to_le16(num); + /* + * Do not mark uninitialized if all the blocks in the + * extent have been removed. 
+ */ + if (uninitialized && num) + ext4_ext_mark_uninitialized(ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* + * If the extent was completely released, + * we need to remove it from the leaf + */ + if (num == 0) { + if (end != EXT_MAX_BLOCKS - 1) { + /* + * For hole punching, we need to scoot all the + * extents up when an extent is removed so that + * we dont have blank extents in the middle + */ + memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * + sizeof(struct ext4_extent)); + + /* Now get rid of the one at the end */ + memset(EXT_LAST_EXTENT(eh), 0, + sizeof(struct ext4_extent)); + } + le16_add_cpu(&eh->eh_entries, -1); + } + + ext_debug("new extent: %u:%u:%llu\n", block, num, + ext4_ext_pblock(ex)); + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + } + + if (correct_index && eh->eh_entries) + err = ext4_ext_correct_indexes(handle, inode, path); + + /* if this leaf is free, then we should + * remove it from index block above */ + if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) + err = ext4_ext_rm_idx(handle, inode, path + depth); + +out: + return err; +} + +/* + * ext4_ext_more_to_rm: + * returns 1 if current index has to be freed (even partial) + */ +static int +ext4_ext_more_to_rm(struct ext4_ext_path *path) +{ + BUG_ON(path->p_idx == NULL); + + if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) + return 0; + + /* + * if truncate on deeper level happened, it wasn't partial, + * so we have to consider current index for truncation + */ + if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) + return 0; + return 1; +} + +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct super_block *sb = inode->i_sb; + int depth = ext_depth(inode); + struct ext4_ext_path *path; + handle_t *handle; + int i, err; + + ext_debug("truncate since %u\n", start); + + /* probably first extent we're gonna free will be last in block */ + handle = ext4_journal_start(inode, depth + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +again: + ext4_ext_invalidate_cache(inode); + + /* + * We start scanning from right side, freeing all the blocks + * after i_size and walking into the tree depth-wise. 
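When an extent in the middle of a leaf is fully released during a hole punch, the remaining entries are scooted down with memmove() and the freed slot at the end is cleared. The same pattern in plain userspace C, with an invented struct, looks like this:

#include <string.h>

struct entry {
	unsigned int block;
	unsigned int len;
};

/*
 * Remove element i from a densely packed array of nr entries by moving the
 * later entries down one slot and clearing the now-unused last slot, the
 * same memmove()/memset() pair used for hole punching in ext4_ext_rm_leaf().
 */
static void remove_entry(struct entry *arr, unsigned int *nr, unsigned int i)
{
	memmove(&arr[i], &arr[i + 1], (*nr - i - 1) * sizeof(arr[0]));
	memset(&arr[*nr - 1], 0, sizeof(arr[0]));
	(*nr)--;
}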
+ */ + depth = ext_depth(inode); + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); + if (path == NULL) { + ext4_journal_stop(handle); + return -ENOMEM; + } + path[0].p_depth = depth; + path[0].p_hdr = ext_inode_hdr(inode); + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { + err = -EIO; + goto out; + } + i = err = 0; + + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ + err = ext4_ext_rm_leaf(handle, inode, path, + start, end); + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + + /* this is index block */ + if (!path[i].p_hdr) { + ext_debug("initialize header\n"); + path[i].p_hdr = ext_block_hdr(path[i].p_bh); + } + + if (!path[i].p_idx) { + /* this level hasn't been touched yet */ + path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; + ext_debug("init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, + le16_to_cpu(path[i].p_hdr->eh_entries)); + } else { + /* we were already here, see at next index */ + path[i].p_idx--; + } + + ext_debug("level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); + if (ext4_ext_more_to_rm(path + i)) { + struct buffer_head *bh; + /* go to the next level */ + ext_debug("move to level %d (block %llu)\n", + i + 1, ext4_idx_pblock(path[i].p_idx)); + memset(path + i + 1, 0, sizeof(*path)); + bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); + if (!bh) { + /* should we reset i_size? */ + err = -EIO; + break; + } + if (WARN_ON(i + 1 > depth)) { + err = -EIO; + break; + } + if (ext4_ext_check(inode, ext_block_hdr(bh), + depth - i - 1)) { + err = -EIO; + break; + } + path[i + 1].p_bh = bh; + + /* save actual number of indexes since this + * number is changed at the next iteration */ + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); + i++; + } else { + /* we finished processing this index, go up */ + if (path[i].p_hdr->eh_entries == 0 && i > 0) { + /* index is empty, remove it; + * handle must be already prepared by the + * truncatei_leaf() */ + err = ext4_ext_rm_idx(handle, inode, path + i); + } + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + ext_debug("return to level %d\n", i); + } + } + + /* TODO: flexible tree reduction should be here */ + if (path->p_hdr->eh_entries == 0) { + /* + * truncate to zero freed all the tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root(inode, 0)); + err = ext4_ext_dirty(handle, inode, path); + } + } +out: + ext4_ext_drop_refs(path); + kfree(path); + if (err == -EAGAIN) + goto again; + ext4_journal_stop(handle); + + return err; +} + +/* + * called at mount time + */ +void ext4_ext_init(struct super_block *sb) +{ + /* + * possible initialization would be here + */ + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { +#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) + printk(KERN_INFO "EXT4-fs: file extents enabled"); +#ifdef AGGRESSIVE_TEST + printk(", aggressive tests"); +#endif +#ifdef CHECK_BINSEARCH + printk(", check binsearch"); +#endif +#ifdef EXTENTS_STATS + printk(", stats"); +#endif + printk("\n"); +#endif +#ifdef EXTENTS_STATS + spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); + EXT4_SB(sb)->s_ext_min = 1 << 30; + 
EXT4_SB(sb)->s_ext_max = 0; +#endif + } +} + +/* + * called at umount time + */ +void ext4_ext_release(struct super_block *sb) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return; + +#ifdef EXTENTS_STATS + if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", + sbi->s_ext_blocks, sbi->s_ext_extents, + sbi->s_ext_blocks / sbi->s_ext_extents); + printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", + sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); + } +#endif +} + +/* FIXME!! we need to try to merge to left or right after zero-out */ +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + int ret; + + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ +#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ + +/* + * ext4_split_extent_at() splits an extent at given block. + * + * @handle: the journal handle + * @inode: the file inode + * @path: the path to the extent + * @split: the logical block where the extent is splitted. + * @split_flags: indicates if the extent could be zeroout if split fails, and + * the states(init or uninit) of new extents. + * @flags: flags used to insert new extent to extent tree. + * + * + * Splits extent [a, b] into two extents [a, @split) and [@split, b], states + * of which are deterimined by split_flag. + * + * There are two cases: + * a> the extent are splitted into two extent. + * b> split is not needed, and just mark the extent. + * + * return 0 on success. + */ +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags) +{ + ext4_fsblk_t newblock; + ext4_lblk_t ee_block; + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex2 = NULL; + unsigned int ee_len, depth; + int err = 0; + + ext_debug("ext4_split_extents_at: inode %lu, logical" + "block %llu\n", inode->i_ino, (unsigned long long)split); + + ext4_ext_show_leaf(inode, path); + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + newblock = split - ee_block + ext4_ext_pblock(ex); + + BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + if (split == ee_block) { + /* + * case b: block @split is the block that the extent begins with + * then we just change the state of the extent, and splitting + * is not needed. 
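ext4_split_extent_at() does its real work with a small piece of arithmetic: the second half of the split keeps the original extent's physical run, shifted by the number of logical blocks kept in the first half. A self-contained sketch of that arithmetic, with an invented struct, is below.

struct simple_extent {
	unsigned int lblk;		/* first logical block */
	unsigned long long pblk;	/* first physical block */
	unsigned int len;		/* length in blocks */
};

/*
 * Split one mapping at logical block 'split' (which must lie strictly
 * inside it). The right half starts at pblk + (split - lblk), which is
 * the "newblock" computed in ext4_split_extent_at().
 */
static void split_at(const struct simple_extent *ex, unsigned int split,
		     struct simple_extent *left, struct simple_extent *right)
{
	left->lblk = ex->lblk;
	left->pblk = ex->pblk;
	left->len = split - ex->lblk;

	right->lblk = split;
	right->pblk = ex->pblk + (split - ex->lblk);
	right->len = ex->len - left->len;
}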
+ */ + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex); + else + ext4_ext_mark_initialized(ex); + + if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } + + /* case a */ + memcpy(&orig_ex, ex, sizeof(orig_ex)); + ex->ee_len = cpu_to_le16(split - ee_block); + if (split_flag & EXT4_EXT_MARK_UNINIT1) + ext4_ext_mark_uninitialized(ex); + + /* + * path may lead to new leaf, not to original leaf any more + * after ext4_ext_insert_extent() returns, + */ + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto fix_extent_len; + + ex2 = &newex; + ex2->ee_block = cpu_to_le32(split); + ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); + ext4_ext_store_pblock(ex2, newblock); + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex2); + + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le16(ee_len); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } else if (err) + goto fix_extent_len; + +out: + ext4_ext_show_leaf(inode, path); + return err; + +fix_extent_len: + ex->ee_len = orig_ex.ee_len; + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * ext4_split_extents() splits an extent and mark extent which is covered + * by @map as split_flags indicates + * + * It may result in splitting the extent into multiple extents (upto three) + * There are three possibilities: + * a> There is no split required + * b> Splits in two extents: Split is happening at either end of the extent + * c> Splits in three extents: Somone is splitting in middle of the extent + * + */ +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags) +{ + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len, depth; + int err = 0; + int uninitialized; + int split_flag1, flags1; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + uninitialized = ext4_ext_is_uninitialized(ex); + + if (map->m_lblk + map->m_len < ee_block + ee_len) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk + map->m_len, split_flag1, flags1); + if (err) + goto out; + } + + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); + if (IS_ERR(path)) + return PTR_ERR(path); + + if (map->m_lblk >= ee_block) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (split_flag & EXT4_EXT_MARK_UNINIT2) + split_flag1 |= EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk, split_flag1, flags); + if (err) + goto out; + } + + ext4_ext_show_leaf(inode, path); +out: + return err ? 
err : map->m_len; +} + +#define EXT4_EXT_ZERO_LEN 7 +/* + * This function is called by ext4_ext_map_blocks() if someone tries to write + * to an uninitialized extent. It may result in splitting the uninitialized + * extent into multiple extents (up to three - one initialized and two + * uninitialized). + * There are three possibilities: + * a> There is no split required: Entire extent should be initialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + */ +static int ext4_ext_convert_to_initialized(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_map_blocks split_map; + struct ext4_extent zero_ex; + struct ext4_extent *ex; + ext4_lblk_t ee_block, eof_block; + unsigned int allocated, ee_len, depth; + int err = 0; + int split_flag = 0; + + ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (map->m_lblk - ee_block); + + WARN_ON(map->m_lblk < ee_block); + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + + /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, ex); + if (err) + goto out; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + ext4_ext_mark_initialized(ex); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } + + /* + * four cases: + * 1. split the extent into three extents. + * 2. split the extent into two extents, zeroout the first half. + * 3. split the extent into two extents, zeroout the second half. + * 4. split the extent into two extents with out zeroout. 
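The first decision in ext4_ext_convert_to_initialized() is whether to bother splitting at all: an extent of at most 2 * EXT4_EXT_ZERO_LEN blocks is simply zeroed out and marked initialized, because writing a handful of zeroed blocks is cheaper than growing the extent tree. A compact restatement of that rule; the constant is copied from the code above, everything else is invented for illustration.

#define ZERO_LEN 7	/* EXT4_EXT_ZERO_LEN in the patch */

enum convert_action {
	ZEROOUT_WHOLE,	/* zero the whole extent, mark it initialized */
	SPLIT_EXTENT,	/* split around the written range instead */
};

static enum convert_action pick_action(unsigned int ee_len, int may_zeroout)
{
	if (may_zeroout && ee_len <= 2 * ZERO_LEN)
		return ZEROOUT_WHOLE;
	return SPLIT_EXTENT;
}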
+ */ + split_map.m_lblk = map->m_lblk; + split_map.m_len = map->m_len; + + if (allocated > map->m_len) { + if (allocated <= EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 3 */ + zero_ex.ee_block = + cpu_to_le32(map->m_lblk); + zero_ex.ee_len = cpu_to_le16(allocated); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex) + map->m_lblk - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + split_map.m_lblk = map->m_lblk; + split_map.m_len = allocated; + } else if ((map->m_lblk - ee_block + map->m_len < + EXT4_EXT_ZERO_LEN) && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 2 */ + if (map->m_lblk != ee_block) { + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(map->m_lblk - + ee_block); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + } + + split_map.m_lblk = ee_block; + split_map.m_len = map->m_lblk - ee_block + map->m_len; + allocated = map->m_len; + } + } + + allocated = ext4_split_extent(handle, inode, path, + &split_map, split_flag, 0); + if (allocated < 0) + err = allocated; + +out: + return err ? err : allocated; +} + +/* + * This function is called by ext4_ext_map_blocks() from + * ext4_get_blocks_dio_write() when DIO to write + * to an uninitialized extent. + * + * Writing to an uninitialized extent may result in splitting the uninitialized + * extent into multiple /initialized uninitialized extents (up to three) + * There are three possibilities: + * a> There is no split required: Entire extent should be uninitialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * One of more index blocks maybe needed if the extent tree grow after + * the uninitialized extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the uninitialized extent before DIO submit + * the IO. The uninitialized extent called at this time will be split + * into three uninitialized extent(at most). After IO complete, the part + * being filled will be convert to initialized by the end_io callback function + * via ext4_convert_unwritten_extents(). + * + * Returns the size of uninitialized extent to be written on success. + */ +static int ext4_split_unwritten_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, + int flags) +{ + ext4_lblk_t eof_block; + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len; + int split_flag = 0, depth; + + ext_debug("ext4_split_unwritten_extents: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + split_flag |= ee_block + ee_len <= eof_block ? 
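Both conversion paths derive eof_block by rounding i_size up to a block boundary; zeroout is only considered safe when the extent ends at or before that block. The rounding, spelled out as a standalone helper with an example (names are illustrative):

/*
 * Number of the first logical block past EOF: i_size rounded up to a
 * block boundary and shifted down, exactly the eof_block expression above.
 */
static unsigned long long size_to_eof_block(unsigned long long i_size,
					    unsigned int blkbits)
{
	unsigned long long blocksize = 1ULL << blkbits;

	return (i_size + blocksize - 1) >> blkbits;
}

/* e.g. size_to_eof_block(5000, 12) == 2 with 4 KiB blocks: blocks 0 and 1 hold data */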
EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= EXT4_EXT_MARK_UNINIT2; + + flags |= EXT4_GET_BLOCKS_PRE_IO; + return ext4_split_extent(handle, inode, path, map, split_flag, flags); +} + +static int ext4_convert_unwritten_extents_endio(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + struct ext4_extent_header *eh; + int depth; + int err = 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + +static void unmap_underlying_metadata_blocks(struct block_device *bdev, + sector_t block, int count) +{ + int i; + for (i = 0; i < count; i++) + unmap_underlying_metadata(bdev, block + i); +} + +/* + * Handle EOFBLOCKS_FL flag, clearing it if necessary + */ +static int check_eofblocks_fl(handle_t *handle, struct inode *inode, + ext4_lblk_t lblk, + struct ext4_ext_path *path, + unsigned int len) +{ + int i, depth; + struct ext4_extent_header *eh; + struct ext4_extent *last_ex; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + return 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + + if (unlikely(!eh->eh_entries)) { + EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " + "EOFBLOCKS_FL set"); + return -EIO; + } + last_ex = EXT_LAST_EXTENT(eh); + /* + * We should clear the EOFBLOCKS_FL flag if we are writing the + * last block in the last extent in the file. We test this by + * first checking to see if the caller to + * ext4_ext_get_blocks() was interested in the last block (or + * a block beyond the last block) in the current extent. If + * this turns out to be false, we can bail out from this + * function immediately. + */ + if (lblk + len < le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + return 0; + /* + * If the caller does appear to be planning to write at or + * beyond the end of the current extent, we then test to see + * if the current extent is the last extent in the file, by + * checking to make sure it was reached via the rightmost node + * at each level of the tree. 
+ */ + for (i = depth-1; i >= 0; i--) + if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) + return 0; + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + return ext4_mark_inode_dirty(handle, inode); +} + +static int +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" + "block %llu, max_blocks %u, flags %d, allocated %u", + inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, + flags, allocated); + ext4_ext_show_leaf(inode, path); + + /* get_block() before submit the IO, split the extent */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { + ret = ext4_split_unwritten_extents(handle, inode, map, + path, flags); + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to conversion to written when IO is + * completed + */ + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { + io->flag = EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + if (ext4_should_dioread_nolock(inode)) + map->m_flags |= EXT4_MAP_UNINIT; + goto out; + } + /* IO end_io complete, convert the filled extent to written */ + if ((flags & EXT4_GET_BLOCKS_CONVERT)) { + ret = ext4_convert_unwritten_extents_endio(handle, inode, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + goto out2; + } + /* buffered IO case */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) + goto map_out; + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto out1; + } + + /* buffered write, writepage time, convert*/ + ret = ext4_ext_convert_to_initialized(handle, inode, map, path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } + +out: + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + map->m_flags |= EXT4_MAP_NEW; + /* + * if we allocated more blocks than requested + * we need to make sure we unmap the extra block + * allocated. The actual needed block will get + * unmapped later when we find the buffer_head marked + * new. + */ + if (allocated > map->m_len) { + unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, + newblock + map->m_len, + allocated - map->m_len); + allocated = map->m_len; + } + + /* + * If we have done fallocate with the offset that is already + * delayed allocated, we would have block reservation + * and quota reservation done in the delayed write path. + * But fallocate would have already updated quota and block + * count for this offset. 
So cancel these reservation + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 0); + +map_out: + map->m_flags |= EXT4_MAP_MAPPED; +out1: + if (allocated > map->m_len) + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); + map->m_pblk = newblock; + map->m_len = allocated; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return err ? err : allocated; +} + +/* + * Block allocation/map/preallocation routine for extents based files + * + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block + * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * + * return > 0, number of of blocks already mapped/allocated + * if create == 0 and these are pre-allocated blocks + * buffer head is unmapped + * otherwise blocks are mapped + * + * return = 0, if plain look up failed (blocks have not been allocated) + * buffer head is unmapped + * + * return < 0, error case. + */ +int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent newex, *ex; + ext4_fsblk_t newblock = 0; + int err = 0, depth, ret; + unsigned int allocated = 0; + unsigned int punched_out = 0; + unsigned int result = 0; + struct ext4_allocation_request ar; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + struct ext4_map_blocks punch_map; + + ext_debug("blocks %u/%u requested for inode %lu\n", + map->m_lblk, map->m_len, inode->i_ino); + trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + + /* check in cache */ + if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && + ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { + if (!newex.ee_start_lo && !newex.ee_start_hi) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * block isn't allocated yet and + * user doesn't want to allocate it + */ + goto out2; + } + /* we should allocate requested block */ + } else { + /* block is already allocated */ + newblock = map->m_lblk + - le32_to_cpu(newex.ee_block) + + ext4_ext_pblock(&newex); + /* number of remaining blocks in the extent */ + allocated = ext4_ext_get_actual_len(&newex) - + (map->m_lblk - le32_to_cpu(newex.ee_block)); + goto out; + } + } + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, map->m_lblk, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + + /* + * consistent leaf must not be empty; + * this situation is possible, though, _during_ tree modification; + * this is why assert can't be put in ext4_ext_find_extent() + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, "bad extent address " + "lblock: %lu, depth: %d pblock %lld", + (unsigned long) map->m_lblk, depth, + path[depth].p_block); + err = -EIO; + goto out2; + } + + ex = path[depth].p_ext; + if (ex) { + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len; + + /* + * Uninitialized extents are treated as holes, except that + * we split out initialized portions during a write. 
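When the lookup lands inside an existing extent, the physical block and the number of blocks still covered fall out of simple offset arithmetic. A standalone version of that hit test, with invented names:

struct extent_hit {
	unsigned long long pblk;	/* physical block backing lblk */
	unsigned int remaining;		/* blocks mapped from lblk onwards */
};

/*
 * If lblk falls inside [ee_block, ee_block + ee_len), return the physical
 * block for it and how much of the request the extent still covers,
 * mirroring the newblock/allocated computation in ext4_ext_map_blocks().
 */
static int lookup_in_extent(unsigned int lblk, unsigned int ee_block,
			    unsigned int ee_len, unsigned long long ee_start,
			    struct extent_hit *hit)
{
	if (lblk < ee_block || lblk >= ee_block + ee_len)
		return 0;
	hit->pblk = ee_start + (lblk - ee_block);
	hit->remaining = ee_len - (lblk - ee_block);
	return 1;
}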
+ */ + ee_len = ext4_ext_get_actual_len(ex); + /* if found extent covers block, simply return it */ + if (in_range(map->m_lblk, ee_block, ee_len)) { + newblock = map->m_lblk - ee_block + ee_start; + /* number of remaining blocks in the extent */ + allocated = ee_len - (map->m_lblk - ee_block); + ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, + ee_block, ee_len, newblock); + + if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { + /* + * Do not put uninitialized extent + * in the cache + */ + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; + } + ret = ext4_ext_handle_uninitialized_extents( + handle, inode, map, path, flags, + allocated, newblock); + return ret; + } + + /* + * Punch out the map length, but only to the + * end of the extent + */ + punched_out = allocated < map->m_len ? + allocated : map->m_len; + + /* + * Sense extents need to be converted to + * uninitialized, they must fit in an + * uninitialized extent + */ + if (punched_out > EXT_UNINIT_MAX_LEN) + punched_out = EXT_UNINIT_MAX_LEN; + + punch_map.m_lblk = map->m_lblk; + punch_map.m_pblk = newblock; + punch_map.m_len = punched_out; + punch_map.m_flags = 0; + + /* Check to see if the extent needs to be split */ + if (punch_map.m_len != ee_len || + punch_map.m_lblk != ee_block) { + + ret = ext4_split_extent(handle, inode, + path, &punch_map, 0, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT | + EXT4_GET_BLOCKS_PRE_IO); + + if (ret < 0) { + err = ret; + goto out2; + } + /* + * find extent for the block at + * the start of the hole + */ + ext4_ext_drop_refs(path); + kfree(path); + + path = ext4_ext_find_extent(inode, + map->m_lblk, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + + } + + ext4_ext_mark_uninitialized(ex); + + err = ext4_ext_remove_space(inode, map->m_lblk, + map->m_lblk + punched_out); + + goto out2; + } + } + + /* + * requested block isn't allocated yet; + * we couldn't try to create block if create flag is zero + */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * put just found gap into cache to speed up + * subsequent requests + */ + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + goto out2; + } + /* + * Okay, we need to do block allocation. + */ + + /* find neighbour allocated blocks */ + ar.lleft = map->m_lblk; + err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); + if (err) + goto out2; + ar.lright = map->m_lblk; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); + if (err) + goto out2; + + /* + * See if request is beyond maximum number of blocks we can have in + * a single extent. For an initialized extent this limit is + * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is + * EXT_UNINIT_MAX_LEN. 
+ */ + if (map->m_len > EXT_INIT_MAX_LEN && + !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + map->m_len = EXT_INIT_MAX_LEN; + else if (map->m_len > EXT_UNINIT_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + map->m_len = EXT_UNINIT_MAX_LEN; + + /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ + newex.ee_block = cpu_to_le32(map->m_lblk); + newex.ee_len = cpu_to_le16(map->m_len); + err = ext4_ext_check_overlap(inode, &newex, path); + if (err) + allocated = ext4_ext_get_actual_len(&newex); + else + allocated = map->m_len; + + /* allocate new block */ + ar.inode = inode; + ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); + ar.logical = map->m_lblk; + ar.len = allocated; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; + newblock = ext4_mb_new_blocks(handle, &ar, &err); + if (!newblock) + goto out2; + ext_debug("allocate new block: goal %llu, found %llu/%u\n", + ar.goal, newblock, allocated); + + /* try to insert new extent into found leaf and return */ + ext4_ext_store_pblock(&newex, newblock); + newex.ee_len = cpu_to_le16(ar.len); + /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ + ext4_ext_mark_uninitialized(&newex); + /* + * io_end structure was created for every IO write to an + * uninitialized extent. To avoid unnecessary conversion, + * here we flag the IO that really needs the conversion. + * For non asycn direct IO case, flag the inode state + * that we need to perform conversion when IO is done. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { + io->flag = EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + if (ext4_should_dioread_nolock(inode)) + map->m_flags |= EXT4_MAP_UNINIT; + } + + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); + if (!err) + err = ext4_ext_insert_extent(handle, inode, path, + &newex, flags); + if (err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? + EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; + /* free data blocks we just allocated */ + /* not a good idea to call discard here directly, + * but otherwise we'd need to call it every free() */ + ext4_discard_preallocations(inode); + ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), + ext4_ext_get_actual_len(&newex), fb_flags); + goto out2; + } + + /* previous routine could use block we allocated */ + newblock = ext4_ext_pblock(&newex); + allocated = ext4_ext_get_actual_len(&newex); + if (allocated > map->m_len) + allocated = map->m_len; + map->m_flags |= EXT4_MAP_NEW; + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 1); + + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. 
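The request length is clamped before allocation because an on-disk extent cannot describe arbitrarily long runs, and an uninitialized extent can hold one block fewer than an initialized one since the uninitialized flag lives in the top bit of ee_len. A sketch of the clamp; the numeric values are the usual ext4 limits and should be read as assumptions rather than taken from this patch.

#define INIT_MAX_LEN	32768U			/* assumed EXT_INIT_MAX_LEN, 2^15 blocks */
#define UNINIT_MAX_LEN	(INIT_MAX_LEN - 1)	/* assumed EXT_UNINIT_MAX_LEN */

static unsigned int clamp_map_len(unsigned int len, int want_uninit)
{
	unsigned int max = want_uninit ? UNINIT_MAX_LEN : INIT_MAX_LEN;

	return len > max ? max : len;
}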
+ */ + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { + ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); + ext4_update_inode_fsync_trans(handle, inode, 1); + } else + ext4_update_inode_fsync_trans(handle, inode, 0); +out: + if (allocated > map->m_len) + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + map->m_len = allocated; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : allocated); + + result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? + punched_out : allocated; + + return err ? err : result; +} + +void ext4_ext_truncate(struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct super_block *sb = inode->i_sb; + ext4_lblk_t last_block; + handle_t *handle; + int err = 0; + + /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_completed_IO(inode); + + /* + * probably first extent we're gonna free will be last in block + */ + err = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, err); + if (IS_ERR(handle)) + return; + + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_invalidate_cache(inode); + + ext4_discard_preallocations(inode); + + /* + * TODO: optimization is possible here. + * Probably we need not scan at all, + * because page truncation is enough. + */ + + /* we have to know where to truncate from in crash case */ + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + + last_block = (inode->i_size + sb->s_blocksize - 1) + >> EXT4_BLOCK_SIZE_BITS(sb); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous. + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); +} + +static void ext4_falloc_update_inode(struct inode *inode, + int mode, loff_t new_size, int update_ctime) +{ + struct timespec now; + + if (update_ctime) { + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + } + /* + * Update only when preallocation was requested beyond + * the file size. + */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if (new_size > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } + +} + +/* + * preallocate space for a file. 
This implements ext4's fallocate file + * operation, which gets called from sys_fallocate system call. + * For block-mapped files, posix_fallocate should fall back to the method + * of writing zeroes to the required new blocks (the same behavior which is + * expected for file systems which do not support fallocate() system call). + */ +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + handle_t *handle; + loff_t new_size; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits, blkbits = inode->i_blkbits; + + /* + * currently supporting (pre)allocate mode for extent-based + * files _only_ + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; + + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return ext4_punch_hole(file, offset, len); + + trace_ext4_fallocate_enter(inode, offset, len, mode); + map.m_lblk = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - map.m_lblk; + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) { + mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; + } +retry: + while (ret >= 0 && ret < max_blocks) { + map.m_lblk = map.m_lblk + ret; + map.m_len = max_blocks = max_blocks - ret; + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + EXT4_GET_BLOCKS_NO_NORMALIZE); + if (ret <= 0) { +#ifdef EXT4FS_DEBUG + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_map_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%u", __func__, + inode->i_ino, map.m_lblk, max_blocks); +#endif + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, + blkbits) >> blkbits)) + new_size = offset + len; + else + new_size = (map.m_lblk + ret) << blkbits; + + ext4_falloc_update_inode(inode, mode, new_size, + (map.m_flags & EXT4_MAP_NEW)); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, + ret > 0 ? ret2 : ret); + return ret > 0 ? ret2 : ret; +} + +/* + * This function convert a range of blocks to written extents + * The caller of this function will pass the start offset and the size. + * all unwritten extents within this range will be converted to + * written extents. + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. 
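From userspace, the two modes handled by ext4_fallocate() are reached through the fallocate(2) system call. The snippet below preallocates unwritten blocks past EOF and then punches a hole; it is an illustration of the interface only, error handling is trimmed, and the path and sizes are arbitrary.

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

int fallocate_demo(const char *path)
{
	int fd = open(path, O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return -1;

	/* reserve 1 MiB of unwritten blocks without changing i_size */
	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);

	/* punch a 128 KiB hole starting at 256 KiB */
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		  256 << 10, 128 << 10);

	close(fd);
	return 0;
}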
+ */ +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + ssize_t len) +{ + handle_t *handle; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + struct ext4_map_blocks map; + unsigned int credits, blkbits = inode->i_blkbits; + + map.m_lblk = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - + map.m_lblk); + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_IO_CONVERT_EXT); + if (ret <= 0) { + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_map_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%u", __func__, + inode->i_ino, map.m_lblk, map.m_len); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2 ) + break; + } + return ret > 0 ? ret2 : ret; +} + +/* + * Callback function called for each extent to gather FIEMAP information. + */ +static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) +{ + __u64 logical; + __u64 physical; + __u64 length; + __u32 flags = 0; + int ret = 0; + struct fiemap_extent_info *fieinfo = data; + unsigned char blksize_bits; + + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; + + if (newex->ec_start == 0) { + /* + * No extent in extent-tree contains block @newex->ec_start, + * then the block may stay in 1)a hole or 2)delayed-extent. + * + * Holes or delayed-extents are processed as follows. + * 1. lookup dirty pages with specified range in pagecache. + * If no page is got, then there is no delayed-extent and + * return with EXT_CONTINUE. + * 2. find the 1st mapped buffer, + * 3. check if the mapped buffer is both in the request range + * and a delayed buffer. If not, there is no delayed-extent, + * then return. + * 4. a delayed-extent is found, the extent will be collected. + */ + ext4_lblk_t end = 0; + pgoff_t last_offset; + pgoff_t offset; + pgoff_t index; + pgoff_t start_index = 0; + struct page **pages = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *head = NULL; + unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); + + pages = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; + + offset = logical >> PAGE_SHIFT; +repeat: + last_offset = offset; + head = NULL; + ret = find_get_pages_tag(inode->i_mapping, &offset, + PAGECACHE_TAG_DIRTY, nr_pages, pages); + + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* First time, try to find a mapped buffer. */ + if (ret == 0) { +out: + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + /* just a hole. */ + kfree(pages); + return EXT_CONTINUE; + } + index = 0; + +next_page: + /* Try to find the 1st mapped buffer. */ + end = ((__u64)pages[index]->index << PAGE_SHIFT) >> + blksize_bits; + if (!page_has_buffers(pages[index])) + goto out; + head = page_buffers(pages[index]); + if (!head) + goto out; + + index++; + bh = head; + do { + if (end >= newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. 
+ */ + goto out; + + if (buffer_mapped(bh) && + end >= newex->ec_block) { + start_index = index - 1; + /* get the 1st mapped buffer. */ + goto found_mapped_buffer; + } + + bh = bh->b_this_page; + end++; + } while (bh != head); + + /* No mapped buffer in the range found in this page, + * We need to look up next page. + */ + if (index >= ret) { + /* There is no page left, but we need to limit + * newex->ec_len. + */ + newex->ec_len = end - newex->ec_block; + goto out; + } + goto next_page; + } else { + /*Find contiguous delayed buffers. */ + if (ret > 0 && pages[0]->index == last_offset) + head = page_buffers(pages[0]); + bh = head; + index = 1; + start_index = 0; + } + +found_mapped_buffer: + if (bh != NULL && buffer_delay(bh)) { + /* 1st or contiguous delayed buffer found. */ + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* + * 1st delayed buffer found, record + * the start of extent. + */ + flags |= FIEMAP_EXTENT_DELALLOC; + newex->ec_block = end; + logical = (__u64)end << blksize_bits; + } + /* Find contiguous delayed buffers. */ + do { + if (!buffer_delay(bh)) + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + + for (; index < ret; index++) { + if (!page_has_buffers(pages[index])) { + bh = NULL; + break; + } + head = page_buffers(pages[index]); + if (!head) { + bh = NULL; + break; + } + + if (pages[index]->index != + pages[start_index]->index + index + - start_index) { + /* Blocks are not contiguous. */ + bh = NULL; + break; + } + bh = head; + do { + if (!buffer_delay(bh)) + /* Delayed-extent ends. */ + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + } + } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) + /* a hole found. */ + goto out; + +found_delayed_extent: + newex->ec_len = min(end - newex->ec_block, + (ext4_lblk_t)EXT_INIT_MAX_LEN); + if (ret == nr_pages && bh != NULL && + newex->ec_len < EXT_INIT_MAX_LEN && + buffer_delay(bh)) { + /* Have not collected an extent and continue. */ + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + goto repeat; + } + + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + kfree(pages); + } + + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + + if (ex && ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + + if (next == EXT_MAX_BLOCKS) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent(fieinfo, logical, physical, + length, flags); + if (ret < 0) + return ret; + if (ret == 1) + return EXT_BREAK; + return EXT_CONTINUE; +} + +/* fiemap flags we can handle specified here */ +#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +static int ext4_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_LAST; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + + /* in-inode? 
*/ + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; + brelse(iloc.bh); + } else { /* external block */ + physical = EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + } + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + return (error < 0 ? error : 0); +} + +/* + * ext4_ext_punch_hole + * + * Punches a hole of "length" bytes in a file starting + * at byte "offset" + * + * @inode: The inode of the file to punch a hole in + * @offset: The starting byte offset of the hole + * @length: The length of the hole + * + * Returns the number of blocks removed or negative on err + */ +int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_ext_cache cache_ex; + ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; + struct address_space *mapping = inode->i_mapping; + struct ext4_map_blocks map; + handle_t *handle; + loff_t first_block_offset, last_block_offset, block_len; + loff_t first_page, last_page, first_page_offset, last_page_offset; + int ret, credits, blocks_released, err = 0; + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); + last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + first_page_offset == 0 ? 0 : first_page_offset-1, + last_page_offset); + + if (err) + return err; + } + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_inode_pages_range(mapping, first_page_offset, + last_page_offset-1); + } + + /* finish any pending end_io work */ + ext4_flush_completed_IO(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err) + goto out; + + /* + * Now we need to zero out the un block aligned data. 
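Only whole filesystem blocks inside the requested range can actually be freed; the partial blocks at either end are zeroed in place instead, which is what the first_block/last_block rounding above sets up. The rounding on its own, with invented names:

/*
 * first_block rounds the start of the hole up to a block boundary,
 * last_block rounds the end down. If first_block >= last_block, the hole
 * lies entirely inside one block and only page-level zeroing is needed.
 */
static void hole_block_range(unsigned long long offset,
			     unsigned long long length,
			     unsigned int blkbits,
			     unsigned long long *first_block,
			     unsigned long long *last_block)
{
	unsigned long long blocksize = 1ULL << blkbits;

	*first_block = (offset + blocksize - 1) >> blkbits;
	*last_block = (offset + length) >> blkbits;
}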
+ * If the file is smaller than a block, just + * zero out the middle + */ + if (first_block > last_block) + ext4_block_zero_page_range(handle, mapping, offset, length); + else { + /* zero out the head of the hole before the first block */ + block_len = first_block_offset - offset; + if (block_len > 0) + ext4_block_zero_page_range(handle, mapping, + offset, block_len); + + /* zero out the tail of the hole after the last block */ + block_len = offset + length - last_block_offset; + if (block_len > 0) { + ext4_block_zero_page_range(handle, mapping, + last_block_offset, block_len); + } + } + + /* If there are no blocks to remove, return now */ + if (first_block >= last_block) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + + /* + * Loop over all the blocks and identify blocks + * that need to be punched out + */ + iblock = first_block; + blocks_released = 0; + while (iblock < last_block) { + max_blocks = last_block - iblock; + num_blocks = 1; + memset(&map, 0, sizeof(map)); + map.m_lblk = iblock; + map.m_len = max_blocks; + ret = ext4_ext_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + + if (ret > 0) { + blocks_released += ret; + num_blocks = ret; + } else if (ret == 0) { + /* + * If map blocks could not find the block, + * then it is in a hole. If the hole was + * not already cached, then map blocks should + * put it in the cache. So we can get the hole + * out of the cache + */ + memset(&cache_ex, 0, sizeof(cache_ex)); + if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && + !cache_ex.ec_start) { + + /* The hole is cached */ + num_blocks = cache_ex.ec_block + + cache_ex.ec_len - iblock; + + } else { + /* The block could not be identified */ + err = -EIO; + break; + } + } else { + /* Map blocks error */ + err = ret; + break; + } + + if (num_blocks == 0) { + /* This condition should never happen */ + ext_debug("Block lookup failed"); + err = -EIO; + break; + } + + iblock += num_blocks; + } + + if (blocks_released > 0) { + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + } + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out: + ext4_orphan_del(handle, inode); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + return err; +} +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk; + int error = 0; + + /* fallback to generic here if not in extents fmt */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + ext4_lblk_t len_blks; + __u64 last_blk; + + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + + /* + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. 
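The extent data gathered by ext4_fiemap() is what userspace sees through the FS_IOC_FIEMAP ioctl. A minimal caller that asks for the first extent of a file is shown below; real callers size the extent array properly and loop until FIEMAP_EXTENT_LAST, and the error handling here is deliberately thin.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int print_first_extent(const char *path)
{
	char buf[sizeof(struct fiemap) + sizeof(struct fiemap_extent)];
	struct fiemap *fm = (struct fiemap *)buf;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	memset(buf, 0, sizeof(buf));
	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 1;	/* room for a single extent */

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0 && fm->fm_mapped_extents)
		printf("logical %llu physical %llu len %llu flags %#x\n",
		       (unsigned long long)fm->fm_extents[0].fe_logical,
		       (unsigned long long)fm->fm_extents[0].fe_physical,
		       (unsigned long long)fm->fm_extents[0].fe_length,
		       fm->fm_extents[0].fe_flags);
	close(fd);
	return 0;
}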
+ */ + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); + } + + return error; +} diff --git a/fs/ext4/file.c b/fs/ext4/file.c new file mode 100644 index 00000000..2c097232 --- /dev/null +++ b/fs/ext4/file.c @@ -0,0 +1,286 @@ +/* + * linux/fs/ext4/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext4_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext4_release_file(struct inode *inode, struct file *filp) +{ + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { + ext4_alloc_da_blocks(inode); + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) + { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + } + if (is_dx(inode) && filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +static void ext4_aiodio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ext4_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); +} + +/* + * This tests whether the IO in question is block-aligned or not. + * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete. Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block. If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. + */ +static int +ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct super_block *sb = inode->i_sb; + int blockmask = sb->s_blocksize - 1; + size_t count = iov_length(iov, nr_segs); + loff_t final_size = pos + count; + + if (pos >= inode->i_size) + return 0; + + if ((pos & blockmask) || (final_size & blockmask)) + return 1; + + return 0; +} + +static ssize_t +ext4_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int unaligned_aio = 0; + int ret; + + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. 
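The alignment test in ext4_unaligned_aio() reduces to masking both ends of the byte range with the block size, with writes at or past EOF exempted. Stated as a standalone predicate (names invented, block size assumed to be a power of two):

static int write_is_unaligned(unsigned long long pos, unsigned long long count,
			      unsigned long long i_size, unsigned int blocksize)
{
	unsigned long long mask = blocksize - 1;

	if (pos >= i_size)	/* purely extending writes need no serialization */
		return 0;
	return (pos & mask) || ((pos + count) & mask);
}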
+ */ + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + size_t length = iov_length(iov, nr_segs); + + if ((pos > sbi->s_bitmap_maxbytes || + (pos == sbi->s_bitmap_maxbytes && length > 0))) + return -EFBIG; + + if (pos + length > sbi->s_bitmap_maxbytes) { + nr_segs = iov_shorten((struct iovec *)iov, nr_segs, + sbi->s_bitmap_maxbytes - pos); + } + } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) && + !is_sync_kiocb(iocb))) { + unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); + } + + /* Unaligned direct AIO must be serialized; see comment above */ + if (unaligned_aio) { + static unsigned long unaligned_warn_time; + + /* Warn about this once per day */ + if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) + ext4_msg(inode->i_sb, KERN_WARNING, + "Unaligned AIO/DIO on inode %ld by %s; " + "performance will be poor.", + inode->i_ino, current->comm); + mutex_lock(ext4_aio_mutex(inode)); + ext4_aiodio_wait(inode); + } + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if (unaligned_aio) + mutex_unlock(ext4_aio_mutex(inode)); + + return ret; +} + +static const struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +static int ext4_file_open(struct inode * inode, struct file * filp) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + struct vfsmount *mnt = filp->f_path.mnt; + struct path path; + char buf[64], *cp; + + if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && + !(sb->s_flags & MS_RDONLY))) { + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + if (!IS_ERR(cp)) { + memcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + ext4_mark_super_dirty(sb); + } + } + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { + struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); + + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + } + return dquot_file_open(inode, filp); +} + +/* + * ext4_llseek() copied from generic_file_llseek() to handle both + * block-mapped and extent-mapped maxbytes values. This should + * otherwise be identical with generic_file_llseek(). 
+ */ +loff_t ext4_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + else + maxbytes = inode->i_sb->s_maxbytes; + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + if (offset == 0) { + mutex_unlock(&inode->i_mutex); + return file->f_pos; + } + offset += file->f_pos; + break; + } + + if (offset < 0 || offset > maxbytes) { + mutex_unlock(&inode->i_mutex); + return -EINVAL; + } + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + mutex_unlock(&inode->i_mutex); + + return offset; +} + +const struct file_operations ext4_file_operations = { + .llseek = ext4_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ext4_file_write, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, +}; + +const struct inode_operations ext4_file_inode_operations = { + .setattr = ext4_setattr, + .getattr = ext4_getattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext4_check_acl, + .fiemap = ext4_fiemap, +}; + diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c new file mode 100644 index 00000000..ce66d2fe --- /dev/null +++ b/fs/ext4/fsync.c @@ -0,0 +1,225 @@ +/* + * linux/fs/ext4/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" + +#include + +static void dump_completed_IO(struct inode * inode) +{ +#ifdef EXT4FS_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + unsigned long flags; + + if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ + ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); +#endif +} + +/* + * This function is called from ext4_sync_file(). + * + * When IO is completed, the work to convert unwritten extents to + * written is queued on workqueue but may not get immediately + * scheduled. When fsync is called, we need to ensure the + * conversion is complete before fsync returns. + * The inode keeps track of a list of pending/completed IO that + * might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents for completed IO + * to written. + * The function return the number of pending IOs on success. + */ +extern int ext4_flush_completed_IO(struct inode *inode) +{ + ext4_io_end_t *io; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret = 0; + int ret2 = 0; + + if (list_empty(&ei->i_completed_io_list)) + return ret; + + dump_completed_IO(inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&ei->i_completed_io_list)){ + io = list_entry(ei->i_completed_io_list.next, + ext4_io_end_t, list); + /* + * Calling ext4_end_io_nolock() to convert completed + * IO to written. + * + * When ext4_sync_file() is called, run_queue() may already + * about to flush the work corresponding to this io structure. + * It will be upset if it founds the io structure related + * to the work-to-be schedule is freed. + * + * Thus we need to keep the io structure still valid here after + * conversion finished. The io structure has a flag to + * avoid double converting from both fsync and background work + * queue work. + */ + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + ret = ext4_end_io_nolock(io); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (ret < 0) + ret2 = ret; + else + list_del_init(&io->list); + } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + return (ret2 < 0) ? ret2 : 0; +} + +/* + * If we're not journaling and this is a just-created file, we have to + * sync our parent directory (if it was freshly created) since + * otherwise it will only be written by writeback, leaving a huge + * window during which a crash may lose the file. This may apply for + * the parent directory's parent as well, and so on recursively, if + * they are also freshly created. 
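The application-level analogue of this rule is the familiar "fsync the file, then fsync its directory" pattern. The sketch below (standalone userspace C with made-up paths, not part of this patch) shows what ext4_sync_parent() spares no-journal users from having to do by hand.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * After creating and fsync()ing a new file, also fsync() the containing
 * directory so the directory entry itself is durable across a crash.
 * Paths are purely illustrative.
 */
static int create_durably(const char *dir, const char *name, const char *data)
{
    char path[4096];
    snprintf(path, sizeof(path), "%s/%s", dir, name);

    int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0)
        return -1;
    if (write(fd, data, strlen(data)) < 0 || fsync(fd) < 0) {
        close(fd);
        return -1;
    }
    close(fd);

    /* Now make the new directory entry itself durable. */
    int dfd = open(dir, O_RDONLY | O_DIRECTORY);
    if (dfd < 0)
        return -1;
    int ret = fsync(dfd);
    close(dfd);
    return ret;
}

int main(void)
{
    return create_durably("/tmp", "example.txt", "hello\n") == 0 ? 0 : 1;
}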
+ */ +static int ext4_sync_parent(struct inode *inode) +{ + struct writeback_control wbc; + struct dentry *dentry = NULL; + int ret = 0; + + while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); + dentry = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) + break; + inode = dentry->d_parent->d_inode; + ret = sync_mapping_buffers(inode->i_mapping); + if (ret) + break; + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = WB_SYNC_ALL; + wbc.nr_to_write = 0; /* only write out the inode */ + ret = sync_inode(inode, &wbc); + if (ret) + break; + } + return ret; +} + +/* + * akpm: A new design for ext4_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + * + * i_mutex lock is held when entering and exiting this function + */ + +int ext4_sync_file(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + int ret; + tid_t commit_tid; + bool needs_barrier = false; + + J_ASSERT(ext4_journal_current_handle() == NULL); + + trace_ext4_sync_file_enter(file, datasync); + + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + + ret = ext4_flush_completed_IO(inode); + if (ret < 0) + goto out; + + if (!journal) { + ret = generic_file_fsync(file, datasync); + if (!ret && !list_empty(&inode->i_dentry)) + ret = ext4_sync_parent(inode); + goto out; + } + + /* + * data=writeback,ordered: + * The caller's filemap_fdatawrite()/wait will sync the data. + * Metadata is in the journal, we wait for proper transaction to + * commit here. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext4_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + goto out; + } + + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + jbd2_log_start_commit(journal, commit_tid); + ret = jbd2_log_wait_commit(journal, commit_tid); + if (needs_barrier) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + out: + trace_ext4_sync_file_exit(inode, ret); + return ret; +} diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c new file mode 100644 index 00000000..ac8f168c --- /dev/null +++ b/fs/ext4/hash.c @@ -0,0 +1,208 @@ +/* + * linux/fs/ext4/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. 
+ */ + +#include +#include +#include +#include "ext4.h" + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash_unsigned(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. 
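One detail worth illustrating is why both signed and unsigned variants of the legacy hash are kept below: the original implementation used plain char, whose signedness is architecture dependent, so names containing bytes >= 0x80 hashed differently on different machines. The standalone sketch below (userspace C, not part of this patch) restates dx_hack_hash with an explicit signedness switch to show the divergence.

#include <stdio.h>
#include <stdint.h>

/*
 * For pure-ASCII names the two variants agree; a byte >= 0x80 makes
 * them diverge, which is why DX_HASH_LEGACY and DX_HASH_LEGACY_UNSIGNED
 * both have to be supported for existing on-disk indexes.
 */
static uint32_t dx_hack_hash(const char *name, int len, int treat_unsigned)
{
    uint32_t hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;

    while (len--) {
        int c = treat_unsigned ? (unsigned char)*name : (signed char)*name;
        name++;
        hash = hash1 + (hash0 ^ (c * 7152373));
        if (hash & 0x80000000)
            hash -= 0x7fffffff;
        hash1 = hash0;
        hash0 = hash;
    }
    return hash0 << 1;
}

int main(void)
{
    const char ascii[] = "linux";
    const char high[]  = "caf\xe9";   /* contains the byte 0xe9 */

    printf("ascii: signed=%08x unsigned=%08x\n",
           (unsigned)dx_hack_hash(ascii, 5, 0), (unsigned)dx_hack_hash(ascii, 5, 1));
    printf("high : signed=%08x unsigned=%08x\n",
           (unsigned)dx_hack_hash(high, 4, 0), (unsigned)dx_hack_hash(high, 4, 1));
    return 0;
}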
+ */ +int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i = 0; i < 4; i++) { + if (hinfo->seed[i]) + break; + } + if (i < 4) + memcpy(buf, hinfo->seed, sizeof(buf)); + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_TEA: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT4_HTREE_EOF << 1)) + hash = (EXT4_HTREE_EOF-1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c new file mode 100644 index 00000000..412469b2 --- /dev/null +++ b/fs/ext4/ialloc.c @@ -0,0 +1,1338 @@ +/* + * linux/fs/ext4/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +#include + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. 
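A minimal userspace restatement of that two-phase padding, with non-atomic bit setting and an arbitrary 64-bit example bitmap, may make the split clearer; it is illustrative only and not part of this patch.

#include <stdio.h>
#include <string.h>

/*
 * Set individual bits up to the next byte boundary, then memset whole
 * 0xff bytes for the remainder. In the kernel the per-bit phase uses
 * ext4_set_bit() so it stays safe against concurrent bit operations in
 * the same byte.
 */
static void mark_bitmap_end(int start_bit, int end_bit, unsigned char *bitmap)
{
    int i;

    if (start_bit >= end_bit)
        return;
    for (i = start_bit; i < ((start_bit + 7) & ~7); i++)
        bitmap[i >> 3] |= 1u << (i & 7);
    if (i < end_bit)
        memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}

int main(void)
{
    unsigned char bitmap[8] = { 0 };

    /* Mark bits 20..63 as in use: bits 20-23 one by one, bytes 3..7 by memset. */
    mark_bitmap_end(20, 64, bitmap);
    for (int i = 0; i < 8; i++)
        printf("%02x ", bitmap[i]);
    printf("\n");   /* expected: 00 00 f0 ff ff ff ff ff */
    return 0;
}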
+ */ +void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext4_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* Initializes an uninitialized inode bitmap */ +static unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks and inodes use to prevent + * allocation, essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); + memset(bh->b_data, 0xff, sb->s_blocksize); + return 0; + } + + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); + + return EXT4_INODES_PER_GROUP(sb); +} + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + + bitmap_blk = ext4_inode_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + if (bitmap_uptodate(bh)) + return bh; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } + + ext4_lock_group(sb, block_group); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + trace_ext4_load_inode_bitmap(sb, block_group); + set_bitmap_uptodate(bh); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + return bh; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. 
Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext4_free_inode(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + ext4_group_t block_group; + unsigned long bit; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; + struct ext4_sb_info *sbi; + int fatal = 0, err, count, cleared; + + if (atomic_read(&inode->i_count) > 1) { + printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk(KERN_ERR "ext4_free_inode: inode on " + "nonexistent device\n"); + return; + } + sbi = EXT4_SB(sb); + + ino = inode->i_ino; + ext4_debug("freeing inode %lu\n", ino); + trace_ext4_free_inode(inode); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + dquot_initialize(inode); + ext4_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + ext4_clear_inode(inode); + + es = EXT4_SB(sb)->s_es; + if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext4_error(sb, "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + fatal = -ESRCH; + gdp = ext4_get_group_desc(sb, block_group, &bh2); + if (gdp) { + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bh2); + } + ext4_lock_group(sb, block_group); + cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + if (fatal || !cleared) { + ext4_unlock_group(sb, block_group); + goto out; + } + + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + percpu_counter_dec(&sbi->s_dirs_counter); + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_unlock_group(sb, block_group); + + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, block_group); + + atomic_inc(&sbi->s_flex_groups[f].free_inodes); + if (is_directory) + atomic_dec(&sbi->s_flex_groups[f].used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +out: + if (cleared) { + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!fatal) + fatal = err; + ext4_mark_super_dirty(sb); + } else + ext4_error(sb, "bit already cleared for inode %lu", ino); + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, fatal); +} + +/* + * There are two policies for allocating an inode. 
If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory\'s block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + unsigned int freei, avefreei; + struct ext4_group_desc *desc, *best_desc = NULL; + ext4_group_t group; + int ret = -1; + + freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); + avefreei = freei / ngroups; + + for (group = 0; group < ngroups; group++) { + desc = ext4_get_group_desc(sb, group, NULL); + if (!desc || !ext4_free_inodes_count(sb, desc)) + continue; + if (ext4_free_inodes_count(sb, desc) < avefreei) + continue; + if (!best_desc || + (ext4_free_blks_count(sb, desc) > + ext4_free_blks_count(sb, best_desc))) { + *best_group = group; + best_desc = desc; + ret = 0; + } + } + return ret; +} + +#define free_block_ratio 10 + +static int find_group_flex(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *desc; + struct flex_groups *flex_group = sbi->s_flex_groups; + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); + ext4_group_t ngroups = ext4_get_groups_count(sb); + int flex_size = ext4_flex_bg_size(sbi); + ext4_group_t best_flex = parent_fbg_group; + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; + int flexbg_free_blocks; + int flex_freeb_ratio; + ext4_group_t n_fbg_groups; + ext4_group_t i; + + n_fbg_groups = (ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + +find_close_to_parent: + flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + if (atomic_read(&flex_group[best_flex].free_inodes) && + flex_freeb_ratio > free_block_ratio) + goto found_flexbg; + + if (best_flex && best_flex == parent_fbg_group) { + best_flex--; + goto find_close_to_parent; + } + + for (i = 0; i < n_fbg_groups; i++) { + if (i == parent_fbg_group || i == parent_fbg_group - 1) + continue; + + flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + + if (flex_freeb_ratio > free_block_ratio && + (atomic_read(&flex_group[i].free_inodes))) { + best_flex = i; + goto found_flexbg; + } + + if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || + ((atomic_read(&flex_group[i].free_blocks) > + atomic_read(&flex_group[best_flex].free_blocks)) && + atomic_read(&flex_group[i].free_inodes))) + best_flex = i; + } + + if (!atomic_read(&flex_group[best_flex].free_inodes) || + !atomic_read(&flex_group[best_flex].free_blocks)) + return -1; + +found_flexbg: + for (i = best_flex * flex_size; i < ngroups && + i < (best_flex + 1) * flex_size; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (ext4_free_inodes_count(sb, desc)) { + *best_group = i; + goto out; + } + } + + return -1; +out: + return 0; +} + +struct orlov_stats { + __u32 free_inodes; + __u32 free_blocks; + __u32 used_dirs; +}; + +/* + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg. 
If flex_size is 1, then g + * is a block group number; otherwise it is flex_bg number. + */ +static void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) +{ + struct ext4_group_desc *desc; + struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; + + if (flex_size > 1) { + stats->free_inodes = atomic_read(&flex_group[g].free_inodes); + stats->free_blocks = atomic_read(&flex_group[g].free_blocks); + stats->used_dirs = atomic_read(&flex_group[g].used_dirs); + return; + } + + desc = ext4_get_group_desc(sb, g, NULL); + if (desc) { + stats->free_inodes = ext4_free_inodes_count(sb, desc); + stats->free_blocks = ext4_free_blks_count(sb, desc); + stats->used_dirs = ext4_used_dirs_count(sb, desc); + } else { + stats->free_inodes = 0; + stats->free_blocks = 0; + stats->used_dirs = 0; + } +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + */ + +static int find_group_orlov(struct super_block *sb, struct inode *parent, + ext4_group_t *group, int mode, + const struct qstr *qstr) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t real_ngroups = ext4_get_groups_count(sb); + int inodes_per_group = EXT4_INODES_PER_GROUP(sb); + unsigned int freei, avefreei; + ext4_fsblk_t freeb, avefreeb; + unsigned int ndirs; + int max_dirs, min_inodes; + ext4_grpblk_t min_blocks; + ext4_group_t i, grp, g, ngroups; + struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; + + ngroups = real_ngroups; + if (flex_size > 1) { + ngroups = (real_ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb; + do_div(avefreeb, ngroups); + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if (S_ISDIR(mode) && + ((parent == sb->s_root->d_inode) || + (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { + int best_ndir = inodes_per_group; + int ret = -1; + + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(qstr->name, qstr->len, &hinfo); + grp = hinfo.hash; + } else + get_random_bytes(&grp, sizeof(grp)); + parent_group = (unsigned)grp % ngroups; + for (i = 0; i < ngroups; i++) { + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) + continue; + if (stats.used_dirs >= best_ndir) + continue; + if (stats.free_inodes < avefreei) + continue; + if (stats.free_blocks < avefreeb) + continue; + grp = g; + ret = 0; + best_ndir = stats.used_dirs; + } + if (ret) + goto 
fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + /* + * We pack inodes at the beginning of the flexgroup's + * inode tables. Block allocation decisions will do + * something similar, although regular files will + * start at 2nd block group of the flexgroup. See + * ext4_ext_find_goal() and ext4_find_near(). + */ + grp *= flex_size; + for (i = 0; i < flex_size; i++) { + if (grp+i >= real_ngroups) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } + } + goto fallback; + } + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + + /* + * Start looking in the flex group where we last allocated an + * inode for this parent directory + */ + if (EXT4_I(parent)->i_last_alloc_group != ~0) { + parent_group = EXT4_I(parent)->i_last_alloc_group; + if (flex_size > 1) + parent_group >>= sbi->s_log_groups_per_flex; + } + + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) + continue; + if (stats.free_inodes < min_inodes) + continue; + if (stats.free_blocks < min_blocks) + continue; + goto found_flex_bg; + } + +fallback: + ngroups = real_ngroups; + avefreei = freei / ngroups; +fallback_retry: + parent_group = EXT4_I(parent)->i_block_group; + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_inodes_count(sb, desc) >= avefreei) { + *group = grp; + return 0; + } + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback_retry; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent, + ext4_group_t *group, int mode) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * Try to place the inode is the same flex group as its + * parent. If we can't find space, use the Orlov algorithm to + * find another flex group, and store that information in the + * parent directory's inode information so that use that flex + * group for future allocations. + */ + if (flex_size > 1) { + int retry = 0; + + try_again: + parent_group &= ~(flex_size-1); + last = parent_group + flex_size; + if (last > ngroups) + last = ngroups; + for (i = parent_group; i < last; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search algorithm + * to find a new flex group; we pass in the mode to + * avoid the topdir algorithms. 
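Two of the search orders used by find_group_other() are easier to see with concrete numbers. The sketch below (standalone C, example values only, and omitting the i_ino mixing the real code applies before its quadratic probe) prints the flex-group window that is scanned first and the power-of-two probe sequence used as a later fallback.

#include <stdio.h>

/*
 * Example values: flex_size = 16 (s_log_groups_per_flex = 4),
 * 128 block groups, parent directory in group 37.
 */
int main(void)
{
    unsigned int flex_size = 16, ngroups = 128, parent_group = 37;

    /* Mask off the low bits to find the start of the parent's flex group. */
    unsigned int first = parent_group & ~(flex_size - 1);
    printf("flex window: groups %u..%u\n", first, first + flex_size - 1);

    /* Quadratic probing: strides 1, 2, 4, ... from the starting group. */
    unsigned int group = parent_group;
    printf("quadratic probes:");
    for (unsigned int i = 1; i < ngroups; i <<= 1) {
        group = (group + i) % ngroups;
        printf(" %u", group);
    }
    printf("\n");   /* 38 40 44 52 68 100 36 (wraps mod 128) */
    return 0;
}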
+ */ + *group = parent_group + flex_size; + if (*group > ngroups) + *group = 0; + return find_group_orlov(sb, parent, group, mode, NULL); + } + + /* + * Try to place the inode in its parent directory + */ + *group = parent_group; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) + return 0; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + *group = (*group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + *group += i; + if (*group >= ngroups) + *group -= ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) + return 0; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + *group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++*group >= ngroups) + *group = 0; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) + return 0; + } + + return -1; +} + +/* + * claim the inode from the inode bitmap. If the group + * is uninit we need to take the groups's ext4_group_lock + * and clear the uninit flag. The inode bitmap update + * and group desc uninit flag clear should be done + * after holding ext4_group_lock so that ext4_read_inode_bitmap + * doesn't race with the ext4_claim_inode + */ +static int ext4_claim_inode(struct super_block *sb, + struct buffer_head *inode_bitmap_bh, + unsigned long ino, ext4_group_t group, int mode) +{ + int free = 0, retval = 0, count; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + /* + * We have to be sure that new inode allocation does not race with + * inode table initialization, because otherwise we may end up + * allocating and writing new inode right before sb_issue_zeroout + * takes place and overwriting our new inode with zeroes. So we + * take alloc_sem to prevent it. + */ + down_read(&grp->alloc_sem); + ext4_lock_group(sb, group); + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + /* not a free inode */ + retval = 1; + goto err_ret; + } + ino++; + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || + ino > EXT4_INODES_PER_GROUP(sb)) { + ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); + ext4_error(sb, "reserved inode or inode > inodes count - " + "block_group = %u, inode=%lu", group, + ino + group * EXT4_INODES_PER_GROUP(sb)); + return 1; + } + /* If we didn't allocate from within the initialized part of the inode + * table then we need to initialize up to this inode. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + /* When marking the block group with + * ~EXT4_BG_INODE_UNINIT we don't want to depend + * on the value of bg_itable_unused even though + * mke2fs could have initialized the same for us. 
+ * Instead we calculated the value below + */ + + free = 0; + } else { + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + * + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + } + count = ext4_free_inodes_count(sb, gdp) - 1; + ext4_free_inodes_set(sb, gdp, count); + if (S_ISDIR(mode)) { + count = ext4_used_dirs_count(sb, gdp) + 1; + ext4_used_dirs_set(sb, gdp, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); +err_ret: + ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); + return retval; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, + const struct qstr *qstr, __u32 goal) +{ + struct super_block *sb; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; + ext4_group_t ngroups, group = 0; + unsigned long ino = 0; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + int ret2, err = 0; + struct inode *ret; + ext4_group_t i; + int free = 0; + static int once = 1; + ext4_group_t flex_group; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); + trace_ext4_request_inode(dir, mode); + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); + sbi = EXT4_SB(sb); + + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { + ret2 = find_group_flex(sb, dir, &group); + if (ret2 == -1) { + ret2 = find_group_other(sb, dir, &group, mode); + if (ret2 == 0 && once) { + once = 0; + printk(KERN_NOTICE "ext4: find_group_flex " + "failed, fallback succeeded dir %lu\n", + dir->i_ino); + } + } + goto got_group; + } + + if (S_ISDIR(mode)) { + if (test_opt(sb, OLDALLOC)) + ret2 = find_group_dir(sb, dir, &group); + else + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + } else + ret2 = find_group_other(sb, dir, &group, mode); + +got_group: + EXT4_I(dir)->i_last_alloc_group = group; + err = -ENOSPC; + if (ret2 == -1) + goto out; + + for (i = 0; i < ngroups; i++, ino = 0) { + err = -EIO; + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto fail; + + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (!inode_bitmap_bh) + goto fail; + +repeat_in_this_group: + ino = ext4_find_next_zero_bit((unsigned long *) + inode_bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb), ino); + + if (ino < 
EXT4_INODES_PER_GROUP(sb)) { + + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + inode_bitmap_bh); + if (err) + goto fail; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + group_desc_bh); + if (err) + goto fail; + if (!ext4_claim_inode(sb, inode_bitmap_bh, + ino, group, mode)) { + /* we won it */ + BUFFER_TRACE(inode_bitmap_bh, + "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, + NULL, + inode_bitmap_bh); + if (err) + goto fail; + /* zero bit is inode number 1*/ + ino++; + goto got; + } + /* we lost it */ + ext4_handle_release_buffer(handle, inode_bitmap_bh); + ext4_handle_release_buffer(handle, group_desc_bh); + + if (++ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + } + + /* + * This case is possible in concurrent environment. It is very + * rare. We cannot repeat the find_group_xxx() call because + * that will simply return the same blockgroup, because the + * group descriptor metadata has not yet been updated. + * So we just go onto the next blockgroup. + */ + if (++group == ngroups) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + /* We may have to initialize the block bitmap if it isn't already */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bitmap_bh; + + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bitmap_bh); + if (err) { + brelse(block_bitmap_bh); + goto fail; + } + + free = 0; + ext4_lock_group(sb, group); + /* recheck and clear flag under lock if we still need to */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + free = ext4_free_blocks_after_init(sb, group, gdp); + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_blks_set(sb, gdp, free); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, + gdp); + } + ext4_unlock_group(sb, group); + + /* Don't need to dirty bitmap block if we didn't change it */ + if (free) { + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, + NULL, block_bitmap_bh); + } + + brelse(block_bitmap_bh); + if (err) + goto fail; + } + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); + if (err) + goto fail; + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + ext4_mark_super_dirty(sb); + + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); + } + + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = + ext4_current_time(inode); + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + /* + * Don't inherit extent flag from directory, amongst others. 
We set + * extent flag on newly created directory and file only if -o extent + * mount option is specified + */ + ei->i_flags = + ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_file_acl = 0; + ei->i_dtime = 0; + ei->i_block_group = group; + ei->i_last_alloc_group = ~0; + + ext4_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + ext4_handle_sync(handle); + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; + } + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ext4_set_inode_state(inode, EXT4_STATE_NEW); + + ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + + ret = inode; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + ei->i_datasync_tid = handle->h_transaction->t_tid; + } + + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_std_error(sb, err); + goto fail_free_drop; + } + + ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_ext4_allocate_inode(inode, dir, mode); + goto really_out; +fail: + ext4_std_error(sb, err); +out: + iput(inode); + ret = ERR_PTR(err); +really_out: + brelse(inode_bitmap_bh); + return ret; + +fail_free_drop: + dquot_free_inode(inode); + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + unlock_new_inode(inode); + iput(inode); + brelse(inode_bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + ext4_group_t block_group; + int bit; + struct buffer_head *bitmap_bh; + struct inode *inode = NULL; + long err = -EIO; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); + goto error; + } + + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext4_warning(sb, "inode bitmap error for orphan %lu", ino); + goto error; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext4_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + goto iget_failed; + + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. 
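A small worked example of the inode-number arithmetic used throughout this file may help: ext4_new_inode() above composes the 1-based i_ino from a zero-based bitmap bit and its group, while ext4_free_inode() and ext4_orphan_get() invert it. The snippet below is standalone C with an example group size, not part of this patch.

#include <stdio.h>

int main(void)
{
    const unsigned long inodes_per_group = 8192;   /* example value */
    unsigned long group = 2, bit = 3614;           /* zero-based bit in bitmap */

    /* Compose the on-disk inode number: bit 0 of group 0 is inode 1. */
    unsigned long ino = (bit + 1) + group * inodes_per_group;
    printf("group %lu, bit %lu -> ino %lu\n", group, bit, ino);

    /* ...and back again, as done when freeing or loading an orphan. */
    printf("ino %lu -> group %lu, bit %lu\n", ino,
           (ino - 1) / inodes_per_group, (ino - 1) % inodes_per_group);
    return 0;
}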
+ */ + if (inode->i_nlink && !ext4_can_truncate(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +iget_failed: + err = PTR_ERR(inode); + inode = NULL; +bad_orphan: + ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); + printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); +error: + return ERR_PTR(err); +} + +unsigned long ext4_count_free_inodes(struct super_block *sb) +{ + unsigned long desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext4_count_dirs(struct super_block * sb) +{ + unsigned long count = 0; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + count += ext4_used_dirs_count(sb, gdp); + } + return count; +} + +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_claim_inode until we are finished. 
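The arithmetic that decides how much of the table still needs zeroing is easy to follow with example numbers. The sketch below (standalone C, made-up geometry, mirroring the used_blks calculation in the function that follows) shows a group where most inodes are still unused.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/*
 * Example geometry: 8192 inodes per group, 256-byte inodes, 4KiB blocks.
 * Only the itable blocks past the ones that may already hold live
 * inodes get zeroed.
 */
int main(void)
{
    unsigned int inodes_per_group = 8192;
    unsigned int inode_size = 256, block_size = 4096;
    unsigned int inodes_per_block = block_size / inode_size;         /* 16  */
    unsigned int itb_per_group = inodes_per_group / inodes_per_block; /* 512 */

    unsigned int itable_unused = 8000;   /* as read from the group descriptor */
    unsigned int used_blks =
        DIV_ROUND_UP(inodes_per_group - itable_unused, inodes_per_block);

    printf("skip %u used itable blocks, zero the remaining %u\n",
           used_blks, itb_per_group - used_blks);   /* skip 12, zero 500 */
    return 0;
}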
+ */ +extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + + /* This should not happen, but just to be sure check this */ + if (sb->s_flags & MS_RDONLY) { + ret = 1; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. + */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. + */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); + + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { + ext4_error(sb, "Something is wrong with group %u\n" + "Used itable blocks: %d" + "itable unused count: %u\n", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto out; + } + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, + group_desc_bh); + if (ret) + goto err_out; + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. + */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); + if (ret < 0) + goto err_out; + if (barrier) + blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c new file mode 100644 index 00000000..c1e6a726 --- /dev/null +++ b/fs/ext4/inode.c @@ -0,0 +1,5957 @@ +/* + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "ext4_extents.h" + +#include + +#define MPAGE_DA_EXTENT_TAIL 0x01 + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + trace_ext4_begin_ordered_truncate(inode, new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there's no + * outstanding writes we need to flush. + */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset); +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); +static int __ext4_journalled_writepage(struct page *page, unsigned int len); +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); + +/* + * Test whether an inode is a fast symlink. + */ +static int ext4_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT4_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static unsigned long blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. 
+ * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, + int nblocks) +{ + int ret; + + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + jbd_debug(2, "restarting handle %p\n", handle); + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_journal_restart(handle, nblocks); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + return ret; +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext4_evict_inode(struct inode *inode) +{ + handle_t *handle; + int err; + + trace_ext4_evict_inode(inode); + + ext4_ioend_wait(inode); + + if (inode->i_nlink) { + truncate_inode_pages(&inode->i_data, 0); + goto no_delete; + } + + if (!is_bad_inode(inode)) + dquot_initialize(inode); + + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); + truncate_inode_pages(&inode->i_data, 0); + + if (is_bad_inode(inode)) + goto no_delete; + + handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext4_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } + if (inode->i_blocks) + ext4_truncate(inode); + + /* + * ext4_ext_truncate() doesn't reserve any slop when it + * restarts journal transactions; therefore there may not be + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. + */ + if (!ext4_handle_has_enough_credits(handle, 3)) { + err = ext4_journal_extend(handle, 3); + if (err > 0) + err = ext4_journal_restart(handle, 3); + if (err != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", err); + stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + goto no_delete; + } + } + + /* + * Kill off the orphan record which ext4_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext4_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext4_truncate() actually created an orphan record. 
+ * (Well, we could do this if we need to, but heck - it works) + */ + ext4_orphan_del(handle, inode); + EXT4_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext4_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + ext4_clear_inode(inode); + else + ext4_free_inode(handle, inode); + ext4_journal_stop(handle); + return; +no_delete: + ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
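+ *
+ * (Worked example, assuming 4 KiB blocks: EXT4_ADDR_PER_BLOCK = 1024 and
+ * EXT4_NDIR_BLOCKS = 12, so blocks 0-11 are direct, 12-1035 go through the
+ * single indirect block and 1036-1049611 through the double indirect one.
+ * For i_block = 5000: 5000 - 12 = 4988 >= 1024, then 4988 - 1024 = 3964,
+ * which is < 1024^2, so the returned path is
+ * { EXT4_DIND_BLOCK, 3964 >> 10 = 3, 3964 & 1023 = 892 } with depth 3.)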
+ */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +static int __ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, + __le32 *p, unsigned int max) +{ + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + __le32 *bref = p; + unsigned int blk; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EIO; + } + } + return 0; +} + + +#define ext4_check_indirect_blockref(inode, bh) \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_check_inode_blockref(inode) \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). 
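+ *
+ * (Illustrative shape of the result for a depth-3, double-indirect lookup:
+ * chain[0].p points into EXT4_I(inode)->i_data and chain[0].bh is NULL,
+ * chain[1] and chain[2] reference the two indirect blocks through their
+ * buffer_heads, and the data block number is le32_to_cpu(chain[2].key).)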
+ * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) + goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. 
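+ * (Illustrative example of the ext4_find_near() colouring used here,
+ * assuming 32768 blocks per group and delalloc disabled: a caller with
+ * pid 1234 gets colour = (1234 % 16) * (32768 / 16) = 2 * 2048 = 4096
+ * blocks past the start of the chosen group.)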
+ * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of desired blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + struct ext4_allocation_request ar; + int target, i; + unsigned long count = 0, blk_allocated = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. 
That's the + * minimum number of blocks need to allocate(required) + */ + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); + if (*err) + goto failed_out; + + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); + break; + } + } + + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += ar.len; + } +allocated: + /* total number of blocks allocated for direct blocks */ + ret = blk_allocated; + *err = 0; + return ret; +failed_out: + for (i = 0; i < index; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). 
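+ * (Illustrative result for indirect_blks = 2: branch[0].key holds the
+ * block number that ext4_splice_branch() will later store where the old
+ * chain ended, branch[1] and branch[2] carry the two freshly zeroed
+ * indirect blocks via their buffer_heads, and the deepest one is seeded
+ * with the new direct block numbers before being marked dirty.)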
Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if (n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i = 1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); + for (i = 1; i <= n ; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); + } + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, + int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. 
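+ *
+ * (Illustrative note: the splice itself is the single store
+ * "*where->p = where->key"; when num == 0 and blks > 1 the remaining
+ * direct block numbers are filled in right after it, so a 4-block
+ * allocation spliced into the inode writes where->key plus the three
+ * following consecutive entries.)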
+ */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), + blks, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. 
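+ *
+ * (Illustrative example of the return convention: for map->m_lblk = 100
+ * and map->m_len = 8 on a fully mapped file, the lookup walks the chain
+ * once and keeps extending the mapping while the on-disk block numbers
+ * stay contiguous, so it can return 8 with EXT4_MAP_MAPPED set; a hole
+ * looked up with create == 0 returns 0 instead.)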
+ */ +static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. 
--sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, map->m_lblk, + partial, indirect_blks, count); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); + return err; +} + +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) +{ + return &EXT4_I(inode)->i_reserved_quota; +} +#endif + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, + sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a block located at @lblock + */ +static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return ext4_ext_calc_metadata_amount(inode, lblock); + + return ext4_indirect_calc_metadata_amount(inode, lblock); +} + +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. + */ +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " + "with only %d reserved data blocks\n", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; + } + + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + used + ei->i_allocated_meta_blocks); + ei->i_allocated_meta_blocks = 0; + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. 
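+ *
+ * (Illustrative accounting sketch: if 8 data blocks were reserved and all
+ * 8 have now been written, i_reserved_data_blocks drops to 0, the unused
+ * metadata reservation is subtracted back out of s_dirtyblocks_counter,
+ * and the 8 blocks are either claimed against quota (quota_claim != 0) or
+ * have their quota reservation released in the fallocate writeback case.)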
+ */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ei->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + /* Update quota subsystem for data blocks */ + if (quota_claim) + dquot_claim_block(inode, used); + else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not re-claim the quota for fallocated blocks. + */ + dquot_release_reservation_block(inode, used); + } + + /* + * If we have done all the pending block allocations and if + * there aren't any writers on the inode, we can discard the + * inode's preallocations. + */ + if ((ei->i_reserved_data_blocks == 0) && + (atomic_read(&inode->i_writecount) == 0)) + ext4_discard_preallocations(inode); +} + +static int __check_block_validity(struct inode *inode, const char *func, + unsigned int line, + struct ext4_map_blocks *map) +{ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, + map->m_len)) { + ext4_error_inode(inode, func, line, map->m_pblk, + "lblock %lu mapped to illegal pblock " + "(length %d)", (unsigned long) map->m_lblk, + map->m_len); + return -EIO; + } + return 0; +} + +#define check_block_validity(inode, map) \ + __check_block_validity((inode), __func__, __LINE__, (map)) + +/* + * Return the number of contiguous dirty pages in a given inode + * starting at page frame idx. + */ +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, + unsigned int max_pages) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + struct pagevec pvec; + pgoff_t num = 0; + int i, nr_pages, done = 0; + + if (max_pages == 0) + return 0; + pagevec_init(&pvec, 0); + while (!done) { + index = idx; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + (pgoff_t)PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + if (unlikely(page->mapping != mapping) || + !PageDirty(page) || + PageWriteback(page) || + page->index != idx) { + done = 1; + unlock_page(page); + break; + } + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + if (!buffer_delay(bh) && + !buffer_unwritten(bh)) + done = 1; + bh = bh->b_this_page; + } while (!done && (bh != head)); + } + unlock_page(page); + if (done) + break; + idx++; + num++; + if (num >= max_pages) { + done = 1; + break; + } + } + pagevec_release(&pvec); + } + return num; +} + +/* + * The ext4_map_blocks() function tries to look up the requested blocks, + * and returns if the blocks are already mapped. + * + * Otherwise it takes the write lock of the i_data_sem and allocate blocks + * and store the allocated blocks in the result buffer head and mark it + * mapped. + * + * If file type is extents based, it will call ext4_ext_map_blocks(), + * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping + * based files + * + * On success, it returns the number of blocks being mapped or allocate. + * if create==0 and the blocks are pre-allocated and uninitialized block, + * the result buffer head is unmapped. If the create ==1, it will make sure + * the buffer head is mapped. + * + * It returns 0 if plain look up failed (blocks have not been allocated), in + * that casem, buffer head is unmapped + * + * It returns the error in case of allocation failure. 
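+ *
+ * (Typical caller sketch, for illustration only:
+ *
+ *      struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };
+ *      ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+ *
+ * on ret > 0 the first physical block is map.m_pblk and map.m_flags has
+ * EXT4_MAP_MAPPED set, plus EXT4_MAP_NEW if blocks were just allocated.)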
+ */ +int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + int retval; + + map->m_flags = 0; + ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, map->m_len, + (unsigned long) map->m_lblk); + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, 0); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, 0); + } + up_read((&EXT4_I(inode)->i_data_sem)); + + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } + + /* If it is only a block(s) look up */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) + return retval; + + /* + * Returns if the blocks have already allocated + * + * Note that if blocks have been preallocated + * ext4_ext_get_block() returns th create = 0 + * with buffer head unmapped. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + return retval; + + /* + * When we call get_blocks without the create flag, the + * BH_Unwritten flag could have gotten set if the blocks + * requested were part of a uninitialized extent. We need to + * clear this flag now that we are committed to convert all or + * part of the uninitialized extent to be an initialized + * extent. This is because we need to avoid the combination + * of BH_Unwritten and BH_Mapped flags being simultaneously + * set on the buffer_head. + */ + map->m_flags &= ~EXT4_MAP_UNWRITTEN; + + /* + * New blocks allocate and/or writing to uninitialized extent + * will possibly result in updating i_data, so we take + * the write lock of i_data_sem, and call get_blocks() + * with create == 1 flag. + */ + down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + /* + * We need to check for EXT4 here because migrate + * could have changed the inode type in between + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { + /* + * We allocated new blocks which will result in + * i_data's format changing. Force the migrate + * to fail by clearing migrate flags + */ + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. We don't + * support fallocate for non extent files. So we can update + * reserve space here. + */ + if ((retval > 0) && + (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); + } + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + + up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } + return retval; +} + +/* Maximum number of blocks we map for direct IO at once. 
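+ * With 4 KiB blocks this caps a single mapping call at 4096 * 4 KiB =
+ * 16 MiB, which in turn bounds the journal credits requested per chunk
+ * through ext4_chunk_trans_blocks().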
*/ +#define DIO_MAX_BLOCKS 4096 + +static int _ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int flags) +{ + handle_t *handle = ext4_journal_current_handle(); + struct ext4_map_blocks map; + int ret = 0, started = 0; + int dio_credits; + + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + + if (flags && !handle) { + /* Direct IO write... */ + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + started = 1; + } + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret > 0) { + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + bh->b_size = inode->i_sb->s_blocksize * map.m_len; + ret = 0; + } + if (started) + ext4_journal_stop(handle); + return ret; +} + +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create, int *errp) +{ + struct ext4_map_blocks map; + struct buffer_head *bh; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + map.m_lblk = block; + map.m_len = 1; + err = ext4_map_blocks(handle, inode, &map, + create ? EXT4_GET_BLOCKS_CREATE : 0); + + if (err < 0) + *errp = err; + if (err <= 0) + return NULL; + *errp = 0; + + bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!bh) { + *errp = -EIO; + return NULL; + } + if (map.m_flags & EXT4_MAP_NEW) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. 
+ */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext4_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; +} + +struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create, int *err) +{ + struct buffer_head *bh; + + bh = ext4_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for (bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the commit_write(). So doing the jbd2_journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext4_writepage() -> + * block_write_full_page(). In that case, we *know* that ext4_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext4 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that jbd2_journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + int dirty = buffer_dirty(bh); + int ret; + + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + /* + * __block_write_begin() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_write_begin() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. 
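+ *
+ * (Illustrative use: in data=journal mode ext4_write_begin() below runs
+ * this callback over the page's buffers in [from, to) via
+ * walk_page_buffers(), so every buffer touched by the write has journal
+ * write access before the data is copied in.)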
+ */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext4_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + return ret; +} + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +static int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +static int ext4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret, needed_blocks; + handle_t *handle; + int retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + + trace_ext4_write_begin(inode, pos, len, flags); + /* + * Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason + */ + needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, pos, len, ext4_get_block_write); + else + ret = __block_write_begin(page, pos, len, ext4_get_block); + + if (!ret && ext4_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + page_cache_release(page); + /* + * __block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before + * truncate finishes + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); + + ext4_journal_stop(handle); + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. 
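+ *
+ * (Illustrative flow note: after this cleanup an -ENOSPC result jumps
+ * back to the "retry:" label and starts a fresh handle for as long as
+ * ext4_should_retry_alloc() keeps returning true.)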
+ */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext4_handle_dirty_metadata(handle, NULL, bh); +} + +static int ext4_generic_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i_size_changed = 0; + struct inode *inode = mapping->host; + handle_t *handle = ext4_journal_current_handle(); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos + copied > inode->i_size) { + i_size_write(inode, pos + copied); + i_size_changed = 1; + } + + if (pos + copied > EXT4_I(inode)->i_disksize) { + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_update_i_disksize(inode, (pos + copied)); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + ext4_mark_inode_dirty(handle, inode); + + return copied; +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext4_ordered_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + + trace_ext4_ordered_write_end(inode, pos, len, copied); + ret = ext4_jbd2_file_inode(handle, inode); + + if (ret == 0) { + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + if (ret2 < 0) + ret = ret2; + } + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + + return ret ? 
ret : copied; +} + +static int ext4_writeback_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + + trace_ext4_writeback_write_end(inode, pos, len, copied); + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + + if (ret2 < 0) + ret = ret2; + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? ret : copied; +} + +static int ext4_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int partial = 0; + unsigned from, to; + loff_t new_i_size; + + trace_ext4_journalled_write_end(inode, pos, len, copied); + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + BUG_ON(!ext4_handle_valid(handle)); + + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + new_i_size = pos + copied; + if (new_i_size > inode->i_size) + i_size_write(inode, pos+copied); + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? 
ret : copied; +} + +/* + * Reserve a single block located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long md_needed; + int ret; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + md_needed = ext4_calc_metadata_amount(inode, lblock); + trace_ext4_da_reserve_space(inode, md_needed); + spin_unlock(&ei->i_block_reservation_lock); + + /* + * We will charge metadata quota at writeout time; this saves + * us from metadata over-estimation, though we may go over by + * a small amount in the end. Here we just reserve for data. + */ + ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { + dquot_release_reservation_block(inode, 1); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + yield(); + goto repeat; + } + return -ENOSPC; + } + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + +static void ext4_da_release_space(struct inode *inode, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!to_free) + return; /* Nothing to release, exit */ + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + trace_ext4_da_release_space(inode, to_free); + if (unlikely(to_free > ei->i_reserved_data_blocks)) { + /* + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. Since this + * function is called from invalidate page, it's + * harmless to return without any action. + */ + ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks\n", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; + } + ei->i_reserved_data_blocks -= to_free; + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. 
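+ *
+ * (Illustrative caller: ext4_da_page_release_reservation() below counts
+ * the still-delayed buffers of a page being invalidated and passes that
+ * count here, so truncating one fully delayed 4 KiB page on a 4 KiB-block
+ * filesystem releases exactly one data block reservation.)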
+ */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ei->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } + + /* update fs dirty data blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + dquot_release_reservation_block(inode, to_free); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, to_release); +} + +/* + * Delayed allocation stuff + */ + +/* + * mpage_da_submit_io - walks through extent of pages and try to write + * them with writepage() call back + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd, + struct ext4_map_blocks *map) +{ + struct pagevec pvec; + unsigned long index, end; + int ret = 0, err, nr_pages, i; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + loff_t size = i_size_read(inode); + unsigned int len, block_start; + struct buffer_head *bh, *page_bufs = NULL; + int journal_data = ext4_should_journal_data(inode); + sector_t pblock = 0, cur_logical = 0; + struct ext4_io_submit io_submit; + + BUG_ON(mpd->next_page <= mpd->first_page); + memset(&io_submit, 0, sizeof(io_submit)); + /* + * We need to start from the first_page to the next_page - 1 + * to make sure we also write the mapped dirty buffer_heads. + * If we look at mpd->b_blocknr we would only be looking + * at the currently mapped buffer_heads. + */ + index = mpd->first_page; + end = mpd->next_page - 1; + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + int commit_write = 0, skip_page = 0; + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + + if (index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + if (map) { + cur_logical = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + pblock = map->m_pblk + (cur_logical - + map->m_lblk); + } + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + /* + * If the page does not have buffers (for + * whatever reason), try to create them using + * __block_write_begin. If this fails, + * skip the page and move on. 
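+ *
+ * (Illustrative mapping math: with map->m_lblk = 200, map->m_pblk = 9000
+ * and a page whose first block is cur_logical = 204, the buffers get
+ * pblock = 9000 + (204 - 200) = 9004 onwards as their delayed bits are
+ * cleared.)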
+ */ + if (!page_has_buffers(page)) { + if (__block_write_begin(page, 0, len, + noalloc_get_block_write)) { + skip_page: + unlock_page(page); + continue; + } + commit_write = 1; + } + + bh = page_bufs = page_buffers(page); + block_start = 0; + do { + if (!bh) + goto skip_page; + if (map && (cur_logical >= map->m_lblk) && + (cur_logical <= (map->m_lblk + + (map->m_len - 1)))) { + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock; + } + if (buffer_unwritten(bh) || + buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + if (map->m_flags & EXT4_MAP_UNINIT) + set_buffer_uninit(bh); + clear_buffer_unwritten(bh); + } + + /* + * skip page if block allocation undone and + * block is dirty + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) + skip_page = 1; + bh = bh->b_this_page; + block_start += bh->b_size; + cur_logical++; + pblock++; + } while (bh != page_bufs); + + if (skip_page) + goto skip_page; + + if (commit_write) + /* mark the buffer_heads as dirty & uptodate */ + block_commit_write(page, 0, len); + + clear_page_dirty_for_io(page); + /* + * Delalloc doesn't support data journalling, + * but eventually maybe we'll lift this + * restriction. + */ + if (unlikely(journal_data && PageChecked(page))) + err = __ext4_journalled_writepage(page, len); + else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) + err = ext4_bio_write_page(&io_submit, page, + len, mpd->wbc); + else if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + err = block_write_full_page_endio(page, + noalloc_get_block_write, + mpd->wbc, ext4_end_io_buffer_write); + } else + err = block_write_full_page(page, + noalloc_get_block_write, mpd->wbc); + + if (!err) + mpd->pages_written++; + /* + * In error case, we have to continue because + * remaining pages are still locked + */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + ext4_io_submit(&io_submit); + return ret; +} + +static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) +{ + int nr_pages, i; + pgoff_t index, end; + struct pagevec pvec; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + index = mpd->first_page; + end = mpd->next_page - 1; + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + if (page->index > end) + break; + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + block_invalidatepage(page, 0); + ClearPageUptodate(page); + unlock_page(page); + } + index = pvec.pages[nr_pages - 1]->index + 1; + pagevec_release(&pvec); + } + return; +} + +static void ext4_print_free_blocks(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + printk(KERN_CRIT "Total free blocks count %lld\n", + ext4_count_free_blocks(inode->i_sb)); + printk(KERN_CRIT "Free/Dirty block details\n"); + printk(KERN_CRIT "free_blocks=%lld\n", + (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); + printk(KERN_CRIT "dirty_blocks=%lld\n", + (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + printk(KERN_CRIT "Block reservation details\n"); + printk(KERN_CRIT "i_reserved_data_blocks=%u\n", + EXT4_I(inode)->i_reserved_data_blocks); + printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", + EXT4_I(inode)->i_reserved_meta_blocks); + return; +} + +/* + * mpage_da_map_and_submit - go through given space, map them + * if necessary, and then submit them for I/O + * + * @mpd - bh describing space + * + * The function skips 
space we know is already mapped to disk blocks. + * + */ +static void mpage_da_map_and_submit(struct mpage_da_data *mpd) +{ + int err, blks, get_blocks_flags; + struct ext4_map_blocks map, *mapp = NULL; + sector_t next = mpd->b_blocknr; + unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; + loff_t disksize = EXT4_I(mpd->inode)->i_disksize; + handle_t *handle = NULL; + + /* + * If the blocks are mapped already, or we couldn't accumulate + * any blocks, then proceed immediately to the submission stage. + */ + if ((mpd->b_size == 0) || + ((mpd->b_state & (1 << BH_Mapped)) && + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten)))) + goto submit_io; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + + /* + * Call ext4_map_blocks() to allocate any delayed allocation + * blocks, or to convert an uninitialized extent to be + * initialized (in the case where we have written into + * one or more preallocated blocks). + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to + * indicate that we are on the delayed allocation path. This + * affects functions in many different parts of the allocation + * call path. This flag exists primarily because we don't + * want to change *many* call functions, so ext4_map_blocks() + * will set the EXT4_STATE_DELALLOC_RESERVED flag once the + * inode's allocation semaphore is taken. + * + * If the blocks in questions were delalloc blocks, set + * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting + * variables are updated after the blocks have been allocated. + */ + map.m_lblk = next; + map.m_len = max_blocks; + get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + if (ext4_should_dioread_nolock(mpd->inode)) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + if (mpd->b_state & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); + if (blks < 0) { + struct super_block *sb = mpd->inode->i_sb; + + err = blks; + /* + * If get block returns EAGAIN or ENOSPC and there + * appears to be free blocks we will just let + * mpage_da_submit_io() unlock all of the pages. + */ + if (err == -EAGAIN) + goto submit_io; + + if (err == -ENOSPC && + ext4_count_free_blocks(sb)) { + mpd->retval = err; + goto submit_io; + } + + /* + * get block failure will cause us to loop in + * writepages, because a_ops->writepage won't be able + * to make progress. The page will be redirtied by + * writepage and writepages will again try to write + * the same. + */ + if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ext4_msg(sb, KERN_CRIT, + "delayed block allocation failed for inode %lu " + "at logical offset %llu with max blocks %zd " + "with error %d", mpd->inode->i_ino, + (unsigned long long) next, + mpd->b_size >> mpd->inode->i_blkbits, err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! 
Data will be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(mpd->inode); + } + /* invalidate all the pages */ + ext4_da_block_invalidatepages(mpd); + + /* Mark this page range as having been completed */ + mpd->io_done = 1; + return; + } + BUG_ON(blks == 0); + + mapp = ↦ + if (map.m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = mpd->inode->i_sb->s_bdev; + int i; + + for (i = 0; i < map.m_len; i++) + unmap_underlying_metadata(bdev, map.m_pblk + i); + } + + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) + /* This only happens if the journal is aborted */ + return; + } + + /* + * Update on-disk size along with block allocation. + */ + disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; + if (disksize > i_size_read(mpd->inode)) + disksize = i_size_read(mpd->inode); + if (disksize > EXT4_I(mpd->inode)->i_disksize) { + ext4_update_i_disksize(mpd->inode, disksize); + err = ext4_mark_inode_dirty(handle, mpd->inode); + if (err) + ext4_error(mpd->inode->i_sb, + "Failed to mark inode %lu dirty", + mpd->inode->i_ino); + } + +submit_io: + mpage_da_submit_io(mpd, mapp); + mpd->io_done = 1; +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, size_t b_size, + unsigned long b_state) +{ + sector_t next; + int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + + /* + * XXX Don't go larger than mballoc is willing to allocate + * This is a stopgap solution. We eventually need to fold + * mpage_da_submit_io() into this function and then call + * ext4_map_blocks() multiple times in a loop + */ + if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) + goto flush_it; + + /* check if thereserved journal credits might overflow */ + if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } + /* + * First block in the extent + */ + if (mpd->b_size == 0) { + mpd->b_blocknr = logical; + mpd->b_size = b_size; + mpd->b_state = b_state & BH_FLAGS; + return; + } + + next = mpd->b_blocknr + nrblocks; + /* + * Can we merge the block to our big extent? 
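+ *
+ * A merge requires both logical contiguity (logical == next) and an
+ * identical buffer state (b_state & BH_FLAGS); otherwise the extent
+ * collected so far is flushed below and a new one is started.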
+ */ + if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { + mpd->b_size += b_size; + return; + } + +flush_it: + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + mpage_da_map_and_submit(mpd); + return; +} + +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) +{ + return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); +} + +/* + * This is a special get_blocks_t callback which is used by + * ext4_da_write_begin(). It will either return mapped block or + * reserve space for a single block. + * + * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. + * We also have b_blocknr = -1 and b_bdev initialized properly + * + * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. + * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev + * initialized properly. + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct ext4_map_blocks map; + int ret = 0; + sector_t invalid_block = ~((sector_t) 0xffff); + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + + BUG_ON(create == 0); + BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + + map.m_lblk = iblock; + map.m_len = 1; + + /* + * first, we need to know whether the block is allocated already + * preallocated blocks are unmapped but should treated + * the same as allocated blocks. + */ + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret == 0) { + if (buffer_delay(bh)) + return 0; /* Not sure this could or should happen */ + /* + * XXX: __block_write_begin() unmaps passed block, is it OK? + */ + ret = ext4_da_reserve_space(inode, iblock); + if (ret) + /* not enough space to reserve */ + return ret; + + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + + if (buffer_unwritten(bh)) { + /* A delayed write to unwritten bh should be marked + * new and mapped. Mapped ensures that we don't do + * get_block multiple times when we write to the same + * offset and new ensures that we do proper zero out + * for partial write. + */ + set_buffer_new(bh); + set_buffer_mapped(bh); + } + return 0; +} + +/* + * This function is used as a standard get_block_t calback function + * when there is no desire to allocate any blocks. It is used as a + * callback function for block_write_begin() and block_write_full_page(). + * These functions should only try to map a single block at a time. + * + * Since this function doesn't do block allocations even if the caller + * requests it by passing in create=1, it is critically important that + * any caller checks to make sure that any buffer heads are returned + * by this function are either all already mapped or marked for + * delayed allocation before calling block_write_full_page(). Otherwise, + * b_blocknr could be left unitialized, and the page write functions will + * be taken by surprise. 
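+ *
+ * That is why the create argument is ignored and _ext4_get_block()
+ * is always called with no allocation flags here.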
+ */ +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + return _ext4_get_block(inode, iblock, bh_result, 0); +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + unsigned int len) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + ClearPageChecked(page); + page_bufs = page_buffers(page); + BUG_ON(!page_bufs); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); + + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + BUG_ON(!ext4_handle_valid(handle)); + + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + if (ret == 0) + ret = err; + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + ext4_set_inode_state(inode, EXT4_STATE_JDATA); +out: + return ret; +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); + +/* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), no one guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * + * This function can get called via... + * - ext4_da_writepages after taking page lock (have journal handle) + * - journal_submit_inode_data_buffers (no journal handle) + * - shrink_page_list via pdflush (no journal handle) + * - grab_page_cache when doing write_begin (have journal handle) + * + * We don't do any block allocation in this function. If we have page with + * multiple blocks we need to write those buffer_heads that are mapped. This + * is important for mmaped based write. So if we do with blocksize 1K + * truncate(f, 1024); + * a = mmap(f, 0, 4096); + * a[0] = 'a'; + * truncate(f, 4096); + * we have in the page first buffer_head mapped via page_mkwrite call back + * but other bufer_heads would be unmapped but dirty(dirty done via the + * do_wp_page). So writepage should write the first block. If we modify + * the mmap area beyond 1024 we will again get a page_fault and the + * page_mkwrite callback will do the block allocation and mark the + * buffer_heads mapped. + * + * We redirty the page if we have any buffer_heads that is either delay or + * unwritten in the page. + * + * We can get recursively called as show below. 
+ * + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext4_writepage() + * + * But since we don't do any block allocation we should not deadlock. + * Page also have the dirty flag cleared so we don't get recurive page_lock. + */ +static int ext4_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0, commit_write = 0; + loff_t size; + unsigned int len; + struct buffer_head *page_bufs = NULL; + struct inode *inode = page->mapping->host; + + trace_ext4_writepage(page); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + /* + * If the page does not have buffers (for whatever reason), + * try to create them using __block_write_begin. If this + * fails, redirty the page and move on. + */ + if (!page_has_buffers(page)) { + if (__block_write_begin(page, 0, len, + noalloc_get_block_write)) { + redirty_page: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + commit_write = 1; + } + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { + /* + * We don't want to do block allocation, so redirty + * the page and return. We may reach here when we do + * a journal commit via journal_submit_inode_data_buffers. + * We can also reach here via shrink_page_list + */ + goto redirty_page; + } + if (commit_write) + /* now mark the buffer_heads as dirty and uptodate */ + block_commit_write(page, 0, len); + + if (PageChecked(page) && ext4_should_journal_data(inode)) + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + return __ext4_journalled_writepage(page, len); + + if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + ret = block_write_full_page_endio(page, noalloc_get_block_write, + wbc, ext4_end_io_buffer_write); + } else + ret = block_write_full_page(page, noalloc_get_block_write, + wbc); + + return ret; +} + +/* + * This is called via ext4_da_writepages() to + * calculate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation. + */ + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + /* + * With non-extent format the journal credit needed to + * insert nrblocks contiguous block is dependent on + * number of contiguous block. So we will limit + * number of contiguous block to a sane value + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + + return ext4_chunk_trans_blocks(inode, max_blocks); +} + +/* + * write_cache_pages_da - walk the list of dirty pages of the given + * address space and accumulate pages that need writing, and call + * mpage_da_map_and_submit to map a single contiguous memory region + * and then write them. 
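+ *
+ * This is modelled on write_cache_pages(), except that the pages are
+ * left locked so that mpage_da_map_and_submit() can later write the
+ * whole accumulated extent in one batch.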
+ */ +static int write_cache_pages_da(struct address_space *mapping, + struct writeback_control *wbc, + struct mpage_da_data *mpd, + pgoff_t *done_index) +{ + struct buffer_head *bh, *head; + struct inode *inode = mapping->host; + struct pagevec pvec; + unsigned int nr_pages; + sector_t logical; + pgoff_t index, end; + long nr_to_write = wbc->nr_to_write; + int i, tag, ret = 0; + + memset(mpd, 0, sizeof(struct mpage_da_data)); + mpd->wbc = wbc; + mpd->inode = inode; + pagevec_init(&pvec, 0); + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + + *done_index = index; + while (index <= end) { + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + return 0; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) + goto out; + + *done_index = page->index + 1; + + /* + * If we can't merge this page, and we have + * accumulated an contiguous region, write it + */ + if ((mpd->next_page != page->index) && + (mpd->next_page != mpd->first_page)) { + mpage_da_map_and_submit(mpd); + goto ret_extent_tail; + } + + lock_page(page); + + /* + * If the page is no longer dirty, or its + * mapping no longer corresponds to inode we + * are writing (which means it has been + * truncated or invalidated), or the page is + * already under writeback and we are not + * doing a data integrity writeback, skip the page + */ + if (!PageDirty(page) || + (PageWriteback(page) && + (wbc->sync_mode == WB_SYNC_NONE)) || + unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + wait_on_page_writeback(page); + BUG_ON(PageWriteback(page)); + + if (mpd->next_page != page->index) + mpd->first_page = page->index; + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + mpage_add_bh_to_extent(mpd, logical, + PAGE_CACHE_SIZE, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + if (mpd->io_done) + goto ret_extent_tail; + } else { + /* + * Page with regular buffer heads, + * just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_writepage + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_size, + bh->b_state); + if (mpd->io_done) + goto ret_extent_tail; + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { + /* + * mapped dirty buffer. We need + * to update the b_state + * because we look at b_state + * in mpage_da_map_blocks. We + * don't update b_size because + * if we find an unmapped + * buffer_head later we need to + * use the b_state flag of that + * buffer_head. 
+ */ + if (mpd->b_size == 0) + mpd->b_state = bh->b_state & BH_FLAGS; + } + logical++; + } while ((bh = bh->b_this_page) != head); + } + + if (nr_to_write > 0) { + nr_to_write--; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ + goto out; + } + } + pagevec_release(&pvec); + cond_resched(); + } + return 0; +ret_extent_tail: + ret = MPAGE_DA_EXTENT_TAIL; +out: + pagevec_release(&pvec); + cond_resched(); + return ret; +} + + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + pgoff_t index; + int range_whole = 0; + handle_t *handle = NULL; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int pages_written = 0; + unsigned int max_pages; + int range_cyclic, cycled = 1, io_done = 0; + int needed_blocks, ret = 0; + long desired_nr_to_write, nr_to_writebump = 0; + loff_t range_start = wbc->range_start; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + pgoff_t done_index = 0; + pgoff_t end; + + trace_ext4_da_writepages(inode, wbc); + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. We test + * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_da_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. + */ + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) + return -EROFS; + + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + + range_cyclic = wbc->range_cyclic; + if (wbc->range_cyclic) { + index = mapping->writeback_index; + if (index) + cycled = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = LLONG_MAX; + wbc->range_cyclic = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + } + + /* + * This works around two forms of stupidity. The first is in + * the writeback code, which caps the maximum number of pages + * written to be 1024 pages. This is wrong on multiple + * levels; different architectues have a different page size, + * which changes the maximum amount of data which gets + * written. Secondly, 4 megabytes is way too small. XFS + * forces this value to be 16 megabytes by multiplying + * nr_to_write parameter by four, and then relies on its + * allocator to allocate larger extents to make them + * contiguous. Unfortunately this brings us to the second + * stupidity, which is that ext4's mballoc code only allocates + * at most 2048 blocks. So we force contiguous writes up to + * the number of dirty blocks in the inode, or + * sbi->max_writeback_mb_bump whichever is smaller. 
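+ *
+ * For example, with 4K pages (PAGE_CACHE_SHIFT == 12) and the usual
+ * 128MB setting of max_writeback_mb_bump, max_pages below works out
+ * to 128 << 8 = 32768 pages, i.e. 128MB worth of pages per call.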
+ */ + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); + if (!range_cyclic && range_whole) { + if (wbc->nr_to_write == LONG_MAX) + desired_nr_to_write = wbc->nr_to_write; + else + desired_nr_to_write = wbc->nr_to_write * 8; + } else + desired_nr_to_write = ext4_num_dirty_pages(inode, index, + max_pages); + if (desired_nr_to_write > max_pages) + desired_nr_to_write = max_pages; + + if (wbc->nr_to_write < desired_nr_to_write) { + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; + wbc->nr_to_write = desired_nr_to_write; + } + +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + + while (!ret && wbc->nr_to_write > 0) { + + /* + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); + + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " + "%ld pages, ino %lu; err %d", __func__, + wbc->nr_to_write, inode->i_ino, ret); + goto out_writepages; + } + + /* + * Now call write_cache_pages_da() to find the next + * contiguous region of logical blocks that need + * blocks to be allocated by ext4 and submit them. + */ + ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); + /* + * If we have a contiguous extent of pages and we + * haven't done the I/O yet, map the blocks and submit + * them for I/O. + */ + if (!mpd.io_done && mpd.next_page != mpd.first_page) { + mpage_da_map_and_submit(&mpd); + ret = MPAGE_DA_EXTENT_TAIL; + } + trace_ext4_da_write_pages(inode, &mpd); + wbc->nr_to_write -= mpd.pages_written; + + ext4_journal_stop(handle); + + if ((mpd.retval == -ENOSPC) && sbi->s_journal) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ + jbd2_journal_force_commit_nested(sbi->s_journal); + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ + pages_written += mpd.pages_written; + ret = 0; + io_done = 1; + } else if (wbc->nr_to_write) + /* + * There is no more writeout needed + * or we requested for a noblocking writeout + * and we found the device congested + */ + break; + } + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } + + /* Update index */ + wbc->range_cyclic = range_cyclic; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = done_index; + +out_writepages: + wbc->nr_to_write -= nr_to_writebump; + wbc->range_start = range_start; + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + return ret; +} + +#define FALL_BACK_TO_NONDELALLOC 1 +static int ext4_nonda_switch(struct super_block *sb) +{ + s64 free_blocks, dirty_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * switch to non delalloc mode if we are running low + * on free block. The free block accounting via percpu + * counters can get slightly wrong with percpu_counter_batch getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. 
So switch + * to non delalloc when we are near to error range. + */ + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + if (2 * free_blocks < 3 * dirty_blocks || + free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { + /* + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark + */ + return 1; + } + /* + * Even if we don't switch but are nearing capacity, + * start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (free_blocks < 2 * dirty_blocks) + writeback_inodes_sb_if_idle(sb); + + return 0; +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret, retries = 0; + struct page *page; + pgoff_t index; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + + if (ext4_nonda_switch(inode->i_sb)) { + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; + return ext4_write_begin(file, mapping, pos, + len, flags, pagep, fsdata); + } + *fsdata = (void *)0; + trace_ext4_da_write_begin(inode, pos, len, flags); +retry: + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. + */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. 
+ */ + if (pos + len > inode->i_size) + ext4_truncate_failed_write(inode); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + int write_mode = (int)(unsigned long)fsdata; + + if (write_mode == FALL_BACK_TO_NONDELALLOC) { + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + return ext4_ordered_write_end(file, mapping, pos, + len, copied, page, fsdata); + case EXT4_INODE_WRITEBACK_DATA_MODE: + return ext4_writeback_write_end(file, mapping, pos, + len, copied, page, fsdata); + default: + BUG(); + } + } + + trace_ext4_da_write_end(inode, pos, len, copied); + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + + new_i_size = pos + copied; + if (copied && new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + ext4_da_page_release_reservation(page, offset); + +out: + ext4_invalidatepage(page, offset); + + return; +} + +/* + * Force all delayed allocation blocks to be allocated for a given inode. + */ +int ext4_alloc_da_blocks(struct inode *inode) +{ + trace_ext4_alloc_da_blocks(inode); + + if (!EXT4_I(inode)->i_reserved_data_blocks && + !EXT4_I(inode)->i_reserved_meta_blocks) + return 0; + + /* + * We do something simple for now. The filemap_flush() will + * also start triggering a write of the data blocks, which is + * not strictly speaking necessary (and for users of + * laptop_mode, not even desirable). 
However, to do otherwise + * would require replicating code paths in: + * + * ext4_da_writepages() -> + * write_cache_pages() ---> (via passed in callback function) + * __mpage_da_writepage() --> + * mpage_add_bh_to_extent() + * mpage_da_map_blocks() + * + * The problem is that write_cache_pages(), located in + * mm/page-writeback.c, marks pages clean in preparation for + * doing I/O, which is not desirable if we're not planning on + * doing I/O at all. + * + * We could call write_cache_pages(), and then redirty all of + * the pages by calling redirty_page_for_writepage() but that + * would be ugly in the extreme. So instead we would need to + * replicate parts of the code in the above functions, + * simplifying them because we wouldn't actually intend to + * write out the pages, but rather only collect contiguous + * logical block extents, call the multi-block allocator, and + * then update the buffer heads with the block allocations. + * + * For now, though, we'll cheat by calling filemap_flush(), + * which will map the blocks, and start the I/O, but not + * actually wait for the I/O to complete. + */ + return filemap_flush(inode->i_mapping); +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext4 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file + */ + filemap_write_and_wait(mapping); + } + + if (EXT4_JOURNAL(inode) && + ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT4_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. 
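+ *
+ * Flushing the journal below forces any journalled data blocks out
+ * to their final location on disk, so the block number returned by
+ * generic_block_bmap() afterwards is actually meaningful.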
+ */ + + ext4_clear_inode_state(inode, EXT4_STATE_JDATA); + journal = EXT4_JOURNAL(inode); + jbd2_journal_lock_updates(journal); + err = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping, block, ext4_get_block); +} + +static int ext4_readpage(struct file *file, struct page *page) +{ + trace_ext4_readpage(page); + return mpage_readpage(page, ext4_get_block); +} + +static int +ext4_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); +} + +static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + if (!page_has_buffers(page)) + return; + head = bh = page_buffers(page); + do { + if (offset <= curr_off && test_clear_buffer_uninit(bh) + && bh->b_private) { + ext4_free_io_end(bh->b_private); + bh->b_private = NULL; + bh->b_end_io = NULL; + } + curr_off = curr_off + bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_invalidatepage(page, offset); + + /* + * free any io_end structure allocated for buffers to be discarded + */ + if (ext4_should_dioread_nolock(page->mapping->host)) + ext4_invalidatepage_free_endio(page, offset); + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0) + ClearPageChecked(page); + + if (journal) + jbd2_journal_invalidatepage(journal, page, offset); + else + block_invalidatepage(page, offset); +} + +static int ext4_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_releasepage(page); + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, wait); + else + return try_to_free_buffers(page); +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. 
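+ *
+ * The orphan handling is visible below: the inode is put on the
+ * orphan list before an extending write starts, and removed again
+ * once i_size has been safely updated or the write has failed.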
+ */ +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && ext4_should_dioread_nolock(inode)) { + if (unlikely(!list_empty(&ei->i_completed_io_list))) { + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL, NULL, 0); + } else { + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * ext4_get_block used when preparing for a DIO write or buffer write. + * We allocate an uinitialized extent if blocks haven't been allocated. + * The extent will be converted to initialized after the IO is complete. 
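+ *
+ * This is achieved by passing EXT4_GET_BLOCKS_IO_CREATE_EXT down to
+ * _ext4_get_block(), which keeps freshly allocated extents
+ * uninitialized until the end_io path converts them.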
+ */ +static int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); +} + +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private, int ret, + bool is_async) +{ + ext4_io_end_t *io_end = iocb->private; + struct workqueue_struct *wq; + unsigned long flags; + struct ext4_inode_info *ei; + + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) + goto out; + + ext_debug("ext4_end_io_dio(): io_end 0x%p" + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + iocb->private, io_end->inode->i_ino, iocb, offset, + size); + + /* if not aio dio with unwritten extents, just free io and return */ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + ext4_free_io_end(io_end); + iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); + return; + } + + io_end->offset = offset; + io_end->size = size; + if (is_async) { + io_end->iocb = iocb; + io_end->result = ret; + } + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; + + /* Add the io_end to per-inode completed aio dio list*/ + ei = EXT4_I(io_end->inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &ei->i_completed_io_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); + iocb->private = NULL; +} + +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) +{ + ext4_io_end_t *io_end = bh->b_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + + if (!test_clear_buffer_uninit(bh) || !io_end) + goto out; + + if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { + printk("sb umounted, discard end_io request for inode %lu\n", + io_end->inode->i_ino); + ext4_free_io_end(io_end); + goto out; + } + + /* + * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, + * but being more careful is always safe for the future change. + */ + inode = io_end->inode; + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +out: + bh->b_private = NULL; + bh->b_end_io = NULL; + clear_buffer_uninit(bh); + end_buffer_async_write(bh, uptodate); +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; + size_t size = bh->b_size; + +retry: + io_end = ext4_init_io_end(inode, GFP_ATOMIC); + if (!io_end) { + pr_warn_ratelimited("%s: allocation fail\n", __func__); + schedule(); + goto retry; + } + io_end->offset = offset; + io_end->size = size; + /* + * We need to hold a reference to the page to make sure it + * doesn't get evicted before ext4_end_io_work() has a chance + * to convert the extent from written to unwritten. 
+ */ + io_end->page = page; + get_page(io_end->page); + + bh->b_private = io_end; + bh->b_end_io = ext4_end_io_buffer_write; + return 0; +} + +/* + * For ext4 extent files, ext4 will do direct-io write to holes, + * preallocated extents, and those write extend the file, no need to + * fall back to buffered IO. + * + * For holes, we fallocate those blocks, mark them as uninitialized + * If those blocks were preallocated, we mark sure they are splited, but + * still keep the range to write as uninitialized. + * + * The unwrritten extents will be converted to written when DIO is completed. + * For async direct IO, since the IO may still pending when return, we + * set up an end_io call back function, which will do the conversion + * when async direct IO completed. + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + */ +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + size_t count = iov_length(iov, nr_segs); + + loff_t final_size = offset + count; + if (rw == WRITE && final_size <= inode->i_size) { + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as uninitialized + * to prevent parallel buffered read to expose the stale data + * before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block + * will just simply mark the buffer mapped but still + * keep the extents uninitialized. + * + * for non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * for async DIO, the conversion needs to be defered when + * the IO is completed. The ext4 end_io callback function + * will be called to take care of the conversion work. + * Here for async case, we allocate an io_end structure to + * hook to the iocb. + */ + iocb->private = NULL; + EXT4_I(inode)->cur_aio_dio = NULL; + if (!is_sync_kiocb(iocb)) { + iocb->private = ext4_init_io_end(inode, GFP_NOFS); + if (!iocb->private) + return -ENOMEM; + /* + * we save the io structure for current async + * direct IO, so that later ext4_map_blocks() + * could flag the io structure whether there + * is a unwritten extents needs to be converted + * when IO is completed. + */ + EXT4_I(inode)->cur_aio_dio = iocb->private; + } + + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_write, + ext4_end_io_dio); + if (iocb->private) + EXT4_I(inode)->cur_aio_dio = NULL; + /* + * The io_end structure takes a reference to the inode, + * that structure needs to be destroyed and the + * reference to the inode need to be dropped, when IO is + * complete, even with 0 byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will be + * desctroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since + * VFS direct IO won't invoke the end_io call back function, + * we need to free the end_io structure here. 
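+ *
+ * (A return of -EIOCBQUEUED means the request went asynchronous; in
+ * that case the completion callback ext4_end_io_dio() is responsible
+ * for the io_end instead.)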
+ */ + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0 && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the conversion right here + */ + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } + return ret; + } + + /* for write the the end of file case, we fall back to old way */ + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +} + +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + else + ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + trace_ext4_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); + return ret; +} + +/* + * Pages can be marked dirty completely asynchronously from ext4's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do + * much here because ->set_page_dirty is called under VFS locks. The page is + * not necessarily locked. + * + * We cannot just dirty the page and leave attached buffers clean, because the + * buffers' dirty state is "definitive". We cannot just set the buffers dirty + * or jbddirty because all the journalling code will explode. + * + * So what we do is to mark the page "pending dirty" and next time writepage + * is called, propagate that into the buffers appropriately. 
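+ *
+ * "Pending dirty" is the PageChecked flag set below; ext4_writepage()
+ * tests it and journals the whole page via
+ * __ext4_journalled_writepage() when it is set.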
+ */ +static int ext4_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations ext4_ordered_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .write_begin = ext4_write_begin, + .write_end = ext4_ordered_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_writeback_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .write_begin = ext4_write_begin, + .write_end = ext4_writeback_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_journalled_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .writepages = ext4_da_writepages, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void ext4_set_aops(struct inode *inode) +{ + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_ordered_aops; + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_writeback_aops; + break; + case EXT4_INODE_JOURNAL_DATA_MODE: + inode->i_mapping->a_ops = &ext4_journalled_aops; + break; + default: + BUG(); + } +} + +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. 
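+ *
+ * For example, with a 4K block size, truncating to offset 1000
+ * zeroes bytes 1000..4095 of the block containing the new EOF.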
+ */ +int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, max, pos; + ext4_lblk_t iblock; + struct inode *inode = mapping->host; + struct buffer_head *bh; + struct page *page; + int err = 0; + + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -EINVAL; + + blocksize = inode->i_sb->s_blocksize; + max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + + zero_user(page, offset, length); + + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. 
+ * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. 
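+ *
+ * The caller (ext4_free_data()) stops accumulating runs on any
+ * non-zero return, and on a fatal error bails out without dirtying
+ * the indirect block.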
+ */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. 
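The kernel-doc above describes how ext4_free_data() coalesces contiguous block numbers into runs so that each run costs only one free call. A toy model of that coalescing (plain userspace C, with made-up block numbers; zero entries stand for holes and are simply skipped, as in the real loop):

/* Toy model of the contiguous-run coalescing performed by ext4_free_data(). */
#include <stdio.h>

int main(void)
{
    unsigned long blocks[] = { 100, 101, 102, 0, 200, 201, 50, 51, 52 };
    size_t n = sizeof(blocks) / sizeof(blocks[0]);
    unsigned long start = 0;
    unsigned long count = 0;

    for (size_t i = 0; i < n; i++) {
        unsigned long nr = blocks[i];

        if (!nr)
            continue;                 /* a hole: nothing to free */
        if (count == 0) {
            start = nr;               /* first block of a new run */
            count = 1;
        } else if (nr == start + count) {
            count++;                  /* still contiguous: extend the run */
        } else {
            printf("free %lu blocks starting at %lu\n", count, start);
            start = nr;               /* begin a new run */
            count = 1;
        }
    }
    if (count)
        printf("free %lu blocks starting at %lu\n", count, start);
    return 0;
}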
*/ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. 
+ */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +int ext4_can_truncate(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext4_inode_is_fast_symlink(inode); + return 0; +} + +/* + * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * associated with the given offset and length + * + * @inode: File inode + * @offset: The offset where the hole will begin + * @len: The length of the hole + * + * Returns: 0 on sucess or negative on failure + */ + +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + if (!S_ISREG(inode->i_mode)) + return -ENOTSUPP; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + /* TODO: Add support for non extent hole punching */ + return -ENOTSUPP; + } + + return ext4_ext_punch_hole(file, offset, length); +} + +/* + * ext4_truncate() + * + * We block out ext4_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext4_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commmit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext4_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext4 filesystem. 
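For context, ext4_punch_hole() above is reached from userspace through fallocate(2) with FALLOC_FL_PUNCH_HOLE, and (as the ENOTSUPP checks show) only extent-mapped regular files are supported at this point. A minimal caller might look like the sketch below; the file name is made up and error handling is abbreviated:

/* Minimal hole-punching example. FALLOC_FL_PUNCH_HOLE must be combined
 * with FALLOC_FL_KEEP_SIZE; the kernel forwards the request to the
 * filesystem, i.e. to ext4_punch_hole() above on ext4. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd = open("testfile", O_RDWR);   /* "testfile" is just an example */
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Deallocate 1 MiB starting at offset 4 MiB without changing i_size. */
    if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  4 * 1024 * 1024, 1 * 1024 * 1024) != 0)
        perror("fallocate(FALLOC_FL_PUNCH_HOLE)");
    close(fd);
    return 0;
}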
But + * that's fine - as long as they are linked from the inode, the post-crash + * ext4_truncate() run will find them and release them. + */ +void ext4_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + trace_ext4_truncate_enter(inode); + + if (!ext4_can_truncate(inode)) + return; + + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_ext_truncate(inode); + trace_ext4_truncate_exit(inode); + return; + } + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + } + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + goto out_unlock; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. 
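The block rounding at the top of ext4_truncate() above decides how many blocks survive the truncate and whether the tail of the final block needs zeroing. A tiny standalone model with example numbers (not kernel values):

/* Model of the rounding in ext4_truncate(): how many blocks must be
 * kept, and does the last one need its tail zeroed? */
#include <stdio.h>

int main(void)
{
    unsigned long long i_size    = 70000;  /* new size after truncate */
    unsigned long long blocksize = 4096;   /* stands in for s_blocksize */
    unsigned int blkbits         = 12;     /* log2(blocksize) */

    unsigned long long last_block = (i_size + blocksize - 1) >> blkbits;
    int partial_tail = (i_size & (blocksize - 1)) != 0;

    printf("keep %llu blocks%s\n", last_block,
           partial_tail ? ", zero the tail of the last one" : "");
    return 0;
}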
+ */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + +out_unlock: + up_write(&ei->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); +} + +/* + * ext4_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. + */ +static int __ext4_get_inode_loc(struct inode *inode, + struct ext4_iloc *iloc, int in_mem) +{ + struct ext4_group_desc *gdp; + struct buffer_head *bh; + struct super_block *sb = inode->i_sb; + ext4_fsblk_t block; + int inodes_per_block, inode_offset; + + iloc->bh = NULL; + if (!ext4_valid_inum(sb, inode->i_ino)) + return -EIO; + + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) + return -EIO; + + /* + * Figure out the offset within the block group inode table + */ + inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + inode_offset = ((inode->i_ino - 1) % + EXT4_INODES_PER_GROUP(sb)); + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + bh = sb_getblk(sb, block); + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); + return -EIO; + } + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. 
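__ext4_get_inode_loc() above turns an inode number into a (block group, inode-table block, byte offset) triple. The same arithmetic as a self-contained sketch; the geometry numbers are typical examples, not values read from a real superblock:

/* Model of the inode-location arithmetic in __ext4_get_inode_loc(). */
#include <stdio.h>

int main(void)
{
    unsigned long ino               = 8193;  /* example inode number  */
    unsigned long inodes_per_group  = 8192;  /* s_inodes_per_group    */
    unsigned long inodes_per_block  = 16;    /* 4096-byte block / 256-byte inode */
    unsigned long inode_size        = 256;   /* s_inode_size          */
    unsigned long long itable_start = 1234;  /* inode table block of that group (example) */

    unsigned long group       = (ino - 1) / inodes_per_group;
    unsigned long index       = (ino - 1) % inodes_per_group;
    unsigned long long block  = itable_start + index / inodes_per_block;
    unsigned long byte_offset = (index % inodes_per_block) * inode_size;

    printf("inode %lu: group %lu, itable block %llu, offset %lu bytes\n",
           ino, group, block, byte_offset);
    return 0;
}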
+ */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + + if (buffer_uptodate(bh)) { + /* someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + int i, start; + + start = inode_offset & ~(inodes_per_block - 1); + + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); + if (!bitmap_bh) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_block; i++) { + if (i == inode_offset) + continue; + if (ext4_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_block) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + + table = ext4_inode_table(sb, gdp); + /* s_inode_readahead_blks is always a power of 2 */ + b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + if (table > b) + b = table; + end = b + EXT4_SB(sb)->s_inode_readahead_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + num -= ext4_itable_unused_count(sb, gdp); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + sb_breadahead(sb, b++); + } + + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + trace_ext4_load_inode(inode); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ_META, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); + brelse(bh); + return -EIO; + } + } +has_buffer: + iloc->bh = bh; + return 0; +} + +int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) +{ + /* We have all inode data except xattrs in memory here. 
*/ + return __ext4_get_inode_loc(inode, iloc, + !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); +} + +void ext4_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT4_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT4_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT4_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT4_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT4_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT4_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ +void ext4_get_inode_flags(struct ext4_inode_info *ei) +{ + unsigned int vfs_fl; + unsigned long old_fl, new_fl; + + do { + vfs_fl = ei->vfs_inode.i_flags; + old_fl = ei->i_flags; + new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| + EXT4_DIRSYNC_FL); + if (vfs_fl & S_SYNC) + new_fl |= EXT4_SYNC_FL; + if (vfs_fl & S_APPEND) + new_fl |= EXT4_APPEND_FL; + if (vfs_fl & S_IMMUTABLE) + new_fl |= EXT4_IMMUTABLE_FL; + if (vfs_fl & S_NOATIME) + new_fl |= EXT4_NOATIME_FL; + if (vfs_fl & S_DIRSYNC) + new_fl |= EXT4_DIRSYNC_FL; + } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); +} + +static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + blkcnt_t i_blocks ; + struct inode *inode = &(ei->vfs_inode); + struct super_block *sb = inode->i_sb; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + /* we are using combined 48 bit field */ + i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | + le32_to_cpu(raw_inode->i_blocks_lo); + if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { + /* i_blocks represent file system block size */ + return i_blocks << (inode->i_blkbits - 9); + } else { + return i_blocks; + } + } else { + return le32_to_cpu(raw_inode->i_blocks_lo); + } +} + +struct inode *ext4_iget(struct super_block *sb, unsigned long ino) +{ + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + struct ext4_inode_info *ei; + struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; + long ret; + int block; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT4_I(inode); + iloc.bh = NULL; + + ret = __ext4_get_inode_loc(inode, &iloc, 0); + if (ret < 0) + goto bad_inode; + raw_inode = ext4_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt(inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. 
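ext4_inode_blocks() above stitches the 48-bit block count back together from the two on-disk fields and, for HUGE_FILE inodes, converts filesystem blocks into 512-byte units. A sketch of that reconstruction with invented field values:

/* Model of the 48-bit i_blocks reconstruction in ext4_inode_blocks(). */
#include <stdio.h>

int main(void)
{
    unsigned int i_blocks_lo     = 0x89ABCDEF; /* low 32 bits (example)  */
    unsigned int i_blocks_high   = 0x0123;     /* high 16 bits (example) */
    int          huge_file_inode = 1;          /* EXT4_INODE_HUGE_FILE set? */
    unsigned int blkbits         = 12;         /* 4 KiB filesystem blocks */

    unsigned long long blocks =
        ((unsigned long long)i_blocks_high << 32) | i_blocks_lo;

    /* With the HUGE_FILE inode flag the on-disk count is in filesystem
     * blocks and must be converted to 512-byte units for i_blocks. */
    if (huge_file_inode)
        blocks <<= (blkbits - 9);

    printf("i_blocks = %llu sectors\n", blocks);
    return 0;
}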
+ * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + /* this inode is deleted */ + ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + ei->i_flags = le32_to_cpu(raw_inode->i_flags); + inode->i_blocks = ext4_inode_blocks(raw_inode, ei); + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; + inode->i_size = ext4_isize(raw_inode); + ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + ei->i_last_alloc_group = ~0; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT4_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb)) { + ret = -EIO; + goto bad_inode; + } + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. 
*/ + ei->i_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + } else { + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } + } else + ei->i_extra_isize = 0; + + EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); + + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } + + ret = 0; + if (ei->i_file_acl && + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ei->i_file_acl); + ret = -EIO; + goto bad_inode; + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_check_inode_blockref(inode); + } + if (ret) + goto bad_inode; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (ext4_inode_is_fast_symlink(inode)) { + inode->i_op = &ext4_fast_symlink_inode_operations; + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { + inode->i_op = &ext4_symlink_inode_operations; + ext4_set_aops(inode); + } + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_op = &ext4_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else { + ret = -EIO; + EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); + goto bad_inode; + } + brelse(iloc.bh); + ext4_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + brelse(iloc.bh); + iget_failed(inode); + return ERR_PTR(ret); +} + +static int ext4_inode_blocks_set(handle_t *handle, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + struct inode *inode = &(ei->vfs_inode); + u64 i_blocks = inode->i_blocks; + struct super_block *sb = inode->i_sb; + + if (i_blocks <= ~0U) { + /* + * i_blocks can be represnted in a 32 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = 0; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { + /* + * i_blocks can be represented in a 48 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = 
cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + } else { + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); + /* i_block is stored in file system block size */ + i_blocks = i_blocks >> (inode->i_blkbits - 9); + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + } + return 0; +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + + ext4_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + + if (ext4_inode_blocks_set(handle, raw_inode, ei)) + goto out_brelse; + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_HURD)) + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); + ext4_isize_set(raw_inode, ei->i_disksize); + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT4_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT4_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. 
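The uid/gid handling above splits each 32-bit id into 16-bit low and high halves so that old 16-bit-uid kernels still see a sensible value, and zeroes the high halves for deleted inodes so a reused slot does not inherit stale bits. A trivial model of the split (values are examples):

/* Sketch of the low_16_bits()/high_16_bits() split used when the
 * on-disk inode is written. */
#include <stdio.h>

int main(void)
{
    unsigned int uid      = 100000;        /* a uid that needs 32 bits */
    unsigned int uid_low  = uid & 0xFFFF;  /* what a 16-bit kernel sees */
    unsigned int uid_high = uid >> 16;     /* stored in a separate field */

    printf("uid %u -> low %u, high %u\n", uid, uid_low, uid_high);
    return 0;
}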
+ */ + err = ext4_journal_get_write_access(handle, + EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; + ext4_handle_sync(handle); + err = ext4_handle_dirty_metadata(handle, NULL, + EXT4_SB(sb)->s_sbh); + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + } + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, NULL, bh); + if (!err) + err = rc; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + + ext4_update_inode_fsync_trans(handle, inode, 0); +out_brelse: + brelse(bh); + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * ext4_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * trasnaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if tol to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err; + + if (current->flags & PF_MEMALLOC) + return 0; + + if (EXT4_SB(inode->i_sb)->s_journal) { + if (ext4_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + if (wbc->sync_mode != WB_SYNC_ALL) + return 0; + + err = ext4_force_commit(inode->i_sb); + } else { + struct ext4_iloc iloc; + + err = __ext4_get_inode_loc(inode, &iloc, 0); + if (err) + return err; + if (wbc->sync_mode == WB_SYNC_ALL) + sync_dirty_buffer(iloc.bh); + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, + "IO error syncing inode"); + err = -EIO; + } + brelse(iloc.bh); + } + return err; +} + +/* + * ext4_setattr() + * + * Called from notify_change. 
+ * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * we start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. + */ +int ext4_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error, rc = 0; + int orphan = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? - but truncate inode update has it) */ + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = dquot_transfer(inode, attr); + if (error) { + ext4_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + if (attr->ia_valid & ATTR_SIZE) { + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (attr->ia_size > sbi->s_bitmap_maxbytes) + return -EFBIG; + } + } + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && + (attr->ia_size < inode->i_size)) { + handle_t *handle; + + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext4_journal_stop(handle); + + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) { + /* Do as much error cleanup as possible */ + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + ext4_orphan_del(handle, inode); + orphan = 0; + ext4_journal_stop(handle); + goto err_out; + } + } + } + + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + ext4_truncate(inode); + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + ext4_truncate(inode); + } + + if (!rc) { + 
setattr_copy(inode, attr); + mark_inode_dirty(inode); + } + + /* + * If the call to ext4_truncate failed to get a transaction handle at + * all, we need to clean up the in-core orphan list manually. + */ + if (orphan && inode->i_nlink) + ext4_orphan_del(NULL, inode); + + if (!rc && (ia_valid & ATTR_MODE)) + rc = ext4_acl_chmod(inode); + +err_out: + ext4_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + unsigned long delalloc_blocks; + + inode = dentry->d_inode; + generic_fillattr(inode, stat); + + /* + * We can't update i_blocks if the block allocation is delayed + * otherwise in the case of system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to not confuse with user, stat + * will return the blocks that include the delayed allocation + * blocks for this file. + */ + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + return 0; +} + +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, + int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); +} + +/* + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups + * + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiuguous, with flexbg, + * they could still across block group boundary. + * + * Also account for superblock, inode, quota and xattr blocks + */ +static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? 
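ext4_indirect_trans_blocks() above estimates the worst-case number of index blocks a modification can dirty. The same estimate as a standalone sketch; the addr_per_block value of 1024 assumes 4 KiB blocks (blocksize / 4) and is only an example:

/* Model of the worst-case index-block estimate for indirect-mapped files. */
#include <stdio.h>

static unsigned indirect_trans_blocks(unsigned nrblocks, int contiguous,
                                      unsigned addr_per_block)
{
    if (contiguous)
        /* one run: at most N/addr_per_block + 1 indirect blocks,
         * 2 double-indirect blocks and 1 triple-indirect block */
        return (nrblocks + addr_per_block - 1) / addr_per_block + 4;

    /* discontiguous: every block may sit under its own indirect and
     * double-indirect block, plus one triple-indirect block */
    return nrblocks * 2 + 1;
}

int main(void)
{
    printf("contiguous 2048 blocks -> %u index blocks\n",
           indirect_trans_blocks(2048, 1, 1024));
    printf("scattered 16 blocks    -> %u index blocks\n",
           indirect_trans_blocks(16, 0, 1024));
    return 0;
}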
+ * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > ngroups) + groups = ngroups; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calculate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. + * + * This could be called via ext4_write_begin() + * + * We need to consider the worse case, when + * one new block per extent. + */ +int ext4_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + int ret; + + ret = ext4_meta_trans_blocks(inode, bpp, 0); + + /* Account for data blocks for journalled mode */ + if (ext4_should_journal_data(inode)) + ret += bpp; + return ret; +} + +/* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* + * The caller must have previously called ext4_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext4_iloc *iloc) +{ + int err = 0; + + if (test_opt(inode->i_sb, I_VERSION)) + inode_inc_iversion(inode); + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ + err = ext4_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int +ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc) +{ + int err; + + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * Expand an inode by new_extra_isize bytes. + * Returns 0 on success or negative error number on failure. 
+ */ +static int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc iloc, + handle_t *handle) +{ + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + return 0; + + raw_inode = ext4_raw_inode(&iloc); + + header = IHDR(inode, raw_inode); + + /* No extended attributes present */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, + new_extra_isize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + return 0; + } + + /* try to expand with EAs present */ + return ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; + int err, ret; + + might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (ext4_handle_valid(handle) && + EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { + /* + * We need extra buffer credits since we may write into EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. + */ + if ((jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + ret = ext4_expand_extra_isize(inode, + sbi->s_want_extra_isize, + iloc, handle); + if (ret) { + ext4_set_inode_state(inode, + EXT4_STATE_NO_EXPAND); + if (mnt_count != + le16_to_cpu(sbi->s_es->s_mnt_count)) { + ext4_warning(inode->i_sb, + "Unable to expand inode %lu. Delete" + " some EAs or run e2fsck.", + inode->i_ino); + mnt_count = + le16_to_cpu(sbi->s_es->s_mnt_count); + } + } + } + } + if (!err) + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * ext4_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. 
+ * + * Also, dquot_alloc_block() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ +void ext4_dirty_inode(struct inode *inode, int flags) +{ + handle_t *handle; + + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + + ext4_mark_inode_dirty(handle, inode); + + ext4_journal_stop(handle); +out: + return; +} + +#if 0 +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext4_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ +static int ext4_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + + int err = 0; + if (handle) { + err = ext4_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = jbd2_journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext4_handle_dirty_metadata(handle, + NULL, + iloc.bh); + brelse(iloc.bh); + } + } + ext4_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext4_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT4_JOURNAL(inode); + if (!journal) + return 0; + if (is_journal_aborted(journal)) + return -EROFS; + + jbd2_journal_lock_updates(journal); + jbd2_journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + else + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + ext4_set_aops(inode); + + jbd2_journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_mark_inode_dirty(handle, inode); + ext4_handle_sync(handle); + ext4_journal_stop(handle); + ext4_std_error(inode->i_sb, err); + + return err; +} + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + loff_t size; + unsigned long len; + int ret = -EINVAL; + void *fsdata; + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + + /* + * Get i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. 
+ */ + down_read(&inode->i_alloc_sem); + size = i_size_read(inode); + if (page->mapping != mapping || size <= page_offset(page) + || !PageUptodate(page)) { + /* page got truncated from under us? */ + goto out_unlock; + } + ret = 0; + + lock_page(page); + wait_on_page_writeback(page); + if (PageMappedToDisk(page)) { + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; + } + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + /* + * return if we have all the buffers mapped. This avoid + * the need to call write_begin/write_end which does a + * journal_start/journal_stop which can block and take + * long time + */ + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) { + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; + } + } + unlock_page(page); + /* + * OK, we need to fill the hole... Do write_begin write_end + * to do block allocation/reservation.We are not holding + * inode.i__mutex here. That allow * parallel write_begin, + * write_end call. lock_page prevent this from happening + * on the same page though + */ + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), + len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (ret < 0) + goto out_unlock; + ret = mapping->a_ops->write_end(file, mapping, page_offset(page), + len, len, page, fsdata); + if (ret < 0) + goto out_unlock; + ret = 0; + + /* + * write_begin/end might have created a dirty page and someone + * could wander in and start the IO. Make sure that hasn't + * happened. + */ + lock_page(page); + wait_on_page_writeback(page); + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; +out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; + up_read(&inode->i_alloc_sem); + return ret; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c new file mode 100644 index 00000000..4cbe1c2c --- /dev/null +++ b/fs/ext4/ioctl.c @@ -0,0 +1,442 @@ +/* + * linux/fs/ext4/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" + +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int flags; + + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT4_IOC_GETFLAGS: + ext4_get_inode_flags(ei); + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT4_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + unsigned int jflag; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + flags = ext4_mask_flags(inode->i_mode, flags); + + err = -EPERM; + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. 
Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if (oldflags & EXT4_EXTENTS_FL) { + /* We don't support clearning extent flags */ + if (!(flags & EXT4_EXTENTS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (flags & EXT4_EXTENTS_FL) { + /* migrate the file */ + migrate = 1; + flags &= ~EXT4_EXTENTS_FL; + } + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode); + inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) + err = ext4_ext_migrate(inode); +flags_out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT4_IOC_GETVERSION: + case EXT4_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT4_IOC_SETVERSION: + case EXT4_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext4_iloc iloc; + __u32 generation; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto setversion_out; + } + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = ext4_current_time(inode); + inode->i_generation = generation; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } + ext4_journal_stop(handle); +setversion_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } +#ifdef CONFIG_JBD2_DEBUG + case EXT4_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. 
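For reference, the GETFLAGS/SETFLAGS pair handled above is the same interface that lsattr(1) and chattr(1) use; userspace normally reaches it through the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS names from <linux/fs.h>, which share these ioctl numbers. A minimal sketch that sets the append-only flag; the file name is an example and error handling is terse:

/* Set the append-only attribute via the flags ioctl. The handler above
 * transfers an int-sized flags word. Changing APPEND/IMMUTABLE needs
 * CAP_LINUX_IMMUTABLE, as the capability check above shows. */
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
    int fd = open("logfile", O_RDONLY);   /* "logfile" is just an example */
    int flags;

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0) {
        perror("FS_IOC_GETFLAGS");
        return 1;
    }
    flags |= FS_APPEND_FL;
    if (ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0)
        perror("FS_IOC_SETFLAGS");
    close(fd);
    return 0;
}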
+ */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); + if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); + return ret; + } +#endif + case EXT4_IOC_GROUP_EXTEND: { + ext4_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err, err2=0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); + + return err; + } + + case EXT4_IOC_MOVE_EXT: { + struct move_extent me; + struct file *donor_filp; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) + return -EFAULT; + me.moved_len = 0; + + donor_filp = fget(me.donor_fd); + if (!donor_filp) + return -EBADF; + + if (!(donor_filp->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; + } + + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto mext_out; + + err = ext4_move_extents(filp, donor_filp, me.orig_start, + me.donor_start, me.len, &me.moved_len); + mnt_drop_write(filp->f_path.mnt); + if (me.moved_len > 0) + file_remove_suid(donor_filp); + + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) + err = -EFAULT; +mext_out: + fput(donor_filp); + return err; + } + + case EXT4_IOC_GROUP_ADD: { + struct ext4_new_group_data input; + struct super_block *sb = inode->i_sb; + int err, err2=0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, + sizeof(input))) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + err = ext4_group_add(sb, &input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); + + return err; + } + + case EXT4_IOC_MIGRATE: + { + int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + /* + * inode_mutex prevent write and truncate on the file. + * Read still goes through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format to prevent read. 
+ */ + mutex_lock(&(inode->i_mutex)); + err = ext4_ext_migrate(inode); + mutex_unlock(&(inode->i_mutex)); + mnt_drop_write(filp->f_path.mnt); + return err; + } + + case EXT4_IOC_ALLOC_DA_BLKS: + { + int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + err = ext4_alloc_da_blocks(inode); + mnt_drop_write(filp->f_path.mnt); + return err; + } + + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT4_IOC32_GETFLAGS: + cmd = EXT4_IOC_GETFLAGS; + break; + case EXT4_IOC32_SETFLAGS: + cmd = EXT4_IOC_SETFLAGS; + break; + case EXT4_IOC32_GETVERSION: + cmd = EXT4_IOC_GETVERSION; + break; + case EXT4_IOC32_SETVERSION: + cmd = EXT4_IOC_SETVERSION; + break; + case EXT4_IOC32_GROUP_EXTEND: + cmd = EXT4_IOC_GROUP_EXTEND; + break; + case EXT4_IOC32_GETVERSION_OLD: + cmd = EXT4_IOC_GETVERSION_OLD; + break; + case EXT4_IOC32_SETVERSION_OLD: + cmd = EXT4_IOC_SETVERSION_OLD; + break; +#ifdef CONFIG_JBD2_DEBUG + case EXT4_IOC32_WAIT_FOR_READONLY: + cmd = EXT4_IOC_WAIT_FOR_READONLY; + break; +#endif + case EXT4_IOC32_GETRSVSZ: + cmd = EXT4_IOC_GETRSVSZ; + break; + case EXT4_IOC32_SETRSVSZ: + cmd = EXT4_IOC_SETRSVSZ; + break; + case EXT4_IOC32_GROUP_ADD: { + struct compat_ext4_new_group_input __user *uinput; + struct ext4_new_group_input input; + mm_segment_t old_fs; + int err; + + uinput = compat_ptr(arg); + err = get_user(input.group, &uinput->group); + err |= get_user(input.block_bitmap, &uinput->block_bitmap); + err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); + err |= get_user(input.inode_table, &uinput->inode_table); + err |= get_user(input.blocks_count, &uinput->blocks_count); + err |= get_user(input.reserved_blocks, + &uinput->reserved_blocks); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, + (unsigned long) &input); + set_fs(old_fs); + return err; + } + case EXT4_IOC_MOVE_EXT: + case FITRIM: + break; + default: + return -ENOIOCTLCMD; + } + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c new file mode 100644 index 00000000..b6adf68a --- /dev/null +++ b/fs/ext4/mballoc.c @@ -0,0 +1,4958 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
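The FITRIM case earlier in ioctl.c copies a struct fstrim_range in from userspace, raises range.minlen to the device's discard granularity, runs ext4_trim_fs(), and copies the range back. A minimal userspace sketch, assuming FITRIM and struct fstrim_range come from <linux/fs.h> and that fd may refer to any file or directory on the mounted ext4 filesystem:

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* ask to trim the whole filesystem */
	range.minlen = 0;		/* the kernel clamps this to the discard granularity */

	if (ioctl(fd, FITRIM, &range) < 0) {	/* needs CAP_SYS_ADMIN, as checked above */
		perror("FITRIM");
		return 1;
	}
	/* range is copied back by the handler; len is assumed to hold the trimmed byte count */
	printf("trimmed %llu bytes\n", (unsigned long long) range.len);
	return 0;
}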
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + + +/* + * mballoc.c contains the multiblocks allocation routines + */ + +#include "mballoc.h" +#include +#include +#include + +/* + * MUSTDO: + * - test ext4_ext_search_left() and ext4_ext_search_right() + * - search for metadata in few groups + * + * TODO v4: + * - normalization should take into account whether file is still open + * - discard preallocations if no free space left (policy?) + * - don't normalize tails + * - quota + * - reservation for superuser + * + * TODO v3: + * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection + * - mb_mark_used() may allocate chunk right after splitting buddy + * - tree of groups sorted by number of free blocks + * - error handling + */ + +/* + * The allocation request involve request for multiple number of blocks + * near to the goal(block) value specified. + * + * During initialization phase of the allocator we decide to use the + * group preallocation or inode preallocation depending on the size of + * the file. The size of the file could be the resulting file size we + * would have after allocation, or the current file size, which ever + * is larger. If the size is less than sbi->s_mb_stream_request we + * select to use the group preallocation. The default value of + * s_mb_stream_request is 16 blocks. This can also be tuned via + * /sys/fs/ext4//mb_stream_req. The value is represented in + * terms of number of blocks. + * + * The main motivation for having small file use group preallocation is to + * ensure that we have small files closer together on the disk. + * + * First stage the allocator looks at the inode prealloc list, + * ext4_inode_info->i_prealloc_list, which contains list of prealloc + * spaces for this particular inode. The inode prealloc space is + * represented as: + * + * pa_lstart -> the logical start block for this prealloc space + * pa_pstart -> the physical start block for this prealloc space + * pa_len -> length for this prealloc space + * pa_free -> free space available in this prealloc space + * + * The inode preallocation space is used looking at the _logical_ start + * block. If only the logical file block falls within the range of prealloc + * space we will consume the particular prealloc space. This make sure that + * that the we have contiguous physical blocks representing the file blocks + * + * The important thing to be noted in case of inode prealloc space is that + * we don't modify the values associated to inode prealloc space except + * pa_free. + * + * If we are not able to find blocks in the inode prealloc space and if we + * have the group allocation flag set then we look at the locality group + * prealloc space. These are per CPU prealloc list repreasented as + * + * ext4_sb_info.s_locality_groups[smp_processor_id()] + * + * The reason for having a per cpu locality group is to reduce the contention + * between CPUs. It is possible to get scheduled at this point. + * + * The locality group prealloc space is used looking at whether we have + * enough free space (pa_free) within the prealloc space. + * + * If we can't allocate blocks via inode prealloc or/and locality group + * prealloc then we look at the buddy cache. 
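The inode preallocation described above can be pictured with a small self-contained model (a hypothetical toy_pa, not the kernel's struct ext4_prealloc_space): a PA is selected purely by the logical block range it covers, and consuming from it only decrements pa_free.

struct toy_pa {
	unsigned long long pa_lstart;	/* logical start block covered by the PA */
	unsigned long long pa_pstart;	/* physical start block of the reserved space */
	unsigned int	   pa_len;	/* length of the preallocated space, in blocks */
	unsigned int	   pa_free;	/* blocks of it still unused */
};

/* map one logical block through the PA; returns 0 if the PA cannot serve it */
static unsigned long long toy_use_inode_pa(struct toy_pa *pa,
					   unsigned long long lblock)
{
	if (lblock < pa->pa_lstart || lblock >= pa->pa_lstart + pa->pa_len)
		return 0;		/* logical block outside the preallocated range */
	if (pa->pa_free == 0)
		return 0;		/* preallocation exhausted */
	pa->pa_free--;			/* pa_free is the only field that ever changes */
	return pa->pa_pstart + (lblock - pa->pa_lstart);
}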
The buddy cache is represented + * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets + * mapped to the buddy and bitmap information regarding different + * groups. The buddy information is attached to buddy cache inode so that + * we can access them through the page cache. The information regarding + * each group is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are stored in the + * inode as: + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. So for each group we + * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / + * blocksize) blocks. So it can have information regarding groups_per_page + * which is blocks_per_page/2 + * + * The buddy cache inode is not stored on disk. The inode is thrown + * away when the filesystem is unmounted. + * + * We look for count number of blocks in the buddy cache. If we were able + * to locate that many free blocks we return with additional information + * regarding rest of the contiguous physical block available + * + * Before allocating blocks via buddy cache we normalize the request + * blocks. This ensure we ask for more blocks that we needed. The extra + * blocks that we get after allocation is added to the respective prealloc + * list. In case of inode preallocation we follow a list of heuristics + * based on file size. This can be found in ext4_mb_normalize_request. If + * we are doing a group prealloc we try to normalize the request to + * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is + * 512 blocks. This can be tuned via + * /sys/fs/ext4/ option the group prealloc request is normalized to the + * stripe value (sbi->s_stripe) + * + * The regular allocator(using the buddy cache) supports few tunables. + * + * /sys/fs/ext4//mb_min_to_scan + * /sys/fs/ext4//mb_max_to_scan + * /sys/fs/ext4//mb_order2_req + * + * The regular allocator uses buddy scan only if the request len is power of + * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The + * value of s_mb_order2_reqs can be tuned via + * /sys/fs/ext4//mb_order2_req. If the request len is equal to + * stripe size (sbi->s_stripe), we try to search for contiguous block in + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. + * min_to_scan indicate how long the mballoc __must__ look for a best + * extent and max_to_scan indicates how long the mballoc __can__ look for a + * best extent in the found extents. Searching for the blocks starts with + * the group specified as the goal value in allocation context via + * ac_g_ex. Each group is first checked based on the criteria whether it + * can used for allocation. ext4_mb_good_group explains how the groups are + * checked. + * + * Both the prealloc space are getting populated as above. So for the first + * request we will hit the buddy cache which will result in this prealloc + * space getting filled. The prealloc space is then later used for the + * subsequent request. + */ + +/* + * mballoc operates on the following data: + * - on-disk bitmap + * - in-core buddy (actually includes buddy and bitmap) + * - preallocation descriptors (PAs) + * + * there are two types of preallocations: + * - inode + * assiged to specific inode and can be used for this inode only. 
+ * it describes part of inode's space preallocated to specific + * physical blocks. any block from that preallocated can be used + * independent. the descriptor just tracks number of blocks left + * unused. so, before taking some block from descriptor, one must + * make sure corresponded logical block isn't allocated yet. this + * also means that freeing any block within descriptor's range + * must discard all preallocated blocks. + * - locality group + * assigned to specific locality group which does not translate to + * permanent set of inodes: inode can join and leave group. space + * from this type of preallocation can be used for any inode. thus + * it's consumed from the beginning to the end. + * + * relation between them can be expressed as: + * in-core buddy = on-disk bitmap + preallocation descriptors + * + * this mean blocks mballoc considers used are: + * - allocated blocks (persistent) + * - preallocated blocks (non-persistent) + * + * consistency in mballoc world means that at any time a block is either + * free or used in ALL structures. notice: "any time" should not be read + * literally -- time is discrete and delimited by locks. + * + * to keep it simple, we don't use block numbers, instead we count number of + * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. + * + * all operations can be expressed as: + * - init buddy: buddy = on-disk + PAs + * - new PA: buddy += N; PA = N + * - use inode PA: on-disk += N; PA -= N + * - discard inode PA buddy -= on-disk - PA; PA = 0 + * - use locality group PA on-disk += N; PA -= N + * - discard locality group PA buddy -= PA; PA = 0 + * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap + * is used in real operation because we can't know actual used + * bits from PA, only from on-disk bitmap + * + * if we follow this strict logic, then all operations above should be atomic. + * given some of them can block, we'd have to use something like semaphores + * killing performance on high-end SMP hardware. let's try to relax it using + * the following knowledge: + * 1) if buddy is referenced, it's already initialized + * 2) while block is used in buddy and the buddy is referenced, + * nobody can re-allocate that block + * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has + * bit set and PA claims same block, it's OK. IOW, one can set bit in + * on-disk bitmap if buddy has same bit set or/and PA covers corresponded + * block + * + * so, now we're building a concurrency table: + * - init buddy vs. + * - new PA + * blocks for PA are allocated in the buddy, buddy must be referenced + * until PA is linked to allocation group to avoid concurrent buddy init + * - use inode PA + * we need to make sure that either on-disk bitmap or PA has uptodate data + * given (3) we care that PA-=N operation doesn't interfere with init + * - discard inode PA + * the simplest way would be to have buddy initialized by the discard + * - use locality group PA + * again PA-=N must be serialized with init + * - discard locality group PA + * the simplest way would be to have buddy initialized by the discard + * - new PA vs. 
+ * - use inode PA + * i_data_sem serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * some mutex should serialize them + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * - use inode PA + * - use inode PA + * i_data_sem or another mutex should serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * nothing wrong here -- they're different PAs covering different blocks + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * + * now we're ready to make few consequences: + * - PA is referenced and while it is no discard is possible + * - PA is referenced until block isn't marked in on-disk bitmap + * - PA changes only after on-disk bitmap + * - discard must not compete with init. either init is done before + * any discard or they're serialized somehow + * - buddy init as sum of on-disk bitmap and PAs is done atomically + * + * a special case when we've used PA to emptiness. no need to modify buddy + * in this case, but we should care about concurrent init + * + */ + + /* + * Logic in few words: + * + * - allocation: + * load group + * find blocks + * mark bits in on-disk bitmap + * release group + * + * - use preallocation: + * find proper PA (per-inode or group) + * load group + * mark bits in on-disk bitmap + * release group + * release PA + * + * - free: + * load group + * mark bits in on-disk bitmap + * release group + * + * - discard preallocations in group: + * mark PAs deleted + * move them onto local list + * load on-disk bitmap + * load group + * remove PA from object (inode or locality group) + * mark free blocks in-core + * + * - discard inode's preallocations: + */ + +/* + * Locking rules + * + * Locks: + * - bitlock on a group (group) + * - object (inode/locality) (object) + * - per-pa lock (pa) + * + * Paths: + * - new pa + * object + * group + * + * - find and use pa: + * pa + * + * - release consumed pa: + * pa + * group + * object + * + * - generate in-core bitmap: + * group + * pa + * + * - discard all for given object (inode, locality group): + * object + * pa + * group + * + * - discard all for given group: + * group + * pa + * group + * object + * + */ +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. 
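The block accounting spelled out above (in-core buddy = on-disk bitmap + preallocation descriptors) reduces to one invariant. A toy sketch with hypothetical names, counting blocks rather than tracking bitmaps and using the simpler locality-group form of discard:

struct toy_counts {
	unsigned int ondisk;	/* blocks marked used in the on-disk bitmap */
	unsigned int pa;	/* blocks still held by preallocation descriptors */
	unsigned int buddy;	/* blocks marked used in the in-core buddy */
};

static void toy_init_buddy(struct toy_counts *c)		{ c->buddy = c->ondisk + c->pa; }
static void toy_new_pa(struct toy_counts *c, unsigned int n)	{ c->buddy += n; c->pa += n; }
static void toy_use_pa(struct toy_counts *c, unsigned int n)	{ c->ondisk += n; c->pa -= n; }
static void toy_discard_pa(struct toy_counts *c)		{ c->buddy -= c->pa; c->pa = 0; }

/* holds after every operation above: the buddy covers on-disk blocks plus live PAs */
static int toy_consistent(const struct toy_counts *c)
{
	return c->buddy == c->ondisk + c->pa;
}

Each operation preserves toy_consistent(), which is why the comment can relax strict atomicity to the three observations that follow it.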
There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES 8 +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + +static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" +}; + +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); + +static inline void *mb_correct_addr_and_bit(int *bit, void *addr) +{ +#if BITS_PER_LONG == 64 + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" +#endif + return addr; +} + +static inline int mb_test_bit(int bit, void *addr) +{ + /* + * ext4_test_bit on architecture like powerpc + * needs unsigned long aligned address + */ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_set_bit(bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_clear_bit(bit, addr); +} + +static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static inline int mb_find_next_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) +{ + char *bb; + + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(max == NULL); + + if (order > e4b->bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); + return EXT4_MB_BITMAP(e4b); + } + + bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; + + return bb; +} + +#ifdef DOUBLE_CHECK +static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int i; + struct super_block *sb = e4b->bd_sb; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += first + i; + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? 
inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); + } + mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) +{ + int i; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); + mb_set_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { + unsigned char *b1, *b2; + int i; + b1 = (unsigned char *) e4b->bd_info->bb_bitmap; + b2 = (unsigned char *) bitmap; + for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { + if (b1[i] != b2[i]) { + printk(KERN_ERR "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc\n", + e4b->bd_group, i, i * 8, b1[i], b2[i]); + BUG(); + } + } + } +} + +#else +static inline void mb_free_blocks_double(struct inode *inode, + struct ext4_buddy *e4b, int first, int count) +{ + return; +} +static inline void mb_mark_used_double(struct ext4_buddy *e4b, + int first, int count) +{ + return; +} +static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + return; +} +#endif + +#ifdef AGGRESSIVE_CHECK + +#define MB_CHECK_ASSERT(assert) \ +do { \ + if (!(assert)) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: \"%s\"\n", \ + function, file, line, # assert); \ + BUG(); \ + } \ +} while (0) + +static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, + const char *function, int line) +{ + struct super_block *sb = e4b->bd_sb; + int order = e4b->bd_blkbits + 1; + int max; + int max2; + int i; + int j; + int k; + int count; + struct ext4_group_info *grp; + int fragments = 0; + int fstart; + struct list_head *cur; + void *buddy; + void *buddy2; + + { + static int mb_check_counter; + if (mb_check_counter++ % 100 != 0) + return 0; + } + + while (order > 1) { + buddy = mb_find_buddy(e4b, order, &max); + MB_CHECK_ASSERT(buddy); + buddy2 = mb_find_buddy(e4b, order - 1, &max2); + MB_CHECK_ASSERT(buddy2); + MB_CHECK_ASSERT(buddy != buddy2); + MB_CHECK_ASSERT(max * 2 == max2); + + count = 0; + for (i = 0; i < max; i++) { + + if (mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 1 */ + if (!mb_test_bit(i << 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit((i<<1)+1, buddy2)); + } else if (!mb_test_bit((i << 1) + 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit(i << 1, buddy2)); + } + continue; + } + + /* both bits in buddy2 must be 0 */ + MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); + MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; + MB_CHECK_ASSERT( + !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + } + count++; + } + MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); + order--; + } + + fstart = -1; + buddy = mb_find_buddy(e4b, 0, &max); + for (i = 0; i < max; i++) { + if (!mb_test_bit(i, buddy)) { + MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); + if (fstart == -1) { + fragments++; + fstart = i; + } + continue; + } + fstart = -1; + /* check used bits only */ + for (j = 0; j < e4b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e4b, j, &max2); + k = i >> j; + MB_CHECK_ASSERT(k < max2); + MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); + } + } + MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); + 
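/* the free-extent count accumulated by the order-0 scan above must match the cached bb_fragments */ +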
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); + + grp = ext4_get_group_info(sb, e4b->bd_group); + list_for_each(cur, &grp->bb_prealloc_list) { + ext4_group_t groupnr; + struct ext4_prealloc_space *pa; + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); + MB_CHECK_ASSERT(groupnr == e4b->bd_group); + for (i = 0; i < pa->pa_len; i++) + MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); + } + return 0; +} +#undef MB_CHECK_ASSERT +#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ + __FILE__, __func__, __LINE__) +#else +#define mb_check_buddy(e4b) +#endif + +/* + * Divide blocks started from @first with length @len into + * smaller chunks with power of 2 blocks. + * Clear the bits in bitmap which the blocks of the chunk(s) covered, + * then increase bb_counters[] for corresponded chunk size. + */ +static void ext4_mb_mark_free_simple(struct super_block *sb, + void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, + struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; + unsigned short border; + + BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + + border = 2 << sb->s_blocksize_bits; + + while (len > 0) { + /* find how many blocks can be covered since this position */ + max = ffs(first | border) - 1; + + /* find how many blocks of power 2 we need to mark */ + min = fls(len) - 1; + + if (max < min) + min = max; + chunk = 1 << min; + + /* mark multiblock chunks only */ + grp->bb_counters[min]++; + if (min > 0) + mb_clear_bit(first >> min, + buddy + sbi->s_mb_offsets[min]); + + len -= chunk; + first += chunk; + } +} + +/* + * Cache the order of the largest free extent we have available in this block + * group. 
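+ * The cached value is consulted later by ext4_mb_good_group(): at criteria 0 a
+ * group whose bb_largest_free_order is below the requested order is rejected
+ * without loading its buddy.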
+ */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + int i; + int bits; + + grp->bb_largest_free_order = -1; /* uninit */ + + bits = sb->s_blocksize_bits + 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } +} + +static noinline_for_stack +void ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; + ext4_grpblk_t len; + unsigned free = 0; + unsigned fragments = 0; + unsigned long long period = get_cycles(); + + /* initialize buddy from bitmap which is aggregation + * of on-disk bitmap and preallocations */ + i = mb_find_next_zero_bit(bitmap, max, 0); + grp->bb_first_free = i; + while (i < max) { + fragments++; + first = i; + i = mb_find_next_bit(bitmap, max, i); + len = i - first; + free += len; + if (len > 1) + ext4_mb_mark_free_simple(sb, buddy, first, len, grp); + else + grp->bb_counters[0]++; + if (i < max) + i = mb_find_next_zero_bit(bitmap, max, i); + } + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { + ext4_grp_locked_error(sb, group, 0, 0, + "%u blocks in bitmap, %u in gd", + free, grp->bb_free); + /* + * If we intent to continue, we consider group descritor + * corrupt and update bb_free using bitmap value + */ + grp->bb_free = free; + } + mb_set_largest_free_order(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; + spin_lock(&EXT4_SB(sb)->s_bal_lock); + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); +} + +/* The buddy information is attached the buddy cache inode + * for convenience. The information regarding each group + * is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are + * stored in the inode as + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. + * So for each group we take up 2 blocks. A page can + * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. + * So it can have information regarding groups_per_page which + * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! 
+ */ + +static int ext4_mb_init_cache(struct page *page, char *incore) +{ + ext4_group_t ngroups; + int blocksize; + int blocks_per_page; + int groups_per_page; + int err = 0; + int i; + ext4_group_t first_group; + int first_block; + struct super_block *sb; + struct buffer_head *bhs; + struct buffer_head **bh; + struct inode *inode; + char *data; + char *bitmap; + struct ext4_group_info *grinfo; + + mb_debug(1, "init page %lu\n", page->index); + + inode = page->mapping->host; + sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + + /* allocate buffer_heads to read bitmaps */ + if (groups_per_page > 1) { + err = -ENOMEM; + i = sizeof(struct buffer_head *) * groups_per_page; + bh = kzalloc(i, GFP_NOFS); + if (bh == NULL) + goto out; + } else + bh = &bhs; + + first_group = page->index * blocks_per_page / 2; + + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext4_group_desc *desc; + + if (first_group + i >= ngroups) + break; + + grinfo = ext4_get_group_info(sb, first_group + i); + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate buddies on the page, + * which may be currently in use by an allocating task. + */ + if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; + continue; + } + + err = -EIO; + desc = ext4_get_group_desc(sb, first_group + i, NULL); + if (desc == NULL) + goto out; + + err = -ENOMEM; + bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); + if (bh[i] == NULL) + goto out; + + if (bitmap_uptodate(bh[i])) + continue; + + lock_buffer(bh[i]); + if (bitmap_uptodate(bh[i])) { + unlock_buffer(bh[i]); + continue; + } + ext4_lock_group(sb, first_group + i); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + ext4_init_block_bitmap(sb, bh[i], + first_group + i, desc); + set_bitmap_uptodate(bh[i]); + set_buffer_uptodate(bh[i]); + ext4_unlock_group(sb, first_group + i); + unlock_buffer(bh[i]); + continue; + } + ext4_unlock_group(sb, first_group + i); + if (buffer_uptodate(bh[i])) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh[i]); + unlock_buffer(bh[i]); + continue; + } + get_bh(bh[i]); + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. 
+ */ + set_bitmap_uptodate(bh[i]); + bh[i]->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh[i]); + mb_debug(1, "read bitmap for group %u\n", first_group + i); + } + + /* wait for I/O completion */ + for (i = 0; i < groups_per_page; i++) + if (bh[i]) + wait_on_buffer(bh[i]); + + err = -EIO; + for (i = 0; i < groups_per_page; i++) + if (bh[i] && !buffer_uptodate(bh[i])) + goto out; + + err = 0; + first_block = page->index * blocks_per_page; + for (i = 0; i < blocks_per_page; i++) { + int group; + + group = (first_block + i) >> 1; + if (group >= ngroups) + break; + + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + + /* + * data carry information regarding this + * particular group in the format specified + * above + * + */ + data = page_address(page) + (i * blocksize); + bitmap = bh[group - first_group]->b_data; + + /* + * We place the buddy block and bitmap block + * close together + */ + if ((first_block + i) & 1) { + /* this is block of buddy */ + BUG_ON(incore == NULL); + mb_debug(1, "put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); + grinfo = ext4_get_group_info(sb, group); + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(*grinfo->bb_counters) * + (sb->s_blocksize_bits+2)); + /* + * incore got set to the group block bitmap below + */ + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); + ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { + /* this is block of bitmap */ + BUG_ON(incore != NULL); + mb_debug(1, "put bitmap for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); + + /* see comments in ext4_mb_put_pa() */ + ext4_lock_group(sb, group); + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ + ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + + /* set incore so that the buddy information can be + * generated using this + */ + incore = data; + } + } + SetPageUptodate(page); + +out: + if (bh) { + for (i = 0; i < groups_per_page; i++) + brelse(bh[i]); + if (bh != &bhs) + kfree(bh); + } + return err; +} + +/* + * Lock the buddy and bitmap pages. This make sure other parallel init_group + * on the same buddy page doesn't happen whild holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + */ +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; + int blocks_per_page; + struct page *page; + + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. 
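+ * Illustration (assuming 4k pages): with a 4k block size blocks_per_page is 1,
+ * so group 7's bitmap block lands in page 14 and its buddy block in page 15;
+ * with a 1k block size blocks_per_page is 4 and both land in page 3, at
+ * offsets 2 and 3 within that page.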
+ */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; + } + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_buddy_page = page; + return 0; +} + +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) { + unlock_page(e4b->bd_bitmap_page); + page_cache_release(e4b->bd_bitmap_page); + } + if (e4b->bd_buddy_page) { + unlock_page(e4b->bd_buddy_page); + page_cache_release(e4b->bd_buddy_page); + } +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + struct ext4_group_info *this_grp; + struct ext4_buddy e4b; + struct page *page; + int ret = 0; + + mb_debug(1, "init group %u\n", group); + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have pinned buddy page to page cache. + */ + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + goto err; + } + + page = e4b.bd_bitmap_page; + ret = ext4_mb_init_cache(page, NULL); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + + if (e4b.bd_buddy_page == NULL) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + ret = 0; + goto err; + } + /* init buddy cache */ + page = e4b.bd_buddy_page; + ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_page_lock(&e4b); + return ret; +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! 
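+ * On success the caller must balance this with ext4_mb_unload_buddy() to drop
+ * the bitmap and buddy page references taken here.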
+ */ +static noinline_for_stack int +ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + int blocks_per_page; + int block; + int pnum; + int poff; + struct page *page; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + + mb_debug(1, "load group %u\n", group); + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); + + e4b->bd_blkbits = sb->s_blocksize_bits; + e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_sb = sb; + e4b->bd_group = group; + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group); + if (ret) + return ret; + } + + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + /* we could use find_or_create_page(), but it locks page + * what we'd like to avoid in fast path ... */ + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + mb_cmp_bitmaps(e4b, page_address(page) + + (poff * sb->s_blocksize)); + } + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + e4b->bd_buddy_page = page; + e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + return 0; + +err: + if (page) + page_cache_release(page); + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; + return ret; +} + +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); +} + + +static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) +{ + int order = 1; + void *bb; + + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(block >= (1 << 
(e4b->bd_blkbits + 3))); + + bb = EXT4_MB_BUDDY(e4b); + while (order <= e4b->bd_blkbits + 1) { + block = block >> 1; + if (!mb_test_bit(block, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } + bb += 1 << (e4b->bd_blkbits - order); + order++; + } + return 0; +} + +static void mb_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0; + cur += 32; + continue; + } + mb_clear_bit(cur, bm); + cur++; + } +} + +static void mb_set_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: set whole word at once */ + addr = bm + (cur >> 3); + *addr = 0xffffffff; + cur += 32; + continue; + } + mb_set_bit(cur, bm); + cur++; + } +} + +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int block = 0; + int max = 0; + int order; + void *buddy; + void *buddy2; + struct super_block *sb = e4b->bd_sb; + + BUG_ON(first + count > (sb->s_blocksize << 3)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + mb_check_buddy(e4b); + mb_free_blocks_double(inode, e4b, first, count); + + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + + /* let's maintain fragments counter */ + if (first != 0) + block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); + if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) + max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); + if (block && max) + e4b->bd_info->bb_fragments--; + else if (!block && !max) + e4b->bd_info->bb_fragments++; + + /* let's maintain buddy itself */ + while (count-- > 0) { + block = first++; + order = 0; + + if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += block; + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u)", block); + } + mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); + e4b->bd_info->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e4b, order, &max); + + do { + block &= ~1UL; + if (mb_test_bit(block, buddy) || + mb_test_bit(block + 1, buddy)) + break; + + /* both the buddies are free, try to coalesce them */ + buddy2 = mb_find_buddy(e4b, order + 1, &max); + + if (!buddy2) + break; + + if (order > 0) { + /* for special purposes, we don't set + * free bits in bitmap */ + mb_set_bit(block, buddy); + mb_set_bit(block + 1, buddy); + } + e4b->bd_info->bb_counters[order]--; + e4b->bd_info->bb_counters[order]--; + + block = block >> 1; + order++; + e4b->bd_info->bb_counters[order]++; + + mb_clear_bit(block, buddy2); + buddy = buddy2; + } while (1); + } + mb_set_largest_free_order(sb, e4b->bd_info); + mb_check_buddy(e4b); +} + +static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, + int needed, struct ext4_free_extent *ex) +{ + int next = block; + int max; + int ord; + void *buddy; + + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + BUG_ON(ex == NULL); + + buddy = mb_find_buddy(e4b, order, &max); + BUG_ON(buddy == NULL); + BUG_ON(block >= max); + if (mb_test_bit(block, buddy)) { + ex->fe_len = 0; + ex->fe_start = 0; + ex->fe_group = 0; + return 0; + } + + /* FIXME dorp order completely ? 
*/ + if (likely(order == 0)) { + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; + } + + ex->fe_len = 1 << order; + ex->fe_start = block << order; + ex->fe_group = e4b->bd_group; + + /* calc difference from given start */ + next = next - ex->fe_start; + ex->fe_len -= next; + ex->fe_start += next; + + while (needed > ex->fe_len && + (buddy = mb_find_buddy(e4b, order, &max))) { + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); + if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + break; + + ord = mb_find_order_for_block(e4b, next); + + order = ord; + block = next >> order; + ex->fe_len += 1 << order; + } + + BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); + return ex->fe_len; +} + +static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) +{ + int ord; + int mlen = 0; + int max = 0; + int cur; + int start = ex->fe_start; + int len = ex->fe_len; + unsigned ret = 0; + int len0 = len; + void *buddy; + + BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); + BUG_ON(e4b->bd_group != ex->fe_group); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + mb_check_buddy(e4b); + mb_mark_used_double(e4b, start, len); + + e4b->bd_info->bb_free -= len; + if (e4b->bd_info->bb_first_free == start) + e4b->bd_info->bb_first_free += len; + + /* let's maintain fragments counter */ + if (start != 0) + mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) + max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + if (mlen && max) + e4b->bd_info->bb_fragments++; + else if (!mlen && !max) + e4b->bd_info->bb_fragments--; + + /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e4b, start); + + if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! */ + mlen = 1 << ord; + buddy = mb_find_buddy(e4b, ord, &max); + BUG_ON((start >> ord) >= max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + BUG_ON(len < 0); + continue; + } + + /* store for history */ + if (ret == 0) + ret = len | (ord << 16); + + /* we have to split large buddy */ + BUG_ON(ord <= 0); + buddy = mb_find_buddy(e4b, ord, &max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e4b, ord, &max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); + e4b->bd_info->bb_counters[ord]++; + e4b->bd_info->bb_counters[ord]++; + } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + + mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + mb_check_buddy(e4b); + + return ret; +} + +/* + * Must be called under group lock! + */ +static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int ret; + + BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); + ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; + ret = mb_mark_used(e4b, &ac->ac_b_ex); + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + ac->ac_status = AC_STATUS_FOUND; + ac->ac_tail = ret & 0xffff; + ac->ac_buddy = ret >> 16; + + /* + * take the page reference. 
We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. The reference is dropped + * in ext4_mb_release_context + */ + ac->ac_bitmap_page = e4b->bd_bitmap_page; + get_page(ac->ac_bitmap_page); + ac->ac_buddy_page = e4b->bd_buddy_page; + get_page(ac->ac_buddy_page); + /* store last allocated for subsequent stream allocation */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + spin_lock(&sbi->s_md_lock); + sbi->s_mb_last_group = ac->ac_f_ex.fe_group; + sbi->s_mb_last_start = ac->ac_f_ex.fe_start; + spin_unlock(&sbi->s_md_lock); + } +} + +/* + * regular allocator, for general purposes allocation + */ + +static void ext4_mb_check_limits(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b, + int finish_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + struct ext4_free_extent ex; + int max; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > sbi->s_mb_max_to_scan && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + ac->ac_status = AC_STATUS_BREAK; + return; + } + + /* + * Haven't found good chunk so far, let's continue + */ + if (bex->fe_len < gex->fe_len) + return; + + if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) + && bex->fe_group == e4b->bd_group) { + /* recheck chunk's availability - we don't know + * when it was found (within this lock-unlock + * period or not) */ + max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + if (max >= gex->fe_len) { + ext4_mb_use_best_found(ac, e4b); + return; + } + } +} + +/* + * The routine checks whether found extent is good enough. If it is, + * then the extent gets marked used and flag is set to the context + * to stop scanning. Otherwise, the extent is compared with the + * previous found extent and if new one is better, then it's stored + * in the context. Later, the best found extent will be used, if + * mballoc can't find good enough extent. + * + * FIXME: real allocation policy is to be designed yet! 
+ */ +static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, + struct ext4_free_extent *ex, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + + BUG_ON(ex->fe_len <= 0); + BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); + + ac->ac_found++; + + /* + * The special case - take what you catch first + */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * Let's check whether the chuck is good enough + */ + if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * If this is first found extent, just store it in the context + */ + if (bex->fe_len == 0) { + *bex = *ex; + return; + } + + /* + * If new found extent is better, store it in the context + */ + if (bex->fe_len < gex->fe_len) { + /* if the request isn't satisfied, any found extent + * larger than previous best one is better */ + if (ex->fe_len > bex->fe_len) + *bex = *ex; + } else if (ex->fe_len > gex->fe_len) { + /* if the request is satisfied, then we try to find + * an extent that still satisfy the request, but is + * smaller than previous one */ + if (ex->fe_len < bex->fe_len) + *bex = *ex; + } + + ext4_mb_check_limits(ac, e4b, 0); +} + +static noinline_for_stack +int ext4_mb_try_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex = ac->ac_b_ex; + ext4_group_t group = ex.fe_group; + int max; + int err; + + BUG_ON(ex.fe_len <= 0); + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + + if (max > 0) { + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_unload_buddy(e4b); + + return 0; +} + +static noinline_for_stack +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + ext4_group_t group = ac->ac_g_ex.fe_group; + int max; + int err; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_free_extent ex; + + if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) + return 0; + + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + + if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + ext4_fsblk_t start; + + start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; + /* use do_div to get remainder (would be 64-bit modulo) */ + if (do_div(start, sbi->s_stripe) == 0) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + } else if (max >= ac->ac_g_ex.fe_len) { + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { + /* Sometimes, caller may want to merge even small + * number of blocks to an existing extent */ + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + ext4_unlock_group(ac->ac_sb, group); + 
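/* group lock dropped; release the page references taken by ext4_mb_load_buddy() */ +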
ext4_mb_unload_buddy(e4b); + + return 0; +} + +/* + * The routine scans buddy structures (not bitmap!) from given order + * to max order and tries to find big enough chunk to satisfy the req + */ +static noinline_for_stack +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_group_info *grp = e4b->bd_info; + void *buddy; + int i; + int k; + int max; + + BUG_ON(ac->ac_2order <= 0); + for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + + buddy = mb_find_buddy(e4b, i, &max); + BUG_ON(buddy == NULL); + + k = mb_find_next_zero_bit(buddy, max, 0); + BUG_ON(k >= max); + + ac->ac_found++; + + ac->ac_b_ex.fe_len = 1 << i; + ac->ac_b_ex.fe_start = k << i; + ac->ac_b_ex.fe_group = e4b->bd_group; + + ext4_mb_use_best_found(ac, e4b); + + BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); + + if (EXT4_SB(sb)->s_mb_stats) + atomic_inc(&EXT4_SB(sb)->s_bal_2orders); + + break; + } +} + +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of + * free blocks in the group, so the routine can know upper limit. + */ +static noinline_for_stack +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = EXT4_MB_BITMAP(e4b); + struct ext4_free_extent ex; + int i; + int free; + + free = e4b->bd_info->bb_free; + BUG_ON(free <= 0); + + i = e4b->bd_info->bb_first_free; + + while (free && ac->ac_status == AC_STATUS_CONTINUE) { + i = mb_find_next_zero_bit(bitmap, + EXT4_BLOCKS_PER_GROUP(sb), i); + if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + /* + * IF we have corrupt bitmap, we won't find any + * free blocks even though group info says we + * we have free blocks + */ + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " + "group info. But bitmap says 0", + free); + break; + } + + mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + BUG_ON(ex.fe_len <= 0); + if (free < ex.fe_len) { + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " + "group info. But got %d blocks", + free, ex.fe_len); + /* + * The number of free blocks differs. This mostly + * indicate that the bitmap is corrupt. So exit + * without claiming the space. 
+ */ + break; + } + + ext4_mb_measure_extent(ac, &ex, e4b); + + i += ex.fe_len; + free -= ex.fe_len; + } + + ext4_mb_check_limits(ac, e4b, 1); +} + +/* + * This is a special case for storages like raid5 + * we try to find stripe-aligned chunks for stripe-size-multiple requests + */ +static noinline_for_stack +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + void *bitmap = EXT4_MB_BITMAP(e4b); + struct ext4_free_extent ex; + ext4_fsblk_t first_group_block; + ext4_fsblk_t a; + ext4_grpblk_t i; + int max; + + BUG_ON(sbi->s_stripe == 0); + + /* find first stripe-aligned block in group */ + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + + a = first_group_block + sbi->s_stripe - 1; + do_div(a, sbi->s_stripe); + i = (a * sbi->s_stripe) - first_group_block; + + while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + if (!mb_test_bit(i, bitmap)) { + max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + if (max >= sbi->s_stripe) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + break; + } + } + i += sbi->s_stripe; + } +} + +/* This is now called BEFORE we load the buddy bitmap. */ +static int ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) +{ + unsigned free, fragments; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + + BUG_ON(cr < 0 || cr >= 4); + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + int ret = ext4_mb_init_group(ac->ac_sb, group); + if (ret) + return 0; + } + + free = grp->bb_free; + fragments = grp->bb_fragments; + if (free == 0) + return 0; + if (fragments == 0) + return 0; + + switch (cr) { + case 0: + BUG_ON(ac->ac_2order == 0); + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) + return 0; + + return 1; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; + break; + case 3: + return 1; + default: + BUG(); + } + + return 0; +} + +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +{ + ext4_group_t ngroups, group, i; + int cr; + int err = 0; + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + /* first, try the goal */ + err = ext4_mb_find_by_goal(ac, &e4b); + if (err || ac->ac_status == AC_STATUS_FOUND) + goto out; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + goto out; + + /* + * ac->ac2_order is set only if the fe_len is a power of 2 + * if ac2_order is set we also set criteria to 0 so that we + * try exact allocation using buddy. 
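+ * (fls() below returns the position of the highest set bit in fe_len; + * masking that bit off and checking for zero is the power-of-two test)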
+ */ + i = fls(ac->ac_g_ex.fe_len); + ac->ac_2order = 0; + /* + * We search using buddy data only if the order of the request + * is greater than equal to the sbi_s_mb_order2_reqs + * You can tune it via /sys/fs/ext4//mb_order2_req + */ + if (i >= sbi->s_mb_order2_reqs) { + /* + * This should tell if fe_len is exactly power of 2 + */ + if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + ac->ac_2order = i - 1; + } + + /* if stream allocation is enabled, use global goal */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + /* TBD: may be hot point */ + spin_lock(&sbi->s_md_lock); + ac->ac_g_ex.fe_group = sbi->s_mb_last_group; + ac->ac_g_ex.fe_start = sbi->s_mb_last_start; + spin_unlock(&sbi->s_md_lock); + } + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac->ac_2order ? 0 : 1; + /* + * cr == 0 try to get exact allocation, + * cr == 3 try to get anything + */ +repeat: + for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { + ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + + for (i = 0; i < ngroups; group++, i++) { + if (group == ngroups) + group = 0; + + /* This now checks without needing the buddy page */ + if (!ext4_mb_good_group(ac, group, cr)) + continue; + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) + goto out; + + ext4_lock_group(sb, group); + + /* + * We need to check again after locking the + * block group + */ + if (!ext4_mb_good_group(ac, group, cr)) { + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + continue; + } + + ac->ac_groups_scanned++; + if (cr == 0) + ext4_mb_simple_scan_group(ac, &e4b); + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) + ext4_mb_scan_aligned(ac, &e4b); + else + ext4_mb_complex_scan_group(ac, &e4b); + + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + if (ac->ac_status != AC_STATUS_CONTINUE) + break; + } + } + + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far + */ + + ext4_mb_try_best_found(ac, &e4b); + if (ac->ac_status != AC_STATUS_FOUND) { + /* + * Someone more lucky has already allocated it. 
+ * The only thing we can do is just take first + * found block(s) + printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); + */ + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_flags |= EXT4_MB_HINT_FIRST; + cr = 3; + atomic_inc(&sbi->s_mb_lost_chunks); + goto repeat; + } + } +out: + return err; +} + +static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = seq->private; + ext4_group_t group; + + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = seq->private; + ext4_group_t group; + + ++*pos; + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = seq->private; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err; + struct ext4_buddy e4b; + struct sg { + struct ext4_group_info info; + ext4_grpblk_t counters[16]; + } sg; + + group--; + if (group == 0) + seq_printf(seq, "#%-5s: %-5s %-5s %-5s " + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", + "group", "free", "frags", "first", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext4_group_info); + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } + ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, + sg.info.bb_fragments, sg.info.bb_first_free); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); + seq_printf(seq, " ]\n"); + + return 0; +} + +static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations ext4_mb_seq_groups_ops = { + .start = ext4_mb_seq_groups_start, + .next = ext4_mb_seq_groups_next, + .stop = ext4_mb_seq_groups_stop, + .show = ext4_mb_seq_groups_show, +}; + +static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) +{ + struct super_block *sb = PDE(inode)->data; + int rc; + + rc = seq_open(file, &ext4_mb_seq_groups_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = sb; + } + return rc; + +} + +static const struct file_operations ext4_mb_seq_groups_fops = { + .owner = THIS_MODULE, + .open = ext4_mb_seq_groups_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} + +/* Create and initialize ext4_group_info data for the given group. 
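Called for each group from ext4_mb_init_backend().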
*/ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " + "buddy group\n"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + goto exit_group_info; + } + memset(meta_group_info[i], 0, kmem_cache_size(cachep)); + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_blocks_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + ext4_free_blks_count(sb, desc); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +static int ext4_mb_init_backend(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int num_meta_group_infos; + int num_meta_group_infos_max; + int array_size; + struct ext4_group_desc *desc; + struct kmem_cache *cachep; + + /* This is the number of blocks used by GDT */ + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); + + /* + * This is the total number of blocks used by GDT including + * the number of reserved blocks for GDT. + * The s_group_info array is allocated with this value + * to allow a clean online resize without a complex + * manipulation of pointer. 
+ * The drawback is the unused memory when no resize + * occurs but it's very low in terms of pages + * (see comments below) + * Need to handle this properly when META_BG resizing is allowed + */ + num_meta_group_infos_max = num_meta_group_infos + + le16_to_cpu(es->s_reserved_gdt_blocks); + + /* + * array_size is the size of s_group_info array. We round it + * to the next power of two because this approximation is done + * internally by kmalloc so we can have some more memory + * for free here (e.g. may be used for META_BG resize). + */ + array_size = 1; + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; + /* An 8TB filesystem with 64-bit pointers requires a 4096 byte + * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. + * So a two level scheme suffices for now. */ + sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); + if (sbi->s_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + goto err_freesgi; + } + sbi->s_buddy_cache->i_ino = get_next_ino(); + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + for (i = 0; i < ngroups; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR + "EXT4-fs: can't read descriptor %u\n", i); + goto err_freebuddy; + } + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; + } + + return 0; + +err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); + while (i-- > 0) + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + i = num_meta_group_infos; + while (i-- > 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); +err_freesgi: + kfree(sbi->s_group_info); + return -ENOMEM; +} + +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + if (ext4_groupinfo_caches[i]) + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + ext4_groupinfo_caches[cache_index] = cachep; + + return 0; +} + +int ext4_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned i, j; + unsigned offset; + unsigned max; + int ret; + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); + + sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_offsets == NULL) { + ret = -ENOMEM; + goto out; + } + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); + sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_maxs == 
NULL) { + ret = -ENOMEM; + goto out; + } + + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; + + /* order 0 is regular bitmap */ + sbi->s_mb_maxs[0] = sb->s_blocksize << 3; + sbi->s_mb_offsets[0] = 0; + + i = 1; + offset = 0; + max = sb->s_blocksize << 2; + do { + sbi->s_mb_offsets[i] = offset; + sbi->s_mb_maxs[i] = max; + offset += 1 << (sb->s_blocksize_bits - i); + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); + + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) { + goto out; + } + + spin_lock_init(&sbi->s_md_lock); + spin_lock_init(&sbi->s_bal_lock); + + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; + sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { + ret = -ENOMEM; + goto out; + } + for_each_possible_cpu(i) { + struct ext4_locality_group *lg; + lg = per_cpu_ptr(sbi->s_locality_groups, i); + mutex_init(&lg->lg_mutex); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); + spin_lock_init(&lg->lg_prealloc_lock); + } + + if (sbi->s_proc) + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); + + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = release_blocks_on_commit; +out: + if (ret) { + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + } + return ret; +} + +/* need to called with the ext4 group lock held */ +static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) +{ + struct ext4_prealloc_space *pa; + struct list_head *cur, *tmp; + int count = 0; + + list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + list_del(&pa->pa_group_list); + count++; + kmem_cache_free(ext4_pspace_cachep, pa); + } + if (count) + mb_debug(1, "mballoc: %u PAs left\n", count); + +} + +int ext4_mb_release(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + int num_meta_group_infos; + struct ext4_group_info *grinfo; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); + + if (sbi->s_group_info) { + for (i = 0; i < ngroups; i++) { + grinfo = ext4_get_group_info(sb, i); +#ifdef DOUBLE_CHECK + kfree(grinfo->bb_bitmap); +#endif + ext4_lock_group(sb, i); + ext4_mb_cleanup_pa(grinfo); + ext4_unlock_group(sb, i); + kmem_cache_free(cachep, grinfo); + } + num_meta_group_infos = (ngroups + + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); + kfree(sbi->s_group_info); + } + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + if (sbi->s_buddy_cache) + iput(sbi->s_buddy_cache); + if (sbi->s_mb_stats) { + printk(KERN_INFO + "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + atomic_read(&sbi->s_bal_allocated), + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); + printk(KERN_INFO + "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost\n", + atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + 
atomic_read(&sbi->s_bal_breaks), + atomic_read(&sbi->s_mb_lost_chunks)); + printk(KERN_INFO + "EXT4-fs: mballoc: %lu generated and it took %Lu\n", + sbi->s_mb_buddies_generated++, + sbi->s_mb_generation_time); + printk(KERN_INFO + "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + atomic_read(&sbi->s_mb_preallocated), + atomic_read(&sbi->s_mb_discarded)); + } + + free_percpu(sbi->s_locality_groups); + + return 0; +} + +static inline int ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t block, int count) +{ + ext4_fsblk_t discard_block; + + discard_block = block + ext4_group_first_block_no(sb, block_group); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); +} + +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; + struct list_head *l, *ltmp; + + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); + + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->count, entry->group, entry); + + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->group, + entry->start_blk, entry->count); + + err = ext4_mb_load_buddy(sb, entry->group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); + + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->count; + count2++; + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->group); + kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_mb_unload_buddy(&e4b); + } + + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); +} + +#ifdef CONFIG_EXT4_DEBUG +u8 mb_enable_debug __read_mostly; + +static struct dentry *debugfs_dir; +static struct dentry *debugfs_debug; + +static void __init ext4_create_debugfs_entry(void) +{ + debugfs_dir = debugfs_create_dir("ext4", NULL); + if (debugfs_dir) + debugfs_debug = debugfs_create_u8("mballoc-debug", + S_IRUGO | S_IWUSR, + debugfs_dir, + &mb_enable_debug); +} + +static void ext4_remove_debugfs_entry(void) +{ + debugfs_remove(debugfs_debug); + debugfs_remove(debugfs_dir); +} + +#else + +static void __init ext4_create_debugfs_entry(void) +{ +} + +static void ext4_remove_debugfs_entry(void) +{ +} + +#endif + +int __init ext4_init_mballoc(void) +{ + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); + if (ext4_pspace_cachep == NULL) + return -ENOMEM; + + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); + if (ext4_ac_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + return -ENOMEM; + } + + ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_ext_cachep == NULL) { + 
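/* creating the third cache failed: tear down the two caches created above */ +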
kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } + ext4_create_debugfs_entry(); + return 0; +} + +void ext4_exit_mballoc(void) +{ + /* + * Wait for completion of call_rcu()'s on ext4_pspace_cachep + * before destroying the slab cache. + */ + rcu_barrier(); + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_ext_cachep); + ext4_groupinfo_destroy_slabs(); + ext4_remove_debugfs_entry(); +} + + +/* + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps + * Returns 0 if success or error code + */ +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + handle_t *handle, unsigned int reserv_blks) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block; + int err, len; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + + err = -EIO; + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); + if (!bitmap_bh) + goto out_err; + + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto out_err; + + err = -EIO; + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); + if (!gdp) + goto out_err; + + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, + ext4_free_blks_count(sb, gdp)); + + err = ext4_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; + + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + + len = ac->ac_b_ex.fe_len; + if (!ext4_data_block_valid(sbi, block, len)) { + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " + "fs metadata\n", block, block+len); + /* File system mounted not to panic on error + * Fix the bitmap and repeat the block allocation + * We leak some of the blocks here. + */ + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!err) + err = -EAGAIN; + goto out_err; + } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < ac->ac_b_ex.fe_len; i++) { + BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, + bitmap_bh->b_data)); + } + } +#endif + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_blks_set(sb, gdp, + ext4_free_blocks_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); + } + len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_blks_set(sb, gdp, len); + gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + /* + * Now reduce the dirty block count also. 
Should not go negative + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + atomic_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_blocks); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err) + goto out_err; + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); + +out_err: + ext4_mark_super_dirty(sb); + brelse(bitmap_bh); + return err; +} + +/* + * here we normalize request for locality group + * Group request are normalized to s_strip size if we set the same via mount + * option. If not we set it to s_mb_group_prealloc which can be configured via + * /sys/fs/ext4//mb_group_prealloc + * + * XXX: should we try to preallocate more than the group has now? + */ +static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + + BUG_ON(lg == NULL); + if (EXT4_SB(sb)->s_stripe) + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; + else + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + mb_debug(1, "#%u: goal %u blocks for locality group\n", + current->pid, ac->ac_g_ex.fe_len); +} + +/* + * Normalization means making request better in terms of + * size and alignment + */ +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + int bsbits, max; + ext4_lblk_t end; + loff_t size, orig_size, start_off; + ext4_lblk_t start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests + do not need preallocation */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + /* sometime caller may want exact blocks */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + /* caller may indicate that preallocation isn't + * required (it's a tail, for example) */ + if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) + return; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { + ext4_mb_normalize_group_request(ac); + return ; + } + + bsbits = ac->ac_sb->s_blocksize_bits; + + /* first, let's learn actual file size + * given current request is allocated */ + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); + orig_size = size; + + /* max size of free chunks */ + max = 2 << bsbits; + +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) + + /* first, try to predict filesize */ + /* XXX: should this table be tunable? 
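Each branch below rounds the predicted size up to the next bucket (16K through 1M), or, for larger requests, picks a 2M/4M/8M chunk with start_off aligned to the chunk size.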
*/ + start_off = 0; + if (size <= 16 * 1024) { + size = 16 * 1024; + } else if (size <= 32 * 1024) { + size = 32 * 1024; + } else if (size <= 64 * 1024) { + size = 64 * 1024; + } else if (size <= 128 * 1024) { + size = 128 * 1024; + } else if (size <= 256 * 1024) { + size = 256 * 1024; + } else if (size <= 512 * 1024) { + size = 512 * 1024; + } else if (size <= 1024 * 1024) { + size = 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (22 - bsbits)) << 22; + size = 4 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, + (8<<20)>>bsbits, max, 8 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (23 - bsbits)) << 23; + size = 8 * 1024 * 1024; + } else { + start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; + size = ac->ac_o_ex.fe_len << bsbits; + } + size = size >> bsbits; + start = start_off >> bsbits; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { + size -= ar->lleft + 1 - start; + start = ar->lleft + 1; + } + if (ar->pright && start + size - 1 >= ar->lright) + size -= start + size - ar->lright; + + end = start + size; + + /* check we don't cross already preallocated blocks */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + + if (pa->pa_deleted) + continue; + spin_lock(&pa->pa_lock); + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + pa_end = pa->pa_lstart + pa->pa_len; + + /* PA must not overlap original request */ + BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || + ac->ac_o_ex.fe_logical < pa->pa_lstart)); + + /* skip PAs this normalized request doesn't overlap with */ + if (pa->pa_lstart >= end || pa_end <= start) { + spin_unlock(&pa->pa_lock); + continue; + } + BUG_ON(pa->pa_lstart <= start && pa_end >= end); + + /* adjust start or end to be adjacent to this pa */ + if (pa_end <= ac->ac_o_ex.fe_logical) { + BUG_ON(pa_end < start); + start = pa_end; + } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { + BUG_ON(pa->pa_lstart > end); + end = pa->pa_lstart; + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + size = end - start; + + /* XXX: extra loop to check we really don't overlap preallocations */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0) { + pa_end = pa->pa_lstart + pa->pa_len; + BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + if (start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical) { + printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + + /* XXX: is it better to align blocks WRT to logical + * placement or satisfy big request as is */ + ac->ac_g_ex.fe_logical = start; + ac->ac_g_ex.fe_len = size; + + /* define goal start in order to merge */ + if (ar->pright && (ar->lright == (start + size))) { + /* merge to the right */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - 
size, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + if (ar->pleft && (ar->lleft + 1 == start)) { + /* merge to the left */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + + mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, + (unsigned) orig_size, (unsigned) start); +} + +static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) + atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) + atomic_inc(&sbi->s_bal_goals); + if (ac->ac_found > sbi->s_mb_max_to_scan) + atomic_inc(&sbi->s_bal_breaks); + } + + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) + trace_ext4_mballoc_alloc(ac); + else + trace_ext4_mballoc_prealloc(ac); +} + +/* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + int len; + + if (pa && pa->pa_type == MB_INODE_PA) { + len = ac->ac_b_ex.fe_len; + pa->pa_free += len; + } + +} + +/* + * use blocks preallocated to inode + */ +static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + ext4_fsblk_t start; + ext4_fsblk_t end; + int len; + + /* found preallocated blocks, use them */ + start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); + end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); + len = end - start; + ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + BUG_ON(start < pa->pa_pstart); + BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(pa->pa_free < len); + pa->pa_free -= len; + + mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); +} + +/* + * use blocks preallocated to locality group + */ +static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + unsigned int len = ac->ac_o_ex.fe_len; + + ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, + &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + /* we don't correct pa_pstart or pa_plen here to avoid + * possible race when the group is being loaded concurrently + * instead we correct pa later, after blocks are marked + * in on-disk bitmap -- see ext4_mb_release_context() + * Other CPUs are prevented from allocating from this pa by lg_mutex + */ + mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); +} + +/* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. 
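+ * A reference is held on whichever pa is returned; when a closer pa + * replaces the previous candidate, the old reference is dropped.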
+ */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance <= new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + +/* + * search goal blocks in preallocated space + */ +static noinline_for_stack int +ext4_mb_use_preallocated(struct ext4_allocation_context *ac) +{ + int order, i; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; + + /* only data can be preallocated */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return 0; + + /* first, try per-file preallocation */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + + /* all fields in this condition don't change, + * so we can skip locking for them */ + if (ac->ac_o_ex.fe_logical < pa->pa_lstart || + ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + continue; + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + continue; + + /* found preallocated blocks, use them */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && pa->pa_free) { + atomic_inc(&pa->pa_count); + ext4_mb_use_inode_pa(ac, pa); + spin_unlock(&pa->pa_lock); + ac->ac_criteria = 10; + rcu_read_unlock(); + return 1; + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + /* can we use group allocation? */ + if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) + return 0; + + /* inode may have no locality group for some reason */ + lg = ac->ac_lg; + if (lg == NULL) + return 0; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && + pa->pa_free >= ac->ac_o_ex.fe_len) { + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; + } + return 0; +} + +/* + * the function goes through all block freed in the group + * but not yet committed and marks them used in in-core bitmap. 
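+ * (such blocks sit on the group's bb_free_root rb-tree until the + * transaction that freed them commits)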
+ * buddy must be generated from this bitmap + * Need to be called with the ext4 group lock held + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, node); + mb_set_bits(bitmap, entry->start_blk, entry->count); + n = rb_next(n); + } + return; +} + +/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ +static noinline_for_stack +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int count = 0; + int len; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. + * we don't need any locking here + * notice we do NOT ignore preallocations with pa_deleted + * otherwise we could leave used blocks available for + * allocation in buddy when concurrent ext4_mb_put_pa() + * is dropping preallocation + */ + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); + if (unlikely(len == 0)) + continue; + BUG_ON(groupnr != group); + mb_set_bits(bitmap, start, len); + preallocated += len; + count++; + } + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); +} + +static void ext4_mb_pa_callback(struct rcu_head *head) +{ + struct ext4_prealloc_space *pa; + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + kmem_cache_free(ext4_pspace_cachep, pa); +} + +/* + * drops a reference to preallocated space descriptor + * if this was the last reference and the space is consumed + */ +static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + struct super_block *sb, struct ext4_prealloc_space *pa) +{ + ext4_group_t grp; + ext4_fsblk_t grp_blk; + + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) + return; + + /* in this short window concurrent discard can set pa_deleted */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 1) { + spin_unlock(&pa->pa_lock); + return; + } + + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + grp_blk = pa->pa_pstart; + /* + * If doing group-based preallocation, pa_pstart may be in the + * next group when pa is used up + */ + if (pa->pa_type == MB_GROUP_PA) + grp_blk--; + + ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); + + /* + * possible race: + * + * P1 (buddy init) P2 (regular allocation) + * find block B in PA + * copy on-disk bitmap to buddy + * mark B in on-disk bitmap + * drop PA from group + * mark all PAs in buddy + * + * thus, P1 initializes buddy with B available. 
to prevent this + * we make "copy" and "mark all PAs" atomic and serialize "drop PA" + * against that pair + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); +} + +/* + * creates new preallocated space for given inode + */ +static noinline_for_stack int +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + struct ext4_inode_info *ei; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { + int winl; + int wins; + int win; + int offs; + + /* we can't allocate as much as normalizer wants. + * so, found space must get proper lstart + * to cover original request */ + BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); + BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); + + /* we're limited by original request in that + * logical block must be covered any way + * winl is window we can move our chunk within */ + winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; + + /* also, we should cover whole original request */ + wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + + /* the smallest one defines real window */ + win = min(winl, wins); + + offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + if (offs && offs < win) + win = offs; + + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); + BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); + } + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_lstart = ac->ac_b_ex.fe_logical; + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_INODE_PA; + + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); + + ext4_mb_use_inode_pa(ac, pa); + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + ei = EXT4_I(ac->ac_inode); + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + + pa->pa_obj_lock = &ei->i_prealloc_lock; + pa->pa_inode = ac->ac_inode; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); + list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); + spin_unlock(pa->pa_obj_lock); + + return 0; +} + +/* + * creates new preallocated space for locality group inodes belongs to + */ +static noinline_for_stack int +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status 
!= AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + BUG_ON(ext4_pspace_cachep == NULL); + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_lstart = pa->pa_pstart; + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_GROUP_PA; + + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); + + ext4_mb_use_group_pa(ac, pa); + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + lg = ac->ac_lg; + BUG_ON(lg == NULL); + + pa->pa_obj_lock = &lg->lg_prealloc_lock; + pa->pa_inode = NULL; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ + return 0; +} + +static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) +{ + int err; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + err = ext4_mb_new_group_pa(ac); + else + err = ext4_mb_new_inode_pa(ac); + return err; +} + +/* + * finds all unused blocks in on-disk bitmap, frees them in + * in-core bitmap and buddy. + * @pa must be unlinked from inode and group lists, so that + * nobody else can find/use it. + * the caller MUST hold group/inode locks. + * TODO: optimize the case when there are no in-core structures yet + */ +static noinline_for_stack int +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int end; + unsigned int next; + ext4_group_t group; + ext4_grpblk_t bit; + unsigned long long grp_blk_start; + int err = 0; + int free = 0; + + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - bit; + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + end = bit + pa->pa_len; + + while (bit < end) { + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); + if (bit >= end) + break; + next = mb_find_next_bit(bitmap_bh->b_data, end, bit); + mb_debug(1, " free preallocated %u/%u in group %u\n", + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); + free += next - bit; + + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, + next - bit); + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } + if (free != pa->pa_free) { + printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", + free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. 
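+ * (the mismatch itself was reported by ext4_grp_locked_error() above)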
+ */ + } + atomic_add(free, &sbi->s_mb_discarded); + + return err; +} + +static noinline_for_stack int +ext4_mb_release_group_pa(struct ext4_buddy *e4b, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + ext4_group_t group; + ext4_grpblk_t bit; + + trace_ext4_mb_release_group_pa(pa); + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); + atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); + + return 0; +} + +/* + * releases all preallocations in given group + * + * first, we need to decide discard policy: + * - when do we discard + * 1) ENOSPC + * - how many do we discard + * 1) how many requested + */ +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, + ext4_group_t group, int needed) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + struct list_head list; + struct ext4_buddy e4b; + int err; + int busy = 0; + int free = 0; + + mb_debug(1, "discard preallocation for group %u\n", group); + + if (list_empty(&grp->bb_prealloc_list)) + return 0; + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, "Error reading block bitmap for %u", group); + return 0; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, "Error loading buddy information for %u", group); + put_bh(bitmap_bh); + return 0; + } + + if (needed == 0) + needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + + INIT_LIST_HEAD(&list); +repeat: + ext4_lock_group(sb, group); + list_for_each_entry_safe(pa, tmp, + &grp->bb_prealloc_list, pa_group_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + spin_unlock(&pa->pa_lock); + busy = 1; + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + + /* we can trust pa_free ... */ + free += pa->pa_free; + + spin_unlock(&pa->pa_lock); + + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } + + /* if we still need more blocks and some PAs were used, try again */ + if (free < needed && busy) { + busy = 0; + ext4_unlock_group(sb, group); + /* + * Yield the CPU here so that we don't get soft lockup + * in non preempt case. + */ + yield(); + goto repeat; + } + + /* found anything to free? */ + if (list_empty(&list)) { + BUG_ON(free != 0); + goto out; + } + + /* now free all selected PAs */ + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + + /* remove from object (inode or locality group) */ + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + if (pa->pa_type == MB_GROUP_PA) + ext4_mb_release_group_pa(&e4b, pa); + else + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + +out: + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + return free; +} + +/* + * releases all non-used preallocated blocks for given inode + * + * It's important to discard preallocations under i_data_sem + * We don't want another block to be served from the prealloc + * space when we are discarding the inode prealloc space. + * + * FIXME!! 
Make sure it is valid at all the call sites + */ +void ext4_discard_preallocations(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct super_block *sb = inode->i_sb; + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + ext4_group_t group = 0; + struct list_head list; + struct ext4_buddy e4b; + int err; + + if (!S_ISREG(inode->i_mode)) { + /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ + return; + } + + mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); + trace_ext4_discard_preallocations(inode); + + INIT_LIST_HEAD(&list); + +repeat: + /* first, collect all pa's in the inode */ + spin_lock(&ei->i_prealloc_lock); + while (!list_empty(&ei->i_prealloc_list)) { + pa = list_entry(ei->i_prealloc_list.next, + struct ext4_prealloc_space, pa_inode_list); + BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* this shouldn't happen often - nobody should + * use preallocation while we're discarding it */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + printk(KERN_ERR "uh-oh! used pa while discarding\n"); + WARN_ON(1); + schedule_timeout_uninterruptible(HZ); + goto repeat; + + } + if (pa->pa_deleted == 0) { + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &list); + continue; + } + + /* someone is deleting pa right now */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + + /* we have to wait here because pa_deleted + * doesn't mean pa is already unlinked from + * the list. as we might be called from + * ->clear_inode() the inode will get freed + * and concurrent thread which is unlinking + * pa from inode's list may access already + * freed memory, bad-bad-bad */ + + /* XXX: if this happens too often, we can + * add a flag to force wait only in case + * of ->clear_inode(), but not in case of + * regular truncate */ + schedule_timeout_uninterruptible(HZ); + goto repeat; + } + spin_unlock(&ei->i_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + BUG_ON(pa->pa_type != MB_INODE_PA); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, "Error reading block bitmap for %u", + group); + ext4_mb_unload_buddy(&e4b); + continue; + } + + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +#ifdef CONFIG_EXT4_DEBUG +static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + ext4_group_t ngroups, i; + + if (!mb_enable_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; + + printk(KERN_ERR "EXT4-fs: Can't allocate:" + " Allocation context details:\n"); + printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + ac->ac_status, ac->ac_flags); + printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d\n", + (unsigned long)ac->ac_o_ex.fe_group, + (unsigned long)ac->ac_o_ex.fe_start, + (unsigned long)ac->ac_o_ex.fe_len, + (unsigned long)ac->ac_o_ex.fe_logical, + (unsigned 
long)ac->ac_g_ex.fe_group, + (unsigned long)ac->ac_g_ex.fe_start, + (unsigned long)ac->ac_g_ex.fe_len, + (unsigned long)ac->ac_g_ex.fe_logical, + (unsigned long)ac->ac_b_ex.fe_group, + (unsigned long)ac->ac_b_ex.fe_start, + (unsigned long)ac->ac_b_ex.fe_len, + (unsigned long)ac->ac_b_ex.fe_logical, + (int)ac->ac_criteria); + printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, + ac->ac_found); + printk(KERN_ERR "EXT4-fs: groups: \n"); + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + struct ext4_prealloc_space *pa; + ext4_grpblk_t start; + struct list_head *cur; + ext4_lock_group(sb, i); + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, + pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + NULL, &start); + spin_unlock(&pa->pa_lock); + printk(KERN_ERR "PA:%u:%d:%u \n", i, + start, pa->pa_len); + } + ext4_unlock_group(sb, i); + + if (grp->bb_free == 0) + continue; + printk(KERN_ERR "%u: %d/%d \n", + i, grp->bb_free, grp->bb_fragments); + } + printk(KERN_ERR "\n"); +} +#else +static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + return; +} +#endif + +/* + * We use locality group preallocation for small size file. The size of the + * file is determined by the current size or the resulting size after + * allocation which ever is larger + * + * One can tune this size via /sys/fs/ext4//mb_stream_req + */ +static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + + if ((size == isize) && + !ext4_fs_is_busy(sbi) && + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + + /* don't use group allocation for large files */ + size = max(size, isize); + if (size > sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having + * per cpu locality group is to reduce the contention between block + * request from multiple CPUs. 
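+ * (__this_cpu_ptr() below selects the locality group of the CPU running + * this allocation)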
+ */ + ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); + + /* we're going to use group allocation */ + ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; + + /* serialize all allocations in the group */ + mutex_lock(&ac->ac_lg->lg_mutex); +} + +static noinline_for_stack int +ext4_mb_initialize_context(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t group; + unsigned int len; + ext4_fsblk_t goal; + ext4_grpblk_t block; + + /* we can't allocate > group size */ + len = ar->len; + + /* just a dirty hack to filter too big requests */ + if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) + len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + + /* start searching from the goal */ + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + ext4_get_group_no_and_offset(sb, goal, &group, &block); + + /* set up allocation goals */ + memset(ac, 0, sizeof(struct ext4_allocation_context)); + ac->ac_b_ex.fe_logical = ar->logical; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_sb = sb; + ac->ac_inode = ar->inode; + ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_group = group; + ac->ac_o_ex.fe_start = block; + ac->ac_o_ex.fe_len = len; + ac->ac_g_ex.fe_logical = ar->logical; + ac->ac_g_ex.fe_group = group; + ac->ac_g_ex.fe_start = block; + ac->ac_g_ex.fe_len = len; + ac->ac_flags = ar->flags; + + /* we have to define context: we'll we work with a file or + * locality group. this is a policy, actually */ + ext4_mb_group_or_file(ac); + + mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + "left: %u/%u, right %u/%u to %swritable\n", + (unsigned) ar->len, (unsigned) ar->logical, + (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, + (unsigned) ar->lleft, (unsigned) ar->pleft, + (unsigned) ar->lright, (unsigned) ar->pright, + atomic_read(&ar->inode->i_writecount) ? "" : "non-"); + return 0; + +} + +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + struct list_head discard_list; + struct ext4_prealloc_space *pa, *tmp; + + mb_debug(1, "discard locality group preallocation\n"); + + INIT_LIST_HEAD(&discard_list); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(pa->pa_type != MB_GROUP_PA); + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * mak sure we don't call discard + * soon for this list. 
+ */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + if (ext4_mb_load_buddy(sb, group, &e4b)) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible. + * which can cause the lg_prealloc_list to be updated. + */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + rcu_read_lock(); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&tmp_pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_inode_list, + &tmp_pa->pa_inode_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_inode_list, + &lg->lg_prealloc_list[order]); + rcu_read_unlock(); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) { + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); + return; + } + return ; +} + +/* + * release all resource we used in allocation + */ +static int ext4_mb_release_context(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_type == MB_GROUP_PA) { + /* see comment in ext4_mb_use_group_pa() */ + spin_lock(&pa->pa_lock); + pa->pa_pstart += ac->ac_b_ex.fe_len; + pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + } + } + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. 
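As a side note, the bucket index used for lg_prealloc_list above is simply the position of the highest set bit of the free-block count, capped at the table size. A self-contained sketch of that bucketing (illustrative names, plain C, not the kernel's fls() helper):

#include <stdio.h>

#define PREALLOC_TB_SIZE 10   /* size of the lg_prealloc_list hash */

/* Portable stand-in for fls(): position of the highest set bit,
 * counting from 1; returns 0 for an all-zero value. */
static int fls_u32(unsigned int x)
{
        int r = 0;
        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

/* A preallocation with pa_free free blocks lands in bucket fls(pa_free)-1,
 * clamped so very large preallocations share the last bucket. */
static int pa_bucket(unsigned int pa_free)
{
        int order = fls_u32(pa_free) - 1;
        if (order < 0)
                order = 0;
        if (order > PREALLOC_TB_SIZE - 1)
                order = PREALLOC_TB_SIZE - 1;
        return order;
}

int main(void)
{
        printf("%d %d %d\n", pa_bucket(1), pa_bucket(200), pa_bucket(4096));
        /* prints "0 7 9" -- 4096 would be order 12 but is capped at 9 */
        return 0;
}

Each bucket's list is then allowed to grow to eight entries before ext4_mb_discard_lg_preallocations trims it back towards five, so the discard path is not entered on every insertion.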
+ */ + if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } + if (ac->ac_bitmap_page) + page_cache_release(ac->ac_bitmap_page); + if (ac->ac_buddy_page) + page_cache_release(ac->ac_buddy_page); + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + mutex_unlock(&ac->ac_lg->lg_mutex); + ext4_mb_collect_stats(ac); + return 0; +} + +static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) +{ + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int ret; + int freed = 0; + + trace_ext4_mb_discard_preallocations(sb, needed); + for (i = 0; i < ngroups && needed > 0; i++) { + ret = ext4_mb_discard_group_preallocations(sb, i, needed); + freed += ret; + needed -= ret; + } + + return freed; +} + +/* + * Main entry point into mballoc to allocate blocks + * it tries to use preallocation first, then falls back + * to usual allocation + */ +ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + struct ext4_allocation_request *ar, int *errp) +{ + int freed; + struct ext4_allocation_context *ac = NULL; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_blks = 0; + + sb = ar->inode->i_sb; + sbi = EXT4_SB(sb); + + trace_ext4_request_blocks(ar); + + /* + * For delayed allocation, we could skip the ENOSPC and + * EDQUOT check, as blocks and quotas have been already + * reserved when data being copied into pagecache. + */ + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + else { + /* Without delayed allocation we need to verify + * there is enough free blocks to do block allocation + * and verify allocation doesn't exceed the quota limits. 
+ */ + while (ar->len && + ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { + + /* let others to free the space */ + yield(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + *errp = -ENOSPC; + return 0; + } + reserv_blks = ar->len; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, ar->len); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, ar->len)) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } + } + inquota = ar->len; + if (ar->len == 0) { + *errp = -EDQUOT; + goto out; + } + } + + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (!ac) { + ar->len = 0; + *errp = -ENOMEM; + goto out; + } + + *errp = ext4_mb_initialize_context(ac, ar); + if (*errp) { + ar->len = 0; + goto out; + } + + ac->ac_op = EXT4_MB_HISTORY_PREALLOC; + if (!ext4_mb_use_preallocated(ac)) { + ac->ac_op = EXT4_MB_HISTORY_ALLOC; + ext4_mb_normalize_request(ac, ar); +repeat: + /* allocate space in core */ + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto errout; + + /* as we've just preallocated more space than + * user requested orinally, we store allocated + * space in a special descriptor */ + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + ext4_mb_new_preallocation(ac); + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); + if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else if (*errp) + errout: + ext4_discard_allocated_blocks(ac); + else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } + } else { + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); + if (freed) + goto repeat; + *errp = -ENOSPC; + } + + if (*errp) { + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); + } + ext4_mb_release_context(ac); +out: + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + if (inquota && ar->len < inquota) + dquot_free_block(ar->inode, inquota - ar->len); + if (!ar->len) { + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + reserv_blks); + } + + trace_ext4_allocate_blocks(ar, (unsigned long long)block); + + return block; +} + +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. 
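Before the helper itself, the merge rule can be shown with a tiny stand-alone model (simplified struct and names, for illustration only): two pending free extents coalesce only when they were freed in the same transaction, belong to the same block group, and are physically adjacent.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct ext4_free_data. */
struct pending_free {
        unsigned int tid;     /* transaction that freed the blocks  */
        unsigned int group;   /* block group the extent belongs to  */
        unsigned int start;   /* first block, relative to the group */
        unsigned int count;   /* number of blocks                   */
};

/* Mirror of the rule: same tid, same group, physically contiguous. */
static bool mergeable(const struct pending_free *a, const struct pending_free *b)
{
        return a->tid == b->tid &&
               a->group == b->group &&
               a->start + a->count == b->start;
}

int main(void)
{
        struct pending_free a = { 7, 3, 100, 16 };
        struct pending_free b = { 7, 3, 116, 8 };
        struct pending_free c = { 8, 3, 124, 4 };   /* different transaction */
        printf("%d %d\n", mergeable(&a, &b), mergeable(&b, &c));   /* 1 0 */
        return 0;
}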
+ */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->t_tid == entry2->t_tid) && + (entry1->group == entry2->group) && + ((entry1->start_blk + entry1->count) == entry2->start_blk)) + return 1; + return 0; +} + +static noinline_for_stack int +ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, + struct ext4_free_data *new_entry) +{ + ext4_group_t group = e4b->bd_group; + ext4_grpblk_t block; + struct ext4_free_data *entry; + struct ext4_group_info *db = e4b->bd_info; + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + + BUG_ON(!ext4_handle_valid(handle)); + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + new_node = &new_entry->node; + block = new_entry->start_blk; + + if (!*n) { + /* first free block exent. We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + block, + "Block already on to-be-freed list"); + return 0; + } + } + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); + } + } + + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); + } + } + /* Add the extent to transaction's private list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); + return 0; +} + +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @flags: flags used by ext4_free_blocks + */ +void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) +{ + struct buffer_head *bitmap_bh = NULL; + struct super_block *sb = inode->i_sb; + struct ext4_group_desc *gdp; + unsigned long freed = 0; + unsigned int overflow; + ext4_grpblk_t bit; + struct buffer_head *gd_bh; + ext4_group_t block_group; + struct ext4_sb_info *sbi; + struct ext4_buddy e4b; + int err = 0; + int ret; + + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } + + sbi = EXT4_SB(sb); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + 
!ext4_data_block_valid(sbi, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); + goto error_return; + } + + ext4_debug("freeing block %llu\n", block); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; i < count; i++) { + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + if (unlikely(!tbh)) + continue; + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + +do_more: + overflow = 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + gdp = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!gdp) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, gdp), block, count) || + in_range(ext4_inode_bitmap(sb, gdp), block, count) || + in_range(block, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group)) { + + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < count; i++) + BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); + } +#endif + trace_ext4_mballoc_free(sb, inode, block_group, bit, count); + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed + */ + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + if (!new_entry) { + ext4_mb_unload_buddy(&e4b); + err = -ENOMEM; + goto error_return; + } + new_entry->start_blk = bit; + new_entry->group = block_group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + ext4_mb_free_metadata(handle, &e4b, new_entry); + } else { + /* need to update group_info->bb_free and bitmap + * with group lock held. 
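The do_more loop above handles the case where one free request straddles a block-group boundary: the part that overflows into the next group is carried over and freed on the following pass. A compact userspace model of that splitting (illustrative constant and types, not the kernel code):

#include <stdio.h>

#define BLOCKS_PER_GROUP 32768ULL   /* illustrative; the real value comes from the sb */

/* Walk a (block, count) range and emit one chunk per block group,
 * the same way the do_more loop re-enters with the overflow part. */
static void free_range(unsigned long long block, unsigned long long count)
{
        while (count) {
                unsigned long long group = block / BLOCKS_PER_GROUP;
                unsigned long long bit   = block % BLOCKS_PER_GROUP;
                unsigned long long chunk = count;

                if (bit + chunk > BLOCKS_PER_GROUP)
                        chunk = BLOCKS_PER_GROUP - bit;   /* overflow goes to the next pass */

                printf("group %llu: free bit %llu, %llu blocks\n", group, bit, chunk);

                block += chunk;
                count -= chunk;
        }
}

int main(void)
{
        free_range(32760, 20);   /* 8 blocks in group 0, then 12 blocks in group 1 */
        return 0;
}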
generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(inode, &e4b, bit, count); + } + + ret = ext4_free_blks_count(sb, gdp) + count; + ext4_free_blks_set(sb, gdp, ret); + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeblocks_counter, count); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + } + + ext4_mb_unload_buddy(&e4b); + + freed += count; + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + put_bh(bitmap_bh); + goto do_more; + } + ext4_mark_super_dirty(sb); +error_return: + if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, freed); + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + +/** + * ext4_add_groupblocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap and buddy. + */ +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) + goto error_return; + + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. 
Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + /* + * need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(NULL, &e4b, bit, count); + blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); + ext4_free_blks_set(sb, desc, blk_free_count); + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(blocks_freed, + &sbi->s_flex_groups[flex_group].free_blocks); + } + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + +/** + * ext4_trim_extent -- function to TRIM one single free extent in the group + * @sb: super block for the file system + * @start: starting block of the free extent in the alloc. group + * @count: number of blocks to TRIM + * @group: alloc. group we are working with + * @e4b: ext4 buddy for the group + * + * Trim "count" blocks starting at "start" in the "group". To assure that no + * one will allocate those blocks, mark it as used in buddy bitmap. This must + * be called with under the group lock. + */ +static void ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex; + + assert_spin_locked(ext4_group_lock_ptr(sb, group)); + + ex.fe_start = start; + ex.fe_group = group; + ex.fe_len = count; + + /* + * Mark blocks used, so no one can reuse them while + * being trimmed. + */ + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, group); + ext4_issue_discard(sb, group, start, count); + ext4_lock_group(sb, group); + mb_free_blocks(NULL, e4b, start, ex.fe_len); +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @e4b: ext4 buddy + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's buddy bitmap searching for free + * extents. When the free block is found, ext4_trim_extent is called to TRIM + * the extent. + * + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. 
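The scan described here is essentially "find the next run of clear bits at least minblocks long". A minimal sketch of that search over a byte-array bitmap (illustrative helpers, not the kernel's mb_find_next_* functions):

#include <stdio.h>

/* Return 1 if bit 'n' is set in a byte-array bitmap. */
static int test_bit(const unsigned char *map, unsigned int n)
{
        return (map[n / 8] >> (n % 8)) & 1;
}

/* Walk [start, max) and report every run of clear (free) bits whose
 * length is at least minlen -- the candidates that would be trimmed. */
static void scan_free_runs(const unsigned char *map, unsigned int start,
                           unsigned int max, unsigned int minlen)
{
        while (start < max) {
                unsigned int run_start, run_len;

                while (start < max && test_bit(map, start))
                        start++;                        /* skip used blocks     */
                run_start = start;
                while (start < max && !test_bit(map, start))
                        start++;                        /* measure the free run */
                run_len = start - run_start;
                if (run_len >= minlen)
                        printf("trim candidate: start %u, len %u\n",
                               run_start, run_len);
        }
}

int main(void)
{
        /* bits 0-3 used, 4-11 free, 12 used, 13-15 free */
        unsigned char map[2] = { 0x0F, 0x10 };
        scan_free_runs(map, 0, 16, 4);   /* reports only the 8-block run */
        return 0;
}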
Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. This is done until whole group is scanned. + */ +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) +{ + void *bitmap; + ext4_grpblk_t next, count = 0; + struct ext4_buddy e4b; + int ret; + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + return ret; + } + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; + + while (start < max) { + start = mb_find_next_zero_bit(bitmap, max, start); + if (start >= max) + break; + next = mb_find_next_bit(bitmap, max, start); + + if ((next - start) >= minblocks) { + ext4_trim_extent(sb, start, + next - start, group, &e4b); + count += next - start; + } + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if (need_resched()) { + ext4_unlock_group(sb, group); + cond_resched(); + ext4_lock_group(sb, group); + } + + if ((e4b.bd_info->bb_free - count) < minblocks) + break; + } + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + ext4_debug("trimmed %d blocks in the group %d\n", + count, group); + + return count; +} + +/** + * ext4_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * ext4_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext4_trim_all_free function + * is invoked to trim all free space. + */ +int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ext4_group_info *grp; + ext4_group_t first_group, last_group; + ext4_group_t group, ngroups = ext4_get_groups_count(sb); + ext4_grpblk_t cnt = 0, first_block, last_block; + uint64_t start, len, minlen, trimmed = 0; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + len = range->len >> sb->s_blocksize_bits; + minlen = range->minlen >> sb->s_blocksize_bits; + + if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) + return -EINVAL; + if (start < first_data_blk) { + len -= first_data_blk - start; + start = first_data_blk; + } + + /* Determine first and last group to examine based on start and len */ + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, + &first_group, &first_block); + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), + &last_group, &last_block); + last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; + last_block = EXT4_BLOCKS_PER_GROUP(sb); + + if (first_group > last_group) + return -EINVAL; + + for (group = first_group; group <= last_group; group++) { + grp = ext4_get_group_info(sb, group); + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group); + if (ret) + break; + } + + /* + * For all the groups except the last one, last block will + * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to + * change it for the last group in which case start + + * len < EXT4_BLOCKS_PER_GROUP(sb). 
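For orientation, the conversion from an fstrim_range given in bytes to block numbers and group indices can be sketched as below (illustrative constants; the kernel uses sb->s_blocksize_bits and ext4_get_group_no_and_offset, and additionally clamps the start to s_first_data_block):

#include <stdio.h>

#define BLOCK_BITS        12          /* 4 KiB blocks, illustrative */
#define BLOCKS_PER_GROUP  32768ULL    /* illustrative               */

int main(void)
{
        unsigned long long start_byte = 1ULL << 30;   /* trim from 1 GiB ... */
        unsigned long long len_byte   = 1ULL << 30;   /* ... for 1 GiB       */

        unsigned long long start = start_byte >> BLOCK_BITS;
        unsigned long long len   = len_byte >> BLOCK_BITS;

        unsigned long long first_group = start / BLOCKS_PER_GROUP;
        unsigned long long first_block = start % BLOCKS_PER_GROUP;
        unsigned long long last_group  = (start + len) / BLOCKS_PER_GROUP;

        printf("groups %llu..%llu, first in-group block %llu\n",
               first_group, last_group, first_block);   /* groups 8..16, block 0 */
        return 0;
}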
+ */ + if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) + last_block = first_block + len; + len -= last_block - first_block; + + if (grp->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, group, first_block, + last_block, minlen); + if (cnt < 0) { + ret = cnt; + break; + } + } + trimmed += cnt; + first_block = 0; + } + range->len = trimmed * sb->s_blocksize; + + return ret; +} diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h new file mode 100644 index 00000000..20b5e7bf --- /dev/null +++ b/fs/ext4/mballoc.h @@ -0,0 +1,222 @@ +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + */ +#ifdef CONFIG_EXT4_DEBUG +extern u8 mb_enable_debug; + +#define mb_debug(n, fmt, a...) \ + do { \ + if ((n) <= mb_enable_debug) { \ + printk(KERN_DEBUG "(%s, %d): %s: ", \ + __FILE__, __LINE__, __func__); \ + printk(fmt, ## a); \ + } \ + } while (0) +#else +#define mb_debug(n, fmt, a...) +#endif + +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * How many groups mballoc will scan looking for the best chunk + */ +#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 + +/* + * with 'ext4_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ +#define MB_DEFAULT_STATS 0 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, which purpose is to pack requests + * as close each to other as possible to produce smooth I/O traffic + * We use locality group prealloc space for stream request. + * We can tune the same via /proc/fs/ext4//stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + + +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; + + /* this links the free block information from ext4_sb_info */ + struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ + tid_t t_tid; +}; + +struct ext4_prealloc_space { + struct list_head pa_inode_list; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. block */ + ext4_grpblk_t pa_len; /* len of preallocated chunk */ + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type. 
inode or group */ + spinlock_t *pa_obj_lock; + struct inode *pa_inode; /* hack, for history only */ +}; + +enum { + MB_INODE_PA = 0, + MB_GROUP_PA = 1 +}; + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; + ext4_group_t fe_group; + ext4_grpblk_t fe_len; +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC + * (512). We store prealloc space into the hash based on the pa_free blocks + * order value.ie, fls(pa_free)-1; + */ +#define PREALLOC_TB_SIZE 10 +struct ext4_locality_group { + /* for allocator */ + /* to serialize allocates */ + struct mutex lg_mutex; + /* list of preallocations */ + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (normalized ac_o_ex) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the bext found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + /* number of iterations done. we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; + __u16 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_repeats; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + struct page *ac_bitmap_page; + struct page *ac_buddy_page; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_buddy { + struct page *bd_buddy_page; + void *bd_buddy; + struct page *bd_bitmap_page; + void *bd_bitmap; + struct ext4_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + ext4_group_t bd_group; +}; +#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) +#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) + +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, + struct ext4_free_extent *fex) +{ + return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; +} +#endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c new file mode 100644 index 00000000..b57b98fb --- /dev/null +++ b/fs/ext4/migrate.c @@ -0,0 +1,634 @@ +/* + * Copyright IBM Corporation, 2007 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + +#include +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" + +/* + * The contiguous blocks details which can be + * represented by a single extent + */ +struct list_blocks_struct { + ext4_lblk_t first_block, last_block; + ext4_fsblk_t first_pblock, last_pblock; +}; + +static int finish_range(handle_t *handle, struct inode *inode, + struct list_blocks_struct *lb) + +{ + int retval = 0, needed; + struct ext4_extent newext; + struct ext4_ext_path *path; + if (lb->first_pblock == 0) + return 0; + + /* Add the extent to temp inode*/ + newext.ee_block = cpu_to_le32(lb->first_block); + newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); + ext4_ext_store_pblock(&newext, lb->first_pblock); + path = ext4_ext_find_extent(inode, lb->first_block, NULL); + + if (IS_ERR(path)) { + retval = PTR_ERR(path); + path = NULL; + goto err_out; + } + + /* + * Calculate the credit needed to inserting this extent + * Since we are doing this in loop we may accumalate extra + * credit. But below we try to not accumalate too much + * of them by restarting the journal. + */ + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); + + /* + * Make sure the credit we accumalated is not really high + */ + if (needed && ext4_handle_has_enough_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS)) { + retval = ext4_journal_restart(handle, needed); + if (retval) + goto err_out; + } else if (needed) { + retval = ext4_journal_extend(handle, needed); + if (retval) { + /* + * IF not able to extend the journal restart the journal + */ + retval = ext4_journal_restart(handle, needed); + if (retval) + goto err_out; + } + } + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); +err_out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + lb->first_pblock = 0; + return retval; +} + +static int update_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t blk_num, + struct list_blocks_struct *lb) +{ + int retval; + /* + * See if we can add on to the existing range (if it exists) + */ + if (lb->first_pblock && + (lb->last_pblock+1 == pblock) && + (lb->last_block+1 == blk_num)) { + lb->last_pblock = pblock; + lb->last_block = blk_num; + return 0; + } + /* + * Start a new range. 
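The two helpers above implement a classic run-coalescing pattern: consecutive (logical, physical) block pairs are accumulated into one range, and the range is flushed as a single extent as soon as contiguity breaks. A self-contained sketch of the same pattern (illustrative names, not the migrate code itself):

#include <stdio.h>

struct range {
        unsigned int first_lblk, last_lblk;
        unsigned long long first_pblk, last_pblk;
        int valid;
};

static void flush_range(struct range *r)
{
        if (!r->valid)
                return;
        printf("extent: lblk %u, pblk %llu, len %u\n",
               r->first_lblk, r->first_pblk, r->last_lblk - r->first_lblk + 1);
        r->valid = 0;
}

/* Extend the current range if the new mapping is contiguous in both the
 * logical and the physical space, otherwise flush it and start anew. */
static void add_mapping(struct range *r, unsigned int lblk, unsigned long long pblk)
{
        if (r->valid && r->last_pblk + 1 == pblk && r->last_lblk + 1 == lblk) {
                r->last_pblk = pblk;
                r->last_lblk = lblk;
                return;
        }
        flush_range(r);
        r->first_lblk = r->last_lblk = lblk;
        r->first_pblk = r->last_pblk = pblk;
        r->valid = 1;
}

int main(void)
{
        struct range r = { 0 };
        unsigned long long pblks[] = { 100, 101, 102, 500, 501 };

        for (unsigned int i = 0; i < 5; i++)
                add_mapping(&r, i, pblks[i]);
        flush_range(&r);        /* the migrate code calls finish_range() for this */
        return 0;
}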
+ */ + retval = finish_range(handle, inode, lb); + lb->first_pblock = lb->last_pblock = pblock; + lb->first_block = lb->last_block = blk_num; + + return retval; +} + +static int update_ind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, + struct list_blocks_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + ext4_lblk_t blk_count = *blk_nump; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + if (!pblock) { + /* Only update the file block number */ + *blk_nump += max_entries; + return 0; + } + + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++, blk_count++) { + if (i_data[i]) { + retval = update_extent_range(handle, inode, + le32_to_cpu(i_data[i]), + blk_count, lb); + if (retval) + break; + } + } + + /* Update the file block number */ + *blk_nump = blk_count; + put_bh(bh); + return retval; + +} + +static int update_dind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, + struct list_blocks_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + ext4_lblk_t blk_count = *blk_nump; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + if (!pblock) { + /* Only update the file block number */ + *blk_nump += max_entries * max_entries; + return 0; + } + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_ind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), + &blk_count, lb); + if (retval) + break; + } else { + /* Only update the file block number */ + blk_count += max_entries; + } + } + + /* Update the file block number */ + *blk_nump = blk_count; + put_bh(bh); + return retval; + +} + +static int update_tind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, + struct list_blocks_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + ext4_lblk_t blk_count = *blk_nump; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + if (!pblock) { + /* Only update the file block number */ + *blk_nump += max_entries * max_entries * max_entries; + return 0; + } + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_dind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), + &blk_count, lb); + if (retval) + break; + } else + /* Only update the file block number */ + blk_count += max_entries * max_entries; + } + /* Update the file block number */ + *blk_nump = blk_count; + put_bh(bh); + return retval; + +} + +static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) +{ + int retval = 0, needed; + + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + /* + * We are freeing a blocks. During this we touch + * superblock, group descriptor and block bitmap. + * So allocate a credit of 3. We may update + * quota (user and group). 
+ */ + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + if (ext4_journal_extend(handle, needed) != 0) + retval = ext4_journal_restart(handle, needed); + + return retval; +} + +static int free_dind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); + if (!bh) + return -EIO; + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(tmp_idata[i]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +static int free_tind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i, retval = 0; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); + if (!bh) + return -EIO; + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + retval = free_dind_blocks(handle, + inode, tmp_idata[i]); + if (retval) { + put_bh(bh); + return retval; + } + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) +{ + int retval; + + /* ei->i_data[EXT4_IND_BLOCK] */ + if (i_data[0]) { + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } + + /* ei->i_data[EXT4_DIND_BLOCK] */ + if (i_data[1]) { + retval = free_dind_blocks(handle, inode, i_data[1]); + if (retval) + return retval; + } + + /* ei->i_data[EXT4_TIND_BLOCK] */ + if (i_data[2]) { + retval = free_tind_blocks(handle, inode, i_data[2]); + if (retval) + return retval; + } + return 0; +} + +static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, + struct inode *tmp_inode) +{ + int retval; + __le32 i_data[3]; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); + + /* + * One credit accounted for writing the + * i_data field of the original inode + */ + retval = ext4_journal_extend(handle, 1); + if (retval) { + retval = ext4_journal_restart(handle, 1); + if (retval) + goto err_out; + } + + i_data[0] = ei->i_data[EXT4_IND_BLOCK]; + i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; + i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; + + down_write(&EXT4_I(inode)->i_data_sem); + /* + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation + * happened after we started the migrate. We need to + * fail the migrate + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + /* + * We have the extent map build with the tmp inode. 
+ * Now copy the i_data across + */ + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); + + /* + * Update i_blocks with the new blocks that got + * allocated while adding extents for extent index + * blocks. + * + * While converting to extents we need not + * update the orignal inode i_blocks for extent blocks + * via quota APIs. The quota update happened via tmp_inode already. + */ + spin_lock(&inode->i_lock); + inode->i_blocks += tmp_inode->i_blocks; + spin_unlock(&inode->i_lock); + up_write(&EXT4_I(inode)->i_data_sem); + + /* + * We mark the inode dirty after, because we decrement the + * i_blocks when freeing the indirect meta-data blocks + */ + retval = free_ind_block(handle, inode, i_data); + ext4_mark_inode_dirty(handle, inode); + +err_out: + return retval; +} + +static int free_ext_idx(handle_t *handle, struct inode *inode, + struct ext4_extent_idx *ix) +{ + int i, retval = 0; + ext4_fsblk_t block; + struct buffer_head *bh; + struct ext4_extent_header *eh; + + block = ext4_idx_pblock(ix); + bh = sb_bread(inode->i_sb, block); + if (!bh) + return -EIO; + + eh = (struct ext4_extent_header *)bh->b_data; + if (eh->eh_depth != 0) { + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + break; + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + return retval; +} + +/* + * Free the extent meta data blocks only + */ +static int free_ext_block(handle_t *handle, struct inode *inode) +{ + int i, retval = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; + struct ext4_extent_idx *ix; + if (eh->eh_depth == 0) + /* + * No extra blocks allocated for extent meta data + */ + return 0; + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + return retval; + } + return retval; + +} + +int ext4_ext_migrate(struct inode *inode) +{ + handle_t *handle; + int retval = 0, i; + __le32 *i_data; + ext4_lblk_t blk_count = 0; + struct ext4_inode_info *ei; + struct inode *tmp_inode = NULL; + struct list_blocks_struct lb; + unsigned long max_entries; + __u32 goal; + + /* + * If the filesystem does not support extents, or the inode + * already is extent-based, error out. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) + /* + * don't migrate fast symlink + */ + return retval; + + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + 1); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + return retval; + } + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG, NULL, goal); + if (IS_ERR(tmp_inode)) { + retval = -ENOMEM; + ext4_journal_stop(handle); + return retval; + } + i_size_write(tmp_inode, i_size_read(inode)); + /* + * Set the i_nlink to zero so it will be deleted later + * when we drop inode reference. 
+ */ + tmp_inode->i_nlink = 0; + + ext4_ext_tree_init(handle, tmp_inode); + ext4_orphan_add(handle, tmp_inode); + ext4_journal_stop(handle); + + /* + * start with one credit accounted for + * superblock modification. + * + * For the tmp_inode we already have committed the + * trascation that created the inode. Later as and + * when we add extents we extent the journal + */ + /* + * Even though we take i_mutex we can still cause block + * allocation via mmap write to holes. If we have allocated + * new blocks we fail migrate. New block allocation will + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated + * with i_data_sem held to prevent racing with block + * allocation. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + up_read((&EXT4_I(inode)->i_data_sem)); + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + /* + * It is impossible to update on-disk structures without + * a handle, so just rollback in-core changes and live other + * work to orphan_list_cleanup() + */ + ext4_orphan_del(NULL, tmp_inode); + retval = PTR_ERR(handle); + goto out; + } + + ei = EXT4_I(inode); + i_data = ei->i_data; + memset(&lb, 0, sizeof(lb)); + + /* 32 bit block address 4 bytes */ + max_entries = inode->i_sb->s_blocksize >> 2; + for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { + if (i_data[i]) { + retval = update_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[i]), + blk_count, &lb); + if (retval) + goto err_out; + } + } + if (i_data[EXT4_IND_BLOCK]) { + retval = update_ind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_IND_BLOCK]), + &blk_count, &lb); + if (retval) + goto err_out; + } else + blk_count += max_entries; + if (i_data[EXT4_DIND_BLOCK]) { + retval = update_dind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), + &blk_count, &lb); + if (retval) + goto err_out; + } else + blk_count += max_entries * max_entries; + if (i_data[EXT4_TIND_BLOCK]) { + retval = update_tind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), + &blk_count, &lb); + if (retval) + goto err_out; + } + /* + * Build the last extent + */ + retval = finish_range(handle, tmp_inode, &lb); +err_out: + if (retval) + /* + * Failure case delete the extent information with the + * tmp_inode + */ + free_ext_block(handle, tmp_inode); + else { + retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); + if (retval) + /* + * if we fail to swap inode data free the extent + * details of the tmp inode + */ + free_ext_block(handle, tmp_inode); + } + + /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ + if (ext4_journal_extend(handle, 1) != 0) + ext4_journal_restart(handle, 1); + + /* + * Mark the tmp_inode as of size zero + */ + i_size_write(tmp_inode, 0); + + /* + * set the i_blocks count to zero + * so that the ext4_delete_inode does the + * right job + * + * We don't need to take the i_lock because + * the inode is not visible to user space. + */ + tmp_inode->i_blocks = 0; + + /* Reset the extent details */ + ext4_ext_tree_init(handle, tmp_inode); + ext4_journal_stop(handle); +out: + unlock_new_inode(tmp_inode); + iput(tmp_inode); + + return retval; +} diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c new file mode 100644 index 00000000..9bdef3f5 --- /dev/null +++ b/fs/ext4/mmp.c @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include + +#include "ext4.h" + +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. 
+ */ +static int write_mmp_block(struct buffer_head *bh) +{ + mark_buffer_dirty(bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE_SYNC, bh); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + return 1; + + return 0; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + ext4_fsblk_t mmp_block) +{ + struct mmp_struct *mmp; + + if (*bh) + clear_buffer_uptodate(*bh); + + /* This would be sb_bread(sb, mmp_block), except we need to be sure + * that the MD RAID device cache has been bypassed, and that the read + * is not blocked in the elevator. */ + if (!*bh) + *bh = sb_getblk(sb, mmp_block); + if (*bh) { + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + } + } + if (!*bh) { + ext4_warning(sb, "Error while reading MMP block %llu", + mmp_block); + return -EIO; + } + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + return -EINVAL; + + return 0; +} + +/* + * Dump as much information as possible to help the admin. + */ +void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, unsigned int line, const char *msg) +{ + __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, + "MMP failure info: last update time: %llu, last update " + "node: %s, last update device: %s\n", + (long long unsigned int) le64_to_cpu(mmp->mmp_time), + mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = ((struct mmpd_data *) data)->sb; + struct buffer_head *bh = ((struct mmpd_data *) data)->bh; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct mmp_struct *mmp; + ext4_fsblk_t mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval; + + mmp_block = le64_to_cpu(es->s_mmp_block); + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(get_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. + */ + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + bdevname(bh->b_bdev, mmp->mmp_bdevname); + + memcpy(mmp->mmp_nodename, init_utsname()->sysname, + sizeof(mmp->mmp_nodename)); + + while (!kthread_should_stop()) { + if (++seq > EXT4_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(get_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. 
+ */ + if (retval && (failed_writes % 60) == 0) { + ext4_error(sb, "Error writing to MMP block"); + failed_writes++; + } + + if (!(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_MMP)) { + ext4_warning(sb, "kmmpd being stopped since MMP feature" + " has been disabled."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + if (sb->s_flags & MS_RDONLY) { + ext4_warning(sb, "kmmpd being stopped since filesystem " + "has been remounted as readonly."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(mmp_update_interval * + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. + */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + ext4_error(sb, "error reading MMP data: %d", + retval); + + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_seq != mmp_check->mmp_seq || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) { + dump_mmp_msg(sb, mmp_check, + "Error while updating MMP info. " + "The filesystem seems to have been" + " multiply mounted."); + ext4_error(sb, "abort"); + goto failed; + } + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MAX_CHECK_INTERVAL), + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(get_seconds()); + + retval = write_mmp_block(bh); + +failed: + kfree(data); + brelse(bh); + return retval; +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT4_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + u32 new_seq; + + do { + get_random_bytes(&new_seq, sizeof(u32)); + } while (new_seq > EXT4_MMP_SEQ_MAX); + + return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. + */ +int ext4_multi_mount_protect(struct super_block *sb, + ext4_fsblk_t mmp_block) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + struct mmpd_data *mmpd_data; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned int wait_time = 0; + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= ext4_blocks_count(es)) { + ext4_warning(sb, "Invalid MMP block in superblock"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. 
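The multi-mount protection that follows boils down to a small protocol: read the sequence number, wait longer than the writer's update interval, read it again, and refuse the mount if it changed, because a live kmmpd on another node must have updated it. (The real code additionally writes a new random sequence and repeats the check before starting its own kmmpd.) A stand-alone sketch of the core check, with stubbed-out I/O and an illustrative clean-unmount marker:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define MMP_SEQ_CLEAN 0xFF4D4D50u   /* clean-unmount marker (value illustrative) */

/* Stub standing in for reading mmp_seq from the MMP block on disk;
 * here it changes on every call, as if another node were updating it. */
static unsigned int read_mmp_seq(void)
{
        static unsigned int seq = 42;
        return seq++;
}

/* Return true if it looks safe to mount: either the fs was unmounted
 * cleanly, or nobody touched the sequence while we waited. */
static bool mmp_mount_allowed(unsigned int wait_seconds)
{
        unsigned int seq = read_mmp_seq();

        if (seq == MMP_SEQ_CLEAN)
                return true;            /* clean unmount, no writer active  */

        sleep(wait_seconds);            /* longer than mmp_check_interval   */
        return read_mmp_seq() == seq;   /* changed => another node is live  */
}

int main(void)
{
        printf("mount %s\n", mmp_mount_allowed(1) ? "allowed" : "refused");
        return 0;
}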
+ */ + if (mmp->mmp_check_interval > mmp_check_interval) + mmp_check_interval = mmp->mmp_check_interval; + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT4_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT4_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + goto failed; + } + + wait_time = min(mmp_check_interval * 2 + 1, + mmp_check_interval + 60); + + /* Print MMP interval if more than 20 secs. */ + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) + ext4_warning(sb, "MMP interval %u higher than expected, please" + " wait.\n", wait_time * 2); + + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + +skip: + /* + * write a new random sequence number. + */ + mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + + retval = write_mmp_block(bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + + mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + if (!mmpd_data) { + ext4_warning(sb, "not enough memory for mmpd_data"); + goto failed; + } + mmpd_data->sb = sb; + mmpd_data->bh = bh; + + /* + * Start a kernel thread to update the MMP block periodically. + */ + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", + bdevname(bh->b_bdev, + mmp->mmp_bdevname)); + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { + EXT4_SB(sb)->s_mmp_tsk = NULL; + kfree(mmpd_data); + ext4_warning(sb, "Unable to create kmmpd thread for %s.", + sb->s_id); + goto failed; + } + + return 0; + +failed: + brelse(bh); + return 1; +} + + diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c new file mode 100644 index 00000000..f57455a1 --- /dev/null +++ b/fs/ext4/move_extent.c @@ -0,0 +1,1424 @@ +/* + * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. + * Written by Takashi Sato + * Akira Fujita + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "ext4.h" + +/** + * get_ext_path - Find an extent path for designated logical block number. + * + * @inode: an inode which is searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path pointer (for output) + * + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * on failure. 
+ */ +static inline int +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path **path) +{ + int ret = 0; + + *path = ext4_ext_find_extent(inode, lblock, *path); + if (IS_ERR(*path)) { + ret = PTR_ERR(*path); + *path = NULL; + } else if ((*path)[ext_depth(inode)].p_ext == NULL) + ret = -ENODATA; + + return ret; +} + +/** + * copy_extent_status - Copy the extent's initialization status + * + * @src: an extent for getting initialize status + * @dest: an extent to be set the status + */ +static void +copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) +{ + if (ext4_ext_is_uninitialized(src)) + ext4_ext_mark_uninitialized(dest); + else + dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); +} + +/** + * mext_next_extent - Search for the next extent and set it to "extent" + * + * @inode: inode which is searched + * @path: this will obtain data for the next extent + * @extent: pointer to the next extent we have just gotten + * + * Search the next extent in the array of ext4_ext_path structure (@path) + * and set it to ext4_extent structure (@extent). In addition, the member of + * @path (->p_ext) also points the next extent. Return 0 on success, 1 if + * ext4_ext_path structure refers to the last extent, or a negative error + * value on failure. + */ +static int +mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) +{ + struct ext4_extent_header *eh; + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + return 0; + } + + while (--ppos >= 0) { + if (EXT_LAST_INDEX(path[ppos].p_hdr) > + path[ppos].p_idx) { + int cur_ppos = ppos; + + /* index block */ + path[ppos].p_idx++; + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + if (path[ppos+1].p_bh) + brelse(path[ppos+1].p_bh); + path[ppos+1].p_bh = + sb_bread(inode->i_sb, path[ppos].p_block); + if (!path[ppos+1].p_bh) + return -EIO; + path[ppos+1].p_hdr = + ext_block_hdr(path[ppos+1].p_bh); + + /* Halfway index block */ + while (++cur_ppos < leaf_ppos) { + path[cur_ppos].p_idx = + EXT_FIRST_INDEX(path[cur_ppos].p_hdr); + path[cur_ppos].p_block = + ext4_idx_pblock(path[cur_ppos].p_idx); + if (path[cur_ppos+1].p_bh) + brelse(path[cur_ppos+1].p_bh); + path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, + path[cur_ppos].p_block); + if (!path[cur_ppos+1].p_bh) + return -EIO; + path[cur_ppos+1].p_hdr = + ext_block_hdr(path[cur_ppos+1].p_bh); + } + + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext4_ext_pblock(path[leaf_ppos].p_ext); + return 0; + } + } + /* We found the last extent */ + return 1; +} + +/** + * mext_check_null_inode - NULL check for two inodes + * + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 
+ */ +static int +mext_check_null_inode(struct inode *inode1, struct inode *inode2, + const char *function, unsigned int line) +{ + int ret = 0; + + if (inode1 == NULL) { + __ext4_error(inode2->i_sb, function, line, + "Both inodes should not be NULL: " + "inode1 NULL inode2 %lu", inode2->i_ino); + ret = -EIO; + } else if (inode2 == NULL) { + __ext4_error(inode1->i_sb, function, line, + "Both inodes should not be NULL: " + "inode1 %lu inode2 NULL", inode1->i_ino); + ret = -EIO; + } + return ret; +} + +/** + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure + * @donor_inode: donor inode structure + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by + * i_ino order. + */ +static void +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +{ + struct inode *first = orig_inode, *second = donor_inode; + + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can + * compare pointers that don't come from the same array. + */ + if (donor_inode->i_ino < orig_inode->i_ino) { + first = donor_inode; + second = orig_inode; + } + + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); +} + +/** + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release write lock of i_data_sem of two inodes (orig and donor). + */ +static void +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +{ + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_insert_across_blocks - Insert extents across leaf block + * + * @handle: journal handle + * @orig_inode: original inode + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Allocate a new leaf block and insert extents into it. Return 0 on success, + * or a negative error value on failure. 
+ */ +static int +mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_ext_path *orig_path = NULL; + ext4_lblk_t eblock = 0; + int new_flag = 0; + int end_flag = 0; + int err = 0; + + if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { + if (o_start == o_end) { + + /* start_ext new_ext end_ext + * donor |---------|-----------|--------| + * orig |------------------------------| + */ + end_flag = 1; + } else { + + /* start_ext new_ext end_ext + * donor |---------|----------|---------| + * orig |---------------|--------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + } + + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (start_ext->ee_len && new_ext->ee_len && + !end_ext->ee_len && o_start == o_end) { + + /* start_ext new_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (!start_ext->ee_len && new_ext->ee_len && + end_ext->ee_len && o_start == o_end) { + + /* new_ext end_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + + /* + * Set 0 to the extent block if new_ext was + * the first block. + */ + if (new_ext->ee_block) + eblock = le32_to_cpu(new_ext->ee_block); + + new_flag = 1; + } else { + ext4_debug("ext4 move extent: Unexpected insert case\n"); + return -EIO; + } + + if (new_flag) { + err = get_ext_path(orig_inode, eblock, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, new_ext, 0)) + goto out; + } + + if (end_flag) { + err = get_ext_path(orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, end_ext, 0)) + goto out; + } +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + + return err; + +} + +/** + * mext_insert_inside_block - Insert new extent to the extent block + * + * @o_start: first original extent to be moved + * @o_end: last original extent to be moved + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * @eh: extent header of target leaf block + * @range_to_move: used to decide how to insert extent + * + * Insert extents into the leaf block. The extent (@o_start) is overwritten + * by inserted extents. 
+ */ +static void +mext_insert_inside_block(struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext, + struct ext4_extent_header *eh, + int range_to_move) +{ + int i = 0; + unsigned long len; + + /* Move the existing extents */ + if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { + len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - + (unsigned long)(o_end + 1); + memmove(o_end + 1 + range_to_move, o_end + 1, len); + } + + /* Insert start entry */ + if (start_ext->ee_len) + o_start[i++].ee_len = start_ext->ee_len; + + /* Insert new entry */ + if (new_ext->ee_len) { + o_start[i] = *new_ext; + ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); + } + + /* Insert end entry */ + if (end_ext->ee_len) + o_start[i] = *end_ext; + + /* Increment the total entries counter on the extent block */ + le16_add_cpu(&eh->eh_entries, range_to_move); +} + +/** + * mext_insert_extents - Insert new extent + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Call the function to insert extents. If we cannot add more extents into + * the leaf block, we call mext_insert_across_blocks() to create a + * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 + * on success, or a negative error value on failure. + */ +static int +mext_insert_extents(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, + struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_extent_header *eh; + unsigned long need_slots, slots_range; + int range_to_move, depth, ret; + + /* + * The extents need to be inserted + * start_extent + new_extent + end_extent. + */ + need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + + (new_ext->ee_len ? 1 : 0); + + /* The number of slots between start and end */ + slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) + / sizeof(struct ext4_extent); + + /* Range to move the end of extent */ + range_to_move = need_slots - slots_range; + depth = orig_path->p_depth; + orig_path += depth; + eh = orig_path->p_hdr; + + if (depth) { + /* Register to journal */ + ret = ext4_journal_get_write_access(handle, orig_path->p_bh); + if (ret) + return ret; + } + + /* Expansion */ + if (range_to_move > 0 && + (range_to_move > le16_to_cpu(eh->eh_max) + - le16_to_cpu(eh->eh_entries))) { + + ret = mext_insert_across_blocks(handle, orig_inode, o_start, + o_end, start_ext, new_ext, end_ext); + if (ret < 0) + return ret; + } else + mext_insert_inside_block(o_start, o_end, start_ext, new_ext, + end_ext, eh, range_to_move); + + if (depth) { + ret = ext4_handle_dirty_metadata(handle, orig_inode, + orig_path->p_bh); + if (ret) + return ret; + } else { + ret = ext4_mark_inode_dirty(handle, orig_inode); + if (ret < 0) + return ret; + } + + return 0; +} + +/** + * mext_leaf_block - Move one leaf extent block into the inode. 
+ * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @dext: donor extent + * @from: start offset on the target file + * + * In order to insert extents into the leaf block, we must divide the extent + * in the leaf block into three extents. The one is located to be inserted + * extents, and the others are located around it. + * + * Therefore, this function creates structures to save extents of the leaf + * block, and inserts extents by calling mext_insert_extents() with + * created extents. Return 0 on success, or a negative error value on failure. + */ +static int +mext_leaf_block(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, struct ext4_extent *dext, + ext4_lblk_t *from) +{ + struct ext4_extent *oext, *o_start, *o_end, *prev_ext; + struct ext4_extent new_ext, start_ext, end_ext; + ext4_lblk_t new_ext_end; + int oext_alen, new_ext_alen, end_ext_alen; + int depth = ext_depth(orig_inode); + int ret; + + start_ext.ee_block = end_ext.ee_block = 0; + o_start = o_end = oext = orig_path[depth].p_ext; + oext_alen = ext4_ext_get_actual_len(oext); + start_ext.ee_len = end_ext.ee_len = 0; + + new_ext.ee_block = cpu_to_le32(*from); + ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); + new_ext.ee_len = dext->ee_len; + new_ext_alen = ext4_ext_get_actual_len(&new_ext); + new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; + + /* + * Case: original extent is first + * oext |--------| + * new_ext |--| + * start_ext |--| + */ + if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && + le32_to_cpu(new_ext.ee_block) < + le32_to_cpu(oext->ee_block) + oext_alen) { + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - + le32_to_cpu(oext->ee_block)); + start_ext.ee_block = oext->ee_block; + copy_extent_status(oext, &start_ext); + } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { + prev_ext = oext - 1; + /* + * We can merge new_ext into previous extent, + * if these are contiguous and same extent type. 
+ */ + if (ext4_can_extents_be_merged(orig_inode, prev_ext, + &new_ext)) { + o_start = prev_ext; + start_ext.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(prev_ext) + + new_ext_alen); + start_ext.ee_block = oext->ee_block; + copy_extent_status(prev_ext, &start_ext); + new_ext.ee_len = 0; + } + } + + /* + * Case: new_ext_end must be less than oext + * oext |-----------| + * new_ext |-------| + */ + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { + EXT4_ERROR_INODE(orig_inode, + "new_ext_end(%u) should be less than or equal to " + "oext->ee_block(%u) + oext_alen(%d) - 1", + new_ext_end, le32_to_cpu(oext->ee_block), + oext_alen); + ret = -EIO; + goto out; + } + + /* + * Case: new_ext is smaller than original extent + * oext |---------------| + * new_ext |-----------| + * end_ext |---| + */ + if (le32_to_cpu(oext->ee_block) <= new_ext_end && + new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { + end_ext.ee_len = + cpu_to_le16(le32_to_cpu(oext->ee_block) + + oext_alen - 1 - new_ext_end); + copy_extent_status(oext, &end_ext); + end_ext_alen = ext4_ext_get_actual_len(&end_ext); + ext4_ext_store_pblock(&end_ext, + (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); + end_ext.ee_block = + cpu_to_le32(le32_to_cpu(o_end->ee_block) + + oext_alen - end_ext_alen); + } + + ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, + o_end, &start_ext, &new_ext, &end_ext); +out: + return ret; +} + +/** + * mext_calc_swap_extents - Calculate extents for extent swapping. + * + * @tmp_dext: the extent that will belong to the original inode + * @tmp_oext: the extent that will belong to the donor inode + * @orig_off: block offset of original inode + * @donor_off: block offset of donor inode + * @max_count: the maximum length of extents + * + * Return 0 on success, or a negative error value on failure. + */ +static int +mext_calc_swap_extents(struct ext4_extent *tmp_dext, + struct ext4_extent *tmp_oext, + ext4_lblk_t orig_off, ext4_lblk_t donor_off, + ext4_lblk_t max_count) +{ + ext4_lblk_t diff, orig_diff; + struct ext4_extent dext_old, oext_old; + + BUG_ON(orig_off != donor_off); + + /* original and donor extents have to cover the same block offset */ + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || + le32_to_cpu(tmp_oext->ee_block) + + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) + return -ENODATA; + + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || + le32_to_cpu(tmp_dext->ee_block) + + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) + return -ENODATA; + + dext_old = *tmp_dext; + oext_old = *tmp_oext; + + /* When tmp_dext is too large, pick up the target range. 
*/ + diff = donor_off - le32_to_cpu(tmp_dext->ee_block); + + ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); + tmp_dext->ee_block = + cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); + + if (max_count < ext4_ext_get_actual_len(tmp_dext)) + tmp_dext->ee_len = cpu_to_le16(max_count); + + orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); + ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); + + /* Adjust extent length if donor extent is larger than orig */ + if (ext4_ext_get_actual_len(tmp_dext) > + ext4_ext_get_actual_len(tmp_oext) - orig_diff) + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - + orig_diff); + + tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); + + copy_extent_status(&oext_old, tmp_dext); + copy_extent_status(&dext_old, tmp_oext); + + return 0; +} + +/** + * mext_replace_branches - Replace original extents with new extents + * + * @handle: journal handle + * @orig_inode: original inode + * @donor_inode: donor inode + * @from: block offset of orig_inode + * @count: block count to be replaced + * @err: pointer to save return value + * + * Replace original inode extents and donor inode extents page by page. + * We implement this replacement in the following three steps: + * 1. Save the block information of original and donor inodes into + * dummy extents. + * 2. Change the block information of original inode to point at the + * donor inode blocks. + * 3. Change the block information of donor inode to point at the saved + * original inode blocks in the dummy extents. + * + * Return replaced block count. + */ +static int +mext_replace_branches(handle_t *handle, struct inode *orig_inode, + struct inode *donor_inode, ext4_lblk_t from, + ext4_lblk_t count, int *err) +{ + struct ext4_ext_path *orig_path = NULL; + struct ext4_ext_path *donor_path = NULL; + struct ext4_extent *oext, *dext; + struct ext4_extent tmp_dext, tmp_oext; + ext4_lblk_t orig_off = from, donor_off = from; + int depth; + int replaced_count = 0; + int dext_alen; + + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + + /* Get the original extent for the block "orig_off" */ + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) + goto out; + + /* Get the donor extent for the head */ + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count); + if (*err) + goto out; + + /* Loop for the donor extents */ + while (1) { + /* The extent for donor must be found. 
*/ + if (!dext) { + EXT4_ERROR_INODE(donor_inode, + "The extent for donor must be found"); + *err = -EIO; + goto out; + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { + EXT4_ERROR_INODE(donor_inode, + "Donor offset(%u) and the first block of donor " + "extent(%u) should be equal", + donor_off, + le32_to_cpu(tmp_dext.ee_block)); + *err = -EIO; + goto out; + } + + /* Set donor extent to orig extent */ + *err = mext_leaf_block(handle, orig_inode, + orig_path, &tmp_dext, &orig_off); + if (*err) + goto out; + + /* Set orig extent to donor extent */ + *err = mext_leaf_block(handle, donor_inode, + donor_path, &tmp_oext, &donor_off); + if (*err) + goto out; + + dext_alen = ext4_ext_get_actual_len(&tmp_dext); + replaced_count += dext_alen; + donor_off += dext_alen; + orig_off += dext_alen; + + /* Already moved the expected blocks */ + if (replaced_count >= count) + break; + + if (orig_path) + ext4_ext_drop_refs(orig_path); + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + if (donor_path) + ext4_ext_drop_refs(donor_path); + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) + goto out; + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count - replaced_count); + if (*err) + goto out; + } + +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (donor_path) { + ext4_ext_drop_refs(donor_path); + kfree(donor_path); + } + + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + double_up_write_data_sem(orig_inode, donor_inode); + + return replaced_count; +} + +/** + * move_extent_per_page - Move extent data per page + * + * @o_filp: file structure of original file + * @donor_inode: donor inode + * @orig_page_offset: page index on original file + * @data_offset_in_page: block index where data swapping starts + * @block_len_in_page: the number of blocks to be swapped + * @uninit: orig extent is uninitialized or not + * @err: pointer to save return value + * + * Save the data in original inode blocks and replace original inode extents + * with donor inode extents by calling mext_replace_branches(). + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. + */ +static int +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + pgoff_t orig_page_offset, int data_offset_in_page, + int block_len_in_page, int uninit, int *err) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct address_space *mapping = orig_inode->i_mapping; + struct buffer_head *bh; + struct page *page = NULL; + const struct address_space_operations *a_ops = mapping->a_ops; + handle_t *handle; + ext4_lblk_t orig_blk_offset; + long long offs = orig_page_offset << PAGE_CACHE_SHIFT; + unsigned long blocksize = orig_inode->i_sb->s_blocksize; + unsigned int w_flags = 0; + unsigned int tmp_data_size, data_size, replaced_size; + void *fsdata; + int i, jblocks; + int err2 = 0; + int replaced_count = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + + /* + * It needs twice the amount of ordinary journal buffers because + * inode and donor_inode may change each different metadata blocks. 
+ */ + jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + handle = ext4_journal_start(orig_inode, jblocks); + if (IS_ERR(handle)) { + *err = PTR_ERR(handle); + return 0; + } + + if (segment_eq(get_fs(), KERNEL_DS)) + w_flags |= AOP_FLAG_UNINTERRUPTIBLE; + + orig_blk_offset = orig_page_offset * blocks_per_page + + data_offset_in_page; + + /* + * If orig extent is uninitialized one, + * it's not necessary force the page into memory + * and then force it to be written out again. + * Just swap data blocks between orig and donor. + */ + if (uninit) { + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); + goto out2; + } + + offs = (long long)orig_blk_offset << orig_inode->i_blkbits; + + /* Calculate data_size */ + if ((orig_blk_offset + block_len_in_page - 1) == + ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { + /* Replace the last block */ + tmp_data_size = orig_inode->i_size & (blocksize - 1); + /* + * If data_size equal zero, it shows data_size is multiples of + * blocksize. So we set appropriate value. + */ + if (tmp_data_size == 0) + tmp_data_size = blocksize; + + data_size = tmp_data_size + + ((block_len_in_page - 1) << orig_inode->i_blkbits); + } else + data_size = block_len_in_page << orig_inode->i_blkbits; + + replaced_size = data_size; + + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, + &page, &fsdata); + if (unlikely(*err < 0)) + goto out; + + if (!PageUptodate(page)) { + mapping->a_ops->readpage(o_filp, page); + lock_page(page); + } + + /* + * try_to_release_page() doesn't call releasepage in writeback mode. + * We should care about the order of writing to the same file + * by multiple move extent processes. + * It needs to call wait_on_page_writeback() to wait for the + * writeback of the page. + */ + wait_on_page_writeback(page); + + /* Release old bh and drop refs */ + try_to_release_page(page, 0); + + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); + if (err2) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto out; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); + + bh = page_buffers(page); + for (i = 0; i < data_offset_in_page; i++) + bh = bh->b_this_page; + + for (i = 0; i < block_len_in_page; i++) { + *err = ext4_get_block(orig_inode, + (sector_t)(orig_blk_offset + i), bh, 0); + if (*err < 0) + goto out; + + if (bh->b_this_page != NULL) + bh = bh->b_this_page; + } + + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, + page, fsdata); + page = NULL; + +out: + if (unlikely(page)) { + if (PageLocked(page)) + unlock_page(page); + page_cache_release(page); + ext4_journal_stop(handle); + } +out2: + ext4_journal_stop(handle); + + if (err2) + *err = err2; + + return replaced_count; +} + +/** + * mext_check_arguments - Check whether move extent can be done + * + * @orig_inode: original inode + * @donor_inode: donor inode + * @orig_start: logical start offset in block for orig + * @donor_start: logical start offset in block for donor + * @len: the number of blocks to be moved + * + * Check the arguments of ext4_move_extents() whether the files can be + * exchanged with each other. + * Return 0 on success, or a negative error value on failure. 
+ */ +static int +mext_check_arguments(struct inode *orig_inode, + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) +{ + ext4_lblk_t orig_blocks, donor_blocks; + unsigned int blkbits = orig_inode->i_blkbits; + unsigned int blocksize = 1 << blkbits; + + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) + return -EPERM; + + /* Ext4 move extent does not support swapfile */ + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " + "not be swapfile [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Files should be in the same ext4 FS */ + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Ext4 move extent supports only extent based file */ + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: orig file is not extents " + "based file [ino:orig %lu]\n", orig_inode->i_ino); + return -EOPNOTSUPP; + } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: donor file is not extents " + "based file [ino:donor %lu]\n", donor_inode->i_ino); + return -EOPNOTSUPP; + } + + if ((!orig_inode->i_size) || (!donor_inode->i_size)) { + ext4_debug("ext4 move extent: File size is 0 byte\n"); + return -EINVAL; + } + + /* Start offset should be same */ + if (orig_start != donor_start) { + ext4_debug("ext4 move extent: orig and donor's start " + "offset are not same [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || + (*len > EXT_MAX_BLOCKS) || + (orig_start + *len >= EXT_MAX_BLOCKS)) { + ext4_debug("ext4 move extent: Can't handle over [%u] blocks " + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_inode->i_size > donor_inode->i_size) { + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; + /* TODO: eliminate this artificial restriction */ + if (orig_start >= donor_blocks) { + ext4_debug("ext4 move extent: orig start offset " + "[%llu] should be less than donor file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, donor_blocks, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* TODO: eliminate this artificial restriction */ + if (orig_start + *len > donor_blocks) { + ext4_debug("ext4 move extent: End offset [%llu] should " + "be less than donor file blocks [%u]." 
+ "So adjust length from %llu to %llu " + "[ino:orig %lu, donor %lu]\n", + orig_start + *len, donor_blocks, + *len, donor_blocks - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = donor_blocks - orig_start; + } + } else { + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; + if (orig_start >= orig_blocks) { + ext4_debug("ext4 move extent: start offset [%llu] " + "should be less than original file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, orig_blocks, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_start + *len > orig_blocks) { + ext4_debug("ext4 move extent: Adjust length " + "from %llu to %llu. Because it should be " + "less than original file blocks " + "[ino:orig %lu, donor %lu]\n", + *len, orig_blocks - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = orig_blocks - orig_start; + } + } + + if (!*len) { + ext4_debug("ext4 move extent: len should not be 0 " + "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, + donor_inode->i_ino); + return -EINVAL; + } + + return 0; +} + +/** + * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * + * Lock two inodes' i_mutex by i_ino order. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + */ +static int +mext_inode_double_lock(struct inode *inode1, struct inode *inode2) +{ + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); + if (ret < 0) + goto out; + + if (inode1 == inode2) { + mutex_lock(&inode1->i_mutex); + goto out; + } + + if (inode1->i_ino < inode2->i_ino) { + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); + } + +out: + return ret; +} + +/** + * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode that is released first + * @inode2: the inode that is released second + * + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + */ + +static int +mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) +{ + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); + if (ret < 0) + goto out; + + if (inode1) + mutex_unlock(&inode1->i_mutex); + + if (inode2 && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); + +out: + return ret; +} + +/** + * ext4_move_extents - Exchange the specified range of a file + * + * @o_filp: file structure of the original file + * @d_filp: file structure of the donor file + * @orig_start: start offset in block for orig + * @donor_start: start offset in block for donor + * @len: the number of blocks to be moved + * @moved_len: moved block length + * + * This function returns 0 and moved block length is set in moved_len + * if succeed, otherwise returns error value. + * + * Note: ext4_move_extents() proceeds the following order. + * 1:ext4_move_extents() calculates the last block number of moving extent + * function by the start block number (orig_start) and the number of blocks + * to be moved (len) specified as arguments. + * If the {orig, donor}_start points a hole, the extent's start offset + * pointed by ext_cur (current extent), holecheck_path, orig_path are set + * after hole behind. 
+ * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent + * or the ext_cur exceeds the block_end which is last logical block number. + * 3:To get the length of continues area, call mext_next_extent() + * specified with the ext_cur (initial value is holecheck_path) re-cursive, + * until find un-continuous extent, the start logical block number exceeds + * the block_end or the extent points to the last extent. + * 4:Exchange the original inode data with donor inode data + * from orig_page_offset to seq_end_page. + * The start indexes of data are specified as arguments. + * That of the original inode is orig_page_offset, + * and the donor inode is also orig_page_offset + * (To easily handle blocksize != pagesize case, the offset for the + * donor inode is block unit). + * 5:Update holecheck_path and orig_path to points a next proceeding extent, + * then returns to step 2. + * 6:Release holecheck_path, orig_path and set the len to moved_len + * which shows the number of moved blocks. + * The moved_len is useful for the command to calculate the file offset + * for starting next move extent ioctl. + * 7:Return 0 on success, or a negative error value on failure. + */ +int +ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 orig_start, __u64 donor_start, __u64 len, + __u64 *moved_len) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct inode *donor_inode = d_filp->f_dentry->d_inode; + struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; + struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; + ext4_lblk_t block_start = orig_start; + ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; + ext4_lblk_t rest_blocks; + pgoff_t orig_page_offset = 0, seq_end_page; + int ret1, ret2, depth, last_extent = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + int data_offset_in_page; + int block_len_in_page; + int uninit; + + /* orig and donor should be different file */ + if (orig_inode->i_ino == donor_inode->i_ino) { + ext4_debug("ext4 move extent: The argument files should not " + "be same file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Protect orig and donor inodes against a truncate */ + ret1 = mext_inode_double_lock(orig_inode, donor_inode); + if (ret1 < 0) + return ret1; + + /* Protect extent tree against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, + donor_start, &len); + if (ret1) + goto out; + + file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; + block_end = block_start + len - 1; + if (file_end < block_end) + len -= block_end - file_end; + + ret1 = get_ext_path(orig_inode, block_start, &orig_path); + if (ret1) + goto out; + + /* Get path structure to check the hole */ + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); + if (ret1) + goto out; + + depth = ext_depth(orig_inode); + ext_cur = holecheck_path[depth].p_ext; + + /* + * Get proper starting location of block replacement if block_start was + * within the hole. 
+ */ + if (le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { + /* + * The hole exists between extents or the tail of + * original file. + */ + last_extent = mext_next_extent(orig_inode, + holecheck_path, &ext_cur); + if (last_extent < 0) { + ret1 = last_extent; + goto out; + } + last_extent = mext_next_extent(orig_inode, orig_path, + &ext_dummy); + if (last_extent < 0) { + ret1 = last_extent; + goto out; + } + seq_start = le32_to_cpu(ext_cur->ee_block); + } else if (le32_to_cpu(ext_cur->ee_block) > block_start) + /* The hole exists at the beginning of original file. */ + seq_start = le32_to_cpu(ext_cur->ee_block); + else + seq_start = block_start; + + /* No blocks within the specified range. */ + if (le32_to_cpu(ext_cur->ee_block) > block_end) { + ext4_debug("ext4 move extent: The specified range of file " + "may be the hole\n"); + ret1 = -EINVAL; + goto out; + } + + /* Adjust start blocks */ + add_blocks = min(le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur), block_end + 1) - + max(le32_to_cpu(ext_cur->ee_block), block_start); + + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { + seq_blocks += add_blocks; + + /* Adjust tail blocks */ + if (seq_start + seq_blocks - 1 > block_end) + seq_blocks = block_end - seq_start + 1; + + ext_prev = ext_cur; + last_extent = mext_next_extent(orig_inode, holecheck_path, + &ext_cur); + if (last_extent < 0) { + ret1 = last_extent; + break; + } + add_blocks = ext4_ext_get_actual_len(ext_cur); + + /* + * Extend the length of contiguous block (seq_blocks) + * if extents are contiguous. + */ + if (ext4_can_extents_be_merged(orig_inode, + ext_prev, ext_cur) && + block_end >= le32_to_cpu(ext_cur->ee_block) && + !last_extent) + continue; + + /* Is original extent is uninitialized */ + uninit = ext4_ext_is_uninitialized(ext_prev); + + data_offset_in_page = seq_start % blocks_per_page; + + /* + * Calculate data blocks count that should be swapped + * at the first page. + */ + if (data_offset_in_page + seq_blocks > blocks_per_page) { + /* Swapped blocks are across pages */ + block_len_in_page = + blocks_per_page - data_offset_in_page; + } else { + /* Swapped blocks are in a page */ + block_len_in_page = seq_blocks; + } + + orig_page_offset = seq_start >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_end_page = (seq_start + seq_blocks - 1) >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_start = le32_to_cpu(ext_cur->ee_block); + rest_blocks = seq_blocks; + + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->readpage, ->write_begin, and ext4_get_block + * in move_extent_per_page + */ + double_up_write_data_sem(orig_inode, donor_inode); + + while (orig_page_offset <= seq_end_page) { + + /* Swap original branches with new branches */ + block_len_in_page = move_extent_per_page( + o_filp, donor_inode, + orig_page_offset, + data_offset_in_page, + block_len_in_page, uninit, + &ret1); + + /* Count how many blocks we have exchanged */ + *moved_len += block_len_in_page; + if (ret1 < 0) + break; + if (*moved_len > len) { + EXT4_ERROR_INODE(orig_inode, + "We replaced blocks too much! 
" + "sum of replaced: %llu requested: %llu", + *moved_len, len); + ret1 = -EIO; + break; + } + + orig_page_offset++; + data_offset_in_page = 0; + rest_blocks -= block_len_in_page; + if (rest_blocks > blocks_per_page) + block_len_in_page = blocks_per_page; + else + block_len_in_page = rest_blocks; + } + + double_down_write_data_sem(orig_inode, donor_inode); + if (ret1 < 0) + break; + + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret1) + break; + depth = holecheck_path->p_depth; + + /* Decrease buffer counter */ + if (orig_path) + ext4_ext_drop_refs(orig_path); + ret1 = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret1) + break; + + ext_cur = holecheck_path[depth].p_ext; + add_blocks = ext4_ext_get_actual_len(ext_cur); + seq_blocks = 0; + + } +out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (holecheck_path) { + ext4_ext_drop_refs(holecheck_path); + kfree(holecheck_path); + } + double_up_write_data_sem(orig_inode, donor_inode); + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); + + if (ret1) + return ret1; + else if (ret2) + return ret2; + + return 0; +} diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c new file mode 100644 index 00000000..3d36d5a1 --- /dev/null +++ b/fs/ext4/namei.c @@ -0,0 +1,2612 @@ +/* + * linux/fs/ext4/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" + +#include "xattr.h" +#include "acl.h" + +#include +/* + * define how far ahead to read directories while searching them. 
+ */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block, int *err) +{ + struct buffer_head *bh; + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext4_bread(handle, inode, *block, 1, err); + if (bh) { + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + *err = ext4_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u16 offs; + u16 size; +}; + +static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); +static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); +static inline unsigned dx_get_hash(struct dx_entry *entry); +static void dx_set_hash(struct dx_entry *entry, unsigned value); +static unsigned dx_get_count(struct dx_entry *entries); +static unsigned dx_get_limit(struct dx_entry *entries); +static void dx_set_count(struct dx_entry *entries, unsigned value); +static void dx_set_limit(struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit(struct inode *dir, unsigned infosize); +static unsigned dx_node_limit(struct inode *dir); +static struct dx_frame *dx_probe(const struct qstr *d_name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, + int *err); +static void dx_release(struct dx_frame *frames); +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, + struct dx_map_entry *offsets, int count, unsigned blocksize); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); +static void dx_insert_block(struct dx_frame *frame, + u32 hash, ext4_lblk_t block); +static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *err); +static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +/* + * p 
is at least 6 bytes before the end of page + */ +static inline struct ext4_dir_entry_2 * +ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) +{ + return (struct ext4_dir_entry_2 *)((char *)p + + ext4_rec_len_from_disk(p->rec_len, blocksize)); +} + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x00ffffff; +} + +static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash(struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash(struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - + EXT4_DIR_REC_LEN(2) - infosize; + return entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit(struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + return entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index(char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk(KERN_DEBUG "%s index ", label); + for (i = 0; i < n; i++) { + printk("%x->%lu ", i ? dx_get_hash(entries + i) : + 0, (unsigned long)dx_get_block(entries + i)); + } + printk("\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { + int len = de->name_len; + char *name = de->name; + while (len--) printk("%c", *name++); + ext4fs_dirhash(de->name, de->name_len, &h); + printk(":%x.%u ", h.hash, + ((char *) de - base)); + } + space += EXT4_DIR_REC_LEN(de->name_len); + names++; + } + de = ext4_next_entry(de, size); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count(entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + int err; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + ext4_lblk_t block = dx_get_block(entries); + ext4_lblk_t hash = i ? dx_get_hash(entries): 0; + u32 range = i < count - 1? 
(dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; + stats = levels? + dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse(bh); + } + if (bcount) + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + levels ? "" : " ", names, space/bcount, + (space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(const struct qstr *d_name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + + frame->bh = NULL; + if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", + root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (d_name) + ext4fs_dirhash(d_name->name, d_name->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, + root->info.info_length)) { + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + dxtrace(printk("Look up %x", hash)); + while (1) + { + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { + ext4_warning(dir->i_sb, + "dx entry: no count or count > limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) // linear search cross check + { + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; + if (!indirect--) return frame; + if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { + ext4_warning(dir->i_sb, + "dx entry: limit != node limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + frame++; + frame->bh = NULL; + } +fail2: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } +fail: + if (*err == ERR_BAD_DX_DIR) + ext4_warning(dir->i_sb, + "Corrupt dir inode %ld, running e2fsck is " + "recommended.", dir->i_ino); + return NULL; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + brelse(frames[1].bh); + brelse(frames[0].bh); +} + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ + p++; + brelse(p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. 
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+				   struct inode *dir, ext4_lblk_t block,
+				   struct dx_hash_info *hinfo,
+				   __u32 start_hash, __u32 start_minor_hash)
+{
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *top;
+	int err, count = 0;
+
+	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
+							(unsigned long)block));
+	if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
+		return err;
+
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	top = (struct ext4_dir_entry_2 *) ((char *) de +
+					   dir->i_sb->s_blocksize -
+					   EXT4_DIR_REC_LEN(0));
+	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+					+ ((char *)de - bh->b_data))) {
+			/* On error, skip the f_pos to the next block. */
+			dir_file->f_pos = (dir_file->f_pos |
+					(dir->i_sb->s_blocksize - 1)) + 1;
+			brelse(bh);
+			return count;
+		}
+		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		if ((err = ext4_htree_store_dirent(dir_file,
+			   hinfo->hash, hinfo->minor_hash, de)) != 0) {
+			brelse(bh);
+			return err;
+		}
+		count++;
+	}
+	brelse(bh);
+	return count;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory. We start scanning the directory in hash order, starting
+ * at start_hash and start_minor_hash.
+ *
+ * This function returns the number of entries inserted into the tree,
+ * or a negative error code.
+ */
+int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+			 __u32 start_minor_hash, __u32 *next_hash)
+{
+	struct dx_hash_info hinfo;
+	struct ext4_dir_entry_2 *de;
+	struct dx_frame frames[2], *frame;
+	struct inode *dir;
+	ext4_lblk_t block;
+	int count = 0;
+	int ret, err;
+	__u32 hashval;
+
+	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
+		       start_hash, start_minor_hash));
+	dir = dir_file->f_path.dentry->d_inode;
+	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
+		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT4_SB(dir->i_sb)->s_hash_unsigned;
+		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
+					       start_hash, start_minor_hash);
+		*next_hash = ~0;
+		return count;
+	}
+	hinfo.hash = start_hash;
+	hinfo.minor_hash = 0;
+	frame = dx_probe(NULL, dir, &hinfo, frames, &err);
+	if (!frame)
+		return err;
+
+	/* Add '.' and '..'
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + de = ext4_next_entry(de, dir->i_sb->s_blocksize); + if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + + +/* + * Directory block splitting, compacting + */ + +/* + * Create map of hash values, offsets, and sizes, stored at end of block. + * Returns number of entries mapped. + */ +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + blocksize) { + if (de->name_len && de->inode) { + ext4fs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = ((char *) de - base)>>2; + map_tail->size = le16_to_cpu(de->rec_len); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = ext4_next_entry(de, blocksize); + } + return count; +} + +/* Sort map by hash value */ +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} + +static void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} + +/* + * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. + * + * `len <= EXT4_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. 
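
dx_make_map and dx_sort_map above build a (hash, offset, size) map at the tail of a scratch block and sort it by hash with a comb-sort pass that hands off to bubble sort once the gap is small. The stand-alone sketch below shows the same two-phase sort on a plain array; the struct and function names are illustrative, not the kernel's.

#include <stdio.h>
#include <stdint.h>

struct map_entry { uint32_t hash; uint16_t offs; uint16_t size; };

static void swap_entry(struct map_entry *a, struct map_entry *b)
{
    struct map_entry t = *a; *a = *b; *b = t;
}

static void sort_map(struct map_entry *map, unsigned count)
{
    unsigned gap = count;
    int more;

    /* Comb sort: shrink the gap until a bubble sort can finish cheaply. */
    while (gap > 2) {
        gap = gap * 10 / 13;            /* shrink factor of about 1.3 */
        if (gap == 9 || gap == 10)      /* skip the known-bad gap values */
            gap = 11;
        for (unsigned i = count - 1; i >= gap; i--)
            if (map[i].hash < map[i - gap].hash)
                swap_entry(&map[i], &map[i - gap]);
    }
    /* Garden-variety bubble sort to mop up. */
    do {
        more = 0;
        for (unsigned i = 1; i < count; i++)
            if (map[i].hash < map[i - 1].hash) {
                swap_entry(&map[i], &map[i - 1]);
                more = 1;
            }
    } while (more);
}

int main(void)
{
    struct map_entry m[] = { {5, 0, 12}, {1, 12, 16}, {9, 28, 12}, {3, 40, 20} };
    sort_map(m, 4);
    for (unsigned i = 0; i < 4; i++)
        printf("%u ", m[i].hash);       /* prints: 1 3 5 9 */
    printf("\n");
    return 0;
}
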
+ */ +static inline int ext4_match (int len, const char * const name, + struct ext4_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 ** res_dir) +{ + struct ext4_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = d_name->name; + int namelen = d_name->len; + + de = (struct ext4_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext4_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + + +/* + * ext4_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +static struct buffer_head * ext4_find_entry (struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 ** res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + ext4_lblk_t start, block, b; + const u8 *name = d_name->name; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + ext4_lblk_t nblocks; + int i, err; + int namelen; + + *res_dir = NULL; + sb = dir->i_sb; + namelen = d_name->len; + if (namelen > EXT4_NAME_LEN) + return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '\0')) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } + if (is_dx(dir)) { + bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + start = EXT4_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. 
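
search_dirblock and the readahead loop here both depend on the classic ext2/3/4 record layout: a directory block is a chain of variable-length records, and rec_len is the only link from one record to the next, which is why a bogus (zero or negative) rec_len has to abort the walk. Below is a stand-alone sketch of that walk over a fake block; the struct is a simplified stand-in for ext4_dir_entry_2, and the 8-byte header plus 4-byte rounding is meant to mirror what EXT4_DIR_REC_LEN computes.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define BLOCKSIZE 1024
#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3u)   /* header + name, 4-byte aligned */

struct dirent2 {                 /* simplified stand-in for ext4_dir_entry_2 */
    uint32_t inode;              /* 0 means "unused slot" */
    uint16_t rec_len;            /* distance to the next record */
    uint8_t  name_len;
    uint8_t  file_type;
    char     name[];             /* not NUL-terminated on disk */
};

/* Append one record, letting it provisionally own everything up to block end. */
static struct dirent2 *add_entry(char *block, char *at_pos, uint32_t ino, const char *name)
{
    struct dirent2 *at = (struct dirent2 *)at_pos;
    at->inode = ino;
    at->name_len = (uint8_t)strlen(name);
    at->file_type = 0;
    memcpy(at->name, name, at->name_len);
    at->rec_len = (uint16_t)(block + BLOCKSIZE - at_pos);
    return at;
}

/* Walk the block the way search_dirblock does, trusting only rec_len. */
static int find_entry(const char *block, const char *name)
{
    const char *p = block, *limit = block + BLOCKSIZE;
    size_t len = strlen(name);

    while (p < limit) {
        const struct dirent2 *de = (const struct dirent2 *)p;
        if (de->rec_len < REC_LEN(0))
            return -1;                 /* corrupt rec_len: refuse to spin forever */
        if (de->inode && de->name_len == len && !memcmp(de->name, name, len))
            return (int)de->inode;
        p += de->rec_len;              /* rec_len is the only link to the next record */
    }
    return 0;                          /* not found */
}

int main(void)
{
    union { char bytes[BLOCKSIZE]; uint32_t align; } blk;
    memset(blk.bytes, 0, sizeof(blk.bytes));

    struct dirent2 *de = add_entry(blk.bytes, blk.bytes, 11, "alpha");
    de->rec_len = (uint16_t)REC_LEN(de->name_len);     /* shrink: make room for one more */
    add_entry(blk.bytes, blk.bytes + de->rec_len, 12, "beta");

    printf("alpha=%d beta=%d gamma=%d\n",
           find_entry(blk.bytes, "alpha"),
           find_entry(blk.bytes, "beta"),
           find_entry(blk.bytes, "gamma"));            /* 11 12 0 */
    return 0;
}
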
+ */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext4_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ_META, 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + EXT4_ERROR_INODE(dir, "reading directory lblock %lu", + (unsigned long) block); + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT4_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + return ret; +} + +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, int *err) +{ + struct super_block * sb = dir->i_sb; + struct dx_hash_info hinfo; + struct dx_frame frames[2], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; + + if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) + return NULL; + do { + block = dx_get_block(frame->at); + if (!(bh = ext4_bread(NULL, dir, block, 0, err))) + goto errout; + + retval = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { /* Success! 
*/ + dx_release(frames); + return bh; + } + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, hinfo.hash, frame, + frames, NULL); + if (retval < 0) { + ext4_warning(sb, + "error reading index page in directory #%lu", + dir->i_ino); + *err = retval; + goto errout; + } + } while (retval == 1); + + *err = -ENOENT; +errout: + dxtrace(printk(KERN_DEBUG "%s not found\n", name)); + dx_release (frames); + return NULL; +} + +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct ext4_dir_entry_2 *de; + struct buffer_head *bh; + + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext4_find_entry(dir, &dentry->d_name, &de); + inode = NULL; + if (bh) { + __u32 ino = le32_to_cpu(de->inode); + brelse(bh); + if (!ext4_valid_inum(dir->i_sb, ino)) { + EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); + return ERR_PTR(-EIO); + } + inode = ext4_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ESTALE) { + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", + ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } + } + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext4_get_parent(struct dentry *child) +{ + __u32 ino; + static const struct qstr dotdot = { + .name = "..", + .len = 2, + }; + struct ext4_dir_entry_2 * de; + struct buffer_head *bh; + + bh = ext4_find_entry(child->d_inode, &dotdot, &de); + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { + EXT4_ERROR_INODE(child->d_inode, + "bad parent inode number: %u", ino); + return ERR_PTR(-EIO); + } + + return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); +} + +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* + * Move count entries from end of map between two memory locations. + * Returns pointer to last entry moved. + */ +static struct ext4_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, + unsigned blocksize) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); + rec_len = EXT4_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext4_dir_entry_2 *) (to - rec_len); +} + +/* + * Compact each dir entry in the range to the minimal rec_len. + * Returns pointer to last entry in range. 
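
ext4_type_by_mode above is indexed by (mode & S_IFMT) >> 12 because the file-type bits of i_mode live in the top nibble of the low 16 bits; the table collapses them into the small per-dirent type codes so readdir can report a type without reading each inode. A quick stand-alone illustration (the FT_* values below follow the ext4 on-disk codes, but the enum names are local to this sketch):

#include <stdio.h>
#include <sys/stat.h>

/* On-disk dirent file-type codes, in the order ext4 defines them. */
enum { FT_UNKNOWN, FT_REG_FILE, FT_DIR, FT_CHRDEV, FT_BLKDEV, FT_FIFO, FT_SOCK, FT_SYMLINK };

static unsigned char type_by_mode[S_IFMT >> 12] = {
    [S_IFREG  >> 12] = FT_REG_FILE,
    [S_IFDIR  >> 12] = FT_DIR,
    [S_IFCHR  >> 12] = FT_CHRDEV,
    [S_IFBLK  >> 12] = FT_BLKDEV,
    [S_IFIFO  >> 12] = FT_FIFO,
    [S_IFSOCK >> 12] = FT_SOCK,
    [S_IFLNK  >> 12] = FT_SYMLINK,
};

int main(void)
{
    mode_t m = S_IFDIR | 0755;
    printf("dir  -> %d\n", type_by_mode[(m & S_IFMT) >> 12]);   /* 2 (FT_DIR) */
    m = S_IFLNK | 0777;
    printf("link -> %d\n", type_by_mode[(m & S_IFMT) >> 12]);   /* 7 (FT_SYMLINK) */
    return 0;
}
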
+ */ +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) +{ + struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { + rec_len = EXT4_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + prev = to; + to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +/* + * Split a full leaf block to make room for a new dir entry. + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. + */ +static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + ext4_lblk_t newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext4_dir_entry_2 *de = NULL, *de2; + int err = 0, i; + + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) { + brelse(*bh); + *bh = NULL; + goto errout; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, *bh); + if (err) + goto journal_error; + + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map((struct ext4_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + dx_sort_map(map, count); + /* Split the existing block in the middle, size-wise */ + size = 0; + move = 0; + for (i = count-1; i >= 0; i--) { + /* is more than half of this entry in 2nd half of the block? */ + if (size + map[i].size/2 > blocksize/2) + break; + size += map[i].size; + move++; + } + /* map index at which we will split */ + split = count - move; + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", + (unsigned long)dx_get_block(frame->at), + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, + blocksize); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + + /* Which block gets the new entry? 
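
The split-point loop in do_split above walks the hash-sorted map from the largest hashes down, claiming entries for the new block until roughly half of the block's bytes have been claimed; 'continued' records whether the boundary hash also appears on the other side of the cut, in which case the new index entry gets the low continuation bit. A stand-alone sketch of just that computation, with illustrative types:

#include <stdio.h>

struct map_entry { unsigned hash; unsigned size; };   /* already sorted by hash */

/*
 * Decide where a hash-sorted run of dirents should be cut so the two halves
 * of the split are roughly equal by byte count.  Returns the index of the
 * first entry that moves to the new block; *continued is set when the hash
 * at the boundary straddles both blocks (a collision), which is what forces
 * the continuation bit on the new index entry.
 */
static int pick_split(const struct map_entry *map, int count,
                      unsigned blocksize, int *continued)
{
    unsigned size = 0;
    int move = 0;

    for (int i = count - 1; i >= 0; i--) {
        /* stop once more than half of this entry would land in the 2nd half */
        if (size + map[i].size / 2 > blocksize / 2)
            break;
        size += map[i].size;
        move++;
    }
    int split = count - move;
    *continued = (split > 0) && (map[split].hash == map[split - 1].hash);
    return split;
}

int main(void)
{
    struct map_entry m[] = { {10, 300}, {20, 200}, {20, 300}, {40, 200} };
    int cont, split = pick_split(m, 4, 1024, &cont);
    printf("split at %d, continued=%d\n", split, cont);   /* split at 2, continued=1 */
    return 0;
}
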
*/ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block(frame, hash2 + continued, newblock); + err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; + err = ext4_handle_dirty_metadata(handle, dir, frame->bh); + if (err) + goto journal_error; + brelse(bh2); + dxtrace(dx_show_index("frame", frame->entries)); + return de; + +journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext4_std_error(dir->i_sb, err); +errout: + *error = err; + return NULL; +} + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext4_dir_entry_2 *de, + struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned int offset = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; + unsigned short reclen; + int nlen, rlen, err; + char *top; + + reclen = EXT4_DIR_REC_LEN(namelen); + if (!de) { + de = (struct ext4_dir_entry_2 *)bh->b_data; + top = bh->b_data + blocksize - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_std_error(dir->i_sb, err); + return err; + } + + /* By now the buffer is marked for journaling */ + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); + de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (err) + ext4_std_error(dir->i_sb, err); + return 0; +} + +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. 
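
add_dirent_to_buf above reuses slack space: a record whose on-disk rec_len (rlen) is larger than the space its own name actually needs (nlen) can be trimmed back to nlen, and a new record carved out of the remainder inherits whatever is left. A stand-alone sketch of that carve step, using the same simplified record layout as the earlier sketch; the names are illustrative:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3u)   /* 8-byte header, 4-byte rounding */

struct dirent2 {
    uint32_t inode;
    uint16_t rec_len;
    uint8_t  name_len;
    uint8_t  file_type;
    char     name[];
};

/* Carve a new record out of the slack behind an existing one. */
static struct dirent2 *carve(struct dirent2 *de, uint32_t ino, const char *name)
{
    uint16_t nlen = (uint16_t)REC_LEN(de->name_len);  /* space the old record really needs */
    uint16_t rlen = de->rec_len;                      /* space it currently owns           */
    struct dirent2 *new_de = (struct dirent2 *)((char *)de + nlen);

    new_de->rec_len = rlen - nlen;                    /* new record inherits the slack     */
    de->rec_len = nlen;                               /* old record keeps only what it uses */
    new_de->inode = ino;
    new_de->name_len = (uint8_t)strlen(name);
    new_de->file_type = 0;
    memcpy(new_de->name, name, new_de->name_len);
    return new_de;
}

int main(void)
{
    union { char bytes[128]; uint32_t align; } blk = {{0}};
    struct dirent2 *de = (struct dirent2 *)blk.bytes;

    de->inode = 11;
    de->name_len = 3;
    memcpy(de->name, "foo", 3);
    de->rec_len = 128;                     /* last record owns the rest of the block */

    struct dirent2 *nd = carve(de, 12, "barbaz");
    printf("old rec_len=%u, new at offset %ld with rec_len=%u\n",
           (unsigned)de->rec_len, (long)((char *)nd - blk.bytes), (unsigned)nd->rec_len);
    /* old rec_len=12, new at offset 12 with rec_len=116 */
    return 0;
}
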
+ */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + ext4_lblk_t block; + struct fake_dirent *fde; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + retval = ext4_journal_get_write_access(handle, bh); + if (retval) { + ext4_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext4_dir_entry_2 *)((char *)fde + + ext4_rec_len_from_disk(fde->rec_len, blocksize)); + if ((char *) de >= (((char *) root) + blocksize)) { + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); + return retval; + } + ext4_set_inode_flag(dir, EXT4_INODE_INDEX); + data1 = bh2->b_data; + + memcpy (data1, de, len); + de = (struct ext4_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) + de = de2; + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + /* Initialize the root; the dot dirents already exist */ + de = (struct ext4_dir_entry_2 *) (&root->dotdot); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), + blocksize); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block(entries, 1); + dx_set_count(entries, 1); + dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(name, namelen, &hinfo); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + + ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_metadata(handle, dir, bh); + + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + if (!de) { + /* + * Even if the block split failed, we have to properly write + * out all the changes we did so far. Otherwise we can end up + * with corrupted filesystem. + */ + ext4_mark_inode_dirty(handle, dir); + dx_release(frames); + return retval; + } + dx_release(frames); + + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; +} + +/* + * ext4_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext4_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. 
+ */ +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + int retval; + int dx_fallback=0; + unsigned blocksize; + ext4_lblk_t block, blocks; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; + if (is_dx(dir)) { + retval = ext4_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); + } + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0; block < blocks; block++) { + bh = ext4_bread(handle, dir, block, 0, &retval); + if(!bh) + return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) { + brelse(bh); + return retval; + } + + if (blocks == 1 && !dx_fallback && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); + brelse(bh); + } + bh = ext4_append(handle, dir, &block, &retval); + if (!bh) + return retval; + de = (struct ext4_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + if (retval == 0) + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; +} + +/* + * Returns 0 for success, or a negative error value + */ +static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; + int err; + + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; + at = frame->at; + + if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) + goto cleanup; + + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? 
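
dx_insert_block, defined earlier, makes room in an index block by shifting the tail of the dx_entry array and bumping the stored count; the split code that follows here only runs once that count has reached the block's limit. A small stand-alone sketch of the shift-and-insert, with a plain struct standing in for the packed on-disk dx_entry:

#include <stdio.h>
#include <string.h>

struct dx_entry { unsigned hash; unsigned block; };

/*
 * Insert (hash, block) immediately after position 'at' in an index node that
 * currently holds 'count' of 'limit' entries.  Returns the new count, or 0 if
 * the node is full and has to be split first.
 */
static unsigned insert_after(struct dx_entry *entries, unsigned count,
                             unsigned limit, unsigned at,
                             unsigned hash, unsigned block)
{
    if (count >= limit)
        return 0;                         /* caller must split the index node */
    struct dx_entry *slot = entries + at + 1;
    memmove(slot + 1, slot, (count - at - 1) * sizeof(*slot));
    slot->hash = hash;
    slot->block = block;
    return count + 1;
}

int main(void)
{
    struct dx_entry idx[4] = { {0, 1}, {100, 2}, {200, 3} };
    unsigned count = insert_after(idx, 3, 4, 1, 150, 7);   /* goes between 100 and 200 */
    for (unsigned i = 0; i < count; i++)
        printf("{%u -> blk %u} ", idx[i].hash, idx[i].block);
    printf("\n");   /* {0 -> blk 1} {100 -> blk 2} {150 -> blk 7} {200 -> blk 3} */
    return 0;
}
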
*/ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext4_warning(sb, "Directory index full!"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, + sb->s_blocksize); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", + icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + + memcpy((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count(entries, icount1); + dx_set_count(entries2, icount2); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block(frames + 0, hash2, newblock); + dxtrace(dx_show_index("node", frames[1].entries)); + dxtrace(dx_show_index("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; + brelse (bh2); + } else { + dxtrace(printk(KERN_DEBUG + "Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + err = ext4_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; + } + err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); + if (err) { + ext4_std_error(inode->i_sb, err); + goto cleanup; + } + } + de = do_split(handle, dir, &bh, frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup; + +journal_error: + ext4_std_error(dir->i_sb, err); +cleanup: + if (bh) + brelse(bh); + dx_release(frames); + return err; +} + +/* + * ext4_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + struct ext4_dir_entry_2 *de, *pde; + unsigned int blocksize = dir->i_sb->s_blocksize; + int i, err; + + i = 0; + pde = NULL; + de = (struct ext4_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (ext4_check_dir_entry(dir, NULL, de, bh, i)) + return -EIO; + if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + 
if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } + if (pde) + pde->rec_len = ext4_rec_len_to_disk( + ext4_rec_len_from_disk(pde->rec_len, + blocksize) + + ext4_rec_len_from_disk(de->rec_len, + blocksize), + blocksize); + else + de->inode = 0; + dir->i_version++; + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } + return 0; + } + i += ext4_rec_len_from_disk(de->rec_len, blocksize); + pde = de; + de = ext4_next_entry(de, blocksize); + } + return -ENOENT; +} + +/* + * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, + * since this indicates that nlinks count was previously 1. + */ +static void ext4_inc_count(handle_t *handle, struct inode *inode) +{ + inc_nlink(inode); + if (is_dx(inode) && inode->i_nlink > 1) { + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { + inode->i_nlink = 1; + EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_DIR_NLINK); + } + } +} + +/* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. + */ +static void ext4_dec_count(handle_t *handle, struct inode *inode) +{ + drop_nlink(inode); + if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) + inc_nlink(inode); +} + + +static int ext4_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) +{ + int err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + } + drop_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return err; +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
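
ext4_delete_entry above never compacts the block: it either folds the victim's rec_len into the previous record or, when the victim is the first record in the block, simply clears its inode number so the slot reads as unused. A stand-alone sketch of that merge, again on a simplified record layout with illustrative names:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct dirent2 {
    uint32_t inode;
    uint16_t rec_len;
    uint8_t  name_len;
    uint8_t  file_type;
    char     name[];
};

/* Remove 'victim' from a block by merging it into the previous record. */
static void delete_entry(char *block, unsigned blocksize, struct dirent2 *victim)
{
    struct dirent2 *de = (struct dirent2 *)block, *prev = NULL;

    while ((char *)de < block + blocksize) {
        if (de == victim) {
            if (prev)
                prev->rec_len += de->rec_len;   /* previous record swallows the hole */
            else
                de->inode = 0;                  /* first record: just mark it unused */
            return;
        }
        prev = de;
        de = (struct dirent2 *)((char *)de + de->rec_len);
    }
}

int main(void)
{
    union { char bytes[64]; uint32_t align; } blk = {{0}};
    struct dirent2 *a = (struct dirent2 *)blk.bytes;
    struct dirent2 *b = (struct dirent2 *)(blk.bytes + 16);
    struct dirent2 *c = (struct dirent2 *)(blk.bytes + 32);

    a->inode = 1; a->rec_len = 16;
    b->inode = 2; b->rec_len = 16;
    c->inode = 3; c->rec_len = 32;          /* last record runs to the end of the block */

    delete_entry(blk.bytes, 64, b);
    printf("a->rec_len=%u (now covers b's slot)\n", (unsigned)a->rec_len);   /* 32 */
    return 0;
}
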
+ */ +static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + err = ext4_add_nondir(handle, dentry, inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT4_FS_XATTR + inode->i_op = &ext4_special_inode_operations; +#endif + err = ext4_add_nondir(handle, dentry, inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + handle_t *handle; + struct inode *inode; + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + unsigned int blocksize = dir->i_sb->s_blocksize; + int err, retries = 0; + + if (EXT4_DIR_LINK_MAX(dir)) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = ext4_new_inode(handle, dir, S_IFDIR | mode, + &dentry->d_name, 0); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext4_bread(handle, inode, 0, 1, &err); + if (!dir_block) + goto out_clear_inode; + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + de = (struct ext4_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); + strcpy(de->name, "."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), + blocksize); + de->name_len = 2; + strcpy(de->name, ".."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = 
ext4_handle_dirty_metadata(handle, inode, dir_block); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); + if (!err) + err = ext4_add_entry(handle, dentry, inode); + if (err) { +out_clear_inode: + clear_nlink(inode); + unlock_new_inode(inode); + ext4_mark_inode_dirty(handle, inode); + iput(inode); + goto out_stop; + } + ext4_inc_count(handle, dir); + ext4_update_dx_flag(dir); + err = ext4_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + d_instantiate(dentry, inode); + unlock_new_inode(inode); +out_stop: + brelse(dir_block); + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +static int empty_dir(struct inode *inode) +{ + unsigned int offset; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *de1; + struct super_block *sb; + int err = 0; + + sb = inode->i_sb; + if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || + !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { + if (err) + EXT4_ERROR_INODE(inode, + "error %d reading directory lblock 0", err); + else + ext4_warning(inode->i_sb, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext4_dir_entry_2 *) bh->b_data; + de1 = ext4_next_entry(de, sb->s_blocksize); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp(".", de->name) || + strcmp("..", de1->name)) { + ext4_warning(inode->i_sb, + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse(bh); + return 1; + } + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + + ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); + de = ext4_next_entry(de1, sb->s_blocksize); + while (offset < inode->i_size) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + unsigned int lblock; + err = 0; + brelse(bh); + lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); + bh = ext4_bread(NULL, inode, lblock, 0, &err); + if (!bh) { + if (err) + EXT4_ERROR_INODE(inode, + "error %d reading directory " + "lblock %u", err, lblock); + offset += sb->s_blocksize; + continue; + } + de = (struct ext4_dir_entry_2 *) bh->b_data; + } + if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { + de = (struct ext4_dir_entry_2 *)(bh->b_data + + sb->s_blocksize); + offset = (offset | (sb->s_blocksize - 1)) + 1; + continue; + } + if (le32_to_cpu(de->inode)) { + brelse(bh); + return 0; + } + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); + } + brelse(bh); + return 1; +} + +/* ext4_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext4_orphan_cleanup(). + */ +int ext4_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext4_iloc iloc; + int err = 0, rc; + + if (!ext4_handle_valid(handle)) + return 0; + + mutex_lock(&EXT4_SB(sb)->s_orphan_lock); + if (!list_empty(&EXT4_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. 
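
The on-disk orphan list maintained here is a singly linked list threaded through the inodes themselves: the superblock's s_last_orphan field names the head, and each orphaned inode stores the next inode number in the slot NEXT_ORPHAN() resolves to (which normally holds the deletion time). Insertion is always at the head, which keeps the transaction small. A toy model, with plain integers standing in for the on-disk fields:

#include <stdio.h>

#define MAX_INO 16

struct toy_fs {
    unsigned s_last_orphan;        /* superblock: head of the orphan chain, 0 = empty */
    unsigned next_orphan[MAX_INO]; /* per-inode: next inode number in the chain */
};

/* Push an inode onto the head of the orphan chain, as ext4_orphan_add does on disk. */
static void orphan_add(struct toy_fs *fs, unsigned ino)
{
    fs->next_orphan[ino] = fs->s_last_orphan;
    fs->s_last_orphan = ino;
}

/* Walk the chain, the way orphan cleanup does after an unclean shutdown. */
static void orphan_walk(const struct toy_fs *fs)
{
    for (unsigned ino = fs->s_last_orphan; ino; ino = fs->next_orphan[ino])
        printf("would clean up inode %u\n", ino);
}

int main(void)
{
    struct toy_fs fs = {0};
    orphan_add(&fs, 12);   /* an unlinked-but-still-open file */
    orphan_add(&fs, 7);    /* a file in the middle of a truncate */
    orphan_walk(&fs);      /* prints 7 then 12 */
    return 0;
}
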
*/ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext4_orphan_add(). We block + * here (on s_orphan_lock), so race with ext4_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? + */ + J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_unlock; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification. + */ + if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) + goto mem_insert; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); + EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ +mem_insert: + if (!err) + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out_unlock: + mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * ext4_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext4_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi; + __u32 ino_next; + struct ext4_iloc iloc; + int err = 0; + + /* ext4_handle_valid() assumes a valid handle_t pointer */ + if (handle && !ext4_handle_valid(handle)) + return 0; + + mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; + sbi = EXT4_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. 
*/ + if (sbi->s_journal && !handle) + goto out; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %u\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + } else { + struct ext4_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %u\n", + i_prev->i_ino, ino_next); + err = ext4_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + +out_err: + ext4_std_error(inode->i_sb, err); +out: + mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +static int ext4_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext4_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_rmdir; + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir(inode)) + goto end_rmdir; + + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (!EXT4_DIR_LINK_EMPTY(inode)) + ext4_warning(inode->i_sb, + "empty directory has too many links (%d)", + inode->i_nlink); + inode->i_version++; + clear_nlink(inode); + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. 
*/ + inode->i_size = 0; + ext4_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_dec_count(handle, dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); + +end_rmdir: + ext4_journal_stop(handle); + brelse(bh); + return retval; +} + +static int ext4_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle; + + trace_ext4_unlink_enter(dir, dentry); + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + retval = -ENOENT; + bh = ext4_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext4_warning(inode->i_sb, + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); + drop_nlink(inode); + if (!inode->i_nlink) + ext4_orphan_add(handle, inode); + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + retval = 0; + +end_unlink: + ext4_journal_stop(handle); + brelse(bh); + trace_ext4_unlink_exit(dentry, retval); + return retval; +} + +static int ext4_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) +{ + handle_t *handle; + struct inode *inode; + int l, err, retries = 0; + int credits; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + dquot_initialize(dir); + + if (l > EXT4_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. + */ + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS; + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } +retry: + handle = ext4_journal_start(dir, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > EXT4_N_BLOCKS * 4) { + inode->i_op = &ext4_symlink_inode_operations; + ext4_set_aops(inode); + /* + * We cannot call page_symlink() with transaction started + * because it calls into ext4_write_begin() which can wait + * for transaction commit if we are running out of space + * and thus we deadlock. So we have to stop transaction now + * and restart it when symlink contents is written. 
+ * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. + */ + drop_nlink(inode); + err = ext4_orphan_add(handle, inode); + ext4_journal_stop(handle); + if (err) + goto err_drop_inode; + err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS + * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext4_journal_start(dir, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + inc_nlink(inode); + err = ext4_orphan_del(handle, inode); + if (err) { + ext4_journal_stop(handle); + clear_nlink(inode); + goto err_drop_inode; + } + } else { + /* clear the extent format for fast symlink */ + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + inode->i_op = &ext4_fast_symlink_inode_operations; + memcpy((char *)&EXT4_I(inode)->i_data, symname, l); + inode->i_size = l-1; + } + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_add_nondir(handle, dentry, inode); +out_stop: + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; +} + +static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + + if (inode->i_nlink >= EXT4_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + + inode->i_ctime = ext4_current_time(inode); + ext4_inc_count(handle, inode); + ihold(inode); + + err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +#define PARENT_INO(buffer, size) \ + (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + handle_t *handle; + struct inode *old_inode, *new_inode; + struct buffer_head *old_bh, *new_bh, *dir_bh; + struct ext4_dir_entry_2 *old_de, *new_de; + int retval, force_da_alloc = 0; + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_bh = new_bh = dir_bh = NULL; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + if (new_dentry->d_inode) + dquot_initialize(new_dentry->d_inode); + handle = ext4_journal_start(old_dir, 2 * + EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + ext4_handle_sync(handle); + + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); + /* + * Check for inode number is _not_ due to possible IO errors. 
+ * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + old_inode = old_dentry->d_inode; + retval = -ENOENT; + if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + goto end_rename; + + new_inode = new_dentry->d_inode; + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); + if (new_bh) { + if (!new_inode) { + brelse(new_bh); + new_bh = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir(new_inode)) + goto end_rename; + } + retval = -EIO; + dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + if (le32_to_cpu(PARENT_INO(dir_bh->b_data, + old_dir->i_sb->s_blocksize)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir != old_dir && + EXT4_DIR_LINK_MAX(new_dir)) + goto end_rename; + BUFFER_TRACE(dir_bh, "get_write_access"); + retval = ext4_journal_get_write_access(handle, dir_bh); + if (retval) + goto end_rename; + } + if (!new_bh) { + retval = ext4_add_entry(handle, new_dentry, old_inode); + if (retval) + goto end_rename; + } else { + BUFFER_TRACE(new_bh, "get write access"); + retval = ext4_journal_get_write_access(handle, new_bh); + if (retval) + goto end_rename; + new_de->inode = cpu_to_le32(old_inode->i_ino); + if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, + EXT4_FEATURE_INCOMPAT_FILETYPE)) + new_de->file_type = old_de->file_type; + new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = + ext4_current_time(new_dir); + ext4_mark_inode_dirty(handle, new_dir); + BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); + retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; + } + brelse(new_bh); + new_bh = NULL; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = ext4_current_time(old_inode); + ext4_mark_inode_dirty(handle, old_inode); + + /* + * ok, that's it + */ + if (le32_to_cpu(old_de->inode) != old_inode->i_ino || + old_de->name_len != old_dentry->d_name.len || + strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || + (retval = ext4_delete_entry(handle, old_dir, + old_de, old_bh)) == -ENOENT) { + /* old_de could have moved from under us during htree split, so + * make sure that we are deleting the right entry. We might + * also be pointing to a stale entry in the unused part of + * old_bh so just checking inum and the name isn't enough. 
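
The PARENT_INO macro used above relies on the convention that a directory's first block begins with the "." record followed immediately by the ".." record, so stepping over one record with ext4_next_entry lands on the parent pointer that a cross-directory rename of a directory has to rewrite. A stand-alone sketch of that layout and rewrite, with a simplified record struct and illustrative inode numbers:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct dirent2 {
    uint32_t inode;
    uint16_t rec_len;
    uint8_t  name_len;
    uint8_t  file_type;
    char     name[];
};

/* Step to the record after 'de', which is all ext4_next_entry does. */
static struct dirent2 *next_entry(struct dirent2 *de)
{
    return (struct dirent2 *)((char *)de + de->rec_len);
}

/* "..": the second record of a directory's first block. */
static uint32_t *parent_ino(char *block)
{
    return &next_entry((struct dirent2 *)block)->inode;
}

int main(void)
{
    union { char bytes[64]; uint32_t align; } blk = {{0}};
    struct dirent2 *dot = (struct dirent2 *)blk.bytes;

    dot->inode = 100; dot->name_len = 1; dot->rec_len = 12;
    memcpy(dot->name, ".", 1);
    struct dirent2 *dotdot = next_entry(dot);
    dotdot->inode = 2;  dotdot->name_len = 2; dotdot->rec_len = 52;
    memcpy(dotdot->name, "..", 2);

    /* A cross-directory rename of this directory rewrites ".." like so: */
    *parent_ino(blk.bytes) = 200;
    printf("parent is now inode %u\n", (unsigned)*parent_ino(blk.bytes));   /* 200 */
    return 0;
}
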
*/ + struct buffer_head *old_bh2; + struct ext4_dir_entry_2 *old_de2; + + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); + if (old_bh2) { + retval = ext4_delete_entry(handle, old_dir, + old_de2, old_bh2); + brelse(old_bh2); + } + } + if (retval) { + ext4_warning(old_dir->i_sb, + "Deleting old file (%lu), %d, error=%d", + old_dir->i_ino, old_dir->i_nlink, retval); + } + + if (new_inode) { + ext4_dec_count(handle, new_inode); + new_inode->i_ctime = ext4_current_time(new_inode); + } + old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); + ext4_update_dx_flag(old_dir); + if (dir_bh) { + PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = + cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); + retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); + if (retval) { + ext4_std_error(old_dir->i_sb, retval); + goto end_rename; + } + ext4_dec_count(handle, old_dir); + if (new_inode) { + /* checked empty_dir above, can't have another parent, + * ext4_dec_count() won't work for many-linked dirs */ + new_inode->i_nlink = 0; + } else { + ext4_inc_count(handle, new_dir); + ext4_update_dx_flag(new_dir); + ext4_mark_inode_dirty(handle, new_dir); + } + } + ext4_mark_inode_dirty(handle, old_dir); + if (new_inode) { + ext4_mark_inode_dirty(handle, new_inode); + if (!new_inode->i_nlink) + ext4_orphan_add(handle, new_inode); + if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) + force_da_alloc = 1; + } + retval = 0; + +end_rename: + brelse(dir_bh); + brelse(old_bh); + brelse(new_bh); + ext4_journal_stop(handle); + if (retval == 0 && force_da_alloc) + ext4_alloc_da_blocks(old_inode); + return retval; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations ext4_dir_inode_operations = { + .create = ext4_create, + .lookup = ext4_lookup, + .link = ext4_link, + .unlink = ext4_unlink, + .symlink = ext4_symlink, + .mkdir = ext4_mkdir, + .rmdir = ext4_rmdir, + .mknod = ext4_mknod, + .rename = ext4_rename, + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext4_check_acl, + .fiemap = ext4_fiemap, +}; + +const struct inode_operations ext4_special_inode_operations = { + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .check_acl = ext4_check_acl, +}; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c new file mode 100644 index 00000000..d99d74ac --- /dev/null +++ b/fs/ext4/page-io.c @@ -0,0 +1,447 @@ +/* + * linux/fs/ext4/page-io.c + * + * This contains the new page_io functions for ext4 + * + * Written by Theodore Ts'o, 2010. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "ext4_extents.h" + +static struct kmem_cache *io_page_cachep, *io_end_cachep; + +int __init ext4_init_pageio(void) +{ + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); + if (io_page_cachep == NULL) + return -ENOMEM; + io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); + if (io_end_cachep == NULL) { + kmem_cache_destroy(io_page_cachep); + return -ENOMEM; + } + return 0; +} + +void ext4_exit_pageio(void) +{ + kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_page_cachep); +} + +void ext4_ioend_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ext4_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +} + +static void put_io_page(struct ext4_io_page *io_page) +{ + if (atomic_dec_and_test(&io_page->p_count)) { + end_page_writeback(io_page->p_page); + put_page(io_page->p_page); + kmem_cache_free(io_page_cachep, io_page); + } +} + +void ext4_free_io_end(ext4_io_end_t *io) +{ + int i; + wait_queue_head_t *wq; + + BUG_ON(!io); + if (io->page) + put_page(io->page); + for (i = 0; i < io->num_io_pages; i++) + put_io_page(io->pages[i]); + io->num_io_pages = 0; + wq = ext4_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && + waitqueue_active(wq)) + wake_up_all(wq); + kmem_cache_free(io_end_cachep, io); +} + +/* + * check a range of space and convert unwritten extents to written. + */ +int ext4_end_io_nolock(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + loff_t offset = io->offset; + ssize_t size = io->size; + wait_queue_head_t *wq; + int ret = 0; + + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + if (list_empty(&io->list)) + return ret; + + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) + return ret; + + ret = ext4_convert_unwritten_extents(inode, offset, size); + if (ret < 0) { + printk(KERN_EMERG "%s: failed to convert unwritten " + "extents to written extents, error is %d " + "io is still on inode %lu aio dio list\n", + __func__, ret, inode->i_ino); + return ret; + } + + if (io->iocb) + aio_complete(io->iocb, io->result, 0); + /* clear the DIO AIO unwritten flag */ + if (io->flag & EXT4_IO_END_UNWRITTEN) { + io->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + wq = ext4_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) && + waitqueue_active(wq)) { + wake_up_all(wq); + } + } + + return ret; +} + +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_io_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret; + + if (!mutex_trylock(&inode->i_mutex)) { + /* + * Requeue the work instead of waiting so that the work + * items queued after this can be processed. + */ + queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); + /* + * To prevent the ext4-dio-unwritten thread from keeping + * requeueing end_io requests and occupying cpu for too long, + * yield the cpu if it sees an end_io request that has already + * been requeued. 
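
put_io_page above is a plain refcount drop: every in-flight user of an ext4_io_page holds a reference, and whoever drops the last one ends writeback on the page and frees the tracking structure. A stand-alone sketch of that pattern using C11 atomics; the names are illustrative and atomic_fetch_sub stands in for the kernel's atomic_dec_and_test:

#include <stdio.h>
#include <stdatomic.h>

struct io_page {
    atomic_int count;   /* one reference per in-flight user of the page */
    int page_id;
};

/* Drop a reference; the last dropper ends writeback and frees the tracking struct. */
static void put_io_page(struct io_page *p)
{
    if (atomic_fetch_sub(&p->count, 1) == 1)
        printf("page %d: end_page_writeback + free\n", p->page_id);
}

int main(void)
{
    struct io_page pg;
    atomic_init(&pg.count, 2);   /* the submitter holds one ref, the bio completion the other */
    pg.page_id = 42;

    put_io_page(&pg);            /* completion path: still referenced, nothing happens */
    put_io_page(&pg);            /* submitter's put: last reference, writeback is ended */
    return 0;
}
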
+ */ + if (io->flag & EXT4_IO_END_QUEUED) + yield(); + io->flag |= EXT4_IO_END_QUEUED; + return; + } + ret = ext4_end_io_nolock(io); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + return; + } + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (!list_empty(&io->list)) + list_del_init(&io->list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + mutex_unlock(&inode->i_mutex); + ext4_free_io_end(io); +} + +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) +{ + ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); + if (io) { + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; + INIT_WORK(&io->work, ext4_end_io_work); + INIT_LIST_HEAD(&io->list); + } + return io; +} + +/* + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... + */ +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_end_bio(struct bio *bio, int error) +{ + ext4_io_end_t *io_end = bio->bi_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + int i; + sector_t bi_sector = bio->bi_sector; + + BUG_ON(!io_end); + bio->bi_private = NULL; + bio->bi_end_io = NULL; + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = 0; + bio_put(bio); + + for (i = 0; i < io_end->num_io_pages; i++) { + struct page *page = io_end->pages[i]->p_page; + struct buffer_head *bh, *head; + loff_t offset; + loff_t io_end_offset; + + if (error) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + head = page_buffers(page); + BUG_ON(!head); + + io_end_offset = io_end->offset + io_end->size; + + offset = (sector_t) page->index << PAGE_CACHE_SHIFT; + bh = head; + do { + if ((offset >= io_end->offset) && + (offset+bh->b_size <= io_end_offset)) + buffer_io_error(bh); + + offset += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + put_io_page(io_end->pages[i]); + } + io_end->num_io_pages = 0; + inode = io_end->inode; + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bi_sector >> (inode->i_blkbits - 9)); + } + + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + ext4_free_io_end(io_end); + return; + } + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +} + +void ext4_io_submit(struct ext4_io_submit *io) +{ + struct bio *bio = io->io_bio; + + if (bio) { + bio_get(io->io_bio); + submit_bio(io->io_op, io->io_bio); + BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); + bio_put(io->io_bio); + } + io->io_bio = NULL; + io->io_op = 0; + io->io_end = NULL; +} + +static int io_submit_init(struct ext4_io_submit *io, + 
struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio; + + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) + return -ENOMEM; + do { + bio = bio_alloc(GFP_NOIO, nvecs); + nvecs >>= 1; + } while (bio == NULL); + + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_private = io->io_end = io_end; + bio->bi_end_io = ext4_end_bio; + + io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); + + io->io_bio = bio; + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_next_block = bh->b_blocknr; + return 0; +} + +static int io_submit_add_bh(struct ext4_io_submit *io, + struct ext4_io_page *io_page, + struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + ext4_io_end_t *io_end; + int ret; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + + if (!buffer_mapped(bh) || buffer_delay(bh)) { + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + if (io->io_bio) + ext4_io_submit(io); + return 0; + } + + if (io->io_bio && bh->b_blocknr != io->io_next_block) { +submit_and_retry: + ext4_io_submit(io); + } + if (io->io_bio == NULL) { + ret = io_submit_init(io, inode, wbc, bh); + if (ret) + return ret; + } + io_end = io->io_end; + if ((io_end->num_io_pages >= MAX_IO_PAGES) && + (io_end->pages[io_end->num_io_pages-1] != io_page)) + goto submit_and_retry; + if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } + io->io_end->size += bh->b_size; + io->io_next_block++; + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (ret != bh->b_size) + goto submit_and_retry; + if ((io_end->num_io_pages == 0) || + (io_end->pages[io_end->num_io_pages-1] != io_page)) { + io_end->pages[io_end->num_io_pages++] = io_page; + atomic_inc(&io_page->p_count); + } + return 0; +} + +int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, block_end, blocksize; + struct ext4_io_page *io_page; + struct buffer_head *bh, *head; + int ret = 0; + + blocksize = 1 << inode->i_blkbits; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); + if (!io_page) { + set_page_dirty(page); + unlock_page(page); + return -ENOMEM; + } + io_page->p_page = page; + atomic_set(&io_page->p_count, 1); + get_page(page); + set_page_writeback(page); + ClearPageError(page); + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + block_end = block_start + blocksize; + if (block_start >= len) { + /* + * Comments copied from block_write_full_page_endio: + * + * The page straddles i_size. It must be zeroed out on + * each and every writepage invocation because it may + * be mmapped. "A file is mapped in multiples of the + * page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when + * mapped, and writes to that region are not written + * out to the file." 
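+ *
+ * As an illustrative example (numbers assumed here, not taken from
+ * the original comment): with a 4096-byte page, 1024-byte blocks and
+ * i_size ending 1000 bytes into this page, len is 1000. The buffer
+ * at block_start 0 is submitted for I/O, while the buffers starting
+ * at 1024, 2048 and 3072 satisfy block_start >= len and are simply
+ * zeroed and marked uptodate below.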
+ */ + zero_user_segment(page, block_start, block_end); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + clear_buffer_dirty(bh); + ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (ret) { + /* + * We only get here on ENOMEM. Not much else + * we can do but mark the page as dirty, and + * better luck next time. + */ + set_page_dirty(page); + break; + } + } + unlock_page(page); + /* + * If the page was truncated before we could do the writeback, + * or we had a memory allocation error while trying to write + * the first buffer head, we won't have submitted any pages for + * I/O. In that case we need to make sure we've cleared the + * PageWriteback bit from the page to prevent the system from + * wedging later on. + */ + put_io_page(io_page); + return ret; +} diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c new file mode 100644 index 00000000..80bbc9c6 --- /dev/null +++ b/fs/ext4/resize.c @@ -0,0 +1,1076 @@ +/* + * linux/fs/ext4/resize.c + * + * Support for resizing an ext4 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. + */ + + +#define EXT4FS_DEBUG + +#include +#include + +#include "ext4_jbd2.h" + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t start = ext4_blocks_count(es); + ext4_fsblk_t end = start + input->blocks_count; + ext4_group_t group = input->group; + ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead = ext4_bg_has_super(sb, group) ? + (1 + ext4_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + ext4_fsblk_t metaend = start + overhead; + struct buffer_head *bh = NULL; + ext4_grpblk_t free_blocks_count, offset; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext4_bg_has_super(sb, input->group) ? 
"normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + ext4_get_group_no_and_offset(sb, start, NULL, &offset); + if (group != sbi->s_groups_count) + ext4_warning(sb, "Cannot add at group %u (only %u groups)", + input->group, sbi->s_groups_count); + else if (offset != 0) + ext4_warning(sb, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext4_warning(sb, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext4_warning(sb, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext4_warning(sb, "Cannot read last block (%llu)", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext4_warning(sb, "Block bitmap not in group (block %llu)", + (unsigned long long)input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext4_warning(sb, "Inode bitmap not in group (block %llu)", + (unsigned long long)input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", + (unsigned long long)input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", + (unsigned long long)input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Block bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->block_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Inode bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->inode_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->block_bitmap, start, metaend)) + ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", + (unsigned long long)input->block_bitmap, + start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", + (unsigned long long)input->inode_bitmap, + start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " + "(%llu-%llu)", + (unsigned long long)input->inode_table, + itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + ext4_fsblk_t blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if (!bh) + return ERR_PTR(-EIO); + if ((err = ext4_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + } + + return bh; +} + +/* + * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for block_bitmap modifications. 
+ */ +static int extend_or_restart_transaction(handle_t *handle, int thresh, + struct buffer_head *bh) +{ + int err; + + if (ext4_handle_has_enough_credits(handle, thresh)) + return 0; + + err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); + if (err < 0) + return err; + if (err) { + if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + return err; + if ((err = ext4_journal_get_write_access(handle, bh))) + return err; + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + */ +static int setup_new_group_blocks(struct super_block *sb, + struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; + unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group); + struct buffer_head *bh; + handle_t *handle; + ext4_fsblk_t block; + ext4_grpblk_t bit; + int i; + int err = 0, err2; + + /* This transaction may be extended/restarted along the way */ + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + err = -EBUSY; + goto exit_journal; + } + + if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext4_bg_has_super(sb, input->group)) { + ext4_debug("mark backup superblock %#04llx (+0)\n", start); + ext4_set_bit(0, bh->b_data); + } + + /* Copy all of the GDT blocks into the backup in this group */ + for (i = 0, bit = 1, block = start + 1; + i < gdblocks; i++, block++, bit++) { + struct buffer_head *gdb; + + ext4_debug("update backup group %#04llx (+%d)\n", block, bit); + + if ((err = extend_or_restart_transaction(handle, 1, bh))) + goto exit_bh; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto exit_bh; + } + if ((err = ext4_journal_get_write_access(handle, gdb))) { + brelse(gdb); + goto exit_bh; + } + lock_buffer(gdb); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); + set_buffer_uptodate(gdb); + unlock_buffer(gdb); + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto exit_bh; + } + ext4_set_bit(bit, bh->b_data); + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor table blocks */ + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, + GFP_NOFS); + if (err) + goto exit_bh; + for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) + ext4_set_bit(bit, bh->b_data); + + ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, + input->block_bitmap - start); + ext4_set_bit(input->block_bitmap - start, bh->b_data); + ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, + input->inode_bitmap - start); + ext4_set_bit(input->inode_bitmap - start, bh->b_data); + + /* Zero out all of the inode table blocks */ + block = input->inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); 
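+ /*
+  * Illustrative sketch (assuming a group that carries a superblock
+  * backup) of what this function marks in the new group's block
+  * bitmap; offsets are relative to "start":
+  *
+  *	+0					superblock backup
+  *	+1 .. +gdblocks				GDT backup blocks
+  *	+gdblocks+1 .. +gdblocks+reserved_gdb	reserved GDT blocks
+  *	input->block_bitmap - start		block bitmap
+  *	input->inode_bitmap - start		inode bitmap
+  *	input->inode_table - start ...		inode table (s_itb_per_group blocks)
+  */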
+ err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); + if (err) + goto exit_bh; + for (i = 0, bit = input->inode_table - start; + i < sbi->s_itb_per_group; i++, bit++) + ext4_set_bit(bit, bh->b_data); + + if ((err = extend_or_restart_transaction(handle, 2, bh))) + goto exit_bh; + + ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, + bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_bh; + } + brelse(bh); + /* Mark unused entries in inode bitmap used */ + ext4_debug("clear inode bitmap %#04llx (+%llu)\n", + input->inode_bitmap, input->inode_bitmap - start); + if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); +exit_bh: + brelse(bh); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + struct buffer_head *primary) +{ + const ext4_fsblk_t blk = primary->b_blocknr; + const ext4_group_t end = EXT4_SB(sb)->s_groups_count; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __le32 *p = (__le32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != + grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ + ext4_warning(sb, "reserved GDT %llu" + " missing grp %d (%llu)", + blk, grp, + grp * + (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + blk); + return -EINVAL; + } + if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. 
+ * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. + */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + struct ext4_new_group_data *input, + struct buffer_head **primary) +{ + struct super_block *sb = inode->i_sb; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + int gdbackups; + struct ext4_iloc iloc; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT4_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + ext4_warning(sb, "won't resize using backup superblock at %llu", + (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + *primary = sb_bread(sb, gdblock); + if (!*primary) + return -EIO; + + if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext4_warning(sb, "new group %u GDT block %llu not reserved", + input->group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (unlikely(err)) + goto exit_dind; + + err = ext4_journal_get_write_access(handle, *primary); + if (unlikely(err)) + goto exit_sbh; + + err = ext4_journal_get_write_access(handle, dind); + if (unlikely(err)) + ext4_std_error(sb, err); + + /* ext4_reserve_inode_write() gets a reference on the iloc */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (unlikely(err)) + goto exit_dindj; + + n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, + "not enough memory for %lu groups", gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). 
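+ *
+ * As a worked example (typical values assumed: 4k blocks and 32-byte
+ * descriptors, so EXT4_DESC_PER_BLOCK(sb) == 128 and
+ * EXT4_ADDR_PER_BLOCK(sb) == 1024): adding group 128 gives
+ * gdb_num == 1, the new primary GDT block sits at
+ * s_sbh->b_blocknr + 2, and the entry cleared below is data[1] of
+ * the resize inode's double-indirect block.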
+ */ + data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + memset((*primary)->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, *primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + brelse(dind); + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = *primary; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + kfree(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + if (err) + ext4_std_error(sb, err); + + return err; + +exit_inode: + /* ext4_handle_release_buffer(handle, iloc.bh); */ + brelse(iloc.bh); +exit_dindj: + /* ext4_handle_release_buffer(handle, dind); */ +exit_sbh: + /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ +exit_dind: + brelse(dind); +exit_bh: + brelse(*primary); + + ext4_debug("leaving with error %d\n", err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. 
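+ *
+ * What the final loop below boils down to for each reserved primary
+ * GDT block (illustrative), with blk being the first block of the
+ * new group:
+ *
+ *	data = (__le32 *)primary[i]->b_data;
+ *	data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
+ *
+ * i.e. entry "gdbackups" of each reserved primary GDT block records
+ * where that block's backup copy lives inside the new group.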
+ */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + struct ext4_new_group_data *input) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext4_iloc iloc; + ext4_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + if (!primary) + return -ENOMEM; + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % + EXT4_ADDR_PER_BLOCK(sb)); + end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext4_warning(sb, "reserved block %llu" + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext4_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext4_handle_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext4 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. 
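+ *
+ * For reference (illustrative, as produced by ext4_list_backups()):
+ * with the sparse_super feature the backup groups visited are
+ *
+ *	1, 3, 5, 7, 9, 25, 27, 49, 81, 125, 243, ...
+ *
+ * i.e. the merged increasing sequence of powers of 3, 5 and 7;
+ * without sparse_super every group 1, 2, 3, ... receives a copy.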
+ */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const ext4_group_t last = sbi->s_groups_count; + const int bpg = EXT4_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + ext4_group_t group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (ext4_handle_valid(handle) && + handle->h_buffer_credits == 0 && + ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && + (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + if (!bh) { + err = -EIO; + break; + } + ext4_debug("update metadata backup %#04lx\n", + (unsigned long)bh->b_blocknr); + if ((err = ext4_journal_get_write_access(handle, bh))) + break; + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); + brelse(bh); + } + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext4_warning(sb, "can't update backup for group %u (err %d), " + "forcing fsck on next reboot", group, err); + sbi->s_mount_state &= ~EXT4_VALID_FS; + sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 
+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct buffer_head *primary = NULL; + struct ext4_group_desc *gdp; + struct inode *inode = NULL; + handle_t *handle; + int gdb_off, gdb_num; + int err, err2; + + gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext4_warning(sb, "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (ext4_blocks_count(es) + input->blocks_count < + ext4_blocks_count(es)) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext4_warning(sb, "inodes_count overflow"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT4_HAS_COMPAT_FEATURE(sb, + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext4_warning(sb, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + + if ((err = verify_group_input(sb, input))) + goto exit_put; + + if ((err = setup_new_group_blocks(sb, input))) + goto exit_put; + + /* + * We will always be modifying at least the superblock and a GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + handle = ext4_journal_start_sb(sb, + ext4_bg_has_super(sb, input->group) ? + 3 + reserved_gdb : 4); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit_put; + } + + mutex_lock(&sbi->s_resize_lock); + if (input->group != sbi->s_groups_count) { + ext4_warning(sb, "multiple resizers run on filesystem!"); + err = -EBUSY; + goto exit_journal; + } + + if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) + goto exit_journal; + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + primary = sbi->s_group_desc[gdb_num]; + if ((err = ext4_journal_get_write_access(handle, primary))) + goto exit_journal; + + if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && + (err = reserve_backup_gdb(handle, inode, input))) + goto exit_journal; + } else if ((err = add_new_gdb(handle, inode, input, &primary))) + goto exit_journal; + + /* + * OK, now we've set up the new group. Time to make it active. + * + * We do not lock all allocations via s_resize_lock + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is sbi->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. + * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. 
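+ *
+ * Illustrative summary of that ordering as implemented below:
+ *
+ *	1. fill in the new group descriptor and mballoc group info;
+ *	2. grow ext4_blocks_count() and s_inodes_count (the totals);
+ *	3. smp_wmb(); sbi->s_groups_count++ (the group becomes visible);
+ *	4. update the free block/inode counters (space becomes usable).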
+ */ + + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)((char *)primary->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ + ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ + ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ + ext4_free_blks_set(sb, gdp, input->free_blocks_count); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, input->group, gdp); + if (err) + goto exit_journal; + + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + ext4_blocks_count_set(es, ext4_blocks_count(es) + + input->blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers of s_groups_count *must* hold s_resize_lock + * AND + * * Writers must perform a smp_wmb() after updating all dependent + * data and before modifying the groups count + * + * * Readers must hold s_resize_lock over the access + * OR + * * Readers must perform an smp_rmb() after reading the groups count + * and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count++; + + err = ext4_handle_dirty_metadata(handle, NULL, primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_journal; + } + + /* Update the reserved block counts only once the new group is + * active. 
*/ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + input->reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeblocks_counter, + input->free_blocks_count); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb)); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, input->group); + atomic_add(input->free_blocks_count, + &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_INODES_PER_GROUP(sb), + &sbi->s_flex_groups[flex_group].free_inodes); + } + + ext4_handle_dirty_super(handle, sb); + +exit_journal: + mutex_unlock(&sbi->s_resize_lock); + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + if (!err) { + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + update_backups(sb, primary->b_blocknr, primary->b_data, + primary->b_size); + } +exit_put: + iput(inode); + return err; +} /* ext4_group_add */ + +/* + * Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext4_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count) +{ + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_grpblk_t add; + struct buffer_head *bh; + handle_t *handle; + int err; + ext4_group_t group; + + /* We don't need to worry about locking wrt other resizers just + * yet: we're going to revalidate es->s_blocks_count after + * taking the s_resize_lock below. */ + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT4-fs: filesystem on %s:" + " too large to resize to %llu blocks safely\n", + sb->s_id, n_blocks_count); + if (sizeof(sector_t) < 8) + ext4_warning(sb, "CONFIG_LBDAF not enabled"); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EBUSY; + } + + /* Handle the remaining blocks in the last group only. 
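+ *
+ * For example (illustrative numbers, assuming 32768 blocks per
+ * group): if o_blocks_count ends 1000 blocks into the last group,
+ * "last" is 1000 and at most add = 32768 - 1000 = 31768 blocks can
+ * be added here; growing past the end of this group needs
+ * ext4_group_add() instead.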
*/ + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + + if (last == 0) { + ext4_warning(sb, "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT4_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext4_warning(sb, "will only finish group (%llu blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_free_blocks(). + */ + handle = ext4_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_warning(sb, "error %d on journal start", err); + goto exit_put; + } + + mutex_lock(&EXT4_SB(sb)->s_resize_lock); + if (o_blocks_count != ext4_blocks_count(es)) { + ext4_warning(sb, "multiple resizers run on filesystem!"); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); + ext4_journal_stop(handle); + err = -EBUSY; + goto exit_put; + } + + if ((err = ext4_journal_get_write_access(handle, + EXT4_SB(sb)->s_sbh))) { + ext4_warning(sb, "error %d on journal write access", err); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); + ext4_journal_stop(handle); + goto exit_put; + } + ext4_blocks_count_set(es, o_blocks_count + add); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + ext4_add_groupblocks(handle, sb, o_blocks_count, add); + ext4_handle_dirty_super(handle, sb); + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + if ((err = ext4_journal_stop(handle))) + goto exit_put; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", + ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); +exit_put: + return err; +} /* ext4_group_extend */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c new file mode 100644 index 00000000..113b1076 --- /dev/null +++ b/fs/ext4/super.c @@ -0,0 +1,5027 @@ +/* + * linux/fs/ext4/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "mballoc.h" + +#define CREATE_TRACE_POINTS +#include + +static struct proc_dir_entry *ext4_proc_root; +static struct kset *ext4_kset; +static struct ext4_lazy_init *ext4_li_info; +static struct mutex ext4_li_mtx; +static struct ext4_features *ext4_feat; + +static int ext4_load_journal(struct super_block *, struct ext4_super_block *, + unsigned long journal_devnum); +static int ext4_commit_super(struct super_block *sb, int sync); +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es); +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); +static int ext4_sync_fs(struct super_block *sb, int wait); +static const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +static int ext4_remount(struct super_block *sb, int *flags, char *data); +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); +static int ext4_unfreeze(struct super_block *sb); +static void ext4_write_super(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); +static inline int ext2_feature_set_ok(struct super_block *sb); +static inline int ext3_feature_set_ok(struct super_block *sb); +static int ext4_feature_set_ok(struct super_block *sb, int readonly); +static void ext4_destroy_lazyinit_thread(void); +static void ext4_unregister_li_request(struct super_block *sb); +static void ext4_clear_request_list(void); + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#else +#define IS_EXT2_SB(sb) (0) +#endif + + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +#else +#define IS_EXT3_SB(sb) (0) +#endif + +ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_block_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_table_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); +} + +__u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_blocks_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); +} + +__u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_inodes_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); +} + +__u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_used_dirs_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); +} + +__u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_itable_unused_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); +} + +void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_table_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); +} + +void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); +} + + +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = 
handle; +} + +/* + * Wrappers for jbd2_journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + * + * To avoid j_barrier hold in userspace when a user calls freeze(), + * ext4 prevents a new handle from being started by s_frozen, which + * is in an upper layer. + */ +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) +{ + journal_t *journal; + handle_t *handle; + + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + journal = EXT4_SB(sb)->s_journal; + handle = ext4_journal_current_handle(); + + /* + * If a handle has been started, it should be allowed to + * finish, otherwise deadlock could happen between freeze + * and others(e.g. truncate) due to the restart of the + * journal handle if the filesystem is forzen and active + * handles are not stopped. + */ + if (!handle) + vfs_check_frozen(sb, SB_FREEZE_TRANS); + + if (!journal) + return ext4_get_nojournal(); + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. + */ + if (is_journal_aborted(journal)) { + ext4_abort(sb, "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + return jbd2_journal_start(journal, nblocks); +} + +/* + * The only special thing we need to do here is to make sure that all + * jbd2_journal_stop calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + if (!ext4_handle_valid(handle)) { + ext4_put_nojournal(handle); + return 0; + } + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext4_std_error(sb, where, line, err); + return err; +} + +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, + handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + BUG_ON(!ext4_handle_valid(handle)); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} + +static void __save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + es->s_last_error_time = cpu_to_le32(get_seconds()); + strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + es->s_error_count 
= cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); +} + +static void save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + __save_error_info(sb, func, line); + ext4_commit_super(sb, 1); +} + + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext4, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the jbd2_journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will complain about + * that error until we've noted it down and cleared it. + */ + +static void ext4_handle_error(struct super_block *sb) +{ + if (sb->s_flags & MS_RDONLY) + return; + + if (!test_opt(sb, ERRORS_CONT)) { + journal_t *journal = EXT4_SB(sb)->s_journal; + + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (journal) + jbd2_journal_abort(journal, -EIO); + } + if (test_opt(sb, ERRORS_RO)) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs (device %s): panic forced after error\n", + sb->s_id); +} + +void __ext4_error(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); + va_end(args); + save_error_info(sb, function, line); + + ext4_handle_error(sb); +} + +void ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) +{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + es->s_last_error_block = cpu_to_le64(block); + save_error_info(inode->i_sb, function, line); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk(KERN_CONT "block %llu: ", block); + printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + +void ext4_error_file(struct file *file, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) 
+{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es; + struct inode *inode = file->f_dentry->d_inode; + char pathname[80], *path; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + save_error_info(inode->i_sb, function, line); + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk(KERN_CONT "block %llu: ", block); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + +static const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || (EXT4_SB(sb)->s_journal && + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext4_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext4_std_error(struct super_block *sb, const char *function, + unsigned int line, int errno) +{ + char nbuf[16]; + const char *errstr; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && + (sb->s_flags & MS_RDONLY)) + return; + + errstr = ext4_decode_error(sb, errno, nbuf); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(sb, function, line); + + ext4_handle_error(sb); +} + +/* + * ext4_abort is a much stronger failure handler than ext4_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. + * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + +void __ext4_abort(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + va_list args; + + save_error_info(sb, function, line); + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, + function, line); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if ((sb->s_flags & MS_RDONLY) == 0) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs panic from previous error\n"); +} + +void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); +} + +void __ext4_warning(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); + va_end(args); +} + +void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, + const char *fmt, ...) +__releases(bitlock) +__acquires(bitlock) +{ + struct va_format vaf; + va_list args; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + __save_error_info(sb, function, line); + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", + sb->s_id, function, line, grp); + if (ino) + printk(KERN_CONT "inode %lu: ", ino); + if (block) + printk(KERN_CONT "block %llu:", (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); + va_end(args); + + if (test_opt(sb, ERRORS_CONT)) { + ext4_commit_super(sb, 0); + return; + } + + ext4_unlock_group(sb, grp); + ext4_handle_error(sb); + /* + * We only get here in the ERRORS_RO case; relocking the group + * may be dangerous, but nothing bad will happen since the + * filesystem will have already been marked read/only and the + * journal has been aborted. We return 1 as a hint to callers + * who might what to use the return value from + * ext4_grp_locked_error() to distinguish between the + * ERRORS_CONT and ERRORS_RO case, and perhaps return more + * aggressively from the ext4 function in question, with a + * more appropriate error code. + */ + ext4_lock_group(sb, grp); + return; +} + +void ext4_update_dynamic_rev(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) + return; + + ext4_warning(sb, + "updating to rev %d because of new feature flag, " + "running e2fsck is recommended", + EXT4_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. 
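+ *
+ * (For reference, the "good old" defaults written just above are the
+ * pre-dynamic-rev layout: first non-reserved inode 11 and a fixed
+ * 128-byte on-disk inode, i.e. EXT4_GOOD_OLD_FIRST_INO and
+ * EXT4_GOOD_OLD_INODE_SIZE.)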
+ */ +} + +/* + * Open the external journal device + */ +static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; +} + +/* + * Release the journal device + */ +static int ext4_blkdev_put(struct block_device *bdev) +{ + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +static int ext4_blkdev_remove(struct ext4_sb_info *sbi) +{ + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext4_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } + return ret; +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) +{ + struct list_head *l; + + ext4_msg(sb, KERN_ERR, "sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +static void ext4_put_super(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i, err; + + ext4_unregister_li_request(sb); + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + flush_workqueue(sbi->dio_unwritten_wq); + destroy_workqueue(sbi->dio_unwritten_wq); + + lock_super(sb); + if (sb->s_dirt) + ext4_commit_super(sb, 1); + + if (sbi->s_journal) { + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext4_abort(sb, "Couldn't clean up the journal"); + } + + del_timer(&sbi->s_err_report); + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + ext4_commit_super(sb, 1); + } + if (sbi->s_proc) { + remove_proc_entry(sb->s_id, ext4_proc_root); + } + kobject_del(&sbi->s_kobj); + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. 
We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev); + ext4_blkdev_remove(sbi); + } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the + * superblock, we need to actually destroy the kobject. + */ + unlock_super(sb); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +} + +static struct kmem_cache *ext4_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext4_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; + memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif + ei->jinode = NULL; + INIT_LIST_HEAD(&ei->i_completed_io_list); + spin_lock_init(&ei->i_completed_io_lock); + ei->cur_aio_dio = NULL; + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); + atomic_set(&ei->i_aiodio_unwritten, 0); + + return &ei->vfs_inode; +} + +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + +static void ext4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + +static void ext4_destroy_inode(struct inode *inode) +{ + if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): orphan list check failed!", + inode->i_ino, EXT4_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT4_I(inode), sizeof(struct ext4_inode_info), + true); + dump_stack(); + } + call_rcu(&inode->i_rcu, ext4_i_callback); +} + +static void init_once(void *foo) +{ + struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; + + INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT4_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", + sizeof(struct ext4_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext4_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ext4_inode_cachep); +} + +void ext4_clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + end_writeback(inode); + dquot_drop(inode); + ext4_discard_preallocations(inode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; + } +} + +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = 
EXT4_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (test_opt(sb, USRQUOTA)) + seq_puts(seq, ",usrquota"); + + if (test_opt(sb, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + int def_errors; + unsigned long def_mount_opts; + struct super_block *sb = vfs->mnt_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + def_errors = le16_to_cpu(es->s_errors); + + if (sbi->s_sb_block != 1) + seq_printf(seq, ",sb=%llu", sbi->s_sb_block); + if (test_opt(sb, MINIX_DF)) + seq_puts(seq, ",minixdf"); + if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) + seq_puts(seq, ",grpid"); + if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) + seq_puts(seq, ",nogrpid"); + if (sbi->s_resuid != EXT4_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { + seq_printf(seq, ",resuid=%u", sbi->s_resuid); + } + if (sbi->s_resgid != EXT4_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { + seq_printf(seq, ",resgid=%u", sbi->s_resgid); + } + if (test_opt(sb, ERRORS_RO)) { + if (def_errors == EXT4_ERRORS_PANIC || + def_errors == EXT4_ERRORS_CONTINUE) { + seq_puts(seq, ",errors=remount-ro"); + } + } + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + seq_puts(seq, ",errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + seq_puts(seq, ",errors=panic"); + if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) + seq_puts(seq, ",nouid32"); + if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) + seq_puts(seq, ",debug"); + if (test_opt(sb, OLDALLOC)) + seq_puts(seq, ",oldalloc"); +#ifdef CONFIG_EXT4_FS_XATTR + if (test_opt(sb, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + if (!test_opt(sb, XATTR_USER)) + seq_puts(seq, ",nouser_xattr"); +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) + seq_puts(seq, ",acl"); + if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) + seq_puts(seq, ",noacl"); +#endif + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { + seq_printf(seq, ",commit=%u", + (unsigned) (sbi->s_commit_interval / HZ)); + } + if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { + seq_printf(seq, ",min_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { + seq_printf(seq, ",max_batch_time=%u", + (unsigned) sbi->s_max_batch_time); + } + + /* + * We're changing the default of barrier mount option, so + * let's always display its mount state so it's clear what its + * status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ?
"1" : "0"); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + seq_puts(seq, ",journal_async_commit"); + else if (test_opt(sb, JOURNAL_CHECKSUM)) + seq_puts(seq, ",journal_checksum"); + if (test_opt(sb, I_VERSION)) + seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC) && + !(def_mount_opts & EXT4_DEFM_NODELALLOC)) + seq_puts(seq, ",nodelalloc"); + + if (!test_opt(sb, MBLK_IO_SUBMIT)) + seq_puts(seq, ",nomblk_io_submit"); + if (sbi->s_stripe) + seq_printf(seq, ",stripe=%lu", sbi->s_stripe); + /* + * journal mode get enabled in different ways + * So just print the value even if we didn't specify it + */ + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + seq_puts(seq, ",data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + seq_puts(seq, ",data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + seq_puts(seq, ",data=writeback"); + + if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + seq_printf(seq, ",inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + + if (test_opt(sb, NO_AUTO_DA_ALLOC)) + seq_puts(seq, ",noauto_da_alloc"); + + if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) + seq_puts(seq, ",discard"); + + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + + if (test_opt(sb, DIOREAD_NOLOCK)) + seq_puts(seq, ",dioread_nolock"); + + if (test_opt(sb, BLOCK_VALIDITY) && + !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) + seq_puts(seq, ",block_validity"); + + if (!test_opt(sb, INIT_INODE_TABLE)) + seq_puts(seq, ",noinit_itable"); + else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) + seq_printf(seq, ",init_itable=%u", + (unsigned) sbi->s_li_wait_mult); + + ext4_show_quota_options(seq, sb); + + return 0; +} + +static struct inode *ext4_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * + * ext4_read_inode will return a bad_inode if the inode had been + * deleted, so we should be safe. + * + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} + +static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} + +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd2 layer's try_to_free_buffers() function to release them. 
+ */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) + +static int ext4_write_dquot(struct dquot *dquot); +static int ext4_acquire_dquot(struct dquot *dquot); +static int ext4_release_dquot(struct dquot *dquot); +static int ext4_mark_dquot_dirty(struct dquot *dquot); +static int ext4_write_info(struct super_block *sb, int type); +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); +static int ext4_quota_off(struct super_block *sb, int type); +static int ext4_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + +static const struct dquot_operations ext4_quota_operations = { + .get_reserved_space = ext4_get_reserved_space, + .write_dquot = ext4_write_dquot, + .acquire_dquot = ext4_acquire_dquot, + .release_dquot = ext4_release_dquot, + .mark_dirty = ext4_mark_dquot_dirty, + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops ext4_qctl_operations = { + .quota_on = ext4_quota_on, + .quota_off = ext4_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +#endif + +static const struct super_operations ext4_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, + .evict_inode = ext4_evict_inode, + .put_super = ext4_put_super, + .sync_fs = ext4_sync_fs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + +static const struct super_operations ext4_nojournal_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, + .evict_inode = ext4_evict_inode, + .write_super = ext4_write_super, + .put_super = ext4_put_super, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + +static const struct export_operations ext4_export_ops = { + .fh_to_dentry = ext4_fh_to_dentry, + .fh_to_parent = ext4_fh_to_parent, + .get_parent = ext4_get_parent, +}; + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_auto_da_alloc, 
Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, + Opt_journal_update, Opt_journal_dev, + Opt_journal_checksum, Opt_journal_async_commit, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, + {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, + {Opt_commit, "commit=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_checksum, "journal_checksum"}, + {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_grpquota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_i_version, "i_version"}, + {Opt_stripe, "stripe=%u"}, + {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, + {Opt_mblk_io_submit, "mblk_io_submit"}, + {Opt_nomblk_io_submit, "nomblk_io_submit"}, + {Opt_block_validity, "block_validity"}, + {Opt_noblock_validity, "noblock_validity"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, + {Opt_err, NULL}, +}; + +static ext4_fsblk_t get_sb_block(void **data) +{ + ext4_fsblk_t sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; 
/* Default location */ + + options += 3; + /* TODO: use simple_strtoll with >32bit ext4 */ + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + + return sb_block; +} + +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext4_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext4_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sb, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) +{ + + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + +static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, + ext4_fsblk_t *n_blocks_count, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int data_opt = 0; + int option; +#ifdef CONFIG_QUOTA + int qfmt; +#endif + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. 
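+ *
+ * For example, both "barrier" and "barrier=0" match Opt_barrier in
+ * the token table; with the bare form args[0].from stays NULL and the
+ * handler below assumes a value of 1, while "barrier=0" is parsed via
+ * match_int() and clears the option.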
+ */ + args[0].to = args[0].from = NULL; + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); + clear_opt(sb, MINIX_DF); + break; + case Opt_minix_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); + set_opt(sb, MINIX_DF); + + break; + case Opt_grpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); + set_opt(sb, GRPID); + + break; + case Opt_nogrpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); + clear_opt(sb, GRPID); + + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resuid = option; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resgid = option; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; + case Opt_err_panic: + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + set_opt(sb, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt(sb, ERRORS_RO); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt(sb, NO_UID32); + break; + case Opt_debug: + set_opt(sb, DEBUG); + break; + case Opt_oldalloc: + set_opt(sb, OLDALLOC); + break; + case Opt_orlov: + clear_opt(sb, OLDALLOC); + break; +#ifdef CONFIG_EXT4_FS_XATTR + case Opt_user_xattr: + set_opt(sb, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(sb, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); + break; +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + case Opt_acl: + set_opt(sb, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sb, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); + break; +#endif + case Opt_journal_update: + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. 
*/ + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return 0; + } + set_opt(sb, UPDATE_JOURNAL); + break; + case Opt_journal_dev: + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; + case Opt_journal_checksum: + set_opt(sb, JOURNAL_CHECKSUM); + break; + case Opt_journal_async_commit: + set_opt(sb, JOURNAL_ASYNC_COMMIT); + set_opt(sb, JOURNAL_CHECKSUM); + break; + case Opt_noload: + set_opt(sb, NOLOAD); + break; + case Opt_commit: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * option; + break; + case Opt_max_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = option; + break; + case Opt_min_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_min_batch_time = option; + break; + case Opt_data_journal: + data_opt = EXT4_MOUNT_JOURNAL_DATA; + goto datacheck; + case Opt_data_ordered: + data_opt = EXT4_MOUNT_ORDERED_DATA; + goto datacheck; + case Opt_data_writeback: + data_opt = EXT4_MOUNT_WRITEBACK_DATA; + datacheck: + if (is_remount) { + if (test_opt(sb, DATA_FLAGS) != data_opt) { + ext4_msg(sb, KERN_ERR, + "Cannot change data mode on remount"); + return 0; + } + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_mount_opt |= data_opt; + } + break; + case Opt_data_err_abort: + set_opt(sb, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sb, DATA_ERR_ABORT); + break; +#ifdef CONFIG_QUOTA + case Opt_usrjquota: + if (!set_qf_name(sb, USRQUOTA, &args[0])) + return 0; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) + return 0; + break; + case Opt_offusrjquota: + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; + case Opt_offgrpjquota: + if (!clear_qf_name(sb, GRPQUOTA)) + return 0; + break; + + case Opt_jqfmt_vfsold: + qfmt = QFMT_VFS_OLD; + goto set_qf_format; + case Opt_jqfmt_vfsv0: + qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; +set_qf_format: + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != qfmt) { + ext4_msg(sb, KERN_ERR, "Cannot change " + "journaled quota options when " + "quota turned on"); + return 0; + } + sbi->s_jquota_fmt = qfmt; + break; + case Opt_quota: + case Opt_usrquota: + set_opt(sb, QUOTA); + set_opt(sb, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sb, QUOTA); + set_opt(sb, GRPQUOTA); + break; + case Opt_noquota: + if (sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return 0; + } + clear_opt(sb, QUOTA); + clear_opt(sb, USRQUOTA); + clear_opt(sb, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext4_msg(sb, KERN_ERR, + "quota options not supported"); + break; + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + ext4_msg(sb, KERN_ERR, + "journaled quota options not supported"); + break; + case Opt_noquota: + break; +#endif + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + break; + case Opt_nobarrier: + clear_opt(sb, BARRIER); + break; + case Opt_barrier: + if (args[0].from) { + if 
(match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ + if (option) + set_opt(sb, BARRIER); + else + clear_opt(sb, BARRIER); + break; + case Opt_ignore: + break; + case Opt_resize: + if (!is_remount) { + ext4_msg(sb, KERN_ERR, + "resize option only available " + "for remount"); + return 0; + } + if (match_int(&args[0], &option) != 0) + return 0; + *n_blocks_count = option; + break; + case Opt_nobh: + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated nobh option"); + break; + case Opt_bh: + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated bh option"); + break; + case Opt_i_version: + set_opt(sb, I_VERSION); + sb->s_flags |= MS_I_VERSION; + break; + case Opt_nodelalloc: + clear_opt(sb, DELALLOC); + break; + case Opt_mblk_io_submit: + set_opt(sb, MBLK_IO_SUBMIT); + break; + case Opt_nomblk_io_submit: + clear_opt(sb, MBLK_IO_SUBMIT); + break; + case Opt_stripe: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_stripe = option; + break; + case Opt_delalloc: + set_opt(sb, DELALLOC); + break; + case Opt_block_validity: + set_opt(sb, BLOCK_VALIDITY); + break; + case Opt_noblock_validity: + clear_opt(sb, BLOCK_VALIDITY); + break; + case Opt_inode_readahead_blks: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > (1 << 30)) + return 0; + if (option && !is_power_of_2(option)) { + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks" + " must be a power of 2"); + return 0; + } + sbi->s_inode_readahead_blks = option; + break; + case Opt_journal_ioprio: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 7) + break; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, + option); + break; + case Opt_noauto_da_alloc: + set_opt(sb, NO_AUTO_DA_ALLOC); + break; + case Opt_auto_da_alloc: + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ + if (option) + clear_opt(sb, NO_AUTO_DA_ALLOC); + else + set_opt(sb,NO_AUTO_DA_ALLOC); + break; + case Opt_discard: + set_opt(sb, DISCARD); + break; + case Opt_nodiscard: + clear_opt(sb, DISCARD); + break; + case Opt_dioread_nolock: + set_opt(sb, DIOREAD_NOLOCK); + break; + case Opt_dioread_lock: + clear_opt(sb, DIOREAD_NOLOCK); + break; + case Opt_init_itable: + set_opt(sb, INIT_INODE_TABLE); + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = EXT4_DEF_LI_WAIT_MULT; + if (option < 0) + return 0; + sbi->s_li_wait_mult = option; + break; + case Opt_noinit_itable: + clear_opt(sb, INIT_INODE_TABLE); + break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " + "or missing value", p); + return 0; + } + } +#ifdef CONFIG_QUOTA + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sb, USRQUOTA); + + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sb, GRPQUOTA); + + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { + ext4_msg(sb, KERN_ERR, "old and new quota " + "format mixing"); + return 0; + } + + if (!sbi->s_jquota_fmt) { + ext4_msg(sb, KERN_ERR, "journaled quota format " + "not specified"); + return 0; + } + } else { + if (sbi->s_jquota_fmt) { + ext4_msg(sb, KERN_ERR, "journaled quota format " + "specified with no journaling " + "enabled"); + return 0; + } + } +#endif + return 1; +} + +static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, + int read_only) +{ + struct 
ext4_sb_info *sbi = EXT4_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { + ext4_msg(sb, KERN_ERR, "revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT4_VALID_FS)) + ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if ((sbi->s_mount_state & EXT4_ERROR_FS)) + ext4_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + ext4_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext4_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); + if (!sbi->s_journal) + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); + if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext4_update_dynamic_rev(sb); + if (sbi->s_journal) + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + + ext4_commit_super(sb, 1); + if (test_opt(sb, DEBUG)) + printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " + "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", + sb->s_blocksize, + sbi->s_groups_count, + EXT4_BLOCKS_PER_GROUP(sb), + EXT4_INODES_PER_GROUP(sb), + sbi->s_mount_opt, sbi->s_mount_opt2); + + cleancache_init_fs(sb); + return res; +} + +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + ext4_group_t flex_group_count; + ext4_group_t flex_group; + unsigned int groups_per_flex = 0; + size_t size; + int i; + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + /* We allocate both existing and potentially added groups */ + flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << + EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; + size = flex_group_count * sizeof(struct flex_groups); + sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + sbi->s_flex_groups = vzalloc(size); + if (sbi->s_flex_groups == NULL) { + ext4_msg(sb, KERN_ERR, + "not enough memory for %u flex groups", + flex_group_count); + goto failed; + } + } + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + + flex_group = ext4_flex_group(sbi, i); + atomic_add(ext4_free_inodes_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_inodes); + atomic_add(ext4_free_blks_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(ext4_used_dirs_count(sb, gdp), + &sbi->s_flex_groups[flex_group].used_dirs); + } + + return 1; +failed: + return 0; +} + +__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + __u16 crc = 0; + + if (sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + int offset = offsetof(struct ext4_group_desc, bg_checksum); + 
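+ /*
+ * The crc16 below covers, in order: the filesystem UUID, the
+ * little-endian group number, and the descriptor itself up to
+ * (but not including) bg_checksum; when the 64BIT feature
+ * enlarges the descriptor past the checksum field, the bytes
+ * after it are folded in as well.
+ */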
__le32 le_group = cpu_to_le32(block_group); + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if ((sbi->s_es->s_feature_incompat & + cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + offset < le16_to_cpu(sbi->s_es->s_desc_size)) + crc = crc16(crc, (__u8 *)gdp + offset, + le16_to_cpu(sbi->s_es->s_desc_size) - + offset); + } + + return cpu_to_le16(crc); +} + +int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if ((sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && + (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) + return 0; + + return 1; +} + +/* Called at mount-time, super-block is locked */ +static int ext4_check_descriptors(struct super_block *sb, + ext4_group_t *first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); + ext4_fsblk_t last_block; + ext4_fsblk_t block_bitmap; + ext4_fsblk_t inode_bitmap; + ext4_fsblk_t inode_table; + int flexbg_flag = 0; + ext4_group_t i, grp = sbi->s_groups_count; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flexbg_flag = 1; + + ext4_debug("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + + if (i == sbi->s_groups_count - 1 || flexbg_flag) + last_block = ext4_blocks_count(sbi->s_es) - 1; + else + last_block = first_block + + (EXT4_BLOCKS_PER_GROUP(sb) - 1); + + if ((grp == sbi->s_groups_count) && + !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + grp = i; + + block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap < first_block || block_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u not in group " + "(block %llu)!", i, block_bitmap); + return 0; + } + inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap < first_block || inode_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u not in group " + "(block %llu)!", i, inode_bitmap); + return 0; + } + inode_table = ext4_inode_table(sb, gdp); + if (inode_table < first_block || + inode_table + sbi->s_itb_per_group - 1 > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u not in group " + "(block %llu)!", i, inode_table); + return 0; + } + ext4_lock_group(sb, i); + if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Checksum for group %u failed (%u!=%u)", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); + if (!(sb->s_flags & MS_RDONLY)) { + ext4_unlock_group(sb, i); + return 0; + } + } + ext4_unlock_group(sb, i); + if (!flexbg_flag) + first_block += EXT4_BLOCKS_PER_GROUP(sb); + } + if (NULL != first_not_zeroed) + *first_not_zeroed = grp; + + ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); + sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); + return 1; +} + +/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. 
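+ * (The on-disk list head is es->s_last_orphan; each inode's link to
+ * the next orphan is stashed in what is normally its i_dtime field,
+ * which is what the NEXT_ORPHAN() helper above reads.)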
We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext4_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); + return; + } + + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (EXT4_SB(sb)->s_qf_names[i]) { + int ret = ext4_quota_on_mount(sb, i); + if (ret < 0) + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + dquot_initialize(inode); + if (inode->i_nlink) { + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + ext4_truncate(inode); + nr_truncates++; + } else { + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" + + if (nr_orphans) + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + +/* + * Maximal extent format file size. + * Resulting logical blkno at s_maxbytes must fit in our on-disk + * extent format containers, within a sector_t, and within i_blocks + * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, + * so that won't be a limiting factor. + * + * However there is other limiting factor. We do store extents in the form + * of starting block and length, hence the resulting length of the extent + * covering maximum file size must fit into on-disk format containers as + * well. Given that length is always by 1 unit bigger than max unit (because + * we count 0 as well) we have to lower the s_maxbytes by one fs block. + * + * Note, this does *not* consider any metadata overhead for vfs i_blocks. + */ +static loff_t ext4_max_size(int blkbits, int has_huge_files) +{ + loff_t res; + loff_t upper_limit = MAX_LFS_FILESIZE; + + /* small i_blocks in vfs inode? */ + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * CONFIG_LBDAF is not enabled implies the inode + * i_block represent total blocks in 512 bytes + * 32 == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (blkbits - 9); + upper_limit <<= blkbits; + } + + /* + * 32-bit extent-start container, ee_block. We lower the maxbytes + * by one fs block, so ee_len can cover the extent of maximum file + * size + */ + res = (1LL << 32) - 1; + res <<= blkbits; + + /* Sanity check against vm- & vfs- imposed limits */ + if (res > upper_limit) + res = upper_limit; + + return res; +} + +/* + * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^48 sector limit. + */ +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) +{ + loff_t res = EXT4_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + /* This is calculated to be the largest file size for a dense, block + * mapped file such that the file's total number of 512-byte sectors, + * including data and all indirect blocks, does not exceed (2^48 - 1). + * + * __u32 i_blocks_lo and _u16 i_blocks_high represent the total + * number of 512-byte sectors of the file. 
+ */ + + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * !has_huge_files or CONFIG_LBDAF not enabled implies that + * the inode i_block field represents total file blocks in + * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + } else { + /* + * We use 48 bit ext4_inode i_blocks + * With EXT4_HUGE_FILE_FL set the i_blocks + * represent total number of blocks in + * file system block size + */ + upper_limit = (1LL << 48) - 1; + + } + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* triple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static ext4_fsblk_t descriptor_loc(struct super_block *sb, + ext4_fsblk_t logical_sb_block, int nr) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return logical_sb_block + nr + 1; + bg = sbi->s_desc_per_block * nr; + if (ext4_bg_has_super(sb, bg)) + has_super = 1; + + return (has_super + ext4_group_first_block_no(sb, bg)); +} + +/** + * ext4_get_stripe_size: Get the stripe size. + * @sbi: In memory super block info + * + * If we have specified it via mount option, then use the mount option + * value. If the value specified at mount time is greater than the blocks + * per group, use the super block value. + * If the super block value is also greater than blocks per group, return 0. + * The allocator needs it to be less than blocks per group.
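+ *
+ * For example, with 32768 blocks per group and a RAID set whose
+ * superblock reports s_raid_stripe_width == 256, a mount without an
+ * explicit "stripe=" option ends up using a 256-block stripe.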
+ * + */ +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) +{ + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); + unsigned long stripe_width = + le32_to_cpu(sbi->s_es->s_raid_stripe_width); + + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) + return sbi->s_stripe; + + if (stripe_width <= sbi->s_blocks_per_group) + return stripe_width; + + if (stride <= sbi->s_blocks_per_group) + return stride; + + return 0; +} + +/* sysfs support */ + +struct ext4_attr { + struct attribute attr; + ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + const char *, size_t); + int offset; +}; + +static int parse_strtoul(const char *buf, + unsigned long max, unsigned long *value) +{ + char *endp; + + *value = simple_strtoul(skip_spaces(buf), &endp, 0); + endp = skip_spaces(endp); + if (*endp || *value > max) + return -EINVAL; + + return 0; +} + +static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); +} + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1))); +} + +static ssize_t extent_cache_hits_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits); +} + +static ssize_t extent_cache_misses_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + + if (parse_strtoul(buf, 0x40000000, &t)) + return -EINVAL; + + if (t && !is_power_of_2(t)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t sbi_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t sbi_ui_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned long t; + + if (parse_strtoul(buf, 0xffffffff, &t)) + return -EINVAL; + *ui = t; + return count; +} + +#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .offset = offsetof(struct ext4_sb_info, _elname), \ +} +#define EXT4_ATTR(name, mode, show, store) \ +static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) + 
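+/*
+ * For illustration, EXT4_ATTR_OFFSET(inode_readahead_blks, 0644,
+ * sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks)
+ * below expands to a "struct ext4_attr ext4_attr_inode_readahead_blks"
+ * whose ->offset records the position of s_inode_readahead_blks inside
+ * struct ext4_sb_info, which is how the generic sbi_ui_show() helper
+ * above finds the field without a dedicated show routine.
+ */
+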
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) +#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) +#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) +#define EXT4_RW_ATTR_SBI_UI(name, elname) \ + EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) +#define ATTR_LIST(name) &ext4_attr_##name.attr + +EXT4_RO_ATTR(delayed_allocation_blocks); +EXT4_RO_ATTR(session_write_kbytes); +EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RO_ATTR(extent_cache_hits); +EXT4_RO_ATTR(extent_cache_misses); +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, + inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(extent_cache_hits), + ATTR_LIST(extent_cache_misses), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + NULL, +}; + +/* Features this copy of ext4 supports */ +EXT4_INFO_ATTR(lazy_itable_init); +EXT4_INFO_ATTR(batched_discard); + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + NULL, +}; + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + +static void ext4_feat_release(struct kobject *kobj) +{ + complete(&ext4_feat->f_kobj_unregister); +} + +static struct kobj_type ext4_feat_ktype = { + .default_attrs = ext4_feat_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_feat_release, +}; + +/* + * Check whether this filesystem can be mounted based on + * the features present and the RDONLY/RDWR mount requested. + * Returns 1 if this filesystem can be mounted as requested, + * 0 if it cannot be. 
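+ *
+ * Roughly: an unsupported INCOMPAT feature refuses the mount outright,
+ * while an unsupported RO_COMPAT feature (or huge_file on a kernel
+ * whose block counts are only 32 bits wide) only refuses a read-write
+ * mount; a read-only mount is still allowed in those cases.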
+ */ +static int ext4_feature_set_ok(struct super_block *sb, int readonly) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); + return 0; + } + + if (readonly) + return 1; + + /* Check that feature set is OK for a read-write mount */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); + return 0; + } + /* + * Large file size enabled file system can only be mounted + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (sizeof(blkcnt_t) < sizeof(u64)) { + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " + "cannot be mounted RDWR without " + "CONFIG_LBDAF"); + return 0; + } + } + return 1; +} + +/* + * This function is called once a day if we have errors logged + * on the file system + */ +static void print_daily_error_info(unsigned long arg) +{ + struct super_block *sb = (struct super_block *) arg; + struct ext4_sb_info *sbi; + struct ext4_super_block *es; + + sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (es->s_error_count) + ext4_msg(sb, KERN_NOTICE, "error count: %u", + le32_to_cpu(es->s_error_count)); + if (es->s_first_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_first_error_time), + (int) sizeof(es->s_first_error_func), + es->s_first_error_func, + le32_to_cpu(es->s_first_error_line)); + if (es->s_first_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_first_error_ino)); + if (es->s_first_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_first_error_block)); + printk("\n"); + } + if (es->s_last_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_last_error_time), + (int) sizeof(es->s_last_error_func), + es->s_last_error_func, + le32_to_cpu(es->s_last_error_line)); + if (es->s_last_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_last_error_ino)); + if (es->s_last_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_last_error_block)); + printk("\n"); + } + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ +} + +/* Find next suitable group and run ext4_init_inode_table */ +static int ext4_run_li_request(struct ext4_li_request *elr) +{ + struct ext4_group_desc *gdp = NULL; + ext4_group_t group, ngroups; + struct super_block *sb; + unsigned long timeout = 0; + int ret = 0; + + sb = elr->lr_super; + ngroups = EXT4_SB(sb)->s_groups_count; + + for (group = elr->lr_next_group; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; + break; + } + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + if (group == ngroups) + ret = 1; + + if (!ret) { + timeout = jiffies; + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 
0 : 1); + if (elr->lr_timeout == 0) { + timeout = (jiffies - timeout) * + elr->lr_sbi->s_li_wait_mult; + elr->lr_timeout = timeout; + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; + } + + return ret; +} + +/* + * Remove lr_request from the list_request and free the + * request structure. Should be called with li_list_mtx held + */ +static void ext4_remove_li_request(struct ext4_li_request *elr) +{ + struct ext4_sb_info *sbi; + + if (!elr) + return; + + sbi = elr->lr_sbi; + + list_del(&elr->lr_request); + sbi->s_li_request = NULL; + kfree(elr); +} + +static void ext4_unregister_li_request(struct super_block *sb) +{ + mutex_lock(&ext4_li_mtx); + if (!ext4_li_info) { + mutex_unlock(&ext4_li_mtx); + return; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + ext4_remove_li_request(EXT4_SB(sb)->s_li_request); + mutex_unlock(&ext4_li_info->li_list_mtx); + mutex_unlock(&ext4_li_mtx); +} + +static struct task_struct *ext4_lazyinit_task; + +/* + * This is the function where ext4lazyinit thread lives. It walks + * through the request list searching for next scheduled filesystem. + * When such a fs is found, run the lazy initialization request + * (ext4_rn_li_request) and keep track of the time spend in this + * function. Based on that time we compute next schedule time of + * the request. When walking through the list is complete, compute + * next waking time and put itself into sleep. + */ +static int ext4_lazyinit_thread(void *arg) +{ + struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; + struct list_head *pos, *n; + struct ext4_li_request *elr; + unsigned long next_wakeup, cur; + + BUG_ON(NULL == eli); + +cont_thread: + while (true) { + next_wakeup = MAX_JIFFY_OFFSET; + + mutex_lock(&eli->li_list_mtx); + if (list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + goto exit_thread; + } + + list_for_each_safe(pos, n, &eli->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } + } + + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + } + mutex_unlock(&eli->li_list_mtx); + + if (freezing(current)) + refrigerator(); + + cur = jiffies; + if ((time_after_eq(cur, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { + cond_resched(); + continue; + } + + schedule_timeout_interruptible(next_wakeup - cur); + + if (kthread_should_stop()) { + ext4_clear_request_list(); + goto exit_thread; + } + } + +exit_thread: + /* + * It looks like the request list is empty, but we need + * to check it under the li_list_mtx lock, to prevent any + * additions into it, and of course we should lock ext4_li_mtx + * to atomically free the list and ext4_li_info, because at + * this point another ext4 filesystem could be registering + * new one. 
+ */ + mutex_lock(&ext4_li_mtx); + mutex_lock(&eli->li_list_mtx); + if (!list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + mutex_unlock(&ext4_li_mtx); + goto cont_thread; + } + mutex_unlock(&eli->li_list_mtx); + kfree(ext4_li_info); + ext4_li_info = NULL; + mutex_unlock(&ext4_li_mtx); + + return 0; +} + +static void ext4_clear_request_list(void) +{ + struct list_head *pos, *n; + struct ext4_li_request *elr; + + mutex_lock(&ext4_li_info->li_list_mtx); + list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + ext4_remove_li_request(elr); + } + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +static int ext4_run_lazyinit_thread(void) +{ + ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, + ext4_li_info, "ext4lazyinit"); + if (IS_ERR(ext4_lazyinit_task)) { + int err = PTR_ERR(ext4_lazyinit_task); + ext4_clear_request_list(); + kfree(ext4_li_info); + ext4_li_info = NULL; + printk(KERN_CRIT "EXT4: error %d creating inode table " + "initialization thread\n", + err); + return err; + } + ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + return 0; +} + +/* + * Check whether it make sense to run itable init. thread or not. + * If there is at least one uninitialized inode table, return + * corresponding group number, else the loop goes through all + * groups and return total number of groups. + */ +static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) +{ + ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; + struct ext4_group_desc *gdp = NULL; + + for (group = 0; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) + continue; + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + return group; +} + +static int ext4_li_info_new(void) +{ + struct ext4_lazy_init *eli = NULL; + + eli = kzalloc(sizeof(*eli), GFP_KERNEL); + if (!eli) + return -ENOMEM; + + INIT_LIST_HEAD(&eli->li_request_list); + mutex_init(&eli->li_list_mtx); + + eli->li_state |= EXT4_LAZYINIT_QUIT; + + ext4_li_info = eli; + + return 0; +} + +static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + unsigned long rnd; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); + if (!elr) + return NULL; + + elr->lr_super = sb; + elr->lr_sbi = sbi; + elr->lr_next_group = start; + + /* + * Randomize first schedule time of the request to + * spread the inode table initialization requests + * better. + */ + get_random_bytes(&rnd, sizeof(rnd)); + elr->lr_next_sched = jiffies + (unsigned long)rnd % + (EXT4_DEF_LI_MAX_START_DELAY * HZ); + + return elr; +} + +static int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + int ret = 0; + + if (sbi->s_li_request != NULL) { + /* + * Reset timeout so it can be computed again, because + * s_li_wait_mult might have changed. 
+ */ + sbi->s_li_request->lr_timeout = 0; + return 0; + } + + if (first_not_zeroed == ngroups || + (sb->s_flags & MS_RDONLY) || + !test_opt(sb, INIT_INODE_TABLE)) + return 0; + + elr = ext4_li_request_new(sb, first_not_zeroed); + if (!elr) + return -ENOMEM; + + mutex_lock(&ext4_li_mtx); + + if (NULL == ext4_li_info) { + ret = ext4_li_info_new(); + if (ret) + goto out; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + list_add(&elr->lr_request, &ext4_li_info->li_request_list); + mutex_unlock(&ext4_li_info->li_list_mtx); + + sbi->s_li_request = elr; + /* + * set elr to NULL here since it has been inserted to + * the request_list and the removal and free of it is + * handled by ext4_clear_request_list from now on. + */ + elr = NULL; + + if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { + ret = ext4_run_lazyinit_thread(); + if (ret) + goto out; + } +out: + mutex_unlock(&ext4_li_mtx); + if (ret) + kfree(elr); + return ret; +} + +/* + * We do not need to lock anything since this is called on + * module unload. + */ +static void ext4_destroy_lazyinit_thread(void) +{ + /* + * If thread exited earlier + * there's nothing to be done. + */ + if (!ext4_li_info || !ext4_lazyinit_task) + return; + + kthread_stop(ext4_lazyinit_task); +} + +static int ext4_fill_super(struct super_block *sb, void *data, int silent) + __releases(kernel_lock) + __acquires(kernel_lock) +{ + char *orig_data = kstrdup(data, GFP_KERNEL); + struct buffer_head *bh; + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi; + ext4_fsblk_t block; + ext4_fsblk_t sb_block = get_sb_block(&data); + ext4_fsblk_t logical_sb_block; + unsigned long offset = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + char *cp; + const char *descr; + int ret = -ENOMEM; + int blocksize; + unsigned int db_count; + unsigned int i; + int needs_recovery, has_huge_files; + __u64 blocks_count; + int err; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ext4_group_t first_not_zeroed; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto out_free_orig; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + goto out_free_orig; + } + sb->s_fs_info = sbi; + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT4_DEF_RESUID; + sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; + sbi->s_sb_block = sb_block; + if (sb->s_bdev->bd_part) + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev->bd_part, sectors[1]); + + /* Cleanup superblock name */ + for (cp = sb->s_id; (cp = strchr(cp, '/'));) + *cp = '!'; + + ret = -EINVAL; + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); + if (!blocksize) { + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); + goto out_fail; + } + + /* + * The ext4 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. 
+ */ + if (blocksize != EXT4_MIN_BLOCK_SIZE) { + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + } else { + logical_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logical_sb_block))) { + ext4_msg(sb, KERN_ERR, "unable to read superblock"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext4 macro-instructions depend on its value + */ + es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT4_SUPER_MAGIC) + goto cantfind_ext4; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sb, INIT_INODE_TABLE); + if (def_mount_opts & EXT4_DEFM_DEBUG) + set_opt(sb, DEBUG); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { + ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", + "2.6.38"); + set_opt(sb, GRPID); + } + if (def_mount_opts & EXT4_DEFM_UID16) + set_opt(sb, NO_UID32); + /* xattr user namespace & acls are now defaulted on */ +#ifdef CONFIG_EXT4_FS_XATTR + set_opt(sb, XATTR_USER); +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + set_opt(sb, POSIX_ACL); +#endif + set_opt(sb, MBLK_IO_SUBMIT); + if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) + set_opt(sb, JOURNAL_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) + set_opt(sb, ORDERED_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) + set_opt(sb, WRITEBACK_DATA); + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) + set_opt(sb, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) + set_opt(sb, ERRORS_CONT); + else + set_opt(sb, ERRORS_RO); + if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) + set_opt(sb, BLOCK_VALIDITY); + if (def_mount_opts & EXT4_DEFM_DISCARD) + set_opt(sb, DISCARD); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + + if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) + set_opt(sb, BARRIER); + + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + if (!IS_EXT3_SB(sb) && + ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) + set_opt(sb, DELALLOC); + + /* + * set default s_li_wait_mult for lazyinit, for the case there is + * no mount option specified. + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, + &journal_devnum, &journal_ioprio, NULL, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + sbi->s_es->s_mount_opts); + } + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && + (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext4_msg(sb, KERN_WARNING, + "feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + + if (IS_EXT2_SB(sb)) { + if (ext2_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext2 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + if (IS_EXT3_SB(sb)) { + if (ext3_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext3 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) + goto failed_mount; + + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize < EXT4_MIN_BLOCK_SIZE || + blocksize > EXT4_MAX_BLOCK_SIZE) { + ext4_msg(sb, KERN_ERR, + "Unsupported filesystem blocksize %d", blocksize); + goto failed_mount; + } + + if (sb->s_blocksize != blocksize) { + /* Validate the filesystem blocksize */ + if (!sb_set_blocksize(sb, blocksize)) { + ext4_msg(sb, KERN_ERR, "bad block size %d", + blocksize); + goto failed_mount; + } + + brelse(bh); + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + bh = sb_bread(sb, logical_sb_block); + if (!bh) { + ext4_msg(sb, KERN_ERR, + "Can't read superblock on 2nd try"); + goto failed_mount; + } + es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { + ext4_msg(sb, KERN_ERR, + "Magic mismatch, very weird!"); + goto failed_mount; + } + } + + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) + sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); + } + + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || + sbi->s_desc_size > EXT4_MAX_DESC_SIZE || + !is_power_of_2(sbi->s_desc_size)) { + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", + sbi->s_desc_size); + goto failed_mount; + } + } else + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 
0) + goto cantfind_ext4; + + sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext4; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + + for (i = 0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } + + if (sbi->s_blocks_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely on this system"); + if (sizeof(sector_t) < 8) + ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = err; + goto failed_mount; + } + + if (EXT4_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext4; + + /* check blocks count against device size */ + blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", + ext4_blocks_count(es), blocks_count); + goto failed_mount; + } + + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. 
+ */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data" + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); + goto failed_mount; + } + blocks_count = (ext4_blocks_count(es) - + le32_to_cpu(es->s_first_data_block) + + EXT4_BLOCKS_PER_GROUP(sb) - 1); + do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + ext4_msg(sb, KERN_WARNING, "groups count too large: %u " + "(block count %llu, first data block %u, " + "blocks per group %lu)", sbi->s_groups_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + goto failed_mount; + } + sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory"); + goto failed_mount; + } + +#ifdef CONFIG_PROC_FS + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); +#endif + + bgl_lock_init(sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); + goto failed_mount2; + } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); + goto failed_mount2; + } + + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + init_timer(&sbi->s_err_report); + sbi->s_err_report.function = print_daily_error_info; + sbi->s_err_report.data = (unsigned long) sb; + + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + + sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_max_writeback_mb_bump = 128; + + /* + * set up enough so that it can read an inode + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; + sb->s_export_op = &ext4_export_ops; + sb->s_xattr = ext4_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->s_qcop = &ext4_qctl_operations; + sb->dq_op = &ext4_quota_operations; +#endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_RECOVER)); + + if 
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + goto failed_mount3; + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext4_load_journal(sb, es, journal_devnum)) + goto failed_mount3; + } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + ext4_msg(sb, KERN_ERR, "required journal recovery " + "suppressed and not mounted read-only"); + goto failed_mount_wq; + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_journal = NULL; + needs_recovery = 0; + goto no_journal; + } + + if (ext4_blocks_count(es) > 0xffffffffULL && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); + goto failed_mount_wq; + } + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + * capabilities: ORDERED_DATA if the journal can + * cope, else JOURNAL_DATA + */ + if (jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) + set_opt(sb, ORDERED_DATA); + else + set_opt(sb, JOURNAL_DATA); + break; + + case EXT4_MOUNT_ORDERED_DATA: + case EXT4_MOUNT_WRITEBACK_DATA: + if (!jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); + goto failed_mount_wq; + } + default: + break; + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. + */ + percpu_counter_set(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); + +no_journal: + /* + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ + EXT4_SB(sb)->dio_unwritten_wq = + alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->dio_unwritten_wq) { + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + goto failed_mount_wq; + } + + /* + * The jbd2_journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. 
+ */ + + root = ext4_iget(sb, EXT4_ROOT_INO); + if (IS_ERR(root)) { + ext4_msg(sb, KERN_ERR, "get root inode failed"); + ret = PTR_ERR(root); + root = NULL; + goto failed_mount4; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + goto failed_mount4; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + ext4_msg(sb, KERN_ERR, "get root dentry failed"); + ret = -ENOMEM; + goto failed_mount4; + } + + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + + /* determine the minimum size of new large inodes, if present */ + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_want_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_want_extra_isize); + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_min_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_min_extra_isize); + } + } + /* Check if enough inode space is available */ + if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + ext4_msg(sb, KERN_INFO, "required extra inode space not" + "available"); + } + + if (test_opt(sb, DELALLOC) && + (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { + ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " + "requested data journaling mode"); + clear_opt(sb, DELALLOC); + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - requested data journaling mode"); + clear_opt(sb, DIOREAD_NOLOCK); + } + if (sb->s_blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - block size is too small"); + clear_opt(sb, DIOREAD_NOLOCK); + } + } + + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4; + } + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount4; + } + + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) + goto failed_mount4; + + sbi->s_kobj.kset = ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, + "%s", sb->s_id); + if (err) { + ext4_mb_release(sb); + ext4_ext_release(sb); + goto failed_mount4; + }; + + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; + ext4_orphan_cleanup(sb, es); + EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; + if (needs_recovery) { + ext4_msg(sb, KERN_INFO, "recovery complete"); + ext4_mark_recovery_complete(sb, es); + } + if (EXT4_SB(sb)->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? 
"; " : "", orig_data); + + if (es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + + kfree(orig_data); + return 0; + +cantfind_ext4: + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto failed_mount; + +failed_mount4: + iput(root); + sb->s_root = NULL; + ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); +failed_mount_wq: + ext4_release_system_zone(sb); + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } +failed_mount3: + del_timer(&sbi->s_err_report); + if (sbi->s_flex_groups) { + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); + } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); +failed_mount: + if (sbi->s_proc) { + remove_proc_entry(sb->s_id, ext4_proc_root); + } +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext4_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +out_free_orig: + kfree(orig_data); + return ret; +} + +/* + * Setup any per-fs journal parameters now. We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; + + write_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JBD2_BARRIER; + else + journal->j_flags &= ~JBD2_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; + write_unlock(&journal->j_state_lock); +} + +static journal_t *ext4_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. 
*/ + + journal_inode = ext4_iget(sb, journal_inum); + if (IS_ERR(journal_inode)) { + ext4_msg(sb, KERN_ERR, "no journal found"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + ext4_msg(sb, KERN_ERR, "journal inode is deleted"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); + if (!S_ISREG(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "invalid journal inode"); + iput(journal_inode); + return NULL; + } + + journal = jbd2_journal_init_inode(journal_inode); + if (!journal) { + ext4_msg(sb, KERN_ERR, "Could not load journal inode"); + iput(journal_inode); + return NULL; + } + journal->j_private = sb; + ext4_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext4_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head *bh; + journal_t *journal; + ext4_fsblk_t start; + ext4_fsblk_t len; + int hblock, blocksize; + ext4_fsblk_t sb_block; + unsigned long offset; + struct ext4_super_block *es; + struct block_device *bdev; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + bdev = ext4_blkdev_get(j_dev, sb); + if (bdev == NULL) + return NULL; + + blocksize = sb->s_blocksize; + hblock = bdev_logical_block_size(bdev); + if (blocksize < hblock) { + ext4_msg(sb, KERN_ERR, + "blocksize too small for journal device"); + goto out_bdev; + } + + sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; + offset = EXT4_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + ext4_msg(sb, KERN_ERR, "couldn't read superblock of " + "external journal"); + goto out_bdev; + } + + es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { + ext4_msg(sb, KERN_ERR, "external journal has " + "bad superblock"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + ext4_msg(sb, KERN_ERR, "journal UUID does not match"); + brelse(bh); + goto out_bdev; + } + + len = ext4_blocks_count(es); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = jbd2_journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + ext4_msg(sb, KERN_ERR, "failed to create device journal"); + goto out_bdev; + } + journal->j_private = sb; + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + ext4_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + ext4_msg(sb, KERN_ERR, "External journal has more than one " + "user (unsupported) - %d", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT4_SB(sb)->journal_bdev = bdev; + ext4_init_journal_params(sb, journal); + return journal; + +out_journal: + jbd2_journal_destroy(journal); +out_bdev: + ext4_blkdev_put(bdev); + return NULL; +} + +static int ext4_load_journal(struct super_block *sb, + struct ext4_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + if (journal_devnum && + journal_devnum != 
le32_to_cpu(es->s_journal_dev)) { + ext4_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + ext4_msg(sb, KERN_INFO, "INFO: recovery " + "required on readonly filesystem"); + if (really_read_only) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, cannot proceed"); + return -EROFS; + } + ext4_msg(sb, KERN_INFO, "write access will " + "be enabled during recovery"); + } + } + + if (journal_inum && journal_dev) { + ext4_msg(sb, KERN_ERR, "filesystem has both journal " + "and inode journals!"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext4_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext4_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (!(journal->j_flags & JBD2_BARRIER)) + ext4_msg(sb, KERN_INFO, "barriers disabled"); + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = jbd2_journal_update_format(journal); + if (err) { + ext4_msg(sb, KERN_ERR, "error updating journal"); + jbd2_journal_destroy(journal); + return err; + } + } + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) + err = jbd2_journal_wipe(journal, !really_read_only); + if (!err) { + char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) + memcpy(save, ((char *) es) + + EXT4_S_ERR_START, EXT4_S_ERR_LEN); + err = jbd2_journal_load(journal); + if (save) + memcpy(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN); + kfree(save); + } + + if (err) { + ext4_msg(sb, KERN_ERR, "error loading journal"); + jbd2_journal_destroy(journal); + return err; + } + + EXT4_SB(sb)->s_journal = journal; + ext4_clear_journal_err(sb, es); + + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + + /* Make sure we flush the recovery flag to disk. */ + ext4_commit_super(sb, 1); + } + + return 0; +} + +static int ext4_commit_super(struct super_block *sb, int sync) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; + + if (!sbh) + return error; + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + /* + * If the file system is mounted read-only, don't update the + * superblock write time. 
This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); + if (sb->s_bdev->bd_part) + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); + else + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); + sb->s_dirt = 0; + BUFFER_TRACE(sbh, "marking dirty"); + mark_buffer_dirty(sbh); + if (sync) { + error = sync_dirty_buffer(sbh); + if (error) + return error; + + error = buffer_write_io_error(sbh); + if (error) { + ext4_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } + return error; +} + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + BUG_ON(journal != NULL); + return; + } + jbd2_journal_lock_updates(journal); + if (jbd2_journal_flush(journal) < 0) + goto out; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + } + +out: + jbd2_journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + journal = EXT4_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext4_error() or ext4_abort() + */ + + j_errno = jbd2_journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext4_decode_error(sb, j_errno, nbuf); + ext4_warning(sb, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext4_warning(sb, "Marking fs in need of filesystem check."); + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, 1); + + jbd2_journal_clear_err(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. 
+ */ +int ext4_force_commit(struct super_block *sb) +{ + journal_t *journal; + int ret = 0; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT4_SB(sb)->s_journal; + if (journal) { + vfs_check_frozen(sb, SB_FREEZE_TRANS); + ret = ext4_journal_force_commit(journal); + } + + return ret; +} + +static void ext4_write_super(struct super_block *sb) +{ + lock_super(sb); + ext4_commit_super(sb, 1); + unlock_super(sb); +} + +static int ext4_sync_fs(struct super_block *sb, int wait) +{ + int ret = 0; + tid_t target; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(sbi->dio_unwritten_wq); + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { + if (wait) + jbd2_log_wait_commit(sbi->s_journal, target); + } + return ret; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + * + * Note that only this function cannot bring a filesystem to be in a clean + * state independently, because ext4 prevents a new handle from being started + * by @sb->s_frozen, which stays in an upper layer. It thus needs help from + * the upper layer. + */ +static int ext4_freeze(struct super_block *sb) +{ + int error = 0; + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT4_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); + + /* + * Don't clear the needs_recovery flag if we failed to flush + * the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); +out: + /* we rely on s_frozen to stop further updates */ + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + return error; +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static int ext4_unfreeze(struct super_block *sb) +{ + if (sb->s_flags & MS_RDONLY) + return 0; + + lock_super(sb); + /* Reset the needs_recovery flag before the fs is unlocked. 
*/ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + unlock_super(sb); + return 0; +} + +/* + * Structure to save mount options for ext4_remount's benefit + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + unsigned long s_mount_opt2; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +static int ext4_remount(struct super_block *sb, int *flags, char *data) +{ + struct ext4_super_block *es; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t n_blocks_count = 0; + unsigned long old_sb_flags; + struct ext4_mount_options old_opts; + int enable_quota = 0; + ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + int err = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + char *orig_data = kstrdup(data, GFP_KERNEL); + + /* Store the original options */ + lock_super(sb); + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_mount_opt2 = sbi->s_mount_opt2; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) + old_opts.s_qf_names[i] = sbi->s_qf_names[i]; +#endif + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb, NULL, &journal_ioprio, + &n_blocks_count, 1)) { + err = -EINVAL; + goto restore_opts; + } + + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + ext4_abort(sb, "Abort forced by user"); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + + if (sbi->s_journal) { + ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || + n_blocks_count > ext4_blocks_count(es)) { + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { + err = -EROFS; + goto restore_opts; + } + + if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && + (sbi->s_mount_state & EXT4_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + if (sbi->s_journal) + ext4_mark_recovery_complete(sb, es); + } else { + /* Make sure we can mount this feature set readwrite */ + if (!ext4_feature_set_ok(sb, 0)) { + err = -EROFS; + goto restore_opts; + } + /* + * Make sure the group descriptor checksums + * are sane. If they aren't, refuse to remount r/w. 
+ */ + for (g = 0; g < sbi->s_groups_count; g++) { + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, g, NULL); + + if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { + ext4_msg(sb, KERN_ERR, + "ext4_remount: Checksum for group %u failed (%u!=%u)", + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), + le16_to_cpu(gdp->bg_checksum)); + err = -EINVAL; + goto restore_opts; + } + } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount/remount for now. + */ + if (es->s_last_orphan) { + ext4_msg(sb, KERN_WARNING, "Couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount/remount instead"); + err = -EINVAL; + goto restore_opts; + } + + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + if (sbi->s_journal) + ext4_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if ((err = ext4_group_extend(sb, es, n_blocks_count))) + goto restore_opts; + if (!ext4_setup_super(sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } + enable_quota = 1; + } + } + + /* + * Reinitialize lazy itable initialization thread based on + * current settings + */ + if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) + ext4_unregister_li_request(sb); + else { + ext4_group_t first_not_zeroed; + first_not_zeroed = ext4_has_uninit_itable(sb); + ext4_register_li_request(sb, first_not_zeroed); + } + + ext4_setup_system_zone(sb); + if (sbi->s_journal == NULL) + ext4_commit_super(sb, 1); + +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + if (old_opts.s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(old_opts.s_qf_names[i]); +#endif + unlock_super(sb); + if (enable_quota) + dquot_resume(sb, -1); + + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); + kfree(orig_data); + return 0; + +restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_mount_opt2 = old_opts.s_mount_opt2; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + } +#endif + unlock_super(sb); + kfree(orig_data); + return err; +} + +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + u64 fsid; + s64 bfree; + + if (test_opt(sb, MINIX_DF)) { + sbi->s_overhead_last = 0; + } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + ext4_fsblk_t overhead = 0; + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. 
+ */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < ngroups; i++) { + overhead += ext4_bg_has_super(sb, i) + + ext4_bg_num_gdb(sb, i); + cond_resched(); + } + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += ngroups * (2 + sbi->s_itb_per_group); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = ext4_blocks_count(es); + } + + buf->f_type = EXT4_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; + bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + /* prevent underflow in case that few free space is available */ + buf->f_bfree = max_t(s64, bfree, 0); + buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); + if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); + buf->f_namelen = EXT4_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction + * before quota file is locked for write. Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext4_create() quota_sync() + * jbd2_journal_start() write_dquot() + * dquot_initialize() down(dqio_mutex) + * down(dqio_mutex) jbd2_journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +} + +static int ext4_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext4_journal_start(inode, + EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), + EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), + EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + return PTR_ERR(handle); + } + ret = dquot_release(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? 
*/ + if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext4_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext4_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext4_journal_start(sb->s_root->d_inode, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext4_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + EXT4_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->mnt->mnt_sb != sb) + return -EXDEV; + /* Journaling quota? */ + if (EXT4_SB(sb)->s_qf_names[type]) { + /* Quotafile not in fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext4_msg(sb, KERN_WARNING, + "Quota file not on filesystem root. " + "Journaled quota will not work"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (EXT4_SB(sb)->s_journal && + ext4_should_journal_data(path->dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err) + return err; + } + + return dquot_quota_on(sb, type, format_id, path); +} + +static int ext4_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + handle_t *handle; + + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) + sync_filesystem(sb); + + if (!inode) + goto out; + + /* Update modification times of quota files when userspace can + * start looking at them */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) + goto out; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + +out: + return dquot_quota_off(sb, type); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? 
+ sb->s_blocksize - offset : toread; + bh = ext4_bread(NULL, inode, blk, 0, &err); + if (err) + return err; + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + if (EXT4_SB(sb)->s_journal && !handle) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + bh = ext4_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); +out: + if (err) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off + len) { + i_size_write(inode, off + len); + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + } + mutex_unlock(&inode->i_mutex); + return len; +} + +#endif + +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); +} + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext2(void) +{ + int err = register_filesystem(&ext2_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext2 (%d)\n", err); +} + +static inline void unregister_as_ext2(void) +{ + unregister_filesystem(&ext2_fs_type); +} + +static inline int ext2_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +MODULE_ALIAS("ext2"); +#else +static inline void register_as_ext2(void) { } +static inline void unregister_as_ext2(void) { } +static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext3(void) +{ + int err = register_filesystem(&ext3_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext3 (%d)\n", err); +} + +static inline void unregister_as_ext3(void) +{ + unregister_filesystem(&ext3_fs_type); +} + +static inline int ext3_feature_set_ok(struct super_block *sb) +{ + if 
(EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) + return 0; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +MODULE_ALIAS("ext3"); +#else +static inline void register_as_ext3(void) { } +static inline void unregister_as_ext3(void) { } +static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +static struct file_system_type ext4_fs_type = { + .owner = THIS_MODULE, + .name = "ext4", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init ext4_init_feat_adverts(void) +{ + struct ext4_features *ef; + int ret = -ENOMEM; + + ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); + if (!ef) + goto out; + + ef->f_kobj.kset = ext4_kset; + init_completion(&ef->f_kobj_unregister); + ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, + "features"); + if (ret) { + kfree(ef); + goto out; + } + + ext4_feat = ef; + ret = 0; +out: + return ret; +} + +static void ext4_exit_feat_adverts(void) +{ + kobject_put(&ext4_feat->f_kobj); + wait_for_completion(&ext4_feat->f_kobj_unregister); + kfree(ext4_feat); +} + +/* Shared across all ext4 file systems */ +wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +static int __init ext4_init_fs(void) +{ + int i, err; + + ext4_check_flag_values(); + + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { + mutex_init(&ext4__aio_mutex[i]); + init_waitqueue_head(&ext4__ioend_wq[i]); + } + + err = ext4_init_pageio(); + if (err) + return err; + err = ext4_init_system_zone(); + if (err) + goto out7; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) + goto out6; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); + if (!ext4_proc_root) + goto out5; + + err = ext4_init_feat_adverts(); + if (err) + goto out4; + + err = ext4_init_mballoc(); + if (err) + goto out3; + + err = ext4_init_xattr(); + if (err) + goto out2; + err = init_inodecache(); + if (err) + goto out1; + register_as_ext3(); + register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + + ext4_li_info = NULL; + mutex_init(&ext4_li_mtx); + return 0; +out: + unregister_as_ext2(); + unregister_as_ext3(); + destroy_inodecache(); +out1: + ext4_exit_xattr(); +out2: + ext4_exit_mballoc(); +out3: + ext4_exit_feat_adverts(); +out4: + remove_proc_entry("fs/ext4", NULL); +out5: + kset_unregister(ext4_kset); +out6: + ext4_exit_system_zone(); +out7: + ext4_exit_pageio(); + return err; +} + +static void __exit ext4_exit_fs(void) +{ + ext4_destroy_lazyinit_thread(); + unregister_as_ext2(); + unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); + destroy_inodecache(); + ext4_exit_xattr(); + ext4_exit_mballoc(); + ext4_exit_feat_adverts(); + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); + ext4_exit_system_zone(); + ext4_exit_pageio(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); +MODULE_LICENSE("GPL"); +module_init(ext4_init_fs) +module_exit(ext4_exit_fs) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c new file mode 100644 index 00000000..ed9354af --- /dev/null +++ b/fs/ext4/symlink.c @@ -0,0 +1,56 @@ +/* + * linux/fs/ext4/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. 
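ext4_init_fs() above is a textbook case of staged initialisation with reverse-order unwinding: every setup step gets its own error label, and a failure tears down only what was already built. (As shown, the kset_create_and_add() and proc_mkdir() failure paths reach the unwind chain with err still holding zero, so those failures would not be reported to the module loader.) A distilled, self-contained sketch of the idiom; step_a/b/c are stand-in names for steps such as pageio, mballoc and the inode cache, not real functions:

    /* illustrative only: each step is a trivial stub here */
    static int step_a_init(void) { return 0; }
    static void step_a_exit(void) { }
    static int step_b_init(void) { return 0; }
    static void step_b_exit(void) { }
    static int step_c_init(void) { return 0; }

    static int example_init(void)
    {
            int err;

            err = step_a_init();
            if (err)
                    return err;
            err = step_b_init();
            if (err)
                    goto out_a;
            err = step_c_init();
            if (err)
                    goto out_b;
            return 0;

    out_b:
            step_b_exit();          /* undo in reverse order of setup */
    out_a:
            step_a_exit();
            return err;
    }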
AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 symlink handling code + */ + +#include +#include +#include +#include "ext4.h" +#include "xattr.h" + +static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); + nd_set_link(nd, (char *) ei->i_data); + return NULL; +} + +const struct inode_operations ext4_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext4_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext4_follow_link, + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c new file mode 100644 index 00000000..c2865cc3 --- /dev/null +++ b/fs/ext4/xattr.c @@ -0,0 +1,1612 @@ +/* + * linux/fs/ext4/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * Fix by Harrison Xing . + * Ext4 code with a lot of help from Eric Jarman . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * ea-in-inode support by Alex Tomas aka bzzz + * and Andreas Gruenbacher . + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. 
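A minimal user-space sketch of the layout just described (descriptors grow downwards, values grow upwards, and the descriptor list ends with four zero bytes). The struct and macro names here (xentry, XLEN, XNEXT, XLAST) are illustrative stand-ins, not the kernel's definitions, and the descriptor only loosely mirrors struct ext4_xattr_entry:

    #include <stdint.h>
    #include <stddef.h>

    struct xentry {                     /* simplified entry descriptor */
            uint8_t  name_len;
            uint8_t  name_index;
            uint16_t value_offs;        /* value location, from block start */
            uint32_t value_block;       /* unused: the value is in this block */
            uint32_t value_size;
            uint32_t hash;
            char     name[];            /* not NUL-terminated */
    };

    /* descriptors are 4-byte aligned; the list ends with four zero bytes */
    #define XPAD       4
    #define XLEN(nlen) ((sizeof(struct xentry) + (nlen) + XPAD - 1) & ~(XPAD - 1))
    #define XNEXT(e)   ((struct xentry *)((char *)(e) + XLEN((e)->name_len)))
    #define XLAST(e)   (*(const uint32_t *)(e) == 0)

    /* walk every entry in a block; each value sits at value_offs */
    static void walk(void *block, struct xentry *first)
    {
            for (struct xentry *e = first; !XLAST(e); e = XNEXT(e)) {
                    char *value = (char *)block + e->value_offs;
                    (void)value;        /* value_size bytes of payload */
            }
    }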
+ */ + +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#ifdef EXT4_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%lu: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) +#endif + +static void ext4_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, + struct mb_cache_entry **); +static void ext4_xattr_rehash(struct ext4_xattr_header *, + struct ext4_xattr_entry *); +static int ext4_xattr_list(struct dentry *dentry, char *buffer, + size_t buffer_size); + +static struct mb_cache *ext4_xattr_cache; + +static const struct xattr_handler *ext4_xattr_handler_map[] = { + [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, +#endif + [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_SECURITY + [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext4_xattr_handlers[] = { + &ext4_xattr_user_handler, + &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + &ext4_xattr_acl_access_handler, + &ext4_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT4_FS_SECURITY + &ext4_xattr_security_handler, +#endif + NULL +}; + +static inline const struct xattr_handler * +ext4_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) + handler = ext4_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_mutex: don't care + */ +ssize_t +ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext4_xattr_list(dentry, buffer, size); +} + +static int +ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) +{ + while (!IS_LAST_ENTRY(entry)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); + if ((void *)next >= end) + return -EIO; + entry = next; + } + return 0; +} + +static inline int +ext4_xattr_check_block(struct buffer_head *bh) +{ + int error; + + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + return error; +} + +static inline int +ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 0; +} + +static int +ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext4_xattr_entry *entry; + size_t name_len; + int 
cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; + if (!cmp && ext4_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +static int +ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext4_xattr_entry *entry; + size_t size; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(bh)) { +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(bh); + entry = BFIRST(bh); + error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +static int +ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + size_t size; + void *end; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return -ENODATA; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(entry, end); + if (error) + goto cleanup; + error = ext4_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. 
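The calling convention the comment describes is a two-pass one: call with a NULL buffer to learn the size, allocate, then call again to copy the value. A sketch of a hypothetical in-kernel caller; the attribute name "example" and the GFP_NOFS allocation are illustrative assumptions, not something this patch defines:

    /* hypothetical caller, not part of this patch */
    static int example_read_user_xattr(struct inode *inode)
    {
            char *buf;
            int size, err;

            size = ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER,
                                  "example", NULL, 0);
            if (size < 0)
                    return size;            /* -ENODATA, -EIO, ... */

            buf = kmalloc(size, GFP_NOFS);
            if (!buf)
                    return -ENOMEM;

            err = ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER,
                                 "example", buf, size);
            kfree(buf);
            return err < 0 ? err : 0;
    }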
+ */ +int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + down_read(&EXT4_I(inode)->xattr_sem); + error = ext4_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext4_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT4_I(inode)->xattr_sem); + return error; +} + +static int +ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext4_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +static int +ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(bh); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + void *end; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return 0; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(IFIRST(header), end); + if (error) + goto cleanup; + error = ext4_xattr_list_entries(dentry, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +static int +ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int ret, ret2; + + down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; + } + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: + up_read(&EXT4_I(dentry->d_inode)->xattr_sem); + return ret; +} + +/* + * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. 
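Each handler's ->list callback packs a prefixed, NUL-terminated attribute name into the caller's buffer and returns the length it needs, which is how the two *_list helpers above concatenate the in-inode and block entries. A stand-alone sketch of that packing scheme; the names "user.comment" and "user.origin" are made up for the demonstration:

    #include <stdio.h>
    #include <string.h>

    /* append prefix + name + '\0' to buf, returning the bytes needed */
    static size_t emit_name(char *buf, size_t rest,
                            const char *prefix, const char *name)
    {
            size_t plen = strlen(prefix), nlen = strlen(name);
            size_t total = plen + nlen + 1;

            if (buf && total <= rest) {
                    memcpy(buf, prefix, plen);
                    memcpy(buf + plen, name, nlen);
                    buf[plen + nlen] = '\0';
            }
            return total;
    }

    int main(void)
    {
            char buf[64];
            size_t used = 0;

            used += emit_name(buf + used, sizeof(buf) - used, "user.", "comment");
            used += emit_name(buf + used, sizeof(buf) - used, "user.", "origin");
            printf("listxattr buffer uses %zu bytes\n", used);
            return 0;
    }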
+ */ +static void ext4_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) + return; + + if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { + EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); + ext4_handle_dirty_super(handle, sb); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement + * it; otherwise free the block. + */ +static void +ext4_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + int error = 0; + + ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bh); + if (error) + goto out; + + lock_buffer(bh); + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + get_bh(bh); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + unlock_buffer(bh); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + dquot_free_block(inode, 1); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + } +out: + ext4_std_error(inode->i_sb, error); + return; +} + +/* + * Find the available free space for EAs. This also returns the total number of + * bytes used by EA entries. + */ +static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, + size_t *min_offs, void *base, int *total) +{ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + *total += EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; + } + } + return (*min_offs - ((void *)last - base) - sizeof(__u32)); +} + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +static int +ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +{ + struct ext4_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT4_XATTR_SIZE(size); + } + free += EXT4_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT4_XATTR_SIZE(i->value_len) || + free < EXT4_XATTR_LEN(name_len) + + EXT4_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. 
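The space check in ext4_xattr_set_entry() reduces to simple arithmetic: the gap between the end of the descriptor list (plus its four-byte terminator) and the lowest value offset must hold both the new padded descriptor and the new padded value. A small worked example with invented sizes, assuming a 4 KB block, a 32-byte block header and a 16-byte descriptor before the name:

    #include <stdio.h>

    #define PAD(x)  (((x) + 3) & ~3u)               /* 4-byte alignment */
    #define DESC_SZ 16                              /* descriptor without name */

    int main(void)
    {
            unsigned int entries_end = 32 + PAD(DESC_SZ + 7); /* header + 1 entry */
            unsigned int min_offs = 4096 - PAD(29);           /* one 29-byte value */
            unsigned int avail = min_offs - entries_end - 4;  /* 4: terminator */

            unsigned int name_len = 9, value_len = 200;
            unsigned int need = PAD(DESC_SZ + name_len) + PAD(value_len);

            printf("avail=%u need=%u -> %s\n", avail, need,
                   need <= avail ? "fits" : "-ENOSPC");
            return 0;
    }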
*/ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT4_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT4_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT4_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT4_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); + } + } + return 0; +} + +struct ext4_xattr_block_find { + struct ext4_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT4_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext4_xattr_check_block(bs->bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. 
*/ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext4_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext4_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext4_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error = 0; + +#define header(x) ((struct ext4_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), + s->here); + ext4_xattr_cache_insert(bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext4_handle_dirty_metadata(handle, + inode, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); + ext4_handle_release_buffer(handle, bs->bh); + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext4_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. 
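ext4_xattr_block_set() only edits the EA block in place when h_refcount is 1; a shared block is cloned into private memory, modified there, and the inode is later pointed at another block while the old one simply loses a reference. A schematic of that copy-on-write decision with simplified types standing in for buffer heads; struct ea_block and get_writable() are invented for the sketch:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct ea_block {
            uint32_t refcount;
            char     data[4096];
    };

    /* return a block we may modify; *cloned is set when we had to copy */
    static struct ea_block *get_writable(struct ea_block *blk, int *cloned)
    {
            struct ea_block *copy;

            if (blk->refcount == 1) {       /* exclusive: edit in place */
                    *cloned = 0;
                    return blk;
            }

            copy = malloc(sizeof(*copy));   /* shared: clone before editing */
            if (!copy)
                    return NULL;
            memcpy(copy, blk, sizeof(*copy));
            copy->refcount = 1;
            *cloned = 1;
            /* the old block keeps its data; one reference on it is dropped
             * only after the inode has been switched to the new block */
            return copy;
    }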
*/ + error = dquot_alloc_block(inode, 1); + if (error) + goto cleanup; + error = ext4_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext4_fsblk_t goal, block; + + goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read((&EXT4_I(inode)->i_data_sem)); + block = ext4_new_meta_blocks(handle, inode, goal, 0, + NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); + if (error) + goto cleanup; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); + + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { +getblk_failed: + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + error = ext4_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext4_xattr_cache_insert(new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, new_bh); + if (error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext4_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + dquot_free_block(inode, 1); + goto cleanup; + +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + +static int +ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext4_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + error = ext4_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; + /* Find the named attribute. 
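The "inserted:" path above is block-level deduplication: the candidate block's hash is looked up in the cache and, when a byte-identical block already exists on disk, its reference count is bumped instead of allocating a fresh block. A toy version of that lookup; the linear-scan cache here is invented, whereas the kernel keys an mb_cache by the block hash:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    struct cached_block {
            uint32_t hash;
            uint32_t refcount;
            char     data[4096];
    };

    /* find a reusable identical block, bumping its refcount on success */
    static struct cached_block *
    dedup_find(struct cached_block *cache, size_t n,
               const char *data, uint32_t hash)
    {
            size_t i;

            for (i = 0; i < n; i++) {
                    if (cache[i].hash != hash)
                            continue;               /* cheap reject */
                    if (memcmp(cache[i].data, data, sizeof(cache[i].data)))
                            continue;               /* hash collision */
                    cache[i].refcount++;            /* share this block */
                    return &cache[i];
            }
            return NULL;                            /* caller allocates anew */
    }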
*/ + error = ext4_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +static int +ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +/* + * ext4_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext4_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + unsigned long no_expand; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT4_I(inode)->xattr_sem); + no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + goto cleanup; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { + struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + } + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + if (!value) { + if (!is.s.not_found) + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } + error = ext4_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = 
ext4_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext4_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = ext4_current_time(inode); + if (!value) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext4_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + if (no_expand == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + +/* + * ext4_xattr_set() + * + * Like ext4_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext4_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * Shift the EA entries in the inode to create space for the increased + * i_extra_isize. + */ +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, + int value_offs_shift, void *to, + void *from, size_t n, int blocksize) +{ + struct ext4_xattr_entry *last = entry; + int new_offs; + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + BUG_ON(new_offs + le32_to_cpu(last->e_value_size) + > blocksize); + last->e_value_offs = cpu_to_le16(new_offs); + } + } + /* Shift the entries by n bytes */ + memmove(to, from, n); +} + +/* + * Expand an inode by new_extra_isize bytes when EAs are present. + * Returns 0 on success or negative error number on failure. + */ +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry, *last, *first; + struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t min_offs, free; + int total_ino, total_blk; + void *base, *start, *end; + int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + + down_write(&EXT4_I(inode)->xattr_sem); +retry: + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + } + + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* + * Check if enough free space is available in the inode to shift the + * entries ahead by new_extra_isize. 
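ext4_xattr_set() above wraps the handle-based variant in its own transaction and retries when the allocator reports -ENOSPC, a common ext4 idiom. A compact sketch of that pattern; some_journalled_operation() is a placeholder for whatever work needs the handle, not a real function:

    /* placeholder for the journalled work done under the handle */
    static int some_journalled_operation(handle_t *handle, struct inode *inode);

    static int do_with_retries(struct inode *inode)
    {
            handle_t *handle;
            int error, error2, retries = 0;

    retry:
            handle = ext4_journal_start(inode,
                                        EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
            if (IS_ERR(handle))
                    return PTR_ERR(handle);

            error = some_journalled_operation(handle, inode);
            error2 = ext4_journal_stop(handle);

            /* a failed allocation may succeed after the journal commits */
            if (error == -ENOSPC &&
                ext4_should_retry_alloc(inode->i_sb, &retries))
                    goto retry;
            return error ? error : error2;
    }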
+ */ + + base = start = entry; + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + min_offs = end - base; + last = entry; + total_ino = sizeof(struct ext4_xattr_ibody_header); + + free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + if (free >= new_extra_isize) { + entry = IFIRST(header); + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino, + inode->i_sb->s_blocksize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + error = 0; + goto cleanup; + } + + /* + * Enough free space isn't available in the inode, check if + * EA block can hold new_extra_isize bytes. + */ + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + if (ext4_xattr_check_block(bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + base = BHDR(bh); + first = BFIRST(bh); + end = bh->b_data + bh->b_size; + min_offs = end - base; + free = ext4_xattr_free_space(first, &min_offs, base, + &total_blk); + if (free < new_extra_isize) { + if (!tried_min_extra_isize && s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; + } + error = -1; + goto cleanup; + } + } else { + free = inode->i_sb->s_blocksize; + } + + while (new_extra_isize > 0) { + size_t offs, size, entry_size; + struct ext4_xattr_entry *small_entry = NULL; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + }; + unsigned int total_size; /* EA entry size + value size */ + unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ + unsigned int min_total_size = ~0U; + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + if (!is || !bs) { + error = -ENOMEM; + goto cleanup; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + entry = NULL; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= free && total_size < min_total_size) { + if (total_size < new_extra_isize) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry) { + entry = small_entry; + } else { + if (!tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + goto retry; + } + error = -1; + goto cleanup; + } + } + offs = le16_to_cpu(entry->e_value_offs); + size = le32_to_cpu(entry->e_value_size); + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + i.name_index = entry->e_name_index, + buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!buffer || !b_entry_name) { + error = -ENOMEM; + goto cleanup; + } + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + offs, + EXT4_XATTR_SIZE(size)); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto cleanup; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto 
cleanup; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto cleanup; + + entry = IFIRST(header); + if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) + shift_bytes = new_extra_isize; + else + shift_bytes = entry_size + size; + /* Adjust the offsets and shift the remaining entries ahead */ + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - + shift_bytes, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, + (void *)header, total_ino - entry_size, + inode->i_sb->s_blocksize); + + extra_isize += shift_bytes; + new_extra_isize -= shift_bytes; + EXT4_I(inode)->i_extra_isize = extra_isize; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto cleanup; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto cleanup; + kfree(b_entry_name); + kfree(buffer); + b_entry_name = NULL; + buffer = NULL; + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + } + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + +cleanup: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + + + +/* + * ext4_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. + */ +void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + ext4_xattr_release_block(handle, inode, bh); + EXT4_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext4_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext4_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + +/* + * ext4_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext4_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext4_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. 
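ext4_xattr_cache_insert() above treats -EBUSY from the cache as success, since another task may already have inserted the same block. The convention is easy to get wrong, so here it is isolated; xcache and xcache_insert() are invented names for the sketch, not the mbcache API:

    #include <errno.h>
    #include <stdint.h>

    struct xcache;                              /* opaque, illustrative */
    int xcache_insert(struct xcache *c, uint64_t block, uint32_t hash);

    /* insert unless an identical entry is already cached */
    static int insert_unless_present(struct xcache *c, uint64_t block,
                                     uint32_t hash)
    {
            int err = xcache_insert(c, block, hash);

            if (err == -EBUSY)
                    return 0;       /* someone else cached it first: fine */
            return err;             /* 0 on success, -ENOMEM propagated */
    }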
+ */ +static int +ext4_xattr_cmp(struct ext4_xattr_header *header1, + struct ext4_xattr_header *header2) +{ + struct ext4_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT4_XATTR_NEXT(entry1); + entry2 = EXT4_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext4_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. + */ +static struct buffer_head * +ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT4_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %lu refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT4_XATTR_REFCOUNT_MAX); + } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext4_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n = 0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext4_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. 
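The entry hash computed below is a rotate-and-xor scheme: a 5-bit rotation over the name bytes, then a 16-bit rotation over the value words, and the block hash folds the per-entry hashes together with the same 16-bit shift. A stand-alone version of the name part; the attribute name "selinux" is only sample input, and bytes are treated as unsigned, which matches ASCII names:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NAME_HASH_SHIFT 5

    static uint32_t name_hash(const char *name, size_t len)
    {
            uint32_t hash = 0;
            size_t n;

            for (n = 0; n < len; n++)
                    hash = (hash << NAME_HASH_SHIFT) ^
                           (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
                           (uint8_t)name[n];
            return hash;
    }

    int main(void)
    {
            const char *name = "selinux";       /* sample attribute name */

            printf("name hash = %#x\n", (unsigned int)name_hash(name, strlen(name)));
            return 0;
    }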
+ */ +static void ext4_xattr_rehash(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_entry *here; + __u32 hash = 0; + + ext4_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT4_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +ext4_init_xattr(void) +{ + ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); + if (!ext4_xattr_cache) + return -ENOMEM; + return 0; +} + +void +ext4_exit_xattr(void) +{ + if (ext4_xattr_cache) + mb_cache_destroy(ext4_xattr_cache); + ext4_xattr_cache = NULL; +} diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h new file mode 100644 index 00000000..25b7387f --- /dev/null +++ b/fs/ext4/xattr.h @@ -0,0 +1,155 @@ +/* + File: fs/ext4/xattr.h + + On-disk format of extended attributes for the ext4 filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +/* Magic value in attribute blocks */ +#define EXT4_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT4_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT4_XATTR_INDEX_USER 1 +#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT4_XATTR_INDEX_TRUSTED 4 +#define EXT4_XATTR_INDEX_LUSTRE 5 +#define EXT4_XATTR_INDEX_SECURITY 6 + +struct ext4_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext4_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext4_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT4_XATTR_PAD_BITS 2 +#define EXT4_XATTR_PAD (1<e_name_len))) +#define EXT4_XATTR_SIZE(size) \ + (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) + +#define IHDR(inode, raw_inode) \ + ((struct ext4_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT4_GOOD_OLD_INODE_SIZE + \ + EXT4_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + +# ifdef CONFIG_EXT4_FS_XATTR + +extern const struct xattr_handler ext4_xattr_user_handler; +extern const struct xattr_handler ext4_xattr_trusted_handler; +extern const struct xattr_handler ext4_xattr_acl_access_handler; +extern const struct xattr_handler ext4_xattr_acl_default_handler; +extern const struct xattr_handler ext4_xattr_security_handler; + +extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); + +extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern void ext4_xattr_put_super(struct 
super_block *); + +extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); + +extern int __init ext4_init_xattr(void); +extern void ext4_exit_xattr(void); + +extern const struct xattr_handler *ext4_xattr_handlers[]; + +# else /* CONFIG_EXT4_FS_XATTR */ + +static inline int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext4_xattr_put_super(struct super_block *sb) +{ +} + +static __init inline int +ext4_init_xattr(void) +{ + return 0; +} + +static inline void +ext4_exit_xattr(void) +{ +} + +static inline int +ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + return -EOPNOTSUPP; +} + +#define ext4_xattr_handlers NULL + +# endif /* CONFIG_EXT4_FS_XATTR */ + +#ifdef CONFIG_EXT4_FS_SECURITY +extern int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr); +#else +static inline int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c new file mode 100644 index 00000000..007c3bfb --- /dev/null +++ b/fs/ext4/xattr_security.c @@ -0,0 +1,78 @@ +/* + * linux/fs/ext4/xattr_security.c + * Handler for storing security labels as extended attributes. 
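The second half of xattr.h, just above, uses the usual compile-out idiom: with CONFIG_EXT4_FS_XATTR disabled, every entry point collapses to a static inline stub returning -EOPNOTSUPP (or doing nothing), so call sites build without any #ifdefs of their own. A distilled, self-contained version of the pattern with an invented feature name:

    #include <errno.h>

    struct myfs_inode;                          /* opaque for this sketch */

    #ifdef CONFIG_MYFS_FEATURE
    int myfs_feature_op(struct myfs_inode *inode, int arg);
    #else
    static inline int myfs_feature_op(struct myfs_inode *inode, int arg)
    {
            return -EOPNOTSUPP;         /* uniform "not supported" answer */
    }
    #endif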
+ */ + +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int +ext4_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +int +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, + name, value, len, 0); + kfree(name); + kfree(value); + return err; +} + +const struct xattr_handler ext4_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext4_xattr_security_list, + .get = ext4_xattr_security_get, + .set = ext4_xattr_security_set, +}; diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c new file mode 100644 index 00000000..37e6ebca --- /dev/null +++ b/fs/ext4/xattr_trusted.c @@ -0,0 +1,59 @@ +/* + * linux/fs/ext4/xattr_trusted.c + * Handler for trusted extended attributes. 
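ext4_init_security() above expects to run while the inode-creation transaction is still open, so the label supplied by the LSM is written under the same handle as the rest of the create; -EOPNOTSUPP from the security layer is swallowed because "no label" is not an error. A sketch of a hypothetical call site (the real callers live in ext4's create/mkdir paths, which are not part of this file):

    /* hypothetical call site: `handle` comes from the create transaction */
    static int example_create_tail(handle_t *handle, struct inode *inode,
                                   struct inode *dir, const struct qstr *qstr)
    {
            int err;

            err = ext4_init_security(handle, inode, dir, qstr);
            /* -EOPNOTSUPP was already filtered out inside ext4_init_security,
             * so any error here is real and should abort the create */
            return err;
    }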
+ * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int +ext4_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext4_xattr_trusted_list, + .get = ext4_xattr_trusted_get, + .set = ext4_xattr_trusted_set, +}; diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c new file mode 100644 index 00000000..98c37535 --- /dev/null +++ b/fs/ext4/xattr_user.c @@ -0,0 +1,62 @@ +/* + * linux/fs/ext4/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext4_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext4_xattr_user_list, + .get = ext4_xattr_user_get, + .set = ext4_xattr_user_set, +}; diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig new file mode 100644 index 00000000..182f9ffe --- /dev/null +++ b/fs/fat/Kconfig @@ -0,0 +1,100 @@ +config FAT_FS + tristate + select NLS + help + If you want to use one of the FAT-based file systems (the MS-DOS and + VFAT (Windows 95) file systems), then you must say Y or M here + to include FAT support. 
You will then be able to mount partitions or + diskettes with FAT-based file systems and transparently access the + files on them, i.e. MSDOS files will look and behave just like all + other Unix files. + + This FAT support is not a file system in itself, it only provides + the foundation for the other file systems. You will have to say Y or + M to at least one of "MSDOS fs support" or "VFAT fs support" in + order to make use of it. + + Another way to read and write MSDOS floppies and hard drive + partitions from within Linux (but not transparently) is with the + mtools ("man mtools") program suite. You don't need to say Y here in + order to do that. + + If you need to move large files on floppies between a DOS and a + Linux box, say Y here, mount the floppy under Linux with an MSDOS + file system and use GNU tar's M option. GNU tar is a program + available for Unix and DOS ("man tar" or "info tar"). + + The FAT support will enlarge your kernel by about 37 KB. If unsure, + say Y. + + To compile this as a module, choose M here: the module will be called + fat. Note that if you compile the FAT support as a module, you + cannot compile any of the FAT-based file systems into the kernel + -- they will have to be modules as well. + +config MSDOS_FS + tristate "MSDOS fs support" + select FAT_FS + help + This allows you to mount MSDOS partitions of your hard drive (unless + they are compressed; to access compressed MSDOS partitions under + Linux, you can either use the DOS emulator DOSEMU, described in the + DOSEMU-HOWTO, available from + , or try dmsdosfs in + . If you + intend to use dosemu with a non-compressed MSDOS partition, say Y + here) and MSDOS floppies. This means that file access becomes + transparent, i.e. the MSDOS files look and behave just like all + other Unix files. + + If you have Windows 95 or Windows NT installed on your MSDOS + partitions, you should use the VFAT file system (say Y to "VFAT fs + support" below), or you will not be able to see the long filenames + generated by Windows 95 / Windows NT. + + This option will enlarge your kernel by about 7 KB. If unsure, + answer Y. This will only work if you said Y to "DOS FAT fs support" + as well. To compile this as a module, choose M here: the module will + be called msdos. + +config VFAT_FS + tristate "VFAT (Windows-95) fs support" + select FAT_FS + help + This option provides support for normal Windows file systems with + long filenames. That includes non-compressed FAT-based file systems + used by Windows 95, Windows 98, Windows NT 4.0, and the Unix + programs from the mtools package. + + The VFAT support enlarges your kernel by about 10 KB and it only + works if you said Y to the "DOS FAT fs support" above. Please read + the file for details. If + unsure, say Y. + + To compile this as a module, choose M here: the module will be called + vfat. + +config FAT_DEFAULT_CODEPAGE + int "Default codepage for FAT" + depends on MSDOS_FS || VFAT_FS + default 437 + help + This option should be set to the codepage of your FAT filesystems. + It can be overridden with the "codepage" mount option. + See for more information. + +config FAT_DEFAULT_IOCHARSET + string "Default iocharset for FAT" + depends on VFAT_FS + default "iso8859-1" + help + Set this to the default input/output character set you'd + like FAT to use. It should probably match the character set + that most of your FAT filesystems use, and can be overridden + with the "iocharset" mount option for FAT filesystems. + Note that "utf8" is not recommended for FAT filesystems. 
+ If unsure, you shouldn't set "utf8" here. + See for more information. + + Enable any character sets you need in File Systems/Native Language + Support. diff --git a/fs/fat/Makefile b/fs/fat/Makefile new file mode 100644 index 00000000..e0619032 --- /dev/null +++ b/fs/fat/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the Linux fat filesystem support. +# + +obj-$(CONFIG_FAT_FS) += fat.o +obj-$(CONFIG_VFAT_FS) += vfat.o +obj-$(CONFIG_MSDOS_FS) += msdos.o + +fat-y := cache.o dir.o fatent.o file.o inode.o misc.o +vfat-y := namei_vfat.o +msdos-y := namei_msdos.o diff --git a/fs/fat/cache.c b/fs/fat/cache.c new file mode 100644 index 00000000..1cc7038e --- /dev/null +++ b/fs/fat/cache.c @@ -0,0 +1,351 @@ +/* + * linux/fs/fat/cache.c + * + * Written 1992,1993 by Werner Almesberger + * + * Mar 1999. AV. Changed cache, so that it uses the starting cluster instead + * of inode number. + * May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers. + */ + +#include +#include +#include +#include "fat.h" + +/* this must be > 0. */ +#define FAT_MAX_CACHE 8 + +struct fat_cache { + struct list_head cache_list; + int nr_contig; /* number of contiguous clusters */ + int fcluster; /* cluster number in the file. */ + int dcluster; /* cluster number on disk. */ +}; + +struct fat_cache_id { + unsigned int id; + int nr_contig; + int fcluster; + int dcluster; +}; + +static inline int fat_max_cache(struct inode *inode) +{ + return FAT_MAX_CACHE; +} + +static struct kmem_cache *fat_cache_cachep; + +static void init_once(void *foo) +{ + struct fat_cache *cache = (struct fat_cache *)foo; + + INIT_LIST_HEAD(&cache->cache_list); +} + +int __init fat_cache_init(void) +{ + fat_cache_cachep = kmem_cache_create("fat_cache", + sizeof(struct fat_cache), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (fat_cache_cachep == NULL) + return -ENOMEM; + return 0; +} + +void fat_cache_destroy(void) +{ + kmem_cache_destroy(fat_cache_cachep); +} + +static inline struct fat_cache *fat_cache_alloc(struct inode *inode) +{ + return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS); +} + +static inline void fat_cache_free(struct fat_cache *cache) +{ + BUG_ON(!list_empty(&cache->cache_list)); + kmem_cache_free(fat_cache_cachep, cache); +} + +static inline void fat_cache_update_lru(struct inode *inode, + struct fat_cache *cache) +{ + if (MSDOS_I(inode)->cache_lru.next != &cache->cache_list) + list_move(&cache->cache_list, &MSDOS_I(inode)->cache_lru); +} + +static int fat_cache_lookup(struct inode *inode, int fclus, + struct fat_cache_id *cid, + int *cached_fclus, int *cached_dclus) +{ + static struct fat_cache nohit = { .fcluster = 0, }; + + struct fat_cache *hit = &nohit, *p; + int offset = -1; + + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) { + /* Find the cache of "fclus" or nearest cache. 
*/ + if (p->fcluster <= fclus && hit->fcluster < p->fcluster) { + hit = p; + if ((hit->fcluster + hit->nr_contig) < fclus) { + offset = hit->nr_contig; + } else { + offset = fclus - hit->fcluster; + break; + } + } + } + if (hit != &nohit) { + fat_cache_update_lru(inode, hit); + + cid->id = MSDOS_I(inode)->cache_valid_id; + cid->nr_contig = hit->nr_contig; + cid->fcluster = hit->fcluster; + cid->dcluster = hit->dcluster; + *cached_fclus = cid->fcluster + offset; + *cached_dclus = cid->dcluster + offset; + } + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + + return offset; +} + +static struct fat_cache *fat_cache_merge(struct inode *inode, + struct fat_cache_id *new) +{ + struct fat_cache *p; + + list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) { + /* Find the same part as "new" in cluster-chain. */ + if (p->fcluster == new->fcluster) { + BUG_ON(p->dcluster != new->dcluster); + if (new->nr_contig > p->nr_contig) + p->nr_contig = new->nr_contig; + return p; + } + } + return NULL; +} + +static void fat_cache_add(struct inode *inode, struct fat_cache_id *new) +{ + struct fat_cache *cache, *tmp; + + if (new->fcluster == -1) /* dummy cache */ + return; + + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + if (new->id != FAT_CACHE_VALID && + new->id != MSDOS_I(inode)->cache_valid_id) + goto out; /* this cache was invalidated */ + + cache = fat_cache_merge(inode, new); + if (cache == NULL) { + if (MSDOS_I(inode)->nr_caches < fat_max_cache(inode)) { + MSDOS_I(inode)->nr_caches++; + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + + tmp = fat_cache_alloc(inode); + if (!tmp) { + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + MSDOS_I(inode)->nr_caches--; + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + return; + } + + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + cache = fat_cache_merge(inode, new); + if (cache != NULL) { + MSDOS_I(inode)->nr_caches--; + fat_cache_free(tmp); + goto out_update_lru; + } + cache = tmp; + } else { + struct list_head *p = MSDOS_I(inode)->cache_lru.prev; + cache = list_entry(p, struct fat_cache, cache_list); + } + cache->fcluster = new->fcluster; + cache->dcluster = new->dcluster; + cache->nr_contig = new->nr_contig; + } +out_update_lru: + fat_cache_update_lru(inode, cache); +out: + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); +} + +/* + * Cache invalidation occurs rarely, thus the LRU chain is not updated. It + * fixes itself after a while. + */ +static void __fat_cache_inval_inode(struct inode *inode) +{ + struct msdos_inode_info *i = MSDOS_I(inode); + struct fat_cache *cache; + + while (!list_empty(&i->cache_lru)) { + cache = list_entry(i->cache_lru.next, struct fat_cache, cache_list); + list_del_init(&cache->cache_list); + i->nr_caches--; + fat_cache_free(cache); + } + /* Update. The copy of caches before this id is discarded. 
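Each fat_cache entry above caches one contiguous run of clusters: file cluster fcluster maps to disk cluster dcluster, and the next nr_contig file clusters follow consecutively on disk. The lookup picks the cached run whose start is closest below the requested cluster and resumes the FAT chain walk from there. A standalone sketch of that mapping, assuming a single cached run and illustrative field names:

#include <stdio.h>

/* One cached contiguous run: file clusters [fcluster, fcluster + nr_contig]
 * map to disk clusters [dcluster, dcluster + nr_contig]. */
struct run { int fcluster, dcluster, nr_contig; };

/* Return the file/disk cluster pair to resume the chain walk from.
 * Assumes want >= r->fcluster, as the lookup above guarantees.
 * If the wanted cluster lies inside the run, the answer is exact. */
static void resume_point(const struct run *r, int want, int *fclus, int *dclus)
{
	int offset = want - r->fcluster;

	if (offset > r->nr_contig)
		offset = r->nr_contig;   /* past the run: start at its end */
	*fclus = r->fcluster + offset;
	*dclus = r->dcluster + offset;
}

int main(void)
{
	struct run r = { .fcluster = 10, .dcluster = 200, .nr_contig = 4 };
	int f, d;

	resume_point(&r, 12, &f, &d);    /* inside the run: exact hit */
	printf("want 12 -> file %d disk %d\n", f, d);
	resume_point(&r, 20, &f, &d);    /* beyond the run: walk on from its end */
	printf("want 20 -> file %d disk %d\n", f, d);
	return 0;
}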
*/ + i->cache_valid_id++; + if (i->cache_valid_id == FAT_CACHE_VALID) + i->cache_valid_id++; +} + +void fat_cache_inval_inode(struct inode *inode) +{ + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + __fat_cache_inval_inode(inode); + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); +} + +static inline int cache_contiguous(struct fat_cache_id *cid, int dclus) +{ + cid->nr_contig++; + return ((cid->dcluster + cid->nr_contig) == dclus); +} + +static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus) +{ + cid->id = FAT_CACHE_VALID; + cid->fcluster = fclus; + cid->dcluster = dclus; + cid->nr_contig = 0; +} + +int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) +{ + struct super_block *sb = inode->i_sb; + const int limit = sb->s_maxbytes >> MSDOS_SB(sb)->cluster_bits; + struct fat_entry fatent; + struct fat_cache_id cid; + int nr; + + BUG_ON(MSDOS_I(inode)->i_start == 0); + + *fclus = 0; + *dclus = MSDOS_I(inode)->i_start; + if (cluster == 0) + return 0; + + if (fat_cache_lookup(inode, cluster, &cid, fclus, dclus) < 0) { + /* + * dummy, always not contiguous + * This is reinitialized by cache_init(), later. + */ + cache_init(&cid, -1, -1); + } + + fatent_init(&fatent); + while (*fclus < cluster) { + /* prevent the infinite loop of cluster chain */ + if (*fclus > limit) { + fat_fs_error_ratelimit(sb, + "%s: detected the cluster chain loop" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); + nr = -EIO; + goto out; + } + + nr = fat_ent_read(inode, &fatent, *dclus); + if (nr < 0) + goto out; + else if (nr == FAT_ENT_FREE) { + fat_fs_error_ratelimit(sb, "%s: invalid cluster chain" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); + nr = -EIO; + goto out; + } else if (nr == FAT_ENT_EOF) { + fat_cache_add(inode, &cid); + goto out; + } + (*fclus)++; + *dclus = nr; + if (!cache_contiguous(&cid, *dclus)) + cache_init(&cid, *fclus, *dclus); + } + nr = 0; + fat_cache_add(inode, &cid); +out: + fatent_brelse(&fatent); + return nr; +} + +static int fat_bmap_cluster(struct inode *inode, int cluster) +{ + struct super_block *sb = inode->i_sb; + int ret, fclus, dclus; + + if (MSDOS_I(inode)->i_start == 0) + return 0; + + ret = fat_get_cluster(inode, cluster, &fclus, &dclus); + if (ret < 0) + return ret; + else if (ret == FAT_ENT_EOF) { + fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); + return -EIO; + } + return dclus; +} + +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const unsigned long blocksize = sb->s_blocksize; + const unsigned char blocksize_bits = sb->s_blocksize_bits; + sector_t last_block; + int cluster, offset; + + *phys = 0; + *mapped_blocks = 0; + if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) { + if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) { + *phys = sector + sbi->dir_start; + *mapped_blocks = 1; + } + return 0; + } + + last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; + if (sector >= last_block) { + if (!create) + return 0; + + /* + * ->mmu_private can access on only allocation path. 
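fat_get_cluster() above resolves the n-th cluster of a file by walking the on-disk FAT chain link by link, and bails out if the walk exceeds the longest chain any legal file could have, since a corrupted FAT can form a cycle. A standalone sketch of the same walk over an in-memory FAT array; the EOF/FREE markers and walk_chain() are illustrative, not the kernel's:

#include <stdio.h>
#include <errno.h>

#define ENT_EOF  (-1)   /* illustrative end-of-chain marker */
#define ENT_FREE 0      /* illustrative free-cluster marker */

/* Follow the chain 'steps' links from 'start'. 'limit' bounds the walk so a
 * cyclic (corrupted) chain cannot loop forever. Returns the reached cluster,
 * or a negative errno-style value on corruption. */
static int walk_chain(const int *fat, int start, int steps, int limit)
{
	int clus = start;

	for (int i = 0; i < steps; i++) {
		if (i > limit)
			return -EIO;    /* chain longer than any legal file */
		int next = fat[clus];
		if (next == ENT_FREE)
			return -EIO;    /* chain runs into a free cluster */
		if (next == ENT_EOF)
			return clus;    /* shorter than requested: stop at EOF */
		clus = next;
	}
	return clus;
}

int main(void)
{
	/* fat[c] = next cluster of c; chain 2 -> 3 -> 5 -> EOF. */
	int fat[8] = { 0, 0, 3, 5, 0, ENT_EOF, 0, 0 };
	printf("2 links after cluster 2: %d\n", walk_chain(fat, 2, 2, 8));
	return 0;
}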
+ * (caller must hold ->i_mutex) + */ + last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + if (sector >= last_block) + return 0; + } + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + if (cluster < 0) + return cluster; + else if (cluster) { + *phys = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + return 0; +} diff --git a/fs/fat/dir.c b/fs/fat/dir.c new file mode 100644 index 00000000..dc563788 --- /dev/null +++ b/fs/fat/dir.c @@ -0,0 +1,1371 @@ +/* + * linux/fs/fat/dir.c + * + * directory handling functions for fat-based filesystems + * + * Written 1992,1993 by Werner Almesberger + * + * Hidden files 1995 by Albert Cahalan + * + * VFAT extensions by Gordon Chaffee + * Merged with msdos fs by Henrik Storner + * Rewritten for constant inumbers. Plugged buffer overrun in readdir(). AV + * Short name translation 1999, 2001 by Wolfram Pienkoss + */ + +#include +#include +#include +#include +#include +#include +#include +#include "fat.h" + +/* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + +static inline loff_t fat_make_i_pos(struct super_block *sb, + struct buffer_head *bh, + struct msdos_dir_entry *de) +{ + return ((loff_t)bh->b_blocknr << MSDOS_SB(sb)->dir_per_block_bits) + | (de - (struct msdos_dir_entry *)bh->b_data); +} + +static inline void fat_dir_readahead(struct inode *dir, sector_t iblock, + sector_t phys) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + int sec; + + /* This is not a first sector of cluster, or sec_per_clus == 1 */ + if ((iblock & (sbi->sec_per_clus - 1)) || sbi->sec_per_clus == 1) + return; + /* root dir of FAT12/FAT16 */ + if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO)) + return; + + bh = sb_find_get_block(sb, phys); + if (bh == NULL || !buffer_uptodate(bh)) { + for (sec = 0; sec < sbi->sec_per_clus; sec++) + sb_breadahead(sb, phys + sec); + } + brelse(bh); +} + +/* Returns the inode number of the directory entry at offset pos. If bh is + non-NULL, it is brelse'd before. Pos is incremented. The buffer header is + returned in bh. + AV. Most often we do it item-by-item. Makes sense to optimize. + AV. OK, there we go: if both bh and de are non-NULL we assume that we just + AV. want the next entry (took one explicit de=NULL in vfat/namei.c). + AV. It's done in fat_get_entry() (inlined), here the slow case lives. + AV. Additionally, when we return -1 (i.e. reached the end of directory) + AV. we make bh NULL. 
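fat_bmap() above converts a file-relative sector into a device block in two steps: split the sector into a cluster index within the file plus a sector offset within that cluster, then resolve the file cluster to an on-disk cluster and add the offset. A standalone sketch of the arithmetic with made-up geometry (4 sectors per cluster, data area starting at sector 100; FAT numbers its data clusters from 2):

#include <stdio.h>

#define SEC_PER_CLUS   4     /* sectors per cluster (power of two) */
#define DATA_START     100   /* first device sector of the data area */
#define FAT_START_ENT  2     /* cluster numbering starts at 2 */

/* File-relative sector -> (file cluster index, sector offset in cluster). */
static void split_sector(unsigned sector, unsigned *fclus, unsigned *offset)
{
	*fclus = sector / SEC_PER_CLUS;        /* the kernel uses a shift, same idea */
	*offset = sector & (SEC_PER_CLUS - 1);
}

/* On-disk cluster number -> first device sector of that cluster. */
static unsigned clus_to_blknr(unsigned dclus)
{
	return (dclus - FAT_START_ENT) * SEC_PER_CLUS + DATA_START;
}

int main(void)
{
	unsigned fclus, offset;

	split_sector(9, &fclus, &offset);      /* sector 9 of the file */
	/* Pretend the FAT chain says that file cluster lives in disk cluster 7. */
	unsigned dclus = 7;
	printf("file sector 9 -> file cluster %u + %u -> device block %u\n",
	       fclus, offset, clus_to_blknr(dclus) + offset);
	return 0;
}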
+ */ +static int fat__get_entry(struct inode *dir, loff_t *pos, + struct buffer_head **bh, struct msdos_dir_entry **de) +{ + struct super_block *sb = dir->i_sb; + sector_t phys, iblock; + unsigned long mapped_blocks; + int err, offset; + +next: + if (*bh) + brelse(*bh); + + *bh = NULL; + iblock = *pos >> sb->s_blocksize_bits; + err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0); + if (err || !phys) + return -1; /* beyond EOF or error */ + + fat_dir_readahead(dir, iblock, phys); + + *bh = sb_bread(sb, phys); + if (*bh == NULL) { + fat_msg(sb, KERN_ERR, "Directory bread(block %llu) failed", + (llu)phys); + /* skip this block */ + *pos = (iblock + 1) << sb->s_blocksize_bits; + goto next; + } + + offset = *pos & (sb->s_blocksize - 1); + *pos += sizeof(struct msdos_dir_entry); + *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); + + return 0; +} + +static inline int fat_get_entry(struct inode *dir, loff_t *pos, + struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + /* Fast stuff first */ + if (*bh && *de && + (*de - (struct msdos_dir_entry *)(*bh)->b_data) < MSDOS_SB(dir->i_sb)->dir_per_block - 1) { + *pos += sizeof(struct msdos_dir_entry); + (*de)++; + return 0; + } + return fat__get_entry(dir, pos, bh, de); +} + +/* + * Convert Unicode 16 to UTF-8, translated Unicode, or ASCII. + * If uni_xlate is enabled and we can't get a 1:1 conversion, use a + * colon as an escape character since it is normally invalid on the vfat + * filesystem. The following four characters are the hexadecimal digits + * of Unicode value. This lets us do a full dump and restore of Unicode + * filenames. We could get into some trouble with long Unicode names, + * but ignore that right now. + * Ahem... Stack smashing in ring 0 isn't fun. Fixed. + */ +static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, + const wchar_t *uni, int len, struct nls_table *nls) +{ + int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate; + const wchar_t *ip; + wchar_t ec; + unsigned char *op; + int charlen; + + ip = uni; + op = ascii; + + while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { + ec = *ip++; + if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { + op += charlen; + len -= charlen; + } else { + if (uni_xlate == 1) { + *op++ = ':'; + op = pack_hex_byte(op, ec >> 8); + op = pack_hex_byte(op, ec); + len -= 5; + } else { + *op++ = '?'; + len--; + } + } + } + + if (unlikely(*ip)) { + fat_msg(sb, KERN_WARNING, "filename was truncated while " + "converting."); + } + + *op = 0; + return (op - ascii); +} + +static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni, + unsigned char *buf, int size) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + if (sbi->options.utf8) + return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, + UTF16_HOST_ENDIAN, buf, size); + else + return uni16_to_x8(sb, buf, uni, size, sbi->nls_io); +} + +static inline int +fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) +{ + int charlen; + + charlen = t->char2uni(c, clen, uni); + if (charlen < 0) { + *uni = 0x003f; /* a question mark */ + charlen = 1; + } + return charlen; +} + +static inline int +fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) +{ + int charlen; + wchar_t wc; + + charlen = t->char2uni(c, clen, &wc); + if (charlen < 0) { + *uni = 0x003f; /* a question mark */ + charlen = 1; + } else if (charlen <= 1) { + unsigned char nc = t->charset2lower[*c]; + + if (!nc) + nc = *c; + + if ( (charlen = t->char2uni(&nc, 1, uni)) < 0) { + *uni = 0x003f; /* a 
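When uni16_to_x8() above cannot represent a 16-bit code unit in the target charset, the uni_xlate option escapes it as a colon followed by four hex digits so the name can be recovered losslessly later. A standalone sketch of that escape; the to_charset() mappability test is a stand-in for the real nls uni2char conversion:

#include <stdio.h>
#include <wchar.h>

/* Stand-in for nls->uni2char(): pretend only ASCII is representable. */
static int to_charset(wchar_t wc, char *out)
{
	if (wc < 0x80) {
		*out = (char)wc;
		return 1;
	}
	return -1;                          /* not representable */
}

/* Emit 'wc' into 'op', escaping unmappable units as ":XXXX" (hex). */
static char *emit(char *op, wchar_t wc)
{
	char c;

	if (to_charset(wc, &c) > 0)
		*op++ = c;
	else
		op += sprintf(op, ":%02x%02x",
			      (unsigned)(wc >> 8) & 0xff, (unsigned)wc & 0xff);
	return op;
}

int main(void)
{
	char buf[32], *op = buf;
	const wchar_t name[] = { 'a', 0x00e9, 0 };   /* "a" plus e-acute */

	for (int i = 0; name[i]; i++)
		op = emit(op, name[i]);
	*op = '\0';
	printf("%s\n", buf);                /* prints a:00e9 */
	return 0;
}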
question mark */ + charlen = 1; + } + } else + *uni = wc; + + return charlen; +} + +static inline int +fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size, + wchar_t *uni_buf, unsigned short opt, int lower) +{ + int len = 0; + + if (opt & VFAT_SFN_DISPLAY_LOWER) + len = fat_short2lower_uni(nls, buf, buf_size, uni_buf); + else if (opt & VFAT_SFN_DISPLAY_WIN95) + len = fat_short2uni(nls, buf, buf_size, uni_buf); + else if (opt & VFAT_SFN_DISPLAY_WINNT) { + if (lower) + len = fat_short2lower_uni(nls, buf, buf_size, uni_buf); + else + len = fat_short2uni(nls, buf, buf_size, uni_buf); + } else + len = fat_short2uni(nls, buf, buf_size, uni_buf); + + return len; +} + +static inline int fat_name_match(struct msdos_sb_info *sbi, + const unsigned char *a, int a_len, + const unsigned char *b, int b_len) +{ + if (a_len != b_len) + return 0; + + if (sbi->options.name_check != 's') + return !nls_strnicmp(sbi->nls_io, a, b, a_len); + else + return !memcmp(a, b, a_len); +} + +enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; + +/** + * fat_parse_long - Parse extended directory entry. + * + * This function returns zero on success, negative value on error, or one of + * the following: + * + * %PARSE_INVALID - Directory entry is invalid. + * %PARSE_NOT_LONGNAME - Directory entry does not contain longname. + * %PARSE_EOF - Directory has no more entries. + */ +static int fat_parse_long(struct inode *dir, loff_t *pos, + struct buffer_head **bh, struct msdos_dir_entry **de, + wchar_t **unicode, unsigned char *nr_slots) +{ + struct msdos_dir_slot *ds; + unsigned char id, slot, slots, alias_checksum; + + if (!*unicode) { + *unicode = __getname(); + if (!*unicode) { + brelse(*bh); + return -ENOMEM; + } + } +parse_long: + slots = 0; + ds = (struct msdos_dir_slot *)*de; + id = ds->id; + if (!(id & 0x40)) + return PARSE_INVALID; + slots = id & ~0x40; + if (slots > 20 || !slots) /* ceil(256 * 2 / 26) */ + return PARSE_INVALID; + *nr_slots = slots; + alias_checksum = ds->alias_checksum; + + slot = slots; + while (1) { + int offset; + + slot--; + offset = slot * 13; + fat16_towchar(*unicode + offset, ds->name0_4, 5); + fat16_towchar(*unicode + offset + 5, ds->name5_10, 6); + fat16_towchar(*unicode + offset + 11, ds->name11_12, 2); + + if (ds->id & 0x40) + (*unicode)[offset + 13] = 0; + if (fat_get_entry(dir, pos, bh, de) < 0) + return PARSE_EOF; + if (slot == 0) + break; + ds = (struct msdos_dir_slot *)*de; + if (ds->attr != ATTR_EXT) + return PARSE_NOT_LONGNAME; + if ((ds->id & ~0x40) != slot) + goto parse_long; + if (ds->alias_checksum != alias_checksum) + goto parse_long; + } + if ((*de)->name[0] == DELETED_FLAG) + return PARSE_INVALID; + if ((*de)->attr == ATTR_EXT) + goto parse_long; + if (IS_FREE((*de)->name) || ((*de)->attr & ATTR_VOLUME)) + return PARSE_INVALID; + if (fat_checksum((*de)->name) != alias_checksum) + *nr_slots = 0; + + return 0; +} + +/* + * Return values: negative -> error, 0 -> not found, positive -> found, + * value is the total amount of slots, including the shortname entry. 
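fat_parse_long() above reassembles a VFAT long name from directory slots that each hold 13 UTF-16 units (in three fields of 5, 6 and 2), stored on disk ahead of the 8.3 entry with the highest-numbered slot first, bit 0x40 marking that final piece, and every slot carrying a checksum of the short alias. A standalone sketch of the slot bookkeeping for a given name length (the name length is an example value):

#include <stdio.h>

#define UNITS_PER_SLOT 13   /* 5 + 6 + 2 UTF-16 units per long-name slot */

int main(void)
{
	int name_len = 30;      /* UTF-16 units, example */
	int slots = (name_len + UNITS_PER_SLOT - 1) / UNITS_PER_SLOT;

	printf("%d units -> %d slot(s)\n", name_len, slots);

	/* Slots precede the short entry on disk, highest id first; the first
	 * slot on disk carries id | 0x40 to flag the end of the name. */
	for (int id = slots; id >= 1; id--) {
		int flag = (id == slots) ? 0x40 : 0;
		int offset = (id - 1) * UNITS_PER_SLOT;   /* position in the name */
		printf("slot id 0x%02x covers units %d..%d\n",
		       id | flag, offset, offset + UNITS_PER_SLOT - 1);
	}
	return 0;
}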
+ */ +int fat_search_long(struct inode *inode, const unsigned char *name, + int name_len, struct fat_slot_info *sinfo) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh = NULL; + struct msdos_dir_entry *de; + struct nls_table *nls_disk = sbi->nls_disk; + unsigned char nr_slots; + wchar_t bufuname[14]; + wchar_t *unicode = NULL; + unsigned char work[MSDOS_NAME]; + unsigned char bufname[FAT_MAX_SHORT_SIZE]; + unsigned short opt_shortname = sbi->options.shortname; + loff_t cpos = 0; + int chl, i, j, last_u, err, len; + + err = -ENOENT; + while (1) { + if (fat_get_entry(inode, &cpos, &bh, &de) == -1) + goto end_of_dir; +parse_record: + nr_slots = 0; + if (de->name[0] == DELETED_FLAG) + continue; + if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) + continue; + if (de->attr != ATTR_EXT && IS_FREE(de->name)) + continue; + if (de->attr == ATTR_EXT) { + int status = fat_parse_long(inode, &cpos, &bh, &de, + &unicode, &nr_slots); + if (status < 0) { + err = status; + goto end_of_dir; + } else if (status == PARSE_INVALID) + continue; + else if (status == PARSE_NOT_LONGNAME) + goto parse_record; + else if (status == PARSE_EOF) + goto end_of_dir; + } + + memcpy(work, de->name, sizeof(de->name)); + /* see namei.c, msdos_format_name */ + if (work[0] == 0x05) + work[0] = 0xE5; + for (i = 0, j = 0, last_u = 0; i < 8;) { + if (!work[i]) + break; + chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, + &bufuname[j++], opt_shortname, + de->lcase & CASE_LOWER_BASE); + if (chl <= 1) { + if (work[i] != ' ') + last_u = j; + } else { + last_u = j; + } + i += chl; + } + j = last_u; + fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); + for (i = 8; i < MSDOS_NAME;) { + if (!work[i]) + break; + chl = fat_shortname2uni(nls_disk, &work[i], + MSDOS_NAME - i, + &bufuname[j++], opt_shortname, + de->lcase & CASE_LOWER_EXT); + if (chl <= 1) { + if (work[i] != ' ') + last_u = j; + } else { + last_u = j; + } + i += chl; + } + if (!last_u) + continue; + + /* Compare shortname */ + bufuname[last_u] = 0x0000; + len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); + if (fat_name_match(sbi, name, name_len, bufname, len)) + goto found; + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + + /* Compare longname */ + len = fat_uni_to_x8(sb, unicode, longname, size); + if (fat_name_match(sbi, name, name_len, longname, len)) + goto found; + } + } + +found: + nr_slots++; /* include the de */ + sinfo->slot_off = cpos - nr_slots * sizeof(*de); + sinfo->nr_slots = nr_slots; + sinfo->de = de; + sinfo->bh = bh; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + err = 0; +end_of_dir: + if (unicode) + __putname(unicode); + + return err; +} + +EXPORT_SYMBOL_GPL(fat_search_long); + +struct fat_ioctl_filldir_callback { + void __user *dirent; + int result; + /* for dir ioctl */ + const char *longname; + int long_len; + const char *shortname; + int short_len; +}; + +static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, + filldir_t filldir, int short_only, int both) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + struct msdos_dir_entry *de; + struct nls_table *nls_disk = sbi->nls_disk; + unsigned char nr_slots; + wchar_t bufuname[14]; + wchar_t *unicode = NULL; + unsigned char c, work[MSDOS_NAME]; + unsigned char bufname[FAT_MAX_SHORT_SIZE], *ptname = bufname; + unsigned short opt_shortname = sbi->options.shortname; + int 
isvfat = sbi->options.isvfat; + int nocase = sbi->options.nocase; + const char *fill_name = NULL; + unsigned long inum; + unsigned long lpos, dummy, *furrfu = &lpos; + loff_t cpos; + int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len = 0; + int ret = 0; + + lock_super(sb); + + cpos = filp->f_pos; + /* Fake . and .. for the root directory. */ + if (inode->i_ino == MSDOS_ROOT_INO) { + while (cpos < 2) { + if (filldir(dirent, "..", cpos+1, cpos, MSDOS_ROOT_INO, DT_DIR) < 0) + goto out; + cpos++; + filp->f_pos++; + } + if (cpos == 2) { + dummy = 2; + furrfu = &dummy; + cpos = 0; + } + } + if (cpos & (sizeof(struct msdos_dir_entry) - 1)) { + ret = -ENOENT; + goto out; + } + + bh = NULL; +get_new: + if (fat_get_entry(inode, &cpos, &bh, &de) == -1) + goto end_of_dir; +parse_record: + nr_slots = 0; + /* + * Check for long filename entry, but if short_only, we don't + * need to parse long filename. + */ + if (isvfat && !short_only) { + if (de->name[0] == DELETED_FLAG) + goto record_end; + if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) + goto record_end; + if (de->attr != ATTR_EXT && IS_FREE(de->name)) + goto record_end; + } else { + if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name)) + goto record_end; + } + + if (isvfat && de->attr == ATTR_EXT) { + int status = fat_parse_long(inode, &cpos, &bh, &de, + &unicode, &nr_slots); + if (status < 0) { + filp->f_pos = cpos; + ret = status; + goto out; + } else if (status == PARSE_INVALID) + goto record_end; + else if (status == PARSE_NOT_LONGNAME) + goto parse_record; + else if (status == PARSE_EOF) + goto end_of_dir; + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + int len = fat_uni_to_x8(sb, unicode, longname, size); + + fill_name = longname; + fill_len = len; + /* !both && !short_only, so we don't need shortname. */ + if (!both) + goto start_filldir; + } + } + + if (sbi->options.dotsOK) { + ptname = bufname; + dotoffset = 0; + if (de->attr & ATTR_HIDDEN) { + *ptname++ = '.'; + dotoffset = 1; + } + } + + memcpy(work, de->name, sizeof(de->name)); + /* see namei.c, msdos_format_name */ + if (work[0] == 0x05) + work[0] = 0xE5; + for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) { + if (!(c = work[i])) + break; + chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, + &bufuname[j++], opt_shortname, + de->lcase & CASE_LOWER_BASE); + if (chl <= 1) { + ptname[i++] = (!nocase && c>='A' && c<='Z') ? c+32 : c; + if (c != ' ') { + last = i; + last_u = j; + } + } else { + last_u = j; + for (chi = 0; chi < chl && i < 8; chi++) { + ptname[i] = work[i]; + i++; last = i; + } + } + } + i = last; + j = last_u; + fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); + ptname[i++] = '.'; + for (i2 = 8; i2 < MSDOS_NAME;) { + if (!(c = work[i2])) + break; + chl = fat_shortname2uni(nls_disk, &work[i2], MSDOS_NAME - i2, + &bufuname[j++], opt_shortname, + de->lcase & CASE_LOWER_EXT); + if (chl <= 1) { + i2++; + ptname[i++] = (!nocase && c>='A' && c<='Z') ? 
c+32 : c; + if (c != ' ') { + last = i; + last_u = j; + } + } else { + last_u = j; + for (chi = 0; chi < chl && i2 < MSDOS_NAME; chi++) { + ptname[i++] = work[i2++]; + last = i; + } + } + } + if (!last) + goto record_end; + + i = last + dotoffset; + j = last_u; + + if (isvfat) { + bufuname[j] = 0x0000; + i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); + } + if (nr_slots) { + /* hack for fat_ioctl_filldir() */ + struct fat_ioctl_filldir_callback *p = dirent; + + p->longname = fill_name; + p->long_len = fill_len; + p->shortname = bufname; + p->short_len = i; + fill_name = NULL; + fill_len = 0; + } else { + fill_name = bufname; + fill_len = i; + } + +start_filldir: + lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); + if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) + inum = inode->i_ino; + else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { + inum = parent_ino(filp->f_path.dentry); + } else { + loff_t i_pos = fat_make_i_pos(sb, bh, de); + struct inode *tmp = fat_iget(sb, i_pos); + if (tmp) { + inum = tmp->i_ino; + iput(tmp); + } else + inum = iunique(sb, MSDOS_ROOT_INO); + } + + if (filldir(dirent, fill_name, fill_len, *furrfu, inum, + (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) + goto fill_failed; + +record_end: + furrfu = &lpos; + filp->f_pos = cpos; + goto get_new; +end_of_dir: + filp->f_pos = cpos; +fill_failed: + brelse(bh); + if (unicode) + __putname(unicode); +out: + unlock_super(sb); + return ret; +} + +static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + return __fat_readdir(inode, filp, dirent, filldir, 0, 0); +} + +#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ +static int func(void *__buf, const char *name, int name_len, \ + loff_t offset, u64 ino, unsigned int d_type) \ +{ \ + struct fat_ioctl_filldir_callback *buf = __buf; \ + struct dirent_type __user *d1 = buf->dirent; \ + struct dirent_type __user *d2 = d1 + 1; \ + \ + if (buf->result) \ + return -EINVAL; \ + buf->result++; \ + \ + if (name != NULL) { \ + /* dirent has only short name */ \ + if (name_len >= sizeof(d1->d_name)) \ + name_len = sizeof(d1->d_name) - 1; \ + \ + if (put_user(0, d2->d_name) || \ + put_user(0, &d2->d_reclen) || \ + copy_to_user(d1->d_name, name, name_len) || \ + put_user(0, d1->d_name + name_len) || \ + put_user(name_len, &d1->d_reclen)) \ + goto efault; \ + } else { \ + /* dirent has short and long name */ \ + const char *longname = buf->longname; \ + int long_len = buf->long_len; \ + const char *shortname = buf->shortname; \ + int short_len = buf->short_len; \ + \ + if (long_len >= sizeof(d1->d_name)) \ + long_len = sizeof(d1->d_name) - 1; \ + if (short_len >= sizeof(d1->d_name)) \ + short_len = sizeof(d1->d_name) - 1; \ + \ + if (copy_to_user(d2->d_name, longname, long_len) || \ + put_user(0, d2->d_name + long_len) || \ + put_user(long_len, &d2->d_reclen) || \ + put_user(ino, &d2->d_ino) || \ + put_user(offset, &d2->d_off) || \ + copy_to_user(d1->d_name, shortname, short_len) || \ + put_user(0, d1->d_name + short_len) || \ + put_user(short_len, &d1->d_reclen)) \ + goto efault; \ + } \ + return 0; \ +efault: \ + buf->result = -EFAULT; \ + return -EFAULT; \ +} + +FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) + +static int fat_ioctl_readdir(struct inode *inode, struct file *filp, + void __user *dirent, filldir_t filldir, + int short_only, int both) +{ + struct fat_ioctl_filldir_callback buf; + int ret; + + buf.dirent = dirent; + buf.result = 0; + mutex_lock(&inode->i_mutex); + 
ret = -ENOENT; + if (!IS_DEADDIR(inode)) { + ret = __fat_readdir(inode, filp, &buf, filldir, + short_only, both); + } + mutex_unlock(&inode->i_mutex); + if (ret >= 0) + ret = buf.result; + return ret; +} + +static int fat_ioctl_volume_id(struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + return sbi->vol_id; +} + +static long fat_dir_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; + int short_only, both; + + switch (cmd) { + case VFAT_IOCTL_READDIR_SHORT: + short_only = 1; + both = 0; + break; + case VFAT_IOCTL_READDIR_BOTH: + short_only = 0; + both = 1; + break; + case VFAT_IOCTL_GET_VOLUME_ID: + return fat_ioctl_volume_id(inode); + default: + return fat_generic_ioctl(filp, cmd, arg); + } + + if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) + return -EFAULT; + /* + * Yes, we don't need this put_user() absolutely. However old + * code didn't return the right value. So, app use this value, + * in order to check whether it is EOF. + */ + if (put_user(0, &d1->d_reclen)) + return -EFAULT; + + return fat_ioctl_readdir(inode, filp, d1, fat_ioctl_filldir, + short_only, both); +} + +#ifdef CONFIG_COMPAT +#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) +#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) + +FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent) + +static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, + unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct compat_dirent __user *d1 = compat_ptr(arg); + int short_only, both; + + switch (cmd) { + case VFAT_IOCTL_READDIR_SHORT32: + short_only = 1; + both = 0; + break; + case VFAT_IOCTL_READDIR_BOTH32: + short_only = 0; + both = 1; + break; + default: + return fat_generic_ioctl(filp, cmd, (unsigned long)arg); + } + + if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) + return -EFAULT; + /* + * Yes, we don't need this put_user() absolutely. However old + * code didn't return the right value. So, app use this value, + * in order to check whether it is EOF. + */ + if (put_user(0, &d1->d_reclen)) + return -EFAULT; + + return fat_ioctl_readdir(inode, filp, d1, fat_compat_ioctl_filldir, + short_only, both); +} +#endif /* CONFIG_COMPAT */ + +const struct file_operations fat_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = fat_readdir, + .unlocked_ioctl = fat_dir_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = fat_compat_dir_ioctl, +#endif + .fsync = fat_file_fsync, +}; + +static int fat_get_short_entry(struct inode *dir, loff_t *pos, + struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + while (fat_get_entry(dir, pos, bh, de) >= 0) { + /* free entry or long name entry or volume label */ + if (!IS_FREE((*de)->name) && !((*de)->attr & ATTR_VOLUME)) + return 0; + } + return -ENOENT; +} + +/* + * The ".." entry can not provide the "struct fat_slot_info" informations + * for inode. So, this function provide the some informations only. 
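fat_dir_ioctl() above exposes the short/long name pair through VFAT_IOCTL_READDIR_BOTH, which fills a two-element __fat_dirent array per call: element 0 is the 8.3 name, element 1 the long name when one exists. A hedged userspace sketch of driving it; the mount point path is hypothetical, the struct and ioctl come from the uapi header, and, per the comment above, a zero d_reclen in the first element is what applications check for end of directory:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/msdos_fs.h>   /* struct __fat_dirent, VFAT_IOCTL_READDIR_BOTH */

int main(void)
{
	struct __fat_dirent ent[2];   /* [0] = short (8.3) name, [1] = long name */
	int fd = open("/mnt/vfat", O_RDONLY | O_DIRECTORY);  /* hypothetical mount */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Each ioctl yields one entry pair; d_reclen == 0 in ent[0] means EOF. */
	while (ioctl(fd, VFAT_IOCTL_READDIR_BOTH, ent) >= 0 && ent[0].d_reclen) {
		printf("short=%s long=%s\n", ent[0].d_name,
		       ent[1].d_reclen ? ent[1].d_name : ent[0].d_name);
	}
	close(fd);
	return 0;
}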
+ */ +int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, + struct msdos_dir_entry **de, loff_t *i_pos) +{ + loff_t offset; + + offset = 0; + *bh = NULL; + while (fat_get_short_entry(dir, &offset, bh, de) >= 0) { + if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) { + *i_pos = fat_make_i_pos(dir->i_sb, *bh, *de); + return 0; + } + } + return -ENOENT; +} + +EXPORT_SYMBOL_GPL(fat_get_dotdot_entry); + +/* See if directory is empty */ +int fat_dir_empty(struct inode *dir) +{ + struct buffer_head *bh; + struct msdos_dir_entry *de; + loff_t cpos; + int result = 0; + + bh = NULL; + cpos = 0; + while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) { + if (strncmp(de->name, MSDOS_DOT , MSDOS_NAME) && + strncmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { + result = -ENOTEMPTY; + break; + } + } + brelse(bh); + return result; +} + +EXPORT_SYMBOL_GPL(fat_dir_empty); + +/* + * fat_subdirs counts the number of sub-directories of dir. It can be run + * on directories being created. + */ +int fat_subdirs(struct inode *dir) +{ + struct buffer_head *bh; + struct msdos_dir_entry *de; + loff_t cpos; + int count = 0; + + bh = NULL; + cpos = 0; + while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) { + if (de->attr & ATTR_DIR) + count++; + } + brelse(bh); + return count; +} + +/* + * Scans a directory for a given file (name points to its formatted name). + * Returns an error code or zero. + */ +int fat_scan(struct inode *dir, const unsigned char *name, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + + sinfo->slot_off = 0; + sinfo->bh = NULL; + while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh, + &sinfo->de) >= 0) { + if (!strncmp(sinfo->de->name, name, MSDOS_NAME)) { + sinfo->slot_off -= sizeof(*sinfo->de); + sinfo->nr_slots = 1; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + return 0; + } + } + return -ENOENT; +} + +EXPORT_SYMBOL_GPL(fat_scan); + +static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) +{ + struct super_block *sb = dir->i_sb; + struct buffer_head *bh; + struct msdos_dir_entry *de, *endp; + int err = 0, orig_slots; + + while (nr_slots) { + bh = NULL; + if (fat_get_entry(dir, &pos, &bh, &de) < 0) { + err = -EIO; + break; + } + + orig_slots = nr_slots; + endp = (struct msdos_dir_entry *)(bh->b_data + sb->s_blocksize); + while (nr_slots && de < endp) { + de->name[0] = DELETED_FLAG; + de++; + nr_slots--; + } + mark_buffer_dirty_inode(bh, dir); + if (IS_DIRSYNC(dir)) + err = sync_dirty_buffer(bh); + brelse(bh); + if (err) + break; + + /* pos is *next* de's position, so this does `- sizeof(de)' */ + pos += ((orig_slots - nr_slots) * sizeof(*de)) - sizeof(*de); + } + + return err; +} + +int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + struct msdos_dir_entry *de; + struct buffer_head *bh; + int err = 0, nr_slots; + + /* + * First stage: Remove the shortname. By this, the directory + * entry is removed. + */ + nr_slots = sinfo->nr_slots; + de = sinfo->de; + sinfo->de = NULL; + bh = sinfo->bh; + sinfo->bh = NULL; + while (nr_slots && de >= (struct msdos_dir_entry *)bh->b_data) { + de->name[0] = DELETED_FLAG; + de--; + nr_slots--; + } + mark_buffer_dirty_inode(bh, dir); + if (IS_DIRSYNC(dir)) + err = sync_dirty_buffer(bh); + brelse(bh); + if (err) + return err; + dir->i_version++; + + if (nr_slots) { + /* + * Second stage: remove the remaining longname slots. 
+ * (This directory entry is already removed, and so return + * the success) + */ + err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots); + if (err) { + fat_msg(sb, KERN_WARNING, + "Couldn't remove the long name slots"); + } + } + + dir->i_mtime = dir->i_atime = CURRENT_TIME_SEC; + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); + + return 0; +} + +EXPORT_SYMBOL_GPL(fat_remove_entries); + +static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, + struct buffer_head **bhs, int nr_bhs) +{ + struct super_block *sb = dir->i_sb; + sector_t last_blknr = blknr + MSDOS_SB(sb)->sec_per_clus; + int err, i, n; + + /* Zeroing the unused blocks on this cluster */ + blknr += nr_used; + n = nr_used; + while (blknr < last_blknr) { + bhs[n] = sb_getblk(sb, blknr); + if (!bhs[n]) { + err = -ENOMEM; + goto error; + } + memset(bhs[n]->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); + + n++; + blknr++; + if (n == nr_bhs) { + if (IS_DIRSYNC(dir)) { + err = fat_sync_bhs(bhs, n); + if (err) + goto error; + } + for (i = 0; i < n; i++) + brelse(bhs[i]); + n = 0; + } + } + if (IS_DIRSYNC(dir)) { + err = fat_sync_bhs(bhs, n); + if (err) + goto error; + } + for (i = 0; i < n; i++) + brelse(bhs[i]); + + return 0; + +error: + for (i = 0; i < n; i++) + bforget(bhs[i]); + return err; +} + +int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + struct msdos_dir_entry *de; + sector_t blknr; + __le16 date, time; + u8 time_cs; + int err, cluster; + + err = fat_alloc_clusters(dir, &cluster, 1); + if (err) + goto error; + + blknr = fat_clus_to_blknr(sbi, cluster); + bhs[0] = sb_getblk(sb, blknr); + if (!bhs[0]) { + err = -ENOMEM; + goto error_free; + } + + fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); + + de = (struct msdos_dir_entry *)bhs[0]->b_data; + /* filling the new directory slots ("." and ".." 
entries) */ + memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); + memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); + de->attr = de[1].attr = ATTR_DIR; + de[0].lcase = de[1].lcase = 0; + de[0].time = de[1].time = time; + de[0].date = de[1].date = date; + if (sbi->options.isvfat) { + /* extra timestamps */ + de[0].ctime = de[1].ctime = time; + de[0].ctime_cs = de[1].ctime_cs = time_cs; + de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date; + } else { + de[0].ctime = de[1].ctime = 0; + de[0].ctime_cs = de[1].ctime_cs = 0; + de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0; + } + de[0].start = cpu_to_le16(cluster); + de[0].starthi = cpu_to_le16(cluster >> 16); + de[1].start = cpu_to_le16(MSDOS_I(dir)->i_logstart); + de[1].starthi = cpu_to_le16(MSDOS_I(dir)->i_logstart >> 16); + de[0].size = de[1].size = 0; + memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); + set_buffer_uptodate(bhs[0]); + mark_buffer_dirty_inode(bhs[0], dir); + + err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); + if (err) + goto error_free; + + return cluster; + +error_free: + fat_free_clusters(dir, cluster); +error: + return err; +} + +EXPORT_SYMBOL_GPL(fat_alloc_new_dir); + +static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, + int *nr_cluster, struct msdos_dir_entry **de, + struct buffer_head **bh, loff_t *i_pos) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + sector_t blknr, start_blknr, last_blknr; + unsigned long size, copy; + int err, i, n, offset, cluster[2]; + + /* + * The minimum cluster size is 512bytes, and maximum entry + * size is 32*slots (672bytes). So, iff the cluster size is + * 512bytes, we may need two clusters. + */ + size = nr_slots * sizeof(struct msdos_dir_entry); + *nr_cluster = (size + (sbi->cluster_size - 1)) >> sbi->cluster_bits; + BUG_ON(*nr_cluster > 2); + + err = fat_alloc_clusters(dir, cluster, *nr_cluster); + if (err) + goto error; + + /* + * First stage: Fill the directory entry. NOTE: This cluster + * is not referenced from any inode yet, so updates order is + * not important. 
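A FAT directory entry stores the first cluster of a file as two 16-bit little-endian halves, start and (on FAT32) starthi, which is why the code above writes cpu_to_le16(cluster) and cpu_to_le16(cluster >> 16) separately. A standalone sketch of splitting and reassembling the 32-bit cluster number (byte order is ignored here; the kernel additionally converts each half to little endian):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t cluster = 0x00012345;          /* example first cluster */

	/* Split into the two on-disk fields. */
	uint16_t start   = (uint16_t)(cluster & 0xffff);
	uint16_t starthi = (uint16_t)(cluster >> 16);

	/* Reassemble when reading the entry back. */
	uint32_t again = ((uint32_t)starthi << 16) | start;

	printf("start=0x%04x starthi=0x%04x -> 0x%08x\n", start, starthi, again);
	return 0;
}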
+ */ + i = n = copy = 0; + do { + start_blknr = blknr = fat_clus_to_blknr(sbi, cluster[i]); + last_blknr = start_blknr + sbi->sec_per_clus; + while (blknr < last_blknr) { + bhs[n] = sb_getblk(sb, blknr); + if (!bhs[n]) { + err = -ENOMEM; + goto error_nomem; + } + + /* fill the directory entry */ + copy = min(size, sb->s_blocksize); + memcpy(bhs[n]->b_data, slots, copy); + slots += copy; + size -= copy; + set_buffer_uptodate(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); + if (!size) + break; + n++; + blknr++; + } + } while (++i < *nr_cluster); + + memset(bhs[n]->b_data + copy, 0, sb->s_blocksize - copy); + offset = copy - sizeof(struct msdos_dir_entry); + get_bh(bhs[n]); + *bh = bhs[n]; + *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); + *i_pos = fat_make_i_pos(sb, *bh, *de); + + /* Second stage: clear the rest of cluster, and write outs */ + err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE); + if (err) + goto error_free; + + return cluster[0]; + +error_free: + brelse(*bh); + *bh = NULL; + n = 0; +error_nomem: + for (i = 0; i < n; i++) + bforget(bhs[i]); + fat_free_clusters(dir, cluster[0]); +error: + return err; +} + +int fat_add_entries(struct inode *dir, void *slots, int nr_slots, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ + struct msdos_dir_entry *de; + int err, free_slots, i, nr_bhs; + loff_t pos, i_pos; + + sinfo->nr_slots = nr_slots; + + /* First stage: search free direcotry entries */ + free_slots = nr_bhs = 0; + bh = prev = NULL; + pos = 0; + err = -ENOSPC; + while (fat_get_entry(dir, &pos, &bh, &de) > -1) { + /* check the maximum size of directory */ + if (pos >= FAT_MAX_DIR_SIZE) + goto error; + + if (IS_FREE(de->name)) { + if (prev != bh) { + get_bh(bh); + bhs[nr_bhs] = prev = bh; + nr_bhs++; + } + free_slots++; + if (free_slots == nr_slots) + goto found; + } else { + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + prev = NULL; + free_slots = nr_bhs = 0; + } + } + if (dir->i_ino == MSDOS_ROOT_INO) { + if (sbi->fat_bits != 32) + goto error; + } else if (MSDOS_I(dir)->i_start == 0) { + fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)", + MSDOS_I(dir)->i_pos); + err = -EIO; + goto error; + } + +found: + err = 0; + pos -= free_slots * sizeof(*de); + nr_slots -= free_slots; + if (free_slots) { + /* + * Second stage: filling the free entries with new entries. + * NOTE: If this slots has shortname, first, we write + * the long name slots, then write the short name. + */ + int size = free_slots * sizeof(*de); + int offset = pos & (sb->s_blocksize - 1); + int long_bhs = nr_bhs - (nr_slots == 0); + + /* Fill the long name slots. */ + for (i = 0; i < long_bhs; i++) { + int copy = min_t(int, sb->s_blocksize - offset, size); + memcpy(bhs[i]->b_data + offset, slots, copy); + mark_buffer_dirty_inode(bhs[i], dir); + offset = 0; + slots += copy; + size -= copy; + } + if (long_bhs && IS_DIRSYNC(dir)) + err = fat_sync_bhs(bhs, long_bhs); + if (!err && i < nr_bhs) { + /* Fill the short name slot. */ + int copy = min_t(int, sb->s_blocksize - offset, size); + memcpy(bhs[i]->b_data + offset, slots, copy); + mark_buffer_dirty_inode(bhs[i], dir); + if (IS_DIRSYNC(dir)) + err = sync_dirty_buffer(bhs[i]); + } + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + if (err) + goto error_remove; + } + + if (nr_slots) { + int cluster, nr_cluster; + + /* + * Third stage: allocate the cluster for new entries. 
+ * And initialize the cluster with new entries, then + * add the cluster to dir. + */ + cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster, + &de, &bh, &i_pos); + if (cluster < 0) { + err = cluster; + goto error_remove; + } + err = fat_chain_add(dir, cluster, nr_cluster); + if (err) { + fat_free_clusters(dir, cluster); + goto error_remove; + } + if (dir->i_size & (sbi->cluster_size - 1)) { + fat_fs_error(sb, "Odd directory size"); + dir->i_size = (dir->i_size + sbi->cluster_size - 1) + & ~((loff_t)sbi->cluster_size - 1); + } + dir->i_size += nr_cluster << sbi->cluster_bits; + MSDOS_I(dir)->mmu_private += nr_cluster << sbi->cluster_bits; + } + sinfo->slot_off = pos; + sinfo->de = de; + sinfo->bh = bh; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + + return 0; + +error: + brelse(bh); + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + return err; + +error_remove: + brelse(bh); + if (free_slots) + __fat_remove_entries(dir, pos, free_slots); + return err; +} + +EXPORT_SYMBOL_GPL(fat_add_entries); diff --git a/fs/fat/fat.h b/fs/fat/fat.h new file mode 100644 index 00000000..74fe5d3e --- /dev/null +++ b/fs/fat/fat.h @@ -0,0 +1,351 @@ +#ifndef _FAT_H +#define _FAT_H + +#include +#include +#include +#include +#include +#include +#include + +/* + * vfat shortname flags + */ +#define VFAT_SFN_DISPLAY_LOWER 0x0001 /* convert to lowercase for display */ +#define VFAT_SFN_DISPLAY_WIN95 0x0002 /* emulate win95 rule for display */ +#define VFAT_SFN_DISPLAY_WINNT 0x0004 /* emulate winnt rule for display */ +#define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */ +#define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */ + +#define FAT_ERRORS_CONT 1 /* ignore error and continue */ +#define FAT_ERRORS_PANIC 2 /* panic on error */ +#define FAT_ERRORS_RO 3 /* remount r/o on error */ + +struct fat_mount_options { + uid_t fs_uid; + gid_t fs_gid; + unsigned short fs_fmask; + unsigned short fs_dmask; + unsigned short codepage; /* Codepage for shortname conversions */ + char *iocharset; /* Charset used for filename input/display */ + unsigned short shortname; /* flags for shortname display/create rule */ + unsigned char name_check; /* r = relaxed, n = normal, s = strict */ + unsigned char errors; /* On error: continue, panic, remount-ro */ + unsigned short allow_utime;/* permission for setting the [am]time */ + unsigned quiet:1, /* set = fake successful chmods and chowns */ + showexec:1, /* set = only set x bit for com/exe/bat */ + sys_immutable:1, /* set = system files are immutable */ + dotsOK:1, /* set = hidden and system files are named '.filename' */ + isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ + utf8:1, /* Use of UTF-8 character set (Default) */ + unicode_xlate:1, /* create escape sequences for unhandled Unicode */ + numtail:1, /* Does first alias have a numeric '~1' type tail? */ + flush:1, /* write things quickly */ + nocase:1, /* Does this need case conversion? 
0=need case conversion*/ + usefree:1, /* Use free_clusters for FAT32 */ + tz_utc:1, /* Filesystem timestamps are in UTC */ + rodir:1, /* allow ATTR_RO for directory */ + discard:1; /* Issue discard requests on deletions */ +}; + +#define FAT_HASH_BITS 8 +#define FAT_HASH_SIZE (1UL << FAT_HASH_BITS) + +/* + * MS-DOS file system in-core superblock data + */ +struct msdos_sb_info { + unsigned short sec_per_clus; /* sectors/cluster */ + unsigned short cluster_bits; /* log2(cluster_size) */ + unsigned int cluster_size; /* cluster size */ + unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */ + unsigned short fat_start; + unsigned long fat_length; /* FAT start & length (sec.) */ + unsigned long dir_start; + unsigned short dir_entries; /* root dir start & entries */ + unsigned long data_start; /* first data sector */ + unsigned long max_cluster; /* maximum cluster number */ + unsigned long root_cluster; /* first cluster of the root directory */ + unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ + struct mutex fat_lock; + unsigned int prev_free; /* previously allocated cluster number */ + unsigned int free_clusters; /* -1 if undefined */ + unsigned int free_clus_valid; /* is free_clusters valid? */ + struct fat_mount_options options; + struct nls_table *nls_disk; /* Codepage used on disk */ + struct nls_table *nls_io; /* Charset used for input and display */ + const void *dir_ops; /* Opaque; default directory operations */ + int dir_per_block; /* dir entries per block */ + int dir_per_block_bits; /* log2(dir_per_block) */ + unsigned long vol_id; /* volume ID */ + + int fatent_shift; + struct fatent_operations *fatent_ops; + struct inode *fat_inode; + + struct ratelimit_state ratelimit; + + spinlock_t inode_hash_lock; + struct hlist_head inode_hashtable[FAT_HASH_SIZE]; +}; + +#define FAT_CACHE_VALID 0 /* special case for valid cache */ + +/* + * MS-DOS file system inode data in memory + */ +struct msdos_inode_info { + spinlock_t cache_lru_lock; + struct list_head cache_lru; + int nr_caches; + /* for avoiding the race between fat_free() and fat_get_cluster() */ + unsigned int cache_valid_id; + + /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */ + loff_t mmu_private; /* physically allocated size */ + + int i_start; /* first cluster or 0 */ + int i_logstart; /* logical first cluster */ + int i_attrs; /* unused attribute bits */ + loff_t i_pos; /* on-disk position of directory entry or 0 */ + struct hlist_node i_fat_hash; /* hash by i_location */ + struct inode vfs_inode; +}; + +struct fat_slot_info { + loff_t i_pos; /* on-disk position of directory entry */ + loff_t slot_off; /* offset for slot or de start */ + int nr_slots; /* number of slots + 1(de) in filename */ + struct msdos_dir_entry *de; + struct buffer_head *bh; +}; + +static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct msdos_inode_info *MSDOS_I(struct inode *inode) +{ + return container_of(inode, struct msdos_inode_info, vfs_inode); +} + +/* + * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to + * save ATTR_RO instead of ->i_mode. + * + * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only + * bit, it's just used as flag for app. 
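MSDOS_I() above recovers the filesystem-private inode from the embedded VFS inode with container_of(): struct inode lives inside struct msdos_inode_info, so subtracting the member's offset from the member pointer yields the containing object. A standalone sketch of the same idiom with toy types (the struct names here are illustrative, not the kernel's):

#include <stdio.h>
#include <stddef.h>

/* Minimal container_of: map a pointer to a member back to its containing
 * struct (same idea as the kernel macro, simplified). */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { long ino; };            /* stand-in for struct inode */

struct fs_inode_info {                     /* stand-in for msdos_inode_info */
	int first_cluster;
	struct vfs_inode vfs_inode;        /* embedded generic part */
};

static struct fs_inode_info *FS_I(struct vfs_inode *inode)
{
	return container_of(inode, struct fs_inode_info, vfs_inode);
}

int main(void)
{
	struct fs_inode_info info = { .first_cluster = 7, .vfs_inode = { .ino = 42 } };
	struct vfs_inode *generic = &info.vfs_inode;   /* what the VFS hands around */

	printf("ino %ld -> first cluster %d\n",
	       generic->ino, FS_I(generic)->first_cluster);
	return 0;
}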
+ */ +static inline int fat_mode_can_hold_ro(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + mode_t mask; + + if (S_ISDIR(inode->i_mode)) { + if (!sbi->options.rodir) + return 0; + mask = ~sbi->options.fs_dmask; + } else + mask = ~sbi->options.fs_fmask; + + if (!(mask & S_IWUGO)) + return 0; + return 1; +} + +/* Convert attribute bits and a mask to the UNIX mode. */ +static inline mode_t fat_make_mode(struct msdos_sb_info *sbi, + u8 attrs, mode_t mode) +{ + if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir)) + mode &= ~S_IWUGO; + + if (attrs & ATTR_DIR) + return (mode & ~sbi->options.fs_dmask) | S_IFDIR; + else + return (mode & ~sbi->options.fs_fmask) | S_IFREG; +} + +/* Return the FAT attribute byte for this inode */ +static inline u8 fat_make_attrs(struct inode *inode) +{ + u8 attrs = MSDOS_I(inode)->i_attrs; + if (S_ISDIR(inode->i_mode)) + attrs |= ATTR_DIR; + if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO)) + attrs |= ATTR_RO; + return attrs; +} + +static inline void fat_save_attrs(struct inode *inode, u8 attrs) +{ + if (fat_mode_can_hold_ro(inode)) + MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED; + else + MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO); +} + +static inline unsigned char fat_checksum(const __u8 *name) +{ + unsigned char s = name[0]; + s = (s<<7) + (s>>1) + name[1]; s = (s<<7) + (s>>1) + name[2]; + s = (s<<7) + (s>>1) + name[3]; s = (s<<7) + (s>>1) + name[4]; + s = (s<<7) + (s>>1) + name[5]; s = (s<<7) + (s>>1) + name[6]; + s = (s<<7) + (s>>1) + name[7]; s = (s<<7) + (s>>1) + name[8]; + s = (s<<7) + (s>>1) + name[9]; s = (s<<7) + (s>>1) + name[10]; + return s; +} + +static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus) +{ + return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus + + sbi->data_start; +} + +static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len) +{ +#ifdef __BIG_ENDIAN + while (len--) { + *dst++ = src[0] | (src[1] << 8); + src += 2; + } +#else + memcpy(dst, src, len * 2); +#endif +} + +static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) +{ +#ifdef __BIG_ENDIAN + while (len--) { + dst[0] = *src & 0x00FF; + dst[1] = (*src & 0xFF00) >> 8; + dst += 2; + src++; + } +#else + memcpy(dst, src, len * 2); +#endif +} + +/* fat/cache.c */ +extern void fat_cache_inval_inode(struct inode *inode); +extern int fat_get_cluster(struct inode *inode, int cluster, + int *fclus, int *dclus); +extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create); + +/* fat/dir.c */ +extern const struct file_operations fat_dir_operations; +extern int fat_search_long(struct inode *inode, const unsigned char *name, + int name_len, struct fat_slot_info *sinfo); +extern int fat_dir_empty(struct inode *dir); +extern int fat_subdirs(struct inode *dir); +extern int fat_scan(struct inode *dir, const unsigned char *name, + struct fat_slot_info *sinfo); +extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, + struct msdos_dir_entry **de, loff_t *i_pos); +extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); +extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots, + struct fat_slot_info *sinfo); +extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo); + +/* fat/fatent.c */ +struct fat_entry { + int entry; + union { + u8 *ent12_p[2]; + __le16 *ent16_p; + __le32 *ent32_p; + } u; + int nr_bhs; + struct buffer_head *bhs[2]; + 
struct inode *fat_inode; +}; + +static inline void fatent_init(struct fat_entry *fatent) +{ + fatent->nr_bhs = 0; + fatent->entry = 0; + fatent->u.ent32_p = NULL; + fatent->bhs[0] = fatent->bhs[1] = NULL; + fatent->fat_inode = NULL; +} + +static inline void fatent_set_entry(struct fat_entry *fatent, int entry) +{ + fatent->entry = entry; + fatent->u.ent32_p = NULL; +} + +static inline void fatent_brelse(struct fat_entry *fatent) +{ + int i; + fatent->u.ent32_p = NULL; + for (i = 0; i < fatent->nr_bhs; i++) + brelse(fatent->bhs[i]); + fatent->nr_bhs = 0; + fatent->bhs[0] = fatent->bhs[1] = NULL; + fatent->fat_inode = NULL; +} + +extern void fat_ent_access_init(struct super_block *sb); +extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent, + int entry); +extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent, + int new, int wait); +extern int fat_alloc_clusters(struct inode *inode, int *cluster, + int nr_cluster); +extern int fat_free_clusters(struct inode *inode, int cluster); +extern int fat_count_free_clusters(struct super_block *sb); + +/* fat/file.c */ +extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); +extern const struct file_operations fat_file_operations; +extern const struct inode_operations fat_file_inode_operations; +extern int fat_setattr(struct dentry * dentry, struct iattr * attr); +extern void fat_truncate_blocks(struct inode *inode, loff_t offset); +extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern int fat_file_fsync(struct file *file, int datasync); + +/* fat/inode.c */ +extern void fat_attach(struct inode *inode, loff_t i_pos); +extern void fat_detach(struct inode *inode); +extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); +extern struct inode *fat_build_inode(struct super_block *sb, + struct msdos_dir_entry *de, loff_t i_pos); +extern int fat_sync_inode(struct inode *inode); +extern int fat_fill_super(struct super_block *sb, void *data, int silent, + int isvfat, void (*setup)(struct super_block *)); + +extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, + struct inode *i2); +/* fat/misc.c */ +extern void +__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))) __cold; +#define fat_fs_error(sb, fmt, args...) \ + __fat_fs_error(sb, 1, fmt , ## args) +#define fat_fs_error_ratelimit(sb, fmt, args...) \ + __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))) __cold; +extern int fat_clusters_flush(struct super_block *sb); +extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); +extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 __time, __le16 __date, u8 time_cs); +extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 *time, __le16 *date, u8 *time_cs); +extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); + +int fat_cache_init(void); +void fat_cache_destroy(void); + +/* helper for printk */ +typedef unsigned long long llu; + +#endif /* !_FAT_H */ diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c new file mode 100644 index 00000000..2e81ac0d --- /dev/null +++ b/fs/fat/fatent.c @@ -0,0 +1,685 @@ +/* + * Copyright (C) 2004, OGAWA Hirofumi + * Released under GPL v2. 
+ */ + +#include +#include +#include +#include +#include "fat.h" + +struct fatent_operations { + void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); + void (*ent_set_ptr)(struct fat_entry *, int); + int (*ent_bread)(struct super_block *, struct fat_entry *, + int, sector_t); + int (*ent_get)(struct fat_entry *); + void (*ent_put)(struct fat_entry *, int); + int (*ent_next)(struct fat_entry *); +}; + +static DEFINE_SPINLOCK(fat12_entry_lock); + +static void fat12_ent_blocknr(struct super_block *sb, int entry, + int *offset, sector_t *blocknr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int bytes = entry + (entry >> 1); + WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + *offset = bytes & (sb->s_blocksize - 1); + *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); +} + +static void fat_ent_blocknr(struct super_block *sb, int entry, + int *offset, sector_t *blocknr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int bytes = (entry << sbi->fatent_shift); + WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + *offset = bytes & (sb->s_blocksize - 1); + *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); +} + +static void fat12_ent_set_ptr(struct fat_entry *fatent, int offset) +{ + struct buffer_head **bhs = fatent->bhs; + if (fatent->nr_bhs == 1) { + WARN_ON(offset >= (bhs[0]->b_size - 1)); + fatent->u.ent12_p[0] = bhs[0]->b_data + offset; + fatent->u.ent12_p[1] = bhs[0]->b_data + (offset + 1); + } else { + WARN_ON(offset != (bhs[0]->b_size - 1)); + fatent->u.ent12_p[0] = bhs[0]->b_data + offset; + fatent->u.ent12_p[1] = bhs[1]->b_data; + } +} + +static void fat16_ent_set_ptr(struct fat_entry *fatent, int offset) +{ + WARN_ON(offset & (2 - 1)); + fatent->u.ent16_p = (__le16 *)(fatent->bhs[0]->b_data + offset); +} + +static void fat32_ent_set_ptr(struct fat_entry *fatent, int offset) +{ + WARN_ON(offset & (4 - 1)); + fatent->u.ent32_p = (__le32 *)(fatent->bhs[0]->b_data + offset); +} + +static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, + int offset, sector_t blocknr) +{ + struct buffer_head **bhs = fatent->bhs; + + WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); + fatent->fat_inode = MSDOS_SB(sb)->fat_inode; + + bhs[0] = sb_bread(sb, blocknr); + if (!bhs[0]) + goto err; + + if ((offset + 1) < sb->s_blocksize) + fatent->nr_bhs = 1; + else { + /* This entry is block boundary, it needs the next block */ + blocknr++; + bhs[1] = sb_bread(sb, blocknr); + if (!bhs[1]) + goto err_brelse; + fatent->nr_bhs = 2; + } + fat12_ent_set_ptr(fatent, offset); + return 0; + +err_brelse: + brelse(bhs[0]); +err: + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); + return -EIO; +} + +static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, + int offset, sector_t blocknr) +{ + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + + WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); + fatent->fat_inode = MSDOS_SB(sb)->fat_inode; + fatent->bhs[0] = sb_bread(sb, blocknr); + if (!fatent->bhs[0]) { + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); + return -EIO; + } + fatent->nr_bhs = 1; + ops->ent_set_ptr(fatent, offset); + return 0; +} + +static int fat12_ent_get(struct fat_entry *fatent) +{ + u8 **ent12_p = fatent->u.ent12_p; + int next; + + spin_lock(&fat12_entry_lock); + if (fatent->entry & 1) + next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); + else + next = (*ent12_p[1] << 8) | *ent12_p[0]; + spin_unlock(&fat12_entry_lock); + + next &= 0x0fff; + if (next >= BAD_FAT12) + 
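+		/* values at or above BAD_FAT12 cover both the bad-cluster and
+		 * end-of-chain markers; report them all as EOF */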
next = FAT_ENT_EOF; + return next; +} + +static int fat16_ent_get(struct fat_entry *fatent) +{ + int next = le16_to_cpu(*fatent->u.ent16_p); + WARN_ON((unsigned long)fatent->u.ent16_p & (2 - 1)); + if (next >= BAD_FAT16) + next = FAT_ENT_EOF; + return next; +} + +static int fat32_ent_get(struct fat_entry *fatent) +{ + int next = le32_to_cpu(*fatent->u.ent32_p) & 0x0fffffff; + WARN_ON((unsigned long)fatent->u.ent32_p & (4 - 1)); + if (next >= BAD_FAT32) + next = FAT_ENT_EOF; + return next; +} + +static void fat12_ent_put(struct fat_entry *fatent, int new) +{ + u8 **ent12_p = fatent->u.ent12_p; + + if (new == FAT_ENT_EOF) + new = EOF_FAT12; + + spin_lock(&fat12_entry_lock); + if (fatent->entry & 1) { + *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); + *ent12_p[1] = new >> 4; + } else { + *ent12_p[0] = new & 0xff; + *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); + } + spin_unlock(&fat12_entry_lock); + + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); + if (fatent->nr_bhs == 2) + mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); +} + +static void fat16_ent_put(struct fat_entry *fatent, int new) +{ + if (new == FAT_ENT_EOF) + new = EOF_FAT16; + + *fatent->u.ent16_p = cpu_to_le16(new); + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); +} + +static void fat32_ent_put(struct fat_entry *fatent, int new) +{ + if (new == FAT_ENT_EOF) + new = EOF_FAT32; + + WARN_ON(new & 0xf0000000); + new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; + *fatent->u.ent32_p = cpu_to_le32(new); + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); +} + +static int fat12_ent_next(struct fat_entry *fatent) +{ + u8 **ent12_p = fatent->u.ent12_p; + struct buffer_head **bhs = fatent->bhs; + u8 *nextp = ent12_p[1] + 1 + (fatent->entry & 1); + + fatent->entry++; + if (fatent->nr_bhs == 1) { + WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 2))); + WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); + if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) { + ent12_p[0] = nextp - 1; + ent12_p[1] = nextp; + return 1; + } + } else { + WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); + WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data); + ent12_p[0] = nextp - 1; + ent12_p[1] = nextp; + brelse(bhs[0]); + bhs[0] = bhs[1]; + fatent->nr_bhs = 1; + return 1; + } + ent12_p[0] = NULL; + ent12_p[1] = NULL; + return 0; +} + +static int fat16_ent_next(struct fat_entry *fatent) +{ + const struct buffer_head *bh = fatent->bhs[0]; + fatent->entry++; + if (fatent->u.ent16_p < (__le16 *)(bh->b_data + (bh->b_size - 2))) { + fatent->u.ent16_p++; + return 1; + } + fatent->u.ent16_p = NULL; + return 0; +} + +static int fat32_ent_next(struct fat_entry *fatent) +{ + const struct buffer_head *bh = fatent->bhs[0]; + fatent->entry++; + if (fatent->u.ent32_p < (__le32 *)(bh->b_data + (bh->b_size - 4))) { + fatent->u.ent32_p++; + return 1; + } + fatent->u.ent32_p = NULL; + return 0; +} + +static struct fatent_operations fat12_ops = { + .ent_blocknr = fat12_ent_blocknr, + .ent_set_ptr = fat12_ent_set_ptr, + .ent_bread = fat12_ent_bread, + .ent_get = fat12_ent_get, + .ent_put = fat12_ent_put, + .ent_next = fat12_ent_next, +}; + +static struct fatent_operations fat16_ops = { + .ent_blocknr = fat_ent_blocknr, + .ent_set_ptr = fat16_ent_set_ptr, + .ent_bread = fat_ent_bread, + .ent_get = fat16_ent_get, + .ent_put = fat16_ent_put, + .ent_next = fat16_ent_next, +}; + +static struct fatent_operations fat32_ops = { + .ent_blocknr = fat_ent_blocknr, + .ent_set_ptr = 
fat32_ent_set_ptr, + .ent_bread = fat_ent_bread, + .ent_get = fat32_ent_get, + .ent_put = fat32_ent_put, + .ent_next = fat32_ent_next, +}; + +static inline void lock_fat(struct msdos_sb_info *sbi) +{ + mutex_lock(&sbi->fat_lock); +} + +static inline void unlock_fat(struct msdos_sb_info *sbi) +{ + mutex_unlock(&sbi->fat_lock); +} + +void fat_ent_access_init(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + mutex_init(&sbi->fat_lock); + + switch (sbi->fat_bits) { + case 32: + sbi->fatent_shift = 2; + sbi->fatent_ops = &fat32_ops; + break; + case 16: + sbi->fatent_shift = 1; + sbi->fatent_ops = &fat16_ops; + break; + case 12: + sbi->fatent_shift = -1; + sbi->fatent_ops = &fat12_ops; + break; + } +} + +static inline int fat_ent_update_ptr(struct super_block *sb, + struct fat_entry *fatent, + int offset, sector_t blocknr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct buffer_head **bhs = fatent->bhs; + + /* Is this fatent's blocks including this entry? */ + if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr) + return 0; + if (sbi->fat_bits == 12) { + if ((offset + 1) < sb->s_blocksize) { + /* This entry is on bhs[0]. */ + if (fatent->nr_bhs == 2) { + brelse(bhs[1]); + fatent->nr_bhs = 1; + } + } else { + /* This entry needs the next block. */ + if (fatent->nr_bhs != 2) + return 0; + if (bhs[1]->b_blocknr != (blocknr + 1)) + return 0; + } + } + ops->ent_set_ptr(fatent, offset); + return 1; +} + +int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct fatent_operations *ops = sbi->fatent_ops; + int err, offset; + sector_t blocknr; + + if (entry < FAT_START_ENT || sbi->max_cluster <= entry) { + fatent_brelse(fatent); + fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); + return -EIO; + } + + fatent_set_entry(fatent, entry); + ops->ent_blocknr(sb, entry, &offset, &blocknr); + + if (!fat_ent_update_ptr(sb, fatent, offset, blocknr)) { + fatent_brelse(fatent); + err = ops->ent_bread(sb, fatent, offset, blocknr); + if (err) + return err; + } + return ops->ent_get(fatent); +} + +/* FIXME: We can write the blocks as more big chunk. 
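+ * (As it stands, fat_mirror_bhs() below copies each dirty FAT block to every
+ * backup FAT one buffer at a time.)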
*/ +static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, + int nr_bhs) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *c_bh; + int err, n, copy; + + err = 0; + for (copy = 1; copy < sbi->fats; copy++) { + sector_t backup_fat = sbi->fat_length * copy; + + for (n = 0; n < nr_bhs; n++) { + c_bh = sb_getblk(sb, backup_fat + bhs[n]->b_blocknr); + if (!c_bh) { + err = -ENOMEM; + goto error; + } + memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); + set_buffer_uptodate(c_bh); + mark_buffer_dirty_inode(c_bh, sbi->fat_inode); + if (sb->s_flags & MS_SYNCHRONOUS) + err = sync_dirty_buffer(c_bh); + brelse(c_bh); + if (err) + goto error; + } + } +error: + return err; +} + +int fat_ent_write(struct inode *inode, struct fat_entry *fatent, + int new, int wait) +{ + struct super_block *sb = inode->i_sb; + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + int err; + + ops->ent_put(fatent, new); + if (wait) { + err = fat_sync_bhs(fatent->bhs, fatent->nr_bhs); + if (err) + return err; + } + return fat_mirror_bhs(sb, fatent->bhs, fatent->nr_bhs); +} + +static inline int fat_ent_next(struct msdos_sb_info *sbi, + struct fat_entry *fatent) +{ + if (sbi->fatent_ops->ent_next(fatent)) { + if (fatent->entry < sbi->max_cluster) + return 1; + } + return 0; +} + +static inline int fat_ent_read_block(struct super_block *sb, + struct fat_entry *fatent) +{ + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + sector_t blocknr; + int offset; + + fatent_brelse(fatent); + ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + return ops->ent_bread(sb, fatent, offset, blocknr); +} + +static void fat_collect_bhs(struct buffer_head **bhs, int *nr_bhs, + struct fat_entry *fatent) +{ + int n, i; + + for (n = 0; n < fatent->nr_bhs; n++) { + for (i = 0; i < *nr_bhs; i++) { + if (fatent->bhs[n] == bhs[i]) + break; + } + if (i == *nr_bhs) { + get_bh(fatent->bhs[n]); + bhs[i] = fatent->bhs[n]; + (*nr_bhs)++; + } + } +} + +int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent, prev_ent; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + int i, count, err, nr_bhs, idx_clus; + + BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2)); /* fixed limit */ + + lock_fat(sbi); + if (sbi->free_clusters != -1 && sbi->free_clus_valid && + sbi->free_clusters < nr_cluster) { + unlock_fat(sbi); + return -ENOSPC; + } + + err = nr_bhs = idx_clus = 0; + count = FAT_START_ENT; + fatent_init(&prev_ent); + fatent_init(&fatent); + fatent_set_entry(&fatent, sbi->prev_free + 1); + while (count < sbi->max_cluster) { + if (fatent.entry >= sbi->max_cluster) + fatent.entry = FAT_START_ENT; + fatent_set_entry(&fatent, fatent.entry); + err = fat_ent_read_block(sb, &fatent); + if (err) + goto out; + + /* Find the free entries in a block */ + do { + if (ops->ent_get(&fatent) == FAT_ENT_FREE) { + int entry = fatent.entry; + + /* make the cluster chain */ + ops->ent_put(&fatent, FAT_ENT_EOF); + if (prev_ent.nr_bhs) + ops->ent_put(&prev_ent, entry); + + fat_collect_bhs(bhs, &nr_bhs, &fatent); + + sbi->prev_free = entry; + if (sbi->free_clusters != -1) + sbi->free_clusters--; + sb->s_dirt = 1; + + cluster[idx_clus] = entry; + idx_clus++; + if (idx_clus == nr_cluster) + goto out; + + /* + * fat_collect_bhs() gets ref-count of bhs, + * so we can still use the prev_ent. 
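+				 * (prev_ent keeps pointing at the previously
+				 * allocated entry, so the next free entry found
+				 * can still be chained onto it via
+				 * ops->ent_put(&prev_ent, entry), even after
+				 * fatent has moved on to another block.)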
+ */ + prev_ent = fatent; + } + count++; + if (count == sbi->max_cluster) + break; + } while (fat_ent_next(sbi, &fatent)); + } + + /* Couldn't allocate the free entries */ + sbi->free_clusters = 0; + sbi->free_clus_valid = 1; + sb->s_dirt = 1; + err = -ENOSPC; + +out: + unlock_fat(sbi); + fatent_brelse(&fatent); + if (!err) { + if (inode_needs_sync(inode)) + err = fat_sync_bhs(bhs, nr_bhs); + if (!err) + err = fat_mirror_bhs(sb, bhs, nr_bhs); + } + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + + if (err && idx_clus) + fat_free_clusters(inode, cluster[0]); + + return err; +} + +int fat_free_clusters(struct inode *inode, int cluster) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + int i, err, nr_bhs; + int first_cl = cluster; + + nr_bhs = 0; + fatent_init(&fatent); + lock_fat(sbi); + do { + cluster = fat_ent_read(inode, &fatent, cluster); + if (cluster < 0) { + err = cluster; + goto error; + } else if (cluster == FAT_ENT_FREE) { + fat_fs_error(sb, "%s: deleting FAT entry beyond EOF", + __func__); + err = -EIO; + goto error; + } + + if (sbi->options.discard) { + /* + * Issue discard for the sectors we no longer + * care about, batching contiguous clusters + * into one request + */ + if (cluster != fatent.entry + 1) { + int nr_clus = fatent.entry - first_cl + 1; + + sb_issue_discard(sb, + fat_clus_to_blknr(sbi, first_cl), + nr_clus * sbi->sec_per_clus, + GFP_NOFS, 0); + + first_cl = cluster; + } + } + + ops->ent_put(&fatent, FAT_ENT_FREE); + if (sbi->free_clusters != -1) { + sbi->free_clusters++; + sb->s_dirt = 1; + } + + if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { + if (sb->s_flags & MS_SYNCHRONOUS) { + err = fat_sync_bhs(bhs, nr_bhs); + if (err) + goto error; + } + err = fat_mirror_bhs(sb, bhs, nr_bhs); + if (err) + goto error; + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + nr_bhs = 0; + } + fat_collect_bhs(bhs, &nr_bhs, &fatent); + } while (cluster != FAT_ENT_EOF); + + if (sb->s_flags & MS_SYNCHRONOUS) { + err = fat_sync_bhs(bhs, nr_bhs); + if (err) + goto error; + } + err = fat_mirror_bhs(sb, bhs, nr_bhs); +error: + fatent_brelse(&fatent); + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + unlock_fat(sbi); + + return err; +} + +EXPORT_SYMBOL_GPL(fat_free_clusters); + +/* 128kb is the whole sectors for FAT12 and FAT16 */ +#define FAT_READA_SIZE (128 * 1024) + +static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, + unsigned long reada_blocks) +{ + struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + sector_t blocknr; + int i, offset; + + ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); + + for (i = 0; i < reada_blocks; i++) + sb_breadahead(sb, blocknr + i); +} + +int fat_count_free_clusters(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent; + unsigned long reada_blocks, reada_mask, cur_block; + int err = 0, free; + + lock_fat(sbi); + if (sbi->free_clusters != -1 && sbi->free_clus_valid) + goto out; + + reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; + reada_mask = reada_blocks - 1; + cur_block = 0; + + free = 0; + fatent_init(&fatent); + fatent_set_entry(&fatent, FAT_START_ENT); + while (fatent.entry < sbi->max_cluster) { + /* readahead of fat blocks */ + if ((cur_block & reada_mask) == 0) { + unsigned long rest = sbi->fat_length - cur_block; + fat_ent_reada(sb, &fatent, 
min(reada_blocks, rest)); + } + cur_block++; + + err = fat_ent_read_block(sb, &fatent); + if (err) + goto out; + + do { + if (ops->ent_get(&fatent) == FAT_ENT_FREE) + free++; + } while (fat_ent_next(sbi, &fatent)); + } + sbi->free_clusters = free; + sbi->free_clus_valid = 1; + sb->s_dirt = 1; + fatent_brelse(&fatent); +out: + unlock_fat(sbi); + return err; +} diff --git a/fs/fat/file.c b/fs/fat/file.c new file mode 100644 index 00000000..7018e1d8 --- /dev/null +++ b/fs/fat/file.c @@ -0,0 +1,446 @@ +/* + * linux/fs/fat/file.c + * + * Written 1992,1993 by Werner Almesberger + * + * regular file handling primitives for fat-based filesystems + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fat.h" + +static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) +{ + u32 attr; + + mutex_lock(&inode->i_mutex); + attr = fat_make_attrs(inode); + mutex_unlock(&inode->i_mutex); + + return put_user(attr, user_attr); +} + +static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int is_dir = S_ISDIR(inode->i_mode); + u32 attr, oldattr; + struct iattr ia; + int err; + + err = get_user(attr, user_attr); + if (err) + goto out; + + mutex_lock(&inode->i_mutex); + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out_unlock_inode; + + /* + * ATTR_VOLUME and ATTR_DIR cannot be changed; this also + * prevents the user from turning us into a VFAT + * longname entry. Also, we obviously can't set + * any of the NTFS attributes in the high 24 bits. + */ + attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR); + /* Merge in ATTR_VOLUME and ATTR_DIR */ + attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) | + (is_dir ? ATTR_DIR : 0); + oldattr = fat_make_attrs(inode); + + /* Equivalent to a chmod() */ + ia.ia_valid = ATTR_MODE | ATTR_CTIME; + ia.ia_ctime = current_fs_time(inode->i_sb); + if (is_dir) + ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO); + else { + ia.ia_mode = fat_make_mode(sbi, attr, + S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO)); + } + + /* The root directory has no attributes */ + if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { + err = -EINVAL; + goto out_drop_write; + } + + if (sbi->options.sys_immutable && + ((attr | oldattr) & ATTR_SYS) && + !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_drop_write; + } + + /* + * The security check is questionable... We single + * out the RO attribute for checking by the security + * module, just because it maps to a file mode. + */ + err = security_inode_setattr(file->f_path.dentry, &ia); + if (err) + goto out_drop_write; + + /* This MUST be done before doing anything irreversible... 
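+	 * (i.e. call fat_setattr(), which may still fail with e.g. -EPERM,
+	 * before fsnotify_change(), the S_IMMUTABLE flip and fat_save_attrs()
+	 * below, since those are not rolled back.)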
*/ + err = fat_setattr(file->f_path.dentry, &ia); + if (err) + goto out_drop_write; + + fsnotify_change(file->f_path.dentry, ia.ia_valid); + if (sbi->options.sys_immutable) { + if (attr & ATTR_SYS) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + } + + fat_save_attrs(inode, attr); + mark_inode_dirty(inode); +out_drop_write: + mnt_drop_write(file->f_path.mnt); +out_unlock_inode: + mutex_unlock(&inode->i_mutex); +out: + return err; +} + +long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + u32 __user *user_attr = (u32 __user *)arg; + + switch (cmd) { + case FAT_IOCTL_GET_ATTRIBUTES: + return fat_ioctl_get_attributes(inode, user_attr); + case FAT_IOCTL_SET_ATTRIBUTES: + return fat_ioctl_set_attributes(filp, user_attr); + default: + return -ENOTTY; /* Inappropriate ioctl for device */ + } +} + +#ifdef CONFIG_COMPAT +static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) + +{ + return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static int fat_file_release(struct inode *inode, struct file *filp) +{ + if ((filp->f_mode & FMODE_WRITE) && + MSDOS_SB(inode->i_sb)->options.flush) { + fat_flush_inodes(inode->i_sb, inode, NULL); + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + return 0; +} + +int fat_file_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int res, err; + + res = generic_file_fsync(filp, datasync); + err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); + + return res ? res : err; +} + + +const struct file_operations fat_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .release = fat_file_release, + .unlocked_ioctl = fat_generic_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = fat_generic_compat_ioctl, +#endif + .fsync = fat_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int fat_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + loff_t start = inode->i_size, count = size - inode->i_size; + int err; + + err = generic_cont_expand_simple(inode, size); + if (err) + goto out; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (IS_SYNC(inode)) { + int err2; + + /* + * Opencode syncing since we don't have a file open to use + * standard fsync path. + */ + err = filemap_fdatawrite_range(mapping, start, + start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) { + err = filemap_fdatawait_range(mapping, start, + start + count - 1); + } + } +out: + return err; +} + +/* Free all clusters after the skip'th cluster. */ +static int fat_free(struct inode *inode, int skip) +{ + struct super_block *sb = inode->i_sb; + int err, wait, free_start, i_start, i_logstart; + + if (MSDOS_I(inode)->i_start == 0) + return 0; + + fat_cache_inval_inode(inode); + + wait = IS_DIRSYNC(inode); + i_start = free_start = MSDOS_I(inode)->i_start; + i_logstart = MSDOS_I(inode)->i_logstart; + + /* First, we write the new file size. 
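+	 * (i_start/i_logstart are cleared and, for DIRSYNC inodes, synced to
+	 * disk before any cluster is freed; presumably so that a crash in
+	 * between leaks clusters at worst, rather than leaving the directory
+	 * entry pointing at freed clusters.)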
*/ + if (!skip) { + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + } + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + if (wait) { + err = fat_sync_inode(inode); + if (err) { + MSDOS_I(inode)->i_start = i_start; + MSDOS_I(inode)->i_logstart = i_logstart; + return err; + } + } else + mark_inode_dirty(inode); + + /* Write a new EOF, and get the remaining cluster chain for freeing. */ + if (skip) { + struct fat_entry fatent; + int ret, fclus, dclus; + + ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus); + if (ret < 0) + return ret; + else if (ret == FAT_ENT_EOF) + return 0; + + fatent_init(&fatent); + ret = fat_ent_read(inode, &fatent, dclus); + if (ret == FAT_ENT_EOF) { + fatent_brelse(&fatent); + return 0; + } else if (ret == FAT_ENT_FREE) { + fat_fs_error(sb, + "%s: invalid cluster chain (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); + ret = -EIO; + } else if (ret > 0) { + err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait); + if (err) + ret = err; + } + fatent_brelse(&fatent); + if (ret < 0) + return ret; + + free_start = ret; + } + inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); + + /* Freeing the remained cluster chain */ + return fat_free_clusters(inode, free_start); +} + +void fat_truncate_blocks(struct inode *inode, loff_t offset) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + const unsigned int cluster_size = sbi->cluster_size; + int nr_clusters; + + /* + * This protects against truncating a file bigger than it was then + * trying to write into the hole. + */ + if (MSDOS_I(inode)->mmu_private > offset) + MSDOS_I(inode)->mmu_private = offset; + + nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; + + fat_free(inode, nr_clusters); + fat_flush_inodes(inode->i_sb, inode, NULL); +} + +int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + return 0; +} +EXPORT_SYMBOL_GPL(fat_getattr); + +static int fat_sanitize_mode(const struct msdos_sb_info *sbi, + struct inode *inode, umode_t *mode_ptr) +{ + mode_t mask, perm; + + /* + * Note, the basic check is already done by a caller of + * (attr->ia_mode & ~FAT_VALID_MODE) + */ + + if (S_ISREG(inode->i_mode)) + mask = sbi->options.fs_fmask; + else + mask = sbi->options.fs_dmask; + + perm = *mode_ptr & ~(S_IFMT | mask); + + /* + * Of the r and x bits, all (subject to umask) must be present. Of the + * w bits, either all (subject to umask) or none must be present. + * + * If fat_mode_can_hold_ro(inode) is false, can't change w bits. 
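+	 * Worked example (assuming fmask=022 and a regular file currently mode
+	 * 0755): after masking, the only surviving modes are 0755 and 0555; a
+	 * request that changes the r/x bits is refused (the caller simply drops
+	 * the mode change), and picking 0555 is what later maps to ATTR_RO via
+	 * fat_make_attrs().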
+ */ + if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) + return -EPERM; + if (fat_mode_can_hold_ro(inode)) { + if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask))) + return -EPERM; + } else { + if ((perm & S_IWUGO) != (S_IWUGO & ~mask)) + return -EPERM; + } + + *mode_ptr &= S_IFMT | perm; + + return 0; +} + +static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) +{ + mode_t allow_utime = sbi->options.allow_utime; + + if (current_fsuid() != inode->i_uid) { + if (in_group_p(inode->i_gid)) + allow_utime >>= 3; + if (allow_utime & MAY_WRITE) + return 1; + } + + /* use a default check */ + return 0; +} + +#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) +/* valid file mode bits */ +#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) + +int fat_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); + struct inode *inode = dentry->d_inode; + unsigned int ia_valid; + int error; + + /* Check for setting the inode time. */ + ia_valid = attr->ia_valid; + if (ia_valid & TIMES_SET_FLAGS) { + if (fat_allow_set_time(sbi, inode)) + attr->ia_valid &= ~TIMES_SET_FLAGS; + } + + error = inode_change_ok(inode, attr); + attr->ia_valid = ia_valid; + if (error) { + if (sbi->options.quiet) + error = 0; + goto out; + } + + /* + * Expand the file. Since inode_setattr() updates ->i_size + * before calling the ->truncate(), but FAT needs to fill the + * hole before it. XXX: this is no longer true with new truncate + * sequence. + */ + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size > inode->i_size) { + error = fat_cont_expand(inode, attr->ia_size); + if (error || attr->ia_valid == ATTR_SIZE) + goto out; + attr->ia_valid &= ~ATTR_SIZE; + } + } + + if (((attr->ia_valid & ATTR_UID) && + (attr->ia_uid != sbi->options.fs_uid)) || + ((attr->ia_valid & ATTR_GID) && + (attr->ia_gid != sbi->options.fs_gid)) || + ((attr->ia_valid & ATTR_MODE) && + (attr->ia_mode & ~FAT_VALID_MODE))) + error = -EPERM; + + if (error) { + if (sbi->options.quiet) + error = 0; + goto out; + } + + /* + * We don't return -EPERM here. Yes, strange, but this is too + * old behavior. + */ + if (attr->ia_valid & ATTR_MODE) { + if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) + attr->ia_valid &= ~ATTR_MODE; + } + + if (attr->ia_valid & ATTR_SIZE) { + truncate_setsize(inode, attr->ia_size); + fat_truncate_blocks(inode, attr->ia_size); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); +out: + return error; +} +EXPORT_SYMBOL_GPL(fat_setattr); + +const struct inode_operations fat_file_inode_operations = { + .setattr = fat_setattr, + .getattr = fat_getattr, +}; diff --git a/fs/fat/inode.c b/fs/fat/inode.c new file mode 100644 index 00000000..9836839e --- /dev/null +++ b/fs/fat/inode.c @@ -0,0 +1,1619 @@ +/* + * linux/fs/fat/inode.c + * + * Written 1992,1993 by Werner Almesberger + * VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner + * Rewritten for the constant inumbers support by Al Viro + * + * Fixes: + * + * Max Cohan: Fixed invalid FSINFO offset when info_sector is 0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fat.h" + +#ifndef CONFIG_FAT_DEFAULT_IOCHARSET +/* if user don't select VFAT, this is undefined. 
*/ +#define CONFIG_FAT_DEFAULT_IOCHARSET "" +#endif + +static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE; +static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET; + + +static int fat_add_cluster(struct inode *inode) +{ + int err, cluster; + + err = fat_alloc_clusters(inode, &cluster, 1); + if (err) + return err; + /* FIXME: this cluster should be added after data of this + * cluster is writed */ + err = fat_chain_add(inode, cluster, 1); + if (err) + fat_free_clusters(inode, cluster); + return err; +} + +static inline int __fat_get_block(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + unsigned long mapped_blocks; + sector_t phys; + int err, offset; + + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + if (err) + return err; + if (phys) { + map_bh(bh_result, sb, phys); + *max_blocks = min(mapped_blocks, *max_blocks); + return 0; + } + if (!create) + return 0; + + if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { + fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)", + MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); + return -EIO; + } + + offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); + if (!offset) { + /* TODO: multiple cluster allocation would be desirable. */ + err = fat_add_cluster(inode); + if (err) + return err; + } + /* available blocks on this cluster */ + mapped_blocks = sbi->sec_per_clus - offset; + + *max_blocks = min(mapped_blocks, *max_blocks); + MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; + + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + if (err) + return err; + + BUG_ON(!phys); + BUG_ON(*max_blocks != mapped_blocks); + set_buffer_new(bh_result); + map_bh(bh_result, sb, phys); + + return 0; +} + +static int fat_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + + err = __fat_get_block(inode, iblock, &max_blocks, bh_result, create); + if (err) + return err; + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + return 0; +} + +static int fat_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, fat_get_block, wbc); +} + +static int fat_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, fat_get_block); +} + +static int fat_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, fat_get_block); +} + +static int fat_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, fat_get_block); +} + +static void fat_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + fat_truncate_blocks(inode, inode->i_size); + } +} + +static int fat_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int err; + + *pagep = NULL; + err = cont_write_begin(file, mapping, pos, len, flags, + pagep, fsdata, fat_get_block, + &MSDOS_I(mapping->host)->mmu_private); + if (err < 0) + fat_write_failed(mapping, pos + len); + 
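+	/* fat_write_failed() above trims blocks instantiated past i_size, so
+	 * the error can simply be returned */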
return err; +} + +static int fat_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pagep, void *fsdata) +{ + struct inode *inode = mapping->host; + int err; + err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + if (err < len) + fat_write_failed(mapping, pos + len); + if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + mark_inode_dirty(inode); + } + return err; +} + +static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + if (rw == WRITE) { + /* + * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), + * so we need to update the ->mmu_private to block boundary. + * + * But we must fill the remaining area or hole by nul for + * updating ->mmu_private. + * + * Return 0, and fallback to normal buffered write. + */ + loff_t size = offset + iov_length(iov, nr_segs); + if (MSDOS_I(inode)->mmu_private < size) + return 0; + } + + /* + * FAT need to use the DIO_LOCKING for avoiding the race + * condition of fat_get_block() and ->truncate(). + */ + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, fat_get_block, NULL); + if (ret < 0 && (rw & WRITE)) + fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); + + return ret; +} + +static sector_t _fat_bmap(struct address_space *mapping, sector_t block) +{ + sector_t blocknr; + + /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ + down_read(&mapping->host->i_alloc_sem); + blocknr = generic_block_bmap(mapping, block, fat_get_block); + up_read(&mapping->host->i_alloc_sem); + + return blocknr; +} + +static const struct address_space_operations fat_aops = { + .readpage = fat_readpage, + .readpages = fat_readpages, + .writepage = fat_writepage, + .writepages = fat_writepages, + .write_begin = fat_write_begin, + .write_end = fat_write_end, + .direct_IO = fat_direct_IO, + .bmap = _fat_bmap +}; + +/* + * New FAT inode stuff. We do the following: + * a) i_ino is constant and has nothing with on-disk location. + * b) FAT manages its own cache of directory entries. + * c) *This* cache is indexed by on-disk location. + * d) inode has an associated directory entry, all right, but + * it may be unhashed. + * e) currently entries are stored within struct inode. That should + * change. + * f) we deal with races in the following way: + * 1. readdir() and lookup() do FAT-dir-cache lookup. + * 2. rename() unhashes the F-d-c entry and rehashes it in + * a new place. + * 3. unlink() and rmdir() unhash F-d-c entry. + * 4. fat_write_inode() checks whether the thing is unhashed. + * If it is we silently return. If it isn't we do bread(), + * check if the location is still valid and retry if it + * isn't. Otherwise we do changes. + * 5. Spinlock is used to protect hash/unhash/location check/lookup + * 6. fat_evict_inode() unhashes the F-d-c entry. + * 7. lookup() and readdir() do igrab() if they find a F-d-c entry + * and consider negative result as cache miss. 
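+ * In short: i_pos (the on-disk location of the directory entry) is the cache
+ * key, inode_hashtable[] the index, and inode_hash_lock the lock protecting
+ * fat_attach()/fat_detach()/fat_iget().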
+ */ + +static void fat_hash_init(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int i; + + spin_lock_init(&sbi->inode_hash_lock); + for (i = 0; i < FAT_HASH_SIZE; i++) + INIT_HLIST_HEAD(&sbi->inode_hashtable[i]); +} + +static inline unsigned long fat_hash(loff_t i_pos) +{ + return hash_32(i_pos, FAT_HASH_BITS); +} + +void fat_attach(struct inode *inode, loff_t i_pos) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); + + spin_lock(&sbi->inode_hash_lock); + MSDOS_I(inode)->i_pos = i_pos; + hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head); + spin_unlock(&sbi->inode_hash_lock); +} +EXPORT_SYMBOL_GPL(fat_attach); + +void fat_detach(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + spin_lock(&sbi->inode_hash_lock); + MSDOS_I(inode)->i_pos = 0; + hlist_del_init(&MSDOS_I(inode)->i_fat_hash); + spin_unlock(&sbi->inode_hash_lock); +} +EXPORT_SYMBOL_GPL(fat_detach); + +struct inode *fat_iget(struct super_block *sb, loff_t i_pos) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); + struct hlist_node *_p; + struct msdos_inode_info *i; + struct inode *inode = NULL; + + spin_lock(&sbi->inode_hash_lock); + hlist_for_each_entry(i, _p, head, i_fat_hash) { + BUG_ON(i->vfs_inode.i_sb != sb); + if (i->i_pos != i_pos) + continue; + inode = igrab(&i->vfs_inode); + if (inode) + break; + } + spin_unlock(&sbi->inode_hash_lock); + return inode; +} + +static int is_exec(unsigned char *extension) +{ + unsigned char *exe_extensions = "EXECOMBAT", *walk; + + for (walk = exe_extensions; *walk; walk += 3) + if (!strncmp(extension, walk, 3)) + return 1; + return 0; +} + +static int fat_calc_dir_size(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int ret, fclus, dclus; + + inode->i_size = 0; + if (MSDOS_I(inode)->i_start == 0) + return 0; + + ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); + if (ret < 0) + return ret; + inode->i_size = (fclus + 1) << sbi->cluster_bits; + + return 0; +} + +/* doesn't deal with root inode */ +static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int error; + + MSDOS_I(inode)->i_pos = 0; + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode->i_version++; + inode->i_generation = get_seconds(); + + if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { + inode->i_generation &= ~1; + inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO); + inode->i_op = sbi->dir_ops; + inode->i_fop = &fat_dir_operations; + + MSDOS_I(inode)->i_start = le16_to_cpu(de->start); + if (sbi->fat_bits == 32) + MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16); + + MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; + error = fat_calc_dir_size(inode); + if (error < 0) + return error; + MSDOS_I(inode)->mmu_private = inode->i_size; + + inode->i_nlink = fat_subdirs(inode); + } else { /* not a directory */ + inode->i_generation |= 1; + inode->i_mode = fat_make_mode(sbi, de->attr, + ((sbi->options.showexec && !is_exec(de->name + 8)) + ? 
S_IRUGO|S_IWUGO : S_IRWXUGO)); + MSDOS_I(inode)->i_start = le16_to_cpu(de->start); + if (sbi->fat_bits == 32) + MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16); + + MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; + inode->i_size = le32_to_cpu(de->size); + inode->i_op = &fat_file_inode_operations; + inode->i_fop = &fat_file_operations; + inode->i_mapping->a_ops = &fat_aops; + MSDOS_I(inode)->mmu_private = inode->i_size; + } + if (de->attr & ATTR_SYS) { + if (sbi->options.sys_immutable) + inode->i_flags |= S_IMMUTABLE; + } + fat_save_attrs(inode, de->attr); + + inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) + & ~((loff_t)sbi->cluster_size - 1)) >> 9; + + fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); + if (sbi->options.isvfat) { + fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, + de->cdate, de->ctime_cs); + fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + } else + inode->i_ctime = inode->i_atime = inode->i_mtime; + + return 0; +} + +struct inode *fat_build_inode(struct super_block *sb, + struct msdos_dir_entry *de, loff_t i_pos) +{ + struct inode *inode; + int err; + + inode = fat_iget(sb, i_pos); + if (inode) + goto out; + inode = new_inode(sb); + if (!inode) { + inode = ERR_PTR(-ENOMEM); + goto out; + } + inode->i_ino = iunique(sb, MSDOS_ROOT_INO); + inode->i_version = 1; + err = fat_fill_inode(inode, de); + if (err) { + iput(inode); + inode = ERR_PTR(err); + goto out; + } + fat_attach(inode, i_pos); + insert_inode_hash(inode); +out: + return inode; +} + +EXPORT_SYMBOL_GPL(fat_build_inode); + +static void fat_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + if (!inode->i_nlink) { + inode->i_size = 0; + fat_truncate_blocks(inode, 0); + } + invalidate_inode_buffers(inode); + end_writeback(inode); + fat_cache_inval_inode(inode); + fat_detach(inode); +} + +static void fat_write_super(struct super_block *sb) +{ + lock_super(sb); + sb->s_dirt = 0; + + if (!(sb->s_flags & MS_RDONLY)) + fat_clusters_flush(sb); + unlock_super(sb); +} + +static int fat_sync_fs(struct super_block *sb, int wait) +{ + int err = 0; + + if (sb->s_dirt) { + lock_super(sb); + sb->s_dirt = 0; + err = fat_clusters_flush(sb); + unlock_super(sb); + } + + return err; +} + +static void fat_put_super(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + if (sb->s_dirt) + fat_write_super(sb); + + iput(sbi->fat_inode); + + unload_nls(sbi->nls_disk); + unload_nls(sbi->nls_io); + + if (sbi->options.iocharset != fat_default_iocharset) + kfree(sbi->options.iocharset); + + sb->s_fs_info = NULL; + kfree(sbi); +} + +static struct kmem_cache *fat_inode_cachep; + +static struct inode *fat_alloc_inode(struct super_block *sb) +{ + struct msdos_inode_info *ei; + ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void fat_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); +} + +static void fat_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, fat_i_callback); +} + +static void init_once(void *foo) +{ + struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; + + spin_lock_init(&ei->cache_lru_lock); + ei->nr_caches = 0; + ei->cache_valid_id = FAT_CACHE_VALID + 1; + INIT_LIST_HEAD(&ei->cache_lru); + INIT_HLIST_NODE(&ei->i_fat_hash); + inode_init_once(&ei->vfs_inode); +} + +static int __init 
fat_init_inodecache(void) +{ + fat_inode_cachep = kmem_cache_create("fat_inode_cache", + sizeof(struct msdos_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (fat_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void __exit fat_destroy_inodecache(void) +{ + kmem_cache_destroy(fat_inode_cachep); +} + +static int fat_remount(struct super_block *sb, int *flags, char *data) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + return 0; +} + +static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + /* If the count of free cluster is still unknown, counts it here. */ + if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { + int err = fat_count_free_clusters(dentry->d_sb); + if (err) + return err; + } + + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = sbi->cluster_size; + buf->f_blocks = sbi->max_cluster - FAT_START_ENT; + buf->f_bfree = sbi->free_clusters; + buf->f_bavail = sbi->free_clusters; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = + (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE; + + return 0; +} + +static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, + struct inode *inode) +{ + loff_t i_pos; +#if BITS_PER_LONG == 32 + spin_lock(&sbi->inode_hash_lock); +#endif + i_pos = MSDOS_I(inode)->i_pos; +#if BITS_PER_LONG == 32 + spin_unlock(&sbi->inode_hash_lock); +#endif + return i_pos; +} + +static int __fat_write_inode(struct inode *inode, int wait) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + struct msdos_dir_entry *raw_entry; + loff_t i_pos; + int err; + + if (inode->i_ino == MSDOS_ROOT_INO) + return 0; + +retry: + i_pos = fat_i_pos_read(sbi, inode); + if (!i_pos) + return 0; + + bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); + if (!bh) { + fat_msg(sb, KERN_ERR, "unable to read inode block " + "for updating (i_pos %lld)", i_pos); + return -EIO; + } + spin_lock(&sbi->inode_hash_lock); + if (i_pos != MSDOS_I(inode)->i_pos) { + spin_unlock(&sbi->inode_hash_lock); + brelse(bh); + goto retry; + } + + raw_entry = &((struct msdos_dir_entry *) (bh->b_data)) + [i_pos & (sbi->dir_per_block - 1)]; + if (S_ISDIR(inode->i_mode)) + raw_entry->size = 0; + else + raw_entry->size = cpu_to_le32(inode->i_size); + raw_entry->attr = fat_make_attrs(inode); + raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart); + raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16); + fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, + &raw_entry->date, NULL); + if (sbi->options.isvfat) { + __le16 atime; + fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, + &raw_entry->cdate, &raw_entry->ctime_cs); + fat_time_unix2fat(sbi, &inode->i_atime, &atime, + &raw_entry->adate, NULL); + } + spin_unlock(&sbi->inode_hash_lock); + mark_buffer_dirty(bh); + err = 0; + if (wait) + err = sync_dirty_buffer(bh); + brelse(bh); + return err; +} + +static int fat_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int fat_sync_inode(struct inode *inode) +{ + return __fat_write_inode(inode, 1); +} + +EXPORT_SYMBOL_GPL(fat_sync_inode); + +static int fat_show_options(struct seq_file *m, struct vfsmount *mnt); +static const 
struct super_operations fat_sops = { + .alloc_inode = fat_alloc_inode, + .destroy_inode = fat_destroy_inode, + .write_inode = fat_write_inode, + .evict_inode = fat_evict_inode, + .put_super = fat_put_super, + .write_super = fat_write_super, + .sync_fs = fat_sync_fs, + .statfs = fat_statfs, + .remount_fs = fat_remount, + + .show_options = fat_show_options, +}; + +/* + * a FAT file handle with fhtype 3 is + * 0/ i_ino - for fast, reliable lookup if still in the cache + * 1/ i_generation - to see if i_ino is still valid + * bit 0 == 0 iff directory + * 2/ i_pos(8-39) - if ino has changed, but still in cache + * 3/ i_pos(4-7)|i_logstart - to semi-verify inode found at i_pos + * 4/ i_pos(0-3)|parent->i_logstart - maybe used to hunt for the file on disc + * + * Hack for NFSv2: Maximum FAT entry number is 28bits and maximum + * i_pos is 40bits (blocknr(32) + dir offset(8)), so two 4bits + * of i_logstart is used to store the directory entry offset. + */ + +static struct dentry *fat_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct inode *inode = NULL; + u32 *fh = fid->raw; + + if (fh_len < 5 || fh_type != 3) + return NULL; + + inode = ilookup(sb, fh[0]); + if (!inode || inode->i_generation != fh[1]) { + if (inode) + iput(inode); + inode = NULL; + } + if (!inode) { + loff_t i_pos; + int i_logstart = fh[3] & 0x0fffffff; + + i_pos = (loff_t)fh[2] << 8; + i_pos |= ((fh[3] >> 24) & 0xf0) | (fh[4] >> 28); + + /* try 2 - see if i_pos is in F-d-c + * require i_logstart to be the same + * Will fail if you truncate and then re-write + */ + + inode = fat_iget(sb, i_pos); + if (inode && MSDOS_I(inode)->i_logstart != i_logstart) { + iput(inode); + inode = NULL; + } + } + + /* + * For now, do nothing if the inode is not found. + * + * What we could do is: + * + * - follow the file starting at fh[4], and record the ".." entry, + * and the name of the fh[2] entry. + * - then follow the ".." file finding the next step up. + * + * This way we build a path to the root of the tree. If this works, we + * lookup the path and so get this inode into the cache. Finally try + * the fat_iget lookup again. If that fails, then we are totally out + * of luck. 
But all that is for another day + */ + return d_obtain_alias(inode); +} + +static int +fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) +{ + int len = *lenp; + struct inode *inode = de->d_inode; + u32 ipos_h, ipos_m, ipos_l; + + if (len < 5) { + *lenp = 5; + return 255; /* no room */ + } + + ipos_h = MSDOS_I(inode)->i_pos >> 8; + ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; + ipos_l = (MSDOS_I(inode)->i_pos & 0x0f) << 28; + *lenp = 5; + fh[0] = inode->i_ino; + fh[1] = inode->i_generation; + fh[2] = ipos_h; + fh[3] = ipos_m | MSDOS_I(inode)->i_logstart; + spin_lock(&de->d_lock); + fh[4] = ipos_l | MSDOS_I(de->d_parent->d_inode)->i_logstart; + spin_unlock(&de->d_lock); + return 3; +} + +static struct dentry *fat_get_parent(struct dentry *child) +{ + struct super_block *sb = child->d_sb; + struct buffer_head *bh; + struct msdos_dir_entry *de; + loff_t i_pos; + struct dentry *parent; + struct inode *inode; + int err; + + lock_super(sb); + + err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); + if (err) { + parent = ERR_PTR(err); + goto out; + } + inode = fat_build_inode(sb, de, i_pos); + brelse(bh); + + parent = d_obtain_alias(inode); +out: + unlock_super(sb); + + return parent; +} + +static const struct export_operations fat_export_ops = { + .encode_fh = fat_encode_fh, + .fh_to_dentry = fat_fh_to_dentry, + .get_parent = fat_get_parent, +}; + +static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct msdos_sb_info *sbi = MSDOS_SB(mnt->mnt_sb); + struct fat_mount_options *opts = &sbi->options; + int isvfat = opts->isvfat; + + if (opts->fs_uid != 0) + seq_printf(m, ",uid=%u", opts->fs_uid); + if (opts->fs_gid != 0) + seq_printf(m, ",gid=%u", opts->fs_gid); + seq_printf(m, ",fmask=%04o", opts->fs_fmask); + seq_printf(m, ",dmask=%04o", opts->fs_dmask); + if (opts->allow_utime) + seq_printf(m, ",allow_utime=%04o", opts->allow_utime); + if (sbi->nls_disk) + seq_printf(m, ",codepage=%s", sbi->nls_disk->charset); + if (isvfat) { + if (sbi->nls_io) + seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); + + switch (opts->shortname) { + case VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95: + seq_puts(m, ",shortname=win95"); + break; + case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT: + seq_puts(m, ",shortname=winnt"); + break; + case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95: + seq_puts(m, ",shortname=mixed"); + break; + case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: + seq_puts(m, ",shortname=lower"); + break; + default: + seq_puts(m, ",shortname=unknown"); + break; + } + } + if (opts->name_check != 'n') + seq_printf(m, ",check=%c", opts->name_check); + if (opts->usefree) + seq_puts(m, ",usefree"); + if (opts->quiet) + seq_puts(m, ",quiet"); + if (opts->showexec) + seq_puts(m, ",showexec"); + if (opts->sys_immutable) + seq_puts(m, ",sys_immutable"); + if (!isvfat) { + if (opts->dotsOK) + seq_puts(m, ",dotsOK=yes"); + if (opts->nocase) + seq_puts(m, ",nocase"); + } else { + if (opts->utf8) + seq_puts(m, ",utf8"); + if (opts->unicode_xlate) + seq_puts(m, ",uni_xlate"); + if (!opts->numtail) + seq_puts(m, ",nonumtail"); + if (opts->rodir) + seq_puts(m, ",rodir"); + } + if (opts->flush) + seq_puts(m, ",flush"); + if (opts->tz_utc) + seq_puts(m, ",tz=UTC"); + if (opts->errors == FAT_ERRORS_CONT) + seq_puts(m, ",errors=continue"); + else if (opts->errors == FAT_ERRORS_PANIC) + seq_puts(m, ",errors=panic"); + else + seq_puts(m, ",errors=remount-ro"); + if (opts->discard) + seq_puts(m, ",discard"); + + return 0; +} + +enum { + Opt_check_n, 
Opt_check_r, Opt_check_s, Opt_uid, Opt_gid, + Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage, + Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug, + Opt_immutable, Opt_dots, Opt_nodots, + Opt_charset, Opt_shortname_lower, Opt_shortname_win95, + Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, + Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, + Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, + Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err, +}; + +static const match_table_t fat_tokens = { + {Opt_check_r, "check=relaxed"}, + {Opt_check_s, "check=strict"}, + {Opt_check_n, "check=normal"}, + {Opt_check_r, "check=r"}, + {Opt_check_s, "check=s"}, + {Opt_check_n, "check=n"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_dmask, "dmask=%o"}, + {Opt_fmask, "fmask=%o"}, + {Opt_allow_utime, "allow_utime=%o"}, + {Opt_codepage, "codepage=%u"}, + {Opt_usefree, "usefree"}, + {Opt_nocase, "nocase"}, + {Opt_quiet, "quiet"}, + {Opt_showexec, "showexec"}, + {Opt_debug, "debug"}, + {Opt_immutable, "sys_immutable"}, + {Opt_flush, "flush"}, + {Opt_tz_utc, "tz=UTC"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_discard, "discard"}, + {Opt_obsolate, "conv=binary"}, + {Opt_obsolate, "conv=text"}, + {Opt_obsolate, "conv=auto"}, + {Opt_obsolate, "conv=b"}, + {Opt_obsolate, "conv=t"}, + {Opt_obsolate, "conv=a"}, + {Opt_obsolate, "fat=%u"}, + {Opt_obsolate, "blocksize=%u"}, + {Opt_obsolate, "cvf_format=%20s"}, + {Opt_obsolate, "cvf_options=%100s"}, + {Opt_obsolate, "posix"}, + {Opt_err, NULL}, +}; +static const match_table_t msdos_tokens = { + {Opt_nodots, "nodots"}, + {Opt_nodots, "dotsOK=no"}, + {Opt_dots, "dots"}, + {Opt_dots, "dotsOK=yes"}, + {Opt_err, NULL} +}; +static const match_table_t vfat_tokens = { + {Opt_charset, "iocharset=%s"}, + {Opt_shortname_lower, "shortname=lower"}, + {Opt_shortname_win95, "shortname=win95"}, + {Opt_shortname_winnt, "shortname=winnt"}, + {Opt_shortname_mixed, "shortname=mixed"}, + {Opt_utf8_no, "utf8=0"}, /* 0 or no or false */ + {Opt_utf8_no, "utf8=no"}, + {Opt_utf8_no, "utf8=false"}, + {Opt_utf8_yes, "utf8=1"}, /* empty or 1 or yes or true */ + {Opt_utf8_yes, "utf8=yes"}, + {Opt_utf8_yes, "utf8=true"}, + {Opt_utf8_yes, "utf8"}, + {Opt_uni_xl_no, "uni_xlate=0"}, /* 0 or no or false */ + {Opt_uni_xl_no, "uni_xlate=no"}, + {Opt_uni_xl_no, "uni_xlate=false"}, + {Opt_uni_xl_yes, "uni_xlate=1"}, /* empty or 1 or yes or true */ + {Opt_uni_xl_yes, "uni_xlate=yes"}, + {Opt_uni_xl_yes, "uni_xlate=true"}, + {Opt_uni_xl_yes, "uni_xlate"}, + {Opt_nonumtail_no, "nonumtail=0"}, /* 0 or no or false */ + {Opt_nonumtail_no, "nonumtail=no"}, + {Opt_nonumtail_no, "nonumtail=false"}, + {Opt_nonumtail_yes, "nonumtail=1"}, /* empty or 1 or yes or true */ + {Opt_nonumtail_yes, "nonumtail=yes"}, + {Opt_nonumtail_yes, "nonumtail=true"}, + {Opt_nonumtail_yes, "nonumtail"}, + {Opt_rodir, "rodir"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options, int is_vfat, + int silent, int *debug, struct fat_mount_options *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *iocharset; + + opts->isvfat = is_vfat; + + opts->fs_uid = current_uid(); + opts->fs_gid = current_gid(); + opts->fs_fmask = opts->fs_dmask = current_umask(); + opts->allow_utime = -1; + opts->codepage = fat_default_codepage; + opts->iocharset = fat_default_iocharset; + if (is_vfat) { + opts->shortname = 
VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; + opts->rodir = 0; + } else { + opts->shortname = 0; + opts->rodir = 1; + } + opts->name_check = 'n'; + opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; + opts->utf8 = opts->unicode_xlate = 0; + opts->numtail = 1; + opts->usefree = opts->nocase = 0; + opts->tz_utc = 0; + opts->errors = FAT_ERRORS_RO; + *debug = 0; + + if (!options) + goto out; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, fat_tokens, args); + if (token == Opt_err) { + if (is_vfat) + token = match_token(p, vfat_tokens, args); + else + token = match_token(p, msdos_tokens, args); + } + switch (token) { + case Opt_check_s: + opts->name_check = 's'; + break; + case Opt_check_r: + opts->name_check = 'r'; + break; + case Opt_check_n: + opts->name_check = 'n'; + break; + case Opt_usefree: + opts->usefree = 1; + break; + case Opt_nocase: + if (!is_vfat) + opts->nocase = 1; + else { + /* for backward compatibility */ + opts->shortname = VFAT_SFN_DISPLAY_WIN95 + | VFAT_SFN_CREATE_WIN95; + } + break; + case Opt_quiet: + opts->quiet = 1; + break; + case Opt_showexec: + opts->showexec = 1; + break; + case Opt_debug: + *debug = 1; + break; + case Opt_immutable: + opts->sys_immutable = 1; + break; + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + opts->fs_uid = option; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + opts->fs_gid = option; + break; + case Opt_umask: + if (match_octal(&args[0], &option)) + return 0; + opts->fs_fmask = opts->fs_dmask = option; + break; + case Opt_dmask: + if (match_octal(&args[0], &option)) + return 0; + opts->fs_dmask = option; + break; + case Opt_fmask: + if (match_octal(&args[0], &option)) + return 0; + opts->fs_fmask = option; + break; + case Opt_allow_utime: + if (match_octal(&args[0], &option)) + return 0; + opts->allow_utime = option & (S_IWGRP | S_IWOTH); + break; + case Opt_codepage: + if (match_int(&args[0], &option)) + return 0; + opts->codepage = option; + break; + case Opt_flush: + opts->flush = 1; + break; + case Opt_tz_utc: + opts->tz_utc = 1; + break; + case Opt_err_cont: + opts->errors = FAT_ERRORS_CONT; + break; + case Opt_err_panic: + opts->errors = FAT_ERRORS_PANIC; + break; + case Opt_err_ro: + opts->errors = FAT_ERRORS_RO; + break; + + /* msdos specific */ + case Opt_dots: + opts->dotsOK = 1; + break; + case Opt_nodots: + opts->dotsOK = 0; + break; + + /* vfat specific */ + case Opt_charset: + if (opts->iocharset != fat_default_iocharset) + kfree(opts->iocharset); + iocharset = match_strdup(&args[0]); + if (!iocharset) + return -ENOMEM; + opts->iocharset = iocharset; + break; + case Opt_shortname_lower: + opts->shortname = VFAT_SFN_DISPLAY_LOWER + | VFAT_SFN_CREATE_WIN95; + break; + case Opt_shortname_win95: + opts->shortname = VFAT_SFN_DISPLAY_WIN95 + | VFAT_SFN_CREATE_WIN95; + break; + case Opt_shortname_winnt: + opts->shortname = VFAT_SFN_DISPLAY_WINNT + | VFAT_SFN_CREATE_WINNT; + break; + case Opt_shortname_mixed: + opts->shortname = VFAT_SFN_DISPLAY_WINNT + | VFAT_SFN_CREATE_WIN95; + break; + case Opt_utf8_no: /* 0 or no or false */ + opts->utf8 = 0; + break; + case Opt_utf8_yes: /* empty or 1 or yes or true */ + opts->utf8 = 1; + break; + case Opt_uni_xl_no: /* 0 or no or false */ + opts->unicode_xlate = 0; + break; + case Opt_uni_xl_yes: /* empty or 1 or yes or true */ + opts->unicode_xlate = 1; + break; + case Opt_nonumtail_no: /* 0 or no or false */ + opts->numtail = 1; /* negated option */ + break; + 
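/*
+		 * "nonumtail" is a negated option: any of "nonumtail",
+		 * "nonumtail=1", "nonumtail=yes" or "nonumtail=true" from the
+		 * vfat token table lands in the case below and clears numtail,
+		 * so the generated 8.3 alias is first tried without a
+		 * "~1"-style numeric tail.
+		 */
+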
case Opt_nonumtail_yes: /* empty or 1 or yes or true */ + opts->numtail = 0; /* negated option */ + break; + case Opt_rodir: + opts->rodir = 1; + break; + case Opt_discard: + opts->discard = 1; + break; + + /* obsolete mount options */ + case Opt_obsolate: + fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, " + "not supported now", p); + break; + /* unknown option */ + default: + if (!silent) { + fat_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " + "or missing value", p); + } + return -EINVAL; + } + } + +out: + /* UTF-8 doesn't provide FAT semantics */ + if (!strcmp(opts->iocharset, "utf8")) { + fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" + " for FAT filesystems, filesystem will be " + "case sensitive!\n"); + } + + /* If user doesn't specify allow_utime, it's initialized from dmask. */ + if (opts->allow_utime == (unsigned short)-1) + opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); + if (opts->unicode_xlate) + opts->utf8 = 0; + + return 0; +} + +static int fat_read_root(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int error; + + MSDOS_I(inode)->i_pos = 0; + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode->i_version++; + inode->i_generation = 0; + inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO); + inode->i_op = sbi->dir_ops; + inode->i_fop = &fat_dir_operations; + if (sbi->fat_bits == 32) { + MSDOS_I(inode)->i_start = sbi->root_cluster; + error = fat_calc_dir_size(inode); + if (error < 0) + return error; + } else { + MSDOS_I(inode)->i_start = 0; + inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry); + } + inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) + & ~((loff_t)sbi->cluster_size - 1)) >> 9; + MSDOS_I(inode)->i_logstart = 0; + MSDOS_I(inode)->mmu_private = inode->i_size; + + fat_save_attrs(inode, ATTR_DIR); + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; + inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode->i_nlink = fat_subdirs(inode)+2; + + return 0; +} + +/* + * Read the super block of an MS-DOS FS. + */ +int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, + void (*setup)(struct super_block *)) +{ + struct inode *root_inode = NULL, *fat_inode = NULL; + struct buffer_head *bh; + struct fat_boot_sector *b; + struct fat_boot_bsx *bsx; + struct msdos_sb_info *sbi; + u16 logical_sector_size; + u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors; + int debug; + unsigned int media; + long error; + char buf[50]; + + /* + * GFP_KERNEL is ok here, because while we do hold the + * supeblock lock, memory pressure can't call back into + * the filesystem, since we're only just about to mount + * it and have no inodes etc active! 
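+	 * (Once the mount has completed and inodes are live, allocations on
+	 * paths that memory reclaim could re-enter would typically want
+	 * GFP_NOFS instead; at this point there is nothing for reclaim to
+	 * recurse into.)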
+ */ + sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + sb->s_flags |= MS_NODIRATIME; + sb->s_magic = MSDOS_SUPER_MAGIC; + sb->s_op = &fat_sops; + sb->s_export_op = &fat_export_ops; + ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options); + if (error) + goto out_fail; + + setup(sb); /* flavour-specific stuff that needs options */ + + error = -EIO; + sb_min_blocksize(sb, 512); + bh = sb_bread(sb, 0); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "unable to read boot sector"); + goto out_fail; + } + + b = (struct fat_boot_sector *) bh->b_data; + if (!b->reserved) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of reserved sectors"); + brelse(bh); + goto out_invalid; + } + if (!b->fats) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); + brelse(bh); + goto out_invalid; + } + + /* + * Earlier we checked here that b->secs_track and b->head are nonzero, + * but it turns out valid FAT filesystems can have zero there. + */ + + media = b->media; + if (!fat_valid_media(media)) { + if (!silent) + fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", + media); + brelse(bh); + goto out_invalid; + } + logical_sector_size = get_unaligned_le16(&b->sector_size); + if (!is_power_of_2(logical_sector_size) + || (logical_sector_size < 512) + || (logical_sector_size > 4096)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus logical sector size %u", + logical_sector_size); + brelse(bh); + goto out_invalid; + } + sbi->sec_per_clus = b->sec_per_clus; + if (!is_power_of_2(sbi->sec_per_clus)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", + sbi->sec_per_clus); + brelse(bh); + goto out_invalid; + } + + if (logical_sector_size < sb->s_blocksize) { + fat_msg(sb, KERN_ERR, "logical sector size too small for device" + " (logical sector size = %u)", logical_sector_size); + brelse(bh); + goto out_fail; + } + if (logical_sector_size > sb->s_blocksize) { + brelse(bh); + + if (!sb_set_blocksize(sb, logical_sector_size)) { + fat_msg(sb, KERN_ERR, "unable to set blocksize %u", + logical_sector_size); + goto out_fail; + } + bh = sb_bread(sb, 0); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "unable to read boot sector" + " (logical sector size = %lu)", + sb->s_blocksize); + goto out_fail; + } + b = (struct fat_boot_sector *) bh->b_data; + } + + sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; + sbi->cluster_bits = ffs(sbi->cluster_size) - 1; + sbi->fats = b->fats; + sbi->fat_bits = 0; /* Don't know yet */ + sbi->fat_start = le16_to_cpu(b->reserved); + sbi->fat_length = le16_to_cpu(b->fat_length); + sbi->root_cluster = 0; + sbi->free_clusters = -1; /* Don't know yet */ + sbi->free_clus_valid = 0; + sbi->prev_free = FAT_START_ENT; + + if (!sbi->fat_length && b->fat32_length) { + struct fat_boot_fsinfo *fsinfo; + struct buffer_head *fsinfo_bh; + + /* Must be FAT32 */ + sbi->fat_bits = 32; + sbi->fat_length = le32_to_cpu(b->fat32_length); + sbi->root_cluster = le32_to_cpu(b->root_cluster); + + sb->s_maxbytes = 0xffffffff; + + /* MC - if info_sector is 0, don't multiply by 0 */ + sbi->fsinfo_sector = le16_to_cpu(b->info_sector); + if (sbi->fsinfo_sector == 0) + sbi->fsinfo_sector = 1; + + fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector); + if (fsinfo_bh == NULL) { + fat_msg(sb, KERN_ERR, "bread failed, FSINFO block" + " (sector = %lu)", sbi->fsinfo_sector); + brelse(bh); + goto out_fail; + } + + bsx = 
(struct fat_boot_bsx *)(bh->b_data + FAT32_BSX_OFFSET); + + fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data; + if (!IS_FSINFO(fsinfo)) { + fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", + le32_to_cpu(fsinfo->signature1), + le32_to_cpu(fsinfo->signature2), + sbi->fsinfo_sector); + } else { + if (sbi->options.usefree) + sbi->free_clus_valid = 1; + sbi->free_clusters = le32_to_cpu(fsinfo->free_clusters); + sbi->prev_free = le32_to_cpu(fsinfo->next_cluster); + } + + brelse(fsinfo_bh); + } else { + bsx = (struct fat_boot_bsx *)(bh->b_data + FAT16_BSX_OFFSET); + } + + /* interpret volume ID as a little endian 32 bit integer */ + sbi->vol_id = (((u32)bsx->vol_id[0]) | ((u32)bsx->vol_id[1] << 8) | + ((u32)bsx->vol_id[2] << 16) | ((u32)bsx->vol_id[3] << 24)); + + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); + sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; + + sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; + sbi->dir_entries = get_unaligned_le16(&b->dir_entries); + if (sbi->dir_entries & (sbi->dir_per_block - 1)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus directroy-entries per block" + " (%u)", sbi->dir_entries); + brelse(bh); + goto out_invalid; + } + + rootdir_sectors = sbi->dir_entries + * sizeof(struct msdos_dir_entry) / sb->s_blocksize; + sbi->data_start = sbi->dir_start + rootdir_sectors; + total_sectors = get_unaligned_le16(&b->sectors); + if (total_sectors == 0) + total_sectors = le32_to_cpu(b->total_sect); + + total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus; + + if (sbi->fat_bits != 32) + sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12; + + /* check that FAT table does not overflow */ + fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; + total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); + if (total_clusters > MAX_FAT(sb)) { + if (!silent) + fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", + total_clusters); + brelse(bh); + goto out_invalid; + } + + sbi->max_cluster = total_clusters + FAT_START_ENT; + /* check the free_clusters, it's not necessarily correct */ + if (sbi->free_clusters != -1 && sbi->free_clusters > total_clusters) + sbi->free_clusters = -1; + /* check the prev_free, it's not necessarily correct */ + sbi->prev_free %= sbi->max_cluster; + if (sbi->prev_free < FAT_START_ENT) + sbi->prev_free = FAT_START_ENT; + + brelse(bh); + + /* set up enough so that it can read an inode */ + fat_hash_init(sb); + fat_ent_access_init(sb); + + /* + * The low byte of FAT's first entry must have same value with + * media-field. But in real world, too many devices is + * writing wrong value. So, removed that validity check. 
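+	 * (For example, a fixed-disk media byte of 0xf8 would normally be
+	 * mirrored as a first FAT entry of 0xff8, 0xfff8 or 0x0ffffff8 on
+	 * FAT12, FAT16 and FAT32 respectively, which is what the dropped
+	 * FAT_FIRST_ENT() comparison used to verify.)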
+ * + * if (FAT_FIRST_ENT(sb, media) != first) + */ + + error = -EINVAL; + sprintf(buf, "cp%d", sbi->options.codepage); + sbi->nls_disk = load_nls(buf); + if (!sbi->nls_disk) { + fat_msg(sb, KERN_ERR, "codepage %s not found", buf); + goto out_fail; + } + + /* FIXME: utf8 is using iocharset for upper/lower conversion */ + if (sbi->options.isvfat) { + sbi->nls_io = load_nls(sbi->options.iocharset); + if (!sbi->nls_io) { + fat_msg(sb, KERN_ERR, "IO charset %s not found", + sbi->options.iocharset); + goto out_fail; + } + } + + error = -ENOMEM; + fat_inode = new_inode(sb); + if (!fat_inode) + goto out_fail; + MSDOS_I(fat_inode)->i_pos = 0; + sbi->fat_inode = fat_inode; + root_inode = new_inode(sb); + if (!root_inode) + goto out_fail; + root_inode->i_ino = MSDOS_ROOT_INO; + root_inode->i_version = 1; + error = fat_read_root(root_inode); + if (error < 0) + goto out_fail; + error = -ENOMEM; + insert_inode_hash(root_inode); + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) { + fat_msg(sb, KERN_ERR, "get root inode failed"); + goto out_fail; + } + + return 0; + +out_invalid: + error = -EINVAL; + if (!silent) + fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); + +out_fail: + if (fat_inode) + iput(fat_inode); + if (root_inode) + iput(root_inode); + unload_nls(sbi->nls_io); + unload_nls(sbi->nls_disk); + if (sbi->options.iocharset != fat_default_iocharset) + kfree(sbi->options.iocharset); + sb->s_fs_info = NULL; + kfree(sbi); + return error; +} + +EXPORT_SYMBOL_GPL(fat_fill_super); + +/* + * helper function for fat_flush_inodes. This writes both the inode + * and the file data blocks, waiting for in flight data blocks before + * the start of the call. It does not wait for any io started + * during the call + */ +static int writeback_inode(struct inode *inode) +{ + + int ret; + struct address_space *mapping = inode->i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 0, + }; + /* if we used WB_SYNC_ALL, sync_inode waits for the io for the + * inode to finish. So WB_SYNC_NONE is sent down to sync_inode + * and filemap_fdatawrite is used for the data blocks + */ + ret = sync_inode(inode, &wbc); + if (!ret) + ret = filemap_fdatawrite(mapping); + return ret; +} + +/* + * write data and metadata corresponding to i1 and i2. The io is + * started but we do not wait for any of it to finish. 
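+ * Note this is a no-op unless the "flush" mount option is set (typically
+ * used for removable media); callers such as msdos_create() and
+ * msdos_unlink() invoke fat_flush_inodes() once their directory update
+ * is finished.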
+ * + * filemap_flush is used for the block device, so if there is a dirty + * page for a block already in flight, we will not wait and start the + * io over again + */ +int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) +{ + int ret = 0; + if (!MSDOS_SB(sb)->options.flush) + return 0; + if (i1) + ret = writeback_inode(i1); + if (!ret && i2) + ret = writeback_inode(i2); + if (!ret) { + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + ret = filemap_flush(mapping); + } + return ret; +} +EXPORT_SYMBOL_GPL(fat_flush_inodes); + +static int __init init_fat_fs(void) +{ + int err; + + err = fat_cache_init(); + if (err) + return err; + + err = fat_init_inodecache(); + if (err) + goto failed; + + return 0; + +failed: + fat_cache_destroy(); + return err; +} + +static void __exit exit_fat_fs(void) +{ + fat_cache_destroy(); + fat_destroy_inodecache(); +} + +module_init(init_fat_fs) +module_exit(exit_fat_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/fat/misc.c b/fs/fat/misc.c new file mode 100644 index 00000000..6d93360c --- /dev/null +++ b/fs/fat/misc.c @@ -0,0 +1,278 @@ +/* + * linux/fs/fat/misc.c + * + * Written 1992,1993 by Werner Almesberger + * 22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980 + * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru) + */ + +#include +#include +#include +#include +#include "fat.h" + +/* + * fat_fs_error reports a file system problem that might indicate fa data + * corruption/inconsistency. Depending on 'errors' mount option the + * panic() is called, or error message is printed FAT and nothing is done, + * or filesystem is remounted read-only (default behavior). + * In case the file system is remounted read-only, it can be made writable + * again by remounting it. + */ +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) +{ + struct fat_mount_options *opts = &MSDOS_SB(sb)->options; + va_list args; + struct va_format vaf; + + if (report) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); + va_end(args); + } + + if (opts->errors == FAT_ERRORS_PANIC) + panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); + else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { + sb->s_flags |= MS_RDONLY; + printk(KERN_ERR "FAT-fs (%s): Filesystem has been " + "set read-only\n", sb->s_id); + } +} +EXPORT_SYMBOL_GPL(__fat_fs_error); + +/** + * fat_msg() - print preformated FAT specific messages. Every thing what is + * not fat_fs_error() should be fat_msg(). + */ +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + +/* Flushes the number of free clusters on FAT32 */ +/* XXX: Need to write one per FSINFO block. 
Currently only writes 1 */ +int fat_clusters_flush(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct buffer_head *bh; + struct fat_boot_fsinfo *fsinfo; + + if (sbi->fat_bits != 32) + return 0; + + bh = sb_bread(sb, sbi->fsinfo_sector); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush"); + return -EIO; + } + + fsinfo = (struct fat_boot_fsinfo *)bh->b_data; + /* Sanity check */ + if (!IS_FSINFO(fsinfo)) { + fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", + le32_to_cpu(fsinfo->signature1), + le32_to_cpu(fsinfo->signature2), + sbi->fsinfo_sector); + } else { + if (sbi->free_clusters != -1) + fsinfo->free_clusters = cpu_to_le32(sbi->free_clusters); + if (sbi->prev_free != -1) + fsinfo->next_cluster = cpu_to_le32(sbi->prev_free); + mark_buffer_dirty(bh); + } + brelse(bh); + + return 0; +} + +/* + * fat_chain_add() adds a new cluster to the chain of clusters represented + * by inode. + */ +int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int ret, new_fclus, last; + + /* + * We must locate the last cluster of the file to add this new + * one (new_dclus) to the end of the link list (the FAT). + */ + last = new_fclus = 0; + if (MSDOS_I(inode)->i_start) { + int fclus, dclus; + + ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); + if (ret < 0) + return ret; + new_fclus = fclus + 1; + last = dclus; + } + + /* add new one to the last of the cluster chain */ + if (last) { + struct fat_entry fatent; + + fatent_init(&fatent); + ret = fat_ent_read(inode, &fatent, last); + if (ret >= 0) { + int wait = inode_needs_sync(inode); + ret = fat_ent_write(inode, &fatent, new_dclus, wait); + fatent_brelse(&fatent); + } + if (ret < 0) + return ret; +// fat_cache_add(inode, new_fclus, new_dclus); + } else { + MSDOS_I(inode)->i_start = new_dclus; + MSDOS_I(inode)->i_logstart = new_dclus; + /* + * Since generic_write_sync() synchronizes regular files later, + * we sync here only directories. + */ + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { + ret = fat_sync_inode(inode); + if (ret) + return ret; + } else + mark_inode_dirty(inode); + } + if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { + fat_fs_error(sb, "clusters badly computed (%d != %llu)", + new_fclus, + (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); + fat_cache_inval_inode(inode); + } + inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9); + + return 0; +} + +extern struct timezone sys_tz; + +/* + * The epoch of FAT timestamp is 1980. + * : bits : value + * date: 0 - 4: day (1 - 31) + * date: 5 - 8: month (1 - 12) + * date: 9 - 15: year (0 - 127) from 1980 + * time: 0 - 4: sec (0 - 29) 2sec counts + * time: 5 - 10: min (0 - 59) + * time: 11 - 15: hour (0 - 23) + */ +#define SECS_PER_MIN 60 +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) +/* days between 1.1.70 and 1.1.80 (2 leap days) */ +#define DAYS_DELTA (365 * 10 + 2) +/* 120 (2100 - 1980) isn't leap year */ +#define YEAR_2100 120 +#define IS_LEAP_YEAR(y) (!((y) & 3) && (y) != YEAR_2100) + +/* Linear day numbers of the respective 1sts in non-leap years. */ +static time_t days_in_year[] = { + /* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ + 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, +}; + +/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). 
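+   For example, with tz_utc set, a date word of 0x0021 (day 1, month 1,
+   year 0 = 1980) and a time word of 0x6000 (12:00:00) give
+   3652 * 86400 + 43200 = 315576000 seconds.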
*/ +void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 __time, __le16 __date, u8 time_cs) +{ + u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date); + time_t second, day, leap_day, month, year; + + year = date >> 9; + month = max(1, (date >> 5) & 0xf); + day = max(1, date & 0x1f) - 1; + + leap_day = (year + 3) / 4; + if (year > YEAR_2100) /* 2100 isn't leap year */ + leap_day--; + if (IS_LEAP_YEAR(year) && month > 2) + leap_day++; + + second = (time & 0x1f) << 1; + second += ((time >> 5) & 0x3f) * SECS_PER_MIN; + second += (time >> 11) * SECS_PER_HOUR; + second += (year * 365 + leap_day + + days_in_year[month] + day + + DAYS_DELTA) * SECS_PER_DAY; + + if (!sbi->options.tz_utc) + second += sys_tz.tz_minuteswest * SECS_PER_MIN; + + if (time_cs) { + ts->tv_sec = second + (time_cs / 100); + ts->tv_nsec = (time_cs % 100) * 10000000; + } else { + ts->tv_sec = second; + ts->tv_nsec = 0; + } +} + +/* Convert linear UNIX date to a FAT time/date pair. */ +void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, + __le16 *time, __le16 *date, u8 *time_cs) +{ + struct tm tm; + time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 : + -sys_tz.tz_minuteswest * 60, &tm); + + /* FAT can only support year between 1980 to 2107 */ + if (tm.tm_year < 1980 - 1900) { + *time = 0; + *date = cpu_to_le16((0 << 9) | (1 << 5) | 1); + if (time_cs) + *time_cs = 0; + return; + } + if (tm.tm_year > 2107 - 1900) { + *time = cpu_to_le16((23 << 11) | (59 << 5) | 29); + *date = cpu_to_le16((127 << 9) | (12 << 5) | 31); + if (time_cs) + *time_cs = 199; + return; + } + + /* from 1900 -> from 1980 */ + tm.tm_year -= 80; + /* 0~11 -> 1~12 */ + tm.tm_mon++; + /* 0~59 -> 0~29(2sec counts) */ + tm.tm_sec >>= 1; + + *time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec); + *date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday); + if (time_cs) + *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000; +} +EXPORT_SYMBOL_GPL(fat_time_unix2fat); + +int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) +{ + int i, err = 0; + + for (i = 0; i < nr_bhs; i++) + write_dirty_buffer(bhs[i], WRITE); + + for (i = 0; i < nr_bhs; i++) { + wait_on_buffer(bhs[i]); + if (!err && !buffer_uptodate(bhs[i])) + err = -EIO; + } + return err; +} diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c new file mode 100644 index 00000000..3b222daf --- /dev/null +++ b/fs/fat/namei_msdos.c @@ -0,0 +1,702 @@ +/* + * linux/fs/msdos/namei.c + * + * Written 1992,1993 by Werner Almesberger + * Hidden files 1995 by Albert Cahalan + * Rewritten for constant inumbers 1999 by Al Viro + */ + +#include +#include +#include +#include "fat.h" + +/* Characters that are undesirable in an MS-DOS file name */ +static unsigned char bad_chars[] = "*?<>|\""; +static unsigned char bad_if_strict[] = "+=,; "; + +/***** Formats an MS-DOS file name. Rejects invalid names. */ +static int msdos_format_name(const unsigned char *name, int len, + unsigned char *res, struct fat_mount_options *opts) + /* + * name is the proposed name, len is its length, res is + * the resulting name, opts->name_check is either (r)elaxed, + * (n)ormal or (s)trict, opts->dotsOK allows dots at the + * beginning of name (for hidden files) + */ +{ + unsigned char *walk; + unsigned char c; + int space; + + if (name[0] == '.') { /* dotfile because . and .. 
already done */ + if (opts->dotsOK) { + /* Get rid of dot - test for it elsewhere */ + name++; + len--; + } else + return -EINVAL; + } + /* + * disallow names that _really_ start with a dot + */ + space = 1; + c = 0; + for (walk = res; len && walk - res < 8; walk++) { + c = *name++; + len--; + if (opts->name_check != 'r' && strchr(bad_chars, c)) + return -EINVAL; + if (opts->name_check == 's' && strchr(bad_if_strict, c)) + return -EINVAL; + if (c >= 'A' && c <= 'Z' && opts->name_check == 's') + return -EINVAL; + if (c < ' ' || c == ':' || c == '\\') + return -EINVAL; + /* + * 0xE5 is legal as a first character, but we must substitute + * 0x05 because 0xE5 marks deleted files. Yes, DOS really + * does this. + * It seems that Microsoft hacked DOS to support non-US + * characters after the 0xE5 character was already in use to + * mark deleted files. + */ + if ((res == walk) && (c == 0xE5)) + c = 0x05; + if (c == '.') + break; + space = (c == ' '); + *walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c; + } + if (space) + return -EINVAL; + if (opts->name_check == 's' && len && c != '.') { + c = *name++; + len--; + if (c != '.') + return -EINVAL; + } + while (c != '.' && len--) + c = *name++; + if (c == '.') { + while (walk - res < 8) + *walk++ = ' '; + while (len > 0 && walk - res < MSDOS_NAME) { + c = *name++; + len--; + if (opts->name_check != 'r' && strchr(bad_chars, c)) + return -EINVAL; + if (opts->name_check == 's' && + strchr(bad_if_strict, c)) + return -EINVAL; + if (c < ' ' || c == ':' || c == '\\') + return -EINVAL; + if (c == '.') { + if (opts->name_check == 's') + return -EINVAL; + break; + } + if (c >= 'A' && c <= 'Z' && opts->name_check == 's') + return -EINVAL; + space = c == ' '; + if (!opts->nocase && c >= 'a' && c <= 'z') + *walk++ = c - 32; + else + *walk++ = c; + } + if (space) + return -EINVAL; + if (opts->name_check == 's' && len) + return -EINVAL; + } + while (walk - res < MSDOS_NAME) + *walk++ = ' '; + + return 0; +} + +/***** Locates a directory entry. Uses unformatted name. */ +static int msdos_find(struct inode *dir, const unsigned char *name, int len, + struct fat_slot_info *sinfo) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); + unsigned char msdos_name[MSDOS_NAME]; + int err; + + err = msdos_format_name(name, len, msdos_name, &sbi->options); + if (err) + return -ENOENT; + + err = fat_scan(dir, msdos_name, sinfo); + if (!err && sbi->options.dotsOK) { + if (name[0] == '.') { + if (!(sinfo->de->attr & ATTR_HIDDEN)) + err = -ENOENT; + } else { + if (sinfo->de->attr & ATTR_HIDDEN) + err = -ENOENT; + } + if (err) + brelse(sinfo->bh); + } + return err; +} + +/* + * Compute the hash for the msdos name corresponding to the dentry. + * Note: if the name is invalid, we leave the hash code unchanged so + * that the existing dentry can be used. The msdos fs routines will + * return ENOENT or EINVAL as appropriate. + */ +static int msdos_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; + unsigned char msdos_name[MSDOS_NAME]; + int error; + + error = msdos_format_name(qstr->name, qstr->len, msdos_name, options); + if (!error) + qstr->hash = full_name_hash(msdos_name, MSDOS_NAME); + return 0; +} + +/* + * Compare two msdos names. If either of the names are invalid, + * we fall back to doing the standard name comparison. 
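+ * For example, "foo.bar" and "FOO.bar" both format to the same 11-byte
+ * "FOO     BAR" entry (unless "nocase" is set), so they compare equal
+ * here.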
+ */ +static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; + unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; + int error; + + error = msdos_format_name(name->name, name->len, a_msdos_name, options); + if (error) + goto old_compare; + error = msdos_format_name(str, len, b_msdos_name, options); + if (error) + goto old_compare; + error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); +out: + return error; + +old_compare: + error = 1; + if (name->len == len) + error = memcmp(name->name, str, len); + goto out; +} + +static const struct dentry_operations msdos_dentry_operations = { + .d_hash = msdos_hash, + .d_compare = msdos_cmp, +}; + +/* + * AV. Wrappers for FAT sb operations. Is it wise? + */ + +/***** Get inode using directory and name */ +static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + struct inode *inode; + int err; + + lock_super(sb); + + err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); + if (err) { + if (err == -ENOENT) { + inode = NULL; + goto out; + } + goto error; + } + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } +out: + unlock_super(sb); + return d_splice_alias(inode, dentry); + +error: + unlock_super(sb); + return ERR_PTR(err); +} + +/***** Creates a directory entry (name is already formatted). */ +static int msdos_add_entry(struct inode *dir, const unsigned char *name, + int is_dir, int is_hid, int cluster, + struct timespec *ts, struct fat_slot_info *sinfo) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); + struct msdos_dir_entry de; + __le16 time, date; + int err; + + memcpy(de.name, name, MSDOS_NAME); + de.attr = is_dir ? ATTR_DIR : ATTR_ARCH; + if (is_hid) + de.attr |= ATTR_HIDDEN; + de.lcase = 0; + fat_time_unix2fat(sbi, ts, &time, &date, NULL); + de.cdate = de.adate = 0; + de.ctime = 0; + de.ctime_cs = 0; + de.time = time; + de.date = date; + de.start = cpu_to_le16(cluster); + de.starthi = cpu_to_le16(cluster >> 16); + de.size = 0; + + err = fat_add_entries(dir, &de, 1, sinfo); + if (err) + return err; + + dir->i_ctime = dir->i_mtime = *ts; + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); + + return 0; +} + +/***** Create a file */ +static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = NULL; + struct fat_slot_info sinfo; + struct timespec ts; + unsigned char msdos_name[MSDOS_NAME]; + int err, is_hid; + + lock_super(sb); + + err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, + msdos_name, &MSDOS_SB(sb)->options); + if (err) + goto out; + is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.'); + /* Have to do it due to foo vs. 
.foo conflicts */ + if (!fat_scan(dir, msdos_name, &sinfo)) { + brelse(sinfo.bh); + err = -EINVAL; + goto out; + } + + ts = CURRENT_TIME_SEC; + err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo); + if (err) + goto out; + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + d_instantiate(dentry, inode); +out: + unlock_super(sb); + if (!err) + err = fat_flush_inodes(sb, dir, inode); + return err; +} + +/***** Remove a directory */ +static int msdos_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = dentry->d_inode; + struct fat_slot_info sinfo; + int err; + + lock_super(sb); + /* + * Check whether the directory is not in use, then check + * whether it is empty. + */ + err = fat_dir_empty(inode); + if (err) + goto out; + err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + drop_nlink(dir); + + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + unlock_super(sb); + if (!err) + err = fat_flush_inodes(sb, dir, inode); + + return err; +} + +/***** Make a directory */ +static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + struct inode *inode; + unsigned char msdos_name[MSDOS_NAME]; + struct timespec ts; + int err, is_hid, cluster; + + lock_super(sb); + + err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, + msdos_name, &MSDOS_SB(sb)->options); + if (err) + goto out; + is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.'); + /* foo vs .foo situation */ + if (!fat_scan(dir, msdos_name, &sinfo)) { + brelse(sinfo.bh); + err = -EINVAL; + goto out; + } + + ts = CURRENT_TIME_SEC; + cluster = fat_alloc_new_dir(dir, &ts); + if (cluster < 0) { + err = cluster; + goto out; + } + err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo); + if (err) + goto out_free; + inc_nlink(dir); + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + /* the directory was completed, just return a error */ + goto out; + } + inode->i_nlink = 2; + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. 
*/ + + d_instantiate(dentry, inode); + + unlock_super(sb); + fat_flush_inodes(sb, dir, inode); + return 0; + +out_free: + fat_free_clusters(dir, cluster); +out: + unlock_super(sb); + return err; +} + +/***** Unlink a file */ +static int msdos_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb= inode->i_sb; + struct fat_slot_info sinfo; + int err; + + lock_super(sb); + err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + unlock_super(sb); + if (!err) + err = fat_flush_inodes(sb, dir, inode); + + return err; +} + +static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, + struct dentry *old_dentry, + struct inode *new_dir, unsigned char *new_name, + struct dentry *new_dentry, int is_hid) +{ + struct buffer_head *dotdot_bh; + struct msdos_dir_entry *dotdot_de; + struct inode *old_inode, *new_inode; + struct fat_slot_info old_sinfo, sinfo; + struct timespec ts; + loff_t dotdot_i_pos, new_i_pos; + int err, old_attrs, is_dir, update_dotdot, corrupt = 0; + + old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; + old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; + + err = fat_scan(old_dir, old_name, &old_sinfo); + if (err) { + err = -EIO; + goto out; + } + + is_dir = S_ISDIR(old_inode->i_mode); + update_dotdot = (is_dir && old_dir != new_dir); + if (update_dotdot) { + if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de, + &dotdot_i_pos) < 0) { + err = -EIO; + goto out; + } + } + + old_attrs = MSDOS_I(old_inode)->i_attrs; + err = fat_scan(new_dir, new_name, &sinfo); + if (!err) { + if (!new_inode) { + /* "foo" -> ".foo" case. 
just change the ATTR_HIDDEN */ + if (sinfo.de != old_sinfo.de) { + err = -EINVAL; + goto out; + } + if (is_hid) + MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN; + else + MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN; + if (IS_DIRSYNC(old_dir)) { + err = fat_sync_inode(old_inode); + if (err) { + MSDOS_I(old_inode)->i_attrs = old_attrs; + goto out; + } + } else + mark_inode_dirty(old_inode); + + old_dir->i_version++; + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + if (IS_DIRSYNC(old_dir)) + (void)fat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + goto out; + } + } + + ts = CURRENT_TIME_SEC; + if (new_inode) { + if (err) + goto out; + if (is_dir) { + err = fat_dir_empty(new_inode); + if (err) + goto out; + } + new_i_pos = MSDOS_I(new_inode)->i_pos; + fat_detach(new_inode); + } else { + err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0, + &ts, &sinfo); + if (err) + goto out; + new_i_pos = sinfo.i_pos; + } + new_dir->i_version++; + + fat_detach(old_inode); + fat_attach(old_inode, new_i_pos); + if (is_hid) + MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN; + else + MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN; + if (IS_DIRSYNC(new_dir)) { + err = fat_sync_inode(old_inode); + if (err) + goto error_inode; + } else + mark_inode_dirty(old_inode); + + if (update_dotdot) { + int start = MSDOS_I(new_dir)->i_logstart; + dotdot_de->start = cpu_to_le16(start); + dotdot_de->starthi = cpu_to_le16(start >> 16); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + if (IS_DIRSYNC(new_dir)) { + err = sync_dirty_buffer(dotdot_bh); + if (err) + goto error_dotdot; + } + drop_nlink(old_dir); + if (!new_inode) + inc_nlink(new_dir); + } + + err = fat_remove_entries(old_dir, &old_sinfo); /* and releases bh */ + old_sinfo.bh = NULL; + if (err) + goto error_dotdot; + old_dir->i_version++; + old_dir->i_ctime = old_dir->i_mtime = ts; + if (IS_DIRSYNC(old_dir)) + (void)fat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + + if (new_inode) { + drop_nlink(new_inode); + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = ts; + } +out: + brelse(sinfo.bh); + brelse(dotdot_bh); + brelse(old_sinfo.bh); + return err; + +error_dotdot: + /* data cluster is shared, serious corruption */ + corrupt = 1; + + if (update_dotdot) { + int start = MSDOS_I(old_dir)->i_logstart; + dotdot_de->start = cpu_to_le16(start); + dotdot_de->starthi = cpu_to_le16(start >> 16); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + corrupt |= sync_dirty_buffer(dotdot_bh); + } +error_inode: + fat_detach(old_inode); + fat_attach(old_inode, old_sinfo.i_pos); + MSDOS_I(old_inode)->i_attrs = old_attrs; + if (new_inode) { + fat_attach(new_inode, new_i_pos); + if (corrupt) + corrupt |= fat_sync_inode(new_inode); + } else { + /* + * If new entry was not sharing the data cluster, it + * shouldn't be serious corruption. 
+ */ + int err2 = fat_remove_entries(new_dir, &sinfo); + if (corrupt) + corrupt |= err2; + sinfo.bh = NULL; + } + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld)", + __func__, sinfo.i_pos); + } + goto out; +} + +/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */ +static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; + int err, is_hid; + + lock_super(sb); + + err = msdos_format_name(old_dentry->d_name.name, + old_dentry->d_name.len, old_msdos_name, + &MSDOS_SB(old_dir->i_sb)->options); + if (err) + goto out; + err = msdos_format_name(new_dentry->d_name.name, + new_dentry->d_name.len, new_msdos_name, + &MSDOS_SB(new_dir->i_sb)->options); + if (err) + goto out; + + is_hid = + (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.'); + + err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, + new_dir, new_msdos_name, new_dentry, is_hid); +out: + unlock_super(sb); + if (!err) + err = fat_flush_inodes(sb, old_dir, new_dir); + return err; +} + +static const struct inode_operations msdos_dir_inode_operations = { + .create = msdos_create, + .lookup = msdos_lookup, + .unlink = msdos_unlink, + .mkdir = msdos_mkdir, + .rmdir = msdos_rmdir, + .rename = msdos_rename, + .setattr = fat_setattr, + .getattr = fat_getattr, +}; + +static void setup(struct super_block *sb) +{ + MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; + sb->s_d_op = &msdos_dentry_operations; + sb->s_flags |= MS_NOATIME; +} + +static int msdos_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, 0, setup); +} + +static struct dentry *msdos_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super); +} + +static struct file_system_type msdos_fs_type = { + .owner = THIS_MODULE, + .name = "msdos", + .mount = msdos_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_msdos_fs(void) +{ + return register_filesystem(&msdos_fs_type); +} + +static void __exit exit_msdos_fs(void) +{ + unregister_filesystem(&msdos_fs_type); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Werner Almesberger"); +MODULE_DESCRIPTION("MS-DOS filesystem support"); + +module_init(init_msdos_fs) +module_exit(exit_msdos_fs) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c new file mode 100644 index 00000000..20b4ea53 --- /dev/null +++ b/fs/fat/namei_vfat.c @@ -0,0 +1,1110 @@ +/* + * linux/fs/vfat/namei.c + * + * Written 1992,1993 by Werner Almesberger + * + * Windows95/Windows NT compatible extended MSDOS filesystem + * by Gordon Chaffee Copyright (C) 1995. Send bug reports for the + * VFAT filesystem to . Specify + * what file operation caused you trouble and if you can duplicate + * the problem, send a script that demonstrates it. + * + * Short name translation 1999, 2001 by Wolfram Pienkoss + * + * Support Multibyte characters and cleanup by + * OGAWA Hirofumi + */ + +#include +#include +#include +#include +#include +#include +#include "fat.h" + +/* + * If new entry was created in the parent, it could create the 8.3 + * alias (the shortname of logname). So, the parent may have the + * negative-dentry which matches the created 8.3 alias. 
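+ * (For example, creating "A Long Name.txt" may also create the 8.3
+ * alias "ALONGN~1.TXT"; an earlier failed lookup of that alias would
+ * have left a stale negative dentry in the parent directory.)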
+ * + * If it happened, the negative dentry isn't actually negative + * anymore. So, drop it. + */ +static int vfat_revalidate_shortname(struct dentry *dentry) +{ + int ret = 1; + spin_lock(&dentry->d_lock); + if (dentry->d_time != dentry->d_parent->d_inode->i_version) + ret = 0; + spin_unlock(&dentry->d_lock); + return ret; +} + +static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + /* This is not negative dentry. Always valid. */ + if (dentry->d_inode) + return 1; + return vfat_revalidate_shortname(dentry); +} + +static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) +{ + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + /* + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, + * and will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. So it's unsupported like + * rename("filename", "FILENAME") for now. + */ + if (dentry->d_inode) + return 1; + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!nd) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. + */ + if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + } + + return vfat_revalidate_shortname(dentry); +} + +/* returns the length of a struct qstr, ignoring trailing dots */ +static unsigned int __vfat_striptail_len(unsigned int len, const char *name) +{ + while (len && name[len - 1] == '.') + len--; + return len; +} + +static unsigned int vfat_striptail_len(const struct qstr *qstr) +{ + return __vfat_striptail_len(qstr->len, qstr->name); +} + +/* + * Compute the hash for the vfat name corresponding to the dentry. + * Note: if the name is invalid, we leave the hash code unchanged so + * that the existing dentry can be used. The vfat fs routines will + * return ENOENT or EINVAL as appropriate. + */ +static int vfat_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); + return 0; +} + +/* + * Compute the hash for the vfat name corresponding to the dentry. + * Note: if the name is invalid, we leave the hash code unchanged so + * that the existing dentry can be used. The vfat fs routines will + * return ENOENT or EINVAL as appropriate. + */ +static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; + const unsigned char *name; + unsigned int len; + unsigned long hash; + + name = qstr->name; + len = vfat_striptail_len(qstr); + + hash = init_name_hash(); + while (len--) + hash = partial_name_hash(nls_tolower(t, *name++), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Case insensitive compare of two vfat names. + */ +static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; + unsigned int alen, blen; + + /* A filename cannot end in '.' 
or we treat it like it has none */ + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); + if (alen == blen) { + if (nls_strnicmp(t, name->name, str, alen) == 0) + return 0; + } + return 1; +} + +/* + * Case sensitive compare of two vfat names. + */ +static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + unsigned int alen, blen; + + /* A filename cannot end in '.' or we treat it like it has none */ + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); + if (alen == blen) { + if (strncmp(name->name, str, alen) == 0) + return 0; + } + return 1; +} + +static const struct dentry_operations vfat_ci_dentry_ops = { + .d_revalidate = vfat_revalidate_ci, + .d_hash = vfat_hashi, + .d_compare = vfat_cmpi, +}; + +static const struct dentry_operations vfat_dentry_ops = { + .d_revalidate = vfat_revalidate, + .d_hash = vfat_hash, + .d_compare = vfat_cmp, +}; + +/* Characters that are undesirable in an MS-DOS file name */ + +static inline wchar_t vfat_bad_char(wchar_t w) +{ + return (w < 0x0020) + || (w == '*') || (w == '?') || (w == '<') || (w == '>') + || (w == '|') || (w == '"') || (w == ':') || (w == '/') + || (w == '\\'); +} + +static inline wchar_t vfat_replace_char(wchar_t w) +{ + return (w == '[') || (w == ']') || (w == ';') || (w == ',') + || (w == '+') || (w == '='); +} + +static wchar_t vfat_skip_char(wchar_t w) +{ + return (w == '.') || (w == ' '); +} + +static inline int vfat_is_used_badchars(const wchar_t *s, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (vfat_bad_char(s[i])) + return -EINVAL; + + if (s[i - 1] == ' ') /* last character cannot be space */ + return -EINVAL; + + return 0; +} + +static int vfat_find_form(struct inode *dir, unsigned char *name) +{ + struct fat_slot_info sinfo; + int err = fat_scan(dir, name, &sinfo); + if (err) + return -ENOENT; + brelse(sinfo.bh); + return 0; +} + +/* + * 1) Valid characters for the 8.3 format alias are any combination of + * letters, uppercase alphabets, digits, any of the + * following special characters: + * $ % ' ` - @ { } ~ ! # ( ) & _ ^ + * In this case Longfilename is not stored in disk. + * + * WinNT's Extension: + * File name and extension name is contain uppercase/lowercase + * only. And it is expressed by CASE_LOWER_BASE and CASE_LOWER_EXT. + * + * 2) File name is 8.3 format, but it contain the uppercase and + * lowercase char, muliti bytes char, etc. In this case numtail is not + * added, but Longfilename is stored. + * + * 3) When the one except for the above, or the following special + * character are contained: + * . [ ] ; , + = + * numtail is added, and Longfilename must be stored in disk . 
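+ * For example, "foo+bar.txt" falls under 3): '+' is replaced by '_',
+ * a numeric tail is appended, typically giving "FOO_BA~1.TXT", and the
+ * long name is stored in LFN slots as well.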
+ */ +struct shortname_info { + unsigned char lower:1, + upper:1, + valid:1; +}; +#define INIT_SHORTNAME_INFO(x) do { \ + (x)->lower = 1; \ + (x)->upper = 1; \ + (x)->valid = 1; \ +} while (0) + +static inline int to_shortname_char(struct nls_table *nls, + unsigned char *buf, int buf_size, + wchar_t *src, struct shortname_info *info) +{ + int len; + + if (vfat_skip_char(*src)) { + info->valid = 0; + return 0; + } + if (vfat_replace_char(*src)) { + info->valid = 0; + buf[0] = '_'; + return 1; + } + + len = nls->uni2char(*src, buf, buf_size); + if (len <= 0) { + info->valid = 0; + buf[0] = '_'; + len = 1; + } else if (len == 1) { + unsigned char prev = buf[0]; + + if (buf[0] >= 0x7F) { + info->lower = 0; + info->upper = 0; + } + + buf[0] = nls_toupper(nls, buf[0]); + if (isalpha(buf[0])) { + if (buf[0] == prev) + info->lower = 0; + else + info->upper = 0; + } + } else { + info->lower = 0; + info->upper = 0; + } + + return len; +} + +/* + * Given a valid longname, create a unique shortname. Make sure the + * shortname does not exist + * Returns negative number on error, 0 for a normal + * return, and 1 for valid shortname + */ +static int vfat_create_shortname(struct inode *dir, struct nls_table *nls, + wchar_t *uname, int ulen, + unsigned char *name_res, unsigned char *lcase) +{ + struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; + wchar_t *ip, *ext_start, *end, *name_start; + unsigned char base[9], ext[4], buf[5], *p; + unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; + int chl, chi; + int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; + int is_shortname; + struct shortname_info base_info, ext_info; + + is_shortname = 1; + INIT_SHORTNAME_INFO(&base_info); + INIT_SHORTNAME_INFO(&ext_info); + + /* Now, we need to create a shortname from the long name */ + ext_start = end = &uname[ulen]; + while (--ext_start >= uname) { + if (*ext_start == 0x002E) { /* is `.' */ + if (ext_start == end - 1) { + sz = ulen; + ext_start = NULL; + } + break; + } + } + + if (ext_start == uname - 1) { + sz = ulen; + ext_start = NULL; + } else if (ext_start) { + /* + * Names which start with a dot could be just + * an extension eg. "...test". In this case Win95 + * uses the extension as the name and sets no extension. 
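+		 * A name such as "...test" therefore ends up as base "TEST"
+		 * with no extension, plus a numeric tail (e.g. "TEST~1"),
+		 * since the skipped dots make it an invalid plain short name.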
+ */ + name_start = &uname[0]; + while (name_start < ext_start) { + if (!vfat_skip_char(*name_start)) + break; + name_start++; + } + if (name_start != ext_start) { + sz = ext_start - uname; + ext_start++; + } else { + sz = ulen; + ext_start = NULL; + } + } + + numtail_baselen = 6; + numtail2_baselen = 2; + for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) { + chl = to_shortname_char(nls, charbuf, sizeof(charbuf), + ip, &base_info); + if (chl == 0) + continue; + + if (baselen < 2 && (baselen + chl) > 2) + numtail2_baselen = baselen; + if (baselen < 6 && (baselen + chl) > 6) + numtail_baselen = baselen; + for (chi = 0; chi < chl; chi++) { + *p++ = charbuf[chi]; + baselen++; + if (baselen >= 8) + break; + } + if (baselen >= 8) { + if ((chi < chl - 1) || (ip + 1) - uname < sz) + is_shortname = 0; + break; + } + } + if (baselen == 0) { + return -EINVAL; + } + + extlen = 0; + if (ext_start) { + for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) { + chl = to_shortname_char(nls, charbuf, sizeof(charbuf), + ip, &ext_info); + if (chl == 0) + continue; + + if ((extlen + chl) > 3) { + is_shortname = 0; + break; + } + for (chi = 0; chi < chl; chi++) { + *p++ = charbuf[chi]; + extlen++; + } + if (extlen >= 3) { + if (ip + 1 != end) + is_shortname = 0; + break; + } + } + } + ext[extlen] = '\0'; + base[baselen] = '\0'; + + /* Yes, it can happen. ".\xe5" would do it. */ + if (base[0] == DELETED_FLAG) + base[0] = 0x05; + + /* OK, at this point we know that base is not longer than 8 symbols, + * ext is not longer than 3, base is nonempty, both don't contain + * any bad symbols (lowercase transformed to uppercase). + */ + + memset(name_res, ' ', MSDOS_NAME); + memcpy(name_res, base, baselen); + memcpy(name_res + 8, ext, extlen); + *lcase = 0; + if (is_shortname && base_info.valid && ext_info.valid) { + if (vfat_find_form(dir, name_res) == 0) + return -EEXIST; + + if (opts->shortname & VFAT_SFN_CREATE_WIN95) { + return (base_info.upper && ext_info.upper); + } else if (opts->shortname & VFAT_SFN_CREATE_WINNT) { + if ((base_info.upper || base_info.lower) && + (ext_info.upper || ext_info.lower)) { + if (!base_info.upper && base_info.lower) + *lcase |= CASE_LOWER_BASE; + if (!ext_info.upper && ext_info.lower) + *lcase |= CASE_LOWER_EXT; + return 1; + } + return 0; + } else { + BUG(); + } + } + + if (opts->numtail == 0) + if (vfat_find_form(dir, name_res) < 0) + return 0; + + /* + * Try to find a unique extension. This used to + * iterate through all possibilities sequentially, + * but that gave extremely bad performance. Windows + * only tries a few cases before using random + * values for part of the base. 
+ */ + + if (baselen > 6) { + baselen = numtail_baselen; + name_res[7] = ' '; + } + name_res[baselen] = '~'; + for (i = 1; i < 10; i++) { + name_res[baselen + 1] = i + '0'; + if (vfat_find_form(dir, name_res) < 0) + return 0; + } + + i = jiffies; + sz = (jiffies >> 16) & 0x7; + if (baselen > 2) { + baselen = numtail2_baselen; + name_res[7] = ' '; + } + name_res[baselen + 4] = '~'; + name_res[baselen + 5] = '1' + sz; + while (1) { + snprintf(buf, sizeof(buf), "%04X", i & 0xffff); + memcpy(&name_res[baselen], buf, 4); + if (vfat_find_form(dir, name_res) < 0) + break; + i -= 11; + } + return 0; +} + +/* Translate a string, including coded sequences into Unicode */ +static int +xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, + int *longlen, int *outlen, int escape, int utf8, + struct nls_table *nls) +{ + const unsigned char *ip; + unsigned char nc; + unsigned char *op; + unsigned int ec; + int i, k, fill; + int charlen; + + if (utf8) { + *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); + if (*outlen < 0) + return *outlen; + else if (*outlen > FAT_LFN_LEN) + return -ENAMETOOLONG; + + op = &outname[*outlen * sizeof(wchar_t)]; + } else { + if (nls) { + for (i = 0, ip = name, op = outname, *outlen = 0; + i < len && *outlen <= FAT_LFN_LEN; + *outlen += 1) + { + if (escape && (*ip == ':')) { + if (i > len - 5) + return -EINVAL; + ec = 0; + for (k = 1; k < 5; k++) { + nc = ip[k]; + ec <<= 4; + if (nc >= '0' && nc <= '9') { + ec |= nc - '0'; + continue; + } + if (nc >= 'a' && nc <= 'f') { + ec |= nc - ('a' - 10); + continue; + } + if (nc >= 'A' && nc <= 'F') { + ec |= nc - ('A' - 10); + continue; + } + return -EINVAL; + } + *op++ = ec & 0xFF; + *op++ = ec >> 8; + ip += 5; + i += 5; + } else { + if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0) + return -EINVAL; + ip += charlen; + i += charlen; + op += 2; + } + } + if (i < len) + return -ENAMETOOLONG; + } else { + for (i = 0, ip = name, op = outname, *outlen = 0; + i < len && *outlen <= FAT_LFN_LEN; + i++, *outlen += 1) + { + *op++ = *ip++; + *op++ = 0; + } + if (i < len) + return -ENAMETOOLONG; + } + } + + *longlen = *outlen; + if (*outlen % 13) { + *op++ = 0; + *op++ = 0; + *outlen += 1; + if (*outlen % 13) { + fill = 13 - (*outlen % 13); + for (i = 0; i < fill; i++) { + *op++ = 0xff; + *op++ = 0xff; + } + *outlen += fill; + } + } + + return 0; +} + +static int vfat_build_slots(struct inode *dir, const unsigned char *name, + int len, int is_dir, int cluster, + struct timespec *ts, + struct msdos_dir_slot *slots, int *nr_slots) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); + struct fat_mount_options *opts = &sbi->options; + struct msdos_dir_slot *ps; + struct msdos_dir_entry *de; + unsigned char cksum, lcase; + unsigned char msdos_name[MSDOS_NAME]; + wchar_t *uname; + __le16 time, date; + u8 time_cs; + int err, ulen, usize, i; + loff_t offset; + + *nr_slots = 0; + + uname = __getname(); + if (!uname) + return -ENOMEM; + + err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize, + opts->unicode_xlate, opts->utf8, sbi->nls_io); + if (err) + goto out_free; + + err = vfat_is_used_badchars(uname, ulen); + if (err) + goto out_free; + + err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen, + msdos_name, &lcase); + if (err < 0) + goto out_free; + else if (err == 1) { + de = (struct msdos_dir_entry *)slots; + err = 0; + goto shortname; + } + + /* build the entry of long file name */ + cksum = fat_checksum(msdos_name); + + *nr_slots = usize / 13; + for (ps = slots, i = *nr_slots; i > 0; 
i--, ps++) { + ps->id = i; + ps->attr = ATTR_EXT; + ps->reserved = 0; + ps->alias_checksum = cksum; + ps->start = 0; + offset = (i - 1) * 13; + fatwchar_to16(ps->name0_4, uname + offset, 5); + fatwchar_to16(ps->name5_10, uname + offset + 5, 6); + fatwchar_to16(ps->name11_12, uname + offset + 11, 2); + } + slots[0].id |= 0x40; + de = (struct msdos_dir_entry *)ps; + +shortname: + /* build the entry of 8.3 alias name */ + (*nr_slots)++; + memcpy(de->name, msdos_name, MSDOS_NAME); + de->attr = is_dir ? ATTR_DIR : ATTR_ARCH; + de->lcase = lcase; + fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); + de->time = de->ctime = time; + de->date = de->cdate = de->adate = date; + de->ctime_cs = time_cs; + de->start = cpu_to_le16(cluster); + de->starthi = cpu_to_le16(cluster >> 16); + de->size = 0; +out_free: + __putname(uname); + return err; +} + +static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir, + int cluster, struct timespec *ts, + struct fat_slot_info *sinfo) +{ + struct msdos_dir_slot *slots; + unsigned int len; + int err, nr_slots; + + len = vfat_striptail_len(qname); + if (len == 0) + return -ENOENT; + + slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS); + if (slots == NULL) + return -ENOMEM; + + err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts, + slots, &nr_slots); + if (err) + goto cleanup; + + err = fat_add_entries(dir, slots, nr_slots, sinfo); + if (err) + goto cleanup; + + /* update timestamp */ + dir->i_ctime = dir->i_mtime = dir->i_atime = *ts; + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); +cleanup: + kfree(slots); + return err; +} + +static int vfat_find(struct inode *dir, struct qstr *qname, + struct fat_slot_info *sinfo) +{ + unsigned int len = vfat_striptail_len(qname); + if (len == 0) + return -ENOENT; + return fat_search_long(dir, qname->name, len, sinfo); +} + +/* + * (nfsd's) anonymous disconnected dentry? + * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job). + */ +static int vfat_d_anon_disconn(struct dentry *dentry) +{ + return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED); +} + +static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + struct inode *inode; + struct dentry *alias; + int err; + + lock_super(sb); + + err = vfat_find(dir, &dentry->d_name, &sinfo); + if (err) { + if (err == -ENOENT) { + inode = NULL; + goto out; + } + goto error; + } + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + + alias = d_find_alias(inode); + if (alias && !vfat_d_anon_disconn(alias)) { + /* + * This inode has non anonymous-DCACHE_DISCONNECTED + * dentry. This means, the user did ->lookup() by an + * another name (longname vs 8.3 alias of it) in past. + * + * Switch to new one for reason of locality if possible. 
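+ * d_move() below transplants the existing alias onto the name used
+ * for this lookup, so later lookups under that name hit the dcache.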
+ */ + BUG_ON(d_unhashed(alias)); + if (!S_ISDIR(inode->i_mode)) + d_move(alias, dentry); + iput(inode); + unlock_super(sb); + return alias; + } else + dput(alias); + +out: + unlock_super(sb); + dentry->d_time = dentry->d_parent->d_inode->i_version; + dentry = d_splice_alias(inode, dentry); + if (dentry) + dentry->d_time = dentry->d_parent->d_inode->i_version; + return dentry; + +error: + unlock_super(sb); + return ERR_PTR(err); +} + +static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct fat_slot_info sinfo; + struct timespec ts; + int err; + + lock_super(sb); + + ts = CURRENT_TIME_SEC; + err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); + if (err) + goto out; + dir->i_version++; + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + inode->i_version++; + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + dentry->d_time = dentry->d_parent->d_inode->i_version; + d_instantiate(dentry, inode); +out: + unlock_super(sb); + return err; +} + +static int vfat_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + int err; + + lock_super(sb); + + err = fat_dir_empty(inode); + if (err) + goto out; + err = vfat_find(dir, &dentry->d_name, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + drop_nlink(dir); + + clear_nlink(inode); + inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + unlock_super(sb); + + return err; +} + +static int vfat_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; + struct fat_slot_info sinfo; + int err; + + lock_super(sb); + + err = vfat_find(dir, &dentry->d_name, &sinfo); + if (err) + goto out; + + err = fat_remove_entries(dir, &sinfo); /* and releases bh */ + if (err) + goto out; + clear_nlink(inode); + inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; + fat_detach(inode); +out: + unlock_super(sb); + + return err; +} + +static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct fat_slot_info sinfo; + struct timespec ts; + int err, cluster; + + lock_super(sb); + + ts = CURRENT_TIME_SEC; + cluster = fat_alloc_new_dir(dir, &ts); + if (cluster < 0) { + err = cluster; + goto out; + } + err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo); + if (err) + goto out_free; + dir->i_version++; + inc_nlink(dir); + + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + /* the directory was completed, just return a error */ + goto out; + } + inode->i_version++; + inode->i_nlink = 2; + inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + /* timestamp is already written, so mark_inode_dirty() is unneeded. 
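+ * The same ts was stored in the on-disk entry by vfat_add_entry(),
+ * so the in-core timestamps already match the media.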
*/ + + dentry->d_time = dentry->d_parent->d_inode->i_version; + d_instantiate(dentry, inode); + + unlock_super(sb); + return 0; + +out_free: + fat_free_clusters(dir, cluster); +out: + unlock_super(sb); + return err; +} + +static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct buffer_head *dotdot_bh; + struct msdos_dir_entry *dotdot_de; + struct inode *old_inode, *new_inode; + struct fat_slot_info old_sinfo, sinfo; + struct timespec ts; + loff_t dotdot_i_pos, new_i_pos; + int err, is_dir, update_dotdot, corrupt = 0; + struct super_block *sb = old_dir->i_sb; + + old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; + old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; + lock_super(sb); + err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); + if (err) + goto out; + + is_dir = S_ISDIR(old_inode->i_mode); + update_dotdot = (is_dir && old_dir != new_dir); + if (update_dotdot) { + if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de, + &dotdot_i_pos) < 0) { + err = -EIO; + goto out; + } + } + + ts = CURRENT_TIME_SEC; + if (new_inode) { + if (is_dir) { + err = fat_dir_empty(new_inode); + if (err) + goto out; + } + new_i_pos = MSDOS_I(new_inode)->i_pos; + fat_detach(new_inode); + } else { + err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0, + &ts, &sinfo); + if (err) + goto out; + new_i_pos = sinfo.i_pos; + } + new_dir->i_version++; + + fat_detach(old_inode); + fat_attach(old_inode, new_i_pos); + if (IS_DIRSYNC(new_dir)) { + err = fat_sync_inode(old_inode); + if (err) + goto error_inode; + } else + mark_inode_dirty(old_inode); + + if (update_dotdot) { + int start = MSDOS_I(new_dir)->i_logstart; + dotdot_de->start = cpu_to_le16(start); + dotdot_de->starthi = cpu_to_le16(start >> 16); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + if (IS_DIRSYNC(new_dir)) { + err = sync_dirty_buffer(dotdot_bh); + if (err) + goto error_dotdot; + } + drop_nlink(old_dir); + if (!new_inode) + inc_nlink(new_dir); + } + + err = fat_remove_entries(old_dir, &old_sinfo); /* and releases bh */ + old_sinfo.bh = NULL; + if (err) + goto error_dotdot; + old_dir->i_version++; + old_dir->i_ctime = old_dir->i_mtime = ts; + if (IS_DIRSYNC(old_dir)) + (void)fat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + + if (new_inode) { + drop_nlink(new_inode); + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = ts; + } +out: + brelse(sinfo.bh); + brelse(dotdot_bh); + brelse(old_sinfo.bh); + unlock_super(sb); + + return err; + +error_dotdot: + /* data cluster is shared, serious corruption */ + corrupt = 1; + + if (update_dotdot) { + int start = MSDOS_I(old_dir)->i_logstart; + dotdot_de->start = cpu_to_le16(start); + dotdot_de->starthi = cpu_to_le16(start >> 16); + mark_buffer_dirty_inode(dotdot_bh, old_inode); + corrupt |= sync_dirty_buffer(dotdot_bh); + } +error_inode: + fat_detach(old_inode); + fat_attach(old_inode, old_sinfo.i_pos); + if (new_inode) { + fat_attach(new_inode, new_i_pos); + if (corrupt) + corrupt |= fat_sync_inode(new_inode); + } else { + /* + * If new entry was not sharing the data cluster, it + * shouldn't be serious corruption. 
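+ * Just remove the entry we created in new_dir and report the
+ * original error to the caller.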
+ */ + int err2 = fat_remove_entries(new_dir, &sinfo); + if (corrupt) + corrupt |= err2; + sinfo.bh = NULL; + } + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld)", + __func__, sinfo.i_pos); + } + goto out; +} + +static const struct inode_operations vfat_dir_inode_operations = { + .create = vfat_create, + .lookup = vfat_lookup, + .unlink = vfat_unlink, + .mkdir = vfat_mkdir, + .rmdir = vfat_rmdir, + .rename = vfat_rename, + .setattr = fat_setattr, + .getattr = fat_getattr, +}; + +static void setup(struct super_block *sb) +{ + MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations; + if (MSDOS_SB(sb)->options.name_check != 's') + sb->s_d_op = &vfat_ci_dentry_ops; + else + sb->s_d_op = &vfat_dentry_ops; +} + +static int vfat_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, 1, setup); +} + +static struct dentry *vfat_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super); +} + +static struct file_system_type vfat_fs_type = { + .owner = THIS_MODULE, + .name = "vfat", + .mount = vfat_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_vfat_fs(void) +{ + return register_filesystem(&vfat_fs_type); +} + +static void __exit exit_vfat_fs(void) +{ + unregister_filesystem(&vfat_fs_type); +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("VFAT filesystem support"); +MODULE_AUTHOR("Gordon Chaffee"); + +module_init(init_vfat_fs) +module_exit(exit_vfat_fs) diff --git a/fs/fcntl.c b/fs/fcntl.c new file mode 100644 index 00000000..22764c7c --- /dev/null +++ b/fs/fcntl.c @@ -0,0 +1,853 @@ +/* + * linux/fs/fcntl.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +void set_close_on_exec(unsigned int fd, int flag) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (flag) + FD_SET(fd, fdt->close_on_exec); + else + FD_CLR(fd, fdt->close_on_exec); + spin_unlock(&files->file_lock); +} + +static int get_close_on_exec(unsigned int fd) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + int res; + rcu_read_lock(); + fdt = files_fdtable(files); + res = FD_ISSET(fd, fdt->close_on_exec); + rcu_read_unlock(); + return res; +} + +SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +{ + int err = -EBADF; + struct file * file, *tofree; + struct files_struct * files = current->files; + struct fdtable *fdt; + + if ((flags & ~O_CLOEXEC) != 0) + return -EINVAL; + + if (unlikely(oldfd == newfd)) + return -EINVAL; + + spin_lock(&files->file_lock); + err = expand_files(files, newfd); + file = fcheck(oldfd); + if (unlikely(!file)) + goto Ebadf; + if (unlikely(err < 0)) { + if (err == -EMFILE) + goto Ebadf; + goto out_unlock; + } + /* + * We need to detect attempts to do dup2() over allocated but still + * not finished descriptor. NB: OpenBSD avoids that at the price of + * extra work in their equivalent of fget() - they insert struct + * file immediately after grabbing descriptor, mark it larval if + * more work (e.g. actual opening) is needed and make sure that + * fget() treats larval files as absent. 
Potentially interesting, + * but while extra work in fget() is trivial, locking implications + * and amount of surgery on open()-related paths in VFS are not. + * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" + * deadlocks in rather amusing ways, AFAICS. All of that is out of + * scope of POSIX or SUS, since neither considers shared descriptor + * tables and this condition does not arise without those. + */ + err = -EBUSY; + fdt = files_fdtable(files); + tofree = fdt->fd[newfd]; + if (!tofree && FD_ISSET(newfd, fdt->open_fds)) + goto out_unlock; + get_file(file); + rcu_assign_pointer(fdt->fd[newfd], file); + FD_SET(newfd, fdt->open_fds); + if (flags & O_CLOEXEC) + FD_SET(newfd, fdt->close_on_exec); + else + FD_CLR(newfd, fdt->close_on_exec); + spin_unlock(&files->file_lock); + + if (tofree) + filp_close(tofree, files); + + return newfd; + +Ebadf: + err = -EBADF; +out_unlock: + spin_unlock(&files->file_lock); + return err; +} + +SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) +{ + if (unlikely(newfd == oldfd)) { /* corner case */ + struct files_struct *files = current->files; + int retval = oldfd; + + rcu_read_lock(); + if (!fcheck_files(files, oldfd)) + retval = -EBADF; + rcu_read_unlock(); + return retval; + } + return sys_dup3(oldfd, newfd, 0); +} + +SYSCALL_DEFINE1(dup, unsigned int, fildes) +{ + int ret = -EBADF; + struct file *file = fget_raw(fildes); + + if (file) { + ret = get_unused_fd(); + if (ret >= 0) + fd_install(ret, file); + else + fput(file); + } + return ret; +} + +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) + +static int setfl(int fd, struct file * filp, unsigned long arg) +{ + struct inode * inode = filp->f_path.dentry->d_inode; + int error = 0; + + /* + * O_APPEND cannot be cleared if the file is marked as append-only + * and the file is open for write. + */ + if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) + return -EPERM; + + /* O_NOATIME can only be set by the owner or superuser */ + if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) + if (!inode_owner_or_capable(inode)) + return -EPERM; + + /* required for strict SunOS emulation */ + if (O_NONBLOCK != O_NDELAY) + if (arg & O_NDELAY) + arg |= O_NONBLOCK; + + if (arg & O_DIRECT) { + if (!filp->f_mapping || !filp->f_mapping->a_ops || + !filp->f_mapping->a_ops->direct_IO) + return -EINVAL; + } + + if (filp->f_op && filp->f_op->check_flags) + error = filp->f_op->check_flags(arg); + if (error) + return error; + + /* + * ->fasync() is responsible for setting the FASYNC bit. 
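+ * Drivers normally do that via fasync_helper(), which adds or
+ * removes the fasync entry and flips FASYNC in filp->f_flags.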
+ */ + if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op && + filp->f_op->fasync) { + error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); + if (error < 0) + goto out; + if (error > 0) + error = 0; + } + spin_lock(&filp->f_lock); + filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); + spin_unlock(&filp->f_lock); + + out: + return error; +} + +static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, + int force) +{ + write_lock_irq(&filp->f_owner.lock); + if (force || !filp->f_owner.pid) { + put_pid(filp->f_owner.pid); + filp->f_owner.pid = get_pid(pid); + filp->f_owner.pid_type = type; + + if (pid) { + const struct cred *cred = current_cred(); + filp->f_owner.uid = cred->uid; + filp->f_owner.euid = cred->euid; + } + } + write_unlock_irq(&filp->f_owner.lock); +} + +int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, + int force) +{ + int err; + + err = security_file_set_fowner(filp); + if (err) + return err; + + f_modown(filp, pid, type, force); + return 0; +} +EXPORT_SYMBOL(__f_setown); + +int f_setown(struct file *filp, unsigned long arg, int force) +{ + enum pid_type type; + struct pid *pid; + int who = arg; + int result; + type = PIDTYPE_PID; + if (who < 0) { + type = PIDTYPE_PGID; + who = -who; + } + rcu_read_lock(); + pid = find_vpid(who); + result = __f_setown(filp, pid, type, force); + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL(f_setown); + +void f_delown(struct file *filp) +{ + f_modown(filp, NULL, PIDTYPE_PID, 1); +} + +pid_t f_getown(struct file *filp) +{ + pid_t pid; + read_lock(&filp->f_owner.lock); + pid = pid_vnr(filp->f_owner.pid); + if (filp->f_owner.pid_type == PIDTYPE_PGID) + pid = -pid; + read_unlock(&filp->f_owner.lock); + return pid; +} + +static int f_setown_ex(struct file *filp, unsigned long arg) +{ + struct f_owner_ex * __user owner_p = (void * __user)arg; + struct f_owner_ex owner; + struct pid *pid; + int type; + int ret; + + ret = copy_from_user(&owner, owner_p, sizeof(owner)); + if (ret) + return -EFAULT; + + switch (owner.type) { + case F_OWNER_TID: + type = PIDTYPE_MAX; + break; + + case F_OWNER_PID: + type = PIDTYPE_PID; + break; + + case F_OWNER_PGRP: + type = PIDTYPE_PGID; + break; + + default: + return -EINVAL; + } + + rcu_read_lock(); + pid = find_vpid(owner.pid); + if (owner.pid && !pid) + ret = -ESRCH; + else + ret = __f_setown(filp, pid, type, 1); + rcu_read_unlock(); + + return ret; +} + +static int f_getown_ex(struct file *filp, unsigned long arg) +{ + struct f_owner_ex * __user owner_p = (void * __user)arg; + struct f_owner_ex owner; + int ret = 0; + + read_lock(&filp->f_owner.lock); + owner.pid = pid_vnr(filp->f_owner.pid); + switch (filp->f_owner.pid_type) { + case PIDTYPE_MAX: + owner.type = F_OWNER_TID; + break; + + case PIDTYPE_PID: + owner.type = F_OWNER_PID; + break; + + case PIDTYPE_PGID: + owner.type = F_OWNER_PGRP; + break; + + default: + WARN_ON(1); + ret = -EINVAL; + break; + } + read_unlock(&filp->f_owner.lock); + + if (!ret) { + ret = copy_to_user(owner_p, &owner, sizeof(owner)); + if (ret) + ret = -EFAULT; + } + return ret; +} + +static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, + struct file *filp) +{ + long err = -EINVAL; + + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + if (arg >= rlimit(RLIMIT_NOFILE)) + break; + err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); + if (err >= 0) { + get_file(filp); + fd_install(err, filp); + } + break; + case F_GETFD: + err = get_close_on_exec(fd) ? 
FD_CLOEXEC : 0; + break; + case F_SETFD: + err = 0; + set_close_on_exec(fd, arg & FD_CLOEXEC); + break; + case F_GETFL: + err = filp->f_flags; + break; + case F_SETFL: + err = setfl(fd, filp, arg); + break; + case F_GETLK: + err = fcntl_getlk(filp, (struct flock __user *) arg); + break; + case F_SETLK: + case F_SETLKW: + err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); + break; + case F_GETOWN: + /* + * XXX If f_owner is a process group, the + * negative return value will get converted + * into an error. Oops. If we keep the + * current syscall conventions, the only way + * to fix this will be in libc. + */ + err = f_getown(filp); + force_successful_syscall_return(); + break; + case F_SETOWN: + err = f_setown(filp, arg, 1); + break; + case F_GETOWN_EX: + err = f_getown_ex(filp, arg); + break; + case F_SETOWN_EX: + err = f_setown_ex(filp, arg); + break; + case F_GETSIG: + err = filp->f_owner.signum; + break; + case F_SETSIG: + /* arg == 0 restores default behaviour. */ + if (!valid_signal(arg)) { + break; + } + err = 0; + filp->f_owner.signum = arg; + break; + case F_GETLEASE: + err = fcntl_getlease(filp); + break; + case F_SETLEASE: + err = fcntl_setlease(fd, filp, arg); + break; + case F_NOTIFY: + err = fcntl_dirnotify(fd, filp, arg); + break; + case F_SETPIPE_SZ: + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; + default: + break; + } + return err; +} + +static int check_fcntl_cmd(unsigned cmd) +{ + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + case F_GETFD: + case F_SETFD: + case F_GETFL: + return 1; + } + return 0; +} + +SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +{ + struct file *filp; + long err = -EBADF; + + filp = fget_raw(fd); + if (!filp) + goto out; + + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + + err = security_file_fcntl(filp, cmd, arg); + if (err) { + fput(filp); + return err; + } + + err = do_fcntl(fd, cmd, arg, filp); + + fput(filp); +out: + return err; +} + +#if BITS_PER_LONG == 32 +SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + unsigned long, arg) +{ + struct file * filp; + long err; + + err = -EBADF; + filp = fget_raw(fd); + if (!filp) + goto out; + + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + + err = security_file_fcntl(filp, cmd, arg); + if (err) { + fput(filp); + return err; + } + err = -EBADF; + + switch (cmd) { + case F_GETLK64: + err = fcntl_getlk64(filp, (struct flock64 __user *) arg); + break; + case F_SETLK64: + case F_SETLKW64: + err = fcntl_setlk64(fd, filp, cmd, + (struct flock64 __user *) arg); + break; + default: + err = do_fcntl(fd, cmd, arg, filp); + break; + } + fput(filp); +out: + return err; +} +#endif + +/* Table to convert sigio signal codes into poll band bitmaps */ + +static const long band_table[NSIGPOLL] = { + POLLIN | POLLRDNORM, /* POLL_IN */ + POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */ + POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */ + POLLERR, /* POLL_ERR */ + POLLPRI | POLLRDBAND, /* POLL_PRI */ + POLLHUP | POLLERR /* POLL_HUP */ +}; + +static inline int sigio_perm(struct task_struct *p, + struct fown_struct *fown, int sig) +{ + const struct cred *cred; + int ret; + + rcu_read_lock(); + cred = __task_cred(p); + ret = ((fown->euid == 0 || + fown->euid == cred->suid || fown->euid == cred->uid || + fown->uid == cred->suid || fown->uid == cred->uid) && + !security_file_send_sigiotask(p, fown, sig)); + 
rcu_read_unlock(); + return ret; +} + +static void send_sigio_to_task(struct task_struct *p, + struct fown_struct *fown, + int fd, int reason, int group) +{ + /* + * F_SETSIG can change ->signum lockless in parallel, make + * sure we read it once and use the same value throughout. + */ + int signum = ACCESS_ONCE(fown->signum); + + if (!sigio_perm(p, fown, signum)) + return; + + switch (signum) { + siginfo_t si; + default: + /* Queue a rt signal with the appropriate fd as its + value. We use SI_SIGIO as the source, not + SI_KERNEL, since kernel signals always get + delivered even if we can't queue. Failure to + queue in this case _should_ be reported; we fall + back to SIGIO in that case. --sct */ + si.si_signo = signum; + si.si_errno = 0; + si.si_code = reason; + /* Make sure we are called with one of the POLL_* + reasons, otherwise we could leak kernel stack into + userspace. */ + BUG_ON((reason & __SI_MASK) != __SI_POLL); + if (reason - POLL_IN >= NSIGPOLL) + si.si_band = ~0L; + else + si.si_band = band_table[reason - POLL_IN]; + si.si_fd = fd; + if (!do_send_sig_info(signum, &si, p, group)) + break; + /* fall-through: fall back on the old plain SIGIO signal */ + case 0: + do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group); + } +} + +void send_sigio(struct fown_struct *fown, int fd, int band) +{ + struct task_struct *p; + enum pid_type type; + struct pid *pid; + int group = 1; + + read_lock(&fown->lock); + + type = fown->pid_type; + if (type == PIDTYPE_MAX) { + group = 0; + type = PIDTYPE_PID; + } + + pid = fown->pid; + if (!pid) + goto out_unlock_fown; + + read_lock(&tasklist_lock); + do_each_pid_task(pid, type, p) { + send_sigio_to_task(p, fown, fd, band, group); + } while_each_pid_task(pid, type, p); + read_unlock(&tasklist_lock); + out_unlock_fown: + read_unlock(&fown->lock); +} + +static void send_sigurg_to_task(struct task_struct *p, + struct fown_struct *fown, int group) +{ + if (sigio_perm(p, fown, SIGURG)) + do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group); +} + +int send_sigurg(struct fown_struct *fown) +{ + struct task_struct *p; + enum pid_type type; + struct pid *pid; + int group = 1; + int ret = 0; + + read_lock(&fown->lock); + + type = fown->pid_type; + if (type == PIDTYPE_MAX) { + group = 0; + type = PIDTYPE_PID; + } + + pid = fown->pid; + if (!pid) + goto out_unlock_fown; + + ret = 1; + + read_lock(&tasklist_lock); + do_each_pid_task(pid, type, p) { + send_sigurg_to_task(p, fown, group); + } while_each_pid_task(pid, type, p); + read_unlock(&tasklist_lock); + out_unlock_fown: + read_unlock(&fown->lock); + return ret; +} + +static DEFINE_SPINLOCK(fasync_lock); +static struct kmem_cache *fasync_cache __read_mostly; + +static void fasync_free_rcu(struct rcu_head *head) +{ + kmem_cache_free(fasync_cache, + container_of(head, struct fasync_struct, fa_rcu)); +} + +/* + * Remove a fasync entry. If successfully removed, return + * positive and clear the FASYNC flag. If no entry exists, + * do nothing and return 0. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". 
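+ * __fput() checks FASYNC to decide whether ->fasync() must be
+ * called to tear the entry down when the file goes away.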
+ * + */ +int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *fa, **fp; + int result = 0; + + spin_lock(&filp->f_lock); + spin_lock(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + + spin_lock_irq(&fa->fa_lock); + fa->fa_file = NULL; + spin_unlock_irq(&fa->fa_lock); + + *fp = fa->fa_next; + call_rcu(&fa->fa_rcu, fasync_free_rcu); + filp->f_flags &= ~FASYNC; + result = 1; + break; + } + spin_unlock(&fasync_lock); + spin_unlock(&filp->f_lock); + return result; +} + +struct fasync_struct *fasync_alloc(void) +{ + return kmem_cache_alloc(fasync_cache, GFP_KERNEL); +} + +/* + * NOTE! This can be used only for unused fasync entries: + * entries that actually got inserted on the fasync list + * need to be released by rcu - see fasync_remove_entry. + */ +void fasync_free(struct fasync_struct *new) +{ + kmem_cache_free(fasync_cache, new); +} + +/* + * Insert a new entry into the fasync list. Return the pointer to the + * old one if we didn't use the new one. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + */ +struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) +{ + struct fasync_struct *fa, **fp; + + spin_lock(&filp->f_lock); + spin_lock(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + + spin_lock_irq(&fa->fa_lock); + fa->fa_fd = fd; + spin_unlock_irq(&fa->fa_lock); + goto out; + } + + spin_lock_init(&new->fa_lock); + new->magic = FASYNC_MAGIC; + new->fa_file = filp; + new->fa_fd = fd; + new->fa_next = *fapp; + rcu_assign_pointer(*fapp, new); + filp->f_flags |= FASYNC; + +out: + spin_unlock(&fasync_lock); + spin_unlock(&filp->f_lock); + return fa; +} + +/* + * Add a fasync entry. Return negative on error, positive if + * added, and zero if did nothing but change an existing one. + */ +static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *new; + + new = fasync_alloc(); + if (!new) + return -ENOMEM; + + /* + * fasync_insert_entry() returns the old (update) entry if + * it existed. + * + * So free the (unused) new entry and return 0 to let the + * caller know that we didn't add any new fasync entries. + */ + if (fasync_insert_entry(fd, filp, fapp, new)) { + fasync_free(new); + return 0; + } + + return 1; +} + +/* + * fasync_helper() is used by almost all character device drivers + * to set up the fasync queue, and for regular files by the file + * lease code. It returns negative on error, 0 if it did no changes + * and positive if it added/deleted the entry. + */ +int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +{ + if (!on) + return fasync_remove_entry(filp, fapp); + return fasync_add_entry(fd, filp, fapp); +} + +EXPORT_SYMBOL(fasync_helper); + +/* + * rcu_read_lock() is held + */ +static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) +{ + while (fa) { + struct fown_struct *fown; + unsigned long flags; + + if (fa->magic != FASYNC_MAGIC) { + printk(KERN_ERR "kill_fasync: bad magic number in " + "fasync_struct!\n"); + return; + } + spin_lock_irqsave(&fa->fa_lock, flags); + if (fa->fa_file) { + fown = &fa->fa_file->f_owner; + /* Don't send SIGURG to processes which have not set a + queued signum: SIGURG has its own default signalling + mechanism. 
*/ + if (!(sig == SIGURG && fown->signum == 0)) + send_sigio(fown, fa->fa_fd, band); + } + spin_unlock_irqrestore(&fa->fa_lock, flags); + fa = rcu_dereference(fa->fa_next); + } +} + +void kill_fasync(struct fasync_struct **fp, int sig, int band) +{ + /* First a quick test without locking: usually + * the list is empty. + */ + if (*fp) { + rcu_read_lock(); + kill_fasync_rcu(rcu_dereference(*fp), sig, band); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(kill_fasync); + +static int __init fcntl_init(void) +{ + /* + * Please add new bits here to ensure allocation uniqueness. + * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY + * is defined as O_NONBLOCK on some platforms and not on others. + */ + BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + O_RDONLY | O_WRONLY | O_RDWR | + O_CREAT | O_EXCL | O_NOCTTY | + O_TRUNC | O_APPEND | /* O_NONBLOCK | */ + __O_SYNC | O_DSYNC | FASYNC | + O_DIRECT | O_LARGEFILE | O_DIRECTORY | + O_NOFOLLOW | O_NOATIME | O_CLOEXEC | + __FMODE_EXEC | O_PATH + )); + + fasync_cache = kmem_cache_create("fasync_cache", + sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + return 0; +} + +module_init(fcntl_init) diff --git a/fs/fhandle.c b/fs/fhandle.c new file mode 100644 index 00000000..6b088641 --- /dev/null +++ b/fs/fhandle.c @@ -0,0 +1,266 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static long do_sys_name_to_handle(struct path *path, + struct file_handle __user *ufh, + int __user *mnt_id) +{ + long retval; + struct file_handle f_handle; + int handle_dwords, handle_bytes; + struct file_handle *handle = NULL; + + /* + * We need t make sure wether the file system + * support decoding of the file handle + */ + if (!path->mnt->mnt_sb->s_export_op || + !path->mnt->mnt_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if (f_handle.handle_bytes > MAX_HANDLE_SZ) + return -EINVAL; + + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) + return -ENOMEM; + + /* convert handle size to multiple of sizeof(u32) */ + handle_dwords = f_handle.handle_bytes >> 2; + + /* we ask for a non connected handle */ + retval = exportfs_encode_fh(path->dentry, + (struct fid *)handle->f_handle, + &handle_dwords, 0); + handle->handle_type = retval; + /* convert handle size to bytes */ + handle_bytes = handle_dwords * sizeof(u32); + handle->handle_bytes = handle_bytes; + if ((handle->handle_bytes > f_handle.handle_bytes) || + (retval == 255) || (retval == -ENOSPC)) { + /* As per old exportfs_encode_fh documentation + * we could return ENOSPC to indicate overflow + * But file system returned 255 always. So handle + * both the values + */ + /* + * set the handle size to zero so we copy only + * non variable part of the file_handle + */ + handle_bytes = 0; + retval = -EOVERFLOW; + } else + retval = 0; + /* copy the mount id */ + if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) || + copy_to_user(ufh, handle, + sizeof(struct file_handle) + handle_bytes)) + retval = -EFAULT; + kfree(handle); + return retval; +} + +/** + * sys_name_to_handle_at: convert name to handle + * @dfd: directory relative to which name is interpreted if not absolute + * @name: name that should be converted to handle. 
+ * @handle: resulting file handle + * @mnt_id: mount id of the file system containing the file + * @flag: flag value to indicate whether to follow symlink or not + * + * @handle->handle_size indicate the space available to store the + * variable part of the file handle in bytes. If there is not + * enough space, the field is updated to return the minimum + * value required. + */ +SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, + struct file_handle __user *, handle, int __user *, mnt_id, + int, flag) +{ + struct path path; + int lookup_flags; + int err; + + if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + err = user_path_at(dfd, name, lookup_flags, &path); + if (!err) { + err = do_sys_name_to_handle(&path, handle, mnt_id); + path_put(&path); + } + return err; +} + +static struct vfsmount *get_vfsmount_from_fd(int fd) +{ + struct path path; + + if (fd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + path = fs->pwd; + mntget(path.mnt); + spin_unlock(&fs->lock); + } else { + int fput_needed; + struct file *file = fget_light(fd, &fput_needed); + if (!file) + return ERR_PTR(-EBADF); + path = file->f_path; + mntget(path.mnt); + fput_light(file, fput_needed); + } + return path.mnt; +} + +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) +{ + return 1; +} + +static int do_handle_to_path(int mountdirfd, struct file_handle *handle, + struct path *path) +{ + int retval = 0; + int handle_dwords; + + path->mnt = get_vfsmount_from_fd(mountdirfd); + if (IS_ERR(path->mnt)) { + retval = PTR_ERR(path->mnt); + goto out_err; + } + /* change the handle size to multiple of sizeof(u32) */ + handle_dwords = handle->handle_bytes >> 2; + path->dentry = exportfs_decode_fh(path->mnt, + (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + vfs_dentry_acceptable, NULL); + if (IS_ERR(path->dentry)) { + retval = PTR_ERR(path->dentry); + goto out_mnt; + } + return 0; +out_mnt: + mntput(path->mnt); +out_err: + return retval; +} + +static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, + struct path *path) +{ + int retval = 0; + struct file_handle f_handle; + struct file_handle *handle = NULL; + + /* + * With handle we don't look at the execute bit on the + * the directory. Ideally we would like CAP_DAC_SEARCH. 
+ * But we don't have that + */ + if (!capable(CAP_DAC_READ_SEARCH)) { + retval = -EPERM; + goto out_err; + } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { + retval = -EFAULT; + goto out_err; + } + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) { + retval = -EINVAL; + goto out_err; + } + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) { + retval = -ENOMEM; + goto out_err; + } + /* copy the full handle */ + if (copy_from_user(handle, ufh, + sizeof(struct file_handle) + + f_handle.handle_bytes)) { + retval = -EFAULT; + goto out_handle; + } + + retval = do_handle_to_path(mountdirfd, handle, path); + +out_handle: + kfree(handle); +out_err: + return retval; +} + +long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag) +{ + long retval = 0; + struct path path; + struct file *file; + int fd; + + retval = handle_to_path(mountdirfd, ufh, &path); + if (retval) + return retval; + + fd = get_unused_fd_flags(open_flag); + if (fd < 0) { + path_put(&path); + return fd; + } + file = file_open_root(path.dentry, path.mnt, "", open_flag); + if (IS_ERR(file)) { + put_unused_fd(fd); + retval = PTR_ERR(file); + } else { + retval = fd; + fsnotify_open(file); + fd_install(fd, file); + } + path_put(&path); + return retval; +} + +/** + * sys_open_by_handle_at: Open the file handle + * @mountdirfd: directory file descriptor + * @handle: file handle to be opened + * @flag: open flags. + * + * @mountdirfd indicate the directory file descriptor + * of the mount point. file handle is decoded relative + * to the vfsmount pointed by the @mountdirfd. @flags + * value is same as the open(2) flags. + */ +SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, + int, flags) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_handle_open(mountdirfd, handle, flags); + return ret; +} diff --git a/fs/fifo.c b/fs/fifo.c new file mode 100644 index 00000000..b1a524d7 --- /dev/null +++ b/fs/fifo.c @@ -0,0 +1,154 @@ +/* + * linux/fs/fifo.c + * + * written by Paul H. Hargrove + * + * Fixes: + * 10-06-1999, AV: fixed OOM handling in fifo_open(), moved + * initialization there, switched to external + * allocation of pipe_inode_info. + */ + +#include +#include +#include +#include + +static void wait_for_partner(struct inode* inode, unsigned int *cnt) +{ + int cur = *cnt; + + while (cur == *cnt) { + pipe_wait(inode->i_pipe); + if (signal_pending(current)) + break; + } +} + +static void wake_up_partner(struct inode* inode) +{ + wake_up_interruptible(&inode->i_pipe->wait); +} + +static int fifo_open(struct inode *inode, struct file *filp) +{ + struct pipe_inode_info *pipe; + int ret; + + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + if (!pipe) { + ret = -ENOMEM; + pipe = alloc_pipe_info(inode); + if (!pipe) + goto err_nocleanup; + inode->i_pipe = pipe; + } + filp->f_version = 0; + + /* We can only do regular read/write on fifos */ + filp->f_mode &= (FMODE_READ | FMODE_WRITE); + + switch (filp->f_mode) { + case FMODE_READ: + /* + * O_RDONLY + * POSIX.1 says that O_NONBLOCK means return with the FIFO + * opened, even when there is no process writing the FIFO. 
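+ * The blocking case instead sleeps in wait_for_partner() until
+ * w_counter changes, i.e. until a writer opens the FIFO.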
+ */ + filp->f_op = &read_pipefifo_fops; + pipe->r_counter++; + if (pipe->readers++ == 0) + wake_up_partner(inode); + + if (!pipe->writers) { + if ((filp->f_flags & O_NONBLOCK)) { + /* suppress POLLHUP until we have + * seen a writer */ + filp->f_version = pipe->w_counter; + } else { + wait_for_partner(inode, &pipe->w_counter); + if(signal_pending(current)) + goto err_rd; + } + } + break; + + case FMODE_WRITE: + /* + * O_WRONLY + * POSIX.1 says that O_NONBLOCK means return -1 with + * errno=ENXIO when there is no process reading the FIFO. + */ + ret = -ENXIO; + if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) + goto err; + + filp->f_op = &write_pipefifo_fops; + pipe->w_counter++; + if (!pipe->writers++) + wake_up_partner(inode); + + if (!pipe->readers) { + wait_for_partner(inode, &pipe->r_counter); + if (signal_pending(current)) + goto err_wr; + } + break; + + case FMODE_READ | FMODE_WRITE: + /* + * O_RDWR + * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. + * This implementation will NEVER block on a O_RDWR open, since + * the process can at least talk to itself. + */ + filp->f_op = &rdwr_pipefifo_fops; + + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) + wake_up_partner(inode); + break; + + default: + ret = -EINVAL; + goto err; + } + + /* Ok! */ + mutex_unlock(&inode->i_mutex); + return 0; + +err_rd: + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err_wr: + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err: + if (!pipe->readers && !pipe->writers) + free_pipe_info(inode); + +err_nocleanup: + mutex_unlock(&inode->i_mutex); + return ret; +} + +/* + * Dummy default file-operations: the only thing this does + * is contain the open that then fills in the correct operations + * depending on the access mode of the file... + */ +const struct file_operations def_fifo_fops = { + .open = fifo_open, /* will set read_ or write_pipefifo_fops */ + .llseek = noop_llseek, +}; diff --git a/fs/file.c b/fs/file.c new file mode 100644 index 00000000..4c6992d8 --- /dev/null +++ b/fs/file.c @@ -0,0 +1,486 @@ +/* + * linux/fs/file.c + * + * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes + * + * Manage the dynamic fd arrays in the process files_struct. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct fdtable_defer { + spinlock_t lock; + struct work_struct wq; + struct fdtable *next; +}; + +int sysctl_nr_open __read_mostly = 1024*1024; +int sysctl_nr_open_min = BITS_PER_LONG; +int sysctl_nr_open_max = 1024 * 1024; /* raised later */ + +/* + * We use this list to defer free fdtables that have vmalloced + * sets/arrays. By keeping a per-cpu list, we avoid having to embed + * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in + * this per-task structure. + */ +static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); + +static void *alloc_fdmem(unsigned int size) +{ + /* + * Very large allocations can stress page reclaim, so fall back to + * vmalloc() if the allocation size will be considered "large" by the VM. + */ + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (data != NULL) + return data; + } + return vmalloc(size); +} + +static void free_fdmem(void *ptr) +{ + is_vmalloc_addr(ptr) ? 
vfree(ptr) : kfree(ptr); +} + +static void __free_fdtable(struct fdtable *fdt) +{ + free_fdmem(fdt->fd); + free_fdmem(fdt->open_fds); + kfree(fdt); +} + +static void free_fdtable_work(struct work_struct *work) +{ + struct fdtable_defer *f = + container_of(work, struct fdtable_defer, wq); + struct fdtable *fdt; + + spin_lock_bh(&f->lock); + fdt = f->next; + f->next = NULL; + spin_unlock_bh(&f->lock); + while(fdt) { + struct fdtable *next = fdt->next; + + __free_fdtable(fdt); + fdt = next; + } +} + +void free_fdtable_rcu(struct rcu_head *rcu) +{ + struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); + struct fdtable_defer *fddef; + + BUG_ON(!fdt); + + if (fdt->max_fds <= NR_OPEN_DEFAULT) { + /* + * This fdtable is embedded in the files structure and that + * structure itself is getting destroyed. + */ + kmem_cache_free(files_cachep, + container_of(fdt, struct files_struct, fdtab)); + return; + } + if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { + kfree(fdt->fd); + kfree(fdt->open_fds); + kfree(fdt); + } else { + fddef = &get_cpu_var(fdtable_defer_list); + spin_lock(&fddef->lock); + fdt->next = fddef->next; + fddef->next = fdt; + /* vmallocs are handled from the workqueue context */ + schedule_work(&fddef->wq); + spin_unlock(&fddef->lock); + put_cpu_var(fdtable_defer_list); + } +} + +/* + * Expand the fdset in the files_struct. Called with the files spinlock + * held for write. + */ +static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) +{ + unsigned int cpy, set; + + BUG_ON(nfdt->max_fds < ofdt->max_fds); + + cpy = ofdt->max_fds * sizeof(struct file *); + set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); + memcpy(nfdt->fd, ofdt->fd, cpy); + memset((char *)(nfdt->fd) + cpy, 0, set); + + cpy = ofdt->max_fds / BITS_PER_BYTE; + set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; + memcpy(nfdt->open_fds, ofdt->open_fds, cpy); + memset((char *)(nfdt->open_fds) + cpy, 0, set); + memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); + memset((char *)(nfdt->close_on_exec) + cpy, 0, set); +} + +static struct fdtable * alloc_fdtable(unsigned int nr) +{ + struct fdtable *fdt; + char *data; + + /* + * Figure out how many fds we actually want to support in this fdtable. + * Allocation steps are keyed to the size of the fdarray, since it + * grows far faster than any of the other dynamic data. We try to fit + * the fdarray into comfortable page-tuned chunks: starting at 1024B + * and growing in powers of two from there on. + */ + nr /= (1024 / sizeof(struct file *)); + nr = roundup_pow_of_two(nr + 1); + nr *= (1024 / sizeof(struct file *)); + /* + * Note that this can drive nr *below* what we had passed if sysctl_nr_open + * had been set lower between the check in expand_files() and here. Deal + * with that in caller, it's cheaper that way. + * + * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise + * bitmaps handling below becomes unpleasant, to put it mildly... 
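+ * The clamp below rounds sysctl_nr_open up to a multiple of
+ * BITS_PER_LONG for exactly that reason.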
+ */ + if (unlikely(nr > sysctl_nr_open)) + nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + if (!fdt) + goto out; + fdt->max_fds = nr; + data = alloc_fdmem(nr * sizeof(struct file *)); + if (!data) + goto out_fdt; + fdt->fd = (struct file **)data; + data = alloc_fdmem(max_t(unsigned int, + 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); + if (!data) + goto out_arr; + fdt->open_fds = (fd_set *)data; + data += nr / BITS_PER_BYTE; + fdt->close_on_exec = (fd_set *)data; + fdt->next = NULL; + + return fdt; + +out_arr: + free_fdmem(fdt->fd); +out_fdt: + kfree(fdt); +out: + return NULL; +} + +/* + * Expand the file descriptor table. + * This function will allocate a new fdtable and both fd array and fdset, of + * the given size. + * Return <0 error code on error; 1 on successful completion. + * The files->file_lock should be held on entry, and will be held on exit. + */ +static int expand_fdtable(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) +{ + struct fdtable *new_fdt, *cur_fdt; + + spin_unlock(&files->file_lock); + new_fdt = alloc_fdtable(nr); + spin_lock(&files->file_lock); + if (!new_fdt) + return -ENOMEM; + /* + * extremely unlikely race - sysctl_nr_open decreased between the check in + * caller and alloc_fdtable(). Cheaper to catch it here... + */ + if (unlikely(new_fdt->max_fds <= nr)) { + __free_fdtable(new_fdt); + return -EMFILE; + } + /* + * Check again since another task may have expanded the fd table while + * we dropped the lock + */ + cur_fdt = files_fdtable(files); + if (nr >= cur_fdt->max_fds) { + /* Continue as planned */ + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt->max_fds > NR_OPEN_DEFAULT) + free_fdtable(cur_fdt); + } else { + /* Somebody else expanded, so undo our attempt */ + __free_fdtable(new_fdt); + } + return 1; +} + +/* + * Expand files. + * This function will expand the file structures, if the requested size exceeds + * the current capacity and there is room for expansion. + * Return <0 error code on error; 0 when nothing done; 1 when files were + * expanded and execution may have blocked. + * The files->file_lock should be held on entry, and will be held on exit. + */ +int expand_files(struct files_struct *files, int nr) +{ + struct fdtable *fdt; + + fdt = files_fdtable(files); + + /* + * N.B. For clone tasks sharing a files structure, this test + * will limit the total number of files that can be opened. + */ + if (nr >= rlimit(RLIMIT_NOFILE)) + return -EMFILE; + + /* Do we need to expand? */ + if (nr < fdt->max_fds) + return 0; + + /* Can we expand? */ + if (nr >= sysctl_nr_open) + return -EMFILE; + + /* All good, so we try */ + return expand_fdtable(files, nr); +} + +static int count_open_files(struct fdtable *fdt) +{ + int size = fdt->max_fds; + int i; + + /* Find the last open fd */ + for (i = size/(8*sizeof(long)); i > 0; ) { + if (fdt->open_fds->fds_bits[--i]) + break; + } + i = (i+1) * 8 * sizeof(long); + return i; +} + +/* + * Allocate a new files structure and copy contents from the + * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. 
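+ * The copy may have to retry with a larger fdtable if the parent
+ * grows its table while we drop oldf->file_lock.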
+ */ +struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +{ + struct files_struct *newf; + struct file **old_fds, **new_fds; + int open_files, size, i; + struct fdtable *old_fdt, *new_fdt; + + *errorp = -ENOMEM; + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + newf->next_fd = 0; + new_fdt = &newf->fdtab; + new_fdt->max_fds = NR_OPEN_DEFAULT; + new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; + new_fdt->open_fds = (fd_set *)&newf->open_fds_init; + new_fdt->fd = &newf->fd_array[0]; + new_fdt->next = NULL; + + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + open_files = count_open_files(old_fdt); + + /* + * Check whether we need to allocate a larger fd array and fd set. + */ + while (unlikely(open_files > new_fdt->max_fds)) { + spin_unlock(&oldf->file_lock); + + if (new_fdt != &newf->fdtab) + __free_fdtable(new_fdt); + + new_fdt = alloc_fdtable(open_files - 1); + if (!new_fdt) { + *errorp = -ENOMEM; + goto out_release; + } + + /* beyond sysctl_nr_open; nothing to do */ + if (unlikely(new_fdt->max_fds < open_files)) { + __free_fdtable(new_fdt); + *errorp = -EMFILE; + goto out_release; + } + + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. + */ + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + open_files = count_open_files(old_fdt); + } + + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; + + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); + + for (i = open_files; i != 0; i--) { + struct file *f = *old_fds++; + if (f) { + get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. 
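+ * FD_CLR() below clears the bit in the child's open_fds so the
+ * slot can be reallocated in the new process.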
+ */ + FD_CLR(open_files - i, new_fdt->open_fds); + } + rcu_assign_pointer(*new_fds++, f); + } + spin_unlock(&oldf->file_lock); + + /* compute the remainder to be cleared */ + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); + + /* This is long word aligned thus could use a optimized version */ + memset(new_fds, 0, size); + + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; + int start = open_files / (8 * sizeof(unsigned long)); + + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); + } + + rcu_assign_pointer(newf->fdt, new_fdt); + + return newf; + +out_release: + kmem_cache_free(files_cachep, newf); +out: + return NULL; +} + +static void __devinit fdtable_defer_list_init(int cpu) +{ + struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); + spin_lock_init(&fddef->lock); + INIT_WORK(&fddef->wq, free_fdtable_work); + fddef->next = NULL; +} + +void __init files_defer_init(void) +{ + int i; + for_each_possible_cpu(i) + fdtable_defer_list_init(i); + sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; +} + +struct files_struct init_files = { + .count = ATOMIC_INIT(1), + .fdt = &init_files.fdtab, + .fdtab = { + .max_fds = NR_OPEN_DEFAULT, + .fd = &init_files.fd_array[0], + .close_on_exec = (fd_set *)&init_files.close_on_exec_init, + .open_fds = (fd_set *)&init_files.open_fds_init, + }, + .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), +}; + +/* + * allocate a file descriptor, mark it busy. + */ +int alloc_fd(unsigned start, unsigned flags) +{ + struct files_struct *files = current->files; + unsigned int fd; + int error; + struct fdtable *fdt; + + spin_lock(&files->file_lock); +repeat: + fdt = files_fdtable(files); + fd = start; + if (fd < files->next_fd) + fd = files->next_fd; + + if (fd < fdt->max_fds) + fd = find_next_zero_bit(fdt->open_fds->fds_bits, + fdt->max_fds, fd); + + error = expand_files(files, fd); + if (error < 0) + goto out; + + /* + * If we needed to expand the fs array we + * might have blocked - try again. + */ + if (error) + goto repeat; + + if (start <= files->next_fd) + files->next_fd = fd + 1; + + FD_SET(fd, fdt->open_fds); + if (flags & O_CLOEXEC) + FD_SET(fd, fdt->close_on_exec); + else + FD_CLR(fd, fdt->close_on_exec); + error = fd; +#if 1 + /* Sanity check */ + if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { + printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); + rcu_assign_pointer(fdt->fd[fd], NULL); + } +#endif + +out: + spin_unlock(&files->file_lock); + return error; +} + +int get_unused_fd(void) +{ + return alloc_fd(0, 0); +} +EXPORT_SYMBOL(get_unused_fd); diff --git a/fs/file_table.c b/fs/file_table.c new file mode 100644 index 00000000..01e4c1e8 --- /dev/null +++ b/fs/file_table.c @@ -0,0 +1,554 @@ +/* + * linux/fs/file_table.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +/* sysctl tunables... 
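+ * files_stat.max_files is the system-wide cap enforced in
+ * get_empty_filp() for unprivileged tasks.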
*/ +struct files_stat_struct files_stat = { + .max_files = NR_FILE +}; + +DECLARE_LGLOCK(files_lglock); +DEFINE_LGLOCK(files_lglock); + +/* SLAB cache for file structures */ +static struct kmem_cache *filp_cachep __read_mostly; + +static struct percpu_counter nr_files __cacheline_aligned_in_smp; + +static inline void file_free_rcu(struct rcu_head *head) +{ + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + + put_cred(f->f_cred); + kmem_cache_free(filp_cachep, f); +} + +static inline void file_free(struct file *f) +{ + percpu_counter_dec(&nr_files); + file_check_state(f); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); +} + +/* + * Return the total number of open files in the system + */ +static long get_nr_files(void) +{ + return percpu_counter_read_positive(&nr_files); +} + +/* + * Return the maximum number of open files in the system + */ +unsigned long get_max_files(void) +{ + return files_stat.max_files; +} +EXPORT_SYMBOL_GPL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_files(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_stat.nr_files = get_nr_files(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#else +int proc_nr_files(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif + +/* Find an unused file structure and return a pointer to it. + * Returns NULL, if there are no more free file structures or + * we run out of memory. + * + * Be very careful using this. You are responsible for + * getting write access to any mount that you might assign + * to this filp, if it is opened for write. If this is not + * done, you will imbalance int the mount's writer count + * and a warning at __fput() time. + */ +struct file *get_empty_filp(void) +{ + const struct cred *cred = current_cred(); + static long old_max; + struct file * f; + + /* + * Privileged users can go above max_files + */ + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. Do an expensive check before + * we go and fail. + */ + if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) + goto over; + } + + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (f == NULL) + goto fail; + + percpu_counter_inc(&nr_files); + f->f_cred = get_cred(cred); + if (security_file_alloc(f)) + goto fail_sec; + + INIT_LIST_HEAD(&f->f_u.fu_list); + atomic_long_set(&f->f_count, 1); + rwlock_init(&f->f_owner.lock); + spin_lock_init(&f->f_lock); + eventpoll_init_file(f); + /* f->f_version: 0 */ + return f; + +over: + /* Ran out of filps - report that */ + if (get_nr_files() > old_max) { + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); + old_max = get_nr_files(); + } + goto fail; + +fail_sec: + file_free(f); +fail: + return NULL; +} + +/** + * alloc_file - allocate and initialize a 'struct file' + * @mnt: the vfsmount on which the file will reside + * @dentry: the dentry representing the new file + * @mode: the mode with which the new file will be opened + * @fop: the 'struct file_operations' for the new file + * + * Use this instead of get_empty_filp() to get a new + * 'struct file'. Do so because of the same initialization + * pitfalls reasons listed for init_file(). This is a + * preferred interface to using init_file(). + * + * If all the callers of init_file() are eliminated, its + * code should be moved into this function. 
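+ * For FMODE_WRITE opens of non-special files the new file is also
+ * accounted as a writer via file_take_write()/mnt_clone_write().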
+ */ +struct file *alloc_file(struct path *path, fmode_t mode, + const struct file_operations *fop) +{ + struct file *file; + + file = get_empty_filp(); + if (!file) + return NULL; + + file->f_path = *path; + file->f_mapping = path->dentry->d_inode->i_mapping; + file->f_mode = mode; + file->f_op = fop; + + /* + * These mounts don't really matter in practice + * for r/o bind mounts. They aren't userspace- + * visible. We do this for consistency, and so + * that we can do debugging checks at __fput() + */ + if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { + file_take_write(file); + WARN_ON(mnt_clone_write(path->mnt)); + } + if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(path->dentry->d_inode); + return file; +} +EXPORT_SYMBOL(alloc_file); + +/** + * drop_file_write_access - give up ability to write to a file + * @file: the file to which we will stop writing + * + * This is a central place which will give up the ability + * to write to @file, along with access to write through + * its vfsmount. + */ +void drop_file_write_access(struct file *file) +{ + struct vfsmount *mnt = file->f_path.mnt; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + + put_write_access(inode); + + if (special_file(inode->i_mode)) + return; + if (file_check_writeable(file) != 0) + return; + mnt_drop_write(mnt); + file_release_write(file); +} +EXPORT_SYMBOL_GPL(drop_file_write_access); + +/* the real guts of fput() - releasing the last reference to file + */ +static void __fput(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct vfsmount *mnt = file->f_path.mnt; + struct inode *inode = dentry->d_inode; + + might_sleep(); + + fsnotify_close(file); + /* + * The function eventpoll_release() should be the first called + * in the file cleanup chain. 
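+ * It unhooks the file from any epoll interest lists before locks
+ * are removed and the driver's ->release() runs.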
+ */ + eventpoll_release(file); + locks_remove_flock(file); + + if (unlikely(file->f_flags & FASYNC)) { + if (file->f_op && file->f_op->fasync) + file->f_op->fasync(-1, file, 0); + } + if (file->f_op && file->f_op->release) + file->f_op->release(inode, file); + security_file_free(file); + ima_file_free(file); + if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && + !(file->f_mode & FMODE_PATH))) { + cdev_put(inode->i_cdev); + } + fops_put(file->f_op); + put_pid(file->f_owner.pid); + file_sb_list_del(file); + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_dec(inode); + if (file->f_mode & FMODE_WRITE) + drop_file_write_access(file); + file->f_path.dentry = NULL; + file->f_path.mnt = NULL; + file_free(file); + dput(dentry); + mntput(mnt); +} + +void fput(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) + __fput(file); +} + +EXPORT_SYMBOL(fput); + +struct file *fget(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (file->f_mode & FMODE_PATH || + !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget); + +struct file *fget_raw(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (!atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget_raw); + +/* + * Lightweight file lookup - no refcnt increment if fd table isn't shared. + * + * You can use this instead of fget if you satisfy all of the following + * conditions: + * 1) You must call fput_light before exiting the syscall and returning control + * to userspace (i.e. you cannot remember the returned struct file * after + * returning to userspace). + * 2) You must not call filp_close on the returned struct file * in between + * calls to fget_light and fput_light. + * 3) You must not clone the current task in between the calls to fget_light + * and fput_light. + * + * The fput_needed flag returned by fget_light should be passed to the + * corresponding fput_light. 
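+ *
+ * Example (illustrative calling pattern, using the fput_light() helper
+ * from <linux/file.h>):
+ *
+ *     int fput_needed;
+ *     struct file *file = fget_light(fd, &fput_needed);
+ *
+ *     if (!file)
+ *             return -EBADF;
+ *     ... use the file within this syscall only ...
+ *     fput_light(file, fput_needed);
+ *
+ * i.e. the reference (if one was taken) is dropped before returning
+ * to userspace.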
+ */ +struct file *fget_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); + if (file && (file->f_mode & FMODE_PATH)) + file = NULL; + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (!(file->f_mode & FMODE_PATH) && + atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +struct file *fget_raw_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +void put_filp(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) { + security_file_free(file); + file_sb_list_del(file); + file_free(file); + } +} + +static inline int file_list_cpu(struct file *file) +{ +#ifdef CONFIG_SMP + return file->f_sb_list_cpu; +#else + return smp_processor_id(); +#endif +} + +/* helper for file_sb_list_add to reduce ifdefs */ +static inline void __file_sb_list_add(struct file *file, struct super_block *sb) +{ + struct list_head *list; +#ifdef CONFIG_SMP + int cpu; + cpu = smp_processor_id(); + file->f_sb_list_cpu = cpu; + list = per_cpu_ptr(sb->s_files, cpu); +#else + list = &sb->s_files; +#endif + list_add(&file->f_u.fu_list, list); +} + +/** + * file_sb_list_add - add a file to the sb's file list + * @file: file to add + * @sb: sb to add it to + * + * Use this function to associate a file with the superblock of the inode it + * refers to. + */ +void file_sb_list_add(struct file *file, struct super_block *sb) +{ + lg_local_lock(files_lglock); + __file_sb_list_add(file, sb); + lg_local_unlock(files_lglock); +} + +/** + * file_sb_list_del - remove a file from the sb's file list + * @file: file to remove + * @sb: sb to remove it from + * + * Use this function to remove a file from its superblock. + */ +void file_sb_list_del(struct file *file) +{ + if (!list_empty(&file->f_u.fu_list)) { + lg_local_lock_cpu(files_lglock, file_list_cpu(file)); + list_del_init(&file->f_u.fu_list); + lg_local_unlock_cpu(files_lglock, file_list_cpu(file)); + } +} + +#ifdef CONFIG_SMP + +/* + * These macros iterate all files on all CPUs for a given superblock. + * files_lglock must be held globally. + */ +#define do_file_list_for_each_entry(__sb, __file) \ +{ \ + int i; \ + for_each_possible_cpu(i) { \ + struct list_head *list; \ + list = per_cpu_ptr((__sb)->s_files, i); \ + list_for_each_entry((__file), list, f_u.fu_list) + +#define while_file_list_for_each_entry \ + } \ +} + +#else + +#define do_file_list_for_each_entry(__sb, __file) \ +{ \ + struct list_head *list; \ + list = &(sb)->s_files; \ + list_for_each_entry((__file), list, f_u.fu_list) + +#define while_file_list_for_each_entry \ +} + +#endif + +int fs_may_remount_ro(struct super_block *sb) +{ + struct file *file; + /* Check that no files are currently opened for writing. */ + lg_global_lock(files_lglock); + do_file_list_for_each_entry(sb, file) { + struct inode *inode = file->f_path.dentry->d_inode; + + /* File with pending delete? 
*/ + if (inode->i_nlink == 0) + goto too_bad; + + /* Writeable file? */ + if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) + goto too_bad; + } while_file_list_for_each_entry; + lg_global_unlock(files_lglock); + return 1; /* Tis' cool bro. */ +too_bad: + lg_global_unlock(files_lglock); + return 0; +} + +/** + * mark_files_ro - mark all files read-only + * @sb: superblock in question + * + * All files are marked read-only. We don't care about pending + * delete files so this should be used in 'force' mode only. + */ +void mark_files_ro(struct super_block *sb) +{ + struct file *f; + +retry: + lg_global_lock(files_lglock); + do_file_list_for_each_entry(sb, f) { + struct vfsmount *mnt; + if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) + continue; + if (!file_count(f)) + continue; + if (!(f->f_mode & FMODE_WRITE)) + continue; + spin_lock(&f->f_lock); + f->f_mode &= ~FMODE_WRITE; + spin_unlock(&f->f_lock); + if (file_check_writeable(f) != 0) + continue; + file_release_write(f); + mnt = mntget(f->f_path.mnt); + /* This can sleep, so we can't hold the spinlock. */ + lg_global_unlock(files_lglock); + mnt_drop_write(mnt); + mntput(mnt); + goto retry; + } while_file_list_for_each_entry; + lg_global_unlock(files_lglock); +} + +void __init files_init(unsigned long mempages) +{ + unsigned long n; + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + /* + * One file with associated inode and dcache is very roughly 1K. + * Per default don't use more than 10% of our memory for files. + */ + + n = (mempages * (PAGE_SIZE / 1024)) / 10; + files_stat.max_files = max_t(unsigned long, n, NR_FILE); + files_defer_init(); + lg_lock_init(files_lglock); + percpu_counter_init(&nr_files, 0); +} diff --git a/fs/filesystems.c b/fs/filesystems.c new file mode 100644 index 00000000..0845f84f --- /dev/null +++ b/fs/filesystems.c @@ -0,0 +1,287 @@ +/* + * linux/fs/filesystems.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * table of configured filesystems + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Handling of filesystem drivers list. + * Rules: + * Inclusion to/removals from/scanning of list are protected by spinlock. + * During the unload module must call unregister_filesystem(). + * We can access the fields of list element if: + * 1) spinlock is held or + * 2) we hold the reference to the module. + * The latter can be guaranteed by call of try_module_get(); if it + * returned 0 we must skip the element, otherwise we got the reference. + * Once the reference is obtained we can drop the spinlock. + */ + +static struct file_system_type *file_systems; +static DEFINE_RWLOCK(file_systems_lock); + +/* WARNING: This can be used only if we _already_ own a reference */ +void get_filesystem(struct file_system_type *fs) +{ + __module_get(fs->owner); +} + +void put_filesystem(struct file_system_type *fs) +{ + module_put(fs->owner); +} + +static struct file_system_type **find_filesystem(const char *name, unsigned len) +{ + struct file_system_type **p; + for (p=&file_systems; *p; p=&(*p)->next) + if (strlen((*p)->name) == len && + strncmp((*p)->name, name, len) == 0) + break; + return p; +} + +/** + * register_filesystem - register a new filesystem + * @fs: the file system structure + * + * Adds the file system passed to the list of file systems the kernel + * is aware of for mount and other syscalls. Returns 0 on success, + * or a negative errno code on an error. 
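+ *
+ * Example (illustrative; "examplefs" and example_mount() are
+ * hypothetical names):
+ *
+ *     static struct file_system_type example_fs_type = {
+ *             .owner   = THIS_MODULE,
+ *             .name    = "examplefs",
+ *             .mount   = example_mount,
+ *             .kill_sb = kill_block_super,
+ *     };
+ *
+ *     static int __init example_init(void)
+ *     {
+ *             return register_filesystem(&example_fs_type);
+ *     }
+ *
+ * with a matching unregister_filesystem() call in the module exit path.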
+ * + * The &struct file_system_type that is passed is linked into the kernel + * structures and must not be freed until the file system has been + * unregistered. + */ + +int register_filesystem(struct file_system_type * fs) +{ + int res = 0; + struct file_system_type ** p; + + BUG_ON(strchr(fs->name, '.')); + if (fs->next) + return -EBUSY; + INIT_LIST_HEAD(&fs->fs_supers); + write_lock(&file_systems_lock); + p = find_filesystem(fs->name, strlen(fs->name)); + if (*p) + res = -EBUSY; + else + *p = fs; + write_unlock(&file_systems_lock); + return res; +} + +EXPORT_SYMBOL(register_filesystem); + +/** + * unregister_filesystem - unregister a file system + * @fs: filesystem to unregister + * + * Remove a file system that was previously successfully registered + * with the kernel. An error is returned if the file system is not found. + * Zero is returned on a success. + * + * Once this function has returned the &struct file_system_type structure + * may be freed or reused. + */ + +int unregister_filesystem(struct file_system_type * fs) +{ + struct file_system_type ** tmp; + + write_lock(&file_systems_lock); + tmp = &file_systems; + while (*tmp) { + if (fs == *tmp) { + *tmp = fs->next; + fs->next = NULL; + write_unlock(&file_systems_lock); + synchronize_rcu(); + return 0; + } + tmp = &(*tmp)->next; + } + write_unlock(&file_systems_lock); + + return -EINVAL; +} + +EXPORT_SYMBOL(unregister_filesystem); + +static int fs_index(const char __user * __name) +{ + struct file_system_type * tmp; + char * name; + int err, index; + + name = getname(__name); + err = PTR_ERR(name); + if (IS_ERR(name)) + return err; + + err = -EINVAL; + read_lock(&file_systems_lock); + for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { + if (strcmp(tmp->name,name) == 0) { + err = index; + break; + } + } + read_unlock(&file_systems_lock); + putname(name); + return err; +} + +static int fs_name(unsigned int index, char __user * buf) +{ + struct file_system_type * tmp; + int len, res; + + read_lock(&file_systems_lock); + for (tmp = file_systems; tmp; tmp = tmp->next, index--) + if (index <= 0 && try_module_get(tmp->owner)) + break; + read_unlock(&file_systems_lock); + if (!tmp) + return -EINVAL; + + /* OK, we got the reference, so we can safely block */ + len = strlen(tmp->name) + 1; + res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; + put_filesystem(tmp); + return res; +} + +static int fs_maxindex(void) +{ + struct file_system_type * tmp; + int index; + + read_lock(&file_systems_lock); + for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) + ; + read_unlock(&file_systems_lock); + return index; +} + +/* + * Whee.. Weird sysv syscall. + */ +SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) +{ + int retval = -EINVAL; + + switch (option) { + case 1: + retval = fs_index((const char __user *) arg1); + break; + + case 2: + retval = fs_name(arg1, (char __user *) arg2); + break; + + case 3: + retval = fs_maxindex(); + break; + } + return retval; +} + +int __init get_filesystem_list(char *buf) +{ + int len = 0; + struct file_system_type * tmp; + + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp && len < PAGE_SIZE - 80) { + len += sprintf(buf+len, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", + tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); + return len; +} + +#ifdef CONFIG_PROC_FS +static int filesystems_proc_show(struct seq_file *m, void *v) +{ + struct file_system_type * tmp; + + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp) { + seq_printf(m, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); + return 0; +} + +static int filesystems_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, filesystems_proc_show, NULL); +} + +static const struct file_operations filesystems_proc_fops = { + .open = filesystems_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_filesystems_init(void) +{ + proc_create("filesystems", 0, NULL, &filesystems_proc_fops); + return 0; +} +module_init(proc_filesystems_init); +#endif + +static struct file_system_type *__get_fs_type(const char *name, int len) +{ + struct file_system_type *fs; + + read_lock(&file_systems_lock); + fs = *(find_filesystem(name, len)); + if (fs && !try_module_get(fs->owner)) + fs = NULL; + read_unlock(&file_systems_lock); + return fs; +} + +struct file_system_type *get_fs_type(const char *name) +{ + struct file_system_type *fs; + const char *dot = strchr(name, '.'); + int len = dot ? dot - name : strlen(name); + + fs = __get_fs_type(name, len); + if (!fs && (request_module("%.*s", len, name) == 0)) + fs = __get_fs_type(name, len); + + if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { + put_filesystem(fs); + fs = NULL; + } + return fs; +} + +EXPORT_SYMBOL(get_fs_type); diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig new file mode 100644 index 00000000..8dc1cd5c --- /dev/null +++ b/fs/freevxfs/Kconfig @@ -0,0 +1,16 @@ +config VXFS_FS + tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" + depends on BLOCK + help + FreeVxFS is a file system driver that support the VERITAS VxFS(TM) + file system format. VERITAS VxFS(TM) is the standard file system + of SCO UnixWare (and possibly others) and optionally available + for Sunsoft Solaris, HP-UX and many other operating systems. + Currently only readonly access is supported. + + NOTE: the file system type as used by mount(1), mount(2) and + fstab(5) is 'vxfs' as it describes the file system format, not + the actual driver. + + To compile this as a module, choose M here: the module will be + called freevxfs. If unsure, say N. diff --git a/fs/freevxfs/Makefile b/fs/freevxfs/Makefile new file mode 100644 index 00000000..87ad0974 --- /dev/null +++ b/fs/freevxfs/Makefile @@ -0,0 +1,8 @@ +# +# VxFS Makefile +# + +obj-$(CONFIG_VXFS_FS) += freevxfs.o + +freevxfs-objs := vxfs_bmap.o vxfs_fshead.o vxfs_immed.o vxfs_inode.o \ + vxfs_lookup.o vxfs_olt.o vxfs_subr.o vxfs_super.o diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h new file mode 100644 index 00000000..c8a92652 --- /dev/null +++ b/fs/freevxfs/vxfs.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_SUPER_H_ +#define _VXFS_SUPER_H_ + +/* + * Veritas filesystem driver - superblock structure. + * + * This file contains the definition of the disk and core + * superblocks of the Veritas Filesystem. + */ +#include + + +/* + * Data types for use with the VxFS ondisk format. + */ +typedef int32_t vx_daddr_t; +typedef int32_t vx_ino_t; + +/* + * Superblock magic number (vxfs_super->vs_magic). + */ +#define VXFS_SUPER_MAGIC 0xa501FCF5 + +/* + * The root inode. + */ +#define VXFS_ROOT_INO 2 + +/* + * Num of entries in free extent array + */ +#define VXFS_NEFREE 32 + + +/* + * VxFS superblock (disk). + */ +struct vxfs_sb { + /* + * Readonly fields for the version 1 superblock. + * + * Lots of this fields are no more used by version 2 + * and never filesystems. + */ + u_int32_t vs_magic; /* Magic number */ + int32_t vs_version; /* VxFS version */ + u_int32_t vs_ctime; /* create time - secs */ + u_int32_t vs_cutime; /* create time - usecs */ + int32_t __unused1; /* unused */ + int32_t __unused2; /* unused */ + vx_daddr_t vs_old_logstart; /* obsolete */ + vx_daddr_t vs_old_logend; /* obsolete */ + int32_t vs_bsize; /* block size */ + int32_t vs_size; /* number of blocks */ + int32_t vs_dsize; /* number of data blocks */ + u_int32_t vs_old_ninode; /* obsolete */ + int32_t vs_old_nau; /* obsolete */ + int32_t __unused3; /* unused */ + int32_t vs_old_defiextsize; /* obsolete */ + int32_t vs_old_ilbsize; /* obsolete */ + int32_t vs_immedlen; /* size of immediate data area */ + int32_t vs_ndaddr; /* number of direct extentes */ + vx_daddr_t vs_firstau; /* address of first AU */ + vx_daddr_t vs_emap; /* offset of extent map in AU */ + vx_daddr_t vs_imap; /* offset of inode map in AU */ + vx_daddr_t vs_iextop; /* offset of ExtOp. 
map in AU */ + vx_daddr_t vs_istart; /* offset of inode list in AU */ + vx_daddr_t vs_bstart; /* offset of fdblock in AU */ + vx_daddr_t vs_femap; /* aufirst + emap */ + vx_daddr_t vs_fimap; /* aufirst + imap */ + vx_daddr_t vs_fiextop; /* aufirst + iextop */ + vx_daddr_t vs_fistart; /* aufirst + istart */ + vx_daddr_t vs_fbstart; /* aufirst + bstart */ + int32_t vs_nindir; /* number of entries in indir */ + int32_t vs_aulen; /* length of AU in blocks */ + int32_t vs_auimlen; /* length of imap in blocks */ + int32_t vs_auemlen; /* length of emap in blocks */ + int32_t vs_auilen; /* length of ilist in blocks */ + int32_t vs_aupad; /* length of pad in blocks */ + int32_t vs_aublocks; /* data blocks in AU */ + int32_t vs_maxtier; /* log base 2 of aublocks */ + int32_t vs_inopb; /* number of inodes per blk */ + int32_t vs_old_inopau; /* obsolete */ + int32_t vs_old_inopilb; /* obsolete */ + int32_t vs_old_ndiripau; /* obsolete */ + int32_t vs_iaddrlen; /* size of indirect addr ext. */ + int32_t vs_bshift; /* log base 2 of bsize */ + int32_t vs_inoshift; /* log base 2 of inobp */ + int32_t vs_bmask; /* ~( bsize - 1 ) */ + int32_t vs_boffmask; /* bsize - 1 */ + int32_t vs_old_inomask; /* old_inopilb - 1 */ + int32_t vs_checksum; /* checksum of V1 data */ + + /* + * Version 1, writable + */ + int32_t vs_free; /* number of free blocks */ + int32_t vs_ifree; /* number of free inodes */ + int32_t vs_efree[VXFS_NEFREE]; /* number of free extents by size */ + int32_t vs_flags; /* flags ?!? */ + u_int8_t vs_mod; /* filesystem has been changed */ + u_int8_t vs_clean; /* clean FS */ + u_int16_t __unused4; /* unused */ + u_int32_t vs_firstlogid; /* mount time log ID */ + u_int32_t vs_wtime; /* last time written - sec */ + u_int32_t vs_wutime; /* last time written - usec */ + u_int8_t vs_fname[6]; /* FS name */ + u_int8_t vs_fpack[6]; /* FS pack name */ + int32_t vs_logversion; /* log format version */ + int32_t __unused5; /* unused */ + + /* + * Version 2, Read-only + */ + vx_daddr_t vs_oltext[2]; /* OLT extent and replica */ + int32_t vs_oltsize; /* OLT extent size */ + int32_t vs_iauimlen; /* size of inode map */ + int32_t vs_iausize; /* size of IAU in blocks */ + int32_t vs_dinosize; /* size of inode in bytes */ + int32_t vs_old_dniaddr; /* indir levels per inode */ + int32_t vs_checksum2; /* checksum of V2 RO */ + + /* + * Actually much more... + */ +}; + + +/* + * In core superblock filesystem private data for VxFS. + */ +struct vxfs_sb_info { + struct vxfs_sb *vsi_raw; /* raw (on disk) superblock */ + struct buffer_head *vsi_bp; /* buffer for raw superblock*/ + struct inode *vsi_fship; /* fileset header inode */ + struct inode *vsi_ilist; /* inode list inode */ + struct inode *vsi_stilist; /* structural inode list inode */ + u_long vsi_iext; /* initial inode list */ + ino_t vsi_fshino; /* fileset header inode */ + daddr_t vsi_oltext; /* OLT extent */ + daddr_t vsi_oltsize; /* OLT size */ +}; + + +/* + * File modes. File types above 0xf000 are vxfs internal only, they should + * not be passed back to higher levels of the system. vxfs file types must + * never have one of the regular file type bits set. 
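+ *
+ * For example, a user-readable regular file carries a mode of
+ * (VXFS_IFREG | VXFS_IREAD) on disk; the VXFS_IS_TYPE()/VXFS_ISREG()
+ * macros below compare only the bits covered by VXFS_TYPE_MASK, so the
+ * permission bits are ignored when classifying an inode.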
+ */ +enum vxfs_mode { + VXFS_ISUID = 0x00000800, /* setuid */ + VXFS_ISGID = 0x00000400, /* setgid */ + VXFS_ISVTX = 0x00000200, /* sticky bit */ + VXFS_IREAD = 0x00000100, /* read */ + VXFS_IWRITE = 0x00000080, /* write */ + VXFS_IEXEC = 0x00000040, /* exec */ + + VXFS_IFIFO = 0x00001000, /* Named pipe */ + VXFS_IFCHR = 0x00002000, /* Character device */ + VXFS_IFDIR = 0x00004000, /* Directory */ + VXFS_IFNAM = 0x00005000, /* Xenix device ?? */ + VXFS_IFBLK = 0x00006000, /* Block device */ + VXFS_IFREG = 0x00008000, /* Regular file */ + VXFS_IFCMP = 0x00009000, /* Compressed file ?!? */ + VXFS_IFLNK = 0x0000a000, /* Symlink */ + VXFS_IFSOC = 0x0000c000, /* Socket */ + + /* VxFS internal */ + VXFS_IFFSH = 0x10000000, /* Fileset header */ + VXFS_IFILT = 0x20000000, /* Inode list */ + VXFS_IFIAU = 0x30000000, /* Inode allocation unit */ + VXFS_IFCUT = 0x40000000, /* Current usage table */ + VXFS_IFATT = 0x50000000, /* Attr. inode */ + VXFS_IFLCT = 0x60000000, /* Link count table */ + VXFS_IFIAT = 0x70000000, /* Indirect attribute file */ + VXFS_IFEMR = 0x80000000, /* Extent map reorg file */ + VXFS_IFQUO = 0x90000000, /* BSD quota file */ + VXFS_IFPTI = 0xa0000000, /* "Pass through" inode */ + VXFS_IFLAB = 0x11000000, /* Device label file */ + VXFS_IFOLT = 0x12000000, /* OLT file */ + VXFS_IFLOG = 0x13000000, /* Log file */ + VXFS_IFEMP = 0x14000000, /* Extent map file */ + VXFS_IFEAU = 0x15000000, /* Extent AU file */ + VXFS_IFAUS = 0x16000000, /* Extent AU summary file */ + VXFS_IFDEV = 0x17000000, /* Device config file */ + +}; + +#define VXFS_TYPE_MASK 0xfffff000 + +#define VXFS_IS_TYPE(ip,type) (((ip)->vii_mode & VXFS_TYPE_MASK) == (type)) +#define VXFS_ISFIFO(x) VXFS_IS_TYPE((x),VXFS_IFIFO) +#define VXFS_ISCHR(x) VXFS_IS_TYPE((x),VXFS_IFCHR) +#define VXFS_ISDIR(x) VXFS_IS_TYPE((x),VXFS_IFDIR) +#define VXFS_ISNAM(x) VXFS_IS_TYPE((x),VXFS_IFNAM) +#define VXFS_ISBLK(x) VXFS_IS_TYPE((x),VXFS_IFBLK) +#define VXFS_ISLNK(x) VXFS_IS_TYPE((x),VXFS_IFLNK) +#define VXFS_ISREG(x) VXFS_IS_TYPE((x),VXFS_IFREG) +#define VXFS_ISCMP(x) VXFS_IS_TYPE((x),VXFS_IFCMP) +#define VXFS_ISSOC(x) VXFS_IS_TYPE((x),VXFS_IFSOC) + +#define VXFS_ISFSH(x) VXFS_IS_TYPE((x),VXFS_IFFSH) +#define VXFS_ISILT(x) VXFS_IS_TYPE((x),VXFS_IFILT) + +/* + * Inmode organisation types. + */ +enum { + VXFS_ORG_NONE = 0, /* Inode has *no* format ?!? */ + VXFS_ORG_EXT4 = 1, /* Ext4 */ + VXFS_ORG_IMMED = 2, /* All data stored in inode */ + VXFS_ORG_TYPED = 3, /* Typed extents */ +}; + +#define VXFS_IS_ORG(ip,org) ((ip)->vii_orgtype == (org)) +#define VXFS_ISNONE(ip) VXFS_IS_ORG((ip), VXFS_ORG_NONE) +#define VXFS_ISEXT4(ip) VXFS_IS_ORG((ip), VXFS_ORG_EXT4) +#define VXFS_ISIMMED(ip) VXFS_IS_ORG((ip), VXFS_ORG_IMMED) +#define VXFS_ISTYPED(ip) VXFS_IS_ORG((ip), VXFS_ORG_TYPED) + + +/* + * Get filesystem private data from VFS inode. + */ +#define VXFS_INO(ip) \ + ((struct vxfs_inode_info *)(ip)->i_private) + +/* + * Get filesystem private data from VFS superblock. + */ +#define VXFS_SBI(sbp) \ + ((struct vxfs_sb_info *)(sbp)->s_fs_info) + +#endif /* _VXFS_SUPER_H_ */ diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c new file mode 100644 index 00000000..f86fd3ca --- /dev/null +++ b/fs/freevxfs/vxfs_bmap.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - filesystem to disk block mapping. + */ +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" + + +#ifdef DIAGNOSTIC +static void +vxfs_typdump(struct vxfs_typed *typ) +{ + printk(KERN_DEBUG "type=%Lu ", typ->vt_hdr >> VXFS_TYPED_TYPESHIFT); + printk("offset=%Lx ", typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + printk("block=%x ", typ->vt_block); + printk("size=%x\n", typ->vt_size); +} +#endif + +/** + * vxfs_bmap_ext4 - do bmap for ext4 extents + * @ip: pointer to the inode we do bmap for + * @iblock: logical block. + * + * Description: + * vxfs_bmap_ext4 performs the bmap operation for inodes with + * ext4-style extents (which are much like the traditional UNIX + * inode organisation). + * + * Returns: + * The physical block number on success, else Zero. + */ +static daddr_t +vxfs_bmap_ext4(struct inode *ip, long bn) +{ + struct super_block *sb = ip->i_sb; + struct vxfs_inode_info *vip = VXFS_INO(ip); + unsigned long bsize = sb->s_blocksize; + u32 indsize = vip->vii_ext4.ve4_indsize; + int i; + + if (indsize > sb->s_blocksize) + goto fail_size; + + for (i = 0; i < VXFS_NDADDR; i++) { + struct direct *d = vip->vii_ext4.ve4_direct + i; + if (bn >= 0 && bn < d->size) + return (bn + d->extent); + bn -= d->size; + } + + if ((bn / (indsize * indsize * bsize / 4)) == 0) { + struct buffer_head *buf; + daddr_t bno; + u32 *indir; + + buf = sb_bread(sb, vip->vii_ext4.ve4_indir[0]); + if (!buf || !buffer_mapped(buf)) + goto fail_buf; + + indir = (u32 *)buf->b_data; + bno = indir[(bn/indsize) % (indsize*bn)] + (bn%indsize); + + brelse(buf); + return bno; + } else + printk(KERN_WARNING "no matching indir?"); + + return 0; + +fail_size: + printk("vxfs: indirect extent too big!\n"); +fail_buf: + return 0; +} + +/** + * vxfs_bmap_indir - recursion for vxfs_bmap_typed + * @ip: pointer to the inode we do bmap for + * @indir: indirect block we start reading at + * @size: size of the typed area to search + * @block: partially result from further searches + * + * Description: + * vxfs_bmap_indir reads a &struct vxfs_typed at @indir + * and performs the type-defined action. + * + * Return Value: + * The physical block number on success, else Zero. + * + * Note: + * Kernelstack is rare. Unrecurse? 
+ */ +static daddr_t +vxfs_bmap_indir(struct inode *ip, long indir, int size, long block) +{ + struct buffer_head *bp = NULL; + daddr_t pblock = 0; + int i; + + for (i = 0; i < size * VXFS_TYPED_PER_BLOCK(ip->i_sb); i++) { + struct vxfs_typed *typ; + int64_t off; + + bp = sb_bread(ip->i_sb, + indir + (i / VXFS_TYPED_PER_BLOCK(ip->i_sb))); + if (!bp || !buffer_mapped(bp)) + return 0; + + typ = ((struct vxfs_typed *)bp->b_data) + + (i % VXFS_TYPED_PER_BLOCK(ip->i_sb)); + off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + + if (block < off) { + brelse(bp); + continue; + } + + switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) { + case VXFS_TYPED_INDIRECT: + pblock = vxfs_bmap_indir(ip, typ->vt_block, + typ->vt_size, block - off); + if (pblock == -2) + break; + goto out; + case VXFS_TYPED_DATA: + if ((block - off) >= typ->vt_size) + break; + pblock = (typ->vt_block + block - off); + goto out; + case VXFS_TYPED_INDIRECT_DEV4: + case VXFS_TYPED_DATA_DEV4: { + struct vxfs_typed_dev4 *typ4 = + (struct vxfs_typed_dev4 *)typ; + + printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n"); + printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n", + (unsigned long long) typ4->vd4_block, + (unsigned long long) typ4->vd4_size, + typ4->vd4_dev); + goto fail; + } + default: + BUG(); + } + brelse(bp); + } + +fail: + pblock = 0; +out: + brelse(bp); + return (pblock); +} + +/** + * vxfs_bmap_typed - bmap for typed extents + * @ip: pointer to the inode we do bmap for + * @iblock: logical block + * + * Description: + * Performs the bmap operation for typed extents. + * + * Return Value: + * The physical block number on success, else Zero. + */ +static daddr_t +vxfs_bmap_typed(struct inode *ip, long iblock) +{ + struct vxfs_inode_info *vip = VXFS_INO(ip); + daddr_t pblock = 0; + int i; + + for (i = 0; i < VXFS_NTYPED; i++) { + struct vxfs_typed *typ = vip->vii_org.typed + i; + int64_t off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + +#ifdef DIAGNOSTIC + vxfs_typdump(typ); +#endif + if (iblock < off) + continue; + switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) { + case VXFS_TYPED_INDIRECT: + pblock = vxfs_bmap_indir(ip, typ->vt_block, + typ->vt_size, iblock - off); + if (pblock == -2) + break; + return (pblock); + case VXFS_TYPED_DATA: + if ((iblock - off) < typ->vt_size) + return (typ->vt_block + iblock - off); + break; + case VXFS_TYPED_INDIRECT_DEV4: + case VXFS_TYPED_DATA_DEV4: { + struct vxfs_typed_dev4 *typ4 = + (struct vxfs_typed_dev4 *)typ; + + printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n"); + printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n", + (unsigned long long) typ4->vd4_block, + (unsigned long long) typ4->vd4_size, + typ4->vd4_dev); + return 0; + } + default: + BUG(); + } + } + + return 0; +} + +/** + * vxfs_bmap1 - vxfs-internal bmap operation + * @ip: pointer to the inode we do bmap for + * @iblock: logical block + * + * Description: + * vxfs_bmap1 perfoms a logical to physical block mapping + * for vxfs-internal purposes. + * + * Return Value: + * The physical block number on success, else Zero. 
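+ *
+ * Example (illustrative get_block-style caller; the buffer_head
+ * plumbing lives elsewhere in the driver and is sketched here only
+ * for orientation):
+ *
+ *     daddr_t pblock = vxfs_bmap1(inode, iblock);
+ *
+ *     if (pblock)
+ *             map_bh(bh, inode->i_sb, pblock);
+ *
+ * A return value of zero simply leaves the buffer unmapped.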
+ */ +daddr_t +vxfs_bmap1(struct inode *ip, long iblock) +{ + struct vxfs_inode_info *vip = VXFS_INO(ip); + + if (VXFS_ISEXT4(vip)) + return vxfs_bmap_ext4(ip, iblock); + if (VXFS_ISTYPED(vip)) + return vxfs_bmap_typed(ip, iblock); + if (VXFS_ISNONE(vip)) + goto unsupp; + if (VXFS_ISIMMED(vip)) + goto unsupp; + + printk(KERN_WARNING "vxfs: inode %ld has no valid orgtype (%x)\n", + ip->i_ino, vip->vii_orgtype); + BUG(); + +unsupp: + printk(KERN_WARNING "vxfs: inode %ld has an unsupported orgtype (%x)\n", + ip->i_ino, vip->vii_orgtype); + return 0; +} diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h new file mode 100644 index 00000000..aaf1fb09 --- /dev/null +++ b/fs/freevxfs/vxfs_dir.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_DIR_H_ +#define _VXFS_DIR_H_ + +/* + * Veritas filesystem driver - directory structure. + * + * This file contains the definition of the vxfs directory format. + */ + + +/* + * VxFS directory block header. + * + * This entry is the head of every filesystem block in a directory. + * It is used for free space management and additionally includes + * a hash for speeding up directory search (lookup). + * + * The hash may be empty and in fact we do not use it all in the + * Linux driver for now. + */ +struct vxfs_dirblk { + u_int16_t d_free; /* free space in dirblock */ + u_int16_t d_nhash; /* no of hash chains */ + u_int16_t d_hash[1]; /* hash chain */ +}; + +/* + * VXFS_NAMELEN is the maximum length of the d_name field + * of an VxFS directory entry. + */ +#define VXFS_NAMELEN 256 + +/* + * VxFS directory entry. + */ +struct vxfs_direct { + vx_ino_t d_ino; /* inode number */ + u_int16_t d_reclen; /* record length */ + u_int16_t d_namelen; /* d_name length */ + u_int16_t d_hashnext; /* next hash entry */ + char d_name[VXFS_NAMELEN]; /* name */ +}; + +/* + * VXFS_DIRPAD defines the directory entry boundaries, is _must_ be + * a multiple of four. + * VXFS_NAMEMIN is the length of a directory entry with a NULL d_name. + * VXFS_DIRROUND is an internal macros that rounds a length to a value + * usable for directory sizes. 
+ * VXFS_DIRLEN calculates the directory entry size for an entry with + * a d_name with size len. + */ +#define VXFS_DIRPAD 4 +#define VXFS_NAMEMIN offsetof(struct vxfs_direct, d_name) +#define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1)) +#define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len))) + +/* + * VXFS_DIRBLKOV is the overhead of a specific dirblock. + */ +#define VXFS_DIRBLKOV(dbp) ((sizeof(short) * dbp->d_nhash) + 4) + +#endif /* _VXFS_DIR_H_ */ diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h new file mode 100644 index 00000000..881aa3d2 --- /dev/null +++ b/fs/freevxfs/vxfs_extern.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_EXTERN_H_ +#define _VXFS_EXTERN_H_ + +/* + * Veritas filesystem driver - external prototypes. + * + * This file contains prototypes for all vxfs functions used + * outside their respective source files. 
+ */ + + +struct kmem_cache; +struct super_block; +struct vxfs_inode_info; +struct inode; + + +/* vxfs_bmap.c */ +extern daddr_t vxfs_bmap1(struct inode *, long); + +/* vxfs_fshead.c */ +extern int vxfs_read_fshead(struct super_block *); + +/* vxfs_immed.c */ +extern const struct inode_operations vxfs_immed_symlink_iops; + +/* vxfs_inode.c */ +extern const struct address_space_operations vxfs_immed_aops; +extern struct kmem_cache *vxfs_inode_cachep; +extern void vxfs_dumpi(struct vxfs_inode_info *, ino_t); +extern struct inode * vxfs_get_fake_inode(struct super_block *, + struct vxfs_inode_info *); +extern void vxfs_put_fake_inode(struct inode *); +extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t); +extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t); +extern struct inode * vxfs_iget(struct super_block *, ino_t); +extern void vxfs_evict_inode(struct inode *); + +/* vxfs_lookup.c */ +extern const struct inode_operations vxfs_dir_inode_ops; +extern const struct file_operations vxfs_dir_operations; + +/* vxfs_olt.c */ +extern int vxfs_read_olt(struct super_block *, u_long); + +/* vxfs_subr.c */ +extern const struct address_space_operations vxfs_aops; +extern struct page * vxfs_get_page(struct address_space *, u_long); +extern void vxfs_put_page(struct page *); +extern struct buffer_head * vxfs_bread(struct inode *, int); + +#endif /* _VXFS_EXTERN_H_ */ diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c new file mode 100644 index 00000000..c9a6a94e --- /dev/null +++ b/fs/freevxfs/vxfs_fshead.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - fileset header routines. 
+ */ +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" +#include "vxfs_fshead.h" + + +#ifdef DIAGNOSTIC +static void +vxfs_dumpfsh(struct vxfs_fsh *fhp) +{ + printk("\n\ndumping fileset header:\n"); + printk("----------------------------\n"); + printk("version: %u\n", fhp->fsh_version); + printk("fsindex: %u\n", fhp->fsh_fsindex); + printk("iauino: %u\tninodes:%u\n", + fhp->fsh_iauino, fhp->fsh_ninodes); + printk("maxinode: %u\tlctino: %u\n", + fhp->fsh_maxinode, fhp->fsh_lctino); + printk("nau: %u\n", fhp->fsh_nau); + printk("ilistino[0]: %u\tilistino[1]: %u\n", + fhp->fsh_ilistino[0], fhp->fsh_ilistino[1]); +} +#endif + +/** + * vxfs_getfsh - read fileset header into memory + * @ip: the (fake) fileset header inode + * @which: 0 for the structural, 1 for the primary fsh. + * + * Description: + * vxfs_getfsh reads either the structural or primary fileset header + * described by @ip into memory. + * + * Returns: + * The fileset header structure on success, else Zero. + */ +static struct vxfs_fsh * +vxfs_getfsh(struct inode *ip, int which) +{ + struct buffer_head *bp; + + bp = vxfs_bread(ip, which); + if (bp) { + struct vxfs_fsh *fhp; + + if (!(fhp = kmalloc(sizeof(*fhp), GFP_KERNEL))) + goto out; + memcpy(fhp, bp->b_data, sizeof(*fhp)); + + put_bh(bp); + return (fhp); + } +out: + brelse(bp); + return NULL; +} + +/** + * vxfs_read_fshead - read the fileset headers + * @sbp: superblock to which the fileset belongs + * + * Description: + * vxfs_read_fshead will fill the inode and structural inode list in @sb. + * + * Returns: + * Zero on success, else a negative error code (-EINVAL). + */ +int +vxfs_read_fshead(struct super_block *sbp) +{ + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + struct vxfs_fsh *pfp, *sfp; + struct vxfs_inode_info *vip, *tip; + + vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino); + if (!vip) { + printk(KERN_ERR "vxfs: unable to read fsh inode\n"); + return -EINVAL; + } + if (!VXFS_ISFSH(vip)) { + printk(KERN_ERR "vxfs: fsh list inode is of wrong type (%x)\n", + vip->vii_mode & VXFS_TYPE_MASK); + goto out_free_fship; + } + + +#ifdef DIAGNOSTIC + printk("vxfs: fsh inode dump:\n"); + vxfs_dumpi(vip, infp->vsi_fshino); +#endif + + infp->vsi_fship = vxfs_get_fake_inode(sbp, vip); + if (!infp->vsi_fship) { + printk(KERN_ERR "vxfs: unable to get fsh inode\n"); + goto out_free_fship; + } + + sfp = vxfs_getfsh(infp->vsi_fship, 0); + if (!sfp) { + printk(KERN_ERR "vxfs: unable to get structural fsh\n"); + goto out_iput_fship; + } + +#ifdef DIAGNOSTIC + vxfs_dumpfsh(sfp); +#endif + + pfp = vxfs_getfsh(infp->vsi_fship, 1); + if (!pfp) { + printk(KERN_ERR "vxfs: unable to get primary fsh\n"); + goto out_free_sfp; + } + +#ifdef DIAGNOSTIC + vxfs_dumpfsh(pfp); +#endif + + tip = vxfs_blkiget(sbp, infp->vsi_iext, sfp->fsh_ilistino[0]); + if (!tip) + goto out_free_pfp; + + infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip); + if (!infp->vsi_stilist) { + printk(KERN_ERR "vxfs: unable to get structural list inode\n"); + kfree(tip); + goto out_free_pfp; + } + if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) { + printk(KERN_ERR "vxfs: structural list inode is of wrong type (%x)\n", + VXFS_INO(infp->vsi_stilist)->vii_mode & VXFS_TYPE_MASK); + goto out_iput_stilist; + } + + tip = vxfs_stiget(sbp, pfp->fsh_ilistino[0]); + if (!tip) + goto out_iput_stilist; + infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip); + if (!infp->vsi_ilist) { + printk(KERN_ERR "vxfs: unable to get inode list inode\n"); + kfree(tip); + goto 
out_iput_stilist; + } + if (!VXFS_ISILT(VXFS_INO(infp->vsi_ilist))) { + printk(KERN_ERR "vxfs: inode list inode is of wrong type (%x)\n", + VXFS_INO(infp->vsi_ilist)->vii_mode & VXFS_TYPE_MASK); + goto out_iput_ilist; + } + + return 0; + + out_iput_ilist: + iput(infp->vsi_ilist); + out_iput_stilist: + iput(infp->vsi_stilist); + out_free_pfp: + kfree(pfp); + out_free_sfp: + kfree(sfp); + out_iput_fship: + iput(infp->vsi_fship); + return -EINVAL; + out_free_fship: + kfree(vip); + return -EINVAL; +} diff --git a/fs/freevxfs/vxfs_fshead.h b/fs/freevxfs/vxfs_fshead.h new file mode 100644 index 00000000..ead0d640 --- /dev/null +++ b/fs/freevxfs/vxfs_fshead.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_FSHEAD_H_ +#define _VXFS_FSHEAD_H_ + +/* + * Veritas filesystem driver - fileset header structures. + * + * This file contains the physical structure of the VxFS + * fileset header. + */ + + +/* + * Fileset header + */ +struct vxfs_fsh { + u_int32_t fsh_version; /* fileset header version */ + u_int32_t fsh_fsindex; /* fileset index */ + u_int32_t fsh_time; /* modification time - sec */ + u_int32_t fsh_utime; /* modification time - usec */ + u_int32_t fsh_extop; /* extop flags */ + vx_ino_t fsh_ninodes; /* allocated inodes */ + u_int32_t fsh_nau; /* number of IAUs */ + u_int32_t fsh_old_ilesize; /* old size of ilist */ + u_int32_t fsh_dflags; /* flags */ + u_int32_t fsh_quota; /* quota limit */ + vx_ino_t fsh_maxinode; /* maximum inode number */ + vx_ino_t fsh_iauino; /* IAU inode */ + vx_ino_t fsh_ilistino[2]; /* ilist inodes */ + vx_ino_t fsh_lctino; /* link count table inode */ + + /* + * Slightly more fields follow, but they + * a) are not of any interest for us, and + * b) differ a lot in different vxfs versions/ports + */ +}; + +#endif /* _VXFS_FSHEAD_H_ */ diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c new file mode 100644 index 00000000..c36aeaf9 --- /dev/null +++ b/fs/freevxfs/vxfs_immed.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - support for 'immed' inodes. + */ +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_extern.h" +#include "vxfs_inode.h" + + +static void * vxfs_immed_follow_link(struct dentry *, struct nameidata *); + +static int vxfs_immed_readpage(struct file *, struct page *); + +/* + * Inode operations for immed symlinks. + * + * Unliked all other operations we do not go through the pagecache, + * but do all work directly on the inode. + */ +const struct inode_operations vxfs_immed_symlink_iops = { + .readlink = generic_readlink, + .follow_link = vxfs_immed_follow_link, +}; + +/* + * Address space operations for immed files and directories. + */ +const struct address_space_operations vxfs_immed_aops = { + .readpage = vxfs_immed_readpage, +}; + +/** + * vxfs_immed_follow_link - follow immed symlink + * @dp: dentry for the link + * @np: pathname lookup data for the current path walk + * + * Description: + * vxfs_immed_follow_link restarts the pathname lookup with + * the data obtained from @dp. + * + * Returns: + * Zero on success, else a negative error code. + */ +static void * +vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np) +{ + struct vxfs_inode_info *vip = VXFS_INO(dp->d_inode); + nd_set_link(np, vip->vii_immed.vi_immed); + return NULL; +} + +/** + * vxfs_immed_readpage - read part of an immed inode into pagecache + * @file: file context (unused) + * @page: page frame to fill in. + * + * Description: + * vxfs_immed_readpage reads a part of the immed area of the + * file that hosts @pp into the pagecache. + * + * Returns: + * Zero on success, else a negative error code. + * + * Locking status: + * @page is locked and will be unlocked. 
+ */ +static int +vxfs_immed_readpage(struct file *fp, struct page *pp) +{ + struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); + u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT; + caddr_t kaddr; + + kaddr = kmap(pp); + memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE); + kunmap(pp); + + flush_dcache_page(pp); + SetPageUptodate(pp); + unlock_page(pp); + + return 0; +} diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c new file mode 100644 index 00000000..1a431143 --- /dev/null +++ b/fs/freevxfs/vxfs_inode.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - inode routines. + */ +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" + + +struct kmem_cache *vxfs_inode_cachep; + + +#ifdef DIAGNOSTIC +/* + * Dump inode contents (partially). + */ +void +vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino) +{ + printk(KERN_DEBUG "\n\n"); + if (ino) + printk(KERN_DEBUG "dumping vxfs inode %ld\n", ino); + else + printk(KERN_DEBUG "dumping unknown vxfs inode\n"); + + printk(KERN_DEBUG "---------------------------\n"); + printk(KERN_DEBUG "mode is %x\n", vip->vii_mode); + printk(KERN_DEBUG "nlink:%u, uid:%u, gid:%u\n", + vip->vii_nlink, vip->vii_uid, vip->vii_gid); + printk(KERN_DEBUG "size:%Lx, blocks:%u\n", + vip->vii_size, vip->vii_blocks); + printk(KERN_DEBUG "orgtype:%u\n", vip->vii_orgtype); +} +#endif + + +/** + * vxfs_blkiget - find inode based on extent # + * @sbp: superblock of the filesystem we search in + * @extent: number of the extent to search + * @ino: inode number to search + * + * Description: + * vxfs_blkiget searches inode @ino in the filesystem described by + * @sbp in the extent @extent. + * Returns the matching VxFS inode on success, else a NULL pointer. + * + * NOTE: + * While __vxfs_iget uses the pagecache vxfs_blkiget uses the + * buffercache. This function should not be used outside the + * read_super() method, otherwise the data may be incoherent. 
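+ *
+ * For example, with a 1024-byte block size and VXFS_ISIZE of 256
+ * bytes, inode 5 of extent E is read from block E + (5 * 256) / 1024,
+ * i.e. E + 1, at byte offset (5 % 4) * 256 = 256 within that block.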
+ */ +struct vxfs_inode_info * +vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino) +{ + struct buffer_head *bp; + u_long block, offset; + + block = extent + ((ino * VXFS_ISIZE) / sbp->s_blocksize); + offset = ((ino % (sbp->s_blocksize / VXFS_ISIZE)) * VXFS_ISIZE); + bp = sb_bread(sbp, block); + + if (bp && buffer_mapped(bp)) { + struct vxfs_inode_info *vip; + struct vxfs_dinode *dip; + + if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) + goto fail; + dip = (struct vxfs_dinode *)(bp->b_data + offset); + memcpy(vip, dip, sizeof(*vip)); +#ifdef DIAGNOSTIC + vxfs_dumpi(vip, ino); +#endif + brelse(bp); + return (vip); + } + +fail: + printk(KERN_WARNING "vxfs: unable to read block %ld\n", block); + brelse(bp); + return NULL; +} + +/** + * __vxfs_iget - generic find inode facility + * @sbp: VFS superblock + * @ino: inode number + * @ilistp: inode list + * + * Description: + * Search the for inode number @ino in the filesystem + * described by @sbp. Use the specified inode table (@ilistp). + * Returns the matching VxFS inode on success, else an error code. + */ +static struct vxfs_inode_info * +__vxfs_iget(ino_t ino, struct inode *ilistp) +{ + struct page *pp; + u_long offset; + + offset = (ino % (PAGE_SIZE / VXFS_ISIZE)) * VXFS_ISIZE; + pp = vxfs_get_page(ilistp->i_mapping, ino * VXFS_ISIZE / PAGE_SIZE); + + if (!IS_ERR(pp)) { + struct vxfs_inode_info *vip; + struct vxfs_dinode *dip; + caddr_t kaddr = (char *)page_address(pp); + + if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) + goto fail; + dip = (struct vxfs_dinode *)(kaddr + offset); + memcpy(vip, dip, sizeof(*vip)); +#ifdef DIAGNOSTIC + vxfs_dumpi(vip, ino); +#endif + vxfs_put_page(pp); + return (vip); + } + + printk(KERN_WARNING "vxfs: error on page %p\n", pp); + return ERR_CAST(pp); + +fail: + printk(KERN_WARNING "vxfs: unable to read inode %ld\n", (unsigned long)ino); + vxfs_put_page(pp); + return ERR_PTR(-ENOMEM); +} + +/** + * vxfs_stiget - find inode using the structural inode list + * @sbp: VFS superblock + * @ino: inode # + * + * Description: + * Find inode @ino in the filesystem described by @sbp using + * the structural inode list. + * Returns the matching VxFS inode on success, else a NULL pointer. + */ +struct vxfs_inode_info * +vxfs_stiget(struct super_block *sbp, ino_t ino) +{ + struct vxfs_inode_info *vip; + + vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_stilist); + return IS_ERR(vip) ? NULL : vip; +} + +/** + * vxfs_transmod - mode for a VxFS inode + * @vip: VxFS inode + * + * Description: + * vxfs_transmod returns a Linux mode_t for a given + * VxFS inode structure. + */ +static __inline__ mode_t +vxfs_transmod(struct vxfs_inode_info *vip) +{ + mode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK; + + if (VXFS_ISFIFO(vip)) + ret |= S_IFIFO; + if (VXFS_ISCHR(vip)) + ret |= S_IFCHR; + if (VXFS_ISDIR(vip)) + ret |= S_IFDIR; + if (VXFS_ISBLK(vip)) + ret |= S_IFBLK; + if (VXFS_ISLNK(vip)) + ret |= S_IFLNK; + if (VXFS_ISREG(vip)) + ret |= S_IFREG; + if (VXFS_ISSOC(vip)) + ret |= S_IFSOCK; + + return (ret); +} + +/** + * vxfs_iinit- helper to fill inode fields + * @ip: VFS inode + * @vip: VxFS inode + * + * Description: + * vxfs_instino is a helper function to fill in all relevant + * fields in @ip from @vip. 
+ */ +static void +vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) +{ + + ip->i_mode = vxfs_transmod(vip); + ip->i_uid = (uid_t)vip->vii_uid; + ip->i_gid = (gid_t)vip->vii_gid; + + ip->i_nlink = vip->vii_nlink; + ip->i_size = vip->vii_size; + + ip->i_atime.tv_sec = vip->vii_atime; + ip->i_ctime.tv_sec = vip->vii_ctime; + ip->i_mtime.tv_sec = vip->vii_mtime; + ip->i_atime.tv_nsec = 0; + ip->i_ctime.tv_nsec = 0; + ip->i_mtime.tv_nsec = 0; + + ip->i_blocks = vip->vii_blocks; + ip->i_generation = vip->vii_gen; + + ip->i_private = vip; + +} + +/** + * vxfs_get_fake_inode - get fake inode structure + * @sbp: filesystem superblock + * @vip: fspriv inode + * + * Description: + * vxfs_fake_inode gets a fake inode (not in the inode hash) for a + * superblock, vxfs_inode pair. + * Returns the filled VFS inode. + */ +struct inode * +vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) +{ + struct inode *ip = NULL; + + if ((ip = new_inode(sbp))) { + ip->i_ino = get_next_ino(); + vxfs_iinit(ip, vip); + ip->i_mapping->a_ops = &vxfs_aops; + } + return (ip); +} + +/** + * vxfs_put_fake_inode - free faked inode + * *ip: VFS inode + * + * Description: + * vxfs_put_fake_inode frees all data associated with @ip. + */ +void +vxfs_put_fake_inode(struct inode *ip) +{ + iput(ip); +} + +/** + * vxfs_iget - get an inode + * @sbp: the superblock to get the inode for + * @ino: the number of the inode to get + * + * Description: + * vxfs_read_inode creates an inode, reads the disk inode for @ino and fills + * in all relevant fields in the new inode. + */ +struct inode * +vxfs_iget(struct super_block *sbp, ino_t ino) +{ + struct vxfs_inode_info *vip; + const struct address_space_operations *aops; + struct inode *ip; + + ip = iget_locked(sbp, ino); + if (!ip) + return ERR_PTR(-ENOMEM); + if (!(ip->i_state & I_NEW)) + return ip; + + vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist); + if (IS_ERR(vip)) { + iget_failed(ip); + return ERR_CAST(vip); + } + + vxfs_iinit(ip, vip); + + if (VXFS_ISIMMED(vip)) + aops = &vxfs_immed_aops; + else + aops = &vxfs_aops; + + if (S_ISREG(ip->i_mode)) { + ip->i_fop = &generic_ro_fops; + ip->i_mapping->a_ops = aops; + } else if (S_ISDIR(ip->i_mode)) { + ip->i_op = &vxfs_dir_inode_ops; + ip->i_fop = &vxfs_dir_operations; + ip->i_mapping->a_ops = aops; + } else if (S_ISLNK(ip->i_mode)) { + if (!VXFS_ISIMMED(vip)) { + ip->i_op = &page_symlink_inode_operations; + ip->i_mapping->a_ops = &vxfs_aops; + } else { + ip->i_op = &vxfs_immed_symlink_iops; + vip->vii_immed.vi_immed[ip->i_size] = '\0'; + } + } else + init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); + + unlock_new_inode(ip); + return ip; +} + +static void vxfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(vxfs_inode_cachep, inode->i_private); +} + +/** + * vxfs_evict_inode - remove inode from main memory + * @ip: inode to discard. + * + * Description: + * vxfs_evict_inode() is called on the final iput and frees the private + * inode area. + */ +void +vxfs_evict_inode(struct inode *ip) +{ + truncate_inode_pages(&ip->i_data, 0); + end_writeback(ip); + call_rcu(&ip->i_rcu, vxfs_i_callback); +} diff --git a/fs/freevxfs/vxfs_inode.h b/fs/freevxfs/vxfs_inode.h new file mode 100644 index 00000000..240aeb11 --- /dev/null +++ b/fs/freevxfs/vxfs_inode.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_INODE_H_ +#define _VXFS_INODE_H_ + +/* + * Veritas filesystem driver - inode structure. + * + * This file contains the definition of the disk and core + * inodes of the Veritas Filesystem. + */ + + +#define VXFS_ISIZE 0x100 /* Inode size */ + +#define VXFS_NDADDR 10 /* Number of direct addrs in inode */ +#define VXFS_NIADDR 2 /* Number of indirect addrs in inode */ +#define VXFS_NIMMED 96 /* Size of immediate data in inode */ +#define VXFS_NTYPED 6 /* Num of typed extents */ + +#define VXFS_TYPED_OFFSETMASK (0x00FFFFFFFFFFFFFFULL) +#define VXFS_TYPED_TYPEMASK (0xFF00000000000000ULL) +#define VXFS_TYPED_TYPESHIFT 56 + +#define VXFS_TYPED_PER_BLOCK(sbp) \ + ((sbp)->s_blocksize / sizeof(struct vxfs_typed)) + +/* + * Possible extent descriptor types for %VXFS_ORG_TYPED extents. + */ +enum { + VXFS_TYPED_INDIRECT = 1, + VXFS_TYPED_DATA = 2, + VXFS_TYPED_INDIRECT_DEV4 = 3, + VXFS_TYPED_DATA_DEV4 = 4, +}; + +/* + * Data stored immediately in the inode. + */ +struct vxfs_immed { + u_int8_t vi_immed[VXFS_NIMMED]; +}; + +struct vxfs_ext4 { + u_int32_t ve4_spare; /* ?? */ + u_int32_t ve4_indsize; /* Indirect extent size */ + vx_daddr_t ve4_indir[VXFS_NIADDR]; /* Indirect extents */ + struct direct { /* Direct extents */ + vx_daddr_t extent; /* Extent number */ + int32_t size; /* Size of extent */ + } ve4_direct[VXFS_NDADDR]; +}; + +struct vxfs_typed { + u_int64_t vt_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ + vx_daddr_t vt_block; /* Extent block */ + int32_t vt_size; /* Size in blocks */ +}; + +struct vxfs_typed_dev4 { + u_int64_t vd4_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ + u_int64_t vd4_block; /* Extent block */ + u_int64_t vd4_size; /* Size in blocks */ + int32_t vd4_dev; /* Device ID */ + u_int32_t __pad1; +}; + +/* + * The inode as contained on the physical device. 
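The vt_hdr field of struct vxfs_typed packs the descriptor type into the top byte and the logical offset into the low 56 bits, as the three VXFS_TYPED_* constants above describe. A small stand-alone sketch of splitting such a header; the sample value is made up, only the masks and shift are taken from this header.

#include <stdio.h>
#include <stdint.h>

#define VXFS_TYPED_OFFSETMASK	(0x00FFFFFFFFFFFFFFULL)
#define VXFS_TYPED_TYPEMASK	(0xFF00000000000000ULL)
#define VXFS_TYPED_TYPESHIFT	56

int main(void)
{
	/* hypothetical header: type 2 (VXFS_TYPED_DATA), logical offset 0x1234 */
	uint64_t hdr = ((uint64_t)2 << VXFS_TYPED_TYPESHIFT) | 0x1234;

	unsigned type   = (unsigned)((hdr & VXFS_TYPED_TYPEMASK) >> VXFS_TYPED_TYPESHIFT);
	uint64_t offset = hdr & VXFS_TYPED_OFFSETMASK;

	printf("type %u, offset %llu\n", type, (unsigned long long)offset);
	return 0;
}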
+ */ +struct vxfs_dinode { + int32_t vdi_mode; + u_int32_t vdi_nlink; /* Link count */ + u_int32_t vdi_uid; /* UID */ + u_int32_t vdi_gid; /* GID */ + u_int64_t vdi_size; /* Inode size in bytes */ + u_int32_t vdi_atime; /* Last time accessed - sec */ + u_int32_t vdi_autime; /* Last time accessed - usec */ + u_int32_t vdi_mtime; /* Last modify time - sec */ + u_int32_t vdi_mutime; /* Last modify time - usec */ + u_int32_t vdi_ctime; /* Create time - sec */ + u_int32_t vdi_cutime; /* Create time - usec */ + u_int8_t vdi_aflags; /* Allocation flags */ + u_int8_t vdi_orgtype; /* Organisation type */ + u_int16_t vdi_eopflags; + u_int32_t vdi_eopdata; + union { + u_int32_t rdev; + u_int32_t dotdot; + struct { + u_int32_t reserved; + u_int32_t fixextsize; + } i_regular; + struct { + u_int32_t matchino; + u_int32_t fsetindex; + } i_vxspec; + u_int64_t align; + } vdi_ftarea; + u_int32_t vdi_blocks; /* How much blocks does inode occupy */ + u_int32_t vdi_gen; /* Inode generation */ + u_int64_t vdi_version; /* Version */ + union { + struct vxfs_immed immed; + struct vxfs_ext4 ext4; + struct vxfs_typed typed[VXFS_NTYPED]; + } vdi_org; + u_int32_t vdi_iattrino; +}; + +#define vdi_rdev vdi_ftarea.rdev +#define vdi_dotdot vdi_ftarea.dotdot +#define vdi_fixextsize vdi_ftarea.regular.fixextsize +#define vdi_matchino vdi_ftarea.vxspec.matchino +#define vdi_fsetindex vdi_ftarea.vxspec.fsetindex + +#define vdi_immed vdi_org.immed +#define vdi_ext4 vdi_org.ext4 +#define vdi_typed vdi_org.typed + + +/* + * The inode as represented in the main memory. + * + * TBD: This should become a separate structure... + */ +#define vxfs_inode_info vxfs_dinode + +#define vii_mode vdi_mode +#define vii_uid vdi_uid +#define vii_gid vdi_gid +#define vii_nlink vdi_nlink +#define vii_size vdi_size +#define vii_atime vdi_atime +#define vii_ctime vdi_ctime +#define vii_mtime vdi_mtime +#define vii_blocks vdi_blocks +#define vii_org vdi_org +#define vii_orgtype vdi_orgtype +#define vii_gen vdi_gen + +#define vii_rdev vdi_ftarea.rdev +#define vii_dotdot vdi_ftarea.dotdot +#define vii_fixextsize vdi_ftarea.regular.fixextsize +#define vii_matchino vdi_ftarea.vxspec.matchino +#define vii_fsetindex vdi_ftarea.vxspec.fsetindex + +#define vii_immed vdi_org.immed +#define vii_ext4 vdi_org.ext4 +#define vii_typed vdi_org.typed + +#endif /* _VXFS_INODE_H_ */ diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c new file mode 100644 index 00000000..3360f1e6 --- /dev/null +++ b/fs/freevxfs/vxfs_lookup.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - lookup and other directory related code. + */ +#include +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_dir.h" +#include "vxfs_inode.h" +#include "vxfs_extern.h" + +/* + * Number of VxFS blocks per page. + */ +#define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_CACHE_SIZE / (sbp)->s_blocksize)) + + +static struct dentry * vxfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static int vxfs_readdir(struct file *, void *, filldir_t); + +const struct inode_operations vxfs_dir_inode_ops = { + .lookup = vxfs_lookup, +}; + +const struct file_operations vxfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = vxfs_readdir, +}; + + +static inline u_long +dir_pages(struct inode *inode) +{ + return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +static inline u_long +dir_blocks(struct inode *ip) +{ + u_long bsize = ip->i_sb->s_blocksize; + return (ip->i_size + bsize - 1) & ~(bsize - 1); +} + +/* + * NOTE! unlike strncmp, vxfs_match returns 1 for success, 0 for failure. + * + * len <= VXFS_NAMELEN and de != NULL are guaranteed by caller. + */ +static inline int +vxfs_match(int len, const char * const name, struct vxfs_direct *de) +{ + if (len != de->d_namelen) + return 0; + if (!de->d_ino) + return 0; + return !memcmp(name, de->d_name, len); +} + +static inline struct vxfs_direct * +vxfs_next_entry(struct vxfs_direct *de) +{ + return ((struct vxfs_direct *)((char*)de + de->d_reclen)); +} + +/** + * vxfs_find_entry - find a mathing directory entry for a dentry + * @ip: directory inode + * @dp: dentry for which we want to find a direct + * @ppp: gets filled with the page the return value sits in + * + * Description: + * vxfs_find_entry finds a &struct vxfs_direct for the VFS directory + * cache entry @dp. @ppp will be filled with the page the return + * value resides in. + * + * Returns: + * The wanted direct on success, else a NULL pointer. 
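The directory helpers above (vxfs_match(), vxfs_next_entry()) and vxfs_find_entry(), whose body follows, all rely on the same walk over variable-length records: stop on a zero d_reclen, skip entries with a zero d_ino, compare the length before the bytes. The sketch below reproduces that walk in user space under assumed types; struct demo_direct is a simplified stand-in for struct vxfs_direct (the real on-disk layout lives in vxfs_dir.h, which is not shown here), and demo_lookup() is invented for illustration.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* simplified stand-in for struct vxfs_direct */
struct demo_direct {
	uint32_t d_ino;		/* 0 means unused slot */
	uint16_t d_reclen;	/* distance to the next record */
	uint16_t d_namelen;
	char     d_name[32];
};

/* walk one directory block the way vxfs_find_entry() does */
static uint32_t demo_lookup(char *block, size_t bsize,
			    const char *name, size_t len)
{
	char *p = block, *limit = block + bsize;

	while (p + sizeof(struct demo_direct) <= limit) {
		struct demo_direct *de = (struct demo_direct *)p;

		if (!de->d_reclen)
			break;
		if (de->d_ino && len == de->d_namelen &&
		    !memcmp(name, de->d_name, len))
			return de->d_ino;
		p += de->d_reclen;
	}
	return 0;
}

int main(void)
{
	struct demo_direct block[4] = { { 0 } };

	block[0].d_ino = 17;
	block[0].d_reclen = sizeof(struct demo_direct);
	block[0].d_namelen = 5;
	memcpy(block[0].d_name, "hello", 5);

	printf("inode of \"hello\": %u\n",
	       demo_lookup((char *)block, sizeof(block), "hello", 5));
	return 0;
}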
+ */ +static struct vxfs_direct * +vxfs_find_entry(struct inode *ip, struct dentry *dp, struct page **ppp) +{ + u_long npages, page, nblocks, pblocks, block; + u_long bsize = ip->i_sb->s_blocksize; + const char *name = dp->d_name.name; + int namelen = dp->d_name.len; + + npages = dir_pages(ip); + nblocks = dir_blocks(ip); + pblocks = VXFS_BLOCK_PER_PAGE(ip->i_sb); + + for (page = 0; page < npages; page++) { + caddr_t kaddr; + struct page *pp; + + pp = vxfs_get_page(ip->i_mapping, page); + if (IS_ERR(pp)) + continue; + kaddr = (caddr_t)page_address(pp); + + for (block = 0; block <= nblocks && block <= pblocks; block++) { + caddr_t baddr, limit; + struct vxfs_dirblk *dbp; + struct vxfs_direct *de; + + baddr = kaddr + (block * bsize); + limit = baddr + bsize - VXFS_DIRLEN(1); + + dbp = (struct vxfs_dirblk *)baddr; + de = (struct vxfs_direct *)(baddr + VXFS_DIRBLKOV(dbp)); + + for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { + if (!de->d_reclen) + break; + if (!de->d_ino) + continue; + if (vxfs_match(namelen, name, de)) { + *ppp = pp; + return (de); + } + } + } + vxfs_put_page(pp); + } + + return NULL; +} + +/** + * vxfs_inode_by_name - find inode number for dentry + * @dip: directory to search in + * @dp: dentry we search for + * + * Description: + * vxfs_inode_by_name finds out the inode number of + * the path component described by @dp in @dip. + * + * Returns: + * The wanted inode number on success, else Zero. + */ +static ino_t +vxfs_inode_by_name(struct inode *dip, struct dentry *dp) +{ + struct vxfs_direct *de; + struct page *pp; + ino_t ino = 0; + + de = vxfs_find_entry(dip, dp, &pp); + if (de) { + ino = de->d_ino; + kunmap(pp); + page_cache_release(pp); + } + + return (ino); +} + +/** + * vxfs_lookup - lookup pathname component + * @dip: dir in which we lookup + * @dp: dentry we lookup + * @nd: lookup nameidata + * + * Description: + * vxfs_lookup tries to lookup the pathname component described + * by @dp in @dip. + * + * Returns: + * A NULL-pointer on success, else an negative error code encoded + * in the return pointer. + */ +static struct dentry * +vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd) +{ + struct inode *ip = NULL; + ino_t ino; + + if (dp->d_name.len > VXFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = vxfs_inode_by_name(dip, dp); + if (ino) { + ip = vxfs_iget(dip->i_sb, ino); + if (IS_ERR(ip)) + return ERR_CAST(ip); + } + d_add(dp, ip); + return NULL; +} + +/** + * vxfs_readdir - read a directory + * @fp: the directory to read + * @retp: return buffer + * @filler: filldir callback + * + * Description: + * vxfs_readdir fills @retp with directory entries from @fp + * using the VFS supplied callback @filler. + * + * Returns: + * Zero. 
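vxfs_readdir(), whose body follows, encodes the directory position as ((page << PAGE_CACHE_SHIFT) | offset) + 2, with positions 0 and 1 reserved for "." and "..". The sketch below reproduces that encoding and its decoding in user space, assuming 4 KiB pages; the DEMO_* macros are illustrative stand-ins for the kernel's page constants.

#include <stdio.h>

#define DEMO_PAGE_SHIFT	12			/* assume 4 KiB pages */
#define DEMO_PAGE_SIZE	(1UL << DEMO_PAGE_SHIFT)
#define DEMO_PAGE_MASK	(~(DEMO_PAGE_SIZE - 1))

int main(void)
{
	/* entry at byte 0x180 of directory page 3; 0 and 1 are "." and ".." */
	unsigned long pos = ((3UL << DEMO_PAGE_SHIFT) | 0x180) + 2;

	/* decoding, as done at the top of vxfs_readdir() */
	unsigned long raw    = pos - 2;
	unsigned long page   = raw >> DEMO_PAGE_SHIFT;
	unsigned long offset = raw & ~DEMO_PAGE_MASK;

	printf("pos %lu -> page %lu, offset %#lx\n", pos, page, offset);
	return 0;
}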
+ */ +static int +vxfs_readdir(struct file *fp, void *retp, filldir_t filler) +{ + struct inode *ip = fp->f_path.dentry->d_inode; + struct super_block *sbp = ip->i_sb; + u_long bsize = sbp->s_blocksize; + u_long page, npages, block, pblocks, nblocks, offset; + loff_t pos; + + switch ((long)fp->f_pos) { + case 0: + if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) + goto out; + fp->f_pos++; + /* fallthrough */ + case 1: + if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0) + goto out; + fp->f_pos++; + /* fallthrough */ + } + + pos = fp->f_pos - 2; + + if (pos > VXFS_DIRROUND(ip->i_size)) + return 0; + + npages = dir_pages(ip); + nblocks = dir_blocks(ip); + pblocks = VXFS_BLOCK_PER_PAGE(sbp); + + page = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; + + for (; page < npages; page++, block = 0) { + caddr_t kaddr; + struct page *pp; + + pp = vxfs_get_page(ip->i_mapping, page); + if (IS_ERR(pp)) + continue; + kaddr = (caddr_t)page_address(pp); + + for (; block <= nblocks && block <= pblocks; block++) { + caddr_t baddr, limit; + struct vxfs_dirblk *dbp; + struct vxfs_direct *de; + + baddr = kaddr + (block * bsize); + limit = baddr + bsize - VXFS_DIRLEN(1); + + dbp = (struct vxfs_dirblk *)baddr; + de = (struct vxfs_direct *) + (offset ? + (kaddr + offset) : + (baddr + VXFS_DIRBLKOV(dbp))); + + for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { + int over; + + if (!de->d_reclen) + break; + if (!de->d_ino) + continue; + + offset = (caddr_t)de - kaddr; + over = filler(retp, de->d_name, de->d_namelen, + ((page << PAGE_CACHE_SHIFT) | offset) + 2, + de->d_ino, DT_UNKNOWN); + if (over) { + vxfs_put_page(pp); + goto done; + } + } + offset = 0; + } + vxfs_put_page(pp); + offset = 0; + } + +done: + fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; +out: + return 0; +} diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c new file mode 100644 index 00000000..04950084 --- /dev/null +++ b/fs/freevxfs/vxfs_olt.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Veritas filesystem driver - object location table support. + */ +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_olt.h" +#include "vxfs_extern.h" + + +static inline void +vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp) +{ + BUG_ON(infp->vsi_fshino); + infp->vsi_fshino = fshp->olt_fsino[0]; +} + +static inline void +vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp) +{ + BUG_ON(infp->vsi_iext); + infp->vsi_iext = ilistp->olt_iext[0]; +} + +static inline u_long +vxfs_oblock(struct super_block *sbp, daddr_t block, u_long bsize) +{ + BUG_ON(sbp->s_blocksize % bsize); + return (block * (sbp->s_blocksize / bsize)); +} + + +/** + * vxfs_read_olt - read olt + * @sbp: superblock of the filesystem + * @bsize: blocksize of the filesystem + * + * Description: + * vxfs_read_olt reads the olt of the filesystem described by @sbp + * into main memory and does some basic setup. + * + * Returns: + * Zero on success, else a negative error code. + */ +int +vxfs_read_olt(struct super_block *sbp, u_long bsize) +{ + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + struct buffer_head *bp; + struct vxfs_olt *op; + char *oaddr, *eaddr; + + + bp = sb_bread(sbp, vxfs_oblock(sbp, infp->vsi_oltext, bsize)); + if (!bp || !bp->b_data) + goto fail; + + op = (struct vxfs_olt *)bp->b_data; + if (op->olt_magic != VXFS_OLT_MAGIC) { + printk(KERN_NOTICE "vxfs: ivalid olt magic number\n"); + goto fail; + } + + /* + * It is in theory possible that vsi_oltsize is > 1. + * I've not seen any such filesystem yet and I'm lazy.. --hch + */ + if (infp->vsi_oltsize > 1) { + printk(KERN_NOTICE "vxfs: oltsize > 1 detected.\n"); + printk(KERN_NOTICE "vxfs: please notify hch@infradead.org\n"); + goto fail; + } + + oaddr = bp->b_data + op->olt_size; + eaddr = bp->b_data + (infp->vsi_oltsize * sbp->s_blocksize); + + while (oaddr < eaddr) { + struct vxfs_oltcommon *ocp = + (struct vxfs_oltcommon *)oaddr; + + switch (ocp->olt_type) { + case VXFS_OLT_FSHEAD: + vxfs_get_fshead((struct vxfs_oltfshead *)oaddr, infp); + break; + case VXFS_OLT_ILIST: + vxfs_get_ilist((struct vxfs_oltilist *)oaddr, infp); + break; + } + + oaddr += ocp->olt_size; + } + + brelse(bp); + return 0; + +fail: + brelse(bp); + return -EINVAL; +} diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h new file mode 100644 index 00000000..b7b3af50 --- /dev/null +++ b/fs/freevxfs/vxfs_olt.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
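vxfs_read_olt() above walks the OLT extent as a sequence of variable-sized records, each starting with a type/size pair, and advances by olt_size. A user-space sketch of that traversal under assumed types; struct demo_oltcommon and demo_walk_olt() are illustrative stand-ins, and the zero-size guard is an addition not present in the driver.

#include <stdio.h>
#include <stdint.h>

/* stand-in for the on-disk record header declared in vxfs_olt.h */
struct demo_oltcommon {
	uint32_t olt_type;
	uint32_t olt_size;
};

enum { DEMO_OLT_FSHEAD = 2, DEMO_OLT_ILIST = 4 };

static void demo_walk_olt(const char *buf, size_t len)
{
	const char *p = buf, *end = buf + len;

	while (p + sizeof(struct demo_oltcommon) <= end) {
		const struct demo_oltcommon *oc = (const void *)p;

		if (!oc->olt_size)
			break;	/* guard against a malformed record */
		switch (oc->olt_type) {
		case DEMO_OLT_FSHEAD:
			printf("fileset header record, %u bytes\n", oc->olt_size);
			break;
		case DEMO_OLT_ILIST:
			printf("initial inode list record, %u bytes\n", oc->olt_size);
			break;
		default:
			printf("record type %u, %u bytes\n", oc->olt_type, oc->olt_size);
		}
		p += oc->olt_size;
	}
}

int main(void)
{
	uint32_t buf[16] = { DEMO_OLT_FSHEAD, 16 };	/* one record, rest zero */

	demo_walk_olt((const char *)buf, sizeof(buf));
	return 0;
}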
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#ifndef _VXFS_OLT_H_ +#define _VXFS_OLT_H_ + +/* + * Veritas filesystem driver - Object Location Table data structures. + * + * This file contains definitions for the Object Location Table used + * by the Veritas Filesystem version 2 and newer. + */ + + +/* + * OLT magic number (vxfs_olt->olt_magic). + */ +#define VXFS_OLT_MAGIC 0xa504FCF5 + +/* + * VxFS OLT entry types. + */ +enum { + VXFS_OLT_FREE = 1, + VXFS_OLT_FSHEAD = 2, + VXFS_OLT_CUT = 3, + VXFS_OLT_ILIST = 4, + VXFS_OLT_DEV = 5, + VXFS_OLT_SB = 6 +}; + +/* + * VxFS OLT header. + * + * The Object Location Table header is placed at the beginning of each + * OLT extent. It is used to fing certain filesystem-wide metadata, e.g. + * the initial inode list, the fileset header or the device configuration. + */ +struct vxfs_olt { + u_int32_t olt_magic; /* magic number */ + u_int32_t olt_size; /* size of this entry */ + u_int32_t olt_checksum; /* checksum of extent */ + u_int32_t __unused1; /* ??? */ + u_int32_t olt_mtime; /* time of last mod. (sec) */ + u_int32_t olt_mutime; /* time of last mod. (usec) */ + u_int32_t olt_totfree; /* free space in OLT extent */ + vx_daddr_t olt_extents[2]; /* addr of this extent, replica */ + u_int32_t olt_esize; /* size of this extent */ + vx_daddr_t olt_next[2]; /* addr of next extent, replica */ + u_int32_t olt_nsize; /* size of next extent */ + u_int32_t __unused2; /* align to 8 byte boundary */ +}; + +/* + * VxFS common OLT entry (on disk). + */ +struct vxfs_oltcommon { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ +}; + +/* + * VxFS free OLT entry (on disk). + */ +struct vxfs_oltfree { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_fsize; /* size of this free record */ +}; + +/* + * VxFS initial-inode list (on disk). + */ +struct vxfs_oltilist { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_iext[2]; /* initial inode list, replica */ +}; + +/* + * Current Usage Table + */ +struct vxfs_oltcut { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_cutino; /* inode of current usage table */ + u_int32_t __pad; /* unused, 8 byte align */ +}; + +/* + * Inodes containing Superblock, Intent log and OLTs + */ +struct vxfs_oltsb { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_sbino; /* inode of superblock file */ + u_int32_t __unused1; /* ??? 
*/ + vx_ino_t olt_logino[2]; /* inode of log file,replica */ + vx_ino_t olt_oltino[2]; /* inode of OLT, replica */ +}; + +/* + * Inode containing device configuration + it's replica + */ +struct vxfs_oltdev { + u_int32_t olt_type; /* type of this record */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_devino[2]; /* inode of device config files */ +}; + +/* + * Fileset header + */ +struct vxfs_oltfshead { + u_int32_t olt_type; /* type number */ + u_int32_t olt_size; /* size of this record */ + vx_ino_t olt_fsino[2]; /* inodes of fileset header */ +}; + +#endif /* _VXFS_OLT_H_ */ diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c new file mode 100644 index 00000000..5d318c44 --- /dev/null +++ b/fs/freevxfs/vxfs_subr.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - shared subroutines. + */ +#include +#include +#include +#include + +#include "vxfs_extern.h" + + +static int vxfs_readpage(struct file *, struct page *); +static sector_t vxfs_bmap(struct address_space *, sector_t); + +const struct address_space_operations vxfs_aops = { + .readpage = vxfs_readpage, + .bmap = vxfs_bmap, +}; + +inline void +vxfs_put_page(struct page *pp) +{ + kunmap(pp); + page_cache_release(pp); +} + +/** + * vxfs_get_page - read a page into memory. + * @ip: inode to read from + * @n: page number + * + * Description: + * vxfs_get_page reads the @n th page of @ip into the pagecache. + * + * Returns: + * The wanted page on success, else a NULL pointer. + */ +struct page * +vxfs_get_page(struct address_space *mapping, u_long n) +{ + struct page * pp; + + pp = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(pp)) { + kmap(pp); + /** if (!PageChecked(pp)) **/ + /** vxfs_check_page(pp); **/ + if (PageError(pp)) + goto fail; + } + + return (pp); + +fail: + vxfs_put_page(pp); + return ERR_PTR(-EIO); +} + +/** + * vxfs_bread - read buffer for a give inode,block tuple + * @ip: inode + * @block: logical block + * + * Description: + * The vxfs_bread function reads block no @block of + * @ip into the buffercache. 
+ * + * Returns: + * The resulting &struct buffer_head. + */ +struct buffer_head * +vxfs_bread(struct inode *ip, int block) +{ + struct buffer_head *bp; + daddr_t pblock; + + pblock = vxfs_bmap1(ip, block); + bp = sb_bread(ip->i_sb, pblock); + + return (bp); +} + +/** + * vxfs_get_block - locate buffer for given inode,block tuple + * @ip: inode + * @iblock: logical block + * @bp: buffer skeleton + * @create: %TRUE if blocks may be newly allocated. + * + * Description: + * The vxfs_get_block function fills @bp with the right physical + * block and device number to perform a lowlevel read/write on + * it. + * + * Returns: + * Zero on success, else a negativ error code (-EIO). + */ +static int +vxfs_getblk(struct inode *ip, sector_t iblock, + struct buffer_head *bp, int create) +{ + daddr_t pblock; + + pblock = vxfs_bmap1(ip, iblock); + if (pblock != 0) { + map_bh(bp, ip->i_sb, pblock); + return 0; + } + + return -EIO; +} + +/** + * vxfs_readpage - read one page synchronously into the pagecache + * @file: file context (unused) + * @page: page frame to fill in. + * + * Description: + * The vxfs_readpage routine reads @page synchronously into the + * pagecache. + * + * Returns: + * Zero on success, else a negative error code. + * + * Locking status: + * @page is locked and will be unlocked. + */ +static int +vxfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, vxfs_getblk); +} + +/** + * vxfs_bmap - perform logical to physical block mapping + * @mapping: logical to physical mapping to use + * @block: logical block (relative to @mapping). + * + * Description: + * Vxfs_bmap find out the corresponding phsical block to the + * @mapping, @block pair. + * + * Returns: + * Physical block number on success, else Zero. + * + * Locking status: + * We are under the bkl. + */ +static sector_t +vxfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, vxfs_getblk); +} diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c new file mode 100644 index 00000000..9d1c9955 --- /dev/null +++ b/fs/freevxfs/vxfs_super.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2000-2001 Christoph Hellwig. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
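The vxfs_bmap() address_space operation above is what ends up servicing the legacy FIBMAP ioctl from user space. A small sketch of querying it; the file path is only an example and the call typically requires root (CAP_SYS_RAWIO).

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FIBMAP */

int main(void)
{
	int fd = open("/mnt/vxfs/somefile", O_RDONLY);	/* example path */
	int block = 0;					/* logical block 0 */

	if (fd < 0)
		return 1;
	/* FIBMAP maps a logical block to a physical block via a_ops->bmap */
	if (ioctl(fd, FIBMAP, &block) == 0)
		printf("logical block 0 -> physical block %d\n", block);
	close(fd);
	return 0;
}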
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Veritas filesystem driver - superblock related routines. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vxfs.h" +#include "vxfs_extern.h" +#include "vxfs_dir.h" +#include "vxfs_inode.h" + + +MODULE_AUTHOR("Christoph Hellwig"); +MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */ + + +static void vxfs_put_super(struct super_block *); +static int vxfs_statfs(struct dentry *, struct kstatfs *); +static int vxfs_remount(struct super_block *, int *, char *); + +static const struct super_operations vxfs_super_ops = { + .evict_inode = vxfs_evict_inode, + .put_super = vxfs_put_super, + .statfs = vxfs_statfs, + .remount_fs = vxfs_remount, +}; + +/** + * vxfs_put_super - free superblock resources + * @sbp: VFS superblock. + * + * Description: + * vxfs_put_super frees all resources allocated for @sbp + * after the last instance of the filesystem is unmounted. + */ + +static void +vxfs_put_super(struct super_block *sbp) +{ + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + + vxfs_put_fake_inode(infp->vsi_fship); + vxfs_put_fake_inode(infp->vsi_ilist); + vxfs_put_fake_inode(infp->vsi_stilist); + + brelse(infp->vsi_bp); + kfree(infp); +} + +/** + * vxfs_statfs - get filesystem information + * @dentry: VFS dentry to locate superblock + * @bufp: output buffer + * + * Description: + * vxfs_statfs fills the statfs buffer @bufp with information + * about the filesystem described by @dentry. + * + * Returns: + * Zero. + * + * Locking: + * No locks held. + * + * Notes: + * This is everything but complete... + */ +static int +vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) +{ + struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); + + bufp->f_type = VXFS_SUPER_MAGIC; + bufp->f_bsize = dentry->d_sb->s_blocksize; + bufp->f_blocks = infp->vsi_raw->vs_dsize; + bufp->f_bfree = infp->vsi_raw->vs_free; + bufp->f_bavail = 0; + bufp->f_files = 0; + bufp->f_ffree = infp->vsi_raw->vs_ifree; + bufp->f_namelen = VXFS_NAMELEN; + + return 0; +} + +static int vxfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +/** + * vxfs_read_super - read superblock into memory and initialize filesystem + * @sbp: VFS superblock (to fill) + * @dp: fs private mount data + * @silent: do not complain loudly when sth is wrong + * + * Description: + * We are called on the first mount of a filesystem to read the + * superblock into memory and do some basic setup. + * + * Returns: + * The superblock on success, else %NULL. + * + * Locking: + * We are under @sbp->s_lock. 
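vxfs_statfs() above fills only a handful of fields; from user space the result is visible through statfs(2). A minimal sketch, with the mount point path as an example only:

#include <stdio.h>
#include <sys/vfs.h>	/* statfs() */

int main(void)
{
	struct statfs st;

	if (statfs("/mnt/vxfs", &st) != 0)	/* example mount point */
		return 1;

	printf("magic 0x%lx, bsize %ld, blocks %llu, free %llu, namelen %ld\n",
	       (unsigned long)st.f_type, (long)st.f_bsize,
	       (unsigned long long)st.f_blocks,
	       (unsigned long long)st.f_bfree, (long)st.f_namelen);
	return 0;
}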
+ */ +static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) +{ + struct vxfs_sb_info *infp; + struct vxfs_sb *rsbp; + struct buffer_head *bp = NULL; + u_long bsize; + struct inode *root; + int ret = -EINVAL; + + sbp->s_flags |= MS_RDONLY; + + infp = kzalloc(sizeof(*infp), GFP_KERNEL); + if (!infp) { + printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n"); + return -ENOMEM; + } + + bsize = sb_min_blocksize(sbp, BLOCK_SIZE); + if (!bsize) { + printk(KERN_WARNING "vxfs: unable to set blocksize\n"); + goto out; + } + + bp = sb_bread(sbp, 1); + if (!bp || !buffer_mapped(bp)) { + if (!silent) { + printk(KERN_WARNING + "vxfs: unable to read disk superblock\n"); + } + goto out; + } + + rsbp = (struct vxfs_sb *)bp->b_data; + if (rsbp->vs_magic != VXFS_SUPER_MAGIC) { + if (!silent) + printk(KERN_NOTICE "vxfs: WRONG superblock magic\n"); + goto out; + } + + if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) { + printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", + rsbp->vs_version); + goto out; + } + +#ifdef DIAGNOSTIC + printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", rsbp->vs_version); + printk(KERN_DEBUG "vxfs: blocksize: %d\n", rsbp->vs_bsize); +#endif + + sbp->s_magic = rsbp->vs_magic; + sbp->s_fs_info = infp; + + infp->vsi_raw = rsbp; + infp->vsi_bp = bp; + infp->vsi_oltext = rsbp->vs_oltext[0]; + infp->vsi_oltsize = rsbp->vs_oltsize; + + if (!sb_set_blocksize(sbp, rsbp->vs_bsize)) { + printk(KERN_WARNING "vxfs: unable to set final block size\n"); + goto out; + } + + if (vxfs_read_olt(sbp, bsize)) { + printk(KERN_WARNING "vxfs: unable to read olt\n"); + goto out; + } + + if (vxfs_read_fshead(sbp)) { + printk(KERN_WARNING "vxfs: unable to read fshead\n"); + goto out; + } + + sbp->s_op = &vxfs_super_ops; + root = vxfs_iget(sbp, VXFS_ROOT_INO); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + sbp->s_root = d_alloc_root(root); + if (!sbp->s_root) { + iput(root); + printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); + goto out_free_ilist; + } + + return 0; + +out_free_ilist: + vxfs_put_fake_inode(infp->vsi_fship); + vxfs_put_fake_inode(infp->vsi_ilist); + vxfs_put_fake_inode(infp->vsi_stilist); +out: + brelse(bp); + kfree(infp); + return ret; +} + +/* + * The usual module blurb. + */ +static struct dentry *vxfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super); +} + +static struct file_system_type vxfs_fs_type = { + .owner = THIS_MODULE, + .name = "vxfs", + .mount = vxfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init +vxfs_init(void) +{ + int rv; + + vxfs_inode_cachep = kmem_cache_create("vxfs_inode", + sizeof(struct vxfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!vxfs_inode_cachep) + return -ENOMEM; + rv = register_filesystem(&vxfs_fs_type); + if (rv < 0) + kmem_cache_destroy(vxfs_inode_cachep); + return rv; +} + +static void __exit +vxfs_cleanup(void) +{ + unregister_filesystem(&vxfs_fs_type); + kmem_cache_destroy(vxfs_inode_cachep); +} + +module_init(vxfs_init); +module_exit(vxfs_cleanup); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c new file mode 100644 index 00000000..539d0d25 --- /dev/null +++ b/fs/fs-writeback.c @@ -0,0 +1,1373 @@ +/* + * fs/fs-writeback.c + * + * Copyright (C) 2002, Linus Torvalds. 
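Since vxfs_fill_super() above forces MS_RDONLY, a mount only needs the filesystem type and a read-only request. A sketch of the equivalent mount(2) call; the device and mount point are examples.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* equivalent of: mount -t vxfs -o ro /dev/sdb1 /mnt/vxfs */
	if (mount("/dev/sdb1", "/mnt/vxfs", "vxfs", MS_RDONLY, NULL) != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}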
+ * + * Contains all the functions related to writing back and waiting + * upon dirty inodes against superblocks, and writing back dirty + * pages against inodes. ie: data writeback. Writeout of the + * inode itself is not handled here. + * + * 10Apr2002 Andrew Morton + * Split out of fs/inode.c + * Additions for address_space-based writeback + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Passed into wb_writeback(), essentially a subset of writeback_control + */ +struct wb_writeback_work { + long nr_pages; + struct super_block *sb; + enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; + unsigned int for_kupdate:1; + unsigned int range_cyclic:1; + unsigned int for_background:1; + + struct list_head list; /* pending work list */ + struct completion *done; /* set if the caller waits */ +}; + +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure so that the definition remains local to this + * file. + */ +#define CREATE_TRACE_POINTS +#include + +/* + * We don't actually have pdflush, but this one is exported though /proc... + */ +int nr_pdflush_threads; + +/** + * writeback_in_progress - determine whether there is writeback in progress + * @bdi: the device's backing_dev_info structure. + * + * Determine whether there is writeback waiting to be handled against a + * backing device. + */ +int writeback_in_progress(struct backing_dev_info *bdi) +{ + return test_bit(BDI_writeback_running, &bdi->state); +} + +static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (strcmp(sb->s_type->name, "bdev") == 0) + return inode->i_mapping->backing_dev_info; + + return sb->s_bdi; +} + +static inline struct inode *wb_inode(struct list_head *head) +{ + return list_entry(head, struct inode, i_wb_list); +} + +/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ +static void bdi_wakeup_flusher(struct backing_dev_info *bdi) +{ + if (bdi->wb.task) { + wake_up_process(bdi->wb.task); + } else { + /* + * The bdi thread isn't there, wake up the forker thread which + * will create and run it. + */ + wake_up_process(default_backing_dev_info.wb.task); + } +} + +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + trace_writeback_queue(bdi, work); + + spin_lock_bh(&bdi->wb_lock); + list_add_tail(&work->list, &bdi->work_list); + if (!bdi->wb.task) + trace_writeback_nothread(bdi, work); + bdi_wakeup_flusher(bdi); + spin_unlock_bh(&bdi->wb_lock); +} + +static void +__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + bool range_cyclic) +{ + struct wb_writeback_work *work; + + /* + * This is WB_SYNC_NONE writeback, so if allocation fails just + * wakeup the thread for old dirty data writeback + */ + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + if (bdi->wb.task) { + trace_writeback_nowork(bdi); + wake_up_process(bdi->wb.task); + } + return; + } + + work->sync_mode = WB_SYNC_NONE; + work->nr_pages = nr_pages; + work->range_cyclic = range_cyclic; + + bdi_queue_work(bdi, work); +} + +/** + * bdi_start_writeback - start writeback + * @bdi: the backing device to write from + * @nr_pages: the number of pages to write + * + * Description: + * This does WB_SYNC_NONE opportunistic writeback. 
The IO is only + * started when this function returns, we make no guarantees on + * completion. Caller need not hold sb s_umount semaphore. + * + */ +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) +{ + __bdi_start_writeback(bdi, nr_pages, true); +} + +/** + * bdi_start_background_writeback - start background writeback + * @bdi: the backing device to write from + * + * Description: + * This makes sure WB_SYNC_NONE background writeback happens. When + * this function returns, it is only guaranteed that for given BDI + * some IO is happening if we are over background dirty threshold. + * Caller need not hold sb s_umount semaphore. + */ +void bdi_start_background_writeback(struct backing_dev_info *bdi) +{ + /* + * We just wake up the flusher thread. It will perform background + * writeback as soon as there is no other work to do. + */ + trace_writeback_wake_background(bdi); + spin_lock_bh(&bdi->wb_lock); + bdi_wakeup_flusher(bdi); + spin_unlock_bh(&bdi->wb_lock); +} + +/* + * Remove the inode from the writeback list it is on. + */ +void inode_wb_list_del(struct inode *inode) +{ + spin_lock(&inode_wb_list_lock); + list_del_init(&inode->i_wb_list); + spin_unlock(&inode_wb_list_lock); +} + + +/* + * Redirty an inode: set its when-it-was dirtied timestamp and move it to the + * furthest end of its superblock's dirty-inode list. + * + * Before stamping the inode's ->dirtied_when, we check to see whether it is + * already the most-recently-dirtied inode on the b_dirty list. If that is + * the case then the inode must have been redirtied while it was being written + * out and we don't reset its dirtied_when. + */ +static void redirty_tail(struct inode *inode) +{ + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + + assert_spin_locked(&inode_wb_list_lock); + if (!list_empty(&wb->b_dirty)) { + struct inode *tail; + + tail = wb_inode(wb->b_dirty.next); + if (time_before(inode->dirtied_when, tail->dirtied_when)) + inode->dirtied_when = jiffies; + } + list_move(&inode->i_wb_list, &wb->b_dirty); +} + +/* + * requeue inode for re-scanning after bdi->b_io list is exhausted. + */ +static void requeue_io(struct inode *inode) +{ + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + + assert_spin_locked(&inode_wb_list_lock); + list_move(&inode->i_wb_list, &wb->b_more_io); +} + +static void inode_sync_complete(struct inode *inode) +{ + /* + * Prevent speculative execution through + * spin_unlock(&inode_wb_list_lock); + */ + + smp_mb(); + wake_up_bit(&inode->i_state, __I_SYNC); +} + +static bool inode_dirtied_after(struct inode *inode, unsigned long t) +{ + bool ret = time_after(inode->dirtied_when, t); +#ifndef CONFIG_64BIT + /* + * For inodes being constantly redirtied, dirtied_when can get stuck. + * It _appears_ to be in the future, but is actually in distant past. + * This test is necessary to prevent such wrapped-around relative times + * from permanently stopping the whole bdi writeback. + */ + ret = ret && time_before_eq(inode->dirtied_when, jiffies); +#endif + return ret; +} + +/* + * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 
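inode_dirtied_after() above guards against dirtied_when appearing to lie "in the future" once jiffies wraps on 32-bit machines. The kernel's time_after()/time_before_eq() do this with a signed-difference comparison; a stand-alone sketch of that idiom, with the macros mirroring include/linux/jiffies.h:

#include <stdio.h>

/* signed-difference comparisons, as in include/linux/jiffies.h */
#define demo_time_after(a, b)		((long)((b) - (a)) < 0)
#define demo_time_before_eq(a, b)	((long)((a) - (b)) <= 0)

int main(void)
{
	unsigned long jiffies = 100;
	unsigned long dirtied_when = (unsigned long)-50;	/* dirtied just before a wrap */

	/* a plain ">" wrongly claims the inode was dirtied in the future (prints 1) */
	printf("naive comparison:     %d\n", dirtied_when > jiffies);
	/* the wrap-safe comparison correctly reports "not after" (prints 0) */
	printf("wrap-safe comparison: %d\n", demo_time_after(dirtied_when, jiffies));
	return 0;
}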
+ */ +static void move_expired_inodes(struct list_head *delaying_queue, + struct list_head *dispatch_queue, + unsigned long *older_than_this) +{ + LIST_HEAD(tmp); + struct list_head *pos, *node; + struct super_block *sb = NULL; + struct inode *inode; + int do_sb_sort = 0; + + while (!list_empty(delaying_queue)) { + inode = wb_inode(delaying_queue->prev); + if (older_than_this && + inode_dirtied_after(inode, *older_than_this)) + break; + if (sb && sb != inode->i_sb) + do_sb_sort = 1; + sb = inode->i_sb; + list_move(&inode->i_wb_list, &tmp); + } + + /* just one sb in list, splice to dispatch_queue and we're done */ + if (!do_sb_sort) { + list_splice(&tmp, dispatch_queue); + return; + } + + /* Move inodes from one superblock together */ + while (!list_empty(&tmp)) { + sb = wb_inode(tmp.prev)->i_sb; + list_for_each_prev_safe(pos, node, &tmp) { + inode = wb_inode(pos); + if (inode->i_sb == sb) + list_move(&inode->i_wb_list, dispatch_queue); + } + } +} + +/* + * Queue all expired dirty inodes for io, eldest first. + * Before + * newly dirtied b_dirty b_io b_more_io + * =============> gf edc BA + * After + * newly dirtied b_dirty b_io b_more_io + * =============> g fBAedc + * | + * +--> dequeue for IO + */ +static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) +{ + assert_spin_locked(&inode_wb_list_lock); + list_splice_init(&wb->b_more_io, &wb->b_io); + move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); +} + +static int write_inode(struct inode *inode, struct writeback_control *wbc) +{ + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) + return inode->i_sb->s_op->write_inode(inode, wbc); + return 0; +} + +/* + * Wait for writeback on an inode to complete. + */ +static void inode_wait_for_writeback(struct inode *inode) +{ + DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&inode->i_state, __I_SYNC); + while (inode->i_state & I_SYNC) { + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + } +} + +/* + * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * inode->i_lock. Either the caller has an active reference on the inode or + * the inode has I_WILL_FREE set. + * + * If `wait' is set, wait on the writeout. + * + * The whole writeout design is quite complex and fragile. We want to avoid + * starvation of particular inodes when others are being redirtied, prevent + * livelocks, etc. + */ +static int +writeback_single_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct address_space *mapping = inode->i_mapping; + unsigned dirty; + int ret; + + assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&inode->i_lock); + + if (!atomic_read(&inode->i_count)) + WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); + else + WARN_ON(inode->i_state & I_WILL_FREE); + + if (inode->i_state & I_SYNC) { + /* + * If this inode is locked for writeback and we are not doing + * writeback-for-data-integrity, move it to b_more_io so that + * writeback can proceed with the other inodes on s_io. + * + * We'll have another go at writing back this inode when we + * completed a full scan of b_io. + */ + if (wbc->sync_mode != WB_SYNC_ALL) { + requeue_io(inode); + return 0; + } + + /* + * It's a data-integrity sync. We must wait. 
+ */ + inode_wait_for_writeback(inode); + } + + BUG_ON(inode->i_state & I_SYNC); + + /* Set I_SYNC, reset I_DIRTY_PAGES */ + inode->i_state |= I_SYNC; + inode->i_state &= ~I_DIRTY_PAGES; + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + + ret = do_writepages(mapping, wbc); + + /* + * Make sure to wait on the data before writing out the metadata. + * This is important for filesystems that modify metadata on data + * I/O completion. + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + int err = filemap_fdatawait(mapping); + if (ret == 0) + ret = err; + } + + /* + * Some filesystems may redirty the inode during the writeback + * due to delalloc, clear dirty metadata flags right before + * write_inode() + */ + spin_lock(&inode->i_lock); + dirty = inode->i_state & I_DIRTY; + inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode->i_lock); + /* Don't write the inode if only I_DIRTY_PAGES was set */ + if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + int err = write_inode(inode, wbc); + if (ret == 0) + ret = err; + } + + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + inode->i_state &= ~I_SYNC; + if (!(inode->i_state & I_FREEING)) { + /* + * Sync livelock prevention. Each inode is tagged and synced in + * one shot. If still dirty, it will be redirty_tail()'ed below. + * Update the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + /* + * We didn't write back all the pages. nfs_writepages() + * sometimes bales out without doing anything. + */ + inode->i_state |= I_DIRTY_PAGES; + if (wbc->nr_to_write <= 0) { + /* + * slice used up: queue for next turn + */ + requeue_io(inode); + } else { + /* + * Writeback blocked by something other than + * congestion. Delay the inode for some time to + * avoid spinning on the CPU (100% iowait) + * retrying writeback of the dirty page/inode + * that cannot be performed immediately. + */ + redirty_tail(inode); + } + } else if (inode->i_state & I_DIRTY) { + /* + * Filesystems can dirty the inode during writeback + * operations, such as delayed allocation during + * submission or metadata updates after data IO + * completion. + */ + redirty_tail(inode); + } else { + /* + * The inode is clean. At this point we either have + * a reference to the inode or it's on it's way out. + * No need to add it back to the LRU. + */ + list_del_init(&inode->i_wb_list); + } + } + inode_sync_complete(inode); + return ret; +} + +/* + * For background writeback the caller does not have the sb pinned + * before calling writeback. So make sure that we do pin it, so it doesn't + * go away while we are writing inodes from it. + */ +static bool pin_sb_for_writeback(struct super_block *sb) +{ + spin_lock(&sb_lock); + if (list_empty(&sb->s_instances)) { + spin_unlock(&sb_lock); + return false; + } + + sb->s_count++; + spin_unlock(&sb_lock); + + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) + return true; + up_read(&sb->s_umount); + } + + put_super(sb); + return false; +} + +/* + * Write a portion of b_io inodes which belong to @sb. + * + * If @only_this_sb is true, then find and write all such + * inodes. Otherwise write only ones which go sequentially + * in reverse order. + * + * Return 1, if the caller writeback routine should be + * interrupted. Otherwise return 0. 
+ */ +static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, + struct writeback_control *wbc, bool only_this_sb) +{ + while (!list_empty(&wb->b_io)) { + long pages_skipped; + struct inode *inode = wb_inode(wb->b_io.prev); + + if (inode->i_sb != sb) { + if (only_this_sb) { + /* + * We only want to write back data for this + * superblock, move all inodes not belonging + * to it back onto the dirty list. + */ + redirty_tail(inode); + continue; + } + + /* + * The inode belongs to a different superblock. + * Bounce back to the caller to unpin this and + * pin the next superblock. + */ + return 0; + } + + /* + * Don't bother with new inodes or inodes beeing freed, first + * kind does not need peridic writeout yet, and for the latter + * kind writeout is handled by the freer. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + requeue_io(inode); + continue; + } + + /* + * Was this inode dirtied after sync_sb_inodes was called? + * This keeps sync from extra jobs and livelock. + */ + if (inode_dirtied_after(inode, wbc->wb_start)) { + spin_unlock(&inode->i_lock); + return 1; + } + + __iget(inode); + + pages_skipped = wbc->pages_skipped; + writeback_single_inode(inode, wbc); + if (wbc->pages_skipped != pages_skipped) { + /* + * writeback is not making progress due to locked + * buffers. Skip this inode for now. + */ + redirty_tail(inode); + } + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + iput(inode); + cond_resched(); + spin_lock(&inode_wb_list_lock); + if (wbc->nr_to_write <= 0) { + wbc->more_io = 1; + return 1; + } + if (!list_empty(&wb->b_more_io)) + wbc->more_io = 1; + } + /* b_io is empty */ + return 1; +} + +void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + int ret = 0; + + if (!wbc->wb_start) + wbc->wb_start = jiffies; /* livelock avoidance */ + spin_lock(&inode_wb_list_lock); + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + + while (!list_empty(&wb->b_io)) { + struct inode *inode = wb_inode(wb->b_io.prev); + struct super_block *sb = inode->i_sb; + + if (!pin_sb_for_writeback(sb)) { + requeue_io(inode); + continue; + } + ret = writeback_sb_inodes(sb, wb, wbc, false); + drop_super(sb); + + if (ret) + break; + } + spin_unlock(&inode_wb_list_lock); + /* Leave any unwritten inodes on b_io */ +} + +static void __writeback_inodes_sb(struct super_block *sb, + struct bdi_writeback *wb, struct writeback_control *wbc) +{ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + spin_lock(&inode_wb_list_lock); + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + writeback_sb_inodes(sb, wb, wbc, true); + spin_unlock(&inode_wb_list_lock); +} + +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +static inline bool over_bground_thresh(void) +{ + unsigned long background_thresh, dirty_thresh; + + global_dirty_limits(&background_thresh, &dirty_thresh); + + return (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh); +} + +/* + * Explicit flushing or periodic writeback of "old" data. 
+ * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static long wb_writeback(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .older_than_this = NULL, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, + }; + unsigned long oldest_jif; + long wrote = 0; + long write_chunk = MAX_WRITEBACK_PAGES; + struct inode *inode; + + if (wbc.for_kupdate) { + wbc.older_than_this = &oldest_jif; + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + } + if (!wbc.range_cyclic) { + wbc.range_start = 0; + wbc.range_end = LLONG_MAX; + } + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * __writeback_inodes_sb() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) + write_chunk = LONG_MAX; + + wbc.wb_start = jiffies; /* livelock avoidance */ + for (;;) { + /* + * Stop writeback when nr_pages has been consumed + */ + if (work->nr_pages <= 0) + break; + + /* + * Background writeout and kupdate-style writeback may + * run forever. Stop them if there is other work to do + * so that e.g. sync can proceed. They'll be restarted + * after the other works are all done. + */ + if ((work->for_background || work->for_kupdate) && + !list_empty(&wb->bdi->work_list)) + break; + + /* + * For background writeout, stop when we are below the + * background dirty threshold + */ + if (work->for_background && !over_bground_thresh()) + break; + + wbc.more_io = 0; + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; + + trace_wbc_writeback_start(&wbc, wb->bdi); + if (work->sb) + __writeback_inodes_sb(work->sb, wb, &wbc); + else + writeback_inodes_wb(wb, &wbc); + trace_wbc_writeback_written(&wbc, wb->bdi); + + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + + /* + * If we consumed everything, see if we have more + */ + if (wbc.nr_to_write <= 0) + continue; + /* + * Didn't write everything and we don't have more IO, bail + */ + if (!wbc.more_io) + break; + /* + * Did we write something? Try for more + */ + if (wbc.nr_to_write < write_chunk) + continue; + /* + * Nothing written. Wait for some inode to + * become available for writeback. Otherwise + * we'll just busyloop. 
+ */ + spin_lock(&inode_wb_list_lock); + if (!list_empty(&wb->b_more_io)) { + inode = wb_inode(wb->b_more_io.prev); + trace_wbc_writeback_wait(&wbc, wb->bdi); + spin_lock(&inode->i_lock); + inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); + } + spin_unlock(&inode_wb_list_lock); + } + + return wrote; +} + +/* + * Return the next wb_writeback_work struct that hasn't been processed yet. + */ +static struct wb_writeback_work * +get_next_work_item(struct backing_dev_info *bdi) +{ + struct wb_writeback_work *work = NULL; + + spin_lock_bh(&bdi->wb_lock); + if (!list_empty(&bdi->work_list)) { + work = list_entry(bdi->work_list.next, + struct wb_writeback_work, list); + list_del_init(&work->list); + } + spin_unlock_bh(&bdi->wb_lock); + return work; +} + +/* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ +static unsigned long get_nr_dirty_pages(void) +{ + return global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + get_nr_dirty_inodes(); +} + +static long wb_check_background_flush(struct bdi_writeback *wb) +{ + if (over_bground_thresh()) { + + struct wb_writeback_work work = { + .nr_pages = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .for_background = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + +static long wb_check_old_data_flush(struct bdi_writeback *wb) +{ + unsigned long expired; + long nr_pages; + + /* + * When set to zero, disable periodic writeback + */ + if (!dirty_writeback_interval) + return 0; + + expired = wb->last_old_flush + + msecs_to_jiffies(dirty_writeback_interval * 10); + if (time_before(jiffies, expired)) + return 0; + + wb->last_old_flush = jiffies; + nr_pages = get_nr_dirty_pages(); + + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .for_kupdate = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + +/* + * Retrieve work items and do the writeback they describe + */ +long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +{ + struct backing_dev_info *bdi = wb->bdi; + struct wb_writeback_work *work; + long wrote = 0; + + set_bit(BDI_writeback_running, &wb->bdi->state); + while ((work = get_next_work_item(bdi)) != NULL) { + /* + * Override sync mode, in case we must wait for completion + * because this thread is exiting now. + */ + if (force_wait) + work->sync_mode = WB_SYNC_ALL; + + trace_writeback_exec(bdi, work); + + wrote += wb_writeback(wb, work); + + /* + * Notify the caller of completion if this is a synchronous + * work item, otherwise just free it. + */ + if (work->done) + complete(work->done); + else + kfree(work); + } + + /* + * Check for periodic writeback, kupdated() style + */ + wrote += wb_check_old_data_flush(wb); + wrote += wb_check_background_flush(wb); + clear_bit(BDI_writeback_running, &wb->bdi->state); + + return wrote; +} + +/* + * Handle writeback of dirty data for the device backed by this bdi. Also + * wakes up periodically and does kupdated style flushing. 
+ */
+int bdi_writeback_thread(void *data)
+{
+	struct bdi_writeback *wb = data;
+	struct backing_dev_info *bdi = wb->bdi;
+	long pages_written;
+
+	current->flags |= PF_SWAPWRITE;
+	set_freezable();
+	wb->last_active = jiffies;
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
+
+	trace_writeback_thread_start(bdi);
+
+	while (!kthread_should_stop()) {
+		/*
+		 * Remove own delayed wake-up timer, since we are already awake
+		 * and we'll take care of the periodic write-back.
+		 */
+		del_timer(&wb->wakeup_timer);
+
+		pages_written = wb_do_writeback(wb, 0);
+
+		trace_writeback_pages_written(pages_written);
+
+		if (pages_written)
+			wb->last_active = jiffies;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			continue;
+		}
+
+		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		else {
+			/*
+			 * We have nothing to do, so we can sleep without any
+			 * timeout and save power.  When a work item is queued
+			 * or something is made dirty, we will be woken up.
+			 */
+			schedule();
+		}
+
+		try_to_freeze();
+	}
+
+	/* Flush any work that raced with us exiting */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	trace_writeback_thread_stop(bdi);
+	return 0;
+}
+
+
+/*
+ * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
+{
+	struct backing_dev_info *bdi;
+
+	if (!nr_pages) {
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		if (!bdi_has_dirty_io(bdi))
+			continue;
+		__bdi_start_writeback(bdi, nr_pages, false);
+	}
+	rcu_read_unlock();
+}
+
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+		struct dentry *dentry;
+		const char *name = "?";
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			spin_lock(&dentry->d_lock);
+			name = (const char *) dentry->d_name.name;
+		}
+		printk(KERN_DEBUG
+		       "%s(%d): dirtied inode %lu (%s) on %s\n",
+		       current->comm, task_pid_nr(current), inode->i_ino,
+		       name, inode->i_sb->s_id);
+		if (dentry) {
+			spin_unlock(&dentry->d_lock);
+			dput(dentry);
+		}
+	}
+}
+
+/**
+ * __mark_inode_dirty - internal function
+ * @inode: inode to mark
+ * @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
+ * Mark an inode as dirty. Callers should use mark_inode_dirty or
+ * mark_inode_dirty_sync.
+ *
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
+ *
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
+ *
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
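+ *
+ * As an illustration of how this function is typically reached:
+ * mark_inode_dirty_sync(inode) expands to __mark_inode_dirty(inode,
+ * I_DIRTY_SYNC), mark_inode_dirty(inode) to __mark_inode_dirty(inode,
+ * I_DIRTY), and the page-dirtying paths generally end up calling
+ * __mark_inode_dirty(mapping->host, I_DIRTY_PAGES).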
+ */ +void __mark_inode_dirty(struct inode *inode, int flags) +{ + struct super_block *sb = inode->i_sb; + struct backing_dev_info *bdi = NULL; + + /* + * Don't do this for I_DIRTY_PAGES - that doesn't actually + * dirty the inode itself + */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode, flags); + } + + /* + * make sure that changes are seen by all cpus before we test i_state + * -- mikulas + */ + smp_mb(); + + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + if (unlikely(block_dump > 1)) + block_dump___mark_inode_dirty(inode); + + spin_lock(&inode->i_lock); + if ((inode->i_state & flags) != flags) { + const int was_dirty = inode->i_state & I_DIRTY; + + inode->i_state |= flags; + + /* + * If the inode is being synced, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. + */ + if (inode->i_state & I_SYNC) + goto out_unlock_inode; + + /* + * Only add valid (hashed) inodes to the superblock's + * dirty list. Add blockdev inodes as well. + */ + if (!S_ISBLK(inode->i_mode)) { + if (inode_unhashed(inode)) + goto out_unlock_inode; + } + if (inode->i_state & I_FREEING) + goto out_unlock_inode; + + /* + * If the inode was already on b_dirty/b_io/b_more_io, don't + * reposition it (that would break b_dirty time-ordering). + */ + if (!was_dirty) { + bool wakeup_bdi = false; + bdi = inode_to_bdi(inode); + if (!bdi) + goto out_unlock_inode; + + if (bdi_cap_writeback_dirty(bdi)) { + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi-%s not registered\n", bdi->name); + + /* + * If this is the first dirty inode for this + * bdi, we have to wake-up the corresponding + * bdi thread to make sure background + * write-back happens later. + */ + if (!wb_has_dirty_io(&bdi->wb)) + wakeup_bdi = true; + } + + spin_unlock(&inode->i_lock); + spin_lock(&inode_wb_list_lock); + inode->dirtied_when = jiffies; + list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&inode_wb_list_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); + return; + } + } +out_unlock_inode: + spin_unlock(&inode->i_lock); + +} +EXPORT_SYMBOL(__mark_inode_dirty); + +/* + * Write out a superblock's list of dirty inodes. A wait will be performed + * upon no inodes, all inodes or the final one, depending upon sync_mode. + * + * If older_than_this is non-NULL, then only write out inodes which + * had their first dirtying at a time earlier than *older_than_this. + * + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, + * assume that all inodes are backed by the same queue. + * + * The inodes to be written are parked on bdi->b_io. They are moved back onto + * bdi->b_dirty as they are selected for writing. This way, none can be missed + * on the writer throttling path, and we get decent balancing between many + * throttled threads: we don't want them all piling up on inode_sync_wait. + */ +static void wait_sb_inodes(struct super_block *sb) +{ + struct inode *inode, *old_inode = NULL; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + spin_lock(&inode_sb_list_lock); + + /* + * Data integrity sync. 
Must wait for all pages under writeback,
+	 * because there may have been pages dirtied before our sync
+	 * call, but which had writeout started before we write it out.
+	 * In which case, the inode may not be on the dirty list, but
+	 * we still have to wait for that writeout.
+	 */
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		struct address_space *mapping = inode->i_mapping;
+
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_sb_list_lock);
+
+		/*
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from the s_inodes list while we dropped the
+		 * inode_sb_list_lock.  We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock.  So we keep the reference and iput it
+		 * later.
+		 */
+		iput(old_inode);
+		old_inode = inode;
+
+		filemap_fdatawait(mapping);
+
+		cond_resched();
+
+		spin_lock(&inode_sb_list_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+	iput(old_inode);
+}
+
+/**
+ * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb			= sb,
+		.sync_mode		= WB_SYNC_NONE,
+		.tagged_writepages	= 1,
+		.done			= &done,
+		.nr_pages		= nr,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
+}
+EXPORT_SYMBOL(writeback_inodes_sb_nr);
+
+/**
+ * writeback_inodes_sb - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
+}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * writeback_inodes_sb_if_idle - start writeback if none underway
+ * @sb: the superblock
+ *
+ * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_if_idle(struct super_block *sb)
+{
+	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
+		writeback_inodes_sb(sb);
+		up_read(&sb->s_umount);
+		return 1;
+	} else
+		return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+
+/**
+ * writeback_inodes_sb_nr_if_idle - start writeback if none underway
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ *
+ * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
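+ *
+ * As a rough usage sketch (the names free_blocks, watermark and nr_to_flush
+ * are illustrative only, not symbols defined in this file), a filesystem
+ * running short of space for delayed allocation might do:
+ *
+ *	if (free_blocks < watermark)
+ *		writeback_inodes_sb_nr_if_idle(sb, nr_to_flush);
+ *
+ * so that background writeback is only kicked when no flush is already in
+ * progress.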
+ */ +int writeback_inodes_sb_nr_if_idle(struct super_block *sb, + unsigned long nr) +{ + if (!writeback_in_progress(sb->s_bdi)) { + down_read(&sb->s_umount); + writeback_inodes_sb_nr(sb, nr); + up_read(&sb->s_umount); + return 1; + } else + return 0; +} +EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); + +/** + * sync_inodes_sb - sync sb inode pages + * @sb: the superblock + * + * This function writes and waits on any dirty inode belonging to this + * super_block. + */ +void sync_inodes_sb(struct super_block *sb) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .nr_pages = LONG_MAX, + .range_cyclic = 0, + .done = &done, + }; + + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); + + wait_sb_inodes(sb); +} +EXPORT_SYMBOL(sync_inodes_sb); + +/** + * write_inode_now - write an inode to disk + * @inode: inode to write to disk + * @sync: whether the write should be synchronous or not + * + * This function commits an inode to disk immediately if it is dirty. This is + * primarily needed by knfsd. + * + * The caller must either have a ref on the inode or must have set I_WILL_FREE. + */ +int write_inode_now(struct inode *inode, int sync) +{ + int ret; + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + if (!mapping_cap_writeback_dirty(inode->i_mapping)) + wbc.nr_to_write = 0; + + might_sleep(); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + ret = writeback_single_inode(inode, &wbc); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + if (sync) + inode_sync_wait(inode); + return ret; +} +EXPORT_SYMBOL(write_inode_now); + +/** + * sync_inode - write an inode and its pages to disk. + * @inode: the inode to sync + * @wbc: controls the writeback mode + * + * sync_inode() will write an inode and its pages to disk. It will also + * correctly update the inode on its superblock's dirty inode lists and will + * update inode->i_state. + * + * The caller must have a ref on the inode. + */ +int sync_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); + ret = writeback_single_inode(inode, wbc); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); + return ret; +} +EXPORT_SYMBOL(sync_inode); + +/** + * sync_inode_metadata - write an inode to disk + * @inode: the inode to sync + * @wait: wait for I/O to complete. + * + * Write an inode to disk and adjust its dirty state after completion. + * + * Note: only writes the actual inode, no associated data or other metadata. + */ +int sync_inode_metadata(struct inode *inode, int wait) +{ + struct writeback_control wbc = { + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .nr_to_write = 0, /* metadata-only */ + }; + + return sync_inode(inode, &wbc); +} +EXPORT_SYMBOL(sync_inode_metadata); diff --git a/fs/fs_struct.c b/fs/fs_struct.c new file mode 100644 index 00000000..78b519c1 --- /dev/null +++ b/fs/fs_struct.c @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static inline void path_get_longterm(struct path *path) +{ + path_get(path); + mnt_make_longterm(path->mnt); +} + +static inline void path_put_longterm(struct path *path) +{ + mnt_make_shortterm(path->mnt); + path_put(path); +} + +/* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. 
Put the old values. + * It can block. + */ +void set_fs_root(struct fs_struct *fs, struct path *path) +{ + struct path old_root; + + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + old_root = fs->root; + fs->root = *path; + path_get_longterm(path); + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + if (old_root.dentry) + path_put_longterm(&old_root); +} + +/* + * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_pwd(struct fs_struct *fs, struct path *path) +{ + struct path old_pwd; + + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get_longterm(path); + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put_longterm(&old_pwd); +} + +void chroot_fs_refs(struct path *old_root, struct path *new_root) +{ + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + if (fs->root.dentry == old_root->dentry + && fs->root.mnt == old_root->mnt) { + path_get_longterm(new_root); + fs->root = *new_root; + count++; + } + if (fs->pwd.dentry == old_root->dentry + && fs->pwd.mnt == old_root->mnt) { + path_get_longterm(new_root); + fs->pwd = *new_root; + count++; + } + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + while (count--) + path_put_longterm(old_root); +} + +void free_fs_struct(struct fs_struct *fs) +{ + path_put_longterm(&fs->root); + path_put_longterm(&fs->pwd); + kmem_cache_free(fs_cachep, fs); +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct *fs = tsk->fs; + + if (fs) { + int kill; + task_lock(tsk); + spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); + tsk->fs = NULL; + kill = !--fs->users; + write_seqcount_end(&fs->seq); + spin_unlock(&fs->lock); + task_unlock(tsk); + if (kill) + free_fs_struct(fs); + } +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + fs->users = 1; + fs->in_exec = 0; + spin_lock_init(&fs->lock); + seqcount_init(&fs->seq); + fs->umask = old->umask; + + spin_lock(&old->lock); + fs->root = old->root; + path_get_longterm(&fs->root); + fs->pwd = old->pwd; + path_get_longterm(&fs->pwd); + spin_unlock(&old->lock); + } + return fs; +} + +int unshare_fs_struct(void) +{ + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; + + if (!new_fs) + return -ENOMEM; + + task_lock(current); + spin_lock(&fs->lock); + kill = !--fs->users; + current->fs = new_fs; + spin_unlock(&fs->lock); + task_unlock(current); + + if (kill) + free_fs_struct(fs); + + return 0; +} +EXPORT_SYMBOL_GPL(unshare_fs_struct); + +int current_umask(void) +{ + return current->fs->umask; +} +EXPORT_SYMBOL(current_umask); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .users = 1, + .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), + .seq = SEQCNT_ZERO, + .umask = 0022, +}; + +void daemonize_fs_struct(void) +{ + struct fs_struct *fs = current->fs; + + if (fs) { + int kill; + + task_lock(current); + + spin_lock(&init_fs.lock); + init_fs.users++; + spin_unlock(&init_fs.lock); + + spin_lock(&fs->lock); + current->fs = &init_fs; + kill = !--fs->users; + spin_unlock(&fs->lock); + + 
task_unlock(current);
+		if (kill)
+			free_fs_struct(fs);
+	}
+}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 00000000..3f6dfa98
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,61 @@
+
+config FSCACHE
+	tristate "General filesystem local caching manager"
+	help
+	  This option enables a generic filesystem caching manager that can be
+	  used by various network and other filesystems to cache data locally.
+	  Different sorts of caches can be plugged in, depending on the
+	  resources available.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_STATS
+	bool "Gather statistical information on local caching"
+	depends on FSCACHE && PROC_FS
+	help
+	  This option causes statistical information to be gathered on local
+	  caching and exported through the file:
+
+		/proc/fs/fscache/stats
+
+	  The gathering of statistics adds a certain amount of overhead to
+	  execution as there are quite a few stats gathered, and on a
+	  multi-CPU system these may be on cachelines that keep bouncing
+	  between CPUs.  On the other hand, the stats are very useful for
+	  debugging purposes.  Saying 'Y' here is recommended.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_HISTOGRAM
+	bool "Gather latency information on local caching"
+	depends on FSCACHE && PROC_FS
+	help
+	  This option causes latency information to be gathered on local
+	  caching and exported through the file:
+
+		/proc/fs/fscache/histogram
+
+	  The generation of this histogram adds a certain amount of overhead to
+	  execution as there are a number of points at which data is gathered,
+	  and on a multi-CPU system these may be on cachelines that keep
+	  bouncing between CPUs.  On the other hand, the histogram may be
+	  useful for debugging purposes.  Saying 'N' here is recommended.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_DEBUG
+	bool "Debug FS-Cache"
+	depends on FSCACHE
+	help
+	  This permits debugging to be dynamically enabled in the local caching
+	  management module.  If this is set, the debugging output may be
+	  enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_OBJECT_LIST
+	bool "Maintain global object list for debugging purposes"
+	depends on FSCACHE && PROC_FS
+	help
+	  Maintain a global list of active fscache objects that can be
+	  retrieved through /proc/fs/fscache/objects for debugging purposes.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 00000000..6d561531
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for general filesystem caching code
+#
+
+fscache-y := \
+	cache.o \
+	cookie.o \
+	fsdef.o \
+	main.o \
+	netfs.o \
+	object.o \
+	operation.o \
+	page.o
+
+fscache-$(CONFIG_PROC_FS) += proc.o
+fscache-$(CONFIG_FSCACHE_STATS) += stats.o
+fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o
+
+obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 00000000..6a3c48ab
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,420 @@
+/* FS-Cache cache handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include +#include "internal.h" + +LIST_HEAD(fscache_cache_list); +DECLARE_RWSEM(fscache_addremove_sem); +DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq); +EXPORT_SYMBOL(fscache_cache_cleared_wq); + +static LIST_HEAD(fscache_cache_tag_list); + +/* + * look up a cache tag + */ +struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name) +{ + struct fscache_cache_tag *tag, *xtag; + + /* firstly check for the existence of the tag under read lock */ + down_read(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_read(&fscache_addremove_sem); + return tag; + } + } + + up_read(&fscache_addremove_sem); + + /* the tag does not exist - create a candidate */ + xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL); + if (!xtag) + /* return a dummy tag if out of memory */ + return ERR_PTR(-ENOMEM); + + atomic_set(&xtag->usage, 1); + strcpy(xtag->name, name); + + /* write lock, search again and add if still not present */ + down_write(&fscache_addremove_sem); + + list_for_each_entry(tag, &fscache_cache_tag_list, link) { + if (strcmp(tag->name, name) == 0) { + atomic_inc(&tag->usage); + up_write(&fscache_addremove_sem); + kfree(xtag); + return tag; + } + } + + list_add_tail(&xtag->link, &fscache_cache_tag_list); + up_write(&fscache_addremove_sem); + return xtag; +} + +/* + * release a reference to a cache tag + */ +void __fscache_release_cache_tag(struct fscache_cache_tag *tag) +{ + if (tag != ERR_PTR(-ENOMEM)) { + down_write(&fscache_addremove_sem); + + if (atomic_dec_and_test(&tag->usage)) + list_del_init(&tag->link); + else + tag = NULL; + + up_write(&fscache_addremove_sem); + + kfree(tag); + } +} + +/* + * select a cache in which to store an object + * - the cache addremove semaphore must be at least read-locked by the caller + * - the object will never be an index + */ +struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *cookie) +{ + struct fscache_cache_tag *tag; + struct fscache_object *object; + struct fscache_cache *cache; + + _enter(""); + + if (list_empty(&fscache_cache_list)) { + _leave(" = NULL [no cache]"); + return NULL; + } + + /* we check the parent to determine the cache to use */ + spin_lock(&cookie->lock); + + /* the first in the parent's backing list should be the preferred + * cache */ + if (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + cache = object->cache; + if (object->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &cache->flags)) + cache = NULL; + + spin_unlock(&cookie->lock); + _leave(" = %p [parent]", cache); + return cache; + } + + /* the parent is unbacked */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + /* cookie not an index and is unbacked */ + spin_unlock(&cookie->lock); + _leave(" = NULL [cookie ub,ni]"); + return NULL; + } + + spin_unlock(&cookie->lock); + + if (!cookie->def->select_cache) + goto no_preference; + + /* ask the netfs for its preference */ + tag = cookie->def->select_cache(cookie->parent->netfs_data, + cookie->netfs_data); + if 
(!tag) + goto no_preference; + + if (tag == ERR_PTR(-ENOMEM)) { + _leave(" = NULL [nomem tag]"); + return NULL; + } + + if (!tag->cache) { + _leave(" = NULL [unbacked tag]"); + return NULL; + } + + if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) + return NULL; + + _leave(" = %p [specific]", tag->cache); + return tag->cache; + +no_preference: + /* netfs has no preference - just select first cache */ + cache = list_entry(fscache_cache_list.next, + struct fscache_cache, link); + _leave(" = %p [first]", cache); + return cache; +} + +/** + * fscache_init_cache - Initialise a cache record + * @cache: The cache record to be initialised + * @ops: The cache operations to be installed in that record + * @idfmt: Format string to define identifier + * @...: sprintf-style arguments + * + * Initialise a record of a cache and fill in the name. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_init_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + const char *idfmt, + ...) +{ + va_list va; + + memset(cache, 0, sizeof(*cache)); + + cache->ops = ops; + + va_start(va, idfmt); + vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va); + va_end(va); + + INIT_WORK(&cache->op_gc, fscache_operation_gc); + INIT_LIST_HEAD(&cache->link); + INIT_LIST_HEAD(&cache->object_list); + INIT_LIST_HEAD(&cache->op_gc_list); + spin_lock_init(&cache->object_list_lock); + spin_lock_init(&cache->op_gc_list_lock); +} +EXPORT_SYMBOL(fscache_init_cache); + +/** + * fscache_add_cache - Declare a cache as being open for business + * @cache: The record describing the cache + * @ifsdef: The record of the cache object describing the top-level index + * @tagname: The tag describing this cache + * + * Add a cache to the system, making it available for netfs's to use. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. 
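+ *
+ * As a rough sketch of backend usage (the names my_cache, my_cache_ops and
+ * my_root_object are illustrative only), bringing a cache online might look
+ * like:
+ *
+ *	fscache_init_cache(&my_cache, &my_cache_ops, "mycache");
+ *	ret = fscache_add_cache(&my_cache, my_root_object, "mytag");
+ *
+ * after which netfs cookies may be backed by objects in this cache.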
+ */ +int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *ifsdef, + const char *tagname) +{ + struct fscache_cache_tag *tag; + + BUG_ON(!cache->ops); + BUG_ON(!ifsdef); + + cache->flags = 0; + ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); + ifsdef->state = FSCACHE_OBJECT_ACTIVE; + + if (!tagname) + tagname = cache->identifier; + + BUG_ON(!tagname[0]); + + _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname); + + /* we use the cache tag to uniquely identify caches */ + tag = __fscache_lookup_cache_tag(tagname); + if (IS_ERR(tag)) + goto nomem; + + if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags)) + goto tag_in_use; + + cache->kobj = kobject_create_and_add(tagname, fscache_root); + if (!cache->kobj) + goto error; + + ifsdef->cookie = &fscache_fsdef_index; + ifsdef->cache = cache; + cache->fsdef = ifsdef; + + down_write(&fscache_addremove_sem); + + tag->cache = cache; + cache->tag = tag; + + /* add the cache to the list */ + list_add(&cache->link, &fscache_cache_list); + + /* add the cache's netfs definition index object to the cache's + * list */ + spin_lock(&cache->object_list_lock); + list_add_tail(&ifsdef->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + fscache_objlist_add(ifsdef); + + /* add the cache's netfs definition index object to the top level index + * cookie as a known backing object */ + spin_lock(&fscache_fsdef_index.lock); + + hlist_add_head(&ifsdef->cookie_link, + &fscache_fsdef_index.backing_objects); + + atomic_inc(&fscache_fsdef_index.usage); + + /* done */ + spin_unlock(&fscache_fsdef_index.lock); + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n", + cache->tag->name, cache->ops->name); + kobject_uevent(cache->kobj, KOBJ_ADD); + + _leave(" = 0 [%s]", cache->identifier); + return 0; + +tag_in_use: + printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname); + __fscache_release_cache_tag(tag); + _leave(" = -EXIST"); + return -EEXIST; + +error: + __fscache_release_cache_tag(tag); + _leave(" = -EINVAL"); + return -EINVAL; + +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(fscache_add_cache); + +/** + * fscache_io_error - Note a cache I/O error + * @cache: The record describing the cache + * + * Note that an I/O error occurred in a cache and that it should no longer be + * used for anything. This also reports the error into the kernel log. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. 
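+ *
+ * A backend would typically call this from its I/O paths when the backing
+ * filesystem returns an unrecoverable error, for instance (illustratively,
+ * my_cache being the backend's cache record):
+ *
+ *	if (ret == -EIO)
+ *		fscache_io_error(&my_cache);
+ *
+ * after which the cache is normally withdrawn from service.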
+ */ +void fscache_io_error(struct fscache_cache *cache) +{ + set_bit(FSCACHE_IOERROR, &cache->flags); + + printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n", + cache->ops->name); +} +EXPORT_SYMBOL(fscache_io_error); + +/* + * request withdrawal of all the objects in a cache + * - all the objects being withdrawn are moved onto the supplied list + */ +static void fscache_withdraw_all_objects(struct fscache_cache *cache, + struct list_head *dying_objects) +{ + struct fscache_object *object; + + spin_lock(&cache->object_list_lock); + + while (!list_empty(&cache->object_list)) { + object = list_entry(cache->object_list.next, + struct fscache_object, cache_link); + list_move_tail(&object->cache_link, dying_objects); + + _debug("withdraw %p", object->cookie); + + spin_lock(&object->lock); + spin_unlock(&cache->object_list_lock); + fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW); + spin_unlock(&object->lock); + + cond_resched(); + spin_lock(&cache->object_list_lock); + } + + spin_unlock(&cache->object_list_lock); +} + +/** + * fscache_withdraw_cache - Withdraw a cache from the active service + * @cache: The record describing the cache + * + * Withdraw a cache from service, unbinding all its cache objects from the + * netfs cookies they're currently representing. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_withdraw_cache(struct fscache_cache *cache) +{ + LIST_HEAD(dying_objects); + + _enter(""); + + printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n", + cache->tag->name); + + /* make the cache unavailable for cookie acquisition */ + if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) + BUG(); + + down_write(&fscache_addremove_sem); + list_del_init(&cache->link); + cache->tag->cache = NULL; + up_write(&fscache_addremove_sem); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disk */ + fscache_stat(&fscache_n_cop_sync_cache); + cache->ops->sync_cache(cache); + fscache_stat_d(&fscache_n_cop_sync_cache); + + /* dissociate all the netfs pages backed by this cache from the block + * mappings in the cache */ + fscache_stat(&fscache_n_cop_dissociate_pages); + cache->ops->dissociate_pages(cache); + fscache_stat_d(&fscache_n_cop_dissociate_pages); + + /* we now have to destroy all the active objects pertaining to this + * cache - which we do by passing them off to thread pool to be + * disposed of */ + _debug("destroy"); + + fscache_withdraw_all_objects(cache, &dying_objects); + + /* wait for all extant objects to finish their outstanding operations + * and go away */ + _debug("wait for finish"); + wait_event(fscache_cache_cleared_wq, + atomic_read(&cache->object_count) == 0); + _debug("wait for clearance"); + wait_event(fscache_cache_cleared_wq, + list_empty(&cache->object_list)); + _debug("cleared"); + ASSERT(list_empty(&dying_objects)); + + kobject_put(cache->kobj); + + clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags); + fscache_release_cache_tag(cache->tag); + cache->tag = NULL; + + _leave(""); +} +EXPORT_SYMBOL(fscache_withdraw_cache); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c new file mode 100644 index 00000000..99053507 --- /dev/null +++ b/fs/fscache/cookie.c @@ -0,0 +1,514 @@ +/* netfs cookie management + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for more information on
+ * the netfs API.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include
+#include
+#include "internal.h"
+
+struct kmem_cache *fscache_cookie_jar;
+
+static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
+
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
+static int fscache_alloc_object(struct fscache_cache *cache,
+				struct fscache_cookie *cookie);
+static int fscache_attach_object(struct fscache_cookie *cookie,
+				 struct fscache_object *object);
+
+/*
+ * initialise a cookie jar slab element prior to any use
+ */
+void fscache_cookie_init_once(void *_cookie)
+{
+	struct fscache_cookie *cookie = _cookie;
+
+	memset(cookie, 0, sizeof(*cookie));
+	spin_lock_init(&cookie->lock);
+	spin_lock_init(&cookie->stores_lock);
+	INIT_HLIST_HEAD(&cookie->backing_objects);
+}
+
+/*
+ * request a cookie to represent an object (index, datafile, xattr, etc.)
+ * - parent specifies the parent object
+ *   - the top level index cookie for each netfs is stored in the fscache_netfs
+ *     struct upon registration
+ * - def points to the definition
+ * - the netfs_data will be passed to the functions pointed to in *def
+ * - all attached caches will be searched to see if they contain this object
+ * - index objects aren't stored on disk until there's a dependent file that
+ *   needs storing
+ * - other objects are stored in a selected cache immediately, and all the
+ *   indices forming the path to it are instantiated if necessary
+ * - we never let on to the netfs about errors
+ *   - we may set a negative cookie pointer, but that's okay
+ */
+struct fscache_cookie *__fscache_acquire_cookie(
+	struct fscache_cookie *parent,
+	const struct fscache_cookie_def *def,
+	void *netfs_data)
+{
+	struct fscache_cookie *cookie;
+
+	BUG_ON(!def);
+
+	_enter("{%s},{%s},%p",
+	       parent ?
(char *) parent->def->name : "", + def->name, netfs_data); + + fscache_stat(&fscache_n_acquires); + + /* if there's no parent cookie, then we don't create one here either */ + if (!parent) { + fscache_stat(&fscache_n_acquires_null); + _leave(" [no parent]"); + return NULL; + } + + /* validate the definition */ + BUG_ON(!def->get_key); + BUG_ON(!def->name[0]); + + BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && + parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + if (!cookie) { + fscache_stat(&fscache_n_acquires_oom); + _leave(" [ENOMEM]"); + return NULL; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + + atomic_inc(&parent->usage); + atomic_inc(&parent->n_children); + + cookie->def = def; + cookie->parent = parent; + cookie->netfs_data = netfs_data; + cookie->flags = 0; + + /* radix tree insertion won't use the preallocation pool unless it's + * told it may not wait */ + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT); + + switch (cookie->def->type) { + case FSCACHE_COOKIE_TYPE_INDEX: + fscache_stat(&fscache_n_cookie_index); + break; + case FSCACHE_COOKIE_TYPE_DATAFILE: + fscache_stat(&fscache_n_cookie_data); + break; + default: + fscache_stat(&fscache_n_cookie_special); + break; + } + + /* if the object is an index then we need do nothing more here - we + * create indices on disk when we need them as an index may exist in + * multiple caches */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie) < 0) { + atomic_dec(&parent->n_children); + __fscache_cookie_put(cookie); + fscache_stat(&fscache_n_acquires_nobufs); + _leave(" = NULL"); + return NULL; + } + } + + fscache_stat(&fscache_n_acquires_ok); + _leave(" = %p", cookie); + return cookie; +} +EXPORT_SYMBOL(__fscache_acquire_cookie); + +/* + * acquire a non-index cookie + * - this must make sure the index chain is instantiated and instantiate the + * object representation too + */ +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct fscache_cache *cache; + uint64_t i_size; + int ret; + + _enter(""); + + cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE; + + /* now we need to see whether the backing objects for this cookie yet + * exist, if not there'll be nothing to search */ + down_read(&fscache_addremove_sem); + + if (list_empty(&fscache_cache_list)) { + up_read(&fscache_addremove_sem); + _leave(" = 0 [no caches]"); + return 0; + } + + /* select a cache in which to store the object */ + cache = fscache_select_cache_for_object(cookie->parent); + if (!cache) { + up_read(&fscache_addremove_sem); + fscache_stat(&fscache_n_acquires_no_cache); + _leave(" = -ENOMEDIUM [no cache]"); + return -ENOMEDIUM; + } + + _debug("cache %s", cache->tag->name); + + cookie->flags = + (1 << FSCACHE_COOKIE_LOOKING_UP) | + (1 << FSCACHE_COOKIE_CREATING) | + (1 << FSCACHE_COOKIE_NO_DATA_YET); + + /* ask the cache to allocate objects for this cookie and its parent + * chain */ + ret = fscache_alloc_object(cache, cookie); + if (ret < 0) { + up_read(&fscache_addremove_sem); + _leave(" = %d", ret); + return ret; + } + + /* pass on how big the object we're caching is supposed to be */ + cookie->def->get_attr(cookie->netfs_data, &i_size); + + spin_lock(&cookie->lock); + if (hlist_empty(&cookie->backing_objects)) { + spin_unlock(&cookie->lock); + goto unavailable; + } + + object = 
hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + fscache_set_store_limit(object, i_size); + + /* initiate the process of looking up all the objects in the chain + * (done by fscache_initialise_object()) */ + fscache_enqueue_object(object); + + spin_unlock(&cookie->lock); + + /* we may be required to wait for lookup to complete at this point */ + if (!fscache_defer_lookup) { + _debug("non-deferred lookup %p", &cookie->flags); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + _debug("complete"); + if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) + goto unavailable; + } + + up_read(&fscache_addremove_sem); + _leave(" = 0 [deferred]"); + return 0; + +unavailable: + up_read(&fscache_addremove_sem); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} + +/* + * recursively allocate cache object records for a cookie/cache combination + * - caller must be holding the addremove sem + */ +static int fscache_alloc_object(struct fscache_cache *cache, + struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_n; + int ret; + + _enter("%p,%p{%s}", cache, cookie, cookie->def->name); + + spin_lock(&cookie->lock); + hlist_for_each_entry(object, _n, &cookie->backing_objects, + cookie_link) { + if (object->cache == cache) + goto object_already_extant; + } + spin_unlock(&cookie->lock); + + /* ask the cache to allocate an object (we may end up with duplicate + * objects at this stage, but we sort that out later) */ + fscache_stat(&fscache_n_cop_alloc_object); + object = cache->ops->alloc_object(cache, cookie); + fscache_stat_d(&fscache_n_cop_alloc_object); + if (IS_ERR(object)) { + fscache_stat(&fscache_n_object_no_alloc); + ret = PTR_ERR(object); + goto error; + } + + fscache_stat(&fscache_n_object_alloc); + + object->debug_id = atomic_inc_return(&fscache_object_debug_id); + + _debug("ALLOC OBJ%x: %s {%lx}", + object->debug_id, cookie->def->name, object->events); + + ret = fscache_alloc_object(cache, cookie->parent); + if (ret < 0) + goto error_put; + + /* only attach if we managed to allocate all we needed, otherwise + * discard the object we just allocated and instead use the one + * attached to the cookie */ + if (fscache_attach_object(cookie, object) < 0) { + fscache_stat(&fscache_n_cop_put_object); + cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); + } + + _leave(" = 0"); + return 0; + +object_already_extant: + ret = -ENOBUFS; + if (object->state >= FSCACHE_OBJECT_DYING) { + spin_unlock(&cookie->lock); + goto error; + } + spin_unlock(&cookie->lock); + _leave(" = 0 [found]"); + return 0; + +error_put: + fscache_stat(&fscache_n_cop_put_object); + cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * attach a cache object to a cookie + */ +static int fscache_attach_object(struct fscache_cookie *cookie, + struct fscache_object *object) +{ + struct fscache_object *p; + struct fscache_cache *cache = object->cache; + struct hlist_node *_n; + int ret; + + _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + + spin_lock(&cookie->lock); + + /* there may be multiple initial creations of this object, but we only + * want one */ + ret = -EEXIST; + hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) + ret = -ENOBUFS; + goto cant_attach_object; + } + } + + /* pin the parent object */ + 
spin_lock_nested(&cookie->parent->lock, 1); + hlist_for_each_entry(p, _n, &cookie->parent->backing_objects, + cookie_link) { + if (p->cache == object->cache) { + if (p->state >= FSCACHE_OBJECT_DYING) { + ret = -ENOBUFS; + spin_unlock(&cookie->parent->lock); + goto cant_attach_object; + } + object->parent = p; + spin_lock(&p->lock); + p->n_children++; + spin_unlock(&p->lock); + break; + } + } + spin_unlock(&cookie->parent->lock); + + /* attach to the cache's object list */ + if (list_empty(&object->cache_link)) { + spin_lock(&cache->object_list_lock); + list_add(&object->cache_link, &cache->object_list); + spin_unlock(&cache->object_list_lock); + } + + /* attach to the cookie */ + object->cookie = cookie; + atomic_inc(&cookie->usage); + hlist_add_head(&object->cookie_link, &cookie->backing_objects); + + fscache_objlist_add(object); + ret = 0; + +cant_attach_object: + spin_unlock(&cookie->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * update the index entries backing a cookie + */ +void __fscache_update_cookie(struct fscache_cookie *cookie) +{ + struct fscache_object *object; + struct hlist_node *_p; + + fscache_stat(&fscache_n_updates); + + if (!cookie) { + fscache_stat(&fscache_n_updates_null); + _leave(" [no cookie]"); + return; + } + + _enter("{%s}", cookie->def->name); + + BUG_ON(!cookie->def->get_aux); + + spin_lock(&cookie->lock); + + /* update the index entry on disk in each cache backing this cookie */ + hlist_for_each_entry(object, _p, + &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); + } + + spin_unlock(&cookie->lock); + _leave(""); +} +EXPORT_SYMBOL(__fscache_update_cookie); + +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disk if retire is true + * - all dependents of this cookie must have already been unregistered + * (indices/files/pages) + */ +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) +{ + struct fscache_cache *cache; + struct fscache_object *object; + unsigned long event; + + fscache_stat(&fscache_n_relinquishes); + if (retire) + fscache_stat(&fscache_n_relinquishes_retire); + + if (!cookie) { + fscache_stat(&fscache_n_relinquishes_null); + _leave(" [no cookie]"); + return; + } + + _enter("%p{%s,%p},%d", + cookie, cookie->def->name, cookie->netfs_data, retire); + + if (atomic_read(&cookie->n_children) != 0) { + printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", + cookie->def->name); + BUG(); + } + + /* wait for the cookie to finish being instantiated (or to fail) */ + if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { + fscache_stat(&fscache_n_relinquishes_waitcrt); + wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + + event = retire ? 
FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; + + spin_lock(&cookie->lock); + + /* break links with all the active objects */ + while (!hlist_empty(&cookie->backing_objects)) { + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, + cookie_link); + + _debug("RELEASE OBJ%x", object->debug_id); + + /* detach each cache object from the object cookie */ + spin_lock(&object->lock); + hlist_del_init(&object->cookie_link); + + cache = object->cache; + object->cookie = NULL; + fscache_raise_event(object, event); + spin_unlock(&object->lock); + + if (atomic_dec_and_test(&cookie->usage)) + /* the cookie refcount shouldn't be reduced to 0 yet */ + BUG(); + } + + /* detach pointers back to the netfs */ + cookie->netfs_data = NULL; + cookie->def = NULL; + + spin_unlock(&cookie->lock); + + if (cookie->parent) { + ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); + ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); + atomic_dec(&cookie->parent->n_children); + } + + /* finally dispose of the cookie */ + ASSERTCMP(atomic_read(&cookie->usage), >, 0); + fscache_cookie_put(cookie); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_relinquish_cookie); + +/* + * destroy a cookie + */ +void __fscache_cookie_put(struct fscache_cookie *cookie) +{ + struct fscache_cookie *parent; + + _enter("%p", cookie); + + for (;;) { + _debug("FREE COOKIE %p", cookie); + parent = cookie->parent; + BUG_ON(!hlist_empty(&cookie->backing_objects)); + kmem_cache_free(fscache_cookie_jar, cookie); + + if (!parent) + break; + + cookie = parent; + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (!atomic_dec_and_test(&cookie->usage)) + break; + } + + _leave(""); +} diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c new file mode 100644 index 00000000..f5b4baee --- /dev/null +++ b/fs/fscache/fsdef.c @@ -0,0 +1,144 @@ +/* Filesystem index definition + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include "internal.h" + +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax); + +static +enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen); + +/* + * The root index is owned by FS-Cache itself. + * + * When a netfs requests caching facilities, FS-Cache will, if one doesn't + * already exist, create an entry in the root index with the key being the name + * of the netfs ("AFS" for example), and the auxiliary data holding the index + * structure version supplied by the netfs: + * + * FSDEF + * | + * +-----------+ + * | | + * NFS AFS + * [v=1] [v=1] + * + * If an entry with the appropriate name does already exist, the version is + * compared. If the version is different, the entire subtree from that entry + * will be discarded and a new entry created. + * + * The new entry will be an index, and a cookie referring to it will be passed + * to the netfs. This is then the root handle by which the netfs accesses the + * cache. It can create whatever objects it likes in that index, including + * further indices. 
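+ *
+ * A netfs would typically obtain its entry under FSDEF by declaring
+ * something like (illustrative only):
+ *
+ *	static struct fscache_netfs my_netfs = {
+ *		.name		= "myfs",
+ *		.version	= 1,
+ *	};
+ *
+ * and calling fscache_register_netfs(&my_netfs) at initialisation time,
+ * which creates (or reuses) the matching index entry described above.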
+ */ +static struct fscache_cookie_def fscache_fsdef_index_def = { + .name = ".FS-Cache", + .type = FSCACHE_COOKIE_TYPE_INDEX, +}; + +struct fscache_cookie fscache_fsdef_index = { + .usage = ATOMIC_INIT(1), + .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), + .backing_objects = HLIST_HEAD_INIT, + .def = &fscache_fsdef_index_def, +}; +EXPORT_SYMBOL(fscache_fsdef_index); + +/* + * Definition of an entry in the root index. Each entry is an index, keyed to + * a specific netfs and only applicable to a particular version of the index + * structure used by that netfs. + */ +struct fscache_cookie_def fscache_fsdef_netfs_def = { + .name = "FSDEF.netfs", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = fscache_fsdef_netfs_get_key, + .get_aux = fscache_fsdef_netfs_get_aux, + .check_aux = fscache_fsdef_netfs_check_aux, +}; + +/* + * get the key data for an FSDEF index record - this is the name of the netfs + * for which this entry is created + */ +static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned klen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + klen = strlen(netfs->name); + if (klen > bufmax) + return 0; + + memcpy(buffer, netfs->name, klen); + return klen; +} + +/* + * get the auxiliary data for an FSDEF index record - this is the index + * structure version number of the netfs for which this version is created + */ +static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct fscache_netfs *netfs = cookie_netfs_data; + unsigned dlen; + + _enter("{%s.%u},", netfs->name, netfs->version); + + dlen = sizeof(uint32_t); + if (dlen > bufmax) + return 0; + + memcpy(buffer, &netfs->version, dlen); + return dlen; +} + +/* + * check that the index structure version number stored in the auxiliary data + * matches the one the netfs gave us + */ +static enum fscache_checkaux fscache_fsdef_netfs_check_aux( + void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct fscache_netfs *netfs = cookie_netfs_data; + uint32_t version; + + _enter("{%s},,%hu", netfs->name, datalen); + + if (datalen != sizeof(version)) { + _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version)); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + memcpy(&version, data, sizeof(version)); + if (version != netfs->version) { + _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version); + return FSCACHE_CHECKAUX_OBSOLETE; + } + + _leave(" = OKAY"); + return FSCACHE_CHECKAUX_OKAY; +} diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c new file mode 100644 index 00000000..bad49674 --- /dev/null +++ b/fs/fscache/histogram.c @@ -0,0 +1,109 @@ +/* FS-Cache latency histogram + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +atomic_t fscache_obj_instantiate_histogram[HZ]; +atomic_t fscache_objs_histogram[HZ]; +atomic_t fscache_ops_histogram[HZ]; +atomic_t fscache_retrieval_delay_histogram[HZ]; +atomic_t fscache_retrieval_histogram[HZ]; + +/* + * display the time-taken histogram + */ +static int fscache_histogram_show(struct seq_file *m, void *v) +{ + unsigned long index; + unsigned n[5], t; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " + " RETRV DLY RETRIEVLS\n"); + return 0; + case 2: + seq_puts(m, "===== ===== ========= ========= =========" + " ========= =========\n"); + return 0; + default: + index = (unsigned long) v - 3; + n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]); + n[1] = atomic_read(&fscache_ops_histogram[index]); + n[2] = atomic_read(&fscache_objs_histogram[index]); + n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]); + n[4] = atomic_read(&fscache_retrieval_histogram[index]); + if (!(n[0] | n[1] | n[2] | n[3] | n[4])) + return 0; + + t = (index * 1000) / HZ; + + seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n", + index, t, n[0], n[1], n[2], n[3], n[4]); + return 0; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos) +{ + if ((unsigned long long)*_pos >= HZ + 2) + return NULL; + if (*_pos == 0) + *_pos = 1; + return (void *)(unsigned long) *_pos; +} + +/* + * move to the next line + */ +static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (unsigned long long)*pos > HZ + 2 ? + NULL : (void *)(unsigned long) *pos; +} + +/* + * clean up after reading + */ +static void fscache_histogram_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations fscache_histogram_ops = { + .start = fscache_histogram_start, + .stop = fscache_histogram_stop, + .next = fscache_histogram_next, + .show = fscache_histogram_show, +}; + +/* + * open "/proc/fs/fscache/histogram" to provide latency data + */ +static int fscache_histogram_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fscache_histogram_ops); +} + +const struct file_operations fscache_histogram_fops = { + .owner = THIS_MODULE, + .open = fscache_histogram_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h new file mode 100644 index 00000000..f6aad48d --- /dev/null +++ b/fs/fscache/internal.h @@ -0,0 +1,438 @@ +/* Internal definitions for FS-Cache + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * Lock order, in the order in which multiple locks should be obtained: + * - fscache_addremove_sem + * - cookie->lock + * - cookie->parent->lock + * - cache->object_list_lock + * - object->lock + * - object->parent->lock + * - cookie->stores_lock + * - fscache_thread_lock + * + */ + +#include +#include + +#define FSCACHE_MIN_THREADS 4 +#define FSCACHE_MAX_THREADS 32 + +/* + * cache.c + */ +extern struct list_head fscache_cache_list; +extern struct rw_semaphore fscache_addremove_sem; + +extern struct fscache_cache *fscache_select_cache_for_object( + struct fscache_cookie *); + +/* + * cookie.c + */ +extern struct kmem_cache *fscache_cookie_jar; + +extern void fscache_cookie_init_once(void *); +extern void __fscache_cookie_put(struct fscache_cookie *); + +/* + * fsdef.c + */ +extern struct fscache_cookie fscache_fsdef_index; +extern struct fscache_cookie_def fscache_fsdef_netfs_def; + +/* + * histogram.c + */ +#ifdef CONFIG_FSCACHE_HISTOGRAM +extern atomic_t fscache_obj_instantiate_histogram[HZ]; +extern atomic_t fscache_objs_histogram[HZ]; +extern atomic_t fscache_ops_histogram[HZ]; +extern atomic_t fscache_retrieval_delay_histogram[HZ]; +extern atomic_t fscache_retrieval_histogram[HZ]; + +static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) +{ + unsigned long jif = jiffies - start_jif; + if (jif >= HZ) + jif = HZ - 1; + atomic_inc(&histogram[jif]); +} + +extern const struct file_operations fscache_histogram_fops; + +#else +#define fscache_hist(hist, start_jif) do {} while (0) +#endif + +/* + * main.c + */ +extern unsigned fscache_defer_lookup; +extern unsigned fscache_defer_create; +extern unsigned fscache_debug; +extern struct kobject *fscache_root; +extern struct workqueue_struct *fscache_object_wq; +extern struct workqueue_struct *fscache_op_wq; +DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +static inline bool fscache_object_congested(void) +{ + return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); +} + +extern int fscache_wait_bit(void *); +extern int fscache_wait_bit_interruptible(void *); + +/* + * object.c + */ +extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5]; + +extern void fscache_withdrawing_object(struct fscache_cache *, + struct fscache_object *); +extern void fscache_enqueue_object(struct fscache_object *); + +/* + * object-list.c + */ +#ifdef CONFIG_FSCACHE_OBJECT_LIST +extern const struct file_operations fscache_objlist_fops; + +extern void fscache_objlist_add(struct fscache_object *); +#else +#define fscache_objlist_add(object) do {} while(0) +#endif + +/* + * operation.c + */ +extern int fscache_submit_exclusive_op(struct fscache_object *, + struct fscache_operation *); +extern int fscache_submit_op(struct fscache_object *, + struct fscache_operation *); +extern int fscache_cancel_op(struct fscache_operation *); +extern void fscache_abort_object(struct fscache_object *); +extern void fscache_start_operations(struct fscache_object *); +extern void fscache_operation_gc(struct work_struct *); + +/* + * proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; +extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS]; + +extern atomic_t fscache_n_op_pend; +extern atomic_t fscache_n_op_run; +extern atomic_t 
fscache_n_op_enqueue; +extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_release; +extern atomic_t fscache_n_op_gc; +extern atomic_t fscache_n_op_cancelled; +extern atomic_t fscache_n_op_rejected; + +extern atomic_t fscache_n_attr_changed; +extern atomic_t fscache_n_attr_changed_ok; +extern atomic_t fscache_n_attr_changed_nobufs; +extern atomic_t fscache_n_attr_changed_nomem; +extern atomic_t fscache_n_attr_changed_calls; + +extern atomic_t fscache_n_allocs; +extern atomic_t fscache_n_allocs_ok; +extern atomic_t fscache_n_allocs_wait; +extern atomic_t fscache_n_allocs_nobufs; +extern atomic_t fscache_n_allocs_intr; +extern atomic_t fscache_n_allocs_object_dead; +extern atomic_t fscache_n_alloc_ops; +extern atomic_t fscache_n_alloc_op_waits; + +extern atomic_t fscache_n_retrievals; +extern atomic_t fscache_n_retrievals_ok; +extern atomic_t fscache_n_retrievals_wait; +extern atomic_t fscache_n_retrievals_nodata; +extern atomic_t fscache_n_retrievals_nobufs; +extern atomic_t fscache_n_retrievals_intr; +extern atomic_t fscache_n_retrievals_nomem; +extern atomic_t fscache_n_retrievals_object_dead; +extern atomic_t fscache_n_retrieval_ops; +extern atomic_t fscache_n_retrieval_op_waits; + +extern atomic_t fscache_n_stores; +extern atomic_t fscache_n_stores_ok; +extern atomic_t fscache_n_stores_again; +extern atomic_t fscache_n_stores_nobufs; +extern atomic_t fscache_n_stores_oom; +extern atomic_t fscache_n_store_ops; +extern atomic_t fscache_n_store_calls; +extern atomic_t fscache_n_store_pages; +extern atomic_t fscache_n_store_radix_deletes; +extern atomic_t fscache_n_store_pages_over_limit; + +extern atomic_t fscache_n_store_vmscan_not_storing; +extern atomic_t fscache_n_store_vmscan_gone; +extern atomic_t fscache_n_store_vmscan_busy; +extern atomic_t fscache_n_store_vmscan_cancelled; + +extern atomic_t fscache_n_marks; +extern atomic_t fscache_n_uncaches; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_null; +extern atomic_t fscache_n_acquires_no_cache; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_nobufs; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_updates; +extern atomic_t fscache_n_updates_null; +extern atomic_t fscache_n_updates_run; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_null; +extern atomic_t fscache_n_relinquishes_waitcrt; +extern atomic_t fscache_n_relinquishes_retire; + +extern atomic_t fscache_n_cookie_index; +extern atomic_t fscache_n_cookie_data; +extern atomic_t fscache_n_cookie_special; + +extern atomic_t fscache_n_object_alloc; +extern atomic_t fscache_n_object_no_alloc; +extern atomic_t fscache_n_object_lookups; +extern atomic_t fscache_n_object_lookups_negative; +extern atomic_t fscache_n_object_lookups_positive; +extern atomic_t fscache_n_object_lookups_timed_out; +extern atomic_t fscache_n_object_created; +extern atomic_t fscache_n_object_avail; +extern atomic_t fscache_n_object_dead; + +extern atomic_t fscache_n_checkaux_none; +extern atomic_t fscache_n_checkaux_okay; +extern atomic_t fscache_n_checkaux_update; +extern atomic_t fscache_n_checkaux_obsolete; + +extern atomic_t fscache_n_cop_alloc_object; +extern atomic_t fscache_n_cop_lookup_object; +extern atomic_t fscache_n_cop_lookup_complete; +extern atomic_t fscache_n_cop_grab_object; +extern atomic_t fscache_n_cop_update_object; +extern atomic_t fscache_n_cop_drop_object; +extern atomic_t fscache_n_cop_put_object; +extern atomic_t fscache_n_cop_sync_cache; +extern 
atomic_t fscache_n_cop_attr_changed; +extern atomic_t fscache_n_cop_read_or_alloc_page; +extern atomic_t fscache_n_cop_read_or_alloc_pages; +extern atomic_t fscache_n_cop_allocate_page; +extern atomic_t fscache_n_cop_allocate_pages; +extern atomic_t fscache_n_cop_write_page; +extern atomic_t fscache_n_cop_uncache_page; +extern atomic_t fscache_n_cop_dissociate_pages; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +static inline void fscache_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + +#define __fscache_stat(stat) (stat) + +extern const struct file_operations fscache_stats_fops; +#else + +#define __fscache_stat(stat) (NULL) +#define fscache_stat(stat) do {} while (0) +#define fscache_stat_d(stat) do {} while (0) +#endif + +/* + * raise an event on an object + * - if the event is not masked for that object, then the object is + * queued for attention by the thread pool. + */ +static inline void fscache_raise_event(struct fscache_object *object, + unsigned event) +{ + if (!test_and_set_bit(event, &object->events) && + test_bit(event, &object->event_mask)) + fscache_enqueue_object(object); +} + +/* + * drop a reference to a cookie + */ +static inline void fscache_cookie_put(struct fscache_cookie *cookie) +{ + BUG_ON(atomic_read(&cookie->usage) <= 0); + if (atomic_dec_and_test(&cookie->usage)) + __fscache_cookie_put(cookie); +} + +/* + * get an extra reference to a netfs retrieval context + */ +static inline +void *fscache_get_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->get_context) + cookie->def->get_context(cookie->netfs_data, context); + return context; +} + +/* + * release a reference to a netfs retrieval context + */ +static inline +void fscache_put_context(struct fscache_cookie *cookie, void *context) +{ + if (cookie->def->put_context) + cookie->def->put_context(cookie->netfs_data, context); +} + +/*****************************************************************************/ +/* + * debug tracing + */ +#define dbgprintk(FMT, ...) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + +#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) + +#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__) + +#ifdef __KDEBUG +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_FSCACHE_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (__do_kdebug(ENTER)) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (__do_kdebug(LEAVE)) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (__do_kdebug(DEBUG)) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) 
no_printk(FMT, ##__VA_ARGS__) +#endif + +/* + * determine whether a particular optional debugging point should be logged + * - we need to go through three steps to persuade cpp to correctly join the + * shorthand in FSCACHE_DEBUG_LEVEL with its prefix + */ +#define ____do_kdebug(LEVEL, POINT) \ + unlikely((fscache_debug & \ + (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))) +#define ___do_kdebug(LEVEL, POINT) \ + ____do_kdebug(LEVEL, POINT) +#define __do_kdebug(POINT) \ + ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT) + +#define FSCACHE_DEBUG_CACHE 0 +#define FSCACHE_DEBUG_COOKIE 1 +#define FSCACHE_DEBUG_PAGE 2 +#define FSCACHE_DEBUG_OPERATION 3 + +#define FSCACHE_POINT_ENTER 1 +#define FSCACHE_POINT_LEAVE 2 +#define FSCACHE_POINT_DEBUG 4 + +#ifndef FSCACHE_DEBUG_LEVEL +#define FSCACHE_DEBUG_LEVEL CACHE +#endif + +/* + * assertions + */ +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif /* assert or not */ diff --git a/fs/fscache/main.c b/fs/fscache/main.c new file mode 100644 index 00000000..f9d85677 --- /dev/null +++ b/fs/fscache/main.c @@ -0,0 +1,218 @@ +/* General filesystem local caching manager + * + * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
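/*
 * A worked example of the debug-point selection above: each source file
 * defines FSCACHE_DEBUG_LEVEL (CACHE, COOKIE, PAGE or OPERATION) and the
 * fscache_debug mask reserves three bits per level (ENTER=1, LEAVE=2,
 * DEBUG=4).  With FSCACHE_DEBUG_LEVEL COOKIE, _enter() tests the bit
 * FSCACHE_POINT_ENTER << (FSCACHE_DEBUG_COOKIE * 3) = 1 << 3 = 0x08, so on a
 * CONFIG_FSCACHE_DEBUG kernel writing 8 to the "debug" module parameter set
 * up in main.c below (/sys/module/fscache/parameters/debug) enables entry
 * tracing in the cookie-level files, and 0x38 (56) enables all three point
 * types at that level.
 *
 * The six bare includes that open main.c are likely linux/module.h,
 * linux/init.h, linux/sched.h, linux/completion.h, linux/slab.h and
 * linux/seq_file.h, as in the upstream tree.
 */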
+ */ + +#define FSCACHE_DEBUG_LEVEL CACHE +#include +#include +#include +#include +#include +#include +#include "internal.h" + +MODULE_DESCRIPTION("FS Cache Manager"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned fscache_defer_lookup = 1; +module_param_named(defer_lookup, fscache_defer_lookup, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_lookup, + "Defer cookie lookup to background thread"); + +unsigned fscache_defer_create = 1; +module_param_named(defer_create, fscache_defer_create, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_defer_create, + "Defer cookie creation to background thread"); + +unsigned fscache_debug; +module_param_named(debug, fscache_debug, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(fscache_debug, + "FS-Cache debugging mask"); + +struct kobject *fscache_root; +struct workqueue_struct *fscache_object_wq; +struct workqueue_struct *fscache_op_wq; + +DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +/* these values serve as lower bounds, will be adjusted in fscache_init() */ +static unsigned fscache_object_max_active = 4; +static unsigned fscache_op_max_active = 2; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fscache_sysctl_header; + +static int fscache_max_active_sysctl(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct workqueue_struct **wqp = table->extra1; + unsigned int *datap = table->data; + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0) + workqueue_set_max_active(*wqp, *datap); + return ret; +} + +ctl_table fscache_sysctls[] = { + { + .procname = "object_max_active", + .data = &fscache_object_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_object_wq, + }, + { + .procname = "operation_max_active", + .data = &fscache_op_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_op_wq, + }, + {} +}; + +ctl_table fscache_sysctls_root[] = { + { + .procname = "fscache", + .mode = 0555, + .child = fscache_sysctls, + }, + {} +}; +#endif + +/* + * initialise the fs caching module + */ +static int __init fscache_init(void) +{ + unsigned int nr_cpus = num_possible_cpus(); + unsigned int cpu; + int ret; + + fscache_object_max_active = + clamp_val(nr_cpus, + fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, + fscache_object_max_active); + if (!fscache_object_wq) + goto error_object_wq; + + fscache_op_max_active = + clamp_val(fscache_object_max_active / 2, + fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND, + fscache_op_max_active); + if (!fscache_op_wq) + goto error_op_wq; + + for_each_possible_cpu(cpu) + init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); + + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; + +#ifdef CONFIG_SYSCTL + ret = -ENOMEM; + fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); + if (!fscache_sysctl_header) + goto error_sysctl; +#endif + + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", + sizeof(struct fscache_cookie), + 0, + 0, + fscache_cookie_init_once); + if (!fscache_cookie_jar) { + printk(KERN_NOTICE + "FS-Cache: Failed to allocate a cookie jar\n"); + ret = -ENOMEM; + goto error_cookie_jar; + } + + fscache_root = 
kobject_create_and_add("fscache", kernel_kobj); + if (!fscache_root) + goto error_kobj; + + printk(KERN_NOTICE "FS-Cache: Loaded\n"); + return 0; + +error_kobj: + kmem_cache_destroy(fscache_cookie_jar); +error_cookie_jar: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +error_sysctl: +#endif + fscache_proc_cleanup(); +error_proc: + destroy_workqueue(fscache_op_wq); +error_op_wq: + destroy_workqueue(fscache_object_wq); +error_object_wq: + return ret; +} + +fs_initcall(fscache_init); + +/* + * clean up on module removal + */ +static void __exit fscache_exit(void) +{ + _enter(""); + + kobject_put(fscache_root); + kmem_cache_destroy(fscache_cookie_jar); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +#endif + fscache_proc_cleanup(); + destroy_workqueue(fscache_op_wq); + destroy_workqueue(fscache_object_wq); + printk(KERN_NOTICE "FS-Cache: Unloaded\n"); +} + +module_exit(fscache_exit); + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +int fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} +EXPORT_SYMBOL(fscache_wait_bit); + +/* + * wait_on_bit() sleep function for interruptible waiting + */ +int fscache_wait_bit_interruptible(void *flags) +{ + schedule(); + return signal_pending(current); +} +EXPORT_SYMBOL(fscache_wait_bit_interruptible); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c new file mode 100644 index 00000000..e028b8eb --- /dev/null +++ b/fs/fscache/netfs.c @@ -0,0 +1,103 @@ +/* FS-Cache netfs (client) registration + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
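/*
 * The registration interface implemented below is normally reached through
 * the fscache_register_netfs()/fscache_unregister_netfs() wrappers declared
 * in linux/fscache.h (not part of this hunk); the two bare includes at the
 * top of netfs.c are linux/module.h and linux/slab.h upstream.  A minimal
 * sketch of the netfs side follows, with an illustrative structure name,
 * netfs name and version: the name becomes the FSDEF index key and the
 * version its auxiliary data, as handled by fsdef.c earlier.
 */
static struct fscache_netfs example_cache_netfs = {
	.name		= "examplefs",
	.version	= 1,
};

static int __init example_fs_init(void)
{
	/* creates the netfs's primary index cookie under the FSDEF index */
	return fscache_register_netfs(&example_cache_netfs);
}

static void __exit example_fs_exit(void)
{
	/* all other cookies must have been relinquished before this */
	fscache_unregister_netfs(&example_cache_netfs);
}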
+ */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include "internal.h" + +static LIST_HEAD(fscache_netfs_list); + +/* + * register a network filesystem for caching + */ +int __fscache_register_netfs(struct fscache_netfs *netfs) +{ + struct fscache_netfs *ptr; + int ret; + + _enter("{%s}", netfs->name); + + INIT_LIST_HEAD(&netfs->link); + + /* allocate a cookie for the primary index */ + netfs->primary_index = + kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); + + if (!netfs->primary_index) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* initialise the primary index cookie */ + atomic_set(&netfs->primary_index->usage, 1); + atomic_set(&netfs->primary_index->n_children, 0); + + netfs->primary_index->def = &fscache_fsdef_netfs_def; + netfs->primary_index->parent = &fscache_fsdef_index; + netfs->primary_index->netfs_data = netfs; + + atomic_inc(&netfs->primary_index->parent->usage); + atomic_inc(&netfs->primary_index->parent->n_children); + + spin_lock_init(&netfs->primary_index->lock); + INIT_HLIST_HEAD(&netfs->primary_index->backing_objects); + + /* check the netfs type is not already present */ + down_write(&fscache_addremove_sem); + + ret = -EEXIST; + list_for_each_entry(ptr, &fscache_netfs_list, link) { + if (strcmp(ptr->name, netfs->name) == 0) + goto already_registered; + } + + list_add(&netfs->link, &fscache_netfs_list); + ret = 0; + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", + netfs->name); + +already_registered: + up_write(&fscache_addremove_sem); + + if (ret < 0) { + netfs->primary_index->parent = NULL; + __fscache_cookie_put(netfs->primary_index); + netfs->primary_index = NULL; + } + + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(__fscache_register_netfs); + +/* + * unregister a network filesystem from the cache + * - all cookies must have been released first + */ +void __fscache_unregister_netfs(struct fscache_netfs *netfs) +{ + _enter("{%s.%u}", netfs->name, netfs->version); + + down_write(&fscache_addremove_sem); + + list_del(&netfs->link); + fscache_relinquish_cookie(netfs->primary_index, 0); + + up_write(&fscache_addremove_sem); + + printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", + netfs->name); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_unregister_netfs); diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c new file mode 100644 index 00000000..ebe29c58 --- /dev/null +++ b/fs/fscache/object-list.c @@ -0,0 +1,432 @@ +/* Global fscache object list maintainer and viewer + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
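/*
 * The list maintained below keys each fscache_object by its address in an
 * rbtree and is read back through /proc/fs/fscache/objects.  The five bare
 * includes that open this file are linux/module.h, linux/seq_file.h,
 * linux/slab.h, linux/key.h and keys/user-type.h in the upstream tree; the
 * last two support the key-driven filtering implemented further down.
 */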
+ */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include +#include +#include +#include +#include "internal.h" + +static struct rb_root fscache_object_list; +static DEFINE_RWLOCK(fscache_object_list_lock); + +struct fscache_objlist_data { + unsigned long config; /* display configuration */ +#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */ +#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */ +#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */ +#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */ +#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */ +#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */ +#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */ +#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */ +#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */ +#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ +#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ +#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ + + u8 buf[512]; /* key and aux data buffer */ +}; + +/* + * Add an object to the object list + * - we use the address of the fscache_object structure as the key into the + * tree + */ +void fscache_objlist_add(struct fscache_object *obj) +{ + struct fscache_object *xobj; + struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; + + write_lock(&fscache_object_list_lock); + + while (*p) { + parent = *p; + xobj = rb_entry(parent, struct fscache_object, objlist_link); + + if (obj < xobj) + p = &(*p)->rb_left; + else if (obj > xobj) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&obj->objlist_link, parent, p); + rb_insert_color(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} + +/** + * fscache_object_destroy - Note that a cache object is about to be destroyed + * @object: The object to be destroyed + * + * Note the imminent destruction and deallocation of a cache object record. 
+ */ +void fscache_object_destroy(struct fscache_object *obj) +{ + write_lock(&fscache_object_list_lock); + + BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); + rb_erase(&obj->objlist_link, &fscache_object_list); + + write_unlock(&fscache_object_list_lock); +} +EXPORT_SYMBOL(fscache_object_destroy); + +/* + * find the object in the tree on or after the specified index + */ +static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) +{ + struct fscache_object *pobj, *obj = NULL, *minobj = NULL; + struct rb_node *p; + unsigned long pos; + + if (*_pos >= (unsigned long) ERR_PTR(-ENOENT)) + return NULL; + pos = *_pos; + + /* banners (can't represent line 0 by pos 0 as that would involve + * returning a NULL pointer) */ + if (pos == 0) + return (struct fscache_object *)(long)++(*_pos); + if (pos < 3) + return (struct fscache_object *)pos; + + pobj = (struct fscache_object *)pos; + p = fscache_object_list.rb_node; + while (p) { + obj = rb_entry(p, struct fscache_object, objlist_link); + if (pobj < obj) { + if (!minobj || minobj > obj) + minobj = obj; + p = p->rb_left; + } else if (pobj > obj) { + p = p->rb_right; + } else { + minobj = obj; + break; + } + obj = NULL; + } + + if (!minobj) + *_pos = (unsigned long) ERR_PTR(-ENOENT); + else if (minobj != obj) + *_pos = (unsigned long) minobj; + return minobj; +} + +/* + * set up the iterator to start reading from the first line + */ +static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos) + __acquires(&fscache_object_list_lock) +{ + read_lock(&fscache_object_list_lock); + return fscache_objlist_lookup(_pos); +} + +/* + * move to the next line + */ +static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos) +{ + (*_pos)++; + return fscache_objlist_lookup(_pos); +} + +/* + * clean up after reading + */ +static void fscache_objlist_stop(struct seq_file *m, void *v) + __releases(&fscache_object_list_lock) +{ + read_unlock(&fscache_object_list_lock); +} + +/* + * display an object + */ +static int fscache_objlist_show(struct seq_file *m, void *v) +{ + struct fscache_objlist_data *data = m->private; + struct fscache_object *obj = v; + unsigned long config = data->config; + uint16_t keylen, auxlen; + char _type[3], *type; + bool no_cookie; + u8 *buf = data->buf, *p; + + if ((unsigned long) v == 1) { + seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" + " EM EV F S" + " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " "); + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_puts(m, "OBJECT_KEY"); + if ((config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) == + (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, ", "); + if (config & FSCACHE_OBJLIST_CONFIG_AUX) + seq_puts(m, "AUX_DATA"); + seq_puts(m, "\n"); + return 0; + } + + if ((unsigned long) v == 2) { + seq_puts(m, "======== ======== ==== ===== === === === == =====" + " == == = =" + " | ================ == == ================"); + if (config & (FSCACHE_OBJLIST_CONFIG_KEY | + FSCACHE_OBJLIST_CONFIG_AUX)) + seq_puts(m, " ================"); + seq_puts(m, "\n"); + return 0; + } + + /* filter out any unwanted objects */ +#define FILTER(criterion, _yes, _no) \ + do { \ + unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \ + unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \ + if (criterion) { \ + if (!(config & yes)) \ + return 0; \ + } else { \ + if (!(config & no)) \ + return 0; \ + } \ + } while(0) + + if (~config) { + FILTER(obj->cookie, + 
COOKIE, NOCOOKIE); + FILTER(obj->state != FSCACHE_OBJECT_ACTIVE || + obj->n_ops != 0 || + obj->n_obj_ops != 0 || + obj->flags || + !list_empty(&obj->dependents), + BUSY, IDLE); + FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags), + PENDWR, NOPENDWR); + FILTER(atomic_read(&obj->n_reads), + READS, NOREADS); + FILTER(obj->events & obj->event_mask, + EVENTS, NOEVENTS); + FILTER(work_busy(&obj->work), WORK, NOWORK); + } + + seq_printf(m, + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", + obj->debug_id, + obj->parent ? obj->parent->debug_id : -1, + fscache_object_states_short[obj->state], + obj->n_children, + obj->n_ops, + obj->n_obj_ops, + obj->n_in_progress, + obj->n_exclusive, + atomic_read(&obj->n_reads), + obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, + obj->events, + obj->flags, + work_busy(&obj->work)); + + no_cookie = true; + keylen = auxlen = 0; + if (obj->cookie) { + spin_lock(&obj->lock); + if (obj->cookie) { + switch (obj->cookie->def->type) { + case 0: + type = "IX"; + break; + case 1: + type = "DT"; + break; + default: + sprintf(_type, "%02u", + obj->cookie->def->type); + type = _type; + break; + } + + seq_printf(m, "%-16s %s %2lx %16p", + obj->cookie->def->name, + type, + obj->cookie->flags, + obj->cookie->netfs_data); + + if (obj->cookie->def->get_key && + config & FSCACHE_OBJLIST_CONFIG_KEY) + keylen = obj->cookie->def->get_key( + obj->cookie->netfs_data, + buf, 400); + + if (obj->cookie->def->get_aux && + config & FSCACHE_OBJLIST_CONFIG_AUX) + auxlen = obj->cookie->def->get_aux( + obj->cookie->netfs_data, + buf + keylen, 512 - keylen); + + no_cookie = false; + } + spin_unlock(&obj->lock); + + if (!no_cookie && (keylen > 0 || auxlen > 0)) { + seq_printf(m, " "); + for (p = buf; keylen > 0; keylen--) + seq_printf(m, "%02x", *p++); + if (auxlen > 0) { + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + seq_printf(m, ", "); + for (; auxlen > 0; auxlen--) + seq_printf(m, "%02x", *p++); + } + } + } + + if (no_cookie) + seq_printf(m, "\n"); + else + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations fscache_objlist_ops = { + .start = fscache_objlist_start, + .stop = fscache_objlist_stop, + .next = fscache_objlist_next, + .show = fscache_objlist_show, +}; + +/* + * get the configuration for filtering the list + */ +static void fscache_objlist_config(struct fscache_objlist_data *data) +{ +#ifdef CONFIG_KEYS + struct user_key_payload *confkey; + unsigned long config; + struct key *key; + const char *buf; + int len; + + key = request_key(&key_type_user, "fscache:objlist", NULL); + if (IS_ERR(key)) + goto no_config; + + config = 0; + rcu_read_lock(); + + confkey = key->payload.data; + buf = confkey->data; + + for (len = confkey->datalen - 1; len >= 0; len--) { + switch (buf[len]) { + case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break; + case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break; + case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break; + case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break; + case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break; + case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break; + case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break; + case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break; + case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break; + case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break; + case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break; + case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break; + } + } + + rcu_read_unlock(); + key_put(key); + + if (!(config & 
(FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE))) + config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE))) + config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE; + if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR))) + config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR; + if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS))) + config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS))) + config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS; + if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK))) + config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK; + + data->config = config; + return; + +no_config: +#endif + data->config = ULONG_MAX; +} + +/* + * open "/proc/fs/fscache/objects" to provide a list of active objects + * - can be configured by a user-defined key added to the caller's keyrings + */ +static int fscache_objlist_open(struct inode *inode, struct file *file) +{ + struct fscache_objlist_data *data; + struct seq_file *m; + int ret; + + ret = seq_open(file, &fscache_objlist_ops); + if (ret < 0) + return ret; + + m = file->private_data; + + /* buffer for key extraction */ + data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL); + if (!data) { + seq_release(inode, file); + return -ENOMEM; + } + + /* get the configuration key */ + fscache_objlist_config(data); + + m->private = data; + return 0; +} + +/* + * clean up on close + */ +static int fscache_objlist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + kfree(m->private); + m->private = NULL; + return seq_release(inode, file); +} + +const struct file_operations fscache_objlist_fops = { + .owner = THIS_MODULE, + .open = fscache_objlist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = fscache_objlist_release, +}; diff --git a/fs/fscache/object.c b/fs/fscache/object.c new file mode 100644 index 00000000..b6b897c5 --- /dev/null +++ b/fs/fscache/object.c @@ -0,0 +1,892 @@ +/* FS-Cache object state machine handler + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * See Documentation/filesystems/caching/object.txt for a description of the + * object state machine and the in-kernel representations. 
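/*
 * Usage note for the filtering above: fscache_objlist_config() looks for a
 * user-type key described "fscache:objlist" in the reading process's
 * keyrings, and each character of its payload sets one CONFIG flag.  For
 * example, creating the key with payload "KB" before opening the proc file
 * (roughly: keyctl add user fscache:objlist "KB" @s) restricts the listing
 * to busy objects and appends each object's key bytes; with no such key
 * present nothing is filtered and both key and auxiliary data are dumped
 * for every object.  (The single bare include opening object.c below is
 * linux/module.h upstream.)
 */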
+ */ + +#define FSCACHE_DEBUG_LEVEL COOKIE +#include +#include "internal.h" + +const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { + [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", + [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", + [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", + [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", + [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", + [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", + [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", + [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", + [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT", + [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING", + [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING", + [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING", + [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD", +}; +EXPORT_SYMBOL(fscache_object_states); + +const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { + [FSCACHE_OBJECT_INIT] = "INIT", + [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", + [FSCACHE_OBJECT_CREATING] = "CRTN", + [FSCACHE_OBJECT_AVAILABLE] = "AVBL", + [FSCACHE_OBJECT_ACTIVE] = "ACTV", + [FSCACHE_OBJECT_UPDATING] = "UPDT", + [FSCACHE_OBJECT_DYING] = "DYNG", + [FSCACHE_OBJECT_LC_DYING] = "LCDY", + [FSCACHE_OBJECT_ABORT_INIT] = "ABTI", + [FSCACHE_OBJECT_RELEASING] = "RELS", + [FSCACHE_OBJECT_RECYCLING] = "RCYC", + [FSCACHE_OBJECT_WITHDRAWING] = "WTHD", + [FSCACHE_OBJECT_DEAD] = "DEAD", +}; + +static int fscache_get_object(struct fscache_object *); +static void fscache_put_object(struct fscache_object *); +static void fscache_initialise_object(struct fscache_object *); +static void fscache_lookup_object(struct fscache_object *); +static void fscache_object_available(struct fscache_object *); +static void fscache_release_object(struct fscache_object *); +static void fscache_withdraw_object(struct fscache_object *); +static void fscache_enqueue_dependents(struct fscache_object *); +static void fscache_dequeue_object(struct fscache_object *); + +/* + * we need to notify the parent when an op completes that we had outstanding + * upon it + */ +static inline void fscache_done_parent_op(struct fscache_object *object) +{ + struct fscache_object *parent = object->parent; + + _enter("OBJ%x {OBJ%x,%x}", + object->debug_id, parent->debug_id, parent->n_ops); + + spin_lock_nested(&parent->lock, 1); + parent->n_ops--; + parent->n_obj_ops--; + if (parent->n_ops == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); +} + +/* + * process events that have been sent to an object's state machine + * - initiates parent lookup + * - does object lookup + * - does object creation + * - does object recycling and retirement + * - does object withdrawal + */ +static void fscache_object_state_machine(struct fscache_object *object) +{ + enum fscache_object_state new_state; + struct fscache_cookie *cookie; + + ASSERT(object != NULL); + + _enter("{OBJ%x,%s,%lx}", + object->debug_id, fscache_object_states[object->state], + object->events); + + switch (object->state) { + /* wait for the parent object to become ready */ + case FSCACHE_OBJECT_INIT: + object->event_mask = + ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); + fscache_initialise_object(object); + goto done; + + /* look up the object metadata on disk */ + case FSCACHE_OBJECT_LOOKING_UP: + fscache_lookup_object(object); + goto lookup_transit; + + /* create the object metadata on disk */ + case FSCACHE_OBJECT_CREATING: + fscache_lookup_object(object); + goto lookup_transit; + + /* handle an object becoming available; start pending + * operations and queue 
dependent operations for processing */ + case FSCACHE_OBJECT_AVAILABLE: + fscache_object_available(object); + goto active_transit; + + /* normal running state */ + case FSCACHE_OBJECT_ACTIVE: + goto active_transit; + + /* update the object metadata on disk */ + case FSCACHE_OBJECT_UPDATING: + clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); + fscache_stat(&fscache_n_updates_run); + fscache_stat(&fscache_n_cop_update_object); + object->cache->ops->update_object(object); + fscache_stat_d(&fscache_n_cop_update_object); + goto active_transit; + + /* handle an object dying during lookup or creation */ + case FSCACHE_OBJECT_LC_DYING: + object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + fscache_stat(&fscache_n_cop_lookup_complete); + object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); + + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DYING; + cookie = object->cookie; + if (cookie) { + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, + &cookie->flags)) + wake_up_bit(&cookie->flags, + FSCACHE_COOKIE_LOOKING_UP); + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &cookie->flags)) + wake_up_bit(&cookie->flags, + FSCACHE_COOKIE_CREATING); + } + spin_unlock(&object->lock); + + fscache_done_parent_op(object); + + /* wait for completion of all active operations on this object + * and the death of all child objects of this object */ + case FSCACHE_OBJECT_DYING: + dying: + clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events); + spin_lock(&object->lock); + _debug("dying OBJ%x {%d,%d}", + object->debug_id, object->n_ops, object->n_children); + if (object->n_ops == 0 && object->n_children == 0) { + object->event_mask &= + ~(1 << FSCACHE_OBJECT_EV_CLEARED); + object->event_mask |= + (1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR); + } else { + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + object->event_mask |= + 1 << FSCACHE_OBJECT_EV_CLEARED; + } + spin_unlock(&object->lock); + fscache_enqueue_dependents(object); + fscache_start_operations(object); + goto terminal_transit; + + /* handle an abort during initialisation */ + case FSCACHE_OBJECT_ABORT_INIT: + _debug("handle abort init %lx", object->events); + object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); + + spin_lock(&object->lock); + fscache_dequeue_object(object); + + object->state = FSCACHE_OBJECT_DYING; + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, + &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, + FSCACHE_COOKIE_CREATING); + spin_unlock(&object->lock); + goto dying; + + /* handle the netfs releasing an object and possibly marking it + * obsolete too */ + case FSCACHE_OBJECT_RELEASING: + case FSCACHE_OBJECT_RECYCLING: + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_ERROR)); + fscache_release_object(object); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DEAD; + spin_unlock(&object->lock); + fscache_stat(&fscache_n_object_dead); + goto terminal_transit; + + /* handle the parent cache of this object being withdrawn from + * active service */ + case FSCACHE_OBJECT_WITHDRAWING: + object->event_mask &= + ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << 
FSCACHE_OBJECT_EV_ERROR)); + fscache_withdraw_object(object); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_DEAD; + spin_unlock(&object->lock); + fscache_stat(&fscache_n_object_dead); + goto terminal_transit; + + /* complain about the object being woken up once it is + * deceased */ + case FSCACHE_OBJECT_DEAD: + printk(KERN_ERR "FS-Cache:" + " Unexpected event in dead state %lx\n", + object->events & object->event_mask); + BUG(); + + default: + printk(KERN_ERR "FS-Cache: Unknown object state %u\n", + object->state); + BUG(); + } + + /* determine the transition from a lookup state */ +lookup_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + case FSCACHE_OBJECT_EV_RETIRE: + case FSCACHE_OBJECT_EV_RELEASE: + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_LC_DYING; + goto change_state; + case FSCACHE_OBJECT_EV_REQUEUE: + goto done; + case -1: + goto done; /* sleep until event */ + default: + goto unsupported_event; + } + + /* determine the transition from an active state */ +active_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + case FSCACHE_OBJECT_EV_RETIRE: + case FSCACHE_OBJECT_EV_RELEASE: + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_DYING; + goto change_state; + case FSCACHE_OBJECT_EV_UPDATE: + new_state = FSCACHE_OBJECT_UPDATING; + goto change_state; + case -1: + new_state = FSCACHE_OBJECT_ACTIVE; + goto change_state; /* sleep until event */ + default: + goto unsupported_event; + } + + /* determine the transition from a terminal state */ +terminal_transit: + switch (fls(object->events & object->event_mask) - 1) { + case FSCACHE_OBJECT_EV_WITHDRAW: + new_state = FSCACHE_OBJECT_WITHDRAWING; + goto change_state; + case FSCACHE_OBJECT_EV_RETIRE: + new_state = FSCACHE_OBJECT_RECYCLING; + goto change_state; + case FSCACHE_OBJECT_EV_RELEASE: + new_state = FSCACHE_OBJECT_RELEASING; + goto change_state; + case FSCACHE_OBJECT_EV_ERROR: + new_state = FSCACHE_OBJECT_WITHDRAWING; + goto change_state; + case FSCACHE_OBJECT_EV_CLEARED: + new_state = FSCACHE_OBJECT_DYING; + goto change_state; + case -1: + goto done; /* sleep until event */ + default: + goto unsupported_event; + } + +change_state: + spin_lock(&object->lock); + object->state = new_state; + spin_unlock(&object->lock); + +done: + _leave(" [->%s]", fscache_object_states[object->state]); + return; + +unsupported_event: + printk(KERN_ERR "FS-Cache:" + " Unsupported event %lx [mask %lx] in state %s\n", + object->events, object->event_mask, + fscache_object_states[object->state]); + BUG(); +} + +/* + * execute an object + */ +void fscache_object_work_func(struct work_struct *work) +{ + struct fscache_object *object = + container_of(work, struct fscache_object, work); + unsigned long start; + + _enter("{OBJ%x}", object->debug_id); + + start = jiffies; + fscache_object_state_machine(object); + fscache_hist(fscache_objs_histogram, start); + if (object->events & object->event_mask) + fscache_enqueue_object(object); + clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + fscache_put_object(object); +} +EXPORT_SYMBOL(fscache_object_work_func); + +/* + * initialise an object + * - check the specified object's parent to see if we can make use of it + * immediately to do a creation + * - we may need to start the process of creating a parent and we need to wait + * for the parent's lookup and creation to complete if it's not there yet + * - an object's cookie is pinned until we clear 
FSCACHE_COOKIE_CREATING on the + * leaf-most cookies of the object and all its children + */ +static void fscache_initialise_object(struct fscache_object *object) +{ + struct fscache_object *parent; + + _enter(""); + ASSERT(object->cookie != NULL); + ASSERT(object->cookie->parent != NULL); + + if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_RELEASE) | + (1 << FSCACHE_OBJECT_EV_RETIRE) | + (1 << FSCACHE_OBJECT_EV_WITHDRAW))) { + _debug("abort init %lx", object->events); + spin_lock(&object->lock); + object->state = FSCACHE_OBJECT_ABORT_INIT; + spin_unlock(&object->lock); + return; + } + + spin_lock(&object->cookie->lock); + spin_lock_nested(&object->cookie->parent->lock, 1); + + parent = object->parent; + if (!parent) { + _debug("no parent"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + } else { + spin_lock(&object->lock); + spin_lock_nested(&parent->lock, 1); + _debug("parent %s", fscache_object_states[parent->state]); + + if (parent->state >= FSCACHE_OBJECT_DYING) { + _debug("bad parent"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) { + _debug("wait"); + + /* we may get woken up in this state by child objects + * binding on to us, so we need to make sure we don't + * add ourself to the list multiple times */ + if (list_empty(&object->dep_link)) { + fscache_stat(&fscache_n_cop_grab_object); + object->cache->ops->grab_object(object); + fscache_stat_d(&fscache_n_cop_grab_object); + list_add(&object->dep_link, + &parent->dependents); + + /* fscache_acquire_non_index_cookie() uses this + * to wake the chain up */ + if (parent->state == FSCACHE_OBJECT_INIT) + fscache_enqueue_object(parent); + } + } else { + _debug("go"); + parent->n_ops++; + parent->n_obj_ops++; + object->lookup_jif = jiffies; + object->state = FSCACHE_OBJECT_LOOKING_UP; + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } + + spin_unlock(&parent->lock); + spin_unlock(&object->lock); + } + + spin_unlock(&object->cookie->parent->lock); + spin_unlock(&object->cookie->lock); + _leave(""); +} + +/* + * look an object up in the cache from which it was allocated + * - we hold an "access lock" on the parent object, so the parent object cannot + * be withdrawn by either party till we've finished + * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the + * leaf-most cookies of the object and all its children + */ +static void fscache_lookup_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + struct fscache_object *parent; + int ret; + + _enter(""); + + parent = object->parent; + ASSERT(parent != NULL); + ASSERTCMP(parent->n_ops, >, 0); + ASSERTCMP(parent->n_obj_ops, >, 0); + + /* make sure the parent is still available */ + ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE); + + if (parent->state >= FSCACHE_OBJECT_DYING || + test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + _debug("unavailable"); + set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); + _leave(""); + return; + } + + _debug("LOOKUP \"%s/%s\" in \"%s\"", + parent->cookie->def->name, cookie->def->name, + object->cache->tag->name); + + fscache_stat(&fscache_n_object_lookups); + fscache_stat(&fscache_n_cop_lookup_object); + ret = object->cache->ops->lookup_object(object); + fscache_stat_d(&fscache_n_cop_lookup_object); + + if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + + if (ret == -ETIMEDOUT) { + /* probably stuck behind 
another object, so move this one to + * the back of the queue */ + fscache_stat(&fscache_n_object_lookups_timed_out); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } + + _leave(""); +} + +/** + * fscache_object_lookup_negative - Note negative cookie lookup + * @object: Object pointing to cookie to mark + * + * Note negative lookup, permitting those waiting to read data from an already + * existing backing object to continue as there's no data for them to read. + */ +void fscache_object_lookup_negative(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", + object->debug_id, fscache_object_states[object->state]); + + spin_lock(&object->lock); + if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + fscache_stat(&fscache_n_object_lookups_negative); + + /* transit here to allow write requests to begin stacking up + * and read requests to begin returning ENODATA */ + object->state = FSCACHE_OBJECT_CREATING; + spin_unlock(&object->lock); + + set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags); + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + _debug("wake up lookup %p", &cookie->flags); + smp_mb__before_clear_bit(); + clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } else { + ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); + spin_unlock(&object->lock); + } + + _leave(""); +} +EXPORT_SYMBOL(fscache_object_lookup_negative); + +/** + * fscache_obtained_object - Note successful object lookup or creation + * @object: Object pointing to cookie to mark + * + * Note successful lookup and/or creation, permitting those waiting to write + * data to a backing object to continue. + * + * Note that after calling this, an object's cookie may be relinquished by the + * netfs, and so must be accessed with object lock held. 
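/*
 * fscache_object_lookup_negative() and fscache_obtained_object() are the
 * two calls by which a cache backend's ->lookup_object() operation reports
 * back into this state machine.  A rough sketch of such a backend routine
 * (the helpers are hypothetical; cachefiles follows the same pattern):
 */
static int example_lookup_object(struct fscache_object *object)
{
	if (!example_backing_file_present(object)) {
		/* nothing on disk yet: readers start seeing ENODATA while
		 * the backing object is created */
		fscache_object_lookup_negative(object);
		example_create_backing_file(object);
	}

	/* lookup or creation complete: pending writes may now proceed */
	fscache_obtained_object(object);
	return 0;
}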
+ */ +void fscache_obtained_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + + _enter("{OBJ%x,%s}", + object->debug_id, fscache_object_states[object->state]); + + /* if we were still looking up, then we must have a positive lookup + * result, in which case there may be data available */ + spin_lock(&object->lock); + if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + fscache_stat(&fscache_n_object_lookups_positive); + + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + object->state = FSCACHE_OBJECT_AVAILABLE; + spin_unlock(&object->lock); + + smp_mb__before_clear_bit(); + clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + } else { + ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); + fscache_stat(&fscache_n_object_created); + + object->state = FSCACHE_OBJECT_AVAILABLE; + spin_unlock(&object->lock); + set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + smp_wmb(); + } + + if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING); + + _leave(""); +} +EXPORT_SYMBOL(fscache_obtained_object); + +/* + * handle an object that has just become available + */ +static void fscache_object_available(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + spin_lock(&object->lock); + + if (object->cookie && + test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) + wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); + + fscache_done_parent_op(object); + if (object->n_in_progress == 0) { + if (object->n_ops > 0) { + ASSERTCMP(object->n_ops, >=, object->n_obj_ops); + ASSERTIF(object->n_ops > object->n_obj_ops, + !list_empty(&object->pending_ops)); + fscache_start_operations(object); + } else { + ASSERT(list_empty(&object->pending_ops)); + } + } + spin_unlock(&object->lock); + + fscache_stat(&fscache_n_cop_lookup_complete); + object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); + fscache_enqueue_dependents(object); + + fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); + fscache_stat(&fscache_n_object_avail); + + _leave(""); +} + +/* + * drop an object's attachments + */ +static void fscache_drop_object(struct fscache_object *object) +{ + struct fscache_object *parent = object->parent; + struct fscache_cache *cache = object->cache; + + _enter("{OBJ%x,%d}", object->debug_id, object->n_children); + + ASSERTCMP(object->cookie, ==, NULL); + ASSERT(hlist_unhashed(&object->cookie_link)); + + spin_lock(&cache->object_list_lock); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); + + fscache_stat(&fscache_n_cop_drop_object); + cache->ops->drop_object(object); + fscache_stat_d(&fscache_n_cop_drop_object); + + if (parent) { + _debug("release parent OBJ%x {%d}", + parent->debug_id, parent->n_children); + + spin_lock(&parent->lock); + parent->n_children--; + if (parent->n_children == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); + object->parent = NULL; + } + + /* this just shifts the object release to the work processor */ + fscache_put_object(object); + + _leave(""); +} + +/* + * release or recycle an object that the netfs has discarded + */ +static void fscache_release_object(struct fscache_object *object) +{ + _enter(""); + + fscache_drop_object(object); +} + +/* + * withdraw 
an object from active service + */ +static void fscache_withdraw_object(struct fscache_object *object) +{ + struct fscache_cookie *cookie; + bool detached; + + _enter(""); + + spin_lock(&object->lock); + cookie = object->cookie; + if (cookie) { + /* need to get the cookie lock before the object lock, starting + * from the object pointer */ + atomic_inc(&cookie->usage); + spin_unlock(&object->lock); + + detached = false; + spin_lock(&cookie->lock); + spin_lock(&object->lock); + + if (object->cookie == cookie) { + hlist_del_init(&object->cookie_link); + object->cookie = NULL; + detached = true; + } + spin_unlock(&cookie->lock); + fscache_cookie_put(cookie); + if (detached) + fscache_cookie_put(cookie); + } + + spin_unlock(&object->lock); + + fscache_drop_object(object); +} + +/* + * withdraw an object from active service at the behest of the cache + * - need break the links to a cached object cookie + * - called under two situations: + * (1) recycler decides to reclaim an in-use object + * (2) a cache is unmounted + * - have to take care as the cookie can be being relinquished by the netfs + * simultaneously + * - the object is pinned by the caller holding a refcount on it + */ +void fscache_withdrawing_object(struct fscache_cache *cache, + struct fscache_object *object) +{ + bool enqueue = false; + + _enter(",OBJ%x", object->debug_id); + + spin_lock(&object->lock); + if (object->state < FSCACHE_OBJECT_WITHDRAWING) { + object->state = FSCACHE_OBJECT_WITHDRAWING; + enqueue = true; + } + spin_unlock(&object->lock); + + if (enqueue) + fscache_enqueue_object(object); + + _leave(""); +} + +/* + * get a ref on an object + */ +static int fscache_get_object(struct fscache_object *object) +{ + int ret; + + fscache_stat(&fscache_n_cop_grab_object); + ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN; + fscache_stat_d(&fscache_n_cop_grab_object); + return ret; +} + +/* + * discard a ref on a work item + */ +static void fscache_put_object(struct fscache_object *object) +{ + fscache_stat(&fscache_n_cop_put_object); + object->cache->ops->put_object(object); + fscache_stat_d(&fscache_n_cop_put_object); +} + +/* + * enqueue an object for metadata-type processing + */ +void fscache_enqueue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + if (fscache_get_object(object) >= 0) { + wait_queue_head_t *cong_wq = + &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) + wake_up(cong_wq); + } else + fscache_put_object(object); + + put_cpu_var(fscache_object_cong_wait); + } +} + +/** + * fscache_object_sleep_till_congested - Sleep until object wq is congested + * @timoutp: Scheduler sleep timeout + * + * Allow an object handler to sleep until the object workqueue is congested. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * %true is returned if the object wq is congested, %false otherwise. 
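/*
 * A schematic caller of the helper below, following the contract stated
 * above (the wait queue and condition are hypothetical; the caller arms its
 * own wake-up event and sleep state before sleeping here):
 *
 *	signed long timeout = 2 * HZ;
 *	DEFINE_WAIT(my_wait);
 *
 *	add_wait_queue(&example_waitq, &my_wait);
 *	set_current_state(TASK_UNINTERRUPTIBLE);
 *	if (!example_condition_met())
 *		fscache_object_sleep_till_congested(&timeout);
 *	finish_wait(&example_waitq, &my_wait);
 */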
+ */ +bool fscache_object_sleep_till_congested(signed long *timeoutp) +{ + wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + + add_wait_queue_exclusive(cong_wq, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); + finish_wait(cong_wq, &wait); + + return fscache_object_congested(); +} +EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); + +/* + * enqueue the dependents of an object for metadata-type processing + * - the caller must hold the object's lock + * - this may cause an already locked object to wind up being processed again + */ +static void fscache_enqueue_dependents(struct fscache_object *object) +{ + struct fscache_object *dep; + + _enter("{OBJ%x}", object->debug_id); + + if (list_empty(&object->dependents)) + return; + + spin_lock(&object->lock); + + while (!list_empty(&object->dependents)) { + dep = list_entry(object->dependents.next, + struct fscache_object, dep_link); + list_del_init(&dep->dep_link); + + + /* sort onto appropriate lists */ + fscache_enqueue_object(dep); + fscache_put_object(dep); + + if (!list_empty(&object->dependents)) + cond_resched_lock(&object->lock); + } + + spin_unlock(&object->lock); +} + +/* + * remove an object from whatever queue it's waiting on + * - the caller must hold object->lock + */ +void fscache_dequeue_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + if (!list_empty(&object->dep_link)) { + spin_lock(&object->parent->lock); + list_del_init(&object->dep_link); + spin_unlock(&object->parent->lock); + } + + _leave(""); +} + +/** + * fscache_check_aux - Ask the netfs whether an object on disk is still valid + * @object: The object to ask about + * @data: The auxiliary data for the object + * @datalen: The size of the auxiliary data + * + * This function consults the netfs about the coherency state of an object + */ +enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, uint16_t datalen) +{ + enum fscache_checkaux result; + + if (!object->cookie->def->check_aux) { + fscache_stat(&fscache_n_checkaux_none); + return FSCACHE_CHECKAUX_OKAY; + } + + result = object->cookie->def->check_aux(object->cookie->netfs_data, + data, datalen); + switch (result) { + /* entry okay as is */ + case FSCACHE_CHECKAUX_OKAY: + fscache_stat(&fscache_n_checkaux_okay); + break; + + /* entry requires update */ + case FSCACHE_CHECKAUX_NEEDS_UPDATE: + fscache_stat(&fscache_n_checkaux_update); + break; + + /* entry requires deletion */ + case FSCACHE_CHECKAUX_OBSOLETE: + fscache_stat(&fscache_n_checkaux_obsolete); + break; + + default: + BUG(); + } + + return result; +} +EXPORT_SYMBOL(fscache_check_aux); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c new file mode 100644 index 00000000..30afdfa7 --- /dev/null +++ b/fs/fscache/operation.c @@ -0,0 +1,463 @@ +/* FS-Cache worker operation management routines + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
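/*
 * The three bare includes that open operation.c below are likely
 * linux/module.h, linux/seq_file.h and linux/slab.h, matching the upstream
 * tree.
 */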
+ * + * See Documentation/filesystems/caching/operations.txt + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include +#include +#include +#include "internal.h" + +atomic_t fscache_op_debug_id; +EXPORT_SYMBOL(fscache_op_debug_id); + +/** + * fscache_enqueue_operation - Enqueue an operation for processing + * @op: The operation to enqueue + * + * Enqueue an operation for processing by the FS-Cache thread pool. + * + * This will get its own ref on the object. + */ +void fscache_enqueue_operation(struct fscache_operation *op) +{ + _enter("{OBJ%x OP%x,%u}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(list_empty(&op->pend_link)); + ASSERT(op->processor != NULL); + ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); + ASSERTCMP(atomic_read(&op->usage), >, 0); + + fscache_stat(&fscache_n_op_enqueue); + switch (op->flags & FSCACHE_OP_TYPE) { + case FSCACHE_OP_ASYNC: + _debug("queue async"); + atomic_inc(&op->usage); + if (!queue_work(fscache_op_wq, &op->work)) + fscache_put_operation(op); + break; + case FSCACHE_OP_MYTHREAD: + _debug("queue for caller's attention"); + break; + default: + printk(KERN_ERR "FS-Cache: Unexpected op type %lx", + op->flags); + BUG(); + break; + } +} +EXPORT_SYMBOL(fscache_enqueue_operation); + +/* + * start an op running + */ +static void fscache_run_op(struct fscache_object *object, + struct fscache_operation *op) +{ + object->n_in_progress++; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + if (op->processor) + fscache_enqueue_operation(op); + fscache_stat(&fscache_n_op_run); +} + +/* + * submit an exclusive operation for an object + * - other ops are excluded from running simultaneously with this one + * - this gets any extra refs it needs on an op + */ +int fscache_submit_exclusive_op(struct fscache_object *object, + struct fscache_operation *op) +{ + int ret; + + _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); + + ret = -ENOBUFS; + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + + if (object->n_ops > 1) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_in_progress, ==, 0); + fscache_run_op(object, op); + } + + /* need to issue a new write op after this */ + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + ret = 0; + } else if (object->state == FSCACHE_OBJECT_CREATING) { + op->object = object; + object->n_ops++; + object->n_exclusive++; /* reads and writes must wait */ + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else { + /* not allowed to submit ops in any other state */ + BUG(); + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + unsigned long ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) 
+ return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, + fscache_object_states[object->state]); + kdebug("objstate=%s [%s]", + fscache_object_states[object->state], + fscache_object_states[ostate]); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", op->processor, op->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + +/* + * submit an operation for an object + * - objects may be submitted only in the following states: + * - during object creation (write ops may be submitted) + * - whilst the object is active + * - after an I/O error incurred in one of the two above states (op rejected) + * - this gets any extra refs it needs on an op + */ +int fscache_submit_op(struct fscache_object *object, + struct fscache_operation *op) +{ + unsigned long ostate; + int ret; + + _enter("{OBJ%x OP%x},{%u}", + object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(atomic_read(&op->usage), >, 0); + + spin_lock(&object->lock); + ASSERTCMP(object->n_ops, >=, object->n_in_progress); + ASSERTCMP(object->n_ops, >=, object->n_exclusive); + ASSERT(list_empty(&op->pend_link)); + + ostate = object->state; + smp_rmb(); + + if (fscache_object_is_active(object)) { + op->object = object; + object->n_ops++; + + if (object->n_exclusive > 0) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + } else if (!list_empty(&object->pending_ops)) { + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + fscache_start_operations(object); + } else { + ASSERTCMP(object->n_exclusive, ==, 0); + fscache_run_op(object, op); + } + ret = 0; + } else if (object->state == FSCACHE_OBJECT_CREATING) { + op->object = object; + object->n_ops++; + atomic_inc(&op->usage); + list_add_tail(&op->pend_link, &object->pending_ops); + fscache_stat(&fscache_n_op_pend); + ret = 0; + } else if (object->state == FSCACHE_OBJECT_DYING || + object->state == FSCACHE_OBJECT_LC_DYING || + object->state == FSCACHE_OBJECT_WITHDRAWING) { + fscache_stat(&fscache_n_op_rejected); + ret = -ENOBUFS; + } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + fscache_report_unexpected_submission(object, op, ostate); + ASSERT(!fscache_object_is_active(object)); + ret = -ENOBUFS; + } else { + ret = -ENOBUFS; + } + + spin_unlock(&object->lock); + return ret; +} + +/* + * queue an object for withdrawal on error, aborting all following asynchronous + * operations + */ +void fscache_abort_object(struct fscache_object *object) +{ + _enter("{OBJ%x}", object->debug_id); + + fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); +} + +/* + * jump start the operation processing on an object + * - caller must hold object->lock + */ +void fscache_start_operations(struct fscache_object *object) +{ + struct fscache_operation *op; + bool stop = false; + + while (!list_empty(&object->pending_ops) && !stop) { + op = list_entry(object->pending_ops.next, + struct fscache_operation, pend_link); + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + if (object->n_in_progress > 0) + break; + stop = true; + } + list_del_init(&op->pend_link); + 
fscache_run_op(object, op); + + /* the pending queue was holding a ref on the object */ + fscache_put_operation(op); + } + + ASSERTCMP(object->n_in_progress, <=, object->n_ops); + + _debug("woke %d ops on OBJ%x", + object->n_in_progress, object->debug_id); +} + +/* + * cancel an operation that's pending on an object + */ +int fscache_cancel_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + int ret; + + _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); + + spin_lock(&object->lock); + + ret = -EBUSY; + if (!list_empty(&op->pend_link)) { + fscache_stat(&fscache_n_op_cancelled); + list_del_init(&op->pend_link); + object->n_ops--; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + fscache_put_operation(op); + ret = 0; + } + + spin_unlock(&object->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * release an operation + * - queues pending ops if this is the last in-progress op + */ +void fscache_put_operation(struct fscache_operation *op) +{ + struct fscache_object *object; + struct fscache_cache *cache; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERTCMP(atomic_read(&op->usage), >, 0); + + if (!atomic_dec_and_test(&op->usage)) + return; + + _debug("PUT OP"); + if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) + BUG(); + + fscache_stat(&fscache_n_op_release); + + if (op->release) { + op->release(op); + op->release = NULL; + } + + object = op->object; + + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } + + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + ASSERTCMP(object->n_exclusive, >, 0); + object->n_exclusive--; + } + + ASSERTCMP(object->n_in_progress, >, 0); + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + + kfree(op); + _leave(" [done]"); +} +EXPORT_SYMBOL(fscache_put_operation); + +/* + * garbage collect operations that have had their release deferred + */ +void fscache_operation_gc(struct work_struct *work) +{ + struct fscache_operation *op; + struct fscache_object *object; + struct fscache_cache *cache = + container_of(work, struct fscache_cache, op_gc); + int count = 0; + + _enter(""); + + do { + spin_lock(&cache->op_gc_list_lock); + if (list_empty(&cache->op_gc_list)) { + spin_unlock(&cache->op_gc_list_lock); + break; + } + + op = list_entry(cache->op_gc_list.next, + struct fscache_operation, pend_link); + list_del(&op->pend_link); + spin_unlock(&cache->op_gc_list_lock); + + object = op->object; + + _debug("GC DEFERRED REL OBJ%x OP%x", + object->debug_id, op->debug_id); + fscache_stat(&fscache_n_op_gc); + + ASSERTCMP(atomic_read(&op->usage), ==, 0); + + spin_lock(&object->lock); + if 
(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) { + ASSERTCMP(object->n_exclusive, >, 0); + object->n_exclusive--; + } + + ASSERTCMP(object->n_in_progress, >, 0); + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); + + spin_unlock(&object->lock); + + } while (count++ < 20); + + if (!list_empty(&cache->op_gc_list)) + schedule_work(&cache->op_gc); + + _leave(""); +} + +/* + * execute an operation using fs_op_wq to provide processing context - + * the caller holds a ref to this object, so we don't need to hold one + */ +void fscache_op_work_func(struct work_struct *work) +{ + struct fscache_operation *op = + container_of(work, struct fscache_operation, work); + unsigned long start; + + _enter("{OBJ%x OP%x,%d}", + op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + + ASSERT(op->processor != NULL); + start = jiffies; + op->processor(op); + fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(op); + + _leave(""); +} diff --git a/fs/fscache/page.c b/fs/fscache/page.c new file mode 100644 index 00000000..3f7a59bf --- /dev/null +++ b/fs/fscache/page.c @@ -0,0 +1,996 @@ +/* Cache page management and data I/O routines + * + * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL PAGE +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * check to see if a page is being written to the cache + */ +bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page) +{ + void *val; + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + rcu_read_unlock(); + + return val != NULL; +} +EXPORT_SYMBOL(__fscache_check_page_write); + +/* + * wait for a page to finish being written to the cache + */ +void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + wait_event(*wq, !__fscache_check_page_write(cookie, page)); +} +EXPORT_SYMBOL(__fscache_wait_on_page_write); + +/* + * decide whether a page can be released, possibly by cancelling a store to it + * - we're allowed to sleep if __GFP_WAIT is flagged + */ +bool __fscache_maybe_release_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct page *xpage; + void *val; + + _enter("%p,%p,%x", cookie, page, gfp); + + rcu_read_lock(); + val = radix_tree_lookup(&cookie->stores, page->index); + if (!val) { + rcu_read_unlock(); + fscache_stat(&fscache_n_store_vmscan_not_storing); + __fscache_uncache_page(cookie, page); + return true; + } + + /* see if the page is actually undergoing storage - if so we can't get + * rid of it till the cache has finished with it */ + if (radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG)) { + rcu_read_unlock(); + goto page_busy; + } + + /* the page is pending storage, so we attempt to cancel the store and + * discard the store request so that the page can be reclaimed */ + spin_lock(&cookie->stores_lock); + rcu_read_unlock(); + + if (radix_tree_tag_get(&cookie->stores, page->index, 
+ FSCACHE_COOKIE_STORING_TAG)) { + /* the page started to undergo storage whilst we were looking, + * so now we can only wait or return */ + spin_unlock(&cookie->stores_lock); + goto page_busy; + } + + xpage = radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + + if (xpage) { + fscache_stat(&fscache_n_store_vmscan_cancelled); + fscache_stat(&fscache_n_store_radix_deletes); + ASSERTCMP(xpage, ==, page); + } else { + fscache_stat(&fscache_n_store_vmscan_gone); + } + + wake_up_bit(&cookie->flags, 0); + if (xpage) + page_cache_release(xpage); + __fscache_uncache_page(cookie, page); + return true; + +page_busy: + /* we might want to wait here, but that could deadlock the allocator as + * the work threads writing to the cache may all end up sleeping + * on memory allocation */ + fscache_stat(&fscache_n_store_vmscan_busy); + return false; +} +EXPORT_SYMBOL(__fscache_maybe_release_page); + +/* + * note that a page has finished being written to the cache + */ +static void fscache_end_page_write(struct fscache_object *object, + struct page *page) +{ + struct fscache_cookie *cookie; + struct page *xpage = NULL; + + spin_lock(&object->lock); + cookie = object->cookie; + if (cookie) { + /* delete the page from the tree if it is now no longer + * pending */ + spin_lock(&cookie->stores_lock); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + if (!radix_tree_tag_get(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG)) { + fscache_stat(&fscache_n_store_radix_deletes); + xpage = radix_tree_delete(&cookie->stores, page->index); + } + spin_unlock(&cookie->stores_lock); + wake_up_bit(&cookie->flags, 0); + } + spin_unlock(&object->lock); + if (xpage) + page_cache_release(xpage); +} + +/* + * actually apply the changed attributes to a cache object + */ +static void fscache_attr_changed_op(struct fscache_operation *op) +{ + struct fscache_object *object = op->object; + int ret; + + _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); + + fscache_stat(&fscache_n_attr_changed_calls); + + if (fscache_object_is_active(object)) { + fscache_stat(&fscache_n_cop_attr_changed); + ret = object->cache->ops->attr_changed(object); + fscache_stat_d(&fscache_n_cop_attr_changed); + if (ret < 0) + fscache_abort_object(object); + } + + _leave(""); +} + +/* + * notification that the attributes on an object have changed + */ +int __fscache_attr_changed(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + + _enter("%p", cookie); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + fscache_stat(&fscache_n_attr_changed); + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { + fscache_stat(&fscache_n_attr_changed_nomem); + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + fscache_operation_init(op, fscache_attr_changed_op, NULL); + op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_exclusive_op(object, op) < 0) + goto nobufs; + spin_unlock(&cookie->lock); + fscache_stat(&fscache_n_attr_changed_ok); + fscache_put_operation(op); + _leave(" = 0"); + return 0; + +nobufs: + spin_unlock(&cookie->lock); + kfree(op); + fscache_stat(&fscache_n_attr_changed_nobufs); + _leave(" = %d", -ENOBUFS); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_attr_changed); + +/* + * release a retrieval op 
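__fscache_maybe_release_page() is normally reached from a netfs ->releasepage() hook through the fscache_maybe_release_page() inline wrapper in include/linux/fscache.h. A minimal sketch, assuming a my_i_cookie() accessor that returns the inode's cookie (both the hook and the accessor are illustrative, not part of this patch):

/* illustrative netfs ->releasepage() hook (not part of this patch) */
static int my_release_page(struct page *page, gfp_t gfp)
{
	if (PageFsCache(page)) {
		struct fscache_cookie *cookie = my_i_cookie(page->mapping->host);

		/* keep the page while the cache is still writing it out */
		if (!fscache_maybe_release_page(cookie, page, gfp))
			return 0;
	}
	return 1;	/* the page may be released */
}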
reference + */ +static void fscache_release_retrieval_op(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + _enter("{OP%x}", op->op.debug_id); + + fscache_hist(fscache_retrieval_histogram, op->start_time); + if (op->context) + fscache_put_context(op->op.object->cookie, op->context); + + _leave(""); +} + +/* + * allocate a retrieval op + */ +static struct fscache_retrieval *fscache_alloc_retrieval( + struct address_space *mapping, + fscache_rw_complete_t end_io_func, + void *context) +{ + struct fscache_retrieval *op; + + /* allocate a retrieval operation and attempt to submit it */ + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) { + fscache_stat(&fscache_n_retrievals_nomem); + return NULL; + } + + fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); + op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); + op->mapping = mapping; + op->end_io_func = end_io_func; + op->context = context; + op->start_time = jiffies; + INIT_LIST_HEAD(&op->to_do); + return op; +} + +/* + * wait for a deferred lookup to complete + */ +static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) +{ + unsigned long jif; + + _enter(""); + + if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { + _leave(" = 0 [imm]"); + return 0; + } + + fscache_stat(&fscache_n_retrievals_wait); + + jif = jiffies; + if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) != 0) { + fscache_stat(&fscache_n_retrievals_intr); + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; + } + + ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); + + smp_rmb(); + fscache_hist(fscache_retrieval_delay_histogram, jif); + _leave(" = 0 [dly]"); + return 0; +} + +/* + * wait for an object to become active (or dead) + */ +static int fscache_wait_for_retrieval_activation(struct fscache_object *object, + struct fscache_retrieval *op, + atomic_t *stat_op_waits, + atomic_t *stat_object_dead) +{ + int ret; + + if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags)) + goto check_if_dead; + + _debug(">>> WT"); + fscache_stat(stat_op_waits); + if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit_interruptible, + TASK_INTERRUPTIBLE) < 0) { + ret = fscache_cancel_op(&op->op); + if (ret == 0) + return -ERESTARTSYS; + + /* it's been removed from the pending queue by another party, + * so we should get to run shortly */ + wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + fscache_wait_bit, TASK_UNINTERRUPTIBLE); + } + _debug("<<< GO"); + +check_if_dead: + if (unlikely(fscache_object_is_dead(object))) { + fscache_stat(stat_object_dead); + return -ENOBUFS; + } + return 0; +} + +/* + * read a page from the cache or allocate a block in which to store it + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * -ENODATA - no data available in the backing object for this block + * 0 - dispatched a read - it'll call end_io_func() when finished + */ +int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, 
FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, end_io_func, context); + if (!op) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + + atomic_inc(&object->n_reads); + set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto error; + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_page); + ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); + if (ret == 0) + ret = -ENODATA; + } else { + fscache_stat(&fscache_n_cop_read_or_alloc_page); + ret = object->cache->ops->read_or_alloc_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_page); + } + +error: + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_page); + +/* + * read a list of page from the cache or allocate a block in which to store + * them + * - we return: + * -ENOMEM - out of memory, some pages may be being read + * -ERESTARTSYS - interrupted, some pages may be being read + * -ENOBUFS - no backing object or space available in which to cache any + * pages not being read + * -ENODATA - no data available in the backing object for some or all of + * the pages + * 0 - dispatched a read on all pages + * + * end_io_func() will be called for each page read from the cache as it is + * finishes being read + * + * any pages for which a read is dispatched will be removed from pages and + * nr_pages + */ +int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,,%d,,,", cookie, *nr_pages); + + fscache_stat(&fscache_n_retrievals); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(*nr_pages, >, 0); + 
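__fscache_read_or_alloc_page() backs the netfs-facing fscache_read_or_alloc_page() wrapper. A minimal sketch of a ->readpage() built on it, assuming my_i_cookie() and my_readpage_from_server() as netfs helpers (neither is part of this patch); the switch mirrors the return codes documented above:

/* illustrative netfs read path (not part of this patch) */
static void my_read_complete(struct page *page, void *context, int error)
{
	/* a real netfs would fall back to a server read on error */
	if (!error)
		SetPageUptodate(page);
	unlock_page(page);
}

static int my_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int ret;

	ret = fscache_read_or_alloc_page(my_i_cookie(inode), page,
					 my_read_complete, NULL, GFP_KERNEL);
	switch (ret) {
	case 0:			/* read dispatched to the cache */
		return 0;
	case -ENOBUFS:		/* no backing object available */
	case -ENODATA:		/* block not yet in the cache */
		return my_readpage_from_server(file, page);
	default:
		unlock_page(page);
		return ret;
	}
}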
ASSERT(!list_empty(pages)); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(mapping, end_io_func, context); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + atomic_inc(&object->n_reads); + set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_retrieval_ops); + + /* pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure */ + fscache_get_context(object->cookie, op->context); + + /* we wait for the operation to become active, and then process it + * *here*, in this thread, and not in the thread pool */ + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto error; + + /* ask the cache to honour the operation */ + if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { + fscache_stat(&fscache_n_cop_allocate_pages); + ret = object->cache->ops->allocate_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_allocate_pages); + } else { + fscache_stat(&fscache_n_cop_read_or_alloc_pages); + ret = object->cache->ops->read_or_alloc_pages( + op, pages, nr_pages, gfp); + fscache_stat_d(&fscache_n_cop_read_or_alloc_pages); + } + +error: + if (ret == -ENOMEM) + fscache_stat(&fscache_n_retrievals_nomem); + else if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_retrievals_intr); + else if (ret == -ENODATA) + fscache_stat(&fscache_n_retrievals_nodata); + else if (ret < 0) + fscache_stat(&fscache_n_retrievals_nobufs); + else + fscache_stat(&fscache_n_retrievals_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_retrievals_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_read_or_alloc_pages); + +/* + * allocate a block in the cache on which to store a page + * - we return: + * -ENOMEM - out of memory, nothing done + * -ERESTARTSYS - interrupted + * -ENOBUFS - no backing object available in which to cache the block + * 0 - block allocated + */ +int __fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + int ret; + + _enter("%p,%p,,,", cookie, page); + + fscache_stat(&fscache_n_allocs); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(page->mapping, NULL, NULL); + if (!op) + return -ENOMEM; + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock; + spin_unlock(&cookie->lock); + + fscache_stat(&fscache_n_alloc_ops); + + ret = fscache_wait_for_retrieval_activation( + object, op, + __fscache_stat(&fscache_n_alloc_op_waits), + __fscache_stat(&fscache_n_allocs_object_dead)); + if 
(ret < 0) + goto error; + + /* ask the cache to honour the operation */ + fscache_stat(&fscache_n_cop_allocate_page); + ret = object->cache->ops->allocate_page(op, page, gfp); + fscache_stat_d(&fscache_n_cop_allocate_page); + +error: + if (ret == -ERESTARTSYS) + fscache_stat(&fscache_n_allocs_intr); + else if (ret < 0) + fscache_stat(&fscache_n_allocs_nobufs); + else + fscache_stat(&fscache_n_allocs_ok); + + fscache_put_retrieval(op); + _leave(" = %d", ret); + return ret; + +nobufs_unlock: + spin_unlock(&cookie->lock); + kfree(op); +nobufs: + fscache_stat(&fscache_n_allocs_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_alloc_page); + +/* + * release a write op reference + */ +static void fscache_release_write_op(struct fscache_operation *_op) +{ + _enter("{OP%x}", _op->debug_id); +} + +/* + * perform the background storage of a page into the cache + */ +static void fscache_write_op(struct fscache_operation *_op) +{ + struct fscache_storage *op = + container_of(_op, struct fscache_storage, op); + struct fscache_object *object = op->op.object; + struct fscache_cookie *cookie; + struct page *page; + unsigned n; + void *results[1]; + int ret; + + _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); + + spin_lock(&object->lock); + cookie = object->cookie; + + if (!fscache_object_is_active(object) || !cookie) { + spin_unlock(&object->lock); + _leave(""); + return; + } + + spin_lock(&cookie->stores_lock); + + fscache_stat(&fscache_n_store_calls); + + /* find a page to store */ + page = NULL; + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, + FSCACHE_COOKIE_PENDING_TAG); + if (n != 1) + goto superseded; + page = results[0]; + _debug("gang %d [%lx]", n, page->index); + if (page->index > op->store_limit) { + fscache_stat(&fscache_n_store_pages_over_limit); + goto superseded; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); + + fscache_stat(&fscache_n_store_pages); + fscache_stat(&fscache_n_cop_write_page); + ret = object->cache->ops->write_page(op, page); + fscache_stat_d(&fscache_n_cop_write_page); + fscache_end_page_write(object, page); + if (ret < 0) { + fscache_abort_object(object); + } else { + fscache_enqueue_operation(&op->op); + } + + _leave(""); + return; + +superseded: + /* this writer is going away and there aren't any more things to + * write */ + _debug("cease"); + spin_unlock(&cookie->stores_lock); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + spin_unlock(&object->lock); + _leave(""); +} + +/* + * request a page be stored in the cache + * - returns: + * -ENOMEM - out of memory, nothing done + * -ENOBUFS - no backing object available in which to cache the page + * 0 - dispatched a write - it'll call end_io_func() when finished + * + * if the cookie still has a backing object at this point, that object can be + * in one of a few states with respect to storage processing: + * + * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is + * set) + * + * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred + * fill op) + * + * (b) writes deferred till post-creation (mark page for writing and + * return immediately) + * + * (2) negative lookup, object created, initial fill being made from netfs + * (FSCACHE_COOKIE_INITIAL_FILL is set) + * + * (a) fill point not yet reached this page (mark page 
for writing and + * return) + * + * (b) fill point passed this page (queue op to store this page) + * + * (3) object extant (queue op to store this page) + * + * any other state is invalid + */ +int __fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + struct fscache_storage *op; + struct fscache_object *object; + int ret; + + _enter("%p,%x,", cookie, (u32) page->flags); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERT(PageFsCache(page)); + + fscache_stat(&fscache_n_stores); + + op = kzalloc(sizeof(*op), GFP_NOIO); + if (!op) + goto nomem; + + fscache_operation_init(&op->op, fscache_write_op, + fscache_release_write_op); + op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); + + ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + if (ret < 0) + goto nomem_free; + + ret = -ENOBUFS; + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto nobufs; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto nobufs; + + /* add the page to the pending-storage radix tree on the backing + * object */ + spin_lock(&object->lock); + spin_lock(&cookie->stores_lock); + + _debug("store limit %llx", (unsigned long long) object->store_limit); + + ret = radix_tree_insert(&cookie->stores, page->index, page); + if (ret < 0) { + if (ret == -EEXIST) + goto already_queued; + _debug("insert failed %d", ret); + goto nobufs_unlock_obj; + } + + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); + page_cache_get(page); + + /* we only want one writer at a time, but we do need to queue new + * writers after exclusive ops */ + if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) + goto already_pending; + + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); + + op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); + op->store_limit = object->store_limit; + + if (fscache_submit_op(object, &op->op) < 0) + goto submit_failed; + + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + fscache_stat(&fscache_n_store_ops); + fscache_stat(&fscache_n_stores_ok); + + /* the work queue now carries its own ref on the object */ + fscache_put_operation(&op->op); + _leave(" = 0"); + return 0; + +already_queued: + fscache_stat(&fscache_n_stores_again); +already_pending: + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_ok); + _leave(" = 0"); + return 0; + +submit_failed: + spin_lock(&cookie->stores_lock); + radix_tree_delete(&cookie->stores, page->index); + spin_unlock(&cookie->stores_lock); + page_cache_release(page); + ret = -ENOBUFS; + goto nobufs; + +nobufs_unlock_obj: + spin_unlock(&cookie->stores_lock); + spin_unlock(&object->lock); +nobufs: + spin_unlock(&cookie->lock); + radix_tree_preload_end(); + kfree(op); + fscache_stat(&fscache_n_stores_nobufs); + _leave(" = -ENOBUFS"); + return -ENOBUFS; + +nomem_free: + kfree(op); +nomem: + fscache_stat(&fscache_n_stores_oom); + _leave(" = -ENOMEM"); + return -ENOMEM; +} +EXPORT_SYMBOL(__fscache_write_page); + +/* + * remove a page from the cache + */ +void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) +{ + struct fscache_object *object; + + _enter(",%p", page); + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + ASSERTCMP(page, !=, NULL); + + fscache_stat(&fscache_n_uncaches); 
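__fscache_write_page() pairs with __fscache_uncache_page() in the usual netfs pattern: hand a freshly filled page to the cache and, if the store cannot be queued, uncache it again so that reclaim and truncation never wait on a write that will not happen. A minimal sketch via the inline wrappers in include/linux/fscache.h, assuming a my_i_cookie() helper (illustrative, not part of this patch):

/* illustrative "store a page after reading it from the server" helper */
static void my_readpage_to_cache(struct inode *inode, struct page *page)
{
	int ret;

	ret = fscache_write_page(my_i_cookie(inode), page, GFP_KERNEL);
	if (ret != 0)
		/* dropping the store also clears PG_fscache on the page */
		fscache_uncache_page(my_i_cookie(inode), page);
}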
+ + /* cache withdrawal may beat us to it */ + if (!PageFsCache(page)) + goto done; + + /* get the object */ + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) { + ClearPageFsCache(page); + goto done_unlock; + } + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + /* there might now be stuff on disk we could read */ + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + /* only invoke the cache backend if we managed to mark the page + * uncached here; this deals with synchronisation vs withdrawal */ + if (TestClearPageFsCache(page) && + object->cache->ops->uncache_page) { + /* the cache backend releases the cookie lock */ + fscache_stat(&fscache_n_cop_uncache_page); + object->cache->ops->uncache_page(object, page); + fscache_stat_d(&fscache_n_cop_uncache_page); + goto done; + } + +done_unlock: + spin_unlock(&cookie->lock); +done: + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_page); + +/** + * fscache_mark_pages_cached - Mark pages as being cached + * @op: The retrieval op pages are being marked for + * @pagevec: The pages to be marked + * + * Mark a bunch of netfs pages as being cached. After this is called, + * the netfs must call fscache_uncache_page() to remove the mark. + */ +void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec) +{ + struct fscache_cookie *cookie = op->op.object->cookie; + unsigned long loop; + +#ifdef CONFIG_FSCACHE_STATS + atomic_add(pagevec->nr, &fscache_n_marks); +#endif + + for (loop = 0; loop < pagevec->nr; loop++) { + struct page *page = pagevec->pages[loop]; + + _debug("- mark %p{%lx}", page, page->index); + if (TestSetPageFsCache(page)) { + static bool once_only; + if (!once_only) { + once_only = true; + printk(KERN_WARNING "FS-Cache:" + " Cookie type %s marked page %lx" + " multiple times\n", + cookie->def->name, page->index); + } + } + } + + if (cookie->def->mark_pages_cached) + cookie->def->mark_pages_cached(cookie->netfs_data, + op->mapping, pagevec); + pagevec_reinit(pagevec); +} +EXPORT_SYMBOL(fscache_mark_pages_cached); + +/* + * Uncache all the pages in an inode that are marked PG_fscache, assuming them + * to be associated with the given cookie. + */ +void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t next; + int i; + + _enter("%p,%p", cookie, inode); + + if (!mapping || mapping->nrpages == 0) { + _leave(" [no pages]"); + return; + } + + pagevec_init(&pvec, 0); + next = 0; + do { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + break; + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + next = page->index; + if (PageFsCache(page)) { + __fscache_wait_on_page_write(cookie, page); + __fscache_uncache_page(cookie, page); + } + } + pagevec_release(&pvec); + cond_resched(); + } while (++next); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_all_inode_pages); diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c new file mode 100644 index 00000000..1d9e4951 --- /dev/null +++ b/fs/fscache/proc.c @@ -0,0 +1,81 @@ +/* FS-Cache statistics viewing interface + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define FSCACHE_DEBUG_LEVEL OPERATION +#include +#include +#include +#include "internal.h" + +/* + * initialise the /proc/fs/fscache/ directory + */ +int __init fscache_proc_init(void) +{ + _enter(""); + + if (!proc_mkdir("fs/fscache", NULL)) + goto error_dir; + +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, + &fscache_stats_fops)) + goto error_stats; +#endif + +#ifdef CONFIG_FSCACHE_HISTOGRAM + if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_fops)) + goto error_histogram; +#endif + +#ifdef CONFIG_FSCACHE_OBJECT_LIST + if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, + &fscache_objlist_fops)) + goto error_objects; +#endif + + _leave(" = 0"); + return 0; + +#ifdef CONFIG_FSCACHE_OBJECT_LIST +error_objects: +#endif +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +error_histogram: +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +error_stats: +#endif + remove_proc_entry("fs/fscache", NULL); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/fscache/ directory + */ +void fscache_proc_cleanup(void) +{ +#ifdef CONFIG_FSCACHE_OBJECT_LIST + remove_proc_entry("fs/fscache/objects", NULL); +#endif +#ifdef CONFIG_FSCACHE_HISTOGRAM + remove_proc_entry("fs/fscache/histogram", NULL); +#endif +#ifdef CONFIG_FSCACHE_STATS + remove_proc_entry("fs/fscache/stats", NULL); +#endif + remove_proc_entry("fs/fscache", NULL); +} diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c new file mode 100644 index 00000000..4765190d --- /dev/null +++ b/fs/fscache/stats.c @@ -0,0 +1,280 @@ +/* FS-Cache statistics + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define FSCACHE_DEBUG_LEVEL THREAD +#include +#include +#include +#include "internal.h" + +/* + * operation counters + */ +atomic_t fscache_n_op_pend; +atomic_t fscache_n_op_run; +atomic_t fscache_n_op_enqueue; +atomic_t fscache_n_op_requeue; +atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_release; +atomic_t fscache_n_op_gc; +atomic_t fscache_n_op_cancelled; +atomic_t fscache_n_op_rejected; + +atomic_t fscache_n_attr_changed; +atomic_t fscache_n_attr_changed_ok; +atomic_t fscache_n_attr_changed_nobufs; +atomic_t fscache_n_attr_changed_nomem; +atomic_t fscache_n_attr_changed_calls; + +atomic_t fscache_n_allocs; +atomic_t fscache_n_allocs_ok; +atomic_t fscache_n_allocs_wait; +atomic_t fscache_n_allocs_nobufs; +atomic_t fscache_n_allocs_intr; +atomic_t fscache_n_allocs_object_dead; +atomic_t fscache_n_alloc_ops; +atomic_t fscache_n_alloc_op_waits; + +atomic_t fscache_n_retrievals; +atomic_t fscache_n_retrievals_ok; +atomic_t fscache_n_retrievals_wait; +atomic_t fscache_n_retrievals_nodata; +atomic_t fscache_n_retrievals_nobufs; +atomic_t fscache_n_retrievals_intr; +atomic_t fscache_n_retrievals_nomem; +atomic_t fscache_n_retrievals_object_dead; +atomic_t fscache_n_retrieval_ops; +atomic_t fscache_n_retrieval_op_waits; + +atomic_t fscache_n_stores; +atomic_t fscache_n_stores_ok; +atomic_t fscache_n_stores_again; +atomic_t fscache_n_stores_nobufs; +atomic_t fscache_n_stores_oom; +atomic_t fscache_n_store_ops; +atomic_t fscache_n_store_calls; +atomic_t fscache_n_store_pages; +atomic_t fscache_n_store_radix_deletes; +atomic_t fscache_n_store_pages_over_limit; + +atomic_t fscache_n_store_vmscan_not_storing; +atomic_t fscache_n_store_vmscan_gone; +atomic_t fscache_n_store_vmscan_busy; +atomic_t fscache_n_store_vmscan_cancelled; + +atomic_t fscache_n_marks; +atomic_t fscache_n_uncaches; + +atomic_t fscache_n_acquires; +atomic_t fscache_n_acquires_null; +atomic_t fscache_n_acquires_no_cache; +atomic_t fscache_n_acquires_ok; +atomic_t fscache_n_acquires_nobufs; +atomic_t fscache_n_acquires_oom; + +atomic_t fscache_n_updates; +atomic_t fscache_n_updates_null; +atomic_t fscache_n_updates_run; + +atomic_t fscache_n_relinquishes; +atomic_t fscache_n_relinquishes_null; +atomic_t fscache_n_relinquishes_waitcrt; +atomic_t fscache_n_relinquishes_retire; + +atomic_t fscache_n_cookie_index; +atomic_t fscache_n_cookie_data; +atomic_t fscache_n_cookie_special; + +atomic_t fscache_n_object_alloc; +atomic_t fscache_n_object_no_alloc; +atomic_t fscache_n_object_lookups; +atomic_t fscache_n_object_lookups_negative; +atomic_t fscache_n_object_lookups_positive; +atomic_t fscache_n_object_lookups_timed_out; +atomic_t fscache_n_object_created; +atomic_t fscache_n_object_avail; +atomic_t fscache_n_object_dead; + +atomic_t fscache_n_checkaux_none; +atomic_t fscache_n_checkaux_okay; +atomic_t fscache_n_checkaux_update; +atomic_t fscache_n_checkaux_obsolete; + +atomic_t fscache_n_cop_alloc_object; +atomic_t fscache_n_cop_lookup_object; +atomic_t fscache_n_cop_lookup_complete; +atomic_t fscache_n_cop_grab_object; +atomic_t fscache_n_cop_update_object; +atomic_t fscache_n_cop_drop_object; +atomic_t fscache_n_cop_put_object; +atomic_t fscache_n_cop_sync_cache; +atomic_t fscache_n_cop_attr_changed; +atomic_t fscache_n_cop_read_or_alloc_page; +atomic_t fscache_n_cop_read_or_alloc_pages; +atomic_t fscache_n_cop_allocate_page; +atomic_t fscache_n_cop_allocate_pages; +atomic_t fscache_n_cop_write_page; +atomic_t fscache_n_cop_uncache_page; +atomic_t fscache_n_cop_dissociate_pages; + +/* + * display the 
general statistics + */ +static int fscache_stats_show(struct seq_file *m, void *v) +{ + seq_puts(m, "FS-Cache statistics\n"); + + seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n", + atomic_read(&fscache_n_cookie_index), + atomic_read(&fscache_n_cookie_data), + atomic_read(&fscache_n_cookie_special)); + + seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n", + atomic_read(&fscache_n_object_alloc), + atomic_read(&fscache_n_object_no_alloc), + atomic_read(&fscache_n_object_avail), + atomic_read(&fscache_n_object_dead)); + seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n", + atomic_read(&fscache_n_checkaux_none), + atomic_read(&fscache_n_checkaux_okay), + atomic_read(&fscache_n_checkaux_update), + atomic_read(&fscache_n_checkaux_obsolete)); + + seq_printf(m, "Pages : mrk=%u unc=%u\n", + atomic_read(&fscache_n_marks), + atomic_read(&fscache_n_uncaches)); + + seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u" + " oom=%u\n", + atomic_read(&fscache_n_acquires), + atomic_read(&fscache_n_acquires_null), + atomic_read(&fscache_n_acquires_no_cache), + atomic_read(&fscache_n_acquires_ok), + atomic_read(&fscache_n_acquires_nobufs), + atomic_read(&fscache_n_acquires_oom)); + + seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n", + atomic_read(&fscache_n_object_lookups), + atomic_read(&fscache_n_object_lookups_negative), + atomic_read(&fscache_n_object_lookups_positive), + atomic_read(&fscache_n_object_created), + atomic_read(&fscache_n_object_lookups_timed_out)); + + seq_printf(m, "Updates: n=%u nul=%u run=%u\n", + atomic_read(&fscache_n_updates), + atomic_read(&fscache_n_updates_null), + atomic_read(&fscache_n_updates_run)); + + seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n", + atomic_read(&fscache_n_relinquishes), + atomic_read(&fscache_n_relinquishes_null), + atomic_read(&fscache_n_relinquishes_waitcrt), + atomic_read(&fscache_n_relinquishes_retire)); + + seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", + atomic_read(&fscache_n_attr_changed), + atomic_read(&fscache_n_attr_changed_ok), + atomic_read(&fscache_n_attr_changed_nobufs), + atomic_read(&fscache_n_attr_changed_nomem), + atomic_read(&fscache_n_attr_changed_calls)); + + seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n", + atomic_read(&fscache_n_allocs), + atomic_read(&fscache_n_allocs_ok), + atomic_read(&fscache_n_allocs_wait), + atomic_read(&fscache_n_allocs_nobufs), + atomic_read(&fscache_n_allocs_intr)); + seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n", + atomic_read(&fscache_n_alloc_ops), + atomic_read(&fscache_n_alloc_op_waits), + atomic_read(&fscache_n_allocs_object_dead)); + + seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" + " int=%u oom=%u\n", + atomic_read(&fscache_n_retrievals), + atomic_read(&fscache_n_retrievals_ok), + atomic_read(&fscache_n_retrievals_wait), + atomic_read(&fscache_n_retrievals_nodata), + atomic_read(&fscache_n_retrievals_nobufs), + atomic_read(&fscache_n_retrievals_intr), + atomic_read(&fscache_n_retrievals_nomem)); + seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n", + atomic_read(&fscache_n_retrieval_ops), + atomic_read(&fscache_n_retrieval_op_waits), + atomic_read(&fscache_n_retrievals_object_dead)); + + seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", + atomic_read(&fscache_n_stores), + atomic_read(&fscache_n_stores_ok), + atomic_read(&fscache_n_stores_again), + atomic_read(&fscache_n_stores_nobufs), + atomic_read(&fscache_n_stores_oom)); + seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n", + atomic_read(&fscache_n_store_ops), + 
atomic_read(&fscache_n_store_calls), + atomic_read(&fscache_n_store_pages), + atomic_read(&fscache_n_store_radix_deletes), + atomic_read(&fscache_n_store_pages_over_limit)); + + seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n", + atomic_read(&fscache_n_store_vmscan_not_storing), + atomic_read(&fscache_n_store_vmscan_gone), + atomic_read(&fscache_n_store_vmscan_busy), + atomic_read(&fscache_n_store_vmscan_cancelled)); + + seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", + atomic_read(&fscache_n_op_pend), + atomic_read(&fscache_n_op_run), + atomic_read(&fscache_n_op_enqueue), + atomic_read(&fscache_n_op_cancelled), + atomic_read(&fscache_n_op_rejected)); + seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_deferred_release), + atomic_read(&fscache_n_op_release), + atomic_read(&fscache_n_op_gc)); + + seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n", + atomic_read(&fscache_n_cop_alloc_object), + atomic_read(&fscache_n_cop_lookup_object), + atomic_read(&fscache_n_cop_lookup_complete), + atomic_read(&fscache_n_cop_grab_object)); + seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n", + atomic_read(&fscache_n_cop_update_object), + atomic_read(&fscache_n_cop_drop_object), + atomic_read(&fscache_n_cop_put_object), + atomic_read(&fscache_n_cop_attr_changed), + atomic_read(&fscache_n_cop_sync_cache)); + seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n", + atomic_read(&fscache_n_cop_read_or_alloc_page), + atomic_read(&fscache_n_cop_read_or_alloc_pages), + atomic_read(&fscache_n_cop_allocate_page), + atomic_read(&fscache_n_cop_allocate_pages), + atomic_read(&fscache_n_cop_write_page), + atomic_read(&fscache_n_cop_uncache_page), + atomic_read(&fscache_n_cop_dissociate_pages)); + return 0; +} + +/* + * open "/proc/fs/fscache/stats" allowing provision of a statistical summary + */ +static int fscache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, fscache_stats_show, NULL); +} + +const struct file_operations fscache_stats_fops = { + .owner = THIS_MODULE, + .open = fscache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig new file mode 100644 index 00000000..0cf160a9 --- /dev/null +++ b/fs/fuse/Kconfig @@ -0,0 +1,15 @@ +config FUSE_FS + tristate "FUSE (Filesystem in Userspace) support" + help + With FUSE it is possible to implement a fully functional filesystem + in a userspace program. + + There's also companion library: libfuse. This library along with + utilities is available from the FUSE homepage: + + + See for more information. + See for needed library/utility version. + + If you want to develop a userspace FS, or if you want to use + a filesystem based on FUSE, answer Y or M. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile new file mode 100644 index 00000000..e95eeb44 --- /dev/null +++ b/fs/fuse/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the FUSE filesystem. +# + +obj-$(CONFIG_FUSE_FS) += fuse.o +obj-$(CONFIG_CUSE) += cuse.o + +fuse-objs := dev.o dir.o file.o inode.o control.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c new file mode 100644 index 00000000..85542a7d --- /dev/null +++ b/fs/fuse/control.c @@ -0,0 +1,359 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. 
+*/ + +#include "fuse_i.h" + +#include +#include + +#define FUSE_CTL_SUPER_MAGIC 0x65735543 + +/* + * This is non-NULL when the single instance of the control filesystem + * exists. Protected by fuse_mutex + */ +static struct super_block *fuse_control_sb; + +static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) +{ + struct fuse_conn *fc; + mutex_lock(&fuse_mutex); + fc = file->f_path.dentry->d_inode->i_private; + if (fc) + fc = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + return fc; +} + +static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fuse_abort_conn(fc); + fuse_conn_put(fc); + } + return count; +} + +static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[32]; + size_t size; + + if (!*ppos) { + long value; + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + value = atomic_read(&fc->num_waiting); + file->private_data = (void *)value; + fuse_conn_put(fc); + } + size = sprintf(tmp, "%ld\n", (long)file->private_data); + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos, unsigned val) +{ + char tmp[32]; + size_t size = sprintf(tmp, "%u\n", val); + + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, unsigned *val, + unsigned global_limit) +{ + unsigned long t; + char tmp[32]; + unsigned limit = (1 << 16) - 1; + int err; + + if (*ppos || count >= sizeof(tmp) - 1) + return -EINVAL; + + if (copy_from_user(tmp, buf, count)) + return -EINVAL; + + tmp[count] = '\0'; + + err = strict_strtoul(tmp, 0, &t); + if (err) + return err; + + if (!capable(CAP_SYS_ADMIN)) + limit = min(limit, global_limit); + + if (t > limit) + return -EINVAL; + + *val = t; + + return count; +} + +static ssize_t fuse_conn_max_background_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->max_background; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_max_background_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_bgreq); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->max_background = val; + fuse_conn_put(fc); + } + } + + return ret; +} + +static ssize_t fuse_conn_congestion_threshold_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->congestion_threshold; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_congestion_threshold_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_congthresh); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->congestion_threshold = val; + fuse_conn_put(fc); + } + } + + return 
ret; +} + +static const struct file_operations fuse_ctl_abort_ops = { + .open = nonseekable_open, + .write = fuse_conn_abort_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_ctl_waiting_ops = { + .open = nonseekable_open, + .read = fuse_conn_waiting_read, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_max_background_ops = { + .open = nonseekable_open, + .read = fuse_conn_max_background_read, + .write = fuse_conn_max_background_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_congestion_threshold_ops = { + .open = nonseekable_open, + .read = fuse_conn_congestion_threshold_read, + .write = fuse_conn_congestion_threshold_write, + .llseek = no_llseek, +}; + +static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, + struct fuse_conn *fc, + const char *name, + int mode, int nlink, + const struct inode_operations *iop, + const struct file_operations *fop) +{ + struct dentry *dentry; + struct inode *inode; + + BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES); + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + fc->ctl_dentry[fc->ctl_ndents++] = dentry; + inode = new_inode(fuse_control_sb); + if (!inode) + return NULL; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = fc->user_id; + inode->i_gid = fc->group_id; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* setting ->i_op to NULL is not allowed */ + if (iop) + inode->i_op = iop; + inode->i_fop = fop; + inode->i_nlink = nlink; + inode->i_private = fc; + d_add(dentry, inode); + return dentry; +} + +/* + * Add a connection to the control filesystem (if it exists). Caller + * must hold fuse_mutex + */ +int fuse_ctl_add_conn(struct fuse_conn *fc) +{ + struct dentry *parent; + char name[32]; + + if (!fuse_control_sb) + return 0; + + parent = fuse_control_sb->s_root; + inc_nlink(parent->d_inode); + sprintf(name, "%u", fc->dev); + parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, + &simple_dir_inode_operations, + &simple_dir_operations); + if (!parent) + goto err; + + if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, + NULL, &fuse_ctl_waiting_ops) || + !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, + NULL, &fuse_ctl_abort_ops) || + !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, + 1, NULL, &fuse_conn_max_background_ops) || + !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", + S_IFREG | 0600, 1, NULL, + &fuse_conn_congestion_threshold_ops)) + goto err; + + return 0; + + err: + fuse_ctl_remove_conn(fc); + return -ENOMEM; +} + +/* + * Remove a connection from the control filesystem (if it exists). 
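The per-connection files created by fuse_ctl_add_conn() above are driven entirely from userspace. As a rough illustration (the fusectl mount point, typically /sys/fs/fuse/connections, and the connection number are assumptions, not dictated by this patch), aborting a connection just means writing to its "abort" file:

/* illustrative userspace helper; any write to "abort" ends up in
 * fuse_conn_abort_write() above */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int abort_fuse_conn(const char *conn_dir /* e.g. "/sys/fs/fuse/connections/42" */)
{
	char path[256];
	int fd;

	snprintf(path, sizeof(path), "%s/abort", conn_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}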
+ * Caller must hold fuse_mutex + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc) +{ + int i; + + if (!fuse_control_sb) + return; + + for (i = fc->ctl_ndents - 1; i >= 0; i--) { + struct dentry *dentry = fc->ctl_dentry[i]; + dentry->d_inode->i_private = NULL; + d_drop(dentry); + dput(dentry); + } + drop_nlink(fuse_control_sb->s_root->d_inode); +} + +static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct tree_descr empty_descr = {""}; + struct fuse_conn *fc; + int err; + + err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr); + if (err) + return err; + + mutex_lock(&fuse_mutex); + BUG_ON(fuse_control_sb); + fuse_control_sb = sb; + list_for_each_entry(fc, &fuse_conn_list, entry) { + err = fuse_ctl_add_conn(fc); + if (err) { + fuse_control_sb = NULL; + mutex_unlock(&fuse_mutex); + return err; + } + } + mutex_unlock(&fuse_mutex); + + return 0; +} + +static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super); +} + +static void fuse_ctl_kill_sb(struct super_block *sb) +{ + struct fuse_conn *fc; + + mutex_lock(&fuse_mutex); + fuse_control_sb = NULL; + list_for_each_entry(fc, &fuse_conn_list, entry) + fc->ctl_ndents = 0; + mutex_unlock(&fuse_mutex); + + kill_litter_super(sb); +} + +static struct file_system_type fuse_ctl_fs_type = { + .owner = THIS_MODULE, + .name = "fusectl", + .mount = fuse_ctl_mount, + .kill_sb = fuse_ctl_kill_sb, +}; + +int __init fuse_ctl_init(void) +{ + return register_filesystem(&fuse_ctl_fs_type); +} + +void fuse_ctl_cleanup(void) +{ + unregister_filesystem(&fuse_ctl_fs_type); +} diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c new file mode 100644 index 00000000..b6cca47f --- /dev/null +++ b/fs/fuse/cuse.c @@ -0,0 +1,620 @@ +/* + * CUSE: Character device in Userspace + * + * Copyright (C) 2008-2009 SUSE Linux Products GmbH + * Copyright (C) 2008-2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * CUSE enables character devices to be implemented from userland much + * like FUSE allows filesystems. On initialization /dev/cuse is + * created. By opening the file and replying to the CUSE_INIT request + * userland CUSE server can create a character device. After that the + * operation is very similar to FUSE. + * + * A CUSE instance involves the following objects. + * + * cuse_conn : contains fuse_conn and serves as bonding structure + * channel : file handle connected to the userland CUSE server + * cdev : the implemented character device + * dev : generic device for cdev + * + * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with + * devices, it's called 'channel' to reduce confusion. + * + * channel determines when the character device dies. When channel is + * closed, everything begins to destruct. The cuse_conn is taken off + * the lookup table preventing further access from cdev, cdev and + * generic device are removed and the base reference of cuse_conn is + * put. + * + * On each open, the matching cuse_conn is looked up and if found an + * additional reference is taken which is released when the file is + * closed. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fuse_i.h" + +#define CUSE_CONNTBL_LEN 64 + +struct cuse_conn { + struct list_head list; /* linked on cuse_conntbl */ + struct fuse_conn fc; /* fuse connection */ + struct cdev *cdev; /* associated character device */ + struct device *dev; /* device representing @cdev */ + + /* init parameters, set once during initialization */ + bool unrestricted_ioctl; +}; + +static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ +static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; +static struct class *cuse_class; + +static struct cuse_conn *fc_to_cc(struct fuse_conn *fc) +{ + return container_of(fc, struct cuse_conn, fc); +} + +static struct list_head *cuse_conntbl_head(dev_t devt) +{ + return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN]; +} + + +/************************************************************************** + * CUSE frontend operations + * + * These are file operations for the character device. + * + * On open, CUSE opens a file from the FUSE mnt and stores it to + * private_data of the open file. All other ops call FUSE ops on the + * FUSE file. + */ + +static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + loff_t pos = 0; + + return fuse_direct_io(file, buf, count, &pos, 0); +} + +static ssize_t cuse_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos = 0; + /* + * No locking or generic_write_checks(), the server is + * responsible for locking and sanity checks. + */ + return fuse_direct_io(file, buf, count, &pos, 1); +} + +static int cuse_open(struct inode *inode, struct file *file) +{ + dev_t devt = inode->i_cdev->dev; + struct cuse_conn *cc = NULL, *pos; + int rc; + + /* look up and get the connection */ + spin_lock(&cuse_lock); + list_for_each_entry(pos, cuse_conntbl_head(devt), list) + if (pos->dev->devt == devt) { + fuse_conn_get(&pos->fc); + cc = pos; + break; + } + spin_unlock(&cuse_lock); + + /* dead? */ + if (!cc) + return -ENODEV; + + /* + * Generic permission check is already done against the chrdev + * file, proceed to open. 
+ */ + rc = fuse_do_open(&cc->fc, 0, file, 0); + if (rc) + fuse_conn_put(&cc->fc); + return rc; +} + +static int cuse_release(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + + fuse_sync_release(ff, file->f_flags); + fuse_conn_put(fc); + + return 0; +} + +static long cuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = 0; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = FUSE_IOCTL_COMPAT; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static const struct file_operations cuse_frontend_fops = { + .owner = THIS_MODULE, + .read = cuse_read, + .write = cuse_write, + .open = cuse_open, + .release = cuse_release, + .unlocked_ioctl = cuse_file_ioctl, + .compat_ioctl = cuse_file_compat_ioctl, + .poll = fuse_file_poll, + .llseek = noop_llseek, +}; + + +/************************************************************************** + * CUSE channel initialization and destruction + */ + +struct cuse_devinfo { + const char *name; +}; + +/** + * cuse_parse_one - parse one key=value pair + * @pp: i/o parameter for the current position + * @end: points to one past the end of the packed string + * @keyp: out parameter for key + * @valp: out parameter for value + * + * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends + * at @end - 1. This function parses one pair and set *@keyp to the + * start of the key and *@valp to the start of the value. Note that + * the original string is modified such that the key string is + * terminated with '\0'. *@pp is updated to point to the next string. + * + * RETURNS: + * 1 on successful parse, 0 on EOF, -errno on failure. + */ +static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) +{ + char *p = *pp; + char *key, *val; + + while (p < end && *p == '\0') + p++; + if (p == end) + return 0; + + if (end[-1] != '\0') { + printk(KERN_ERR "CUSE: info not properly terminated\n"); + return -EINVAL; + } + + key = val = p; + p += strlen(p); + + if (valp) { + strsep(&val, "="); + if (!val) + val = key + strlen(key); + key = strstrip(key); + val = strstrip(val); + } else + key = strstrip(key); + + if (!strlen(key)) { + printk(KERN_ERR "CUSE: zero length info key specified\n"); + return -EINVAL; + } + + *pp = p; + *keyp = key; + if (valp) + *valp = val; + + return 1; +} + +/** + * cuse_parse_dev_info - parse device info + * @p: device info string + * @len: length of device info string + * @devinfo: out parameter for parsed device info + * + * Parse @p to extract device info and store it into @devinfo. String + * pointed to by @p is modified by parsing and @devinfo points into + * them, so @p shouldn't be freed while @devinfo is in use. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) +{ + char *end = p + len; + char *key, *val; + int rc; + + while (true) { + rc = cuse_parse_one(&p, end, &key, &val); + if (rc < 0) + return rc; + if (!rc) + break; + if (strcmp(key, "DEVNAME") == 0) + devinfo->name = val; + else + printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n", + key); + } + + if (!devinfo->name || !strlen(devinfo->name)) { + printk(KERN_ERR "CUSE: DEVNAME unspecified\n"); + return -EINVAL; + } + + return 0; +} + +static void cuse_gendev_release(struct device *dev) +{ + kfree(dev); +} + +/** + * cuse_process_init_reply - finish initializing CUSE channel + * + * This function creates the character device and sets up all the + * required data structures for it. Please read the comment at the + * top of this file for high level overview. + */ +static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + struct cuse_conn *cc = fc_to_cc(fc); + struct cuse_init_out *arg = req->out.args[0].value; + struct page *page = req->pages[0]; + struct cuse_devinfo devinfo = { }; + struct device *dev; + struct cdev *cdev; + dev_t devt; + int rc; + + if (req->out.h.error || + arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { + goto err; + } + + fc->minor = arg->minor; + fc->max_read = max_t(unsigned, arg->max_read, 4096); + fc->max_write = max_t(unsigned, arg->max_write, 4096); + + /* parse init reply */ + cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; + + rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, + &devinfo); + if (rc) + goto err; + + /* determine and reserve devt */ + devt = MKDEV(arg->dev_major, arg->dev_minor); + if (!MAJOR(devt)) + rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name); + else + rc = register_chrdev_region(devt, 1, devinfo.name); + if (rc) { + printk(KERN_ERR "CUSE: failed to register chrdev region\n"); + goto err; + } + + /* devt determined, create device */ + rc = -ENOMEM; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + goto err_region; + + device_initialize(dev); + dev_set_uevent_suppress(dev, 1); + dev->class = cuse_class; + dev->devt = devt; + dev->release = cuse_gendev_release; + dev_set_drvdata(dev, cc); + dev_set_name(dev, "%s", devinfo.name); + + rc = device_add(dev); + if (rc) + goto err_device; + + /* register cdev */ + rc = -ENOMEM; + cdev = cdev_alloc(); + if (!cdev) + goto err_device; + + cdev->owner = THIS_MODULE; + cdev->ops = &cuse_frontend_fops; + + rc = cdev_add(cdev, devt, 1); + if (rc) + goto err_cdev; + + cc->dev = dev; + cc->cdev = cdev; + + /* make the device available */ + spin_lock(&cuse_lock); + list_add(&cc->list, cuse_conntbl_head(devt)); + spin_unlock(&cuse_lock); + + /* announce device availability */ + dev_set_uevent_suppress(dev, 0); + kobject_uevent(&dev->kobj, KOBJ_ADD); +out: + kfree(arg); + __free_page(page); + return; + +err_cdev: + cdev_del(cdev); +err_device: + put_device(dev); +err_region: + unregister_chrdev_region(devt, 1); +err: + fc->conn_error = 1; + goto out; +} + +static int cuse_send_init(struct cuse_conn *cc) +{ + int rc; + struct fuse_req *req; + struct page *page; + struct fuse_conn *fc = &cc->fc; + struct cuse_init_in *arg; + void *outarg; + + BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto err; + } + + rc = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto err_put_req; + + outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); + if 
(!outarg) + goto err_free_page; + + arg = &req->misc.cuse_init_in; + arg->major = FUSE_KERNEL_VERSION; + arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->flags |= CUSE_UNRESTRICTED_IOCTL; + req->in.h.opcode = CUSE_INIT; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct cuse_init_in); + req->in.args[0].value = arg; + req->out.numargs = 2; + req->out.args[0].size = sizeof(struct cuse_init_out); + req->out.args[0].value = outarg; + req->out.args[1].size = CUSE_INIT_INFO_MAX; + req->out.argvar = 1; + req->out.argpages = 1; + req->pages[0] = page; + req->num_pages = 1; + req->end = cuse_process_init_reply; + fuse_request_send_background(fc, req); + + return 0; + +err_free_page: + __free_page(page); +err_put_req: + fuse_put_request(fc, req); +err: + return rc; +} + +static void cuse_fc_release(struct fuse_conn *fc) +{ + struct cuse_conn *cc = fc_to_cc(fc); + kfree(cc); +} + +/** + * cuse_channel_open - open method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being opened + * + * Userland CUSE server can create a CUSE device by opening /dev/cuse + * and replying to the initialization request kernel sends. This + * function is responsible for handling CUSE device initialization. + * Because the fd opened by this function is used during + * initialization, this function only creates cuse_conn and sends + * init. The rest is delegated to a kthread. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_open(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc; + int rc; + + /* set up cuse_conn */ + cc = kzalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + + fuse_conn_init(&cc->fc); + + INIT_LIST_HEAD(&cc->list); + cc->fc.release = cuse_fc_release; + + cc->fc.connected = 1; + cc->fc.blocked = 0; + rc = cuse_send_init(cc); + if (rc) { + fuse_conn_put(&cc->fc); + return rc; + } + file->private_data = &cc->fc; /* channel owns base reference to cc */ + + return 0; +} + +/** + * cuse_channel_release - release method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being closed + * + * Disconnect the channel, deregister CUSE device and initiate + * destruction by putting the default reference. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_release(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc = fc_to_cc(file->private_data); + int rc; + + /* remove from the conntbl, no more access from this point on */ + spin_lock(&cuse_lock); + list_del_init(&cc->list); + spin_unlock(&cuse_lock); + + /* remove device */ + if (cc->dev) + device_unregister(cc->dev); + if (cc->cdev) { + unregister_chrdev_region(cc->cdev->dev, 1); + cdev_del(cc->cdev); + } + + /* kill connection and shutdown channel */ + fuse_conn_kill(&cc->fc); + rc = fuse_dev_release(inode, file); /* puts the base reference */ + + return rc; +} + +static struct file_operations cuse_channel_fops; /* initialized during init */ + + +/************************************************************************** + * Misc stuff and module initializatiion + * + * CUSE exports the same set of attributes to sysfs as fusectl. 
+ */ + +static ssize_t cuse_class_waiting_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); +} + +static ssize_t cuse_class_abort_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + fuse_abort_conn(&cc->fc); + return count; +} + +static struct device_attribute cuse_class_dev_attrs[] = { + __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), + __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), + { } +}; + +static struct miscdevice cuse_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cuse", + .fops = &cuse_channel_fops, +}; + +static int __init cuse_init(void) +{ + int i, rc; + + /* init conntbl */ + for (i = 0; i < CUSE_CONNTBL_LEN; i++) + INIT_LIST_HEAD(&cuse_conntbl[i]); + + /* inherit and extend fuse_dev_operations */ + cuse_channel_fops = fuse_dev_operations; + cuse_channel_fops.owner = THIS_MODULE; + cuse_channel_fops.open = cuse_channel_open; + cuse_channel_fops.release = cuse_channel_release; + + cuse_class = class_create(THIS_MODULE, "cuse"); + if (IS_ERR(cuse_class)) + return PTR_ERR(cuse_class); + + cuse_class->dev_attrs = cuse_class_dev_attrs; + + rc = misc_register(&cuse_miscdev); + if (rc) { + class_destroy(cuse_class); + return rc; + } + + return 0; +} + +static void __exit cuse_exit(void) +{ + misc_deregister(&cuse_miscdev); + class_destroy(cuse_class); +} + +module_init(cuse_init); +module_exit(cuse_exit); + +MODULE_AUTHOR("Tejun Heo "); +MODULE_DESCRIPTION("Character device in Userspace"); +MODULE_LICENSE("GPL"); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c new file mode 100644 index 00000000..c858b5c8 --- /dev/null +++ b/fs/fuse/dev.c @@ -0,0 +1,2049 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_ALIAS_MISCDEV(FUSE_MINOR); +MODULE_ALIAS("devname:fuse"); + +static struct kmem_cache *fuse_req_cachep; + +static struct fuse_conn *fuse_get_conn(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. 
+ */ + return file->private_data; +} + +static void fuse_request_init(struct fuse_req *req) +{ + memset(req, 0, sizeof(*req)); + INIT_LIST_HEAD(&req->list); + INIT_LIST_HEAD(&req->intr_entry); + init_waitqueue_head(&req->waitq); + atomic_set(&req->count, 1); +} + +struct fuse_req *fuse_request_alloc(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); + if (req) + fuse_request_init(req); + return req; +} +EXPORT_SYMBOL_GPL(fuse_request_alloc); + +struct fuse_req *fuse_request_alloc_nofs(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); + if (req) + fuse_request_init(req); + return req; +} + +void fuse_request_free(struct fuse_req *req) +{ + kmem_cache_free(fuse_req_cachep, req); +} + +static void block_sigs(sigset_t *oldset) +{ + sigset_t mask; + + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} + +static void __fuse_get_request(struct fuse_req *req) +{ + atomic_inc(&req->count); +} + +/* Must be called with > 1 refcount */ +static void __fuse_put_request(struct fuse_req *req) +{ + BUG_ON(atomic_read(&req->count) < 2); + atomic_dec(&req->count); +} + +static void fuse_req_init_context(struct fuse_req *req) +{ + req->in.h.uid = current_fsuid(); + req->in.h.gid = current_fsgid(); + req->in.h.pid = current->pid; +} + +struct fuse_req *fuse_get_req(struct fuse_conn *fc) +{ + struct fuse_req *req; + sigset_t oldset; + int intr; + int err; + + atomic_inc(&fc->num_waiting); + block_sigs(&oldset); + intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + restore_sigs(&oldset); + err = -EINTR; + if (intr) + goto out; + + err = -ENOTCONN; + if (!fc->connected) + goto out; + + req = fuse_request_alloc(); + err = -ENOMEM; + if (!req) + goto out; + + fuse_req_init_context(req); + req->waiting = 1; + return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(fuse_get_req); + +/* + * Return request in fuse_file->reserved_req. However that may + * currently be in use. If that is the case, wait for it to become + * available. + */ +static struct fuse_req *get_reserved_req(struct fuse_conn *fc, + struct file *file) +{ + struct fuse_req *req = NULL; + struct fuse_file *ff = file->private_data; + + do { + wait_event(fc->reserved_req_waitq, ff->reserved_req); + spin_lock(&fc->lock); + if (ff->reserved_req) { + req = ff->reserved_req; + ff->reserved_req = NULL; + get_file(file); + req->stolen_file = file; + } + spin_unlock(&fc->lock); + } while (!req); + + return req; +} + +/* + * Put stolen request back into fuse_file->reserved_req + */ +static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) +{ + struct file *file = req->stolen_file; + struct fuse_file *ff = file->private_data; + + spin_lock(&fc->lock); + fuse_request_init(req); + BUG_ON(ff->reserved_req); + ff->reserved_req = req; + wake_up_all(&fc->reserved_req_waitq); + spin_unlock(&fc->lock); + fput(file); +} + +/* + * Gets a requests for a file operation, always succeeds + * + * This is used for sending the FLUSH request, which must get to + * userspace, due to POSIX locks which may need to be unlocked. + * + * If allocation fails due to OOM, use the reserved request in + * fuse_file. + * + * This is very unlikely to deadlock accidentally, since the + * filesystem should not have it's own file open. If deadlock is + * intentional, it can still be broken by "aborting" the filesystem. 
+ */ +struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) +{ + struct fuse_req *req; + + atomic_inc(&fc->num_waiting); + wait_event(fc->blocked_waitq, !fc->blocked); + req = fuse_request_alloc(); + if (!req) + req = get_reserved_req(fc, file); + + fuse_req_init_context(req); + req->waiting = 1; + return req; +} + +void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (atomic_dec_and_test(&req->count)) { + if (req->waiting) + atomic_dec(&fc->num_waiting); + + if (req->stolen_file) + put_reserved_req(fc, req); + else + fuse_request_free(req); + } +} +EXPORT_SYMBOL_GPL(fuse_put_request); + +static unsigned len_args(unsigned numargs, struct fuse_arg *args) +{ + unsigned nbytes = 0; + unsigned i; + + for (i = 0; i < numargs; i++) + nbytes += args[i].size; + + return nbytes; +} + +static u64 fuse_get_unique(struct fuse_conn *fc) +{ + fc->reqctr++; + /* zero is special */ + if (fc->reqctr == 0) + fc->reqctr = 1; + + return fc->reqctr; +} + +static void queue_request(struct fuse_conn *fc, struct fuse_req *req) +{ + req->in.h.len = sizeof(struct fuse_in_header) + + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); + list_add_tail(&req->list, &fc->pending); + req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); +} + +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup) +{ + forget->forget_one.nodeid = nodeid; + forget->forget_one.nlookup = nlookup; + + spin_lock(&fc->lock); + if (fc->connected) { + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } else { + kfree(forget); + } + spin_unlock(&fc->lock); +} + +static void flush_bg_queue(struct fuse_conn *fc) +{ + while (fc->active_background < fc->max_background && + !list_empty(&fc->bg_queue)) { + struct fuse_req *req; + + req = list_entry(fc->bg_queue.next, struct fuse_req, list); + list_del(&req->list); + fc->active_background++; + req->in.h.unique = fuse_get_unique(fc); + queue_request(fc, req); + } +} + +/* + * This function is called when a request is finished. Either a reply + * has arrived or it was aborted (and not yet sent) or some error + * occurred during communication with userspace, or the device file + * was closed. 
The requester thread is woken up (if still waiting), + * the 'end' callback is called if given, else the reference to the + * request is released + * + * Called with fc->lock, unlocks it + */ +static void request_end(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +{ + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + req->end = NULL; + list_del(&req->list); + list_del(&req->intr_entry); + req->state = FUSE_REQ_FINISHED; + if (req->background) { + if (fc->num_background == fc->max_background) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + if (fc->num_background == fc->congestion_threshold && + fc->connected && fc->bdi_initialized) { + clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); + clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + } + fc->num_background--; + fc->active_background--; + flush_bg_queue(fc); + } + spin_unlock(&fc->lock); + wake_up(&req->waitq); + if (end) + end(fc, req); + fuse_put_request(fc, req); +} + +static void wait_answer_interruptible(struct fuse_conn *fc, + struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + if (signal_pending(current)) + return; + + spin_unlock(&fc->lock); + wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); +} + +static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) +{ + list_add_tail(&req->intr_entry, &fc->interrupts); + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); +} + +static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + if (!fc->no_interrupt) { + /* Any signal may interrupt this */ + wait_answer_interruptible(fc, req); + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; + + req->interrupted = 1; + if (req->state == FUSE_REQ_SENT) + queue_interrupt(fc, req); + } + + if (!req->force) { + sigset_t oldset; + + /* Only fatal signals may interrupt this */ + block_sigs(&oldset); + wait_answer_interruptible(fc, req); + restore_sigs(&oldset); + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; + + /* Request is not yet in userspace, bail out */ + if (req->state == FUSE_REQ_PENDING) { + list_del(&req->list); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return; + } + } + + /* + * Either request is already in userspace, or it was forced. + * Wait it out. + */ + spin_unlock(&fc->lock); + + while (req->state != FUSE_REQ_FINISHED) + wait_event_freezable(req->waitq, + req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); + + if (!req->aborted) + return; + + aborted: + BUG_ON(req->state != FUSE_REQ_FINISHED); + if (req->locked) { + /* This is uninterruptible sleep, because data is + being copied to/from the buffers of req. During + locked state, there mustn't be any filesystem + operation (e.g. 
page fault), since that could lead + to deadlock */ + spin_unlock(&fc->lock); + wait_event(req->waitq, !req->locked); + spin_lock(&fc->lock); + } +} + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + spin_lock(&fc->lock); + if (!fc->connected) + req->out.h.error = -ENOTCONN; + else if (fc->conn_error) + req->out.h.error = -ECONNREFUSED; + else { + req->in.h.unique = fuse_get_unique(fc); + queue_request(fc, req); + /* acquire extra reference, since request is still needed + after request_end() */ + __fuse_get_request(req); + + request_wait_answer(fc, req); + } + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_request_send); + +static void fuse_request_send_nowait_locked(struct fuse_conn *fc, + struct fuse_req *req) +{ + req->background = 1; + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + if (fc->num_background == fc->congestion_threshold && + fc->bdi_initialized) { + set_bdi_congested(&fc->bdi, BLK_RW_SYNC); + set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + } + list_add_tail(&req->list, &fc->bg_queue); + flush_bg_queue(fc); +} + +static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +{ + spin_lock(&fc->lock); + if (fc->connected) { + fuse_request_send_nowait_locked(fc, req); + spin_unlock(&fc->lock); + } else { + req->out.h.error = -ENOTCONN; + request_end(fc, req); + } +} + +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + fuse_request_send_nowait(fc, req); +} +EXPORT_SYMBOL_GPL(fuse_request_send_background); + +static int fuse_request_send_notify_reply(struct fuse_conn *fc, + struct fuse_req *req, u64 unique) +{ + int err = -ENODEV; + + req->isreply = 0; + req->in.h.unique = unique; + spin_lock(&fc->lock); + if (fc->connected) { + queue_request(fc, req); + err = 0; + } + spin_unlock(&fc->lock); + + return err; +} + +/* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) +{ + req->isreply = 1; + fuse_request_send_nowait_locked(fc, req); +} + +/* + * Lock the request. Up to the next unlock_request() there mustn't be + * anything that could cause a page-fault. If the request was already + * aborted bail out. + */ +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) +{ + int err = 0; + if (req) { + spin_lock(&fc->lock); + if (req->aborted) + err = -ENOENT; + else + req->locked = 1; + spin_unlock(&fc->lock); + } + return err; +} + +/* + * Unlock request. If it was aborted during being locked, the + * requester thread is currently waiting for it to be unlocked, so + * wake it up. 
+ */ +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (req) { + spin_lock(&fc->lock); + req->locked = 0; + if (req->aborted) + wake_up(&req->waitq); + spin_unlock(&fc->lock); + } +} + +struct fuse_copy_state { + struct fuse_conn *fc; + int write; + struct fuse_req *req; + const struct iovec *iov; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + unsigned long seglen; + unsigned long addr; + struct page *pg; + void *mapaddr; + void *buf; + unsigned len; + unsigned move_pages:1; +}; + +static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, + int write, + const struct iovec *iov, unsigned long nr_segs) +{ + memset(cs, 0, sizeof(*cs)); + cs->fc = fc; + cs->write = write; + cs->iov = iov; + cs->nr_segs = nr_segs; +} + +/* Unmap and put previous page of userspace buffer */ +static void fuse_copy_finish(struct fuse_copy_state *cs) +{ + if (cs->currbuf) { + struct pipe_buffer *buf = cs->currbuf; + + if (!cs->write) { + buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + } else { + kunmap(buf->page); + buf->len = PAGE_SIZE - cs->len; + } + cs->currbuf = NULL; + cs->mapaddr = NULL; + } else if (cs->mapaddr) { + kunmap(cs->pg); + if (cs->write) { + flush_dcache_page(cs->pg); + set_page_dirty_lock(cs->pg); + } + put_page(cs->pg); + cs->mapaddr = NULL; + } +} + +/* + * Get another pagefull of userspace buffer, and map it to kernel + * address space, and lock request + */ +static int fuse_copy_fill(struct fuse_copy_state *cs) +{ + unsigned long offset; + int err; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + if (cs->pipebufs) { + struct pipe_buffer *buf = cs->pipebufs; + + if (!cs->write) { + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); + cs->len = buf->len; + cs->buf = cs->mapaddr + buf->offset; + cs->pipebufs++; + cs->nr_segs--; + } else { + struct page *page; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + buf->page = page; + buf->offset = 0; + buf->len = 0; + + cs->currbuf = buf; + cs->mapaddr = kmap(page); + cs->buf = cs->mapaddr; + cs->len = PAGE_SIZE; + cs->pipebufs++; + cs->nr_segs++; + } + } else { + if (!cs->seglen) { + BUG_ON(!cs->nr_segs); + cs->seglen = cs->iov[0].iov_len; + cs->addr = (unsigned long) cs->iov[0].iov_base; + cs->iov++; + cs->nr_segs--; + } + err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + if (err < 0) + return err; + BUG_ON(err != 1); + offset = cs->addr % PAGE_SIZE; + cs->mapaddr = kmap(cs->pg); + cs->buf = cs->mapaddr + offset; + cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->seglen -= cs->len; + cs->addr += cs->len; + } + + return lock_request(cs->fc, cs->req); +} + +/* Do as much copy to/from userspace buffer as we can */ +static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) +{ + unsigned ncpy = min(*size, cs->len); + if (val) { + if (cs->write) + memcpy(cs->buf, *val, ncpy); + else + memcpy(*val, cs->buf, ncpy); + *val += ncpy; + } + *size -= ncpy; + cs->len -= ncpy; + cs->buf += ncpy; + return ncpy; +} + +static int fuse_check_page(struct page *page) +{ + if (page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 1 || + (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + ~(1 << PG_locked | + 1 << PG_referenced | + 1 << PG_uptodate | + 1 << PG_lru | + 1 << PG_active | + 1 << 
PG_reclaim))) { + printk(KERN_WARNING "fuse: trying to steal weird page\n"); + printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + return 1; + } + return 0; +} + +static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +{ + int err; + struct page *oldpage = *pagep; + struct page *newpage; + struct pipe_buffer *buf = cs->pipebufs; + struct address_space *mapping; + pgoff_t index; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + + if (cs->len != PAGE_SIZE) + goto out_fallback; + + if (buf->ops->steal(cs->pipe, buf) != 0) + goto out_fallback; + + newpage = buf->page; + + if (WARN_ON(!PageUptodate(newpage))) + return -EIO; + + ClearPageMappedToDisk(newpage); + + if (fuse_check_page(newpage) != 0) + goto out_fallback_unlock; + + mapping = oldpage->mapping; + index = oldpage->index; + + /* + * This is a new and locked page, it shouldn't be mapped or + * have any special flags on it + */ + if (WARN_ON(page_mapped(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(page_has_private(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageMlocked(oldpage))) + goto out_fallback_unlock; + + err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); + if (err) { + unlock_page(newpage); + return err; + } + + page_cache_get(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + + err = 0; + spin_lock(&cs->fc->lock); + if (cs->req->aborted) + err = -ENOENT; + else + *pagep = newpage; + spin_unlock(&cs->fc->lock); + + if (err) { + unlock_page(newpage); + page_cache_release(newpage); + return err; + } + + unlock_page(oldpage); + page_cache_release(oldpage); + cs->len = 0; + + return 0; + +out_fallback_unlock: + unlock_page(newpage); +out_fallback: + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->buf = cs->mapaddr + buf->offset; + + err = lock_request(cs->fc, cs->req); + if (err) + return err; + + return 1; +} + +static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, + unsigned offset, unsigned count) +{ + struct pipe_buffer *buf; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + buf = cs->pipebufs; + page_cache_get(page); + buf->page = page; + buf->offset = offset; + buf->len = count; + + cs->pipebufs++; + cs->nr_segs++; + cs->len = 0; + + return 0; +} + +/* + * Copy a page in the request to/from the userspace buffer. 
Must be + * done atomically + */ +static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, + unsigned offset, unsigned count, int zeroing) +{ + int err; + struct page *page = *pagep; + + if (page && zeroing && count < PAGE_SIZE) + clear_highpage(page); + + while (count) { + if (cs->write && cs->pipebufs && page) { + return fuse_ref_page(cs, page, offset, count); + } else if (!cs->len) { + if (cs->move_pages && page && + offset == 0 && count == PAGE_SIZE) { + err = fuse_try_move_page(cs, pagep); + if (err <= 0) + return err; + } else { + err = fuse_copy_fill(cs); + if (err) + return err; + } + } + if (page) { + void *mapaddr = kmap_atomic(page, KM_USER0); + void *buf = mapaddr + offset; + offset += fuse_copy_do(cs, &buf, &count); + kunmap_atomic(mapaddr, KM_USER0); + } else + offset += fuse_copy_do(cs, NULL, &count); + } + if (page && !cs->write) + flush_dcache_page(page); + return 0; +} + +/* Copy pages in the request to/from userspace buffer */ +static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) +{ + unsigned i; + struct fuse_req *req = cs->req; + unsigned offset = req->page_offset; + unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); + + for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { + int err; + + err = fuse_copy_page(cs, &req->pages[i], offset, count, + zeroing); + if (err) + return err; + + nbytes -= count; + count = min(nbytes, (unsigned) PAGE_SIZE); + offset = 0; + } + return 0; +} + +/* Copy a single argument in the request to/from userspace buffer */ +static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) +{ + while (size) { + if (!cs->len) { + int err = fuse_copy_fill(cs); + if (err) + return err; + } + fuse_copy_do(cs, &val, &size); + } + return 0; +} + +/* Copy request arguments to/from userspace buffer */ +static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) +{ + int err = 0; + unsigned i; + + for (i = 0; !err && i < numargs; i++) { + struct fuse_arg *arg = &args[i]; + if (i == numargs - 1 && argpages) + err = fuse_copy_pages(cs, arg->size, zeroing); + else + err = fuse_copy_one(cs, arg->value, arg->size); + } + return err; +} + +static int forget_pending(struct fuse_conn *fc) +{ + return fc->forget_list_head.next != NULL; +} + +static int request_pending(struct fuse_conn *fc) +{ + return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || + forget_pending(fc); +} + +/* Wait until a request is available on the pending list */ +static void request_wait(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&fc->waitq, &wait); + while (fc->connected && !request_pending(fc)) { + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; + + spin_unlock(&fc->lock); + schedule(); + spin_lock(&fc->lock); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&fc->waitq, &wait); +} + +/* + * Transfer an interrupt request to userspace + * + * Unlike other requests this is assembled on demand, without a need + * to allocate a separate fuse_req structure. 
+ * + * Called with fc->lock held, releases it + */ +static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes, struct fuse_req *req) +__releases(fc->lock) +{ + struct fuse_in_header ih; + struct fuse_interrupt_in arg; + unsigned reqsize = sizeof(ih) + sizeof(arg); + int err; + + list_del_init(&req->intr_entry); + req->intr_unique = fuse_get_unique(fc); + memset(&ih, 0, sizeof(ih)); + memset(&arg, 0, sizeof(arg)); + ih.len = reqsize; + ih.opcode = FUSE_INTERRUPT; + ih.unique = req->intr_unique; + arg.unique = req->in.h.unique; + + spin_unlock(&fc->lock); + if (nbytes < reqsize) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + return err ? err : reqsize; +} + +static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, + unsigned max, + unsigned *countp) +{ + struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link **newhead = &head; + unsigned count; + + for (count = 0; *newhead != NULL && count < max; count++) + newhead = &(*newhead)->next; + + fc->forget_list_head.next = *newhead; + *newhead = NULL; + if (fc->forget_list_head.next == NULL) + fc->forget_list_tail = &fc->forget_list_head; + + if (countp != NULL) + *countp = count; + + return head; +} + +static int fuse_read_single_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + int err; + struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_in arg = { + .nlookup = forget->forget_one.nlookup, + }; + struct fuse_in_header ih = { + .opcode = FUSE_FORGET, + .nodeid = forget->forget_one.nodeid, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + spin_unlock(&fc->lock); + kfree(forget); + if (nbytes < ih.len) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_batch_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +__releases(fc->lock) +{ + int err; + unsigned max_forgets; + unsigned count; + struct fuse_forget_link *head; + struct fuse_batch_forget_in arg = { .count = 0 }; + struct fuse_in_header ih = { + .opcode = FUSE_BATCH_FORGET, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + if (nbytes < ih.len) { + spin_unlock(&fc->lock); + return -EINVAL; + } + + max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); + head = dequeue_forget(fc, max_forgets, &count); + spin_unlock(&fc->lock); + + arg.count = count; + ih.len += count * sizeof(struct fuse_forget_one); + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + + while (head) { + struct fuse_forget_link *forget = head; + + if (!err) { + err = fuse_copy_one(cs, &forget->forget_one, + sizeof(forget->forget_one)); + } + head = forget->next; + kfree(forget); + } + + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fc, cs, nbytes); + else + return fuse_read_batch_forget(fc, cs, nbytes); +} + +/* + * Read a single request into the userspace filesystem's buffer. 
This + * function waits until a request is available, then removes it from + * the pending list and copies request data to userspace buffer. If + * no reply is needed (FORGET) or request has been aborted or there + * was an error during the copying then it's finished by calling + * request_end(). Otherwise add it to the processing list, and set + * the 'sent' flag. + */ +static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, + struct fuse_copy_state *cs, size_t nbytes) +{ + int err; + struct fuse_req *req; + struct fuse_in *in; + unsigned reqsize; + + restart: + spin_lock(&fc->lock); + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + !request_pending(fc)) + goto err_unlock; + + request_wait(fc); + err = -ENODEV; + if (!fc->connected) + goto err_unlock; + err = -ERESTARTSYS; + if (!request_pending(fc)) + goto err_unlock; + + if (!list_empty(&fc->interrupts)) { + req = list_entry(fc->interrupts.next, struct fuse_req, + intr_entry); + return fuse_read_interrupt(fc, cs, nbytes, req); + } + + if (forget_pending(fc)) { + if (list_empty(&fc->pending) || fc->forget_batch-- > 0) + return fuse_read_forget(fc, cs, nbytes); + + if (fc->forget_batch <= -8) + fc->forget_batch = 16; + } + + req = list_entry(fc->pending.next, struct fuse_req, list); + req->state = FUSE_REQ_READING; + list_move(&req->list, &fc->io); + + in = &req->in; + reqsize = in->h.len; + /* If request is too large, reply with an error and restart the read */ + if (nbytes < reqsize) { + req->out.h.error = -EIO; + /* SETXATTR is special, since it may contain too large data */ + if (in->h.opcode == FUSE_SETXATTR) + req->out.h.error = -E2BIG; + request_end(fc, req); + goto restart; + } + spin_unlock(&fc->lock); + cs->req = req; + err = fuse_copy_one(cs, &in->h, sizeof(in->h)); + if (!err) + err = fuse_copy_args(cs, in->numargs, in->argpages, + (struct fuse_arg *) in->args, 0); + fuse_copy_finish(cs); + spin_lock(&fc->lock); + req->locked = 0; + if (req->aborted) { + request_end(fc, req); + return -ENODEV; + } + if (err) { + req->out.h.error = -EIO; + request_end(fc, req); + return err; + } + if (!req->isreply) + request_end(fc, req); + else { + req->state = FUSE_REQ_SENT; + list_move_tail(&req->list, &fc->processing); + if (req->interrupted) + queue_interrupt(fc, req); + spin_unlock(&fc->lock); + } + return reqsize; + + err_unlock: + spin_unlock(&fc->lock); + return err; +} + +static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct file *file = iocb->ki_filp; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 1, iov, nr_segs); + + return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); +} + +static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = fuse_dev_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + int ret; + int page_nr = 0; + int do_wakeup = 0; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(in); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * 
sizeof(struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + fuse_copy_init(&cs, fc, 1, NULL, 0); + cs.pipebufs = bufs; + cs.pipe = pipe; + ret = fuse_dev_do_read(fc, in, &cs, len); + if (ret < 0) + goto out; + + ret = 0; + pipe_lock(pipe); + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + goto out_unlock; + } + + if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { + ret = -EIO; + goto out_unlock; + } + + while (page_nr < cs.nr_segs) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = bufs[page_nr].page; + buf->offset = bufs[page_nr].offset; + buf->len = bufs[page_nr].len; + buf->ops = &fuse_dev_pipe_buf_ops; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->inode) + do_wakeup = 1; + } + +out_unlock: + pipe_unlock(pipe); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + +out: + for (; page_nr < cs.nr_segs; page_nr++) + page_cache_release(bufs[page_nr].page); + + kfree(bufs); + return ret; +} + +static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_poll_wakeup_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + fuse_copy_finish(cs); + return fuse_notify_poll_wakeup(fc, &outarg); + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_inode_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + err = fuse_reverse_inval_inode(fc->sb, outarg.ino, + outarg.off, outarg.len); + } + up_read(&fc->killsb); + return err; + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_entry_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_store_out outarg; + struct inode *inode; + struct address_space *mapping; + u64 nodeid; + int err; + pgoff_t index; + unsigned int offset; + unsigned int num; + loff_t file_size; + loff_t end; + + err = -EINVAL; + if (size < sizeof(outarg)) 
+ goto out_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto out_finish; + + err = -EINVAL; + if (size - sizeof(outarg) != outarg.size) + goto out_finish; + + nodeid = outarg.nodeid; + + down_read(&fc->killsb); + + err = -ENOENT; + if (!fc->sb) + goto out_up_killsb; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + goto out_up_killsb; + + mapping = inode->i_mapping; + index = outarg.offset >> PAGE_CACHE_SHIFT; + offset = outarg.offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + end = outarg.offset + outarg.size; + if (end > file_size) { + file_size = end; + fuse_write_update_size(inode, file_size); + } + + num = outarg.size; + while (num) { + struct page *page; + unsigned int this_num; + + err = -ENOMEM; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + if (!page) + goto out_iput; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + err = fuse_copy_page(cs, &page, offset, this_num, 0); + if (!err && offset == 0 && (num != 0 || file_size == end)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (err) + goto out_iput; + + num -= this_num; + offset = 0; + index++; + } + + err = 0; + +out_iput: + iput(inode); +out_up_killsb: + up_read(&fc->killsb); +out_finish: + fuse_copy_finish(cs); + return err; +} + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +{ + release_pages(req->pages, req->num_pages, 0); +} + +static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, + struct fuse_notify_retrieve_out *outarg) +{ + int err; + struct address_space *mapping = inode->i_mapping; + struct fuse_req *req; + pgoff_t index; + loff_t file_size; + unsigned int num; + unsigned int offset; + size_t total_len = 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + offset = outarg->offset & ~PAGE_CACHE_MASK; + + req->in.h.opcode = FUSE_NOTIFY_REPLY; + req->in.h.nodeid = outarg->nodeid; + req->in.numargs = 2; + req->in.argpages = 1; + req->page_offset = offset; + req->end = fuse_retrieve_end; + + index = outarg->offset >> PAGE_CACHE_SHIFT; + file_size = i_size_read(inode); + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { + struct page *page; + unsigned int this_num; + + page = find_get_page(mapping, index); + if (!page) + break; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + req->pages[req->num_pages] = page; + req->num_pages++; + + num -= this_num; + total_len += this_num; + index++; + } + req->misc.retrieve_in.offset = outarg->offset; + req->misc.retrieve_in.size = total_len; + req->in.args[0].size = sizeof(req->misc.retrieve_in); + req->in.args[0].value = &req->misc.retrieve_in; + req->in.args[1].size = total_len; + + err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, req); + + return err; +} + +static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_retrieve_out outarg; + struct inode *inode; + int err; + + err = -EINVAL; + if (size != sizeof(outarg)) + goto copy_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto copy_finish; + + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + u64 nodeid = outarg.nodeid; + + inode = ilookup5(fc->sb, nodeid, 
fuse_inode_eq, &nodeid); + if (inode) { + err = fuse_retrieve(fc, inode, &outarg); + iput(inode); + } + } + up_read(&fc->killsb); + + return err; + +copy_finish: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, + unsigned int size, struct fuse_copy_state *cs) +{ + switch (code) { + case FUSE_NOTIFY_POLL: + return fuse_notify_poll(fc, size, cs); + + case FUSE_NOTIFY_INVAL_INODE: + return fuse_notify_inval_inode(fc, size, cs); + + case FUSE_NOTIFY_INVAL_ENTRY: + return fuse_notify_inval_entry(fc, size, cs); + + case FUSE_NOTIFY_STORE: + return fuse_notify_store(fc, size, cs); + + case FUSE_NOTIFY_RETRIEVE: + return fuse_notify_retrieve(fc, size, cs); + + default: + fuse_copy_finish(cs); + return -EINVAL; + } +} + +/* Look up request on processing list by unique ID */ +static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) +{ + struct list_head *entry; + + list_for_each(entry, &fc->processing) { + struct fuse_req *req; + req = list_entry(entry, struct fuse_req, list); + if (req->in.h.unique == unique || req->intr_unique == unique) + return req; + } + return NULL; +} + +static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, + unsigned nbytes) +{ + unsigned reqsize = sizeof(struct fuse_out_header); + + if (out->h.error) + return nbytes != reqsize ? -EINVAL : 0; + + reqsize += len_args(out->numargs, out->args); + + if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + return -EINVAL; + else if (reqsize > nbytes) { + struct fuse_arg *lastarg = &out->args[out->numargs-1]; + unsigned diffsize = reqsize - nbytes; + if (diffsize > lastarg->size) + return -EINVAL; + lastarg->size -= diffsize; + } + return fuse_copy_args(cs, out->numargs, out->argpages, out->args, + out->page_zeroing); +} + +/* + * Write a single reply to a request. First the header is copied from + * the write buffer. The request is then searched on the processing + * list by the unique ID found in the header. If found, then remove + * it from the list and copy the rest of the buffer to the request. + * The request is finished by calling request_end() + */ +static ssize_t fuse_dev_do_write(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +{ + int err; + struct fuse_req *req; + struct fuse_out_header oh; + + if (nbytes < sizeof(struct fuse_out_header)) + return -EINVAL; + + err = fuse_copy_one(cs, &oh, sizeof(oh)); + if (err) + goto err_finish; + + err = -EINVAL; + if (oh.len != nbytes) + goto err_finish; + + /* + * Zero oh.unique indicates unsolicited notification message + * and error contains notification code. + */ + if (!oh.unique) { + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); + return err ? err : nbytes; + } + + err = -EINVAL; + if (oh.error <= -1000 || oh.error > 0) + goto err_finish; + + spin_lock(&fc->lock); + err = -ENOENT; + if (!fc->connected) + goto err_unlock; + + req = request_find(fc, oh.unique); + if (!req) + goto err_unlock; + + if (req->aborted) { + spin_unlock(&fc->lock); + fuse_copy_finish(cs); + spin_lock(&fc->lock); + request_end(fc, req); + return -ENOENT; + } + /* Is it an interrupt reply? 
*/ + if (req->intr_unique == oh.unique) { + err = -EINVAL; + if (nbytes != sizeof(struct fuse_out_header)) + goto err_unlock; + + if (oh.error == -ENOSYS) + fc->no_interrupt = 1; + else if (oh.error == -EAGAIN) + queue_interrupt(fc, req); + + spin_unlock(&fc->lock); + fuse_copy_finish(cs); + return nbytes; + } + + req->state = FUSE_REQ_WRITING; + list_move(&req->list, &fc->io); + req->out.h = oh; + req->locked = 1; + cs->req = req; + if (!req->out.page_replace) + cs->move_pages = 0; + spin_unlock(&fc->lock); + + err = copy_out_args(cs, &req->out, nbytes); + fuse_copy_finish(cs); + + spin_lock(&fc->lock); + req->locked = 0; + if (!err) { + if (req->aborted) + err = -ENOENT; + } else if (!req->aborted) + req->out.h.error = -EIO; + request_end(fc, req); + + return err ? err : nbytes; + + err_unlock: + spin_unlock(&fc->lock); + err_finish: + fuse_copy_finish(cs); + return err; +} + +static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 0, iov, nr_segs); + + return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs)); +} + +static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + unsigned nbuf; + unsigned idx; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc; + size_t rem; + ssize_t ret; + + fc = fuse_get_conn(out); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + pipe_lock(pipe); + nbuf = 0; + rem = 0; + for (idx = 0; idx < pipe->nrbufs && rem < len; idx++) + rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; + + ret = -EINVAL; + if (rem < len) { + pipe_unlock(pipe); + goto out; + } + + rem = len; + while (rem) { + struct pipe_buffer *ibuf; + struct pipe_buffer *obuf; + + BUG_ON(nbuf >= pipe->buffers); + BUG_ON(!pipe->nrbufs); + ibuf = &pipe->bufs[pipe->curbuf]; + obuf = &bufs[nbuf]; + + if (rem >= ibuf->len) { + *obuf = *ibuf; + ibuf->ops = NULL; + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + } else { + ibuf->ops->get(pipe, ibuf); + *obuf = *ibuf; + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + obuf->len = rem; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + nbuf++; + rem -= obuf->len; + } + pipe_unlock(pipe); + + fuse_copy_init(&cs, fc, 0, NULL, nbuf); + cs.pipebufs = bufs; + cs.pipe = pipe; + + if (flags & SPLICE_F_MOVE) + cs.move_pages = 1; + + ret = fuse_dev_do_write(fc, &cs, len); + + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + buf->ops->release(pipe, buf); + } +out: + kfree(bufs); + return ret; +} + +static unsigned fuse_dev_poll(struct file *file, poll_table *wait) +{ + unsigned mask = POLLOUT | POLLWRNORM; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return POLLERR; + + poll_wait(file, &fc->waitq, wait); + + spin_lock(&fc->lock); + if (!fc->connected) + mask = POLLERR; + else if (request_pending(fc)) + mask |= POLLIN | POLLRDNORM; + spin_unlock(&fc->lock); + + return mask; +} + +/* + * Abort all requests on the given list (pending or processing) + * + * This function releases and reacquires fc->lock + */ +static void end_requests(struct fuse_conn *fc, struct list_head *head) +__releases(fc->lock) +__acquires(fc->lock) +{ + while (!list_empty(head)) { + struct fuse_req *req; + req = 
list_entry(head->next, struct fuse_req, list); + req->out.h.error = -ECONNABORTED; + request_end(fc, req); + spin_lock(&fc->lock); + } +} + +/* + * Abort requests under I/O + * + * The requests are set to aborted and finished, and the request + * waiter is woken up. This will make request_wait_answer() wait + * until the request is unlocked and then return. + * + * If the request is asynchronous, then the end function needs to be + * called after waiting for the request to be unlocked (if it was + * locked). + */ +static void end_io_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + while (!list_empty(&fc->io)) { + struct fuse_req *req = + list_entry(fc->io.next, struct fuse_req, list); + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + + req->aborted = 1; + req->out.h.error = -ECONNABORTED; + req->state = FUSE_REQ_FINISHED; + list_del_init(&req->list); + wake_up(&req->waitq); + if (end) { + req->end = NULL; + __fuse_get_request(req); + spin_unlock(&fc->lock); + wait_event(req->waitq, !req->locked); + end(fc, req); + fuse_put_request(fc, req); + spin_lock(&fc->lock); + } + } +} + +static void end_queued_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); + while (forget_pending(fc)) + kfree(dequeue_forget(fc, 1, NULL)); +} + +static void end_polls(struct fuse_conn *fc) +{ + struct rb_node *p; + + p = rb_first(&fc->polled_files); + + while (p) { + struct fuse_file *ff; + ff = rb_entry(p, struct fuse_file, polled_node); + wake_up_interruptible_all(&ff->poll_wait); + + p = rb_next(p); + } +} + +/* + * Abort all requests. + * + * Emergency exit in case of a malicious or accidental deadlock, or + * just a hung filesystem. + * + * The same effect is usually achievable through killing the + * filesystem daemon and all users of the filesystem. The exception + * is the combination of an asynchronous request and the tricky + * deadlock (see Documentation/filesystems/fuse.txt). + * + * During the aborting, progression of requests from the pending and + * processing lists onto the io list, and progression of new requests + * onto the pending list is prevented by req->connected being false. + * + * Progression of requests under I/O to the processing list is + * prevented by the req->aborted flag being true for these requests. + * For this reason requests on the io list must be aborted first. 
+ */ +void fuse_abort_conn(struct fuse_conn *fc) +{ + spin_lock(&fc->lock); + if (fc->connected) { + fc->connected = 0; + fc->blocked = 0; + end_io_requests(fc); + end_queued_requests(fc); + end_polls(fc); + wake_up_all(&fc->waitq); + wake_up_all(&fc->blocked_waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_abort_conn); + +int fuse_dev_release(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (fc) { + spin_lock(&fc->lock); + fc->connected = 0; + fc->blocked = 0; + end_queued_requests(fc); + end_polls(fc); + wake_up_all(&fc->blocked_waitq); + spin_unlock(&fc->lock); + fuse_conn_put(fc); + } + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_dev_release); + +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + +const struct file_operations fuse_dev_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = fuse_dev_read, + .splice_read = fuse_dev_splice_read, + .write = do_sync_write, + .aio_write = fuse_dev_write, + .splice_write = fuse_dev_splice_write, + .poll = fuse_dev_poll, + .release = fuse_dev_release, + .fasync = fuse_dev_fasync, +}; +EXPORT_SYMBOL_GPL(fuse_dev_operations); + +static struct miscdevice fuse_miscdevice = { + .minor = FUSE_MINOR, + .name = "fuse", + .fops = &fuse_dev_operations, +}; + +int __init fuse_dev_init(void) +{ + int err = -ENOMEM; + fuse_req_cachep = kmem_cache_create("fuse_request", + sizeof(struct fuse_req), + 0, 0, NULL); + if (!fuse_req_cachep) + goto out; + + err = misc_register(&fuse_miscdevice); + if (err) + goto out_cache_clean; + + return 0; + + out_cache_clean: + kmem_cache_destroy(fuse_req_cachep); + out: + return err; +} + +void fuse_dev_cleanup(void) +{ + misc_deregister(&fuse_miscdevice); + kmem_cache_destroy(fuse_req_cachep); +} diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c new file mode 100644 index 00000000..c04a025c --- /dev/null +++ b/fs/fuse/dir.c @@ -0,0 +1,1638 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include + +#if BITS_PER_LONG >= 64 +static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; +} + +static inline u64 fuse_dentry_time(struct dentry *entry) +{ + return entry->d_time; +} +#else +/* + * On 32 bit archs store the high 32 bits of time in d_fsdata + */ +static void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; + entry->d_fsdata = (void *) (unsigned long) (time >> 32); +} + +static u64 fuse_dentry_time(struct dentry *entry) +{ + return (u64) entry->d_time + + ((u64) (unsigned long) entry->d_fsdata << 32); +} +#endif + +/* + * FUSE caches dentries and attributes with separate timeout. The + * time in jiffies until the dentry/attributes are valid is stored in + * dentry->d_time and fuse_inode->i_time respectively. 
+ */ + +/* + * Calculate the time in jiffies until a dentry/attributes are valid + */ +static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) +{ + if (sec || nsec) { + struct timespec ts = {sec, nsec}; + return get_jiffies_64() + timespec_to_jiffies(&ts); + } else + return 0; +} + +/* + * Set dentry and possibly attribute timeouts from the lookup/mk* + * replies + */ +static void fuse_change_entry_timeout(struct dentry *entry, + struct fuse_entry_out *o) +{ + fuse_dentry_settime(entry, + time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); +} + +static u64 attr_timeout(struct fuse_attr_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +static u64 entry_attr_timeout(struct fuse_entry_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +/* + * Mark the attributes as stale, so that at the next call to + * ->getattr() they will be fetched from userspace + */ +void fuse_invalidate_attr(struct inode *inode) +{ + get_fuse_inode(inode)->i_time = 0; +} + +/* + * Just mark the entry as stale, so that a next attempt to look it up + * will result in a new lookup call to userspace + * + * This is called when a dentry is about to become negative and the + * timeout is unknown (unlink, rmdir, rename and in some cases + * lookup) + */ +void fuse_invalidate_entry_cache(struct dentry *entry) +{ + fuse_dentry_settime(entry, 0); +} + +/* + * Same as fuse_invalidate_entry_cache(), but also try to remove the + * dentry from the hash + */ +static void fuse_invalidate_entry(struct dentry *entry) +{ + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); +} + +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req, + u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg) +{ + memset(outarg, 0, sizeof(struct fuse_entry_out)); + req->in.h.opcode = FUSE_LOOKUP; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = name->len + 1; + req->in.args[0].value = name->name; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + else + req->out.args[0].size = sizeof(struct fuse_entry_out); + req->out.args[0].value = outarg; +} + +u64 fuse_get_attr_version(struct fuse_conn *fc) +{ + u64 curr_version; + + /* + * The spin lock isn't actually needed on 64bit archs, but we + * don't yet care too much about such optimizations. + */ + spin_lock(&fc->lock); + curr_version = fc->attr_version; + spin_unlock(&fc->lock); + + return curr_version; +} + +/* + * Check whether the dentry is still valid + * + * If the entry validity timeout has expired and the dentry is + * positive, try to redo the lookup. If the lookup results in a + * different inode, then let the VFS invalidate the dentry and redo + * the lookup once more. If the lookup results in the same inode, + * then refresh the attributes, timeouts and mark the dentry valid. 
+ */ +static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) +{ + struct inode *inode; + + inode = ACCESS_ONCE(entry->d_inode); + if (inode && is_bad_inode(inode)) + return 0; + else if (fuse_dentry_time(entry) < get_jiffies_64()) { + int err; + struct fuse_entry_out outarg; + struct fuse_conn *fc; + struct fuse_req *req; + struct fuse_forget_link *forget; + struct dentry *parent; + u64 attr_version; + + /* For negative dentries, always do a fresh lookup */ + if (!inode) + return 0; + + if (nd && (nd->flags & LOOKUP_RCU)) + return -ECHILD; + + fc = get_fuse_conn(inode); + req = fuse_get_req(fc); + if (IS_ERR(req)) + return 0; + + forget = fuse_alloc_forget(); + if (!forget) { + fuse_put_request(fc, req); + return 0; + } + + attr_version = fuse_get_attr_version(fc); + + parent = dget_parent(entry); + fuse_lookup_init(fc, req, get_node_id(parent->d_inode), + &entry->d_name, &outarg); + fuse_request_send(fc, req); + dput(parent); + err = req->out.h.error; + fuse_put_request(fc, req); + /* Zero nodeid is same as -ENOENT */ + if (!err && !outarg.nodeid) + err = -ENOENT; + if (!err) { + struct fuse_inode *fi = get_fuse_inode(inode); + if (outarg.nodeid != get_node_id(inode)) { + fuse_queue_forget(fc, forget, outarg.nodeid, 1); + return 0; + } + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + } + kfree(forget); + if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + return 0; + + fuse_change_attributes(inode, &outarg.attr, + entry_attr_timeout(&outarg), + attr_version); + fuse_change_entry_timeout(entry, &outarg); + } + return 1; +} + +static int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + +const struct dentry_operations fuse_dentry_operations = { + .d_revalidate = fuse_dentry_revalidate, +}; + +int fuse_valid_type(int m) +{ + return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || + S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); +} + +/* + * Add a directory inode to a dentry, ensuring that no other dentry + * refers to this inode. Called with fc->inst_mutex. 
+ */ +static struct dentry *fuse_d_add_directory(struct dentry *entry, + struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { + /* This tries to shrink the subtree below alias */ + fuse_invalidate_entry(alias); + dput(alias); + if (!list_empty(&inode->i_dentry)) + return ERR_PTR(-EBUSY); + } else { + dput(alias); + } + return d_splice_alias(inode, entry); +} + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_req *req; + struct fuse_forget_link *forget; + u64 attr_version; + int err; + + *inode = NULL; + err = -ENAMETOOLONG; + if (name->len > FUSE_NAME_MAX) + goto out; + + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) { + fuse_put_request(fc, req); + goto out; + } + + attr_version = fuse_get_attr_version(fc); + + fuse_lookup_init(fc, req, nodeid, name, outarg); + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + /* Zero nodeid is same as -ENOENT, but with valid timeout */ + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (!fuse_valid_type(outarg->attr.mode)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, entry_attr_timeout(outarg), + attr_version); + err = -ENOMEM; + if (!*inode) { + fuse_queue_forget(fc, forget, outarg->nodeid, 1); + goto out; + } + err = 0; + + out_put_forget: + kfree(forget); + out: + return err; +} + +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + struct nameidata *nd) +{ + int err; + struct fuse_entry_out outarg; + struct inode *inode; + struct dentry *newent; + struct fuse_conn *fc = get_fuse_conn(dir); + bool outarg_valid = true; + + err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, + &outarg, &inode); + if (err == -ENOENT) { + outarg_valid = false; + err = 0; + } + if (err) + goto out_err; + + err = -EIO; + if (inode && get_node_id(inode) == FUSE_ROOT_ID) + goto out_iput; + + if (inode && S_ISDIR(inode->i_mode)) { + mutex_lock(&fc->inst_mutex); + newent = fuse_d_add_directory(entry, inode); + mutex_unlock(&fc->inst_mutex); + err = PTR_ERR(newent); + if (IS_ERR(newent)) + goto out_iput; + } else { + newent = d_splice_alias(inode, entry); + } + + entry = newent ? newent : entry; + if (outarg_valid) + fuse_change_entry_timeout(entry, &outarg); + else + fuse_invalidate_entry_cache(entry); + + return newent; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +/* + * Atomic create+open operation + * + * If the filesystem doesn't support this, then fall back to separate + * 'mknod' + 'open' requests. 
+ */ +static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, + struct nameidata *nd) +{ + int err; + struct inode *inode; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req; + struct fuse_forget_link *forget; + struct fuse_create_in inarg; + struct fuse_open_out outopen; + struct fuse_entry_out outentry; + struct fuse_file *ff; + struct file *file; + int flags = nd->intent.open.flags - 1; + + if (fc->no_create) + return -ENOSYS; + + if (flags & O_DIRECT) + return -EINVAL; + + forget = fuse_alloc_forget(); + if (!forget) + return -ENOMEM; + + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out_put_forget_req; + + err = -ENOMEM; + ff = fuse_file_alloc(fc); + if (!ff) + goto out_put_request; + + if (!fc->dont_mask) + mode &= ~current_umask(); + + flags &= ~O_NOCTTY; + memset(&inarg, 0, sizeof(inarg)); + memset(&outentry, 0, sizeof(outentry)); + inarg.flags = flags; + inarg.mode = mode; + inarg.umask = current_umask(); + req->in.h.opcode = FUSE_CREATE; + req->in.h.nodeid = get_node_id(dir); + req->in.numargs = 2; + req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) : + sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = entry->d_name.len + 1; + req->in.args[1].value = entry->d_name.name; + req->out.numargs = 2; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + else + req->out.args[0].size = sizeof(outentry); + req->out.args[0].value = &outentry; + req->out.args[1].size = sizeof(outopen); + req->out.args[1].value = &outopen; + fuse_request_send(fc, req); + err = req->out.h.error; + if (err) { + if (err == -ENOSYS) + fc->no_create = 1; + goto out_free_ff; + } + + err = -EIO; + if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) + goto out_free_ff; + + fuse_put_request(fc, req); + ff->fh = outopen.fh; + ff->nodeid = outentry.nodeid; + ff->open_flags = outopen.open_flags; + inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, + &outentry.attr, entry_attr_timeout(&outentry), 0); + if (!inode) { + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); + fuse_sync_release(ff, flags); + fuse_queue_forget(fc, forget, outentry.nodeid, 1); + return -ENOMEM; + } + kfree(forget); + d_instantiate(entry, inode); + fuse_change_entry_timeout(entry, &outentry); + fuse_invalidate_attr(dir); + file = lookup_instantiate_filp(nd, entry, generic_file_open); + if (IS_ERR(file)) { + fuse_sync_release(ff, flags); + return PTR_ERR(file); + } + file->private_data = fuse_file_get(ff); + fuse_finish_open(inode, file); + return 0; + + out_free_ff: + fuse_file_free(ff); + out_put_request: + fuse_put_request(fc, req); + out_put_forget_req: + kfree(forget); + return err; +} + +/* + * Code shared between mknod, mkdir, symlink and link + */ +static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, + struct inode *dir, struct dentry *entry, + int mode) +{ + struct fuse_entry_out outarg; + struct inode *inode; + int err; + struct fuse_forget_link *forget; + + forget = fuse_alloc_forget(); + if (!forget) { + fuse_put_request(fc, req); + return -ENOMEM; + } + + memset(&outarg, 0, sizeof(outarg)); + req->in.h.nodeid = get_node_id(dir); + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + else + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err) + goto out_put_forget_req; + + err = -EIO; + if 
(invalid_nodeid(outarg.nodeid)) + goto out_put_forget_req; + + if ((outarg.attr.mode ^ mode) & S_IFMT) + goto out_put_forget_req; + + inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, + &outarg.attr, entry_attr_timeout(&outarg), 0); + if (!inode) { + fuse_queue_forget(fc, forget, outarg.nodeid, 1); + return -ENOMEM; + } + kfree(forget); + + if (S_ISDIR(inode->i_mode)) { + struct dentry *alias; + mutex_lock(&fc->inst_mutex); + alias = d_find_alias(inode); + if (alias) { + /* New directory must have moved since mkdir */ + mutex_unlock(&fc->inst_mutex); + dput(alias); + iput(inode); + return -EBUSY; + } + d_instantiate(entry, inode); + mutex_unlock(&fc->inst_mutex); + } else + d_instantiate(entry, inode); + + fuse_change_entry_timeout(entry, &outarg); + fuse_invalidate_attr(dir); + return 0; + + out_put_forget_req: + kfree(forget); + return err; +} + +static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, + dev_t rdev) +{ + struct fuse_mknod_in inarg; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (!fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.rdev = new_encode_dev(rdev); + inarg.umask = current_umask(); + req->in.h.opcode = FUSE_MKNOD; + req->in.numargs = 2; + req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE : + sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = entry->d_name.len + 1; + req->in.args[1].value = entry->d_name.name; + return create_new_entry(fc, req, dir, entry, mode); +} + +static int fuse_create(struct inode *dir, struct dentry *entry, int mode, + struct nameidata *nd) +{ + if (nd && (nd->flags & LOOKUP_OPEN)) { + int err = fuse_create_open(dir, entry, mode, nd); + if (err != -ENOSYS) + return err; + /* Fall back on mknod */ + } + return fuse_mknod(dir, entry, mode, 0); +} + +static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) +{ + struct fuse_mkdir_in inarg; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (!fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.umask = current_umask(); + req->in.h.opcode = FUSE_MKDIR; + req->in.numargs = 2; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = entry->d_name.len + 1; + req->in.args[1].value = entry->d_name.name; + return create_new_entry(fc, req, dir, entry, S_IFDIR); +} + +static int fuse_symlink(struct inode *dir, struct dentry *entry, + const char *link) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + unsigned len = strlen(link) + 1; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_SYMLINK; + req->in.numargs = 2; + req->in.args[0].size = entry->d_name.len + 1; + req->in.args[0].value = entry->d_name.name; + req->in.args[1].size = len; + req->in.args[1].value = link; + return create_new_entry(fc, req, dir, entry, S_IFLNK); +} + +static int fuse_unlink(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_UNLINK; + req->in.h.nodeid = get_node_id(dir); + req->in.numargs = 1; + req->in.args[0].size = entry->d_name.len + 1; + req->in.args[0].value = entry->d_name.name; + 
fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + struct inode *inode = entry->d_inode; + + /* + * Set nlink to zero so the inode can be cleared, if the inode + * does have more links this will be discovered at the next + * lookup/getattr. + */ + clear_nlink(inode); + fuse_invalidate_attr(inode); + fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rmdir(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_RMDIR; + req->in.h.nodeid = get_node_id(dir); + req->in.numargs = 1; + req->in.args[0].size = entry->d_name.len + 1; + req->in.args[0].value = entry->d_name.name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + clear_nlink(entry->d_inode); + fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + int err; + struct fuse_rename_in inarg; + struct fuse_conn *fc = get_fuse_conn(olddir); + struct fuse_req *req = fuse_get_req(fc); + + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.newdir = get_node_id(newdir); + req->in.h.opcode = FUSE_RENAME; + req->in.h.nodeid = get_node_id(olddir); + req->in.numargs = 3; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = oldent->d_name.len + 1; + req->in.args[1].value = oldent->d_name.name; + req->in.args[2].size = newent->d_name.len + 1; + req->in.args[2].value = newent->d_name.name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + /* ctime changes */ + fuse_invalidate_attr(oldent->d_inode); + + fuse_invalidate_attr(olddir); + if (olddir != newdir) + fuse_invalidate_attr(newdir); + + /* newent will end up negative */ + if (newent->d_inode) { + fuse_invalidate_attr(newent->d_inode); + fuse_invalidate_entry_cache(newent); + } + } else if (err == -EINTR) { + /* If request was interrupted, DEITY only knows if the + rename actually took place. If the invalidation + fails (e.g. some process has CWD under the renamed + directory), then there can be inconsistency between + the dcache and the real filesystem. Tough luck. */ + fuse_invalidate_entry(oldent); + if (newent->d_inode) + fuse_invalidate_entry(newent); + } + + return err; +} + +static int fuse_link(struct dentry *entry, struct inode *newdir, + struct dentry *newent) +{ + int err; + struct fuse_link_in inarg; + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.oldnodeid = get_node_id(inode); + req->in.h.opcode = FUSE_LINK; + req->in.numargs = 2; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = newent->d_name.len + 1; + req->in.args[1].value = newent->d_name.name; + err = create_new_entry(fc, req, newdir, newent, inode->i_mode); + /* Contrary to "normal" filesystems it can happen that link + makes two "logical" inodes point to the same "physical" + inode. 
We invalidate the attributes of the old one, so it + will reflect changes in the backing inode (link count, + etc.) + */ + if (!err || err == -EINTR) + fuse_invalidate_attr(inode); + return err; +} + +static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, + struct kstat *stat) +{ + stat->dev = inode->i_sb->s_dev; + stat->ino = attr->ino; + stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + stat->nlink = attr->nlink; + stat->uid = attr->uid; + stat->gid = attr->gid; + stat->rdev = inode->i_rdev; + stat->atime.tv_sec = attr->atime; + stat->atime.tv_nsec = attr->atimensec; + stat->mtime.tv_sec = attr->mtime; + stat->mtime.tv_nsec = attr->mtimensec; + stat->ctime.tv_sec = attr->ctime; + stat->ctime.tv_nsec = attr->ctimensec; + stat->size = attr->size; + stat->blocks = attr->blocks; + stat->blksize = (1 << inode->i_blkbits); +} + +static int fuse_do_getattr(struct inode *inode, struct kstat *stat, + struct file *file) +{ + int err; + struct fuse_getattr_in inarg; + struct fuse_attr_out outarg; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + u64 attr_version; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + attr_version = fuse_get_attr_version(fc); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + /* Directories have separate file-handle space */ + if (file && S_ISREG(inode->i_mode)) { + struct fuse_file *ff = file->private_data; + + inarg.getattr_flags |= FUSE_GETATTR_FH; + inarg.fh = ff->fh; + } + req->in.h.opcode = FUSE_GETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + make_bad_inode(inode); + err = -EIO; + } else { + fuse_change_attributes(inode, &outarg.attr, + attr_timeout(&outarg), + attr_version); + if (stat) + fuse_fillattr(inode, &outarg.attr, stat); + } + } + return err; +} + +int fuse_update_attributes(struct inode *inode, struct kstat *stat, + struct file *file, bool *refreshed) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + bool r; + + if (fi->i_time < get_jiffies_64()) { + r = true; + err = fuse_do_getattr(inode, stat, file); + } else { + r = false; + err = 0; + if (stat) { + generic_fillattr(inode, stat); + stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; + } + } + + if (refreshed != NULL) + *refreshed = r; + + return err; +} + +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + struct qstr *name) +{ + int err = -ENOTDIR; + struct inode *parent; + struct dentry *dir; + struct dentry *entry; + + parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); + if (!parent) + return -ENOENT; + + mutex_lock(&parent->i_mutex); + if (!S_ISDIR(parent->i_mode)) + goto unlock; + + err = -ENOENT; + dir = d_find_alias(parent); + if (!dir) + goto unlock; + + entry = d_lookup(dir, name); + dput(dir); + if (!entry) + goto unlock; + + fuse_invalidate_attr(parent); + fuse_invalidate_entry(entry); + dput(entry); + err = 0; + + unlock: + mutex_unlock(&parent->i_mutex); + iput(parent); + return err; +} + +/* + * Calling into a user-controlled filesystem gives the filesystem + * daemon ptrace-like capabilities 
over the requester process. This + * means, that the filesystem daemon is able to record the exact + * filesystem operations performed, and can also control the behavior + * of the requester process in otherwise impossible ways. For example + * it can delay the operation for arbitrary length of time allowing + * DoS against the requester. + * + * For this reason only those processes can call into the filesystem, + * for which the owner of the mount has ptrace privilege. This + * excludes processes started by other users, suid or sgid processes. + */ +int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) +{ + const struct cred *cred; + int ret; + + if (fc->flags & FUSE_ALLOW_OTHER) + return 1; + + rcu_read_lock(); + ret = 0; + cred = __task_cred(task); + if (cred->euid == fc->user_id && + cred->suid == fc->user_id && + cred->uid == fc->user_id && + cred->egid == fc->group_id && + cred->sgid == fc->group_id && + cred->gid == fc->group_id) + ret = 1; + rcu_read_unlock(); + + return ret; +} + +static int fuse_access(struct inode *inode, int mask) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_access_in inarg; + int err; + + if (fc->no_access) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); + req->in.h.opcode = FUSE_ACCESS; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_access = 1; + err = 0; + } + return err; +} + +static int fuse_perm_getattr(struct inode *inode, int flags) +{ + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + return fuse_do_getattr(inode, NULL, NULL); +} + +/* + * Check permission. The two basic access models of FUSE are: + * + * 1) Local access checking ('default_permissions' mount option) based + * on file mode. This is the plain old disk filesystem permission + * modell. + * + * 2) "Remote" access checking, where server is responsible for + * checking permission in each inode operation. An exception to this + * is if ->permission() was invoked from sys_access() in which case an + * access request is sent. Execute permission is still checked + * locally based on file mode. + */ +static int fuse_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + bool refreshed = false; + int err = 0; + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + /* + * If attributes are needed, refresh them before proceeding + */ + if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || + ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { + struct fuse_inode *fi = get_fuse_inode(inode); + + if (fi->i_time < get_jiffies_64()) { + refreshed = true; + + err = fuse_perm_getattr(inode, flags); + if (err) + return err; + } + } + + if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { + err = generic_permission(inode, mask, flags, NULL); + + /* If permission is denied, try to refresh file + attributes. This is also needed, because the root + node will at first have no permissions */ + if (err == -EACCES && !refreshed) { + err = fuse_perm_getattr(inode, flags); + if (!err) + err = generic_permission(inode, mask, + flags, NULL); + } + + /* Note: the opposite of the above test does not + exist. 
So if permissions are revoked this won't be + noticed immediately, only after the attribute + timeout has expired */ + } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + err = fuse_access(inode, mask); + } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { + if (!(inode->i_mode & S_IXUGO)) { + if (refreshed) + return -EACCES; + + err = fuse_perm_getattr(inode, flags); + if (!err && !(inode->i_mode & S_IXUGO)) + return -EACCES; + } + } + return err; +} + +static int parse_dirfile(char *buf, size_t nbytes, struct file *file, + void *dstbuf, filldir_t filldir) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + int over; + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + + over = filldir(dstbuf, dirent->name, dirent->namelen, + file->f_pos, dirent->ino, dirent->type); + if (over) + break; + + buf += reclen; + nbytes -= reclen; + file->f_pos = dirent->off; + } + + return 0; +} + +static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +{ + int err; + size_t nbytes; + struct page *page; + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + page = alloc_page(GFP_KERNEL); + if (!page) { + fuse_put_request(fc, req); + return -ENOMEM; + } + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); + fuse_request_send(fc, req); + nbytes = req->out.args[0].size; + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) + err = parse_dirfile(page_address(page), nbytes, file, dstbuf, + filldir); + + __free_page(page); + fuse_invalidate_attr(inode); /* atime changed */ + return err; +} + +static char *read_link(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req = fuse_get_req(fc); + char *link; + + if (IS_ERR(req)) + return ERR_CAST(req); + + link = (char *) __get_free_page(GFP_KERNEL); + if (!link) { + link = ERR_PTR(-ENOMEM); + goto out; + } + req->in.h.opcode = FUSE_READLINK; + req->in.h.nodeid = get_node_id(inode); + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = PAGE_SIZE - 1; + req->out.args[0].value = link; + fuse_request_send(fc, req); + if (req->out.h.error) { + free_page((unsigned long) link); + link = ERR_PTR(req->out.h.error); + } else + link[req->out.args[0].size] = '\0'; + out: + fuse_put_request(fc, req); + fuse_invalidate_attr(inode); /* atime changed */ + return link; +} + +static void free_link(char *link) +{ + if (!IS_ERR(link)) + free_page((unsigned long) link); +} + +static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, read_link(dentry)); + return NULL; +} + +static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + free_link(nd_get_link(nd)); +} + +static int fuse_dir_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, true); +} + +static int fuse_dir_release(struct inode *inode, struct file *file) +{ + fuse_release_common(file, FUSE_RELEASEDIR); + + return 0; +} + +static int fuse_dir_fsync(struct file *file, int datasync) +{ + return fuse_fsync_common(file, datasync, 1); +} + 
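+/*
+ * Rough illustration (a sketch, assuming the usual VFS callers of
+ * notify_change(); the exact masks depend on the caller) of how common
+ * syscalls map onto the FUSE_SETATTR arguments built below:
+ *
+ *   ftruncate(fd, len):  ATTR_SIZE|ATTR_MTIME|ATTR_CTIME|ATTR_FILE
+ *                        -> FATTR_SIZE|FATTR_FH|FATTR_LOCKOWNER;
+ *                           mtime is left for the server to update
+ *   utimes(path, times): ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET
+ *                        -> FATTR_ATIME|FATTR_MTIME with explicit values
+ *   utimes(path, NULL):  ATTR_ATIME|ATTR_MTIME (no *_SET)
+ *                        -> FATTR_ATIME|FATTR_ATIME_NOW|
+ *                           FATTR_MTIME|FATTR_MTIME_NOW
+ */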
+static bool update_mtime(unsigned ivalid) +{ + /* Always update if mtime is explicitly set */ + if (ivalid & ATTR_MTIME_SET) + return true; + + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ + if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) + return false; + + /* In all other cases update */ + return true; +} + +static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) +{ + unsigned ivalid = iattr->ia_valid; + + if (ivalid & ATTR_MODE) + arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; + if (ivalid & ATTR_UID) + arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; + if (ivalid & ATTR_GID) + arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; + if (ivalid & ATTR_SIZE) + arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; + if (ivalid & ATTR_ATIME) { + arg->valid |= FATTR_ATIME; + arg->atime = iattr->ia_atime.tv_sec; + arg->atimensec = iattr->ia_atime.tv_nsec; + if (!(ivalid & ATTR_ATIME_SET)) + arg->valid |= FATTR_ATIME_NOW; + } + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { + arg->valid |= FATTR_MTIME; + arg->mtime = iattr->ia_mtime.tv_sec; + arg->mtimensec = iattr->ia_mtime.tv_nsec; + if (!(ivalid & ATTR_MTIME_SET)) + arg->valid |= FATTR_MTIME_NOW; + } +} + +/* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(!mutex_is_locked(&inode->i_mutex)); + + spin_lock(&fc->lock); + BUG_ON(fi->writectr < 0); + fi->writectr += FUSE_NOWRITE; + spin_unlock(&fc->lock); + wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); +} + +/* + * Allow writepages on inode + * + * Remove the bias from the writecounter and send any queued + * writepages. + */ +static void __fuse_release_nowrite(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(fi->writectr != FUSE_NOWRITE); + fi->writectr = 0; + fuse_flush_writepages(inode); +} + +void fuse_release_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + spin_lock(&fc->lock); + __fuse_release_nowrite(inode); + spin_unlock(&fc->lock); +} + +/* + * Set attributes, and at the same time refresh them. + * + * Truncation is slightly complicated, because the 'truncate' request + * may fail, in which case we don't want to touch the mapping. + * vmtruncate() doesn't allow for this case, so do the rlimit checking + * and the actual truncation by hand. 
+ */ +static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, + struct file *file) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + bool is_truncate = false; + loff_t oldsize; + int err; + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + attr->ia_valid |= ATTR_FORCE; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (attr->ia_valid & ATTR_OPEN) { + if (fc->atomic_o_trunc) + return 0; + file = NULL; + } + + if (attr->ia_valid & ATTR_SIZE) + is_truncate = true; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (is_truncate) + fuse_set_nowrite(inode); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + iattr_to_fattr(attr, &inarg); + if (file) { + struct fuse_file *ff = file->private_data; + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } + if (attr->ia_valid & ATTR_SIZE) { + /* For mandatory locking in truncate */ + inarg.valid |= FATTR_LOCKOWNER; + inarg.lock_owner = fuse_lock_owner_id(fc, current->files); + } + req->in.h.opcode = FUSE_SETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err) { + if (err == -EINTR) + fuse_invalidate_attr(inode); + goto error; + } + + if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + make_bad_inode(inode); + err = -EIO; + goto error; + } + + spin_lock(&fc->lock); + fuse_change_attributes_common(inode, &outarg.attr, + attr_timeout(&outarg)); + oldsize = inode->i_size; + i_size_write(inode, outarg.attr.size); + + if (is_truncate) { + /* NOTE: this may release/reacquire fc->lock */ + __fuse_release_nowrite(inode); + } + spin_unlock(&fc->lock); + + /* + * Only call invalidate_inode_pages2() after removing + * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 
+ */ + if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + truncate_pagecache(inode, oldsize, outarg.attr.size); + invalidate_inode_pages2(inode->i_mapping); + } + + return 0; + +error: + if (is_truncate) + fuse_release_nowrite(inode); + + return err; +} + +static int fuse_setattr(struct dentry *entry, struct iattr *attr) +{ + if (attr->ia_valid & ATTR_FILE) + return fuse_do_setattr(entry, attr, attr->ia_file); + else + return fuse_do_setattr(entry, attr, NULL); +} + +static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, + struct kstat *stat) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + return fuse_update_attributes(inode, stat, NULL, NULL); +} + +static int fuse_setxattr(struct dentry *entry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_setxattr_in inarg; + int err; + + if (fc->no_setxattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + inarg.flags = flags; + req->in.h.opcode = FUSE_SETXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 3; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = strlen(name) + 1; + req->in.args[1].value = name; + req->in.args[2].size = size; + req->in.args[2].value = value; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_setxattr = 1; + err = -EOPNOTSUPP; + } + return err; +} + +static ssize_t fuse_getxattr(struct dentry *entry, const char *name, + void *value, size_t size) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (fc->no_getxattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + req->in.h.opcode = FUSE_GETXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 2; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = strlen(name) + 1; + req->in.args[1].value = name; + /* This is really two different operations rolled into one */ + req->out.numargs = 1; + if (size) { + req->out.argvar = 1; + req->out.args[0].size = size; + req->out.args[0].value = value; + } else { + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + } + fuse_request_send(fc, req); + ret = req->out.h.error; + if (!ret) + ret = size ? 
req->out.args[0].size : outarg.size; + else { + if (ret == -ENOSYS) { + fc->no_getxattr = 1; + ret = -EOPNOTSUPP; + } + } + fuse_put_request(fc, req); + return ret; +} + +static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + if (fc->no_listxattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + req->in.h.opcode = FUSE_LISTXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + /* This is really two different operations rolled into one */ + req->out.numargs = 1; + if (size) { + req->out.argvar = 1; + req->out.args[0].size = size; + req->out.args[0].value = list; + } else { + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + } + fuse_request_send(fc, req); + ret = req->out.h.error; + if (!ret) + ret = size ? req->out.args[0].size : outarg.size; + else { + if (ret == -ENOSYS) { + fc->no_listxattr = 1; + ret = -EOPNOTSUPP; + } + } + fuse_put_request(fc, req); + return ret; +} + +static int fuse_removexattr(struct dentry *entry, const char *name) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int err; + + if (fc->no_removexattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_REMOVEXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = strlen(name) + 1; + req->in.args[0].value = name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_removexattr = 1; + err = -EOPNOTSUPP; + } + return err; +} + +static const struct inode_operations fuse_dir_inode_operations = { + .lookup = fuse_lookup, + .mkdir = fuse_mkdir, + .symlink = fuse_symlink, + .unlink = fuse_unlink, + .rmdir = fuse_rmdir, + .rename = fuse_rename, + .link = fuse_link, + .setattr = fuse_setattr, + .create = fuse_create, + .mknod = fuse_mknod, + .permission = fuse_permission, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +static const struct file_operations fuse_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = fuse_readdir, + .open = fuse_dir_open, + .release = fuse_dir_release, + .fsync = fuse_dir_fsync, +}; + +static const struct inode_operations fuse_common_inode_operations = { + .setattr = fuse_setattr, + .permission = fuse_permission, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +static const struct inode_operations fuse_symlink_inode_operations = { + .setattr = fuse_setattr, + .follow_link = fuse_follow_link, + .put_link = fuse_put_link, + .readlink = generic_readlink, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +void fuse_init_common(struct inode *inode) +{ + inode->i_op = &fuse_common_inode_operations; +} + +void 
fuse_init_dir(struct inode *inode) +{ + inode->i_op = &fuse_dir_inode_operations; + inode->i_fop = &fuse_dir_operations; +} + +void fuse_init_symlink(struct inode *inode) +{ + inode->i_op = &fuse_symlink_inode_operations; +} diff --git a/fs/fuse/file.c b/fs/fuse/file.c new file mode 100644 index 00000000..82a66466 --- /dev/null +++ b/fs/fuse/file.c @@ -0,0 +1,2186 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include + +static const struct file_operations fuse_direct_io_file_operations; + +static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + int opcode, struct fuse_open_out *outargp) +{ + struct fuse_open_in inarg; + struct fuse_req *req; + int err; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fc->atomic_o_trunc) + inarg.flags &= ~O_TRUNC; + req->in.h.opcode = opcode; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(*outargp); + req->out.args[0].value = outargp; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + return err; +} + +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) +{ + struct fuse_file *ff; + + ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + if (unlikely(!ff)) + return NULL; + + ff->fc = fc; + ff->reserved_req = fuse_request_alloc(); + if (unlikely(!ff->reserved_req)) { + kfree(ff); + return NULL; + } + + INIT_LIST_HEAD(&ff->write_entry); + atomic_set(&ff->count, 0); + RB_CLEAR_NODE(&ff->polled_node); + init_waitqueue_head(&ff->poll_wait); + + spin_lock(&fc->lock); + ff->kh = ++fc->khctr; + spin_unlock(&fc->lock); + + return ff; +} + +void fuse_file_free(struct fuse_file *ff) +{ + fuse_request_free(ff->reserved_req); + kfree(ff); +} + +struct fuse_file *fuse_file_get(struct fuse_file *ff) +{ + atomic_inc(&ff->count); + return ff; +} + +static void fuse_release_async(struct work_struct *work) +{ + struct fuse_req *req; + struct fuse_conn *fc; + struct path path; + + req = container_of(work, struct fuse_req, misc.release.work); + path = req->misc.release.path; + fc = get_fuse_conn(path.dentry->d_inode); + + fuse_put_request(fc, req); + path_put(&path); +} + +static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +{ + if (fc->destroy_req) { + /* + * If this is a fuseblk mount, then it's possible that + * releasing the path will result in releasing the + * super block and sending the DESTROY request. If + * the server is single threaded, this would hang. + * For this reason do the path_put() in a separate + * thread. 
+ */ + atomic_inc(&req->count); + INIT_WORK(&req->misc.release.work, fuse_release_async); + schedule_work(&req->misc.release.work); + } else { + path_put(&req->misc.release.path); + } +} + +static void fuse_file_put(struct fuse_file *ff, bool sync) +{ + if (atomic_dec_and_test(&ff->count)) { + struct fuse_req *req = ff->reserved_req; + + if (sync) { + fuse_request_send(ff->fc, req); + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else { + req->end = fuse_release_end; + fuse_request_send_background(ff->fc, req); + } + kfree(ff); + } +} + +int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + bool isdir) +{ + struct fuse_open_out outarg; + struct fuse_file *ff; + int err; + int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + + ff = fuse_file_alloc(fc); + if (!ff) + return -ENOMEM; + + err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + if (err) { + fuse_file_free(ff); + return err; + } + + if (isdir) + outarg.open_flags &= ~FOPEN_DIRECT_IO; + + ff->fh = outarg.fh; + ff->nodeid = nodeid; + ff->open_flags = outarg.open_flags; + file->private_data = fuse_file_get(ff); + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_do_open); + +void fuse_finish_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (ff->open_flags & FOPEN_DIRECT_IO) + file->f_op = &fuse_direct_io_file_operations; + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + if (ff->open_flags & FOPEN_NONSEEKABLE) + nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + i_size_write(inode, 0); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + } +} + +int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + /* VFS checks this, but only _after_ ->open() */ + if (file->f_flags & O_DIRECT) + return -EINVAL; + + err = generic_file_open(inode, file); + if (err) + return err; + + err = fuse_do_open(fc, get_node_id(inode), file, isdir); + if (err) + return err; + + fuse_finish_open(inode, file); + + return 0; +} + +static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) +{ + struct fuse_conn *fc = ff->fc; + struct fuse_req *req = ff->reserved_req; + struct fuse_release_in *inarg = &req->misc.release.in; + + spin_lock(&fc->lock); + list_del(&ff->write_entry); + if (!RB_EMPTY_NODE(&ff->polled_node)) + rb_erase(&ff->polled_node, &fc->polled_files); + spin_unlock(&fc->lock); + + wake_up_interruptible_all(&ff->poll_wait); + + inarg->fh = ff->fh; + inarg->flags = flags; + req->in.h.opcode = opcode; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct fuse_release_in); + req->in.args[0].value = inarg; +} + +void fuse_release_common(struct file *file, int opcode) +{ + struct fuse_file *ff; + struct fuse_req *req; + + ff = file->private_data; + if (unlikely(!ff)) + return; + + req = ff->reserved_req; + fuse_prepare_release(ff, file->f_flags, opcode); + + /* Hold vfsmount and dentry until release is finished */ + path_get(&file->f_path); + req->misc.release.path = file->f_path; + + /* + * Normally this will send the RELEASE request, however if + * some asynchronous READ or WRITE requests are outstanding, + * the sending will be delayed. 
+ * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. + */ + fuse_file_put(ff, ff->fc->destroy_req != NULL); +} + +static int fuse_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, false); +} + +static int fuse_release(struct inode *inode, struct file *file) +{ + fuse_release_common(file, FUSE_RELEASE); + + /* return value is ignored by VFS */ + return 0; +} + +void fuse_sync_release(struct fuse_file *ff, int flags) +{ + WARN_ON(atomic_read(&ff->count) > 1); + fuse_prepare_release(ff, flags, FUSE_RELEASE); + ff->reserved_req->force = 1; + fuse_request_send(ff->fc, ff->reserved_req); + fuse_put_request(ff->fc, ff->reserved_req); + kfree(ff); +} +EXPORT_SYMBOL_GPL(fuse_sync_release); + +/* + * Scramble the ID space with XTEA, so that the value of the files_struct + * pointer is not exposed to userspace. + */ +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) +{ + u32 *k = fc->scramble_key; + u64 v = (unsigned long) id; + u32 v0 = v; + u32 v1 = v >> 32; + u32 sum = 0; + int i; + + for (i = 0; i < 32; i++) { + v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); + sum += 0x9E3779B9; + v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); + } + + return (u64) v0 + ((u64) v1 << 32); +} + +/* + * Check if page is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient. + */ +static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + bool found = false; + + spin_lock(&fc->lock); + list_for_each_entry(req, &fi->writepages, writepages_entry) { + pgoff_t curr_index; + + BUG_ON(req->inode != inode); + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (curr_index == index) { + found = true; + break; + } + } + spin_unlock(&fc->lock); + + return found; +} + +/* + * Wait for page writeback to be completed. + * + * Since fuse doesn't rely on the VM writeback tracking, this has to + * use some other means. + */ +static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); + return 0; +} + +static int fuse_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_req *req; + struct fuse_flush_in inarg; + int err; + + if (is_bad_inode(inode)) + return -EIO; + + if (fc->no_flush) + return 0; + + req = fuse_get_req_nofail(fc, file); + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.lock_owner = fuse_lock_owner_id(fc, id); + req->in.h.opcode = FUSE_FLUSH; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->force = 1; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_flush = 1; + err = 0; + } + return err; +} + +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. 
+ * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + +int fuse_fsync_common(struct file *file, int datasync, int isdir) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_req *req; + struct fuse_fsync_in inarg; + int err; + + if (is_bad_inode(inode)) + return -EIO; + + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + return 0; + + /* + * Start writeback against all dirty pages of the inode, then + * wait for all outstanding writes, before sending the FSYNC + * request. + */ + err = write_inode_now(inode, 0); + if (err) + return err; + + fuse_sync_writes(inode); + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.fsync_flags = datasync ? 1 : 0; + req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + if (isdir) + fc->no_fsyncdir = 1; + else + fc->no_fsync = 1; + err = 0; + } + return err; +} + +static int fuse_fsync(struct file *file, int datasync) +{ + return fuse_fsync_common(file, datasync, 0); +} + +void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, + size_t count, int opcode) +{ + struct fuse_read_in *inarg = &req->misc.read.in; + struct fuse_file *ff = file->private_data; + + inarg->fh = ff->fh; + inarg->offset = pos; + inarg->size = count; + inarg->flags = file->f_flags; + req->in.h.opcode = opcode; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct fuse_read_in); + req->in.args[0].value = inarg; + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = count; +} + +static size_t fuse_send_read(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, fl_owner_t owner) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + + fuse_read_fill(req, file, pos, count, FUSE_READ); + if (owner != NULL) { + struct fuse_read_in *inarg = &req->misc.read.in; + + inarg->read_flags |= FUSE_READ_LOCKOWNER; + inarg->lock_owner = fuse_lock_owner_id(fc, owner); + } + fuse_request_send(fc, req); + return req->out.args[0].size; +} + +static void fuse_read_update_size(struct inode *inode, loff_t size, + u64 attr_ver) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + if (attr_ver == fi->attr_version && size < inode->i_size) { + fi->attr_version = ++fc->attr_version; + i_size_write(inode, size); + } + spin_unlock(&fc->lock); +} + +static int fuse_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + size_t num_read; + loff_t pos = page_offset(page); + size_t count = PAGE_CACHE_SIZE; + u64 attr_ver; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + /* + * Page writeback can extend beyond the lifetime of the + * page-cache page, so make sure we read a properly synced + * page. 
+ */ + fuse_wait_on_page_writeback(inode, page->index); + + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + attr_ver = fuse_get_attr_version(fc); + + req->out.page_zeroing = 1; + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + num_read = fuse_send_read(req, file, pos, count, NULL); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) { + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (num_read < count) + fuse_read_update_size(inode, pos + num_read, attr_ver); + + SetPageUptodate(page); + } + + fuse_invalidate_attr(inode); /* atime changed */ + out: + unlock_page(page); + return err; +} + +static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + size_t count = req->misc.read.in.size; + size_t num_read = req->out.args[0].size; + struct address_space *mapping = NULL; + + for (i = 0; mapping == NULL && i < req->num_pages; i++) + mapping = req->pages[i]->mapping; + + if (mapping) { + struct inode *inode = mapping->host; + + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!req->out.h.error && num_read < count) { + loff_t pos; + + pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, + req->misc.read.attr_ver); + } + fuse_invalidate_attr(inode); /* atime changed */ + } + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (!req->out.h.error) + SetPageUptodate(page); + else + SetPageError(page); + unlock_page(page); + page_cache_release(page); + } + if (req->ff) + fuse_file_put(req->ff, false); +} + +static void fuse_send_readpages(struct fuse_req *req, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + loff_t pos = page_offset(req->pages[0]); + size_t count = req->num_pages << PAGE_CACHE_SHIFT; + + req->out.argpages = 1; + req->out.page_zeroing = 1; + req->out.page_replace = 1; + fuse_read_fill(req, file, pos, count, FUSE_READ); + req->misc.read.attr_ver = fuse_get_attr_version(fc); + if (fc->async_read) { + req->ff = fuse_file_get(ff); + req->end = fuse_readpages_end; + fuse_request_send_background(fc, req); + } else { + fuse_request_send(fc, req); + fuse_readpages_end(fc, req); + fuse_put_request(fc, req); + } +} + +struct fuse_fill_data { + struct fuse_req *req; + struct file *file; + struct inode *inode; +}; + +static int fuse_readpages_fill(void *_data, struct page *page) +{ + struct fuse_fill_data *data = _data; + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + fuse_wait_on_page_writeback(inode, page->index); + + if (req->num_pages && + (req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || + req->pages[req->num_pages - 1]->index + 1 != page->index)) { + fuse_send_readpages(req, data->file); + data->req = req = fuse_get_req(fc); + if (IS_ERR(req)) { + unlock_page(page); + return PTR_ERR(req); + } + } + page_cache_get(page); + req->pages[req->num_pages] = page; + req->num_pages++; + return 0; +} + +static int fuse_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_fill_data data; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + data.file = file; + data.inode = inode; + data.req = fuse_get_req(fc); + err = PTR_ERR(data.req); + if 
(IS_ERR(data.req)) + goto out; + + err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file); + else + fuse_put_request(fc, data.req); + } +out: + return err; +} + +static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + + if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) { + int err; + /* + * If trying to read past EOF, make sure the i_size + * attribute is up-to-date. + */ + err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); + if (err) + return err; + } + + return generic_file_aio_read(iocb, iov, nr_segs, pos); +} + +static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, + loff_t pos, size_t count) +{ + struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_write_out *outarg = &req->misc.write.out; + + inarg->fh = ff->fh; + inarg->offset = pos; + inarg->size = count; + req->in.h.opcode = FUSE_WRITE; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 2; + if (ff->fc->minor < 9) + req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + else + req->in.args[0].size = sizeof(struct fuse_write_in); + req->in.args[0].value = inarg; + req->in.args[1].size = count; + req->out.numargs = 1; + req->out.args[0].size = sizeof(struct fuse_write_out); + req->out.args[0].value = outarg; +} + +static size_t fuse_send_write(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, fl_owner_t owner) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_write_in *inarg = &req->misc.write.in; + + fuse_write_fill(req, ff, pos, count); + inarg->flags = file->f_flags; + if (owner != NULL) { + inarg->write_flags |= FUSE_WRITE_LOCKOWNER; + inarg->lock_owner = fuse_lock_owner_id(fc, owner); + } + fuse_request_send(fc, req); + return req->misc.write.out.size; +} + +static int fuse_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + *pagep = grab_cache_page_write_begin(mapping, index, flags); + if (!*pagep) + return -ENOMEM; + return 0; +} + +void fuse_write_update_size(struct inode *inode, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&fc->lock); +} + +static int fuse_buffered_write(struct file *file, struct inode *inode, + loff_t pos, unsigned count, struct page *page) +{ + int err; + size_t nres; + struct fuse_conn *fc = get_fuse_conn(inode); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + /* + * Make sure writepages on the same page are not mixed up with + * plain writes. 
+ */ + fuse_wait_on_page_writeback(inode, page->index); + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + req->page_offset = offset; + nres = fuse_send_write(req, file, pos, count, NULL); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err && !nres) + err = -EIO; + if (!err) { + pos += nres; + fuse_write_update_size(inode, pos); + if (count == PAGE_CACHE_SIZE) + SetPageUptodate(page); + } + fuse_invalidate_attr(inode); + return err ? err : nres; +} + +static int fuse_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int res = 0; + + if (copied) + res = fuse_buffered_write(file, inode, pos, copied, page); + + unlock_page(page); + page_cache_release(page); + return res; +} + +static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, + size_t count) +{ + size_t res; + unsigned offset; + unsigned i; + + for (i = 0; i < req->num_pages; i++) + fuse_wait_on_page_writeback(inode, req->pages[i]->index); + + res = fuse_send_write(req, file, pos, count, NULL); + + offset = req->page_offset; + count = res; + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + + if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) + SetPageUptodate(page); + + if (count > PAGE_CACHE_SIZE - offset) + count -= PAGE_CACHE_SIZE - offset; + else + count = 0; + offset = 0; + + unlock_page(page); + page_cache_release(page); + } + + return res; +} + +static ssize_t fuse_fill_write_pages(struct fuse_req *req, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + size_t count = 0; + int err; + + req->in.argpages = 1; + req->page_offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc->max_write - count); + + again: + err = -EFAULT; + if (iov_iter_fault_in_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) + break; + + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + pagefault_disable(); + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + if (!tmp) { + unlock_page(page); + page_cache_release(page); + bytes = min(bytes, iov_iter_single_seg_count(ii)); + goto again; + } + + err = 0; + req->pages[req->num_pages] = page; + req->num_pages++; + + iov_iter_advance(ii, tmp); + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_CACHE_SIZE) + offset = 0; + + if (!fc->big_writes) + break; + } while (iov_iter_count(ii) && count < fc->max_write && + req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + + return count > 0 ? 
count : err; +} + +static ssize_t fuse_perform_write(struct file *file, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err = 0; + ssize_t res = 0; + + if (is_bad_inode(inode)) + return -EIO; + + do { + struct fuse_req *req; + ssize_t count; + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + count = fuse_fill_write_pages(req, mapping, ii, pos); + if (count <= 0) { + err = count; + } else { + size_t num_written; + + num_written = fuse_send_write_pages(req, file, inode, + pos, count); + err = req->out.h.error; + if (!err) { + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count) + err = -EIO; + } + } + fuse_put_request(fc, req); + } while (!err && iov_iter_count(ii)); + + if (res > 0) + fuse_write_update_size(inode, pos); + + fuse_invalidate_attr(inode); + + return res > 0 ? res : err; +} + +static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count = 0; + ssize_t written = 0; + struct inode *inode = mapping->host; + ssize_t err; + struct iov_iter i; + + WARN_ON(iocb->ki_pos != pos); + + err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = file_remove_suid(file); + if (err) + goto out; + + file_update_time(file); + + iov_iter_init(&i, iov, nr_segs, count, 0); + written = fuse_perform_write(file, mapping, &i, pos); + if (written >= 0) + iocb->ki_pos = pos + written; + +out: + current->backing_dev_info = NULL; + mutex_unlock(&inode->i_mutex); + + return written ? 
written : err; +} + +static void fuse_release_user_pages(struct fuse_req *req, int write) +{ + unsigned i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (write) + set_page_dirty_lock(page); + put_page(page); + } +} + +static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, + size_t *nbytesp, int write) +{ + size_t nbytes = *nbytesp; + unsigned long user_addr = (unsigned long) buf; + unsigned offset = user_addr & ~PAGE_MASK; + int npages; + + /* Special case for kernel I/O: can copy directly into the buffer */ + if (segment_eq(get_fs(), KERNEL_DS)) { + if (write) + req->in.args[1].value = (void *) user_addr; + else + req->out.args[0].value = (void *) user_addr; + + return 0; + } + + nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); + npages = get_user_pages_fast(user_addr, npages, !write, req->pages); + if (npages < 0) + return npages; + + req->num_pages = npages; + req->page_offset = offset; + + if (write) + req->in.argpages = 1; + else + req->out.argpages = 1; + + nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; + *nbytesp = min(*nbytesp, nbytes); + + return 0; +} + +ssize_t fuse_direct_io(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + size_t nmax = write ? fc->max_write : fc->max_read; + loff_t pos = *ppos; + ssize_t res = 0; + struct fuse_req *req; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + while (count) { + size_t nres; + fl_owner_t owner = current->files; + size_t nbytes = min(count, nmax); + int err = fuse_get_user_pages(req, buf, &nbytes, write); + if (err) { + res = err; + break; + } + + if (write) + nres = fuse_send_write(req, file, pos, nbytes, owner); + else + nres = fuse_send_read(req, file, pos, nbytes, owner); + + fuse_release_user_pages(req, !write); + if (req->out.h.error) { + if (!res) + res = req->out.h.error; + break; + } else if (nres > nbytes) { + res = -EIO; + break; + } + count -= nres; + res += nres; + pos += nres; + buf += nres; + if (nres != nbytes) + break; + if (count) { + fuse_put_request(fc, req); + req = fuse_get_req(fc); + if (IS_ERR(req)) + break; + } + } + if (!IS_ERR(req)) + fuse_put_request(fc, req); + if (res > 0) + *ppos = pos; + + return res; +} +EXPORT_SYMBOL_GPL(fuse_direct_io); + +static ssize_t fuse_direct_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + ssize_t res; + struct inode *inode = file->f_path.dentry->d_inode; + + if (is_bad_inode(inode)) + return -EIO; + + res = fuse_direct_io(file, buf, count, ppos, 0); + + fuse_invalidate_attr(inode); + + return res; +} + +static ssize_t fuse_direct_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + ssize_t res; + + if (is_bad_inode(inode)) + return -EIO; + + /* Don't allow parallel writes to the same file */ + mutex_lock(&inode->i_mutex); + res = generic_write_checks(file, ppos, &count, 0); + if (!res) { + res = fuse_direct_io(file, buf, count, ppos, 1); + if (res > 0) + fuse_write_update_size(inode, *ppos); + } + mutex_unlock(&inode->i_mutex); + + fuse_invalidate_attr(inode); + + return res; +} + +static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) +{ + __free_page(req->pages[0]); + fuse_file_put(req->ff, false); 
+} + +static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + + list_del(&req->writepages_entry); + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + wake_up(&fi->page_waitq); +} + +/* Called under fc->lock, may release and reacquire it */ +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + struct fuse_inode *fi = get_fuse_inode(req->inode); + loff_t size = i_size_read(req->inode); + struct fuse_write_in *inarg = &req->misc.write.in; + + if (!fc->connected) + goto out_free; + + if (inarg->offset + PAGE_CACHE_SIZE <= size) { + inarg->size = PAGE_CACHE_SIZE; + } else if (inarg->offset < size) { + inarg->size = size & (PAGE_CACHE_SIZE - 1); + } else { + /* Got truncated off completely */ + goto out_free; + } + + req->in.args[1].size = inarg->size; + fi->writectr++; + fuse_request_send_background_locked(fc, req); + return; + + out_free: + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); + fuse_put_request(fc, req); + spin_lock(&fc->lock); +} + +/* + * If fi->writectr is positive (no truncate or fsync going on) send + * all queued writepage requests. + * + * Called with fc->lock + */ +void fuse_flush_writepages(struct inode *inode) +__releases(fc->lock) +__acquires(fc->lock) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + + while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { + req = list_entry(fi->queued_writes.next, struct fuse_req, list); + list_del_init(&req->list); + fuse_send_writepage(fc, req); + } +} + +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + + mapping_set_error(inode->i_mapping, req->out.h.error); + spin_lock(&fc->lock); + fi->writectr--; + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); +} + +static int fuse_writepage_locked(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + struct fuse_file *ff; + struct page *tmp_page; + + set_page_writeback(page); + + req = fuse_request_alloc_nofs(); + if (!req) + goto err; + + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto err_free; + + spin_lock(&fc->lock); + BUG_ON(list_empty(&fi->write_files)); + ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); + req->ff = fuse_file_get(ff); + spin_unlock(&fc->lock); + + fuse_write_fill(req, ff, page_offset(page), 0); + + copy_highpage(tmp_page, page); + req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; + req->in.argpages = 1; + req->num_pages = 1; + req->pages[0] = tmp_page; + req->page_offset = 0; + req->end = fuse_writepage_end; + req->inode = inode; + + inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + end_page_writeback(page); + + spin_lock(&fc->lock); + list_add(&req->writepages_entry, &fi->writepages); + list_add_tail(&req->list, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fc->lock); + + 
return 0; + +err_free: + fuse_request_free(req); +err: + end_page_writeback(page); + return -ENOMEM; +} + +static int fuse_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + + err = fuse_writepage_locked(page); + unlock_page(page); + + return err; +} + +static int fuse_launder_page(struct page *page) +{ + int err = 0; + if (clear_page_dirty_for_io(page)) { + struct inode *inode = page->mapping->host; + err = fuse_writepage_locked(page); + if (!err) + fuse_wait_on_page_writeback(inode, page->index); + } + return err; +} + +/* + * Write back dirty pages now, because there may not be any suitable + * open files later + */ +static void fuse_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +/* + * Wait for writeback against this page to complete before allowing it + * to be marked dirty again, and hence written back again, possibly + * before the previous writepage completed. + * + * Block here, instead of in ->writepage(), so that the userspace fs + * can only block processes actually operating on the filesystem. + * + * Otherwise unprivileged userspace fs would be able to block + * unrelated: + * + * - page migration + * - sync(2) + * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER + */ +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + /* + * Don't use page->mapping as it may become NULL from a + * concurrent truncate. + */ + struct inode *inode = vma->vm_file->f_mapping->host; + + fuse_wait_on_page_writeback(inode, page->index); + return 0; +} + +static const struct vm_operations_struct fuse_file_vm_ops = { + .close = fuse_vma_close, + .fault = filemap_fault, + .page_mkwrite = fuse_page_mkwrite, +}; + +static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); + } + file_accessed(file); + vma->vm_ops = &fuse_file_vm_ops; + return 0; +} + +static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + invalidate_inode_pages2(file->f_mapping); + + return generic_file_mmap(file, vma); +} + +static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, + struct file_lock *fl) +{ + switch (ffl->type) { + case F_UNLCK: + break; + + case F_RDLCK: + case F_WRLCK: + if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || + ffl->end < ffl->start) + return -EIO; + + fl->fl_start = ffl->start; + fl->fl_end = ffl->end; + fl->fl_pid = ffl->pid; + break; + + default: + return -EIO; + } + fl->fl_type = ffl->type; + return 0; +} + +static void fuse_lk_fill(struct fuse_req *req, struct file *file, + const struct file_lock *fl, int opcode, pid_t pid, + int flock) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_lk_in *arg = &req->misc.lk_in; + + arg->fh = ff->fh; + arg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + 
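/* copy the byte range, lock type and pid from the VFS file_lock into the request body */ +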
arg->lk.start = fl->fl_start; + arg->lk.end = fl->fl_end; + arg->lk.type = fl->fl_type; + arg->lk.pid = pid; + if (flock) + arg->lk_flags |= FUSE_LK_FLOCK; + req->in.h.opcode = opcode; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(*arg); + req->in.args[0].value = arg; +} + +static int fuse_getlk(struct file *file, struct file_lock *fl) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_lk_out outarg; + int err; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0); + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) + err = convert_fuse_file_lock(&outarg.lk, fl); + + return err; +} + +static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; + int err; + + if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + /* NLM needs asynchronous locks, which we don't support yet */ + return -ENOLCK; + } + + /* Unlock on close is handled by the flush method */ + if (fl->fl_flags & FL_CLOSE) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_lk_fill(req, file, fl, opcode, pid, flock); + fuse_request_send(fc, req); + err = req->out.h.error; + /* locking is restartable */ + if (err == -EINTR) + err = -ERESTARTSYS; + fuse_put_request(fc, req); + return err; +} + +static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (cmd == F_CANCELLK) { + err = 0; + } else if (cmd == F_GETLK) { + if (fc->no_lock) { + posix_test_lock(file, fl); + err = 0; + } else + err = fuse_getlk(file, fl); + } else { + if (fc->no_lock) + err = posix_lock_file(file, fl, NULL); + else + err = fuse_setlk(file, fl, 0); + } + return err; +} + +static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (fc->no_lock) { + err = flock_lock_file_wait(file, fl); + } else { + /* emulate flock with POSIX locks */ + fl->fl_owner = (fl_owner_t) file; + err = fuse_setlk(file, fl, 1); + } + + return err; +} + +static sector_t fuse_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_bmap_in inarg; + struct fuse_bmap_out outarg; + int err; + + if (!inode->i_sb->s_bdev || fc->no_bmap) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + inarg.block = block; + inarg.blocksize = inode->i_sb->s_blocksize; + req->in.h.opcode = FUSE_BMAP; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, 
req); + if (err == -ENOSYS) + fc->no_bmap = 1; + + return err ? 0 : outarg.block; +} + +static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t retval; + struct inode *inode = file->f_path.dentry->d_inode; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (retval) + goto exit; + offset += i_size_read(inode); + break; + case SEEK_CUR: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + retval = offset; + } +exit: + mutex_unlock(&inode->i_mutex); + return retval; +} + +static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, + unsigned int nr_segs, size_t bytes, bool to_user) +{ + struct iov_iter ii; + int page_idx = 0; + + if (!bytes) + return 0; + + iov_iter_init(&ii, iov, nr_segs, bytes, 0); + + while (iov_iter_count(&ii)) { + struct page *page = pages[page_idx++]; + size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); + void *kaddr; + + kaddr = kmap(page); + + while (todo) { + char __user *uaddr = ii.iov->iov_base + ii.iov_offset; + size_t iov_len = ii.iov->iov_len - ii.iov_offset; + size_t copy = min(todo, iov_len); + size_t left; + + if (!to_user) + left = copy_from_user(kaddr, uaddr, copy); + else + left = copy_to_user(uaddr, kaddr, copy); + + if (unlikely(left)) + return -EFAULT; + + iov_iter_advance(&ii, copy); + todo -= copy; + kaddr += copy; + } + + kunmap(page); + } + + return 0; +} + +/* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +{ + size_t n; + u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + + for (n = 0; n < count; n++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? 
*/ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + +/* + * For ioctls, there is no generic way to determine how much memory + * needs to be read and/or written. Furthermore, ioctls are allowed + * to dereference the passed pointer, so the parameter requires deep + * copying but FUSE has no idea whatsoever about what to copy in or + * out. + * + * This is solved by allowing FUSE server to retry ioctl with + * necessary in/out iovecs. Let's assume the ioctl implementation + * needs to read in the following structure. + * + * struct a { + * char *buf; + * size_t buflen; + * } + * + * On the first callout to FUSE server, inarg->in_size and + * inarg->out_size will be NULL; then, the server completes the ioctl + * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and + * the actual iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } + * + * which tells FUSE to copy in the requested area and retry the ioctl. + * On the second round, the server has access to the structure and + * from that it can tell what to look for next, so on the invocation, + * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, + * { .iov_base = a.buf, .iov_len = a.buflen } } + * + * FUSE will copy both struct a and the pointed buffer from the + * process doing the ioctl and retry ioctl with both struct a and the + * buffer. + * + * This time, FUSE server has everything it needs and completes ioctl + * without FUSE_IOCTL_RETRY which finishes the ioctl call. + * + * Copying data out works the same way. + * + * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel + * automatically initializes in and out iovs by decoding @cmd with + * _IOC_* macros and the server is not allowed to request RETRY. This + * limits ioctl data transfers to well-formed ioctls and is the forced + * behavior for all FUSE servers. + */ +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_ioctl_in inarg = { + .fh = ff->fh, + .cmd = cmd, + .arg = arg, + .flags = flags + }; + struct fuse_ioctl_out outarg; + struct fuse_req *req = NULL; + struct page **pages = NULL; + struct iovec *iov_page = NULL; + struct iovec *in_iov = NULL, *out_iov = NULL; + unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; + size_t in_size, out_size, transferred; + int err; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + + /* assume all the iovs returned by client always fits in a page */ + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + + err = -ENOMEM; + pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); + iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); + if (!pages || !iov_page) + goto out; + + /* + * If restricted, initialize IO parameters as encoded in @cmd. + * RETRY from server is not allowed. 
+ */ + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { + struct iovec *iov = iov_page; + + iov->iov_base = (void __user *)arg; + iov->iov_len = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + in_iov = iov; + in_iovs = 1; + } + + if (_IOC_DIR(cmd) & _IOC_READ) { + out_iov = iov; + out_iovs = 1; + } + } + + retry: + inarg.in_size = in_size = iov_length(in_iov, in_iovs); + inarg.out_size = out_size = iov_length(out_iov, out_iovs); + + /* + * Out data can be used either for actual out data or iovs, + * make sure there always is at least one page. + */ + out_size = max_t(size_t, out_size, PAGE_SIZE); + max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); + + /* make sure there are enough buffer pages and init request with them */ + err = -ENOMEM; + if (max_pages > FUSE_MAX_PAGES_PER_REQ) + goto out; + while (num_pages < max_pages) { + pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!pages[num_pages]) + goto out; + num_pages++; + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); + req->num_pages = num_pages; + + /* okay, let's send it to the client */ + req->in.h.opcode = FUSE_IOCTL; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + if (in_size) { + req->in.numargs++; + req->in.args[1].size = in_size; + req->in.argpages = 1; + + err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, + false); + if (err) + goto out; + } + + req->out.numargs = 2; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + req->out.args[1].size = out_size; + req->out.argpages = 1; + req->out.argvar = 1; + + fuse_request_send(fc, req); + err = req->out.h.error; + transferred = req->out.args[1].size; + fuse_put_request(fc, req); + req = NULL; + if (err) + goto out; + + /* did it ask for retry? */ + if (outarg.flags & FUSE_IOCTL_RETRY) { + void *vaddr; + + /* no retry if in restricted mode */ + err = -EIO; + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) + goto out; + + in_iovs = outarg.in_iovs; + out_iovs = outarg.out_iovs; + + /* + * Make sure things are in boundary, separate checks + * are to protect against overflow. + */ + err = -ENOMEM; + if (in_iovs > FUSE_IOCTL_MAX_IOV || + out_iovs > FUSE_IOCTL_MAX_IOV || + in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) + goto out; + + vaddr = kmap_atomic(pages[0], KM_USER0); + err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); + kunmap_atomic(vaddr, KM_USER0); + if (err) + goto out; + + in_iov = iov_page; + out_iov = in_iov + in_iovs; + + err = fuse_verify_ioctl_iov(in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(out_iov, out_iovs); + if (err) + goto out; + + goto retry; + } + + err = -EIO; + if (transferred > inarg.out_size) + goto out; + + err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + out: + if (req) + fuse_put_request(fc, req); + free_page((unsigned long) iov_page); + while (num_pages) + __free_page(pages[--num_pages]); + kfree(pages); + + return err ? 
err : outarg.result; +} +EXPORT_SYMBOL_GPL(fuse_do_ioctl); + +static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + if (is_bad_inode(inode)) + return -EIO; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long fuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_file_ioctl_common(file, cmd, arg, 0); +} + +static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); +} + +/* + * All files which have been polled are linked to RB tree + * fuse_conn->polled_files which is indexed by kh. Walk the tree and + * find the matching one. + */ +static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, + struct rb_node **parent_out) +{ + struct rb_node **link = &fc->polled_files.rb_node; + struct rb_node *last = NULL; + + while (*link) { + struct fuse_file *ff; + + last = *link; + ff = rb_entry(last, struct fuse_file, polled_node); + + if (kh < ff->kh) + link = &last->rb_left; + else if (kh > ff->kh) + link = &last->rb_right; + else + return link; + } + + if (parent_out) + *parent_out = last; + return link; +} + +/* + * The file is about to be polled. Make sure it's on the polled_files + * RB tree. Note that files once added to the polled_files tree are + * not removed before the file is released. This is because a file + * polled once is likely to be polled again. + */ +static void fuse_register_polled_file(struct fuse_conn *fc, + struct fuse_file *ff) +{ + spin_lock(&fc->lock); + if (RB_EMPTY_NODE(&ff->polled_node)) { + struct rb_node **link, *parent; + + link = fuse_find_polled_node(fc, ff->kh, &parent); + BUG_ON(*link); + rb_link_node(&ff->polled_node, parent, link); + rb_insert_color(&ff->polled_node, &fc->polled_files); + } + spin_unlock(&fc->lock); +} + +unsigned fuse_file_poll(struct file *file, poll_table *wait) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; + struct fuse_poll_out outarg; + struct fuse_req *req; + int err; + + if (fc->no_poll) + return DEFAULT_POLLMASK; + + poll_wait(file, &ff->poll_wait, wait); + + /* + * Ask for notification iff there's someone waiting for it. + * The client may ignore the flag and always notify. + */ + if (waitqueue_active(&ff->poll_wait)) { + inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; + fuse_register_polled_file(fc, ff); + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return POLLERR; + + req->in.h.opcode = FUSE_POLL; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) + return outarg.revents; + if (err == -ENOSYS) { + fc->no_poll = 1; + return DEFAULT_POLLMASK; + } + return POLLERR; +} +EXPORT_SYMBOL_GPL(fuse_file_poll); + +/* + * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and + * wakes up the poll waiters. 
+ */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg) +{ + u64 kh = outarg->kh; + struct rb_node **link; + + spin_lock(&fc->lock); + + link = fuse_find_polled_node(fc, kh, NULL); + if (*link) { + struct fuse_file *ff; + + ff = rb_entry(*link, struct fuse_file, polled_node); + wake_up_interruptible_sync(&ff->poll_wait); + } + + spin_unlock(&fc->lock); + return 0; +} + +static const struct file_operations fuse_file_operations = { + .llseek = fuse_file_llseek, + .read = do_sync_read, + .aio_read = fuse_file_aio_read, + .write = do_sync_write, + .aio_write = fuse_file_aio_write, + .mmap = fuse_file_mmap, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, + .flock = fuse_file_flock, + .splice_read = generic_file_splice_read, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, +}; + +static const struct file_operations fuse_direct_io_file_operations = { + .llseek = fuse_file_llseek, + .read = fuse_direct_read, + .write = fuse_direct_write, + .mmap = fuse_direct_mmap, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, + .flock = fuse_file_flock, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, + /* no splice_read */ +}; + +static const struct address_space_operations fuse_file_aops = { + .readpage = fuse_readpage, + .writepage = fuse_writepage, + .launder_page = fuse_launder_page, + .write_begin = fuse_write_begin, + .write_end = fuse_write_end, + .readpages = fuse_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .bmap = fuse_bmap, +}; + +void fuse_init_file_inode(struct inode *inode) +{ + inode->i_fop = &fuse_file_operations; + inode->i_data.a_ops = &fuse_file_aops; +} diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h new file mode 100644 index 00000000..f6215501 --- /dev/null +++ b/fs/fuse/fuse_i.h @@ -0,0 +1,769 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#ifndef _FS_FUSE_I_H +#define _FS_FUSE_I_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** Max number of pages that can be used in a single read request */ +#define FUSE_MAX_PAGES_PER_REQ 32 + +/** Bias for fi->writectr, meaning new writepages must not be sent */ +#define FUSE_NOWRITE INT_MIN + +/** It could be as large as PATH_MAX, but would that have any uses? */ +#define FUSE_NAME_MAX 1024 + +/** Number of dentries for each connection in the control filesystem */ +#define FUSE_CTL_NUM_DENTRIES 5 + +/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem + module will check permissions based on the file mode. 
Otherwise no + permission checking is done in the kernel */ +#define FUSE_DEFAULT_PERMISSIONS (1 << 0) + +/** If the FUSE_ALLOW_OTHER flag is given, then not only the user + doing the mount will be allowed to access the filesystem */ +#define FUSE_ALLOW_OTHER (1 << 1) + +/** List of active connections */ +extern struct list_head fuse_conn_list; + +/** Global mutex protecting fuse_conn_list and the control filesystem */ +extern struct mutex fuse_mutex; + +/** Module parameters */ +extern unsigned max_user_bgreq; +extern unsigned max_user_congthresh; + +/* One forget request */ +struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; +}; + +/** FUSE inode */ +struct fuse_inode { + /** Inode data */ + struct inode inode; + + /** Unique ID, which identifies the inode between userspace + * and kernel */ + u64 nodeid; + + /** Number of lookups on this inode */ + u64 nlookup; + + /** The request used for sending the FORGET message */ + struct fuse_forget_link *forget; + + /** Time in jiffies until the file attributes are valid */ + u64 i_time; + + /** The sticky bit in inode->i_mode may have been removed, so + preserve the original mode */ + mode_t orig_i_mode; + + /** 64 bit inode number */ + u64 orig_ino; + + /** Version of last attribute change */ + u64 attr_version; + + /** Files usable in writepage. Protected by fc->lock */ + struct list_head write_files; + + /** Writepages pending on truncate or fsync */ + struct list_head queued_writes; + + /** Number of sent writes, a negative bias (FUSE_NOWRITE) + * means more writes are blocked */ + int writectr; + + /** Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /** List of writepage requests (pending or sent) */ + struct list_head writepages; +}; + +struct fuse_conn; + +/** FUSE specific file data */ +struct fuse_file { + /** Fuse connection for this file */ + struct fuse_conn *fc; + + /** Request reserved for flush and release */ + struct fuse_req *reserved_req; + + /** Kernel file handle guaranteed to be unique */ + u64 kh; + + /** File handle used by userspace */ + u64 fh; + + /** Node id of this file */ + u64 nodeid; + + /** Refcount */ + atomic_t count; + + /** FOPEN_* flags returned by open */ + u32 open_flags; + + /** Entry on inode's write_files list */ + struct list_head write_entry; + + /** RB node to be linked on fuse_conn->polled_files */ + struct rb_node polled_node; + + /** Wait queue head for poll */ + wait_queue_head_t poll_wait; +}; + +/** One input argument of a request */ +struct fuse_in_arg { + unsigned size; + const void *value; +}; + +/** The request input */ +struct fuse_in { + /** The request header */ + struct fuse_in_header h; + + /** True if the data for the last argument is in req->pages */ + unsigned argpages:1; + + /** Number of arguments */ + unsigned numargs; + + /** Array of arguments */ + struct fuse_in_arg args[3]; +}; + +/** One output argument of a request */ +struct fuse_arg { + unsigned size; + void *value; +}; + +/** The request output */ +struct fuse_out { + /** Header returned from userspace */ + struct fuse_out_header h; + + /* + * The following bitfields are not changed during the request + * processing + */ + + /** Last argument is variable length (can be shorter than + arg->size) */ + unsigned argvar:1; + + /** Last argument is a list of pages to copy data to */ + unsigned argpages:1; + + /** Zero partially or not copied pages */ + unsigned page_zeroing:1; + + /** Pages may be replaced with new ones */ + unsigned page_replace:1; + + /** 
Number of arguments */ + unsigned numargs; + + /** Array of arguments */ + struct fuse_arg args[3]; +}; + +/** The request state */ +enum fuse_req_state { + FUSE_REQ_INIT = 0, + FUSE_REQ_PENDING, + FUSE_REQ_READING, + FUSE_REQ_SENT, + FUSE_REQ_WRITING, + FUSE_REQ_FINISHED +}; + +/** + * A request to the client + */ +struct fuse_req { + /** This can be on either the pending, processing or io lists in + fuse_conn */ + struct list_head list; + + /** Entry on the interrupts list */ + struct list_head intr_entry; + + /** refcount */ + atomic_t count; + + /** Unique ID for the interrupt request */ + u64 intr_unique; + + /* + * The following bitfields are either set once before the + * request is queued or setting/clearing them is protected by + * fuse_conn->lock + */ + + /** True if the request has a reply */ + unsigned isreply:1; + + /** Force sending of the request even if interrupted */ + unsigned force:1; + + /** The request was aborted */ + unsigned aborted:1; + + /** Request is sent in the background */ + unsigned background:1; + + /** The request has been interrupted */ + unsigned interrupted:1; + + /** Data is being copied to/from the request */ + unsigned locked:1; + + /** Request is counted as "waiting" */ + unsigned waiting:1; + + /** State of the request */ + enum fuse_req_state state; + + /** The request input */ + struct fuse_in in; + + /** The request output */ + struct fuse_out out; + + /** Used to wake up the task waiting for completion of request */ + wait_queue_head_t waitq; + + /** Data for asynchronous requests */ + union { + struct { + union { + struct fuse_release_in in; + struct work_struct work; + }; + struct path path; + } release; + struct fuse_init_in init_in; + struct fuse_init_out init_out; + struct cuse_init_in cuse_init_in; + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + } write; + struct fuse_notify_retrieve_in retrieve_in; + struct fuse_lk_in lk_in; + } misc; + + /** page vector */ + struct page *pages[FUSE_MAX_PAGES_PER_REQ]; + + /** number of pages in vector */ + unsigned num_pages; + + /** offset of data on first page */ + unsigned page_offset; + + /** File used in the request (or NULL) */ + struct fuse_file *ff; + + /** Inode used in the request or NULL */ + struct inode *inode; + + /** Link on fi->writepages */ + struct list_head writepages_entry; + + /** Request completion callback */ + void (*end)(struct fuse_conn *, struct fuse_req *); + + /** Request is stolen from fuse_file->reserved_req */ + struct file *stolen_file; +}; + +/** + * A Fuse connection. + * + * This structure is created when the filesystem is mounted, and is + * destroyed when the client device is closed and the filesystem is + * unmounted. 
+ */ +struct fuse_conn { + /** Lock protecting accesses to members of this structure */ + spinlock_t lock; + + /** Mutex protecting against directory alias creation */ + struct mutex inst_mutex; + + /** Refcount */ + atomic_t count; + + /** The user id for this mount */ + uid_t user_id; + + /** The group id for this mount */ + gid_t group_id; + + /** The fuse mount flags for this mount */ + unsigned flags; + + /** Maximum read size */ + unsigned max_read; + + /** Maximum write size */ + unsigned max_write; + + /** Readers of the connection are waiting on this */ + wait_queue_head_t waitq; + + /** The list of pending requests */ + struct list_head pending; + + /** The list of requests being processed */ + struct list_head processing; + + /** The list of requests under I/O */ + struct list_head io; + + /** The next unique kernel file handle */ + u64 khctr; + + /** rbtree of fuse_files waiting for poll events indexed by kh */ + struct rb_root polled_files; + + /** Maximum number of outstanding background requests */ + unsigned max_background; + + /** Number of background requests at which congestion starts */ + unsigned congestion_threshold; + + /** Number of requests currently in the background */ + unsigned num_background; + + /** Number of background requests currently queued for userspace */ + unsigned active_background; + + /** The list of background requests set aside for later queuing */ + struct list_head bg_queue; + + /** Pending interrupts */ + struct list_head interrupts; + + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + + /** Flag indicating if connection is blocked. This will be + the case before the INIT reply is received, and if there + are too many outstanding background requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; + + /** waitq for reserved requests */ + wait_queue_head_t reserved_req_waitq; + + /** The next unique request id */ + u64 reqctr; + + /** Connection established, cleared on umount, connection + abort and device release */ + unsigned connected; + + /** Connection failed (version mismatch). Cannot race with + setting other bitfields since it is only set once in INIT + reply, before any other request, and never cleared */ + unsigned conn_error:1; + + /** Connection successful. Only set in INIT */ + unsigned conn_init:1; + + /** Do readpages asynchronously? Only set in INIT */ + unsigned async_read:1; + + /** Do not send separate SETATTR request before open(O_TRUNC) */ + unsigned atomic_o_trunc:1; + + /** Filesystem supports NFS exporting. Only set in INIT */ + unsigned export_support:1; + + /** Set if bdi is valid */ + unsigned bdi_initialized:1; + + /* + * The following bitfields are only for optimization purposes + * and hence races in setting them will not cause malfunction + */ + + /** Is fsync not implemented by fs? */ + unsigned no_fsync:1; + + /** Is fsyncdir not implemented by fs? */ + unsigned no_fsyncdir:1; + + /** Is flush not implemented by fs? */ + unsigned no_flush:1; + + /** Is setxattr not implemented by fs? */ + unsigned no_setxattr:1; + + /** Is getxattr not implemented by fs? */ + unsigned no_getxattr:1; + + /** Is listxattr not implemented by fs? */ + unsigned no_listxattr:1; + + /** Is removexattr not implemented by fs? */ + unsigned no_removexattr:1; + + /** Are file locking primitives not implemented by fs? 
*/ + unsigned no_lock:1; + + /** Is access not implemented by fs? */ + unsigned no_access:1; + + /** Is create not implemented by fs? */ + unsigned no_create:1; + + /** Is interrupt not implemented by fs? */ + unsigned no_interrupt:1; + + /** Is bmap not implemented by fs? */ + unsigned no_bmap:1; + + /** Is poll not implemented by fs? */ + unsigned no_poll:1; + + /** Do multi-page cached writes */ + unsigned big_writes:1; + + /** Don't apply umask to creation modes */ + unsigned dont_mask:1; + + /** The number of requests waiting for completion */ + atomic_t num_waiting; + + /** Negotiated minor version */ + unsigned minor; + + /** Backing dev info */ + struct backing_dev_info bdi; + + /** Entry on the fuse_conn_list */ + struct list_head entry; + + /** Device ID from super block */ + dev_t dev; + + /** Dentries in the control filesystem */ + struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; + + /** number of dentries used in the above array */ + int ctl_ndents; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; + + /** Key for lock owner ID scrambling */ + u32 scramble_key[4]; + + /** Reserved request for the DESTROY message */ + struct fuse_req *destroy_req; + + /** Version counter for attribute changes */ + u64 attr_version; + + /** Called on final put */ + void (*release)(struct fuse_conn *); + + /** Super block for this connection. */ + struct super_block *sb; + + /** Read/write semaphore to hold when accessing sb. */ + struct rw_semaphore killsb; +}; + +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct fuse_conn *get_fuse_conn(struct inode *inode) +{ + return get_fuse_conn_super(inode->i_sb); +} + +static inline struct fuse_inode *get_fuse_inode(struct inode *inode) +{ + return container_of(inode, struct fuse_inode, inode); +} + +static inline u64 get_node_id(struct inode *inode) +{ + return get_fuse_inode(inode)->nodeid; +} + +/** Device operations */ +extern const struct file_operations fuse_dev_operations; + +extern const struct dentry_operations fuse_dentry_operations; + +/** + * Inode to nodeid comparison. 
+ */ +int fuse_inode_eq(struct inode *inode, void *_nodeidp); + +/** + * Get a filled in inode + */ +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode); + +/** + * Send FORGET command + */ +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup); + +struct fuse_forget_link *fuse_alloc_forget(void); + +/** + * Initialize READ or READDIR request + */ +void fuse_read_fill(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, int opcode); + +/** + * Send OPEN or OPENDIR request + */ +int fuse_open_common(struct inode *inode, struct file *file, bool isdir); + +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); +struct fuse_file *fuse_file_get(struct fuse_file *ff); +void fuse_file_free(struct fuse_file *ff); +void fuse_finish_open(struct inode *inode, struct file *file); + +void fuse_sync_release(struct fuse_file *ff, int flags); + +/** + * Send RELEASE or RELEASEDIR request + */ +void fuse_release_common(struct file *file, int opcode); + +/** + * Send FSYNC or FSYNCDIR request + */ +int fuse_fsync_common(struct file *file, int datasync, int isdir); + +/** + * Notify poll wakeup + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg); + +/** + * Initialize file operations on a regular file + */ +void fuse_init_file_inode(struct inode *inode); + +/** + * Initialize inode operations on regular files and special files + */ +void fuse_init_common(struct inode *inode); + +/** + * Initialize inode and file operations on a directory + */ +void fuse_init_dir(struct inode *inode); + +/** + * Initialize inode operations on a symlink + */ +void fuse_init_symlink(struct inode *inode); + +/** + * Change attributes of an inode + */ +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid); + +/** + * Initialize the client device + */ +int fuse_dev_init(void); + +/** + * Cleanup the client device + */ +void fuse_dev_cleanup(void); + +int fuse_ctl_init(void); +void fuse_ctl_cleanup(void); + +/** + * Allocate a request + */ +struct fuse_req *fuse_request_alloc(void); + +struct fuse_req *fuse_request_alloc_nofs(void); + +/** + * Free a request + */ +void fuse_request_free(struct fuse_req *req); + +/** + * Get a request, may fail with -ENOMEM + */ +struct fuse_req *fuse_get_req(struct fuse_conn *fc); + +/** + * Gets a requests for a file operation, always succeeds + */ +struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); + +/** + * Decrement reference count of a request. If count goes to zero free + * the request. 
+ */ +void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); + +/** + * Send a request (synchronous) + */ +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); + +/** + * Send a request in the background + */ +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); + +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req); + +/* Abort all requests */ +void fuse_abort_conn(struct fuse_conn *fc); + +/** + * Invalidate inode attributes + */ +void fuse_invalidate_attr(struct inode *inode); + +void fuse_invalidate_entry_cache(struct dentry *entry); + +/** + * Acquire reference to fuse_conn + */ +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); + +void fuse_conn_kill(struct fuse_conn *fc); + +/** + * Initialize fuse_conn + */ +void fuse_conn_init(struct fuse_conn *fc); + +/** + * Release reference to fuse_conn + */ +void fuse_conn_put(struct fuse_conn *fc); + +/** + * Add connection to control filesystem + */ +int fuse_ctl_add_conn(struct fuse_conn *fc); + +/** + * Remove connection from control filesystem + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc); + +/** + * Is file type valid? + */ +int fuse_valid_type(int m); + +/** + * Is task allowed to perform filesystem operation? + */ +int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); + +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); + +int fuse_update_attributes(struct inode *inode, struct kstat *stat, + struct file *file, bool *refreshed); + +void fuse_flush_writepages(struct inode *inode); + +void fuse_set_nowrite(struct inode *inode); +void fuse_release_nowrite(struct inode *inode); + +u64 fuse_get_attr_version(struct fuse_conn *fc); + +/** + * File-system tells the kernel to invalidate cache for the given node id. + */ +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len); + +/** + * File-system tells the kernel to invalidate parent attributes and + * the dentry matching parent/name. + */ +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + struct qstr *name); + +int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + bool isdir); +ssize_t fuse_direct_io(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, int write); +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags); +unsigned fuse_file_poll(struct file *file, poll_table *wait); +int fuse_dev_release(struct inode *inode, struct file *file); + +void fuse_write_update_size(struct inode *inode, loff_t pos); + +#endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c new file mode 100644 index 00000000..69a1e0f0 --- /dev/null +++ b/fs/fuse/inode.c @@ -0,0 +1,1263 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. 
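fuse_get_req()/fuse_put_request() and fuse_conn_get()/fuse_conn_put() follow the usual last-put-frees convention. A minimal user-space sketch of that convention, with invented names and a C11 atomic counter standing in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct conn {
	atomic_int count;
	void (*release)(struct conn *);	/* called on the final put */
};

static struct conn *conn_get(struct conn *c)
{
	atomic_fetch_add(&c->count, 1);
	return c;
}

static void conn_put(struct conn *c)
{
	/* fetch_sub returns the old value, so 1 means we dropped the last ref */
	if (atomic_fetch_sub(&c->count, 1) == 1)
		c->release(c);
}

static void conn_free(struct conn *c)
{
	printf("released\n");
	free(c);
}

int main(void)
{
	struct conn *c = malloc(sizeof(*c));

	if (!c)
		return 1;
	atomic_init(&c->count, 1);
	c->release = conn_free;

	conn_get(c);	/* a second user takes a reference */
	conn_put(c);	/* ...and drops it again */
	conn_put(c);	/* the initial reference goes away: object is freed */
	return 0;
}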
+*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Miklos Szeredi "); +MODULE_DESCRIPTION("Filesystem in Userspace"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *fuse_inode_cachep; +struct list_head fuse_conn_list; +DEFINE_MUTEX(fuse_mutex); + +static int set_global_limit(const char *val, struct kernel_param *kp); + +unsigned max_user_bgreq; +module_param_call(max_user_bgreq, set_global_limit, param_get_uint, + &max_user_bgreq, 0644); +__MODULE_PARM_TYPE(max_user_bgreq, "uint"); +MODULE_PARM_DESC(max_user_bgreq, + "Global limit for the maximum number of backgrounded requests an " + "unprivileged user can set"); + +unsigned max_user_congthresh; +module_param_call(max_user_congthresh, set_global_limit, param_get_uint, + &max_user_congthresh, 0644); +__MODULE_PARM_TYPE(max_user_congthresh, "uint"); +MODULE_PARM_DESC(max_user_congthresh, + "Global limit for the maximum congestion threshold an " + "unprivileged user can set"); + +#define FUSE_SUPER_MAGIC 0x65735546 + +#define FUSE_DEFAULT_BLKSIZE 512 + +/** Maximum number of outstanding background requests */ +#define FUSE_DEFAULT_MAX_BACKGROUND 12 + +/** Congestion starts at 75% of maximum */ +#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) + +struct fuse_mount_data { + int fd; + unsigned rootmode; + unsigned user_id; + unsigned group_id; + unsigned fd_present:1; + unsigned rootmode_present:1; + unsigned user_id_present:1; + unsigned group_id_present:1; + unsigned flags; + unsigned max_read; + unsigned blksize; +}; + +struct fuse_forget_link *fuse_alloc_forget() +{ + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); +} + +static struct inode *fuse_alloc_inode(struct super_block *sb) +{ + struct inode *inode; + struct fuse_inode *fi; + + inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL); + if (!inode) + return NULL; + + fi = get_fuse_inode(inode); + fi->i_time = 0; + fi->nodeid = 0; + fi->nlookup = 0; + fi->attr_version = 0; + fi->writectr = 0; + fi->orig_ino = 0; + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + INIT_LIST_HEAD(&fi->writepages); + init_waitqueue_head(&fi->page_waitq); + fi->forget = fuse_alloc_forget(); + if (!fi->forget) { + kmem_cache_free(fuse_inode_cachep, inode); + return NULL; + } + + return inode; +} + +static void fuse_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(fuse_inode_cachep, inode); +} + +static void fuse_destroy_inode(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + BUG_ON(!list_empty(&fi->write_files)); + BUG_ON(!list_empty(&fi->queued_writes)); + kfree(fi->forget); + call_rcu(&inode->i_rcu, fuse_i_callback); +} + +static void fuse_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (inode->i_sb->s_flags & MS_ACTIVE) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); + fi->forget = NULL; + } +} + +static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) +{ + if (*flags & MS_MANDLOCK) + return -EINVAL; + + return 0; +} + +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. 
+ */ +static ino_t fuse_squash_ino(u64 ino64) +{ + ino_t ino = (ino_t) ino64; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8; + return ino; +} + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->attr_version = ++fc->attr_version; + fi->i_time = attr_valid; + + inode->i_ino = fuse_squash_ino(attr->ino); + inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + inode->i_nlink = attr->nlink; + inode->i_uid = attr->uid; + inode->i_gid = attr->gid; + inode->i_blocks = attr->blocks; + inode->i_atime.tv_sec = attr->atime; + inode->i_atime.tv_nsec = attr->atimensec; + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; + + if (attr->blksize != 0) + inode->i_blkbits = ilog2(attr->blksize); + else + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + + /* + * Don't set the sticky bit in i_mode, unless we want the VFS + * to check permissions. This prevents failures due to the + * check in may_delete(). + */ + fi->orig_i_mode = inode->i_mode; + if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + inode->i_mode &= ~S_ISVTX; + + fi->orig_ino = attr->ino; +} + +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t oldsize; + + spin_lock(&fc->lock); + if (attr_version != 0 && fi->attr_version > attr_version) { + spin_unlock(&fc->lock); + return; + } + + fuse_change_attributes_common(inode, attr, attr_valid); + + oldsize = inode->i_size; + i_size_write(inode, attr->size); + spin_unlock(&fc->lock); + + if (S_ISREG(inode->i_mode) && oldsize != attr->size) { + truncate_pagecache(inode, oldsize, attr->size); + invalidate_inode_pages2(inode->i_mapping); + } +} + +static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) +{ + inode->i_mode = attr->mode & S_IFMT; + inode->i_size = attr->size; + if (S_ISREG(inode->i_mode)) { + fuse_init_common(inode); + fuse_init_file_inode(inode); + } else if (S_ISDIR(inode->i_mode)) + fuse_init_dir(inode); + else if (S_ISLNK(inode->i_mode)) + fuse_init_symlink(inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + fuse_init_common(inode); + init_special_inode(inode, inode->i_mode, + new_decode_dev(attr->rdev)); + } else + BUG(); +} + +int fuse_inode_eq(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + if (get_node_id(inode) == nodeid) + return 1; + else + return 0; +} + +static int fuse_inode_set(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + get_fuse_inode(inode)->nodeid = nodeid; + return 0; +} + +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct inode *inode; + struct fuse_inode *fi; + struct fuse_conn *fc = get_fuse_conn_super(sb); + + retry: + inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); + if (!inode) + return NULL; + + if ((inode->i_state & I_NEW)) { + inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_generation = generation; + inode->i_data.backing_dev_info = &fc->bdi; + fuse_init_inode(inode, attr); + unlock_new_inode(inode); + } else 
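The squashing above only matters when ino_t is narrower than the 64-bit FUSE node id; in that case the high half is folded into the low half with XOR. A small stand-alone sketch of the same folding, using fixed-width types for illustration:

#include <stdint.h>
#include <stdio.h>

/* Fold a 64-bit node id into 32 bits, as done when sizeof(ino_t) < 8. */
static uint32_t squash_ino(uint64_t ino64)
{
	return (uint32_t)ino64 ^ (uint32_t)(ino64 >> 32);
}

int main(void)
{
	uint64_t ino64 = 0x123456789abcdef0ULL;

	printf("0x%llx -> 0x%x\n",
	       (unsigned long long)ino64, (unsigned)squash_ino(ino64));
	return 0;
}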
if ((inode->i_mode ^ attr->mode) & S_IFMT) { + /* Inode has changed type, any I/O on the old should fail */ + make_bad_inode(inode); + iput(inode); + goto retry; + } + + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + fuse_change_attributes(inode, attr, attr_valid, attr_version); + + return inode; +} + +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len) +{ + struct inode *inode; + pgoff_t pg_start; + pgoff_t pg_end; + + inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + return -ENOENT; + + fuse_invalidate_attr(inode); + if (offset >= 0) { + pg_start = offset >> PAGE_CACHE_SHIFT; + if (len <= 0) + pg_end = -1; + else + pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + } + iput(inode); + return 0; +} + +static void fuse_umount_begin(struct super_block *sb) +{ + fuse_abort_conn(get_fuse_conn_super(sb)); +} + +static void fuse_send_destroy(struct fuse_conn *fc) +{ + struct fuse_req *req = fc->destroy_req; + if (req && fc->conn_init) { + fc->destroy_req = NULL; + req->in.h.opcode = FUSE_DESTROY; + req->force = 1; + fuse_request_send(fc, req); + fuse_put_request(fc, req); + } +} + +static void fuse_bdi_destroy(struct fuse_conn *fc) +{ + if (fc->bdi_initialized) + bdi_destroy(&fc->bdi); +} + +void fuse_conn_kill(struct fuse_conn *fc) +{ + spin_lock(&fc->lock); + fc->connected = 0; + fc->blocked = 0; + spin_unlock(&fc->lock); + /* Flush all readers on this fs */ + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + wake_up_all(&fc->waitq); + wake_up_all(&fc->blocked_waitq); + wake_up_all(&fc->reserved_req_waitq); + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); + fuse_bdi_destroy(fc); +} +EXPORT_SYMBOL_GPL(fuse_conn_kill); + +static void fuse_put_super(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fuse_send_destroy(fc); + fuse_conn_kill(fc); + fuse_conn_put(fc); +} + +static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) +{ + stbuf->f_type = FUSE_SUPER_MAGIC; + stbuf->f_bsize = attr->bsize; + stbuf->f_frsize = attr->frsize; + stbuf->f_blocks = attr->blocks; + stbuf->f_bfree = attr->bfree; + stbuf->f_bavail = attr->bavail; + stbuf->f_files = attr->files; + stbuf->f_ffree = attr->ffree; + stbuf->f_namelen = attr->namelen; + /* fsid is left zero */ +} + +static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_req *req; + struct fuse_statfs_out outarg; + int err; + + if (!fuse_allow_task(fc, current)) { + buf->f_type = FUSE_SUPER_MAGIC; + return 0; + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&outarg, 0, sizeof(outarg)); + req->in.numargs = 0; + req->in.h.opcode = FUSE_STATFS; + req->in.h.nodeid = get_node_id(dentry->d_inode); + req->out.numargs = 1; + req->out.args[0].size = + fc->minor < 4 ? 
FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + if (!err) + convert_fuse_statfs(buf, &outarg.st); + fuse_put_request(fc, req); + return err; +} + +enum { + OPT_FD, + OPT_ROOTMODE, + OPT_USER_ID, + OPT_GROUP_ID, + OPT_DEFAULT_PERMISSIONS, + OPT_ALLOW_OTHER, + OPT_MAX_READ, + OPT_BLKSIZE, + OPT_ERR +}; + +static const match_table_t tokens = { + {OPT_FD, "fd=%u"}, + {OPT_ROOTMODE, "rootmode=%o"}, + {OPT_USER_ID, "user_id=%u"}, + {OPT_GROUP_ID, "group_id=%u"}, + {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, + {OPT_ALLOW_OTHER, "allow_other"}, + {OPT_MAX_READ, "max_read=%u"}, + {OPT_BLKSIZE, "blksize=%u"}, + {OPT_ERR, NULL} +}; + +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) +{ + char *p; + memset(d, 0, sizeof(struct fuse_mount_data)); + d->max_read = ~0; + d->blksize = FUSE_DEFAULT_BLKSIZE; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + int value; + substring_t args[MAX_OPT_ARGS]; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case OPT_FD: + if (match_int(&args[0], &value)) + return 0; + d->fd = value; + d->fd_present = 1; + break; + + case OPT_ROOTMODE: + if (match_octal(&args[0], &value)) + return 0; + if (!fuse_valid_type(value)) + return 0; + d->rootmode = value; + d->rootmode_present = 1; + break; + + case OPT_USER_ID: + if (match_int(&args[0], &value)) + return 0; + d->user_id = value; + d->user_id_present = 1; + break; + + case OPT_GROUP_ID: + if (match_int(&args[0], &value)) + return 0; + d->group_id = value; + d->group_id_present = 1; + break; + + case OPT_DEFAULT_PERMISSIONS: + d->flags |= FUSE_DEFAULT_PERMISSIONS; + break; + + case OPT_ALLOW_OTHER: + d->flags |= FUSE_ALLOW_OTHER; + break; + + case OPT_MAX_READ: + if (match_int(&args[0], &value)) + return 0; + d->max_read = value; + break; + + case OPT_BLKSIZE: + if (!is_bdev || match_int(&args[0], &value)) + return 0; + d->blksize = value; + break; + + default: + return 0; + } + } + + if (!d->fd_present || !d->rootmode_present || + !d->user_id_present || !d->group_id_present) + return 0; + + return 1; +} + +static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb); + + seq_printf(m, ",user_id=%u", fc->user_id); + seq_printf(m, ",group_id=%u", fc->group_id); + if (fc->flags & FUSE_DEFAULT_PERMISSIONS) + seq_puts(m, ",default_permissions"); + if (fc->flags & FUSE_ALLOW_OTHER) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (mnt->mnt_sb->s_bdev && + mnt->mnt_sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", mnt->mnt_sb->s_blocksize); + return 0; +} + +void fuse_conn_init(struct fuse_conn *fc) +{ + memset(fc, 0, sizeof(*fc)); + spin_lock_init(&fc->lock); + mutex_init(&fc->inst_mutex); + init_rwsem(&fc->killsb); + atomic_set(&fc->count, 1); + init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->reserved_req_waitq); + INIT_LIST_HEAD(&fc->pending); + INIT_LIST_HEAD(&fc->processing); + INIT_LIST_HEAD(&fc->io); + INIT_LIST_HEAD(&fc->interrupts); + INIT_LIST_HEAD(&fc->bg_queue); + INIT_LIST_HEAD(&fc->entry); + fc->forget_list_tail = &fc->forget_list_head; + atomic_set(&fc->num_waiting, 0); + fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; + fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; + fc->khctr = 0; + fc->polled_files = RB_ROOT; + fc->reqctr = 
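parse_fuse_opt() above walks the comma-separated mount options with strsep() and match_token(), and refuses the mount unless fd, rootmode, user_id and group_id are all present. A user-space sketch of the same shape, with sscanf() standing in for match_token() (the helper names are illustrative):

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

struct mount_data {
	int fd;
	unsigned rootmode, user_id, group_id;
};

static int parse_opts(char *opt, struct mount_data *d)
{
	char *p;
	int seen = 0;

	while ((p = strsep(&opt, ",")) != NULL) {
		if (!*p)
			continue;
		if (sscanf(p, "fd=%d", &d->fd) == 1 ||
		    sscanf(p, "rootmode=%o", &d->rootmode) == 1 ||
		    sscanf(p, "user_id=%u", &d->user_id) == 1 ||
		    sscanf(p, "group_id=%u", &d->group_id) == 1)
			seen++;
		else
			return 0;	/* unknown option: reject the mount */
	}
	/* the kernel tracks each option with a *_present bit; a count is
	 * enough for this sketch */
	return seen == 4;
}

int main(void)
{
	char opts[] = "fd=7,rootmode=40000,user_id=1000,group_id=1000";
	struct mount_data d = { 0 };

	if (parse_opts(opts, &d))
		printf("fd=%d rootmode=%o uid=%u gid=%u\n",
		       d.fd, d.rootmode, d.user_id, d.group_id);
	return 0;
}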
0; + fc->blocked = 1; + fc->attr_version = 1; + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); +} +EXPORT_SYMBOL_GPL(fuse_conn_init); + +void fuse_conn_put(struct fuse_conn *fc) +{ + if (atomic_dec_and_test(&fc->count)) { + if (fc->destroy_req) + fuse_request_free(fc->destroy_req); + mutex_destroy(&fc->inst_mutex); + fc->release(fc); + } +} +EXPORT_SYMBOL_GPL(fuse_conn_put); + +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) +{ + atomic_inc(&fc->count); + return fc; +} +EXPORT_SYMBOL_GPL(fuse_conn_get); + +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +{ + struct fuse_attr attr; + memset(&attr, 0, sizeof(attr)); + + attr.mode = mode; + attr.ino = FUSE_ROOT_ID; + attr.nlink = 1; + return fuse_iget(sb, 1, 0, &attr, 0, 0); +} + +struct fuse_inode_handle { + u64 nodeid; + u32 generation; +}; + +static struct dentry *fuse_get_dentry(struct super_block *sb, + struct fuse_inode_handle *handle) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct inode *inode; + struct dentry *entry; + int err = -ESTALE; + + if (handle->nodeid == 0) + goto out_err; + + inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + if (!inode) { + struct fuse_entry_out outarg; + struct qstr name; + + if (!fc->export_support) + goto out_err; + + name.len = 1; + name.name = "."; + err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, + &inode); + if (err && err != -ENOENT) + goto out_err; + if (err || !inode) { + err = -ESTALE; + goto out_err; + } + err = -EIO; + if (get_node_id(inode) != handle->nodeid) + goto out_iput; + } + err = -ESTALE; + if (inode->i_generation != handle->generation) + goto out_iput; + + entry = d_obtain_alias(inode); + if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(entry); + + return entry; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + bool encode_parent = connectable && !S_ISDIR(inode->i_mode); + int len = encode_parent ? 6 : 3; + u64 nodeid; + u32 generation; + + if (*max_len < len) { + *max_len = len; + return 255; + } + + nodeid = get_fuse_inode(inode)->nodeid; + generation = inode->i_generation; + + fh[0] = (u32)(nodeid >> 32); + fh[1] = (u32)(nodeid & 0xffffffff); + fh[2] = generation; + + if (encode_parent) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + nodeid = get_fuse_inode(parent)->nodeid; + generation = parent->i_generation; + spin_unlock(&dentry->d_lock); + + fh[3] = (u32)(nodeid >> 32); + fh[4] = (u32)(nodeid & 0xffffffff); + fh[5] = generation; + } + + *max_len = len; + return encode_parent ? 
0x82 : 0x81; +} + +static struct dentry *fuse_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle handle; + + if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + return NULL; + + handle.nodeid = (u64) fid->raw[0] << 32; + handle.nodeid |= (u64) fid->raw[1]; + handle.generation = fid->raw[2]; + return fuse_get_dentry(sb, &handle); +} + +static struct dentry *fuse_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle parent; + + if (fh_type != 0x82 || fh_len < 6) + return NULL; + + parent.nodeid = (u64) fid->raw[3] << 32; + parent.nodeid |= (u64) fid->raw[4]; + parent.generation = fid->raw[5]; + return fuse_get_dentry(sb, &parent); +} + +static struct dentry *fuse_get_parent(struct dentry *child) +{ + struct inode *child_inode = child->d_inode; + struct fuse_conn *fc = get_fuse_conn(child_inode); + struct inode *inode; + struct dentry *parent; + struct fuse_entry_out outarg; + struct qstr name; + int err; + + if (!fc->export_support) + return ERR_PTR(-ESTALE); + + name.len = 2; + name.name = ".."; + err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), + &name, &outarg, &inode); + if (err) { + if (err == -ENOENT) + return ERR_PTR(-ESTALE); + return ERR_PTR(err); + } + + parent = d_obtain_alias(inode); + if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(parent); + + return parent; +} + +static const struct export_operations fuse_export_operations = { + .fh_to_dentry = fuse_fh_to_dentry, + .fh_to_parent = fuse_fh_to_parent, + .encode_fh = fuse_encode_fh, + .get_parent = fuse_get_parent, +}; + +static const struct super_operations fuse_super_operations = { + .alloc_inode = fuse_alloc_inode, + .destroy_inode = fuse_destroy_inode, + .evict_inode = fuse_evict_inode, + .drop_inode = generic_delete_inode, + .remount_fs = fuse_remount_fs, + .put_super = fuse_put_super, + .umount_begin = fuse_umount_begin, + .statfs = fuse_statfs, + .show_options = fuse_show_options, +}; + +static void sanitize_global_limit(unsigned *limit) +{ + if (*limit == 0) + *limit = ((num_physpages << PAGE_SHIFT) >> 13) / + sizeof(struct fuse_req); + + if (*limit >= 1 << 16) + *limit = (1 << 16) - 1; +} + +static int set_global_limit(const char *val, struct kernel_param *kp) +{ + int rv; + + rv = param_set_uint(val, kp); + if (rv) + return rv; + + sanitize_global_limit((unsigned *)kp->arg); + + return 0; +} + +static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) +{ + int cap_sys_admin = capable(CAP_SYS_ADMIN); + + if (arg->minor < 13) + return; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + if (arg->max_background) { + fc->max_background = arg->max_background; + + if (!cap_sys_admin && fc->max_background > max_user_bgreq) + fc->max_background = max_user_bgreq; + } + if (arg->congestion_threshold) { + fc->congestion_threshold = arg->congestion_threshold; + + if (!cap_sys_admin && + fc->congestion_threshold > max_user_congthresh) + fc->congestion_threshold = max_user_congthresh; + } +} + +static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_init_out *arg = &req->misc.init_out; + + if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) + fc->conn_error = 1; + else { + unsigned long ra_pages; + + process_init_limits(fc, arg); + + if (arg->minor >= 6) { + ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; + if (arg->flags & FUSE_ASYNC_READ) + 
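fuse_encode_fh() and fuse_fh_to_dentry() above store the 64-bit node id as two 32-bit words followed by the inode generation. A stand-alone sketch of that packing and the matching decode (types and helper names are invented for the example):

#include <stdint.h>
#include <stdio.h>

struct handle {
	uint64_t nodeid;
	uint32_t generation;
};

static void encode_fh(uint32_t fh[3], const struct handle *h)
{
	fh[0] = (uint32_t)(h->nodeid >> 32);
	fh[1] = (uint32_t)(h->nodeid & 0xffffffff);
	fh[2] = h->generation;
}

static struct handle decode_fh(const uint32_t fh[3])
{
	struct handle h;

	h.nodeid = ((uint64_t)fh[0] << 32) | fh[1];
	h.generation = fh[2];
	return h;
}

int main(void)
{
	struct handle in = { .nodeid = 0x0000000100000002ULL, .generation = 3 };
	struct handle out;
	uint32_t fh[3];

	encode_fh(fh, &in);
	out = decode_fh(fh);
	printf("nodeid=0x%llx gen=%u\n",
	       (unsigned long long)out.nodeid, (unsigned)out.generation);
	return 0;
}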
fc->async_read = 1; + if (!(arg->flags & FUSE_POSIX_LOCKS)) + fc->no_lock = 1; + if (arg->flags & FUSE_ATOMIC_O_TRUNC) + fc->atomic_o_trunc = 1; + if (arg->minor >= 9) { + /* LOOKUP has dependency on proto version */ + if (arg->flags & FUSE_EXPORT_SUPPORT) + fc->export_support = 1; + } + if (arg->flags & FUSE_BIG_WRITES) + fc->big_writes = 1; + if (arg->flags & FUSE_DONT_MASK) + fc->dont_mask = 1; + } else { + ra_pages = fc->max_read / PAGE_CACHE_SIZE; + fc->no_lock = 1; + } + + fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); + fc->minor = arg->minor; + fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; + fc->max_write = max_t(unsigned, 4096, fc->max_write); + fc->conn_init = 1; + } + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); +} + +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_init_in *arg = &req->misc.init_in; + + arg->major = FUSE_KERNEL_VERSION; + arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; + arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; + req->in.h.opcode = FUSE_INIT; + req->in.numargs = 1; + req->in.args[0].size = sizeof(*arg); + req->in.args[0].value = arg; + req->out.numargs = 1; + /* Variable length argument used for backward compatibility + with interface version < 7.5. Rest of init_out is zeroed + by do_get_request(), so a short reply is not a problem */ + req->out.argvar = 1; + req->out.args[0].size = sizeof(struct fuse_init_out); + req->out.args[0].value = &req->misc.init_out; + req->end = process_init_reply; + fuse_request_send_background(fc, req); +} + +static void fuse_free_conn(struct fuse_conn *fc) +{ + kfree(fc); +} + +static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) +{ + int err; + + fc->bdi.name = "fuse"; + fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + /* fuse does it's own writeback accounting */ + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + + err = bdi_init(&fc->bdi); + if (err) + return err; + + fc->bdi_initialized = 1; + + if (sb->s_bdev) { + err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", + MAJOR(fc->dev), MINOR(fc->dev)); + } else { + err = bdi_register_dev(&fc->bdi, fc->dev); + } + + if (err) + return err; + + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. 
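process_init_reply() keeps only the features that the kernel advertised in FUSE_INIT and the userspace server echoed back; anything missing becomes a no_* fallback on the connection. A simplified sketch of that intersection, with made-up flag values rather than the real protocol constants:

#include <stdio.h>

/* Illustrative flag values, not the real FUSE_* protocol constants. */
#define WANT_ASYNC_READ  0x1u
#define WANT_POSIX_LOCKS 0x2u
#define WANT_BIG_WRITES  0x4u

struct conn_caps {
	int async_read;
	int no_lock;	/* fall back to local file locking */
	int big_writes;
};

static void apply_init_reply(struct conn_caps *c, unsigned advertised,
			     unsigned replied)
{
	unsigned agreed = advertised & replied;

	c->async_read = !!(agreed & WANT_ASYNC_READ);
	c->no_lock    = !(agreed & WANT_POSIX_LOCKS);
	c->big_writes = !!(agreed & WANT_BIG_WRITES);
}

int main(void)
{
	struct conn_caps caps;

	/* The server supports async reads and big writes, but no POSIX locks. */
	apply_init_reply(&caps,
			 WANT_ASYNC_READ | WANT_POSIX_LOCKS | WANT_BIG_WRITES,
			 WANT_ASYNC_READ | WANT_BIG_WRITES);
	printf("async_read=%d no_lock=%d big_writes=%d\n",
	       caps.async_read, caps.no_lock, caps.big_writes);
	return 0;
}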
+ * + * Privileged users can raise it by writing to + * + * /sys/class/bdi//max_ratio + */ + bdi_set_max_ratio(&fc->bdi, 1); + + return 0; +} + +static int fuse_fill_super(struct super_block *sb, void *data, int silent) +{ + struct fuse_conn *fc; + struct inode *root; + struct fuse_mount_data d; + struct file *file; + struct dentry *root_dentry; + struct fuse_req *init_req; + int err; + int is_bdev = sb->s_bdev != NULL; + + err = -EINVAL; + if (sb->s_flags & MS_MANDLOCK) + goto err; + + sb->s_flags &= ~MS_NOSEC; + + if (!parse_fuse_opt((char *) data, &d, is_bdev)) + goto err; + + if (is_bdev) { +#ifdef CONFIG_BLOCK + err = -EINVAL; + if (!sb_set_blocksize(sb, d.blksize)) + goto err; +#endif + } else { + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + } + sb->s_magic = FUSE_SUPER_MAGIC; + sb->s_op = &fuse_super_operations; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_export_op = &fuse_export_operations; + + file = fget(d.fd); + err = -EINVAL; + if (!file) + goto err; + + if (file->f_op != &fuse_dev_operations) + goto err_fput; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) + goto err_fput; + + fuse_conn_init(fc); + + fc->dev = sb->s_dev; + fc->sb = sb; + err = fuse_bdi_init(fc, sb); + if (err) + goto err_put_conn; + + sb->s_bdi = &fc->bdi; + + /* Handle umasking inside the fuse code */ + if (sb->s_flags & MS_POSIXACL) + fc->dont_mask = 1; + sb->s_flags |= MS_POSIXACL; + + fc->release = fuse_free_conn; + fc->flags = d.flags; + fc->user_id = d.user_id; + fc->group_id = d.group_id; + fc->max_read = max_t(unsigned, 4096, d.max_read); + + /* Used by get_root_inode() */ + sb->s_fs_info = fc; + + err = -ENOMEM; + root = fuse_get_root_inode(sb, d.rootmode); + if (!root) + goto err_put_conn; + + root_dentry = d_alloc_root(root); + if (!root_dentry) { + iput(root); + goto err_put_conn; + } + /* only now - we want root dentry with NULL ->d_op */ + sb->s_d_op = &fuse_dentry_operations; + + init_req = fuse_request_alloc(); + if (!init_req) + goto err_put_root; + + if (is_bdev) { + fc->destroy_req = fuse_request_alloc(); + if (!fc->destroy_req) + goto err_free_init_req; + } + + mutex_lock(&fuse_mutex); + err = -EINVAL; + if (file->private_data) + goto err_unlock; + + err = fuse_ctl_add_conn(fc); + if (err) + goto err_unlock; + + list_add_tail(&fc->entry, &fuse_conn_list); + sb->s_root = root_dentry; + fc->connected = 1; + file->private_data = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); + + fuse_send_init(fc, init_req); + + return 0; + + err_unlock: + mutex_unlock(&fuse_mutex); + err_free_init_req: + fuse_request_free(init_req); + err_put_root: + dput(root_dentry); + err_put_conn: + fuse_bdi_destroy(fc); + fuse_conn_put(fc); + err_fput: + fput(file); + err: + return err; +} + +static struct dentry *fuse_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); +} + +static void fuse_kill_sb_anon(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_anon_super(sb); +} + +static struct file_system_type fuse_fs_type = { + .owner = THIS_MODULE, + .name = "fuse", + .fs_flags = FS_HAS_SUBTYPE, + .mount = fuse_mount, + .kill_sb = fuse_kill_sb_anon, +}; + +#ifdef 
CONFIG_BLOCK +static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data) +{ + return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); +} + +static void fuse_kill_sb_blk(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_block_super(sb); +} + +static struct file_system_type fuseblk_fs_type = { + .owner = THIS_MODULE, + .name = "fuseblk", + .mount = fuse_mount_blk, + .kill_sb = fuse_kill_sb_blk, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, +}; + +static inline int register_fuseblk(void) +{ + return register_filesystem(&fuseblk_fs_type); +} + +static inline void unregister_fuseblk(void) +{ + unregister_filesystem(&fuseblk_fs_type); +} +#else +static inline int register_fuseblk(void) +{ + return 0; +} + +static inline void unregister_fuseblk(void) +{ +} +#endif + +static void fuse_inode_init_once(void *foo) +{ + struct inode *inode = foo; + + inode_init_once(inode); +} + +static int __init fuse_fs_init(void) +{ + int err; + + err = register_filesystem(&fuse_fs_type); + if (err) + goto out; + + err = register_fuseblk(); + if (err) + goto out_unreg; + + fuse_inode_cachep = kmem_cache_create("fuse_inode", + sizeof(struct fuse_inode), + 0, SLAB_HWCACHE_ALIGN, + fuse_inode_init_once); + err = -ENOMEM; + if (!fuse_inode_cachep) + goto out_unreg2; + + return 0; + + out_unreg2: + unregister_fuseblk(); + out_unreg: + unregister_filesystem(&fuse_fs_type); + out: + return err; +} + +static void fuse_fs_cleanup(void) +{ + unregister_filesystem(&fuse_fs_type); + unregister_fuseblk(); + kmem_cache_destroy(fuse_inode_cachep); +} + +static struct kobject *fuse_kobj; +static struct kobject *connections_kobj; + +static int fuse_sysfs_init(void) +{ + int err; + + fuse_kobj = kobject_create_and_add("fuse", fs_kobj); + if (!fuse_kobj) { + err = -ENOMEM; + goto out_err; + } + + connections_kobj = kobject_create_and_add("connections", fuse_kobj); + if (!connections_kobj) { + err = -ENOMEM; + goto out_fuse_unregister; + } + + return 0; + + out_fuse_unregister: + kobject_put(fuse_kobj); + out_err: + return err; +} + +static void fuse_sysfs_cleanup(void) +{ + kobject_put(connections_kobj); + kobject_put(fuse_kobj); +} + +static int __init fuse_init(void) +{ + int res; + + printk(KERN_INFO "fuse init (API version %i.%i)\n", + FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); + + INIT_LIST_HEAD(&fuse_conn_list); + res = fuse_fs_init(); + if (res) + goto err; + + res = fuse_dev_init(); + if (res) + goto err_fs_cleanup; + + res = fuse_sysfs_init(); + if (res) + goto err_dev_cleanup; + + res = fuse_ctl_init(); + if (res) + goto err_sysfs_cleanup; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + return 0; + + err_sysfs_cleanup: + fuse_sysfs_cleanup(); + err_dev_cleanup: + fuse_dev_cleanup(); + err_fs_cleanup: + fuse_fs_cleanup(); + err: + return res; +} + +static void __exit fuse_exit(void) +{ + printk(KERN_DEBUG "fuse exit\n"); + + fuse_ctl_cleanup(); + fuse_sysfs_cleanup(); + fuse_fs_cleanup(); + fuse_dev_cleanup(); +} + +module_init(fuse_init); +module_exit(fuse_exit); diff --git a/fs/generic_acl.c b/fs/generic_acl.c new file mode 100644 index 00000000..8f26d1a5 --- /dev/null +++ b/fs/generic_acl.c @@ -0,0 +1,225 @@ +/* + * (C) 2005 Andreas Gruenbacher + * + * This file is released under the GPL. + * + * Generic ACL support for in-memory filesystems. 
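Both fuse_fill_super() and fuse_init() above rely on goto-based unwinding: resources are acquired in order and, on any failure, released in reverse by falling through labelled cleanup steps. A minimal user-space illustration of the idiom (the resources here are just heap allocations, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
	char *a, *b, *c;
	int err = -1;

	a = malloc(16);
	if (!a)
		goto err;
	b = malloc(16);
	if (!b)
		goto err_free_a;
	c = malloc(16);
	if (!c)
		goto err_free_b;

	printf("all resources acquired\n");
	/* a real setup() would keep these; the sketch just tears them down */
	free(c);
	free(b);
	free(a);
	return 0;

 err_free_b:
	free(b);
 err_free_a:
	free(a);
 err:
	return err;
}

int main(void)
{
	return setup();
}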
+ */ + +#include +#include +#include +#include +#include +#include + + +static size_t +generic_acl_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + struct posix_acl *acl; + const char *xname; + size_t size; + + acl = get_cached_acl(dentry->d_inode, type); + if (!acl) + return 0; + posix_acl_release(acl); + + switch (type) { + case ACL_TYPE_ACCESS: + xname = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xname = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return 0; + } + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +static int +generic_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + + acl = get_cached_acl(dentry->d_inode, type); + if (!acl) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +generic_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int error; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (acl) { + mode_t mode; + + error = posix_acl_valid(acl); + if (error) + goto failed; + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + goto failed; + inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME; + if (error == 0) { + posix_acl_release(acl); + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + error = -EINVAL; + goto failed; + } + break; + } + } + set_cached_acl(inode, type, acl); + error = 0; +failed: + posix_acl_release(acl); + return error; +} + +/** + * generic_acl_init - Take care of acl inheritance at @inode create time + * + * Files created inside a directory with a default ACL inherit the + * directory's default ACL. + */ +int +generic_acl_init(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + mode_t mode = inode->i_mode; + int error; + + inode->i_mode = mode & ~current_umask(); + if (!S_ISLNK(inode->i_mode)) + acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); + if (acl) { + struct posix_acl *clone; + + if (S_ISDIR(inode->i_mode)) { + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + set_cached_acl(inode, ACL_TYPE_DEFAULT, clone); + posix_acl_release(clone); + } + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); + } + posix_acl_release(clone); + } + error = 0; + +cleanup: + posix_acl_release(acl); + return error; +} + +/** + * generic_acl_chmod - change the access acl of @inode upon chmod() + * + * A chmod also changes the permissions of the owner, group/mask, and + * other ACL entries. 
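generic_acl_init() implements the create-time rule that a default ACL on the parent directory, when present, takes over the role of the umask. A reduced sketch of that rule, with the ACL collapsed to a boolean for illustration:

#include <stdio.h>
#include <sys/types.h>

static mode_t creation_mode(mode_t requested, mode_t umask_bits,
			    int dir_has_default_acl)
{
	if (!dir_has_default_acl)
		return requested & ~umask_bits;
	/* with a default ACL, posix_acl_create_masq() masks the mode instead */
	return requested;
}

int main(void)
{
	printf("%04o\n", (unsigned)creation_mode(0666, 022, 0));	/* 0644 */
	printf("%04o\n", (unsigned)creation_mode(0666, 022, 1));	/* 0666 */
	return 0;
}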
+ */ +int +generic_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + if (acl) { + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); + posix_acl_release(clone); + } + return error; +} + +int +generic_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + } else { + struct posix_acl *acl; + + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + } + return -EAGAIN; +} + +const struct xattr_handler generic_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; + +const struct xattr_handler generic_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig new file mode 100644 index 00000000..c465ae06 --- /dev/null +++ b/fs/gfs2/Kconfig @@ -0,0 +1,38 @@ +config GFS2_FS + tristate "GFS2 file system support" + depends on (64BIT || LBDAF) + select DLM if GFS2_FS_LOCKING_DLM + select CONFIGFS_FS if GFS2_FS_LOCKING_DLM + select SYSFS if GFS2_FS_LOCKING_DLM + select IP_SCTP if DLM_SCTP + select FS_POSIX_ACL + select CRC32 + select QUOTACTL + help + A cluster filesystem. + + Allows a cluster of computers to simultaneously use a block device + that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads + and writes to the block device like a local filesystem, but also uses + a lock module to allow the computers coordinate their I/O so + filesystem consistency is maintained. One of the nifty features of + GFS is perfect consistency -- changes made to the filesystem on one + machine show up immediately on all other machines in the cluster. + + To use the GFS2 filesystem in a cluster, you will need to enable + the locking module below. Documentation and utilities for GFS2 can + be found here: http://sources.redhat.com/cluster + + The "nolock" lock module is now built in to GFS2 by default. If + you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 + networking. + +config GFS2_FS_LOCKING_DLM + bool "GFS2 DLM locking" + depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG + help + Multiple node locking module for GFS2 + + Most users of GFS2 will require this. It provides the locking + interface between GFS2 and the DLM, which is required to use GFS2 + in a cluster environment. 
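generic_acl_access_handler and generic_acl_default_handler plug into the VFS as prefix-keyed xattr handlers. A stand-alone sketch of that table-dispatch pattern, with an invented handler table and payload:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct xattr_handler {
	const char *prefix;
	int (*get)(const char *rest, char *buf, size_t size);
};

static int acl_access_get(const char *rest, char *buf, size_t size)
{
	(void)rest;
	return snprintf(buf, size, "rwx");	/* placeholder payload */
}

static const struct xattr_handler handlers[] = {
	{ "system.posix_acl_access", acl_access_get },
	{ NULL, NULL }
};

static int do_getxattr(const char *name, char *buf, size_t size)
{
	const struct xattr_handler *h;

	for (h = handlers; h->prefix; h++) {
		size_t n = strlen(h->prefix);

		if (strncmp(name, h->prefix, n) == 0)
			return h->get(name + n, buf, size);
	}
	return -1;	/* the kernel would return -EOPNOTSUPP here */
}

int main(void)
{
	char buf[16];

	if (do_getxattr("system.posix_acl_access", buf, sizeof(buf)) > 0)
		printf("%s\n", buf);
	return 0;
}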
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile new file mode 100644 index 00000000..86128202 --- /dev/null +++ b/fs/gfs2/Makefile @@ -0,0 +1,10 @@ +ccflags-y := -I$(src) +obj-$(CONFIG_GFS2_FS) += gfs2.o +gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ + glops.o log.o lops.o main.o meta_io.o \ + aops.o dentry.o export.o file.o \ + ops_fstype.o inode.o quota.o \ + recovery.o rgrp.o super.o sys.o trans.o util.o + +gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o + diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c new file mode 100644 index 00000000..cbc07155 --- /dev/null +++ b/fs/gfs2/acl.c @@ -0,0 +1,354 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" + +static const char *gfs2_acl_name(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return GFS2_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return GFS2_POSIX_ACL_DEFAULT; + } + return NULL; +} + +static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) +{ + struct posix_acl *acl; + const char *name; + char *data; + int len; + + if (!ip->i_eattr) + return NULL; + + acl = get_cached_acl(&ip->i_inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + name = gfs2_acl_name(type); + if (name == NULL) + return ERR_PTR(-EINVAL); + + len = gfs2_xattr_acl_get(ip, name, &data); + if (len < 0) + return ERR_PTR(len); + if (len == 0) + return NULL; + + acl = posix_acl_from_xattr(data, len); + kfree(data); + return acl; +} + +/** + * gfs2_check_acl - Check an ACL to see if we're allowed to do something + * @inode: the file we want to do something to + * @mask: what we want to do + * + * Returns: errno + */ + +int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + int error; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +static int gfs2_set_mode(struct inode *inode, mode_t mode) +{ + int error = 0; + + if (mode != inode->i_mode) { + struct iattr iattr; + + iattr.ia_valid = ATTR_MODE; + iattr.ia_mode = mode; + + error = gfs2_setattr_simple(GFS2_I(inode), &iattr); + } + + return error; +} + +static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) +{ + int error; + int len; + char *data; + const char *name = gfs2_acl_name(type); + + BUG_ON(name == NULL); + len = posix_acl_to_xattr(acl, NULL, 0); + if (len == 0) + return 0; + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(acl, data, len); + if (error < 0) + goto out; + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); + if (!error) + set_cached_acl(inode, type, acl); +out: + kfree(data); + return error; +} + +int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) +{ + struct 
gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct posix_acl *acl, *clone; + mode_t mode = inode->i_mode; + int error = 0; + + if (!sdp->sd_args.ar_posix_acl) + return 0; + if (S_ISLNK(inode->i_mode)) + return 0; + + acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) { + mode &= ~current_umask(); + if (mode != inode->i_mode) + error = gfs2_set_mode(inode, mode); + return error; + } + + if (S_ISDIR(inode->i_mode)) { + error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl); + if (error) + goto out; + } + + clone = posix_acl_clone(acl, GFP_NOFS); + error = -ENOMEM; + if (!clone) + goto out; + posix_acl_release(acl); + acl = clone; + + error = posix_acl_create_masq(acl, &mode); + if (error < 0) + goto out; + if (error == 0) + goto munge; + + error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl); + if (error) + goto out; +munge: + error = gfs2_set_mode(inode, mode); +out: + posix_acl_release(acl); + return error; +} + +int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) +{ + struct posix_acl *acl, *clone; + char *data; + unsigned int len; + int error; + + acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) + return gfs2_setattr_simple(ip, attr); + + clone = posix_acl_clone(acl, GFP_NOFS); + error = -ENOMEM; + if (!clone) + goto out; + posix_acl_release(acl); + acl = clone; + + error = posix_acl_chmod_masq(acl, attr->ia_mode); + if (!error) { + len = posix_acl_to_xattr(acl, NULL, 0); + data = kmalloc(len, GFP_NOFS); + error = -ENOMEM; + if (data == NULL) + goto out; + posix_acl_to_xattr(acl, data, len); + error = gfs2_xattr_acl_chmod(ip, attr, data); + kfree(data); + set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); + } + +out: + posix_acl_release(acl); + return error; +} + +static int gfs2_acl_type(const char *name) +{ + if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0) + return ACL_TYPE_ACCESS; + if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0) + return ACL_TYPE_DEFAULT; + return -EINVAL; +} + +static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int xtype) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct posix_acl *acl; + int type; + int error; + + if (!sdp->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + + type = gfs2_acl_type(name); + if (type < 0) + return type; + + acl = gfs2_acl_get(GFS2_I(inode), type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int xtype) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct posix_acl *acl = NULL; + int error = 0, type; + + if (!sdp->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + + type = gfs2_acl_type(name); + if (type < 0) + return type; + if (flags & XATTR_CREATE) + return -EINVAL; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!value) + goto set_acl; + + acl = posix_acl_from_xattr(value, size); + if (!acl) { + /* + * acl_set_file(3) may request that we set default ACLs with + * zero length -- defend (gracefully) against that here. 
+ */ + goto out; + } + if (IS_ERR(acl)) { + error = PTR_ERR(acl); + goto out; + } + + error = posix_acl_valid(acl); + if (error) + goto out_release; + + error = -EINVAL; + if (acl->a_count > GFS2_ACL_MAX_ENTRIES) + goto out_release; + + if (type == ACL_TYPE_ACCESS) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + + if (error <= 0) { + posix_acl_release(acl); + acl = NULL; + + if (error < 0) + return error; + } + + error = gfs2_set_mode(inode, mode); + if (error) + goto out_release; + } + +set_acl: + error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); + if (!error) { + if (acl) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); + } +out_release: + posix_acl_release(acl); +out: + return error; +} + +const struct xattr_handler gfs2_xattr_system_handler = { + .prefix = XATTR_SYSTEM_PREFIX, + .flags = GFS2_EATYPE_SYS, + .get = gfs2_xattr_system_get, + .set = gfs2_xattr_system_set, +}; + diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h new file mode 100644 index 00000000..a93907c8 --- /dev/null +++ b/fs/gfs2/acl.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __ACL_DOT_H__ +#define __ACL_DOT_H__ + +#include "incore.h" + +#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" +#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" +#define GFS2_ACL_MAX_ENTRIES 25 + +extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int); +extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); +extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); +extern const struct xattr_handler gfs2_xattr_system_handler; + +#endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c new file mode 100644 index 00000000..f9fbbe96 --- /dev/null +++ b/fs/gfs2/aops.c @@ -0,0 +1,1182 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "trans.h" +#include "rgrp.h" +#include "super.h" +#include "util.h" +#include "glops.h" + + +void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) +{ + struct buffer_head *head = page_buffers(page); + unsigned int bsize = head->b_size; + struct buffer_head *bh; + unsigned int start, end; + + for (bh = head, start = 0; bh != head || !start; + bh = bh->b_this_page, start = end) { + end = start + bsize; + if (end <= from || start >= to) + continue; + if (gfs2_is_jdata(ip)) + set_buffer_uptodate(bh); + gfs2_trans_add_bh(ip->i_gl, bh, 0); + } +} + +/** + * gfs2_get_block_noalloc - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int error; + + error = gfs2_block_map(inode, lblock, bh_result, 0); + if (error) + return error; + if (!buffer_mapped(bh_result)) + return -EIO; + return 0; +} + +static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + return gfs2_block_map(inode, lblock, bh_result, 0); +} + +/** + * gfs2_writepage_common - Common bits of writepage + * @page: The page to be written + * @wbc: The writeback control + * + * Returns: 1 if writepage is ok, otherwise an error code or zero if no error. + */ + +static int gfs2_writepage_common(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + goto out; + if (current->journal_info) + goto redirty; + /* Is the page fully outside i_size? 
(truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + goto out; + } + return 1; +redirty: + redirty_page_for_writepage(wbc, page); +out: + unlock_page(page); + return 0; +} + +/** + * gfs2_writeback_writepage - Write page for writeback mappings + * @page: The page + * @wbc: The writeback control + * + */ + +static int gfs2_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + return nobh_writepage(page, gfs2_get_block_noalloc, wbc); +} + +/** + * gfs2_ordered_writepage - Write page for ordered data files + * @page: The page to write + * @wbc: The writeback control + * + */ + +static int gfs2_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1); + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * __gfs2_jdata_writepage - The core of jdata writepage + * @page: The page to write + * @wbc: The writeback control + * + * This is shared between writepage and writepages and implements the + * core of the writepage operation. If a transaction is required then + * PageChecked will have been set and the transaction will have + * already been started before this is called. + */ + +static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (PageChecked(page)) { + ClearPageChecked(page); + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + } + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * gfs2_jdata_writepage - Write complete page + * @page: Page to write + * + * Returns: errno + * + */ + +static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + int ret; + int done_trans = 0; + + if (PageChecked(page)) { + if (wbc->sync_mode != WB_SYNC_ALL) + goto out_ignore; + ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (ret) + goto out_ignore; + done_trans = 1; + } + ret = gfs2_writepage_common(page, wbc); + if (ret > 0) + ret = __gfs2_jdata_writepage(page, wbc); + if (done_trans) + gfs2_trans_end(sdp); + return ret; + +out_ignore: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +/** + * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: Write-back control + * + * For the data=writeback case we can already ignore buffer heads + * and write whole extents at once. This is a big reduction in the + * number of I/O requests we send and the bmap calls we make in this case. 
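The writepage paths above drop pages that lie entirely beyond i_size (a truncate may be in progress): the last page that can still hold data is i_size >> PAGE_SHIFT, and a zero offset within it means even that page is empty. A small stand-alone sketch of the test (PAGE_SHIFT and the sample sizes are illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static int page_beyond_eof(unsigned long index, unsigned long long i_size)
{
	unsigned long end_index = (unsigned long)(i_size >> PAGE_SHIFT);
	unsigned long offset = (unsigned long)(i_size & (PAGE_SIZE - 1));

	return index > end_index || (index == end_index && !offset);
}

int main(void)
{
	unsigned long long i_size = 2 * PAGE_SIZE;	/* EOF on a page boundary */

	printf("page 1 beyond EOF: %d\n", page_beyond_eof(1, i_size));	/* 0 */
	printf("page 2 beyond EOF: %d\n", page_beyond_eof(2, i_size));	/* 1 */
	return 0;
}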
+ */ +static int gfs2_writeback_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); +} + +/** + * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * @mapping: The mapping + * @wbc: The writeback control + * @writepage: The writepage function to call for each page + * @pvec: The vector of pages + * @nr_pages: The number of pages to write + * + * Returns: non-zero if loop should terminate, zero otherwise + */ + +static int gfs2_write_jdata_pagevec(struct address_space *mapping, + struct writeback_control *wbc, + struct pagevec *pvec, + int nr_pages, pgoff_t end) +{ + struct inode *inode = mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset = i_size & (PAGE_CACHE_SIZE-1); + unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); + int i; + int ret; + + ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); + if (ret < 0) + return ret; + + for(i = 0; i < nr_pages; i++) { + struct page *page = pvec->pages[i]; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + ret = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + /* Is the page fully outside i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + continue; + } + + ret = __gfs2_jdata_writepage(page, wbc); + + if (ret || (--(wbc->nr_to_write) <= 0)) + ret = 1; + } + gfs2_trans_end(sdp); + return ret; +} + +/** + * gfs2_write_cache_jdata - Like write_cache_pages but different + * @mapping: The mapping to write + * @wbc: The writeback control + * @writepage: The writepage function to call + * @data: The data to pass to writepage + * + * The reason that we use our own function here is that we need to + * start transactions before we grab page locks. This allows us + * to get the ordering right. 
+ */ + +static int gfs2_write_cache_jdata(struct address_space *mapping, + struct writeback_control *wbc) +{ + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; + int scanned = 0; + int range_whole = 0; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } + +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + scanned = 1; + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); + if (ret) + done = 1; + if (ret > 0) + ret = 0; + + pagevec_release(&pvec); + cond_resched(); + } + + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; +} + + +/** + * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: The writeback control + * + */ + +static int gfs2_jdata_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + int ret; + + ret = gfs2_write_cache_jdata(mapping, wbc); + if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { + gfs2_log_flush(sdp, ip->i_gl); + ret = gfs2_write_cache_jdata(mapping, wbc); + } + return ret; +} + +/** + * stuffed_readpage - Fill in a Linux page with stuffed file data + * @ip: the inode + * @page: the page + * + * Returns: errno + */ + +static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *dibh; + u64 dsize = i_size_read(&ip->i_inode); + void *kaddr; + int error; + + /* + * Due to the order of unstuffing files and ->fault(), we can be + * asked for a zero page in the case of a stuffed file being extended, + * so we need to supply one here. It doesn't happen often. + */ + if (unlikely(page->index)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + kaddr = kmap_atomic(page, KM_USER0); + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = (dibh->b_size - sizeof(struct gfs2_dinode)); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); + brelse(dibh); + SetPageUptodate(page); + + return 0; +} + + +/** + * __gfs2_readpage - readpage + * @file: The file to read a page for + * @page: The page to read + * + * This is the core of gfs2's readpage. Its used by the internal file + * reading code as in that case we already hold the glock. Also its + * called by gfs2_readpage() once the required lock has been granted. 
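stuffed_readpage() above handles "stuffed" files, whose data lives inline in the dinode block: the inline bytes are copied into the page and the remainder of the page is zero-filled. A user-space sketch of that copy-and-pad step (sizes and names are illustrative):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static void fill_page_from_inline(char *page, const char *inline_data,
				  size_t dsize)
{
	if (dsize > PAGE_SIZE)
		dsize = PAGE_SIZE;	/* inline data never exceeds one page */
	memcpy(page, inline_data, dsize);
	memset(page + dsize, 0, PAGE_SIZE - dsize);	/* pad with zeroes */
}

int main(void)
{
	static char page[PAGE_SIZE];
	const char data[] = "hello, stuffed file";

	fill_page_from_inline(page, data, sizeof(data) - 1);
	printf("%s\n", page);
	return 0;
}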
+ * + */ + +static int __gfs2_readpage(void *file, struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + int error; + + if (gfs2_is_stuffed(ip)) { + error = stuffed_readpage(ip, page); + unlock_page(page); + } else { + error = mpage_readpage(page, gfs2_block_map); + } + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + return error; +} + +/** + * gfs2_readpage - read a page of a file + * @file: The file to read + * @page: The page of the file + * + * This deals with the locking required. We have to unlock and + * relock the page in order to get the locking in the right + * order. + */ + +static int gfs2_readpage(struct file *file, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder gh; + int error; + + unlock_page(page); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (unlikely(error)) + goto out; + error = AOP_TRUNCATED_PAGE; + lock_page(page); + if (page->mapping == mapping && !PageUptodate(page)) + error = __gfs2_readpage(file, page); + else + unlock_page(page); + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + if (error && error != AOP_TRUNCATED_PAGE) + lock_page(page); + return error; +} + +/** + * gfs2_internal_read - read an internal file + * @ip: The gfs2 inode + * @ra_state: The readahead state (or NULL for no readahead) + * @buf: The buffer to fill + * @pos: The file position + * @size: The amount to read + * + */ + +int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + unsigned long index = *pos / PAGE_CACHE_SIZE; + unsigned offset = *pos & (PAGE_CACHE_SIZE - 1); + unsigned copied = 0; + unsigned amt; + struct page *page; + void *p; + + do { + amt = size - copied; + if (offset + size > PAGE_CACHE_SIZE) + amt = PAGE_CACHE_SIZE - offset; + page = read_cache_page(mapping, index, __gfs2_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + p = kmap_atomic(page, KM_USER0); + memcpy(buf + copied, p + offset, amt); + kunmap_atomic(p, KM_USER0); + mark_page_accessed(page); + page_cache_release(page); + copied += amt; + index++; + offset = 0; + } while(copied < size); + (*pos) += size; + return size; +} + +/** + * gfs2_readpages - Read a bunch of pages at once + * + * Some notes: + * 1. This is only for readahead, so we can simply ignore any things + * which are slightly inconvenient (such as locking conflicts between + * the page lock and the glock) and return having done no I/O. Its + * obviously not something we'd want to do on too regular a basis. + * Any I/O we ignore at this time will be done via readpage later. + * 2. We don't handle stuffed files here we let readpage do the honours. + * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. 
+ */ + +static int gfs2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (unlikely(ret)) + goto out_uninit; + if (!gfs2_is_stuffed(ip)) + ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = -EIO; + return ret; +} + +/** + * gfs2_write_begin - Begin to write to a file + * @file: The file to write to + * @mapping: The mapping in which to write + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused by GFS2) + * + * Returns: errno + */ + +static int gfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + int alloc_required; + int error = 0; + struct gfs2_alloc *al = NULL; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + struct page *page; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + if (&ip->i_inode == sdp->sd_rindex) { + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &m_ip->i_gh); + if (unlikely(error)) { + gfs2_glock_dq(&ip->i_gh); + goto out_uninit; + } + } + + alloc_required = gfs2_write_alloc_required(ip, pos, len); + + if (alloc_required || gfs2_is_jdata(ip)) + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); + + if (alloc_required) { + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? 
data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + if (&ip->i_inode == sdp->sd_rindex) + rblocks += 2 * RES_STATFS; + if (alloc_required) + rblocks += gfs2_rg_blocks(al); + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = -ENOMEM; + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); + *pagep = page; + if (unlikely(!page)) + goto out_endtrans; + + if (gfs2_is_stuffed(ip)) { + error = 0; + if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + error = gfs2_unstuff_dinode(ip, page); + if (error == 0) + goto prepare_write; + } else if (!PageUptodate(page)) { + error = stuffed_readpage(ip, page); + } + goto out; + } + +prepare_write: + error = __block_write_begin(page, from, len, gfs2_block_map); +out: + if (error == 0) + return 0; + + unlock_page(page); + page_cache_release(page); + + gfs2_trans_end(sdp); + if (pos + len > ip->i_inode.i_size) + gfs2_trim_blocks(&ip->i_inode); + goto out_trans_fail; + +out_endtrans: + gfs2_trans_end(sdp); +out_trans_fail: + if (alloc_required) { + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); + } +out_unlock: + if (&ip->i_inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + +/** + * adjust_fs_space - Adjusts the free space available due to gfs2_grow + * @inode: the rindex inode + */ +static void adjust_fs_space(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; + u64 fs_total, new_free; + + /* Total up the file system space, according to the latest rindex. */ + fs_total = gfs2_ri_total(sdp); + if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) + return; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (fs_total > (m_sc->sc_total + l_sc->sc_total)) + new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); + else + new_free = 0; + spin_unlock(&sdp->sd_statfs_spin); + fs_warn(sdp, "File system extended by %llu blocks.\n", + (unsigned long long)new_free); + gfs2_statfs_change(sdp, new_free, new_free, 0); + + if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) + goto out; + update_statfs(sdp, m_bh, l_bh); + brelse(l_bh); +out: + brelse(m_bh); +} + +/** + * gfs2_stuffed_write_end - Write end for stuffed files + * @inode: The inode + * @dibh: The buffer_head containing the on-disk inode + * @pos: The file position + * @len: The length of the write + * @copied: How much was actually copied by the VFS + * @page: The page + * + * This copies the data from the page into the inode block after + * the inode data structure itself. 
+ * + * Returns: errno + */ +static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + u64 to = pos + copied; + void *kaddr; + unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + + BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buf + pos, kaddr + pos, copied); + memset(kaddr + pos + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (copied) { + if (inode->i_size < to) + i_size_write(inode, to); + gfs2_dinode_out(ip, di); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) { + adjust_fs_space(inode); + ip->i_gh.gh_flags |= GL_NOCACHE; + } + + brelse(dibh); + gfs2_trans_end(sdp); + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return copied; +} + +/** + * gfs2_write_end + * @file: The file to write to + * @mapping: The address space to write to + * @pos: The file position + * @len: The length of the data + * @copied: + * @page: The page that has been written + * @fsdata: The fsdata (unused in GFS2) + * + * The main write_end function for GFS2. We have a separate one for + * stuffed files as they are slightly different, otherwise we just + * put our locking around the VFS provided functions. + * + * Returns: errno + */ + +static int gfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct buffer_head *dibh; + struct gfs2_alloc *al = ip->i_alloc; + unsigned int from = pos & (PAGE_CACHE_SIZE - 1); + unsigned int to = from + len; + int ret; + + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(ret)) { + unlock_page(page); + page_cache_release(page); + goto failed; + } + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) + return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret > 0) { + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) { + adjust_fs_space(inode); + ip->i_gh.gh_flags |= GL_NOCACHE; + } + + brelse(dibh); +failed: + gfs2_trans_end(sdp); + if (al) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return ret; +} + +/** + * gfs2_set_page_dirty - Page dirtying function + * @page: The page to dirty + * + * Returns: 1 if it dirtyed the page, or 0 otherwise + */ + +static int gfs2_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return 
__set_page_dirty_buffers(page); +} + +/** + * gfs2_bmap - Block map function + * @mapping: Address space info + * @lblock: The block to map + * + * Returns: The disk address for the block or 0 on hole or error + */ + +static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder i_gh; + sector_t dblock = 0; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return 0; + + if (!gfs2_is_stuffed(ip)) + dblock = generic_block_bmap(mapping, lblock, gfs2_block_map); + + gfs2_glock_dq_uninit(&i_gh); + + return dblock; +} + +static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + lock_buffer(bh); + gfs2_log_lock(sdp); + clear_buffer_dirty(bh); + bd = bh->b_private; + if (bd) { + if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh)) + list_del_init(&bd->bd_le.le_list); + else + gfs2_remove_from_journal(bh, current->journal_info, 0); + } + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void gfs2_invalidatepage(struct page *page, unsigned long offset) +{ + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct buffer_head *bh, *head; + unsigned long pos = 0; + + BUG_ON(!PageLocked(page)); + if (offset == 0) + ClearPageChecked(page); + if (!page_has_buffers(page)) + goto out; + + bh = head = page_buffers(page); + do { + if (offset <= pos) + gfs2_discard(sdp, bh); + pos += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +out: + if (offset == 0) + try_to_release_page(page, 0); +} + +/** + * gfs2_ok_for_dio - check that dio is valid on this file + * @ip: The inode + * @rw: READ or WRITE + * @offset: The offset at which we are reading or writing + * + * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) + * 1 (to accept the i/o request) + */ +static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) +{ + /* + * Should we return an error here? I can't see that O_DIRECT for + * a stuffed file makes any sense. For now we'll silently fall + * back to buffered I/O + */ + if (gfs2_is_stuffed(ip)) + return 0; + + if (offset >= i_size_read(&ip->i_inode)) + return 0; + return 1; +} + + + +static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int rv; + + /* + * Deferred lock, even if its a write, since we do no allocation + * on this path. All we need change is atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. their page cache entries for this inode). We do not, + * unfortunately have the option of only flushing a range like + * the VFS does. 
+ */ + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + rv = gfs2_glock_nq(&gh); + if (rv) + return rv; + rv = gfs2_ok_for_dio(ip, rw, offset); + if (rv != 1) + goto out; /* dio not valid, fall back to buffered i/o */ + + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, gfs2_get_block_direct, + NULL, NULL, 0); +out: + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + return rv; +} + +/** + * gfs2_releasepage - free the metadata associated with a page + * @page: the page that's being released + * @gfp_mask: passed from Linux VFS, ignored by us + * + * Call try_to_free_buffers() if the buffers in this page can be + * released. + * + * Returns: 0 + */ + +int gfs2_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct address_space *mapping = page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + struct buffer_head *bh, *head; + struct gfs2_bufdata *bd; + + if (!page_has_buffers(page)) + return 0; + + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + head = bh = page_buffers(page); + do { + if (atomic_read(&bh->b_count)) + goto cannot_release; + bd = bh->b_private; + if (bd && bd->bd_ail) + goto cannot_release; + if (buffer_pinned(bh) || buffer_dirty(bh)) + goto not_possible; + bh = bh->b_this_page; + } while(bh != head); + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + + head = bh = page_buffers(page); + do { + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd) { + gfs2_assert_warn(sdp, bd->bd_bh == bh); + gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); + if (!list_empty(&bd->bd_le.le_list)) { + if (!buffer_pinned(bh)) + list_del_init(&bd->bd_le.le_list); + else + bd = NULL; + } + if (bd) + bd->bd_bh = NULL; + bh->b_private = NULL; + } + gfs2_log_unlock(sdp); + if (bd) + kmem_cache_free(gfs2_bufdata_cachep, bd); + + bh = bh->b_this_page; + } while (bh != head); + + return try_to_free_buffers(page); + +not_possible: /* Should never happen */ + WARN_ON(buffer_dirty(bh)); + WARN_ON(buffer_pinned(bh)); +cannot_release: + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + return 0; +} + +static const struct address_space_operations gfs2_writeback_aops = { + .writepage = gfs2_writeback_writepage, + .writepages = gfs2_writeback_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations gfs2_ordered_aops = { + .writepage = gfs2_ordered_writepage, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations gfs2_jdata_aops = { + .writepage = gfs2_jdata_writepage, + .writepages = gfs2_jdata_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + 
.invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void gfs2_set_aops(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (gfs2_is_writeback(ip)) + inode->i_mapping->a_ops = &gfs2_writeback_aops; + else if (gfs2_is_ordered(ip)) + inode->i_mapping->a_ops = &gfs2_ordered_aops; + else if (gfs2_is_jdata(ip)) + inode->i_mapping->a_ops = &gfs2_jdata_aops; + else + BUG(); +} + diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c new file mode 100644 index 00000000..e65493a8 --- /dev/null +++ b/fs/gfs2/bmap.c @@ -0,0 +1,1297 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "dir.h" +#include "util.h" +#include "trace_gfs2.h" + +/* This doesn't need to be that large as max 64 bit pointers in a 4k + * block is 512, so __u16 is fine for that. It saves stack space to + * keep it small. + */ +struct metapath { + struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; + __u16 mp_list[GFS2_MAX_META_HEIGHT]; +}; + +typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, + struct buffer_head *bh, __be64 *top, + __be64 *bottom, unsigned int height, + void *data); + +struct strip_mine { + int sm_first; + unsigned int sm_height; +}; + +/** + * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page + * @ip: the inode + * @dibh: the dinode buffer + * @block: the block number that was allocated + * @page: The (optional) page. This is looked up if @page is NULL + * + * Returns: errno + */ + +static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, + u64 block, struct page *page) +{ + struct inode *inode = &ip->i_inode; + struct buffer_head *bh; + int release = 0; + + if (!page || page->index) { + page = grab_cache_page(inode->i_mapping, 0); + if (!page) + return -ENOMEM; + release = 1; + } + + if (!PageUptodate(page)) { + void *kaddr = kmap(page); + u64 dsize = i_size_read(inode); + + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = dibh->b_size - sizeof(struct gfs2_dinode); + + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); + kunmap(page); + + SetPageUptodate(page); + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, + (1 << BH_Uptodate)); + + bh = page_buffers(page); + + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, block); + + set_buffer_uptodate(bh); + if (!gfs2_is_jdata(ip)) + mark_buffer_dirty(bh); + if (!gfs2_is_writeback(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + if (release) { + unlock_page(page); + page_cache_release(page); + } + + return 0; +} + +/** + * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big + * @ip: The GFS2 inode to unstuff + * @page: The (optional) page. 
This is looked up if the @page is NULL + * + * This routine unstuffs a dinode and returns it to a "normal" state such + * that the height can be grown in the traditional way. + * + * Returns: errno + */ + +int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *bh, *dibh; + struct gfs2_dinode *di; + u64 block = 0; + int isdir = gfs2_is_dir(ip); + int error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (i_size_read(&ip->i_inode)) { + /* Get a free block, fill it with the stuffed data, + and write it out to disk */ + + unsigned int n = 1; + error = gfs2_alloc_block(ip, &block, &n); + if (error) + goto out_brelse; + if (isdir) { + gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); + error = gfs2_dir_get_new_buffer(ip, block, &bh); + if (error) + goto out_brelse; + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + brelse(bh); + } else { + error = gfs2_unstuffer_page(ip, dibh, block, page); + if (error) + goto out_brelse; + } + } + + /* Set up the pointer to the new block */ + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di = (struct gfs2_dinode *)dibh->b_data; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + if (i_size_read(&ip->i_inode)) { + *(__be64 *)(di + 1) = cpu_to_be64(block); + gfs2_add_inode_blocks(&ip->i_inode, 1); + di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); + } + + ip->i_height = 1; + di->di_height = cpu_to_be16(1); + +out_brelse: + brelse(dibh); +out: + up_write(&ip->i_rw_mutex); + return error; +} + + +/** + * find_metapath - Find path through the metadata tree + * @sdp: The superblock + * @mp: The metapath to return the result in + * @block: The disk block to look up + * @height: The pre-calculated height of the metadata tree + * + * This routine returns a struct metapath structure that defines a path + * through the metadata of inode "ip" to get to block "block". + * + * Example: + * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a + * filesystem with a blocksize of 4096. + * + * find_metapath() would return a struct metapath structure set to: + * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48, + * and mp_list[2] = 165. + * + * That means that in order to get to the block containing the byte at + * offset 101342453, we would load the indirect block pointed to by pointer + * 0 in the dinode. We would then load the indirect block pointed to by + * pointer 48 in that indirect block. We would then load the data block + * pointed to by pointer 165 in that indirect block. 
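+ *
+ * (A quick check of those numbers, using the 4096-byte blocksize from the
+ * example above: an indirect block holds 512 eight-byte pointers, so byte
+ * offset 101342453 falls in logical block 101342453 / 4096 = 24741, and
+ * 24741 = 0*512*512 + 48*512 + 165, matching mp_list[0..2] = {0, 48, 165}.)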
+ * + * ---------------------------------------- + * | Dinode | | + * | | 4| + * | |0 1 2 3 4 5 9| + * | | 6| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 5| + * | 4 4 4 4 4 5 5 1| + * |0 5 6 7 8 9 0 1 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 1 1 1 1 1 5| + * | 6 6 6 6 6 1| + * |0 3 4 5 6 7 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Data block containing offset | + * | 101342453 | + * | | + * | | + * ---------------------------------------- + * + */ + +static void find_metapath(const struct gfs2_sbd *sdp, u64 block, + struct metapath *mp, unsigned int height) +{ + unsigned int i; + + for (i = height; i--;) + mp->mp_list[i] = do_div(block, sdp->sd_inptrs); + +} + +static inline unsigned int metapath_branch_start(const struct metapath *mp) +{ + if (mp->mp_list[0] == 0) + return 2; + return 1; +} + +/** + * metapointer - Return pointer to start of metadata in a buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + * + * Return a pointer to the block number of the next height of the metadata + * tree given a buffer containing the pointer to the current height of the + * metadata tree. + */ + +static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) +{ + struct buffer_head *bh = mp->mp_bh[height]; + unsigned int head_size = (height > 0) ? + sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); + return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; +} + +/** + * lookup_metapath - Walk the metadata tree to a specific point + * @ip: The inode + * @mp: The metapath + * + * Assumes that the inode's buffer has already been looked up and + * hooked onto mp->mp_bh[0] and that the metapath has been initialised + * by find_metapath(). + * + * If this function encounters part of the tree which has not been + * allocated, it returns the current height of the tree at the point + * at which it found the unallocated block. Blocks which are found are + * added to the mp->mp_bh[] list. + * + * Returns: error or height of metadata tree + */ + +static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) +{ + unsigned int end_of_metadata = ip->i_height - 1; + unsigned int x; + __be64 *ptr; + u64 dblock; + int ret; + + for (x = 0; x < end_of_metadata; x++) { + ptr = metapointer(x, mp); + dblock = be64_to_cpu(*ptr); + if (!dblock) + return x + 1; + + ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]); + if (ret) + return ret; + } + + return ip->i_height; +} + +static inline void release_metapath(struct metapath *mp) +{ + int i; + + for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + } +} + +/** + * gfs2_extent_length - Returns length of an extent of blocks + * @start: Start of the buffer + * @len: Length of the buffer in bytes + * @ptr: Current position in the buffer + * @limit: Max extent length to return (0 = unlimited) + * @eob: Set to 1 if we hit "end of block" + * + * If the first block is zero (unallocated) it will return the number of + * unallocated blocks in the extent, otherwise it will return the number + * of contiguous blocks in the extent. 
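+ * For example (hypothetical pointer values): if @ptr points at a run of
+ * pointers 100, 101, 102, 0 then the returned length is 3; if it points
+ * at a run of zeros, the number of consecutive zero pointers is returned
+ * instead.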
+ * + * Returns: The length of the extent (minimum of one block) + */ + +static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) +{ + const __be64 *end = (start + len); + const __be64 *first = ptr; + u64 d = be64_to_cpu(*ptr); + + *eob = 0; + do { + ptr++; + if (ptr >= end) + break; + if (limit && --limit == 0) + break; + if (d) + d++; + } while(be64_to_cpu(*ptr) == d); + if (ptr >= end) + *eob = 1; + return (ptr - first); +} + +static inline void bmap_lock(struct gfs2_inode *ip, int create) +{ + if (create) + down_write(&ip->i_rw_mutex); + else + down_read(&ip->i_rw_mutex); +} + +static inline void bmap_unlock(struct gfs2_inode *ip, int create) +{ + if (create) + up_write(&ip->i_rw_mutex); + else + up_read(&ip->i_rw_mutex); +} + +static inline __be64 *gfs2_indirect_init(struct metapath *mp, + struct gfs2_glock *gl, unsigned int i, + unsigned offset, u64 bn) +{ + __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + + ((i > 1) ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_dinode))); + BUG_ON(i < 1); + BUG_ON(mp->mp_bh[i] != NULL); + mp->mp_bh[i] = gfs2_meta_new(gl, bn); + gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); + gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); + ptr += offset; + *ptr = cpu_to_be64(bn); + return ptr; +} + +enum alloc_state { + ALLOC_DATA = 0, + ALLOC_GROW_DEPTH = 1, + ALLOC_GROW_HEIGHT = 2, + /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ +}; + +/** + * gfs2_bmap_alloc - Build a metadata tree of the requested height + * @inode: The GFS2 inode + * @lblock: The logical starting block of the extent + * @bh_map: This is used to return the mapping details + * @mp: The metapath + * @sheight: The starting height (i.e. whats already mapped) + * @height: The height to build to + * @maxlen: The max number of data blocks to alloc + * + * In this routine we may have to alloc: + * i) Indirect blocks to grow the metadata tree height + * ii) Indirect blocks to fill in lower part of the metadata tree + * iii) Data blocks + * + * The function is in two parts. The first part works out the total + * number of blocks which we need. The second part does the actual + * allocation asking for an extent at a time (if enough contiguous free + * blocks are available, there will only be one request per bmap call) + * and uses the state machine to initialise the blocks in order. 
+ * + * Returns: errno on error + */ + +static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, + struct buffer_head *bh_map, struct metapath *mp, + const unsigned int sheight, + const unsigned int height, + const unsigned int maxlen) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh = mp->mp_bh[0]; + u64 bn, dblock = 0; + unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; + unsigned dblks = 0; + unsigned ptrs_per_blk; + const unsigned end_of_metadata = height - 1; + int eob = 0; + enum alloc_state state; + __be64 *ptr; + __be64 zero_bn = 0; + + BUG_ON(sheight < 1); + BUG_ON(dibh == NULL); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (height == sheight) { + struct buffer_head *bh; + /* Bottom indirect block exists, find unalloced extent size */ + ptr = metapointer(end_of_metadata, mp); + bh = mp->mp_bh[end_of_metadata]; + dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, + &eob); + BUG_ON(dblks < 1); + state = ALLOC_DATA; + } else { + /* Need to allocate indirect blocks */ + ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; + dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); + if (height == ip->i_height) { + /* Writing into existing tree, extend tree down */ + iblks = height - sheight; + state = ALLOC_GROW_DEPTH; + } else { + /* Building up tree height */ + state = ALLOC_GROW_HEIGHT; + iblks = height - ip->i_height; + branch_start = metapath_branch_start(mp); + iblks += (height - branch_start); + } + } + + /* start of the second part of the function (state machine) */ + + blks = dblks + iblks; + i = sheight; + do { + int error; + n = blks - alloced; + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return error; + alloced += n; + if (state != ALLOC_DATA || gfs2_is_jdata(ip)) + gfs2_trans_add_unrevoke(sdp, bn, n); + switch (state) { + /* Growing height of tree */ + case ALLOC_GROW_HEIGHT: + if (i == 1) { + ptr = (__be64 *)(dibh->b_data + + sizeof(struct gfs2_dinode)); + zero_bn = *ptr; + } + for (; i - 1 < height - ip->i_height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); + if (i - 1 == height - ip->i_height) { + i--; + gfs2_buffer_copy_tail(mp->mp_bh[i], + sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + gfs2_buffer_clear_tail(dibh, + sizeof(struct gfs2_dinode) + + sizeof(__be64)); + ptr = (__be64 *)(mp->mp_bh[i]->b_data + + sizeof(struct gfs2_meta_header)); + *ptr = zero_bn; + state = ALLOC_GROW_DEPTH; + for(i = branch_start; i < height; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + mp->mp_bh[i] = NULL; + } + i = branch_start; + } + if (n == 0) + break; + /* Branching from existing tree */ + case ALLOC_GROW_DEPTH: + if (i > 1 && i < height) + gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); + for (; i < height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, + mp->mp_list[i-1], bn++); + if (i == height) + state = ALLOC_DATA; + if (n == 0) + break; + /* Tree complete, adding data blocks */ + case ALLOC_DATA: + BUG_ON(n > dblks); + BUG_ON(mp->mp_bh[end_of_metadata] == NULL); + gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); + dblks = n; + ptr = metapointer(end_of_metadata, mp); + dblock = bn; + while (n-- > 0) + *ptr++ = cpu_to_be64(bn++); + break; + } + } while ((state != ALLOC_DATA) || !dblock); + + ip->i_height = height; + gfs2_add_inode_blocks(&ip->i_inode, alloced); + gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); + map_bh(bh_map, inode->i_sb, dblock); + 
	bh_map->b_size = dblks << inode->i_blkbits;
+	set_buffer_new(bh_map);
+	return 0;
+}
+
+/**
+ * gfs2_block_map - Map a block from an inode to a disk block
+ * @inode: The inode
+ * @lblock: The logical block number
+ * @bh_map: The bh to be mapped
+ * @create: True if it's ok to allocate blocks to satisfy the request
+ *
+ * Sets buffer_mapped() if successful, sets buffer_boundary() if a
+ * read of metadata will be required before the next block can be
+ * mapped. Sets buffer_new() if new blocks were allocated.
+ *
+ * Returns: errno
+ */
+
+int gfs2_block_map(struct inode *inode, sector_t lblock,
+		   struct buffer_head *bh_map, int create)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	unsigned int bsize = sdp->sd_sb.sb_bsize;
+	const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+	const u64 *arr = sdp->sd_heightsize;
+	__be64 *ptr;
+	u64 size;
+	struct metapath mp;
+	int ret;
+	int eob;
+	unsigned int len;
+	struct buffer_head *bh;
+	u8 height;
+
+	BUG_ON(maxlen == 0);
+
+	memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+	bmap_lock(ip, create);
+	clear_buffer_mapped(bh_map);
+	clear_buffer_new(bh_map);
+	clear_buffer_boundary(bh_map);
+	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
+	if (gfs2_is_dir(ip)) {
+		bsize = sdp->sd_jbsize;
+		arr = sdp->sd_jheightsize;
+	}
+
+	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
+	if (ret)
+		goto out;
+
+	height = ip->i_height;
+	size = (lblock + 1) * bsize;
+	while (size > arr[height])
+		height++;
+	find_metapath(sdp, lblock, &mp, height);
+	ret = 1;
+	if (height > ip->i_height || gfs2_is_stuffed(ip))
+		goto do_alloc;
+	ret = lookup_metapath(ip, &mp);
+	if (ret < 0)
+		goto out;
+	if (ret != ip->i_height)
+		goto do_alloc;
+	ptr = metapointer(ip->i_height - 1, &mp);
+	if (*ptr == 0)
+		goto do_alloc;
+	map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
+	bh = mp.mp_bh[ip->i_height - 1];
+	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
+	bh_map->b_size = (len << inode->i_blkbits);
+	if (eob)
+		set_buffer_boundary(bh_map);
+	ret = 0;
+out:
+	release_metapath(&mp);
+	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
+	bmap_unlock(ip, create);
+	return ret;
+
+do_alloc:
+	/* All allocations are done here, firstly check create flag */
+	if (!create) {
+		BUG_ON(gfs2_is_stuffed(ip));
+		ret = 0;
+		goto out;
+	}
+
+	/* At this point ret is the tree depth of already allocated blocks */
+	ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
+	goto out;
+}
+
+/*
+ * Deprecated: do not use in new code
+ */
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
+{
+	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
+	int ret;
+	int create = *new;
+
+	BUG_ON(!extlen);
+	BUG_ON(!dblock);
+	BUG_ON(!new);
+
+	bh.b_size = 1 << (inode->i_blkbits + (create ?
0 : 5)); + ret = gfs2_block_map(inode, lblock, &bh, create); + *extlen = bh.b_size >> inode->i_blkbits; + *dblock = bh.b_blocknr; + if (buffer_new(&bh)) + *new = 1; + else + *new = 0; + return ret; +} + +/** + * recursive_scan - recursively scan through the end of a file + * @ip: the inode + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @bc: the call to make for each piece of metadata + * @data: data opaque to this function to pass to @bc + * + * When this is first called @height and @block should be zero and + * @first should be 1. + * + * Returns: errno + */ + +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + u64 block, int first, block_call_t bc, + void *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh = NULL; + __be64 *top, *bottom; + u64 bn; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; + + top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; + bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); + if (error) + return error; + + top = (__be64 *)(bh->b_data + mh_size) + + (first ? mp->mp_list[height] : 0); + + bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } + + error = bc(ip, dibh, bh, top, bottom, height, data); + if (error) + goto out; + + if (height < ip->i_height - 1) + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); + + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, bc, data); + if (error) + break; + } + +out: + brelse(bh); + return error; +} + +/** + * do_strip - Look for a layer a particular layer of the file and strip it off + * @ip: the inode + * @dibh: the dinode buffer + * @bh: A buffer of pointers + * @top: The first pointer in the buffer + * @bottom: One more than the last pointer + * @height: the height this buffer is at + * @data: a pointer to a struct strip_mine + * + * Returns: errno + */ + +static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, + struct buffer_head *bh, __be64 *top, __be64 *bottom, + unsigned int height, void *data) +{ + struct strip_mine *sm = data; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + u64 bn, bstart; + u32 blen, btotal; + __be64 *p; + unsigned int rg_blocks = 0; + int metadata; + unsigned int revokes = 0; + int x; + int error = 0; + + if (!*top) + sm->sm_first = 0; + + if (height != sm->sm_height) + return 0; + + if (sm->sm_first) { + top++; + sm->sm_first = 0; + } + + metadata = (height != ip->i_height - 1); + if (metadata) + revokes = (height) ? 
sdp->sd_inptrs : sdp->sd_diptrs; + else if (ip->i_depth) + revokes = sdp->sd_inptrs; + + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + else if (!sdp->sd_rgrps) + error = gfs2_ri_update(ip); + + if (error) + return error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + bstart = 0; + blen = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + + bstart = bn; + blen = 1; + } + } + + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + else + goto out; /* Nothing to do */ + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + + RES_INDIRECT + RES_STATFS + RES_QUOTA, + revokes); + if (error) + goto out_rg_gunlock; + + down_write(&ip->i_rw_mutex); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + bstart = 0; + blen = 0; + btotal = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) { + if (metadata) + __gfs2_free_meta(ip, bstart, blen); + else + __gfs2_free_data(ip, bstart, blen); + + btotal += blen; + } + + bstart = bn; + blen = 1; + } + + *p = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + } + if (bstart) { + if (metadata) + __gfs2_free_meta(ip, bstart, blen); + else + __gfs2_free_data(ip, bstart, blen); + + btotal += blen; + } + + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + + gfs2_dinode_out(ip, dibh->b_data); + + up_write(&ip->i_rw_mutex); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist: + gfs2_rlist_free(&rlist); +out: + if (ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); + return error; +} + +/** + * gfs2_block_truncate_page - Deal with zeroing out data for truncate + * + * This is partly borrowed from ext3. + */ +static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct buffer_head *bh; + struct page *page; + int err; + + page = grab_cache_page(mapping, index); + if (!page) + return 0; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. 
Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + err = 0; + } + + if (!gfs2_is_writeback(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + zero_user(page, offset, length); + mark_buffer_dirty(bh); +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct buffer_head *dibh; + int journaled = gfs2_is_jdata(ip); + int error; + + error = gfs2_trans_begin(sdp, + RES_DINODE + (journaled ? RES_JDATA : 0), 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); + } else { + if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { + error = gfs2_block_truncate_page(mapping, newsize); + if (error) + goto out_brelse; + } + ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; + } + + i_size_write(inode, newsize); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + + truncate_pagecache(inode, oldsize, newsize); +out_brelse: + brelse(dibh); +out: + gfs2_trans_end(sdp); + return error; +} + +static int trunc_dealloc(struct gfs2_inode *ip, u64 size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int height = ip->i_height; + u64 lblock; + struct metapath mp; + int error; + + if (!size) + lblock = 0; + else + lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; + + find_metapath(sdp, lblock, &mp, ip->i_height); + if (!gfs2_alloc_get(ip)) + return -ENOMEM; + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + while (height--) { + struct strip_mine sm; + sm.sm_first = !!size; + sm.sm_height = height; + + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); + if (error) + break; + } + + gfs2_quota_unhold(ip); + +out: + gfs2_alloc_put(ip); + return error; +} + +static int trunc_end(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (!i_size_read(&ip->i_inode)) { + ip->i_height = 0; + ip->i_goal = ip->i_no_addr; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + } + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + +out: + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + return error; +} + +/** + * do_shrink - make a file smaller + * @inode: the inode + * @oldsize: the current inode size + * @newsize: the size to make the file + * + * Called with an exclusive lock on @inode. The @size must + * be equal to or smaller than the current inode size. 
+ * + * Returns: errno + */ + +static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = trunc_start(inode, oldsize, newsize); + if (error < 0) + return error; + if (gfs2_is_stuffed(ip)) + return 0; + + error = trunc_dealloc(ip, newsize); + if (error == 0) + error = trunc_end(ip); + + return error; +} + +void gfs2_trim_blocks(struct inode *inode) +{ + u64 size = inode->i_size; + int ret; + + ret = do_shrink(inode, size, size); + WARN_ON(ret != 0); +} + +/** + * do_grow - Touch and update inode size + * @inode: The inode + * @size: The new size + * + * This function updates the timestamps on the inode and + * may also increase the size of the inode. This function + * must not be called with @size any smaller than the current + * inode size. + * + * Although it is not strictly required to unstuff files here, + * earlier versions of GFS2 have a bug in the stuffed file reading + * code which will result in a buffer overrun if the size is larger + * than the max stuffed file size. In order to prevent this from + * occurring, such files are unstuffed, but in other cases we can + * just update the inode size directly. + * + * Returns: 0 on success, or -ve on error + */ + +static int do_grow(struct inode *inode, u64 size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh; + struct gfs2_alloc *al = NULL; + int error; + + if (gfs2_is_stuffed(ip) && + (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { + al = gfs2_alloc_get(ip); + if (al == NULL) + return -ENOMEM; + + error = gfs2_quota_lock_check(ip); + if (error) + goto do_grow_alloc_put; + + al->al_requested = 1; + error = gfs2_inplace_reserve(ip); + if (error) + goto do_grow_qunlock; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); + if (error) + goto do_grow_release; + + if (al) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto do_end_trans; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto do_end_trans; + + i_size_write(inode, size); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + +do_end_trans: + gfs2_trans_end(sdp); +do_grow_release: + if (al) { + gfs2_inplace_release(ip); +do_grow_qunlock: + gfs2_quota_unlock(ip); +do_grow_alloc_put: + gfs2_alloc_put(ip); + } + return error; +} + +/** + * gfs2_setattr_size - make a file a given size + * @inode: the inode + * @newsize: the size to make the file + * + * The file size can grow, shrink, or stay the same size. This + * is called holding i_mutex and an exclusive glock on the inode + * in question. 
+ * + * Returns: errno + */ + +int gfs2_setattr_size(struct inode *inode, u64 newsize) +{ + int ret; + u64 oldsize; + + BUG_ON(!S_ISREG(inode->i_mode)); + + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; + + oldsize = inode->i_size; + if (newsize >= oldsize) + return do_grow(inode, newsize); + + return do_shrink(inode, oldsize, newsize); +} + +int gfs2_truncatei_resume(struct gfs2_inode *ip) +{ + int error; + error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); + if (!error) + error = trunc_end(ip); + return error; +} + +int gfs2_file_dealloc(struct gfs2_inode *ip) +{ + return trunc_dealloc(ip, 0); +} + +/** + * gfs2_write_alloc_required - figure out if a write will require an allocation + * @ip: the file being written to + * @offset: the offset to write to + * @len: the number of bytes being written + * + * Returns: 1 if an alloc is required, 0 otherwise + */ + +int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head bh; + unsigned int shift; + u64 lblock, lblock_stop, size; + u64 end_of_file; + + if (!len) + return 0; + + if (gfs2_is_stuffed(ip)) { + if (offset + len > + sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + return 1; + return 0; + } + + shift = sdp->sd_sb.sb_bsize_shift; + BUG_ON(gfs2_is_dir(ip)); + end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; + lblock = offset >> shift; + lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + if (lblock_stop > end_of_file) + return 1; + + size = (lblock_stop - lblock) << shift; + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(&ip->i_inode, lblock, &bh, 0); + if (!buffer_mapped(&bh)) + return 1; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + + return 0; +} + diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h new file mode 100644 index 00000000..42fea03e --- /dev/null +++ b/fs/gfs2/bmap.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#ifndef __BMAP_DOT_H__ +#define __BMAP_DOT_H__ + +#include "inode.h" + +struct inode; +struct gfs2_inode; +struct page; + + +/** + * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file + * @ip: the file + * @len: the number of bytes to be written to the file + * @data_blocks: returns the number of data blocks required + * @ind_blocks: returns the number of indirect blocks required + * + */ + +static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, + unsigned int len, + unsigned int *data_blocks, + unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp; + + BUG_ON(gfs2_is_dir(ip)); + *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; + *ind_blocks = 3 * (sdp->sd_max_height - 1); + + for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + *ind_blocks += tmp; + } +} + +extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +extern int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh, int create); +extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, + u64 *dblock, unsigned *extlen); +extern int gfs2_setattr_size(struct inode *inode, u64 size); +extern void gfs2_trim_blocks(struct inode *inode); +extern int gfs2_truncatei_resume(struct gfs2_inode *ip); +extern int gfs2_file_dealloc(struct gfs2_inode *ip); +extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len); + +#endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c new file mode 100644 index 00000000..0da8da2c --- /dev/null +++ b/fs/gfs2/dentry.c @@ -0,0 +1,141 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "super.h" +#include "util.h" +#include "inode.h" + +/** + * gfs2_drevalidate - Check directory lookup consistency + * @dentry: the mapping to check + * @nd: + * + * Check to make sure the lookup necessary to arrive at this inode from its + * parent is still good. 
+ * + * Returns: 1 if the dentry is ok, 0 if it isn't + */ + +static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *parent; + struct gfs2_sbd *sdp; + struct gfs2_inode *dip; + struct inode *inode; + struct gfs2_holder d_gh; + struct gfs2_inode *ip = NULL; + int error; + int had_lock = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(parent->d_inode); + dip = GFS2_I(parent->d_inode); + inode = dentry->d_inode; + + if (inode) { + if (is_bad_inode(inode)) + goto invalid; + ip = GFS2_I(inode); + } + + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto valid; + + had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); + if (!had_lock) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto fail; + } + + error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); + switch (error) { + case 0: + if (!inode) + goto invalid_gunlock; + break; + case -ENOENT: + if (!inode) + goto valid_gunlock; + goto invalid_gunlock; + default: + goto fail_gunlock; + } + +valid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +valid: + dput(parent); + return 1; + +invalid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +invalid: + if (inode && S_ISDIR(inode->i_mode)) { + if (have_submounts(dentry)) + goto valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + dput(parent); + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&d_gh); +fail: + dput(parent); + return 0; +} + +static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) +{ + str->hash = gfs2_disk_hash(str->name, str->len); + return 0; +} + +static int gfs2_dentry_delete(const struct dentry *dentry) +{ + struct gfs2_inode *ginode; + + if (!dentry->d_inode) + return 0; + + ginode = GFS2_I(dentry->d_inode); + if (!ginode->i_iopen_gh.gh_gl) + return 0; + + if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) + return 1; + + return 0; +} + +const struct dentry_operations gfs2_dops = { + .d_revalidate = gfs2_drevalidate, + .d_hash = gfs2_dhash, + .d_delete = gfs2_dentry_delete, +}; + diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c new file mode 100644 index 00000000..091ee477 --- /dev/null +++ b/fs/gfs2/dir.c @@ -0,0 +1,2006 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +/* + * Implements Extendible Hashing as described in: + * "Extendible Hashing" by Fagin, et al in + * __ACM Trans. on Database Systems__, Sept 1979. + * + * + * Here's the layout of dirents which is essentially the same as that of ext2 + * within a single block. The field de_name_len is the number of bytes + * actually required for the name (no null terminator). The field de_rec_len + * is the number of bytes allocated to the dirent. The offset of the next + * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is + * deleted, the preceding dirent inherits its allocated space, ie + * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained + * by adding de_rec_len to the current dirent, this essentially causes the + * deleted dirent to get jumped over when iterating through all the dirents. 
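+ *
+ * For example (hypothetical entries): if a block holds dirents A, B and C
+ * in that order and B is deleted, then A->de_rec_len grows by B's
+ * de_rec_len, so stepping by de_rec_len goes straight from A to C, and
+ * B's bytes can later be handed out again by splitting A's now over-sized
+ * record when a new dirent is allocated.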
+ * + * When deleting the first dirent in a block, there is no previous dirent so + * the field de_ino is set to zero to designate it as deleted. When allocating + * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the + * first dirent has (de_ino == 0) and de_rec_len is large enough, this first + * dirent is allocated. Otherwise it must go through all the 'used' dirents + * searching for one in which the amount of total space minus the amount of + * used space will provide enough space for the new dirent. + * + * There are two types of blocks in which dirents reside. In a stuffed dinode, + * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of + * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the + * beginning of the leaf block. The dirents reside in leaves when + * + * dip->i_diskflags & GFS2_DIF_EXHASH is true + * + * Otherwise, the dirents are "linear", within a single stuffed dinode block. + * + * When the dirents are in leaves, the actual contents of the directory file are + * used as an array of 64-bit block pointers pointing to the leaf blocks. The + * dirents are NOT in the directory file itself. There can be more than one + * block pointer in the array that points to the same leaf. In fact, when a + * directory is first converted from linear to exhash, all of the pointers + * point to the same leaf. + * + * When a leaf is completely full, the size of the hash table can be + * doubled unless it is already at the maximum size which is hard coded into + * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list, + * but never before the maximum hash table size has been reached. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "bmap.h" +#include "util.h" + +#define IS_LEAF 1 /* Hashed (leaf) directory */ +#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ + +#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) +#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) + +struct qstr gfs2_qdot __read_mostly; +struct qstr gfs2_qdotdot __read_mostly; + +typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, + const struct qstr *name, void *opaque); + +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + *bhp = bh; + return 0; +} + +static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + int error; + + error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh); + if (error) + return error; + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { + brelse(bh); + return -EIO; + } + *bhp = bh; + return 0; +} + +static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, + unsigned int offset, unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); + if (ip->i_inode.i_size < offset + size) + 
i_size_write(&ip->i_inode, offset + size); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + + brelse(dibh); + + return size; +} + + + +/** + * gfs2_dir_write_data - Write directory information to the inode + * @ip: The GFS2 inode + * @buf: The buffer containing information to be written + * @offset: The file offset to start writing at + * @size: The amount of data to write + * + * Returns: The number of bytes correctly written or error code + */ +static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, + u64 offset, unsigned int size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + int new = 0; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip) && + offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset, + size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + return error; + } + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 1; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error) + goto fail; + error = -EIO; + if (gfs2_assert_withdraw(sdp, dblock)) + goto fail; + } + + if (amount == sdp->sd_jbsize || new) + error = gfs2_dir_get_new_buffer(ip, dblock, &bh); + else + error = gfs2_dir_get_existing_buffer(ip, dblock, &bh); + + if (error) + goto fail; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + memcpy(bh->b_data + o, buf, amount); + brelse(bh); + + buf += amount; + copied += amount; + lblock++; + dblock++; + extlen--; + + o = sizeof(struct gfs2_meta_header); + } + +out: + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (ip->i_inode.i_size < offset + copied) + i_size_write(&ip->i_inode, offset + copied); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + + return copied; +fail: + if (copied) + goto out; + return error; +} + +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, + u64 offset, unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + offset += sizeof(struct gfs2_dinode); + memcpy(buf, dibh->b_data + offset, size); + brelse(dibh); + } + + return (error) ? 
error : size; +} + + +/** + * gfs2_dir_read_data - Read a data from a directory inode + * @ip: The GFS2 Inode + * @buf: The buffer to place result into + * @offset: File offset to begin jdata_readng from + * @size: Amount of data to transfer + * + * Returns: The amount of data actually copied or the error + */ +static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, + unsigned int size, unsigned ra) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + u64 disksize = i_size_read(&ip->i_inode); + + if (offset >= disksize) + return 0; + + if (offset + size > disksize) + size = disksize - offset; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip)) + return gfs2_dir_read_stuffed(ip, buf, offset, size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + int new; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 0; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error || !dblock) + goto fail; + BUG_ON(extlen < 1); + if (!ra) + extlen = 1; + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + } else { + error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); + if (error) + goto fail; + } + error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD); + if (error) { + brelse(bh); + goto fail; + } + dblock++; + extlen--; + memcpy(buf, bh->b_data + o, amount); + brelse(bh); + buf += amount; + copied += amount; + lblock++; + o = sizeof(struct gfs2_meta_header); + } + + return copied; +fail: + return (copied) ? copied : error; +} + +static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) +{ + return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0; +} + +static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, int ret) +{ + if (!gfs2_dirent_sentinel(dent) && + be32_to_cpu(dent->de_hash) == name->hash && + be16_to_cpu(dent->de_name_len) == name->len && + memcmp(dent+1, name->name, name->len) == 0) + return ret; + return 0; +} + +static int gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 1); +} + +static int gfs2_dirent_prev(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 2); +} + +/* + * name->name holds ptr to start of block. + * name->len holds size of block. 
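+ * This lets gfs2_dirent_last() identify the final dirent: the one whose
+ * de_rec_len runs exactly up to the end of the block.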
+ */ +static int gfs2_dirent_last(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + const char *start = name->name; + const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len); + if (name->len == (end - start)) + return 1; + return 0; +} + +static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + unsigned required = GFS2_DIRENT_SIZE(name->len); + unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + unsigned totlen = be16_to_cpu(dent->de_rec_len); + + if (gfs2_dirent_sentinel(dent)) + actual = 0; + if (totlen - actual >= required) + return 1; + return 0; +} + +struct dirent_gather { + const struct gfs2_dirent **pdent; + unsigned offset; +}; + +static int gfs2_dirent_gather(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + struct dirent_gather *g = opaque; + if (!gfs2_dirent_sentinel(dent)) { + g->pdent[g->offset++] = dent; + } + return 0; +} + +/* + * Other possible things to check: + * - Inode located within filesystem size (and on valid block) + * - Valid directory entry type + * Not sure how heavy-weight we want to make this... could also check + * hash is correct for example, but that would take a lot of extra time. + * For now the most important thing is to check that the various sizes + * are correct. + */ +static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, + unsigned int size, unsigned int len, int first) +{ + const char *msg = "gfs2_dirent too small"; + if (unlikely(size < sizeof(struct gfs2_dirent))) + goto error; + msg = "gfs2_dirent misaligned"; + if (unlikely(offset & 0x7)) + goto error; + msg = "gfs2_dirent points beyond end of block"; + if (unlikely(offset + size > len)) + goto error; + msg = "zero inode number"; + if (unlikely(!first && gfs2_dirent_sentinel(dent))) + goto error; + msg = "name length is greater than space in dirent"; + if (!gfs2_dirent_sentinel(dent) && + unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) > + size)) + goto error; + return 0; +error: + printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg, + first ? 
"first in block" : "not first in block"); + return -EIO; +} + +static int gfs2_dirent_offset(const void *buf) +{ + const struct gfs2_meta_header *h = buf; + int offset; + + BUG_ON(buf == NULL); + + switch(be32_to_cpu(h->mh_type)) { + case GFS2_METATYPE_LF: + offset = sizeof(struct gfs2_leaf); + break; + case GFS2_METATYPE_DI: + offset = sizeof(struct gfs2_dinode); + break; + default: + goto wrong_type; + } + return offset; +wrong_type: + printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n", + be32_to_cpu(h->mh_type)); + return -1; +} + +static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf, + unsigned int len, gfs2_dscan_t scan, + const struct qstr *name, + void *opaque) +{ + struct gfs2_dirent *dent, *prev; + unsigned offset; + unsigned size; + int ret = 0; + + ret = gfs2_dirent_offset(buf); + if (ret < 0) + goto consist_inode; + + offset = ret; + prev = NULL; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 1)) + goto consist_inode; + do { + ret = scan(dent, name, opaque); + if (ret) + break; + offset += size; + if (offset == len) + break; + prev = dent; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 0)) + goto consist_inode; + } while(1); + + switch(ret) { + case 0: + return NULL; + case 1: + return dent; + case 2: + return prev ? prev : dent; + default: + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + +consist_inode: + gfs2_consist_inode(GFS2_I(inode)); + return ERR_PTR(-EIO); +} + +static int dirent_check_reclen(struct gfs2_inode *dip, + const struct gfs2_dirent *d, const void *end_p) +{ + const void *ptr = d; + u16 rec_len = be16_to_cpu(d->de_rec_len); + + if (unlikely(rec_len < sizeof(struct gfs2_dirent))) + goto broken; + ptr += rec_len; + if (ptr < end_p) + return rec_len; + if (ptr == end_p) + return -ENOENT; +broken: + gfs2_consist_inode(dip); + return -EIO; +} + +/** + * dirent_next - Next dirent + * @dip: the directory + * @bh: The buffer + * @dent: Pointer to list of dirents + * + * Returns: 0 on success, error code otherwise + */ + +static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent **dent) +{ + struct gfs2_dirent *cur = *dent, *tmp; + char *bh_end = bh->b_data + bh->b_size; + int ret; + + ret = dirent_check_reclen(dip, cur, bh_end); + if (ret < 0) + return ret; + + tmp = (void *)cur + ret; + ret = dirent_check_reclen(dip, tmp, bh_end); + if (ret == -EIO) + return ret; + + /* Only the first dent could ever have de_inum.no_addr == 0 */ + if (gfs2_dirent_sentinel(tmp)) { + gfs2_consist_inode(dip); + return -EIO; + } + + *dent = tmp; + return 0; +} + +/** + * dirent_del - Delete a dirent + * @dip: The GFS2 inode + * @bh: The buffer + * @prev: The previous dirent + * @cur: The current dirent + * + */ + +static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent *prev, struct gfs2_dirent *cur) +{ + u16 cur_rec_len, prev_rec_len; + + if (gfs2_dirent_sentinel(cur)) { + gfs2_consist_inode(dip); + return; + } + + gfs2_trans_add_bh(dip->i_gl, bh, 1); + + /* If there is no prev entry, this is the first entry in the block. + The de_rec_len is already as big as it needs to be. Just zero + out the inode number and return. */ + + if (!prev) { + cur->de_inum.no_addr = 0; + cur->de_inum.no_formal_ino = 0; + return; + } + + /* Combine this dentry with the previous one. 
*/ + + prev_rec_len = be16_to_cpu(prev->de_rec_len); + cur_rec_len = be16_to_cpu(cur->de_rec_len); + + if ((char *)prev + prev_rec_len != (char *)cur) + gfs2_consist_inode(dip); + if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size) + gfs2_consist_inode(dip); + + prev_rec_len += cur_rec_len; + prev->de_rec_len = cpu_to_be16(prev_rec_len); +} + +/* + * Takes a dent from which to grab space as an argument. Returns the + * newly created dent. + */ +static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, + struct gfs2_dirent *dent, + const struct qstr *name, + struct buffer_head *bh) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_dirent *ndent; + unsigned offset = 0, totlen; + + if (!gfs2_dirent_sentinel(dent)) + offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + totlen = be16_to_cpu(dent->de_rec_len); + BUG_ON(offset + name->len > totlen); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ndent = (struct gfs2_dirent *)((char *)dent + offset); + dent->de_rec_len = cpu_to_be16(offset); + gfs2_qstr2dirent(name, totlen - offset, ndent); + return ndent; +} + +static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode, + struct buffer_head *bh, + const struct qstr *name) +{ + struct gfs2_dirent *dent; + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_find_space, name, NULL); + if (!dent || IS_ERR(dent)) + return dent; + return gfs2_init_dirent(inode, dent, name, bh); +} + +static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, + struct buffer_head **bhp) +{ + int error; + + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); + if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { + /* printk(KERN_INFO "block num=%llu\n", leaf_no); */ + error = -EIO; + } + + return error; +} + +/** + * get_leaf_nr - Get a leaf number associated with the index + * @dip: The GFS2 inode + * @index: + * @leaf_out: + * + * Returns: 0 on success, error code otherwise + */ + +static int get_leaf_nr(struct gfs2_inode *dip, u32 index, + u64 *leaf_out) +{ + __be64 leaf_no; + int error; + + error = gfs2_dir_read_data(dip, (char *)&leaf_no, + index * sizeof(__be64), + sizeof(__be64), 0); + if (error != sizeof(u64)) + return (error < 0) ? error : -EIO; + + *leaf_out = be64_to_cpu(leaf_no); + + return 0; +} + +static int get_first_leaf(struct gfs2_inode *dip, u32 index, + struct buffer_head **bh_out) +{ + u64 leaf_no; + int error; + + error = get_leaf_nr(dip, index, &leaf_no); + if (!error) + error = get_leaf(dip, leaf_no, bh_out); + + return error; +} + +static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, + const struct qstr *name, + gfs2_dscan_t scan, + struct buffer_head **pbh) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + if (ip->i_diskflags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf; + unsigned hsize = 1 << ip->i_depth; + unsigned index; + u64 ln; + if (hsize * sizeof(u64) != i_size_read(inode)) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + index = name->hash >> (32 - ip->i_depth); + error = get_first_leaf(ip, index, &bh); + if (error) + return ERR_PTR(error); + do { + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + scan, name, NULL); + if (dent) + goto got_dent; + leaf = (struct gfs2_leaf *)bh->b_data; + ln = be64_to_cpu(leaf->lf_next); + brelse(bh); + if (!ln) + break; + + error = get_leaf(ip, ln, &bh); + } while(!error); + + return error ? 
ERR_PTR(error) : NULL; + } + + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return ERR_PTR(error); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL); +got_dent: + if (unlikely(dent == NULL || IS_ERR(dent))) { + brelse(bh); + bh = NULL; + } + *pbh = bh; + return dent; +} + +static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int n = 1; + u64 bn; + int error; + struct buffer_head *bh; + struct gfs2_leaf *leaf; + struct gfs2_dirent *dent; + struct qstr name = { .name = "", .len = 0, .hash = 0 }; + + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return NULL; + bh = gfs2_meta_new(ip->i_gl, bn); + if (!bh) + return NULL; + + gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); + leaf = (struct gfs2_leaf *)bh->b_data; + leaf->lf_depth = cpu_to_be16(depth); + leaf->lf_entries = 0; + leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); + leaf->lf_next = 0; + memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + dent = (struct gfs2_dirent *)(leaf+1); + gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); + *pbh = bh; + return leaf; +} + +/** + * dir_make_exhash - Convert a stuffed directory into an ExHash directory + * @dip: The GFS2 inode + * + * Returns: 0 on success, error code otherwise + */ + +static int dir_make_exhash(struct inode *inode) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_dirent *dent; + struct qstr args; + struct buffer_head *bh, *dibh; + struct gfs2_leaf *leaf; + int y; + u32 x; + __be64 *lp; + u64 bn; + int error; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + /* Turn over a new leaf */ + + leaf = new_leaf(inode, &bh, 0); + if (!leaf) + return -ENOSPC; + bn = bh->b_blocknr; + + gfs2_assert(sdp, dip->i_entries < (1 << 16)); + leaf->lf_entries = cpu_to_be16(dip->i_entries); + + /* Copy dirents */ + + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh, + sizeof(struct gfs2_dinode)); + + /* Find last entry */ + + x = 0; + args.len = bh->b_size - sizeof(struct gfs2_dinode) + + sizeof(struct gfs2_leaf); + args.name = bh->b_data; + dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size, + gfs2_dirent_last, &args, NULL); + if (!dent) { + brelse(bh); + brelse(dibh); + return -EIO; + } + if (IS_ERR(dent)) { + brelse(bh); + brelse(dibh); + return PTR_ERR(dent); + } + + /* Adjust the last dirent's record length + (Remember that dent still points to the last entry.) */ + + dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) + + sizeof(struct gfs2_dinode) - + sizeof(struct gfs2_leaf)); + + brelse(bh); + + /* We're done with the new leaf block, now setup the new + hash table. 
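+ Every slot in the fresh hash table points at the one leaf we just built;
+ splitting that leaf later on will spread the pointers across more leaves.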
*/ + + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); + + for (x = sdp->sd_hash_ptrs; x--; lp++) + *lp = cpu_to_be64(bn); + + i_size_write(inode, sdp->sd_sb.sb_bsize / 2); + gfs2_add_inode_blocks(&dip->i_inode, 1); + dip->i_diskflags |= GFS2_DIF_EXHASH; + + for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; + dip->i_depth = y; + + gfs2_dinode_out(dip, dibh->b_data); + + brelse(dibh); + + return 0; +} + +/** + * dir_split_leaf - Split a leaf block into two + * @dip: The GFS2 inode + * @index: + * @leaf_no: + * + * Returns: 0 on success, error code on failure + */ + +static int dir_split_leaf(struct inode *inode, const struct qstr *name) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct buffer_head *nbh, *obh, *dibh; + struct gfs2_leaf *nleaf, *oleaf; + struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new; + u32 start, len, half_len, divider; + u64 bn, leaf_no; + __be64 *lp; + u32 index; + int x, moved = 0; + int error; + + index = name->hash >> (32 - dip->i_depth); + error = get_leaf_nr(dip, index, &leaf_no); + if (error) + return error; + + /* Get the old leaf block */ + error = get_leaf(dip, leaf_no, &obh); + if (error) + return error; + + oleaf = (struct gfs2_leaf *)obh->b_data; + if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) { + brelse(obh); + return 1; /* can't split */ + } + + gfs2_trans_add_bh(dip->i_gl, obh, 1); + + nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); + if (!nleaf) { + brelse(obh); + return -ENOSPC; + } + bn = nbh->b_blocknr; + + /* Compute the start and len of leaf pointers in the hash table. */ + len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); + half_len = len >> 1; + if (!half_len) { + printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); + gfs2_consist_inode(dip); + error = -EIO; + goto fail_brelse; + } + + start = (index & ~(len - 1)); + + /* Change the pointers. + Don't bother distinguishing stuffed from non-stuffed. + This code is complicated enough already. 
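+ The first half of the hash table slots covering the old leaf are rewritten
+ to point at the new leaf, and every entry whose hash falls below the
+ divider is then moved across to it.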
*/ + lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS); + if (!lp) { + error = -ENOMEM; + goto fail_brelse; + } + + /* Change the pointers */ + for (x = 0; x < half_len; x++) + lp[x] = cpu_to_be64(bn); + + error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64), + half_len * sizeof(u64)); + if (error != half_len * sizeof(u64)) { + if (error >= 0) + error = -EIO; + goto fail_lpfree; + } + + kfree(lp); + + /* Compute the divider */ + divider = (start + half_len) << (32 - dip->i_depth); + + /* Copy the entries */ + dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf)); + + do { + next = dent; + if (dirent_next(dip, obh, &next)) + next = NULL; + + if (!gfs2_dirent_sentinel(dent) && + be32_to_cpu(dent->de_hash) < divider) { + struct qstr str; + str.name = (char*)(dent+1); + str.len = be16_to_cpu(dent->de_name_len); + str.hash = be32_to_cpu(dent->de_hash); + new = gfs2_dirent_alloc(inode, nbh, &str); + if (IS_ERR(new)) { + error = PTR_ERR(new); + break; + } + + new->de_inum = dent->de_inum; /* No endian worries */ + new->de_type = dent->de_type; /* No endian worries */ + be16_add_cpu(&nleaf->lf_entries, 1); + + dirent_del(dip, obh, prev, dent); + + if (!oleaf->lf_entries) + gfs2_consist_inode(dip); + be16_add_cpu(&oleaf->lf_entries, -1); + + if (!prev) + prev = dent; + + moved = 1; + } else { + prev = dent; + } + dent = next; + } while (dent); + + oleaf->lf_depth = nleaf->lf_depth; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_add_inode_blocks(&dip->i_inode, 1); + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + } + + brelse(obh); + brelse(nbh); + + return error; + +fail_lpfree: + kfree(lp); + +fail_brelse: + brelse(obh); + brelse(nbh); + return error; +} + +/** + * dir_double_exhash - Double size of ExHash table + * @dip: The GFS2 dinode + * + * Returns: 0 on success, error code on failure + */ + +static int dir_double_exhash(struct gfs2_inode *dip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *dibh; + u32 hsize; + u64 *buf; + u64 *from, *to; + u64 block; + u64 disksize = i_size_read(&dip->i_inode); + int x; + int error = 0; + + hsize = 1 << dip->i_depth; + if (hsize * sizeof(u64) != disksize) { + gfs2_consist_inode(dip); + return -EIO; + } + + /* Allocate both the "from" and "to" buffers in one big chunk */ + + buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS); + if (!buf) + return -ENOMEM; + + for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) { + error = gfs2_dir_read_data(dip, (char *)buf, + block * sdp->sd_hash_bsize, + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto fail; + } + + from = buf; + to = (u64 *)((char *)buf + sdp->sd_hash_bsize); + + for (x = sdp->sd_hash_ptrs; x--; from++) { + *to++ = *from; /* No endianess worries */ + *to++ = *from; + } + + error = gfs2_dir_write_data(dip, + (char *)buf + sdp->sd_hash_bsize, + block * sdp->sd_sb.sb_bsize, + sdp->sd_sb.sb_bsize); + if (error != sdp->sd_sb.sb_bsize) { + if (error >= 0) + error = -EIO; + goto fail; + } + } + + kfree(buf); + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(sdp, !error)) { + dip->i_depth++; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + } + + return error; + +fail: + kfree(buf); + return error; +} + +/** + * compare_dents - compare directory entries by hash value + * @a: first dent + * @b: second dent + * + * When comparing the hash entries of @a to 
@b: + * gt: returns 1 + * lt: returns -1 + * eq: returns 0 + */ + +static int compare_dents(const void *a, const void *b) +{ + const struct gfs2_dirent *dent_a, *dent_b; + u32 hash_a, hash_b; + int ret = 0; + + dent_a = *(const struct gfs2_dirent **)a; + hash_a = be32_to_cpu(dent_a->de_hash); + + dent_b = *(const struct gfs2_dirent **)b; + hash_b = be32_to_cpu(dent_b->de_hash); + + if (hash_a > hash_b) + ret = 1; + else if (hash_a < hash_b) + ret = -1; + else { + unsigned int len_a = be16_to_cpu(dent_a->de_name_len); + unsigned int len_b = be16_to_cpu(dent_b->de_name_len); + + if (len_a > len_b) + ret = 1; + else if (len_a < len_b) + ret = -1; + else + ret = memcmp(dent_a + 1, dent_b + 1, len_a); + } + + return ret; +} + +/** + * do_filldir_main - read out directory entries + * @dip: The GFS2 inode + * @offset: The offset in the file to read from + * @opaque: opaque data to pass to filldir + * @filldir: The function to pass entries to + * @darr: an array of struct gfs2_dirent pointers to read + * @entries: the number of entries in darr + * @copied: pointer to int that's non-zero if a entry has been copied out + * + * Jump through some hoops to make sure that if there are hash collsions, + * they are read out at the beginning of a buffer. We want to minimize + * the possibility that they will fall into different readdir buffers or + * that someone will want to seek to that location. + * + * Returns: errno, >0 on exception from filldir + */ + +static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, + void *opaque, filldir_t filldir, + const struct gfs2_dirent **darr, u32 entries, + int *copied) +{ + const struct gfs2_dirent *dent, *dent_next; + u64 off, off_next; + unsigned int x, y; + int run = 0; + int error = 0; + + sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); + + dent_next = darr[0]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + for (x = 0, y = 1; x < entries; x++, y++) { + dent = dent_next; + off = off_next; + + if (y < entries) { + dent_next = darr[y]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + if (off < *offset) + continue; + *offset = off; + + if (off_next == off) { + if (*copied && !run) + return 1; + run = 1; + } else + run = 0; + } else { + if (off < *offset) + continue; + *offset = off; + } + + error = filldir(opaque, (const char *)(dent + 1), + be16_to_cpu(dent->de_name_len), + off, be64_to_cpu(dent->de_inum.no_addr), + be16_to_cpu(dent->de_type)); + if (error) + return 1; + + *copied = 1; + } + + /* Increment the *offset by one, so the next time we come into the + do_filldir fxn, we get the next entry instead of the last one in the + current leaf */ + + (*offset)++; + + return 0; +} + +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + return ptr; +} + +static void gfs2_free_sort_buffer(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + +static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir, int *copied, unsigned *depth, + u64 leaf_no) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_leaf *lf; + unsigned entries = 0, entries2 = 0; + unsigned leaves = 0; + const struct gfs2_dirent **darr, *dent; + struct dirent_gather g; + struct 
buffer_head **larr; + int leaf = 0; + int error, i; + u64 lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out; + lf = (struct gfs2_leaf *)bh->b_data; + if (leaves == 0) + *depth = be16_to_cpu(lf->lf_depth); + entries += be16_to_cpu(lf->lf_entries); + leaves++; + lfn = be64_to_cpu(lf->lf_next); + brelse(bh); + } while(lfn); + + if (!entries) + return 0; + + error = -ENOMEM; + /* + * The extra 99 entries are not normally used, but are a buffer + * zone in case the number of entries in the leaf is corrupt. + * 99 is the maximum number of entries that can fit in a single + * leaf block. + */ + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); + if (!larr) + goto out; + darr = (const struct gfs2_dirent **)(larr + leaves); + g.pdent = darr; + g.offset = 0; + lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out_free; + lf = (struct gfs2_leaf *)bh->b_data; + lfn = be64_to_cpu(lf->lf_next); + if (lf->lf_entries) { + entries2 += be16_to_cpu(lf->lf_entries); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_gather, NULL, &g); + error = PTR_ERR(dent); + if (IS_ERR(dent)) + goto out_free; + if (entries2 != g.offset) { + fs_warn(sdp, "Number of entries corrupt in dir " + "leaf %llu, entries2 (%u) != " + "g.offset (%u)\n", + (unsigned long long)bh->b_blocknr, + entries2, g.offset); + + error = -EIO; + goto out_free; + } + error = 0; + larr[leaf++] = bh; + } else { + brelse(bh); + } + } while(lfn); + + BUG_ON(entries2 != entries); + error = do_filldir_main(ip, offset, opaque, filldir, darr, + entries, copied); +out_free: + for(i = 0; i < leaf; i++) + brelse(larr[i]); + gfs2_free_sort_buffer(larr); +out: + return error; +} + +/** + * dir_e_read - Reads the entries from a directory into a filldir buffer + * @dip: dinode pointer + * @offset: the hash of the last entry read shifted to the right once + * @opaque: buffer for the filldir function to fill + * @filldir: points to the filldir function to use + * + * Returns: errno + */ + +static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u32 hsize, len = 0; + u32 ht_offset, lp_offset, ht_offset_cur = -1; + u32 hash, index; + __be64 *lp; + int copied = 0; + int error = 0; + unsigned depth = 0; + + hsize = 1 << dip->i_depth; + if (hsize * sizeof(u64) != i_size_read(inode)) { + gfs2_consist_inode(dip); + return -EIO; + } + + hash = gfs2_dir_offset2hash(*offset); + index = hash >> (32 - dip->i_depth); + + lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); + if (!lp) + return -ENOMEM; + + while (index < hsize) { + lp_offset = index & (sdp->sd_hash_ptrs - 1); + ht_offset = index - lp_offset; + + if (ht_offset_cur != ht_offset) { + error = gfs2_dir_read_data(dip, (char *)lp, + ht_offset * sizeof(__be64), + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto out; + } + ht_offset_cur = ht_offset; + } + + error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, + &copied, &depth, + be64_to_cpu(lp[lp_offset])); + if (error) + break; + + len = 1 << (dip->i_depth - depth); + index = (index & ~(len - 1)) + len; + } + +out: + kfree(lp); + if (error > 0) + error = 0; + return error; +} + +int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct dirent_gather g; + const struct 
gfs2_dirent **darr, *dent; + struct buffer_head *dibh; + int copied = 0; + int error; + + if (!dip->i_entries) + return 0; + + if (dip->i_diskflags & GFS2_DIF_EXHASH) + return dir_e_read(inode, offset, opaque, filldir); + + if (!gfs2_is_stuffed(dip)) { + gfs2_consist_inode(dip); + return -EIO; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + error = -ENOMEM; + /* 96 is max number of dirents which can be stuffed into an inode */ + darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); + if (darr) { + g.pdent = darr; + g.offset = 0; + dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, + gfs2_dirent_gather, NULL, &g); + if (IS_ERR(dent)) { + error = PTR_ERR(dent); + goto out; + } + if (dip->i_entries != g.offset) { + fs_warn(sdp, "Number of entries corrupt in dir %llu, " + "ip->i_entries (%u) != g.offset (%u)\n", + (unsigned long long)dip->i_no_addr, + dip->i_entries, + g.offset); + error = -EIO; + goto out; + } + error = do_filldir_main(dip, offset, opaque, filldir, darr, + dip->i_entries, &copied); +out: + kfree(darr); + } + + if (error > 0) + error = 0; + + brelse(dibh); + + return error; +} + +/** + * gfs2_dir_search - Search a directory + * @dip: The GFS2 inode + * @filename: + * @inode: + * + * This routine searches a directory for a file or another directory. + * Assumes a glock is held on dip. + * + * Returns: errno + */ + +struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct inode *inode; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return ERR_CAST(dent); + inode = gfs2_inode_lookup(dir->i_sb, + be16_to_cpu(dent->de_type), + be64_to_cpu(dent->de_inum.no_addr), + be64_to_cpu(dent->de_inum.no_formal_ino), 0); + brelse(bh); + return inode; + } + return ERR_PTR(-ENOENT); +} + +int gfs2_dir_check(struct inode *dir, const struct qstr *name, + const struct gfs2_inode *ip) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int ret = -ENOENT; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + if (ip) { + if (be64_to_cpu(dent->de_inum.no_addr) != ip->i_no_addr) + goto out; + if (be64_to_cpu(dent->de_inum.no_formal_ino) != + ip->i_no_formal_ino) + goto out; + if (unlikely(IF2DT(ip->i_inode.i_mode) != + be16_to_cpu(dent->de_type))) { + gfs2_consist_inode(GFS2_I(dir)); + ret = -EIO; + goto out; + } + } + ret = 0; +out: + brelse(bh); + } + return ret; +} + +static int dir_new_leaf(struct inode *inode, const struct qstr *name) +{ + struct buffer_head *bh, *obh; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_leaf *leaf, *oleaf; + int error; + u32 index; + u64 bn; + + index = name->hash >> (32 - ip->i_depth); + error = get_first_leaf(ip, index, &obh); + if (error) + return error; + do { + oleaf = (struct gfs2_leaf *)obh->b_data; + bn = be64_to_cpu(oleaf->lf_next); + if (!bn) + break; + brelse(obh); + error = get_leaf(ip, bn, &obh); + if (error) + return error; + } while(1); + + gfs2_trans_add_bh(ip->i_gl, obh, 1); + + leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); + if (!leaf) { + brelse(obh); + return -ENOSPC; + } + oleaf->lf_next = cpu_to_be64(bh->b_blocknr); + brelse(bh); + brelse(obh); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_add_inode_blocks(&ip->i_inode, 1); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + return 0; +} + +/** 
+ * gfs2_dir_add - Add new filename into directory + * @dip: The GFS2 inode + * @filename: The new name + * @inode: The inode number of the entry + * @type: The type of the entry + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_add(struct inode *inode, const struct qstr *name, + const struct gfs2_inode *nip) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_leaf *leaf; + int error; + + while(1) { + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, + &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + dent = gfs2_init_dirent(inode, dent, name, bh); + gfs2_inum_out(nip, dent); + dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + if (ip->i_diskflags & GFS2_DIF_EXHASH) { + leaf = (struct gfs2_leaf *)bh->b_data; + be16_add_cpu(&leaf->lf_entries, 1); + } + brelse(bh); + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + break; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_entries++; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + if (S_ISDIR(nip->i_inode.i_mode)) + inc_nlink(&ip->i_inode); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + error = 0; + break; + } + if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = dir_make_exhash(inode); + if (error) + break; + continue; + } + error = dir_split_leaf(inode, name); + if (error == 0) + continue; + if (error < 0) + break; + if (ip->i_depth < GFS2_DIR_MAX_DEPTH) { + error = dir_double_exhash(ip); + if (error) + break; + error = dir_split_leaf(inode, name); + if (error < 0) + break; + if (error == 0) + continue; + } + error = dir_new_leaf(inode, name); + if (!error) + continue; + error = -ENOSPC; + break; + } + return error; +} + + +/** + * gfs2_dir_del - Delete a directory entry + * @dip: The GFS2 inode + * @filename: The filename + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) +{ + const struct qstr *name = &dentry->d_name; + struct gfs2_dirent *dent, *prev = NULL; + struct buffer_head *bh; + int error; + + /* Returns _either_ the entry (if its first in block) or the + previous entry otherwise */ + dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) { + gfs2_consist_inode(dip); + return PTR_ERR(dent); + } + /* If not first in block, adjust pointers accordingly */ + if (gfs2_dirent_find(dent, name, NULL) == 0) { + prev = dent; + dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len)); + } + + dirent_del(dip, bh, prev, dent); + if (dip->i_diskflags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; + u16 entries = be16_to_cpu(leaf->lf_entries); + if (!entries) + gfs2_consist_inode(dip); + leaf->lf_entries = cpu_to_be16(--entries); + } + brelse(bh); + + error = gfs2_meta_inode_buffer(dip, &bh); + if (error) + return error; + + if (!dip->i_entries) + gfs2_consist_inode(dip); + gfs2_trans_add_bh(dip->i_gl, bh, 1); + dip->i_entries--; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + if (S_ISDIR(dentry->d_inode->i_mode)) + drop_nlink(&dip->i_inode); + gfs2_dinode_out(dip, bh->b_data); + brelse(bh); + mark_inode_dirty(&dip->i_inode); + + return error; +} + +/** + * gfs2_dir_mvino - Change inode number of directory entry + * @dip: The GFS2 inode + * @filename: + * @new_inode: + * + * This routine changes the inode number of a directory entry. 
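+ * Only the inum and type fields of the entry are rewritten in place; the
+ * name and its position in the directory are left untouched.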
It's used + * by rename to change ".." when a directory is moved. + * Assumes a glock is held on dvp. + * + * Returns: errno + */ + +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int error; + + dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + + gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_inum_out(nip, dent); + dent->de_type = cpu_to_be16(new_type); + + if (dip->i_diskflags & GFS2_DIF_EXHASH) { + brelse(bh); + error = gfs2_meta_inode_buffer(dip, &bh); + if (error) + return error; + gfs2_trans_add_bh(dip->i_gl, bh, 1); + } + + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(dip, bh->b_data); + brelse(bh); + return 0; +} + +/** + * leaf_dealloc - Deallocate a directory leaf + * @dip: the directory + * @index: the hash table offset in the directory + * @len: the number of pointers to this leaf + * @leaf_no: the leaf number + * @leaf_bh: buffer_head for the starting leaf + * last_dealloc: 1 if this is the final dealloc for the leaf, else 0 + * + * Returns: errno + */ + +static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, + u64 leaf_no, struct buffer_head *leaf_bh, + int last_dealloc) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_leaf *tmp_leaf; + struct gfs2_rgrp_list rlist; + struct buffer_head *bh, *dibh; + u64 blk, nblk; + unsigned int rg_blocks = 0, l_blocks = 0; + char *ht; + unsigned int x, size = len * sizeof(u64); + int error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + ht = kzalloc(size, GFP_NOFS); + if (!ht) + return -ENOMEM; + + if (!gfs2_alloc_get(dip)) { + error = -ENOMEM; + goto out; + } + + error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_put; + + error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); + if (error) + goto out_qs; + + /* Count the number of leaves */ + bh = leaf_bh; + + for (blk = leaf_no; blk; blk = nblk) { + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_rlist; + } + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + if (blk != leaf_no) + brelse(bh); + + gfs2_rlist_add(sdp, &rlist, blk); + l_blocks++; + } + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, + rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) + + RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks); + if (error) + goto out_rg_gunlock; + + bh = leaf_bh; + + for (blk = leaf_no; blk; blk = nblk) { + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_end_trans; + } + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + if (blk != leaf_no) + brelse(bh); + + gfs2_free_meta(dip, blk, 1); + gfs2_add_inode_blocks(&dip->i_inode, -1); + } + + error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); + if (error != size) { + if (error >= 0) + error = -EIO; + goto out_end_trans; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_end_trans; + + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + /* On the last dealloc, 
make this a regular file in case we crash. + (We don't want to free these blocks a second time.) */ + if (last_dealloc) + dip->i_inode.i_mode = S_IFREG; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + +out_end_trans: + gfs2_trans_end(sdp); +out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist: + gfs2_rlist_free(&rlist); + gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); +out_qs: + gfs2_quota_unhold(dip); +out_put: + gfs2_alloc_put(dip); +out: + kfree(ht); + return error; +} + +/** + * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory + * @dip: the directory + * + * Dealloc all on-disk directory leaves to FREEMETA state + * Change on-disk inode type to "regular file" + * + * Returns: errno + */ + +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *bh; + struct gfs2_leaf *leaf; + u32 hsize, len; + u32 ht_offset, lp_offset, ht_offset_cur = -1; + u32 index = 0, next_index; + __be64 *lp; + u64 leaf_no; + int error = 0, last; + + hsize = 1 << dip->i_depth; + if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) { + gfs2_consist_inode(dip); + return -EIO; + } + + lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); + if (!lp) + return -ENOMEM; + + while (index < hsize) { + lp_offset = index & (sdp->sd_hash_ptrs - 1); + ht_offset = index - lp_offset; + + if (ht_offset_cur != ht_offset) { + error = gfs2_dir_read_data(dip, (char *)lp, + ht_offset * sizeof(__be64), + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto out; + } + ht_offset_cur = ht_offset; + } + + leaf_no = be64_to_cpu(lp[lp_offset]); + if (leaf_no) { + error = get_leaf(dip, leaf_no, &bh); + if (error) + goto out; + leaf = (struct gfs2_leaf *)bh->b_data; + len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); + + next_index = (index & ~(len - 1)) + len; + last = ((next_index >= hsize) ? 1 : 0); + error = leaf_dealloc(dip, index, len, leaf_no, bh, + last); + brelse(bh); + if (error) + goto out; + index = next_index; + } else + index++; + } + + if (index != hsize) { + gfs2_consist_inode(dip); + error = -EIO; + } + +out: + kfree(lp); + + return error; +} + +/** + * gfs2_diradd_alloc_required - find if adding entry will require an allocation + * @ip: the file being written to + * @filname: the filename that's going to be added + * + * Returns: 1 if alloc required, 0 if not, -ve on error + */ + +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +{ + struct gfs2_dirent *dent; + struct buffer_head *bh; + + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); + if (!dent) { + return 1; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + brelse(bh); + return 0; +} + diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h new file mode 100644 index 00000000..e686af11 --- /dev/null +++ b/fs/gfs2/dir.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +#include +#include + +struct inode; +struct gfs2_inode; +struct gfs2_inum; + +extern struct inode *gfs2_dir_search(struct inode *dir, + const struct qstr *filename); +extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); +extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inode *ip); +extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); +extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir); +extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type); + +extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); + +extern int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename); +extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); + +static inline u32 gfs2_disk_hash(const char *data, int len) +{ + return crc32_le((u32)~0, data, len) ^ (u32)~0; +} + + +static inline void gfs2_str2qstr(struct qstr *name, const char *fname) +{ + name->name = fname; + name->len = strlen(fname); + name->hash = gfs2_disk_hash(name->name, name->len); +} + +/* N.B. This probably ought to take inum & type as args as well */ +static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent) +{ + dent->de_inum.no_addr = cpu_to_be64(0); + dent->de_inum.no_formal_ino = cpu_to_be64(0); + dent->de_hash = cpu_to_be32(name->hash); + dent->de_rec_len = cpu_to_be16(reclen); + dent->de_name_len = cpu_to_be16(name->len); + dent->de_type = cpu_to_be16(0); + memset(dent->__pad, 0, sizeof(dent->__pad)); + memcpy(dent + 1, name->name, name->len); +} + +extern struct qstr gfs2_qdot; +extern struct qstr gfs2_qdotdot; + +#endif /* __DIR_DOT_H__ */ diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c new file mode 100644 index 00000000..fe9945f2 --- /dev/null +++ b/fs/gfs2/export.c @@ -0,0 +1,206 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "super.h" +#include "rgrp.h" +#include "util.h" + +#define GFS2_SMALL_FH_SIZE 4 +#define GFS2_LARGE_FH_SIZE 8 +#define GFS2_OLD_FH_SIZE 10 + +static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, + int connectable) +{ + __be32 *fh = (__force __be32 *)p; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct gfs2_inode *ip = GFS2_I(inode); + + if (connectable && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; + return 255; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return 255; + } + + fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[2] = cpu_to_be32(ip->i_no_addr >> 32); + fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_SMALL_FH_SIZE; + + if (!connectable || inode == sb->s_root->d_inode) + return *len; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode; + ip = GFS2_I(inode); + igrab(inode); + spin_unlock(&dentry->d_lock); + + fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[6] = cpu_to_be32(ip->i_no_addr >> 32); + fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_LARGE_FH_SIZE; + + iput(inode); + + return *len; +} + +struct get_name_filldir { + struct gfs2_inum_host inum; + char *name; +}; + +static int get_name_filldir(void *opaque, const char *name, int length, + loff_t offset, u64 inum, unsigned int type) +{ + struct get_name_filldir *gnfd = opaque; + + if (inum != gnfd->inum.no_addr) + return 0; + + memcpy(gnfd->name, name, length); + gnfd->name[length] = 0; + + return 1; +} + +static int gfs2_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *dir = parent->d_inode; + struct inode *inode = child->d_inode; + struct gfs2_inode *dip, *ip; + struct get_name_filldir gnfd; + struct gfs2_holder gh; + u64 offset = 0; + int error; + + if (!dir) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode) || !inode) + return -EINVAL; + + dip = GFS2_I(dir); + ip = GFS2_I(inode); + + *name = 0; + gnfd.inum.no_addr = ip->i_no_addr; + gnfd.inum.no_formal_ino = ip->i_no_formal_ino; + gnfd.name = name; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); + if (error) + return error; + + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); + + gfs2_glock_dq_uninit(&gh); + + if (!error && !*name) + error = -ENOENT; + + return error; +} + +static struct dentry *gfs2_get_parent(struct dentry *child) +{ + return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); +} + +static struct dentry *gfs2_get_dentry(struct super_block *sb, + struct gfs2_inum_host *inum) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct inode *inode; + + inode = gfs2_ilookup(sb, inum->no_addr, 0); + if (inode) { + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { + iput(inode); + return ERR_PTR(-ESTALE); + } + goto out_inode; + } + + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) + return ERR_CAST(inode); + +out_inode: + return d_obtain_alias(inode); +} + +static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host this; + __be32 *fh = (__force __be32 *)fid->raw; + + switch 
(fh_type) { + case GFS2_SMALL_FH_SIZE: + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; + this.no_formal_ino |= be32_to_cpu(fh[1]); + this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; + this.no_addr |= be32_to_cpu(fh[3]); + return gfs2_get_dentry(sb, &this); + default: + return NULL; + } +} + +static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host parent; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; + parent.no_formal_ino |= be32_to_cpu(fh[5]); + parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; + parent.no_addr |= be32_to_cpu(fh[7]); + return gfs2_get_dentry(sb, &parent); + default: + return NULL; + } +} + +const struct export_operations gfs2_export_ops = { + .encode_fh = gfs2_encode_fh, + .fh_to_dentry = gfs2_fh_to_dentry, + .fh_to_parent = gfs2_fh_to_parent, + .get_name = gfs2_get_name, + .get_parent = gfs2_get_parent, +}; + diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c new file mode 100644 index 00000000..a9f5cbe4 --- /dev/null +++ b/fs/gfs2/file.c @@ -0,0 +1,1112 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_llseek - seek to a location in a file + * @file: the file + * @offset: the offset + * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * + * SEEK_END requires the glock for the file because it references the + * file's size. 
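+ * SEEK_SET and SEEK_CUR need no lock and go straight to
+ * generic_file_llseek_unlocked().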
+ * + * Returns: The new offset, or errno + */ + +static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + loff_t error; + + if (origin == 2) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (!error) { + error = generic_file_llseek_unlocked(file, offset, origin); + gfs2_glock_dq_uninit(&i_gh); + } + } else + error = generic_file_llseek_unlocked(file, offset, origin); + + return error; +} + +/** + * gfs2_readdir - Read directory entries from a directory + * @file: The directory to read from + * @dirent: Buffer for dirents + * @filldir: Function used to do the copying + * + * Returns: errno + */ + +static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + u64 offset = file->f_pos; + int error; + + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + error = gfs2_glock_nq(&d_gh); + if (error) { + gfs2_holder_uninit(&d_gh); + return error; + } + + error = gfs2_dir_read(dir, &offset, dirent, filldir); + + gfs2_glock_dq_uninit(&d_gh); + + file->f_pos = offset; + + return error; +} + +/** + * fsflags_cvt + * @table: A table of 32 u32 flags + * @val: a 32 bit value to convert + * + * This function can be used to convert between fsflags values and + * GFS2's own flags values. + * + * Returns: the converted flags + */ +static u32 fsflags_cvt(const u32 *table, u32 val) +{ + u32 res = 0; + while(val) { + if (val & 1) + res |= *table; + table++; + val >>= 1; + } + return res; +} + +static const u32 fsflags_to_gfs2[32] = { + [3] = GFS2_DIF_SYNC, + [4] = GFS2_DIF_IMMUTABLE, + [5] = GFS2_DIF_APPENDONLY, + [7] = GFS2_DIF_NOATIME, + [12] = GFS2_DIF_EXHASH, + [14] = GFS2_DIF_INHERIT_JDATA, +}; + +static const u32 gfs2_to_fsflags[32] = { + [gfs2fl_Sync] = FS_SYNC_FL, + [gfs2fl_Immutable] = FS_IMMUTABLE_FL, + [gfs2fl_AppendOnly] = FS_APPEND_FL, + [gfs2fl_NoAtime] = FS_NOATIME_FL, + [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, +}; + +static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + u32 fsflags; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (error) + return error; + + fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); + if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) + fsflags |= FS_JOURNAL_DATA_FL; + if (put_user(fsflags, ptr)) + error = -EFAULT; + + gfs2_glock_dq(&gh); + gfs2_holder_uninit(&gh); + return error; +} + +void gfs2_set_inode_flags(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int flags = inode->i_flags; + + flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) + flags |= S_IMMUTABLE; + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) + flags |= S_APPEND; + if (ip->i_diskflags & GFS2_DIF_NOATIME) + flags |= S_NOATIME; + if (ip->i_diskflags & GFS2_DIF_SYNC) + flags |= S_SYNC; + inode->i_flags = flags; +} + +/* Flags that can be set by user space */ +#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ + GFS2_DIF_IMMUTABLE| \ + GFS2_DIF_APPENDONLY| \ + GFS2_DIF_NOATIME| \ + GFS2_DIF_SYNC| \ + GFS2_DIF_SYSTEM| \ + GFS2_DIF_INHERIT_JDATA) + +/** + * gfs2_set_flags - set flags on an inode + * @inode: The inode + * @flags: 
The flags to set + * @mask: Indicates which flags are valid + * + */ +static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int error; + u32 new_flags, flags; + + error = mnt_want_write(filp->f_path.mnt); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_drop_write; + + error = -EACCES; + if (!inode_owner_or_capable(inode)) + goto out; + + error = 0; + flags = ip->i_diskflags; + new_flags = (flags & ~mask) | (reqflags & mask); + if ((new_flags ^ flags) == 0) + goto out; + + error = -EINVAL; + if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) + goto out; + + error = -EPERM; + if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) + goto out; + if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) + goto out; + if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + if (!IS_IMMUTABLE(inode)) { + error = gfs2_permission(inode, MAY_WRITE, 0); + if (error) + goto out; + } + if ((flags ^ new_flags) & GFS2_DIF_JDATA) { + if (flags & GFS2_DIF_JDATA) + gfs2_log_flush(sdp, ip->i_gl); + error = filemap_fdatawrite(inode->i_mapping); + if (error) + goto out; + error = filemap_fdatawait(inode->i_mapping); + if (error) + goto out; + } + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + goto out; + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_trans_end; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_diskflags = new_flags; + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + gfs2_set_inode_flags(inode); + gfs2_set_aops(inode); +out_trans_end: + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); +out_drop_write: + mnt_drop_write(filp->f_path.mnt); + return error; +} + +static int gfs2_set_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + u32 fsflags, gfsflags; + + if (get_user(fsflags, ptr)) + return -EFAULT; + + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); + if (!S_ISDIR(inode->i_mode)) { + if (gfsflags & GFS2_DIF_INHERIT_JDATA) + gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~0); + } + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); +} + +static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case FS_IOC_GETFLAGS: + return gfs2_get_flags(filp, (u32 __user *)arg); + case FS_IOC_SETFLAGS: + return gfs2_set_flags(filp, (u32 __user *)arg); + } + return -ENOTTY; +} + +/** + * gfs2_allocate_page_backing - Use bmap to allocate blocks + * @page: The (locked) page to allocate backing for + * + * We try to allocate all the blocks required for the page in + * one go. This might fail for various reasons, so we keep + * trying until all the blocks to back this page are allocated. + * If some of the blocks are already allocated, thats ok too. 
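+ * Each pass asks gfs2_block_map() to map (and, where needed, allocate)
+ * as much of the remaining range as it can, then advances past whatever
+ * was mapped and repeats until the whole page is backed.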
+ */ + +static int gfs2_allocate_page_backing(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head bh; + unsigned long size = PAGE_CACHE_SIZE; + u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(inode, lblock, &bh, 1); + if (!buffer_mapped(&bh)) + return -EIO; + size -= bh.b_size; + lblock += (bh.b_size >> inode->i_blkbits); + } while(size > 0); + return 0; +} + +/** + * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable + * @vma: The virtual memory area + * @page: The page which is about to become writable + * + * When the page becomes writable, we need to ensure that we have + * blocks allocated on disk to back that page. + */ + +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned long last_index; + u64 pos = page->index << PAGE_CACHE_SHIFT; + unsigned int data_blocks, ind_blocks, rblocks; + struct gfs2_holder gh; + struct gfs2_alloc *al; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out; + + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) + goto out_unlock; + ret = -ENOMEM; + al = gfs2_alloc_get(ip); + if (al == NULL) + goto out_unlock; + + ret = gfs2_quota_lock_check(ip); + if (ret) + goto out_alloc_put; + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + ret = gfs2_inplace_reserve(ip); + if (ret) + goto out_quota_unlock; + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) { + rblocks += RES_STATFS + RES_QUOTA; + rblocks += gfs2_rg_blocks(al); + } + ret = gfs2_trans_begin(sdp, rblocks, 0); + if (ret) + goto out_trans_fail; + + lock_page(page); + ret = -EINVAL; + last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; + if (page->index > last_index) + goto out_unlock_page; + ret = 0; + if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) + goto out_unlock_page; + if (gfs2_is_stuffed(ip)) { + ret = gfs2_unstuff_dinode(ip, page); + if (ret) + goto out_unlock_page; + } + ret = gfs2_allocate_page_backing(page); + +out_unlock_page: + unlock_page(page); + gfs2_trans_end(sdp); +out_trans_fail: + gfs2_inplace_release(ip); +out_quota_unlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else if (ret) + ret = VM_FAULT_SIGBUS; + return ret; +} + +static const struct vm_operations_struct gfs2_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = gfs2_page_mkwrite, +}; + +/** + * gfs2_mmap - + * @file: The file to map + * @vma: The VMA which described the mapping + * + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). 
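+ * The shared glock taken below covers only the file_accessed() call;
+ * the write fault path (gfs2_page_mkwrite()) takes its own exclusive
+ * glock when it runs.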
+ * + * Returns: 0 + */ + +static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + + if (!(file->f_flags & O_NOATIME) && + !IS_NOATIME(&ip->i_inode)) { + struct gfs2_holder i_gh; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error == 0) { + file_accessed(file); + gfs2_glock_dq(&i_gh); + } + gfs2_holder_uninit(&i_gh); + if (error) + return error; + } + vma->vm_ops = &gfs2_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + + return 0; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + struct gfs2_file *fp; + int error; + + fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + mutex_init(&fp->f_fl_mutex); + + gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + file->private_data = fp; + + if (S_ISREG(ip->i_inode.i_mode)) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + goto fail; + + if (!(file->f_flags & O_LARGEFILE) && + i_size_read(inode) > MAX_NON_LFS) { + error = -EOVERFLOW; + goto fail_gunlock; + } + + gfs2_glock_dq_uninit(&i_gh); + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + file->private_data = NULL; + kfree(fp); + return error; +} + +/** + * gfs2_close - called to close a struct file + * @inode: the inode the struct file belongs to + * @file: the struct file being closed + * + * Returns: errno + */ + +static int gfs2_close(struct inode *inode, struct file *file) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_file *fp; + + fp = file->private_data; + file->private_data = NULL; + + if (gfs2_assert_warn(sdp, fp)) + return -EIO; + + kfree(fp); + + return 0; +} + +/** + * gfs2_fsync - sync the dirty data for a file (across the cluster) + * @file: the file that points to the dentry (we ignore this) + * @datasync: set if we can ignore timestamp changes + * + * The VFS will flush data for us. We only need to worry + * about metadata here. + * + * Returns: errno + */ + +static int gfs2_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + if (datasync) + sync_state &= ~I_DIRTY_SYNC; + + if (sync_state) { + ret = sync_inode_metadata(inode, 1); + if (ret) + return ret; + gfs2_ail_flush(ip->i_gl); + } + + return 0; +} + +/** + * gfs2_file_aio_write - Perform a write to a file + * @iocb: The io context + * @iov: The data to write + * @nr_segs: Number of @iov segments + * @pos: The file position + * + * We have to do a lock/unlock here to refresh the inode size for + * O_APPEND writes, otherwise we can land up writing at the wrong + * offset. There is still a race, but provided the app is using its + * own file locking, this will make O_APPEND work as expected. 
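+ * Acquiring and then dropping the shared glock is intended to bring the
+ * cached inode size up to date with any extension made on another node,
+ * so that generic_file_aio_write() starts from the correct end-of-file
+ * offset.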
+ * + */ + +static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + + if (file->f_flags & O_APPEND) { + struct dentry *dentry = file->f_dentry; + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder gh; + int ret; + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + return ret; + gfs2_glock_dq_uninit(&gh); + } + + return generic_file_aio_write(iocb, iov, nr_segs, pos); +} + +static int empty_write_end(struct page *page, unsigned from, + unsigned to, int mode) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + unsigned offset, blksize = 1 << inode->i_blkbits; + pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; + + zero_user(page, from, to-from); + mark_page_accessed(page); + + if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) { + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); + return 0; + } + + offset = 0; + bh = page_buffers(page); + while (offset < to) { + if (offset >= from) { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + clear_buffer_new(bh); + write_dirty_buffer(bh, WRITE); + } + offset += blksize; + bh = bh->b_this_page; + } + + offset = 0; + bh = page_buffers(page); + while (offset < to) { + if (offset >= from) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + } + offset += blksize; + bh = bh->b_this_page; + } + return 0; +} + +static int needs_empty_write(sector_t block, struct inode *inode) +{ + int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + + bh_map.b_size = 1 << inode->i_blkbits; + error = gfs2_block_map(inode, block, &bh_map, 0); + if (unlikely(error)) + return error; + return !buffer_mapped(&bh_map); +} + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to, + int mode) +{ + struct inode *inode = page->mapping->host; + unsigned start, end, next, blksize; + sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + int ret; + + blksize = 1 << inode->i_blkbits; + next = end = 0; + while (next < from) { + next += blksize; + block++; + } + start = next; + do { + next += blksize; + ret = needs_empty_write(block, inode); + if (unlikely(ret < 0)) + return ret; + if (ret == 0) { + if (end) { + ret = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(ret)) + return ret; + ret = empty_write_end(page, start, end, mode); + if (unlikely(ret)) + return ret; + end = 0; + } + start = next; + } + else + end = next; + block++; + } while (next < to); + + if (end) { + ret = __block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(ret)) + return ret; + ret = empty_write_end(page, start, end, mode); + if (unlikely(ret)) + return ret; + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, 
dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to, mode); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + offset &= bsize_mask; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if 
(gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + +/** + * gfs2_setlease - acquire/release a file lease + * @file: the file pointer + * @arg: lease type + * @fl: file lock + * + * We don't currently have a way to enforce a lease across the whole + * cluster; until we do, disable leases (by just returning -EINVAL), + * unless the administrator has requested purely local locking. + * + * Locking: called under lock_flocks + * + * Returns: errno + */ + +static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) +{ + return -EINVAL; +} + +/** + * gfs2_lock - acquire/release a posix lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + if (cmd == F_CANCELLK) { + /* Hack: */ + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + if (IS_GETLK(cmd)) + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); +} + +static int do_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); + struct gfs2_glock *gl; + unsigned int state; + int flags; + int error = 0; + + state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + flags = (IS_SETLKW(cmd) ? 
0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + + mutex_lock(&fp->f_fl_mutex); + + gl = fl_gh->gh_gl; + if (gl) { + if (fl_gh->gh_state == state) + goto out; + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + gfs2_glock_dq_wait(fl_gh); + gfs2_holder_reinit(state, flags, fl_gh); + } else { + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, + &gfs2_flock_glops, CREATE, &gl); + if (error) + goto out; + gfs2_holder_init(gl, state, flags, fl_gh); + gfs2_glock_put(gl); + } + error = gfs2_glock_nq(fl_gh); + if (error) { + gfs2_holder_uninit(fl_gh); + if (error == GLR_TRYFAILED) + error = -EAGAIN; + } else { + error = flock_lock_file_wait(file, fl); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + } + +out: + mutex_unlock(&fp->f_fl_mutex); + return error; +} + +static void do_unflock(struct file *file, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + + mutex_lock(&fp->f_fl_mutex); + flock_lock_file_wait(file, fl); + if (fl_gh->gh_gl) { + gfs2_glock_dq_wait(fl_gh); + gfs2_holder_uninit(fl_gh); + } + mutex_unlock(&fp->f_fl_mutex); +} + +/** + * gfs2_flock - acquire/release a flock lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (fl->fl_type & LOCK_MAND) + return -EOPNOTSUPP; + + if (fl->fl_type == F_UNLCK) { + do_unflock(file, fl); + return 0; + } else { + return do_flock(file, cmd, fl); + } +} + +const struct file_operations gfs2_file_fops = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = gfs2_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .setlease = gfs2_setlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .llseek = default_llseek, +}; + +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +const struct file_operations gfs2_file_fops_nolock = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = gfs2_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .setlease = generic_setlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops_nolock = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .llseek = default_llseek, +}; + diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h new file mode 100644 index 00000000..ef606e3a --- /dev/null +++ b/fs/gfs2/gfs2.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GFS2_DOT_H__ +#define __GFS2_DOT_H__ + +enum { + NO_CREATE = 0, + CREATE = 1, +}; + +enum { + NO_FORCE = 0, + FORCE = 1, +}; + +#define GFS2_FAST_NAME_SIZE 8 + +#endif /* __GFS2_DOT_H__ */ + diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c new file mode 100644 index 00000000..1c1336e7 --- /dev/null +++ b/fs/gfs2/glock.c @@ -0,0 +1,1870 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "super.h" +#include "util.h" +#include "bmap.h" +#define CREATE_TRACE_POINTS +#include "trace_gfs2.h" + +struct gfs2_glock_iter { + int hash; /* hash bucket index */ + struct gfs2_sbd *sdp; /* incore superblock */ + struct gfs2_glock *gl; /* current glock struct */ + char string[512]; /* scratch space */ +}; + +typedef void (*glock_examiner) (struct gfs2_glock * gl); + +static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); + +static struct dentry *gfs2_root; +static struct workqueue_struct *glock_workqueue; +struct workqueue_struct *gfs2_delete_workqueue; +static LIST_HEAD(lru_list); +static atomic_t lru_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(lru_lock); + +#define GFS2_GL_HASH_SHIFT 15 +#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) +#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) + +static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE]; +static struct dentry *gfs2_root; + +/** + * gl_hash() - Turn glock number into hash bucket number + * @lock: The glock number + * + * Returns: The number of the corresponding hash bucket + */ + +static unsigned int gl_hash(const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + unsigned int h; + + h = jhash(&name->ln_number, sizeof(u64), 0); + h = jhash(&name->ln_type, sizeof(unsigned int), h); + h = jhash(&sdp, sizeof(struct gfs2_sbd *), h); + h &= GFS2_GL_HASH_MASK; + + return h; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&gl_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&gl_hash_table[hash]); +} + +static void gfs2_glock_dealloc(struct rcu_head *rcu) +{ + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); + + if (gl->gl_ops->go_flags & GLOF_ASPACE) + kmem_cache_free(gfs2_glock_aspace_cachep, gl); + else + kmem_cache_free(gfs2_glock_cachep, gl); +} + +void gfs2_glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + 
wake_up(&sdp->sd_glock_wait); +} + +/** + * gfs2_glock_hold() - increment reference count on glock + * @gl: The glock to hold + * + */ + +void gfs2_glock_hold(struct gfs2_glock *gl) +{ + GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); + atomic_inc(&gl->gl_ref); +} + +/** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; +} + + +void gfs2_glock_add_to_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + + if (!list_empty(&gl->gl_lru)) + list_del_init(&gl->gl_lru); + else + atomic_inc(&lru_count); + + list_add_tail(&gl->gl_lru, &lru_list); + set_bit(GLF_LRU, &gl->gl_flags); + spin_unlock(&lru_lock); +} + +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + if (!list_empty(&gl->gl_lru)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + clear_bit(GLF_LRU, &gl->gl_flags); + } + spin_unlock(&lru_lock); +} + +/** + * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list + * @gl: the glock + * + * If the glock is demotable, then we add it (or move it) to the end + * of the glock LRU list. + */ + +static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +{ + if (demote_ok(gl)) + gfs2_glock_add_to_lru(gl); +} + +/** + * gfs2_glock_put_nolock() - Decrement reference count on glock + * @gl: The glock to put + * + * This function should only be used if the caller has its own reference + * to the glock, in addition to the one it is dropping. + */ + +void gfs2_glock_put_nolock(struct gfs2_glock *gl) +{ + if (atomic_dec_and_test(&gl->gl_ref)) + GLOCK_BUG_ON(gl, 1); +} + +/** + * gfs2_glock_put() - Decrement reference count on glock + * @gl: The glock to put + * + */ + +void gfs2_glock_put(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = gfs2_glock2aspace(gl); + + if (atomic_dec_and_test(&gl->gl_ref)) { + spin_lock_bucket(gl->gl_hash); + hlist_bl_del_rcu(&gl->gl_list); + spin_unlock_bucket(gl->gl_hash); + gfs2_glock_remove_from_lru(gl); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); + } +} + +/** + * search_bucket() - Find struct gfs2_glock by lock number + * @bucket: the bucket to search + * @name: The lock name + * + * Returns: NULL, or the struct gfs2_glock with the requested number + */ + +static struct gfs2_glock *search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + struct gfs2_glock *gl; + struct hlist_bl_node *h; + + hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) { + if (!lm_name_equal(&gl->gl_name, name)) + continue; + if (gl->gl_sbd != sdp) + continue; + if (atomic_inc_not_zero(&gl->gl_ref)) + return gl; + } + + return NULL; +} + +/** + * may_grant - check if its ok to grant a new lock + * @gl: The glock + * @gh: The lock request which we wish to grant + * + * Returns: true if its ok to grant the lock + */ + +static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) +{ + const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list); + if ((gh->gh_state == LM_ST_EXCLUSIVE 
|| + gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) + return 0; + if (gl->gl_state == gh->gh_state) + return 1; + if (gh->gh_flags & GL_EXACT) + return 0; + if (gl->gl_state == LM_ST_EXCLUSIVE) { + if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) + return 1; + if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) + return 1; + } + if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) + return 1; + return 0; +} + +static void gfs2_holder_wake(struct gfs2_holder *gh) +{ + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb__after_clear_bit(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); +} + +/** + * do_error - Something unexpected has happened during a lock request + * + */ + +static inline void do_error(struct gfs2_glock *gl, const int ret) +{ + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + } +} + +/** + * do_promote - promote as many requests as possible on the current queue + * @gl: The glock + * + * Returns: 1 if there is a blocked holder at the head of the list, or 2 + * if a type specific operation is underway. + */ + +static int do_promote(struct gfs2_glock *gl) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh, *tmp; + int ret; + +restart: + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (may_grant(gl, gh)) { + if (gh->gh_list.prev == &gl->gl_holders && + glops->go_lock) { + spin_unlock(&gl->gl_spin); + /* FIXME: eliminate this eventually */ + ret = glops->go_lock(gh); + spin_lock(&gl->gl_spin); + if (ret) { + if (ret == 1) + return 2; + gh->gh_error = ret; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 1); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 0); + gfs2_holder_wake(gh); + continue; + } + if (gh->gh_list.prev == &gl->gl_holders) + return 1; + do_error(gl, 0); + break; + } + return 0; +} + +/** + * find_first_waiter - find the first gh that's waiting for the glock + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * state_change - record that the glock is now in a different state + * @gl: the glock + * @new_state the new state + * + */ + +static void state_change(struct gfs2_glock *gl, unsigned int new_state) +{ + int held1, held2; + + held1 = (gl->gl_state != LM_ST_UNLOCKED); + held2 = (new_state != LM_ST_UNLOCKED); + + if (held1 != held2) { + if (held2) + gfs2_glock_hold(gl); + else + gfs2_glock_put_nolock(gl); + } + if (held1 && held2 && list_empty(&gl->gl_holders)) + clear_bit(GLF_QUEUED, &gl->gl_flags); + + gl->gl_state = new_state; + gl->gl_tchange = jiffies; +} + +static void gfs2_demote_wake(struct gfs2_glock *gl) +{ + gl->gl_demote_state = LM_ST_EXCLUSIVE; + clear_bit(GLF_DEMOTE, 
&gl->gl_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&gl->gl_flags, GLF_DEMOTE); +} + +/** + * finish_xmote - The DLM has replied to one of our lock requests + * @gl: The glock + * @ret: The status from the DLM + * + */ + +static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh; + unsigned state = ret & LM_OUT_ST_MASK; + int rv; + + spin_lock(&gl->gl_spin); + trace_gfs2_glock_state_change(gl, state); + state_change(gl, state); + gh = find_first_waiter(gl); + + /* Demote to UN request arrived during demote to SH or DF */ + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) + gl->gl_target = LM_ST_UNLOCKED; + + /* Check for state != intended state */ + if (unlikely(state != gl->gl_target)) { + if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + /* move to back of queue and try next entry */ + if (ret & LM_OUT_CANCELED) { + if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0) + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + goto retry; + } + /* Some error or failed "try lock" - report it */ + if ((ret & LM_OUT_ERROR) || + (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { + gl->gl_target = gl->gl_state; + do_error(gl, ret); + goto out; + } + } + switch(state) { + /* Unlocked due to conversion deadlock, try again */ + case LM_ST_UNLOCKED: +retry: + do_xmote(gl, gh, gl->gl_target); + break; + /* Conversion fails, unlock and try again */ + case LM_ST_SHARED: + case LM_ST_DEFERRED: + do_xmote(gl, gh, LM_ST_UNLOCKED); + break; + default: /* Everything else */ + printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state); + GLOCK_BUG_ON(gl, 1); + } + spin_unlock(&gl->gl_spin); + return; + } + + /* Fast path - we got what we asked for */ + if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (state != LM_ST_UNLOCKED) { + if (glops->go_xmote_bh) { + spin_unlock(&gl->gl_spin); + rv = glops->go_xmote_bh(gl, gh); + spin_lock(&gl->gl_spin); + if (rv) { + do_error(gl, rv); + goto out; + } + } + rv = do_promote(gl); + if (rv == 2) + goto out_locked; + } +out: + clear_bit(GLF_LOCK, &gl->gl_flags); +out_locked: + spin_unlock(&gl->gl_spin); +} + +/** + * do_xmote - Calls the DLM to change the state of a lock + * @gl: The lock state + * @gh: The holder (only for promotes) + * @target: The target lock state + * + */ + +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int lck_flags = gh ? gh->gh_flags : 0; + int ret; + + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | + LM_FLAG_PRIORITY); + GLOCK_BUG_ON(gl, gl->gl_state == target); + GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); + if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && + glops->go_inval) { + set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + do_error(gl, 0); /* Fail queued try locks */ + } + gl->gl_req = target; + spin_unlock(&gl->gl_spin); + if (glops->go_xmote_th) + glops->go_xmote_th(gl); + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + glops->go_inval(gl, target == LM_ST_DEFERRED ? 
0 : DIO_METADATA); + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + + gfs2_glock_hold(gl); + if (sdp->sd_lockstruct.ls_ops->lm_lock) { + /* lock_dlm */ + ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); + GLOCK_BUG_ON(gl, ret); + } else { /* lock_nolock */ + finish_xmote(gl, target); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + } + + spin_lock(&gl->gl_spin); +} + +/** + * find_first_holder - find the first "holder" gh + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * run_queue - do all outstanding tasks related to a glock + * @gl: The glock in question + * @nonblock: True if we must not block in run_queue + * + */ + +static void run_queue(struct gfs2_glock *gl, const int nonblock) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + struct gfs2_holder *gh = NULL; + int ret; + + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) + return; + + GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); + + if (test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_demote_state != gl->gl_state) { + if (find_first_holder(gl)) + goto out_unlock; + if (nonblock) + goto out_sched; + set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); + gl->gl_target = gl->gl_demote_state; + } else { + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) + gfs2_demote_wake(gl); + ret = do_promote(gl); + if (ret == 0) + goto out_unlock; + if (ret == 2) + goto out; + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ + } + do_xmote(gl, gh, gl->gl_target); +out: + return; + +out_sched: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_clear_bit(); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + return; + +out_unlock: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_clear_bit(); + return; +} + +static void delete_work_func(struct work_struct *work) +{ + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip; + struct inode *inode; + u64 no_addr = gl->gl_name.ln_number; + + ip = gl->gl_object; + /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ + + if (ip) + inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1); + else + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + if (inode && !IS_ERR(inode)) { + d_prune_aliases(inode); + iput(inode); + } + gfs2_glock_put(gl); +} + +static void glock_work_func(struct work_struct *work) +{ + unsigned long delay = 0; + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); + int drop_ref = 0; + + if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { + finish_xmote(gl, gl->gl_reply); + drop_ref = 1; + } + spin_lock(&gl->gl_spin); + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + gl->gl_state != LM_ST_UNLOCKED && + gl->gl_demote_state != LM_ST_EXCLUSIVE) { + unsigned long holdtime, now = jiffies; + + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; + if (time_before(now, holdtime)) + delay = holdtime - now; + + if (!delay) { + 
clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + set_bit(GLF_DEMOTE, &gl->gl_flags); + } + } + run_queue(gl, 0); + spin_unlock(&gl->gl_spin); + if (!delay || + queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); + if (drop_ref) + gfs2_glock_put(gl); +} + +/** + * gfs2_glock_get() - Get a glock, or create one if one doesn't exist + * @sdp: The GFS2 superblock + * @number: the lock number + * @glops: The glock_operations to use + * @create: If 0, don't create the glock if it doesn't exist + * @glp: the glock is returned here + * + * This does not lock a glock, just finds/creates structures for one. + * + * Returns: errno + */ + +int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, int create, + struct gfs2_glock **glp) +{ + struct super_block *s = sdp->sd_vfs; + struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; + struct gfs2_glock *gl, *tmp; + unsigned int hash = gl_hash(sdp, &name); + struct address_space *mapping; + struct kmem_cache *cachep; + + rcu_read_lock(); + gl = search_bucket(hash, sdp, &name); + rcu_read_unlock(); + + *glp = gl; + if (gl) + return 0; + if (!create) + return -ENOENT; + + if (glops->go_flags & GLOF_ASPACE) + cachep = gfs2_glock_aspace_cachep; + else + cachep = gfs2_glock_cachep; + gl = kmem_cache_alloc(cachep, GFP_KERNEL); + if (!gl) + return -ENOMEM; + + atomic_inc(&sdp->sd_glock_disposal); + gl->gl_flags = 0; + gl->gl_name = name; + atomic_set(&gl->gl_ref, 1); + gl->gl_state = LM_ST_UNLOCKED; + gl->gl_target = LM_ST_UNLOCKED; + gl->gl_demote_state = LM_ST_EXCLUSIVE; + gl->gl_hash = hash; + gl->gl_ops = glops; + snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number); + memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + gl->gl_lksb.sb_lvbptr = gl->gl_lvb; + gl->gl_tchange = jiffies; + gl->gl_object = NULL; + gl->gl_sbd = sdp; + INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); + INIT_WORK(&gl->gl_delete, delete_work_func); + + mapping = gfs2_glock2aspace(gl); + if (mapping) { + mapping->a_ops = &gfs2_meta_aops; + mapping->host = s->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = s->s_bdi; + mapping->writeback_index = 0; + } + + spin_lock_bucket(hash); + tmp = search_bucket(hash, sdp, &name); + if (tmp) { + spin_unlock_bucket(hash); + kmem_cache_free(cachep, gl); + atomic_dec(&sdp->sd_glock_disposal); + gl = tmp; + } else { + hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]); + spin_unlock_bucket(hash); + } + + *glp = gl; + + return 0; +} + +/** + * gfs2_holder_init - initialize a struct gfs2_holder in the default way + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + */ + +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh) +{ + INIT_LIST_HEAD(&gh->gh_list); + gh->gh_gl = gl; + gh->gh_ip = (unsigned long)__builtin_return_address(0); + gh->gh_owner_pid = get_pid(task_pid(current)); + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_error = 0; + gh->gh_iflags = 0; + gfs2_glock_hold(gl); +} + +/** + * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Don't mess with the glock. 
+ * + */ + +void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh) +{ + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_iflags = 0; + gh->gh_ip = (unsigned long)__builtin_return_address(0); + if (gh->gh_owner_pid) + put_pid(gh->gh_owner_pid); + gh->gh_owner_pid = get_pid(task_pid(current)); +} + +/** + * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) + * @gh: the holder structure + * + */ + +void gfs2_holder_uninit(struct gfs2_holder *gh) +{ + put_pid(gh->gh_owner_pid); + gfs2_glock_put(gh->gh_gl); + gh->gh_gl = NULL; + gh->gh_ip = 0; +} + +/** + * gfs2_glock_holder_wait + * @word: unused + * + * This function and gfs2_glock_demote_wait both show up in the WCHAN + * field. Thus I've separated these otherwise identical functions in + * order to be more informative to the user. + */ + +static int gfs2_glock_holder_wait(void *word) +{ + schedule(); + return 0; +} + +static int gfs2_glock_demote_wait(void *word) +{ + schedule(); + return 0; +} + +static void wait_on_holder(struct gfs2_holder *gh) +{ + might_sleep(); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); +} + +static void wait_on_demote(struct gfs2_glock *gl) +{ + might_sleep(); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); +} + +/** + * handle_callback - process a demote request + * @gl: the glock + * @state: the state the caller wants us to change to + * + * There are only two requests that we are going to see in actual + * practise: LM_ST_SHARED and LM_ST_UNLOCKED + */ + +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay) +{ + int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; + + set_bit(bit, &gl->gl_flags); + if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { + gl->gl_demote_state = state; + gl->gl_demote_time = jiffies; + } else if (gl->gl_demote_state != LM_ST_UNLOCKED && + gl->gl_demote_state != state) { + gl->gl_demote_state = LM_ST_UNLOCKED; + } + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl); + trace_gfs2_demote_rq(gl); +} + +/** + * gfs2_glock_wait - wait on a glock acquisition + * @gh: the glock holder + * + * Returns: 0 on success + */ + +int gfs2_glock_wait(struct gfs2_holder *gh) +{ + wait_on_holder(gh); + return gh->gh_error; +} + +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + if (seq) { + struct gfs2_glock_iter *gi = seq->private; + vsprintf(gi->string, fmt, args); + seq_printf(seq, gi->string); + } else { + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR " %pV", &vaf); + } + + va_end(args); +} + +/** + * add_to_queue - Add a holder to the wait queue (but look for recursion) + * @gh: the holder structure to add + * + * Eventually we should move the recursive locking trap to a + * debugging option or something like that. This is the fast + * path and needs to have the minimum number of distractions. 
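+ * The same walk over the holder list is also used to fail queued
+ * "try" locks immediately (GLR_TRYFAILED) when they cannot be granted,
+ * and to find the insertion point for LM_FLAG_PRIORITY requests.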
+ * + */ + +static inline void add_to_queue(struct gfs2_holder *gh) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct list_head *insert_pt = NULL; + struct gfs2_holder *gh2; + int try_lock = 0; + + BUG_ON(gh->gh_owner_pid == NULL); + if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) + BUG(); + + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + if (test_bit(GLF_LOCK, &gl->gl_flags)) + try_lock = 1; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + goto fail; + } + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && + (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) + goto trap_recursive; + if (try_lock && + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && + !may_grant(gl, gh)) { +fail: + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; + } + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) + insert_pt = &gh2->gh_list; + } + set_bit(GLF_QUEUED, &gl->gl_flags); + trace_gfs2_glock_queue(gh, 1); + if (likely(insert_pt == NULL)) { + list_add_tail(&gh->gh_list, &gl->gl_holders); + if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) + goto do_cancel; + return; + } + list_add_tail(&gh->gh_list, insert_pt); +do_cancel: + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { + spin_unlock(&gl->gl_spin); + if (sdp->sd_lockstruct.ls_ops->lm_cancel) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl); + spin_lock(&gl->gl_spin); + } + return; + +trap_recursive: + print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip); + printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); + printk(KERN_ERR "lock type: %d req lock state : %d\n", + gh2->gh_gl->gl_name.ln_type, gh2->gh_state); + print_symbol(KERN_ERR "new: %s\n", gh->gh_ip); + printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); + printk(KERN_ERR "lock type: %d req lock state : %d\n", + gh->gh_gl->gl_name.ln_type, gh->gh_state); + __dump_glock(NULL, gl); + BUG(); +} + +/** + * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock) + * @gh: the holder structure + * + * if (gh->gh_flags & GL_ASYNC), this never returns an error + * + * Returns: 0, GLR_TRYFAILED, or errno on failure + */ + +int gfs2_glock_nq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + int error = 0; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + if (test_bit(GLF_LRU, &gl->gl_flags)) + gfs2_glock_remove_from_lru(gl); + + spin_lock(&gl->gl_spin); + add_to_queue(gh); + if ((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + run_queue(gl, 1); + spin_unlock(&gl->gl_spin); + + if (!(gh->gh_flags & GL_ASYNC)) + error = gfs2_glock_wait(gh); + + return error; +} + +/** + * gfs2_glock_poll - poll to see if an async request has been completed + * @gh: the holder + * + * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on + */ + +int gfs2_glock_poll(struct gfs2_holder *gh) +{ + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 
0 : 1; +} + +/** + * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) + * @gh: the glock holder + * + */ + +void gfs2_glock_dq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned delay = 0; + int fast_path = 0; + + spin_lock(&gl->gl_spin); + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED, 0); + + list_del_init(&gh->gh_list); + if (find_first_holder(gl) == NULL) { + if (glops->go_unlock) { + GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); + spin_unlock(&gl->gl_spin); + glops->go_unlock(gh); + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + } + if (list_empty(&gl->gl_holders) && + !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + fast_path = 1; + } + if (!test_bit(GLF_LFLUSH, &gl->gl_flags)) + __gfs2_glock_schedule_for_reclaim(gl); + trace_gfs2_glock_queue(gh, 0); + spin_unlock(&gl->gl_spin); + if (likely(fast_path)) + return; + + gfs2_glock_hold(gl); + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + delay = gl->gl_ops->go_min_hold_time; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); +} + +void gfs2_glock_dq_wait(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + gfs2_glock_dq(gh); + wait_on_demote(gl); +} + +/** + * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it + * @gh: the holder structure + * + */ + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh) +{ + gfs2_glock_dq(gh); + gfs2_holder_uninit(gh); +} + +/** + * gfs2_glock_nq_num - acquire a glock based on lock number + * @sdp: the filesystem + * @number: the lock number + * @glops: the glock operations for the type of glock + * @state: the state to acquire the glock in + * @flags: modifier flags for the acquisition + * @gh: the struct gfs2_holder + * + * Returns: errno + */ + +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags, struct gfs2_holder *gh) +{ + struct gfs2_glock *gl; + int error; + + error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); + if (!error) { + error = gfs2_glock_nq_init(gl, state, flags, gh); + gfs2_glock_put(gl); + } + + return error; +} + +/** + * glock_compare - Compare two struct gfs2_glock structures for sorting + * @arg_a: the first structure + * @arg_b: the second structure + * + */ + +static int glock_compare(const void *arg_a, const void *arg_b) +{ + const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a; + const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b; + const struct lm_lockname *a = &gh_a->gh_gl->gl_name; + const struct lm_lockname *b = &gh_b->gh_gl->gl_name; + + if (a->ln_number > b->ln_number) + return 1; + if (a->ln_number < b->ln_number) + return -1; + BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type); + return 0; +} + +/** + * nq_m_sync - synchonously acquire more than one glock in deadlock free order + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, + struct gfs2_holder **p) +{ + unsigned int x; + int error = 0; + + for (x = 0; x < num_gh; x++) + p[x] = &ghs[x]; + + sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL); 
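+	/*
+	 * With the holders sorted by lock number, every caller acquires
+	 * any overlapping set of glocks in the same global order (e.g.
+	 * inode 7 always before inode 19), which is what keeps the loop
+	 * below deadlock free.
+	 */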
+ + for (x = 0; x < num_gh; x++) { + p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + + error = gfs2_glock_nq(p[x]); + if (error) { + while (x--) + gfs2_glock_dq(p[x]); + break; + } + } + + return error; +} + +/** + * gfs2_glock_nq_m - acquire multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + struct gfs2_holder *tmp[4]; + struct gfs2_holder **pph = tmp; + int error = 0; + + switch(num_gh) { + case 0: + return 0; + case 1: + ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + return gfs2_glock_nq(ghs); + default: + if (num_gh <= 4) + break; + pph = kmalloc(num_gh * sizeof(struct gfs2_holder *), GFP_NOFS); + if (!pph) + return -ENOMEM; + } + + error = nq_m_sync(num_gh, ghs, pph); + + if (pph != tmp) + kfree(pph); + + return error; +} + +/** + * gfs2_glock_dq_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + while (num_gh--) + gfs2_glock_dq(&ghs[num_gh]); +} + +/** + * gfs2_glock_dq_uninit_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + while (num_gh--) + gfs2_glock_dq_uninit(&ghs[num_gh]); +} + +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) +{ + unsigned long delay = 0; + unsigned long holdtime; + unsigned long now = jiffies; + + gfs2_glock_hold(gl); + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; + if (test_bit(GLF_QUEUED, &gl->gl_flags)) { + if (time_before(now, holdtime)) + delay = holdtime - now; + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_ops->go_min_hold_time; + } + + spin_lock(&gl->gl_spin); + handle_callback(gl, state, delay); + spin_unlock(&gl->gl_spin); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); +} + +/** + * gfs2_should_freeze - Figure out if glock should be frozen + * @gl: The glock in question + * + * Glocks are not frozen if (a) the result of the dlm operation is + * an error, (b) the locking operation was an unlock operation or + * (c) if there is a "noexp" flagged request anywhere in the queue + * + * Returns: 1 if freezing should occur, 0 otherwise + */ + +static int gfs2_should_freeze(const struct gfs2_glock *gl) +{ + const struct gfs2_holder *gh; + + if (gl->gl_reply & ~LM_OUT_ST_MASK) + return 0; + if (gl->gl_target == LM_ST_UNLOCKED) + return 0; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (LM_FLAG_NOEXP & gh->gh_flags) + return 0; + } + + return 1; +} + +/** + * gfs2_glock_complete - Callback used by locking + * @gl: Pointer to the glock + * @ret: The return value from the dlm + * + * The gl_reply field is under the gl_spin lock so that it is ok + * to use a bitfield shared with other glock state fields. 
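+ * If DFL_BLOCK_LOCKS is set the reply may instead leave the glock
+ * frozen (GLF_FROZEN) rather than queueing work; such glocks are
+ * processed later via gfs2_glock_thaw().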
+ */ + +void gfs2_glock_complete(struct gfs2_glock *gl, int ret) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + + spin_lock(&gl->gl_spin); + gl->gl_reply = ret; + + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { + if (gfs2_should_freeze(gl)) { + set_bit(GLF_FROZEN, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + return; + } + } + + spin_unlock(&gl->gl_spin); + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + smp_wmb(); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +} + + +static int gfs2_shrink_glock_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct gfs2_glock *gl; + int may_demote; + int nr_skipped = 0; + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + LIST_HEAD(skipped); + + if (nr == 0) + goto out; + + if (!(gfp_mask & __GFP_FS)) + return -1; + + spin_lock(&lru_lock); + while(nr && !list_empty(&lru_list)) { + gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); + atomic_dec(&lru_count); + + /* Test for being demotable */ + if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + gfs2_glock_hold(gl); + spin_unlock(&lru_lock); + spin_lock(&gl->gl_spin); + may_demote = demote_ok(gl); + if (may_demote) { + handle_callback(gl, LM_ST_UNLOCKED, 0); + nr--; + } + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_clear_bit(); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + spin_unlock(&gl->gl_spin); + spin_lock(&lru_lock); + continue; + } + nr_skipped++; + list_add(&gl->gl_lru, &skipped); + set_bit(GLF_LRU, &gl->gl_flags); + } + list_splice(&skipped, &lru_list); + atomic_add(nr_skipped, &lru_count); + spin_unlock(&lru_lock); +out: + return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; +} + +static struct shrinker glock_shrinker = { + .shrink = gfs2_shrink_glock_memory, + .seeks = DEFAULT_SEEKS, +}; + +/** + * examine_bucket - Call a function for glock in a hash bucket + * @examiner: the function + * @sdp: the filesystem + * @bucket: the bucket + * + */ + +static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, + unsigned int hash) +{ + struct gfs2_glock *gl; + struct hlist_bl_head *head = &gl_hash_table[hash]; + struct hlist_bl_node *pos; + + rcu_read_lock(); + hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { + if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref)) + examiner(gl); + } + rcu_read_unlock(); + cond_resched(); +} + +static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) +{ + unsigned x; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(examiner, sdp, x); +} + + +/** + * thaw_glock - thaw out a glock which has an unprocessed reply waiting + * @gl: The glock to thaw + * + * N.B. When we freeze a glock, we leave a ref to the glock outstanding, + * so this has to result in the ref count being dropped by one. 
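+ * The gfs2_glock_hold() below covers the queued work item; the
+ * outstanding "frozen" reference is dropped when glock_work_func()
+ * handles the now pending reply.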
+ */ + +static void thaw_glock(struct gfs2_glock *gl) +{ + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + return; + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +} + +/** + * clear_glock - look at a glock and see if we can free it from glock cache + * @gl: the glock to look at + * + */ + +static void clear_glock(struct gfs2_glock *gl) +{ + gfs2_glock_remove_from_lru(gl); + + spin_lock(&gl->gl_spin); + if (gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED, 0); + spin_unlock(&gl->gl_spin); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +} + +/** + * gfs2_glock_thaw - Thaw any frozen glocks + * @sdp: The super block + * + */ + +void gfs2_glock_thaw(struct gfs2_sbd *sdp) +{ + glock_hash_walk(thaw_glock, sdp); +} + +static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +{ + int ret; + spin_lock(&gl->gl_spin); + ret = __dump_glock(seq, gl); + spin_unlock(&gl->gl_spin); + return ret; +} + +static void dump_glock_func(struct gfs2_glock *gl) +{ + dump_glock(NULL, gl); +} + +/** + * gfs2_gl_hash_clear - Empty out the glock hash table + * @sdp: the filesystem + * @wait: wait until it's all gone + * + * Called when unmounting the filesystem. + */ + +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) +{ + glock_hash_walk(clear_glock, sdp); + flush_workqueue(glock_workqueue); + wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); + glock_hash_walk(dump_glock_func, sdp); +} + +void gfs2_glock_finish_truncate(struct gfs2_inode *ip) +{ + struct gfs2_glock *gl = ip->i_gl; + int ret; + + ret = gfs2_truncatei_resume(ip); + gfs2_assert_withdraw(gl->gl_sbd, ret == 0); + + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl, 1); + spin_unlock(&gl->gl_spin); +} + +static const char *state2str(unsigned state) +{ + switch(state) { + case LM_ST_UNLOCKED: + return "UN"; + case LM_ST_SHARED: + return "SH"; + case LM_ST_DEFERRED: + return "DF"; + case LM_ST_EXCLUSIVE: + return "EX"; + } + return "??"; +} + +static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) +{ + char *p = buf; + if (flags & LM_FLAG_TRY) + *p++ = 't'; + if (flags & LM_FLAG_TRY_1CB) + *p++ = 'T'; + if (flags & LM_FLAG_NOEXP) + *p++ = 'e'; + if (flags & LM_FLAG_ANY) + *p++ = 'A'; + if (flags & LM_FLAG_PRIORITY) + *p++ = 'p'; + if (flags & GL_ASYNC) + *p++ = 'a'; + if (flags & GL_EXACT) + *p++ = 'E'; + if (flags & GL_NOCACHE) + *p++ = 'c'; + if (test_bit(HIF_HOLDER, &iflags)) + *p++ = 'H'; + if (test_bit(HIF_WAIT, &iflags)) + *p++ = 'W'; + if (test_bit(HIF_FIRST, &iflags)) + *p++ = 'F'; + *p = 0; + return buf; +} + +/** + * dump_holder - print information about a glock holder + * @seq: the seq_file struct + * @gh: the glock holder + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) +{ + struct task_struct *gh_owner = NULL; + char flags_buf[32]; + + if (gh->gh_owner_pid) + gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? 
gh_owner->comm : "(ended)", + (void *)gh->gh_ip); + return 0; +} + +static const char *gflags2str(char *buf, const struct gfs2_glock *gl) +{ + const unsigned long *gflags = &gl->gl_flags; + char *p = buf; + + if (test_bit(GLF_LOCK, gflags)) + *p++ = 'l'; + if (test_bit(GLF_DEMOTE, gflags)) + *p++ = 'D'; + if (test_bit(GLF_PENDING_DEMOTE, gflags)) + *p++ = 'd'; + if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) + *p++ = 'p'; + if (test_bit(GLF_DIRTY, gflags)) + *p++ = 'y'; + if (test_bit(GLF_LFLUSH, gflags)) + *p++ = 'f'; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) + *p++ = 'i'; + if (test_bit(GLF_REPLY_PENDING, gflags)) + *p++ = 'r'; + if (test_bit(GLF_INITIAL, gflags)) + *p++ = 'I'; + if (test_bit(GLF_FROZEN, gflags)) + *p++ = 'F'; + if (test_bit(GLF_QUEUED, gflags)) + *p++ = 'q'; + if (test_bit(GLF_LRU, gflags)) + *p++ = 'L'; + if (gl->gl_object) + *p++ = 'o'; + *p = 0; + return buf; +} + +/** + * __dump_glock - print information about a glock + * @seq: The seq_file struct + * @gl: the glock + * + * The file format is as follows: + * One line per object, capital letters are used to indicate objects + * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented, + * other objects are indented by a single space and follow the glock to + * which they are related. Fields are indicated by lower case letters + * followed by a colon and the field value, except for strings which are in + * [] so that its possible to see if they are composed of spaces for + * example. The field's are n = number (id of the object), f = flags, + * t = type, s = state, r = refcount, e = error, p = pid. + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned long long dtime; + const struct gfs2_holder *gh; + char gflags_buf[32]; + int error = 0; + + dtime = jiffies - gl->gl_demote_time; + dtime *= 1000000/HZ; /* demote time in uSec */ + if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) + dtime = 0; + gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d\n", + state2str(gl->gl_state), + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + gflags2str(gflags_buf, gl), + state2str(gl->gl_target), + state2str(gl->gl_demote_state), dtime, + atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_revokes), + atomic_read(&gl->gl_ref)); + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + error = dump_holder(seq, gh); + if (error) + goto out; + } + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) + error = glops->go_dump(seq, gl); +out: + return error; +} + + + + +int __init gfs2_glock_init(void) +{ + unsigned i; + for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { + INIT_HLIST_BL_HEAD(&gl_hash_table[i]); + } + + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_FREEZABLE, 0); + if (IS_ERR(glock_workqueue)) + return PTR_ERR(glock_workqueue); + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", + WQ_MEM_RECLAIM | WQ_FREEZABLE, + 0); + if (IS_ERR(gfs2_delete_workqueue)) { + destroy_workqueue(glock_workqueue); + return PTR_ERR(gfs2_delete_workqueue); + } + + register_shrinker(&glock_shrinker); + + return 0; +} + +void gfs2_glock_exit(void) +{ + unregister_shrinker(&glock_shrinker); + destroy_workqueue(glock_workqueue); + destroy_workqueue(gfs2_delete_workqueue); +} + +static inline struct gfs2_glock *glock_hash_chain(unsigned hash) +{ + return 
hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]), + struct gfs2_glock, gl_list); +} + +static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl) +{ + return hlist_bl_entry(rcu_dereference(gl->gl_list.next), + struct gfs2_glock, gl_list); +} + +static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) +{ + struct gfs2_glock *gl; + + do { + gl = gi->gl; + if (gl) { + gi->gl = glock_hash_next(gl); + } else { + gi->gl = glock_hash_chain(gi->hash); + } + while (gi->gl == NULL) { + gi->hash++; + if (gi->hash >= GFS2_GL_HASH_SIZE) { + rcu_read_unlock(); + return 1; + } + gi->gl = glock_hash_chain(gi->hash); + } + /* Skip entries for other sb and dead entries */ + } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0); + + return 0; +} + +static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + loff_t n = *pos; + + gi->hash = 0; + rcu_read_lock(); + + do { + if (gfs2_glock_iter_next(gi)) + return NULL; + } while (n--); + + return gi->gl; +} + +static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + + (*pos)++; + + if (gfs2_glock_iter_next(gi)) + return NULL; + + return gi->gl; +} + +static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock_iter *gi = seq->private; + + if (gi->gl) + rcu_read_unlock(); + gi->gl = NULL; +} + +static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) +{ + return dump_glock(seq, iter_ptr); +} + +static const struct seq_operations gfs2_glock_seq_ops = { + .start = gfs2_glock_seq_start, + .next = gfs2_glock_seq_next, + .stop = gfs2_glock_seq_stop, + .show = gfs2_glock_seq_show, +}; + +static int gfs2_debugfs_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_glock_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + } + return ret; +} + +static const struct file_operations gfs2_debug_fops = { + .owner = THIS_MODULE, + .open = gfs2_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) +{ + sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); + if (!sdp->debugfs_dir) + return -ENOMEM; + sdp->debugfs_dentry_glocks = debugfs_create_file("glocks", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_debug_fops); + if (!sdp->debugfs_dentry_glocks) + return -ENOMEM; + + return 0; +} + +void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) +{ + if (sdp && sdp->debugfs_dir) { + if (sdp->debugfs_dentry_glocks) { + debugfs_remove(sdp->debugfs_dentry_glocks); + sdp->debugfs_dentry_glocks = NULL; + } + debugfs_remove(sdp->debugfs_dir); + sdp->debugfs_dir = NULL; + } +} + +int gfs2_register_debugfs(void) +{ + gfs2_root = debugfs_create_dir("gfs2", NULL); + return gfs2_root ? 0 : -ENOMEM; +} + +void gfs2_unregister_debugfs(void) +{ + debugfs_remove(gfs2_root); + gfs2_root = NULL; +} diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h new file mode 100644 index 00000000..6b2f757b --- /dev/null +++ b/fs/gfs2/glock.h @@ -0,0 +1,244 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GLOCK_DOT_H__ +#define __GLOCK_DOT_H__ + +#include +#include +#include "incore.h" + +/* Options for hostdata parser */ + +enum { + Opt_jid, + Opt_id, + Opt_first, + Opt_nodir, + Opt_err, +}; + +/* + * lm_lockname types + */ + +#define LM_TYPE_RESERVED 0x00 +#define LM_TYPE_NONDISK 0x01 +#define LM_TYPE_INODE 0x02 +#define LM_TYPE_RGRP 0x03 +#define LM_TYPE_META 0x04 +#define LM_TYPE_IOPEN 0x05 +#define LM_TYPE_FLOCK 0x06 +#define LM_TYPE_PLOCK 0x07 +#define LM_TYPE_QUOTA 0x08 +#define LM_TYPE_JOURNAL 0x09 + +/* + * lm_lock() states + * + * SHARED is compatible with SHARED, not with DEFERRED or EX. + * DEFERRED is compatible with DEFERRED, not with SHARED or EX. + */ + +#define LM_ST_UNLOCKED 0 +#define LM_ST_EXCLUSIVE 1 +#define LM_ST_DEFERRED 2 +#define LM_ST_SHARED 3 + +/* + * lm_lock() flags + * + * LM_FLAG_TRY + * Don't wait to acquire the lock if it can't be granted immediately. + * + * LM_FLAG_TRY_1CB + * Send one blocking callback if TRY is set and the lock is not granted. + * + * LM_FLAG_NOEXP + * GFS sets this flag on lock requests it makes while doing journal recovery. + * These special requests should not be blocked due to the recovery like + * ordinary locks would be. + * + * LM_FLAG_ANY + * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may + * also be granted in SHARED. The preferred state is whichever is compatible + * with other granted locks, or the specified state if no other locks exist. + * + * LM_FLAG_PRIORITY + * Override fairness considerations. Suppose a lock is held in a shared state + * and there is a pending request for the deferred state. A shared lock + * request with the priority flag would be allowed to bypass the deferred + * request and directly join the other shared lock. A shared lock request + * without the priority flag might be forced to wait until the deferred + * requested had acquired and released the lock. + */ + +#define LM_FLAG_TRY 0x00000001 +#define LM_FLAG_TRY_1CB 0x00000002 +#define LM_FLAG_NOEXP 0x00000004 +#define LM_FLAG_ANY 0x00000008 +#define LM_FLAG_PRIORITY 0x00000010 +#define GL_ASYNC 0x00000040 +#define GL_EXACT 0x00000080 +#define GL_SKIP 0x00000100 +#define GL_NOCACHE 0x00000400 + +/* + * lm_async_cb return flags + * + * LM_OUT_ST_MASK + * Masks the lower two bits of lock state in the returned value. + * + * LM_OUT_CANCELED + * The lock request was canceled. 
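+ *
+ * LM_OUT_ERROR
+ * An error was returned for the locking request.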
+ * + */ + +#define LM_OUT_ST_MASK 0x00000003 +#define LM_OUT_CANCELED 0x00000008 +#define LM_OUT_ERROR 0x00000004 + +/* + * lm_recovery_done() messages + */ + +#define LM_RD_GAVEUP 308 +#define LM_RD_SUCCESS 309 + +#define GLR_TRYFAILED 13 + +struct lm_lockops { + const char *lm_proto_name; + int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); + void (*lm_unmount) (struct gfs2_sbd *sdp); + void (*lm_withdraw) (struct gfs2_sbd *sdp); + void (*lm_put_lock) (struct gfs2_glock *gl); + int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags); + void (*lm_cancel) (struct gfs2_glock *gl); + const match_table_t *lm_tokens; +}; + +extern struct workqueue_struct *gfs2_delete_workqueue; +static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + struct pid *pid; + + /* Look in glock's list of holders for one with current task as owner */ + spin_lock(&gl->gl_spin); + pid = task_pid(current); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + break; + if (gh->gh_owner_pid == pid) + goto out; + } + gh = NULL; +out: + spin_unlock(&gl->gl_spin); + + return gh; +} + +static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_EXCLUSIVE; +} + +static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_DEFERRED; +} + +static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_SHARED; +} + +static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) +{ + if (gl->gl_ops->go_flags & GLOF_ASPACE) + return (struct address_space *)(gl + 1); + return NULL; +} + +int gfs2_glock_get(struct gfs2_sbd *sdp, + u64 number, const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +void gfs2_glock_hold(struct gfs2_glock *gl); +void gfs2_glock_put_nolock(struct gfs2_glock *gl); +void gfs2_glock_put(struct gfs2_glock *gl); +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh); +void gfs2_holder_reinit(unsigned int state, unsigned flags, + struct gfs2_holder *gh); +void gfs2_holder_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq(struct gfs2_holder *gh); +int gfs2_glock_poll(struct gfs2_holder *gh); +int gfs2_glock_wait(struct gfs2_holder *gh); +void gfs2_glock_dq(struct gfs2_holder *gh); +void gfs2_glock_dq_wait(struct gfs2_holder *gh); + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, + u64 number, const struct gfs2_glock_operations *glops, + unsigned int state, int flags, struct gfs2_holder *gh); + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); + +__attribute__ ((format(printf, 2, 3))) +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); + +/** + * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Returns: 0, GLR_*, or errno + */ + +static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, + unsigned int state, int flags, + struct gfs2_holder *gh) +{ + int error; + + gfs2_holder_init(gl, state, flags, gh); + + error = gfs2_glock_nq(gh); + if (error) + gfs2_holder_uninit(gh); + + return error; +} + 
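+/*
+ * A minimal usage sketch; "gl" stands for any glock the caller already
+ * holds a reference to:
+ *
+ *   struct gfs2_holder gh;
+ *   int error;
+ *
+ *   error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh);
+ *   if (error)
+ *           return error;
+ *   ... access the object under the shared lock ...
+ *   gfs2_glock_dq_uninit(&gh);
+ */
+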
+extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); +extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); +extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); +extern void gfs2_glock_free(struct gfs2_glock *gl); + +extern int __init gfs2_glock_init(void); +extern void gfs2_glock_exit(void); + +extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); +extern int gfs2_register_debugfs(void); +extern void gfs2_unregister_debugfs(void); + +extern const struct lm_lockops gfs2_dlm_ops; + +#endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c new file mode 100644 index 00000000..2cca2931 --- /dev/null +++ b/fs/gfs2/glops.c @@ -0,0 +1,605 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "util.h" +#include "trans.h" + +/** + * __gfs2_ail_flush - remove all buffers for a given lock from the AIL + * @gl: the glock + * + * None of the buffers should be dirty, locked, or pinned. + */ + +static void __gfs2_ail_flush(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct list_head *head = &gl->gl_ail_list; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + spin_lock(&sdp->sd_ail_lock); + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, + bd_ail_gl_list); + bh = bd->bd_bh; + gfs2_remove_from_ail(bd); + bd->bd_bh = NULL; + bh->b_private = NULL; + spin_unlock(&sdp->sd_ail_lock); + + bd->bd_blkno = bh->b_blocknr; + gfs2_log_lock(sdp); + gfs2_assert_withdraw(sdp, !buffer_busy(bh)); + gfs2_trans_add_revoke(sdp, bd); + gfs2_log_unlock(sdp); + + spin_lock(&sdp->sd_ail_lock); + } + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + spin_unlock(&sdp->sd_ail_lock); +} + + +static void gfs2_ail_empty_gl(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_trans tr; + + memset(&tr, 0, sizeof(tr)); + tr.tr_revokes = atomic_read(&gl->gl_ail_count); + + if (!tr.tr_revokes) + return; + + /* A shortened, inline version of gfs2_trans_begin() */ + tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); + tr.tr_ip = (unsigned long)__builtin_return_address(0); + INIT_LIST_HEAD(&tr.tr_list_buf); + gfs2_log_reserve(sdp, tr.tr_reserved); + BUG_ON(current->journal_info); + current->journal_info = &tr; + + __gfs2_ail_flush(gl); + + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL); +} + +void gfs2_ail_flush(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int revokes = atomic_read(&gl->gl_ail_count); + int ret; + + if (!revokes) + return; + + ret = gfs2_trans_begin(sdp, 0, revokes); + if (ret) + return; + __gfs2_ail_flush(gl); + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL); +} + +/** + * rgrp_go_sync - sync out the metadata for this glock + * @gl: the 
glock + * + * Called when demoting or unlocking an EX glock. We must flush + * to disk all dirty buffers/pages relating to this glock, and must not + * not return to caller to demote/unlock the glock until I/O is complete. + */ + +static void rgrp_go_sync(struct gfs2_glock *gl) +{ + struct address_space *metamapping = gfs2_glock2aspace(gl); + int error; + + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return; + BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(gl->gl_sbd, gl); + filemap_fdatawrite(metamapping); + error = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, error); + gfs2_ail_empty_gl(gl); +} + +/** + * rgrp_go_inval - invalidate the metadata for this glock + * @gl: the glock + * @flags: + * + * We never used LM_ST_DEFERRED with resource groups, so that we + * should always see the metadata flag set here. + * + */ + +static void rgrp_go_inval(struct gfs2_glock *gl, int flags) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + + BUG_ON(!(flags & DIO_METADATA)); + gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + truncate_inode_pages(mapping, 0); + + if (gl->gl_object) { + struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + } +} + +/** + * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * @gl: the glock protecting the inode + * + */ + +static void inode_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gl->gl_object; + struct address_space *metamapping = gfs2_glock2aspace(gl); + int error; + + if (ip && !S_ISREG(ip->i_inode.i_mode)) + ip = NULL; + if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return; + + BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(gl->gl_sbd, gl); + filemap_fdatawrite(metamapping); + if (ip) { + struct address_space *mapping = ip->i_inode.i_mapping; + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + mapping_set_error(mapping, error); + } + error = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, error); + gfs2_ail_empty_gl(gl); + /* + * Writeback of the data mapping may cause the dirty flag to be set + * so we have to clear it again here. + */ + smp_mb__before_clear_bit(); + clear_bit(GLF_DIRTY, &gl->gl_flags); +} + +/** + * inode_go_inval - prepare a inode glock to be released + * @gl: the glock + * @flags: + * + * Normally we invlidate everything, but if we are moving into + * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we + * can keep hold of the metadata, since it won't have changed. 
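+ * The DIO_METADATA flag controls whether the metadata address space is
+ * truncated and GIF_INVALID set; data pages of a regular file are always
+ * truncated when an inode is attached to the glock.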
+ * + */ + +static void inode_go_inval(struct gfs2_glock *gl, int flags) +{ + struct gfs2_inode *ip = gl->gl_object; + + gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + + if (flags & DIO_METADATA) { + struct address_space *mapping = gfs2_glock2aspace(gl); + truncate_inode_pages(mapping, 0); + if (ip) { + set_bit(GIF_INVALID, &ip->i_flags); + forget_all_cached_acls(&ip->i_inode); + } + } + + if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { + gfs2_log_flush(gl->gl_sbd, NULL); + gl->gl_sbd->sd_rindex_uptodate = 0; + } + if (ip && S_ISREG(ip->i_inode.i_mode)) + truncate_inode_pages(ip->i_inode.i_mapping, 0); +} + +/** + * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int inode_go_demote_ok(const struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_holder *gh; + + if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) + return 0; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (gh->gh_list.next != &gl->gl_holders) + return 0; + } + + return 1; +} + +/** + * gfs2_set_nlink - Set the inode's link count based on on-disk info + * @inode: The inode in question + * @nlink: The link count + * + * If the link count has hit zero, it must never be raised, whatever the + * on-disk inode might say. When new struct inodes are created the link + * count is set to 1, so that we can safely use this test even when reading + * in on disk information for the first time. + */ + +static void gfs2_set_nlink(struct inode *inode, u32 nlink) +{ + /* + * We will need to review setting the nlink count here in the + * light of the forthcoming ro bind mount work. This is a reminder + * to do that. 
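+ *
+ * For now the count is only changed when it differs from the on-disk
+ * value and has not already reached zero.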
+ */ + if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { + if (nlink == 0) + clear_nlink(inode); + else + inode->i_nlink = nlink; + } +} + +static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) +{ + const struct gfs2_dinode *str = buf; + struct timespec atime; + u16 height, depth; + + if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) + goto corrupt; + ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); + ip->i_inode.i_mode = be32_to_cpu(str->di_mode); + ip->i_inode.i_rdev = 0; + switch (ip->i_inode.i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); + break; + }; + + ip->i_inode.i_uid = be32_to_cpu(str->di_uid); + ip->i_inode.i_gid = be32_to_cpu(str->di_gid); + gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); + gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); + atime.tv_sec = be64_to_cpu(str->di_atime); + atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) + ip->i_inode.i_atime = atime; + ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); + ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); + ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); + ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + + ip->i_goal = be64_to_cpu(str->di_goal_meta); + ip->i_generation = be64_to_cpu(str->di_generation); + + ip->i_diskflags = be32_to_cpu(str->di_flags); + gfs2_set_inode_flags(&ip->i_inode); + height = be16_to_cpu(str->di_height); + if (unlikely(height > GFS2_MAX_META_HEIGHT)) + goto corrupt; + ip->i_height = (u8)height; + + depth = be16_to_cpu(str->di_depth); + if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) + goto corrupt; + ip->i_depth = (u8)depth; + ip->i_entries = be32_to_cpu(str->di_entries); + + ip->i_eattr = be64_to_cpu(str->di_eattr); + if (S_ISREG(ip->i_inode.i_mode)) + gfs2_set_aops(&ip->i_inode); + + return 0; +corrupt: + gfs2_consist_inode(ip); + return -EIO; +} + +/** + * gfs2_inode_refresh - Refresh the incore copy of the dinode + * @ip: The GFS2 inode + * + * Returns: errno + */ + +int gfs2_inode_refresh(struct gfs2_inode *ip) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) { + brelse(dibh); + return -EIO; + } + + error = gfs2_dinode_in(ip, dibh->b_data); + brelse(dibh); + clear_bit(GIF_INVALID, &ip->i_flags); + + return error; +} + +/** + * inode_go_lock - operation done after an inode lock is locked by a process + * @gl: the glock + * @flags: + * + * Returns: errno + */ + +static int inode_go_lock(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = gl->gl_object; + int error = 0; + + if (!ip || (gh->gh_flags & GL_SKIP)) + return 0; + + if (test_bit(GIF_INVALID, &ip->i_flags)) { + error = gfs2_inode_refresh(ip); + if (error) + return error; + } + + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && + (gl->gl_state == LM_ST_EXCLUSIVE) && + (gh->gh_state == LM_ST_EXCLUSIVE)) { + spin_lock(&sdp->sd_trunc_lock); + if (list_empty(&ip->i_trunc_list)) + list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + wake_up(&sdp->sd_quota_wait); + return 1; + } + + return error; +} + +/** + * inode_go_dump - print information about an inode + * @seq: The 
iterator + * @ip: the inode + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_inode *ip = gl->gl_object; + if (ip == NULL) + return 0; + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + IF2DT(ip->i_inode.i_mode), ip->i_flags, + (unsigned int)ip->i_diskflags, + (unsigned long long)i_size_read(&ip->i_inode)); + return 0; +} + +/** + * rgrp_go_lock - operation done after an rgrp lock is locked by + * a first holder on this node. + * @gl: the glock + * @flags: + * + * Returns: errno + */ + +static int rgrp_go_lock(struct gfs2_holder *gh) +{ + return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); +} + +/** + * rgrp_go_unlock - operation done before an rgrp lock is unlocked by + * a last holder on this node. + * @gl: the glock + * @flags: + * + */ + +static void rgrp_go_unlock(struct gfs2_holder *gh) +{ + gfs2_rgrp_bh_put(gh->gh_gl->gl_object); +} + +/** + * trans_go_sync - promote/demote the transaction glock + * @gl: the glock + * @state: the requested state + * @flags: + * + */ + +static void trans_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (gl->gl_state != LM_ST_UNLOCKED && + test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + } +} + +/** + * trans_go_xmote_bh - After promoting/demoting the transaction glock + * @gl: the glock + * + */ + +static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_log_header_host head; + int error; + + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + gfs2_consist(sdp); + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) + gfs2_consist(sdp); + + /* Initialize some head of the log stuff */ + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + } + } + return 0; +} + +/** + * trans_go_demote_ok + * @gl: the glock + * + * Always returns 0 + */ + +static int trans_go_demote_ok(const struct gfs2_glock *gl) +{ + return 0; +} + +/** + * iopen_go_callback - schedule the dcache entry for the inode to be deleted + * @gl: the glock + * + * gl_spin lock is held while calling this + */ +static void iopen_go_callback(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return; + + if (gl->gl_demote_state == LM_ST_UNLOCKED && + gl->gl_state == LM_ST_SHARED && ip) { + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put_nolock(gl); + } +} + +const struct gfs2_glock_operations gfs2_meta_glops = { + .go_type = LM_TYPE_META, +}; + +const struct gfs2_glock_operations gfs2_inode_glops = { + .go_xmote_th = inode_go_sync, + .go_inval = inode_go_inval, + .go_demote_ok = inode_go_demote_ok, + .go_lock = inode_go_lock, + .go_dump = inode_go_dump, + .go_type = LM_TYPE_INODE, + .go_min_hold_time = HZ / 5, + .go_flags = GLOF_ASPACE, +}; + +const struct gfs2_glock_operations gfs2_rgrp_glops = { + .go_xmote_th = rgrp_go_sync, + .go_inval = rgrp_go_inval, + .go_lock = 
rgrp_go_lock, + .go_unlock = rgrp_go_unlock, + .go_dump = gfs2_rgrp_dump, + .go_type = LM_TYPE_RGRP, + .go_min_hold_time = HZ / 5, + .go_flags = GLOF_ASPACE, +}; + +const struct gfs2_glock_operations gfs2_trans_glops = { + .go_xmote_th = trans_go_sync, + .go_xmote_bh = trans_go_xmote_bh, + .go_demote_ok = trans_go_demote_ok, + .go_type = LM_TYPE_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_iopen_glops = { + .go_type = LM_TYPE_IOPEN, + .go_callback = iopen_go_callback, +}; + +const struct gfs2_glock_operations gfs2_flock_glops = { + .go_type = LM_TYPE_FLOCK, +}; + +const struct gfs2_glock_operations gfs2_nondisk_glops = { + .go_type = LM_TYPE_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_quota_glops = { + .go_type = LM_TYPE_QUOTA, +}; + +const struct gfs2_glock_operations gfs2_journal_glops = { + .go_type = LM_TYPE_JOURNAL, +}; + +const struct gfs2_glock_operations *gfs2_glops_list[] = { + [LM_TYPE_META] = &gfs2_meta_glops, + [LM_TYPE_INODE] = &gfs2_inode_glops, + [LM_TYPE_RGRP] = &gfs2_rgrp_glops, + [LM_TYPE_IOPEN] = &gfs2_iopen_glops, + [LM_TYPE_FLOCK] = &gfs2_flock_glops, + [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, + [LM_TYPE_QUOTA] = &gfs2_quota_glops, + [LM_TYPE_JOURNAL] = &gfs2_journal_glops, +}; + diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h new file mode 100644 index 00000000..6fce409b --- /dev/null +++ b/fs/gfs2/glops.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GLOPS_DOT_H__ +#define __GLOPS_DOT_H__ + +#include "incore.h" + +extern const struct gfs2_glock_operations gfs2_meta_glops; +extern const struct gfs2_glock_operations gfs2_inode_glops; +extern const struct gfs2_glock_operations gfs2_rgrp_glops; +extern const struct gfs2_glock_operations gfs2_trans_glops; +extern const struct gfs2_glock_operations gfs2_iopen_glops; +extern const struct gfs2_glock_operations gfs2_flock_glops; +extern const struct gfs2_glock_operations gfs2_nondisk_glops; +extern const struct gfs2_glock_operations gfs2_quota_glops; +extern const struct gfs2_glock_operations gfs2_journal_glops; +extern const struct gfs2_glock_operations *gfs2_glops_list[]; + +extern void gfs2_ail_flush(struct gfs2_glock *gl); + +#endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h new file mode 100644 index 00000000..81206e70 --- /dev/null +++ b/fs/gfs2/incore.h @@ -0,0 +1,686 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#ifndef __INCORE_DOT_H__ +#define __INCORE_DOT_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DIO_WAIT 0x00000010 +#define DIO_METADATA 0x00000020 + +struct gfs2_log_operations; +struct gfs2_log_element; +struct gfs2_holder; +struct gfs2_glock; +struct gfs2_quota_data; +struct gfs2_trans; +struct gfs2_ail; +struct gfs2_jdesc; +struct gfs2_sbd; +struct lm_lockops; + +typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); + +struct gfs2_log_header_host { + u64 lh_sequence; /* Sequence number of this transaction */ + u32 lh_flags; /* GFS2_LOG_HEAD_... */ + u32 lh_tail; /* Block number of log tail */ + u32 lh_blkno; + u32 lh_hash; +}; + +/* + * Structure of operations that are associated with each + * type of element in the log. + */ + +struct gfs2_log_operations { + void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); + void (*lo_before_commit) (struct gfs2_sbd *sdp); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); + void (*lo_before_scan) (struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass); + int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass); + void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass); + const char *lo_name; +}; + +struct gfs2_log_element { + struct list_head le_list; + const struct gfs2_log_operations *le_ops; +}; + +#define GBF_FULL 1 + +struct gfs2_bitmap { + struct buffer_head *bi_bh; + char *bi_clone; + unsigned long bi_flags; + u32 bi_offset; + u32 bi_start; + u32 bi_len; +}; + +struct gfs2_rgrpd { + struct list_head rd_list; /* Link with superblock */ + struct list_head rd_list_mru; + struct gfs2_glock *rd_gl; /* Glock for this rgrp */ + u64 rd_addr; /* grp block disk address */ + u64 rd_data0; /* first data location */ + u32 rd_length; /* length of rgrp header in fs blocks */ + u32 rd_data; /* num of data blocks in rgrp */ + u32 rd_bitbytes; /* number of bytes in data bitmaps */ + u32 rd_free; + u32 rd_free_clone; + u32 rd_dinodes; + u64 rd_igeneration; + struct gfs2_bitmap *rd_bits; + struct mutex rd_mutex; + struct gfs2_log_element rd_le; + struct gfs2_sbd *rd_sbd; + unsigned int rd_bh_count; + u32 rd_last_alloc; + u32 rd_flags; +#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ +#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ +#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ +}; + +enum gfs2_state_bits { + BH_Pinned = BH_PrivateStart, + BH_Escaped = BH_PrivateStart + 1, +}; + +BUFFER_FNS(Pinned, pinned) +TAS_BUFFER_FNS(Pinned, pinned) +BUFFER_FNS(Escaped, escaped) +TAS_BUFFER_FNS(Escaped, escaped) + +struct gfs2_bufdata { + struct buffer_head *bd_bh; + struct gfs2_glock *bd_gl; + + union { + struct list_head list_tr; + u64 blkno; + } u; +#define bd_list_tr u.list_tr +#define bd_blkno u.blkno + + struct gfs2_log_element bd_le; + + struct gfs2_ail *bd_ail; + struct list_head bd_ail_st_list; + struct list_head bd_ail_gl_list; +}; + +/* + * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a + * prefix of lock_dlm_ gets awkward. 
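+ *
+ * DFL_BLOCK_LOCKS is consulted by gfs2_glock_complete() to decide whether
+ * an incoming dlm reply should be frozen (GLF_FROZEN) rather than
+ * processed immediately, typically while the lock module is recovering.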
+ */ + +#define GDLM_STRNAME_BYTES 25 +#define GDLM_LVB_SIZE 32 + +enum { + DFL_BLOCK_LOCKS = 0, +}; + +struct lm_lockname { + u64 ln_number; + unsigned int ln_type; +}; + +#define lm_name_equal(name1, name2) \ + (((name1)->ln_number == (name2)->ln_number) && \ + ((name1)->ln_type == (name2)->ln_type)) + + +struct gfs2_glock_operations { + void (*go_xmote_th) (struct gfs2_glock *gl); + int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); + void (*go_inval) (struct gfs2_glock *gl, int flags); + int (*go_demote_ok) (const struct gfs2_glock *gl); + int (*go_lock) (struct gfs2_holder *gh); + void (*go_unlock) (struct gfs2_holder *gh); + int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_callback) (struct gfs2_glock *gl); + const int go_type; + const unsigned long go_min_hold_time; + const unsigned long go_flags; +#define GLOF_ASPACE 1 +}; + +enum { + /* States */ + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ + HIF_FIRST = 7, + HIF_WAIT = 10, +}; + +struct gfs2_holder { + struct list_head gh_list; + + struct gfs2_glock *gh_gl; + struct pid *gh_owner_pid; + unsigned int gh_state; + unsigned gh_flags; + + int gh_error; + unsigned long gh_iflags; /* HIF_... */ + unsigned long gh_ip; +}; + +enum { + GLF_LOCK = 1, + GLF_DEMOTE = 3, + GLF_PENDING_DEMOTE = 4, + GLF_DEMOTE_IN_PROGRESS = 5, + GLF_DIRTY = 6, + GLF_LFLUSH = 7, + GLF_INVALIDATE_IN_PROGRESS = 8, + GLF_REPLY_PENDING = 9, + GLF_INITIAL = 10, + GLF_FROZEN = 11, + GLF_QUEUED = 12, + GLF_LRU = 13, + GLF_OBJECT = 14, /* Used only for tracing */ +}; + +struct gfs2_glock { + struct hlist_bl_node gl_list; + unsigned long gl_flags; /* GLF_... */ + struct lm_lockname gl_name; + atomic_t gl_ref; + + spinlock_t gl_spin; + + /* State fields protected by gl_spin */ + unsigned int gl_state:2, /* Current state */ + gl_target:2, /* Target state */ + gl_demote_state:2, /* State requested by remote node */ + gl_req:2, /* State in last dlm request */ + gl_reply:8; /* Last reply from the dlm */ + + unsigned int gl_hash; + unsigned long gl_demote_time; /* time of first demote request */ + struct list_head gl_holders; + + const struct gfs2_glock_operations *gl_ops; + char gl_strname[GDLM_STRNAME_BYTES]; + struct dlm_lksb gl_lksb; + char gl_lvb[32]; + unsigned long gl_tchange; + void *gl_object; + + struct list_head gl_lru; + + struct gfs2_sbd *gl_sbd; + + struct list_head gl_ail_list; + atomic_t gl_ail_count; + atomic_t gl_revokes; + struct delayed_work gl_work; + struct work_struct gl_delete; + struct rcu_head gl_rcu; +}; + +#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ + +struct gfs2_alloc { + /* Quota stuff */ + + struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; + struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; + unsigned int al_qd_num; + + u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */ + u32 al_alloced; /* Filled in by gfs2_alloc_*() */ + + /* Filled in by gfs2_inplace_reserve() */ + + unsigned int al_line; + char *al_file; + struct gfs2_holder al_ri_gh; + struct gfs2_holder al_rgd_gh; + struct gfs2_rgrpd *al_rgd; + +}; + +enum { + GIF_INVALID = 0, + GIF_QD_LOCKED = 1, + GIF_SW_PAGED = 3, +}; + + +struct gfs2_inode { + struct inode i_inode; + u64 i_no_addr; + u64 i_no_formal_ino; + u64 i_generation; + u64 i_eattr; + unsigned long i_flags; /* GIF_... */ + struct gfs2_glock *i_gl; /* Move into i_gh? 
*/ + struct gfs2_holder i_iopen_gh; + struct gfs2_holder i_gh; /* for prepare/commit_write only */ + struct gfs2_alloc *i_alloc; + u64 i_goal; /* goal block for allocations */ + struct rw_semaphore i_rw_mutex; + struct list_head i_trunc_list; + u32 i_entries; + u32 i_diskflags; + u8 i_height; + u8 i_depth; +}; + +/* + * Since i_inode is the first element of struct gfs2_inode, + * this is effectively a cast. + */ +static inline struct gfs2_inode *GFS2_I(struct inode *inode) +{ + return container_of(inode, struct gfs2_inode, i_inode); +} + +static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode) +{ + return inode->i_sb->s_fs_info; +} + +struct gfs2_file { + struct mutex f_fl_mutex; + struct gfs2_holder f_fl_gh; +}; + +struct gfs2_revoke_replay { + struct list_head rr_list; + u64 rr_blkno; + unsigned int rr_where; +}; + +enum { + QDF_USER = 0, + QDF_CHANGE = 1, + QDF_LOCKED = 2, + QDF_REFRESH = 3, +}; + +struct gfs2_quota_data { + struct list_head qd_list; + struct list_head qd_reclaim; + + atomic_t qd_count; + + u32 qd_id; + unsigned long qd_flags; /* QDF_... */ + + s64 qd_change; + s64 qd_change_sync; + + unsigned int qd_slot; + unsigned int qd_slot_count; + + struct buffer_head *qd_bh; + struct gfs2_quota_change *qd_bh_qc; + unsigned int qd_bh_count; + + struct gfs2_glock *qd_gl; + struct gfs2_quota_lvb qd_qb; + + u64 qd_sync_gen; + unsigned long qd_last_warn; +}; + +struct gfs2_trans { + unsigned long tr_ip; + + unsigned int tr_blocks; + unsigned int tr_revokes; + unsigned int tr_reserved; + + struct gfs2_holder tr_t_gh; + + int tr_touched; + + unsigned int tr_num_buf; + unsigned int tr_num_buf_new; + unsigned int tr_num_databuf_new; + unsigned int tr_num_buf_rm; + unsigned int tr_num_databuf_rm; + struct list_head tr_list_buf; + + unsigned int tr_num_revoke; + unsigned int tr_num_revoke_rm; +}; + +struct gfs2_ail { + struct list_head ai_list; + + unsigned int ai_first; + struct list_head ai_ail1_list; + struct list_head ai_ail2_list; +}; + +struct gfs2_journal_extent { + struct list_head extent_list; + + unsigned int lblock; /* First logical block */ + u64 dblock; /* First disk block */ + u64 blocks; +}; + +struct gfs2_jdesc { + struct list_head jd_list; + struct list_head extent_list; + struct work_struct jd_work; + struct inode *jd_inode; + unsigned long jd_flags; +#define JDF_RECOVERY 1 + unsigned int jd_jid; + unsigned int jd_blocks; +}; + +struct gfs2_statfs_change_host { + s64 sc_total; + s64 sc_free; + s64 sc_dinodes; +}; + +#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF +#define GFS2_QUOTA_OFF 0 +#define GFS2_QUOTA_ACCOUNT 1 +#define GFS2_QUOTA_ON 2 + +#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED +#define GFS2_DATA_WRITEBACK 1 +#define GFS2_DATA_ORDERED 2 + +#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW +#define GFS2_ERRORS_WITHDRAW 0 +#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ +#define GFS2_ERRORS_RO 2 /* place holder for future feature */ +#define GFS2_ERRORS_PANIC 3 + +struct gfs2_args { + char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ + char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ + char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ + unsigned int ar_spectator:1; /* Don't get a journal */ + unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ + unsigned int ar_debug:1; /* Oops on errors */ + unsigned int ar_posix_acl:1; /* Enable posix acls */ + unsigned int ar_quota:2; /* off/account/on */ + unsigned int ar_suiddir:1; /* suiddir support */ + unsigned int ar_data:2; /* 
ordered/writeback */ + unsigned int ar_meta:1; /* mount metafs */ + unsigned int ar_discard:1; /* discard requests */ + unsigned int ar_errors:2; /* errors=withdraw | panic */ + unsigned int ar_nobarrier:1; /* do not send barriers */ + int ar_commit; /* Commit interval */ + int ar_statfs_quantum; /* The fast statfs interval */ + int ar_quota_quantum; /* The quota interval */ + int ar_statfs_percent; /* The % change to force sync */ +}; + +struct gfs2_tune { + spinlock_t gt_spin; + + unsigned int gt_logd_secs; + + unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ + unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ + unsigned int gt_quota_scale_num; /* Numerator */ + unsigned int gt_quota_scale_den; /* Denominator */ + unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ + unsigned int gt_new_files_jdata; + unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ + unsigned int gt_complain_secs; + unsigned int gt_statfs_quantum; + unsigned int gt_statfs_slow; +}; + +enum { + SDF_JOURNAL_CHECKED = 0, + SDF_JOURNAL_LIVE = 1, + SDF_SHUTDOWN = 2, + SDF_NOBARRIERS = 3, + SDF_NORECOVERY = 4, + SDF_DEMOTE = 5, + SDF_NOJOURNALID = 6, +}; + +#define GFS2_FSNAME_LEN 256 + +struct gfs2_inum_host { + u64 no_formal_ino; + u64 no_addr; +}; + +struct gfs2_sb_host { + u32 sb_magic; + u32 sb_type; + u32 sb_format; + + u32 sb_fs_format; + u32 sb_multihost_format; + u32 sb_bsize; + u32 sb_bsize_shift; + + struct gfs2_inum_host sb_master_dir; + struct gfs2_inum_host sb_root_dir; + + char sb_lockproto[GFS2_LOCKNAME_LEN]; + char sb_locktable[GFS2_LOCKNAME_LEN]; +}; + +/* + * lm_mount() return values + * + * ls_jid - the journal ID this node should use + * ls_first - this node is the first to mount the file system + * ls_lockspace - lock module's context for this file system + * ls_ops - lock module's functions + */ + +struct lm_lockstruct { + int ls_jid; + unsigned int ls_first; + unsigned int ls_first_done; + unsigned int ls_nodir; + const struct lm_lockops *ls_ops; + unsigned long ls_flags; + dlm_lockspace_t *ls_dlm; + + int ls_recover_jid_done; + int ls_recover_jid_status; +}; + +struct gfs2_sbd { + struct super_block *sd_vfs; + struct kobject sd_kobj; + unsigned long sd_flags; /* SDF_... 
*/ + struct gfs2_sb_host sd_sb; + + /* Constants computed on mount */ + + u32 sd_fsb2bb; + u32 sd_fsb2bb_shift; + u32 sd_diptrs; /* Number of pointers in a dinode */ + u32 sd_inptrs; /* Number of pointers in a indirect block */ + u32 sd_jbsize; /* Size of a journaled data block */ + u32 sd_hash_bsize; /* sizeof(exhash block) */ + u32 sd_hash_bsize_shift; + u32 sd_hash_ptrs; /* Number of pointers in a hash block */ + u32 sd_qc_per_block; + u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ + u32 sd_max_height; /* Max height of a file's metadata tree */ + u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; + u32 sd_max_jheight; /* Max height of journaled file's meta tree */ + u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; + + struct gfs2_args sd_args; /* Mount arguments */ + struct gfs2_tune sd_tune; /* Filesystem tuning structure */ + + /* Lock Stuff */ + + struct lm_lockstruct sd_lockstruct; + struct gfs2_holder sd_live_gh; + struct gfs2_glock *sd_rename_gl; + struct gfs2_glock *sd_trans_gl; + wait_queue_head_t sd_glock_wait; + atomic_t sd_glock_disposal; + struct completion sd_locking_init; + + /* Inode Stuff */ + + struct dentry *sd_master_dir; + struct dentry *sd_root_dir; + + struct inode *sd_jindex; + struct inode *sd_statfs_inode; + struct inode *sd_sc_inode; + struct inode *sd_qc_inode; + struct inode *sd_rindex; + struct inode *sd_quota_inode; + + /* StatFS stuff */ + + spinlock_t sd_statfs_spin; + struct gfs2_statfs_change_host sd_statfs_master; + struct gfs2_statfs_change_host sd_statfs_local; + int sd_statfs_force_sync; + + /* Resource group stuff */ + + int sd_rindex_uptodate; + spinlock_t sd_rindex_spin; + struct mutex sd_rindex_mutex; + struct list_head sd_rindex_list; + struct list_head sd_rindex_mru_list; + struct gfs2_rgrpd *sd_rindex_forward; + unsigned int sd_rgrps; + unsigned int sd_max_rg_data; + + /* Journal index stuff */ + + struct list_head sd_jindex_list; + spinlock_t sd_jindex_spin; + struct mutex sd_jindex_mutex; + unsigned int sd_journals; + + struct gfs2_jdesc *sd_jdesc; + struct gfs2_holder sd_journal_gh; + struct gfs2_holder sd_jinode_gh; + + struct gfs2_holder sd_sc_gh; + struct gfs2_holder sd_qc_gh; + + /* Daemon stuff */ + + struct task_struct *sd_logd_process; + struct task_struct *sd_quotad_process; + + /* Quota stuff */ + + struct list_head sd_quota_list; + atomic_t sd_quota_count; + struct mutex sd_quota_mutex; + wait_queue_head_t sd_quota_wait; + struct list_head sd_trunc_list; + spinlock_t sd_trunc_lock; + + unsigned int sd_quota_slots; + unsigned int sd_quota_chunks; + unsigned char **sd_quota_bitmap; + + u64 sd_quota_sync_gen; + + /* Log stuff */ + + spinlock_t sd_log_lock; + + unsigned int sd_log_blks_reserved; + unsigned int sd_log_commited_buf; + unsigned int sd_log_commited_databuf; + int sd_log_commited_revoke; + + atomic_t sd_log_pinned; + unsigned int sd_log_num_buf; + unsigned int sd_log_num_revoke; + unsigned int sd_log_num_rg; + unsigned int sd_log_num_databuf; + + struct list_head sd_log_le_buf; + struct list_head sd_log_le_revoke; + struct list_head sd_log_le_rg; + struct list_head sd_log_le_databuf; + struct list_head sd_log_le_ordered; + + atomic_t sd_log_thresh1; + atomic_t sd_log_thresh2; + atomic_t sd_log_blks_free; + wait_queue_head_t sd_log_waitq; + wait_queue_head_t sd_logd_waitq; + + u64 sd_log_sequence; + unsigned int sd_log_head; + unsigned int sd_log_tail; + int sd_log_idle; + + struct rw_semaphore sd_log_flush_lock; + atomic_t sd_log_in_flight; + wait_queue_head_t sd_log_flush_wait; + + unsigned int 
sd_log_flush_head; + u64 sd_log_flush_wrapped; + + spinlock_t sd_ail_lock; + struct list_head sd_ail1_list; + struct list_head sd_ail2_list; + + /* Replay stuff */ + + struct list_head sd_revoke_list; + unsigned int sd_replay_tail; + + unsigned int sd_found_blocks; + unsigned int sd_found_revokes; + unsigned int sd_replayed_blocks; + + /* For quiescing the filesystem */ + + struct gfs2_holder sd_freeze_gh; + struct mutex sd_freeze_lock; + unsigned int sd_freeze_count; + + char sd_fsname[GFS2_FSNAME_LEN]; + char sd_table_name[GFS2_FSNAME_LEN]; + char sd_proto_name[GFS2_FSNAME_LEN]; + + /* Debugging crud */ + + unsigned long sd_last_warning; + struct dentry *debugfs_dir; /* debugfs directory */ + struct dentry *debugfs_dentry_glocks; /* for debugfs */ +}; + +#endif /* __INCORE_DOT_H__ */ + diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c new file mode 100644 index 00000000..03e0c529 --- /dev/null +++ b/fs/gfs2/inode.c @@ -0,0 +1,1892 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "bmap.h" +#include "dir.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "super.h" +#include "glops.h" + +struct gfs2_skip_data { + u64 no_addr; + int skipped; + int non_block; +}; + +static int iget_test(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_skip_data *data = opaque; + + if (ip->i_no_addr == data->no_addr) { + if (data->non_block && + inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { + data->skipped = 1; + return 0; + } + return 1; + } + return 0; +} + +static int iget_set(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_skip_data *data = opaque; + + if (data->skipped) + return -ENOENT; + inode->i_ino = (unsigned long)(data->no_addr); + ip->i_no_addr = data->no_addr; + return 0; +} + +struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block) +{ + unsigned long hash = (unsigned long)no_addr; + struct gfs2_skip_data data; + + data.no_addr = no_addr; + data.skipped = 0; + data.non_block = non_block; + return ilookup5(sb, hash, iget_test, &data); +} + +static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr, + int non_block) +{ + struct gfs2_skip_data data; + unsigned long hash = (unsigned long)no_addr; + + data.no_addr = no_addr; + data.skipped = 0; + data.non_block = non_block; + return iget5_locked(sb, hash, iget_test, iget_set, &data); +} + +/** + * gfs2_set_iop - Sets inode operations + * @inode: The inode with correct i_mode filled in + * + * GFS2 lookup code fills in vfs inode contents based on info obtained + * from directory entry inside gfs2_inode_lookup(). 
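+ * Based on i_mode, the inode is given the file, directory or symlink
+ * operation tables; anything else is handled via init_special_inode().
+ * When localflocks is in use, the *_nolock file operations are selected
+ * instead.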
+ */ + +static void gfs2_set_iop(struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + umode_t mode = inode->i_mode; + + if (S_ISREG(mode)) { + inode->i_op = &gfs2_file_iops; + if (gfs2_localflocks(sdp)) + inode->i_fop = &gfs2_file_fops_nolock; + else + inode->i_fop = &gfs2_file_fops; + } else if (S_ISDIR(mode)) { + inode->i_op = &gfs2_dir_iops; + if (gfs2_localflocks(sdp)) + inode->i_fop = &gfs2_dir_fops_nolock; + else + inode->i_fop = &gfs2_dir_fops; + } else if (S_ISLNK(mode)) { + inode->i_op = &gfs2_symlink_iops; + } else { + inode->i_op = &gfs2_file_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } +} + +/** + * gfs2_inode_lookup - Lookup an inode + * @sb: The super block + * @no_addr: The inode number + * @type: The type of the inode + * non_block: Can we block on inodes that are being freed? + * + * Returns: A VFS inode, or an error + */ + +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, + u64 no_addr, u64 no_formal_ino, int non_block) +{ + struct inode *inode; + struct gfs2_inode *ip; + struct gfs2_glock *io_gl = NULL; + int error; + + inode = gfs2_iget(sb, no_addr, non_block); + ip = GFS2_I(inode); + + if (!inode) + return ERR_PTR(-ENOBUFS); + + if (inode->i_state & I_NEW) { + struct gfs2_sbd *sdp = GFS2_SB(inode); + ip->i_no_formal_ino = no_formal_ino; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (unlikely(error)) + goto fail; + ip->i_gl->gl_object = ip; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (unlikely(error)) + goto fail_put; + + set_bit(GIF_INVALID, &ip->i_flags); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + if (unlikely(error)) + goto fail_iopen; + + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + io_gl = NULL; + + if (type == DT_UNKNOWN) { + /* Inode glock must be locked already */ + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_refresh; + } else { + inode->i_mode = DT2IF(type); + } + + gfs2_set_iop(inode); + unlock_new_inode(inode); + } + + return inode; + +fail_refresh: + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); +fail_iopen: + if (io_gl) + gfs2_glock_put(io_gl); +fail_put: + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); +fail: + iget_failed(inode); + return ERR_PTR(error); +} + +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, unsigned int blktype) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder i_gh; + struct inode *inode = NULL; + int error; + + /* Must not read in block until block type is verified */ + error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, + LM_ST_EXCLUSIVE, GL_SKIP, &i_gh); + if (error) + return ERR_PTR(error); + + error = gfs2_check_blk_type(sdp, no_addr, blktype); + if (error) + goto fail; + + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1); + if (IS_ERR(inode)) + goto fail; + + /* Two extra checks for NFS only */ + if (no_formal_ino) { + error = -ESTALE; + if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) + goto fail_iput; + + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) + goto fail_iput; + + error = 0; + } + +fail: + gfs2_glock_dq_uninit(&i_gh); + return error ? 
ERR_PTR(error) : inode; +fail_iput: + iput(inode); + goto fail; +} + + +struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) +{ + struct qstr qstr; + struct inode *inode; + gfs2_str2qstr(&qstr, name); + inode = gfs2_lookupi(dip, &qstr, 1); + /* gfs2_lookupi has inconsistent callers: vfs + * related routines expect NULL for no entry found, + * gfs2_lookup_simple callers expect ENOENT + * and do not check for NULL. + */ + if (inode == NULL) + return ERR_PTR(-ENOENT); + else + return inode; +} + + +/** + * gfs2_lookupi - Look up a filename in a directory and return its inode + * @d_gh: An initialized holder for the directory glock + * @name: The name of the inode to look for + * @is_root: If 1, ignore the caller's permissions + * @i_gh: An uninitialized holder for the new inode glock + * + * This can be called via the VFS filldir function when NFS is doing + * a readdirplus and the inode which its intending to stat isn't + * already in cache. In this case we must not take the directory glock + * again, since the readdir call will have already taken that lock. + * + * Returns: errno + */ + +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root) +{ + struct super_block *sb = dir->i_sb; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + int error = 0; + struct inode *inode = NULL; + int unlock = 0; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return ERR_PTR(-ENAMETOOLONG); + + if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) || + (name->len == 2 && memcmp(name->name, "..", 2) == 0 && + dir == sb->s_root->d_inode)) { + igrab(dir); + return dir; + } + + if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return ERR_PTR(error); + unlock = 1; + } + + if (!is_root) { + error = gfs2_permission(dir, MAY_EXEC, 0); + if (error) + goto out; + } + + inode = gfs2_dir_search(dir, name); + if (IS_ERR(inode)) + error = PTR_ERR(inode); +out: + if (unlock) + gfs2_glock_dq_uninit(&d_gh); + if (error == -ENOENT) + return NULL; + return inode ? inode : ERR_PTR(error); +} + +/** + * create_ok - OK to create a new on-disk inode here? 
+ * @dip: Directory in which dinode is to be created + * @name: Name of new dinode + * @mode: + * + * Returns: errno + */ + +static int create_ok(struct gfs2_inode *dip, const struct qstr *name, + unsigned int mode) +{ + int error; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); + if (error) + return error; + + /* Don't create entries in an unlinked directory */ + if (!dip->i_inode.i_nlink) + return -ENOENT; + + error = gfs2_dir_check(&dip->i_inode, name, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + return -EEXIST; + default: + return error; + } + + if (dip->i_entries == (u32)-1) + return -EFBIG; + if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) + return -EMLINK; + + return 0; +} + +static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode, + unsigned int *uid, unsigned int *gid) +{ + if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && + (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { + if (S_ISDIR(*mode)) + *mode |= S_ISUID; + else if (dip->i_inode.i_uid != current_fsuid()) + *mode &= ~07111; + *uid = dip->i_inode.i_uid; + } else + *uid = current_fsuid(); + + if (dip->i_inode.i_mode & S_ISGID) { + if (S_ISDIR(*mode)) + *mode |= S_ISGID; + *gid = dip->i_inode.i_gid; + } else + *gid = current_fsgid(); +} + +static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + int error; + + if (gfs2_alloc_get(dip) == NULL) + return -ENOMEM; + + dip->i_alloc->al_requested = RES_DINODE; + error = gfs2_inplace_reserve(dip); + if (error) + goto out; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0); + if (error) + goto out_ipreserv; + + error = gfs2_alloc_di(dip, no_addr, generation); + + gfs2_trans_end(sdp); + +out_ipreserv: + gfs2_inplace_release(dip); +out: + gfs2_alloc_put(dip); + return error; +} + +static void gfs2_init_dir(struct buffer_head *dibh, + const struct gfs2_inode *parent) +{ + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); + + gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); + dent->de_inum = di->di_num; /* already GFS2 endian */ + dent->de_type = cpu_to_be16(DT_DIR); + + dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); + gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + gfs2_inum_out(parent, dent); + dent->de_type = cpu_to_be16(DT_DIR); + +} + +/** + * init_dinode - Fill in a new dinode structure + * @dip: The directory this inode is being created in + * @gl: The glock covering the new inode + * @inum: The inode number + * @mode: The file permissions + * @uid: The uid of the new inode + * @gid: The gid of the new inode + * @generation: The generation number of the new inode + * @dev: The device number (if a device node) + * @symname: The symlink destination (if a symlink) + * @size: The inode size (ignored for directories) + * @bhp: The buffer head (returned to caller) + * + */ + +static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, + const struct gfs2_inum_host *inum, unsigned int mode, + unsigned int uid, unsigned int gid, + const u64 *generation, dev_t dev, const char *symname, + unsigned size, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_dinode *di; + struct buffer_head *dibh; + struct timespec tv = CURRENT_TIME; + + dibh = gfs2_meta_new(gl, inum->no_addr); + gfs2_trans_add_bh(gl, dibh, 1); + 
gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + di = (struct gfs2_dinode *)dibh->b_data; + + di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); + di->di_num.no_addr = cpu_to_be64(inum->no_addr); + di->di_mode = cpu_to_be32(mode); + di->di_uid = cpu_to_be32(uid); + di->di_gid = cpu_to_be32(gid); + di->di_nlink = 0; + di->di_size = cpu_to_be64(size); + di->di_blocks = cpu_to_be64(1); + di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); + di->di_major = cpu_to_be32(MAJOR(dev)); + di->di_minor = cpu_to_be32(MINOR(dev)); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); + di->di_generation = cpu_to_be64(*generation); + di->di_flags = 0; + di->__pad1 = 0; + di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); + di->di_height = 0; + di->__pad2 = 0; + di->__pad3 = 0; + di->di_depth = 0; + di->di_entries = 0; + memset(&di->__pad4, 0, sizeof(di->__pad4)); + di->di_eattr = 0; + di->di_atime_nsec = cpu_to_be32(tv.tv_nsec); + di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); + di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); + memset(&di->di_reserved, 0, sizeof(di->di_reserved)); + + switch(mode & S_IFMT) { + case S_IFREG: + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); + break; + case S_IFDIR: + di->di_flags |= cpu_to_be32(dip->i_diskflags & + GFS2_DIF_INHERIT_JDATA); + di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); + di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); + di->di_entries = cpu_to_be32(2); + gfs2_init_dir(dibh, dip); + break; + case S_IFLNK: + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size); + break; + } + + set_buffer_uptodate(dibh); + + *bhp = dibh; +} + +static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, + unsigned int mode, const struct gfs2_inum_host *inum, + const u64 *generation, dev_t dev, const char *symname, + unsigned int size, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + unsigned int uid, gid; + int error; + + munge_mode_uid_gid(dip, &mode, &uid, &gid); + if (!gfs2_alloc_get(dip)) + return -ENOMEM; + + error = gfs2_quota_lock(dip, uid, gid); + if (error) + goto out; + + error = gfs2_quota_check(dip, uid, gid); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0); + if (error) + goto out_quota; + + init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp); + gfs2_quota_change(dip, +1, uid, gid); + gfs2_trans_end(sdp); + +out_quota: + gfs2_quota_unlock(dip); +out: + gfs2_alloc_put(dip); + return error; +} + +static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc *al; + int alloc_required; + struct buffer_head *dibh; + int error; + + al = gfs2_alloc_get(dip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto fail; + + error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); + if (alloc_required < 0) + goto fail_quota_locks; + if (alloc_required) { + error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); + if (error) + goto fail_quota_locks; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(dip); + if (error) + goto fail_quota_locks; + + error = gfs2_trans_begin(sdp, 
sdp->sd_max_dirres + + al->al_rgd->rd_length + + 2 * RES_DINODE + + RES_STATFS + RES_QUOTA, 0); + if (error) + goto fail_ipreserv; + } else { + error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0); + if (error) + goto fail_quota_locks; + } + + error = gfs2_dir_add(&dip->i_inode, name, ip); + if (error) + goto fail_end_trans; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto fail_end_trans; + inc_nlink(&ip->i_inode); + if (S_ISDIR(ip->i_inode.i_mode)) + inc_nlink(&ip->i_inode); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + return 0; + +fail_end_trans: + gfs2_trans_end(sdp); + +fail_ipreserv: + if (dip->i_alloc->al_rgd) + gfs2_inplace_release(dip); + +fail_quota_locks: + gfs2_quota_unlock(dip); + +fail: + gfs2_alloc_put(dip); + return error; +} + +static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, + const struct qstr *qstr) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, + &name, &value, &len); + + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, + GFS2_EATYPE_SECURITY); + kfree(value); + kfree(name); + + return err; +} + +/** + * gfs2_create_inode - Create a new inode + * @dir: The parent directory + * @dentry: The new dentry + * @mode: The permissions on the new inode + * @dev: For device nodes, this is the device number + * @symname: For symlinks, this is the link destination + * @size: The initial size of the inode (ignored for directories) + * + * Returns: 0 on success, or error code + */ + +static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, + unsigned int mode, dev_t dev, const char *symname, + unsigned int size) +{ + const struct qstr *name = &dentry->d_name; + struct gfs2_holder ghs[2]; + struct inode *inode = NULL; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; + int error; + u64 generation; + struct buffer_head *bh = NULL; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return -ENAMETOOLONG; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (error) + goto fail; + + error = create_ok(dip, name, mode); + if (error) + goto fail_gunlock; + + error = alloc_dinode(dip, &inum.no_addr, &generation); + if (error) + goto fail_gunlock; + inum.no_formal_ino = generation; + + error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, + LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_gunlock; + + error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh); + if (error) + goto fail_gunlock2; + + inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, + inum.no_formal_ino, 0); + if (IS_ERR(inode)) + goto fail_gunlock2; + + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_gunlock2; + + error = gfs2_acl_create(dip, inode); + if (error) + goto fail_gunlock2; + + error = gfs2_security_init(dip, GFS2_I(inode), name); + if (error) + goto fail_gunlock2; + + error = link_dinode(dip, name, GFS2_I(inode)); + if (error) + goto fail_gunlock2; + + if (bh) + brelse(bh); + + gfs2_trans_end(sdp); + if (dip->i_alloc->al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + gfs2_glock_dq_uninit_m(2, ghs); + mark_inode_dirty(inode); + d_instantiate(dentry, inode); + return 0; + +fail_gunlock2: + 
gfs2_glock_dq_uninit(ghs + 1); + if (inode && !IS_ERR(inode)) + iput(inode); +fail_gunlock: + gfs2_glock_dq_uninit(ghs); +fail: + if (bh) + brelse(bh); + return error; +} + +/** + * gfs2_create - Create a file + * @dir: The directory in which to create the file + * @dentry: The dentry of the new file + * @mode: The mode of the new file + * + * Returns: errno + */ + +static int gfs2_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct inode *inode; + int ret; + + for (;;) { + ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0); + if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL))) + return ret; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (inode) { + if (!IS_ERR(inode)) + break; + return PTR_ERR(inode); + } + } + + d_instantiate(dentry, inode); + return 0; +} + +/** + * gfs2_lookup - Look up a filename in a directory and return its inode + * @dir: The directory inode + * @dentry: The dentry of the new inode + * @nd: passed from Linux VFS, ignored by us + * + * Called by the VFS layer. Lock dir and call gfs2_lookupi() + * + * Returns: errno + */ + +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (inode && IS_ERR(inode)) + return ERR_CAST(inode); + + if (inode) { + struct gfs2_glock *gl = GFS2_I(inode)->i_gl; + struct gfs2_holder gh; + int error; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) { + iput(inode); + return ERR_PTR(error); + } + gfs2_glock_dq_uninit(&gh); + return d_splice_alias(inode, dentry); + } + d_add(dentry, inode); + + return NULL; +} + +/** + * gfs2_link - Link to a file + * @old_dentry: The inode to link + * @dir: Add link to this directory + * @dentry: The name of the link + * + * Link the inode in "old_dentry" into the directory "dir" with the + * name in "dentry". 
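+ *
+ * As a rough sketch of what happens below: if gfs2_diradd_alloc_required()
+ * reports that the directory must grow to take the new entry, the
+ * transaction is sized as
+ *
+ *	sdp->sd_max_dirres + gfs2_rg_blocks(al) +
+ *	2 * RES_DINODE + RES_STATFS + RES_QUOTA
+ *
+ * blocks (after a quota lock and an in-place reservation); otherwise a
+ * small 2 * RES_DINODE + RES_LEAF transaction is sufficient.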
+ * + * Returns: errno + */ + +static int gfs2_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = old_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder ghs[2]; + struct buffer_head *dibh; + int alloc_required; + int error; + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq(ghs); /* parent */ + if (error) + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_gunlock; + + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(dir, &dentry->d_name, NULL); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + } + + error = -EINVAL; + if (!dip->i_inode.i_nlink) + goto out_gunlock; + error = -EFBIG; + if (dip->i_entries == (u32)-1) + goto out_gunlock; + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out_gunlock; + error = -EINVAL; + if (!ip->i_inode.i_nlink) + goto out_gunlock; + error = -EMLINK; + if (ip->i_inode.i_nlink == (u32)-1) + goto out_gunlock; + + alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(dip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } + + error = gfs2_quota_lock_check(dip); + if (error) + goto out_alloc; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(dip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + gfs2_rg_blocks(al) + + 2 * RES_DINODE + RES_STATFS + + RES_QUOTA, 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); + if (error) + goto out_ipres; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(dir, &dentry->d_name, ip); + if (error) + goto out_brelse; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + inc_nlink(&ip->i_inode); + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(&ip->i_inode); + +out_brelse: + brelse(dibh); +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + if (alloc_required) + gfs2_inplace_release(dip); +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(dip); +out_alloc: + if (alloc_required) + gfs2_alloc_put(dip); +out_gunlock: + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + if (!error) { + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + } + return error; +} + +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. 
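+ *
+ * For example, when the sticky bit (S_ISVTX) is set on @dip, the check
+ * below amounts to (names simplified for illustration):
+ *
+ *	if (dip_uid != current_fsuid() && ip_uid != current_fsuid() &&
+ *	    !capable(CAP_FOWNER))
+ *		return -EPERM;
+ *
+ * i.e. only the file's owner, the directory's owner, or a CAP_FOWNER
+ * holder may remove the entry.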
+ * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip) +{ + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_inode.i_mode & S_ISVTX) && + dip->i_inode.i_uid != current_fsuid() && + ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); + if (error) + return error; + + error = gfs2_dir_check(&dip->i_inode, name, ip); + if (error) + return error; + + return 0; +} + +/** + * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it + * @dip: The parent directory + * @name: The name of the entry in the parent directory + * @bh: The inode buffer for the inode to be removed + * @inode: The inode to be removed + * + * Called with all the locks and in a transaction. This will only be + * called for a directory after it has been checked to ensure it is empty. + * + * Returns: 0 on success, or an error + */ + +static int gfs2_unlink_inode(struct gfs2_inode *dip, + const struct dentry *dentry, + struct buffer_head *bh) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = gfs2_dir_del(dip, dentry); + if (error) + return error; + + ip->i_entries = 0; + inode->i_ctime = CURRENT_TIME; + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else + drop_nlink(inode); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + mark_inode_dirty(inode); + if (inode->i_nlink == 0) + gfs2_unlink_di(inode); + return 0; +} + + +/** + * gfs2_unlink - Unlink an inode (this does rmdir as well) + * @dir: The inode of the directory containing the inode to unlink + * @dentry: The file itself + * + * This routine uses the type of the inode as a flag to figure out + * whether this is an unlink or an rmdir. 
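+ *
+ * Both directory operations are wired to this function in gfs2_dir_iops
+ * at the end of this file:
+ *
+ *	.unlink = gfs2_unlink,
+ *	.rmdir  = gfs2_unlink,
+ *
+ * and the S_ISDIR() tests below and in gfs2_unlink_inode() supply the
+ * rmdir-specific behaviour (the -ENOTEMPTY check, clear_nlink() rather
+ * than drop_nlink()).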
+ * + * Returns: errno + */ + +static int gfs2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + struct gfs2_holder ghs[3]; + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh; + int error; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + + + error = gfs2_glock_nq(ghs); /* parent */ + if (error) + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_rgrp; + + if (S_ISDIR(inode->i_mode)) { + error = -ENOTEMPTY; + if (ip->i_entries > 2 || inode->i_nlink > 2) + goto out_rgrp; + } + + error = gfs2_glock_nq(ghs + 2); /* rgrp */ + if (error) + goto out_rgrp; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_gunlock; + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_end_trans; + + error = gfs2_unlink_inode(dip, dentry, bh); + brelse(bh); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + gfs2_glock_dq(ghs + 2); +out_rgrp: + gfs2_holder_uninit(ghs + 2); + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_holder_uninit(ghs + 1); + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); + gfs2_glock_dq_uninit(&ri_gh); + return error; +} + +/** + * gfs2_symlink - Create a symlink + * @dir: The directory to create the symlink in + * @dentry: The dentry to put the symlink in + * @symname: The thing which the link points to + * + * Returns: errno + */ + +static int gfs2_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned int size; + + size = strlen(symname); + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) + return -ENAMETOOLONG; + + return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size); +} + +/** + * gfs2_mkdir - Make a directory + * @dir: The parent directory of the new one + * @dentry: The dentry of the new directory + * @mode: The mode of the new directory + * + * Returns: errno + */ + +static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0); +} + +/** + * gfs2_mknod - Make a special file + * @dir: The directory in which the special file will reside + * @dentry: The dentry of the special file + * @mode: The mode of the special file + * @dev: The device specification of the special file + * + */ + +static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t dev) +{ + return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0); +} + +/* + * gfs2_ok_to_move - check if it's ok to move a directory to another directory + * @this: move this + * @to: to here + * + * Follow @to back to the root and make sure we don't encounter @this + * Assumes we already hold the rename lock. 
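+ *
+ * For example, an attempt to rename "/a/b" into "/a/b/c" must fail: the
+ * loop below starts at "c" and repeatedly looks up gfs2_qdotdot (".."),
+ * reaching "b" == @this before it reaches the root, so -EINVAL is
+ * returned.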
+ * + * Returns: errno + */ + +static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +{ + struct inode *dir = &to->i_inode; + struct super_block *sb = dir->i_sb; + struct inode *tmp; + int error = 0; + + igrab(dir); + + for (;;) { + if (dir == &this->i_inode) { + error = -EINVAL; + break; + } + if (dir == sb->s_root->d_inode) { + error = 0; + break; + } + + tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); + if (IS_ERR(tmp)) { + error = PTR_ERR(tmp); + break; + } + + iput(dir); + dir = tmp; + } + + iput(dir); + + return error; +} + +/** + * gfs2_rename - Rename a file + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * + * Returns: errno + */ + +static int gfs2_rename(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *ip = GFS2_I(odentry->d_inode); + struct gfs2_inode *nip = NULL; + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; + struct gfs2_rgrpd *nrgd; + unsigned int num_gh; + int dir_rename = 0; + int alloc_required = 0; + unsigned int x; + int error; + + if (ndentry->d_inode) { + nip = GFS2_I(ndentry->d_inode); + if (ip == nip) + return 0; + } + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; + + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); + if (error) + goto out; + + if (S_ISDIR(ip->i_inode.i_mode)) { + dir_rename = 1; + /* don't move a dirctory into it's subdir */ + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + if (nip) { + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + /* grab the resource lock for unlink flag twiddling + * this is the case of the target file already existing + * so we unlink before doing the rename + */ + nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr); + if (nrgd) + gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); + } + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } + + error = -ENOENT; + if (ip->i_inode.i_nlink == 0) + goto out_gunlock; + + /* Check out the old directory */ + + error = gfs2_unlink_ok(odip, &odentry->d_name, ip); + if (error) + goto out_gunlock; + + /* Check out the new directory */ + + if (nip) { + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (nip->i_inode.i_nlink == 0) { + error = -EAGAIN; + goto out_gunlock; + } + + if (S_ISDIR(nip->i_inode.i_mode)) { + if (nip->i_entries < 2) { + gfs2_consist_inode(nip); + error = -EIO; + goto out_gunlock; + } + if (nip->i_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + } + } else { + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + }; + + if (odip != ndip) { + if (!ndip->i_inode.i_nlink) { + error = -ENOENT; + goto out_gunlock; + } + if 
(ndip->i_entries == (u32)-1) { + error = -EFBIG; + goto out_gunlock; + } + if (S_ISDIR(ip->i_inode.i_mode) && + ndip->i_inode.i_nlink == (u32)-1) { + error = -EMLINK; + goto out_gunlock; + } + } + } + + /* Check out the dir to be renamed */ + + if (dir_rename) { + error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0); + if (error) + goto out_gunlock; + } + + if (nip == NULL) + alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + error = alloc_required; + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(ndip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } + + error = gfs2_quota_lock_check(ndip); + if (error) + goto out_alloc; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve_ri(ndip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + gfs2_rg_blocks(al) + + 4 * RES_DINODE + 4 * RES_LEAF + + RES_STATFS + RES_QUOTA + 4, 0); + if (error) + goto out_ipreserv; + } else { + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + + 5 * RES_LEAF + 4, 0); + if (error) + goto out_gunlock; + } + + /* Remove the target file, if it exists */ + + if (nip) { + struct buffer_head *bh; + error = gfs2_meta_inode_buffer(nip, &bh); + if (error) + goto out_end_trans; + error = gfs2_unlink_inode(ndip, ndentry, bh); + brelse(bh); + } + + if (dir_rename) { + error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); + if (error) + goto out_end_trans; + } else { + struct buffer_head *dibh; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + error = gfs2_dir_del(odip, odentry); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(ndir, &ndentry->d_name, ip); + if (error) + goto out_end_trans; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipreserv: + if (alloc_required) + gfs2_inplace_release(ndip); +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(ndip); +out_alloc: + if (alloc_required) + gfs2_alloc_put(ndip); +out_gunlock: + while (x--) { + gfs2_glock_dq(ghs + x); + gfs2_holder_uninit(ghs + x); + } +out_gunlock_r: + if (r_gh.gh_gl) + gfs2_glock_dq_uninit(&r_gh); +out: + gfs2_glock_dq_uninit(&ri_gh); + return error; +} + +/** + * gfs2_follow_link - Follow a symbolic link + * @dentry: The dentry of the link + * @nd: Data that we pass to vfs_follow_link() + * + * This can handle symlinks of any size. 
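+ *
+ * A short sketch of the data path (illustrative, error handling omitted):
+ * the link target lives in the dinode block itself, so after taking a
+ * shared glock the body below essentially does
+ *
+ *	buf = kzalloc(size + 1, GFP_NOFS);
+ *	memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
+ *	nd_set_link(nd, buf);
+ *
+ * and the buffer is freed again later by gfs2_put_link().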
+ * + * Returns: 0 on success or error code + */ + +static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int size; + char *buf; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + nd_set_link(nd, ERR_PTR(error)); + return NULL; + } + + size = (unsigned int)i_size_read(&ip->i_inode); + if (size == 0) { + gfs2_consist_inode(ip); + buf = ERR_PTR(-EIO); + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) { + buf = ERR_PTR(error); + goto out; + } + + buf = kzalloc(size + 1, GFP_NOFS); + if (!buf) + buf = ERR_PTR(-ENOMEM); + else + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + nd_set_link(nd, buf); + return NULL; +} + +static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +/** + * gfs2_permission - + * @inode: The inode + * @mask: The mask to be tested + * @flags: Indicates whether this is an RCU path walk or not + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. + * + * Returns: errno + */ + +int gfs2_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct gfs2_inode *ip; + struct gfs2_holder i_gh; + int error; + int unlock = 0; + + + ip = GFS2_I(inode); + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + unlock = 1; + } + + if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + error = -EACCES; + else + error = generic_permission(inode, mask, flags, gfs2_check_acl); + if (unlock) + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +{ + struct inode *inode = &ip->i_inode; + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + return 0; +} + +/** + * gfs2_setattr_simple - + * @ip: + * @attr: + * + * Returns: errno + */ + +int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +{ + int error; + + if (current->journal_info) + return __gfs2_setattr_simple(ip, attr); + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); + if (error) + return error; + + error = __gfs2_setattr_simple(ip, attr); + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + return error; +} + +static int setattr_chown(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u32 ouid, ogid, nuid, ngid; + int error; + + ouid = inode->i_uid; + ogid = inode->i_gid; + nuid = attr->ia_uid; + ngid = attr->ia_gid; + + if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) + ouid = nuid = NO_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) + ogid = ngid = NO_QUOTA_CHANGE; + + if (!gfs2_alloc_get(ip)) + return -ENOMEM; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out_alloc; + + if (ouid != 
NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + error = gfs2_quota_check(ip, nuid, ngid); + if (error) + goto out_gunlock_q; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); + if (error) + goto out_gunlock_q; + + error = gfs2_setattr_simple(ip, attr); + if (error) + goto out_end_trans; + + if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); + gfs2_quota_change(ip, -blocks, ouid, ogid); + gfs2_quota_change(ip, blocks, nuid, ngid); + } + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock_q: + gfs2_quota_unlock(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + +/** + * gfs2_setattr - Change attributes on an inode + * @dentry: The dentry which is changing + * @attr: The structure describing the change + * + * The VFS layer wants to change one or more of an inodes attributes. Write + * that change out to disk. + * + * Returns: errno + */ + +static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + + error = inode_change_ok(inode, attr); + if (error) + goto out; + + if (attr->ia_valid & ATTR_SIZE) + error = gfs2_setattr_size(inode, attr->ia_size); + else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) + error = setattr_chown(inode, attr); + else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) + error = gfs2_acl_chmod(ip, attr); + else + error = gfs2_setattr_simple(ip, attr); + +out: + gfs2_glock_dq_uninit(&i_gh); + if (!error) + mark_inode_dirty(inode); + return error; +} + +/** + * gfs2_getattr - Read out an inode's attributes + * @mnt: The vfsmount the inode is being accessed from + * @dentry: The dentry to stat + * @stat: The inode's stats + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. Note that its the NFS + * readdirplus operation which causes this to be called (from filldir) + * with the glock already held. 
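+ *
+ * In other words, the shared glock is only acquired below when
+ * gfs2_glock_is_locked_by_me() says the caller does not already hold it;
+ * on the readdirplus path generic_fillattr() simply runs under the
+ * caller's existing lock rather than trying to take the glock a second
+ * time.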
+ * + * Returns: errno + */ + +static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + int unlock = 0; + + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) + return error; + unlock = 1; + } + + generic_fillattr(inode, stat); + if (unlock) + gfs2_glock_dq_uninit(&gh); + + return 0; +} + +static int gfs2_setxattr(struct dentry *dentry, const char *name, + const void *data, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_setxattr(dentry, name, data, size, flags); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, + void *data, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_getxattr(dentry, name, data, size); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static int gfs2_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_removexattr(dentry, name); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + goto out; + + if (gfs2_is_stuffed(ip)) { + u64 phys = ip->i_no_addr << inode->i_blkbits; + u64 size = i_size_read(inode); + u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE; + phys += sizeof(struct gfs2_dinode); + phys += start; + if (start + len > size) + len = size - start; + if (start < size) + ret = fiemap_fill_next_extent(fieinfo, start, phys, + len, flags); + if (ret == 1) + ret = 0; + } else { + ret = __generic_block_fiemap(inode, fieinfo, start, len, + gfs2_block_map); + } + + gfs2_glock_dq_uninit(&gh); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +const struct inode_operations gfs2_file_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + +const struct inode_operations gfs2_dir_iops = { + .create = gfs2_create, + .lookup = gfs2_lookup, + .link = gfs2_link, + .unlink = gfs2_unlink, + .symlink = gfs2_symlink, + .mkdir = gfs2_mkdir, + .rmdir = gfs2_unlink, + .mknod = gfs2_mknod, + .rename = gfs2_rename, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + 
.listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + +const struct inode_operations gfs2_symlink_iops = { + .readlink = generic_readlink, + .follow_link = gfs2_follow_link, + .put_link = gfs2_put_link, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h new file mode 100644 index 00000000..31606076 --- /dev/null +++ b/fs/gfs2/inode.h @@ -0,0 +1,143 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __INODE_DOT_H__ +#define __INODE_DOT_H__ + +#include +#include +#include +#include "util.h" + +extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); +extern int gfs2_internal_read(struct gfs2_inode *ip, + struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size); +extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to); +extern void gfs2_set_aops(struct inode *inode); + +static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) +{ + return !ip->i_height; +} + +static inline int gfs2_is_jdata(const struct gfs2_inode *ip) +{ + return ip->i_diskflags & GFS2_DIF_JDATA; +} + +static inline int gfs2_is_writeback(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip); +} + +static inline int gfs2_is_ordered(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip); +} + +static inline int gfs2_is_dir(const struct gfs2_inode *ip) +{ + return S_ISDIR(ip->i_inode.i_mode); +} + +static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) +{ + inode->i_blocks = blocks << + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline u64 gfs2_get_inode_blocks(const struct inode *inode) +{ + return inode->i_blocks >> + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) +{ + gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change)); + change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK); + inode->i_blocks += change; +} + +static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, + u64 no_formal_ino) +{ + return ip->i_no_addr == no_addr && ip->i_no_formal_ino == no_formal_ino; +} + +static inline void gfs2_inum_out(const struct gfs2_inode *ip, + struct gfs2_dirent *dent) +{ + dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); +} + +static inline int gfs2_check_internal_file_size(struct inode *inode, + u64 minsize, u64 maxsize) +{ + u64 size = i_size_read(inode); + if (size < minsize || size > maxsize) + goto err; + if (size & ((1 << inode->i_blkbits) - 1)) + goto err; + return 0; +err: + gfs2_consist_inode(GFS2_I(inode)); + return -EIO; +} + +extern struct inode *gfs2_inode_lookup(struct super_block *sb, 
unsigned type, + u64 no_addr, u64 no_formal_ino, + int non_block); +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, + unsigned int blktype); +extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock); + +extern int gfs2_inode_refresh(struct gfs2_inode *ip); + +extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags); +extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); +extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); +extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); + +extern const struct inode_operations gfs2_file_iops; +extern const struct inode_operations gfs2_dir_iops; +extern const struct inode_operations gfs2_symlink_iops; +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; + +extern void gfs2_set_inode_flags(struct inode *inode); + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return sdp->sd_args.ar_localflocks; +} +#else /* Single node only */ +#define gfs2_file_fops gfs2_file_fops_nolock +#define gfs2_dir_fops gfs2_dir_fops_nolock + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return 1; +} +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +#endif /* __INODE_DOT_H__ */ + diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c new file mode 100644 index 00000000..98c80d8c --- /dev/null +++ b/fs/gfs2/lock_dlm.c @@ -0,0 +1,235 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include + +#include "incore.h" +#include "glock.h" +#include "util.h" + + +static void gdlm_ast(void *arg) +{ + struct gfs2_glock *gl = arg; + unsigned ret = gl->gl_state; + + BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); + + if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) + memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); + + switch (gl->gl_lksb.sb_status) { + case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ + gfs2_glock_free(gl); + return; + case -DLM_ECANCEL: /* Cancel while getting lock */ + ret |= LM_OUT_CANCELED; + goto out; + case -EAGAIN: /* Try lock fails */ + case -EDEADLK: /* Deadlock detected */ + goto out; + case -ETIMEDOUT: /* Canceled due to timeout */ + ret |= LM_OUT_ERROR; + goto out; + case 0: /* Success */ + break; + default: /* Something unexpected */ + BUG(); + } + + ret = gl->gl_req; + if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { + if (gl->gl_req == LM_ST_SHARED) + ret = LM_ST_DEFERRED; + else if (gl->gl_req == LM_ST_DEFERRED) + ret = LM_ST_SHARED; + else + BUG(); + } + + set_bit(GLF_INITIAL, &gl->gl_flags); + gfs2_glock_complete(gl, ret); + return; +out: + if (!test_bit(GLF_INITIAL, &gl->gl_flags)) + gl->gl_lksb.sb_lkid = 0; + gfs2_glock_complete(gl, ret); +} + +static void gdlm_bast(void *arg, int mode) +{ + struct gfs2_glock *gl = arg; + + switch (mode) { + case DLM_LOCK_EX: + gfs2_glock_cb(gl, LM_ST_UNLOCKED); + break; + case DLM_LOCK_CW: + gfs2_glock_cb(gl, LM_ST_DEFERRED); + break; + case DLM_LOCK_PR: + gfs2_glock_cb(gl, LM_ST_SHARED); + break; + default: + printk(KERN_ERR "unknown bast mode %d", mode); + BUG(); + } +} + +/* convert gfs lock-state to dlm lock-mode */ + +static int make_mode(const unsigned int lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + } + printk(KERN_ERR "unknown LM state %d", lmstate); + BUG(); + return -1; +} + +static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, + const int req) +{ + u32 lkf = 0; + + if (gfs_flags & LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags & LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if (gfs_flags & LM_FLAG_PRIORITY) { + lkf |= DLM_LKF_NOORDER; + lkf |= DLM_LKF_HEADQUE; + } + + if (gfs_flags & LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf |= DLM_LKF_ALTCW; + else if (req == DLM_LOCK_CW) + lkf |= DLM_LKF_ALTPR; + else + BUG(); + } + + if (lkid != 0) + lkf |= DLM_LKF_CONVERT; + + lkf |= DLM_LKF_VALBLK; + + return lkf; +} + +static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + int req; + u32 lkf; + + req = make_mode(req_state); + lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); + + /* + * Submit the actual lock request. 
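+ * The call is asynchronous: the DLM reports completion through
+ * gdlm_ast() and blocking callbacks through gdlm_bast(), both of which
+ * receive @gl as their argument, and the resource name passed in is the
+ * pre-formatted gl->gl_strname.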
+ */ + + return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); +} + +static void gdlm_put_lock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (gl->gl_lksb.sb_lkid == 0) { + gfs2_glock_free(gl); + return; + } + + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, + NULL, gl); + if (error) { + printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, error); + return; + } +} + +static void gdlm_cancel(struct gfs2_glock *gl) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); +} + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (fsname == NULL) { + fs_info(sdp, "no fsname found\n"); + return -EINVAL; + } + + error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm, + DLM_LSFL_FS | DLM_LSFL_NEWEXCL | + (ls->ls_nodir ? DLM_LSFL_NODIR : 0), + GDLM_LVB_SIZE); + if (error) + printk(KERN_ERR "dlm_new_lockspace error %d", error); + + return error; +} + +static void gdlm_unmount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (ls->ls_dlm) { + dlm_release_lockspace(ls->ls_dlm, 2); + ls->ls_dlm = NULL; + } +} + +static const match_table_t dlm_tokens = { + { Opt_jid, "jid=%d"}, + { Opt_id, "id=%d"}, + { Opt_first, "first=%d"}, + { Opt_nodir, "nodir=%d"}, + { Opt_err, NULL }, +}; + +const struct lm_lockops gfs2_dlm_ops = { + .lm_proto_name = "lock_dlm", + .lm_mount = gdlm_mount, + .lm_unmount = gdlm_unmount, + .lm_put_lock = gdlm_put_lock, + .lm_lock = gdlm_lock, + .lm_cancel = gdlm_cancel, + .lm_tokens = &dlm_tokens, +}; + diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c new file mode 100644 index 00000000..85c62923 --- /dev/null +++ b/fs/gfs2/log.c @@ -0,0 +1,972 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "util.h" +#include "dir.h" +#include "trace_gfs2.h" + +#define PULL 1 + +/** + * gfs2_struct2blk - compute stuff + * @sdp: the filesystem + * @nstruct: the number of structures + * @ssize: the size of the structures + * + * Compute the number of log descriptor blocks needed to hold a certain number + * of structures of a certain size. 
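+ *
+ * For example (assuming a 4096-byte block size), revoke entries are
+ * sizeof(u64) == 8 bytes each, so the first descriptor block holds
+ * (4096 - sizeof(struct gfs2_log_descriptor)) / 8 entries and each
+ * continuation block holds (4096 - sizeof(struct gfs2_meta_header)) / 8;
+ * whatever does not fit in the first block is spread over further blocks
+ * with DIV_ROUND_UP() below.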
+ * + * Returns: the number of blocks needed (minimum is always 1) + */ + +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize) +{ + unsigned int blks; + unsigned int first, second; + + blks = 1; + first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize; + + if (nstruct > first) { + second = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / ssize; + blks += DIV_ROUND_UP(nstruct - first, second); + } + + return blks; +} + +/** + * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters + * @mapping: The associated mapping (maybe NULL) + * @bd: The gfs2_bufdata to remove + * + * The ail lock _must_ be held when calling this function + * + */ + +void gfs2_remove_from_ail(struct gfs2_bufdata *bd) +{ + bd->bd_ail = NULL; + list_del_init(&bd->bd_ail_st_list); + list_del_init(&bd->bd_ail_gl_list); + atomic_dec(&bd->bd_gl->gl_ail_count); + brelse(bd->bd_bh); +} + +/** + * gfs2_ail1_start_one - Start I/O on a part of the AIL + * @sdp: the filesystem + * @wbc: The writeback control structure + * @ai: The ail structure + * + */ + +static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, + struct writeback_control *wbc, + struct gfs2_ail *ai) +__releases(&sdp->sd_ail_lock) +__acquires(&sdp->sd_ail_lock) +{ + struct gfs2_glock *gl = NULL; + struct address_space *mapping; + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + + list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + + gfs2_assert(sdp, bd->bd_ail == ai); + + if (!buffer_busy(bh)) { + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + continue; + } + + if (!buffer_dirty(bh)) + continue; + if (gl == bd->bd_gl) + continue; + gl = bd->bd_gl; + list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); + mapping = bh->b_page->mapping; + if (!mapping) + continue; + spin_unlock(&sdp->sd_ail_lock); + generic_writepages(mapping, wbc); + spin_lock(&sdp->sd_ail_lock); + if (wbc->nr_to_write <= 0) + break; + return 1; + } + + return 0; +} + + +/** + * gfs2_ail1_flush - start writeback of some ail1 entries + * @sdp: The super block + * @wbc: The writeback control structure + * + * Writes back some ail1 entries, according to the limits in the + * writeback control structure + */ + +void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) +{ + struct list_head *head = &sdp->sd_ail1_list; + struct gfs2_ail *ai; + + trace_gfs2_ail_flush(sdp, wbc, 1); + spin_lock(&sdp->sd_ail_lock); +restart: + list_for_each_entry_reverse(ai, head, ai_list) { + if (wbc->nr_to_write <= 0) + break; + if (gfs2_ail1_start_one(sdp, wbc, ai)) + goto restart; + } + spin_unlock(&sdp->sd_ail_lock); + trace_gfs2_ail_flush(sdp, wbc, 0); +} + +/** + * gfs2_ail1_start - start writeback of all ail1 entries + * @sdp: The superblock + */ + +static void gfs2_ail1_start(struct gfs2_sbd *sdp) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + return gfs2_ail1_flush(sdp, &wbc); +} + +/** + * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + + list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + gfs2_assert(sdp, bd->bd_ail == ai); + 
if (buffer_busy(bh)) + continue; + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + } + +} + +/** + * gfs2_ail1_empty - Try to empty the ail1 lists + * @sdp: The superblock + * + * Tries to empty the ail1 lists, starting with the oldest first + */ + +static int gfs2_ail1_empty(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai, *s; + int ret; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { + gfs2_ail1_empty_one(sdp, ai); + if (list_empty(&ai->ai_ail1_list)) + list_move(&ai->ai_list, &sdp->sd_ail2_list); + else + break; + } + ret = list_empty(&sdp->sd_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + + return ret; +} + +static void gfs2_ail1_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) { + list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + if (!buffer_locked(bh)) + continue; + get_bh(bh); + spin_unlock(&sdp->sd_ail_lock); + wait_on_buffer(bh); + brelse(bh); + return; + } + } + spin_unlock(&sdp->sd_ail_lock); +} + +/** + * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &ai->ai_ail2_list; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->prev, struct gfs2_bufdata, + bd_ail_st_list); + gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_remove_from_ail(bd); + } +} + +static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) +{ + struct gfs2_ail *ai, *safe; + unsigned int old_tail = sdp->sd_log_tail; + int wrap = (new_tail < old_tail); + int a, b, rm; + + spin_lock(&sdp->sd_ail_lock); + + list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { + a = (old_tail <= ai->ai_first); + b = (ai->ai_first < new_tail); + rm = (wrap) ? (a || b) : (a && b); + if (!rm) + continue; + + gfs2_ail2_empty_one(sdp, ai); + list_del(&ai->ai_list); + gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list)); + kfree(ai); + } + + spin_unlock(&sdp->sd_ail_lock); +} + +/** + * gfs2_log_reserve - Make a log reservation + * @sdp: The GFS2 superblock + * @blks: The number of blocks to reserve + * + * Note that we never give out the last few blocks of the journal. Thats + * due to the fact that there is a small number of header blocks + * associated with each log flush. The exact number can't be known until + * flush time, so we ensure that we have just enough free blocks at all + * times to avoid running out during a log flush. + * + * We no longer flush the log here, instead we wake up logd to do that + * for us. To avoid the thundering herd and to ensure that we deal fairly + * with queued waiters, we use an exclusive wait. This means that when we + * get woken with enough journal space to get our reservation, we need to + * wake the next waiter on the list. 
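+ *
+ * As a concrete illustration (assuming 4096-byte blocks), reserved_blks
+ * below works out to 6, so a request for N blocks only proceeds once
+ * strictly more than N + 6 journal blocks are free; until then the
+ * caller waits exclusively on sd_log_waitq and wakes the logd thread
+ * through sd_logd_waitq to flush the log.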
+ * + * Returns: errno + */ + +int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) +{ + unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned wanted = blks + reserved_blks; + DEFINE_WAIT(wait); + int did_wait = 0; + unsigned int free_blocks; + + if (gfs2_assert_warn(sdp, blks) || + gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) + return -EINVAL; +retry: + free_blocks = atomic_read(&sdp->sd_log_blks_free); + if (unlikely(free_blocks <= wanted)) { + do { + prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sdp->sd_logd_waitq); + did_wait = 1; + if (atomic_read(&sdp->sd_log_blks_free) <= wanted) + io_schedule(); + free_blocks = atomic_read(&sdp->sd_log_blks_free); + } while(free_blocks <= wanted); + finish_wait(&sdp->sd_log_waitq, &wait); + } + if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, + free_blocks - blks) != free_blocks) + goto retry; + trace_gfs2_log_blocks(sdp, -blks); + + /* + * If we waited, then so might others, wake them up _after_ we get + * our share of the log. + */ + if (unlikely(did_wait)) + wake_up(&sdp->sd_log_waitq); + + down_read(&sdp->sd_log_flush_lock); + + return 0; +} + +static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) +{ + struct gfs2_journal_extent *je; + + list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { + if (lbn >= je->lblock && lbn < je->lblock + je->blocks) + return je->dblock + lbn - je->lblock; + } + + return -1; +} + +/** + * log_distance - Compute distance between two journal blocks + * @sdp: The GFS2 superblock + * @newer: The most recent journal block of the pair + * @older: The older journal block of the pair + * + * Compute the distance (in the journal direction) between two + * blocks in the journal + * + * Returns: the distance in blocks + */ + +static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer, + unsigned int older) +{ + int dist; + + dist = newer - older; + if (dist < 0) + dist += sdp->sd_jdesc->jd_blocks; + + return dist; +} + +/** + * calc_reserved - Calculate the number of blocks to reserve when + * refunding a transaction's unused buffers. + * @sdp: The GFS2 superblock + * + * This is complex. We need to reserve room for all our currently used + * metadata buffers (e.g. normal file I/O rewriting file time stamps) and + * all our journaled data buffers for journaled files (e.g. files in the + * meta_fs like rindex, or files for which chattr +j was done.) + * If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush + * will count it as free space (sd_log_blks_free) and corruption will follow. + * + * We can have metadata bufs and jdata bufs in the same journal. So each + * type gets its own log header, for which we need to reserve a block. + * In fact, each type has the potential for needing more than one header + * in cases where we have more buffers than will fit on a journal page. + * Metadata journal entries take up half the space of journaled buffer entries. + * Thus, metadata entries have buf_limit (502) and journaled buffers have + * databuf_limit (251) before they cause a wrap around. + * + * Also, we need to reserve blocks for revoke journal entries and one for an + * overall header for the lot. 
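+ * For example, using the limits quoted above, committing 600 metadata
+ * buffers needs DIV_ROUND_UP(600, 502) = 2 descriptor headers and 100
+ * journaled data buffers need DIV_ROUND_UP(100, 251) = 1, giving
+ * 600 + 2 + 100 + 1 plus any revoke blocks plus one overall header.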
+ * + * Returns: the number of blocks reserved + */ +static unsigned int calc_reserved(struct gfs2_sbd *sdp) +{ + unsigned int reserved = 0; + unsigned int mbuf_limit, metabufhdrs_needed; + unsigned int dbuf_limit, databufhdrs_needed; + unsigned int revokes = 0; + + mbuf_limit = buf_limit(sdp); + metabufhdrs_needed = (sdp->sd_log_commited_buf + + (mbuf_limit - 1)) / mbuf_limit; + dbuf_limit = databuf_limit(sdp); + databufhdrs_needed = (sdp->sd_log_commited_databuf + + (dbuf_limit - 1)) / dbuf_limit; + + if (sdp->sd_log_commited_revoke > 0) + revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, + sizeof(u64)); + + reserved = sdp->sd_log_commited_buf + metabufhdrs_needed + + sdp->sd_log_commited_databuf + databufhdrs_needed + + revokes; + /* One for the overall header */ + if (reserved) + reserved++; + return reserved; +} + +static unsigned int current_tail(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai; + unsigned int tail; + + spin_lock(&sdp->sd_ail_lock); + + if (list_empty(&sdp->sd_ail1_list)) { + tail = sdp->sd_log_head; + } else { + ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list); + tail = ai->ai_first; + } + + spin_unlock(&sdp->sd_ail_lock); + + return tail; +} + +void gfs2_log_incr_head(struct gfs2_sbd *sdp) +{ + if (sdp->sd_log_flush_head == sdp->sd_log_tail) + BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head); + + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + sdp->sd_log_flush_head = 0; + sdp->sd_log_flush_wrapped = 1; + } +} + +/** + * gfs2_log_write_endio - End of I/O for a log buffer + * @bh: The buffer head + * @uptodate: I/O Status + * + */ + +static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate) +{ + struct gfs2_sbd *sdp = bh->b_private; + bh->b_private = NULL; + + end_buffer_write_sync(bh, uptodate); + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_get_buf - Get and initialize a buffer to use for log control data + * @sdp: The GFS2 superblock + * + * Returns: the buffer_head + */ + +struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) +{ + u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + + bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + gfs2_log_incr_head(sdp); + atomic_inc(&sdp->sd_log_in_flight); + bh->b_private = sdp; + bh->b_end_io = gfs2_log_write_endio; + + return bh; +} + +/** + * gfs2_fake_write_endio - + * @bh: The buffer head + * @uptodate: The I/O Status + * + */ + +static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate) +{ + struct buffer_head *real_bh = bh->b_private; + struct gfs2_bufdata *bd = real_bh->b_private; + struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd; + + end_buffer_write_sync(bh, uptodate); + free_buffer_head(bh); + unlock_buffer(real_bh); + brelse(real_bh); + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log + * @sdp: the filesystem + * @data: the data the buffer_head should point to + * + * Returns: the log buffer descriptor + */ + +struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, + struct buffer_head *real) +{ + u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + + bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); + atomic_set(&bh->b_count, 1); + bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock); + 
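+	/* The fake buffer head shares page data with the real buffer but is
+	 * mapped to the journal block being written, so the metadata can be
+	 * copied into the log without redirtying the real buffer; the real
+	 * buffer stays locked until gfs2_fake_write_endio() completes. */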
set_bh_page(bh, real->b_page, bh_offset(real)); + bh->b_blocknr = blkno; + bh->b_size = sdp->sd_sb.sb_bsize; + bh->b_bdev = sdp->sd_vfs->s_bdev; + bh->b_private = real; + bh->b_end_io = gfs2_fake_write_endio; + + gfs2_log_incr_head(sdp); + atomic_inc(&sdp->sd_log_in_flight); + + return bh; +} + +static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) +{ + unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); + + ail2_empty(sdp, new_tail); + + atomic_add(dist, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, dist); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + + sdp->sd_log_tail = new_tail; +} + +/** + * log_write_header - Get and initialize a journal header buffer + * @sdp: The GFS2 superblock + * + * Returns: the initialized log buffer descriptor + */ + +static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) +{ + u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + struct gfs2_log_header *lh; + unsigned int tail; + u32 hash; + + bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + + gfs2_ail1_empty(sdp); + tail = current_tail(sdp); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); + lh->lh_flags = cpu_to_be32(flags); + lh->lh_tail = cpu_to_be32(tail); + lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); + hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + submit_bh(WRITE_SYNC | REQ_META, bh); + else + submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + if (sdp->sd_log_tail != tail) + log_pull_tail(sdp, tail); + else + gfs2_assert_withdraw(sdp, !pull); + + sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); +} + +static void log_flush_commit(struct gfs2_sbd *sdp) +{ + DEFINE_WAIT(wait); + + if (atomic_read(&sdp->sd_log_in_flight)) { + do { + prepare_to_wait(&sdp->sd_log_flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&sdp->sd_log_in_flight)) + io_schedule(); + } while(atomic_read(&sdp->sd_log_in_flight)); + finish_wait(&sdp->sd_log_flush_wait, &wait); + } + + log_write_header(sdp, 0, 0); +} + +static void gfs2_ordered_write(struct gfs2_sbd *sdp) +{ + struct gfs2_bufdata *bd; + struct buffer_head *bh; + LIST_HEAD(written); + + gfs2_log_lock(sdp); + while (!list_empty(&sdp->sd_log_le_ordered)) { + bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list); + list_move(&bd->bd_le.le_list, &written); + bh = bd->bd_bh; + if (!buffer_dirty(bh)) + continue; + get_bh(bh); + gfs2_log_unlock(sdp); + lock_buffer(bh); + if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE_SYNC, bh); + } else { + unlock_buffer(bh); + brelse(bh); + } + gfs2_log_lock(sdp); + } + list_splice(&written, &sdp->sd_log_le_ordered); + gfs2_log_unlock(sdp); +} + +static void 
gfs2_ordered_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + gfs2_log_lock(sdp); + while (!list_empty(&sdp->sd_log_le_ordered)) { + bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list); + bh = bd->bd_bh; + if (buffer_locked(bh)) { + get_bh(bh); + gfs2_log_unlock(sdp); + wait_on_buffer(bh); + brelse(bh); + gfs2_log_lock(sdp); + continue; + } + list_del_init(&bd->bd_le.le_list); + } + gfs2_log_unlock(sdp); +} + +/** + * gfs2_log_flush - flush incore transaction(s) + * @sdp: the filesystem + * @gl: The glock structure to flush. If NULL, flush the whole incore log + * + */ + +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +{ + struct gfs2_ail *ai; + + down_write(&sdp->sd_log_flush_lock); + + /* Log might have been flushed while we waited for the flush lock */ + if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) { + up_write(&sdp->sd_log_flush_lock); + return; + } + trace_gfs2_log_flush(sdp, 1); + + ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&ai->ai_ail1_list); + INIT_LIST_HEAD(&ai->ai_ail2_list); + + if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) { + printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf, + sdp->sd_log_commited_buf); + gfs2_assert_withdraw(sdp, 0); + } + if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) { + printk(KERN_INFO "GFS2: log databuf %u %u\n", + sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf); + gfs2_assert_withdraw(sdp, 0); + } + gfs2_assert_withdraw(sdp, + sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + ai->ai_first = sdp->sd_log_flush_head; + + gfs2_ordered_write(sdp); + lops_before_commit(sdp); + gfs2_ordered_wait(sdp); + + if (sdp->sd_log_head != sdp->sd_log_flush_head) + log_flush_commit(sdp); + else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ + gfs2_log_lock(sdp); + atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ + trace_gfs2_log_blocks(sdp, -1); + gfs2_log_unlock(sdp); + log_write_header(sdp, 0, PULL); + } + lops_after_commit(sdp, ai); + + gfs2_log_lock(sdp); + sdp->sd_log_head = sdp->sd_log_flush_head; + sdp->sd_log_blks_reserved = 0; + sdp->sd_log_commited_buf = 0; + sdp->sd_log_commited_databuf = 0; + sdp->sd_log_commited_revoke = 0; + + spin_lock(&sdp->sd_ail_lock); + if (!list_empty(&ai->ai_ail1_list)) { + list_add(&ai->ai_list, &sdp->sd_ail1_list); + ai = NULL; + } + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + trace_gfs2_log_flush(sdp, 0); + up_write(&sdp->sd_log_flush_lock); + + kfree(ai); +} + +static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int reserved; + unsigned int unused; + + gfs2_log_lock(sdp); + + sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; + sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - + tr->tr_num_databuf_rm; + gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || + (((int)sdp->sd_log_commited_databuf) >= 0)); + sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; + reserved = calc_reserved(sdp); + gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); + unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; + atomic_add(unused, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, unused); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + 
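+	/* Only the blocks calc_reserved() says are still needed remain
+	 * reserved; the surplus was returned to sd_log_blks_free above. */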
sdp->sd_log_blks_reserved = reserved; + + gfs2_log_unlock(sdp); +} + +static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head = &tr->tr_list_buf; + struct gfs2_bufdata *bd; + + gfs2_log_lock(sdp); + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); + list_del_init(&bd->bd_list_tr); + tr->tr_num_buf--; + } + gfs2_log_unlock(sdp); + gfs2_assert_warn(sdp, !tr->tr_num_buf); +} + +/** + * gfs2_log_commit - Commit a transaction to the log + * @sdp: the filesystem + * @tr: the transaction + * + * We wake up gfs2_logd if the number of pinned blocks exceed thresh1 + * or the total number of used blocks (pinned blocks plus AIL blocks) + * is greater than thresh2. + * + * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of + * journal size. + * + * Returns: errno + */ + +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + log_refund(sdp, tr); + buf_lo_incore_commit(sdp, tr); + + up_read(&sdp->sd_log_flush_lock); + + if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || + ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > + atomic_read(&sdp->sd_log_thresh2))) + wake_up(&sdp->sd_logd_waitq); +} + +/** + * gfs2_log_shutdown - write a shutdown header into a journal + * @sdp: the filesystem + * + */ + +void gfs2_log_shutdown(struct gfs2_sbd *sdp) +{ + down_write(&sdp->sd_log_flush_lock); + + gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); + gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, + (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL); + + gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); + gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); + gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); + + sdp->sd_log_head = sdp->sd_log_flush_head; + sdp->sd_log_tail = sdp->sd_log_head; + + up_write(&sdp->sd_log_flush_lock); +} + + +/** + * gfs2_meta_syncfs - sync all the buffers in a filesystem + * @sdp: the filesystem + * + */ + +void gfs2_meta_syncfs(struct gfs2_sbd *sdp) +{ + gfs2_log_flush(sdp, NULL); + for (;;) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp)) + break; + } + gfs2_log_flush(sdp, NULL); +} + +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) +{ + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); +} + +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + return used_blocks >= atomic_read(&sdp->sd_log_thresh2); +} + +/** + * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks + * @sdp: Pointer to GFS2 superblock + * + * Also, periodically check to make sure that we're using the most recent + * journal index. 
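+ * The two triggers used below are gfs2_jrnl_flush_reqd(), which fires once
+ * the number of pinned blocks reaches sd_log_thresh1, and
+ * gfs2_ail_flush_reqd(), which fires once the journal blocks in use reach
+ * sd_log_thresh2 (1/3 and 2/3 of the journal size at mount time).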
+ */ + +int gfs2_logd(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t = 1; + DEFINE_WAIT(wait); + unsigned preflush; + + while (!kthread_should_stop()) { + + preflush = atomic_read(&sdp->sd_log_pinned); + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { + gfs2_ail1_empty(sdp); + gfs2_log_flush(sdp, NULL); + } + + if (gfs2_ail_flush_reqd(sdp)) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + gfs2_ail1_empty(sdp); + gfs2_log_flush(sdp, NULL); + } + + if (!gfs2_ail_flush_reqd(sdp)) + wake_up(&sdp->sd_log_waitq); + + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; + if (freezing(current)) + refrigerator(); + + do { + prepare_to_wait(&sdp->sd_logd_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()) + t = schedule_timeout(t); + } while(t && !gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()); + finish_wait(&sdp->sd_logd_waitq, &wait); + } + + return 0; +} + diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h new file mode 100644 index 00000000..ab062169 --- /dev/null +++ b/fs/gfs2/log.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __LOG_DOT_H__ +#define __LOG_DOT_H__ + +#include +#include +#include +#include "incore.h" + +/** + * gfs2_log_lock - acquire the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_lock(struct gfs2_sbd *sdp) +__acquires(&sdp->sd_log_lock) +{ + spin_lock(&sdp->sd_log_lock); +} + +/** + * gfs2_log_unlock - release the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) +__releases(&sdp->sd_log_lock) +{ + spin_unlock(&sdp->sd_log_lock); +} + +static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, + unsigned int value) +{ + if (++value == sdp->sd_jdesc->jd_blocks) { + value = 0; + } + sdp->sd_log_head = sdp->sd_log_tail = value; +} + +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize); + +extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); + +extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); +extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, + struct buffer_head *real); +extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); + +extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); +extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); +extern int gfs2_logd(void *data); + +#endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c new file mode 100644 index 00000000..05bbb124 --- /dev/null +++ b/fs/gfs2/lops.c @@ -0,0 +1,795 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "inode.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +/** + * gfs2_pin - Pin a buffer in memory + * @sdp: The superblock + * @bh: The buffer to be pinned + * + * The log lock must be held when calling this function + */ +static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + BUG_ON(!current->journal_info); + + clear_buffer_dirty(bh); + if (test_set_buffer_pinned(bh)) + gfs2_assert_withdraw(sdp, 0); + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + bd = bh->b_private; + /* If this buffer is in the AIL and it has already been written + * to in-place disk block, remove it from the AIL. + */ + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_ail) + list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + spin_unlock(&sdp->sd_ail_lock); + get_bh(bh); + atomic_inc(&sdp->sd_log_pinned); + trace_gfs2_pin(bd, 1); +} + +/** + * gfs2_unpin - Unpin a buffer + * @sdp: the filesystem the buffer belongs to + * @bh: The buffer to unpin + * @ai: + * @flags: The inode dirty flags + * + */ + +static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct gfs2_ail *ai) +{ + struct gfs2_bufdata *bd = bh->b_private; + + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!buffer_pinned(bh)); + + lock_buffer(bh); + mark_buffer_dirty(bh); + clear_buffer_pinned(bh); + + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_ail) { + list_del(&bd->bd_ail_st_list); + brelse(bh); + } else { + struct gfs2_glock *gl = bd->bd_gl; + list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); + atomic_inc(&gl->gl_ail_count); + } + bd->bd_ail = ai; + list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + + clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + trace_gfs2_pin(bd, 0); + unlock_buffer(bh); + atomic_dec(&sdp->sd_log_pinned); +} + + +static inline struct gfs2_log_descriptor *bh_log_desc(struct buffer_head *bh) +{ + return (struct gfs2_log_descriptor *)bh->b_data; +} + +static inline __be64 *bh_log_ptr(struct buffer_head *bh) +{ + struct gfs2_log_descriptor *ld = bh_log_desc(bh); + return (__force __be64 *)(ld + 1); +} + +static inline __be64 *bh_ptr_end(struct buffer_head *bh) +{ + return (__force __be64 *)(bh->b_data + bh->b_size); +} + + +static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type) +{ + struct buffer_head *bh = gfs2_log_get_buf(sdp); + struct gfs2_log_descriptor *ld = bh_log_desc(bh); + ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = cpu_to_be32(ld_type); + ld->ld_length = 0; + ld->ld_data1 = 0; + ld->ld_data2 = 0; + memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); + return bh; +} + +static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_meta_header *mh; + struct gfs2_trans *tr; + + lock_buffer(bd->bd_bh); + gfs2_log_lock(sdp); + if (!list_empty(&bd->bd_list_tr)) + goto out; + tr = current->journal_info; 
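+	/* Account for the buffer in the running transaction first; if it is
+	 * not yet on the log's metadata list, pin it, stamp the journal id
+	 * into its header and queue it on sd_log_le_buf for the next commit. */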
+ tr->tr_touched = 1; + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + if (!list_empty(&le->le_list)) + goto out; + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + gfs2_meta_check(sdp, bd->bd_bh); + gfs2_pin(sdp, bd->bd_bh); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + sdp->sd_log_num_buf++; + list_add(&le->le_list, &sdp->sd_log_le_buf); + tr->tr_num_buf_new++; +out: + gfs2_log_unlock(sdp); + unlock_buffer(bd->bd_bh); +} + +static void buf_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct buffer_head *bh; + struct gfs2_log_descriptor *ld; + struct gfs2_bufdata *bd1 = NULL, *bd2; + unsigned int total; + unsigned int limit; + unsigned int num; + unsigned n; + __be64 *ptr; + + limit = buf_limit(sdp); + /* for 4k blocks, limit = 503 */ + + gfs2_log_lock(sdp); + total = sdp->sd_log_num_buf; + bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); + while(total) { + num = total; + if (total > limit) + num = limit; + gfs2_log_unlock(sdp); + bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA); + gfs2_log_lock(sdp); + ld = bh_log_desc(bh); + ptr = bh_log_ptr(bh); + ld->ld_length = cpu_to_be32(num + 1); + ld->ld_data1 = cpu_to_be32(num); + + n = 0; + list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf, + bd_le.le_list) { + *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + if (++n >= num) + break; + } + + gfs2_log_unlock(sdp); + submit_bh(WRITE_SYNC, bh); + gfs2_log_lock(sdp); + + n = 0; + list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf, + bd_le.le_list) { + get_bh(bd2->bd_bh); + gfs2_log_unlock(sdp); + lock_buffer(bd2->bd_bh); + bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); + submit_bh(WRITE_SYNC, bh); + gfs2_log_lock(sdp); + if (++n >= num) + break; + } + + BUG_ON(total < num); + total -= num; + } + gfs2_log_unlock(sdp); +} + +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_buf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + sdp->sd_log_num_buf--; + + gfs2_unpin(sdp, bd->bd_bh, ai); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); +} + +static void buf_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (pass != 0) + return; + + sdp->sd_found_blocks = 0; + sdp->sd_replayed_blocks = 0; +} + +static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + + sdp->sd_found_blocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + if (gfs2_meta_check(sdp, bh_ip)) + error = -EIO; + else + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + 
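+		/* The journal copy has been transferred into the in-place
+		 * buffer; release that buffer too and, unless the metadata
+		 * check failed, count the block as replayed. */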
brelse(bh_ip); + + if (error) + break; + + sdp->sd_replayed_blocks++; + } + + return error; +} + +static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl); + return; + } + if (pass != 1) + return; + + gfs2_meta_sync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", + jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); +} + +static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_trans *tr; + + tr = current->journal_info; + tr->tr_touched = 1; + tr->tr_num_revoke++; + sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&le->le_list, &sdp->sd_log_le_revoke); +} + +static void revoke_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct gfs2_log_descriptor *ld; + struct gfs2_meta_header *mh; + struct buffer_head *bh; + unsigned int offset; + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_bufdata *bd; + + if (!sdp->sd_log_num_revoke) + return; + + bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE); + ld = bh_log_desc(bh); + ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, + sizeof(u64))); + ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); + offset = sizeof(struct gfs2_log_descriptor); + + list_for_each_entry(bd, head, bd_le.le_list) { + sdp->sd_log_num_revoke--; + + if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { + submit_bh(WRITE_SYNC, bh); + + bh = gfs2_log_get_buf(sdp); + mh = (struct gfs2_meta_header *)bh->b_data; + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); + mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB); + mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB); + offset = sizeof(struct gfs2_meta_header); + } + + *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno); + offset += sizeof(u64); + } + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + + submit_bh(WRITE_SYNC, bh); +} + +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_bufdata *bd; + struct gfs2_glock *gl; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + gl = bd->bd_gl; + atomic_dec(&gl->gl_revokes); + clear_bit(GLF_LFLUSH, &gl->gl_flags); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } +} + +static void revoke_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (pass != 0) + return; + + sdp->sd_found_revokes = 0; + sdp->sd_replay_tail = head->lh_tail; +} + +static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int blks = be32_to_cpu(ld->ld_length); + unsigned int revokes = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh; + unsigned int offset; + u64 blkno; + int first = 1; + int error; + + if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE) + return 0; + + offset = sizeof(struct gfs2_log_descriptor); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + + if (!first) + gfs2_metatype_check(sdp, 
bh, GFS2_METATYPE_LB); + + while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { + blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); + + error = gfs2_revoke_add(sdp, blkno, start); + if (error < 0) { + brelse(bh); + return error; + } + else if (error) + sdp->sd_found_revokes++; + + if (!--revokes) + break; + offset += sizeof(u64); + } + + brelse(bh); + offset = sizeof(struct gfs2_meta_header); + first = 0; + } + + return 0; +} + +static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_revoke_clean(sdp); + return; + } + if (pass != 1) + return; + + fs_info(sdp, "jid=%u: Found %u revoke tags\n", + jd->jd_jid, sdp->sd_found_revokes); + + gfs2_revoke_clean(sdp); +} + +static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_trans *tr = current->journal_info; + + tr->tr_touched = 1; + + rgd = container_of(le, struct gfs2_rgrpd, rd_le); + + gfs2_log_lock(sdp); + if (!list_empty(&le->le_list)){ + gfs2_log_unlock(sdp); + return; + } + gfs2_rgrp_bh_hold(rgd); + sdp->sd_log_num_rg++; + list_add(&le->le_list, &sdp->sd_log_le_rg); + gfs2_log_unlock(sdp); +} + +static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_rg; + struct gfs2_rgrpd *rgd; + + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); + list_del_init(&rgd->rd_le.le_list); + sdp->sd_log_num_rg--; + + gfs2_rgrp_repolish_clones(rgd); + gfs2_rgrp_bh_put(rgd); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); +} + +/** + * databuf_lo_add - Add a databuf to the transaction. + * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. 
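+ * This is reflected in databuf_limit() in lops.h, where each journaled
+ * data block consumes two __be64 tag slots (block number plus escape
+ * flag), so roughly half as many entries fit per log descriptor as for
+ * ordinary metadata blocks.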
+ */ +static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_trans *tr = current->journal_info; + struct address_space *mapping = bd->bd_bh->b_page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + + lock_buffer(bd->bd_bh); + gfs2_log_lock(sdp); + if (tr) { + if (!list_empty(&bd->bd_list_tr)) + goto out; + tr->tr_touched = 1; + if (gfs2_is_jdata(ip)) { + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + } + } + if (!list_empty(&le->le_list)) + goto out; + + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + if (gfs2_is_jdata(ip)) { + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_databuf_new++; + sdp->sd_log_num_databuf++; + list_add_tail(&le->le_list, &sdp->sd_log_le_databuf); + } else { + list_add_tail(&le->le_list, &sdp->sd_log_le_ordered); + } +out: + gfs2_log_unlock(sdp); + unlock_buffer(bd->bd_bh); +} + +static void gfs2_check_magic(struct buffer_head *bh) +{ + void *kaddr; + __be32 *ptr; + + clear_buffer_escaped(bh); + kaddr = kmap_atomic(bh->b_page, KM_USER0); + ptr = kaddr + bh_offset(bh); + if (*ptr == cpu_to_be32(GFS2_MAGIC)) + set_buffer_escaped(bh); + kunmap_atomic(kaddr, KM_USER0); +} + +static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct list_head *list, struct list_head *done, + unsigned int n) +{ + struct buffer_head *bh1; + struct gfs2_log_descriptor *ld; + struct gfs2_bufdata *bd; + __be64 *ptr; + + if (!bh) + return; + + ld = bh_log_desc(bh); + ld->ld_length = cpu_to_be32(n + 1); + ld->ld_data1 = cpu_to_be32(n); + + ptr = bh_log_ptr(bh); + + get_bh(bh); + submit_bh(WRITE_SYNC, bh); + gfs2_log_lock(sdp); + while(!list_empty(list)) { + bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); + list_move_tail(&bd->bd_le.le_list, done); + get_bh(bd->bd_bh); + while (be64_to_cpu(*ptr) != bd->bd_bh->b_blocknr) { + gfs2_log_incr_head(sdp); + ptr += 2; + } + gfs2_log_unlock(sdp); + lock_buffer(bd->bd_bh); + if (buffer_escaped(bd->bd_bh)) { + void *kaddr; + bh1 = gfs2_log_get_buf(sdp); + kaddr = kmap_atomic(bd->bd_bh->b_page, KM_USER0); + memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh), + bh1->b_size); + kunmap_atomic(kaddr, KM_USER0); + *(__be32 *)bh1->b_data = 0; + clear_buffer_escaped(bd->bd_bh); + unlock_buffer(bd->bd_bh); + brelse(bd->bd_bh); + } else { + bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); + } + submit_bh(WRITE_SYNC, bh1); + gfs2_log_lock(sdp); + ptr += 2; + } + gfs2_log_unlock(sdp); + brelse(bh); +} + +/** + * databuf_lo_before_commit - Scan the data buffers, writing as we go + * + */ + +static void databuf_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct gfs2_bufdata *bd = NULL; + struct buffer_head *bh = NULL; + unsigned int n = 0; + __be64 *ptr = NULL, *end = NULL; + LIST_HEAD(processed); + LIST_HEAD(in_progress); + + gfs2_log_lock(sdp); + while (!list_empty(&sdp->sd_log_le_databuf)) { + if (ptr == end) { + gfs2_log_unlock(sdp); + gfs2_write_blocks(sdp, bh, &in_progress, &processed, n); + n = 0; + bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_JDATA); + ptr = bh_log_ptr(bh); + end = bh_ptr_end(bh) - 1; + gfs2_log_lock(sdp); + continue; + } + bd = list_entry(sdp->sd_log_le_databuf.next, struct gfs2_bufdata, bd_le.le_list); + list_move_tail(&bd->bd_le.le_list, &in_progress); + gfs2_check_magic(bd->bd_bh); + *ptr++ = cpu_to_be64(bd->bd_bh->b_blocknr); + *ptr++ = cpu_to_be64(buffer_escaped(bh) ? 
1 : 0); + n++; + } + gfs2_log_unlock(sdp); + gfs2_write_blocks(sdp, bh, &in_progress, &processed, n); + gfs2_log_lock(sdp); + list_splice(&processed, &sdp->sd_log_le_databuf); + gfs2_log_unlock(sdp); +} + +static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + u64 esc; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + esc = be64_to_cpu(*ptr++); + + sdp->sd_found_blocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + /* Unescape */ + if (esc) { + __be32 *eptr = (__be32 *)bh_ip->b_data; + *eptr = cpu_to_be32(GFS2_MAGIC); + } + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + if (error) + break; + + sdp->sd_replayed_blocks++; + } + + return error; +} + +/* FIXME: sort out accounting for log blocks etc. */ + +static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl); + return; + } + if (pass != 1) + return; + + /* data sync? */ + gfs2_meta_sync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", + jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); +} + +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_databuf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + sdp->sd_log_num_databuf--; + gfs2_unpin(sdp, bd->bd_bh, ai); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); +} + + +const struct gfs2_log_operations gfs2_buf_lops = { + .lo_add = buf_lo_add, + .lo_before_commit = buf_lo_before_commit, + .lo_after_commit = buf_lo_after_commit, + .lo_before_scan = buf_lo_before_scan, + .lo_scan_elements = buf_lo_scan_elements, + .lo_after_scan = buf_lo_after_scan, + .lo_name = "buf", +}; + +const struct gfs2_log_operations gfs2_revoke_lops = { + .lo_add = revoke_lo_add, + .lo_before_commit = revoke_lo_before_commit, + .lo_after_commit = revoke_lo_after_commit, + .lo_before_scan = revoke_lo_before_scan, + .lo_scan_elements = revoke_lo_scan_elements, + .lo_after_scan = revoke_lo_after_scan, + .lo_name = "revoke", +}; + +const struct gfs2_log_operations gfs2_rg_lops = { + .lo_add = rg_lo_add, + .lo_after_commit = rg_lo_after_commit, + .lo_name = "rg", +}; + +const struct gfs2_log_operations gfs2_databuf_lops = { + .lo_add = databuf_lo_add, + .lo_before_commit = databuf_lo_before_commit, + .lo_after_commit = databuf_lo_after_commit, + .lo_scan_elements = databuf_lo_scan_elements, + .lo_after_scan = databuf_lo_after_scan, + .lo_name = "databuf", +}; + +const struct gfs2_log_operations *gfs2_log_ops[] = { + &gfs2_databuf_lops, + &gfs2_buf_lops, + &gfs2_rg_lops, + &gfs2_revoke_lops, + NULL, +}; + diff --git a/fs/gfs2/lops.h 
b/fs/gfs2/lops.h new file mode 100644 index 00000000..3c0b2737 --- /dev/null +++ b/fs/gfs2/lops.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __LOPS_DOT_H__ +#define __LOPS_DOT_H__ + +#include +#include "incore.h" + +#define BUF_OFFSET \ + ((sizeof(struct gfs2_log_descriptor) + sizeof(__be64) - 1) & \ + ~(sizeof(__be64) - 1)) +#define DATABUF_OFFSET \ + ((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \ + ~(2 * sizeof(__be64) - 1)) + +extern const struct gfs2_log_operations gfs2_glock_lops; +extern const struct gfs2_log_operations gfs2_buf_lops; +extern const struct gfs2_log_operations gfs2_revoke_lops; +extern const struct gfs2_log_operations gfs2_rg_lops; +extern const struct gfs2_log_operations gfs2_databuf_lops; + +extern const struct gfs2_log_operations *gfs2_log_ops[]; + +static inline unsigned int buf_limit(struct gfs2_sbd *sdp) +{ + unsigned int limit; + + limit = (sdp->sd_sb.sb_bsize - BUF_OFFSET) / sizeof(__be64); + return limit; +} + +static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) +{ + unsigned int limit; + + limit = (sdp->sd_sb.sb_bsize - DATABUF_OFFSET) / (2 * sizeof(__be64)); + return limit; +} + +static inline void lops_init_le(struct gfs2_log_element *le, + const struct gfs2_log_operations *lops) +{ + INIT_LIST_HEAD(&le->le_list); + le->le_ops = lops; +} + +static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + if (le->le_ops->lo_add) + le->le_ops->lo_add(sdp, le); +} + +static inline void lops_before_commit(struct gfs2_sbd *sdp) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_commit) + gfs2_log_ops[x]->lo_before_commit(sdp); +} + +static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_after_commit) + gfs2_log_ops[x]->lo_after_commit(sdp, ai); +} + +static inline void lops_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_before_scan(jd, head, pass); +} + +static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, + unsigned int pass) +{ + int x, error; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_scan_elements) { + error = gfs2_log_ops[x]->lo_scan_elements(jd, start, + ld, ptr, pass); + if (error) + return error; + } + + return 0; +} + +static inline void lops_after_scan(struct gfs2_jdesc *jd, int error, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_after_scan(jd, error, pass); +} + +#endif /* __LOPS_DOT_H__ */ + diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c new file mode 100644 index 00000000..c2b34cd2 --- /dev/null +++ b/fs/gfs2/main.c @@ -0,0 +1,215 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "glock.h" +#include "quota.h" +#include "recovery.h" +#include "dir.h" + +static struct shrinker qd_shrinker = { + .shrink = gfs2_shrink_qd_memory, + .seeks = DEFAULT_SEEKS, +}; + +static void gfs2_init_inode_once(void *foo) +{ + struct gfs2_inode *ip = foo; + + inode_init_once(&ip->i_inode); + init_rwsem(&ip->i_rw_mutex); + INIT_LIST_HEAD(&ip->i_trunc_list); + ip->i_alloc = NULL; +} + +static void gfs2_init_glock_once(void *foo) +{ + struct gfs2_glock *gl = foo; + + INIT_HLIST_BL_NODE(&gl->gl_list); + spin_lock_init(&gl->gl_spin); + INIT_LIST_HEAD(&gl->gl_holders); + INIT_LIST_HEAD(&gl->gl_lru); + INIT_LIST_HEAD(&gl->gl_ail_list); + atomic_set(&gl->gl_ail_count, 0); + atomic_set(&gl->gl_revokes, 0); +} + +static void gfs2_init_gl_aspace_once(void *foo) +{ + struct gfs2_glock *gl = foo; + struct address_space *mapping = (struct address_space *)(gl + 1); + + gfs2_init_glock_once(gl); + address_space_init_once(mapping); +} + +/** + * init_gfs2_fs - Register GFS2 as a filesystem + * + * Returns: 0 on success, error code on failure + */ + +static int __init init_gfs2_fs(void) +{ + int error; + + gfs2_str2qstr(&gfs2_qdot, "."); + gfs2_str2qstr(&gfs2_qdotdot, ".."); + + error = gfs2_sys_init(); + if (error) + return error; + + error = gfs2_glock_init(); + if (error) + goto fail; + + error = -ENOMEM; + gfs2_glock_cachep = kmem_cache_create("gfs2_glock", + sizeof(struct gfs2_glock), + 0, 0, + gfs2_init_glock_once); + if (!gfs2_glock_cachep) + goto fail; + + gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", + sizeof(struct gfs2_glock) + + sizeof(struct address_space), + 0, 0, gfs2_init_gl_aspace_once); + + if (!gfs2_glock_aspace_cachep) + goto fail; + + gfs2_inode_cachep = kmem_cache_create("gfs2_inode", + sizeof(struct gfs2_inode), + 0, SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD, + gfs2_init_inode_once); + if (!gfs2_inode_cachep) + goto fail; + + gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata", + sizeof(struct gfs2_bufdata), + 0, 0, NULL); + if (!gfs2_bufdata_cachep) + goto fail; + + gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd", + sizeof(struct gfs2_rgrpd), + 0, 0, NULL); + if (!gfs2_rgrpd_cachep) + goto fail; + + gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", + sizeof(struct gfs2_quota_data), + 0, 0, NULL); + if (!gfs2_quotad_cachep) + goto fail; + + register_shrinker(&qd_shrinker); + + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_unregister; + + error = -ENOMEM; + gfs_recovery_wq = alloc_workqueue("gfs_recovery", + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); + if (!gfs_recovery_wq) + goto fail_wq; + + gfs2_register_debugfs(); + + printk("GFS2 installed\n"); + + return 0; + +fail_wq: + unregister_filesystem(&gfs2meta_fs_type); +fail_unregister: + unregister_filesystem(&gfs2_fs_type); +fail: + unregister_shrinker(&qd_shrinker); + gfs2_glock_exit(); + + if (gfs2_quotad_cachep) + kmem_cache_destroy(gfs2_quotad_cachep); + + if (gfs2_rgrpd_cachep) + kmem_cache_destroy(gfs2_rgrpd_cachep); + + if (gfs2_bufdata_cachep) + kmem_cache_destroy(gfs2_bufdata_cachep); + + if 
(gfs2_inode_cachep) + kmem_cache_destroy(gfs2_inode_cachep); + + if (gfs2_glock_aspace_cachep) + kmem_cache_destroy(gfs2_glock_aspace_cachep); + + if (gfs2_glock_cachep) + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); + return error; +} + +/** + * exit_gfs2_fs - Unregister the file system + * + */ + +static void __exit exit_gfs2_fs(void) +{ + unregister_shrinker(&qd_shrinker); + gfs2_glock_exit(); + gfs2_unregister_debugfs(); + unregister_filesystem(&gfs2_fs_type); + unregister_filesystem(&gfs2meta_fs_type); + destroy_workqueue(gfs_recovery_wq); + + rcu_barrier(); + + kmem_cache_destroy(gfs2_quotad_cachep); + kmem_cache_destroy(gfs2_rgrpd_cachep); + kmem_cache_destroy(gfs2_bufdata_cachep); + kmem_cache_destroy(gfs2_inode_cachep); + kmem_cache_destroy(gfs2_glock_aspace_cachep); + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); +} + +MODULE_DESCRIPTION("Global File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +module_init(init_gfs2_fs); +module_exit(exit_gfs2_fs); + diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c new file mode 100644 index 00000000..747238cd --- /dev/null +++ b/fs/gfs2/meta_io.c @@ -0,0 +1,459 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) +{ + struct buffer_head *bh, *head; + int nr_underway = 0; + int write_op = REQ_META | + (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + + BUG_ON(!PageLocked(page)); + BUG_ON(!page_has_buffers(page)); + + head = page_buffers(page); + bh = head; + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from pdflush and kswapd + * activity, but those code paths have their own higher-level + * throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. 
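+	 * Only buffers marked for async write above are submitted in the loop
+	 * below; if none of them were dirty, nr_underway stays zero and
+	 * writeback on the page is ended immediately.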
+ */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + + if (nr_underway == 0) + end_page_writeback(page); + + return 0; +} + +const struct address_space_operations gfs2_meta_aops = { + .writepage = gfs2_aspace_writepage, + .releasepage = gfs2_releasepage, +}; + +/** + * gfs2_meta_sync - Sync all buffers associated with a glock + * @gl: The glock + * + */ + +void gfs2_meta_sync(struct gfs2_glock *gl) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + int error; + + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + + if (error) + gfs2_io_error(gl->gl_sbd); +} + +/** + * gfs2_getbuf - Get a buffer with a given address space + * @gl: the glock + * @blkno: the block number (filesystem scope) + * @create: 1 if the buffer should be created + * + * Returns: the buffer + */ + +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct page *page; + struct buffer_head *bh; + unsigned int shift; + unsigned long index; + unsigned int bufnum; + + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; + index = blkno >> shift; /* convert block to page */ + bufnum = blkno - (index << shift); /* block buf index within page */ + + if (create) { + for (;;) { + page = grab_cache_page(mapping, index); + if (page) + break; + yield(); + } + } else { + page = find_lock_page(mapping, index); + if (!page) + return NULL; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); + + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + + if (!buffer_mapped(bh)) + map_bh(bh, sdp->sd_vfs, blkno); + + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + return bh; +} + +static void meta_prep_new(struct buffer_head *bh) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); +} + +/** + * gfs2_meta_new - Get a block + * @gl: The glock associated with this block + * @blkno: The block number + * + * Returns: The buffer + */ + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) +{ + struct buffer_head *bh; + bh = gfs2_getbuf(gl, blkno, CREATE); + meta_prep_new(bh); + return bh; +} + +/** + * gfs2_meta_read - Read a block from disk + * @gl: The glock covering the block + * @blkno: The block number + * @flags: flags + * @bhp: the place where the buffer is returned (NULL on failure) + * + * Returns: errno + */ + +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *bh; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ_SYNC | REQ_META, bh); + if (!(flags & DIO_WAIT)) + return 0; + + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + 
gfs2_io_error_bh(sdp, bh); + brelse(bh); + return -EIO; + } + + return 0; +} + +/** + * gfs2_meta_wait - Reread a block from disk + * @sdp: the filesystem + * @bh: The block to wait for + * + * Returns: errno + */ + +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + return -EIO; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + return 0; +} + +/** + * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer + * @gl: the glock the buffer belongs to + * @bh: The buffer to be attached to + * @meta: Flag to indicate whether its metadata or not + */ + +void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, + int meta) +{ + struct gfs2_bufdata *bd; + + if (meta) + lock_page(bh->b_page); + + if (bh->b_private) { + if (meta) + unlock_page(bh->b_page); + return; + } + + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); + bd->bd_bh = bh; + bd->bd_gl = gl; + + INIT_LIST_HEAD(&bd->bd_list_tr); + if (meta) + lops_init_le(&bd->bd_le, &gfs2_buf_lops); + else + lops_init_le(&bd->bd_le, &gfs2_databuf_lops); + bh->b_private = bd; + + if (meta) + unlock_page(bh->b_page); +} + +void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) +{ + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + struct gfs2_bufdata *bd = bh->b_private; + + if (test_clear_buffer_pinned(bh)) { + trace_gfs2_pin(bd, 0); + atomic_dec(&sdp->sd_log_pinned); + list_del_init(&bd->bd_le.le_list); + if (meta) { + gfs2_assert_warn(sdp, sdp->sd_log_num_buf); + sdp->sd_log_num_buf--; + tr->tr_num_buf_rm++; + } else { + gfs2_assert_warn(sdp, sdp->sd_log_num_databuf); + sdp->sd_log_num_databuf--; + tr->tr_num_databuf_rm++; + } + tr->tr_touched = 1; + brelse(bh); + } + if (bd) { + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_ail) { + gfs2_remove_from_ail(bd); + bh->b_private = NULL; + bd->bd_bh = NULL; + bd->bd_blkno = bh->b_blocknr; + gfs2_trans_add_revoke(sdp, bd); + } + spin_unlock(&sdp->sd_ail_lock); + } + clear_buffer_dirty(bh); + clear_buffer_uptodate(bh); +} + +/** + * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore + * @ip: the inode who owns the buffers + * @bstart: the first buffer in the run + * @blen: the number of buffers in the run + * + */ + +void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh; + + while (blen) { + bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); + if (bh) { + lock_buffer(bh); + gfs2_log_lock(sdp); + gfs2_remove_from_journal(bh, current->journal_info, 1); + gfs2_log_unlock(sdp); + unlock_buffer(bh); + brelse(bh); + } + + bstart++; + blen--; + } +} + +/** + * gfs2_meta_indirect_buffer - Get a metadata buffer + * @ip: The GFS2 inode + * @height: The level of this buf in the metadata (indir addr) tree (if any) + * @num: The block number (device relative) of the buffer + * @new: Non-zero if we may create a new buffer + * @bhp: the buffer is returned here + * + * Returns: errno + */ + +int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + int new, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head 
*bh; + int ret = 0; + + if (new) { + BUG_ON(height == 0); + bh = gfs2_meta_new(gl, num); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + } else { + u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; + ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { + brelse(bh); + ret = -EIO; + } + } + *bhp = bh; + return ret; +} + +/** + * gfs2_meta_ra - start readahead on an extent of a file + * @gl: the glock the blocks belong to + * @dblock: the starting disk block + * @extlen: the number of blocks in the extent + * + * returns: the first buffer in the extent + */ + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *first_bh, *bh; + u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >> + sdp->sd_sb.sb_bsize_shift; + + BUG_ON(!extlen); + + if (max_ra < 1) + max_ra = 1; + if (extlen > max_ra) + extlen = max_ra; + + first_bh = gfs2_getbuf(gl, dblock, CREATE); + + if (buffer_uptodate(first_bh)) + goto out; + if (!buffer_locked(first_bh)) + ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); + + dblock++; + extlen--; + + while (extlen) { + bh = gfs2_getbuf(gl, dblock, CREATE); + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) + ll_rw_block(READA, 1, &bh); + brelse(bh); + dblock++; + extlen--; + if (!buffer_locked(first_bh) && buffer_uptodate(first_bh)) + goto out; + } + + wait_on_buffer(first_bh); +out: + return first_bh; +} + diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h new file mode 100644 index 00000000..22c52659 --- /dev/null +++ b/fs/gfs2/meta_io.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#ifndef __DIO_DOT_H__ +#define __DIO_DOT_H__ + +#include +#include +#include "incore.h" + +static inline void gfs2_buffer_clear(struct buffer_head *bh) +{ + memset(bh->b_data, 0, bh->b_size); +} + +static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head) +{ + BUG_ON(head > bh->b_size); + memset(bh->b_data + head, 0, bh->b_size - head); +} + +static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, + int to_head, + struct buffer_head *from_bh, + int from_head) +{ + BUG_ON(from_head < to_head); + memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head, + from_bh->b_size - from_head); + memset(to_bh->b_data + to_bh->b_size + to_head - from_head, + 0, from_head - to_head); +} + +extern const struct address_space_operations gfs2_meta_aops; + +static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + if (mapping->a_ops == &gfs2_meta_aops) + return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; + else + return inode->i_sb->s_fs_info; +} + +void gfs2_meta_sync(struct gfs2_glock *gl); + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, + int flags, struct buffer_head **bhp); +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); + +void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, + int meta); + +void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, + int meta); + +void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); + +int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + int new, struct buffer_head **bhp); + +static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, + struct buffer_head **bhp) +{ + return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, 0, bhp); +} + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); + +#define buffer_busy(bh) \ +((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned))) + +#endif /* __DIO_DOT_H__ */ + diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c new file mode 100644 index 00000000..fa780e66 --- /dev/null +++ b/fs/gfs2/ops_fstype.c @@ -0,0 +1,1407 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "log.h" +#include "quota.h" +#include "dir.h" +#include "trace_gfs2.h" + +#define DO 0 +#define UNDO 1 + +/** + * gfs2_tune_init - Fill a gfs2_tune structure with default values + * @gt: tune + * + */ + +static void gfs2_tune_init(struct gfs2_tune *gt) +{ + spin_lock_init(>->gt_spin); + + gt->gt_quota_simul_sync = 64; + gt->gt_quota_warn_period = 10; + gt->gt_quota_scale_num = 1; + gt->gt_quota_scale_den = 1; + gt->gt_new_files_jdata = 0; + gt->gt_max_readahead = 1 << 18; + gt->gt_complain_secs = 10; +} + +static struct gfs2_sbd *init_sbd(struct super_block *sb) +{ + struct gfs2_sbd *sdp; + + sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); + if (!sdp) + return NULL; + + sb->s_fs_info = sdp; + sdp->sd_vfs = sb; + set_bit(SDF_NOJOURNALID, &sdp->sd_flags); + gfs2_tune_init(&sdp->sd_tune); + + init_waitqueue_head(&sdp->sd_glock_wait); + atomic_set(&sdp->sd_glock_disposal, 0); + init_completion(&sdp->sd_locking_init); + spin_lock_init(&sdp->sd_statfs_spin); + + spin_lock_init(&sdp->sd_rindex_spin); + mutex_init(&sdp->sd_rindex_mutex); + INIT_LIST_HEAD(&sdp->sd_rindex_list); + INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); + + INIT_LIST_HEAD(&sdp->sd_jindex_list); + spin_lock_init(&sdp->sd_jindex_spin); + mutex_init(&sdp->sd_jindex_mutex); + + INIT_LIST_HEAD(&sdp->sd_quota_list); + mutex_init(&sdp->sd_quota_mutex); + init_waitqueue_head(&sdp->sd_quota_wait); + INIT_LIST_HEAD(&sdp->sd_trunc_list); + spin_lock_init(&sdp->sd_trunc_lock); + + spin_lock_init(&sdp->sd_log_lock); + atomic_set(&sdp->sd_log_pinned, 0); + INIT_LIST_HEAD(&sdp->sd_log_le_buf); + INIT_LIST_HEAD(&sdp->sd_log_le_revoke); + INIT_LIST_HEAD(&sdp->sd_log_le_rg); + INIT_LIST_HEAD(&sdp->sd_log_le_databuf); + INIT_LIST_HEAD(&sdp->sd_log_le_ordered); + + init_waitqueue_head(&sdp->sd_log_waitq); + init_waitqueue_head(&sdp->sd_logd_waitq); + spin_lock_init(&sdp->sd_ail_lock); + INIT_LIST_HEAD(&sdp->sd_ail1_list); + INIT_LIST_HEAD(&sdp->sd_ail2_list); + + init_rwsem(&sdp->sd_log_flush_lock); + atomic_set(&sdp->sd_log_in_flight, 0); + init_waitqueue_head(&sdp->sd_log_flush_wait); + + INIT_LIST_HEAD(&sdp->sd_revoke_list); + + mutex_init(&sdp->sd_freeze_lock); + + return sdp; +} + + +/** + * gfs2_check_sb - Check superblock + * @sdp: the filesystem + * @sb: The superblock + * @silent: Don't print a message if the check fails + * + * Checks the version code of the FS is one that we understand how to + * read and that the sizes of the various on-disk structures have not + * changed. + */ + +static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) +{ + struct gfs2_sb_host *sb = &sdp->sd_sb; + + if (sb->sb_magic != GFS2_MAGIC || + sb->sb_type != GFS2_METATYPE_SB) { + if (!silent) + printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); + return -EINVAL; + } + + /* If format numbers match exactly, we're done. 
*/ + + if (sb->sb_fs_format == GFS2_FORMAT_FS && + sb->sb_multihost_format == GFS2_FORMAT_MULTI) + return 0; + + fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); + + return -EINVAL; +} + +static void end_bio_io_page(struct bio *bio, int error) +{ + struct page *page = bio->bi_private; + + if (!error) + SetPageUptodate(page); + else + printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); + unlock_page(page); +} + +static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) +{ + struct gfs2_sb_host *sb = &sdp->sd_sb; + struct super_block *s = sdp->sd_vfs; + const struct gfs2_sb *str = buf; + + sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); + sb->sb_type = be32_to_cpu(str->sb_header.mh_type); + sb->sb_format = be32_to_cpu(str->sb_header.mh_format); + sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); + sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); + sb->sb_bsize = be32_to_cpu(str->sb_bsize); + sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); + sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); + sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); + sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); + sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); + + memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); + memcpy(s->s_uuid, str->sb_uuid, 16); +} + +/** + * gfs2_read_super - Read the gfs2 super block from disk + * @sdp: The GFS2 super block + * @sector: The location of the super block + * @error: The error code to return + * + * This uses the bio functions to read the super block from disk + * because we want to be 100% sure that we never read cached data. + * A super block is read twice only during each GFS2 mount and is + * never written to by the filesystem. The first time its read no + * locks are held, and the only details which are looked at are those + * relating to the locking protocol. Once locking is up and working, + * the sb is read again under the lock to establish the location of + * the master directory (contains pointers to journals etc) and the + * root directory. 
+ * + * Returns: 0 on success or error + */ + +static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_sb *p; + struct page *page; + struct bio *bio; + + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) + return -ENOBUFS; + + ClearPageUptodate(page); + ClearPageDirty(page); + lock_page(page); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_bdev = sb->s_bdev; + bio_add_page(bio, page, PAGE_SIZE, 0); + + bio->bi_end_io = end_bio_io_page; + bio->bi_private = page; + submit_bio(READ_SYNC | REQ_META, bio); + wait_on_page_locked(page); + bio_put(bio); + if (!PageUptodate(page)) { + __free_page(page); + return -EIO; + } + p = kmap(page); + gfs2_sb_in(sdp, p); + kunmap(page); + __free_page(page); + return gfs2_check_sb(sdp, silent); +} + +/** + * gfs2_read_sb - Read super block + * @sdp: The GFS2 superblock + * @silent: Don't print message if mount fails + * + */ + +static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) +{ + u32 hash_blocks, ind_blocks, leaf_blocks; + u32 tmp_blocks; + unsigned int x; + int error; + + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); + if (error) { + if (!silent) + fs_err(sdp, "can't read superblock\n"); + return error; + } + + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / sizeof(u64); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); + sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(struct gfs2_quota_change); + + /* Compute maximum reservation required to add a entry to a directory */ + + hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + sdp->sd_jbsize); + + ind_blocks = 0; + for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { + tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); + ind_blocks += tmp_blocks; + } + + leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; + + sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + + sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_heightsize[x - 1] || m) + break; + sdp->sd_heightsize[x] = space; + } + sdp->sd_max_height = x; + sdp->sd_heightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); + + sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_jheightsize[x - 1] || m) + break; + sdp->sd_jheightsize[x] = space; + } + sdp->sd_max_jheight = x; + sdp->sd_jheightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + + return 0; +} + +static int init_names(struct gfs2_sbd *sdp, int silent) +{ + char *proto, *table; 
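+	/* The lock protocol and table names come from the mount arguments;
+	 * empty values are filled in from the on-disk superblock below. */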
+ int error = 0; + + proto = sdp->sd_args.ar_lockproto; + table = sdp->sd_args.ar_locktable; + + /* Try to autodetect */ + + if (!proto[0] || !table[0]) { + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); + if (error) + return error; + + if (!proto[0]) + proto = sdp->sd_sb.sb_lockproto; + if (!table[0]) + table = sdp->sd_sb.sb_locktable; + } + + if (!table[0]) + table = sdp->sd_vfs->s_id; + + strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN); + strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN); + + table = sdp->sd_table_name; + while ((table = strchr(table, '/'))) + *table = '_'; + + return error; +} + +static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, + int undo) +{ + int error = 0; + + if (undo) + goto fail_trans; + + error = gfs2_glock_nq_num(sdp, + GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, + mount_gh); + if (error) { + fs_err(sdp, "can't acquire mount glock: %d\n", error); + goto fail; + } + + error = gfs2_glock_nq_num(sdp, + GFS2_LIVE_LOCK, &gfs2_nondisk_glops, + LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_live_gh); + if (error) { + fs_err(sdp, "can't acquire live glock: %d\n", error); + goto fail_mount; + } + + error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops, + CREATE, &sdp->sd_rename_gl); + if (error) { + fs_err(sdp, "can't create rename glock: %d\n", error); + goto fail_live; + } + + error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops, + CREATE, &sdp->sd_trans_gl); + if (error) { + fs_err(sdp, "can't create transaction glock: %d\n", error); + goto fail_rename; + } + + return 0; + +fail_trans: + gfs2_glock_put(sdp->sd_trans_gl); +fail_rename: + gfs2_glock_put(sdp->sd_rename_gl); +fail_live: + gfs2_glock_dq_uninit(&sdp->sd_live_gh); +fail_mount: + gfs2_glock_dq_uninit(mount_gh); +fail: + return error; +} + +static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, + u64 no_addr, const char *name) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct dentry *dentry; + struct inode *inode; + + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + if (IS_ERR(inode)) { + fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); + return PTR_ERR(inode); + } + dentry = d_alloc_root(inode); + if (!dentry) { + fs_err(sdp, "can't alloc %s dentry\n", name); + iput(inode); + return -ENOMEM; + } + *dptr = dentry; + return 0; +} + +static int init_sb(struct gfs2_sbd *sdp, int silent) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder sb_gh; + u64 no_addr; + int ret; + + ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, + LM_ST_SHARED, 0, &sb_gh); + if (ret) { + fs_err(sdp, "can't acquire superblock glock: %d\n", ret); + return ret; + } + + ret = gfs2_read_sb(sdp, silent); + if (ret) { + fs_err(sdp, "can't read superblock: %d\n", ret); + goto out; + } + + /* Set up the buffer cache and SB for real */ + if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { + ret = -EINVAL; + fs_err(sdp, "FS block size (%u) is too small for device " + "block size (%u)\n", + sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); + goto out; + } + if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { + ret = -EINVAL; + fs_err(sdp, "FS block size (%u) is too big for machine " + "page size (%u)\n", + sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); + goto out; + } + sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); + + /* Get the root inode */ + no_addr = sdp->sd_sb.sb_root_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, 
"root"); + if (ret) + goto out; + + /* Get the master inode */ + no_addr = sdp->sd_sb.sb_master_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master"); + if (ret) { + dput(sdp->sd_root_dir); + goto out; + } + sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir); +out: + gfs2_glock_dq_uninit(&sb_gh); + return ret; +} + +/** + * map_journal_extents - create a reusable "extent" mapping from all logical + * blocks to all physical blocks for the given journal. This will save + * us time when writing journal blocks. Most journals will have only one + * extent that maps all their logical blocks. That's because gfs2.mkfs + * arranges the journal blocks sequentially to maximize performance. + * So the extent would map the first block for the entire file length. + * However, gfs2_jadd can happen while file activity is happening, so + * those journals may not be sequential. Less likely is the case where + * the users created their own journals by mounting the metafs and + * laying it out. But it's still possible. These journals might have + * several extents. + * + * TODO: This should be done in bigger chunks rather than one block at a time, + * but since it's only done at mount time, I'm not worried about the + * time it takes. + */ +static int map_journal_extents(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd = sdp->sd_jdesc; + unsigned int lb; + u64 db, prev_db; /* logical block, disk block, prev disk block */ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_journal_extent *jext = NULL; + struct buffer_head bh; + int rc = 0; + + prev_db = 0; + + for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) { + bh.b_state = 0; + bh.b_blocknr = 0; + bh.b_size = 1 << ip->i_inode.i_blkbits; + rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0); + db = bh.b_blocknr; + if (rc || !db) { + printk(KERN_INFO "GFS2 journal mapping error %d: lb=" + "%u db=%llu\n", rc, lb, (unsigned long long)db); + break; + } + if (!prev_db || db != prev_db + 1) { + jext = kzalloc(sizeof(struct gfs2_journal_extent), + GFP_KERNEL); + if (!jext) { + printk(KERN_INFO "GFS2 error: out of memory " + "mapping journal extents.\n"); + rc = -ENOMEM; + break; + } + jext->dblock = db; + jext->lblock = lb; + jext->blocks = 1; + list_add_tail(&jext->extent_list, &jd->extent_list); + } else { + jext->blocks++; + } + prev_db = db; + } + return rc; +} + +static void gfs2_others_may_mount(struct gfs2_sbd *sdp) +{ + char *message = "FIRSTMOUNT=Done"; + char *envp[] = { message, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_first_done = 1; + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); +} + +/** + * gfs2_jindex_hold - Grab a lock on the jindex + * @sdp: The GFS2 superblock + * @ji_gh: the holder for the jindex glock + * + * Returns: errno + */ + +static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) +{ + struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); + struct qstr name; + char buf[20]; + struct gfs2_jdesc *jd; + int error; + + name.name = buf; + + mutex_lock(&sdp->sd_jindex_mutex); + + for (;;) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); + if (error) + break; + + name.len = sprintf(buf, "journal%u", sdp->sd_journals); + name.hash = gfs2_disk_hash(name.name, name.len); + + error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); + if (error == -ENOENT) { + error = 0; + break; + } + + gfs2_glock_dq_uninit(ji_gh); + + if (error) + break; + + error = -ENOMEM; + jd = kzalloc(sizeof(struct 
gfs2_jdesc), GFP_KERNEL); + if (!jd) + break; + + INIT_LIST_HEAD(&jd->extent_list); + INIT_WORK(&jd->jd_work, gfs2_recover_func); + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); + if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { + if (!jd->jd_inode) + error = -ENOENT; + else + error = PTR_ERR(jd->jd_inode); + kfree(jd); + break; + } + + spin_lock(&sdp->sd_jindex_spin); + jd->jd_jid = sdp->sd_journals++; + list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); + spin_unlock(&sdp->sd_jindex_spin); + } + + mutex_unlock(&sdp->sd_jindex_mutex); + + return error; +} + +static int init_journal(struct gfs2_sbd *sdp, int undo) +{ + struct inode *master = sdp->sd_master_dir->d_inode; + struct gfs2_holder ji_gh; + struct gfs2_inode *ip; + int jindex = 1; + int error = 0; + + if (undo) { + jindex = 0; + goto fail_jinode_gh; + } + + sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); + if (IS_ERR(sdp->sd_jindex)) { + fs_err(sdp, "can't lookup journal index: %d\n", error); + return PTR_ERR(sdp->sd_jindex); + } + ip = GFS2_I(sdp->sd_jindex); + + /* Load in the journal index special file */ + + error = gfs2_jindex_hold(sdp, &ji_gh); + if (error) { + fs_err(sdp, "can't read journal index: %d\n", error); + goto fail; + } + + error = -EUSERS; + if (!gfs2_jindex_size(sdp)) { + fs_err(sdp, "no journals!\n"); + goto fail_jindex; + } + + if (sdp->sd_args.ar_spectator) { + sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); + } else { + if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { + fs_err(sdp, "can't mount journal #%u\n", + sdp->sd_lockstruct.ls_jid); + fs_err(sdp, "there are only %u journals (0 - %u)\n", + gfs2_jindex_size(sdp), + gfs2_jindex_size(sdp) - 1); + goto fail_jindex; + } + sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid); + + error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, + &gfs2_journal_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP, + &sdp->sd_journal_gh); + if (error) { + fs_err(sdp, "can't acquire journal glock: %d\n", error); + goto fail_jindex; + } + + ip = GFS2_I(sdp->sd_jdesc->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE, + &sdp->sd_jinode_gh); + if (error) { + fs_err(sdp, "can't acquire journal inode glock: %d\n", + error); + goto fail_journal_gh; + } + + error = gfs2_jdesc_check(sdp->sd_jdesc); + if (error) { + fs_err(sdp, "my journal (%u) is bad: %d\n", + sdp->sd_jdesc->jd_jid, error); + goto fail_jinode_gh; + } + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); + + /* Map the extents for this journal's blocks */ + map_journal_extents(sdp); + } + trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); + + if (sdp->sd_lockstruct.ls_first) { + unsigned int x; + for (x = 0; x < sdp->sd_journals; x++) { + error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x), + true); + if (error) { + fs_err(sdp, "error recovering journal %u: %d\n", + x, error); + goto fail_jinode_gh; + } + } + + gfs2_others_may_mount(sdp); + } else if (!sdp->sd_args.ar_spectator) { + error = gfs2_recover_journal(sdp->sd_jdesc, true); + if (error) { + fs_err(sdp, "error recovering my journal: %d\n", error); + goto fail_jinode_gh; + } + } + + set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); + 
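+	/* Journal recovery is complete; drop the jindex glock before returning. */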
gfs2_glock_dq_uninit(&ji_gh); + jindex = 0; + + return 0; + +fail_jinode_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); +fail_journal_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); +fail_jindex: + gfs2_jindex_free(sdp); + if (jindex) + gfs2_glock_dq_uninit(&ji_gh); +fail: + iput(sdp->sd_jindex); + return error; +} + + +static int init_inodes(struct gfs2_sbd *sdp, int undo) +{ + int error = 0; + struct gfs2_inode *ip; + struct inode *master = sdp->sd_master_dir->d_inode; + + if (undo) + goto fail_qinode; + + error = init_journal(sdp, undo); + if (error) + goto fail; + + /* Read in the master statfs inode */ + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto fail_journal; + } + + /* Read in the resource index inode */ + sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); + if (IS_ERR(sdp->sd_rindex)) { + error = PTR_ERR(sdp->sd_rindex); + fs_err(sdp, "can't get resource index inode: %d\n", error); + goto fail_statfs; + } + ip = GFS2_I(sdp->sd_rindex); + sdp->sd_rindex_uptodate = 0; + + /* Read in the quota inode */ + sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota"); + if (IS_ERR(sdp->sd_quota_inode)) { + error = PTR_ERR(sdp->sd_quota_inode); + fs_err(sdp, "can't get quota file inode: %d\n", error); + goto fail_rindex; + } + return 0; + +fail_qinode: + iput(sdp->sd_quota_inode); +fail_rindex: + gfs2_clear_rgrpd(sdp); + iput(sdp->sd_rindex); +fail_statfs: + iput(sdp->sd_statfs_inode); +fail_journal: + init_journal(sdp, UNDO); +fail: + return error; +} + +static int init_per_node(struct gfs2_sbd *sdp, int undo) +{ + struct inode *pn = NULL; + char buf[30]; + int error = 0; + struct gfs2_inode *ip; + struct inode *master = sdp->sd_master_dir->d_inode; + + if (sdp->sd_args.ar_spectator) + return 0; + + if (undo) + goto fail_qc_gh; + + pn = gfs2_lookup_simple(master, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + return error; + } + + sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_sc_inode)) { + error = PTR_ERR(sdp->sd_sc_inode); + fs_err(sdp, "can't find local \"sc\" file: %d\n", error); + goto fail; + } + + sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_qc_inode)) { + error = PTR_ERR(sdp->sd_qc_inode); + fs_err(sdp, "can't find local \"qc\" file: %d\n", error); + goto fail_ut_i; + } + + iput(pn); + pn = NULL; + + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto fail_qc_i; + } + + ip = GFS2_I(sdp->sd_qc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_qc_gh); + if (error) { + fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); + goto fail_ut_gh; + } + + return 0; + +fail_qc_gh: + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); +fail_ut_gh: + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); +fail_qc_i: + iput(sdp->sd_qc_inode); +fail_ut_i: + iput(sdp->sd_sc_inode); +fail: + if (pn) + iput(pn); + return error; +} + +static int init_threads(struct gfs2_sbd *sdp, int undo) +{ + struct task_struct *p; + int error = 0; + + if (undo) + goto fail_quotad; + + p = kthread_run(gfs2_logd, sdp, 
"gfs2_logd"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + + return 0; + + +fail_quotad: + kthread_stop(sdp->sd_quotad_process); +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + +static const match_table_t nolock_tokens = { + { Opt_jid, "jid=%d\n", }, + { Opt_err, NULL }, +}; + +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_put_lock = gfs2_glock_free, + .lm_tokens = &nolock_tokens, +}; + +/** + * gfs2_lm_mount - mount a locking protocol + * @sdp: the filesystem + * @args: mount arguments + * @silent: if 1, don't complain if the FS isn't a GFS2 fs + * + * Returns: errno + */ + +static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) +{ + const struct lm_lockops *lm; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + struct gfs2_args *args = &sdp->sd_args; + const char *proto = sdp->sd_proto_name; + const char *table = sdp->sd_table_name; + const char *fsname; + char *o, *options; + int ret; + + if (!strcmp("lock_nolock", proto)) { + lm = &nolock_ops; + sdp->sd_args.ar_localflocks = 1; +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + } else if (!strcmp("lock_dlm", proto)) { + lm = &gfs2_dlm_ops; +#endif + } else { + printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); + return -ENOENT; + } + + fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); + + ls->ls_ops = lm; + ls->ls_first = 1; + + for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { + substring_t tmp[MAX_OPT_ARGS]; + int token, option; + + if (!o || !*o) + continue; + + token = match_token(o, *lm->lm_tokens, tmp); + switch (token) { + case Opt_jid: + ret = match_int(&tmp[0], &option); + if (ret || option < 0) + goto hostdata_error; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + ls->ls_jid = option; + break; + case Opt_id: + /* Obsolete, but left for backward compat purposes */ + break; + case Opt_first: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_first = option; + break; + case Opt_nodir: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_nodir = option; + break; + case Opt_err: + default: +hostdata_error: + fs_info(sdp, "unknown hostdata (%s)\n", o); + return -EINVAL; + } + } + + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, + sdp->sd_lockstruct.ls_jid); + + fsname = strchr(table, ':'); + if (fsname) + fsname++; + if (lm->lm_mount == NULL) { + fs_info(sdp, "Now mounting FS...\n"); + complete_all(&sdp->sd_locking_init); + return 0; + } + ret = lm->lm_mount(sdp, fsname); + if (ret == 0) + fs_info(sdp, "Joined cluster. 
Now mounting FS...\n"); + complete_all(&sdp->sd_locking_init); + return ret; +} + +void gfs2_lm_unmount(struct gfs2_sbd *sdp) +{ + const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + lm->lm_unmount) + lm->lm_unmount(sdp); +} + +static int gfs2_journalid_wait(void *word) +{ + if (signal_pending(current)) + return -EINTR; + schedule(); + return 0; +} + +static int wait_on_journal(struct gfs2_sbd *sdp) +{ + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 0; + + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); +} + +void gfs2_online_uevent(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); +} + +/** + * fill_super - Read in superblock + * @sb: The VFS superblock + * @data: Mount options + * @silent: Don't complain if it's not a GFS2 filesystem + * + * Returns: errno + */ + +static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent) +{ + struct gfs2_sbd *sdp; + struct gfs2_holder mount_gh; + int error; + + sdp = init_sbd(sb); + if (!sdp) { + printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); + return -ENOMEM; + } + sdp->sd_args = *args; + + if (sdp->sd_args.ar_spectator) { + sb->s_flags |= MS_RDONLY; + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + } + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; + sb->s_export_op = &gfs2_export_ops; + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + /* Set up the buffer cache and fill in some fake block size values + to allow us to read-in the on-disk superblock. */ + sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK); + sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + + sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; + sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; + if (sdp->sd_args.ar_statfs_quantum) { + sdp->sd_tune.gt_statfs_slow = 0; + sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum; + } else { + sdp->sd_tune.gt_statfs_slow = 1; + sdp->sd_tune.gt_statfs_quantum = 30; + } + + error = init_names(sdp, silent); + if (error) + goto fail; + + gfs2_create_debugfs_file(sdp); + + error = gfs2_sys_fs_add(sdp); + if (error) + goto fail; + + error = gfs2_lm_mount(sdp, silent); + if (error) + goto fail_sys; + + error = init_locking(sdp, &mount_gh, DO); + if (error) + goto fail_lm; + + error = init_sb(sdp, silent); + if (error) + goto fail_locking; + + error = wait_on_journal(sdp); + if (error) + goto fail_sb; + + /* + * If user space has failed to join the cluster or some similar + * failure has occurred, then the journal id will contain a + * negative (error) number. This will then be returned to the + * caller (of the mount syscall). 
We do this even for spectator + * mounts (which just write a jid of 0 to indicate "ok" even though + * the jid is unused in the spectator case) + */ + if (sdp->sd_lockstruct.ls_jid < 0) { + error = sdp->sd_lockstruct.ls_jid; + sdp->sd_lockstruct.ls_jid = 0; + goto fail_sb; + } + + error = init_inodes(sdp, DO); + if (error) + goto fail_sb; + + error = init_per_node(sdp, DO); + if (error) + goto fail_inodes; + + error = gfs2_statfs_init(sdp); + if (error) { + fs_err(sdp, "can't initialize statfs subsystem: %d\n", error); + goto fail_per_node; + } + + error = init_threads(sdp, DO); + if (error) + goto fail_per_node; + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_rw(sdp); + if (error) { + fs_err(sdp, "can't make FS RW: %d\n", error); + goto fail_threads; + } + } + + gfs2_glock_dq_uninit(&mount_gh); + gfs2_online_uevent(sdp); + return 0; + +fail_threads: + init_threads(sdp, UNDO); +fail_per_node: + init_per_node(sdp, UNDO); +fail_inodes: + init_inodes(sdp, UNDO); +fail_sb: + if (sdp->sd_root_dir) + dput(sdp->sd_root_dir); + if (sdp->sd_master_dir) + dput(sdp->sd_master_dir); + if (sb->s_root) + dput(sb->s_root); + sb->s_root = NULL; +fail_locking: + init_locking(sdp, &mount_gh, UNDO); +fail_lm: + gfs2_gl_hash_clear(sdp); + gfs2_lm_unmount(sdp); +fail_sys: + gfs2_sys_fs_del(sdp); +fail: + gfs2_delete_debugfs_file(sdp); + kfree(sdp); + sb->s_fs_info = NULL; + return error; +} + +static int set_gfs2_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + return 0; +} + +static int test_gfs2_super(struct super_block *s, void *ptr) +{ + struct block_device *bdev = ptr; + return (bdev == s->s_bdev); +} + +/** + * gfs2_mount - Get the GFS2 superblock + * @fs_type: The GFS2 filesystem type + * @flags: Mount flags + * @dev_name: The name of the device + * @data: The mount arguments + * + * Q. Why not use get_sb_bdev() ? + * A. 
We need to select one of two root directories to mount, independent + * of whether this is the initial, or subsequent, mount of this sb + * + * Returns: 0 or -ve on error + */ + +static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + int error; + struct gfs2_args args; + struct gfs2_sbd *sdp; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } + s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = PTR_ERR(s); + if (IS_ERR(s)) + goto error_bdev; + + if (s->s_root) + blkdev_put(bdev, mode); + + memset(&args, 0, sizeof(args)); + args.ar_quota = GFS2_QUOTA_DEFAULT; + args.ar_data = GFS2_DATA_DEFAULT; + args.ar_commit = 30; + args.ar_statfs_quantum = 30; + args.ar_quota_quantum = 60; + args.ar_errors = GFS2_ERRORS_DEFAULT; + + error = gfs2_mount_args(&args, data); + if (error) { + printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); + goto error_super; + } + + if (s->s_root) { + error = -EBUSY; + if ((flags ^ s->s_flags) & MS_RDONLY) + goto error_super; + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + s->s_mode = mode; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, &args, flags & MS_SILENT ? 
1 : 0); + if (error) + goto error_super; + s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; + } + + sdp = s->s_fs_info; + if (args.ar_meta) + return dget(sdp->sd_master_dir); + else + return dget(sdp->sd_root_dir); + +error_super: + deactivate_locked_super(s); + return ERR_PTR(error); +error_bdev: + blkdev_put(bdev, mode); + return ERR_PTR(error); +} + +static int set_meta_super(struct super_block *s, void *ptr) +{ + return -EINVAL; +} + +static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct super_block *s; + struct gfs2_sbd *sdp; + struct path path; + int error; + + error = kern_path(dev_name, LOOKUP_FOLLOW, &path); + if (error) { + printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", + dev_name, error); + return ERR_PTR(error); + } + s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, + path.dentry->d_inode->i_sb->s_bdev); + path_put(&path); + if (IS_ERR(s)) { + printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); + return ERR_CAST(s); + } + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + return ERR_PTR(-EBUSY); + } + sdp = s->s_fs_info; + return dget(sdp->sd_master_dir); +} + +static void gfs2_kill_sb(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + if (sdp == NULL) { + kill_block_super(sb); + return; + } + + gfs2_meta_syncfs(sdp); + dput(sdp->sd_root_dir); + dput(sdp->sd_master_dir); + sdp->sd_root_dir = NULL; + sdp->sd_master_dir = NULL; + shrink_dcache_sb(sb); + kill_block_super(sb); + gfs2_delete_debugfs_file(sdp); + kfree(sdp); +} + +struct file_system_type gfs2_fs_type = { + .name = "gfs2", + .fs_flags = FS_REQUIRES_DEV, + .mount = gfs2_mount, + .kill_sb = gfs2_kill_sb, + .owner = THIS_MODULE, +}; + +struct file_system_type gfs2meta_fs_type = { + .name = "gfs2meta", + .fs_flags = FS_REQUIRES_DEV, + .mount = gfs2_mount_meta, + .owner = THIS_MODULE, +}; + diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c new file mode 100644 index 00000000..42e8d23b --- /dev/null +++ b/fs/gfs2/quota.c @@ -0,0 +1,1644 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +/* + * Quota change tags are associated with each transaction that allocates or + * deallocates space. Those changes are accumulated locally to each node (in a + * per-node file) and then are periodically synced to the quota file. This + * avoids the bottleneck of constantly touching the quota file, but introduces + * fuzziness in the current usage value of IDs that are being used on different + * nodes in the cluster simultaneously. So, it is possible for a user on + * multiple nodes to overrun their quota, but that overrun is controlable. + * Since quota tags are part of transactions, there is no need for a quota check + * program to be run on node crashes or anything like that. + * + * There are couple of knobs that let the administrator manage the quota + * fuzziness. "quota_quantum" sets the maximum time a quota change can be + * sitting on one node before being synced to the quota file. (The default is + * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency + * of quota file syncs increases as the user moves closer to their limit. 
The + * more frequent the syncs, the more accurate the quota enforcement, but that + * means that there is more contention between the nodes for the quota file. + * The default value is one. This sets the maximum theoretical quota overrun + * (with infinite node with infinite bandwidth) to twice the user's limit. (In + * practice, the maximum overrun you see should be much less.) A "quota_scale" + * number greater than one makes quota syncs more frequent and reduces the + * maximum overrun. Numbers less than one (but greater than zero) make quota + * syncs less frequent. + * + * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of + * the quota file, so it is not being constantly read. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "inode.h" +#include "util.h" + +#define QUOTA_USER 1 +#define QUOTA_GROUP 0 + +struct gfs2_quota_change_host { + u64 qc_change; + u32 qc_flags; /* GFS2_QCF_... */ + u32 qc_id; +}; + +static LIST_HEAD(qd_lru_list); +static atomic_t qd_lru_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(qd_lru_lock); + +int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc) +{ + struct gfs2_quota_data *qd; + struct gfs2_sbd *sdp; + int nr_to_scan = sc->nr_to_scan; + + if (nr_to_scan == 0) + goto out; + + if (!(sc->gfp_mask & __GFP_FS)) + return -1; + + spin_lock(&qd_lru_lock); + while (nr_to_scan && !list_empty(&qd_lru_list)) { + qd = list_entry(qd_lru_list.next, + struct gfs2_quota_data, qd_reclaim); + sdp = qd->qd_gl->gl_sbd; + + /* Free from the filesystem-specific list */ + list_del(&qd->qd_list); + + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_glock_put(qd->qd_gl); + atomic_dec(&sdp->sd_quota_count); + + /* Delete it from the common reclaim list */ + list_del_init(&qd->qd_reclaim); + atomic_dec(&qd_lru_count); + spin_unlock(&qd_lru_lock); + kmem_cache_free(gfs2_quotad_cachep, qd); + spin_lock(&qd_lru_lock); + nr_to_scan--; + } + spin_unlock(&qd_lru_lock); + +out: + return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100; +} + +static u64 qd2offset(struct gfs2_quota_data *qd) +{ + u64 offset; + + offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags); + offset *= sizeof(struct gfs2_quota); + + return offset; +} + +static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd; + int error; + + qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); + if (!qd) + return -ENOMEM; + + atomic_set(&qd->qd_count, 1); + qd->qd_id = id; + if (user) + set_bit(QDF_USER, &qd->qd_flags); + qd->qd_slot = -1; + INIT_LIST_HEAD(&qd->qd_reclaim); + + error = gfs2_glock_get(sdp, 2 * (u64)id + !user, + &gfs2_quota_glops, CREATE, &qd->qd_gl); + if (error) + goto fail; + + *qdp = qd; + + return 0; + +fail: + kmem_cache_free(gfs2_quotad_cachep, qd); + return error; +} + +static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL, *new_qd = NULL; + int error, found; + + *qdp = NULL; + + for (;;) { + found = 0; + spin_lock(&qd_lru_lock); + list_for_each_entry(qd, &sdp->sd_quota_list, 
qd_list) { + if (qd->qd_id == id && + !test_bit(QDF_USER, &qd->qd_flags) == !user) { + if (!atomic_read(&qd->qd_count) && + !list_empty(&qd->qd_reclaim)) { + /* Remove it from reclaim list */ + list_del_init(&qd->qd_reclaim); + atomic_dec(&qd_lru_count); + } + atomic_inc(&qd->qd_count); + found = 1; + break; + } + } + + if (!found) + qd = NULL; + + if (!qd && new_qd) { + qd = new_qd; + list_add(&qd->qd_list, &sdp->sd_quota_list); + atomic_inc(&sdp->sd_quota_count); + new_qd = NULL; + } + + spin_unlock(&qd_lru_lock); + + if (qd) { + if (new_qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); + } + *qdp = qd; + return 0; + } + + error = qd_alloc(sdp, user, id, &new_qd); + if (error) + return error; + } +} + +static void qd_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + gfs2_assert(sdp, atomic_read(&qd->qd_count)); + atomic_inc(&qd->qd_count); +} + +static void qd_put(struct gfs2_quota_data *qd) +{ + if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) { + /* Add to the reclaim list */ + list_add_tail(&qd->qd_reclaim, &qd_lru_list); + atomic_inc(&qd_lru_count); + spin_unlock(&qd_lru_lock); + } +} + +static int slot_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + unsigned int c, o = 0, b; + unsigned char byte = 0; + + spin_lock(&qd_lru_lock); + + if (qd->qd_slot_count++) { + spin_unlock(&qd_lru_lock); + return 0; + } + + for (c = 0; c < sdp->sd_quota_chunks; c++) + for (o = 0; o < PAGE_SIZE; o++) { + byte = sdp->sd_quota_bitmap[c][o]; + if (byte != 0xFF) + goto found; + } + + goto fail; + +found: + for (b = 0; b < 8; b++) + if (!(byte & (1 << b))) + break; + qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; + + if (qd->qd_slot >= sdp->sd_quota_slots) + goto fail; + + sdp->sd_quota_bitmap[c][o] |= 1 << b; + + spin_unlock(&qd_lru_lock); + + return 0; + +fail: + qd->qd_slot_count--; + spin_unlock(&qd_lru_lock); + return -ENOSPC; +} + +static void slot_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&qd_lru_lock); + gfs2_assert(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + spin_unlock(&qd_lru_lock); +} + +static void slot_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&qd_lru_lock); + gfs2_assert(sdp, qd->qd_slot_count); + if (!--qd->qd_slot_count) { + gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + qd->qd_slot = -1; + } + spin_unlock(&qd_lru_lock); +} + +static int bh_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + unsigned int block, offset; + struct buffer_head *bh; + int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + + mutex_lock(&sdp->sd_quota_mutex); + + if (qd->qd_bh_count++) { + mutex_unlock(&sdp->sd_quota_mutex); + return 0; + } + + block = qd->qd_slot / sdp->sd_qc_per_block; + offset = qd->qd_slot % sdp->sd_qc_per_block; + + bh_map.b_size = 1 << ip->i_inode.i_blkbits; + error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); + if (error) + goto fail; + error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); + if (error) + goto fail; + error = -EIO; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) + goto fail_brelse; + + qd->qd_bh = bh; + qd->qd_bh_qc = (struct gfs2_quota_change *) + (bh->b_data + sizeof(struct gfs2_meta_header) + + offset * sizeof(struct gfs2_quota_change)); + + mutex_unlock(&sdp->sd_quota_mutex); + + return 0; + +fail_brelse: + 
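+	/* Error unwind: release the buffer read above, drop the bh reference
+	 * count taken at the top, and let go of the quota mutex. */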
brelse(bh); +fail: + qd->qd_bh_count--; + mutex_unlock(&sdp->sd_quota_mutex); + return error; +} + +static void bh_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_assert(sdp, qd->qd_bh_count); + if (!--qd->qd_bh_count) { + brelse(qd->qd_bh); + qd->qd_bh = NULL; + qd->qd_bh_qc = NULL; + } + mutex_unlock(&sdp->sd_quota_mutex); +} + +static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL; + int error; + int found = 0; + + *qdp = NULL; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return 0; + + spin_lock(&qd_lru_lock); + + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags) || + qd->qd_sync_gen >= sdp->sd_quota_sync_gen) + continue; + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + + set_bit(QDF_LOCKED, &qd->qd_flags); + gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); + atomic_inc(&qd->qd_count); + qd->qd_change_sync = qd->qd_change; + gfs2_assert_warn(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + found = 1; + + break; + } + + if (!found) + qd = NULL; + + spin_unlock(&qd_lru_lock); + + if (qd) { + gfs2_assert_warn(sdp, qd->qd_change_sync); + error = bh_get(qd); + if (error) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return error; + } + } + + *qdp = qd; + + return 0; +} + +static int qd_trylock(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return 0; + + spin_lock(&qd_lru_lock); + + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags)) { + spin_unlock(&qd_lru_lock); + return 0; + } + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + + set_bit(QDF_LOCKED, &qd->qd_flags); + gfs2_assert_warn(sdp, atomic_read(&qd->qd_count)); + atomic_inc(&qd->qd_count); + qd->qd_change_sync = qd->qd_change; + gfs2_assert_warn(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + + spin_unlock(&qd_lru_lock); + + gfs2_assert_warn(sdp, qd->qd_change_sync); + if (bh_get(qd)) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return 0; + } + + return 1; +} + +static void qd_unlock(struct gfs2_quota_data *qd) +{ + gfs2_assert_warn(qd->qd_gl->gl_sbd, + test_bit(QDF_LOCKED, &qd->qd_flags)); + clear_bit(QDF_LOCKED, &qd->qd_flags); + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, + struct gfs2_quota_data **qdp) +{ + int error; + + error = qd_get(sdp, user, id, qdp); + if (error) + return error; + + error = slot_get(*qdp); + if (error) + goto fail; + + error = bh_get(*qdp); + if (error) + goto fail_slot; + + return 0; + +fail_slot: + slot_put(*qdp); +fail: + qd_put(*qdp); + return error; +} + +static void qdsb_put(struct gfs2_quota_data *qd) +{ + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_quota_data **qd = al->al_qd; + int error; + + if (gfs2_assert_warn(sdp, !al->al_qd_num) || + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) + return -EIO; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + + error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd); + if (error) + goto out; + 
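+	/* The inode's group quota is now held; count it next to the user quota above. */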
al->al_qd_num++; + qd++; + + if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { + error = qdsb_get(sdp, QUOTA_USER, uid, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + } + + if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) { + error = qdsb_get(sdp, QUOTA_GROUP, gid, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + } + +out: + if (error) + gfs2_quota_unhold(ip); + return error; +} + +void gfs2_quota_unhold(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + unsigned int x; + + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); + + for (x = 0; x < al->al_qd_num; x++) { + qdsb_put(al->al_qd[x]); + al->al_qd[x] = NULL; + } + al->al_qd_num = 0; +} + +static int sort_qd(const void *a, const void *b) +{ + const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; + const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; + + if (!test_bit(QDF_USER, &qd_a->qd_flags) != + !test_bit(QDF_USER, &qd_b->qd_flags)) { + if (test_bit(QDF_USER, &qd_a->qd_flags)) + return -1; + else + return 1; + } + if (qd_a->qd_id < qd_b->qd_id) + return -1; + if (qd_a->qd_id > qd_b->qd_id) + return 1; + + return 0; +} + +static void do_qc(struct gfs2_quota_data *qd, s64 change) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + struct gfs2_quota_change *qc = qd->qd_bh_qc; + s64 x; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); + + if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { + qc->qc_change = 0; + qc->qc_flags = 0; + if (test_bit(QDF_USER, &qd->qd_flags)) + qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); + qc->qc_id = cpu_to_be32(qd->qd_id); + } + + x = be64_to_cpu(qc->qc_change) + change; + qc->qc_change = cpu_to_be64(x); + + spin_lock(&qd_lru_lock); + qd->qd_change = x; + spin_unlock(&qd_lru_lock); + + if (!x) { + gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); + clear_bit(QDF_CHANGE, &qd->qd_flags); + qc->qc_flags = 0; + qc->qc_id = 0; + slot_put(qd); + qd_put(qd); + } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) { + qd_hold(qd); + slot_hold(qd); + } + + mutex_unlock(&sdp->sd_quota_mutex); +} + +/** + * gfs2_adjust_quota - adjust record of current block usage + * @ip: The quota inode + * @loc: Offset of the entry in the quota file + * @change: The amount of usage change to record + * @qd: The quota data + * @fdq: The updated limits to record + * + * This function was mostly borrowed from gfs2_block_truncate_page which was + * in turn mostly borrowed from ext3 + * + * Returns: 0 or -ve on error + */ + +static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, + s64 change, struct gfs2_quota_data *qd, + struct fs_disk_quota *fdq) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long index = loc >> PAGE_CACHE_SHIFT; + unsigned offset = loc & (PAGE_CACHE_SIZE - 1); + unsigned blocksize, iblock, pos; + struct buffer_head *bh, *dibh; + struct page *page; + void *kaddr, *ptr; + struct gfs2_quota q, *qp; + int err, nbytes; + u64 size; + + if (gfs2_is_stuffed(ip)) + gfs2_unstuff_dinode(ip, NULL); + + memset(&q, 0, sizeof(struct gfs2_quota)); + err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); + if (err < 0) + return err; + + err = -EIO; + qp = &q; + qp->qu_value = be64_to_cpu(qp->qu_value); + qp->qu_value += change; + qp->qu_value = cpu_to_be64(qp->qu_value); + 
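+	/* Mirror the adjusted (big-endian) usage value into the cached quota LVB. */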
qd->qd_qb.qb_value = qp->qu_value; + if (fdq) { + if (fdq->d_fieldmask & FS_DQ_BSOFT) { + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_warn = qp->qu_warn; + } + if (fdq->d_fieldmask & FS_DQ_BHARD) { + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_limit = qp->qu_limit; + } + if (fdq->d_fieldmask & FS_DQ_BCOUNT) { + qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = qp->qu_value; + } + } + + /* Write the quota into the quota file on disk */ + ptr = qp; + nbytes = sizeof(struct gfs2_quota); +get_a_page: + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + blocksize = inode->i_sb->s_blocksize; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, iblock, bh, 1); + if (!buffer_mapped(bh)) + goto unlock_out; + /* If it's a newly allocated disk block for quota, zero it */ + if (buffer_new(bh)) + zero_user(page, pos - blocksize, bh->b_size); + } + + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock_out; + } + + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + kaddr = kmap_atomic(page, KM_USER0); + if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) + nbytes = PAGE_CACHE_SIZE - offset; + memcpy(kaddr + offset, ptr, nbytes); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + page_cache_release(page); + + /* If quota straddles page boundary, we need to update the rest of the + * quota at the beginning of the next page */ + if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { + ptr = ptr + nbytes; + nbytes = sizeof(struct gfs2_quota) - nbytes; + offset = 0; + index++; + goto get_a_page; + } + + /* Update the disk inode timestamp and size (if extended) */ + err = gfs2_meta_inode_buffer(ip, &dibh); + if (err) + goto out; + + size = loc + sizeof(struct gfs2_quota); + if (size > inode->i_size) + i_size_write(inode, size); + inode->i_mtime = inode->i_atime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + mark_inode_dirty(inode); + +out: + return err; +unlock_out: + unlock_page(page); + page_cache_release(page); + return err; +} + +static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) +{ + struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + unsigned int data_blocks, ind_blocks; + struct gfs2_holder *ghs, i_gh; + unsigned int qx, x; + struct gfs2_quota_data *qd; + loff_t offset; + unsigned int nalloc = 0, blocks; + struct gfs2_alloc *al = NULL; + int error; + + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + + ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); + if (!ghs) + return -ENOMEM; + + sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); + mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA); + for (qx = 0; qx < num_qd; qx++) { + error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &ghs[qx]); + if (error) + goto out; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if 
(error) + goto out; + + for (x = 0; x < num_qd; x++) { + offset = qd2offset(qda[x]); + if (gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota))) + nalloc++; + } + + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } + /* + * 1 blk for unstuffing inode if stuffed. We add this extra + * block to the reservation unconditionally. If the inode + * doesn't need unstuffing, the block will be released to the + * rgrp since it won't be allocated during the transaction + */ + al->al_requested = 1; + /* +3 in the end for unstuffing block, inode size update block + * and another block in case quota straddles page boundary and + * two blocks need to be updated instead of 1 */ + blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; + + if (nalloc) + al->al_requested += nalloc * (data_blocks + ind_blocks); + error = gfs2_inplace_reserve(ip); + if (error) + goto out_alloc; + + if (nalloc) + blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; + + error = gfs2_trans_begin(sdp, blocks, 0); + if (error) + goto out_ipres; + + for (x = 0; x < num_qd; x++) { + qd = qda[x]; + offset = qd2offset(qd); + error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL); + if (error) + goto out_end_trans; + + do_qc(qd, -qd->qd_change_sync); + set_bit(QDF_REFRESH, &qd->qd_flags); + } + + error = 0; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + gfs2_inplace_release(ip); +out_alloc: + gfs2_alloc_put(ip); +out_gunlock: + gfs2_glock_dq_uninit(&i_gh); +out: + while (qx--) + gfs2_glock_dq_uninit(&ghs[qx]); + mutex_unlock(&ip->i_inode.i_mutex); + kfree(ghs); + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); + return error; +} + +static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota q; + struct gfs2_quota_lvb *qlvb; + loff_t pos; + int error; + + memset(&q, 0, sizeof(struct gfs2_quota)); + pos = qd2offset(qd); + error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q)); + if (error < 0) + return error; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); + qlvb->__pad = 0; + qlvb->qb_limit = q.qu_limit; + qlvb->qb_warn = q.qu_warn; + qlvb->qb_value = q.qu_value; + qd->qd_qb = *qlvb; + + return 0; +} + +static int do_glock(struct gfs2_quota_data *qd, int force_refresh, + struct gfs2_holder *q_gh) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_holder i_gh; + int error; + +restart: + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); + if (error) + return error; + + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + + if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { + gfs2_glock_dq_uninit(q_gh); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, q_gh); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + if (error) + goto fail; + + error = update_qd(sdp, qd); + if (error) + goto fail_gunlock; + + gfs2_glock_dq_uninit(&i_gh); + gfs2_glock_dq_uninit(q_gh); + force_refresh = 0; + goto restart; + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + gfs2_glock_dq_uninit(q_gh); + return error; +} + +int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_quota_data *qd; + unsigned int x; + int error = 0; + + 
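	/*
+	 * gfs2_quota_hold() pins the quota data entries for this inode; when
+	 * quota enforcement is on we then take a shared glock on each one.
+	 * The entries are sorted first (see sort_qd: user entries before
+	 * group entries, then by id) so that any two lockers acquire them in
+	 * the same order.
+	 */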
+	gfs2_quota_hold(ip, uid, gid);
+
+	if (capable(CAP_SYS_RESOURCE) ||
+	    sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+		return 0;
+
+	sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
+	     sort_qd, NULL);
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		int force = NO_FORCE;
+		qd = al->al_qd[x];
+		if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
+			force = FORCE;
+		error = do_glock(qd, force, &al->al_qd_ghs[x]);
+		if (error)
+			break;
+	}
+
+	if (!error)
+		set_bit(GIF_QD_LOCKED, &ip->i_flags);
+	else {
+		while (x--)
+			gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+		gfs2_quota_unhold(ip);
+	}
+
+	return error;
+}
+
+static int need_sync(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	s64 value;
+	unsigned int num, den;
+	int do_sync = 1;
+
+	if (!qd->qd_qb.qb_limit)
+		return 0;
+
+	spin_lock(&qd_lru_lock);
+	value = qd->qd_change;
+	spin_unlock(&qd_lru_lock);
+
+	spin_lock(&gt->gt_spin);
+	num = gt->gt_quota_scale_num;
+	den = gt->gt_quota_scale_den;
+	spin_unlock(&gt->gt_spin);
+
+	if (value < 0)
+		do_sync = 0;
+	else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
+		 (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+		do_sync = 0;
+	else {
+		value *= gfs2_jindex_size(sdp) * num;
+		value = div_s64(value, den);
+		value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
+		if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+			do_sync = 0;
+	}
+
+	return do_sync;
+}
+
+void gfs2_quota_unlock(struct gfs2_inode *ip)
+{
+	struct gfs2_alloc *al = ip->i_alloc;
+	struct gfs2_quota_data *qda[4];
+	unsigned int count = 0;
+	unsigned int x;
+
+	if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
+		goto out;
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		struct gfs2_quota_data *qd;
+		int sync;
+
+		qd = al->al_qd[x];
+		sync = need_sync(qd);
+
+		gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+
+		if (sync && qd_trylock(qd))
+			qda[count++] = qd;
+	}
+
+	if (count) {
+		do_sync(count, qda);
+		for (x = 0; x < count; x++)
+			qd_unlock(qda[x]);
+	}
+
+out:
+	gfs2_quota_unhold(ip);
+}
+
+#define MAX_LINE 256
+
+static int print_message(struct gfs2_quota_data *qd, char *type)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
+	       sdp->sd_fsname, type,
+	       (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
+	       qd->qd_id);
+
+	return 0;
+}
+
+int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = ip->i_alloc;
+	struct gfs2_quota_data *qd;
+	s64 value;
+	unsigned int x;
+	int error = 0;
+
+	if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
+		return 0;
+
+	if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+		return 0;
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		qd = al->al_qd[x];
+
+		if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
+		      (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
+			continue;
+
+		value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
+		spin_lock(&qd_lru_lock);
+		value += qd->qd_change;
+		spin_unlock(&qd_lru_lock);
+
+		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
+			print_message(qd, "exceeded");
+			quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+ USRQUOTA : GRPQUOTA, qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); + + error = -EDQUOT; + break; + } else if (be64_to_cpu(qd->qd_qb.qb_warn) && + (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value && + time_after_eq(jiffies, qd->qd_last_warn + + gfs2_tune_get(sdp, + gt_quota_warn_period) * HZ)) { + quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ? + USRQUOTA : GRPQUOTA, qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); + error = print_message(qd, "warning"); + qd->qd_last_warn = jiffies; + } + } + + return error; +} + +void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + u32 uid, u32 gid) +{ + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_quota_data *qd; + unsigned int x; + + if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) + return; + if (ip->i_diskflags & GFS2_DIF_SYSTEM) + return; + + for (x = 0; x < al->al_qd_num; x++) { + qd = al->al_qd[x]; + + if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || + (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { + do_qc(qd, change); + } + } +} + +int gfs2_quota_sync(struct super_block *sb, int type, int wait) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_quota_data **qda; + unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); + unsigned int num_qd; + unsigned int x; + int error = 0; + + sdp->sd_quota_sync_gen++; + + qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); + if (!qda) + return -ENOMEM; + + do { + num_qd = 0; + + for (;;) { + error = qd_fish(sdp, qda + num_qd); + if (error || !qda[num_qd]) + break; + if (++num_qd == max_qd) + break; + } + + if (num_qd) { + if (!error) + error = do_sync(num_qd, qda); + if (!error) + for (x = 0; x < num_qd; x++) + qda[x]->qd_sync_gen = + sdp->sd_quota_sync_gen; + + for (x = 0; x < num_qd; x++) + qd_unlock(qda[x]); + } + } while (!error && num_qd == max_qd); + + kfree(qda); + + return error; +} + +static int gfs2_quota_sync_timeo(struct super_block *sb, int type) +{ + return gfs2_quota_sync(sb, type, 0); +} + +int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) +{ + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + error = qd_get(sdp, user, id, &qd); + if (error) + return error; + + error = do_glock(qd, FORCE, &q_gh); + if (!error) + gfs2_glock_dq_uninit(&q_gh); + + qd_put(qd); + return error; +} + +static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) +{ + const struct gfs2_quota_change *str = buf; + + qc->qc_change = be64_to_cpu(str->qc_change); + qc->qc_flags = be32_to_cpu(str->qc_flags); + qc->qc_id = be32_to_cpu(str->qc_id); +} + +int gfs2_quota_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + u64 size = i_size_read(sdp->sd_qc_inode); + unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; + unsigned int x, slot = 0; + unsigned int found = 0; + u64 dblock; + u32 extlen = 0; + int error; + + if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) + return -EIO; + + sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; + sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); + + error = -ENOMEM; + + sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, + sizeof(unsigned char *), GFP_NOFS); + if (!sdp->sd_quota_bitmap) + return error; + + for (x = 0; x < sdp->sd_quota_chunks; x++) { + sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS); + if (!sdp->sd_quota_bitmap[x]) + goto fail; + } + + for (x = 0; x < blocks; x++) { + struct buffer_head *bh; + unsigned int y; + + if (!extlen) { + int new = 
0; + error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen); + if (error) + goto fail; + } + error = -EIO; + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + if (!bh) + goto fail; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) { + brelse(bh); + goto fail; + } + + for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; + y++, slot++) { + struct gfs2_quota_change_host qc; + struct gfs2_quota_data *qd; + + gfs2_quota_change_in(&qc, bh->b_data + + sizeof(struct gfs2_meta_header) + + y * sizeof(struct gfs2_quota_change)); + if (!qc.qc_change) + continue; + + error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER), + qc.qc_id, &qd); + if (error) { + brelse(bh); + goto fail; + } + + set_bit(QDF_CHANGE, &qd->qd_flags); + qd->qd_change = qc.qc_change; + qd->qd_slot = slot; + qd->qd_slot_count = 1; + + spin_lock(&qd_lru_lock); + gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + list_add(&qd->qd_list, &sdp->sd_quota_list); + atomic_inc(&sdp->sd_quota_count); + spin_unlock(&qd_lru_lock); + + found++; + } + + brelse(bh); + dblock++; + extlen--; + } + + if (found) + fs_info(sdp, "found %u quota changes\n", found); + + return 0; + +fail: + gfs2_quota_cleanup(sdp); + return error; +} + +void gfs2_quota_cleanup(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_quota_list; + struct gfs2_quota_data *qd; + unsigned int x; + + spin_lock(&qd_lru_lock); + while (!list_empty(head)) { + qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); + + if (atomic_read(&qd->qd_count) > 1 || + (atomic_read(&qd->qd_count) && + !test_bit(QDF_CHANGE, &qd->qd_flags))) { + list_move(&qd->qd_list, head); + spin_unlock(&qd_lru_lock); + schedule(); + spin_lock(&qd_lru_lock); + continue; + } + + list_del(&qd->qd_list); + /* Also remove if this qd exists in the reclaim list */ + if (!list_empty(&qd->qd_reclaim)) { + list_del_init(&qd->qd_reclaim); + atomic_dec(&qd_lru_count); + } + atomic_dec(&sdp->sd_quota_count); + spin_unlock(&qd_lru_lock); + + if (!atomic_read(&qd->qd_count)) { + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + } else + gfs2_assert_warn(sdp, qd->qd_slot_count == 1); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_glock_put(qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, qd); + + spin_lock(&qd_lru_lock); + } + spin_unlock(&qd_lru_lock); + + gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); + + if (sdp->sd_quota_bitmap) { + for (x = 0; x < sdp->sd_quota_chunks; x++) + kfree(sdp->sd_quota_bitmap[x]); + kfree(sdp->sd_quota_bitmap); + } +} + +static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) +{ + if (error == 0 || error == -EROFS) + return; + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); +} + +static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, + int (*fxn)(struct super_block *sb, int type), + unsigned long t, unsigned long *timeo, + unsigned int *new_timeo) +{ + if (t >= *timeo) { + int error = fxn(sdp->sd_vfs, 0); + quotad_error(sdp, msg, error); + *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; + } else { + *timeo -= t; + } +} + +static void quotad_check_trunc_list(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + while(1) { + ip = NULL; + spin_lock(&sdp->sd_trunc_lock); + if (!list_empty(&sdp->sd_trunc_list)) { + ip = list_entry(sdp->sd_trunc_list.next, + struct gfs2_inode, i_trunc_list); + list_del_init(&ip->i_trunc_list); + } + spin_unlock(&sdp->sd_trunc_lock); + if (ip == NULL) + return; + 
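		/*
+		 * ip was pulled off sd_trunc_list above; finish its
+		 * interrupted truncate here, after sd_trunc_lock has been
+		 * dropped.
+		 */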
+		gfs2_glock_finish_truncate(ip);
+	}
+}
+
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
+	if (!sdp->sd_statfs_force_sync) {
+		sdp->sd_statfs_force_sync = 1;
+		wake_up(&sdp->sd_quota_wait);
+	}
+}
+
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	struct gfs2_tune *tune = &sdp->sd_tune;
+	unsigned long statfs_timeo = 0;
+	unsigned long quotad_timeo = 0;
+	unsigned long t = 0;
+	DEFINE_WAIT(wait);
+	int empty;
+
+	while (!kthread_should_stop()) {
+
+		/* Update the master statfs file */
+		if (sdp->sd_statfs_force_sync) {
+			int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
+			quotad_error(sdp, "statfs", error);
+			statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+		}
+		else
+			quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+					   &statfs_timeo,
+					   &tune->gt_statfs_quantum);
+
+		/* Update quota file */
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
+				   &quotad_timeo, &tune->gt_quota_quantum);
+
+		/* Check for & recover partially truncated inodes */
+		quotad_check_trunc_list(sdp);
+
+		if (freezing(current))
+			refrigerator();
+		t = min(quotad_timeo, statfs_timeo);
+
+		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
+		spin_lock(&sdp->sd_trunc_lock);
+		empty = list_empty(&sdp->sd_trunc_list);
+		spin_unlock(&sdp->sd_trunc_lock);
+		if (empty && !sdp->sd_statfs_force_sync)
+			t -= schedule_timeout(t);
+		else
+			t = 0;
+		finish_wait(&sdp->sd_quota_wait, &wait);
+	}
+
+	return 0;
+}
+
+static int gfs2_quota_get_xstate(struct super_block *sb,
+				 struct fs_quota_stat *fqs)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	memset(fqs, 0, sizeof(struct fs_quota_stat));
+	fqs->qs_version = FS_QSTAT_VERSION;
+
+	switch (sdp->sd_args.ar_quota) {
+	case GFS2_QUOTA_ON:
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
+		/*FALLTHRU*/
+	case GFS2_QUOTA_ACCOUNT:
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
+		break;
+	case GFS2_QUOTA_OFF:
+		break;
+	}
+
+	if (sdp->sd_quota_inode) {
+		fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
+		fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
+	}
+	fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
+	fqs->qs_gquota = fqs->qs_uquota; /* it's the same inode in both cases */
+	fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+	return 0;
+}
+
+static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
+			  struct fs_disk_quota *fdq)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_quota_lvb *qlvb;
+	struct gfs2_quota_data *qd;
+	struct gfs2_holder q_gh;
+	int error;
+
+	memset(fdq, 0, sizeof(struct fs_disk_quota));
+
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return -ESRCH; /* Crazy XFS error code */
+
+	if (type == USRQUOTA)
+		type = QUOTA_USER;
+	else if (type == GRPQUOTA)
+		type = QUOTA_GROUP;
+	else
+		return -EINVAL;
+
+	error = qd_get(sdp, type, id, &qd);
+	if (error)
+		return error;
+	error = do_glock(qd, FORCE, &q_gh);
+	if (error)
+		goto out;
+
+	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+	fdq->d_version = FS_DQUOT_VERSION;
+	fdq->d_flags = (type == QUOTA_USER) ?
FS_USER_QUOTA : FS_GROUP_QUOTA; + fdq->d_id = id; + fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; + fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; + fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; + + gfs2_glock_dq_uninit(&q_gh); +out: + qd_put(qd); + return error; +} + +/* GFS2 only supports a subset of the XFS fields */ +#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) + +static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh, i_gh; + unsigned int data_blocks, ind_blocks; + unsigned int blocks = 0; + int alloc_required; + struct gfs2_alloc *al; + loff_t offset; + int error; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + switch(type) { + case USRQUOTA: + type = QUOTA_USER; + if (fdq->d_flags != FS_USER_QUOTA) + return -EINVAL; + break; + case GRPQUOTA: + type = QUOTA_GROUP; + if (fdq->d_flags != FS_GROUP_QUOTA) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (fdq->d_fieldmask & ~GFS2_FIELDMASK) + return -EINVAL; + if (fdq->d_id != id) + return -EINVAL; + + error = qd_get(sdp, type, id, &qd); + if (error) + return error; + + mutex_lock(&ip->i_inode.i_mutex); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); + if (error) + goto out_put; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out_q; + + /* Check for existing entry, if none then alloc new blocks */ + error = update_qd(sdp, qd); + if (error) + goto out_i; + + /* If nothing has changed, this is a no-op */ + if ((fdq->d_fieldmask & FS_DQ_BSOFT) && + ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) + fdq->d_fieldmask ^= FS_DQ_BSOFT; + + if ((fdq->d_fieldmask & FS_DQ_BHARD) && + ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) + fdq->d_fieldmask ^= FS_DQ_BHARD; + + if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && + ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= FS_DQ_BCOUNT; + + if (fdq->d_fieldmask == 0) + goto out_i; + + offset = qd2offset(qd); + alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); + if (gfs2_is_stuffed(ip)) + alloc_required = 1; + if (alloc_required) { + al = gfs2_alloc_get(ip); + if (al == NULL) + goto out_i; + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + blocks = al->al_requested = 1 + data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_alloc; + blocks += gfs2_rg_blocks(al); + } + + /* Some quotas span block boundaries and can update two blocks, + adding an extra block to the transaction to handle such quotas */ + error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0); + if (error) + goto out_release; + + /* Apply changes */ + error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + + gfs2_trans_end(sdp); +out_release: + if (alloc_required) { + gfs2_inplace_release(ip); +out_alloc: + gfs2_alloc_put(ip); + } +out_i: + gfs2_glock_dq_uninit(&i_gh); +out_q: + gfs2_glock_dq_uninit(&q_gh); +out_put: + mutex_unlock(&ip->i_inode.i_mutex); + qd_put(qd); + return error; +} + +const struct quotactl_ops gfs2_quotactl_ops = { + .quota_sync = gfs2_quota_sync, + .get_xstate = gfs2_quota_get_xstate, + 
.get_dqblk = gfs2_get_dqblk, + .set_dqblk = gfs2_set_dqblk, +}; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h new file mode 100644 index 00000000..90bf1c30 --- /dev/null +++ b/fs/gfs2/quota.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __QUOTA_DOT_H__ +#define __QUOTA_DOT_H__ + +struct gfs2_inode; +struct gfs2_sbd; +struct shrink_control; + +#define NO_QUOTA_CHANGE ((u32)-1) + +extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_unhold(struct gfs2_inode *ip); + +extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_unlock(struct gfs2_inode *ip); + +extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + u32 uid, u32 gid); + +extern int gfs2_quota_sync(struct super_block *sb, int type, int wait); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); + +extern int gfs2_quota_init(struct gfs2_sbd *sdp); +extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +extern int gfs2_quotad(void *data); + +extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); + +static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int ret; + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (ret) + return ret; + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); + if (ret) + gfs2_quota_unlock(ip); + return ret; +} + +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, + struct shrink_control *sc); +extern const struct quotactl_ops gfs2_quotactl_ops; + +#endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c new file mode 100644 index 00000000..f2a02edc --- /dev/null +++ b/fs/gfs2/recovery.c @@ -0,0 +1,610 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "super.h" +#include "util.h" +#include "dir.h" + +struct workqueue_struct *gfs_recovery_wq; + +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + int new = 0; + u64 dblock; + u32 extlen; + int error; + + error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen); + if (error) + return error; + if (!dblock) { + gfs2_consist_inode(ip); + return -EIO; + } + + *bh = gfs2_meta_ra(gl, dblock, extlen); + + return error; +} + +int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +{ + struct list_head *head = &sdp->sd_revoke_list; + struct gfs2_revoke_replay *rr; + int found = 0; + + list_for_each_entry(rr, head, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (found) { + rr->rr_where = where; + return 0; + } + + rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS); + if (!rr) + return -ENOMEM; + + rr->rr_blkno = blkno; + rr->rr_where = where; + list_add(&rr->rr_list, head); + + return 1; +} + +int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +{ + struct gfs2_revoke_replay *rr; + int wrap, a, b, revoke; + int found = 0; + + list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (!found) + return 0; + + wrap = (rr->rr_where < sdp->sd_replay_tail); + a = (sdp->sd_replay_tail < where); + b = (where < rr->rr_where); + revoke = (wrap) ? (a || b) : (a && b); + + return revoke; +} + +void gfs2_revoke_clean(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_revoke_list; + struct gfs2_revoke_replay *rr; + + while (!list_empty(head)) { + rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list); + list_del(&rr->rr_list); + kfree(rr); + } +} + +static int gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf) +{ + const struct gfs2_log_header *str = buf; + + if (str->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + str->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH)) + return 1; + + lh->lh_sequence = be64_to_cpu(str->lh_sequence); + lh->lh_flags = be32_to_cpu(str->lh_flags); + lh->lh_tail = be32_to_cpu(str->lh_tail); + lh->lh_blkno = be32_to_cpu(str->lh_blkno); + lh->lh_hash = be32_to_cpu(str->lh_hash); + return 0; +} + +/** + * get_log_header - read the log header for a given segment + * @jd: the journal + * @blk: the block to look at + * @lh: the log header to return + * + * Read the log header for a given segement in a given journal. Do a few + * sanity checks on it. 
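+ * The checks are: the header magic and type must be valid (see
+ * gfs2_log_header_in), lh_blkno must match the block that was read, and
+ * lh_hash must match a CRC32 of the header computed with the hash field
+ * itself taken as zero.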
+ *
+ * Returns: 0 on success,
+ *          1 if the header was invalid or incomplete,
+ *          errno on error
+ */
+
+static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
+			  struct gfs2_log_header_host *head)
+{
+	struct buffer_head *bh;
+	struct gfs2_log_header_host uninitialized_var(lh);
+	const u32 nothing = 0;
+	u32 hash;
+	int error;
+
+	error = gfs2_replay_read_block(jd, blk, &bh);
+	if (error)
+		return error;
+
+	hash = crc32_le((u32)~0, bh->b_data, sizeof(struct gfs2_log_header) -
+					     sizeof(u32));
+	hash = crc32_le(hash, (unsigned char const *)&nothing, sizeof(nothing));
+	hash ^= (u32)~0;
+	error = gfs2_log_header_in(&lh, bh->b_data);
+	brelse(bh);
+
+	if (error || lh.lh_blkno != blk || lh.lh_hash != hash)
+		return 1;
+
+	*head = lh;
+
+	return 0;
+}
+
+/**
+ * find_good_lh - find a good log header
+ * @jd: the journal
+ * @blk: the segment to start searching from
+ * @lh: the log header to fill in
+ * @forward: if true search forward in the log, else search backward
+ *
+ * Call get_log_header() to get a log header for a segment, but if the
+ * segment is bad, either scan forward or backward until we find a good one.
+ *
+ * Returns: errno
+ */
+
+static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
+			struct gfs2_log_header_host *head)
+{
+	unsigned int orig_blk = *blk;
+	int error;
+
+	for (;;) {
+		error = get_log_header(jd, *blk, head);
+		if (error <= 0)
+			return error;
+
+		if (++*blk == jd->jd_blocks)
+			*blk = 0;
+
+		if (*blk == orig_blk) {
+			gfs2_consist_inode(GFS2_I(jd->jd_inode));
+			return -EIO;
+		}
+	}
+}
+
+/**
+ * jhead_scan - make sure we've found the head of the log
+ * @jd: the journal
+ * @head: this is filled in with the log descriptor of the head
+ *
+ * At this point, seg and lh should be either the head of the log or just
+ * before.  Scan forward until we find the head.
+ *
+ * Returns: errno
+ */
+
+static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
+{
+	unsigned int blk = head->lh_blkno;
+	struct gfs2_log_header_host lh;
+	int error;
+
+	for (;;) {
+		if (++blk == jd->jd_blocks)
+			blk = 0;
+
+		error = get_log_header(jd, blk, &lh);
+		if (error < 0)
+			return error;
+		if (error == 1)
+			continue;
+
+		if (lh.lh_sequence == head->lh_sequence) {
+			gfs2_consist_inode(GFS2_I(jd->jd_inode));
+			return -EIO;
+		}
+		if (lh.lh_sequence < head->lh_sequence)
+			break;
+
+		*head = lh;
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_find_jhead - find the head of a log
+ * @jd: the journal
+ * @head: the log descriptor for the head of the log is returned here
+ *
+ * Do a binary search of a journal and find the valid log entry with the
+ * highest sequence number.  (i.e.
the log head) + * + * Returns: errno + */ + +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_log_header_host lh_1, lh_m; + u32 blk_1, blk_2, blk_m; + int error; + + blk_1 = 0; + blk_2 = jd->jd_blocks - 1; + + for (;;) { + blk_m = (blk_1 + blk_2) / 2; + + error = find_good_lh(jd, &blk_1, &lh_1); + if (error) + return error; + + error = find_good_lh(jd, &blk_m, &lh_m); + if (error) + return error; + + if (blk_1 == blk_m || blk_m == blk_2) + break; + + if (lh_1.lh_sequence <= lh_m.lh_sequence) + blk_1 = blk_m; + else + blk_2 = blk_m; + } + + error = jhead_scan(jd, &lh_1); + if (error) + return error; + + *head = lh_1; + + return error; +} + +/** + * foreach_descriptor - go through the active part of the log + * @jd: the journal + * @start: the first log header in the active region + * @end: the last log header (don't process the contents of this entry)) + * + * Call a given function once for every log descriptor in the active + * portion of the log. + * + * Returns: errno + */ + +static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, + unsigned int end, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct buffer_head *bh; + struct gfs2_log_descriptor *ld; + int error = 0; + u32 length; + __be64 *ptr; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + offset += sizeof(__be64) - 1; + offset &= ~(sizeof(__be64) - 1); + + while (start != end) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + if (gfs2_meta_check(sdp, bh)) { + brelse(bh); + return -EIO; + } + ld = (struct gfs2_log_descriptor *)bh->b_data; + length = be32_to_cpu(ld->ld_length); + + if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) { + struct gfs2_log_header_host lh; + error = get_log_header(jd, start, &lh); + if (!error) { + gfs2_replay_incr_blk(sdp, &start); + brelse(bh); + continue; + } + if (error == 1) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + error = -EIO; + } + brelse(bh); + return error; + } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) { + brelse(bh); + return -EIO; + } + ptr = (__be64 *)(bh->b_data + offset); + error = lops_scan_elements(jd, start, ld, ptr, pass); + if (error) { + brelse(bh); + return error; + } + + while (length--) + gfs2_replay_incr_blk(sdp, &start); + + brelse(bh); + } + + return 0; +} + +/** + * clean_journal - mark a dirty journal as being clean + * @sdp: the filesystem + * @jd: the journal + * @gl: the journal's glock + * @head: the head journal to start from + * + * Returns: errno + */ + +static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int lblock; + struct gfs2_log_header *lh; + u32 hash; + struct buffer_head *bh; + int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + + lblock = head->lh_blkno; + gfs2_replay_incr_blk(sdp, &lblock); + bh_map.b_size = 1 << ip->i_inode.i_blkbits; + error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0); + if (error) + return error; + if (!bh_map.b_blocknr) { + gfs2_consist_inode(ip); + return -EIO; + } + + bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = 
cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); + lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); + lh->lh_blkno = cpu_to_be32(lblock); + hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + set_buffer_dirty(bh); + if (sync_dirty_buffer(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + return error; +} + + +static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message) +{ + char env_jid[20]; + char env_status[20]; + char *envp[] = { env_jid, env_status, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid_done = jid; + ls->ls_recover_jid_status = message; + sprintf(env_jid, "JID=%d", jid); + sprintf(env_status, "RECOVERY=%s", + message == LM_RD_SUCCESS ? "Done" : "Failed"); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); +} + +void gfs2_recover_func(struct work_struct *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header_host head; + struct gfs2_holder j_gh, ji_gh, t_gh; + unsigned long t; + int ro = 0; + unsigned int pass; + int error; + int jlocked = 0; + + if (sdp->sd_args.ar_spectator || + (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { + fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", + jd->jd_jid); + jlocked = 1; + /* Acquire the journal lock so we can do recovery */ + + error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE, + &j_gh); + switch (error) { + case 0: + break; + + case GLR_TRYFAILED: + fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid); + error = 0; + + default: + goto fail; + }; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh); + if (error) + goto fail_gunlock_j; + } else { + fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); + } + + fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); + + error = gfs2_jdesc_check(jd); + if (error) + goto fail_gunlock_ji; + + error = gfs2_find_jhead(jd, &head); + if (error) + goto fail_gunlock_ji; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", + jd->jd_jid); + + t = jiffies; + + /* Acquire a shared hold on the transaction lock */ + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | LM_FLAG_PRIORITY | + GL_NOCACHE, &t_gh); + if (error) + goto fail_gunlock_ji; + + if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + ro = 1; + } else { + if (sdp->sd_vfs->s_flags & MS_RDONLY) { + /* check if device itself is read-only */ + ro = bdev_read_only(sdp->sd_vfs->s_bdev); + if (!ro) { + fs_info(sdp, "recovery required on " + "read-only filesystem.\n"); + fs_info(sdp, "write access will be " + "enabled during recovery.\n"); + } + } + } + + if (ro) { + fs_warn(sdp, "jid=%u: Can't replay: read-only block " + "device\n", jd->jd_jid); + error = -EROFS; + goto fail_gunlock_tr; + } + + fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); + + for (pass = 0; pass < 2; pass++) { + lops_before_scan(jd, &head, pass); + error = foreach_descriptor(jd, head.lh_tail, + head.lh_blkno, pass); + 
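			/*
+			 * foreach_descriptor() walks the active region of the
+			 * log, from head.lh_tail up to (but not including)
+			 * head.lh_blkno, handing each log descriptor to the
+			 * per-type scan hooks via lops_scan_elements().
+			 * Broadly, the first pass collects revokes and the
+			 * second replays the logged blocks.
+			 */
+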
lops_after_scan(jd, error, pass); + if (error) + goto fail_gunlock_tr; + } + + error = clean_journal(jd, &head); + if (error) + goto fail_gunlock_tr; + + gfs2_glock_dq_uninit(&t_gh); + t = DIV_ROUND_UP(jiffies - t, HZ); + fs_info(sdp, "jid=%u: Journal replayed in %lus\n", + jd->jd_jid, t); + } + + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); + + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); + goto done; + +fail_gunlock_tr: + gfs2_glock_dq_uninit(&t_gh); +fail_gunlock_ji: + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); +fail_gunlock_j: + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); +fail: + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); +done: + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); +} + +static int gfs2_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) +{ + int rv; + + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + + /* we have JDF_RECOVERY, queue should always succeed */ + rv = queue_work(gfs_recovery_wq, &jd->jd_work); + BUG_ON(!rv); + + if (wait) + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + TASK_UNINTERRUPTIBLE); + + return 0; +} + diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h new file mode 100644 index 00000000..2226136c --- /dev/null +++ b/fs/gfs2/recovery.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __RECOVERY_DOT_H__ +#define __RECOVERY_DOT_H__ + +#include "incore.h" + +extern struct workqueue_struct *gfs_recovery_wq; + +static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) +{ + if (++*blk == sdp->sd_jdesc->jd_blocks) + *blk = 0; +} + +extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh); + +extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); + +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head); +extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +extern void gfs2_recover_func(struct work_struct *work); + +#endif /* __RECOVERY_DOT_H__ */ + diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c new file mode 100644 index 00000000..9b780df3 --- /dev/null +++ b/fs/gfs2/rgrp.c @@ -0,0 +1,1885 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" +#include "log.h" +#include "inode.h" +#include "trace_gfs2.h" + +#define BFITNOENT ((u32)~0) +#define NO_BLOCK ((u64)~0) + +#if BITS_PER_LONG == 32 +#define LBITMASK (0x55555555UL) +#define LBITSKIP55 (0x55555555UL) +#define LBITSKIP00 (0x00000000UL) +#else +#define LBITMASK (0x5555555555555555UL) +#define LBITSKIP55 (0x5555555555555555UL) +#define LBITSKIP00 (0x0000000000000000UL) +#endif + +/* + * These routines are used by the resource group routines (rgrp.c) + * to keep track of block allocation. Each block is represented by two + * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks. + * + * 0 = Free + * 1 = Used (not metadata) + * 2 = Unlinked (still in use) inode + * 3 = Used (metadata) + */ + +static const char valid_change[16] = { + /* current */ + /* n */ 0, 1, 1, 1, + /* e */ 1, 0, 0, 0, + /* w */ 0, 0, 0, 1, + 1, 0, 0, 0 +}; + +static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, + unsigned char old_state, unsigned char new_state, + unsigned int *n); + +/** + * gfs2_setbit - Set a bit in the bitmaps + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @block: the block to set + * @new_state: the new state of the block + * + */ + +static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, + unsigned char *buf2, unsigned int offset, + struct gfs2_bitmap *bi, u32 block, + unsigned char new_state) +{ + unsigned char *byte1, *byte2, *end, cur_state; + unsigned int buflen = bi->bi_len; + const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; + + byte1 = buf1 + offset + (block / GFS2_NBBY); + end = buf1 + offset + buflen; + + BUG_ON(byte1 >= end); + + cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; + + if (unlikely(!valid_change[new_state * 4 + cur_state])) { + printk(KERN_WARNING "GFS2: buf_blk = 0x%llx old_state=%d, " + "new_state=%d\n", + (unsigned long long)block, cur_state, new_state); + printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%lx\n", + (unsigned long long)rgd->rd_addr, + (unsigned long)bi->bi_start); + printk(KERN_WARNING "GFS2: bi_offset=0x%lx bi_len=0x%lx\n", + (unsigned long)bi->bi_offset, + (unsigned long)bi->bi_len); + dump_stack(); + gfs2_consist_rgrpd(rgd); + return; + } + *byte1 ^= (cur_state ^ new_state) << bit; + + if (buf2) { + byte2 = buf2 + offset + (block / GFS2_NBBY); + cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; + *byte2 ^= (cur_state ^ new_state) << bit; + } +} + +/** + * gfs2_testbit - test a bit in the bitmaps + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @block: the block to read + * + */ + +static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, + const unsigned char *buffer, + unsigned int buflen, u32 block) +{ + const unsigned char *byte, *end; + unsigned char cur_state; + unsigned int bit; + + byte = buffer + (block / GFS2_NBBY); + bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; + end = buffer + buflen; + + gfs2_assert(rgd->rd_sbd, byte < end); + + cur_state = (*byte >> bit) & GFS2_BIT_MASK; + + return cur_state; +} + +/** + * gfs2_bit_search + * @ptr: Pointer to bitmap data + * @mask: Mask to use (normally 0x55555.... 
but adjusted for search start)
+ * @state: The state we are searching for
+ *
+ * We xor the bitmap data with a pattern which is the bitwise opposite
+ * of what we are looking for, this gives rise to a pattern of ones
+ * wherever there is a match. Since we have two bits per entry, we
+ * take this pattern, shift it down by one place and then and it with
+ * the original. All the even bit positions (0,2,4, etc) then represent
+ * successful matches, so we mask with 0x55555..... to remove the unwanted
+ * odd bit positions.
+ *
+ * This allows searching of a whole u64 at once (32 blocks) with a
+ * single test (on 64 bit arches).
+ */
+
+static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
+{
+	u64 tmp;
+	static const u64 search[] = {
+		[0] = 0xffffffffffffffffULL,
+		[1] = 0xaaaaaaaaaaaaaaaaULL,
+		[2] = 0x5555555555555555ULL,
+		[3] = 0x0000000000000000ULL,
+	};
+	tmp = le64_to_cpu(*ptr) ^ search[state];
+	tmp &= (tmp >> 1);
+	tmp &= mask;
+	return tmp;
+}
+
+/**
+ * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
+ * a block in a given allocation state.
+ * @buffer: the buffer that holds the bitmaps
+ * @len: the length (in bytes) of the buffer
+ * @goal: start search at this block's bit-pair (within @buffer)
+ * @state: GFS2_BLKST_XXX the state of the block we're looking for.
+ *
+ * Scope of @goal and returned block number is only within this bitmap buffer,
+ * not entire rgrp or filesystem.  @buffer will be offset from the actual
+ * beginning of a bitmap block buffer, skipping any header structures, but
+ * headers are always a multiple of 64 bits long so that the buffer is
+ * always aligned to a 64 bit boundary.
+ *
+ * The size of the buffer is in bytes, but it is assumed that it is
+ * always ok to read a complete multiple of 64 bits at the end
+ * of the block in case the end is not aligned to a natural boundary.
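+ *
+ * For example, with blocks packed LSB first, the byte 0xE4 (binary
+ * 11100100) holds blocks 0..3 in states 0, 1, 2 and 3.  A search for
+ * state 2 uses search[2] = 0x55...: 0xE4 ^ 0x55 = 0xB1, then
+ * 0xB1 & (0xB1 >> 1) & 0x55 = 0x10, and __ffs(0x10) / 2 = 2, so block 2
+ * is reported.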
+ * + * Return: the block number (bitmap buffer scope) that was found + */ + +static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, + u32 goal, u8 state) +{ + u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1); + const __le64 *ptr = ((__le64 *)buf) + (goal >> 5); + const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64))); + u64 tmp; + u64 mask = 0x5555555555555555ULL; + u32 bit; + + BUG_ON(state > 3); + + /* Mask off bits we don't care about at the start of the search */ + mask <<= spoint; + tmp = gfs2_bit_search(ptr, mask, state); + ptr++; + while(tmp == 0 && ptr < end) { + tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state); + ptr++; + } + /* Mask off any bits which are more than len bytes from the start */ + if (ptr == end && (len & (sizeof(u64) - 1))) + tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1)))); + /* Didn't find anything, so return */ + if (tmp == 0) + return BFITNOENT; + ptr--; + bit = __ffs64(tmp); + bit /= 2; /* two bits per entry in the bitmap */ + return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit; +} + +/** + * gfs2_bitcount - count the number of bits in a certain state + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @state: the state of the block we're looking for + * + * Returns: The number of bits + */ + +static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer, + unsigned int buflen, u8 state) +{ + const u8 *byte = buffer; + const u8 *end = buffer + buflen; + const u8 state1 = state << 2; + const u8 state2 = state << 4; + const u8 state3 = state << 6; + u32 count = 0; + + for (; byte < end; byte++) { + if (((*byte) & 0x03) == state) + count++; + if (((*byte) & 0x0C) == state1) + count++; + if (((*byte) & 0x30) == state2) + count++; + if (((*byte) & 0xC0) == state3) + count++; + } + + return count; +} + +/** + * gfs2_rgrp_verify - Verify that a resource group is consistent + * @sdp: the filesystem + * @rgd: the rgrp + * + */ + +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi = NULL; + u32 length = rgd->rd_length; + u32 count[4], tmp; + int buf, x; + + memset(count, 0, 4 * sizeof(u32)); + + /* Count # blocks in each of 4 possible allocation states */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + for (x = 0; x < 4; x++) + count[x] += gfs2_bitcount(rgd, + bi->bi_bh->b_data + + bi->bi_offset, + bi->bi_len, x); + } + + if (count[0] != rgd->rd_free) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "free data mismatch: %u != %u\n", + count[0], rgd->rd_free); + return; + } + + tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; + if (count[1] != tmp) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used data mismatch: %u != %u\n", + count[1], tmp); + return; + } + + if (count[2] + count[3] != rgd->rd_dinodes) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used metadata mismatch: %u != %u\n", + count[2] + count[3], rgd->rd_dinodes); + return; + } +} + +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) +{ + u64 first = rgd->rd_data0; + u64 last = first + rgd->rd_data; + return first <= block && block < last; +} + +/** + * gfs2_blk2rgrpd - Find resource group for a given data/meta block number + * @sdp: The GFS2 superblock + * @n: The data block number + * + * Returns: The resource group, or NULL if not found + */ + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) +{ + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + + list_for_each_entry(rgd, 
&sdp->sd_rindex_mru_list, rd_list_mru) { + if (rgrp_contains_block(rgd, blk)) { + list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; + } + } + + spin_unlock(&sdp->sd_rindex_spin); + + return NULL; +} + +/** + * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem + * @sdp: The GFS2 superblock + * + * Returns: The first rgrp in the filesystem + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) +{ + gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); + return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); +} + +/** + * gfs2_rgrpd_get_next - get the next RG + * @rgd: A RG + * + * Returns: The next rgrp + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) +{ + if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) + return NULL; + return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); +} + +static void clear_rgrpdi(struct gfs2_sbd *sdp) +{ + struct list_head *head; + struct gfs2_rgrpd *rgd; + struct gfs2_glock *gl; + + spin_lock(&sdp->sd_rindex_spin); + sdp->sd_rindex_forward = NULL; + spin_unlock(&sdp->sd_rindex_spin); + + head = &sdp->sd_rindex_list; + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); + gl = rgd->rd_gl; + + list_del(&rgd->rd_list); + list_del(&rgd->rd_list_mru); + + if (gl) { + gl->gl_object = NULL; + gfs2_glock_add_to_lru(gl); + gfs2_glock_put(gl); + } + + kfree(rgd->rd_bits); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); + } +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) +{ + mutex_lock(&sdp->sd_rindex_mutex); + clear_rgrpdi(sdp); + mutex_unlock(&sdp->sd_rindex_mutex); +} + +static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) +{ + printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); + printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); + printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); + printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); + printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); +} + +/** + * gfs2_compute_bitstructs - Compute the bitmap sizes + * @rgd: The resource group descriptor + * + * Calculates bitmap descriptors, one for each block that contains bitmap data + * + * Returns: errno + */ + +static int compute_bitstructs(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi; + u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */ + u32 bytes_left, bytes; + int x; + + if (!length) + return -EINVAL; + + rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS); + if (!rgd->rd_bits) + return -ENOMEM; + + bytes_left = rgd->rd_bitbytes; + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + + bi->bi_flags = 0; + /* small rgrp; bitmap stored completely in header block */ + if (length == 1) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + /* header block */ + } else if (x == 0) { + bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + /* last block */ + } else if (x + 1 == length) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_bitbytes - bytes_left; + bi->bi_len = bytes; + /* other blocks */ + } else { + bytes = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header); + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_bitbytes - bytes_left; + 
bi->bi_len = bytes; + } + + bytes_left -= bytes; + } + + if (bytes_left) { + gfs2_consist_rgrpd(rgd); + return -EIO; + } + bi = rgd->rd_bits + (length - 1); + if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) { + if (gfs2_consist_rgrpd(rgd)) { + gfs2_rindex_print(rgd); + fs_err(sdp, "start=%u len=%u offset=%u\n", + bi->bi_start, bi->bi_len, bi->bi_offset); + } + return -EIO; + } + + return 0; +} + +/** + * gfs2_ri_total - Total up the file system space, according to the rindex. + * + */ +u64 gfs2_ri_total(struct gfs2_sbd *sdp) +{ + u64 total_data = 0; + struct inode *inode = sdp->sd_rindex; + struct gfs2_inode *ip = GFS2_I(inode); + char buf[sizeof(struct gfs2_rindex)]; + struct file_ra_state ra_state; + int error, rgrps; + + mutex_lock(&sdp->sd_rindex_mutex); + file_ra_state_init(&ra_state, inode->i_mapping); + for (rgrps = 0;; rgrps++) { + loff_t pos = rgrps * sizeof(struct gfs2_rindex); + + if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) + break; + error = gfs2_internal_read(ip, &ra_state, buf, &pos, + sizeof(struct gfs2_rindex)); + if (error != sizeof(struct gfs2_rindex)) + break; + total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data); + } + mutex_unlock(&sdp->sd_rindex_mutex); + return total_data; +} + +static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) +{ + const struct gfs2_rindex *str = buf; + + rgd->rd_addr = be64_to_cpu(str->ri_addr); + rgd->rd_length = be32_to_cpu(str->ri_length); + rgd->rd_data0 = be64_to_cpu(str->ri_data0); + rgd->rd_data = be32_to_cpu(str->ri_data); + rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); +} + +/** + * read_rindex_entry - Pull in a new resource index entry from the disk + * @gl: The glock covering the rindex inode + * + * Returns: 0 on success, error code otherwise + */ + +static int read_rindex_entry(struct gfs2_inode *ip, + struct file_ra_state *ra_state) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); + char buf[sizeof(struct gfs2_rindex)]; + int error; + struct gfs2_rgrpd *rgd; + + error = gfs2_internal_read(ip, ra_state, buf, &pos, + sizeof(struct gfs2_rindex)); + if (!error) + return 0; + if (error != sizeof(struct gfs2_rindex)) { + if (error > 0) + error = -EIO; + return error; + } + + rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); + error = -ENOMEM; + if (!rgd) + return error; + + mutex_init(&rgd->rd_mutex); + lops_init_le(&rgd->rd_le, &gfs2_rg_lops); + rgd->rd_sbd = sdp; + + list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); + list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + + gfs2_rindex_in(rgd, buf); + error = compute_bitstructs(rgd); + if (error) + return error; + + error = gfs2_glock_get(sdp, rgd->rd_addr, + &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); + if (error) + return error; + + rgd->rd_gl->gl_object = rgd; + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + return error; +} + +/** + * gfs2_ri_update - Pull in a new resource index from the disk + * @ip: pointer to the rindex inode + * + * Returns: 0 on successful update, error code otherwise + */ + +int gfs2_ri_update(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct file_ra_state ra_state; + u64 rgrp_count = i_size_read(inode); + struct gfs2_rgrpd *rgd; + unsigned int max_data = 0; + int error; + + do_div(rgrp_count, sizeof(struct gfs2_rindex)); + clear_rgrpdi(sdp); + + file_ra_state_init(&ra_state, inode->i_mapping); + for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { + 
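		/*
+		 * Each iteration reads one struct gfs2_rindex record from the
+		 * rindex file and builds the in-core rgrpd for it: its bitmap
+		 * layout (compute_bitstructs) and its resource group glock.
+		 */
+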
error = read_rindex_entry(ip, &ra_state); + if (error) { + clear_rgrpdi(sdp); + return error; + } + } + + list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) + if (rgd->rd_data > max_data) + max_data = rgd->rd_data; + sdp->sd_max_rg_data = max_data; + sdp->sd_rindex_uptodate = 1; + return 0; +} + +/** + * gfs2_rindex_hold - Grab a lock on the rindex + * @sdp: The GFS2 superblock + * @ri_gh: the glock holder + * + * We grab a lock on the rindex inode to make sure that it doesn't + * change whilst we are performing an operation. We keep this lock + * for quite long periods of time compared to other locks. This + * doesn't matter, since it is shared and it is very, very rarely + * accessed in the exclusive mode (i.e. only when expanding the filesystem). + * + * This makes sure that we're using the latest copy of the resource index + * special file, which might have been updated if someone expanded the + * filesystem (via gfs2_grow utility), which adds new resource groups. + * + * Returns: 0 on success, error code otherwise + */ + +int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); + struct gfs2_glock *gl = ip->i_gl; + int error; + + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); + if (error) + return error; + + /* Read new copy from disk if we don't have the latest */ + if (!sdp->sd_rindex_uptodate) { + mutex_lock(&sdp->sd_rindex_mutex); + if (!sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + gfs2_glock_dq_uninit(ri_gh); + } + mutex_unlock(&sdp->sd_rindex_mutex); + } + + return error; +} + +static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) +{ + const struct gfs2_rgrp *str = buf; + u32 rg_flags; + + rg_flags = be32_to_cpu(str->rg_flags); + rg_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= rg_flags; + rgd->rd_free = be32_to_cpu(str->rg_free); + rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); + rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); +} + +static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) +{ + struct gfs2_rgrp *str = buf; + + str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK); + str->rg_free = cpu_to_be32(rgd->rd_free); + str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); + str->__pad = cpu_to_be32(0); + str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration); + memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); +} + +/** + * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * @rgd: the struct gfs2_rgrpd describing the RG to read in + * + * Read in all of a Resource Group's header and bitmap blocks. + * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. + * + * Returns: errno + */ + +int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_glock *gl = rgd->rd_gl; + unsigned int length = rgd->rd_length; + struct gfs2_bitmap *bi; + unsigned int x, y; + int error; + + mutex_lock(&rgd->rd_mutex); + + spin_lock(&sdp->sd_rindex_spin); + if (rgd->rd_bh_count) { + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); + mutex_unlock(&rgd->rd_mutex); + return 0; + } + spin_unlock(&sdp->sd_rindex_spin); + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); + if (error) + goto fail; + } + + for (y = length; y--;) { + bi = rgd->rd_bits + y; + error = gfs2_meta_wait(sdp, bi->bi_bh); + if (error) + goto fail; + if (gfs2_metatype_check(sdp, bi->bi_bh, y ? 
GFS2_METATYPE_RB : + GFS2_METATYPE_RG)) { + error = -EIO; + goto fail; + } + } + + if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { + for (x = 0; x < length; x++) + clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); + gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); + rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + } + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone = rgd->rd_free; + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); + + mutex_unlock(&rgd->rd_mutex); + + return 0; + +fail: + while (x--) { + bi = rgd->rd_bits + x; + brelse(bi->bi_bh); + bi->bi_bh = NULL; + gfs2_assert_warn(sdp, !bi->bi_clone); + } + mutex_unlock(&rgd->rd_mutex); + + return error; +} + +void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + + spin_lock(&sdp->sd_rindex_spin); + gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * @rgd: the struct gfs2_rgrpd describing the RG to read in + * + */ + +void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int x, length = rgd->rd_length; + + spin_lock(&sdp->sd_rindex_spin); + gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); + if (--rgd->rd_bh_count) { + spin_unlock(&sdp->sd_rindex_spin); + return; + } + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + brelse(bi->bi_bh); + bi->bi_bh = NULL; + } + + spin_unlock(&sdp->sd_rindex_spin); +} + +static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + const struct gfs2_bitmap *bi) +{ + struct super_block *sb = sdp->sd_vfs; + struct block_device *bdev = sb->s_bdev; + const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / + bdev_logical_block_size(sb->s_bdev); + u64 blk; + sector_t start = 0; + sector_t nr_sects = 0; + int rv; + unsigned int x; + + for (x = 0; x < bi->bi_len; x++) { + const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x; + const u8 *clone = bi->bi_clone + bi->bi_offset + x; + u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); + diff &= 0x55; + if (diff == 0) + continue; + blk = offset + ((bi->bi_start + x) * GFS2_NBBY); + blk *= sects_per_blk; /* convert to sectors */ + while(diff) { + if (diff & 1) { + if (nr_sects == 0) + goto start_new_extent; + if ((start + nr_sects) != blk) { + rv = blkdev_issue_discard(bdev, start, + nr_sects, GFP_NOFS, + 0); + if (rv) + goto fail; + nr_sects = 0; +start_new_extent: + start = blk; + } + nr_sects += sects_per_blk; + } + diff >>= 2; + blk += sects_per_blk; + } + } + if (nr_sects) { + rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0); + if (rv) + goto fail; + } + return; +fail: + fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); + sdp->sd_args.ar_discard = 0; +} + +void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + unsigned int length = rgd->rd_length; + unsigned int x; + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + if (!bi->bi_clone) + continue; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); + clear_bit(GBF_FULL, &bi->bi_flags); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); + } + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone = rgd->rd_free; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * gfs2_alloc_get - get the 
struct gfs2_alloc structure for an inode + * @ip: the incore GFS2 inode structure + * + * Returns: the struct gfs2_alloc + */ + +struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) +{ + BUG_ON(ip->i_alloc != NULL); + ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); + return ip->i_alloc; +} + +/** + * try_rgrp_fit - See if a given reservation will fit in a given RG + * @rgd: the RG data + * @al: the struct gfs2_alloc structure describing the reservation + * + * If there's room for the requested blocks to be allocated from the RG: + * Sets the $al_rgd field in @al. + * + * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) + */ + +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int ret = 0; + + if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) + return 0; + + spin_lock(&sdp->sd_rindex_spin); + if (rgd->rd_free_clone >= al->al_requested) { + al->al_rgd = rgd; + ret = 1; + } + spin_unlock(&sdp->sd_rindex_spin); + + return ret; +} + +/** + * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes + * @rgd: The rgrp + * + * Returns: 0 if no error + * The inode, if one has been found, in inode. + */ + +static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) +{ + u32 goal = 0, block; + u64 no_addr; + struct gfs2_sbd *sdp = rgd->rd_sbd; + unsigned int n; + struct gfs2_glock *gl; + struct gfs2_inode *ip; + int error; + int found = 0; + + while (goal < rgd->rd_data) { + down_write(&sdp->sd_log_flush_lock); + n = 1; + block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, + GFS2_BLKST_UNLINKED, &n); + up_write(&sdp->sd_log_flush_lock); + if (block == BFITNOENT) + break; + /* rgblk_search can return a block < goal, so we need to + keep it marching forward. */ + no_addr = block + rgd->rd_data0; + goal = max(block + 1, goal + 1); + if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) + continue; + if (no_addr == skip) + continue; + *last_unlinked = no_addr; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl); + if (error) + continue; + + /* If the inode is already in cache, we can ignore it here + * because the existing inode disposal code will deal with + * it when all refs have gone away. Accessing gl_object like + * this is not safe in general. Here it is ok because we do + * not dereference the pointer, and we only need an approx + * answer to whether it is NULL or not. 
+ */ + ip = gl->gl_object; + + if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put(gl); + else + found++; + + /* Limit reclaim to sensible number of tasks */ + if (found > NR_CPUS) + return; + } + + rgd->rd_flags &= ~GFS2_RDF_CHECK; + return; +} + +/** + * recent_rgrp_next - get next RG from "recent" list + * @cur_rgd: current rgrp + * + * Returns: The next rgrp in the recent list + */ + +static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) +{ + struct gfs2_sbd *sdp = cur_rgd->rd_sbd; + struct list_head *head; + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + head = &sdp->sd_rindex_mru_list; + if (unlikely(cur_rgd->rd_list_mru.next == head)) { + spin_unlock(&sdp->sd_rindex_spin); + return NULL; + } + rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; +} + +/** + * forward_rgrp_get - get an rgrp to try next from full list + * @sdp: The GFS2 superblock + * + * Returns: The rgrp to try next + */ + +static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) +{ + struct gfs2_rgrpd *rgd; + unsigned int journals = gfs2_jindex_size(sdp); + unsigned int rg = 0, x; + + spin_lock(&sdp->sd_rindex_spin); + + rgd = sdp->sd_rindex_forward; + if (!rgd) { + if (sdp->sd_rgrps >= journals) + rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; + + for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; + x++, rgd = gfs2_rgrpd_get_next(rgd)) + /* Do Nothing */; + + sdp->sd_rindex_forward = rgd; + } + + spin_unlock(&sdp->sd_rindex_spin); + + return rgd; +} + +/** + * forward_rgrp_set - set the forward rgrp pointer + * @sdp: the filesystem + * @rgd: The new forward rgrp + * + */ + +static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) +{ + spin_lock(&sdp->sd_rindex_spin); + sdp->sd_rindex_forward = rgd; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * get_local_rgrp - Choose and lock a rgrp for allocation + * @ip: the inode to reserve space for + * @rgp: the chosen and locked rgrp + * + * Try to acquire rgrp in way which avoids contending with others. 
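As a side note on the allocation policy above: forward_rgrp_get() spreads the starting point of the full-list scan across cluster nodes by journal id, so different nodes tend to begin allocating in different resource groups. Below is a minimal userspace sketch of that starting-point calculation only; the function and variable names are illustrative, not kernel symbols, and made-up counts stand in for the real superblock fields.

#include <stdio.h>

/* Illustration only: pick a starting resource group for a node so that
 * nodes with different journal ids begin their scans in different parts
 * of the filesystem, mirroring the rg calculation in forward_rgrp_get(). */
static unsigned int first_rgrp_for_node(unsigned int total_rgrps,
                                        unsigned int journal_id,
                                        unsigned int journals)
{
        if (journals == 0 || total_rgrps < journals)
                return 0;               /* fall back to the first rgrp */
        return total_rgrps * journal_id / journals;
}

int main(void)
{
        unsigned int rgrps = 100, journals = 4;

        for (unsigned int jid = 0; jid < journals; jid++)
                printf("journal %u starts scanning at rgrp %u\n",
                       jid, first_rgrp_for_node(rgrps, jid, journals));
        return 0;       /* prints starting rgrps 0, 25, 50, 75 */
}

Starting each node a fixed fraction of the way through the rgrp list keeps them from all contending for the same resource group glocks when the filesystem fills up.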
+ * + * Returns: errno + */ + +static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd, *begin = NULL; + struct gfs2_alloc *al = ip->i_alloc; + int flags = LM_FLAG_TRY; + int skipped = 0; + int loops = 0; + int error, rg_locked; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); + + while (rgd) { + rg_locked = 0; + + if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { + rg_locked = 1; + error = 0; + } else { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_TRY, &al->al_rgd_gh); + } + switch (error) { + case 0: + if (try_rgrp_fit(rgd, al)) + goto out; + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + if (!rg_locked) + gfs2_glock_dq_uninit(&al->al_rgd_gh); + /* fall through */ + case GLR_TRYFAILED: + rgd = recent_rgrp_next(rgd); + break; + + default: + return error; + } + } + + /* Go through full list of rgrps */ + + begin = rgd = forward_rgrp_get(sdp); + + for (;;) { + rg_locked = 0; + + if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { + rg_locked = 1; + error = 0; + } else { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, + &al->al_rgd_gh); + } + switch (error) { + case 0: + if (try_rgrp_fit(rgd, al)) + goto out; + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + if (!rg_locked) + gfs2_glock_dq_uninit(&al->al_rgd_gh); + break; + + case GLR_TRYFAILED: + skipped++; + break; + + default: + return error; + } + + rgd = gfs2_rgrpd_get_next(rgd); + if (!rgd) + rgd = gfs2_rgrpd_get_first(sdp); + + if (rgd == begin) { + if (++loops >= 3) + return -ENOSPC; + if (!skipped) + loops++; + flags = 0; + if (loops == 2) + gfs2_log_flush(sdp, NULL); + } + } + +out: + if (begin) { + spin_lock(&sdp->sd_rindex_spin); + list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + spin_unlock(&sdp->sd_rindex_spin); + rgd = gfs2_rgrpd_get_next(rgd); + if (!rgd) + rgd = gfs2_rgrpd_get_first(sdp); + forward_rgrp_set(sdp, rgd); + } + + return 0; +} + +/** + * gfs2_inplace_reserve_i - Reserve space in the filesystem + * @ip: the inode to reserve space for + * + * Returns: errno + */ + +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, + char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + int error = 0; + u64 last_unlinked = NO_BLOCK; + int tries = 0; + + if (gfs2_assert_warn(sdp, al->al_requested)) + return -EINVAL; + + if (hold_rindex) { + /* We need to hold the rindex unless the inode we're using is + the rindex itself, in which case it's already held. */ + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + else if (!sdp->sd_rgrps) /* We may not have the rindex read + in, so: */ + error = gfs2_ri_update(ip); + if (error) + return error; + } + +try_again: + do { + error = get_local_rgrp(ip, &last_unlinked); + /* If there is no space, flushing the log may release some */ + if (error) { + if (ip == GFS2_I(sdp->sd_rindex) && + !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + return error; + goto try_again; + } + gfs2_log_flush(sdp, NULL); + } + } while (error && tries++ < 3); + + if (error) { + if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&al->al_ri_gh); + return error; + } + + /* no error, so we have the rgrp set in the inode's allocation. 
*/ + al->al_file = file; + al->al_line = line; + + return 0; +} + +/** + * gfs2_inplace_release - release an inplace reservation + * @ip: the inode the reservation was taken out on + * + * Release a reservation made by gfs2_inplace_reserve(). + */ + +void gfs2_inplace_release(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + + if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) + fs_warn(sdp, "al_alloced = %u, al_requested = %u " + "al_file = %s, al_line = %u\n", + al->al_alloced, al->al_requested, al->al_file, + al->al_line); + + al->al_rgd = NULL; + if (al->al_rgd_gh.gh_gl) + gfs2_glock_dq_uninit(&al->al_rgd_gh); + if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) + gfs2_glock_dq_uninit(&al->al_ri_gh); +} + +/** + * gfs2_get_block_type - Check a block in a RG is of given type + * @rgd: the resource group holding the block + * @block: the block number + * + * Returns: The block type (GFS2_BLKST_*) + */ + +static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) +{ + struct gfs2_bitmap *bi = NULL; + u32 length, rgrp_block, buf_block; + unsigned int buf; + unsigned char type; + + length = rgd->rd_length; + rgrp_block = block - rgd->rd_data0; + + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + buf_block = rgrp_block - bi->bi_start * GFS2_NBBY; + + type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, buf_block); + + return type; +} + +/** + * rgblk_search - find a block in @old_state, change allocation + * state to @new_state + * @rgd: the resource group descriptor + * @goal: the goal block within the RG (start here to search for avail block) + * @old_state: GFS2_BLKST_XXX the before-allocation state to find + * @new_state: GFS2_BLKST_XXX the after-allocation block state + * @n: The extent length + * + * Walk rgrp's bitmap to find bits that represent a block in @old_state. + * Add the found bitmap buffer to the transaction. + * Set the found bits to @new_state to change block's allocation state. + * + * This function never fails, because we wouldn't call it unless we + * know (from reservation results, etc.) that a block is available. + * + * Scope of @goal and returned block is just within rgrp, not the whole + * filesystem. + * + * Returns: the block number allocated + */ + +static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, + unsigned char old_state, unsigned char new_state, + unsigned int *n) +{ + struct gfs2_bitmap *bi = NULL; + const u32 length = rgd->rd_length; + u32 blk = BFITNOENT; + unsigned int buf, x; + const unsigned int elen = *n; + const u8 *buffer = NULL; + + *n = 0; + /* Find bitmap block that contains bits for goal block */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + /* Convert scope of "goal" from rgrp-wide to within found bit block */ + if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { + goal -= bi->bi_start * GFS2_NBBY; + goto do_search; + } + } + buf = 0; + goal = 0; + +do_search: + /* Search (up to entire) bitmap in this rgrp for allocatable block. + "x <= length", instead of "x < length", because we typically start + the search in the middle of a bit block, but if we can't find an + allocatable block anywhere else, we want to be able wrap around and + search in the first part of our first-searched bit block. 
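For reference while reading the bitmap search below: GFS2 packs four block states into each bitmap byte (GFS2_NBBY), two bits per block, with the states numbered free = 0, used = 1, unlinked = 2 and dinode = 3. The following is a standalone userspace sketch of reading one block's state out of such a buffer, in the spirit of gfs2_testbit(); the constants and names are repeated locally for illustration rather than taken from the on-disk format headers.

#include <stdio.h>
#include <stdint.h>

#define SKETCH_NBBY     4       /* blocks represented per bitmap byte */
#define SKETCH_BIT_SIZE 2       /* bits per block state */

/* Return the two-bit state of block "block" within "buf". Block 0 lives
 * in the least significant bits of the first byte, block 3 in the most
 * significant bits, and so on through the buffer. */
static unsigned int sketch_testbit(const uint8_t *buf, uint32_t block)
{
        const uint8_t byte = buf[block / SKETCH_NBBY];
        const unsigned int shift = (block % SKETCH_NBBY) * SKETCH_BIT_SIZE;

        return (byte >> shift) & 0x3;
}

int main(void)
{
        /* 0xE4 = 0b11100100: blocks 0..3 carry states 0, 1, 2, 3 */
        uint8_t bitmap[1] = { 0xE4 };
        static const char *names[] = { "free", "used", "unlinked", "dinode" };

        for (uint32_t b = 0; b < 4; b++)
                printf("block %u is %s\n", b, names[sketch_testbit(bitmap, b)]);
        return 0;
}

This two-bit encoding is also what the discard code above relies on when it masks the bitmap difference with 0x55 to pick out blocks that became free.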
*/ + for (x = 0; x <= length; x++) { + bi = rgd->rd_bits + buf; + + if (test_bit(GBF_FULL, &bi->bi_flags) && + (old_state == GFS2_BLKST_FREE)) + goto skip; + + /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone + bitmaps, so we must search the originals for that. */ + buffer = bi->bi_bh->b_data + bi->bi_offset; + if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) + buffer = bi->bi_clone + bi->bi_offset; + + blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state); + if (blk != BFITNOENT) + break; + + if ((goal == 0) && (old_state == GFS2_BLKST_FREE)) + set_bit(GBF_FULL, &bi->bi_flags); + + /* Try next bitmap block (wrap back to rgrp header if at end) */ +skip: + buf++; + buf %= length; + goal = 0; + } + + if (blk == BFITNOENT) + return blk; + *n = 1; + if (old_state == new_state) + goto out; + + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, + bi, blk, new_state); + goal = blk; + while (*n < elen) { + goal++; + if (goal >= (bi->bi_len * GFS2_NBBY)) + break; + if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != + GFS2_BLKST_FREE) + break; + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, + bi, goal, new_state); + (*n)++; + } +out: + return (bi->bi_start * GFS2_NBBY) + blk; +} + +/** + * rgblk_free - Change alloc state of given block(s) + * @sdp: the filesystem + * @bstart: the start of a run of blocks to free + * @blen: the length of the block run (all must lie within ONE RG!) + * @new_state: GFS2_BLKST_XXX the after-allocation block state + * + * Returns: Resource group containing the block(s) + */ + +static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, + u32 blen, unsigned char new_state) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_bitmap *bi = NULL; + u32 length, rgrp_blk, buf_blk; + unsigned int buf; + + rgd = gfs2_blk2rgrpd(sdp, bstart); + if (!rgd) { + if (gfs2_consist(sdp)) + fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); + return NULL; + } + + length = rgd->rd_length; + + rgrp_blk = bstart - rgd->rd_data0; + + while (blen--) { + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + + buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY; + rgrp_blk++; + + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len); + } + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset, + bi, buf_blk, new_state); + } + + return rgd; +} + +/** + * gfs2_rgrp_dump - print out an rgrp + * @seq: The iterator + * @gl: The glock in question + * + */ + +int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_rgrpd *rgd = gl->gl_object; + if (rgd == NULL) + return 0; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); + return 0; +} + +static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; +} + +/** + * gfs2_alloc_block - Allocate one or more blocks + * @ip: the 
inode to allocate the block for + * @bn: Used to return the starting block number + * @n: requested number of blocks/extent length (value/result) + * + * Returns: 0 or error + */ + +int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_rgrpd *rgd; + u32 goal, blk; + u64 block; + int error; + + /* Only happens if there is a bug in gfs2, return something distinctive + * to ensure that it is noticed. + */ + if (al == NULL) + return -ECANCELED; + + rgd = al->al_rgd; + + if (rgrp_contains_block(rgd, ip->i_goal)) + goal = ip->i_goal - rgd->rd_data0; + else + goal = rgd->rd_last_alloc; + + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; + + rgd->rd_last_alloc = blk; + block = rgd->rd_data0 + blk; + ip->i_goal = block; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); + brelse(dibh); + } + if (rgd->rd_free < *n) + goto rgrp_error; + + rgd->rd_free -= *n; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced += *n; + + gfs2_statfs_change(sdp, 0, -(s64)*n, 0); + gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone -= *n; + spin_unlock(&sdp->sd_rindex_spin); + trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); + *bn = block; + return 0; + +rgrp_error: + gfs2_rgrp_error(rgd); + return -EIO; +} + +/** + * gfs2_alloc_di - Allocate a dinode + * @dip: the directory that the inode is going in + * @bn: the block number which is allocated + * @generation: the generation number of the inode + * + * Returns: 0 on success or error + */ + +int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc *al = dip->i_alloc; + struct gfs2_rgrpd *rgd = al->al_rgd; + u32 blk; + u64 block; + unsigned int n = 1; + + blk = rgblk_search(rgd, rgd->rd_last_alloc, + GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; + + rgd->rd_last_alloc = blk; + block = rgd->rd_data0 + blk; + if (rgd->rd_free == 0) + goto rgrp_error; + + rgd->rd_free--; + rgd->rd_dinodes++; + *generation = rgd->rd_igeneration++; + if (*generation == 0) + *generation = rgd->rd_igeneration++; + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced++; + + gfs2_statfs_change(sdp, 0, -1, +1); + gfs2_trans_add_unrevoke(sdp, block, 1); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone--; + spin_unlock(&sdp->sd_rindex_spin); + trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); + *bn = block; + return 0; + +rgrp_error: + gfs2_rgrp_error(rgd); + return -EIO; +} + +/** + * gfs2_free_data - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = 
GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + + rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + if (!rgd) + return; + trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); + rgd->rd_free += blen; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_trans_add_rg(rgd); + + /* Directories keep their data in the metadata address space */ + if (ip->i_depth) + gfs2_meta_wipe(ip, bstart, blen); +} + +/** + * gfs2_free_data - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + __gfs2_free_data(ip, bstart, blen); + gfs2_statfs_change(sdp, 0, +blen, 0); + gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); +} + +/** + * gfs2_free_meta - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + + rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + if (!rgd) + return; + trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); + rgd->rd_free += blen; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_trans_add_rg(rgd); + gfs2_meta_wipe(ip, bstart, blen); +} + +/** + * gfs2_free_meta - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + __gfs2_free_meta(ip, bstart, blen); + gfs2_statfs_change(sdp, 0, +blen, 0); + gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); +} + +void gfs2_unlink_di(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_rgrpd *rgd; + u64 blkno = ip->i_no_addr; + + rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); + if (!rgd) + return; + trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_trans_add_rg(rgd); +} + +static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *tmp_rgd; + + tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE); + if (!tmp_rgd) + return; + gfs2_assert_withdraw(sdp, rgd == tmp_rgd); + + if (!rgd->rd_dinodes) + gfs2_consist_rgrpd(rgd); + rgd->rd_dinodes--; + rgd->rd_free++; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_statfs_change(sdp, 0, +1, -1); + gfs2_trans_add_rg(rgd); +} + + +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) +{ + gfs2_free_uninit_di(rgd, ip->i_no_addr); + trace_gfs2_block_alloc(ip, ip->i_no_addr, 1, GFS2_BLKST_FREE); + gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); + gfs2_meta_wipe(ip, ip->i_no_addr, 1); +} + +/** + * gfs2_check_blk_type - Check the type of a 
block + * @sdp: The superblock + * @no_addr: The block number to check + * @type: The block type we are looking for + * + * Returns: 0 if the block type matches the expected type + * -ESTALE if it doesn't match + * or -ve errno if something went wrong while checking + */ + +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh, rgd_gh; + struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); + int ri_locked = 0; + int error; + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + ri_locked = 1; + } + + error = -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, no_addr); + if (!rgd) + goto fail_rindex; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail_rindex; + + if (gfs2_get_block_type(rgd, no_addr) != type) + error = -ESTALE; + + gfs2_glock_dq_uninit(&rgd_gh); +fail_rindex: + if (ri_locked) + gfs2_glock_dq_uninit(&ri_gh); +fail: + return error; +} + +/** + * gfs2_rlist_add - add a RG to a list of RGs + * @sdp: the filesystem + * @rlist: the list of resource groups + * @block: the block + * + * Figure out what RG a block belongs to and add that RG to the list + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + u64 block) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd **tmp; + unsigned int new_space; + unsigned int x; + + if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) + return; + + rgd = gfs2_blk2rgrpd(sdp, block); + if (!rgd) { + if (gfs2_consist(sdp)) + fs_err(sdp, "block = %llu\n", (unsigned long long)block); + return; + } + + for (x = 0; x < rlist->rl_rgrps; x++) + if (rlist->rl_rgd[x] == rgd) + return; + + if (rlist->rl_rgrps == rlist->rl_space) { + new_space = rlist->rl_space + 10; + + tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *), + GFP_NOFS | __GFP_NOFAIL); + + if (rlist->rl_rgd) { + memcpy(tmp, rlist->rl_rgd, + rlist->rl_space * sizeof(struct gfs2_rgrpd *)); + kfree(rlist->rl_rgd); + } + + rlist->rl_space = new_space; + rlist->rl_rgd = tmp; + } + + rlist->rl_rgd[rlist->rl_rgrps++] = rgd; +} + +/** + * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate + * and initialize an array of glock holders for them + * @rlist: the list of resource groups + * @state: the lock state to acquire the RG lock in + * @flags: the modifier flags for the holder structures + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) +{ + unsigned int x; + + rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder), + GFP_NOFS | __GFP_NOFAIL); + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, + state, 0, + &rlist->rl_ghs[x]); +} + +/** + * gfs2_rlist_free - free a resource group list + * @list: the list of resource groups + * + */ + +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) +{ + unsigned int x; + + kfree(rlist->rl_rgd); + + if (rlist->rl_ghs) { + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_uninit(&rlist->rl_ghs[x]); + kfree(rlist->rl_ghs); + } +} + diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h new file mode 100644 index 00000000..a80e3034 --- /dev/null +++ b/fs/gfs2/rgrp.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
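One small pattern worth calling out from gfs2_rlist_add() above: the rgrp pointer array is grown in steps of ten entries rather than one at a time, copying the old contents into the larger allocation. Here is a self-contained userspace sketch of the same grow-in-chunks pattern, using plain calloc/free and an int payload instead of kcalloc and struct gfs2_rgrpd pointers; struct int_list and its helpers are illustrative names only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct int_list {
        unsigned int count;     /* entries in use              */
        unsigned int space;     /* entries currently allocated */
        int *items;             /* grows in fixed-size chunks  */
};

/* Append "value" unless it is already present, growing the backing array
 * by 10 slots at a time, as gfs2_rlist_add() does for resource groups. */
static int int_list_add(struct int_list *list, int value)
{
        for (unsigned int x = 0; x < list->count; x++)
                if (list->items[x] == value)
                        return 0;               /* already listed */

        if (list->count == list->space) {
                unsigned int new_space = list->space + 10;
                int *tmp = calloc(new_space, sizeof(*tmp));

                if (!tmp)
                        return -1;      /* the kernel code avoids this with __GFP_NOFAIL */
                if (list->items) {
                        memcpy(tmp, list->items, list->space * sizeof(*tmp));
                        free(list->items);
                }
                list->space = new_space;
                list->items = tmp;
        }

        list->items[list->count++] = value;
        return 0;
}

int main(void)
{
        struct int_list list = { 0, 0, NULL };

        for (int i = 0; i < 25; i++)
                int_list_add(&list, i % 7);     /* duplicates are dropped */
        printf("%u unique entries, %u slots allocated\n", list.count, list.space);
        free(list.items);
        return 0;       /* prints: 7 unique entries, 10 slots allocated */
}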
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __RGRP_DOT_H__ +#define __RGRP_DOT_H__ + +#include + +struct gfs2_rgrpd; +struct gfs2_sbd; +struct gfs2_holder; + +extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); + +extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); + +extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); + +extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); + +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); +static inline void gfs2_alloc_put(struct gfs2_inode *ip) +{ + BUG_ON(ip->i_alloc == NULL); + kfree(ip->i_alloc); + ip->i_alloc = NULL; +} + +extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, + char *file, unsigned int line); +#define gfs2_inplace_reserve(ip) \ + gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) +#define gfs2_inplace_reserve_ri(ip) \ + gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) + +extern void gfs2_inplace_release(struct gfs2_inode *ip); + +extern int gfs2_ri_update(struct gfs2_inode *ip); +extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); +extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); + +extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +extern void gfs2_unlink_di(struct inode *inode); +extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); + +struct gfs2_rgrp_list { + unsigned int rl_rgrps; + unsigned int rl_space; + struct gfs2_rgrpd **rl_rgd; + struct gfs2_holder *rl_ghs; +}; + +extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + u64 block); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); +extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); + +#endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c new file mode 100644 index 00000000..fb0edf73 --- /dev/null +++ b/fs/gfs2/super.c @@ -0,0 +1,1585 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
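The gfs2_inplace_reserve() macros declared in rgrp.h above record the caller's file and line so that gfs2_inplace_release() can report where an over-allocation originated. A tiny standalone sketch of that wrapper-macro pattern follows; reserve_blocks() and reserve_blocks_i() are hypothetical names used only to show the shape of the technique.

#include <stdio.h>

/* The _i variant takes the call site explicitly; the macro fills it in,
 * the same way gfs2_inplace_reserve() forwards __FILE__ and __LINE__ to
 * gfs2_inplace_reserve_i(). */
static int reserve_blocks_i(unsigned int blocks, const char *file,
                            unsigned int line)
{
        printf("reserving %u blocks for %s:%u\n", blocks, file, line);
        return 0;
}

#define reserve_blocks(blocks) \
        reserve_blocks_i((blocks), __FILE__, __LINE__)

int main(void)
{
        return reserve_blocks(16);      /* prints this file name and line number */
}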
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" +#include "sys.h" +#include "xattr.h" + +#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) + +enum { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_nodebug, + Opt_upgrade, + Opt_acl, + Opt_noacl, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, + Opt_quota, + Opt_noquota, + Opt_suiddir, + Opt_nosuiddir, + Opt_data_writeback, + Opt_data_ordered, + Opt_meta, + Opt_discard, + Opt_nodiscard, + Opt_commit, + Opt_err_withdraw, + Opt_err_panic, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, + Opt_barrier, + Opt_nobarrier, + Opt_error, +}; + +static const match_table_t tokens = { + {Opt_lockproto, "lockproto=%s"}, + {Opt_locktable, "locktable=%s"}, + {Opt_hostdata, "hostdata=%s"}, + {Opt_spectator, "spectator"}, + {Opt_spectator, "norecovery"}, + {Opt_ignore_local_fs, "ignore_local_fs"}, + {Opt_localflocks, "localflocks"}, + {Opt_localcaching, "localcaching"}, + {Opt_debug, "debug"}, + {Opt_nodebug, "nodebug"}, + {Opt_upgrade, "upgrade"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_quota_off, "quota=off"}, + {Opt_quota_account, "quota=account"}, + {Opt_quota_on, "quota=on"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_suiddir, "suiddir"}, + {Opt_nosuiddir, "nosuiddir"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_meta, "meta"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, + {Opt_err_withdraw, "errors=withdraw"}, + {Opt_err_panic, "errors=panic"}, + {Opt_statfs_quantum, "statfs_quantum=%d"}, + {Opt_statfs_percent, "statfs_percent=%d"}, + {Opt_quota_quantum, "quota_quantum=%d"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_error, NULL} +}; + +/** + * gfs2_mount_args - Parse mount options + * @args: The structure into which the parsed options will be written + * @options: The options to parse + * + * Return: errno + */ + +int gfs2_mount_args(struct gfs2_args *args, char *options) +{ + char *o; + int token; + substring_t tmp[MAX_OPT_ARGS]; + int rv; + + /* Split the options into tokens with the "," character and + process them */ + + while (1) { + o = strsep(&options, ","); + if (o == NULL) + break; + if (*o == '\0') + continue; + + token = match_token(o, tokens, tmp); + switch (token) { + case Opt_lockproto: + match_strlcpy(args->ar_lockproto, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + match_strlcpy(args->ar_locktable, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + match_strlcpy(args->ar_hostdata, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + /* Retained for backwards compat only */ + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + /* Retained for backwards compat only */ + break; + case Opt_debug: + if (args->ar_errors == GFS2_ERRORS_PANIC) { + printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " + "are 
mutually exclusive.\n"); + return -EINVAL; + } + args->ar_debug = 1; + break; + case Opt_nodebug: + args->ar_debug = 0; + break; + case Opt_upgrade: + /* Retained for backwards compat only */ + break; + case Opt_acl: + args->ar_posix_acl = 1; + break; + case Opt_noacl: + args->ar_posix_acl = 0; + break; + case Opt_quota_off: + case Opt_noquota: + args->ar_quota = GFS2_QUOTA_OFF; + break; + case Opt_quota_account: + args->ar_quota = GFS2_QUOTA_ACCOUNT; + break; + case Opt_quota_on: + case Opt_quota: + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = 1; + break; + case Opt_nosuiddir: + args->ar_suiddir = 0; + break; + case Opt_data_writeback: + args->ar_data = GFS2_DATA_WRITEBACK; + break; + case Opt_data_ordered: + args->ar_data = GFS2_DATA_ORDERED; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = 1; + break; + case Opt_nodiscard: + args->ar_discard = 0; + break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_statfs_quantum: + rv = match_int(&tmp[0], &args->ar_statfs_quantum); + if (rv || args->ar_statfs_quantum < 0) { + printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_quota_quantum: + rv = match_int(&tmp[0], &args->ar_quota_quantum); + if (rv || args->ar_quota_quantum <= 0) { + printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_statfs_percent: + rv = match_int(&tmp[0], &args->ar_statfs_percent); + if (rv || args->ar_statfs_percent < 0 || + args->ar_statfs_percent > 100) { + printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n"); + return rv ? 
rv : -EINVAL; + } + break; + case Opt_err_withdraw: + args->ar_errors = GFS2_ERRORS_WITHDRAW; + break; + case Opt_err_panic: + if (args->ar_debug) { + printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " + "are mutually exclusive.\n"); + return -EINVAL; + } + args->ar_errors = GFS2_ERRORS_PANIC; + break; + case Opt_barrier: + args->ar_nobarrier = 0; + break; + case Opt_nobarrier: + args->ar_nobarrier = 1; + break; + case Opt_error: + default: + printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); + return -EINVAL; + } + } + + return 0; +} + +/** + * gfs2_jindex_free - Clear all the journal index information + * @sdp: The GFS2 superblock + * + */ + +void gfs2_jindex_free(struct gfs2_sbd *sdp) +{ + struct list_head list, *head; + struct gfs2_jdesc *jd; + struct gfs2_journal_extent *jext; + + spin_lock(&sdp->sd_jindex_spin); + list_add(&list, &sdp->sd_jindex_list); + list_del_init(&sdp->sd_jindex_list); + sdp->sd_journals = 0; + spin_unlock(&sdp->sd_jindex_spin); + + while (!list_empty(&list)) { + jd = list_entry(list.next, struct gfs2_jdesc, jd_list); + head = &jd->extent_list; + while (!list_empty(head)) { + jext = list_entry(head->next, + struct gfs2_journal_extent, + extent_list); + list_del(&jext->extent_list); + kfree(jext); + } + list_del(&jd->jd_list); + iput(jd->jd_inode); + kfree(jd); + } +} + +static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid) +{ + struct gfs2_jdesc *jd; + int found = 0; + + list_for_each_entry(jd, head, jd_list) { + if (jd->jd_jid == jid) { + found = 1; + break; + } + } + + if (!found) + jd = NULL; + + return jd; +} + +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + jd = jdesc_find_i(&sdp->sd_jindex_list, jid); + spin_unlock(&sdp->sd_jindex_spin); + + return jd; +} + +int gfs2_jdesc_check(struct gfs2_jdesc *jd) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + u64 size = i_size_read(jd->jd_inode); + + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) + return -EIO; + + jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; + + if (gfs2_write_alloc_required(ip, 0, size)) { + gfs2_consist_inode(ip); + return -EIO; + } + + return 0; +} + +/** + * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one + * @sdp: the filesystem + * + * Returns: errno + */ + +int gfs2_make_fs_rw(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_holder t_gh; + struct gfs2_log_header_host head; + int error; + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + if (error) + return error; + + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + goto fail; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + gfs2_consist(sdp); + error = -EIO; + goto fail; + } + + /* Initialize some head of the log stuff */ + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + + error = gfs2_quota_init(sdp); + if (error) + goto fail; + + set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + gfs2_glock_dq_uninit(&t_gh); + + return 0; + +fail: + t_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&t_gh); + + return error; +} + +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) +{ + const struct gfs2_statfs_change *str = buf; + + sc->sc_total = be64_to_cpu(str->sc_total); + sc->sc_free 
= be64_to_cpu(str->sc_free); + sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); +} + +static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +{ + struct gfs2_statfs_change *str = buf; + + str->sc_total = cpu_to_be64(sc->sc_total); + str->sc_free = cpu_to_be64(sc->sc_free); + str->sc_dinodes = cpu_to_be64(sc->sc_dinodes); +} + +int gfs2_statfs_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + if (sdp->sd_args.ar_spectator) { + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + } else { + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_m_bh; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + gfs2_statfs_change_in(l_sc, l_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); + } + +out_m_bh: + brelse(m_bh); +out: + gfs2_glock_dq_uninit(&gh); + return 0; +} + +void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes) +{ + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct buffer_head *l_bh; + s64 x, y; + int need_sync = 0; + int error; + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + return; + + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + + spin_lock(&sdp->sd_statfs_spin); + l_sc->sc_total += total; + l_sc->sc_free += free; + l_sc->sc_dinodes += dinodes; + gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); + if (sdp->sd_args.ar_statfs_percent) { + x = 100 * l_sc->sc_free; + y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent; + if (x >= y || x <= -y) + need_sync = 1; + } + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); + if (need_sync) + gfs2_wake_up_statfs(sdp); +} + +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(l_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + spin_unlock(&sdp->sd_statfs_spin); + + gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); +} + +int gfs2_statfs_sync(struct super_block *sb, int type) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct 
gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_holder gh; + struct buffer_head *m_bh, *l_bh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) { + spin_unlock(&sdp->sd_statfs_spin); + goto out_bh; + } + spin_unlock(&sdp->sd_statfs_spin); + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_bh; + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); + if (error) + goto out_bh2; + + update_statfs(sdp, m_bh, l_bh); + sdp->sd_statfs_force_sync = 0; + + gfs2_trans_end(sdp); + +out_bh2: + brelse(l_bh); +out_bh: + brelse(m_bh); +out: + gfs2_glock_dq_uninit(&gh); + return error; +} + +struct lfcc { + struct list_head list; + struct gfs2_holder gh; +}; + +/** + * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all + * journals are clean + * @sdp: the file system + * @state: the state to put the transaction lock into + * @t_gh: the hold on the transaction lock + * + * Returns: errno + */ + +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, + struct gfs2_holder *t_gh) +{ + struct gfs2_inode *ip; + struct gfs2_jdesc *jd; + struct lfcc *lfcc; + LIST_HEAD(list); + struct gfs2_log_header_host lh; + int error; + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); + if (!lfcc) { + error = -ENOMEM; + goto out; + } + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh); + if (error) { + kfree(lfcc); + goto out; + } + list_add(&lfcc->list, &list); + } + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, + GL_NOCACHE, t_gh); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + error = gfs2_jdesc_check(jd); + if (error) + break; + error = gfs2_find_jhead(jd, &lh); + if (error) + break; + if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EBUSY; + break; + } + } + + if (error) + gfs2_glock_dq_uninit(t_gh); + +out: + while (!list_empty(&list)) { + lfcc = list_entry(list.next, struct lfcc, list); + list_del(&lfcc->list); + gfs2_glock_dq_uninit(&lfcc->gh); + kfree(lfcc); + } + return error; +} + +/** + * gfs2_freeze_fs - freezes the file system + * @sdp: the file system + * + * This function flushes data and meta data for all machines by + * acquiring the transaction log exclusively. All journals are + * ensured to be in a clean state as well. + * + * Returns: errno + */ + +int gfs2_freeze_fs(struct gfs2_sbd *sdp) +{ + int error = 0; + + mutex_lock(&sdp->sd_freeze_lock); + + if (!sdp->sd_freeze_count++) { + error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + if (error) + sdp->sd_freeze_count--; + } + + mutex_unlock(&sdp->sd_freeze_lock); + + return error; +} + +/** + * gfs2_unfreeze_fs - unfreezes the file system + * @sdp: the file system + * + * This function allows the file system to proceed by unlocking + * the exclusively held transaction lock. Other GFS2 nodes are + * now free to acquire the lock shared and go on with their lives. 
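gfs2_freeze_fs() and gfs2_unfreeze_fs() above use a mutex-protected counter so that only the first freeze actually takes the transaction lock and only the last unfreeze drops it; nested freezes just bump the count. The following is a self-contained userspace sketch of that counting pattern using POSIX threads primitives, with the real lock acquisition replaced by stub functions.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int freeze_count;

static int take_exclusive_lock(void)  { printf("filesystem frozen\n"); return 0; }
static void drop_exclusive_lock(void) { printf("filesystem thawed\n"); }

/* Only the first caller actually freezes; later callers just nest. */
static int freeze(void)
{
        int error = 0;

        pthread_mutex_lock(&freeze_lock);
        if (!freeze_count++) {
                error = take_exclusive_lock();
                if (error)
                        freeze_count--;         /* undo the count on failure */
        }
        pthread_mutex_unlock(&freeze_lock);
        return error;
}

/* Only the last caller actually thaws. */
static void unfreeze(void)
{
        pthread_mutex_lock(&freeze_lock);
        if (freeze_count && !--freeze_count)
                drop_exclusive_lock();
        pthread_mutex_unlock(&freeze_lock);
}

int main(void)
{
        freeze();       /* prints "filesystem frozen" */
        freeze();       /* nested: no output          */
        unfreeze();     /* still frozen: no output    */
        unfreeze();     /* prints "filesystem thawed" */
        return 0;
}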
+ * + */ + +void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) +{ + mutex_lock(&sdp->sd_freeze_lock); + + if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + + mutex_unlock(&sdp->sd_freeze_lock); +} + +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) +{ + struct gfs2_dinode *str = buf; + + str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); + str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); + str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + str->di_mode = cpu_to_be32(ip->i_inode.i_mode); + str->di_uid = cpu_to_be32(ip->i_inode.i_uid); + str->di_gid = cpu_to_be32(ip->i_inode.i_gid); + str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); + str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); + str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); + str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); + str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); + str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); + + str->di_goal_meta = cpu_to_be64(ip->i_goal); + str->di_goal_data = cpu_to_be64(ip->i_goal); + str->di_generation = cpu_to_be64(ip->i_generation); + + str->di_flags = cpu_to_be32(ip->i_diskflags); + str->di_height = cpu_to_be16(ip->i_height); + str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && + !(ip->i_diskflags & GFS2_DIF_EXHASH) ? + GFS2_FORMAT_DE : 0); + str->di_depth = cpu_to_be16(ip->i_depth); + str->di_entries = cpu_to_be32(ip->i_entries); + + str->di_eattr = cpu_to_be64(ip->i_eattr); + str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); + str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); +} + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @wbc: The writeback control structure + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); + struct backing_dev_info *bdi = metamapping->backing_dev_info; + struct gfs2_holder gh; + struct buffer_head *bh; + struct timespec atime; + struct gfs2_dinode *di; + int ret = -EAGAIN; + int unlock_required = 0; + + /* Skip timestamp update, if this is from a memalloc */ + if (current->flags & PF_MEMALLOC) + goto do_flush; + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + unlock_required = 1; + } + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) + goto do_unlock; + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + di = (struct gfs2_dinode *)bh->b_data; + atime.tv_sec = be64_to_cpu(di->di_atime); + atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); + if (timespec_compare(&inode->i_atime, &atime) > 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + } + brelse(bh); + } + gfs2_trans_end(sdp); +do_unlock: + if (unlock_required) + gfs2_glock_dq_uninit(&gh); +do_flush: + if (wbc->sync_mode == WB_SYNC_ALL) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + filemap_fdatawrite(metamapping); + if (bdi->dirty_exceeded) + gfs2_ail1_flush(sdp, wbc); + if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) + ret = filemap_fdatawait(metamapping); + if (ret) + mark_inode_dirty_sync(inode); + 
return ret; +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder t_gh; + int error; + + flush_workqueue(gfs2_delete_workqueue); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); + gfs2_statfs_sync(sdp->sd_vfs, 0); + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, + &t_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (t_gh.gh_gl) + gfs2_glock_dq_uninit(&t_gh); + + gfs2_quota_cleanup(sdp); + + return error; +} + +static int gfs2_umount_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + struct gfs2_jdesc *jd; + + /* Unfreeze the filesystem, if we need to */ + + mutex_lock(&sdp->sd_freeze_lock); + if (sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + mutex_unlock(&sdp->sd_freeze_lock); + + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_ro(sdp); + if (error) + gfs2_io_error(sdp); + } + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + iput(sdp->sd_jindex); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_trans_gl); + + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + iput(sdp->sd_sc_inode); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); +} + +/** + * gfs2_sync_fs - sync the filesystem + * @sb: the superblock + * + * Flushes the log to disk. 
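gfs2_dinode_out() earlier in super.c, like the *_in/*_out helpers elsewhere in this patch, translates between in-core host-endian structures and the big-endian on-disk format. Below is a portable userspace sketch of one such field conversion, written without the kernel's cpu_to_be64()/be64_to_cpu() helpers; put_be64() and get_be64() are illustrative names, not kernel functions.

#include <stdio.h>
#include <stdint.h>

/* Store a host-endian 64-bit value in big-endian (on-disk) byte order. */
static void put_be64(uint8_t *dst, uint64_t v)
{
        for (int i = 7; i >= 0; i--) {
                dst[i] = (uint8_t)(v & 0xff);
                v >>= 8;
        }
}

/* Read a big-endian 64-bit on-disk value back into host byte order. */
static uint64_t get_be64(const uint8_t *src)
{
        uint64_t v = 0;

        for (int i = 0; i < 8; i++)
                v = (v << 8) | src[i];
        return v;
}

int main(void)
{
        uint8_t disk[8];
        uint64_t inode_size = 0x123456789abcdef0ULL;

        put_be64(disk, inode_size);             /* like cpu_to_be64() on write-out */
        printf("round trip ok: %d\n", get_be64(disk) == inode_size);
        printf("first on-disk byte: 0x%02x\n", disk[0]);  /* 0x12: most significant first */
        return 0;
}

Writing the most significant byte first keeps the on-disk layout identical regardless of the endianness of the host that mounted the filesystem.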
+ */ + +static int gfs2_sync_fs(struct super_block *sb, int wait) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + if (wait && sdp) + gfs2_log_flush(sdp, NULL); + return 0; +} + +/** + * gfs2_freeze - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_freeze(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return -EINVAL; + + for (;;) { + error = gfs2_freeze_fs(sdp); + if (!error) + break; + + switch (error) { + case -EBUSY: + fs_err(sdp, "waiting for recovery before freeze\n"); + break; + + default: + fs_err(sdp, "error freezing FS: %d\n", error); + break; + } + + fs_err(sdp, "retrying...\n"); + msleep(1000); + } + return 0; +} + +/** + * gfs2_unfreeze - reallow writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_unfreeze(struct super_block *sb) +{ + gfs2_unfreeze_fs(sb->s_fs_info); + return 0; +} + +/** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change_host *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_data; + sc->sc_free += rgd->rd_free; + sc->sc_dinodes += rgd->rd_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. + * + * Returns: errno + */ + +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_holder ri_gh; + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + gfs2_glock_dq_uninit(&ri_gh); + +out: + kfree(gha); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if 
(sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** + * gfs2_statfs - Gather and return stats about the filesystem + * @sb: The superblock + * @statfsbuf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change_host sc; + int error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_remount_fs - called when the FS is remounted + * @sb: the filesystem + * @flags: the remount flags + * @data: extra data passed in (not used right now) + * + * Returns: errno + */ + +static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; + int error; + + spin_lock(&gt->gt_spin); + args.ar_commit = gt->gt_logd_secs; + args.ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + args.ar_statfs_quantum = 0; + else + args.ar_statfs_quantum = gt->gt_statfs_quantum; + spin_unlock(&gt->gt_spin); + error = gfs2_mount_args(&args, data); + if (error) + return error; + + /* Not allowed to change locking details */ + if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || + strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || + strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) + return -EINVAL; + + /* Some flags must not be changed */ + if (args_neq(&args, &sdp->sd_args, spectator) || + args_neq(&args, &sdp->sd_args, localflocks) || + args_neq(&args, &sdp->sd_args, meta)) + return -EINVAL; + + if (sdp->sd_args.ar_spectator) + *flags |= MS_RDONLY; + + if ((sb->s_flags ^ *flags) & MS_RDONLY) { + if (*flags & MS_RDONLY) + error = gfs2_make_fs_ro(sdp); + else + error = gfs2_make_fs_rw(sdp); + if (error) + return error; + } + + sdp->sd_args = args; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + else + sb->s_flags &= ~MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); + spin_lock(&gt->gt_spin); + gt->gt_logd_secs = args.ar_commit; + gt->gt_quota_quantum = args.ar_quota_quantum; + if (args.ar_statfs_quantum) { + gt->gt_statfs_slow = 0; + gt->gt_statfs_quantum = args.ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; + } + spin_unlock(&gt->gt_spin); + + gfs2_online_uevent(sdp); + return 0; +} + +/** + * gfs2_drop_inode - Drop an inode (test for remote unlink) + * @inode: The inode to drop + * + * If we've received a callback on an iopen lock then it's because a + remote node tried to deallocate the inode but failed due to this node + still having the inode open. Here we mark the link count zero + since we know that it must have reached zero if the GLF_DEMOTE flag + is set on the iopen glock. If we didn't do a disk read since the + remote node removed the final link then we might otherwise miss + this event.
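+ * (A demote request on a held iopen glock is the only hint this node gets that the last link was removed remotely.)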
This check ensures that this node will deallocate the + * inode's blocks, or alternatively pass the baton on to another + * node for later deallocation. + */ + +static int gfs2_drop_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (inode->i_nlink) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) + clear_nlink(inode); + } + return generic_drop_inode(inode); +} + +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @mnt: vfsmount + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + int val; + + if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) + seq_printf(s, ",meta"); + if (args->ar_lockproto[0]) + seq_printf(s, ",lockproto=%s", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_printf(s, ",locktable=%s", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_printf(s, ",hostdata=%s", args->ar_hostdata); + if (args->ar_spectator) + seq_printf(s, ",spectator"); + if (args->ar_localflocks) + seq_printf(s, ",localflocks"); + if (args->ar_debug) + seq_printf(s, ",debug"); + if (args->ar_posix_acl) + seq_printf(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; + break; + case GFS2_QUOTA_ACCOUNT: + state = "account"; + break; + case GFS2_QUOTA_ON: + state = "on"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_printf(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + if (args->ar_discard) + seq_printf(s, ",discard"); + val = sdp->sd_tune.gt_logd_secs; + if (val != 30) + seq_printf(s, ",commit=%d", val); + val = sdp->sd_tune.gt_statfs_quantum; + if (val != 30) + seq_printf(s, ",statfs_quantum=%d", val); + val = sdp->sd_tune.gt_quota_quantum; + if (val != 60) + seq_printf(s, ",quota_quantum=%d", val); + if (args->ar_statfs_percent) + seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent); + if (args->ar_errors != GFS2_ERRORS_DEFAULT) { + const char *state; + + switch (args->ar_errors) { + case GFS2_ERRORS_WITHDRAW: + state = "withdraw"; + break; + case GFS2_ERRORS_PANIC: + state = "panic"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",errors=%s", state); + } + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + seq_printf(s, ",nobarrier"); + if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) + seq_printf(s, ",demote_interface_used"); + return 0; +} + +static void gfs2_final_release_pages(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_glock *gl = ip->i_gl; + + truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0); + truncate_inode_pages(&inode->i_data, 0); + + if (atomic_read(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + clear_bit(GLF_DIRTY, &gl->gl_flags); + } +} + +static int gfs2_dinode_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = 
GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al; + struct gfs2_rgrpd *rgd; + int error; + + if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { + gfs2_consist_inode(ip); + return -EIO; + } + + al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + if (error) + goto out_qs; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + if (!rgd) { + gfs2_consist_inode(ip); + error = -EIO; + goto out_rindex_relse; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, + &al->al_rgd_gh); + if (error) + goto out_rindex_relse; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, + sdp->sd_jdesc->jd_blocks); + if (error) + goto out_rg_gunlock; + + gfs2_free_di(rgd, ip); + + gfs2_final_release_pages(ip); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_uninit(&al->al_rgd_gh); +out_rindex_relse: + gfs2_glock_dq_uninit(&al->al_ri_gh); +out_qs: + gfs2_quota_unhold(ip); +out: + gfs2_alloc_put(ip); + return error; +} + +/** + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict + * + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 + * + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. 
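+ * + * The body below takes the inode glock in EX (GL_SKIP), re-requests the iopen glock with LM_FLAG_TRY_1CB | GL_NOCACHE, and only then walks the case 1 deallocation paths.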
+ */ + +static void gfs2_evict_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) + goto out; + + /* Must not read inode block until block type has been verified */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); + if (unlikely(error)) { + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + goto out; + } + + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + + if (test_bit(GIF_INVALID, &ip->i_flags)) { + error = gfs2_inode_refresh(ip); + if (error) + goto out_truncate; + } + + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_truncate; + + /* Case 1 starts here */ + + if (S_ISDIR(inode->i_mode) && + (ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = gfs2_dir_exhash_dealloc(ip); + if (error) + goto out_unlock; + } + + if (ip->i_eattr) { + error = gfs2_ea_dealloc(ip); + if (error) + goto out_unlock; + } + + if (!gfs2_is_stuffed(ip)) { + error = gfs2_file_dealloc(ip); + if (error) + goto out_unlock; + } + + error = gfs2_dinode_dealloc(ip); + goto out_unlock; + +out_truncate: + /* Case 2 starts here */ + error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (error) + goto out_unlock; + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); + gfs2_trans_end(sdp); + +out_unlock: + /* Error path for case 1 */ + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&gh); + if (error && error != GLR_TRYFAILED && error != -EROFS) + fs_warn(sdp, "gfs2_evict_inode: %d\n", error); +out: + /* Case 3 starts here */ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + ip->i_gl->gl_object = NULL; + gfs2_glock_add_to_lru(ip->i_gl); + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) { + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } +} + +static struct inode *gfs2_alloc_inode(struct super_block *sb) +{ + struct gfs2_inode *ip; + + ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + if (ip) { + ip->i_flags = 0; + ip->i_gl = NULL; + } + return &ip->i_inode; +} + +static void gfs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(gfs2_inode_cachep, inode); +} + +static void gfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, gfs2_i_callback); +} + +const struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .destroy_inode = gfs2_destroy_inode, + .write_inode = gfs2_write_inode, + .evict_inode = gfs2_evict_inode, + .put_super = gfs2_put_super, + .sync_fs = gfs2_sync_fs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, + .statfs = gfs2_statfs, + .remount_fs = gfs2_remount_fs, + .drop_inode = gfs2_drop_inode, + .show_options = gfs2_show_options, +}; + diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h new file mode 100644 index 00000000..a0464680 --- /dev/null +++ b/fs/gfs2/super.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __SUPER_DOT_H__ +#define __SUPER_DOT_H__ + +#include +#include +#include "incore.h" + +extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); + +static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) +{ + unsigned int x; + spin_lock(&sdp->sd_jindex_spin); + x = sdp->sd_journals; + spin_unlock(&sdp->sd_jindex_spin); + return x; +} + +extern void gfs2_jindex_free(struct gfs2_sbd *sdp); + +extern int gfs2_mount_args(struct gfs2_args *args, char *data); + +extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); + +extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); + +extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern void gfs2_online_uevent(struct gfs2_sbd *sdp); +extern int gfs2_statfs_init(struct gfs2_sbd *sdp); +extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh); +extern int gfs2_statfs_sync(struct super_block *sb, int type); + +extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); +extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); + +extern struct file_system_type gfs2_fs_type; +extern struct file_system_type gfs2meta_fs_type; +extern const struct export_operations gfs2_export_ops; +extern const struct super_operations gfs2_super_ops; +extern const struct dentry_operations gfs2_dops; +extern const struct xattr_handler *gfs2_xattr_handlers[]; + +#endif /* __SUPER_DOT_H__ */ + diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c new file mode 100644 index 00000000..443cabcf --- /dev/null +++ b/fs/gfs2/sys.c @@ -0,0 +1,653 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "sys.h" +#include "super.h" +#include "glock.h" +#include "quota.h" +#include "util.h" +#include "glops.h" +#include "recovery.h" + +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? 
a->store(sdp, buf, len) : len; +} + +static const struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + + +static struct kset *gfs2_kset; + +static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u:%u\n", + MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev)); +} + +static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); +} + +static int gfs2_uuid_valid(const u8 *uuid) +{ + int i; + + for (i = 0; i < 16; i++) { + if (uuid[i]) + return 1; + } + return 0; +} + +static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) +{ + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; + buf[0] = '\0'; + if (!gfs2_uuid_valid(uuid)) + return 0; + return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid); +} + +static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int count; + + mutex_lock(&sdp->sd_freeze_lock); + count = sdp->sd_freeze_count; + mutex_unlock(&sdp->sd_freeze_lock); + + return snprintf(buf, PAGE_SIZE, "%u\n", count); +} + +static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + ssize_t ret = len; + int error = 0; + int n = simple_strtol(buf, NULL, 0); + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + switch (n) { + case 0: + gfs2_unfreeze_fs(sdp); + break; + case 1: + error = gfs2_freeze_fs(sdp); + break; + default: + ret = -EINVAL; + } + + if (error) + fs_warn(sdp, "freeze %d error %d", n, error); + + return ret; +} + +static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags); + return snprintf(buf, PAGE_SIZE, "%u\n", b); +} + +static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: withdrawing from cluster at user's request\n", + sdp->sd_fsname); + return len; +} + +static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_statfs_sync(sdp->sd_vfs, 0); + return len; +} + +static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_quota_sync(sdp->sd_vfs, 0, 1); + return len; +} + +static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + int error; + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + id = simple_strtoul(buf, NULL, 0); + + error = gfs2_quota_refresh(sdp, 1, id); + return error ? error : len; +} + +static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + int error; + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + id = simple_strtoul(buf, NULL, 0); + + error = gfs2_quota_refresh(sdp, 0, id); + return error ? 
error : len; +} + +static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct gfs2_glock *gl; + const struct gfs2_glock_operations *glops; + unsigned int glmode; + unsigned int gltype; + unsigned long long glnum; + char mode[16]; + int rv; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum, + mode); + if (rv != 3) + return -EINVAL; + + if (strcmp(mode, "EX") == 0) + glmode = LM_ST_UNLOCKED; + else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0)) + glmode = LM_ST_DEFERRED; + else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0)) + glmode = LM_ST_SHARED; + else + return -EINVAL; + + if (gltype > LM_TYPE_JOURNAL) + return -EINVAL; + if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK) + glops = &gfs2_trans_glops; + else + glops = gfs2_glops_list[gltype]; + if (glops == NULL) + return -EINVAL; + if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) + fs_info(sdp, "demote interface used\n"); + rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); + if (rv) + return rv; + gfs2_glock_cb(gl, glmode); + gfs2_glock_put(gl); + return len; +} + + +#define GFS2_ATTR(name, mode, show, store) \ +static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) + +GFS2_ATTR(id, 0444, id_show, NULL); +GFS2_ATTR(fsname, 0444, fsname_show, NULL); +GFS2_ATTR(uuid, 0444, uuid_show, NULL); +GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); +GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); +GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); +GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); +GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); +GFS2_ATTR(demote_rq, 0200, NULL, demote_rq_store); + +static struct attribute *gfs2_attrs[] = { + &gfs2_attr_id.attr, + &gfs2_attr_fsname.attr, + &gfs2_attr_uuid.attr, + &gfs2_attr_freeze.attr, + &gfs2_attr_withdraw.attr, + &gfs2_attr_statfs_sync.attr, + &gfs2_attr_quota_sync.attr, + &gfs2_attr_quota_refresh_user.attr, + &gfs2_attr_quota_refresh_group.attr, + &gfs2_attr_demote_rq.attr, + NULL, +}; + +static struct kobj_type gfs2_ktype = { + .default_attrs = gfs2_attrs, + .sysfs_ops = &gfs2_attr_ops, +}; + + +/* + * lock_module. 
Originally from lock_dlm + */ + +static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf) +{ + const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops; + return sprintf(buf, "%s\n", ops->lm_proto_name); +} + +static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret; + int val = 0; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if (val == 1) + set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + else if (val == 0) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + smp_mb__after_clear_bit(); + gfs2_glock_thaw(sdp); + } else { + ret = -EINVAL; + } + return ret; +} + +static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first); +} + +static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned first; + int rv; + + rv = sscanf(buf, "%u", &first); + if (rv != 1 || first > 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + sdp->sd_lockstruct.ls_first = first; + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + +static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first_done); +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + struct gfs2_jdesc *jd; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + rv = -ESHUTDOWN; + spin_lock(&sdp->sd_jindex_spin); + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) + goto out; + rv = -EBUSY; + if (sdp->sd_jdesc->jd_jid == jid) + goto out; + rv = -ENOENT; + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid != jid) + continue; + rv = gfs2_recover_journal(jd, false); + break; + } +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? 
rv : len; +} + +static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_done); +} + +static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_status); +} + +static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid); +} + +static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + int jid; + int rv; + + rv = sscanf(buf, "%d", &jid); + if (rv != 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EINVAL; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = 0; + if (sdp->sd_args.ar_spectator && jid > 0) + rv = jid = -EINVAL; + sdp->sd_lockstruct.ls_jid = jid; + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + +#define GDLM_ATTR(_name,_mode,_show,_store) \ +static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) + +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(jid, 0644, jid_show, jid_store); +GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0600, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); + +static struct attribute *lock_module_attrs[] = { + &gdlm_attr_proto_name.attr, + &gdlm_attr_block.attr, + &gdlm_attr_withdraw.attr, + &gdlm_attr_jid.attr, + &gdlm_attr_first.attr, + &gdlm_attr_first_done.attr, + &gdlm_attr_recover.attr, + &gdlm_attr_recover_done.attr, + &gdlm_attr_recover_status.attr, + NULL, +}; + +/* + * get and set struct gfs2_tune fields + */ + +static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u %u\n", + sdp->sd_tune.gt_quota_scale_num, + sdp->sd_tune.gt_quota_scale_den); +} + +static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x, y; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (sscanf(buf, "%u %u", &x, &y) != 2 || !y) + return -EINVAL; + + spin_lock(&gt->gt_spin); + gt->gt_quota_scale_num = x; + gt->gt_quota_scale_den = y; + spin_unlock(&gt->gt_spin); + return len; +} + +static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, + int check_zero, const char *buf, size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + x = simple_strtoul(buf, NULL, 0); + + if (check_zero && !x) + return -EINVAL; + + spin_lock(&gt->gt_spin); + *field = x; + spin_unlock(&gt->gt_spin); + return len; +} + +#define TUNE_ATTR_3(name, show, store) \ +static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store) + +#define TUNE_ATTR_2(name, store) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name); \ +} \ +TUNE_ATTR_3(name, name##_show,
store) + +#define TUNE_ATTR(name, check_zero) \ +static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ +{ \ + return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \ +} \ +TUNE_ATTR_2(name, name##_store) + +TUNE_ATTR(quota_warn_period, 0); +TUNE_ATTR(quota_quantum, 0); +TUNE_ATTR(max_readahead, 0); +TUNE_ATTR(complain_secs, 0); +TUNE_ATTR(statfs_slow, 0); +TUNE_ATTR(new_files_jdata, 0); +TUNE_ATTR(quota_simul_sync, 1); +TUNE_ATTR(statfs_quantum, 1); +TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); + +static struct attribute *tune_attrs[] = { + &tune_attr_quota_warn_period.attr, + &tune_attr_quota_quantum.attr, + &tune_attr_max_readahead.attr, + &tune_attr_complain_secs.attr, + &tune_attr_statfs_slow.attr, + &tune_attr_quota_simul_sync.attr, + &tune_attr_statfs_quantum.attr, + &tune_attr_quota_scale.attr, + &tune_attr_new_files_jdata.attr, + NULL, +}; + +static struct attribute_group tune_group = { + .name = "tune", + .attrs = tune_attrs, +}; + +static struct attribute_group lock_module_group = { + .name = "lock_module", + .attrs = lock_module_attrs, +}; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + int error; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + + sdp->sd_kobj.kset = gfs2_kset; + error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, + "%s", sdp->sd_table_name); + if (error) + goto fail; + + error = sysfs_create_group(&sdp->sd_kobj, &tune_group); + if (error) + goto fail_reg; + + error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); + if (error) + goto fail_tune; + + error = sysfs_create_link(&sdp->sd_kobj, + &disk_to_dev(sb->s_bdev->bd_disk)->kobj, + "device"); + if (error) + goto fail_lock_module; + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp); + return 0; + +fail_lock_module: + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); +fail_tune: + sysfs_remove_group(&sdp->sd_kobj, &tune_group); +fail_reg: + kobject_put(&sdp->sd_kobj); +fail: + fs_err(sdp, "error %d adding sysfs files", error); + return error; +} + +void gfs2_sys_fs_del(struct gfs2_sbd *sdp) +{ + sysfs_remove_link(&sdp->sd_kobj, "device"); + sysfs_remove_group(&sdp->sd_kobj, &tune_group); + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); + kobject_put(&sdp->sd_kobj); +} + +static int gfs2_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; + + add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); + add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); + if (gfs2_uuid_valid(uuid)) + add_uevent_var(env, "UUID=%pUB", uuid); + return 0; +} + +static const struct kset_uevent_ops gfs2_uevent_ops = { + .uevent = gfs2_uevent, +}; + +int gfs2_sys_init(void) +{ + gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); + if (!gfs2_kset) + return -ENOMEM; + return 0; +} + +void gfs2_sys_uninit(void) +{ + kset_unregister(gfs2_kset); +} + diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h new file mode 100644 index 00000000..e94560e8 --- /dev/null +++ b/fs/gfs2/sys.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) Sistina Software, Inc. 
1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __SYS_DOT_H__ +#define __SYS_DOT_H__ + +#include +struct gfs2_sbd; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp); +void gfs2_sys_fs_del(struct gfs2_sbd *sdp); + +int gfs2_sys_init(void); +void gfs2_sys_uninit(void); + +#endif /* __SYS_DOT_H__ */ + diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h new file mode 100644 index 00000000..5d07609e --- /dev/null +++ b/fs/gfs2/trace_gfs2.h @@ -0,0 +1,438 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gfs2 + +#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_GFS2_H + +#include + +#include +#include +#include +#include +#include +#include "incore.h" +#include "glock.h" + +#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn } +#define glock_trace_name(x) __print_symbolic(x, \ + dlm_state_name(IV), \ + dlm_state_name(NL), \ + dlm_state_name(CR), \ + dlm_state_name(CW), \ + dlm_state_name(PR), \ + dlm_state_name(PW), \ + dlm_state_name(EX)) + +#define block_state_name(x) __print_symbolic(x, \ + { GFS2_BLKST_FREE, "free" }, \ + { GFS2_BLKST_USED, "used" }, \ + { GFS2_BLKST_DINODE, "dinode" }, \ + { GFS2_BLKST_UNLINKED, "unlinked" }) + +#define show_glock_flags(flags) __print_flags(flags, "", \ + {(1UL << GLF_LOCK), "l" }, \ + {(1UL << GLF_DEMOTE), "D" }, \ + {(1UL << GLF_PENDING_DEMOTE), "d" }, \ + {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ + {(1UL << GLF_DIRTY), "y" }, \ + {(1UL << GLF_LFLUSH), "f" }, \ + {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ + {(1UL << GLF_REPLY_PENDING), "r" }, \ + {(1UL << GLF_INITIAL), "I" }, \ + {(1UL << GLF_FROZEN), "F" }, \ + {(1UL << GLF_QUEUED), "q" }, \ + {(1UL << GLF_LRU), "L" }, \ + {(1UL << GLF_OBJECT), "o" }) + +#ifndef NUMPTY +#define NUMPTY +static inline u8 glock_trace_state(unsigned int state) +{ + switch(state) { + case LM_ST_SHARED: + return DLM_LOCK_PR; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + } + return DLM_LOCK_NL; +} +#endif + +/* Section 1 - Locking + * + * Objectives: + * Latency: Remote demote request to state change + * Latency: Local lock request to state change + * Latency: State change to lock grant + * Correctness: Ordering of local lock state vs. I/O requests + * Correctness: Responses to remote demote requests + */ + +/* General glock state change (DLM lock request completes) */ +TRACE_EVENT(gfs2_glock_state_change, + + TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state), + + TP_ARGS(gl, new_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, new_state ) + __field( u8, dmt_state ) + __field( u8, tgt_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->new_state = glock_trace_state(new_state); + __entry->tgt_state = glock_trace_state(gl->gl_target); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? 
(1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %u:%llu state %s to %s tgt:%s dmt:%s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->new_state), + glock_trace_name(__entry->tgt_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags)) +); + +/* State change -> unlocked, glock is being deallocated */ +TRACE_EVENT(gfs2_glock_put, + + TP_PROTO(const struct gfs2_glock *gl), + + TP_ARGS(gl), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %u:%llu state %s => %s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->gltype, (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(DLM_LOCK_IV), + show_glock_flags(__entry->flags)) + +); + +/* Callback (local or remote) requesting lock demotion */ +TRACE_EVENT(gfs2_demote_rq, + + TP_PROTO(const struct gfs2_glock *gl), + + TP_ARGS(gl), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, dmt_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %u:%llu demote %s to %s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags)) + +); + +/* Promotion/grant of a glock */ +TRACE_EVENT(gfs2_promote, + + TP_PROTO(const struct gfs2_holder *gh, int first), + + TP_ARGS(gh, first), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, first ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->first = first; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu promote %s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->first ? "first": "other", + glock_trace_name(__entry->state)) +); + +/* Queue/dequeue a lock request */ +TRACE_EVENT(gfs2_glock_queue, + + TP_PROTO(const struct gfs2_holder *gh, int queue), + + TP_ARGS(gh, queue), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, queue ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->queue = queue; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu %squeue %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->queue ?
"" : "de", + glock_trace_name(__entry->state)) +); + +/* Section 2 - Log/journal + * + * Objectives: + * Latency: Log flush time + * Correctness: pin/unpin vs. disk I/O ordering + * Performance: Log usage stats + */ + +/* Pin/unpin a block in the log */ +TRACE_EVENT(gfs2_pin, + + TP_PROTO(const struct gfs2_bufdata *bd, int pin), + + TP_ARGS(bd, pin), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, pin ) + __field( u32, len ) + __field( sector_t, block ) + __field( u64, ino ) + ), + + TP_fast_assign( + __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev; + __entry->pin = pin; + __entry->len = bd->bd_bh->b_size; + __entry->block = bd->bd_bh->b_blocknr; + __entry->ino = bd->bd_gl->gl_name.ln_number; + ), + + TP_printk("%u,%u log %s %llu/%lu inode %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->pin ? "pin" : "unpin", + (unsigned long long)__entry->block, + (unsigned long)__entry->len, + (unsigned long long)__entry->ino) +); + +/* Flushing the log */ +TRACE_EVENT(gfs2_log_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, int start), + + TP_ARGS(sdp, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( u64, log_seq ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->log_seq = sdp->sd_log_sequence; + ), + + TP_printk("%u,%u log flush %s %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->start ? "start" : "end", + (unsigned long long)__entry->log_seq) +); + +/* Reserving/releasing blocks in the log */ +TRACE_EVENT(gfs2_log_blocks, + + TP_PROTO(const struct gfs2_sbd *sdp, int blocks), + + TP_ARGS(sdp, blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, blocks ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->blocks = blocks; + ), + + TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->blocks) +); + +/* Writing back the AIL */ +TRACE_EVENT(gfs2_ail_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start), + + TP_ARGS(sdp, wbc, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( int, sync_mode ) + __field( long, nr_to_write ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->sync_mode = wbc->sync_mode; + __entry->nr_to_write = wbc->nr_to_write; + ), + + TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->start ? "start" : "end", + __entry->sync_mode == WB_SYNC_ALL ? "all" : "none", + __entry->nr_to_write) +); + +/* Section 3 - bmap + * + * Objectives: + * Latency: Bmap request time + * Performance: Block allocator tracing + * Correctness: Test of disard generation vs. blocks allocated + */ + +/* Map an extent of blocks, possibly a new allocation */ +TRACE_EVENT(gfs2_bmap, + + TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh, + sector_t lblock, int create, int errno), + + TP_ARGS(ip, bh, lblock, create, errno), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, lblock ) + __field( sector_t, pblock ) + __field( u64, inum ) + __field( unsigned long, state ) + __field( u32, len ) + __field( int, create ) + __field( int, errno ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->lblock = lblock; + __entry->pblock = buffer_mapped(bh) ? 
bh->b_blocknr : 0; + __entry->inum = ip->i_no_addr; + __entry->state = bh->b_state; + __entry->len = bh->b_size; + __entry->create = create; + __entry->errno = errno; + ), + + TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->lblock, + (unsigned long)__entry->len, + (unsigned long long)__entry->pblock, + __entry->state, __entry->create ? "create " : "nocreate", + __entry->errno) +); + +/* Keep track of blocks as they are allocated/freed */ +TRACE_EVENT(gfs2_block_alloc, + + TP_PROTO(const struct gfs2_inode *ip, u64 block, unsigned len, + u8 block_state), + + TP_ARGS(ip, block, len, block_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, start ) + __field( u64, inum ) + __field( u32, len ) + __field( u8, block_state ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->start = block; + __entry->inum = ip->i_no_addr; + __entry->len = len; + __entry->block_state = block_state; + ), + + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long)__entry->len, + block_state_name(__entry->block_state)) +); + +#endif /* _TRACE_GFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_gfs2 +#include + diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c new file mode 100644 index 00000000..9ec73a85 --- /dev/null +++ b/fs/gfs2/trans.c @@ -0,0 +1,192 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes) +{ + struct gfs2_trans *tr; + int error; + + BUG_ON(current->journal_info); + BUG_ON(blocks == 0 && revokes == 0); + + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + + tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); + if (!tr) + return -ENOMEM; + + tr->tr_ip = (unsigned long)__builtin_return_address(0); + tr->tr_blocks = blocks; + tr->tr_revokes = revokes; + tr->tr_reserved = 1; + if (blocks) + tr->tr_reserved += 6 + blocks; + if (revokes) + tr->tr_reserved += gfs2_struct2blk(sdp, revokes, + sizeof(u64)); + INIT_LIST_HEAD(&tr->tr_list_buf); + + gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); + + error = gfs2_glock_nq(&tr->tr_t_gh); + if (error) + goto fail_holder_uninit; + + error = gfs2_log_reserve(sdp, tr->tr_reserved); + if (error) + goto fail_gunlock; + + current->journal_info = tr; + + return 0; + +fail_gunlock: + gfs2_glock_dq(&tr->tr_t_gh); + +fail_holder_uninit: + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + + return error; +} + +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + up_read(&sdp->sd_log_flush_lock); +} + +void gfs2_trans_end(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr = current->journal_info; + + BUG_ON(!tr); + current->journal_info = NULL; + + if (!tr->tr_touched) { + gfs2_log_release(sdp, tr->tr_reserved); + if (tr->tr_t_gh.gh_gl) { + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + } + return; + } + + if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) { + fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ", + tr->tr_num_buf, tr->tr_blocks); + print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + } + if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) { + fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ", + tr->tr_num_revoke, tr->tr_revokes); + print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + } + + gfs2_log_commit(sdp, tr); + if (tr->tr_t_gh.gh_gl) { + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + } + + if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) + gfs2_log_flush(sdp, NULL); +} + +/** + * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction + * @gl: the glock the buffer belongs to + * @bh: The buffer to add + * @meta: True in the case of adding metadata + * + */ + +void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_bufdata *bd; + + bd = bh->b_private; + if (bd) + gfs2_assert(sdp, bd->bd_gl == gl); + else { + gfs2_attach_bufdata(gl, bh, meta); + bd = bh->b_private; + } + lops_add(sdp, &bd->bd_le); +} + +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + BUG_ON(!list_empty(&bd->bd_le.le_list)); + BUG_ON(!list_empty(&bd->bd_ail_st_list)); + 
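+ /* a revoked buffer must also already be off the glock's AIL list */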
BUG_ON(!list_empty(&bd->bd_ail_gl_list)); + lops_init_le(&bd->bd_le, &gfs2_revoke_lops); + lops_add(sdp, &bd->bd_le); +} + +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) +{ + struct gfs2_bufdata *bd, *tmp; + struct gfs2_trans *tr = current->journal_info; + unsigned int n = len; + + gfs2_log_lock(sdp); + list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) { + if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { + list_del_init(&bd->bd_le.le_list); + gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); + sdp->sd_log_num_revoke--; + kmem_cache_free(gfs2_bufdata_cachep, bd); + tr->tr_num_revoke_rm++; + if (--n == 0) + break; + } + } + gfs2_log_unlock(sdp); +} + +void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) +{ + lops_add(rgd->rd_sbd, &rgd->rd_le); +} + diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h new file mode 100644 index 00000000..fb56b783 --- /dev/null +++ b/fs/gfs2/trans.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __TRANS_DOT_H__ +#define __TRANS_DOT_H__ + +#include +struct gfs2_sbd; +struct gfs2_rgrpd; +struct gfs2_glock; + +#define RES_DINODE 1 +#define RES_INDIRECT 1 +#define RES_JDATA 1 +#define RES_DATA 1 +#define RES_LEAF 1 +#define RES_RG_HDR 1 +#define RES_RG_BIT 2 +#define RES_EATTR 1 +#define RES_STATFS 1 +#define RES_QUOTA 2 + +/* reserve either the number of blocks to be allocated plus the rg header + * block, or all of the blocks in the rg, whichever is smaller */ +static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) +{ + return (al->al_requested < al->al_rgd->rd_length)? + al->al_requested + 1 : al->al_rgd->rd_length; +} + +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); + +void gfs2_trans_end(struct gfs2_sbd *sdp); + +void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); +void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); + +#endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c new file mode 100644 index 00000000..53511291 --- /dev/null +++ b/fs/gfs2/util.c @@ -0,0 +1,285 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "util.h" + +struct kmem_cache *gfs2_glock_cachep __read_mostly; +struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly; +struct kmem_cache *gfs2_inode_cachep __read_mostly; +struct kmem_cache *gfs2_bufdata_cachep __read_mostly; +struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; +struct kmem_cache *gfs2_quotad_cachep __read_mostly; + +void gfs2_assert_i(struct gfs2_sbd *sdp) +{ + printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", + sdp->sd_fsname); +} + +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + const struct lm_lockops *lm = ls->ls_ops; + va_list args; + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && + test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return 0; + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { + fs_err(sdp, "about to withdraw this file system\n"); + BUG_ON(sdp->sd_args.ar_debug); + + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + + if (lm->lm_unmount) { + fs_err(sdp, "telling LM to unmount\n"); + lm->lm_unmount(sdp); + } + fs_err(sdp, "withdrawn\n"); + dump_stack(); + } + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); + + return -1; +} + +/** + * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + dump_stack(); + return (me) ? 
-1 : -2; +} + +/** + * gfs2_assert_warn_i - Print a message to the console if @assertion is false + * Returns: -1 if we printed something + * -2 if we didn't + */ + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + if (time_before(jiffies, + sdp->sd_last_warning + + gfs2_tune_get(sdp, gt_complain_secs) * HZ)) + return -2; + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) + printk(KERN_WARNING + "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + + if (sdp->sd_args.ar_debug) + BUG(); + else + dump_stack(); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + + sdp->sd_last_warning = jiffies; + + return -1; +} + +/** + * gfs2_consist_i - Flag a filesystem consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, + char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_consist_inode_i - Flag an inode consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: inode = %llu %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: RG = %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)rgd->rd_addr, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, char *file, + unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: invalid metadata block\n" + "GFS2: fsid=%s: bh = %llu (%s)\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, + sdp->sd_fsname, function, file, line); + return (me) ? 
-1 : -2; +} + +/** + * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, const char *function, + char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: invalid metadata block\n" + "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, + sdp->sd_fsname, function, file, line); + return (me) ? -1 : -2; +} + +/** + * gfs2_io_error_i - Flag an I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, + unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: I/O error\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: I/O error\n" + "GFS2: fsid=%s: block = %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, + sdp->sd_fsname, function, file, line); + return rv; +} + +void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, + unsigned int bit, int new_value) +{ + unsigned int c, o, b = bit; + int old_value; + + c = b / (8 * PAGE_SIZE); + b %= 8 * PAGE_SIZE; + o = b / 8; + b %= 8; + + old_value = (bitmap[c][o] & (1 << b)); + gfs2_assert_withdraw(sdp, !old_value != !new_value); + + if (new_value) + bitmap[c][o] |= 1 << b; + else + bitmap[c][o] &= ~(1 << b); +} + diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h new file mode 100644 index 00000000..b432e046 --- /dev/null +++ b/fs/gfs2/util.h @@ -0,0 +1,172 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +#include "incore.h" + +#define fs_printk(level, fs, fmt, arg...) \ + printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) + +#define fs_info(fs, fmt, arg...) \ + fs_printk(KERN_INFO , fs , fmt , ## arg) + +#define fs_warn(fs, fmt, arg...) \ + fs_printk(KERN_WARNING , fs , fmt , ## arg) + +#define fs_err(fs, fmt, arg...) \ + fs_printk(KERN_ERR, fs , fmt , ## arg) + + +void gfs2_assert_i(struct gfs2_sbd *sdp); + +#define gfs2_assert(sdp, assertion) \ +do { \ + if (unlikely(!(assertion))) { \ + gfs2_assert_i(sdp); \ + BUG(); \ + } \ +} while (0) + + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_withdraw(sdp, assertion) \ +((likely(assertion)) ? 
0 : gfs2_assert_withdraw_i((sdp), #assertion, \ + __func__, __FILE__, __LINE__)) + + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_warn(sdp, assertion) \ +((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \ + __func__, __FILE__, __LINE__)) + + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist(sdp) \ +gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__) + + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_inode(ip) \ +gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__) + + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_rgrpd(rgd) \ +gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__) + + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, + char *file, unsigned int line); + +static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp, + struct buffer_head *bh, + const char *function, + char *file, unsigned int line) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + u32 magic = be32_to_cpu(mh->mh_magic); + if (unlikely(magic != GFS2_MAGIC)) + return gfs2_meta_check_ii(sdp, bh, "magic number", function, + file, line); + return 0; +} + +#define gfs2_meta_check(sdp, bh) \ +gfs2_meta_check_i((sdp), (bh), __func__, __FILE__, __LINE__) + + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, + const char *function, + char *file, unsigned int line); + +static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, + struct buffer_head *bh, + u16 type, + const char *function, + char *file, unsigned int line) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + u32 magic = be32_to_cpu(mh->mh_magic); + u16 t = be32_to_cpu(mh->mh_type); + if (unlikely(magic != GFS2_MAGIC)) + return gfs2_meta_check_ii(sdp, bh, "magic number", function, + file, line); + if (unlikely(t != type)) + return gfs2_metatype_check_ii(sdp, bh, type, t, function, + file, line); + return 0; +} + +#define gfs2_metatype_check(sdp, bh, type) \ +gfs2_metatype_check_i((sdp), (bh), (type), __func__, __FILE__, __LINE__) + +static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, + u16 format) +{ + struct gfs2_meta_header *mh; + mh = (struct gfs2_meta_header *)bh->b_data; + mh->mh_type = cpu_to_be32(type); + mh->mh_format = cpu_to_be32(format); +} + + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, + char *file, unsigned int line); + +#define gfs2_io_error(sdp) \ +gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__); + + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line); + +#define gfs2_io_error_bh(sdp, bh) \ +gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__); + + +extern struct kmem_cache *gfs2_glock_cachep; +extern struct kmem_cache *gfs2_glock_aspace_cachep; +extern struct kmem_cache *gfs2_inode_cachep; +extern struct kmem_cache *gfs2_bufdata_cachep; +extern struct kmem_cache *gfs2_rgrpd_cachep; +extern struct kmem_cache *gfs2_quotad_cachep; + +static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, + unsigned int *p) +{ + unsigned int x; + 
spin_lock(&gt->gt_spin); + x = *p; + spin_unlock(&gt->gt_spin); + return x; +} + +#define gfs2_tune_get(sdp, field) \ +gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) + +void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, + unsigned int bit, int new_value); +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); + +#endif /* __UTIL_DOT_H__ */ + diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c new file mode 100644 index 00000000..439b61c0 --- /dev/null +++ b/fs/gfs2/xattr.c @@ -0,0 +1,1549 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * ea_calc_size - returns the actual number of bytes the request will take up + * (not counting any unstuffed data blocks) + * @sdp: + * @er: + * @size: + * + * Returns: 1 if the EA should be stuffed + */ + +static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize, + unsigned int *size) +{ + unsigned int jbsize = sdp->sd_jbsize; + + /* Stuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8); + + if (*size <= jbsize) + return 1; + + /* Unstuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + + (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8); + + return 0; +} + +static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) +{ + unsigned int size; + + if (dsize > GFS2_EA_MAX_DATA_LEN) + return -ERANGE; + + ea_calc_size(sdp, nsize, dsize, &size); + + /* This can only happen with 512 byte blocks */ + if (size > sdp->sd_jbsize) + return -ERANGE; + + return 0; +} + +typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private); + +static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, + ea_call_t ea_call, void *data) +{ + struct gfs2_ea_header *ea, *prev = NULL; + int error = 0; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA)) + return -EIO; + + for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) { + if (!GFS2_EA_REC_LEN(ea)) + goto fail; + if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <= + bh->b_data + bh->b_size)) + goto fail; + if (!GFS2_EATYPE_VALID(ea->ea_type)) + goto fail; + + error = ea_call(ip, bh, ea, prev, data); + if (error) + return error; + + if (GFS2_EA_IS_LAST(ea)) { + if ((char *)GFS2_EA2NEXT(ea) != + bh->b_data + bh->b_size) + goto fail; + break; + } + } + + return error; + +fail: + gfs2_consist_inode(ip); + return -EIO; +} + +static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) +{ + struct buffer_head *bh, *eabh; + __be64 *eablk, *end; + int error; + + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); + if (error) + return error; + + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) { + error = ea_foreach_i(ip, bh, ea_call, data); + goto out; + } + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (__be64
*)(bh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh); + if (error) + break; + error = ea_foreach_i(ip, eabh, ea_call, data); + brelse(eabh); + if (error) + break; + } +out: + brelse(bh); + return error; +} + +struct ea_find { + int type; + const char *name; + size_t namel; + struct gfs2_ea_location *ef_el; +}; + +static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_find *ef = private; + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (ea->ea_type == ef->type) { + if (ea->ea_name_len == ef->namel && + !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) { + struct gfs2_ea_location *el = ef->ef_el; + get_bh(bh); + el->el_bh = bh; + el->el_ea = ea; + el->el_prev = prev; + return 1; + } + } + + return 0; +} + +static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, + struct gfs2_ea_location *el) +{ + struct ea_find ef; + int error; + + ef.type = type; + ef.name = name; + ef.namel = strlen(name); + ef.ef_el = el; + + memset(el, 0, sizeof(struct gfs2_ea_location)); + + error = ea_foreach(ip, ea_find_i, &ef); + if (error > 0) + return 0; + + return error; +} + +/** + * ea_dealloc_unstuffed - + * @ip: + * @bh: + * @ea: + * @prev: + * @private: + * + * Take advantage of the fact that all unstuffed blocks are + * allocated from the same RG. But watch, this may not always + * be true. + * + * Returns: errno + */ + +static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private) +{ + int *leave = private; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_holder rg_gh; + struct buffer_head *dibh; + __be64 *dataptrs; + u64 bn = 0; + u64 bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + if (GFS2_EA_IS_STUFFED(ea)) + return 0; + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { + if (*dataptrs) { + blks++; + bn = be64_to_cpu(*dataptrs); + } + } + if (!blks) + return 0; + + rgd = gfs2_blk2rgrpd(sdp, bn); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE + + RES_EATTR + RES_STATFS + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { + if (!*dataptrs) + break; + bn = be64_to_cpu(*dataptrs); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *dataptrs = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + if (prev && !leave) { + u32 len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else { + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_num_ptrs = 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, 
dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_uninit(&rg_gh); + return error; +} + +static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, int leave) +{ + struct gfs2_alloc *al; + int error; + + al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + if (error) + goto out_quota; + + error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); + + gfs2_glock_dq_uninit(&al->al_ri_gh); + +out_quota: + gfs2_quota_unhold(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + +struct ea_list { + struct gfs2_ea_request *ei_er; + unsigned int ei_size; +}; + +static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) +{ + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + return 5 + ea->ea_name_len + 1; + case GFS2_EATYPE_SYS: + return 7 + ea->ea_name_len + 1; + case GFS2_EATYPE_SECURITY: + return 9 + ea->ea_name_len + 1; + default: + return 0; + } +} + +static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_list *ei = private; + struct gfs2_ea_request *er = ei->ei_er; + unsigned int ea_size = gfs2_ea_strlen(ea); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (er->er_data_len) { + char *prefix = NULL; + unsigned int l = 0; + char c = 0; + + if (ei->ei_size + ea_size > er->er_data_len) + return -ERANGE; + + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + prefix = "user."; + l = 5; + break; + case GFS2_EATYPE_SYS: + prefix = "system."; + l = 7; + break; + case GFS2_EATYPE_SECURITY: + prefix = "security."; + l = 9; + break; + } + + BUG_ON(l == 0); + + memcpy(er->er_data + ei->ei_size, prefix, l); + memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea), + ea->ea_name_len); + memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1); + } + + ei->ei_size += ea_size; + + return 0; +} + +/** + * gfs2_listxattr - List gfs2 extended attributes + * @dentry: The dentry whose inode we are interested in + * @buffer: The buffer to write the results + * @size: The size of the buffer + * + * Returns: actual size of data on success, -errno on error + */ + +ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_ea_request er; + struct gfs2_holder i_gh; + int error; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + if (size) { + er.er_data = buffer; + er.er_data_len = size; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + + if (ip->i_eattr) { + struct ea_list ei = { .ei_er = &er, .ei_size = 0 }; + + error = ea_foreach(ip, ea_list_i, &ei); + if (!error) + error = ei.ei_size; + } + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * ea_get_unstuffed - actually copies the unstuffed data into the + * request buffer + * @ip: The GFS2 inode + * @ea: The extended attribute header structure + * @data: The data to be copied + * + * Returns: errno + */ + +static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head **bh; + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + __be64 
*dataptrs = GFS2_EA2DATAPTRS(ea); + unsigned int x; + int error = 0; + + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); + if (!bh) + return -ENOMEM; + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + bh + x); + if (error) { + while (x--) + brelse(bh[x]); + goto out; + } + dataptrs++; + } + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_wait(sdp, bh[x]); + if (error) { + for (; x < nptrs; x++) + brelse(bh[x]); + goto out; + } + if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { + for (; x < nptrs; x++) + brelse(bh[x]); + error = -EIO; + goto out; + } + + memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header), + (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); + + amount -= sdp->sd_jbsize; + data += sdp->sd_jbsize; + + brelse(bh[x]); + } + +out: + kfree(bh); + return error; +} + +static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data, size_t size) +{ + int ret; + size_t len = GFS2_EA_DATA_LEN(el->el_ea); + if (len > size) + return -ERANGE; + + if (GFS2_EA_IS_STUFFED(el->el_ea)) { + memcpy(data, GFS2_EA2DATA(el->el_ea), len); + return len; + } + ret = ea_get_unstuffed(ip, el->el_ea, data); + if (ret < 0) + return ret; + return len; +} + +int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata) +{ + struct gfs2_ea_location el; + int error; + int len; + char *data; + + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el); + if (error) + return error; + if (!el.el_ea) + goto out; + if (!GFS2_EA_DATA_LEN(el.el_ea)) + goto out; + + len = GFS2_EA_DATA_LEN(el.el_ea); + data = kmalloc(len, GFP_NOFS); + error = -ENOMEM; + if (data == NULL) + goto out; + + error = gfs2_ea_get_copy(ip, &el, data, len); + if (error == 0) + error = len; + *ppdata = data; +out: + brelse(el.el_bh); + return error; +} + +/** + * gfs2_xattr_get - Get a GFS2 extended attribute + * @inode: The inode + * @name: The name of the extended attribute + * @buffer: The buffer to write the result into + * @size: The size of the buffer + * @type: The type of extended attribute + * + * Returns: actual size of data on success, -errno on error + */ +static int gfs2_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_ea_location el; + int error; + + if (!ip->i_eattr) + return -ENODATA; + if (strlen(name) > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; + + error = gfs2_ea_find(ip, type, name, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + if (size) + error = gfs2_ea_get_copy(ip, &el, buffer, size); + else + error = GFS2_EA_DATA_LEN(el.el_ea); + brelse(el.el_bh); + + return error; +} + +/** + * ea_alloc_blk - allocates a new block for extended attributes. 
+ * @ip: A pointer to the inode that's getting extended attributes + * @bhp: Pointer to pointer to a struct buffer_head + * + * Returns: errno + */ + +static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_ea_header *ea; + unsigned int n = 1; + u64 block; + int error; + + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, block, 1); + *bhp = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, *bhp, 1); + gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(*bhp); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + ea->ea_num_ptrs = 0; + + gfs2_add_inode_blocks(&ip->i_inode, 1); + + return 0; +} + +/** + * ea_write - writes the request info to an ea, creating new blocks if + * necessary + * @ip: inode that is being modified + * @ea: the location of the new ea in a block + * @er: the write request + * + * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags + * + * returns : errno + */ + +static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + struct gfs2_ea_request *er) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; + + ea->ea_data_len = cpu_to_be32(er->er_data_len); + ea->ea_name_len = er->er_name_len; + ea->ea_type = er->er_type; + ea->__pad = 0; + + memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len); + + if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) { + ea->ea_num_ptrs = 0; + memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len); + } else { + __be64 *dataptr = GFS2_EA2DATAPTRS(ea); + const char *data = er->er_data; + unsigned int data_len = er->er_data_len; + unsigned int copy; + unsigned int x; + + ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize); + for (x = 0; x < ea->ea_num_ptrs; x++) { + struct buffer_head *bh; + u64 block; + int mh_size = sizeof(struct gfs2_meta_header); + unsigned int n = 1; + + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, block, 1); + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); + + gfs2_add_inode_blocks(&ip->i_inode, 1); + + copy = data_len > sdp->sd_jbsize ? 
sdp->sd_jbsize : + data_len; + memcpy(bh->b_data + mh_size, data, copy); + if (copy < sdp->sd_jbsize) + memset(bh->b_data + mh_size + copy, 0, + sdp->sd_jbsize - copy); + + *dataptr++ = cpu_to_be64(bh->b_blocknr); + data += copy; + data_len -= copy; + + brelse(bh); + } + + gfs2_assert_withdraw(sdp, !data_len); + } + + return 0; +} + +typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip, + struct gfs2_ea_request *er, void *private); + +static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, + unsigned int blks, + ea_skeleton_call_t skeleton_call, void *private) +{ + struct gfs2_alloc *al; + struct buffer_head *dibh; + int error; + + al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_lock_check(ip); + if (error) + goto out; + + al->al_requested = blks; + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), + blks + gfs2_rg_blocks(al) + + RES_DINODE + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipres; + + error = skeleton_call(ip, er, private); + if (error) + goto out_end_trans; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + +out_end_trans: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); +out_ipres: + gfs2_inplace_release(ip); +out_gunlock_q: + gfs2_quota_unlock(ip); +out: + gfs2_alloc_put(ip); + return error; +} + +static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct buffer_head *bh; + int error; + + error = ea_alloc_blk(ip, &bh); + if (error) + return error; + + ip->i_eattr = bh->b_blocknr; + error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); + + brelse(bh); + + return error; +} + +/** + * ea_init - initializes a new eattr block + * @ip: + * @er: + * + * Returns: errno + */ + +static int ea_init(struct gfs2_inode *ip, int type, const char *name, + const void *data, size_t size) +{ + struct gfs2_ea_request er; + unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; + unsigned int blks = 1; + + er.er_type = type; + er.er_name = name; + er.er_name_len = strlen(name); + er.er_data = (void *)data; + er.er_data_len = size; + + if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize) + blks += DIV_ROUND_UP(er.er_data_len, jbsize); + + return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL); +} + +static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) +{ + u32 ea_size = GFS2_EA_SIZE(ea); + struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea + + ea_size); + u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size; + int last = ea->ea_flags & GFS2_EAFLAG_LAST; + + ea->ea_rec_len = cpu_to_be32(ea_size); + ea->ea_flags ^= last; + + new->ea_rec_len = cpu_to_be32(new_size); + new->ea_flags = last; + + return new; +} + +static void ea_set_remove_stuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + u32 len; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + + if (!prev || !GFS2_EA_IS_STUFFED(ea)) { + ea->ea_type = GFS2_EATYPE_UNUSED; + return; + } else if (GFS2_EA2NEXT(prev) != ea) { + prev = GFS2_EA2NEXT(prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea); + } + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; +} + +struct ea_set { + int ea_split; + + 
struct gfs2_ea_request *es_er; + struct gfs2_ea_location *es_el; + + struct buffer_head *es_bh; + struct gfs2_ea_header *es_ea; +}; + +static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct ea_set *es) +{ + struct gfs2_ea_request *er = es->es_er; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + if (es->ea_split) + ea = ea_split_ea(ea); + + ea_write(ip, ea, er); + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); +out: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + return error; +} + +static int ea_set_simple_alloc(struct gfs2_inode *ip, + struct gfs2_ea_request *er, void *private) +{ + struct ea_set *es = private; + struct gfs2_ea_header *ea = es->es_ea; + int error; + + gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); + + if (es->ea_split) + ea = ea_split_ea(ea); + + error = ea_write(ip, ea, er); + if (error) + return error; + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + return 0; +} + +static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_set *es = private; + unsigned int size; + int stuffed; + int error; + + stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len, + es->es_er->er_data_len, &size); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) { + if (GFS2_EA_REC_LEN(ea) < size) + return 0; + if (!GFS2_EA_IS_STUFFED(ea)) { + error = ea_remove_unstuffed(ip, bh, ea, prev, 1); + if (error) + return error; + } + es->ea_split = 0; + } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size) + es->ea_split = 1; + else + return 0; + + if (stuffed) { + error = ea_set_simple_noalloc(ip, bh, ea, es); + if (error) + return error; + } else { + unsigned int blks; + + es->es_bh = bh; + es->es_ea = ea; + blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len, + GFS2_SB(&ip->i_inode)->sd_jbsize); + + error = ea_alloc_skeleton(ip, es->es_er, blks, + ea_set_simple_alloc, es); + if (error) + return error; + } + + return 1; +} + +static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *indbh, *newbh; + __be64 *eablk; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { + __be64 *end; + + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, + &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (__be64 *)(indbh->b_data + mh_size); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) + if (!*eablk) + break; + + if (eablk == end) { + error = -ENOSPC; + goto out; + } + + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + } else { + u64 blk; + unsigned int n = 1; + error = gfs2_alloc_block(ip, &blk, &n); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, blk, 1); + indbh = gfs2_meta_new(ip->i_gl, blk); + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(indbh, mh_size); + + eablk = (__be64 *)(indbh->b_data + mh_size); + *eablk = 
cpu_to_be64(ip->i_eattr); + ip->i_eattr = blk; + ip->i_diskflags |= GFS2_DIF_EA_INDIRECT; + gfs2_add_inode_blocks(&ip->i_inode, 1); + + eablk++; + } + + error = ea_alloc_blk(ip, &newbh); + if (error) + goto out; + + *eablk = cpu_to_be64((u64)newbh->b_blocknr); + error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er); + brelse(newbh); + if (error) + goto out; + + if (private) + ea_set_remove_stuffed(ip, private); + +out: + brelse(indbh); + return error; +} + +static int ea_set_i(struct gfs2_inode *ip, int type, const char *name, + const void *value, size_t size, struct gfs2_ea_location *el) +{ + struct gfs2_ea_request er; + struct ea_set es; + unsigned int blks = 2; + int error; + + er.er_type = type; + er.er_name = name; + er.er_data = (void *)value; + er.er_name_len = strlen(name); + er.er_data_len = size; + + memset(&es, 0, sizeof(struct ea_set)); + es.es_er = &er; + es.es_el = el; + + error = ea_foreach(ip, ea_set_simple, &es); + if (error > 0) + return 0; + if (error) + return error; + + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) + blks++; + if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize) + blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); + + return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el); +} + +static int ea_set_remove_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) { + el->el_prev = GFS2_EA2NEXT(el->el_prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), + GFS2_EA2NEXT(el->el_prev) == el->el_ea); + } + + return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0); +} + +static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + + if (prev) { + u32 len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else { + ea->ea_type = GFS2_EATYPE_UNUSED; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + + return error; +} + +/** + * gfs2_xattr_remove - Remove a GFS2 extended attribute + * @ip: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * + * This is not called directly by the VFS since we use the (common) + * scheme of making a "set with NULL data" mean a remove request. Note + * that this is different from a set with zero length data. 
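+ * The attribute is looked up with gfs2_ea_find(); a stuffed attribute is + * then cleared in place by ea_remove_stuffed(), while an unstuffed one also + * has its data blocks freed by ea_remove_unstuffed().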
+ * + * Returns: 0, or errno on failure + */ + +static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name) +{ + struct gfs2_ea_location el; + int error; + + if (!ip->i_eattr) + return -ENODATA; + + error = gfs2_ea_find(ip, type, name, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + + if (GFS2_EA_IS_STUFFED(el.el_ea)) + error = ea_remove_stuffed(ip, &el); + else + error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0); + + brelse(el.el_bh); + + return error; +} + +/** + * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @ip: The inode + * @name: The name of the extended attribute + * @value: The value of the extended attribute (NULL for remove) + * @size: The size of the @value argument + * @flags: Create or Replace + * @type: The type of the extended attribute + * + * See gfs2_xattr_remove() for details of the removal of xattrs. + * + * Returns: 0 or errno on failure + */ + +int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_ea_location el; + unsigned int namel = strlen(name); + int error; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (namel > GFS2_EA_MAX_NAME_LEN) + return -ERANGE; + + if (value == NULL) + return gfs2_xattr_remove(ip, type, name); + + if (ea_check_size(sdp, namel, size)) + return -ERANGE; + + if (!ip->i_eattr) { + if (flags & XATTR_REPLACE) + return -ENODATA; + return ea_init(ip, type, name, value, size); + } + + error = gfs2_ea_find(ip, type, name, &el); + if (error) + return error; + + if (el.el_ea) { + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { + brelse(el.el_bh); + return -EPERM; + } + + error = -EEXIST; + if (!(flags & XATTR_CREATE)) { + int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); + error = ea_set_i(ip, type, name, value, size, &el); + if (!error && unstuffed) + ea_set_remove_unstuffed(ip, &el); + } + + brelse(el.el_bh); + return error; + } + + error = -ENODATA; + if (!(flags & XATTR_REPLACE)) + error = ea_set_i(ip, type, name, value, size, NULL); + + return error; +} + +static int gfs2_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + return __gfs2_xattr_set(dentry->d_inode, name, value, + size, flags, type); +} + +static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_header *ea, char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head **bh; + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); + unsigned int x; + int error; + + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); + if (!bh) + return -ENOMEM; + + error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); + if (error) + goto out; + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + bh + x); + if (error) { + while (x--) + brelse(bh[x]); + goto fail; + } + dataptrs++; + } + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_wait(sdp, bh[x]); + if (error) { + for (; x < nptrs; x++) + brelse(bh[x]); + goto fail; + } + if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { + for (; x < nptrs; x++) + brelse(bh[x]); + error = -EIO; + goto fail; + } + + gfs2_trans_add_bh(ip->i_gl, bh[x], 1); + + memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data, + (sdp->sd_jbsize > amount) ? 
amount : sdp->sd_jbsize); + + amount -= sdp->sd_jbsize; + data += sdp->sd_jbsize; + + brelse(bh[x]); + } + +out: + kfree(bh); + return error; + +fail: + gfs2_trans_end(sdp); + kfree(bh); + return error; +} + +int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_ea_location el; + int error; + + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); + if (error) + return error; + + if (GFS2_EA_IS_STUFFED(el.el_ea)) { + error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); + if (error == 0) { + gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); + memcpy(GFS2_EA2DATA(el.el_ea), data, + GFS2_EA_DATA_LEN(el.el_ea)); + } + } else { + error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); + } + + brelse(el.el_bh); + if (error) + return error; + + error = gfs2_setattr_simple(ip, attr); + gfs2_trans_end(sdp); + return error; +} + +static int ea_dealloc_indirect(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + struct buffer_head *indbh, *dibh; + __be64 *eablk, *end; + unsigned int rg_blocks = 0; + u64 bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + bstart = bn; + blen = 1; + } + blks++; + } + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + else + goto out; + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist_free; + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + + RES_STATFS + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + + eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + bstart = 0; + blen = 0; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *eablk = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist_free: + gfs2_rlist_free(&rlist); +out: + brelse(indbh); + return error; +} + +static int ea_dealloc_block(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_rgrpd *rgd; + struct buffer_head *dibh; + int error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = 
gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, + &al->al_rgd_gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS + + RES_QUOTA, 1); + if (error) + goto out_gunlock; + + gfs2_free_meta(ip, ip->i_eattr, 1); + + ip->i_eattr = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_uninit(&al->al_rgd_gh); + return error; +} + +/** + * gfs2_ea_dealloc - deallocate the extended attribute fork + * @ip: the inode + * + * Returns: errno + */ + +int gfs2_ea_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_alloc *al; + int error; + + al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + if (error) + goto out_quota; + + error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); + if (error) + goto out_rindex; + + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { + error = ea_dealloc_indirect(ip); + if (error) + goto out_rindex; + } + + error = ea_dealloc_block(ip); + +out_rindex: + gfs2_glock_dq_uninit(&al->al_ri_gh); +out_quota: + gfs2_quota_unhold(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + +static const struct xattr_handler gfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = GFS2_EATYPE_USR, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +static const struct xattr_handler gfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = GFS2_EATYPE_SECURITY, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +const struct xattr_handler *gfs2_xattr_handlers[] = { + &gfs2_xattr_user_handler, + &gfs2_xattr_security_handler, + &gfs2_xattr_system_handler, + NULL, +}; + diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h new file mode 100644 index 00000000..d392f835 --- /dev/null +++ b/fs/gfs2/xattr.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __EATTR_DOT_H__ +#define __EATTR_DOT_H__ + +struct gfs2_inode; +struct iattr; + +#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len) +#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len) + +#define GFS2_EA_SIZE(ea) \ +ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ + ((GFS2_EA_IS_STUFFED(ea)) ? 
GFS2_EA_DATA_LEN(ea) : \ + (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) + +#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) +#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) + +#define GFS2_EAREQ_SIZE_STUFFED(er) \ +ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) + +#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) +#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) + +#define GFS2_EA2DATAPTRS(ea) \ +((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8))) + +#define GFS2_EA2NEXT(ea) \ +((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea))) + +#define GFS2_EA_BH2FIRST(bh) \ +((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) + +struct gfs2_ea_request { + const char *er_name; + char *er_data; + unsigned int er_name_len; + unsigned int er_data_len; + unsigned int er_type; /* GFS2_EATYPE_... */ +}; + +struct gfs2_ea_location { + struct buffer_head *el_bh; + struct gfs2_ea_header *el_ea; + struct gfs2_ea_header *el_prev; +}; + +extern int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type); +extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); +extern int gfs2_ea_dealloc(struct gfs2_inode *ip); + +/* Exported to acl.c */ + +extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); +extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data); + +#endif /* __EATTR_DOT_H__ */ diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig new file mode 100644 index 00000000..b77c5bc2 --- /dev/null +++ b/fs/hfs/Kconfig @@ -0,0 +1,12 @@ +config HFS_FS + tristate "Apple Macintosh file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select NLS + help + If you say Y here, you will be able to mount Macintosh-formatted + floppy disks and hard drive partitions with full read-write access. + Please read to learn about + the available mount options. + + To compile this file system support as a module, choose M here: the + module will be called hfs. diff --git a/fs/hfs/Makefile b/fs/hfs/Makefile new file mode 100644 index 00000000..c41f5a85 --- /dev/null +++ b/fs/hfs/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux hfs filesystem routines. 
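+# The hfs.o module is linked from the objects listed in hfs-objs below and is +# built only when CONFIG_HFS_FS is enabled (built in or as a module).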
+# + +obj-$(CONFIG_HFS_FS) += hfs.o + +hfs-objs := bitmap.o bfind.o bnode.o brec.o btree.o \ + catalog.o dir.o extent.o inode.o attr.o mdb.o \ + part_tbl.o string.o super.o sysdep.o trans.o + diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c new file mode 100644 index 00000000..e057ec54 --- /dev/null +++ b/fs/hfs/attr.c @@ -0,0 +1,121 @@ +/* + * linux/fs/hfs/attr.c + * + * (C) 2003 Ardis Technologies + * + * Export hfs data via xattr + */ + + +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +int hfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfs_cat_rec rec; + struct hfs_cat_file *file; + int res; + + if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + res = hfs_find_init(HFS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + fd.search_key->cat = HFS_I(inode)->cat_key; + res = hfs_brec_find(&fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + file = &rec.file; + + if (!strcmp(name, "hfs.type")) { + if (size == 4) + memcpy(&file->UsrWds.fdType, value, 4); + else + res = -ERANGE; + } else if (!strcmp(name, "hfs.creator")) { + if (size == 4) + memcpy(&file->UsrWds.fdCreator, value, 4); + else + res = -ERANGE; + } else + res = -EOPNOTSUPP; + if (!res) + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); +out: + hfs_find_exit(&fd); + return res; +} + +ssize_t hfs_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfs_cat_rec rec; + struct hfs_cat_file *file; + ssize_t res = 0; + + if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (size) { + res = hfs_find_init(HFS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + fd.search_key->cat = HFS_I(inode)->cat_key; + res = hfs_brec_find(&fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + } + file = &rec.file; + + if (!strcmp(name, "hfs.type")) { + if (size >= 4) { + memcpy(value, &file->UsrWds.fdType, 4); + res = 4; + } else + res = size ? -ERANGE : 4; + } else if (!strcmp(name, "hfs.creator")) { + if (size >= 4) { + memcpy(value, &file->UsrWds.fdCreator, 4); + res = 4; + } else + res = size ? 
-ERANGE : 4; + } else + res = -ENODATA; +out: + if (size) + hfs_find_exit(&fd); + return res; +} + +#define HFS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type")) + +ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + + if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (!buffer || !size) + return HFS_ATTRLIST_SIZE; + if (size < HFS_ATTRLIST_SIZE) + return -ERANGE; + strcpy(buffer, "hfs.type"); + strcpy(buffer + sizeof("hfs.type"), "hfs.creator"); + + return HFS_ATTRLIST_SIZE; +} diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c new file mode 100644 index 00000000..571abe97 --- /dev/null +++ b/fs/hfs/bfind.c @@ -0,0 +1,222 @@ +/* + * linux/fs/hfs/bfind.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Search routines for btrees + */ + +#include +#include "btree.h" + +int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) +{ + void *ptr; + + fd->tree = tree; + fd->bnode = NULL; + ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + fd->search_key = ptr; + fd->key = ptr + tree->max_key_len + 2; + dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); + mutex_lock(&tree->tree_lock); + return 0; +} + +void hfs_find_exit(struct hfs_find_data *fd) +{ + hfs_bnode_put(fd->bnode); + kfree(fd->search_key); + dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); + mutex_unlock(&fd->tree->tree_lock); + fd->tree = NULL; +} + +/* Find the record in bnode that best matches key (not greater than...)*/ +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) +{ + int cmpval; + u16 off, len, keylen; + int rec; + int b, e; + int res; + + b = 0; + e = bnode->num_recs - 1; + res = -ENOENT; + do { + rec = (e + b) / 2; + len = hfs_brec_lenoff(bnode, rec, &off); + keylen = hfs_brec_keylen(bnode, rec); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + cmpval = bnode->tree->keycmp(fd->key, fd->search_key); + if (!cmpval) { + e = rec; + res = 0; + goto done; + } + if (cmpval < 0) + b = rec + 1; + else + e = rec - 1; + } while (b <= e); + if (rec != e && e >= 0) { + len = hfs_brec_lenoff(bnode, e, &off); + keylen = hfs_brec_keylen(bnode, e); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + } +done: + fd->record = e; + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; +fail: + return res; +} + +/* Traverse a B*Tree from the root to a leaf finding best fit to key */ +/* Return allocated copy of node found, set recnum to best record */ +int hfs_brec_find(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + u32 nidx, parent; + __be32 data; + int height, res; + + tree = fd->tree; + if (fd->bnode) + hfs_bnode_put(fd->bnode); + fd->bnode = NULL; + nidx = tree->root; + if (!nidx) + return -ENOENT; + height = tree->depth; + res = 0; + parent = 0; + for (;;) { + bnode = hfs_bnode_find(tree, nidx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + break; + } + if (bnode->height != height) + goto invalid; + if (bnode->type != (--height ? 
HFS_NODE_INDEX : HFS_NODE_LEAF)) + goto invalid; + bnode->parent = parent; + + res = __hfs_brec_find(bnode, fd); + if (!height) + break; + if (fd->record < 0) + goto release; + + parent = nidx; + hfs_bnode_read(bnode, &data, fd->entryoffset, 4); + nidx = be32_to_cpu(data); + hfs_bnode_put(bnode); + } + fd->bnode = bnode; + return res; + +invalid: + printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + height, bnode->height, bnode->type, nidx, parent); + res = -EIO; +release: + hfs_bnode_put(bnode); + return res; +} + +int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) +{ + int res; + + res = hfs_brec_find(fd); + if (res) + return res; + if (fd->entrylength > rec_len) + return -EINVAL; + hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); + return 0; +} + +int hfs_brec_goto(struct hfs_find_data *fd, int cnt) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + int idx, res = 0; + u16 off, len, keylen; + + bnode = fd->bnode; + tree = bnode->tree; + + if (cnt < 0) { + cnt = -cnt; + while (cnt > fd->record) { + cnt -= fd->record + 1; + fd->record = bnode->num_recs - 1; + idx = bnode->prev; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record -= cnt; + } else { + while (cnt >= bnode->num_recs - fd->record) { + cnt -= bnode->num_recs - fd->record; + fd->record = 0; + idx = bnode->next; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record += cnt; + } + + len = hfs_brec_lenoff(bnode, fd->record, &off); + keylen = hfs_brec_keylen(bnode, fd->record); + if (keylen == 0) { + res = -EINVAL; + goto out; + } + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; + hfs_bnode_read(bnode, fd->key, off, keylen); +out: + fd->bnode = bnode; + return res; +} diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c new file mode 100644 index 00000000..c6e97366 --- /dev/null +++ b/fs/hfs/bitmap.c @@ -0,0 +1,243 @@ +/* + * linux/fs/hfs/bitmap.c + * + * Copyright (C) 1996-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * Based on GPLed code Copyright (C) 1995 Michael Dreher + * + * This file contains the code to modify the volume bitmap: + * search/set/clear bits. + */ + +#include "hfs_fs.h" + +/* + * hfs_find_zero_bit() + * + * Description: + * Given a block of memory, its length in bits, and a starting bit number, + * determine the number of the first zero bits (in left-to-right ordering) + * in that range. + * + * Returns >= 'size' if no zero bits are found in the range. + * + * Accesses memory in 32-bit aligned chunks of 32-bits and thus + * may read beyond the 'size'th bit. 
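+ * + * Besides locating the first run of zero bits at or after the starting bit, + * this helper also sets the bits it finds and updates *max to the number of + * bits actually claimed, which may be fewer than requested.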
+ */ +static u32 hfs_find_set_zero_bits(__be32 *bitmap, u32 size, u32 offset, u32 *max) +{ + __be32 *curr, *end; + u32 mask, start, len, n; + __be32 val; + int i; + + len = *max; + if (!len) + return size; + + curr = bitmap + (offset / 32); + end = bitmap + ((size + 31) / 32); + + /* scan the first partial u32 for zero bits */ + val = *curr; + if (~val) { + n = be32_to_cpu(val); + i = offset % 32; + mask = (1U << 31) >> i; + for (; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + + /* scan complete u32s for the first zero bit */ + while (++curr < end) { + val = *curr; + if (~val) { + n = be32_to_cpu(val); + mask = 1 << 31; + for (i = 0; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + } + return size; + +found: + start = (curr - bitmap) * 32 + i; + if (start >= size) + return start; + /* do any partial u32 at the start */ + len = min(size - start, len); + while (1) { + n |= mask; + if (++i >= 32) + break; + mask >>= 1; + if (!--len || n & mask) + goto done; + } + if (!--len) + goto done; + *curr++ = cpu_to_be32(n); + /* do full u32s */ + while (1) { + n = be32_to_cpu(*curr); + if (len < 32) + break; + if (n) { + len = 32; + break; + } + *curr++ = cpu_to_be32(0xffffffff); + len -= 32; + } + /* do any partial u32 at end */ + mask = 1U << 31; + for (i = 0; i < len; i++) { + if (n & mask) + break; + n |= mask; + mask >>= 1; + } +done: + *curr = cpu_to_be32(n); + *max = (curr - bitmap) * 32 + i - start; + return start; +} + +/* + * hfs_vbm_search_free() + * + * Description: + * Search for 'num_bits' consecutive cleared bits in the bitmap blocks of + * the hfs MDB. 'mdb' had better be locked or the returned range + * may be no longer free, when this functions returns! + * XXX Currently the search starts from bit 0, but it should start with + * the bit number stored in 's_alloc_ptr' of the MDB. + * Input Variable(s): + * struct hfs_mdb *mdb: Pointer to the hfs MDB + * u16 *num_bits: Pointer to the number of cleared bits + * to search for + * Output Variable(s): + * u16 *num_bits: The number of consecutive clear bits of the + * returned range. If the bitmap is fragmented, this will be less than + * requested and it will be zero, when the disk is full. + * Returns: + * The number of the first bit of the range of cleared bits which has been + * found. When 'num_bits' is zero, this is invalid! + * Preconditions: + * 'mdb' points to a "valid" (struct hfs_mdb). + * 'num_bits' points to a variable of type (u16), which contains + * the number of cleared bits to find. + * Postconditions: + * 'num_bits' is set to the length of the found sequence. 
+ */ +u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) +{ + void *bitmap; + u32 pos; + + /* make sure we have actual work to perform */ + if (!*num_bits) + return 0; + + mutex_lock(&HFS_SB(sb)->bitmap_lock); + bitmap = HFS_SB(sb)->bitmap; + + pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits); + if (pos >= HFS_SB(sb)->fs_ablocks) { + if (goal) + pos = hfs_find_set_zero_bits(bitmap, goal, 0, num_bits); + if (pos >= HFS_SB(sb)->fs_ablocks) { + *num_bits = pos = 0; + goto out; + } + } + + dprint(DBG_BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + HFS_SB(sb)->free_ablocks -= *num_bits; + hfs_bitmap_dirty(sb); +out: + mutex_unlock(&HFS_SB(sb)->bitmap_lock); + return pos; +} + + +/* + * hfs_clear_vbm_bits() + * + * Description: + * Clear the requested bits in the volume bitmap of the hfs filesystem + * Input Variable(s): + * struct hfs_mdb *mdb: Pointer to the hfs MDB + * u16 start: The offset of the first bit + * u16 count: The number of bits + * Output Variable(s): + * None + * Returns: + * 0: no error + * -1: One of the bits was already clear. This is a strange + * error and when it happens, the filesystem must be repaired! + * -2: One or more of the bits are out of range of the bitmap. + * Preconditions: + * 'mdb' points to a "valid" (struct hfs_mdb). + * Postconditions: + * Starting with bit number 'start', 'count' bits in the volume bitmap + * are cleared. The affected bitmap blocks are marked "dirty", the free + * block count of the MDB is updated and the MDB is marked dirty. + */ +int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) +{ + __be32 *curr; + u32 mask; + int i, len; + + /* is there any actual work to be done? */ + if (!count) + return 0; + + dprint(DBG_BITMAP, "clear_bits: %u,%u\n", start, count); + /* are all of the bits in range? */ + if ((start + count) > HFS_SB(sb)->fs_ablocks) + return -2; + + mutex_lock(&HFS_SB(sb)->bitmap_lock); + /* bitmap is always on a 32-bit boundary */ + curr = HFS_SB(sb)->bitmap + (start / 32); + len = count; + + /* do any partial u32 at the start */ + i = start % 32; + if (i) { + int j = 32 - i; + mask = 0xffffffffU << j; + if (j > count) { + mask |= 0xffffffffU >> (i + count); + *curr &= cpu_to_be32(mask); + goto out; + } + *curr++ &= cpu_to_be32(mask); + count -= j; + } + + /* do full u32s */ + while (count >= 32) { + *curr++ = 0; + count -= 32; + } + /* do any partial u32 at end */ + if (count) { + mask = 0xffffffffU >> count; + *curr &= cpu_to_be32(mask); + } +out: + HFS_SB(sb)->free_ablocks += len; + mutex_unlock(&HFS_SB(sb)->bitmap_lock); + hfs_bitmap_dirty(sb); + + return 0; +} diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c new file mode 100644 index 00000000..cdb41a1f --- /dev/null +++ b/fs/hfs/bnode.c @@ -0,0 +1,478 @@ +/* + * linux/fs/hfs/bnode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle basic btree node operations + */ + +#include +#include +#include + +#include "btree.h" + +void hfs_bnode_read(struct hfs_bnode *node, void *buf, + int off, int len) +{ + struct page *page; + + off += node->page_offset; + page = node->page[0]; + + memcpy(buf, kmap(page) + off, len); + kunmap(page); +} + +u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) +{ + __be16 data; + // optimize later... + hfs_bnode_read(node, &data, off, 2); + return be16_to_cpu(data); +} + +u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) +{ + u8 data; + // optimize later... 
+ hfs_bnode_read(node, &data, off, 1); + return data; +} + +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) +{ + struct hfs_btree *tree; + int key_len; + + tree = node->tree; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = hfs_bnode_read_u8(node, off) + 1; + else + key_len = tree->max_key_len + 1; + + hfs_bnode_read(node, key, off, key_len); +} + +void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) +{ + struct page *page; + + off += node->page_offset; + page = node->page[0]; + + memcpy(kmap(page) + off, buf, len); + kunmap(page); + set_page_dirty(page); +} + +void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) +{ + __be16 v = cpu_to_be16(data); + // optimize later... + hfs_bnode_write(node, &v, off, 2); +} + +void hfs_bnode_write_u8(struct hfs_bnode *node, int off, u8 data) +{ + // optimize later... + hfs_bnode_write(node, &data, off, 1); +} + +void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) +{ + struct page *page; + + off += node->page_offset; + page = node->page[0]; + + memset(kmap(page) + off, 0, len); + kunmap(page); + set_page_dirty(page); +} + +void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, + struct hfs_bnode *src_node, int src, int len) +{ + struct hfs_btree *tree; + struct page *src_page, *dst_page; + + dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + tree = src_node->tree; + src += src_node->page_offset; + dst += dst_node->page_offset; + src_page = src_node->page[0]; + dst_page = dst_node->page[0]; + + memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len); + kunmap(src_page); + kunmap(dst_page); + set_page_dirty(dst_page); +} + +void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) +{ + struct page *page; + void *ptr; + + dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + src += node->page_offset; + dst += node->page_offset; + page = node->page[0]; + ptr = kmap(page); + memmove(ptr + dst, ptr + src, len); + kunmap(page); + set_page_dirty(page); +} + +void hfs_bnode_dump(struct hfs_bnode *node) +{ + struct hfs_bnode_desc desc; + __be32 cnid; + int i, off, key_off; + + dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_bnode_read(node, &desc, 0, sizeof(desc)); + dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + be32_to_cpu(desc.next), be32_to_cpu(desc.prev), + desc.type, desc.height, be16_to_cpu(desc.num_recs)); + + off = node->tree->node_size - 2; + for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { + key_off = hfs_bnode_read_u16(node, off); + dprint(DBG_BNODE_MOD, " %d", key_off); + if (i && node->type == HFS_NODE_INDEX) { + int tmp; + + if (node->tree->attributes & HFS_TREE_VARIDXKEYS) + tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; + else + tmp = node->tree->max_key_len + 1; + dprint(DBG_BNODE_MOD, " (%d,%d", tmp, hfs_bnode_read_u8(node, key_off)); + hfs_bnode_read(node, &cnid, key_off + tmp, 4); + dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + } else if (i && node->type == HFS_NODE_LEAF) { + int tmp; + + tmp = hfs_bnode_read_u8(node, key_off); + dprint(DBG_BNODE_MOD, " (%d)", tmp); + } + } + dprint(DBG_BNODE_MOD, "\n"); +} + +void hfs_bnode_unlink(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct hfs_bnode *tmp; + __be32 cnid; + + tree = node->tree; + if (node->prev) { + tmp = hfs_bnode_find(tree, node->prev); + if (IS_ERR(tmp)) + return; + tmp->next = node->next; + cnid = cpu_to_be32(tmp->next); + hfs_bnode_write(tmp, &cnid, 
offsetof(struct hfs_bnode_desc, next), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_head = node->next; + + if (node->next) { + tmp = hfs_bnode_find(tree, node->next); + if (IS_ERR(tmp)) + return; + tmp->prev = node->prev; + cnid = cpu_to_be32(tmp->prev); + hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_tail = node->prev; + + // move down? + if (!node->prev && !node->next) { + printk(KERN_DEBUG "hfs_btree_del_level\n"); + } + if (!node->parent) { + tree->root = 0; + tree->depth = 0; + } + set_bit(HFS_BNODE_DELETED, &node->flags); +} + +static inline int hfs_bnode_hash(u32 num) +{ + num = (num >> 16) + num; + num += num >> 8; + return num & (NODE_HASH_SIZE - 1); +} + +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) +{ + struct hfs_bnode *node; + + if (cnid >= tree->node_count) { + printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + return NULL; + } + + for (node = tree->node_hash[hfs_bnode_hash(cnid)]; + node; node = node->next_hash) { + if (node->this == cnid) { + return node; + } + } + return NULL; +} + +static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) +{ + struct super_block *sb; + struct hfs_bnode *node, *node2; + struct address_space *mapping; + struct page *page; + int size, block, i, hash; + loff_t off; + + if (cnid >= tree->node_count) { + printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + return NULL; + } + + sb = tree->inode->i_sb; + size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * + sizeof(struct page *); + node = kzalloc(size, GFP_KERNEL); + if (!node) + return NULL; + node->tree = tree; + node->this = cnid; + set_bit(HFS_BNODE_NEW, &node->flags); + atomic_set(&node->refcnt, 1); + dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); + init_waitqueue_head(&node->lock_wq); + spin_lock(&tree->hash_lock); + node2 = hfs_bnode_findhash(tree, cnid); + if (!node2) { + hash = hfs_bnode_hash(cnid); + node->next_hash = tree->node_hash[hash]; + tree->node_hash[hash] = node; + tree->node_hash_cnt++; + } else { + spin_unlock(&tree->hash_lock); + kfree(node); + wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); + return node2; + } + spin_unlock(&tree->hash_lock); + + mapping = tree->inode->i_mapping; + off = (loff_t)cnid * tree->node_size; + block = off >> PAGE_CACHE_SHIFT; + node->page_offset = off & ~PAGE_CACHE_MASK; + for (i = 0; i < tree->pages_per_bnode; i++) { + page = read_mapping_page(mapping, block++, NULL); + if (IS_ERR(page)) + goto fail; + if (PageError(page)) { + page_cache_release(page); + goto fail; + } + page_cache_release(page); + node->page[i] = page; + } + + return node; +fail: + set_bit(HFS_BNODE_ERROR, &node->flags); + return node; +} + +void hfs_bnode_unhash(struct hfs_bnode *node) +{ + struct hfs_bnode **p; + + dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; + *p && *p != node; p = &(*p)->next_hash) + ; + BUG_ON(!*p); + *p = node->next_hash; + node->tree->node_hash_cnt--; +} + +/* Load a particular node out of a tree */ +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct hfs_bnode_desc *desc; + int i, rec_off, off, next_off; + int entry_size, key_size; + + spin_lock(&tree->hash_lock); + node = 
hfs_bnode_findhash(tree, num); + if (node) { + hfs_bnode_get(node); + spin_unlock(&tree->hash_lock); + wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags)); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + return node; + } + spin_unlock(&tree->hash_lock); + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + if (!test_bit(HFS_BNODE_NEW, &node->flags)) + return node; + + desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + node->prev = be32_to_cpu(desc->prev); + node->next = be32_to_cpu(desc->next); + node->num_recs = be16_to_cpu(desc->num_recs); + node->type = desc->type; + node->height = desc->height; + kunmap(node->page[0]); + + switch (node->type) { + case HFS_NODE_HEADER: + case HFS_NODE_MAP: + if (node->height != 0) + goto node_error; + break; + case HFS_NODE_LEAF: + if (node->height != 1) + goto node_error; + break; + case HFS_NODE_INDEX: + if (node->height <= 1 || node->height > tree->depth) + goto node_error; + break; + default: + goto node_error; + } + + rec_off = tree->node_size - 2; + off = hfs_bnode_read_u16(node, rec_off); + if (off != sizeof(struct hfs_bnode_desc)) + goto node_error; + for (i = 1; i <= node->num_recs; off = next_off, i++) { + rec_off -= 2; + next_off = hfs_bnode_read_u16(node, rec_off); + if (next_off <= off || + next_off > tree->node_size || + next_off & 1) + goto node_error; + entry_size = next_off - off; + if (node->type != HFS_NODE_INDEX && + node->type != HFS_NODE_LEAF) + continue; + key_size = hfs_bnode_read_u8(node, off) + 1; + if (key_size >= entry_size /*|| key_size & 1*/) + goto node_error; + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + return node; + +node_error: + set_bit(HFS_BNODE_ERROR, &node->flags); + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + hfs_bnode_put(node); + return ERR_PTR(-EIO); +} + +void hfs_bnode_free(struct hfs_bnode *node) +{ + //int i; + + //for (i = 0; i < node->tree->pages_per_bnode; i++) + // if (node->page[i]) + // page_cache_release(node->page[i]); + kfree(node); +} + +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct page **pagep; + int i; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + spin_unlock(&tree->hash_lock); + BUG_ON(node); + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) { + hfs_bnode_put(node); + return ERR_PTR(-EIO); + } + + pagep = node->page; + memset(kmap(*pagep) + node->page_offset, 0, + min((int)PAGE_CACHE_SIZE, (int)tree->node_size)); + set_page_dirty(*pagep); + kunmap(*pagep); + for (i = 1; i < tree->pages_per_bnode; i++) { + memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE); + set_page_dirty(*pagep); + kunmap(*pagep); + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + + return node; +} + +void hfs_bnode_get(struct hfs_bnode *node) +{ + if (node) { + atomic_inc(&node->refcnt); + dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + } +} + +/* Dispose of resources used by a node */ +void hfs_bnode_put(struct hfs_bnode *node) +{ + if (node) { + struct hfs_btree *tree = node->tree; + int i; + + dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + BUG_ON(!atomic_read(&node->refcnt)); + if 
(!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) + return; + for (i = 0; i < tree->pages_per_bnode; i++) { + if (!node->page[i]) + continue; + mark_page_accessed(node->page[i]); + } + + if (test_bit(HFS_BNODE_DELETED, &node->flags)) { + hfs_bnode_unhash(node); + spin_unlock(&tree->hash_lock); + hfs_bmap_free(node); + hfs_bnode_free(node); + return; + } + spin_unlock(&tree->hash_lock); + } +} diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c new file mode 100644 index 00000000..92fb358c --- /dev/null +++ b/fs/hfs/brec.c @@ -0,0 +1,519 @@ +/* + * linux/fs/hfs/brec.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle individual btree records + */ + +#include "btree.h" + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd); +static int hfs_brec_update_parent(struct hfs_find_data *fd); +static int hfs_btree_inc_height(struct hfs_btree *tree); + +/* Get the length and offset of the given record in the given node */ +u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off) +{ + __be16 retval[2]; + u16 dataoff; + + dataoff = node->tree->node_size - (rec + 2) * 2; + hfs_bnode_read(node, retval, dataoff, 4); + *off = be16_to_cpu(retval[1]); + return be16_to_cpu(retval[0]) - *off; +} + +/* Get the length of the key from a keyed record */ +u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) +{ + u16 retval, recoff; + + if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) + return 0; + + if ((node->type == HFS_NODE_INDEX) && + !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { + if (node->tree->attributes & HFS_TREE_BIGKEYS) + retval = node->tree->max_key_len + 2; + else + retval = node->tree->max_key_len + 1; + } else { + recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); + if (!recoff) + return 0; + if (node->tree->attributes & HFS_TREE_BIGKEYS) { + retval = hfs_bnode_read_u16(node, recoff) + 2; + if (retval > node->tree->max_key_len + 2) { + printk(KERN_ERR "hfs: keylen %d too large\n", + retval); + retval = 0; + } + } else { + retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; + if (retval > node->tree->max_key_len + 1) { + printk(KERN_ERR "hfs: keylen %d too large\n", + retval); + retval = 0; + } + } + } + return retval; +} + +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node; + int size, key_len, rec; + int data_off, end_off; + int idx_rec_off, data_rec_off, end_rec_off; + __be32 cnid; + + tree = fd->tree; + if (!fd->bnode) { + if (!tree->root) + hfs_btree_inc_height(tree); + fd->bnode = hfs_bnode_find(tree, tree->leaf_head); + if (IS_ERR(fd->bnode)) + return PTR_ERR(fd->bnode); + fd->record = -1; + } + new_node = NULL; + key_len = (fd->search_key->key_len | 1) + 1; +again: + /* new record idx and complete record size */ + rec = fd->record + 1; + size = key_len + entry_len; + + node = fd->bnode; + hfs_bnode_dump(node); + /* get last offset */ + end_rec_off = tree->node_size - (node->num_recs + 1) * 2; + end_off = hfs_bnode_read_u16(node, end_rec_off); + end_rec_off -= 2; + dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); + if (size > end_rec_off - end_off) { + if (new_node) + panic("not enough room!\n"); + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + goto again; + } + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count++; + mark_inode_dirty(tree->inode); + } + node->num_recs++; + /* write new last offset */ + 
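/* + * Record offsets are kept in a table that grows down from the end of + * the node: the slot for record N sits at node_size - (N + 1) * 2 and + * the slot past the last record holds the start of the free area, so + * bumping num_recs and rewriting the free-space offset below reserves + * room for the new record. + */ + 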
hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); + hfs_bnode_write_u16(node, end_rec_off, end_off + size); + data_off = end_off; + data_rec_off = end_rec_off + 2; + idx_rec_off = tree->node_size - (rec + 1) * 2; + if (idx_rec_off == data_rec_off) + goto skip; + /* move all following entries */ + do { + data_off = hfs_bnode_read_u16(node, data_rec_off + 2); + hfs_bnode_write_u16(node, data_rec_off, data_off + size); + data_rec_off += 2; + } while (data_rec_off < idx_rec_off); + + /* move data away */ + hfs_bnode_move(node, data_off + size, data_off, + end_off - data_off); + +skip: + hfs_bnode_write(node, fd->search_key, data_off, key_len); + hfs_bnode_write(node, entry, data_off + key_len, entry_len); + hfs_bnode_dump(node); + + if (new_node) { + /* update parent key if we inserted a key + * at the start of the first node + */ + if (!rec && new_node != node) + hfs_brec_update_parent(fd); + + hfs_bnode_put(fd->bnode); + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } + fd->bnode = hfs_bnode_find(tree, new_node->parent); + + /* create index data entry */ + cnid = cpu_to_be32(new_node->this); + entry = &cnid; + entry_len = sizeof(cnid); + + /* get index key */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + __hfs_brec_find(fd->bnode, fd); + + hfs_bnode_put(new_node); + new_node = NULL; + + if (tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = fd->search_key->key_len + 1; + else { + fd->search_key->key_len = tree->max_key_len; + key_len = tree->max_key_len + 1; + } + goto again; + } + + if (!rec) + hfs_brec_update_parent(fd); + + return 0; +} + +int hfs_brec_remove(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *parent; + int end_off, rec_off, data_off, size; + + tree = fd->tree; + node = fd->bnode; +again: + rec_off = tree->node_size - (fd->record + 2) * 2; + end_off = tree->node_size - (node->num_recs + 1) * 2; + + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count--; + mark_inode_dirty(tree->inode); + } + hfs_bnode_dump(node); + dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); + if (!--node->num_recs) { + hfs_bnode_unlink(node); + if (!node->parent) + return 0; + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + hfs_bnode_put(node); + node = fd->bnode = parent; + + __hfs_brec_find(node, fd); + goto again; + } + hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); + + if (rec_off == end_off) + goto skip; + size = fd->keylength + fd->entrylength; + + do { + data_off = hfs_bnode_read_u16(node, rec_off); + hfs_bnode_write_u16(node, rec_off + 2, data_off - size); + rec_off -= 2; + } while (rec_off >= end_off); + + /* fill hole */ + hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, + data_off - fd->keyoffset - size); +skip: + hfs_bnode_dump(node); + if (!fd->record) + hfs_brec_update_parent(fd); + return 0; +} + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *next_node; + struct hfs_bnode_desc node_desc; + int num_recs, new_rec_off, new_off, old_rec_off; + int data_start, data_end, size; + + tree = fd->tree; + node = fd->bnode; + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) + return new_node; + hfs_bnode_get(node); + dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + node->this, new_node->this, node->next); + new_node->next = node->next; + 
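/* + * The freshly allocated node is chained in right after the node being + * split and inherits its parent, type and height; the upper half of + * the records gets copied into it further down. + */ + 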
new_node->prev = node->this; + new_node->parent = node->parent; + new_node->type = node->type; + new_node->height = node->height; + + if (node->next) + next_node = hfs_bnode_find(tree, node->next); + else + next_node = NULL; + + if (IS_ERR(next_node)) { + hfs_bnode_put(node); + hfs_bnode_put(new_node); + return next_node; + } + + size = tree->node_size / 2 - node->num_recs * 2 - 14; + old_rec_off = tree->node_size - 4; + num_recs = 1; + for (;;) { + data_start = hfs_bnode_read_u16(node, old_rec_off); + if (data_start > size) + break; + old_rec_off -= 2; + if (++num_recs < node->num_recs) + continue; + /* panic? */ + hfs_bnode_put(node); + hfs_bnode_put(new_node); + if (next_node) + hfs_bnode_put(next_node); + return ERR_PTR(-ENOSPC); + } + + if (fd->record + 1 < num_recs) { + /* new record is in the lower half, + * so leave some more space there + */ + old_rec_off += 2; + num_recs--; + data_start = hfs_bnode_read_u16(node, old_rec_off); + } else { + hfs_bnode_put(node); + hfs_bnode_get(new_node); + fd->bnode = new_node; + fd->record -= num_recs; + fd->keyoffset -= data_start - 14; + fd->entryoffset -= data_start - 14; + } + new_node->num_recs = node->num_recs - num_recs; + node->num_recs = num_recs; + + new_rec_off = tree->node_size - 2; + new_off = 14; + size = data_start - new_off; + num_recs = new_node->num_recs; + data_end = data_start; + while (num_recs) { + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + old_rec_off -= 2; + new_rec_off -= 2; + data_end = hfs_bnode_read_u16(node, old_rec_off); + new_off = data_end - size; + num_recs--; + } + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start); + + /* update new bnode header */ + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + /* update previous bnode header */ + node->next = new_node->this; + hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc)); + node_desc.next = cpu_to_be32(node->next); + node_desc.num_recs = cpu_to_be16(node->num_recs); + hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); + + /* update next bnode header */ + if (next_node) { + next_node->prev = new_node->this; + hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); + node_desc.prev = cpu_to_be32(next_node->prev); + hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc)); + hfs_bnode_put(next_node); + } else if (node->this == tree->leaf_tail) { + /* if there is no next node, this might be the new tail */ + tree->leaf_tail = new_node->this; + mark_inode_dirty(tree->inode); + } + + hfs_bnode_dump(node); + hfs_bnode_dump(new_node); + hfs_bnode_put(node); + + return new_node; +} + +static int hfs_brec_update_parent(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *parent; + int newkeylen, diff; + int rec, rec_off, end_rec_off; + int start_off, end_off; + + tree = fd->tree; + node = fd->bnode; + new_node = NULL; + if (!node->parent) + return 0; + +again: + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + __hfs_brec_find(parent, fd); + hfs_bnode_dump(parent); + rec = fd->record; + + /* size difference between old and new key */ + if (tree->attributes & HFS_TREE_VARIDXKEYS) + newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 
1; + else + fd->keylength = newkeylen = tree->max_key_len + 1; + dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); + + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + diff = newkeylen - fd->keylength; + if (!diff) + goto skip; + if (diff > 0) { + end_off = hfs_bnode_read_u16(parent, end_rec_off); + if (end_rec_off - end_off < diff) { + + printk(KERN_DEBUG "hfs: splitting index node...\n"); + fd->bnode = parent; + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + parent = fd->bnode; + rec = fd->record; + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + } + } + + end_off = start_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, start_off + diff); + start_off -= 4; /* move previous cnid too */ + + while (rec_off > end_rec_off) { + rec_off -= 2; + end_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, end_off + diff); + } + hfs_bnode_move(parent, start_off + diff, start_off, + end_off - start_off); +skip: + hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen); + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) + hfs_bnode_write_u8(parent, fd->keyoffset, newkeylen - 1); + hfs_bnode_dump(parent); + + hfs_bnode_put(node); + node = parent; + + if (new_node) { + __be32 cnid; + + fd->bnode = hfs_bnode_find(tree, new_node->parent); + /* create index key and entry */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + cnid = cpu_to_be32(new_node->this); + + __hfs_brec_find(fd->bnode, fd); + hfs_brec_insert(fd, &cnid, sizeof(cnid)); + hfs_bnode_put(fd->bnode); + hfs_bnode_put(new_node); + + if (!rec) { + if (new_node == node) + goto out; + /* restore search_key */ + hfs_bnode_read_key(node, fd->search_key, 14); + } + } + + if (!rec && node->parent) + goto again; +out: + fd->bnode = node; + return 0; +} + +static int hfs_btree_inc_height(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *new_node; + struct hfs_bnode_desc node_desc; + int key_size, rec; + __be32 cnid; + + node = NULL; + if (tree->root) { + node = hfs_bnode_find(tree, tree->root); + if (IS_ERR(node)) + return PTR_ERR(node); + } + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) { + hfs_bnode_put(node); + return PTR_ERR(new_node); + } + + tree->root = new_node->this; + if (!tree->depth) { + tree->leaf_head = tree->leaf_tail = new_node->this; + new_node->type = HFS_NODE_LEAF; + new_node->num_recs = 0; + } else { + new_node->type = HFS_NODE_INDEX; + new_node->num_recs = 1; + } + new_node->parent = 0; + new_node->next = 0; + new_node->prev = 0; + new_node->height = ++tree->depth; + + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + rec = tree->node_size - 2; + hfs_bnode_write_u16(new_node, rec, 14); + + if (node) { + /* insert old root idx into new root */ + node->parent = tree->root; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_size = hfs_bnode_read_u8(node, 14) + 1; + else + key_size = tree->max_key_len + 1; + hfs_bnode_copy(new_node, 14, node, 14, key_size); + + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + key_size = tree->max_key_len + 1; + hfs_bnode_write_u8(new_node, 
14, tree->max_key_len); + } + key_size = (key_size + 1) & -2; + cnid = cpu_to_be32(node->this); + hfs_bnode_write(new_node, &cnid, 14 + key_size, 4); + + rec -= 2; + hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4); + + hfs_bnode_put(node); + } + hfs_bnode_put(new_node); + mark_inode_dirty(tree->inode); + + return 0; +} diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c new file mode 100644 index 00000000..1cbdeea1 --- /dev/null +++ b/fs/hfs/btree.c @@ -0,0 +1,366 @@ +/* + * linux/fs/hfs/btree.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle opening/closing btree + */ + +#include +#include +#include + +#include "btree.h" + +/* Get a reference to a B*Tree and do some initial checks */ +struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp keycmp) +{ + struct hfs_btree *tree; + struct hfs_btree_header_rec *head; + struct address_space *mapping; + struct page *page; + unsigned int size; + + tree = kzalloc(sizeof(*tree), GFP_KERNEL); + if (!tree) + return NULL; + + mutex_init(&tree->tree_lock); + spin_lock_init(&tree->hash_lock); + /* Set the correct compare function */ + tree->sb = sb; + tree->cnid = id; + tree->keycmp = keycmp; + + tree->inode = iget_locked(sb, id); + if (!tree->inode) + goto free_tree; + BUG_ON(!(tree->inode->i_state & I_NEW)); + { + struct hfs_mdb *mdb = HFS_SB(sb)->mdb; + HFS_I(tree->inode)->flags = 0; + mutex_init(&HFS_I(tree->inode)->extents_lock); + switch (id) { + case HFS_EXT_CNID: + hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, + mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); + if (HFS_I(tree->inode)->alloc_blocks > + HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + + tree->inode->i_mapping->a_ops = &hfs_btree_aops; + break; + case HFS_CAT_CNID: + hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, + mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); + + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records " + "(0 size).\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + + tree->inode->i_mapping->a_ops = &hfs_btree_aops; + break; + default: + BUG(); + } + } + unlock_new_inode(tree->inode); + + mapping = tree->inode->i_mapping; + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + goto free_inode; + + /* Load the header */ + head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + tree->root = be32_to_cpu(head->root); + tree->leaf_count = be32_to_cpu(head->leaf_count); + tree->leaf_head = be32_to_cpu(head->leaf_head); + tree->leaf_tail = be32_to_cpu(head->leaf_tail); + tree->node_count = be32_to_cpu(head->node_count); + tree->free_nodes = be32_to_cpu(head->free_nodes); + tree->attributes = be32_to_cpu(head->attributes); + tree->node_size = be16_to_cpu(head->node_size); + tree->max_key_len = be16_to_cpu(head->max_key_len); + tree->depth = be16_to_cpu(head->depth); + + size = tree->node_size; + if (!is_power_of_2(size)) + goto fail_page; + if (!tree->node_count) + goto fail_page; + switch (id) { + case HFS_EXT_CNID: + if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) { + printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + break; + case HFS_CAT_CNID: + if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) { + printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + break; + 
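/* + * Plain HFS has only the extents and catalog B*-trees, each with a + * fixed maximum key length, so a mismatched max_key_len above means a + * damaged header and any other CNID is a caller bug (hence BUG()). + */ + 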
default: + BUG(); + } + + tree->node_size_shift = ffs(size) - 1; + tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + kunmap(page); + page_cache_release(page); + return tree; + +fail_page: + page_cache_release(page); +free_inode: + tree->inode->i_mapping->a_ops = &hfs_aops; + iput(tree->inode); +free_tree: + kfree(tree); + return NULL; +} + +/* Release resources used by a btree */ +void hfs_btree_close(struct hfs_btree *tree) +{ + struct hfs_bnode *node; + int i; + + if (!tree) + return; + + for (i = 0; i < NODE_HASH_SIZE; i++) { + while ((node = tree->node_hash[i])) { + tree->node_hash[i] = node->next_hash; + if (atomic_read(&node->refcnt)) + printk(KERN_ERR "hfs: node %d:%d still has %d user(s)!\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + hfs_bnode_free(node); + tree->node_hash_cnt--; + } + } + iput(tree->inode); + kfree(tree); +} + +void hfs_btree_write(struct hfs_btree *tree) +{ + struct hfs_btree_header_rec *head; + struct hfs_bnode *node; + struct page *page; + + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + /* panic? */ + return; + /* Load the header */ + page = node->page[0]; + head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + + head->root = cpu_to_be32(tree->root); + head->leaf_count = cpu_to_be32(tree->leaf_count); + head->leaf_head = cpu_to_be32(tree->leaf_head); + head->leaf_tail = cpu_to_be32(tree->leaf_tail); + head->node_count = cpu_to_be32(tree->node_count); + head->free_nodes = cpu_to_be32(tree->free_nodes); + head->attributes = cpu_to_be32(tree->attributes); + head->depth = cpu_to_be16(tree->depth); + + kunmap(page); + set_page_dirty(page); + hfs_bnode_put(node); +} + +static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) +{ + struct hfs_btree *tree = prev->tree; + struct hfs_bnode *node; + struct hfs_bnode_desc desc; + __be32 cnid; + + node = hfs_bnode_create(tree, idx); + if (IS_ERR(node)) + return node; + + if (!tree->free_nodes) + panic("FIXME!!!"); + tree->free_nodes--; + prev->next = idx; + cnid = cpu_to_be32(idx); + hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); + + node->type = HFS_NODE_MAP; + node->num_recs = 1; + hfs_bnode_clear(node, 0, tree->node_size); + desc.next = 0; + desc.prev = 0; + desc.type = HFS_NODE_MAP; + desc.height = 0; + desc.num_recs = cpu_to_be16(1); + desc.reserved = 0; + hfs_bnode_write(node, &desc, 0, sizeof(desc)); + hfs_bnode_write_u16(node, 14, 0x8000); + hfs_bnode_write_u16(node, tree->node_size - 2, 14); + hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); + + return node; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i; + + while (!tree->free_nodes) { + struct inode *inode = tree->inode; + u32 count; + int res; + + res = hfs_extend_file(inode); + if (res) + return ERR_PTR(res); + HFS_I(inode)->phys_size = inode->i_size = + (loff_t)HFS_I(inode)->alloc_blocks * + HFS_SB(tree->sb)->alloc_blksz; + HFS_I(inode)->fs_blocks = inode->i_size >> + tree->sb->s_blocksize_bits; + inode_set_bytes(inode, inode->i_size); + count = inode->i_size >> tree->node_size_shift; + tree->free_nodes = count - tree->node_count; + tree->node_count = count; + } + + nidx = 0; + node = hfs_bnode_find(tree, nidx); + if (IS_ERR(node)) + return node; + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; + + off += node->page_offset; + 
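/* + * Each map record is a plain bitmap of allocated node numbers; scan it + * a byte at a time, claim the first clear bit and hand back a freshly + * initialised bnode for that index. + */ + 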
pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + idx = 0; + + for (;;) { + while (len) { + byte = data[off]; + if (byte != 0xff) { + for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { + if (!(byte & m)) { + idx += i; + data[off] |= m; + set_page_dirty(*pagep); + kunmap(*pagep); + tree->free_nodes--; + mark_inode_dirty(tree->inode); + hfs_bnode_put(node); + return hfs_bnode_create(tree, idx); + } + } + } + if (++off >= PAGE_CACHE_SIZE) { + kunmap(*pagep); + data = kmap(*++pagep); + off = 0; + } + idx += 8; + len--; + } + kunmap(*pagep); + nidx = node->next; + if (!nidx) { + printk(KERN_DEBUG "hfs: create new bmap node...\n"); + next_node = hfs_bmap_new_bmap(node, idx); + } else + next_node = hfs_bnode_find(tree, nidx); + hfs_bnode_put(node); + if (IS_ERR(next_node)) + return next_node; + node = next_node; + + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + } +} + +void hfs_bmap_free(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct page *page; + u16 off, len; + u32 nidx; + u8 *data, byte, m; + + dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + tree = node->tree; + nidx = node->this; + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + return; + len = hfs_brec_lenoff(node, 2, &off); + while (nidx >= len * 8) { + u32 i; + + nidx -= len * 8; + i = node->next; + hfs_bnode_put(node); + if (!i) { + /* panic */; + printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this); + return; + } + node = hfs_bnode_find(tree, i); + if (IS_ERR(node)) + return; + if (node->type != HFS_NODE_MAP) { + /* panic */; + printk(KERN_CRIT "hfs: invalid bmap found! 
(%u,%d)\n", node->this, node->type); + hfs_bnode_put(node); + return; + } + len = hfs_brec_lenoff(node, 0, &off); + } + off += node->page_offset + nidx / 8; + page = node->page[off >> PAGE_CACHE_SHIFT]; + data = kmap(page); + off &= ~PAGE_CACHE_MASK; + m = 1 << (~nidx & 7); + byte = data[off]; + if (!(byte & m)) { + printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type); + kunmap(page); + hfs_bnode_put(node); + return; + } + data[off] = byte & ~m; + set_page_dirty(page); + kunmap(page); + hfs_bnode_put(node); + tree->free_nodes++; + mark_inode_dirty(tree->inode); +} diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h new file mode 100644 index 00000000..2a1d712f --- /dev/null +++ b/fs/hfs/btree.h @@ -0,0 +1,168 @@ +/* + * linux/fs/hfs/btree.h + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + */ + +#include "hfs_fs.h" + +typedef int (*btree_keycmp)(const btree_key *, const btree_key *); + +#define NODE_HASH_SIZE 256 + +/* A HFS BTree held in memory */ +struct hfs_btree { + struct super_block *sb; + struct inode *inode; + btree_keycmp keycmp; + + u32 cnid; + u32 root; + u32 leaf_count; + u32 leaf_head; + u32 leaf_tail; + u32 node_count; + u32 free_nodes; + u32 attributes; + + unsigned int node_size; + unsigned int node_size_shift; + unsigned int max_key_len; + unsigned int depth; + + //unsigned int map1_size, map_size; + struct mutex tree_lock; + + unsigned int pages_per_bnode; + spinlock_t hash_lock; + struct hfs_bnode *node_hash[NODE_HASH_SIZE]; + int node_hash_cnt; +}; + +/* A HFS BTree node in memory */ +struct hfs_bnode { + struct hfs_btree *tree; + + u32 prev; + u32 this; + u32 next; + u32 parent; + + u16 num_recs; + u8 type; + u8 height; + + struct hfs_bnode *next_hash; + unsigned long flags; + wait_queue_head_t lock_wq; + atomic_t refcnt; + unsigned int page_offset; + struct page *page[0]; +}; + +#define HFS_BNODE_ERROR 0 +#define HFS_BNODE_NEW 1 +#define HFS_BNODE_DELETED 2 + +struct hfs_find_data { + btree_key *key; + btree_key *search_key; + struct hfs_btree *tree; + struct hfs_bnode *bnode; + int record; + int keyoffset, keylength; + int entryoffset, entrylength; +}; + + +/* btree.c */ +extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp); +extern void hfs_btree_close(struct hfs_btree *); +extern void hfs_btree_write(struct hfs_btree *); +extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *); +extern void hfs_bmap_free(struct hfs_bnode *node); + +/* bnode.c */ +extern void hfs_bnode_read(struct hfs_bnode *, void *, int, int); +extern u16 hfs_bnode_read_u16(struct hfs_bnode *, int); +extern u8 hfs_bnode_read_u8(struct hfs_bnode *, int); +extern void hfs_bnode_read_key(struct hfs_bnode *, void *, int); +extern void hfs_bnode_write(struct hfs_bnode *, void *, int, int); +extern void hfs_bnode_write_u16(struct hfs_bnode *, int, u16); +extern void hfs_bnode_write_u8(struct hfs_bnode *, int, u8); +extern void hfs_bnode_clear(struct hfs_bnode *, int, int); +extern void hfs_bnode_copy(struct hfs_bnode *, int, + struct hfs_bnode *, int, int); +extern void hfs_bnode_move(struct hfs_bnode *, int, int, int); +extern void hfs_bnode_dump(struct hfs_bnode *); +extern void hfs_bnode_unlink(struct hfs_bnode *); +extern struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32); +extern struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32); +extern void hfs_bnode_unhash(struct hfs_bnode *); +extern void hfs_bnode_free(struct hfs_bnode *); +extern struct hfs_bnode *hfs_bnode_create(struct 
hfs_btree *, u32); +extern void hfs_bnode_get(struct hfs_bnode *); +extern void hfs_bnode_put(struct hfs_bnode *); + +/* brec.c */ +extern u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *); +extern u16 hfs_brec_keylen(struct hfs_bnode *, u16); +extern int hfs_brec_insert(struct hfs_find_data *, void *, int); +extern int hfs_brec_remove(struct hfs_find_data *); + +/* bfind.c */ +extern int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); +extern void hfs_find_exit(struct hfs_find_data *); +extern int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *); +extern int hfs_brec_find(struct hfs_find_data *); +extern int hfs_brec_read(struct hfs_find_data *, void *, int); +extern int hfs_brec_goto(struct hfs_find_data *, int); + + +struct hfs_bnode_desc { + __be32 next; /* (V) Number of the next node at this level */ + __be32 prev; /* (V) Number of the prev node at this level */ + u8 type; /* (F) The type of node */ + u8 height; /* (F) The level of this node (leaves=1) */ + __be16 num_recs; /* (V) The number of records in this node */ + u16 reserved; +} __packed; + +#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ +#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ +#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ +#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ + +struct hfs_btree_header_rec { + __be16 depth; /* (V) The number of levels in this B-tree */ + __be32 root; /* (V) The node number of the root node */ + __be32 leaf_count; /* (V) The number of leaf records */ + __be32 leaf_head; /* (V) The number of the first leaf node */ + __be32 leaf_tail; /* (V) The number of the last leaf node */ + __be16 node_size; /* (F) The number of bytes in a node (=512) */ + __be16 max_key_len; /* (F) The length of a key in an index node */ + __be32 node_count; /* (V) The total number of nodes */ + __be32 free_nodes; /* (V) The number of unused nodes */ + u16 reserved1; + __be32 clump_size; /* (F) clump size. not usually used. */ + u8 btree_type; /* (F) BTree type */ + u8 reserved2; + __be32 attributes; /* (F) attributes */ + u32 reserved3[16]; +} __packed; + +#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ +#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ +#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ +#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ + +#define BTREE_ATTR_BADCLOSE 0x00000001 /* b-tree not closed properly. not + used by hfsplus. */ +#define HFS_TREE_BIGKEYS 0x00000002 /* key length is u16 instead of u8. + used by hfsplus. */ +#define HFS_TREE_VARIDXKEYS 0x00000004 /* variable key length instead of + max key length. use din catalog + b-tree but not in extents + b-tree (hfsplus). */ diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c new file mode 100644 index 00000000..424b0337 --- /dev/null +++ b/fs/hfs/catalog.c @@ -0,0 +1,356 @@ +/* + * linux/fs/hfs/catalog.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the functions related to the catalog B-tree. + * + * Cache code shamelessly stolen from + * linux/fs/inode.c Copyright (C) 1991, 1992 Linus Torvalds + * re-shamelessly stolen Copyright (C) 1997 Linus Torvalds + */ + +#include "hfs_fs.h" +#include "btree.h" + +/* + * hfs_cat_build_key() + * + * Given the ID of the parent and the name build a search key. 
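+ * Passing a NULL name zeroes CName and sets key_len to the 6-byte fixed + * part only, which is how thread records are looked up.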
+ */ +void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, struct qstr *name) +{ + key->cat.reserved = 0; + key->cat.ParID = cpu_to_be32(parent); + if (name) { + hfs_asc2mac(sb, &key->cat.CName, name); + key->key_len = 6 + key->cat.CName.len; + } else { + memset(&key->cat.CName, 0, sizeof(struct hfs_name)); + key->key_len = 6; + } +} + +static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode) +{ + __be32 mtime = hfs_mtime(); + + memset(rec, 0, sizeof(*rec)); + if (S_ISDIR(inode->i_mode)) { + rec->type = HFS_CDR_DIR; + rec->dir.DirID = cpu_to_be32(cnid); + rec->dir.CrDat = mtime; + rec->dir.MdDat = mtime; + rec->dir.BkDat = 0; + rec->dir.UsrInfo.frView = cpu_to_be16(0xff); + return sizeof(struct hfs_cat_dir); + } else { + /* init some fields for the file record */ + rec->type = HFS_CDR_FIL; + rec->file.Flags = HFS_FIL_USED | HFS_FIL_THD; + if (!(inode->i_mode & S_IWUSR)) + rec->file.Flags |= HFS_FIL_LOCK; + rec->file.FlNum = cpu_to_be32(cnid); + rec->file.CrDat = mtime; + rec->file.MdDat = mtime; + rec->file.BkDat = 0; + rec->file.UsrWds.fdType = HFS_SB(inode->i_sb)->s_type; + rec->file.UsrWds.fdCreator = HFS_SB(inode->i_sb)->s_creator; + return sizeof(struct hfs_cat_file); + } +} + +static int hfs_cat_build_thread(struct super_block *sb, + hfs_cat_rec *rec, int type, + u32 parentid, struct qstr *name) +{ + rec->type = type; + memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved)); + rec->thread.ParID = cpu_to_be32(parentid); + hfs_asc2mac(sb, &rec->thread.CName, name); + return sizeof(struct hfs_cat_thread); +} + +/* + * create_entry() + * + * Add a new file or directory to the catalog B-tree and + * return a (struct hfs_cat_entry) for it in '*result'. + */ +int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) +{ + struct hfs_find_data fd; + struct super_block *sb; + union hfs_cat_rec entry; + int entry_size; + int err; + + dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); + if (dir->i_size >= HFS_MAX_VALENCE) + return -ENOSPC; + + sb = dir->i_sb; + hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); + entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ? + HFS_CDR_THD : HFS_CDR_FTH, + dir->i_ino, str); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto err2; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err2; + + hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); + entry_size = hfs_cat_build_record(&entry, cnid, inode); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + /* panic? */ + if (!err) + err = -EEXIST; + goto err1; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err1; + + dir->i_size++; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + hfs_find_exit(&fd); + return 0; + +err1: + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); + if (!hfs_brec_find(&fd)) + hfs_brec_remove(&fd); +err2: + hfs_find_exit(&fd); + return err; +} + +/* + * hfs_cat_compare() + * + * Description: + * This is the comparison function used for the catalog B-tree. In + * comparing catalog B-tree entries, the parent id is the most + * significant field (compared as unsigned ints). 
The name field is + * the least significant (compared in "Macintosh lexical order", + * see hfs_strcmp() in string.c) + * Input Variable(s): + * struct hfs_cat_key *key1: pointer to the first key to compare + * struct hfs_cat_key *key2: pointer to the second key to compare + * Output Variable(s): + * NONE + * Returns: + * int: negative if key1key2, and 0 if key1==key2 + * Preconditions: + * key1 and key2 point to "valid" (struct hfs_cat_key)s. + * Postconditions: + * This function has no side-effects + */ +int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2) +{ + int retval; + + retval = be32_to_cpu(key1->cat.ParID) - be32_to_cpu(key2->cat.ParID); + if (!retval) + retval = hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, + key2->cat.CName.name, key2->cat.CName.len); + + return retval; +} + +/* Try to get a catalog entry for given catalog id */ +// move to read_super??? +int hfs_cat_find_brec(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd) +{ + hfs_cat_rec rec; + int res, len, type; + + hfs_cat_build_key(sb, fd->search_key, cnid, NULL); + res = hfs_brec_read(fd, &rec, sizeof(rec)); + if (res) + return res; + + type = rec.type; + if (type != HFS_CDR_THD && type != HFS_CDR_FTH) { + printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + return -EIO; + } + + fd->search_key->cat.ParID = rec.thread.ParID; + len = fd->search_key->cat.CName.len = rec.thread.CName.len; + if (len > HFS_NAMELEN) { + printk(KERN_ERR "hfs: bad catalog namelength\n"); + return -EIO; + } + memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); + return hfs_brec_find(fd); +} + + +/* + * hfs_cat_delete() + * + * Delete the indicated file or directory. + * The associated thread is also removed unless ('with_thread'==0). + */ +int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) +{ + struct super_block *sb; + struct hfs_find_data fd; + struct list_head *pos; + int res, type; + + dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + sb = dir->i_sb; + hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + + hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); + res = hfs_brec_find(&fd); + if (res) + goto out; + + type = hfs_bnode_read_u8(fd.bnode, fd.entryoffset); + if (type == HFS_CDR_FIL) { + struct hfs_cat_file file; + hfs_bnode_read(fd.bnode, &file, fd.entryoffset, sizeof(file)); + if (be32_to_cpu(file.FlNum) == cnid) { +#if 0 + hfs_free_fork(sb, &file, HFS_FK_DATA); +#endif + hfs_free_fork(sb, &file, HFS_FK_RSRC); + } + } + + list_for_each(pos, &HFS_I(dir)->open_dir_list) { + struct hfs_readdir_data *rd = + list_entry(pos, struct hfs_readdir_data, list); + if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) + rd->file->f_pos--; + } + + res = hfs_brec_remove(&fd); + if (res) + goto out; + + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); + res = hfs_brec_find(&fd); + if (!res) { + res = hfs_brec_remove(&fd); + if (res) + goto out; + } + + dir->i_size--; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); + res = 0; +out: + hfs_find_exit(&fd); + + return res; +} + +/* + * hfs_cat_move() + * + * Rename a file or directory, possibly to a new directory. + * If the destination exists it is removed and a + * (struct hfs_cat_entry) for it is returned in '*result'. 
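+ * The catalog record itself is copied to the new key unchanged; any + * thread record is deleted and re-created under the destination + * directory afterwards.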
+ */ +int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, + struct inode *dst_dir, struct qstr *dst_name) +{ + struct super_block *sb; + struct hfs_find_data src_fd, dst_fd; + union hfs_cat_rec entry; + int entry_size, type; + int err; + + dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, + dst_dir->i_ino, dst_name->name); + sb = src_dir->i_sb; + hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); + dst_fd = src_fd; + + /* find the old dir entry and read the data */ + hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, + src_fd.entrylength); + + /* create new dir entry with the data from the old entry */ + hfs_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + + err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength); + if (err) + goto out; + dst_dir->i_size++; + dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dst_dir); + + /* finally remove the old entry */ + hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + src_dir->i_size--; + src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(src_dir); + + type = entry.type; + if (type == HFS_CDR_FIL && !(entry.file.Flags & HFS_FIL_THD)) + goto out; + + /* remove old thread entry */ + hfs_cat_build_key(sb, src_fd.search_key, cnid, NULL); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + + /* create new thread entry */ + hfs_cat_build_key(sb, dst_fd.search_key, cnid, NULL); + entry_size = hfs_cat_build_thread(sb, &entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD, + dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + err = hfs_brec_insert(&dst_fd, &entry, entry_size); +out: + hfs_bnode_put(dst_fd.bnode); + hfs_find_exit(&src_fd); + return err; +} diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c new file mode 100644 index 00000000..b4d70b13 --- /dev/null +++ b/fs/hfs/dir.c @@ -0,0 +1,316 @@ +/* + * linux/fs/hfs/dir.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains directory-related functions independent of which + * scheme is being used to represent forks. 
+ * + * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds + */ + +#include "hfs_fs.h" +#include "btree.h" + +/* + * hfs_lookup() + */ +static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + hfs_cat_rec rec; + struct hfs_find_data fd; + struct inode *inode = NULL; + int res; + + hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); + res = hfs_brec_read(&fd, &rec, sizeof(rec)); + if (res) { + hfs_find_exit(&fd); + if (res == -ENOENT) { + /* No such entry */ + inode = NULL; + goto done; + } + return ERR_PTR(res); + } + inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); + hfs_find_exit(&fd); + if (!inode) + return ERR_PTR(-EACCES); +done: + d_add(dentry, inode); + return NULL; +} + +/* + * hfs_readdir + */ +static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + int len, err; + char strbuf[HFS_MAX_NAMELEN]; + union hfs_cat_rec entry; + struct hfs_find_data fd; + struct hfs_readdir_data *rd; + u16 type; + + if (filp->f_pos >= inode->i_size) + return 0; + + hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); + err = hfs_brec_find(&fd); + if (err) + goto out; + + switch ((u32)filp->f_pos) { + case 0: + /* This is completely artificial... */ + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + goto out; + filp->f_pos++; + /* fall through */ + case 1: + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); + if (entry.type != HFS_CDR_THD) { + printk(KERN_ERR "hfs: bad catalog folder thread\n"); + err = -EIO; + goto out; + } + //if (fd.entrylength < HFS_MIN_THREAD_SZ) { + // printk(KERN_ERR "hfs: truncated catalog thread\n"); + // err = -EIO; + // goto out; + //} + if (filldir(dirent, "..", 2, 1, + be32_to_cpu(entry.thread.ParID), DT_DIR)) + goto out; + filp->f_pos++; + /* fall through */ + default: + if (filp->f_pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, filp->f_pos - 1); + if (err) + goto out; + } + + for (;;) { + if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { + printk(KERN_ERR "hfs: walked past end of dir\n"); + err = -EIO; + goto out; + } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); + type = entry.type; + len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); + if (type == HFS_CDR_DIR) { + if (fd.entrylength < sizeof(struct hfs_cat_dir)) { + printk(KERN_ERR "hfs: small dir entry\n"); + err = -EIO; + goto out; + } + if (filldir(dirent, strbuf, len, filp->f_pos, + be32_to_cpu(entry.dir.DirID), DT_DIR)) + break; + } else if (type == HFS_CDR_FIL) { + if (fd.entrylength < sizeof(struct hfs_cat_file)) { + printk(KERN_ERR "hfs: small file entry\n"); + err = -EIO; + goto out; + } + if (filldir(dirent, strbuf, len, filp->f_pos, + be32_to_cpu(entry.file.FlNum), DT_REG)) + break; + } else { + printk(KERN_ERR "hfs: bad catalog entry type %d\n", type); + err = -EIO; + goto out; + } + filp->f_pos++; + if (filp->f_pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, 1); + if (err) + goto out; + } + rd = filp->private_data; + if (!rd) { + rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); + if (!rd) { + err = -ENOMEM; + goto out; + } + 
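/* + * Register this reader on the inode's open_dir_list so that catalog + * deletions (see hfs_cat_delete) can pull back the f_pos of concurrent + * readers rather than letting them skip an entry. + */ + 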
filp->private_data = rd; + rd->file = filp; + list_add(&rd->list, &HFS_I(inode)->open_dir_list); + } + memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); +out: + hfs_find_exit(&fd); + return err; +} + +static int hfs_dir_release(struct inode *inode, struct file *file) +{ + struct hfs_readdir_data *rd = file->private_data; + if (rd) { + list_del(&rd->list); + kfree(rd); + } + return 0; +} + +/* + * hfs_create() + * + * This is the create() entry in the inode_operations structure for + * regular HFS directories. The purpose is to create a new file in + * a directory and return a corresponding inode, given the inode for + * the directory and the name (and its length) of the new file. + */ +static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + int res; + + inode = hfs_new_inode(dir, &dentry->d_name, mode); + if (!inode) + return -ENOSPC; + + res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); + if (res) { + inode->i_nlink = 0; + hfs_delete_inode(inode); + iput(inode); + return res; + } + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + return 0; +} + +/* + * hfs_mkdir() + * + * This is the mkdir() entry in the inode_operations structure for + * regular HFS directories. The purpose is to create a new directory + * in a directory, given the inode for the parent directory and the + * name (and its length) of the new directory. + */ +static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + int res; + + inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); + if (!inode) + return -ENOSPC; + + res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); + if (res) { + inode->i_nlink = 0; + hfs_delete_inode(inode); + iput(inode); + return res; + } + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + return 0; +} + +/* + * hfs_remove() + * + * This serves as both unlink() and rmdir() in the inode_operations + * structure for regular HFS directories. The purpose is to delete + * an existing child, given the inode for the parent directory and + * the name (and its length) of the existing directory. + * + * HFS does not have hardlinks, so both rmdir and unlink set the + * link count to 0. The only difference is the emptiness check. + */ +static int hfs_remove(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int res; + + if (S_ISDIR(inode->i_mode) && inode->i_size != 2) + return -ENOTEMPTY; + res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); + if (res) + return res; + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + hfs_delete_inode(inode); + mark_inode_dirty(inode); + return 0; +} + +/* + * hfs_rename() + * + * This is the rename() entry in the inode_operations structure for + * regular HFS directories. The purpose is to rename an existing + * file or directory, given the inode for the current directory and + * the name (and its length) of the existing file/directory and the + * inode for the new directory and the name (and its length) of the + * new file/directory. + * XXX: how do you handle must_be dir? 
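+ * An existing destination is unlinked first; on success the cached + * cat_key of the moved inode is rebuilt for the new parent and name.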
+ */ +static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int res; + + /* Unlink destination if it already exists */ + if (new_dentry->d_inode) { + res = hfs_remove(new_dir, new_dentry); + if (res) + return res; + } + + res = hfs_cat_move(old_dentry->d_inode->i_ino, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name); + if (!res) + hfs_cat_build_key(old_dir->i_sb, + (btree_key *)&HFS_I(old_dentry->d_inode)->cat_key, + new_dir->i_ino, &new_dentry->d_name); + return res; +} + +const struct file_operations hfs_dir_operations = { + .read = generic_read_dir, + .readdir = hfs_readdir, + .llseek = generic_file_llseek, + .release = hfs_dir_release, +}; + +const struct inode_operations hfs_dir_inode_operations = { + .create = hfs_create, + .lookup = hfs_lookup, + .unlink = hfs_remove, + .mkdir = hfs_mkdir, + .rmdir = hfs_remove, + .rename = hfs_rename, + .setattr = hfs_inode_setattr, +}; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c new file mode 100644 index 00000000..2c16316d --- /dev/null +++ b/fs/hfs/extent.c @@ -0,0 +1,525 @@ +/* + * linux/fs/hfs/extent.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the functions related to the extents B-tree. + */ + +#include + +#include "hfs_fs.h" +#include "btree.h" + +/*================ File-local functions ================*/ + +/* + * build_key + */ +static void hfs_ext_build_key(hfs_btree_key *key, u32 cnid, u16 block, u8 type) +{ + key->key_len = 7; + key->ext.FkType = type; + key->ext.FNum = cpu_to_be32(cnid); + key->ext.FABN = cpu_to_be16(block); +} + +/* + * hfs_ext_compare() + * + * Description: + * This is the comparison function used for the extents B-tree. In + * comparing extent B-tree entries, the file id is the most + * significant field (compared as unsigned ints); the fork type is + * the second most significant field (compared as unsigned chars); + * and the allocation block number field is the least significant + * (compared as unsigned ints). + * Input Variable(s): + * struct hfs_ext_key *key1: pointer to the first key to compare + * struct hfs_ext_key *key2: pointer to the second key to compare + * Output Variable(s): + * NONE + * Returns: + * int: negative if key1key2, and 0 if key1==key2 + * Preconditions: + * key1 and key2 point to "valid" (struct hfs_ext_key)s. + * Postconditions: + * This function has no side-effects */ +int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2) +{ + __be32 fnum1, fnum2; + __be16 block1, block2; + + fnum1 = key1->ext.FNum; + fnum2 = key2->ext.FNum; + if (fnum1 != fnum2) + return be32_to_cpu(fnum1) < be32_to_cpu(fnum2) ? -1 : 1; + if (key1->ext.FkType != key2->ext.FkType) + return key1->ext.FkType < key2->ext.FkType ? -1 : 1; + + block1 = key1->ext.FABN; + block2 = key2->ext.FABN; + if (block1 == block2) + return 0; + return be16_to_cpu(block1) < be16_to_cpu(block2) ? -1 : 1; +} + +/* + * hfs_ext_find_block + * + * Find a block within an extent record + */ +static u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off) +{ + int i; + u16 count; + + for (i = 0; i < 3; ext++, i++) { + count = be16_to_cpu(ext->count); + if (off < count) + return be16_to_cpu(ext->block) + off; + off -= count; + } + /* panic? 
*/ + return 0; +} + +static int hfs_ext_block_count(struct hfs_extent *ext) +{ + int i; + u16 count = 0; + + for (i = 0; i < 3; ext++, i++) + count += be16_to_cpu(ext->count); + return count; +} + +static u16 hfs_ext_lastblock(struct hfs_extent *ext) +{ + int i; + + ext += 2; + for (i = 0; i < 2; ext--, i++) + if (ext->count) + break; + return be16_to_cpu(ext->block) + be16_to_cpu(ext->count); +} + +static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) +{ + int res; + + hfs_ext_build_key(fd->search_key, inode->i_ino, HFS_I(inode)->cached_start, + HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); + res = hfs_brec_find(fd); + if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { + if (res != -ENOENT) + return; + hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); + HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); + } else { + if (res) + return; + hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength); + HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY; + } +} + +void hfs_ext_write_extent(struct inode *inode) +{ + struct hfs_find_data fd; + + if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { + hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + __hfs_ext_write_extent(inode, &fd); + hfs_find_exit(&fd); + } +} + +static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent, + u32 cnid, u32 block, u8 type) +{ + int res; + + hfs_ext_build_key(fd->search_key, cnid, block, type); + fd->key->ext.FNum = 0; + res = hfs_brec_find(fd); + if (res && res != -ENOENT) + return res; + if (fd->key->ext.FNum != fd->search_key->ext.FNum || + fd->key->ext.FkType != fd->search_key->ext.FkType) + return -ENOENT; + if (fd->entrylength != sizeof(hfs_extent_rec)) + return -EIO; + hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfs_extent_rec)); + return 0; +} + +static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) +{ + int res; + + if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) + __hfs_ext_write_extent(inode, fd); + + res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino, + block, HFS_IS_RSRC(inode) ? 
HFS_FK_RSRC : HFS_FK_DATA); + if (!res) { + HFS_I(inode)->cached_start = be16_to_cpu(fd->key->ext.FABN); + HFS_I(inode)->cached_blocks = hfs_ext_block_count(HFS_I(inode)->cached_extents); + } else { + HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; + HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); + } + return res; +} + +static int hfs_ext_read_extent(struct inode *inode, u16 block) +{ + struct hfs_find_data fd; + int res; + + if (block >= HFS_I(inode)->cached_start && + block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks) + return 0; + + hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + res = __hfs_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + return res; +} + +static void hfs_dump_extent(struct hfs_extent *extent) +{ + int i; + + dprint(DBG_EXTENT, " "); + for (i = 0; i < 3; i++) + dprint(DBG_EXTENT, " %u:%u", be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + dprint(DBG_EXTENT, "\n"); +} + +static int hfs_add_extent(struct hfs_extent *extent, u16 offset, + u16 alloc_block, u16 block_count) +{ + u16 count, start; + int i; + + hfs_dump_extent(extent); + for (i = 0; i < 3; extent++, i++) { + count = be16_to_cpu(extent->count); + if (offset == count) { + start = be16_to_cpu(extent->block); + if (alloc_block != start + count) { + if (++i >= 3) + return -ENOSPC; + extent++; + extent->block = cpu_to_be16(alloc_block); + } else + block_count += count; + extent->count = cpu_to_be16(block_count); + return 0; + } else if (offset < count) + break; + offset -= count; + } + /* panic? */ + return -EIO; +} + +static int hfs_free_extents(struct super_block *sb, struct hfs_extent *extent, + u16 offset, u16 block_nr) +{ + u16 count, start; + int i; + + hfs_dump_extent(extent); + for (i = 0; i < 3; extent++, i++) { + count = be16_to_cpu(extent->count); + if (offset == count) + goto found; + else if (offset < count) + break; + offset -= count; + } + /* panic? 
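+	 * (the offset is not covered by this extent record, so return -EIO
+	 * rather than guess which blocks to free)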
*/ + return -EIO; +found: + for (;;) { + start = be16_to_cpu(extent->block); + if (count <= block_nr) { + hfs_clear_vbm_bits(sb, start, count); + extent->block = 0; + extent->count = 0; + block_nr -= count; + } else { + count -= block_nr; + hfs_clear_vbm_bits(sb, start + count, block_nr); + extent->count = cpu_to_be16(count); + block_nr = 0; + } + if (!block_nr || !i) + return 0; + i--; + extent--; + count = be16_to_cpu(extent->count); + } +} + +int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) +{ + struct hfs_find_data fd; + u32 total_blocks, blocks, start; + u32 cnid = be32_to_cpu(file->FlNum); + struct hfs_extent *extent; + int res, i; + + if (type == HFS_FK_DATA) { + total_blocks = be32_to_cpu(file->PyLen); + extent = file->ExtRec; + } else { + total_blocks = be32_to_cpu(file->RPyLen); + extent = file->RExtRec; + } + total_blocks /= HFS_SB(sb)->alloc_blksz; + if (!total_blocks) + return 0; + + blocks = 0; + for (i = 0; i < 3; extent++, i++) + blocks += be16_to_cpu(extent[i].count); + + res = hfs_free_extents(sb, extent, blocks, blocks); + if (res) + return res; + if (total_blocks == blocks) + return 0; + + hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + do { + res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type); + if (res) + break; + start = be16_to_cpu(fd.key->ext.FABN); + hfs_free_extents(sb, extent, total_blocks - start, total_blocks); + hfs_brec_remove(&fd); + total_blocks = start; + } while (total_blocks > blocks); + hfs_find_exit(&fd); + + return res; +} + +/* + * hfs_get_block + */ +int hfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb; + u16 dblock, ablock; + int res; + + sb = inode->i_sb; + /* Convert inode block to disk allocation block */ + ablock = (u32)block / HFS_SB(sb)->fs_div; + + if (block >= HFS_I(inode)->fs_blocks) { + if (block > HFS_I(inode)->fs_blocks || !create) + return -EIO; + if (ablock >= HFS_I(inode)->alloc_blocks) { + res = hfs_extend_file(inode); + if (res) + return res; + } + } else + create = 0; + + if (ablock < HFS_I(inode)->first_blocks) { + dblock = hfs_ext_find_block(HFS_I(inode)->first_extents, ablock); + goto done; + } + + mutex_lock(&HFS_I(inode)->extents_lock); + res = hfs_ext_read_extent(inode, ablock); + if (!res) + dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, + ablock - HFS_I(inode)->cached_start); + else { + mutex_unlock(&HFS_I(inode)->extents_lock); + return -EIO; + } + mutex_unlock(&HFS_I(inode)->extents_lock); + +done: + map_bh(bh_result, sb, HFS_SB(sb)->fs_start + + dblock * HFS_SB(sb)->fs_div + + (u32)block % HFS_SB(sb)->fs_div); + + if (create) { + set_buffer_new(bh_result); + HFS_I(inode)->phys_size += sb->s_blocksize; + HFS_I(inode)->fs_blocks++; + inode_add_bytes(inode, sb->s_blocksize); + mark_inode_dirty(inode); + } + return 0; +} + +int hfs_extend_file(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + u32 start, len, goal; + int res; + + mutex_lock(&HFS_I(inode)->extents_lock); + if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) + goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); + else { + res = hfs_ext_read_extent(inode, HFS_I(inode)->alloc_blocks); + if (res) + goto out; + goal = hfs_ext_lastblock(HFS_I(inode)->cached_extents); + } + + len = HFS_I(inode)->clump_blocks; + start = hfs_vbm_search_free(sb, goal, &len); + if (!len) { + res = -ENOSPC; + goto out; + } + + dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + if (HFS_I(inode)->alloc_blocks == 
HFS_I(inode)->first_blocks) { + if (!HFS_I(inode)->first_blocks) { + dprint(DBG_EXTENT, "first extents\n"); + /* no extents yet */ + HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); + HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); + res = 0; + } else { + /* try to append to extents in inode */ + res = hfs_add_extent(HFS_I(inode)->first_extents, + HFS_I(inode)->alloc_blocks, + start, len); + if (res == -ENOSPC) + goto insert_extent; + } + if (!res) { + hfs_dump_extent(HFS_I(inode)->first_extents); + HFS_I(inode)->first_blocks += len; + } + } else { + res = hfs_add_extent(HFS_I(inode)->cached_extents, + HFS_I(inode)->alloc_blocks - + HFS_I(inode)->cached_start, + start, len); + if (!res) { + hfs_dump_extent(HFS_I(inode)->cached_extents); + HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; + HFS_I(inode)->cached_blocks += len; + } else if (res == -ENOSPC) + goto insert_extent; + } +out: + mutex_unlock(&HFS_I(inode)->extents_lock); + if (!res) { + HFS_I(inode)->alloc_blocks += len; + mark_inode_dirty(inode); + if (inode->i_ino < HFS_FIRSTUSER_CNID) + set_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags); + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; + } + return res; + +insert_extent: + dprint(DBG_EXTENT, "insert new extent\n"); + hfs_ext_write_extent(inode); + + memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); + HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start); + HFS_I(inode)->cached_extents[0].count = cpu_to_be16(len); + hfs_dump_extent(HFS_I(inode)->cached_extents); + HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW; + HFS_I(inode)->cached_start = HFS_I(inode)->alloc_blocks; + HFS_I(inode)->cached_blocks = len; + + res = 0; + goto out; +} + +void hfs_file_truncate(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + u16 blk_cnt, alloc_cnt, start; + u32 size; + int res; + + dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, + (long long)HFS_I(inode)->phys_size, inode->i_size); + if (inode->i_size > HFS_I(inode)->phys_size) { + struct address_space *mapping = inode->i_mapping; + void *fsdata; + struct page *page; + int res; + + /* XXX: Can use generic_cont_expand? 
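+		 * The zero-length pagecache_write_begin/write_end pair below
+		 * appears to have the same effect, relying on cont_write_begin()
+		 * to zero-fill the range between the old phys_size and the new
+		 * i_size.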
*/ + size = inode->i_size - 1; + res = pagecache_write_begin(NULL, mapping, size+1, 0, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (!res) { + res = pagecache_write_end(NULL, mapping, size+1, 0, 0, + page, fsdata); + } + if (res) + inode->i_size = HFS_I(inode)->phys_size; + return; + } else if (inode->i_size == HFS_I(inode)->phys_size) + return; + size = inode->i_size + HFS_SB(sb)->alloc_blksz - 1; + blk_cnt = size / HFS_SB(sb)->alloc_blksz; + alloc_cnt = HFS_I(inode)->alloc_blocks; + if (blk_cnt == alloc_cnt) + goto out; + + mutex_lock(&HFS_I(inode)->extents_lock); + hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + while (1) { + if (alloc_cnt == HFS_I(inode)->first_blocks) { + hfs_free_extents(sb, HFS_I(inode)->first_extents, + alloc_cnt, alloc_cnt - blk_cnt); + hfs_dump_extent(HFS_I(inode)->first_extents); + HFS_I(inode)->first_blocks = blk_cnt; + break; + } + res = __hfs_ext_cache_extent(&fd, inode, alloc_cnt); + if (res) + break; + start = HFS_I(inode)->cached_start; + hfs_free_extents(sb, HFS_I(inode)->cached_extents, + alloc_cnt - start, alloc_cnt - blk_cnt); + hfs_dump_extent(HFS_I(inode)->cached_extents); + if (blk_cnt > start) { + HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; + break; + } + alloc_cnt = start; + HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; + HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); + hfs_brec_remove(&fd); + } + hfs_find_exit(&fd); + mutex_unlock(&HFS_I(inode)->extents_lock); + + HFS_I(inode)->alloc_blocks = blk_cnt; +out: + HFS_I(inode)->phys_size = inode->i_size; + HFS_I(inode)->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); + mark_inode_dirty(inode); +} diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h new file mode 100644 index 00000000..6f194d07 --- /dev/null +++ b/fs/hfs/hfs.h @@ -0,0 +1,289 @@ +/* + * linux/fs/hfs/hfs.h + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + */ + +#ifndef _HFS_H +#define _HFS_H + +/* offsets to various blocks */ +#define HFS_DD_BLK 0 /* Driver Descriptor block */ +#define HFS_PMAP_BLK 1 /* First block of partition map */ +#define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ + +/* magic numbers for various disk blocks */ +#define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor map */ +#define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ +#define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ +#define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ +#define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ + +/* various FIXED size parameters */ +#define HFS_SECTOR_SIZE 512 /* size of an HFS sector */ +#define HFS_SECTOR_SIZE_BITS 9 /* log_2(HFS_SECTOR_SIZE) */ +#define HFS_NAMELEN 31 /* maximum length of an HFS filename */ +#define HFS_MAX_NAMELEN 128 +#define HFS_MAX_VALENCE 32767U + +/* Meanings of the drAtrb field of the MDB, + * Reference: _Inside Macintosh: Files_ p. 
2-61 + */ +#define HFS_SB_ATTRIB_HLOCK (1 << 7) +#define HFS_SB_ATTRIB_UNMNT (1 << 8) +#define HFS_SB_ATTRIB_SPARED (1 << 9) +#define HFS_SB_ATTRIB_INCNSTNT (1 << 11) +#define HFS_SB_ATTRIB_SLOCK (1 << 15) + +/* Some special File ID numbers */ +#define HFS_POR_CNID 1 /* Parent Of the Root */ +#define HFS_ROOT_CNID 2 /* ROOT directory */ +#define HFS_EXT_CNID 3 /* EXTents B-tree */ +#define HFS_CAT_CNID 4 /* CATalog B-tree */ +#define HFS_BAD_CNID 5 /* BAD blocks file */ +#define HFS_ALLOC_CNID 6 /* ALLOCation file (HFS+) */ +#define HFS_START_CNID 7 /* STARTup file (HFS+) */ +#define HFS_ATTR_CNID 8 /* ATTRibutes file (HFS+) */ +#define HFS_EXCH_CNID 15 /* ExchangeFiles temp id */ +#define HFS_FIRSTUSER_CNID 16 + +/* values for hfs_cat_rec.cdrType */ +#define HFS_CDR_DIR 0x01 /* folder (directory) */ +#define HFS_CDR_FIL 0x02 /* file */ +#define HFS_CDR_THD 0x03 /* folder (directory) thread */ +#define HFS_CDR_FTH 0x04 /* file thread */ + +/* legal values for hfs_ext_key.FkType and hfs_file.fork */ +#define HFS_FK_DATA 0x00 +#define HFS_FK_RSRC 0xFF + +/* bits in hfs_fil_entry.Flags */ +#define HFS_FIL_LOCK 0x01 /* locked */ +#define HFS_FIL_THD 0x02 /* file thread */ +#define HFS_FIL_DOPEN 0x04 /* data fork open */ +#define HFS_FIL_ROPEN 0x08 /* resource fork open */ +#define HFS_FIL_DIR 0x10 /* directory (always clear) */ +#define HFS_FIL_NOCOPY 0x40 /* copy-protected file */ +#define HFS_FIL_USED 0x80 /* open */ + +/* bits in hfs_dir_entry.Flags. dirflags is 16 bits. */ +#define HFS_DIR_LOCK 0x01 /* locked */ +#define HFS_DIR_THD 0x02 /* directory thread */ +#define HFS_DIR_INEXPFOLDER 0x04 /* in a shared area */ +#define HFS_DIR_MOUNTED 0x08 /* mounted */ +#define HFS_DIR_DIR 0x10 /* directory (always set) */ +#define HFS_DIR_EXPFOLDER 0x20 /* share point */ + +/* bits hfs_finfo.fdFlags */ +#define HFS_FLG_INITED 0x0100 +#define HFS_FLG_LOCKED 0x1000 +#define HFS_FLG_INVISIBLE 0x4000 + +/*======== HFS structures as they appear on the disk ========*/ + +/* Pascal-style string of up to 31 characters */ +struct hfs_name { + u8 len; + u8 name[HFS_NAMELEN]; +} __packed; + +struct hfs_point { + __be16 v; + __be16 h; +} __packed; + +struct hfs_rect { + __be16 top; + __be16 left; + __be16 bottom; + __be16 right; +} __packed; + +struct hfs_finfo { + __be32 fdType; + __be32 fdCreator; + __be16 fdFlags; + struct hfs_point fdLocation; + __be16 fdFldr; +} __packed; + +struct hfs_fxinfo { + __be16 fdIconID; + u8 fdUnused[8]; + __be16 fdComment; + __be32 fdPutAway; +} __packed; + +struct hfs_dinfo { + struct hfs_rect frRect; + __be16 frFlags; + struct hfs_point frLocation; + __be16 frView; +} __packed; + +struct hfs_dxinfo { + struct hfs_point frScroll; + __be32 frOpenChain; + __be16 frUnused; + __be16 frComment; + __be32 frPutAway; +} __packed; + +union hfs_finder_info { + struct { + struct hfs_finfo finfo; + struct hfs_fxinfo fxinfo; + } file; + struct { + struct hfs_dinfo dinfo; + struct hfs_dxinfo dxinfo; + } dir; +} __packed; + +/* Cast to a pointer to a generic bkey */ +#define HFS_BKEY(X) (((void)((X)->KeyLen)), ((struct hfs_bkey *)(X))) + +/* The key used in the catalog b-tree: */ +struct hfs_cat_key { + u8 key_len; /* number of bytes in the key */ + u8 reserved; /* padding */ + __be32 ParID; /* CNID of the parent dir */ + struct hfs_name CName; /* The filename of the entry */ +} __packed; + +/* The key used in the extents b-tree: */ +struct hfs_ext_key { + u8 key_len; /* number of bytes in the key */ + u8 FkType; /* HFS_FK_{DATA,RSRC} */ + __be32 FNum; /* The File ID of the file */ + __be16 
FABN; /* allocation blocks number*/ +} __packed; + +typedef union hfs_btree_key { + u8 key_len; /* number of bytes in the key */ + struct hfs_cat_key cat; + struct hfs_ext_key ext; +} hfs_btree_key; + +#define HFS_MAX_CAT_KEYLEN (sizeof(struct hfs_cat_key) - sizeof(u8)) +#define HFS_MAX_EXT_KEYLEN (sizeof(struct hfs_ext_key) - sizeof(u8)) + +typedef union hfs_btree_key btree_key; + +struct hfs_extent { + __be16 block; + __be16 count; +}; +typedef struct hfs_extent hfs_extent_rec[3]; + +/* The catalog record for a file */ +struct hfs_cat_file { + s8 type; /* The type of entry */ + u8 reserved; + u8 Flags; /* Flags such as read-only */ + s8 Typ; /* file version number = 0 */ + struct hfs_finfo UsrWds; /* data used by the Finder */ + __be32 FlNum; /* The CNID */ + __be16 StBlk; /* obsolete */ + __be32 LgLen; /* The logical EOF of the data fork*/ + __be32 PyLen; /* The physical EOF of the data fork */ + __be16 RStBlk; /* obsolete */ + __be32 RLgLen; /* The logical EOF of the rsrc fork */ + __be32 RPyLen; /* The physical EOF of the rsrc fork */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modified date */ + __be32 BkDat; /* The last backup date */ + struct hfs_fxinfo FndrInfo; /* more data for the Finder */ + __be16 ClpSize; /* number of bytes to allocate + when extending files */ + hfs_extent_rec ExtRec; /* first extent record + for the data fork */ + hfs_extent_rec RExtRec; /* first extent record + for the resource fork */ + u32 Resrv; /* reserved by Apple */ +} __packed; + +/* the catalog record for a directory */ +struct hfs_cat_dir { + s8 type; /* The type of entry */ + u8 reserved; + __be16 Flags; /* flags */ + __be16 Val; /* Valence: number of files and + dirs in the directory */ + __be32 DirID; /* The CNID */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modification date */ + __be32 BkDat; /* The last backup date */ + struct hfs_dinfo UsrInfo; /* data used by the Finder */ + struct hfs_dxinfo FndrInfo; /* more data used by Finder */ + u8 Resrv[16]; /* reserved by Apple */ +} __packed; + +/* the catalog record for a thread */ +struct hfs_cat_thread { + s8 type; /* The type of entry */ + u8 reserved[9]; /* reserved by Apple */ + __be32 ParID; /* CNID of parent directory */ + struct hfs_name CName; /* The name of this entry */ +} __packed; + +/* A catalog tree record */ +typedef union hfs_cat_rec { + s8 type; /* The type of entry */ + struct hfs_cat_file file; + struct hfs_cat_dir dir; + struct hfs_cat_thread thread; +} hfs_cat_rec; + +struct hfs_mdb { + __be16 drSigWord; /* Signature word indicating fs type */ + __be32 drCrDate; /* fs creation date/time */ + __be32 drLsMod; /* fs modification date/time */ + __be16 drAtrb; /* fs attributes */ + __be16 drNmFls; /* number of files in root directory */ + __be16 drVBMSt; /* location (in 512-byte blocks) + of the volume bitmap */ + __be16 drAllocPtr; /* location (in allocation blocks) + to begin next allocation search */ + __be16 drNmAlBlks; /* number of allocation blocks */ + __be32 drAlBlkSiz; /* bytes in an allocation block */ + __be32 drClpSiz; /* clumpsize, the number of bytes to + allocate when extending a file */ + __be16 drAlBlSt; /* location (in 512-byte blocks) + of the first allocation block */ + __be32 drNxtCNID; /* CNID to assign to the next + file or directory created */ + __be16 drFreeBks; /* number of free allocation blocks */ + u8 drVN[28]; /* the volume label */ + __be32 drVolBkUp; /* fs backup date/time */ + __be16 drVSeqNum; /* backup sequence number */ + __be32 drWrCnt; /* fs write count 
*/ + __be32 drXTClpSiz; /* clumpsize for the extents B-tree */ + __be32 drCTClpSiz; /* clumpsize for the catalog B-tree */ + __be16 drNmRtDirs; /* number of directories in + the root directory */ + __be32 drFilCnt; /* number of files in the fs */ + __be32 drDirCnt; /* number of directories in the fs */ + u8 drFndrInfo[32]; /* data used by the Finder */ + __be16 drEmbedSigWord; /* embedded volume signature */ + __be32 drEmbedExtent; /* starting block number (xdrStABN) + and number of allocation blocks + (xdrNumABlks) occupied by embedded + volume */ + __be32 drXTFlSize; /* bytes in the extents B-tree */ + hfs_extent_rec drXTExtRec; /* extents B-tree's first 3 extents */ + __be32 drCTFlSize; /* bytes in the catalog B-tree */ + hfs_extent_rec drCTExtRec; /* catalog B-tree's first 3 extents */ +} __packed; + +/*======== Data structures kept in memory ========*/ + +struct hfs_readdir_data { + struct list_head list; + struct file *file; + struct hfs_cat_key key; +}; + +#endif diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h new file mode 100644 index 00000000..ad97c2d5 --- /dev/null +++ b/fs/hfs/hfs_fs.h @@ -0,0 +1,276 @@ +/* + * linux/fs/hfs/hfs_fs.h + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + */ + +#ifndef _LINUX_HFS_FS_H +#define _LINUX_HFS_FS_H + +#include +#include +#include +#include +#include + +#include +#include + +#include "hfs.h" + +#define DBG_BNODE_REFS 0x00000001 +#define DBG_BNODE_MOD 0x00000002 +#define DBG_CAT_MOD 0x00000004 +#define DBG_INODE 0x00000008 +#define DBG_SUPER 0x00000010 +#define DBG_EXTENT 0x00000020 +#define DBG_BITMAP 0x00000040 + +//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP) +//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) +//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) +#define DBG_MASK (0) + +#define dprint(flg, fmt, args...) 
\ + if (flg & DBG_MASK) printk(fmt , ## args) + +/* + * struct hfs_inode_info + * + * The HFS-specific part of a Linux (struct inode) + */ +struct hfs_inode_info { + atomic_t opencnt; + + unsigned int flags; + + /* to deal with localtime ugliness */ + int tz_secondswest; + + struct hfs_cat_key cat_key; + + struct list_head open_dir_list; + struct inode *rsrc_inode; + + struct mutex extents_lock; + + u16 alloc_blocks, clump_blocks; + sector_t fs_blocks; + /* Allocation extents from catlog record or volume header */ + hfs_extent_rec first_extents; + u16 first_blocks; + hfs_extent_rec cached_extents; + u16 cached_start, cached_blocks; + + loff_t phys_size; + struct inode vfs_inode; +}; + +#define HFS_FLG_RSRC 0x0001 +#define HFS_FLG_EXT_DIRTY 0x0002 +#define HFS_FLG_EXT_NEW 0x0004 + +#define HFS_IS_RSRC(inode) (HFS_I(inode)->flags & HFS_FLG_RSRC) + +/* + * struct hfs_sb_info + * + * The HFS-specific part of a Linux (struct super_block) + */ +struct hfs_sb_info { + struct buffer_head *mdb_bh; /* The hfs_buffer + holding the real + superblock (aka VIB + or MDB) */ + struct hfs_mdb *mdb; + struct buffer_head *alt_mdb_bh; /* The hfs_buffer holding + the alternate superblock */ + struct hfs_mdb *alt_mdb; + __be32 *bitmap; /* The page holding the + allocation bitmap */ + struct hfs_btree *ext_tree; /* Information about + the extents b-tree */ + struct hfs_btree *cat_tree; /* Information about + the catalog b-tree */ + u32 file_count; /* The number of + regular files in + the filesystem */ + u32 folder_count; /* The number of + directories in the + filesystem */ + u32 next_id; /* The next available + file id number */ + u32 clumpablks; /* The number of allocation + blocks to try to add when + extending a file */ + u32 fs_start; /* The first 512-byte + block represented + in the bitmap */ + u32 part_start; + u16 root_files; /* The number of + regular + (non-directory) + files in the root + directory */ + u16 root_dirs; /* The number of + directories in the + root directory */ + u16 fs_ablocks; /* The number of + allocation blocks + in the filesystem */ + u16 free_ablocks; /* the number of unused + allocation blocks + in the filesystem */ + u32 alloc_blksz; /* The size of an + "allocation block" */ + int s_quiet; /* Silent failure when + changing owner or mode? 
*/ + __be32 s_type; /* Type for new files */ + __be32 s_creator; /* Creator for new files */ + umode_t s_file_umask; /* The umask applied to the + permissions on all files */ + umode_t s_dir_umask; /* The umask applied to the + permissions on all dirs */ + uid_t s_uid; /* The uid of all files */ + gid_t s_gid; /* The gid of all files */ + + int session, part; + + struct nls_table *nls_io, *nls_disk; + + struct mutex bitmap_lock; + + unsigned long flags; + + u16 blockoffset; + + int fs_div; +}; + +#define HFS_FLG_BITMAP_DIRTY 0 +#define HFS_FLG_MDB_DIRTY 1 +#define HFS_FLG_ALT_MDB_DIRTY 2 + +/* bitmap.c */ +extern u32 hfs_vbm_search_free(struct super_block *, u32, u32 *); +extern int hfs_clear_vbm_bits(struct super_block *, u16, u16); + +/* catalog.c */ +extern int hfs_cat_keycmp(const btree_key *, const btree_key *); +struct hfs_find_data; +extern int hfs_cat_find_brec(struct super_block *, u32, struct hfs_find_data *); +extern int hfs_cat_create(u32, struct inode *, struct qstr *, struct inode *); +extern int hfs_cat_delete(u32, struct inode *, struct qstr *); +extern int hfs_cat_move(u32, struct inode *, struct qstr *, + struct inode *, struct qstr *); +extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qstr *); + +/* dir.c */ +extern const struct file_operations hfs_dir_operations; +extern const struct inode_operations hfs_dir_inode_operations; + +/* extent.c */ +extern int hfs_ext_keycmp(const btree_key *, const btree_key *); +extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int); +extern void hfs_ext_write_extent(struct inode *); +extern int hfs_extend_file(struct inode *); +extern void hfs_file_truncate(struct inode *); + +extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); + +/* inode.c */ +extern const struct address_space_operations hfs_aops; +extern const struct address_space_operations hfs_btree_aops; + +extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); +extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); +extern int hfs_write_inode(struct inode *, struct writeback_control *); +extern int hfs_inode_setattr(struct dentry *, struct iattr *); +extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, + __be32 log_size, __be32 phys_size, u32 clump_size); +extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *); +extern void hfs_evict_inode(struct inode *); +extern void hfs_delete_inode(struct inode *); + +/* attr.c */ +extern int hfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern ssize_t hfs_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +extern ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* mdb.c */ +extern int hfs_mdb_get(struct super_block *); +extern void hfs_mdb_commit(struct super_block *); +extern void hfs_mdb_close(struct super_block *); +extern void hfs_mdb_put(struct super_block *); + +/* part_tbl.c */ +extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); + +/* string.c */ +extern const struct dentry_operations hfs_dentry_operations; + +extern int hfs_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); +extern int hfs_strcmp(const unsigned char *, unsigned int, + const unsigned char *, unsigned int); +extern int hfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const 
struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); + +/* trans.c */ +extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); +extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *); + +extern struct timezone sys_tz; + +/* + * There are two time systems. Both are based on seconds since + * a particular time/date. + * Unix: unsigned lil-endian since 00:00 GMT, Jan. 1, 1970 + * mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904 + * + */ +#define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) +#define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) + +#define HFS_I(inode) (list_entry(inode, struct hfs_inode_info, vfs_inode)) +#define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) + +#define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } +#define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) +#define hfs_mtime() __hfs_u_to_mtime(get_seconds()) + +static inline const char *hfs_mdb_name(struct super_block *sb) +{ + return sb->s_id; +} + +static inline void hfs_bitmap_dirty(struct super_block *sb) +{ + set_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; +} + +#define sb_bread512(sb, sec, data) ({ \ + struct buffer_head *__bh; \ + sector_t __block; \ + loff_t __start; \ + int __offset; \ + \ + __start = (loff_t)(sec) << HFS_SECTOR_SIZE_BITS;\ + __block = __start >> (sb)->s_blocksize_bits; \ + __offset = __start & ((sb)->s_blocksize - 1); \ + __bh = sb_bread((sb), __block); \ + if (likely(__bh != NULL)) \ + data = (void *)(__bh->b_data + __offset);\ + else \ + data = NULL; \ + __bh; \ +}) + +#endif diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c new file mode 100644 index 00000000..fff16c96 --- /dev/null +++ b/fs/hfs/inode.c @@ -0,0 +1,673 @@ +/* + * linux/fs/hfs/inode.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains inode-related functions which do not depend on + * which scheme is being used to represent forks. 
+ * + * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds + */ + +#include +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +static const struct file_operations hfs_file_operations; +static const struct inode_operations hfs_file_inode_operations; + +/*================ Variable-like macros ================*/ + +#define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO) + +static int hfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, hfs_get_block, wbc); +} + +static int hfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, hfs_get_block); +} + +static int hfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + hfs_get_block, + &HFS_I(mapping->host)->phys_size); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t hfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, hfs_get_block); +} + +static int hfs_releasepage(struct page *page, gfp_t mask) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct hfs_btree *tree; + struct hfs_bnode *node; + u32 nidx; + int i, res = 1; + + switch (inode->i_ino) { + case HFS_EXT_CNID: + tree = HFS_SB(sb)->ext_tree; + break; + case HFS_CAT_CNID: + tree = HFS_SB(sb)->cat_tree; + break; + default: + BUG(); + return 0; + } + + if (!tree) + return 0; + + if (tree->node_size >= PAGE_CACHE_SIZE) { + nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, nidx); + if (!node) + ; + else if (atomic_read(&node->refcnt)) + res = 0; + if (res && node) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } + spin_unlock(&tree->hash_lock); + } else { + nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); + i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); + spin_lock(&tree->hash_lock); + do { + node = hfs_bnode_findhash(tree, nidx++); + if (!node) + continue; + if (atomic_read(&node->refcnt)) { + res = 0; + break; + } + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } while (--i && nidx < tree->node_count); + spin_unlock(&tree->hash_lock); + } + return res ? try_to_free_buffers(page) : 0; +} + +static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, hfs_get_block, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. 
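+	 * (blockdev_direct_IO() may have called hfs_get_block() with create
+	 * set for blocks past the old EOF; vmtruncate() back to i_size lets
+	 * ->truncate release them again.)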
+ */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return ret; +} + +static int hfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, hfs_get_block); +} + +const struct address_space_operations hfs_btree_aops = { + .readpage = hfs_readpage, + .writepage = hfs_writepage, + .write_begin = hfs_write_begin, + .write_end = generic_write_end, + .bmap = hfs_bmap, + .releasepage = hfs_releasepage, +}; + +const struct address_space_operations hfs_aops = { + .readpage = hfs_readpage, + .writepage = hfs_writepage, + .write_begin = hfs_write_begin, + .write_end = generic_write_end, + .bmap = hfs_bmap, + .direct_IO = hfs_direct_IO, + .writepages = hfs_writepages, +}; + +/* + * hfs_new_inode + */ +struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode = new_inode(sb); + if (!inode) + return NULL; + + mutex_init(&HFS_I(inode)->extents_lock); + INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); + hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); + inode->i_ino = HFS_SB(sb)->next_id++; + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_nlink = 1; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + HFS_I(inode)->flags = 0; + HFS_I(inode)->rsrc_inode = NULL; + HFS_I(inode)->fs_blocks = 0; + if (S_ISDIR(mode)) { + inode->i_size = 2; + HFS_SB(sb)->folder_count++; + if (dir->i_ino == HFS_ROOT_CNID) + HFS_SB(sb)->root_dirs++; + inode->i_op = &hfs_dir_inode_operations; + inode->i_fop = &hfs_dir_operations; + inode->i_mode |= S_IRWXUGO; + inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; + } else if (S_ISREG(mode)) { + HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; + HFS_SB(sb)->file_count++; + if (dir->i_ino == HFS_ROOT_CNID) + HFS_SB(sb)->root_files++; + inode->i_op = &hfs_file_inode_operations; + inode->i_fop = &hfs_file_operations; + inode->i_mapping->a_ops = &hfs_aops; + inode->i_mode |= S_IRUGO|S_IXUGO; + if (mode & S_IWUSR) + inode->i_mode |= S_IWUGO; + inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask; + HFS_I(inode)->phys_size = 0; + HFS_I(inode)->alloc_blocks = 0; + HFS_I(inode)->first_blocks = 0; + HFS_I(inode)->cached_start = 0; + HFS_I(inode)->cached_blocks = 0; + memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec)); + memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); + } + insert_inode_hash(inode); + mark_inode_dirty(inode); + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; + + return inode; +} + +void hfs_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino); + if (S_ISDIR(inode->i_mode)) { + HFS_SB(sb)->folder_count--; + if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) + HFS_SB(sb)->root_dirs--; + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; + return; + } + HFS_SB(sb)->file_count--; + if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) + HFS_SB(sb)->root_files--; + if (S_ISREG(inode->i_mode)) { + if (!inode->i_nlink) { + inode->i_size = 0; + hfs_file_truncate(inode); + } + } + set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); + sb->s_dirt = 1; +} + +void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, + __be32 __log_size, 
__be32 phys_size, u32 clump_size) +{ + struct super_block *sb = inode->i_sb; + u32 log_size = be32_to_cpu(__log_size); + u16 count; + int i; + + memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec)); + for (count = 0, i = 0; i < 3; i++) + count += be16_to_cpu(ext[i].count); + HFS_I(inode)->first_blocks = count; + + inode->i_size = HFS_I(inode)->phys_size = log_size; + HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); + HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) / + HFS_SB(sb)->alloc_blksz; + HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz; + if (!HFS_I(inode)->clump_blocks) + HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; +} + +struct hfs_iget_data { + struct hfs_cat_key *key; + hfs_cat_rec *rec; +}; + +static int hfs_test_inode(struct inode *inode, void *data) +{ + struct hfs_iget_data *idata = data; + hfs_cat_rec *rec; + + rec = idata->rec; + switch (rec->type) { + case HFS_CDR_DIR: + return inode->i_ino == be32_to_cpu(rec->dir.DirID); + case HFS_CDR_FIL: + return inode->i_ino == be32_to_cpu(rec->file.FlNum); + default: + BUG(); + return 1; + } +} + +/* + * hfs_read_inode + */ +static int hfs_read_inode(struct inode *inode, void *data) +{ + struct hfs_iget_data *idata = data; + struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); + hfs_cat_rec *rec; + + HFS_I(inode)->flags = 0; + HFS_I(inode)->rsrc_inode = NULL; + mutex_init(&HFS_I(inode)->extents_lock); + INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); + + /* Initialize the inode */ + inode->i_uid = hsb->s_uid; + inode->i_gid = hsb->s_gid; + inode->i_nlink = 1; + + if (idata->key) + HFS_I(inode)->cat_key = *idata->key; + else + HFS_I(inode)->flags |= HFS_FLG_RSRC; + HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; + + rec = idata->rec; + switch (rec->type) { + case HFS_CDR_FIL: + if (!HFS_IS_RSRC(inode)) { + hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen, + rec->file.PyLen, be16_to_cpu(rec->file.ClpSize)); + } else { + hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen, + rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize)); + } + + inode->i_ino = be32_to_cpu(rec->file.FlNum); + inode->i_mode = S_IRUGO | S_IXUGO; + if (!(rec->file.Flags & HFS_FIL_LOCK)) + inode->i_mode |= S_IWUGO; + inode->i_mode &= ~hsb->s_file_umask; + inode->i_mode |= S_IFREG; + inode->i_ctime = inode->i_atime = inode->i_mtime = + hfs_m_to_utime(rec->file.MdDat); + inode->i_op = &hfs_file_inode_operations; + inode->i_fop = &hfs_file_operations; + inode->i_mapping->a_ops = &hfs_aops; + break; + case HFS_CDR_DIR: + inode->i_ino = be32_to_cpu(rec->dir.DirID); + inode->i_size = be16_to_cpu(rec->dir.Val) + 2; + HFS_I(inode)->fs_blocks = 0; + inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); + inode->i_ctime = inode->i_atime = inode->i_mtime = + hfs_m_to_utime(rec->dir.MdDat); + inode->i_op = &hfs_dir_inode_operations; + inode->i_fop = &hfs_dir_operations; + break; + default: + make_bad_inode(inode); + } + return 0; +} + +/* + * __hfs_iget() + * + * Given the MDB for a HFS filesystem, a 'key' and an 'entry' in + * the catalog B-tree and the 'type' of the desired file return the + * inode for that file/directory or NULL. Note that 'type' indicates + * whether we want the actual file or directory, or the corresponding + * metadata (AppleDouble header file or CAP metadata file). 
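+ *
+ * Note: in this version there is no 'type' argument; the kind of inode
+ * built is taken from rec->type, and resource-fork inodes are instead
+ * created separately in hfs_file_lookup().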
+ */ +struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec) +{ + struct hfs_iget_data data = { key, rec }; + struct inode *inode; + u32 cnid; + + switch (rec->type) { + case HFS_CDR_DIR: + cnid = be32_to_cpu(rec->dir.DirID); + break; + case HFS_CDR_FIL: + cnid = be32_to_cpu(rec->file.FlNum); + break; + default: + return NULL; + } + inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); + if (inode && (inode->i_state & I_NEW)) + unlock_new_inode(inode); + return inode; +} + +void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, + __be32 *log_size, __be32 *phys_size) +{ + memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec)); + + if (log_size) + *log_size = cpu_to_be32(inode->i_size); + if (phys_size) + *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks * + HFS_SB(inode->i_sb)->alloc_blksz); +} + +int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct inode *main_inode = inode; + struct hfs_find_data fd; + hfs_cat_rec rec; + + dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino); + hfs_ext_write_extent(inode); + + if (inode->i_ino < HFS_FIRSTUSER_CNID) { + switch (inode->i_ino) { + case HFS_ROOT_CNID: + break; + case HFS_EXT_CNID: + hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree); + return 0; + case HFS_CAT_CNID: + hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree); + return 0; + default: + BUG(); + return -EIO; + } + } + + if (HFS_IS_RSRC(inode)) + main_inode = HFS_I(inode)->rsrc_inode; + + if (!main_inode->i_nlink) + return 0; + + if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd)) + /* panic? */ + return -EIO; + + fd.search_key->cat = HFS_I(main_inode)->cat_key; + if (hfs_brec_find(&fd)) + /* panic? */ + goto out; + + if (S_ISDIR(main_inode->i_mode)) { + if (fd.entrylength < sizeof(struct hfs_cat_dir)) + /* panic? */; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_dir)); + if (rec.type != HFS_CDR_DIR || + be32_to_cpu(rec.dir.DirID) != inode->i_ino) { + } + + rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime); + rec.dir.Val = cpu_to_be16(inode->i_size - 2); + + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_dir)); + } else if (HFS_IS_RSRC(inode)) { + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + hfs_inode_write_fork(inode, rec.file.RExtRec, + &rec.file.RLgLen, &rec.file.RPyLen); + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + } else { + if (fd.entrylength < sizeof(struct hfs_cat_file)) + /* panic? 
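+			 * (an undersized record would indicate catalog
+			 * corruption; as written the code falls through and
+			 * still reads and rewrites a full struct hfs_cat_file)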
*/; + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + if (rec.type != HFS_CDR_FIL || + be32_to_cpu(rec.file.FlNum) != inode->i_ino) { + } + + if (inode->i_mode & S_IWUSR) + rec.file.Flags &= ~HFS_FIL_LOCK; + else + rec.file.Flags |= HFS_FIL_LOCK; + hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen); + rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime); + + hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, + sizeof(struct hfs_cat_file)); + } +out: + hfs_find_exit(&fd); + return 0; +} + +static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + hfs_cat_rec rec; + struct hfs_find_data fd; + int res; + + if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) + goto out; + + inode = HFS_I(dir)->rsrc_inode; + if (inode) + goto out; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + fd.search_key->cat = HFS_I(dir)->cat_key; + res = hfs_brec_read(&fd, &rec, sizeof(rec)); + if (!res) { + struct hfs_iget_data idata = { NULL, &rec }; + hfs_read_inode(inode, &idata); + } + hfs_find_exit(&fd); + if (res) { + iput(inode); + return ERR_PTR(res); + } + HFS_I(inode)->rsrc_inode = dir; + HFS_I(dir)->rsrc_inode = inode; + igrab(dir); + hlist_add_fake(&inode->i_hash); + mark_inode_dirty(inode); +out: + d_add(dentry, inode); + return NULL; +} + +void hfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { + HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; + iput(HFS_I(inode)->rsrc_inode); + } +} + +static int hfs_file_open(struct inode *inode, struct file *file) +{ + if (HFS_IS_RSRC(inode)) + inode = HFS_I(inode)->rsrc_inode; + atomic_inc(&HFS_I(inode)->opencnt); + return 0; +} + +static int hfs_file_release(struct inode *inode, struct file *file) +{ + //struct super_block *sb = inode->i_sb; + + if (HFS_IS_RSRC(inode)) + inode = HFS_I(inode)->rsrc_inode; + if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { + mutex_lock(&inode->i_mutex); + hfs_file_truncate(inode); + //if (inode->i_flags & S_DEAD) { + // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); + // hfs_delete_inode(inode); + //} + mutex_unlock(&inode->i_mutex); + } + return 0; +} + +/* + * hfs_notify_change() + * + * Based very closely on fs/msdos/inode.c by Werner Almesberger + * + * This is the notify_change() field in the super_operations structure + * for HFS file systems. The purpose is to take that changes made to + * an inode and apply then in a filesystem-dependent manner. In this + * case the process has a few of tasks to do: + * 1) prevent changes to the i_uid and i_gid fields. + * 2) map file permissions to the closest allowable permissions + * 3) Since multiple Linux files can share the same on-disk inode under + * HFS (for instance the data and resource forks of a file) a change + * to permissions must be applied to all other in-core inodes which + * correspond to the same HFS file. 
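+ *
+ * For example, a plain chmod on a regular file only ever changes the
+ * write bits, and changes them for user, group and other together:
+ *	attr->ia_mode = (inode->i_mode | S_IWUGO) & ~hsb->s_file_umask;
+ * when S_IWUSR is requested, and with S_IWUGO cleared otherwise.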
+ */ + +int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) +{ + struct inode *inode = dentry->d_inode; + struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); + int error; + + error = inode_change_ok(inode, attr); /* basic permission checks */ + if (error) + return error; + + /* no uig/gid changes and limit which mode bits can be set */ + if (((attr->ia_valid & ATTR_UID) && + (attr->ia_uid != hsb->s_uid)) || + ((attr->ia_valid & ATTR_GID) && + (attr->ia_gid != hsb->s_gid)) || + ((attr->ia_valid & ATTR_MODE) && + ((S_ISDIR(inode->i_mode) && + (attr->ia_mode != inode->i_mode)) || + (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) { + return hsb->s_quiet ? 0 : error; + } + + if (attr->ia_valid & ATTR_MODE) { + /* Only the 'w' bits can ever change and only all together. */ + if (attr->ia_mode & S_IWUSR) + attr->ia_mode = inode->i_mode | S_IWUGO; + else + attr->ia_mode = inode->i_mode & ~S_IWUGO; + attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask; + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static int hfs_file_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + struct super_block * sb; + int ret, err; + + /* sync the inode to buffers */ + ret = write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + if (sb->s_dirt) { + lock_super(sb); + sb->s_dirt = 0; + if (!(sb->s_flags & MS_RDONLY)) + hfs_mdb_commit(sb); + unlock_super(sb); + } + /* .. finally sync the buffers to disk */ + err = sync_blockdev(sb->s_bdev); + if (!ret) + ret = err; + return ret; +} + +static const struct file_operations hfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .fsync = hfs_file_fsync, + .open = hfs_file_open, + .release = hfs_file_release, +}; + +static const struct inode_operations hfs_file_inode_operations = { + .lookup = hfs_file_lookup, + .truncate = hfs_file_truncate, + .setattr = hfs_inode_setattr, + .setxattr = hfs_setxattr, + .getxattr = hfs_getxattr, + .listxattr = hfs_listxattr, +}; diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c new file mode 100644 index 00000000..1563d5ce --- /dev/null +++ b/fs/hfs/mdb.c @@ -0,0 +1,354 @@ +/* + * linux/fs/hfs/mdb.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains functions for reading/writing the MDB. + */ + +#include +#include +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +/*================ File-local data types ================*/ + +/* + * The HFS Master Directory Block (MDB). + * + * Also known as the Volume Information Block (VIB), this structure is + * the HFS equivalent of a superblock. 
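+ * The in-core copy stays mapped through HFS_SB(sb)->mdb / mdb_bh for the
+ * life of the mount; an alternate (backup) copy is looked for in the
+ * second-to-last 512-byte block of the partition.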
+ * + * Reference: _Inside Macintosh: Files_ pages 2-59 through 2-62 + * + * modified for HFS Extended + */ + +static int hfs_get_last_session(struct super_block *sb, + sector_t *start, sector_t *size) +{ + struct cdrom_multisession ms_info; + struct cdrom_tocentry te; + int res; + + /* default values */ + *start = 0; + *size = sb->s_bdev->bd_inode->i_size >> 9; + + if (HFS_SB(sb)->session >= 0) { + te.cdte_track = HFS_SB(sb)->session; + te.cdte_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); + if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { + *start = (sector_t)te.cdte_addr.lba << 2; + return 0; + } + printk(KERN_ERR "hfs: invalid session number or type of track\n"); + return -EINVAL; + } + ms_info.addr_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info); + if (!res && ms_info.xa_flag) + *start = (sector_t)ms_info.addr.lba << 2; + return 0; +} + +/* + * hfs_mdb_get() + * + * Build the in-core MDB for a filesystem, including + * the B-trees and the volume bitmap. + */ +int hfs_mdb_get(struct super_block *sb) +{ + struct buffer_head *bh; + struct hfs_mdb *mdb, *mdb2; + unsigned int block; + char *ptr; + int off2, len, size, sect; + sector_t part_start, part_size; + loff_t off; + __be16 attrib; + + /* set the device driver to 512-byte blocks */ + size = sb_min_blocksize(sb, HFS_SECTOR_SIZE); + if (!size) + return -EINVAL; + + if (hfs_get_last_session(sb, &part_start, &part_size)) + return -EINVAL; + while (1) { + /* See if this is an HFS filesystem */ + bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); + if (!bh) + goto out; + + if (mdb->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) + break; + brelse(bh); + + /* check for a partition block + * (should do this only for cdrom/loop though) + */ + if (hfs_part_find(sb, &part_start, &part_size)) + goto out; + } + + HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); + if (!size || (size & (HFS_SECTOR_SIZE - 1))) { + printk(KERN_ERR "hfs: bad allocation block size %d\n", size); + goto out_bh; + } + + size = min(HFS_SB(sb)->alloc_blksz, (u32)PAGE_SIZE); + /* size must be a multiple of 512 */ + while (size & (size - 1)) + size -= HFS_SECTOR_SIZE; + sect = be16_to_cpu(mdb->drAlBlSt) + part_start; + /* align block size to first sector */ + while (sect & ((size - 1) >> HFS_SECTOR_SIZE_BITS)) + size >>= 1; + /* align block size to weird alloc size */ + while (HFS_SB(sb)->alloc_blksz & (size - 1)) + size >>= 1; + brelse(bh); + if (!sb_set_blocksize(sb, size)) { + printk(KERN_ERR "hfs: unable to set blocksize to %u\n", size); + goto out; + } + + bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); + if (!bh) + goto out; + if (mdb->drSigWord != cpu_to_be16(HFS_SUPER_MAGIC)) + goto out_bh; + + HFS_SB(sb)->mdb_bh = bh; + HFS_SB(sb)->mdb = mdb; + + /* These parameters are read from the MDB, and never written */ + HFS_SB(sb)->part_start = part_start; + HFS_SB(sb)->fs_ablocks = be16_to_cpu(mdb->drNmAlBlks); + HFS_SB(sb)->fs_div = HFS_SB(sb)->alloc_blksz >> sb->s_blocksize_bits; + HFS_SB(sb)->clumpablks = be32_to_cpu(mdb->drClpSiz) / + HFS_SB(sb)->alloc_blksz; + if (!HFS_SB(sb)->clumpablks) + HFS_SB(sb)->clumpablks = 1; + HFS_SB(sb)->fs_start = (be16_to_cpu(mdb->drAlBlSt) + part_start) >> + (sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS); + + /* These parameters are read from and written to the MDB */ + HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); + HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID); + HFS_SB(sb)->root_files = 
be16_to_cpu(mdb->drNmFls); + HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); + HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt); + HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt); + + /* TRY to get the alternate (backup) MDB. */ + sect = part_start + part_size - 2; + bh = sb_bread512(sb, sect, mdb2); + if (bh) { + if (mdb2->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) { + HFS_SB(sb)->alt_mdb_bh = bh; + HFS_SB(sb)->alt_mdb = mdb2; + } else + brelse(bh); + } + + if (!HFS_SB(sb)->alt_mdb) { + printk(KERN_WARNING "hfs: unable to locate alternate MDB\n"); + printk(KERN_WARNING "hfs: continuing without an alternate MDB\n"); + } + + HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0); + if (!HFS_SB(sb)->bitmap) + goto out; + + /* read in the bitmap */ + block = be16_to_cpu(mdb->drVBMSt) + part_start; + off = (loff_t)block << HFS_SECTOR_SIZE_BITS; + size = (HFS_SB(sb)->fs_ablocks + 8) / 8; + ptr = (u8 *)HFS_SB(sb)->bitmap; + while (size) { + bh = sb_bread(sb, off >> sb->s_blocksize_bits); + if (!bh) { + printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + goto out; + } + off2 = off & (sb->s_blocksize - 1); + len = min((int)sb->s_blocksize - off2, size); + memcpy(ptr, bh->b_data + off2, len); + brelse(bh); + ptr += len; + off += len; + size -= len; + } + + HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); + if (!HFS_SB(sb)->ext_tree) { + printk(KERN_ERR "hfs: unable to open extent tree\n"); + goto out; + } + HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); + if (!HFS_SB(sb)->cat_tree) { + printk(KERN_ERR "hfs: unable to open catalog tree\n"); + goto out; + } + + attrib = mdb->drAtrb; + if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { + printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " + "running fsck.hfs is recommended. mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } + if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { + printk(KERN_WARNING "hfs: filesystem is marked locked, mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } + if (!(sb->s_flags & MS_RDONLY)) { + /* Mark the volume uncleanly unmounted in case we crash */ + attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); + attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); + mdb->drAtrb = attrib; + be32_add_cpu(&mdb->drWrCnt, 1); + mdb->drLsMod = hfs_mtime(); + + mark_buffer_dirty(HFS_SB(sb)->mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->mdb_bh); + } + + return 0; + +out_bh: + brelse(bh); +out: + hfs_mdb_put(sb); + return -EIO; +} + +/* + * hfs_mdb_commit() + * + * Description: + * This updates the MDB on disk (look also at hfs_write_super()). + * It does not check, if the superblock has been modified, or + * if the filesystem has been mounted read-only. It is mainly + * called by hfs_write_super() and hfs_btree_extend(). + * Input Variable(s): + * struct hfs_mdb *mdb: Pointer to the hfs MDB + * int backup; + * Output Variable(s): + * NONE + * Returns: + * void + * Preconditions: + * 'mdb' points to a "valid" (struct hfs_mdb). + * Postconditions: + * The HFS MDB and on disk will be updated, by copying the possibly + * modified fields from the in memory MDB (in native byte order) to + * the disk block buffer. + * If 'backup' is non-zero then the alternate MDB is also written + * and the function doesn't return until it is actually on disk. 
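+ *
+ * Note: in this version the function takes the super_block and the
+ * 'backup' decision is driven by the HFS_FLG_ALT_MDB_DIRTY bit; the MDB
+ * proper, the alternate MDB and the volume bitmap are each written back
+ * only when their respective dirty flag is set.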
+ */ +void hfs_mdb_commit(struct super_block *sb) +{ + struct hfs_mdb *mdb = HFS_SB(sb)->mdb; + + if (test_and_clear_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags)) { + /* These parameters may have been modified, so write them back */ + mdb->drLsMod = hfs_mtime(); + mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); + mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id); + mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); + mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); + mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count); + mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count); + + /* write MDB to disk */ + mark_buffer_dirty(HFS_SB(sb)->mdb_bh); + } + + /* write the backup MDB, not returning until it is written. + * we only do this when either the catalog or extents overflow + * files grow. */ + if (test_and_clear_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags) && + HFS_SB(sb)->alt_mdb) { + hfs_inode_write_fork(HFS_SB(sb)->ext_tree->inode, mdb->drXTExtRec, + &mdb->drXTFlSize, NULL); + hfs_inode_write_fork(HFS_SB(sb)->cat_tree->inode, mdb->drCTExtRec, + &mdb->drCTFlSize, NULL); + memcpy(HFS_SB(sb)->alt_mdb, HFS_SB(sb)->mdb, HFS_SECTOR_SIZE); + HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); + HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); + mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); + } + + if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { + struct buffer_head *bh; + sector_t block; + char *ptr; + int off, size, len; + + block = be16_to_cpu(HFS_SB(sb)->mdb->drVBMSt) + HFS_SB(sb)->part_start; + off = (block << HFS_SECTOR_SIZE_BITS) & (sb->s_blocksize - 1); + block >>= sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS; + size = (HFS_SB(sb)->fs_ablocks + 7) / 8; + ptr = (u8 *)HFS_SB(sb)->bitmap; + while (size) { + bh = sb_bread(sb, block); + if (!bh) { + printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + break; + } + len = min((int)sb->s_blocksize - off, size); + memcpy(bh->b_data + off, ptr, len); + mark_buffer_dirty(bh); + brelse(bh); + block++; + off = 0; + ptr += len; + size -= len; + } + } +} + +void hfs_mdb_close(struct super_block *sb) +{ + /* update volume attributes */ + if (sb->s_flags & MS_RDONLY) + return; + HFS_SB(sb)->mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); + HFS_SB(sb)->mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); + mark_buffer_dirty(HFS_SB(sb)->mdb_bh); +} + +/* + * hfs_mdb_put() + * + * Release the resources associated with the in-core MDB. */ +void hfs_mdb_put(struct super_block *sb) +{ + if (!HFS_SB(sb)) + return; + /* free the B-trees */ + hfs_btree_close(HFS_SB(sb)->ext_tree); + hfs_btree_close(HFS_SB(sb)->cat_tree); + + /* free the buffers holding the primary and alternate MDBs */ + brelse(HFS_SB(sb)->mdb_bh); + brelse(HFS_SB(sb)->alt_mdb_bh); + + unload_nls(HFS_SB(sb)->nls_io); + unload_nls(HFS_SB(sb)->nls_disk); + + free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); + kfree(HFS_SB(sb)); + sb->s_fs_info = NULL; +} diff --git a/fs/hfs/part_tbl.c b/fs/hfs/part_tbl.c new file mode 100644 index 00000000..36add537 --- /dev/null +++ b/fs/hfs/part_tbl.c @@ -0,0 +1,117 @@ +/* + * linux/fs/hfs/part_tbl.c + * + * Copyright (C) 1996-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * Original code to handle the new style Mac partition table based on + * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). 
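+ *
+ * hfs_part_find() below is called from hfs_mdb_get() when the block at
+ * HFS_MDB_BLK does not carry the HFS signature; it adjusts *part_start
+ * and *part_size in place so the MDB probe can be retried inside the
+ * selected partition.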
+ */ + +#include "hfs_fs.h" + +/* + * The new style Mac partition map + * + * For each partition on the media there is a physical block (512-byte + * block) containing one of these structures. These blocks are + * contiguous starting at block 1. + */ +struct new_pmap { + __be16 pmSig; /* signature */ + __be16 reSigPad; /* padding */ + __be32 pmMapBlkCnt; /* partition blocks count */ + __be32 pmPyPartStart; /* physical block start of partition */ + __be32 pmPartBlkCnt; /* physical block count of partition */ + u8 pmPartName[32]; /* (null terminated?) string + giving the name of this + partition */ + u8 pmPartType[32]; /* (null terminated?) string + giving the type of this + partition */ + /* a bunch more stuff we don't need */ +} __packed; + +/* + * The old style Mac partition map + * + * The partition map consists for a 2-byte signature followed by an + * array of these structures. The map is terminated with an all-zero + * one of these. + */ +struct old_pmap { + __be16 pdSig; /* Signature bytes */ + struct old_pmap_entry { + __be32 pdStart; + __be32 pdSize; + __be32 pdFSID; + } pdEntry[42]; +} __packed; + +/* + * hfs_part_find() + * + * Parse the partition map looking for the + * start and length of the 'part'th HFS partition. + */ +int hfs_part_find(struct super_block *sb, + sector_t *part_start, sector_t *part_size) +{ + struct buffer_head *bh; + __be16 *data; + int i, size, res; + + res = -ENOENT; + bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data); + if (!bh) + return -EIO; + + switch (be16_to_cpu(*data)) { + case HFS_OLD_PMAP_MAGIC: + { + struct old_pmap *pm; + struct old_pmap_entry *p; + + pm = (struct old_pmap *)bh->b_data; + p = pm->pdEntry; + size = 42; + for (i = 0; i < size; p++, i++) { + if (p->pdStart && p->pdSize && + p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && + (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) { + *part_start += be32_to_cpu(p->pdStart); + *part_size = be32_to_cpu(p->pdSize); + res = 0; + } + } + break; + } + case HFS_NEW_PMAP_MAGIC: + { + struct new_pmap *pm; + + pm = (struct new_pmap *)bh->b_data; + size = be32_to_cpu(pm->pmMapBlkCnt); + for (i = 0; i < size;) { + if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && + (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) { + *part_start += be32_to_cpu(pm->pmPyPartStart); + *part_size = be32_to_cpu(pm->pmPartBlkCnt); + res = 0; + break; + } + brelse(bh); + bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm); + if (!bh) + return -EIO; + if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC)) + break; + } + break; + } + } + brelse(bh); + + return res; +} diff --git a/fs/hfs/string.c b/fs/hfs/string.c new file mode 100644 index 00000000..495a976a --- /dev/null +++ b/fs/hfs/string.c @@ -0,0 +1,116 @@ +/* + * linux/fs/hfs/string.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the string comparison function for the + * Macintosh character set. + * + * The code in this file is derived from code which is copyright + * 1986, 1989, 1990 by Abacus Research and Development, Inc. (ARDI) + * It is used here by the permission of ARDI's president Cliff Matthews. 
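hfs_part_find() above walks either the old-style or the new-style Apple partition map, starting at physical block HFS_PMAP_BLK. A rough user-space illustration of telling the two map formats apart; the magic values are assumptions matching the usual HFS_OLD_PMAP_MAGIC/HFS_NEW_PMAP_MAGIC definitions and the code is not part of this patch:

#include <stdint.h>

enum pmap_kind { PMAP_NONE, PMAP_OLD, PMAP_NEW };

/* Classify one 512-byte partition-map block by its big-endian signature:
 * 0x504D ("PM") marks a new-style entry, 0x5453 ("TS") an old-style map. */
static enum pmap_kind classify_pmap(const unsigned char blk[512])
{
	uint16_t sig = (uint16_t)((blk[0] << 8) | blk[1]);

	if (sig == 0x504D)
		return PMAP_NEW;
	if (sig == 0x5453)
		return PMAP_OLD;
	return PMAP_NONE;
}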
+ */ + +#include "hfs_fs.h" +#include + +/*================ File-local variables ================*/ + +/* + * unsigned char caseorder[] + * + * Defines the lexical ordering of characters on the Macintosh + * + * Composition of the 'casefold' and 'order' tables from ARDI's code + * with the entry for 0x20 changed to match that for 0xCA to remove + * special case for those two characters. + */ +static unsigned char caseorder[256] = { + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, + 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, + 0x20,0x22,0x23,0x28,0x29,0x2A,0x2B,0x2C,0x2F,0x30,0x31,0x32,0x33,0x34,0x35,0x36, + 0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,0x40,0x41,0x42,0x43,0x44,0x45,0x46, + 0x47,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, + 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xA9,0xAA,0xAB,0xAC,0xAD, + 0x4E,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, + 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xAF,0xB0,0xB1,0xB2,0xB3, + 0x4A,0x4C,0x5A,0x60,0x7B,0x7F,0x98,0x4F,0x49,0x51,0x4A,0x4B,0x4C,0x5A,0x60,0x63, + 0x64,0x65,0x6E,0x6F,0x70,0x71,0x7B,0x84,0x85,0x86,0x7F,0x80,0x9A,0x9B,0x9C,0x98, + 0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0x94,0xBB,0xBC,0xBD,0xBE,0xBF,0xC0,0x4D,0x81, + 0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0x55,0x8A,0xCC,0x4D,0x81, + 0xCD,0xCE,0xCF,0xD0,0xD1,0xD2,0xD3,0x26,0x27,0xD4,0x20,0x49,0x4B,0x80,0x82,0x82, + 0xD5,0xD6,0x24,0x25,0x2D,0x2E,0xD7,0xD8,0xA6,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, + 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, + 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF +}; + +/*================ Global functions ================*/ + +/* + * Hash a string to an integer in a case-independent way + */ +int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) +{ + const unsigned char *name = this->name; + unsigned int hash, len = this->len; + + if (len > HFS_NAMELEN) + len = HFS_NAMELEN; + + hash = init_name_hash(); + for (; len; len--) + hash = partial_name_hash(caseorder[*name++], hash); + this->hash = end_name_hash(hash); + return 0; +} + +/* + * Compare two strings in the HFS filename character ordering + * Returns positive, negative, or zero, not just 0 or (+/-)1 + * + * Equivalent to ARDI's call: + * ROMlib_RelString(s1+1, s2+1, true, false, (s1[0]<<16) | s2[0]) + */ +int hfs_strcmp(const unsigned char *s1, unsigned int len1, + const unsigned char *s2, unsigned int len2) +{ + int len, tmp; + + len = (len1 > len2) ? len2 : len1; + + while (len--) { + tmp = (int)caseorder[*(s1++)] - (int)caseorder[*(s2++)]; + if (tmp) + return tmp; + } + return len1 - len2; +} + +/* + * Test for equality of two strings in the HFS filename character ordering. 
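The caseorder[] table above both folds case and fixes the Macintosh lexical ordering used by hfs_strcmp() and the dentry hash, so names differing only in case collate identically. A standalone sketch of that idea (illustrative only; 'fold' stands in for caseorder[]):

#include <stddef.h>

/* Return 1 when two names of equal length compare equal under the folding
 * table, e.g. "ReadMe" and "README" with a case-folding table. */
static int example_names_equal(const unsigned char *a, const unsigned char *b,
			       size_t len, const unsigned char fold[256])
{
	size_t i;

	for (i = 0; i < len; i++)
		if (fold[a[i]] != fold[b[i]])
			return 0;
	return 1;
}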
+ * return 1 on failure and 0 on success + */ +int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + const unsigned char *n1, *n2; + + if (len >= HFS_NAMELEN) { + if (name->len < HFS_NAMELEN) + return 1; + len = HFS_NAMELEN; + } else if (len != name->len) + return 1; + + n1 = str; + n2 = name->name; + while (len--) { + if (caseorder[*n1++] != caseorder[*n2++]) + return 1; + } + return 0; +} diff --git a/fs/hfs/super.c b/fs/hfs/super.c new file mode 100644 index 00000000..1b55f704 --- /dev/null +++ b/fs/hfs/super.c @@ -0,0 +1,493 @@ +/* + * linux/fs/hfs/super.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains hfs_read_super(), some of the super_ops and + * init_hfs_fs() and exit_hfs_fs(). The remaining super_ops are in + * inode.c since they deal with inodes. + * + * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs_fs.h" +#include "btree.h" + +static struct kmem_cache *hfs_inode_cachep; + +MODULE_LICENSE("GPL"); + +/* + * hfs_write_super() + * + * Description: + * This function is called by the VFS only. When the filesystem + * is mounted r/w it updates the MDB on disk. + * Input Variable(s): + * struct super_block *sb: Pointer to the hfs superblock + * Output Variable(s): + * NONE + * Returns: + * void + * Preconditions: + * 'sb' points to a "valid" (struct super_block). + * Postconditions: + * The MDB is marked 'unsuccessfully unmounted' by clearing bit 8 of drAtrb + * (hfs_put_super() must set this flag!). Some MDB fields are updated + * and the MDB buffer is written to disk by calling hfs_mdb_commit(). + */ +static void hfs_write_super(struct super_block *sb) +{ + lock_super(sb); + sb->s_dirt = 0; + + /* sync everything to the buffers */ + if (!(sb->s_flags & MS_RDONLY)) + hfs_mdb_commit(sb); + unlock_super(sb); +} + +static int hfs_sync_fs(struct super_block *sb, int wait) +{ + lock_super(sb); + hfs_mdb_commit(sb); + sb->s_dirt = 0; + unlock_super(sb); + + return 0; +} + +/* + * hfs_put_super() + * + * This is the put_super() entry in the super_operations structure for + * HFS filesystems. The purpose is to release the resources + * associated with the superblock sb. + */ +static void hfs_put_super(struct super_block *sb) +{ + if (sb->s_dirt) + hfs_write_super(sb); + hfs_mdb_close(sb); + /* release the MDB's resources */ + hfs_mdb_put(sb); +} + +/* + * hfs_statfs() + * + * This is the statfs() entry in the super_operations structure for + * HFS filesystems. The purpose is to return various data about the + * filesystem. + * + * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks. 
+ */ +static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = HFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div; + buf->f_bfree = (u32)HFS_SB(sb)->free_ablocks * HFS_SB(sb)->fs_div; + buf->f_bavail = buf->f_bfree; + buf->f_files = HFS_SB(sb)->fs_ablocks; + buf->f_ffree = HFS_SB(sb)->free_ablocks; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = HFS_NAMELEN; + + return 0; +} + +static int hfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NODIRATIME; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + if (!(*flags & MS_RDONLY)) { + if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { + printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " + "running fsck.hfs is recommended. leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { + printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } + } + return 0; +} + +static int hfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct hfs_sb_info *sbi = HFS_SB(mnt->mnt_sb); + + if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) + seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); + if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) + seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); + seq_printf(seq, ",uid=%u,gid=%u", sbi->s_uid, sbi->s_gid); + if (sbi->s_file_umask != 0133) + seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); + if (sbi->s_dir_umask != 0022) + seq_printf(seq, ",dir_umask=%o", sbi->s_dir_umask); + if (sbi->part >= 0) + seq_printf(seq, ",part=%u", sbi->part); + if (sbi->session >= 0) + seq_printf(seq, ",session=%u", sbi->session); + if (sbi->nls_disk) + seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset); + if (sbi->nls_io) + seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset); + if (sbi->s_quiet) + seq_printf(seq, ",quiet"); + return 0; +} + +static struct inode *hfs_alloc_inode(struct super_block *sb) +{ + struct hfs_inode_info *i; + + i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL); + return i ? 
&i->vfs_inode : NULL; +} + +static void hfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); +} + +static void hfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfs_i_callback); +} + +static const struct super_operations hfs_super_operations = { + .alloc_inode = hfs_alloc_inode, + .destroy_inode = hfs_destroy_inode, + .write_inode = hfs_write_inode, + .evict_inode = hfs_evict_inode, + .put_super = hfs_put_super, + .write_super = hfs_write_super, + .sync_fs = hfs_sync_fs, + .statfs = hfs_statfs, + .remount_fs = hfs_remount, + .show_options = hfs_show_options, +}; + +enum { + opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask, + opt_part, opt_session, opt_type, opt_creator, opt_quiet, + opt_codepage, opt_iocharset, + opt_err +}; + +static const match_table_t tokens = { + { opt_uid, "uid=%u" }, + { opt_gid, "gid=%u" }, + { opt_umask, "umask=%o" }, + { opt_file_umask, "file_umask=%o" }, + { opt_dir_umask, "dir_umask=%o" }, + { opt_part, "part=%u" }, + { opt_session, "session=%u" }, + { opt_type, "type=%s" }, + { opt_creator, "creator=%s" }, + { opt_quiet, "quiet" }, + { opt_codepage, "codepage=%s" }, + { opt_iocharset, "iocharset=%s" }, + { opt_err, NULL } +}; + +static inline int match_fourchar(substring_t *arg, u32 *result) +{ + if (arg->to - arg->from != 4) + return -EINVAL; + memcpy(result, arg->from, 4); + return 0; +} + +/* + * parse_options() + * + * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger + * This function is called by hfs_read_super() to parse the mount options. + */ +static int parse_options(char *options, struct hfs_sb_info *hsb) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int tmp, token; + + /* initialize the sb with defaults */ + hsb->s_uid = current_uid(); + hsb->s_gid = current_gid(); + hsb->s_file_umask = 0133; + hsb->s_dir_umask = 0022; + hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' 
*/ + hsb->s_quiet = 0; + hsb->part = -1; + hsb->session = -1; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_uid: + if (match_int(&args[0], &tmp)) { + printk(KERN_ERR "hfs: uid requires an argument\n"); + return 0; + } + hsb->s_uid = (uid_t)tmp; + break; + case opt_gid: + if (match_int(&args[0], &tmp)) { + printk(KERN_ERR "hfs: gid requires an argument\n"); + return 0; + } + hsb->s_gid = (gid_t)tmp; + break; + case opt_umask: + if (match_octal(&args[0], &tmp)) { + printk(KERN_ERR "hfs: umask requires a value\n"); + return 0; + } + hsb->s_file_umask = (umode_t)tmp; + hsb->s_dir_umask = (umode_t)tmp; + break; + case opt_file_umask: + if (match_octal(&args[0], &tmp)) { + printk(KERN_ERR "hfs: file_umask requires a value\n"); + return 0; + } + hsb->s_file_umask = (umode_t)tmp; + break; + case opt_dir_umask: + if (match_octal(&args[0], &tmp)) { + printk(KERN_ERR "hfs: dir_umask requires a value\n"); + return 0; + } + hsb->s_dir_umask = (umode_t)tmp; + break; + case opt_part: + if (match_int(&args[0], &hsb->part)) { + printk(KERN_ERR "hfs: part requires an argument\n"); + return 0; + } + break; + case opt_session: + if (match_int(&args[0], &hsb->session)) { + printk(KERN_ERR "hfs: session requires an argument\n"); + return 0; + } + break; + case opt_type: + if (match_fourchar(&args[0], &hsb->s_type)) { + printk(KERN_ERR "hfs: type requires a 4 character value\n"); + return 0; + } + break; + case opt_creator: + if (match_fourchar(&args[0], &hsb->s_creator)) { + printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + return 0; + } + break; + case opt_quiet: + hsb->s_quiet = 1; + break; + case opt_codepage: + if (hsb->nls_disk) { + printk(KERN_ERR "hfs: unable to change codepage\n"); + return 0; + } + p = match_strdup(&args[0]); + if (p) + hsb->nls_disk = load_nls(p); + if (!hsb->nls_disk) { + printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p); + kfree(p); + return 0; + } + kfree(p); + break; + case opt_iocharset: + if (hsb->nls_io) { + printk(KERN_ERR "hfs: unable to change iocharset\n"); + return 0; + } + p = match_strdup(&args[0]); + if (p) + hsb->nls_io = load_nls(p); + if (!hsb->nls_io) { + printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p); + kfree(p); + return 0; + } + kfree(p); + break; + default: + return 0; + } + } + + if (hsb->nls_disk && !hsb->nls_io) { + hsb->nls_io = load_nls_default(); + if (!hsb->nls_io) { + printk(KERN_ERR "hfs: unable to load default iocharset\n"); + return 0; + } + } + hsb->s_dir_umask &= 0777; + hsb->s_file_umask &= 0577; + + return 1; +} + +/* + * hfs_read_super() + * + * This is the function that is responsible for mounting an HFS + * filesystem. It performs all the tasks necessary to get enough data + * from the disk to read the root inode. This includes parsing the + * mount options, dealing with Macintosh partitions, reading the + * superblock and the allocation bitmap blocks, calling + * hfs_btree_init() to get the necessary data about the extents and + * catalog B-trees and, finally, reading the root inode into memory. 
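parse_options() above accepts numeric, octal and four-character options; match_fourchar() stores type/creator codes as the raw four bytes with no byte swapping. An illustrative standalone equivalent, plus a hypothetical mount invocation (device path and option values are examples only, not part of this patch):

#include <stdint.h>
#include <string.h>

/* Pack a four-character code the way match_fourchar() does: copy the raw
 * bytes so "TEXT" keeps its on-disk representation. */
static uint32_t example_fourchar(const char code[4])
{
	uint32_t v;

	memcpy(&v, code, 4);
	return v;
}

/* A typical mount line using these options might look like:
 *   mount -t hfs -o uid=1000,type=TEXT,creator=ttxt /dev/sdb1 /mnt
 */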
+ */ +static int hfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct hfs_sb_info *sbi; + struct hfs_find_data fd; + hfs_cat_rec rec; + struct inode *root_inode; + int res; + + sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + res = -EINVAL; + if (!parse_options((char *)data, sbi)) { + printk(KERN_ERR "hfs: unable to parse mount options.\n"); + goto bail; + } + + sb->s_op = &hfs_super_operations; + sb->s_flags |= MS_NODIRATIME; + mutex_init(&sbi->bitmap_lock); + + res = hfs_mdb_get(sb); + if (res) { + if (!silent) + printk(KERN_WARNING "hfs: can't find a HFS filesystem on dev %s.\n", + hfs_mdb_name(sb)); + res = -EINVAL; + goto bail; + } + + /* try to get the root inode */ + hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); + if (!res) { + if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + res = -EIO; + goto bail; + } + hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + } + if (res) { + hfs_find_exit(&fd); + goto bail_no_root; + } + res = -EINVAL; + root_inode = hfs_iget(sb, &fd.search_key->cat, &rec); + hfs_find_exit(&fd); + if (!root_inode) + goto bail_no_root; + + sb->s_d_op = &hfs_dentry_operations; + res = -ENOMEM; + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto bail_iput; + + /* everything's okay */ + return 0; + +bail_iput: + iput(root_inode); +bail_no_root: + printk(KERN_ERR "hfs: get root inode failed.\n"); +bail: + hfs_mdb_put(sb); + return res; +} + +static struct dentry *hfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super); +} + +static struct file_system_type hfs_fs_type = { + .owner = THIS_MODULE, + .name = "hfs", + .mount = hfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void hfs_init_once(void *p) +{ + struct hfs_inode_info *i = p; + + inode_init_once(&i->vfs_inode); +} + +static int __init init_hfs_fs(void) +{ + int err; + + hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", + sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, + hfs_init_once); + if (!hfs_inode_cachep) + return -ENOMEM; + err = register_filesystem(&hfs_fs_type); + if (err) + kmem_cache_destroy(hfs_inode_cachep); + return err; +} + +static void __exit exit_hfs_fs(void) +{ + unregister_filesystem(&hfs_fs_type); + kmem_cache_destroy(hfs_inode_cachep); +} + +module_init(init_hfs_fs) +module_exit(exit_hfs_fs) diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c new file mode 100644 index 00000000..19cf291e --- /dev/null +++ b/fs/hfs/sysdep.c @@ -0,0 +1,45 @@ +/* + * linux/fs/hfs/sysdep.c + * + * Copyright (C) 1996 Paul H. Hargrove + * (C) 2003 Ardis Technologies + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains the code to do various system dependent things. 
+ */ + +#include +#include "hfs_fs.h" + +/* dentry case-handling: just lowercase everything */ + +static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + int diff; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + if(!inode) + return 1; + + /* fix up inode on a timezone change */ + diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest; + if (diff) { + inode->i_ctime.tv_sec += diff; + inode->i_atime.tv_sec += diff; + inode->i_mtime.tv_sec += diff; + HFS_I(inode)->tz_secondswest += diff; + } + return 1; +} + +const struct dentry_operations hfs_dentry_operations = +{ + .d_revalidate = hfs_revalidate_dentry, + .d_hash = hfs_hash_dentry, + .d_compare = hfs_compare_dentry, +}; + diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c new file mode 100644 index 00000000..b1ce4c7a --- /dev/null +++ b/fs/hfs/trans.c @@ -0,0 +1,150 @@ +/* + * linux/fs/hfs/trans.c + * + * Copyright (C) 1995-1997 Paul H. Hargrove + * This file may be distributed under the terms of the GNU General Public License. + * + * This file contains routines for converting between the Macintosh + * character set and various other encodings. This includes dealing + * with ':' vs. '/' as the path-element separator. + */ + +#include +#include + +#include "hfs_fs.h" + +/*================ Global functions ================*/ + +/* + * hfs_mac2asc() + * + * Given a 'Pascal String' (a string preceded by a length byte) in + * the Macintosh character set produce the corresponding filename using + * the 'trivial' name-mangling scheme, returning the length of the + * mangled filename. Note that the output string is not NULL + * terminated. + * + * The name-mangling works as follows: + * The character '/', which is illegal in Linux filenames is replaced + * by ':' which never appears in HFS filenames. All other characters + * are passed unchanged from input to output. + */ +int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in) +{ + struct nls_table *nls_disk = HFS_SB(sb)->nls_disk; + struct nls_table *nls_io = HFS_SB(sb)->nls_io; + const char *src; + char *dst; + int srclen, dstlen, size; + + src = in->name; + srclen = in->len; + if (srclen > HFS_NAMELEN) + srclen = HFS_NAMELEN; + dst = out; + dstlen = HFS_MAX_NAMELEN; + if (nls_io) { + wchar_t ch; + + while (srclen > 0) { + if (nls_disk) { + size = nls_disk->char2uni(src, srclen, &ch); + if (size <= 0) { + ch = '?'; + size = 1; + } + src += size; + srclen -= size; + } else { + ch = *src++; + srclen--; + } + if (ch == '/') + ch = ':'; + size = nls_io->uni2char(ch, dst, dstlen); + if (size < 0) { + if (size == -ENAMETOOLONG) + goto out; + *dst = '?'; + size = 1; + } + dst += size; + dstlen -= size; + } + } else { + char ch; + + while (--srclen >= 0) + *dst++ = (ch = *src++) == '/' ? ':' : ch; + } +out: + return dst - out; +} + +/* + * hfs_asc2mac() + * + * Given an ASCII string (not null-terminated) and its length, + * generate the corresponding filename in the Macintosh character set + * using the 'trivial' name-mangling scheme, returning the length of + * the mangled filename. Note that the output string is not NULL + * terminated. + * + * This routine is a inverse to hfs_mac2triv(). + * A ':' is replaced by a '/'. 
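When no NLS tables are loaded, the conversion described above degenerates to the 'trivial' scheme: '/' (illegal in Linux names) and ':' (illegal in HFS names) simply swap. A minimal sketch of the Mac-to-Linux direction (illustrative only, not part of this patch):

/* Mac-to-Linux direction of the trivial scheme: '/' in the Macintosh name
 * becomes ':' in the Linux name; every other byte passes through unchanged. */
static void example_mac2asc_trivial(char *dst, const char *src, int len)
{
	while (len-- > 0) {
		char c = *src++;

		*dst++ = (c == '/') ? ':' : c;
	}
}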
+ */ +void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, struct qstr *in) +{ + struct nls_table *nls_disk = HFS_SB(sb)->nls_disk; + struct nls_table *nls_io = HFS_SB(sb)->nls_io; + const char *src; + char *dst; + int srclen, dstlen, size; + + src = in->name; + srclen = in->len; + dst = out->name; + dstlen = HFS_NAMELEN; + if (nls_io) { + wchar_t ch; + + while (srclen > 0) { + size = nls_io->char2uni(src, srclen, &ch); + if (size < 0) { + ch = '?'; + size = 1; + } + src += size; + srclen -= size; + if (ch == ':') + ch = '/'; + if (nls_disk) { + size = nls_disk->uni2char(ch, dst, dstlen); + if (size < 0) { + if (size == -ENAMETOOLONG) + goto out; + *dst = '?'; + size = 1; + } + dst += size; + dstlen -= size; + } else { + *dst++ = ch > 0xff ? '?' : ch; + dstlen--; + } + } + } else { + char ch; + + if (dstlen > srclen) + dstlen = srclen; + while (--dstlen >= 0) + *dst++ = (ch = *src++) == ':' ? '/' : ch; + } +out: + out->len = dst - (char *)out->name; + dstlen = HFS_NAMELEN - out->len; + while (--dstlen >= 0) + *dst++ = 0; +} diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig new file mode 100644 index 00000000..a6337181 --- /dev/null +++ b/fs/hfsplus/Kconfig @@ -0,0 +1,13 @@ +config HFSPLUS_FS + tristate "Apple Extended HFS file system support" + depends on BLOCK + select NLS + select NLS_UTF8 + help + If you say Y here, you will be able to mount extended format + Macintosh-formatted hard drive partitions with full read-write access. + + This file system is often called HFS+ and was introduced with + MacOS 8. It includes all Mac specific filesystem data such as + data forks and creator codes, but it also has several UNIX + style features such as file ownership and permissions. diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile new file mode 100644 index 00000000..3cc0df73 --- /dev/null +++ b/fs/hfsplus/Makefile @@ -0,0 +1,9 @@ +# +## Makefile for the linux hfsplus filesystem routines. 
+# + +obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o + +hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ + bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o + diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c new file mode 100644 index 00000000..5d799c13 --- /dev/null +++ b/fs/hfsplus/bfind.c @@ -0,0 +1,224 @@ +/* + * linux/fs/hfsplus/bfind.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Search routines for btrees + */ + +#include +#include "hfsplus_fs.h" + +int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) +{ + void *ptr; + + fd->tree = tree; + fd->bnode = NULL; + ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + fd->search_key = ptr; + fd->key = ptr + tree->max_key_len + 2; + dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", + tree->cnid, __builtin_return_address(0)); + mutex_lock(&tree->tree_lock); + return 0; +} + +void hfs_find_exit(struct hfs_find_data *fd) +{ + hfs_bnode_put(fd->bnode); + kfree(fd->search_key); + dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", + fd->tree->cnid, __builtin_return_address(0)); + mutex_unlock(&fd->tree->tree_lock); + fd->tree = NULL; +} + +/* Find the record in bnode that best matches key (not greater than...)*/ +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) +{ + int cmpval; + u16 off, len, keylen; + int rec; + int b, e; + int res; + + b = 0; + e = bnode->num_recs - 1; + res = -ENOENT; + do { + rec = (e + b) / 2; + len = hfs_brec_lenoff(bnode, rec, &off); + keylen = hfs_brec_keylen(bnode, rec); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + cmpval = bnode->tree->keycmp(fd->key, fd->search_key); + if (!cmpval) { + e = rec; + res = 0; + goto done; + } + if (cmpval < 0) + b = rec + 1; + else + e = rec - 1; + } while (b <= e); + if (rec != e && e >= 0) { + len = hfs_brec_lenoff(bnode, e, &off); + keylen = hfs_brec_keylen(bnode, e); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } + hfs_bnode_read(bnode, fd->key, off, keylen); + } +done: + fd->record = e; + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; +fail: + return res; +} + +/* Traverse a B*Tree from the root to a leaf finding best fit to key */ +/* Return allocated copy of node found, set recnum to best record */ +int hfs_brec_find(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + u32 nidx, parent; + __be32 data; + int height, res; + + tree = fd->tree; + if (fd->bnode) + hfs_bnode_put(fd->bnode); + fd->bnode = NULL; + nidx = tree->root; + if (!nidx) + return -ENOENT; + height = tree->depth; + res = 0; + parent = 0; + for (;;) { + bnode = hfs_bnode_find(tree, nidx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + break; + } + if (bnode->height != height) + goto invalid; + if (bnode->type != (--height ? 
HFS_NODE_INDEX : HFS_NODE_LEAF)) + goto invalid; + bnode->parent = parent; + + res = __hfs_brec_find(bnode, fd); + if (!height) + break; + if (fd->record < 0) + goto release; + + parent = nidx; + hfs_bnode_read(bnode, &data, fd->entryoffset, 4); + nidx = be32_to_cpu(data); + hfs_bnode_put(bnode); + } + fd->bnode = bnode; + return res; + +invalid: + printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + height, bnode->height, bnode->type, nidx, parent); + res = -EIO; +release: + hfs_bnode_put(bnode); + return res; +} + +int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) +{ + int res; + + res = hfs_brec_find(fd); + if (res) + return res; + if (fd->entrylength > rec_len) + return -EINVAL; + hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); + return 0; +} + +int hfs_brec_goto(struct hfs_find_data *fd, int cnt) +{ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + int idx, res = 0; + u16 off, len, keylen; + + bnode = fd->bnode; + tree = bnode->tree; + + if (cnt < 0) { + cnt = -cnt; + while (cnt > fd->record) { + cnt -= fd->record + 1; + fd->record = bnode->num_recs - 1; + idx = bnode->prev; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record -= cnt; + } else { + while (cnt >= bnode->num_recs - fd->record) { + cnt -= bnode->num_recs - fd->record; + fd->record = 0; + idx = bnode->next; + if (!idx) { + res = -ENOENT; + goto out; + } + hfs_bnode_put(bnode); + bnode = hfs_bnode_find(tree, idx); + if (IS_ERR(bnode)) { + res = PTR_ERR(bnode); + bnode = NULL; + goto out; + } + } + fd->record += cnt; + } + + len = hfs_brec_lenoff(bnode, fd->record, &off); + keylen = hfs_brec_keylen(bnode, fd->record); + if (keylen == 0) { + res = -EINVAL; + goto out; + } + fd->keyoffset = off; + fd->keylength = keylen; + fd->entryoffset = off + keylen; + fd->entrylength = len - keylen; + hfs_bnode_read(bnode, fd->key, off, keylen); +out: + fd->bnode = bnode; + return res; +} diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c new file mode 100644 index 00000000..1cad80c7 --- /dev/null +++ b/fs/hfsplus/bitmap.c @@ -0,0 +1,235 @@ +/* + * linux/fs/hfsplus/bitmap.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of allocation file + */ + +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +#define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8) + +int hfsplus_block_allocate(struct super_block *sb, u32 size, + u32 offset, u32 *max) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct page *page; + struct address_space *mapping; + __be32 *pptr, *curr, *end; + u32 mask, start, len, n; + __be32 val; + int i; + + len = *max; + if (!len) + return size; + + dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } + pptr = kmap(page); + curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; + i = offset % 32; + offset &= ~(PAGE_CACHE_BITS - 1); + if ((size ^ offset) / PAGE_CACHE_BITS) + end = pptr + PAGE_CACHE_BITS / 32; + else + end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; + + /* scan the first partial u32 for zero bits */ + val = *curr; + if (~val) { + n = be32_to_cpu(val); + mask = (1U << 31) >> i; + for (; i < 32; mask >>= 1, i++) { + if 
(!(n & mask)) + goto found; + } + } + curr++; + + /* scan complete u32s for the first zero bit */ + while (1) { + while (curr < end) { + val = *curr; + if (~val) { + n = be32_to_cpu(val); + mask = 1 << 31; + for (i = 0; i < 32; mask >>= 1, i++) { + if (!(n & mask)) + goto found; + } + } + curr++; + } + kunmap(page); + offset += PAGE_CACHE_BITS; + if (offset >= size) + break; + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, + NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } + curr = pptr = kmap(page); + if ((size ^ offset) / PAGE_CACHE_BITS) + end = pptr + PAGE_CACHE_BITS / 32; + else + end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; + } + dprint(DBG_BITMAP, "bitmap full\n"); + start = size; + goto out; + +found: + start = offset + (curr - pptr) * 32 + i; + if (start >= size) { + dprint(DBG_BITMAP, "bitmap full\n"); + goto out; + } + /* do any partial u32 at the start */ + len = min(size - start, len); + while (1) { + n |= mask; + if (++i >= 32) + break; + mask >>= 1; + if (!--len || n & mask) + goto done; + } + if (!--len) + goto done; + *curr++ = cpu_to_be32(n); + /* do full u32s */ + while (1) { + while (curr < end) { + n = be32_to_cpu(*curr); + if (len < 32) + goto last; + if (n) { + len = 32; + goto last; + } + *curr++ = cpu_to_be32(0xffffffff); + len -= 32; + } + set_page_dirty(page); + kunmap(page); + offset += PAGE_CACHE_BITS; + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, + NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } + pptr = kmap(page); + curr = pptr; + end = pptr + PAGE_CACHE_BITS / 32; + } +last: + /* do any partial u32 at end */ + mask = 1U << 31; + for (i = 0; i < len; i++) { + if (n & mask) + break; + n |= mask; + mask >>= 1; + } +done: + *curr = cpu_to_be32(n); + set_page_dirty(page); + kunmap(page); + *max = offset + (curr - pptr) * 32 + i - start; + sbi->free_blocks -= *max; + sb->s_dirt = 1; + dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); +out: + mutex_unlock(&sbi->alloc_mutex); + return start; +} + +int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct page *page; + struct address_space *mapping; + __be32 *pptr, *curr, *end; + u32 mask, len, pnr; + int i; + + /* is there any actual work to be done? */ + if (!count) + return 0; + + dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); + /* are all of the bits in range? 
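hfsplus_block_allocate() above scans the allocation file one big-endian 32-bit word at a time, looking for the first clear bit starting from the most significant end. A standalone sketch of that inner scan (the word is assumed to have already been converted with be32_to_cpu; not part of this patch):

#include <stdint.h>

/* Return the offset of the first zero bit in 'word', counting from the
 * most significant bit, or -1 if every block in this word is allocated. */
static int example_first_zero_bit(uint32_t word)
{
	uint32_t mask = 1U << 31;
	int i;

	for (i = 0; i < 32; i++, mask >>= 1)
		if (!(word & mask))
			return i;	/* 0 means the most significant bit */
	return -1;
}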
*/ + if ((offset + count) > sbi->total_blocks) + return -2; + + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; + pnr = offset / PAGE_CACHE_BITS; + page = read_mapping_page(mapping, pnr, NULL); + pptr = kmap(page); + curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; + end = pptr + PAGE_CACHE_BITS / 32; + len = count; + + /* do any partial u32 at the start */ + i = offset % 32; + if (i) { + int j = 32 - i; + mask = 0xffffffffU << j; + if (j > count) { + mask |= 0xffffffffU >> (i + count); + *curr++ &= cpu_to_be32(mask); + goto out; + } + *curr++ &= cpu_to_be32(mask); + count -= j; + } + + /* do full u32s */ + while (1) { + while (curr < end) { + if (count < 32) + goto done; + *curr++ = 0; + count -= 32; + } + if (!count) + break; + set_page_dirty(page); + kunmap(page); + page = read_mapping_page(mapping, ++pnr, NULL); + pptr = kmap(page); + curr = pptr; + end = pptr + PAGE_CACHE_BITS / 32; + } +done: + /* do any partial u32 at end */ + if (count) { + mask = 0xffffffffU >> count; + *curr &= cpu_to_be32(mask); + } +out: + set_page_dirty(page); + kunmap(page); + sbi->free_blocks += len; + sb->s_dirt = 1; + mutex_unlock(&sbi->alloc_mutex); + + return 0; +} diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c new file mode 100644 index 00000000..1c42cc5b --- /dev/null +++ b/fs/hfsplus/bnode.c @@ -0,0 +1,656 @@ +/* + * linux/fs/hfsplus/bnode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle basic btree node operations + */ + +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* Copy a specified range of bytes from the raw data of a node */ +void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) +{ + struct page **pagep; + int l; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + off &= ~PAGE_CACHE_MASK; + + l = min(len, (int)PAGE_CACHE_SIZE - off); + memcpy(buf, kmap(*pagep) + off, l); + kunmap(*pagep); + + while ((len -= l) != 0) { + buf += l; + l = min(len, (int)PAGE_CACHE_SIZE); + memcpy(buf, kmap(*++pagep), l); + kunmap(*pagep); + } +} + +u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) +{ + __be16 data; + /* TODO: optimize later... */ + hfs_bnode_read(node, &data, off, 2); + return be16_to_cpu(data); +} + +u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) +{ + u8 data; + /* TODO: optimize later... */ + hfs_bnode_read(node, &data, off, 1); + return data; +} + +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) +{ + struct hfs_btree *tree; + int key_len; + + tree = node->tree; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = hfs_bnode_read_u16(node, off) + 2; + else + key_len = tree->max_key_len + 2; + + hfs_bnode_read(node, key, off, key_len); +} + +void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) +{ + struct page **pagep; + int l; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + off &= ~PAGE_CACHE_MASK; + + l = min(len, (int)PAGE_CACHE_SIZE - off); + memcpy(kmap(*pagep) + off, buf, l); + set_page_dirty(*pagep); + kunmap(*pagep); + + while ((len -= l) != 0) { + buf += l; + l = min(len, (int)PAGE_CACHE_SIZE); + memcpy(kmap(*++pagep), buf, l); + set_page_dirty(*pagep); + kunmap(*pagep); + } +} + +void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) +{ + __be16 v = cpu_to_be16(data); + /* TODO: optimize later... 
*/ + hfs_bnode_write(node, &v, off, 2); +} + +void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) +{ + struct page **pagep; + int l; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + off &= ~PAGE_CACHE_MASK; + + l = min(len, (int)PAGE_CACHE_SIZE - off); + memset(kmap(*pagep) + off, 0, l); + set_page_dirty(*pagep); + kunmap(*pagep); + + while ((len -= l) != 0) { + l = min(len, (int)PAGE_CACHE_SIZE); + memset(kmap(*++pagep), 0, l); + set_page_dirty(*pagep); + kunmap(*pagep); + } +} + +void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, + struct hfs_bnode *src_node, int src, int len) +{ + struct hfs_btree *tree; + struct page **src_page, **dst_page; + int l; + + dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + tree = src_node->tree; + src += src_node->page_offset; + dst += dst_node->page_offset; + src_page = src_node->page + (src >> PAGE_CACHE_SHIFT); + src &= ~PAGE_CACHE_MASK; + dst_page = dst_node->page + (dst >> PAGE_CACHE_SHIFT); + dst &= ~PAGE_CACHE_MASK; + + if (src == dst) { + l = min(len, (int)PAGE_CACHE_SIZE - src); + memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + + while ((len -= l) != 0) { + l = min(len, (int)PAGE_CACHE_SIZE); + memcpy(kmap(*++dst_page), kmap(*++src_page), l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + } + } else { + void *src_ptr, *dst_ptr; + + do { + src_ptr = kmap(*src_page) + src; + dst_ptr = kmap(*dst_page) + dst; + if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) { + l = PAGE_CACHE_SIZE - src; + src = 0; + dst += l; + } else { + l = PAGE_CACHE_SIZE - dst; + src += l; + dst = 0; + } + l = min(len, l); + memcpy(dst_ptr, src_ptr, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + if (!dst) + dst_page++; + else + src_page++; + } while ((len -= l)); + } +} + +void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) +{ + struct page **src_page, **dst_page; + int l; + + dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + if (!len) + return; + src += node->page_offset; + dst += node->page_offset; + if (dst > src) { + src += len - 1; + src_page = node->page + (src >> PAGE_CACHE_SHIFT); + src = (src & ~PAGE_CACHE_MASK) + 1; + dst += len - 1; + dst_page = node->page + (dst >> PAGE_CACHE_SHIFT); + dst = (dst & ~PAGE_CACHE_MASK) + 1; + + if (src == dst) { + while (src < len) { + memmove(kmap(*dst_page), kmap(*src_page), src); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + len -= src; + src = PAGE_CACHE_SIZE; + src_page--; + dst_page--; + } + src -= len; + memmove(kmap(*dst_page) + src, + kmap(*src_page) + src, len); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + } else { + void *src_ptr, *dst_ptr; + + do { + src_ptr = kmap(*src_page) + src; + dst_ptr = kmap(*dst_page) + dst; + if (src < dst) { + l = src; + src = PAGE_CACHE_SIZE; + dst -= l; + } else { + l = dst; + src -= l; + dst = PAGE_CACHE_SIZE; + } + l = min(len, l); + memmove(dst_ptr - l, src_ptr - l, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + if (dst == PAGE_CACHE_SIZE) + dst_page--; + else + src_page--; + } while ((len -= l)); + } + } else { + src_page = node->page + (src >> PAGE_CACHE_SHIFT); + src &= ~PAGE_CACHE_MASK; + dst_page = node->page + (dst >> PAGE_CACHE_SHIFT); + dst &= ~PAGE_CACHE_MASK; + + if (src == dst) { + l = min(len, (int)PAGE_CACHE_SIZE - src); + 
memmove(kmap(*dst_page) + src, + kmap(*src_page) + src, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + + while ((len -= l) != 0) { + l = min(len, (int)PAGE_CACHE_SIZE); + memmove(kmap(*++dst_page), + kmap(*++src_page), l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + } + } else { + void *src_ptr, *dst_ptr; + + do { + src_ptr = kmap(*src_page) + src; + dst_ptr = kmap(*dst_page) + dst; + if (PAGE_CACHE_SIZE - src < + PAGE_CACHE_SIZE - dst) { + l = PAGE_CACHE_SIZE - src; + src = 0; + dst += l; + } else { + l = PAGE_CACHE_SIZE - dst; + src += l; + dst = 0; + } + l = min(len, l); + memmove(dst_ptr, src_ptr, l); + kunmap(*src_page); + set_page_dirty(*dst_page); + kunmap(*dst_page); + if (!dst) + dst_page++; + else + src_page++; + } while ((len -= l)); + } + } +} + +void hfs_bnode_dump(struct hfs_bnode *node) +{ + struct hfs_bnode_desc desc; + __be32 cnid; + int i, off, key_off; + + dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_bnode_read(node, &desc, 0, sizeof(desc)); + dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + be32_to_cpu(desc.next), be32_to_cpu(desc.prev), + desc.type, desc.height, be16_to_cpu(desc.num_recs)); + + off = node->tree->node_size - 2; + for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { + key_off = hfs_bnode_read_u16(node, off); + dprint(DBG_BNODE_MOD, " %d", key_off); + if (i && node->type == HFS_NODE_INDEX) { + int tmp; + + if (node->tree->attributes & HFS_TREE_VARIDXKEYS) + tmp = hfs_bnode_read_u16(node, key_off) + 2; + else + tmp = node->tree->max_key_len + 2; + dprint(DBG_BNODE_MOD, " (%d", tmp); + hfs_bnode_read(node, &cnid, key_off + tmp, 4); + dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + } else if (i && node->type == HFS_NODE_LEAF) { + int tmp; + + tmp = hfs_bnode_read_u16(node, key_off); + dprint(DBG_BNODE_MOD, " (%d)", tmp); + } + } + dprint(DBG_BNODE_MOD, "\n"); +} + +void hfs_bnode_unlink(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct hfs_bnode *tmp; + __be32 cnid; + + tree = node->tree; + if (node->prev) { + tmp = hfs_bnode_find(tree, node->prev); + if (IS_ERR(tmp)) + return; + tmp->next = node->next; + cnid = cpu_to_be32(tmp->next); + hfs_bnode_write(tmp, &cnid, + offsetof(struct hfs_bnode_desc, next), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_head = node->next; + + if (node->next) { + tmp = hfs_bnode_find(tree, node->next); + if (IS_ERR(tmp)) + return; + tmp->prev = node->prev; + cnid = cpu_to_be32(tmp->prev); + hfs_bnode_write(tmp, &cnid, + offsetof(struct hfs_bnode_desc, prev), 4); + hfs_bnode_put(tmp); + } else if (node->type == HFS_NODE_LEAF) + tree->leaf_tail = node->prev; + + /* move down? 
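hfs_bnode_copy() and hfs_bnode_move() above must handle ranges whose source and destination start at different offsets within their pages, so each step copies only up to the nearer page boundary before advancing that side. A simplified standalone model of that stepping (plain byte arrays instead of kmap'd pages; the 4096-byte page size and all names are assumptions, not part of this patch):

#include <stddef.h>
#include <string.h>

#define EX_PAGE_SIZE 4096

/* Copy 'len' bytes between two arrays of pages; both offsets must start
 * below EX_PAGE_SIZE.  Each iteration stops at whichever page boundary
 * comes first, then moves that side on to its next page. */
static void example_copy_unaligned(unsigned char **dst_pages, size_t dst_off,
				   unsigned char **src_pages, size_t src_off,
				   size_t len)
{
	while (len) {
		size_t edge = src_off > dst_off ? src_off : dst_off;
		size_t l = EX_PAGE_SIZE - edge;

		if (l > len)
			l = len;
		memcpy(*dst_pages + dst_off, *src_pages + src_off, l);
		src_off += l;
		dst_off += l;
		if (src_off == EX_PAGE_SIZE) {
			src_pages++;
			src_off = 0;
		}
		if (dst_off == EX_PAGE_SIZE) {
			dst_pages++;
			dst_off = 0;
		}
		len -= l;
	}
}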
*/ + if (!node->prev && !node->next) + dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n"); + if (!node->parent) { + tree->root = 0; + tree->depth = 0; + } + set_bit(HFS_BNODE_DELETED, &node->flags); +} + +static inline int hfs_bnode_hash(u32 num) +{ + num = (num >> 16) + num; + num += num >> 8; + return num & (NODE_HASH_SIZE - 1); +} + +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) +{ + struct hfs_bnode *node; + + if (cnid >= tree->node_count) { + printk(KERN_ERR "hfs: request for non-existent node " + "%d in B*Tree\n", + cnid); + return NULL; + } + + for (node = tree->node_hash[hfs_bnode_hash(cnid)]; + node; node = node->next_hash) + if (node->this == cnid) + return node; + return NULL; +} + +static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) +{ + struct super_block *sb; + struct hfs_bnode *node, *node2; + struct address_space *mapping; + struct page *page; + int size, block, i, hash; + loff_t off; + + if (cnid >= tree->node_count) { + printk(KERN_ERR "hfs: request for non-existent node " + "%d in B*Tree\n", + cnid); + return NULL; + } + + sb = tree->inode->i_sb; + size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * + sizeof(struct page *); + node = kzalloc(size, GFP_KERNEL); + if (!node) + return NULL; + node->tree = tree; + node->this = cnid; + set_bit(HFS_BNODE_NEW, &node->flags); + atomic_set(&node->refcnt, 1); + dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); + init_waitqueue_head(&node->lock_wq); + spin_lock(&tree->hash_lock); + node2 = hfs_bnode_findhash(tree, cnid); + if (!node2) { + hash = hfs_bnode_hash(cnid); + node->next_hash = tree->node_hash[hash]; + tree->node_hash[hash] = node; + tree->node_hash_cnt++; + } else { + spin_unlock(&tree->hash_lock); + kfree(node); + wait_event(node2->lock_wq, + !test_bit(HFS_BNODE_NEW, &node2->flags)); + return node2; + } + spin_unlock(&tree->hash_lock); + + mapping = tree->inode->i_mapping; + off = (loff_t)cnid << tree->node_size_shift; + block = off >> PAGE_CACHE_SHIFT; + node->page_offset = off & ~PAGE_CACHE_MASK; + for (i = 0; i < tree->pages_per_bnode; block++, i++) { + page = read_mapping_page(mapping, block, NULL); + if (IS_ERR(page)) + goto fail; + if (PageError(page)) { + page_cache_release(page); + goto fail; + } + page_cache_release(page); + node->page[i] = page; + } + + return node; +fail: + set_bit(HFS_BNODE_ERROR, &node->flags); + return node; +} + +void hfs_bnode_unhash(struct hfs_bnode *node) +{ + struct hfs_bnode **p; + + dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + node->tree->cnid, node->this, atomic_read(&node->refcnt)); + for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; + *p && *p != node; p = &(*p)->next_hash) + ; + BUG_ON(!*p); + *p = node->next_hash; + node->tree->node_hash_cnt--; +} + +/* Load a particular node out of a tree */ +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct hfs_bnode_desc *desc; + int i, rec_off, off, next_off; + int entry_size, key_size; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + if (node) { + hfs_bnode_get(node); + spin_unlock(&tree->hash_lock); + wait_event(node->lock_wq, + !test_bit(HFS_BNODE_NEW, &node->flags)); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + return node; + } + spin_unlock(&tree->hash_lock); + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) + goto node_error; + if (!test_bit(HFS_BNODE_NEW, 
&node->flags)) + return node; + + desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + + node->page_offset); + node->prev = be32_to_cpu(desc->prev); + node->next = be32_to_cpu(desc->next); + node->num_recs = be16_to_cpu(desc->num_recs); + node->type = desc->type; + node->height = desc->height; + kunmap(node->page[0]); + + switch (node->type) { + case HFS_NODE_HEADER: + case HFS_NODE_MAP: + if (node->height != 0) + goto node_error; + break; + case HFS_NODE_LEAF: + if (node->height != 1) + goto node_error; + break; + case HFS_NODE_INDEX: + if (node->height <= 1 || node->height > tree->depth) + goto node_error; + break; + default: + goto node_error; + } + + rec_off = tree->node_size - 2; + off = hfs_bnode_read_u16(node, rec_off); + if (off != sizeof(struct hfs_bnode_desc)) + goto node_error; + for (i = 1; i <= node->num_recs; off = next_off, i++) { + rec_off -= 2; + next_off = hfs_bnode_read_u16(node, rec_off); + if (next_off <= off || + next_off > tree->node_size || + next_off & 1) + goto node_error; + entry_size = next_off - off; + if (node->type != HFS_NODE_INDEX && + node->type != HFS_NODE_LEAF) + continue; + key_size = hfs_bnode_read_u16(node, off) + 2; + if (key_size >= entry_size || key_size & 1) + goto node_error; + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + return node; + +node_error: + set_bit(HFS_BNODE_ERROR, &node->flags); + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + hfs_bnode_put(node); + return ERR_PTR(-EIO); +} + +void hfs_bnode_free(struct hfs_bnode *node) +{ +#if 0 + int i; + + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + page_cache_release(node->page[i]); +#endif + kfree(node); +} + +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) +{ + struct hfs_bnode *node; + struct page **pagep; + int i; + + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, num); + spin_unlock(&tree->hash_lock); + if (node) { + printk(KERN_CRIT "new node %u already hashed?\n", num); + WARN_ON(1); + return node; + } + node = __hfs_bnode_create(tree, num); + if (!node) + return ERR_PTR(-ENOMEM); + if (test_bit(HFS_BNODE_ERROR, &node->flags)) { + hfs_bnode_put(node); + return ERR_PTR(-EIO); + } + + pagep = node->page; + memset(kmap(*pagep) + node->page_offset, 0, + min((int)PAGE_CACHE_SIZE, (int)tree->node_size)); + set_page_dirty(*pagep); + kunmap(*pagep); + for (i = 1; i < tree->pages_per_bnode; i++) { + memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE); + set_page_dirty(*pagep); + kunmap(*pagep); + } + clear_bit(HFS_BNODE_NEW, &node->flags); + wake_up(&node->lock_wq); + + return node; +} + +void hfs_bnode_get(struct hfs_bnode *node) +{ + if (node) { + atomic_inc(&node->refcnt); + dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + } +} + +/* Dispose of resources used by a node */ +void hfs_bnode_put(struct hfs_bnode *node) +{ + if (node) { + struct hfs_btree *tree = node->tree; + int i; + + dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); + BUG_ON(!atomic_read(&node->refcnt)); + if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) + return; + for (i = 0; i < tree->pages_per_bnode; i++) { + if (!node->page[i]) + continue; + mark_page_accessed(node->page[i]); + } + + if (test_bit(HFS_BNODE_DELETED, &node->flags)) { + hfs_bnode_unhash(node); + spin_unlock(&tree->hash_lock); + hfs_bmap_free(node); + hfs_bnode_free(node); + return; + } + spin_unlock(&tree->hash_lock); + } 
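hfs_bnode_get()/hfs_bnode_put() above use the classic atomic_dec_and_lock() pattern: only the caller that drops the last reference acquires the hash lock and is responsible for tearing the node down. A minimal standalone sketch of that pattern (function and parameter names are invented; not part of this patch):

#include <linux/atomic.h>
#include <linux/spinlock.h>

static void example_put(atomic_t *refcnt, spinlock_t *lock)
{
	/* Nonzero only when the count hit zero, in which case 'lock' is held. */
	if (!atomic_dec_and_lock(refcnt, lock))
		return;
	/* Last reference dropped: safe to unhash/free the object here. */
	spin_unlock(lock);
}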
+} + diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c new file mode 100644 index 00000000..2312de34 --- /dev/null +++ b/fs/hfsplus/brec.c @@ -0,0 +1,516 @@ +/* + * linux/fs/hfsplus/brec.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle individual btree records + */ + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd); +static int hfs_brec_update_parent(struct hfs_find_data *fd); +static int hfs_btree_inc_height(struct hfs_btree *); + +/* Get the length and offset of the given record in the given node */ +u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off) +{ + __be16 retval[2]; + u16 dataoff; + + dataoff = node->tree->node_size - (rec + 2) * 2; + hfs_bnode_read(node, retval, dataoff, 4); + *off = be16_to_cpu(retval[1]); + return be16_to_cpu(retval[0]) - *off; +} + +/* Get the length of the key from a keyed record */ +u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) +{ + u16 retval, recoff; + + if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) + return 0; + + if ((node->type == HFS_NODE_INDEX) && + !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { + retval = node->tree->max_key_len + 2; + } else { + recoff = hfs_bnode_read_u16(node, + node->tree->node_size - (rec + 1) * 2); + if (!recoff) + return 0; + + retval = hfs_bnode_read_u16(node, recoff) + 2; + if (retval > node->tree->max_key_len + 2) { + printk(KERN_ERR "hfs: keylen %d too large\n", + retval); + retval = 0; + } + } + return retval; +} + +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node; + int size, key_len, rec; + int data_off, end_off; + int idx_rec_off, data_rec_off, end_rec_off; + __be32 cnid; + + tree = fd->tree; + if (!fd->bnode) { + if (!tree->root) + hfs_btree_inc_height(tree); + fd->bnode = hfs_bnode_find(tree, tree->leaf_head); + if (IS_ERR(fd->bnode)) + return PTR_ERR(fd->bnode); + fd->record = -1; + } + new_node = NULL; + key_len = be16_to_cpu(fd->search_key->key_len) + 2; +again: + /* new record idx and complete record size */ + rec = fd->record + 1; + size = key_len + entry_len; + + node = fd->bnode; + hfs_bnode_dump(node); + /* get last offset */ + end_rec_off = tree->node_size - (node->num_recs + 1) * 2; + end_off = hfs_bnode_read_u16(node, end_rec_off); + end_rec_off -= 2; + dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + rec, size, end_off, end_rec_off); + if (size > end_rec_off - end_off) { + if (new_node) + panic("not enough room!\n"); + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + goto again; + } + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count++; + mark_inode_dirty(tree->inode); + } + node->num_recs++; + /* write new last offset */ + hfs_bnode_write_u16(node, + offsetof(struct hfs_bnode_desc, num_recs), + node->num_recs); + hfs_bnode_write_u16(node, end_rec_off, end_off + size); + data_off = end_off; + data_rec_off = end_rec_off + 2; + idx_rec_off = tree->node_size - (rec + 1) * 2; + if (idx_rec_off == data_rec_off) + goto skip; + /* move all following entries */ + do { + data_off = hfs_bnode_read_u16(node, data_rec_off + 2); + hfs_bnode_write_u16(node, data_rec_off, data_off + size); + data_rec_off += 2; + } while (data_rec_off < idx_rec_off); + + /* move data away */ + hfs_bnode_move(node, data_off + size, data_off, + end_off - data_off); + +skip: + hfs_bnode_write(node, fd->search_key, data_off, 
key_len); + hfs_bnode_write(node, entry, data_off + key_len, entry_len); + hfs_bnode_dump(node); + + if (new_node) { + /* update parent key if we inserted a key + * at the start of the first node + */ + if (!rec && new_node != node) + hfs_brec_update_parent(fd); + + hfs_bnode_put(fd->bnode); + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } + fd->bnode = hfs_bnode_find(tree, new_node->parent); + + /* create index data entry */ + cnid = cpu_to_be32(new_node->this); + entry = &cnid; + entry_len = sizeof(cnid); + + /* get index key */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + __hfs_brec_find(fd->bnode, fd); + + hfs_bnode_put(new_node); + new_node = NULL; + + if (tree->attributes & HFS_TREE_VARIDXKEYS) + key_len = be16_to_cpu(fd->search_key->key_len) + 2; + else { + fd->search_key->key_len = + cpu_to_be16(tree->max_key_len); + key_len = tree->max_key_len + 2; + } + goto again; + } + + if (!rec) + hfs_brec_update_parent(fd); + + return 0; +} + +int hfs_brec_remove(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *parent; + int end_off, rec_off, data_off, size; + + tree = fd->tree; + node = fd->bnode; +again: + rec_off = tree->node_size - (fd->record + 2) * 2; + end_off = tree->node_size - (node->num_recs + 1) * 2; + + if (node->type == HFS_NODE_LEAF) { + tree->leaf_count--; + mark_inode_dirty(tree->inode); + } + hfs_bnode_dump(node); + dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", + fd->record, fd->keylength + fd->entrylength); + if (!--node->num_recs) { + hfs_bnode_unlink(node); + if (!node->parent) + return 0; + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + hfs_bnode_put(node); + node = fd->bnode = parent; + + __hfs_brec_find(node, fd); + goto again; + } + hfs_bnode_write_u16(node, + offsetof(struct hfs_bnode_desc, num_recs), + node->num_recs); + + if (rec_off == end_off) + goto skip; + size = fd->keylength + fd->entrylength; + + do { + data_off = hfs_bnode_read_u16(node, rec_off); + hfs_bnode_write_u16(node, rec_off + 2, data_off - size); + rec_off -= 2; + } while (rec_off >= end_off); + + /* fill hole */ + hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, + data_off - fd->keyoffset - size); +skip: + hfs_bnode_dump(node); + if (!fd->record) + hfs_brec_update_parent(fd); + return 0; +} + +static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *next_node; + struct hfs_bnode_desc node_desc; + int num_recs, new_rec_off, new_off, old_rec_off; + int data_start, data_end, size; + + tree = fd->tree; + node = fd->bnode; + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) + return new_node; + hfs_bnode_get(node); + dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + node->this, new_node->this, node->next); + new_node->next = node->next; + new_node->prev = node->this; + new_node->parent = node->parent; + new_node->type = node->type; + new_node->height = node->height; + + if (node->next) + next_node = hfs_bnode_find(tree, node->next); + else + next_node = NULL; + + if (IS_ERR(next_node)) { + hfs_bnode_put(node); + hfs_bnode_put(new_node); + return next_node; + } + + size = tree->node_size / 2 - node->num_recs * 2 - 14; + old_rec_off = tree->node_size - 4; + num_recs = 1; + for (;;) { + data_start = hfs_bnode_read_u16(node, old_rec_off); + if (data_start > size) + break; + old_rec_off -= 2; + if (++num_recs < node->num_recs) + continue; + /* panic? 
*/ + hfs_bnode_put(node); + hfs_bnode_put(new_node); + if (next_node) + hfs_bnode_put(next_node); + return ERR_PTR(-ENOSPC); + } + + if (fd->record + 1 < num_recs) { + /* new record is in the lower half, + * so leave some more space there + */ + old_rec_off += 2; + num_recs--; + data_start = hfs_bnode_read_u16(node, old_rec_off); + } else { + hfs_bnode_put(node); + hfs_bnode_get(new_node); + fd->bnode = new_node; + fd->record -= num_recs; + fd->keyoffset -= data_start - 14; + fd->entryoffset -= data_start - 14; + } + new_node->num_recs = node->num_recs - num_recs; + node->num_recs = num_recs; + + new_rec_off = tree->node_size - 2; + new_off = 14; + size = data_start - new_off; + num_recs = new_node->num_recs; + data_end = data_start; + while (num_recs) { + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + old_rec_off -= 2; + new_rec_off -= 2; + data_end = hfs_bnode_read_u16(node, old_rec_off); + new_off = data_end - size; + num_recs--; + } + hfs_bnode_write_u16(new_node, new_rec_off, new_off); + hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start); + + /* update new bnode header */ + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + /* update previous bnode header */ + node->next = new_node->this; + hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc)); + node_desc.next = cpu_to_be32(node->next); + node_desc.num_recs = cpu_to_be16(node->num_recs); + hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); + + /* update next bnode header */ + if (next_node) { + next_node->prev = new_node->this; + hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); + node_desc.prev = cpu_to_be32(next_node->prev); + hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc)); + hfs_bnode_put(next_node); + } else if (node->this == tree->leaf_tail) { + /* if there is no next node, this might be the new tail */ + tree->leaf_tail = new_node->this; + mark_inode_dirty(tree->inode); + } + + hfs_bnode_dump(node); + hfs_bnode_dump(new_node); + hfs_bnode_put(node); + + return new_node; +} + +static int hfs_brec_update_parent(struct hfs_find_data *fd) +{ + struct hfs_btree *tree; + struct hfs_bnode *node, *new_node, *parent; + int newkeylen, diff; + int rec, rec_off, end_rec_off; + int start_off, end_off; + + tree = fd->tree; + node = fd->bnode; + new_node = NULL; + if (!node->parent) + return 0; + +again: + parent = hfs_bnode_find(tree, node->parent); + if (IS_ERR(parent)) + return PTR_ERR(parent); + __hfs_brec_find(parent, fd); + hfs_bnode_dump(parent); + rec = fd->record; + + /* size difference between old and new key */ + if (tree->attributes & HFS_TREE_VARIDXKEYS) + newkeylen = hfs_bnode_read_u16(node, 14) + 2; + else + fd->keylength = newkeylen = tree->max_key_len + 2; + dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", + rec, fd->keylength, newkeylen); + + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + diff = newkeylen - fd->keylength; + if (!diff) + goto skip; + if (diff > 0) { + end_off = hfs_bnode_read_u16(parent, end_rec_off); + if (end_rec_off - end_off < diff) { + + dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n"); + fd->bnode = parent; + new_node = hfs_bnode_split(fd); + if (IS_ERR(new_node)) + return PTR_ERR(new_node); + parent = fd->bnode; + rec 
= fd->record; + rec_off = tree->node_size - (rec + 2) * 2; + end_rec_off = tree->node_size - + (parent->num_recs + 1) * 2; + } + } + + end_off = start_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, start_off + diff); + start_off -= 4; /* move previous cnid too */ + + while (rec_off > end_rec_off) { + rec_off -= 2; + end_off = hfs_bnode_read_u16(parent, rec_off); + hfs_bnode_write_u16(parent, rec_off, end_off + diff); + } + hfs_bnode_move(parent, start_off + diff, start_off, + end_off - start_off); +skip: + hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen); + hfs_bnode_dump(parent); + + hfs_bnode_put(node); + node = parent; + + if (new_node) { + __be32 cnid; + + fd->bnode = hfs_bnode_find(tree, new_node->parent); + /* create index key and entry */ + hfs_bnode_read_key(new_node, fd->search_key, 14); + cnid = cpu_to_be32(new_node->this); + + __hfs_brec_find(fd->bnode, fd); + hfs_brec_insert(fd, &cnid, sizeof(cnid)); + hfs_bnode_put(fd->bnode); + hfs_bnode_put(new_node); + + if (!rec) { + if (new_node == node) + goto out; + /* restore search_key */ + hfs_bnode_read_key(node, fd->search_key, 14); + } + } + + if (!rec && node->parent) + goto again; +out: + fd->bnode = node; + return 0; +} + +static int hfs_btree_inc_height(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *new_node; + struct hfs_bnode_desc node_desc; + int key_size, rec; + __be32 cnid; + + node = NULL; + if (tree->root) { + node = hfs_bnode_find(tree, tree->root); + if (IS_ERR(node)) + return PTR_ERR(node); + } + new_node = hfs_bmap_alloc(tree); + if (IS_ERR(new_node)) { + hfs_bnode_put(node); + return PTR_ERR(new_node); + } + + tree->root = new_node->this; + if (!tree->depth) { + tree->leaf_head = tree->leaf_tail = new_node->this; + new_node->type = HFS_NODE_LEAF; + new_node->num_recs = 0; + } else { + new_node->type = HFS_NODE_INDEX; + new_node->num_recs = 1; + } + new_node->parent = 0; + new_node->next = 0; + new_node->prev = 0; + new_node->height = ++tree->depth; + + node_desc.next = cpu_to_be32(new_node->next); + node_desc.prev = cpu_to_be32(new_node->prev); + node_desc.type = new_node->type; + node_desc.height = new_node->height; + node_desc.num_recs = cpu_to_be16(new_node->num_recs); + node_desc.reserved = 0; + hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); + + rec = tree->node_size - 2; + hfs_bnode_write_u16(new_node, rec, 14); + + if (node) { + /* insert old root idx into new root */ + node->parent = tree->root; + if (node->type == HFS_NODE_LEAF || + tree->attributes & HFS_TREE_VARIDXKEYS) + key_size = hfs_bnode_read_u16(node, 14) + 2; + else + key_size = tree->max_key_len + 2; + hfs_bnode_copy(new_node, 14, node, 14, key_size); + + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + key_size = tree->max_key_len + 2; + hfs_bnode_write_u16(new_node, 14, tree->max_key_len); + } + cnid = cpu_to_be32(node->this); + hfs_bnode_write(new_node, &cnid, 14 + key_size, 4); + + rec -= 2; + hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4); + + hfs_bnode_put(node); + } + hfs_bnode_put(new_node); + mark_inode_dirty(tree->inode); + + return 0; +} diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c new file mode 100644 index 00000000..21023d9f --- /dev/null +++ b/fs/hfsplus/btree.c @@ -0,0 +1,377 @@ +/* + * linux/fs/hfsplus/btree.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handle opening/closing btree + */ + +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + + +/* Get a 
reference to a B*Tree and do some initial checks */ +struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) +{ + struct hfs_btree *tree; + struct hfs_btree_header_rec *head; + struct address_space *mapping; + struct inode *inode; + struct page *page; + unsigned int size; + + tree = kzalloc(sizeof(*tree), GFP_KERNEL); + if (!tree) + return NULL; + + mutex_init(&tree->tree_lock); + spin_lock_init(&tree->hash_lock); + tree->sb = sb; + tree->cnid = id; + inode = hfsplus_iget(sb, id); + if (IS_ERR(inode)) + goto free_tree; + tree->inode = inode; + + if (!HFSPLUS_I(tree->inode)->first_blocks) { + printk(KERN_ERR + "hfs: invalid btree extent records (0 size).\n"); + goto free_inode; + } + + mapping = tree->inode->i_mapping; + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + goto free_inode; + + /* Load the header */ + head = (struct hfs_btree_header_rec *)(kmap(page) + + sizeof(struct hfs_bnode_desc)); + tree->root = be32_to_cpu(head->root); + tree->leaf_count = be32_to_cpu(head->leaf_count); + tree->leaf_head = be32_to_cpu(head->leaf_head); + tree->leaf_tail = be32_to_cpu(head->leaf_tail); + tree->node_count = be32_to_cpu(head->node_count); + tree->free_nodes = be32_to_cpu(head->free_nodes); + tree->attributes = be32_to_cpu(head->attributes); + tree->node_size = be16_to_cpu(head->node_size); + tree->max_key_len = be16_to_cpu(head->max_key_len); + tree->depth = be16_to_cpu(head->depth); + + /* Verify the tree and set the correct compare function */ + switch (id) { + case HFSPLUS_EXT_CNID: + if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (tree->attributes & HFS_TREE_VARIDXKEYS) { + printk(KERN_ERR "hfs: invalid extent btree flag\n"); + goto fail_page; + } + + tree->keycmp = hfsplus_ext_cmp_key; + break; + case HFSPLUS_CAT_CNID: + if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + printk(KERN_ERR "hfs: invalid catalog btree flag\n"); + goto fail_page; + } + + if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) && + (head->key_type == HFSPLUS_KEY_BINARY)) + tree->keycmp = hfsplus_cat_bin_cmp_key; + else { + tree->keycmp = hfsplus_cat_case_cmp_key; + set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + } + break; + default: + printk(KERN_ERR "hfs: unknown B*Tree requested\n"); + goto fail_page; + } + + if (!(tree->attributes & HFS_TREE_BIGKEYS)) { + printk(KERN_ERR "hfs: invalid btree flag\n"); + goto fail_page; + } + + size = tree->node_size; + if (!is_power_of_2(size)) + goto fail_page; + if (!tree->node_count) + goto fail_page; + + tree->node_size_shift = ffs(size) - 1; + + tree->pages_per_bnode = + (tree->node_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + kunmap(page); + page_cache_release(page); + return tree; + + fail_page: + page_cache_release(page); + free_inode: + tree->inode->i_mapping->a_ops = &hfsplus_aops; + iput(tree->inode); + free_tree: + kfree(tree); + return NULL; +} + +/* Release resources used by a btree */ +void hfs_btree_close(struct hfs_btree *tree) +{ + struct hfs_bnode *node; + int i; + + if (!tree) + return; + + for (i = 0; i < NODE_HASH_SIZE; i++) { + while ((node = tree->node_hash[i])) { + tree->node_hash[i] = node->next_hash; + if (atomic_read(&node->refcnt)) + printk(KERN_CRIT "hfs: node %d:%d " + "still has %d user(s)!\n", + node->tree->cnid, 
node->this, + atomic_read(&node->refcnt)); + hfs_bnode_free(node); + tree->node_hash_cnt--; + } + } + iput(tree->inode); + kfree(tree); +} + +void hfs_btree_write(struct hfs_btree *tree) +{ + struct hfs_btree_header_rec *head; + struct hfs_bnode *node; + struct page *page; + + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + /* panic? */ + return; + /* Load the header */ + page = node->page[0]; + head = (struct hfs_btree_header_rec *)(kmap(page) + + sizeof(struct hfs_bnode_desc)); + + head->root = cpu_to_be32(tree->root); + head->leaf_count = cpu_to_be32(tree->leaf_count); + head->leaf_head = cpu_to_be32(tree->leaf_head); + head->leaf_tail = cpu_to_be32(tree->leaf_tail); + head->node_count = cpu_to_be32(tree->node_count); + head->free_nodes = cpu_to_be32(tree->free_nodes); + head->attributes = cpu_to_be32(tree->attributes); + head->depth = cpu_to_be16(tree->depth); + + kunmap(page); + set_page_dirty(page); + hfs_bnode_put(node); +} + +static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) +{ + struct hfs_btree *tree = prev->tree; + struct hfs_bnode *node; + struct hfs_bnode_desc desc; + __be32 cnid; + + node = hfs_bnode_create(tree, idx); + if (IS_ERR(node)) + return node; + + tree->free_nodes--; + prev->next = idx; + cnid = cpu_to_be32(idx); + hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); + + node->type = HFS_NODE_MAP; + node->num_recs = 1; + hfs_bnode_clear(node, 0, tree->node_size); + desc.next = 0; + desc.prev = 0; + desc.type = HFS_NODE_MAP; + desc.height = 0; + desc.num_recs = cpu_to_be16(1); + desc.reserved = 0; + hfs_bnode_write(node, &desc, 0, sizeof(desc)); + hfs_bnode_write_u16(node, 14, 0x8000); + hfs_bnode_write_u16(node, tree->node_size - 2, 14); + hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); + + return node; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i; + + while (!tree->free_nodes) { + struct inode *inode = tree->inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 count; + int res; + + res = hfsplus_file_extend(inode); + if (res) + return ERR_PTR(res); + hip->phys_size = inode->i_size = + (loff_t)hip->alloc_blocks << + HFSPLUS_SB(tree->sb)->alloc_blksz_shift; + hip->fs_blocks = + hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; + inode_set_bytes(inode, inode->i_size); + count = inode->i_size >> tree->node_size_shift; + tree->free_nodes = count - tree->node_count; + tree->node_count = count; + } + + nidx = 0; + node = hfs_bnode_find(tree, nidx); + if (IS_ERR(node)) + return node; + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; + + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + idx = 0; + + for (;;) { + while (len) { + byte = data[off]; + if (byte != 0xff) { + for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { + if (!(byte & m)) { + idx += i; + data[off] |= m; + set_page_dirty(*pagep); + kunmap(*pagep); + tree->free_nodes--; + mark_inode_dirty(tree->inode); + hfs_bnode_put(node); + return hfs_bnode_create(tree, + idx); + } + } + } + if (++off >= PAGE_CACHE_SIZE) { + kunmap(*pagep); + data = kmap(*++pagep); + off = 0; + } + idx += 8; + len--; + } + kunmap(*pagep); + nidx = node->next; + if (!nidx) { + dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n"); + next_node = hfs_bmap_new_bmap(node, idx); + } else + next_node = 
hfs_bnode_find(tree, nidx); + hfs_bnode_put(node); + if (IS_ERR(next_node)) + return next_node; + node = next_node; + + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; + off += node->page_offset; + pagep = node->page + (off >> PAGE_CACHE_SHIFT); + data = kmap(*pagep); + off &= ~PAGE_CACHE_MASK; + } +} + +void hfs_bmap_free(struct hfs_bnode *node) +{ + struct hfs_btree *tree; + struct page *page; + u16 off, len; + u32 nidx; + u8 *data, byte, m; + + dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + BUG_ON(!node->this); + tree = node->tree; + nidx = node->this; + node = hfs_bnode_find(tree, 0); + if (IS_ERR(node)) + return; + len = hfs_brec_lenoff(node, 2, &off); + while (nidx >= len * 8) { + u32 i; + + nidx -= len * 8; + i = node->next; + hfs_bnode_put(node); + if (!i) { + /* panic */; + printk(KERN_CRIT "hfs: unable to free bnode %u. " + "bmap not found!\n", + node->this); + return; + } + node = hfs_bnode_find(tree, i); + if (IS_ERR(node)) + return; + if (node->type != HFS_NODE_MAP) { + /* panic */; + printk(KERN_CRIT "hfs: invalid bmap found! " + "(%u,%d)\n", + node->this, node->type); + hfs_bnode_put(node); + return; + } + len = hfs_brec_lenoff(node, 0, &off); + } + off += node->page_offset + nidx / 8; + page = node->page[off >> PAGE_CACHE_SHIFT]; + data = kmap(page); + off &= ~PAGE_CACHE_MASK; + m = 1 << (~nidx & 7); + byte = data[off]; + if (!(byte & m)) { + printk(KERN_CRIT "hfs: trying to free free bnode " + "%u(%d)\n", + node->this, node->type); + kunmap(page); + hfs_bnode_put(node); + return; + } + data[off] = byte & ~m; + set_page_dirty(page); + kunmap(page); + hfs_bnode_put(node); + tree->free_nodes++; + mark_inode_dirty(tree->inode); +} diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c new file mode 100644 index 00000000..408073ae --- /dev/null +++ b/fs/hfsplus/catalog.c @@ -0,0 +1,425 @@ +/* + * linux/fs/hfsplus/catalog.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of catalog records + */ + + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1p, k2p; + + k1p = k1->cat.parent; + k2p = k2->cat.parent; + if (k1p != k2p) + return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1; + + return hfsplus_strcasecmp(&k1->cat.name, &k2->cat.name); +} + +int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1p, k2p; + + k1p = k1->cat.parent; + k2p = k2->cat.parent; + if (k1p != k2p) + return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? 
-1 : 1; + + return hfsplus_strcmp(&k1->cat.name, &k2->cat.name); +} + +void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 parent, struct qstr *str) +{ + int len; + + key->cat.parent = cpu_to_be32(parent); + if (str) { + hfsplus_asc2uni(sb, &key->cat.name, str->name, str->len); + len = be16_to_cpu(key->cat.name.length); + } else { + key->cat.name.length = 0; + len = 0; + } + key->key_len = cpu_to_be16(6 + 2 * len); +} + +static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent, + struct hfsplus_unistr *name) +{ + int ustrlen; + + ustrlen = be16_to_cpu(name->length); + key->cat.parent = cpu_to_be32(parent); + key->cat.name.length = cpu_to_be16(ustrlen); + ustrlen *= 2; + memcpy(key->cat.name.unicode, name->unicode, ustrlen); + key->key_len = cpu_to_be16(6 + ustrlen); +} + +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms) +{ + if (inode->i_flags & S_IMMUTABLE) + perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; + else + perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE; + if (inode->i_flags & S_APPEND) + perms->rootflags |= HFSPLUS_FLG_APPEND; + else + perms->rootflags &= ~HFSPLUS_FLG_APPEND; + + perms->userflags = HFSPLUS_I(inode)->userflags; + perms->mode = cpu_to_be16(inode->i_mode); + perms->owner = cpu_to_be32(inode->i_uid); + perms->group = cpu_to_be32(inode->i_gid); + + if (S_ISREG(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_nlink); + else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_rdev); + else + perms->dev = 0; +} + +static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, + u32 cnid, struct inode *inode) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + + if (S_ISDIR(inode->i_mode)) { + struct hfsplus_cat_folder *folder; + + folder = &entry->folder; + memset(folder, 0, sizeof(*folder)); + folder->type = cpu_to_be16(HFSPLUS_FOLDER); + folder->id = cpu_to_be32(inode->i_ino); + HFSPLUS_I(inode)->create_date = + folder->create_date = + folder->content_mod_date = + folder->attribute_mod_date = + folder->access_date = hfsp_now2mt(); + hfsplus_cat_set_perms(inode, &folder->permissions); + if (inode == sbi->hidden_dir) + /* invisible and namelocked */ + folder->user_info.frFlags = cpu_to_be16(0x5000); + return sizeof(*folder); + } else { + struct hfsplus_cat_file *file; + + file = &entry->file; + memset(file, 0, sizeof(*file)); + file->type = cpu_to_be16(HFSPLUS_FILE); + file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); + file->id = cpu_to_be32(cnid); + HFSPLUS_I(inode)->create_date = + file->create_date = + file->content_mod_date = + file->attribute_mod_date = + file->access_date = hfsp_now2mt(); + if (cnid == inode->i_ino) { + hfsplus_cat_set_perms(inode, &file->permissions); + if (S_ISLNK(inode->i_mode)) { + file->user_info.fdType = + cpu_to_be32(HFSP_SYMLINK_TYPE); + file->user_info.fdCreator = + cpu_to_be32(HFSP_SYMLINK_CREATOR); + } else { + file->user_info.fdType = + cpu_to_be32(sbi->type); + file->user_info.fdCreator = + cpu_to_be32(sbi->creator); + } + if (HFSPLUS_FLG_IMMUTABLE & + (file->permissions.rootflags | + file->permissions.userflags)) + file->flags |= + cpu_to_be16(HFSPLUS_FILE_LOCKED); + } else { + file->user_info.fdType = + cpu_to_be32(HFSP_HARDLINK_TYPE); + file->user_info.fdCreator = + cpu_to_be32(HFSP_HFSPLUS_CREATOR); + file->user_info.fdFlags = + cpu_to_be16(0x100); + file->create_date = + HFSPLUS_I(sbi->hidden_dir)->create_date; + file->permissions.dev = + cpu_to_be32(HFSPLUS_I(inode)->linkid); + } + return sizeof(*file); + } +} + 
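/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * The catalog code in this file builds keys and thread records with
 * hard-coded length arithmetic.  From the code visible here,
 * hfsplus_cat_build_key() sets key_len to "6 + 2 * len": 4 bytes of
 * parent CNID plus the 2-byte Unicode name-length field, followed by
 * the UTF-16 name payload (2 bytes per unit).  Likewise,
 * hfsplus_fill_cat_thread() below returns "10 + nodeName.length * 2",
 * which appears to cover the 2-byte record type, 2 reserved bytes, the
 * 4-byte parent CNID and the 2-byte name-length field ahead of the
 * name itself (the exact on-disk layout lives in hfsplus_raw.h, which
 * is not shown here).  The helper below is a hypothetical, stand-alone
 * restatement of that arithmetic, added only to make the magic numbers
 * easier to follow; it is not used by the filesystem code.
 */
static inline unsigned int hfsplus_cat_key_len_example(unsigned int name_units)
{
	/* parent CNID (4) + name length field (2) + UTF-16 name payload */
	return 4 + 2 + 2 * name_units;
}

static inline unsigned int hfsplus_cat_thread_len_example(unsigned int name_units)
{
	/* type (2) + reserved (2) + parent CNID (4) + name length field (2)
	 * + UTF-16 name payload
	 */
	return 2 + 2 + 4 + 2 + 2 * name_units;
}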
+static int hfsplus_fill_cat_thread(struct super_block *sb, + hfsplus_cat_entry *entry, int type, + u32 parentid, struct qstr *str) +{ + entry->type = cpu_to_be16(type); + entry->thread.reserved = 0; + entry->thread.parentID = cpu_to_be32(parentid); + hfsplus_asc2uni(sb, &entry->thread.nodeName, str->name, str->len); + return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2; +} + +/* Try to get a catalog entry for given catalog id */ +int hfsplus_find_cat(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd) +{ + hfsplus_cat_entry tmp; + int err; + u16 type; + + hfsplus_cat_build_key(sb, fd->search_key, cnid, NULL); + err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry)); + if (err) + return err; + + type = be16_to_cpu(tmp.type); + if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) { + printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + return -EIO; + } + + if (be16_to_cpu(tmp.thread.nodeName.length) > 255) { + printk(KERN_ERR "hfs: catalog name length corrupted\n"); + return -EIO; + } + + hfsplus_cat_build_key_uni(fd->search_key, + be32_to_cpu(tmp.thread.parentID), + &tmp.thread.nodeName); + return hfs_brec_find(fd); +} + +int hfsplus_create_cat(u32 cnid, struct inode *dir, + struct qstr *str, struct inode *inode) +{ + struct super_block *sb = dir->i_sb; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + int entry_size; + int err; + + dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", + str->name, cnid, inode->i_nlink); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + + hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + entry_size = hfsplus_fill_cat_thread(sb, &entry, + S_ISDIR(inode->i_mode) ? + HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, + dir->i_ino, str); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto err2; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err2; + + hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + entry_size = hfsplus_cat_build_record(&entry, cnid, inode); + err = hfs_brec_find(&fd); + if (err != -ENOENT) { + /* panic? */ + if (!err) + err = -EEXIST; + goto err1; + } + err = hfs_brec_insert(&fd, &entry, entry_size); + if (err) + goto err1; + + dir->i_size++; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); + + hfs_find_exit(&fd); + return 0; + +err1: + hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + if (!hfs_brec_find(&fd)) + hfs_brec_remove(&fd); +err2: + hfs_find_exit(&fd); + return err; +} + +int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) +{ + struct super_block *sb = dir->i_sb; + struct hfs_find_data fd; + struct hfsplus_fork_raw fork; + struct list_head *pos; + int err, off; + u16 type; + + dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", + str ? 
str->name : NULL, cnid); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + + if (!str) { + int len; + + hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + err = hfs_brec_find(&fd); + if (err) + goto out; + + off = fd.entryoffset + + offsetof(struct hfsplus_cat_thread, nodeName); + fd.search_key->cat.parent = cpu_to_be32(dir->i_ino); + hfs_bnode_read(fd.bnode, + &fd.search_key->cat.name.length, off, 2); + len = be16_to_cpu(fd.search_key->cat.name.length) * 2; + hfs_bnode_read(fd.bnode, + &fd.search_key->cat.name.unicode, + off + 2, len); + fd.search_key->key_len = cpu_to_be16(6 + len); + } else + hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + + err = hfs_brec_find(&fd); + if (err) + goto out; + + type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + if (type == HFSPLUS_FILE) { +#if 0 + off = fd.entryoffset + offsetof(hfsplus_cat_file, data_fork); + hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); + hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA); +#endif + + off = fd.entryoffset + + offsetof(struct hfsplus_cat_file, rsrc_fork); + hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); + hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); + } + + list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) { + struct hfsplus_readdir_data *rd = + list_entry(pos, struct hfsplus_readdir_data, list); + if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) + rd->file->f_pos--; + } + + err = hfs_brec_remove(&fd); + if (err) + goto out; + + hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + err = hfs_brec_find(&fd); + if (err) + goto out; + + err = hfs_brec_remove(&fd); + if (err) + goto out; + + dir->i_size--; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); +out: + hfs_find_exit(&fd); + + return err; +} + +int hfsplus_rename_cat(u32 cnid, + struct inode *src_dir, struct qstr *src_name, + struct inode *dst_dir, struct qstr *dst_name) +{ + struct super_block *sb = src_dir->i_sb; + struct hfs_find_data src_fd, dst_fd; + hfsplus_cat_entry entry; + int entry_size, type; + int err = 0; + + dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + cnid, src_dir->i_ino, src_name->name, + dst_dir->i_ino, dst_name->name); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); + dst_fd = src_fd; + + /* find the old dir entry and read the data */ + hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, + src_fd.entrylength); + + /* create new dir entry with the data from the old entry */ + hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + + err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength); + if (err) + goto out; + dst_dir->i_size++; + dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; + + /* finally remove the old entry */ + hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + src_dir->i_size--; + src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; + + /* remove old thread entry */ + hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); + err = hfs_brec_find(&src_fd); + if (err) + goto out; + type = 
hfs_bnode_read_u16(src_fd.bnode, src_fd.entryoffset); + err = hfs_brec_remove(&src_fd); + if (err) + goto out; + + /* create new thread entry */ + hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); + entry_size = hfsplus_fill_cat_thread(sb, &entry, type, + dst_dir->i_ino, dst_name); + err = hfs_brec_find(&dst_fd); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto out; + } + err = hfs_brec_insert(&dst_fd, &entry, entry_size); + + hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY); + hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY); +out: + hfs_bnode_put(dst_fd.bnode); + hfs_find_exit(&src_fd); + return err; +} diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c new file mode 100644 index 00000000..159f5ebf --- /dev/null +++ b/fs/hfsplus/dir.c @@ -0,0 +1,516 @@ +/* + * linux/fs/hfsplus/dir.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of directories + */ + +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static inline void hfsplus_instantiate(struct dentry *dentry, + struct inode *inode, u32 cnid) +{ + dentry->d_fsdata = (void *)(unsigned long)cnid; + d_instantiate(dentry, inode); +} + +/* Find the entry inside dir named dentry->d_name */ +static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + struct hfs_find_data fd; + struct super_block *sb; + hfsplus_cat_entry entry; + int err; + u32 cnid, linkid = 0; + u16 type; + + sb = dir->i_sb; + + dentry->d_fsdata = NULL; + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); +again: + err = hfs_brec_read(&fd, &entry, sizeof(entry)); + if (err) { + if (err == -ENOENT) { + hfs_find_exit(&fd); + /* No such entry */ + inode = NULL; + goto out; + } + goto fail; + } + type = be16_to_cpu(entry.type); + if (type == HFSPLUS_FOLDER) { + if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { + err = -EIO; + goto fail; + } + cnid = be32_to_cpu(entry.folder.id); + dentry->d_fsdata = (void *)(unsigned long)cnid; + } else if (type == HFSPLUS_FILE) { + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { + err = -EIO; + goto fail; + } + cnid = be32_to_cpu(entry.file.id); + if (entry.file.user_info.fdType == + cpu_to_be32(HFSP_HARDLINK_TYPE) && + entry.file.user_info.fdCreator == + cpu_to_be32(HFSP_HFSPLUS_CREATOR) && + (entry.file.create_date == + HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)-> + create_date || + entry.file.create_date == + HFSPLUS_I(sb->s_root->d_inode)-> + create_date) && + HFSPLUS_SB(sb)->hidden_dir) { + struct qstr str; + char name[32]; + + if (dentry->d_fsdata) { + /* + * We found a link pointing to another link, + * so ignore it and treat it as regular file. 
+ */ + cnid = (unsigned long)dentry->d_fsdata; + linkid = 0; + } else { + dentry->d_fsdata = (void *)(unsigned long)cnid; + linkid = + be32_to_cpu(entry.file.permissions.dev); + str.len = sprintf(name, "iNode%d", linkid); + str.name = name; + hfsplus_cat_build_key(sb, fd.search_key, + HFSPLUS_SB(sb)->hidden_dir->i_ino, + &str); + goto again; + } + } else if (!dentry->d_fsdata) + dentry->d_fsdata = (void *)(unsigned long)cnid; + } else { + printk(KERN_ERR "hfs: invalid catalog entry type in lookup\n"); + err = -EIO; + goto fail; + } + hfs_find_exit(&fd); + inode = hfsplus_iget(dir->i_sb, cnid); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (S_ISREG(inode->i_mode)) + HFSPLUS_I(inode)->linkid = linkid; +out: + d_add(dentry, inode); + return NULL; +fail: + hfs_find_exit(&fd); + return ERR_PTR(err); +} + +static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + int len, err; + char strbuf[HFSPLUS_MAX_STRLEN + 1]; + hfsplus_cat_entry entry; + struct hfs_find_data fd; + struct hfsplus_readdir_data *rd; + u16 type; + + if (filp->f_pos >= inode->i_size) + return 0; + + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); + err = hfs_brec_find(&fd); + if (err) + goto out; + + switch ((u32)filp->f_pos) { + case 0: + /* This is completely artificial... */ + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + goto out; + filp->f_pos++; + /* fall through */ + case 1: + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + fd.entrylength); + if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { + printk(KERN_ERR "hfs: bad catalog folder thread\n"); + err = -EIO; + goto out; + } + if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) { + printk(KERN_ERR "hfs: truncated catalog thread\n"); + err = -EIO; + goto out; + } + if (filldir(dirent, "..", 2, 1, + be32_to_cpu(entry.thread.parentID), DT_DIR)) + goto out; + filp->f_pos++; + /* fall through */ + default: + if (filp->f_pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, filp->f_pos - 1); + if (err) + goto out; + } + + for (;;) { + if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { + printk(KERN_ERR "hfs: walked past end of dir\n"); + err = -EIO; + goto out; + } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + fd.entrylength); + type = be16_to_cpu(entry.type); + len = HFSPLUS_MAX_STRLEN; + err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); + if (err) + goto out; + if (type == HFSPLUS_FOLDER) { + if (fd.entrylength < + sizeof(struct hfsplus_cat_folder)) { + printk(KERN_ERR "hfs: small dir entry\n"); + err = -EIO; + goto out; + } + if (HFSPLUS_SB(sb)->hidden_dir && + HFSPLUS_SB(sb)->hidden_dir->i_ino == + be32_to_cpu(entry.folder.id)) + goto next; + if (filldir(dirent, strbuf, len, filp->f_pos, + be32_to_cpu(entry.folder.id), DT_DIR)) + break; + } else if (type == HFSPLUS_FILE) { + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { + printk(KERN_ERR "hfs: small file entry\n"); + err = -EIO; + goto out; + } + if (filldir(dirent, strbuf, len, filp->f_pos, + be32_to_cpu(entry.file.id), DT_REG)) + break; + } else { + printk(KERN_ERR "hfs: bad catalog entry type\n"); + err = -EIO; + goto out; + } +next: + filp->f_pos++; + if (filp->f_pos >= inode->i_size) + goto out; + 
err = hfs_brec_goto(&fd, 1); + if (err) + goto out; + } + rd = filp->private_data; + if (!rd) { + rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); + if (!rd) { + err = -ENOMEM; + goto out; + } + filp->private_data = rd; + rd->file = filp; + list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); + } + memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); +out: + hfs_find_exit(&fd); + return err; +} + +static int hfsplus_dir_release(struct inode *inode, struct file *file) +{ + struct hfsplus_readdir_data *rd = file->private_data; + if (rd) { + mutex_lock(&inode->i_mutex); + list_del(&rd->list); + mutex_unlock(&inode->i_mutex); + kfree(rd); + } + return 0; +} + +static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, + struct dentry *dst_dentry) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); + struct inode *inode = src_dentry->d_inode; + struct inode *src_dir = src_dentry->d_parent->d_inode; + struct qstr str; + char name[32]; + u32 cnid, id; + int res; + + if (HFSPLUS_IS_RSRC(inode)) + return -EPERM; + if (!S_ISREG(inode->i_mode)) + return -EPERM; + + mutex_lock(&sbi->vh_mutex); + if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { + for (;;) { + get_random_bytes(&id, sizeof(cnid)); + id &= 0x3fffffff; + str.name = name; + str.len = sprintf(name, "iNode%d", id); + res = hfsplus_rename_cat(inode->i_ino, + src_dir, &src_dentry->d_name, + sbi->hidden_dir, &str); + if (!res) + break; + if (res != -EEXIST) + goto out; + } + HFSPLUS_I(inode)->linkid = id; + cnid = sbi->next_cnid++; + src_dentry->d_fsdata = (void *)(unsigned long)cnid; + res = hfsplus_create_cat(cnid, src_dir, + &src_dentry->d_name, inode); + if (res) + /* panic? */ + goto out; + sbi->file_count++; + } + cnid = sbi->next_cnid++; + res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); + if (res) + goto out; + + inc_nlink(inode); + hfsplus_instantiate(dst_dentry, inode, cnid); + ihold(inode); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + sbi->file_count++; + dst_dir->i_sb->s_dirt = 1; +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode = dentry->d_inode; + struct qstr str; + char name[32]; + u32 cnid; + int res; + + if (HFSPLUS_IS_RSRC(inode)) + return -EPERM; + + mutex_lock(&sbi->vh_mutex); + cnid = (u32)(unsigned long)dentry->d_fsdata; + if (inode->i_ino == cnid && + atomic_read(&HFSPLUS_I(inode)->opencnt)) { + str.name = name; + str.len = sprintf(name, "temp%lu", inode->i_ino); + res = hfsplus_rename_cat(inode->i_ino, + dir, &dentry->d_name, + sbi->hidden_dir, &str); + if (!res) { + inode->i_flags |= S_DEAD; + drop_nlink(inode); + } + goto out; + } + res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); + if (res) + goto out; + + if (inode->i_nlink > 0) + drop_nlink(inode); + if (inode->i_ino == cnid) + clear_nlink(inode); + if (!inode->i_nlink) { + if (inode->i_ino != cnid) { + sbi->file_count--; + if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) { + res = hfsplus_delete_cat(inode->i_ino, + sbi->hidden_dir, + NULL); + if (!res) + hfsplus_delete_inode(inode); + } else + inode->i_flags |= S_DEAD; + } else + hfsplus_delete_inode(inode); + } else + sbi->file_count--; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct hfsplus_sb_info *sbi = 
HFSPLUS_SB(dir->i_sb); + struct inode *inode = dentry->d_inode; + int res; + + if (inode->i_size != 2) + return -ENOTEMPTY; + + mutex_lock(&sbi->vh_mutex); + res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); + if (res) + goto out; + clear_nlink(inode); + inode->i_ctime = CURRENT_TIME_SEC; + hfsplus_delete_inode(inode); + mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode; + int res = -ENOSPC; + + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); + if (!inode) + goto out; + + res = page_symlink(inode, symname, strlen(symname) + 1); + if (res) + goto out_err; + + res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); + if (res) + goto out_err; + + hfsplus_instantiate(dentry, inode, inode->i_ino); + mark_inode_dirty(inode); + goto out; + +out_err: + inode->i_nlink = 0; + hfsplus_delete_inode(inode); + iput(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode; + int res = -ENOSPC; + + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, mode); + if (!inode) + goto out; + + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) + init_special_inode(inode, mode, rdev); + + res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); + if (res) { + inode->i_nlink = 0; + hfsplus_delete_inode(inode); + iput(inode); + goto out; + } + + hfsplus_instantiate(dentry, inode, inode->i_ino); + mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} + +static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return hfsplus_mknod(dir, dentry, mode, 0); +} + +static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); +} + +static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int res; + + /* Unlink destination if it already exists */ + if (new_dentry->d_inode) { + if (S_ISDIR(new_dentry->d_inode->i_mode)) + res = hfsplus_rmdir(new_dir, new_dentry); + else + res = hfsplus_unlink(new_dir, new_dentry); + if (res) + return res; + } + + res = hfsplus_rename_cat((u32)(unsigned long)old_dentry->d_fsdata, + old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name); + if (!res) + new_dentry->d_fsdata = old_dentry->d_fsdata; + return res; +} + +const struct inode_operations hfsplus_dir_inode_operations = { + .lookup = hfsplus_lookup, + .create = hfsplus_create, + .link = hfsplus_link, + .unlink = hfsplus_unlink, + .mkdir = hfsplus_mkdir, + .rmdir = hfsplus_rmdir, + .symlink = hfsplus_symlink, + .mknod = hfsplus_mknod, + .rename = hfsplus_rename, +}; + +const struct file_operations hfsplus_dir_operations = { + .fsync = hfsplus_file_fsync, + .read = generic_read_dir, + .readdir = hfsplus_readdir, + .unlocked_ioctl = hfsplus_ioctl, + .llseek = generic_file_llseek, + .release = hfsplus_dir_release, +}; diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c new file mode 100644 index 00000000..b1991a2a --- /dev/null +++ b/fs/hfsplus/extents.c @@ -0,0 +1,561 @@ +/* + * linux/fs/hfsplus/extents.c + * + * 
Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of Extents both in catalog and extents overflow trees + */ + +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* Compare two extents keys, returns 0 on same, pos/neg for difference */ +int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1id, k2id; + __be32 k1s, k2s; + + k1id = k1->ext.cnid; + k2id = k2->ext.cnid; + if (k1id != k2id) + return be32_to_cpu(k1id) < be32_to_cpu(k2id) ? -1 : 1; + + if (k1->ext.fork_type != k2->ext.fork_type) + return k1->ext.fork_type < k2->ext.fork_type ? -1 : 1; + + k1s = k1->ext.start_block; + k2s = k2->ext.start_block; + if (k1s == k2s) + return 0; + return be32_to_cpu(k1s) < be32_to_cpu(k2s) ? -1 : 1; +} + +static void hfsplus_ext_build_key(hfsplus_btree_key *key, u32 cnid, + u32 block, u8 type) +{ + key->key_len = cpu_to_be16(HFSPLUS_EXT_KEYLEN - 2); + key->ext.cnid = cpu_to_be32(cnid); + key->ext.start_block = cpu_to_be32(block); + key->ext.fork_type = type; + key->ext.pad = 0; +} + +static u32 hfsplus_ext_find_block(struct hfsplus_extent *ext, u32 off) +{ + int i; + u32 count; + + for (i = 0; i < 8; ext++, i++) { + count = be32_to_cpu(ext->block_count); + if (off < count) + return be32_to_cpu(ext->start_block) + off; + off -= count; + } + /* panic? */ + return 0; +} + +static int hfsplus_ext_block_count(struct hfsplus_extent *ext) +{ + int i; + u32 count = 0; + + for (i = 0; i < 8; ext++, i++) + count += be32_to_cpu(ext->block_count); + return count; +} + +static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) +{ + int i; + + ext += 7; + for (i = 0; i < 7; ext--, i++) + if (ext->block_count) + break; + return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); +} + +static void __hfsplus_ext_write_extent(struct inode *inode, + struct hfs_find_data *fd) +{ + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + int res; + + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start, + HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); + + res = hfs_brec_find(fd); + if (hip->extent_state & HFSPLUS_EXT_NEW) { + if (res != -ENOENT) + return; + hfs_brec_insert(fd, hip->cached_extents, + sizeof(hfsplus_extent_rec)); + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); + } else { + if (res) + return; + hfs_bnode_write(fd->bnode, hip->cached_extents, + fd->entryoffset, fd->entrylength); + hip->extent_state &= ~HFSPLUS_EXT_DIRTY; + } + + /* + * We can't just use hfsplus_mark_inode_dirty here, because we + * also get called from hfsplus_write_inode, which should not + * redirty the inode. Instead the callers have to be careful + * to explicily mark the inode dirty, too. 
+ */ + set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); +} + +static void hfsplus_ext_write_extent_locked(struct inode *inode) +{ + if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { + struct hfs_find_data fd; + + hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + __hfsplus_ext_write_extent(inode, &fd); + hfs_find_exit(&fd); + } +} + +void hfsplus_ext_write_extent(struct inode *inode) +{ + mutex_lock(&HFSPLUS_I(inode)->extents_lock); + hfsplus_ext_write_extent_locked(inode); + mutex_unlock(&HFSPLUS_I(inode)->extents_lock); +} + +static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, + struct hfsplus_extent *extent, + u32 cnid, u32 block, u8 type) +{ + int res; + + hfsplus_ext_build_key(fd->search_key, cnid, block, type); + fd->key->ext.cnid = 0; + res = hfs_brec_find(fd); + if (res && res != -ENOENT) + return res; + if (fd->key->ext.cnid != fd->search_key->ext.cnid || + fd->key->ext.fork_type != fd->search_key->ext.fork_type) + return -ENOENT; + if (fd->entrylength != sizeof(hfsplus_extent_rec)) + return -EIO; + hfs_bnode_read(fd->bnode, extent, fd->entryoffset, + sizeof(hfsplus_extent_rec)); + return 0; +} + +static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, + struct inode *inode, u32 block) +{ + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + int res; + + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + if (hip->extent_state & HFSPLUS_EXT_DIRTY) + __hfsplus_ext_write_extent(inode, fd); + + res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, + block, HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : + HFSPLUS_TYPE_DATA); + if (!res) { + hip->cached_start = be32_to_cpu(fd->key->ext.start_block); + hip->cached_blocks = + hfsplus_ext_block_count(hip->cached_extents); + } else { + hip->cached_start = hip->cached_blocks = 0; + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); + } + return res; +} + +static int hfsplus_ext_read_extent(struct inode *inode, u32 block) +{ + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfs_find_data fd; + int res; + + if (block >= hip->cached_start && + block < hip->cached_start + hip->cached_blocks) + return 0; + + hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + res = __hfsplus_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + return res; +} + +/* Get a block at iblock for inode, possibly allocating if create */ +int hfsplus_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + int res = -EIO; + u32 ablock, dblock, mask; + int was_dirty = 0; + int shift; + + /* Convert inode block to disk allocation block */ + shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; + ablock = iblock >> sbi->fs_shift; + + if (iblock >= hip->fs_blocks) { + if (iblock > hip->fs_blocks || !create) + return -EIO; + if (ablock >= hip->alloc_blocks) { + res = hfsplus_file_extend(inode); + if (res) + return res; + } + } else + create = 0; + + if (ablock < hip->first_blocks) { + dblock = hfsplus_ext_find_block(hip->first_extents, ablock); + goto done; + } + + if (inode->i_ino == HFSPLUS_EXT_CNID) + return -EIO; + + mutex_lock(&hip->extents_lock); + + /* + * hfsplus_ext_read_extent will write out a cached extent into + * the extents btree. In that case we may have to mark the inode + * dirty even for a pure read of an extent here. 
+ */ + was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY); + res = hfsplus_ext_read_extent(inode, ablock); + if (res) { + mutex_unlock(&hip->extents_lock); + return -EIO; + } + dblock = hfsplus_ext_find_block(hip->cached_extents, + ablock - hip->cached_start); + mutex_unlock(&hip->extents_lock); + +done: + dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", + inode->i_ino, (long long)iblock, dblock); + mask = (1 << sbi->fs_shift) - 1; + map_bh(bh_result, sb, + (dblock << sbi->fs_shift) + sbi->blockoffset + + (iblock & mask)); + if (create) { + set_buffer_new(bh_result); + hip->phys_size += sb->s_blocksize; + hip->fs_blocks++; + inode_add_bytes(inode, sb->s_blocksize); + } + if (create || was_dirty) + mark_inode_dirty(inode); + return 0; +} + +static void hfsplus_dump_extent(struct hfsplus_extent *extent) +{ + int i; + + dprint(DBG_EXTENT, " "); + for (i = 0; i < 8; i++) + dprint(DBG_EXTENT, " %u:%u", be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + dprint(DBG_EXTENT, "\n"); +} + +static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, + u32 alloc_block, u32 block_count) +{ + u32 count, start; + int i; + + hfsplus_dump_extent(extent); + for (i = 0; i < 8; extent++, i++) { + count = be32_to_cpu(extent->block_count); + if (offset == count) { + start = be32_to_cpu(extent->start_block); + if (alloc_block != start + count) { + if (++i >= 8) + return -ENOSPC; + extent++; + extent->start_block = cpu_to_be32(alloc_block); + } else + block_count += count; + extent->block_count = cpu_to_be32(block_count); + return 0; + } else if (offset < count) + break; + offset -= count; + } + /* panic? */ + return -EIO; +} + +static int hfsplus_free_extents(struct super_block *sb, + struct hfsplus_extent *extent, + u32 offset, u32 block_nr) +{ + u32 count, start; + int i; + + hfsplus_dump_extent(extent); + for (i = 0; i < 8; extent++, i++) { + count = be32_to_cpu(extent->block_count); + if (offset == count) + goto found; + else if (offset < count) + break; + offset -= count; + } + /* panic? 
*/ + return -EIO; +found: + for (;;) { + start = be32_to_cpu(extent->start_block); + if (count <= block_nr) { + hfsplus_block_free(sb, start, count); + extent->block_count = 0; + extent->start_block = 0; + block_nr -= count; + } else { + count -= block_nr; + hfsplus_block_free(sb, start + count, block_nr); + extent->block_count = cpu_to_be32(count); + block_nr = 0; + } + if (!block_nr || !i) + return 0; + i--; + extent--; + count = be32_to_cpu(extent->block_count); + } +} + +int hfsplus_free_fork(struct super_block *sb, u32 cnid, + struct hfsplus_fork_raw *fork, int type) +{ + struct hfs_find_data fd; + hfsplus_extent_rec ext_entry; + u32 total_blocks, blocks, start; + int res, i; + + total_blocks = be32_to_cpu(fork->total_blocks); + if (!total_blocks) + return 0; + + blocks = 0; + for (i = 0; i < 8; i++) + blocks += be32_to_cpu(fork->extents[i].block_count); + + res = hfsplus_free_extents(sb, fork->extents, blocks, blocks); + if (res) + return res; + if (total_blocks == blocks) + return 0; + + hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + do { + res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, + total_blocks, type); + if (res) + break; + start = be32_to_cpu(fd.key->ext.start_block); + hfsplus_free_extents(sb, ext_entry, + total_blocks - start, + total_blocks); + hfs_brec_remove(&fd); + total_blocks = start; + } while (total_blocks > blocks); + hfs_find_exit(&fd); + + return res; +} + +int hfsplus_file_extend(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 start, len, goal; + int res; + + if (sbi->alloc_file->i_size * 8 < + sbi->total_blocks - sbi->free_blocks + 8) { + /* extend alloc file */ + printk(KERN_ERR "hfs: extend alloc file! 
" + "(%llu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); + return -ENOSPC; + } + + mutex_lock(&hip->extents_lock); + if (hip->alloc_blocks == hip->first_blocks) + goal = hfsplus_ext_lastblock(hip->first_extents); + else { + res = hfsplus_ext_read_extent(inode, hip->alloc_blocks); + if (res) + goto out; + goal = hfsplus_ext_lastblock(hip->cached_extents); + } + + len = hip->clump_blocks; + start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len); + if (start >= sbi->total_blocks) { + start = hfsplus_block_allocate(sb, goal, 0, &len); + if (start >= goal) { + res = -ENOSPC; + goto out; + } + } + + dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + + if (hip->alloc_blocks <= hip->first_blocks) { + if (!hip->first_blocks) { + dprint(DBG_EXTENT, "first extents\n"); + /* no extents yet */ + hip->first_extents[0].start_block = cpu_to_be32(start); + hip->first_extents[0].block_count = cpu_to_be32(len); + res = 0; + } else { + /* try to append to extents in inode */ + res = hfsplus_add_extent(hip->first_extents, + hip->alloc_blocks, + start, len); + if (res == -ENOSPC) + goto insert_extent; + } + if (!res) { + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks += len; + } + } else { + res = hfsplus_add_extent(hip->cached_extents, + hip->alloc_blocks - hip->cached_start, + start, len); + if (!res) { + hfsplus_dump_extent(hip->cached_extents); + hip->extent_state |= HFSPLUS_EXT_DIRTY; + hip->cached_blocks += len; + } else if (res == -ENOSPC) + goto insert_extent; + } +out: + mutex_unlock(&hip->extents_lock); + if (!res) { + hip->alloc_blocks += len; + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); + } + return res; + +insert_extent: + dprint(DBG_EXTENT, "insert new extent\n"); + hfsplus_ext_write_extent_locked(inode); + + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_extents[0].start_block = cpu_to_be32(start); + hip->cached_extents[0].block_count = cpu_to_be32(len); + hfsplus_dump_extent(hip->cached_extents); + hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW; + hip->cached_start = hip->alloc_blocks; + hip->cached_blocks = len; + + res = 0; + goto out; +} + +void hfsplus_file_truncate(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfs_find_data fd; + u32 alloc_cnt, blk_cnt, start; + int res; + + dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n", + inode->i_ino, (long long)hip->phys_size, + inode->i_size); + + if (inode->i_size > hip->phys_size) { + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + u32 size = inode->i_size; + int res; + + res = pagecache_write_begin(NULL, mapping, size, 0, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (res) + return; + res = pagecache_write_end(NULL, mapping, size, + 0, 0, page, fsdata); + if (res < 0) + return; + mark_inode_dirty(inode); + return; + } else if (inode->i_size == hip->phys_size) + return; + + blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> + HFSPLUS_SB(sb)->alloc_blksz_shift; + alloc_cnt = hip->alloc_blocks; + if (blk_cnt == alloc_cnt) + goto out; + + mutex_lock(&hip->extents_lock); + hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + while (1) { + if (alloc_cnt == hip->first_blocks) { + hfsplus_free_extents(sb, hip->first_extents, + alloc_cnt, alloc_cnt - blk_cnt); + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks = blk_cnt; + break; + } + res = __hfsplus_ext_cache_extent(&fd, inode, 
alloc_cnt); + if (res) + break; + start = hip->cached_start; + hfsplus_free_extents(sb, hip->cached_extents, + alloc_cnt - start, alloc_cnt - blk_cnt); + hfsplus_dump_extent(hip->cached_extents); + if (blk_cnt > start) { + hip->extent_state |= HFSPLUS_EXT_DIRTY; + break; + } + alloc_cnt = start; + hip->cached_start = hip->cached_blocks = 0; + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); + hfs_brec_remove(&fd); + } + hfs_find_exit(&fd); + mutex_unlock(&hip->extents_lock); + + hip->alloc_blocks = blk_cnt; +out: + hip->phys_size = inode->i_size; + hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); +} diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h new file mode 100644 index 00000000..4e7f64b7 --- /dev/null +++ b/fs/hfsplus/hfsplus_fs.h @@ -0,0 +1,463 @@ +/* + * linux/include/linux/hfsplus_fs.h + * + * Copyright (C) 1999 + * Brad Boyer (flar@pants.nu) + * (C) 2003 Ardis Technologies + * + */ + +#ifndef _LINUX_HFSPLUS_FS_H +#define _LINUX_HFSPLUS_FS_H + +#include +#include +#include +#include +#include "hfsplus_raw.h" + +#define DBG_BNODE_REFS 0x00000001 +#define DBG_BNODE_MOD 0x00000002 +#define DBG_CAT_MOD 0x00000004 +#define DBG_INODE 0x00000008 +#define DBG_SUPER 0x00000010 +#define DBG_EXTENT 0x00000020 +#define DBG_BITMAP 0x00000040 + +#if 0 +#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) +#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) +#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) +#endif +#define DBG_MASK (0) + +#define dprint(flg, fmt, args...) \ + if (flg & DBG_MASK) \ + printk(fmt , ## args) + +/* Runtime config options */ +#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' 
*/ + +#define HFSPLUS_TYPE_DATA 0x00 +#define HFSPLUS_TYPE_RSRC 0xFF + +typedef int (*btree_keycmp)(const hfsplus_btree_key *, + const hfsplus_btree_key *); + +#define NODE_HASH_SIZE 256 + +/* An HFS+ BTree held in memory */ +struct hfs_btree { + struct super_block *sb; + struct inode *inode; + btree_keycmp keycmp; + + u32 cnid; + u32 root; + u32 leaf_count; + u32 leaf_head; + u32 leaf_tail; + u32 node_count; + u32 free_nodes; + u32 attributes; + + unsigned int node_size; + unsigned int node_size_shift; + unsigned int max_key_len; + unsigned int depth; + + struct mutex tree_lock; + + unsigned int pages_per_bnode; + spinlock_t hash_lock; + struct hfs_bnode *node_hash[NODE_HASH_SIZE]; + int node_hash_cnt; +}; + +struct page; + +/* An HFS+ BTree node in memory */ +struct hfs_bnode { + struct hfs_btree *tree; + + u32 prev; + u32 this; + u32 next; + u32 parent; + + u16 num_recs; + u8 type; + u8 height; + + struct hfs_bnode *next_hash; + unsigned long flags; + wait_queue_head_t lock_wq; + atomic_t refcnt; + unsigned int page_offset; + struct page *page[0]; +}; + +#define HFS_BNODE_LOCK 0 +#define HFS_BNODE_ERROR 1 +#define HFS_BNODE_NEW 2 +#define HFS_BNODE_DIRTY 3 +#define HFS_BNODE_DELETED 4 + +/* + * HFS+ superblock info (built from Volume Header on disk) + */ + +struct hfsplus_vh; +struct hfs_btree; + +struct hfsplus_sb_info { + void *s_vhdr_buf; + struct hfsplus_vh *s_vhdr; + void *s_backup_vhdr_buf; + struct hfsplus_vh *s_backup_vhdr; + struct hfs_btree *ext_tree; + struct hfs_btree *cat_tree; + struct hfs_btree *attr_tree; + struct inode *alloc_file; + struct inode *hidden_dir; + struct nls_table *nls; + + /* Runtime variables */ + u32 blockoffset; + sector_t part_start; + sector_t sect_count; + int fs_shift; + + /* immutable data from the volume header */ + u32 alloc_blksz; + int alloc_blksz_shift; + u32 total_blocks; + u32 data_clump_blocks, rsrc_clump_blocks; + + /* mutable data from the volume header, protected by alloc_mutex */ + u32 free_blocks; + struct mutex alloc_mutex; + + /* mutable data from the volume header, protected by vh_mutex */ + u32 next_cnid; + u32 file_count; + u32 folder_count; + struct mutex vh_mutex; + + /* Config options */ + u32 creator; + u32 type; + + umode_t umask; + uid_t uid; + gid_t gid; + + int part, session; + + unsigned long flags; +}; + +#define HFSPLUS_SB_WRITEBACKUP 0 +#define HFSPLUS_SB_NODECOMPOSE 1 +#define HFSPLUS_SB_FORCE 2 +#define HFSPLUS_SB_HFSX 3 +#define HFSPLUS_SB_CASEFOLD 4 +#define HFSPLUS_SB_NOBARRIER 5 + +static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + + +struct hfsplus_inode_info { + atomic_t opencnt; + + /* + * Extent allocation information, protected by extents_lock. + */ + u32 first_blocks; + u32 clump_blocks; + u32 alloc_blocks; + u32 cached_start; + u32 cached_blocks; + hfsplus_extent_rec first_extents; + hfsplus_extent_rec cached_extents; + unsigned int extent_state; + struct mutex extents_lock; + + /* + * Immutable data. + */ + struct inode *rsrc_inode; + __be32 create_date; + + /* + * Protected by sbi->vh_mutex. + */ + u32 linkid; + + /* + * Accessed using atomic bitops. + */ + unsigned long flags; + + /* + * Protected by i_mutex. 
+ */ + sector_t fs_blocks; + u8 userflags; /* BSD user file flags */ + struct list_head open_dir_list; + loff_t phys_size; + + struct inode vfs_inode; +}; + +#define HFSPLUS_EXT_DIRTY 0x0001 +#define HFSPLUS_EXT_NEW 0x0002 + +#define HFSPLUS_I_RSRC 0 /* represents a resource fork */ +#define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */ +#define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */ +#define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */ + +#define HFSPLUS_IS_RSRC(inode) \ + test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags) + +static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) +{ + return list_entry(inode, struct hfsplus_inode_info, vfs_inode); +} + +/* + * Mark an inode dirty, and also mark the btree in which the + * specific type of metadata is stored. + * For data or metadata that gets written back by into the catalog btree + * by hfsplus_write_inode a plain mark_inode_dirty call is enough. + */ +static inline void hfsplus_mark_inode_dirty(struct inode *inode, + unsigned int flag) +{ + set_bit(flag, &HFSPLUS_I(inode)->flags); + mark_inode_dirty(inode); +} + +struct hfs_find_data { + /* filled by caller */ + hfsplus_btree_key *search_key; + hfsplus_btree_key *key; + /* filled by find */ + struct hfs_btree *tree; + struct hfs_bnode *bnode; + /* filled by findrec */ + int record; + int keyoffset, keylength; + int entryoffset, entrylength; +}; + +struct hfsplus_readdir_data { + struct list_head list; + struct file *file; + struct hfsplus_cat_key key; +}; + +/* + * Find minimum acceptible I/O size for an hfsplus sb. + */ +static inline unsigned short hfsplus_min_io_size(struct super_block *sb) +{ + return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + HFSPLUS_SECTOR_SIZE); +} + +#define hfs_btree_open hfsplus_btree_open +#define hfs_btree_close hfsplus_btree_close +#define hfs_btree_write hfsplus_btree_write +#define hfs_bmap_alloc hfsplus_bmap_alloc +#define hfs_bmap_free hfsplus_bmap_free +#define hfs_bnode_read hfsplus_bnode_read +#define hfs_bnode_read_u16 hfsplus_bnode_read_u16 +#define hfs_bnode_read_u8 hfsplus_bnode_read_u8 +#define hfs_bnode_read_key hfsplus_bnode_read_key +#define hfs_bnode_write hfsplus_bnode_write +#define hfs_bnode_write_u16 hfsplus_bnode_write_u16 +#define hfs_bnode_clear hfsplus_bnode_clear +#define hfs_bnode_copy hfsplus_bnode_copy +#define hfs_bnode_move hfsplus_bnode_move +#define hfs_bnode_dump hfsplus_bnode_dump +#define hfs_bnode_unlink hfsplus_bnode_unlink +#define hfs_bnode_findhash hfsplus_bnode_findhash +#define hfs_bnode_find hfsplus_bnode_find +#define hfs_bnode_unhash hfsplus_bnode_unhash +#define hfs_bnode_free hfsplus_bnode_free +#define hfs_bnode_create hfsplus_bnode_create +#define hfs_bnode_get hfsplus_bnode_get +#define hfs_bnode_put hfsplus_bnode_put +#define hfs_brec_lenoff hfsplus_brec_lenoff +#define hfs_brec_keylen hfsplus_brec_keylen +#define hfs_brec_insert hfsplus_brec_insert +#define hfs_brec_remove hfsplus_brec_remove +#define hfs_find_init hfsplus_find_init +#define hfs_find_exit hfsplus_find_exit +#define __hfs_brec_find __hplusfs_brec_find +#define hfs_brec_find hfsplus_brec_find +#define hfs_brec_read hfsplus_brec_read +#define hfs_brec_goto hfsplus_brec_goto +#define hfs_part_find hfsplus_part_find + +/* + * definitions for ext2 flag ioctls (linux really needs a generic + * interface for this). 
+ */ + +/* ext2 ioctls (EXT2_IOC_GETFLAGS and EXT2_IOC_SETFLAGS) to support + * chattr/lsattr */ +#define HFSPLUS_IOC_EXT2_GETFLAGS FS_IOC_GETFLAGS +#define HFSPLUS_IOC_EXT2_SETFLAGS FS_IOC_SETFLAGS + + +/* + * Functions in any *.c used in other files + */ + +/* bitmap.c */ +int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *); +int hfsplus_block_free(struct super_block *, u32, u32); + +/* btree.c */ +struct hfs_btree *hfs_btree_open(struct super_block *, u32); +void hfs_btree_close(struct hfs_btree *); +void hfs_btree_write(struct hfs_btree *); +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *); +void hfs_bmap_free(struct hfs_bnode *); + +/* bnode.c */ +void hfs_bnode_read(struct hfs_bnode *, void *, int, int); +u16 hfs_bnode_read_u16(struct hfs_bnode *, int); +u8 hfs_bnode_read_u8(struct hfs_bnode *, int); +void hfs_bnode_read_key(struct hfs_bnode *, void *, int); +void hfs_bnode_write(struct hfs_bnode *, void *, int, int); +void hfs_bnode_write_u16(struct hfs_bnode *, int, u16); +void hfs_bnode_clear(struct hfs_bnode *, int, int); +void hfs_bnode_copy(struct hfs_bnode *, int, + struct hfs_bnode *, int, int); +void hfs_bnode_move(struct hfs_bnode *, int, int, int); +void hfs_bnode_dump(struct hfs_bnode *); +void hfs_bnode_unlink(struct hfs_bnode *); +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32); +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32); +void hfs_bnode_unhash(struct hfs_bnode *); +void hfs_bnode_free(struct hfs_bnode *); +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *, u32); +void hfs_bnode_get(struct hfs_bnode *); +void hfs_bnode_put(struct hfs_bnode *); + +/* brec.c */ +u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *); +u16 hfs_brec_keylen(struct hfs_bnode *, u16); +int hfs_brec_insert(struct hfs_find_data *, void *, int); +int hfs_brec_remove(struct hfs_find_data *); + +/* bfind.c */ +int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); +void hfs_find_exit(struct hfs_find_data *); +int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *); +int hfs_brec_find(struct hfs_find_data *); +int hfs_brec_read(struct hfs_find_data *, void *, int); +int hfs_brec_goto(struct hfs_find_data *, int); + +/* catalog.c */ +int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, + const hfsplus_btree_key *); +int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, + const hfsplus_btree_key *); +void hfsplus_cat_build_key(struct super_block *sb, + hfsplus_btree_key *, u32, struct qstr *); +int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *); +int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); +int hfsplus_delete_cat(u32, struct inode *, struct qstr *); +int hfsplus_rename_cat(u32, struct inode *, struct qstr *, + struct inode *, struct qstr *); +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); + +/* dir.c */ +extern const struct inode_operations hfsplus_dir_inode_operations; +extern const struct file_operations hfsplus_dir_operations; + +/* extents.c */ +int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); +void hfsplus_ext_write_extent(struct inode *); +int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); +int hfsplus_free_fork(struct super_block *, u32, + struct hfsplus_fork_raw *, int); +int hfsplus_file_extend(struct inode *); +void hfsplus_file_truncate(struct inode *); + +/* inode.c */ +extern const struct address_space_operations hfsplus_aops; +extern const struct address_space_operations 
hfsplus_btree_aops; +extern const struct dentry_operations hfsplus_dentry_operations; + +void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); +void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); +int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *); +int hfsplus_cat_write_inode(struct inode *); +struct inode *hfsplus_new_inode(struct super_block *, int); +void hfsplus_delete_inode(struct inode *); +int hfsplus_file_fsync(struct file *file, int datasync); + +/* ioctl.c */ +long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* options.c */ +int hfsplus_parse_options(char *, struct hfsplus_sb_info *); +int hfsplus_parse_options_remount(char *input, int *force); +void hfsplus_fill_defaults(struct hfsplus_sb_info *); +int hfsplus_show_options(struct seq_file *, struct vfsmount *); + +/* super.c */ +struct inode *hfsplus_iget(struct super_block *, unsigned long); +int hfsplus_sync_fs(struct super_block *sb, int wait); + +/* tables.c */ +extern u16 hfsplus_case_fold_table[]; +extern u16 hfsplus_decompose_table[]; +extern u16 hfsplus_compose_table[]; + +/* unicode.c */ +int hfsplus_strcasecmp(const struct hfsplus_unistr *, + const struct hfsplus_unistr *); +int hfsplus_strcmp(const struct hfsplus_unistr *, + const struct hfsplus_unistr *); +int hfsplus_uni2asc(struct super_block *, + const struct hfsplus_unistr *, char *, int *); +int hfsplus_asc2uni(struct super_block *, + struct hfsplus_unistr *, const char *, int); +int hfsplus_hash_dentry(const struct dentry *dentry, + const struct inode *inode, struct qstr *str); +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); + +/* wrapper.c */ +int hfsplus_read_wrapper(struct super_block *); +int hfs_part_find(struct super_block *, sector_t *, sector_t *); +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw); + +/* time macros */ +#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) +#define __hfsp_ut2mt(t) (cpu_to_be32(t + 2082844800U)) + +/* compatibility */ +#define hfsp_mt2ut(t) (struct timespec){ .tv_sec = __hfsp_mt2ut(t) } +#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) +#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) + +#endif diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h new file mode 100644 index 00000000..927cdd6d --- /dev/null +++ b/fs/hfsplus/hfsplus_raw.h @@ -0,0 +1,337 @@ +/* + * linux/include/linux/hfsplus_raw.h + * + * Copyright (C) 1999 + * Brad Boyer (flar@pants.nu) + * (C) 2003 Ardis Technologies + * + * Format of structures on disk + * Information taken from Apple Technote #1150 (HFS Plus Volume Format) + * + */ + +#ifndef _LINUX_HFSPLUS_RAW_H +#define _LINUX_HFSPLUS_RAW_H + +#include + +/* Some constants */ +#define HFSPLUS_SECTOR_SIZE 512 +#define HFSPLUS_SECTOR_SHIFT 9 +#define HFSPLUS_VOLHEAD_SECTOR 2 +#define HFSPLUS_VOLHEAD_SIG 0x482b +#define HFSPLUS_VOLHEAD_SIGX 0x4858 +#define HFSPLUS_SUPER_MAGIC 0x482b +#define HFSPLUS_MIN_VERSION 4 +#define HFSPLUS_CURRENT_VERSION 5 + +#define HFSP_WRAP_MAGIC 0x4244 +#define HFSP_WRAP_ATTRIB_SLOCK 0x8000 
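As a brief aside on the constants just defined: the primary volume header lives HFSPLUS_VOLHEAD_SECTOR 512-byte sectors into the (possibly embedded) volume, and a backup copy lives two sectors before its end; hfsplus_sync_fs() later in this patch writes to exactly those locations. The helpers below are a hedged editorial sketch, not part of the patch, and their names are invented for illustration.

/*
 * Editorial sketch (not in the patch): sector of the primary volume
 * header and of its backup copy, for a volume of 'sect_count'
 * 512-byte sectors starting at 'part_start'.
 */
static inline sector_t hfsp_vhdr_sector(sector_t part_start)
{
	/* HFSPLUS_VOLHEAD_SECTOR == 2, i.e. 1024 bytes into the volume */
	return part_start + HFSPLUS_VOLHEAD_SECTOR;
}

static inline sector_t hfsp_backup_vhdr_sector(sector_t part_start,
					       sector_t sect_count)
{
	/* the backup header sits 1024 bytes before the end of the volume */
	return part_start + sect_count - 2;
}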
+#define HFSP_WRAP_ATTRIB_SPARED 0x0200 + +#define HFSP_WRAPOFF_SIG 0x00 +#define HFSP_WRAPOFF_ATTRIB 0x0A +#define HFSP_WRAPOFF_ABLKSIZE 0x14 +#define HFSP_WRAPOFF_ABLKSTART 0x1C +#define HFSP_WRAPOFF_EMBEDSIG 0x7C +#define HFSP_WRAPOFF_EMBEDEXT 0x7E + +#define HFSP_HIDDENDIR_NAME \ + "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" + +#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ +#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ + +#define HFSP_SYMLINK_TYPE 0x736c6e6b /* 'slnk' */ +#define HFSP_SYMLINK_CREATOR 0x72686170 /* 'rhap' */ + +#define HFSP_MOUNT_VERSION 0x482b4c78 /* 'H+Lx' */ + +/* Structures used on disk */ + +typedef __be32 hfsplus_cnid; +typedef __be16 hfsplus_unichr; + +/* A "string" as used in filenames, etc. */ +struct hfsplus_unistr { + __be16 length; + hfsplus_unichr unicode[255]; +} __packed; + +#define HFSPLUS_MAX_STRLEN 255 + +/* POSIX permissions */ +struct hfsplus_perm { + __be32 owner; + __be32 group; + u8 rootflags; + u8 userflags; + __be16 mode; + __be32 dev; +} __packed; + +#define HFSPLUS_FLG_NODUMP 0x01 +#define HFSPLUS_FLG_IMMUTABLE 0x02 +#define HFSPLUS_FLG_APPEND 0x04 + +/* A single contiguous area of a file */ +struct hfsplus_extent { + __be32 start_block; + __be32 block_count; +} __packed; +typedef struct hfsplus_extent hfsplus_extent_rec[8]; + +/* Information for a "Fork" in a file */ +struct hfsplus_fork_raw { + __be64 total_size; + __be32 clump_size; + __be32 total_blocks; + hfsplus_extent_rec extents; +} __packed; + +/* HFS+ Volume Header */ +struct hfsplus_vh { + __be16 signature; + __be16 version; + __be32 attributes; + __be32 last_mount_vers; + u32 reserved; + + __be32 create_date; + __be32 modify_date; + __be32 backup_date; + __be32 checked_date; + + __be32 file_count; + __be32 folder_count; + + __be32 blocksize; + __be32 total_blocks; + __be32 free_blocks; + + __be32 next_alloc; + __be32 rsrc_clump_sz; + __be32 data_clump_sz; + hfsplus_cnid next_cnid; + + __be32 write_count; + __be64 encodings_bmp; + + u8 finder_info[32]; + + struct hfsplus_fork_raw alloc_file; + struct hfsplus_fork_raw ext_file; + struct hfsplus_fork_raw cat_file; + struct hfsplus_fork_raw attr_file; + struct hfsplus_fork_raw start_file; +} __packed; + +/* HFS+ volume attributes */ +#define HFSPLUS_VOL_UNMNT (1 << 8) +#define HFSPLUS_VOL_SPARE_BLK (1 << 9) +#define HFSPLUS_VOL_NOCACHE (1 << 10) +#define HFSPLUS_VOL_INCNSTNT (1 << 11) +#define HFSPLUS_VOL_NODEID_REUSED (1 << 12) +#define HFSPLUS_VOL_JOURNALED (1 << 13) +#define HFSPLUS_VOL_SOFTLOCK (1 << 15) + +/* HFS+ BTree node descriptor */ +struct hfs_bnode_desc { + __be32 next; + __be32 prev; + s8 type; + u8 height; + __be16 num_recs; + u16 reserved; +} __packed; + +/* HFS+ BTree node types */ +#define HFS_NODE_INDEX 0x00 +#define HFS_NODE_HEADER 0x01 +#define HFS_NODE_MAP 0x02 +#define HFS_NODE_LEAF 0xFF + +/* HFS+ BTree header */ +struct hfs_btree_header_rec { + __be16 depth; + __be32 root; + __be32 leaf_count; + __be32 leaf_head; + __be32 leaf_tail; + __be16 node_size; + __be16 max_key_len; + __be32 node_count; + __be32 free_nodes; + u16 reserved1; + __be32 clump_size; + u8 btree_type; + u8 key_type; + __be32 attributes; + u32 reserved3[16]; +} __packed; + +/* BTree attributes */ +#define HFS_TREE_BIGKEYS 2 +#define HFS_TREE_VARIDXKEYS 4 + +/* HFS+ BTree misc info */ +#define HFSPLUS_TREE_HEAD 0 +#define HFSPLUS_NODE_MXSZ 32768 + +/* Some special File ID numbers (stolen from hfs.h) */ +#define HFSPLUS_POR_CNID 1 /* Parent Of the Root */ +#define HFSPLUS_ROOT_CNID 2 /* ROOT directory */ 
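The hfsplus_extent_rec type defined above is simply an array of eight big-endian {start_block, block_count} pairs; a fork whose data does not fit in eight extents overflows into the extents B-tree. As a hedged illustration only (the real mapping is done via hfsplus_get_block() in extents.c, and the helper name here is invented), translating a block within a fork into an allocation block using one such record looks like this:

/*
 * Editorial sketch (not in the patch): map a file block to an
 * allocation block using one hfsplus_extent_rec.  Returns 0 when the
 * block is not covered by this record.
 */
static inline u32 hfsp_extent_rec_lookup(const hfsplus_extent_rec rec,
					 u32 file_block)
{
	u32 off = 0;
	int i;

	for (i = 0; i < 8; i++) {
		u32 count = be32_to_cpu(rec[i].block_count);

		if (file_block < off + count)
			return be32_to_cpu(rec[i].start_block) +
				(file_block - off);
		off += count;
	}
	return 0;
}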
+#define HFSPLUS_EXT_CNID 3 /* EXTents B-tree */ +#define HFSPLUS_CAT_CNID 4 /* CATalog B-tree */ +#define HFSPLUS_BAD_CNID 5 /* BAD blocks file */ +#define HFSPLUS_ALLOC_CNID 6 /* ALLOCation file */ +#define HFSPLUS_START_CNID 7 /* STARTup file */ +#define HFSPLUS_ATTR_CNID 8 /* ATTRibutes file */ +#define HFSPLUS_EXCH_CNID 15 /* ExchangeFiles temp id */ +#define HFSPLUS_FIRSTUSER_CNID 16 /* first available user id */ + +/* btree key type */ +#define HFSPLUS_KEY_CASEFOLDING 0xCF /* case-insensitive */ +#define HFSPLUS_KEY_BINARY 0xBC /* case-sensitive */ + +/* HFS+ catalog entry key */ +struct hfsplus_cat_key { + __be16 key_len; + hfsplus_cnid parent; + struct hfsplus_unistr name; +} __packed; + +#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key)) + +/* Structs from hfs.h */ +struct hfsp_point { + __be16 v; + __be16 h; +} __packed; + +struct hfsp_rect { + __be16 top; + __be16 left; + __be16 bottom; + __be16 right; +} __packed; + + +/* HFS directory info (stolen from hfs.h */ +struct DInfo { + struct hfsp_rect frRect; + __be16 frFlags; + struct hfsp_point frLocation; + __be16 frView; +} __packed; + +struct DXInfo { + struct hfsp_point frScroll; + __be32 frOpenChain; + __be16 frUnused; + __be16 frComment; + __be32 frPutAway; +} __packed; + +/* HFS+ folder data (part of an hfsplus_cat_entry) */ +struct hfsplus_cat_folder { + __be16 type; + __be16 flags; + __be32 valence; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct DInfo user_info; + struct DXInfo finder_info; + __be32 text_encoding; + u32 reserved; +} __packed; + +/* HFS file info (stolen from hfs.h) */ +struct FInfo { + __be32 fdType; + __be32 fdCreator; + __be16 fdFlags; + struct hfsp_point fdLocation; + __be16 fdFldr; +} __packed; + +struct FXInfo { + __be16 fdIconID; + u8 fdUnused[8]; + __be16 fdComment; + __be32 fdPutAway; +} __packed; + +/* HFS+ file data (part of a cat_entry) */ +struct hfsplus_cat_file { + __be16 type; + __be16 flags; + u32 reserved1; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct FInfo user_info; + struct FXInfo finder_info; + __be32 text_encoding; + u32 reserved2; + + struct hfsplus_fork_raw data_fork; + struct hfsplus_fork_raw rsrc_fork; +} __packed; + +/* File attribute bits */ +#define HFSPLUS_FILE_LOCKED 0x0001 +#define HFSPLUS_FILE_THREAD_EXISTS 0x0002 + +/* HFS+ catalog thread (part of a cat_entry) */ +struct hfsplus_cat_thread { + __be16 type; + s16 reserved; + hfsplus_cnid parentID; + struct hfsplus_unistr nodeName; +} __packed; + +#define HFSPLUS_MIN_THREAD_SZ 10 + +/* A data record in the catalog tree */ +typedef union { + __be16 type; + struct hfsplus_cat_folder folder; + struct hfsplus_cat_file file; + struct hfsplus_cat_thread thread; +} __packed hfsplus_cat_entry; + +/* HFS+ catalog entry type */ +#define HFSPLUS_FOLDER 0x0001 +#define HFSPLUS_FILE 0x0002 +#define HFSPLUS_FOLDER_THREAD 0x0003 +#define HFSPLUS_FILE_THREAD 0x0004 + +/* HFS+ extents tree key */ +struct hfsplus_ext_key { + __be16 key_len; + u8 fork_type; + u8 pad; + hfsplus_cnid cnid; + __be32 start_block; +} __packed; + +#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) + +/* HFS+ generic BTree key */ +typedef union { + __be16 key_len; + struct hfsplus_cat_key cat; + struct hfsplus_ext_key ext; +} __packed hfsplus_btree_key; + +#endif diff --git 
a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c new file mode 100644 index 00000000..b248a6cf --- /dev/null +++ b/fs/hfsplus/inode.c @@ -0,0 +1,617 @@ +/* + * linux/fs/hfsplus/inode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Inode handling routines + */ + +#include +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static int hfsplus_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, hfsplus_get_block); +} + +static int hfsplus_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, hfsplus_get_block, wbc); +} + +static int hfsplus_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + hfsplus_get_block, + &HFSPLUS_I(mapping->host)->phys_size); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, hfsplus_get_block); +} + +static int hfsplus_releasepage(struct page *page, gfp_t mask) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct hfs_btree *tree; + struct hfs_bnode *node; + u32 nidx; + int i, res = 1; + + switch (inode->i_ino) { + case HFSPLUS_EXT_CNID: + tree = HFSPLUS_SB(sb)->ext_tree; + break; + case HFSPLUS_CAT_CNID: + tree = HFSPLUS_SB(sb)->cat_tree; + break; + case HFSPLUS_ATTR_CNID: + tree = HFSPLUS_SB(sb)->attr_tree; + break; + default: + BUG(); + return 0; + } + if (!tree) + return 0; + if (tree->node_size >= PAGE_CACHE_SIZE) { + nidx = page->index >> + (tree->node_size_shift - PAGE_CACHE_SHIFT); + spin_lock(&tree->hash_lock); + node = hfs_bnode_findhash(tree, nidx); + if (!node) + ; + else if (atomic_read(&node->refcnt)) + res = 0; + if (res && node) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } + spin_unlock(&tree->hash_lock); + } else { + nidx = page->index << + (PAGE_CACHE_SHIFT - tree->node_size_shift); + i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); + spin_lock(&tree->hash_lock); + do { + node = hfs_bnode_findhash(tree, nidx++); + if (!node) + continue; + if (atomic_read(&node->refcnt)) { + res = 0; + break; + } + hfs_bnode_unhash(node); + hfs_bnode_free(node); + } while (--i && nidx < tree->node_count); + spin_unlock(&tree->hash_lock); + } + return res ? try_to_free_buffers(page) : 0; +} + +static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, hfsplus_get_block, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. 
+ */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return ret; +} + +static int hfsplus_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, hfsplus_get_block); +} + +const struct address_space_operations hfsplus_btree_aops = { + .readpage = hfsplus_readpage, + .writepage = hfsplus_writepage, + .write_begin = hfsplus_write_begin, + .write_end = generic_write_end, + .bmap = hfsplus_bmap, + .releasepage = hfsplus_releasepage, +}; + +const struct address_space_operations hfsplus_aops = { + .readpage = hfsplus_readpage, + .writepage = hfsplus_writepage, + .write_begin = hfsplus_write_begin, + .write_end = generic_write_end, + .bmap = hfsplus_bmap, + .direct_IO = hfsplus_direct_IO, + .writepages = hfsplus_writepages, +}; + +const struct dentry_operations hfsplus_dentry_operations = { + .d_hash = hfsplus_hash_dentry, + .d_compare = hfsplus_compare_dentry, +}; + +static struct dentry *hfsplus_file_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct hfs_find_data fd; + struct super_block *sb = dir->i_sb; + struct inode *inode = NULL; + struct hfsplus_inode_info *hip; + int err; + + if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) + goto out; + + inode = HFSPLUS_I(dir)->rsrc_inode; + if (inode) + goto out; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + hip = HFSPLUS_I(inode); + inode->i_ino = dir->i_ino; + INIT_LIST_HEAD(&hip->open_dir_list); + mutex_init(&hip->extents_lock); + hip->extent_state = 0; + hip->flags = 0; + set_bit(HFSPLUS_I_RSRC, &hip->flags); + + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfsplus_find_cat(sb, dir->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + if (err) { + iput(inode); + return ERR_PTR(err); + } + hip->rsrc_inode = dir; + HFSPLUS_I(dir)->rsrc_inode = inode; + igrab(dir); + + /* + * __mark_inode_dirty expects inodes to be hashed. Since we don't + * want resource fork inodes in the regular inode space, we make them + * appear hashed, but do not put on any lists. hlist_del() + * will work fine and require no locking. + */ + hlist_add_fake(&inode->i_hash); + + mark_inode_dirty(inode); +out: + d_add(dentry, inode); + return NULL; +} + +static void hfsplus_get_perms(struct inode *inode, + struct hfsplus_perm *perms, int dir) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + u16 mode; + + mode = be16_to_cpu(perms->mode); + + inode->i_uid = be32_to_cpu(perms->owner); + if (!inode->i_uid && !mode) + inode->i_uid = sbi->uid; + + inode->i_gid = be32_to_cpu(perms->group); + if (!inode->i_gid && !mode) + inode->i_gid = sbi->gid; + + if (dir) { + mode = mode ? 
(mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); + mode |= S_IFDIR; + } else if (!mode) + mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); + inode->i_mode = mode; + + HFSPLUS_I(inode)->userflags = perms->userflags; + if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (perms->rootflags & HFSPLUS_FLG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; +} + +static int hfsplus_file_open(struct inode *inode, struct file *file) +{ + if (HFSPLUS_IS_RSRC(inode)) + inode = HFSPLUS_I(inode)->rsrc_inode; + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; + atomic_inc(&HFSPLUS_I(inode)->opencnt); + return 0; +} + +static int hfsplus_file_release(struct inode *inode, struct file *file) +{ + struct super_block *sb = inode->i_sb; + + if (HFSPLUS_IS_RSRC(inode)) + inode = HFSPLUS_I(inode)->rsrc_inode; + if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { + mutex_lock(&inode->i_mutex); + hfsplus_file_truncate(inode); + if (inode->i_flags & S_DEAD) { + hfsplus_delete_cat(inode->i_ino, + HFSPLUS_SB(sb)->hidden_dir, NULL); + hfsplus_delete_inode(inode); + } + mutex_unlock(&inode->i_mutex); + } + return 0; +} + +static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +int hfsplus_file_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + int error = 0, error2; + + /* + * Sync inode metadata into the catalog and extent trees. + */ + sync_inode_metadata(inode, 1); + + /* + * And explicitly write out the btrees. 
+ */ + if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags)) + error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); + + if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) { + error2 = + filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); + if (!error) + error = error2; + } + + if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) { + error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); + if (!error) + error = error2; + } + + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + + return error; +} + +static const struct inode_operations hfsplus_file_inode_operations = { + .lookup = hfsplus_file_lookup, + .truncate = hfsplus_file_truncate, + .setattr = hfsplus_setattr, + .setxattr = hfsplus_setxattr, + .getxattr = hfsplus_getxattr, + .listxattr = hfsplus_listxattr, +}; + +static const struct file_operations hfsplus_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .fsync = hfsplus_file_fsync, + .open = hfsplus_file_open, + .release = hfsplus_file_release, + .unlocked_ioctl = hfsplus_ioctl, +}; + +struct inode *hfsplus_new_inode(struct super_block *sb, int mode) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct inode *inode = new_inode(sb); + struct hfsplus_inode_info *hip; + + if (!inode) + return NULL; + + inode->i_ino = sbi->next_cnid++; + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_nlink = 1; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + + hip = HFSPLUS_I(inode); + INIT_LIST_HEAD(&hip->open_dir_list); + mutex_init(&hip->extents_lock); + atomic_set(&hip->opencnt, 0); + hip->extent_state = 0; + hip->flags = 0; + memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->alloc_blocks = 0; + hip->first_blocks = 0; + hip->cached_start = 0; + hip->cached_blocks = 0; + hip->phys_size = 0; + hip->fs_blocks = 0; + hip->rsrc_inode = NULL; + if (S_ISDIR(inode->i_mode)) { + inode->i_size = 2; + sbi->folder_count++; + inode->i_op = &hfsplus_dir_inode_operations; + inode->i_fop = &hfsplus_dir_operations; + } else if (S_ISREG(inode->i_mode)) { + sbi->file_count++; + inode->i_op = &hfsplus_file_inode_operations; + inode->i_fop = &hfsplus_file_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + hip->clump_blocks = sbi->data_clump_blocks; + } else if (S_ISLNK(inode->i_mode)) { + sbi->file_count++; + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + hip->clump_blocks = 1; + } else + sbi->file_count++; + insert_inode_hash(inode); + mark_inode_dirty(inode); + sb->s_dirt = 1; + + return inode; +} + +void hfsplus_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (S_ISDIR(inode->i_mode)) { + HFSPLUS_SB(sb)->folder_count--; + sb->s_dirt = 1; + return; + } + HFSPLUS_SB(sb)->file_count--; + if (S_ISREG(inode->i_mode)) { + if (!inode->i_nlink) { + inode->i_size = 0; + hfsplus_file_truncate(inode); + } + } else if (S_ISLNK(inode->i_mode)) { + inode->i_size = 0; + hfsplus_file_truncate(inode); + } + sb->s_dirt = 1; +} + +void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) +{ + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = 
HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 count; + int i; + + memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); + for (count = 0, i = 0; i < 8; i++) + count += be32_to_cpu(fork->extents[i].block_count); + hip->first_blocks = count; + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_start = 0; + hip->cached_blocks = 0; + + hip->alloc_blocks = be32_to_cpu(fork->total_blocks); + hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); + hip->fs_blocks = + (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); + hip->clump_blocks = + be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; + if (!hip->clump_blocks) { + hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? + sbi->rsrc_clump_blocks : + sbi->data_clump_blocks; + } +} + +void hfsplus_inode_write_fork(struct inode *inode, + struct hfsplus_fork_raw *fork) +{ + memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, + sizeof(hfsplus_extent_rec)); + fork->total_size = cpu_to_be64(inode->i_size); + fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); +} + +int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) +{ + hfsplus_cat_entry entry; + int res = 0; + u16 type; + + type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); + + HFSPLUS_I(inode)->linkid = 0; + if (type == HFSPLUS_FOLDER) { + struct hfsplus_cat_folder *folder = &entry.folder; + + if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) + /* panic? */; + hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, + sizeof(struct hfsplus_cat_folder)); + hfsplus_get_perms(inode, &folder->permissions, 1); + inode->i_nlink = 1; + inode->i_size = 2 + be32_to_cpu(folder->valence); + inode->i_atime = hfsp_mt2ut(folder->access_date); + inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); + inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); + HFSPLUS_I(inode)->create_date = folder->create_date; + HFSPLUS_I(inode)->fs_blocks = 0; + inode->i_op = &hfsplus_dir_inode_operations; + inode->i_fop = &hfsplus_dir_operations; + } else if (type == HFSPLUS_FILE) { + struct hfsplus_cat_file *file = &entry.file; + + if (fd->entrylength < sizeof(struct hfsplus_cat_file)) + /* panic? */; + hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, + sizeof(struct hfsplus_cat_file)); + + hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? 
+ &file->rsrc_fork : &file->data_fork); + hfsplus_get_perms(inode, &file->permissions, 0); + inode->i_nlink = 1; + if (S_ISREG(inode->i_mode)) { + if (file->permissions.dev) + inode->i_nlink = + be32_to_cpu(file->permissions.dev); + inode->i_op = &hfsplus_file_inode_operations; + inode->i_fop = &hfsplus_file_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &hfsplus_aops; + } else { + init_special_inode(inode, inode->i_mode, + be32_to_cpu(file->permissions.dev)); + } + inode->i_atime = hfsp_mt2ut(file->access_date); + inode->i_mtime = hfsp_mt2ut(file->content_mod_date); + inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); + HFSPLUS_I(inode)->create_date = file->create_date; + } else { + printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); + res = -EIO; + } + return res; +} + +int hfsplus_cat_write_inode(struct inode *inode) +{ + struct inode *main_inode = inode; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + + if (HFSPLUS_IS_RSRC(inode)) + main_inode = HFSPLUS_I(inode)->rsrc_inode; + + if (!main_inode->i_nlink) + return 0; + + if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd)) + /* panic? */ + return -EIO; + + if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd)) + /* panic? */ + goto out; + + if (S_ISDIR(main_inode->i_mode)) { + struct hfsplus_cat_folder *folder = &entry.folder; + + if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) + /* panic? */; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + /* simple node checks? */ + hfsplus_cat_set_perms(inode, &folder->permissions); + folder->access_date = hfsp_ut2mt(inode->i_atime); + folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); + folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + folder->valence = cpu_to_be32(inode->i_size - 2); + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + } else if (HFSPLUS_IS_RSRC(inode)) { + struct hfsplus_cat_file *file = &entry.file; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_inode_write_fork(inode, &file->rsrc_fork); + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + } else { + struct hfsplus_cat_file *file = &entry.file; + + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) + /* panic? 
*/; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_inode_write_fork(inode, &file->data_fork); + hfsplus_cat_set_perms(inode, &file->permissions); + if (HFSPLUS_FLG_IMMUTABLE & + (file->permissions.rootflags | + file->permissions.userflags)) + file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); + else + file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); + file->access_date = hfsp_ut2mt(inode->i_atime); + file->content_mod_date = hfsp_ut2mt(inode->i_mtime); + file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + } + + set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); +out: + hfs_find_exit(&fd); + return 0; +} diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c new file mode 100644 index 00000000..fbaa6690 --- /dev/null +++ b/fs/hfsplus/ioctl.c @@ -0,0 +1,221 @@ +/* + * linux/fs/hfsplus/ioctl.c + * + * Copyright (C) 2003 + * Ethan Benson + * partially derived from linux/fs/ext2/ioctl.c + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * hfsplus ioctls + */ + +#include +#include +#include +#include +#include +#include +#include "hfsplus_fs.h" + +static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int flags = 0; + + if (inode->i_flags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (inode->i_flags & S_APPEND) + flags |= FS_APPEND_FL; + if (hip->userflags & HFSPLUS_FLG_NODUMP) + flags |= FS_NODUMP_FL; + + return put_user(flags, user_flags); +} + +static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int flags; + int err = 0; + + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto out_drop_write; + } + + if (get_user(flags, user_flags)) { + err = -EFAULT; + goto out_drop_write; + } + + mutex_lock(&inode->i_mutex); + + if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || + inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock_inode; + } + } + + /* don't silently ignore unsupported ext2 flags */ + if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { + err = -EOPNOTSUPP; + goto out_unlock_inode; + } + + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + + if (flags & FS_NODUMP_FL) + hip->userflags |= HFSPLUS_FLG_NODUMP; + else + hip->userflags &= ~HFSPLUS_FLG_NODUMP; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + +out_unlock_inode: + mutex_unlock(&inode->i_mutex); +out_drop_write: + mnt_drop_write(file->f_path.mnt); +out: + return err; +} + +long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + + switch (cmd) { + case HFSPLUS_IOC_EXT2_GETFLAGS: + return hfsplus_ioctl_getflags(file, argp); + case HFSPLUS_IOC_EXT2_SETFLAGS: + return hfsplus_ioctl_setflags(file, argp); + default: + return -ENOTTY; + } +} + +int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void 
*value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + struct hfsplus_cat_file *file; + int res; + + if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + file = &entry.file; + + if (!strcmp(name, "hfs.type")) { + if (size == 4) + memcpy(&file->user_info.fdType, value, 4); + else + res = -ERANGE; + } else if (!strcmp(name, "hfs.creator")) { + if (size == 4) + memcpy(&file->user_info.fdCreator, value, 4); + else + res = -ERANGE; + } else + res = -EOPNOTSUPP; + if (!res) { + hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } +out: + hfs_find_exit(&fd); + return res; +} + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfsplus_cat_entry entry; + struct hfsplus_cat_file *file; + ssize_t res = 0; + + if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (size) { + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) + return res; + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto out; + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + } + file = &entry.file; + + if (!strcmp(name, "hfs.type")) { + if (size >= 4) { + memcpy(value, &file->user_info.fdType, 4); + res = 4; + } else + res = size ? -ERANGE : 4; + } else if (!strcmp(name, "hfs.creator")) { + if (size >= 4) { + memcpy(value, &file->user_info.fdCreator, 4); + res = 4; + } else + res = size ? 
-ERANGE : 4; + } else + res = -EOPNOTSUPP; +out: + if (size) + hfs_find_exit(&fd); + return res; +} + +#define HFSPLUS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type")) + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + + if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (!buffer || !size) + return HFSPLUS_ATTRLIST_SIZE; + if (size < HFSPLUS_ATTRLIST_SIZE) + return -ERANGE; + strcpy(buffer, "hfs.type"); + strcpy(buffer + sizeof("hfs.type"), "hfs.creator"); + + return HFSPLUS_ATTRLIST_SIZE; +} diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c new file mode 100644 index 00000000..bb62a588 --- /dev/null +++ b/fs/hfsplus/options.c @@ -0,0 +1,230 @@ +/* + * linux/fs/hfsplus/options.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Option parsing + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hfsplus_fs.h" + +enum { + opt_creator, opt_type, + opt_umask, opt_uid, opt_gid, + opt_part, opt_session, opt_nls, + opt_nodecompose, opt_decompose, + opt_barrier, opt_nobarrier, + opt_force, opt_err +}; + +static const match_table_t tokens = { + { opt_creator, "creator=%s" }, + { opt_type, "type=%s" }, + { opt_umask, "umask=%o" }, + { opt_uid, "uid=%u" }, + { opt_gid, "gid=%u" }, + { opt_part, "part=%u" }, + { opt_session, "session=%u" }, + { opt_nls, "nls=%s" }, + { opt_decompose, "decompose" }, + { opt_nodecompose, "nodecompose" }, + { opt_barrier, "barrier" }, + { opt_nobarrier, "nobarrier" }, + { opt_force, "force" }, + { opt_err, NULL } +}; + +/* Initialize an options object to reasonable defaults */ +void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) +{ + if (!opts) + return; + + opts->creator = HFSPLUS_DEF_CR_TYPE; + opts->type = HFSPLUS_DEF_CR_TYPE; + opts->umask = current_umask(); + opts->uid = current_uid(); + opts->gid = current_gid(); + opts->part = -1; + opts->session = -1; +} + +/* convert a "four byte character" to a 32 bit int with error checks */ +static inline int match_fourchar(substring_t *arg, u32 *result) +{ + if (arg->to - arg->from != 4) + return -EINVAL; + memcpy(result, arg->from, 4); + return 0; +} + +int hfsplus_parse_options_remount(char *input, int *force) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!input) + return 0; + + while ((p = strsep(&input, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_force: + *force = 1; + break; + default: + break; + } + } + + return 1; +} + +/* Parse options from mount. 
Returns 0 on failure */ +/* input is the options passed to mount() as a string */ +int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int tmp, token; + + if (!input) + goto done; + + while ((p = strsep(&input, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_creator: + if (match_fourchar(&args[0], &sbi->creator)) { + printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + return 0; + } + break; + case opt_type: + if (match_fourchar(&args[0], &sbi->type)) { + printk(KERN_ERR "hfs: type requires a 4 character value\n"); + return 0; + } + break; + case opt_umask: + if (match_octal(&args[0], &tmp)) { + printk(KERN_ERR "hfs: umask requires a value\n"); + return 0; + } + sbi->umask = (umode_t)tmp; + break; + case opt_uid: + if (match_int(&args[0], &tmp)) { + printk(KERN_ERR "hfs: uid requires an argument\n"); + return 0; + } + sbi->uid = (uid_t)tmp; + break; + case opt_gid: + if (match_int(&args[0], &tmp)) { + printk(KERN_ERR "hfs: gid requires an argument\n"); + return 0; + } + sbi->gid = (gid_t)tmp; + break; + case opt_part: + if (match_int(&args[0], &sbi->part)) { + printk(KERN_ERR "hfs: part requires an argument\n"); + return 0; + } + break; + case opt_session: + if (match_int(&args[0], &sbi->session)) { + printk(KERN_ERR "hfs: session requires an argument\n"); + return 0; + } + break; + case opt_nls: + if (sbi->nls) { + printk(KERN_ERR "hfs: unable to change nls mapping\n"); + return 0; + } + p = match_strdup(&args[0]); + if (p) + sbi->nls = load_nls(p); + if (!sbi->nls) { + printk(KERN_ERR "hfs: unable to load " + "nls mapping \"%s\"\n", + p); + kfree(p); + return 0; + } + kfree(p); + break; + case opt_decompose: + clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); + break; + case opt_nodecompose: + set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); + break; + case opt_barrier: + clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_nobarrier: + set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_force: + set_bit(HFSPLUS_SB_FORCE, &sbi->flags); + break; + default: + return 0; + } + } + +done: + if (!sbi->nls) { + /* try utf8 first, as this is the old default behaviour */ + sbi->nls = load_nls("utf8"); + if (!sbi->nls) + sbi->nls = load_nls_default(); + if (!sbi->nls) + return 0; + } + + return 1; +} + +int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb); + + if (sbi->creator != HFSPLUS_DEF_CR_TYPE) + seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); + if (sbi->type != HFSPLUS_DEF_CR_TYPE) + seq_printf(seq, ",type=%.4s", (char *)&sbi->type); + seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, + sbi->uid, sbi->gid); + if (sbi->part >= 0) + seq_printf(seq, ",part=%u", sbi->part); + if (sbi->session >= 0) + seq_printf(seq, ",session=%u", sbi->session); + if (sbi->nls) + seq_printf(seq, ",nls=%s", sbi->nls->charset); + if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) + seq_printf(seq, ",nodecompose"); + if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + seq_printf(seq, ",nobarrier"); + return 0; +} diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c new file mode 100644 index 00000000..eb355d81 --- /dev/null +++ b/fs/hfsplus/part_tbl.c @@ -0,0 +1,157 @@ +/* + * linux/fs/hfsplus/part_tbl.c + * + * Copyright (C) 1996-1997 Paul H. Hargrove + * This file may be distributed under the terms of + * the GNU General Public License. 
+ * + * Original code to handle the new style Mac partition table based on + * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). + * + * In function preconditions the term "valid" applied to a pointer to + * a structure means that the pointer is non-NULL and the structure it + * points to has all fields initialized to consistent values. + * + */ + +#include +#include "hfsplus_fs.h" + +/* offsets to various blocks */ +#define HFS_DD_BLK 0 /* Driver Descriptor block */ +#define HFS_PMAP_BLK 1 /* First block of partition map */ +#define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ + +/* magic numbers for various disk blocks */ +#define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor map */ +#define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ +#define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ +#define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ +#define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ + +/* + * The new style Mac partition map + * + * For each partition on the media there is a physical block (512-byte + * block) containing one of these structures. These blocks are + * contiguous starting at block 1. + */ +struct new_pmap { + __be16 pmSig; /* signature */ + __be16 reSigPad; /* padding */ + __be32 pmMapBlkCnt; /* partition blocks count */ + __be32 pmPyPartStart; /* physical block start of partition */ + __be32 pmPartBlkCnt; /* physical block count of partition */ + u8 pmPartName[32]; /* (null terminated?) string + giving the name of this + partition */ + u8 pmPartType[32]; /* (null terminated?) string + giving the type of this + partition */ + /* a bunch more stuff we don't need */ +} __packed; + +/* + * The old style Mac partition map + * + * The partition map consists for a 2-byte signature followed by an + * array of these structures. The map is terminated with an all-zero + * one of these. + */ +struct old_pmap { + __be16 pdSig; /* Signature bytes */ + struct old_pmap_entry { + __be32 pdStart; + __be32 pdSize; + __be32 pdFSID; + } pdEntry[42]; +} __packed; + +static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm, + sector_t *part_start, sector_t *part_size) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + int i; + + for (i = 0; i < 42; i++) { + struct old_pmap_entry *p = &pm->pdEntry[i]; + + if (p->pdStart && p->pdSize && + p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && + (sbi->part < 0 || sbi->part == i)) { + *part_start += be32_to_cpu(p->pdStart); + *part_size = be32_to_cpu(p->pdSize); + return 0; + } + } + + return -ENOENT; +} + +static int hfs_parse_new_pmap(struct super_block *sb, void *buf, + struct new_pmap *pm, sector_t *part_start, sector_t *part_size) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + int size = be32_to_cpu(pm->pmMapBlkCnt); + int buf_size = hfsplus_min_io_size(sb); + int res; + int i = 0; + + do { + if (!memcmp(pm->pmPartType, "Apple_HFS", 9) && + (sbi->part < 0 || sbi->part == i)) { + *part_start += be32_to_cpu(pm->pmPyPartStart); + *part_size = be32_to_cpu(pm->pmPartBlkCnt); + return 0; + } + + if (++i >= size) + return -ENOENT; + + pm = (struct new_pmap *)((u8 *)pm + HFSPLUS_SECTOR_SIZE); + if ((u8 *)pm - (u8 *)buf >= buf_size) { + res = hfsplus_submit_bio(sb, + *part_start + HFS_PMAP_BLK + i, + buf, (void **)&pm, READ); + if (res) + return res; + } + } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC)); + + return -ENOENT; +} + +/* + * Parse the partition map looking for the start and length of a + * HFS/HFS+ partition. 
+ */ +int hfs_part_find(struct super_block *sb, + sector_t *part_start, sector_t *part_size) +{ + void *buf, *data; + int res; + + buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, + buf, &data, READ); + if (res) + goto out; + + switch (be16_to_cpu(*((__be16 *)data))) { + case HFS_OLD_PMAP_MAGIC: + res = hfs_parse_old_pmap(sb, data, part_start, part_size); + break; + case HFS_NEW_PMAP_MAGIC: + res = hfs_parse_new_pmap(sb, buf, data, part_start, part_size); + break; + default: + res = -ENOENT; + break; + } +out: + kfree(buf); + return res; +} diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c new file mode 100644 index 00000000..c3a76fd2 --- /dev/null +++ b/fs/hfsplus/super.c @@ -0,0 +1,593 @@ +/* + * linux/fs/hfsplus/super.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct inode *hfsplus_alloc_inode(struct super_block *sb); +static void hfsplus_destroy_inode(struct inode *inode); + +#include "hfsplus_fs.h" + +static int hfsplus_system_read_inode(struct inode *inode) +{ + struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr; + + switch (inode->i_ino) { + case HFSPLUS_EXT_CNID: + hfsplus_inode_read_fork(inode, &vhdr->ext_file); + inode->i_mapping->a_ops = &hfsplus_btree_aops; + break; + case HFSPLUS_CAT_CNID: + hfsplus_inode_read_fork(inode, &vhdr->cat_file); + inode->i_mapping->a_ops = &hfsplus_btree_aops; + break; + case HFSPLUS_ALLOC_CNID: + hfsplus_inode_read_fork(inode, &vhdr->alloc_file); + inode->i_mapping->a_ops = &hfsplus_aops; + break; + case HFSPLUS_START_CNID: + hfsplus_inode_read_fork(inode, &vhdr->start_file); + break; + case HFSPLUS_ATTR_CNID: + hfsplus_inode_read_fork(inode, &vhdr->attr_file); + inode->i_mapping->a_ops = &hfsplus_btree_aops; + break; + default: + return -EIO; + } + + return 0; +} + +struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) +{ + struct hfs_find_data fd; + struct inode *inode; + int err; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); + mutex_init(&HFSPLUS_I(inode)->extents_lock); + HFSPLUS_I(inode)->flags = 0; + HFSPLUS_I(inode)->extent_state = 0; + HFSPLUS_I(inode)->rsrc_inode = NULL; + atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) { + hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + } else { + err = hfsplus_system_read_inode(inode); + } + + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + + unlock_new_inode(inode); + return inode; +} + +static int hfsplus_system_write_inode(struct inode *inode) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; + struct hfsplus_fork_raw *fork; + struct hfs_btree *tree = NULL; + + switch (inode->i_ino) { + case HFSPLUS_EXT_CNID: + fork = &vhdr->ext_file; + tree = sbi->ext_tree; + break; + case HFSPLUS_CAT_CNID: + fork = &vhdr->cat_file; + tree = sbi->cat_tree; + break; + case HFSPLUS_ALLOC_CNID: + fork = &vhdr->alloc_file; + break; + case HFSPLUS_START_CNID: + fork = &vhdr->start_file; + break; + case HFSPLUS_ATTR_CNID: + fork = 
&vhdr->attr_file; + tree = sbi->attr_tree; + default: + return -EIO; + } + + if (fork->total_size != cpu_to_be64(inode->i_size)) { + set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags); + inode->i_sb->s_dirt = 1; + } + hfsplus_inode_write_fork(inode, fork); + if (tree) + hfs_btree_write(tree); + return 0; +} + +static int hfsplus_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + + hfsplus_ext_write_extent(inode); + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) + return hfsplus_cat_write_inode(inode); + else + return hfsplus_system_write_inode(inode); +} + +static void hfsplus_evict_inode(struct inode *inode) +{ + dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (HFSPLUS_IS_RSRC(inode)) { + HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; + iput(HFSPLUS_I(inode)->rsrc_inode); + } +} + +int hfsplus_sync_fs(struct super_block *sb, int wait) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; + int write_backup = 0; + int error, error2; + + if (!wait) + return 0; + + dprint(DBG_SUPER, "hfsplus_write_super\n"); + + sb->s_dirt = 0; + + /* + * Explicitly write out the special metadata inodes. + * + * While these special inodes are marked as hashed and written + * out peridocically by the flusher threads we redirty them + * during writeout of normal inodes, and thus the life lock + * prevents us from getting the latest state to disk. + */ + error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); + error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); + if (!error) + error = error2; + error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); + if (!error) + error = error2; + + mutex_lock(&sbi->vh_mutex); + mutex_lock(&sbi->alloc_mutex); + vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); + vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); + vhdr->folder_count = cpu_to_be32(sbi->folder_count); + vhdr->file_count = cpu_to_be32(sbi->file_count); + + if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { + memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); + write_backup = 1; + } + + error2 = hfsplus_submit_bio(sb, + sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, NULL, WRITE_SYNC); + if (!error) + error = error2; + if (!write_backup) + goto out; + + error2 = hfsplus_submit_bio(sb, + sbi->part_start + sbi->sect_count - 2, + sbi->s_backup_vhdr_buf, NULL, WRITE_SYNC); + if (!error) + error2 = error; +out: + mutex_unlock(&sbi->alloc_mutex); + mutex_unlock(&sbi->vh_mutex); + + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + + return error; +} + +static void hfsplus_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + hfsplus_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static void hfsplus_put_super(struct super_block *sb) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + + dprint(DBG_SUPER, "hfsplus_put_super\n"); + + if (!sb->s_fs_info) + return; + + if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { + struct hfsplus_vh *vhdr = sbi->s_vhdr; + + vhdr->modify_date = hfsp_now2mt(); + vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); + vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); + + hfsplus_sync_fs(sb, 1); + } + + hfs_btree_close(sbi->cat_tree); + hfs_btree_close(sbi->ext_tree); + iput(sbi->alloc_file); + 
iput(sbi->hidden_dir); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); + unload_nls(sbi->nls); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = HFSPLUS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->total_blocks << sbi->fs_shift; + buf->f_bfree = sbi->free_blocks << sbi->fs_shift; + buf->f_bavail = buf->f_bfree; + buf->f_files = 0xFFFFFFFF; + buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = HFSPLUS_MAX_STRLEN; + + return 0; +} + +static int hfsplus_remount(struct super_block *sb, int *flags, char *data) +{ + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + if (!(*flags & MS_RDONLY)) { + struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; + int force = 0; + + if (!hfsplus_parse_options_remount(data, &force)) + return -EINVAL; + + if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { + printk(KERN_WARNING "hfs: filesystem was " + "not cleanly unmounted, " + "running fsck.hfsplus is recommended. " + "leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } else if (force) { + /* nothing */ + } else if (vhdr->attributes & + cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { + printk(KERN_WARNING "hfs: filesystem is marked locked, " + "leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } else if (vhdr->attributes & + cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { + printk(KERN_WARNING "hfs: filesystem is " + "marked journaled, " + "leaving read-only.\n"); + sb->s_flags |= MS_RDONLY; + *flags |= MS_RDONLY; + } + } + return 0; +} + +static const struct super_operations hfsplus_sops = { + .alloc_inode = hfsplus_alloc_inode, + .destroy_inode = hfsplus_destroy_inode, + .write_inode = hfsplus_write_inode, + .evict_inode = hfsplus_evict_inode, + .put_super = hfsplus_put_super, + .write_super = hfsplus_write_super, + .sync_fs = hfsplus_sync_fs, + .statfs = hfsplus_statfs, + .remount_fs = hfsplus_remount, + .show_options = hfsplus_show_options, +}; + +static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) +{ + struct hfsplus_vh *vhdr; + struct hfsplus_sb_info *sbi; + hfsplus_cat_entry entry; + struct hfs_find_data fd; + struct inode *root, *inode; + struct qstr str; + struct nls_table *nls = NULL; + int err; + + err = -EINVAL; + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto out; + + sb->s_fs_info = sbi; + mutex_init(&sbi->alloc_mutex); + mutex_init(&sbi->vh_mutex); + hfsplus_fill_defaults(sbi); + + err = -EINVAL; + if (!hfsplus_parse_options(data, sbi)) { + printk(KERN_ERR "hfs: unable to parse mount options\n"); + goto out_unload_nls; + } + + /* temporarily use utf8 to correctly find the hidden dir below */ + nls = sbi->nls; + sbi->nls = load_nls("utf8"); + if (!sbi->nls) { + printk(KERN_ERR "hfs: unable to load nls for utf8\n"); + goto out_unload_nls; + } + + /* Grab the volume header */ + if (hfsplus_read_wrapper(sb)) { + if (!silent) + printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); + goto out_unload_nls; + } + vhdr = sbi->s_vhdr; + + /* Copy parts of the volume header into the superblock */ + sb->s_magic = HFSPLUS_VOLHEAD_SIG; + if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || + be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { + 
printk(KERN_ERR "hfs: wrong filesystem version\n"); + goto out_free_vhdr; + } + sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); + sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); + sbi->next_cnid = be32_to_cpu(vhdr->next_cnid); + sbi->file_count = be32_to_cpu(vhdr->file_count); + sbi->folder_count = be32_to_cpu(vhdr->folder_count); + sbi->data_clump_blocks = + be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->data_clump_blocks) + sbi->data_clump_blocks = 1; + sbi->rsrc_clump_blocks = + be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->rsrc_clump_blocks) + sbi->rsrc_clump_blocks = 1; + + /* Set up operations so we can load metadata */ + sb->s_op = &hfsplus_sops; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { + printk(KERN_WARNING "hfs: Filesystem was " + "not cleanly unmounted, " + "running fsck.hfsplus is recommended. " + "mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { + /* nothing */ + } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { + printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_WARNING "hfs: write access to " + "a journaled filesystem is not supported, " + "use the force option at your own risk, " + "mounting read-only.\n"); + sb->s_flags |= MS_RDONLY; + } + + /* Load metadata objects (B*Trees) */ + sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); + if (!sbi->ext_tree) { + printk(KERN_ERR "hfs: failed to load extents file\n"); + goto out_free_vhdr; + } + sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); + if (!sbi->cat_tree) { + printk(KERN_ERR "hfs: failed to load catalog file\n"); + goto out_close_ext_tree; + } + + inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); + if (IS_ERR(inode)) { + printk(KERN_ERR "hfs: failed to load allocation file\n"); + err = PTR_ERR(inode); + goto out_close_cat_tree; + } + sbi->alloc_file = inode; + + /* Load the root directory */ + root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); + if (IS_ERR(root)) { + printk(KERN_ERR "hfs: failed to load root directory\n"); + err = PTR_ERR(root); + goto out_put_alloc_file; + } + + str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; + str.name = HFSP_HIDDENDIR_NAME; + hfs_find_init(sbi->cat_tree, &fd); + hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); + if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { + hfs_find_exit(&fd); + if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) + goto out_put_root; + inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_put_root; + } + sbi->hidden_dir = inode; + } else + hfs_find_exit(&fd); + + if (!(sb->s_flags & MS_RDONLY)) { + /* + * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused + * all three are registered with Apple for our use + */ + vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); + vhdr->modify_date = hfsp_now2mt(); + be32_add_cpu(&vhdr->write_count, 1); + vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); + vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); + hfsplus_sync_fs(sb, 1); + + if (!sbi->hidden_dir) { + mutex_lock(&sbi->vh_mutex); + sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, + sbi->hidden_dir); + mutex_unlock(&sbi->vh_mutex); + + 
hfsplus_mark_inode_dirty(sbi->hidden_dir, + HFSPLUS_I_CAT_DIRTY); + } + } + + sb->s_d_op = &hfsplus_dentry_operations; + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + err = -ENOMEM; + goto out_put_hidden_dir; + } + + unload_nls(sbi->nls); + sbi->nls = nls; + return 0; + +out_put_hidden_dir: + iput(sbi->hidden_dir); +out_put_root: + iput(root); +out_put_alloc_file: + iput(sbi->alloc_file); +out_close_cat_tree: + hfs_btree_close(sbi->cat_tree); +out_close_ext_tree: + hfs_btree_close(sbi->ext_tree); +out_free_vhdr: + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); +out_unload_nls: + unload_nls(sbi->nls); + unload_nls(nls); + kfree(sbi); +out: + return err; +} + +MODULE_AUTHOR("Brad Boyer"); +MODULE_DESCRIPTION("Extended Macintosh Filesystem"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *hfsplus_inode_cachep; + +static struct inode *hfsplus_alloc_inode(struct super_block *sb) +{ + struct hfsplus_inode_info *i; + + i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL); + return i ? &i->vfs_inode : NULL; +} + +static void hfsplus_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); +} + +static void hfsplus_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfsplus_i_callback); +} + +#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) + +static struct dentry *hfsplus_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); +} + +static struct file_system_type hfsplus_fs_type = { + .owner = THIS_MODULE, + .name = "hfsplus", + .mount = hfsplus_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void hfsplus_init_once(void *p) +{ + struct hfsplus_inode_info *i = p; + + inode_init_once(&i->vfs_inode); +} + +static int __init init_hfsplus_fs(void) +{ + int err; + + hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", + HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, + hfsplus_init_once); + if (!hfsplus_inode_cachep) + return -ENOMEM; + err = register_filesystem(&hfsplus_fs_type); + if (err) + kmem_cache_destroy(hfsplus_inode_cachep); + return err; +} + +static void __exit exit_hfsplus_fs(void) +{ + unregister_filesystem(&hfsplus_fs_type); + kmem_cache_destroy(hfsplus_inode_cachep); +} + +module_init(init_hfsplus_fs) +module_exit(exit_hfsplus_fs) diff --git a/fs/hfsplus/tables.c b/fs/hfsplus/tables.c new file mode 100644 index 00000000..1b911730 --- /dev/null +++ b/fs/hfsplus/tables.c @@ -0,0 +1,3245 @@ +/* + * linux/fs/hfsplus/tables.c + * + * Various data tables + */ + +#include "hfsplus_fs.h" + +/* + * Unicode case folding table taken from Apple Technote #1150 + * (HFS Plus Volume Format) + */ + +u16 hfsplus_case_fold_table[] = { +/* + * The lower case table consists of a 256-entry high-byte table followed by + * some number of 256-entry subtables. The high-byte table contains either an + * offset to the subtable for characters with that high byte or zero, which + * means that there are no case mappings or ignored characters in that block. + * Ignored characters are mapped to zero. 
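Before the raw data, it may help to see how a lookup through the two-level layout just described proceeds; the driver performs this step when folding names for case-insensitive comparison (its helper lives in unicode.c). A minimal, self-contained sketch using a toy table of the same shape; toy_fold_table and toy_fold are illustrative names, not part of this patch:

#include <stdint.h>
#include <stdio.h>

/*
 * Miniature table with the same two-level layout as hfsplus_case_fold_table:
 * a 256-entry high-byte index (here only high byte 0x00 gets a subtable, at
 * offset 0x0100) followed by one 256-entry subtable holding the folded value
 * for each low byte.  In the index, 0 means "no mappings or ignorables in
 * this block"; in a subtable, 0 means "ignore the character".
 */
static uint16_t toy_fold_table[0x100 + 0x100];

static uint16_t toy_fold(uint16_t c)
{
	uint16_t off = toy_fold_table[c >> 8];

	if (!off)			/* no subtable: folds to itself */
		return c;
	return toy_fold_table[off + (c & 0xff)];
}

int main(void)
{
	int c;

	toy_fold_table[0x00] = 0x0100;	/* high byte 0x00 -> subtable at 0x0100 */
	for (c = 0; c < 0x100; c++)	/* 'A'..'Z' fold to 'a'..'z', rest to itself */
		toy_fold_table[0x0100 + c] =
			(c >= 'A' && c <= 'Z') ? c + 0x20 : c;

	printf("fold('Q') = '%c'\n", toy_fold('Q'));		/* prints q */
	printf("fold(0x3042) = 0x%04x\n", (unsigned)toy_fold(0x3042));	/* block unmapped: unchanged */
	return 0;
}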
+ */ + + // High-byte indices ( == 0 iff no case mapping and no ignorables ) + + + /* 0 */ 0x0100, 0x0200, 0x0000, 0x0300, 0x0400, 0x0500, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 1 */ 0x0600, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 2 */ 0x0700, 0x0800, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 3 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 4 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 5 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 6 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 7 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 9 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* A */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* B */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* C */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* D */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* E */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* F */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0900, 0x0A00, + + // Table 1 (for high byte 0x00) + + /* 0 */ 0xFFFF, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, + /* 1 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, + /* 2 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, + /* 3 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, + /* 4 */ 0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* 5 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, + /* 6 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* 7 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, + /* 8 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + 
0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, + /* 9 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, + /* A */ 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, + 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, + /* B */ 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, + 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, + /* C */ 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00E6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + /* D */ 0x00F0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, + 0x00F8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00FE, 0x00DF, + /* E */ 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + /* F */ 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, + + // Table 2 (for high byte 0x01) + + /* 0 */ 0x0100, 0x0101, 0x0102, 0x0103, 0x0104, 0x0105, 0x0106, 0x0107, + 0x0108, 0x0109, 0x010A, 0x010B, 0x010C, 0x010D, 0x010E, 0x010F, + /* 1 */ 0x0111, 0x0111, 0x0112, 0x0113, 0x0114, 0x0115, 0x0116, 0x0117, + 0x0118, 0x0119, 0x011A, 0x011B, 0x011C, 0x011D, 0x011E, 0x011F, + /* 2 */ 0x0120, 0x0121, 0x0122, 0x0123, 0x0124, 0x0125, 0x0127, 0x0127, + 0x0128, 0x0129, 0x012A, 0x012B, 0x012C, 0x012D, 0x012E, 0x012F, + /* 3 */ 0x0130, 0x0131, 0x0133, 0x0133, 0x0134, 0x0135, 0x0136, 0x0137, + 0x0138, 0x0139, 0x013A, 0x013B, 0x013C, 0x013D, 0x013E, 0x0140, + /* 4 */ 0x0140, 0x0142, 0x0142, 0x0143, 0x0144, 0x0145, 0x0146, 0x0147, + 0x0148, 0x0149, 0x014B, 0x014B, 0x014C, 0x014D, 0x014E, 0x014F, + /* 5 */ 0x0150, 0x0151, 0x0153, 0x0153, 0x0154, 0x0155, 0x0156, 0x0157, + 0x0158, 0x0159, 0x015A, 0x015B, 0x015C, 0x015D, 0x015E, 0x015F, + /* 6 */ 0x0160, 0x0161, 0x0162, 0x0163, 0x0164, 0x0165, 0x0167, 0x0167, + 0x0168, 0x0169, 0x016A, 0x016B, 0x016C, 0x016D, 0x016E, 0x016F, + /* 7 */ 0x0170, 0x0171, 0x0172, 0x0173, 0x0174, 0x0175, 0x0176, 0x0177, + 0x0178, 0x0179, 0x017A, 0x017B, 0x017C, 0x017D, 0x017E, 0x017F, + /* 8 */ 0x0180, 0x0253, 0x0183, 0x0183, 0x0185, 0x0185, 0x0254, 0x0188, + 0x0188, 0x0256, 0x0257, 0x018C, 0x018C, 0x018D, 0x01DD, 0x0259, + /* 9 */ 0x025B, 0x0192, 0x0192, 0x0260, 0x0263, 0x0195, 0x0269, 0x0268, + 0x0199, 0x0199, 0x019A, 0x019B, 0x026F, 0x0272, 0x019E, 0x0275, + /* A */ 0x01A0, 0x01A1, 0x01A3, 0x01A3, 0x01A5, 0x01A5, 0x01A6, 0x01A8, + 0x01A8, 0x0283, 0x01AA, 0x01AB, 0x01AD, 0x01AD, 0x0288, 0x01AF, + /* B */ 0x01B0, 0x028A, 0x028B, 0x01B4, 0x01B4, 0x01B6, 0x01B6, 0x0292, + 0x01B9, 0x01B9, 0x01BA, 0x01BB, 0x01BD, 0x01BD, 0x01BE, 0x01BF, + /* C */ 0x01C0, 0x01C1, 0x01C2, 0x01C3, 0x01C6, 0x01C6, 0x01C6, 0x01C9, + 0x01C9, 0x01C9, 0x01CC, 0x01CC, 0x01CC, 0x01CD, 0x01CE, 0x01CF, + /* D */ 0x01D0, 0x01D1, 0x01D2, 0x01D3, 0x01D4, 0x01D5, 0x01D6, 0x01D7, + 0x01D8, 0x01D9, 0x01DA, 0x01DB, 0x01DC, 0x01DD, 0x01DE, 0x01DF, + /* E */ 0x01E0, 0x01E1, 0x01E2, 0x01E3, 0x01E5, 0x01E5, 0x01E6, 0x01E7, + 0x01E8, 0x01E9, 0x01EA, 0x01EB, 0x01EC, 0x01ED, 0x01EE, 0x01EF, + /* F */ 0x01F0, 0x01F3, 0x01F3, 0x01F3, 0x01F4, 0x01F5, 0x01F6, 0x01F7, + 0x01F8, 0x01F9, 0x01FA, 0x01FB, 0x01FC, 0x01FD, 0x01FE, 0x01FF, + + // Table 3 (for high byte 0x03) + + /* 0 */ 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, + 0x0308, 0x0309, 0x030A, 0x030B, 0x030C, 0x030D, 0x030E, 0x030F, + /* 1 */ 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 
0x0315, 0x0316, 0x0317, + 0x0318, 0x0319, 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F, + /* 2 */ 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, + 0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F, + /* 3 */ 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, + 0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F, + /* 4 */ 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347, + 0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, 0x034E, 0x034F, + /* 5 */ 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, + 0x0358, 0x0359, 0x035A, 0x035B, 0x035C, 0x035D, 0x035E, 0x035F, + /* 6 */ 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, + 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F, + /* 7 */ 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377, + 0x0378, 0x0379, 0x037A, 0x037B, 0x037C, 0x037D, 0x037E, 0x037F, + /* 8 */ 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387, + 0x0388, 0x0389, 0x038A, 0x038B, 0x038C, 0x038D, 0x038E, 0x038F, + /* 9 */ 0x0390, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, + 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* A */ 0x03C0, 0x03C1, 0x03A2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, + 0x03C8, 0x03C9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF, + /* B */ 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, + 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* C */ 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, + 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0x03CF, + /* D */ 0x03D0, 0x03D1, 0x03D2, 0x03D3, 0x03D4, 0x03D5, 0x03D6, 0x03D7, + 0x03D8, 0x03D9, 0x03DA, 0x03DB, 0x03DC, 0x03DD, 0x03DE, 0x03DF, + /* E */ 0x03E0, 0x03E1, 0x03E3, 0x03E3, 0x03E5, 0x03E5, 0x03E7, 0x03E7, + 0x03E9, 0x03E9, 0x03EB, 0x03EB, 0x03ED, 0x03ED, 0x03EF, 0x03EF, + /* F */ 0x03F0, 0x03F1, 0x03F2, 0x03F3, 0x03F4, 0x03F5, 0x03F6, 0x03F7, + 0x03F8, 0x03F9, 0x03FA, 0x03FB, 0x03FC, 0x03FD, 0x03FE, 0x03FF, + + // Table 4 (for high byte 0x04) + + /* 0 */ 0x0400, 0x0401, 0x0452, 0x0403, 0x0454, 0x0455, 0x0456, 0x0407, + 0x0458, 0x0459, 0x045A, 0x045B, 0x040C, 0x040D, 0x040E, 0x045F, + /* 1 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0419, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + /* 2 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + /* 3 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + /* 4 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + /* 5 */ 0x0450, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, + 0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x045D, 0x045E, 0x045F, + /* 6 */ 0x0461, 0x0461, 0x0463, 0x0463, 0x0465, 0x0465, 0x0467, 0x0467, + 0x0469, 0x0469, 0x046B, 0x046B, 0x046D, 0x046D, 0x046F, 0x046F, + /* 7 */ 0x0471, 0x0471, 0x0473, 0x0473, 0x0475, 0x0475, 0x0476, 0x0477, + 0x0479, 0x0479, 0x047B, 0x047B, 0x047D, 0x047D, 0x047F, 0x047F, + /* 8 */ 0x0481, 0x0481, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, + 0x0488, 0x0489, 0x048A, 0x048B, 0x048C, 0x048D, 0x048E, 0x048F, + /* 9 */ 0x0491, 0x0491, 0x0493, 0x0493, 0x0495, 0x0495, 0x0497, 0x0497, + 0x0499, 0x0499, 0x049B, 0x049B, 0x049D, 0x049D, 0x049F, 0x049F, + /* A */ 0x04A1, 0x04A1, 0x04A3, 0x04A3, 0x04A5, 0x04A5, 0x04A7, 
0x04A7, + 0x04A9, 0x04A9, 0x04AB, 0x04AB, 0x04AD, 0x04AD, 0x04AF, 0x04AF, + /* B */ 0x04B1, 0x04B1, 0x04B3, 0x04B3, 0x04B5, 0x04B5, 0x04B7, 0x04B7, + 0x04B9, 0x04B9, 0x04BB, 0x04BB, 0x04BD, 0x04BD, 0x04BF, 0x04BF, + /* C */ 0x04C0, 0x04C1, 0x04C2, 0x04C4, 0x04C4, 0x04C5, 0x04C6, 0x04C8, + 0x04C8, 0x04C9, 0x04CA, 0x04CC, 0x04CC, 0x04CD, 0x04CE, 0x04CF, + /* D */ 0x04D0, 0x04D1, 0x04D2, 0x04D3, 0x04D4, 0x04D5, 0x04D6, 0x04D7, + 0x04D8, 0x04D9, 0x04DA, 0x04DB, 0x04DC, 0x04DD, 0x04DE, 0x04DF, + /* E */ 0x04E0, 0x04E1, 0x04E2, 0x04E3, 0x04E4, 0x04E5, 0x04E6, 0x04E7, + 0x04E8, 0x04E9, 0x04EA, 0x04EB, 0x04EC, 0x04ED, 0x04EE, 0x04EF, + /* F */ 0x04F0, 0x04F1, 0x04F2, 0x04F3, 0x04F4, 0x04F5, 0x04F6, 0x04F7, + 0x04F8, 0x04F9, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF, + + // Table 5 (for high byte 0x05) + + /* 0 */ 0x0500, 0x0501, 0x0502, 0x0503, 0x0504, 0x0505, 0x0506, 0x0507, + 0x0508, 0x0509, 0x050A, 0x050B, 0x050C, 0x050D, 0x050E, 0x050F, + /* 1 */ 0x0510, 0x0511, 0x0512, 0x0513, 0x0514, 0x0515, 0x0516, 0x0517, + 0x0518, 0x0519, 0x051A, 0x051B, 0x051C, 0x051D, 0x051E, 0x051F, + /* 2 */ 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527, + 0x0528, 0x0529, 0x052A, 0x052B, 0x052C, 0x052D, 0x052E, 0x052F, + /* 3 */ 0x0530, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, + 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, + /* 4 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, + 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, + /* 5 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0557, + 0x0558, 0x0559, 0x055A, 0x055B, 0x055C, 0x055D, 0x055E, 0x055F, + /* 6 */ 0x0560, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, + 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, + /* 7 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, + 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, + /* 8 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0587, + 0x0588, 0x0589, 0x058A, 0x058B, 0x058C, 0x058D, 0x058E, 0x058F, + /* 9 */ 0x0590, 0x0591, 0x0592, 0x0593, 0x0594, 0x0595, 0x0596, 0x0597, + 0x0598, 0x0599, 0x059A, 0x059B, 0x059C, 0x059D, 0x059E, 0x059F, + /* A */ 0x05A0, 0x05A1, 0x05A2, 0x05A3, 0x05A4, 0x05A5, 0x05A6, 0x05A7, + 0x05A8, 0x05A9, 0x05AA, 0x05AB, 0x05AC, 0x05AD, 0x05AE, 0x05AF, + /* B */ 0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, + 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF, + /* C */ 0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05C4, 0x05C5, 0x05C6, 0x05C7, + 0x05C8, 0x05C9, 0x05CA, 0x05CB, 0x05CC, 0x05CD, 0x05CE, 0x05CF, + /* D */ 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, + 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF, + /* E */ 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, + 0x05E8, 0x05E9, 0x05EA, 0x05EB, 0x05EC, 0x05ED, 0x05EE, 0x05EF, + /* F */ 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0x05F5, 0x05F6, 0x05F7, + 0x05F8, 0x05F9, 0x05FA, 0x05FB, 0x05FC, 0x05FD, 0x05FE, 0x05FF, + + // Table 6 (for high byte 0x10) + + /* 0 */ 0x1000, 0x1001, 0x1002, 0x1003, 0x1004, 0x1005, 0x1006, 0x1007, + 0x1008, 0x1009, 0x100A, 0x100B, 0x100C, 0x100D, 0x100E, 0x100F, + /* 1 */ 0x1010, 0x1011, 0x1012, 0x1013, 0x1014, 0x1015, 0x1016, 0x1017, + 0x1018, 0x1019, 0x101A, 0x101B, 0x101C, 0x101D, 0x101E, 0x101F, + /* 2 */ 0x1020, 0x1021, 0x1022, 0x1023, 0x1024, 0x1025, 0x1026, 0x1027, + 0x1028, 0x1029, 0x102A, 0x102B, 0x102C, 0x102D, 0x102E, 0x102F, + /* 3 */ 0x1030, 0x1031, 0x1032, 0x1033, 
0x1034, 0x1035, 0x1036, 0x1037, + 0x1038, 0x1039, 0x103A, 0x103B, 0x103C, 0x103D, 0x103E, 0x103F, + /* 4 */ 0x1040, 0x1041, 0x1042, 0x1043, 0x1044, 0x1045, 0x1046, 0x1047, + 0x1048, 0x1049, 0x104A, 0x104B, 0x104C, 0x104D, 0x104E, 0x104F, + /* 5 */ 0x1050, 0x1051, 0x1052, 0x1053, 0x1054, 0x1055, 0x1056, 0x1057, + 0x1058, 0x1059, 0x105A, 0x105B, 0x105C, 0x105D, 0x105E, 0x105F, + /* 6 */ 0x1060, 0x1061, 0x1062, 0x1063, 0x1064, 0x1065, 0x1066, 0x1067, + 0x1068, 0x1069, 0x106A, 0x106B, 0x106C, 0x106D, 0x106E, 0x106F, + /* 7 */ 0x1070, 0x1071, 0x1072, 0x1073, 0x1074, 0x1075, 0x1076, 0x1077, + 0x1078, 0x1079, 0x107A, 0x107B, 0x107C, 0x107D, 0x107E, 0x107F, + /* 8 */ 0x1080, 0x1081, 0x1082, 0x1083, 0x1084, 0x1085, 0x1086, 0x1087, + 0x1088, 0x1089, 0x108A, 0x108B, 0x108C, 0x108D, 0x108E, 0x108F, + /* 9 */ 0x1090, 0x1091, 0x1092, 0x1093, 0x1094, 0x1095, 0x1096, 0x1097, + 0x1098, 0x1099, 0x109A, 0x109B, 0x109C, 0x109D, 0x109E, 0x109F, + /* A */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, + 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, + /* B */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, + 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, + /* C */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10C6, 0x10C7, + 0x10C8, 0x10C9, 0x10CA, 0x10CB, 0x10CC, 0x10CD, 0x10CE, 0x10CF, + /* D */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, + 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, + /* E */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, + 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, + /* F */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10F6, 0x10F7, + 0x10F8, 0x10F9, 0x10FA, 0x10FB, 0x10FC, 0x10FD, 0x10FE, 0x10FF, + + // Table 7 (for high byte 0x20) + + /* 0 */ 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, + 0x2008, 0x2009, 0x200A, 0x200B, 0x0000, 0x0000, 0x0000, 0x0000, + /* 1 */ 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x2017, + 0x2018, 0x2019, 0x201A, 0x201B, 0x201C, 0x201D, 0x201E, 0x201F, + /* 2 */ 0x2020, 0x2021, 0x2022, 0x2023, 0x2024, 0x2025, 0x2026, 0x2027, + 0x2028, 0x2029, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x202F, + /* 3 */ 0x2030, 0x2031, 0x2032, 0x2033, 0x2034, 0x2035, 0x2036, 0x2037, + 0x2038, 0x2039, 0x203A, 0x203B, 0x203C, 0x203D, 0x203E, 0x203F, + /* 4 */ 0x2040, 0x2041, 0x2042, 0x2043, 0x2044, 0x2045, 0x2046, 0x2047, + 0x2048, 0x2049, 0x204A, 0x204B, 0x204C, 0x204D, 0x204E, 0x204F, + /* 5 */ 0x2050, 0x2051, 0x2052, 0x2053, 0x2054, 0x2055, 0x2056, 0x2057, + 0x2058, 0x2059, 0x205A, 0x205B, 0x205C, 0x205D, 0x205E, 0x205F, + /* 6 */ 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x2065, 0x2066, 0x2067, + 0x2068, 0x2069, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 7 */ 0x2070, 0x2071, 0x2072, 0x2073, 0x2074, 0x2075, 0x2076, 0x2077, + 0x2078, 0x2079, 0x207A, 0x207B, 0x207C, 0x207D, 0x207E, 0x207F, + /* 8 */ 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, + 0x2088, 0x2089, 0x208A, 0x208B, 0x208C, 0x208D, 0x208E, 0x208F, + /* 9 */ 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, + 0x2098, 0x2099, 0x209A, 0x209B, 0x209C, 0x209D, 0x209E, 0x209F, + /* A */ 0x20A0, 0x20A1, 0x20A2, 0x20A3, 0x20A4, 0x20A5, 0x20A6, 0x20A7, + 0x20A8, 0x20A9, 0x20AA, 0x20AB, 0x20AC, 0x20AD, 0x20AE, 0x20AF, + /* B */ 0x20B0, 0x20B1, 0x20B2, 0x20B3, 0x20B4, 0x20B5, 0x20B6, 0x20B7, + 0x20B8, 0x20B9, 0x20BA, 0x20BB, 0x20BC, 0x20BD, 0x20BE, 0x20BF, + /* C */ 0x20C0, 0x20C1, 0x20C2, 0x20C3, 0x20C4, 0x20C5, 
0x20C6, 0x20C7, + 0x20C8, 0x20C9, 0x20CA, 0x20CB, 0x20CC, 0x20CD, 0x20CE, 0x20CF, + /* D */ 0x20D0, 0x20D1, 0x20D2, 0x20D3, 0x20D4, 0x20D5, 0x20D6, 0x20D7, + 0x20D8, 0x20D9, 0x20DA, 0x20DB, 0x20DC, 0x20DD, 0x20DE, 0x20DF, + /* E */ 0x20E0, 0x20E1, 0x20E2, 0x20E3, 0x20E4, 0x20E5, 0x20E6, 0x20E7, + 0x20E8, 0x20E9, 0x20EA, 0x20EB, 0x20EC, 0x20ED, 0x20EE, 0x20EF, + /* F */ 0x20F0, 0x20F1, 0x20F2, 0x20F3, 0x20F4, 0x20F5, 0x20F6, 0x20F7, + 0x20F8, 0x20F9, 0x20FA, 0x20FB, 0x20FC, 0x20FD, 0x20FE, 0x20FF, + + // Table 8 (for high byte 0x21) + + /* 0 */ 0x2100, 0x2101, 0x2102, 0x2103, 0x2104, 0x2105, 0x2106, 0x2107, + 0x2108, 0x2109, 0x210A, 0x210B, 0x210C, 0x210D, 0x210E, 0x210F, + /* 1 */ 0x2110, 0x2111, 0x2112, 0x2113, 0x2114, 0x2115, 0x2116, 0x2117, + 0x2118, 0x2119, 0x211A, 0x211B, 0x211C, 0x211D, 0x211E, 0x211F, + /* 2 */ 0x2120, 0x2121, 0x2122, 0x2123, 0x2124, 0x2125, 0x2126, 0x2127, + 0x2128, 0x2129, 0x212A, 0x212B, 0x212C, 0x212D, 0x212E, 0x212F, + /* 3 */ 0x2130, 0x2131, 0x2132, 0x2133, 0x2134, 0x2135, 0x2136, 0x2137, + 0x2138, 0x2139, 0x213A, 0x213B, 0x213C, 0x213D, 0x213E, 0x213F, + /* 4 */ 0x2140, 0x2141, 0x2142, 0x2143, 0x2144, 0x2145, 0x2146, 0x2147, + 0x2148, 0x2149, 0x214A, 0x214B, 0x214C, 0x214D, 0x214E, 0x214F, + /* 5 */ 0x2150, 0x2151, 0x2152, 0x2153, 0x2154, 0x2155, 0x2156, 0x2157, + 0x2158, 0x2159, 0x215A, 0x215B, 0x215C, 0x215D, 0x215E, 0x215F, + /* 6 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, + 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, + /* 7 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, + 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, + /* 8 */ 0x2180, 0x2181, 0x2182, 0x2183, 0x2184, 0x2185, 0x2186, 0x2187, + 0x2188, 0x2189, 0x218A, 0x218B, 0x218C, 0x218D, 0x218E, 0x218F, + /* 9 */ 0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195, 0x2196, 0x2197, + 0x2198, 0x2199, 0x219A, 0x219B, 0x219C, 0x219D, 0x219E, 0x219F, + /* A */ 0x21A0, 0x21A1, 0x21A2, 0x21A3, 0x21A4, 0x21A5, 0x21A6, 0x21A7, + 0x21A8, 0x21A9, 0x21AA, 0x21AB, 0x21AC, 0x21AD, 0x21AE, 0x21AF, + /* B */ 0x21B0, 0x21B1, 0x21B2, 0x21B3, 0x21B4, 0x21B5, 0x21B6, 0x21B7, + 0x21B8, 0x21B9, 0x21BA, 0x21BB, 0x21BC, 0x21BD, 0x21BE, 0x21BF, + /* C */ 0x21C0, 0x21C1, 0x21C2, 0x21C3, 0x21C4, 0x21C5, 0x21C6, 0x21C7, + 0x21C8, 0x21C9, 0x21CA, 0x21CB, 0x21CC, 0x21CD, 0x21CE, 0x21CF, + /* D */ 0x21D0, 0x21D1, 0x21D2, 0x21D3, 0x21D4, 0x21D5, 0x21D6, 0x21D7, + 0x21D8, 0x21D9, 0x21DA, 0x21DB, 0x21DC, 0x21DD, 0x21DE, 0x21DF, + /* E */ 0x21E0, 0x21E1, 0x21E2, 0x21E3, 0x21E4, 0x21E5, 0x21E6, 0x21E7, + 0x21E8, 0x21E9, 0x21EA, 0x21EB, 0x21EC, 0x21ED, 0x21EE, 0x21EF, + /* F */ 0x21F0, 0x21F1, 0x21F2, 0x21F3, 0x21F4, 0x21F5, 0x21F6, 0x21F7, + 0x21F8, 0x21F9, 0x21FA, 0x21FB, 0x21FC, 0x21FD, 0x21FE, 0x21FF, + + // Table 9 (for high byte 0xFE) + + /* 0 */ 0xFE00, 0xFE01, 0xFE02, 0xFE03, 0xFE04, 0xFE05, 0xFE06, 0xFE07, + 0xFE08, 0xFE09, 0xFE0A, 0xFE0B, 0xFE0C, 0xFE0D, 0xFE0E, 0xFE0F, + /* 1 */ 0xFE10, 0xFE11, 0xFE12, 0xFE13, 0xFE14, 0xFE15, 0xFE16, 0xFE17, + 0xFE18, 0xFE19, 0xFE1A, 0xFE1B, 0xFE1C, 0xFE1D, 0xFE1E, 0xFE1F, + /* 2 */ 0xFE20, 0xFE21, 0xFE22, 0xFE23, 0xFE24, 0xFE25, 0xFE26, 0xFE27, + 0xFE28, 0xFE29, 0xFE2A, 0xFE2B, 0xFE2C, 0xFE2D, 0xFE2E, 0xFE2F, + /* 3 */ 0xFE30, 0xFE31, 0xFE32, 0xFE33, 0xFE34, 0xFE35, 0xFE36, 0xFE37, + 0xFE38, 0xFE39, 0xFE3A, 0xFE3B, 0xFE3C, 0xFE3D, 0xFE3E, 0xFE3F, + /* 4 */ 0xFE40, 0xFE41, 0xFE42, 0xFE43, 0xFE44, 0xFE45, 0xFE46, 0xFE47, + 0xFE48, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C, 0xFE4D, 0xFE4E, 0xFE4F, + /* 5 */ 0xFE50, 0xFE51, 0xFE52, 
0xFE53, 0xFE54, 0xFE55, 0xFE56, 0xFE57, + 0xFE58, 0xFE59, 0xFE5A, 0xFE5B, 0xFE5C, 0xFE5D, 0xFE5E, 0xFE5F, + /* 6 */ 0xFE60, 0xFE61, 0xFE62, 0xFE63, 0xFE64, 0xFE65, 0xFE66, 0xFE67, + 0xFE68, 0xFE69, 0xFE6A, 0xFE6B, 0xFE6C, 0xFE6D, 0xFE6E, 0xFE6F, + /* 7 */ 0xFE70, 0xFE71, 0xFE72, 0xFE73, 0xFE74, 0xFE75, 0xFE76, 0xFE77, + 0xFE78, 0xFE79, 0xFE7A, 0xFE7B, 0xFE7C, 0xFE7D, 0xFE7E, 0xFE7F, + /* 8 */ 0xFE80, 0xFE81, 0xFE82, 0xFE83, 0xFE84, 0xFE85, 0xFE86, 0xFE87, + 0xFE88, 0xFE89, 0xFE8A, 0xFE8B, 0xFE8C, 0xFE8D, 0xFE8E, 0xFE8F, + /* 9 */ 0xFE90, 0xFE91, 0xFE92, 0xFE93, 0xFE94, 0xFE95, 0xFE96, 0xFE97, + 0xFE98, 0xFE99, 0xFE9A, 0xFE9B, 0xFE9C, 0xFE9D, 0xFE9E, 0xFE9F, + /* A */ 0xFEA0, 0xFEA1, 0xFEA2, 0xFEA3, 0xFEA4, 0xFEA5, 0xFEA6, 0xFEA7, + 0xFEA8, 0xFEA9, 0xFEAA, 0xFEAB, 0xFEAC, 0xFEAD, 0xFEAE, 0xFEAF, + /* B */ 0xFEB0, 0xFEB1, 0xFEB2, 0xFEB3, 0xFEB4, 0xFEB5, 0xFEB6, 0xFEB7, + 0xFEB8, 0xFEB9, 0xFEBA, 0xFEBB, 0xFEBC, 0xFEBD, 0xFEBE, 0xFEBF, + /* C */ 0xFEC0, 0xFEC1, 0xFEC2, 0xFEC3, 0xFEC4, 0xFEC5, 0xFEC6, 0xFEC7, + 0xFEC8, 0xFEC9, 0xFECA, 0xFECB, 0xFECC, 0xFECD, 0xFECE, 0xFECF, + /* D */ 0xFED0, 0xFED1, 0xFED2, 0xFED3, 0xFED4, 0xFED5, 0xFED6, 0xFED7, + 0xFED8, 0xFED9, 0xFEDA, 0xFEDB, 0xFEDC, 0xFEDD, 0xFEDE, 0xFEDF, + /* E */ 0xFEE0, 0xFEE1, 0xFEE2, 0xFEE3, 0xFEE4, 0xFEE5, 0xFEE6, 0xFEE7, + 0xFEE8, 0xFEE9, 0xFEEA, 0xFEEB, 0xFEEC, 0xFEED, 0xFEEE, 0xFEEF, + /* F */ 0xFEF0, 0xFEF1, 0xFEF2, 0xFEF3, 0xFEF4, 0xFEF5, 0xFEF6, 0xFEF7, + 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC, 0xFEFD, 0xFEFE, 0x0000, + + // Table 10 (for high byte 0xFF) + + /* 0 */ 0xFF00, 0xFF01, 0xFF02, 0xFF03, 0xFF04, 0xFF05, 0xFF06, 0xFF07, + 0xFF08, 0xFF09, 0xFF0A, 0xFF0B, 0xFF0C, 0xFF0D, 0xFF0E, 0xFF0F, + /* 1 */ 0xFF10, 0xFF11, 0xFF12, 0xFF13, 0xFF14, 0xFF15, 0xFF16, 0xFF17, + 0xFF18, 0xFF19, 0xFF1A, 0xFF1B, 0xFF1C, 0xFF1D, 0xFF1E, 0xFF1F, + /* 2 */ 0xFF20, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, + 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, + /* 3 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, + 0xFF58, 0xFF59, 0xFF5A, 0xFF3B, 0xFF3C, 0xFF3D, 0xFF3E, 0xFF3F, + /* 4 */ 0xFF40, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, + 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, + /* 5 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, + 0xFF58, 0xFF59, 0xFF5A, 0xFF5B, 0xFF5C, 0xFF5D, 0xFF5E, 0xFF5F, + /* 6 */ 0xFF60, 0xFF61, 0xFF62, 0xFF63, 0xFF64, 0xFF65, 0xFF66, 0xFF67, + 0xFF68, 0xFF69, 0xFF6A, 0xFF6B, 0xFF6C, 0xFF6D, 0xFF6E, 0xFF6F, + /* 7 */ 0xFF70, 0xFF71, 0xFF72, 0xFF73, 0xFF74, 0xFF75, 0xFF76, 0xFF77, + 0xFF78, 0xFF79, 0xFF7A, 0xFF7B, 0xFF7C, 0xFF7D, 0xFF7E, 0xFF7F, + /* 8 */ 0xFF80, 0xFF81, 0xFF82, 0xFF83, 0xFF84, 0xFF85, 0xFF86, 0xFF87, + 0xFF88, 0xFF89, 0xFF8A, 0xFF8B, 0xFF8C, 0xFF8D, 0xFF8E, 0xFF8F, + /* 9 */ 0xFF90, 0xFF91, 0xFF92, 0xFF93, 0xFF94, 0xFF95, 0xFF96, 0xFF97, + 0xFF98, 0xFF99, 0xFF9A, 0xFF9B, 0xFF9C, 0xFF9D, 0xFF9E, 0xFF9F, + /* A */ 0xFFA0, 0xFFA1, 0xFFA2, 0xFFA3, 0xFFA4, 0xFFA5, 0xFFA6, 0xFFA7, + 0xFFA8, 0xFFA9, 0xFFAA, 0xFFAB, 0xFFAC, 0xFFAD, 0xFFAE, 0xFFAF, + /* B */ 0xFFB0, 0xFFB1, 0xFFB2, 0xFFB3, 0xFFB4, 0xFFB5, 0xFFB6, 0xFFB7, + 0xFFB8, 0xFFB9, 0xFFBA, 0xFFBB, 0xFFBC, 0xFFBD, 0xFFBE, 0xFFBF, + /* C */ 0xFFC0, 0xFFC1, 0xFFC2, 0xFFC3, 0xFFC4, 0xFFC5, 0xFFC6, 0xFFC7, + 0xFFC8, 0xFFC9, 0xFFCA, 0xFFCB, 0xFFCC, 0xFFCD, 0xFFCE, 0xFFCF, + /* D */ 0xFFD0, 0xFFD1, 0xFFD2, 0xFFD3, 0xFFD4, 0xFFD5, 0xFFD6, 0xFFD7, + 0xFFD8, 0xFFD9, 0xFFDA, 0xFFDB, 0xFFDC, 0xFFDD, 0xFFDE, 0xFFDF, + /* E */ 0xFFE0, 0xFFE1, 0xFFE2, 0xFFE3, 0xFFE4, 
0xFFE5, 0xFFE6, 0xFFE7, + 0xFFE8, 0xFFE9, 0xFFEA, 0xFFEB, 0xFFEC, 0xFFED, 0xFFEE, 0xFFEF, + /* F */ 0xFFF0, 0xFFF1, 0xFFF2, 0xFFF3, 0xFFF4, 0xFFF5, 0xFFF6, 0xFFF7, + 0xFFF8, 0xFFF9, 0xFFFA, 0xFFFB, 0xFFFC, 0xFFFD, 0xFFFE, 0xFFFF, +}; + +u16 hfsplus_decompose_table[] = { + /* base table */ + 0x0010, 0x04c0, 0x0000, 0x06f0, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x07b0, + /* char table 0x0___ */ + 0x0020, 0x0070, 0x0160, 0x0190, 0x0230, 0x0000, 0x0000, 0x0000, + 0x0000, 0x02d0, 0x0340, 0x0360, 0x03b0, 0x03e0, 0x0400, 0x0430, + /* char table 0x00__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0030, 0x0040, 0x0050, 0x0060, + /* char values 0x00c_ */ + 0x2042, 0x204a, 0x2052, 0x205a, 0x2062, 0x206a, 0x0000, 0x2072, + 0x207a, 0x2082, 0x208a, 0x2092, 0x209a, 0x20a2, 0x20aa, 0x20b2, + /* char values 0x00d_ */ + 0x0000, 0x20ba, 0x20c2, 0x20ca, 0x20d2, 0x20da, 0x20e2, 0x0000, + 0x0000, 0x20ea, 0x20f2, 0x20fa, 0x2102, 0x210a, 0x0000, 0x0000, + /* char values 0x00e_ */ + 0x2112, 0x211a, 0x2122, 0x212a, 0x2132, 0x213a, 0x0000, 0x2142, + 0x214a, 0x2152, 0x215a, 0x2162, 0x216a, 0x2172, 0x217a, 0x2182, + /* char values 0x00f_ */ + 0x0000, 0x218a, 0x2192, 0x219a, 0x21a2, 0x21aa, 0x21b2, 0x0000, + 0x0000, 0x21ba, 0x21c2, 0x21ca, 0x21d2, 0x21da, 0x0000, 0x21e2, + /* char table 0x01__ */ + 0x0080, 0x0090, 0x00a0, 0x00b0, 0x00c0, 0x00d0, 0x00e0, 0x00f0, + 0x0000, 0x0000, 0x0100, 0x0110, 0x0120, 0x0130, 0x0140, 0x0150, + /* char values 0x010_ */ + 0x21ea, 0x21f2, 0x21fa, 0x2202, 0x220a, 0x2212, 0x221a, 0x2222, + 0x222a, 0x2232, 0x223a, 0x2242, 0x224a, 0x2252, 0x225a, 0x2262, + /* char values 0x011_ */ + 0x0000, 0x0000, 0x226a, 0x2272, 0x227a, 0x2282, 0x228a, 0x2292, + 0x229a, 0x22a2, 0x22aa, 0x22b2, 0x22ba, 0x22c2, 0x22ca, 0x22d2, + /* char values 0x012_ */ + 0x22da, 0x22e2, 0x22ea, 0x22f2, 0x22fa, 0x2302, 0x0000, 0x0000, + 0x230a, 0x2312, 0x231a, 0x2322, 0x232a, 0x2332, 0x233a, 0x2342, + /* char values 0x013_ */ + 0x234a, 0x0000, 0x0000, 0x0000, 0x2352, 0x235a, 0x2362, 0x236a, + 0x0000, 0x2372, 0x237a, 0x2382, 0x238a, 0x2392, 0x239a, 0x0000, + /* char values 0x014_ */ + 0x0000, 0x0000, 0x0000, 0x23a2, 0x23aa, 0x23b2, 0x23ba, 0x23c2, + 0x23ca, 0x0000, 0x0000, 0x0000, 0x23d2, 0x23da, 0x23e2, 0x23ea, + /* char values 0x015_ */ + 0x23f2, 0x23fa, 0x0000, 0x0000, 0x2402, 0x240a, 0x2412, 0x241a, + 0x2422, 0x242a, 0x2432, 0x243a, 0x2442, 0x244a, 0x2452, 0x245a, + /* char values 0x016_ */ + 0x2462, 0x246a, 0x2472, 0x247a, 0x2482, 0x248a, 0x0000, 0x0000, + 0x2492, 0x249a, 0x24a2, 0x24aa, 0x24b2, 0x24ba, 0x24c2, 0x24ca, + /* char values 0x017_ */ + 0x24d2, 0x24da, 0x24e2, 0x24ea, 0x24f2, 0x24fa, 0x2502, 0x250a, + 0x2512, 0x251a, 0x2522, 0x252a, 0x2532, 0x253a, 0x2542, 0x0000, + /* char values 0x01a_ */ + 0x254a, 0x2552, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x255a, + /* char values 0x01b_ */ + 0x2562, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x01c_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x256a, 0x2572, 0x257a, + /* char values 0x01d_ */ + 0x2582, 0x258a, 0x2592, 0x259a, 0x25a2, 0x25ab, 0x25b7, 0x25c3, + 0x25cf, 0x25db, 0x25e7, 0x25f3, 0x25ff, 0x0000, 0x260b, 0x2617, + /* char values 0x01e_ */ + 0x2623, 0x262f, 0x263a, 0x2642, 0x0000, 0x0000, 0x264a, 0x2652, + 0x265a, 0x2662, 
0x266a, 0x2672, 0x267b, 0x2687, 0x2692, 0x269a, + /* char values 0x01f_ */ + 0x26a2, 0x0000, 0x0000, 0x0000, 0x26aa, 0x26b2, 0x0000, 0x0000, + 0x0000, 0x0000, 0x26bb, 0x26c7, 0x26d2, 0x26da, 0x26e2, 0x26ea, + /* char table 0x02__ */ + 0x0170, 0x0180, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x020_ */ + 0x26f2, 0x26fa, 0x2702, 0x270a, 0x2712, 0x271a, 0x2722, 0x272a, + 0x2732, 0x273a, 0x2742, 0x274a, 0x2752, 0x275a, 0x2762, 0x276a, + /* char values 0x021_ */ + 0x2772, 0x277a, 0x2782, 0x278a, 0x2792, 0x279a, 0x27a2, 0x27aa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x03__ */ + 0x0000, 0x01a0, 0x0000, 0x0000, 0x01b0, 0x0000, 0x0000, 0x01c0, + 0x01d0, 0x01e0, 0x01f0, 0x0200, 0x0210, 0x0220, 0x0000, 0x0000, + /* char values 0x031_ */ + 0x27b2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x034_ */ + 0x27b9, 0x27bd, 0x0000, 0x27c1, 0x27c6, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x037_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x27cd, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x27d1, 0x0000, + /* char values 0x038_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x27d6, 0x27de, 0x27e5, + 0x27ea, 0x27f2, 0x27fa, 0x0000, 0x2802, 0x0000, 0x280a, 0x2812, + /* char values 0x039_ */ + 0x281b, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x03a_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2826, 0x282e, 0x2836, 0x283e, 0x2846, 0x284e, + /* char values 0x03b_ */ + 0x2857, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x03c_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2862, 0x286a, 0x2872, 0x287a, 0x2882, 0x0000, + /* char values 0x03d_ */ + 0x0000, 0x0000, 0x0000, 0x288a, 0x2892, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x04__ */ + 0x0240, 0x0250, 0x0000, 0x0260, 0x0000, 0x0270, 0x0000, 0x0280, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0290, 0x02a0, 0x02b0, 0x02c0, + /* char values 0x040_ */ + 0x0000, 0x289a, 0x0000, 0x28a2, 0x0000, 0x0000, 0x0000, 0x28aa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x28b2, 0x0000, 0x28ba, 0x0000, + /* char values 0x041_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x28c2, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x043_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x28ca, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x045_ */ + 0x0000, 0x28d2, 0x0000, 0x28da, 0x0000, 0x0000, 0x0000, 0x28e2, + 0x0000, 0x0000, 0x0000, 0x0000, 0x28ea, 0x0000, 0x28f2, 0x0000, + /* char values 0x047_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x28fa, 0x2902, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x04c_ */ + 0x0000, 0x290a, 0x2912, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x04d_ */ + 0x291a, 0x2922, 0x292a, 0x2932, 0x2939, 0x293d, 0x2942, 0x294a, + 0x2951, 0x2955, 0x295a, 0x2962, 0x296a, 0x2972, 0x297a, 0x2982, + /* char 
values 0x04e_ */ + 0x2989, 0x298d, 0x2992, 0x299a, 0x29a2, 0x29aa, 0x29b2, 0x29ba, + 0x29c1, 0x29c5, 0x29ca, 0x29d2, 0x0000, 0x0000, 0x29da, 0x29e2, + /* char values 0x04f_ */ + 0x29ea, 0x29f2, 0x29fa, 0x2a02, 0x2a0a, 0x2a12, 0x0000, 0x0000, + 0x2a1a, 0x2a22, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x09__ */ + 0x0000, 0x0000, 0x02e0, 0x02f0, 0x0000, 0x0300, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0310, 0x0320, 0x0330, 0x0000, 0x0000, + /* char values 0x092_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2a2a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x093_ */ + 0x0000, 0x2a32, 0x0000, 0x0000, 0x2a3a, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x095_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2a42, 0x2a4a, 0x2a52, 0x2a5a, 0x2a62, 0x2a6a, 0x2a72, 0x2a7a, + /* char values 0x09b_ */ + 0x2a82, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x09c_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x2a8a, 0x2a92, 0x0000, 0x0000, 0x0000, + /* char values 0x09d_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2a9a, 0x2aa2, 0x0000, 0x2aaa, + /* char table 0x0a__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0350, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0a5_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2ab2, 0x2aba, 0x2ac2, 0x2aca, 0x0000, 0x2ad2, 0x0000, + /* char table 0x0b__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0370, 0x0380, 0x0000, 0x0000, + 0x0000, 0x0390, 0x0000, 0x0000, 0x03a0, 0x0000, 0x0000, 0x0000, + /* char values 0x0b4_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2ada, 0x0000, 0x0000, 0x2ae2, 0x2aea, 0x0000, 0x0000, 0x0000, + /* char values 0x0b5_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2af2, 0x2afa, 0x0000, 0x2b02, + /* char values 0x0b9_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x2b0a, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0bc_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2b12, 0x2b1a, 0x2b22, 0x0000, 0x0000, 0x0000, + /* char table 0x0c__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x03c0, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x03d0, 0x0000, 0x0000, 0x0000, + /* char values 0x0c4_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2b2a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0cc_ */ + 0x2b32, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b3a, + 0x2b42, 0x0000, 0x2b4a, 0x2b53, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x0d__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x03f0, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0d4_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x2b5e, 0x2b66, 0x2b6e, 0x0000, 0x0000, 0x0000, + /* char table 0x0e__ */ + 0x0000, 0x0000, 0x0000, 0x0410, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0420, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0e3_ */ + 0x0000, 0x0000, 0x0000, 0x2b76, 0x0000, 
0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0eb_ */ + 0x0000, 0x0000, 0x0000, 0x2b7e, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x0f__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0440, 0x0450, 0x0460, 0x0470, + 0x0480, 0x0490, 0x04a0, 0x04b0, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f4_ */ + 0x0000, 0x0000, 0x0000, 0x2b86, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b8e, 0x0000, 0x0000, + /* char values 0x0f5_ */ + 0x0000, 0x0000, 0x2b96, 0x0000, 0x0000, 0x0000, 0x0000, 0x2b9e, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2ba6, 0x0000, 0x0000, 0x0000, + /* char values 0x0f6_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2bae, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f7_ */ + 0x0000, 0x0000, 0x0000, 0x2bb6, 0x0000, 0x2bbe, 0x2bc6, 0x2bcf, + 0x2bda, 0x2be3, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f8_ */ + 0x0000, 0x2bee, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x0f9_ */ + 0x0000, 0x0000, 0x0000, 0x2bf6, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2bfe, 0x0000, 0x0000, + /* char values 0x0fa_ */ + 0x0000, 0x0000, 0x2c06, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c0e, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2c16, 0x0000, 0x0000, 0x0000, + /* char values 0x0fb_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2c1e, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x1___ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x04d0, 0x05e0, + /* char table 0x1e__ */ + 0x04e0, 0x04f0, 0x0500, 0x0510, 0x0520, 0x0530, 0x0540, 0x0550, + 0x0560, 0x0570, 0x0580, 0x0590, 0x05a0, 0x05b0, 0x05c0, 0x05d0, + /* char values 0x1e0_ */ + 0x2c26, 0x2c2e, 0x2c36, 0x2c3e, 0x2c46, 0x2c4e, 0x2c56, 0x2c5e, + 0x2c67, 0x2c73, 0x2c7e, 0x2c86, 0x2c8e, 0x2c96, 0x2c9e, 0x2ca6, + /* char values 0x1e1_ */ + 0x2cae, 0x2cb6, 0x2cbe, 0x2cc6, 0x2ccf, 0x2cdb, 0x2ce7, 0x2cf3, + 0x2cfe, 0x2d06, 0x2d0e, 0x2d16, 0x2d1f, 0x2d2b, 0x2d36, 0x2d3e, + /* char values 0x1e2_ */ + 0x2d46, 0x2d4e, 0x2d56, 0x2d5e, 0x2d66, 0x2d6e, 0x2d76, 0x2d7e, + 0x2d86, 0x2d8e, 0x2d96, 0x2d9e, 0x2da6, 0x2dae, 0x2db7, 0x2dc3, + /* char values 0x1e3_ */ + 0x2dce, 0x2dd6, 0x2dde, 0x2de6, 0x2dee, 0x2df6, 0x2dfe, 0x2e06, + 0x2e0f, 0x2e1b, 0x2e26, 0x2e2e, 0x2e36, 0x2e3e, 0x2e46, 0x2e4e, + /* char values 0x1e4_ */ + 0x2e56, 0x2e5e, 0x2e66, 0x2e6e, 0x2e76, 0x2e7e, 0x2e86, 0x2e8e, + 0x2e96, 0x2e9e, 0x2ea6, 0x2eae, 0x2eb7, 0x2ec3, 0x2ecf, 0x2edb, + /* char values 0x1e5_ */ + 0x2ee7, 0x2ef3, 0x2eff, 0x2f0b, 0x2f16, 0x2f1e, 0x2f26, 0x2f2e, + 0x2f36, 0x2f3e, 0x2f46, 0x2f4e, 0x2f57, 0x2f63, 0x2f6e, 0x2f76, + /* char values 0x1e6_ */ + 0x2f7e, 0x2f86, 0x2f8e, 0x2f96, 0x2f9f, 0x2fab, 0x2fb7, 0x2fc3, + 0x2fcf, 0x2fdb, 0x2fe6, 0x2fee, 0x2ff6, 0x2ffe, 0x3006, 0x300e, + /* char values 0x1e7_ */ + 0x3016, 0x301e, 0x3026, 0x302e, 0x3036, 0x303e, 0x3046, 0x304e, + 0x3057, 0x3063, 0x306f, 0x307b, 0x3086, 0x308e, 0x3096, 0x309e, + /* char values 0x1e8_ */ + 0x30a6, 0x30ae, 0x30b6, 0x30be, 0x30c6, 0x30ce, 0x30d6, 0x30de, + 0x30e6, 0x30ee, 0x30f6, 0x30fe, 0x3106, 0x310e, 0x3116, 0x311e, + /* char values 0x1e9_ */ + 0x3126, 0x312e, 0x3136, 0x313e, 0x3146, 0x314e, 0x3156, 0x315e, + 0x3166, 0x316e, 0x0000, 0x3176, 
0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x1ea_ */ + 0x317e, 0x3186, 0x318e, 0x3196, 0x319f, 0x31ab, 0x31b7, 0x31c3, + 0x31cf, 0x31db, 0x31e7, 0x31f3, 0x31ff, 0x320b, 0x3217, 0x3223, + /* char values 0x1eb_ */ + 0x322f, 0x323b, 0x3247, 0x3253, 0x325f, 0x326b, 0x3277, 0x3283, + 0x328e, 0x3296, 0x329e, 0x32a6, 0x32ae, 0x32b6, 0x32bf, 0x32cb, + /* char values 0x1ec_ */ + 0x32d7, 0x32e3, 0x32ef, 0x32fb, 0x3307, 0x3313, 0x331f, 0x332b, + 0x3336, 0x333e, 0x3346, 0x334e, 0x3356, 0x335e, 0x3366, 0x336e, + /* char values 0x1ed_ */ + 0x3377, 0x3383, 0x338f, 0x339b, 0x33a7, 0x33b3, 0x33bf, 0x33cb, + 0x33d7, 0x33e3, 0x33ef, 0x33fb, 0x3407, 0x3413, 0x341f, 0x342b, + /* char values 0x1ee_ */ + 0x3437, 0x3443, 0x344f, 0x345b, 0x3466, 0x346e, 0x3476, 0x347e, + 0x3487, 0x3493, 0x349f, 0x34ab, 0x34b7, 0x34c3, 0x34cf, 0x34db, + /* char values 0x1ef_ */ + 0x34e7, 0x34f3, 0x34fe, 0x3506, 0x350e, 0x3516, 0x351e, 0x3526, + 0x352e, 0x3536, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x1f__ */ + 0x05f0, 0x0600, 0x0610, 0x0620, 0x0630, 0x0640, 0x0650, 0x0660, + 0x0670, 0x0680, 0x0690, 0x06a0, 0x06b0, 0x06c0, 0x06d0, 0x06e0, + /* char values 0x1f0_ */ + 0x353e, 0x3546, 0x354f, 0x355b, 0x3567, 0x3573, 0x357f, 0x358b, + 0x3596, 0x359e, 0x35a7, 0x35b3, 0x35bf, 0x35cb, 0x35d7, 0x35e3, + /* char values 0x1f1_ */ + 0x35ee, 0x35f6, 0x35ff, 0x360b, 0x3617, 0x3623, 0x0000, 0x0000, + 0x362e, 0x3636, 0x363f, 0x364b, 0x3657, 0x3663, 0x0000, 0x0000, + /* char values 0x1f2_ */ + 0x366e, 0x3676, 0x367f, 0x368b, 0x3697, 0x36a3, 0x36af, 0x36bb, + 0x36c6, 0x36ce, 0x36d7, 0x36e3, 0x36ef, 0x36fb, 0x3707, 0x3713, + /* char values 0x1f3_ */ + 0x371e, 0x3726, 0x372f, 0x373b, 0x3747, 0x3753, 0x375f, 0x376b, + 0x3776, 0x377e, 0x3787, 0x3793, 0x379f, 0x37ab, 0x37b7, 0x37c3, + /* char values 0x1f4_ */ + 0x37ce, 0x37d6, 0x37df, 0x37eb, 0x37f7, 0x3803, 0x0000, 0x0000, + 0x380e, 0x3816, 0x381f, 0x382b, 0x3837, 0x3843, 0x0000, 0x0000, + /* char values 0x1f5_ */ + 0x384e, 0x3856, 0x385f, 0x386b, 0x3877, 0x3883, 0x388f, 0x389b, + 0x0000, 0x38a6, 0x0000, 0x38af, 0x0000, 0x38bb, 0x0000, 0x38c7, + /* char values 0x1f6_ */ + 0x38d2, 0x38da, 0x38e3, 0x38ef, 0x38fb, 0x3907, 0x3913, 0x391f, + 0x392a, 0x3932, 0x393b, 0x3947, 0x3953, 0x395f, 0x396b, 0x3977, + /* char values 0x1f7_ */ + 0x3982, 0x398a, 0x3992, 0x399a, 0x39a2, 0x39aa, 0x39b2, 0x39ba, + 0x39c2, 0x39ca, 0x39d2, 0x39da, 0x39e2, 0x39ea, 0x0000, 0x0000, + /* char values 0x1f8_ */ + 0x39f3, 0x39ff, 0x3a0c, 0x3a1c, 0x3a2c, 0x3a3c, 0x3a4c, 0x3a5c, + 0x3a6b, 0x3a77, 0x3a84, 0x3a94, 0x3aa4, 0x3ab4, 0x3ac4, 0x3ad4, + /* char values 0x1f9_ */ + 0x3ae3, 0x3aef, 0x3afc, 0x3b0c, 0x3b1c, 0x3b2c, 0x3b3c, 0x3b4c, + 0x3b5b, 0x3b67, 0x3b74, 0x3b84, 0x3b94, 0x3ba4, 0x3bb4, 0x3bc4, + /* char values 0x1fa_ */ + 0x3bd3, 0x3bdf, 0x3bec, 0x3bfc, 0x3c0c, 0x3c1c, 0x3c2c, 0x3c3c, + 0x3c4b, 0x3c57, 0x3c64, 0x3c74, 0x3c84, 0x3c94, 0x3ca4, 0x3cb4, + /* char values 0x1fb_ */ + 0x3cc2, 0x3cca, 0x3cd3, 0x3cde, 0x3ce7, 0x0000, 0x3cf2, 0x3cfb, + 0x3d06, 0x3d0e, 0x3d16, 0x3d1e, 0x3d26, 0x0000, 0x3d2d, 0x0000, + /* char values 0x1fc_ */ + 0x0000, 0x3d32, 0x3d3b, 0x3d46, 0x3d4f, 0x0000, 0x3d5a, 0x3d63, + 0x3d6e, 0x3d76, 0x3d7e, 0x3d86, 0x3d8e, 0x3d96, 0x3d9e, 0x3da6, + /* char values 0x1fd_ */ + 0x3dae, 0x3db6, 0x3dbf, 0x3dcb, 0x0000, 0x0000, 0x3dd6, 0x3ddf, + 0x3dea, 0x3df2, 0x3dfa, 0x3e02, 0x0000, 0x3e0a, 0x3e12, 0x3e1a, + /* char values 0x1fe_ */ + 0x3e22, 0x3e2a, 0x3e33, 0x3e3f, 0x3e4a, 0x3e52, 0x3e5a, 0x3e63, + 0x3e6e, 0x3e76, 0x3e7e, 0x3e86, 0x3e8e, 0x3e96, 0x3e9e, 0x3ea5, + /* char values 0x1ff_ 
*/ + 0x0000, 0x0000, 0x3eab, 0x3eb6, 0x3ebf, 0x0000, 0x3eca, 0x3ed3, + 0x3ede, 0x3ee6, 0x3eee, 0x3ef6, 0x3efe, 0x3f05, 0x0000, 0x0000, + /* char table 0x3___ */ + 0x0700, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0x30__ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0710, 0x0720, 0x0730, 0x0740, + 0x0000, 0x0750, 0x0760, 0x0770, 0x0780, 0x0790, 0x0000, 0x07a0, + /* char values 0x304_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x3f0a, 0x0000, 0x3f12, 0x0000, + /* char values 0x305_ */ + 0x3f1a, 0x0000, 0x3f22, 0x0000, 0x3f2a, 0x0000, 0x3f32, 0x0000, + 0x3f3a, 0x0000, 0x3f42, 0x0000, 0x3f4a, 0x0000, 0x3f52, 0x0000, + /* char values 0x306_ */ + 0x3f5a, 0x0000, 0x3f62, 0x0000, 0x0000, 0x3f6a, 0x0000, 0x3f72, + 0x0000, 0x3f7a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x307_ */ + 0x3f82, 0x3f8a, 0x0000, 0x3f92, 0x3f9a, 0x0000, 0x3fa2, 0x3faa, + 0x0000, 0x3fb2, 0x3fba, 0x0000, 0x3fc2, 0x3fca, 0x0000, 0x0000, + /* char values 0x309_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x3fd2, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x3fda, 0x0000, + /* char values 0x30a_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x3fe2, 0x0000, 0x3fea, 0x0000, + /* char values 0x30b_ */ + 0x3ff2, 0x0000, 0x3ffa, 0x0000, 0x4002, 0x0000, 0x400a, 0x0000, + 0x4012, 0x0000, 0x401a, 0x0000, 0x4022, 0x0000, 0x402a, 0x0000, + /* char values 0x30c_ */ + 0x4032, 0x0000, 0x403a, 0x0000, 0x0000, 0x4042, 0x0000, 0x404a, + 0x0000, 0x4052, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0x30d_ */ + 0x405a, 0x4062, 0x0000, 0x406a, 0x4072, 0x0000, 0x407a, 0x4082, + 0x0000, 0x408a, 0x4092, 0x0000, 0x409a, 0x40a2, 0x0000, 0x0000, + /* char values 0x30f_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x40aa, 0x0000, 0x0000, 0x40b2, + 0x40ba, 0x40c2, 0x40ca, 0x0000, 0x0000, 0x0000, 0x40d2, 0x0000, + /* char table 0xf___ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x07c0, 0x0000, 0x0000, 0x0000, 0x0000, + /* char table 0xfb__ */ + 0x0000, 0x07d0, 0x07e0, 0x07f0, 0x0800, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* char values 0xfb1_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x40da, + /* char values 0xfb2_ */ + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x40e2, 0x40ea, 0x40f3, 0x40ff, 0x410a, 0x4112, + /* char values 0xfb3_ */ + 0x411a, 0x4122, 0x412a, 0x4132, 0x413a, 0x4142, 0x414a, 0x0000, + 0x4152, 0x415a, 0x4162, 0x416a, 0x4172, 0x0000, 0x417a, 0x0000, + /* char values 0xfb4_ */ + 0x4182, 0x418a, 0x0000, 0x4192, 0x419a, 0x0000, 0x41a2, 0x41aa, + 0x41b2, 0x41ba, 0x41c2, 0x41ca, 0x41d2, 0x41da, 0x41e2, 0x0000, + /* decomposed characters */ + 0x0041, 0x0300, 0x0041, 0x0301, 0x0041, 0x0302, 0x0041, 0x0303, + 0x0041, 0x0308, 0x0041, 0x030a, 0x0043, 0x0327, 0x0045, 0x0300, + 0x0045, 0x0301, 0x0045, 0x0302, 0x0045, 0x0308, 0x0049, 0x0300, + 0x0049, 0x0301, 0x0049, 0x0302, 0x0049, 0x0308, 0x004e, 0x0303, + 0x004f, 0x0300, 0x004f, 0x0301, 0x004f, 0x0302, 0x004f, 0x0303, + 0x004f, 0x0308, 0x0055, 0x0300, 0x0055, 0x0301, 0x0055, 0x0302, + 0x0055, 0x0308, 0x0059, 0x0301, 0x0061, 0x0300, 0x0061, 0x0301, + 0x0061, 0x0302, 0x0061, 0x0303, 0x0061, 0x0308, 0x0061, 0x030a, + 
0x0063, 0x0327, 0x0065, 0x0300, 0x0065, 0x0301, 0x0065, 0x0302, + 0x0065, 0x0308, 0x0069, 0x0300, 0x0069, 0x0301, 0x0069, 0x0302, + 0x0069, 0x0308, 0x006e, 0x0303, 0x006f, 0x0300, 0x006f, 0x0301, + 0x006f, 0x0302, 0x006f, 0x0303, 0x006f, 0x0308, 0x0075, 0x0300, + 0x0075, 0x0301, 0x0075, 0x0302, 0x0075, 0x0308, 0x0079, 0x0301, + 0x0079, 0x0308, 0x0041, 0x0304, 0x0061, 0x0304, 0x0041, 0x0306, + 0x0061, 0x0306, 0x0041, 0x0328, 0x0061, 0x0328, 0x0043, 0x0301, + 0x0063, 0x0301, 0x0043, 0x0302, 0x0063, 0x0302, 0x0043, 0x0307, + 0x0063, 0x0307, 0x0043, 0x030c, 0x0063, 0x030c, 0x0044, 0x030c, + 0x0064, 0x030c, 0x0045, 0x0304, 0x0065, 0x0304, 0x0045, 0x0306, + 0x0065, 0x0306, 0x0045, 0x0307, 0x0065, 0x0307, 0x0045, 0x0328, + 0x0065, 0x0328, 0x0045, 0x030c, 0x0065, 0x030c, 0x0047, 0x0302, + 0x0067, 0x0302, 0x0047, 0x0306, 0x0067, 0x0306, 0x0047, 0x0307, + 0x0067, 0x0307, 0x0047, 0x0327, 0x0067, 0x0327, 0x0048, 0x0302, + 0x0068, 0x0302, 0x0049, 0x0303, 0x0069, 0x0303, 0x0049, 0x0304, + 0x0069, 0x0304, 0x0049, 0x0306, 0x0069, 0x0306, 0x0049, 0x0328, + 0x0069, 0x0328, 0x0049, 0x0307, 0x004a, 0x0302, 0x006a, 0x0302, + 0x004b, 0x0327, 0x006b, 0x0327, 0x004c, 0x0301, 0x006c, 0x0301, + 0x004c, 0x0327, 0x006c, 0x0327, 0x004c, 0x030c, 0x006c, 0x030c, + 0x004e, 0x0301, 0x006e, 0x0301, 0x004e, 0x0327, 0x006e, 0x0327, + 0x004e, 0x030c, 0x006e, 0x030c, 0x004f, 0x0304, 0x006f, 0x0304, + 0x004f, 0x0306, 0x006f, 0x0306, 0x004f, 0x030b, 0x006f, 0x030b, + 0x0052, 0x0301, 0x0072, 0x0301, 0x0052, 0x0327, 0x0072, 0x0327, + 0x0052, 0x030c, 0x0072, 0x030c, 0x0053, 0x0301, 0x0073, 0x0301, + 0x0053, 0x0302, 0x0073, 0x0302, 0x0053, 0x0327, 0x0073, 0x0327, + 0x0053, 0x030c, 0x0073, 0x030c, 0x0054, 0x0327, 0x0074, 0x0327, + 0x0054, 0x030c, 0x0074, 0x030c, 0x0055, 0x0303, 0x0075, 0x0303, + 0x0055, 0x0304, 0x0075, 0x0304, 0x0055, 0x0306, 0x0075, 0x0306, + 0x0055, 0x030a, 0x0075, 0x030a, 0x0055, 0x030b, 0x0075, 0x030b, + 0x0055, 0x0328, 0x0075, 0x0328, 0x0057, 0x0302, 0x0077, 0x0302, + 0x0059, 0x0302, 0x0079, 0x0302, 0x0059, 0x0308, 0x005a, 0x0301, + 0x007a, 0x0301, 0x005a, 0x0307, 0x007a, 0x0307, 0x005a, 0x030c, + 0x007a, 0x030c, 0x004f, 0x031b, 0x006f, 0x031b, 0x0055, 0x031b, + 0x0075, 0x031b, 0x0041, 0x030c, 0x0061, 0x030c, 0x0049, 0x030c, + 0x0069, 0x030c, 0x004f, 0x030c, 0x006f, 0x030c, 0x0055, 0x030c, + 0x0075, 0x030c, 0x0055, 0x0308, 0x0304, 0x0075, 0x0308, 0x0304, + 0x0055, 0x0308, 0x0301, 0x0075, 0x0308, 0x0301, 0x0055, 0x0308, + 0x030c, 0x0075, 0x0308, 0x030c, 0x0055, 0x0308, 0x0300, 0x0075, + 0x0308, 0x0300, 0x0041, 0x0308, 0x0304, 0x0061, 0x0308, 0x0304, + 0x0041, 0x0307, 0x0304, 0x0061, 0x0307, 0x0304, 0x00c6, 0x0304, + 0x00e6, 0x0304, 0x0047, 0x030c, 0x0067, 0x030c, 0x004b, 0x030c, + 0x006b, 0x030c, 0x004f, 0x0328, 0x006f, 0x0328, 0x004f, 0x0328, + 0x0304, 0x006f, 0x0328, 0x0304, 0x01b7, 0x030c, 0x0292, 0x030c, + 0x006a, 0x030c, 0x0047, 0x0301, 0x0067, 0x0301, 0x0041, 0x030a, + 0x0301, 0x0061, 0x030a, 0x0301, 0x00c6, 0x0301, 0x00e6, 0x0301, + 0x00d8, 0x0301, 0x00f8, 0x0301, 0x0041, 0x030f, 0x0061, 0x030f, + 0x0041, 0x0311, 0x0061, 0x0311, 0x0045, 0x030f, 0x0065, 0x030f, + 0x0045, 0x0311, 0x0065, 0x0311, 0x0049, 0x030f, 0x0069, 0x030f, + 0x0049, 0x0311, 0x0069, 0x0311, 0x004f, 0x030f, 0x006f, 0x030f, + 0x004f, 0x0311, 0x006f, 0x0311, 0x0052, 0x030f, 0x0072, 0x030f, + 0x0052, 0x0311, 0x0072, 0x0311, 0x0055, 0x030f, 0x0075, 0x030f, + 0x0055, 0x0311, 0x0075, 0x0311, 0x0306, 0x0307, 0x0300, 0x0301, + 0x0313, 0x0308, 0x030d, 0x02b9, 0x003b, 0x00a8, 0x030d, 0x0391, + 0x030d, 0x00b7, 0x0395, 0x030d, 0x0397, 0x030d, 0x0399, 
0x030d, + 0x039f, 0x030d, 0x03a5, 0x030d, 0x03a9, 0x030d, 0x03b9, 0x0308, + 0x030d, 0x0399, 0x0308, 0x03a5, 0x0308, 0x03b1, 0x030d, 0x03b5, + 0x030d, 0x03b7, 0x030d, 0x03b9, 0x030d, 0x03c5, 0x0308, 0x030d, + 0x03b9, 0x0308, 0x03c5, 0x0308, 0x03bf, 0x030d, 0x03c5, 0x030d, + 0x03c9, 0x030d, 0x03d2, 0x030d, 0x03d2, 0x0308, 0x0415, 0x0308, + 0x0413, 0x0301, 0x0406, 0x0308, 0x041a, 0x0301, 0x0423, 0x0306, + 0x0418, 0x0306, 0x0438, 0x0306, 0x0435, 0x0308, 0x0433, 0x0301, + 0x0456, 0x0308, 0x043a, 0x0301, 0x0443, 0x0306, 0x0474, 0x030f, + 0x0475, 0x030f, 0x0416, 0x0306, 0x0436, 0x0306, 0x0410, 0x0306, + 0x0430, 0x0306, 0x0410, 0x0308, 0x0430, 0x0308, 0x00c6, 0x00e6, + 0x0415, 0x0306, 0x0435, 0x0306, 0x018f, 0x0259, 0x018f, 0x0308, + 0x0259, 0x0308, 0x0416, 0x0308, 0x0436, 0x0308, 0x0417, 0x0308, + 0x0437, 0x0308, 0x01b7, 0x0292, 0x0418, 0x0304, 0x0438, 0x0304, + 0x0418, 0x0308, 0x0438, 0x0308, 0x041e, 0x0308, 0x043e, 0x0308, + 0x019f, 0x0275, 0x019f, 0x0308, 0x0275, 0x0308, 0x0423, 0x0304, + 0x0443, 0x0304, 0x0423, 0x0308, 0x0443, 0x0308, 0x0423, 0x030b, + 0x0443, 0x030b, 0x0427, 0x0308, 0x0447, 0x0308, 0x042b, 0x0308, + 0x044b, 0x0308, 0x0928, 0x093c, 0x0930, 0x093c, 0x0933, 0x093c, + 0x0915, 0x093c, 0x0916, 0x093c, 0x0917, 0x093c, 0x091c, 0x093c, + 0x0921, 0x093c, 0x0922, 0x093c, 0x092b, 0x093c, 0x092f, 0x093c, + 0x09ac, 0x09bc, 0x09c7, 0x09be, 0x09c7, 0x09d7, 0x09a1, 0x09bc, + 0x09a2, 0x09bc, 0x09af, 0x09bc, 0x0a16, 0x0a3c, 0x0a17, 0x0a3c, + 0x0a1c, 0x0a3c, 0x0a21, 0x0a3c, 0x0a2b, 0x0a3c, 0x0b47, 0x0b56, + 0x0b47, 0x0b3e, 0x0b47, 0x0b57, 0x0b21, 0x0b3c, 0x0b22, 0x0b3c, + 0x0b2f, 0x0b3c, 0x0b92, 0x0bd7, 0x0bc6, 0x0bbe, 0x0bc7, 0x0bbe, + 0x0bc6, 0x0bd7, 0x0c46, 0x0c56, 0x0cbf, 0x0cd5, 0x0cc6, 0x0cd5, + 0x0cc6, 0x0cd6, 0x0cc6, 0x0cc2, 0x0cc6, 0x0cc2, 0x0cd5, 0x0d46, + 0x0d3e, 0x0d47, 0x0d3e, 0x0d46, 0x0d57, 0x0e4d, 0x0e32, 0x0ecd, + 0x0eb2, 0x0f42, 0x0fb7, 0x0f4c, 0x0fb7, 0x0f51, 0x0fb7, 0x0f56, + 0x0fb7, 0x0f5b, 0x0fb7, 0x0f40, 0x0fb5, 0x0f72, 0x0f71, 0x0f74, + 0x0f71, 0x0fb2, 0x0f80, 0x0fb2, 0x0f80, 0x0f71, 0x0fb3, 0x0f80, + 0x0fb3, 0x0f80, 0x0f71, 0x0f80, 0x0f71, 0x0f92, 0x0fb7, 0x0f9c, + 0x0fb7, 0x0fa1, 0x0fb7, 0x0fa6, 0x0fb7, 0x0fab, 0x0fb7, 0x0f90, + 0x0fb5, 0x0041, 0x0325, 0x0061, 0x0325, 0x0042, 0x0307, 0x0062, + 0x0307, 0x0042, 0x0323, 0x0062, 0x0323, 0x0042, 0x0331, 0x0062, + 0x0331, 0x0043, 0x0327, 0x0301, 0x0063, 0x0327, 0x0301, 0x0044, + 0x0307, 0x0064, 0x0307, 0x0044, 0x0323, 0x0064, 0x0323, 0x0044, + 0x0331, 0x0064, 0x0331, 0x0044, 0x0327, 0x0064, 0x0327, 0x0044, + 0x032d, 0x0064, 0x032d, 0x0045, 0x0304, 0x0300, 0x0065, 0x0304, + 0x0300, 0x0045, 0x0304, 0x0301, 0x0065, 0x0304, 0x0301, 0x0045, + 0x032d, 0x0065, 0x032d, 0x0045, 0x0330, 0x0065, 0x0330, 0x0045, + 0x0327, 0x0306, 0x0065, 0x0327, 0x0306, 0x0046, 0x0307, 0x0066, + 0x0307, 0x0047, 0x0304, 0x0067, 0x0304, 0x0048, 0x0307, 0x0068, + 0x0307, 0x0048, 0x0323, 0x0068, 0x0323, 0x0048, 0x0308, 0x0068, + 0x0308, 0x0048, 0x0327, 0x0068, 0x0327, 0x0048, 0x032e, 0x0068, + 0x032e, 0x0049, 0x0330, 0x0069, 0x0330, 0x0049, 0x0308, 0x0301, + 0x0069, 0x0308, 0x0301, 0x004b, 0x0301, 0x006b, 0x0301, 0x004b, + 0x0323, 0x006b, 0x0323, 0x004b, 0x0331, 0x006b, 0x0331, 0x004c, + 0x0323, 0x006c, 0x0323, 0x004c, 0x0323, 0x0304, 0x006c, 0x0323, + 0x0304, 0x004c, 0x0331, 0x006c, 0x0331, 0x004c, 0x032d, 0x006c, + 0x032d, 0x004d, 0x0301, 0x006d, 0x0301, 0x004d, 0x0307, 0x006d, + 0x0307, 0x004d, 0x0323, 0x006d, 0x0323, 0x004e, 0x0307, 0x006e, + 0x0307, 0x004e, 0x0323, 0x006e, 0x0323, 0x004e, 0x0331, 0x006e, + 0x0331, 0x004e, 0x032d, 0x006e, 0x032d, 
0x004f, 0x0303, 0x0301, + 0x006f, 0x0303, 0x0301, 0x004f, 0x0303, 0x0308, 0x006f, 0x0303, + 0x0308, 0x004f, 0x0304, 0x0300, 0x006f, 0x0304, 0x0300, 0x004f, + 0x0304, 0x0301, 0x006f, 0x0304, 0x0301, 0x0050, 0x0301, 0x0070, + 0x0301, 0x0050, 0x0307, 0x0070, 0x0307, 0x0052, 0x0307, 0x0072, + 0x0307, 0x0052, 0x0323, 0x0072, 0x0323, 0x0052, 0x0323, 0x0304, + 0x0072, 0x0323, 0x0304, 0x0052, 0x0331, 0x0072, 0x0331, 0x0053, + 0x0307, 0x0073, 0x0307, 0x0053, 0x0323, 0x0073, 0x0323, 0x0053, + 0x0301, 0x0307, 0x0073, 0x0301, 0x0307, 0x0053, 0x030c, 0x0307, + 0x0073, 0x030c, 0x0307, 0x0053, 0x0323, 0x0307, 0x0073, 0x0323, + 0x0307, 0x0054, 0x0307, 0x0074, 0x0307, 0x0054, 0x0323, 0x0074, + 0x0323, 0x0054, 0x0331, 0x0074, 0x0331, 0x0054, 0x032d, 0x0074, + 0x032d, 0x0055, 0x0324, 0x0075, 0x0324, 0x0055, 0x0330, 0x0075, + 0x0330, 0x0055, 0x032d, 0x0075, 0x032d, 0x0055, 0x0303, 0x0301, + 0x0075, 0x0303, 0x0301, 0x0055, 0x0304, 0x0308, 0x0075, 0x0304, + 0x0308, 0x0056, 0x0303, 0x0076, 0x0303, 0x0056, 0x0323, 0x0076, + 0x0323, 0x0057, 0x0300, 0x0077, 0x0300, 0x0057, 0x0301, 0x0077, + 0x0301, 0x0057, 0x0308, 0x0077, 0x0308, 0x0057, 0x0307, 0x0077, + 0x0307, 0x0057, 0x0323, 0x0077, 0x0323, 0x0058, 0x0307, 0x0078, + 0x0307, 0x0058, 0x0308, 0x0078, 0x0308, 0x0059, 0x0307, 0x0079, + 0x0307, 0x005a, 0x0302, 0x007a, 0x0302, 0x005a, 0x0323, 0x007a, + 0x0323, 0x005a, 0x0331, 0x007a, 0x0331, 0x0068, 0x0331, 0x0074, + 0x0308, 0x0077, 0x030a, 0x0079, 0x030a, 0x017f, 0x0307, 0x0041, + 0x0323, 0x0061, 0x0323, 0x0041, 0x0309, 0x0061, 0x0309, 0x0041, + 0x0302, 0x0301, 0x0061, 0x0302, 0x0301, 0x0041, 0x0302, 0x0300, + 0x0061, 0x0302, 0x0300, 0x0041, 0x0302, 0x0309, 0x0061, 0x0302, + 0x0309, 0x0041, 0x0302, 0x0303, 0x0061, 0x0302, 0x0303, 0x0041, + 0x0323, 0x0302, 0x0061, 0x0323, 0x0302, 0x0041, 0x0306, 0x0301, + 0x0061, 0x0306, 0x0301, 0x0041, 0x0306, 0x0300, 0x0061, 0x0306, + 0x0300, 0x0041, 0x0306, 0x0309, 0x0061, 0x0306, 0x0309, 0x0041, + 0x0306, 0x0303, 0x0061, 0x0306, 0x0303, 0x0041, 0x0323, 0x0306, + 0x0061, 0x0323, 0x0306, 0x0045, 0x0323, 0x0065, 0x0323, 0x0045, + 0x0309, 0x0065, 0x0309, 0x0045, 0x0303, 0x0065, 0x0303, 0x0045, + 0x0302, 0x0301, 0x0065, 0x0302, 0x0301, 0x0045, 0x0302, 0x0300, + 0x0065, 0x0302, 0x0300, 0x0045, 0x0302, 0x0309, 0x0065, 0x0302, + 0x0309, 0x0045, 0x0302, 0x0303, 0x0065, 0x0302, 0x0303, 0x0045, + 0x0323, 0x0302, 0x0065, 0x0323, 0x0302, 0x0049, 0x0309, 0x0069, + 0x0309, 0x0049, 0x0323, 0x0069, 0x0323, 0x004f, 0x0323, 0x006f, + 0x0323, 0x004f, 0x0309, 0x006f, 0x0309, 0x004f, 0x0302, 0x0301, + 0x006f, 0x0302, 0x0301, 0x004f, 0x0302, 0x0300, 0x006f, 0x0302, + 0x0300, 0x004f, 0x0302, 0x0309, 0x006f, 0x0302, 0x0309, 0x004f, + 0x0302, 0x0303, 0x006f, 0x0302, 0x0303, 0x004f, 0x0323, 0x0302, + 0x006f, 0x0323, 0x0302, 0x004f, 0x031b, 0x0301, 0x006f, 0x031b, + 0x0301, 0x004f, 0x031b, 0x0300, 0x006f, 0x031b, 0x0300, 0x004f, + 0x031b, 0x0309, 0x006f, 0x031b, 0x0309, 0x004f, 0x031b, 0x0303, + 0x006f, 0x031b, 0x0303, 0x004f, 0x031b, 0x0323, 0x006f, 0x031b, + 0x0323, 0x0055, 0x0323, 0x0075, 0x0323, 0x0055, 0x0309, 0x0075, + 0x0309, 0x0055, 0x031b, 0x0301, 0x0075, 0x031b, 0x0301, 0x0055, + 0x031b, 0x0300, 0x0075, 0x031b, 0x0300, 0x0055, 0x031b, 0x0309, + 0x0075, 0x031b, 0x0309, 0x0055, 0x031b, 0x0303, 0x0075, 0x031b, + 0x0303, 0x0055, 0x031b, 0x0323, 0x0075, 0x031b, 0x0323, 0x0059, + 0x0300, 0x0079, 0x0300, 0x0059, 0x0323, 0x0079, 0x0323, 0x0059, + 0x0309, 0x0079, 0x0309, 0x0059, 0x0303, 0x0079, 0x0303, 0x03b1, + 0x0313, 0x03b1, 0x0314, 0x03b1, 0x0313, 0x0300, 0x03b1, 0x0314, + 0x0300, 0x03b1, 0x0313, 
0x0301, 0x03b1, 0x0314, 0x0301, 0x03b1, + 0x0313, 0x0342, 0x03b1, 0x0314, 0x0342, 0x0391, 0x0313, 0x0391, + 0x0314, 0x0391, 0x0313, 0x0300, 0x0391, 0x0314, 0x0300, 0x0391, + 0x0313, 0x0301, 0x0391, 0x0314, 0x0301, 0x0391, 0x0313, 0x0342, + 0x0391, 0x0314, 0x0342, 0x03b5, 0x0313, 0x03b5, 0x0314, 0x03b5, + 0x0313, 0x0300, 0x03b5, 0x0314, 0x0300, 0x03b5, 0x0313, 0x0301, + 0x03b5, 0x0314, 0x0301, 0x0395, 0x0313, 0x0395, 0x0314, 0x0395, + 0x0313, 0x0300, 0x0395, 0x0314, 0x0300, 0x0395, 0x0313, 0x0301, + 0x0395, 0x0314, 0x0301, 0x03b7, 0x0313, 0x03b7, 0x0314, 0x03b7, + 0x0313, 0x0300, 0x03b7, 0x0314, 0x0300, 0x03b7, 0x0313, 0x0301, + 0x03b7, 0x0314, 0x0301, 0x03b7, 0x0313, 0x0342, 0x03b7, 0x0314, + 0x0342, 0x0397, 0x0313, 0x0397, 0x0314, 0x0397, 0x0313, 0x0300, + 0x0397, 0x0314, 0x0300, 0x0397, 0x0313, 0x0301, 0x0397, 0x0314, + 0x0301, 0x0397, 0x0313, 0x0342, 0x0397, 0x0314, 0x0342, 0x03b9, + 0x0313, 0x03b9, 0x0314, 0x03b9, 0x0313, 0x0300, 0x03b9, 0x0314, + 0x0300, 0x03b9, 0x0313, 0x0301, 0x03b9, 0x0314, 0x0301, 0x03b9, + 0x0313, 0x0342, 0x03b9, 0x0314, 0x0342, 0x0399, 0x0313, 0x0399, + 0x0314, 0x0399, 0x0313, 0x0300, 0x0399, 0x0314, 0x0300, 0x0399, + 0x0313, 0x0301, 0x0399, 0x0314, 0x0301, 0x0399, 0x0313, 0x0342, + 0x0399, 0x0314, 0x0342, 0x03bf, 0x0313, 0x03bf, 0x0314, 0x03bf, + 0x0313, 0x0300, 0x03bf, 0x0314, 0x0300, 0x03bf, 0x0313, 0x0301, + 0x03bf, 0x0314, 0x0301, 0x039f, 0x0313, 0x039f, 0x0314, 0x039f, + 0x0313, 0x0300, 0x039f, 0x0314, 0x0300, 0x039f, 0x0313, 0x0301, + 0x039f, 0x0314, 0x0301, 0x03c5, 0x0313, 0x03c5, 0x0314, 0x03c5, + 0x0313, 0x0300, 0x03c5, 0x0314, 0x0300, 0x03c5, 0x0313, 0x0301, + 0x03c5, 0x0314, 0x0301, 0x03c5, 0x0313, 0x0342, 0x03c5, 0x0314, + 0x0342, 0x03a5, 0x0314, 0x03a5, 0x0314, 0x0300, 0x03a5, 0x0314, + 0x0301, 0x03a5, 0x0314, 0x0342, 0x03c9, 0x0313, 0x03c9, 0x0314, + 0x03c9, 0x0313, 0x0300, 0x03c9, 0x0314, 0x0300, 0x03c9, 0x0313, + 0x0301, 0x03c9, 0x0314, 0x0301, 0x03c9, 0x0313, 0x0342, 0x03c9, + 0x0314, 0x0342, 0x03a9, 0x0313, 0x03a9, 0x0314, 0x03a9, 0x0313, + 0x0300, 0x03a9, 0x0314, 0x0300, 0x03a9, 0x0313, 0x0301, 0x03a9, + 0x0314, 0x0301, 0x03a9, 0x0313, 0x0342, 0x03a9, 0x0314, 0x0342, + 0x03b1, 0x0300, 0x03b1, 0x0301, 0x03b5, 0x0300, 0x03b5, 0x0301, + 0x03b7, 0x0300, 0x03b7, 0x0301, 0x03b9, 0x0300, 0x03b9, 0x0301, + 0x03bf, 0x0300, 0x03bf, 0x0301, 0x03c5, 0x0300, 0x03c5, 0x0301, + 0x03c9, 0x0300, 0x03c9, 0x0301, 0x03b1, 0x0345, 0x0313, 0x03b1, + 0x0345, 0x0314, 0x03b1, 0x0345, 0x0313, 0x0300, 0x03b1, 0x0345, + 0x0314, 0x0300, 0x03b1, 0x0345, 0x0313, 0x0301, 0x03b1, 0x0345, + 0x0314, 0x0301, 0x03b1, 0x0345, 0x0313, 0x0342, 0x03b1, 0x0345, + 0x0314, 0x0342, 0x0391, 0x0345, 0x0313, 0x0391, 0x0345, 0x0314, + 0x0391, 0x0345, 0x0313, 0x0300, 0x0391, 0x0345, 0x0314, 0x0300, + 0x0391, 0x0345, 0x0313, 0x0301, 0x0391, 0x0345, 0x0314, 0x0301, + 0x0391, 0x0345, 0x0313, 0x0342, 0x0391, 0x0345, 0x0314, 0x0342, + 0x03b7, 0x0345, 0x0313, 0x03b7, 0x0345, 0x0314, 0x03b7, 0x0345, + 0x0313, 0x0300, 0x03b7, 0x0345, 0x0314, 0x0300, 0x03b7, 0x0345, + 0x0313, 0x0301, 0x03b7, 0x0345, 0x0314, 0x0301, 0x03b7, 0x0345, + 0x0313, 0x0342, 0x03b7, 0x0345, 0x0314, 0x0342, 0x0397, 0x0345, + 0x0313, 0x0397, 0x0345, 0x0314, 0x0397, 0x0345, 0x0313, 0x0300, + 0x0397, 0x0345, 0x0314, 0x0300, 0x0397, 0x0345, 0x0313, 0x0301, + 0x0397, 0x0345, 0x0314, 0x0301, 0x0397, 0x0345, 0x0313, 0x0342, + 0x0397, 0x0345, 0x0314, 0x0342, 0x03c9, 0x0345, 0x0313, 0x03c9, + 0x0345, 0x0314, 0x03c9, 0x0345, 0x0313, 0x0300, 0x03c9, 0x0345, + 0x0314, 0x0300, 0x03c9, 0x0345, 0x0313, 0x0301, 0x03c9, 0x0345, + 0x0314, 
0x0301, 0x03c9, 0x0345, 0x0313, 0x0342, 0x03c9, 0x0345, + 0x0314, 0x0342, 0x03a9, 0x0345, 0x0313, 0x03a9, 0x0345, 0x0314, + 0x03a9, 0x0345, 0x0313, 0x0300, 0x03a9, 0x0345, 0x0314, 0x0300, + 0x03a9, 0x0345, 0x0313, 0x0301, 0x03a9, 0x0345, 0x0314, 0x0301, + 0x03a9, 0x0345, 0x0313, 0x0342, 0x03a9, 0x0345, 0x0314, 0x0342, + 0x03b1, 0x0306, 0x03b1, 0x0304, 0x03b1, 0x0345, 0x0300, 0x03b1, + 0x0345, 0x03b1, 0x0345, 0x0301, 0x03b1, 0x0342, 0x03b1, 0x0345, + 0x0342, 0x0391, 0x0306, 0x0391, 0x0304, 0x0391, 0x0300, 0x0391, + 0x0301, 0x0391, 0x0345, 0x03b9, 0x00a8, 0x0342, 0x03b7, 0x0345, + 0x0300, 0x03b7, 0x0345, 0x03b7, 0x0345, 0x0301, 0x03b7, 0x0342, + 0x03b7, 0x0345, 0x0342, 0x0395, 0x0300, 0x0395, 0x0301, 0x0397, + 0x0300, 0x0397, 0x0301, 0x0397, 0x0345, 0x1fbf, 0x0300, 0x1fbf, + 0x0301, 0x1fbf, 0x0342, 0x03b9, 0x0306, 0x03b9, 0x0304, 0x03b9, + 0x0308, 0x0300, 0x03b9, 0x0308, 0x0301, 0x03b9, 0x0342, 0x03b9, + 0x0308, 0x0342, 0x0399, 0x0306, 0x0399, 0x0304, 0x0399, 0x0300, + 0x0399, 0x0301, 0x1ffe, 0x0300, 0x1ffe, 0x0301, 0x1ffe, 0x0342, + 0x03c5, 0x0306, 0x03c5, 0x0304, 0x03c5, 0x0308, 0x0300, 0x03c5, + 0x0308, 0x0301, 0x03c1, 0x0313, 0x03c1, 0x0314, 0x03c5, 0x0342, + 0x03c5, 0x0308, 0x0342, 0x03a5, 0x0306, 0x03a5, 0x0304, 0x03a5, + 0x0300, 0x03a5, 0x0301, 0x03a1, 0x0314, 0x00a8, 0x0300, 0x00a8, + 0x0301, 0x0060, 0x03c9, 0x0345, 0x0300, 0x03c9, 0x0345, 0x03bf, + 0x0345, 0x0301, 0x03c9, 0x0342, 0x03c9, 0x0345, 0x0342, 0x039f, + 0x0300, 0x039f, 0x0301, 0x03a9, 0x0300, 0x03a9, 0x0301, 0x03a9, + 0x0345, 0x00b4, 0x304b, 0x3099, 0x304d, 0x3099, 0x304f, 0x3099, + 0x3051, 0x3099, 0x3053, 0x3099, 0x3055, 0x3099, 0x3057, 0x3099, + 0x3059, 0x3099, 0x305b, 0x3099, 0x305d, 0x3099, 0x305f, 0x3099, + 0x3061, 0x3099, 0x3064, 0x3099, 0x3066, 0x3099, 0x3068, 0x3099, + 0x306f, 0x3099, 0x306f, 0x309a, 0x3072, 0x3099, 0x3072, 0x309a, + 0x3075, 0x3099, 0x3075, 0x309a, 0x3078, 0x3099, 0x3078, 0x309a, + 0x307b, 0x3099, 0x307b, 0x309a, 0x3046, 0x3099, 0x309d, 0x3099, + 0x30ab, 0x3099, 0x30ad, 0x3099, 0x30af, 0x3099, 0x30b1, 0x3099, + 0x30b3, 0x3099, 0x30b5, 0x3099, 0x30b7, 0x3099, 0x30b9, 0x3099, + 0x30bb, 0x3099, 0x30bd, 0x3099, 0x30bf, 0x3099, 0x30c1, 0x3099, + 0x30c4, 0x3099, 0x30c6, 0x3099, 0x30c8, 0x3099, 0x30cf, 0x3099, + 0x30cf, 0x309a, 0x30d2, 0x3099, 0x30d2, 0x309a, 0x30d5, 0x3099, + 0x30d5, 0x309a, 0x30d8, 0x3099, 0x30d8, 0x309a, 0x30db, 0x3099, + 0x30db, 0x309a, 0x30a6, 0x3099, 0x30ef, 0x3099, 0x30f0, 0x3099, + 0x30f1, 0x3099, 0x30f2, 0x3099, 0x30fd, 0x3099, 0x05f2, 0x05b7, + 0x05e9, 0x05c1, 0x05e9, 0x05c2, 0x05e9, 0x05bc, 0x05c1, 0x05e9, + 0x05bc, 0x05c2, 0x05d0, 0x05b7, 0x05d0, 0x05b8, 0x05d0, 0x05bc, + 0x05d1, 0x05bc, 0x05d2, 0x05bc, 0x05d3, 0x05bc, 0x05d4, 0x05bc, + 0x05d5, 0x05bc, 0x05d6, 0x05bc, 0x05d8, 0x05bc, 0x05d9, 0x05bc, + 0x05da, 0x05bc, 0x05db, 0x05bc, 0x05dc, 0x05bc, 0x05de, 0x05bc, + 0x05e0, 0x05bc, 0x05e1, 0x05bc, 0x05e3, 0x05bc, 0x05e4, 0x05bc, + 0x05e6, 0x05bc, 0x05e7, 0x05bc, 0x05e8, 0x05bc, 0x05e9, 0x05bc, + 0x05ea, 0x05bc, 0x05d5, 0x05b9, 0x05d1, 0x05bf, 0x05db, 0x05bf, + 0x05e4, 0x05bf +}; + +u16 hfsplus_compose_table[] = { + /* base */ + 0x0000, 0x0050, 0x0300, 0x00a4, 0x0301, 0x00e4, 0x0302, 0x015c, + 0x0303, 0x0192, 0x0304, 0x01b4, 0x0306, 0x01e6, 0x0307, 0x0220, + 0x0308, 0x0270, 0x0309, 0x02d2, 0x030a, 0x02ec, 0x030b, 0x02fa, + 0x030c, 0x0308, 0x030d, 0x034c, 0x030f, 0x0370, 0x0311, 0x038e, + 0x0313, 0x03a8, 0x0314, 0x03c6, 0x031b, 0x03e8, 0x0323, 0x03f2, + 0x0324, 0x0440, 0x0325, 0x0446, 0x0327, 0x044c, 0x0328, 0x047a, + 0x032d, 0x0490, 0x032e, 0x04aa, 0x0330, 0x04b0, 0x0331, 
0x04be, + 0x0342, 0x04e2, 0x0345, 0x04f4, 0x05b7, 0x0504, 0x05b8, 0x050a, + 0x05b9, 0x050e, 0x05bc, 0x0512, 0x05bf, 0x0540, 0x05c1, 0x0548, + 0x05c2, 0x054c, 0x093c, 0x0550, 0x09bc, 0x0568, 0x09be, 0x0572, + 0x09d7, 0x0576, 0x0a3c, 0x057a, 0x0b3c, 0x0586, 0x0b3e, 0x058e, + 0x0b56, 0x0592, 0x0b57, 0x0596, 0x0bbe, 0x059a, 0x0bd7, 0x05a0, + 0x0c56, 0x05a6, 0x0cc2, 0x05aa, 0x0cd5, 0x05ae, 0x0cd6, 0x05b4, + 0x0d3e, 0x05b8, 0x0d57, 0x05be, 0x0e32, 0x05c2, 0x0eb2, 0x05c6, + 0x0f71, 0x05ca, 0x0f80, 0x05d2, 0x0fb5, 0x05d8, 0x0fb7, 0x05de, + 0x1100, 0x00a2, 0x1101, 0x00a2, 0x1102, 0x00a2, 0x1103, 0x00a2, + 0x1104, 0x00a2, 0x1105, 0x00a2, 0x1106, 0x00a2, 0x1107, 0x00a2, + 0x1108, 0x00a2, 0x1109, 0x00a2, 0x110a, 0x00a2, 0x110b, 0x00a2, + 0x110c, 0x00a2, 0x110d, 0x00a2, 0x110e, 0x00a2, 0x110f, 0x00a2, + 0x1110, 0x00a2, 0x1111, 0x00a2, 0x1112, 0x00a2, 0x3099, 0x05f4, + 0x309a, 0x0656, + /* hangul marker */ + 0xffff, 0x0000, + /* 0x0300 */ + 0x0340, 0x001f, 0x0041, 0x066c, 0x0045, 0x066e, 0x0049, 0x0670, + 0x004f, 0x0672, 0x0055, 0x0674, 0x0057, 0x0676, 0x0059, 0x0678, + 0x0061, 0x067a, 0x0065, 0x067c, 0x0069, 0x067e, 0x006f, 0x0680, + 0x0075, 0x0682, 0x0077, 0x0684, 0x0079, 0x0686, 0x00a8, 0x0688, + 0x0391, 0x068a, 0x0395, 0x068c, 0x0397, 0x068e, 0x0399, 0x0690, + 0x039f, 0x0692, 0x03a5, 0x0694, 0x03a9, 0x0696, 0x03b1, 0x0698, + 0x03b5, 0x069a, 0x03b7, 0x069c, 0x03b9, 0x069e, 0x03bf, 0x06a0, + 0x03c5, 0x06a2, 0x03c9, 0x06a4, 0x1fbf, 0x06a6, 0x1ffe, 0x06a8, + /* 0x0301 */ + 0x0341, 0x003b, 0x0041, 0x06aa, 0x0043, 0x06ac, 0x0045, 0x06ae, + 0x0047, 0x06b0, 0x0049, 0x06b2, 0x004b, 0x06b4, 0x004c, 0x06b6, + 0x004d, 0x06b8, 0x004e, 0x06ba, 0x004f, 0x06bc, 0x0050, 0x06be, + 0x0052, 0x06c0, 0x0053, 0x06c2, 0x0055, 0x06c6, 0x0057, 0x06c8, + 0x0059, 0x06ca, 0x005a, 0x06cc, 0x0061, 0x06ce, 0x0063, 0x06d0, + 0x0065, 0x06d2, 0x0067, 0x06d4, 0x0069, 0x06d6, 0x006b, 0x06d8, + 0x006c, 0x06da, 0x006d, 0x06dc, 0x006e, 0x06de, 0x006f, 0x06e0, + 0x0070, 0x06e2, 0x0072, 0x06e4, 0x0073, 0x06e6, 0x0075, 0x06ea, + 0x0077, 0x06ec, 0x0079, 0x06ee, 0x007a, 0x06f0, 0x00a8, 0x06f2, + 0x00c6, 0x06f4, 0x00d8, 0x06f6, 0x00e6, 0x06f8, 0x00f8, 0x06fa, + 0x0391, 0x06fc, 0x0395, 0x06fe, 0x0397, 0x0700, 0x0399, 0x0702, + 0x039f, 0x0704, 0x03a5, 0x0706, 0x03a9, 0x0708, 0x03b1, 0x070a, + 0x03b5, 0x070c, 0x03b7, 0x070e, 0x03b9, 0x0710, 0x03bf, 0x0712, + 0x03c5, 0x0714, 0x03c9, 0x0716, 0x0413, 0x0718, 0x041a, 0x071a, + 0x0433, 0x071c, 0x043a, 0x071e, 0x1fbf, 0x0720, 0x1ffe, 0x0722, + /* 0x0302 */ + 0x0000, 0x001a, 0x0041, 0x0724, 0x0043, 0x072e, 0x0045, 0x0730, + 0x0047, 0x073a, 0x0048, 0x073c, 0x0049, 0x073e, 0x004a, 0x0740, + 0x004f, 0x0742, 0x0053, 0x074c, 0x0055, 0x074e, 0x0057, 0x0750, + 0x0059, 0x0752, 0x005a, 0x0754, 0x0061, 0x0756, 0x0063, 0x0760, + 0x0065, 0x0762, 0x0067, 0x076c, 0x0068, 0x076e, 0x0069, 0x0770, + 0x006a, 0x0772, 0x006f, 0x0774, 0x0073, 0x077e, 0x0075, 0x0780, + 0x0077, 0x0782, 0x0079, 0x0784, 0x007a, 0x0786, + /* 0x0303 */ + 0x0000, 0x0010, 0x0041, 0x0788, 0x0045, 0x078a, 0x0049, 0x078c, + 0x004e, 0x078e, 0x004f, 0x0790, 0x0055, 0x0796, 0x0056, 0x079a, + 0x0059, 0x079c, 0x0061, 0x079e, 0x0065, 0x07a0, 0x0069, 0x07a2, + 0x006e, 0x07a4, 0x006f, 0x07a6, 0x0075, 0x07ac, 0x0076, 0x07b0, + 0x0079, 0x07b2, + /* 0x0304 */ + 0x0000, 0x0018, 0x0041, 0x07b4, 0x0045, 0x07b6, 0x0047, 0x07bc, + 0x0049, 0x07be, 0x004f, 0x07c0, 0x0055, 0x07c6, 0x0061, 0x07ca, + 0x0065, 0x07cc, 0x0067, 0x07d2, 0x0069, 0x07d4, 0x006f, 0x07d6, + 0x0075, 0x07dc, 0x00c6, 0x07e0, 0x00e6, 0x07e2, 0x0391, 0x07e4, + 0x0399, 0x07e6, 0x03a5, 0x07e8, 0x03b1, 
0x07ea, 0x03b9, 0x07ec, + 0x03c5, 0x07ee, 0x0418, 0x07f0, 0x0423, 0x07f2, 0x0438, 0x07f4, + 0x0443, 0x07f6, + /* 0x0306 */ + 0x0000, 0x001c, 0x0041, 0x07f8, 0x0045, 0x0802, 0x0047, 0x0804, + 0x0049, 0x0806, 0x004f, 0x0808, 0x0055, 0x080a, 0x0061, 0x080c, + 0x0065, 0x0816, 0x0067, 0x0818, 0x0069, 0x081a, 0x006f, 0x081c, + 0x0075, 0x081e, 0x0391, 0x0820, 0x0399, 0x0822, 0x03a5, 0x0824, + 0x03b1, 0x0826, 0x03b9, 0x0828, 0x03c5, 0x082a, 0x0410, 0x082c, + 0x0415, 0x082e, 0x0416, 0x0830, 0x0418, 0x0832, 0x0423, 0x0834, + 0x0430, 0x0836, 0x0435, 0x0838, 0x0436, 0x083a, 0x0438, 0x083c, + 0x0443, 0x083e, + /* 0x0307 */ + 0x0000, 0x0027, 0x0041, 0x0840, 0x0042, 0x0844, 0x0043, 0x0846, + 0x0044, 0x0848, 0x0045, 0x084a, 0x0046, 0x084c, 0x0047, 0x084e, + 0x0048, 0x0850, 0x0049, 0x0852, 0x004d, 0x0854, 0x004e, 0x0856, + 0x0050, 0x0858, 0x0052, 0x085a, 0x0053, 0x085c, 0x0054, 0x085e, + 0x0057, 0x0860, 0x0058, 0x0862, 0x0059, 0x0864, 0x005a, 0x0866, + 0x0061, 0x0868, 0x0062, 0x086c, 0x0063, 0x086e, 0x0064, 0x0870, + 0x0065, 0x0872, 0x0066, 0x0874, 0x0067, 0x0876, 0x0068, 0x0878, + 0x006d, 0x087a, 0x006e, 0x087c, 0x0070, 0x087e, 0x0072, 0x0880, + 0x0073, 0x0882, 0x0074, 0x0884, 0x0077, 0x0886, 0x0078, 0x0888, + 0x0079, 0x088a, 0x007a, 0x088c, 0x017f, 0x088e, 0x0306, 0x0890, + /* 0x0308 */ + 0x0000, 0x0030, 0x0041, 0x0892, 0x0045, 0x0896, 0x0048, 0x0898, + 0x0049, 0x089a, 0x004f, 0x089e, 0x0055, 0x08a0, 0x0057, 0x08aa, + 0x0058, 0x08ac, 0x0059, 0x08ae, 0x0061, 0x08b0, 0x0065, 0x08b4, + 0x0068, 0x08b6, 0x0069, 0x08b8, 0x006f, 0x08bc, 0x0074, 0x08be, + 0x0075, 0x08c0, 0x0077, 0x08ca, 0x0078, 0x08cc, 0x0079, 0x08ce, + 0x018f, 0x08d0, 0x019f, 0x08d2, 0x0259, 0x08d4, 0x0275, 0x08d6, + 0x0399, 0x08d8, 0x03a5, 0x08da, 0x03b9, 0x08dc, 0x03c5, 0x08e6, + 0x03d2, 0x08f0, 0x0406, 0x08f2, 0x0410, 0x08f4, 0x0415, 0x08f6, + 0x0416, 0x08f8, 0x0417, 0x08fa, 0x0418, 0x08fc, 0x041e, 0x08fe, + 0x0423, 0x0900, 0x0427, 0x0902, 0x042b, 0x0904, 0x0430, 0x0906, + 0x0435, 0x0908, 0x0436, 0x090a, 0x0437, 0x090c, 0x0438, 0x090e, + 0x043e, 0x0910, 0x0443, 0x0912, 0x0447, 0x0914, 0x044b, 0x0916, + 0x0456, 0x0918, + /* 0x0309 */ + 0x0000, 0x000c, 0x0041, 0x091a, 0x0045, 0x091c, 0x0049, 0x091e, + 0x004f, 0x0920, 0x0055, 0x0922, 0x0059, 0x0924, 0x0061, 0x0926, + 0x0065, 0x0928, 0x0069, 0x092a, 0x006f, 0x092c, 0x0075, 0x092e, + 0x0079, 0x0930, + /* 0x030a */ + 0x0000, 0x0006, 0x0041, 0x0932, 0x0055, 0x0936, 0x0061, 0x0938, + 0x0075, 0x093c, 0x0077, 0x093e, 0x0079, 0x0940, + /* 0x030b */ + 0x0000, 0x0006, 0x004f, 0x0942, 0x0055, 0x0944, 0x006f, 0x0946, + 0x0075, 0x0948, 0x0423, 0x094a, 0x0443, 0x094c, + /* 0x030c */ + 0x0000, 0x0021, 0x0041, 0x094e, 0x0043, 0x0950, 0x0044, 0x0952, + 0x0045, 0x0954, 0x0047, 0x0956, 0x0049, 0x0958, 0x004b, 0x095a, + 0x004c, 0x095c, 0x004e, 0x095e, 0x004f, 0x0960, 0x0052, 0x0962, + 0x0053, 0x0964, 0x0054, 0x0968, 0x0055, 0x096a, 0x005a, 0x096c, + 0x0061, 0x096e, 0x0063, 0x0970, 0x0064, 0x0972, 0x0065, 0x0974, + 0x0067, 0x0976, 0x0069, 0x0978, 0x006a, 0x097a, 0x006b, 0x097c, + 0x006c, 0x097e, 0x006e, 0x0980, 0x006f, 0x0982, 0x0072, 0x0984, + 0x0073, 0x0986, 0x0074, 0x098a, 0x0075, 0x098c, 0x007a, 0x098e, + 0x01b7, 0x0990, 0x0292, 0x0992, + /* 0x030d */ + 0x0000, 0x0011, 0x00a8, 0x0994, 0x0308, 0x0996, 0x0391, 0x0998, + 0x0395, 0x099a, 0x0397, 0x099c, 0x0399, 0x099e, 0x039f, 0x09a0, + 0x03a5, 0x09a2, 0x03a9, 0x09a4, 0x03b1, 0x09a6, 0x03b5, 0x09a8, + 0x03b7, 0x09aa, 0x03b9, 0x09ac, 0x03bf, 0x09ae, 0x03c5, 0x09b0, + 0x03c9, 0x09b2, 0x03d2, 0x09b4, + /* 0x030f */ + 0x0000, 0x000e, 0x0041, 0x09b6, 0x0045, 0x09b8, 
0x0049, 0x09ba, + 0x004f, 0x09bc, 0x0052, 0x09be, 0x0055, 0x09c0, 0x0061, 0x09c2, + 0x0065, 0x09c4, 0x0069, 0x09c6, 0x006f, 0x09c8, 0x0072, 0x09ca, + 0x0075, 0x09cc, 0x0474, 0x09ce, 0x0475, 0x09d0, + /* 0x0311 */ + 0x0000, 0x000c, 0x0041, 0x09d2, 0x0045, 0x09d4, 0x0049, 0x09d6, + 0x004f, 0x09d8, 0x0052, 0x09da, 0x0055, 0x09dc, 0x0061, 0x09de, + 0x0065, 0x09e0, 0x0069, 0x09e2, 0x006f, 0x09e4, 0x0072, 0x09e6, + 0x0075, 0x09e8, + /* 0x0313 */ + 0x0343, 0x000e, 0x0391, 0x09ea, 0x0395, 0x09f2, 0x0397, 0x09f8, + 0x0399, 0x0a00, 0x039f, 0x0a08, 0x03a9, 0x0a0e, 0x03b1, 0x0a16, + 0x03b5, 0x0a1e, 0x03b7, 0x0a24, 0x03b9, 0x0a2c, 0x03bf, 0x0a34, + 0x03c1, 0x0a3a, 0x03c5, 0x0a3c, 0x03c9, 0x0a44, + /* 0x0314 */ + 0x0000, 0x0010, 0x0391, 0x0a4c, 0x0395, 0x0a54, 0x0397, 0x0a5a, + 0x0399, 0x0a62, 0x039f, 0x0a6a, 0x03a1, 0x0a70, 0x03a5, 0x0a72, + 0x03a9, 0x0a7a, 0x03b1, 0x0a82, 0x03b5, 0x0a8a, 0x03b7, 0x0a90, + 0x03b9, 0x0a98, 0x03bf, 0x0aa0, 0x03c1, 0x0aa6, 0x03c5, 0x0aa8, + 0x03c9, 0x0ab0, + /* 0x031b */ + 0x0000, 0x0004, 0x004f, 0x0ab8, 0x0055, 0x0ac4, 0x006f, 0x0ad0, + 0x0075, 0x0adc, + /* 0x0323 */ + 0x0000, 0x0026, 0x0041, 0x0ae8, 0x0042, 0x0aee, 0x0044, 0x0af0, + 0x0045, 0x0af2, 0x0048, 0x0af6, 0x0049, 0x0af8, 0x004b, 0x0afa, + 0x004c, 0x0afc, 0x004d, 0x0b00, 0x004e, 0x0b02, 0x004f, 0x0b04, + 0x0052, 0x0b08, 0x0053, 0x0b0c, 0x0054, 0x0b10, 0x0055, 0x0b12, + 0x0056, 0x0b14, 0x0057, 0x0b16, 0x0059, 0x0b18, 0x005a, 0x0b1a, + 0x0061, 0x0b1c, 0x0062, 0x0b22, 0x0064, 0x0b24, 0x0065, 0x0b26, + 0x0068, 0x0b2a, 0x0069, 0x0b2c, 0x006b, 0x0b2e, 0x006c, 0x0b30, + 0x006d, 0x0b34, 0x006e, 0x0b36, 0x006f, 0x0b38, 0x0072, 0x0b3c, + 0x0073, 0x0b40, 0x0074, 0x0b44, 0x0075, 0x0b46, 0x0076, 0x0b48, + 0x0077, 0x0b4a, 0x0079, 0x0b4c, 0x007a, 0x0b4e, + /* 0x0324 */ + 0x0000, 0x0002, 0x0055, 0x0b50, 0x0075, 0x0b52, + /* 0x0325 */ + 0x0000, 0x0002, 0x0041, 0x0b54, 0x0061, 0x0b56, + /* 0x0327 */ + 0x0000, 0x0016, 0x0043, 0x0b58, 0x0044, 0x0b5c, 0x0045, 0x0b5e, + 0x0047, 0x0b62, 0x0048, 0x0b64, 0x004b, 0x0b66, 0x004c, 0x0b68, + 0x004e, 0x0b6a, 0x0052, 0x0b6c, 0x0053, 0x0b6e, 0x0054, 0x0b70, + 0x0063, 0x0b72, 0x0064, 0x0b76, 0x0065, 0x0b78, 0x0067, 0x0b7c, + 0x0068, 0x0b7e, 0x006b, 0x0b80, 0x006c, 0x0b82, 0x006e, 0x0b84, + 0x0072, 0x0b86, 0x0073, 0x0b88, 0x0074, 0x0b8a, + /* 0x0328 */ + 0x0000, 0x000a, 0x0041, 0x0b8c, 0x0045, 0x0b8e, 0x0049, 0x0b90, + 0x004f, 0x0b92, 0x0055, 0x0b96, 0x0061, 0x0b98, 0x0065, 0x0b9a, + 0x0069, 0x0b9c, 0x006f, 0x0b9e, 0x0075, 0x0ba2, + /* 0x032d */ + 0x0000, 0x000c, 0x0044, 0x0ba4, 0x0045, 0x0ba6, 0x004c, 0x0ba8, + 0x004e, 0x0baa, 0x0054, 0x0bac, 0x0055, 0x0bae, 0x0064, 0x0bb0, + 0x0065, 0x0bb2, 0x006c, 0x0bb4, 0x006e, 0x0bb6, 0x0074, 0x0bb8, + 0x0075, 0x0bba, + /* 0x032e */ + 0x0000, 0x0002, 0x0048, 0x0bbc, 0x0068, 0x0bbe, + /* 0x0330 */ + 0x0000, 0x0006, 0x0045, 0x0bc0, 0x0049, 0x0bc2, 0x0055, 0x0bc4, + 0x0065, 0x0bc6, 0x0069, 0x0bc8, 0x0075, 0x0bca, + /* 0x0331 */ + 0x0000, 0x0011, 0x0042, 0x0bcc, 0x0044, 0x0bce, 0x004b, 0x0bd0, + 0x004c, 0x0bd2, 0x004e, 0x0bd4, 0x0052, 0x0bd6, 0x0054, 0x0bd8, + 0x005a, 0x0bda, 0x0062, 0x0bdc, 0x0064, 0x0bde, 0x0068, 0x0be0, + 0x006b, 0x0be2, 0x006c, 0x0be4, 0x006e, 0x0be6, 0x0072, 0x0be8, + 0x0074, 0x0bea, 0x007a, 0x0bec, + /* 0x0342 */ + 0x0000, 0x0008, 0x00a8, 0x0bee, 0x03b1, 0x0bf0, 0x03b7, 0x0bf2, + 0x03b9, 0x0bf4, 0x03c5, 0x0bf6, 0x03c9, 0x0bf8, 0x1fbf, 0x0bfa, + 0x1ffe, 0x0bfc, + /* 0x0345 */ + 0x0000, 0x0007, 0x0391, 0x0bfe, 0x0397, 0x0c04, 0x03a9, 0x0c0a, + 0x03b1, 0x0c10, 0x03b7, 0x0c1c, 0x03bf, 0x0c28, 0x03c9, 0x0c2c, + /* 0x05b7 */ + 0x0000, 0x0002, 
0x05d0, 0x0c36, 0x05f2, 0x0c38, + /* 0x05b8 */ + 0x0000, 0x0001, 0x05d0, 0x0c3a, + /* 0x05b9 */ + 0x0000, 0x0001, 0x05d5, 0x0c3c, + /* 0x05bc */ + 0x0000, 0x0016, 0x05d0, 0x0c3e, 0x05d1, 0x0c40, 0x05d2, 0x0c42, + 0x05d3, 0x0c44, 0x05d4, 0x0c46, 0x05d5, 0x0c48, 0x05d6, 0x0c4a, + 0x05d8, 0x0c4c, 0x05d9, 0x0c4e, 0x05da, 0x0c50, 0x05db, 0x0c52, + 0x05dc, 0x0c54, 0x05de, 0x0c56, 0x05e0, 0x0c58, 0x05e1, 0x0c5a, + 0x05e3, 0x0c5c, 0x05e4, 0x0c5e, 0x05e6, 0x0c60, 0x05e7, 0x0c62, + 0x05e8, 0x0c64, 0x05e9, 0x0c66, 0x05ea, 0x0c6c, + /* 0x05bf */ + 0x0000, 0x0003, 0x05d1, 0x0c6e, 0x05db, 0x0c70, 0x05e4, 0x0c72, + /* 0x05c1 */ + 0x0000, 0x0001, 0x05e9, 0x0c74, + /* 0x05c2 */ + 0x0000, 0x0001, 0x05e9, 0x0c76, + /* 0x093c */ + 0x0000, 0x000b, 0x0915, 0x0c78, 0x0916, 0x0c7a, 0x0917, 0x0c7c, + 0x091c, 0x0c7e, 0x0921, 0x0c80, 0x0922, 0x0c82, 0x0928, 0x0c84, + 0x092b, 0x0c86, 0x092f, 0x0c88, 0x0930, 0x0c8a, 0x0933, 0x0c8c, + /* 0x09bc */ + 0x0000, 0x0004, 0x09a1, 0x0c8e, 0x09a2, 0x0c90, 0x09ac, 0x0c92, + 0x09af, 0x0c94, + /* 0x09be */ + 0x0000, 0x0001, 0x09c7, 0x0c96, + /* 0x09d7 */ + 0x0000, 0x0001, 0x09c7, 0x0c98, + /* 0x0a3c */ + 0x0000, 0x0005, 0x0a16, 0x0c9a, 0x0a17, 0x0c9c, 0x0a1c, 0x0c9e, + 0x0a21, 0x0ca0, 0x0a2b, 0x0ca2, + /* 0x0b3c */ + 0x0000, 0x0003, 0x0b21, 0x0ca4, 0x0b22, 0x0ca6, 0x0b2f, 0x0ca8, + /* 0x0b3e */ + 0x0000, 0x0001, 0x0b47, 0x0caa, + /* 0x0b56 */ + 0x0000, 0x0001, 0x0b47, 0x0cac, + /* 0x0b57 */ + 0x0000, 0x0001, 0x0b47, 0x0cae, + /* 0x0bbe */ + 0x0000, 0x0002, 0x0bc6, 0x0cb0, 0x0bc7, 0x0cb2, + /* 0x0bd7 */ + 0x0000, 0x0002, 0x0b92, 0x0cb4, 0x0bc6, 0x0cb6, + /* 0x0c56 */ + 0x0000, 0x0001, 0x0c46, 0x0cb8, + /* 0x0cc2 */ + 0x0000, 0x0001, 0x0cc6, 0x0cba, + /* 0x0cd5 */ + 0x0000, 0x0002, 0x0cbf, 0x0cbe, 0x0cc6, 0x0cc0, + /* 0x0cd6 */ + 0x0000, 0x0001, 0x0cc6, 0x0cc2, + /* 0x0d3e */ + 0x0000, 0x0002, 0x0d46, 0x0cc4, 0x0d47, 0x0cc6, + /* 0x0d57 */ + 0x0000, 0x0001, 0x0d46, 0x0cc8, + /* 0x0e32 */ + 0x0000, 0x0001, 0x0e4d, 0x0cca, + /* 0x0eb2 */ + 0x0000, 0x0001, 0x0ecd, 0x0ccc, + /* 0x0f71 */ + 0x0000, 0x0003, 0x0f72, 0x0cce, 0x0f74, 0x0cd0, 0x0f80, 0x0cd2, + /* 0x0f80 */ + 0x0000, 0x0002, 0x0fb2, 0x0cd4, 0x0fb3, 0x0cd8, + /* 0x0fb5 */ + 0x0000, 0x0002, 0x0f40, 0x0cdc, 0x0f90, 0x0cde, + /* 0x0fb7 */ + 0x0000, 0x000a, 0x0f42, 0x0ce0, 0x0f4c, 0x0ce2, 0x0f51, 0x0ce4, + 0x0f56, 0x0ce6, 0x0f5b, 0x0ce8, 0x0f92, 0x0cea, 0x0f9c, 0x0cec, + 0x0fa1, 0x0cee, 0x0fa6, 0x0cf0, 0x0fab, 0x0cf2, + /* 0x3099 */ + 0x0000, 0x0030, 0x3046, 0x0cf4, 0x304b, 0x0cf6, 0x304d, 0x0cf8, + 0x304f, 0x0cfa, 0x3051, 0x0cfc, 0x3053, 0x0cfe, 0x3055, 0x0d00, + 0x3057, 0x0d02, 0x3059, 0x0d04, 0x305b, 0x0d06, 0x305d, 0x0d08, + 0x305f, 0x0d0a, 0x3061, 0x0d0c, 0x3064, 0x0d0e, 0x3066, 0x0d10, + 0x3068, 0x0d12, 0x306f, 0x0d14, 0x3072, 0x0d16, 0x3075, 0x0d18, + 0x3078, 0x0d1a, 0x307b, 0x0d1c, 0x309d, 0x0d1e, 0x30a6, 0x0d20, + 0x30ab, 0x0d22, 0x30ad, 0x0d24, 0x30af, 0x0d26, 0x30b1, 0x0d28, + 0x30b3, 0x0d2a, 0x30b5, 0x0d2c, 0x30b7, 0x0d2e, 0x30b9, 0x0d30, + 0x30bb, 0x0d32, 0x30bd, 0x0d34, 0x30bf, 0x0d36, 0x30c1, 0x0d38, + 0x30c4, 0x0d3a, 0x30c6, 0x0d3c, 0x30c8, 0x0d3e, 0x30cf, 0x0d40, + 0x30d2, 0x0d42, 0x30d5, 0x0d44, 0x30d8, 0x0d46, 0x30db, 0x0d48, + 0x30ef, 0x0d4a, 0x30f0, 0x0d4c, 0x30f1, 0x0d4e, 0x30f2, 0x0d50, + 0x30fd, 0x0d52, + /* 0x309a */ + 0x0000, 0x000a, 0x306f, 0x0d54, 0x3072, 0x0d56, 0x3075, 0x0d58, + 0x3078, 0x0d5a, 0x307b, 0x0d5c, 0x30cf, 0x0d5e, 0x30d2, 0x0d60, + 0x30d5, 0x0d62, 0x30d8, 0x0d64, 0x30db, 0x0d66, + /* 0x0041 0x0300 */ + 0x00c0, 0x0000, + /* 0x0045 0x0300 */ + 0x00c8, 0x0000, + /* 0x0049 0x0300 */ + 0x00cc, 
0x0000, + /* 0x004f 0x0300 */ + 0x00d2, 0x0000, + /* 0x0055 0x0300 */ + 0x00d9, 0x0000, + /* 0x0057 0x0300 */ + 0x1e80, 0x0000, + /* 0x0059 0x0300 */ + 0x1ef2, 0x0000, + /* 0x0061 0x0300 */ + 0x00e0, 0x0000, + /* 0x0065 0x0300 */ + 0x00e8, 0x0000, + /* 0x0069 0x0300 */ + 0x00ec, 0x0000, + /* 0x006f 0x0300 */ + 0x00f2, 0x0000, + /* 0x0075 0x0300 */ + 0x00f9, 0x0000, + /* 0x0077 0x0300 */ + 0x1e81, 0x0000, + /* 0x0079 0x0300 */ + 0x1ef3, 0x0000, + /* 0x00a8 0x0300 */ + 0x1fed, 0x0000, + /* 0x0391 0x0300 */ + 0x1fba, 0x0000, + /* 0x0395 0x0300 */ + 0x1fc8, 0x0000, + /* 0x0397 0x0300 */ + 0x1fca, 0x0000, + /* 0x0399 0x0300 */ + 0x1fda, 0x0000, + /* 0x039f 0x0300 */ + 0x1ff8, 0x0000, + /* 0x03a5 0x0300 */ + 0x1fea, 0x0000, + /* 0x03a9 0x0300 */ + 0x1ffa, 0x0000, + /* 0x03b1 0x0300 */ + 0x1f70, 0x0000, + /* 0x03b5 0x0300 */ + 0x1f72, 0x0000, + /* 0x03b7 0x0300 */ + 0x1f74, 0x0000, + /* 0x03b9 0x0300 */ + 0x1f76, 0x0000, + /* 0x03bf 0x0300 */ + 0x1f78, 0x0000, + /* 0x03c5 0x0300 */ + 0x1f7a, 0x0000, + /* 0x03c9 0x0300 */ + 0x1f7c, 0x0000, + /* 0x1fbf 0x0300 */ + 0x1fcd, 0x0000, + /* 0x1ffe 0x0300 */ + 0x1fdd, 0x0000, + /* 0x0041 0x0301 */ + 0x00c1, 0x0000, + /* 0x0043 0x0301 */ + 0x0106, 0x0000, + /* 0x0045 0x0301 */ + 0x00c9, 0x0000, + /* 0x0047 0x0301 */ + 0x01f4, 0x0000, + /* 0x0049 0x0301 */ + 0x00cd, 0x0000, + /* 0x004b 0x0301 */ + 0x1e30, 0x0000, + /* 0x004c 0x0301 */ + 0x0139, 0x0000, + /* 0x004d 0x0301 */ + 0x1e3e, 0x0000, + /* 0x004e 0x0301 */ + 0x0143, 0x0000, + /* 0x004f 0x0301 */ + 0x00d3, 0x0000, + /* 0x0050 0x0301 */ + 0x1e54, 0x0000, + /* 0x0052 0x0301 */ + 0x0154, 0x0000, + /* 0x0053 0x0301 */ + 0x015a, 0x0001, 0x0307, 0x0d68, + /* 0x0055 0x0301 */ + 0x00da, 0x0000, + /* 0x0057 0x0301 */ + 0x1e82, 0x0000, + /* 0x0059 0x0301 */ + 0x00dd, 0x0000, + /* 0x005a 0x0301 */ + 0x0179, 0x0000, + /* 0x0061 0x0301 */ + 0x00e1, 0x0000, + /* 0x0063 0x0301 */ + 0x0107, 0x0000, + /* 0x0065 0x0301 */ + 0x00e9, 0x0000, + /* 0x0067 0x0301 */ + 0x01f5, 0x0000, + /* 0x0069 0x0301 */ + 0x00ed, 0x0000, + /* 0x006b 0x0301 */ + 0x1e31, 0x0000, + /* 0x006c 0x0301 */ + 0x013a, 0x0000, + /* 0x006d 0x0301 */ + 0x1e3f, 0x0000, + /* 0x006e 0x0301 */ + 0x0144, 0x0000, + /* 0x006f 0x0301 */ + 0x00f3, 0x0000, + /* 0x0070 0x0301 */ + 0x1e55, 0x0000, + /* 0x0072 0x0301 */ + 0x0155, 0x0000, + /* 0x0073 0x0301 */ + 0x015b, 0x0001, 0x0307, 0x0d6a, + /* 0x0075 0x0301 */ + 0x00fa, 0x0000, + /* 0x0077 0x0301 */ + 0x1e83, 0x0000, + /* 0x0079 0x0301 */ + 0x00fd, 0x0000, + /* 0x007a 0x0301 */ + 0x017a, 0x0000, + /* 0x00a8 0x0301 */ + 0x1fee, 0x0000, + /* 0x00c6 0x0301 */ + 0x01fc, 0x0000, + /* 0x00d8 0x0301 */ + 0x01fe, 0x0000, + /* 0x00e6 0x0301 */ + 0x01fd, 0x0000, + /* 0x00f8 0x0301 */ + 0x01ff, 0x0000, + /* 0x0391 0x0301 */ + 0x1fbb, 0x0000, + /* 0x0395 0x0301 */ + 0x1fc9, 0x0000, + /* 0x0397 0x0301 */ + 0x1fcb, 0x0000, + /* 0x0399 0x0301 */ + 0x1fdb, 0x0000, + /* 0x039f 0x0301 */ + 0x1ff9, 0x0000, + /* 0x03a5 0x0301 */ + 0x1feb, 0x0000, + /* 0x03a9 0x0301 */ + 0x1ffb, 0x0000, + /* 0x03b1 0x0301 */ + 0x1f71, 0x0000, + /* 0x03b5 0x0301 */ + 0x1f73, 0x0000, + /* 0x03b7 0x0301 */ + 0x1f75, 0x0000, + /* 0x03b9 0x0301 */ + 0x1f77, 0x0000, + /* 0x03bf 0x0301 */ + 0x1f79, 0x0000, + /* 0x03c5 0x0301 */ + 0x1f7b, 0x0000, + /* 0x03c9 0x0301 */ + 0x1f7d, 0x0000, + /* 0x0413 0x0301 */ + 0x0403, 0x0000, + /* 0x041a 0x0301 */ + 0x040c, 0x0000, + /* 0x0433 0x0301 */ + 0x0453, 0x0000, + /* 0x043a 0x0301 */ + 0x045c, 0x0000, + /* 0x1fbf 0x0301 */ + 0x1fce, 0x0000, + /* 0x1ffe 0x0301 */ + 0x1fde, 0x0000, + /* 0x0041 0x0302 */ + 0x00c2, 
0x0004, 0x0300, 0x0d6c, 0x0301, 0x0d6e, 0x0303, 0x0d70, + 0x0309, 0x0d72, + /* 0x0043 0x0302 */ + 0x0108, 0x0000, + /* 0x0045 0x0302 */ + 0x00ca, 0x0004, 0x0300, 0x0d74, 0x0301, 0x0d76, 0x0303, 0x0d78, + 0x0309, 0x0d7a, + /* 0x0047 0x0302 */ + 0x011c, 0x0000, + /* 0x0048 0x0302 */ + 0x0124, 0x0000, + /* 0x0049 0x0302 */ + 0x00ce, 0x0000, + /* 0x004a 0x0302 */ + 0x0134, 0x0000, + /* 0x004f 0x0302 */ + 0x00d4, 0x0004, 0x0300, 0x0d7c, 0x0301, 0x0d7e, 0x0303, 0x0d80, + 0x0309, 0x0d82, + /* 0x0053 0x0302 */ + 0x015c, 0x0000, + /* 0x0055 0x0302 */ + 0x00db, 0x0000, + /* 0x0057 0x0302 */ + 0x0174, 0x0000, + /* 0x0059 0x0302 */ + 0x0176, 0x0000, + /* 0x005a 0x0302 */ + 0x1e90, 0x0000, + /* 0x0061 0x0302 */ + 0x00e2, 0x0004, 0x0300, 0x0d84, 0x0301, 0x0d86, 0x0303, 0x0d88, + 0x0309, 0x0d8a, + /* 0x0063 0x0302 */ + 0x0109, 0x0000, + /* 0x0065 0x0302 */ + 0x00ea, 0x0004, 0x0300, 0x0d8c, 0x0301, 0x0d8e, 0x0303, 0x0d90, + 0x0309, 0x0d92, + /* 0x0067 0x0302 */ + 0x011d, 0x0000, + /* 0x0068 0x0302 */ + 0x0125, 0x0000, + /* 0x0069 0x0302 */ + 0x00ee, 0x0000, + /* 0x006a 0x0302 */ + 0x0135, 0x0000, + /* 0x006f 0x0302 */ + 0x00f4, 0x0004, 0x0300, 0x0d94, 0x0301, 0x0d96, 0x0303, 0x0d98, + 0x0309, 0x0d9a, + /* 0x0073 0x0302 */ + 0x015d, 0x0000, + /* 0x0075 0x0302 */ + 0x00fb, 0x0000, + /* 0x0077 0x0302 */ + 0x0175, 0x0000, + /* 0x0079 0x0302 */ + 0x0177, 0x0000, + /* 0x007a 0x0302 */ + 0x1e91, 0x0000, + /* 0x0041 0x0303 */ + 0x00c3, 0x0000, + /* 0x0045 0x0303 */ + 0x1ebc, 0x0000, + /* 0x0049 0x0303 */ + 0x0128, 0x0000, + /* 0x004e 0x0303 */ + 0x00d1, 0x0000, + /* 0x004f 0x0303 */ + 0x00d5, 0x0002, 0x0301, 0x0d9c, 0x0308, 0x0d9e, + /* 0x0055 0x0303 */ + 0x0168, 0x0001, 0x0301, 0x0da0, + /* 0x0056 0x0303 */ + 0x1e7c, 0x0000, + /* 0x0059 0x0303 */ + 0x1ef8, 0x0000, + /* 0x0061 0x0303 */ + 0x00e3, 0x0000, + /* 0x0065 0x0303 */ + 0x1ebd, 0x0000, + /* 0x0069 0x0303 */ + 0x0129, 0x0000, + /* 0x006e 0x0303 */ + 0x00f1, 0x0000, + /* 0x006f 0x0303 */ + 0x00f5, 0x0002, 0x0301, 0x0da2, 0x0308, 0x0da4, + /* 0x0075 0x0303 */ + 0x0169, 0x0001, 0x0301, 0x0da6, + /* 0x0076 0x0303 */ + 0x1e7d, 0x0000, + /* 0x0079 0x0303 */ + 0x1ef9, 0x0000, + /* 0x0041 0x0304 */ + 0x0100, 0x0000, + /* 0x0045 0x0304 */ + 0x0112, 0x0002, 0x0300, 0x0da8, 0x0301, 0x0daa, + /* 0x0047 0x0304 */ + 0x1e20, 0x0000, + /* 0x0049 0x0304 */ + 0x012a, 0x0000, + /* 0x004f 0x0304 */ + 0x014c, 0x0002, 0x0300, 0x0dac, 0x0301, 0x0dae, + /* 0x0055 0x0304 */ + 0x016a, 0x0001, 0x0308, 0x0db0, + /* 0x0061 0x0304 */ + 0x0101, 0x0000, + /* 0x0065 0x0304 */ + 0x0113, 0x0002, 0x0300, 0x0db2, 0x0301, 0x0db4, + /* 0x0067 0x0304 */ + 0x1e21, 0x0000, + /* 0x0069 0x0304 */ + 0x012b, 0x0000, + /* 0x006f 0x0304 */ + 0x014d, 0x0002, 0x0300, 0x0db6, 0x0301, 0x0db8, + /* 0x0075 0x0304 */ + 0x016b, 0x0001, 0x0308, 0x0dba, + /* 0x00c6 0x0304 */ + 0x01e2, 0x0000, + /* 0x00e6 0x0304 */ + 0x01e3, 0x0000, + /* 0x0391 0x0304 */ + 0x1fb9, 0x0000, + /* 0x0399 0x0304 */ + 0x1fd9, 0x0000, + /* 0x03a5 0x0304 */ + 0x1fe9, 0x0000, + /* 0x03b1 0x0304 */ + 0x1fb1, 0x0000, + /* 0x03b9 0x0304 */ + 0x1fd1, 0x0000, + /* 0x03c5 0x0304 */ + 0x1fe1, 0x0000, + /* 0x0418 0x0304 */ + 0x04e2, 0x0000, + /* 0x0423 0x0304 */ + 0x04ee, 0x0000, + /* 0x0438 0x0304 */ + 0x04e3, 0x0000, + /* 0x0443 0x0304 */ + 0x04ef, 0x0000, + /* 0x0041 0x0306 */ + 0x0102, 0x0004, 0x0300, 0x0dbc, 0x0301, 0x0dbe, 0x0303, 0x0dc0, + 0x0309, 0x0dc2, + /* 0x0045 0x0306 */ + 0x0114, 0x0000, + /* 0x0047 0x0306 */ + 0x011e, 0x0000, + /* 0x0049 0x0306 */ + 0x012c, 0x0000, + /* 0x004f 0x0306 */ + 0x014e, 0x0000, + /* 0x0055 0x0306 */ + 
0x016c, 0x0000, + /* 0x0061 0x0306 */ + 0x0103, 0x0004, 0x0300, 0x0dc4, 0x0301, 0x0dc6, 0x0303, 0x0dc8, + 0x0309, 0x0dca, + /* 0x0065 0x0306 */ + 0x0115, 0x0000, + /* 0x0067 0x0306 */ + 0x011f, 0x0000, + /* 0x0069 0x0306 */ + 0x012d, 0x0000, + /* 0x006f 0x0306 */ + 0x014f, 0x0000, + /* 0x0075 0x0306 */ + 0x016d, 0x0000, + /* 0x0391 0x0306 */ + 0x1fb8, 0x0000, + /* 0x0399 0x0306 */ + 0x1fd8, 0x0000, + /* 0x03a5 0x0306 */ + 0x1fe8, 0x0000, + /* 0x03b1 0x0306 */ + 0x1fb0, 0x0000, + /* 0x03b9 0x0306 */ + 0x1fd0, 0x0000, + /* 0x03c5 0x0306 */ + 0x1fe0, 0x0000, + /* 0x0410 0x0306 */ + 0x04d0, 0x0000, + /* 0x0415 0x0306 */ + 0x04d6, 0x0000, + /* 0x0416 0x0306 */ + 0x04c1, 0x0000, + /* 0x0418 0x0306 */ + 0x0419, 0x0000, + /* 0x0423 0x0306 */ + 0x040e, 0x0000, + /* 0x0430 0x0306 */ + 0x04d1, 0x0000, + /* 0x0435 0x0306 */ + 0x04d7, 0x0000, + /* 0x0436 0x0306 */ + 0x04c2, 0x0000, + /* 0x0438 0x0306 */ + 0x0439, 0x0000, + /* 0x0443 0x0306 */ + 0x045e, 0x0000, + /* 0x0041 0x0307 */ + 0x0000, 0x0001, 0x0304, 0x0dcc, + /* 0x0042 0x0307 */ + 0x1e02, 0x0000, + /* 0x0043 0x0307 */ + 0x010a, 0x0000, + /* 0x0044 0x0307 */ + 0x1e0a, 0x0000, + /* 0x0045 0x0307 */ + 0x0116, 0x0000, + /* 0x0046 0x0307 */ + 0x1e1e, 0x0000, + /* 0x0047 0x0307 */ + 0x0120, 0x0000, + /* 0x0048 0x0307 */ + 0x1e22, 0x0000, + /* 0x0049 0x0307 */ + 0x0130, 0x0000, + /* 0x004d 0x0307 */ + 0x1e40, 0x0000, + /* 0x004e 0x0307 */ + 0x1e44, 0x0000, + /* 0x0050 0x0307 */ + 0x1e56, 0x0000, + /* 0x0052 0x0307 */ + 0x1e58, 0x0000, + /* 0x0053 0x0307 */ + 0x1e60, 0x0000, + /* 0x0054 0x0307 */ + 0x1e6a, 0x0000, + /* 0x0057 0x0307 */ + 0x1e86, 0x0000, + /* 0x0058 0x0307 */ + 0x1e8a, 0x0000, + /* 0x0059 0x0307 */ + 0x1e8e, 0x0000, + /* 0x005a 0x0307 */ + 0x017b, 0x0000, + /* 0x0061 0x0307 */ + 0x0000, 0x0001, 0x0304, 0x0dce, + /* 0x0062 0x0307 */ + 0x1e03, 0x0000, + /* 0x0063 0x0307 */ + 0x010b, 0x0000, + /* 0x0064 0x0307 */ + 0x1e0b, 0x0000, + /* 0x0065 0x0307 */ + 0x0117, 0x0000, + /* 0x0066 0x0307 */ + 0x1e1f, 0x0000, + /* 0x0067 0x0307 */ + 0x0121, 0x0000, + /* 0x0068 0x0307 */ + 0x1e23, 0x0000, + /* 0x006d 0x0307 */ + 0x1e41, 0x0000, + /* 0x006e 0x0307 */ + 0x1e45, 0x0000, + /* 0x0070 0x0307 */ + 0x1e57, 0x0000, + /* 0x0072 0x0307 */ + 0x1e59, 0x0000, + /* 0x0073 0x0307 */ + 0x1e61, 0x0000, + /* 0x0074 0x0307 */ + 0x1e6b, 0x0000, + /* 0x0077 0x0307 */ + 0x1e87, 0x0000, + /* 0x0078 0x0307 */ + 0x1e8b, 0x0000, + /* 0x0079 0x0307 */ + 0x1e8f, 0x0000, + /* 0x007a 0x0307 */ + 0x017c, 0x0000, + /* 0x017f 0x0307 */ + 0x1e9b, 0x0000, + /* 0x0306 0x0307 */ + 0x0310, 0x0000, + /* 0x0041 0x0308 */ + 0x00c4, 0x0001, 0x0304, 0x0dd0, + /* 0x0045 0x0308 */ + 0x00cb, 0x0000, + /* 0x0048 0x0308 */ + 0x1e26, 0x0000, + /* 0x0049 0x0308 */ + 0x00cf, 0x0001, 0x0301, 0x0dd2, + /* 0x004f 0x0308 */ + 0x00d6, 0x0000, + /* 0x0055 0x0308 */ + 0x00dc, 0x0004, 0x0300, 0x0dd4, 0x0301, 0x0dd6, 0x0304, 0x0dd8, + 0x030c, 0x0dda, + /* 0x0057 0x0308 */ + 0x1e84, 0x0000, + /* 0x0058 0x0308 */ + 0x1e8c, 0x0000, + /* 0x0059 0x0308 */ + 0x0178, 0x0000, + /* 0x0061 0x0308 */ + 0x00e4, 0x0001, 0x0304, 0x0ddc, + /* 0x0065 0x0308 */ + 0x00eb, 0x0000, + /* 0x0068 0x0308 */ + 0x1e27, 0x0000, + /* 0x0069 0x0308 */ + 0x00ef, 0x0001, 0x0301, 0x0dde, + /* 0x006f 0x0308 */ + 0x00f6, 0x0000, + /* 0x0074 0x0308 */ + 0x1e97, 0x0000, + /* 0x0075 0x0308 */ + 0x00fc, 0x0004, 0x0300, 0x0de0, 0x0301, 0x0de2, 0x0304, 0x0de4, + 0x030c, 0x0de6, + /* 0x0077 0x0308 */ + 0x1e85, 0x0000, + /* 0x0078 0x0308 */ + 0x1e8d, 0x0000, + /* 0x0079 0x0308 */ + 0x00ff, 0x0000, + /* 0x018f 0x0308 */ + 0x04da, 0x0000, + /* 
0x019f 0x0308 */ + 0x04ea, 0x0000, + /* 0x0259 0x0308 */ + 0x04db, 0x0000, + /* 0x0275 0x0308 */ + 0x04eb, 0x0000, + /* 0x0399 0x0308 */ + 0x03aa, 0x0000, + /* 0x03a5 0x0308 */ + 0x03ab, 0x0000, + /* 0x03b9 0x0308 */ + 0x03ca, 0x0004, 0x0300, 0x0de8, 0x0301, 0x0dea, 0x030d, 0x0dec, + 0x0342, 0x0dee, + /* 0x03c5 0x0308 */ + 0x03cb, 0x0004, 0x0300, 0x0df0, 0x0301, 0x0df2, 0x030d, 0x0df4, + 0x0342, 0x0df6, + /* 0x03d2 0x0308 */ + 0x03d4, 0x0000, + /* 0x0406 0x0308 */ + 0x0407, 0x0000, + /* 0x0410 0x0308 */ + 0x04d2, 0x0000, + /* 0x0415 0x0308 */ + 0x0401, 0x0000, + /* 0x0416 0x0308 */ + 0x04dc, 0x0000, + /* 0x0417 0x0308 */ + 0x04de, 0x0000, + /* 0x0418 0x0308 */ + 0x04e4, 0x0000, + /* 0x041e 0x0308 */ + 0x04e6, 0x0000, + /* 0x0423 0x0308 */ + 0x04f0, 0x0000, + /* 0x0427 0x0308 */ + 0x04f4, 0x0000, + /* 0x042b 0x0308 */ + 0x04f8, 0x0000, + /* 0x0430 0x0308 */ + 0x04d3, 0x0000, + /* 0x0435 0x0308 */ + 0x0451, 0x0000, + /* 0x0436 0x0308 */ + 0x04dd, 0x0000, + /* 0x0437 0x0308 */ + 0x04df, 0x0000, + /* 0x0438 0x0308 */ + 0x04e5, 0x0000, + /* 0x043e 0x0308 */ + 0x04e7, 0x0000, + /* 0x0443 0x0308 */ + 0x04f1, 0x0000, + /* 0x0447 0x0308 */ + 0x04f5, 0x0000, + /* 0x044b 0x0308 */ + 0x04f9, 0x0000, + /* 0x0456 0x0308 */ + 0x0457, 0x0000, + /* 0x0041 0x0309 */ + 0x1ea2, 0x0000, + /* 0x0045 0x0309 */ + 0x1eba, 0x0000, + /* 0x0049 0x0309 */ + 0x1ec8, 0x0000, + /* 0x004f 0x0309 */ + 0x1ece, 0x0000, + /* 0x0055 0x0309 */ + 0x1ee6, 0x0000, + /* 0x0059 0x0309 */ + 0x1ef6, 0x0000, + /* 0x0061 0x0309 */ + 0x1ea3, 0x0000, + /* 0x0065 0x0309 */ + 0x1ebb, 0x0000, + /* 0x0069 0x0309 */ + 0x1ec9, 0x0000, + /* 0x006f 0x0309 */ + 0x1ecf, 0x0000, + /* 0x0075 0x0309 */ + 0x1ee7, 0x0000, + /* 0x0079 0x0309 */ + 0x1ef7, 0x0000, + /* 0x0041 0x030a */ + 0x00c5, 0x0001, 0x0301, 0x0df8, + /* 0x0055 0x030a */ + 0x016e, 0x0000, + /* 0x0061 0x030a */ + 0x00e5, 0x0001, 0x0301, 0x0dfa, + /* 0x0075 0x030a */ + 0x016f, 0x0000, + /* 0x0077 0x030a */ + 0x1e98, 0x0000, + /* 0x0079 0x030a */ + 0x1e99, 0x0000, + /* 0x004f 0x030b */ + 0x0150, 0x0000, + /* 0x0055 0x030b */ + 0x0170, 0x0000, + /* 0x006f 0x030b */ + 0x0151, 0x0000, + /* 0x0075 0x030b */ + 0x0171, 0x0000, + /* 0x0423 0x030b */ + 0x04f2, 0x0000, + /* 0x0443 0x030b */ + 0x04f3, 0x0000, + /* 0x0041 0x030c */ + 0x01cd, 0x0000, + /* 0x0043 0x030c */ + 0x010c, 0x0000, + /* 0x0044 0x030c */ + 0x010e, 0x0000, + /* 0x0045 0x030c */ + 0x011a, 0x0000, + /* 0x0047 0x030c */ + 0x01e6, 0x0000, + /* 0x0049 0x030c */ + 0x01cf, 0x0000, + /* 0x004b 0x030c */ + 0x01e8, 0x0000, + /* 0x004c 0x030c */ + 0x013d, 0x0000, + /* 0x004e 0x030c */ + 0x0147, 0x0000, + /* 0x004f 0x030c */ + 0x01d1, 0x0000, + /* 0x0052 0x030c */ + 0x0158, 0x0000, + /* 0x0053 0x030c */ + 0x0160, 0x0001, 0x0307, 0x0dfc, + /* 0x0054 0x030c */ + 0x0164, 0x0000, + /* 0x0055 0x030c */ + 0x01d3, 0x0000, + /* 0x005a 0x030c */ + 0x017d, 0x0000, + /* 0x0061 0x030c */ + 0x01ce, 0x0000, + /* 0x0063 0x030c */ + 0x010d, 0x0000, + /* 0x0064 0x030c */ + 0x010f, 0x0000, + /* 0x0065 0x030c */ + 0x011b, 0x0000, + /* 0x0067 0x030c */ + 0x01e7, 0x0000, + /* 0x0069 0x030c */ + 0x01d0, 0x0000, + /* 0x006a 0x030c */ + 0x01f0, 0x0000, + /* 0x006b 0x030c */ + 0x01e9, 0x0000, + /* 0x006c 0x030c */ + 0x013e, 0x0000, + /* 0x006e 0x030c */ + 0x0148, 0x0000, + /* 0x006f 0x030c */ + 0x01d2, 0x0000, + /* 0x0072 0x030c */ + 0x0159, 0x0000, + /* 0x0073 0x030c */ + 0x0161, 0x0001, 0x0307, 0x0dfe, + /* 0x0074 0x030c */ + 0x0165, 0x0000, + /* 0x0075 0x030c */ + 0x01d4, 0x0000, + /* 0x007a 0x030c */ + 0x017e, 0x0000, + /* 0x01b7 0x030c */ + 0x01ee, 0x0000, + 
/* 0x0292 0x030c */ + 0x01ef, 0x0000, + /* 0x00a8 0x030d */ + 0x0385, 0x0000, + /* 0x0308 0x030d */ + 0x0344, 0x0000, + /* 0x0391 0x030d */ + 0x0386, 0x0000, + /* 0x0395 0x030d */ + 0x0388, 0x0000, + /* 0x0397 0x030d */ + 0x0389, 0x0000, + /* 0x0399 0x030d */ + 0x038a, 0x0000, + /* 0x039f 0x030d */ + 0x038c, 0x0000, + /* 0x03a5 0x030d */ + 0x038e, 0x0000, + /* 0x03a9 0x030d */ + 0x038f, 0x0000, + /* 0x03b1 0x030d */ + 0x03ac, 0x0000, + /* 0x03b5 0x030d */ + 0x03ad, 0x0000, + /* 0x03b7 0x030d */ + 0x03ae, 0x0000, + /* 0x03b9 0x030d */ + 0x03af, 0x0000, + /* 0x03bf 0x030d */ + 0x03cc, 0x0000, + /* 0x03c5 0x030d */ + 0x03cd, 0x0000, + /* 0x03c9 0x030d */ + 0x03ce, 0x0000, + /* 0x03d2 0x030d */ + 0x03d3, 0x0000, + /* 0x0041 0x030f */ + 0x0200, 0x0000, + /* 0x0045 0x030f */ + 0x0204, 0x0000, + /* 0x0049 0x030f */ + 0x0208, 0x0000, + /* 0x004f 0x030f */ + 0x020c, 0x0000, + /* 0x0052 0x030f */ + 0x0210, 0x0000, + /* 0x0055 0x030f */ + 0x0214, 0x0000, + /* 0x0061 0x030f */ + 0x0201, 0x0000, + /* 0x0065 0x030f */ + 0x0205, 0x0000, + /* 0x0069 0x030f */ + 0x0209, 0x0000, + /* 0x006f 0x030f */ + 0x020d, 0x0000, + /* 0x0072 0x030f */ + 0x0211, 0x0000, + /* 0x0075 0x030f */ + 0x0215, 0x0000, + /* 0x0474 0x030f */ + 0x0476, 0x0000, + /* 0x0475 0x030f */ + 0x0477, 0x0000, + /* 0x0041 0x0311 */ + 0x0202, 0x0000, + /* 0x0045 0x0311 */ + 0x0206, 0x0000, + /* 0x0049 0x0311 */ + 0x020a, 0x0000, + /* 0x004f 0x0311 */ + 0x020e, 0x0000, + /* 0x0052 0x0311 */ + 0x0212, 0x0000, + /* 0x0055 0x0311 */ + 0x0216, 0x0000, + /* 0x0061 0x0311 */ + 0x0203, 0x0000, + /* 0x0065 0x0311 */ + 0x0207, 0x0000, + /* 0x0069 0x0311 */ + 0x020b, 0x0000, + /* 0x006f 0x0311 */ + 0x020f, 0x0000, + /* 0x0072 0x0311 */ + 0x0213, 0x0000, + /* 0x0075 0x0311 */ + 0x0217, 0x0000, + /* 0x0391 0x0313 */ + 0x1f08, 0x0003, 0x0300, 0x0e00, 0x0301, 0x0e02, 0x0342, 0x0e04, + /* 0x0395 0x0313 */ + 0x1f18, 0x0002, 0x0300, 0x0e06, 0x0301, 0x0e08, + /* 0x0397 0x0313 */ + 0x1f28, 0x0003, 0x0300, 0x0e0a, 0x0301, 0x0e0c, 0x0342, 0x0e0e, + /* 0x0399 0x0313 */ + 0x1f38, 0x0003, 0x0300, 0x0e10, 0x0301, 0x0e12, 0x0342, 0x0e14, + /* 0x039f 0x0313 */ + 0x1f48, 0x0002, 0x0300, 0x0e16, 0x0301, 0x0e18, + /* 0x03a9 0x0313 */ + 0x1f68, 0x0003, 0x0300, 0x0e1a, 0x0301, 0x0e1c, 0x0342, 0x0e1e, + /* 0x03b1 0x0313 */ + 0x1f00, 0x0003, 0x0300, 0x0e20, 0x0301, 0x0e22, 0x0342, 0x0e24, + /* 0x03b5 0x0313 */ + 0x1f10, 0x0002, 0x0300, 0x0e26, 0x0301, 0x0e28, + /* 0x03b7 0x0313 */ + 0x1f20, 0x0003, 0x0300, 0x0e2a, 0x0301, 0x0e2c, 0x0342, 0x0e2e, + /* 0x03b9 0x0313 */ + 0x1f30, 0x0003, 0x0300, 0x0e30, 0x0301, 0x0e32, 0x0342, 0x0e34, + /* 0x03bf 0x0313 */ + 0x1f40, 0x0002, 0x0300, 0x0e36, 0x0301, 0x0e38, + /* 0x03c1 0x0313 */ + 0x1fe4, 0x0000, + /* 0x03c5 0x0313 */ + 0x1f50, 0x0003, 0x0300, 0x0e3a, 0x0301, 0x0e3c, 0x0342, 0x0e3e, + /* 0x03c9 0x0313 */ + 0x1f60, 0x0003, 0x0300, 0x0e40, 0x0301, 0x0e42, 0x0342, 0x0e44, + /* 0x0391 0x0314 */ + 0x1f09, 0x0003, 0x0300, 0x0e46, 0x0301, 0x0e48, 0x0342, 0x0e4a, + /* 0x0395 0x0314 */ + 0x1f19, 0x0002, 0x0300, 0x0e4c, 0x0301, 0x0e4e, + /* 0x0397 0x0314 */ + 0x1f29, 0x0003, 0x0300, 0x0e50, 0x0301, 0x0e52, 0x0342, 0x0e54, + /* 0x0399 0x0314 */ + 0x1f39, 0x0003, 0x0300, 0x0e56, 0x0301, 0x0e58, 0x0342, 0x0e5a, + /* 0x039f 0x0314 */ + 0x1f49, 0x0002, 0x0300, 0x0e5c, 0x0301, 0x0e5e, + /* 0x03a1 0x0314 */ + 0x1fec, 0x0000, + /* 0x03a5 0x0314 */ + 0x1f59, 0x0003, 0x0300, 0x0e60, 0x0301, 0x0e62, 0x0342, 0x0e64, + /* 0x03a9 0x0314 */ + 0x1f69, 0x0003, 0x0300, 0x0e66, 0x0301, 0x0e68, 0x0342, 0x0e6a, + /* 0x03b1 0x0314 */ + 0x1f01, 0x0003, 0x0300, 
0x0e6c, 0x0301, 0x0e6e, 0x0342, 0x0e70, + /* 0x03b5 0x0314 */ + 0x1f11, 0x0002, 0x0300, 0x0e72, 0x0301, 0x0e74, + /* 0x03b7 0x0314 */ + 0x1f21, 0x0003, 0x0300, 0x0e76, 0x0301, 0x0e78, 0x0342, 0x0e7a, + /* 0x03b9 0x0314 */ + 0x1f31, 0x0003, 0x0300, 0x0e7c, 0x0301, 0x0e7e, 0x0342, 0x0e80, + /* 0x03bf 0x0314 */ + 0x1f41, 0x0002, 0x0300, 0x0e82, 0x0301, 0x0e84, + /* 0x03c1 0x0314 */ + 0x1fe5, 0x0000, + /* 0x03c5 0x0314 */ + 0x1f51, 0x0003, 0x0300, 0x0e86, 0x0301, 0x0e88, 0x0342, 0x0e8a, + /* 0x03c9 0x0314 */ + 0x1f61, 0x0003, 0x0300, 0x0e8c, 0x0301, 0x0e8e, 0x0342, 0x0e90, + /* 0x004f 0x031b */ + 0x01a0, 0x0005, 0x0300, 0x0e92, 0x0301, 0x0e94, 0x0303, 0x0e96, + 0x0309, 0x0e98, 0x0323, 0x0e9a, + /* 0x0055 0x031b */ + 0x01af, 0x0005, 0x0300, 0x0e9c, 0x0301, 0x0e9e, 0x0303, 0x0ea0, + 0x0309, 0x0ea2, 0x0323, 0x0ea4, + /* 0x006f 0x031b */ + 0x01a1, 0x0005, 0x0300, 0x0ea6, 0x0301, 0x0ea8, 0x0303, 0x0eaa, + 0x0309, 0x0eac, 0x0323, 0x0eae, + /* 0x0075 0x031b */ + 0x01b0, 0x0005, 0x0300, 0x0eb0, 0x0301, 0x0eb2, 0x0303, 0x0eb4, + 0x0309, 0x0eb6, 0x0323, 0x0eb8, + /* 0x0041 0x0323 */ + 0x1ea0, 0x0002, 0x0302, 0x0eba, 0x0306, 0x0ebc, + /* 0x0042 0x0323 */ + 0x1e04, 0x0000, + /* 0x0044 0x0323 */ + 0x1e0c, 0x0000, + /* 0x0045 0x0323 */ + 0x1eb8, 0x0001, 0x0302, 0x0ebe, + /* 0x0048 0x0323 */ + 0x1e24, 0x0000, + /* 0x0049 0x0323 */ + 0x1eca, 0x0000, + /* 0x004b 0x0323 */ + 0x1e32, 0x0000, + /* 0x004c 0x0323 */ + 0x1e36, 0x0001, 0x0304, 0x0ec0, + /* 0x004d 0x0323 */ + 0x1e42, 0x0000, + /* 0x004e 0x0323 */ + 0x1e46, 0x0000, + /* 0x004f 0x0323 */ + 0x1ecc, 0x0001, 0x0302, 0x0ec2, + /* 0x0052 0x0323 */ + 0x1e5a, 0x0001, 0x0304, 0x0ec4, + /* 0x0053 0x0323 */ + 0x1e62, 0x0001, 0x0307, 0x0ec6, + /* 0x0054 0x0323 */ + 0x1e6c, 0x0000, + /* 0x0055 0x0323 */ + 0x1ee4, 0x0000, + /* 0x0056 0x0323 */ + 0x1e7e, 0x0000, + /* 0x0057 0x0323 */ + 0x1e88, 0x0000, + /* 0x0059 0x0323 */ + 0x1ef4, 0x0000, + /* 0x005a 0x0323 */ + 0x1e92, 0x0000, + /* 0x0061 0x0323 */ + 0x1ea1, 0x0002, 0x0302, 0x0ec8, 0x0306, 0x0eca, + /* 0x0062 0x0323 */ + 0x1e05, 0x0000, + /* 0x0064 0x0323 */ + 0x1e0d, 0x0000, + /* 0x0065 0x0323 */ + 0x1eb9, 0x0001, 0x0302, 0x0ecc, + /* 0x0068 0x0323 */ + 0x1e25, 0x0000, + /* 0x0069 0x0323 */ + 0x1ecb, 0x0000, + /* 0x006b 0x0323 */ + 0x1e33, 0x0000, + /* 0x006c 0x0323 */ + 0x1e37, 0x0001, 0x0304, 0x0ece, + /* 0x006d 0x0323 */ + 0x1e43, 0x0000, + /* 0x006e 0x0323 */ + 0x1e47, 0x0000, + /* 0x006f 0x0323 */ + 0x1ecd, 0x0001, 0x0302, 0x0ed0, + /* 0x0072 0x0323 */ + 0x1e5b, 0x0001, 0x0304, 0x0ed2, + /* 0x0073 0x0323 */ + 0x1e63, 0x0001, 0x0307, 0x0ed4, + /* 0x0074 0x0323 */ + 0x1e6d, 0x0000, + /* 0x0075 0x0323 */ + 0x1ee5, 0x0000, + /* 0x0076 0x0323 */ + 0x1e7f, 0x0000, + /* 0x0077 0x0323 */ + 0x1e89, 0x0000, + /* 0x0079 0x0323 */ + 0x1ef5, 0x0000, + /* 0x007a 0x0323 */ + 0x1e93, 0x0000, + /* 0x0055 0x0324 */ + 0x1e72, 0x0000, + /* 0x0075 0x0324 */ + 0x1e73, 0x0000, + /* 0x0041 0x0325 */ + 0x1e00, 0x0000, + /* 0x0061 0x0325 */ + 0x1e01, 0x0000, + /* 0x0043 0x0327 */ + 0x00c7, 0x0001, 0x0301, 0x0ed6, + /* 0x0044 0x0327 */ + 0x1e10, 0x0000, + /* 0x0045 0x0327 */ + 0x0000, 0x0001, 0x0306, 0x0ed8, + /* 0x0047 0x0327 */ + 0x0122, 0x0000, + /* 0x0048 0x0327 */ + 0x1e28, 0x0000, + /* 0x004b 0x0327 */ + 0x0136, 0x0000, + /* 0x004c 0x0327 */ + 0x013b, 0x0000, + /* 0x004e 0x0327 */ + 0x0145, 0x0000, + /* 0x0052 0x0327 */ + 0x0156, 0x0000, + /* 0x0053 0x0327 */ + 0x015e, 0x0000, + /* 0x0054 0x0327 */ + 0x0162, 0x0000, + /* 0x0063 0x0327 */ + 0x00e7, 0x0001, 0x0301, 0x0eda, + /* 0x0064 0x0327 */ + 0x1e11, 0x0000, + /* 0x0065 0x0327 
*/ + 0x0000, 0x0001, 0x0306, 0x0edc, + /* 0x0067 0x0327 */ + 0x0123, 0x0000, + /* 0x0068 0x0327 */ + 0x1e29, 0x0000, + /* 0x006b 0x0327 */ + 0x0137, 0x0000, + /* 0x006c 0x0327 */ + 0x013c, 0x0000, + /* 0x006e 0x0327 */ + 0x0146, 0x0000, + /* 0x0072 0x0327 */ + 0x0157, 0x0000, + /* 0x0073 0x0327 */ + 0x015f, 0x0000, + /* 0x0074 0x0327 */ + 0x0163, 0x0000, + /* 0x0041 0x0328 */ + 0x0104, 0x0000, + /* 0x0045 0x0328 */ + 0x0118, 0x0000, + /* 0x0049 0x0328 */ + 0x012e, 0x0000, + /* 0x004f 0x0328 */ + 0x01ea, 0x0001, 0x0304, 0x0ede, + /* 0x0055 0x0328 */ + 0x0172, 0x0000, + /* 0x0061 0x0328 */ + 0x0105, 0x0000, + /* 0x0065 0x0328 */ + 0x0119, 0x0000, + /* 0x0069 0x0328 */ + 0x012f, 0x0000, + /* 0x006f 0x0328 */ + 0x01eb, 0x0001, 0x0304, 0x0ee0, + /* 0x0075 0x0328 */ + 0x0173, 0x0000, + /* 0x0044 0x032d */ + 0x1e12, 0x0000, + /* 0x0045 0x032d */ + 0x1e18, 0x0000, + /* 0x004c 0x032d */ + 0x1e3c, 0x0000, + /* 0x004e 0x032d */ + 0x1e4a, 0x0000, + /* 0x0054 0x032d */ + 0x1e70, 0x0000, + /* 0x0055 0x032d */ + 0x1e76, 0x0000, + /* 0x0064 0x032d */ + 0x1e13, 0x0000, + /* 0x0065 0x032d */ + 0x1e19, 0x0000, + /* 0x006c 0x032d */ + 0x1e3d, 0x0000, + /* 0x006e 0x032d */ + 0x1e4b, 0x0000, + /* 0x0074 0x032d */ + 0x1e71, 0x0000, + /* 0x0075 0x032d */ + 0x1e77, 0x0000, + /* 0x0048 0x032e */ + 0x1e2a, 0x0000, + /* 0x0068 0x032e */ + 0x1e2b, 0x0000, + /* 0x0045 0x0330 */ + 0x1e1a, 0x0000, + /* 0x0049 0x0330 */ + 0x1e2c, 0x0000, + /* 0x0055 0x0330 */ + 0x1e74, 0x0000, + /* 0x0065 0x0330 */ + 0x1e1b, 0x0000, + /* 0x0069 0x0330 */ + 0x1e2d, 0x0000, + /* 0x0075 0x0330 */ + 0x1e75, 0x0000, + /* 0x0042 0x0331 */ + 0x1e06, 0x0000, + /* 0x0044 0x0331 */ + 0x1e0e, 0x0000, + /* 0x004b 0x0331 */ + 0x1e34, 0x0000, + /* 0x004c 0x0331 */ + 0x1e3a, 0x0000, + /* 0x004e 0x0331 */ + 0x1e48, 0x0000, + /* 0x0052 0x0331 */ + 0x1e5e, 0x0000, + /* 0x0054 0x0331 */ + 0x1e6e, 0x0000, + /* 0x005a 0x0331 */ + 0x1e94, 0x0000, + /* 0x0062 0x0331 */ + 0x1e07, 0x0000, + /* 0x0064 0x0331 */ + 0x1e0f, 0x0000, + /* 0x0068 0x0331 */ + 0x1e96, 0x0000, + /* 0x006b 0x0331 */ + 0x1e35, 0x0000, + /* 0x006c 0x0331 */ + 0x1e3b, 0x0000, + /* 0x006e 0x0331 */ + 0x1e49, 0x0000, + /* 0x0072 0x0331 */ + 0x1e5f, 0x0000, + /* 0x0074 0x0331 */ + 0x1e6f, 0x0000, + /* 0x007a 0x0331 */ + 0x1e95, 0x0000, + /* 0x00a8 0x0342 */ + 0x1fc1, 0x0000, + /* 0x03b1 0x0342 */ + 0x1fb6, 0x0000, + /* 0x03b7 0x0342 */ + 0x1fc6, 0x0000, + /* 0x03b9 0x0342 */ + 0x1fd6, 0x0000, + /* 0x03c5 0x0342 */ + 0x1fe6, 0x0000, + /* 0x03c9 0x0342 */ + 0x1ff6, 0x0000, + /* 0x1fbf 0x0342 */ + 0x1fcf, 0x0000, + /* 0x1ffe 0x0342 */ + 0x1fdf, 0x0000, + /* 0x0391 0x0345 */ + 0x1fbc, 0x0002, 0x0313, 0x0ee2, 0x0314, 0x0eea, + /* 0x0397 0x0345 */ + 0x1fcc, 0x0002, 0x0313, 0x0ef2, 0x0314, 0x0efa, + /* 0x03a9 0x0345 */ + 0x1ffc, 0x0002, 0x0313, 0x0f02, 0x0314, 0x0f0a, + /* 0x03b1 0x0345 */ + 0x1fb3, 0x0005, 0x0300, 0x0f12, 0x0301, 0x0f14, 0x0313, 0x0f16, + 0x0314, 0x0f1e, 0x0342, 0x0f26, + /* 0x03b7 0x0345 */ + 0x1fc3, 0x0005, 0x0300, 0x0f28, 0x0301, 0x0f2a, 0x0313, 0x0f2c, + 0x0314, 0x0f34, 0x0342, 0x0f3c, + /* 0x03bf 0x0345 */ + 0x0000, 0x0001, 0x0301, 0x0f3e, + /* 0x03c9 0x0345 */ + 0x1ff3, 0x0004, 0x0300, 0x0f40, 0x0313, 0x0f42, 0x0314, 0x0f4a, + 0x0342, 0x0f52, + /* 0x05d0 0x05b7 */ + 0xfb2e, 0x0000, + /* 0x05f2 0x05b7 */ + 0xfb1f, 0x0000, + /* 0x05d0 0x05b8 */ + 0xfb2f, 0x0000, + /* 0x05d5 0x05b9 */ + 0xfb4b, 0x0000, + /* 0x05d0 0x05bc */ + 0xfb30, 0x0000, + /* 0x05d1 0x05bc */ + 0xfb31, 0x0000, + /* 0x05d2 0x05bc */ + 0xfb32, 0x0000, + /* 0x05d3 0x05bc */ + 0xfb33, 0x0000, + /* 0x05d4 0x05bc */ + 
0xfb34, 0x0000, + /* 0x05d5 0x05bc */ + 0xfb35, 0x0000, + /* 0x05d6 0x05bc */ + 0xfb36, 0x0000, + /* 0x05d8 0x05bc */ + 0xfb38, 0x0000, + /* 0x05d9 0x05bc */ + 0xfb39, 0x0000, + /* 0x05da 0x05bc */ + 0xfb3a, 0x0000, + /* 0x05db 0x05bc */ + 0xfb3b, 0x0000, + /* 0x05dc 0x05bc */ + 0xfb3c, 0x0000, + /* 0x05de 0x05bc */ + 0xfb3e, 0x0000, + /* 0x05e0 0x05bc */ + 0xfb40, 0x0000, + /* 0x05e1 0x05bc */ + 0xfb41, 0x0000, + /* 0x05e3 0x05bc */ + 0xfb43, 0x0000, + /* 0x05e4 0x05bc */ + 0xfb44, 0x0000, + /* 0x05e6 0x05bc */ + 0xfb46, 0x0000, + /* 0x05e7 0x05bc */ + 0xfb47, 0x0000, + /* 0x05e8 0x05bc */ + 0xfb48, 0x0000, + /* 0x05e9 0x05bc */ + 0xfb49, 0x0002, 0x05c1, 0x0f54, 0x05c2, 0x0f56, + /* 0x05ea 0x05bc */ + 0xfb4a, 0x0000, + /* 0x05d1 0x05bf */ + 0xfb4c, 0x0000, + /* 0x05db 0x05bf */ + 0xfb4d, 0x0000, + /* 0x05e4 0x05bf */ + 0xfb4e, 0x0000, + /* 0x05e9 0x05c1 */ + 0xfb2a, 0x0000, + /* 0x05e9 0x05c2 */ + 0xfb2b, 0x0000, + /* 0x0915 0x093c */ + 0x0958, 0x0000, + /* 0x0916 0x093c */ + 0x0959, 0x0000, + /* 0x0917 0x093c */ + 0x095a, 0x0000, + /* 0x091c 0x093c */ + 0x095b, 0x0000, + /* 0x0921 0x093c */ + 0x095c, 0x0000, + /* 0x0922 0x093c */ + 0x095d, 0x0000, + /* 0x0928 0x093c */ + 0x0929, 0x0000, + /* 0x092b 0x093c */ + 0x095e, 0x0000, + /* 0x092f 0x093c */ + 0x095f, 0x0000, + /* 0x0930 0x093c */ + 0x0931, 0x0000, + /* 0x0933 0x093c */ + 0x0934, 0x0000, + /* 0x09a1 0x09bc */ + 0x09dc, 0x0000, + /* 0x09a2 0x09bc */ + 0x09dd, 0x0000, + /* 0x09ac 0x09bc */ + 0x09b0, 0x0000, + /* 0x09af 0x09bc */ + 0x09df, 0x0000, + /* 0x09c7 0x09be */ + 0x09cb, 0x0000, + /* 0x09c7 0x09d7 */ + 0x09cc, 0x0000, + /* 0x0a16 0x0a3c */ + 0x0a59, 0x0000, + /* 0x0a17 0x0a3c */ + 0x0a5a, 0x0000, + /* 0x0a1c 0x0a3c */ + 0x0a5b, 0x0000, + /* 0x0a21 0x0a3c */ + 0x0a5c, 0x0000, + /* 0x0a2b 0x0a3c */ + 0x0a5e, 0x0000, + /* 0x0b21 0x0b3c */ + 0x0b5c, 0x0000, + /* 0x0b22 0x0b3c */ + 0x0b5d, 0x0000, + /* 0x0b2f 0x0b3c */ + 0x0b5f, 0x0000, + /* 0x0b47 0x0b3e */ + 0x0b4b, 0x0000, + /* 0x0b47 0x0b56 */ + 0x0b48, 0x0000, + /* 0x0b47 0x0b57 */ + 0x0b4c, 0x0000, + /* 0x0bc6 0x0bbe */ + 0x0bca, 0x0000, + /* 0x0bc7 0x0bbe */ + 0x0bcb, 0x0000, + /* 0x0b92 0x0bd7 */ + 0x0b94, 0x0000, + /* 0x0bc6 0x0bd7 */ + 0x0bcc, 0x0000, + /* 0x0c46 0x0c56 */ + 0x0c48, 0x0000, + /* 0x0cc6 0x0cc2 */ + 0x0cca, 0x0001, 0x0cd5, 0x0f58, + /* 0x0cbf 0x0cd5 */ + 0x0cc0, 0x0000, + /* 0x0cc6 0x0cd5 */ + 0x0cc7, 0x0000, + /* 0x0cc6 0x0cd6 */ + 0x0cc8, 0x0000, + /* 0x0d46 0x0d3e */ + 0x0d4a, 0x0000, + /* 0x0d47 0x0d3e */ + 0x0d4b, 0x0000, + /* 0x0d46 0x0d57 */ + 0x0d4c, 0x0000, + /* 0x0e4d 0x0e32 */ + 0x0e33, 0x0000, + /* 0x0ecd 0x0eb2 */ + 0x0eb3, 0x0000, + /* 0x0f72 0x0f71 */ + 0x0f73, 0x0000, + /* 0x0f74 0x0f71 */ + 0x0f75, 0x0000, + /* 0x0f80 0x0f71 */ + 0x0f81, 0x0000, + /* 0x0fb2 0x0f80 */ + 0x0f76, 0x0001, 0x0f71, 0x0f5a, + /* 0x0fb3 0x0f80 */ + 0x0f78, 0x0001, 0x0f71, 0x0f5c, + /* 0x0f40 0x0fb5 */ + 0x0f69, 0x0000, + /* 0x0f90 0x0fb5 */ + 0x0fb9, 0x0000, + /* 0x0f42 0x0fb7 */ + 0x0f43, 0x0000, + /* 0x0f4c 0x0fb7 */ + 0x0f4d, 0x0000, + /* 0x0f51 0x0fb7 */ + 0x0f52, 0x0000, + /* 0x0f56 0x0fb7 */ + 0x0f57, 0x0000, + /* 0x0f5b 0x0fb7 */ + 0x0f5c, 0x0000, + /* 0x0f92 0x0fb7 */ + 0x0f93, 0x0000, + /* 0x0f9c 0x0fb7 */ + 0x0f9d, 0x0000, + /* 0x0fa1 0x0fb7 */ + 0x0fa2, 0x0000, + /* 0x0fa6 0x0fb7 */ + 0x0fa7, 0x0000, + /* 0x0fab 0x0fb7 */ + 0x0fac, 0x0000, + /* 0x3046 0x3099 */ + 0x3094, 0x0000, + /* 0x304b 0x3099 */ + 0x304c, 0x0000, + /* 0x304d 0x3099 */ + 0x304e, 0x0000, + /* 0x304f 0x3099 */ + 0x3050, 0x0000, + /* 0x3051 0x3099 */ + 0x3052, 0x0000, + /* 0x3053 0x3099 
*/ + 0x3054, 0x0000, + /* 0x3055 0x3099 */ + 0x3056, 0x0000, + /* 0x3057 0x3099 */ + 0x3058, 0x0000, + /* 0x3059 0x3099 */ + 0x305a, 0x0000, + /* 0x305b 0x3099 */ + 0x305c, 0x0000, + /* 0x305d 0x3099 */ + 0x305e, 0x0000, + /* 0x305f 0x3099 */ + 0x3060, 0x0000, + /* 0x3061 0x3099 */ + 0x3062, 0x0000, + /* 0x3064 0x3099 */ + 0x3065, 0x0000, + /* 0x3066 0x3099 */ + 0x3067, 0x0000, + /* 0x3068 0x3099 */ + 0x3069, 0x0000, + /* 0x306f 0x3099 */ + 0x3070, 0x0000, + /* 0x3072 0x3099 */ + 0x3073, 0x0000, + /* 0x3075 0x3099 */ + 0x3076, 0x0000, + /* 0x3078 0x3099 */ + 0x3079, 0x0000, + /* 0x307b 0x3099 */ + 0x307c, 0x0000, + /* 0x309d 0x3099 */ + 0x309e, 0x0000, + /* 0x30a6 0x3099 */ + 0x30f4, 0x0000, + /* 0x30ab 0x3099 */ + 0x30ac, 0x0000, + /* 0x30ad 0x3099 */ + 0x30ae, 0x0000, + /* 0x30af 0x3099 */ + 0x30b0, 0x0000, + /* 0x30b1 0x3099 */ + 0x30b2, 0x0000, + /* 0x30b3 0x3099 */ + 0x30b4, 0x0000, + /* 0x30b5 0x3099 */ + 0x30b6, 0x0000, + /* 0x30b7 0x3099 */ + 0x30b8, 0x0000, + /* 0x30b9 0x3099 */ + 0x30ba, 0x0000, + /* 0x30bb 0x3099 */ + 0x30bc, 0x0000, + /* 0x30bd 0x3099 */ + 0x30be, 0x0000, + /* 0x30bf 0x3099 */ + 0x30c0, 0x0000, + /* 0x30c1 0x3099 */ + 0x30c2, 0x0000, + /* 0x30c4 0x3099 */ + 0x30c5, 0x0000, + /* 0x30c6 0x3099 */ + 0x30c7, 0x0000, + /* 0x30c8 0x3099 */ + 0x30c9, 0x0000, + /* 0x30cf 0x3099 */ + 0x30d0, 0x0000, + /* 0x30d2 0x3099 */ + 0x30d3, 0x0000, + /* 0x30d5 0x3099 */ + 0x30d6, 0x0000, + /* 0x30d8 0x3099 */ + 0x30d9, 0x0000, + /* 0x30db 0x3099 */ + 0x30dc, 0x0000, + /* 0x30ef 0x3099 */ + 0x30f7, 0x0000, + /* 0x30f0 0x3099 */ + 0x30f8, 0x0000, + /* 0x30f1 0x3099 */ + 0x30f9, 0x0000, + /* 0x30f2 0x3099 */ + 0x30fa, 0x0000, + /* 0x30fd 0x3099 */ + 0x30fe, 0x0000, + /* 0x306f 0x309a */ + 0x3071, 0x0000, + /* 0x3072 0x309a */ + 0x3074, 0x0000, + /* 0x3075 0x309a */ + 0x3077, 0x0000, + /* 0x3078 0x309a */ + 0x307a, 0x0000, + /* 0x307b 0x309a */ + 0x307d, 0x0000, + /* 0x30cf 0x309a */ + 0x30d1, 0x0000, + /* 0x30d2 0x309a */ + 0x30d4, 0x0000, + /* 0x30d5 0x309a */ + 0x30d7, 0x0000, + /* 0x30d8 0x309a */ + 0x30da, 0x0000, + /* 0x30db 0x309a */ + 0x30dd, 0x0000, + /* 0x0307 0x0053 0x0301 */ + 0x1e64, 0x0000, + /* 0x0307 0x0073 0x0301 */ + 0x1e65, 0x0000, + /* 0x0300 0x0041 0x0302 */ + 0x1ea6, 0x0000, + /* 0x0301 0x0041 0x0302 */ + 0x1ea4, 0x0000, + /* 0x0303 0x0041 0x0302 */ + 0x1eaa, 0x0000, + /* 0x0309 0x0041 0x0302 */ + 0x1ea8, 0x0000, + /* 0x0300 0x0045 0x0302 */ + 0x1ec0, 0x0000, + /* 0x0301 0x0045 0x0302 */ + 0x1ebe, 0x0000, + /* 0x0303 0x0045 0x0302 */ + 0x1ec4, 0x0000, + /* 0x0309 0x0045 0x0302 */ + 0x1ec2, 0x0000, + /* 0x0300 0x004f 0x0302 */ + 0x1ed2, 0x0000, + /* 0x0301 0x004f 0x0302 */ + 0x1ed0, 0x0000, + /* 0x0303 0x004f 0x0302 */ + 0x1ed6, 0x0000, + /* 0x0309 0x004f 0x0302 */ + 0x1ed4, 0x0000, + /* 0x0300 0x0061 0x0302 */ + 0x1ea7, 0x0000, + /* 0x0301 0x0061 0x0302 */ + 0x1ea5, 0x0000, + /* 0x0303 0x0061 0x0302 */ + 0x1eab, 0x0000, + /* 0x0309 0x0061 0x0302 */ + 0x1ea9, 0x0000, + /* 0x0300 0x0065 0x0302 */ + 0x1ec1, 0x0000, + /* 0x0301 0x0065 0x0302 */ + 0x1ebf, 0x0000, + /* 0x0303 0x0065 0x0302 */ + 0x1ec5, 0x0000, + /* 0x0309 0x0065 0x0302 */ + 0x1ec3, 0x0000, + /* 0x0300 0x006f 0x0302 */ + 0x1ed3, 0x0000, + /* 0x0301 0x006f 0x0302 */ + 0x1ed1, 0x0000, + /* 0x0303 0x006f 0x0302 */ + 0x1ed7, 0x0000, + /* 0x0309 0x006f 0x0302 */ + 0x1ed5, 0x0000, + /* 0x0301 0x004f 0x0303 */ + 0x1e4c, 0x0000, + /* 0x0308 0x004f 0x0303 */ + 0x1e4e, 0x0000, + /* 0x0301 0x0055 0x0303 */ + 0x1e78, 0x0000, + /* 0x0301 0x006f 0x0303 */ + 0x1e4d, 0x0000, + /* 0x0308 0x006f 0x0303 */ + 0x1e4f, 
0x0000, + /* 0x0301 0x0075 0x0303 */ + 0x1e79, 0x0000, + /* 0x0300 0x0045 0x0304 */ + 0x1e14, 0x0000, + /* 0x0301 0x0045 0x0304 */ + 0x1e16, 0x0000, + /* 0x0300 0x004f 0x0304 */ + 0x1e50, 0x0000, + /* 0x0301 0x004f 0x0304 */ + 0x1e52, 0x0000, + /* 0x0308 0x0055 0x0304 */ + 0x1e7a, 0x0000, + /* 0x0300 0x0065 0x0304 */ + 0x1e15, 0x0000, + /* 0x0301 0x0065 0x0304 */ + 0x1e17, 0x0000, + /* 0x0300 0x006f 0x0304 */ + 0x1e51, 0x0000, + /* 0x0301 0x006f 0x0304 */ + 0x1e53, 0x0000, + /* 0x0308 0x0075 0x0304 */ + 0x1e7b, 0x0000, + /* 0x0300 0x0041 0x0306 */ + 0x1eb0, 0x0000, + /* 0x0301 0x0041 0x0306 */ + 0x1eae, 0x0000, + /* 0x0303 0x0041 0x0306 */ + 0x1eb4, 0x0000, + /* 0x0309 0x0041 0x0306 */ + 0x1eb2, 0x0000, + /* 0x0300 0x0061 0x0306 */ + 0x1eb1, 0x0000, + /* 0x0301 0x0061 0x0306 */ + 0x1eaf, 0x0000, + /* 0x0303 0x0061 0x0306 */ + 0x1eb5, 0x0000, + /* 0x0309 0x0061 0x0306 */ + 0x1eb3, 0x0000, + /* 0x0304 0x0041 0x0307 */ + 0x01e0, 0x0000, + /* 0x0304 0x0061 0x0307 */ + 0x01e1, 0x0000, + /* 0x0304 0x0041 0x0308 */ + 0x01de, 0x0000, + /* 0x0301 0x0049 0x0308 */ + 0x1e2e, 0x0000, + /* 0x0300 0x0055 0x0308 */ + 0x01db, 0x0000, + /* 0x0301 0x0055 0x0308 */ + 0x01d7, 0x0000, + /* 0x0304 0x0055 0x0308 */ + 0x01d5, 0x0000, + /* 0x030c 0x0055 0x0308 */ + 0x01d9, 0x0000, + /* 0x0304 0x0061 0x0308 */ + 0x01df, 0x0000, + /* 0x0301 0x0069 0x0308 */ + 0x1e2f, 0x0000, + /* 0x0300 0x0075 0x0308 */ + 0x01dc, 0x0000, + /* 0x0301 0x0075 0x0308 */ + 0x01d8, 0x0000, + /* 0x0304 0x0075 0x0308 */ + 0x01d6, 0x0000, + /* 0x030c 0x0075 0x0308 */ + 0x01da, 0x0000, + /* 0x0300 0x03b9 0x0308 */ + 0x1fd2, 0x0000, + /* 0x0301 0x03b9 0x0308 */ + 0x1fd3, 0x0000, + /* 0x030d 0x03b9 0x0308 */ + 0x0390, 0x0000, + /* 0x0342 0x03b9 0x0308 */ + 0x1fd7, 0x0000, + /* 0x0300 0x03c5 0x0308 */ + 0x1fe2, 0x0000, + /* 0x0301 0x03c5 0x0308 */ + 0x1fe3, 0x0000, + /* 0x030d 0x03c5 0x0308 */ + 0x03b0, 0x0000, + /* 0x0342 0x03c5 0x0308 */ + 0x1fe7, 0x0000, + /* 0x0301 0x0041 0x030a */ + 0x01fa, 0x0000, + /* 0x0301 0x0061 0x030a */ + 0x01fb, 0x0000, + /* 0x0307 0x0053 0x030c */ + 0x1e66, 0x0000, + /* 0x0307 0x0073 0x030c */ + 0x1e67, 0x0000, + /* 0x0300 0x0391 0x0313 */ + 0x1f0a, 0x0000, + /* 0x0301 0x0391 0x0313 */ + 0x1f0c, 0x0000, + /* 0x0342 0x0391 0x0313 */ + 0x1f0e, 0x0000, + /* 0x0300 0x0395 0x0313 */ + 0x1f1a, 0x0000, + /* 0x0301 0x0395 0x0313 */ + 0x1f1c, 0x0000, + /* 0x0300 0x0397 0x0313 */ + 0x1f2a, 0x0000, + /* 0x0301 0x0397 0x0313 */ + 0x1f2c, 0x0000, + /* 0x0342 0x0397 0x0313 */ + 0x1f2e, 0x0000, + /* 0x0300 0x0399 0x0313 */ + 0x1f3a, 0x0000, + /* 0x0301 0x0399 0x0313 */ + 0x1f3c, 0x0000, + /* 0x0342 0x0399 0x0313 */ + 0x1f3e, 0x0000, + /* 0x0300 0x039f 0x0313 */ + 0x1f4a, 0x0000, + /* 0x0301 0x039f 0x0313 */ + 0x1f4c, 0x0000, + /* 0x0300 0x03a9 0x0313 */ + 0x1f6a, 0x0000, + /* 0x0301 0x03a9 0x0313 */ + 0x1f6c, 0x0000, + /* 0x0342 0x03a9 0x0313 */ + 0x1f6e, 0x0000, + /* 0x0300 0x03b1 0x0313 */ + 0x1f02, 0x0000, + /* 0x0301 0x03b1 0x0313 */ + 0x1f04, 0x0000, + /* 0x0342 0x03b1 0x0313 */ + 0x1f06, 0x0000, + /* 0x0300 0x03b5 0x0313 */ + 0x1f12, 0x0000, + /* 0x0301 0x03b5 0x0313 */ + 0x1f14, 0x0000, + /* 0x0300 0x03b7 0x0313 */ + 0x1f22, 0x0000, + /* 0x0301 0x03b7 0x0313 */ + 0x1f24, 0x0000, + /* 0x0342 0x03b7 0x0313 */ + 0x1f26, 0x0000, + /* 0x0300 0x03b9 0x0313 */ + 0x1f32, 0x0000, + /* 0x0301 0x03b9 0x0313 */ + 0x1f34, 0x0000, + /* 0x0342 0x03b9 0x0313 */ + 0x1f36, 0x0000, + /* 0x0300 0x03bf 0x0313 */ + 0x1f42, 0x0000, + /* 0x0301 0x03bf 0x0313 */ + 0x1f44, 0x0000, + /* 0x0300 0x03c5 0x0313 */ + 0x1f52, 0x0000, + /* 0x0301 0x03c5 
0x0313 */ + 0x1f54, 0x0000, + /* 0x0342 0x03c5 0x0313 */ + 0x1f56, 0x0000, + /* 0x0300 0x03c9 0x0313 */ + 0x1f62, 0x0000, + /* 0x0301 0x03c9 0x0313 */ + 0x1f64, 0x0000, + /* 0x0342 0x03c9 0x0313 */ + 0x1f66, 0x0000, + /* 0x0300 0x0391 0x0314 */ + 0x1f0b, 0x0000, + /* 0x0301 0x0391 0x0314 */ + 0x1f0d, 0x0000, + /* 0x0342 0x0391 0x0314 */ + 0x1f0f, 0x0000, + /* 0x0300 0x0395 0x0314 */ + 0x1f1b, 0x0000, + /* 0x0301 0x0395 0x0314 */ + 0x1f1d, 0x0000, + /* 0x0300 0x0397 0x0314 */ + 0x1f2b, 0x0000, + /* 0x0301 0x0397 0x0314 */ + 0x1f2d, 0x0000, + /* 0x0342 0x0397 0x0314 */ + 0x1f2f, 0x0000, + /* 0x0300 0x0399 0x0314 */ + 0x1f3b, 0x0000, + /* 0x0301 0x0399 0x0314 */ + 0x1f3d, 0x0000, + /* 0x0342 0x0399 0x0314 */ + 0x1f3f, 0x0000, + /* 0x0300 0x039f 0x0314 */ + 0x1f4b, 0x0000, + /* 0x0301 0x039f 0x0314 */ + 0x1f4d, 0x0000, + /* 0x0300 0x03a5 0x0314 */ + 0x1f5b, 0x0000, + /* 0x0301 0x03a5 0x0314 */ + 0x1f5d, 0x0000, + /* 0x0342 0x03a5 0x0314 */ + 0x1f5f, 0x0000, + /* 0x0300 0x03a9 0x0314 */ + 0x1f6b, 0x0000, + /* 0x0301 0x03a9 0x0314 */ + 0x1f6d, 0x0000, + /* 0x0342 0x03a9 0x0314 */ + 0x1f6f, 0x0000, + /* 0x0300 0x03b1 0x0314 */ + 0x1f03, 0x0000, + /* 0x0301 0x03b1 0x0314 */ + 0x1f05, 0x0000, + /* 0x0342 0x03b1 0x0314 */ + 0x1f07, 0x0000, + /* 0x0300 0x03b5 0x0314 */ + 0x1f13, 0x0000, + /* 0x0301 0x03b5 0x0314 */ + 0x1f15, 0x0000, + /* 0x0300 0x03b7 0x0314 */ + 0x1f23, 0x0000, + /* 0x0301 0x03b7 0x0314 */ + 0x1f25, 0x0000, + /* 0x0342 0x03b7 0x0314 */ + 0x1f27, 0x0000, + /* 0x0300 0x03b9 0x0314 */ + 0x1f33, 0x0000, + /* 0x0301 0x03b9 0x0314 */ + 0x1f35, 0x0000, + /* 0x0342 0x03b9 0x0314 */ + 0x1f37, 0x0000, + /* 0x0300 0x03bf 0x0314 */ + 0x1f43, 0x0000, + /* 0x0301 0x03bf 0x0314 */ + 0x1f45, 0x0000, + /* 0x0300 0x03c5 0x0314 */ + 0x1f53, 0x0000, + /* 0x0301 0x03c5 0x0314 */ + 0x1f55, 0x0000, + /* 0x0342 0x03c5 0x0314 */ + 0x1f57, 0x0000, + /* 0x0300 0x03c9 0x0314 */ + 0x1f63, 0x0000, + /* 0x0301 0x03c9 0x0314 */ + 0x1f65, 0x0000, + /* 0x0342 0x03c9 0x0314 */ + 0x1f67, 0x0000, + /* 0x0300 0x004f 0x031b */ + 0x1edc, 0x0000, + /* 0x0301 0x004f 0x031b */ + 0x1eda, 0x0000, + /* 0x0303 0x004f 0x031b */ + 0x1ee0, 0x0000, + /* 0x0309 0x004f 0x031b */ + 0x1ede, 0x0000, + /* 0x0323 0x004f 0x031b */ + 0x1ee2, 0x0000, + /* 0x0300 0x0055 0x031b */ + 0x1eea, 0x0000, + /* 0x0301 0x0055 0x031b */ + 0x1ee8, 0x0000, + /* 0x0303 0x0055 0x031b */ + 0x1eee, 0x0000, + /* 0x0309 0x0055 0x031b */ + 0x1eec, 0x0000, + /* 0x0323 0x0055 0x031b */ + 0x1ef0, 0x0000, + /* 0x0300 0x006f 0x031b */ + 0x1edd, 0x0000, + /* 0x0301 0x006f 0x031b */ + 0x1edb, 0x0000, + /* 0x0303 0x006f 0x031b */ + 0x1ee1, 0x0000, + /* 0x0309 0x006f 0x031b */ + 0x1edf, 0x0000, + /* 0x0323 0x006f 0x031b */ + 0x1ee3, 0x0000, + /* 0x0300 0x0075 0x031b */ + 0x1eeb, 0x0000, + /* 0x0301 0x0075 0x031b */ + 0x1ee9, 0x0000, + /* 0x0303 0x0075 0x031b */ + 0x1eef, 0x0000, + /* 0x0309 0x0075 0x031b */ + 0x1eed, 0x0000, + /* 0x0323 0x0075 0x031b */ + 0x1ef1, 0x0000, + /* 0x0302 0x0041 0x0323 */ + 0x1eac, 0x0000, + /* 0x0306 0x0041 0x0323 */ + 0x1eb6, 0x0000, + /* 0x0302 0x0045 0x0323 */ + 0x1ec6, 0x0000, + /* 0x0304 0x004c 0x0323 */ + 0x1e38, 0x0000, + /* 0x0302 0x004f 0x0323 */ + 0x1ed8, 0x0000, + /* 0x0304 0x0052 0x0323 */ + 0x1e5c, 0x0000, + /* 0x0307 0x0053 0x0323 */ + 0x1e68, 0x0000, + /* 0x0302 0x0061 0x0323 */ + 0x1ead, 0x0000, + /* 0x0306 0x0061 0x0323 */ + 0x1eb7, 0x0000, + /* 0x0302 0x0065 0x0323 */ + 0x1ec7, 0x0000, + /* 0x0304 0x006c 0x0323 */ + 0x1e39, 0x0000, + /* 0x0302 0x006f 0x0323 */ + 0x1ed9, 0x0000, + /* 0x0304 0x0072 0x0323 */ + 0x1e5d, 0x0000, + 
/* 0x0307 0x0073 0x0323 */ + 0x1e69, 0x0000, + /* 0x0301 0x0043 0x0327 */ + 0x1e08, 0x0000, + /* 0x0306 0x0045 0x0327 */ + 0x1e1c, 0x0000, + /* 0x0301 0x0063 0x0327 */ + 0x1e09, 0x0000, + /* 0x0306 0x0065 0x0327 */ + 0x1e1d, 0x0000, + /* 0x0304 0x004f 0x0328 */ + 0x01ec, 0x0000, + /* 0x0304 0x006f 0x0328 */ + 0x01ed, 0x0000, + /* 0x0313 0x0391 0x0345 */ + 0x1f88, 0x0003, 0x0300, 0x0f5e, 0x0301, 0x0f60, 0x0342, 0x0f62, + /* 0x0314 0x0391 0x0345 */ + 0x1f89, 0x0003, 0x0300, 0x0f64, 0x0301, 0x0f66, 0x0342, 0x0f68, + /* 0x0313 0x0397 0x0345 */ + 0x1f98, 0x0003, 0x0300, 0x0f6a, 0x0301, 0x0f6c, 0x0342, 0x0f6e, + /* 0x0314 0x0397 0x0345 */ + 0x1f99, 0x0003, 0x0300, 0x0f70, 0x0301, 0x0f72, 0x0342, 0x0f74, + /* 0x0313 0x03a9 0x0345 */ + 0x1fa8, 0x0003, 0x0300, 0x0f76, 0x0301, 0x0f78, 0x0342, 0x0f7a, + /* 0x0314 0x03a9 0x0345 */ + 0x1fa9, 0x0003, 0x0300, 0x0f7c, 0x0301, 0x0f7e, 0x0342, 0x0f80, + /* 0x0300 0x03b1 0x0345 */ + 0x1fb2, 0x0000, + /* 0x0301 0x03b1 0x0345 */ + 0x1fb4, 0x0000, + /* 0x0313 0x03b1 0x0345 */ + 0x1f80, 0x0003, 0x0300, 0x0f82, 0x0301, 0x0f84, 0x0342, 0x0f86, + /* 0x0314 0x03b1 0x0345 */ + 0x1f81, 0x0003, 0x0300, 0x0f88, 0x0301, 0x0f8a, 0x0342, 0x0f8c, + /* 0x0342 0x03b1 0x0345 */ + 0x1fb7, 0x0000, + /* 0x0300 0x03b7 0x0345 */ + 0x1fc2, 0x0000, + /* 0x0301 0x03b7 0x0345 */ + 0x1fc4, 0x0000, + /* 0x0313 0x03b7 0x0345 */ + 0x1f90, 0x0003, 0x0300, 0x0f8e, 0x0301, 0x0f90, 0x0342, 0x0f92, + /* 0x0314 0x03b7 0x0345 */ + 0x1f91, 0x0003, 0x0300, 0x0f94, 0x0301, 0x0f96, 0x0342, 0x0f98, + /* 0x0342 0x03b7 0x0345 */ + 0x1fc7, 0x0000, + /* 0x0301 0x03bf 0x0345 */ + 0x1ff4, 0x0000, + /* 0x0300 0x03c9 0x0345 */ + 0x1ff2, 0x0000, + /* 0x0313 0x03c9 0x0345 */ + 0x1fa0, 0x0003, 0x0300, 0x0f9a, 0x0301, 0x0f9c, 0x0342, 0x0f9e, + /* 0x0314 0x03c9 0x0345 */ + 0x1fa1, 0x0003, 0x0300, 0x0fa0, 0x0301, 0x0fa2, 0x0342, 0x0fa4, + /* 0x0342 0x03c9 0x0345 */ + 0x1ff7, 0x0000, + /* 0x05c1 0x05e9 0x05bc */ + 0xfb2c, 0x0000, + /* 0x05c2 0x05e9 0x05bc */ + 0xfb2d, 0x0000, + /* 0x0cd5 0x0cc6 0x0cc2 */ + 0x0ccb, 0x0000, + /* 0x0f71 0x0fb2 0x0f80 */ + 0x0f77, 0x0000, + /* 0x0f71 0x0fb3 0x0f80 */ + 0x0f79, 0x0000, + /* 0x0300 0x0313 0x0391 0x0345 */ + 0x1f8a, 0x0000, + /* 0x0301 0x0313 0x0391 0x0345 */ + 0x1f8c, 0x0000, + /* 0x0342 0x0313 0x0391 0x0345 */ + 0x1f8e, 0x0000, + /* 0x0300 0x0314 0x0391 0x0345 */ + 0x1f8b, 0x0000, + /* 0x0301 0x0314 0x0391 0x0345 */ + 0x1f8d, 0x0000, + /* 0x0342 0x0314 0x0391 0x0345 */ + 0x1f8f, 0x0000, + /* 0x0300 0x0313 0x0397 0x0345 */ + 0x1f9a, 0x0000, + /* 0x0301 0x0313 0x0397 0x0345 */ + 0x1f9c, 0x0000, + /* 0x0342 0x0313 0x0397 0x0345 */ + 0x1f9e, 0x0000, + /* 0x0300 0x0314 0x0397 0x0345 */ + 0x1f9b, 0x0000, + /* 0x0301 0x0314 0x0397 0x0345 */ + 0x1f9d, 0x0000, + /* 0x0342 0x0314 0x0397 0x0345 */ + 0x1f9f, 0x0000, + /* 0x0300 0x0313 0x03a9 0x0345 */ + 0x1faa, 0x0000, + /* 0x0301 0x0313 0x03a9 0x0345 */ + 0x1fac, 0x0000, + /* 0x0342 0x0313 0x03a9 0x0345 */ + 0x1fae, 0x0000, + /* 0x0300 0x0314 0x03a9 0x0345 */ + 0x1fab, 0x0000, + /* 0x0301 0x0314 0x03a9 0x0345 */ + 0x1fad, 0x0000, + /* 0x0342 0x0314 0x03a9 0x0345 */ + 0x1faf, 0x0000, + /* 0x0300 0x0313 0x03b1 0x0345 */ + 0x1f82, 0x0000, + /* 0x0301 0x0313 0x03b1 0x0345 */ + 0x1f84, 0x0000, + /* 0x0342 0x0313 0x03b1 0x0345 */ + 0x1f86, 0x0000, + /* 0x0300 0x0314 0x03b1 0x0345 */ + 0x1f83, 0x0000, + /* 0x0301 0x0314 0x03b1 0x0345 */ + 0x1f85, 0x0000, + /* 0x0342 0x0314 0x03b1 0x0345 */ + 0x1f87, 0x0000, + /* 0x0300 0x0313 0x03b7 0x0345 */ + 0x1f92, 0x0000, + /* 0x0301 0x0313 0x03b7 0x0345 */ + 0x1f94, 0x0000, + /* 0x0342 0x0313 0x03b7 
0x0345 */ + 0x1f96, 0x0000, + /* 0x0300 0x0314 0x03b7 0x0345 */ + 0x1f93, 0x0000, + /* 0x0301 0x0314 0x03b7 0x0345 */ + 0x1f95, 0x0000, + /* 0x0342 0x0314 0x03b7 0x0345 */ + 0x1f97, 0x0000, + /* 0x0300 0x0313 0x03c9 0x0345 */ + 0x1fa2, 0x0000, + /* 0x0301 0x0313 0x03c9 0x0345 */ + 0x1fa4, 0x0000, + /* 0x0342 0x0313 0x03c9 0x0345 */ + 0x1fa6, 0x0000, + /* 0x0300 0x0314 0x03c9 0x0345 */ + 0x1fa3, 0x0000, + /* 0x0301 0x0314 0x03c9 0x0345 */ + 0x1fa5, 0x0000, + /* 0x0342 0x0314 0x03c9 0x0345 */ + 0x1fa7, 0x0000, +}; diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c new file mode 100644 index 00000000..a3f0bfcc --- /dev/null +++ b/fs/hfsplus/unicode.c @@ -0,0 +1,452 @@ +/* + * linux/fs/hfsplus/unicode.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handler routines for unicode strings + */ + +#include +#include +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +/* Fold the case of a unicode char, given the 16 bit value */ +/* Returns folded char, or 0 if ignorable */ +static inline u16 case_fold(u16 c) +{ + u16 tmp; + + tmp = hfsplus_case_fold_table[c >> 8]; + if (tmp) + tmp = hfsplus_case_fold_table[tmp + (c & 0xff)]; + else + tmp = c; + return tmp; +} + +/* Compare unicode strings, return values like normal strcmp */ +int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2) +{ + u16 len1, len2, c1, c2; + const hfsplus_unichr *p1, *p2; + + len1 = be16_to_cpu(s1->length); + len2 = be16_to_cpu(s2->length); + p1 = s1->unicode; + p2 = s2->unicode; + + while (1) { + c1 = c2 = 0; + + while (len1 && !c1) { + c1 = case_fold(be16_to_cpu(*p1)); + p1++; + len1--; + } + while (len2 && !c2) { + c2 = case_fold(be16_to_cpu(*p2)); + p2++; + len2--; + } + + if (c1 != c2) + return (c1 < c2) ? -1 : 1; + if (!c1 && !c2) + return 0; + } +} + +/* Compare names as a sequence of 16-bit unsigned integers */ +int hfsplus_strcmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2) +{ + u16 len1, len2, c1, c2; + const hfsplus_unichr *p1, *p2; + int len; + + len1 = be16_to_cpu(s1->length); + len2 = be16_to_cpu(s2->length); + p1 = s1->unicode; + p2 = s2->unicode; + + for (len = min(len1, len2); len > 0; len--) { + c1 = be16_to_cpu(*p1); + c2 = be16_to_cpu(*p2); + if (c1 != c2) + return c1 < c2 ? -1 : 1; + p1++; + p2++; + } + + return len1 < len2 ? -1 : + len1 > len2 ? 
1 : 0; +} + + +#define Hangul_SBase 0xac00 +#define Hangul_LBase 0x1100 +#define Hangul_VBase 0x1161 +#define Hangul_TBase 0x11a7 +#define Hangul_SCount 11172 +#define Hangul_LCount 19 +#define Hangul_VCount 21 +#define Hangul_TCount 28 +#define Hangul_NCount (Hangul_VCount * Hangul_TCount) + + +static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) +{ + int i, s, e; + + s = 1; + e = p[1]; + if (!e || cc < p[s * 2] || cc > p[e * 2]) + return NULL; + do { + i = (s + e) / 2; + if (cc > p[i * 2]) + s = i + 1; + else if (cc < p[i * 2]) + e = i - 1; + else + return hfsplus_compose_table + p[i * 2 + 1]; + } while (s <= e); + return NULL; +} + +int hfsplus_uni2asc(struct super_block *sb, + const struct hfsplus_unistr *ustr, + char *astr, int *len_p) +{ + const hfsplus_unichr *ip; + struct nls_table *nls = HFSPLUS_SB(sb)->nls; + u8 *op; + u16 cc, c0, c1; + u16 *ce1, *ce2; + int i, len, ustrlen, res, compose; + + op = astr; + ip = ustr->unicode; + ustrlen = be16_to_cpu(ustr->length); + len = *len_p; + ce1 = NULL; + compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + + while (ustrlen > 0) { + c0 = be16_to_cpu(*ip++); + ustrlen--; + /* search for single decomposed char */ + if (likely(compose)) + ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0); + if (ce1 && (cc = ce1[0])) { + /* start of a possibly decomposed Hangul char */ + if (cc != 0xffff) + goto done; + if (!ustrlen) + goto same; + c1 = be16_to_cpu(*ip) - Hangul_VBase; + if (c1 < Hangul_VCount) { + /* compose the Hangul char */ + cc = (c0 - Hangul_LBase) * Hangul_VCount; + cc = (cc + c1) * Hangul_TCount; + cc += Hangul_SBase; + ip++; + ustrlen--; + if (!ustrlen) + goto done; + c1 = be16_to_cpu(*ip) - Hangul_TBase; + if (c1 > 0 && c1 < Hangul_TCount) { + cc += c1; + ip++; + ustrlen--; + } + goto done; + } + } + while (1) { + /* main loop for common case of not composed chars */ + if (!ustrlen) + goto same; + c1 = be16_to_cpu(*ip); + if (likely(compose)) + ce1 = hfsplus_compose_lookup( + hfsplus_compose_table, c1); + if (ce1) + break; + switch (c0) { + case 0: + c0 = 0x2400; + break; + case '/': + c0 = ':'; + break; + } + res = nls->uni2char(c0, op, len); + if (res < 0) { + if (res == -ENAMETOOLONG) + goto out; + *op = '?'; + res = 1; + } + op += res; + len -= res; + c0 = c1; + ip++; + ustrlen--; + } + ce2 = hfsplus_compose_lookup(ce1, c0); + if (ce2) { + i = 1; + while (i < ustrlen) { + ce1 = hfsplus_compose_lookup(ce2, + be16_to_cpu(ip[i])); + if (!ce1) + break; + i++; + ce2 = ce1; + } + if ((cc = ce2[0])) { + ip += i; + ustrlen -= i; + goto done; + } + } +same: + switch (c0) { + case 0: + cc = 0x2400; + break; + case '/': + cc = ':'; + break; + default: + cc = c0; + } +done: + res = nls->uni2char(cc, op, len); + if (res < 0) { + if (res == -ENAMETOOLONG) + goto out; + *op = '?'; + res = 1; + } + op += res; + len -= res; + } + res = 0; +out: + *len_p = (char *)op - astr; + return res; +} + +/* + * Convert one or more ASCII characters into a single unicode character. + * Returns the number of ASCII characters corresponding to the unicode char. + */ +static inline int asc2unichar(struct super_block *sb, const char *astr, int len, + wchar_t *uc) +{ + int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc); + if (size <= 0) { + *uc = '?'; + size = 1; + } + switch (*uc) { + case 0x2400: + *uc = 0; + break; + case ':': + *uc = '/'; + break; + } + return size; +} + +/* Decomposes a single unicode character. 
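+ *
+ * The lookup walks hfsplus_decompose_table as a four-level trie indexed by
+ * successive 4-bit nibbles of the character, high nibble first.  A zero
+ * entry (or 0xffff at the top level) means there is no decomposition.  In
+ * the final entry the low two bits give the length of the decomposition
+ * (1-3 code points) and the remaining bits, taken as off / 4, give the
+ * index of its first code point in the table.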
*/ +static inline u16 *decompose_unichar(wchar_t uc, int *size) +{ + int off; + + off = hfsplus_decompose_table[(uc >> 12) & 0xf]; + if (off == 0 || off == 0xffff) + return NULL; + + off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)]; + if (!off) + return NULL; + + off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)]; + if (!off) + return NULL; + + off = hfsplus_decompose_table[off + (uc & 0xf)]; + *size = off & 3; + if (*size == 0) + return NULL; + return hfsplus_decompose_table + (off / 4); +} + +int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, + const char *astr, int len) +{ + int size, dsize, decompose; + u16 *dstr, outlen = 0; + wchar_t c; + + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { + size = asc2unichar(sb, astr, len, &c); + + if (decompose && (dstr = decompose_unichar(c, &dsize))) { + if (outlen + dsize > HFSPLUS_MAX_STRLEN) + break; + do { + ustr->unicode[outlen++] = cpu_to_be16(*dstr++); + } while (--dsize > 0); + } else + ustr->unicode[outlen++] = cpu_to_be16(c); + + astr += size; + len -= size; + } + ustr->length = cpu_to_be16(outlen); + if (len > 0) + return -ENAMETOOLONG; + return 0; +} + +/* + * Hash a string to an integer as appropriate for the HFS+ filesystem. + * Composed unicode characters are decomposed and case-folding is performed + * if the appropriate bits are (un)set on the superblock. + */ +int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) +{ + struct super_block *sb = dentry->d_sb; + const char *astr; + const u16 *dstr; + int casefold, decompose, size, len; + unsigned long hash; + wchar_t c; + u16 c2; + + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + hash = init_name_hash(); + astr = str->name; + len = str->len; + while (len > 0) { + int uninitialized_var(dsize); + size = asc2unichar(sb, astr, len, &c); + astr += size; + len -= size; + + if (decompose && (dstr = decompose_unichar(c, &dsize))) { + do { + c2 = *dstr++; + if (!casefold || (c2 = case_fold(c2))) + hash = partial_name_hash(c2, hash); + } while (--dsize > 0); + } else { + c2 = c; + if (!casefold || (c2 = case_fold(c2))) + hash = partial_name_hash(c2, hash); + } + } + str->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare strings with HFS+ filename ordering. + * Composed unicode characters are decomposed and case-folding is performed + * if the appropriate bits are (un)set on the superblock. 
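+ *
+ * Comparison proceeds one decomposed code point at a time: characters that
+ * case-fold to 0 are ignorable and skipped (when folding is enabled), the
+ * first differing code point decides the ordering, and if one name is a
+ * prefix of the other the shorter name sorts first.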
+ */ +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct super_block *sb = parent->d_sb; + int casefold, decompose, size; + int dsize1, dsize2, len1, len2; + const u16 *dstr1, *dstr2; + const char *astr1, *astr2; + u16 c1, c2; + wchar_t c; + + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); + astr1 = str; + len1 = len; + astr2 = name->name; + len2 = name->len; + dsize1 = dsize2 = 0; + dstr1 = dstr2 = NULL; + + while (len1 > 0 && len2 > 0) { + if (!dsize1) { + size = asc2unichar(sb, astr1, len1, &c); + astr1 += size; + len1 -= size; + + if (decompose) + dstr1 = decompose_unichar(c, &dsize1); + if (!decompose || !dstr1) { + c1 = c; + dstr1 = &c1; + dsize1 = 1; + } + } + + if (!dsize2) { + size = asc2unichar(sb, astr2, len2, &c); + astr2 += size; + len2 -= size; + + if (decompose) + dstr2 = decompose_unichar(c, &dsize2); + if (!decompose || !dstr2) { + c2 = c; + dstr2 = &c2; + dsize2 = 1; + } + } + + c1 = *dstr1; + c2 = *dstr2; + if (casefold) { + if (!(c1 = case_fold(c1))) { + dstr1++; + dsize1--; + continue; + } + if (!(c2 = case_fold(c2))) { + dstr2++; + dsize2--; + continue; + } + } + if (c1 < c2) + return -1; + else if (c1 > c2) + return 1; + + dstr1++; + dsize1--; + dstr2++; + dsize2--; + } + + if (len1 < len2) + return -1; + if (len1 > len2) + return 1; + return 0; +} diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c new file mode 100644 index 00000000..7b8112da --- /dev/null +++ b/fs/hfsplus/wrapper.c @@ -0,0 +1,283 @@ +/* + * linux/fs/hfsplus/wrapper.c + * + * Copyright (C) 2001 + * Brad Boyer (flar@allandria.com) + * (C) 2003 Ardis Technologies + * + * Handling of HFS wrappers around HFS+ volumes + */ + +#include +#include +#include +#include +#include + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +struct hfsplus_wd { + u32 ablk_size; + u16 ablk_start; + u16 embed_start; + u16 embed_count; +}; + +static void hfsplus_end_io_sync(struct bio *bio, int err) +{ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + complete(bio->bi_private); +} + +/* + * hfsplus_submit_bio - Perfrom block I/O + * @sb: super block of volume for I/O + * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes + * @buf: buffer for I/O + * @data: output pointer for location of requested data + * @rw: direction of I/O + * + * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than + * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads + * @data will return a pointer to the start of the requested sector, + * which may not be the same location as @buf. + * + * If @sector is not aligned to the bdev logical block size it will + * be rounded down. For writes this means that @buf should contain data + * that starts at the rounded-down address. As long as the data was + * read using hfsplus_submit_bio() and the same buffer is used things + * will work correctly. + */ +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct bio *bio; + int ret = 0; + unsigned int io_size; + loff_t start; + int offset; + + /* + * Align sector to hardware sector size and find offset. We + * assume that io_size is a power of two, which _should_ + * be true. 
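+ *
+ * Worked example (illustrative numbers): with io_size = 4096 and a request
+ * for sector 11, start = 11 * 512 = 5632, offset = 5632 & 4095 = 1536, and
+ * sector is rounded down to 8, the start of the containing 4096-byte block,
+ * so the requested data ends up 1536 bytes into @buf.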
+ */ + io_size = hfsplus_min_io_size(sb); + start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; + offset = start & (io_size - 1); + sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); + + bio = bio_alloc(GFP_NOIO, 1); + bio->bi_sector = sector; + bio->bi_bdev = sb->s_bdev; + bio->bi_end_io = hfsplus_end_io_sync; + bio->bi_private = &wait; + + if (!(rw & WRITE) && data) + *data = (u8 *)buf + offset; + + while (io_size > 0) { + unsigned int page_offset = offset_in_page(buf); + unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset, + io_size); + + ret = bio_add_page(bio, virt_to_page(buf), len, page_offset); + if (ret != len) { + ret = -EIO; + goto out; + } + io_size -= len; + buf = (u8 *)buf + len; + } + + submit_bio(rw, bio); + wait_for_completion(&wait); + + if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + +out: + bio_put(bio); + return ret < 0 ? ret : 0; +} + +static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) +{ + u32 extent; + u16 attrib; + __be16 sig; + + sig = *(__be16 *)(bufptr + HFSP_WRAPOFF_EMBEDSIG); + if (sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIG) && + sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) + return 0; + + attrib = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ATTRIB)); + if (!(attrib & HFSP_WRAP_ATTRIB_SLOCK) || + !(attrib & HFSP_WRAP_ATTRIB_SPARED)) + return 0; + + wd->ablk_size = + be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE)); + if (wd->ablk_size < HFSPLUS_SECTOR_SIZE) + return 0; + if (wd->ablk_size % HFSPLUS_SECTOR_SIZE) + return 0; + wd->ablk_start = + be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); + + extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); + wd->embed_start = (extent >> 16) & 0xFFFF; + wd->embed_count = extent & 0xFFFF; + + return 1; +} + +static int hfsplus_get_last_session(struct super_block *sb, + sector_t *start, sector_t *size) +{ + struct cdrom_multisession ms_info; + struct cdrom_tocentry te; + int res; + + /* default values */ + *start = 0; + *size = sb->s_bdev->bd_inode->i_size >> 9; + + if (HFSPLUS_SB(sb)->session >= 0) { + te.cdte_track = HFSPLUS_SB(sb)->session; + te.cdte_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, + CDROMREADTOCENTRY, (unsigned long)&te); + if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { + *start = (sector_t)te.cdte_addr.lba << 2; + return 0; + } + printk(KERN_ERR "hfs: invalid session number or type of track\n"); + return -EINVAL; + } + ms_info.addr_format = CDROM_LBA; + res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, + (unsigned long)&ms_info); + if (!res && ms_info.xa_flag) + *start = (sector_t)ms_info.addr.lba << 2; + return 0; +} + +/* Find the volume header and fill in some minimum bits in superblock */ +/* Takes in super block, returns true if good data read */ +int hfsplus_read_wrapper(struct super_block *sb) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_wd wd; + sector_t part_start, part_size; + u32 blocksize; + int error = 0; + + error = -EINVAL; + blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE); + if (!blocksize) + goto out; + + if (hfsplus_get_last_session(sb, &part_start, &part_size)) + goto out; + if ((u64)part_start + part_size > 0x100000000ULL) { + pr_err("hfs: volumes larger than 2TB are not supported yet\n"); + goto out; + } + + error = -ENOMEM; + sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_vhdr_buf) + goto out; + sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_backup_vhdr_buf) + goto out_free_vhdr; + +reread: + error = hfsplus_submit_bio(sb, 
part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, + READ); + if (error) + goto out_free_backup_vhdr; + + error = -EINVAL; + switch (sbi->s_vhdr->signature) { + case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX): + set_bit(HFSPLUS_SB_HFSX, &sbi->flags); + /*FALLTHRU*/ + case cpu_to_be16(HFSPLUS_VOLHEAD_SIG): + break; + case cpu_to_be16(HFSP_WRAP_MAGIC): + if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) + goto out_free_backup_vhdr; + wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; + part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; + part_size = wd.embed_count * wd.ablk_size; + goto reread; + default: + /* + * Check for a partition block. + * + * (should do this only for cdrom/loop though) + */ + if (hfs_part_find(sb, &part_start, &part_size)) + goto out_free_backup_vhdr; + goto reread; + } + + error = hfsplus_submit_bio(sb, part_start + part_size - 2, + sbi->s_backup_vhdr_buf, + (void **)&sbi->s_backup_vhdr, READ); + if (error) + goto out_free_backup_vhdr; + + error = -EINVAL; + if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) { + printk(KERN_WARNING + "hfs: invalid secondary volume header\n"); + goto out_free_backup_vhdr; + } + + blocksize = be32_to_cpu(sbi->s_vhdr->blocksize); + + /* + * Block size must be at least as large as a sector and a multiple of 2. + */ + if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize)) + goto out_free_backup_vhdr; + sbi->alloc_blksz = blocksize; + sbi->alloc_blksz_shift = 0; + while ((blocksize >>= 1) != 0) + sbi->alloc_blksz_shift++; + blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE); + + /* + * Align block size to block offset. + */ + while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) + blocksize >>= 1; + + if (sb_set_blocksize(sb, blocksize) != blocksize) { + printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", + blocksize); + goto out_free_backup_vhdr; + } + + sbi->blockoffset = + part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); + sbi->part_start = part_start; + sbi->sect_count = part_size; + sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; + return 0; + +out_free_backup_vhdr: + kfree(sbi->s_backup_vhdr_buf); +out_free_vhdr: + kfree(sbi->s_vhdr_buf); +out: + return error; +} diff --git a/fs/hostfs/Makefile b/fs/hostfs/Makefile new file mode 100644 index 00000000..d5beaffa --- /dev/null +++ b/fs/hostfs/Makefile @@ -0,0 +1,11 @@ +# +# Copyright (C) 2000 Jeff Dike (jdike@karaya.com) +# Licensed under the GPL +# + +hostfs-objs := hostfs_kern.o hostfs_user.o + +obj-y := +obj-$(CONFIG_HOSTFS) += hostfs.o + +include arch/um/scripts/Makefile.rules diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h new file mode 100644 index 00000000..bf15a430 --- /dev/null +++ b/fs/hostfs/hostfs.h @@ -0,0 +1,96 @@ +#ifndef __UM_FS_HOSTFS +#define __UM_FS_HOSTFS + +#include "os.h" + +/* + * These are exactly the same definitions as in fs.h, but the names are + * changed so that this file can be included in both kernel and user files. + */ + +#define HOSTFS_ATTR_MODE 1 +#define HOSTFS_ATTR_UID 2 +#define HOSTFS_ATTR_GID 4 +#define HOSTFS_ATTR_SIZE 8 +#define HOSTFS_ATTR_ATIME 16 +#define HOSTFS_ATTR_MTIME 32 +#define HOSTFS_ATTR_CTIME 64 +#define HOSTFS_ATTR_ATIME_SET 128 +#define HOSTFS_ATTR_MTIME_SET 256 + +/* These two are unused by hostfs. 
*/ +#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ +#define HOSTFS_ATTR_ATTR_FLAG 1024 + +/* + * If you are very careful, you'll notice that these two are missing: + * + * #define ATTR_KILL_SUID 2048 + * #define ATTR_KILL_SGID 4096 + * + * and this is because they were added in 2.5 development. + * Actually, they are not needed by most ->setattr() methods - they are set by + * callers of notify_change() to notify that the setuid/setgid bits must be + * dropped. + * notify_change() will delete those flags, make sure attr->ia_valid & ATTR_MODE + * is on, and remove the appropriate bits from attr->ia_mode (attr is a + * "struct iattr *"). -BlaisorBlade + */ + +struct hostfs_iattr { + unsigned int ia_valid; + mode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + struct timespec ia_atime; + struct timespec ia_mtime; + struct timespec ia_ctime; +}; + +struct hostfs_stat { + unsigned long long ino; + unsigned int mode; + unsigned int nlink; + unsigned int uid; + unsigned int gid; + unsigned long long size; + struct timespec atime, mtime, ctime; + unsigned int blksize; + unsigned long long blocks; + unsigned int maj; + unsigned int min; +}; + +extern int stat_file(const char *path, struct hostfs_stat *p, int fd); +extern int access_file(char *path, int r, int w, int x); +extern int open_file(char *path, int r, int w, int append); +extern void *open_dir(char *path, int *err_out); +extern char *read_dir(void *stream, unsigned long long *pos, + unsigned long long *ino_out, int *len_out); +extern void close_file(void *stream); +extern int replace_file(int oldfd, int fd); +extern void close_dir(void *stream); +extern int read_file(int fd, unsigned long long *offset, char *buf, int len); +extern int write_file(int fd, unsigned long long *offset, const char *buf, + int len); +extern int lseek_file(int fd, long long offset, int whence); +extern int fsync_file(int fd, int datasync); +extern int file_create(char *name, int ur, int uw, int ux, int gr, + int gw, int gx, int or, int ow, int ox); +extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd); +extern int make_symlink(const char *from, const char *to); +extern int unlink_file(const char *file); +extern int do_mkdir(const char *file, int mode); +extern int do_rmdir(const char *file); +extern int do_mknod(const char *file, int mode, unsigned int major, + unsigned int minor); +extern int link_file(const char *from, const char *to); +extern int hostfs_do_readlink(char *file, char *buf, int size); +extern int rename_file(char *from, char *to); +extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, + long long *bfree_out, long long *bavail_out, + long long *files_out, long long *ffree_out, + void *fsid_out, int fsid_size, long *namelen_out); + +#endif diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c new file mode 100644 index 00000000..2638c834 --- /dev/null +++ b/fs/hostfs/hostfs_kern.c @@ -0,0 +1,1004 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + * + * Ported the filesystem routines to 2.5. 
+ * 2003-02-10 Petr Baudis + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hostfs.h" +#include "init.h" +#include "kern.h" + +struct hostfs_inode_info { + int fd; + fmode_t mode; + struct inode vfs_inode; +}; + +static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) +{ + return list_entry(inode, struct hostfs_inode_info, vfs_inode); +} + +#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) + +static int hostfs_d_delete(const struct dentry *dentry) +{ + return 1; +} + +static const struct dentry_operations hostfs_dentry_ops = { + .d_delete = hostfs_d_delete, +}; + +/* Changed in hostfs_args before the kernel starts running */ +static char *root_ino = ""; +static int append = 0; + +#define HOSTFS_SUPER_MAGIC 0x00c0ffee + +static const struct inode_operations hostfs_iops; +static const struct inode_operations hostfs_dir_iops; +static const struct inode_operations hostfs_link_iops; + +#ifndef MODULE +static int __init hostfs_args(char *options, int *add) +{ + char *ptr; + + ptr = strchr(options, ','); + if (ptr != NULL) + *ptr++ = '\0'; + if (*options != '\0') + root_ino = options; + + options = ptr; + while (options) { + ptr = strchr(options, ','); + if (ptr != NULL) + *ptr++ = '\0'; + if (*options != '\0') { + if (!strcmp(options, "append")) + append = 1; + else printf("hostfs_args - unsupported option - %s\n", + options); + } + options = ptr; + } + return 0; +} + +__uml_setup("hostfs=", hostfs_args, +"hostfs=,,...\n" +" This is used to set hostfs parameters. The root directory argument\n" +" is used to confine all hostfs mounts to within the specified directory\n" +" tree on the host. If this isn't specified, then a user inside UML can\n" +" mount anything on the host that's accessible to the user that's running\n" +" it.\n" +" The only flag currently supported is 'append', which specifies that all\n" +" files opened by hostfs will be opened in append mode.\n\n" +); +#endif + +static char *__dentry_name(struct dentry *dentry, char *name) +{ + char *p = dentry_path_raw(dentry, name, PATH_MAX); + char *root; + size_t len; + + root = dentry->d_sb->s_fs_info; + len = strlen(root); + if (IS_ERR(p)) { + __putname(name); + return NULL; + } + strlcpy(name, root, PATH_MAX); + if (len > p - name) { + __putname(name); + return NULL; + } + if (p > name + len) { + char *s = name + len; + while ((*s++ = *p++) != '\0') + ; + } + return name; +} + +static char *dentry_name(struct dentry *dentry) +{ + char *name = __getname(); + if (!name) + return NULL; + + return __dentry_name(dentry, name); /* will unlock */ +} + +static char *inode_name(struct inode *ino) +{ + struct dentry *dentry; + char *name; + + dentry = d_find_alias(ino); + if (!dentry) + return NULL; + + name = dentry_name(dentry); + + dput(dentry); + + return name; +} + +static char *follow_link(char *link) +{ + int len, n; + char *name, *resolved, *end; + + len = 64; + while (1) { + n = -ENOMEM; + name = kmalloc(len, GFP_KERNEL); + if (name == NULL) + goto out; + + n = hostfs_do_readlink(link, name, len); + if (n < len) + break; + len *= 2; + kfree(name); + } + if (n < 0) + goto out_free; + + if (*name == '/') + return name; + + end = strrchr(link, '/'); + if (end == NULL) + return name; + + *(end + 1) = '\0'; + len = strlen(link) + strlen(name) + 1; + + resolved = kmalloc(len, GFP_KERNEL); + if (resolved == NULL) { + n = -ENOMEM; + goto out_free; + } + + sprintf(resolved, "%s%s", link, name); + kfree(name); + kfree(link); + return resolved; + + 
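+ /*
+  * Error exits: free the partially read link target, if any, and hand the
+  * error back to the caller as an ERR_PTR.
+  */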
out_free: + kfree(name); + out: + return ERR_PTR(n); +} + +static struct inode *hostfs_iget(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + return inode; +} + +int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) +{ + /* + * do_statfs uses struct statfs64 internally, but the linux kernel + * struct statfs still has 32-bit versions for most of these fields, + * so we convert them here + */ + int err; + long long f_blocks; + long long f_bfree; + long long f_bavail; + long long f_files; + long long f_ffree; + + err = do_statfs(dentry->d_sb->s_fs_info, + &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, + &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), + &sf->f_namelen); + if (err) + return err; + sf->f_blocks = f_blocks; + sf->f_bfree = f_bfree; + sf->f_bavail = f_bavail; + sf->f_files = f_files; + sf->f_ffree = f_ffree; + sf->f_type = HOSTFS_SUPER_MAGIC; + return 0; +} + +static struct inode *hostfs_alloc_inode(struct super_block *sb) +{ + struct hostfs_inode_info *hi; + + hi = kzalloc(sizeof(*hi), GFP_KERNEL); + if (hi == NULL) + return NULL; + hi->fd = -1; + inode_init_once(&hi->vfs_inode); + return &hi->vfs_inode; +} + +static void hostfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (HOSTFS_I(inode)->fd != -1) { + close_file(&HOSTFS_I(inode)->fd); + HOSTFS_I(inode)->fd = -1; + } +} + +static void hostfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kfree(HOSTFS_I(inode)); +} + +static void hostfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hostfs_i_callback); +} + +static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + const char *root_path = vfs->mnt_sb->s_fs_info; + size_t offset = strlen(root_ino) + 1; + + if (strlen(root_path) > offset) + seq_printf(seq, ",%s", root_path + offset); + + return 0; +} + +static const struct super_operations hostfs_sbops = { + .alloc_inode = hostfs_alloc_inode, + .destroy_inode = hostfs_destroy_inode, + .evict_inode = hostfs_evict_inode, + .statfs = hostfs_statfs, + .show_options = hostfs_show_options, +}; + +int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) +{ + void *dir; + char *name; + unsigned long long next, ino; + int error, len; + + name = dentry_name(file->f_path.dentry); + if (name == NULL) + return -ENOMEM; + dir = open_dir(name, &error); + __putname(name); + if (dir == NULL) + return -error; + next = file->f_pos; + while ((name = read_dir(dir, &next, &ino, &len)) != NULL) { + error = (*filldir)(ent, name, len, file->f_pos, + ino, DT_UNKNOWN); + if (error) break; + file->f_pos = next; + } + close_dir(dir); + return 0; +} + +int hostfs_file_open(struct inode *ino, struct file *file) +{ + static DEFINE_MUTEX(open_mutex); + char *name; + fmode_t mode = 0; + int err; + int r = 0, w = 0, fd; + + mode = file->f_mode & (FMODE_READ | FMODE_WRITE); + if ((mode & HOSTFS_I(ino)->mode) == mode) + return 0; + + mode |= HOSTFS_I(ino)->mode; + +retry: + if (mode & FMODE_READ) + r = 1; + if (mode & FMODE_WRITE) + w = 1; + if (w) + r = 1; + + name = dentry_name(file->f_path.dentry); + if (name == NULL) + return -ENOMEM; + + fd = open_file(name, r, w, append); + __putname(name); + if (fd < 0) + return fd; + + mutex_lock(&open_mutex); + /* somebody else had handled it first? 
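+ * Re-check under open_mutex: if the inode's recorded mode already covers
+ * the requested access we are done; if it has grown beyond what we opened
+ * the host file with, widen the mode and retry; otherwise our new fd is
+ * installed, or spliced over the existing one via replace_file().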
*/ + if ((mode & HOSTFS_I(ino)->mode) == mode) { + mutex_unlock(&open_mutex); + return 0; + } + if ((mode | HOSTFS_I(ino)->mode) != mode) { + mode |= HOSTFS_I(ino)->mode; + mutex_unlock(&open_mutex); + close_file(&fd); + goto retry; + } + if (HOSTFS_I(ino)->fd == -1) { + HOSTFS_I(ino)->fd = fd; + } else { + err = replace_file(fd, HOSTFS_I(ino)->fd); + close_file(&fd); + if (err < 0) { + mutex_unlock(&open_mutex); + return err; + } + } + HOSTFS_I(ino)->mode = mode; + mutex_unlock(&open_mutex); + + return 0; +} + +int hostfs_fsync(struct file *file, int datasync) +{ + return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync); +} + +static const struct file_operations hostfs_file_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .splice_read = generic_file_splice_read, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .write = do_sync_write, + .mmap = generic_file_mmap, + .open = hostfs_file_open, + .release = NULL, + .fsync = hostfs_fsync, +}; + +static const struct file_operations hostfs_dir_fops = { + .llseek = generic_file_llseek, + .readdir = hostfs_readdir, + .read = generic_read_dir, +}; + +int hostfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + char *buffer; + unsigned long long base; + int count = PAGE_CACHE_SIZE; + int end_index = inode->i_size >> PAGE_CACHE_SHIFT; + int err; + + if (page->index >= end_index) + count = inode->i_size & (PAGE_CACHE_SIZE-1); + + buffer = kmap(page); + base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT; + + err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); + if (err != count) { + ClearPageUptodate(page); + goto out; + } + + if (base > inode->i_size) + inode->i_size = base; + + if (PageError(page)) + ClearPageError(page); + err = 0; + + out: + kunmap(page); + + unlock_page(page); + return err; +} + +int hostfs_readpage(struct file *file, struct page *page) +{ + char *buffer; + long long start; + int err = 0; + + start = (long long) page->index << PAGE_CACHE_SHIFT; + buffer = kmap(page); + err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer, + PAGE_CACHE_SIZE); + if (err < 0) + goto out; + + memset(&buffer[err], 0, PAGE_CACHE_SIZE - err); + + flush_dcache_page(page); + SetPageUptodate(page); + if (PageError(page)) ClearPageError(page); + err = 0; + out: + kunmap(page); + unlock_page(page); + return err; +} + +int hostfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + *pagep = grab_cache_page_write_begin(mapping, index, flags); + if (!*pagep) + return -ENOMEM; + return 0; +} + +int hostfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + void *buffer; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int err; + + buffer = kmap(page); + err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied); + kunmap(page); + + if (!PageUptodate(page) && err == PAGE_CACHE_SIZE) + SetPageUptodate(page); + + /* + * If err > 0, write_file has added err to pos, so we are comparing + * i_size against the last byte written. 
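+ * (write_file() advances the offset it was passed by the number of bytes
+ * actually written on the host, so on return pos points just past the end
+ * of the data that reached the file.)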
+ */ + if (err > 0 && (pos > inode->i_size)) + inode->i_size = pos; + unlock_page(page); + page_cache_release(page); + + return err; +} + +static const struct address_space_operations hostfs_aops = { + .writepage = hostfs_writepage, + .readpage = hostfs_readpage, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = hostfs_write_begin, + .write_end = hostfs_write_end, +}; + +static int read_name(struct inode *ino, char *name) +{ + dev_t rdev; + struct hostfs_stat st; + int err = stat_file(name, &st, -1); + if (err) + return err; + + /* Reencode maj and min with the kernel encoding.*/ + rdev = MKDEV(st.maj, st.min); + + switch (st.mode & S_IFMT) { + case S_IFLNK: + ino->i_op = &hostfs_link_iops; + break; + case S_IFDIR: + ino->i_op = &hostfs_dir_iops; + ino->i_fop = &hostfs_dir_fops; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + init_special_inode(ino, st.mode & S_IFMT, rdev); + ino->i_op = &hostfs_iops; + break; + + default: + ino->i_op = &hostfs_iops; + ino->i_fop = &hostfs_file_fops; + ino->i_mapping->a_ops = &hostfs_aops; + } + + ino->i_ino = st.ino; + ino->i_mode = st.mode; + ino->i_nlink = st.nlink; + ino->i_uid = st.uid; + ino->i_gid = st.gid; + ino->i_atime = st.atime; + ino->i_mtime = st.mtime; + ino->i_ctime = st.ctime; + ino->i_size = st.size; + ino->i_blocks = st.blocks; + return 0; +} + +int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + char *name; + int error, fd; + + inode = hostfs_iget(dir->i_sb); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto out; + } + + error = -ENOMEM; + name = dentry_name(dentry); + if (name == NULL) + goto out_put; + + fd = file_create(name, + mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR, + mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP, + mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); + if (fd < 0) + error = fd; + else + error = read_name(inode, name); + + __putname(name); + if (error) + goto out_put; + + HOSTFS_I(inode)->fd = fd; + HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE; + d_instantiate(dentry, inode); + return 0; + + out_put: + iput(inode); + out: + return error; +} + +struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + char *name; + int err; + + inode = hostfs_iget(ino->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + err = -ENOMEM; + name = dentry_name(dentry); + if (name == NULL) + goto out_put; + + err = read_name(inode, name); + + __putname(name); + if (err == -ENOENT) { + iput(inode); + inode = NULL; + } + else if (err) + goto out_put; + + d_add(dentry, inode); + return NULL; + + out_put: + iput(inode); + out: + return ERR_PTR(err); +} + +int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) +{ + char *from_name, *to_name; + int err; + + if ((from_name = dentry_name(from)) == NULL) + return -ENOMEM; + to_name = dentry_name(to); + if (to_name == NULL) { + __putname(from_name); + return -ENOMEM; + } + err = link_file(to_name, from_name); + __putname(from_name); + __putname(to_name); + return err; +} + +int hostfs_unlink(struct inode *ino, struct dentry *dentry) +{ + char *file; + int err; + + if (append) + return -EPERM; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + + err = unlink_file(file); + __putname(file); + return err; +} + +int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) +{ + char *file; + int err; + + if ((file = dentry_name(dentry)) == 
NULL) + return -ENOMEM; + err = make_symlink(file, to); + __putname(file); + return err; +} + +int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode) +{ + char *file; + int err; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + err = do_mkdir(file, mode); + __putname(file); + return err; +} + +int hostfs_rmdir(struct inode *ino, struct dentry *dentry) +{ + char *file; + int err; + + if ((file = dentry_name(dentry)) == NULL) + return -ENOMEM; + err = do_rmdir(file); + __putname(file); + return err; +} + +int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct inode *inode; + char *name; + int err; + + inode = hostfs_iget(dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + err = -ENOMEM; + name = dentry_name(dentry); + if (name == NULL) + goto out_put; + + init_special_inode(inode, mode, dev); + err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); + if (!err) + goto out_free; + + err = read_name(inode, name); + __putname(name); + if (err) + goto out_put; + if (err) + goto out_put; + + d_instantiate(dentry, inode); + return 0; + + out_free: + __putname(name); + out_put: + iput(inode); + out: + return err; +} + +int hostfs_rename(struct inode *from_ino, struct dentry *from, + struct inode *to_ino, struct dentry *to) +{ + char *from_name, *to_name; + int err; + + if ((from_name = dentry_name(from)) == NULL) + return -ENOMEM; + if ((to_name = dentry_name(to)) == NULL) { + __putname(from_name); + return -ENOMEM; + } + err = rename_file(from_name, to_name); + __putname(from_name); + __putname(to_name); + return err; +} + +int hostfs_permission(struct inode *ino, int desired, unsigned int flags) +{ + char *name; + int r = 0, w = 0, x = 0, err; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + if (desired & MAY_READ) r = 1; + if (desired & MAY_WRITE) w = 1; + if (desired & MAY_EXEC) x = 1; + name = inode_name(ino); + if (name == NULL) + return -ENOMEM; + + if (S_ISCHR(ino->i_mode) || S_ISBLK(ino->i_mode) || + S_ISFIFO(ino->i_mode) || S_ISSOCK(ino->i_mode)) + err = 0; + else + err = access_file(name, r, w, x); + __putname(name); + if (!err) + err = generic_permission(ino, desired, flags, NULL); + return err; +} + +int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct hostfs_iattr attrs; + char *name; + int err; + + int fd = HOSTFS_I(inode)->fd; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (append) + attr->ia_valid &= ~ATTR_SIZE; + + attrs.ia_valid = 0; + if (attr->ia_valid & ATTR_MODE) { + attrs.ia_valid |= HOSTFS_ATTR_MODE; + attrs.ia_mode = attr->ia_mode; + } + if (attr->ia_valid & ATTR_UID) { + attrs.ia_valid |= HOSTFS_ATTR_UID; + attrs.ia_uid = attr->ia_uid; + } + if (attr->ia_valid & ATTR_GID) { + attrs.ia_valid |= HOSTFS_ATTR_GID; + attrs.ia_gid = attr->ia_gid; + } + if (attr->ia_valid & ATTR_SIZE) { + attrs.ia_valid |= HOSTFS_ATTR_SIZE; + attrs.ia_size = attr->ia_size; + } + if (attr->ia_valid & ATTR_ATIME) { + attrs.ia_valid |= HOSTFS_ATTR_ATIME; + attrs.ia_atime = attr->ia_atime; + } + if (attr->ia_valid & ATTR_MTIME) { + attrs.ia_valid |= HOSTFS_ATTR_MTIME; + attrs.ia_mtime = attr->ia_mtime; + } + if (attr->ia_valid & ATTR_CTIME) { + attrs.ia_valid |= HOSTFS_ATTR_CTIME; + attrs.ia_ctime = attr->ia_ctime; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; + } + if (attr->ia_valid & ATTR_MTIME_SET) { + attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; + } + name = dentry_name(dentry); 
+ if (name == NULL) + return -ENOMEM; + err = set_attr(name, &attrs, fd); + __putname(name); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + int error; + + error = vmtruncate(inode, attr->ia_size); + if (err) + return err; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static const struct inode_operations hostfs_iops = { + .create = hostfs_create, + .link = hostfs_link, + .unlink = hostfs_unlink, + .symlink = hostfs_symlink, + .mkdir = hostfs_mkdir, + .rmdir = hostfs_rmdir, + .mknod = hostfs_mknod, + .rename = hostfs_rename, + .permission = hostfs_permission, + .setattr = hostfs_setattr, +}; + +static const struct inode_operations hostfs_dir_iops = { + .create = hostfs_create, + .lookup = hostfs_lookup, + .link = hostfs_link, + .unlink = hostfs_unlink, + .symlink = hostfs_symlink, + .mkdir = hostfs_mkdir, + .rmdir = hostfs_rmdir, + .mknod = hostfs_mknod, + .rename = hostfs_rename, + .permission = hostfs_permission, + .setattr = hostfs_setattr, +}; + +static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *link = __getname(); + if (link) { + char *path = dentry_name(dentry); + int err = -ENOMEM; + if (path) { + err = hostfs_do_readlink(path, link, PATH_MAX); + if (err == PATH_MAX) + err = -E2BIG; + __putname(path); + } + if (err < 0) { + __putname(link); + link = ERR_PTR(err); + } + } else { + link = ERR_PTR(-ENOMEM); + } + + nd_set_link(nd, link); + return NULL; +} + +static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); +} + +static const struct inode_operations hostfs_link_iops = { + .readlink = generic_readlink, + .follow_link = hostfs_follow_link, + .put_link = hostfs_put_link, +}; + +static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) +{ + struct inode *root_inode; + char *host_root_path, *req_root = d; + int err; + + sb->s_blocksize = 1024; + sb->s_blocksize_bits = 10; + sb->s_magic = HOSTFS_SUPER_MAGIC; + sb->s_op = &hostfs_sbops; + sb->s_d_op = &hostfs_dentry_ops; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + /* NULL is printed as by sprintf: avoid that. 
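+ * A NULL mount-data string would otherwise be rendered as a placeholder
+ * token rather than an empty string in the host path assembled by sprintf()
+ * below, so substitute an empty string first.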
*/ + if (req_root == NULL) + req_root = ""; + + err = -ENOMEM; + sb->s_fs_info = host_root_path = + kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL); + if (host_root_path == NULL) + goto out; + + sprintf(host_root_path, "%s/%s", root_ino, req_root); + + root_inode = new_inode(sb); + if (!root_inode) + goto out; + + err = read_name(root_inode, host_root_path); + if (err) + goto out_put; + + if (S_ISLNK(root_inode->i_mode)) { + char *name = follow_link(host_root_path); + if (IS_ERR(name)) + err = PTR_ERR(name); + else + err = read_name(root_inode, name); + kfree(name); + if (err) + goto out_put; + } + + err = -ENOMEM; + sb->s_root = d_alloc_root(root_inode); + if (sb->s_root == NULL) + goto out_put; + + return 0; + +out_put: + iput(root_inode); +out: + return err; +} + +static struct dentry *hostfs_read_sb(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + return mount_nodev(type, flags, data, hostfs_fill_sb_common); +} + +static void hostfs_kill_sb(struct super_block *s) +{ + kill_anon_super(s); + kfree(s->s_fs_info); +} + +static struct file_system_type hostfs_type = { + .owner = THIS_MODULE, + .name = "hostfs", + .mount = hostfs_read_sb, + .kill_sb = hostfs_kill_sb, + .fs_flags = 0, +}; + +static int __init init_hostfs(void) +{ + return register_filesystem(&hostfs_type); +} + +static void __exit exit_hostfs(void) +{ + unregister_filesystem(&hostfs_type); +} + +module_init(init_hostfs) +module_exit(exit_hostfs) +MODULE_LICENSE("GPL"); diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c new file mode 100644 index 00000000..d51a9838 --- /dev/null +++ b/fs/hostfs/hostfs_user.c @@ -0,0 +1,387 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hostfs.h" +#include "os.h" +#include "user.h" +#include + +static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) +{ + p->ino = buf->st_ino; + p->mode = buf->st_mode; + p->nlink = buf->st_nlink; + p->uid = buf->st_uid; + p->gid = buf->st_gid; + p->size = buf->st_size; + p->atime.tv_sec = buf->st_atime; + p->atime.tv_nsec = 0; + p->ctime.tv_sec = buf->st_ctime; + p->ctime.tv_nsec = 0; + p->mtime.tv_sec = buf->st_mtime; + p->mtime.tv_nsec = 0; + p->blksize = buf->st_blksize; + p->blocks = buf->st_blocks; + p->maj = os_major(buf->st_rdev); + p->min = os_minor(buf->st_rdev); +} + +int stat_file(const char *path, struct hostfs_stat *p, int fd) +{ + struct stat64 buf; + + if (fd >= 0) { + if (fstat64(fd, &buf) < 0) + return -errno; + } else if (lstat64(path, &buf) < 0) { + return -errno; + } + stat64_to_hostfs(&buf, p); + return 0; +} + +int access_file(char *path, int r, int w, int x) +{ + int mode = 0; + + if (r) + mode = R_OK; + if (w) + mode |= W_OK; + if (x) + mode |= X_OK; + if (access(path, mode) != 0) + return -errno; + else return 0; +} + +int open_file(char *path, int r, int w, int append) +{ + int mode = 0, fd; + + if (r && !w) + mode = O_RDONLY; + else if (!r && w) + mode = O_WRONLY; + else if (r && w) + mode = O_RDWR; + else panic("Impossible mode in open_file"); + + if (append) + mode |= O_APPEND; + fd = open64(path, mode); + if (fd < 0) + return -errno; + else return fd; +} + +void *open_dir(char *path, int *err_out) +{ + DIR *dir; + + dir = opendir(path); + *err_out = errno; + + return dir; +} + +char *read_dir(void *stream, unsigned long long *pos, + unsigned long long *ino_out, int 
*len_out) +{ + DIR *dir = stream; + struct dirent *ent; + + seekdir(dir, *pos); + ent = readdir(dir); + if (ent == NULL) + return NULL; + *len_out = strlen(ent->d_name); + *ino_out = ent->d_ino; + *pos = telldir(dir); + return ent->d_name; +} + +int read_file(int fd, unsigned long long *offset, char *buf, int len) +{ + int n; + + n = pread64(fd, buf, len, *offset); + if (n < 0) + return -errno; + *offset += n; + return n; +} + +int write_file(int fd, unsigned long long *offset, const char *buf, int len) +{ + int n; + + n = pwrite64(fd, buf, len, *offset); + if (n < 0) + return -errno; + *offset += n; + return n; +} + +int lseek_file(int fd, long long offset, int whence) +{ + int ret; + + ret = lseek64(fd, offset, whence); + if (ret < 0) + return -errno; + return 0; +} + +int fsync_file(int fd, int datasync) +{ + int ret; + if (datasync) + ret = fdatasync(fd); + else + ret = fsync(fd); + + if (ret < 0) + return -errno; + return 0; +} + +int replace_file(int oldfd, int fd) +{ + return dup2(oldfd, fd); +} + +void close_file(void *stream) +{ + close(*((int *) stream)); +} + +void close_dir(void *stream) +{ + closedir(stream); +} + +int file_create(char *name, int ur, int uw, int ux, int gr, + int gw, int gx, int or, int ow, int ox) +{ + int mode, fd; + + mode = 0; + mode |= ur ? S_IRUSR : 0; + mode |= uw ? S_IWUSR : 0; + mode |= ux ? S_IXUSR : 0; + mode |= gr ? S_IRGRP : 0; + mode |= gw ? S_IWGRP : 0; + mode |= gx ? S_IXGRP : 0; + mode |= or ? S_IROTH : 0; + mode |= ow ? S_IWOTH : 0; + mode |= ox ? S_IXOTH : 0; + fd = open64(name, O_CREAT | O_RDWR, mode); + if (fd < 0) + return -errno; + return fd; +} + +int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) +{ + struct hostfs_stat st; + struct timeval times[2]; + int err, ma; + + if (attrs->ia_valid & HOSTFS_ATTR_MODE) { + if (fd >= 0) { + if (fchmod(fd, attrs->ia_mode) != 0) + return -errno; + } else if (chmod(file, attrs->ia_mode) != 0) { + return -errno; + } + } + if (attrs->ia_valid & HOSTFS_ATTR_UID) { + if (fd >= 0) { + if (fchown(fd, attrs->ia_uid, -1)) + return -errno; + } else if (chown(file, attrs->ia_uid, -1)) { + return -errno; + } + } + if (attrs->ia_valid & HOSTFS_ATTR_GID) { + if (fd >= 0) { + if (fchown(fd, -1, attrs->ia_gid)) + return -errno; + } else if (chown(file, -1, attrs->ia_gid)) { + return -errno; + } + } + if (attrs->ia_valid & HOSTFS_ATTR_SIZE) { + if (fd >= 0) { + if (ftruncate(fd, attrs->ia_size)) + return -errno; + } else if (truncate(file, attrs->ia_size)) { + return -errno; + } + } + + /* + * Update accessed and/or modified time, in two parts: first set + * times according to the changes to perform, and then call futimes() + * or utimes() to apply them. 
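+ *
+ * times[0] holds the access time and times[1] the modification time; both
+ * are seeded with the file's current values from stat_file() so that
+ * setting only one of them leaves the other unchanged, and tv_nsec is
+ * converted to the microseconds expected by the timeval-based calls.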
+ */ + ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET); + if (attrs->ia_valid & ma) { + err = stat_file(file, &st, fd); + if (err != 0) + return err; + + times[0].tv_sec = st.atime.tv_sec; + times[0].tv_usec = st.atime.tv_nsec / 1000; + times[1].tv_sec = st.mtime.tv_sec; + times[1].tv_usec = st.mtime.tv_nsec / 1000; + + if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) { + times[0].tv_sec = attrs->ia_atime.tv_sec; + times[0].tv_usec = attrs->ia_atime.tv_nsec / 1000; + } + if (attrs->ia_valid & HOSTFS_ATTR_MTIME_SET) { + times[1].tv_sec = attrs->ia_mtime.tv_sec; + times[1].tv_usec = attrs->ia_mtime.tv_nsec / 1000; + } + + if (fd >= 0) { + if (futimes(fd, times) != 0) + return -errno; + } else if (utimes(file, times) != 0) { + return -errno; + } + } + + /* Note: ctime is not handled */ + if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) { + err = stat_file(file, &st, fd); + attrs->ia_atime = st.atime; + attrs->ia_mtime = st.mtime; + if (err != 0) + return err; + } + return 0; +} + +int make_symlink(const char *from, const char *to) +{ + int err; + + err = symlink(to, from); + if (err) + return -errno; + return 0; +} + +int unlink_file(const char *file) +{ + int err; + + err = unlink(file); + if (err) + return -errno; + return 0; +} + +int do_mkdir(const char *file, int mode) +{ + int err; + + err = mkdir(file, mode); + if (err) + return -errno; + return 0; +} + +int do_rmdir(const char *file) +{ + int err; + + err = rmdir(file); + if (err) + return -errno; + return 0; +} + +int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor) +{ + int err; + + err = mknod(file, mode, os_makedev(major, minor)); + if (err) + return -errno; + return 0; +} + +int link_file(const char *to, const char *from) +{ + int err; + + err = link(to, from); + if (err) + return -errno; + return 0; +} + +int hostfs_do_readlink(char *file, char *buf, int size) +{ + int n; + + n = readlink(file, buf, size); + if (n < 0) + return -errno; + if (n < size) + buf[n] = '\0'; + return n; +} + +int rename_file(char *from, char *to) +{ + int err; + + err = rename(from, to); + if (err < 0) + return -errno; + return 0; +} + +int do_statfs(char *root, long *bsize_out, long long *blocks_out, + long long *bfree_out, long long *bavail_out, + long long *files_out, long long *ffree_out, + void *fsid_out, int fsid_size, long *namelen_out) +{ + struct statfs64 buf; + int err; + + err = statfs64(root, &buf); + if (err < 0) + return -errno; + + *bsize_out = buf.f_bsize; + *blocks_out = buf.f_blocks; + *bfree_out = buf.f_bfree; + *bavail_out = buf.f_bavail; + *files_out = buf.f_files; + *ffree_out = buf.f_ffree; + memcpy(fsid_out, &buf.f_fsid, + sizeof(buf.f_fsid) > fsid_size ? fsid_size : + sizeof(buf.f_fsid)); + *namelen_out = buf.f_namelen; + + return 0; +} diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig new file mode 100644 index 00000000..56bd15c5 --- /dev/null +++ b/fs/hpfs/Kconfig @@ -0,0 +1,14 @@ +config HPFS_FS + tristate "OS/2 HPFS file system support" + depends on BLOCK + help + OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS + is the file system used for organizing files on OS/2 hard disk + partitions. Say Y if you want to be able to read files from and + write files to an OS/2 HPFS partition on your hard drive. OS/2 + floppies however are in regular MSDOS format, so you don't need this + option in order to be able to read them. Read + . + + To compile this file system support as a module, choose M here: the + module will be called hpfs. If unsure, say N. 
diff --git a/fs/hpfs/Makefile b/fs/hpfs/Makefile new file mode 100644 index 00000000..57b786fb --- /dev/null +++ b/fs/hpfs/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux hpfs filesystem routines. +# + +obj-$(CONFIG_HPFS_FS) += hpfs.o + +hpfs-objs := alloc.o anode.o buffer.o dentry.o dir.o dnode.o ea.o file.o \ + inode.o map.o name.o namei.o super.o diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c new file mode 100644 index 00000000..7a5eb2c7 --- /dev/null +++ b/fs/hpfs/alloc.c @@ -0,0 +1,424 @@ +/* + * linux/fs/hpfs/alloc.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * HPFS bitmap operations + */ + +#include "hpfs_fn.h" + +/* + * Check if a sector is allocated in bitmap + * This is really slow. Turned on only if chk==2 + */ + +static int chk_if_allocated(struct super_block *s, secno sec, char *msg) +{ + struct quad_buffer_head qbh; + u32 *bmp; + if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail; + if ((cpu_to_le32(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) { + hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec); + goto fail1; + } + hpfs_brelse4(&qbh); + if (sec >= hpfs_sb(s)->sb_dirband_start && sec < hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { + unsigned ssec = (sec - hpfs_sb(s)->sb_dirband_start) / 4; + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto fail; + if ((le32_to_cpu(bmp[ssec >> 5]) >> (ssec & 0x1f)) & 1) { + hpfs_error(s, "sector '%s' - %08x not allocated in directory bitmap", msg, sec); + goto fail1; + } + hpfs_brelse4(&qbh); + } + return 0; + fail1: + hpfs_brelse4(&qbh); + fail: + return 1; +} + +/* + * Check if sector(s) have proper number and additionally check if they're + * allocated in bitmap. + */ + +int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg) +{ + if (start + len < start || start < 0x12 || + start + len > hpfs_sb(s)->sb_fs_size) { + hpfs_error(s, "sector(s) '%s' badly placed at %08x", msg, start); + return 1; + } + if (hpfs_sb(s)->sb_chk>=2) { + int i; + for (i = 0; i < len; i++) + if (chk_if_allocated(s, start + i, msg)) return 1; + } + return 0; +} + +static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward) +{ + struct quad_buffer_head qbh; + unsigned *bmp; + unsigned bs = near & ~0x3fff; + unsigned nr = (near & 0x3fff) & ~(n - 1); + /*unsigned mnr;*/ + unsigned i, q; + int a, b; + secno ret = 0; + if (n != 1 && n != 4) { + hpfs_error(s, "Bad allocation size: %d", n); + return 0; + } + if (bs != ~0x3fff) { + if (!(bmp = hpfs_map_bitmap(s, near >> 14, &qbh, "aib"))) goto uls; + } else { + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto uls; + } + if (!tstbits(bmp, nr, n + forward)) { + ret = bs + nr; + goto rt; + } + q = nr + n; b = 0; + while ((a = tstbits(bmp, q, n + forward)) != 0) { + q += a; + if (n != 1) q = ((q-1)&~(n-1))+n; + if (!b) { + if (q>>5 != nr>>5) { + b = 1; + q = nr & 0x1f; + } + } else if (q > nr) break; + } + if (!a) { + ret = bs + q; + goto rt; + } + nr >>= 5; + /*for (i = nr + 1; i != nr; i++, i &= 0x1ff) */ + i = nr; + do { + if (!le32_to_cpu(bmp[i])) goto cont; + if (n + forward >= 0x3f && le32_to_cpu(bmp[i]) != 0xffffffff) goto cont; + q = i<<5; + if (i > 0) { + unsigned k = le32_to_cpu(bmp[i-1]); + while (k & 0x80000000) { + q--; k <<= 1; + } + } + if (n != 1) q = ((q-1)&~(n-1))+n; + while ((a = tstbits(bmp, q, n + forward)) != 0) { + q += a; + if (n != 1) q = ((q-1)&~(n-1))+n; + if (q>>5 > i) break; + } + if (!a) { + ret = bs + q; + goto rt; + } + cont: + i++, i &= 0x1ff; + } 
while (i != nr); + rt: + if (ret) { + if (hpfs_sb(s)->sb_chk && ((ret >> 14) != (bs >> 14) || (le32_to_cpu(bmp[(ret & 0x3fff) >> 5]) | ~(((1 << n) - 1) << (ret & 0x1f))) != 0xffffffff)) { + hpfs_error(s, "Allocation doesn't work! Wanted %d, allocated at %08x", n, ret); + ret = 0; + goto b; + } + bmp[(ret & 0x3fff) >> 5] &= cpu_to_le32(~(((1 << n) - 1) << (ret & 0x1f))); + hpfs_mark_4buffers_dirty(&qbh); + } + b: + hpfs_brelse4(&qbh); + uls: + return ret; +} + +/* + * Allocation strategy: 1) search place near the sector specified + * 2) search bitmap where free sectors last found + * 3) search all bitmaps + * 4) search all bitmaps ignoring number of pre-allocated + * sectors + */ + +secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forward) +{ + secno sec; + int i; + unsigned n_bmps; + struct hpfs_sb_info *sbi = hpfs_sb(s); + int f_p = 0; + int near_bmp; + if (forward < 0) { + forward = -forward; + f_p = 1; + } + n_bmps = (sbi->sb_fs_size + 0x4000 - 1) >> 14; + if (near && near < sbi->sb_fs_size) { + if ((sec = alloc_in_bmp(s, near, n, f_p ? forward : forward/4))) goto ret; + near_bmp = near >> 14; + } else near_bmp = n_bmps / 2; + /* + if (b != -1) { + if ((sec = alloc_in_bmp(s, b<<14, n, f_p ? forward : forward/2))) { + b &= 0x0fffffff; + goto ret; + } + if (b > 0x10000000) if ((sec = alloc_in_bmp(s, (b&0xfffffff)<<14, n, f_p ? forward : 0))) goto ret; + */ + if (!f_p) if (forward > sbi->sb_max_fwd_alloc) forward = sbi->sb_max_fwd_alloc; + less_fwd: + for (i = 0; i < n_bmps; i++) { + if (near_bmp+i < n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i) << 14, n, forward)))) { + sbi->sb_c_bitmap = near_bmp+i; + goto ret; + } + if (!forward) { + if (near_bmp-i-1 >= 0 && ((sec = alloc_in_bmp(s, (near_bmp-i-1) << 14, n, forward)))) { + sbi->sb_c_bitmap = near_bmp-i-1; + goto ret; + } + } else { + if (near_bmp+i >= n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i-n_bmps) << 14, n, forward)))) { + sbi->sb_c_bitmap = near_bmp+i-n_bmps; + goto ret; + } + } + if (i == 1 && sbi->sb_c_bitmap != -1 && ((sec = alloc_in_bmp(s, (sbi->sb_c_bitmap) << 14, n, forward)))) { + goto ret; + } + } + if (!f_p) { + if (forward) { + sbi->sb_max_fwd_alloc = forward * 3 / 4; + forward /= 2; + goto less_fwd; + } + } + sec = 0; + ret: + if (sec && f_p) { + for (i = 0; i < forward; i++) { + if (!hpfs_alloc_if_possible(s, sec + i + 1)) { + hpfs_error(s, "Prealloc doesn't work! 
Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); + sec = 0; + break; + } + } + } + return sec; +} + +static secno alloc_in_dirband(struct super_block *s, secno near) +{ + unsigned nr = near; + secno sec; + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (nr < sbi->sb_dirband_start) + nr = sbi->sb_dirband_start; + if (nr >= sbi->sb_dirband_start + sbi->sb_dirband_size) + nr = sbi->sb_dirband_start + sbi->sb_dirband_size - 4; + nr -= sbi->sb_dirband_start; + nr >>= 2; + sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); + if (!sec) return 0; + return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; +} + +/* Alloc sector if it's free */ + +int hpfs_alloc_if_possible(struct super_block *s, secno sec) +{ + struct quad_buffer_head qbh; + u32 *bmp; + if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end; + if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) { + bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + return 1; + } + hpfs_brelse4(&qbh); + end: + return 0; +} + +/* Free sectors in bitmaps */ + +void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) +{ + struct quad_buffer_head qbh; + u32 *bmp; + struct hpfs_sb_info *sbi = hpfs_sb(s); + /*printk("2 - ");*/ + if (!n) return; + if (sec < 0x12) { + hpfs_error(s, "Trying to free reserved sector %08x", sec); + return; + } + sbi->sb_max_fwd_alloc += n > 0xffff ? 0xffff : n; + if (sbi->sb_max_fwd_alloc > 0xffffff) sbi->sb_max_fwd_alloc = 0xffffff; + new_map: + if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "free"))) { + return; + } + new_tst: + if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f) & 1)) { + hpfs_error(s, "sector %08x not allocated", sec); + hpfs_brelse4(&qbh); + return; + } + bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); + if (!--n) { + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + return; + } + if (!(++sec & 0x3fff)) { + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + goto new_map; + } + goto new_tst; +} + +/* + * Check if there are at least n free dnodes on the filesystem. + * Called before adding to dnode. If we run out of space while + * splitting dnodes, it would corrupt dnode tree. 
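+ * (The scan below first counts free dnodes in the directory-band bitmap and
+ * then falls back to the ordinary sector bitmaps, where only an aligned run
+ * of four free sectors can hold a dnode.)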
+ */ + +int hpfs_check_free_dnodes(struct super_block *s, int n) +{ + int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14; + int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff; + int i, j; + u32 *bmp; + struct quad_buffer_head qbh; + if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + for (j = 0; j < 512; j++) { + unsigned k; + if (!le32_to_cpu(bmp[j])) continue; + for (k = le32_to_cpu(bmp[j]); k; k >>= 1) if (k & 1) if (!--n) { + hpfs_brelse4(&qbh); + return 0; + } + } + } + hpfs_brelse4(&qbh); + i = 0; + if (hpfs_sb(s)->sb_c_bitmap != -1) { + bmp = hpfs_map_bitmap(s, b, &qbh, "chkdn1"); + goto chk_bmp; + } + chk_next: + if (i == b) i++; + if (i >= n_bmps) return 1; + bmp = hpfs_map_bitmap(s, i, &qbh, "chkdn2"); + chk_bmp: + if (bmp) { + for (j = 0; j < 512; j++) { + u32 k; + if (!le32_to_cpu(bmp[j])) continue; + for (k = 0xf; k; k <<= 4) + if ((le32_to_cpu(bmp[j]) & k) == k) { + if (!--n) { + hpfs_brelse4(&qbh); + return 0; + } + } + } + hpfs_brelse4(&qbh); + } + i++; + goto chk_next; +} + +void hpfs_free_dnode(struct super_block *s, dnode_secno dno) +{ + if (hpfs_sb(s)->sb_chk) if (dno & 3) { + hpfs_error(s, "hpfs_free_dnode: dnode %08x not aligned", dno); + return; + } + if (dno < hpfs_sb(s)->sb_dirband_start || + dno >= hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { + hpfs_free_sectors(s, dno, 4); + } else { + struct quad_buffer_head qbh; + u32 *bmp; + unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4; + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + return; + } + bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + } +} + +struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, + dnode_secno *dno, struct quad_buffer_head *qbh) +{ + struct dnode *d; + if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) { + if (!(*dno = alloc_in_dirband(s, near))) + if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; + } else { + if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) + if (!(*dno = alloc_in_dirband(s, near))) return NULL; + } + if (!(d = hpfs_get_4sectors(s, *dno, qbh))) { + hpfs_free_dnode(s, *dno); + return NULL; + } + memset(d, 0, 2048); + d->magic = cpu_to_le32(DNODE_MAGIC); + d->first_free = cpu_to_le32(52); + d->dirent[0] = 32; + d->dirent[2] = 8; + d->dirent[30] = 1; + d->dirent[31] = 255; + d->self = cpu_to_le32(*dno); + return d; +} + +struct fnode *hpfs_alloc_fnode(struct super_block *s, secno near, fnode_secno *fno, + struct buffer_head **bh) +{ + struct fnode *f; + if (!(*fno = hpfs_alloc_sector(s, near, 1, FNODE_ALLOC_FWD))) return NULL; + if (!(f = hpfs_get_sector(s, *fno, bh))) { + hpfs_free_sectors(s, *fno, 1); + return NULL; + } + memset(f, 0, 512); + f->magic = cpu_to_le32(FNODE_MAGIC); + f->ea_offs = cpu_to_le16(0xc4); + f->btree.n_free_nodes = 8; + f->btree.first_free = cpu_to_le16(8); + return f; +} + +struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *ano, + struct buffer_head **bh) +{ + struct anode *a; + if (!(*ano = hpfs_alloc_sector(s, near, 1, ANODE_ALLOC_FWD))) return NULL; + if (!(a = hpfs_get_sector(s, *ano, bh))) { + hpfs_free_sectors(s, *ano, 1); + return NULL; + } + memset(a, 0, 512); + a->magic = cpu_to_le32(ANODE_MAGIC); + a->self = cpu_to_le32(*ano); + a->btree.n_free_nodes = 40; + a->btree.n_used_nodes = 0; + a->btree.first_free = cpu_to_le16(8); + return a; +} diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c new file mode 100644 index 00000000..08b503e8 --- /dev/null +++ b/fs/hpfs/anode.c @@ -0,0 +1,491 @@ +/* + * 
linux/fs/hpfs/anode.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * handling HPFS anode tree that contains file allocation info + */ + +#include "hpfs_fn.h" + +/* Find a sector in allocation tree */ + +secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode, + struct bplus_header *btree, unsigned sec, + struct buffer_head *bh) +{ + anode_secno a = -1; + struct anode *anode; + int i; + int c1, c2 = 0; + go_down: + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1; + if (btree->internal) { + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) { + a = le32_to_cpu(btree->u.internal[i].down); + brelse(bh); + if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; + btree = &anode->btree; + goto go_down; + } + hpfs_error(s, "sector %08x not found in internal anode %08x", sec, a); + brelse(bh); + return -1; + } + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.external[i].file_secno) <= sec && + le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > sec) { + a = le32_to_cpu(btree->u.external[i].disk_secno) + sec - le32_to_cpu(btree->u.external[i].file_secno); + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, a, 1, "data")) { + brelse(bh); + return -1; + } + if (inode) { + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + hpfs_inode->i_file_sec = le32_to_cpu(btree->u.external[i].file_secno); + hpfs_inode->i_disk_sec = le32_to_cpu(btree->u.external[i].disk_secno); + hpfs_inode->i_n_secs = le32_to_cpu(btree->u.external[i].length); + } + brelse(bh); + return a; + } + hpfs_error(s, "sector %08x not found in external anode %08x", sec, a); + brelse(bh); + return -1; +} + +/* Add a sector to tree */ + +secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsigned fsecno) +{ + struct bplus_header *btree; + struct anode *anode = NULL, *ranode = NULL; + struct fnode *fnode; + anode_secno a, na = -1, ra, up = -1; + secno se; + struct buffer_head *bh, *bh1, *bh2; + int n; + unsigned fs; + int c1, c2 = 0; + if (fnod) { + if (!(fnode = hpfs_map_fnode(s, node, &bh))) return -1; + btree = &fnode->btree; + } else { + if (!(anode = hpfs_map_anode(s, node, &bh))) return -1; + btree = &anode->btree; + } + a = node; + go_down: + if ((n = btree->n_used_nodes - 1) < -!!fnod) { + hpfs_error(s, "anode %08x has no entries", a); + brelse(bh); + return -1; + } + if (btree->internal) { + a = le32_to_cpu(btree->u.internal[n].down); + btree->u.internal[n].file_secno = cpu_to_le32(-1); + mark_buffer_dirty(bh); + brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_add_sector_to_btree #1")) return -1; + if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; + btree = &anode->btree; + goto go_down; + } + if (n >= 0) { + if (le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length) != fsecno) { + hpfs_error(s, "allocated size %08x, trying to add sector %08x, %cnode %08x", + le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length), fsecno, + fnod?'f':'a', node); + brelse(bh); + return -1; + } + if (hpfs_alloc_if_possible(s, se = le32_to_cpu(btree->u.external[n].disk_secno) + le32_to_cpu(btree->u.external[n].length))) { + btree->u.external[n].length = cpu_to_le32(le32_to_cpu(btree->u.external[n].length) + 1); + mark_buffer_dirty(bh); + brelse(bh); + return se; + } + } else { + if (fsecno) { + hpfs_error(s, "empty file %08x, trying to 
add sector %08x", node, fsecno); + brelse(bh); + return -1; + } + se = !fnod ? node : (node + 16384) & ~16383; + } + if (!(se = hpfs_alloc_sector(s, se, 1, fsecno*ALLOC_M>ALLOC_FWD_MAX ? ALLOC_FWD_MAX : fsecno*ALLOC_Mu.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length); + if (!btree->n_free_nodes) { + up = a != node ? le32_to_cpu(anode->up) : -1; + if (!(anode = hpfs_alloc_anode(s, a, &na, &bh1))) { + brelse(bh); + hpfs_free_sectors(s, se, 1); + return -1; + } + if (a == node && fnod) { + anode->up = cpu_to_le32(node); + anode->btree.fnode_parent = 1; + anode->btree.n_used_nodes = btree->n_used_nodes; + anode->btree.first_free = btree->first_free; + anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes; + memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12); + btree->internal = 1; + btree->n_free_nodes = 11; + btree->n_used_nodes = 1; + btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree); + btree->u.internal[0].file_secno = cpu_to_le32(-1); + btree->u.internal[0].down = cpu_to_le32(na); + mark_buffer_dirty(bh); + } else if (!(ranode = hpfs_alloc_anode(s, /*a*/0, &ra, &bh2))) { + brelse(bh); + brelse(bh1); + hpfs_free_sectors(s, se, 1); + hpfs_free_sectors(s, na, 1); + return -1; + } + brelse(bh); + bh = bh1; + btree = &anode->btree; + } + btree->n_free_nodes--; n = btree->n_used_nodes++; + btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 12); + btree->u.external[n].disk_secno = cpu_to_le32(se); + btree->u.external[n].file_secno = cpu_to_le32(fs); + btree->u.external[n].length = cpu_to_le32(1); + mark_buffer_dirty(bh); + brelse(bh); + if ((a == node && fnod) || na == -1) return se; + c2 = 0; + while (up != (anode_secno)-1) { + struct anode *new_anode; + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, up, &c1, &c2, "hpfs_add_sector_to_btree #2")) return -1; + if (up != node || !fnod) { + if (!(anode = hpfs_map_anode(s, up, &bh))) return -1; + btree = &anode->btree; + } else { + if (!(fnode = hpfs_map_fnode(s, up, &bh))) return -1; + btree = &fnode->btree; + } + if (btree->n_free_nodes) { + btree->n_free_nodes--; n = btree->n_used_nodes++; + btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 8); + btree->u.internal[n].file_secno = cpu_to_le32(-1); + btree->u.internal[n].down = cpu_to_le32(na); + btree->u.internal[n-1].file_secno = cpu_to_le32(fs); + mark_buffer_dirty(bh); + brelse(bh); + brelse(bh2); + hpfs_free_sectors(s, ra, 1); + if ((anode = hpfs_map_anode(s, na, &bh))) { + anode->up = cpu_to_le32(up); + anode->btree.fnode_parent = up == node && fnod; + mark_buffer_dirty(bh); + brelse(bh); + } + return se; + } + up = up != node ? le32_to_cpu(anode->up) : -1; + btree->u.internal[btree->n_used_nodes - 1].file_secno = cpu_to_le32(/*fs*/-1); + mark_buffer_dirty(bh); + brelse(bh); + a = na; + if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) { + anode = new_anode; + /*anode->up = cpu_to_le32(up != -1 ? 
up : ra);*/ + anode->btree.internal = 1; + anode->btree.n_used_nodes = 1; + anode->btree.n_free_nodes = 59; + anode->btree.first_free = cpu_to_le16(16); + anode->btree.u.internal[0].down = cpu_to_le32(a); + anode->btree.u.internal[0].file_secno = cpu_to_le32(-1); + mark_buffer_dirty(bh); + brelse(bh); + if ((anode = hpfs_map_anode(s, a, &bh))) { + anode->up = cpu_to_le32(na); + mark_buffer_dirty(bh); + brelse(bh); + } + } else na = a; + } + if ((anode = hpfs_map_anode(s, na, &bh))) { + anode->up = cpu_to_le32(node); + if (fnod) anode->btree.fnode_parent = 1; + mark_buffer_dirty(bh); + brelse(bh); + } + if (!fnod) { + if (!(anode = hpfs_map_anode(s, node, &bh))) { + brelse(bh2); + return -1; + } + btree = &anode->btree; + } else { + if (!(fnode = hpfs_map_fnode(s, node, &bh))) { + brelse(bh2); + return -1; + } + btree = &fnode->btree; + } + ranode->up = cpu_to_le32(node); + memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free)); + if (fnod) ranode->btree.fnode_parent = 1; + ranode->btree.n_free_nodes = (ranode->btree.internal ? 60 : 40) - ranode->btree.n_used_nodes; + if (ranode->btree.internal) for (n = 0; n < ranode->btree.n_used_nodes; n++) { + struct anode *unode; + if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) { + unode->up = cpu_to_le32(ra); + unode->btree.fnode_parent = 0; + mark_buffer_dirty(bh1); + brelse(bh1); + } + } + btree->internal = 1; + btree->n_free_nodes = fnod ? 10 : 58; + btree->n_used_nodes = 2; + btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree); + btree->u.internal[0].file_secno = cpu_to_le32(fs); + btree->u.internal[0].down = cpu_to_le32(ra); + btree->u.internal[1].file_secno = cpu_to_le32(-1); + btree->u.internal[1].down = cpu_to_le32(na); + mark_buffer_dirty(bh); + brelse(bh); + mark_buffer_dirty(bh2); + brelse(bh2); + return se; +} + +/* + * Remove allocation tree. Recursion would look much nicer but + * I want to avoid it because it can cause stack overflow. + */ + +void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) +{ + struct bplus_header *btree1 = btree; + struct anode *anode = NULL; + anode_secno ano = 0, oano; + struct buffer_head *bh; + int level = 0; + int pos = 0; + int i; + int c1, c2 = 0; + int d1, d2; + go_down: + d2 = 0; + while (btree1->internal) { + ano = le32_to_cpu(btree1->u.internal[pos].down); + if (level) brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, ano, &d1, &d2, "hpfs_remove_btree #1")) + return; + if (!(anode = hpfs_map_anode(s, ano, &bh))) return; + btree1 = &anode->btree; + level++; + pos = 0; + } + for (i = 0; i < btree1->n_used_nodes; i++) + hpfs_free_sectors(s, le32_to_cpu(btree1->u.external[i].disk_secno), le32_to_cpu(btree1->u.external[i].length)); + go_up: + if (!level) return; + brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, ano, &c1, &c2, "hpfs_remove_btree #2")) return; + hpfs_free_sectors(s, ano, 1); + oano = ano; + ano = le32_to_cpu(anode->up); + if (--level) { + if (!(anode = hpfs_map_anode(s, ano, &bh))) return; + btree1 = &anode->btree; + } else btree1 = btree; + for (i = 0; i < btree1->n_used_nodes; i++) { + if (le32_to_cpu(btree1->u.internal[i].down) == oano) { + if ((pos = i + 1) < btree1->n_used_nodes) + goto go_down; + else + goto go_up; + } + } + hpfs_error(s, + "reference to anode %08x not found in anode %08x " + "(probably bad up pointer)", + oano, level ? ano : -1); + if (level) + brelse(bh); +} + +/* Just a wrapper around hpfs_bplus_lookup .. 
used for reading eas */ + +static secno anode_lookup(struct super_block *s, anode_secno a, unsigned sec) +{ + struct anode *anode; + struct buffer_head *bh; + if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; + return hpfs_bplus_lookup(s, NULL, &anode->btree, sec, bh); +} + +int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos, + unsigned len, char *buf) +{ + struct buffer_head *bh; + char *data; + secno sec; + unsigned l; + while (len) { + if (ano) { + if ((sec = anode_lookup(s, a, pos >> 9)) == -1) + return -1; + } else sec = a + (pos >> 9); + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #1")) return -1; + if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9))) + return -1; + l = 0x200 - (pos & 0x1ff); if (l > len) l = len; + memcpy(buf, data + (pos & 0x1ff), l); + brelse(bh); + buf += l; pos += l; len -= l; + } + return 0; +} + +int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, + unsigned len, const char *buf) +{ + struct buffer_head *bh; + char *data; + secno sec; + unsigned l; + while (len) { + if (ano) { + if ((sec = anode_lookup(s, a, pos >> 9)) == -1) + return -1; + } else sec = a + (pos >> 9); + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #2")) return -1; + if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9))) + return -1; + l = 0x200 - (pos & 0x1ff); if (l > len) l = len; + memcpy(data + (pos & 0x1ff), buf, l); + mark_buffer_dirty(bh); + brelse(bh); + buf += l; pos += l; len -= l; + } + return 0; +} + +void hpfs_ea_remove(struct super_block *s, secno a, int ano, unsigned len) +{ + struct anode *anode; + struct buffer_head *bh; + if (ano) { + if (!(anode = hpfs_map_anode(s, a, &bh))) return; + hpfs_remove_btree(s, &anode->btree); + brelse(bh); + hpfs_free_sectors(s, a, 1); + } else hpfs_free_sectors(s, a, (len + 511) >> 9); +} + +/* Truncate allocation tree. 
Doesn't join anodes - I hope it doesn't matter */ + +void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) +{ + struct fnode *fnode; + struct anode *anode; + struct buffer_head *bh; + struct bplus_header *btree; + anode_secno node = f; + int i, j, nodes; + int c1, c2 = 0; + if (fno) { + if (!(fnode = hpfs_map_fnode(s, f, &bh))) return; + btree = &fnode->btree; + } else { + if (!(anode = hpfs_map_anode(s, f, &bh))) return; + btree = &anode->btree; + } + if (!secs) { + hpfs_remove_btree(s, btree); + if (fno) { + btree->n_free_nodes = 8; + btree->n_used_nodes = 0; + btree->first_free = cpu_to_le16(8); + btree->internal = 0; + mark_buffer_dirty(bh); + } else hpfs_free_sectors(s, f, 1); + brelse(bh); + return; + } + while (btree->internal) { + nodes = btree->n_used_nodes + btree->n_free_nodes; + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f; + brelse(bh); + hpfs_error(s, "internal btree %08x doesn't end with -1", node); + return; + f: + for (j = i + 1; j < btree->n_used_nodes; j++) + hpfs_ea_remove(s, le32_to_cpu(btree->u.internal[j].down), 1, 0); + btree->n_used_nodes = i + 1; + btree->n_free_nodes = nodes - btree->n_used_nodes; + btree->first_free = cpu_to_le16(8 + 8 * btree->n_used_nodes); + mark_buffer_dirty(bh); + if (btree->u.internal[i].file_secno == cpu_to_le32(secs)) { + brelse(bh); + return; + } + node = le32_to_cpu(btree->u.internal[i].down); + brelse(bh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, node, &c1, &c2, "hpfs_truncate_btree")) + return; + if (!(anode = hpfs_map_anode(s, node, &bh))) return; + btree = &anode->btree; + } + nodes = btree->n_used_nodes + btree->n_free_nodes; + for (i = 0; i < btree->n_used_nodes; i++) + if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) >= secs) goto ff; + brelse(bh); + return; + ff: + if (secs <= le32_to_cpu(btree->u.external[i].file_secno)) { + hpfs_error(s, "there is an allocation error in file %08x, sector %08x", f, secs); + if (i) i--; + } + else if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > secs) { + hpfs_free_sectors(s, le32_to_cpu(btree->u.external[i].disk_secno) + secs - + le32_to_cpu(btree->u.external[i].file_secno), le32_to_cpu(btree->u.external[i].length) + - secs + le32_to_cpu(btree->u.external[i].file_secno)); /* I hope gcc optimizes this :-) */ + btree->u.external[i].length = cpu_to_le32(secs - le32_to_cpu(btree->u.external[i].file_secno)); + } + for (j = i + 1; j < btree->n_used_nodes; j++) + hpfs_free_sectors(s, le32_to_cpu(btree->u.external[j].disk_secno), le32_to_cpu(btree->u.external[j].length)); + btree->n_used_nodes = i + 1; + btree->n_free_nodes = nodes - btree->n_used_nodes; + btree->first_free = cpu_to_le16(8 + 12 * btree->n_used_nodes); + mark_buffer_dirty(bh); + brelse(bh); +} + +/* Remove file or directory and it's eas - note that directory must + be empty when this is called. 
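+ * (The fnode's allocation btree - or, for a directory, its dnode tree - is
+ * torn down first, then the indirect and external EAs are freed, and finally
+ * the fnode sector itself.)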
*/ + +void hpfs_remove_fnode(struct super_block *s, fnode_secno fno) +{ + struct buffer_head *bh; + struct fnode *fnode; + struct extended_attribute *ea; + struct extended_attribute *ea_end; + if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return; + if (!fnode->dirflag) hpfs_remove_btree(s, &fnode->btree); + else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno)); + ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (ea->indirect) + hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); + hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l)); + brelse(bh); + hpfs_free_sectors(s, fno, 1); +} diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c new file mode 100644 index 00000000..9ecde27d --- /dev/null +++ b/fs/hpfs/buffer.c @@ -0,0 +1,168 @@ +/* + * linux/fs/hpfs/buffer.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * general buffer i/o + */ +#include +#include +#include "hpfs_fn.h" + +/* Map a sector into a buffer and return pointers to it and to the buffer. */ + +void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, + int ahead) +{ + struct buffer_head *bh; + + hpfs_lock_assert(s); + + cond_resched(); + + *bhp = bh = sb_bread(s, secno); + if (bh != NULL) + return bh->b_data; + else { + printk("HPFS: hpfs_map_sector: read error\n"); + return NULL; + } +} + +/* Like hpfs_map_sector but don't read anything */ + +void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp) +{ + struct buffer_head *bh; + /*return hpfs_map_sector(s, secno, bhp, 0);*/ + + hpfs_lock_assert(s); + + cond_resched(); + + if ((*bhp = bh = sb_getblk(s, secno)) != NULL) { + if (!buffer_uptodate(bh)) wait_on_buffer(bh); + set_buffer_uptodate(bh); + return bh->b_data; + } else { + printk("HPFS: hpfs_get_sector: getblk failed\n"); + return NULL; + } +} + +/* Map 4 sectors into a 4buffer and return pointers to it and to the buffer. 
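+ * The four 512-byte sectors are copied into a single 2048-byte buffer so
+ * callers can treat a dnode as one contiguous structure;
+ * hpfs_mark_4buffers_dirty() copies the data back before marking the
+ * underlying buffer heads dirty.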
*/ + +void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh, + int ahead) +{ + struct buffer_head *bh; + char *data; + + hpfs_lock_assert(s); + + cond_resched(); + + if (secno & 3) { + printk("HPFS: hpfs_map_4sectors: unaligned read\n"); + return NULL; + } + + qbh->data = data = kmalloc(2048, GFP_NOFS); + if (!data) { + printk("HPFS: hpfs_map_4sectors: out of memory\n"); + goto bail; + } + + qbh->bh[0] = bh = sb_bread(s, secno); + if (!bh) + goto bail0; + memcpy(data, bh->b_data, 512); + + qbh->bh[1] = bh = sb_bread(s, secno + 1); + if (!bh) + goto bail1; + memcpy(data + 512, bh->b_data, 512); + + qbh->bh[2] = bh = sb_bread(s, secno + 2); + if (!bh) + goto bail2; + memcpy(data + 2 * 512, bh->b_data, 512); + + qbh->bh[3] = bh = sb_bread(s, secno + 3); + if (!bh) + goto bail3; + memcpy(data + 3 * 512, bh->b_data, 512); + + return data; + + bail3: + brelse(qbh->bh[2]); + bail2: + brelse(qbh->bh[1]); + bail1: + brelse(qbh->bh[0]); + bail0: + kfree(data); + printk("HPFS: hpfs_map_4sectors: read error\n"); + bail: + return NULL; +} + +/* Don't read sectors */ + +void *hpfs_get_4sectors(struct super_block *s, unsigned secno, + struct quad_buffer_head *qbh) +{ + cond_resched(); + + hpfs_lock_assert(s); + + if (secno & 3) { + printk("HPFS: hpfs_get_4sectors: unaligned read\n"); + return NULL; + } + + /*return hpfs_map_4sectors(s, secno, qbh, 0);*/ + if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { + printk("HPFS: hpfs_get_4sectors: out of memory\n"); + return NULL; + } + if (!(hpfs_get_sector(s, secno, &qbh->bh[0]))) goto bail0; + if (!(hpfs_get_sector(s, secno + 1, &qbh->bh[1]))) goto bail1; + if (!(hpfs_get_sector(s, secno + 2, &qbh->bh[2]))) goto bail2; + if (!(hpfs_get_sector(s, secno + 3, &qbh->bh[3]))) goto bail3; + memcpy(qbh->data, qbh->bh[0]->b_data, 512); + memcpy(qbh->data + 512, qbh->bh[1]->b_data, 512); + memcpy(qbh->data + 2*512, qbh->bh[2]->b_data, 512); + memcpy(qbh->data + 3*512, qbh->bh[3]->b_data, 512); + return qbh->data; + + bail3: brelse(qbh->bh[2]); + bail2: brelse(qbh->bh[1]); + bail1: brelse(qbh->bh[0]); + bail0: + return NULL; +} + + +void hpfs_brelse4(struct quad_buffer_head *qbh) +{ + brelse(qbh->bh[3]); + brelse(qbh->bh[2]); + brelse(qbh->bh[1]); + brelse(qbh->bh[0]); + kfree(qbh->data); +} + +void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh) +{ + PRINTK(("hpfs_mark_4buffers_dirty\n")); + memcpy(qbh->bh[0]->b_data, qbh->data, 512); + memcpy(qbh->bh[1]->b_data, qbh->data + 512, 512); + memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); + memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); + mark_buffer_dirty(qbh->bh[0]); + mark_buffer_dirty(qbh->bh[1]); + mark_buffer_dirty(qbh->bh[2]); + mark_buffer_dirty(qbh->bh[3]); +} diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c new file mode 100644 index 00000000..05d4816e --- /dev/null +++ b/fs/hpfs/dentry.c @@ -0,0 +1,64 @@ +/* + * linux/fs/hpfs/dentry.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * dcache operations + */ + +#include "hpfs_fn.h" + +/* + * Note: the dentry argument is the parent dentry. + */ + +static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + unsigned long hash; + int i; + unsigned l = qstr->len; + + if (l == 1) if (qstr->name[0]=='.') goto x; + if (l == 2) if (qstr->name[0]=='.' 
|| qstr->name[1]=='.') goto x; + hpfs_adjust_length(qstr->name, &l); + /*if (hpfs_chk_name(qstr->name,&l))*/ + /*return -ENAMETOOLONG;*/ + /*return -ENOENT;*/ + x: + + hash = init_name_hash(); + for (i = 0; i < l; i++) + hash = partial_name_hash(hpfs_upcase(hpfs_sb(dentry->d_sb)->sb_cp_table,qstr->name[i]), hash); + qstr->hash = end_name_hash(hash); + + return 0; +} + +static int hpfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + unsigned al = len; + unsigned bl = name->len; + + hpfs_adjust_length(str, &al); + /*hpfs_adjust_length(b->name, &bl);*/ + + /* + * 'str' is the nane of an already existing dentry, so the name + * must be valid. 'name' must be validated first. + */ + + if (hpfs_chk_name(name->name, &bl)) + return 1; + if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0)) + return 1; + return 0; +} + +const struct dentry_operations hpfs_dentry_operations = { + .d_hash = hpfs_hash_dentry, + .d_compare = hpfs_compare_dentry, +}; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c new file mode 100644 index 00000000..f46ae025 --- /dev/null +++ b/fs/hpfs/dir.c @@ -0,0 +1,322 @@ +/* + * linux/fs/hpfs/dir.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * directory VFS functions + */ + +#include +#include "hpfs_fn.h" + +static int hpfs_dir_release(struct inode *inode, struct file *filp) +{ + hpfs_lock(inode->i_sb); + hpfs_del_pos(inode, &filp->f_pos); + /*hpfs_write_if_changed(inode);*/ + hpfs_unlock(inode->i_sb); + return 0; +} + +/* This is slow, but it's not used often */ + +static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) +{ + loff_t new_off = off + (whence == 1 ? 
filp->f_pos : 0); + loff_t pos; + struct quad_buffer_head qbh; + struct inode *i = filp->f_path.dentry->d_inode; + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct super_block *s = i->i_sb; + + hpfs_lock(s); + + /*printk("dir lseek\n");*/ + if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; + mutex_lock(&i->i_mutex); + pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; + while (pos != new_off) { + if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); + else goto fail; + if (pos == 12) goto fail; + } + mutex_unlock(&i->i_mutex); +ok: + hpfs_unlock(s); + return filp->f_pos = new_off; +fail: + mutex_unlock(&i->i_mutex); + /*printk("illegal lseek: %016llx\n", new_off);*/ + hpfs_unlock(s); + return -ESPIPE; +} + +static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + int lc; + long old_pos; + unsigned char *tempname; + int c1, c2 = 0; + int ret = 0; + + hpfs_lock(inode->i_sb); + + if (hpfs_sb(inode->i_sb)->sb_chk) { + if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { + ret = -EFSERROR; + goto out; + } + if (hpfs_chk_sectors(inode->i_sb, hpfs_inode->i_dno, 4, "dir_dnode")) { + ret = -EFSERROR; + goto out; + } + } + if (hpfs_sb(inode->i_sb)->sb_chk >= 2) { + struct buffer_head *bh; + struct fnode *fno; + int e = 0; + if (!(fno = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) { + ret = -EIOERROR; + goto out; + } + if (!fno->dirflag) { + e = 1; + hpfs_error(inode->i_sb, "not a directory, fnode %08lx", + (unsigned long)inode->i_ino); + } + if (hpfs_inode->i_dno != le32_to_cpu(fno->u.external[0].disk_secno)) { + e = 1; + hpfs_error(inode->i_sb, "corrupted inode: i_dno == %08x, fnode -> dnode == %08x", hpfs_inode->i_dno, le32_to_cpu(fno->u.external[0].disk_secno)); + } + brelse(bh); + if (e) { + ret = -EFSERROR; + goto out; + } + } + lc = hpfs_sb(inode->i_sb)->sb_lowercase; + if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */ + filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */ + goto out; + } + if (filp->f_pos == 13) { + ret = -ENOENT; + goto out; + } + + while (1) { + again: + /* This won't work when cycle is longer than number of dirents + accepted by filldir, but what can I do? 
+ maybe killall -9 ls helps */ + if (hpfs_sb(inode->i_sb)->sb_chk) + if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) { + ret = -EFSERROR; + goto out; + } + if (filp->f_pos == 12) + goto out; + if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) { + printk("HPFS: warning: pos==%d\n",(int)filp->f_pos); + goto out; + } + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos = 11; + } + if (filp->f_pos == 11) { + if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0) + goto out; + filp->f_pos = 1; + } + if (filp->f_pos == 1) { + filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; + hpfs_add_pos(inode, &filp->f_pos); + filp->f_version = inode->i_version; + } + old_pos = filp->f_pos; + if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) { + ret = -EIOERROR; + goto out; + } + if (de->first || de->last) { + if (hpfs_sb(inode->i_sb)->sb_chk) { + if (de->first && !de->last && (de->namelen != 2 + || de ->name[0] != 1 || de->name[1] != 1)) + hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos); + if (de->last && (de->namelen != 1 || de ->name[0] != 255)) + hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos); + } + hpfs_brelse4(&qbh); + goto again; + } + tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); + if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) { + filp->f_pos = old_pos; + if (tempname != de->name) kfree(tempname); + hpfs_brelse4(&qbh); + goto out; + } + if (tempname != de->name) kfree(tempname); + hpfs_brelse4(&qbh); + } +out: + hpfs_unlock(inode->i_sb); + return ret; +} + +/* + * lookup. Search the specified directory for the specified name, set + * *result to the corresponding inode. + * + * lookup uses the inode number to tell read_inode whether it is reading + * the inode of a directory or a file -- file ino's are odd, directory + * ino's are even. read_inode avoids i/o for file inodes; everything + * needed is up here in the directory. (And file fnodes are out in + * the boondocks.) + * + * - M.P.: this is over, sometimes we've got to read file's fnode for eas + * inode numbers are just fnode sector numbers; iget lock is used + * to tell read_inode to read fnode or not. + */ + +struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + ino_t ino; + int err; + struct inode *result = NULL; + struct hpfs_inode_info *hpfs_result; + + hpfs_lock(dir->i_sb); + if ((err = hpfs_chk_name(name, &len))) { + if (err == -ENAMETOOLONG) { + hpfs_unlock(dir->i_sb); + return ERR_PTR(-ENAMETOOLONG); + } + goto end_add; + } + + /* + * '.' and '..' will never be passed here. + */ + + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh); + + /* + * This is not really a bailout, just means file not found. + */ + + if (!de) goto end; + + /* + * Get inode number, what we're after. + */ + + ino = le32_to_cpu(de->fnode); + + /* + * Go find or make an inode. 
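+ * (iget_locked() returns the cached inode or a new locked one; a new
+ * inode (I_NEW) is read from the fnode for directories and for files
+ * with EAs, otherwise it is filled in straight from the dirent.)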
+ */ + + result = iget_locked(dir->i_sb, ino); + if (!result) { + hpfs_error(dir->i_sb, "hpfs_lookup: can't get inode"); + goto bail1; + } + if (result->i_state & I_NEW) { + hpfs_init_inode(result); + if (de->directory) + hpfs_read_inode(result); + else if (le32_to_cpu(de->ea_size) && hpfs_sb(dir->i_sb)->sb_eas) + hpfs_read_inode(result); + else { + result->i_mode |= S_IFREG; + result->i_mode &= ~0111; + result->i_op = &hpfs_file_iops; + result->i_fop = &hpfs_file_ops; + result->i_nlink = 1; + } + unlock_new_inode(result); + } + hpfs_result = hpfs_i(result); + if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; + + if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { + hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); + goto bail1; + } + + /* + * Fill in the info from the directory if this is a newly created + * inode. + */ + + if (!result->i_ctime.tv_sec) { + if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)))) + result->i_ctime.tv_sec = 1; + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)); + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)); + result->i_atime.tv_nsec = 0; + hpfs_result->i_ea_size = le32_to_cpu(de->ea_size); + if (!hpfs_result->i_ea_mode && de->read_only) + result->i_mode &= ~0222; + if (!de->directory) { + if (result->i_size == -1) { + result->i_size = le32_to_cpu(de->file_size); + result->i_data.a_ops = &hpfs_aops; + hpfs_i(result)->mmu_private = result->i_size; + /* + * i_blocks should count the fnode and any anodes. + * We count 1 for the fnode and don't bother about + * anodes -- the disk heads are on the directory band + * and we want them to stay there. + */ + result->i_blocks = 1 + ((result->i_size + 511) >> 9); + } + } + } + + hpfs_brelse4(&qbh); + + /* + * Made it. + */ + + end: + end_add: + hpfs_unlock(dir->i_sb); + d_add(dentry, result); + return NULL; + + /* + * Didn't. 
+ */ + bail1: + + hpfs_brelse4(&qbh); + + /*bail:*/ + + hpfs_unlock(dir->i_sb); + return ERR_PTR(-ENOENT); +} + +const struct file_operations hpfs_dir_ops = +{ + .llseek = hpfs_dir_lseek, + .read = generic_read_dir, + .readdir = hpfs_readdir, + .release = hpfs_dir_release, + .fsync = hpfs_file_fsync, +}; diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c new file mode 100644 index 00000000..1e0e2ac3 --- /dev/null +++ b/fs/hpfs/dnode.c @@ -0,0 +1,1084 @@ +/* + * linux/fs/hpfs/dnode.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * handling directory dnode tree - adding, deleteing & searching for dirents + */ + +#include "hpfs_fn.h" + +static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde) +{ + struct hpfs_dirent *de; + struct hpfs_dirent *de_end = dnode_end_de(d); + int i = 1; + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + if (de == fde) return ((loff_t) le32_to_cpu(d->self) << 4) | (loff_t)i; + i++; + } + printk("HPFS: get_pos: not_found\n"); + return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1; +} + +void hpfs_add_pos(struct inode *inode, loff_t *pos) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + int i = 0; + loff_t **ppos; + + if (hpfs_inode->i_rddir_off) + for (; hpfs_inode->i_rddir_off[i]; i++) + if (hpfs_inode->i_rddir_off[i] == pos) return; + if (!(i&0x0f)) { + if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) { + printk("HPFS: out of memory for position list\n"); + return; + } + if (hpfs_inode->i_rddir_off) { + memcpy(ppos, hpfs_inode->i_rddir_off, i * sizeof(loff_t)); + kfree(hpfs_inode->i_rddir_off); + } + hpfs_inode->i_rddir_off = ppos; + } + hpfs_inode->i_rddir_off[i] = pos; + hpfs_inode->i_rddir_off[i + 1] = NULL; +} + +void hpfs_del_pos(struct inode *inode, loff_t *pos) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + loff_t **i, **j; + + if (!hpfs_inode->i_rddir_off) goto not_f; + for (i = hpfs_inode->i_rddir_off; *i; i++) if (*i == pos) goto fnd; + goto not_f; + fnd: + for (j = i + 1; *j; j++) ; + *i = *(j - 1); + *(j - 1) = NULL; + if (j - 1 == hpfs_inode->i_rddir_off) { + kfree(hpfs_inode->i_rddir_off); + hpfs_inode->i_rddir_off = NULL; + } + return; + not_f: + /*printk("HPFS: warning: position pointer %p->%08x not found\n", pos, (int)*pos);*/ + return; +} + +static void for_all_poss(struct inode *inode, void (*f)(loff_t *, loff_t, loff_t), + loff_t p1, loff_t p2) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + loff_t **i; + + if (!hpfs_inode->i_rddir_off) return; + for (i = hpfs_inode->i_rddir_off; *i; i++) (*f)(*i, p1, p2); + return; +} + +static void hpfs_pos_subst(loff_t *p, loff_t f, loff_t t) +{ + if (*p == f) *p = t; +} + +/*void hpfs_hpfs_pos_substd(loff_t *p, loff_t f, loff_t t) +{ + if ((*p & ~0x3f) == (f & ~0x3f)) *p = (t & ~0x3f) | (*p & 0x3f); +}*/ + +static void hpfs_pos_ins(loff_t *p, loff_t d, loff_t c) +{ + if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { + int n = (*p & 0x3f) + c; + if (n > 0x3f) printk("HPFS: hpfs_pos_ins: %08x + %d\n", (int)*p, (int)c >> 8); + else *p = (*p & ~0x3f) | n; + } +} + +static void hpfs_pos_del(loff_t *p, loff_t d, loff_t c) +{ + if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { + int n = (*p & 0x3f) - c; + if (n < 1) printk("HPFS: hpfs_pos_ins: %08x - %d\n", (int)*p, (int)c >> 8); + else *p = (*p & ~0x3f) | n; + } +} + +static struct hpfs_dirent *dnode_pre_last_de(struct dnode *d) +{ + struct hpfs_dirent *de, *de_end, *dee = NULL, *deee = NULL; + de_end = dnode_end_de(d); + for (de = 
dnode_first_de(d); de < de_end; de = de_next_de(de)) { + deee = dee; dee = de; + } + return deee; +} + +static struct hpfs_dirent *dnode_last_de(struct dnode *d) +{ + struct hpfs_dirent *de, *de_end, *dee = NULL; + de_end = dnode_end_de(d); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + dee = de; + } + return dee; +} + +static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno ptr) +{ + struct hpfs_dirent *de; + if (!(de = dnode_last_de(d))) { + hpfs_error(s, "set_last_pointer: empty dnode %08x", le32_to_cpu(d->self)); + return; + } + if (hpfs_sb(s)->sb_chk) { + if (de->down) { + hpfs_error(s, "set_last_pointer: dnode %08x has already last pointer %08x", + le32_to_cpu(d->self), de_down_pointer(de)); + return; + } + if (le16_to_cpu(de->length) != 32) { + hpfs_error(s, "set_last_pointer: bad last dirent in dnode %08x", le32_to_cpu(d->self)); + return; + } + } + if (ptr) { + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + 4); + if (le32_to_cpu(d->first_free) > 2048) { + hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self)); + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - 4); + return; + } + de->length = cpu_to_le16(36); + de->down = 1; + *(dnode_secno *)((char *)de + 32) = cpu_to_le32(ptr); + } +} + +/* Add an entry to dnode and don't care if it grows over 2048 bytes */ + +struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, + const unsigned char *name, + unsigned namelen, secno down_ptr) +{ + struct hpfs_dirent *de; + struct hpfs_dirent *de_end = dnode_end_de(d); + unsigned d_size = de_size(namelen, down_ptr); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + int c = hpfs_compare_names(s, name, namelen, de->name, de->namelen, de->last); + if (!c) { + hpfs_error(s, "name (%c,%d) already exists in dnode %08x", *name, namelen, le32_to_cpu(d->self)); + return NULL; + } + if (c < 0) break; + } + memmove((char *)de + d_size, de, (char *)de_end - (char *)de); + memset(de, 0, d_size); + if (down_ptr) { + *(dnode_secno *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr); + de->down = 1; + } + de->length = cpu_to_le16(d_size); + de->not_8x3 = hpfs_is_name_long(name, namelen); + de->namelen = namelen; + memcpy(de->name, name, namelen); + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + d_size); + return de; +} + +/* Delete dirent and don't care about its subtree */ + +static void hpfs_delete_de(struct super_block *s, struct dnode *d, + struct hpfs_dirent *de) +{ + if (de->last) { + hpfs_error(s, "attempt to delete last dirent in dnode %08x", le32_to_cpu(d->self)); + return; + } + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - le16_to_cpu(de->length)); + memmove(de, de_next_de(de), le32_to_cpu(d->first_free) + (char *)d - (char *)de); +} + +static void fix_up_ptrs(struct super_block *s, struct dnode *d) +{ + struct hpfs_dirent *de; + struct hpfs_dirent *de_end = dnode_end_de(d); + dnode_secno dno = le32_to_cpu(d->self); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) + if (de->down) { + struct quad_buffer_head qbh; + struct dnode *dd; + if ((dd = hpfs_map_dnode(s, de_down_pointer(de), &qbh))) { + if (le32_to_cpu(dd->up) != dno || dd->root_dnode) { + dd->up = cpu_to_le32(dno); + dd->root_dnode = 0; + hpfs_mark_4buffers_dirty(&qbh); + } + hpfs_brelse4(&qbh); + } + } +} + +/* Add an entry to dnode and do dnode splitting if required */ + +static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, + const unsigned char *name, unsigned namelen, + 
struct hpfs_dirent *new_de, dnode_secno down_ptr) +{ + struct quad_buffer_head qbh, qbh1, qbh2; + struct dnode *d, *ad, *rd, *nd = NULL; + dnode_secno adno, rdno; + struct hpfs_dirent *de; + struct hpfs_dirent nde; + unsigned char *nname; + int h; + int pos; + struct buffer_head *bh; + struct fnode *fnode; + int c1, c2 = 0; + if (!(nname = kmalloc(256, GFP_NOFS))) { + printk("HPFS: out of memory, can't add to dnode\n"); + return 1; + } + go_up: + if (namelen >= 256) { + hpfs_error(i->i_sb, "hpfs_add_to_dnode: namelen == %d", namelen); + kfree(nd); + kfree(nname); + return 1; + } + if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) { + kfree(nd); + kfree(nname); + return 1; + } + go_up_a: + if (hpfs_sb(i->i_sb)->sb_chk) + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_to_dnode")) { + hpfs_brelse4(&qbh); + kfree(nd); + kfree(nname); + return 1; + } + if (le32_to_cpu(d->first_free) + de_size(namelen, down_ptr) <= 2048) { + loff_t t; + copy_de(de=hpfs_add_de(i->i_sb, d, name, namelen, down_ptr), new_de); + t = get_pos(d, de); + for_all_poss(i, hpfs_pos_ins, t, 1); + for_all_poss(i, hpfs_pos_subst, 4, t); + for_all_poss(i, hpfs_pos_subst, 5, t + 1); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + kfree(nd); + kfree(nname); + return 0; + } + if (!nd) if (!(nd = kmalloc(0x924, GFP_NOFS))) { + /* 0x924 is a max size of dnode after adding a dirent with + max name length. We alloc this only once. There must + not be any error while splitting dnodes, otherwise the + whole directory, not only file we're adding, would + be lost. */ + printk("HPFS: out of memory for dnode splitting\n"); + hpfs_brelse4(&qbh); + kfree(nname); + return 1; + } + memcpy(nd, d, le32_to_cpu(d->first_free)); + copy_de(de = hpfs_add_de(i->i_sb, nd, name, namelen, down_ptr), new_de); + for_all_poss(i, hpfs_pos_ins, get_pos(nd, de), 1); + h = ((char *)dnode_last_de(nd) - (char *)nd) / 2 + 10; + if (!(ad = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &adno, &qbh1))) { + hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); + hpfs_brelse4(&qbh); + kfree(nd); + kfree(nname); + return 1; + } + i->i_size += 2048; + i->i_blocks += 4; + pos = 1; + for (de = dnode_first_de(nd); (char *)de_next_de(de) - (char *)nd < h; de = de_next_de(de)) { + copy_de(hpfs_add_de(i->i_sb, ad, de->name, de->namelen, de->down ? de_down_pointer(de) : 0), de); + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, ((loff_t)adno << 4) | pos); + pos++; + } + copy_de(new_de = &nde, de); + memcpy(nname, de->name, de->namelen); + name = nname; + namelen = de->namelen; + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); + down_ptr = adno; + set_last_pointer(i->i_sb, ad, de->down ? 
de_down_pointer(de) : 0); + de = de_next_de(de); + memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de); + nd->first_free = cpu_to_le32(le32_to_cpu(nd->first_free) - ((char *)de - (char *)nd - 20)); + memcpy(d, nd, le32_to_cpu(nd->first_free)); + for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos); + fix_up_ptrs(i->i_sb, ad); + if (!d->root_dnode) { + ad->up = d->up; + dno = le32_to_cpu(ad->up); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + goto go_up; + } + if (!(rd = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &rdno, &qbh2))) { + hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); + hpfs_brelse4(&qbh); + hpfs_brelse4(&qbh1); + kfree(nd); + kfree(nname); + return 1; + } + i->i_size += 2048; + i->i_blocks += 4; + rd->root_dnode = 1; + rd->up = d->up; + if (!(fnode = hpfs_map_fnode(i->i_sb, le32_to_cpu(d->up), &bh))) { + hpfs_free_dnode(i->i_sb, rdno); + hpfs_brelse4(&qbh); + hpfs_brelse4(&qbh1); + hpfs_brelse4(&qbh2); + kfree(nd); + kfree(nname); + return 1; + } + fnode->u.external[0].disk_secno = cpu_to_le32(rdno); + mark_buffer_dirty(bh); + brelse(bh); + hpfs_i(i)->i_dno = rdno; + d->up = ad->up = cpu_to_le32(rdno); + d->root_dnode = ad->root_dnode = 0; + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + qbh = qbh2; + set_last_pointer(i->i_sb, rd, dno); + dno = rdno; + d = rd; + goto go_up_a; +} + +/* + * Add an entry to directory btree. + * I hate such crazy directory structure. + * It's easy to read but terrible to write. + * I wrote this directory code 4 times. + * I hope, now it's finally bug-free. + */ + +int hpfs_add_dirent(struct inode *i, + const unsigned char *name, unsigned namelen, + struct hpfs_dirent *new_de) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct dnode *d; + struct hpfs_dirent *de, *de_end; + struct quad_buffer_head qbh; + dnode_secno dno; + int c; + int c1, c2 = 0; + dno = hpfs_inode->i_dno; + down: + if (hpfs_sb(i->i_sb)->sb_chk) + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_dirent")) return 1; + if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 1; + de_end = dnode_end_de(d); + for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { + if (!(c = hpfs_compare_names(i->i_sb, name, namelen, de->name, de->namelen, de->last))) { + hpfs_brelse4(&qbh); + return -1; + } + if (c < 0) { + if (de->down) { + dno = de_down_pointer(de); + hpfs_brelse4(&qbh); + goto down; + } + break; + } + } + hpfs_brelse4(&qbh); + if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_ADD)) { + c = 1; + goto ret; + } + i->i_version++; + c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0); + ret: + return c; +} + +/* + * Find dirent with higher name in 'from' subtree and move it to 'to' dnode. 
+ * Return the dnode we moved from (to be checked later if it's empty) + */ + +static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) +{ + dnode_secno dno, ddno; + dnode_secno chk_up = to; + struct dnode *dnode; + struct quad_buffer_head qbh; + struct hpfs_dirent *de, *nde; + int a; + loff_t t; + int c1, c2 = 0; + dno = from; + while (1) { + if (hpfs_sb(i->i_sb)->sb_chk) + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "move_to_top")) + return 0; + if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 0; + if (hpfs_sb(i->i_sb)->sb_chk) { + if (le32_to_cpu(dnode->up) != chk_up) { + hpfs_error(i->i_sb, "move_to_top: up pointer from %08x should be %08x, is %08x", + dno, chk_up, le32_to_cpu(dnode->up)); + hpfs_brelse4(&qbh); + return 0; + } + chk_up = dno; + } + if (!(de = dnode_last_de(dnode))) { + hpfs_error(i->i_sb, "move_to_top: dnode %08x has no last de", dno); + hpfs_brelse4(&qbh); + return 0; + } + if (!de->down) break; + dno = de_down_pointer(de); + hpfs_brelse4(&qbh); + } + while (!(de = dnode_pre_last_de(dnode))) { + dnode_secno up = le32_to_cpu(dnode->up); + hpfs_brelse4(&qbh); + hpfs_free_dnode(i->i_sb, dno); + i->i_size -= 2048; + i->i_blocks -= 4; + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, 5); + if (up == to) return to; + if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return 0; + if (dnode->root_dnode) { + hpfs_error(i->i_sb, "move_to_top: got to root_dnode while moving from %08x to %08x", from, to); + hpfs_brelse4(&qbh); + return 0; + } + de = dnode_last_de(dnode); + if (!de || !de->down) { + hpfs_error(i->i_sb, "move_to_top: dnode %08x doesn't point down to %08x", up, dno); + hpfs_brelse4(&qbh); + return 0; + } + dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); + de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); + de->down = 0; + hpfs_mark_4buffers_dirty(&qbh); + dno = up; + } + t = get_pos(dnode, de); + for_all_poss(i, hpfs_pos_subst, t, 4); + for_all_poss(i, hpfs_pos_subst, t + 1, 5); + if (!(nde = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { + hpfs_error(i->i_sb, "out of memory for dirent - directory will be corrupted"); + hpfs_brelse4(&qbh); + return 0; + } + memcpy(nde, de, le16_to_cpu(de->length)); + ddno = de->down ? de_down_pointer(de) : 0; + hpfs_delete_de(i->i_sb, dnode, de); + set_last_pointer(i->i_sb, dnode, ddno); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + a = hpfs_add_to_dnode(i, to, nde->name, nde->namelen, nde, from); + kfree(nde); + if (a) return 0; + return dno; +} + +/* + * Check if a dnode is empty and delete it from the tree + * (chkdsk doesn't like empty dnodes) + */ + +static void delete_empty_dnode(struct inode *i, dnode_secno dno) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct quad_buffer_head qbh; + struct dnode *dnode; + dnode_secno down, up, ndown; + int p; + struct hpfs_dirent *de; + int c1, c2 = 0; + try_it_again: + if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "delete_empty_dnode")) return; + if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return; + if (le32_to_cpu(dnode->first_free) > 56) goto end; + if (le32_to_cpu(dnode->first_free) == 52 || le32_to_cpu(dnode->first_free) == 56) { + struct hpfs_dirent *de_end; + int root = dnode->root_dnode; + up = le32_to_cpu(dnode->up); + de = dnode_first_de(dnode); + down = de->down ? 
de_down_pointer(de) : 0; + if (hpfs_sb(i->i_sb)->sb_chk) if (root && !down) { + hpfs_error(i->i_sb, "delete_empty_dnode: root dnode %08x is empty", dno); + goto end; + } + hpfs_brelse4(&qbh); + hpfs_free_dnode(i->i_sb, dno); + i->i_size -= 2048; + i->i_blocks -= 4; + if (root) { + struct fnode *fnode; + struct buffer_head *bh; + struct dnode *d1; + struct quad_buffer_head qbh1; + if (hpfs_sb(i->i_sb)->sb_chk) + if (up != i->i_ino) { + hpfs_error(i->i_sb, + "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", + dno, up, (unsigned long)i->i_ino); + return; + } + if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { + d1->up = cpu_to_le32(up); + d1->root_dnode = 1; + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + if ((fnode = hpfs_map_fnode(i->i_sb, up, &bh))) { + fnode->u.external[0].disk_secno = cpu_to_le32(down); + mark_buffer_dirty(bh); + brelse(bh); + } + hpfs_inode->i_dno = down; + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, (loff_t) 12); + return; + } + if (!(dnode = hpfs_map_dnode(i->i_sb, up, &qbh))) return; + p = 1; + de_end = dnode_end_de(dnode); + for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de), p++) + if (de->down) if (de_down_pointer(de) == dno) goto fnd; + hpfs_error(i->i_sb, "delete_empty_dnode: pointer to dnode %08x not found in dnode %08x", dno, up); + goto end; + fnd: + for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p); + if (!down) { + de->down = 0; + de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); + dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); + memmove(de_next_de(de), (char *)de_next_de(de) + 4, + (char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de)); + } else { + struct dnode *d1; + struct quad_buffer_head qbh1; + *(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4) = down; + if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { + d1->up = cpu_to_le32(up); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + } + } else { + hpfs_error(i->i_sb, "delete_empty_dnode: dnode %08x, first_free == %03x", dno, le32_to_cpu(dnode->first_free)); + goto end; + } + + if (!de->last) { + struct hpfs_dirent *de_next = de_next_de(de); + struct hpfs_dirent *de_cp; + struct dnode *d1; + struct quad_buffer_head qbh1; + if (!de_next->down) goto endm; + ndown = de_down_pointer(de_next); + if (!(de_cp = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { + printk("HPFS: out of memory for dtree balancing\n"); + goto endm; + } + memcpy(de_cp, de, le16_to_cpu(de->length)); + hpfs_delete_de(i->i_sb, dnode, de); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, 4); + for_all_poss(i, hpfs_pos_del, ((loff_t)up << 4) | p, 1); + if (de_cp->down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de_cp), &qbh1))) { + d1->up = cpu_to_le32(ndown); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, de_cp->down ? 
de_down_pointer(de_cp) : 0); + /*printk("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n", up, ndown, down, dno);*/ + dno = up; + kfree(de_cp); + goto try_it_again; + } else { + struct hpfs_dirent *de_prev = dnode_pre_last_de(dnode); + struct hpfs_dirent *de_cp; + struct dnode *d1; + struct quad_buffer_head qbh1; + dnode_secno dlp; + if (!de_prev) { + hpfs_error(i->i_sb, "delete_empty_dnode: empty dnode %08x", up); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + dno = up; + goto try_it_again; + } + if (!de_prev->down) goto endm; + ndown = de_down_pointer(de_prev); + if ((d1 = hpfs_map_dnode(i->i_sb, ndown, &qbh1))) { + struct hpfs_dirent *del = dnode_last_de(d1); + dlp = del->down ? de_down_pointer(del) : 0; + if (!dlp && down) { + if (le32_to_cpu(d1->first_free) > 2044) { + if (hpfs_sb(i->i_sb)->sb_chk >= 2) { + printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); + printk("HPFS: warning: terminating balancing operation\n"); + } + hpfs_brelse4(&qbh1); + goto endm; + } + if (hpfs_sb(i->i_sb)->sb_chk >= 2) { + printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); + printk("HPFS: warning: goin'on\n"); + } + del->length = cpu_to_le16(le16_to_cpu(del->length) + 4); + del->down = 1; + d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) + 4); + } + if (dlp && !down) { + del->length = cpu_to_le16(le16_to_cpu(del->length) - 4); + del->down = 0; + d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4); + } else if (down) + *(dnode_secno *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); + } else goto endm; + if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) { + printk("HPFS: out of memory for dtree balancing\n"); + hpfs_brelse4(&qbh1); + goto endm; + } + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length)); + hpfs_delete_de(i->i_sb, dnode, de_prev); + if (!de_prev->down) { + de_prev->length = cpu_to_le16(le16_to_cpu(de_prev->length) + 4); + de_prev->down = 1; + dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4); + } + *(dnode_secno *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4); + for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, ((loff_t)up << 4) | (p - 1)); + if (down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de), &qbh1))) { + d1->up = cpu_to_le32(ndown); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + } + hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, dlp); + dno = up; + kfree(de_cp); + goto try_it_again; + } + endm: + hpfs_mark_4buffers_dirty(&qbh); + end: + hpfs_brelse4(&qbh); +} + + +/* Delete dirent from directory */ + +int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, + struct quad_buffer_head *qbh, int depth) +{ + struct dnode *dnode = qbh->data; + dnode_secno down = 0; + loff_t t; + if (de->first || de->last) { + hpfs_error(i->i_sb, "hpfs_remove_dirent: attempt to delete first or last dirent in dnode %08x", dno); + hpfs_brelse4(qbh); + return 1; + } + if (de->down) down = de_down_pointer(de); + if (depth && (de->down || (de == dnode_first_de(dnode) && de_next_de(de)->last))) { + if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_DEL)) { + hpfs_brelse4(qbh); + return 2; + } + } + i->i_version++; + for_all_poss(i, hpfs_pos_del, (t = get_pos(dnode, de)) + 1, 1); + hpfs_delete_de(i->i_sb, 
dnode, de); + hpfs_mark_4buffers_dirty(qbh); + hpfs_brelse4(qbh); + if (down) { + dnode_secno a = move_to_top(i, down, dno); + for_all_poss(i, hpfs_pos_subst, 5, t); + if (a) delete_empty_dnode(i, a); + return !a; + } + delete_empty_dnode(i, dno); + return 0; +} + +void hpfs_count_dnodes(struct super_block *s, dnode_secno dno, int *n_dnodes, + int *n_subdirs, int *n_items) +{ + struct dnode *dnode; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + dnode_secno ptr, odno = 0; + int c1, c2 = 0; + int d1, d2 = 0; + go_down: + if (n_dnodes) (*n_dnodes)++; + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, dno, &c1, &c2, "hpfs_count_dnodes #1")) return; + ptr = 0; + go_up: + if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; + if (hpfs_sb(s)->sb_chk) if (odno && odno != -1 && le32_to_cpu(dnode->up) != odno) + hpfs_error(s, "hpfs_count_dnodes: bad up pointer; dnode %08x, down %08x points to %08x", odno, dno, le32_to_cpu(dnode->up)); + de = dnode_first_de(dnode); + if (ptr) while(1) { + if (de->down) if (de_down_pointer(de) == ptr) goto process_de; + if (de->last) { + hpfs_brelse4(&qbh); + hpfs_error(s, "hpfs_count_dnodes: pointer to dnode %08x not found in dnode %08x, got here from %08x", + ptr, dno, odno); + return; + } + de = de_next_de(de); + } + next_de: + if (de->down) { + odno = dno; + dno = de_down_pointer(de); + hpfs_brelse4(&qbh); + goto go_down; + } + process_de: + if (!de->first && !de->last && de->directory && n_subdirs) (*n_subdirs)++; + if (!de->first && !de->last && n_items) (*n_items)++; + if ((de = de_next_de(de)) < dnode_end_de(dnode)) goto next_de; + ptr = dno; + dno = le32_to_cpu(dnode->up); + if (dnode->root_dnode) { + hpfs_brelse4(&qbh); + return; + } + hpfs_brelse4(&qbh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, ptr, &d1, &d2, "hpfs_count_dnodes #2")) return; + odno = -1; + goto go_up; +} + +static struct hpfs_dirent *map_nth_dirent(struct super_block *s, dnode_secno dno, int n, + struct quad_buffer_head *qbh, struct dnode **dn) +{ + int i; + struct hpfs_dirent *de, *de_end; + struct dnode *dnode; + dnode = hpfs_map_dnode(s, dno, qbh); + if (!dnode) return NULL; + if (dn) *dn=dnode; + de = dnode_first_de(dnode); + de_end = dnode_end_de(dnode); + for (i = 1; de < de_end; i++, de = de_next_de(de)) { + if (i == n) { + return de; + } + if (de->last) break; + } + hpfs_brelse4(qbh); + hpfs_error(s, "map_nth_dirent: n too high; dnode = %08x, requested %08x", dno, n); + return NULL; +} + +dnode_secno hpfs_de_as_down_as_possible(struct super_block *s, dnode_secno dno) +{ + struct quad_buffer_head qbh; + dnode_secno d = dno; + dnode_secno up = 0; + struct hpfs_dirent *de; + int c1, c2 = 0; + + again: + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, d, &c1, &c2, "hpfs_de_as_down_as_possible")) + return d; + if (!(de = map_nth_dirent(s, d, 1, &qbh, NULL))) return dno; + if (hpfs_sb(s)->sb_chk) + if (up && le32_to_cpu(((struct dnode *)qbh.data)->up) != up) + hpfs_error(s, "hpfs_de_as_down_as_possible: bad up pointer; dnode %08x, down %08x points to %08x", up, d, le32_to_cpu(((struct dnode *)qbh.data)->up)); + if (!de->down) { + hpfs_brelse4(&qbh); + return d; + } + up = d; + d = de_down_pointer(de); + hpfs_brelse4(&qbh); + goto again; +} + +struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp, + struct quad_buffer_head *qbh) +{ + loff_t pos; + unsigned c; + dnode_secno dno; + struct hpfs_dirent *de, *d; + struct hpfs_dirent *up_de; + struct hpfs_dirent *end_up_de; + struct dnode *dnode; + struct dnode *up_dnode; + struct quad_buffer_head qbh0; + 
+ pos = *posp; + dno = pos >> 6 << 2; + pos &= 077; + if (!(de = map_nth_dirent(inode->i_sb, dno, pos, qbh, &dnode))) + goto bail; + + /* Going to the next dirent */ + if ((d = de_next_de(de)) < dnode_end_de(dnode)) { + if (!(++*posp & 077)) { + hpfs_error(inode->i_sb, + "map_pos_dirent: pos crossed dnode boundary; pos = %08llx", + (unsigned long long)*posp); + goto bail; + } + /* We're going down the tree */ + if (d->down) { + *posp = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, de_down_pointer(d)) << 4) + 1; + } + + return de; + } + + /* Going up */ + if (dnode->root_dnode) goto bail; + + if (!(up_dnode = hpfs_map_dnode(inode->i_sb, le32_to_cpu(dnode->up), &qbh0))) + goto bail; + + end_up_de = dnode_end_de(up_dnode); + c = 0; + for (up_de = dnode_first_de(up_dnode); up_de < end_up_de; + up_de = de_next_de(up_de)) { + if (!(++c & 077)) hpfs_error(inode->i_sb, + "map_pos_dirent: pos crossed dnode boundary; dnode = %08x", le32_to_cpu(dnode->up)); + if (up_de->down && de_down_pointer(up_de) == dno) { + *posp = ((loff_t) le32_to_cpu(dnode->up) << 4) + c; + hpfs_brelse4(&qbh0); + return de; + } + } + + hpfs_error(inode->i_sb, "map_pos_dirent: pointer to dnode %08x not found in parent dnode %08x", + dno, le32_to_cpu(dnode->up)); + hpfs_brelse4(&qbh0); + + bail: + *posp = 12; + return de; +} + +/* Find a dirent in tree */ + +struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, + const unsigned char *name, unsigned len, + dnode_secno *dd, struct quad_buffer_head *qbh) +{ + struct dnode *dnode; + struct hpfs_dirent *de; + struct hpfs_dirent *de_end; + int c1, c2 = 0; + + if (!S_ISDIR(inode->i_mode)) hpfs_error(inode->i_sb, "map_dirent: not a directory\n"); + again: + if (hpfs_sb(inode->i_sb)->sb_chk) + if (hpfs_stop_cycles(inode->i_sb, dno, &c1, &c2, "map_dirent")) return NULL; + if (!(dnode = hpfs_map_dnode(inode->i_sb, dno, qbh))) return NULL; + + de_end = dnode_end_de(dnode); + for (de = dnode_first_de(dnode); de < de_end; de = de_next_de(de)) { + int t = hpfs_compare_names(inode->i_sb, name, len, de->name, de->namelen, de->last); + if (!t) { + if (dd) *dd = dno; + return de; + } + if (t < 0) { + if (de->down) { + dno = de_down_pointer(de); + hpfs_brelse4(qbh); + goto again; + } + break; + } + } + hpfs_brelse4(qbh); + return NULL; +} + +/* + * Remove empty directory. In normal cases it is only one dnode with two + * entries, but we must handle also such obscure cases when it's a tree + * of empty dnodes. + */ + +void hpfs_remove_dtree(struct super_block *s, dnode_secno dno) +{ + struct quad_buffer_head qbh; + struct dnode *dnode; + struct hpfs_dirent *de; + dnode_secno d1, d2, rdno = dno; + while (1) { + if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; + de = dnode_first_de(dnode); + if (de->last) { + if (de->down) d1 = de_down_pointer(de); + else goto error; + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + dno = d1; + } else break; + } + if (!de->first) goto error; + d1 = de->down ? de_down_pointer(de) : 0; + de = de_next_de(de); + if (!de->last) goto error; + d2 = de->down ? de_down_pointer(de) : 0; + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + do { + while (d1) { + if (!(dnode = hpfs_map_dnode(s, dno = d1, &qbh))) return; + de = dnode_first_de(dnode); + if (!de->last) goto error; + d1 = de->down ? 
de_down_pointer(de) : 0; + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + } + d1 = d2; + d2 = 0; + } while (d1); + return; + error: + hpfs_brelse4(&qbh); + hpfs_free_dnode(s, dno); + hpfs_error(s, "directory %08x is corrupted or not empty", rdno); +} + +/* + * Find dirent for specified fnode. Use truncated 15-char name in fnode as + * a help for searching. + */ + +struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, + struct fnode *f, struct quad_buffer_head *qbh) +{ + unsigned char *name1; + unsigned char *name2; + int name1len, name2len; + struct dnode *d; + dnode_secno dno, downd; + struct fnode *upf; + struct buffer_head *bh; + struct hpfs_dirent *de, *de_end; + int c; + int c1, c2 = 0; + int d1, d2 = 0; + name1 = f->name; + if (!(name2 = kmalloc(256, GFP_NOFS))) { + printk("HPFS: out of memory, can't map dirent\n"); + return NULL; + } + if (f->len <= 15) + memcpy(name2, name1, name1len = name2len = f->len); + else { + memcpy(name2, name1, 15); + memset(name2 + 15, 0xff, 256 - 15); + /*name2[15] = 0xff;*/ + name1len = 15; name2len = 256; + } + if (!(upf = hpfs_map_fnode(s, le32_to_cpu(f->up), &bh))) { + kfree(name2); + return NULL; + } + if (!upf->dirflag) { + brelse(bh); + hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up)); + kfree(name2); + return NULL; + } + dno = le32_to_cpu(upf->u.external[0].disk_secno); + brelse(bh); + go_down: + downd = 0; + go_up: + if (!(d = hpfs_map_dnode(s, dno, qbh))) { + kfree(name2); + return NULL; + } + de_end = dnode_end_de(d); + de = dnode_first_de(d); + if (downd) { + while (de < de_end) { + if (de->down) if (de_down_pointer(de) == downd) goto f; + de = de_next_de(de); + } + hpfs_error(s, "pointer to dnode %08x not found in dnode %08x", downd, dno); + hpfs_brelse4(qbh); + kfree(name2); + return NULL; + } + next_de: + if (le32_to_cpu(de->fnode) == fno) { + kfree(name2); + return de; + } + c = hpfs_compare_names(s, name1, name1len, de->name, de->namelen, de->last); + if (c < 0 && de->down) { + dno = de_down_pointer(de); + hpfs_brelse4(qbh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { + kfree(name2); + return NULL; + } + goto go_down; + } + f: + if (le32_to_cpu(de->fnode) == fno) { + kfree(name2); + return de; + } + c = hpfs_compare_names(s, name2, name2len, de->name, de->namelen, de->last); + if (c < 0 && !de->last) goto not_found; + if ((de = de_next_de(de)) < de_end) goto next_de; + if (d->root_dnode) goto not_found; + downd = dno; + dno = le32_to_cpu(d->up); + hpfs_brelse4(qbh); + if (hpfs_sb(s)->sb_chk) + if (hpfs_stop_cycles(s, downd, &d1, &d2, "map_fnode_dirent #2")) { + kfree(name2); + return NULL; + } + goto go_up; + not_found: + hpfs_brelse4(qbh); + hpfs_error(s, "dirent for fnode %08x not found", fno); + kfree(name2); + return NULL; +} diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c new file mode 100644 index 00000000..d8b84d11 --- /dev/null +++ b/fs/hpfs/ea.c @@ -0,0 +1,367 @@ +/* + * linux/fs/hpfs/ea.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * handling extended attributes + */ + +#include "hpfs_fn.h" + +/* Remove external extended attributes. 
ano specifies whether a is a + direct sector where eas starts or an anode */ + +void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len) +{ + unsigned pos = 0; + while (pos < len) { + char ex[4 + 255 + 1 + 8]; + struct extended_attribute *ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; + if (ea->indirect) { + if (ea_valuelen(ea) != 8) { + hpfs_error(s, "ea->indirect set while ea->valuelen!=8, %s %08x, pos %08x", + ano ? "anode" : "sectors", a, pos); + return; + } + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4)) + return; + hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + if (!ano) hpfs_free_sectors(s, a, (len+511) >> 9); + else { + struct buffer_head *bh; + struct anode *anode; + if ((anode = hpfs_map_anode(s, a, &bh))) { + hpfs_remove_btree(s, &anode->btree); + brelse(bh); + hpfs_free_sectors(s, a, 1); + } + } +} + +static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size) +{ + char *ret; + if (!(ret = kmalloc(size + 1, GFP_NOFS))) { + printk("HPFS: out of memory for EA\n"); + return NULL; + } + if (hpfs_ea_read(s, a, ano, 0, size, ret)) { + kfree(ret); + return NULL; + } + ret[size] = 0; + return ret; +} + +static void set_indirect_ea(struct super_block *s, int ano, secno a, + const char *data, int size) +{ + hpfs_ea_write(s, a, ano, 0, size, data); +} + +/* Read an extended attribute named 'key' into the provided buffer */ + +int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, + char *buf, int size) +{ + unsigned pos; + int ano, len; + secno a; + char ex[4 + 255 + 1 + 8]; + struct extended_attribute *ea; + struct extended_attribute *ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (!strcmp(ea->name, key)) { + if (ea->indirect) + goto indirect; + if (ea_valuelen(ea) >= size) + return -EINVAL; + memcpy(buf, ea_data(ea), ea_valuelen(ea)); + buf[ea_valuelen(ea)] = 0; + return 0; + } + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); + ano = fnode->ea_anode; + pos = 0; + while (pos < len) { + ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return -EIO; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO; + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 
8 : 0), ex + 4)) + return -EIO; + if (!strcmp(ea->name, key)) { + if (ea->indirect) + goto indirect; + if (ea_valuelen(ea) >= size) + return -EINVAL; + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), buf)) + return -EIO; + buf[ea_valuelen(ea)] = 0; + return 0; + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + return -ENOENT; +indirect: + if (ea_len(ea) >= size) + return -EINVAL; + if (hpfs_ea_read(s, ea_sec(ea), ea->anode, 0, ea_len(ea), buf)) + return -EIO; + buf[ea_len(ea)] = 0; + return 0; +} + +/* Read an extended attribute named 'key' */ +char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *size) +{ + char *ret; + unsigned pos; + int ano, len; + secno a; + struct extended_attribute *ea; + struct extended_attribute *ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (!strcmp(ea->name, key)) { + if (ea->indirect) + return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); + if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { + printk("HPFS: out of memory for EA\n"); + return NULL; + } + memcpy(ret, ea_data(ea), ea_valuelen(ea)); + ret[ea_valuelen(ea)] = 0; + return ret; + } + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); + ano = fnode->ea_anode; + pos = 0; + while (pos < len) { + char ex[4 + 255 + 1 + 8]; + ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return NULL; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL; + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4)) + return NULL; + if (!strcmp(ea->name, key)) { + if (ea->indirect) + return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); + if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { + printk("HPFS: out of memory for EA\n"); + return NULL; + } + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), ret)) { + kfree(ret); + return NULL; + } + ret[ea_valuelen(ea)] = 0; + return ret; + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + return NULL; +} + +/* + * Update or create extended attribute 'key' with value 'data'. Note that + * when this ea exists, it MUST have the same size as size of data. + * This driver can't change sizes of eas ('cause I just don't need it). + */ + +void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, + const char *data, int size) +{ + fnode_secno fno = inode->i_ino; + struct super_block *s = inode->i_sb; + unsigned pos; + int ano, len; + secno a; + unsigned char h[4]; + struct extended_attribute *ea; + struct extended_attribute *ea_end = fnode_end_ea(fnode); + for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) + if (!strcmp(ea->name, key)) { + if (ea->indirect) { + if (ea_len(ea) == size) + set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); + } else if (ea_valuelen(ea) == size) { + memcpy(ea_data(ea), data, size); + } + return; + } + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); + ano = fnode->ea_anode; + pos = 0; + while (pos < len) { + char ex[4 + 255 + 1 + 8]; + ea = (struct extended_attribute *)ex; + if (pos + 4 > len) { + hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", + ano ? "anode" : "sectors", a, len); + return; + } + if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 
8 : 0), ex + 4)) + return; + if (!strcmp(ea->name, key)) { + if (ea->indirect) { + if (ea_len(ea) == size) + set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); + } + else { + if (ea_valuelen(ea) == size) + hpfs_ea_write(s, a, ano, pos + 4 + ea->namelen + 1, size, data); + } + return; + } + pos += ea->namelen + ea_valuelen(ea) + 5; + } + if (!le16_to_cpu(fnode->ea_offs)) { + /*if (le16_to_cpu(fnode->ea_size_s)) { + hpfs_error(s, "fnode %08x: ea_size_s == %03x, ea_offs == 0", + inode->i_ino, le16_to_cpu(fnode->ea_size_s)); + return; + }*/ + fnode->ea_offs = cpu_to_le16(0xc4); + } + if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) { + hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x", + (unsigned long)inode->i_ino, + le32_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); + return; + } + if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) && + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5 <= 0x200) { + ea = fnode_end_ea(fnode); + *(char *)ea = 0; + ea->namelen = strlen(key); + ea->valuelen_lo = size; + ea->valuelen_hi = size >> 8; + strcpy(ea->name, key); + memcpy(ea_data(ea), data, size); + fnode->ea_size_s = cpu_to_le16(le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5); + goto ret; + } + /* Most the code here is 99.9993422% unused. I hope there are no bugs. + But what .. HPFS.IFS has also bugs in ea management. */ + if (le16_to_cpu(fnode->ea_size_s) && !le32_to_cpu(fnode->ea_size_l)) { + secno n; + struct buffer_head *bh; + char *data; + if (!(n = hpfs_alloc_sector(s, fno, 1, 0))) return; + if (!(data = hpfs_get_sector(s, n, &bh))) { + hpfs_free_sectors(s, n, 1); + return; + } + memcpy(data, fnode_ea(fnode), le16_to_cpu(fnode->ea_size_s)); + fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s)); + fnode->ea_size_s = cpu_to_le16(0); + fnode->ea_secno = cpu_to_le32(n); + fnode->ea_anode = cpu_to_le32(0); + mark_buffer_dirty(bh); + brelse(bh); + } + pos = le32_to_cpu(fnode->ea_size_l) + 5 + strlen(key) + size; + len = (le32_to_cpu(fnode->ea_size_l) + 511) >> 9; + if (pos >= 30000) goto bail; + while (((pos + 511) >> 9) > len) { + if (!len) { + secno q = hpfs_alloc_sector(s, fno, 1, 0); + if (!q) goto bail; + fnode->ea_secno = cpu_to_le32(q); + fnode->ea_anode = 0; + len++; + } else if (!fnode->ea_anode) { + if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) { + len++; + } else { + /* Aargh... 
don't know how to create ea anodes :-( */ + /*struct buffer_head *bh; + struct anode *anode; + anode_secno a_s; + if (!(anode = hpfs_alloc_anode(s, fno, &a_s, &bh))) + goto bail; + anode->up = cpu_to_le32(fno); + anode->btree.fnode_parent = 1; + anode->btree.n_free_nodes--; + anode->btree.n_used_nodes++; + anode->btree.first_free = cpu_to_le16(le16_to_cpu(anode->btree.first_free) + 12); + anode->u.external[0].disk_secno = cpu_to_le32(le32_to_cpu(fnode->ea_secno)); + anode->u.external[0].file_secno = cpu_to_le32(0); + anode->u.external[0].length = cpu_to_le32(len); + mark_buffer_dirty(bh); + brelse(bh); + fnode->ea_anode = 1; + fnode->ea_secno = cpu_to_le32(a_s);*/ + secno new_sec; + int i; + if (!(new_sec = hpfs_alloc_sector(s, fno, 1, 1 - ((pos + 511) >> 9)))) + goto bail; + for (i = 0; i < len; i++) { + struct buffer_head *bh1, *bh2; + void *b1, *b2; + if (!(b1 = hpfs_map_sector(s, le32_to_cpu(fnode->ea_secno) + i, &bh1, len - i - 1))) { + hpfs_free_sectors(s, new_sec, (pos + 511) >> 9); + goto bail; + } + if (!(b2 = hpfs_get_sector(s, new_sec + i, &bh2))) { + brelse(bh1); + hpfs_free_sectors(s, new_sec, (pos + 511) >> 9); + goto bail; + } + memcpy(b2, b1, 512); + brelse(bh1); + mark_buffer_dirty(bh2); + brelse(bh2); + } + hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno), len); + fnode->ea_secno = cpu_to_le32(new_sec); + len = (pos + 511) >> 9; + } + } + if (fnode->ea_anode) { + if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno), + 0, len) != -1) { + len++; + } else { + goto bail; + } + } + } + h[0] = 0; + h[1] = strlen(key); + h[2] = size & 0xff; + h[3] = size >> 8; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail; + fnode->ea_size_l = cpu_to_le32(pos); + ret: + hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size; + return; + bail: + if (le32_to_cpu(fnode->ea_secno)) + if (fnode->ea_anode) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9); + else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9)); + else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0); +} + diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c new file mode 100644 index 00000000..89c500ee --- /dev/null +++ b/fs/hpfs/file.c @@ -0,0 +1,166 @@ +/* + * linux/fs/hpfs/file.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * file VFS functions + */ + +#include "hpfs_fn.h" + +#define BLOCKS(size) (((size) + 511) >> 9) + +static int hpfs_file_release(struct inode *inode, struct file *file) +{ + hpfs_lock(inode->i_sb); + hpfs_write_if_changed(inode); + hpfs_unlock(inode->i_sb); + return 0; +} + +int hpfs_file_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + return sync_blockdev(inode->i_sb->s_bdev); +} + +/* + * generic_file_read often calls bmap with non-existing sector, + * so we must ignore such errors. 
+ */ + +static secno hpfs_bmap(struct inode *inode, unsigned file_secno) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + unsigned n, disk_secno; + struct fnode *fnode; + struct buffer_head *bh; + if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0; + n = file_secno - hpfs_inode->i_file_sec; + if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n; + if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; + disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); + if (disk_secno == -1) return 0; + if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; + return disk_secno; +} + +static void hpfs_truncate(struct inode *i) +{ + if (IS_IMMUTABLE(i)) return /*-EPERM*/; + hpfs_lock_assert(i->i_sb); + + hpfs_i(i)->i_n_secs = 0; + i->i_blocks = 1 + ((i->i_size + 511) >> 9); + hpfs_i(i)->mmu_private = i->i_size; + hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); + hpfs_write_inode(i); + hpfs_i(i)->i_n_secs = 0; +} + +static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +{ + int r; + secno s; + hpfs_lock(inode->i_sb); + s = hpfs_bmap(inode, iblock); + if (s) { + map_bh(bh_result, inode->i_sb, s); + goto ret_0; + } + if (!create) goto ret_0; + if (iblock<<9 != hpfs_i(inode)->mmu_private) { + BUG(); + r = -EIO; + goto ret_r; + } + if ((s = hpfs_add_sector_to_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1)) == -1) { + hpfs_truncate_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1); + r = -ENOSPC; + goto ret_r; + } + inode->i_blocks++; + hpfs_i(inode)->mmu_private += 512; + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, s); + ret_0: + r = 0; + ret_r: + hpfs_unlock(inode->i_sb); + return r; +} + +static int hpfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,hpfs_get_block, wbc); +} + +static int hpfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,hpfs_get_block); +} + +static int hpfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + hpfs_get_block, + &hpfs_i(mapping->host)->mmu_private); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,hpfs_get_block); +} + +const struct address_space_operations hpfs_aops = { + .readpage = hpfs_readpage, + .writepage = hpfs_writepage, + .write_begin = hpfs_write_begin, + .write_end = generic_write_end, + .bmap = _hpfs_bmap +}; + +static ssize_t hpfs_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + ssize_t retval; + + retval = do_sync_write(file, buf, count, ppos); + if (retval > 0) { + hpfs_lock(file->f_path.dentry->d_sb); + hpfs_i(file->f_path.dentry->d_inode)->i_dirty = 1; + hpfs_unlock(file->f_path.dentry->d_sb); + } + return retval; +} + +const struct file_operations hpfs_file_ops = +{ + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = hpfs_file_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .release = hpfs_file_release, + .fsync = hpfs_file_fsync, + 
.splice_read = generic_file_splice_read, +}; + +const struct inode_operations hpfs_file_iops = +{ + .truncate = hpfs_truncate, + .setattr = hpfs_setattr, +}; diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h new file mode 100644 index 00000000..8b0650aa --- /dev/null +++ b/fs/hpfs/hpfs.h @@ -0,0 +1,568 @@ +/* + * linux/fs/hpfs/hpfs.h + * + * HPFS structures by Chris Smith, 1993 + * + * a little bit modified by Mikulas Patocka, 1998-1999 + */ + +/* The paper + + Duncan, Roy + Design goals and implementation of the new High Performance File System + Microsoft Systems Journal Sept 1989 v4 n5 p1(13) + + describes what HPFS looked like when it was new, and it is the source + of most of the information given here. The rest is conjecture. + + For definitive information on the Duncan paper, see it, not this file. + For definitive information on HPFS, ask somebody else -- this is guesswork. + There are certain to be many mistakes. */ + +#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) +#error unknown endian +#endif + +/* Notation */ + +typedef u32 secno; /* sector number, partition relative */ + +typedef secno dnode_secno; /* sector number of a dnode */ +typedef secno fnode_secno; /* sector number of an fnode */ +typedef secno anode_secno; /* sector number of an anode */ + +typedef u32 time32_t; /* 32-bit time_t type */ + +/* sector 0 */ + +/* The boot block is very like a FAT boot block, except that the + 29h signature byte is 28h instead, and the ID string is "HPFS". */ + +#define BB_MAGIC 0xaa55 + +struct hpfs_boot_block +{ + u8 jmp[3]; + u8 oem_id[8]; + u8 bytes_per_sector[2]; /* 512 */ + u8 sectors_per_cluster; + u8 n_reserved_sectors[2]; + u8 n_fats; + u8 n_rootdir_entries[2]; + u8 n_sectors_s[2]; + u8 media_byte; + u16 sectors_per_fat; + u16 sectors_per_track; + u16 heads_per_cyl; + u32 n_hidden_sectors; + u32 n_sectors_l; /* size of partition */ + u8 drive_number; + u8 mbz; + u8 sig_28h; /* 28h */ + u8 vol_serno[4]; + u8 vol_label[11]; + u8 sig_hpfs[8]; /* "HPFS " */ + u8 pad[448]; + u16 magic; /* aa55 */ +}; + + +/* sector 16 */ + +/* The super block has the pointer to the root directory. */ + +#define SB_MAGIC 0xf995e849 + +struct hpfs_super_block +{ + u32 magic; /* f995 e849 */ + u32 magic1; /* fa53 e9c5, more magic? */ + u8 version; /* version of a filesystem usually 2 */ + u8 funcversion; /* functional version - oldest version + of filesystem that can understand + this disk */ + u16 zero; /* 0 */ + fnode_secno root; /* fnode of root directory */ + secno n_sectors; /* size of filesystem */ + u32 n_badblocks; /* number of bad blocks */ + secno bitmaps; /* pointers to free space bit maps */ + u32 zero1; /* 0 */ + secno badblocks; /* bad block list */ + u32 zero3; /* 0 */ + time32_t last_chkdsk; /* date last checked, 0 if never */ + time32_t last_optimize; /* date last optimized, 0 if never */ + secno n_dir_band; /* number of sectors in dir band */ + secno dir_band_start; /* first sector in dir band */ + secno dir_band_end; /* last sector in dir band */ + secno dir_band_bitmap; /* free space map, 1 dnode per bit */ + u8 volume_name[32]; /* not used */ + secno user_id_table; /* 8 preallocated sectors - user id */ + u32 zero6[103]; /* 0 */ +}; + + +/* sector 17 */ + +/* The spare block has pointers to spare sectors. */ + +#define SP_MAGIC 0xf9911849 + +struct hpfs_spare_block +{ + u32 magic; /* f991 1849 */ + u32 magic1; /* fa52 29c5, more magic? 
*/ + +#ifdef __LITTLE_ENDIAN + u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ + u8 sparedir_used: 1; /* spare dirblks used */ + u8 hotfixes_used: 1; /* hotfixes used */ + u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ + u8 bad_bitmap: 1; /* bad bitmap */ + u8 fast: 1; /* partition was fast formatted */ + u8 old_wrote: 1; /* old version wrote to partition */ + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ +#else + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ + u8 old_wrote: 1; /* old version wrote to partition */ + u8 fast: 1; /* partition was fast formatted */ + u8 bad_bitmap: 1; /* bad bitmap */ + u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ + u8 hotfixes_used: 1; /* hotfixes used */ + u8 sparedir_used: 1; /* spare dirblks used */ + u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ +#endif + +#ifdef __LITTLE_ENDIAN + u8 install_dasd_limits: 1; /* HPFS386 flags */ + u8 resynch_dasd_limits: 1; + u8 dasd_limits_operational: 1; + u8 multimedia_active: 1; + u8 dce_acls_active: 1; + u8 dasd_limits_dirty: 1; + u8 flag67: 2; +#else + u8 flag67: 2; + u8 dasd_limits_dirty: 1; + u8 dce_acls_active: 1; + u8 multimedia_active: 1; + u8 dasd_limits_operational: 1; + u8 resynch_dasd_limits: 1; + u8 install_dasd_limits: 1; /* HPFS386 flags */ +#endif + + u8 mm_contlgulty; + u8 unused; + + secno hotfix_map; /* info about remapped bad sectors */ + u32 n_spares_used; /* number of hotfixes */ + u32 n_spares; /* number of spares in hotfix map */ + u32 n_dnode_spares_free; /* spare dnodes unused */ + u32 n_dnode_spares; /* length of spare_dnodes[] list, + follows in this block*/ + secno code_page_dir; /* code page directory block */ + u32 n_code_pages; /* number of code pages */ + u32 super_crc; /* on HPFS386 and LAN Server this is + checksum of superblock, on normal + OS/2 unused */ + u32 spare_crc; /* on HPFS386 checksum of spareblock */ + u32 zero1[15]; /* unused */ + dnode_secno spare_dnodes[100]; /* emergency free dnode list */ + u32 zero2[1]; /* room for more? */ +}; + +/* The bad block list is 4 sectors long. The first word must be zero, + the remaining words give n_badblocks bad block numbers. + I bet you can see it coming... */ + +#define BAD_MAGIC 0 + +/* The hotfix map is 4 sectors long. It looks like + + secno from[n_spares]; + secno to[n_spares]; + + The to[] list is initialized to point to n_spares preallocated empty + sectors. The from[] list contains the sector numbers of bad blocks + which have been remapped to corresponding sectors in the to[] list. + n_spares_used gives the length of the from[] list. */ + + +/* Sectors 18 and 19 are preallocated and unused. + Maybe they're spares for 16 and 17, but simple substitution fails. */ + + +/* The code page info pointed to by the spare block consists of an index + block and blocks containing uppercasing tables. I don't know what + these are for (CHKDSK, maybe?) -- OS/2 does not seem to use them + itself. Linux doesn't use them either. */ + +/* block pointed to by spareblock->code_page_dir */ + +#define CP_DIR_MAGIC 0x494521f7 + +struct code_page_directory +{ + u32 magic; /* 4945 21f7 */ + u32 n_code_pages; /* number of pointers following */ + u32 zero1[2]; + struct { + u16 ix; /* index */ + u16 code_page_number; /* code page number */ + u32 bounds; /* matches corresponding word + in data block */ + secno code_page_data; /* sector number of a code_page_data + containing c.p. array */ + u16 index; /* index in c.p. 
array in that sector*/ + u16 unknown; /* some unknown value; usually 0; + 2 in Japanese version */ + } array[31]; /* unknown length */ +}; + +/* blocks pointed to by code_page_directory */ + +#define CP_DATA_MAGIC 0x894521f7 + +struct code_page_data +{ + u32 magic; /* 8945 21f7 */ + u32 n_used; /* # elements used in c_p_data[] */ + u32 bounds[3]; /* looks a bit like + (beg1,end1), (beg2,end2) + one byte each */ + u16 offs[3]; /* offsets from start of sector + to start of c_p_data[ix] */ + struct { + u16 ix; /* index */ + u16 code_page_number; /* code page number */ + u16 unknown; /* the same as in cp directory */ + u8 map[128]; /* upcase table for chars 80..ff */ + u16 zero2; + } code_page[3]; + u8 incognita[78]; +}; + + +/* Free space bitmaps are 4 sectors long, which is 16384 bits. + 16384 sectors is 8 meg, and each 8 meg band has a 4-sector bitmap. + Bit order in the maps is little-endian. 0 means taken, 1 means free. + + Bit map sectors are marked allocated in the bit maps, and so are sectors + off the end of the partition. + + Band 0 is sectors 0-3fff, its map is in sectors 18-1b. + Band 1 is 4000-7fff, its map is in 7ffc-7fff. + Band 2 is 8000-ffff, its map is in 8000-8003. + The remaining bands have maps in their first (even) or last (odd) 4 sectors + -- if the last, partial, band is odd its map is in its last 4 sectors. + + The bitmap locations are given in a table pointed to by the super block. + No doubt they aren't constrained to be at 18, 7ffc, 8000, ...; that is + just where they usually are. + + The "directory band" is a bunch of sectors preallocated for dnodes. + It has a 4-sector free space bitmap of its own. Each bit in the map + corresponds to one 4-sector dnode, bit 0 of the map corresponding to + the first 4 sectors of the directory band. The entire band is marked + allocated in the main bitmap. The super block gives the locations + of the directory band and its bitmap. ("band" doesn't mean it is + 8 meg long; it isn't.) */ + + +/* dnode: directory. 4 sectors long */ + +/* A directory is a tree of dnodes. The fnode for a directory + contains one pointer, to the root dnode of the tree. The fnode + never moves, the dnodes do the B-tree thing, splitting and merging + as files are added and removed. */ + +#define DNODE_MAGIC 0x77e40aae + +struct dnode { + u32 magic; /* 77e4 0aae */ + u32 first_free; /* offset from start of dnode to + first free dir entry */ +#ifdef __LITTLE_ENDIAN + u8 root_dnode: 1; /* Is it root dnode? */ + u8 increment_me: 7; /* some kind of activity counter? */ + /* Neither HPFS.IFS nor CHKDSK cares + if you change this word */ +#else + u8 increment_me: 7; /* some kind of activity counter? */ + /* Neither HPFS.IFS nor CHKDSK cares + if you change this word */ + u8 root_dnode: 1; /* Is it root dnode? */ +#endif + u8 increment_me2[3]; + secno up; /* (root dnode) directory's fnode + (nonroot) parent dnode */ + dnode_secno self; /* pointer to this dnode */ + u8 dirent[2028]; /* one or more dirents */ +}; + +struct hpfs_dirent { + u16 length; /* offset to next dirent */ + +#ifdef __LITTLE_ENDIAN + u8 first: 1; /* set on phony ^A^A (".") entry */ + u8 has_acl: 1; + u8 down: 1; /* down pointer present (after name) */ + u8 last: 1; /* set on phony \377 entry */ + u8 has_ea: 1; /* entry has EA */ + u8 has_xtd_perm: 1; /* has extended perm list (???) */ + u8 has_explicit_acl: 1; + u8 has_needea: 1; /* ?? some EA has NEEDEA set + I have no idea why this is + interesting in a dir entry */ +#else + u8 has_needea: 1; /* ?? 
some EA has NEEDEA set + I have no idea why this is + interesting in a dir entry */ + u8 has_explicit_acl: 1; + u8 has_xtd_perm: 1; /* has extended perm list (???) */ + u8 has_ea: 1; /* entry has EA */ + u8 last: 1; /* set on phony \377 entry */ + u8 down: 1; /* down pointer present (after name) */ + u8 has_acl: 1; + u8 first: 1; /* set on phony ^A^A (".") entry */ +#endif + +#ifdef __LITTLE_ENDIAN + u8 read_only: 1; /* dos attrib */ + u8 hidden: 1; /* dos attrib */ + u8 system: 1; /* dos attrib */ + u8 flag11: 1; /* would be volume label dos attrib */ + u8 directory: 1; /* dos attrib */ + u8 archive: 1; /* dos attrib */ + u8 not_8x3: 1; /* name is not 8.3 */ + u8 flag15: 1; +#else + u8 flag15: 1; + u8 not_8x3: 1; /* name is not 8.3 */ + u8 archive: 1; /* dos attrib */ + u8 directory: 1; /* dos attrib */ + u8 flag11: 1; /* would be volume label dos attrib */ + u8 system: 1; /* dos attrib */ + u8 hidden: 1; /* dos attrib */ + u8 read_only: 1; /* dos attrib */ +#endif + + fnode_secno fnode; /* fnode giving allocation info */ + time32_t write_date; /* mtime */ + u32 file_size; /* file length, bytes */ + time32_t read_date; /* atime */ + time32_t creation_date; /* ctime */ + u32 ea_size; /* total EA length, bytes */ + u8 no_of_acls; /* number of ACL's (low 3 bits) */ + u8 ix; /* code page index (of filename), see + struct code_page_data */ + u8 namelen, name[1]; /* file name */ + /* dnode_secno down; btree down pointer, if present, + follows name on next word boundary, or maybe it + precedes next dirent, which is on a word boundary. */ +}; + + +/* B+ tree: allocation info in fnodes and anodes */ + +/* dnodes point to fnodes which are responsible for listing the sectors + assigned to the file. This is done with trees of (length,address) + pairs. (Actually triples, of (length, file-address, disk-address) + which can represent holes. Find out if HPFS does that.) + At any rate, fnodes contain a small tree; if subtrees are needed + they occupy essentially a full block in anodes. A leaf-level tree node + has 3-word entries giving sector runs, a non-leaf node has 2-word + entries giving subtree pointers. A flag in the header says which. */ + +struct bplus_leaf_node +{ + u32 file_secno; /* first file sector in extent */ + u32 length; /* length, sectors */ + secno disk_secno; /* first corresponding disk sector */ +}; + +struct bplus_internal_node +{ + u32 file_secno; /* subtree maps sectors < this */ + anode_secno down; /* pointer to subtree */ +}; + +struct bplus_header +{ +#ifdef __LITTLE_ENDIAN + u8 hbff: 1; /* high bit of first free entry offset */ + u8 flag1234: 4; + u8 fnode_parent: 1; /* ? we're pointed to by an fnode, + the data btree or some ea or the + main ea bootage pointer ea_secno */ + /* also can get set in fnodes, which + may be a chkdsk glitch or may mean + this bit is irrelevant in fnodes, + or this interpretation is all wet */ + u8 binary_search: 1; /* suggest binary search (unused) */ + u8 internal: 1; /* 1 -> (internal) tree of anodes + 0 -> (leaf) list of extents */ +#else + u8 internal: 1; /* 1 -> (internal) tree of anodes + 0 -> (leaf) list of extents */ + u8 binary_search: 1; /* suggest binary search (unused) */ + u8 fnode_parent: 1; /* ? 
we're pointed to by an fnode, + the data btree or some ea or the + main ea bootage pointer ea_secno */ + /* also can get set in fnodes, which + may be a chkdsk glitch or may mean + this bit is irrelevant in fnodes, + or this interpretation is all wet */ + u8 flag1234: 4; + u8 hbff: 1; /* high bit of first free entry offset */ +#endif + u8 fill[3]; + u8 n_free_nodes; /* free nodes in following array */ + u8 n_used_nodes; /* used nodes in following array */ + u16 first_free; /* offset from start of header to + first free node in array */ + union { + struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving + subtree pointers */ + struct bplus_leaf_node external[0]; /* (external) 3-word entries giving + sector runs */ + } u; +}; + +/* fnode: root of allocation b+ tree, and EA's */ + +/* Every file and every directory has one fnode, pointed to by the directory + entry and pointing to the file's sectors or directory's root dnode. EA's + are also stored here, and there are said to be ACL's somewhere here too. */ + +#define FNODE_MAGIC 0xf7e40aae + +struct fnode +{ + u32 magic; /* f7e4 0aae */ + u32 zero1[2]; /* read history */ + u8 len, name[15]; /* true length, truncated name */ + fnode_secno up; /* pointer to file's directory fnode */ + secno acl_size_l; + secno acl_secno; + u16 acl_size_s; + u8 acl_anode; + u8 zero2; /* history bit count */ + u32 ea_size_l; /* length of disk-resident ea's */ + secno ea_secno; /* first sector of disk-resident ea's*/ + u16 ea_size_s; /* length of fnode-resident ea's */ + +#ifdef __LITTLE_ENDIAN + u8 flag0: 1; + u8 ea_anode: 1; /* 1 -> ea_secno is an anode */ + u8 flag234567: 6; +#else + u8 flag234567: 6; + u8 ea_anode: 1; /* 1 -> ea_secno is an anode */ + u8 flag0: 1; +#endif + +#ifdef __LITTLE_ENDIAN + u8 dirflag: 1; /* 1 -> directory. first & only extent + points to dnode. */ + u8 flag9012345: 7; +#else + u8 flag9012345: 7; + u8 dirflag: 1; /* 1 -> directory. first & only extent + points to dnode. */ +#endif + + struct bplus_header btree; /* b+ tree, 8 extents or 12 subtrees */ + union { + struct bplus_leaf_node external[8]; + struct bplus_internal_node internal[12]; + } u; + + u32 file_size; /* file length, bytes */ + u32 n_needea; /* number of EA's with NEEDEA set */ + u8 user_id[16]; /* unused */ + u16 ea_offs; /* offset from start of fnode + to first fnode-resident ea */ + u8 dasd_limit_treshhold; + u8 dasd_limit_delta; + u32 dasd_limit; + u32 dasd_usage; + u8 ea[316]; /* zero or more EA's, packed together + with no alignment padding. + (Do not use this name, get here + via fnode + ea_offs. I think.) */ +}; + + +/* anode: 99.44% pure allocation tree */ + +#define ANODE_MAGIC 0x37e40aae + +struct anode +{ + u32 magic; /* 37e4 0aae */ + anode_secno self; /* pointer to this anode */ + secno up; /* parent anode or fnode */ + + struct bplus_header btree; /* b+tree, 40 extents or 60 subtrees */ + union { + struct bplus_leaf_node external[40]; + struct bplus_internal_node internal[60]; + } u; + + u32 fill[3]; /* unused */ +}; + + +/* extended attributes. + + A file's EA info is stored as a list of (name,value) pairs. It is + usually in the fnode, but (if it's large) it is moved to a single + sector run outside the fnode, or to multiple runs with an anode tree + that points to them. + + The value of a single EA is stored along with the name, or (if large) + it is moved to a single sector run, or multiple runs pointed to by an + anode tree, pointed to by the value field of the (name,value) pair. 
+ + Flags in the EA tell whether the value is immediate, in a single sector + run, or in multiple runs. Flags in the fnode tell whether the EA list + is immediate, in a single run, or in multiple runs. */ + +struct extended_attribute +{ +#ifdef __LITTLE_ENDIAN + u8 indirect: 1; /* 1 -> value gives sector number + where real value starts */ + u8 anode: 1; /* 1 -> sector is an anode + that points to fragmented value */ + u8 flag23456: 5; + u8 needea: 1; /* required ea */ +#else + u8 needea: 1; /* required ea */ + u8 flag23456: 5; + u8 anode: 1; /* 1 -> sector is an anode + that points to fragmented value */ + u8 indirect: 1; /* 1 -> value gives sector number + where real value starts */ +#endif + u8 namelen; /* length of name, bytes */ + u8 valuelen_lo; /* length of value, bytes */ + u8 valuelen_hi; /* length of value, bytes */ + u8 name[0]; + /* + u8 name[namelen]; ascii attrib name + u8 nul; terminating '\0', not counted + u8 value[valuelen]; value, arbitrary + if this.indirect, valuelen is 8 and the value is + u32 length; real length of value, bytes + secno secno; sector address where it starts + if this.anode, the above sector number is the root of an anode tree + which points to the value. + */ +}; + +/* + Local Variables: + comment-column: 40 + End: +*/ diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h new file mode 100644 index 00000000..dd552f86 --- /dev/null +++ b/fs/hpfs/hpfs_fn.h @@ -0,0 +1,360 @@ +/* + * linux/fs/hpfs/hpfs_fn.h + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * function headers + */ + +//#define DBG +//#define DEBUG_LOCKS + +#include +#include +#include +#include +#include + +#include "hpfs.h" + +#define EIOERROR EIO +#define EFSERROR EPERM +#define EMEMERROR ENOMEM + +#define ANODE_ALLOC_FWD 512 +#define FNODE_ALLOC_FWD 0 +#define ALLOC_FWD_MIN 16 +#define ALLOC_FWD_MAX 128 +#define ALLOC_M 1 +#define FNODE_RD_AHEAD 16 +#define ANODE_RD_AHEAD 16 +#define DNODE_RD_AHEAD 4 + +#define FREE_DNODES_ADD 58 +#define FREE_DNODES_DEL 29 + +#define CHKCOND(x,y) if (!(x)) printk y + +#ifdef DBG +#define PRINTK(x) printk x +#else +#undef PRINTK +#define PRINTK(x) +#endif + +struct hpfs_inode_info { + loff_t mmu_private; + ino_t i_parent_dir; /* (directories) gives fnode of parent dir */ + unsigned i_dno; /* (directories) root dnode */ + unsigned i_dpos; /* (directories) temp for readdir */ + unsigned i_dsubdno; /* (directories) temp for readdir */ + unsigned i_file_sec; /* (files) minimalist cache of alloc info */ + unsigned i_disk_sec; /* (files) minimalist cache of alloc info */ + unsigned i_n_secs; /* (files) minimalist cache of alloc info */ + unsigned i_ea_size; /* size of extended attributes */ + unsigned i_ea_mode : 1; /* file's permission is stored in ea */ + unsigned i_ea_uid : 1; /* file's uid is stored in ea */ + unsigned i_ea_gid : 1; /* file's gid is stored in ea */ + unsigned i_dirty : 1; + loff_t **i_rddir_off; + struct inode vfs_inode; +}; + +struct hpfs_sb_info { + struct mutex hpfs_mutex; /* global hpfs lock */ + ino_t sb_root; /* inode number of root dir */ + unsigned sb_fs_size; /* file system size, sectors */ + unsigned sb_bitmaps; /* sector number of bitmap list */ + unsigned sb_dirband_start; /* directory band start sector */ + unsigned sb_dirband_size; /* directory band size, dnodes */ + unsigned sb_dmap; /* sector number of dnode bit map */ + unsigned sb_n_free; /* free blocks for statfs, or -1 */ + unsigned sb_n_free_dnodes; /* free dnodes for statfs, or -1 */ + uid_t sb_uid; /* uid from mount options */ + gid_t sb_gid; 
/* gid from mount options */ + umode_t sb_mode; /* mode from mount options */ + unsigned sb_eas : 2; /* eas: 0-ignore, 1-ro, 2-rw */ + unsigned sb_err : 2; /* on errs: 0-cont, 1-ro, 2-panic */ + unsigned sb_chk : 2; /* checks: 0-no, 1-normal, 2-strict */ + unsigned sb_lowercase : 1; /* downcase filenames hackery */ + unsigned sb_was_error : 1; /* there was an error, set dirty flag */ + unsigned sb_chkdsk : 2; /* chkdsk: 0-no, 1-on errs, 2-always */ + unsigned char *sb_cp_table; /* code page tables: */ + /* 128 bytes uppercasing table & */ + /* 128 bytes lowercasing table */ + unsigned *sb_bmp_dir; /* main bitmap directory */ + unsigned sb_c_bitmap; /* current bitmap */ + unsigned sb_max_fwd_alloc; /* max forward allocation */ + int sb_timeshift; +}; + +/* Four 512-byte buffers and the 2k block obtained by concatenating them */ + +struct quad_buffer_head { + struct buffer_head *bh[4]; + void *data; +}; + +/* The b-tree down pointer from a dir entry */ + +static inline dnode_secno de_down_pointer (struct hpfs_dirent *de) +{ + CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n")); + return le32_to_cpu(*(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4)); +} + +/* The first dir entry in a dnode */ + +static inline struct hpfs_dirent *dnode_first_de (struct dnode *dnode) +{ + return (void *) dnode->dirent; +} + +/* The end+1 of the dir entries */ + +static inline struct hpfs_dirent *dnode_end_de (struct dnode *dnode) +{ + CHKCOND(le32_to_cpu(dnode->first_free)>=0x14 && le32_to_cpu(dnode->first_free)<=0xa00,("HPFS: dnode_end_de: dnode->first_free = %x\n",(unsigned)le32_to_cpu(dnode->first_free))); + return (void *) dnode + le32_to_cpu(dnode->first_free); +} + +/* The dir entry after dir entry de */ + +static inline struct hpfs_dirent *de_next_de (struct hpfs_dirent *de) +{ + CHKCOND(le16_to_cpu(de->length)>=0x20 && le16_to_cpu(de->length)<0x800,("HPFS: de_next_de: de->length = %x\n",(unsigned)le16_to_cpu(de->length))); + return (void *) de + le16_to_cpu(de->length); +} + +static inline struct extended_attribute *fnode_ea(struct fnode *fnode) +{ + return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s)); +} + +static inline struct extended_attribute *fnode_end_ea(struct fnode *fnode) +{ + return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s)); +} + +static unsigned ea_valuelen(struct extended_attribute *ea) +{ + return ea->valuelen_lo + 256 * ea->valuelen_hi; +} + +static inline struct extended_attribute *next_ea(struct extended_attribute *ea) +{ + return (struct extended_attribute *)((char *)ea + 5 + ea->namelen + ea_valuelen(ea)); +} + +static inline secno ea_sec(struct extended_attribute *ea) +{ + return le32_to_cpu(get_unaligned((secno *)((char *)ea + 9 + ea->namelen))); +} + +static inline secno ea_len(struct extended_attribute *ea) +{ + return le32_to_cpu(get_unaligned((secno *)((char *)ea + 5 + ea->namelen))); +} + +static inline char *ea_data(struct extended_attribute *ea) +{ + return (char *)((char *)ea + 5 + ea->namelen); +} + +static inline unsigned de_size(int namelen, secno down_ptr) +{ + return ((0x1f + namelen + 3) & ~3) + (down_ptr ? 
4 : 0); +} + +static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src) +{ + int a; + int n; + if (!dst || !src) return; + a = dst->down; + n = dst->not_8x3; + memcpy((char *)dst + 2, (char *)src + 2, 28); + dst->down = a; + dst->not_8x3 = n; +} + +static inline unsigned tstbits(u32 *bmp, unsigned b, unsigned n) +{ + int i; + if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n; + if (!((le32_to_cpu(bmp[(b & 0x3fff) >> 5]) >> (b & 0x1f)) & 1)) return 1; + for (i = 1; i < n; i++) + if (!((le32_to_cpu(bmp[((b+i) & 0x3fff) >> 5]) >> ((b+i) & 0x1f)) & 1)) + return i + 1; + return 0; +} + +/* alloc.c */ + +int hpfs_chk_sectors(struct super_block *, secno, int, char *); +secno hpfs_alloc_sector(struct super_block *, secno, unsigned, int); +int hpfs_alloc_if_possible(struct super_block *, secno); +void hpfs_free_sectors(struct super_block *, secno, unsigned); +int hpfs_check_free_dnodes(struct super_block *, int); +void hpfs_free_dnode(struct super_block *, secno); +struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *); +struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **); +struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **); + +/* anode.c */ + +secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_header *, unsigned, struct buffer_head *); +secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned); +void hpfs_remove_btree(struct super_block *, struct bplus_header *); +int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *); +int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *); +void hpfs_ea_remove(struct super_block *, secno, int, unsigned); +void hpfs_truncate_btree(struct super_block *, secno, int, unsigned); +void hpfs_remove_fnode(struct super_block *, fnode_secno fno); + +/* buffer.c */ + +void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); +void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); +void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int); +void *hpfs_get_4sectors(struct super_block *, unsigned, struct quad_buffer_head *); +void hpfs_brelse4(struct quad_buffer_head *); +void hpfs_mark_4buffers_dirty(struct quad_buffer_head *); + +/* dentry.c */ + +extern const struct dentry_operations hpfs_dentry_operations; + +/* dir.c */ + +struct dentry *hpfs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern const struct file_operations hpfs_dir_ops; + +/* dnode.c */ + +void hpfs_add_pos(struct inode *, loff_t *); +void hpfs_del_pos(struct inode *, loff_t *); +struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, + const unsigned char *, unsigned, secno); +int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned, + struct hpfs_dirent *); +int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); +void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); +dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); +struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *); +struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, + const unsigned char *, unsigned, dnode_secno *, + struct quad_buffer_head *); +void hpfs_remove_dtree(struct super_block *, dnode_secno); +struct hpfs_dirent 
*map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *); + +/* ea.c */ + +void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned); +int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int); +char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *); +void hpfs_set_ea(struct inode *, struct fnode *, const char *, + const char *, int); + +/* file.c */ + +int hpfs_file_fsync(struct file *, int); +extern const struct file_operations hpfs_file_ops; +extern const struct inode_operations hpfs_file_iops; +extern const struct address_space_operations hpfs_aops; + +/* inode.c */ + +void hpfs_init_inode(struct inode *); +void hpfs_read_inode(struct inode *); +void hpfs_write_inode(struct inode *); +void hpfs_write_inode_nolock(struct inode *); +int hpfs_setattr(struct dentry *, struct iattr *); +void hpfs_write_if_changed(struct inode *); +void hpfs_evict_inode(struct inode *); + +/* map.c */ + +unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); +unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); +unsigned char *hpfs_load_code_page(struct super_block *, secno); +secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); +struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); +struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); +struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *); +dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino); + +/* name.c */ + +unsigned char hpfs_upcase(unsigned char *, unsigned char); +int hpfs_chk_name(const unsigned char *, unsigned *); +unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); +int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned, + const unsigned char *, unsigned, int); +int hpfs_is_name_long(const unsigned char *, unsigned); +void hpfs_adjust_length(const unsigned char *, unsigned *); + +/* namei.c */ + +extern const struct inode_operations hpfs_dir_iops; +extern const struct address_space_operations hpfs_symlink_aops; + +static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) +{ + return list_entry(inode, struct hpfs_inode_info, vfs_inode); +} + +static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* super.c */ + +void hpfs_error(struct super_block *, const char *, ...) + __attribute__((format (printf, 2, 3))); +int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); +unsigned hpfs_count_one_bitmap(struct super_block *, secno); + +/* + * local time (HPFS) to GMT (Unix) + */ + +static inline time_t local_to_gmt(struct super_block *s, time32_t t) +{ + extern struct timezone sys_tz; + return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift; +} + +static inline time32_t gmt_to_local(struct super_block *s, time_t t) +{ + extern struct timezone sys_tz; + return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; +} + +/* + * Locking: + * + * hpfs_lock() locks the whole filesystem. It must be taken + * on any method called by the VFS. + * + * We don't do any per-file locking anymore, it is hard to + * review and HPFS is not performance-sensitive anyway. 
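+ *
+ * A typical entry point therefore brackets all of its work with the
+ * per-superblock mutex, roughly like this (illustrative sketch only,
+ * not a real method of this driver):
+ *
+ *	static int hpfs_some_operation(struct inode *dir)
+ *	{
+ *		int err;
+ *		hpfs_lock(dir->i_sb);
+ *		err = do_something(dir);
+ *		hpfs_unlock(dir->i_sb);
+ *		return err;
+ *	}
+ *
+ * Helpers that must only run with the lock held can call
+ * hpfs_lock_assert() to check that assumption at run time.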
+ */ +static inline void hpfs_lock(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + mutex_lock(&sbi->hpfs_mutex); +} + +static inline void hpfs_unlock(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + mutex_unlock(&sbi->hpfs_mutex); +} + +static inline void hpfs_lock_assert(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + WARN_ON(!mutex_is_locked(&sbi->hpfs_mutex)); +} diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c new file mode 100644 index 00000000..338cd836 --- /dev/null +++ b/fs/hpfs/inode.c @@ -0,0 +1,308 @@ +/* + * linux/fs/hpfs/inode.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * inode VFS functions + */ + +#include +#include "hpfs_fn.h" + +void hpfs_init_inode(struct inode *i) +{ + struct super_block *sb = i->i_sb; + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + + i->i_uid = hpfs_sb(sb)->sb_uid; + i->i_gid = hpfs_sb(sb)->sb_gid; + i->i_mode = hpfs_sb(sb)->sb_mode; + i->i_size = -1; + i->i_blocks = -1; + + hpfs_inode->i_dno = 0; + hpfs_inode->i_n_secs = 0; + hpfs_inode->i_file_sec = 0; + hpfs_inode->i_disk_sec = 0; + hpfs_inode->i_dpos = 0; + hpfs_inode->i_dsubdno = 0; + hpfs_inode->i_ea_mode = 0; + hpfs_inode->i_ea_uid = 0; + hpfs_inode->i_ea_gid = 0; + hpfs_inode->i_ea_size = 0; + + hpfs_inode->i_rddir_off = NULL; + hpfs_inode->i_dirty = 0; + + i->i_ctime.tv_sec = i->i_ctime.tv_nsec = 0; + i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0; + i->i_atime.tv_sec = i->i_atime.tv_nsec = 0; +} + +void hpfs_read_inode(struct inode *i) +{ + struct buffer_head *bh; + struct fnode *fnode; + struct super_block *sb = i->i_sb; + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + void *ea; + int ea_size; + + if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) { + /*i->i_mode |= S_IFREG; + i->i_mode &= ~0111; + i->i_op = &hpfs_file_iops; + i->i_fop = &hpfs_file_ops; + i->i_nlink = 0;*/ + make_bad_inode(i); + return; + } + if (hpfs_sb(i->i_sb)->sb_eas) { + if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) { + if (ea_size == 2) { + i->i_uid = le16_to_cpu(*(__le16*)ea); + hpfs_inode->i_ea_uid = 1; + } + kfree(ea); + } + if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) { + if (ea_size == 2) { + i->i_gid = le16_to_cpu(*(__le16*)ea); + hpfs_inode->i_ea_gid = 1; + } + kfree(ea); + } + if ((ea = hpfs_get_ea(i->i_sb, fnode, "SYMLINK", &ea_size))) { + kfree(ea); + i->i_mode = S_IFLNK | 0777; + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &hpfs_symlink_aops; + i->i_nlink = 1; + i->i_size = ea_size; + i->i_blocks = 1; + brelse(bh); + return; + } + if ((ea = hpfs_get_ea(i->i_sb, fnode, "MODE", &ea_size))) { + int rdev = 0; + umode_t mode = hpfs_sb(sb)->sb_mode; + if (ea_size == 2) { + mode = le16_to_cpu(*(__le16*)ea); + hpfs_inode->i_ea_mode = 1; + } + kfree(ea); + i->i_mode = mode; + if (S_ISBLK(mode) || S_ISCHR(mode)) { + if ((ea = hpfs_get_ea(i->i_sb, fnode, "DEV", &ea_size))) { + if (ea_size == 4) + rdev = le32_to_cpu(*(__le32*)ea); + kfree(ea); + } + } + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { + brelse(bh); + i->i_nlink = 1; + i->i_size = 0; + i->i_blocks = 1; + init_special_inode(i, mode, + new_decode_dev(rdev)); + return; + } + } + } + if (fnode->dirflag) { + int n_dnodes, n_subdirs; + i->i_mode |= S_IFDIR; + i->i_op = &hpfs_dir_iops; + i->i_fop = &hpfs_dir_ops; + hpfs_inode->i_parent_dir = le32_to_cpu(fnode->up); + hpfs_inode->i_dno = le32_to_cpu(fnode->u.external[0].disk_secno); + if (hpfs_sb(sb)->sb_chk >= 2) { + struct buffer_head *bh0; + if 
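+ /* check=strict: map the parent directory's fnode purely to validate
+  * it (hpfs_map_fnode() reports any corruption itself) and release
+  * the buffer straight away.  The directory's size is then
+  * synthesized from its dnode count below: every dnode is four
+  * 512-byte sectors, hence i_blocks = 4 * n_dnodes and
+  * i_size = 2048 * n_dnodes. */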
(hpfs_map_fnode(sb, hpfs_inode->i_parent_dir, &bh0)) brelse(bh0); + } + n_dnodes = 0; n_subdirs = 0; + hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL); + i->i_blocks = 4 * n_dnodes; + i->i_size = 2048 * n_dnodes; + i->i_nlink = 2 + n_subdirs; + } else { + i->i_mode |= S_IFREG; + if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111; + i->i_op = &hpfs_file_iops; + i->i_fop = &hpfs_file_ops; + i->i_nlink = 1; + i->i_size = le32_to_cpu(fnode->file_size); + i->i_blocks = ((i->i_size + 511) >> 9) + 1; + i->i_data.a_ops = &hpfs_aops; + hpfs_i(i)->mmu_private = i->i_size; + } + brelse(bh); +} + +static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + /*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) { + Some unknown structures like ACL may be in fnode, + we'd better not overwrite them + hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); + } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) { + __le32 ea; + if ((i->i_uid != hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { + ea = cpu_to_le32(i->i_uid); + hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2); + hpfs_inode->i_ea_uid = 1; + } + if ((i->i_gid != hpfs_sb(i->i_sb)->sb_gid) || hpfs_inode->i_ea_gid) { + ea = cpu_to_le32(i->i_gid); + hpfs_set_ea(i, fnode, "GID", (char *)&ea, 2); + hpfs_inode->i_ea_gid = 1; + } + if (!S_ISLNK(i->i_mode)) + if ((i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0 : 0111)) + | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG)) + && i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333)) + | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))) || hpfs_inode->i_ea_mode) { + ea = cpu_to_le32(i->i_mode); + /* sick, but legal */ + hpfs_set_ea(i, fnode, "MODE", (char *)&ea, 2); + hpfs_inode->i_ea_mode = 1; + } + if (S_ISBLK(i->i_mode) || S_ISCHR(i->i_mode)) { + ea = cpu_to_le32(new_encode_dev(i->i_rdev)); + hpfs_set_ea(i, fnode, "DEV", (char *)&ea, 4); + } + } +} + +void hpfs_write_inode(struct inode *i) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct inode *parent; + if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; + if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (*hpfs_inode->i_rddir_off) printk("HPFS: write_inode: some position still there\n"); + kfree(hpfs_inode->i_rddir_off); + hpfs_inode->i_rddir_off = NULL; + } + if (!i->i_nlink) { + return; + } + parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir); + if (parent) { + hpfs_inode->i_dirty = 0; + if (parent->i_state & I_NEW) { + hpfs_init_inode(parent); + hpfs_read_inode(parent); + unlock_new_inode(parent); + } + hpfs_write_inode_nolock(i); + iput(parent); + } +} + +void hpfs_write_inode_nolock(struct inode *i) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(i); + struct buffer_head *bh; + struct fnode *fnode; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; + if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) return; + if (i->i_ino != hpfs_sb(i->i_sb)->sb_root && i->i_nlink) { + if (!(de = map_fnode_dirent(i->i_sb, i->i_ino, fnode, &qbh))) { + brelse(bh); + return; + } + } else de = NULL; + if (S_ISREG(i->i_mode)) { + fnode->file_size = cpu_to_le32(i->i_size); + if (de) de->file_size = cpu_to_le32(i->i_size); + } else if (S_ISDIR(i->i_mode)) { + fnode->file_size = cpu_to_le32(0); + if (de) de->file_size = cpu_to_le32(0); + } + hpfs_write_inode_ea(i, fnode); + if (de) { + de->write_date = 
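+ /* Dates in the on-disk dirent are kept in local time; gmt_to_local()
+  * undoes the timezone offset and the timeshift= adjustment that
+  * local_to_gmt() applied when the inode was read in. */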
cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->read_only = !(i->i_mode & 0222); + de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + } + if (S_ISDIR(i->i_mode)) { + if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) { + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->read_only = !(i->i_mode & 0222); + de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0); + de->file_size = cpu_to_le32(0); + hpfs_mark_4buffers_dirty(&qbh); + hpfs_brelse4(&qbh); + } else + hpfs_error(i->i_sb, + "directory %08lx doesn't have '.' entry", + (unsigned long)i->i_ino); + } + mark_buffer_dirty(bh); + brelse(bh); +} + +int hpfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error = -EINVAL; + + hpfs_lock(inode->i_sb); + if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) + goto out_unlock; + if ((attr->ia_valid & ATTR_UID) && attr->ia_uid >= 0x10000) + goto out_unlock; + if ((attr->ia_valid & ATTR_GID) && attr->ia_gid >= 0x10000) + goto out_unlock; + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) + goto out_unlock; + + error = inode_change_ok(inode, attr); + if (error) + goto out_unlock; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + goto out_unlock; + } + + setattr_copy(inode, attr); + + hpfs_write_inode(inode); + + out_unlock: + hpfs_unlock(inode->i_sb); + return error; +} + +void hpfs_write_if_changed(struct inode *inode) +{ + struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); + + if (hpfs_inode->i_dirty) + hpfs_write_inode(inode); +} + +void hpfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (!inode->i_nlink) { + hpfs_lock(inode->i_sb); + hpfs_remove_fnode(inode->i_sb, inode->i_ino); + hpfs_unlock(inode->i_sb); + } +} diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c new file mode 100644 index 00000000..a7908213 --- /dev/null +++ b/fs/hpfs/map.c @@ -0,0 +1,287 @@ +/* + * linux/fs/hpfs/map.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * mapping structures to memory with some minimal checks + */ + +#include "hpfs_fn.h" + +unsigned *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh) +{ + return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0); +} + +unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, + struct quad_buffer_head *qbh, char *id) +{ + secno sec; + if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) { + hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); + return NULL; + } + sec = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]); + if (!sec || sec > hpfs_sb(s)->sb_fs_size-4) { + hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); + return NULL; + } + return hpfs_map_4sectors(s, sec, qbh, 4); +} + +/* + * Load first code page into kernel memory, return pointer to 256-byte array, + * first 128 bytes are uppercasing table for chars 128-255, next 128 bytes are + * lowercasing table + */ + +unsigned char 
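+ /* For bytes >= 128, hpfs_upcase() indexes the first half of this
+  * table (cp_table[c - 128]) and the lowercasing helper the second
+  * half (cp_table[c]); only the uppercasing half is stored on disk,
+  * the lowercasing half is reconstructed below by inverting it. */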
*hpfs_load_code_page(struct super_block *s, secno cps) +{ + struct buffer_head *bh; + secno cpds; + unsigned cpi; + unsigned char *ptr; + unsigned char *cp_table; + int i; + struct code_page_data *cpd; + struct code_page_directory *cp = hpfs_map_sector(s, cps, &bh, 0); + if (!cp) return NULL; + if (le32_to_cpu(cp->magic) != CP_DIR_MAGIC) { + printk("HPFS: Code page directory magic doesn't match (magic = %08x)\n", le32_to_cpu(cp->magic)); + brelse(bh); + return NULL; + } + if (!le32_to_cpu(cp->n_code_pages)) { + printk("HPFS: n_code_pages == 0\n"); + brelse(bh); + return NULL; + } + cpds = le32_to_cpu(cp->array[0].code_page_data); + cpi = le16_to_cpu(cp->array[0].index); + brelse(bh); + + if (cpi >= 3) { + printk("HPFS: Code page index out of array\n"); + return NULL; + } + + if (!(cpd = hpfs_map_sector(s, cpds, &bh, 0))) return NULL; + if (le16_to_cpu(cpd->offs[cpi]) > 0x178) { + printk("HPFS: Code page index out of sector\n"); + brelse(bh); + return NULL; + } + ptr = (unsigned char *)cpd + le16_to_cpu(cpd->offs[cpi]) + 6; + if (!(cp_table = kmalloc(256, GFP_KERNEL))) { + printk("HPFS: out of memory for code page table\n"); + brelse(bh); + return NULL; + } + memcpy(cp_table, ptr, 128); + brelse(bh); + + /* Try to build lowercasing table from uppercasing one */ + + for (i=128; i<256; i++) cp_table[i]=i; + for (i=128; i<256; i++) if (cp_table[i-128]!=i && cp_table[i-128]>=128) + cp_table[cp_table[i-128]] = i; + + return cp_table; +} + +secno *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) +{ + struct buffer_head *bh; + int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21; + int i; + secno *b; + if (!(b = kmalloc(n * 512, GFP_KERNEL))) { + printk("HPFS: can't allocate memory for bitmap directory\n"); + return NULL; + } + for (i=0;isb_chk) if (hpfs_chk_sectors(s, ino, 1, "fnode")) { + return NULL; + } + if ((fnode = hpfs_map_sector(s, ino, bhp, FNODE_RD_AHEAD))) { + if (hpfs_sb(s)->sb_chk) { + struct extended_attribute *ea; + struct extended_attribute *ea_end; + if (le32_to_cpu(fnode->magic) != FNODE_MAGIC) { + hpfs_error(s, "bad magic on fnode %08lx", + (unsigned long)ino); + goto bail; + } + if (!fnode->dirflag) { + if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes != + (fnode->btree.internal ? 12 : 8)) { + hpfs_error(s, + "bad number of nodes in fnode %08lx", + (unsigned long)ino); + goto bail; + } + if (le16_to_cpu(fnode->btree.first_free) != + 8 + fnode->btree.n_used_nodes * (fnode->btree.internal ? 
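+ /* The b-tree root embedded in an fnode holds, after its 8-byte
+  * header, either 12 internal entries of 8 bytes or 8 leaf extents of
+  * 12 bytes (96 bytes either way), so used + free entries must add up
+  * to 12 or 8 and first_free must point just past the used ones. */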
8 : 12)) { + hpfs_error(s, + "bad first_free pointer in fnode %08lx", + (unsigned long)ino); + goto bail; + } + } + if (le16_to_cpu(fnode->ea_size_s) && (le16_to_cpu(fnode->ea_offs) < 0xc4 || + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200)) { + hpfs_error(s, + "bad EA info in fnode %08lx: ea_offs == %04x ea_size_s == %04x", + (unsigned long)ino, + le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); + goto bail; + } + ea = fnode_ea(fnode); + ea_end = fnode_end_ea(fnode); + while (ea != ea_end) { + if (ea > ea_end) { + hpfs_error(s, "bad EA in fnode %08lx", + (unsigned long)ino); + goto bail; + } + ea = next_ea(ea); + } + } + } + return fnode; + bail: + brelse(*bhp); + return NULL; +} + +struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buffer_head **bhp) +{ + struct anode *anode; + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ano, 1, "anode")) return NULL; + if ((anode = hpfs_map_sector(s, ano, bhp, ANODE_RD_AHEAD))) + if (hpfs_sb(s)->sb_chk) { + if (le32_to_cpu(anode->magic) != ANODE_MAGIC) { + hpfs_error(s, "bad magic on anode %08x", ano); + goto bail; + } + if (le32_to_cpu(anode->self) != ano) { + hpfs_error(s, "self pointer invalid on anode %08x", ano); + goto bail; + } + if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes != + (anode->btree.internal ? 60 : 40)) { + hpfs_error(s, "bad number of nodes in anode %08x", ano); + goto bail; + } + if (le16_to_cpu(anode->btree.first_free) != + 8 + anode->btree.n_used_nodes * (anode->btree.internal ? 8 : 12)) { + hpfs_error(s, "bad first_free pointer in anode %08x", ano); + goto bail; + } + } + return anode; + bail: + brelse(*bhp); + return NULL; +} + +/* + * Load dnode to memory and do some checks + */ + +struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, + struct quad_buffer_head *qbh) +{ + struct dnode *dnode; + if (hpfs_sb(s)->sb_chk) { + if (hpfs_chk_sectors(s, secno, 4, "dnode")) return NULL; + if (secno & 3) { + hpfs_error(s, "dnode %08x not byte-aligned", secno); + return NULL; + } + } + if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) + if (hpfs_sb(s)->sb_chk) { + unsigned p, pp = 0; + unsigned char *d = (unsigned char *)dnode; + int b = 0; + if (le32_to_cpu(dnode->magic) != DNODE_MAGIC) { + hpfs_error(s, "bad magic on dnode %08x", secno); + goto bail; + } + if (le32_to_cpu(dnode->self) != secno) + hpfs_error(s, "bad self pointer on dnode %08x self = %08x", secno, le32_to_cpu(dnode->self)); + /* Check dirents - bad dirents would cause infinite + loops or shooting to memory */ + if (le32_to_cpu(dnode->first_free) > 2048) { + hpfs_error(s, "dnode %08x has first_free == %08x", secno, le32_to_cpu(dnode->first_free)); + goto bail; + } + for (p = 20; p < le32_to_cpu(dnode->first_free); p += d[p] + (d[p+1] << 8)) { + struct hpfs_dirent *de = (struct hpfs_dirent *)((char *)dnode + p); + if (le16_to_cpu(de->length) > 292 || (le16_to_cpu(de->length) < 32) || (le16_to_cpu(de->length) & 3) || p + le16_to_cpu(de->length) > 2048) { + hpfs_error(s, "bad dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); + goto bail; + } + if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) { + if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok; + hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); + goto bail; + } + ok: + if (hpfs_sb(s)->sb_chk >= 2) b |= 1 << de->down; + 
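+ /* b collects bit 0 for dirents without a down pointer and bit 1 for
+  * dirents with one; seeing both kinds in a single dnode (b == 3) is
+  * reported below as an unbalanced tree. */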
if (de->down) if (de_down_pointer(de) < 0x10) { + hpfs_error(s, "bad down pointer in dnode %08x, dirent %03x, last %03x", secno, p, pp); + goto bail; + } + pp = p; + + } + if (p != le32_to_cpu(dnode->first_free)) { + hpfs_error(s, "size on last dirent does not match first_free; dnode %08x", secno); + goto bail; + } + if (d[pp + 30] != 1 || d[pp + 31] != 255) { + hpfs_error(s, "dnode %08x does not end with \\377 entry", secno); + goto bail; + } + if (b == 3) printk("HPFS: warning: unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n", secno); + } + return dnode; + bail: + hpfs_brelse4(qbh); + return NULL; +} + +dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino) +{ + struct buffer_head *bh; + struct fnode *fnode; + dnode_secno dno; + + fnode = hpfs_map_fnode(s, ino, &bh); + if (!fnode) + return 0; + + dno = le32_to_cpu(fnode->u.external[0].disk_secno); + brelse(bh); + return dno; +} diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c new file mode 100644 index 00000000..9acdf338 --- /dev/null +++ b/fs/hpfs/name.c @@ -0,0 +1,112 @@ +/* + * linux/fs/hpfs/name.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * operations with filenames + */ + +#include "hpfs_fn.h" + +static inline int not_allowed_char(unsigned char c) +{ + return c<' ' || c=='"' || c=='*' || c=='/' || c==':' || c=='<' || + c=='>' || c=='?' || c=='\\' || c=='|'; +} + +static inline int no_dos_char(unsigned char c) +{ /* Characters that are allowed in HPFS but not in DOS */ + return c=='+' || c==',' || c==';' || c=='=' || c=='[' || c==']'; +} + +static inline unsigned char upcase(unsigned char *dir, unsigned char a) +{ + if (a<128 || a==255) return a>='a' && a<='z' ? a - 0x20 : a; + if (!dir) return a; + return dir[a-128]; +} + +unsigned char hpfs_upcase(unsigned char *dir, unsigned char a) +{ + return upcase(dir, a); +} + +static inline unsigned char locase(unsigned char *dir, unsigned char a) +{ + if (a<128 || a==255) return a>='A' && a<='Z' ? a + 0x20 : a; + if (!dir) return a; + return dir[a]; +} + +int hpfs_chk_name(const unsigned char *name, unsigned *len) +{ + int i; + if (*len > 254) return -ENAMETOOLONG; + hpfs_adjust_length(name, len); + if (!*len) return -EINVAL; + for (i = 0; i < *len; i++) if (not_allowed_char(name[i])) return -EINVAL; + if (*len == 1) if (name[0] == '.') return -EINVAL; + if (*len == 2) if (name[0] == '.' && name[1] == '.') return -EINVAL; + return 0; +} + +unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from, + unsigned len, int lc, int lng) +{ + unsigned char *to; + int i; + if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { + printk("HPFS: Long name flag mismatch - name "); + for (i=0; isb_cp_table,from[i]); + return to; +} + +int hpfs_compare_names(struct super_block *s, + const unsigned char *n1, unsigned l1, + const unsigned char *n2, unsigned l2, int last) +{ + unsigned l = l1 < l2 ? l1 : l2; + unsigned i; + if (last) return -1; + for (i = 0; i < l; i++) { + unsigned char c1 = upcase(hpfs_sb(s)->sb_cp_table,n1[i]); + unsigned char c2 = upcase(hpfs_sb(s)->sb_cp_table,n2[i]); + if (c1 < c2) return -1; + if (c1 > c2) return 1; + } + if (l1 < l2) return -1; + if (l1 > l2) return 1; + return 0; +} + +int hpfs_is_name_long(const unsigned char *name, unsigned len) +{ + int i,j; + for (i = 0; i < len && name[i] != '.'; i++) + if (no_dos_char(name[i])) return 1; + if (!i || i > 8) return 1; + if (i == len) return 0; + for (j = i + 1; j < len; j++) + if (name[j] == '.' 
|| no_dos_char(name[i])) return 1; + return j - i > 4; +} + +/* OS/2 clears dots and spaces at the end of file name, so we have to */ + +void hpfs_adjust_length(const unsigned char *name, unsigned *len) +{ + if (!*len) return; + if (*len == 1 && name[0] == '.') return; + if (*len == 2 && name[0] == '.' && name[1] == '.') return; + while (*len && (name[*len - 1] == '.' || name[*len - 1] == ' ')) + (*len)--; +} diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c new file mode 100644 index 00000000..acf95dab --- /dev/null +++ b/fs/hpfs/namei.c @@ -0,0 +1,628 @@ +/* + * linux/fs/hpfs/namei.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * adding & removing files & directories + */ +#include +#include "hpfs_fn.h" + +static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh0; + struct buffer_head *bh; + struct hpfs_dirent *de; + struct fnode *fnode; + struct dnode *dnode; + struct inode *result; + fnode_secno fno; + dnode_secno dno; + int r; + struct hpfs_dirent dee; + int err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + hpfs_lock(dir->i_sb); + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + dnode = hpfs_alloc_dnode(dir->i_sb, fno, &dno, &qbh0); + if (!dnode) + goto bail1; + memset(&dee, 0, sizeof dee); + dee.directory = 1; + if (!(mode & 0222)) dee.read_only = 1; + /*dee.archive = 0;*/ + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + result = new_inode(dir->i_sb); + if (!result) + goto bail2; + hpfs_init_inode(result); + result->i_ino = fno; + hpfs_i(result)->i_parent_dir = dir->i_ino; + hpfs_i(result)->i_dno = dno; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + result->i_mode |= S_IFDIR; + result->i_op = &hpfs_dir_iops; + result->i_fop = &hpfs_dir_ops; + result->i_blocks = 4; + result->i_size = 2048; + result->i_nlink = 2; + if (dee.read_only) + result->i_mode &= ~0222; + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail3; + if (r == -1) { + err = -EEXIST; + goto bail3; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 
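+ /* The fnode has room for only the first 15 bytes of the name
+  * (fnode->len still records the full length); the complete name
+  * lives in the directory entry added above. */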
15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + fnode->dirflag = 1; + fnode->btree.n_free_nodes = 7; + fnode->btree.n_used_nodes = 1; + fnode->btree.first_free = cpu_to_le16(0x14); + fnode->u.external[0].disk_secno = cpu_to_le32(dno); + fnode->u.external[0].file_secno = cpu_to_le32(-1); + dnode->root_dnode = 1; + dnode->up = cpu_to_le32(fno); + de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0); + de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + if (!(mode & 0222)) de->read_only = 1; + de->first = de->directory = 1; + /*de->hidden = de->system = 0;*/ + de->fnode = cpu_to_le32(fno); + mark_buffer_dirty(bh); + brelse(bh); + hpfs_mark_4buffers_dirty(&qbh0); + hpfs_brelse4(&qbh0); + inc_nlink(dir); + insert_inode_hash(result); + + if (result->i_uid != current_fsuid() || + result->i_gid != current_fsgid() || + result->i_mode != (mode | S_IFDIR)) { + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_mode = mode | S_IFDIR; + hpfs_write_inode_nolock(result); + } + d_instantiate(dentry, result); + hpfs_unlock(dir->i_sb); + return 0; +bail3: + iput(result); +bail2: + hpfs_brelse4(&qbh0); + hpfs_free_dnode(dir->i_sb, dno); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct inode *result = NULL; + struct buffer_head *bh; + struct fnode *fnode; + fnode_secno fno; + int r; + struct hpfs_dirent dee; + int err; + if ((err = hpfs_chk_name(name, &len))) + return err==-ENOENT ? -EINVAL : err; + hpfs_lock(dir->i_sb); + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + memset(&dee, 0, sizeof dee); + if (!(mode & 0222)) dee.read_only = 1; + dee.archive = 1; + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + + result = new_inode(dir->i_sb); + if (!result) + goto bail1; + + hpfs_init_inode(result); + result->i_ino = fno; + result->i_mode |= S_IFREG; + result->i_mode &= ~0111; + result->i_op = &hpfs_file_iops; + result->i_fop = &hpfs_file_ops; + result->i_nlink = 1; + hpfs_i(result)->i_parent_dir = dir->i_ino; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + if (dee.read_only) + result->i_mode &= ~0222; + result->i_blocks = 1; + result->i_size = 0; + result->i_data.a_ops = &hpfs_aops; + hpfs_i(result)->mmu_private = 0; + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail2; + if (r == -1) { + err = -EEXIST; + goto bail2; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 
15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + insert_inode_hash(result); + + if (result->i_uid != current_fsuid() || + result->i_gid != current_fsgid() || + result->i_mode != (mode | S_IFREG)) { + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_mode = mode | S_IFREG; + hpfs_write_inode_nolock(result); + } + d_instantiate(dentry, result); + hpfs_unlock(dir->i_sb); + return 0; + +bail2: + iput(result); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct buffer_head *bh; + struct fnode *fnode; + fnode_secno fno; + int r; + struct hpfs_dirent dee; + struct inode *result = NULL; + int err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; + if (!new_valid_dev(rdev)) + return -EINVAL; + hpfs_lock(dir->i_sb); + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + memset(&dee, 0, sizeof dee); + if (!(mode & 0222)) dee.read_only = 1; + dee.archive = 1; + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + + result = new_inode(dir->i_sb); + if (!result) + goto bail1; + + hpfs_init_inode(result); + result->i_ino = fno; + hpfs_i(result)->i_parent_dir = dir->i_ino; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_nlink = 1; + result->i_size = 0; + result->i_blocks = 1; + init_special_inode(result, mode, rdev); + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail2; + if (r == -1) { + err = -EEXIST; + goto bail2; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + mark_buffer_dirty(bh); + + insert_inode_hash(result); + + hpfs_write_inode_nolock(result); + d_instantiate(dentry, result); + brelse(bh); + hpfs_unlock(dir->i_sb); + return 0; +bail2: + iput(result); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct buffer_head *bh; + struct fnode *fnode; + fnode_secno fno; + int r; + struct hpfs_dirent dee; + struct inode *result; + int err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? 
-EINVAL : err; + hpfs_lock(dir->i_sb); + if (hpfs_sb(dir->i_sb)->sb_eas < 2) { + hpfs_unlock(dir->i_sb); + return -EPERM; + } + err = -ENOSPC; + fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); + if (!fnode) + goto bail; + memset(&dee, 0, sizeof dee); + dee.archive = 1; + dee.hidden = name[0] == '.'; + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + + result = new_inode(dir->i_sb); + if (!result) + goto bail1; + result->i_ino = fno; + hpfs_init_inode(result); + hpfs_i(result)->i_parent_dir = dir->i_ino; + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); + result->i_ctime.tv_nsec = 0; + result->i_mtime.tv_nsec = 0; + result->i_atime.tv_nsec = 0; + hpfs_i(result)->i_ea_size = 0; + result->i_mode = S_IFLNK | 0777; + result->i_uid = current_fsuid(); + result->i_gid = current_fsgid(); + result->i_blocks = 1; + result->i_nlink = 1; + result->i_size = strlen(symlink); + result->i_op = &page_symlink_inode_operations; + result->i_data.a_ops = &hpfs_symlink_aops; + + r = hpfs_add_dirent(dir, name, len, &dee); + if (r == 1) + goto bail2; + if (r == -1) { + err = -EEXIST; + goto bail2; + } + fnode->len = len; + memcpy(fnode->name, name, len > 15 ? 15 : len); + fnode->up = cpu_to_le32(dir->i_ino); + hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink)); + mark_buffer_dirty(bh); + brelse(bh); + + insert_inode_hash(result); + + hpfs_write_inode_nolock(result); + d_instantiate(dentry, result); + hpfs_unlock(dir->i_sb); + return 0; +bail2: + iput(result); +bail1: + brelse(bh); + hpfs_free_sectors(dir->i_sb, fno, 1); +bail: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_unlink(struct inode *dir, struct dentry *dentry) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + struct inode *inode = dentry->d_inode; + dnode_secno dno; + int r; + int rep = 0; + int err; + + hpfs_lock(dir->i_sb); + hpfs_adjust_length(name, &len); +again: + err = -ENOENT; + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); + if (!de) + goto out; + + err = -EPERM; + if (de->first) + goto out1; + + err = -EISDIR; + if (de->directory) + goto out1; + + r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); + switch (r) { + case 1: + hpfs_error(dir->i_sb, "there was error when removing dirent"); + err = -EFSERROR; + break; + case 2: /* no space for deleting, try to truncate file */ + + err = -ENOSPC; + if (rep++) + break; + + dentry_unhash(dentry); + if (!d_unhashed(dentry)) { + hpfs_unlock(dir->i_sb); + return -ENOSPC; + } + if (generic_permission(inode, MAY_WRITE, 0, NULL) || + !S_ISREG(inode->i_mode) || + get_write_access(inode)) { + d_rehash(dentry); + } else { + struct iattr newattrs; + /*printk("HPFS: truncating file before delete.\n");*/ + newattrs.ia_size = 0; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + err = notify_change(dentry, &newattrs); + put_write_access(inode); + if (!err) + goto again; + } + hpfs_unlock(dir->i_sb); + return -ENOSPC; + default: + drop_nlink(inode); + err = 0; + } + goto out; + +out1: + hpfs_brelse4(&qbh); +out: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + const unsigned char *name = dentry->d_name.name; + unsigned len = dentry->d_name.len; + struct quad_buffer_head qbh; + struct hpfs_dirent *de; + struct inode *inode 
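+ /* Removing a directory: hpfs_count_dnodes() below walks the victim's
+  * dnode tree counting live entries, and anything found means
+  * -ENOTEMPTY; on success the parent loses a link and the victim's
+  * link count is cleared. */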
= dentry->d_inode; + dnode_secno dno; + int n_items = 0; + int err; + int r; + + hpfs_adjust_length(name, &len); + hpfs_lock(dir->i_sb); + err = -ENOENT; + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); + if (!de) + goto out; + + err = -EPERM; + if (de->first) + goto out1; + + err = -ENOTDIR; + if (!de->directory) + goto out1; + + hpfs_count_dnodes(dir->i_sb, hpfs_i(inode)->i_dno, NULL, NULL, &n_items); + err = -ENOTEMPTY; + if (n_items) + goto out1; + + r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); + switch (r) { + case 1: + hpfs_error(dir->i_sb, "there was error when removing dirent"); + err = -EFSERROR; + break; + case 2: + err = -ENOSPC; + break; + default: + drop_nlink(dir); + clear_nlink(inode); + err = 0; + } + goto out; +out1: + hpfs_brelse4(&qbh); +out: + hpfs_unlock(dir->i_sb); + return err; +} + +static int hpfs_symlink_readpage(struct file *file, struct page *page) +{ + char *link = kmap(page); + struct inode *i = page->mapping->host; + struct fnode *fnode; + struct buffer_head *bh; + int err; + + err = -EIO; + hpfs_lock(i->i_sb); + if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) + goto fail; + err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); + brelse(bh); + if (err) + goto fail; + hpfs_unlock(i->i_sb); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + +fail: + hpfs_unlock(i->i_sb); + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +const struct address_space_operations hpfs_symlink_aops = { + .readpage = hpfs_symlink_readpage +}; + +static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + const unsigned char *old_name = old_dentry->d_name.name; + unsigned old_len = old_dentry->d_name.len; + const unsigned char *new_name = new_dentry->d_name.name; + unsigned new_len = new_dentry->d_name.len; + struct inode *i = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct quad_buffer_head qbh, qbh1; + struct hpfs_dirent *dep, *nde; + struct hpfs_dirent de; + dnode_secno dno; + int r; + struct buffer_head *bh; + struct fnode *fnode; + int err; + + if ((err = hpfs_chk_name(new_name, &new_len))) return err; + err = 0; + hpfs_adjust_length(old_name, &old_len); + + hpfs_lock(i->i_sb); + /* order doesn't matter, due to VFS exclusion */ + + /* Erm? Moving over the empty non-busy directory is perfectly legal */ + if (new_inode && S_ISDIR(new_inode->i_mode)) { + err = -EINVAL; + goto end1; + } + + if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) { + hpfs_error(i->i_sb, "lookup succeeded but map dirent failed"); + err = -ENOENT; + goto end1; + } + copy_de(&de, dep); + de.hidden = new_name[0] == '.'; + + if (new_inode) { + int r; + if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) { + if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) { + clear_nlink(new_inode); + copy_de(nde, &de); + memcpy(nde->name, new_name, new_len); + hpfs_mark_4buffers_dirty(&qbh1); + hpfs_brelse4(&qbh1); + goto end; + } + hpfs_error(new_dir->i_sb, "hpfs_rename: could not find dirent"); + err = -EFSERROR; + goto end1; + } + err = r == 2 ? -ENOSPC : r == 1 ? -EFSERROR : 0; + goto end1; + } + + if (new_dir == old_dir) hpfs_brelse4(&qbh); + + if ((r = hpfs_add_dirent(new_dir, new_name, new_len, &de))) { + if (r == -1) hpfs_error(new_dir->i_sb, "hpfs_rename: dirent already exists!"); + err = r == 1 ? 
-ENOSPC : -EFSERROR; + if (new_dir != old_dir) hpfs_brelse4(&qbh); + goto end1; + } + + if (new_dir == old_dir) + if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) { + hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2"); + err = -ENOENT; + goto end1; + } + + if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 0))) { + hpfs_error(i->i_sb, "hpfs_rename: could not remove dirent"); + err = r == 2 ? -ENOSPC : -EFSERROR; + goto end1; + } + + end: + hpfs_i(i)->i_parent_dir = new_dir->i_ino; + if (S_ISDIR(i->i_mode)) { + inc_nlink(new_dir); + drop_nlink(old_dir); + } + if ((fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) { + fnode->up = cpu_to_le32(new_dir->i_ino); + fnode->len = new_len; + memcpy(fnode->name, new_name, new_len>15?15:new_len); + if (new_len < 15) memset(&fnode->name[new_len], 0, 15 - new_len); + mark_buffer_dirty(bh); + brelse(bh); + } +end1: + hpfs_unlock(i->i_sb); + return err; +} + +const struct inode_operations hpfs_dir_iops = +{ + .create = hpfs_create, + .lookup = hpfs_lookup, + .unlink = hpfs_unlink, + .symlink = hpfs_symlink, + .mkdir = hpfs_mkdir, + .rmdir = hpfs_rmdir, + .mknod = hpfs_mknod, + .rename = hpfs_rename, + .setattr = hpfs_setattr, +}; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c new file mode 100644 index 00000000..98580a3b --- /dev/null +++ b/fs/hpfs/super.c @@ -0,0 +1,712 @@ +/* + * linux/fs/hpfs/super.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * mounting, unmounting, error handling + */ + +#include "hpfs_fn.h" +#include +#include +#include +#include +#include +#include +#include +#include + +/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ + +static void mark_dirty(struct super_block *s, int remount) +{ + if (hpfs_sb(s)->sb_chkdsk && (remount || !(s->s_flags & MS_RDONLY))) { + struct buffer_head *bh; + struct hpfs_spare_block *sb; + if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { + sb->dirty = 1; + sb->old_wrote = 0; + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +} + +/* Mark the filesystem clean (mark it dirty for chkdsk if chkdsk==2 or if there + were errors) */ + +static void unmark_dirty(struct super_block *s) +{ + struct buffer_head *bh; + struct hpfs_spare_block *sb; + if (s->s_flags & MS_RDONLY) return; + sync_blockdev(s->s_bdev); + if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { + sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error; + sb->old_wrote = hpfs_sb(s)->sb_chkdsk >= 2 && !hpfs_sb(s)->sb_was_error; + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } +} + +/* Filesystem error... */ +static char err_buf[1024]; + +void hpfs_error(struct super_block *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(err_buf, sizeof(err_buf), fmt, args); + va_end(args); + + printk("HPFS: filesystem error: %s", err_buf); + if (!hpfs_sb(s)->sb_was_error) { + if (hpfs_sb(s)->sb_err == 2) { + printk("; crashing the system because you wanted it\n"); + mark_dirty(s, 0); + panic("HPFS panic"); + } else if (hpfs_sb(s)->sb_err == 1) { + if (s->s_flags & MS_RDONLY) printk("; already mounted read-only\n"); + else { + printk("; remounting read-only\n"); + mark_dirty(s, 0); + s->s_flags |= MS_RDONLY; + } + } else if (s->s_flags & MS_RDONLY) printk("; going on - but anything won't be destroyed because it's read-only\n"); + else printk("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... 
but you wanted it so!\n"); + } else printk("\n"); + hpfs_sb(s)->sb_was_error = 1; +} + +/* + * A little trick to detect cycles in many hpfs structures and don't let the + * kernel crash on corrupted filesystem. When first called, set c2 to 0. + * + * BTW. chkdsk doesn't detect cycles correctly. When I had 2 lost directories + * nested each in other, chkdsk locked up happilly. + */ + +int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, + char *msg) +{ + if (*c2 && *c1 == key) { + hpfs_error(s, "cycle detected on key %08x in %s", key, msg); + return 1; + } + (*c2)++; + if (!((*c2 - 1) & *c2)) *c1 = key; + return 0; +} + +static void hpfs_put_super(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + + hpfs_lock(s); + unmark_dirty(s); + hpfs_unlock(s); + + kfree(sbi->sb_cp_table); + kfree(sbi->sb_bmp_dir); + s->s_fs_info = NULL; + kfree(sbi); +} + +unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) +{ + struct quad_buffer_head qbh; + unsigned long *bits; + unsigned count; + + bits = hpfs_map_4sectors(s, secno, &qbh, 4); + if (!bits) + return 0; + count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); + hpfs_brelse4(&qbh); + return count; +} + +static unsigned count_bitmaps(struct super_block *s) +{ + unsigned n, count, n_bands; + n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; + count = 0; + for (n = 0; n < n_bands; n++) + count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + return count; +} + +static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct hpfs_sb_info *sbi = hpfs_sb(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + hpfs_lock(s); + + /*if (sbi->sb_n_free == -1) {*/ + sbi->sb_n_free = count_bitmaps(s); + sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap); + /*}*/ + buf->f_type = s->s_magic; + buf->f_bsize = 512; + buf->f_blocks = sbi->sb_fs_size; + buf->f_bfree = sbi->sb_n_free; + buf->f_bavail = sbi->sb_n_free; + buf->f_files = sbi->sb_dirband_size / 4; + buf->f_ffree = sbi->sb_n_free_dnodes; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = 254; + + hpfs_unlock(s); + + return 0; +} + +static struct kmem_cache * hpfs_inode_cachep; + +static struct inode *hpfs_alloc_inode(struct super_block *sb) +{ + struct hpfs_inode_info *ei; + ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void hpfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); +} + +static void hpfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hpfs_i_callback); +} + +static void init_once(void *foo) +{ + struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", + sizeof(struct hpfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (hpfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(hpfs_inode_cachep); +} + +/* + * A tiny parser for option strings, stolen from dosfs. + * Stolen again from read-only hpfs. + * And updated for table-driven option parsing. 
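+ *
+ * For example, mounting with "uid=1000,gid=1000,umask=022,eas=rw"
+ * leaves sb_uid == sb_gid == 1000, sb_mode == 0777 & ~022 == 0755 and
+ * read/write extended attributes enabled; parse_opts() returns 1 on
+ * success, 2 when "help" was given and 0 on any malformed option.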
+ */ + +enum { + Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis, + Opt_check_none, Opt_check_normal, Opt_check_strict, + Opt_err_cont, Opt_err_ro, Opt_err_panic, + Opt_eas_no, Opt_eas_ro, Opt_eas_rw, + Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always, + Opt_timeshift, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_help, "help"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_case_lower, "case=lower"}, + {Opt_case_asis, "case=asis"}, + {Opt_check_none, "check=none"}, + {Opt_check_normal, "check=normal"}, + {Opt_check_strict, "check=strict"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_err_panic, "errors=panic"}, + {Opt_eas_no, "eas=no"}, + {Opt_eas_ro, "eas=ro"}, + {Opt_eas_rw, "eas=rw"}, + {Opt_chkdsk_no, "chkdsk=no"}, + {Opt_chkdsk_errors, "chkdsk=errors"}, + {Opt_chkdsk_always, "chkdsk=always"}, + {Opt_timeshift, "timeshift=%d"}, + {Opt_err, NULL}, +}; + +static int parse_opts(char *opts, uid_t *uid, gid_t *gid, umode_t *umask, + int *lowercase, int *eas, int *chk, int *errs, + int *chkdsk, int *timeshift) +{ + char *p; + int option; + + if (!opts) + return 1; + + /*printk("Parsing opts: '%s'\n",opts);*/ + + while ((p = strsep(&opts, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_help: + return 2; + case Opt_uid: + if (match_int(args, &option)) + return 0; + *uid = option; + break; + case Opt_gid: + if (match_int(args, &option)) + return 0; + *gid = option; + break; + case Opt_umask: + if (match_octal(args, &option)) + return 0; + *umask = option; + break; + case Opt_case_lower: + *lowercase = 1; + break; + case Opt_case_asis: + *lowercase = 0; + break; + case Opt_check_none: + *chk = 0; + break; + case Opt_check_normal: + *chk = 1; + break; + case Opt_check_strict: + *chk = 2; + break; + case Opt_err_cont: + *errs = 0; + break; + case Opt_err_ro: + *errs = 1; + break; + case Opt_err_panic: + *errs = 2; + break; + case Opt_eas_no: + *eas = 0; + break; + case Opt_eas_ro: + *eas = 1; + break; + case Opt_eas_rw: + *eas = 2; + break; + case Opt_chkdsk_no: + *chkdsk = 0; + break; + case Opt_chkdsk_errors: + *chkdsk = 1; + break; + case Opt_chkdsk_always: + *chkdsk = 2; + break; + case Opt_timeshift: + { + int m = 1; + char *rhs = args[0].from; + if (!rhs || !*rhs) + return 0; + if (*rhs == '-') m = -1; + if (*rhs == '+' || *rhs == '-') rhs++; + *timeshift = simple_strtoul(rhs, &rhs, 0) * m; + if (*rhs) + return 0; + break; + } + default: + return 0; + } + } + return 1; +} + +static inline void hpfs_help(void) +{ + printk("\n\ +HPFS filesystem options:\n\ + help do not mount and display this text\n\ + uid=xxx set uid of files that don't have uid specified in eas\n\ + gid=xxx set gid of files that don't have gid specified in eas\n\ + umask=xxx set mode of files that don't have mode specified in eas\n\ + case=lower lowercase all files\n\ + case=asis do not lowercase files (default)\n\ + check=none no fs checks - kernel may crash on corrupted filesystem\n\ + check=normal do some checks - it should not crash (default)\n\ + check=strict do extra time-consuming checks, used for debugging\n\ + errors=continue continue on errors\n\ + errors=remount-ro remount read-only if errors found (default)\n\ + errors=panic panic on errors\n\ + chkdsk=no do not mark fs for chkdsking even if there were errors\n\ + chkdsk=errors mark fs dirty if errors found (default)\n\ + chkdsk=always always mark 
fs dirty - used for debugging\n\ + eas=no ignore extended attributes\n\ + eas=ro read but do not write extended attributes\n\ + eas=rw r/w eas => enables chmod, chown, mknod, ln -s (default)\n\ + timeshift=nnn add nnn seconds to file times\n\ +\n"); +} + +static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) +{ + uid_t uid; + gid_t gid; + umode_t umask; + int lowercase, eas, chk, errs, chkdsk, timeshift; + int o; + struct hpfs_sb_info *sbi = hpfs_sb(s); + char *new_opts = kstrdup(data, GFP_KERNEL); + + *flags |= MS_NOATIME; + + hpfs_lock(s); + lock_super(s); + uid = sbi->sb_uid; gid = sbi->sb_gid; + umask = 0777 & ~sbi->sb_mode; + lowercase = sbi->sb_lowercase; + eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk; + errs = sbi->sb_err; timeshift = sbi->sb_timeshift; + + if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, + &eas, &chk, &errs, &chkdsk, ×hift))) { + printk("HPFS: bad mount options.\n"); + goto out_err; + } + if (o == 2) { + hpfs_help(); + goto out_err; + } + if (timeshift != sbi->sb_timeshift) { + printk("HPFS: timeshift can't be changed using remount.\n"); + goto out_err; + } + + unmark_dirty(s); + + sbi->sb_uid = uid; sbi->sb_gid = gid; + sbi->sb_mode = 0777 & ~umask; + sbi->sb_lowercase = lowercase; + sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; + sbi->sb_err = errs; sbi->sb_timeshift = timeshift; + + if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); + + replace_mount_options(s, new_opts); + + unlock_super(s); + hpfs_unlock(s); + return 0; + +out_err: + unlock_super(s); + hpfs_unlock(s); + kfree(new_opts); + return -EINVAL; +} + +/* Super operations */ + +static const struct super_operations hpfs_sops = +{ + .alloc_inode = hpfs_alloc_inode, + .destroy_inode = hpfs_destroy_inode, + .evict_inode = hpfs_evict_inode, + .put_super = hpfs_put_super, + .statfs = hpfs_statfs, + .remount_fs = hpfs_remount_fs, + .show_options = generic_show_options, +}; + +static int hpfs_fill_super(struct super_block *s, void *options, int silent) +{ + struct buffer_head *bh0, *bh1, *bh2; + struct hpfs_boot_block *bootblock; + struct hpfs_super_block *superblock; + struct hpfs_spare_block *spareblock; + struct hpfs_sb_info *sbi; + struct inode *root; + + uid_t uid; + gid_t gid; + umode_t umask; + int lowercase, eas, chk, errs, chkdsk, timeshift; + + dnode_secno root_dno; + struct hpfs_dirent *de = NULL; + struct quad_buffer_head qbh; + + int o; + + save_mount_options(s, options); + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) { + return -ENOMEM; + } + s->s_fs_info = sbi; + + sbi->sb_bmp_dir = NULL; + sbi->sb_cp_table = NULL; + + mutex_init(&sbi->hpfs_mutex); + hpfs_lock(s); + + uid = current_uid(); + gid = current_gid(); + umask = current_umask(); + lowercase = 0; + eas = 2; + chk = 1; + errs = 1; + chkdsk = 1; + timeshift = 0; + + if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, + &eas, &chk, &errs, &chkdsk, ×hift))) { + printk("HPFS: bad mount options.\n"); + goto bail0; + } + if (o==2) { + hpfs_help(); + goto bail0; + } + + /*sbi->sb_mounting = 1;*/ + sb_set_blocksize(s, 512); + sbi->sb_fs_size = -1; + if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1; + if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2; + if (!(spareblock = hpfs_map_sector(s, 17, &bh2, 0))) goto bail3; + + /* Check magics */ + if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC + ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC + || le32_to_cpu(spareblock->magic) != SP_MAGIC) { + if (!silent) printk("HPFS: Bad magic ... 
probably not HPFS\n"); + goto bail4; + } + + /* Check version */ + if (!(s->s_flags & MS_RDONLY) && + superblock->funcversion != 2 && superblock->funcversion != 3) { + printk("HPFS: Bad version %d,%d. Mount readonly to go around\n", + (int)superblock->version, (int)superblock->funcversion); + printk("HPFS: please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); + goto bail4; + } + + s->s_flags |= MS_NOATIME; + + /* Fill superblock stuff */ + s->s_magic = HPFS_SUPER_MAGIC; + s->s_op = &hpfs_sops; + s->s_d_op = &hpfs_dentry_operations; + + sbi->sb_root = le32_to_cpu(superblock->root); + sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors); + sbi->sb_bitmaps = le32_to_cpu(superblock->bitmaps); + sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); + sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); + sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); + sbi->sb_uid = uid; + sbi->sb_gid = gid; + sbi->sb_mode = 0777 & ~umask; + sbi->sb_n_free = -1; + sbi->sb_n_free_dnodes = -1; + sbi->sb_lowercase = lowercase; + sbi->sb_eas = eas; + sbi->sb_chk = chk; + sbi->sb_chkdsk = chkdsk; + sbi->sb_err = errs; + sbi->sb_timeshift = timeshift; + sbi->sb_was_error = 0; + sbi->sb_cp_table = NULL; + sbi->sb_c_bitmap = -1; + sbi->sb_max_fwd_alloc = 0xffffff; + + /* Load bitmap directory */ + if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) + goto bail4; + + /* Check for general fs errors*/ + if (spareblock->dirty && !spareblock->old_wrote) { + if (errs == 2) { + printk("HPFS: Improperly stopped, not mounted\n"); + goto bail4; + } + hpfs_error(s, "improperly stopped"); + } + + if (!(s->s_flags & MS_RDONLY)) { + spareblock->dirty = 1; + spareblock->old_wrote = 0; + mark_buffer_dirty(bh2); + } + + if (le32_to_cpu(spareblock->hotfixes_used) || le32_to_cpu(spareblock->n_spares_used)) { + if (errs >= 2) { + printk("HPFS: Hotfixes not supported here, try chkdsk\n"); + mark_dirty(s, 0); + goto bail4; + } + hpfs_error(s, "hotfixes not supported here, try chkdsk"); + if (errs == 0) printk("HPFS: Proceeding, but your filesystem will be probably corrupted by this driver...\n"); + else printk("HPFS: This driver may read bad files or crash when operating on disk with hotfixes.\n"); + } + if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { + if (errs >= 2) { + printk("HPFS: Spare dnodes used, try chkdsk\n"); + mark_dirty(s, 0); + goto bail4; + } + hpfs_error(s, "warning: spare dnodes used, try chkdsk"); + if (errs == 0) printk("HPFS: Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); + } + if (chk) { + unsigned a; + if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || + le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { + hpfs_error(s, "dir band size mismatch: dir_band_start==%08x, dir_band_end==%08x, n_dir_band==%08x", + le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->dir_band_end), le32_to_cpu(superblock->n_dir_band)); + goto bail4; + } + a = sbi->sb_dirband_size; + sbi->sb_dirband_size = 0; + if (hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->n_dir_band), "dir_band") || + hpfs_chk_sectors(s, 
le32_to_cpu(superblock->dir_band_bitmap), 4, "dir_band_bitmap") || + hpfs_chk_sectors(s, le32_to_cpu(superblock->bitmaps), 4, "bitmaps")) { + mark_dirty(s, 0); + goto bail4; + } + sbi->sb_dirband_size = a; + } else printk("HPFS: You really don't want any checks? You are crazy...\n"); + + /* Load code page table */ + if (le32_to_cpu(spareblock->n_code_pages)) + if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir)))) + printk("HPFS: Warning: code page support is disabled\n"); + + brelse(bh2); + brelse(bh1); + brelse(bh0); + + root = iget_locked(s, sbi->sb_root); + if (!root) + goto bail0; + hpfs_init_inode(root); + hpfs_read_inode(root); + unlock_new_inode(root); + s->s_root = d_alloc_root(root); + if (!s->s_root) { + iput(root); + goto bail0; + } + + /* + * find the root directory's . pointer & finish filling in the inode + */ + + root_dno = hpfs_fnode_dno(s, sbi->sb_root); + if (root_dno) + de = map_dirent(root, root_dno, "\001\001", 2, NULL, &qbh); + if (!de) + hpfs_error(s, "unable to find root dir"); + else { + root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date)); + root->i_atime.tv_nsec = 0; + root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date)); + root->i_mtime.tv_nsec = 0; + root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date)); + root->i_ctime.tv_nsec = 0; + hpfs_i(root)->i_ea_size = le16_to_cpu(de->ea_size); + hpfs_i(root)->i_parent_dir = root->i_ino; + if (root->i_size == -1) + root->i_size = 2048; + if (root->i_blocks == -1) + root->i_blocks = 5; + hpfs_brelse4(&qbh); + } + hpfs_unlock(s); + return 0; + +bail4: brelse(bh2); +bail3: brelse(bh1); +bail2: brelse(bh0); +bail1: +bail0: + hpfs_unlock(s); + kfree(sbi->sb_bmp_dir); + kfree(sbi->sb_cp_table); + s->s_fs_info = NULL; + kfree(sbi); + return -EINVAL; +} + +static struct dentry *hpfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); +} + +static struct file_system_type hpfs_fs_type = { + .owner = THIS_MODULE, + .name = "hpfs", + .mount = hpfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_hpfs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&hpfs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_hpfs_fs(void) +{ + unregister_filesystem(&hpfs_fs_type); + destroy_inodecache(); +} + +module_init(init_hpfs_fs) +module_exit(exit_hpfs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile new file mode 100644 index 00000000..3a982bd9 --- /dev/null +++ b/fs/hppfs/Makefile @@ -0,0 +1,6 @@ +# +# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) +# Licensed under the GPL +# + +obj-$(CONFIG_HPPFS) += hppfs.o diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c new file mode 100644 index 00000000..9d71c95b --- /dev/null +++ b/fs/hppfs/hppfs.c @@ -0,0 +1,773 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "os.h" + +static struct inode *get_inode(struct super_block *, struct dentry *); + +struct hppfs_data { + struct list_head list; + char contents[PAGE_SIZE - sizeof(struct list_head)]; +}; + +struct hppfs_private { + 
struct file *proc_file; + int host_fd; + loff_t len; + struct hppfs_data *contents; +}; + +struct hppfs_inode_info { + struct dentry *proc_dentry; + struct inode vfs_inode; +}; + +static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) +{ + return container_of(inode, struct hppfs_inode_info, vfs_inode); +} + +#define HPPFS_SUPER_MAGIC 0xb00000ee + +static const struct super_operations hppfs_sbops; + +static int is_pid(struct dentry *dentry) +{ + struct super_block *sb; + int i; + + sb = dentry->d_sb; + if (dentry->d_parent != sb->s_root) + return 0; + + for (i = 0; i < dentry->d_name.len; i++) { + if (!isdigit(dentry->d_name.name[i])) + return 0; + } + return 1; +} + +static char *dentry_name(struct dentry *dentry, int extra) +{ + struct dentry *parent; + char *root, *name; + const char *seg_name; + int len, seg_len; + + len = 0; + parent = dentry; + while (parent->d_parent != parent) { + if (is_pid(parent)) + len += strlen("pid") + 1; + else len += parent->d_name.len + 1; + parent = parent->d_parent; + } + + root = "proc"; + len += strlen(root); + name = kmalloc(len + extra + 1, GFP_KERNEL); + if (name == NULL) + return NULL; + + name[len] = '\0'; + parent = dentry; + while (parent->d_parent != parent) { + if (is_pid(parent)) { + seg_name = "pid"; + seg_len = strlen("pid"); + } + else { + seg_name = parent->d_name.name; + seg_len = parent->d_name.len; + } + + len -= seg_len + 1; + name[len] = '/'; + strncpy(&name[len + 1], seg_name, seg_len); + parent = parent->d_parent; + } + strncpy(name, root, strlen(root)); + return name; +} + +static int file_removed(struct dentry *dentry, const char *file) +{ + char *host_file; + int extra, fd; + + extra = 0; + if (file != NULL) + extra += strlen(file) + 1; + + host_file = dentry_name(dentry, extra + strlen("/remove")); + if (host_file == NULL) { + printk(KERN_ERR "file_removed : allocation failed\n"); + return -ENOMEM; + } + + if (file != NULL) { + strcat(host_file, "/"); + strcat(host_file, file); + } + strcat(host_file, "/remove"); + + fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); + kfree(host_file); + if (fd > 0) { + os_close_file(fd); + return 1; + } + return 0; +} + +static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, + struct nameidata *nd) +{ + struct dentry *proc_dentry, *parent; + struct qstr *name = &dentry->d_name; + struct inode *inode; + int err, deleted; + + deleted = file_removed(dentry, NULL); + if (deleted < 0) + return ERR_PTR(deleted); + else if (deleted) + return ERR_PTR(-ENOENT); + + parent = HPPFS_I(ino)->proc_dentry; + mutex_lock(&parent->d_inode->i_mutex); + proc_dentry = lookup_one_len(name->name, parent, name->len); + mutex_unlock(&parent->d_inode->i_mutex); + + if (IS_ERR(proc_dentry)) + return proc_dentry; + + err = -ENOMEM; + inode = get_inode(ino->i_sb, proc_dentry); + if (!inode) + goto out; + + d_add(dentry, inode); + return NULL; + + out: + return ERR_PTR(err); +} + +static const struct inode_operations hppfs_file_iops = { +}; + +static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, + loff_t *ppos, int is_user) +{ + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + ssize_t n; + + read = file->f_path.dentry->d_inode->i_fop->read; + + if (!is_user) + set_fs(KERNEL_DS); + + n = (*read)(file, buf, count, &file->f_pos); + + if (!is_user) + set_fs(USER_DS); + + if (ppos) + *ppos = file->f_pos; + return n; +} + +static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count) +{ + ssize_t n; + int cur, err; + char *new_buf; + + n = 
-ENOMEM; + new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (new_buf == NULL) { + printk(KERN_ERR "hppfs_read_file : kmalloc failed\n"); + goto out; + } + n = 0; + while (count > 0) { + cur = min_t(ssize_t, count, PAGE_SIZE); + err = os_read_file(fd, new_buf, cur); + if (err < 0) { + printk(KERN_ERR "hppfs_read : read failed, " + "errno = %d\n", err); + n = err; + goto out_free; + } else if (err == 0) + break; + + if (copy_to_user(buf, new_buf, err)) { + n = -EFAULT; + goto out_free; + } + n += err; + count -= err; + } + out_free: + kfree(new_buf); + out: + return n; +} + +static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct hppfs_private *hppfs = file->private_data; + struct hppfs_data *data; + loff_t off; + int err; + + if (hppfs->contents != NULL) { + int rem; + + if (*ppos >= hppfs->len) + return 0; + + data = hppfs->contents; + off = *ppos; + while (off >= sizeof(data->contents)) { + data = list_entry(data->list.next, struct hppfs_data, + list); + off -= sizeof(data->contents); + } + + if (off + count > hppfs->len) + count = hppfs->len - off; + rem = copy_to_user(buf, &data->contents[off], count); + *ppos += count - rem; + if (rem > 0) + return -EFAULT; + } else if (hppfs->host_fd != -1) { + err = os_seek_file(hppfs->host_fd, *ppos); + if (err) { + printk(KERN_ERR "hppfs_read : seek failed, " + "errno = %d\n", err); + return err; + } + err = hppfs_read_file(hppfs->host_fd, buf, count); + if (err < 0) { + printk(KERN_ERR "hppfs_read: read failed: %d\n", err); + return err; + } + count = err; + if (count > 0) + *ppos += count; + } + else count = read_proc(hppfs->proc_file, buf, count, ppos, 1); + + return count; +} + +static ssize_t hppfs_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + + write = proc_file->f_path.dentry->d_inode->i_fop->write; + return (*write)(proc_file, buf, len, ppos); +} + +static int open_host_sock(char *host_file, int *filter_out) +{ + char *end; + int fd; + + end = &host_file[strlen(host_file)]; + strcpy(end, "/rw"); + *filter_out = 1; + fd = os_connect_socket(host_file); + if (fd > 0) + return fd; + + strcpy(end, "/r"); + *filter_out = 0; + fd = os_connect_socket(host_file); + return fd; +} + +static void free_contents(struct hppfs_data *head) +{ + struct hppfs_data *data; + struct list_head *ele, *next; + + if (head == NULL) + return; + + list_for_each_safe(ele, next, &head->list) { + data = list_entry(ele, struct hppfs_data, list); + kfree(data); + } + kfree(head); +} + +static struct hppfs_data *hppfs_get_data(int fd, int filter, + struct file *proc_file, + struct file *hppfs_file, + loff_t *size_out) +{ + struct hppfs_data *data, *new, *head; + int n, err; + + err = -ENOMEM; + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) { + printk(KERN_ERR "hppfs_get_data : head allocation failed\n"); + goto failed; + } + + INIT_LIST_HEAD(&data->list); + + head = data; + *size_out = 0; + + if (filter) { + while ((n = read_proc(proc_file, data->contents, + sizeof(data->contents), NULL, 0)) > 0) + os_write_file(fd, data->contents, n); + err = os_shutdown_socket(fd, 0, 1); + if (err) { + printk(KERN_ERR "hppfs_get_data : failed to shut down " + "socket\n"); + goto failed_free; + } + } + while (1) { + n = os_read_file(fd, data->contents, sizeof(data->contents)); + if (n < 0) { + err = n; + printk(KERN_ERR 
"hppfs_get_data : read failed, " + "errno = %d\n", err); + goto failed_free; + } else if (n == 0) + break; + + *size_out += n; + + if (n < sizeof(data->contents)) + break; + + new = kmalloc(sizeof(*data), GFP_KERNEL); + if (new == 0) { + printk(KERN_ERR "hppfs_get_data : data allocation " + "failed\n"); + err = -ENOMEM; + goto failed_free; + } + + INIT_LIST_HEAD(&new->list); + list_add(&new->list, &data->list); + data = new; + } + return head; + + failed_free: + free_contents(head); + failed: + return ERR_PTR(err); +} + +static struct hppfs_private *hppfs_data(void) +{ + struct hppfs_private *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return data; + + *data = ((struct hppfs_private ) { .host_fd = -1, + .len = -1, + .contents = NULL } ); + return data; +} + +static int file_mode(int fmode) +{ + if (fmode == (FMODE_READ | FMODE_WRITE)) + return O_RDWR; + if (fmode == FMODE_READ) + return O_RDONLY; + if (fmode == FMODE_WRITE) + return O_WRONLY; + return 0; +} + +static int hppfs_open(struct inode *inode, struct file *file) +{ + const struct cred *cred = file->f_cred; + struct hppfs_private *data; + struct vfsmount *proc_mnt; + struct dentry *proc_dentry; + char *host_file; + int err, fd, type, filter; + + err = -ENOMEM; + data = hppfs_data(); + if (data == NULL) + goto out; + + host_file = dentry_name(file->f_path.dentry, strlen("/rw")); + if (host_file == NULL) + goto out_free2; + + proc_dentry = HPPFS_I(inode)->proc_dentry; + proc_mnt = inode->i_sb->s_fs_info; + + /* XXX This isn't closed anywhere */ + data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), + file_mode(file->f_mode), cred); + err = PTR_ERR(data->proc_file); + if (IS_ERR(data->proc_file)) + goto out_free1; + + type = os_file_type(host_file); + if (type == OS_TYPE_FILE) { + fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); + if (fd >= 0) + data->host_fd = fd; + else + printk(KERN_ERR "hppfs_open : failed to open '%s', " + "errno = %d\n", host_file, -fd); + + data->contents = NULL; + } else if (type == OS_TYPE_DIR) { + fd = open_host_sock(host_file, &filter); + if (fd > 0) { + data->contents = hppfs_get_data(fd, filter, + data->proc_file, + file, &data->len); + if (!IS_ERR(data->contents)) + data->host_fd = fd; + } else + printk(KERN_ERR "hppfs_open : failed to open a socket " + "in '%s', errno = %d\n", host_file, -fd); + } + kfree(host_file); + + file->private_data = data; + return 0; + + out_free1: + kfree(host_file); + out_free2: + free_contents(data->contents); + kfree(data); + out: + return err; +} + +static int hppfs_dir_open(struct inode *inode, struct file *file) +{ + const struct cred *cred = file->f_cred; + struct hppfs_private *data; + struct vfsmount *proc_mnt; + struct dentry *proc_dentry; + int err; + + err = -ENOMEM; + data = hppfs_data(); + if (data == NULL) + goto out; + + proc_dentry = HPPFS_I(inode)->proc_dentry; + proc_mnt = inode->i_sb->s_fs_info; + data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), + file_mode(file->f_mode), cred); + err = PTR_ERR(data->proc_file); + if (IS_ERR(data->proc_file)) + goto out_free; + + file->private_data = data; + return 0; + + out_free: + kfree(data); + out: + return err; +} + +static loff_t hppfs_llseek(struct file *file, loff_t off, int where) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + loff_t (*llseek)(struct file *, loff_t, int); + loff_t ret; + + llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek; + if (llseek != NULL) { + ret = 
(*llseek)(proc_file, off, where); + if (ret < 0) + return ret; + } + + return default_llseek(file, off, where); +} + +static const struct file_operations hppfs_file_fops = { + .owner = NULL, + .llseek = hppfs_llseek, + .read = hppfs_read, + .write = hppfs_write, + .open = hppfs_open, +}; + +struct hppfs_dirent { + void *vfs_dirent; + filldir_t filldir; + struct dentry *dentry; +}; + +static int hppfs_filldir(void *d, const char *name, int size, + loff_t offset, u64 inode, unsigned int type) +{ + struct hppfs_dirent *dirent = d; + + if (file_removed(dirent->dentry, name)) + return 0; + + return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, + inode, type); +} + +static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + int (*readdir)(struct file *, void *, filldir_t); + struct hppfs_dirent dirent = ((struct hppfs_dirent) + { .vfs_dirent = ent, + .filldir = filldir, + .dentry = file->f_path.dentry + }); + int err; + + readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir; + + proc_file->f_pos = file->f_pos; + err = (*readdir)(proc_file, &dirent, hppfs_filldir); + file->f_pos = proc_file->f_pos; + + return err; +} + +static int hppfs_fsync(struct file *file, int datasync) +{ + return 0; +} + +static const struct file_operations hppfs_dir_fops = { + .owner = NULL, + .readdir = hppfs_readdir, + .open = hppfs_dir_open, + .fsync = hppfs_fsync, + .llseek = default_llseek, +}; + +static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) +{ + sf->f_blocks = 0; + sf->f_bfree = 0; + sf->f_bavail = 0; + sf->f_files = 0; + sf->f_ffree = 0; + sf->f_type = HPPFS_SUPER_MAGIC; + return 0; +} + +static struct inode *hppfs_alloc_inode(struct super_block *sb) +{ + struct hppfs_inode_info *hi; + + hi = kmalloc(sizeof(*hi), GFP_KERNEL); + if (!hi) + return NULL; + + hi->proc_dentry = NULL; + inode_init_once(&hi->vfs_inode); + return &hi->vfs_inode; +} + +void hppfs_evict_inode(struct inode *ino) +{ + end_writeback(ino); + dput(HPPFS_I(ino)->proc_dentry); + mntput(ino->i_sb->s_fs_info); +} + +static void hppfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kfree(HPPFS_I(inode)); +} + +static void hppfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hppfs_i_callback); +} + +static const struct super_operations hppfs_sbops = { + .alloc_inode = hppfs_alloc_inode, + .destroy_inode = hppfs_destroy_inode, + .evict_inode = hppfs_evict_inode, + .statfs = hppfs_statfs, +}; + +static int hppfs_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, + buflen); +} + +static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + + return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); +} + +static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + + if (proc_dentry->d_inode->i_op->put_link) + proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie); +} + +static const struct inode_operations hppfs_dir_iops = { + .lookup = hppfs_lookup, +}; + +static const struct inode_operations hppfs_link_iops = { + .readlink = 
hppfs_readlink, + .follow_link = hppfs_follow_link, + .put_link = hppfs_put_link, +}; + +static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) +{ + struct inode *proc_ino = dentry->d_inode; + struct inode *inode = new_inode(sb); + + if (!inode) { + dput(dentry); + return ERR_PTR(-ENOMEM); + } + + if (S_ISDIR(dentry->d_inode->i_mode)) { + inode->i_op = &hppfs_dir_iops; + inode->i_fop = &hppfs_dir_fops; + } else if (S_ISLNK(dentry->d_inode->i_mode)) { + inode->i_op = &hppfs_link_iops; + inode->i_fop = &hppfs_file_fops; + } else { + inode->i_op = &hppfs_file_iops; + inode->i_fop = &hppfs_file_fops; + } + + HPPFS_I(inode)->proc_dentry = dentry; + + inode->i_uid = proc_ino->i_uid; + inode->i_gid = proc_ino->i_gid; + inode->i_atime = proc_ino->i_atime; + inode->i_mtime = proc_ino->i_mtime; + inode->i_ctime = proc_ino->i_ctime; + inode->i_ino = proc_ino->i_ino; + inode->i_mode = proc_ino->i_mode; + inode->i_nlink = proc_ino->i_nlink; + inode->i_size = proc_ino->i_size; + inode->i_blocks = proc_ino->i_blocks; + + return inode; +} + +static int hppfs_fill_super(struct super_block *sb, void *d, int silent) +{ + struct inode *root_inode; + struct vfsmount *proc_mnt; + int err = -ENOENT; + + proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); + if (IS_ERR(proc_mnt)) + goto out; + + sb->s_blocksize = 1024; + sb->s_blocksize_bits = 10; + sb->s_magic = HPPFS_SUPER_MAGIC; + sb->s_op = &hppfs_sbops; + sb->s_fs_info = proc_mnt; + + err = -ENOMEM; + root_inode = get_inode(sb, dget(proc_mnt->mnt_sb->s_root)); + if (!root_inode) + goto out_mntput; + + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto out_iput; + + return 0; + + out_iput: + iput(root_inode); + out_mntput: + mntput(proc_mnt); + out: + return(err); +} + +static struct dentry *hppfs_read_super(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + return mount_nodev(type, flags, data, hppfs_fill_super); +} + +static struct file_system_type hppfs_type = { + .owner = THIS_MODULE, + .name = "hppfs", + .mount = hppfs_read_super, + .kill_sb = kill_anon_super, + .fs_flags = 0, +}; + +static int __init init_hppfs(void) +{ + return register_filesystem(&hppfs_type); +} + +static void __exit exit_hppfs(void) +{ + unregister_filesystem(&hppfs_type); +} + +module_init(init_hppfs) +module_exit(exit_hppfs) +MODULE_LICENSE("GPL"); diff --git a/fs/hugetlbfs/Makefile b/fs/hugetlbfs/Makefile new file mode 100644 index 00000000..6adf870c --- /dev/null +++ b/fs/hugetlbfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux ramfs routines. +# + +obj-$(CONFIG_HUGETLBFS) += hugetlbfs.o + +hugetlbfs-objs := inode.o diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c new file mode 100644 index 00000000..8b0c8753 --- /dev/null +++ b/fs/hugetlbfs/inode.c @@ -0,0 +1,1033 @@ +/* + * hugetlbpage-backed filesystem. Based on ramfs. + * + * William Irwin, 2002 + * + * Copyright (C) 2002 Linus Torvalds. 
+ */ + +#include +#include +#include +#include /* remove ASAP */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const struct super_operations hugetlbfs_ops; +static const struct address_space_operations hugetlbfs_aops; +const struct file_operations hugetlbfs_file_operations; +static const struct inode_operations hugetlbfs_dir_inode_operations; +static const struct inode_operations hugetlbfs_inode_operations; + +static struct backing_dev_info hugetlbfs_backing_dev_info = { + .name = "hugetlbfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +int sysctl_hugetlb_shm_group; + +enum { + Opt_size, Opt_nr_inodes, + Opt_mode, Opt_uid, Opt_gid, + Opt_pagesize, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_size, "size=%s"}, + {Opt_nr_inodes, "nr_inodes=%s"}, + {Opt_mode, "mode=%o"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_pagesize, "pagesize=%s"}, + {Opt_err, NULL}, +}; + +static void huge_pagevec_release(struct pagevec *pvec) +{ + int i; + + for (i = 0; i < pagevec_count(pvec); ++i) + put_page(pvec->pages[i]); + + pagevec_reinit(pvec); +} + +static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file->f_path.dentry->d_inode; + loff_t len, vma_len; + int ret; + struct hstate *h = hstate_file(file); + + /* + * vma address alignment (but not the pgoff alignment) has + * already been checked by prepare_hugepage_range. If you add + * any error returns here, do so after setting VM_HUGETLB, so + * is_vm_hugetlb_page tests below unmap_region go the right + * way when do_mmap_pgoff unwinds (may be important on powerpc + * and ia64). + */ + vma->vm_flags |= VM_HUGETLB | VM_RESERVED; + vma->vm_ops = &hugetlb_vm_ops; + + if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT)) + return -EINVAL; + + vma_len = (loff_t)(vma->vm_end - vma->vm_start); + + mutex_lock(&inode->i_mutex); + file_accessed(file); + + ret = -ENOMEM; + len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + + if (hugetlb_reserve_pages(inode, + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma, + vma->vm_flags)) + goto out; + + ret = 0; + hugetlb_prefault_arch_hook(vma->vm_mm); + if (vma->vm_flags & VM_WRITE && inode->i_size < len) + inode->i_size = len; +out: + mutex_unlock(&inode->i_mutex); + + return ret; +} + +/* + * Called under down_write(mmap_sem). 
+ */ + +#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +static unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start_addr; + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + start_addr = mm->free_area_cache; + + if (len <= mm->cached_hole_size) + start_addr = TASK_UNMAPPED_BASE; + +full_search: + addr = ALIGN(start_addr, huge_page_size(h)); + + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) { + /* + * Start a new search - just in case we missed + * some holes. + */ + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = TASK_UNMAPPED_BASE; + goto full_search; + } + return -ENOMEM; + } + + if (!vma || addr + len <= vma->vm_start) + return addr; + addr = ALIGN(vma->vm_end, huge_page_size(h)); + } +} +#endif + +static int +hugetlbfs_read_actor(struct page *page, unsigned long offset, + char __user *buf, unsigned long count, + unsigned long size) +{ + char *kaddr; + unsigned long left, copied = 0; + int i, chunksize; + + if (size > count) + size = count; + + /* Find which 4k chunk and offset with in that chunk */ + i = offset >> PAGE_CACHE_SHIFT; + offset = offset & ~PAGE_CACHE_MASK; + + while (size) { + chunksize = PAGE_CACHE_SIZE; + if (offset) + chunksize -= offset; + if (chunksize > size) + chunksize = size; + kaddr = kmap(&page[i]); + left = __copy_to_user(buf, kaddr + offset, chunksize); + kunmap(&page[i]); + if (left) { + copied += (chunksize - left); + break; + } + offset = 0; + size -= chunksize; + buf += chunksize; + copied += chunksize; + i++; + } + return copied ? copied : -EFAULT; +} + +/* + * Support for read() - Find the page attached to f_mapping and copy out the + * data. Its *very* similar to do_generic_mapping_read(), we can't use that + * since it has PAGE_CACHE_SIZE assumptions. + */ +static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, + size_t len, loff_t *ppos) +{ + struct hstate *h = hstate_file(filp); + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + unsigned long index = *ppos >> huge_page_shift(h); + unsigned long offset = *ppos & ~huge_page_mask(h); + unsigned long end_index; + loff_t isize; + ssize_t retval = 0; + + /* validate length */ + if (len == 0) + goto out; + + for (;;) { + struct page *page; + unsigned long nr, ret; + int ra; + + /* nr is the maximum number of bytes to copy from this page */ + nr = huge_page_size(h); + isize = i_size_read(inode); + if (!isize) + goto out; + end_index = (isize - 1) >> huge_page_shift(h); + if (index >= end_index) { + if (index > end_index) + goto out; + nr = ((isize - 1) & ~huge_page_mask(h)) + 1; + if (nr <= offset) + goto out; + } + nr = nr - offset; + + /* Find the page */ + page = find_lock_page(mapping, index); + if (unlikely(page == NULL)) { + /* + * We have a HOLE, zero out the user-buffer for the + * length of the hole or request. + */ + ret = len < nr ? 
len : nr; + if (clear_user(buf, ret)) + ra = -EFAULT; + else + ra = 0; + } else { + unlock_page(page); + + /* + * We have the page, copy it to user space buffer. + */ + ra = hugetlbfs_read_actor(page, offset, buf, len, nr); + ret = ra; + page_cache_release(page); + } + if (ra < 0) { + if (retval == 0) + retval = ra; + goto out; + } + + offset += ret; + retval += ret; + len -= ret; + index += offset >> huge_page_shift(h); + offset &= ~huge_page_mask(h); + + /* short read or no more work */ + if ((ret != nr) || (len == 0)) + break; + } +out: + *ppos = ((loff_t)index << huge_page_shift(h)) + offset; + return retval; +} + +static int hugetlbfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + return -EINVAL; +} + +static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + BUG(); + return -EINVAL; +} + +static void truncate_huge_page(struct page *page) +{ + cancel_dirty_page(page, /* No IO accounting for huge pages? */0); + ClearPageUptodate(page); + delete_from_page_cache(page); +} + +static void truncate_hugepages(struct inode *inode, loff_t lstart) +{ + struct hstate *h = hstate_inode(inode); + struct address_space *mapping = &inode->i_data; + const pgoff_t start = lstart >> huge_page_shift(h); + struct pagevec pvec; + pgoff_t next; + int i, freed = 0; + + pagevec_init(&pvec, 0); + next = start; + while (1) { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + if (next == start) + break; + next = start; + continue; + } + + for (i = 0; i < pagevec_count(&pvec); ++i) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (page->index > next) + next = page->index; + ++next; + truncate_huge_page(page); + unlock_page(page); + freed++; + } + huge_pagevec_release(&pvec); + } + BUG_ON(!lstart && mapping->nrpages); + hugetlb_unreserve_pages(inode, start, freed); +} + +static void hugetlbfs_evict_inode(struct inode *inode) +{ + truncate_hugepages(inode, 0); + end_writeback(inode); +} + +static inline void +hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { + unsigned long v_offset; + + /* + * Can the expression below overflow on 32-bit arches? + * No, because the prio_tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. 
+ */ + if (vma->vm_pgoff < pgoff) + v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; + else + v_offset = 0; + + __unmap_hugepage_range(vma, + vma->vm_start + v_offset, vma->vm_end, NULL); + } +} + +static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) +{ + pgoff_t pgoff; + struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); + + BUG_ON(offset & ~huge_page_mask(h)); + pgoff = offset >> PAGE_SHIFT; + + i_size_write(inode, offset); + mutex_lock(&mapping->i_mmap_mutex); + if (!prio_tree_empty(&mapping->i_mmap)) + hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); + mutex_unlock(&mapping->i_mmap_mutex); + truncate_hugepages(inode, offset); + return 0; +} + +static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct hstate *h = hstate_inode(inode); + int error; + unsigned int ia_valid = attr->ia_valid; + + BUG_ON(!inode); + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE) { + error = -EINVAL; + if (attr->ia_size & ~huge_page_mask(h)) + return -EINVAL; + error = hugetlb_vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, + gid_t gid, int mode, dev_t dev) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) { + struct hugetlbfs_inode_info *info; + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = uid; + inode->i_gid = gid; + inode->i_mapping->a_ops = &hugetlbfs_aops; + inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + INIT_LIST_HEAD(&inode->i_mapping->private_list); + info = HUGETLBFS_I(inode); + /* + * The policy is initialized here even if we are creating a + * private inode because initialization simply creates an + * an empty rb tree and calls spin_lock_init(), later when we + * call mpol_free_shared_policy() it will just return because + * the rb tree will still be empty. + */ + mpol_shared_policy_init(&info->policy, NULL); + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &hugetlbfs_inode_operations; + inode->i_fop = &hugetlbfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &hugetlbfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + break; + } + } + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. 
+ */ +static int hugetlbfs_mknod(struct inode *dir, + struct dentry *dentry, int mode, dev_t dev) +{ + struct inode *inode; + int error = -ENOSPC; + gid_t gid; + + if (dir->i_mode & S_ISGID) { + gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else { + gid = current_fsgid(); + } + inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), gid, mode, dev); + if (inode) { + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + error = 0; + } + return error; +} + +static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); + if (!retval) + inc_nlink(dir); + return retval; +} + +static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int hugetlbfs_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) +{ + struct inode *inode; + int error = -ENOSPC; + gid_t gid; + + if (dir->i_mode & S_ISGID) + gid = dir->i_gid; + else + gid = current_fsgid(); + + inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), + gid, S_IFLNK|S_IRWXUGO, 0); + if (inode) { + int l = strlen(symname)+1; + error = page_symlink(inode, symname, l); + if (!error) { + d_instantiate(dentry, inode); + dget(dentry); + } else + iput(inode); + } + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + return error; +} + +/* + * mark the head page dirty + */ +static int hugetlbfs_set_page_dirty(struct page *page) +{ + struct page *head = compound_head(page); + + SetPageDirty(head); + return 0; +} + +static int hugetlbfs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int rc; + + rc = migrate_huge_page_move_mapping(mapping, newpage, page); + if (rc) + return rc; + migrate_page_copy(newpage, page); + + return 0; +} + +static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); + struct hstate *h = hstate_inode(dentry->d_inode); + + buf->f_type = HUGETLBFS_MAGIC; + buf->f_bsize = huge_page_size(h); + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + /* If no limits set, just report 0 for max/free/used + * blocks, like simple_statfs() */ + if (sbinfo->max_blocks >= 0) { + buf->f_blocks = sbinfo->max_blocks; + buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + } + spin_unlock(&sbinfo->stat_lock); + } + buf->f_namelen = NAME_MAX; + return 0; +} + +static void hugetlbfs_put_super(struct super_block *sb) +{ + struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); + + if (sbi) { + sb->s_fs_info = NULL; + kfree(sbi); + } +} + +static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) +{ + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + if (unlikely(!sbinfo->free_inodes)) { + spin_unlock(&sbinfo->stat_lock); + return 0; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + + return 1; +} + +static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) +{ + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } +} + + +static struct kmem_cache *hugetlbfs_inode_cachep; + +static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); + struct 
hugetlbfs_inode_info *p; + + if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) + return NULL; + p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); + if (unlikely(!p)) { + hugetlbfs_inc_free_inodes(sbinfo); + return NULL; + } + return &p->vfs_inode; +} + +static void hugetlbfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); +} + +static void hugetlbfs_destroy_inode(struct inode *inode) +{ + hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); + mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + call_rcu(&inode->i_rcu, hugetlbfs_i_callback); +} + +static const struct address_space_operations hugetlbfs_aops = { + .write_begin = hugetlbfs_write_begin, + .write_end = hugetlbfs_write_end, + .set_page_dirty = hugetlbfs_set_page_dirty, + .migratepage = hugetlbfs_migrate_page, +}; + + +static void init_once(void *foo) +{ + struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + + inode_init_once(&ei->vfs_inode); +} + +const struct file_operations hugetlbfs_file_operations = { + .read = hugetlbfs_read, + .mmap = hugetlbfs_file_mmap, + .fsync = noop_fsync, + .get_unmapped_area = hugetlb_get_unmapped_area, + .llseek = default_llseek, +}; + +static const struct inode_operations hugetlbfs_dir_inode_operations = { + .create = hugetlbfs_create, + .lookup = simple_lookup, + .link = simple_link, + .unlink = simple_unlink, + .symlink = hugetlbfs_symlink, + .mkdir = hugetlbfs_mkdir, + .rmdir = simple_rmdir, + .mknod = hugetlbfs_mknod, + .rename = simple_rename, + .setattr = hugetlbfs_setattr, +}; + +static const struct inode_operations hugetlbfs_inode_operations = { + .setattr = hugetlbfs_setattr, +}; + +static const struct super_operations hugetlbfs_ops = { + .alloc_inode = hugetlbfs_alloc_inode, + .destroy_inode = hugetlbfs_destroy_inode, + .evict_inode = hugetlbfs_evict_inode, + .statfs = hugetlbfs_statfs, + .put_super = hugetlbfs_put_super, + .show_options = generic_show_options, +}; + +static int +hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) +{ + char *p, *rest; + substring_t args[MAX_OPT_ARGS]; + int option; + unsigned long long size = 0; + enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + goto bad_val; + pconfig->uid = option; + break; + + case Opt_gid: + if (match_int(&args[0], &option)) + goto bad_val; + pconfig->gid = option; + break; + + case Opt_mode: + if (match_octal(&args[0], &option)) + goto bad_val; + pconfig->mode = option & 01777U; + break; + + case Opt_size: { + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + size = memparse(args[0].from, &rest); + setsize = SIZE_STD; + if (*rest == '%') + setsize = SIZE_PERCENT; + break; + } + + case Opt_nr_inodes: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + pconfig->nr_inodes = memparse(args[0].from, &rest); + break; + + case Opt_pagesize: { + unsigned long ps; + ps = memparse(args[0].from, &rest); + pconfig->hstate = size_to_hstate(ps); + if (!pconfig->hstate) { + printk(KERN_ERR + "hugetlbfs: Unsupported page size %lu MB\n", + ps >> 20); + return -EINVAL; + } + break; + } + + default: + 
printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", + p); + return -EINVAL; + break; + } + } + + /* Do size after hstate is set up */ + if (setsize > NO_SIZE) { + struct hstate *h = pconfig->hstate; + if (setsize == SIZE_PERCENT) { + size <<= huge_page_shift(h); + size *= h->max_huge_pages; + do_div(size, 100); + } + pconfig->nr_blocks = (size >> huge_page_shift(h)); + } + + return 0; + +bad_val: + printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", + args[0].from, p); + return -EINVAL; +} + +static int +hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode * inode; + struct dentry * root; + int ret; + struct hugetlbfs_config config; + struct hugetlbfs_sb_info *sbinfo; + + save_mount_options(sb, data); + + config.nr_blocks = -1; /* No limit on size by default */ + config.nr_inodes = -1; /* No limit on number of inodes by default */ + config.uid = current_fsuid(); + config.gid = current_fsgid(); + config.mode = 0755; + config.hstate = &default_hstate; + ret = hugetlbfs_parse_options(data, &config); + if (ret) + return ret; + + sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + sb->s_fs_info = sbinfo; + sbinfo->hstate = config.hstate; + spin_lock_init(&sbinfo->stat_lock); + sbinfo->max_blocks = config.nr_blocks; + sbinfo->free_blocks = config.nr_blocks; + sbinfo->max_inodes = config.nr_inodes; + sbinfo->free_inodes = config.nr_inodes; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = huge_page_size(config.hstate); + sb->s_blocksize_bits = huge_page_shift(config.hstate); + sb->s_magic = HUGETLBFS_MAGIC; + sb->s_op = &hugetlbfs_ops; + sb->s_time_gran = 1; + inode = hugetlbfs_get_inode(sb, config.uid, config.gid, + S_IFDIR | config.mode, 0); + if (!inode) + goto out_free; + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + goto out_free; + } + sb->s_root = root; + return 0; +out_free: + kfree(sbinfo); + return -ENOMEM; +} + +int hugetlb_get_quota(struct address_space *mapping, long delta) +{ + int ret = 0; + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); + + if (sbinfo->free_blocks > -1) { + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks - delta >= 0) + sbinfo->free_blocks -= delta; + else + ret = -ENOMEM; + spin_unlock(&sbinfo->stat_lock); + } + + return ret; +} + +void hugetlb_put_quota(struct address_space *mapping, long delta) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); + + if (sbinfo->free_blocks > -1) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks += delta; + spin_unlock(&sbinfo->stat_lock); + } +} + +static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); +} + +static struct file_system_type hugetlbfs_fs_type = { + .name = "hugetlbfs", + .mount = hugetlbfs_mount, + .kill_sb = kill_litter_super, +}; + +static struct vfsmount *hugetlbfs_vfsmount; + +static int can_do_hugetlb_shm(void) +{ + return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); +} + +struct file *hugetlb_file_setup(const char *name, size_t size, + vm_flags_t acctflag, + struct user_struct **user, int creat_flags) +{ + int error = -ENOMEM; + struct file *file; + struct inode *inode; + struct path path; + struct dentry *root; + struct qstr quick_string; + + *user = NULL; + if (!hugetlbfs_vfsmount) + return ERR_PTR(-ENOENT); + + if (creat_flags == HUGETLB_SHMFS_INODE && 
!can_do_hugetlb_shm()) { + *user = current_user(); + if (user_shm_lock(size, *user)) { + printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); + } else { + *user = NULL; + return ERR_PTR(-EPERM); + } + } + + root = hugetlbfs_vfsmount->mnt_root; + quick_string.name = name; + quick_string.len = strlen(quick_string.name); + quick_string.hash = 0; + path.dentry = d_alloc(root, &quick_string); + if (!path.dentry) + goto out_shm_unlock; + + path.mnt = mntget(hugetlbfs_vfsmount); + error = -ENOSPC; + inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(), + current_fsgid(), S_IFREG | S_IRWXUGO, 0); + if (!inode) + goto out_dentry; + + error = -ENOMEM; + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) + goto out_inode; + + d_instantiate(path.dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; + + error = -ENFILE; + file = alloc_file(&path, FMODE_WRITE | FMODE_READ, + &hugetlbfs_file_operations); + if (!file) + goto out_dentry; /* inode is already attached */ + + return file; + +out_inode: + iput(inode); +out_dentry: + path_put(&path); +out_shm_unlock: + if (*user) { + user_shm_unlock(size, *user); + *user = NULL; + } + return ERR_PTR(error); +} + +static int __init init_hugetlbfs_fs(void) +{ + int error; + struct vfsmount *vfsmount; + + error = bdi_init(&hugetlbfs_backing_dev_info); + if (error) + return error; + + hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", + sizeof(struct hugetlbfs_inode_info), + 0, 0, init_once); + if (hugetlbfs_inode_cachep == NULL) + goto out2; + + error = register_filesystem(&hugetlbfs_fs_type); + if (error) + goto out; + + vfsmount = kern_mount(&hugetlbfs_fs_type); + + if (!IS_ERR(vfsmount)) { + hugetlbfs_vfsmount = vfsmount; + return 0; + } + + error = PTR_ERR(vfsmount); + + out: + if (error) + kmem_cache_destroy(hugetlbfs_inode_cachep); + out2: + bdi_destroy(&hugetlbfs_backing_dev_info); + return error; +} + +static void __exit exit_hugetlbfs_fs(void) +{ + kmem_cache_destroy(hugetlbfs_inode_cachep); + unregister_filesystem(&hugetlbfs_fs_type); + bdi_destroy(&hugetlbfs_backing_dev_info); +} + +module_init(init_hugetlbfs_fs) +module_exit(exit_hugetlbfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/inode.c b/fs/inode.c new file mode 100644 index 00000000..43566d17 --- /dev/null +++ b/fs/inode.c @@ -0,0 +1,1697 @@ +/* + * (C) 1997 Linus Torvalds + * (C) 1999 Andrea Arcangeli (dynamic inode allocation) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for inode_has_buffers */ +#include "internal.h" + +/* + * Inode locking rules: + * + * inode->i_lock protects: + * inode->i_state, inode->i_hash, __iget() + * inode_lru_lock protects: + * inode_lru, inode->i_lru + * inode_sb_list_lock protects: + * sb->s_inodes, inode->i_sb_list + * inode_wb_list_lock protects: + * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * inode_hash_lock protects: + * inode_hashtable, inode->i_hash + * + * Lock ordering: + * + * inode_sb_list_lock + * inode->i_lock + * inode_lru_lock + * + * inode_wb_list_lock + * inode->i_lock + * + * inode_hash_lock + * inode_sb_list_lock + * inode->i_lock + * + * iunique_lock + * inode_hash_lock + */ + +static unsigned int i_hash_mask __read_mostly; +static unsigned int i_hash_shift __read_mostly; +static struct hlist_head *inode_hashtable __read_mostly; 
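+/*
+ * Descriptive note (added for clarity, derived from the code below):
+ * inode_hashtable holds every hashed in-core inode.  Buckets are selected by
+ * hash(sb, hashval) further down and entries are chained through
+ * inode->i_hash, all under inode_hash_lock (see __insert_inode_hash() and
+ * remove_inode_hash()).
+ */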
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); + +static LIST_HEAD(inode_lru); +static DEFINE_SPINLOCK(inode_lru_lock); + +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); + +/* + * iprune_sem provides exclusion between the icache shrinking and the + * umount path. + * + * We don't actually need it to protect anything in the umount path, + * but only need to cycle through it to make sure any inode that + * prune_icache took off the LRU list has been fully torn down by the + * time we are past evict_inodes. + */ +static DECLARE_RWSEM(iprune_sem); + +/* + * Empty aops. Can be used for the cases where the user does not + * define any of the address_space operations. + */ +const struct address_space_operations empty_aops = { +}; +EXPORT_SYMBOL(empty_aops); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static DEFINE_PER_CPU(unsigned int, nr_inodes); + +static struct kmem_cache *inode_cachep __read_mostly; + +static int get_nr_inodes(void) +{ + int i; + int sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_inodes, i); + return sum < 0 ? 0 : sum; +} + +static inline int get_nr_inodes_unused(void) +{ + return inodes_stat.nr_unused; +} + +int get_nr_dirty_inodes(void) +{ + /* not actually dirty inodes, but a wild approximation */ + int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + return nr_dirty > 0 ? nr_dirty : 0; +} + +/* + * Handle nr_inode sysctl + */ +#ifdef CONFIG_SYSCTL +int proc_nr_inodes(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + inodes_stat.nr_inodes = get_nr_inodes(); + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#endif + +/** + * inode_init_always - perform inode structure intialisation + * @sb: superblock inode belongs to + * @inode: inode to initialise + * + * These are initializations that need to be done on every inode + * allocation as the fields are not initialised by slab allocation. + */ +int inode_init_always(struct super_block *sb, struct inode *inode) +{ + static const struct inode_operations empty_iops; + static const struct file_operations empty_fops; + struct address_space *const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + inode->i_uid = 0; + inode->i_gid = 0; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; +#ifdef CONFIG_QUOTA + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); +#endif + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; + inode->dirtied_when = 0; + + if (security_inode_alloc(inode)) + goto out; + spin_lock_init(&inode->i_lock); + lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); + + mutex_init(&inode->i_mutex); + lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); + + init_rwsem(&inode->i_alloc_sem); + lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + mapping->writeback_index = 0; + + /* + * If the block_device provides a backing_dev_info for client + * inodes then use that. 
Otherwise the inode share the bdev's + * backing_dev_info. + */ + if (sb->s_bdev) { + struct backing_dev_info *bdi; + + bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + mapping->backing_dev_info = bdi; + } + inode->i_private = NULL; + inode->i_mapping = mapping; +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; +#endif + +#ifdef CONFIG_FSNOTIFY + inode->i_fsnotify_mask = 0; +#endif + + this_cpu_inc(nr_inodes); + + return 0; +out: + return -ENOMEM; +} +EXPORT_SYMBOL(inode_init_always); + +static struct inode *alloc_inode(struct super_block *sb) +{ + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); + + if (!inode) + return NULL; + + if (unlikely(inode_init_always(sb, inode))) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, inode); + return NULL; + } + + return inode; +} + +void free_inode_nonrcu(struct inode *inode) +{ + kmem_cache_free(inode_cachep, inode); +} +EXPORT_SYMBOL(free_inode_nonrcu); + +void __destroy_inode(struct inode *inode) +{ + BUG_ON(inode_has_buffers(inode)); + security_inode_free(inode); + fsnotify_inode_delete(inode); +#ifdef CONFIG_FS_POSIX_ACL + if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) + posix_acl_release(inode->i_acl); + if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) + posix_acl_release(inode->i_default_acl); +#endif + this_cpu_dec(nr_inodes); +} +EXPORT_SYMBOL(__destroy_inode); + +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(inode_cachep, inode); +} + +static void destroy_inode(struct inode *inode) +{ + BUG_ON(!list_empty(&inode->i_lru)); + __destroy_inode(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + call_rcu(&inode->i_rcu, i_callback); +} + +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + mutex_init(&mapping->i_mmap_mutex); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); +} +EXPORT_SYMBOL(address_space_init_once); + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. + */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_lru); + address_space_init_once(&inode->i_data); + i_size_ordered_init(inode); +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&inode->i_fsnotify_marks); +#endif +} +EXPORT_SYMBOL(inode_init_once); + +static void init_once(void *foo) +{ + struct inode *inode = (struct inode *) foo; + + inode_init_once(inode); +} + +/* + * inode->i_lock must be held + */ +void __iget(struct inode *inode) +{ + atomic_inc(&inode->i_count); +} + +/* + * get additional reference to inode; caller must already hold one. 
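+ * (Note, from the body below: the atomic_inc_return() check warns if that
+ * assumption is violated, i.e. if the count was not already at least one.)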
+ */ +void ihold(struct inode *inode) +{ + WARN_ON(atomic_inc_return(&inode->i_count) < 2); +} +EXPORT_SYMBOL(ihold); + +static void inode_lru_list_add(struct inode *inode) +{ + spin_lock(&inode_lru_lock); + if (list_empty(&inode->i_lru)) { + list_add(&inode->i_lru, &inode_lru); + inodes_stat.nr_unused++; + } + spin_unlock(&inode_lru_lock); +} + +static void inode_lru_list_del(struct inode *inode) +{ + spin_lock(&inode_lru_lock); + if (!list_empty(&inode->i_lru)) { + list_del_init(&inode->i_lru); + inodes_stat.nr_unused--; + } + spin_unlock(&inode_lru_lock); +} + +/** + * inode_sb_list_add - add inode to the superblock list of inodes + * @inode: inode to add + */ +void inode_sb_list_add(struct inode *inode) +{ + spin_lock(&inode_sb_list_lock); + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); + spin_unlock(&inode_sb_list_lock); +} +EXPORT_SYMBOL_GPL(inode_sb_list_add); + +static inline void inode_sb_list_del(struct inode *inode) +{ + spin_lock(&inode_sb_list_lock); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); +} + +static unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); + return tmp & i_hash_mask; +} + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. + */ +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); + + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); + hlist_add_head(&inode->i_hash, b); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); +} +EXPORT_SYMBOL(__insert_inode_hash); + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock. + */ +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); +} +EXPORT_SYMBOL(remove_inode_hash); + +void end_writeback(struct inode *inode) +{ + might_sleep(); + /* + * We have to cycle tree_lock here because reclaim can be still in the + * process of removing the last page (in __delete_from_page_cache()) + * and we must not free mapping under it. + */ + spin_lock_irq(&inode->i_data.tree_lock); + BUG_ON(inode->i_data.nrpages); + spin_unlock_irq(&inode->i_data.tree_lock); + BUG_ON(!list_empty(&inode->i_data.private_list)); + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(inode->i_state & I_CLEAR); + inode_sync_wait(inode); + /* don't need i_lock here, no concurrent mods to i_state */ + inode->i_state = I_FREEING | I_CLEAR; +} +EXPORT_SYMBOL(end_writeback); + +/* + * Free the inode passed in, removing it from the lists it is still connected + * to. We remove any pages still attached to the inode and wait for any IO that + * is still in progress before finally destroying the inode. + * + * An inode must already be marked I_FREEING so that we avoid the inode being + * moved back onto lists if we race with other code that manipulates the lists + * (e.g. writeback_single_inode). The caller is responsible for setting this. + * + * An inode must already be removed from the LRU list before being evicted from + * the cache. 
This should occur atomically with setting the I_FREEING state + * flag, so no inodes here should ever be on the LRU when being evicted. + */ +static void evict(struct inode *inode) +{ + const struct super_operations *op = inode->i_sb->s_op; + + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!list_empty(&inode->i_lru)); + + inode_wb_list_del(inode); + inode_sb_list_del(inode); + + if (op->evict_inode) { + op->evict_inode(inode); + } else { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + } + if (S_ISBLK(inode->i_mode) && inode->i_bdev) + bd_forget(inode); + if (S_ISCHR(inode->i_mode) && inode->i_cdev) + cd_forget(inode); + + remove_inode_hash(inode); + + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); + BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + spin_unlock(&inode->i_lock); + + destroy_inode(inode); +} + +/* + * dispose_list - dispose of the contents of a local list + * @head: the head of the list to free + * + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct inode *inode; + + inode = list_first_entry(head, struct inode, i_lru); + list_del_init(&inode->i_lru); + + evict(inode); + } +} + +/** + * evict_inodes - evict all evictable inodes for a superblock + * @sb: superblock to operate on + * + * Make sure that no inodes with zero refcount are retained. This is + * called by superblock shutdown after having MS_ACTIVE flag removed, + * so any inode reaching zero refcount during or after that call will + * be immediately evicted. + */ +void evict_inodes(struct super_block *sb) +{ + struct inode *inode, *next; + LIST_HEAD(dispose); + + spin_lock(&inode_sb_list_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + if (atomic_read(&inode->i_count)) + continue; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + continue; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); + } + spin_unlock(&inode_sb_list_lock); + + dispose_list(&dispose); + + /* + * Cycle through iprune_sem to make sure any inode that prune_icache + * moved off the list before we took the lock has been fully torn + * down. + */ + down_write(&iprune_sem); + up_write(&iprune_sem); +} + +/** + * invalidate_inodes - attempt to free all inodes on a superblock + * @sb: superblock to operate on + * @kill_dirty: flag to guide handling of dirty inodes + * + * Attempts to free all inodes for a given superblock. If there were any + * busy inodes return a non-zero value, else zero. + * If @kill_dirty is set, discard dirty inodes too, otherwise treat + * them as busy. 
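+ * Candidate inodes are marked I_FREEING, collected on a private dispose
+ * list, and only evicted once inode_sb_list_lock has been dropped.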
+ */ +int invalidate_inodes(struct super_block *sb, bool kill_dirty) +{ + int busy = 0; + struct inode *inode, *next; + LIST_HEAD(dispose); + + spin_lock(&inode_sb_list_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_state & I_DIRTY && !kill_dirty) { + spin_unlock(&inode->i_lock); + busy = 1; + continue; + } + if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + busy = 1; + continue; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); + } + spin_unlock(&inode_sb_list_lock); + + dispose_list(&dispose); + + return busy; +} + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state & ~I_REFERENCED) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to a + * temporary list and then are freed outside inode_lru_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. If the inode has metadata buffers attached to + * mapping->private_list then try to remove them. + * + * If the inode has the I_REFERENCED flag set, then it means that it has been + * used recently - the flag is set in iput_final(). When we encounter such an + * inode, clear the flag and move it to the back of the LRU so it gets another + * pass through the LRU before it gets reclaimed. This is necessary because + * we are doing lazy LRU updates to minimise lock contention, so the + * LRU does not have strict ordering. Hence we don't want to reclaim inodes + * with this flag set because they are the inodes that are out of order. + */ +static void prune_icache(int nr_to_scan) +{ + LIST_HEAD(freeable); + int nr_scanned; + unsigned long reap = 0; + + down_read(&iprune_sem); + spin_lock(&inode_lru_lock); + for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + struct inode *inode; + + if (list_empty(&inode_lru)) + break; + + inode = list_entry(inode_lru.prev, struct inode, i_lru); + + /* + * we are inverting the inode_lru_lock/inode->i_lock here, + * so use a trylock. If we fail to get the lock, just move the + * inode to the back of the list so we don't spin on it. + */ + if (!spin_trylock(&inode->i_lock)) { + list_move(&inode->i_lru, &inode_lru); + continue; + } + + /* + * Referenced or dirty inodes are still in use. Give them + * another pass through the LRU as we cannot reclaim them now.
+ */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); + inodes_stat.nr_unused--; + continue; + } + + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + list_move(&inode->i_lru, &inode_lru); + spin_unlock(&inode->i_lock); + continue; + } + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_lru_lock); + if (remove_inode_buffers(inode)) + reap += invalidate_mapping_pages(&inode->i_data, + 0, -1); + iput(inode); + spin_lock(&inode_lru_lock); + + if (inode != list_entry(inode_lru.next, + struct inode, i_lru)) + continue; /* wrong inode or list_empty */ + /* avoid lock inversions with trylock */ + if (!spin_trylock(&inode->i_lock)) + continue; + if (!can_unuse(inode)) { + spin_unlock(&inode->i_lock); + continue; + } + } + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); + + list_move(&inode->i_lru, &freeable); + inodes_stat.nr_unused--; + } + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); + spin_unlock(&inode_lru_lock); + + dispose_list(&freeable); + up_read(&iprune_sem); +} + +/* + * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, + * "unused" means that no dentries are referring to the inodes: the files are + * not open and the dcache references to those inodes have already been + * reclaimed. + * + * This function is passed the number of inodes to scan, and it returns the + * total number of remaining possibly-reclaimable inodes. + */ +static int shrink_icache_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, + * and we don't want to recurse into the FS that called us + * in clear_inode() and friends.. + */ + if (!(gfp_mask & __GFP_FS)) + return -1; + prune_icache(nr); + } + return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure; +} + +static struct shrinker icache_shrinker = { + .shrink = shrink_icache_memory, + .seeks = DEFAULT_SEEKS, +}; + +static void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + */ +static struct inode *find_inode(struct super_block *sb, + struct hlist_head *head, + int (*test)(struct inode *, void *), + void *data) +{ + struct hlist_node *node; + struct inode *inode = NULL; + +repeat: + hlist_for_each_entry(inode, node, head, i_hash) { + spin_lock(&inode->i_lock); + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); + continue; + } + if (!test(inode, data)) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + __iget(inode); + spin_unlock(&inode->i_lock); + return inode; + } + return NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. 
+ */ +static struct inode *find_inode_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct hlist_node *node; + struct inode *inode = NULL; + +repeat: + hlist_for_each_entry(inode, node, head, i_hash) { + spin_lock(&inode->i_lock); + if (inode->i_ino != ino) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + __iget(inode); + spin_unlock(&inode->i_lock); + return inode; + } + return NULL; +} + +/* + * Each cpu owns a range of LAST_INO_BATCH numbers. + * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, + * to renew the exhausted range. + * + * This does not significantly increase overflow rate because every CPU can + * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is + * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the + * 2^32 range, and is a worst-case. Even a 50% wastage would only increase + * overflow rate by 2x, which does not seem too significant. + * + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ +#define LAST_INO_BATCH 1024 +static DEFINE_PER_CPU(unsigned int, last_ino); + +unsigned int get_next_ino(void) +{ + unsigned int *p = &get_cpu_var(last_ino); + unsigned int res = *p; + +#ifdef CONFIG_SMP + if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { + static atomic_t shared_last_ino; + int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); + + res = next - LAST_INO_BATCH; + } +#endif + + *p = ++res; + put_cpu_var(last_ino); + return res; +} +EXPORT_SYMBOL(get_next_ino); + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. The default gfp_mask + * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. + * If HIGHMEM pages are unsuitable or it is known that pages allocated + * for the page cache are not reclaimable or migratable, + * mapping_set_gfp_mask() must be called with suitable flags on the + * newly created inode's mapping + * + */ +struct inode *new_inode(struct super_block *sb) +{ + struct inode *inode; + + spin_lock_prefetch(&inode_sb_list_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode->i_lock); + inode->i_state = 0; + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + } + return inode; +} +EXPORT_SYMBOL(new_inode); + +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. 
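For orientation (this sketch is not part of the patch), the I_NEW protocol described above is normally driven from a filesystem's own inode-lookup helper: obtain the inode with iget_locked(), fill it in only when I_NEW is set, then publish it with unlock_new_inode(). The helper name and the fill-in step below are hypothetical:

struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;	/* already cached and fully initialised */

	/* ... read the on-disk inode and set i_mode, i_size, i_op, ... */

	unlock_new_inode(inode);	/* clears I_NEW and wakes any waiters */
	return inode;
}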
+ */ +void unlock_new_inode(struct inode *inode) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (S_ISDIR(inode->i_mode)) { + struct file_system_type *type = inode->i_sb->s_type; + + /* Set new key only if filesystem hasn't already changed it */ + if (!lockdep_match_class(&inode->i_mutex, + &type->i_mutex_key)) { + /* + * ensure nobody is actually holding i_mutex + */ + mutex_destroy(&inode->i_mutex); + mutex_init(&inode->i_mutex); + lockdep_set_class(&inode->i_mutex, + &type->i_mutex_dir_key); + } + } +#endif + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(unlock_new_inode); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present, return it with an increased reference count. This is + * a generalized version of iget_locked() for file systems where the inode + * number is not sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + if (inode) { + wait_on_inode(inode); + return inode; + } + + inode = alloc_inode(sb); + if (inode) { + struct inode *old; + + spin_lock(&inode_hash_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + return NULL; +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * Search for the inode specified by @ino in the inode cache and if present + * return it with an increased reference count. This is for file systems + * where the inode number is sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set.
The file system gets to fill it in + * before unlocking it via unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + if (inode) { + wait_on_inode(inode); + return inode; + } + + inode = alloc_inode(sb); + if (inode) { + struct inode *old; + + spin_lock(&inode_hash_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} +EXPORT_SYMBOL(iget_locked); + +/* + * search the inode cache for a matching inode number. + * If we find one, then the inode number we are trying to + * allocate is not unique and so we should not use it. + * + * Returns 1 if the inode number is unique, 0 if it is not. + */ +static int test_inode_iunique(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *b = inode_hashtable + hash(sb, ino); + struct hlist_node *node; + struct inode *inode; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(inode, node, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) { + spin_unlock(&inode_hash_lock); + return 0; + } + } + spin_unlock(&inode_hash_lock); + + return 1; +} + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + /* + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ + static DEFINE_SPINLOCK(iunique_lock); + static unsigned int counter; + ino_t res; + + spin_lock(&iunique_lock); + do { + if (counter <= max_reserved) + counter = max_reserved + 1; + res = counter++; + } while (!test_inode_iunique(sb, res)); + spin_unlock(&iunique_lock); + + return res; +} +EXPORT_SYMBOL(iunique); + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { + __iget(inode); + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. 
+ */ + inode = NULL; + } + return inode; +} +EXPORT_SYMBOL(igrab); + +/** + * ilookup5_nowait - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * Search for the inode specified by @hashval and @data in the inode cache. + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Note: I_NEW is not waited upon so you have to be very careful what you do + * with the returned inode. You probably should be using ilookup5() instead. + * + * Note2: @test is called with the inode_hash_lock held, so can't sleep. + */ +struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + return inode; +} +EXPORT_SYMBOL(ilookup5_nowait); + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if the inode is in the cache, return the inode with an incremented + * reference count. Waits on I_NEW before returning the inode. + * + * This is a generalized version of ilookup() for file systems where the + * inode number is not sufficient for unique identification of an inode. + * + * Note: @test is called with the inode_hash_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct inode *inode = ilookup5_nowait(sb, hashval, test, data); + + if (inode) + wait_on_inode(inode); + return inode; +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * Search for the inode @ino in the inode cache, and if the inode is in the + * cache, the inode is returned with an incremented reference count.
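As a sketch of the @test contract spelled out above (the callback runs under inode_hash_lock and therefore must not sleep or take sleeping locks), a filesystem whose inodes are keyed by something wider than i_ino might pair ilookup5() with a comparison like the one below. myfs_test(), MYFS_I() and the objid field are invented for this example and are not part of the patch:

static int myfs_test(struct inode *inode, void *data)
{
	u64 objid = *(u64 *)data;

	/* purely in-memory comparison: inode_hash_lock is held */
	return MYFS_I(inode)->objid == objid;
}

	inode = ilookup5(sb, (unsigned long)objid, myfs_test, &objid);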
+ */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + + if (inode) + wait_on_inode(inode); + return inode; +} +EXPORT_SYMBOL(ilookup); + +int insert_inode_locked(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + while (1) { + struct hlist_node *node; + struct inode *old = NULL; + spin_lock(&inode_hash_lock); + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_ino != ino) + continue; + if (old->i_sb != sb) + continue; + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); + continue; + } + break; + } + if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + return 0; + } + __iget(old); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} +EXPORT_SYMBOL(insert_inode_locked); + +int insert_inode_locked4(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct super_block *sb = inode->i_sb; + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + while (1) { + struct hlist_node *node; + struct inode *old = NULL; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_sb != sb) + continue; + if (!test(old, data)) + continue; + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); + continue; + } + break; + } + if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + return 0; + } + __iget(old); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} +EXPORT_SYMBOL(insert_inode_locked4); + + +int generic_delete_inode(struct inode *inode) +{ + return 1; +} +EXPORT_SYMBOL(generic_delete_inode); + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +int generic_drop_inode(struct inode *inode) +{ + return !inode->i_nlink || inode_unhashed(inode); +} +EXPORT_SYMBOL_GPL(generic_drop_inode); + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop_inode()" function, defaulting to + * the legacy UNIX filesystem behaviour. If it tells + * us to evict inode, do so. Otherwise, retain inode + * in cache if fs is alive, sync and evict if fs is + * shutting down. 
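A filesystem steers this decision through the ->drop_inode hook in its super_operations. For example, a purely in-memory filesystem that never wants unreferenced inodes to linger in the cache could point the hook at generic_delete_inode(); the structure name below is illustrative only:

static const struct super_operations myfs_super_ops = {
	.statfs		= simple_statfs,
	.drop_inode	= generic_delete_inode,	/* evict on the final iput() */
};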
+ */ +static void iput_final(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + const struct super_operations *op = inode->i_sb->s_op; + int drop; + + WARN_ON(inode->i_state & I_NEW); + + if (op && op->drop_inode) + drop = op->drop_inode(inode); + else + drop = generic_drop_inode(inode); + + if (!drop && (sb->s_flags & MS_ACTIVE)) { + inode->i_state |= I_REFERENCED; + if (!(inode->i_state & (I_DIRTY|I_SYNC))) + inode_lru_list_add(inode); + spin_unlock(&inode->i_lock); + return; + } + + if (!drop) { + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state &= ~I_WILL_FREE; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + + evict(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero, the inode is then freed and may also be destroyed. + * + * Consequently, iput() can sleep. + */ +void iput(struct inode *inode) +{ + if (inode) { + BUG_ON(inode->i_state & I_CLEAR); + + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) + iput_final(inode); + } +} +EXPORT_SYMBOL(iput); + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. + */ +sector_t bmap(struct inode *inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} +EXPORT_SYMBOL(bmap); + +/* + * With relative atime, only update atime if the previous atime is + * earlier than either the ctime or mtime or if at least a day has + * passed since the last atime update. + */ +static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, + struct timespec now) +{ + + if (!(mnt->mnt_flags & MNT_RELATIME)) + return 1; + /* + * Is mtime younger than atime? If yes, update atime: + */ + if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) + return 1; + /* + * Is ctime younger than atime? If yes, update atime: + */ + if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) + return 1; + + /* + * Is the previous atime value older than a day? If yes, + * update atime: + */ + if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) + return 1; + /* + * Good, we can skip the atime update: + */ + return 0; +} + +/** + * touch_atime - update the access time + * @mnt: mount the inode is accessed on + * @dentry: dentry accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. 
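A hypothetical call site, for illustration only: after a successful read, a filesystem (or the generic read path on its behalf, typically via the file_accessed() wrapper, which also honours O_NOATIME) ends up doing something like:

	touch_atime(file->f_path.mnt, file->f_path.dentry);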
+ */ +void touch_atime(struct vfsmount *mnt, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct timespec now; + + if (inode->i_flags & S_NOATIME) + return; + if (IS_NOATIME(inode)) + return; + if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + return; + + if (mnt->mnt_flags & MNT_NOATIME) + return; + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return; + + now = current_fs_time(inode->i_sb); + + if (!relatime_need_update(mnt, inode, now)) + return; + + if (timespec_equal(&inode->i_atime, &now)) + return; + + if (mnt_want_write(mnt)) + return; + + inode->i_atime = now; + mark_inode_dirty_sync(inode); + mnt_drop_write(mnt); +} +EXPORT_SYMBOL(touch_atime); + +/** + * file_update_time - update mtime and ctime time + * @file: file accessed + * + * Update the mtime and ctime members of an inode and mark the inode + * for writeback. Note that this function is meant exclusively for + * usage in the file write path of filesystems, and filesystems may + * choose to explicitly ignore update via this function with the + * S_NOCMTIME inode flag, e.g. for network filesystem where these + * timestamps are handled by the server. + */ + +void file_update_time(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct timespec now; + enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; + + /* First try to exhaust all avenues to not sync */ + if (IS_NOCMTIME(inode)) + return; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + sync_it = S_MTIME; + + if (!timespec_equal(&inode->i_ctime, &now)) + sync_it |= S_CTIME; + + if (IS_I_VERSION(inode)) + sync_it |= S_VERSION; + + if (!sync_it) + return; + + /* Finally allowed to write? Takes lock. */ + if (mnt_want_write_file(file)) + return; + + /* Only change inode inside the lock region */ + if (sync_it & S_VERSION) + inode_inc_iversion(inode); + if (sync_it & S_CTIME) + inode->i_ctime = now; + if (sync_it & S_MTIME) + inode->i_mtime = now; + mark_inode_dirty_sync(inode); + mnt_drop_write(file->f_path.mnt); +} +EXPORT_SYMBOL(file_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +int inode_wait(void *word) +{ + schedule(); + return 0; +} +EXPORT_SYMBOL(inode_wait); + +/* + * If we try to find an inode in the inode hash while it is being + * deleted, we have to wait until the filesystem completes its + * deletion before reporting that it isn't found. This function waits + * until the deletion _might_ have completed. Callers are responsible + * to recheck inode state. + * + * It doesn't matter if I_NEW is not set initially, a call to + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. 
+ */ +static void __wait_on_freeing_inode(struct inode *inode) +{ + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + schedule(); + finish_wait(wq, &wait.wait); + spin_lock(&inode_hash_lock); +} + +static __initdata unsigned long ihash_entries; +static int __init set_ihash_entries(char *str) +{ + if (!str) + return 0; + ihash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("ihash_entries=", set_ihash_entries); + +/* + * Initialize the waitqueues and inode hash table. + */ +void __init inode_init_early(void) +{ + int loop; + + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + HASH_EARLY, + &i_hash_shift, + &i_hash_mask, + 0); + + for (loop = 0; loop < (1 << i_hash_shift); loop++) + INIT_HLIST_HEAD(&inode_hashtable[loop]); +} + +void __init inode_init(void) +{ + int loop; + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", + sizeof(struct inode), + 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), + init_once); + register_shrinker(&icache_shrinker); + + /* Hash may have been set up in inode_init_early */ + if (!hashdist) + return; + + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + 0, + &i_hash_shift, + &i_hash_mask, + 0); + + for (loop = 0; loop < (1 << i_hash_shift); loop++) + INIT_HLIST_HEAD(&inode_hashtable[loop]); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = rdev; + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = rdev; + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) + inode->i_fop = &bad_sock_fops; + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" + " inode %s:%lu\n", mode, inode->i_sb->s_id, + inode->i_ino); +} +EXPORT_SYMBOL(init_special_inode); + +/** + * inode_init_owner - Init uid,gid,mode for new inode according to posix standards + * @inode: New inode + * @dir: Directory inode + * @mode: mode of the new inode + */ +void inode_init_owner(struct inode *inode, const struct inode *dir, + mode_t mode) +{ + inode->i_uid = current_fsuid(); + if (dir && dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +} +EXPORT_SYMBOL(inode_init_owner); + +/** + * inode_owner_or_capable - check current task permissions to inode + * @inode: inode being checked + * + * Return true if current either has CAP_FOWNER to the inode, or + * owns the file. + */ +bool inode_owner_or_capable(const struct inode *inode) +{ + struct user_namespace *ns = inode_userns(inode); + + if (current_user_ns() == ns && current_fsuid() == inode->i_uid) + return true; + if (ns_capable(ns, CAP_FOWNER)) + return true; + return false; +} +EXPORT_SYMBOL(inode_owner_or_capable); diff --git a/fs/internal.h b/fs/internal.h new file mode 100644 index 00000000..b29c46e4 --- /dev/null +++ b/fs/internal.h @@ -0,0 +1,137 @@ +/* fs/ internal definitions + * + * Copyright (C) 2006 Red Hat, Inc. 
All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +struct super_block; +struct file_system_type; +struct linux_binprm; +struct path; + +/* + * block_dev.c + */ +#ifdef CONFIG_BLOCK +extern struct super_block *blockdev_superblock; +extern void __init bdev_cache_init(void); + +static inline int sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} + +extern int __sync_blockdev(struct block_device *bdev, int wait); + +#else +static inline void bdev_cache_init(void) +{ +} + +static inline int sb_is_blkdev_sb(struct super_block *sb) +{ + return 0; +} + +static inline int __sync_blockdev(struct block_device *bdev, int wait) +{ + return 0; +} +#endif + +/* + * char_dev.c + */ +extern void __init chrdev_init(void); + +/* + * exec.c + */ +extern int check_unsafe_exec(struct linux_binprm *); + +/* + * namespace.c + */ +extern int copy_mount_options(const void __user *, unsigned long *); +extern int copy_mount_string(const void __user *, char **); + +extern unsigned int mnt_get_count(struct vfsmount *mnt); +extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); +extern struct vfsmount *lookup_mnt(struct path *); +extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, + struct vfsmount *); +extern void release_mounts(struct list_head *); +extern void umount_tree(struct vfsmount *, int, struct list_head *); +extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); +extern int finish_automount(struct vfsmount *, struct path *); + +extern void mnt_make_longterm(struct vfsmount *); +extern void mnt_make_shortterm(struct vfsmount *); + +extern void __init mnt_init(void); + +DECLARE_BRLOCK(vfsmount_lock); + + +/* + * fs_struct.c + */ +extern void chroot_fs_refs(struct path *, struct path *); + +/* + * file_table.c + */ +extern void file_sb_list_add(struct file *f, struct super_block *sb); +extern void file_sb_list_del(struct file *f); +extern void mark_files_ro(struct super_block *); +extern struct file *get_empty_filp(void); + +/* + * super.c + */ +extern int do_remount_sb(struct super_block *, int, void *, int); +extern void __put_super(struct super_block *sb); +extern void put_super(struct super_block *sb); +extern struct dentry *mount_fs(struct file_system_type *, + int, const char *, void *); + +/* + * open.c + */ +struct nameidata; +extern struct file *nameidata_to_filp(struct nameidata *); +extern void release_open_intent(struct nameidata *); +struct open_flags { + int open_flag; + int mode; + int acc_mode; + int intent; +}; +extern struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int lookup_flags); +extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, + const char *, const struct open_flags *, int lookup_flags); + +extern long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag); + +/* + * inode.c + */ +extern spinlock_t inode_sb_list_lock; + +/* + * fs-writeback.c + */ +extern void inode_wb_list_del(struct inode *inode); + +extern int get_nr_dirty_inodes(void); +extern void evict_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *, bool); diff --git a/fs/ioctl.c b/fs/ioctl.c new file mode 100644 index 
00000000..1d9b9fcb --- /dev/null +++ b/fs/ioctl.c @@ -0,0 +1,623 @@ +/* + * linux/fs/ioctl.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* So that the fiemap access checks can't overflow on 32 bit machines. */ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + +/** + * vfs_ioctl - call filesystem specific ioctl methods + * @filp: open file to invoke ioctl method on + * @cmd: ioctl command to execute + * @arg: command-specific argument for ioctl + * + * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise + * returns -ENOTTY. + * + * Returns 0 on success, -errno on error. + */ +static long vfs_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int error = -ENOTTY; + + if (!filp->f_op || !filp->f_op->unlocked_ioctl) + goto out; + + error = filp->f_op->unlocked_ioctl(filp, cmd, arg); + if (error == -ENOIOCTLCMD) + error = -EINVAL; + out: + return error; +} + +static int ioctl_fibmap(struct file *filp, int __user *p) +{ + struct address_space *mapping = filp->f_mapping; + int res, block; + + /* do we support this mess? */ + if (!mapping->a_ops->bmap) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + res = get_user(block, p); + if (res) + return res; + res = mapping->a_ops->bmap(mapping, block); + return put_user(res, p); +} + +/** + * fiemap_fill_next_extent - Fiemap helper function + * @fieinfo: Fiemap context passed into ->fiemap + * @logical: Extent logical start offset, in bytes + * @phys: Extent physical start offset, in bytes + * @len: Extent length, in bytes + * @flags: FIEMAP_EXTENT flags that describe this extent + * + * Called from file system ->fiemap callback. Will populate extent + * info as passed in via arguments and copy to user memory. On + * success, extent count on fieinfo is incremented. + * + * Returns 0 on success, -errno on error, 1 if this was the last + * extent that will fit in user array. + */ +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) +int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, + u64 phys, u64 len, u32 flags) +{ + struct fiemap_extent extent; + struct fiemap_extent __user *dest = fieinfo->fi_extents_start; + + /* only count the extents */ + if (fieinfo->fi_extents_max == 0) { + fieinfo->fi_extents_mapped++; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; + } + + if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) + return 1; + + if (flags & SET_UNKNOWN_FLAGS) + flags |= FIEMAP_EXTENT_UNKNOWN; + if (flags & SET_NO_UNMOUNTED_IO_FLAGS) + flags |= FIEMAP_EXTENT_ENCODED; + if (flags & SET_NOT_ALIGNED_FLAGS) + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + + memset(&extent, 0, sizeof(extent)); + extent.fe_logical = logical; + extent.fe_physical = phys; + extent.fe_length = len; + extent.fe_flags = flags; + + dest += fieinfo->fi_extents_mapped; + if (copy_to_user(dest, &extent, sizeof(extent))) + return -EFAULT; + + fieinfo->fi_extents_mapped++; + if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) + return 1; + return (flags & FIEMAP_EXTENT_LAST) ? 
1 : 0; +} +EXPORT_SYMBOL(fiemap_fill_next_extent); + +/** + * fiemap_check_flags - check validity of requested flags for fiemap + * @fieinfo: Fiemap context passed into ->fiemap + * @fs_flags: Set of fiemap flags that the file system understands + * + * Called from file system ->fiemap callback. This will compute the + * intersection of valid fiemap flags and those that the fs supports. That + * value is then compared against the user supplied flags. In case of bad user + * flags, the invalid values will be written into the fieinfo structure, and + * -EBADR is returned, which tells ioctl_fiemap() to return those values to + * userspace. For this reason, a return code of -EBADR should be preserved. + * + * Returns 0 on success, -EBADR on bad flags. + */ +int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) +{ + u32 incompat_flags; + + incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); + if (incompat_flags) { + fieinfo->fi_flags = incompat_flags; + return -EBADR; + } + return 0; +} +EXPORT_SYMBOL(fiemap_check_flags); + +static int fiemap_check_ranges(struct super_block *sb, + u64 start, u64 len, u64 *new_len) +{ + u64 maxbytes = (u64) sb->s_maxbytes; + + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; + + return 0; +} + +static int ioctl_fiemap(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + u64 len; + int error; + + if (!inode->i_op->fiemap) + return -EOPNOTSUPP; + + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, + &len); + if (error) + return error; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = ufiemap->fm_extents; + + if (fiemap.fm_extent_count != 0 && + !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, + fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) + return -EFAULT; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + +#ifdef CONFIG_BLOCK + +static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +{ + return (offset >> inode->i_blkbits); +} + +static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +{ + return (blk << inode->i_blkbits); +} + +/** + * __generic_block_fiemap - FIEMAP for block based inodes (no locking) + * @inode: the inode to map + * @fieinfo: the fiemap info struct that will be passed back to userspace + * @start: where to start mapping in the inode + * @len: how much space to map + * @get_block: the fs's get_block function + * + * This does FIEMAP for block based inodes. Basically it will just loop + * through get_block until we hit the number of extents we want to map, or we + * go past the end of the file and hit a hole. 
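In practice a block-based filesystem rarely calls the unlocked variant directly; it usually wires its ->fiemap method to the locked wrapper generic_block_fiemap() (defined further down in this file) and supplies its own get_block routine. The names myfs_fiemap() and myfs_get_block() below are placeholders, not part of the patch:

static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		       u64 start, u64 len)
{
	return generic_block_fiemap(inode, fieinfo, start, len, myfs_get_block);
}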
+ * + * If it is possible to have data blocks beyond a hole past @inode->i_size, then + * please do not use this function, it will stop at the first unmapped block + * beyond i_size. + * + * If you use this function directly, you need to do your own locking. Use + * generic_block_fiemap if you want the locking done for you. + */ + +int __generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, loff_t start, + loff_t len, get_block_t *get_block) +{ + struct buffer_head map_bh; + sector_t start_blk, last_blk; + loff_t isize = i_size_read(inode); + u64 logical = 0, phys = 0, size = 0; + u32 flags = FIEMAP_EXTENT_MERGED; + bool past_eof = false, whole_file = false; + int ret = 0; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + /* + * Either the i_mutex or other appropriate locking needs to be held + * since we expect isize to not change at all through the duration of + * this call. + */ + if (len >= isize) { + whole_file = true; + len = isize; + } + + /* + * Some filesystems can't deal with being asked to map less than + * blocksize, so make sure our len is at least block length. + */ + if (logical_to_blk(inode, len) == 0) + len = blk_to_logical(inode, 1); + + start_blk = logical_to_blk(inode, start); + last_blk = logical_to_blk(inode, start + len - 1); + + do { + /* + * we set b_size to the total size we want so it will map as + * many contiguous blocks as possible at once + */ + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len; + + ret = get_block(inode, start_blk, &map_bh, 0); + if (ret) + break; + + /* HOLE */ + if (!buffer_mapped(&map_bh)) { + start_blk++; + + /* + * We want to handle the case where there is an + * allocated block at the front of the file, and then + * nothing but holes up to the end of the file properly, + * to make sure that extent at the front gets properly + * marked with FIEMAP_EXTENT_LAST + */ + if (!past_eof && + blk_to_logical(inode, start_blk) >= isize) + past_eof = 1; + + /* + * First hole after going past the EOF, this is our + * last extent + */ + if (past_eof && size) { + flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + } else if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + size = 0; + } + + /* if we have holes up to/past EOF then we're done */ + if (start_blk > last_blk || past_eof || ret) + break; + } else { + /* + * We have gone over the length of what we wanted to + * map, and it wasn't the entire file, so add the extent + * we got last time and exit. + * + * This is for the case where say we want to map all the + * way up to the second to the last block in a file, but + * the last block is a hole, making the second to last + * block FIEMAP_EXTENT_LAST. In this case we want to + * see if there is a hole after the second to last block + * so we can mark it properly. If we found data after + * we exceeded the length we were requesting, then we + * are good to go, just add the extent to the fieinfo + * and break + */ + if (start_blk > last_blk && !whole_file) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + break; + } + + /* + * if size != 0 then we know we already have an extent + * to add, so add it. 
+ */ + if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + if (ret) + break; + } + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = FIEMAP_EXTENT_MERGED; + + start_blk += logical_to_blk(inode, size); + + /* + * If we are past the EOF, then we need to make sure as + * soon as we find a hole that the last extent we found + * is marked with FIEMAP_EXTENT_LAST + */ + if (!past_eof && logical + size >= isize) + past_eof = true; + } + cond_resched(); + } while (1); + + /* If ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + return ret; +} +EXPORT_SYMBOL(__generic_block_fiemap); + +/** + * generic_block_fiemap - FIEMAP for block based inodes + * @inode: The inode to map + * @fieinfo: The mapping information + * @start: The initial block to map + * @len: The length of the extent to attempt to map + * @get_block: The block mapping function for the fs + * + * Calls __generic_block_fiemap to map the inode, after taking + * the inode's mutex lock. + */ + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len, get_block_t *get_block) +{ + int ret; + mutex_lock(&inode->i_mutex); + ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); + mutex_unlock(&inode->i_mutex); + return ret; +} +EXPORT_SYMBOL(generic_block_fiemap); + +#endif /* CONFIG_BLOCK */ + +/* + * This provides compatibility with legacy XFS pre-allocation ioctls + * which predate the fallocate syscall. + * + * Only the l_start, l_len and l_whence fields of the 'struct space_resv' + * are used here, the rest are ignored. + */ +int ioctl_preallocate(struct file *filp, void __user *argp) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct space_resv sr; + + if (copy_from_user(&sr, argp, sizeof(sr))) + return -EFAULT; + + switch (sr.l_whence) { + case SEEK_SET: + break; + case SEEK_CUR: + sr.l_start += filp->f_pos; + break; + case SEEK_END: + sr.l_start += i_size_read(inode); + break; + default: + return -EINVAL; + } + + return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); +} + +static int file_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int __user *p = (int __user *)arg; + + switch (cmd) { + case FIBMAP: + return ioctl_fibmap(filp, p); + case FIONREAD: + return put_user(i_size_read(inode) - filp->f_pos, p); + case FS_IOC_RESVSP: + case FS_IOC_RESVSP64: + return ioctl_preallocate(filp, p); + } + + return vfs_ioctl(filp, cmd, arg); +} + +static int ioctl_fionbio(struct file *filp, int __user *argp) +{ + unsigned int flag; + int on, error; + + error = get_user(on, argp); + if (error) + return error; + flag = O_NONBLOCK; +#ifdef __sparc__ + /* SunOS compatibility item. */ + if (O_NONBLOCK != O_NDELAY) + flag |= O_NDELAY; +#endif + spin_lock(&filp->f_lock); + if (on) + filp->f_flags |= flag; + else + filp->f_flags &= ~flag; + spin_unlock(&filp->f_lock); + return error; +} + +static int ioctl_fioasync(unsigned int fd, struct file *filp, + int __user *argp) +{ + unsigned int flag; + int on, error; + + error = get_user(on, argp); + if (error) + return error; + flag = on ? FASYNC : 0; + + /* Did FASYNC state change ? */ + if ((flag ^ filp->f_flags) & FASYNC) { + if (filp->f_op && filp->f_op->fasync) + /* fasync() adjusts filp->f_flags */ + error = filp->f_op->fasync(fd, filp, on); + else + error = -ENOTTY; + } + return error < 0 ?
error : 0; +} + +static int ioctl_fsfreeze(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If filesystem doesn't support freeze feature, return. */ + if (sb->s_op->freeze_fs == NULL) + return -EOPNOTSUPP; + + /* Freeze */ + return freeze_super(sb); +} + +static int ioctl_fsthaw(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* Thaw */ + return thaw_super(sb); +} + +/* + * When you add any new common ioctls to the switches above and below + * please update compat_sys_ioctl() too. + * + * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. + * It's just a simple helper for sys_ioctl and compat_sys_ioctl. + */ +int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, + unsigned long arg) +{ + int error = 0; + int __user *argp = (int __user *)arg; + struct inode *inode = filp->f_path.dentry->d_inode; + + switch (cmd) { + case FIOCLEX: + set_close_on_exec(fd, 1); + break; + + case FIONCLEX: + set_close_on_exec(fd, 0); + break; + + case FIONBIO: + error = ioctl_fionbio(filp, argp); + break; + + case FIOASYNC: + error = ioctl_fioasync(fd, filp, argp); + break; + + case FIOQSIZE: + if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + loff_t res = inode_get_bytes(inode); + error = copy_to_user(argp, &res, sizeof(res)) ? + -EFAULT : 0; + } else + error = -ENOTTY; + break; + + case FIFREEZE: + error = ioctl_fsfreeze(filp); + break; + + case FITHAW: + error = ioctl_fsthaw(filp); + break; + + case FS_IOC_FIEMAP: + return ioctl_fiemap(filp, arg); + + case FIGETBSZ: + return put_user(inode->i_sb->s_blocksize, argp); + + default: + if (S_ISREG(inode->i_mode)) + error = file_ioctl(filp, cmd, arg); + else + error = vfs_ioctl(filp, cmd, arg); + break; + } + return error; +} + +SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +{ + struct file *filp; + int error = -EBADF; + int fput_needed; + + filp = fget_light(fd, &fput_needed); + if (!filp) + goto out; + + error = security_file_ioctl(filp, cmd, arg); + if (error) + goto out_fput; + + error = do_vfs_ioctl(filp, fd, cmd, arg); + out_fput: + fput_light(filp, fput_needed); + out: + return error; +} diff --git a/fs/ioprio.c b/fs/ioprio.c new file mode 100644 index 00000000..7da2a065 --- /dev/null +++ b/fs/ioprio.c @@ -0,0 +1,250 @@ +/* + * fs/ioprio.c + * + * Copyright (C) 2004 Jens Axboe + * + * Helper functions for setting/querying io priorities of processes. The + * system calls closely mimic getpriority/setpriority, see the man page for + * those. The prio argument is a composite of prio class and prio data, where + * the data argument has meaning within that class. The standard scheduling + * classes have 8 distinct prio levels, with 0 being the highest prio and 7 + * being the lowest.
+ * + * IOW, setting BE scheduling class with prio 2 is done ala: + * + * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; + * + * ioprio_set(PRIO_PROCESS, pid, prio); + * + * See also Documentation/block/ioprio.txt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; + + rcu_read_lock(); + tcred = __task_cred(task); + if (tcred->uid != cred->euid && + tcred->uid != cred->uid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + err = security_task_setioprio(task, ioprio); + if (err) + return err; + + task_lock(task); + do { + ioc = task->io_context; + /* see wmb() in current_io_context() */ + smp_read_barrier_depends(); + if (ioc) + break; + + ioc = alloc_io_context(GFP_ATOMIC, -1); + if (!ioc) { + err = -ENOMEM; + break; + } + task->io_context = ioc; + } while (1); + + if (!err) { + ioc->ioprio = ioprio; + ioc->ioprio_changed = 1; + } + + task_unlock(task); + return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + +SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) +{ + int class = IOPRIO_PRIO_CLASS(ioprio); + int data = IOPRIO_PRIO_DATA(ioprio); + struct task_struct *p, *g; + struct user_struct *user; + struct pid *pgrp; + int ret; + + switch (class) { + case IOPRIO_CLASS_RT: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* fall through, rt has prio field too */ + case IOPRIO_CLASS_BE: + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + break; + case IOPRIO_CLASS_IDLE: + break; + case IOPRIO_CLASS_NONE: + if (data) + return -EINVAL; + break; + default: + return -EINVAL; + } + + ret = -ESRCH; + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = set_task_ioprio(p, ioprio); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + ret = set_task_ioprio(p, ioprio); + if (ret) + break; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) + user = current_user(); + else + user = find_user(who); + + if (!user) + break; + + do_each_thread(g, p) { + if (__task_cred(p)->uid != who) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + goto free_uid; + } while_each_thread(g, p); +free_uid: + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} + +static int get_task_ioprio(struct task_struct *p) +{ + int ret; + + ret = security_task_getioprio(p); + if (ret) + goto out; + ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + if (p->io_context) + ret = p->io_context->ioprio; +out: + return ret; +} + +int ioprio_best(unsigned short aprio, unsigned short bprio) +{ + unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); + unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); + + if (aclass == IOPRIO_CLASS_NONE) + aclass = IOPRIO_CLASS_BE; + if (bclass == IOPRIO_CLASS_NONE) + bclass = IOPRIO_CLASS_BE; + + if (aclass == bclass) + return min(aprio, bprio); + if (aclass > bclass) + return bprio; + else + return aprio; +} + +SYSCALL_DEFINE2(ioprio_get, int, which, int, who) +{ + struct task_struct *g, *p; + struct user_struct *user; + struct pid *pgrp; + int ret = -ESRCH; + int tmpio; + + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if 
(!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = get_task_ioprio(p); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) + user = current_user(); + else + user = find_user(who); + + if (!user) + break; + + do_each_thread(g, p) { + if (__task_cred(p)->uid != user->uid) + continue; + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } while_each_thread(g, p); + + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig new file mode 100644 index 00000000..8ab9878e --- /dev/null +++ b/fs/isofs/Kconfig @@ -0,0 +1,39 @@ +config ISO9660_FS + tristate "ISO 9660 CDROM file system support" + help + This is the standard file system used on CD-ROMs. It was previously + known as "High Sierra File System" and is called "hsfs" on other + Unix systems. The so-called Rock-Ridge extensions which allow for + long Unix filenames and symbolic links are also supported by this + driver. If you have a CD-ROM drive and want to do more with it than + just listen to audio CDs and watch its LEDs, say Y (and read + and the CD-ROM-HOWTO, + available from ), thereby + enlarging your kernel by about 27 KB; otherwise say N. + + To compile this file system support as a module, choose M here: the + module will be called isofs. + +config JOLIET + bool "Microsoft Joliet CDROM extensions" + depends on ISO9660_FS + select NLS + help + Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system + which allows for long filenames in unicode format (unicode is the + new 16 bit character code, successor to ASCII, which encodes the + characters of almost all languages of the world; see + for more information). Say Y here if you + want to be able to read Joliet CD-ROMs under Linux. + +config ZISOFS + bool "Transparent decompression extension" + depends on ISO9660_FS + select ZLIB_INFLATE + help + This is a Linux-specific extension to RockRidge which lets you store + data in compressed form on a CD-ROM and have it transparently + decompressed when the CD-ROM is accessed. See + for the tools + necessary to create such a filesystem. Say Y here if you want to be + able to read such compressed CD-ROMs. diff --git a/fs/isofs/Makefile b/fs/isofs/Makefile new file mode 100644 index 00000000..bf162f09 --- /dev/null +++ b/fs/isofs/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux isofs filesystem routines. +# + +obj-$(CONFIG_ISO9660_FS) += isofs.o + +isofs-objs-y := namei.o inode.o dir.o util.o rock.o export.o +isofs-objs-$(CONFIG_JOLIET) += joliet.o +isofs-objs-$(CONFIG_ZISOFS) += compress.o +isofs-objs := $(isofs-objs-y) diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c new file mode 100644 index 00000000..0b3fa797 --- /dev/null +++ b/fs/isofs/compress.c @@ -0,0 +1,380 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2001 H. 
Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * linux/fs/isofs/compress.c + * + * Transparent decompression of files on an iso9660 filesystem + */ + +#include +#include + +#include +#include + +#include "isofs.h" +#include "zisofs.h" + +/* This should probably be global. */ +static char zisofs_sink_page[PAGE_CACHE_SIZE]; + +/* + * This contains the zlib memory allocation and the mutex for the + * allocation; this avoids failures at block-decompression time. + */ +static void *zisofs_zlib_workspace; +static DEFINE_MUTEX(zisofs_zlib_lock); + +/* + * Read data of @inode from @block_start to @block_end and uncompress + * to one zisofs block. Store the data in the @pages array with @pcount + * entries. Start storing at offset @poffset of the first page. + */ +static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, + loff_t block_end, int pcount, + struct page **pages, unsigned poffset, + int *errp) +{ + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned int bufshift = ISOFS_BUFFER_BITS(inode); + unsigned int bufmask = bufsize - 1; + int i, block_size = block_end - block_start; + z_stream stream = { .total_out = 0, + .avail_in = 0, + .avail_out = 0, }; + int zerr; + int needblocks = (block_size + (block_start & bufmask) + bufmask) + >> bufshift; + int haveblocks; + blkcnt_t blocknum; + struct buffer_head *bhs[needblocks + 1]; + int curbh, curpage; + + if (block_size > deflateBound(1UL << zisofs_block_shift)) { + *errp = -EIO; + return 0; + } + /* Empty block? */ + if (block_size == 0) { + for ( i = 0 ; i < pcount ; i++ ) { + if (!pages[i]) + continue; + memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE); + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); + } + return ((loff_t)pcount) << PAGE_CACHE_SHIFT; + } + + /* Because zlib is not thread-safe, do all the I/O at the top. */ + blocknum = block_start >> bufshift; + memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *)); + haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); + ll_rw_block(READ, haveblocks, bhs); + + curbh = 0; + curpage = 0; + /* + * First block is special since it may be fractional. We also wait for + * it before grabbing the zlib mutex; odds are that the subsequent + * blocks are going to come in in short order so we don't hold the zlib + * mutex longer than necessary. 
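
For illustration only (not part of the patch), the block-count rounding used in zisofs_uncompress_block() above can be checked with a small standalone userspace program; the byte offsets and the 2048-byte buffer size are hypothetical:

#include <stdio.h>

int main(void)
{
	/* Hypothetical compressed extent: bytes 3000..7499 of the image. */
	unsigned long block_start = 3000, block_end = 7500;
	unsigned int bufshift = 11;                     /* 2048-byte buffers */
	unsigned int bufsize = 1u << bufshift, bufmask = bufsize - 1;
	int block_size = block_end - block_start;

	/* Same rounding as above: the leading fraction of the first buffer
	 * plus the extent length, rounded up to whole device buffers. */
	int needblocks = (block_size + (block_start & bufmask) + bufmask) >> bufshift;

	printf("%d compressed bytes starting at %lu span %d device blocks\n",
	       block_size, block_start, needblocks);    /* prints 3 */
	return 0;
}
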
+ */ + + if (!bhs[0]) + goto b_eio; + + wait_on_buffer(bhs[0]); + if (!buffer_uptodate(bhs[0])) { + *errp = -EIO; + goto b_eio; + } + + stream.workspace = zisofs_zlib_workspace; + mutex_lock(&zisofs_zlib_lock); + + zerr = zlib_inflateInit(&stream); + if (zerr != Z_OK) { + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else + *errp = -EIO; + printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n", + zerr); + goto z_eio; + } + + while (curpage < pcount && curbh < haveblocks && + zerr != Z_STREAM_END) { + if (!stream.avail_out) { + if (pages[curpage]) { + stream.next_out = page_address(pages[curpage]) + + poffset; + stream.avail_out = PAGE_CACHE_SIZE - poffset; + poffset = 0; + } else { + stream.next_out = (void *)&zisofs_sink_page; + stream.avail_out = PAGE_CACHE_SIZE; + } + } + if (!stream.avail_in) { + wait_on_buffer(bhs[curbh]); + if (!buffer_uptodate(bhs[curbh])) { + *errp = -EIO; + break; + } + stream.next_in = bhs[curbh]->b_data + + (block_start & bufmask); + stream.avail_in = min_t(unsigned, bufsize - + (block_start & bufmask), + block_size); + block_size -= stream.avail_in; + block_start = 0; + } + + while (stream.avail_out && stream.avail_in) { + zerr = zlib_inflate(&stream, Z_SYNC_FLUSH); + if (zerr == Z_BUF_ERROR && stream.avail_in == 0) + break; + if (zerr == Z_STREAM_END) + break; + if (zerr != Z_OK) { + /* EOF, error, or trying to read beyond end of input */ + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else { + printk(KERN_DEBUG + "zisofs: zisofs_inflate returned" + " %d, inode = %lu," + " page idx = %d, bh idx = %d," + " avail_in = %d," + " avail_out = %d\n", + zerr, inode->i_ino, curpage, + curbh, stream.avail_in, + stream.avail_out); + *errp = -EIO; + } + goto inflate_out; + } + } + + if (!stream.avail_out) { + /* This page completed */ + if (pages[curpage]) { + flush_dcache_page(pages[curpage]); + SetPageUptodate(pages[curpage]); + } + curpage++; + } + if (!stream.avail_in) + curbh++; + } +inflate_out: + zlib_inflateEnd(&stream); + +z_eio: + mutex_unlock(&zisofs_zlib_lock); + +b_eio: + for (i = 0; i < haveblocks; i++) + brelse(bhs[i]); + return stream.total_out; +} + +/* + * Uncompress data so that pages[full_page] is fully uptodate and possibly + * fills in other pages if we have data for them. + */ +static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, + struct page **pages) +{ + loff_t start_off, end_off; + loff_t block_start, block_end; + unsigned int header_size = ISOFS_I(inode)->i_format_parm[0]; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int blockptr; + loff_t poffset = 0; + blkcnt_t cstart_block, cend_block; + struct buffer_head *bh; + unsigned int blkbits = ISOFS_BUFFER_BITS(inode); + unsigned int blksize = 1 << blkbits; + int err; + loff_t ret; + + BUG_ON(!pages[full_page]); + + /* + * We want to read at least 'full_page' page. Because we have to + * uncompress the whole compression block anyway, fill the surrounding + * pages with the data we have anyway... 
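
As a rough userspace analogue of the decompression loop above (using ordinary zlib rather than the kernel's zlib_* wrappers, and a made-up input buffer), the same avail_in/avail_out refill pattern looks like this:

#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	const char text[] = "zisofs stores each block as an ordinary zlib stream";
	unsigned char comp[256], out[256];
	uLongf clen = sizeof(comp);
	z_stream strm;
	int zerr;

	/* Produce some compressed input to feed the inflate loop. */
	if (compress(comp, &clen, (const Bytef *)text, sizeof(text)) != Z_OK)
		return 1;

	memset(&strm, 0, sizeof(strm));
	if (inflateInit(&strm) != Z_OK)
		return 1;

	strm.next_in = comp;
	strm.avail_in = clen;
	strm.next_out = out;
	strm.avail_out = sizeof(out);

	/* Same shape as the kernel loop: keep inflating while progress is
	 * possible and both the input and output windows have room. */
	do {
		zerr = inflate(&strm, Z_SYNC_FLUSH);
	} while (zerr == Z_OK && strm.avail_in && strm.avail_out);

	inflateEnd(&strm);
	printf("%s (%lu bytes out)\n", out, strm.total_out);
	return zerr == Z_STREAM_END ? 0 : 1;
}
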
+ */ + start_off = page_offset(pages[full_page]); + end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size); + + cstart_block = start_off >> zisofs_block_shift; + cend_block = (end_off + (1 << zisofs_block_shift) - 1) + >> zisofs_block_shift; + + WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) != + ((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK)); + + /* Find the pointer to this specific chunk */ + /* Note: we're not using isonum_731() here because the data is known aligned */ + /* Note: header_size is in 32-bit words (4 bytes) */ + blockptr = (header_size + cstart_block) << 2; + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + block_start = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + + while (cstart_block < cend_block && pcount > 0) { + /* Load end of the compressed block in the file */ + blockptr += 4; + /* Traversed to next block? */ + if (!(blockptr & (blksize - 1))) { + brelse(bh); + + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + } + block_end = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + if (block_start > block_end) { + brelse(bh); + return -EIO; + } + err = 0; + ret = zisofs_uncompress_block(inode, block_start, block_end, + pcount, pages, poffset, &err); + poffset += ret; + pages += poffset >> PAGE_CACHE_SHIFT; + pcount -= poffset >> PAGE_CACHE_SHIFT; + full_page -= poffset >> PAGE_CACHE_SHIFT; + poffset &= ~PAGE_CACHE_MASK; + + if (err) { + brelse(bh); + /* + * Did we finish reading the page we really wanted + * to read? + */ + if (full_page < 0) + return 0; + return err; + } + + block_start = block_end; + cstart_block++; + } + + if (poffset && *pages) { + memset(page_address(*pages) + poffset, 0, + PAGE_CACHE_SIZE - poffset); + flush_dcache_page(*pages); + SetPageUptodate(*pages); + } + return 0; +} + +/* + * When decompressing, we typically obtain more than one page + * per reference. We inject the additional pages into the page + * cache as a form of readahead. + */ +static int zisofs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + int err; + int i, pcount, full_page; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int zisofs_pages_per_cblock = + PAGE_CACHE_SHIFT <= zisofs_block_shift ? + (1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0; + struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)]; + pgoff_t index = page->index, end_index; + + end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* + * If this page is wholly outside i_size we just return zero; + * do_generic_file_read() will handle this for us + */ + if (index >= end_index) { + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + if (PAGE_CACHE_SHIFT <= zisofs_block_shift) { + /* We have already been given one page, this is the one + we must do. 
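
For reference, a standalone sketch of the page-to-compressed-block mapping computed above, assuming 4 KiB page cache pages and the common 32 KiB zisofs block size (both values are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;              /* 4 KiB page cache pages   */
	unsigned zisofs_block_shift = 15;      /* 32 KiB compressed blocks */
	unsigned pages_per_cblock = 1u << (zisofs_block_shift - page_shift);

	unsigned long index = 21;              /* hypothetical page index  */
	unsigned full_page = index & (pages_per_cblock - 1);

	printf("page %lu is sub-page %u of %u within its compressed block\n",
	       index, full_page, pages_per_cblock);   /* 5 of 8 */
	return 0;
}
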
*/ + full_page = index & (zisofs_pages_per_cblock - 1); + pcount = min_t(int, zisofs_pages_per_cblock, + end_index - (index & ~(zisofs_pages_per_cblock - 1))); + index -= full_page; + } else { + full_page = 0; + pcount = 1; + } + pages[full_page] = page; + + for (i = 0; i < pcount; i++, index++) { + if (i != full_page) + pages[i] = grab_cache_page_nowait(mapping, index); + if (pages[i]) { + ClearPageError(pages[i]); + kmap(pages[i]); + } + } + + err = zisofs_fill_pages(inode, full_page, pcount, pages); + + /* Release any residual pages, do not SetPageUptodate */ + for (i = 0; i < pcount; i++) { + if (pages[i]) { + flush_dcache_page(pages[i]); + if (i == full_page && err) + SetPageError(pages[i]); + kunmap(pages[i]); + unlock_page(pages[i]); + if (i != full_page) + page_cache_release(pages[i]); + } + } + + /* At this point, err contains 0 or -EIO depending on the "critical" page */ + return err; +} + +const struct address_space_operations zisofs_aops = { + .readpage = zisofs_readpage, + /* No sync_page operation supported? */ + /* No bmap operation supported */ +}; + +int __init zisofs_init(void) +{ + zisofs_zlib_workspace = vmalloc(zlib_inflate_workspacesize()); + if ( !zisofs_zlib_workspace ) + return -ENOMEM; + + return 0; +} + +void zisofs_cleanup(void) +{ + vfree(zisofs_zlib_workspace); +} diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c new file mode 100644 index 00000000..0542b6ee --- /dev/null +++ b/fs/isofs/dir.c @@ -0,0 +1,288 @@ +/* + * linux/fs/isofs/dir.c + * + * (C) 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + * + * Steve Beynon : Missing last directory entries fixed + * (stephen@askone.demon.co.uk) : 21st June 1996 + * + * isofs directory handling functions + */ +#include +#include "isofs.h" + +int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) +{ + char * old = de->name; + int len = de->name_len[0]; + int i; + + for (i = 0; i < len; i++) { + unsigned char c = old[i]; + if (!c) + break; + + if (c >= 'A' && c <= 'Z') + c |= 0x20; /* lower case */ + + /* Drop trailing '.;1' (ISO 9660:1988 7.5.1 requires period) */ + if (c == '.' && i == len - 3 && old[i + 1] == ';' && old[i + 2] == '1') + break; + + /* Drop trailing ';1' */ + if (c == ';' && i == len - 2 && old[i + 1] == '1') + break; + + /* Convert remaining ';' to '.' */ + /* Also '/' to '.' (broken Acorn-generated ISO9660 images) */ + if (c == ';' || c == '/') + c = '.'; + + new[i] = c; + } + return i; +} + +/* Acorn extensions written by Matthew Wilcox 1998 */ +int get_acorn_filename(struct iso_directory_record *de, + char *retname, struct inode *inode) +{ + int std; + unsigned char *chr; + int retnamlen = isofs_name_translate(de, retname, inode); + + if (retnamlen == 0) + return 0; + std = sizeof(struct iso_directory_record) + de->name_len[0]; + if (std & 1) + std++; + if ((*((unsigned char *) de) - std) != 32) + return retnamlen; + chr = ((unsigned char *) de) + std; + if (strncmp(chr, "ARCHIMEDES", 10)) + return retnamlen; + if ((*retname == '_') && ((chr[19] & 1) == 1)) + *retname = '!'; + if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff) + && ((chr[12] & 0xf0) == 0xf0)) { + retname[retnamlen] = ','; + sprintf(retname+retnamlen+1, "%3.3x", + ((chr[12] & 0xf) << 8) | chr[11]); + retnamlen += 4; + } + return retnamlen; +} + +/* + * This should _really_ be cleaned up some day.. 
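
The name translation in isofs_name_translate() above can be restated as a tiny userspace program (illustrative only): an ISO-level name such as "README.TXT;1" comes out as "readme.txt".

#include <stdio.h>
#include <string.h>

static int translate(const char *old, int len, char *out)
{
	int i;

	for (i = 0; i < len; i++) {
		char c = old[i];

		if (!c)
			break;
		if (c >= 'A' && c <= 'Z')
			c |= 0x20;                         /* lower case */
		if (c == '.' && i == len - 3 && old[i + 1] == ';' && old[i + 2] == '1')
			break;                             /* drop trailing ".;1" */
		if (c == ';' && i == len - 2 && old[i + 1] == '1')
			break;                             /* drop trailing ";1"  */
		if (c == ';' || c == '/')
			c = '.';                           /* remaining ';' and '/' become '.' */
		out[i] = c;
	}
	out[i] = '\0';
	return i;
}

int main(void)
{
	char buf[64];

	translate("README.TXT;1", strlen("README.TXT;1"), buf);
	printf("%s\n", buf);                               /* readme.txt */
	return 0;
}
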
+ */ +static int do_isofs_readdir(struct inode *inode, struct file *filp, + void *dirent, filldir_t filldir, + char *tmpname, struct iso_directory_record *tmpde) +{ + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned char bufbits = ISOFS_BUFFER_BITS(inode); + unsigned long block, offset, block_saved, offset_saved; + unsigned long inode_number = 0; /* Quiet GCC */ + struct buffer_head *bh = NULL; + int len; + int map; + int first_de = 1; + char *p = NULL; /* Quiet GCC */ + struct iso_directory_record *de; + struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); + + offset = filp->f_pos & (bufsize - 1); + block = filp->f_pos >> bufbits; + + while (filp->f_pos < inode->i_size) { + int de_len; + + if (!bh) { + bh = isofs_bread(inode, block); + if (!bh) + return 0; + } + + de = (struct iso_directory_record *) (bh->b_data + offset); + + de_len = *(unsigned char *) de; + + /* + * If the length byte is zero, we should move on to the next + * CDROM sector. If we are at the end of the directory, we + * kick out of the while loop. + */ + + if (de_len == 0) { + brelse(bh); + bh = NULL; + filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); + block = filp->f_pos >> bufbits; + offset = 0; + continue; + } + + block_saved = block; + offset_saved = offset; + offset += de_len; + + /* Make sure we have a full directory entry */ + if (offset >= bufsize) { + int slop = bufsize - offset + de_len; + memcpy(tmpde, de, slop); + offset &= bufsize - 1; + block++; + brelse(bh); + bh = NULL; + if (offset) { + bh = isofs_bread(inode, block); + if (!bh) + return 0; + memcpy((void *) tmpde + slop, bh->b_data, offset); + } + de = tmpde; + } + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < de->name_len[0] + + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + inode->i_ino); + return -EIO; + } + + if (first_de) { + isofs_normalize_block_and_offset(de, + &block_saved, + &offset_saved); + inode_number = isofs_get_ino(block_saved, + offset_saved, bufbits); + } + + if (de->flags[-sbi->s_high_sierra] & 0x80) { + first_de = 0; + filp->f_pos += de_len; + continue; + } + first_de = 1; + + /* Handle the case of the '.' directory */ + if (de->name_len[0] == 1 && de->name[0] == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) + break; + filp->f_pos += de_len; + continue; + } + + len = 0; + + /* Handle the case of the '..' directory */ + if (de->name_len[0] == 1 && de->name[0] == 1) { + inode_number = parent_ino(filp->f_path.dentry); + if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0) + break; + filp->f_pos += de_len; + continue; + } + + /* Handle everything else. Do name translation if there + is no Rock Ridge NM field. 
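
A quick numeric sketch of the split-entry handling above (buffer size and record length are hypothetical): a directory record that starts near the end of a block is reassembled from two pieces, the bytes left in the current block plus the remainder at the start of the next one. The kernel computes the same value after advancing offset; this restatement uses the pre-advance offset.

#include <stdio.h>

int main(void)
{
	unsigned bufsize = 2048;               /* ISOFS_BUFFER_SIZE            */
	unsigned offset = 2000, de_len = 80;   /* hypothetical record placement */
	unsigned end = offset + de_len;

	if (end >= bufsize) {
		unsigned slop = bufsize - offset;       /* bytes still in this block */
		unsigned rest = end & (bufsize - 1);    /* bytes in the next block   */
		printf("copy %u bytes now, %u from the next block\n", slop, rest);
	}
	return 0;
}
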
*/ + + /* + * Do not report hidden files if so instructed, or associated + * files unless instructed to do so + */ + if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || + (!sbi->s_showassoc && + (de->flags[-sbi->s_high_sierra] & 4))) { + filp->f_pos += de_len; + continue; + } + + map = 1; + if (sbi->s_rock) { + len = get_rock_ridge_filename(de, tmpname, inode); + if (len != 0) { /* may be -1 */ + p = tmpname; + map = 0; + } + } + if (map) { +#ifdef CONFIG_JOLIET + if (sbi->s_joliet_level) { + len = get_joliet_filename(de, tmpname, inode); + p = tmpname; + } else +#endif + if (sbi->s_mapping == 'a') { + len = get_acorn_filename(de, tmpname, inode); + p = tmpname; + } else + if (sbi->s_mapping == 'n') { + len = isofs_name_translate(de, tmpname, inode); + p = tmpname; + } else { + p = de->name; + len = de->name_len[0]; + } + } + if (len > 0) { + if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0) + break; + } + filp->f_pos += de_len; + + continue; + } + if (bh) + brelse(bh); + return 0; +} + +/* + * Handle allocation of temporary space for name translation and + * handling split directory entries.. The real work is done by + * "do_isofs_readdir()". + */ +static int isofs_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + int result; + char *tmpname; + struct iso_directory_record *tmpde; + struct inode *inode = filp->f_path.dentry->d_inode; + struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); + + tmpname = (char *)__get_free_page(GFP_KERNEL); + if (tmpname == NULL) + return -ENOMEM; + + mutex_lock(&sbi->s_mutex); + tmpde = (struct iso_directory_record *) (tmpname+1024); + + result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); + + free_page((unsigned long) tmpname); + mutex_unlock(&sbi->s_mutex); + return result; +} + +const struct file_operations isofs_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = isofs_readdir, +}; + +/* + * directories can handle most operations... + */ +const struct inode_operations isofs_dir_inode_operations = +{ + .lookup = isofs_lookup, +}; + + diff --git a/fs/isofs/export.c b/fs/isofs/export.c new file mode 100644 index 00000000..dd4687ff --- /dev/null +++ b/fs/isofs/export.c @@ -0,0 +1,196 @@ +/* + * fs/isofs/export.c + * + * (C) 2004 Paul Serice - The new inode scheme requires switching + * from iget() to iget5_locked() which means + * the NFS export operations have to be hand + * coded because the default routines rely on + * iget(). + * + * The following files are helpful: + * + * Documentation/filesystems/nfs/Exporting + * fs/exportfs/expfs.c. + */ + +#include "isofs.h" + +static struct dentry * +isofs_export_iget(struct super_block *sb, + unsigned long block, + unsigned long offset, + __u32 generation) +{ + struct inode *inode; + + if (block == 0) + return ERR_PTR(-ESTALE); + inode = isofs_iget(sb, block, offset); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +/* This function is surprisingly simple. The trick is understanding + * that "child" is always a directory. So, to find its parent, you + * simply need to find its ".." entry, normalize its block and offset, + * and return the underlying inode. See the comments for + * isofs_normalize_block_and_offset(). 
*/ +static struct dentry *isofs_export_get_parent(struct dentry *child) +{ + unsigned long parent_block = 0; + unsigned long parent_offset = 0; + struct inode *child_inode = child->d_inode; + struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); + struct iso_directory_record *de = NULL; + struct buffer_head * bh = NULL; + struct dentry *rv = NULL; + + /* "child" must always be a directory. */ + if (!S_ISDIR(child_inode->i_mode)) { + printk(KERN_ERR "isofs: isofs_export_get_parent(): " + "child is not a directory!\n"); + rv = ERR_PTR(-EACCES); + goto out; + } + + /* It is an invariant that the directory offset is zero. If + * it is not zero, it means the directory failed to be + * normalized for some reason. */ + if (e_child_inode->i_iget5_offset != 0) { + printk(KERN_ERR "isofs: isofs_export_get_parent(): " + "child directory not normalized!\n"); + rv = ERR_PTR(-EACCES); + goto out; + } + + /* The child inode has been normalized such that its + * i_iget5_block value points to the "." entry. Fortunately, + * the ".." entry is located in the same block. */ + parent_block = e_child_inode->i_iget5_block; + + /* Get the block in question. */ + bh = sb_bread(child_inode->i_sb, parent_block); + if (bh == NULL) { + rv = ERR_PTR(-EACCES); + goto out; + } + + /* This is the "." entry. */ + de = (struct iso_directory_record*)bh->b_data; + + /* The ".." entry is always the second entry. */ + parent_offset = (unsigned long)isonum_711(de->length); + de = (struct iso_directory_record*)(bh->b_data + parent_offset); + + /* Verify it is in fact the ".." entry. */ + if ((isonum_711(de->name_len) != 1) || (de->name[0] != 1)) { + printk(KERN_ERR "isofs: Unable to find the \"..\" " + "directory for NFS.\n"); + rv = ERR_PTR(-EACCES); + goto out; + } + + /* Normalize */ + isofs_normalize_block_and_offset(de, &parent_block, &parent_offset); + + rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block, + parent_offset)); + out: + if (bh) + brelse(bh); + return rv; +} + +static int +isofs_export_encode_fh(struct dentry *dentry, + __u32 *fh32, + int *max_len, + int connectable) +{ + struct inode * inode = dentry->d_inode; + struct iso_inode_info * ei = ISOFS_I(inode); + int len = *max_len; + int type = 1; + __u16 *fh16 = (__u16*)fh32; + + /* + * WARNING: max_len is 5 for NFSv2. Because of this + * limitation, we use the lower 16 bits of fh32[1] to hold the + * offset of the inode and the upper 16 bits of fh32[1] to + * hold the offset of the parent. 
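
To visualize the packing described in the WARNING above, here is a userspace sketch that fills the five 32-bit handle words the same way; the block, offset and generation values are hypothetical, and the 16-bit aliasing shown corresponds to a little-endian host.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t fh32[5] = { 0 };
	uint16_t *fh16 = (uint16_t *)fh32;   /* same aliasing trick as above */
	int i;

	fh32[0] = 1234;    /* i_iget5_block of the inode          */
	fh16[2] = 68;      /* inode offset, low half of fh32[1]   */
	fh16[3] = 0;       /* parent offset, high half of fh32[1] */
	fh32[2] = 1;       /* i_generation                        */
	fh32[3] = 1200;    /* parent's i_iget5_block              */
	fh32[4] = 1;       /* parent's i_generation               */

	for (i = 0; i < 5; i++)
		printf("fh32[%d] = 0x%08x\n", i, (unsigned)fh32[i]);
	return 0;
}
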
+ */ + if (connectable && (len < 5)) { + *max_len = 5; + return 255; + } else if (len < 3) { + *max_len = 3; + return 255; + } + + len = 3; + fh32[0] = ei->i_iget5_block; + fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */ + fh32[2] = inode->i_generation; + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + struct iso_inode_info *eparent; + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + eparent = ISOFS_I(parent); + fh32[3] = eparent->i_iget5_block; + fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */ + fh32[4] = parent->i_generation; + spin_unlock(&dentry->d_lock); + len = 5; + type = 2; + } + *max_len = len; + return type; +} + +struct isofs_fid { + u32 block; + u16 offset; + u16 parent_offset; + u32 generation; + u32 parent_block; + u32 parent_generation; +}; + +static struct dentry *isofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct isofs_fid *ifid = (struct isofs_fid *)fid; + + if (fh_len < 3 || fh_type > 2) + return NULL; + + return isofs_export_iget(sb, ifid->block, ifid->offset, + ifid->generation); +} + +static struct dentry *isofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct isofs_fid *ifid = (struct isofs_fid *)fid; + + if (fh_type != 2) + return NULL; + + return isofs_export_iget(sb, + fh_len > 2 ? ifid->parent_block : 0, + ifid->parent_offset, + fh_len > 4 ? ifid->parent_generation : 0); +} + +const struct export_operations isofs_export_ops = { + .encode_fh = isofs_export_encode_fh, + .fh_to_dentry = isofs_fh_to_dentry, + .fh_to_parent = isofs_fh_to_parent, + .get_parent = isofs_export_get_parent, +}; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c new file mode 100644 index 00000000..b3cc8586 --- /dev/null +++ b/fs/isofs/inode.c @@ -0,0 +1,1579 @@ +/* + * linux/fs/isofs/inode.c + * + * (C) 1991 Linus Torvalds - minix filesystem + * 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. + * 1994 Eberhard Mönkeberg - multi session handling. + * 1995 Mark Dobie - allow mounting of some weird VideoCDs and PhotoCDs. 
+ * 1997 Gordon Chaffee - Joliet CDs + * 1998 Eric Lammerts - ISO 9660 Level 3 + * 2004 Paul Serice - Inode Support pushed out from 4GB to 128GB + * 2004 Paul Serice - NFS Export Operations + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "isofs.h" +#include "zisofs.h" + +#define BEQUIET + +static int isofs_hashi(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_dentry_cmpi(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); + +#ifdef CONFIG_JOLIET +static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_dentry_cmpi_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +#endif + +static void isofs_put_super(struct super_block *sb) +{ + struct isofs_sb_info *sbi = ISOFS_SB(sb); + +#ifdef CONFIG_JOLIET + unload_nls(sbi->s_nls_iocharset); +#endif + + kfree(sbi); + sb->s_fs_info = NULL; + return; +} + +static int isofs_read_inode(struct inode *); +static int isofs_statfs (struct dentry *, struct kstatfs *); + +static struct kmem_cache *isofs_inode_cachep; + +static struct inode *isofs_alloc_inode(struct super_block *sb) +{ + struct iso_inode_info *ei; + ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void isofs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); +} + +static void isofs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, isofs_i_callback); +} + +static void init_once(void *foo) +{ + struct iso_inode_info *ei = foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", + sizeof(struct iso_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (isofs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(isofs_inode_cachep); +} + +static int isofs_remount(struct super_block *sb, int *flags, char *data) +{ + /* we probably want a lot more here */ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations isofs_sops = { + .alloc_inode = isofs_alloc_inode, + .destroy_inode = isofs_destroy_inode, + .put_super = isofs_put_super, + .statfs = isofs_statfs, + .remount_fs = isofs_remount, + .show_options = generic_show_options, +}; + + +static const struct dentry_operations isofs_dentry_ops[] = { + { + .d_hash = isofs_hash, + .d_compare = 
isofs_dentry_cmp, + }, + { + .d_hash = isofs_hashi, + .d_compare = isofs_dentry_cmpi, + }, +#ifdef CONFIG_JOLIET + { + .d_hash = isofs_hash_ms, + .d_compare = isofs_dentry_cmp_ms, + }, + { + .d_hash = isofs_hashi_ms, + .d_compare = isofs_dentry_cmpi_ms, + }, +#endif +}; + +struct iso9660_options{ + unsigned int rock:1; + unsigned int joliet:1; + unsigned int cruft:1; + unsigned int hide:1; + unsigned int showassoc:1; + unsigned int nocompress:1; + unsigned int overriderockperm:1; + unsigned int uid_set:1; + unsigned int gid_set:1; + unsigned int utf8:1; + unsigned char map; + unsigned char check; + unsigned int blocksize; + mode_t fmode; + mode_t dmode; + gid_t gid; + uid_t uid; + char *iocharset; + /* LVE */ + s32 session; + s32 sbsector; +}; + +/* + * Compute the hash for the isofs name corresponding to the dentry. + */ +static int +isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) +{ + const char *name; + int len; + + len = qstr->len; + name = qstr->name; + if (ms) { + while (len && name[len-1] == '.') + len--; + } + + qstr->hash = full_name_hash(name, len); + + return 0; +} + +/* + * Compute the hash for the isofs name corresponding to the dentry. + */ +static int +isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) +{ + const char *name; + int len; + char c; + unsigned long hash; + + len = qstr->len; + name = qstr->name; + if (ms) { + while (len && name[len-1] == '.') + len--; + } + + hash = init_name_hash(); + while (len--) { + c = tolower(*name++); + hash = partial_name_hash(c, hash); + } + qstr->hash = end_name_hash(hash); + + return 0; +} + +/* + * Compare of two isofs names. + */ +static int isofs_dentry_cmp_common( + unsigned int len, const char *str, + const struct qstr *name, int ms, int ci) +{ + int alen, blen; + + /* A filename cannot end in '.' 
or we treat it like it has none */ + alen = name->len; + blen = len; + if (ms) { + while (alen && name->name[alen-1] == '.') + alen--; + while (blen && str[blen-1] == '.') + blen--; + } + if (alen == blen) { + if (ci) { + if (strnicmp(name->name, str, alen) == 0) + return 0; + } else { + if (strncmp(name->name, str, alen) == 0) + return 0; + } + } + return 1; +} + +static int +isofs_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return isofs_hash_common(dentry, qstr, 0); +} + +static int +isofs_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return isofs_hashi_common(dentry, qstr, 0); +} + +static int +isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 0, 0); +} + +static int +isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 0, 1); +} + +#ifdef CONFIG_JOLIET +static int +isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return isofs_hash_common(dentry, qstr, 1); +} + +static int +isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + return isofs_hashi_common(dentry, qstr, 1); +} + +static int +isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 1, 0); +} + +static int +isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + return isofs_dentry_cmp_common(len, str, name, 1, 1); +} +#endif + +enum { + Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore, + Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet, + Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, + Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm, +}; + +static const match_table_t tokens = { + {Opt_norock, "norock"}, + {Opt_nojoliet, "nojoliet"}, + {Opt_unhide, "unhide"}, + {Opt_hide, "hide"}, + {Opt_showassoc, "showassoc"}, + {Opt_cruft, "cruft"}, + {Opt_utf8, "utf8"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_map_a, "map=acorn"}, + {Opt_map_a, "map=a"}, + {Opt_map_n, "map=normal"}, + {Opt_map_n, "map=n"}, + {Opt_map_o, "map=off"}, + {Opt_map_o, "map=o"}, + {Opt_session, "session=%u"}, + {Opt_sb, "sbsector=%u"}, + {Opt_check_r, "check=relaxed"}, + {Opt_check_r, "check=r"}, + {Opt_check_s, "check=strict"}, + {Opt_check_s, "check=s"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%u"}, + {Opt_dmode, "dmode=%u"}, + {Opt_overriderockperm, "overriderockperm"}, + {Opt_block, "block=%u"}, + {Opt_ignore, "conv=binary"}, + {Opt_ignore, "conv=b"}, + {Opt_ignore, "conv=text"}, + {Opt_ignore, "conv=t"}, + {Opt_ignore, "conv=mtext"}, + {Opt_ignore, "conv=m"}, + {Opt_ignore, "conv=auto"}, + {Opt_ignore, "conv=a"}, + {Opt_nocompress, "nocompress"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct iso9660_options *popt) +{ + char *p; 
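	/*
	 * For illustration only (not part of the original patch): a mount
	 * command that exercises several of the options recognized above
	 * might look like
	 *
	 *	mount -t iso9660 -o norock,map=normal,check=relaxed,uid=1000 \
	 *		/dev/sr0 /mnt/cdrom
	 *
	 * Note that session numbers above 99 and block sizes other than
	 * 512, 1024 and 2048 are rejected by the checks below.
	 */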
+ int option; + + popt->map = 'n'; + popt->rock = 1; + popt->joliet = 1; + popt->cruft = 0; + popt->hide = 0; + popt->showassoc = 0; + popt->check = 'u'; /* unset */ + popt->nocompress = 0; + popt->blocksize = 1024; + popt->fmode = popt->dmode = ISOFS_INVALID_MODE; + popt->uid_set = 0; + popt->gid_set = 0; + popt->gid = 0; + popt->uid = 0; + popt->iocharset = NULL; + popt->utf8 = 0; + popt->overriderockperm = 0; + popt->session=-1; + popt->sbsector=-1; + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + unsigned n; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_norock: + popt->rock = 0; + break; + case Opt_nojoliet: + popt->joliet = 0; + break; + case Opt_hide: + popt->hide = 1; + break; + case Opt_unhide: + case Opt_showassoc: + popt->showassoc = 1; + break; + case Opt_cruft: + popt->cruft = 1; + break; + case Opt_utf8: + popt->utf8 = 1; + break; +#ifdef CONFIG_JOLIET + case Opt_iocharset: + popt->iocharset = match_strdup(&args[0]); + break; +#endif + case Opt_map_a: + popt->map = 'a'; + break; + case Opt_map_o: + popt->map = 'o'; + break; + case Opt_map_n: + popt->map = 'n'; + break; + case Opt_session: + if (match_int(&args[0], &option)) + return 0; + n = option; + if (n > 99) + return 0; + popt->session = n + 1; + break; + case Opt_sb: + if (match_int(&args[0], &option)) + return 0; + popt->sbsector = option; + break; + case Opt_check_r: + popt->check = 'r'; + break; + case Opt_check_s: + popt->check = 's'; + break; + case Opt_ignore: + break; + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + popt->uid = option; + popt->uid_set = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + popt->gid = option; + popt->gid_set = 1; + break; + case Opt_mode: + if (match_int(&args[0], &option)) + return 0; + popt->fmode = option; + break; + case Opt_dmode: + if (match_int(&args[0], &option)) + return 0; + popt->dmode = option; + break; + case Opt_overriderockperm: + popt->overriderockperm = 1; + break; + case Opt_block: + if (match_int(&args[0], &option)) + return 0; + n = option; + if (n != 512 && n != 1024 && n != 2048) + return 0; + popt->blocksize = n; + break; + case Opt_nocompress: + popt->nocompress = 1; + break; + default: + return 0; + } + } + return 1; +} + +/* + * look if the driver can tell the multi session redirection value + * + * don't change this if you don't know what you do, please! + * Multisession is legal only with XA disks. + * A non-XA disk with more than one volume descriptor may do it right, but + * usually is written in a nowhere standardized "multi-partition" manner. + * Multisession uses absolute addressing (solely the first frame of the whole + * track is #0), multi-partition uses relative addressing (each first frame of + * each track is #0), and a track is not a session. + * + * A broken CDwriter software or drive firmware does not set new standards, + * at least not if conflicting with the existing ones. 
+ * + * emoenke@gwdg.de + */ +#define WE_OBEY_THE_WRITTEN_STANDARDS 1 + +static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) +{ + struct cdrom_multisession ms_info; + unsigned int vol_desc_start; + struct block_device *bdev = sb->s_bdev; + int i; + + vol_desc_start=0; + ms_info.addr_format=CDROM_LBA; + if(session >= 0 && session <= 99) { + struct cdrom_tocentry Te; + Te.cdte_track=session; + Te.cdte_format=CDROM_LBA; + i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te); + if (!i) { + printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n", + session, Te.cdte_addr.lba, + Te.cdte_ctrl&CDROM_DATA_TRACK); + if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4) + return Te.cdte_addr.lba; + } + + printk(KERN_ERR "ISOFS: Invalid session number or type of track\n"); + } + i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); + if (session > 0) + printk(KERN_ERR "ISOFS: Invalid session number\n"); +#if 0 + printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i); + if (i==0) { + printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); + printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); + } +#endif + if (i==0) +#if WE_OBEY_THE_WRITTEN_STANDARDS + if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ +#endif + vol_desc_start=ms_info.addr.lba; + return vol_desc_start; +} + +/* + * Check if root directory is empty (has less than 3 files). + * + * Used to detect broken CDs where ISO root directory is empty but Joliet root + * directory is OK. If such CD has Rock Ridge extensions, they will be disabled + * (and Joliet used instead) or else no files would be visible. + */ +static bool rootdir_empty(struct super_block *sb, unsigned long block) +{ + int offset = 0, files = 0, de_len; + struct iso_directory_record *de; + struct buffer_head *bh; + + bh = sb_bread(sb, block); + if (!bh) + return true; + while (files < 3) { + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + if (de_len == 0) + break; + files++; + offset += de_len; + } + brelse(bh); + return files < 3; +} + +/* + * Initialize the superblock and read the root inode. + * + * Note: a check_disk_change() has been done immediately prior + * to this call, so we don't need to check again. + */ +static int isofs_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh = NULL, *pri_bh = NULL; + struct hs_primary_descriptor *h_pri = NULL; + struct iso_primary_descriptor *pri = NULL; + struct iso_supplementary_descriptor *sec = NULL; + struct iso_directory_record *rootp; + struct inode *inode; + struct iso9660_options opt; + struct isofs_sb_info *sbi; + unsigned long first_data_zone; + int joliet_level = 0; + int iso_blknum, block; + int orig_zonesize; + int table, error = -EINVAL; + unsigned int vol_desc_start; + + save_mount_options(s, data); + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + + if (!parse_options((char *)data, &opt)) + goto out_freesbi; + + /* + * First of all, get the hardware blocksize for this device. + * If we don't know what it is, or the hardware blocksize is + * larger than the blocksize the user specified, then use + * that value. + */ + /* + * What if bugger tells us to go beyond page size? + */ + opt.blocksize = sb_min_blocksize(s, opt.blocksize); + + sbi->s_high_sierra = 0; /* default is iso9660 */ + + vol_desc_start = (opt.sbsector != -1) ? 
+ opt.sbsector : isofs_get_last_session(s,opt.session); + + for (iso_blknum = vol_desc_start+16; + iso_blknum < vol_desc_start+100; iso_blknum++) { + struct hs_volume_descriptor *hdp; + struct iso_volume_descriptor *vdp; + + block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); + if (!(bh = sb_bread(s, block))) + goto out_no_read; + + vdp = (struct iso_volume_descriptor *)bh->b_data; + hdp = (struct hs_volume_descriptor *)bh->b_data; + + /* + * Due to the overlapping physical location of the descriptors, + * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure + * proper identification in this case, we first check for ISO. + */ + if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { + if (isonum_711(vdp->type) == ISO_VD_END) + break; + if (isonum_711(vdp->type) == ISO_VD_PRIMARY) { + if (pri == NULL) { + pri = (struct iso_primary_descriptor *)vdp; + /* Save the buffer in case we need it ... */ + pri_bh = bh; + bh = NULL; + } + } +#ifdef CONFIG_JOLIET + else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { + sec = (struct iso_supplementary_descriptor *)vdp; + if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { + if (opt.joliet) { + if (sec->escape[2] == 0x40) + joliet_level = 1; + else if (sec->escape[2] == 0x43) + joliet_level = 2; + else if (sec->escape[2] == 0x45) + joliet_level = 3; + + printk(KERN_DEBUG "ISO 9660 Extensions: " + "Microsoft Joliet Level %d\n", + joliet_level); + } + goto root_found; + } else { + /* Unknown supplementary volume descriptor */ + sec = NULL; + } + } +#endif + } else { + if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { + if (isonum_711(hdp->type) != ISO_VD_PRIMARY) + goto out_freebh; + + sbi->s_high_sierra = 1; + opt.rock = 0; + h_pri = (struct hs_primary_descriptor *)vdp; + goto root_found; + } + } + + /* Just skip any volume descriptors we don't recognize */ + + brelse(bh); + bh = NULL; + } + /* + * If we fall through, either no volume descriptor was found, + * or else we passed a primary descriptor looking for others. + */ + if (!pri) + goto out_unknown_format; + brelse(bh); + bh = pri_bh; + pri_bh = NULL; + +root_found: + + if (joliet_level && (pri == NULL || !opt.rock)) { + /* This is the case of Joliet with the norock mount flag. + * A disc with both Joliet and Rock Ridge is handled later + */ + pri = (struct iso_primary_descriptor *) sec; + } + + if(sbi->s_high_sierra){ + rootp = (struct iso_directory_record *) h_pri->root_directory_record; + sbi->s_nzones = isonum_733(h_pri->volume_space_size); + sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size); + sbi->s_max_size = isonum_733(h_pri->volume_space_size); + } else { + if (!pri) + goto out_freebh; + rootp = (struct iso_directory_record *) pri->root_directory_record; + sbi->s_nzones = isonum_733(pri->volume_space_size); + sbi->s_log_zone_size = isonum_723(pri->logical_block_size); + sbi->s_max_size = isonum_733(pri->volume_space_size); + } + + sbi->s_ninodes = 0; /* No way to figure this out easily */ + + orig_zonesize = sbi->s_log_zone_size; + /* + * If the zone size is smaller than the hardware sector size, + * this is a fatal error. This would occur if the disc drive + * had sectors that were 2048 bytes, but the filesystem had + * blocks that were 512 bytes (which should only very rarely + * happen.) 
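
As a quick check of the block arithmetic used in the descriptor scan above (illustrative numbers only): with 2048-byte ISO 9660 logical blocks on a superblock opened with 512-byte blocks, logical block 16, the first possible volume descriptor, lands at device block 64.

#include <stdio.h>

int main(void)
{
	unsigned iso_block_bits = 11;     /* ISOFS_BLOCK_BITS: 2048-byte blocks */
	unsigned sb_blocksize_bits = 9;   /* device opened with 512-byte blocks */
	unsigned long iso_blknum = 16;    /* first volume descriptor            */
	unsigned long block = iso_blknum << (iso_block_bits - sb_blocksize_bits);

	printf("ISO block %lu -> device block %lu\n", iso_blknum, block);
	return 0;
}
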
+ */ + if (orig_zonesize < opt.blocksize) + goto out_bad_size; + + /* RDE: convert log zone size to bit shift */ + switch (sbi->s_log_zone_size) { + case 512: sbi->s_log_zone_size = 9; break; + case 1024: sbi->s_log_zone_size = 10; break; + case 2048: sbi->s_log_zone_size = 11; break; + + default: + goto out_bad_zone_size; + } + + s->s_magic = ISOFS_SUPER_MAGIC; + + /* + * With multi-extent files, file size is only limited by the maximum + * size of a file system, which is 8 TB. + */ + s->s_maxbytes = 0x80000000000LL; + + /* + * The CDROM is read-only, has no nodes (devices) on it, and since + * all of the files appear to be owned by root, we really do not want + * to allow suid. (suid or devices will not show up unless we have + * Rock Ridge extensions) + */ + + s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */; + + /* Set this for reference. Its not currently used except on write + which we don't have .. */ + + first_data_zone = isonum_733(rootp->extent) + + isonum_711(rootp->ext_attr_length); + sbi->s_firstdatazone = first_data_zone; +#ifndef BEQUIET + printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n", + sbi->s_max_size, 1UL << sbi->s_log_zone_size); + printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone); + if(sbi->s_high_sierra) + printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n"); +#endif + + /* + * If the Joliet level is set, we _may_ decide to use the + * secondary descriptor, but can't be sure until after we + * read the root inode. But before reading the root inode + * we may need to change the device blocksize, and would + * rather release the old buffer first. So, we cache the + * first_data_zone value from the secondary descriptor. + */ + if (joliet_level) { + pri = (struct iso_primary_descriptor *) sec; + rootp = (struct iso_directory_record *) + pri->root_directory_record; + first_data_zone = isonum_733(rootp->extent) + + isonum_711(rootp->ext_attr_length); + } + + /* + * We're all done using the volume descriptor, and may need + * to change the device blocksize, so release the buffer now. + */ + brelse(pri_bh); + brelse(bh); + + /* + * Force the blocksize to 512 for 512 byte sectors. The file + * read primitives really get it wrong in a bad way if we don't + * do this. + * + * Note - we should never be setting the blocksize to something + * less than the hardware sector size for the device. If we + * do, we would end up having to read larger buffers and split + * out portions to satisfy requests. + * + * Note2- the idea here is that we want to deal with the optimal + * zonesize in the filesystem. If we have it set to something less, + * then we have horrible problems with trying to piece together + * bits of adjacent blocks in order to properly read directory + * entries. By forcing the blocksize in this way, we ensure + * that we will never be required to do this. + */ + sb_set_blocksize(s, orig_zonesize); + + sbi->s_nls_iocharset = NULL; + +#ifdef CONFIG_JOLIET + if (joliet_level && opt.utf8 == 0) { + char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; + sbi->s_nls_iocharset = load_nls(p); + if (! sbi->s_nls_iocharset) { + /* Fail only if explicit charset specified */ + if (opt.iocharset) + goto out_freesbi; + sbi->s_nls_iocharset = load_nls_default(); + } + } +#endif + s->s_op = &isofs_sops; + s->s_export_op = &isofs_export_ops; + sbi->s_mapping = opt.map; + sbi->s_rock = (opt.rock ? 
2 : 0); + sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ + sbi->s_cruft = opt.cruft; + sbi->s_hide = opt.hide; + sbi->s_showassoc = opt.showassoc; + sbi->s_uid = opt.uid; + sbi->s_gid = opt.gid; + sbi->s_uid_set = opt.uid_set; + sbi->s_gid_set = opt.gid_set; + sbi->s_utf8 = opt.utf8; + sbi->s_nocompress = opt.nocompress; + sbi->s_overriderockperm = opt.overriderockperm; + mutex_init(&sbi->s_mutex); + /* + * It would be incredibly stupid to allow people to mark every file + * on the disk as suid, so we merely allow them to set the default + * permissions. + */ + if (opt.fmode != ISOFS_INVALID_MODE) + sbi->s_fmode = opt.fmode & 0777; + else + sbi->s_fmode = ISOFS_INVALID_MODE; + if (opt.dmode != ISOFS_INVALID_MODE) + sbi->s_dmode = opt.dmode & 0777; + else + sbi->s_dmode = ISOFS_INVALID_MODE; + + /* + * Read the root inode, which _may_ result in changing + * the s_rock flag. Once we have the final s_rock value, + * we then decide whether to use the Joliet descriptor. + */ + inode = isofs_iget(s, sbi->s_firstdatazone, 0); + if (IS_ERR(inode)) + goto out_no_root; + + /* + * Fix for broken CDs with Rock Ridge and empty ISO root directory but + * correct Joliet root directory. + */ + if (sbi->s_rock == 1 && joliet_level && + rootdir_empty(s, sbi->s_firstdatazone)) { + printk(KERN_NOTICE + "ISOFS: primary root directory is empty. " + "Disabling Rock Ridge and switching to Joliet."); + sbi->s_rock = 0; + } + + /* + * If this disk has both Rock Ridge and Joliet on it, then we + * want to use Rock Ridge by default. This can be overridden + * by using the norock mount option. There is still one other + * possibility that is not taken into account: a Rock Ridge + * CD with Unicode names. Until someone sees such a beast, it + * will not be supported. + */ + if (sbi->s_rock == 1) { + joliet_level = 0; + } else if (joliet_level) { + sbi->s_rock = 0; + if (sbi->s_firstdatazone != first_data_zone) { + sbi->s_firstdatazone = first_data_zone; + printk(KERN_DEBUG + "ISOFS: changing to secondary root\n"); + iput(inode); + inode = isofs_iget(s, sbi->s_firstdatazone, 0); + if (IS_ERR(inode)) + goto out_no_root; + } + } + + if (opt.check == 'u') { + /* Only Joliet is case insensitive by default */ + if (joliet_level) + opt.check = 'r'; + else + opt.check = 's'; + } + sbi->s_joliet_level = joliet_level; + + /* Make sure the root inode is a directory */ + if (!S_ISDIR(inode->i_mode)) { + printk(KERN_WARNING + "isofs_fill_super: root inode is not a directory. " + "Corrupted media?\n"); + goto out_iput; + } + + table = 0; + if (joliet_level) + table += 2; + if (opt.check == 'r') + table++; + + s->s_d_op = &isofs_dentry_ops[table]; + + /* get the root dentry */ + s->s_root = d_alloc_root(inode); + if (!(s->s_root)) + goto out_no_root; + + kfree(opt.iocharset); + + return 0; + + /* + * Display error messages and free resources. 
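
For reference, the table index computed above selects among the four dentry_operations variants declared earlier; a standalone restatement with a hypothetical mount result:

#include <stdio.h>

int main(void)
{
	int joliet_level = 3;    /* hypothetical: Joliet level 3 disc, no Rock Ridge    */
	char check = 'r';        /* relaxed matching, the default when Joliet is in use */
	int table = 0;

	if (joliet_level)
		table += 2;      /* the *_ms variants, which ignore trailing dots */
	if (check == 'r')
		table++;         /* the case-insensitive hashi/cmpi variants      */

	printf("using isofs_dentry_ops[%d]\n", table);   /* prints 3 */
	return 0;
}
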
+ */ +out_iput: + iput(inode); + goto out_no_inode; +out_no_root: + error = PTR_ERR(inode); + if (error != -ENOMEM) + printk(KERN_WARNING "%s: get root inode failed\n", __func__); +out_no_inode: +#ifdef CONFIG_JOLIET + unload_nls(sbi->s_nls_iocharset); +#endif + goto out_freesbi; +out_no_read: + printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", + __func__, s->s_id, iso_blknum, block); + goto out_freebh; +out_bad_zone_size: + printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", + sbi->s_log_zone_size); + goto out_freebh; +out_bad_size: + printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n", + orig_zonesize, opt.blocksize); + goto out_freebh; +out_unknown_format: + if (!silent) + printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n"); + +out_freebh: + brelse(bh); + brelse(pri_bh); +out_freesbi: + kfree(opt.iocharset); + kfree(sbi); + s->s_fs_info = NULL; + return error; +} + +static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ISOFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (ISOFS_SB(sb)->s_nzones + << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = ISOFS_SB(sb)->s_ninodes; + buf->f_ffree = 0; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_namelen = NAME_MAX; + return 0; +} + +/* + * Get a set of blocks; filling in buffer_heads if already allocated + * or getblk() if they are not. Returns the number of blocks inserted + * (-ve == error.) + */ +int isofs_get_blocks(struct inode *inode, sector_t iblock, + struct buffer_head **bh, unsigned long nblocks) +{ + unsigned long b_off = iblock; + unsigned offset, sect_size; + unsigned int firstext; + unsigned long nextblk, nextoff; + int section, rv, error; + struct iso_inode_info *ei = ISOFS_I(inode); + + error = -EIO; + rv = 0; + if (iblock != b_off) { + printk(KERN_DEBUG "%s: block number too large\n", __func__); + goto abort; + } + + + offset = 0; + firstext = ei->i_first_extent; + sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); + nextblk = ei->i_next_section_block; + nextoff = ei->i_next_section_offset; + section = 0; + + while (nblocks) { + /* If we are *way* beyond the end of the file, print a message. + * Access beyond the end of the file up to the next page boundary + * is normal, however because of the way the page cache works. + * In this case, we just return 0 so that we can properly fill + * the page with useless information without generating any + * I/O errors. + */ + if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { + printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", + __func__, b_off, + (unsigned long long)inode->i_size); + goto abort; + } + + /* On the last section, nextblk == 0, section size is likely to + * exceed sect_size by a partial block, and access beyond the + * end of the file will reach beyond the section size, too. 
+ */ + while (nextblk && (b_off >= (offset + sect_size))) { + struct inode *ninode; + + offset += sect_size; + ninode = isofs_iget(inode->i_sb, nextblk, nextoff); + if (IS_ERR(ninode)) { + error = PTR_ERR(ninode); + goto abort; + } + firstext = ISOFS_I(ninode)->i_first_extent; + sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode); + nextblk = ISOFS_I(ninode)->i_next_section_block; + nextoff = ISOFS_I(ninode)->i_next_section_offset; + iput(ninode); + + if (++section > 100) { + printk(KERN_DEBUG "%s: More than 100 file sections ?!?" + " aborting...\n", __func__); + printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " + "nextblk=%lu nextoff=%lu\n", __func__, + b_off, firstext, (unsigned) sect_size, + nextblk, nextoff); + goto abort; + } + } + + if (*bh) { + map_bh(*bh, inode->i_sb, firstext + b_off - offset); + } else { + *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); + if (!*bh) + goto abort; + } + bh++; /* Next buffer head */ + b_off++; /* Next buffer offset */ + nblocks--; + rv++; + } + + error = 0; +abort: + return rv != 0 ? rv : error; +} + +/* + * Used by the standard interfaces. + */ +static int isofs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + + if (create) { + printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__); + return -EROFS; + } + + ret = isofs_get_blocks(inode, iblock, &bh_result, 1); + return ret < 0 ? ret : 0; +} + +static int isofs_bmap(struct inode *inode, sector_t block) +{ + struct buffer_head dummy; + int error; + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + error = isofs_get_block(inode, block, &dummy, 0); + if (!error) + return dummy.b_blocknr; + return 0; +} + +struct buffer_head *isofs_bread(struct inode *inode, sector_t block) +{ + sector_t blknr = isofs_bmap(inode, block); + if (!blknr) + return NULL; + return sb_bread(inode->i_sb, blknr); +} + +static int isofs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,isofs_get_block); +} + +static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,isofs_get_block); +} + +static const struct address_space_operations isofs_aops = { + .readpage = isofs_readpage, + .bmap = _isofs_bmap +}; + +static int isofs_read_level3_size(struct inode *inode) +{ + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; + struct buffer_head *bh = NULL; + unsigned long block, offset, block_saved, offset_saved; + int i = 0; + int more_entries = 0; + struct iso_directory_record *tmpde = NULL; + struct iso_inode_info *ei = ISOFS_I(inode); + + inode->i_size = 0; + + /* The first 16 blocks are reserved as the System Area. Thus, + * no inodes can appear in block 0. We use this to flag that + * this is the last section. 
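
The section walk above can be sketched in userspace with two hypothetical sections of a multi-extent file: the loop advances offset section by section until the requested file block falls inside the current section, then maps it to firstext + b_off - offset.

#include <stdio.h>

int main(void)
{
	/* Two hypothetical sections, in device blocks. */
	struct { unsigned long firstext, sect_size; } sec[] = {
		{ 5000, 512 },    /* section 0: file blocks 0..511   */
		{ 9000, 300 },    /* section 1: file blocks 512..811 */
	};
	unsigned long b_off = 600;            /* file block being mapped */
	unsigned long offset = 0;
	int s = 0;

	while (b_off >= offset + sec[s].sect_size) {   /* same walk as above */
		offset += sec[s].sect_size;
		s++;
	}
	printf("file block %lu -> device block %lu (section %d)\n",
	       b_off, sec[s].firstext + b_off - offset, s);   /* 9088, section 1 */
	return 0;
}
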
*/ + ei->i_next_section_block = 0; + ei->i_next_section_offset = 0; + + block = ei->i_iget5_block; + offset = ei->i_iget5_offset; + + do { + struct iso_directory_record *de; + unsigned int de_len; + + if (!bh) { + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_noread; + } + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + + if (de_len == 0) { + brelse(bh); + bh = NULL; + ++block; + offset = 0; + continue; + } + + block_saved = block; + offset_saved = offset; + offset += de_len; + + /* Make sure we have a full directory entry */ + if (offset >= bufsize) { + int slop = bufsize - offset + de_len; + if (!tmpde) { + tmpde = kmalloc(256, GFP_KERNEL); + if (!tmpde) + goto out_nomem; + } + memcpy(tmpde, de, slop); + offset &= bufsize - 1; + block++; + brelse(bh); + bh = NULL; + if (offset) { + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_noread; + memcpy((void *)tmpde+slop, bh->b_data, offset); + } + de = tmpde; + } + + inode->i_size += isonum_733(de->size); + if (i == 1) { + ei->i_next_section_block = block_saved; + ei->i_next_section_offset = offset_saved; + } + + more_entries = de->flags[-high_sierra] & 0x80; + + i++; + if (i > 100) + goto out_toomany; + } while (more_entries); +out: + kfree(tmpde); + if (bh) + brelse(bh); + return 0; + +out_nomem: + if (bh) + brelse(bh); + return -ENOMEM; + +out_noread: + printk(KERN_INFO "ISOFS: unable to read i-node block %lu\n", block); + kfree(tmpde); + return -EIO; + +out_toomany: + printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" + "isofs_read_level3_size: inode=%lu\n", + __func__, inode->i_ino); + goto out; +} + +static int isofs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct isofs_sb_info *sbi = ISOFS_SB(sb); + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned long block; + int high_sierra = sbi->s_high_sierra; + struct buffer_head *bh = NULL; + struct iso_directory_record *de; + struct iso_directory_record *tmpde = NULL; + unsigned int de_len; + unsigned long offset; + struct iso_inode_info *ei = ISOFS_I(inode); + int ret = -EIO; + + block = ei->i_iget5_block; + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_badread; + + offset = ei->i_iget5_offset; + + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + + if (offset + de_len > bufsize) { + int frag1 = bufsize - offset; + + tmpde = kmalloc(de_len, GFP_KERNEL); + if (tmpde == NULL) { + printk(KERN_INFO "%s: out of memory\n", __func__); + ret = -ENOMEM; + goto fail; + } + memcpy(tmpde, bh->b_data + offset, frag1); + brelse(bh); + bh = sb_bread(inode->i_sb, ++block); + if (!bh) + goto out_badread; + memcpy((char *)tmpde+frag1, bh->b_data, de_len - frag1); + de = tmpde; + } + + inode->i_ino = isofs_get_ino(ei->i_iget5_block, + ei->i_iget5_offset, + ISOFS_BUFFER_BITS(inode)); + + /* Assume it is a normal-format file unless told otherwise */ + ei->i_file_format = isofs_file_normal; + + if (de->flags[-high_sierra] & 2) { + if (sbi->s_dmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFDIR | sbi->s_dmode; + else + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_nlink = 1; /* + * Set to 1. We know there are 2, but + * the find utility tries to optimize + * if it is 2, and it screws up. It is + * easier to give 1 which tells find to + * do it the hard way. + */ + } else { + if (sbi->s_fmode != ISOFS_INVALID_MODE) { + inode->i_mode = S_IFREG | sbi->s_fmode; + } else { + /* + * Set default permissions: r-x for all. 
The disc + * could be shared with DOS machines so virtually + * anything could be a valid executable. + */ + inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; + } + inode->i_nlink = 1; + } + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + inode->i_blocks = 0; + + ei->i_format_parm[0] = 0; + ei->i_format_parm[1] = 0; + ei->i_format_parm[2] = 0; + + ei->i_section_size = isonum_733(de->size); + if (de->flags[-high_sierra] & 0x80) { + ret = isofs_read_level3_size(inode); + if (ret < 0) + goto fail; + ret = -EIO; + } else { + ei->i_next_section_block = 0; + ei->i_next_section_offset = 0; + inode->i_size = isonum_733(de->size); + } + + /* + * Some dipshit decided to store some other bit of information + * in the high byte of the file length. Truncate size in case + * this CDROM was mounted with the cruft option. + */ + + if (sbi->s_cruft) + inode->i_size &= 0x00ffffff; + + if (de->interleave[0]) { + printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n"); + inode->i_size = 0; + } + + /* I have no idea what file_unit_size is used for, so + we will flag it for now */ + if (de->file_unit_size[0] != 0) { + printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n", + inode->i_ino); + } + + /* I have no idea what other flag bits are used for, so + we will flag it for now */ +#ifdef DEBUG + if((de->flags[-high_sierra] & ~2)!= 0){ + printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file " + "(%ld %x).\n", + inode->i_ino, de->flags[-high_sierra]); + } +#endif + + inode->i_mtime.tv_sec = + inode->i_atime.tv_sec = + inode->i_ctime.tv_sec = iso_date(de->date, high_sierra); + inode->i_mtime.tv_nsec = + inode->i_atime.tv_nsec = + inode->i_ctime.tv_nsec = 0; + + ei->i_first_extent = (isonum_733(de->extent) + + isonum_711(de->ext_attr_length)); + + /* Set the number of blocks for stat() - should be done before RR */ + inode->i_blocks = (inode->i_size + 511) >> 9; + + /* + * Now test for possible Rock Ridge extensions which will override + * some of these numbers in the inode structure. + */ + + if (!high_sierra) { + parse_rock_ridge_inode(de, inode); + /* if we want uid/gid set, override the rock ridge setting */ + if (sbi->s_uid_set) + inode->i_uid = sbi->s_uid; + if (sbi->s_gid_set) + inode->i_gid = sbi->s_gid; + } + /* Now set final access rights if overriding rock ridge setting */ + if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm && + sbi->s_dmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFDIR | sbi->s_dmode; + if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm && + sbi->s_fmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFREG | sbi->s_fmode; + + /* Install the inode operations vector */ + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + switch (ei->i_file_format) { +#ifdef CONFIG_ZISOFS + case isofs_file_compressed: + inode->i_data.a_ops = &zisofs_aops; + break; +#endif + default: + inode->i_data.a_ops = &isofs_aops; + break; + } + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &isofs_dir_inode_operations; + inode->i_fop = &isofs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &isofs_symlink_aops; + } else + /* XXX - parse_rock_ridge_inode() had already set i_rdev. 
*/ + init_special_inode(inode, inode->i_mode, inode->i_rdev); + + ret = 0; +out: + kfree(tmpde); + if (bh) + brelse(bh); + return ret; + +out_badread: + printk(KERN_WARNING "ISOFS: unable to read i-node block\n"); +fail: + goto out; +} + +struct isofs_iget5_callback_data { + unsigned long block; + unsigned long offset; +}; + +static int isofs_iget5_test(struct inode *ino, void *data) +{ + struct iso_inode_info *i = ISOFS_I(ino); + struct isofs_iget5_callback_data *d = + (struct isofs_iget5_callback_data*)data; + return (i->i_iget5_block == d->block) + && (i->i_iget5_offset == d->offset); +} + +static int isofs_iget5_set(struct inode *ino, void *data) +{ + struct iso_inode_info *i = ISOFS_I(ino); + struct isofs_iget5_callback_data *d = + (struct isofs_iget5_callback_data*)data; + i->i_iget5_block = d->block; + i->i_iget5_offset = d->offset; + return 0; +} + +/* Store, in the inode's containing structure, the block and block + * offset that point to the underlying meta-data for the inode. The + * code below is otherwise similar to the iget() code in + * include/linux/fs.h */ +struct inode *isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset) +{ + unsigned long hashval; + struct inode *inode; + struct isofs_iget5_callback_data data; + long ret; + + if (offset >= 1ul << sb->s_blocksize_bits) + return ERR_PTR(-EINVAL); + + data.block = block; + data.offset = offset; + + hashval = (block << sb->s_blocksize_bits) | offset; + + inode = iget5_locked(sb, hashval, &isofs_iget5_test, + &isofs_iget5_set, &data); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + ret = isofs_read_inode(inode); + if (ret < 0) { + iget_failed(inode); + inode = ERR_PTR(ret); + } else { + unlock_new_inode(inode); + } + } + + return inode; +} + +static struct dentry *isofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); +} + +static struct file_system_type iso9660_fs_type = { + .owner = THIS_MODULE, + .name = "iso9660", + .mount = isofs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_iso9660_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out; +#ifdef CONFIG_ZISOFS + err = zisofs_init(); + if (err) + goto out1; +#endif + err = register_filesystem(&iso9660_fs_type); + if (err) + goto out2; + return 0; +out2: +#ifdef CONFIG_ZISOFS + zisofs_cleanup(); +out1: +#endif + destroy_inodecache(); +out: + return err; +} + +static void __exit exit_iso9660_fs(void) +{ + unregister_filesystem(&iso9660_fs_type); +#ifdef CONFIG_ZISOFS + zisofs_cleanup(); +#endif + destroy_inodecache(); +} + +module_init(init_iso9660_fs) +module_exit(exit_iso9660_fs) +MODULE_LICENSE("GPL"); +/* Actual filesystem name is iso9660, as requested in filesystems.c */ +MODULE_ALIAS("iso9660"); diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h new file mode 100644 index 00000000..2882dc08 --- /dev/null +++ b/fs/isofs/isofs.h @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include + +enum isofs_file_format { + isofs_file_normal = 0, + isofs_file_sparse = 1, + isofs_file_compressed = 2, +}; + +/* + * iso fs inode data in memory + */ +struct iso_inode_info { + unsigned long i_iget5_block; + unsigned long i_iget5_offset; + unsigned int i_first_extent; + unsigned char i_file_format; + unsigned char i_format_parm[3]; + unsigned long i_next_section_block; + unsigned long i_next_section_offset; + off_t i_section_size; + struct inode 
vfs_inode; +}; + +/* + * iso9660 super-block data in memory + */ +struct isofs_sb_info { + unsigned long s_ninodes; + unsigned long s_nzones; + unsigned long s_firstdatazone; + unsigned long s_log_zone_size; + unsigned long s_max_size; + + int s_rock_offset; /* offset of SUSP fields within SU area */ + unsigned char s_joliet_level; + unsigned char s_mapping; + unsigned int s_high_sierra:1; + unsigned int s_rock:2; + unsigned int s_utf8:1; + unsigned int s_cruft:1; /* Broken disks with high byte of length + * containing junk */ + unsigned int s_nocompress:1; + unsigned int s_hide:1; + unsigned int s_showassoc:1; + unsigned int s_overriderockperm:1; + unsigned int s_uid_set:1; + unsigned int s_gid_set:1; + + mode_t s_fmode; + mode_t s_dmode; + gid_t s_gid; + uid_t s_uid; + struct nls_table *s_nls_iocharset; /* Native language support table */ + struct mutex s_mutex; /* replaces BKL, please remove if possible */ +}; + +#define ISOFS_INVALID_MODE ((mode_t) -1) + +static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct iso_inode_info *ISOFS_I(struct inode *inode) +{ + return container_of(inode, struct iso_inode_info, vfs_inode); +} + +static inline int isonum_711(char *p) +{ + return *(u8 *)p; +} +static inline int isonum_712(char *p) +{ + return *(s8 *)p; +} +static inline unsigned int isonum_721(char *p) +{ + return get_unaligned_le16(p); +} +static inline unsigned int isonum_722(char *p) +{ + return get_unaligned_be16(p); +} +static inline unsigned int isonum_723(char *p) +{ + /* Ignore bigendian datum due to broken mastering programs */ + return get_unaligned_le16(p); +} +static inline unsigned int isonum_731(char *p) +{ + return get_unaligned_le32(p); +} +static inline unsigned int isonum_732(char *p) +{ + return get_unaligned_be32(p); +} +static inline unsigned int isonum_733(char *p) +{ + /* Ignore bigendian datum due to broken mastering programs */ + return get_unaligned_le32(p); +} +extern int iso_date(char *, int); + +struct inode; /* To make gcc happy */ + +extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *); +extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); +extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); + +int get_joliet_filename(struct iso_directory_record *, unsigned char *, struct inode *); +int get_acorn_filename(struct iso_directory_record *, char *, struct inode *); + +extern struct dentry *isofs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern struct buffer_head *isofs_bread(struct inode *, sector_t); +extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); + +extern struct inode *isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset); + +/* Because the inode number is no longer relevant to finding the + * underlying meta-data for an inode, we are free to choose a more + * convenient 32-bit number as the inode number. The inode numbering + * scheme was recommended by Sergey Vlasov and Eric Lammerts. */ +static inline unsigned long isofs_get_ino(unsigned long block, + unsigned long offset, + unsigned long bufbits) +{ + return (block << (bufbits - 5)) | (offset >> 5); +} + +/* Every directory can have many redundant directory entries scattered + * throughout the directory tree. First there is the directory entry + * with the name of the directory stored in the parent directory. + * Then, there is the "." 
directory entry stored in the directory + * itself. Finally, there are possibly many ".." directory entries + * stored in all the subdirectories. + * + * In order for the NFS get_parent() method to work and for the + * general consistency of the dcache, we need to make sure the + * "i_iget5_block" and "i_iget5_offset" all point to exactly one of + * the many redundant entries for each directory. We normalize the + * block and offset by always making them point to the "." directory. + * + * Notice that we do not use the entry for the directory with the name + * that is located in the parent directory. Even though choosing this + * first directory is more natural, it is much easier to find the "." + * entry in the NFS get_parent() method because it is implicitly + * encoded in the "extent + ext_attr_length" fields of _all_ the + * redundant entries for the directory. Thus, it can always be + * reached regardless of which directory entry you have in hand. + * + * This works because the "." entry is simply the first directory + * record when you start reading the file that holds all the directory + * records, and this file starts at "extent + ext_attr_length" blocks. + * Because the "." entry is always the first entry listed in the + * directories file, the normalized "offset" value is always 0. + * + * You should pass the directory entry in "de". On return, "block" + * and "offset" will hold normalized values. Only directories are + * affected making it safe to call even for non-directory file + * types. */ +static inline void +isofs_normalize_block_and_offset(struct iso_directory_record* de, + unsigned long *block, + unsigned long *offset) +{ + /* Only directories are normalized. */ + if (de->flags[0] & 2) { + *offset = 0; + *block = (unsigned long)isonum_733(de->extent) + + (unsigned long)isonum_711(de->ext_attr_length); + } +} + +extern const struct inode_operations isofs_dir_inode_operations; +extern const struct file_operations isofs_dir_operations; +extern const struct address_space_operations isofs_symlink_aops; +extern const struct export_operations isofs_export_ops; diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c new file mode 100644 index 00000000..a048de81 --- /dev/null +++ b/fs/isofs/joliet.c @@ -0,0 +1,69 @@ +/* + * linux/fs/isofs/joliet.c + * + * (C) 1996 Gordon Chaffee + * + * Joliet: Microsoft's Unicode extensions to iso9660 + */ + +#include +#include +#include "isofs.h" + +/* + * Convert Unicode 16 to UTF-8 or ASCII. 
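[Editorial aside, not part of the patch: the uni16_to_x8() loop below depends on the mounted NLS table; as a rough standalone illustration of the same idea, this sketch reads big-endian 16-bit units and emits plain ASCII, substituting '?' for anything it cannot represent. It is a simplification under that assumption, not the NLS-driven conversion itself.]

#include <stdio.h>

/* Illustrative only: big-endian UCS-2 to ASCII, '?' for anything else. */
static int uni16be_to_ascii(char *out, const unsigned char *in, int units)
{
	char *op = out;

	while (units--) {
		unsigned int ch = (in[0] << 8) | in[1];	/* one big-endian unit */

		if (ch == 0)
			break;
		*op++ = (ch < 0x80) ? (char)ch : '?';
		in += 2;
	}
	*op = '\0';
	return op - out;
}

int main(void)
{
	/* "README;1" as Joliet stores it: 16-bit big-endian characters. */
	static const unsigned char name[] = {
		0, 'R', 0, 'E', 0, 'A', 0, 'D', 0, 'M', 0, 'E', 0, ';', 0, '1'
	};
	char out[32];

	uni16be_to_ascii(out, name, sizeof(name) / 2);
	printf("%s\n", out);	/* prints README;1 */
	return 0;
}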
+ */ +static int +uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) +{ + __be16 *ip, ch; + unsigned char *op; + + ip = uni; + op = ascii; + + while ((ch = get_unaligned(ip)) && len) { + int llen; + llen = nls->uni2char(be16_to_cpu(ch), op, NLS_MAX_CHARSET_SIZE); + if (llen > 0) + op += llen; + else + *op++ = '?'; + ip++; + + len--; + } + *op = 0; + return (op - ascii); +} + +int +get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) +{ + unsigned char utf8; + struct nls_table *nls; + unsigned char len = 0; + + utf8 = ISOFS_SB(inode->i_sb)->s_utf8; + nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; + + if (utf8) { + len = utf16s_to_utf8s((const wchar_t *) de->name, + de->name_len[0] >> 1, UTF16_BIG_ENDIAN, + outname, PAGE_SIZE); + } else { + len = uni16_to_x8(outname, (__be16 *) de->name, + de->name_len[0] >> 1, nls); + } + if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) + len -= 2; + + /* + * Windows doesn't like periods at the end of a name, + * so neither do we + */ + while (len >= 2 && (outname[len-1] == '.')) + len--; + + return len; +} diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c new file mode 100644 index 00000000..4fb3e807 --- /dev/null +++ b/fs/isofs/namei.c @@ -0,0 +1,196 @@ +/* + * linux/fs/isofs/namei.c + * + * (C) 1992 Eric Youngdale Modified for ISO 9660 filesystem. + * + * (C) 1991 Linus Torvalds - minix filesystem + */ + +#include +#include "isofs.h" + +/* + * ok, we cannot use strncmp, as the name is not in our data space. + * Thus we'll have to use isofs_match. No big problem. Match also makes + * some sanity tests. + */ +static int +isofs_cmp(struct dentry *dentry, const char *compare, int dlen) +{ + struct qstr qstr; + + if (!compare) + return 1; + + /* check special "." and ".." files */ + if (dlen == 1) { + /* "." */ + if (compare[0] == 0) { + if (!dentry->d_name.len) + return 0; + compare = "."; + } else if (compare[0] == 1) { + compare = ".."; + dlen = 2; + } + } + + qstr.name = compare; + qstr.len = dlen; + return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, + dentry->d_name.len, dentry->d_name.name, &qstr); +} + +/* + * isofs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the inode number of the found entry, or 0 on error. 
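[Editorial aside, not part of the patch: when the scan loop below hits a zero record-length byte it rounds f_pos up to the next 2048-byte directory block and continues there. A couple of hypothetical positions make the masking trick concrete.]

#include <stdio.h>

#define ISO_BLOCK 2048	/* logical block size used by the directory scan */

/* Round a directory position up to the next block boundary. */
static unsigned long next_block(unsigned long f_pos)
{
	return (f_pos + ISO_BLOCK) & ~(unsigned long)(ISO_BLOCK - 1);
}

int main(void)
{
	/* Hypothetical positions where a zero length byte was found. */
	printf("%lu\n", next_block(100));	/* -> 2048 */
	printf("%lu\n", next_block(2047));	/* -> 2048 */
	printf("%lu\n", next_block(2048));	/* -> 4096, always advances */
	return 0;
}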
+ */ +static unsigned long +isofs_find_entry(struct inode *dir, struct dentry *dentry, + unsigned long *block_rv, unsigned long *offset_rv, + char *tmpname, struct iso_directory_record *tmpde) +{ + unsigned long bufsize = ISOFS_BUFFER_SIZE(dir); + unsigned char bufbits = ISOFS_BUFFER_BITS(dir); + unsigned long block, f_pos, offset, block_saved, offset_saved; + struct buffer_head *bh = NULL; + struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); + + if (!ISOFS_I(dir)->i_first_extent) + return 0; + + f_pos = 0; + offset = 0; + block = 0; + + while (f_pos < dir->i_size) { + struct iso_directory_record *de; + int de_len, match, i, dlen; + char *dpnt; + + if (!bh) { + bh = isofs_bread(dir, block); + if (!bh) + return 0; + } + + de = (struct iso_directory_record *) (bh->b_data + offset); + + de_len = *(unsigned char *) de; + if (!de_len) { + brelse(bh); + bh = NULL; + f_pos = (f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); + block = f_pos >> bufbits; + offset = 0; + continue; + } + + block_saved = bh->b_blocknr; + offset_saved = offset; + offset += de_len; + f_pos += de_len; + + /* Make sure we have a full directory entry */ + if (offset >= bufsize) { + int slop = bufsize - offset + de_len; + memcpy(tmpde, de, slop); + offset &= bufsize - 1; + block++; + brelse(bh); + bh = NULL; + if (offset) { + bh = isofs_bread(dir, block); + if (!bh) + return 0; + memcpy((void *) tmpde + slop, bh->b_data, offset); + } + de = tmpde; + } + + dlen = de->name_len[0]; + dpnt = de->name; + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < dlen + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + dir->i_ino); + return 0; + } + + if (sbi->s_rock && + ((i = get_rock_ridge_filename(de, tmpname, dir)))) { + dlen = i; /* possibly -1 */ + dpnt = tmpname; +#ifdef CONFIG_JOLIET + } else if (sbi->s_joliet_level) { + dlen = get_joliet_filename(de, tmpname, dir); + dpnt = tmpname; +#endif + } else if (sbi->s_mapping == 'a') { + dlen = get_acorn_filename(de, tmpname, dir); + dpnt = tmpname; + } else if (sbi->s_mapping == 'n') { + dlen = isofs_name_translate(de, tmpname, dir); + dpnt = tmpname; + } + + /* + * Skip hidden or associated files unless hide or showassoc, + * respectively, is set + */ + match = 0; + if (dlen > 0 && + (!sbi->s_hide || + (!(de->flags[-sbi->s_high_sierra] & 1))) && + (sbi->s_showassoc || + (!(de->flags[-sbi->s_high_sierra] & 4)))) { + match = (isofs_cmp(dentry, dpnt, dlen) == 0); + } + if (match) { + isofs_normalize_block_and_offset(de, + &block_saved, + &offset_saved); + *block_rv = block_saved; + *offset_rv = offset_saved; + brelse(bh); + return 1; + } + } + brelse(bh); + return 0; +} + +struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + int found; + unsigned long uninitialized_var(block); + unsigned long uninitialized_var(offset); + struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); + struct inode *inode; + struct page *page; + + page = alloc_page(GFP_USER); + if (!page) + return ERR_PTR(-ENOMEM); + + mutex_lock(&sbi->s_mutex); + found = isofs_find_entry(dir, dentry, + &block, &offset, + page_address(page), + 1024 + page_address(page)); + __free_page(page); + + inode = NULL; + if (found) { + inode = isofs_iget(dir->i_sb, block, offset); + if (IS_ERR(inode)) { + mutex_unlock(&sbi->s_mutex); + return ERR_CAST(inode); + } + } + mutex_unlock(&sbi->s_mutex); + return d_splice_alias(inode, dentry); +} diff --git a/fs/isofs/rock.c 
b/fs/isofs/rock.c new file mode 100644 index 00000000..f9cd04db --- /dev/null +++ b/fs/isofs/rock.c @@ -0,0 +1,778 @@ +/* + * linux/fs/isofs/rock.c + * + * (C) 1992, 1993 Eric Youngdale + * + * Rock Ridge Extensions to iso9660 + */ + +#include +#include + +#include "isofs.h" +#include "rock.h" + +/* + * These functions are designed to read the system areas of a directory record + * and extract relevant information. There are different functions provided + * depending upon what information we need at the time. One function fills + * out an inode structure, a second one extracts a filename, a third one + * returns a symbolic link name, and a fourth one returns the extent number + * for the file. + */ + +#define SIG(A,B) ((A) | ((B) << 8)) /* isonum_721() */ + +struct rock_state { + void *buffer; + unsigned char *chr; + int len; + int cont_size; + int cont_extent; + int cont_offset; + struct inode *inode; +}; + +/* + * This is a way of ensuring that we have something in the system + * use fields that is compatible with Rock Ridge. Return zero on success. + */ + +static int check_sp(struct rock_ridge *rr, struct inode *inode) +{ + if (rr->u.SP.magic[0] != 0xbe) + return -1; + if (rr->u.SP.magic[1] != 0xef) + return -1; + ISOFS_SB(inode->i_sb)->s_rock_offset = rr->u.SP.skip; + return 0; +} + +static void setup_rock_ridge(struct iso_directory_record *de, + struct inode *inode, struct rock_state *rs) +{ + rs->len = sizeof(struct iso_directory_record) + de->name_len[0]; + if (rs->len & 1) + (rs->len)++; + rs->chr = (unsigned char *)de + rs->len; + rs->len = *((unsigned char *)de) - rs->len; + if (rs->len < 0) + rs->len = 0; + + if (ISOFS_SB(inode->i_sb)->s_rock_offset != -1) { + rs->len -= ISOFS_SB(inode->i_sb)->s_rock_offset; + rs->chr += ISOFS_SB(inode->i_sb)->s_rock_offset; + if (rs->len < 0) + rs->len = 0; + } +} + +static void init_rock_state(struct rock_state *rs, struct inode *inode) +{ + memset(rs, 0, sizeof(*rs)); + rs->inode = inode; +} + +/* + * Returns 0 if the caller should continue scanning, 1 if the scan must end + * and -ve on error. + */ +static int rock_continue(struct rock_state *rs) +{ + int ret = 1; + int blocksize = 1 << rs->inode->i_blkbits; + const int min_de_size = offsetof(struct rock_ridge, u); + + kfree(rs->buffer); + rs->buffer = NULL; + + if ((unsigned)rs->cont_offset > blocksize - min_de_size || + (unsigned)rs->cont_size > blocksize || + (unsigned)(rs->cont_offset + rs->cont_size) > blocksize) { + printk(KERN_NOTICE "rock: corrupted directory entry. " + "extent=%d, offset=%d, size=%d\n", + rs->cont_extent, rs->cont_offset, rs->cont_size); + ret = -EIO; + goto out; + } + + if (rs->cont_extent) { + struct buffer_head *bh; + + rs->buffer = kmalloc(rs->cont_size, GFP_KERNEL); + if (!rs->buffer) { + ret = -ENOMEM; + goto out; + } + ret = -EIO; + bh = sb_bread(rs->inode->i_sb, rs->cont_extent); + if (bh) { + memcpy(rs->buffer, bh->b_data + rs->cont_offset, + rs->cont_size); + put_bh(bh); + rs->chr = rs->buffer; + rs->len = rs->cont_size; + rs->cont_extent = 0; + rs->cont_size = 0; + rs->cont_offset = 0; + return 0; + } + printk("Unable to read rock-ridge attributes\n"); + } +out: + kfree(rs->buffer); + rs->buffer = NULL; + return ret; +} + +/* + * We think there's a record of type `sig' at rs->chr. Parse the signature + * and make sure that there's really room for a record of that type. 
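[Editorial aside, not part of the patch: the SIG(A,B) macro above packs a two-character SUSP signature in the same byte order that isonum_721() reads it, so a signature pulled from the system-use area can be compared directly. A tiny sketch with hypothetical record bytes:]

#include <stdio.h>

/* Two-character SUSP signature packed the same way isonum_721() reads it. */
#define SIG(A, B) ((A) | ((B) << 8))

int main(void)
{
	/* A hypothetical system-use area starting with a "PX" record. */
	const unsigned char su[] = { 'P', 'X', 36, 1 /* ... */ };
	int sig = su[0] | (su[1] << 8);		/* little-endian, like isonum_721() */

	if (sig == SIG('P', 'X'))
		printf("PX record, len %d\n", su[2]);
	return 0;
}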
+ */ +static int rock_check_overflow(struct rock_state *rs, int sig) +{ + int len; + + switch (sig) { + case SIG('S', 'P'): + len = sizeof(struct SU_SP_s); + break; + case SIG('C', 'E'): + len = sizeof(struct SU_CE_s); + break; + case SIG('E', 'R'): + len = sizeof(struct SU_ER_s); + break; + case SIG('R', 'R'): + len = sizeof(struct RR_RR_s); + break; + case SIG('P', 'X'): + len = sizeof(struct RR_PX_s); + break; + case SIG('P', 'N'): + len = sizeof(struct RR_PN_s); + break; + case SIG('S', 'L'): + len = sizeof(struct RR_SL_s); + break; + case SIG('N', 'M'): + len = sizeof(struct RR_NM_s); + break; + case SIG('C', 'L'): + len = sizeof(struct RR_CL_s); + break; + case SIG('P', 'L'): + len = sizeof(struct RR_PL_s); + break; + case SIG('T', 'F'): + len = sizeof(struct RR_TF_s); + break; + case SIG('Z', 'F'): + len = sizeof(struct RR_ZF_s); + break; + default: + len = 0; + break; + } + len += offsetof(struct rock_ridge, u); + if (len > rs->len) { + printk(KERN_NOTICE "rock: directory entry would overflow " + "storage\n"); + printk(KERN_NOTICE "rock: sig=0x%02x, size=%d, remaining=%d\n", + sig, len, rs->len); + return -EIO; + } + return 0; +} + +/* + * return length of name field; 0: not found, -1: to be ignored + */ +int get_rock_ridge_filename(struct iso_directory_record *de, + char *retname, struct inode *inode) +{ + struct rock_state rs; + struct rock_ridge *rr; + int sig; + int retnamlen = 0; + int truncate = 0; + int ret = 0; + + if (!ISOFS_SB(inode->i_sb)->s_rock) + return 0; + *retname = 0; + + init_rock_state(&rs, inode); + setup_rock_ridge(de, inode, &rs); +repeat: + + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto eio; + rs.chr += rr->len; + rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ + if (rs.len < 0) + goto out; /* Something got screwed up here */ + + switch (sig) { + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & RR_NM) == 0) + goto out; + break; + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('C', 'E'): + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + break; + case SIG('N', 'M'): + if (truncate) + break; + if (rr->len < 5) + break; + /* + * If the flags are 2 or 4, this indicates '.' or '..'. + * We don't want to do anything with this, because it + * screws up the code that calls us. We don't really + * care anyways, since we can just use the non-RR + * name. 
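[Editorial aside, not part of the patch: the NM handling further down concatenates name text from successive NM records (bit 0 of the flags meaning "continued") and truncates once the assembled name would reach 254 bytes. A toy version over two hypothetical fragments:]

#include <stdio.h>
#include <string.h>

/* Illustrative only: an NM payload is rr->len - 5 bytes of name text,
 * and bit 0 of its flags means "continued in the next NM record". */
struct nm_frag {
	unsigned char flags;
	const char *text;
};

int main(void)
{
	/* Hypothetical long name split across two NM records. */
	struct nm_frag frags[] = { { 1, "a_rather_long_" }, { 0, "file_name.txt" } };
	char name[255] = "";
	size_t i;

	for (i = 0; i < sizeof(frags) / sizeof(frags[0]); i++) {
		if (strlen(name) + strlen(frags[i].text) >= 254)
			break;			/* truncate, as the kernel loop does */
		strcat(name, frags[i].text);
		if (!(frags[i].flags & 1))
			break;			/* last fragment of the name */
	}
	printf("%s\n", name);			/* a_rather_long_file_name.txt */
	return 0;
}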
+ */ + if (rr->u.NM.flags & 6) + break; + + if (rr->u.NM.flags & ~1) { + printk("Unsupported NM flag settings (%d)\n", + rr->u.NM.flags); + break; + } + if ((strlen(retname) + rr->len - 5) >= 254) { + truncate = 1; + break; + } + strncat(retname, rr->u.NM.name, rr->len - 5); + retnamlen += rr->len - 5; + break; + case SIG('R', 'E'): + kfree(rs.buffer); + return -1; + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret == 1) + return retnamlen; /* If 0, this file did not have a NM field */ +out: + kfree(rs.buffer); + return ret; +eio: + ret = -EIO; + goto out; +} + +static int +parse_rock_ridge_inode_internal(struct iso_directory_record *de, + struct inode *inode, int regard_xa) +{ + int symlink_len = 0; + int cnt, sig; + struct inode *reloc; + struct rock_ridge *rr; + int rootflag; + struct rock_state rs; + int ret = 0; + + if (!ISOFS_SB(inode->i_sb)->s_rock) + return 0; + + init_rock_state(&rs, inode); + setup_rock_ridge(de, inode, &rs); + if (regard_xa) { + rs.chr += 14; + rs.len -= 14; + if (rs.len < 0) + rs.len = 0; + } + +repeat: + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto eio; + rs.chr += rr->len; + rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ + if (rs.len < 0) + goto out; /* Something got screwed up here */ + + switch (sig) { +#ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & + (RR_PX | RR_TF | RR_SL | RR_CL)) == 0) + goto out; + break; +#endif + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('C', 'E'): + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + break; + case SIG('E', 'R'): + ISOFS_SB(inode->i_sb)->s_rock = 1; + printk(KERN_DEBUG "ISO 9660 Extensions: "); + { + int p; + for (p = 0; p < rr->u.ER.len_id; p++) + printk("%c", rr->u.ER.data[p]); + } + printk("\n"); + break; + case SIG('P', 'X'): + inode->i_mode = isonum_733(rr->u.PX.mode); + inode->i_nlink = isonum_733(rr->u.PX.n_links); + inode->i_uid = isonum_733(rr->u.PX.uid); + inode->i_gid = isonum_733(rr->u.PX.gid); + break; + case SIG('P', 'N'): + { + int high, low; + high = isonum_733(rr->u.PN.dev_high); + low = isonum_733(rr->u.PN.dev_low); + /* + * The Rock Ridge standard specifies that if + * sizeof(dev_t) <= 4, then the high field is + * unused, and the device number is completely + * stored in the low field. Some writers may + * ignore this subtlety, + * and as a result we test to see if the entire + * device number is + * stored in the low field, and use that. + */ + if ((low & ~0xff) && high == 0) { + inode->i_rdev = + MKDEV(low >> 8, low & 0xff); + } else { + inode->i_rdev = + MKDEV(high, low); + } + } + break; + case SIG('T', 'F'): + /* + * Some RRIP writers incorrectly place ctime in the + * TF_CREATE field. Try to handle this correctly for + * either case. 
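[Editorial aside, not part of the patch: the TF case below walks rr->u.TF.times[] with a running index because the record stores only the timestamps whose flag bits are set, in flag-bit order. The sketch uses placeholder strings for the stamps and the TF_* values defined in rock.h later in this patch.]

#include <stdio.h>

/* TF flag bits, as defined in rock.h below. */
#define TF_CREATE	1
#define TF_MODIFY	2
#define TF_ACCESS	4
#define TF_ATTRIBUTES	8

int main(void)
{
	/* Hypothetical TF record carrying only modify and access stamps:
	 * the stamps array then has exactly two entries, in flag-bit order. */
	unsigned char flags = TF_MODIFY | TF_ACCESS;
	const char *stamps[] = { "stamp0", "stamp1" };
	int cnt = 0;

	if (flags & TF_CREATE)
		printf("create = %s\n", stamps[cnt++]);
	if (flags & TF_MODIFY)
		printf("modify = %s\n", stamps[cnt++]);		/* stamp0 */
	if (flags & TF_ACCESS)
		printf("access = %s\n", stamps[cnt++]);		/* stamp1 */
	if (flags & TF_ATTRIBUTES)
		printf("attrib = %s\n", stamps[cnt++]);
	return 0;
}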
+ */ + /* Rock ridge never appears on a High Sierra disk */ + cnt = 0; + if (rr->u.TF.flags & TF_CREATE) { + inode->i_ctime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_ctime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_MODIFY) { + inode->i_mtime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_mtime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_ACCESS) { + inode->i_atime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_atime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_ATTRIBUTES) { + inode->i_ctime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_ctime.tv_nsec = 0; + } + break; + case SIG('S', 'L'): + { + int slen; + struct SL_component *slp; + struct SL_component *oldslp; + slen = rr->len - 5; + slp = &rr->u.SL.link; + inode->i_size = symlink_len; + while (slen > 1) { + rootflag = 0; + switch (slp->flags & ~1) { + case 0: + inode->i_size += + slp->len; + break; + case 2: + inode->i_size += 1; + break; + case 4: + inode->i_size += 2; + break; + case 8: + rootflag = 1; + inode->i_size += 1; + break; + default: + printk("Symlink component flag " + "not implemented\n"); + } + slen -= slp->len + 2; + oldslp = slp; + slp = (struct SL_component *) + (((char *)slp) + slp->len + 2); + + if (slen < 2) { + if (((rr->u.SL. + flags & 1) != 0) + && + ((oldslp-> + flags & 1) == 0)) + inode->i_size += + 1; + break; + } + + /* + * If this component record isn't + * continued, then append a '/'. + */ + if (!rootflag + && (oldslp->flags & 1) == 0) + inode->i_size += 1; + } + } + symlink_len = inode->i_size; + break; + case SIG('R', 'E'): + printk(KERN_WARNING "Attempt to read inode for " + "relocated directory\n"); + goto out; + case SIG('C', 'L'): + ISOFS_I(inode)->i_first_extent = + isonum_733(rr->u.CL.location); + reloc = + isofs_iget(inode->i_sb, + ISOFS_I(inode)->i_first_extent, + 0); + if (IS_ERR(reloc)) { + ret = PTR_ERR(reloc); + goto out; + } + inode->i_mode = reloc->i_mode; + inode->i_nlink = reloc->i_nlink; + inode->i_uid = reloc->i_uid; + inode->i_gid = reloc->i_gid; + inode->i_rdev = reloc->i_rdev; + inode->i_size = reloc->i_size; + inode->i_blocks = reloc->i_blocks; + inode->i_atime = reloc->i_atime; + inode->i_ctime = reloc->i_ctime; + inode->i_mtime = reloc->i_mtime; + iput(reloc); + break; +#ifdef CONFIG_ZISOFS + case SIG('Z', 'F'): { + int algo; + + if (ISOFS_SB(inode->i_sb)->s_nocompress) + break; + algo = isonum_721(rr->u.ZF.algorithm); + if (algo == SIG('p', 'z')) { + int block_shift = + isonum_711(&rr->u.ZF.parms[1]); + if (block_shift > 17) { + printk(KERN_WARNING "isofs: " + "Can't handle ZF block " + "size of 2^%d\n", + block_shift); + } else { + /* + * Note: we don't change + * i_blocks here + */ + ISOFS_I(inode)->i_file_format = + isofs_file_compressed; + /* + * Parameters to compression + * algorithm (header size, + * block size) + */ + ISOFS_I(inode)->i_format_parm[0] = + isonum_711(&rr->u.ZF.parms[0]); + ISOFS_I(inode)->i_format_parm[1] = + isonum_711(&rr->u.ZF.parms[1]); + inode->i_size = + isonum_733(rr->u.ZF. 
+ real_size); + } + } else { + printk(KERN_WARNING + "isofs: Unknown ZF compression " + "algorithm: %c%c\n", + rr->u.ZF.algorithm[0], + rr->u.ZF.algorithm[1]); + } + break; + } +#endif + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret == 1) + ret = 0; +out: + kfree(rs.buffer); + return ret; +eio: + ret = -EIO; + goto out; +} + +static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) +{ + int slen; + int rootflag; + struct SL_component *oldslp; + struct SL_component *slp; + slen = rr->len - 5; + slp = &rr->u.SL.link; + while (slen > 1) { + rootflag = 0; + switch (slp->flags & ~1) { + case 0: + if (slp->len > plimit - rpnt) + return NULL; + memcpy(rpnt, slp->text, slp->len); + rpnt += slp->len; + break; + case 2: + if (rpnt >= plimit) + return NULL; + *rpnt++ = '.'; + break; + case 4: + if (2 > plimit - rpnt) + return NULL; + *rpnt++ = '.'; + *rpnt++ = '.'; + break; + case 8: + if (rpnt >= plimit) + return NULL; + rootflag = 1; + *rpnt++ = '/'; + break; + default: + printk("Symlink component flag not implemented (%d)\n", + slp->flags); + } + slen -= slp->len + 2; + oldslp = slp; + slp = (struct SL_component *)((char *)slp + slp->len + 2); + + if (slen < 2) { + /* + * If there is another SL record, and this component + * record isn't continued, then add a slash. + */ + if ((!rootflag) && (rr->u.SL.flags & 1) && + !(oldslp->flags & 1)) { + if (rpnt >= plimit) + return NULL; + *rpnt++ = '/'; + } + break; + } + + /* + * If this component record isn't continued, then append a '/'. + */ + if (!rootflag && !(oldslp->flags & 1)) { + if (rpnt >= plimit) + return NULL; + *rpnt++ = '/'; + } + } + return rpnt; +} + +int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) +{ + int result = parse_rock_ridge_inode_internal(de, inode, 0); + + /* + * if rockridge flag was reset and we didn't look for attributes + * behind eventual XA attributes, have a look there + */ + if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) + && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { + result = parse_rock_ridge_inode_internal(de, inode, 14); + } + return result; +} + +/* + * readpage() for symlinks: reads symlink contents into the page and either + * makes it uptodate and returns 0 or returns error (-EIO) + */ +static int rock_ridge_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct iso_inode_info *ei = ISOFS_I(inode); + struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); + char *link = kmap(page); + unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); + struct buffer_head *bh; + char *rpnt = link; + unsigned char *pnt; + struct iso_directory_record *raw_de; + unsigned long block, offset; + int sig; + struct rock_ridge *rr; + struct rock_state rs; + int ret; + + if (!sbi->s_rock) + goto error; + + init_rock_state(&rs, inode); + block = ei->i_iget5_block; + mutex_lock(&sbi->s_mutex); + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out_noread; + + offset = ei->i_iget5_offset; + pnt = (unsigned char *)bh->b_data + offset; + + raw_de = (struct iso_directory_record *)pnt; + + /* + * If we go past the end of the buffer, there is some sort of error. + */ + if (offset + *pnt > bufsize) + goto out_bad_span; + + /* + * Now test for possible Rock Ridge extensions which will override + * some of these numbers in the inode structure. 
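[Editorial aside, not part of the patch: get_symlink_chunk() above rebuilds a symlink target from SL components, where flag 8 marks the root, flag bit 0 marks a continued component, and a '/' is inserted between components otherwise. A simplified single-record sketch with a hypothetical target:]

#include <stdio.h>
#include <string.h>

/* Illustrative only: one SL component; bit 0 of flags means "continued",
 * 2 means ".", 4 means "..", 8 means the component is the root ("/"). */
struct sl_comp {
	unsigned char flags;
	const char *text;
};

int main(void)
{
	/* Hypothetical target "/usr/share/doc": a root plus three names. */
	struct sl_comp comps[] = {
		{ 8, "" }, { 0, "usr" }, { 0, "share" }, { 0, "doc" }
	};
	char link[64] = "";
	size_t i, n = sizeof(comps) / sizeof(comps[0]);

	for (i = 0; i < n; i++) {
		if (comps[i].flags & 8)
			strcat(link, "/");		/* root component */
		else
			strcat(link, comps[i].text);
		/* Separator unless the component is a root, is continued,
		 * or is the last one (simplified single-SL-record case). */
		if (!(comps[i].flags & 8) && !(comps[i].flags & 1) && i + 1 < n)
			strcat(link, "/");
	}
	printf("%s\n", link);			/* /usr/share/doc */
	return 0;
}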
+ */ + + setup_rock_ridge(raw_de, inode, &rs); + +repeat: + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto out; + rs.chr += rr->len; + rs.len -= rr->len; + if (rs.len < 0) + goto out; /* corrupted isofs */ + + switch (sig) { + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & RR_SL) == 0) + goto out; + break; + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('S', 'L'): + rpnt = get_symlink_chunk(rpnt, rr, + link + (PAGE_SIZE - 1)); + if (rpnt == NULL) + goto out; + break; + case SIG('C', 'E'): + /* This tells is if there is a continuation record */ + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret < 0) + goto fail; + + if (rpnt == link) + goto fail; + brelse(bh); + *rpnt = '\0'; + mutex_unlock(&sbi->s_mutex); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + + /* error exit from macro */ +out: + kfree(rs.buffer); + goto fail; +out_noread: + printk("unable to read i-node block"); + goto fail; +out_bad_span: + printk("symlink spans iso9660 blocks\n"); +fail: + brelse(bh); + mutex_unlock(&sbi->s_mutex); +error: + SetPageError(page); + kunmap(page); + unlock_page(page); + return -EIO; +} + +const struct address_space_operations isofs_symlink_aops = { + .readpage = rock_ridge_symlink_readpage +}; diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h new file mode 100644 index 00000000..ed09e2b0 --- /dev/null +++ b/fs/isofs/rock.h @@ -0,0 +1,122 @@ +/* + * These structs are used by the system-use-sharing protocol, in which the + * Rock Ridge extensions are embedded. It is quite possible that other + * extensions are present on the disk, and this is fine as long as they + * all use SUSP + */ + +struct SU_SP_s { + unsigned char magic[2]; + unsigned char skip; +} __attribute__ ((packed)); + +struct SU_CE_s { + char extent[8]; + char offset[8]; + char size[8]; +}; + +struct SU_ER_s { + unsigned char len_id; + unsigned char len_des; + unsigned char len_src; + unsigned char ext_ver; + char data[0]; +} __attribute__ ((packed)); + +struct RR_RR_s { + char flags[1]; +} __attribute__ ((packed)); + +struct RR_PX_s { + char mode[8]; + char n_links[8]; + char uid[8]; + char gid[8]; +}; + +struct RR_PN_s { + char dev_high[8]; + char dev_low[8]; +}; + +struct SL_component { + unsigned char flags; + unsigned char len; + char text[0]; +} __attribute__ ((packed)); + +struct RR_SL_s { + unsigned char flags; + struct SL_component link; +} __attribute__ ((packed)); + +struct RR_NM_s { + unsigned char flags; + char name[0]; +} __attribute__ ((packed)); + +struct RR_CL_s { + char location[8]; +}; + +struct RR_PL_s { + char location[8]; +}; + +struct stamp { + char time[7]; +} __attribute__ ((packed)); + +struct RR_TF_s { + char flags; + struct stamp times[0]; /* Variable number of these beasts */ +} __attribute__ ((packed)); + +/* Linux-specific extension for transparent decompression */ +struct RR_ZF_s { + char algorithm[2]; + char parms[2]; + char real_size[8]; +}; + +/* + * These are the bits and their meanings for flags in the TF structure. 
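[Editorial aside, not part of the patch, placed before the flag definitions: the 8-byte char arrays in the structures above (PX mode and link counts, PN device numbers, CE extent/offset/size, CL/PL locations, ZF real_size) hold ISO 9660 "both-byte-order" values, and isonum_733() in isofs.h simply takes the little-endian half. A quick sketch of that decoding on a hypothetical field:]

#include <stdio.h>

/* ISO 9660 7.3.3 "both-byte-order" field: 32-bit LE followed by 32-bit BE.
 * Like isonum_733(), take the little-endian half and ignore the rest. */
static unsigned int both_endian_32(const unsigned char *p)
{
	return p[0] | (p[1] << 8) | (p[2] << 16) | ((unsigned int)p[3] << 24);
}

int main(void)
{
	/* Hypothetical field encoding the value 2000. */
	const unsigned char field[8] = {
		0xd0, 0x07, 0x00, 0x00,		/* little-endian 2000 */
		0x00, 0x00, 0x07, 0xd0,		/* the same value, big-endian */
	};

	printf("%u\n", both_endian_32(field));	/* 2000 */
	return 0;
}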
+ */ +#define TF_CREATE 1 +#define TF_MODIFY 2 +#define TF_ACCESS 4 +#define TF_ATTRIBUTES 8 +#define TF_BACKUP 16 +#define TF_EXPIRATION 32 +#define TF_EFFECTIVE 64 +#define TF_LONG_FORM 128 + +struct rock_ridge { + char signature[2]; + unsigned char len; + unsigned char version; + union { + struct SU_SP_s SP; + struct SU_CE_s CE; + struct SU_ER_s ER; + struct RR_RR_s RR; + struct RR_PX_s PX; + struct RR_PN_s PN; + struct RR_SL_s SL; + struct RR_NM_s NM; + struct RR_CL_s CL; + struct RR_PL_s PL; + struct RR_TF_s TF; + struct RR_ZF_s ZF; + } u; +}; + +#define RR_PX 1 /* POSIX attributes */ +#define RR_PN 2 /* POSIX devices */ +#define RR_SL 4 /* Symbolic link */ +#define RR_NM 8 /* Alternate Name */ +#define RR_CL 16 /* Child link */ +#define RR_PL 32 /* Parent link */ +#define RR_RE 64 /* Relocation directory */ +#define RR_TF 128 /* Timestamps */ diff --git a/fs/isofs/util.c b/fs/isofs/util.c new file mode 100644 index 00000000..01e1ee7a --- /dev/null +++ b/fs/isofs/util.c @@ -0,0 +1,80 @@ +/* + * linux/fs/isofs/util.c + */ + +#include "isofs.h" + +/* + * We have to convert from a MM/DD/YY format to the Unix ctime format. + * We have to take into account leap years and all of that good stuff. + * Unfortunately, the kernel does not have the information on hand to + * take into account daylight savings time, but it shouldn't matter. + * The time stored should be localtime (with or without DST in effect), + * and the timezone offset should hold the offset required to get back + * to GMT. Thus we should always be correct. + */ + +int iso_date(char * p, int flag) +{ + int year, month, day, hour, minute, second, tz; + int crtime, days, i; + + year = p[0] - 70; + month = p[1]; + day = p[2]; + hour = p[3]; + minute = p[4]; + second = p[5]; + if (flag == 0) tz = p[6]; /* High sierra has no time zone */ + else tz = 0; + + if (year < 0) { + crtime = 0; + } else { + int monlen[12] = {31,28,31,30,31,30,31,31,30,31,30,31}; + + days = year * 365; + if (year > 2) + days += (year+1) / 4; + for (i = 1; i < month; i++) + days += monlen[i-1]; + if (((year+2) % 4) == 0 && month > 2) + days++; + days += day - 1; + crtime = ((((days * 24) + hour) * 60 + minute) * 60) + + second; + + /* sign extend */ + if (tz & 0x80) + tz |= (-1 << 8); + + /* + * The timezone offset is unreliable on some disks, + * so we make a sanity check. In no case is it ever + * more than 13 hours from GMT, which is 52*15min. + * The time is always stored in localtime with the + * timezone offset being what get added to GMT to + * get to localtime. Thus we need to subtract the offset + * to get to true GMT, which is what we store the time + * as internally. On the local system, the user may set + * their timezone any way they wish, of course, so GMT + * gets converted back to localtime on the receiving + * system. + * + * NOTE: mkisofs in versions prior to mkisofs-1.10 had + * the sign wrong on the timezone offset. This has now + * been corrected there too, but if you are getting screwy + * results this may be the explanation. If enough people + * complain, a user configuration option could be added + * to add the timezone offset in with the wrong sign + * for 'compatibility' with older discs, but I cannot see how + * it will matter that much. + * + * Thanks to kuhlmav@elec.canterbury.ac.nz (Volker Kuhlmann) + * for pointing out the sign error. 
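[Editorial aside, not part of the patch: running the arithmetic above on one concrete, hypothetical stamp may help. The sketch mirrors the same day/leap-year computation for 1996-12-25 04:40:36 with a zero timezone offset, so the adjustment that follows in iso_date() would leave the result unchanged.]

#include <stdio.h>

int main(void)
{
	/* Hypothetical 7-byte stamp: 1996-12-25 04:40:36, offset 0. */
	int p[7] = { 96, 12, 25, 4, 40, 36, 0 };
	int monlen[12] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
	int year = p[0] - 70, days, i, crtime;

	days = year * 365;
	if (year > 2)
		days += (year + 1) / 4;		/* leap days since 1970 */
	for (i = 1; i < p[1]; i++)
		days += monlen[i - 1];
	if (((year + 2) % 4) == 0 && p[1] > 2)
		days++;				/* 1996 is itself a leap year */
	days += p[2] - 1;

	crtime = ((days * 24 + p[3]) * 60 + p[4]) * 60 + p[5];
	printf("%d\n", crtime);			/* 851488836 */
	return 0;
}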
+ */ + if (-52 <= tz && tz <= 52) + crtime -= tz * 15 * 60; + } + return crtime; +} diff --git a/fs/isofs/zisofs.h b/fs/isofs/zisofs.h new file mode 100644 index 00000000..27379570 --- /dev/null +++ b/fs/isofs/zisofs.h @@ -0,0 +1,21 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2001 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * Prototypes for functions exported from the compressed isofs subsystem + */ + +#ifdef CONFIG_ZISOFS +extern const struct address_space_operations zisofs_aops; +extern int __init zisofs_init(void); +extern void zisofs_cleanup(void); +#endif diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig new file mode 100644 index 00000000..4e28beee --- /dev/null +++ b/fs/jbd/Kconfig @@ -0,0 +1,30 @@ +config JBD + tristate + help + This is a generic journalling layer for block devices. It is + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block + devices such as RAID or LVM. + + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. + + To compile this device as a module, choose M here: the module will be + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. + +config JBD_DEBUG + bool "JBD (ext3) debugging support" + depends on JBD && DEBUG_FS + help + If you are using the ext3 journaled file system (or potentially any + other file system/device using JBD), this option allows you to + enable debugging output while the system is running, in order to + help track down any problems you are having. By default the + debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a + number between 1 and 5, the higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd/jbd-debug". diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile new file mode 100644 index 00000000..54aca486 --- /dev/null +++ b/fs/jbd/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux journaling routines. +# + +obj-$(CONFIG_JBD) += jbd.o + +jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c new file mode 100644 index 00000000..e4b87bc1 --- /dev/null +++ b/fs/jbd/checkpoint.c @@ -0,0 +1,757 @@ +/* + * linux/fs/jbd/checkpoint.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. 
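[Editorial aside, not part of the patch: the checkpoint lists managed below are circular doubly-linked lists threaded through journal_heads, with the transaction holding a pointer to one element as the list head. This standalone sketch mirrors the shape of the __buffer_unlink_first() helper that follows; struct node stands in for a journal_head.]

#include <assert.h>
#include <stddef.h>

/* Illustrative stand-in for a journal_head on a checkpoint list. */
struct node {
	struct node *next, *prev;
};

/* Unlink n from a circular list whose head pointer is *head. */
static void unlink_node(struct node **head, struct node *n)
{
	n->next->prev = n->prev;
	n->prev->next = n->next;
	if (*head == n) {
		*head = n->next;
		if (*head == n)		/* n was the only element */
			*head = NULL;
	}
}

int main(void)
{
	struct node a, b, *head = &a;

	/* Two-element circular list: a <-> b. */
	a.next = a.prev = &b;
	b.next = b.prev = &a;

	unlink_node(&head, &a);
	assert(head == &b && b.next == &b && b.prev == &b);

	unlink_node(&head, &b);
	assert(head == NULL);
	return 0;
}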
+ */ + +#include +#include +#include +#include +#include + +/* + * Unlink a buffer from a transaction checkpoint list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink_first(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) { + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; +} + +/* + * Try to release a checkpointed buffer from its transaction. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * + * Requires j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __try_to_free_cp_buf(struct journal_head *jh) +{ + int ret = 0; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + ret = __journal_remove_checkpoint(jh) + 1; + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + BUFFER_TRACE(bh, "release"); + __brelse(bh); + } else { + jbd_unlock_bh_state(bh); + } + return ret; +} + +/* + * __log_wait_for_space: wait until there is space in the journal. + * + * Called under j-state_lock *only*. It will be unlocked if we have to wait + * for a checkpoint to free up some space in the log. + */ +void __log_wait_for_space(journal_t *journal) +{ + int nblocks, space_left; + assert_spin_locked(&journal->j_state_lock); + + nblocks = jbd_space_needed(journal); + while (__log_space_left(journal) < nblocks) { + if (journal->j_flags & JFS_ABORT) + return; + spin_unlock(&journal->j_state_lock); + mutex_lock(&journal->j_checkpoint_mutex); + + /* + * Test again, another process may have checkpointed while we + * were waiting for the checkpoint lock. If there are no + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. 
+ */ + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + nblocks = jbd_space_needed(journal); + space_left = __log_space_left(journal); + if (space_left < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; + spin_unlock(&journal->j_list_lock); + spin_unlock(&journal->j_state_lock); + if (chkpt) { + log_do_checkpoint(journal); + } else if (cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + log_wait_commit(journal, tid); + } else { + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space\n", __func__); + WARN_ON(1); + journal_abort(journal, 0); + } + spin_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); + } + mutex_unlock(&journal->j_checkpoint_mutex); + } +} + +/* + * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. + * The caller must restart a list walk. Wait for someone else to run + * jbd_unlock_bh_state(). + */ +static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) + __releases(journal->j_list_lock) +{ + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_lock_bh_state(bh); + jbd_unlock_bh_state(bh); + put_bh(bh); +} + +/* + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. + * + * Return 0 on success, and return <0 if some buffers have failed + * to be written out. + * + * Called with j_list_lock held. + */ +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) +{ + struct journal_head *jh; + struct buffer_head *bh; + tid_t this_tid; + int released = 0; + int ret = 0; + + this_tid = transaction->t_tid; +restart: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return ret; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + + /* + * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list + */ + released = __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + } + + return ret; +} + +#define NR_BATCH 64 + +static void +__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) +{ + int i; + + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(bhs[i], WRITE); + + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = bhs[i]; + clear_buffer_jwrite(bh); + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + *batch_count = 0; +} + +/* + * Try to flush one buffer from the checkpoint list to disk. 
+ * + * Return 1 if something happened which requires us to abort the current + * scan of the checkpoint list. Return <0 if the buffer has failed to + * be written out. + * + * Called with j_list_lock held and drops it if 1 is returned + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __process_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count) +{ + struct buffer_head *bh = jh2bh(jh); + int ret = 0; + + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + ret = 1; + } else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + ret = 1; + } else if (!buffer_dirty(bh)) { + ret = 1; + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + } else { + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal lock. + * We cannot afford to let the transaction logic start + * messing around with this buffer before we write it to + * disk, as that would break recoverability. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + set_buffer_jwrite(bh); + bhs[*batch_count] = bh; + __buffer_relink_io(jh); + jbd_unlock_bh_state(bh); + (*batch_count)++; + if (*batch_count == NR_BATCH) { + spin_unlock(&journal->j_list_lock); + __flush_batch(journal, bhs, batch_count); + ret = 1; + } + } + return ret; +} + +/* + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. + * + * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. + */ +int log_do_checkpoint(journal_t *journal) +{ + transaction_t *transaction; + tid_t this_tid; + int result; + + jbd_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = cleanup_journal_tail(journal); + jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; + + /* + * OK, we need to start writing disk blocks. Take one transaction + * and write it. + */ + result = 0; + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; +restart: + /* + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). 
+ */ + if (journal->j_checkpoint_transactions == transaction && + transaction->t_tid == this_tid) { + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + struct journal_head *jh; + int retry = 0, err; + + while (!retry && transaction->t_checkpoint_list) { + struct buffer_head *bh; + + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + retry = 1; + break; + } + retry = __process_buffer(journal, jh, bhs,&batch_count); + if (retry < 0 && !result) + result = retry; + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { + spin_unlock(&journal->j_list_lock); + retry = 1; + break; + } + } + + if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } + __flush_batch(journal, bhs, &batch_count); + } + + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } + /* + * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one + */ + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; + } +out: + spin_unlock(&journal->j_list_lock); + if (result < 0) + journal_abort(journal, result); + else + result = cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; +} + +/* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * Called with the journal lock held. + * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts. Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. + */ + +int cleanup_journal_tail(journal_t *journal) +{ + transaction_t * transaction; + tid_t first_tid; + unsigned int blocknr, freed; + + if (is_journal_aborted(journal)) + return 1; + + /* OK, work out the oldest transaction remaining in the log, and + * the log block it starts at. + * + * If the log is now empty, we need to work out which is the + * next transaction ID we will write, and where it will + * start. */ + + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = journal->j_head; + } else { + first_tid = journal->j_transaction_sequence; + blocknr = journal->j_head; + } + spin_unlock(&journal->j_list_lock); + J_ASSERT(blocknr != 0); + + /* If the oldest pinned transaction is at the tail of the log + already then there's not much we can do right now. */ + if (journal->j_tail_sequence == first_tid) { + spin_unlock(&journal->j_state_lock); + return 1; + } + + /* OK, update the superblock to recover the freed space. + * Physical blocks come first: have we wrapped beyond the end of + * the log? 
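[Editorial aside, not part of the patch: the computation just below relies on unsigned wrap-around when the new tail has passed the end of the log. With hypothetical geometry (usable blocks j_first..j_last-1, old tail 900, new tail 50) it recovers 149 blocks:]

#include <stdio.h>

int main(void)
{
	/* Hypothetical journal geometry; the real fields come from the
	 * journal structure, not from these constants. */
	unsigned int j_first = 1, j_last = 1000, j_tail = 900, blocknr = 50;
	unsigned int freed;

	freed = blocknr - j_tail;		/* wraps as unsigned arithmetic */
	if (blocknr < j_tail)
		freed = freed + j_last - j_first;

	printf("%u\n", freed);			/* 149 blocks recovered */
	return 0;
}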
*/ + freed = blocknr - journal->j_tail; + if (blocknr < journal->j_tail) + freed = freed + journal->j_last - journal->j_first; + + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %u), " + "freeing %u\n", + journal->j_tail_sequence, first_tid, blocknr, freed); + + journal->j_free += freed; + journal->j_tail_sequence = first_tid; + journal->j_tail = blocknr; + spin_unlock(&journal->j_state_lock); + if (!(journal->j_flags & JFS_ABORT)) + journal_update_superblock(journal, 1); + return 0; +} + + +/* Checkpoint list management */ + +/* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of bufers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + +/* + * journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of buffers reaped (for debug) + */ + +int __journal_clean_checkpoint_list(journal_t *journal) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + int ret = 0; + int released; + + transaction = journal->j_checkpoint_transactions; + if (!transaction) + goto out; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; + } while (transaction != last_transaction); +out: + return ret; +} + +/* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). + * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk. To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * lists until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint lists. 
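+ *
+ * A buffer sits on t_checkpoint_list until its writeback has been
+ * submitted and on t_checkpoint_io_list while that I/O is in flight;
+ * the containing transaction can only be dropped once both lists are
+ * empty.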
+ * + * The function returns 1 if it frees the transaction, 0 otherwise. + * + * This function is called with the journal locked. + * This function is called with j_list_lock held. + * This function is called with jbd_lock_bh_state(jh2bh(jh)) + */ + +int __journal_remove_checkpoint(struct journal_head *jh) +{ + transaction_t *transaction; + journal_t *journal; + int ret = 0; + + JBUFFER_TRACE(jh, "entry"); + + if ((transaction = jh->b_cp_transaction) == NULL) { + JBUFFER_TRACE(jh, "not on transaction"); + goto out; + } + journal = transaction->t_journal; + + __buffer_unlink(jh); + jh->b_cp_transaction = NULL; + + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) + goto out; + JBUFFER_TRACE(jh, "transaction has no more buffers"); + + /* + * There is one special case to worry about: if we have just pulled the + * buffer off a running or committing transaction's checkpoing list, + * then even if the checkpoint list is empty, the transaction obviously + * cannot be dropped! + * + * The locking here around t_state is a bit sleazy. + * See the comment at the end of journal_commit_transaction(). + */ + if (transaction->t_state != T_FINISHED) { + JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + goto out; + } + + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ + + __journal_drop_transaction(journal, transaction); + + /* Just in case anybody was waiting for more transactions to be + checkpointed... */ + wake_up(&journal->j_wait_logspace); + ret = 1; +out: + JBUFFER_TRACE(jh, "exit"); + return ret; +} + +/* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ +void __journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) +{ + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; +} + +/* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with j_list_lock held. 
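+ *
+ * The J_ASSERT()s below spell the invariant out: every per-transaction
+ * buffer list must already be empty and the transaction must be in
+ * T_FINISHED state before its memory is kfree()d.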
+ */ + +void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) +{ + assert_spin_locked(&journal->j_list_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT(transaction->t_state == T_FINISHED); + J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_sync_datalist == NULL); + J_ASSERT(transaction->t_forget == NULL); + J_ASSERT(transaction->t_iobuf_list == NULL); + J_ASSERT(transaction->t_shadow_list == NULL); + J_ASSERT(transaction->t_log_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); + J_ASSERT(transaction->t_updates == 0); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); + kfree(transaction); +} diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c new file mode 100644 index 00000000..72ffa974 --- /dev/null +++ b/fs/jbd/commit.c @@ -0,0 +1,953 @@ +/* + * linux/fs/jbd/commit.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Default IO end handler for temporary BJ_IO buffer_heads. + */ +static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + BUFFER_TRACE(bh, ""); + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); +} + +/* + * When an ext3-ordered file is truncated, it is possible that many pages are + * not successfully freed, because they are attached to a committing transaction. + * After the transaction commits, these pages are left on the LRU, with no + * ->mapping, and with attached buffers. These pages are trivially reclaimable + * by the VM, but their apparent absence upsets the VM accounting, and it makes + * the numbers in /proc/meminfo look odd. + * + * So here, we have a buffer which has just come off the forget list. Look to + * see if we can strip all buffers from the backing page. + * + * Called under journal->j_list_lock. The caller provided us with a ref + * against the buffer, and we drop that here. + */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page; + + if (buffer_dirty(bh)) + goto nope; + if (atomic_read(&bh->b_count) != 1) + goto nope; + page = bh->b_page; + if (!page) + goto nope; + if (page->mapping) + goto nope; + + /* OK, it's a truncated page */ + if (!trylock_page(page)) + goto nope; + + page_cache_get(page); + __brelse(bh); + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + return; + +nope: + __brelse(bh); +} + +/* + * Decrement reference counter for data buffer. If it has been marked + * 'BH_Freed', release it and the page to which it belongs if possible. 
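+ * (BH_Freed will have been set earlier, presumably when the data
+ * buffer was discarded, e.g. by truncate, while the committing
+ * transaction still referenced it; now that the commit is done with
+ * it we can also try to strip the page via release_buffer_page()
+ * above.)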
+ */ +static void release_data_buffer(struct buffer_head *bh) +{ + if (buffer_freed(bh)) { + clear_buffer_freed(bh); + release_buffer_page(bh); + } else + put_bh(bh); +} + +/* + * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is + * held. For ranking reasons we must trylock. If we lose, schedule away and + * return 0. j_list_lock is dropped in this case. + */ +static int inverted_lock(journal_t *journal, struct buffer_head *bh) +{ + if (!jbd_trylock_bh_state(bh)) { + spin_unlock(&journal->j_list_lock); + schedule(); + return 0; + } + return 1; +} + +/* Done it all: now write the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +static int journal_write_commit_record(journal_t *journal, + transaction_t *commit_transaction) +{ + struct journal_head *descriptor; + struct buffer_head *bh; + journal_header_t *header; + int ret; + + if (is_journal_aborted(journal)) + return 0; + + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) + return 1; + + bh = jh2bh(descriptor); + + header = (journal_header_t *)(bh->b_data); + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + JBUFFER_TRACE(descriptor, "write commit block"); + set_buffer_dirty(bh); + + if (journal->j_flags & JFS_BARRIER) + ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA); + else + ret = sync_dirty_buffer(bh); + + put_bh(bh); /* One for getblk() */ + journal_put_journal_head(descriptor); + + return (ret == -EIO); +} + +static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, + int write_op) +{ + int i; + + for (i = 0; i < bufs; i++) { + wbuf[i]->b_end_io = end_buffer_write_sync; + /* We use-up our safety reference in submit_bh() */ + submit_bh(write_op, wbuf[i]); + } +} + +/* + * Submit all the data buffers to disk + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction, + int write_op) +{ + struct journal_head *jh; + struct buffer_head *bh; + int locked; + int bufs = 0; + struct buffer_head **wbuf = journal->j_wbuf; + int err = 0; + + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_sync_datalist, so we have to keep looping back to + * write_out_data until we *know* that the list is empty. + * + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + */ +write_out_data: + cond_resched(); + spin_lock(&journal->j_list_lock); + + while (commit_transaction->t_sync_datalist) { + jh = commit_transaction->t_sync_datalist; + bh = jh2bh(jh); + locked = 0; + + /* Get reference just to make sure buffer does not disappear + * when we are forced to drop various locks */ + get_bh(bh); + /* If the buffer is dirty, we need to submit IO and hence + * we need the buffer lock. We try to lock the buffer without + * blocking. If we fail, we need to drop j_list_lock and do + * blocking lock_buffer(). 
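+ *
+ * Before blocking in lock_buffer() we first push out everything
+ * already collected in wbuf[] ("write out all data to prevent
+ * deadlocks" below); presumably whoever holds the buffer lock may be
+ * waiting for one of those queued buffers, so submitting them first
+ * keeps us from deadlocking against it.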
+ */ + if (buffer_dirty(bh)) { + if (!trylock_buffer(bh)) { + BUFFER_TRACE(bh, "needs blocking lock"); + spin_unlock(&journal->j_list_lock); + /* Write out all data to prevent deadlocks */ + journal_do_submit_data(wbuf, bufs, write_op); + bufs = 0; + lock_buffer(bh); + spin_lock(&journal->j_list_lock); + } + locked = 1; + } + /* We have to get bh_state lock. Again out of order, sigh. */ + if (!inverted_lock(journal, bh)) { + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + } + /* Someone already cleaned up the buffer? */ + if (!buffer_jbd(bh) || bh2jh(bh) != jh + || jh->b_transaction != commit_transaction + || jh->b_jlist != BJ_SyncData) { + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + BUFFER_TRACE(bh, "already cleaned up"); + release_data_buffer(bh); + continue; + } + if (locked && test_clear_buffer_dirty(bh)) { + BUFFER_TRACE(bh, "needs writeout, adding to array"); + wbuf[bufs++] = bh; + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + if (bufs == journal->j_wbufsize) { + spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs, write_op); + bufs = 0; + goto write_out_data; + } + } else if (!locked && buffer_locked(bh)) { + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + put_bh(bh); + } else { + BUFFER_TRACE(bh, "writeout complete: unfile"); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + journal_remove_journal_head(bh); + /* One for our safety reference, other for + * journal_remove_journal_head() */ + put_bh(bh); + release_data_buffer(bh); + } + + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto write_out_data; + } + } + spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs, write_op); + + return err; +} + +/* + * journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ +void journal_commit_transaction(journal_t *journal) +{ + transaction_t *commit_transaction; + struct journal_head *jh, *new_jh, *descriptor; + struct buffer_head **wbuf = journal->j_wbuf; + int bufs; + int flags; + int err; + unsigned int blocknr; + ktime_t start_time; + u64 commit_time; + char *tagp = NULL; + journal_header_t *header; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i; + struct blk_plug plug; + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. + */ + + /* Do we need to erase the effects of a prior journal_flush? 
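+ *
+ * (JFS_FLUSHED presumably marks that the on-disk superblock currently
+ * describes an empty log, e.g. after journal_flush(); rewriting it
+ * here before the commit starts keeps the superblock in step with the
+ * log we are about to refill.)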
*/ + if (journal->j_flags & JFS_FLUSHED) { + jbd_debug(3, "super block updated\n"); + journal_update_superblock(journal, 1); + } else { + jbd_debug(3, "superblock not updated\n"); + } + + J_ASSERT(journal->j_running_transaction != NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + + commit_transaction = journal->j_running_transaction; + J_ASSERT(commit_transaction->t_state == T_RUNNING); + + jbd_debug(1, "JBD: starting commit of transaction %d\n", + commit_transaction->t_tid); + + spin_lock(&journal->j_state_lock); + commit_transaction->t_state = T_LOCKED; + + spin_lock(&commit_transaction->t_handle_lock); + while (commit_transaction->t_updates) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (commit_transaction->t_updates) { + spin_unlock(&commit_transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + schedule(); + spin_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); + + J_ASSERT (commit_transaction->t_outstanding_credits <= + journal->j_max_transaction_buffers); + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple journal_get_write_access() calls to the same + * buffer are perfectly permissible. + */ + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + /* + * A journal_get_undo_access()+journal_release_buffer() may + * leave undo-committed data. + */ + if (jh->b_committed_data) { + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + jbd_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + jbd_unlock_bh_state(bh); + } + journal_refile_buffer(journal, jh); + } + + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal->j_list_lock); + __journal_clean_checkpoint_list(journal); + spin_unlock(&journal->j_list_lock); + + jbd_debug (3, "JBD: commit phase 1\n"); + + /* + * Switch to a new revoke table. + */ + journal_switch_revoke_table(journal); + + commit_transaction->t_state = T_FLUSH; + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + start_time = ktime_get(); + commit_transaction->t_log_start = journal->j_head; + wake_up(&journal->j_wait_transaction_locked); + spin_unlock(&journal->j_state_lock); + + jbd_debug (3, "JBD: commit phase 2\n"); + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. 
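+ * Writing the data blocks out before any of this transaction's
+ * metadata reaches the journal is what gives ordered mode its
+ * guarantee: after a crash, recovered metadata never points at data
+ * blocks whose contents were never written.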
+ */ + blk_start_plug(&plug); + err = journal_submit_data_buffers(journal, commit_transaction, + WRITE_SYNC); + blk_finish_plug(&plug); + + /* + * Wait for all previously submitted IO to complete. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + + jh = commit_transaction->t_locked_list->b_tprev; + bh = jh2bh(jh); + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + spin_lock(&journal->j_list_lock); + } + if (unlikely(!buffer_uptodate(bh))) { + if (!trylock_page(bh->b_page)) { + spin_unlock(&journal->j_list_lock); + lock_page(bh->b_page); + spin_lock(&journal->j_list_lock); + } + if (bh->b_page->mapping) + set_bit(AS_EIO, &bh->b_page->mapping->flags); + + unlock_page(bh->b_page); + SetPageError(bh->b_page); + err = -EIO; + } + if (!inverted_lock(journal, bh)) { + put_bh(bh); + spin_lock(&journal->j_list_lock); + continue; + } + if (buffer_jbd(bh) && bh2jh(bh) == jh && + jh->b_transaction == commit_transaction && + jh->b_jlist == BJ_Locked) { + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + put_bh(bh); + } else { + jbd_unlock_bh_state(bh); + } + release_data_buffer(bh); + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (err) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Detected IO errors while flushing file data " + "on %s\n", bdevname(journal->j_fs_dev, b)); + if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) + journal_abort(journal, err); + err = 0; + } + + blk_start_plug(&plug); + + journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC); + + /* + * If we found any dirty or locked buffers, then we should have + * looped back up to the write_out_data label. If there weren't + * any then journal_clean_data_list should have wiped the list + * clean by now, so check that it is in fact empty. + */ + J_ASSERT (commit_transaction->t_sync_datalist == NULL); + + jbd_debug (3, "JBD: commit phase 3\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + spin_lock(&journal->j_state_lock); + commit_transaction->t_state = T_COMMIT; + spin_unlock(&journal->j_state_lock); + + J_ASSERT(commit_transaction->t_nr_buffers <= + commit_transaction->t_outstanding_credits); + + descriptor = NULL; + bufs = 0; + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... */ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it. */ + + if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); + JBUFFER_TRACE(jh, "journal is aborting: refile"); + journal_refile_buffer(journal, jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. 
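+ *
+ * A descriptor block starts with a journal_header_t and is then
+ * filled with journal_block_tag_t entries, one per following metadata
+ * block, recording the real on-disk block number each journalled copy
+ * belongs to (plus flags such as "escaped" and "same UUID").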
*/ + + if (!descriptor) { + struct buffer_head *bh; + + J_ASSERT (bufs == 0); + + jbd_debug(4, "JBD: get descriptor\n"); + + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) { + journal_abort(journal, -EIO); + continue; + } + + bh = jh2bh(descriptor); + jbd_debug(4, "JBD: got buffer %llu (%p)\n", + (unsigned long long)bh->b_blocknr, bh->b_data); + header = (journal_header_t *)&bh->b_data[0]; + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + space_left = bh->b_size - sizeof(journal_header_t); + first_tag = 1; + set_buffer_jwrite(bh); + set_buffer_dirty(bh); + wbuf[bufs++] = bh; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(bh, "ph3: file as descriptor"); + journal_file_buffer(descriptor, commit_transaction, + BJ_LogCtl); + } + + /* Where is the buffer to be written? */ + + err = journal_next_log_block(journal, &blocknr); + /* If the block mapping failed, just abandon the buffer + and repeat this loop: we'll fall into the + refile-on-abort condition above. */ + if (err) { + journal_abort(journal, err); + continue; + } + + /* + * start_this_handle() uses t_outstanding_credits to determine + * the free space in the log, but this counter is changed + * by journal_next_log_block() also. + */ + commit_transaction->t_outstanding_credits--; + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the BJ_IO/BJ_Shadow pairing of buffers. */ + get_bh(jh2bh(jh)); + + /* Make a temporary IO buffer with which to write it out + (this will requeue both the metadata buffer and the + temporary IO buffer). new_bh goes on BJ_IO*/ + + set_buffer_jwrite(jh2bh(jh)); + /* + * akpm: journal_write_metadata_buffer() sets + * new_bh->b_transaction to commit_transaction. + * We need to clean this up before we release new_bh + * (which is of type BJ_IO) + */ + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = journal_write_metadata_buffer(commit_transaction, + jh, &new_jh, blocknr); + set_buffer_jwrite(jh2bh(new_jh)); + wbuf[bufs++] = jh2bh(new_jh); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (flags & 1) + tag_flag |= JFS_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JFS_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); + tag->t_flags = cpu_to_be32(tag_flag); + tagp += sizeof(journal_block_tag_t); + space_left -= sizeof(journal_block_tag_t); + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == journal->j_wbufsize || + commit_transaction->t_buffers == NULL || + space_left < sizeof(journal_block_tag_t) + 16) { + + jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); + +start_journal_io: + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(WRITE_SYNC, bh); + } + cond_resched(); + + /* Force a new descriptor to be generated next + time round the loop. 
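+ *
+ * (A batch is pushed out whenever wbuf[] is full, the descriptor has
+ * no room for another tag, or we have run out of metadata buffers; in
+ * every case the last tag is stamped JFS_FLAG_LAST_TAG before the
+ * buffers are submitted.)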
*/ + descriptor = NULL; + bufs = 0; + } + } + + blk_finish_plug(&plug); + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + + Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd_debug(3, "JBD: commit phase 4\n"); + + /* + * akpm: these are BJ_IO, and j_list_lock is not needed. + * See __journal_try_to_free_buffer. + */ +wait_for_iobuf: + while (commit_transaction->t_iobuf_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_iobuf_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_iobuf; + } + if (cond_resched()) + goto wait_for_iobuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + clear_buffer_jwrite(bh); + + JBUFFER_TRACE(jh, "ph4: unfile after journal write"); + journal_unfile_buffer(journal, jh); + + /* + * ->t_iobuf_list should contain only dummy buffer_heads + * which were created by journal_write_metadata_buffer(). + */ + BUFFER_TRACE(bh, "dumping temporary bh"); + journal_put_journal_head(jh); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + free_buffer_head(bh); + + /* We also have to unlock and free the corresponding + shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_buffer_jwrite(bh); + J_ASSERT_BH(bh, buffer_jbddirty(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + journal_file_buffer(jh, commit_transaction, BJ_Forget); + /* + * Wake up any transactions which were waiting for this + * IO to complete. The barrier must be here so that changes + * by journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); + wake_up_bit(&bh->b_state, BH_Unshadow); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd_debug(3, "JBD: commit phase 5\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + wait_for_ctlbuf: + while (commit_transaction->t_log_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_log_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_ctlbuf; + } + if (cond_resched()) + goto wait_for_ctlbuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_buffer_jwrite(bh); + journal_unfile_buffer(journal, jh); + journal_put_journal_head(jh); + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + if (err) + journal_abort(journal, err); + + jbd_debug(3, "JBD: commit phase 6\n"); + + /* All metadata is written, now write commit record and do cleanup */ + spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_RECORD; + spin_unlock(&journal->j_state_lock); + + if (journal_write_commit_record(journal, commit_transaction)) + err = -EIO; + + if (err) + journal_abort(journal, err); + + /* End of a transaction! 
Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_iobuf_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + J_ASSERT(commit_transaction->t_log_list == NULL); + +restart_loop: + /* + * As there are other places (journal_unmap_buffer()) adding buffers + * to this list we have to be careful and hold the j_list_lock. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + + jh = commit_transaction->t_forget; + spin_unlock(&journal->j_list_lock); + bh = jh2bh(jh); + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || + jh->b_transaction == journal->j_running_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. + */ + if (jh->b_committed_data) { + jbd_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + } + } else if (jh->b_frozen_data) { + jbd_free(jh->b_frozen_data, bh->b_size); + jh->b_frozen_data = NULL; + } + + spin_lock(&journal->j_list_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + __journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + + /* A buffer which has been freed while still being + * journaled by a previous transaction may end up still + * being dirty here, but we want to avoid writing back + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another + * use in a different page. */ + if (buffer_freed(bh) && !jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + } + + if (buffer_jbddirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); + JBUFFER_TRACE(jh, "refile for checkpoint writeback"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + /* The buffer on BJ_Forget list and not jbddirty means + * it has been freed by this transaction and hence it + * could not have been reallocated until this + * transaction has committed. *BUT* it could be + * reallocated once we have written all the data to + * disk and before we process the buffer on BJ_Forget + * list. 
*/ + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + if (!jh->b_transaction) { + jbd_unlock_bh_state(bh); + /* needs a brelse */ + journal_remove_journal_head(bh); + release_buffer_page(bh); + } else + jbd_unlock_bh_state(bh); + } + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + /* + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... + */ + spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + /* + * Now recheck if some buffers did not get attached to the transaction + * while the lock was dropped... + */ + if (commit_transaction->t_forget) { + spin_unlock(&journal->j_list_lock); + spin_unlock(&journal->j_state_lock); + goto restart_loop; + } + + /* Done with this transaction! */ + + jbd_debug(3, "JBD: commit phase 8\n"); + + J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); + + commit_transaction->t_state = T_FINISHED; + J_ASSERT(commit_transaction == journal->j_committing_transaction); + journal->j_commit_sequence = commit_transaction->t_tid; + journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time*3 + + journal->j_average_commit_time) / 4; + else + journal->j_average_commit_time = commit_time; + + spin_unlock(&journal->j_state_lock); + + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __journal_drop_transaction(journal, commit_transaction); + } else { + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + } + spin_unlock(&journal->j_list_lock); + + jbd_debug(1, "JBD: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + + wake_up(&journal->j_wait_done_commit); +} diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c new file mode 100644 index 00000000..9f36384e --- /dev/null +++ b/fs/jbd/journal.c @@ -0,0 +1,2081 @@ +/* + * linux/fs/jbd/journal.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. + * + * This file manages journals: areas of disk reserved for logging + * transactional updates. This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. 
+ * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +EXPORT_SYMBOL(journal_start); +EXPORT_SYMBOL(journal_restart); +EXPORT_SYMBOL(journal_extend); +EXPORT_SYMBOL(journal_stop); +EXPORT_SYMBOL(journal_lock_updates); +EXPORT_SYMBOL(journal_unlock_updates); +EXPORT_SYMBOL(journal_get_write_access); +EXPORT_SYMBOL(journal_get_create_access); +EXPORT_SYMBOL(journal_get_undo_access); +EXPORT_SYMBOL(journal_dirty_data); +EXPORT_SYMBOL(journal_dirty_metadata); +EXPORT_SYMBOL(journal_release_buffer); +EXPORT_SYMBOL(journal_forget); +#if 0 +EXPORT_SYMBOL(journal_sync_buffer); +#endif +EXPORT_SYMBOL(journal_flush); +EXPORT_SYMBOL(journal_revoke); + +EXPORT_SYMBOL(journal_init_dev); +EXPORT_SYMBOL(journal_init_inode); +EXPORT_SYMBOL(journal_update_format); +EXPORT_SYMBOL(journal_check_used_features); +EXPORT_SYMBOL(journal_check_available_features); +EXPORT_SYMBOL(journal_set_features); +EXPORT_SYMBOL(journal_create); +EXPORT_SYMBOL(journal_load); +EXPORT_SYMBOL(journal_destroy); +EXPORT_SYMBOL(journal_abort); +EXPORT_SYMBOL(journal_errno); +EXPORT_SYMBOL(journal_ack_err); +EXPORT_SYMBOL(journal_clear_err); +EXPORT_SYMBOL(log_wait_commit); +EXPORT_SYMBOL(log_start_commit); +EXPORT_SYMBOL(journal_start_commit); +EXPORT_SYMBOL(journal_force_commit_nested); +EXPORT_SYMBOL(journal_wipe); +EXPORT_SYMBOL(journal_blocks_per_page); +EXPORT_SYMBOL(journal_invalidatepage); +EXPORT_SYMBOL(journal_try_to_free_buffers); +EXPORT_SYMBOL(journal_force_commit); + +static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); +static void __journal_abort_soft (journal_t *journal, int errno); +static const char *journal_dev_name(journal_t *journal, char *buffer); + +/* + * Helper function used to manage commit timeouts + */ + +static void commit_timeout(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + + wake_up_process(p); +} + +/* + * kjournald: The main thread function used to manage a logging device + * journal. + * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. + */ + +static int kjournald(void *arg) +{ + journal_t *journal = arg; + transaction_t *transaction; + + /* + * Set up an interval timer which can be used to trigger a commit wakeup + * after the commit interval expires + */ + setup_timer(&journal->j_commit_timer, commit_timeout, + (unsigned long)current); + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", + journal->j_commit_interval / HZ); + + /* + * And now, wait forever for commit wakeup events. 
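+ *
+ * The loop below sleeps on j_wait_commit and wakes up either because
+ * somebody bumped j_commit_request past j_commit_sequence (an
+ * explicit commit request), because the running transaction's
+ * t_expires timeout has passed, or because the journal is being
+ * unmounted (JFS_UNMOUNT).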
+ */ + spin_lock(&journal->j_state_lock); + +loop: + if (journal->j_flags & JFS_UNMOUNT) + goto end_loop; + + jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd_debug(1, "OK, requests differ\n"); + spin_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal_commit_transaction(journal); + spin_lock(&journal->j_state_lock); + goto loop; + } + + wake_up(&journal->j_wait_done_commit); + if (freezing(current)) { + /* + * The simpler the better. Flushing journal isn't a + * good idea, because that depends on threads that may + * be already stopped. + */ + jbd_debug(1, "Now suspending kjournald\n"); + spin_unlock(&journal->j_state_lock); + refrigerator(); + spin_lock(&journal->j_state_lock); + } else { + /* + * We assume on resume that commits are already there, + * so we don't sleep + */ + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&journal->j_wait_commit, &wait, + TASK_INTERRUPTIBLE); + if (journal->j_commit_sequence != journal->j_commit_request) + should_sleep = 0; + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, + transaction->t_expires)) + should_sleep = 0; + if (journal->j_flags & JFS_UNMOUNT) + should_sleep = 0; + if (should_sleep) { + spin_unlock(&journal->j_state_lock); + schedule(); + spin_lock(&journal->j_state_lock); + } + finish_wait(&journal->j_wait_commit, &wait); + } + + jbd_debug(1, "kjournald wakes\n"); + + /* + * Were we woken up by a commit wakeup event? + */ + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd_debug(1, "woke because of timeout\n"); + } + goto loop; + +end_loop: + spin_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd_debug(1, "Journal thread exiting.\n"); + return 0; +} + +static int journal_start_thread(journal_t *journal) +{ + struct task_struct *t; + + t = kthread_run(kjournald, journal, "kjournald"); + if (IS_ERR(t)) + return PTR_ERR(t); + + wait_event(journal->j_wait_done_commit, journal->j_task != NULL); + return 0; +} + +static void journal_kill_thread(journal_t *journal) +{ + spin_lock(&journal->j_state_lock); + journal->j_flags |= JFS_UNMOUNT; + + while (journal->j_task) { + wake_up(&journal->j_wait_commit); + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, + journal->j_task == NULL); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); +} + +/* + * journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descripter blocks. In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. 
The missing word can then be restored when the block is read + * during recovery. + * + * If the source buffer has already been modified by a new transaction + * since we took the last commit snapshot, we use the frozen copy of + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we *have* to lock the buffer to prevent anyone + * else from using and possibly modifying it while the IO is in + * progress. + * + * The function returns a pointer to the buffer_heads to be used for IO. + * + * We assume that the journal has already been locked in this function. + * + * Return value: + * <0: Error + * >=0: Finished OK + * + * On success: + * Bit 0 set == escape performed on the data + * Bit 1 set == buffer copy-out performed (kfree the data after IO) + */ + +int journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct journal_head **jh_out, + unsigned int blocknr) +{ + int need_copy_out = 0; + int done_copy_out = 0; + int do_escape = 0; + char *mapped_data; + struct buffer_head *new_bh; + struct journal_head *new_jh; + struct page *new_page; + unsigned int new_offset; + struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; + + /* + * The buffer really shouldn't be locked: only the current committing + * transaction is allowed to write it, so nobody else is allowed + * to do any IO. + * + * akpm: except if we're journalling data, and write() output is + * also part of a shared mapping, and another thread has + * decided to launch a writepage() against this buffer. + */ + J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); + + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ + + /* + * If a new transaction has already done a buffer copy-out, then + * we use that version of the data for the commit. + */ + jbd_lock_bh_state(bh_in); +repeat: + if (jh_in->b_frozen_data) { + done_copy_out = 1; + new_page = virt_to_page(jh_in->b_frozen_data); + new_offset = offset_in_page(jh_in->b_frozen_data); + } else { + new_page = jh2bh(jh_in)->b_page; + new_offset = offset_in_page(jh2bh(jh_in)->b_data); + } + + mapped_data = kmap_atomic(new_page, KM_USER0); + /* + * Check for escaping + */ + if (*((__be32 *)(mapped_data + new_offset)) == + cpu_to_be32(JFS_MAGIC_NUMBER)) { + need_copy_out = 1; + do_escape = 1; + } + kunmap_atomic(mapped_data, KM_USER0); + + /* + * Do we need to do a data copy? + */ + if (need_copy_out && !done_copy_out) { + char *tmp; + + jbd_unlock_bh_state(bh_in); + tmp = jbd_alloc(bh_in->b_size, GFP_NOFS); + jbd_lock_bh_state(bh_in); + if (jh_in->b_frozen_data) { + jbd_free(tmp, bh_in->b_size); + goto repeat; + } + + jh_in->b_frozen_data = tmp; + mapped_data = kmap_atomic(new_page, KM_USER0); + memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); + kunmap_atomic(mapped_data, KM_USER0); + + new_page = virt_to_page(tmp); + new_offset = offset_in_page(tmp); + done_copy_out = 1; + } + + /* + * Did we need to do an escaping? Now we've done all the + * copying, we can finally do so. 
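+ * "Escaping" just means zeroing the first 32-bit word of the copy that
+ * goes into the log, so a data block that happens to begin with
+ * JFS_MAGIC_NUMBER cannot be mistaken for a descriptor block; the
+ * JFS_FLAG_ESCAPE tag bit tells recovery to put the magic number back.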
+ */ + if (do_escape) { + mapped_data = kmap_atomic(new_page, KM_USER0); + *((unsigned int *)(mapped_data + new_offset)) = 0; + kunmap_atomic(mapped_data, KM_USER0); + } + + set_bh_page(new_bh, new_page, new_offset); + new_jh->b_transaction = NULL; + new_bh->b_size = jh2bh(jh_in)->b_size; + new_bh->b_bdev = transaction->t_journal->j_dev; + new_bh->b_blocknr = blocknr; + set_buffer_mapped(new_bh); + set_buffer_dirty(new_bh); + + *jh_out = new_jh; + + /* + * The to-be-written buffer needs to get moved to the io queue, + * and the original buffer whose contents we are shadowing or + * copying is moved to the transaction's shadow queue. + */ + JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + + JBUFFER_TRACE(new_jh, "file as BJ_IO"); + journal_file_buffer(new_jh, transaction, BJ_IO); + + return do_escape | (done_copy_out << 1); +} + +/* + * Allocation code for the journal file. Manage the space left in the + * journal, so that we can begin checkpointing when appropriate. + */ + +/* + * __log_space_left: Return the number of free blocks left in the journal. + * + * Called with the journal already locked. + * + * Called under j_state_lock + */ + +int __log_space_left(journal_t *journal) +{ + int left = journal->j_free; + + assert_spin_locked(&journal->j_state_lock); + + /* + * Be pessimistic here about the number of those free blocks which + * might be required for log descriptor control blocks. + */ + +#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ + + left -= MIN_LOG_RESERVED_BLOCKS; + + if (left <= 0) + return 0; + left -= (left >> 3); + return left; +} + +/* + * Called under j_state_lock. Returns true if a transaction commit was started. + */ +int __log_start_commit(journal_t *journal, tid_t target) +{ + /* + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. + */ + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { + /* + * We want a new commit: OK, mark the request and wakeup the + * commit thread. We do _not_ do the commit ourselves. + */ + + journal->j_commit_request = target; + jbd_debug(1, "JBD: requesting commit %d/%d\n", + journal->j_commit_request, + journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + return 1; + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + return 0; +} + +int log_start_commit(journal_t *journal, tid_t tid) +{ + int ret; + + spin_lock(&journal->j_state_lock); + ret = __log_start_commit(journal, tid); + spin_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Force and wait upon a commit if the calling process is not within + * transaction. This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + * + * We can only force the running transaction if we don't have an active handle; + * otherwise, we will deadlock. + * + * Returns true if a transaction was started. 
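+ * ("Started" is read loosely: if a commit is already in progress we
+ * just wait for it and still return 1; we return 0 only when there is
+ * nothing we can safely force, either because no transaction exists
+ * or because the running one cannot be committed from under our own
+ * active handle.)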
+ */ +int journal_force_commit_nested(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction && !current->journal_info) { + transaction = journal->j_running_transaction; + __log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return 0; /* Nothing to retry */ + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + return 1; +} + +/* + * Start a commit of the current running transaction (if any). Returns true + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid + */ +int journal_start_commit(journal_t *journal, tid_t *ptid) +{ + int ret = 0; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) { + tid_t tid = journal->j_running_transaction->t_tid; + + __log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) + *ptid = tid; + ret = 1; + } else if (journal->j_committing_transaction) { + /* + * If ext3_write_super() recently started a commit, then we + * have to wait for completion of that transaction + */ + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; + ret = 1; + } + spin_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Wait for a specified commit to complete. + * The caller may not hold the journal lock. + */ +int log_wait_commit(journal_t *journal, tid_t tid) +{ + int err = 0; + +#ifdef CONFIG_JBD_DEBUG + spin_lock(&journal->j_state_lock); + if (!tid_geq(journal->j_commit_request, tid)) { + printk(KERN_EMERG + "%s: error: j_commit_request=%d, tid=%d\n", + __func__, journal->j_commit_request, tid); + } + spin_unlock(&journal->j_state_lock); +#endif + spin_lock(&journal->j_state_lock); + while (tid_gt(tid, journal->j_commit_sequence)) { + jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + tid, journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, + !tid_gt(tid, journal->j_commit_sequence)); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); + + if (unlikely(is_journal_aborted(journal))) { + printk(KERN_EMERG "journal commit I/O error\n"); + err = -EIO; + } + return err; +} + +/* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JFS_BARRIER)) + return 0; + spin_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + /* + * Transaction is being committed and we already proceeded to + * writing commit record? 
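+ * If it has reached T_COMMIT_RECORD the flush/FUA that goes with the
+ * commit block may already be on its way, so we can no longer promise
+ * the caller that a barrier is still to come (return 0); otherwise
+ * the commit, and its barrier, are still ahead of us (return 1).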
+ */ + commit_trans = journal->j_committing_transaction; + if (commit_trans && commit_trans->t_tid == tid && + commit_trans->t_state >= T_COMMIT_RECORD) + goto out; + ret = 1; +out: + spin_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(journal_trans_will_send_data_barrier); + +/* + * Log buffer allocation routines: + */ + +int journal_next_log_block(journal_t *journal, unsigned int *retp) +{ + unsigned int blocknr; + + spin_lock(&journal->j_state_lock); + J_ASSERT(journal->j_free > 1); + + blocknr = journal->j_head; + journal->j_head++; + journal->j_free--; + if (journal->j_head == journal->j_last) + journal->j_head = journal->j_first; + spin_unlock(&journal->j_state_lock); + return journal_bmap(journal, blocknr, retp); +} + +/* + * Conversion of logical to physical block numbers for the journal + * + * On external journals the journal blocks are identity-mapped, so + * this is a no-op. If needed, we can use j_blk_offset - everything is + * ready. + */ +int journal_bmap(journal_t *journal, unsigned int blocknr, + unsigned int *retp) +{ + int err = 0; + unsigned int ret; + + if (journal->j_inode) { + ret = bmap(journal->j_inode, blocknr); + if (ret) + *retp = ret; + else { + char b[BDEVNAME_SIZE]; + + printk(KERN_ALERT "%s: journal block not found " + "at offset %u on %s\n", + __func__, + blocknr, + bdevname(journal->j_dev, b)); + err = -EIO; + __journal_abort_soft(journal, err); + } + } else { + *retp = blocknr; /* +journal->j_blk_offset */ + } + return err; +} + +/* + * We play buffer_head aliasing tricks to write data/metadata blocks to + * the journal without copying their contents, but for journal + * descriptor blocks we do need to generate bona fide buffers. + * + * After the caller of journal_get_descriptor_buffer() has finished modifying + * the buffer's contents they really should run flush_dcache_page(bh->b_page). + * But we don't bother doing that, so there will be coherency problems with + * mmaps of blockdevs which hold live JBD-controlled filesystems. + */ +struct journal_head *journal_get_descriptor_buffer(journal_t *journal) +{ + struct buffer_head *bh; + unsigned int blocknr; + int err; + + err = journal_next_log_block(journal, &blocknr); + + if (err) + return NULL; + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; + lock_buffer(bh); + memset(bh->b_data, 0, journal->j_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "return this buffer"); + return journal_add_journal_head(bh); +} + +/* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing + * journal blocks from disk. */ + +/* First: create and setup a journal_t object in memory. We initialise + * very few fields yet: that has to wait until we have created the + * journal structures from from scratch, or loaded them from disk. 
*/ + +static journal_t * journal_init_common (void) +{ + journal_t *journal; + int err; + + journal = kzalloc(sizeof(*journal), GFP_KERNEL); + if (!journal) + goto fail; + + init_waitqueue_head(&journal->j_wait_transaction_locked); + init_waitqueue_head(&journal->j_wait_logspace); + init_waitqueue_head(&journal->j_wait_done_commit); + init_waitqueue_head(&journal->j_wait_checkpoint); + init_waitqueue_head(&journal->j_wait_commit); + init_waitqueue_head(&journal->j_wait_updates); + mutex_init(&journal->j_barrier); + mutex_init(&journal->j_checkpoint_mutex); + spin_lock_init(&journal->j_revoke_lock); + spin_lock_init(&journal->j_list_lock); + spin_lock_init(&journal->j_state_lock); + + journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); + + /* The journal is marked for error until we succeed with recovery! */ + journal->j_flags = JFS_ABORT; + + /* Set up a default-sized revoke table for the new mount. */ + err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + if (err) { + kfree(journal); + goto fail; + } + return journal; +fail: + return NULL; +} + +/* journal_init_dev and journal_init_inode: + * + * Create a journal structure assigned some fixed set of disk blocks to + * the journal. We don't actually touch those disk blocks yet, but we + * need to set up all of the mapping information to tell the journaling + * system where the journal blocks are. + * + */ + +/** + * journal_t * journal_init_dev() - creates and initialises a journal structure + * @bdev: Block device on which to create the journal + * @fs_dev: Device which hold journalled filesystem for this journal. + * @start: Block nr Start of journal. + * @len: Length of the journal in blocks. + * @blocksize: blocksize of journalling device + * + * Returns: a newly created journal_t * + * + * journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. + * + */ +journal_t * journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, + int start, int len, int blocksize) +{ + journal_t *journal = journal_init_common(); + struct buffer_head *bh; + int n; + + if (!journal) + return NULL; + + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + + bh = __getblk(journal->j_dev, start, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + kfree(journal); + return NULL; +} + +/** + * journal_t * journal_init_inode () - creates a journal which maps to a inode. + * @inode: An inode to create the journal in + * + * journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. 
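+ *
+ * A typical lifecycle, sketched for illustration only (error handling
+ * omitted, and the exact sequence depends on the filesystem):
+ *
+ *	journal = journal_init_inode(journal_inode);
+ *	journal_load(journal);                  replay and reset the log
+ *	handle = journal_start(journal, nblocks);
+ *	journal_get_write_access(handle, bh);
+ *	... modify the buffer ...
+ *	journal_dirty_metadata(handle, bh);
+ *	journal_stop(handle);
+ *	...
+ *	journal_destroy(journal);               on unmount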
+ */ +journal_t * journal_init_inode (struct inode *inode) +{ + struct buffer_head *bh; + journal_t *journal = journal_init_common(); + int err; + int n; + unsigned int blocknr; + + if (!journal) + return NULL; + + journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; + journal->j_inode = inode; + jbd_debug(1, + "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", + journal, inode->i_sb->s_id, inode->i_ino, + (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; + journal->j_blocksize = inode->i_sb->s_blocksize; + + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + + err = journal_bmap(journal, 0, &blocknr); + /* If that failed, give up */ + if (err) { + printk(KERN_ERR "%s: Cannot locate journal superblock\n", + __func__); + goto out_err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + kfree(journal); + return NULL; +} + +/* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock (journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Given a journal_t structure, initialise the various fields for + * startup of a new journaling session. We use this both when creating + * a journal, and after recovering an old journal to reset it for + * subsequent use. + */ + +static int journal_reset(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned int first, last; + + first = be32_to_cpu(sb->s_first); + last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } + + journal->j_first = first; + journal->j_last = last; + + journal->j_head = first; + journal->j_tail = first; + journal->j_free = last - first; + + journal->j_tail_sequence = journal->j_transaction_sequence; + journal->j_commit_sequence = journal->j_transaction_sequence - 1; + journal->j_commit_request = journal->j_commit_sequence; + + journal->j_max_transaction_buffers = journal->j_maxlen / 4; + + /* Add the dynamic fields and write it to disk. */ + journal_update_superblock(journal, 1); + return journal_start_thread(journal); +} + +/** + * int journal_create() - Initialise the new journal file + * @journal: Journal to create. This structure must have been initialised + * + * Given a journal_t structure which tells us which disk blocks we can + * use, create a new journal superblock and initialise all of the + * journal fields from scratch. 
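journal_reset(), a little further down, derives the in-memory log geometry from the superblock's s_first and s_maxlen. A small sketch of those derivations under an assumed geometry; MIN_JOURNAL_BLOCKS here is only a stand-in for JFS_MIN_JOURNAL_BLOCKS and the sizes are made up for illustration.

#include <stdio.h>

#define MIN_JOURNAL_BLOCKS 1024   /* assumed stand-in for JFS_MIN_JOURNAL_BLOCKS */

int main(void)
{
    /* Assumed on-disk values: s_first and s_maxlen as journal_reset()
     * would read them from the superblock. */
    unsigned int first = 1, maxlen = 8192;
    unsigned int last = maxlen;

    if (first + MIN_JOURNAL_BLOCKS > last + 1) {
        printf("journal too short\n");
        return 1;
    }

    /* Same derived fields journal_reset() fills in. */
    unsigned int free_blocks = last - first;          /* usable log space    */
    unsigned int max_txn_buffers = maxlen / 4;        /* per-transaction cap */

    printf("log range [%u, %u), %u free blocks, "
           "%u buffers max per transaction\n",
           first, last, free_blocks, max_txn_buffers);
    return 0;
}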
+ **/ +int journal_create(journal_t *journal) +{ + unsigned int blocknr; + struct buffer_head *bh; + journal_superblock_t *sb; + int i, err; + + if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { + printk (KERN_ERR "Journal length (%d blocks) too short.\n", + journal->j_maxlen); + journal_fail_superblock(journal); + return -EINVAL; + } + + if (journal->j_inode == NULL) { + /* + * We don't know what block to start at! + */ + printk(KERN_EMERG + "%s: creation of journal on external device!\n", + __func__); + BUG(); + } + + /* Zero out the entire journal on disk. We cannot afford to + have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ + jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); + for (i = 0; i < journal->j_maxlen; i++) { + err = journal_bmap(journal, i, &blocknr); + if (err) + return err; + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (unlikely(!bh)) + return -ENOMEM; + lock_buffer(bh); + memset (bh->b_data, 0, journal->j_blocksize); + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + __brelse(bh); + } + + sync_blockdev(journal->j_dev); + jbd_debug(1, "JBD: journal cleared.\n"); + + /* OK, fill in the initial static fields in the new superblock */ + sb = journal->j_superblock; + + sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + + sb->s_blocksize = cpu_to_be32(journal->j_blocksize); + sb->s_maxlen = cpu_to_be32(journal->j_maxlen); + sb->s_first = cpu_to_be32(1); + + journal->j_transaction_sequence = 1; + + journal->j_flags &= ~JFS_ABORT; + journal->j_format_version = 2; + + return journal_reset(journal); +} + +/** + * void journal_update_superblock() - Update journal sb on disk. + * @journal: The journal to update. + * @wait: Set to '0' if you don't want to wait for IO completion. + * + * Update a journal's dynamic superblock fields and write it to disk, + * optionally waiting for the IO to complete. + */ +void journal_update_superblock(journal_t *journal, int wait) +{ + journal_superblock_t *sb = journal->j_superblock; + struct buffer_head *bh = journal->j_sb_buffer; + + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0) and there are no outstanding transactions + * in the filesystem, then we can safely defer the superblock update + * until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0 && journal->j_tail_sequence == + journal->j_transaction_sequence) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + goto out; + } + + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. 
+ */ + printk(KERN_ERR "JBD: previous I/O error detected " + "for journal superblock update for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + spin_lock(&journal->j_state_lock); + jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, journal->j_errno); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(journal->j_tail); + sb->s_errno = cpu_to_be32(journal->j_errno); + spin_unlock(&journal->j_state_lock); + + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: I/O error detected " + "when updating journal superblock for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + } else + write_dirty_buffer(bh, WRITE); + +out: + /* If we have just flushed the log (by marking s_start==0), then + * any future commit will have to be careful to update the + * superblock again to re-record the true start of the log. */ + + spin_lock(&journal->j_state_lock); + if (sb->s_start) + journal->j_flags &= ~JFS_FLUSHED; + else + journal->j_flags |= JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/* + * Read the superblock for a given journal, performing initial + * validation of the format. + */ + +static int journal_get_superblock(journal_t *journal) +{ + struct buffer_head *bh; + journal_superblock_t *sb; + int err = -EIO; + + bh = journal->j_sb_buffer; + + J_ASSERT(bh != NULL); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + printk (KERN_ERR + "JBD: IO error reading journal superblock\n"); + goto out; + } + } + + sb = journal->j_superblock; + + err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + goto out; + } + + switch(be32_to_cpu(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V1: + journal->j_format_version = 1; + break; + case JFS_SUPERBLOCK_V2: + journal->j_format_version = 2; + break; + default: + printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + goto out; + } + + if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) + journal->j_maxlen = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + printk (KERN_WARNING "JBD: journal file too short\n"); + goto out; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + + return 0; + +out: + journal_fail_superblock(journal); + return err; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ + +static int load_superblock(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_last = be32_to_cpu(sb->s_maxlen); + journal->j_errno = be32_to_cpu(sb->s_errno); + + return 0; +} + + +/** + * int journal_load() - Read journal from disk. + * @journal: Journal to act on. 
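journal_get_superblock() trusts nothing until the magic, blocksize, length and start block have been checked, all read through be32_to_cpu() because the superblock is big-endian on disk. A hedged userspace sketch of those checks follows; it uses htonl()/ntohl() as stand-ins for cpu_to_be32()/be32_to_cpu(), and the V2 blocktype code is an assumption for the sketch rather than something taken from this file.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>   /* htonl/ntohl as stand-ins for cpu_to_be32/be32_to_cpu */

#define TOY_MAGIC 0xc03b3998u   /* the JBD on-disk magic number */

/* A cut-down view of the superblock fields the validation path looks at. */
struct toy_journal_sb {
    uint32_t h_magic;        /* big-endian on disk */
    uint32_t h_blocktype;    /* V1 or V2 superblock */
    uint32_t s_blocksize;
    uint32_t s_maxlen;
    uint32_t s_first;
};

static int validate_sb(const struct toy_journal_sb *sb,
                       unsigned int expected_blocksize)
{
    if (ntohl(sb->h_magic) != TOY_MAGIC ||
        ntohl(sb->s_blocksize) != expected_blocksize)
        return -1;               /* "no valid journal superblock found" */
    if (ntohl(sb->s_first) == 0 ||
        ntohl(sb->s_first) >= ntohl(sb->s_maxlen))
        return -1;               /* invalid start block of journal */
    return 0;
}

int main(void)
{
    struct toy_journal_sb sb = {
        .h_magic     = htonl(TOY_MAGIC),
        .h_blocktype = htonl(4),         /* assumed code for a V2 superblock */
        .s_blocksize = htonl(4096),
        .s_maxlen    = htonl(8192),
        .s_first     = htonl(1),
    };
    printf("superblock %s\n", validate_sb(&sb, 4096) ? "rejected" : "ok");
    return 0;
}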
+ * + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. + */ +int journal_load(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + /* If this is a V2 superblock, then we have to check the + * features flags on it. */ + + if (journal->j_format_version >= 2) { + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { + printk (KERN_WARNING + "JBD: Unrecognised features on journal\n"); + return -EINVAL; + } + } + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + if (journal_recover(journal)) + goto recovery_error; + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + if (journal_reset(journal)) + goto recovery_error; + + journal->j_flags &= ~JFS_ABORT; + journal->j_flags |= JFS_LOADED; + return 0; + +recovery_error: + printk (KERN_WARNING "JBD: recovery failed\n"); + return -EIO; +} + +/** + * void journal_destroy() - Release a journal_t structure. + * @journal: Journal to act on. + * + * Release a journal_t structure once it is no longer in use by the + * journaled object. + * Return <0 if we couldn't clean up the journal. + */ +int journal_destroy(journal_t *journal) +{ + int err = 0; + + + /* Wait for the commit thread to wake up and die. */ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + + /* Totally anal locking here... */ + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + log_do_checkpoint(journal); + spin_lock(&journal->j_list_lock); + } + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + spin_unlock(&journal->j_list_lock); + + if (journal->j_sb_buffer) { + if (!is_journal_aborted(journal)) { + /* We can now mark the journal as empty. */ + journal->j_tail = 0; + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + journal_update_superblock(journal, 1); + } else { + err = -EIO; + } + brelse(journal->j_sb_buffer); + } + + if (journal->j_inode) + iput(journal->j_inode); + if (journal->j_revoke) + journal_destroy_revoke(journal); + kfree(journal->j_wbuf); + kfree(journal); + + return err; +} + + +/** + *int journal_check_used_features () - Check if features specified are used. + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. 
+ **/ + +int journal_check_used_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + if (journal->j_format_version == 1) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; +} + +/** + * int journal_check_available_features() - Check feature set in journalling layer + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + +int journal_check_available_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + if (!compat && !ro && !incompat) + return 1; + + /* We can support any known requested features iff the + * superblock is in version 2. Otherwise we fail to support any + * extended sb features. */ + + if (journal->j_format_version != 2) + return 0; + + if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && + (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; +} + +/** + * int journal_set_features () - Mark a given journal feature in the superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. + * + */ + +int journal_set_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + + return 1; +} + + +/** + * int journal_update_format () - Update on-disk journal structure. + * @journal: Journal to act on. + * + * Given an initialised but unloaded journal struct, poke about in the + * on-disk structure to update it to the most recent supported version. 
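The three feature-mask helpers documented above reduce to plain bitmask algebra: "used" means every requested bit is already set in the superblock, "available" means every requested bit is one this implementation knows about, and journal_set_features() ORs the bits in once both checks pass. A minimal sketch of that flow, with an assumed incompat bit standing in for the revoke feature.

#include <stdio.h>

/* Assumed illustrative feature bit; not copied from a header. */
#define TOY_INCOMPAT_REVOKE   0x00000001u
#define KNOWN_INCOMPAT        (TOY_INCOMPAT_REVOKE)

/* journal_check_used_features(): does the journal already use all of these? */
static int uses_all(unsigned int sb_incompat, unsigned int want)
{
    return (sb_incompat & want) == want;
}

/* journal_check_available_features(): can this implementation support them? */
static int supports_all(unsigned int want)
{
    return (want & KNOWN_INCOMPAT) == want;
}

int main(void)
{
    unsigned int sb_incompat = 0;                 /* freshly created journal */
    unsigned int want = TOY_INCOMPAT_REVOKE;

    if (!uses_all(sb_incompat, want) && supports_all(want)) {
        sb_incompat |= want;                      /* journal_set_features()  */
        printf("revoke feature enabled (incompat mask 0x%x)\n", sb_incompat);
    }
    return 0;
}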
+ */ +int journal_update_format (journal_t *journal) +{ + journal_superblock_t *sb; + int err; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + switch (be32_to_cpu(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V2: + return 0; + case JFS_SUPERBLOCK_V1: + return journal_convert_superblock_v1(journal, sb); + default: + break; + } + return -EINVAL; +} + +static int journal_convert_superblock_v1(journal_t *journal, + journal_superblock_t *sb) +{ + int offset, blocksize; + struct buffer_head *bh; + + printk(KERN_WARNING + "JBD: Converting superblock from version 1 to 2.\n"); + + /* Pre-initialise new fields to zero */ + offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); + blocksize = be32_to_cpu(sb->s_blocksize); + memset(&sb->s_feature_compat, 0, blocksize-offset); + + sb->s_nr_users = cpu_to_be32(1); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + journal->j_format_version = 2; + + bh = journal->j_sb_buffer; + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + return 0; +} + + +/** + * int journal_flush () - Flush journal + * @journal: Journal to act on. + * + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. + */ + +int journal_flush(journal_t *journal) +{ + int err = 0; + transaction_t *transaction = NULL; + unsigned int old_tail; + + spin_lock(&journal->j_state_lock); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + __log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) { + tid_t tid = transaction->t_tid; + + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + } else { + spin_unlock(&journal->j_state_lock); + } + + /* ...and flush everything in the log out to disk. */ + spin_lock(&journal->j_list_lock); + while (!err && journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); + err = log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + + cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. Any future + * commits of data to the journal will restore the current + * s_start value. */ + spin_lock(&journal->j_state_lock); + old_tail = journal->j_tail; + journal->j_tail = 0; + spin_unlock(&journal->j_state_lock); + journal_update_superblock(journal, 1); + spin_lock(&journal->j_state_lock); + journal->j_tail = old_tail; + + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + spin_unlock(&journal->j_state_lock); + return 0; +} + +/** + * int journal_wipe() - Wipe journal contents + * @journal: Journal to act on. 
+ * @write: flag (see below) + * + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. + * Must be called between journal_init_*() and journal_load(). + * + * If 'write' is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + +int journal_wipe(journal_t *journal, int write) +{ + int err = 0; + + J_ASSERT (!(journal->j_flags & JFS_LOADED)); + + err = load_superblock(journal); + if (err) + return err; + + if (!journal->j_tail) + goto no_recovery; + + printk (KERN_WARNING "JBD: %s recovery information on journal\n", + write ? "Clearing" : "Ignoring"); + + err = journal_skip_recovery(journal); + if (write) + journal_update_superblock(journal, 1); + + no_recovery: + return err; +} + +/* + * journal_dev_name: format a character string to describe on what + * device this journal is present. + */ + +static const char *journal_dev_name(journal_t *journal, char *buffer) +{ + struct block_device *bdev; + + if (journal->j_inode) + bdev = journal->j_inode->i_sb->s_bdev; + else + bdev = journal->j_dev; + + return bdevname(bdev, buffer); +} + +/* + * Journal abort has very specific semantics, which we describe + * for journal abort. + * + * Two internal function, which provide abort to te jbd layer + * itself are here. + */ + +/* + * Quick version for internal journal use (doesn't lock the journal). + * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, + * and don't attempt to make any other journal updates. + */ +static void __journal_abort_hard(journal_t *journal) +{ + transaction_t *transaction; + char b[BDEVNAME_SIZE]; + + if (journal->j_flags & JFS_ABORT) + return; + + printk(KERN_ERR "Aborting journal on device %s.\n", + journal_dev_name(journal, b)); + + spin_lock(&journal->j_state_lock); + journal->j_flags |= JFS_ABORT; + transaction = journal->j_running_transaction; + if (transaction) + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); +} + +/* Soft abort: record the abort error status in the journal superblock, + * but don't do any other IO. */ +static void __journal_abort_soft (journal_t *journal, int errno) +{ + if (journal->j_flags & JFS_ABORT) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + __journal_abort_hard(journal); + + if (errno) + journal_update_superblock(journal, 1); +} + +/** + * void journal_abort () - Shutdown the journal immediately. + * @journal: the journal to shutdown. + * @errno: an error number to record in the journal indicating + * the reason for the shutdown. + * + * Perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. 
+ * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final journal_stop, which will receive the -EIO error. + * + * Finally, the journal_abort call allows the caller to supply an errno + * which will be recorded (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + * Errors which originate from within the journaling layer will NOT + * supply an errno; a null errno implies that absolutely no further + * writes are done to the journal (unless there are any already in + * progress). + * + */ + +void journal_abort(journal_t *journal, int errno) +{ + __journal_abort_soft(journal, errno); +} + +/** + * int journal_errno () - returns the journal's error state. + * @journal: journal to examine. + * + * This is the errno numbet set with journal_abort(), the last + * time the journal was mounted - if the journal was stopped + * without calling abort this will be 0. + * + * If the journal has been aborted on this mount time -EROFS will + * be returned. + */ +int journal_errno(journal_t *journal) +{ + int err; + + spin_lock(&journal->j_state_lock); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + err = journal->j_errno; + spin_unlock(&journal->j_state_lock); + return err; +} + +/** + * int journal_clear_err () - clears the journal's error state + * @journal: journal to act on. + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. + */ +int journal_clear_err(journal_t *journal) +{ + int err = 0; + + spin_lock(&journal->j_state_lock); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + journal->j_errno = 0; + spin_unlock(&journal->j_state_lock); + return err; +} + +/** + * void journal_ack_err() - Ack journal err. + * @journal: journal to act on. + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. 
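The error-state helpers form a small state machine: the first abort latches JFS_ABORT and records an errno, journal_errno() reports -EROFS for as long as the abort flag is set, and the clear/ack pair is what lets a filesystem leave read-only mode again. A toy model of that behaviour follows; the struct and function names are invented for the sketch.

#include <stdio.h>
#include <errno.h>

/* Minimal model of the state journal_abort()/journal_errno() manage. */
struct toy_journal_err {
    int aborted;   /* plays the role of JFS_ABORT */
    int errnum;    /* plays the role of j_errno   */
};

static void toy_abort(struct toy_journal_err *j, int errnum)
{
    if (j->aborted)
        return;             /* the first abort wins */
    if (!j->errnum)
        j->errnum = errnum;
    j->aborted = 1;
}

/* Mirrors journal_errno(): an aborted journal always reports -EROFS. */
static int toy_errno(const struct toy_journal_err *j)
{
    return j->aborted ? -EROFS : j->errnum;
}

int main(void)
{
    struct toy_journal_err j = { 0, 0 };

    toy_abort(&j, -EIO);        /* e.g. a filesystem reporting an I/O failure */
    printf("journal_errno() -> %d (EROFS while aborted)\n", toy_errno(&j));
    return 0;
}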
+ */ +void journal_ack_err(journal_t *journal) +{ + spin_lock(&journal->j_state_lock); + if (journal->j_errno) + journal->j_flags |= JFS_ACK_ERR; + spin_unlock(&journal->j_state_lock); +} + +int journal_blocks_per_page(struct inode *inode) +{ + return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); +} + +/* + * Journal_head storage management + */ +static struct kmem_cache *journal_head_cache; +#ifdef CONFIG_JBD_DEBUG +static atomic_t nr_journal_heads = ATOMIC_INIT(0); +#endif + +static int journal_init_journal_head_cache(void) +{ + int retval; + + J_ASSERT(journal_head_cache == NULL); + journal_head_cache = kmem_cache_create("journal_head", + sizeof(struct journal_head), + 0, /* offset */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ + retval = 0; + if (!journal_head_cache) { + retval = -ENOMEM; + printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + } + return retval; +} + +static void journal_destroy_journal_head_cache(void) +{ + if (journal_head_cache) { + kmem_cache_destroy(journal_head_cache); + journal_head_cache = NULL; + } +} + +/* + * journal_head splicing and dicing + */ +static struct journal_head *journal_alloc_journal_head(void) +{ + struct journal_head *ret; + +#ifdef CONFIG_JBD_DEBUG + atomic_inc(&nr_journal_heads); +#endif + ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + if (ret == NULL) { + jbd_debug(1, "out of memory for journal_head\n"); + printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", + __func__); + + while (ret == NULL) { + yield(); + ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + } + } + return ret; +} + +static void journal_free_journal_head(struct journal_head *jh) +{ +#ifdef CONFIG_JBD_DEBUG + atomic_dec(&nr_journal_heads); + memset(jh, JBD_POISON_FREE, sizeof(*jh)); +#endif + kmem_cache_free(journal_head_cache, jh); +} + +/* + * A journal_head is attached to a buffer_head whenever JBD has an + * interest in the buffer. + * + * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit + * is set. This bit is tested in core kernel code where we need to take + * JBD-specific actions. Testing the zeroness of ->b_private is not reliable + * there. + * + * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. + * + * When a buffer has its BH_JBD bit set it is immune from being released by + * core kernel code, mainly via ->b_count. + * + * A journal_head may be detached from its buffer_head when the journal_head's + * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. + * Various places in JBD call journal_remove_journal_head() to indicate that the + * journal_head can be dropped if needed. + * + * Various places in the kernel want to attach a journal_head to a buffer_head + * _before_ attaching the journal_head to a transaction. To protect the + * journal_head in this situation, journal_add_journal_head elevates the + * journal_head's b_jcount refcount by one. The caller must call + * journal_put_journal_head() to undo this. + * + * So the typical usage would be: + * + * (Attach a journal_head if needed. Increments b_jcount) + * struct journal_head *jh = journal_add_journal_head(bh); + * ... + * jh->b_transaction = xxx; + * journal_put_journal_head(jh); + * + * Now, the journal_head's b_jcount is zero, but it is safe from being released + * because it has a non-zero b_transaction. + */ + +/* + * Give a buffer_head a journal_head. + * + * Doesn't need the journal lock. + * May sleep. 
+ */ +struct journal_head *journal_add_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh; + struct journal_head *new_jh = NULL; + +repeat: + if (!buffer_jbd(bh)) { + new_jh = journal_alloc_journal_head(); + memset(new_jh, 0, sizeof(*new_jh)); + } + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + } else { + J_ASSERT_BH(bh, + (atomic_read(&bh->b_count) > 0) || + (bh->b_page && bh->b_page->mapping)); + + if (!new_jh) { + jbd_unlock_bh_journal_head(bh); + goto repeat; + } + + jh = new_jh; + new_jh = NULL; /* We consumed it */ + set_buffer_jbd(bh); + bh->b_private = jh; + jh->b_bh = bh; + get_bh(bh); + BUFFER_TRACE(bh, "added journal_head"); + } + jh->b_jcount++; + jbd_unlock_bh_journal_head(bh); + if (new_jh) + journal_free_journal_head(new_jh); + return bh->b_private; +} + +/* + * Grab a ref against this buffer_head's journal_head. If it ended up not + * having a journal_head, return NULL + */ +struct journal_head *journal_grab_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = NULL; + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + jh->b_jcount++; + } + jbd_unlock_bh_journal_head(bh); + return jh; +} + +static void __journal_remove_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + + J_ASSERT_JH(jh, jh->b_jcount >= 0); + + get_bh(bh); + if (jh->b_jcount == 0) { + if (jh->b_transaction == NULL && + jh->b_next_transaction == NULL && + jh->b_cp_transaction == NULL) { + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing " + "b_frozen_data\n", + __func__); + jbd_free(jh->b_frozen_data, bh->b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing " + "b_committed_data\n", + __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + __brelse(bh); + journal_free_journal_head(jh); + } else { + BUFFER_TRACE(bh, "journal_head was locked"); + } + } +} + +/* + * journal_remove_journal_head(): if the buffer isn't attached to a transaction + * and has a zero b_jcount then remove and release its journal_head. If we did + * see that the buffer is not used by any transaction we also "logically" + * decrement ->b_count. + * + * We in fact take an additional increment on ->b_count as a convenience, + * because the caller usually wants to do additional things with the bh + * after calling here. + * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some + * time. Once the caller has run __brelse(), the buffer is eligible for + * reaping by try_to_free_buffers(). + */ +void journal_remove_journal_head(struct buffer_head *bh) +{ + jbd_lock_bh_journal_head(bh); + __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); +} + +/* + * Drop a reference on the passed journal_head. If it fell to zero then try to + * release the journal_head from the buffer_head. 
+ */ +void journal_put_journal_head(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_journal_head(bh); + J_ASSERT_JH(jh, jh->b_jcount > 0); + --jh->b_jcount; + if (!jh->b_jcount && !jh->b_transaction) { + __journal_remove_journal_head(bh); + __brelse(bh); + } + jbd_unlock_bh_journal_head(bh); +} + +/* + * debugfs tunables + */ +#ifdef CONFIG_JBD_DEBUG + +u8 journal_enable_debug __read_mostly; +EXPORT_SYMBOL(journal_enable_debug); + +static struct dentry *jbd_debugfs_dir; +static struct dentry *jbd_debug; + +static void __init jbd_create_debugfs_entry(void) +{ + jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); + if (jbd_debugfs_dir) + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, + jbd_debugfs_dir, + &journal_enable_debug); +} + +static void __exit jbd_remove_debugfs_entry(void) +{ + debugfs_remove(jbd_debug); + debugfs_remove(jbd_debugfs_dir); +} + +#else + +static inline void jbd_create_debugfs_entry(void) +{ +} + +static inline void jbd_remove_debugfs_entry(void) +{ +} + +#endif + +struct kmem_cache *jbd_handle_cache; + +static int __init journal_init_handle_cache(void) +{ + jbd_handle_cache = kmem_cache_create("journal_handle", + sizeof(handle_t), + 0, /* offset */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ + if (jbd_handle_cache == NULL) { + printk(KERN_EMERG "JBD: failed to create handle cache\n"); + return -ENOMEM; + } + return 0; +} + +static void journal_destroy_handle_cache(void) +{ + if (jbd_handle_cache) + kmem_cache_destroy(jbd_handle_cache); +} + +/* + * Module startup and shutdown + */ + +static int __init journal_init_caches(void) +{ + int ret; + + ret = journal_init_revoke_caches(); + if (ret == 0) + ret = journal_init_journal_head_cache(); + if (ret == 0) + ret = journal_init_handle_cache(); + return ret; +} + +static void journal_destroy_caches(void) +{ + journal_destroy_revoke_caches(); + journal_destroy_journal_head_cache(); + journal_destroy_handle_cache(); +} + +static int __init journal_init(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); + + ret = journal_init_caches(); + if (ret != 0) + journal_destroy_caches(); + jbd_create_debugfs_entry(); + return ret; +} + +static void __exit journal_exit(void) +{ +#ifdef CONFIG_JBD_DEBUG + int n = atomic_read(&nr_journal_heads); + if (n) + printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); +#endif + jbd_remove_debugfs_entry(); + journal_destroy_caches(); +} + +MODULE_LICENSE("GPL"); +module_init(journal_init); +module_exit(journal_exit); + diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c new file mode 100644 index 00000000..5b43e967 --- /dev/null +++ b/fs/jbd/recovery.c @@ -0,0 +1,587 @@ +/* + * linux/fs/jbd/recovery.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999-2000 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal recovery routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#endif + +/* + * Maintain information about the progress of the recovery job, so that + * the different passes can carry information between them. 
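journal_recover(), defined further down in this file, drives recovery as three passes over the same log: SCAN to find where the log ends, REVOKE to collect revoke records, and REPLAY to write back every block not covered by a revoke, after which the transaction counter restarts past everything replayed. A skeletal sketch of that driver; toy_one_pass() is a stub with made-up transaction numbers, not the real do_one_pass().

#include <stdio.h>

/* The three passes journal_recover() runs, in order. */
enum toy_pass { PASS_SCAN, PASS_REVOKE, PASS_REPLAY };

struct toy_recovery_info {
    unsigned int start_transaction, end_transaction;
    int nr_replays, nr_revokes, nr_revoke_hits;
};

/* Stub standing in for do_one_pass(); a real pass walks the log blocks. */
static int toy_one_pass(struct toy_recovery_info *info, enum toy_pass pass)
{
    if (pass == PASS_SCAN) {             /* pass 1 finds where the log ends */
        info->start_transaction = 7;     /* assumed values, for illustration */
        info->end_transaction = 10;
    }
    return 0;
}

int main(void)
{
    struct toy_recovery_info info = { 0 };
    int err;

    err = toy_one_pass(&info, PASS_SCAN);       /* find end of log          */
    if (!err)
        err = toy_one_pass(&info, PASS_REVOKE); /* collect revoke records   */
    if (!err)
        err = toy_one_pass(&info, PASS_REPLAY); /* replay unrevoked blocks  */

    /* As in journal_recover(): restart numbering past everything replayed. */
    unsigned int next_seq = info.end_transaction + 1;
    printf("recovered %u..%u, next transaction ID %u\n",
           info.start_transaction, info.end_transaction, next_seq);
    return 0;
}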
+ */ +struct recovery_info +{ + tid_t start_transaction; + tid_t end_transaction; + + int nr_replays; + int nr_revokes; + int nr_revoke_hits; +}; + +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass); +static int scan_revoke_records(journal_t *, struct buffer_head *, + tid_t, struct recovery_info *); + +#ifdef __KERNEL__ + +/* Release readahead buffers after use */ +static void journal_brelse_array(struct buffer_head *b[], int n) +{ + while (--n >= 0) + brelse (b[n]); +} + + +/* + * When reading from the journal, we are going through the block device + * layer directly and so there is no readahead being done for us. We + * need to implement any readahead ourselves if we want it to happen at + * all. Recovery is basically one long sequential read, so make sure we + * do the IO in reasonably large chunks. + * + * This is not so critical that we need to be enormously clever about + * the readahead size, though. 128K is a purely arbitrary, good-enough + * fixed value. + */ + +#define MAXBUF 8 +static int do_readahead(journal_t *journal, unsigned int start) +{ + int err; + unsigned int max, nbufs, next; + unsigned int blocknr; + struct buffer_head *bh; + + struct buffer_head * bufs[MAXBUF]; + + /* Do up to 128K of readahead */ + max = start + (128 * 1024 / journal->j_blocksize); + if (max > journal->j_maxlen) + max = journal->j_maxlen; + + /* Do the readahead itself. We'll submit MAXBUF buffer_heads at + * a time to the block device IO layer. */ + + nbufs = 0; + + for (next = start; next < max; next++) { + err = journal_bmap(journal, next, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + next); + goto failed; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + err = -ENOMEM; + goto failed; + } + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + bufs[nbufs++] = bh; + if (nbufs == MAXBUF) { + ll_rw_block(READ, nbufs, bufs); + journal_brelse_array(bufs, nbufs); + nbufs = 0; + } + } else + brelse(bh); + } + + if (nbufs) + ll_rw_block(READ, nbufs, bufs); + err = 0; + +failed: + if (nbufs) + journal_brelse_array(bufs, nbufs); + return err; +} + +#endif /* __KERNEL__ */ + + +/* + * Read a block from the journal + */ + +static int jread(struct buffer_head **bhp, journal_t *journal, + unsigned int offset) +{ + int err; + unsigned int blocknr; + struct buffer_head *bh; + + *bhp = NULL; + + if (offset >= journal->j_maxlen) { + printk(KERN_ERR "JBD: corrupted journal superblock\n"); + return -EIO; + } + + err = journal_bmap(journal, offset, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + offset); + return err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + if (!buffer_uptodate(bh)) { + /* If this is a brand new buffer, start readahead. + Otherwise, we assume we are already reading it. */ + if (!buffer_req(bh)) + do_readahead(journal, offset); + wait_on_buffer(bh); + } + + if (!buffer_uptodate(bh)) { + printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + offset); + brelse(bh); + return -EIO; + } + + *bhp = bh; + return 0; +} + + +/* + * Count the number of in-use tags in a journal descriptor block. 
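The readahead helper further down works in fixed 128K windows, clipped to the end of the journal, and submits buffers in batches of MAXBUF. A small sketch of that window arithmetic under an assumed journal geometry.

#include <stdio.h>

#define MAXBUF 8   /* same batch size recovery.c submits per ll_rw_block() call */

int main(void)
{
    /* Assumed geometry, for illustration only. */
    unsigned int blocksize = 4096, maxlen = 20, start = 15;

    /* 128K of readahead, clipped to the end of the journal. */
    unsigned int max = start + (128 * 1024 / blocksize);
    if (max > maxlen)
        max = maxlen;

    unsigned int window = max - start;
    unsigned int batches = (window + MAXBUF - 1) / MAXBUF;

    printf("readahead blocks %u..%u (%u blocks, up to %u submissions)\n",
           start, max - 1, window, batches);
    return 0;
}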
+ */ + +static int count_tags(struct buffer_head *bh, int size) +{ + char * tagp; + journal_block_tag_t * tag; + int nr = 0; + + tagp = &bh->b_data[sizeof(journal_header_t)]; + + while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { + tag = (journal_block_tag_t *) tagp; + + nr++; + tagp += sizeof(journal_block_tag_t); + if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) + tagp += 16; + + if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) + break; + } + + return nr; +} + + +/* Make sure we wrap around the log correctly! */ +#define wrap(journal, var) \ +do { \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ +} while (0) + +/** + * journal_recover - recovers a on-disk journal + * @journal: the journal to recover + * + * The primary function for recovering the log contents when mounting a + * journaled device. + * + * Recovery is done in three passes. In the first pass, we look for the + * end of the log. In the second, we assemble the list of revoke + * blocks. In the third and final pass, we replay any un-revoked blocks + * in the log. + */ +int journal_recover(journal_t *journal) +{ + int err, err2; + journal_superblock_t * sb; + + struct recovery_info info; + + memset(&info, 0, sizeof(info)); + sb = journal->j_superblock; + + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly + * unmounted. + */ + + if (!sb->s_start) { + jbd_debug(1, "No recovery required, last transaction %d\n", + be32_to_cpu(sb->s_sequence)); + journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + return 0; + } + + err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = do_one_pass(journal, &info, PASS_REVOKE); + if (!err) + err = do_one_pass(journal, &info, PASS_REPLAY); + + jbd_debug(1, "JBD: recovery, exit status %d, " + "recovered transactions %u to %u\n", + err, info.start_transaction, info.end_transaction); + jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", + info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + + /* Restart the log at the next transaction ID, thus invalidating + * any existing commit records in the log. */ + journal->j_transaction_sequence = ++info.end_transaction; + + journal_clear_revoke(journal); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + + return err; +} + +/** + * journal_skip_recovery - Start journal and wipe exiting records + * @journal: journal to startup + * + * Locate any valid recovery information from the journal and set up the + * journal structures in memory to ignore it (presumably because the + * caller has evidence that it is out of date). + * This function does'nt appear to be exorted.. + * + * We perform one pass over the journal to allow us to tell the user how + * much recovery information is being erased, and to let us initialise + * the journal transaction sequence numbers to the next unused ID. + */ +int journal_skip_recovery(journal_t *journal) +{ + int err; + struct recovery_info info; + + memset (&info, 0, sizeof(info)); + + err = do_one_pass(journal, &info, PASS_SCAN); + + if (err) { + printk(KERN_ERR "JBD: error %d scanning journal\n", err); + ++journal->j_transaction_sequence; + } else { +#ifdef CONFIG_JBD_DEBUG + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); + jbd_debug(1, + "JBD: ignoring %d transaction%s from the journal.\n", + dropped, (dropped == 1) ? 
"" : "s"); +#endif + journal->j_transaction_sequence = ++info.end_transaction; + } + + journal->j_tail = 0; + return err; +} + +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int first_commit_ID, next_commit_ID; + unsigned int next_log_block; + int err, success = 0; + journal_superblock_t * sb; + journal_header_t * tmp; + struct buffer_head * bh; + unsigned int sequence; + int blocktype; + + /* + * First thing is to establish what we expect to find in the log + * (in terms of transaction IDs), and where (in terms of log + * block offsets): query the superblock. + */ + + sb = journal->j_superblock; + next_commit_ID = be32_to_cpu(sb->s_sequence); + next_log_block = be32_to_cpu(sb->s_start); + + first_commit_ID = next_commit_ID; + if (pass == PASS_SCAN) + info->start_transaction = first_commit_ID; + + jbd_debug(1, "Starting recovery pass %d\n", pass); + + /* + * Now we walk through the log, transaction by transaction, + * making sure that each transaction has a commit block in the + * expected place. Each complete transaction gets replayed back + * into the main filesystem. + */ + + while (1) { + int flags; + char * tagp; + journal_block_tag_t * tag; + struct buffer_head * obh; + struct buffer_head * nbh; + + cond_resched(); + + /* If we already know where to stop the log traversal, + * check right now that we haven't gone past the end of + * the log. */ + + if (pass != PASS_SCAN) + if (tid_geq(next_commit_ID, info->end_transaction)) + break; + + jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", + next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit + * record. */ + + jbd_debug(3, "JBD: checking block %u\n", next_log_block); + err = jread(&bh, journal, next_log_block); + if (err) + goto failed; + + next_log_block++; + wrap(journal, next_log_block); + + /* What kind of buffer is it? + * + * If it is a descriptor block, check that it has the + * expected sequence number. Otherwise, we're all done + * here. */ + + tmp = (journal_header_t *)bh->b_data; + + if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { + brelse(bh); + break; + } + + blocktype = be32_to_cpu(tmp->h_blocktype); + sequence = be32_to_cpu(tmp->h_sequence); + jbd_debug(3, "Found magic %d, sequence %d\n", + blocktype, sequence); + + if (sequence != next_commit_ID) { + brelse(bh); + break; + } + + /* OK, we have a valid descriptor block which matches + * all of the sequence number checks. What are we going + * to do with it? That depends on the pass... */ + + switch(blocktype) { + case JFS_DESCRIPTOR_BLOCK: + /* If it is a valid descriptor block, replay it + * in pass REPLAY; otherwise, just skip over the + * blocks it describes. */ + if (pass != PASS_REPLAY) { + next_log_block += + count_tags(bh, journal->j_blocksize); + wrap(journal, next_log_block); + brelse(bh); + continue; + } + + /* A descriptor block: we can now write all of + * the data blocks. Yay, useful work is finally + * getting done here! */ + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) + <= journal->j_blocksize) { + unsigned int io_block; + + tag = (journal_block_tag_t *) tagp; + flags = be32_to_cpu(tag->t_flags); + + io_block = next_log_block++; + wrap(journal, next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but + * report failure at the end. 
*/ + success = err; + printk (KERN_ERR + "JBD: IO error %d recovering " + "block %u in log\n", + err, io_block); + } else { + unsigned int blocknr; + + J_ASSERT(obh != NULL); + blocknr = be32_to_cpu(tag->t_blocknr); + + /* If the block has been + * revoked, then we're all done + * here. */ + if (journal_test_revoke + (journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Find a buffer for the new + * data being restored */ + nbh = __getblk(journal->j_fs_dev, + blocknr, + journal->j_blocksize); + if (nbh == NULL) { + printk(KERN_ERR + "JBD: Out of memory " + "during recovery.\n"); + err = -ENOMEM; + brelse(bh); + brelse(obh); + goto failed; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, + journal->j_blocksize); + if (flags & JFS_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JFS_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + /* ll_rw_block(WRITE, 1, &nbh); */ + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + + skip_write: + tagp += sizeof(journal_block_tag_t); + if (!(flags & JFS_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JFS_FLAG_LAST_TAG) + break; + } + + brelse(bh); + continue; + + case JFS_COMMIT_BLOCK: + /* Found an expected commit block: not much to + * do other than move on to the next sequence + * number. */ + brelse(bh); + next_commit_ID++; + continue; + + case JFS_REVOKE_BLOCK: + /* If we aren't in the REVOKE pass, then we can + * just skip over this block. */ + if (pass != PASS_REVOKE) { + brelse(bh); + continue; + } + + err = scan_revoke_records(journal, bh, + next_commit_ID, info); + brelse(bh); + if (err) + goto failed; + continue; + + default: + jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + blocktype); + brelse(bh); + goto done; + } + } + + done: + /* + * We broke out of the log scan loop: either we came to the + * known end of the log or we found an unexpected block in the + * log. If the latter happened, then we know that the "current" + * transaction marks the end of the valid log. + */ + + if (pass == PASS_SCAN) + info->end_transaction = next_commit_ID; + else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). */ + if (info->end_transaction != next_commit_ID) { + printk (KERN_ERR "JBD: recovery pass %d ended at " + "transaction %u, expected %u\n", + pass, next_commit_ID, info->end_transaction); + if (!success) + success = -EIO; + } + } + + return success; + + failed: + return err; +} + + +/* Scan a revoke record, marking all blocks mentioned as revoked. */ + +static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, + tid_t sequence, struct recovery_info *info) +{ + journal_revoke_header_t *header; + int offset, max; + + header = (journal_revoke_header_t *) bh->b_data; + offset = sizeof(journal_revoke_header_t); + max = be32_to_cpu(header->r_count); + + while (offset < max) { + unsigned int blocknr; + int err; + + blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); + offset += 4; + err = journal_set_revoke(journal, blocknr, sequence); + if (err) + return err; + ++info->nr_revokes; + } + return 0; +} diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c new file mode 100644 index 00000000..305a9076 --- /dev/null +++ b/fs/jbd/revoke.c @@ -0,0 +1,706 @@ +/* + * linux/fs/jbd/revoke.c + * + * Written by Stephen C. 
Tweedie , 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. + * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. We still need to prevent old log records from + * overwriting the new data. We don't even need to clear the revoke + * bit here. + * + * Revoke information on buffers is a tri-state value: + * + * RevokeValid clear: no cached revoke status, need to look it up + * RevokeValid set, Revoked clear: + * buffer has not been revoked, and cancel_revoke + * need do nothing. + * RevokeValid set, Revoked set: + * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. 
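The RevokeValid/Revoked pair described above gives three meaningful states, and journal_cancel_revoke() (later in this file) only gets to skip the revoke-hash lookup in the "valid and clear" case. A userspace model of that decision; the struct is invented here and merely stands in for the two buffer_head state bits.

#include <stdio.h>

/* The two buffer bits behind the tri-state described above. */
struct toy_bh_state {
    int revoke_valid;   /* cached revoke status is trustworthy */
    int revoked;        /* meaningful only when revoke_valid is set */
};

/* Mirrors the decision journal_cancel_revoke() makes: only when the cached
 * state is valid *and* clear can the hash-table lookup be skipped. */
static int needs_full_cancel(struct toy_bh_state *bh)
{
    if (bh->revoke_valid) {
        int was_revoked = bh->revoked;
        bh->revoked = 0;
        return was_revoked;        /* cancel only if a revoke was cached */
    }
    bh->revoke_valid = 1;          /* no cached state: must search the hash */
    bh->revoked = 0;
    return 1;
}

int main(void)
{
    struct toy_bh_state fresh   = { 0, 0 };   /* RevokeValid clear  */
    struct toy_bh_state clean   = { 1, 0 };   /* valid, not revoked */
    struct toy_bh_state revoked = { 1, 1 };   /* valid, revoked     */

    printf("fresh:   %s\n", needs_full_cancel(&fresh)   ? "search hash" : "skip");
    printf("clean:   %s\n", needs_full_cancel(&clean)   ? "search hash" : "skip");
    printf("revoked: %s\n", needs_full_cancel(&revoked) ? "search hash" : "skip");
    return 0;
}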
+ */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif +#include + +static struct kmem_cache *revoke_record_cache; +static struct kmem_cache *revoke_table_cache; + +/* Each revoke record represents one single revoked block. During + journal replay, this involves recording the transaction ID of the + last transaction to revoke this block. */ + +struct jbd_revoke_record_s +{ + struct list_head hash; + tid_t sequence; /* Used for recovery only */ + unsigned int blocknr; +}; + + +/* The revoke table is just a simple hash table of revoke records. */ +struct jbd_revoke_table_s +{ + /* It is conceivable that we might want a larger hash table + * for recovery. Must be a power of two. */ + int hash_size; + int hash_shift; + struct list_head *hash_table; +}; + + +#ifdef __KERNEL__ +static void write_one_revoke_record(journal_t *, transaction_t *, + struct journal_head **, int *, + struct jbd_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); +#endif + +/* Utility functions to maintain the revoke table */ + +/* Borrowed from buffer.c: this is a tried and tested block hash function */ +static inline int hash(journal_t *journal, unsigned int block) +{ + struct jbd_revoke_table_s *table = journal->j_revoke; + int hash_shift = table->hash_shift; + + return ((block << (hash_shift - 6)) ^ + (block >> 13) ^ + (block << (hash_shift - 12))) & (table->hash_size - 1); +} + +static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, + tid_t seq) +{ + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + +repeat: + record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); + if (!record) + goto oom; + + record->sequence = seq; + record->blocknr = blocknr; + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + spin_lock(&journal->j_revoke_lock); + list_add(&record->hash, hash_list); + spin_unlock(&journal->j_revoke_lock); + return 0; + +oom: + if (!journal_oom_retry) + return -ENOMEM; + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); + yield(); + goto repeat; +} + +/* Find a revoke record in the journal's hash table. 
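The hash() routine above mixes the block number with two left shifts and one right shift before masking with hash_size - 1, which is why the table size must be a power of two. The sketch below assumes a 16384-bucket table so that both shift amounts (hash_shift - 6 and hash_shift - 12) stay non-negative; the bucket values printed are purely illustrative.

#include <stdio.h>

/* Same mixing as the revoke-table hash() above, pulled out for illustration.
 * hash_size must be a power of two; hash_shift is log2(hash_size). */
static unsigned int toy_revoke_hash(unsigned int block,
                                    int hash_shift, int hash_size)
{
    return ((block << (hash_shift - 6)) ^
            (block >> 13) ^
            (block << (hash_shift - 12))) & (hash_size - 1);
}

int main(void)
{
    /* Assumed table size: large enough that neither shift goes negative. */
    int hash_size = 16384, hash_shift = 14;

    unsigned int blocks[] = { 1, 2, 4096, 4097, 123456789 };
    for (int i = 0; i < 5; i++)
        printf("block %-10u -> bucket %u\n",
               blocks[i], toy_revoke_hash(blocks[i], hash_shift, hash_size));
    return 0;
}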
*/ + +static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, + unsigned int blocknr) +{ + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + + spin_lock(&journal->j_revoke_lock); + record = (struct jbd_revoke_record_s *) hash_list->next; + while (&(record->hash) != hash_list) { + if (record->blocknr == blocknr) { + spin_unlock(&journal->j_revoke_lock); + return record; + } + record = (struct jbd_revoke_record_s *) record->hash.next; + } + spin_unlock(&journal->j_revoke_lock); + return NULL; +} + +void journal_destroy_revoke_caches(void) +{ + if (revoke_record_cache) { + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = NULL; + } + if (revoke_table_cache) { + kmem_cache_destroy(revoke_table_cache); + revoke_table_cache = NULL; + } +} + +int __init journal_init_revoke_caches(void) +{ + J_ASSERT(!revoke_record_cache); + J_ASSERT(!revoke_table_cache); + + revoke_record_cache = kmem_cache_create("revoke_record", + sizeof(struct jbd_revoke_record_s), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (!revoke_record_cache) + goto record_cache_failure; + + revoke_table_cache = kmem_cache_create("revoke_table", + sizeof(struct jbd_revoke_table_s), + 0, SLAB_TEMPORARY, NULL); + if (!revoke_table_cache) + goto table_cache_failure; + + return 0; + +table_cache_failure: + journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; +} + +static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) +{ + int shift = 0; + int tmp = hash_size; + struct jbd_revoke_table_s *table; + + table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; + + while((tmp >>= 1UL) != 0UL) + shift++; + + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = + kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); + if (!table->hash_table) { + kmem_cache_free(revoke_table_cache, table); + table = NULL; + goto out; + } + + for (tmp = 0; tmp < hash_size; tmp++) + INIT_LIST_HEAD(&table->hash_table[tmp]); + +out: + return table; +} + +static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); + } + + kfree(table->hash_table); + kmem_cache_free(revoke_table_cache, table); +} + +/* Initialise the revoke table for a given journal to a given size. */ +int journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); + J_ASSERT(is_power_of_2(hash_size)); + + journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; + + journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; + + journal->j_revoke = journal->j_revoke_table[1]; + + spin_lock_init(&journal->j_revoke_lock); + + return 0; + +fail1: + journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} + +/* Destroy a journal's revoke table. The table must already be empty! 
*/ +void journal_destroy_revoke(journal_t *journal) +{ + journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + journal_destroy_revoke_table(journal->j_revoke_table[1]); +} + + +#ifdef __KERNEL__ + +/* + * journal_revoke: revoke a given buffer_head from the journal. This + * prevents the block from being replayed during recovery if we take a + * crash after this current transaction commits. Any subsequent + * metadata writes of the buffer in this transaction cancel the + * revoke. + * + * Note that this call may block --- it is up to the caller to make + * sure that there are no further calls to journal_write_metadata + * before the revoke is complete. In ext3, this implies calling the + * revoke before clearing the block bitmap when we are deleting + * metadata. + * + * Revoke performs a journal_forget on any buffer_head passed in as a + * parameter, but does _not_ forget the buffer_head if the bh was only + * found implicitly. + * + * bh_in may not be a journalled buffer - it may have come off + * the hash tables without an attached journal_head. + * + * If bh_in is non-zero, journal_revoke() will decrement its b_count + * by one. + */ + +int journal_revoke(handle_t *handle, unsigned int blocknr, + struct buffer_head *bh_in) +{ + struct buffer_head *bh = NULL; + journal_t *journal; + struct block_device *bdev; + int err; + + might_sleep(); + if (bh_in) + BUFFER_TRACE(bh_in, "enter"); + + journal = handle->h_transaction->t_journal; + if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ + J_ASSERT (!"Cannot set revoke feature!"); + return -EINVAL; + } + + bdev = journal->j_fs_dev; + bh = bh_in; + + if (!bh) { + bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh) + BUFFER_TRACE(bh, "found on hash"); + } +#ifdef JBD_EXPENSIVE_CHECKING + else { + struct buffer_head *bh2; + + /* If there is a different buffer_head lying around in + * memory anywhere... */ + bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh2) { + /* ... and it has RevokeValid status... */ + if (bh2 != bh && buffer_revokevalid(bh2)) + /* ...then it better be revoked too, + * since it's illegal to create a revoke + * record against a buffer_head which is + * not marked revoked --- that would + * risk missing a subsequent revoke + * cancel. */ + J_ASSERT_BH(bh2, buffer_revoked(bh2)); + put_bh(bh2); + } + } +#endif + + /* We really ought not ever to revoke twice in a row without + first having the revoke cancelled: it's illegal to free a + block twice without allocating it in between! */ + if (bh) { + if (!J_EXPECT_BH(bh, !buffer_revoked(bh), + "inconsistent data on disk")) { + if (!bh_in) + brelse(bh); + return -EIO; + } + set_buffer_revoked(bh); + set_buffer_revokevalid(bh); + if (bh_in) { + BUFFER_TRACE(bh_in, "call journal_forget"); + journal_forget(handle, bh_in); + } else { + BUFFER_TRACE(bh, "call brelse"); + __brelse(bh); + } + } + + jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); + err = insert_revoke_hash(journal, blocknr, + handle->h_transaction->t_tid); + BUFFER_TRACE(bh_in, "exit"); + return err; +} + +/* + * Cancel an outstanding revoke. For use only internally by the + * journaling code (called from journal_get_write_access). + * + * We trust buffer_revoked() on the buffer if the buffer is already + * being journaled: if there is no revoke pending on the buffer, then we + * don't do anything here. 
+ * + * This would break if it were possible for a buffer to be revoked and + * discarded, and then reallocated within the same transaction. In such + * a case we would have lost the revoked bit, but when we arrived here + * the second time we would still have a pending revoke to cancel. So, + * do not trust the Revoked bit on buffers unless RevokeValid is also + * set. + */ +int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +{ + struct jbd_revoke_record_s *record; + journal_t *journal = handle->h_transaction->t_journal; + int need_cancel; + int did_revoke = 0; /* akpm: debug */ + struct buffer_head *bh = jh2bh(jh); + + jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); + + /* Is the existing Revoke bit valid? If so, we trust it, and + * only perform the full cancel if the revoke bit is set. If + * not, we can't trust the revoke bit, and we need to do the + * full search for a revoke record. */ + if (test_set_buffer_revokevalid(bh)) { + need_cancel = test_clear_buffer_revoked(bh); + } else { + need_cancel = 1; + clear_buffer_revoked(bh); + } + + if (need_cancel) { + record = find_revoke_record(journal, bh->b_blocknr); + if (record) { + jbd_debug(4, "cancelled existing revoke on " + "blocknr %llu\n", (unsigned long long)bh->b_blocknr); + spin_lock(&journal->j_revoke_lock); + list_del(&record->hash); + spin_unlock(&journal->j_revoke_lock); + kmem_cache_free(revoke_record_cache, record); + did_revoke = 1; + } + } + +#ifdef JBD_EXPENSIVE_CHECKING + /* There better not be one left behind by now! */ + record = find_revoke_record(journal, bh->b_blocknr); + J_ASSERT_JH(jh, record == NULL); +#endif + + /* Finally, have we just cleared revoke on an unhashed + * buffer_head? If so, we'd better make sure we clear the + * revoked status on any hashed alias too, otherwise the revoke + * state machine will get very upset later on. */ + if (need_cancel) { + struct buffer_head *bh2; + bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + if (bh2) { + if (bh2 != bh) + clear_buffer_revoked(bh2); + __brelse(bh2); + } + } + return did_revoke; +} + +/* journal_switch_revoke table select j_revoke for next transaction + * we do not want to suspend any processing until all revokes are + * written -bzzz + */ +void journal_switch_revoke_table(journal_t *journal) +{ + int i; + + if (journal->j_revoke == journal->j_revoke_table[0]) + journal->j_revoke = journal->j_revoke_table[1]; + else + journal->j_revoke = journal->j_revoke_table[0]; + + for (i = 0; i < journal->j_revoke->hash_size; i++) + INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); +} + +/* + * Write revoke records to the journal for all entries in the current + * revoke hash, deleting the entries as we go. + */ +void journal_write_revoke_records(journal_t *journal, + transaction_t *transaction, int write_op) +{ + struct journal_head *descriptor; + struct jbd_revoke_record_s *record; + struct jbd_revoke_table_s *revoke; + struct list_head *hash_list; + int i, offset, count; + + descriptor = NULL; + offset = 0; + count = 0; + + /* select revoke table for committing transaction */ + revoke = journal->j_revoke == journal->j_revoke_table[0] ? 
+ journal->j_revoke_table[1] : journal->j_revoke_table[0]; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + + while (!list_empty(hash_list)) { + record = (struct jbd_revoke_record_s *) + hash_list->next; + write_one_revoke_record(journal, transaction, + &descriptor, &offset, + record, write_op); + count++; + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + } + } + if (descriptor) + flush_descriptor(journal, descriptor, offset, write_op); + jbd_debug(1, "Wrote %d revoke records\n", count); +} + +/* + * Write out one revoke record. We need to create a new descriptor + * block if the old one is full or if we have not already created one. + */ + +static void write_one_revoke_record(journal_t *journal, + transaction_t *transaction, + struct journal_head **descriptorp, + int *offsetp, + struct jbd_revoke_record_s *record, + int write_op) +{ + struct journal_head *descriptor; + int offset; + journal_header_t *header; + + /* If we are already aborting, this all becomes a noop. We + still need to go round the loop in + journal_write_revoke_records in order to free all of the + revoke records: only the IO to the journal is omitted. */ + if (is_journal_aborted(journal)) + return; + + descriptor = *descriptorp; + offset = *offsetp; + + /* Make sure we have a descriptor with space left for the record */ + if (descriptor) { + if (offset == journal->j_blocksize) { + flush_descriptor(journal, descriptor, offset, write_op); + descriptor = NULL; + } + } + + if (!descriptor) { + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) + return; + header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK); + header->h_sequence = cpu_to_be32(transaction->t_tid); + + /* Record it so that we can wait for IO completion later */ + JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); + journal_file_buffer(descriptor, transaction, BJ_LogCtl); + + offset = sizeof(journal_revoke_header_t); + *descriptorp = descriptor; + } + + * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + cpu_to_be32(record->blocknr); + offset += 4; + *offsetp = offset; +} + +/* + * Flush a revoke descriptor out to the journal. If we are aborting, + * this is a noop; otherwise we are generating a buffer which needs to + * be waited for during commit, so it has to go onto the appropriate + * journal buffer list. + */ + +static void flush_descriptor(journal_t *journal, + struct journal_head *descriptor, + int offset, int write_op) +{ + journal_revoke_header_t *header; + struct buffer_head *bh = jh2bh(descriptor); + + if (is_journal_aborted(journal)) { + put_bh(bh); + return; + } + + header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; + header->r_count = cpu_to_be32(offset); + set_buffer_jwrite(bh); + BUFFER_TRACE(bh, "write"); + set_buffer_dirty(bh); + write_dirty_buffer(bh, write_op); +} +#endif + +/* + * Revoke support for recovery. + * + * Recovery needs to be able to: + * + * record all revoke records, including the tid of the latest instance + * of each revoke in the journal + * + * check whether a given block in a given transaction should be replayed + * (ie. has not been revoked by a revoke record in that or a subsequent + * transaction) + * + * empty the revoke table after recovery. + */ + +/* + * First, setting revoke records. 
We create a new revoke record for + * every block ever revoked in the log as we scan it for recovery, and + * we update the existing records if we find multiple revokes for a + * single block. + */ + +int journal_set_revoke(journal_t *journal, + unsigned int blocknr, + tid_t sequence) +{ + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (record) { + /* If we have multiple occurrences, only record the + * latest sequence number in the hashed record */ + if (tid_gt(sequence, record->sequence)) + record->sequence = sequence; + return 0; + } + return insert_revoke_hash(journal, blocknr, sequence); +} + +/* + * Test revoke records. For a given block referenced in the log, has + * that block been revoked? A revoke record with a given transaction + * sequence number revokes all blocks in that transaction and earlier + * ones, but later transactions still need to be replayed. + */ + +int journal_test_revoke(journal_t *journal, + unsigned int blocknr, + tid_t sequence) +{ + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) + return 0; + return 1; +} + +/* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. + */ + +void journal_clear_revoke(journal_t *journal) +{ + int i; + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + struct jbd_revoke_table_s *revoke; + + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + while (!list_empty(hash_list)) { + record = (struct jbd_revoke_record_s*) hash_list->next; + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + } + } +} diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c new file mode 100644 index 00000000..f7ee81a0 --- /dev/null +++ b/fs/jbd/transaction.c @@ -0,0 +1,2198 @@ +/* + * linux/fs/jbd/transaction.c + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem transaction handling code; part of the ext2fs + * journaling system. + * + * This file manages transactions (compound commits managed by the + * journaling code) and handles (individual atomic operations by the + * filesystem). + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/hrtimer.h> + +static void __journal_temp_unlink_buffer(struct journal_head *jh); + +/* + * get_transaction: obtain a new transaction_t object. + * + * Simply allocate and initialise a new transaction. Create it in + * RUNNING state and add it to the current journal (which should not + * have an existing running transaction: we only make a new transaction + * once we have started to commit the old one). + * + * Preconditions: + * The journal MUST be locked. We don't perform atomic mallocs on the + * new transaction and we can't block without protecting against other + * processes trying to touch the journal while it is in transition.
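The transaction_t itself is allocated by the caller (start_this_handle) before j_state_lock is taken, so no memory allocation happens in here.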
+ * + * Called under j_state_lock + */ + +static transaction_t * +get_transaction(journal_t *journal, transaction_t *transaction) +{ + transaction->t_journal = journal; + transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; + spin_lock_init(&transaction->t_handle_lock); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer.expires = + round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); + + J_ASSERT(journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; + + return transaction; +} + +/* + * Handle management. + * + * A handle_t is an object which represents a single atomic update to a + * filesystem, and which tracks all of the modifications which form part + * of that one update. + */ + +/* + * start_this_handle: Given a handle, deal with any locking or stalling + * needed to make sure that there is enough journal space for the handle + * to begin. Attach the handle to a transaction and set up the + * transaction's buffer credits. + */ + +static int start_this_handle(journal_t *journal, handle_t *handle) +{ + transaction_t *transaction; + int needed; + int nblocks = handle->h_buffer_credits; + transaction_t *new_transaction = NULL; + int ret = 0; + + if (nblocks > journal->j_max_transaction_buffers) { + printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + current->comm, nblocks, + journal->j_max_transaction_buffers); + ret = -ENOSPC; + goto out; + } + +alloc_transaction: + if (!journal->j_running_transaction) { + new_transaction = kzalloc(sizeof(*new_transaction), + GFP_NOFS|__GFP_NOFAIL); + if (!new_transaction) { + ret = -ENOMEM; + goto out; + } + } + + jbd_debug(3, "New handle %p going live.\n", handle); + +repeat: + + /* + * We need to hold j_state_lock until t_updates has been incremented, + * for proper journal barrier handling + */ + spin_lock(&journal->j_state_lock); +repeat_locked: + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { + spin_unlock(&journal->j_state_lock); + ret = -EROFS; + goto out; + } + + /* Wait on the journal's transaction barrier if necessary */ + if (journal->j_barrier_count) { + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + goto repeat; + } + + if (!journal->j_running_transaction) { + if (!new_transaction) { + spin_unlock(&journal->j_state_lock); + goto alloc_transaction; + } + get_transaction(journal, new_transaction); + new_transaction = NULL; + } + + transaction = journal->j_running_transaction; + + /* + * If the current transaction is locked down for commit, wait for the + * lock to be released. + */ + if (transaction->t_state == T_LOCKED) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_transaction_locked, + &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * If there is not enough space left in the log to write all potential + * buffers requested by this operation, we need to stall pending a log + * checkpoint to free some more log space. 
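That stall is handled further down by __log_wait_for_space(), after which we jump back to repeat_locked and re-check the journal state.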
+ */ + spin_lock(&transaction->t_handle_lock); + needed = transaction->t_outstanding_credits + nblocks; + + if (needed > journal->j_max_transaction_buffers) { + /* + * If the current transaction is already too large, then start + * to commit it: we can then go back and attach this handle to + * a new transaction. + */ + DEFINE_WAIT(wait); + + jbd_debug(2, "Handle %p starting new commit...\n", handle); + spin_unlock(&transaction->t_handle_lock); + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + * + * The worst part is, any transaction currently committing can + * reduce the free space arbitrarily. Be careful to account for + * those buffers when checkpointing. + */ + + /* + * @@@ AKPM: This seems rather over-defensive. We're giving commit + * a _lot_ of headroom: 1/4 of the journal plus the size of + * the committing transaction. Really, we only need to give it + * committing_transaction->t_outstanding_credits plus "enough" for + * the log control blocks. + * Also, this test is inconsistent with the matching one in + * journal_extend(). + */ + if (__log_space_left(journal) < jbd_space_needed(journal)) { + jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); + spin_unlock(&transaction->t_handle_lock); + __log_wait_for_space(journal); + goto repeat_locked; + } + + /* OK, account for the buffers that this operation expects to + * use and add the handle to the running transaction. */ + + handle->h_transaction = transaction; + transaction->t_outstanding_credits += nblocks; + transaction->t_updates++; + transaction->t_handle_count++; + jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", + handle, nblocks, transaction->t_outstanding_credits, + __log_space_left(journal)); + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); +out: + if (unlikely(new_transaction)) /* It's usually NULL */ + kfree(new_transaction); + return ret; +} + +static struct lock_class_key jbd_handle_key; + +/* Allocate a new handle. This should probably be in a slab... */ +static handle_t *new_handle(int nblocks) +{ + handle_t *handle = jbd_alloc_handle(GFP_NOFS); + if (!handle) + return NULL; + memset(handle, 0, sizeof(*handle)); + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + + lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0); + + return handle; +} + +/** + * handle_t *journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. + * + * This function is visible to journal users (like ext3fs), so is not + * called with the journal already locked. 
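Nested journal_start() calls on the same journal are refcounted: if the task already holds a handle we just bump h_ref and return it.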
+ * + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. + */ +handle_t *journal_start(journal_t *journal, int nblocks) +{ + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + J_ASSERT(handle->h_transaction->t_journal == journal); + handle->h_ref++; + return handle; + } + + handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); + + current->journal_info = handle; + + err = start_this_handle(journal, handle); + if (err < 0) { + jbd_free_handle(handle); + current->journal_info = NULL; + handle = ERR_PTR(err); + } + return handle; +} + +/** + * int journal_extend() - extend buffer credits. + * @handle: handle to 'extend' + * @nblocks: nr blocks to try to extend by. + * + * Some transactions, such as large extends and truncates, can be done + * atomically all at once or in several stages. The operation requests + * a credit for a number of buffer modifications in advance, but can + * extend its credit if it needs more. + * + * journal_extend tries to give the running handle more buffer credits. + * It does not guarantee that allocation - this is a best-effort only. + * The calling process MUST be able to deal cleanly with a failure to + * extend here. + * + * Return 0 on success, non-zero on failure. + * + * return code < 0 implies an error + * return code > 0 implies normal transaction-full status. + */ +int journal_extend(handle_t *handle, int nblocks) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int result; + int wanted; + + result = -EIO; + if (is_handle_aborted(handle)) + goto out; + + result = 1; + + spin_lock(&journal->j_state_lock); + + /* Don't extend a locked-down transaction! */ + if (handle->h_transaction->t_state != T_RUNNING) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction not running\n", handle, nblocks); + goto error_out; + } + + spin_lock(&transaction->t_handle_lock); + wanted = transaction->t_outstanding_credits + nblocks; + + if (wanted > journal->j_max_transaction_buffers) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction too large\n", handle, nblocks); + goto unlock; + } + + if (wanted > __log_space_left(journal)) { + jbd_debug(3, "denied handle %p %d blocks: " + "insufficient log space\n", handle, nblocks); + goto unlock; + } + + handle->h_buffer_credits += nblocks; + transaction->t_outstanding_credits += nblocks; + result = 0; + + jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); +unlock: + spin_unlock(&transaction->t_handle_lock); +error_out: + spin_unlock(&journal->j_state_lock); +out: + return result; +} + + +/** + * int journal_restart() - restart a handle. + * @handle: handle to restart + * @nblocks: nr credits requested + * + * Restart a handle for a multi-transaction filesystem + * operation. + * + * If the journal_extend() call above fails to grant new buffer credits + * to a running handle, a call to journal_restart will commit the + * handle's transaction so far and reattach the handle to a new + * transaction capable of guaranteeing the requested number of + * credits. + */ + +int journal_restart(handle_t *handle, int nblocks) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int ret; + + /* If we've had an abort of any type, don't even think about + * actually doing the restart!
*/ + if (is_handle_aborted(handle)) + return 0; + + /* + * First unlink the handle from its current transaction, and start the + * commit on that. + */ + J_ASSERT(transaction->t_updates > 0); + J_ASSERT(journal_current_handle() == handle); + + spin_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + transaction->t_outstanding_credits -= handle->h_buffer_credits; + transaction->t_updates--; + + if (!transaction->t_updates) + wake_up(&journal->j_wait_updates); + spin_unlock(&transaction->t_handle_lock); + + jbd_debug(2, "restarting handle %p\n", handle); + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + + lock_map_release(&handle->h_lockdep_map); + handle->h_buffer_credits = nblocks; + ret = start_this_handle(journal, handle); + return ret; +} + + +/** + * void journal_lock_updates () - establish a transaction barrier. + * @journal: Journal to establish a barrier on. + * + * This locks out any further updates from being started, and blocks + * until all existing updates have completed, returning only once the + * journal is in a quiescent state with no updates running. + * + * The journal lock should not be held on entry. + */ +void journal_lock_updates(journal_t *journal) +{ + DEFINE_WAIT(wait); + + spin_lock(&journal->j_state_lock); + ++journal->j_barrier_count; + + /* Wait until there are no running updates */ + while (1) { + transaction_t *transaction = journal->j_running_transaction; + + if (!transaction) + break; + + spin_lock(&transaction->t_handle_lock); + if (!transaction->t_updates) { + spin_unlock(&transaction->t_handle_lock); + break; + } + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_updates, &wait); + spin_lock(&journal->j_state_lock); + } + spin_unlock(&journal->j_state_lock); + + /* + * We have now established a barrier against other normal updates, but + * we also need to barrier against other journal_lock_updates() calls + * to make sure that we serialise special journal-locked operations + * too. + */ + mutex_lock(&journal->j_barrier); +} + +/** + * void journal_unlock_updates (journal_t* journal) - release barrier + * @journal: Journal to release the barrier on. + * + * Release a transaction barrier obtained with journal_lock_updates(). + * + * Should be called without the journal lock held. + */ +void journal_unlock_updates (journal_t *journal) +{ + J_ASSERT(journal->j_barrier_count != 0); + + mutex_unlock(&journal->j_barrier); + spin_lock(&journal->j_state_lock); + --journal->j_barrier_count; + spin_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_transaction_locked); +} + +static void warn_dirty_buffer(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); +} + +/* + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. If it is already part of a prior + * transaction which we are still committing to disk, then we need to + * make sure that we do not overwrite the old copy: we do copy-out to + * preserve the copy going to disk. We also account the buffer against + * the handle's metadata buffer credits (unless the buffer is already + * part of the transaction, that is). 
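This is the common helper behind journal_get_write_access() and journal_get_undo_access(); the latter passes force_copy so a frozen copy is made even if the buffer sits on the committing transaction's Forget list.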
+ * + */ +static int +do_get_write_access(handle_t *handle, struct journal_head *jh, + int force_copy) +{ + struct buffer_head *bh; + transaction_t *transaction; + journal_t *journal; + int error; + char *frozen_buffer = NULL; + int need_copy = 0; + + if (is_handle_aborted(handle)) + return -EROFS; + + transaction = handle->h_transaction; + journal = transaction->t_journal; + + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); +repeat: + bh = jh2bh(jh); + + /* @@@ Need to check for errors here at some point. */ + + lock_buffer(bh); + jbd_lock_bh_state(bh); + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ + + if (buffer_dirty(bh)) { + /* + * First question: is this buffer already part of the current + * transaction or the existing committing transaction? + */ + if (jh->b_transaction) { + J_ASSERT_JH(jh, + jh->b_transaction == transaction || + jh->b_transaction == + journal->j_committing_transaction); + if (jh->b_next_transaction) + J_ASSERT_JH(jh, jh->b_next_transaction == + transaction); + warn_dirty_buffer(bh); + } + /* + * In any case we need to clean the dirty flag and we must + * do it under the buffer lock to be sure we don't race + * with running write-out. + */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); + } + + unlock_buffer(bh); + + error = -EROFS; + if (is_handle_aborted(handle)) { + jbd_unlock_bh_state(bh); + goto out; + } + error = 0; + + /* + * The buffer is already part of this transaction if b_transaction or + * b_next_transaction points to it + */ + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) + goto done; + + /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* + * If there is already a copy-out version of this buffer, then we don't + * need to make another one + */ + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + jh->b_next_transaction = transaction; + goto done; + } + + /* Is there data here we need to preserve? */ + + if (jh->b_transaction && jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* There is one case we have to be very careful about. + * If the committing transaction is currently writing + * this buffer out to disk and has NOT made a copy-out, + * then we cannot modify the buffer contents at all + * right now. The essence of copy-out is that it is the + * extra copy, not the primary copy, which gets + * journaled. If the primary copy is already going to + * disk then we cannot do copy-out here. 
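In that case (the buffer is on the BJ_Shadow list) we sleep on the BH_Unshadow bit waitqueue below until commit has finished its IO, then retry from the top.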
*/ + + if (jh->b_jlist == BJ_Shadow) { + DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); + + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + /* commit wakes up all shadow buffers after IO */ + for ( ; ; ) { + prepare_to_wait(wqh, &wait.wait, + TASK_UNINTERRUPTIBLE); + if (jh->b_jlist != BJ_Shadow) + break; + schedule(); + } + finish_wait(wqh, &wait.wait); + goto repeat; + } + + /* Only do the copy if the currently-owning transaction + * still needs it. If it is on the Forget list, the + * committing transaction is past that stage. The + * buffer had better remain locked during the kmalloc, + * but that should be true --- we hold the journal lock + * still and the buffer is already on the BUF_JOURNAL + * list so won't be flushed. + * + * Subtle point, though: if this is a get_undo_access, + * then we will be relying on the frozen_data to contain + * the new value of the committed_data record after the + * transaction, so we HAVE to force the frozen_data copy + * in that case. */ + + if (jh->b_jlist != BJ_Forget || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + jbd_unlock_bh_state(bh); + frozen_buffer = + jbd_alloc(jh2bh(jh)->b_size, + GFP_NOFS); + if (!frozen_buffer) { + printk(KERN_EMERG + "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + jbd_lock_bh_state(bh); + goto done; + } + goto repeat; + } + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + need_copy = 1; + } + jh->b_next_transaction = transaction; + } + + + /* + * Finally, if the buffer is not journaled right now, we need to make + * sure it doesn't get written to disk before the caller actually + * commits the new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + jh->b_transaction = transaction; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + } + +done: + if (need_copy) { + struct page *page; + int offset; + char *source; + + J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), + "Possible IO failure.\n"); + page = jh2bh(jh)->b_page; + offset = offset_in_page(jh2bh(jh)->b_data); + source = kmap_atomic(page, KM_USER0); + memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); + kunmap_atomic(source, KM_USER0); + } + jbd_unlock_bh_state(bh); + + /* + * If we are about to journal a buffer, then any revoke pending on it is + * no longer valid + */ + journal_cancel_revoke(handle, jh); + +out: + if (unlikely(frozen_buffer)) /* It's usually NULL */ + jbd_free(frozen_buffer, bh->b_size); + + JBUFFER_TRACE(jh, "exit"); + return error; +} + +/** + * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * @handle: transaction to add buffer modifications to + * @bh: bh to be used for metadata writes + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + +int journal_get_write_access(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh = journal_add_journal_head(bh); + int rc; + + /* We do not want to get caught playing with fields which the + * log thread also manipulates. 
Make sure that the buffer + * completes any outstanding IO before proceeding. */ + rc = do_get_write_access(handle, jh, 0); + journal_put_journal_head(jh); + return rc; +} + + +/* + * When the user wants to journal a newly created buffer_head + * (ie. getblk() returned a new buffer and we are going to populate it + * manually rather than reading off disk), then we need to keep the + * buffer_head locked until it has been completely filled with new + * data. In this case, we should be able to make the assertion that + * the bh is not already part of an existing transaction. + * + * The buffer should already be locked by the caller by this point. + * There is no lock ranking violation: it was a newly created, + * unlocked buffer beforehand. */ + +/** + * int journal_get_create_access () - notify intent to use newly created bh + * @handle: transaction to new buffer to + * @bh: new buffer. + * + * Call this if you create a new bh. + */ +int journal_get_create_access(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = journal_add_journal_head(bh); + int err; + + jbd_debug(5, "journal_head %p\n", jh); + err = -EROFS; + if (is_handle_aborted(handle)) + goto out; + err = 0; + + JBUFFER_TRACE(jh, "entry"); + /* + * The buffer may already belong to this transaction due to pre-zeroing + * in the filesystem's new_block code. It may also be on the previous, + * committing transaction's lists, but it HAS to be in Forget state in + * that case: the transaction must have deleted the buffer for it to be + * reused here. + */ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + J_ASSERT_JH(jh, (jh->b_transaction == transaction || + jh->b_transaction == NULL || + (jh->b_transaction == journal->j_committing_transaction && + jh->b_jlist == BJ_Forget))); + + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + + if (jh->b_transaction == NULL) { + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); + jh->b_transaction = transaction; + + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __journal_file_buffer(jh, transaction, BJ_Reserved); + } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "set next transaction"); + jh->b_next_transaction = transaction; + } + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + + /* + * akpm: I added this. ext3_alloc_branch can pick up new indirect + * blocks which contain freed but then revoked metadata. We need + * to cancel the revoke in case we end up freeing it yet again + * and the reallocating as data - this would cause a second revoke, + * which hits an assertion error. 
+ */ + JBUFFER_TRACE(jh, "cancelling revoke"); + journal_cancel_revoke(handle, jh); + journal_put_journal_head(jh); +out: + return err; +} + +/** + * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences + * @handle: transaction + * @bh: buffer to undo + * + * Sometimes there is a need to distinguish between metadata which has + * been committed to disk and that which has not. The ext3fs code uses + * this for freeing and allocating space, we have to make sure that we + * do not reuse freed space until the deallocation has been committed, + * since if we overwrote that space we would make the delete + * un-rewindable in case of a crash. + * + * To deal with that, journal_get_undo_access requests write access to a + * buffer for parts of non-rewindable operations such as delete + * operations on the bitmaps. The journaling code must keep a copy of + * the buffer's contents prior to the undo_access call until such time + * as we know that the buffer has definitely been committed to disk. + * + * We never need to know which transaction the committed data is part + * of, buffers touched here are guaranteed to be dirtied later and so + * will be committed to a new transaction in due course, at which point + * we can discard the old committed data pointer. + * + * Returns error number or 0 on success. + */ +int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) +{ + int err; + struct journal_head *jh = journal_add_journal_head(bh); + char *committed_data = NULL; + + JBUFFER_TRACE(jh, "entry"); + + /* + * Do this first --- it can drop the journal lock, so we want to + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ + err = do_get_write_access(handle, jh, 1); + if (err) + goto out; + +repeat: + if (!jh->b_committed_data) { + committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); + if (!committed_data) { + printk(KERN_EMERG "%s: No memory for committed data\n", + __func__); + err = -ENOMEM; + goto out; + } + } + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. */ + JBUFFER_TRACE(jh, "generate b_committed data"); + if (!committed_data) { + jbd_unlock_bh_state(bh); + goto repeat; + } + + jh->b_committed_data = committed_data; + committed_data = NULL; + memcpy(jh->b_committed_data, bh->b_data, bh->b_size); + } + jbd_unlock_bh_state(bh); +out: + journal_put_journal_head(jh); + if (unlikely(committed_data)) + jbd_free(committed_data, bh->b_size); + return err; +} + +/** + * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed + * @handle: transaction + * @bh: bufferhead to mark + * + * Description: + * Mark a buffer as containing dirty data which needs to be flushed before + * we can commit the current transaction. + * + * The buffer is placed on the transaction's data list and is marked as + * belonging to the transaction. + * + * Returns error number or 0 on success. + * + * journal_dirty_data() can be called via page_launder->ext3_writepage + * by kswapd. + */ +int journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + journal_t *journal = handle->h_transaction->t_journal; + int need_brelse = 0; + struct journal_head *jh; + int ret = 0; + + if (is_handle_aborted(handle)) + return ret; + + jh = journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + + /* + * The buffer could *already* be dirty. Writeout can start + * at any time. 
+ */ + jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); + + /* + * What if the buffer is already part of a running transaction? + * + * There are two cases: + * 1) It is part of the current running transaction. Refile it, + * just in case we have allocated it as metadata, deallocated + * it, then reallocated it as data. + * 2) It is part of the previous, still-committing transaction. + * If all we want to do is to guarantee that the buffer will be + * written to disk before this new transaction commits, then + * being sure that the *previous* transaction has this same + * property is sufficient for us! Just leave it on its old + * transaction. + * + * In case (2), the buffer must not already exist as metadata + * --- that would violate write ordering (a transaction is free + * to write its data at any point, even before the previous + * committing transaction has committed). The caller must + * never, ever allow this to happen: there's nothing we can do + * about it in this layer. + */ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + /* Now that we have bh_state locked, are we really still mapped? */ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); + goto no_journal; + } + + if (jh->b_transaction) { + JBUFFER_TRACE(jh, "has transaction"); + if (jh->b_transaction != handle->h_transaction) { + JBUFFER_TRACE(jh, "belongs to older transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* @@@ IS THIS TRUE ? */ + /* + * Not any more. Scenario: someone does a write() + * in data=journal mode. The buffer's transaction has + * moved into commit. Then someone does another + * write() to the file. We do the frozen data copyout + * and set b_next_transaction to point to j_running_t. + * And while we're in that state, someone does a + * writepage() in an attempt to pageout the same area + * of the file via a shared mapping. At present that + * calls journal_dirty_data(), and we get right here. + * It may be too late to journal the data. Simply + * falling through to the next test will suffice: the + * data will be dirty and wil be checkpointed. The + * ordering comments in the next comment block still + * apply. + */ + //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + + /* + * If we're journalling data, and this buffer was + * subject to a write(), it could be metadata, forget + * or shadow against the committing transaction. Now, + * someone has dirtied the same darn page via a mapping + * and it is being writepage()'d. + * We *could* just steal the page from commit, with some + * fancy locking there. Instead, we just skip it - + * don't tie the page's buffers to the new transaction + * at all. + * Implication: if we crash before the writepage() data + * is written into the filesystem, recovery will replay + * the write() data. + */ + if (jh->b_jlist != BJ_None && + jh->b_jlist != BJ_SyncData && + jh->b_jlist != BJ_Locked) { + JBUFFER_TRACE(jh, "Not stealing"); + goto no_journal; + } + + /* + * This buffer may be undergoing writeout in commit. We + * can't return from here and let the caller dirty it + * again because that can cause the write-out loop in + * commit to never terminate. + */ + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + need_brelse = 1; + sync_dirty_buffer(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + /* Since we dropped the lock... 
*/ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "buffer got unmapped"); + goto no_journal; + } + /* The buffer may become locked again at any + time if it is redirtied */ + } + + /* + * We cannot remove the buffer with io error from the + * committing transaction, because otherwise it would + * miss the error and the commit would not abort. + */ + if (unlikely(!buffer_uptodate(bh))) { + ret = -EIO; + goto no_journal; + } + + if (jh->b_transaction != NULL) { + JBUFFER_TRACE(jh, "unfile from commit"); + __journal_temp_unlink_buffer(jh); + /* It still points to the committing + * transaction; move it to this one so + * that the refile assert checks are + * happy. */ + jh->b_transaction = handle->h_transaction; + } + /* The buffer will be refiled below */ + + } + /* + * Special case --- the buffer might actually have been + * allocated and then immediately deallocated in the previous, + * committing transaction, so might still be left on that + * transaction's metadata lists. + */ + if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { + JBUFFER_TRACE(jh, "not on correct data list: unfile"); + J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); + __journal_temp_unlink_buffer(jh); + jh->b_transaction = handle->h_transaction; + JBUFFER_TRACE(jh, "file as data"); + __journal_file_buffer(jh, handle->h_transaction, + BJ_SyncData); + } + } else { + JBUFFER_TRACE(jh, "not on a transaction"); + __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); + } +no_journal: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + if (need_brelse) { + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + JBUFFER_TRACE(jh, "exit"); + journal_put_journal_head(jh); + return ret; +} + +/** + * int journal_dirty_metadata() - mark a buffer as containing dirty metadata + * @handle: transaction to add buffer to. + * @bh: buffer to mark + * + * Mark dirty metadata which needs to be journaled as part of the current + * transaction. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. + * + * Returns error number or 0 on success. + * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + */ +int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = bh2jh(bh); + + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + if (is_handle_aborted(handle)) + goto out; + + jbd_lock_bh_state(bh); + + if (jh->b_modified == 0) { + /* + * This buffer's got modified and becoming part + * of the transaction. This needs to be done + * once a transaction -bzzz + */ + jh->b_modified = 1; + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + } + + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. + * Nobody can take it off again because there is a handle open. + * I _think_ we're OK here with SMP barriers - a mistaken decision will + * result in this test being false, so we go in and take the locks. 
+ */ + if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { + JBUFFER_TRACE(jh, "fastpath"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_running_transaction); + goto out_unlock_bh; + } + + set_buffer_jbddirty(bh); + + /* + * Metadata already on the current transaction list doesn't + * need to be filed. Metadata on another transaction's list must + * be committing, and will be refiled once the commit completes: + * leave it alone for now. + */ + if (jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "already on other transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + /* And this case is illegal: we can't reuse another + * transaction's data buffer, ever. */ + goto out_unlock_bh; + } + + /* That test should have eliminated the following case: */ + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + + JBUFFER_TRACE(jh, "file as BJ_Metadata"); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); + spin_unlock(&journal->j_list_lock); +out_unlock_bh: + jbd_unlock_bh_state(bh); +out: + JBUFFER_TRACE(jh, "exit"); + return 0; +} + +/* + * journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * + */ +void +journal_release_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUFFER_TRACE(bh, "entry"); +} + +/** + * void journal_forget() - bforget() for potentially-journaled buffers. + * @handle: transaction handle + * @bh: bh to 'forget' + * + * We can only do the bforget if there are no commits pending against the + * buffer. If the buffer is dirty in the current running transaction we + * can safely unlink it. + * + * bh may not be a journalled buffer at all - it may be a non-JBD + * buffer which came off the hashtable. Check for this. + * + * Decrements bh->b_count by one. + * + * Allow this call even if the handle has aborted --- it may be part of + * the caller's cleanup after an abort. + */ +int journal_forget (handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh; + int drop_reserve = 0; + int err = 0; + int was_modified = 0; + + BUFFER_TRACE(bh, "entry"); + + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + if (!buffer_jbd(bh)) + goto not_jbd; + jh = bh2jh(bh); + + /* Critical error: attempting to delete a bitmap buffer, maybe? + * Don't do any jbd operations, and return an error. */ + if (!J_EXPECT_JH(jh, !jh->b_committed_data, + "inconsistent data on disk")) { + err = -EIO; + goto not_jbd; + } + + /* keep track of wether or not this transaction modified us */ + was_modified = jh->b_modified; + + /* + * The buffer's going from the transaction, we must drop + * all references -bzzz + */ + jh->b_modified = 0; + + if (jh->b_transaction == handle->h_transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + + /* If we are forgetting a buffer which is already part + * of this transaction, then we can just drop it from + * the transaction immediately. */ + clear_buffer_dirty(bh); + clear_buffer_jbddirty(bh); + + JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); + + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; + + /* + * We are no longer going to journal this buffer. 
+ * However, the commit of this transaction is still + * important to the buffer: the delete that we are now + * processing might obsolete an old log entry, so by + * committing, we can satisfy the buffer's checkpoint. + * + * So, if we have a checkpoint on the buffer, we should + * now refile the buffer on our BJ_Forget list so that + * we know to remove the checkpoint after we commit. + */ + + if (jh->b_cp_transaction) { + __journal_temp_unlink_buffer(jh); + __journal_file_buffer(jh, transaction, BJ_Forget); + } else { + __journal_unfile_buffer(jh); + journal_remove_journal_head(bh); + __brelse(bh); + if (!buffer_jbd(bh)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; + } + } + } else if (jh->b_transaction) { + J_ASSERT_JH(jh, (jh->b_transaction == + journal->j_committing_transaction)); + /* However, if the buffer is still owned by a prior + * (committing) transaction, we can't drop it yet... */ + JBUFFER_TRACE(jh, "belongs to older transaction"); + /* ... but we CAN drop it from the new transaction if we + * have also modified it since the original commit. */ + + if (jh->b_next_transaction) { + J_ASSERT(jh->b_next_transaction == transaction); + jh->b_next_transaction = NULL; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; + } + } + +not_jbd: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); +drop: + if (drop_reserve) { + /* no need to reserve log space for this block -bzzz */ + handle->h_buffer_credits++; + } + return err; +} + +/** + * int journal_stop() - complete a transaction + * @handle: tranaction to complete. + * + * All done for a particular handle. + * + * There is not much action needed here. We just return any remaining + * buffer credits to the transaction and remove the handle. The only + * complication is that we need to start a commit operation if the + * filesystem is marked for synchronous update. + * + * journal_stop itself will not usually return an error, but it may + * do so in unusual circumstances. In particular, expect it to + * return -EIO if a journal_abort has been executed since the + * transaction began. + */ +int journal_stop(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int err; + pid_t pid; + + J_ASSERT(journal_current_handle() == handle); + + if (is_handle_aborted(handle)) + err = -EIO; + else { + J_ASSERT(transaction->t_updates > 0); + err = 0; + } + + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } + + jbd_debug(4, "Handle %p going down\n", handle); + + /* + * Implement synchronous transaction batching. If the handle + * was synchronous, don't force a commit immediately. Let's + * yield and let another thread piggyback onto this transaction. + * Keep doing that while new threads continue to arrive. + * It doesn't cost much - we're about to run a commit and sleep + * on IO anyway. Speeds up many-threaded, many-dir operations + * by 30x or more... + * + * We try and optimize the sleep time against what the underlying disk + * can do, instead of having a static sleep time. This is useful for + * the case where our storage is so fast that it is more optimal to go + * ahead and force a flush and wait for the transaction to be committed + * than it is to wait for an arbitrary amount of time for new writers to + * join the transaction. 
We achieve this by measuring how long it takes + * to commit a transaction, and compare it with how long this + * transaction has been running, and if run time < commit time then we + * sleep for the delta and commit. This greatly helps super fast disks + * that would see slowdowns as more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one to + * perform a synchronous write. We do this to detect the case where a + * single process is doing a stream of sync writes. No point in waiting + * for joiners in that case. + */ + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + + journal->j_last_sync_writer = pid; + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + } + + if (handle->h_sync) + transaction->t_synchronous_commit = 1; + current->journal_info = NULL; + spin_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + transaction->t_outstanding_credits -= handle->h_buffer_credits; + transaction->t_updates--; + if (!transaction->t_updates) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current + * transaction is occupying too much of the log, or if the + * transaction is too old now. + */ + if (handle->h_sync || + transaction->t_outstanding_credits > + journal->j_max_transaction_buffers || + time_after_eq(jiffies, transaction->t_expires)) { + /* Do this even for aborted journals: an abort still + * completes the commit thread, it just doesn't write + * anything to disk. */ + tid_t tid = transaction->t_tid; + + spin_unlock(&transaction->t_handle_lock); + jbd_debug(2, "transaction too old, requesting commit for " + "handle %p\n", handle); + /* This is non-blocking */ + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + + /* + * Special case: JFS_SYNC synchronous updates require us + * to wait for the commit to complete. + */ + if (handle->h_sync && !(current->flags & PF_MEMALLOC)) + err = log_wait_commit(journal, tid); + } else { + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); + } + + lock_map_release(&handle->h_lockdep_map); + + jbd_free_handle(handle); + return err; +} + +/** + * int journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * For synchronous operations: force any uncommitted transactions + * to disk. May seem kludgy, but it reuses all the handle batching + * code in a very simple manner. + */ +int journal_force_commit(journal_t *journal) +{ + handle_t *handle; + int ret; + + handle = journal_start(journal, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + } else { + handle->h_sync = 1; + ret = journal_stop(handle); + } + return ret; +} + +/* + * + * List management code snippets: various functions for manipulating the + * transaction buffer lists. 
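The double-underscore variants expect the caller to hold j_list_lock and the buffer's jbd state lock, as noted on each helper.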
+ * + */ + +/* + * Append a buffer to a transaction list, given the transaction's list head + * pointer. + * + * j_list_lock is held. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_add_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (!*list) { + jh->b_tnext = jh->b_tprev = jh; + *list = jh; + } else { + /* Insert at the tail of the list to preserve order */ + struct journal_head *first = *list, *last = first->b_tprev; + jh->b_tprev = last; + jh->b_tnext = first; + last->b_tnext = first->b_tprev = jh; + } +} + +/* + * Remove a buffer from a transaction list, given the transaction's list + * head pointer. + * + * Called with j_list_lock held, and the journal may not be locked. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_del_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (*list == jh) { + *list = jh->b_tnext; + if (*list == jh) + *list = NULL; + } + jh->b_tprev->b_tnext = jh->b_tnext; + jh->b_tnext->b_tprev = jh->b_tprev; +} + +/* + * Remove a buffer from the appropriate transaction list. + * + * Note that this function can *change* the value of + * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, + * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller + * is holding onto a copy of one of thee pointers, it could go bad. + * Generally the caller needs to re-read the pointer from the transaction_t. + * + * Called under j_list_lock. The journal may not be locked. + */ +static void __journal_temp_unlink_buffer(struct journal_head *jh) +{ + struct journal_head **list = NULL; + transaction_t *transaction; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + transaction = jh->b_transaction; + if (transaction) + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + if (jh->b_jlist != BJ_None) + J_ASSERT_JH(jh, transaction != NULL); + + switch (jh->b_jlist) { + case BJ_None: + return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; + case BJ_Metadata: + transaction->t_nr_buffers--; + J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + case BJ_Locked: + list = &transaction->t_locked_list; + break; + } + + __blist_del_buffer(list, jh); + jh->b_jlist = BJ_None; + if (test_clear_buffer_jbddirty(bh)) + mark_buffer_dirty(bh); /* Expose it to the VM */ +} + +void __journal_unfile_buffer(struct journal_head *jh) +{ + __journal_temp_unlink_buffer(jh); + jh->b_transaction = NULL; +} + +void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) +{ + jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&journal->j_list_lock); + __journal_unfile_buffer(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(jh2bh(jh)); +} + +/* + * Called from journal_try_to_free_buffers(). 
+ * + * Called under jbd_lock_bh_state(bh) + */ +static void +__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) +{ + struct journal_head *jh; + + jh = bh2jh(bh); + + if (buffer_locked(bh) || buffer_dirty(bh)) + goto out; + + if (jh->b_next_transaction != NULL) + goto out; + + spin_lock(&journal->j_list_lock); + if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { + if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { + /* A written-back ordered data buffer */ + JBUFFER_TRACE(jh, "release data"); + __journal_unfile_buffer(jh); + journal_remove_journal_head(bh); + __brelse(bh); + } + } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + /* written-back checkpointed metadata buffer */ + if (jh->b_jlist == BJ_None) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __journal_remove_checkpoint(jh); + journal_remove_journal_head(bh); + __brelse(bh); + } + } + spin_unlock(&journal->j_list_lock); +out: + return; +} + +/** + * int journal_try_to_free_buffers() - try to free page buffers. + * @journal: journal for operation + * @page: to try and free + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. + * + * + * For all the buffers on this page, + * if they are fully written out ordered data, move them onto BUF_CLEAN + * so try_to_free_buffers() can reap them. + * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this if the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + * + * This complicates JBD locking somewhat. We aren't protected by the + * BKL here. We wish to remove the buffer from its committing or + * running transaction's ->t_datalist via __journal_unfile_buffer. + * + * This may *change* the value of transaction_t->t_datalist, so anyone + * who looks at t_datalist needs to lock against this function. + * + * Even worse, someone may be doing a journal_dirty_data on this + * buffer. So we need to lock against that. journal_dirty_data() + * will come out of the lock with the buffer dirty, which makes it + * ineligible for release here. + * + * Who else is affected by this? hmm... Really the only contender + * is do_get_write_access() - it could be looking at the buffer while + * journal_try_to_free_buffer() is changing its state. But that + * cannot happen because we never reallocate freed data as metadata + * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success + */ +int journal_try_to_free_buffers(journal_t *journal, + struct page *page, gfp_t gfp_mask) +{ + struct buffer_head *head; + struct buffer_head *bh; + int ret = 0; + + J_ASSERT(PageLocked(page)); + + head = page_buffers(page); + bh = head; + do { + struct journal_head *jh; + + /* + * We take our own ref against the journal_head here to avoid + * having to add tons of locking around each instance of + * journal_remove_journal_head() and journal_put_journal_head(). + */ + jh = journal_grab_journal_head(bh); + if (!jh) + continue; + + jbd_lock_bh_state(bh); + __journal_try_to_free_buffer(journal, bh); + journal_put_journal_head(jh); + jbd_unlock_bh_state(bh); + if (buffer_jbd(bh)) + goto busy; + } while ((bh = bh->b_this_page) != head); + + ret = try_to_free_buffers(page); + +busy: + return ret; +} + +/* + * This buffer is no longer needed. 
If it is on an older transaction's + * checkpoint list we need to record it on this transaction's forget list + * to pin this buffer (and hence its checkpointing transaction) down until + * this transaction commits. If the buffer isn't on a checkpoint list, we + * release it. + * Returns non-zero if JBD no longer has an interest in the buffer. + * + * Called under j_list_lock. + * + * Called under jbd_lock_bh_state(bh). + */ +static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) +{ + int may_free = 1; + struct buffer_head *bh = jh2bh(jh); + + __journal_unfile_buffer(jh); + + if (jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "on running+cp transaction"); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); + __journal_file_buffer(jh, transaction, BJ_Forget); + may_free = 0; + } else { + JBUFFER_TRACE(jh, "on running transaction"); + journal_remove_journal_head(bh); + __brelse(bh); + } + return may_free; +} + +/* + * journal_invalidatepage + * + * This code is tricky. It has a number of cases to deal with. + * + * There are two invariants which this code relies on: + * + * i_size must be updated on disk before we start calling invalidatepage on the + * data. + * + * This is done in ext3 by defining an ext3_setattr method which + * updates i_size before truncate gets going. By maintaining this + * invariant, we can be sure that it is safe to throw away any buffers + * attached to the current transaction: once the transaction commits, + * we know that the data will not be needed. + * + * Note however that we can *not* throw away data belonging to the + * previous, committing transaction! + * + * Any disk blocks which *are* part of the previous, committing + * transaction (and which therefore cannot be discarded immediately) are + * not going to be reused in the new running transaction + * + * The bitmap committed_data images guarantee this: any block which is + * allocated in one transaction and removed in the next will be marked + * as in-use in the committed_data bitmap, so cannot be reused until + * the next transaction to delete the block commits. This means that + * leaving committing buffers dirty is quite safe: the disk blocks + * cannot be reallocated to a different file and so buffer aliasing is + * not possible. + * + * + * The above applies mainly to ordered data mode. In writeback mode we + * don't make guarantees about the order in which data hits disk --- in + * particular we don't guarantee that new dirty data is flushed before + * transaction commit --- so it is always safe just to discard data + * immediately in that mode. --sct + */ + +/* + * The journal_unmap_buffer helper function returns zero if the buffer + * concerned remains pinned as an anonymous buffer belonging to an older + * transaction. + * + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. + */ +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +{ + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; + int ret; + + BUFFER_TRACE(bh, "entry"); + + /* + * It is safe to proceed here without the j_list_lock because the + * buffers cannot be stolen by try_to_free_buffers as long as we are + * holding the page lock. 
--sct + */ + + if (!buffer_jbd(bh)) + goto zap_buffer_unlocked; + + spin_lock(&journal->j_state_lock); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + jh = journal_grab_journal_head(bh); + if (!jh) + goto zap_buffer_no_jh; + + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ + transaction = jh->b_transaction; + if (transaction == NULL) { + /* First case: not on any transaction. If it + * has no checkpoint link, then we can zap it: + * it's a writeback-mode buffer so we don't care + * if it hits disk safely. */ + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "not on any transaction: zap"); + goto zap_buffer; + } + + if (!buffer_dirty(bh)) { + /* bdflush has written it. We can drop it now */ + goto zap_buffer; + } + + /* OK, it must be in the journal but still not + * written fully to disk: it's metadata or + * journaled data... */ + + if (journal->j_running_transaction) { + /* ... and once the current transaction has + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); + ret = __dispose_buffer(jh, + journal->j_running_transaction); + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + return ret; + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have + * passed into commit. We must attach this buffer to + * the committing transaction, if it exists. */ + if (journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "give to committing trans"); + ret = __dispose_buffer(jh, + journal->j_committing_transaction); + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + return ret; + } else { + /* The orphan record's transaction has + * committed. We can cleanse this buffer */ + clear_buffer_jbddirty(bh); + goto zap_buffer; + } + } + } else if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "on committing transaction"); + if (jh->b_jlist == BJ_Locked) { + /* + * The buffer is on the committing transaction's locked + * list. We have the buffer locked, so I/O has + * completed. So we can nail the buffer now. + */ + may_free = __dispose_buffer(jh, transaction); + goto zap_buffer; + } + /* + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. 
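+ * (The buffer therefore stays pinned until that commit finishes; and if
+ * a running transaction exists while the buffer is still jbddirty, we
+ * also point b_next_transaction at it so that the post-commit refile
+ * drops the buffer onto that transaction's BJ_Forget list.)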
+ */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + return 0; + } else { + /* Good, the buffer belongs to the running transaction. + * We are writing our own transaction's data, not any + * previous one's, so it is safe to throw it away + * (remember that we expect the filesystem to have set + * i_size already for this truncate so recovery will not + * expose the disk blocks we are discarding here.) */ + J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + JBUFFER_TRACE(jh, "on running transaction"); + may_free = __dispose_buffer(jh, transaction); + } + +zap_buffer: + journal_put_journal_head(jh); +zap_buffer_no_jh: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); +zap_buffer_unlocked: + clear_buffer_dirty(bh); + J_ASSERT_BH(bh, !buffer_jbddirty(bh)); + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + bh->b_bdev = NULL; + return may_free; +} + +/** + * void journal_invalidatepage() - invalidate a journal page + * @journal: journal to use for flush + * @page: page to flush + * @offset: length of page to invalidate. + * + * Reap page buffers containing data after offset in page. + */ +void journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + int may_free = 1; + + if (!PageLocked(page)) + BUG(); + if (!page_has_buffers(page)) + return; + + /* We will potentially be playing with lists other than just the + * data lists (especially for journaled data mode), so be + * cautious in our locking. */ + + head = bh = page_buffers(page); + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (offset <= curr_off) { + /* This block is wholly outside the truncation point */ + lock_buffer(bh); + may_free &= journal_unmap_buffer(journal, bh); + unlock_buffer(bh); + } + curr_off = next_off; + bh = next; + + } while (bh != head); + + if (!offset) { + if (may_free && try_to_free_buffers(page)) + J_ASSERT(!page_has_buffers(page)); + } +} + +/* + * File a buffer on the given transaction list. + */ +void __journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + struct journal_head **list = NULL; + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == NULL); + + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. 
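+ * (Graceful handling here means warning via warn_dirty_buffer() and
+ * then moving the dirty bit into the jbddirty bit (see was_dirty
+ * below), so the update is still tracked by the journal rather than
+ * being silently lost.)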
+ */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); + if (test_clear_buffer_dirty(bh) || + test_clear_buffer_jbddirty(bh)) + was_dirty = 1; + } + + if (jh->b_transaction) + __journal_temp_unlink_buffer(jh); + jh->b_transaction = transaction; + + switch (jlist) { + case BJ_None: + J_ASSERT_JH(jh, !jh->b_committed_data); + J_ASSERT_JH(jh, !jh->b_frozen_data); + return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; + case BJ_Metadata: + transaction->t_nr_buffers++; + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + case BJ_Locked: + list = &transaction->t_locked_list; + break; + } + + __blist_add_buffer(list, jh); + jh->b_jlist = jlist; + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +void journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&transaction->t_journal->j_list_lock); + __journal_file_buffer(jh, transaction, jlist); + spin_unlock(&transaction->t_journal->j_list_lock); + jbd_unlock_bh_state(jh2bh(jh)); +} + +/* + * Remove a buffer from its current buffer list in preparation for + * dropping it from its current transaction entirely. If the buffer has + * already started to be used by a subsequent transaction, refile the + * buffer on that transaction's metadata list. + * + * Called under journal->j_list_lock + * + * Called under jbd_lock_bh_state(jh2bh(jh)) + */ +void __journal_refile_buffer(struct journal_head *jh) +{ + int was_dirty, jlist; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + + /* If the buffer is now unused, just drop it. */ + if (jh->b_next_transaction == NULL) { + __journal_unfile_buffer(jh); + return; + } + + /* + * It has been modified by a later transaction: add it to the new + * transaction's metadata list. + */ + + was_dirty = test_clear_buffer_jbddirty(bh); + __journal_temp_unlink_buffer(jh); + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +/* + * For the unlocked version of this call, also make sure that any + * hanging journal_head is cleaned up if necessary. + * + * __journal_refile_buffer is usually called as part of a single locked + * operation on a buffer_head, in which the caller is probably going to + * be hooking the journal_head onto other lists. In that case it is up + * to the caller to remove the journal_head if necessary. For the + * unlocked journal_refile_buffer call, the caller isn't going to be + * doing anything else to the buffer so we need to do the cleanup + * ourselves to avoid a jh leak. + * + * *** The journal_head may be freed by this call! 
*** + */ +void journal_refile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + + spin_unlock(&journal->j_list_lock); + __brelse(bh); +} diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig new file mode 100644 index 00000000..f32f346f --- /dev/null +++ b/fs/jbd2/Kconfig @@ -0,0 +1,33 @@ +config JBD2 + tristate + select CRC32 + help + This is a generic journaling layer for block devices that support + both 32-bit and 64-bit block numbers. It is currently used by + the ext4 and OCFS2 filesystems, but it could also be used to add + journal support to other file systems or block devices such + as RAID or LVM. + + If you are using ext4 or OCFS2, you need to say Y here. + If you are not using ext4 or OCFS2 then you will + probably want to say N. + + To compile this device as a module, choose M here. The module will be + called jbd2. If you are compiling ext4 or OCFS2 into the kernel, + you cannot compile this code as a module. + +config JBD2_DEBUG + bool "JBD2 (ext4) debugging support" + depends on JBD2 && DEBUG_FS + help + If you are using the ext4 journaled file system (or + potentially any other filesystem/device using JBD2), this option + allows you to enable debugging output while the system is running, + in order to help track down any problems you are having. + By default, the debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a + number between 1 and 5. The higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile new file mode 100644 index 00000000..802a3413 --- /dev/null +++ b/fs/jbd2/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux journaling routines. +# + +obj-$(CONFIG_JBD2) += jbd2.o + +jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c new file mode 100644 index 00000000..2c62c5aa --- /dev/null +++ b/fs/jbd2/checkpoint.c @@ -0,0 +1,798 @@ +/* + * linux/fs/jbd2/checkpoint.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Unlink a buffer from a transaction checkpoint list. + * + * Called with j_list_lock held. 
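+ * (The checkpoint list is circular; if removing jh leaves the head
+ * pointing back at jh then jh was the only element and the head is
+ * cleared to NULL.)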
+ */ +static inline void __buffer_unlink_first(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) { + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; +} + +/* + * Try to release a checkpointed buffer from its transaction. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * + * Requires j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __try_to_free_cp_buf(struct journal_head *jh) +{ + int ret = 0; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); + JBUFFER_TRACE(jh, "remove from checkpoint list"); + ret = __jbd2_journal_remove_checkpoint(jh) + 1; + jbd_unlock_bh_state(bh); + BUFFER_TRACE(bh, "release"); + __brelse(bh); + } else { + jbd_unlock_bh_state(bh); + } + return ret; +} + +/* + * __jbd2_log_wait_for_space: wait until there is space in the journal. + * + * Called under j-state_lock *only*. It will be unlocked if we have to wait + * for a checkpoint to free up some space in the log. + */ +void __jbd2_log_wait_for_space(journal_t *journal) +{ + int nblocks, space_left; + /* assert_spin_locked(&journal->j_state_lock); */ + + nblocks = jbd_space_needed(journal); + while (__jbd2_log_space_left(journal) < nblocks) { + if (journal->j_flags & JBD2_ABORT) + return; + write_unlock(&journal->j_state_lock); + mutex_lock(&journal->j_checkpoint_mutex); + + /* + * Test again, another process may have checkpointed while we + * were waiting for the checkpoint lock. If there are no + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. 
+ */ + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + nblocks = jbd_space_needed(journal); + space_left = __jbd2_log_space_left(journal); + if (space_left < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + if (chkpt) { + jbd2_log_do_checkpoint(journal); + } else if (jbd2_cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + jbd2_log_wait_commit(journal, tid); + } else { + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space in %s\n", __func__, + journal->j_devname); + WARN_ON(1); + jbd2_journal_abort(journal, 0); + } + write_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); + } + mutex_unlock(&journal->j_checkpoint_mutex); + } +} + +/* + * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. + * The caller must restart a list walk. Wait for someone else to run + * jbd_unlock_bh_state(). + */ +static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) + __releases(journal->j_list_lock) +{ + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_lock_bh_state(bh); + jbd_unlock_bh_state(bh); + put_bh(bh); +} + +/* + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. + * + * Return 0 on success, and return <0 if some buffers have failed + * to be written out. + * + * Called with j_list_lock held. + */ +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) +{ + struct journal_head *jh; + struct buffer_head *bh; + tid_t this_tid; + int released = 0; + int ret = 0; + + this_tid = transaction->t_tid; +restart: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return ret; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + + /* + * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list + */ + released = __jbd2_journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + __brelse(bh); + } + + return ret; +} + +static void +__flush_batch(journal_t *journal, int *batch_count) +{ + int i; + + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); + + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = journal->j_chkpt_bhs[i]; + clear_buffer_jwrite(bh); + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + *batch_count = 0; +} + +/* + * Try to flush one buffer from the checkpoint list to disk. 
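+ * The buffer is either waited on (if it is locked), pushed out via a
+ * commit (if it still belongs to a transaction), dropped from the
+ * checkpoint list (if it is already clean), or queued in
+ * journal->j_chkpt_bhs[] for a batched write-out.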
+ * + * Return 1 if something happened which requires us to abort the current + * scan of the checkpoint list. Return <0 if the buffer has failed to + * be written out. + * + * Called with j_list_lock held and drops it if 1 is returned + * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it + */ +static int __process_buffer(journal_t *journal, struct journal_head *jh, + int *batch_count, transaction_t *transaction) +{ + struct buffer_head *bh = jh2bh(jh); + int ret = 0; + + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + ret = 1; + } else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + transaction->t_chp_stats.cs_forced_to_close++; + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + if (unlikely(journal->j_flags & JBD2_UNMOUNT)) + /* + * The journal thread is dead; so starting and + * waiting for a commit to finish will cause + * us to wait for a _very_ long time. + */ + printk(KERN_ERR "JBD2: %s: " + "Waiting for Godot: block %llu\n", + journal->j_devname, + (unsigned long long) bh->b_blocknr); + jbd2_log_start_commit(journal, tid); + jbd2_log_wait_commit(journal, tid); + ret = 1; + } else if (!buffer_dirty(bh)) { + ret = 1; + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + get_bh(bh); + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __jbd2_journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); + } else { + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal lock. + * We cannot afford to let the transaction logic start + * messing around with this buffer before we write it to + * disk, as that would break recoverability. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + set_buffer_jwrite(bh); + journal->j_chkpt_bhs[*batch_count] = bh; + __buffer_relink_io(jh); + jbd_unlock_bh_state(bh); + transaction->t_chp_stats.cs_written++; + (*batch_count)++; + if (*batch_count == JBD2_NR_BATCH) { + spin_unlock(&journal->j_list_lock); + __flush_batch(journal, batch_count); + ret = 1; + } + } + return ret; +} + +/* + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. + * + * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. + */ +int jbd2_log_do_checkpoint(journal_t *journal) +{ + transaction_t *transaction; + tid_t this_tid; + int result; + + jbd_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = jbd2_cleanup_journal_tail(journal); + trace_jbd2_checkpoint(journal, result); + jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; + + /* + * OK, we need to start writing disk blocks. Take one transaction + * and write it. 
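+ * Dirty checkpoint buffers are collected into journal->j_chkpt_bhs[]
+ * by __process_buffer() and submitted JBD2_NR_BATCH at a time through
+ * __flush_batch(), so the writes go out in large chunks.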
+ */ + result = 0; + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + if (transaction->t_chp_stats.cs_chp_time == 0) + transaction->t_chp_stats.cs_chp_time = jiffies; + this_tid = transaction->t_tid; +restart: + /* + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). + */ + if (journal->j_checkpoint_transactions == transaction && + transaction->t_tid == this_tid) { + int batch_count = 0; + struct journal_head *jh; + int retry = 0, err; + + while (!retry && transaction->t_checkpoint_list) { + struct buffer_head *bh; + + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + retry = 1; + break; + } + retry = __process_buffer(journal, jh, &batch_count, + transaction); + if (retry < 0 && !result) + result = retry; + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { + spin_unlock(&journal->j_list_lock); + retry = 1; + break; + } + } + + if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } + __flush_batch(journal, &batch_count); + } + + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } + /* + * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one + */ + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; + } +out: + spin_unlock(&journal->j_list_lock); + if (result < 0) + jbd2_journal_abort(journal, result); + else + result = jbd2_cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; +} + +/* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * Called with the journal lock held. + * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts. Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. + */ + +int jbd2_cleanup_journal_tail(journal_t *journal) +{ + transaction_t * transaction; + tid_t first_tid; + unsigned long blocknr, freed; + + if (is_journal_aborted(journal)) + return 1; + + /* OK, work out the oldest transaction remaining in the log, and + * the log block it starts at. + * + * If the log is now empty, we need to work out which is the + * next transaction ID we will write, and where it will + * start. 
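+ * (Worked example of the wrap-around arithmetic further down, with
+ * made-up numbers: if j_first == 1, j_last == 1024, the old j_tail is
+ * 1000 and the new tail block is 8, then freed = (8 - 1000) + 1024 - 1
+ * = 31 blocks.)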
*/ + + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = journal->j_head; + } else { + first_tid = journal->j_transaction_sequence; + blocknr = journal->j_head; + } + spin_unlock(&journal->j_list_lock); + J_ASSERT(blocknr != 0); + + /* If the oldest pinned transaction is at the tail of the log + already then there's not much we can do right now. */ + if (journal->j_tail_sequence == first_tid) { + write_unlock(&journal->j_state_lock); + return 1; + } + + /* OK, update the superblock to recover the freed space. + * Physical blocks come first: have we wrapped beyond the end of + * the log? */ + freed = blocknr - journal->j_tail; + if (blocknr < journal->j_tail) + freed = freed + journal->j_last - journal->j_first; + + trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, first_tid, blocknr, freed); + + journal->j_free += freed; + journal->j_tail_sequence = first_tid; + journal->j_tail = blocknr; + write_unlock(&journal->j_state_lock); + + /* + * If there is an external journal, we need to make sure that + * any data blocks that were recently written out --- perhaps + * by jbd2_log_do_checkpoint() --- are flushed out before we + * drop the transactions from the external journal. It's + * unlikely this will be necessary, especially with a + * appropriately sized journal, but we need this to guarantee + * correctness. Fortunately jbd2_cleanup_journal_tail() + * doesn't get called all that often. + */ + if ((journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!(journal->j_flags & JBD2_ABORT)) + jbd2_journal_update_superblock(journal, 1); + return 0; +} + + +/* Checkpoint list management */ + +/* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and + * release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of bufers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + +/* + * journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. 
+ * Returns number of buffers reaped (for debug) + */ + +int __jbd2_journal_clean_checkpoint_list(journal_t *journal) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + int ret = 0; + int released; + + transaction = journal->j_checkpoint_transactions; + if (!transaction) + goto out; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; + } while (transaction != last_transaction); +out: + return ret; +} + +/* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). + * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk. To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * lists until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. + * + * This function is called with j_list_lock held. + * This function is called with jbd_lock_bh_state(jh2bh(jh)) + */ + +int __jbd2_journal_remove_checkpoint(struct journal_head *jh) +{ + struct transaction_chp_stats_s *stats; + transaction_t *transaction; + journal_t *journal; + int ret = 0; + + JBUFFER_TRACE(jh, "entry"); + + if ((transaction = jh->b_cp_transaction) == NULL) { + JBUFFER_TRACE(jh, "not on transaction"); + goto out; + } + journal = transaction->t_journal; + + JBUFFER_TRACE(jh, "removing from transaction"); + __buffer_unlink(jh); + jh->b_cp_transaction = NULL; + jbd2_journal_put_journal_head(jh); + + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) + goto out; + + /* + * There is one special case to worry about: if we have just pulled the + * buffer off a running or committing transaction's checkpoing list, + * then even if the checkpoint list is empty, the transaction obviously + * cannot be dropped! + * + * The locking here around t_state is a bit sleazy. + * See the comment at the end of jbd2_journal_commit_transaction(). + */ + if (transaction->t_state != T_FINISHED) + goto out; + + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ + stats = &transaction->t_chp_stats; + if (stats->cs_chp_time) + stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, + jiffies); + trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, + transaction->t_tid, stats); + + __jbd2_journal_drop_transaction(journal, transaction); + kfree(transaction); + + /* Just in case anybody was waiting for more transactions to be + checkpointed... 
*/ + wake_up(&journal->j_wait_logspace); + ret = 1; +out: + return ret; +} + +/* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ +void __jbd2_journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) +{ + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + + /* Get reference for checkpointing transaction */ + jbd2_journal_grab_journal_head(jh2bh(jh)); + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; +} + +/* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with j_list_lock held. + */ + +void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) +{ + assert_spin_locked(&journal->j_list_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT(transaction->t_state == T_FINISHED); + J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_forget == NULL); + J_ASSERT(transaction->t_iobuf_list == NULL); + J_ASSERT(transaction->t_shadow_list == NULL); + J_ASSERT(transaction->t_log_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); + J_ASSERT(atomic_read(&transaction->t_updates) == 0); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); +} diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c new file mode 100644 index 00000000..36c2e800 --- /dev/null +++ b/fs/jbd2/commit.c @@ -0,0 +1,1048 @@ +/* + * linux/fs/jbd2/commit.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Default IO end handler for temporary BJ_IO buffer_heads. 
+ */ +static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + BUFFER_TRACE(bh, ""); + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); +} + +/* + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. + * After the transaction commits, these pages are left on the LRU, with no + * ->mapping, and with attached buffers. These pages are trivially reclaimable + * by the VM, but their apparent absence upsets the VM accounting, and it makes + * the numbers in /proc/meminfo look odd. + * + * So here, we have a buffer which has just come off the forget list. Look to + * see if we can strip all buffers from the backing page. + * + * Called under lock_journal(), and possibly under journal_datalist_lock. The + * caller provided us with a ref against the buffer, and we drop that here. + */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page; + + if (buffer_dirty(bh)) + goto nope; + if (atomic_read(&bh->b_count) != 1) + goto nope; + page = bh->b_page; + if (!page) + goto nope; + if (page->mapping) + goto nope; + + /* OK, it's a truncated page */ + if (!trylock_page(page)) + goto nope; + + page_cache_get(page); + __brelse(bh); + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + return; + +nope: + __brelse(bh); +} + +/* + * Done it all: now submit the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +static int journal_submit_commit_record(journal_t *journal, + transaction_t *commit_transaction, + struct buffer_head **cbh, + __u32 crc32_sum) +{ + struct journal_head *descriptor; + struct commit_header *tmp; + struct buffer_head *bh; + int ret; + struct timespec now = current_kernel_time(); + + *cbh = NULL; + + if (is_journal_aborted(journal)) + return 0; + + descriptor = jbd2_journal_get_descriptor_buffer(journal); + if (!descriptor) + return 1; + + bh = jh2bh(descriptor); + + tmp = (struct commit_header *)bh->b_data; + tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); + tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); + + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + tmp->h_chksum_type = JBD2_CRC32_CHKSUM; + tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; + tmp->h_chksum[0] = cpu_to_be32(crc32_sum); + } + + JBUFFER_TRACE(descriptor, "submit commit block"); + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + if (journal->j_flags & JBD2_BARRIER && + !JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); + else + ret = submit_bh(WRITE_SYNC, bh); + + *cbh = bh; + return ret; +} + +/* + * This function along with journal_submit_commit_record + * allows to write the commit record asynchronously. 
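+ * The caller submits the commit block with journal_submit_commit_record(),
+ * is free to do other work while that write is in flight, and only then
+ * calls this to wait for the buffer and collect an -EIO, if any.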
+ */ +static int journal_wait_on_commit_record(journal_t *journal, + struct buffer_head *bh) +{ + int ret = 0; + + clear_buffer_dirty(bh); + wait_on_buffer(bh); + + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + put_bh(bh); /* One for getblk() */ + jbd2_journal_put_journal_head(bh2jh(bh)); + + return ret; +} + +/* + * write the filemap data using writepage() address_space_operations. + * We don't do block allocation here even for delalloc. We don't + * use writepages() because with dealyed allocation we may be doing + * block allocation in writepages(). + */ +static int journal_submit_inode_data_buffers(struct address_space *mapping) +{ + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = 0, + .range_end = i_size_read(mapping->host), + }; + + ret = generic_writepages(mapping, &wbc); + return ret; +} + +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages. + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode; + int err, ret = 0; + struct address_space *mapping; + + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + mapping = jinode->i_vfs_inode->i_mapping; + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + spin_unlock(&journal->j_list_lock); + /* + * submit the inode data buffers. We use writepage + * instead of writepages. Because writepages can do + * block allocation with delalloc. We need to write + * only allocated blocks here. + */ + trace_jbd2_submit_inode_data(jinode->i_vfs_inode); + err = journal_submit_inode_data_buffers(mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + spin_unlock(&journal->j_list_lock); + return ret; +} + +/* + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. + * + */ +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; + + /* For locking, see the comment in journal_submit_data_buffers() */ + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (err) { + /* + * Because AS_EIO is cleared by + * filemap_fdatawait_range(), set it again so + * that user process can get -EIO from fsync(). 
+ */ + set_bit(AS_EIO, + &jinode->i_vfs_inode->i_mapping->flags); + + if (!ret) + ret = err; + } + spin_lock(&journal->j_list_lock); + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); + } else { + jinode->i_transaction = NULL; + } + } + spin_unlock(&journal->j_list_lock); + + return ret; +} + +static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) +{ + struct page *page = bh->b_page; + char *addr; + __u32 checksum; + + addr = kmap_atomic(page, KM_USER0); + checksum = crc32_be(crc32_sum, + (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); + kunmap_atomic(addr, KM_USER0); + + return checksum; +} + +static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, + unsigned long long block) +{ + tag->t_blocknr = cpu_to_be32(block & (u32)~0); + if (tag_bytes > JBD2_TAG_SIZE32) + tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); +} + +/* + * jbd2_journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ +void jbd2_journal_commit_transaction(journal_t *journal) +{ + struct transaction_stats_s stats; + transaction_t *commit_transaction; + struct journal_head *jh, *new_jh, *descriptor; + struct buffer_head **wbuf = journal->j_wbuf; + int bufs; + int flags; + int err; + unsigned long long blocknr; + ktime_t start_time; + u64 commit_time; + char *tagp = NULL; + journal_header_t *header; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i, to_free = 0; + int tag_bytes = journal_tag_bytes(journal); + struct buffer_head *cbh = NULL; /* For transactional checksums */ + __u32 crc32_sum = ~0; + struct blk_plug plug; + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. + */ + + /* Do we need to erase the effects of a prior jbd2_journal_flush? 
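+ * (A flush leaves the on-disk superblock describing an empty log, so if
+ * JBD2_FLUSHED is still set we rewrite the superblock here to record a
+ * valid log head before this commit starts adding blocks.)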
*/ + if (journal->j_flags & JBD2_FLUSHED) { + jbd_debug(3, "super block updated\n"); + jbd2_journal_update_superblock(journal, 1); + } else { + jbd_debug(3, "superblock not updated\n"); + } + + J_ASSERT(journal->j_running_transaction != NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + + commit_transaction = journal->j_running_transaction; + J_ASSERT(commit_transaction->t_state == T_RUNNING); + + trace_jbd2_start_commit(journal, commit_transaction); + jbd_debug(1, "JBD: starting commit of transaction %d\n", + commit_transaction->t_tid); + + write_lock(&journal->j_state_lock); + commit_transaction->t_state = T_LOCKED; + + trace_jbd2_commit_locking(journal, commit_transaction); + stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_locked = jiffies; + stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.run.rs_locked); + + spin_lock(&commit_transaction->t_handle_lock); + while (atomic_read(&commit_transaction->t_updates)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&commit_transaction->t_updates)) { + spin_unlock(&commit_transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); + + J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= + journal->j_max_transaction_buffers); + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a jbd2_journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple jbd2_journal_get_write_access() calls to the same + * buffer are perfectly permissible. + */ + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + /* + * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may + * leave undo-committed data. + */ + if (jh->b_committed_data) { + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + jbd2_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + jbd_unlock_bh_state(bh); + } + jbd2_journal_refile_buffer(journal, jh); + } + + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal->j_list_lock); + __jbd2_journal_clean_checkpoint_list(journal); + spin_unlock(&journal->j_list_lock); + + jbd_debug (3, "JBD: commit phase 1\n"); + + /* + * Switch to a new revoke table. 
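+ * (The journal keeps two revoke hash tables; switching lets the next
+ * running transaction record its revokes in a fresh table while we
+ * write out the table that belongs to this commit.)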
+ */ + jbd2_journal_switch_revoke_table(journal); + + trace_jbd2_commit_flushing(journal, commit_transaction); + stats.run.rs_flushing = jiffies; + stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, + stats.run.rs_flushing); + + commit_transaction->t_state = T_FLUSH; + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + start_time = ktime_get(); + commit_transaction->t_log_start = journal->j_head; + wake_up(&journal->j_wait_transaction_locked); + write_unlock(&journal->j_state_lock); + + jbd_debug (3, "JBD: commit phase 2\n"); + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ + err = journal_submit_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + + blk_start_plug(&plug); + jbd2_journal_write_revoke_records(journal, commit_transaction, + WRITE_SYNC); + blk_finish_plug(&plug); + + jbd_debug(3, "JBD: commit phase 2\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + write_lock(&journal->j_state_lock); + commit_transaction->t_state = T_COMMIT; + write_unlock(&journal->j_state_lock); + + trace_jbd2_commit_logging(journal, commit_transaction); + stats.run.rs_logging = jiffies; + stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, + stats.run.rs_logging); + stats.run.rs_blocks = + atomic_read(&commit_transaction->t_outstanding_credits); + stats.run.rs_blocks_logged = 0; + + J_ASSERT(commit_transaction->t_nr_buffers <= + atomic_read(&commit_transaction->t_outstanding_credits)); + + err = 0; + descriptor = NULL; + bufs = 0; + blk_start_plug(&plug); + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... */ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it. */ + + if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); + JBUFFER_TRACE(jh, "journal is aborting: refile"); + jbd2_buffer_abort_trigger(jh, + jh->b_frozen_data ? + jh->b_frozen_triggers : + jh->b_triggers); + jbd2_journal_refile_buffer(journal, jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. 
*/ + + if (!descriptor) { + struct buffer_head *bh; + + J_ASSERT (bufs == 0); + + jbd_debug(4, "JBD: get descriptor\n"); + + descriptor = jbd2_journal_get_descriptor_buffer(journal); + if (!descriptor) { + jbd2_journal_abort(journal, -EIO); + continue; + } + + bh = jh2bh(descriptor); + jbd_debug(4, "JBD: got buffer %llu (%p)\n", + (unsigned long long)bh->b_blocknr, bh->b_data); + header = (journal_header_t *)&bh->b_data[0]; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + space_left = bh->b_size - sizeof(journal_header_t); + first_tag = 1; + set_buffer_jwrite(bh); + set_buffer_dirty(bh); + wbuf[bufs++] = bh; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(bh, "ph3: file as descriptor"); + jbd2_journal_file_buffer(descriptor, commit_transaction, + BJ_LogCtl); + } + + /* Where is the buffer to be written? */ + + err = jbd2_journal_next_log_block(journal, &blocknr); + /* If the block mapping failed, just abandon the buffer + and repeat this loop: we'll fall into the + refile-on-abort condition above. */ + if (err) { + jbd2_journal_abort(journal, err); + continue; + } + + /* + * start_this_handle() uses t_outstanding_credits to determine + * the free space in the log, but this counter is changed + * by jbd2_journal_next_log_block() also. + */ + atomic_dec(&commit_transaction->t_outstanding_credits); + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the BJ_IO/BJ_Shadow pairing of buffers. */ + atomic_inc(&jh2bh(jh)->b_count); + + /* Make a temporary IO buffer with which to write it out + (this will requeue both the metadata buffer and the + temporary IO buffer). new_bh goes on BJ_IO*/ + + set_bit(BH_JWrite, &jh2bh(jh)->b_state); + /* + * akpm: jbd2_journal_write_metadata_buffer() sets + * new_bh->b_transaction to commit_transaction. + * We need to clean this up before we release new_bh + * (which is of type BJ_IO) + */ + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = jbd2_journal_write_metadata_buffer(commit_transaction, + jh, &new_jh, blocknr); + if (flags < 0) { + jbd2_journal_abort(journal, flags); + continue; + } + set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); + wbuf[bufs++] = jh2bh(new_jh); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (flags & 1) + tag_flag |= JBD2_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JBD2_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); + tag->t_flags = cpu_to_be32(tag_flag); + tagp += tag_bytes; + space_left -= tag_bytes; + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == journal->j_wbufsize || + commit_transaction->t_buffers == NULL || + space_left < tag_bytes + 16) { + + jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); + +start_journal_io: + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; + /* + * Compute checksum. 
+ */ + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + crc32_sum = + jbd2_checksum_data(crc32_sum, bh); + } + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(WRITE_SYNC, bh); + } + cond_resched(); + stats.run.rs_blocks_logged += bufs; + + /* Force a new descriptor to be generated next + time round the loop. */ + descriptor = NULL; + bufs = 0; + } + } + + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) { + printk(KERN_WARNING + "JBD2: Detected IO errors while flushing file data " + "on %s\n", journal->j_devname); + if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) + jbd2_journal_abort(journal, err); + err = 0; + } + + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_DFLUSH; + write_unlock(&journal->j_state_lock); + /* + * If the journal is not located on the file system device, + * then we must flush the file system device before we issue + * the commit record + */ + if (commit_transaction->t_need_data_flush && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); + + /* Done it all: now write the commit record asynchronously. */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + } + + blk_finish_plug(&plug); + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + + Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd_debug(3, "JBD: commit phase 3\n"); + + /* + * akpm: these are BJ_IO, and j_list_lock is not needed. + * See __journal_try_to_free_buffer. + */ +wait_for_iobuf: + while (commit_transaction->t_iobuf_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_iobuf_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_iobuf; + } + if (cond_resched()) + goto wait_for_iobuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + clear_buffer_jwrite(bh); + + JBUFFER_TRACE(jh, "ph4: unfile after journal write"); + jbd2_journal_unfile_buffer(journal, jh); + + /* + * ->t_iobuf_list should contain only dummy buffer_heads + * which were created by jbd2_journal_write_metadata_buffer(). + */ + BUFFER_TRACE(bh, "dumping temporary bh"); + jbd2_journal_put_journal_head(jh); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + free_buffer_head(bh); + + /* We also have to unlock and free the corresponding + shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_bit(BH_JWrite, &bh->b_state); + J_ASSERT_BH(bh, buffer_jbddirty(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); + /* + * Wake up any transactions which were waiting for this IO to + * complete. 
The barrier must be here so that changes by + * jbd2_journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); + wake_up_bit(&bh->b_state, BH_Unshadow); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd_debug(3, "JBD: commit phase 4\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + wait_for_ctlbuf: + while (commit_transaction->t_log_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_log_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_ctlbuf; + } + if (cond_resched()) + goto wait_for_ctlbuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_buffer_jwrite(bh); + jbd2_journal_unfile_buffer(journal, jh); + jbd2_journal_put_journal_head(jh); + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + if (err) + jbd2_journal_abort(journal, err); + + jbd_debug(3, "JBD: commit phase 5\n"); + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); + commit_transaction->t_state = T_COMMIT_JFLUSH; + write_unlock(&journal->j_state_lock); + + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + } + if (cbh) + err = journal_wait_on_commit_record(journal, cbh); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && + journal->j_flags & JBD2_BARRIER) { + blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); + } + + if (err) + jbd2_journal_abort(journal, err); + + /* End of a transaction! Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + + jbd_debug(3, "JBD: commit phase 6\n"); + + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_iobuf_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + J_ASSERT(commit_transaction->t_log_list == NULL); + +restart_loop: + /* + * As there are other places (journal_unmap_buffer()) adding buffers + * to this list we have to be careful and hold the j_list_lock. + */ + spin_lock(&journal->j_list_lock); + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + int try_to_free = 0; + + jh = commit_transaction->t_forget; + spin_unlock(&journal->j_list_lock); + bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. + * + * We also know that the frozen data has already fired + * its triggers if they exist, so we can clear that too. 
+ */ + if (jh->b_committed_data) { + jbd2_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; + } + } else if (jh->b_frozen_data) { + jbd2_free(jh->b_frozen_data, bh->b_size); + jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; + } + + spin_lock(&journal->j_list_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + cp_transaction->t_chp_stats.cs_dropped++; + __jbd2_journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by jbd2_journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + + /* A buffer which has been freed while still being + * journaled by a previous transaction may end up still + * being dirty here, but we want to avoid writing back + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another + * use in a different page. */ + if (buffer_freed(bh) && !jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + } + + if (buffer_jbddirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __jbd2_journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + /* + * The buffer on BJ_Forget list and not jbddirty means + * it has been freed by this transaction and hence it + * could not have been reallocated until this + * transaction has committed. *BUT* it could be + * reallocated once we have written all the data to + * disk and before we process the buffer on BJ_Forget + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; + } + JBUFFER_TRACE(jh, "refile or unfile buffer"); + __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); /* Drops bh reference */ + else + __brelse(bh); + cond_resched_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + /* + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __jbd2_journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... + */ + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + /* + * Now recheck if some buffers did not get attached to the transaction + * while the lock was dropped... + */ + if (commit_transaction->t_forget) { + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); + goto restart_loop; + } + + /* Done with this transaction! 
*/ + + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); + + commit_transaction->t_start = jiffies; + stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, + commit_transaction->t_start); + + /* + * File the transaction statistics + */ + stats.ts_tid = commit_transaction->t_tid; + stats.run.rs_handle_count = + atomic_read(&commit_transaction->t_handle_count); + trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, + commit_transaction->t_tid, &stats.run); + + /* + * Calculate overall stats + */ + spin_lock(&journal->j_history_lock); + journal->j_stats.ts_tid++; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); + + commit_transaction->t_state = T_FINISHED; + J_ASSERT(commit_transaction == journal->j_committing_transaction); + journal->j_commit_sequence = commit_transaction->t_tid; + journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in the commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time + + journal->j_average_commit_time*3) / 4; + else + journal->j_average_commit_time = commit_time; + write_unlock(&journal->j_state_lock); + + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + to_free = 1; + } else { + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + } + spin_unlock(&journal->j_list_lock); + + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + + trace_jbd2_end_commit(journal, commit_transaction); + jbd_debug(1, "JBD: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + if (to_free) + kfree(commit_transaction); + + wake_up(&journal->j_wait_done_commit); +} diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c new file mode 100644 index 00000000..40c5fb73 --- /dev/null +++ b/fs/jbd2/journal.c @@ -0,0 +1,2471 @@ +/* + * linux/fs/jbd2/journal.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. 
+ * + * This file manages journals: areas of disk reserved for logging + * transactional updates. This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. + * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include +#include +#include + +EXPORT_SYMBOL(jbd2_journal_extend); +EXPORT_SYMBOL(jbd2_journal_stop); +EXPORT_SYMBOL(jbd2_journal_lock_updates); +EXPORT_SYMBOL(jbd2_journal_unlock_updates); +EXPORT_SYMBOL(jbd2_journal_get_write_access); +EXPORT_SYMBOL(jbd2_journal_get_create_access); +EXPORT_SYMBOL(jbd2_journal_get_undo_access); +EXPORT_SYMBOL(jbd2_journal_set_triggers); +EXPORT_SYMBOL(jbd2_journal_dirty_metadata); +EXPORT_SYMBOL(jbd2_journal_release_buffer); +EXPORT_SYMBOL(jbd2_journal_forget); +#if 0 +EXPORT_SYMBOL(journal_sync_buffer); +#endif +EXPORT_SYMBOL(jbd2_journal_flush); +EXPORT_SYMBOL(jbd2_journal_revoke); + +EXPORT_SYMBOL(jbd2_journal_init_dev); +EXPORT_SYMBOL(jbd2_journal_init_inode); +EXPORT_SYMBOL(jbd2_journal_update_format); +EXPORT_SYMBOL(jbd2_journal_check_used_features); +EXPORT_SYMBOL(jbd2_journal_check_available_features); +EXPORT_SYMBOL(jbd2_journal_set_features); +EXPORT_SYMBOL(jbd2_journal_load); +EXPORT_SYMBOL(jbd2_journal_destroy); +EXPORT_SYMBOL(jbd2_journal_abort); +EXPORT_SYMBOL(jbd2_journal_errno); +EXPORT_SYMBOL(jbd2_journal_ack_err); +EXPORT_SYMBOL(jbd2_journal_clear_err); +EXPORT_SYMBOL(jbd2_log_wait_commit); +EXPORT_SYMBOL(jbd2_log_start_commit); +EXPORT_SYMBOL(jbd2_journal_start_commit); +EXPORT_SYMBOL(jbd2_journal_force_commit_nested); +EXPORT_SYMBOL(jbd2_journal_wipe); +EXPORT_SYMBOL(jbd2_journal_blocks_per_page); +EXPORT_SYMBOL(jbd2_journal_invalidatepage); +EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); +EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_inode_cache); + +static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); +static void __journal_abort_soft (journal_t *journal, int errno); +static int jbd2_journal_create_slab(size_t slab_size); + +/* + * Helper function used to manage commit timeouts + */ + +static void commit_timeout(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + + wake_up_process(p); +} + +/* + * kjournald2: The main thread function used to manage a logging device + * journal. + * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. 
+ */ + +static int kjournald2(void *arg) +{ + journal_t *journal = arg; + transaction_t *transaction; + + /* + * Set up an interval timer which can be used to trigger a commit wakeup + * after the commit interval expires + */ + setup_timer(&journal->j_commit_timer, commit_timeout, + (unsigned long)current); + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + /* + * And now, wait forever for commit wakeup events. + */ + write_lock(&journal->j_state_lock); + +loop: + if (journal->j_flags & JBD2_UNMOUNT) + goto end_loop; + + jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd_debug(1, "OK, requests differ\n"); + write_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + jbd2_journal_commit_transaction(journal); + write_lock(&journal->j_state_lock); + goto loop; + } + + wake_up(&journal->j_wait_done_commit); + if (freezing(current)) { + /* + * The simpler the better. Flushing journal isn't a + * good idea, because that depends on threads that may + * be already stopped. + */ + jbd_debug(1, "Now suspending kjournald2\n"); + write_unlock(&journal->j_state_lock); + refrigerator(); + write_lock(&journal->j_state_lock); + } else { + /* + * We assume on resume that commits are already there, + * so we don't sleep + */ + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&journal->j_wait_commit, &wait, + TASK_INTERRUPTIBLE); + if (journal->j_commit_sequence != journal->j_commit_request) + should_sleep = 0; + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, + transaction->t_expires)) + should_sleep = 0; + if (journal->j_flags & JBD2_UNMOUNT) + should_sleep = 0; + if (should_sleep) { + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + } + finish_wait(&journal->j_wait_commit, &wait); + } + + jbd_debug(1, "kjournald2 wakes\n"); + + /* + * Were we woken up by a commit wakeup event? + */ + transaction = journal->j_running_transaction; + if (transaction && time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd_debug(1, "woke because of timeout\n"); + } + goto loop; + +end_loop: + write_unlock(&journal->j_state_lock); + del_timer_sync(&journal->j_commit_timer); + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd_debug(1, "Journal thread exiting.\n"); + return 0; +} + +static int jbd2_journal_start_thread(journal_t *journal) +{ + struct task_struct *t; + + t = kthread_run(kjournald2, journal, "jbd2/%s", + journal->j_devname); + if (IS_ERR(t)) + return PTR_ERR(t); + + wait_event(journal->j_wait_done_commit, journal->j_task != NULL); + return 0; +} + +static void journal_kill_thread(journal_t *journal) +{ + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_UNMOUNT; + + while (journal->j_task) { + wake_up(&journal->j_wait_commit); + write_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, journal->j_task == NULL); + write_lock(&journal->j_state_lock); + } + write_unlock(&journal->j_state_lock); +} + +/* + * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. 
The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descripter blocks. In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. The missing word can then be restored when the block is read + * during recovery. + * + * If the source buffer has already been modified by a new transaction + * since we took the last commit snapshot, we use the frozen copy of + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we *have* to lock the buffer to prevent anyone + * else from using and possibly modifying it while the IO is in + * progress. + * + * The function returns a pointer to the buffer_heads to be used for IO. + * + * We assume that the journal has already been locked in this function. + * + * Return value: + * <0: Error + * >=0: Finished OK + * + * On success: + * Bit 0 set == escape performed on the data + * Bit 1 set == buffer copy-out performed (kfree the data after IO) + */ + +int jbd2_journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct journal_head **jh_out, + unsigned long long blocknr) +{ + int need_copy_out = 0; + int done_copy_out = 0; + int do_escape = 0; + char *mapped_data; + struct buffer_head *new_bh; + struct journal_head *new_jh; + struct page *new_page; + unsigned int new_offset; + struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; + + /* + * The buffer really shouldn't be locked: only the current committing + * transaction is allowed to write it, so nobody else is allowed + * to do any IO. + * + * akpm: except if we're journalling data, and write() output is + * also part of a shared mapping, and another thread has + * decided to launch a writepage() against this buffer. + */ + J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); + +retry_alloc: + new_bh = alloc_buffer_head(GFP_NOFS); + if (!new_bh) { + /* + * Failure is not an option, but __GFP_NOFAIL is going + * away; so we retry ourselves here. + */ + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_alloc; + } + + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ + + /* + * If a new transaction has already done a buffer copy-out, then + * we use that version of the data for the commit. + */ + jbd_lock_bh_state(bh_in); +repeat: + if (jh_in->b_frozen_data) { + done_copy_out = 1; + new_page = virt_to_page(jh_in->b_frozen_data); + new_offset = offset_in_page(jh_in->b_frozen_data); + } else { + new_page = jh2bh(jh_in)->b_page; + new_offset = offset_in_page(jh2bh(jh_in)->b_data); + } + + mapped_data = kmap_atomic(new_page, KM_USER0); + /* + * Fire data frozen trigger if data already wasn't frozen. Do this + * before checking for escaping, as the trigger may modify the magic + * offset. If a copy-out happens afterwards, it will have the correct + * data in the buffer. 
+ */ + if (!done_copy_out) + jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jh_in->b_triggers); + + /* + * Check for escaping + */ + if (*((__be32 *)(mapped_data + new_offset)) == + cpu_to_be32(JBD2_MAGIC_NUMBER)) { + need_copy_out = 1; + do_escape = 1; + } + kunmap_atomic(mapped_data, KM_USER0); + + /* + * Do we need to do a data copy? + */ + if (need_copy_out && !done_copy_out) { + char *tmp; + + jbd_unlock_bh_state(bh_in); + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); + if (!tmp) { + jbd2_journal_put_journal_head(new_jh); + return -ENOMEM; + } + jbd_lock_bh_state(bh_in); + if (jh_in->b_frozen_data) { + jbd2_free(tmp, bh_in->b_size); + goto repeat; + } + + jh_in->b_frozen_data = tmp; + mapped_data = kmap_atomic(new_page, KM_USER0); + memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); + kunmap_atomic(mapped_data, KM_USER0); + + new_page = virt_to_page(tmp); + new_offset = offset_in_page(tmp); + done_copy_out = 1; + + /* + * This isn't strictly necessary, as we're using frozen + * data for the escaping, but it keeps consistency with + * b_frozen_data usage. + */ + jh_in->b_frozen_triggers = jh_in->b_triggers; + } + + /* + * Did we need to do an escaping? Now we've done all the + * copying, we can finally do so. + */ + if (do_escape) { + mapped_data = kmap_atomic(new_page, KM_USER0); + *((unsigned int *)(mapped_data + new_offset)) = 0; + kunmap_atomic(mapped_data, KM_USER0); + } + + set_bh_page(new_bh, new_page, new_offset); + new_jh->b_transaction = NULL; + new_bh->b_size = jh2bh(jh_in)->b_size; + new_bh->b_bdev = transaction->t_journal->j_dev; + new_bh->b_blocknr = blocknr; + set_buffer_mapped(new_bh); + set_buffer_dirty(new_bh); + + *jh_out = new_jh; + + /* + * The to-be-written buffer needs to get moved to the io queue, + * and the original buffer whose contents we are shadowing or + * copying is moved to the transaction's shadow queue. + */ + JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + + JBUFFER_TRACE(new_jh, "file as BJ_IO"); + jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); + + return do_escape | (done_copy_out << 1); +} + +/* + * Allocation code for the journal file. Manage the space left in the + * journal, so that we can begin checkpointing when appropriate. + */ + +/* + * __jbd2_log_space_left: Return the number of free blocks left in the journal. + * + * Called with the journal already locked. + * + * Called under j_state_lock + */ + +int __jbd2_log_space_left(journal_t *journal) +{ + int left = journal->j_free; + + /* assert_spin_locked(&journal->j_state_lock); */ + + /* + * Be pessimistic here about the number of those free blocks which + * might be required for log descriptor control blocks. + */ + +#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ + + left -= MIN_LOG_RESERVED_BLOCKS; + + if (left <= 0) + return 0; + left -= (left >> 3); + return left; +} + +/* + * Called with j_state_lock locked for writing. + * Returns true if a transaction commit was started. + */ +int __jbd2_log_start_commit(journal_t *journal, tid_t target) +{ + /* + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. 
+ */ + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { + /* + * We want a new commit: OK, mark the request and wakeup the + * commit thread. We do _not_ do the commit ourselves. + */ + + journal->j_commit_request = target; + jbd_debug(1, "JBD: requesting commit %d/%d\n", + journal->j_commit_request, + journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + return 1; + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, + journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + return 0; +} + +int jbd2_log_start_commit(journal_t *journal, tid_t tid) +{ + int ret; + + write_lock(&journal->j_state_lock); + ret = __jbd2_log_start_commit(journal, tid); + write_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Force and wait upon a commit if the calling process is not within + * transaction. This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + * + * We can only force the running transaction if we don't have an active handle; + * otherwise, we will deadlock. + * + * Returns true if a transaction was started. + */ +int jbd2_journal_force_commit_nested(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + int need_to_start = 0; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && !current->journal_info) { + transaction = journal->j_running_transaction; + if (!tid_geq(journal->j_commit_request, transaction->t_tid)) + need_to_start = 1; + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + if (!transaction) { + read_unlock(&journal->j_state_lock); + return 0; /* Nothing to retry */ + } + + tid = transaction->t_tid; + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + jbd2_log_wait_commit(journal, tid); + return 1; +} + +/* + * Start a commit of the current running transaction (if any). Returns true + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid + */ +int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) +{ + int ret = 0; + + write_lock(&journal->j_state_lock); + if (journal->j_running_transaction) { + tid_t tid = journal->j_running_transaction->t_tid; + + __jbd2_log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) + *ptid = tid; + ret = 1; + } else if (journal->j_committing_transaction) { + /* + * If ext3_write_super() recently started a commit, then we + * have to wait for completion of that transaction + */ + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; + ret = 1; + } + write_unlock(&journal->j_state_lock); + return ret; +} + +/* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. 
+ */ +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JBD2_BARRIER)) + return 0; + read_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + commit_trans = journal->j_committing_transaction; + if (!commit_trans || commit_trans->t_tid != tid) { + ret = 1; + goto out; + } + /* + * Transaction is being committed and we already proceeded to + * submitting a flush to fs partition? + */ + if (journal->j_fs_dev != journal->j_dev) { + if (!commit_trans->t_need_data_flush || + commit_trans->t_state >= T_COMMIT_DFLUSH) + goto out; + } else { + if (commit_trans->t_state >= T_COMMIT_JFLUSH) + goto out; + } + ret = 1; +out: + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); + +/* + * Wait for a specified commit to complete. + * The caller may not hold the journal lock. + */ +int jbd2_log_wait_commit(journal_t *journal, tid_t tid) +{ + int err = 0; + + read_lock(&journal->j_state_lock); +#ifdef CONFIG_JBD2_DEBUG + if (!tid_geq(journal->j_commit_request, tid)) { + printk(KERN_EMERG + "%s: error: j_commit_request=%d, tid=%d\n", + __func__, journal->j_commit_request, tid); + } +#endif + while (tid_gt(tid, journal->j_commit_sequence)) { + jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + tid, journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_done_commit, + !tid_gt(tid, journal->j_commit_sequence)); + read_lock(&journal->j_state_lock); + } + read_unlock(&journal->j_state_lock); + + if (unlikely(is_journal_aborted(journal))) { + printk(KERN_EMERG "journal commit I/O error\n"); + err = -EIO; + } + return err; +} + +/* + * Log buffer allocation routines: + */ + +int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) +{ + unsigned long blocknr; + + write_lock(&journal->j_state_lock); + J_ASSERT(journal->j_free > 1); + + blocknr = journal->j_head; + journal->j_head++; + journal->j_free--; + if (journal->j_head == journal->j_last) + journal->j_head = journal->j_first; + write_unlock(&journal->j_state_lock); + return jbd2_journal_bmap(journal, blocknr, retp); +} + +/* + * Conversion of logical to physical block numbers for the journal + * + * On external journals the journal blocks are identity-mapped, so + * this is a no-op. If needed, we can use j_blk_offset - everything is + * ready. + */ +int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, + unsigned long long *retp) +{ + int err = 0; + unsigned long long ret; + + if (journal->j_inode) { + ret = bmap(journal->j_inode, blocknr); + if (ret) + *retp = ret; + else { + printk(KERN_ALERT "%s: journal block not found " + "at offset %lu on %s\n", + __func__, blocknr, journal->j_devname); + err = -EIO; + __journal_abort_soft(journal, err); + } + } else { + *retp = blocknr; /* +journal->j_blk_offset */ + } + return err; +} + +/* + * We play buffer_head aliasing tricks to write data/metadata blocks to + * the journal without copying their contents, but for journal + * descriptor blocks we do need to generate bona fide buffers. + * + * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying + * the buffer's contents they really should run flush_dcache_page(bh->b_page). 
+ * But we don't bother doing that, so there will be coherency problems with + * mmaps of blockdevs which hold live JBD-controlled filesystems. + */ +struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) +{ + struct buffer_head *bh; + unsigned long long blocknr; + int err; + + err = jbd2_journal_next_log_block(journal, &blocknr); + + if (err) + return NULL; + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; + lock_buffer(bh); + memset(bh->b_data, 0, journal->j_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "return this buffer"); + return jbd2_journal_add_journal_head(bh); +} + +struct jbd2_stats_proc_session { + journal_t *journal; + struct transaction_stats_s *stats; + int start; + int max; +}; + +static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int jbd2_seq_info_show(struct seq_file *seq, void *v) +{ + struct jbd2_stats_proc_session *s = seq->private; + + if (v != SEQ_START_TOKEN) + return 0; + seq_printf(seq, "%lu transaction, each up to %u blocks\n", + s->stats->ts_tid, + s->journal->j_max_transaction_buffers); + if (s->stats->ts_tid == 0) + return 0; + seq_printf(seq, "average: \n %ums waiting for transaction\n", + jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); + seq_printf(seq, " %ums running transaction\n", + jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); + seq_printf(seq, " %ums transaction was being locked\n", + jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid)); + seq_printf(seq, " %ums flushing data (in ordered mode)\n", + jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid)); + seq_printf(seq, " %ums logging transaction\n", + jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid)); + seq_printf(seq, " %lluus average transaction commit time\n", + div_u64(s->journal->j_average_commit_time, 1000)); + seq_printf(seq, " %lu handles per transaction\n", + s->stats->run.rs_handle_count / s->stats->ts_tid); + seq_printf(seq, " %lu blocks per transaction\n", + s->stats->run.rs_blocks / s->stats->ts_tid); + seq_printf(seq, " %lu logged blocks per transaction\n", + s->stats->run.rs_blocks_logged / s->stats->ts_tid); + return 0; +} + +static void jbd2_seq_info_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations jbd2_seq_info_ops = { + .start = jbd2_seq_info_start, + .next = jbd2_seq_info_next, + .stop = jbd2_seq_info_stop, + .show = jbd2_seq_info_show, +}; + +static int jbd2_seq_info_open(struct inode *inode, struct file *file) +{ + journal_t *journal = PDE(inode)->data; + struct jbd2_stats_proc_session *s; + int rc, size; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + size = sizeof(struct transaction_stats_s); + s->stats = kmalloc(size, GFP_KERNEL); + if (s->stats == NULL) { + kfree(s); + return -ENOMEM; + } + spin_lock(&journal->j_history_lock); + memcpy(s->stats, &journal->j_stats, size); + s->journal = journal; + spin_unlock(&journal->j_history_lock); + + rc = seq_open(file, &jbd2_seq_info_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = s; + } else { + kfree(s->stats); + kfree(s); + } + return rc; + +} + +static int jbd2_seq_info_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct jbd2_stats_proc_session *s = 
seq->private; + kfree(s->stats); + kfree(s); + return seq_release(inode, file); +} + +static const struct file_operations jbd2_seq_info_fops = { + .owner = THIS_MODULE, + .open = jbd2_seq_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = jbd2_seq_info_release, +}; + +static struct proc_dir_entry *proc_jbd2_stats; + +static void jbd2_stats_proc_init(journal_t *journal) +{ + journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); + if (journal->j_proc_entry) { + proc_create_data("info", S_IRUGO, journal->j_proc_entry, + &jbd2_seq_info_fops, journal); + } +} + +static void jbd2_stats_proc_exit(journal_t *journal) +{ + remove_proc_entry("info", journal->j_proc_entry); + remove_proc_entry(journal->j_devname, proc_jbd2_stats); +} + +/* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing + * journal blocks from disk. */ + +/* First: create and setup a journal_t object in memory. We initialise + * very few fields yet: that has to wait until we have created the + * journal structures from from scratch, or loaded them from disk. */ + +static journal_t * journal_init_common (void) +{ + journal_t *journal; + int err; + + journal = kzalloc(sizeof(*journal), GFP_KERNEL); + if (!journal) + return NULL; + + init_waitqueue_head(&journal->j_wait_transaction_locked); + init_waitqueue_head(&journal->j_wait_logspace); + init_waitqueue_head(&journal->j_wait_done_commit); + init_waitqueue_head(&journal->j_wait_checkpoint); + init_waitqueue_head(&journal->j_wait_commit); + init_waitqueue_head(&journal->j_wait_updates); + mutex_init(&journal->j_barrier); + mutex_init(&journal->j_checkpoint_mutex); + spin_lock_init(&journal->j_revoke_lock); + spin_lock_init(&journal->j_list_lock); + rwlock_init(&journal->j_state_lock); + + journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); + journal->j_min_batch_time = 0; + journal->j_max_batch_time = 15000; /* 15ms */ + + /* The journal is marked for error until we succeed with recovery! */ + journal->j_flags = JBD2_ABORT; + + /* Set up a default-sized revoke table for the new mount. */ + err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + if (err) { + kfree(journal); + return NULL; + } + + spin_lock_init(&journal->j_history_lock); + + return journal; +} + +/* jbd2_journal_init_dev and jbd2_journal_init_inode: + * + * Create a journal structure assigned some fixed set of disk blocks to + * the journal. We don't actually touch those disk blocks yet, but we + * need to set up all of the mapping information to tell the journaling + * system where the journal blocks are. + * + */ + +/** + * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure + * @bdev: Block device on which to create the journal + * @fs_dev: Device which hold journalled filesystem for this journal. + * @start: Block nr Start of journal. + * @len: Length of the journal in blocks. + * @blocksize: blocksize of journalling device + * + * Returns: a newly created journal_t * + * + * jbd2_journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. 
+ * + */ +journal_t * jbd2_journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, + unsigned long long start, int len, int blocksize) +{ + journal_t *journal = journal_init_common(); + struct buffer_head *bh; + char *p; + int n; + + if (!journal) + return NULL; + + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; + jbd2_stats_proc_init(journal); + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + + bh = __getblk(journal->j_dev, start, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; +} + +/** + * journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode. + * @inode: An inode to create the journal in + * + * jbd2_journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. + */ +journal_t * jbd2_journal_init_inode (struct inode *inode) +{ + struct buffer_head *bh; + journal_t *journal = journal_init_common(); + char *p; + int err; + int n; + unsigned long long blocknr; + + if (!journal) + return NULL; + + journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; + journal->j_inode = inode; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; + p = journal->j_devname + strlen(journal->j_devname); + sprintf(p, "-%lu", journal->j_inode->i_ino); + jbd_debug(1, + "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", + journal, inode->i_sb->s_id, inode->i_ino, + (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; + journal->j_blocksize = inode->i_sb->s_blocksize; + jbd2_stats_proc_init(journal); + + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } + + err = jbd2_journal_bmap(journal, 0, &blocknr); + /* If that failed, give up */ + if (err) { + printk(KERN_ERR "%s: Cannot locate journal superblock\n", + __func__); + goto out_err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; +out_err: + kfree(journal->j_wbuf); + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; +} + +/* + * If the journal init or create aborts, we need to 
mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock (journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Given a journal_t structure, initialise the various fields for + * startup of a new journaling session. We use this both when creating + * a journal, and after recovering an old journal to reset it for + * subsequent use. + */ + +static int journal_reset(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long first, last; + + first = be32_to_cpu(sb->s_first); + last = be32_to_cpu(sb->s_maxlen); + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } + + journal->j_first = first; + journal->j_last = last; + + journal->j_head = first; + journal->j_tail = first; + journal->j_free = last - first; + + journal->j_tail_sequence = journal->j_transaction_sequence; + journal->j_commit_sequence = journal->j_transaction_sequence - 1; + journal->j_commit_request = journal->j_commit_sequence; + + journal->j_max_transaction_buffers = journal->j_maxlen / 4; + + /* Add the dynamic fields and write it to disk. */ + jbd2_journal_update_superblock(journal, 1); + return jbd2_journal_start_thread(journal); +} + +/** + * void jbd2_journal_update_superblock() - Update journal sb on disk. + * @journal: The journal to update. + * @wait: Set to '0' if you don't want to wait for IO completion. + * + * Update a journal's dynamic superblock fields and write it to disk, + * optionally waiting for the IO to complete. + */ +void jbd2_journal_update_superblock(journal_t *journal, int wait) +{ + journal_superblock_t *sb = journal->j_superblock; + struct buffer_head *bh = journal->j_sb_buffer; + + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0) and there are no outstanding transactions + * in the filesystem, then we can safely defer the superblock update + * until the next commit by setting JBD2_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0 && journal->j_tail_sequence == + journal->j_transaction_sequence) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %ld, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + goto out; + } + + if (buffer_write_io_error(bh)) { + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. 
+ */ + printk(KERN_ERR "JBD2: previous I/O error detected " + "for journal superblock update for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + read_lock(&journal->j_state_lock); + jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, journal->j_errno); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(journal->j_tail); + sb->s_errno = cpu_to_be32(journal->j_errno); + read_unlock(&journal->j_state_lock); + + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_write_io_error(bh)) { + printk(KERN_ERR "JBD2: I/O error detected " + "when updating journal superblock for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + } else + write_dirty_buffer(bh, WRITE); + +out: + /* If we have just flushed the log (by marking s_start==0), then + * any future commit will have to be careful to update the + * superblock again to re-record the true start of the log. */ + + write_lock(&journal->j_state_lock); + if (sb->s_start) + journal->j_flags &= ~JBD2_FLUSHED; + else + journal->j_flags |= JBD2_FLUSHED; + write_unlock(&journal->j_state_lock); +} + +/* + * Read the superblock for a given journal, performing initial + * validation of the format. + */ + +static int journal_get_superblock(journal_t *journal) +{ + struct buffer_head *bh; + journal_superblock_t *sb; + int err = -EIO; + + bh = journal->j_sb_buffer; + + J_ASSERT(bh != NULL); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + printk (KERN_ERR + "JBD: IO error reading journal superblock\n"); + goto out; + } + } + + sb = journal->j_superblock; + + err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + goto out; + } + + switch(be32_to_cpu(sb->s_header.h_blocktype)) { + case JBD2_SUPERBLOCK_V1: + journal->j_format_version = 1; + break; + case JBD2_SUPERBLOCK_V2: + journal->j_format_version = 2; + break; + default: + printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + goto out; + } + + if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) + journal->j_maxlen = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + printk (KERN_WARNING "JBD: journal file too short\n"); + goto out; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + + return 0; + +out: + journal_fail_superblock(journal); + return err; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ + +static int load_superblock(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_last = be32_to_cpu(sb->s_maxlen); + journal->j_errno = be32_to_cpu(sb->s_errno); + + return 0; +} + + +/** + * int jbd2_journal_load() - Read journal from disk. + * @journal: Journal to act on. 
+ * + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. + */ +int jbd2_journal_load(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + /* If this is a V2 superblock, then we have to check the + * features flags on it. */ + + if (journal->j_format_version >= 2) { + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { + printk (KERN_WARNING + "JBD: Unrecognised features on journal\n"); + return -EINVAL; + } + } + + /* + * Create a slab for this blocksize + */ + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); + if (err) + return err; + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + if (jbd2_journal_recover(journal)) + goto recovery_error; + + if (journal->j_failed_commit) { + printk(KERN_ERR "JBD2: journal transaction %u on %s " + "is corrupt.\n", journal->j_failed_commit, + journal->j_devname); + return -EIO; + } + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + if (journal_reset(journal)) + goto recovery_error; + + journal->j_flags &= ~JBD2_ABORT; + journal->j_flags |= JBD2_LOADED; + return 0; + +recovery_error: + printk (KERN_WARNING "JBD: recovery failed\n"); + return -EIO; +} + +/** + * void jbd2_journal_destroy() - Release a journal_t structure. + * @journal: Journal to act on. + * + * Release a journal_t structure once it is no longer in use by the + * journaled object. + * Return <0 if we couldn't clean up the journal. + */ +int jbd2_journal_destroy(journal_t *journal) +{ + int err = 0; + + /* Wait for the commit thread to wake up and die. */ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + jbd2_journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + + /* Totally anal locking here... */ + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + spin_unlock(&journal->j_list_lock); + + if (journal->j_sb_buffer) { + if (!is_journal_aborted(journal)) { + /* We can now mark the journal as empty. */ + journal->j_tail = 0; + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + jbd2_journal_update_superblock(journal, 1); + } else { + err = -EIO; + } + brelse(journal->j_sb_buffer); + } + + if (journal->j_proc_entry) + jbd2_stats_proc_exit(journal); + if (journal->j_inode) + iput(journal->j_inode); + if (journal->j_revoke) + jbd2_journal_destroy_revoke(journal); + kfree(journal->j_wbuf); + kfree(journal); + + return err; +} + + +/** + *int jbd2_journal_check_used_features () - Check if features specified are used. + * @journal: Journal to check. 
+ * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. + **/ + +int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + /* Load journal superblock if it is not loaded yet. */ + if (journal->j_format_version == 0 && + journal_get_superblock(journal) != 0) + return 0; + if (journal->j_format_version == 1) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; +} + +/** + * int jbd2_journal_check_available_features() - Check feature set in journalling layer + * @journal: Journal to check. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + +int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + if (!compat && !ro && !incompat) + return 1; + + /* We can support any known requested features iff the + * superblock is in version 2. Otherwise we fail to support any + * extended sb features. */ + + if (journal->j_format_version != 2) + return 0; + + if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat && + (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; +} + +/** + * int jbd2_journal_set_features () - Mark a given journal feature in the superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. + * + */ + +int jbd2_journal_set_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + + return 1; +} + +/* + * jbd2_journal_clear_features () - Clear a given journal feature in the + * superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Clear a given journal feature as present on the + * superblock. 
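Editorial note (not part of this patch): the feature helpers documented in this block are normally used together by a client filesystem, which first asks whether a feature is already recorded on disk, then whether the running jbd2 can honour it, and only then sets it. A minimal sketch of that pattern, using only the functions declared here; the wrapper name and error codes are illustrative assumptions.

/* Illustrative sketch: enable 64-bit block numbers in the journal
 * if this jbd2 supports them (assumes <linux/jbd2.h>). */
static int example_enable_64bit(journal_t *journal)
{
	/* Already recorded in the journal superblock?  Nothing to do. */
	if (jbd2_journal_check_used_features(journal, 0, 0,
					     JBD2_FEATURE_INCOMPAT_64BIT))
		return 0;

	/* Refuse if this jbd2 build cannot honour the feature. */
	if (!jbd2_journal_check_available_features(journal, 0, 0,
						   JBD2_FEATURE_INCOMPAT_64BIT))
		return -EOPNOTSUPP;

	/* Record the feature in the in-memory superblock; the next
	 * superblock update writes it out. */
	if (!jbd2_journal_set_features(journal, 0, 0,
				       JBD2_FEATURE_INCOMPAT_64BIT))
		return -EINVAL;
	return 0;
}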
+ */ +void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat &= ~cpu_to_be32(compat); + sb->s_feature_ro_compat &= ~cpu_to_be32(ro); + sb->s_feature_incompat &= ~cpu_to_be32(incompat); +} +EXPORT_SYMBOL(jbd2_journal_clear_features); + +/** + * int jbd2_journal_update_format () - Update on-disk journal structure. + * @journal: Journal to act on. + * + * Given an initialised but unloaded journal struct, poke about in the + * on-disk structure to update it to the most recent supported version. + */ +int jbd2_journal_update_format (journal_t *journal) +{ + journal_superblock_t *sb; + int err; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + switch (be32_to_cpu(sb->s_header.h_blocktype)) { + case JBD2_SUPERBLOCK_V2: + return 0; + case JBD2_SUPERBLOCK_V1: + return journal_convert_superblock_v1(journal, sb); + default: + break; + } + return -EINVAL; +} + +static int journal_convert_superblock_v1(journal_t *journal, + journal_superblock_t *sb) +{ + int offset, blocksize; + struct buffer_head *bh; + + printk(KERN_WARNING + "JBD: Converting superblock from version 1 to 2.\n"); + + /* Pre-initialise new fields to zero */ + offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); + blocksize = be32_to_cpu(sb->s_blocksize); + memset(&sb->s_feature_compat, 0, blocksize-offset); + + sb->s_nr_users = cpu_to_be32(1); + sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); + journal->j_format_version = 2; + + bh = journal->j_sb_buffer; + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + return 0; +} + + +/** + * int jbd2_journal_flush () - Flush journal + * @journal: Journal to act on. + * + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. + */ + +int jbd2_journal_flush(journal_t *journal) +{ + int err = 0; + transaction_t *transaction = NULL; + unsigned long old_tail; + + write_lock(&journal->j_state_lock); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + __jbd2_log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) { + tid_t tid = transaction->t_tid; + + write_unlock(&journal->j_state_lock); + jbd2_log_wait_commit(journal, tid); + } else { + write_unlock(&journal->j_state_lock); + } + + /* ...and flush everything in the log out to disk. */ + spin_lock(&journal->j_list_lock); + while (!err && journal->j_checkpoint_transactions != NULL) { + spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); + err = jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + } + spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + + jbd2_cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. 
Any future + * commits of data to the journal will restore the current + * s_start value. */ + write_lock(&journal->j_state_lock); + old_tail = journal->j_tail; + journal->j_tail = 0; + write_unlock(&journal->j_state_lock); + jbd2_journal_update_superblock(journal, 1); + write_lock(&journal->j_state_lock); + journal->j_tail = old_tail; + + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + write_unlock(&journal->j_state_lock); + return 0; +} + +/** + * int jbd2_journal_wipe() - Wipe journal contents + * @journal: Journal to act on. + * @write: flag (see below) + * + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. + * Must be called between journal_init_*() and jbd2_journal_load(). + * + * If 'write' is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + +int jbd2_journal_wipe(journal_t *journal, int write) +{ + int err = 0; + + J_ASSERT (!(journal->j_flags & JBD2_LOADED)); + + err = load_superblock(journal); + if (err) + return err; + + if (!journal->j_tail) + goto no_recovery; + + printk (KERN_WARNING "JBD: %s recovery information on journal\n", + write ? "Clearing" : "Ignoring"); + + err = jbd2_journal_skip_recovery(journal); + if (write) + jbd2_journal_update_superblock(journal, 1); + + no_recovery: + return err; +} + +/* + * Journal abort has very specific semantics, which we describe + * for journal abort. + * + * Two internal functions, which provide abort to the jbd layer + * itself are here. + */ + +/* + * Quick version for internal journal use (doesn't lock the journal). + * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, + * and don't attempt to make any other journal updates. + */ +void __jbd2_journal_abort_hard(journal_t *journal) +{ + transaction_t *transaction; + + if (journal->j_flags & JBD2_ABORT) + return; + + printk(KERN_ERR "Aborting journal on device %s.\n", + journal->j_devname); + + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_ABORT; + transaction = journal->j_running_transaction; + if (transaction) + __jbd2_log_start_commit(journal, transaction->t_tid); + write_unlock(&journal->j_state_lock); +} + +/* Soft abort: record the abort error status in the journal superblock, + * but don't do any other IO. */ +static void __journal_abort_soft (journal_t *journal, int errno) +{ + if (journal->j_flags & JBD2_ABORT) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + __jbd2_journal_abort_hard(journal); + + if (errno) + jbd2_journal_update_superblock(journal, 1); +} + +/** + * void jbd2_journal_abort () - Shutdown the journal immediately. + * @journal: the journal to shutdown. + * @errno: an error number to record in the journal indicating + * the reason for the shutdown. + * + * Perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The jbd2_journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. 
Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. + * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * jbd2_journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final jbd2_journal_stop, which will receive the -EIO error. + * + * Finally, the jbd2_journal_abort call allows the caller to supply an errno + * which will be recorded (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + * Errors which originate from within the journaling layer will NOT + * supply an errno; a null errno implies that absolutely no further + * writes are done to the journal (unless there are any already in + * progress). + * + */ + +void jbd2_journal_abort(journal_t *journal, int errno) +{ + __journal_abort_soft(journal, errno); +} + +/** + * int jbd2_journal_errno () - returns the journal's error state. + * @journal: journal to examine. + * + * This is the errno number set with jbd2_journal_abort(), the last + * time the journal was mounted - if the journal was stopped + * without calling abort this will be 0. + * + * If the journal has been aborted on this mount time -EROFS will + * be returned. + */ +int jbd2_journal_errno(journal_t *journal) +{ + int err; + + read_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) + err = -EROFS; + else + err = journal->j_errno; + read_unlock(&journal->j_state_lock); + return err; +} + +/** + * int jbd2_journal_clear_err () - clears the journal's error state + * @journal: journal to act on. + * + * An error must be cleared or acked to take a FS out of readonly + * mode. + */ +int jbd2_journal_clear_err(journal_t *journal) +{ + int err = 0; + + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) + err = -EROFS; + else + journal->j_errno = 0; + write_unlock(&journal->j_state_lock); + return err; +} + +/** + * void jbd2_journal_ack_err() - Ack journal err. + * @journal: journal to act on. + * + * An error must be cleared or acked to take a FS out of readonly + * mode. + */ +void jbd2_journal_ack_err(journal_t *journal) +{ + write_lock(&journal->j_state_lock); + if (journal->j_errno) + journal->j_flags |= JBD2_ACK_ERR; + write_unlock(&journal->j_state_lock); +} + +int jbd2_journal_blocks_per_page(struct inode *inode) +{ + return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); +} + +/* + * helper functions to deal with 32 or 64bit block numbers. + */ +size_t journal_tag_bytes(journal_t *journal) +{ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + return JBD2_TAG_SIZE64; + else + return JBD2_TAG_SIZE32; +} + +/* + * JBD memory management + * + * These functions are used to allocate block-sized chunks of memory + * used for making copies of buffer_head data. 
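Editorial note (not part of this patch): the error-state helpers above (jbd2_journal_errno, jbd2_journal_clear_err, jbd2_journal_ack_err) are what a filesystem consults when deciding whether it may leave read-only mode after an abort. A hedged sketch of that flow; the wrapper is hypothetical and only uses the calls documented above.

/* Illustrative sketch: inspect and clear the recorded journal error
 * before allowing a read-write remount (assumes <linux/jbd2.h>). */
static int example_handle_journal_error(journal_t *journal)
{
	int err = jbd2_journal_errno(journal);

	if (err == -EROFS)	/* journal aborted during this mount */
		return -EROFS;
	if (err) {
		/* An errno recorded by a previous mount: clear and ack it
		 * so the filesystem may leave read-only mode. */
		jbd2_journal_clear_err(journal);
		jbd2_journal_ack_err(journal);
	}
	return 0;
}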
Very often it will be + * page-sized chunks of data, but sometimes it will be in + * sub-page-size chunks. (For example, 16k pages on Power systems + * with a 4k block file system.) For blocks smaller than a page, we + * use a SLAB allocator. There are slab caches for each block size, + * which are allocated at mount time, if necessary, and we only free + * (all of) the slab caches when/if the jbd2 module is unloaded. For + * this reason we don't need to a mutex to protect access to + * jbd2_slab[] allocating or releasing memory; only in + * jbd2_journal_create_slab(). + */ +#define JBD2_MAX_SLABS 8 +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; + +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" +}; + + +static void jbd2_journal_destroy_slabs(void) +{ + int i; + + for (i = 0; i < JBD2_MAX_SLABS; i++) { + if (jbd2_slab[i]) + kmem_cache_destroy(jbd2_slab[i]); + jbd2_slab[i] = NULL; + } +} + +static int jbd2_journal_create_slab(size_t size) +{ + static DEFINE_MUTEX(jbd2_slab_create_mutex); + int i = order_base_2(size) - 10; + size_t slab_size; + + if (size == PAGE_SIZE) + return 0; + + if (i >= JBD2_MAX_SLABS) + return -EINVAL; + + if (unlikely(i < 0)) + i = 0; + mutex_lock(&jbd2_slab_create_mutex); + if (jbd2_slab[i]) { + mutex_unlock(&jbd2_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = 1 << (i+10); + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, + slab_size, 0, NULL); + mutex_unlock(&jbd2_slab_create_mutex); + if (!jbd2_slab[i]) { + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +static struct kmem_cache *get_slab(size_t size) +{ + int i = order_base_2(size) - 10; + + BUG_ON(i >= JBD2_MAX_SLABS); + if (unlikely(i < 0)) + i = 0; + BUG_ON(jbd2_slab[i] == NULL); + return jbd2_slab[i]; +} + +void *jbd2_alloc(size_t size, gfp_t flags) +{ + void *ptr; + + BUG_ON(size & (size-1)); /* Must be a power of 2 */ + + flags |= __GFP_REPEAT; + if (size == PAGE_SIZE) + ptr = (void *)__get_free_pages(flags, 0); + else if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + ptr = (void *)__get_free_pages(flags, order); + else + ptr = vmalloc(size); + } else + ptr = kmem_cache_alloc(get_slab(size), flags); + + /* Check alignment; SLUB has gotten this wrong in the past, + * and this can lead to user data corruption! 
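Editorial note (not part of this patch): the slab index used by jbd2_journal_create_slab() and get_slab() above is derived from the block size as order_base_2(size) - 10, so 1k maps to jbd2_1k (index 0), 2k to index 1, and so on up to 128k at index 7, while PAGE_SIZE requests bypass the slabs entirely. A small sketch of that mapping; the helper is purely illustrative.

/* Illustrative sketch: which cache a given journal block size would
 * select, mirroring get_slab() above. */
static const char *example_slab_name_for(size_t size)
{
	int i = order_base_2(size) - 10;	/* 1024 -> 0, 2048 -> 1, ... */

	if (size == PAGE_SIZE)			/* whole pages skip the slabs */
		return "page allocator";
	if (i < 0)
		i = 0;
	if (i >= JBD2_MAX_SLABS)		/* create_slab() returns -EINVAL here */
		return "too large";
	return jbd2_slab_names[i];		/* e.g. 16384 -> "jbd2_16k" */
}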
*/ + BUG_ON(((unsigned long) ptr) & (size-1)); + + return ptr; +} + +void jbd2_free(void *ptr, size_t size) +{ + if (size == PAGE_SIZE) { + free_pages((unsigned long)ptr, 0); + return; + } + if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + free_pages((unsigned long)ptr, order); + else + vfree(ptr); + return; + } + kmem_cache_free(get_slab(size), ptr); +}; + +/* + * Journal_head storage management + */ +static struct kmem_cache *jbd2_journal_head_cache; +#ifdef CONFIG_JBD2_DEBUG +static atomic_t nr_journal_heads = ATOMIC_INIT(0); +#endif + +static int journal_init_jbd2_journal_head_cache(void) +{ + int retval; + + J_ASSERT(jbd2_journal_head_cache == NULL); + jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", + sizeof(struct journal_head), + 0, /* offset */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ + retval = 0; + if (!jbd2_journal_head_cache) { + retval = -ENOMEM; + printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + } + return retval; +} + +static void jbd2_journal_destroy_jbd2_journal_head_cache(void) +{ + if (jbd2_journal_head_cache) { + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; + } +} + +/* + * journal_head splicing and dicing + */ +static struct journal_head *journal_alloc_journal_head(void) +{ + struct journal_head *ret; + +#ifdef CONFIG_JBD2_DEBUG + atomic_inc(&nr_journal_heads); +#endif + ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); + if (!ret) { + jbd_debug(1, "out of memory for journal_head\n"); + pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); + while (!ret) { + yield(); + ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); + } + } + return ret; +} + +static void journal_free_journal_head(struct journal_head *jh) +{ +#ifdef CONFIG_JBD2_DEBUG + atomic_dec(&nr_journal_heads); + memset(jh, JBD2_POISON_FREE, sizeof(*jh)); +#endif + kmem_cache_free(jbd2_journal_head_cache, jh); +} + +/* + * A journal_head is attached to a buffer_head whenever JBD has an + * interest in the buffer. + * + * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit + * is set. This bit is tested in core kernel code where we need to take + * JBD-specific actions. Testing the zeroness of ->b_private is not reliable + * there. + * + * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. + * + * When a buffer has its BH_JBD bit set it is immune from being released by + * core kernel code, mainly via ->b_count. + * + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. + * + * Various places in the kernel want to attach a journal_head to a buffer_head + * _before_ attaching the journal_head to a transaction. To protect the + * journal_head in this situation, jbd2_journal_add_journal_head elevates the + * journal_head's b_jcount refcount by one. The caller must call + * jbd2_journal_put_journal_head() to undo this. + * + * So the typical usage would be: + * + * (Attach a journal_head if needed. Increments b_jcount) + * struct journal_head *jh = jbd2_journal_add_journal_head(bh); + * ... + * (Get another reference for transaction) + * jbd2_journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * jbd2_journal_put_journal_head(jh); + */ + +/* + * Give a buffer_head a journal_head. + * + * May sleep. 
+ */ +struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh; + struct journal_head *new_jh = NULL; + +repeat: + if (!buffer_jbd(bh)) { + new_jh = journal_alloc_journal_head(); + memset(new_jh, 0, sizeof(*new_jh)); + } + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + } else { + J_ASSERT_BH(bh, + (atomic_read(&bh->b_count) > 0) || + (bh->b_page && bh->b_page->mapping)); + + if (!new_jh) { + jbd_unlock_bh_journal_head(bh); + goto repeat; + } + + jh = new_jh; + new_jh = NULL; /* We consumed it */ + set_buffer_jbd(bh); + bh->b_private = jh; + jh->b_bh = bh; + get_bh(bh); + BUFFER_TRACE(bh, "added journal_head"); + } + jh->b_jcount++; + jbd_unlock_bh_journal_head(bh); + if (new_jh) + journal_free_journal_head(new_jh); + return bh->b_private; +} + +/* + * Grab a ref against this buffer_head's journal_head. If it ended up not + * having a journal_head, return NULL + */ +struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = NULL; + + jbd_lock_bh_journal_head(bh); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + jh->b_jcount++; + } + jbd_unlock_bh_journal_head(bh); + return jh; +} + +static void __journal_remove_journal_head(struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + + J_ASSERT_JH(jh, jh->b_jcount >= 0); + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd2_free(jh->b_frozen_data, bh->b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd2_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); +} + +/* + * Drop a reference on the passed journal_head. If it fell to zero then + * release the journal_head from the buffer_head. + */ +void jbd2_journal_put_journal_head(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_journal_head(bh); + J_ASSERT_JH(jh, jh->b_jcount > 0); + --jh->b_jcount; + if (!jh->b_jcount) { + __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); + __brelse(bh); + } else + jbd_unlock_bh_journal_head(bh); +} + +/* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; + jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. 
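Editorial note (not part of this patch): a filesystem that wants jbd2 to track an inode's dirty data typically embeds a struct jbd2_inode in its per-inode information, initialises it once with jbd2_journal_init_jbd_inode(), and calls the release helper that follows when the inode is evicted. A hedged sketch of that pairing; the containing structure and field names are assumptions, not anything defined by this patch.

/* Illustrative sketch: pairing the init and release helpers.
 * "my_inode_info" and its fields are hypothetical. */
struct my_inode_info {
	struct inode		vfs_inode;
	struct jbd2_inode	jinode;
};

static void example_setup_jbd_inode(struct my_inode_info *mi)
{
	jbd2_journal_init_jbd_inode(&mi->jinode, &mi->vfs_inode);
}

static void example_evict_jbd_inode(journal_t *journal, struct my_inode_info *mi)
{
	/* Waits for any running commit that is still writing this inode. */
	jbd2_journal_release_jbd_inode(journal, &mi->jinode);
}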
+ */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + +/* + * debugfs tunables + */ +#ifdef CONFIG_JBD2_DEBUG +u8 jbd2_journal_enable_debug __read_mostly; +EXPORT_SYMBOL(jbd2_journal_enable_debug); + +#define JBD2_DEBUG_NAME "jbd2-debug" + +static struct dentry *jbd2_debugfs_dir; +static struct dentry *jbd2_debug; + +static void __init jbd2_create_debugfs_entry(void) +{ + jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); + if (jbd2_debugfs_dir) + jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, + S_IRUGO | S_IWUSR, + jbd2_debugfs_dir, + &jbd2_journal_enable_debug); +} + +static void __exit jbd2_remove_debugfs_entry(void) +{ + debugfs_remove(jbd2_debug); + debugfs_remove(jbd2_debugfs_dir); +} + +#else + +static void __init jbd2_create_debugfs_entry(void) +{ +} + +static void __exit jbd2_remove_debugfs_entry(void) +{ +} + +#endif + +#ifdef CONFIG_PROC_FS + +#define JBD2_STATS_PROC_NAME "fs/jbd2" + +static void __init jbd2_create_jbd_stats_proc_entry(void) +{ + proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); +} + +static void __exit jbd2_remove_jbd_stats_proc_entry(void) +{ + if (proc_jbd2_stats) + remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); +} + +#else + +#define jbd2_create_jbd_stats_proc_entry() do {} while (0) +#define jbd2_remove_jbd_stats_proc_entry() do {} while (0) + +#endif + +struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; + +static int __init journal_init_handle_cache(void) +{ + jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); + if (jbd2_handle_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create handle cache\n"); + return -ENOMEM; + } + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (jbd2_inode_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create inode cache\n"); + kmem_cache_destroy(jbd2_handle_cache); + return -ENOMEM; + } + return 0; +} + +static void jbd2_journal_destroy_handle_cache(void) +{ + if (jbd2_handle_cache) + kmem_cache_destroy(jbd2_handle_cache); + if (jbd2_inode_cache) + kmem_cache_destroy(jbd2_inode_cache); + +} + +/* + * Module startup and shutdown + */ + +static int __init journal_init_caches(void) +{ + int ret; + + ret = jbd2_journal_init_revoke_caches(); + if (ret == 0) + ret = journal_init_jbd2_journal_head_cache(); + if (ret == 0) + ret = journal_init_handle_cache(); + return ret; +} + +static void jbd2_journal_destroy_caches(void) +{ + jbd2_journal_destroy_revoke_caches(); + jbd2_journal_destroy_jbd2_journal_head_cache(); + jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_slabs(); +} + +static int __init journal_init(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); + + ret = journal_init_caches(); + if (ret == 0) { + jbd2_create_debugfs_entry(); + jbd2_create_jbd_stats_proc_entry(); + } else { + jbd2_journal_destroy_caches(); + } + return ret; +} + +static void __exit journal_exit(void) +{ 
+#ifdef CONFIG_JBD2_DEBUG + int n = atomic_read(&nr_journal_heads); + if (n) + printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); +#endif + jbd2_remove_debugfs_entry(); + jbd2_remove_jbd_stats_proc_entry(); + jbd2_journal_destroy_caches(); +} + +/* + * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 + * tracing infrastructure to map a dev_t to a device name. + * + * The caller should use rcu_read_lock() in order to make sure the + * device name stays valid until its done with it. We use + * rcu_read_lock() as well to make sure we're safe in case the caller + * gets sloppy, and because rcu_read_lock() is cheap and can be safely + * nested. + */ +struct devname_cache { + struct rcu_head rcu; + dev_t device; + char devname[BDEVNAME_SIZE]; +}; +#define CACHE_SIZE_BITS 6 +static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; +static DEFINE_SPINLOCK(devname_cache_lock); + +static void free_devcache(struct rcu_head *rcu) +{ + kfree(rcu); +} + +const char *jbd2_dev_to_name(dev_t device) +{ + int i = hash_32(device, CACHE_SIZE_BITS); + char *ret; + struct block_device *bd; + static struct devname_cache *new_dev; + + rcu_read_lock(); + if (devcache[i] && devcache[i]->device == device) { + ret = devcache[i]->devname; + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); + + new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); + if (!new_dev) + return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ + bd = bdget(device); + spin_lock(&devname_cache_lock); + if (devcache[i]) { + if (devcache[i]->device == device) { + kfree(new_dev); + bdput(bd); + ret = devcache[i]->devname; + spin_unlock(&devname_cache_lock); + return ret; + } + call_rcu(&devcache[i]->rcu, free_devcache); + } + devcache[i] = new_dev; + devcache[i]->device = device; + if (bd) { + bdevname(bd, devcache[i]->devname); + bdput(bd); + } else + __bdevname(device, devcache[i]->devname); + ret = devcache[i]->devname; + spin_unlock(&devname_cache_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_dev_to_name); + +MODULE_LICENSE("GPL"); +module_init(journal_init); +module_exit(journal_exit); + diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c new file mode 100644 index 00000000..1cad8694 --- /dev/null +++ b/fs/jbd2/recovery.c @@ -0,0 +1,738 @@ +/* + * linux/fs/jbd2/recovery.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999-2000 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal recovery routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#endif + +/* + * Maintain information about the progress of the recovery job, so that + * the different passes can carry information between them. 
+ */ +struct recovery_info +{ + tid_t start_transaction; + tid_t end_transaction; + + int nr_replays; + int nr_revokes; + int nr_revoke_hits; +}; + +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass); +static int scan_revoke_records(journal_t *, struct buffer_head *, + tid_t, struct recovery_info *); + +#ifdef __KERNEL__ + +/* Release readahead buffers after use */ +static void journal_brelse_array(struct buffer_head *b[], int n) +{ + while (--n >= 0) + brelse (b[n]); +} + + +/* + * When reading from the journal, we are going through the block device + * layer directly and so there is no readahead being done for us. We + * need to implement any readahead ourselves if we want it to happen at + * all. Recovery is basically one long sequential read, so make sure we + * do the IO in reasonably large chunks. + * + * This is not so critical that we need to be enormously clever about + * the readahead size, though. 128K is a purely arbitrary, good-enough + * fixed value. + */ + +#define MAXBUF 8 +static int do_readahead(journal_t *journal, unsigned int start) +{ + int err; + unsigned int max, nbufs, next; + unsigned long long blocknr; + struct buffer_head *bh; + + struct buffer_head * bufs[MAXBUF]; + + /* Do up to 128K of readahead */ + max = start + (128 * 1024 / journal->j_blocksize); + if (max > journal->j_maxlen) + max = journal->j_maxlen; + + /* Do the readahead itself. We'll submit MAXBUF buffer_heads at + * a time to the block device IO layer. */ + + nbufs = 0; + + for (next = start; next < max; next++) { + err = jbd2_journal_bmap(journal, next, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + next); + goto failed; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + err = -ENOMEM; + goto failed; + } + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + bufs[nbufs++] = bh; + if (nbufs == MAXBUF) { + ll_rw_block(READ, nbufs, bufs); + journal_brelse_array(bufs, nbufs); + nbufs = 0; + } + } else + brelse(bh); + } + + if (nbufs) + ll_rw_block(READ, nbufs, bufs); + err = 0; + +failed: + if (nbufs) + journal_brelse_array(bufs, nbufs); + return err; +} + +#endif /* __KERNEL__ */ + + +/* + * Read a block from the journal + */ + +static int jread(struct buffer_head **bhp, journal_t *journal, + unsigned int offset) +{ + int err; + unsigned long long blocknr; + struct buffer_head *bh; + + *bhp = NULL; + + if (offset >= journal->j_maxlen) { + printk(KERN_ERR "JBD: corrupted journal superblock\n"); + return -EIO; + } + + err = jbd2_journal_bmap(journal, offset, &blocknr); + + if (err) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + offset); + return err; + } + + bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + if (!buffer_uptodate(bh)) { + /* If this is a brand new buffer, start readahead. + Otherwise, we assume we are already reading it. */ + if (!buffer_req(bh)) + do_readahead(journal, offset); + wait_on_buffer(bh); + } + + if (!buffer_uptodate(bh)) { + printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + offset); + brelse(bh); + return -EIO; + } + + *bhp = bh; + return 0; +} + + +/* + * Count the number of in-use tags in a journal descriptor block. 
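Editorial note (not part of this patch): the readahead window used by do_readahead() above is fixed at 128K of journal blocks, submitted to the block layer in batches of MAXBUF (8) buffer_heads; with a 4k journal block size that is a 32-block window issued as four batches. A small worked sketch of that arithmetic, with the block size assumed purely for illustration.

/* Illustrative sketch: the do_readahead() window arithmetic for an
 * assumed 4k journal block size. */
static void example_readahead_window(void)
{
	unsigned int blocksize = 4096;			/* assumption */
	unsigned int window = 128 * 1024 / blocksize;	/* 32 blocks */
	unsigned int batches = DIV_ROUND_UP(window, 8);	/* MAXBUF = 8 -> 4 */

	printk(KERN_DEBUG "readahead: %u blocks in %u batches\n",
	       window, batches);
}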
+ */ + +static int count_tags(journal_t *journal, struct buffer_head *bh) +{ + char * tagp; + journal_block_tag_t * tag; + int nr = 0, size = journal->j_blocksize; + int tag_bytes = journal_tag_bytes(journal); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + + while ((tagp - bh->b_data + tag_bytes) <= size) { + tag = (journal_block_tag_t *) tagp; + + nr++; + tagp += tag_bytes; + if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID))) + tagp += 16; + + if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG)) + break; + } + + return nr; +} + + +/* Make sure we wrap around the log correctly! */ +#define wrap(journal, var) \ +do { \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ +} while (0) + +/** + * jbd2_journal_recover - recovers a on-disk journal + * @journal: the journal to recover + * + * The primary function for recovering the log contents when mounting a + * journaled device. + * + * Recovery is done in three passes. In the first pass, we look for the + * end of the log. In the second, we assemble the list of revoke + * blocks. In the third and final pass, we replay any un-revoked blocks + * in the log. + */ +int jbd2_journal_recover(journal_t *journal) +{ + int err, err2; + journal_superblock_t * sb; + + struct recovery_info info; + + memset(&info, 0, sizeof(info)); + sb = journal->j_superblock; + + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly + * unmounted. + */ + + if (!sb->s_start) { + jbd_debug(1, "No recovery required, last transaction %d\n", + be32_to_cpu(sb->s_sequence)); + journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + return 0; + } + + err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = do_one_pass(journal, &info, PASS_REVOKE); + if (!err) + err = do_one_pass(journal, &info, PASS_REPLAY); + + jbd_debug(1, "JBD: recovery, exit status %d, " + "recovered transactions %u to %u\n", + err, info.start_transaction, info.end_transaction); + jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", + info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + + /* Restart the log at the next transaction ID, thus invalidating + * any existing commit records in the log. */ + journal->j_transaction_sequence = ++info.end_transaction; + + jbd2_journal_clear_revoke(journal); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + + return err; +} + +/** + * jbd2_journal_skip_recovery - Start journal and wipe exiting records + * @journal: journal to startup + * + * Locate any valid recovery information from the journal and set up the + * journal structures in memory to ignore it (presumably because the + * caller has evidence that it is out of date). + * This function does'nt appear to be exorted.. + * + * We perform one pass over the journal to allow us to tell the user how + * much recovery information is being erased, and to let us initialise + * the journal transaction sequence numbers to the next unused ID. 
+ */ +int jbd2_journal_skip_recovery(journal_t *journal) +{ + int err; + + struct recovery_info info; + + memset (&info, 0, sizeof(info)); + + err = do_one_pass(journal, &info, PASS_SCAN); + + if (err) { + printk(KERN_ERR "JBD: error %d scanning journal\n", err); + ++journal->j_transaction_sequence; + } else { +#ifdef CONFIG_JBD2_DEBUG + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); + jbd_debug(1, + "JBD: ignoring %d transaction%s from the journal.\n", + dropped, (dropped == 1) ? "" : "s"); +#endif + journal->j_transaction_sequence = ++info.end_transaction; + } + + journal->j_tail = 0; + return err; +} + +static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) +{ + unsigned long long block = be32_to_cpu(tag->t_blocknr); + if (tag_bytes > JBD2_TAG_SIZE32) + block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; + return block; +} + +/* + * calc_chksums calculates the checksums for the blocks described in the + * descriptor block. + */ +static int calc_chksums(journal_t *journal, struct buffer_head *bh, + unsigned long *next_log_block, __u32 *crc32_sum) +{ + int i, num_blks, err; + unsigned long io_block; + struct buffer_head *obh; + + num_blks = count_tags(journal, bh); + /* Calculate checksum of the descriptor block. */ + *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); + + for (i = 0; i < num_blks; i++) { + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + printk(KERN_ERR "JBD: IO error %d recovering block " + "%lu in log\n", err, io_block); + return 1; + } else { + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, + obh->b_size); + } + put_bh(obh); + } + return 0; +} + +static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) +{ + unsigned int first_commit_ID, next_commit_ID; + unsigned long next_log_block; + int err, success = 0; + journal_superblock_t * sb; + journal_header_t * tmp; + struct buffer_head * bh; + unsigned int sequence; + int blocktype; + int tag_bytes = journal_tag_bytes(journal); + __u32 crc32_sum = ~0; /* Transactional Checksums */ + + /* + * First thing is to establish what we expect to find in the log + * (in terms of transaction IDs), and where (in terms of log + * block offsets): query the superblock. + */ + + sb = journal->j_superblock; + next_commit_ID = be32_to_cpu(sb->s_sequence); + next_log_block = be32_to_cpu(sb->s_start); + + first_commit_ID = next_commit_ID; + if (pass == PASS_SCAN) + info->start_transaction = first_commit_ID; + + jbd_debug(1, "Starting recovery pass %d\n", pass); + + /* + * Now we walk through the log, transaction by transaction, + * making sure that each transaction has a commit block in the + * expected place. Each complete transaction gets replayed back + * into the main filesystem. + */ + + while (1) { + int flags; + char * tagp; + journal_block_tag_t * tag; + struct buffer_head * obh; + struct buffer_head * nbh; + + cond_resched(); + + /* If we already know where to stop the log traversal, + * check right now that we haven't gone past the end of + * the log. */ + + if (pass != PASS_SCAN) + if (tid_geq(next_commit_ID, info->end_transaction)) + break; + + jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit + * record. 
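Editorial note (not part of this patch): read_tag_block() above is where the JBD2_FEATURE_INCOMPAT_64BIT feature becomes visible during replay; with 64-bit tags the upper 32 bits of the on-disk block number come from t_blocknr_high. A worked example of that combination; the numeric values are illustrative only.

/* Illustrative sketch: assembling a 64-bit block number the same way
 * read_tag_block() does. */
static void example_tag_block(void)
{
	u32 low  = 0x00000010;			/* tag->t_blocknr      */
	u32 high = 0x00000001;			/* tag->t_blocknr_high */
	unsigned long long block = low | ((u64)high << 32);

	/* block == 0x100000010, i.e. 4294967312 */
	printk(KERN_DEBUG "replaying block %llu\n", block);
}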
*/ + + jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + err = jread(&bh, journal, next_log_block); + if (err) + goto failed; + + next_log_block++; + wrap(journal, next_log_block); + + /* What kind of buffer is it? + * + * If it is a descriptor block, check that it has the + * expected sequence number. Otherwise, we're all done + * here. */ + + tmp = (journal_header_t *)bh->b_data; + + if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { + brelse(bh); + break; + } + + blocktype = be32_to_cpu(tmp->h_blocktype); + sequence = be32_to_cpu(tmp->h_sequence); + jbd_debug(3, "Found magic %d, sequence %d\n", + blocktype, sequence); + + if (sequence != next_commit_ID) { + brelse(bh); + break; + } + + /* OK, we have a valid descriptor block which matches + * all of the sequence number checks. What are we going + * to do with it? That depends on the pass... */ + + switch(blocktype) { + case JBD2_DESCRIPTOR_BLOCK: + /* If it is a valid descriptor block, replay it + * in pass REPLAY; if journal_checksums enabled, then + * calculate checksums in PASS_SCAN, otherwise, + * just skip over the blocks it describes. */ + if (pass != PASS_REPLAY) { + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM) && + !info->end_transaction) { + if (calc_chksums(journal, bh, + &next_log_block, + &crc32_sum)) { + put_bh(bh); + break; + } + put_bh(bh); + continue; + } + next_log_block += count_tags(journal, bh); + wrap(journal, next_log_block); + put_bh(bh); + continue; + } + + /* A descriptor block: we can now write all of + * the data blocks. Yay, useful work is finally + * getting done here! */ + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while ((tagp - bh->b_data + tag_bytes) + <= journal->j_blocksize) { + unsigned long io_block; + + tag = (journal_block_tag_t *) tagp; + flags = be32_to_cpu(tag->t_flags); + + io_block = next_log_block++; + wrap(journal, next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but + * report failure at the end. */ + success = err; + printk (KERN_ERR + "JBD: IO error %d recovering " + "block %ld in log\n", + err, io_block); + } else { + unsigned long long blocknr; + + J_ASSERT(obh != NULL); + blocknr = read_tag_block(tag_bytes, + tag); + + /* If the block has been + * revoked, then we're all done + * here. */ + if (jbd2_journal_test_revoke + (journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Find a buffer for the new + * data being restored */ + nbh = __getblk(journal->j_fs_dev, + blocknr, + journal->j_blocksize); + if (nbh == NULL) { + printk(KERN_ERR + "JBD: Out of memory " + "during recovery.\n"); + err = -ENOMEM; + brelse(bh); + brelse(obh); + goto failed; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, + journal->j_blocksize); + if (flags & JBD2_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JBD2_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + /* ll_rw_block(WRITE, 1, &nbh); */ + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + + skip_write: + tagp += tag_bytes; + if (!(flags & JBD2_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + brelse(bh); + continue; + + case JBD2_COMMIT_BLOCK: + /* How to differentiate between interrupted commit + * and journal corruption ? 
+ * + * {nth transaction} + * Checksum Verification Failed + * | + * ____________________ + * | | + * async_commit sync_commit + * | | + * | GO TO NEXT "Journal Corruption" + * | TRANSACTION + * | + * {(n+1)th transanction} + * | + * _______|______________ + * | | + * Commit block found Commit block not found + * | | + * "Journal Corruption" | + * _____________|_________ + * | | + * nth trans corrupt OR nth trans + * and (n+1)th interrupted interrupted + * before commit block + * could reach the disk. + * (Cannot find the difference in above + * mentioned conditions. Hence assume + * "Interrupted Commit".) + */ + + /* Found an expected commit block: if checksums + * are present verify them in PASS_SCAN; else not + * much to do other than move on to the next sequence + * number. */ + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + int chksum_err, chksum_seen; + struct commit_header *cbh = + (struct commit_header *)bh->b_data; + unsigned found_chksum = + be32_to_cpu(cbh->h_chksum[0]); + + chksum_err = chksum_seen = 0; + + if (info->end_transaction) { + journal->j_failed_commit = + info->end_transaction; + brelse(bh); + break; + } + + if (crc32_sum == found_chksum && + cbh->h_chksum_type == JBD2_CRC32_CHKSUM && + cbh->h_chksum_size == + JBD2_CRC32_CHKSUM_SIZE) + chksum_seen = 1; + else if (!(cbh->h_chksum_type == 0 && + cbh->h_chksum_size == 0 && + found_chksum == 0 && + !chksum_seen)) + /* + * If fs is mounted using an old kernel and then + * kernel with journal_chksum is used then we + * get a situation where the journal flag has + * checksum flag set but checksums are not + * present i.e chksum = 0, in the individual + * commit blocks. + * Hence to avoid checksum failures, in this + * situation, this extra check is added. + */ + chksum_err = 1; + + if (chksum_err) { + info->end_transaction = next_commit_ID; + + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ + journal->j_failed_commit = + next_commit_ID; + brelse(bh); + break; + } + } + crc32_sum = ~0; + } + brelse(bh); + next_commit_ID++; + continue; + + case JBD2_REVOKE_BLOCK: + /* If we aren't in the REVOKE pass, then we can + * just skip over this block. */ + if (pass != PASS_REVOKE) { + brelse(bh); + continue; + } + + err = scan_revoke_records(journal, bh, + next_commit_ID, info); + brelse(bh); + if (err) + goto failed; + continue; + + default: + jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + blocktype); + brelse(bh); + goto done; + } + } + + done: + /* + * We broke out of the log scan loop: either we came to the + * known end of the log or we found an unexpected block in the + * log. If the latter happened, then we know that the "current" + * transaction marks the end of the valid log. + */ + + if (pass == PASS_SCAN) { + if (!info->end_transaction) + info->end_transaction = next_commit_ID; + } else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). */ + if (info->end_transaction != next_commit_ID) { + printk (KERN_ERR "JBD: recovery pass %d ended at " + "transaction %u, expected %u\n", + pass, next_commit_ID, info->end_transaction); + if (!success) + success = -EIO; + } + } + + return success; + + failed: + return err; +} + + +/* Scan a revoke record, marking all blocks mentioned as revoked. 
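Editorial note (not part of this patch): in the record layout parsed by scan_revoke_records() below, r_count in the revoke header gives the number of bytes used in the block (header included), and each record is a 4-byte or 8-byte big-endian block number depending on the 64-bit feature, so the record count is (r_count - header size) / record length. A worked sketch; the 16-byte header size is an assumption stated in the comment.

/* Illustrative sketch: how many revoked blocks a revoke descriptor
 * holds (revoke header size assumed to be 16 bytes). */
static int example_revoke_count(u32 r_count, int has_64bit_feature)
{
	int header_len = 16;				/* assumption */
	int record_len = has_64bit_feature ? 8 : 4;

	/* e.g. r_count = 116 with 32-bit records -> (116 - 16) / 4 = 25 */
	return (r_count - header_len) / record_len;
}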
*/ + +static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, + tid_t sequence, struct recovery_info *info) +{ + jbd2_journal_revoke_header_t *header; + int offset, max; + int record_len = 4; + + header = (jbd2_journal_revoke_header_t *) bh->b_data; + offset = sizeof(jbd2_journal_revoke_header_t); + max = be32_to_cpu(header->r_count); + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + record_len = 8; + + while (offset + record_len <= max) { + unsigned long long blocknr; + int err; + + if (record_len == 4) + blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); + else + blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset))); + offset += record_len; + err = jbd2_journal_set_revoke(journal, blocknr, sequence); + if (err) + return err; + ++info->nr_revokes; + } + return 0; +} diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c new file mode 100644 index 00000000..69fd9358 --- /dev/null +++ b/fs/jbd2/revoke.c @@ -0,0 +1,714 @@ +/* + * linux/fs/jbd2/revoke.c + * + * Written by Stephen C. Tweedie , 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. + * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. We still need to prevent old log records from + * overwriting the new data. We don't even need to clear the revoke + * bit here. + * + * Revoke information on buffers is a tri-state value: + * + * RevokeValid clear: no cached revoke status, need to look it up + * RevokeValid set, Revoked clear: + * buffer has not been revoked, and cancel_revoke + * need do nothing. + * RevokeValid set, Revoked set: + * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. 
One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. + */ + +#ifndef __KERNEL__ +#include "jfs_user.h" +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif +#include + +static struct kmem_cache *jbd2_revoke_record_cache; +static struct kmem_cache *jbd2_revoke_table_cache; + +/* Each revoke record represents one single revoked block. During + journal replay, this involves recording the transaction ID of the + last transaction to revoke this block. */ + +struct jbd2_revoke_record_s +{ + struct list_head hash; + tid_t sequence; /* Used for recovery only */ + unsigned long long blocknr; +}; + + +/* The revoke table is just a simple hash table of revoke records. */ +struct jbd2_revoke_table_s +{ + /* It is conceivable that we might want a larger hash table + * for recovery. Must be a power of two. */ + int hash_size; + int hash_shift; + struct list_head *hash_table; +}; + + +#ifdef __KERNEL__ +static void write_one_revoke_record(journal_t *, transaction_t *, + struct journal_head **, int *, + struct jbd2_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); +#endif + +/* Utility functions to maintain the revoke table */ + +/* Borrowed from buffer.c: this is a tried and tested block hash function */ +static inline int hash(journal_t *journal, unsigned long long block) +{ + struct jbd2_revoke_table_s *table = journal->j_revoke; + int hash_shift = table->hash_shift; + int hash = (int)block ^ (int)((block >> 31) >> 1); + + return ((hash << (hash_shift - 6)) ^ + (hash >> 13) ^ + (hash << (hash_shift - 12))) & (table->hash_size - 1); +} + +static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, + tid_t seq) +{ + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + +repeat: + record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS); + if (!record) + goto oom; + + record->sequence = seq; + record->blocknr = blocknr; + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + spin_lock(&journal->j_revoke_lock); + list_add(&record->hash, hash_list); + spin_unlock(&journal->j_revoke_lock); + return 0; + +oom: + if (!journal_oom_retry) + return -ENOMEM; + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); + yield(); + goto repeat; +} + +/* Find a revoke record in the journal's hash table. 
*/ + +static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, + unsigned long long blocknr) +{ + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + + spin_lock(&journal->j_revoke_lock); + record = (struct jbd2_revoke_record_s *) hash_list->next; + while (&(record->hash) != hash_list) { + if (record->blocknr == blocknr) { + spin_unlock(&journal->j_revoke_lock); + return record; + } + record = (struct jbd2_revoke_record_s *) record->hash.next; + } + spin_unlock(&journal->j_revoke_lock); + return NULL; +} + +void jbd2_journal_destroy_revoke_caches(void) +{ + if (jbd2_revoke_record_cache) { + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; + } + if (jbd2_revoke_table_cache) { + kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; + } +} + +int __init jbd2_journal_init_revoke_caches(void) +{ + J_ASSERT(!jbd2_revoke_record_cache); + J_ASSERT(!jbd2_revoke_table_cache); + + jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", + sizeof(struct jbd2_revoke_record_s), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (!jbd2_revoke_record_cache) + goto record_cache_failure; + + jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", + sizeof(struct jbd2_revoke_table_s), + 0, SLAB_TEMPORARY, NULL); + if (!jbd2_revoke_table_cache) + goto table_cache_failure; + return 0; +table_cache_failure: + jbd2_journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; +} + +static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) +{ + int shift = 0; + int tmp = hash_size; + struct jbd2_revoke_table_s *table; + + table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; + + while((tmp >>= 1UL) != 0UL) + shift++; + + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = + kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); + if (!table->hash_table) { + kmem_cache_free(jbd2_revoke_table_cache, table); + table = NULL; + goto out; + } + + for (tmp = 0; tmp < hash_size; tmp++) + INIT_LIST_HEAD(&table->hash_table[tmp]); + +out: + return table; +} + +static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); + } + + kfree(table->hash_table); + kmem_cache_free(jbd2_revoke_table_cache, table); +} + +/* Initialise the revoke table for a given journal to a given size. */ +int jbd2_journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); + J_ASSERT(is_power_of_2(hash_size)); + + journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; + + journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; + + journal->j_revoke = journal->j_revoke_table[1]; + + spin_lock_init(&journal->j_revoke_lock); + + return 0; + +fail1: + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} + +/* Destroy a journal's revoke table. The table must already be empty! 
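Editorial note (not part of this patch): jbd2_journal_init_revoke() above insists on a power-of-two hash size (it asserts is_power_of_2()), since the hash function masks bucket indices with hash_size - 1. A minimal usage sketch follows; the 256-bucket value is an assumed, typical-looking choice, not something this patch mandates.

/* Illustrative sketch: setting up and tearing down the revoke tables
 * with an assumed 256-bucket hash. */
static int example_setup_revoke(journal_t *journal)
{
	int err = jbd2_journal_init_revoke(journal, 256);	/* power of two */

	if (err)
		return err;
	/* ... journal runs ... */
	jbd2_journal_destroy_revoke(journal);
	return 0;
}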
*/ +void jbd2_journal_destroy_revoke(journal_t *journal) +{ + journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); +} + + +#ifdef __KERNEL__ + +/* + * jbd2_journal_revoke: revoke a given buffer_head from the journal. This + * prevents the block from being replayed during recovery if we take a + * crash after this current transaction commits. Any subsequent + * metadata writes of the buffer in this transaction cancel the + * revoke. + * + * Note that this call may block --- it is up to the caller to make + * sure that there are no further calls to journal_write_metadata + * before the revoke is complete. In ext3, this implies calling the + * revoke before clearing the block bitmap when we are deleting + * metadata. + * + * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a + * parameter, but does _not_ forget the buffer_head if the bh was only + * found implicitly. + * + * bh_in may not be a journalled buffer - it may have come off + * the hash tables without an attached journal_head. + * + * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count + * by one. + */ + +int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, + struct buffer_head *bh_in) +{ + struct buffer_head *bh = NULL; + journal_t *journal; + struct block_device *bdev; + int err; + + might_sleep(); + if (bh_in) + BUFFER_TRACE(bh_in, "enter"); + + journal = handle->h_transaction->t_journal; + if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ + J_ASSERT (!"Cannot set revoke feature!"); + return -EINVAL; + } + + bdev = journal->j_fs_dev; + bh = bh_in; + + if (!bh) { + bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh) + BUFFER_TRACE(bh, "found on hash"); + } +#ifdef JBD2_EXPENSIVE_CHECKING + else { + struct buffer_head *bh2; + + /* If there is a different buffer_head lying around in + * memory anywhere... */ + bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + if (bh2) { + /* ... and it has RevokeValid status... */ + if (bh2 != bh && buffer_revokevalid(bh2)) + /* ...then it better be revoked too, + * since it's illegal to create a revoke + * record against a buffer_head which is + * not marked revoked --- that would + * risk missing a subsequent revoke + * cancel. */ + J_ASSERT_BH(bh2, buffer_revoked(bh2)); + put_bh(bh2); + } + } +#endif + + /* We really ought not ever to revoke twice in a row without + first having the revoke cancelled: it's illegal to free a + block twice without allocating it in between! */ + if (bh) { + if (!J_EXPECT_BH(bh, !buffer_revoked(bh), + "inconsistent data on disk")) { + if (!bh_in) + brelse(bh); + return -EIO; + } + set_buffer_revoked(bh); + set_buffer_revokevalid(bh); + if (bh_in) { + BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); + jbd2_journal_forget(handle, bh_in); + } else { + BUFFER_TRACE(bh, "call brelse"); + __brelse(bh); + } + } + + jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); + err = insert_revoke_hash(journal, blocknr, + handle->h_transaction->t_tid); + BUFFER_TRACE(bh_in, "exit"); + return err; +} + +/* + * Cancel an outstanding revoke. For use only internally by the + * journaling code (called from jbd2_journal_get_write_access). 
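Editorial note (not part of this patch): as the jbd2_journal_revoke() comment above describes, a client filesystem revokes a freed metadata block inside the same handle that frees it, before clearing the block from its allocation bitmaps, so that stale journal copies cannot be replayed over reused data. A hedged sketch of that pattern; the wrapper name and the omitted bitmap step are hypothetical.

/* Illustrative sketch: revoking a freed metadata block from within the
 * freeing transaction (assumes <linux/jbd2.h>). */
static int example_free_metadata_block(handle_t *handle,
				       struct buffer_head *bh,
				       unsigned long long blocknr)
{
	int err;

	/* Revoke first, so recovery will not replay old log copies
	 * of this block over whatever reuses it later. */
	err = jbd2_journal_revoke(handle, blocknr, bh);
	if (err)
		return err;

	/* ... the caller then clears the block in its bitmaps under the
	 * same handle (filesystem-specific, omitted here). */
	return 0;
}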
+ * + * We trust buffer_revoked() on the buffer if the buffer is already + * being journaled: if there is no revoke pending on the buffer, then we + * don't do anything here. + * + * This would break if it were possible for a buffer to be revoked and + * discarded, and then reallocated within the same transaction. In such + * a case we would have lost the revoked bit, but when we arrived here + * the second time we would still have a pending revoke to cancel. So, + * do not trust the Revoked bit on buffers unless RevokeValid is also + * set. + */ +int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +{ + struct jbd2_revoke_record_s *record; + journal_t *journal = handle->h_transaction->t_journal; + int need_cancel; + int did_revoke = 0; /* akpm: debug */ + struct buffer_head *bh = jh2bh(jh); + + jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); + + /* Is the existing Revoke bit valid? If so, we trust it, and + * only perform the full cancel if the revoke bit is set. If + * not, we can't trust the revoke bit, and we need to do the + * full search for a revoke record. */ + if (test_set_buffer_revokevalid(bh)) { + need_cancel = test_clear_buffer_revoked(bh); + } else { + need_cancel = 1; + clear_buffer_revoked(bh); + } + + if (need_cancel) { + record = find_revoke_record(journal, bh->b_blocknr); + if (record) { + jbd_debug(4, "cancelled existing revoke on " + "blocknr %llu\n", (unsigned long long)bh->b_blocknr); + spin_lock(&journal->j_revoke_lock); + list_del(&record->hash); + spin_unlock(&journal->j_revoke_lock); + kmem_cache_free(jbd2_revoke_record_cache, record); + did_revoke = 1; + } + } + +#ifdef JBD2_EXPENSIVE_CHECKING + /* There better not be one left behind by now! */ + record = find_revoke_record(journal, bh->b_blocknr); + J_ASSERT_JH(jh, record == NULL); +#endif + + /* Finally, have we just cleared revoke on an unhashed + * buffer_head? If so, we'd better make sure we clear the + * revoked status on any hashed alias too, otherwise the revoke + * state machine will get very upset later on. */ + if (need_cancel) { + struct buffer_head *bh2; + bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + if (bh2) { + if (bh2 != bh) + clear_buffer_revoked(bh2); + __brelse(bh2); + } + } + return did_revoke; +} + +/* journal_switch_revoke table select j_revoke for next transaction + * we do not want to suspend any processing until all revokes are + * written -bzzz + */ +void jbd2_journal_switch_revoke_table(journal_t *journal) +{ + int i; + + if (journal->j_revoke == journal->j_revoke_table[0]) + journal->j_revoke = journal->j_revoke_table[1]; + else + journal->j_revoke = journal->j_revoke_table[0]; + + for (i = 0; i < journal->j_revoke->hash_size; i++) + INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); +} + +/* + * Write revoke records to the journal for all entries in the current + * revoke hash, deleting the entries as we go. + */ +void jbd2_journal_write_revoke_records(journal_t *journal, + transaction_t *transaction, + int write_op) +{ + struct journal_head *descriptor; + struct jbd2_revoke_record_s *record; + struct jbd2_revoke_table_s *revoke; + struct list_head *hash_list; + int i, offset, count; + + descriptor = NULL; + offset = 0; + count = 0; + + /* select revoke table for committing transaction */ + revoke = journal->j_revoke == journal->j_revoke_table[0] ? 
+ journal->j_revoke_table[1] : journal->j_revoke_table[0]; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + + while (!list_empty(hash_list)) { + record = (struct jbd2_revoke_record_s *) + hash_list->next; + write_one_revoke_record(journal, transaction, + &descriptor, &offset, + record, write_op); + count++; + list_del(&record->hash); + kmem_cache_free(jbd2_revoke_record_cache, record); + } + } + if (descriptor) + flush_descriptor(journal, descriptor, offset, write_op); + jbd_debug(1, "Wrote %d revoke records\n", count); +} + +/* + * Write out one revoke record. We need to create a new descriptor + * block if the old one is full or if we have not already created one. + */ + +static void write_one_revoke_record(journal_t *journal, + transaction_t *transaction, + struct journal_head **descriptorp, + int *offsetp, + struct jbd2_revoke_record_s *record, + int write_op) +{ + struct journal_head *descriptor; + int offset; + journal_header_t *header; + + /* If we are already aborting, this all becomes a noop. We + still need to go round the loop in + jbd2_journal_write_revoke_records in order to free all of the + revoke records: only the IO to the journal is omitted. */ + if (is_journal_aborted(journal)) + return; + + descriptor = *descriptorp; + offset = *offsetp; + + /* Make sure we have a descriptor with space left for the record */ + if (descriptor) { + if (offset == journal->j_blocksize) { + flush_descriptor(journal, descriptor, offset, write_op); + descriptor = NULL; + } + } + + if (!descriptor) { + descriptor = jbd2_journal_get_descriptor_buffer(journal); + if (!descriptor) + return; + header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); + header->h_sequence = cpu_to_be32(transaction->t_tid); + + /* Record it so that we can wait for IO completion later */ + JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); + jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); + + offset = sizeof(jbd2_journal_revoke_header_t); + *descriptorp = descriptor; + } + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { + * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = + cpu_to_be64(record->blocknr); + offset += 8; + + } else { + * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + cpu_to_be32(record->blocknr); + offset += 4; + } + + *offsetp = offset; +} + +/* + * Flush a revoke descriptor out to the journal. If we are aborting, + * this is a noop; otherwise we are generating a buffer which needs to + * be waited for during commit, so it has to go onto the appropriate + * journal buffer list. + */ + +static void flush_descriptor(journal_t *journal, + struct journal_head *descriptor, + int offset, int write_op) +{ + jbd2_journal_revoke_header_t *header; + struct buffer_head *bh = jh2bh(descriptor); + + if (is_journal_aborted(journal)) { + put_bh(bh); + return; + } + + header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; + header->r_count = cpu_to_be32(offset); + set_buffer_jwrite(bh); + BUFFER_TRACE(bh, "write"); + set_buffer_dirty(bh); + write_dirty_buffer(bh, write_op); +} +#endif + +/* + * Revoke support for recovery. + * + * Recovery needs to be able to: + * + * record all revoke records, including the tid of the latest instance + * of each revoke in the journal + * + * check whether a given block in a given transaction should be replayed + * (ie. 
has not been revoked by a revoke record in that or a subsequent + * transaction) + * + * empty the revoke table after recovery. + */ + +/* + * First, setting revoke records. We create a new revoke record for + * every block ever revoked in the log as we scan it for recovery, and + * we update the existing records if we find multiple revokes for a + * single block. + */ + +int jbd2_journal_set_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) +{ + struct jbd2_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (record) { + /* If we have multiple occurrences, only record the + * latest sequence number in the hashed record */ + if (tid_gt(sequence, record->sequence)) + record->sequence = sequence; + return 0; + } + return insert_revoke_hash(journal, blocknr, sequence); +} + +/* + * Test revoke records. For a given block referenced in the log, has + * that block been revoked? A revoke record with a given transaction + * sequence number revokes all blocks in that transaction and earlier + * ones, but later transactions still need replayed. + */ + +int jbd2_journal_test_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) +{ + struct jbd2_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) + return 0; + return 1; +} + +/* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. + */ + +void jbd2_journal_clear_revoke(journal_t *journal) +{ + int i; + struct list_head *hash_list; + struct jbd2_revoke_record_s *record; + struct jbd2_revoke_table_s *revoke; + + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + while (!list_empty(hash_list)) { + record = (struct jbd2_revoke_record_s*) hash_list->next; + list_del(&record->hash); + kmem_cache_free(jbd2_revoke_record_cache, record); + } + } +} diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c new file mode 100644 index 00000000..9baa39ea --- /dev/null +++ b/fs/jbd2/transaction.c @@ -0,0 +1,2227 @@ +/* + * linux/fs/jbd2/transaction.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem transaction handling code; part of the ext2fs + * journaling system. + * + * This file manages transactions (compound commits managed by the + * journaling code) and handles (individual atomic operations by the + * filesystem). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); +static void __jbd2_journal_unfile_buffer(struct journal_head *jh); + +/* + * jbd2_get_transaction: obtain a new transaction_t object. + * + * Simply allocate and initialise a new transaction. Create it in + * RUNNING state and add it to the current journal (which should not + * have an existing running transaction: we only make a new transaction + * once we have started to commit the old one). + * + * Preconditions: + * The journal MUST be locked. 
We don't perform atomic mallocs on the + * new transaction and we can't block without protecting against other + * processes trying to touch the journal while it is in transition. + * + */ + +static transaction_t * +jbd2_get_transaction(journal_t *journal, transaction_t *transaction) +{ + transaction->t_journal = journal; + transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; + spin_lock_init(&transaction->t_handle_lock); + atomic_set(&transaction->t_updates, 0); + atomic_set(&transaction->t_outstanding_credits, 0); + atomic_set(&transaction->t_handle_count, 0); + INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); + + J_ASSERT(journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; + transaction->t_max_wait = 0; + transaction->t_start = jiffies; + + return transaction; +} + +/* + * Handle management. + * + * A handle_t is an object which represents a single atomic update to a + * filesystem, and which tracks all of the modifications which form part + * of that one update. + */ + +/* + * Update transaction's maximum wait time, if debugging is enabled. + * + * In order for t_max_wait to be reliable, it must be protected by a + * lock. But doing so will mean that start_this_handle() can not be + * run in parallel on SMP systems, which limits our scalability. So + * unless debugging is enabled, we no longer update t_max_wait, which + * means that maximum wait time reported by the jbd2_run_stats + * tracepoint will always be zero. + */ +static inline void update_t_max_wait(transaction_t *transaction, + unsigned long ts) +{ +#ifdef CONFIG_JBD2_DEBUG + if (jbd2_journal_enable_debug && + time_after(transaction->t_start, ts)) { + ts = jbd2_time_diff(ts, transaction->t_start); + spin_lock(&transaction->t_handle_lock); + if (ts > transaction->t_max_wait) + transaction->t_max_wait = ts; + spin_unlock(&transaction->t_handle_lock); + } +#endif +} + +/* + * start_this_handle: Given a handle, deal with any locking or stalling + * needed to make sure that there is enough journal space for the handle + * to begin. Attach the handle to a transaction and set up the + * transaction's buffer credits. + */ + +static int start_this_handle(journal_t *journal, handle_t *handle, + int gfp_mask) +{ + transaction_t *transaction, *new_transaction = NULL; + tid_t tid; + int needed, need_to_start; + int nblocks = handle->h_buffer_credits; + unsigned long ts = jiffies; + + if (nblocks > journal->j_max_transaction_buffers) { + printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + current->comm, nblocks, + journal->j_max_transaction_buffers); + return -ENOSPC; + } + +alloc_transaction: + if (!journal->j_running_transaction) { + new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); + if (!new_transaction) { + /* + * If __GFP_FS is not present, then we may be + * being called from inside the fs writeback + * layer, so we MUST NOT fail. Since + * __GFP_NOFAIL is going away, we will arrange + * to retry the allocation ourselves. 
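+ * We just wait for async writeback congestion to ease and retry the
+ * allocation; -ENOMEM is only returned to callers whose gfp_mask
+ * includes __GFP_FS.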
+ */ + if ((gfp_mask & __GFP_FS) == 0) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; + } + return -ENOMEM; + } + } + + jbd_debug(3, "New handle %p going live.\n", handle); + + /* + * We need to hold j_state_lock until t_updates has been incremented, + * for proper journal barrier handling + */ +repeat: + read_lock(&journal->j_state_lock); + BUG_ON(journal->j_flags & JBD2_UNMOUNT); + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { + read_unlock(&journal->j_state_lock); + kfree(new_transaction); + return -EROFS; + } + + /* Wait on the journal's transaction barrier if necessary */ + if (journal->j_barrier_count) { + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + goto repeat; + } + + if (!journal->j_running_transaction) { + read_unlock(&journal->j_state_lock); + if (!new_transaction) + goto alloc_transaction; + write_lock(&journal->j_state_lock); + if (!journal->j_running_transaction) { + jbd2_get_transaction(journal, new_transaction); + new_transaction = NULL; + } + write_unlock(&journal->j_state_lock); + goto repeat; + } + + transaction = journal->j_running_transaction; + + /* + * If the current transaction is locked down for commit, wait for the + * lock to be released. + */ + if (transaction->t_state == T_LOCKED) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_transaction_locked, + &wait, TASK_UNINTERRUPTIBLE); + read_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * If there is not enough space left in the log to write all potential + * buffers requested by this operation, we need to stall pending a log + * checkpoint to free some more log space. + */ + needed = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); + + if (needed > journal->j_max_transaction_buffers) { + /* + * If the current transaction is already too large, then start + * to commit it: we can then go back and attach this handle to + * a new transaction. + */ + DEFINE_WAIT(wait); + + jbd_debug(2, "Handle %p starting new commit...\n", handle); + atomic_sub(nblocks, &transaction->t_outstanding_credits); + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + tid = transaction->t_tid; + need_to_start = !tid_geq(journal->j_commit_request, tid); + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + * + * The worst part is, any transaction currently committing can + * reduce the free space arbitrarily. Be careful to account for + * those buffers when checkpointing. + */ + + /* + * @@@ AKPM: This seems rather over-defensive. We're giving commit + * a _lot_ of headroom: 1/4 of the journal plus the size of + * the committing transaction. 
Really, we only need to give it + * committing_transaction->t_outstanding_credits plus "enough" for + * the log control blocks. + * Also, this test is inconsistent with the matching one in + * jbd2_journal_extend(). + */ + if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { + jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); + atomic_sub(nblocks, &transaction->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + goto repeat; + } + + /* OK, account for the buffers that this operation expects to + * use and add the handle to the running transaction. + */ + update_t_max_wait(transaction, ts); + handle->h_transaction = transaction; + atomic_inc(&transaction->t_updates); + atomic_inc(&transaction->t_handle_count); + jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", + handle, nblocks, + atomic_read(&transaction->t_outstanding_credits), + __jbd2_log_space_left(journal)); + read_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); + kfree(new_transaction); + return 0; +} + +static struct lock_class_key jbd2_handle_key; + +/* Allocate a new handle. This should probably be in a slab... */ +static handle_t *new_handle(int nblocks) +{ + handle_t *handle = jbd2_alloc_handle(GFP_NOFS); + if (!handle) + return NULL; + memset(handle, 0, sizeof(*handle)); + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + + lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", + &jbd2_handle_key, 0); + + return handle; +} + +/** + * handle_t *jbd2_journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. + * + * This function is visible to journal users (like ext3fs), so is not + * called with the journal already locked. + * + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. + */ +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) +{ + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + J_ASSERT(handle->h_transaction->t_journal == journal); + handle->h_ref++; + return handle; + } + + handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); + + current->journal_info = handle; + + err = start_this_handle(journal, handle, gfp_mask); + if (err < 0) { + jbd2_free_handle(handle); + current->journal_info = NULL; + handle = ERR_PTR(err); + } + return handle; +} +EXPORT_SYMBOL(jbd2__journal_start); + + +handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +{ + return jbd2__journal_start(journal, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_start); + + +/** + * int jbd2_journal_extend() - extend buffer credits. + * @handle: handle to 'extend' + * @nblocks: nr blocks to try to extend by. + * + * Some transactions, such as large extends and truncates, can be done + * atomically all at once or in several stages. The operation requests + * a credit for a number of buffer modications in advance, but can + * extend its credit if it needs more. + * + * jbd2_journal_extend tries to give the running handle more buffer credits. 
+ * It does not guarantee that allocation - this is a best-effort only. + * The calling process MUST be able to deal cleanly with a failure to + * extend here. + * + * Return 0 on success, non-zero on failure. + * + * return code < 0 implies an error + * return code > 0 implies normal transaction-full status. + */ +int jbd2_journal_extend(handle_t *handle, int nblocks) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int result; + int wanted; + + result = -EIO; + if (is_handle_aborted(handle)) + goto out; + + result = 1; + + read_lock(&journal->j_state_lock); + + /* Don't extend a locked-down transaction! */ + if (handle->h_transaction->t_state != T_RUNNING) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction not running\n", handle, nblocks); + goto error_out; + } + + spin_lock(&transaction->t_handle_lock); + wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; + + if (wanted > journal->j_max_transaction_buffers) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction too large\n", handle, nblocks); + goto unlock; + } + + if (wanted > __jbd2_log_space_left(journal)) { + jbd_debug(3, "denied handle %p %d blocks: " + "insufficient log space\n", handle, nblocks); + goto unlock; + } + + handle->h_buffer_credits += nblocks; + atomic_add(nblocks, &transaction->t_outstanding_credits); + result = 0; + + jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); +unlock: + spin_unlock(&transaction->t_handle_lock); +error_out: + read_unlock(&journal->j_state_lock); +out: + return result; +} + + +/** + * int jbd2_journal_restart() - restart a handle . + * @handle: handle to restart + * @nblocks: nr credits requested + * + * Restart a handle for a multi-transaction filesystem + * operation. + * + * If the jbd2_journal_extend() call above fails to grant new buffer credits + * to a running handle, a call to jbd2_journal_restart will commit the + * handle's transaction so far and reattach the handle to a new + * transaction capabable of guaranteeing the requested number of + * credits. + */ +int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + tid_t tid; + int need_to_start, ret; + + /* If we've had an abort of any type, don't even think about + * actually doing the restart! */ + if (is_handle_aborted(handle)) + return 0; + + /* + * First unlink the handle from its current transaction, and start the + * commit on that. 
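+ * We hand back our buffer credits, drop t_updates (waking any waiters
+ * on j_wait_updates), kick a commit of the old transaction if one is
+ * not already requested, and then call start_this_handle() to attach
+ * the handle to a new transaction.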
+ */ + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + J_ASSERT(journal_current_handle() == handle); + + read_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + if (atomic_dec_and_test(&transaction->t_updates)) + wake_up(&journal->j_wait_updates); + spin_unlock(&transaction->t_handle_lock); + + jbd_debug(2, "restarting handle %p\n", handle); + tid = transaction->t_tid; + need_to_start = !tid_geq(journal->j_commit_request, tid); + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + + lock_map_release(&handle->h_lockdep_map); + handle->h_buffer_credits = nblocks; + ret = start_this_handle(journal, handle, gfp_mask); + return ret; +} +EXPORT_SYMBOL(jbd2__journal_restart); + + +int jbd2_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2__journal_restart(handle, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_restart); + +/** + * void jbd2_journal_lock_updates () - establish a transaction barrier. + * @journal: Journal to establish a barrier on. + * + * This locks out any further updates from being started, and blocks + * until all existing updates have completed, returning only once the + * journal is in a quiescent state with no updates running. + * + * The journal lock should not be held on entry. + */ +void jbd2_journal_lock_updates(journal_t *journal) +{ + DEFINE_WAIT(wait); + + write_lock(&journal->j_state_lock); + ++journal->j_barrier_count; + + /* Wait until there are no running updates */ + while (1) { + transaction_t *transaction = journal->j_running_transaction; + + if (!transaction) + break; + + spin_lock(&transaction->t_handle_lock); + if (!atomic_read(&transaction->t_updates)) { + spin_unlock(&transaction->t_handle_lock); + break; + } + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_updates, &wait); + write_lock(&journal->j_state_lock); + } + write_unlock(&journal->j_state_lock); + + /* + * We have now established a barrier against other normal updates, but + * we also need to barrier against other jbd2_journal_lock_updates() calls + * to make sure that we serialise special journal-locked operations + * too. + */ + mutex_lock(&journal->j_barrier); +} + +/** + * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier + * @journal: Journal to release the barrier on. + * + * Release a transaction barrier obtained with jbd2_journal_lock_updates(). + * + * Should be called without the journal lock held. + */ +void jbd2_journal_unlock_updates (journal_t *journal) +{ + J_ASSERT(journal->j_barrier_count != 0); + + mutex_unlock(&journal->j_barrier); + write_lock(&journal->j_state_lock); + --journal->j_barrier_count; + write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_transaction_locked); +} + +static void warn_dirty_buffer(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); +} + +/* + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. 
If it is already part of a prior + * transaction which we are still committing to disk, then we need to + * make sure that we do not overwrite the old copy: we do copy-out to + * preserve the copy going to disk. We also account the buffer against + * the handle's metadata buffer credits (unless the buffer is already + * part of the transaction, that is). + * + */ +static int +do_get_write_access(handle_t *handle, struct journal_head *jh, + int force_copy) +{ + struct buffer_head *bh; + transaction_t *transaction; + journal_t *journal; + int error; + char *frozen_buffer = NULL; + int need_copy = 0; + + if (is_handle_aborted(handle)) + return -EROFS; + + transaction = handle->h_transaction; + journal = transaction->t_journal; + + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); +repeat: + bh = jh2bh(jh); + + /* @@@ Need to check for errors here at some point. */ + + lock_buffer(bh); + jbd_lock_bh_state(bh); + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ + + if (buffer_dirty(bh)) { + /* + * First question: is this buffer already part of the current + * transaction or the existing committing transaction? + */ + if (jh->b_transaction) { + J_ASSERT_JH(jh, + jh->b_transaction == transaction || + jh->b_transaction == + journal->j_committing_transaction); + if (jh->b_next_transaction) + J_ASSERT_JH(jh, jh->b_next_transaction == + transaction); + warn_dirty_buffer(bh); + } + /* + * In any case we need to clean the dirty flag and we must + * do it under the buffer lock to be sure we don't race + * with running write-out. + */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); + } + + unlock_buffer(bh); + + error = -EROFS; + if (is_handle_aborted(handle)) { + jbd_unlock_bh_state(bh); + goto out; + } + error = 0; + + /* + * The buffer is already part of this transaction if b_transaction or + * b_next_transaction points to it + */ + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) + goto done; + + /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* + * If there is already a copy-out version of this buffer, then we don't + * need to make another one + */ + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + jh->b_next_transaction = transaction; + goto done; + } + + /* Is there data here we need to preserve? */ + + if (jh->b_transaction && jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* There is one case we have to be very careful about. 
+ * If the committing transaction is currently writing + * this buffer out to disk and has NOT made a copy-out, + * then we cannot modify the buffer contents at all + * right now. The essence of copy-out is that it is the + * extra copy, not the primary copy, which gets + * journaled. If the primary copy is already going to + * disk then we cannot do copy-out here. */ + + if (jh->b_jlist == BJ_Shadow) { + DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); + + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + /* commit wakes up all shadow buffers after IO */ + for ( ; ; ) { + prepare_to_wait(wqh, &wait.wait, + TASK_UNINTERRUPTIBLE); + if (jh->b_jlist != BJ_Shadow) + break; + schedule(); + } + finish_wait(wqh, &wait.wait); + goto repeat; + } + + /* Only do the copy if the currently-owning transaction + * still needs it. If it is on the Forget list, the + * committing transaction is past that stage. The + * buffer had better remain locked during the kmalloc, + * but that should be true --- we hold the journal lock + * still and the buffer is already on the BUF_JOURNAL + * list so won't be flushed. + * + * Subtle point, though: if this is a get_undo_access, + * then we will be relying on the frozen_data to contain + * the new value of the committed_data record after the + * transaction, so we HAVE to force the frozen_data copy + * in that case. */ + + if (jh->b_jlist != BJ_Forget || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + jbd_unlock_bh_state(bh); + frozen_buffer = + jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS); + if (!frozen_buffer) { + printk(KERN_EMERG + "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + jbd_lock_bh_state(bh); + goto done; + } + goto repeat; + } + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + need_copy = 1; + } + jh->b_next_transaction = transaction; + } + + + /* + * Finally, if the buffer is not journaled right now, we need to make + * sure it doesn't get written to disk before the caller actually + * commits the new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + } + +done: + if (need_copy) { + struct page *page; + int offset; + char *source; + + J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), + "Possible IO failure.\n"); + page = jh2bh(jh)->b_page; + offset = offset_in_page(jh2bh(jh)->b_data); + source = kmap_atomic(page, KM_USER0); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, + jh->b_triggers); + memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); + kunmap_atomic(source, KM_USER0); + + /* + * Now that the frozen data is saved off, we need to store + * any matching triggers. 
+ */ + jh->b_frozen_triggers = jh->b_triggers; + } + jbd_unlock_bh_state(bh); + + /* + * If we are about to journal a buffer, then any revoke pending on it is + * no longer valid + */ + jbd2_journal_cancel_revoke(handle, jh); + +out: + if (unlikely(frozen_buffer)) /* It's usually NULL */ + jbd2_free(frozen_buffer, bh->b_size); + + JBUFFER_TRACE(jh, "exit"); + return error; +} + +/** + * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * @handle: transaction to add buffer modifications to + * @bh: bh to be used for metadata writes + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + +int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh = jbd2_journal_add_journal_head(bh); + int rc; + + /* We do not want to get caught playing with fields which the + * log thread also manipulates. Make sure that the buffer + * completes any outstanding IO before proceeding. */ + rc = do_get_write_access(handle, jh, 0); + jbd2_journal_put_journal_head(jh); + return rc; +} + + +/* + * When the user wants to journal a newly created buffer_head + * (ie. getblk() returned a new buffer and we are going to populate it + * manually rather than reading off disk), then we need to keep the + * buffer_head locked until it has been completely filled with new + * data. In this case, we should be able to make the assertion that + * the bh is not already part of an existing transaction. + * + * The buffer should already be locked by the caller by this point. + * There is no lock ranking violation: it was a newly created, + * unlocked buffer beforehand. */ + +/** + * int jbd2_journal_get_create_access () - notify intent to use newly created bh + * @handle: transaction to new buffer to + * @bh: new buffer. + * + * Call this if you create a new bh. + */ +int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = jbd2_journal_add_journal_head(bh); + int err; + + jbd_debug(5, "journal_head %p\n", jh); + err = -EROFS; + if (is_handle_aborted(handle)) + goto out; + err = 0; + + JBUFFER_TRACE(jh, "entry"); + /* + * The buffer may already belong to this transaction due to pre-zeroing + * in the filesystem's new_block code. It may also be on the previous, + * committing transaction's lists, but it HAS to be in Forget state in + * that case: the transaction must have deleted the buffer for it to be + * reused here. + */ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + J_ASSERT_JH(jh, (jh->b_transaction == transaction || + jh->b_transaction == NULL || + (jh->b_transaction == journal->j_committing_transaction && + jh->b_jlist == BJ_Forget))); + + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + + if (jh->b_transaction == NULL) { + /* + * Previous jbd2_journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. 
+ */ + clear_buffer_dirty(jh2bh(jh)); + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + + JBUFFER_TRACE(jh, "set next transaction"); + jh->b_next_transaction = transaction; + } + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + + /* + * akpm: I added this. ext3_alloc_branch can pick up new indirect + * blocks which contain freed but then revoked metadata. We need + * to cancel the revoke in case we end up freeing it yet again + * and the reallocating as data - this would cause a second revoke, + * which hits an assertion error. + */ + JBUFFER_TRACE(jh, "cancelling revoke"); + jbd2_journal_cancel_revoke(handle, jh); +out: + jbd2_journal_put_journal_head(jh); + return err; +} + +/** + * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with + * non-rewindable consequences + * @handle: transaction + * @bh: buffer to undo + * + * Sometimes there is a need to distinguish between metadata which has + * been committed to disk and that which has not. The ext3fs code uses + * this for freeing and allocating space, we have to make sure that we + * do not reuse freed space until the deallocation has been committed, + * since if we overwrote that space we would make the delete + * un-rewindable in case of a crash. + * + * To deal with that, jbd2_journal_get_undo_access requests write access to a + * buffer for parts of non-rewindable operations such as delete + * operations on the bitmaps. The journaling code must keep a copy of + * the buffer's contents prior to the undo_access call until such time + * as we know that the buffer has definitely been committed to disk. + * + * We never need to know which transaction the committed data is part + * of, buffers touched here are guaranteed to be dirtied later and so + * will be committed to a new transaction in due course, at which point + * we can discard the old committed data pointer. + * + * Returns error number or 0 on success. + */ +int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) +{ + int err; + struct journal_head *jh = jbd2_journal_add_journal_head(bh); + char *committed_data = NULL; + + JBUFFER_TRACE(jh, "entry"); + + /* + * Do this first --- it can drop the journal lock, so we want to + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ + err = do_get_write_access(handle, jh, 1); + if (err) + goto out; + +repeat: + if (!jh->b_committed_data) { + committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); + if (!committed_data) { + printk(KERN_EMERG "%s: No memory for committed data\n", + __func__); + err = -ENOMEM; + goto out; + } + } + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. 
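+ * If we get here without having allocated a copy (the unlocked check
+ * above saw b_committed_data set, but it has since gone away), drop
+ * the state lock and retry the allocation; a copy that turns out to
+ * be unneeded is freed at 'out' below.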
*/ + JBUFFER_TRACE(jh, "generate b_committed data"); + if (!committed_data) { + jbd_unlock_bh_state(bh); + goto repeat; + } + + jh->b_committed_data = committed_data; + committed_data = NULL; + memcpy(jh->b_committed_data, bh->b_data, bh->b_size); + } + jbd_unlock_bh_state(bh); +out: + jbd2_journal_put_journal_head(jh); + if (unlikely(committed_data)) + jbd2_free(committed_data, bh->b_size); + return err; +} + +/** + * void jbd2_journal_set_triggers() - Add triggers for commit writeout + * @bh: buffer to trigger on + * @type: struct jbd2_buffer_trigger_type containing the trigger(s). + * + * Set any triggers on this journal_head. This is always safe, because + * triggers for a committing buffer will be saved off, and triggers for + * a running transaction will match the buffer in that transaction. + * + * Call with NULL to clear the triggers. + */ +void jbd2_journal_set_triggers(struct buffer_head *bh, + struct jbd2_buffer_trigger_type *type) +{ + struct journal_head *jh = bh2jh(bh); + + jh->b_triggers = type; +} + +void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, + struct jbd2_buffer_trigger_type *triggers) +{ + struct buffer_head *bh = jh2bh(jh); + + if (!triggers || !triggers->t_frozen) + return; + + triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); +} + +void jbd2_buffer_abort_trigger(struct journal_head *jh, + struct jbd2_buffer_trigger_type *triggers) +{ + if (!triggers || !triggers->t_abort) + return; + + triggers->t_abort(triggers, jh2bh(jh)); +} + + + +/** + * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata + * @handle: transaction to add buffer to. + * @bh: buffer to mark + * + * mark dirty metadata which needs to be journaled as part of the current + * transaction. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. + * + * Returns error number or 0 on success. + * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + */ +int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = bh2jh(bh); + + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + if (is_handle_aborted(handle)) + goto out; + + jbd_lock_bh_state(bh); + + if (jh->b_modified == 0) { + /* + * This buffer's got modified and becoming part + * of the transaction. This needs to be done + * once a transaction -bzzz + */ + jh->b_modified = 1; + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + } + + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. + * Nobody can take it off again because there is a handle open. + * I _think_ we're OK here with SMP barriers - a mistaken decision will + * result in this test being false, so we go in and take the locks. 
+ */ + if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { + JBUFFER_TRACE(jh, "fastpath"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_running_transaction); + goto out_unlock_bh; + } + + set_buffer_jbddirty(bh); + + /* + * Metadata already on the current transaction list doesn't + * need to be filed. Metadata on another transaction's list must + * be committing, and will be refiled once the commit completes: + * leave it alone for now. + */ + if (jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "already on other transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + /* And this case is illegal: we can't reuse another + * transaction's data buffer, ever. */ + goto out_unlock_bh; + } + + /* That test should have eliminated the following case: */ + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + + JBUFFER_TRACE(jh, "file as BJ_Metadata"); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); + spin_unlock(&journal->j_list_lock); +out_unlock_bh: + jbd_unlock_bh_state(bh); +out: + JBUFFER_TRACE(jh, "exit"); + return 0; +} + +/* + * jbd2_journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * + */ +void +jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUFFER_TRACE(bh, "entry"); +} + +/** + * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. + * @handle: transaction handle + * @bh: bh to 'forget' + * + * We can only do the bforget if there are no commits pending against the + * buffer. If the buffer is dirty in the current running transaction we + * can safely unlink it. + * + * bh may not be a journalled buffer at all - it may be a non-JBD + * buffer which came off the hashtable. Check for this. + * + * Decrements bh->b_count by one. + * + * Allow this call even if the handle has aborted --- it may be part of + * the caller's cleanup after an abort. + */ +int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh; + int drop_reserve = 0; + int err = 0; + int was_modified = 0; + + BUFFER_TRACE(bh, "entry"); + + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + if (!buffer_jbd(bh)) + goto not_jbd; + jh = bh2jh(bh); + + /* Critical error: attempting to delete a bitmap buffer, maybe? + * Don't do any jbd operations, and return an error. */ + if (!J_EXPECT_JH(jh, !jh->b_committed_data, + "inconsistent data on disk")) { + err = -EIO; + goto not_jbd; + } + + /* keep track of wether or not this transaction modified us */ + was_modified = jh->b_modified; + + /* + * The buffer's going from the transaction, we must drop + * all references -bzzz + */ + jh->b_modified = 0; + + if (jh->b_transaction == handle->h_transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + + /* If we are forgetting a buffer which is already part + * of this transaction, then we can just drop it from + * the transaction immediately. */ + clear_buffer_dirty(bh); + clear_buffer_jbddirty(bh); + + JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); + + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; + + /* + * We are no longer going to journal this buffer. 
+ * However, the commit of this transaction is still + * important to the buffer: the delete that we are now + * processing might obsolete an old log entry, so by + * committing, we can satisfy the buffer's checkpoint. + * + * So, if we have a checkpoint on the buffer, we should + * now refile the buffer on our BJ_Forget list so that + * we know to remove the checkpoint after we commit. + */ + + if (jh->b_cp_transaction) { + __jbd2_journal_temp_unlink_buffer(jh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + } else { + __jbd2_journal_unfile_buffer(jh); + if (!buffer_jbd(bh)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; + } + } + } else if (jh->b_transaction) { + J_ASSERT_JH(jh, (jh->b_transaction == + journal->j_committing_transaction)); + /* However, if the buffer is still owned by a prior + * (committing) transaction, we can't drop it yet... */ + JBUFFER_TRACE(jh, "belongs to older transaction"); + /* ... but we CAN drop it from the new transaction if we + * have also modified it since the original commit. */ + + if (jh->b_next_transaction) { + J_ASSERT(jh->b_next_transaction == transaction); + jh->b_next_transaction = NULL; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; + } + } + +not_jbd: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); +drop: + if (drop_reserve) { + /* no need to reserve log space for this block -bzzz */ + handle->h_buffer_credits++; + } + return err; +} + +/** + * int jbd2_journal_stop() - complete a transaction + * @handle: tranaction to complete. + * + * All done for a particular handle. + * + * There is not much action needed here. We just return any remaining + * buffer credits to the transaction and remove the handle. The only + * complication is that we need to start a commit operation if the + * filesystem is marked for synchronous update. + * + * jbd2_journal_stop itself will not usually return an error, but it may + * do so in unusual circumstances. In particular, expect it to + * return -EIO if a jbd2_journal_abort has been executed since the + * transaction began. + */ +int jbd2_journal_stop(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int err, wait_for_commit = 0; + tid_t tid; + pid_t pid; + + J_ASSERT(journal_current_handle() == handle); + + if (is_handle_aborted(handle)) + err = -EIO; + else { + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + err = 0; + } + + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } + + jbd_debug(4, "Handle %p going down\n", handle); + + /* + * Implement synchronous transaction batching. If the handle + * was synchronous, don't force a commit immediately. Let's + * yield and let another thread piggyback onto this + * transaction. Keep doing that while new threads continue to + * arrive. It doesn't cost much - we're about to run a commit + * and sleep on IO anyway. Speeds up many-threaded, many-dir + * operations by 30x or more... + * + * We try and optimize the sleep time against what the + * underlying disk can do, instead of having a static sleep + * time. 
This is useful for the case where our storage is so + * fast that it is more optimal to go ahead and force a flush + * and wait for the transaction to be committed than it is to + * wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how + * long it takes to commit a transaction, and compare it with + * how long this transaction has been running, and if run time + * < commit time then we sleep for the delta and commit. This + * greatly helps super fast disks that would see slowdowns as + * more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one + * to perform a synchronous write. We do this to detect the + * case where a single process is doing a stream of sync + * writes. No point in waiting for joiners in that case. + */ + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + + journal->j_last_sync_writer = pid; + + read_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + read_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = max_t(u64, commit_time, + 1000*journal->j_min_batch_time); + commit_time = min_t(u64, commit_time, + 1000*journal->j_max_batch_time); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + } + + if (handle->h_sync) + transaction->t_synchronous_commit = 1; + current->journal_info = NULL; + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current + * transaction is occupying too much of the log, or if the + * transaction is too old now. + */ + if (handle->h_sync || + (atomic_read(&transaction->t_outstanding_credits) > + journal->j_max_transaction_buffers) || + time_after_eq(jiffies, transaction->t_expires)) { + /* Do this even for aborted journals: an abort still + * completes the commit thread, it just doesn't write + * anything to disk. */ + + jbd_debug(2, "transaction too old, requesting commit for " + "handle %p\n", handle); + /* This is non-blocking */ + jbd2_log_start_commit(journal, transaction->t_tid); + + /* + * Special case: JBD2_SYNC synchronous updates require us + * to wait for the commit to complete. + */ + if (handle->h_sync && !(current->flags & PF_MEMALLOC)) + wait_for_commit = 1; + } + + /* + * Once we drop t_updates, if it goes to zero the transaction + * could start committing on us and eventually disappear. So + * once we do this, we must not dereference transaction + * pointer again. + */ + tid = transaction->t_tid; + if (atomic_dec_and_test(&transaction->t_updates)) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + if (wait_for_commit) + err = jbd2_log_wait_commit(journal, tid); + + lock_map_release(&handle->h_lockdep_map); + + jbd2_free_handle(handle); + return err; +} + +/** + * int jbd2_journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * For synchronous operations: force any uncommitted transactions + * to disk. May seem kludgy, but it reuses all the handle batching + * code in a very simple manner. 
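+ *
+ * In effect this is:
+ *
+ *	handle = jbd2_journal_start(journal, 1);
+ *	handle->h_sync = 1;
+ *	err = jbd2_journal_stop(handle);
+ *
+ * and the return value is the error from jbd2_journal_stop(), or from
+ * jbd2_journal_start() if the handle could not be obtained.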
+ */ +int jbd2_journal_force_commit(journal_t *journal) +{ + handle_t *handle; + int ret; + + handle = jbd2_journal_start(journal, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + } else { + handle->h_sync = 1; + ret = jbd2_journal_stop(handle); + } + return ret; +} + +/* + * + * List management code snippets: various functions for manipulating the + * transaction buffer lists. + * + */ + +/* + * Append a buffer to a transaction list, given the transaction's list head + * pointer. + * + * j_list_lock is held. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_add_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (!*list) { + jh->b_tnext = jh->b_tprev = jh; + *list = jh; + } else { + /* Insert at the tail of the list to preserve order */ + struct journal_head *first = *list, *last = first->b_tprev; + jh->b_tprev = last; + jh->b_tnext = first; + last->b_tnext = first->b_tprev = jh; + } +} + +/* + * Remove a buffer from a transaction list, given the transaction's list + * head pointer. + * + * Called with j_list_lock held, and the journal may not be locked. + * + * jbd_lock_bh_state(jh2bh(jh)) is held. + */ + +static inline void +__blist_del_buffer(struct journal_head **list, struct journal_head *jh) +{ + if (*list == jh) { + *list = jh->b_tnext; + if (*list == jh) + *list = NULL; + } + jh->b_tprev->b_tnext = jh->b_tnext; + jh->b_tnext->b_tprev = jh->b_tprev; +} + +/* + * Remove a buffer from the appropriate transaction list. + * + * Note that this function can *change* the value of + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one + * of these pointers, it could go bad. Generally the caller needs to re-read + * the pointer from the transaction_t. + * + * Called under j_list_lock. The journal may not be locked. + */ +void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) +{ + struct journal_head **list = NULL; + transaction_t *transaction; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + transaction = jh->b_transaction; + if (transaction) + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + if (jh->b_jlist != BJ_None) + J_ASSERT_JH(jh, transaction != NULL); + + switch (jh->b_jlist) { + case BJ_None: + return; + case BJ_Metadata: + transaction->t_nr_buffers--; + J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_del_buffer(list, jh); + jh->b_jlist = BJ_None; + if (test_clear_buffer_jbddirty(bh)) + mark_buffer_dirty(bh); /* Expose it to the VM */ +} + +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. 
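+ * (jbd2_journal_unfile_buffer() below takes its own bh reference around
+ * the call for exactly this reason.)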
+ */ +static void __jbd2_journal_unfile_buffer(struct journal_head *jh) +{ + __jbd2_journal_temp_unlink_buffer(jh); + jh->b_transaction = NULL; + jbd2_journal_put_journal_head(jh); +} + +void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + __jbd2_journal_unfile_buffer(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); +} + +/* + * Called from jbd2_journal_try_to_free_buffers(). + * + * Called under jbd_lock_bh_state(bh) + */ +static void +__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) +{ + struct journal_head *jh; + + jh = bh2jh(bh); + + if (buffer_locked(bh) || buffer_dirty(bh)) + goto out; + + if (jh->b_next_transaction != NULL) + goto out; + + spin_lock(&journal->j_list_lock); + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + /* written-back checkpointed metadata buffer */ + if (jh->b_jlist == BJ_None) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __jbd2_journal_remove_checkpoint(jh); + } + } + spin_unlock(&journal->j_list_lock); +out: + return; +} + +/** + * int jbd2_journal_try_to_free_buffers() - try to free page buffers. + * @journal: journal for operation + * @page: to try and free + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. + * + * + * For all the buffers on this page, + * if they are fully written out ordered data, move them onto BUF_CLEAN + * so try_to_free_buffers() can reap them. + * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this if the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + * + * This complicates JBD locking somewhat. We aren't protected by the + * BKL here. We wish to remove the buffer from its committing or + * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer. + * + * This may *change* the value of transaction_t->t_datalist, so anyone + * who looks at t_datalist needs to lock against this function. + * + * Even worse, someone may be doing a jbd2_journal_dirty_data on this + * buffer. So we need to lock against that. jbd2_journal_dirty_data() + * will come out of the lock with the buffer dirty, which makes it + * ineligible for release here. + * + * Who else is affected by this? hmm... Really the only contender + * is do_get_write_access() - it could be looking at the buffer while + * journal_try_to_free_buffer() is changing its state. But that + * cannot happen because we never reallocate freed data as metadata + * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success + */ +int jbd2_journal_try_to_free_buffers(journal_t *journal, + struct page *page, gfp_t gfp_mask) +{ + struct buffer_head *head; + struct buffer_head *bh; + int ret = 0; + + J_ASSERT(PageLocked(page)); + + head = page_buffers(page); + bh = head; + do { + struct journal_head *jh; + + /* + * We take our own ref against the journal_head here to avoid + * having to add tons of locking around each instance of + * jbd2_journal_put_journal_head(). 
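+ * jbd2_journal_grab_journal_head() returns NULL for a buffer that has
+ * no journal_head attached, in which case we simply move on to the
+ * next buffer on the page.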
+ */ + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) + continue; + + jbd_lock_bh_state(bh); + __journal_try_to_free_buffer(journal, bh); + jbd2_journal_put_journal_head(jh); + jbd_unlock_bh_state(bh); + if (buffer_jbd(bh)) + goto busy; + } while ((bh = bh->b_this_page) != head); + + ret = try_to_free_buffers(page); + +busy: + return ret; +} + +/* + * This buffer is no longer needed. If it is on an older transaction's + * checkpoint list we need to record it on this transaction's forget list + * to pin this buffer (and hence its checkpointing transaction) down until + * this transaction commits. If the buffer isn't on a checkpoint list, we + * release it. + * Returns non-zero if JBD no longer has an interest in the buffer. + * + * Called under j_list_lock. + * + * Called under jbd_lock_bh_state(bh). + */ +static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) +{ + int may_free = 1; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "on running+cp transaction"); + __jbd2_journal_temp_unlink_buffer(jh); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + may_free = 0; + } else { + JBUFFER_TRACE(jh, "on running transaction"); + __jbd2_journal_unfile_buffer(jh); + } + return may_free; +} + +/* + * jbd2_journal_invalidatepage + * + * This code is tricky. It has a number of cases to deal with. + * + * There are two invariants which this code relies on: + * + * i_size must be updated on disk before we start calling invalidatepage on the + * data. + * + * This is done in ext3 by defining an ext3_setattr method which + * updates i_size before truncate gets going. By maintaining this + * invariant, we can be sure that it is safe to throw away any buffers + * attached to the current transaction: once the transaction commits, + * we know that the data will not be needed. + * + * Note however that we can *not* throw away data belonging to the + * previous, committing transaction! + * + * Any disk blocks which *are* part of the previous, committing + * transaction (and which therefore cannot be discarded immediately) are + * not going to be reused in the new running transaction + * + * The bitmap committed_data images guarantee this: any block which is + * allocated in one transaction and removed in the next will be marked + * as in-use in the committed_data bitmap, so cannot be reused until + * the next transaction to delete the block commits. This means that + * leaving committing buffers dirty is quite safe: the disk blocks + * cannot be reallocated to a different file and so buffer aliasing is + * not possible. + * + * + * The above applies mainly to ordered data mode. In writeback mode we + * don't make guarantees about the order in which data hits disk --- in + * particular we don't guarantee that new dirty data is flushed before + * transaction commit --- so it is always safe just to discard data + * immediately in that mode. --sct + */ + +/* + * The journal_unmap_buffer helper function returns zero if the buffer + * concerned remains pinned as an anonymous buffer belonging to an older + * transaction. + * + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. 
+ */ +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +{ + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; + int ret; + + BUFFER_TRACE(bh, "entry"); + + /* + * It is safe to proceed here without the j_list_lock because the + * buffers cannot be stolen by try_to_free_buffers as long as we are + * holding the page lock. --sct + */ + + if (!buffer_jbd(bh)) + goto zap_buffer_unlocked; + + /* OK, we have data buffer in journaled mode */ + write_lock(&journal->j_state_lock); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) + goto zap_buffer_no_jh; + + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ + transaction = jh->b_transaction; + if (transaction == NULL) { + /* First case: not on any transaction. If it + * has no checkpoint link, then we can zap it: + * it's a writeback-mode buffer so we don't care + * if it hits disk safely. */ + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "not on any transaction: zap"); + goto zap_buffer; + } + + if (!buffer_dirty(bh)) { + /* bdflush has written it. We can drop it now */ + goto zap_buffer; + } + + /* OK, it must be in the journal but still not + * written fully to disk: it's metadata or + * journaled data... */ + + if (journal->j_running_transaction) { + /* ... and once the current transaction has + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); + ret = __dispose_buffer(jh, + journal->j_running_transaction); + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + return ret; + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have + * passed into commit. We must attach this buffer to + * the committing transaction, if it exists. */ + if (journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "give to committing trans"); + ret = __dispose_buffer(jh, + journal->j_committing_transaction); + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + return ret; + } else { + /* The orphan record's transaction has + * committed. We can cleanse this buffer */ + clear_buffer_jbddirty(bh); + goto zap_buffer; + } + } + } else if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "on committing transaction"); + /* + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. 
+ */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; + jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); + return 0; + } else { + /* Good, the buffer belongs to the running transaction. + * We are writing our own transaction's data, not any + * previous one's, so it is safe to throw it away + * (remember that we expect the filesystem to have set + * i_size already for this truncate so recovery will not + * expose the disk blocks we are discarding here.) */ + J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + JBUFFER_TRACE(jh, "on running transaction"); + may_free = __dispose_buffer(jh, transaction); + } + +zap_buffer: + jbd2_journal_put_journal_head(jh); +zap_buffer_no_jh: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); +zap_buffer_unlocked: + clear_buffer_dirty(bh); + J_ASSERT_BH(bh, !buffer_jbddirty(bh)); + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + bh->b_bdev = NULL; + return may_free; +} + +/** + * void jbd2_journal_invalidatepage() + * @journal: journal to use for flush... + * @page: page to flush + * @offset: length of page to invalidate. + * + * Reap page buffers containing data after offset in page. + * + */ +void jbd2_journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + int may_free = 1; + + if (!PageLocked(page)) + BUG(); + if (!page_has_buffers(page)) + return; + + /* We will potentially be playing with lists other than just the + * data lists (especially for journaled data mode), so be + * cautious in our locking. */ + + head = bh = page_buffers(page); + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (offset <= curr_off) { + /* This block is wholly outside the truncation point */ + lock_buffer(bh); + may_free &= journal_unmap_buffer(journal, bh); + unlock_buffer(bh); + } + curr_off = next_off; + bh = next; + + } while (bh != head); + + if (!offset) { + if (may_free && try_to_free_buffers(page)) + J_ASSERT(!page_has_buffers(page)); + } +} + +/* + * File a buffer on the given transaction list. + */ +void __jbd2_journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + struct journal_head **list = NULL; + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == NULL); + + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. 
+ */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); + if (test_clear_buffer_dirty(bh) || + test_clear_buffer_jbddirty(bh)) + was_dirty = 1; + } + + if (jh->b_transaction) + __jbd2_journal_temp_unlink_buffer(jh); + else + jbd2_journal_grab_journal_head(bh); + jh->b_transaction = transaction; + + switch (jlist) { + case BJ_None: + J_ASSERT_JH(jh, !jh->b_committed_data); + J_ASSERT_JH(jh, !jh->b_frozen_data); + return; + case BJ_Metadata: + transaction->t_nr_buffers++; + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_add_buffer(list, jh); + jh->b_jlist = jlist; + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +void jbd2_journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&transaction->t_journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, jlist); + spin_unlock(&transaction->t_journal->j_list_lock); + jbd_unlock_bh_state(jh2bh(jh)); +} + +/* + * Remove a buffer from its current buffer list in preparation for + * dropping it from its current transaction entirely. If the buffer has + * already started to be used by a subsequent transaction, refile the + * buffer on that transaction's metadata list. + * + * Called under j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns + */ +void __jbd2_journal_refile_buffer(struct journal_head *jh) +{ + int was_dirty, jlist; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + + /* If the buffer is now unused, just drop it. */ + if (jh->b_next_transaction == NULL) { + __jbd2_journal_unfile_buffer(jh); + return; + } + + /* + * It has been modified by a later transaction: add it to the new + * transaction's metadata list. + */ + + was_dirty = test_clear_buffer_jbddirty(bh); + __jbd2_journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __jbd2_journal_file_buffer() must not + * take a new one. + */ + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +/* + * __jbd2_journal_refile_buffer() with necessary locking added. We take our + * bh reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. 
+ */ +void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_list_lock); + __brelse(bh); +} + +/* + * File inode in the inode list of the handle's transaction + */ +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + + if (is_handle_aborted(handle)) + return -EIO; + + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + transaction->t_tid); + + /* + * First check whether inode isn't already on the transaction's + * lists without taking the lock. Note that this check is safe + * without the lock as we cannot race with somebody removing inode + * from the transaction. The reason is that we remove inode from the + * transaction only in journal_release_jbd_inode() and when we commit + * the transaction. We are guarded from the first case by holding + * a reference to the inode. We are safe against the second case + * because if jinode->i_transaction == transaction, commit code + * cannot touch the transaction because we hold reference to it, + * and if jinode->i_next_transaction == transaction, commit code + * will only file the inode where we want it. + */ + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + return 0; + + spin_lock(&journal->j_list_lock); + + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + goto done; + + /* + * We only ever set this variable to 1 so the test is safe. Since + * t_need_data_flush is likely to be set, we do the test to save some + * cacheline bouncing + */ + if (!transaction->t_need_data_flush) + transaction->t_need_data_flush = 1; + /* On some different transaction's list - should be + * the committing one */ + if (jinode->i_transaction) { + J_ASSERT(jinode->i_next_transaction == NULL); + J_ASSERT(jinode->i_transaction == + journal->j_committing_transaction); + jinode->i_next_transaction = transaction; + goto done; + } + /* Not on any transaction list... */ + J_ASSERT(!jinode->i_next_transaction); + jinode->i_transaction = transaction; + list_add(&jinode->i_list, &transaction->t_inode_list); +done: + spin_unlock(&journal->j_list_lock); + + return 0; +} + +/* + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. 
The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). + */ +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, + loff_t new_size) +{ + transaction_t *inode_trans, *commit_trans; + int ret = 0; + + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) + goto out; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ + read_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + read_unlock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + new_size, LLONG_MAX); + if (ret) + jbd2_journal_abort(journal, ret); + } +out: + return ret; +} diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig new file mode 100644 index 00000000..6ae169cd --- /dev/null +++ b/fs/jffs2/Kconfig @@ -0,0 +1,188 @@ +config JFFS2_FS + tristate "Journalling Flash File System v2 (JFFS2) support" + select CRC32 + depends on MTD + help + JFFS2 is the second generation of the Journalling Flash File System + for use on diskless embedded devices. It provides improved wear + levelling, compression and support for hard links. You cannot use + this on normal block devices, only on 'MTD' devices. + + Further information on the design and implementation of JFFS2 is + available at . + +config JFFS2_FS_DEBUG + int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" + depends on JFFS2_FS + default "0" + help + This controls the amount of debugging messages produced by the JFFS2 + code. Set it to zero for use in production systems. For evaluation, + testing and debugging, it's advisable to set it to one. This will + enable a few assertions and will print debugging messages at the + KERN_DEBUG loglevel, where they won't normally be visible. Level 2 + is unlikely to be useful - it enables extra debugging in certain + areas which at one point needed debugging, but when the bugs were + located and fixed, the detailed messages were relegated to level 2. + + If reporting bugs, please try to have available a full dump of the + messages at debug level 1 while the misbehaviour was occurring. + +config JFFS2_FS_WRITEBUFFER + bool "JFFS2 write-buffering support" + depends on JFFS2_FS + default y + help + This enables the write-buffering support in JFFS2. + + This functionality is required to support JFFS2 on the following + types of flash devices: + - NAND flash + - NOR flash with transparent ECC + - DataFlash + +config JFFS2_FS_WBUF_VERIFY + bool "Verify JFFS2 write-buffer reads" + depends on JFFS2_FS_WRITEBUFFER + default n + help + This causes JFFS2 to read back every page written through the + write-buffer, and check for errors. + +config JFFS2_SUMMARY + bool "JFFS2 summary support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + This feature makes it possible to use summary information + for faster filesystem mount. 
+ + The summary information can be inserted into a filesystem image + by the utility 'sumtool'. + + If unsure, say 'N'. + +config JFFS2_FS_XATTR + bool "JFFS2 XATTR support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config JFFS2_FS_POSIX_ACL + bool "JFFS2 POSIX Access Control Lists" + depends on JFFS2_FS_XATTR + default y + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config JFFS2_FS_SECURITY + bool "JFFS2 Security Labels" + depends on JFFS2_FS_XATTR + default y + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jffs2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFFS2_COMPRESSION_OPTIONS + bool "Advanced compression options for JFFS2" + depends on JFFS2_FS + default n + help + Enabling this option allows you to explicitly choose which + compression modules, if any, are enabled in JFFS2. Removing + compressors can mean you cannot read existing file systems, + and enabling experimental compressors can mean that you + write a file system which cannot be read by a standard kernel. + + If unsure, you should _definitely_ say 'N'. + +config JFFS2_ZLIB + bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS + select ZLIB_INFLATE + select ZLIB_DEFLATE + depends on JFFS2_FS + default y + help + Zlib is designed to be a free, general-purpose, legally unencumbered, + lossless data-compression library for use on virtually any computer + hardware and operating system. See for + further information. + + Say 'Y' if unsure. + +config JFFS2_LZO + bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS + select LZO_COMPRESS + select LZO_DECOMPRESS + depends on JFFS2_FS + default n + help + minilzo-based compression. Generally works better than Zlib. + + This feature was added in July, 2007. Say 'N' if you need + compatibility with older bootloaders or kernels. + +config JFFS2_RTIME + bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default y + help + Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. + +config JFFS2_RUBIN + bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default n + help + RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. + +choice + prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS + default JFFS2_CMODE_PRIORITY + depends on JFFS2_FS + help + You can set here the default compression mode of JFFS2 from + the available compression modes. Don't touch if unsure. + +config JFFS2_CMODE_NONE + bool "no compression" + help + Uses no compression. + +config JFFS2_CMODE_PRIORITY + bool "priority" + help + Tries the compressors in a predefined order and chooses the first + successful one. + +config JFFS2_CMODE_SIZE + bool "size (EXPERIMENTAL)" + help + Tries all compressors and chooses the one which has the smallest + result. 
+ +config JFFS2_CMODE_FAVOURLZO + bool "Favour LZO" + help + Tries all compressors and chooses the one which has the smallest + result but gives some preference to LZO (which has faster + decompression) at the expense of size. + +endchoice diff --git a/fs/jffs2/LICENCE b/fs/jffs2/LICENCE new file mode 100644 index 00000000..56288590 --- /dev/null +++ b/fs/jffs2/LICENCE @@ -0,0 +1,30 @@ +The files in this directory and elsewhere which refer to this LICENCE +file are part of JFFS2, the Journalling Flash File System v2. + + Copyright © 2001-2007 Red Hat, Inc. and others + +JFFS2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2 or (at your option) any later +version. + +JFFS2 is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with JFFS2; if not, write to the Free Software Foundation, Inc., +59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + +As a special exception, if other files instantiate templates or use +macros or inline functions from these files, or you compile these +files and link them with other works to produce a work based on these +files, these files do not by themselves cause the resulting work to be +covered by the GNU General Public License. However the source code for +these files must still be made available in accordance with section (3) +of the GNU General Public License. + +This exception does not invalidate any other reasons why a work based on +this file might be covered by the GNU General Public License. + diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile new file mode 100644 index 00000000..60e5d49c --- /dev/null +++ b/fs/jffs2/Makefile @@ -0,0 +1,21 @@ +# +# Makefile for the Linux Journalling Flash File System v2 (JFFS2) +# +# + +obj-$(CONFIG_JFFS2_FS) += jffs2.o + +jffs2-y := compr.o dir.o file.o ioctl.o nodelist.o malloc.o +jffs2-y += read.o nodemgmt.o readinode.o write.o scan.o gc.o +jffs2-y += symlink.o build.o erase.o background.o fs.o writev.o +jffs2-y += super.o debug.o + +jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER) += wbuf.o +jffs2-$(CONFIG_JFFS2_FS_XATTR) += xattr.o xattr_trusted.o xattr_user.o +jffs2-$(CONFIG_JFFS2_FS_SECURITY) += security.o +jffs2-$(CONFIG_JFFS2_FS_POSIX_ACL) += acl.o +jffs2-$(CONFIG_JFFS2_RUBIN) += compr_rubin.o +jffs2-$(CONFIG_JFFS2_RTIME) += compr_rtime.o +jffs2-$(CONFIG_JFFS2_ZLIB) += compr_zlib.o +jffs2-$(CONFIG_JFFS2_LZO) += compr_lzo.o +jffs2-$(CONFIG_JFFS2_SUMMARY) += summary.o diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking new file mode 100644 index 00000000..3ea36554 --- /dev/null +++ b/fs/jffs2/README.Locking @@ -0,0 +1,172 @@ + + JFFS2 LOCKING DOCUMENTATION + --------------------------- + +At least theoretically, JFFS2 does not require the Big Kernel Lock +(BKL), which was always helpfully obtained for it by Linux 2.4 VFS +code. It has its own locking, as described below. + +This document attempts to describe the existing locking rules for +JFFS2. It is not expected to remain perfectly up to date, but ought to +be fairly close. + + + alloc_sem + --------- + +The alloc_sem is a per-filesystem mutex, used primarily to ensure +contiguous allocation of space on the medium. 
It is automatically +obtained during space allocations (jffs2_reserve_space()) and freed +upon write completion (jffs2_complete_reservation()). Note that +the garbage collector will obtain this right at the beginning of +jffs2_garbage_collect_pass() and release it at the end, thereby +preventing any other write activity on the file system during a +garbage collect pass. + +When writing new nodes, the alloc_sem must be held until the new nodes +have been properly linked into the data structures for the inode to +which they belong. This is for the benefit of NAND flash - adding new +nodes to an inode may obsolete old ones, and by holding the alloc_sem +until this happens we ensure that any data in the write-buffer at the +time this happens are part of the new node, not just something that +was written afterwards. Hence, we can ensure the newly-obsoleted nodes +don't actually get erased until the write-buffer has been flushed to +the medium. + +With the introduction of NAND flash support and the write-buffer, +the alloc_sem is also used to protect the wbuf-related members of the +jffs2_sb_info structure. Atomically reading the wbuf_len member to see +if the wbuf is currently holding any data is permitted, though. + +Ordering constraints: See f->sem. + + + File Mutex f->sem + --------------------- + +This is the JFFS2-internal equivalent of the inode mutex i->i_sem. +It protects the contents of the jffs2_inode_info private inode data, +including the linked list of node fragments (but see the notes below on +erase_completion_lock), etc. + +The reason that the i_sem itself isn't used for this purpose is to +avoid deadlocks with garbage collection -- the VFS will lock the i_sem +before calling a function which may need to allocate space. The +allocation may trigger garbage-collection, which may need to move a +node belonging to the inode which was locked in the first place by the +VFS. If the garbage collection code were to attempt to lock the i_sem +of the inode from which it's garbage-collecting a physical node, this +lead to deadlock, unless we played games with unlocking the i_sem +before calling the space allocation functions. + +Instead of playing such games, we just have an extra internal +mutex, which is obtained by the garbage collection code and also +by the normal file system code _after_ allocation of space. + +Ordering constraints: + + 1. Never attempt to allocate space or lock alloc_sem with + any f->sem held. + 2. Never attempt to lock two file mutexes in one thread. + No ordering rules have been made for doing so. + + + erase_completion_lock spinlock + ------------------------------ + +This is used to serialise access to the eraseblock lists, to the +per-eraseblock lists of physical jffs2_raw_node_ref structures, and +(NB) the per-inode list of physical nodes. The latter is a special +case - see below. + +As the MTD API no longer permits erase-completion callback functions +to be called from bottom-half (timer) context (on the basis that nobody +ever actually implemented such a thing), it's now sufficient to use +a simple spin_lock() rather than spin_lock_bh(). + +Note that the per-inode list of physical nodes (f->nodes) is a special +case. Any changes to _valid_ nodes (i.e. ->flash_offset & 1 == 0) in +the list are protected by the file mutex f->sem. But the erase code +may remove _obsolete_ nodes from the list while holding only the +erase_completion_lock. 
So you can walk the list only while holding the +erase_completion_lock, and can drop the lock temporarily mid-walk as +long as the pointer you're holding is to a _valid_ node, not an +obsolete one. + +The erase_completion_lock is also used to protect the c->gc_task +pointer when the garbage collection thread exits. The code to kill the +GC thread locks it, sends the signal, then unlocks it - while the GC +thread itself locks it, zeroes c->gc_task, then unlocks on the exit path. + + + inocache_lock spinlock + ---------------------- + +This spinlock protects the hashed list (c->inocache_list) of the +in-core jffs2_inode_cache objects (each inode in JFFS2 has the +correspondent jffs2_inode_cache object). So, the inocache_lock +has to be locked while walking the c->inocache_list hash buckets. + +This spinlock also covers allocation of new inode numbers, which is +currently just '++->highest_ino++', but might one day get more complicated +if we need to deal with wrapping after 4 milliard inode numbers are used. + +Note, the f->sem guarantees that the correspondent jffs2_inode_cache +will not be removed. So, it is allowed to access it without locking +the inocache_lock spinlock. + +Ordering constraints: + + If both erase_completion_lock and inocache_lock are needed, the + c->erase_completion has to be acquired first. + + + erase_free_sem + -------------- + +This mutex is only used by the erase code which frees obsolete node +references and the jffs2_garbage_collect_deletion_dirent() function. +The latter function on NAND flash must read _obsolete_ nodes to +determine whether the 'deletion dirent' under consideration can be +discarded or whether it is still required to show that an inode has +been unlinked. Because reading from the flash may sleep, the +erase_completion_lock cannot be held, so an alternative, more +heavyweight lock was required to prevent the erase code from freeing +the jffs2_raw_node_ref structures in question while the garbage +collection code is looking at them. + +Suggestions for alternative solutions to this problem would be welcomed. + + + wbuf_sem + -------- + +This read/write semaphore protects against concurrent access to the +write-behind buffer ('wbuf') used for flash chips where we must write +in blocks. It protects both the contents of the wbuf and the metadata +which indicates which flash region (if any) is currently covered by +the buffer. + +Ordering constraints: + Lock wbuf_sem last, after the alloc_sem or and f->sem. + + + c->xattr_sem + ------------ + +This read/write semaphore protects against concurrent access to the +xattr related objects which include stuff in superblock and ic->xref. +In read-only path, write-semaphore is too much exclusion. It's enough +by read-semaphore. But you must hold write-semaphore when updating, +creating or deleting any xattr related object. + +Once xattr_sem released, there would be no assurance for the existence +of those objects. Thus, a series of processes is often required to retry, +when updating such a object is necessary under holding read semaphore. +For example, do_jffs2_getxattr() holds read-semaphore to scan xref and +xdatum at first. But it retries this process with holding write-semaphore +after release read-semaphore, if it's necessary to load name/value pair +from medium. + +Ordering constraints: + Lock xattr_sem last, after the alloc_sem. 
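
To make the ordering constraints scattered through the sections above easier to see in one place, here is a minimal, hypothetical user-space sketch (it is not part of the patch and not real JFFS2 code): pthread primitives stand in for the kernel mutexes, spinlock and rw-semaphore, and jffs2_write_path() is an invented name. It only demonstrates the nesting order this document describes -- alloc_sem before f->sem, short erase_completion_lock sections, wbuf_sem taken last.

/*
 * Illustrative user-space sketch of the lock ordering documented above.
 * The lock names mirror JFFS2, but the types (pthread primitives) and
 * jffs2_write_path() are hypothetical stand-ins.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t alloc_sem = PTHREAD_MUTEX_INITIALIZER;             /* taken first, per-filesystem */
static pthread_mutex_t f_sem = PTHREAD_MUTEX_INITIALIZER;                 /* per-inode, only after alloc_sem */
static pthread_mutex_t erase_completion_lock = PTHREAD_MUTEX_INITIALIZER; /* a spinlock in the kernel */
static pthread_rwlock_t wbuf_sem = PTHREAD_RWLOCK_INITIALIZER;            /* ordered last */

static void jffs2_write_path(void)
{
	/* 1. Reserve space: alloc_sem is never taken with an f->sem held. */
	pthread_mutex_lock(&alloc_sem);

	/* 2. Lock the one inode being written (never two f->sem in one thread). */
	pthread_mutex_lock(&f_sem);

	/* 3. Short critical section on the eraseblock / raw node ref lists. */
	pthread_mutex_lock(&erase_completion_lock);
	/* ... manipulate eraseblock lists ... */
	pthread_mutex_unlock(&erase_completion_lock);

	/* 4. wbuf_sem comes last, after alloc_sem and f->sem. */
	pthread_rwlock_wrlock(&wbuf_sem);
	/* ... flush the write-behind buffer ... */
	pthread_rwlock_unlock(&wbuf_sem);

	pthread_mutex_unlock(&f_sem);
	pthread_mutex_unlock(&alloc_sem);  /* corresponds to write completion */
}

int main(void)
{
	jffs2_write_path();
	puts("lock ordering exercised without inversion");
	return 0;
}

Note that the deadlock discussed in the f->sem section is avoided precisely because garbage collection takes this internal mutex itself rather than relying on the VFS-held i_sem, so the ordering shown above holds for both the normal write path and the GC thread.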
diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO new file mode 100644 index 00000000..ca28964a --- /dev/null +++ b/fs/jffs2/TODO @@ -0,0 +1,37 @@ + + - support asynchronous operation -- add a per-fs 'reserved_space' count, + let each outstanding write reserve the _maximum_ amount of physical + space it could take. Let GC flush the outstanding writes because the + reservations will necessarily be pessimistic. With this we could even + do shared writable mmap, if we can have a fs hook for do_wp_page() to + make the reservation. + - disable compression in commit_write()? + - fine-tune the allocation / GC thresholds + - chattr support - turning on/off and tuning compression per-inode + - checkpointing (do we need this? scan is quite fast) + - make the scan code populate real inodes so read_inode just after + mount doesn't have to read the flash twice for large files. + Make this a per-inode option, changeable with chattr, so you can + decide which inodes should be in-core immediately after mount. + - test, test, test + + - NAND flash support: + - almost done :) + - use bad block check instead of the hardwired byte check + + - Optimisations: + - Split writes so they go to two separate blocks rather than just c->nextblock. + By writing _new_ nodes to one block, and garbage-collected REF_PRISTINE + nodes to a different one, we can separate clean nodes from those which + are likely to become dirty, and end up with blocks which are each far + closer to 100% or 0% clean, hence speeding up later GC progress dramatically. + - Stop keeping name in-core with struct jffs2_full_dirent. If we keep the hash in + the full dirent, we only need to go to the flash in lookup() when we think we've + got a match, and in readdir(). + - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately? + - Remove size from jffs2_raw_node_frag. + +dedekind: +1. __jffs2_flush_wbuf() has a strange 'pad' parameter. Eliminate. +2. get_sb()->build_fs()->scan() path... Why get_sb() removes scan()'s crap in + case of failure? scan() does not clean everything. Fix. diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c new file mode 100644 index 00000000..828a0e1e --- /dev/null +++ b/fs/jffs2/acl.c @@ -0,0 +1,440 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static size_t jffs2_acl_size(int count) +{ + if (count <= 4) { + return sizeof(struct jffs2_acl_header) + + count * sizeof(struct jffs2_acl_entry_short); + } else { + return sizeof(struct jffs2_acl_header) + + 4 * sizeof(struct jffs2_acl_entry_short) + + (count - 4) * sizeof(struct jffs2_acl_entry); + } +} + +static int jffs2_acl_count(size_t size) +{ + size_t s; + + size -= sizeof(struct jffs2_acl_header); + if (size < 4 * sizeof(struct jffs2_acl_entry_short)) { + if (size % sizeof(struct jffs2_acl_entry_short)) + return -1; + return size / sizeof(struct jffs2_acl_entry_short); + } else { + s = size - 4 * sizeof(struct jffs2_acl_entry_short); + if (s % sizeof(struct jffs2_acl_entry)) + return -1; + return s / sizeof(struct jffs2_acl_entry) + 4; + } +} + +static struct posix_acl *jffs2_acl_from_medium(void *value, size_t size) +{ + void *end = value + size; + struct jffs2_acl_header *header = value; + struct jffs2_acl_entry *entry; + struct posix_acl *acl; + uint32_t ver; + int i, count; + + if (!value) + return NULL; + if (size < sizeof(struct jffs2_acl_header)) + return ERR_PTR(-EINVAL); + ver = je32_to_cpu(header->a_version); + if (ver != JFFS2_ACL_VERSION) { + JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver); + return ERR_PTR(-EINVAL); + } + + value += sizeof(struct jffs2_acl_header); + count = jffs2_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i=0; i < count; i++) { + entry = value; + if (value + sizeof(struct jffs2_acl_entry_short) > end) + goto fail; + acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag); + acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm); + switch (acl->a_entries[i].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value += sizeof(struct jffs2_acl_entry_short); + acl->a_entries[i].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value += sizeof(struct jffs2_acl_entry); + if (value > end) + goto fail; + acl->a_entries[i].e_id = je32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) +{ + struct jffs2_acl_header *header; + struct jffs2_acl_entry *entry; + void *e; + size_t i; + + *size = jffs2_acl_size(acl->a_count); + header = kmalloc(sizeof(*header) + acl->a_count * sizeof(*entry), GFP_KERNEL); + if (!header) + return ERR_PTR(-ENOMEM); + header->a_version = cpu_to_je32(JFFS2_ACL_VERSION); + e = header + 1; + for (i=0; i < acl->a_count; i++) { + entry = e; + entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag); + entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm); + switch(acl->a_entries[i].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_je32(acl->a_entries[i].e_id); + e += sizeof(struct jffs2_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(struct jffs2_acl_entry_short); + break; + + default: + goto fail; + } + } + return header; + fail: + kfree(header); + return ERR_PTR(-EINVAL); +} + +static struct posix_acl *jffs2_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *value = NULL; + int rc, xprefix; + + acl = 
get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + xprefix = JFFS2_XPREFIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xprefix = JFFS2_XPREFIX_ACL_DEFAULT; + break; + default: + BUG(); + } + rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0); + if (rc > 0) { + value = kmalloc(rc, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + rc = do_jffs2_getxattr(inode, xprefix, "", value, rc); + } + if (rc > 0) { + acl = jffs2_acl_from_medium(value, rc); + } else if (rc == -ENODATA || rc == -ENOSYS) { + acl = NULL; + } else { + acl = ERR_PTR(rc); + } + if (value) + kfree(value); + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + return acl; +} + +static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *acl) +{ + char *value = NULL; + size_t size = 0; + int rc; + + if (acl) { + value = jffs2_acl_to_medium(acl, &size); + if (IS_ERR(value)) + return PTR_ERR(value); + } + rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0); + if (!value && rc == -ENODATA) + rc = 0; + kfree(value); + + return rc; +} + +static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + int rc, xprefix; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + xprefix = JFFS2_XPREFIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + rc = posix_acl_equiv_mode(acl, &mode); + if (rc < 0) + return rc; + if (inode->i_mode != mode) { + struct iattr attr; + + attr.ia_valid = ATTR_MODE | ATTR_CTIME; + attr.ia_mode = mode; + attr.ia_ctime = CURRENT_TIME_SEC; + rc = jffs2_do_setattr(inode, &attr); + if (rc < 0) + return rc; + } + if (rc == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + xprefix = JFFS2_XPREFIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + default: + return -EINVAL; + } + rc = __jffs2_set_acl(inode, xprefix, acl); + if (!rc) + set_cached_acl(inode, type, acl); + return rc; +} + +int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + int rc; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + rc = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return rc; + } + return -EAGAIN; +} + +int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) +{ + struct posix_acl *acl, *clone; + int rc; + + cache_no_acl(inode); + + if (S_ISLNK(*i_mode)) + return 0; /* Symlink always has no-ACL */ + + acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (!acl) { + *i_mode &= ~current_umask(); + } else { + if (S_ISDIR(*i_mode)) + set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) + return -ENOMEM; + rc = posix_acl_create_masq(clone, (mode_t *)i_mode); + if (rc < 0) { + posix_acl_release(clone); + return rc; + } + if (rc > 0) + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); + + posix_acl_release(clone); + } + return 0; +} + +int jffs2_init_acl_post(struct inode *inode) +{ + int rc; + + if (inode->i_default_acl) { + rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, inode->i_default_acl); + if (rc) + return rc; + } + + if (inode->i_acl) { + rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, inode->i_acl); + if (rc) + return rc; + } + + return 0; +} + +int jffs2_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int rc; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + rc = posix_acl_chmod_masq(clone, inode->i_mode); + if (!rc) + rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone); + posix_acl_release(clone); + return rc; +} + +static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (list && retlen <= list_size) + strcpy(list, POSIX_ACL_XATTR_ACCESS); + return retlen; +} + +static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (list && retlen <= list_size) + strcpy(list, POSIX_ACL_XATTR_DEFAULT); + return retlen; +} + +static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct posix_acl *acl; + int rc; + + if (name[0] != '\0') + return -EINVAL; + + acl = jffs2_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) + return -ENODATA; + rc = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return rc; +} + +static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct posix_acl *acl; + int rc; + + if (name[0] != '\0') + return -EINVAL; + if (!inode_owner_or_capable(dentry->d_inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + rc = posix_acl_valid(acl); + 
if (rc) + goto out; + } + } else { + acl = NULL; + } + rc = jffs2_set_acl(dentry->d_inode, type, acl); + out: + posix_acl_release(acl); + return rc; +} + +const struct xattr_handler jffs2_acl_access_xattr_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_DEFAULT, + .list = jffs2_acl_access_listxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, +}; + +const struct xattr_handler jffs2_acl_default_xattr_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = jffs2_acl_default_listxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, +}; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h new file mode 100644 index 00000000..3119f592 --- /dev/null +++ b/fs/jffs2/acl.h @@ -0,0 +1,44 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +struct jffs2_acl_entry { + jint16_t e_tag; + jint16_t e_perm; + jint32_t e_id; +}; + +struct jffs2_acl_entry_short { + jint16_t e_tag; + jint16_t e_perm; +}; + +struct jffs2_acl_header { + jint32_t a_version; +}; + +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + +extern int jffs2_check_acl(struct inode *, int, unsigned int); +extern int jffs2_acl_chmod(struct inode *); +extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); +extern int jffs2_init_acl_post(struct inode *); + +extern const struct xattr_handler jffs2_acl_access_xattr_handler; +extern const struct xattr_handler jffs2_acl_default_xattr_handler; + +#else + +#define jffs2_check_acl (NULL) +#define jffs2_acl_chmod(inode) (0) +#define jffs2_init_acl_pre(dir_i,inode,mode) (0) +#define jffs2_init_acl_post(inode) (0) + +#endif /* CONFIG_JFFS2_FS_POSIX_ACL */ diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c new file mode 100644 index 00000000..404111b0 --- /dev/null +++ b/fs/jffs2/background.c @@ -0,0 +1,159 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + + +static int jffs2_garbage_collect_thread(void *); + +void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) +{ + assert_spin_locked(&c->erase_completion_lock); + if (c->gc_task && jffs2_thread_should_wake(c)) + send_sig(SIGHUP, c->gc_task, 1); +} + +/* This must only ever be called when no GC thread is currently running */ +int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) +{ + struct task_struct *tsk; + int ret = 0; + + BUG_ON(c->gc_task); + + init_completion(&c->gc_thread_start); + init_completion(&c->gc_thread_exit); + + tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index); + if (IS_ERR(tsk)) { + printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %ld\n", -PTR_ERR(tsk)); + complete(&c->gc_thread_exit); + ret = PTR_ERR(tsk); + } else { + /* Wait for it... 
*/ + D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", tsk->pid)); + wait_for_completion(&c->gc_thread_start); + ret = tsk->pid; + } + + return ret; +} + +void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c) +{ + int wait = 0; + spin_lock(&c->erase_completion_lock); + if (c->gc_task) { + D1(printk(KERN_DEBUG "jffs2: Killing GC task %d\n", c->gc_task->pid)); + send_sig(SIGKILL, c->gc_task, 1); + wait = 1; + } + spin_unlock(&c->erase_completion_lock); + if (wait) + wait_for_completion(&c->gc_thread_exit); +} + +static int jffs2_garbage_collect_thread(void *_c) +{ + struct jffs2_sb_info *c = _c; + + allow_signal(SIGKILL); + allow_signal(SIGSTOP); + allow_signal(SIGCONT); + + c->gc_task = current; + complete(&c->gc_thread_start); + + set_user_nice(current, 10); + + set_freezable(); + for (;;) { + allow_signal(SIGHUP); + again: + spin_lock(&c->erase_completion_lock); + if (!jffs2_thread_should_wake(c)) { + set_current_state (TASK_INTERRUPTIBLE); + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread sleeping...\n")); + schedule(); + } else + spin_unlock(&c->erase_completion_lock); + + + /* Problem - immediately after bootup, the GCD spends a lot + * of time in places like jffs2_kill_fragtree(); so much so + * that userspace processes (like gdm and X) are starved + * despite plenty of cond_resched()s and renicing. Yield() + * doesn't help, either (presumably because userspace and GCD + * are generally competing for a higher latency resource - + * disk). + * This forces the GCD to slow the hell down. Pulling an + * inode in with read_inode() is much preferable to having + * the GC thread get there first. */ + schedule_timeout_interruptible(msecs_to_jiffies(50)); + + if (kthread_should_stop()) { + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): kthread_stop() called.\n")); + goto die; + } + + /* Put_super will send a SIGKILL and then wait on the sem. + */ + while (signal_pending(current) || freezing(current)) { + siginfo_t info; + unsigned long signr; + + if (try_to_freeze()) + goto again; + + signr = dequeue_signal_lock(current, ¤t->blocked, &info); + + switch(signr) { + case SIGSTOP: + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGSTOP received.\n")); + set_current_state(TASK_STOPPED); + schedule(); + break; + + case SIGKILL: + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGKILL received.\n")); + goto die; + + case SIGHUP: + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGHUP received.\n")); + break; + default: + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): signal %ld received\n", signr)); + } + } + /* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */ + disallow_signal(SIGHUP); + + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): pass\n")); + if (jffs2_garbage_collect_pass(c) == -ENOSPC) { + printk(KERN_NOTICE "No space for garbage collection. Aborting GC thread\n"); + goto die; + } + } + die: + spin_lock(&c->erase_completion_lock); + c->gc_task = NULL; + spin_unlock(&c->erase_completion_lock); + complete_and_exit(&c->gc_thread_exit, 0); +} diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c new file mode 100644 index 00000000..3005ec45 --- /dev/null +++ b/fs/jffs2/build.c @@ -0,0 +1,392 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" + +static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, + struct jffs2_inode_cache *, struct jffs2_full_dirent **); + +static inline struct jffs2_inode_cache * +first_inode_chain(int *i, struct jffs2_sb_info *c) +{ + for (; *i < c->inocache_hashsize; (*i)++) { + if (c->inocache_list[*i]) + return c->inocache_list[*i]; + } + return NULL; +} + +static inline struct jffs2_inode_cache * +next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c) +{ + /* More in this chain? */ + if (ic->next) + return ic->next; + (*i)++; + return first_inode_chain(i, c); +} + +#define for_each_inode(i, c, ic) \ + for (i = 0, ic = first_inode_chain(&i, (c)); \ + ic; \ + ic = next_inode(&i, ic, (c))) + + +static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic) +{ + struct jffs2_full_dirent *fd; + + dbg_fsbuild("building directory inode #%u\n", ic->ino); + + /* For each child, increase nlink */ + for(fd = ic->scan_dents; fd; fd = fd->next) { + struct jffs2_inode_cache *child_ic; + if (!fd->ino) + continue; + + /* we can get high latency here with huge directories */ + + child_ic = jffs2_get_ino_cache(c, fd->ino); + if (!child_ic) { + dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n", + fd->name, fd->ino, ic->ino); + jffs2_mark_node_obsolete(c, fd->raw); + continue; + } + + if (fd->type == DT_DIR) { + if (child_ic->pino_nlink) { + JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n", + fd->name, fd->ino, ic->ino); + /* TODO: What do we do about it? */ + } else { + child_ic->pino_nlink = ic->ino; + } + } else + child_ic->pino_nlink++; + + dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino); + /* Can't free scan_dents so far. We might need them in pass 2 */ + } +} + +/* Scan plan: + - Scan physical nodes. Build map of inodes/dirents. Allocate inocaches as we go + - Scan directory tree from top down, setting nlink in inocaches + - Scan inocaches for inodes with nlink==0 +*/ +static int jffs2_build_filesystem(struct jffs2_sb_info *c) +{ + int ret; + int i; + struct jffs2_inode_cache *ic; + struct jffs2_full_dirent *fd; + struct jffs2_full_dirent *dead_fds = NULL; + + dbg_fsbuild("build FS data structures\n"); + + /* First, scan the medium and build all the inode caches with + lists of physical nodes */ + + c->flags |= JFFS2_SB_FLAG_SCANNING; + ret = jffs2_scan_medium(c); + c->flags &= ~JFFS2_SB_FLAG_SCANNING; + if (ret) + goto exit; + + dbg_fsbuild("scanned flash completely\n"); + jffs2_dbg_dump_block_lists_nolock(c); + + dbg_fsbuild("pass 1 starting\n"); + c->flags |= JFFS2_SB_FLAG_BUILDING; + /* Now scan the directory tree, increasing nlink according to every dirent found. */ + for_each_inode(i, c, ic) { + if (ic->scan_dents) { + jffs2_build_inode_pass1(c, ic); + cond_resched(); + } + } + + dbg_fsbuild("pass 1 complete\n"); + + /* Next, scan for inodes with nlink == 0 and remove them. If + they were directories, then decrement the nlink of their + children too, and repeat the scan. As that's going to be + a fairly uncommon occurrence, it's not so evil to do it this + way. Recursion bad. 
*/ + dbg_fsbuild("pass 2 starting\n"); + + for_each_inode(i, c, ic) { + if (ic->pino_nlink) + continue; + + jffs2_build_remove_unlinked_inode(c, ic, &dead_fds); + cond_resched(); + } + + dbg_fsbuild("pass 2a starting\n"); + + while (dead_fds) { + fd = dead_fds; + dead_fds = fd->next; + + ic = jffs2_get_ino_cache(c, fd->ino); + + if (ic) + jffs2_build_remove_unlinked_inode(c, ic, &dead_fds); + jffs2_free_full_dirent(fd); + } + + dbg_fsbuild("pass 2a complete\n"); + dbg_fsbuild("freeing temporary data structures\n"); + + /* Finally, we can scan again and free the dirent structs */ + for_each_inode(i, c, ic) { + while(ic->scan_dents) { + fd = ic->scan_dents; + ic->scan_dents = fd->next; + jffs2_free_full_dirent(fd); + } + ic->scan_dents = NULL; + cond_resched(); + } + jffs2_build_xattr_subsystem(c); + c->flags &= ~JFFS2_SB_FLAG_BUILDING; + + dbg_fsbuild("FS build complete\n"); + + /* Rotate the lists by some number to ensure wear levelling */ + jffs2_rotate_lists(c); + + ret = 0; + +exit: + if (ret) { + for_each_inode(i, c, ic) { + while(ic->scan_dents) { + fd = ic->scan_dents; + ic->scan_dents = fd->next; + jffs2_free_full_dirent(fd); + } + } + jffs2_clear_xattr_subsystem(c); + } + + return ret; +} + +static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic, + struct jffs2_full_dirent **dead_fds) +{ + struct jffs2_raw_node_ref *raw; + struct jffs2_full_dirent *fd; + + dbg_fsbuild("removing ino #%u with nlink == zero.\n", ic->ino); + + raw = ic->nodes; + while (raw != (void *)ic) { + struct jffs2_raw_node_ref *next = raw->next_in_ino; + dbg_fsbuild("obsoleting node at 0x%08x\n", ref_offset(raw)); + jffs2_mark_node_obsolete(c, raw); + raw = next; + } + + if (ic->scan_dents) { + int whinged = 0; + dbg_fsbuild("inode #%u was a directory which may have children...\n", ic->ino); + + while(ic->scan_dents) { + struct jffs2_inode_cache *child_ic; + + fd = ic->scan_dents; + ic->scan_dents = fd->next; + + if (!fd->ino) { + /* It's a deletion dirent. Ignore it */ + dbg_fsbuild("child \"%s\" is a deletion dirent, skipping...\n", fd->name); + jffs2_free_full_dirent(fd); + continue; + } + if (!whinged) + whinged = 1; + + dbg_fsbuild("removing child \"%s\", ino #%u\n", fd->name, fd->ino); + + child_ic = jffs2_get_ino_cache(c, fd->ino); + if (!child_ic) { + dbg_fsbuild("cannot remove child \"%s\", ino #%u, because it doesn't exist\n", + fd->name, fd->ino); + jffs2_free_full_dirent(fd); + continue; + } + + /* Reduce nlink of the child. If it's now zero, stick it on the + dead_fds list to be cleaned up later. Else just free the fd */ + + if (fd->type == DT_DIR) + child_ic->pino_nlink = 0; + else + child_ic->pino_nlink--; + + if (!child_ic->pino_nlink) { + dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n", + fd->ino, fd->name); + fd->next = *dead_fds; + *dead_fds = fd; + } else { + dbg_fsbuild("inode #%u (\"%s\") has now got nlink %d. Ignoring.\n", + fd->ino, fd->name, child_ic->pino_nlink); + jffs2_free_full_dirent(fd); + } + } + } + + /* + We don't delete the inocache from the hash list and free it yet. + The erase code will do that, when all the nodes are completely gone. + */ +} + +static void jffs2_calc_trigger_levels(struct jffs2_sb_info *c) +{ + uint32_t size; + + /* Deletion should almost _always_ be allowed. We're fairly + buggered once we stop allowing people to delete stuff + because there's not enough free space... */ + c->resv_blocks_deletion = 2; + + /* Be conservative about how much space we need before we allow writes. 
+ On top of that which is required for deletia, require an extra 2% + of the medium to be available, for overhead caused by nodes being + split across blocks, etc. */ + + size = c->flash_size / 50; /* 2% of flash size */ + size += c->nr_blocks * 100; /* And 100 bytes per eraseblock */ + size += c->sector_size - 1; /* ... and round up */ + + c->resv_blocks_write = c->resv_blocks_deletion + (size / c->sector_size); + + /* When do we let the GC thread run in the background */ + + c->resv_blocks_gctrigger = c->resv_blocks_write + 1; + + /* When do we allow garbage collection to merge nodes to make + long-term progress at the expense of short-term space exhaustion? */ + c->resv_blocks_gcmerge = c->resv_blocks_deletion + 1; + + /* When do we allow garbage collection to eat from bad blocks rather + than actually making progress? */ + c->resv_blocks_gcbad = 0;//c->resv_blocks_deletion + 2; + + /* What number of 'very dirty' eraseblocks do we allow before we + trigger the GC thread even if we don't _need_ the space. When we + can't mark nodes obsolete on the medium, the old dirty nodes cause + performance problems because we have to inspect and discard them. */ + c->vdirty_blocks_gctrigger = c->resv_blocks_gctrigger; + if (jffs2_can_mark_obsolete(c)) + c->vdirty_blocks_gctrigger *= 10; + + /* If there's less than this amount of dirty space, don't bother + trying to GC to make more space. It'll be a fruitless task */ + c->nospc_dirty_size = c->sector_size + (c->flash_size / 100); + + dbg_fsbuild("JFFS2 trigger levels (size %d KiB, block size %d KiB, %d blocks)\n", + c->flash_size / 1024, c->sector_size / 1024, c->nr_blocks); + dbg_fsbuild("Blocks required to allow deletion: %d (%d KiB)\n", + c->resv_blocks_deletion, c->resv_blocks_deletion*c->sector_size/1024); + dbg_fsbuild("Blocks required to allow writes: %d (%d KiB)\n", + c->resv_blocks_write, c->resv_blocks_write*c->sector_size/1024); + dbg_fsbuild("Blocks required to quiesce GC thread: %d (%d KiB)\n", + c->resv_blocks_gctrigger, c->resv_blocks_gctrigger*c->sector_size/1024); + dbg_fsbuild("Blocks required to allow GC merges: %d (%d KiB)\n", + c->resv_blocks_gcmerge, c->resv_blocks_gcmerge*c->sector_size/1024); + dbg_fsbuild("Blocks required to GC bad blocks: %d (%d KiB)\n", + c->resv_blocks_gcbad, c->resv_blocks_gcbad*c->sector_size/1024); + dbg_fsbuild("Amount of dirty space required to GC: %d bytes\n", + c->nospc_dirty_size); + dbg_fsbuild("Very dirty blocks before GC triggered: %d\n", + c->vdirty_blocks_gctrigger); +} + +int jffs2_do_mount_fs(struct jffs2_sb_info *c) +{ + int ret; + int i; + int size; + + c->free_size = c->flash_size; + c->nr_blocks = c->flash_size / c->sector_size; + size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; +#ifndef __ECOS + if (jffs2_blocks_use_vmalloc(c)) + c->blocks = vzalloc(size); + else +#endif + c->blocks = kzalloc(size, GFP_KERNEL); + if (!c->blocks) + return -ENOMEM; + + for (i=0; inr_blocks; i++) { + INIT_LIST_HEAD(&c->blocks[i].list); + c->blocks[i].offset = i * c->sector_size; + c->blocks[i].free_size = c->sector_size; + } + + INIT_LIST_HEAD(&c->clean_list); + INIT_LIST_HEAD(&c->very_dirty_list); + INIT_LIST_HEAD(&c->dirty_list); + INIT_LIST_HEAD(&c->erasable_list); + INIT_LIST_HEAD(&c->erasing_list); + INIT_LIST_HEAD(&c->erase_checking_list); + INIT_LIST_HEAD(&c->erase_pending_list); + INIT_LIST_HEAD(&c->erasable_pending_wbuf_list); + INIT_LIST_HEAD(&c->erase_complete_list); + INIT_LIST_HEAD(&c->free_list); + INIT_LIST_HEAD(&c->bad_list); + INIT_LIST_HEAD(&c->bad_used_list); + c->highest_ino = 1; 
+ c->summary = NULL; + + ret = jffs2_sum_init(c); + if (ret) + goto out_free; + + if (jffs2_build_filesystem(c)) { + dbg_fsbuild("build_fs failed\n"); + jffs2_free_ino_caches(c); + jffs2_free_raw_node_refs(c); + ret = -EIO; + goto out_free; + } + + jffs2_calc_trigger_levels(c); + + return 0; + + out_free: +#ifndef __ECOS + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else +#endif + kfree(c->blocks); + + return ret; +} diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c new file mode 100644 index 00000000..de424702 --- /dev/null +++ b/fs/jffs2/compr.c @@ -0,0 +1,360 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * Copyright © 2004 Ferenc Havasi , + * University of Szeged, Hungary + * + * Created by Arjan van de Ven + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include "compr.h" + +static DEFINE_SPINLOCK(jffs2_compressor_list_lock); + +/* Available compressors are on this list */ +static LIST_HEAD(jffs2_compressor_list); + +/* Actual compression mode */ +static int jffs2_compression_mode = JFFS2_COMPR_MODE_PRIORITY; + +/* Statistics for blocks stored without compression */ +static uint32_t none_stat_compr_blocks=0,none_stat_decompr_blocks=0,none_stat_compr_size=0; + + +/* + * Return 1 to use this compression + */ +static int jffs2_is_best_compression(struct jffs2_compressor *this, + struct jffs2_compressor *best, uint32_t size, uint32_t bestsize) +{ + switch (jffs2_compression_mode) { + case JFFS2_COMPR_MODE_SIZE: + if (bestsize > size) + return 1; + return 0; + case JFFS2_COMPR_MODE_FAVOURLZO: + if ((this->compr == JFFS2_COMPR_LZO) && (bestsize > size)) + return 1; + if ((best->compr != JFFS2_COMPR_LZO) && (bestsize > size)) + return 1; + if ((this->compr == JFFS2_COMPR_LZO) && (bestsize > (size * FAVOUR_LZO_PERCENT / 100))) + return 1; + if ((bestsize * FAVOUR_LZO_PERCENT / 100) > size) + return 1; + + return 0; + } + /* Shouldn't happen */ + return 0; +} + +/* jffs2_compress: + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data + * @datalen: On entry, holds the amount of data available for compression. + * On exit, expected to hold the amount of data actually compressed. + * @cdatalen: On entry, holds the amount of space available for compressed + * data. On exit, expected to hold the actual size of the compressed + * data. + * + * Returns: Lower byte to be stored with data indicating compression type used. + * Zero is used to show that the data could not be compressed - the + * compressed version was actually larger than the original. + * Upper byte will be used later. (soon) + * + * If the cdata buffer isn't large enough to hold all the uncompressed data, + * jffs2_compress should compress as much as will fit, and should set + * *datalen accordingly to show the amount of data which were compressed. 
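To make the in/out-parameter contract described above concrete, here is a minimal user-space sketch of a compressor that honours it: consume as much input as fits in the output budget, then report how much input was consumed (*sourcelen) and how much output was produced (*dstlen), returning 0 on success and -1 when the result would not be smaller. toy_rle_compress and its run-length scheme are purely illustrative and are not one of the real JFFS2 compressors; only the calling convention matches the ->compress() hooks below.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int toy_rle_compress(unsigned char *data_in, unsigned char *cpage_out,
			    uint32_t *sourcelen, uint32_t *dstlen)
{
	uint32_t in = 0, out = 0;

	while (in < *sourcelen && out + 2 <= *dstlen) {
		unsigned char value = data_in[in];
		unsigned char run = 0;

		/* Emit (value, run) pairs, stopping when output is full. */
		while (in < *sourcelen && data_in[in] == value && run < 255) {
			in++;
			run++;
		}
		cpage_out[out++] = value;
		cpage_out[out++] = run;
	}

	if (out >= in)			/* expanded rather than compressed */
		return -1;

	*sourcelen = in;		/* input actually consumed */
	*dstlen = out;			/* size of the compressed output */
	return 0;
}

int main(void)
{
	unsigned char in[64], out[64];
	uint32_t slen = sizeof(in), dlen = sizeof(out);

	memset(in, 0xAA, sizeof(in));
	if (toy_rle_compress(in, out, &slen, &dlen) == 0)
		printf("compressed %u bytes into %u\n", slen, dlen);
	return 0;
}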
+ */ +uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *data_in, unsigned char **cpage_out, + uint32_t *datalen, uint32_t *cdatalen) +{ + int ret = JFFS2_COMPR_NONE; + int compr_ret; + struct jffs2_compressor *this, *best=NULL; + unsigned char *output_buf = NULL, *tmp_buf; + uint32_t orig_slen, orig_dlen; + uint32_t best_slen=0, best_dlen=0; + + switch (jffs2_compression_mode) { + case JFFS2_COMPR_MODE_NONE: + break; + case JFFS2_COMPR_MODE_PRIORITY: + output_buf = kmalloc(*cdatalen,GFP_KERNEL); + if (!output_buf) { + printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); + goto out; + } + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only backwards-compatibility and disabled modules */ + if ((!this->compress)||(this->disabled)) + continue; + + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + *datalen = orig_slen; + *cdatalen = orig_dlen; + compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!compr_ret) { + ret = this->compr; + this->stat_compr_blocks++; + this->stat_compr_orig_size += *datalen; + this->stat_compr_new_size += *cdatalen; + break; + } + } + spin_unlock(&jffs2_compressor_list_lock); + if (ret == JFFS2_COMPR_NONE) + kfree(output_buf); + break; + case JFFS2_COMPR_MODE_SIZE: + case JFFS2_COMPR_MODE_FAVOURLZO: + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only backwards-compatibility and disabled modules */ + if ((!this->compress)||(this->disabled)) + continue; + /* Allocating memory for output buffer if necessary */ + if ((this->compr_buf_size < orig_slen) && (this->compr_buf)) { + spin_unlock(&jffs2_compressor_list_lock); + kfree(this->compr_buf); + spin_lock(&jffs2_compressor_list_lock); + this->compr_buf_size=0; + this->compr_buf=NULL; + } + if (!this->compr_buf) { + spin_unlock(&jffs2_compressor_list_lock); + tmp_buf = kmalloc(orig_slen, GFP_KERNEL); + spin_lock(&jffs2_compressor_list_lock); + if (!tmp_buf) { + printk(KERN_WARNING "JFFS2: No memory for compressor allocation. 
(%d bytes)\n", orig_slen); + continue; + } + else { + this->compr_buf = tmp_buf; + this->compr_buf_size = orig_slen; + } + } + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + *datalen = orig_slen; + *cdatalen = orig_dlen; + compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen); + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!compr_ret) { + if (((!best_dlen) || jffs2_is_best_compression(this, best, *cdatalen, best_dlen)) + && (*cdatalen < *datalen)) { + best_dlen = *cdatalen; + best_slen = *datalen; + best = this; + } + } + } + if (best_dlen) { + *cdatalen = best_dlen; + *datalen = best_slen; + output_buf = best->compr_buf; + best->compr_buf = NULL; + best->compr_buf_size = 0; + best->stat_compr_blocks++; + best->stat_compr_orig_size += best_slen; + best->stat_compr_new_size += best_dlen; + ret = best->compr; + } + spin_unlock(&jffs2_compressor_list_lock); + break; + default: + printk(KERN_ERR "JFFS2: unknown compression mode.\n"); + } + out: + if (ret == JFFS2_COMPR_NONE) { + *cpage_out = data_in; + *datalen = *cdatalen; + none_stat_compr_blocks++; + none_stat_compr_size += *datalen; + } + else { + *cpage_out = output_buf; + } + return ret; +} + +int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint16_t comprtype, unsigned char *cdata_in, + unsigned char *data_out, uint32_t cdatalen, uint32_t datalen) +{ + struct jffs2_compressor *this; + int ret; + + /* Older code had a bug where it would write non-zero 'usercompr' + fields. Deal with it. */ + if ((comprtype & 0xff) <= JFFS2_COMPR_ZLIB) + comprtype &= 0xff; + + switch (comprtype & 0xff) { + case JFFS2_COMPR_NONE: + /* This should be special-cased elsewhere, but we might as well deal with it */ + memcpy(data_out, cdata_in, datalen); + none_stat_decompr_blocks++; + break; + case JFFS2_COMPR_ZERO: + memset(data_out, 0, datalen); + break; + default: + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + if (comprtype == this->compr) { + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + ret = this->decompress(cdata_in, data_out, cdatalen, datalen); + spin_lock(&jffs2_compressor_list_lock); + if (ret) { + printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret); + } + else { + this->stat_decompr_blocks++; + } + this->usecount--; + spin_unlock(&jffs2_compressor_list_lock); + return ret; + } + } + printk(KERN_WARNING "JFFS2 compression type 0x%02x not available.\n", comprtype); + spin_unlock(&jffs2_compressor_list_lock); + return -EIO; + } + return 0; +} + +int jffs2_register_compressor(struct jffs2_compressor *comp) +{ + struct jffs2_compressor *this; + + if (!comp->name) { + printk(KERN_WARNING "NULL compressor name at registering JFFS2 compressor. 
Failed.\n"); + return -1; + } + comp->compr_buf_size=0; + comp->compr_buf=NULL; + comp->usecount=0; + comp->stat_compr_orig_size=0; + comp->stat_compr_new_size=0; + comp->stat_compr_blocks=0; + comp->stat_decompr_blocks=0; + D1(printk(KERN_DEBUG "Registering JFFS2 compressor \"%s\"\n", comp->name)); + + spin_lock(&jffs2_compressor_list_lock); + + list_for_each_entry(this, &jffs2_compressor_list, list) { + if (this->priority < comp->priority) { + list_add(&comp->list, this->list.prev); + goto out; + } + } + list_add_tail(&comp->list, &jffs2_compressor_list); +out: + D2(list_for_each_entry(this, &jffs2_compressor_list, list) { + printk(KERN_DEBUG "Compressor \"%s\", prio %d\n", this->name, this->priority); + }) + + spin_unlock(&jffs2_compressor_list_lock); + + return 0; +} + +int jffs2_unregister_compressor(struct jffs2_compressor *comp) +{ + D2(struct jffs2_compressor *this;) + + D1(printk(KERN_DEBUG "Unregistering JFFS2 compressor \"%s\"\n", comp->name)); + + spin_lock(&jffs2_compressor_list_lock); + + if (comp->usecount) { + spin_unlock(&jffs2_compressor_list_lock); + printk(KERN_WARNING "JFFS2: Compressor modul is in use. Unregister failed.\n"); + return -1; + } + list_del(&comp->list); + + D2(list_for_each_entry(this, &jffs2_compressor_list, list) { + printk(KERN_DEBUG "Compressor \"%s\", prio %d\n", this->name, this->priority); + }) + spin_unlock(&jffs2_compressor_list_lock); + return 0; +} + +void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig) +{ + if (orig != comprbuf) + kfree(comprbuf); +} + +int __init jffs2_compressors_init(void) +{ +/* Registering compressors */ +#ifdef CONFIG_JFFS2_ZLIB + jffs2_zlib_init(); +#endif +#ifdef CONFIG_JFFS2_RTIME + jffs2_rtime_init(); +#endif +#ifdef CONFIG_JFFS2_RUBIN + jffs2_rubinmips_init(); + jffs2_dynrubin_init(); +#endif +#ifdef CONFIG_JFFS2_LZO + jffs2_lzo_init(); +#endif +/* Setting default compression mode */ +#ifdef CONFIG_JFFS2_CMODE_NONE + jffs2_compression_mode = JFFS2_COMPR_MODE_NONE; + D1(printk(KERN_INFO "JFFS2: default compression mode: none\n");) +#else +#ifdef CONFIG_JFFS2_CMODE_SIZE + jffs2_compression_mode = JFFS2_COMPR_MODE_SIZE; + D1(printk(KERN_INFO "JFFS2: default compression mode: size\n");) +#else +#ifdef CONFIG_JFFS2_CMODE_FAVOURLZO + jffs2_compression_mode = JFFS2_COMPR_MODE_FAVOURLZO; + D1(printk(KERN_INFO "JFFS2: default compression mode: favourlzo\n");) +#else + D1(printk(KERN_INFO "JFFS2: default compression mode: priority\n");) +#endif +#endif +#endif + return 0; +} + +int jffs2_compressors_exit(void) +{ +/* Unregistering compressors */ +#ifdef CONFIG_JFFS2_LZO + jffs2_lzo_exit(); +#endif +#ifdef CONFIG_JFFS2_RUBIN + jffs2_dynrubin_exit(); + jffs2_rubinmips_exit(); +#endif +#ifdef CONFIG_JFFS2_RTIME + jffs2_rtime_exit(); +#endif +#ifdef CONFIG_JFFS2_ZLIB + jffs2_zlib_exit(); +#endif + return 0; +} diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h new file mode 100644 index 00000000..13bb7597 --- /dev/null +++ b/fs/jffs2/compr.h @@ -0,0 +1,103 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2004 Ferenc Havasi , + * University of Szeged, Hungary + * Copyright © 2004-2010 David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#ifndef __JFFS2_COMPR_H__ +#define __JFFS2_COMPR_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "jffs2_fs_i.h" +#include "jffs2_fs_sb.h" +#include "nodelist.h" + +#define JFFS2_RUBINMIPS_PRIORITY 10 +#define JFFS2_DYNRUBIN_PRIORITY 20 +#define JFFS2_LZARI_PRIORITY 30 +#define JFFS2_RTIME_PRIORITY 50 +#define JFFS2_ZLIB_PRIORITY 60 +#define JFFS2_LZO_PRIORITY 80 + + +#define JFFS2_RUBINMIPS_DISABLED /* RUBINs will be used only */ +#define JFFS2_DYNRUBIN_DISABLED /* for decompression */ + +#define JFFS2_COMPR_MODE_NONE 0 +#define JFFS2_COMPR_MODE_PRIORITY 1 +#define JFFS2_COMPR_MODE_SIZE 2 +#define JFFS2_COMPR_MODE_FAVOURLZO 3 + +#define FAVOUR_LZO_PERCENT 80 + +struct jffs2_compressor { + struct list_head list; + int priority; /* used by prirority comr. mode */ + char *name; + char compr; /* JFFS2_COMPR_XXX */ + int (*compress)(unsigned char *data_in, unsigned char *cpage_out, + uint32_t *srclen, uint32_t *destlen); + int (*decompress)(unsigned char *cdata_in, unsigned char *data_out, + uint32_t cdatalen, uint32_t datalen); + int usecount; + int disabled; /* if set the compressor won't compress */ + unsigned char *compr_buf; /* used by size compr. mode */ + uint32_t compr_buf_size; /* used by size compr. mode */ + uint32_t stat_compr_orig_size; + uint32_t stat_compr_new_size; + uint32_t stat_compr_blocks; + uint32_t stat_decompr_blocks; +}; + +int jffs2_register_compressor(struct jffs2_compressor *comp); +int jffs2_unregister_compressor(struct jffs2_compressor *comp); + +int jffs2_compressors_init(void); +int jffs2_compressors_exit(void); + +uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *data_in, unsigned char **cpage_out, + uint32_t *datalen, uint32_t *cdatalen); + +int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint16_t comprtype, unsigned char *cdata_in, + unsigned char *data_out, uint32_t cdatalen, uint32_t datalen); + +void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig); + +/* Compressor modules */ +/* These functions will be called by jffs2_compressors_init/exit */ + +#ifdef CONFIG_JFFS2_RUBIN +int jffs2_rubinmips_init(void); +void jffs2_rubinmips_exit(void); +int jffs2_dynrubin_init(void); +void jffs2_dynrubin_exit(void); +#endif +#ifdef CONFIG_JFFS2_RTIME +int jffs2_rtime_init(void); +void jffs2_rtime_exit(void); +#endif +#ifdef CONFIG_JFFS2_ZLIB +int jffs2_zlib_init(void); +void jffs2_zlib_exit(void); +#endif +#ifdef CONFIG_JFFS2_LZO +int jffs2_lzo_init(void); +void jffs2_lzo_exit(void); +#endif + +#endif /* __JFFS2_COMPR_H__ */ diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c new file mode 100644 index 00000000..af186ee6 --- /dev/null +++ b/fs/jffs2/compr_lzo.c @@ -0,0 +1,111 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2007 Nokia Corporation. All rights reserved. + * Copyright © 2004-2010 David Woodhouse + * + * Created by Richard Purdie + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include +#include "compr.h" + +static void *lzo_mem; +static void *lzo_compress_buf; +static DEFINE_MUTEX(deflate_mutex); /* for lzo_mem and lzo_compress_buf */ + +static void free_workspace(void) +{ + vfree(lzo_mem); + vfree(lzo_compress_buf); +} + +static int __init alloc_workspace(void) +{ + lzo_mem = vmalloc(LZO1X_MEM_COMPRESS); + lzo_compress_buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); + + if (!lzo_mem || !lzo_compress_buf) { + printk(KERN_WARNING "Failed to allocate lzo deflate workspace\n"); + free_workspace(); + return -ENOMEM; + } + + return 0; +} + +static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + size_t compress_size; + int ret; + + mutex_lock(&deflate_mutex); + ret = lzo1x_1_compress(data_in, *sourcelen, lzo_compress_buf, &compress_size, lzo_mem); + if (ret != LZO_E_OK) + goto fail; + + if (compress_size > *dstlen) + goto fail; + + memcpy(cpage_out, lzo_compress_buf, compress_size); + mutex_unlock(&deflate_mutex); + + *dstlen = compress_size; + return 0; + + fail: + mutex_unlock(&deflate_mutex); + return -1; +} + +static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out, + uint32_t srclen, uint32_t destlen) +{ + size_t dl = destlen; + int ret; + + ret = lzo1x_decompress_safe(data_in, srclen, cpage_out, &dl); + + if (ret != LZO_E_OK || dl != destlen) + return -1; + + return 0; +} + +static struct jffs2_compressor jffs2_lzo_comp = { + .priority = JFFS2_LZO_PRIORITY, + .name = "lzo", + .compr = JFFS2_COMPR_LZO, + .compress = &jffs2_lzo_compress, + .decompress = &jffs2_lzo_decompress, + .disabled = 0, +}; + +int __init jffs2_lzo_init(void) +{ + int ret; + + ret = alloc_workspace(); + if (ret < 0) + return ret; + + ret = jffs2_register_compressor(&jffs2_lzo_comp); + if (ret) + free_workspace(); + + return ret; +} + +void jffs2_lzo_exit(void) +{ + jffs2_unregister_compressor(&jffs2_lzo_comp); + free_workspace(); +} diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c new file mode 100644 index 00000000..16a50479 --- /dev/null +++ b/fs/jffs2/compr_rtime.c @@ -0,0 +1,130 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by Arjan van de Ven + * + * For licensing information, see the file 'LICENCE' in this directory. + * + * + * + * Very simple lz77-ish encoder. + * + * Theory of operation: Both encoder and decoder have a list of "last + * occurrences" for every possible source-value; after sending the + * first source-byte, the second byte indicated the "run" length of + * matches + * + * The algorithm is intended to only send "whole bytes", no bit-messing. 
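As a worked illustration of the "last occurrence" scheme described above, the sketch below is a condensed user-space restatement of the encoder (the real in-kernel version follows): each literal byte is emitted, then one byte giving how many following bytes repeat the sequence found at that value's previous occurrence. For the sample string "aaaaabbbaaaaa" it emits the pairs ('a', 4) ('b', 0) ('b', 1) ('a', 4), i.e. 8 output bytes for 13 input bytes.

#include <stdio.h>
#include <string.h>

static int toy_rtime_encode(const unsigned char *in, int len,
			    unsigned char *out)
{
	int positions[256] = { 0 };	/* last occurrence of each byte value */
	int pos = 0, outpos = 0;

	while (pos < len) {
		unsigned char value = in[pos];
		int backpos = positions[value];
		int runlen = 0;

		out[outpos++] = in[pos++];	/* literal byte */
		positions[value] = pos;

		/* Count bytes matching the data after the last occurrence. */
		while (backpos < pos && pos < len &&
		       in[pos] == in[backpos] && runlen < 255) {
			pos++;
			backpos++;
			runlen++;
		}
		out[outpos++] = (unsigned char)runlen;
	}
	return outpos;
}

int main(void)
{
	const unsigned char msg[] = "aaaaabbbaaaaa";
	unsigned char out[64];
	int i, n = toy_rtime_encode(msg, (int)strlen((const char *)msg), out);

	for (i = 0; i < n; i += 2)
		printf("('%c', %d) ", out[i], out[i + 1]);
	printf("\n");	/* ('a', 4) ('b', 0) ('b', 1) ('a', 4) */
	return 0;
}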
+ * + */ + +#include +#include +#include +#include +#include +#include "compr.h" + +/* _compress returns the compressed size, -1 if bigger */ +static int jffs2_rtime_compress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + short positions[256]; + int outpos = 0; + int pos=0; + + memset(positions,0,sizeof(positions)); + + while (pos < (*sourcelen) && outpos <= (*dstlen)-2) { + int backpos, runlen=0; + unsigned char value; + + value = data_in[pos]; + + cpage_out[outpos++] = data_in[pos++]; + + backpos = positions[value]; + positions[value]=pos; + + while ((backpos < pos) && (pos < (*sourcelen)) && + (data_in[pos]==data_in[backpos++]) && (runlen<255)) { + pos++; + runlen++; + } + cpage_out[outpos++] = runlen; + } + + if (outpos >= pos) { + /* We failed */ + return -1; + } + + /* Tell the caller how much we managed to compress, and how much space it took */ + *sourcelen = pos; + *dstlen = outpos; + return 0; +} + + +static int jffs2_rtime_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t srclen, uint32_t destlen) +{ + short positions[256]; + int outpos = 0; + int pos=0; + + memset(positions,0,sizeof(positions)); + + while (outpos= outpos) { + while(repeat) { + cpage_out[outpos++] = cpage_out[backoffs++]; + repeat--; + } + } else { + memcpy(&cpage_out[outpos],&cpage_out[backoffs],repeat); + outpos+=repeat; + } + } + } + return 0; +} + +static struct jffs2_compressor jffs2_rtime_comp = { + .priority = JFFS2_RTIME_PRIORITY, + .name = "rtime", + .compr = JFFS2_COMPR_RTIME, + .compress = &jffs2_rtime_compress, + .decompress = &jffs2_rtime_decompress, +#ifdef JFFS2_RTIME_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int jffs2_rtime_init(void) +{ + return jffs2_register_compressor(&jffs2_rtime_comp); +} + +void jffs2_rtime_exit(void) +{ + jffs2_unregister_compressor(&jffs2_rtime_comp); +} diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c new file mode 100644 index 00000000..9e7cec80 --- /dev/null +++ b/fs/jffs2/compr_rubin.c @@ -0,0 +1,455 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by Arjan van de Ven + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include "compr.h" + + +#define RUBIN_REG_SIZE 16 +#define UPPER_BIT_RUBIN (((long) 1)<<(RUBIN_REG_SIZE-1)) +#define LOWER_BITS_RUBIN ((((long) 1)<<(RUBIN_REG_SIZE-1))-1) + + +#define BIT_DIVIDER_MIPS 1043 +static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241}; + +struct pushpull { + unsigned char *buf; + unsigned int buflen; + unsigned int ofs; + unsigned int reserve; +}; + +struct rubin_state { + unsigned long p; + unsigned long q; + unsigned long rec_q; + long bit_number; + struct pushpull pp; + int bit_divider; + int bits[8]; +}; + +static inline void init_pushpull(struct pushpull *pp, char *buf, + unsigned buflen, unsigned ofs, + unsigned reserve) +{ + pp->buf = buf; + pp->buflen = buflen; + pp->ofs = ofs; + pp->reserve = reserve; +} + +static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) +{ + if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) + return -ENOSPC; + + if (bit) + pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7))); + else + pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7))); + + pp->ofs++; + + return 0; +} + +static inline int pushedbits(struct pushpull *pp) +{ + return pp->ofs; +} + +static inline int pullbit(struct pushpull *pp) +{ + int bit; + + bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1; + + pp->ofs++; + return bit; +} + +static inline int pulledbits(struct pushpull *pp) +{ + return pp->ofs; +} + + +static void init_rubin(struct rubin_state *rs, int div, int *bits) +{ + int c; + + rs->q = 0; + rs->p = (long) (2 * UPPER_BIT_RUBIN); + rs->bit_number = (long) 0; + rs->bit_divider = div; + + for (c=0; c<8; c++) + rs->bits[c] = bits[c]; +} + + +static int encode(struct rubin_state *rs, long A, long B, int symbol) +{ + + long i0, i1; + int ret; + + while ((rs->q >= UPPER_BIT_RUBIN) || + ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { + rs->bit_number++; + + ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); + if (ret) + return ret; + rs->q &= LOWER_BITS_RUBIN; + rs->q <<= 1; + rs->p <<= 1; + } + i0 = A * rs->p / (A + B); + if (i0 <= 0) + i0 = 1; + + if (i0 >= rs->p) + i0 = rs->p - 1; + + i1 = rs->p - i0; + + if (symbol == 0) + rs->p = i0; + else { + rs->p = i1; + rs->q += i0; + } + return 0; +} + + +static void end_rubin(struct rubin_state *rs) +{ + + int i; + + for (i = 0; i < RUBIN_REG_SIZE; i++) { + pushbit(&rs->pp, (UPPER_BIT_RUBIN & rs->q) ? 1 : 0, 1); + rs->q &= LOWER_BITS_RUBIN; + rs->q <<= 1; + } +} + + +static void init_decode(struct rubin_state *rs, int div, int *bits) +{ + init_rubin(rs, div, bits); + + /* behalve lower */ + rs->rec_q = 0; + + for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; + rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) + ; +} + +static void __do_decode(struct rubin_state *rs, unsigned long p, + unsigned long q) +{ + register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; + unsigned long rec_q; + int c, bits = 0; + + /* + * First, work out how many bits we need from the input stream. + * Note that we have already done the initial check on this + * loop prior to calling this function. + */ + do { + bits++; + q &= lower_bits_rubin; + q <<= 1; + p <<= 1; + } while ((q >= UPPER_BIT_RUBIN) || ((p + q) <= UPPER_BIT_RUBIN)); + + rs->p = p; + rs->q = q; + + rs->bit_number += bits; + + /* + * Now get the bits. We really want this to be "get n bits". 
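The comment above wishes for a "get n bits" primitive rather than pulling one bit per call. Purely as a sketch of what such a helper could look like (pullbits is a hypothetical name, not part of this driver), the snippet below reads n bits at once using the same MSB-first bit addressing as pullbit() and accumulates them the same way the init_decode() loop does.

#include <stdio.h>

struct pushpull {
	unsigned char *buf;
	unsigned int buflen;	/* length in bits */
	unsigned int ofs;	/* current bit offset into buf */
	unsigned int reserve;
};

/* Pull 'n' bits, most significant first, exactly as n calls to pullbit()
 * accumulated with q = q * 2 + bit would. */
static unsigned long pullbits(struct pushpull *pp, int n)
{
	unsigned long val = 0;

	while (n--) {
		int bit = (pp->buf[pp->ofs >> 3] >> (7 - (pp->ofs & 7))) & 1;

		val = (val << 1) | bit;
		pp->ofs++;
	}
	return val;
}

int main(void)
{
	unsigned char data[] = { 0xA5, 0x0F };	/* bits: 1010 0101 0000 1111 */
	struct pushpull pp = { data, sizeof(data) * 8, 0, 0 };
	unsigned long hi = pullbits(&pp, 4);	/* 0xA   */
	unsigned long lo = pullbits(&pp, 12);	/* 0x50F */

	printf("%lx %lx\n", hi, lo);
	return 0;
}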
+ */ + rec_q = rs->rec_q; + do { + c = pullbit(&rs->pp); + rec_q &= lower_bits_rubin; + rec_q <<= 1; + rec_q += c; + } while (--bits); + rs->rec_q = rec_q; +} + +static int decode(struct rubin_state *rs, long A, long B) +{ + unsigned long p = rs->p, q = rs->q; + long i0, threshold; + int symbol; + + if (q >= UPPER_BIT_RUBIN || ((p + q) <= UPPER_BIT_RUBIN)) + __do_decode(rs, p, q); + + i0 = A * rs->p / (A + B); + if (i0 <= 0) + i0 = 1; + + if (i0 >= rs->p) + i0 = rs->p - 1; + + threshold = rs->q + i0; + symbol = rs->rec_q >= threshold; + if (rs->rec_q >= threshold) { + rs->q += i0; + i0 = rs->p - i0; + } + + rs->p = i0; + + return symbol; +} + + + +static int out_byte(struct rubin_state *rs, unsigned char byte) +{ + int i, ret; + struct rubin_state rs_copy; + rs_copy = *rs; + + for (i=0; i<8; i++) { + ret = encode(rs, rs->bit_divider-rs->bits[i], + rs->bits[i], byte & 1); + if (ret) { + /* Failed. Restore old state */ + *rs = rs_copy; + return ret; + } + byte >>= 1 ; + } + return 0; +} + +static int in_byte(struct rubin_state *rs) +{ + int i, result = 0, bit_divider = rs->bit_divider; + + for (i = 0; i < 8; i++) + result |= decode(rs, bit_divider - rs->bits[i], + rs->bits[i]) << i; + + return result; +} + + + +static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, + unsigned char *cpage_out, uint32_t *sourcelen, + uint32_t *dstlen) + { + int outpos = 0; + int pos=0; + struct rubin_state rs; + + init_pushpull(&rs.pp, cpage_out, *dstlen * 8, 0, 32); + + init_rubin(&rs, bit_divider, bits); + + while (pos < (*sourcelen) && !out_byte(&rs, data_in[pos])) + pos++; + + end_rubin(&rs); + + if (outpos > pos) { + /* We failed */ + return -1; + } + + /* Tell the caller how much we managed to compress, + * and how much space it took */ + + outpos = (pushedbits(&rs.pp)+7)/8; + + if (outpos >= pos) + return -1; /* We didn't actually compress */ + *sourcelen = pos; + *dstlen = outpos; + return 0; +} +#if 0 +/* _compress returns the compressed size, -1 if bigger */ +int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); +} +#endif +static int jffs2_dynrubin_compress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + int bits[8]; + unsigned char histo[256]; + int i; + int ret; + uint32_t mysrclen, mydstlen; + + mysrclen = *sourcelen; + mydstlen = *dstlen - 8; + + if (*dstlen <= 12) + return -1; + + memset(histo, 0, 256); + for (i=0; i 255) bits[i] = 255; + cpage_out[i] = bits[i]; + } + + ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, + &mydstlen); + if (ret) + return ret; + + /* Add back the 8 bytes we took for the probabilities */ + mydstlen += 8; + + if (mysrclen <= mydstlen) { + /* We compressed */ + return -1; + } + + *sourcelen = mysrclen; + *dstlen = mydstlen; + return 0; +} + +static void rubin_do_decompress(int bit_divider, int *bits, + unsigned char *cdata_in, + unsigned char *page_out, uint32_t srclen, + uint32_t destlen) +{ + int outpos = 0; + struct rubin_state rs; + + init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); + init_decode(&rs, bit_divider, bits); + + while (outpos < destlen) + page_out[outpos++] = in_byte(&rs); +} + + +static int jffs2_rubinmips_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t sourcelen, uint32_t dstlen) +{ + rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, 
dstlen); + return 0; +} + +static int jffs2_dynrubin_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t sourcelen, uint32_t dstlen) +{ + int bits[8]; + int c; + + for (c=0; c<8; c++) + bits[c] = data_in[c]; + + rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, + dstlen); + return 0; +} + +static struct jffs2_compressor jffs2_rubinmips_comp = { + .priority = JFFS2_RUBINMIPS_PRIORITY, + .name = "rubinmips", + .compr = JFFS2_COMPR_DYNRUBIN, + .compress = NULL, /*&jffs2_rubinmips_compress,*/ + .decompress = &jffs2_rubinmips_decompress, +#ifdef JFFS2_RUBINMIPS_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int jffs2_rubinmips_init(void) +{ + return jffs2_register_compressor(&jffs2_rubinmips_comp); +} + +void jffs2_rubinmips_exit(void) +{ + jffs2_unregister_compressor(&jffs2_rubinmips_comp); +} + +static struct jffs2_compressor jffs2_dynrubin_comp = { + .priority = JFFS2_DYNRUBIN_PRIORITY, + .name = "dynrubin", + .compr = JFFS2_COMPR_RUBINMIPS, + .compress = jffs2_dynrubin_compress, + .decompress = &jffs2_dynrubin_decompress, +#ifdef JFFS2_DYNRUBIN_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int jffs2_dynrubin_init(void) +{ + return jffs2_register_compressor(&jffs2_dynrubin_comp); +} + +void jffs2_dynrubin_exit(void) +{ + jffs2_unregister_compressor(&jffs2_dynrubin_comp); +} diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c new file mode 100644 index 00000000..5a001020 --- /dev/null +++ b/fs/jffs2/compr_zlib.c @@ -0,0 +1,218 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#if !defined(__KERNEL__) && !defined(__ECOS) +#error "The userspace support got too messy and was removed. Update your mkfs.jffs2" +#endif + +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + + /* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? 
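The "plan" above can be seen in isolation with ordinary user-space zlib: deflate most of the data while holding back a small tail of the output buffer, then release that tail and finish the stream with Z_FINISH. The sketch below is a simplified single-pass version of that idea (the kernel code further down loops until all input is consumed) and uses the stock zlib API (deflateInit/deflate/deflateEnd) rather than the kernel's zlib_ wrappers; compress_with_reserve is an illustrative name.

/* build with: cc sketch.c -lz */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define STREAM_END_SPACE 12	/* bytes held back for the final Z_FINISH */

static int compress_with_reserve(const unsigned char *in, unsigned int inlen,
				 unsigned char *out, unsigned int outlen)
{
	z_stream strm;
	int ret;

	memset(&strm, 0, sizeof(strm));
	if (deflateInit(&strm, 3) != Z_OK)
		return -1;

	strm.next_in = (unsigned char *)in;
	strm.avail_in = inlen;
	strm.next_out = out;
	strm.avail_out = outlen - STREAM_END_SPACE;	/* hold back the tail */

	ret = deflate(&strm, Z_PARTIAL_FLUSH);
	if (ret == Z_OK) {
		strm.avail_out += STREAM_END_SPACE;	/* now release it */
		strm.avail_in = 0;
		ret = deflate(&strm, Z_FINISH);
	}
	deflateEnd(&strm);

	return ret == Z_STREAM_END ? (int)strm.total_out : -1;
}

int main(void)
{
	unsigned char in[4096], out[4096];
	int n;

	memset(in, 'x', sizeof(in));
	n = compress_with_reserve(in, sizeof(in), out, sizeof(out));
	printf("compressed to %d bytes\n", n);
	return 0;
}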
+ */ +#define STREAM_END_SPACE 12 + +static DEFINE_MUTEX(deflate_mutex); +static DEFINE_MUTEX(inflate_mutex); +static z_stream inf_strm, def_strm; + +#ifdef __KERNEL__ /* Linux-only */ +#include +#include +#include + +static int __init alloc_workspaces(void) +{ + def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS, + MAX_MEM_LEVEL)); + if (!def_strm.workspace) { + printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)); + return -ENOMEM; + } + D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL))); + inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!inf_strm.workspace) { + printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize()); + vfree(def_strm.workspace); + return -ENOMEM; + } + D1(printk(KERN_DEBUG "Allocated %d bytes for inflate workspace\n", zlib_inflate_workspacesize())); + return 0; +} + +static void free_workspaces(void) +{ + vfree(def_strm.workspace); + vfree(inf_strm.workspace); +} +#else +#define alloc_workspaces() (0) +#define free_workspaces() do { } while(0) +#endif /* __KERNEL__ */ + +static int jffs2_zlib_compress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t *sourcelen, uint32_t *dstlen) +{ + int ret; + + if (*dstlen <= STREAM_END_SPACE) + return -1; + + mutex_lock(&deflate_mutex); + + if (Z_OK != zlib_deflateInit(&def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + mutex_unlock(&deflate_mutex); + return -1; + } + + def_strm.next_in = data_in; + def_strm.total_in = 0; + + def_strm.next_out = cpage_out; + def_strm.total_out = 0; + + while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { + def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); + def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out); + D1(printk(KERN_DEBUG "calling deflate with avail_in %d, avail_out %d\n", + def_strm.avail_in, def_strm.avail_out)); + ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); + D1(printk(KERN_DEBUG "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", + def_strm.avail_in, def_strm.avail_out, def_strm.total_in, def_strm.total_out)); + if (ret != Z_OK) { + D1(printk(KERN_DEBUG "deflate in loop returned %d\n", ret)); + zlib_deflateEnd(&def_strm); + mutex_unlock(&deflate_mutex); + return -1; + } + } + def_strm.avail_out += STREAM_END_SPACE; + def_strm.avail_in = 0; + ret = zlib_deflate(&def_strm, Z_FINISH); + zlib_deflateEnd(&def_strm); + + if (ret != Z_STREAM_END) { + D1(printk(KERN_DEBUG "final deflate returned %d\n", ret)); + ret = -1; + goto out; + } + + if (def_strm.total_out >= def_strm.total_in) { + D1(printk(KERN_DEBUG "zlib compressed %ld bytes into %ld; failing\n", + def_strm.total_in, def_strm.total_out)); + ret = -1; + goto out; + } + + D1(printk(KERN_DEBUG "zlib compressed %ld bytes into %ld\n", + def_strm.total_in, def_strm.total_out)); + + *dstlen = def_strm.total_out; + *sourcelen = def_strm.total_in; + ret = 0; + out: + mutex_unlock(&deflate_mutex); + return ret; +} + +static int jffs2_zlib_decompress(unsigned char *data_in, + unsigned char *cpage_out, + uint32_t srclen, uint32_t destlen) +{ + int ret; + int wbits = MAX_WBITS; + + mutex_lock(&inflate_mutex); + + inf_strm.next_in = data_in; + inf_strm.avail_in = srclen; + inf_strm.total_in = 0; + + inf_strm.next_out = cpage_out; + inf_strm.avail_out = destlen; + 
inf_strm.total_out = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + D2(printk(KERN_DEBUG "inflate skipping adler32\n")); + wbits = -((data_in[0] >> 4) + 8); + inf_strm.next_in += 2; + inf_strm.avail_in -= 2; + } else { + /* Let this remain D1 for now -- it should never happen */ + D1(printk(KERN_DEBUG "inflate not skipping adler32\n")); + } + + + if (Z_OK != zlib_inflateInit2(&inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + mutex_unlock(&inflate_mutex); + return 1; + } + + while((ret = zlib_inflate(&inf_strm, Z_FINISH)) == Z_OK) + ; + if (ret != Z_STREAM_END) { + printk(KERN_NOTICE "inflate returned %d\n", ret); + } + zlib_inflateEnd(&inf_strm); + mutex_unlock(&inflate_mutex); + return 0; +} + +static struct jffs2_compressor jffs2_zlib_comp = { + .priority = JFFS2_ZLIB_PRIORITY, + .name = "zlib", + .compr = JFFS2_COMPR_ZLIB, + .compress = &jffs2_zlib_compress, + .decompress = &jffs2_zlib_decompress, +#ifdef JFFS2_ZLIB_DISABLED + .disabled = 1, +#else + .disabled = 0, +#endif +}; + +int __init jffs2_zlib_init(void) +{ + int ret; + + ret = alloc_workspaces(); + if (ret) + return ret; + + ret = jffs2_register_compressor(&jffs2_zlib_comp); + if (ret) + free_workspaces(); + + return ret; +} + +void jffs2_zlib_exit(void) +{ + jffs2_unregister_compressor(&jffs2_zlib_comp); + free_workspaces(); +} diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c new file mode 100644 index 00000000..e0b76c87 --- /dev/null +++ b/fs/jffs2/debug.c @@ -0,0 +1,860 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "debug.h" + +#ifdef JFFS2_DBG_SANITY_CHECKS + +void +__jffs2_dbg_acct_sanity_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + if (unlikely(jeb && jeb->used_size + jeb->dirty_size + + jeb->free_size + jeb->wasted_size + + jeb->unchecked_size != c->sector_size)) { + JFFS2_ERROR("eeep, space accounting for block at 0x%08x is screwed.\n", jeb->offset); + JFFS2_ERROR("free %#08x + dirty %#08x + used %#08x + wasted %#08x + unchecked %#08x != total %#08x.\n", + jeb->free_size, jeb->dirty_size, jeb->used_size, + jeb->wasted_size, jeb->unchecked_size, c->sector_size); + BUG(); + } + + if (unlikely(c->used_size + c->dirty_size + c->free_size + c->erasing_size + c->bad_size + + c->wasted_size + c->unchecked_size != c->flash_size)) { + JFFS2_ERROR("eeep, space accounting superblock info is screwed.\n"); + JFFS2_ERROR("free %#08x + dirty %#08x + used %#08x + erasing %#08x + bad %#08x + wasted %#08x + unchecked %#08x != total %#08x.\n", + c->free_size, c->dirty_size, c->used_size, c->erasing_size, c->bad_size, + c->wasted_size, c->unchecked_size, c->flash_size); + BUG(); + } +} + +void +__jffs2_dbg_acct_sanity_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); +} + +#endif /* JFFS2_DBG_SANITY_CHECKS */ + +#ifdef JFFS2_DBG_PARANOIA_CHECKS +/* + * Check the fragtree. 
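The fragtree checks in this file rely on one core invariant, also flagged by the dump routine further down: fragments must tile the file contiguously, each starting exactly where the previous one ended. The standalone sketch below (toy_frag and frags_contiguous are hypothetical; the real tree is a red-black tree of jffs2_node_frag, not an array) illustrates just that check.

#include <stdint.h>
#include <stdio.h>

struct toy_frag {
	uint32_t ofs;
	uint32_t size;
};

/* Return 1 if the fragments cover the file with no gaps or overlaps. */
static int frags_contiguous(const struct toy_frag *frags, int n)
{
	uint32_t lastofs = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (frags[i].ofs != lastofs)
			return 0;
		lastofs = frags[i].ofs + frags[i].size;
	}
	return 1;
}

int main(void)
{
	struct toy_frag ok[]  = { { 0, 4096 }, { 4096, 1024 } };
	struct toy_frag bad[] = { { 0, 4096 }, { 8192, 1024 } };	/* hole */

	printf("%d %d\n", frags_contiguous(ok, 2), frags_contiguous(bad, 2));
	return 0;
}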
+ */ +void +__jffs2_dbg_fragtree_paranoia_check(struct jffs2_inode_info *f) +{ + mutex_lock(&f->sem); + __jffs2_dbg_fragtree_paranoia_check_nolock(f); + mutex_unlock(&f->sem); +} + +void +__jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f) +{ + struct jffs2_node_frag *frag; + int bitched = 0; + + for (frag = frag_first(&f->fragtree); frag; frag = frag_next(frag)) { + struct jffs2_full_dnode *fn = frag->node; + + if (!fn || !fn->raw) + continue; + + if (ref_flags(fn->raw) == REF_PRISTINE) { + if (fn->frags > 1) { + JFFS2_ERROR("REF_PRISTINE node at 0x%08x had %d frags. Tell dwmw2.\n", + ref_offset(fn->raw), fn->frags); + bitched = 1; + } + + /* A hole node which isn't multi-page should be garbage-collected + and merged anyway, so we just check for the frag size here, + rather than mucking around with actually reading the node + and checking the compression type, which is the real way + to tell a hole node. */ + if (frag->ofs & (PAGE_CACHE_SIZE-1) && frag_prev(frag) + && frag_prev(frag)->size < PAGE_CACHE_SIZE && frag_prev(frag)->node) { + JFFS2_ERROR("REF_PRISTINE node at 0x%08x had a previous non-hole frag in the same page. Tell dwmw2.\n", + ref_offset(fn->raw)); + bitched = 1; + } + + if ((frag->ofs+frag->size) & (PAGE_CACHE_SIZE-1) && frag_next(frag) + && frag_next(frag)->size < PAGE_CACHE_SIZE && frag_next(frag)->node) { + JFFS2_ERROR("REF_PRISTINE node at 0x%08x (%08x-%08x) had a following non-hole frag in the same page. Tell dwmw2.\n", + ref_offset(fn->raw), frag->ofs, frag->ofs+frag->size); + bitched = 1; + } + } + } + + if (bitched) { + JFFS2_ERROR("fragtree is corrupted.\n"); + __jffs2_dbg_dump_fragtree_nolock(f); + BUG(); + } +} + +/* + * Check if the flash contains all 0xFF before we start writing. + */ +void +__jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c, + uint32_t ofs, int len) +{ + size_t retlen; + int ret, i; + unsigned char *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return; + + ret = jffs2_flash_read(c, ofs, len, &retlen, buf); + if (ret || (retlen != len)) { + JFFS2_WARNING("read %d bytes failed or short. ret %d, retlen %zd.\n", + len, ret, retlen); + kfree(buf); + return; + } + + ret = 0; + for (i = 0; i < len; i++) + if (buf[i] != 0xff) + ret = 1; + + if (ret) { + JFFS2_ERROR("argh, about to write node to %#08x on flash, but there are data already there. 
The first corrupted byte is at %#08x offset.\n", + ofs, ofs + i); + __jffs2_dbg_dump_buffer(buf, len, ofs); + kfree(buf); + BUG(); + } + + kfree(buf); +} + +void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *jeb; + uint32_t free = 0, dirty = 0, used = 0, wasted = 0, + erasing = 0, bad = 0, unchecked = 0; + int nr_counted = 0; + int dump = 0; + + if (c->gcblock) { + nr_counted++; + free += c->gcblock->free_size; + dirty += c->gcblock->dirty_size; + used += c->gcblock->used_size; + wasted += c->gcblock->wasted_size; + unchecked += c->gcblock->unchecked_size; + } + if (c->nextblock) { + nr_counted++; + free += c->nextblock->free_size; + dirty += c->nextblock->dirty_size; + used += c->nextblock->used_size; + wasted += c->nextblock->wasted_size; + unchecked += c->nextblock->unchecked_size; + } + list_for_each_entry(jeb, &c->clean_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->very_dirty_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->dirty_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erasable_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erasable_pending_wbuf_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erase_pending_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->free_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->bad_used_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + + list_for_each_entry(jeb, &c->erasing_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->erase_checking_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->erase_complete_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->bad_list, list) { + nr_counted++; + bad += c->sector_size; + } + +#define check(sz) \ + if (sz != c->sz##_size) { \ + printk(KERN_WARNING #sz "_size mismatch counted 0x%x, c->" #sz "_size 0x%x\n", \ + sz, c->sz##_size); \ + dump = 1; \ + } + check(free); + check(dirty); + check(used); + check(wasted); + check(unchecked); + check(bad); + check(erasing); +#undef check + + if (nr_counted != c->nr_blocks) { + printk(KERN_WARNING "%s counted only 0x%x blocks of 0x%x. 
Where are the others?\n", + __func__, nr_counted, c->nr_blocks); + dump = 1; + } + + if (dump) { + __jffs2_dbg_dump_block_lists_nolock(c); + BUG(); + } +} + +/* + * Check the space accounting and node_ref list correctness for the JFFS2 erasable block 'jeb'. + */ +void +__jffs2_dbg_acct_paranoia_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + uint32_t my_used_size = 0; + uint32_t my_unchecked_size = 0; + uint32_t my_dirty_size = 0; + struct jffs2_raw_node_ref *ref2 = jeb->first_node; + + while (ref2) { + uint32_t totlen = ref_totlen(c, jeb, ref2); + + if (ref_offset(ref2) < jeb->offset || + ref_offset(ref2) > jeb->offset + c->sector_size) { + JFFS2_ERROR("node_ref %#08x shouldn't be in block at %#08x.\n", + ref_offset(ref2), jeb->offset); + goto error; + + } + if (ref_flags(ref2) == REF_UNCHECKED) + my_unchecked_size += totlen; + else if (!ref_obsolete(ref2)) + my_used_size += totlen; + else + my_dirty_size += totlen; + + if ((!ref_next(ref2)) != (ref2 == jeb->last_node)) { + JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next at %#08x (mem %p), last_node is at %#08x (mem %p).\n", + ref_offset(ref2), ref2, ref_offset(ref_next(ref2)), ref_next(ref2), + ref_offset(jeb->last_node), jeb->last_node); + goto error; + } + ref2 = ref_next(ref2); + } + + if (my_used_size != jeb->used_size) { + JFFS2_ERROR("Calculated used size %#08x != stored used size %#08x.\n", + my_used_size, jeb->used_size); + goto error; + } + + if (my_unchecked_size != jeb->unchecked_size) { + JFFS2_ERROR("Calculated unchecked size %#08x != stored unchecked size %#08x.\n", + my_unchecked_size, jeb->unchecked_size); + goto error; + } + +#if 0 + /* This should work when we implement ref->__totlen elemination */ + if (my_dirty_size != jeb->dirty_size + jeb->wasted_size) { + JFFS2_ERROR("Calculated dirty+wasted size %#08x != stored dirty + wasted size %#08x\n", + my_dirty_size, jeb->dirty_size + jeb->wasted_size); + goto error; + } + + if (jeb->free_size == 0 + && my_used_size + my_unchecked_size + my_dirty_size != c->sector_size) { + JFFS2_ERROR("The sum of all nodes in block (%#x) != size of block (%#x)\n", + my_used_size + my_unchecked_size + my_dirty_size, + c->sector_size); + goto error; + } +#endif + + if (!(c->flags & (JFFS2_SB_FLAG_BUILDING|JFFS2_SB_FLAG_SCANNING))) + __jffs2_dbg_superblock_counts(c); + + return; + +error: + __jffs2_dbg_dump_node_refs_nolock(c, jeb); + __jffs2_dbg_dump_jeb_nolock(jeb); + __jffs2_dbg_dump_block_lists_nolock(c); + BUG(); + +} +#endif /* JFFS2_DBG_PARANOIA_CHECKS */ + +#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS) +/* + * Dump the node_refs of the 'jeb' JFFS2 eraseblock. 
+ */ +void +__jffs2_dbg_dump_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_dump_node_refs_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + struct jffs2_raw_node_ref *ref; + int i = 0; + + printk(JFFS2_DBG_MSG_PREFIX " Dump node_refs of the eraseblock %#08x\n", jeb->offset); + if (!jeb->first_node) { + printk(JFFS2_DBG_MSG_PREFIX " no nodes in the eraseblock %#08x\n", jeb->offset); + return; + } + + printk(JFFS2_DBG); + for (ref = jeb->first_node; ; ref = ref_next(ref)) { + printk("%#08x", ref_offset(ref)); +#ifdef TEST_TOTLEN + printk("(%x)", ref->__totlen); +#endif + if (ref_next(ref)) + printk("->"); + else + break; + if (++i == 4) { + i = 0; + printk("\n" JFFS2_DBG); + } + } + printk("\n"); +} + +/* + * Dump an eraseblock's space accounting. + */ +void +__jffs2_dbg_dump_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_dump_jeb_nolock(jeb); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_dump_jeb_nolock(struct jffs2_eraseblock *jeb) +{ + if (!jeb) + return; + + printk(JFFS2_DBG_MSG_PREFIX " dump space accounting for the eraseblock at %#08x:\n", + jeb->offset); + + printk(JFFS2_DBG "used_size: %#08x\n", jeb->used_size); + printk(JFFS2_DBG "dirty_size: %#08x\n", jeb->dirty_size); + printk(JFFS2_DBG "wasted_size: %#08x\n", jeb->wasted_size); + printk(JFFS2_DBG "unchecked_size: %#08x\n", jeb->unchecked_size); + printk(JFFS2_DBG "free_size: %#08x\n", jeb->free_size); +} + +void +__jffs2_dbg_dump_block_lists(struct jffs2_sb_info *c) +{ + spin_lock(&c->erase_completion_lock); + __jffs2_dbg_dump_block_lists_nolock(c); + spin_unlock(&c->erase_completion_lock); +} + +void +__jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c) +{ + printk(JFFS2_DBG_MSG_PREFIX " dump JFFS2 blocks lists:\n"); + + printk(JFFS2_DBG "flash_size: %#08x\n", c->flash_size); + printk(JFFS2_DBG "used_size: %#08x\n", c->used_size); + printk(JFFS2_DBG "dirty_size: %#08x\n", c->dirty_size); + printk(JFFS2_DBG "wasted_size: %#08x\n", c->wasted_size); + printk(JFFS2_DBG "unchecked_size: %#08x\n", c->unchecked_size); + printk(JFFS2_DBG "free_size: %#08x\n", c->free_size); + printk(JFFS2_DBG "erasing_size: %#08x\n", c->erasing_size); + printk(JFFS2_DBG "bad_size: %#08x\n", c->bad_size); + printk(JFFS2_DBG "sector_size: %#08x\n", c->sector_size); + printk(JFFS2_DBG "jffs2_reserved_blocks size: %#08x\n", + c->sector_size * c->resv_blocks_write); + + if (c->nextblock) + printk(JFFS2_DBG "nextblock: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + c->nextblock->offset, c->nextblock->used_size, + c->nextblock->dirty_size, c->nextblock->wasted_size, + c->nextblock->unchecked_size, c->nextblock->free_size); + else + printk(JFFS2_DBG "nextblock: NULL\n"); + + if (c->gcblock) + printk(JFFS2_DBG "gcblock: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + c->gcblock->offset, c->gcblock->used_size, c->gcblock->dirty_size, + c->gcblock->wasted_size, c->gcblock->unchecked_size, c->gcblock->free_size); + else + printk(JFFS2_DBG "gcblock: NULL\n"); + + if (list_empty(&c->clean_list)) { + printk(JFFS2_DBG "clean_list: empty\n"); + } else { + struct list_head *this; + int numblocks = 0; + uint32_t dirty = 0; + + list_for_each(this, &c->clean_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct 
jffs2_eraseblock, list); + numblocks ++; + dirty += jeb->wasted_size; + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "clean_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + + printk (JFFS2_DBG "Contains %d blocks with total wasted size %u, average wasted size: %u\n", + numblocks, dirty, dirty / numblocks); + } + + if (list_empty(&c->very_dirty_list)) { + printk(JFFS2_DBG "very_dirty_list: empty\n"); + } else { + struct list_head *this; + int numblocks = 0; + uint32_t dirty = 0; + + list_for_each(this, &c->very_dirty_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + numblocks ++; + dirty += jeb->dirty_size; + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "very_dirty_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + + printk (JFFS2_DBG "Contains %d blocks with total dirty size %u, average dirty size: %u\n", + numblocks, dirty, dirty / numblocks); + } + + if (list_empty(&c->dirty_list)) { + printk(JFFS2_DBG "dirty_list: empty\n"); + } else { + struct list_head *this; + int numblocks = 0; + uint32_t dirty = 0; + + list_for_each(this, &c->dirty_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + numblocks ++; + dirty += jeb->dirty_size; + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "dirty_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + + printk (JFFS2_DBG "contains %d blocks with total dirty size %u, average dirty size: %u\n", + numblocks, dirty, dirty / numblocks); + } + + if (list_empty(&c->erasable_list)) { + printk(JFFS2_DBG "erasable_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erasable_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erasable_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->erasing_list)) { + printk(JFFS2_DBG "erasing_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erasing_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erasing_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + if (list_empty(&c->erase_checking_list)) { + printk(JFFS2_DBG "erase_checking_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erase_checking_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && 
jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erase_checking_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->erase_pending_list)) { + printk(JFFS2_DBG "erase_pending_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erase_pending_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erase_pending_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->erasable_pending_wbuf_list)) { + printk(JFFS2_DBG "erasable_pending_wbuf_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erasable_pending_wbuf_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erasable_pending_wbuf_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->free_list)) { + printk(JFFS2_DBG "free_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->free_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "free_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->bad_list)) { + printk(JFFS2_DBG "bad_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->bad_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "bad_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } + + if (list_empty(&c->bad_used_list)) { + printk(JFFS2_DBG "bad_used_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->bad_used_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "bad_used_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } +} + +void +__jffs2_dbg_dump_fragtree(struct jffs2_inode_info *f) +{ + mutex_lock(&f->sem); + jffs2_dbg_dump_fragtree_nolock(f); + mutex_unlock(&f->sem); +} + +void +__jffs2_dbg_dump_fragtree_nolock(struct jffs2_inode_info *f) +{ + struct jffs2_node_frag *this = frag_first(&f->fragtree); + uint32_t lastofs = 0; + int buggy = 0; + + printk(JFFS2_DBG_MSG_PREFIX " dump fragtree of ino #%u\n", f->inocache->ino); + 
while(this) { + if (this->node) + printk(JFFS2_DBG "frag %#04x-%#04x: %#08x(%d) on flash (*%p), left (%p), right (%p), parent (%p)\n", + this->ofs, this->ofs+this->size, ref_offset(this->node->raw), + ref_flags(this->node->raw), this, frag_left(this), frag_right(this), + frag_parent(this)); + else + printk(JFFS2_DBG "frag %#04x-%#04x: hole (*%p). left (%p), right (%p), parent (%p)\n", + this->ofs, this->ofs+this->size, this, frag_left(this), + frag_right(this), frag_parent(this)); + if (this->ofs != lastofs) + buggy = 1; + lastofs = this->ofs + this->size; + this = frag_next(this); + } + + if (f->metadata) + printk(JFFS2_DBG "metadata at 0x%08x\n", ref_offset(f->metadata->raw)); + + if (buggy) { + JFFS2_ERROR("frag tree got a hole in it.\n"); + BUG(); + } +} + +#define JFFS2_BUFDUMP_BYTES_PER_LINE 32 +void +__jffs2_dbg_dump_buffer(unsigned char *buf, int len, uint32_t offs) +{ + int skip; + int i; + + printk(JFFS2_DBG_MSG_PREFIX " dump from offset %#08x to offset %#08x (%x bytes).\n", + offs, offs + len, len); + i = skip = offs % JFFS2_BUFDUMP_BYTES_PER_LINE; + offs = offs & ~(JFFS2_BUFDUMP_BYTES_PER_LINE - 1); + + if (skip != 0) + printk(JFFS2_DBG "%#08x: ", offs); + + while (skip--) + printk(" "); + + while (i < len) { + if ((i % JFFS2_BUFDUMP_BYTES_PER_LINE) == 0 && i != len -1) { + if (i != 0) + printk("\n"); + offs += JFFS2_BUFDUMP_BYTES_PER_LINE; + printk(JFFS2_DBG "%0#8x: ", offs); + } + + printk("%02x ", buf[i]); + + i += 1; + } + + printk("\n"); +} + +/* + * Dump a JFFS2 node. + */ +void +__jffs2_dbg_dump_node(struct jffs2_sb_info *c, uint32_t ofs) +{ + union jffs2_node_union node; + int len = sizeof(union jffs2_node_union); + size_t retlen; + uint32_t crc; + int ret; + + printk(JFFS2_DBG_MSG_PREFIX " dump node at offset %#08x.\n", ofs); + + ret = jffs2_flash_read(c, ofs, len, &retlen, (unsigned char *)&node); + if (ret || (retlen != len)) { + JFFS2_ERROR("read %d bytes failed or short. 
ret %d, retlen %zd.\n", + len, ret, retlen); + return; + } + + printk(JFFS2_DBG "magic:\t%#04x\n", je16_to_cpu(node.u.magic)); + printk(JFFS2_DBG "nodetype:\t%#04x\n", je16_to_cpu(node.u.nodetype)); + printk(JFFS2_DBG "totlen:\t%#08x\n", je32_to_cpu(node.u.totlen)); + printk(JFFS2_DBG "hdr_crc:\t%#08x\n", je32_to_cpu(node.u.hdr_crc)); + + crc = crc32(0, &node.u, sizeof(node.u) - 4); + if (crc != je32_to_cpu(node.u.hdr_crc)) { + JFFS2_ERROR("wrong common header CRC.\n"); + return; + } + + if (je16_to_cpu(node.u.magic) != JFFS2_MAGIC_BITMASK && + je16_to_cpu(node.u.magic) != JFFS2_OLD_MAGIC_BITMASK) + { + JFFS2_ERROR("wrong node magic: %#04x instead of %#04x.\n", + je16_to_cpu(node.u.magic), JFFS2_MAGIC_BITMASK); + return; + } + + switch(je16_to_cpu(node.u.nodetype)) { + + case JFFS2_NODETYPE_INODE: + + printk(JFFS2_DBG "the node is inode node\n"); + printk(JFFS2_DBG "ino:\t%#08x\n", je32_to_cpu(node.i.ino)); + printk(JFFS2_DBG "version:\t%#08x\n", je32_to_cpu(node.i.version)); + printk(JFFS2_DBG "mode:\t%#08x\n", node.i.mode.m); + printk(JFFS2_DBG "uid:\t%#04x\n", je16_to_cpu(node.i.uid)); + printk(JFFS2_DBG "gid:\t%#04x\n", je16_to_cpu(node.i.gid)); + printk(JFFS2_DBG "isize:\t%#08x\n", je32_to_cpu(node.i.isize)); + printk(JFFS2_DBG "atime:\t%#08x\n", je32_to_cpu(node.i.atime)); + printk(JFFS2_DBG "mtime:\t%#08x\n", je32_to_cpu(node.i.mtime)); + printk(JFFS2_DBG "ctime:\t%#08x\n", je32_to_cpu(node.i.ctime)); + printk(JFFS2_DBG "offset:\t%#08x\n", je32_to_cpu(node.i.offset)); + printk(JFFS2_DBG "csize:\t%#08x\n", je32_to_cpu(node.i.csize)); + printk(JFFS2_DBG "dsize:\t%#08x\n", je32_to_cpu(node.i.dsize)); + printk(JFFS2_DBG "compr:\t%#02x\n", node.i.compr); + printk(JFFS2_DBG "usercompr:\t%#02x\n", node.i.usercompr); + printk(JFFS2_DBG "flags:\t%#04x\n", je16_to_cpu(node.i.flags)); + printk(JFFS2_DBG "data_crc:\t%#08x\n", je32_to_cpu(node.i.data_crc)); + printk(JFFS2_DBG "node_crc:\t%#08x\n", je32_to_cpu(node.i.node_crc)); + + crc = crc32(0, &node.i, sizeof(node.i) - 8); + if (crc != je32_to_cpu(node.i.node_crc)) { + JFFS2_ERROR("wrong node header CRC.\n"); + return; + } + break; + + case JFFS2_NODETYPE_DIRENT: + + printk(JFFS2_DBG "the node is dirent node\n"); + printk(JFFS2_DBG "pino:\t%#08x\n", je32_to_cpu(node.d.pino)); + printk(JFFS2_DBG "version:\t%#08x\n", je32_to_cpu(node.d.version)); + printk(JFFS2_DBG "ino:\t%#08x\n", je32_to_cpu(node.d.ino)); + printk(JFFS2_DBG "mctime:\t%#08x\n", je32_to_cpu(node.d.mctime)); + printk(JFFS2_DBG "nsize:\t%#02x\n", node.d.nsize); + printk(JFFS2_DBG "type:\t%#02x\n", node.d.type); + printk(JFFS2_DBG "node_crc:\t%#08x\n", je32_to_cpu(node.d.node_crc)); + printk(JFFS2_DBG "name_crc:\t%#08x\n", je32_to_cpu(node.d.name_crc)); + + node.d.name[node.d.nsize] = '\0'; + printk(JFFS2_DBG "name:\t\"%s\"\n", node.d.name); + + crc = crc32(0, &node.d, sizeof(node.d) - 8); + if (crc != je32_to_cpu(node.d.node_crc)) { + JFFS2_ERROR("wrong node header CRC.\n"); + return; + } + break; + + default: + printk(JFFS2_DBG "node type is unknown\n"); + break; + } +} +#endif /* JFFS2_DBG_DUMPS || JFFS2_DBG_PARANOIA_CHECKS */ diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h new file mode 100644 index 00000000..c4f8eef5 --- /dev/null +++ b/fs/jffs2/debug.h @@ -0,0 +1,291 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
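The node dump above validates a node in two steps: check the magic word, then recompute crc32() over the common header minus its last four bytes (the hdr_crc field itself) and compare. The sketch below reproduces that shape in standalone C. It is a model under stated assumptions, not kernel code: crc32_le() here is a plain reflected CRC-32 with the caller-supplied seed and no final inversion (the convention the kernel crc32() calls above use), the struct assumes a little-endian host with no padding, and the 0x2003 cleanmarker node type is taken from the JFFS2 on-media format rather than from this patch.

#include <stdio.h>
#include <stdint.h>

/* Minimal little-endian model of the JFFS2 common node header. */
struct unknown_node {
	uint16_t magic;		/* 0x1985 */
	uint16_t nodetype;
	uint32_t totlen;
	uint32_t hdr_crc;	/* CRC over the first 8 bytes */
};

/* Reflected CRC-32 (poly 0xEDB88320), seed passed in, no final xor. */
static uint32_t crc32_le(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
	}
	return crc;
}

int main(void)
{
	struct unknown_node n = {
		.magic = 0x1985,
		.nodetype = 0x2003,	/* cleanmarker (assumed constant) */
		.totlen = 12,
	};

	/* hdr_crc covers everything except the hdr_crc word itself. */
	n.hdr_crc = crc32_le(0, &n, sizeof(n) - 4);

	uint32_t check = crc32_le(0, &n, sizeof(n) - 4);
	printf("magic %#06x nodetype %#06x hdr_crc %#010x -> %s\n",
	       n.magic, n.nodetype, n.hdr_crc,
	       check == n.hdr_crc ? "OK" : "wrong common header CRC");
	return 0;
}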
+ * + */ + +#ifndef _JFFS2_DEBUG_H_ +#define _JFFS2_DEBUG_H_ + +#include + +#ifndef CONFIG_JFFS2_FS_DEBUG +#define CONFIG_JFFS2_FS_DEBUG 0 +#endif + +#if CONFIG_JFFS2_FS_DEBUG > 0 +/* Enable "paranoia" checks and dumps */ +#define JFFS2_DBG_PARANOIA_CHECKS +#define JFFS2_DBG_DUMPS + +/* + * By defining/undefining the below macros one may select debugging messages + * fro specific JFFS2 subsystems. + */ +#define JFFS2_DBG_READINODE_MESSAGES +#define JFFS2_DBG_FRAGTREE_MESSAGES +#define JFFS2_DBG_DENTLIST_MESSAGES +#define JFFS2_DBG_NODEREF_MESSAGES +#define JFFS2_DBG_INOCACHE_MESSAGES +#define JFFS2_DBG_SUMMARY_MESSAGES +#define JFFS2_DBG_FSBUILD_MESSAGES +#endif + +#if CONFIG_JFFS2_FS_DEBUG > 1 +#define JFFS2_DBG_FRAGTREE2_MESSAGES +#define JFFS2_DBG_READINODE2_MESSAGES +#define JFFS2_DBG_MEMALLOC_MESSAGES +#endif + +/* Sanity checks are supposed to be light-weight and enabled by default */ +#define JFFS2_DBG_SANITY_CHECKS + +/* + * Dx() are mainly used for debugging messages, they must go away and be + * superseded by nicer dbg_xxx() macros... + */ +#if CONFIG_JFFS2_FS_DEBUG > 0 +#define D1(x) x +#else +#define D1(x) +#endif + +#if CONFIG_JFFS2_FS_DEBUG > 1 +#define D2(x) x +#else +#define D2(x) +#endif + +/* The prefixes of JFFS2 messages */ +#define JFFS2_DBG_PREFIX "[JFFS2 DBG]" +#define JFFS2_ERR_PREFIX "JFFS2 error:" +#define JFFS2_WARN_PREFIX "JFFS2 warning:" +#define JFFS2_NOTICE_PREFIX "JFFS2 notice:" + +#define JFFS2_ERR KERN_ERR +#define JFFS2_WARN KERN_WARNING +#define JFFS2_NOT KERN_NOTICE +#define JFFS2_DBG KERN_DEBUG + +#define JFFS2_DBG_MSG_PREFIX JFFS2_DBG JFFS2_DBG_PREFIX +#define JFFS2_ERR_MSG_PREFIX JFFS2_ERR JFFS2_ERR_PREFIX +#define JFFS2_WARN_MSG_PREFIX JFFS2_WARN JFFS2_WARN_PREFIX +#define JFFS2_NOTICE_MSG_PREFIX JFFS2_NOT JFFS2_NOTICE_PREFIX + +/* JFFS2 message macros */ +#define JFFS2_ERROR(fmt, ...) \ + do { \ + printk(JFFS2_ERR_MSG_PREFIX \ + " (%d) %s: " fmt, task_pid_nr(current), \ + __func__ , ##__VA_ARGS__); \ + } while(0) + +#define JFFS2_WARNING(fmt, ...) \ + do { \ + printk(JFFS2_WARN_MSG_PREFIX \ + " (%d) %s: " fmt, task_pid_nr(current), \ + __func__ , ##__VA_ARGS__); \ + } while(0) + +#define JFFS2_NOTICE(fmt, ...) \ + do { \ + printk(JFFS2_NOTICE_MSG_PREFIX \ + " (%d) %s: " fmt, task_pid_nr(current), \ + __func__ , ##__VA_ARGS__); \ + } while(0) + +#define JFFS2_DEBUG(fmt, ...) \ + do { \ + printk(JFFS2_DBG_MSG_PREFIX \ + " (%d) %s: " fmt, task_pid_nr(current), \ + __func__ , ##__VA_ARGS__); \ + } while(0) + +/* + * We split our debugging messages on several parts, depending on the JFFS2 + * subsystem the message belongs to. + */ +/* Read inode debugging messages */ +#ifdef JFFS2_DBG_READINODE_MESSAGES +#define dbg_readinode(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_readinode(fmt, ...) +#endif +#ifdef JFFS2_DBG_READINODE2_MESSAGES +#define dbg_readinode2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_readinode2(fmt, ...) +#endif + +/* Fragtree build debugging messages */ +#ifdef JFFS2_DBG_FRAGTREE_MESSAGES +#define dbg_fragtree(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_fragtree(fmt, ...) +#endif +#ifdef JFFS2_DBG_FRAGTREE2_MESSAGES +#define dbg_fragtree2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_fragtree2(fmt, ...) +#endif + +/* Directory entry list manilulation debugging messages */ +#ifdef JFFS2_DBG_DENTLIST_MESSAGES +#define dbg_dentlist(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_dentlist(fmt, ...) 
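The dbg_* macros above all use one compile-time pattern: if the subsystem's JFFS2_DBG_*_MESSAGES symbol is defined, the macro forwards to JFFS2_DEBUG(); otherwise it expands to nothing, so the call sites disappear from a non-debug build. A minimal standalone sketch of the same pattern, using fprintf instead of printk (all names here are invented for illustration):

#include <stdio.h>

/* Comment this out to compile the dbg_foo() call sites away entirely. */
#define FOO_DEBUG_MESSAGES

#define MY_DEBUG(fmt, ...) \
	do { \
		fprintf(stderr, "[DBG] %s: " fmt, __func__, ##__VA_ARGS__); \
	} while (0)

#ifdef FOO_DEBUG_MESSAGES
#define dbg_foo(fmt, ...) MY_DEBUG(fmt, ##__VA_ARGS__)
#else
#define dbg_foo(fmt, ...)
#endif

int main(void)
{
	dbg_foo("value is %d\n", 42);	/* vanishes if FOO_DEBUG_MESSAGES is undefined */
	return 0;
}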
+#endif + +/* Print the messages about manipulating node_refs */ +#ifdef JFFS2_DBG_NODEREF_MESSAGES +#define dbg_noderef(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_noderef(fmt, ...) +#endif + +/* Manipulations with the list of inodes (JFFS2 inocache) */ +#ifdef JFFS2_DBG_INOCACHE_MESSAGES +#define dbg_inocache(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_inocache(fmt, ...) +#endif + +/* Summary debugging messages */ +#ifdef JFFS2_DBG_SUMMARY_MESSAGES +#define dbg_summary(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_summary(fmt, ...) +#endif + +/* File system build messages */ +#ifdef JFFS2_DBG_FSBUILD_MESSAGES +#define dbg_fsbuild(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_fsbuild(fmt, ...) +#endif + +/* Watch the object allocations */ +#ifdef JFFS2_DBG_MEMALLOC_MESSAGES +#define dbg_memalloc(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_memalloc(fmt, ...) +#endif + +/* Watch the XATTR subsystem */ +#ifdef JFFS2_DBG_XATTR_MESSAGES +#define dbg_xattr(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_xattr(fmt, ...) +#endif + +/* "Sanity" checks */ +void +__jffs2_dbg_acct_sanity_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_acct_sanity_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); + +/* "Paranoia" checks */ +void +__jffs2_dbg_fragtree_paranoia_check(struct jffs2_inode_info *f); +void +__jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f); +void +__jffs2_dbg_acct_paranoia_check(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c, + uint32_t ofs, int len); + +/* "Dump" functions */ +void +__jffs2_dbg_dump_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_jeb_nolock(struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_block_lists(struct jffs2_sb_info *c); +void +__jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c); +void +__jffs2_dbg_dump_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb); +void +__jffs2_dbg_dump_fragtree(struct jffs2_inode_info *f); +void +__jffs2_dbg_dump_fragtree_nolock(struct jffs2_inode_info *f); +void +__jffs2_dbg_dump_buffer(unsigned char *buf, int len, uint32_t offs); +void +__jffs2_dbg_dump_node(struct jffs2_sb_info *c, uint32_t ofs); + +#ifdef JFFS2_DBG_PARANOIA_CHECKS +#define jffs2_dbg_fragtree_paranoia_check(f) \ + __jffs2_dbg_fragtree_paranoia_check(f) +#define jffs2_dbg_fragtree_paranoia_check_nolock(f) \ + __jffs2_dbg_fragtree_paranoia_check_nolock(f) +#define jffs2_dbg_acct_paranoia_check(c, jeb) \ + __jffs2_dbg_acct_paranoia_check(c,jeb) +#define jffs2_dbg_acct_paranoia_check_nolock(c, jeb) \ + __jffs2_dbg_acct_paranoia_check_nolock(c,jeb) +#define jffs2_dbg_prewrite_paranoia_check(c, ofs, len) \ + __jffs2_dbg_prewrite_paranoia_check(c, ofs, len) +#else +#define jffs2_dbg_fragtree_paranoia_check(f) +#define jffs2_dbg_fragtree_paranoia_check_nolock(f) +#define jffs2_dbg_acct_paranoia_check(c, jeb) +#define jffs2_dbg_acct_paranoia_check_nolock(c, jeb) +#define jffs2_dbg_prewrite_paranoia_check(c, ofs, len) +#endif /* !JFFS2_PARANOIA_CHECKS */ + +#ifdef JFFS2_DBG_DUMPS +#define jffs2_dbg_dump_jeb(c, jeb) \ + __jffs2_dbg_dump_jeb(c, jeb); +#define 
jffs2_dbg_dump_jeb_nolock(jeb) \ + __jffs2_dbg_dump_jeb_nolock(jeb); +#define jffs2_dbg_dump_block_lists(c) \ + __jffs2_dbg_dump_block_lists(c) +#define jffs2_dbg_dump_block_lists_nolock(c) \ + __jffs2_dbg_dump_block_lists_nolock(c) +#define jffs2_dbg_dump_fragtree(f) \ + __jffs2_dbg_dump_fragtree(f); +#define jffs2_dbg_dump_fragtree_nolock(f) \ + __jffs2_dbg_dump_fragtree_nolock(f); +#define jffs2_dbg_dump_buffer(buf, len, offs) \ + __jffs2_dbg_dump_buffer(*buf, len, offs); +#define jffs2_dbg_dump_node(c, ofs) \ + __jffs2_dbg_dump_node(c, ofs); +#else +#define jffs2_dbg_dump_jeb(c, jeb) +#define jffs2_dbg_dump_jeb_nolock(jeb) +#define jffs2_dbg_dump_block_lists(c) +#define jffs2_dbg_dump_block_lists_nolock(c) +#define jffs2_dbg_dump_fragtree(f) +#define jffs2_dbg_dump_fragtree_nolock(f) +#define jffs2_dbg_dump_buffer(buf, len, offs) +#define jffs2_dbg_dump_node(c, ofs) +#endif /* !JFFS2_DBG_DUMPS */ + +#ifdef JFFS2_DBG_SANITY_CHECKS +#define jffs2_dbg_acct_sanity_check(c, jeb) \ + __jffs2_dbg_acct_sanity_check(c, jeb) +#define jffs2_dbg_acct_sanity_check_nolock(c, jeb) \ + __jffs2_dbg_acct_sanity_check_nolock(c, jeb) +#else +#define jffs2_dbg_acct_sanity_check(c, jeb) +#define jffs2_dbg_acct_sanity_check_nolock(c, jeb) +#endif /* !JFFS2_DBG_SANITY_CHECKS */ + +#endif /* _JFFS2_DEBUG_H_ */ diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c new file mode 100644 index 00000000..4bca6a2e --- /dev/null +++ b/fs/jffs2/dir.c @@ -0,0 +1,873 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include "jffs2_fs_i.h" +#include "jffs2_fs_sb.h" +#include +#include "nodelist.h" + +static int jffs2_readdir (struct file *, void *, filldir_t); + +static int jffs2_create (struct inode *,struct dentry *,int, + struct nameidata *); +static struct dentry *jffs2_lookup (struct inode *,struct dentry *, + struct nameidata *); +static int jffs2_link (struct dentry *,struct inode *,struct dentry *); +static int jffs2_unlink (struct inode *,struct dentry *); +static int jffs2_symlink (struct inode *,struct dentry *,const char *); +static int jffs2_mkdir (struct inode *,struct dentry *,int); +static int jffs2_rmdir (struct inode *,struct dentry *); +static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t); +static int jffs2_rename (struct inode *, struct dentry *, + struct inode *, struct dentry *); + +const struct file_operations jffs2_dir_operations = +{ + .read = generic_read_dir, + .readdir = jffs2_readdir, + .unlocked_ioctl=jffs2_ioctl, + .fsync = jffs2_fsync, + .llseek = generic_file_llseek, +}; + + +const struct inode_operations jffs2_dir_inode_operations = +{ + .create = jffs2_create, + .lookup = jffs2_lookup, + .link = jffs2_link, + .unlink = jffs2_unlink, + .symlink = jffs2_symlink, + .mkdir = jffs2_mkdir, + .rmdir = jffs2_rmdir, + .mknod = jffs2_mknod, + .rename = jffs2_rename, + .check_acl = jffs2_check_acl, + .setattr = jffs2_setattr, + .setxattr = jffs2_setxattr, + .getxattr = jffs2_getxattr, + .listxattr = jffs2_listxattr, + .removexattr = jffs2_removexattr +}; + +/***********************************************************************/ + + +/* We keep the dirent list sorted in increasing order of name hash, + and we use the same hash function as the dentries. 
Makes this + nice and simple +*/ +static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, + struct nameidata *nd) +{ + struct jffs2_inode_info *dir_f; + struct jffs2_full_dirent *fd = NULL, *fd_list; + uint32_t ino = 0; + struct inode *inode = NULL; + + D1(printk(KERN_DEBUG "jffs2_lookup()\n")); + + if (target->d_name.len > JFFS2_MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + dir_f = JFFS2_INODE_INFO(dir_i); + + mutex_lock(&dir_f->sem); + + /* NB: The 2.2 backport will need to explicitly check for '.' and '..' here */ + for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= target->d_name.hash; fd_list = fd_list->next) { + if (fd_list->nhash == target->d_name.hash && + (!fd || fd_list->version > fd->version) && + strlen(fd_list->name) == target->d_name.len && + !strncmp(fd_list->name, target->d_name.name, target->d_name.len)) { + fd = fd_list; + } + } + if (fd) + ino = fd->ino; + mutex_unlock(&dir_f->sem); + if (ino) { + inode = jffs2_iget(dir_i->i_sb, ino); + if (IS_ERR(inode)) { + printk(KERN_WARNING "iget() failed for ino #%u\n", ino); + return ERR_CAST(inode); + } + } + + return d_splice_alias(inode, target); +} + +/***********************************************************************/ + + +static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct jffs2_inode_info *f; + struct inode *inode = filp->f_path.dentry->d_inode; + struct jffs2_full_dirent *fd; + unsigned long offset, curofs; + + D1(printk(KERN_DEBUG "jffs2_readdir() for dir_i #%lu\n", filp->f_path.dentry->d_inode->i_ino)); + + f = JFFS2_INODE_INFO(inode); + + offset = filp->f_pos; + + if (offset == 0) { + D1(printk(KERN_DEBUG "Dirent 0: \".\", ino #%lu\n", inode->i_ino)); + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + offset++; + } + if (offset == 1) { + unsigned long pino = parent_ino(filp->f_path.dentry); + D1(printk(KERN_DEBUG "Dirent 1: \"..\", ino #%lu\n", pino)); + if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0) + goto out; + offset++; + } + + curofs=1; + mutex_lock(&f->sem); + for (fd = f->dents; fd; fd = fd->next) { + + curofs++; + /* First loop: curofs = 2; offset = 2 */ + if (curofs < offset) { + D2(printk(KERN_DEBUG "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", + fd->name, fd->ino, fd->type, curofs, offset)); + continue; + } + if (!fd->ino) { + D2(printk(KERN_DEBUG "Skipping deletion dirent \"%s\"\n", fd->name)); + offset++; + continue; + } + D2(printk(KERN_DEBUG "Dirent %ld: \"%s\", ino #%u, type %d\n", offset, fd->name, fd->ino, fd->type)); + if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0) + break; + offset++; + } + mutex_unlock(&f->sem); + out: + filp->f_pos = offset; + return 0; +} + +/***********************************************************************/ + + +static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct jffs2_raw_inode *ri; + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + int ret; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + D1(printk(KERN_DEBUG "jffs2_create()\n")); + + inode = jffs2_new_inode(dir_i, mode, ri); + + if (IS_ERR(inode)) { + D1(printk(KERN_DEBUG "jffs2_new_inode() failed\n")); + jffs2_free_raw_inode(ri); + return PTR_ERR(inode); + } + + inode->i_op = &jffs2_file_inode_operations; + inode->i_fop = &jffs2_file_operations; + inode->i_mapping->a_ops = 
&jffs2_file_address_operations; + inode->i_mapping->nrpages = 0; + + f = JFFS2_INODE_INFO(inode); + dir_f = JFFS2_INODE_INFO(dir_i); + + /* jffs2_do_create() will want to lock it, _after_ reserving + space and taking c-alloc_sem. If we keep it locked here, + lockdep gets unhappy (although it's a false positive; + nothing else will be looking at this inode yet so there's + no chance of AB-BA deadlock involving its f->sem). */ + mutex_unlock(&f->sem); + + ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name); + if (ret) + goto fail; + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + + jffs2_free_raw_inode(ri); + + D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", + inode->i_ino, inode->i_mode, inode->i_nlink, + f->inocache->pino_nlink, inode->i_mapping->nrpages)); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + + fail: + iget_failed(inode); + jffs2_free_raw_inode(ri); + return ret; +} + +/***********************************************************************/ + + +static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); + struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(dentry->d_inode); + int ret; + uint32_t now = get_seconds(); + + ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, + dentry->d_name.len, dead_f, now); + if (dead_f->inocache) + dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink; + if (!ret) + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + return ret; +} +/***********************************************************************/ + + +static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dentry->d_inode->i_sb); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); + int ret; + uint8_t type; + uint32_t now; + + /* Don't let people make hard links to bad inodes. */ + if (!f->inocache) + return -EIO; + + if (S_ISDIR(old_dentry->d_inode->i_mode)) + return -EPERM; + + /* XXX: This is ugly */ + type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + if (!type) type = DT_REG; + + now = get_seconds(); + ret = jffs2_do_link(c, dir_f, f->inocache->ino, type, dentry->d_name.name, dentry->d_name.len, now); + + if (!ret) { + mutex_lock(&f->sem); + old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink; + mutex_unlock(&f->sem); + d_instantiate(dentry, old_dentry->d_inode); + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + ihold(old_dentry->d_inode); + } + return ret; +} + +/***********************************************************************/ + +static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char *target) +{ + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + struct jffs2_raw_inode *ri; + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + int namelen; + uint32_t alloclen; + int ret, targetlen = strlen(target); + + /* FIXME: If you care. We'd need to use frags for the target + if it grows much more than this */ + if (targetlen > 254) + return -ENAMETOOLONG; + + ri = jffs2_alloc_raw_inode(); + + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + /* Try to reserve enough space for both node and dirent. 
+ * Just the node will do for now, though + */ + namelen = dentry->d_name.len; + ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + jffs2_free_raw_inode(ri); + return ret; + } + + inode = jffs2_new_inode(dir_i, S_IFLNK | S_IRWXUGO, ri); + + if (IS_ERR(inode)) { + jffs2_free_raw_inode(ri); + jffs2_complete_reservation(c); + return PTR_ERR(inode); + } + + inode->i_op = &jffs2_symlink_inode_operations; + + f = JFFS2_INODE_INFO(inode); + + inode->i_size = targetlen; + ri->isize = ri->dsize = ri->csize = cpu_to_je32(inode->i_size); + ri->totlen = cpu_to_je32(sizeof(*ri) + inode->i_size); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->compr = JFFS2_COMPR_NONE; + ri->data_crc = cpu_to_je32(crc32(0, target, targetlen)); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, target, targetlen, ALLOC_NORMAL); + + jffs2_free_raw_inode(ri); + + if (IS_ERR(fn)) { + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = PTR_ERR(fn); + goto fail; + } + + /* We use f->target field to store the target path. */ + f->target = kmemdup(target, targetlen + 1, GFP_KERNEL); + if (!f->target) { + printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target)); + + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + mutex_unlock(&f->sem); + + jffs2_complete_reservation(c); + + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); + if (ret) + goto fail; + + ret = jffs2_init_acl_post(inode); + if (ret) + goto fail; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) + goto fail; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + dir_f = JFFS2_INODE_INFO(dir_i); + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_i->i_ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(inode->i_ino); + rd->mctime = cpu_to_je32(get_seconds()); + rd->nsize = namelen; + rd->type = DT_LNK; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + jffs2_free_raw_dirent(rd); + mutex_unlock(&dir_f->sem); + ret = PTR_ERR(fd); + goto fail; + } + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + + jffs2_free_raw_dirent(rd); + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. 
*/ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + mutex_unlock(&dir_f->sem); + jffs2_complete_reservation(c); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + + fail: + iget_failed(inode); + return ret; +} + + +static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) +{ + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + struct jffs2_raw_inode *ri; + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + int namelen; + uint32_t alloclen; + int ret; + + mode |= S_IFDIR; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + /* Try to reserve enough space for both node and dirent. + * Just the node will do for now, though + */ + namelen = dentry->d_name.len; + ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL, + JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + jffs2_free_raw_inode(ri); + return ret; + } + + inode = jffs2_new_inode(dir_i, mode, ri); + + if (IS_ERR(inode)) { + jffs2_free_raw_inode(ri); + jffs2_complete_reservation(c); + return PTR_ERR(inode); + } + + inode->i_op = &jffs2_dir_inode_operations; + inode->i_fop = &jffs2_dir_operations; + + f = JFFS2_INODE_INFO(inode); + + /* Directories get nlink 2 at start */ + inode->i_nlink = 2; + /* but ic->pino_nlink is the parent ino# */ + f->inocache->pino_nlink = dir_i->i_ino; + + ri->data_crc = cpu_to_je32(0); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL); + + jffs2_free_raw_inode(ri); + + if (IS_ERR(fn)) { + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = PTR_ERR(fn); + goto fail; + } + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + mutex_unlock(&f->sem); + + jffs2_complete_reservation(c); + + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); + if (ret) + goto fail; + + ret = jffs2_init_acl_post(inode); + if (ret) + goto fail; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) + goto fail; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + dir_f = JFFS2_INODE_INFO(dir_i); + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_i->i_ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(inode->i_ino); + rd->mctime = cpu_to_je32(get_seconds()); + rd->nsize = namelen; + rd->type = DT_DIR; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL); + + if (IS_ERR(fd)) { + /* dirent failed to write. 
Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + jffs2_free_raw_dirent(rd); + mutex_unlock(&dir_f->sem); + ret = PTR_ERR(fd); + goto fail; + } + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + inc_nlink(dir_i); + + jffs2_free_raw_dirent(rd); + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + mutex_unlock(&dir_f->sem); + jffs2_complete_reservation(c); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + + fail: + iget_failed(inode); + return ret; +} + +static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_full_dirent *fd; + int ret; + uint32_t now = get_seconds(); + + for (fd = f->dents ; fd; fd = fd->next) { + if (fd->ino) + return -ENOTEMPTY; + } + + ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, + dentry->d_name.len, f, now); + if (!ret) { + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + clear_nlink(dentry->d_inode); + drop_nlink(dir_i); + } + return ret; +} + +static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, dev_t rdev) +{ + struct jffs2_inode_info *f, *dir_f; + struct jffs2_sb_info *c; + struct inode *inode; + struct jffs2_raw_inode *ri; + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + int namelen; + union jffs2_device_node dev; + int devlen = 0; + uint32_t alloclen; + int ret; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + c = JFFS2_SB_INFO(dir_i->i_sb); + + if (S_ISBLK(mode) || S_ISCHR(mode)) + devlen = jffs2_encode_dev(&dev, rdev); + + /* Try to reserve enough space for both node and dirent. + * Just the node will do for now, though + */ + namelen = dentry->d_name.len; + ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + jffs2_free_raw_inode(ri); + return ret; + } + + inode = jffs2_new_inode(dir_i, mode, ri); + + if (IS_ERR(inode)) { + jffs2_free_raw_inode(ri); + jffs2_complete_reservation(c); + return PTR_ERR(inode); + } + inode->i_op = &jffs2_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + + f = JFFS2_INODE_INFO(inode); + + ri->dsize = ri->csize = cpu_to_je32(devlen); + ri->totlen = cpu_to_je32(sizeof(*ri) + devlen); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->compr = JFFS2_COMPR_NONE; + ri->data_crc = cpu_to_je32(crc32(0, &dev, devlen)); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, ALLOC_NORMAL); + + jffs2_free_raw_inode(ri); + + if (IS_ERR(fn)) { + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + ret = PTR_ERR(fn); + goto fail; + } + /* No data here. 
Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + mutex_unlock(&f->sem); + + jffs2_complete_reservation(c); + + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); + if (ret) + goto fail; + + ret = jffs2_init_acl_post(inode); + if (ret) + goto fail; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) + goto fail; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + ret = -ENOMEM; + goto fail; + } + + dir_f = JFFS2_INODE_INFO(dir_i); + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_i->i_ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(inode->i_ino); + rd->mctime = cpu_to_je32(get_seconds()); + rd->nsize = namelen; + + /* XXX: This is ugly. */ + rd->type = (mode & S_IFMT) >> 12; + + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + jffs2_free_raw_dirent(rd); + mutex_unlock(&dir_f->sem); + ret = PTR_ERR(fd); + goto fail; + } + + dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + + jffs2_free_raw_dirent(rd); + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + mutex_unlock(&dir_f->sem); + jffs2_complete_reservation(c); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + return 0; + + fail: + iget_failed(inode); + return ret; +} + +static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, + struct inode *new_dir_i, struct dentry *new_dentry) +{ + int ret; + struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dir_i->i_sb); + struct jffs2_inode_info *victim_f = NULL; + uint8_t type; + uint32_t now; + + /* The VFS will check for us and prevent trying to rename a + * file over a directory and vice versa, but if it's a directory, + * the VFS can't check whether the victim is empty. The filesystem + * needs to do that for itself. + */ + if (new_dentry->d_inode) { + victim_f = JFFS2_INODE_INFO(new_dentry->d_inode); + if (S_ISDIR(new_dentry->d_inode->i_mode)) { + struct jffs2_full_dirent *fd; + + mutex_lock(&victim_f->sem); + for (fd = victim_f->dents; fd; fd = fd->next) { + if (fd->ino) { + mutex_unlock(&victim_f->sem); + return -ENOTEMPTY; + } + } + mutex_unlock(&victim_f->sem); + } + } + + /* XXX: We probably ought to alloc enough space for + both nodes at the same time. Writing the new link, + then getting -ENOSPC, is quite bad :) + */ + + /* Make a hard link */ + + /* XXX: This is ugly */ + type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + if (!type) type = DT_REG; + + now = get_seconds(); + ret = jffs2_do_link(c, JFFS2_INODE_INFO(new_dir_i), + old_dentry->d_inode->i_ino, type, + new_dentry->d_name.name, new_dentry->d_name.len, now); + + if (ret) + return ret; + + if (victim_f) { + /* There was a victim. 
Kill it off nicely */ + drop_nlink(new_dentry->d_inode); + /* Don't oops if the victim was a dirent pointing to an + inode which didn't exist. */ + if (victim_f->inocache) { + mutex_lock(&victim_f->sem); + if (S_ISDIR(new_dentry->d_inode->i_mode)) + victim_f->inocache->pino_nlink = 0; + else + victim_f->inocache->pino_nlink--; + mutex_unlock(&victim_f->sem); + } + } + + /* If it was a directory we moved, and there was no victim, + increase i_nlink on its new parent */ + if (S_ISDIR(old_dentry->d_inode->i_mode) && !victim_f) + inc_nlink(new_dir_i); + + /* Unlink the original */ + ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i), + old_dentry->d_name.name, old_dentry->d_name.len, NULL, now); + + /* We don't touch inode->i_nlink */ + + if (ret) { + /* Oh shit. We really ought to make a single node which can do both atomically */ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + mutex_lock(&f->sem); + inc_nlink(old_dentry->d_inode); + if (f->inocache && !S_ISDIR(old_dentry->d_inode->i_mode)) + f->inocache->pino_nlink++; + mutex_unlock(&f->sem); + + printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); + /* Might as well let the VFS know */ + d_instantiate(new_dentry, old_dentry->d_inode); + ihold(old_dentry->d_inode); + new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); + return ret; + } + + if (S_ISDIR(old_dentry->d_inode->i_mode)) + drop_nlink(old_dir_i); + + new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); + + return 0; +} + diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c new file mode 100644 index 00000000..e513f191 --- /dev/null +++ b/fs/jffs2/erase.c @@ -0,0 +1,502 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +struct erase_priv_struct { + struct jffs2_eraseblock *jeb; + struct jffs2_sb_info *c; +}; + +#ifndef __ECOS +static void jffs2_erase_callback(struct erase_info *); +#endif +static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset); +static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); + +static void jffs2_erase_block(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + int ret; + uint32_t bad_offset; +#ifdef __ECOS + ret = jffs2_flash_erase(c, jeb); + if (!ret) { + jffs2_erase_succeeded(c, jeb); + return; + } + bad_offset = jeb->offset; +#else /* Linux */ + struct erase_info *instr; + + D1(printk(KERN_DEBUG "jffs2_erase_block(): erase block %#08x (range %#08x-%#08x)\n", + jeb->offset, jeb->offset, jeb->offset + c->sector_size)); + instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL); + if (!instr) { + printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. 
Refiling block for later\n"); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move(&jeb->list, &c->erase_pending_list); + c->erasing_size -= c->sector_size; + c->dirty_size += c->sector_size; + jeb->dirty_size = c->sector_size; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; + } + + memset(instr, 0, sizeof(*instr)); + + instr->mtd = c->mtd; + instr->addr = jeb->offset; + instr->len = c->sector_size; + instr->callback = jffs2_erase_callback; + instr->priv = (unsigned long)(&instr[1]); + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; + + ((struct erase_priv_struct *)instr->priv)->jeb = jeb; + ((struct erase_priv_struct *)instr->priv)->c = c; + + ret = c->mtd->erase(c->mtd, instr); + if (!ret) + return; + + bad_offset = instr->fail_addr; + kfree(instr); +#endif /* __ECOS */ + + if (ret == -ENOMEM || ret == -EAGAIN) { + /* Erase failed immediately. Refile it on the list */ + D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move(&jeb->list, &c->erase_pending_list); + c->erasing_size -= c->sector_size; + c->dirty_size += c->sector_size; + jeb->dirty_size = c->sector_size; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; + } + + if (ret == -EROFS) + printk(KERN_WARNING "Erase at 0x%08x failed immediately: -EROFS. Is the sector locked?\n", jeb->offset); + else + printk(KERN_WARNING "Erase at 0x%08x failed immediately: errno %d\n", jeb->offset, ret); + + jffs2_erase_failed(c, jeb, bad_offset); +} + +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) +{ + struct jffs2_eraseblock *jeb; + int work_done = 0; + + mutex_lock(&c->erase_free_sem); + + spin_lock(&c->erase_completion_lock); + + while (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) { + + if (!list_empty(&c->erase_complete_list)) { + jeb = list_entry(c->erase_complete_list.next, struct jffs2_eraseblock, list); + list_move(&jeb->list, &c->erase_checking_list); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + jffs2_mark_erased_block(c, jeb); + + work_done++; + if (!--count) { + D1(printk(KERN_DEBUG "Count reached. 
jffs2_erase_pending_blocks leaving\n")); + goto done; + } + + } else if (!list_empty(&c->erase_pending_list)) { + jeb = list_entry(c->erase_pending_list.next, struct jffs2_eraseblock, list); + D1(printk(KERN_DEBUG "Starting erase of pending block 0x%08x\n", jeb->offset)); + list_del(&jeb->list); + c->erasing_size += c->sector_size; + c->wasted_size -= jeb->wasted_size; + c->free_size -= jeb->free_size; + c->used_size -= jeb->used_size; + c->dirty_size -= jeb->dirty_size; + jeb->wasted_size = jeb->used_size = jeb->dirty_size = jeb->free_size = 0; + jffs2_free_jeb_node_refs(c, jeb); + list_add(&jeb->list, &c->erasing_list); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + + jffs2_erase_block(c, jeb); + + } else { + BUG(); + } + + /* Be nice */ + cond_resched(); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + } + + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + done: + D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); + return work_done; +} + +static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move_tail(&jeb->list, &c->erase_complete_list); + /* Wake the GC thread to mark them clean */ + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + wake_up(&c->erase_wait); +} + +static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) +{ + /* For NAND, if the failure did not occur at the device level for a + specific physical page, don't bother updating the bad block table. */ + if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) { + /* We had a device-level failure to erase. Let's see if we've + failed too many times. */ + if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { + /* We'd like to give this block another try. */ + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + list_move(&jeb->list, &c->erase_pending_list); + c->erasing_size -= c->sector_size; + c->dirty_size += c->sector_size; + jeb->dirty_size = c->sector_size; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; + } + } + + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + c->erasing_size -= c->sector_size; + c->bad_size += c->sector_size; + list_move(&jeb->list, &c->bad_list); + c->nr_erasing_blocks--; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + wake_up(&c->erase_wait); +} + +#ifndef __ECOS +static void jffs2_erase_callback(struct erase_info *instr) +{ + struct erase_priv_struct *priv = (void *)instr->priv; + + if(instr->state != MTD_ERASE_DONE) { + printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", + (unsigned long long)instr->addr, instr->state); + jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); + } else { + jffs2_erase_succeeded(priv->c, priv->jeb); + } + kfree(instr); +} +#endif /* !__ECOS */ + +/* Hmmm. Maybe we should accept the extra space it takes and make + this a standard doubly-linked list? 
*/ +static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, + struct jffs2_raw_node_ref *ref, struct jffs2_eraseblock *jeb) +{ + struct jffs2_inode_cache *ic = NULL; + struct jffs2_raw_node_ref **prev; + + prev = &ref->next_in_ino; + + /* Walk the inode's list once, removing any nodes from this eraseblock */ + while (1) { + if (!(*prev)->next_in_ino) { + /* We're looking at the jffs2_inode_cache, which is + at the end of the linked list. Stash it and continue + from the beginning of the list */ + ic = (struct jffs2_inode_cache *)(*prev); + prev = &ic->nodes; + continue; + } + + if (SECTOR_ADDR((*prev)->flash_offset) == jeb->offset) { + /* It's in the block we're erasing */ + struct jffs2_raw_node_ref *this; + + this = *prev; + *prev = this->next_in_ino; + this->next_in_ino = NULL; + + if (this == ref) + break; + + continue; + } + /* Not to be deleted. Skip */ + prev = &((*prev)->next_in_ino); + } + + /* PARANOIA */ + if (!ic) { + JFFS2_WARNING("inode_cache/xattr_datum/xattr_ref" + " not found in remove_node_refs()!!\n"); + return; + } + + D1(printk(KERN_DEBUG "Removed nodes in range 0x%08x-0x%08x from ino #%u\n", + jeb->offset, jeb->offset + c->sector_size, ic->ino)); + + D2({ + int i=0; + struct jffs2_raw_node_ref *this; + printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n"); + + this = ic->nodes; + + printk(KERN_DEBUG); + while(this) { + printk(KERN_CONT "0x%08x(%d)->", + ref_offset(this), ref_flags(this)); + if (++i == 5) { + printk(KERN_DEBUG); + i=0; + } + this = this->next_in_ino; + } + printk(KERN_CONT "\n"); + }); + + switch (ic->class) { +#ifdef CONFIG_JFFS2_FS_XATTR + case RAWNODE_CLASS_XATTR_DATUM: + jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic); + break; + case RAWNODE_CLASS_XATTR_REF: + jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic); + break; +#endif + default: + if (ic->nodes == (void *)ic && ic->pino_nlink == 0) + jffs2_del_ino_cache(c, ic); + } +} + +void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + struct jffs2_raw_node_ref *block, *ref; + D1(printk(KERN_DEBUG "Freeing all node refs for eraseblock offset 0x%08x\n", jeb->offset)); + + block = ref = jeb->first_node; + + while (ref) { + if (ref->flash_offset == REF_LINK_NODE) { + ref = ref->next_in_ino; + jffs2_free_refblock(block); + block = ref; + continue; + } + if (ref->flash_offset != REF_EMPTY_NODE && ref->next_in_ino) + jffs2_remove_node_refs_from_ino_list(c, ref, jeb); + /* else it was a non-inode node or already removed, so don't bother */ + + ref++; + } + jeb->first_node = jeb->last_node = NULL; +} + +static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t *bad_offset) +{ + void *ebuf; + uint32_t ofs; + size_t retlen; + int ret = -EIO; + + if (c->mtd->point) { + unsigned long *wordebuf; + + ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size, + &retlen, &ebuf, NULL); + if (ret) { + D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); + goto do_flash_read; + } + if (retlen < c->sector_size) { + /* Don't muck about if it won't let us point to the whole erase sector */ + D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen)); + c->mtd->unpoint(c->mtd, jeb->offset, retlen); + goto do_flash_read; + } + wordebuf = ebuf-sizeof(*wordebuf); + retlen /= sizeof(*wordebuf); + do { + if (*++wordebuf != ~0) + break; + } while(--retlen); + c->mtd->unpoint(c->mtd, jeb->offset, c->sector_size); + if (retlen) { + printk(KERN_WARNING "Newly-erased block contained word 
0x%lx at offset 0x%08tx\n", + *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf)); + return -EIO; + } + return 0; + } + do_flash_read: + ebuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!ebuf) { + printk(KERN_WARNING "Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n", jeb->offset); + return -EAGAIN; + } + + D1(printk(KERN_DEBUG "Verifying erase at 0x%08x\n", jeb->offset)); + + for (ofs = jeb->offset; ofs < jeb->offset + c->sector_size; ) { + uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs); + int i; + + *bad_offset = ofs; + + ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf); + if (ret) { + printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); + ret = -EIO; + goto fail; + } + if (retlen != readlen) { + printk(KERN_WARNING "Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", ofs, readlen, retlen); + ret = -EIO; + goto fail; + } + for (i=0; i<readlen; i += sizeof(unsigned long)) { + /* It's OK. We know it's properly aligned */ + unsigned long *datum = ebuf + i; + if (*datum + 1) { + *bad_offset += i; + printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", + *datum, *bad_offset); + ret = -EIO; + goto fail; + } + } + ofs += readlen; + cond_resched(); + } + ret = 0; +fail: + kfree(ebuf); + return ret; +} + +static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + size_t retlen; + int ret; + uint32_t uninitialized_var(bad_offset); + + switch (jffs2_block_check_erase(c, jeb, &bad_offset)) { + case -EAGAIN: goto refile; + case -EIO: goto filebad; + } + + /* Write the erase complete marker */ + D1(printk(KERN_DEBUG "Writing erased marker to block at 0x%08x\n", jeb->offset)); + bad_offset = jeb->offset; + + /* Cleanmarker in oob area or no cleanmarker at all ? */ + if (jffs2_cleanmarker_oob(c) || c->cleanmarker_size == 0) { + + if (jffs2_cleanmarker_oob(c)) { + if (jffs2_write_nand_cleanmarker(c, jeb)) + goto filebad; + } + } else { + + struct kvec vecs[1]; + struct jffs2_unknown_node marker = { + .magic = cpu_to_je16(JFFS2_MAGIC_BITMASK), + .nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), + .totlen = cpu_to_je32(c->cleanmarker_size) + }; + + jffs2_prealloc_raw_node_refs(c, jeb, 1); + + marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4)); + + vecs[0].iov_base = (unsigned char *) &marker; + vecs[0].iov_len = sizeof(marker); + ret = jffs2_flash_direct_writev(c, vecs, 1, jeb->offset, &retlen); + + if (ret || retlen != sizeof(marker)) { + if (ret) + printk(KERN_WARNING "Write clean marker to block at 0x%08x failed: %d\n", + jeb->offset, ret); + else + printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n", + jeb->offset, sizeof(marker), retlen); + + goto filebad; + } + } + /* Everything else got zeroed before the erase */ + jeb->free_size = c->sector_size; + + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + + c->erasing_size -= c->sector_size; + c->free_size += c->sector_size; + + /* Account for cleanmarker now, if it's in-band */ + if (c->cleanmarker_size && !jffs2_cleanmarker_oob(c)) + jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, c->cleanmarker_size, NULL); + + list_move_tail(&jeb->list, &c->free_list); + c->nr_erasing_blocks--; + c->nr_free_blocks++; + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + wake_up(&c->erase_wait); + return; + +filebad: + jffs2_erase_failed(c, jeb, bad_offset); + return; + +refile: + /* Stick it back on the list from whence it came and come back later */ + mutex_lock(&c->erase_free_sem); + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + list_move(&jeb->list, &c->erase_complete_list); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->erase_free_sem); + return; +} diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c new file mode 100644 index 00000000..1c0a08d7 --- /dev/null +++ b/fs/jffs2/file.c @@ -0,0 +1,321 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc.
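jffs2_block_check_erase() above decides whether an erase actually took by reading the block back and checking that every word is all-ones; the "*datum + 1" test is just a compact way of asking whether a word equals ~0. Below is a userspace model of that scan over an in-memory buffer; the buffer, sizes, and the find_unerased() helper are invented for illustration, and there is no MTD involved.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

/* Return the byte offset of the first not-fully-erased word, or -1 if the
 * whole buffer reads back as 0xFF -- the same check applied above to each
 * freshly erased block. Assumes buf is word-aligned, as the kernel code does. */
static long find_unerased(const unsigned char *buf, size_t len)
{
	const unsigned long *datum = (const unsigned long *)buf;
	size_t nwords = len / sizeof(*datum);

	for (size_t i = 0; i < nwords; i++)
		if (datum[i] + 1)	/* word != ~0UL, i.e. not all 0xFF */
			return (long)(i * sizeof(*datum));
	return -1;
}

int main(void)
{
	unsigned char block[4096];

	memset(block, 0xff, sizeof(block));
	printf("clean block: first bad offset %ld\n", find_unerased(block, sizeof(block)));

	block[100] = 0x42;	/* simulate a word that failed to erase */
	printf("dirty block: first bad offset %ld\n", find_unerased(block, sizeof(block)));
	return 0;
}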
+ * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pg, void *fsdata); +static int jffs2_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +static int jffs2_readpage (struct file *filp, struct page *pg); + +int jffs2_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + + /* Trigger GC to flush any pending writes for this inode */ + jffs2_flush_wbuf_gc(c, inode->i_ino); + + return 0; +} + +const struct file_operations jffs2_file_operations = +{ + .llseek = generic_file_llseek, + .open = generic_file_open, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .unlocked_ioctl=jffs2_ioctl, + .mmap = generic_file_readonly_mmap, + .fsync = jffs2_fsync, + .splice_read = generic_file_splice_read, +}; + +/* jffs2_file_inode_operations */ + +const struct inode_operations jffs2_file_inode_operations = +{ + .check_acl = jffs2_check_acl, + .setattr = jffs2_setattr, + .setxattr = jffs2_setxattr, + .getxattr = jffs2_getxattr, + .listxattr = jffs2_listxattr, + .removexattr = jffs2_removexattr +}; + +const struct address_space_operations jffs2_file_address_operations = +{ + .readpage = jffs2_readpage, + .write_begin = jffs2_write_begin, + .write_end = jffs2_write_end, +}; + +static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + unsigned char *pg_buf; + int ret; + + D2(printk(KERN_DEBUG "jffs2_do_readpage_nolock(): ino #%lu, page at offset 0x%lx\n", inode->i_ino, pg->index << PAGE_CACHE_SHIFT)); + + BUG_ON(!PageLocked(pg)); + + pg_buf = kmap(pg); + /* FIXME: Can kmap fail? 
*/ + + ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE); + + if (ret) { + ClearPageUptodate(pg); + SetPageError(pg); + } else { + SetPageUptodate(pg); + ClearPageError(pg); + } + + flush_dcache_page(pg); + kunmap(pg); + + D2(printk(KERN_DEBUG "readpage finished\n")); + return ret; +} + +int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) +{ + int ret = jffs2_do_readpage_nolock(inode, pg); + unlock_page(pg); + return ret; +} + + +static int jffs2_readpage (struct file *filp, struct page *pg) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host); + int ret; + + mutex_lock(&f->sem); + ret = jffs2_do_readpage_unlock(pg->mapping->host, pg); + mutex_unlock(&f->sem); + return ret; +} + +static int jffs2_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *pg; + struct inode *inode = mapping->host; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + uint32_t pageofs = index << PAGE_CACHE_SHIFT; + int ret = 0; + + pg = grab_cache_page_write_begin(mapping, index, flags); + if (!pg) + return -ENOMEM; + *pagep = pg; + + D1(printk(KERN_DEBUG "jffs2_write_begin()\n")); + + if (pageofs > inode->i_size) { + /* Make new hole frag from old EOF to new page */ + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode ri; + struct jffs2_full_dnode *fn; + uint32_t alloc_len; + + D1(printk(KERN_DEBUG "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", + (unsigned int)inode->i_size, pageofs)); + + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) + goto out_page; + + mutex_lock(&f->sem); + memset(&ri, 0, sizeof(ri)); + + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri)); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.mode = cpu_to_jemode(inode->i_mode); + ri.uid = cpu_to_je16(inode->i_uid); + ri.gid = cpu_to_je16(inode->i_gid); + ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs)); + ri.atime = ri.ctime = ri.mtime = cpu_to_je32(get_seconds()); + ri.offset = cpu_to_je32(inode->i_size); + ri.dsize = cpu_to_je32(pageofs - inode->i_size); + ri.csize = cpu_to_je32(0); + ri.compr = JFFS2_COMPR_ZERO; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + ri.data_crc = cpu_to_je32(0); + + fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_NORMAL); + + if (IS_ERR(fn)) { + ret = PTR_ERR(fn); + jffs2_complete_reservation(c); + mutex_unlock(&f->sem); + goto out_page; + } + ret = jffs2_add_full_dnode_to_inode(c, f, fn); + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + if (ret) { + D1(printk(KERN_DEBUG "Eep. add_full_dnode_to_inode() failed in write_begin, returned %d\n", ret)); + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + jffs2_complete_reservation(c); + mutex_unlock(&f->sem); + goto out_page; + } + jffs2_complete_reservation(c); + inode->i_size = pageofs; + mutex_unlock(&f->sem); + } + + /* + * Read in the page if it wasn't already present. Cannot optimize away + * the whole page write case until jffs2_write_end can handle the + * case of a short-copy. 
+ */ + if (!PageUptodate(pg)) { + mutex_lock(&f->sem); + ret = jffs2_do_readpage_nolock(inode, pg); + mutex_unlock(&f->sem); + if (ret) + goto out_page; + } + D1(printk(KERN_DEBUG "end write_begin(). pg->flags %lx\n", pg->flags)); + return ret; + +out_page: + unlock_page(pg); + page_cache_release(pg); + return ret; +} + +static int jffs2_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pg, void *fsdata) +{ + /* Actually commit the write from the page cache page we're looking at. + * For now, we write the full page out each time. It sucks, but it's simple + */ + struct inode *inode = mapping->host; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode *ri; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + unsigned aligned_start = start & ~3; + int ret = 0; + uint32_t writtenlen = 0; + + D1(printk(KERN_DEBUG "jffs2_write_end(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n", + inode->i_ino, pg->index << PAGE_CACHE_SHIFT, start, end, pg->flags)); + + /* We need to avoid deadlock with page_cache_read() in + jffs2_garbage_collect_pass(). So the page must be + up to date to prevent page_cache_read() from trying + to re-lock it. */ + BUG_ON(!PageUptodate(pg)); + + if (end == PAGE_CACHE_SIZE) { + /* When writing out the end of a page, write out the + _whole_ page. This helps to reduce the number of + nodes in files which have many short writes, like + syslog files. */ + aligned_start = 0; + } + + ri = jffs2_alloc_raw_inode(); + + if (!ri) { + D1(printk(KERN_DEBUG "jffs2_write_end(): Allocation of raw inode failed\n")); + unlock_page(pg); + page_cache_release(pg); + return -ENOMEM; + } + + /* Set the fields that the generic jffs2_write_inode_range() code can't find */ + ri->ino = cpu_to_je32(inode->i_ino); + ri->mode = cpu_to_jemode(inode->i_mode); + ri->uid = cpu_to_je16(inode->i_uid); + ri->gid = cpu_to_je16(inode->i_gid); + ri->isize = cpu_to_je32((uint32_t)inode->i_size); + ri->atime = ri->ctime = ri->mtime = cpu_to_je32(get_seconds()); + + /* In 2.4, it was already kmapped by generic_file_write(). Doesn't + hurt to do it again. The alternative is ifdefs, which are ugly. */ + kmap(pg); + + ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start, + (pg->index << PAGE_CACHE_SHIFT) + aligned_start, + end - aligned_start, &writtenlen); + + kunmap(pg); + + if (ret) { + /* There was an error writing. */ + SetPageError(pg); + } + + /* Adjust writtenlen for the padding we did, so we don't confuse our caller */ + writtenlen -= min(writtenlen, (start - aligned_start)); + + if (writtenlen) { + if (inode->i_size < pos + writtenlen) { + inode->i_size = pos + writtenlen; + inode->i_blocks = (inode->i_size + 511) >> 9; + + inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime)); + } + } + + jffs2_free_raw_inode(ri); + + if (start+writtenlen < end) { + /* generic_file_write has written more to the page cache than we've + actually written to the medium. Mark the page !Uptodate so that + it gets reread */ + D1(printk(KERN_DEBUG "jffs2_write_end(): Not all bytes written. Marking page !uptodate\n")); + SetPageError(pg); + ClearPageUptodate(pg); + } + + D1(printk(KERN_DEBUG "jffs2_write_end() returning %d\n", + writtenlen > 0 ? writtenlen : ret)); + unlock_page(pg); + page_cache_release(pg); + return writtenlen > 0 ? 
writtenlen : ret; +} diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c new file mode 100644 index 00000000..fed08c1d --- /dev/null +++ b/fs/jffs2/fs.c @@ -0,0 +1,748 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_flash_setup(struct jffs2_sb_info *c); + +int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) +{ + struct jffs2_full_dnode *old_metadata, *new_metadata; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode *ri; + union jffs2_device_node dev; + unsigned char *mdata = NULL; + int mdatalen = 0; + unsigned int ivalid; + uint32_t alloclen; + int ret; + int alloc_type = ALLOC_NORMAL; + + D1(printk(KERN_DEBUG "jffs2_setattr(): ino #%lu\n", inode->i_ino)); + + /* Special cases - we don't want more than one data node + for these types on the medium at any time. So setattr + must read the original data associated with the node + (i.e. the device numbers or the target name) and write + it out again with the appropriate data attached */ + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { + /* For these, we don't actually need to read the old node */ + mdatalen = jffs2_encode_dev(&dev, inode->i_rdev); + mdata = (char *)&dev; + D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen)); + } else if (S_ISLNK(inode->i_mode)) { + mutex_lock(&f->sem); + mdatalen = f->metadata->size; + mdata = kmalloc(f->metadata->size, GFP_USER); + if (!mdata) { + mutex_unlock(&f->sem); + return -ENOMEM; + } + ret = jffs2_read_dnode(c, f, f->metadata, mdata, 0, mdatalen); + if (ret) { + mutex_unlock(&f->sem); + kfree(mdata); + return ret; + } + mutex_unlock(&f->sem); + D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of symlink target\n", mdatalen)); + } + + ri = jffs2_alloc_raw_inode(); + if (!ri) { + if (S_ISLNK(inode->i_mode)) + kfree(mdata); + return -ENOMEM; + } + + ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + jffs2_free_raw_inode(ri); + if (S_ISLNK(inode->i_mode & S_IFMT)) + kfree(mdata); + return ret; + } + mutex_lock(&f->sem); + ivalid = iattr->ia_valid; + + ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri->totlen = cpu_to_je32(sizeof(*ri) + mdatalen); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->ino = cpu_to_je32(inode->i_ino); + ri->version = cpu_to_je32(++f->highest_version); + + ri->uid = cpu_to_je16((ivalid & ATTR_UID)?iattr->ia_uid:inode->i_uid); + ri->gid = cpu_to_je16((ivalid & ATTR_GID)?iattr->ia_gid:inode->i_gid); + + if (ivalid & ATTR_MODE) + ri->mode = cpu_to_jemode(iattr->ia_mode); + else + ri->mode = cpu_to_jemode(inode->i_mode); + + + ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size); + ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime)); + ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime)); + ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode->i_ctime)); + + ri->offset = cpu_to_je32(0); + ri->csize = ri->dsize = 
cpu_to_je32(mdatalen); + ri->compr = JFFS2_COMPR_NONE; + if (ivalid & ATTR_SIZE && inode->i_size < iattr->ia_size) { + /* It's an extension. Make it a hole node */ + ri->compr = JFFS2_COMPR_ZERO; + ri->dsize = cpu_to_je32(iattr->ia_size - inode->i_size); + ri->offset = cpu_to_je32(inode->i_size); + } else if (ivalid & ATTR_SIZE && !iattr->ia_size) { + /* For truncate-to-zero, treat it as deletion because + it'll always be obsoleting all previous nodes */ + alloc_type = ALLOC_DELETION; + } + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + if (mdatalen) + ri->data_crc = cpu_to_je32(crc32(0, mdata, mdatalen)); + else + ri->data_crc = cpu_to_je32(0); + + new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, alloc_type); + if (S_ISLNK(inode->i_mode)) + kfree(mdata); + + if (IS_ERR(new_metadata)) { + jffs2_complete_reservation(c); + jffs2_free_raw_inode(ri); + mutex_unlock(&f->sem); + return PTR_ERR(new_metadata); + } + /* It worked. Update the inode */ + inode->i_atime = ITIME(je32_to_cpu(ri->atime)); + inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); + inode->i_mode = jemode_to_cpu(ri->mode); + inode->i_uid = je16_to_cpu(ri->uid); + inode->i_gid = je16_to_cpu(ri->gid); + + + old_metadata = f->metadata; + + if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) + jffs2_truncate_fragtree (c, &f->fragtree, iattr->ia_size); + + if (ivalid & ATTR_SIZE && inode->i_size < iattr->ia_size) { + jffs2_add_full_dnode_to_inode(c, f, new_metadata); + inode->i_size = iattr->ia_size; + inode->i_blocks = (inode->i_size + 511) >> 9; + f->metadata = NULL; + } else { + f->metadata = new_metadata; + } + if (old_metadata) { + jffs2_mark_node_obsolete(c, old_metadata->raw); + jffs2_free_full_dnode(old_metadata); + } + jffs2_free_raw_inode(ri); + + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + /* We have to do the truncate_setsize() without f->sem held, since + some pages may be locked and waiting for it in readpage(). + We are protected from a simultaneous write() extending i_size + back past iattr->ia_size, because do_truncate() holds the + generic inode semaphore. */ + if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { + truncate_setsize(inode, iattr->ia_size); + inode->i_blocks = (inode->i_size + 511) >> 9; + } + + return 0; +} + +int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) +{ + int rc; + + rc = inode_change_ok(dentry->d_inode, iattr); + if (rc) + return rc; + + rc = jffs2_do_setattr(dentry->d_inode, iattr); + if (!rc && (iattr->ia_valid & ATTR_MODE)) + rc = jffs2_acl_chmod(dentry->d_inode); + + return rc; +} + +int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb); + unsigned long avail; + + buf->f_type = JFFS2_SUPER_MAGIC; + buf->f_bsize = 1 << PAGE_SHIFT; + buf->f_blocks = c->flash_size >> PAGE_SHIFT; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_namelen = JFFS2_MAX_NAME_LEN; + buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC; + buf->f_fsid.val[1] = c->mtd->index; + + spin_lock(&c->erase_completion_lock); + avail = c->dirty_size + c->free_size; + if (avail > c->sector_size * c->resv_blocks_write) + avail -= c->sector_size * c->resv_blocks_write; + else + avail = 0; + spin_unlock(&c->erase_completion_lock); + + buf->f_bavail = buf->f_bfree = avail >> PAGE_SHIFT; + + return 0; +} + + +void jffs2_evict_inode (struct inode *inode) +{ + /* We can forget about this inode for now - drop all + * the nodelists associated with it, etc. 
+ */ + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + + D1(printk(KERN_DEBUG "jffs2_evict_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode)); + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + jffs2_do_clear_inode(c, f); +} + +struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) +{ + struct jffs2_inode_info *f; + struct jffs2_sb_info *c; + struct jffs2_raw_inode latest_node; + union jffs2_device_node jdev; + struct inode *inode; + dev_t rdev = 0; + int ret; + + D1(printk(KERN_DEBUG "jffs2_iget(): ino == %lu\n", ino)); + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + f = JFFS2_INODE_INFO(inode); + c = JFFS2_SB_INFO(inode->i_sb); + + jffs2_init_inode_info(f); + mutex_lock(&f->sem); + + ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node); + + if (ret) { + mutex_unlock(&f->sem); + iget_failed(inode); + return ERR_PTR(ret); + } + inode->i_mode = jemode_to_cpu(latest_node.mode); + inode->i_uid = je16_to_cpu(latest_node.uid); + inode->i_gid = je16_to_cpu(latest_node.gid); + inode->i_size = je32_to_cpu(latest_node.isize); + inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); + inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); + inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); + + inode->i_nlink = f->inocache->pino_nlink; + + inode->i_blocks = (inode->i_size + 511) >> 9; + + switch (inode->i_mode & S_IFMT) { + + case S_IFLNK: + inode->i_op = &jffs2_symlink_inode_operations; + break; + + case S_IFDIR: + { + struct jffs2_full_dirent *fd; + inode->i_nlink = 2; /* parent and '.' */ + + for (fd=f->dents; fd; fd = fd->next) { + if (fd->type == DT_DIR && fd->ino) + inc_nlink(inode); + } + /* Root dir gets i_nlink 3 for some reason */ + if (inode->i_ino == 1) + inc_nlink(inode); + + inode->i_op = &jffs2_dir_inode_operations; + inode->i_fop = &jffs2_dir_operations; + break; + } + case S_IFREG: + inode->i_op = &jffs2_file_inode_operations; + inode->i_fop = &jffs2_file_operations; + inode->i_mapping->a_ops = &jffs2_file_address_operations; + inode->i_mapping->nrpages = 0; + break; + + case S_IFBLK: + case S_IFCHR: + /* Read the device numbers from the media */ + if (f->metadata->size != sizeof(jdev.old_id) && + f->metadata->size != sizeof(jdev.new_id)) { + printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); + goto error_io; + } + D1(printk(KERN_DEBUG "Reading device numbers from flash\n")); + ret = jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size); + if (ret < 0) { + /* Eep */ + printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); + goto error; + } + if (f->metadata->size == sizeof(jdev.old_id)) + rdev = old_decode_dev(je16_to_cpu(jdev.old_id)); + else + rdev = new_decode_dev(je32_to_cpu(jdev.new_id)); + + case S_IFSOCK: + case S_IFIFO: + inode->i_op = &jffs2_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + + default: + printk(KERN_WARNING "jffs2_read_inode(): Bogus imode %o for ino %lu\n", inode->i_mode, (unsigned long)inode->i_ino); + } + + mutex_unlock(&f->sem); + + D1(printk(KERN_DEBUG "jffs2_read_inode() returning\n")); + unlock_new_inode(inode); + return inode; + +error_io: + ret = -EIO; +error: + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + iget_failed(inode); + return ERR_PTR(ret); +} + +void jffs2_dirty_inode(struct inode *inode, int flags) +{ + struct iattr 
iattr; + + if (!(inode->i_state & I_DIRTY_DATASYNC)) { + D2(printk(KERN_DEBUG "jffs2_dirty_inode() not calling setattr() for ino #%lu\n", inode->i_ino)); + return; + } + + D1(printk(KERN_DEBUG "jffs2_dirty_inode() calling setattr() for ino #%lu\n", inode->i_ino)); + + iattr.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_MTIME|ATTR_CTIME; + iattr.ia_mode = inode->i_mode; + iattr.ia_uid = inode->i_uid; + iattr.ia_gid = inode->i_gid; + iattr.ia_atime = inode->i_atime; + iattr.ia_mtime = inode->i_mtime; + iattr.ia_ctime = inode->i_ctime; + + jffs2_do_setattr(inode, &iattr); +} + +int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + if (c->flags & JFFS2_SB_FLAG_RO && !(sb->s_flags & MS_RDONLY)) + return -EROFS; + + /* We stop if it was running, then restart if it needs to. + This also catches the case where it was stopped and this + is just a remount to restart it. + Flush the writebuffer, if necessary, else we lose it */ + if (!(sb->s_flags & MS_RDONLY)) { + jffs2_stop_garbage_collect_thread(c); + mutex_lock(&c->alloc_sem); + jffs2_flush_wbuf_pad(c); + mutex_unlock(&c->alloc_sem); + } + + if (!(*flags & MS_RDONLY)) + jffs2_start_garbage_collect_thread(c); + + *flags |= MS_NOATIME; + return 0; +} + +/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, + fill in the raw_inode while you're at it. */ +struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri) +{ + struct inode *inode; + struct super_block *sb = dir_i->i_sb; + struct jffs2_sb_info *c; + struct jffs2_inode_info *f; + int ret; + + D1(printk(KERN_DEBUG "jffs2_new_inode(): dir_i %ld, mode 0x%x\n", dir_i->i_ino, mode)); + + c = JFFS2_SB_INFO(sb); + + inode = new_inode(sb); + + if (!inode) + return ERR_PTR(-ENOMEM); + + f = JFFS2_INODE_INFO(inode); + jffs2_init_inode_info(f); + mutex_lock(&f->sem); + + memset(ri, 0, sizeof(*ri)); + /* Set OS-specific defaults for new inodes */ + ri->uid = cpu_to_je16(current_fsuid()); + + if (dir_i->i_mode & S_ISGID) { + ri->gid = cpu_to_je16(dir_i->i_gid); + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else { + ri->gid = cpu_to_je16(current_fsgid()); + } + + /* POSIX ACLs have to be processed now, at least partly. + The umask is only applied if there's no default ACL */ + ret = jffs2_init_acl_pre(dir_i, inode, &mode); + if (ret) { + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); + } + ret = jffs2_do_new_inode (c, f, mode, ri); + if (ret) { + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); + } + inode->i_nlink = 1; + inode->i_ino = je32_to_cpu(ri->ino); + inode->i_mode = jemode_to_cpu(ri->mode); + inode->i_gid = je16_to_cpu(ri->gid); + inode->i_uid = je16_to_cpu(ri->uid); + inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); + + inode->i_blocks = 0; + inode->i_size = 0; + + if (insert_inode_locked(inode) < 0) { + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } + + return inode; +} + +static int calculate_inocache_hashsize(uint32_t flash_size) +{ + /* + * Pick an inocache hash size based on the size of the medium. + * Count how many megabytes we're dealing with, apply a hashsize twice + * that size, but rounding down to the usual big powers of 2. And keep + * to sensible bounds.
+ */ + + int size_mb = flash_size / 1024 / 1024; + int hashsize = (size_mb * 2) & ~0x3f; + + if (hashsize < INOCACHE_HASHSIZE_MIN) + return INOCACHE_HASHSIZE_MIN; + if (hashsize > INOCACHE_HASHSIZE_MAX) + return INOCACHE_HASHSIZE_MAX; + + return hashsize; +} + +int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jffs2_sb_info *c; + struct inode *root_i; + int ret; + size_t blocks; + + c = JFFS2_SB_INFO(sb); + +#ifndef CONFIG_JFFS2_FS_WRITEBUFFER + if (c->mtd->type == MTD_NANDFLASH) { + printk(KERN_ERR "jffs2: Cannot operate on NAND flash unless jffs2 NAND support is compiled in.\n"); + return -EINVAL; + } + if (c->mtd->type == MTD_DATAFLASH) { + printk(KERN_ERR "jffs2: Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in.\n"); + return -EINVAL; + } +#endif + + c->flash_size = c->mtd->size; + c->sector_size = c->mtd->erasesize; + blocks = c->flash_size / c->sector_size; + + /* + * Size alignment check + */ + if ((c->sector_size * blocks) != c->flash_size) { + c->flash_size = c->sector_size * blocks; + printk(KERN_INFO "jffs2: Flash size not aligned to erasesize, reducing to %dKiB\n", + c->flash_size / 1024); + } + + if (c->flash_size < 5*c->sector_size) { + printk(KERN_ERR "jffs2: Too few erase blocks (%d)\n", c->flash_size / c->sector_size); + return -EINVAL; + } + + c->cleanmarker_size = sizeof(struct jffs2_unknown_node); + + /* NAND (or other bizarre) flash... do setup accordingly */ + ret = jffs2_flash_setup(c); + if (ret) + return ret; + + c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size); + c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); + if (!c->inocache_list) { + ret = -ENOMEM; + goto out_wbuf; + } + + jffs2_init_xattr_subsystem(c); + + if ((ret = jffs2_do_mount_fs(c))) + goto out_inohash; + + D1(printk(KERN_DEBUG "jffs2_do_fill_super(): Getting root inode\n")); + root_i = jffs2_iget(sb, 1); + if (IS_ERR(root_i)) { + D1(printk(KERN_WARNING "get root inode failed\n")); + ret = PTR_ERR(root_i); + goto out_root; + } + + ret = -ENOMEM; + + D1(printk(KERN_DEBUG "jffs2_do_fill_super(): d_alloc_root()\n")); + sb->s_root = d_alloc_root(root_i); + if (!sb->s_root) + goto out_root_i; + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = JFFS2_SUPER_MAGIC; + if (!(sb->s_flags & MS_RDONLY)) + jffs2_start_garbage_collect_thread(c); + return 0; + + out_root_i: + iput(root_i); +out_root: + jffs2_free_ino_caches(c); + jffs2_free_raw_node_refs(c); + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else + kfree(c->blocks); + out_inohash: + jffs2_clear_xattr_subsystem(c); + kfree(c->inocache_list); + out_wbuf: + jffs2_flash_cleanup(c); + + return ret; +} + +void jffs2_gc_release_inode(struct jffs2_sb_info *c, + struct jffs2_inode_info *f) +{ + iput(OFNI_EDONI_2SFFJ(f)); +} + +struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, + int inum, int unlinked) +{ + struct inode *inode; + struct jffs2_inode_cache *ic; + + if (unlinked) { + /* The inode has zero nlink but its nodes weren't yet marked + obsolete. This has to be because we're still waiting for + the final (close() and) iput() to happen. + + There's a possibility that the final iput() could have + happened while we were contemplating. In order to ensure + that we don't cause a new read_inode() (which would fail) + for the inode in question, we use ilookup() in this case + instead of iget(). 
+ + The nlink can't _become_ zero at this point because we're + holding the alloc_sem, and jffs2_do_unlink() would also + need that while decrementing nlink on any inode. + */ + inode = ilookup(OFNI_BS_2SFFJ(c), inum); + if (!inode) { + D1(printk(KERN_DEBUG "ilookup() failed for ino #%u; inode is probably deleted.\n", + inum)); + + spin_lock(&c->inocache_lock); + ic = jffs2_get_ino_cache(c, inum); + if (!ic) { + D1(printk(KERN_DEBUG "Inode cache for ino #%u is gone.\n", inum)); + spin_unlock(&c->inocache_lock); + return NULL; + } + if (ic->state != INO_STATE_CHECKEDABSENT) { + /* Wait for progress. Don't just loop */ + D1(printk(KERN_DEBUG "Waiting for ino #%u in state %d\n", + ic->ino, ic->state)); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + } else { + spin_unlock(&c->inocache_lock); + } + + return NULL; + } + } else { + /* Inode has links to it still; they're not going away because + jffs2_do_unlink() would need the alloc_sem and we have it. + Just iget() it, and if read_inode() is necessary that's OK. + */ + inode = jffs2_iget(OFNI_BS_2SFFJ(c), inum); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + if (is_bad_inode(inode)) { + printk(KERN_NOTICE "Eep. read_inode() failed for ino #%u. unlinked %d\n", + inum, unlinked); + /* NB. This will happen again. We need to do something appropriate here. */ + iput(inode); + return ERR_PTR(-EIO); + } + + return JFFS2_INODE_INFO(inode); +} + +unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + unsigned long offset, + unsigned long *priv) +{ + struct inode *inode = OFNI_EDONI_2SFFJ(f); + struct page *pg; + + pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, + (void *)jffs2_do_readpage_unlock, inode); + if (IS_ERR(pg)) + return (void *)pg; + + *priv = (unsigned long)pg; + return kmap(pg); +} + +void jffs2_gc_release_page(struct jffs2_sb_info *c, + unsigned char *ptr, + unsigned long *priv) +{ + struct page *pg = (void *)*priv; + + kunmap(pg); + page_cache_release(pg); +} + +static int jffs2_flash_setup(struct jffs2_sb_info *c) { + int ret = 0; + + if (c->mtd->type == MTD_NANDFLASH) { + if (!(c->mtd->flags & MTD_OOB_WRITEABLE)) + printk(KERN_INFO "JFFS2 doesn't use OOB.\n"); + /* NAND flash... do setup accordingly */ + ret = jffs2_nand_flash_setup(c); + if (ret) + return ret; + } + + /* and Dataflash */ + if (jffs2_dataflash(c)) { + ret = jffs2_dataflash_setup(c); + if (ret) + return ret; + } + + /* and Intel "Sibley" flash */ + if (jffs2_nor_wbuf_flash(c)) { + ret = jffs2_nor_wbuf_flash_setup(c); + if (ret) + return ret; + } + + /* and an UBI volume */ + if (jffs2_ubivol(c)) { + ret = jffs2_ubivol_setup(c); + if (ret) + return ret; + } + + return ret; +} + +void jffs2_flash_cleanup(struct jffs2_sb_info *c) { + + if (c->mtd->type == MTD_NANDFLASH) { + jffs2_nand_flash_cleanup(c); + } + + /* and DataFlash */ + if (jffs2_dataflash(c)) { + jffs2_dataflash_cleanup(c); + } + + /* and Intel "Sibley" flash */ + if (jffs2_nor_wbuf_flash(c)) { + jffs2_nor_wbuf_flash_cleanup(c); + } + + /* and an UBI volume */ + if (jffs2_ubivol(c)) { + jffs2_ubivol_cleanup(c); + } +} diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c new file mode 100644 index 00000000..4bbd5211 --- /dev/null +++ b/fs/jffs2/gc.c @@ -0,0 +1,1326 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + +static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic, + struct jffs2_raw_node_ref *raw); +static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fd); +static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd); +static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd); +static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end); +static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end); +static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *raw, struct jffs2_inode_info *f); + +/* Called with erase_completion_lock held */ +static struct jffs2_eraseblock *jffs2_find_gc_block(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *ret; + struct list_head *nextlist = NULL; + int n = jiffies % 128; + + /* Pick an eraseblock to garbage collect next. This is where we'll + put the clever wear-levelling algorithms. Eventually. */ + /* We possibly want to favour the dirtier blocks more when the + number of free blocks is low. */ +again: + if (!list_empty(&c->bad_used_list) && c->nr_free_blocks > c->resv_blocks_gcbad) { + D1(printk(KERN_DEBUG "Picking block from bad_used_list to GC next\n")); + nextlist = &c->bad_used_list; + } else if (n < 50 && !list_empty(&c->erasable_list)) { + /* Note that most of them will have gone directly to be erased. + So don't favour the erasable_list _too_ much. 
*/ + D1(printk(KERN_DEBUG "Picking block from erasable_list to GC next\n")); + nextlist = &c->erasable_list; + } else if (n < 110 && !list_empty(&c->very_dirty_list)) { + /* Most of the time, pick one off the very_dirty list */ + D1(printk(KERN_DEBUG "Picking block from very_dirty_list to GC next\n")); + nextlist = &c->very_dirty_list; + } else if (n < 126 && !list_empty(&c->dirty_list)) { + D1(printk(KERN_DEBUG "Picking block from dirty_list to GC next\n")); + nextlist = &c->dirty_list; + } else if (!list_empty(&c->clean_list)) { + D1(printk(KERN_DEBUG "Picking block from clean_list to GC next\n")); + nextlist = &c->clean_list; + } else if (!list_empty(&c->dirty_list)) { + D1(printk(KERN_DEBUG "Picking block from dirty_list to GC next (clean_list was empty)\n")); + + nextlist = &c->dirty_list; + } else if (!list_empty(&c->very_dirty_list)) { + D1(printk(KERN_DEBUG "Picking block from very_dirty_list to GC next (clean_list and dirty_list were empty)\n")); + nextlist = &c->very_dirty_list; + } else if (!list_empty(&c->erasable_list)) { + D1(printk(KERN_DEBUG "Picking block from erasable_list to GC next (clean_list and {very_,}dirty_list were empty)\n")); + + nextlist = &c->erasable_list; + } else if (!list_empty(&c->erasable_pending_wbuf_list)) { + /* There are blocks waiting for the wbuf sync */ + D1(printk(KERN_DEBUG "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n")); + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + goto again; + } else { + /* Eep. All were empty */ + D1(printk(KERN_NOTICE "jffs2: No clean, dirty _or_ erasable blocks to GC from! Where are they all?\n")); + return NULL; + } + + ret = list_entry(nextlist->next, struct jffs2_eraseblock, list); + list_del(&ret->list); + c->gcblock = ret; + ret->gc_node = ret->first_node; + if (!ret->gc_node) { + printk(KERN_WARNING "Eep. ret->gc_node for block at 0x%08x is NULL\n", ret->offset); + BUG(); + } + + /* Have we accidentally picked a clean block with wasted space ? */ + if (ret->wasted_size) { + D1(printk(KERN_DEBUG "Converting wasted_size %08x to dirty_size\n", ret->wasted_size)); + ret->dirty_size += ret->wasted_size; + c->wasted_size -= ret->wasted_size; + c->dirty_size += ret->wasted_size; + ret->wasted_size = 0; + } + + return ret; +} + +/* jffs2_garbage_collect_pass + * Make a single attempt to progress GC. Move one node, and possibly + * start erasing one eraseblock. + */ +int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) +{ + struct jffs2_inode_info *f; + struct jffs2_inode_cache *ic; + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + uint32_t gcblock_dirty; + int ret = 0, inum, nlink; + int xattr = 0; + + if (mutex_lock_interruptible(&c->alloc_sem)) + return -EINTR; + + for (;;) { + spin_lock(&c->erase_completion_lock); + if (!c->unchecked_size) + break; + + /* We can't start doing GC yet. We haven't finished checking + the node CRCs etc. Do it now.
*/ + + /* checked_ino is protected by the alloc_sem */ + if (c->checked_ino > c->highest_ino && xattr) { + printk(KERN_CRIT "Checked all inodes but still 0x%x bytes of unchecked space?\n", + c->unchecked_size); + jffs2_dbg_dump_block_lists_nolock(c); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + } + + spin_unlock(&c->erase_completion_lock); + + if (!xattr) + xattr = jffs2_verify_xattr(c); + + spin_lock(&c->inocache_lock); + + ic = jffs2_get_ino_cache(c, c->checked_ino++); + + if (!ic) { + spin_unlock(&c->inocache_lock); + continue; + } + + if (!ic->pino_nlink) { + D1(printk(KERN_DEBUG "Skipping check of ino #%d with nlink/pino zero\n", + ic->ino)); + spin_unlock(&c->inocache_lock); + jffs2_xattr_delete_inode(c, ic); + continue; + } + switch(ic->state) { + case INO_STATE_CHECKEDABSENT: + case INO_STATE_PRESENT: + D1(printk(KERN_DEBUG "Skipping ino #%u already checked\n", ic->ino)); + spin_unlock(&c->inocache_lock); + continue; + + case INO_STATE_GC: + case INO_STATE_CHECKING: + printk(KERN_WARNING "Inode #%u is in state %d during CRC check phase!\n", ic->ino, ic->state); + spin_unlock(&c->inocache_lock); + BUG(); + + case INO_STATE_READING: + /* We need to wait for it to finish, lest we move on + and trigger the BUG() above while we haven't yet + finished checking all its nodes */ + D1(printk(KERN_DEBUG "Waiting for ino #%u to finish reading\n", ic->ino)); + /* We need to come back again for the _same_ inode. We've + made no progress in this case, but that should be OK */ + c->checked_ino--; + + mutex_unlock(&c->alloc_sem); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + return 0; + + default: + BUG(); + + case INO_STATE_UNCHECKED: + ; + } + ic->state = INO_STATE_CHECKING; + spin_unlock(&c->inocache_lock); + + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() triggering inode scan of ino#%u\n", ic->ino)); + + ret = jffs2_do_crccheck_inode(c, ic); + if (ret) + printk(KERN_WARNING "Returned error for crccheck of ino #%u. Expect badness...\n", ic->ino); + + jffs2_set_inocache_state(c, ic, INO_STATE_CHECKEDABSENT); + mutex_unlock(&c->alloc_sem); + return ret; + } + + /* If there are any blocks which need erasing, erase them now */ + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) { + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); + if (jffs2_erase_pending_blocks(c, 1)) + return 0; + + D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); + mutex_lock(&c->alloc_sem); + spin_lock(&c->erase_completion_lock); + } + + /* First, work out which block we're garbage-collecting */ + jeb = c->gcblock; + + if (!jeb) + jeb = jffs2_find_gc_block(c); + + if (!jeb) { + /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? 
*/ + if (c->nr_erasing_blocks) { + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -EAGAIN; + } + D1(printk(KERN_NOTICE "jffs2: Couldn't find erase block to garbage collect!\n")); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -EIO; + } + + D1(printk(KERN_DEBUG "GC from block %08x, used_size %08x, dirty_size %08x, free_size %08x\n", jeb->offset, jeb->used_size, jeb->dirty_size, jeb->free_size)); + D1(if (c->nextblock) + printk(KERN_DEBUG "Nextblock at %08x, used_size %08x, dirty_size %08x, wasted_size %08x, free_size %08x\n", c->nextblock->offset, c->nextblock->used_size, c->nextblock->dirty_size, c->nextblock->wasted_size, c->nextblock->free_size)); + + if (!jeb->used_size) { + mutex_unlock(&c->alloc_sem); + goto eraseit; + } + + raw = jeb->gc_node; + gcblock_dirty = jeb->dirty_size; + + while(ref_obsolete(raw)) { + D1(printk(KERN_DEBUG "Node at 0x%08x is obsolete... skipping\n", ref_offset(raw))); + raw = ref_next(raw); + if (unlikely(!raw)) { + printk(KERN_WARNING "eep. End of raw list while still supposedly nodes to GC\n"); + printk(KERN_WARNING "erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size); + jeb->gc_node = raw; + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + BUG(); + } + } + jeb->gc_node = raw; + + D1(printk(KERN_DEBUG "Going to garbage collect node at 0x%08x\n", ref_offset(raw))); + + if (!raw->next_in_ino) { + /* Inode-less node. Clean marker, snapshot or something like that */ + spin_unlock(&c->erase_completion_lock); + if (ref_flags(raw) == REF_PRISTINE) { + /* It's an unknown node with JFFS2_FEATURE_RWCOMPAT_COPY */ + jffs2_garbage_collect_pristine(c, NULL, raw); + } else { + /* Just mark it obsolete */ + jffs2_mark_node_obsolete(c, raw); + } + mutex_unlock(&c->alloc_sem); + goto eraseit_lock; + } + + ic = jffs2_raw_ref_to_ic(raw); + +#ifdef CONFIG_JFFS2_FS_XATTR + /* When 'ic' refers xattr_datum/xattr_ref, this node is GCed as xattr. + * We can decide whether this node is inode or xattr by ic->class. */ + if (ic->class == RAWNODE_CLASS_XATTR_DATUM + || ic->class == RAWNODE_CLASS_XATTR_REF) { + spin_unlock(&c->erase_completion_lock); + + if (ic->class == RAWNODE_CLASS_XATTR_DATUM) { + ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic, raw); + } else { + ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic, raw); + } + goto test_gcnode; + } +#endif + + /* We need to hold the inocache. Either the erase_completion_lock or + the inocache_lock are sufficient; we trade down since the inocache_lock + causes less contention. */ + spin_lock(&c->inocache_lock); + + spin_unlock(&c->erase_completion_lock); + + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass collecting from block @0x%08x. Node @0x%08x(%d), ino #%u\n", jeb->offset, ref_offset(raw), ref_flags(raw), ic->ino)); + + /* Three possibilities: + 1. Inode is already in-core. We must iget it and do proper + updating to its fragtree, etc. + 2. Inode is not in-core, node is REF_PRISTINE. We lock the + inocache to prevent a read_inode(), copy the node intact. + 3. Inode is not in-core, node is not pristine. We must iget() + and take the slow path. + */ + + switch(ic->state) { + case INO_STATE_CHECKEDABSENT: + /* It's been checked, but it's not currently in-core. 
+ We can just copy any pristine nodes, but have + to prevent anyone else from doing read_inode() while + we're at it, so we set the state accordingly */ + if (ref_flags(raw) == REF_PRISTINE) + ic->state = INO_STATE_GC; + else { + D1(printk(KERN_DEBUG "Ino #%u is absent but node not REF_PRISTINE. Reading.\n", + ic->ino)); + } + break; + + case INO_STATE_PRESENT: + /* It's in-core. GC must iget() it. */ + break; + + case INO_STATE_UNCHECKED: + case INO_STATE_CHECKING: + case INO_STATE_GC: + /* Should never happen. We should have finished checking + by the time we actually start doing any GC, and since + we're holding the alloc_sem, no other garbage collection + can happen. + */ + printk(KERN_CRIT "Inode #%u already in state %d in jffs2_garbage_collect_pass()!\n", + ic->ino, ic->state); + mutex_unlock(&c->alloc_sem); + spin_unlock(&c->inocache_lock); + BUG(); + + case INO_STATE_READING: + /* Someone's currently trying to read it. We must wait for + them to finish and then go through the full iget() route + to do the GC. However, sometimes read_inode() needs to get + the alloc_sem() (for marking nodes invalid) so we must + drop the alloc_sem before sleeping. */ + + mutex_unlock(&c->alloc_sem); + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() waiting for ino #%u in state %d\n", + ic->ino, ic->state)); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + /* And because we dropped the alloc_sem we must start again from the + beginning. Ponder chance of livelock here -- we're returning success + without actually making any progress. + + Q: What are the chances that the inode is back in INO_STATE_READING + again by the time we next enter this function? And that this happens + enough times to cause a real delay? + + A: Small enough that I don't care :) + */ + return 0; + } + + /* OK. Now if the inode is in state INO_STATE_GC, we are going to copy the + node intact, and we don't have to muck about with the fragtree etc. + because we know it's not in-core. If it _was_ in-core, we go through + all the iget() crap anyway */ + + if (ic->state == INO_STATE_GC) { + spin_unlock(&c->inocache_lock); + + ret = jffs2_garbage_collect_pristine(c, ic, raw); + + spin_lock(&c->inocache_lock); + ic->state = INO_STATE_CHECKEDABSENT; + wake_up(&c->inocache_wq); + + if (ret != -EBADFD) { + spin_unlock(&c->inocache_lock); + goto test_gcnode; + } + + /* Fall through if it wanted us to, with inocache_lock held */ + } + + /* Prevent the fairly unlikely race where the gcblock is + entirely obsoleted by the final close of a file which had + the only valid nodes in the block, followed by erasure, + followed by freeing of the ic because the erased block(s) + held _all_ the nodes of that inode.... never been seen but + it's vaguely possible. */ + + inum = ic->ino; + nlink = ic->pino_nlink; + spin_unlock(&c->inocache_lock); + + f = jffs2_gc_fetch_inode(c, inum, !nlink); + if (IS_ERR(f)) { + ret = PTR_ERR(f); + goto release_sem; + } + if (!f) { + ret = 0; + goto release_sem; + } + + ret = jffs2_garbage_collect_live(c, jeb, raw, f); + + jffs2_gc_release_inode(c, f); + + test_gcnode: + if (jeb->dirty_size == gcblock_dirty && !ref_obsolete(jeb->gc_node)) { + /* Eep. This really should never happen. 
GC is broken */ + printk(KERN_ERR "Error garbage collecting node at %08x!\n", ref_offset(jeb->gc_node)); + ret = -ENOSPC; + } + release_sem: + mutex_unlock(&c->alloc_sem); + + eraseit_lock: + /* If we've finished this block, start it erasing */ + spin_lock(&c->erase_completion_lock); + + eraseit: + if (c->gcblock && !c->gcblock->used_size) { + D1(printk(KERN_DEBUG "Block at 0x%08x completely obsoleted by GC. Moving to erase_pending_list\n", c->gcblock->offset)); + /* We're GC'ing an empty block? */ + list_add_tail(&c->gcblock->list, &c->erase_pending_list); + c->gcblock = NULL; + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } + spin_unlock(&c->erase_completion_lock); + + return ret; +} + +static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *raw, struct jffs2_inode_info *f) +{ + struct jffs2_node_frag *frag; + struct jffs2_full_dnode *fn = NULL; + struct jffs2_full_dirent *fd; + uint32_t start = 0, end = 0, nrfrags = 0; + int ret = 0; + + mutex_lock(&f->sem); + + /* Now we have the lock for this inode. Check that it's still the one at the head + of the list. */ + + spin_lock(&c->erase_completion_lock); + + if (c->gcblock != jeb) { + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "GC block is no longer gcblock. Restart\n")); + goto upnout; + } + if (ref_obsolete(raw)) { + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "node to be GC'd was obsoleted in the meantime.\n")); + /* They'll call again */ + goto upnout; + } + spin_unlock(&c->erase_completion_lock); + + /* OK. Looks safe. And nobody can get us now because we have the semaphore. Move the block */ + if (f->metadata && f->metadata->raw == raw) { + fn = f->metadata; + ret = jffs2_garbage_collect_metadata(c, jeb, f, fn); + goto upnout; + } + + /* FIXME. Read node and do lookup? */ + for (frag = frag_first(&f->fragtree); frag; frag = frag_next(frag)) { + if (frag->node && frag->node->raw == raw) { + fn = frag->node; + end = frag->ofs + frag->size; + if (!nrfrags++) + start = frag->ofs; + if (nrfrags == frag->node->frags) + break; /* We've found them all */ + } + } + if (fn) { + if (ref_flags(raw) == REF_PRISTINE) { + ret = jffs2_garbage_collect_pristine(c, f->inocache, raw); + if (!ret) { + /* Urgh. Return it sensibly. */ + frag->node->raw = f->inocache->nodes; + } + if (ret != -EBADFD) + goto upnout; + } + /* We found a datanode. Do the GC */ + if((start >> PAGE_CACHE_SHIFT) < ((end-1) >> PAGE_CACHE_SHIFT)) { + /* It crosses a page boundary. Therefore, it must be a hole. */ + ret = jffs2_garbage_collect_hole(c, jeb, f, fn, start, end); + } else { + /* It could still be a hole. But we GC the page this way anyway */ + ret = jffs2_garbage_collect_dnode(c, jeb, f, fn, start, end); + } + goto upnout; + } + + /* Wasn't a dnode. 
Try dirent */ + for (fd = f->dents; fd; fd=fd->next) { + if (fd->raw == raw) + break; + } + + if (fd && fd->ino) { + ret = jffs2_garbage_collect_dirent(c, jeb, f, fd); + } else if (fd) { + ret = jffs2_garbage_collect_deletion_dirent(c, jeb, f, fd); + } else { + printk(KERN_WARNING "Raw node at 0x%08x wasn't in node lists for ino #%u\n", + ref_offset(raw), f->inocache->ino); + if (ref_obsolete(raw)) { + printk(KERN_WARNING "But it's obsolete so we don't mind too much\n"); + } else { + jffs2_dbg_dump_node(c, ref_offset(raw)); + BUG(); + } + } + upnout: + mutex_unlock(&f->sem); + + return ret; +} + +static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, + struct jffs2_inode_cache *ic, + struct jffs2_raw_node_ref *raw) +{ + union jffs2_node_union *node; + size_t retlen; + int ret; + uint32_t phys_ofs, alloclen; + uint32_t crc, rawlen; + int retried = 0; + + D1(printk(KERN_DEBUG "Going to GC REF_PRISTINE node at 0x%08x\n", ref_offset(raw))); + + alloclen = rawlen = ref_totlen(c, c->gcblock, raw); + + /* Ask for a small amount of space (or the totlen if smaller) because we + don't want to force wastage of the end of a block if splitting would + work. */ + if (ic && alloclen > sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN) + alloclen = sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN; + + ret = jffs2_reserve_space_gc(c, alloclen, &alloclen, rawlen); + /* 'rawlen' is not the exact summary size; it is only an upper estimation */ + + if (ret) + return ret; + + if (alloclen < rawlen) { + /* Doesn't fit untouched. We'll go the old route and split it */ + return -EBADFD; + } + + node = kmalloc(rawlen, GFP_KERNEL); + if (!node) + return -ENOMEM; + + ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)node); + if (!ret && retlen != rawlen) + ret = -EIO; + if (ret) + goto out_node; + + crc = crc32(0, node, sizeof(struct jffs2_unknown_node)-4); + if (je32_to_cpu(node->u.hdr_crc) != crc) { + printk(KERN_WARNING "Header CRC failed on REF_PRISTINE node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->u.hdr_crc), crc); + goto bail; + } + + switch(je16_to_cpu(node->u.nodetype)) { + case JFFS2_NODETYPE_INODE: + crc = crc32(0, node, sizeof(node->i)-8); + if (je32_to_cpu(node->i.node_crc) != crc) { + printk(KERN_WARNING "Node CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->i.node_crc), crc); + goto bail; + } + + if (je32_to_cpu(node->i.dsize)) { + crc = crc32(0, node->i.data, je32_to_cpu(node->i.csize)); + if (je32_to_cpu(node->i.data_crc) != crc) { + printk(KERN_WARNING "Data CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->i.data_crc), crc); + goto bail; + } + } + break; + + case JFFS2_NODETYPE_DIRENT: + crc = crc32(0, node, sizeof(node->d)-8); + if (je32_to_cpu(node->d.node_crc) != crc) { + printk(KERN_WARNING "Node CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->d.node_crc), crc); + goto bail; + } + + if (strnlen(node->d.name, node->d.nsize) != node->d.nsize) { + printk(KERN_WARNING "Name in dirent node at 0x%08x contains zeroes\n", ref_offset(raw)); + goto bail; + } + + if (node->d.nsize) { + crc = crc32(0, node->d.name, node->d.nsize); + if (je32_to_cpu(node->d.name_crc) != crc) { + printk(KERN_WARNING "Name CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), 
je32_to_cpu(node->d.name_crc), crc); + goto bail; + } + } + break; + default: + /* If it's inode-less, we don't _know_ what it is. Just copy it intact */ + if (ic) { + printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n", + ref_offset(raw), je16_to_cpu(node->u.nodetype)); + goto bail; + } + } + + /* OK, all the CRCs are good; this node can just be copied as-is. */ + retry: + phys_ofs = write_ofs(c); + + ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node); + + if (ret || (retlen != rawlen)) { + printk(KERN_NOTICE "Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n", + rawlen, phys_ofs, ret, retlen); + if (retlen) { + jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, rawlen, NULL); + } else { + printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", phys_ofs); + } + if (!retried) { + /* Try to reallocate space and retry */ + uint32_t dummy; + struct jffs2_eraseblock *jeb = &c->blocks[phys_ofs / c->sector_size]; + + retried = 1; + + D1(printk(KERN_DEBUG "Retrying failed write of REF_PRISTINE node.\n")); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + ret = jffs2_reserve_space_gc(c, rawlen, &dummy, rawlen); + /* this is not the exact summary size of it, + it is only an upper estimation */ + + if (!ret) { + D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", phys_ofs)); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + goto retry; + } + D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret)); + } + + if (!ret) + ret = -EIO; + goto out_node; + } + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, rawlen, ic); + + jffs2_mark_node_obsolete(c, raw); + D1(printk(KERN_DEBUG "WHEEE! 
GC REF_PRISTINE node at 0x%08x succeeded\n", ref_offset(raw))); + + out_node: + kfree(node); + return ret; + bail: + ret = -EBADFD; + goto out_node; +} + +static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn) +{ + struct jffs2_full_dnode *new_fn; + struct jffs2_raw_inode ri; + struct jffs2_node_frag *last_frag; + union jffs2_device_node dev; + char *mdata = NULL; + int mdatalen = 0; + uint32_t alloclen, ilen; + int ret; + + if (S_ISBLK(JFFS2_F_I_MODE(f)) || + S_ISCHR(JFFS2_F_I_MODE(f)) ) { + /* For these, we don't actually need to read the old node */ + mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f)); + mdata = (char *)&dev; + D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of kdev_t\n", mdatalen)); + } else if (S_ISLNK(JFFS2_F_I_MODE(f))) { + mdatalen = fn->size; + mdata = kmalloc(fn->size, GFP_KERNEL); + if (!mdata) { + printk(KERN_WARNING "kmalloc of mdata failed in jffs2_garbage_collect_metadata()\n"); + return -ENOMEM; + } + ret = jffs2_read_dnode(c, f, fn, mdata, 0, mdatalen); + if (ret) { + printk(KERN_WARNING "read of old metadata failed in jffs2_garbage_collect_metadata(): %d\n", ret); + kfree(mdata); + return ret; + } + D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of symlink target\n", mdatalen)); + + } + + ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &alloclen, + JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n", + sizeof(ri)+ mdatalen, ret); + goto out; + } + + last_frag = frag_last(&f->fragtree); + if (last_frag) + /* Fetch the inode length from the fragtree rather than + * from i_size since i_size may not have been updated yet */ + ilen = last_frag->ofs + last_frag->size; + else + ilen = JFFS2_F_I_SIZE(f); + + memset(&ri, 0, sizeof(ri)); + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri) + mdatalen); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); + ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); + ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); + ri.isize = cpu_to_je32(ilen); + ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); + ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); + ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + ri.offset = cpu_to_je32(0); + ri.csize = cpu_to_je32(mdatalen); + ri.dsize = cpu_to_je32(mdatalen); + ri.compr = JFFS2_COMPR_NONE; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + ri.data_crc = cpu_to_je32(crc32(0, mdata, mdatalen)); + + new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, ALLOC_GC); + + if (IS_ERR(new_fn)) { + printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn)); + ret = PTR_ERR(new_fn); + goto out; + } + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + f->metadata = new_fn; + out: + if (S_ISLNK(JFFS2_F_I_MODE(f))) + kfree(mdata); + return ret; +} + +static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd) +{ + struct jffs2_full_dirent *new_fd; + struct jffs2_raw_dirent rd; + uint32_t alloclen; + int ret; + + rd.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd.nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + 
rd.nsize = strlen(fd->name); + rd.totlen = cpu_to_je32(sizeof(rd) + rd.nsize); + rd.hdr_crc = cpu_to_je32(crc32(0, &rd, sizeof(struct jffs2_unknown_node)-4)); + + rd.pino = cpu_to_je32(f->inocache->ino); + rd.version = cpu_to_je32(++f->highest_version); + rd.ino = cpu_to_je32(fd->ino); + /* If the times on this inode were set by explicit utime() they can be different, + so refrain from splatting them. */ + if (JFFS2_F_I_MTIME(f) == JFFS2_F_I_CTIME(f)) + rd.mctime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + else + rd.mctime = cpu_to_je32(0); + rd.type = fd->type; + rd.node_crc = cpu_to_je32(crc32(0, &rd, sizeof(rd)-8)); + rd.name_crc = cpu_to_je32(crc32(0, fd->name, rd.nsize)); + + ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &alloclen, + JFFS2_SUMMARY_DIRENT_SIZE(rd.nsize)); + if (ret) { + printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n", + sizeof(rd)+rd.nsize, ret); + return ret; + } + new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, ALLOC_GC); + + if (IS_ERR(new_fd)) { + printk(KERN_WARNING "jffs2_write_dirent in garbage_collect_dirent failed: %ld\n", PTR_ERR(new_fd)); + return PTR_ERR(new_fd); + } + jffs2_add_fd_to_list(c, new_fd, &f->dents); + return 0; +} + +static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dirent *fd) +{ + struct jffs2_full_dirent **fdp = &f->dents; + int found = 0; + + /* On a medium where we can't actually mark nodes obsolete + permanently, such as NAND flash, we need to work out + whether this deletion dirent is still needed to actively + delete a 'real' dirent with the same name that's still + somewhere else on the flash. */ + if (!jffs2_can_mark_obsolete(c)) { + struct jffs2_raw_dirent *rd; + struct jffs2_raw_node_ref *raw; + int ret; + size_t retlen; + int name_len = strlen(fd->name); + uint32_t name_crc = crc32(0, fd->name, name_len); + uint32_t rawlen = ref_totlen(c, jeb, fd->raw); + + rd = kmalloc(rawlen, GFP_KERNEL); + if (!rd) + return -ENOMEM; + + /* Prevent the erase code from nicking the obsolete node refs while + we're looking at them. I really don't like this extra lock but + can't see any alternative. Suggestions on a postcard to... */ + mutex_lock(&c->erase_free_sem); + + for (raw = f->inocache->nodes; raw != (void *)f->inocache; raw = raw->next_in_ino) { + + cond_resched(); + + /* We only care about obsolete ones */ + if (!(ref_obsolete(raw))) + continue; + + /* Any dirent with the same name is going to have the same length... */ + if (ref_totlen(c, NULL, raw) != rawlen) + continue; + + /* Doesn't matter if there's one in the same erase block. We're going to + delete it too at the same time. */ + if (SECTOR_ADDR(raw->flash_offset) == SECTOR_ADDR(fd->raw->flash_offset)) + continue; + + D1(printk(KERN_DEBUG "Check potential deletion dirent at %08x\n", ref_offset(raw))); + + /* This is an obsolete node belonging to the same directory, and it's of the right + length. We need to take a closer look...*/ + ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)rd); + if (ret) { + printk(KERN_WARNING "jffs2_g_c_deletion_dirent(): Read error (%d) reading obsolete node at %08x\n", ret, ref_offset(raw)); + /* If we can't read it, we don't need to continue to obsolete it. 
Continue */ + continue; + } + if (retlen != rawlen) { + printk(KERN_WARNING "jffs2_g_c_deletion_dirent(): Short read (%zd not %u) reading header from obsolete node at %08x\n", + retlen, rawlen, ref_offset(raw)); + continue; + } + + if (je16_to_cpu(rd->nodetype) != JFFS2_NODETYPE_DIRENT) + continue; + + /* If the name CRC doesn't match, skip */ + if (je32_to_cpu(rd->name_crc) != name_crc) + continue; + + /* If the name length doesn't match, or it's another deletion dirent, skip */ + if (rd->nsize != name_len || !je32_to_cpu(rd->ino)) + continue; + + /* OK, check the actual name now */ + if (memcmp(rd->name, fd->name, name_len)) + continue; + + /* OK. The name really does match. There really is still an older node on + the flash which our deletion dirent obsoletes. So we have to write out + a new deletion dirent to replace it */ + mutex_unlock(&c->erase_free_sem); + + D1(printk(KERN_DEBUG "Deletion dirent at %08x still obsoletes real dirent \"%s\" at %08x for ino #%u\n", + ref_offset(fd->raw), fd->name, ref_offset(raw), je32_to_cpu(rd->ino))); + kfree(rd); + + return jffs2_garbage_collect_dirent(c, jeb, f, fd); + } + + mutex_unlock(&c->erase_free_sem); + kfree(rd); + } + + /* FIXME: If we're deleting a dirent which contains the current mtime and ctime, + we should update the metadata node with those times accordingly */ + + /* No need for it any more. Just mark it obsolete and remove it from the list */ + while (*fdp) { + if ((*fdp) == fd) { + found = 1; + *fdp = fd->next; + break; + } + fdp = &(*fdp)->next; + } + if (!found) { + printk(KERN_WARNING "Deletion dirent \"%s\" not found in list for ino #%u\n", fd->name, f->inocache->ino); + } + jffs2_mark_node_obsolete(c, fd->raw); + jffs2_free_full_dirent(fd); + return 0; +} + +static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end) +{ + struct jffs2_raw_inode ri; + struct jffs2_node_frag *frag; + struct jffs2_full_dnode *new_fn; + uint32_t alloclen, ilen; + int ret; + + D1(printk(KERN_DEBUG "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n", + f->inocache->ino, start, end)); + + memset(&ri, 0, sizeof(ri)); + + if(fn->frags > 1) { + size_t readlen; + uint32_t crc; + /* It's partially obsoleted by a later write. So we have to + write it out again with the _same_ version as before */ + ret = jffs2_flash_read(c, ref_offset(fn->raw), sizeof(ri), &readlen, (char *)&ri); + if (readlen != sizeof(ri) || ret) { + printk(KERN_WARNING "Node read failed in jffs2_garbage_collect_hole. Ret %d, retlen %zd. 
Data will be lost by writing new hole node\n", ret, readlen); + goto fill; + } + if (je16_to_cpu(ri.nodetype) != JFFS2_NODETYPE_INODE) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had node type 0x%04x instead of JFFS2_NODETYPE_INODE(0x%04x)\n", + ref_offset(fn->raw), + je16_to_cpu(ri.nodetype), JFFS2_NODETYPE_INODE); + return -EIO; + } + if (je32_to_cpu(ri.totlen) != sizeof(ri)) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had totlen 0x%x instead of expected 0x%zx\n", + ref_offset(fn->raw), + je32_to_cpu(ri.totlen), sizeof(ri)); + return -EIO; + } + crc = crc32(0, &ri, sizeof(ri)-8); + if (crc != je32_to_cpu(ri.node_crc)) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had CRC 0x%08x which doesn't match calculated CRC 0x%08x\n", + ref_offset(fn->raw), + je32_to_cpu(ri.node_crc), crc); + /* FIXME: We could possibly deal with this by writing new holes for each frag */ + printk(KERN_WARNING "Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", + start, end, f->inocache->ino); + goto fill; + } + if (ri.compr != JFFS2_COMPR_ZERO) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Node 0x%08x wasn't a hole node!\n", ref_offset(fn->raw)); + printk(KERN_WARNING "Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", + start, end, f->inocache->ino); + goto fill; + } + } else { + fill: + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri)); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.offset = cpu_to_je32(start); + ri.dsize = cpu_to_je32(end - start); + ri.csize = cpu_to_je32(0); + ri.compr = JFFS2_COMPR_ZERO; + } + + frag = frag_last(&f->fragtree); + if (frag) + /* Fetch the inode length from the fragtree rather than + * from i_size since i_size may not have been updated yet */ + ilen = frag->ofs + frag->size; + else + ilen = JFFS2_F_I_SIZE(f); + + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); + ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); + ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); + ri.isize = cpu_to_je32(ilen); + ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); + ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); + ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + ri.data_crc = cpu_to_je32(0); + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + + ret = jffs2_reserve_space_gc(c, sizeof(ri), &alloclen, + JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n", + sizeof(ri), ret); + return ret; + } + new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_GC); + + if (IS_ERR(new_fn)) { + printk(KERN_WARNING "Error writing new hole node: %ld\n", PTR_ERR(new_fn)); + return PTR_ERR(new_fn); + } + if (je32_to_cpu(ri.version) == f->highest_version) { + jffs2_add_full_dnode_to_inode(c, f, new_fn); + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + return 0; + } + + /* + * We should only get here in the case where the node we are + * replacing had more than one frag, so we kept the same version + * number as before. (Except in case of error -- see 'goto fill;' + * above.)
+ */ + D1(if(unlikely(fn->frags <= 1)) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Replacing fn with %d frag(s) but new ver %d != highest_version %d of ino #%d\n", + fn->frags, je32_to_cpu(ri.version), f->highest_version, + je32_to_cpu(ri.ino)); + }); + + /* This is a partially-overlapped hole node. Mark it REF_NORMAL not REF_PRISTINE */ + mark_ref_normal(new_fn->raw); + + for (frag = jffs2_lookup_node_frag(&f->fragtree, fn->ofs); + frag; frag = frag_next(frag)) { + if (frag->ofs > fn->size + fn->ofs) + break; + if (frag->node == fn) { + frag->node = new_fn; + new_fn->frags++; + fn->frags--; + } + } + if (fn->frags) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: Old node still has frags!\n"); + BUG(); + } + if (!new_fn->frags) { + printk(KERN_WARNING "jffs2_garbage_collect_hole: New node has no frags!\n"); + BUG(); + } + + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + + return 0; +} + +static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *orig_jeb, + struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, + uint32_t start, uint32_t end) +{ + struct jffs2_full_dnode *new_fn; + struct jffs2_raw_inode ri; + uint32_t alloclen, offset, orig_end, orig_start; + int ret = 0; + unsigned char *comprbuf = NULL, *writebuf; + unsigned long pg; + unsigned char *pg_ptr; + + memset(&ri, 0, sizeof(ri)); + + D1(printk(KERN_DEBUG "Writing replacement dnode for ino #%u from offset 0x%x to 0x%x\n", + f->inocache->ino, start, end)); + + orig_end = end; + orig_start = start; + + if (c->nr_free_blocks + c->nr_erasing_blocks > c->resv_blocks_gcmerge) { + /* Attempt to do some merging. But only expand to cover logically + adjacent frags if the block containing them is already considered + to be dirty. Otherwise we end up with GC just going round in + circles dirtying the nodes it already wrote out, especially + on NAND where we have small eraseblocks and hence a much higher + chance of nodes having to be split to cross boundaries. */ + + struct jffs2_node_frag *frag; + uint32_t min, max; + + min = start & ~(PAGE_CACHE_SIZE-1); + max = min + PAGE_CACHE_SIZE; + + frag = jffs2_lookup_node_frag(&f->fragtree, start); + + /* BUG_ON(!frag) but that'll happen anyway... */ + + BUG_ON(frag->ofs != start); + + /* First grow down... */ + while((frag = frag_prev(frag)) && frag->ofs >= min) { + + /* If the previous frag doesn't even reach the beginning, there's + excessive fragmentation. Just merge. */ + if (frag->ofs > min) { + D1(printk(KERN_DEBUG "Expanding down to cover partial frag (0x%x-0x%x)\n", + frag->ofs, frag->ofs+frag->size)); + start = frag->ofs; + continue; + } + /* OK. This frag holds the first byte of the page. */ + if (!frag->node || !frag->node->raw) { + D1(printk(KERN_DEBUG "First frag in page is hole (0x%x-0x%x). Not expanding down.\n", + frag->ofs, frag->ofs+frag->size)); + break; + } else { + + /* OK, it's a frag which extends to the beginning of the page. Does it live + in a block which is still considered clean? If so, don't obsolete it. + If not, cover it anyway. 
*/ + + struct jffs2_raw_node_ref *raw = frag->node->raw; + struct jffs2_eraseblock *jeb; + + jeb = &c->blocks[raw->flash_offset / c->sector_size]; + + if (jeb == c->gcblock) { + D1(printk(KERN_DEBUG "Expanding down to cover frag (0x%x-0x%x) in gcblock at %08x\n", + frag->ofs, frag->ofs+frag->size, ref_offset(raw))); + start = frag->ofs; + break; + } + if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) { + D1(printk(KERN_DEBUG "Not expanding down to cover frag (0x%x-0x%x) in clean block %08x\n", + frag->ofs, frag->ofs+frag->size, jeb->offset)); + break; + } + + D1(printk(KERN_DEBUG "Expanding down to cover frag (0x%x-0x%x) in dirty block %08x\n", + frag->ofs, frag->ofs+frag->size, jeb->offset)); + start = frag->ofs; + break; + } + } + + /* ... then up */ + + /* Find last frag which is actually part of the node we're to GC. */ + frag = jffs2_lookup_node_frag(&f->fragtree, end-1); + + while((frag = frag_next(frag)) && frag->ofs+frag->size <= max) { + + /* If the previous frag doesn't even reach the beginning, there's lots + of fragmentation. Just merge. */ + if (frag->ofs+frag->size < max) { + D1(printk(KERN_DEBUG "Expanding up to cover partial frag (0x%x-0x%x)\n", + frag->ofs, frag->ofs+frag->size)); + end = frag->ofs + frag->size; + continue; + } + + if (!frag->node || !frag->node->raw) { + D1(printk(KERN_DEBUG "Last frag in page is hole (0x%x-0x%x). Not expanding up.\n", + frag->ofs, frag->ofs+frag->size)); + break; + } else { + + /* OK, it's a frag which extends to the beginning of the page. Does it live + in a block which is still considered clean? If so, don't obsolete it. + If not, cover it anyway. */ + + struct jffs2_raw_node_ref *raw = frag->node->raw; + struct jffs2_eraseblock *jeb; + + jeb = &c->blocks[raw->flash_offset / c->sector_size]; + + if (jeb == c->gcblock) { + D1(printk(KERN_DEBUG "Expanding up to cover frag (0x%x-0x%x) in gcblock at %08x\n", + frag->ofs, frag->ofs+frag->size, ref_offset(raw))); + end = frag->ofs + frag->size; + break; + } + if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) { + D1(printk(KERN_DEBUG "Not expanding up to cover frag (0x%x-0x%x) in clean block %08x\n", + frag->ofs, frag->ofs+frag->size, jeb->offset)); + break; + } + + D1(printk(KERN_DEBUG "Expanding up to cover frag (0x%x-0x%x) in dirty block %08x\n", + frag->ofs, frag->ofs+frag->size, jeb->offset)); + end = frag->ofs + frag->size; + break; + } + } + D1(printk(KERN_DEBUG "Expanded dnode to write from (0x%x-0x%x) to (0x%x-0x%x)\n", + orig_start, orig_end, start, end)); + + D1(BUG_ON(end > frag_last(&f->fragtree)->ofs + frag_last(&f->fragtree)->size)); + BUG_ON(end < orig_end); + BUG_ON(start > orig_start); + } + + /* First, use readpage() to read the appropriate page into the page cache */ + /* Q: What happens if we actually try to GC the _same_ page for which commit_write() + * triggered garbage collection in the first place? + * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the + * page OK. We'll actually write it out again in commit_write, which is a little + * suboptimal, but at least we're correct. 
+ */ + pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg); + + if (IS_ERR(pg_ptr)) { + printk(KERN_WARNING "read_cache_page() returned error: %ld\n", PTR_ERR(pg_ptr)); + return PTR_ERR(pg_ptr); + } + + offset = start; + while(offset < orig_end) { + uint32_t datalen; + uint32_t cdatalen; + uint16_t comprtype = JFFS2_COMPR_NONE; + + ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN, + &alloclen, JFFS2_SUMMARY_INODE_SIZE); + + if (ret) { + printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dnode failed: %d\n", + sizeof(ri)+ JFFS2_MIN_DATA_LEN, ret); + break; + } + cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset); + datalen = end - offset; + + writebuf = pg_ptr + (offset & (PAGE_CACHE_SIZE -1)); + + comprtype = jffs2_compress(c, f, writebuf, &comprbuf, &datalen, &cdatalen); + + ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri.totlen = cpu_to_je32(sizeof(ri) + cdatalen); + ri.hdr_crc = cpu_to_je32(crc32(0, &ri, sizeof(struct jffs2_unknown_node)-4)); + + ri.ino = cpu_to_je32(f->inocache->ino); + ri.version = cpu_to_je32(++f->highest_version); + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); + ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); + ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); + ri.isize = cpu_to_je32(JFFS2_F_I_SIZE(f)); + ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); + ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); + ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); + ri.offset = cpu_to_je32(offset); + ri.csize = cpu_to_je32(cdatalen); + ri.dsize = cpu_to_je32(datalen); + ri.compr = comprtype & 0xff; + ri.usercompr = (comprtype >> 8) & 0xff; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); + ri.data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen)); + + new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, ALLOC_GC); + + jffs2_free_comprbuf(comprbuf, writebuf); + + if (IS_ERR(new_fn)) { + printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn)); + ret = PTR_ERR(new_fn); + break; + } + ret = jffs2_add_full_dnode_to_inode(c, f, new_fn); + offset += datalen; + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + } + + jffs2_gc_release_page(c, pg_ptr, &pg); + return ret; +} diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c new file mode 100644 index 00000000..859a598a --- /dev/null +++ b/fs/jffs2/ioctl.c @@ -0,0 +1,22 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include "nodelist.h" + +long jffs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + /* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which + will include compression support etc. */ + return -ENOTTY; +} + diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h new file mode 100644 index 00000000..2e4a8676 --- /dev/null +++ b/fs/jffs2/jffs2_fs_i.h @@ -0,0 +1,56 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_FS_I +#define _JFFS2_FS_I + +#include +#include +#include + +struct jffs2_inode_info { + /* We need an internal mutex similar to inode->i_mutex. 
+ Unfortunately, we can't use the existing one, because + either the GC would deadlock, or we'd have to release it + before letting GC proceed. Or we'd have to put ugliness + into the GC code so it didn't attempt to obtain the i_mutex + for the inode(s) which are already locked */ + struct mutex sem; + + /* The highest (datanode) version number used for this ino */ + uint32_t highest_version; + + /* List of data fragments which make up the file */ + struct rb_root fragtree; + + /* There may be one datanode which isn't referenced by any of the + above fragments, if it contains a metadata update but no actual + data - or if this is a directory inode */ + /* This also holds the _only_ dnode for symlinks/device nodes, + etc. */ + struct jffs2_full_dnode *metadata; + + /* Directory entries */ + struct jffs2_full_dirent *dents; + + /* The target path if this is the inode of a symlink */ + unsigned char *target; + + /* Some stuff we just have to keep in-core at all times, for each inode. */ + struct jffs2_inode_cache *inocache; + + uint16_t flags; + uint8_t usercompr; + struct inode vfs_inode; +}; + +#endif /* _JFFS2_FS_I */ diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h new file mode 100644 index 00000000..0bc6a6c8 --- /dev/null +++ b/fs/jffs2/jffs2_fs_sb.h @@ -0,0 +1,147 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004-2010 David Woodhouse + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_FS_SB +#define _JFFS2_FS_SB + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define JFFS2_SB_FLAG_RO 1 +#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */ +#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */ + +struct jffs2_inodirty; + +/* A struct for the overall file system control. Pointers to + jffs2_sb_info structs are named `c' in the source code. + Nee jffs_control +*/ +struct jffs2_sb_info { + struct mtd_info *mtd; + + uint32_t highest_ino; + uint32_t checked_ino; + + unsigned int flags; + + struct task_struct *gc_task; /* GC task struct */ + struct completion gc_thread_start; /* GC thread start completion */ + struct completion gc_thread_exit; /* GC thread exit completion port */ + + struct mutex alloc_sem; /* Used to protect all the following + fields, and also to protect against + out-of-order writing of nodes. And GC. */ + uint32_t cleanmarker_size; /* Size of an _inline_ CLEANMARKER + (i.e. zero for OOB CLEANMARKER) */ + + uint32_t flash_size; + uint32_t used_size; + uint32_t dirty_size; + uint32_t wasted_size; + uint32_t free_size; + uint32_t erasing_size; + uint32_t bad_size; + uint32_t sector_size; + uint32_t unchecked_size; + + uint32_t nr_free_blocks; + uint32_t nr_erasing_blocks; + + /* Number of free blocks there must be before we... */ + uint8_t resv_blocks_write; /* ... allow a normal filesystem write */ + uint8_t resv_blocks_deletion; /* ... allow a normal filesystem deletion */ + uint8_t resv_blocks_gctrigger; /* ... wake up the GC thread */ + uint8_t resv_blocks_gcbad; /* ... pick a block from the bad_list to GC */ + uint8_t resv_blocks_gcmerge; /* ... merge pages when garbage collecting */ + /* Number of 'very dirty' blocks before we trigger immediate GC */ + uint8_t vdirty_blocks_gctrigger; + + uint32_t nospc_dirty_size; + + uint32_t nr_blocks; + struct jffs2_eraseblock *blocks; /* The whole array of blocks.
Used for getting blocks + * from the offset (blocks[ofs / sector_size]) */ + struct jffs2_eraseblock *nextblock; /* The block we're currently filling */ + + struct jffs2_eraseblock *gcblock; /* The block we're currently garbage-collecting */ + + struct list_head clean_list; /* Blocks 100% full of clean data */ + struct list_head very_dirty_list; /* Blocks with lots of dirty space */ + struct list_head dirty_list; /* Blocks with some dirty space */ + struct list_head erasable_list; /* Blocks which are completely dirty, and need erasing */ + struct list_head erasable_pending_wbuf_list; /* Blocks which need erasing but only after the current wbuf is flushed */ + struct list_head erasing_list; /* Blocks which are currently erasing */ + struct list_head erase_checking_list; /* Blocks which are being checked and marked */ + struct list_head erase_pending_list; /* Blocks which need erasing now */ + struct list_head erase_complete_list; /* Blocks which are erased and need the clean marker written to them */ + struct list_head free_list; /* Blocks which are free and ready to be used */ + struct list_head bad_list; /* Bad blocks. */ + struct list_head bad_used_list; /* Bad blocks with valid data in. */ + + spinlock_t erase_completion_lock; /* Protect free_list and erasing_list + against erase completion handler */ + wait_queue_head_t erase_wait; /* For waiting for erases to complete */ + + wait_queue_head_t inocache_wq; + int inocache_hashsize; + struct jffs2_inode_cache **inocache_list; + spinlock_t inocache_lock; + + /* Sem to allow jffs2_garbage_collect_deletion_dirent to + drop the erase_completion_lock while it's holding a pointer + to an obsoleted node. I don't like this. Alternatives welcomed. */ + struct mutex erase_free_sem; + + uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */ + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + unsigned char *wbuf_verify; /* read-back buffer for verification */ +#endif +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + unsigned char *wbuf; /* Write-behind buffer for NAND flash */ + uint32_t wbuf_ofs; + uint32_t wbuf_len; + struct jffs2_inodirty *wbuf_inodes; + struct rw_semaphore wbuf_sem; /* Protects the write buffer */ + + unsigned char *oobbuf; + int oobavail; /* How many bytes are available for JFFS2 in OOB */ +#endif + + struct jffs2_summary *summary; /* Summary information */ + +#ifdef CONFIG_JFFS2_FS_XATTR +#define XATTRINDEX_HASHSIZE (57) + uint32_t highest_xid; + uint32_t highest_xseqno; + struct list_head xattrindex[XATTRINDEX_HASHSIZE]; + struct list_head xattr_unchecked; + struct list_head xattr_dead_list; + struct jffs2_xattr_ref *xref_dead_list; + struct jffs2_xattr_ref *xref_temp; + struct rw_semaphore xattr_sem; + uint32_t xdatum_mem_usage; + uint32_t xdatum_mem_threshold; +#endif + /* OS-private pointer for getting back to master superblock info */ + void *os_priv; +}; + +#endif /* _JFFS2_FS_SB */ diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c new file mode 100644 index 00000000..c0828689 --- /dev/null +++ b/fs/jffs2/malloc.c @@ -0,0 +1,318 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include "nodelist.h" + +/* These are initialised to NULL in the kernel startup code. 
+ If you're porting to other operating systems, beware */ +static struct kmem_cache *full_dnode_slab; +static struct kmem_cache *raw_dirent_slab; +static struct kmem_cache *raw_inode_slab; +static struct kmem_cache *tmp_dnode_info_slab; +static struct kmem_cache *raw_node_ref_slab; +static struct kmem_cache *node_frag_slab; +static struct kmem_cache *inode_cache_slab; +#ifdef CONFIG_JFFS2_FS_XATTR +static struct kmem_cache *xattr_datum_cache; +static struct kmem_cache *xattr_ref_cache; +#endif + +int __init jffs2_create_slab_caches(void) +{ + full_dnode_slab = kmem_cache_create("jffs2_full_dnode", + sizeof(struct jffs2_full_dnode), + 0, 0, NULL); + if (!full_dnode_slab) + goto err; + + raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent", + sizeof(struct jffs2_raw_dirent), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!raw_dirent_slab) + goto err; + + raw_inode_slab = kmem_cache_create("jffs2_raw_inode", + sizeof(struct jffs2_raw_inode), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!raw_inode_slab) + goto err; + + tmp_dnode_info_slab = kmem_cache_create("jffs2_tmp_dnode", + sizeof(struct jffs2_tmp_dnode_info), + 0, 0, NULL); + if (!tmp_dnode_info_slab) + goto err; + + raw_node_ref_slab = kmem_cache_create("jffs2_refblock", + sizeof(struct jffs2_raw_node_ref) * (REFS_PER_BLOCK + 1), + 0, 0, NULL); + if (!raw_node_ref_slab) + goto err; + + node_frag_slab = kmem_cache_create("jffs2_node_frag", + sizeof(struct jffs2_node_frag), + 0, 0, NULL); + if (!node_frag_slab) + goto err; + + inode_cache_slab = kmem_cache_create("jffs2_inode_cache", + sizeof(struct jffs2_inode_cache), + 0, 0, NULL); + if (!inode_cache_slab) + goto err; + +#ifdef CONFIG_JFFS2_FS_XATTR + xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum", + sizeof(struct jffs2_xattr_datum), + 0, 0, NULL); + if (!xattr_datum_cache) + goto err; + + xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref", + sizeof(struct jffs2_xattr_ref), + 0, 0, NULL); + if (!xattr_ref_cache) + goto err; +#endif + + return 0; + err: + jffs2_destroy_slab_caches(); + return -ENOMEM; +} + +void jffs2_destroy_slab_caches(void) +{ + if(full_dnode_slab) + kmem_cache_destroy(full_dnode_slab); + if(raw_dirent_slab) + kmem_cache_destroy(raw_dirent_slab); + if(raw_inode_slab) + kmem_cache_destroy(raw_inode_slab); + if(tmp_dnode_info_slab) + kmem_cache_destroy(tmp_dnode_info_slab); + if(raw_node_ref_slab) + kmem_cache_destroy(raw_node_ref_slab); + if(node_frag_slab) + kmem_cache_destroy(node_frag_slab); + if(inode_cache_slab) + kmem_cache_destroy(inode_cache_slab); +#ifdef CONFIG_JFFS2_FS_XATTR + if (xattr_datum_cache) + kmem_cache_destroy(xattr_datum_cache); + if (xattr_ref_cache) + kmem_cache_destroy(xattr_ref_cache); +#endif +} + +struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize) +{ + struct jffs2_full_dirent *ret; + ret = kmalloc(sizeof(struct jffs2_full_dirent) + namesize, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_full_dirent(struct jffs2_full_dirent *x) +{ + dbg_memalloc("%p\n", x); + kfree(x); +} + +struct jffs2_full_dnode *jffs2_alloc_full_dnode(void) +{ + struct jffs2_full_dnode *ret; + ret = kmem_cache_alloc(full_dnode_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_full_dnode(struct jffs2_full_dnode *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(full_dnode_slab, x); +} + +struct jffs2_raw_dirent *jffs2_alloc_raw_dirent(void) +{ + struct jffs2_raw_dirent *ret; + ret = kmem_cache_alloc(raw_dirent_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void 
jffs2_free_raw_dirent(struct jffs2_raw_dirent *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(raw_dirent_slab, x); +} + +struct jffs2_raw_inode *jffs2_alloc_raw_inode(void) +{ + struct jffs2_raw_inode *ret; + ret = kmem_cache_alloc(raw_inode_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_raw_inode(struct jffs2_raw_inode *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(raw_inode_slab, x); +} + +struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void) +{ + struct jffs2_tmp_dnode_info *ret; + ret = kmem_cache_alloc(tmp_dnode_info_slab, GFP_KERNEL); + dbg_memalloc("%p\n", + ret); + return ret; +} + +void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(tmp_dnode_info_slab, x); +} + +static struct jffs2_raw_node_ref *jffs2_alloc_refblock(void) +{ + struct jffs2_raw_node_ref *ret; + + ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL); + if (ret) { + int i = 0; + for (i=0; i < REFS_PER_BLOCK; i++) { + ret[i].flash_offset = REF_EMPTY_NODE; + ret[i].next_in_ino = NULL; + } + ret[i].flash_offset = REF_LINK_NODE; + ret[i].next_in_ino = NULL; + } + return ret; +} + +int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, int nr) +{ + struct jffs2_raw_node_ref **p, *ref; + int i = nr; + + dbg_memalloc("%d\n", nr); + + p = &jeb->last_node; + ref = *p; + + dbg_memalloc("Reserving %d refs for block @0x%08x\n", nr, jeb->offset); + + /* If jeb->last_node is really a valid node then skip over it */ + if (ref && ref->flash_offset != REF_EMPTY_NODE) + ref++; + + while (i) { + if (!ref) { + dbg_memalloc("Allocating new refblock linked from %p\n", p); + ref = *p = jffs2_alloc_refblock(); + if (!ref) + return -ENOMEM; + } + if (ref->flash_offset == REF_LINK_NODE) { + p = &ref->next_in_ino; + ref = *p; + continue; + } + i--; + ref++; + } + jeb->allocated_refs = nr; + + dbg_memalloc("Reserved %d refs for block @0x%08x, last_node is %p (%08x,%p)\n", + nr, jeb->offset, jeb->last_node, jeb->last_node->flash_offset, + jeb->last_node->next_in_ino); + + return 0; +} + +void jffs2_free_refblock(struct jffs2_raw_node_ref *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(raw_node_ref_slab, x); +} + +struct jffs2_node_frag *jffs2_alloc_node_frag(void) +{ + struct jffs2_node_frag *ret; + ret = kmem_cache_alloc(node_frag_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_node_frag(struct jffs2_node_frag *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(node_frag_slab, x); +} + +struct jffs2_inode_cache *jffs2_alloc_inode_cache(void) +{ + struct jffs2_inode_cache *ret; + ret = kmem_cache_alloc(inode_cache_slab, GFP_KERNEL); + dbg_memalloc("%p\n", ret); + return ret; +} + +void jffs2_free_inode_cache(struct jffs2_inode_cache *x) +{ + dbg_memalloc("%p\n", x); + kmem_cache_free(inode_cache_slab, x); +} + +#ifdef CONFIG_JFFS2_FS_XATTR +struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) +{ + struct jffs2_xattr_datum *xd; + xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); + dbg_memalloc("%p\n", xd); + + xd->class = RAWNODE_CLASS_XATTR_DATUM; + xd->node = (void *)xd; + INIT_LIST_HEAD(&xd->xindex); + return xd; +} + +void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd) +{ + dbg_memalloc("%p\n", xd); + kmem_cache_free(xattr_datum_cache, xd); +} + +struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) +{ + struct jffs2_xattr_ref *ref; + ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); + dbg_memalloc("%p\n", ref); + + ref->class = 
RAWNODE_CLASS_XATTR_REF; + ref->node = (void *)ref; + return ref; +} + +void jffs2_free_xattr_ref(struct jffs2_xattr_ref *ref) +{ + dbg_memalloc("%p\n", ref); + kmem_cache_free(xattr_ref_cache, ref); +} +#endif diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c new file mode 100644 index 00000000..5e03233c --- /dev/null +++ b/fs/jffs2/nodelist.c @@ -0,0 +1,771 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, + struct jffs2_node_frag *this); + +void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list) +{ + struct jffs2_full_dirent **prev = list; + + dbg_dentlist("add dirent \"%s\", ino #%u\n", new->name, new->ino); + + while ((*prev) && (*prev)->nhash <= new->nhash) { + if ((*prev)->nhash == new->nhash && !strcmp((*prev)->name, new->name)) { + /* Duplicate. Free one */ + if (new->version < (*prev)->version) { + dbg_dentlist("Eep! Marking new dirent node obsolete, old is \"%s\", ino #%u\n", + (*prev)->name, (*prev)->ino); + jffs2_mark_node_obsolete(c, new->raw); + jffs2_free_full_dirent(new); + } else { + dbg_dentlist("marking old dirent \"%s\", ino #%u obsolete\n", + (*prev)->name, (*prev)->ino); + new->next = (*prev)->next; + /* It may have been a 'placeholder' deletion dirent, + if jffs2_can_mark_obsolete() (see jffs2_do_unlink()) */ + if ((*prev)->raw) + jffs2_mark_node_obsolete(c, ((*prev)->raw)); + jffs2_free_full_dirent(*prev); + *prev = new; + } + return; + } + prev = &((*prev)->next); + } + new->next = *prev; + *prev = new; +} + +uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size) +{ + struct jffs2_node_frag *frag = jffs2_lookup_node_frag(list, size); + + dbg_fragtree("truncating fragtree to 0x%08x bytes\n", size); + + /* We know frag->ofs <= size. That's what lookup does for us */ + if (frag && frag->ofs != size) { + if (frag->ofs+frag->size > size) { + frag->size = size - frag->ofs; + } + frag = frag_next(frag); + } + while (frag && frag->ofs >= size) { + struct jffs2_node_frag *next = frag_next(frag); + + frag_erase(frag, list); + jffs2_obsolete_node_frag(c, frag); + frag = next; + } + + if (size == 0) + return 0; + + frag = frag_last(list); + + /* Sanity check for truncation to longer than we started with... */ + if (!frag) + return 0; + if (frag->ofs + frag->size < size) + return frag->ofs + frag->size; + + /* If the last fragment starts at the RAM page boundary, it is + * REF_PRISTINE irrespective of its size. */ + if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) { + dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n", + frag->ofs, frag->ofs + frag->size); + frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE; + } + return size; +} + +static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, + struct jffs2_node_frag *this) +{ + if (this->node) { + this->node->frags--; + if (!this->node->frags) { + /* The node has no valid frags left. 
It's totally obsoleted */ + dbg_fragtree2("marking old node @0x%08x (0x%04x-0x%04x) obsolete\n", + ref_offset(this->node->raw), this->node->ofs, this->node->ofs+this->node->size); + jffs2_mark_node_obsolete(c, this->node->raw); + jffs2_free_full_dnode(this->node); + } else { + dbg_fragtree2("marking old node @0x%08x (0x%04x-0x%04x) REF_NORMAL. frags is %d\n", + ref_offset(this->node->raw), this->node->ofs, this->node->ofs+this->node->size, this->node->frags); + mark_ref_normal(this->node->raw); + } + + } + jffs2_free_node_frag(this); +} + +static void jffs2_fragtree_insert(struct jffs2_node_frag *newfrag, struct jffs2_node_frag *base) +{ + struct rb_node *parent = &base->rb; + struct rb_node **link = &parent; + + dbg_fragtree2("insert frag (0x%04x-0x%04x)\n", newfrag->ofs, newfrag->ofs + newfrag->size); + + while (*link) { + parent = *link; + base = rb_entry(parent, struct jffs2_node_frag, rb); + + if (newfrag->ofs > base->ofs) + link = &base->rb.rb_right; + else if (newfrag->ofs < base->ofs) + link = &base->rb.rb_left; + else { + JFFS2_ERROR("duplicate frag at %08x (%p,%p)\n", newfrag->ofs, newfrag, base); + BUG(); + } + } + + rb_link_node(&newfrag->rb, &base->rb, link); +} + +/* + * Allocates and initializes a new fragment. + */ +static struct jffs2_node_frag * new_fragment(struct jffs2_full_dnode *fn, uint32_t ofs, uint32_t size) +{ + struct jffs2_node_frag *newfrag; + + newfrag = jffs2_alloc_node_frag(); + if (likely(newfrag)) { + newfrag->ofs = ofs; + newfrag->size = size; + newfrag->node = fn; + } else { + JFFS2_ERROR("cannot allocate a jffs2_node_frag object\n"); + } + + return newfrag; +} + +/* + * Called when no overlapping fragment exists. Inserts a hole before the new + * fragment and inserts the new fragment into the fragtree. + */ +static int no_overlapping_node(struct jffs2_sb_info *c, struct rb_root *root, + struct jffs2_node_frag *newfrag, + struct jffs2_node_frag *this, uint32_t lastend) +{ + if (lastend < newfrag->node->ofs) { + /* put a hole in before the new fragment */ + struct jffs2_node_frag *holefrag; + + holefrag = new_fragment(NULL, lastend, newfrag->node->ofs - lastend); + if (unlikely(!holefrag)) { + jffs2_free_node_frag(newfrag); + return -ENOMEM; + } + + if (this) { + /* By definition, the 'this' node has no right-hand child, + because there are no frags with offset greater than it. + So that's where we want to put the hole */ + dbg_fragtree2("add hole frag %#04x-%#04x on the right of the new frag.\n", + holefrag->ofs, holefrag->ofs + holefrag->size); + rb_link_node(&holefrag->rb, &this->rb, &this->rb.rb_right); + } else { + dbg_fragtree2("Add hole frag %#04x-%#04x to the root of the tree.\n", + holefrag->ofs, holefrag->ofs + holefrag->size); + rb_link_node(&holefrag->rb, NULL, &root->rb_node); + } + rb_insert_color(&holefrag->rb, root); + this = holefrag; + } + + if (this) { + /* By definition, the 'this' node has no right-hand child, + because there are no frags with offset greater than it.
+ So that's where we want to put new fragment */ + dbg_fragtree2("add the new node at the right\n"); + rb_link_node(&newfrag->rb, &this->rb, &this->rb.rb_right); + } else { + dbg_fragtree2("insert the new node at the root of the tree\n"); + rb_link_node(&newfrag->rb, NULL, &root->rb_node); + } + rb_insert_color(&newfrag->rb, root); + + return 0; +} + +/* Doesn't set inode->i_size */ +static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *root, struct jffs2_node_frag *newfrag) +{ + struct jffs2_node_frag *this; + uint32_t lastend; + + /* Skip all the nodes which are completed before this one starts */ + this = jffs2_lookup_node_frag(root, newfrag->node->ofs); + + if (this) { + dbg_fragtree2("lookup gave frag 0x%04x-0x%04x; phys 0x%08x (*%p)\n", + this->ofs, this->ofs+this->size, this->node?(ref_offset(this->node->raw)):0xffffffff, this); + lastend = this->ofs + this->size; + } else { + dbg_fragtree2("lookup gave no frag\n"); + lastend = 0; + } + + /* See if we ran off the end of the fragtree */ + if (lastend <= newfrag->ofs) { + /* We did */ + + /* Check if 'this' node was on the same page as the new node. + If so, both 'this' and the new node get marked REF_NORMAL so + the GC can take a look. + */ + if (lastend && (lastend-1) >> PAGE_CACHE_SHIFT == newfrag->ofs >> PAGE_CACHE_SHIFT) { + if (this->node) + mark_ref_normal(this->node->raw); + mark_ref_normal(newfrag->node->raw); + } + + return no_overlapping_node(c, root, newfrag, this, lastend); + } + + if (this->node) + dbg_fragtree2("dealing with frag %u-%u, phys %#08x(%d).\n", + this->ofs, this->ofs + this->size, + ref_offset(this->node->raw), ref_flags(this->node->raw)); + else + dbg_fragtree2("dealing with hole frag %u-%u.\n", + this->ofs, this->ofs + this->size); + + /* OK. 'this' is pointing at the first frag that newfrag->ofs at least partially obsoletes, + * - i.e. newfrag->ofs < this->ofs+this->size && newfrag->ofs >= this->ofs + */ + if (newfrag->ofs > this->ofs) { + /* This node isn't completely obsoleted. The start of it remains valid */ + + /* Mark the new node and the partially covered node REF_NORMAL -- let + the GC take a look at them */ + mark_ref_normal(newfrag->node->raw); + if (this->node) + mark_ref_normal(this->node->raw); + + if (this->ofs + this->size > newfrag->ofs + newfrag->size) { + /* The new node splits 'this' frag into two */ + struct jffs2_node_frag *newfrag2; + + if (this->node) + dbg_fragtree2("split old frag 0x%04x-0x%04x, phys 0x%08x\n", + this->ofs, this->ofs+this->size, ref_offset(this->node->raw)); + else + dbg_fragtree2("split old hole frag 0x%04x-0x%04x\n", + this->ofs, this->ofs+this->size); + + /* New second frag pointing to this's node */ + newfrag2 = new_fragment(this->node, newfrag->ofs + newfrag->size, + this->ofs + this->size - newfrag->ofs - newfrag->size); + if (unlikely(!newfrag2)) + return -ENOMEM; + if (this->node) + this->node->frags++; + + /* Adjust size of original 'this' */ + this->size = newfrag->ofs - this->ofs; + + /* Now, we know there's no node with offset + greater than this->ofs but smaller than + newfrag2->ofs or newfrag->ofs, for obvious + reasons. So we can do a tree insert from + 'this' to insert newfrag, and a tree insert + from newfrag to insert newfrag2. 
*/ + jffs2_fragtree_insert(newfrag, this); + rb_insert_color(&newfrag->rb, root); + + jffs2_fragtree_insert(newfrag2, newfrag); + rb_insert_color(&newfrag2->rb, root); + + return 0; + } + /* New node just reduces 'this' frag in size, doesn't split it */ + this->size = newfrag->ofs - this->ofs; + + /* Again, we know it lives down here in the tree */ + jffs2_fragtree_insert(newfrag, this); + rb_insert_color(&newfrag->rb, root); + } else { + /* New frag starts at the same point as 'this' used to. Replace + it in the tree without doing a delete and insertion */ + dbg_fragtree2("inserting newfrag (*%p),%d-%d in before 'this' (*%p),%d-%d\n", + newfrag, newfrag->ofs, newfrag->ofs+newfrag->size, this, this->ofs, this->ofs+this->size); + + rb_replace_node(&this->rb, &newfrag->rb, root); + + if (newfrag->ofs + newfrag->size >= this->ofs+this->size) { + dbg_fragtree2("obsoleting node frag %p (%x-%x)\n", this, this->ofs, this->ofs+this->size); + jffs2_obsolete_node_frag(c, this); + } else { + this->ofs += newfrag->size; + this->size -= newfrag->size; + + jffs2_fragtree_insert(this, newfrag); + rb_insert_color(&this->rb, root); + return 0; + } + } + /* OK, now we have newfrag added in the correct place in the tree, but + frag_next(newfrag) may be a fragment which is overlapped by it + */ + while ((this = frag_next(newfrag)) && newfrag->ofs + newfrag->size >= this->ofs + this->size) { + /* 'this' frag is obsoleted completely. */ + dbg_fragtree2("obsoleting node frag %p (%x-%x) and removing from tree\n", + this, this->ofs, this->ofs+this->size); + rb_erase(&this->rb, root); + jffs2_obsolete_node_frag(c, this); + } + /* Now we're pointing at the first frag which isn't totally obsoleted by + the new frag */ + + if (!this || newfrag->ofs + newfrag->size == this->ofs) + return 0; + + /* Still some overlap but we don't need to move it in the tree */ + this->size = (this->ofs + this->size) - (newfrag->ofs + newfrag->size); + this->ofs = newfrag->ofs + newfrag->size; + + /* And mark them REF_NORMAL so the GC takes a look at them */ + if (this->node) + mark_ref_normal(this->node->raw); + mark_ref_normal(newfrag->node->raw); + + return 0; +} + +/* + * Given an inode, probably with existing tree of fragments, add the new node + * to the fragment tree. + */ +int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn) +{ + int ret; + struct jffs2_node_frag *newfrag; + + if (unlikely(!fn->size)) + return 0; + + newfrag = new_fragment(fn, fn->ofs, fn->size); + if (unlikely(!newfrag)) + return -ENOMEM; + newfrag->node->frags = 1; + + dbg_fragtree("adding node %#04x-%#04x @0x%08x on flash, newfrag *%p\n", + fn->ofs, fn->ofs+fn->size, ref_offset(fn->raw), newfrag); + + ret = jffs2_add_frag_to_fragtree(c, &f->fragtree, newfrag); + if (unlikely(ret)) + return ret; + + /* If we now share a page with other nodes, mark either previous + or next node REF_NORMAL, as appropriate. 
*/ + if (newfrag->ofs & (PAGE_CACHE_SIZE-1)) { + struct jffs2_node_frag *prev = frag_prev(newfrag); + + mark_ref_normal(fn->raw); + /* If we don't start at zero there's _always_ a previous */ + if (prev->node) + mark_ref_normal(prev->node->raw); + } + + if ((newfrag->ofs+newfrag->size) & (PAGE_CACHE_SIZE-1)) { + struct jffs2_node_frag *next = frag_next(newfrag); + + if (next) { + mark_ref_normal(fn->raw); + if (next->node) + mark_ref_normal(next->node->raw); + } + } + jffs2_dbg_fragtree_paranoia_check_nolock(f); + + return 0; +} + +void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state) +{ + spin_lock(&c->inocache_lock); + ic->state = state; + wake_up(&c->inocache_wq); + spin_unlock(&c->inocache_lock); +} + +/* During mount, this needs no locking. During normal operation, its + callers want to do other stuff while still holding the inocache_lock. + Rather than introducing special case get_ino_cache functions or + callbacks, we just let the caller do the locking itself. */ + +struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inode_cache *ret; + + ret = c->inocache_list[ino % c->inocache_hashsize]; + while (ret && ret->ino < ino) { + ret = ret->next; + } + + if (ret && ret->ino != ino) + ret = NULL; + + return ret; +} + +void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new) +{ + struct jffs2_inode_cache **prev; + + spin_lock(&c->inocache_lock); + if (!new->ino) + new->ino = ++c->highest_ino; + + dbg_inocache("add %p (ino #%u)\n", new, new->ino); + + prev = &c->inocache_list[new->ino % c->inocache_hashsize]; + + while ((*prev) && (*prev)->ino < new->ino) { + prev = &(*prev)->next; + } + new->next = *prev; + *prev = new; + + spin_unlock(&c->inocache_lock); +} + +void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old) +{ + struct jffs2_inode_cache **prev; + +#ifdef CONFIG_JFFS2_FS_XATTR + BUG_ON(old->xref); +#endif + dbg_inocache("del %p (ino #%u)\n", old, old->ino); + spin_lock(&c->inocache_lock); + + prev = &c->inocache_list[old->ino % c->inocache_hashsize]; + + while ((*prev) && (*prev)->ino < old->ino) { + prev = &(*prev)->next; + } + if ((*prev) == old) { + *prev = old->next; + } + + /* Free it now unless it's in READING or CLEARING state, which + are the transitions upon read_inode() and clear_inode(). The + rest of the time we know nobody else is looking at it, and + if it's held by read_inode() or clear_inode() they'll free it + for themselves. 
*/ + if (old->state != INO_STATE_READING && old->state != INO_STATE_CLEARING) + jffs2_free_inode_cache(old); + + spin_unlock(&c->inocache_lock); +} + +void jffs2_free_ino_caches(struct jffs2_sb_info *c) +{ + int i; + struct jffs2_inode_cache *this, *next; + + for (i=0; i < c->inocache_hashsize; i++) { + this = c->inocache_list[i]; + while (this) { + next = this->next; + jffs2_xattr_free_inode(c, this); + jffs2_free_inode_cache(this); + this = next; + } + c->inocache_list[i] = NULL; + } +} + +void jffs2_free_raw_node_refs(struct jffs2_sb_info *c) +{ + int i; + struct jffs2_raw_node_ref *this, *next; + + for (i=0; i < c->nr_blocks; i++) { + this = c->blocks[i].first_node; + while (this) { + if (this[REFS_PER_BLOCK].flash_offset == REF_LINK_NODE) + next = this[REFS_PER_BLOCK].next_in_ino; + else + next = NULL; + + jffs2_free_refblock(this); + this = next; + } + c->blocks[i].first_node = c->blocks[i].last_node = NULL; + } +} + +struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset) +{ + /* The common case in lookup is that there will be a node + which precisely matches. So we go looking for that first */ + struct rb_node *next; + struct jffs2_node_frag *prev = NULL; + struct jffs2_node_frag *frag = NULL; + + dbg_fragtree2("root %p, offset %d\n", fragtree, offset); + + next = fragtree->rb_node; + + while(next) { + frag = rb_entry(next, struct jffs2_node_frag, rb); + + if (frag->ofs + frag->size <= offset) { + /* Remember the closest smaller match on the way down */ + if (!prev || frag->ofs > prev->ofs) + prev = frag; + next = frag->rb.rb_right; + } else if (frag->ofs > offset) { + next = frag->rb.rb_left; + } else { + return frag; + } + } + + /* Exact match not found. Go back up looking at each parent, + and return the closest smaller one */ + + if (prev) + dbg_fragtree2("no match. Returning frag %#04x-%#04x, closest previous\n", + prev->ofs, prev->ofs+prev->size); + else + dbg_fragtree2("returning NULL, empty fragtree\n"); + + return prev; +} + +/* Pass 'c' argument to indicate that nodes should be marked obsolete as + they're killed. */ +void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) +{ + struct jffs2_node_frag *frag; + struct jffs2_node_frag *parent; + + if (!root->rb_node) + return; + + dbg_fragtree("killing\n"); + + frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb)); + while(frag) { + if (frag->rb.rb_left) { + frag = frag_left(frag); + continue; + } + if (frag->rb.rb_right) { + frag = frag_right(frag); + continue; + } + + if (frag->node && !(--frag->node->frags)) { + /* Not a hole, and it's the final remaining frag + of this node.
Free the node */ + if (c) + jffs2_mark_node_obsolete(c, frag->node->raw); + + jffs2_free_full_dnode(frag->node); + } + parent = frag_parent(frag); + if (parent) { + if (frag_left(parent) == frag) + parent->rb.rb_left = NULL; + else + parent->rb.rb_right = NULL; + } + + jffs2_free_node_frag(frag); + frag = parent; + + cond_resched(); + } +} + +struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic) +{ + struct jffs2_raw_node_ref *ref; + + BUG_ON(!jeb->allocated_refs); + jeb->allocated_refs--; + + ref = jeb->last_node; + + dbg_noderef("Last node at %p is (%08x,%p)\n", ref, ref->flash_offset, + ref->next_in_ino); + + while (ref->flash_offset != REF_EMPTY_NODE) { + if (ref->flash_offset == REF_LINK_NODE) + ref = ref->next_in_ino; + else + ref++; + } + + dbg_noderef("New ref is %p (%08x becomes %08x,%p) len 0x%x\n", ref, + ref->flash_offset, ofs, ref->next_in_ino, len); + + ref->flash_offset = ofs; + + if (!jeb->first_node) { + jeb->first_node = ref; + BUG_ON(ref_offset(ref) != jeb->offset); + } else if (unlikely(ref_offset(ref) != jeb->offset + c->sector_size - jeb->free_size)) { + uint32_t last_len = ref_totlen(c, jeb, jeb->last_node); + + JFFS2_ERROR("Adding new ref %p at (0x%08x-0x%08x) not immediately after previous (0x%08x-0x%08x)\n", + ref, ref_offset(ref), ref_offset(ref)+len, + ref_offset(jeb->last_node), + ref_offset(jeb->last_node)+last_len); + BUG(); + } + jeb->last_node = ref; + + if (ic) { + ref->next_in_ino = ic->nodes; + ic->nodes = ref; + } else { + ref->next_in_ino = NULL; + } + + switch(ref_flags(ref)) { + case REF_UNCHECKED: + c->unchecked_size += len; + jeb->unchecked_size += len; + break; + + case REF_NORMAL: + case REF_PRISTINE: + c->used_size += len; + jeb->used_size += len; + break; + + case REF_OBSOLETE: + c->dirty_size += len; + jeb->dirty_size += len; + break; + } + c->free_size -= len; + jeb->free_size -= len; + +#ifdef TEST_TOTLEN + /* Set (and test) __totlen field... for now */ + ref->__totlen = len; + ref_totlen(c, jeb, ref); +#endif + return ref; +} + +/* No locking, no reservation of 'ref'. Do not use on a live file system */ +int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + uint32_t size) +{ + if (!size) + return 0; + if (unlikely(size > jeb->free_size)) { + printk(KERN_CRIT "Dirty space 0x%x larger than free_size 0x%x (wasted 0x%x)\n", + size, jeb->free_size, jeb->wasted_size); + BUG(); + } + /* REF_EMPTY_NODE is !obsolete, so that works OK */ + if (jeb->last_node && ref_obsolete(jeb->last_node)) { +#ifdef TEST_TOTLEN + jeb->last_node->__totlen += size; +#endif + c->dirty_size += size; + c->free_size -= size; + jeb->dirty_size += size; + jeb->free_size -= size; + } else { + uint32_t ofs = jeb->offset + c->sector_size - jeb->free_size; + ofs |= REF_OBSOLETE; + + jffs2_link_node_ref(c, jeb, ofs, size, NULL); + } + + return 0; +} + +/* Calculate totlen from surrounding nodes or eraseblock */ +static inline uint32_t __ref_totlen(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *ref) +{ + uint32_t ref_end; + struct jffs2_raw_node_ref *next_ref = ref_next(ref); + + if (next_ref) + ref_end = ref_offset(next_ref); + else { + if (!jeb) + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + + /* Last node in block.
Use free_space */ + if (unlikely(ref != jeb->last_node)) { + printk(KERN_CRIT "ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n", + ref, ref_offset(ref), jeb->last_node, jeb->last_node?ref_offset(jeb->last_node):0); + BUG(); + } + ref_end = jeb->offset + c->sector_size - jeb->free_size; + } + return ref_end - ref_offset(ref); +} + +uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *ref) +{ + uint32_t ret; + + ret = __ref_totlen(c, jeb, ref); + +#ifdef TEST_TOTLEN + if (unlikely(ret != ref->__totlen)) { + if (!jeb) + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + + printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n", + ref, ref_offset(ref), ref_offset(ref)+ref->__totlen, + ret, ref->__totlen); + if (ref_next(ref)) { + printk(KERN_CRIT "next %p (0x%08x-0x%08x)\n", ref_next(ref), ref_offset(ref_next(ref)), + ref_offset(ref_next(ref))+ref->__totlen); + } else + printk(KERN_CRIT "No next ref. jeb->last_node is %p\n", jeb->last_node); + + printk(KERN_CRIT "jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", jeb->wasted_size, jeb->dirty_size, jeb->used_size, jeb->free_size); + +#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS) + __jffs2_dbg_dump_node_refs_nolock(c, jeb); +#endif + + WARN_ON(1); + + ret = ref->__totlen; + } +#endif /* TEST_TOTLEN */ + return ret; +} diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h new file mode 100644 index 00000000..e4619b00 --- /dev/null +++ b/fs/jffs2/nodelist.h @@ -0,0 +1,480 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef __JFFS2_NODELIST_H__ +#define __JFFS2_NODELIST_H__ + +#include +#include +#include +#include "jffs2_fs_sb.h" +#include "jffs2_fs_i.h" +#include "xattr.h" +#include "acl.h" +#include "summary.h" + +#ifdef __ECOS +#include "os-ecos.h" +#else +#include "os-linux.h" +#endif + +#define JFFS2_NATIVE_ENDIAN + +/* Note we handle mode bits conversion from JFFS2 (i.e. Linux) to/from + whatever OS we're actually running on here too. 
*/ + +#if defined(JFFS2_NATIVE_ENDIAN) +#define cpu_to_je16(x) ((jint16_t){x}) +#define cpu_to_je32(x) ((jint32_t){x}) +#define cpu_to_jemode(x) ((jmode_t){os_to_jffs2_mode(x)}) + +#define constant_cpu_to_je16(x) ((jint16_t){x}) +#define constant_cpu_to_je32(x) ((jint32_t){x}) + +#define je16_to_cpu(x) ((x).v16) +#define je32_to_cpu(x) ((x).v32) +#define jemode_to_cpu(x) (jffs2_to_os_mode((x).m)) +#elif defined(JFFS2_BIG_ENDIAN) +#define cpu_to_je16(x) ((jint16_t){cpu_to_be16(x)}) +#define cpu_to_je32(x) ((jint32_t){cpu_to_be32(x)}) +#define cpu_to_jemode(x) ((jmode_t){cpu_to_be32(os_to_jffs2_mode(x))}) + +#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_be16(x)}) +#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_be32(x)}) + +#define je16_to_cpu(x) (be16_to_cpu(x.v16)) +#define je32_to_cpu(x) (be32_to_cpu(x.v32)) +#define jemode_to_cpu(x) (be32_to_cpu(jffs2_to_os_mode((x).m))) +#elif defined(JFFS2_LITTLE_ENDIAN) +#define cpu_to_je16(x) ((jint16_t){cpu_to_le16(x)}) +#define cpu_to_je32(x) ((jint32_t){cpu_to_le32(x)}) +#define cpu_to_jemode(x) ((jmode_t){cpu_to_le32(os_to_jffs2_mode(x))}) + +#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_le16(x)}) +#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_le32(x)}) + +#define je16_to_cpu(x) (le16_to_cpu(x.v16)) +#define je32_to_cpu(x) (le32_to_cpu(x.v32)) +#define jemode_to_cpu(x) (le32_to_cpu(jffs2_to_os_mode((x).m))) +#else +#error wibble +#endif + +/* The minimal node header size */ +#define JFFS2_MIN_NODE_HEADER sizeof(struct jffs2_raw_dirent) + +/* + This is all we need to keep in-core for each raw node during normal + operation. As and when we do read_inode on a particular inode, we can + scan the nodes which are listed for it and build up a proper map of + which nodes are currently valid. JFFSv1 always used to keep that whole + map in core for each inode. +*/ +struct jffs2_raw_node_ref +{ + struct jffs2_raw_node_ref *next_in_ino; /* Points to the next raw_node_ref + for this object. If this _is_ the last, it points to the inode_cache, + xattr_ref or xattr_datum instead. The common part of those structures + has NULL in the first word. See jffs2_raw_ref_to_ic() below */ + uint32_t flash_offset; +#undef TEST_TOTLEN +#ifdef TEST_TOTLEN + uint32_t __totlen; /* This may die; use ref_totlen(c, jeb, ) below */ +#endif +}; + +#define REF_LINK_NODE ((int32_t)-1) +#define REF_EMPTY_NODE ((int32_t)-2) + +/* Use blocks of about 256 bytes */ +#define REFS_PER_BLOCK ((255/sizeof(struct jffs2_raw_node_ref))-1) + +static inline struct jffs2_raw_node_ref *ref_next(struct jffs2_raw_node_ref *ref) +{ + ref++; + + /* Link to another block of refs */ + if (ref->flash_offset == REF_LINK_NODE) { + ref = ref->next_in_ino; + if (!ref) + return ref; + } + + /* End of chain */ + if (ref->flash_offset == REF_EMPTY_NODE) + return NULL; + + return ref; +} + +static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw) +{ + while(raw->next_in_ino) + raw = raw->next_in_ino; + + /* NB. This can be a jffs2_xattr_datum or jffs2_xattr_ref and + not actually a jffs2_inode_cache. Check ->class */ + return ((struct jffs2_inode_cache *)raw); +} + + /* flash_offset & 3 always has to be zero, because nodes are + always aligned at 4 bytes. 
So we have a couple of extra bits + to play with, which indicate the node's status; see below: */ +#define REF_UNCHECKED 0 /* We haven't yet checked the CRC or built its inode */ +#define REF_OBSOLETE 1 /* Obsolete, can be completely ignored */ +#define REF_PRISTINE 2 /* Completely clean. GC without looking */ +#define REF_NORMAL 3 /* Possibly overlapped. Read the page and write again on GC */ +#define ref_flags(ref) ((ref)->flash_offset & 3) +#define ref_offset(ref) ((ref)->flash_offset & ~3) +#define ref_obsolete(ref) (((ref)->flash_offset & 3) == REF_OBSOLETE) +#define mark_ref_normal(ref) do { (ref)->flash_offset = ref_offset(ref) | REF_NORMAL; } while(0) + +/* Dirent nodes should be REF_PRISTINE only if they are not a deletion + dirent. Deletion dirents should be REF_NORMAL so that GC gets to + throw them away when appropriate */ +#define dirent_node_state(rd) ( (je32_to_cpu((rd)->ino)?REF_PRISTINE:REF_NORMAL) ) + +/* NB: REF_PRISTINE for an inode-less node (ref->next_in_ino == NULL) indicates + it is an unknown node of type JFFS2_NODETYPE_RWCOMPAT_COPY, so it'll get + copied. If you need to do anything different to GC inode-less nodes, then + you need to modify gc.c accordingly. */ + +/* For each inode in the filesystem, we need to keep a record of + nlink, because it would be a PITA to scan the whole directory tree + at read_inode() time to calculate it, and to keep sufficient information + in the raw_node_ref (basically both parent and child inode number for + dirent nodes) would take more space than this does. We also keep + a pointer to the first physical node which is part of this inode, too. +*/ +struct jffs2_inode_cache { + /* First part of structure is shared with other objects which + can terminate the raw node refs' next_in_ino list -- which + currently struct jffs2_xattr_datum and struct jffs2_xattr_ref. */ + + struct jffs2_full_dirent *scan_dents; /* Used during scan to hold + temporary lists of dirents, and later must be set to + NULL to mark the end of the raw_node_ref->next_in_ino + chain. */ + struct jffs2_raw_node_ref *nodes; + uint8_t class; /* It's used for identification */ + + /* end of shared structure */ + + uint8_t flags; + uint16_t state; + uint32_t ino; + struct jffs2_inode_cache *next; +#ifdef CONFIG_JFFS2_FS_XATTR + struct jffs2_xattr_ref *xref; +#endif + uint32_t pino_nlink; /* Directories store parent inode + here; other inodes store nlink. + Zero always means that it's + completely unlinked. */ +}; + +/* Inode states for 'state' above. We need the 'GC' state to prevent + someone from doing a read_inode() while we're moving a 'REF_PRISTINE' + node without going through all the iget() nonsense */ +#define INO_STATE_UNCHECKED 0 /* CRC checks not yet done */ +#define INO_STATE_CHECKING 1 /* CRC checks in progress */ +#define INO_STATE_PRESENT 2 /* In core */ +#define INO_STATE_CHECKEDABSENT 3 /* Checked, cleared again */ +#define INO_STATE_GC 4 /* GCing a 'pristine' node */ +#define INO_STATE_READING 5 /* In read_inode() */ +#define INO_STATE_CLEARING 6 /* In clear_inode() */ + +#define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */ + +#define RAWNODE_CLASS_INODE_CACHE 0 +#define RAWNODE_CLASS_XATTR_DATUM 1 +#define RAWNODE_CLASS_XATTR_REF 2 + +#define INOCACHE_HASHSIZE_MIN 128 +#define INOCACHE_HASHSIZE_MAX 1024 + +#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size) + +/* + Larger representation of a raw node, kept in-core only when the + struct inode for this particular ino is instantiated. 
+*/ + +struct jffs2_full_dnode +{ + struct jffs2_raw_node_ref *raw; + uint32_t ofs; /* The offset to which the data of this node belongs */ + uint32_t size; + uint32_t frags; /* Number of fragments which currently refer + to this node. When this reaches zero, + the node is obsolete. */ +}; + +/* + Even larger representation of a raw node, kept in-core only while + we're actually building up the original map of which nodes go where, + in read_inode() +*/ +struct jffs2_tmp_dnode_info +{ + struct rb_node rb; + struct jffs2_full_dnode *fn; + uint32_t version; + uint32_t data_crc; + uint32_t partial_crc; + uint16_t csize; + uint16_t overlapped; +}; + +/* Temporary data structure used during readinode. */ +struct jffs2_readinode_info +{ + struct rb_root tn_root; + struct jffs2_tmp_dnode_info *mdata_tn; + uint32_t highest_version; + uint32_t latest_mctime; + uint32_t mctime_ver; + struct jffs2_full_dirent *fds; + struct jffs2_raw_node_ref *latest_ref; +}; + +struct jffs2_full_dirent +{ + struct jffs2_raw_node_ref *raw; + struct jffs2_full_dirent *next; + uint32_t version; + uint32_t ino; /* == zero for unlink */ + unsigned int nhash; + unsigned char type; + unsigned char name[0]; +}; + +/* + Fragments - used to build a map of which raw node to obtain + data from for each part of the ino +*/ +struct jffs2_node_frag +{ + struct rb_node rb; + struct jffs2_full_dnode *node; /* NULL for holes */ + uint32_t size; + uint32_t ofs; /* The offset to which this fragment belongs */ +}; + +struct jffs2_eraseblock +{ + struct list_head list; + int bad_count; + uint32_t offset; /* of this block in the MTD */ + + uint32_t unchecked_size; + uint32_t used_size; + uint32_t dirty_size; + uint32_t wasted_size; + uint32_t free_size; /* Note that sector_size - free_size + is the address of the first free space */ + uint32_t allocated_refs; + struct jffs2_raw_node_ref *first_node; + struct jffs2_raw_node_ref *last_node; + + struct jffs2_raw_node_ref *gc_node; /* Next node to be garbage collected */ +}; + +static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c) +{ + return ((c->flash_size / c->sector_size) * sizeof (struct jffs2_eraseblock)) > (128 * 1024); +} + +#define ref_totlen(a, b, c) __jffs2_ref_totlen((a), (b), (c)) + +#define ALLOC_NORMAL 0 /* Normal allocation */ +#define ALLOC_DELETION 1 /* Deletion node. Best to allow it */ +#define ALLOC_GC 2 /* Space requested for GC. 
Give it or die */ +#define ALLOC_NORETRY 3 /* For jffs2_write_dnode: On failure, return -EAGAIN instead of retrying */ + +/* How much dirty space before it goes on the very_dirty_list */ +#define VERYDIRTY(c, size) ((size) >= ((c)->sector_size / 2)) + +/* check if dirty space is more than 255 Byte */ +#define ISDIRTY(size) ((size) > sizeof (struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN) + +#define PAD(x) (((x)+3)&~3) + +static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) +{ + if (old_valid_dev(rdev)) { + jdev->old_id = cpu_to_je16(old_encode_dev(rdev)); + return sizeof(jdev->old_id); + } else { + jdev->new_id = cpu_to_je32(new_encode_dev(rdev)); + return sizeof(jdev->new_id); + } +} + +static inline struct jffs2_node_frag *frag_first(struct rb_root *root) +{ + struct rb_node *node = rb_first(root); + + if (!node) + return NULL; + + return rb_entry(node, struct jffs2_node_frag, rb); +} + +static inline struct jffs2_node_frag *frag_last(struct rb_root *root) +{ + struct rb_node *node = rb_last(root); + + if (!node) + return NULL; + + return rb_entry(node, struct jffs2_node_frag, rb); +} + +#define frag_next(frag) rb_entry(rb_next(&(frag)->rb), struct jffs2_node_frag, rb) +#define frag_prev(frag) rb_entry(rb_prev(&(frag)->rb), struct jffs2_node_frag, rb) +#define frag_parent(frag) rb_entry(rb_parent(&(frag)->rb), struct jffs2_node_frag, rb) +#define frag_left(frag) rb_entry((frag)->rb.rb_left, struct jffs2_node_frag, rb) +#define frag_right(frag) rb_entry((frag)->rb.rb_right, struct jffs2_node_frag, rb) +#define frag_erase(frag, list) rb_erase(&frag->rb, list); + +#define tn_next(tn) rb_entry(rb_next(&(tn)->rb), struct jffs2_tmp_dnode_info, rb) +#define tn_prev(tn) rb_entry(rb_prev(&(tn)->rb), struct jffs2_tmp_dnode_info, rb) +#define tn_parent(tn) rb_entry(rb_parent(&(tn)->rb), struct jffs2_tmp_dnode_info, rb) +#define tn_left(tn) rb_entry((tn)->rb.rb_left, struct jffs2_tmp_dnode_info, rb) +#define tn_right(tn) rb_entry((tn)->rb.rb_right, struct jffs2_tmp_dnode_info, rb) +#define tn_erase(tn, list) rb_erase(&tn->rb, list); +#define tn_last(list) rb_entry(rb_last(list), struct jffs2_tmp_dnode_info, rb) +#define tn_first(list) rb_entry(rb_first(list), struct jffs2_tmp_dnode_info, rb) + +/* nodelist.c */ +void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list); +void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state); +struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t ino); +void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new); +void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old); +void jffs2_free_ino_caches(struct jffs2_sb_info *c); +void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); +struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); +void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); +int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); +uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); +struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic); +extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + struct jffs2_raw_node_ref *ref); + +/* nodemgmt.c */ +int 
jffs2_thread_should_wake(struct jffs2_sb_info *c); +int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, int prio, uint32_t sumsize); +int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize); +struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic); +void jffs2_complete_reservation(struct jffs2_sb_info *c); +void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *raw); + +/* write.c */ +int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t mode, struct jffs2_raw_inode *ri); + +struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, const unsigned char *data, + uint32_t datalen, int alloc_mode); +struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_dirent *rd, const unsigned char *name, + uint32_t namelen, int alloc_mode); +int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, unsigned char *buf, + uint32_t offset, uint32_t writelen, uint32_t *retlen); +int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, const struct qstr *qstr); +int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, + int namelen, struct jffs2_inode_info *dead_f, uint32_t time); +int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, + uint8_t type, const char *name, int namelen, uint32_t time); + + +/* readinode.c */ +int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t ino, struct jffs2_raw_inode *latest_node); +int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); +void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); + +/* malloc.c */ +int jffs2_create_slab_caches(void); +void jffs2_destroy_slab_caches(void); + +struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize); +void jffs2_free_full_dirent(struct jffs2_full_dirent *); +struct jffs2_full_dnode *jffs2_alloc_full_dnode(void); +void jffs2_free_full_dnode(struct jffs2_full_dnode *); +struct jffs2_raw_dirent *jffs2_alloc_raw_dirent(void); +void jffs2_free_raw_dirent(struct jffs2_raw_dirent *); +struct jffs2_raw_inode *jffs2_alloc_raw_inode(void); +void jffs2_free_raw_inode(struct jffs2_raw_inode *); +struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void); +void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *); +int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, int nr); +void jffs2_free_refblock(struct jffs2_raw_node_ref *); +struct jffs2_node_frag *jffs2_alloc_node_frag(void); +void jffs2_free_node_frag(struct jffs2_node_frag *); +struct jffs2_inode_cache *jffs2_alloc_inode_cache(void); +void jffs2_free_inode_cache(struct jffs2_inode_cache *); +#ifdef CONFIG_JFFS2_FS_XATTR +struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void); +void jffs2_free_xattr_datum(struct jffs2_xattr_datum *); +struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void); +void jffs2_free_xattr_ref(struct jffs2_xattr_ref *); +#endif + +/* gc.c */ +int jffs2_garbage_collect_pass(struct jffs2_sb_info *c); + +/* read.c */ +int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info 
*f, + struct jffs2_full_dnode *fd, unsigned char *buf, + int ofs, int len); +int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *buf, uint32_t offset, uint32_t len); +char *jffs2_getlink(struct jffs2_sb_info *c, struct jffs2_inode_info *f); + +/* scan.c */ +int jffs2_scan_medium(struct jffs2_sb_info *c); +void jffs2_rotate_lists(struct jffs2_sb_info *c); +struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino); +int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t size); + +/* build.c */ +int jffs2_do_mount_fs(struct jffs2_sb_info *c); + +/* erase.c */ +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); +void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER +/* wbuf.c */ +int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino); +int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c); +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +#endif + +#include "debug.h" + +#endif /* __JFFS2_NODELIST_H__ */ diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c new file mode 100644 index 00000000..694aa5b0 --- /dev/null +++ b/fs/jffs2/nodemgmt.c @@ -0,0 +1,787 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include /* For cond_resched() */ +#include "nodelist.h" +#include "debug.h" + +/** + * jffs2_reserve_space - request physical space to write nodes to flash + * @c: superblock info + * @minsize: Minimum acceptable size of allocation + * @len: Returned value of allocation length + * @prio: Allocation type - ALLOC_{NORMAL,DELETION} + * + * Requests a block of physical space on the flash. Returns zero for success + * and puts 'len' into the appropriate place, or returns -ENOSPC or other + * error if appropriate. Doesn't return len since that's + * + * If it returns zero, jffs2_reserve_space() also downs the per-filesystem + * allocation semaphore, to prevent more than one allocation from being + * active at any time. The semaphore is later released by jffs2_commit_allocation() + * + * jffs2_reserve_space() may trigger garbage collection in order to make room + * for the requested allocation. + */ + +static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize); + +int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, int prio, uint32_t sumsize) +{ + int ret = -EAGAIN; + int blocksneeded = c->resv_blocks_write; + /* align it */ + minsize = PAD(minsize); + + D1(printk(KERN_DEBUG "jffs2_reserve_space(): Requested 0x%x bytes\n", minsize)); + mutex_lock(&c->alloc_sem); + + D1(printk(KERN_DEBUG "jffs2_reserve_space(): alloc sem got\n")); + + spin_lock(&c->erase_completion_lock); + + /* this needs a little more thought (true :)) */ + while(ret == -EAGAIN) { + while(c->nr_free_blocks + c->nr_erasing_blocks < blocksneeded) { + uint32_t dirty, avail; + + /* calculate real dirty size + * dirty_size contains blocks on erase_pending_list + * those blocks are counted in c->nr_erasing_blocks. 
+ * If one block is actually erased, it is not longer counted as dirty_space + * but it is counted in c->nr_erasing_blocks, so we add it and subtract it + * with c->nr_erasing_blocks * c->sector_size again. + * Blocks on erasable_list are counted as dirty_size, but not in c->nr_erasing_blocks + * This helps us to force gc and pick eventually a clean block to spread the load. + * We add unchecked_size here, as we hopefully will find some space to use. + * This will affect the sum only once, as gc first finishes checking + * of nodes. + */ + dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size + c->unchecked_size; + if (dirty < c->nospc_dirty_size) { + if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { + D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on dirty space to GC, but it's a deletion. Allowing...\n")); + break; + } + D1(printk(KERN_DEBUG "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n", + dirty, c->unchecked_size, c->sector_size)); + + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + } + + /* Calc possibly available space. Possibly available means that we + * don't know, if unchecked size contains obsoleted nodes, which could give us some + * more usable space. This will affect the sum only once, as gc first finishes checking + * of nodes. + + Return -ENOSPC, if the maximum possibly available space is less or equal than + * blocksneeded * sector_size. + * This blocks endless gc looping on a filesystem, which is nearly full, even if + * the check above passes. + */ + avail = c->free_size + c->dirty_size + c->erasing_size + c->unchecked_size; + if ( (avail / c->sector_size) <= blocksneeded) { + if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { + D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on possibly available space, but it's a deletion. Allowing...\n")); + break; + } + + D1(printk(KERN_DEBUG "max. available size 0x%08x < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n", + avail, blocksneeded * c->sector_size)); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -ENOSPC; + } + + mutex_unlock(&c->alloc_sem); + + D1(printk(KERN_DEBUG "Triggering GC pass. 
nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n", + c->nr_free_blocks, c->nr_erasing_blocks, c->free_size, c->dirty_size, c->wasted_size, c->used_size, c->erasing_size, c->bad_size, + c->free_size + c->dirty_size + c->wasted_size + c->used_size + c->erasing_size + c->bad_size, c->flash_size)); + spin_unlock(&c->erase_completion_lock); + + ret = jffs2_garbage_collect_pass(c); + + if (ret == -EAGAIN) { + spin_lock(&c->erase_completion_lock); + if (c->nr_erasing_blocks && + list_empty(&c->erase_pending_list) && + list_empty(&c->erase_complete_list)) { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&c->erase_wait, &wait); + D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__)); + spin_unlock(&c->erase_completion_lock); + + schedule(); + } else + spin_unlock(&c->erase_completion_lock); + } else if (ret) + return ret; + + cond_resched(); + + if (signal_pending(current)) + return -EINTR; + + mutex_lock(&c->alloc_sem); + spin_lock(&c->erase_completion_lock); + } + + ret = jffs2_do_reserve_space(c, minsize, len, sumsize); + if (ret) { + D1(printk(KERN_DEBUG "jffs2_reserve_space: ret is %d\n", ret)); + } + } + spin_unlock(&c->erase_completion_lock); + if (!ret) + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + if (ret) + mutex_unlock(&c->alloc_sem); + return ret; +} + +int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize) +{ + int ret = -EAGAIN; + minsize = PAD(minsize); + + D1(printk(KERN_DEBUG "jffs2_reserve_space_gc(): Requested 0x%x bytes\n", minsize)); + + spin_lock(&c->erase_completion_lock); + while(ret == -EAGAIN) { + ret = jffs2_do_reserve_space(c, minsize, len, sumsize); + if (ret) { + D1(printk(KERN_DEBUG "jffs2_reserve_space_gc: looping, ret is %d\n", ret)); + } + } + spin_unlock(&c->erase_completion_lock); + if (!ret) + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + + return ret; +} + + +/* Classify nextblock (clean, dirty of verydirty) and force to select an other one */ + +static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + + if (c->nextblock == NULL) { + D1(printk(KERN_DEBUG "jffs2_close_nextblock: Erase block at 0x%08x has already been placed in a list\n", + jeb->offset)); + return; + } + /* Check, if we have a dirty block now, or if it was dirty already */ + if (ISDIRTY (jeb->wasted_size + jeb->dirty_size)) { + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->dirty_size += jeb->wasted_size; + jeb->wasted_size = 0; + if (VERYDIRTY(c, jeb->dirty_size)) { + D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to very_dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + list_add_tail(&jeb->list, &c->very_dirty_list); + } else { + D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + list_add_tail(&jeb->list, &c->dirty_list); + } + } else { + D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + list_add_tail(&jeb->list, &c->clean_list); + } + c->nextblock = NULL; + +} + +/* Select a new jeb for nextblock */ + +static int 
jffs2_find_nextblock(struct jffs2_sb_info *c) +{ + struct list_head *next; + + /* Take the next block off the 'free' list */ + + if (list_empty(&c->free_list)) { + + if (!c->nr_erasing_blocks && + !list_empty(&c->erasable_list)) { + struct jffs2_eraseblock *ejeb; + + ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); + list_move_tail(&ejeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", + ejeb->offset)); + } + + if (!c->nr_erasing_blocks && + !list_empty(&c->erasable_pending_wbuf_list)) { + D1(printk(KERN_DEBUG "jffs2_find_nextblock: Flushing write buffer\n")); + /* c->nextblock is NULL, no update to c->nextblock allowed */ + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + /* Have another go. It'll be on the erasable_list now */ + return -EAGAIN; + } + + if (!c->nr_erasing_blocks) { + /* Ouch. We're in GC, or we wouldn't have got here. + And there's no space left. At all. */ + printk(KERN_CRIT "Argh. No free space left for GC. nr_erasing_blocks is %d. nr_free_blocks is %d. (erasableempty: %s, erasingempty: %s, erasependingempty: %s)\n", + c->nr_erasing_blocks, c->nr_free_blocks, list_empty(&c->erasable_list)?"yes":"no", + list_empty(&c->erasing_list)?"yes":"no", list_empty(&c->erase_pending_list)?"yes":"no"); + return -ENOSPC; + } + + spin_unlock(&c->erase_completion_lock); + /* Don't wait for it; just erase one right now */ + jffs2_erase_pending_blocks(c, 1); + spin_lock(&c->erase_completion_lock); + + /* An erase may have failed, decreasing the + amount of free space available. So we must + restart from the beginning */ + return -EAGAIN; + } + + next = c->free_list.next; + list_del(next); + c->nextblock = list_entry(next, struct jffs2_eraseblock, list); + c->nr_free_blocks--; + + jffs2_sum_reset_collected(c->summary); /* reset collected summary */ + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + /* adjust write buffer offset, else we get a non contiguous write bug */ + if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len) + c->wbuf_ofs = 0xffffffff; +#endif + + D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset)); + + return 0; +} + +/* Called with alloc sem _and_ erase_completion_lock */ +static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, + uint32_t *len, uint32_t sumsize) +{ + struct jffs2_eraseblock *jeb = c->nextblock; + uint32_t reserved_size; /* for summary information at the end of the jeb */ + int ret; + + restart: + reserved_size = 0; + + if (jffs2_sum_active() && (sumsize != JFFS2_SUMMARY_NOSUM_SIZE)) { + /* NOSUM_SIZE means not to generate summary */ + + if (jeb) { + reserved_size = PAD(sumsize + c->summary->sum_size + JFFS2_SUMMARY_FRAME_SIZE); + dbg_summary("minsize=%d , jeb->free=%d ," + "summary->size=%d , sumsize=%d\n", + minsize, jeb->free_size, + c->summary->sum_size, sumsize); + } + + /* Is there enough space for writing out the current node, or we have to + write out summary information now, close this jeb and select new nextblock? */ + if (jeb && (PAD(minsize) + PAD(c->summary->sum_size + sumsize + + JFFS2_SUMMARY_FRAME_SIZE) > jeb->free_size)) { + + /* Has summary been disabled for this jeb? 
*/ + if (jffs2_sum_is_disabled(c->summary)) { + sumsize = JFFS2_SUMMARY_NOSUM_SIZE; + goto restart; + } + + /* Writing out the collected summary information */ + dbg_summary("generating summary for 0x%08x.\n", jeb->offset); + ret = jffs2_sum_write_sumnode(c); + + if (ret) + return ret; + + if (jffs2_sum_is_disabled(c->summary)) { + /* jffs2_write_sumnode() couldn't write out the summary information + diabling summary for this jeb and free the collected information + */ + sumsize = JFFS2_SUMMARY_NOSUM_SIZE; + goto restart; + } + + jffs2_close_nextblock(c, jeb); + jeb = NULL; + /* keep always valid value in reserved_size */ + reserved_size = PAD(sumsize + c->summary->sum_size + JFFS2_SUMMARY_FRAME_SIZE); + } + } else { + if (jeb && minsize > jeb->free_size) { + uint32_t waste; + + /* Skip the end of this block and file it as having some dirty space */ + /* If there's a pending write to it, flush now */ + + if (jffs2_wbuf_dirty(c)) { + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "jffs2_do_reserve_space: Flushing write buffer\n")); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + jeb = c->nextblock; + goto restart; + } + + spin_unlock(&c->erase_completion_lock); + + ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); + if (ret) + return ret; + /* Just lock it again and continue. Nothing much can change because + we hold c->alloc_sem anyway. In fact, it's not entirely clear why + we hold c->erase_completion_lock in the majority of this function... + but that's a question for another (more caffeine-rich) day. */ + spin_lock(&c->erase_completion_lock); + + waste = jeb->free_size; + jffs2_link_node_ref(c, jeb, + (jeb->offset + c->sector_size - waste) | REF_OBSOLETE, + waste, NULL); + /* FIXME: that made it count as dirty. Convert to wasted */ + jeb->dirty_size -= waste; + c->dirty_size -= waste; + jeb->wasted_size += waste; + c->wasted_size += waste; + + jffs2_close_nextblock(c, jeb); + jeb = NULL; + } + } + + if (!jeb) { + + ret = jffs2_find_nextblock(c); + if (ret) + return ret; + + jeb = c->nextblock; + + if (jeb->free_size != c->sector_size - c->cleanmarker_size) { + printk(KERN_WARNING "Eep. Block 0x%08x taken from free_list had free_size of 0x%08x!!\n", jeb->offset, jeb->free_size); + goto restart; + } + } + /* OK, jeb (==c->nextblock) is now pointing at a block which definitely has + enough space */ + *len = jeb->free_size - reserved_size; + + if (c->cleanmarker_size && jeb->used_size == c->cleanmarker_size && + !jeb->first_node->next_in_ino) { + /* Only node in it beforehand was a CLEANMARKER node (we think). + So mark it obsolete now that there's going to be another node + in the block. This will reduce used_size to zero but We've + already set c->nextblock so that jffs2_mark_node_obsolete() + won't try to refile it to the dirty_list. + */ + spin_unlock(&c->erase_completion_lock); + jffs2_mark_node_obsolete(c, jeb->first_node); + spin_lock(&c->erase_completion_lock); + } + + D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n", + *len, jeb->offset + (c->sector_size - jeb->free_size))); + return 0; +} + +/** + * jffs2_add_physical_node_ref - add a physical node reference to the list + * @c: superblock info + * @new: new node reference to add + * @len: length of this physical node + * + * Should only be used to report nodes for which space has been allocated + * by jffs2_reserve_space. + * + * Must be called with the alloc_sem held. 
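+ *
+ * A rough sketch of the usual calling sequence (variable names such as
+ * 'buf', 'totlen', 'alloclen' and 'sumsize' are illustrative placeholders,
+ * not definitions from this file; retlen checking is omitted). 'sumsize'
+ * is the summary-size hint also accepted by jffs2_reserve_space(), e.g.
+ * JFFS2_SUMMARY_NOSUM_SIZE to disable summary collection:
+ *
+ *	ret = jffs2_reserve_space(c, totlen, &alloclen, ALLOC_NORMAL, sumsize);
+ *	if (ret)
+ *		return ret;
+ *	ofs = write_ofs(c);
+ *	ret = jffs2_flash_write(c, ofs, totlen, &retlen, buf);
+ *	if (!ret)
+ *		ref = jffs2_add_physical_node_ref(c, ofs | REF_NORMAL,
+ *						  PAD(totlen), ic);
+ *	jffs2_complete_reservation(c);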
+ */ + +struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic) +{ + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *new; + + jeb = &c->blocks[ofs / c->sector_size]; + + D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n", + ofs & ~3, ofs & 3, len)); +#if 1 + /* Allow non-obsolete nodes only to be added at the end of c->nextblock, + if c->nextblock is set. Note that wbuf.c will file obsolete nodes + even after refiling c->nextblock */ + if ((c->nextblock || ((ofs & 3) != REF_OBSOLETE)) + && (jeb != c->nextblock || (ofs & ~3) != jeb->offset + (c->sector_size - jeb->free_size))) { + printk(KERN_WARNING "argh. node added in wrong place at 0x%08x(%d)\n", ofs & ~3, ofs & 3); + if (c->nextblock) + printk(KERN_WARNING "nextblock 0x%08x", c->nextblock->offset); + else + printk(KERN_WARNING "No nextblock"); + printk(", expected at %08x\n", jeb->offset + (c->sector_size - jeb->free_size)); + return ERR_PTR(-EINVAL); + } +#endif + spin_lock(&c->erase_completion_lock); + + new = jffs2_link_node_ref(c, jeb, ofs, len, ic); + + if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) { + /* If it lives on the dirty_list, jffs2_reserve_space will put it there */ + D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + if (jffs2_wbuf_dirty(c)) { + /* Flush the last write in the block if it's outstanding */ + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + } + + list_add_tail(&jeb->list, &c->clean_list); + c->nextblock = NULL; + } + jffs2_dbg_acct_sanity_check_nolock(c,jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + spin_unlock(&c->erase_completion_lock); + + return new; +} + + +void jffs2_complete_reservation(struct jffs2_sb_info *c) +{ + D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); +} + +static inline int on_list(struct list_head *obj, struct list_head *head) +{ + struct list_head *this; + + list_for_each(this, head) { + if (this == obj) { + D1(printk("%p is on list at %p\n", obj, head)); + return 1; + + } + } + return 0; +} + +void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref) +{ + struct jffs2_eraseblock *jeb; + int blocknr; + struct jffs2_unknown_node n; + int ret, addedsize; + size_t retlen; + uint32_t freed_len; + + if(unlikely(!ref)) { + printk(KERN_NOTICE "EEEEEK. jffs2_mark_node_obsolete called with NULL node\n"); + return; + } + if (ref_obsolete(ref)) { + D1(printk(KERN_DEBUG "jffs2_mark_node_obsolete called with already obsolete node at 0x%08x\n", ref_offset(ref))); + return; + } + blocknr = ref->flash_offset / c->sector_size; + if (blocknr >= c->nr_blocks) { + printk(KERN_NOTICE "raw node at 0x%08x is off the end of device!\n", ref->flash_offset); + BUG(); + } + jeb = &c->blocks[blocknr]; + + if (jffs2_can_mark_obsolete(c) && !jffs2_is_readonly(c) && + !(c->flags & (JFFS2_SB_FLAG_SCANNING | JFFS2_SB_FLAG_BUILDING))) { + /* Hm. This may confuse static lock analysis. If any of the above + three conditions is false, we're going to return from this + function without actually obliterating any nodes or freeing + any jffs2_raw_node_refs. 
So we don't need to stop erases from + happening, or protect against people holding an obsolete + jffs2_raw_node_ref without the erase_completion_lock. */ + mutex_lock(&c->erase_free_sem); + } + + spin_lock(&c->erase_completion_lock); + + freed_len = ref_totlen(c, jeb, ref); + + if (ref_flags(ref) == REF_UNCHECKED) { + D1(if (unlikely(jeb->unchecked_size < freed_len)) { + printk(KERN_NOTICE "raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n", + freed_len, blocknr, ref->flash_offset, jeb->used_size); + BUG(); + }) + D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), freed_len)); + jeb->unchecked_size -= freed_len; + c->unchecked_size -= freed_len; + } else { + D1(if (unlikely(jeb->used_size < freed_len)) { + printk(KERN_NOTICE "raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n", + freed_len, blocknr, ref->flash_offset, jeb->used_size); + BUG(); + }) + D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), freed_len)); + jeb->used_size -= freed_len; + c->used_size -= freed_len; + } + + // Take care, that wasted size is taken into concern + if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) { + D1(printk("Dirtying\n")); + addedsize = freed_len; + jeb->dirty_size += freed_len; + c->dirty_size += freed_len; + + /* Convert wasted space to dirty, if not a bad block */ + if (jeb->wasted_size) { + if (on_list(&jeb->list, &c->bad_used_list)) { + D1(printk(KERN_DEBUG "Leaving block at %08x on the bad_used_list\n", + jeb->offset)); + addedsize = 0; /* To fool the refiling code later */ + } else { + D1(printk(KERN_DEBUG "Converting %d bytes of wasted space to dirty in block at %08x\n", + jeb->wasted_size, jeb->offset)); + addedsize += jeb->wasted_size; + jeb->dirty_size += jeb->wasted_size; + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->wasted_size = 0; + } + } + } else { + D1(printk("Wasting\n")); + addedsize = 0; + jeb->wasted_size += freed_len; + c->wasted_size += freed_len; + } + ref->flash_offset = ref_offset(ref) | REF_OBSOLETE; + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + if (c->flags & JFFS2_SB_FLAG_SCANNING) { + /* Flash scanning is in progress. Don't muck about with the block + lists because they're not ready yet, and don't actually + obliterate nodes that look obsolete. If they weren't + marked obsolete on the flash at the time they _became_ + obsolete, there was probably a reason for that. */ + spin_unlock(&c->erase_completion_lock); + /* We didn't lock the erase_free_sem */ + return; + } + + if (jeb == c->nextblock) { + D2(printk(KERN_DEBUG "Not moving nextblock 0x%08x to dirty/erase_pending list\n", jeb->offset)); + } else if (!jeb->used_size && !jeb->unchecked_size) { + if (jeb == c->gcblock) { + D1(printk(KERN_DEBUG "gcblock at 0x%08x completely dirtied. Clearing gcblock...\n", jeb->offset)); + c->gcblock = NULL; + } else { + D1(printk(KERN_DEBUG "Eraseblock at 0x%08x completely dirtied. Removing from (dirty?) list...\n", jeb->offset)); + list_del(&jeb->list); + } + if (jffs2_wbuf_dirty(c)) { + D1(printk(KERN_DEBUG "...and adding to erasable_pending_wbuf_list\n")); + list_add_tail(&jeb->list, &c->erasable_pending_wbuf_list); + } else { + if (jiffies & 127) { + /* Most of the time, we just erase it immediately. Otherwise we + spend ages scanning it on mount, etc. 
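+					   (jiffies & 127 is non-zero 127 times
+					   out of 128, so roughly one block in
+					   128 takes the erasable_list path in
+					   the else branch below; the rest go
+					   straight onto the erase_pending_list.)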
*/ + D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); + list_add_tail(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } else { + /* Sometimes, however, we leave it elsewhere so it doesn't get + immediately reused, and we spread the load a bit. */ + D1(printk(KERN_DEBUG "...and adding to erasable_list\n")); + list_add_tail(&jeb->list, &c->erasable_list); + } + } + D1(printk(KERN_DEBUG "Done OK\n")); + } else if (jeb == c->gcblock) { + D2(printk(KERN_DEBUG "Not moving gcblock 0x%08x to dirty_list\n", jeb->offset)); + } else if (ISDIRTY(jeb->dirty_size) && !ISDIRTY(jeb->dirty_size - addedsize)) { + D1(printk(KERN_DEBUG "Eraseblock at 0x%08x is freshly dirtied. Removing from clean list...\n", jeb->offset)); + list_del(&jeb->list); + D1(printk(KERN_DEBUG "...and adding to dirty_list\n")); + list_add_tail(&jeb->list, &c->dirty_list); + } else if (VERYDIRTY(c, jeb->dirty_size) && + !VERYDIRTY(c, jeb->dirty_size - addedsize)) { + D1(printk(KERN_DEBUG "Eraseblock at 0x%08x is now very dirty. Removing from dirty list...\n", jeb->offset)); + list_del(&jeb->list); + D1(printk(KERN_DEBUG "...and adding to very_dirty_list\n")); + list_add_tail(&jeb->list, &c->very_dirty_list); + } else { + D1(printk(KERN_DEBUG "Eraseblock at 0x%08x not moved anywhere. (free 0x%08x, dirty 0x%08x, used 0x%08x)\n", + jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + } + + spin_unlock(&c->erase_completion_lock); + + if (!jffs2_can_mark_obsolete(c) || jffs2_is_readonly(c) || + (c->flags & JFFS2_SB_FLAG_BUILDING)) { + /* We didn't lock the erase_free_sem */ + return; + } + + /* The erase_free_sem is locked, and has been since before we marked the node obsolete + and potentially put its eraseblock onto the erase_pending_list. Thus, we know that + the block hasn't _already_ been erased, and that 'ref' itself hasn't been freed yet + by jffs2_free_jeb_node_refs() in erase.c. Which is nice. */ + + D1(printk(KERN_DEBUG "obliterating obsoleted node at 0x%08x\n", ref_offset(ref))); + ret = jffs2_flash_read(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n); + if (ret) { + printk(KERN_WARNING "Read error reading from obsoleted node at 0x%08x: %d\n", ref_offset(ref), ret); + goto out_erase_sem; + } + if (retlen != sizeof(n)) { + printk(KERN_WARNING "Short read from obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen); + goto out_erase_sem; + } + if (PAD(je32_to_cpu(n.totlen)) != PAD(freed_len)) { + printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), freed_len); + goto out_erase_sem; + } + if (!(je16_to_cpu(n.nodetype) & JFFS2_NODE_ACCURATE)) { + D1(printk(KERN_DEBUG "Node at 0x%08x was already marked obsolete (nodetype 0x%04x)\n", ref_offset(ref), je16_to_cpu(n.nodetype))); + goto out_erase_sem; + } + /* XXX FIXME: This is ugly now */ + n.nodetype = cpu_to_je16(je16_to_cpu(n.nodetype) & ~JFFS2_NODE_ACCURATE); + ret = jffs2_flash_write(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n); + if (ret) { + printk(KERN_WARNING "Write error in obliterating obsoleted node at 0x%08x: %d\n", ref_offset(ref), ret); + goto out_erase_sem; + } + if (retlen != sizeof(n)) { + printk(KERN_WARNING "Short write in obliterating obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen); + goto out_erase_sem; + } + + /* Nodes which have been marked obsolete no longer need to be + associated with any inode. Remove them from the per-inode list. 
+ + Note we can't do this for NAND at the moment because we need + obsolete dirent nodes to stay on the lists, because of the + horridness in jffs2_garbage_collect_deletion_dirent(). Also + because we delete the inocache, and on NAND we need that to + stay around until all the nodes are actually erased, in order + to stop us from giving the same inode number to another newly + created inode. */ + if (ref->next_in_ino) { + struct jffs2_inode_cache *ic; + struct jffs2_raw_node_ref **p; + + spin_lock(&c->erase_completion_lock); + + ic = jffs2_raw_ref_to_ic(ref); + for (p = &ic->nodes; (*p) != ref; p = &((*p)->next_in_ino)) + ; + + *p = ref->next_in_ino; + ref->next_in_ino = NULL; + + switch (ic->class) { +#ifdef CONFIG_JFFS2_FS_XATTR + case RAWNODE_CLASS_XATTR_DATUM: + jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic); + break; + case RAWNODE_CLASS_XATTR_REF: + jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic); + break; +#endif + default: + if (ic->nodes == (void *)ic && ic->pino_nlink == 0) + jffs2_del_ino_cache(c, ic); + break; + } + spin_unlock(&c->erase_completion_lock); + } + + out_erase_sem: + mutex_unlock(&c->erase_free_sem); +} + +int jffs2_thread_should_wake(struct jffs2_sb_info *c) +{ + int ret = 0; + uint32_t dirty; + int nr_very_dirty = 0; + struct jffs2_eraseblock *jeb; + + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) + return 1; + + if (c->unchecked_size) { + D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", + c->unchecked_size, c->checked_ino)); + return 1; + } + + /* dirty_size contains blocks on erase_pending_list + * those blocks are counted in c->nr_erasing_blocks. + * If one block is actually erased, it is not longer counted as dirty_space + * but it is counted in c->nr_erasing_blocks, so we add it and subtract it + * with c->nr_erasing_blocks * c->sector_size again. + * Blocks on erasable_list are counted as dirty_size, but not in c->nr_erasing_blocks + * This helps us to force gc and pick eventually a clean block to spread the load. + */ + dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size; + + if (c->nr_free_blocks + c->nr_erasing_blocks < c->resv_blocks_gctrigger && + (dirty > c->nospc_dirty_size)) + ret = 1; + + list_for_each_entry(jeb, &c->very_dirty_list, list) { + nr_very_dirty++; + if (nr_very_dirty == c->vdirty_blocks_gctrigger) { + ret = 1; + /* In debug mode, actually go through and count them all */ + D1(continue); + break; + } + } + + D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): nr_free_blocks %d, nr_erasing_blocks %d, dirty_size 0x%x, vdirty_blocks %d: %s\n", + c->nr_free_blocks, c->nr_erasing_blocks, c->dirty_size, nr_very_dirty, ret?"yes":"no")); + + return ret; +} diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h new file mode 100644 index 00000000..dce73c35 --- /dev/null +++ b/fs/jffs2/os-linux.h @@ -0,0 +1,204 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
+ * + */ + +#ifndef __JFFS2_OS_LINUX_H__ +#define __JFFS2_OS_LINUX_H__ + +/* JFFS2 uses Linux mode bits natively -- no need for conversion */ +#define os_to_jffs2_mode(x) (x) +#define jffs2_to_os_mode(x) (x) + +struct kstatfs; +struct kvec; + +#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode)) +#define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode) +#define JFFS2_SB_INFO(sb) (sb->s_fs_info) +#define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv) + + +#define JFFS2_F_I_SIZE(f) (OFNI_EDONI_2SFFJ(f)->i_size) +#define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode) +#define JFFS2_F_I_UID(f) (OFNI_EDONI_2SFFJ(f)->i_uid) +#define JFFS2_F_I_GID(f) (OFNI_EDONI_2SFFJ(f)->i_gid) +#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev) + +#define ITIME(sec) ((struct timespec){sec, 0}) +#define I_SEC(tv) ((tv).tv_sec) +#define JFFS2_F_I_CTIME(f) (OFNI_EDONI_2SFFJ(f)->i_ctime.tv_sec) +#define JFFS2_F_I_MTIME(f) (OFNI_EDONI_2SFFJ(f)->i_mtime.tv_sec) +#define JFFS2_F_I_ATIME(f) (OFNI_EDONI_2SFFJ(f)->i_atime.tv_sec) + +#define sleep_on_spinunlock(wq, s) \ + do { \ + DECLARE_WAITQUEUE(__wait, current); \ + add_wait_queue((wq), &__wait); \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + spin_unlock(s); \ + schedule(); \ + remove_wait_queue((wq), &__wait); \ + } while(0) + +static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) +{ + f->highest_version = 0; + f->fragtree = RB_ROOT; + f->metadata = NULL; + f->dents = NULL; + f->target = NULL; + f->flags = 0; + f->usercompr = 0; +} + + +#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY) + +#define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) ) +#ifndef CONFIG_JFFS2_FS_WRITEBUFFER + + +#ifdef CONFIG_JFFS2_SUMMARY +#define jffs2_can_mark_obsolete(c) (0) +#else +#define jffs2_can_mark_obsolete(c) (1) +#endif + +#define jffs2_is_writebuffered(c) (0) +#define jffs2_cleanmarker_oob(c) (0) +#define jffs2_write_nand_cleanmarker(c,jeb) (-EIO) + +#define jffs2_flash_write(c, ofs, len, retlen, buf) jffs2_flash_direct_write(c, ofs, len, retlen, buf) +#define jffs2_flash_read(c, ofs, len, retlen, buf) ((c)->mtd->read((c)->mtd, ofs, len, retlen, buf)) +#define jffs2_flush_wbuf_pad(c) ({ do{} while(0); (void)(c), 0; }) +#define jffs2_flush_wbuf_gc(c, i) ({ do{} while(0); (void)(c), (void) i, 0; }) +#define jffs2_write_nand_badblock(c,jeb,bad_offset) (1) +#define jffs2_nand_flash_setup(c) (0) +#define jffs2_nand_flash_cleanup(c) do {} while(0) +#define jffs2_wbuf_dirty(c) (0) +#define jffs2_flash_writev(a,b,c,d,e,f) jffs2_flash_direct_writev(a,b,c,d,e) +#define jffs2_wbuf_timeout NULL +#define jffs2_wbuf_process NULL +#define jffs2_dataflash(c) (0) +#define jffs2_dataflash_setup(c) (0) +#define jffs2_dataflash_cleanup(c) do {} while (0) +#define jffs2_nor_wbuf_flash(c) (0) +#define jffs2_nor_wbuf_flash_setup(c) (0) +#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) +#define jffs2_ubivol(c) (0) +#define jffs2_ubivol_setup(c) (0) +#define jffs2_ubivol_cleanup(c) do {} while (0) + +#else /* NAND and/or ECC'd NOR support present */ + +#define jffs2_is_writebuffered(c) (c->wbuf != NULL) + +#ifdef CONFIG_JFFS2_SUMMARY +#define jffs2_can_mark_obsolete(c) (0) +#else +#define jffs2_can_mark_obsolete(c) (c->mtd->flags & (MTD_BIT_WRITEABLE)) +#endif + +#define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH && (c->mtd->flags & MTD_OOB_WRITEABLE)) + +#define jffs2_flash_write_oob(c, ofs, len, retlen, buf) ((c)->mtd->write_oob((c)->mtd, ofs, len, retlen, buf)) +#define jffs2_flash_read_oob(c, 
ofs, len, retlen, buf) ((c)->mtd->read_oob((c)->mtd, ofs, len, retlen, buf)) +#define jffs2_wbuf_dirty(c) (!!(c)->wbuf_len) + +/* wbuf.c */ +int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen, uint32_t ino); +int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf); +int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, u_char *buf); +int jffs2_check_oob_empty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,int mode); +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); +int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset); +void jffs2_wbuf_timeout(unsigned long data); +void jffs2_wbuf_process(void *data); +int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino); +int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c); +int jffs2_nand_flash_setup(struct jffs2_sb_info *c); +void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c); + +#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) +int jffs2_dataflash_setup(struct jffs2_sb_info *c); +void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); +#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME) +int jffs2_ubivol_setup(struct jffs2_sb_info *c); +void jffs2_ubivol_cleanup(struct jffs2_sb_info *c); + +#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) +int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); +void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c); + +#endif /* WRITEBUFFER */ + +static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c) +{ + OFNI_BS_2SFFJ(c)->s_dirt = 1; +} + +/* background.c */ +int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c); +void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c); +void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c); + +/* dir.c */ +extern const struct file_operations jffs2_dir_operations; +extern const struct inode_operations jffs2_dir_inode_operations; + +/* file.c */ +extern const struct file_operations jffs2_file_operations; +extern const struct inode_operations jffs2_file_inode_operations; +extern const struct address_space_operations jffs2_file_address_operations; +int jffs2_fsync(struct file *, int); +int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); + +/* ioctl.c */ +long jffs2_ioctl(struct file *, unsigned int, unsigned long); + +/* symlink.c */ +extern const struct inode_operations jffs2_symlink_inode_operations; + +/* fs.c */ +int jffs2_setattr (struct dentry *, struct iattr *); +int jffs2_do_setattr (struct inode *, struct iattr *); +struct inode *jffs2_iget(struct super_block *, unsigned long); +void jffs2_evict_inode (struct inode *); +void jffs2_dirty_inode(struct inode *inode, int flags); +struct inode *jffs2_new_inode (struct inode *dir_i, int mode, + struct jffs2_raw_inode *ri); +int jffs2_statfs (struct dentry *, struct kstatfs *); +int jffs2_remount_fs (struct super_block *, int *, char *); +int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); +void jffs2_gc_release_inode(struct jffs2_sb_info *c, + struct jffs2_inode_info *f); +struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, + int inum, int unlinked); + +unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, + struct 
jffs2_inode_info *f, + unsigned long offset, + unsigned long *priv); +void jffs2_gc_release_page(struct jffs2_sb_info *c, + unsigned char *pg, + unsigned long *priv); +void jffs2_flash_cleanup(struct jffs2_sb_info *c); + + +/* writev.c */ +int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen); +int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, + size_t *retlen, const u_char *buf); + +#endif /* __JFFS2_OS_LINUX_H__ */ + + diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c new file mode 100644 index 00000000..3f39be1b --- /dev/null +++ b/fs/jffs2/read.c @@ -0,0 +1,216 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + +int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_full_dnode *fd, unsigned char *buf, + int ofs, int len) +{ + struct jffs2_raw_inode *ri; + size_t readlen; + uint32_t crc; + unsigned char *decomprbuf = NULL; + unsigned char *readbuf = NULL; + int ret = 0; + + ri = jffs2_alloc_raw_inode(); + if (!ri) + return -ENOMEM; + + ret = jffs2_flash_read(c, ref_offset(fd->raw), sizeof(*ri), &readlen, (char *)ri); + if (ret) { + jffs2_free_raw_inode(ri); + printk(KERN_WARNING "Error reading node from 0x%08x: %d\n", ref_offset(fd->raw), ret); + return ret; + } + if (readlen != sizeof(*ri)) { + jffs2_free_raw_inode(ri); + printk(KERN_WARNING "Short read from 0x%08x: wanted 0x%zx bytes, got 0x%zx\n", + ref_offset(fd->raw), sizeof(*ri), readlen); + return -EIO; + } + crc = crc32(0, ri, sizeof(*ri)-8); + + D1(printk(KERN_DEBUG "Node read from %08x: node_crc %08x, calculated CRC %08x. dsize %x, csize %x, offset %x, buf %p\n", + ref_offset(fd->raw), je32_to_cpu(ri->node_crc), + crc, je32_to_cpu(ri->dsize), je32_to_cpu(ri->csize), + je32_to_cpu(ri->offset), buf)); + if (crc != je32_to_cpu(ri->node_crc)) { + printk(KERN_WARNING "Node CRC %08x != calculated CRC %08x for node at %08x\n", + je32_to_cpu(ri->node_crc), crc, ref_offset(fd->raw)); + ret = -EIO; + goto out_ri; + } + /* There was a bug where we wrote hole nodes out with csize/dsize + swapped. Deal with it */ + if (ri->compr == JFFS2_COMPR_ZERO && !je32_to_cpu(ri->dsize) && + je32_to_cpu(ri->csize)) { + ri->dsize = ri->csize; + ri->csize = cpu_to_je32(0); + } + + D1(if(ofs + len > je32_to_cpu(ri->dsize)) { + printk(KERN_WARNING "jffs2_read_dnode() asked for %d bytes at %d from %d-byte node\n", + len, ofs, je32_to_cpu(ri->dsize)); + ret = -EINVAL; + goto out_ri; + }); + + + if (ri->compr == JFFS2_COMPR_ZERO) { + memset(buf, 0, len); + goto out_ri; + } + + /* Cases: + Reading whole node and it's uncompressed - read directly to buffer provided, check CRC. 
+ Reading whole node and it's compressed - read into comprbuf, check CRC and decompress to buffer provided + Reading partial node and it's uncompressed - read into readbuf, check CRC, and copy + Reading partial node and it's compressed - read into readbuf, check checksum, decompress to decomprbuf and copy + */ + if (ri->compr == JFFS2_COMPR_NONE && len == je32_to_cpu(ri->dsize)) { + readbuf = buf; + } else { + readbuf = kmalloc(je32_to_cpu(ri->csize), GFP_KERNEL); + if (!readbuf) { + ret = -ENOMEM; + goto out_ri; + } + } + if (ri->compr != JFFS2_COMPR_NONE) { + if (len < je32_to_cpu(ri->dsize)) { + decomprbuf = kmalloc(je32_to_cpu(ri->dsize), GFP_KERNEL); + if (!decomprbuf) { + ret = -ENOMEM; + goto out_readbuf; + } + } else { + decomprbuf = buf; + } + } else { + decomprbuf = readbuf; + } + + D2(printk(KERN_DEBUG "Read %d bytes to %p\n", je32_to_cpu(ri->csize), + readbuf)); + ret = jffs2_flash_read(c, (ref_offset(fd->raw)) + sizeof(*ri), + je32_to_cpu(ri->csize), &readlen, readbuf); + + if (!ret && readlen != je32_to_cpu(ri->csize)) + ret = -EIO; + if (ret) + goto out_decomprbuf; + + crc = crc32(0, readbuf, je32_to_cpu(ri->csize)); + if (crc != je32_to_cpu(ri->data_crc)) { + printk(KERN_WARNING "Data CRC %08x != calculated CRC %08x for node at %08x\n", + je32_to_cpu(ri->data_crc), crc, ref_offset(fd->raw)); + ret = -EIO; + goto out_decomprbuf; + } + D2(printk(KERN_DEBUG "Data CRC matches calculated CRC %08x\n", crc)); + if (ri->compr != JFFS2_COMPR_NONE) { + D2(printk(KERN_DEBUG "Decompress %d bytes from %p to %d bytes at %p\n", + je32_to_cpu(ri->csize), readbuf, je32_to_cpu(ri->dsize), decomprbuf)); + ret = jffs2_decompress(c, f, ri->compr | (ri->usercompr << 8), readbuf, decomprbuf, je32_to_cpu(ri->csize), je32_to_cpu(ri->dsize)); + if (ret) { + printk(KERN_WARNING "Error: jffs2_decompress returned %d\n", ret); + goto out_decomprbuf; + } + } + + if (len < je32_to_cpu(ri->dsize)) { + memcpy(buf, decomprbuf+ofs, len); + } + out_decomprbuf: + if(decomprbuf != buf && decomprbuf != readbuf) + kfree(decomprbuf); + out_readbuf: + if(readbuf != buf) + kfree(readbuf); + out_ri: + jffs2_free_raw_inode(ri); + + return ret; +} + +int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + unsigned char *buf, uint32_t offset, uint32_t len) +{ + uint32_t end = offset + len; + struct jffs2_node_frag *frag; + int ret; + + D1(printk(KERN_DEBUG "jffs2_read_inode_range: ino #%u, range 0x%08x-0x%08x\n", + f->inocache->ino, offset, offset+len)); + + frag = jffs2_lookup_node_frag(&f->fragtree, offset); + + /* XXX FIXME: Where a single physical node actually shows up in two + frags, we read it twice. Don't do that. */ + /* Now we're pointing at the first frag which overlaps our page + * (or perhaps is before it, if we've been asked to read off the + * end of the file). */ + while(offset < end) { + D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end)); + if (unlikely(!frag || frag->ofs > offset || + frag->ofs + frag->size <= offset)) { + uint32_t holesize = end - offset; + if (frag && frag->ofs > offset) { + D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. 
frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset)); + holesize = min(holesize, frag->ofs - offset); + } + D1(printk(KERN_DEBUG "Filling non-frag hole from %d-%d\n", offset, offset+holesize)); + memset(buf, 0, holesize); + buf += holesize; + offset += holesize; + continue; + } else if (unlikely(!frag->node)) { + uint32_t holeend = min(end, frag->ofs + frag->size); + D1(printk(KERN_DEBUG "Filling frag hole from %d-%d (frag 0x%x 0x%x)\n", offset, holeend, frag->ofs, frag->ofs + frag->size)); + memset(buf, 0, holeend - offset); + buf += holeend - offset; + offset = holeend; + frag = frag_next(frag); + continue; + } else { + uint32_t readlen; + uint32_t fragofs; /* offset within the frag to start reading */ + + fragofs = offset - frag->ofs; + readlen = min(frag->size - fragofs, end - offset); + D1(printk(KERN_DEBUG "Reading %d-%d from node at 0x%08x (%d)\n", + frag->ofs+fragofs, frag->ofs+fragofs+readlen, + ref_offset(frag->node->raw), ref_flags(frag->node->raw))); + ret = jffs2_read_dnode(c, f, frag->node, buf, fragofs + frag->ofs - frag->node->ofs, readlen); + D2(printk(KERN_DEBUG "node read done\n")); + if (ret) { + D1(printk(KERN_DEBUG"jffs2_read_inode_range error %d\n",ret)); + memset(buf, 0, readlen); + return ret; + } + buf += readlen; + offset += readlen; + frag = frag_next(frag); + D2(printk(KERN_DEBUG "node read was OK. Looping\n")); + } + } + return 0; +} + diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c new file mode 100644 index 00000000..2ab1a0d9 --- /dev/null +++ b/fs/jffs2/readinode.c @@ -0,0 +1,1461 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +/* + * Check the data CRC of the node. + * + * Returns: 0 if the data CRC is correct; + * 1 - if incorrect; + * error code if an error occurred. + */ +static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) +{ + struct jffs2_raw_node_ref *ref = tn->fn->raw; + int err = 0, pointed = 0; + struct jffs2_eraseblock *jeb; + unsigned char *buffer; + uint32_t crc, ofs, len; + size_t retlen; + + BUG_ON(tn->csize == 0); + + /* Calculate how many bytes were already checked */ + ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode); + len = tn->csize; + + if (jffs2_is_writebuffered(c)) { + int adj = ofs % c->wbuf_pagesize; + if (likely(adj)) + adj = c->wbuf_pagesize - adj; + + if (adj >= tn->csize) { + dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n", + ref_offset(ref), tn->csize, ofs); + goto adj_acc; + } + + ofs += adj; + len -= adj; + } + + dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n", + ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len); + +#ifndef __ECOS + /* TODO: instead, incapsulate point() stuff to jffs2_flash_read(), + * adding and jffs2_flash_read_end() interface. 
*/ + if (c->mtd->point) { + err = c->mtd->point(c->mtd, ofs, len, &retlen, + (void **)&buffer, NULL); + if (!err && retlen < len) { + JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); + c->mtd->unpoint(c->mtd, ofs, retlen); + } else if (err) + JFFS2_WARNING("MTD point failed: error code %d.\n", err); + else + pointed = 1; /* succefully pointed to device */ + } +#endif + + if (!pointed) { + buffer = kmalloc(len, GFP_KERNEL); + if (unlikely(!buffer)) + return -ENOMEM; + + /* TODO: this is very frequent pattern, make it a separate + * routine */ + err = jffs2_flash_read(c, ofs, len, &retlen, buffer); + if (err) { + JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err); + goto free_out; + } + + if (retlen != len) { + JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len); + err = -EIO; + goto free_out; + } + } + + /* Continue calculating CRC */ + crc = crc32(tn->partial_crc, buffer, len); + if(!pointed) + kfree(buffer); +#ifndef __ECOS + else + c->mtd->unpoint(c->mtd, ofs, len); +#endif + + if (crc != tn->data_crc) { + JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n", + ref_offset(ref), tn->data_crc, crc); + return 1; + } + +adj_acc: + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + len = ref_totlen(c, jeb, ref); + /* If it should be REF_NORMAL, it'll get marked as such when + we build the fragtree, shortly. No need to worry about GC + moving it while it's marked REF_PRISTINE -- GC won't happen + till we've finished checking every inode anyway. */ + ref->flash_offset |= REF_PRISTINE; + /* + * Mark the node as having been checked and fix the + * accounting accordingly. + */ + spin_lock(&c->erase_completion_lock); + jeb->used_size += len; + jeb->unchecked_size -= len; + c->used_size += len; + c->unchecked_size -= len; + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); + + return 0; + +free_out: + if(!pointed) + kfree(buffer); +#ifndef __ECOS + else + c->mtd->unpoint(c->mtd, ofs, len); +#endif + return err; +} + +/* + * Helper function for jffs2_add_older_frag_to_fragtree(). + * + * Checks the node if we are in the checking stage. 
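+ *
+ * Returns: 0 if the node needed no check or its data CRC was correct;
+ * a positive value if the CRC was wrong (the node is marked obsolete
+ * before returning); a negative error code on read or memory failure.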
+ */ +static int check_tn_node(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) +{ + int ret; + + BUG_ON(ref_obsolete(tn->fn->raw)); + + /* We only check the data CRC of unchecked nodes */ + if (ref_flags(tn->fn->raw) != REF_UNCHECKED) + return 0; + + dbg_readinode("check node %#04x-%#04x, phys offs %#08x\n", + tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw)); + + ret = check_node_data(c, tn); + if (unlikely(ret < 0)) { + JFFS2_ERROR("check_node_data() returned error: %d.\n", + ret); + } else if (unlikely(ret > 0)) { + dbg_readinode("CRC error, mark it obsolete.\n"); + jffs2_mark_node_obsolete(c, tn->fn->raw); + } + + return ret; +} + +static struct jffs2_tmp_dnode_info *jffs2_lookup_tn(struct rb_root *tn_root, uint32_t offset) +{ + struct rb_node *next; + struct jffs2_tmp_dnode_info *tn = NULL; + + dbg_readinode("root %p, offset %d\n", tn_root, offset); + + next = tn_root->rb_node; + + while (next) { + tn = rb_entry(next, struct jffs2_tmp_dnode_info, rb); + + if (tn->fn->ofs < offset) + next = tn->rb.rb_right; + else if (tn->fn->ofs >= offset) + next = tn->rb.rb_left; + else + break; + } + + return tn; +} + + +static void jffs2_kill_tn(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) +{ + jffs2_mark_node_obsolete(c, tn->fn->raw); + jffs2_free_full_dnode(tn->fn); + jffs2_free_tmp_dnode_info(tn); +} +/* + * This function is used when we read an inode. Data nodes arrive in + * arbitrary order -- they may be older or newer than the nodes which + * are already in the tree. Where overlaps occur, the older node can + * be discarded as long as the newer passes the CRC check. We don't + * bother to keep track of holes in this rbtree, and neither do we deal + * with frags -- we can have multiple entries starting at the same + * offset, and the one with the smallest length will come first in the + * ordering. + * + * Returns 0 if the node was handled (including marking it obsolete) + * < 0 an if error occurred + */ +static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, + struct jffs2_readinode_info *rii, + struct jffs2_tmp_dnode_info *tn) +{ + uint32_t fn_end = tn->fn->ofs + tn->fn->size; + struct jffs2_tmp_dnode_info *this, *ptn; + + dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); + + /* If a node has zero dsize, we only have to keep if it if it might be the + node with highest version -- i.e. the one which will end up as f->metadata. + Note that such nodes won't be REF_UNCHECKED since there are no data to + check anyway. */ + if (!tn->fn->size) { + if (rii->mdata_tn) { + if (rii->mdata_tn->version < tn->version) { + /* We had a candidate mdata node already */ + dbg_readinode("kill old mdata with ver %d\n", rii->mdata_tn->version); + jffs2_kill_tn(c, rii->mdata_tn); + } else { + dbg_readinode("kill new mdata with ver %d (older than existing %d\n", + tn->version, rii->mdata_tn->version); + jffs2_kill_tn(c, tn); + return 0; + } + } + rii->mdata_tn = tn; + dbg_readinode("keep new mdata with ver %d\n", tn->version); + return 0; + } + + /* Find the earliest node which _may_ be relevant to this one */ + this = jffs2_lookup_tn(&rii->tn_root, tn->fn->ofs); + if (this) { + /* If the node is coincident with another at a lower address, + back up until the other node is found. It may be relevant */ + while (this->overlapped) { + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. 
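The comment above jffs2_add_tn_to_tree() notes that several temporary nodes may start at the same offset and that the shortest one sorts first. The following sketch shows that ordering in isolation, using qsort() on a plain array as a stand-in for the red-black tree; struct toy_tn and the sample values are invented.

#include <stdio.h>
#include <stdlib.h>

struct toy_tn { unsigned ofs, size; };

static int tn_cmp(const void *a, const void *b)
{
	const struct toy_tn *x = a, *y = b;

	if (x->ofs != y->ofs)
		return x->ofs < y->ofs ? -1 : 1;
	if (x->size != y->size)
		return x->size < y->size ? -1 : 1;
	return 0;
}

int main(void)
{
	struct toy_tn tns[] = { { 8, 4 }, { 0, 16 }, { 8, 2 }, { 4, 4 } };
	size_t i, n = sizeof(tns) / sizeof(tns[0]);

	qsort(tns, n, sizeof(tns[0]), tn_cmp);
	for (i = 0; i < n; i++)
		printf("ofs %u size %u\n", tns[i].ofs, tns[i].size);
	/* prints 0/16, 4/4, 8/2, 8/4 -- the two nodes at offset 8 come shortest first */
	return 0;
}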
+ */ + this->overlapped = 0; + break; + } + this = ptn; + } + dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole"); + } + + while (this) { + if (this->fn->ofs > fn_end) + break; + dbg_readinode("Ponder this ver %d, 0x%x-0x%x\n", + this->version, this->fn->ofs, this->fn->size); + + if (this->version == tn->version) { + /* Version number collision means REF_PRISTINE GC. Accept either of them + as long as the CRC is correct. Check the one we have already... */ + if (!check_tn_node(c, this)) { + /* The one we already had was OK. Keep it and throw away the new one */ + dbg_readinode("Like old node. Throw away new\n"); + jffs2_kill_tn(c, tn); + return 0; + } else { + /* Who cares if the new one is good; keep it for now anyway. */ + dbg_readinode("Like new node. Throw away old\n"); + rb_replace_node(&this->rb, &tn->rb, &rii->tn_root); + jffs2_kill_tn(c, this); + /* Same overlapping from in front and behind */ + return 0; + } + } + if (this->version < tn->version && + this->fn->ofs >= tn->fn->ofs && + this->fn->ofs + this->fn->size <= fn_end) { + /* New node entirely overlaps 'this' */ + if (check_tn_node(c, tn)) { + dbg_readinode("new node bad CRC\n"); + jffs2_kill_tn(c, tn); + return 0; + } + /* ... and is good. Kill 'this' and any subsequent nodes which are also overlapped */ + while (this && this->fn->ofs + this->fn->size <= fn_end) { + struct jffs2_tmp_dnode_info *next = tn_next(this); + if (this->version < tn->version) { + tn_erase(this, &rii->tn_root); + dbg_readinode("Kill overlapped ver %d, 0x%x-0x%x\n", + this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + jffs2_kill_tn(c, this); + } + this = next; + } + dbg_readinode("Done killing overlapped nodes\n"); + continue; + } + if (this->version > tn->version && + this->fn->ofs <= tn->fn->ofs && + this->fn->ofs+this->fn->size >= fn_end) { + /* New node entirely overlapped by 'this' */ + if (!check_tn_node(c, this)) { + dbg_readinode("Good CRC on old node. Kill new\n"); + jffs2_kill_tn(c, tn); + return 0; + } + /* ... but 'this' was bad. Replace it... */ + dbg_readinode("Bad CRC on old overlapping node. Kill it\n"); + tn_erase(this, &rii->tn_root); + jffs2_kill_tn(c, this); + break; + } + + this = tn_next(this); + } + + /* We neither completely obsoleted nor were completely + obsoleted by an earlier node. Insert into the tree */ + { + struct rb_node *parent; + struct rb_node **link = &rii->tn_root.rb_node; + struct jffs2_tmp_dnode_info *insert_point = NULL; + + while (*link) { + parent = *link; + insert_point = rb_entry(parent, struct jffs2_tmp_dnode_info, rb); + if (tn->fn->ofs > insert_point->fn->ofs) + link = &insert_point->rb.rb_right; + else if (tn->fn->ofs < insert_point->fn->ofs || + tn->fn->size < insert_point->fn->size) + link = &insert_point->rb.rb_left; + else + link = &insert_point->rb.rb_right; + } + rb_link_node(&tn->rb, &insert_point->rb, link); + rb_insert_color(&tn->rb, &rii->tn_root); + } + + /* If there's anything behind that overlaps us, note it */ + this = tn_prev(tn); + if (this) { + while (1) { + if (this->fn->ofs + this->fn->size > tn->fn->ofs) { + dbg_readinode("Node is overlapped by %p (v %d, 0x%x-0x%x)\n", + this, this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + tn->overlapped = 1; + break; + } + if (!this->overlapped) + break; + + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. 
+ */ + this->overlapped = 0; + break; + } + this = ptn; + } + } + + /* If the new node overlaps anything ahead, note it */ + this = tn_next(tn); + while (this && this->fn->ofs < fn_end) { + this->overlapped = 1; + dbg_readinode("Node ver %d, 0x%x-0x%x is overlapped\n", + this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + this = tn_next(this); + } + return 0; +} + +/* Trivial function to remove the last node in the tree. Which by definition + has no right-hand -- so can be removed just by making its only child (if + any) take its place under its parent. */ +static void eat_last(struct rb_root *root, struct rb_node *node) +{ + struct rb_node *parent = rb_parent(node); + struct rb_node **link; + + /* LAST! */ + BUG_ON(node->rb_right); + + if (!parent) + link = &root->rb_node; + else if (node == parent->rb_left) + link = &parent->rb_left; + else + link = &parent->rb_right; + + *link = node->rb_left; + /* Colour doesn't matter now. Only the parent pointer. */ + if (node->rb_left) + node->rb_left->rb_parent_color = node->rb_parent_color; +} + +/* We put this in reverse order, so we can just use eat_last */ +static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) +{ + struct rb_node **link = &ver_root->rb_node; + struct rb_node *parent = NULL; + struct jffs2_tmp_dnode_info *this_tn; + + while (*link) { + parent = *link; + this_tn = rb_entry(parent, struct jffs2_tmp_dnode_info, rb); + + if (tn->version > this_tn->version) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + dbg_readinode("Link new node at %p (root is %p)\n", link, ver_root); + rb_link_node(&tn->rb, parent, link); + rb_insert_color(&tn->rb, ver_root); +} + +/* Build final, normal fragtree from tn tree. It doesn't matter which order + we add nodes to the real fragtree, as long as they don't overlap. And + having thrown away the majority of overlapped nodes as we went, there + really shouldn't be many sets of nodes which do overlap. If we start at + the end, we can use the overlap markers -- we can just eat nodes which + aren't overlapped, and when we encounter nodes which _do_ overlap we + sort them all into a temporary tree in version order before replaying them. */ +static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + struct jffs2_readinode_info *rii) +{ + struct jffs2_tmp_dnode_info *pen, *last, *this; + struct rb_root ver_root = RB_ROOT; + uint32_t high_ver = 0; + + if (rii->mdata_tn) { + dbg_readinode("potential mdata is ver %d at %p\n", rii->mdata_tn->version, rii->mdata_tn); + high_ver = rii->mdata_tn->version; + rii->latest_ref = rii->mdata_tn->fn->raw; + } +#ifdef JFFS2_DBG_READINODE_MESSAGES + this = tn_last(&rii->tn_root); + while (this) { + dbg_readinode("tn %p ver %d range 0x%x-0x%x ov %d\n", this, this->version, this->fn->ofs, + this->fn->ofs+this->fn->size, this->overlapped); + this = tn_prev(this); + } +#endif + pen = tn_last(&rii->tn_root); + while ((last = pen)) { + pen = tn_prev(last); + + eat_last(&rii->tn_root, &last->rb); + ver_insert(&ver_root, last); + + if (unlikely(last->overlapped)) { + if (pen) + continue; + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + last->overlapped = 0; + } + + /* Now we have a bunch of nodes in reverse version + order, in the tree at ver_root. Most of the time, + there'll actually be only one node in the 'tree', + in fact. 
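Why ascending version order matters when the collected overlapping nodes are replayed into the fragtree: if each node's bytes are applied oldest-first, the newer node simply overwrites the older one wherever they overlap. This toy model flattens the idea onto a char array; struct toy_node, the offsets and the fill characters are all invented for the sketch.

#include <stdio.h>
#include <string.h>

struct toy_node {
	unsigned version;
	unsigned ofs;
	unsigned len;
	char fill;
};

int main(void)
{
	char file[16];
	/* already sorted by version, as the ver_root tree hands them out */
	struct toy_node nodes[] = {
		{ .version = 7, .ofs = 0, .len = 12, .fill = 'a' },
		{ .version = 9, .ofs = 4, .len = 4,  .fill = 'b' },
	};
	size_t i;

	memset(file, '.', sizeof(file));
	for (i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++)
		memset(file + nodes[i].ofs, nodes[i].fill, nodes[i].len);

	printf("%.16s\n", file);    /* aaaabbbbaaaa.... */
	return 0;
}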
*/ + this = tn_last(&ver_root); + + while (this) { + struct jffs2_tmp_dnode_info *vers_next; + int ret; + vers_next = tn_prev(this); + eat_last(&ver_root, &this->rb); + if (check_tn_node(c, this)) { + dbg_readinode("node ver %d, 0x%x-0x%x failed CRC\n", + this->version, this->fn->ofs, + this->fn->ofs+this->fn->size); + jffs2_kill_tn(c, this); + } else { + if (this->version > high_ver) { + /* Note that this is different from the other + highest_version, because this one is only + counting _valid_ nodes which could give the + latest inode metadata */ + high_ver = this->version; + rii->latest_ref = this->fn->raw; + } + dbg_readinode("Add %p (v %d, 0x%x-0x%x, ov %d) to fragtree\n", + this, this->version, this->fn->ofs, + this->fn->ofs+this->fn->size, this->overlapped); + + ret = jffs2_add_full_dnode_to_inode(c, f, this->fn); + if (ret) { + /* Free the nodes in vers_root; let the caller + deal with the rest */ + JFFS2_ERROR("Add node to tree failed %d\n", ret); + while (1) { + vers_next = tn_prev(this); + if (check_tn_node(c, this)) + jffs2_mark_node_obsolete(c, this->fn->raw); + jffs2_free_full_dnode(this->fn); + jffs2_free_tmp_dnode_info(this); + this = vers_next; + if (!this) + break; + eat_last(&ver_root, &vers_next->rb); + } + return ret; + } + jffs2_free_tmp_dnode_info(this); + } + this = vers_next; + } + } + return 0; +} + +static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) +{ + struct rb_node *this; + struct jffs2_tmp_dnode_info *tn; + + this = list->rb_node; + + /* Now at bottom of tree */ + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb); + jffs2_free_full_dnode(tn->fn); + jffs2_free_tmp_dnode_info(tn); + + this = rb_parent(this); + if (!this) + break; + + if (this->rb_left == &tn->rb) + this->rb_left = NULL; + else if (this->rb_right == &tn->rb) + this->rb_right = NULL; + else BUG(); + } + } + *list = RB_ROOT; +} + +static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) +{ + struct jffs2_full_dirent *next; + + while (fd) { + next = fd->next; + jffs2_free_full_dirent(fd); + fd = next; + } +} + +/* Returns first valid node after 'ref'. May return 'ref' */ +static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_ref *ref) +{ + while (ref && ref->next_in_ino) { + if (!ref_obsolete(ref)) + return ref; + dbg_noderef("node at 0x%08x is obsoleted. Ignoring.\n", ref_offset(ref)); + ref = ref->next_in_ino; + } + return NULL; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * It is called every time an directory entry node is found. + * + * Returns: 0 on success; + * negative error code on failure. + */ +static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, + struct jffs2_raw_dirent *rd, size_t read, + struct jffs2_readinode_info *rii) +{ + struct jffs2_full_dirent *fd; + uint32_t crc; + + /* Obsoleted. This cannot happen, surely? 
dwmw2 20020308 */ + BUG_ON(ref_obsolete(ref)); + + crc = crc32(0, rd, sizeof(*rd) - 8); + if (unlikely(crc != je32_to_cpu(rd->node_crc))) { + JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n", + ref_offset(ref), je32_to_cpu(rd->node_crc), crc); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + /* If we've never checked the CRCs on this node, check them now */ + if (ref_flags(ref) == REF_UNCHECKED) { + struct jffs2_eraseblock *jeb; + int len; + + /* Sanity check */ + if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) { + JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n", + ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen)); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + len = ref_totlen(c, jeb, ref); + + spin_lock(&c->erase_completion_lock); + jeb->used_size += len; + jeb->unchecked_size -= len; + c->used_size += len; + c->unchecked_size -= len; + ref->flash_offset = ref_offset(ref) | dirent_node_state(rd); + spin_unlock(&c->erase_completion_lock); + } + + fd = jffs2_alloc_full_dirent(rd->nsize + 1); + if (unlikely(!fd)) + return -ENOMEM; + + fd->raw = ref; + fd->version = je32_to_cpu(rd->version); + fd->ino = je32_to_cpu(rd->ino); + fd->type = rd->type; + + if (fd->version > rii->highest_version) + rii->highest_version = fd->version; + + /* Pick out the mctime of the latest dirent */ + if(fd->version > rii->mctime_ver && je32_to_cpu(rd->mctime)) { + rii->mctime_ver = fd->version; + rii->latest_mctime = je32_to_cpu(rd->mctime); + } + + /* + * Copy as much of the name as possible from the raw + * dirent we've already read from the flash. + */ + if (read > sizeof(*rd)) + memcpy(&fd->name[0], &rd->name[0], + min_t(uint32_t, rd->nsize, (read - sizeof(*rd)) )); + + /* Do we need to copy any more of the name directly from the flash? */ + if (rd->nsize + sizeof(*rd) > read) { + /* FIXME: point() */ + int err; + int already = read - sizeof(*rd); + + err = jffs2_flash_read(c, (ref_offset(ref)) + read, + rd->nsize - already, &read, &fd->name[already]); + if (unlikely(read != rd->nsize - already) && likely(!err)) + return -EIO; + + if (unlikely(err)) { + JFFS2_ERROR("read remainder of name: error %d\n", err); + jffs2_free_full_dirent(fd); + return -EIO; + } + } + + fd->nhash = full_name_hash(fd->name, rd->nsize); + fd->next = NULL; + fd->name[rd->nsize] = '\0'; + + /* + * Wheee. We now have a complete jffs2_full_dirent structure, with + * the name in it and everything. Link it into the list + */ + jffs2_add_fd_to_list(c, fd, &rii->fds); + + return 0; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * It is called every time an inode node is found. + * + * Returns: 0 on success (possibly after marking a bad node obsolete); + * negative error code on failure. + */ +static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, + struct jffs2_raw_inode *rd, int rdlen, + struct jffs2_readinode_info *rii) +{ + struct jffs2_tmp_dnode_info *tn; + uint32_t len, csize; + int ret = 0; + uint32_t crc; + + /* Obsoleted. This cannot happen, surely? 
dwmw2 20020308 */ + BUG_ON(ref_obsolete(ref)); + + crc = crc32(0, rd, sizeof(*rd) - 8); + if (unlikely(crc != je32_to_cpu(rd->node_crc))) { + JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n", + ref_offset(ref), je32_to_cpu(rd->node_crc), crc); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + tn = jffs2_alloc_tmp_dnode_info(); + if (!tn) { + JFFS2_ERROR("failed to allocate tn (%zu bytes).\n", sizeof(*tn)); + return -ENOMEM; + } + + tn->partial_crc = 0; + csize = je32_to_cpu(rd->csize); + + /* If we've never checked the CRCs on this node, check them now */ + if (ref_flags(ref) == REF_UNCHECKED) { + + /* Sanity checks */ + if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) || + unlikely(PAD(je32_to_cpu(rd->csize) + sizeof(*rd)) != PAD(je32_to_cpu(rd->totlen)))) { + JFFS2_WARNING("inode node header CRC is corrupted at %#08x\n", ref_offset(ref)); + jffs2_dbg_dump_node(c, ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + goto free_out; + } + + if (jffs2_is_writebuffered(c) && csize != 0) { + /* At this point we are supposed to check the data CRC + * of our unchecked node. But thus far, we do not + * know whether the node is valid or obsolete. To + * figure this out, we need to walk all the nodes of + * the inode and build the inode fragtree. We don't + * want to spend time checking data of nodes which may + * later be found to be obsolete. So we put off the full + * data CRC checking until we have read all the inode + * nodes and have started building the fragtree. + * + * The fragtree is being built starting with nodes + * having the highest version number, so we'll be able + * to detect whether a node is valid (i.e., it is not + * overlapped by a node with higher version) or not. + * And we'll be able to check only those nodes, which + * are not obsolete. + * + * Of course, this optimization only makes sense in case + * of NAND flashes (or other flashes with + * !jffs2_can_mark_obsolete()), since on NOR flashes + * nodes are marked obsolete physically. + * + * Since NAND flashes (or other flashes with + * jffs2_is_writebuffered(c)) are anyway read by + * fractions of c->wbuf_pagesize, and we have just read + * the node header, it is likely that the starting part + * of the node data is also read when we read the + * header. So we don't mind to check the CRC of the + * starting part of the data of the node now, and check + * the second part later (in jffs2_check_node_data()). + * Of course, we will not need to re-read and re-check + * the NAND page which we have just read. This is why we + * read the whole NAND page at jffs2_get_inode_nodes(), + * while we needed only the node header. + */ + unsigned char *buf; + + /* 'buf' will point to the start of data */ + buf = (unsigned char *)rd + sizeof(*rd); + /* len will be the read data length */ + len = min_t(uint32_t, rdlen - sizeof(*rd), csize); + tn->partial_crc = crc32(0, buf, len); + + dbg_readinode("Calculates CRC (%#08x) for %d bytes, csize %d\n", tn->partial_crc, len, csize); + + /* If we actually calculated the whole data CRC + * and it is wrong, drop the node. */ + if (len >= csize && unlikely(tn->partial_crc != je32_to_cpu(rd->data_crc))) { + JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n", + ref_offset(ref), tn->partial_crc, je32_to_cpu(rd->data_crc)); + jffs2_mark_node_obsolete(c, ref); + goto free_out; + } + + } else if (csize == 0) { + /* + * We checked the header CRC. If the node has no data, adjust + * the space accounting now. 
For other nodes this will be done + * later either when the node is marked obsolete or when its + * data is checked. + */ + struct jffs2_eraseblock *jeb; + + dbg_readinode("the node has no data.\n"); + jeb = &c->blocks[ref->flash_offset / c->sector_size]; + len = ref_totlen(c, jeb, ref); + + spin_lock(&c->erase_completion_lock); + jeb->used_size += len; + jeb->unchecked_size -= len; + c->used_size += len; + c->unchecked_size -= len; + ref->flash_offset = ref_offset(ref) | REF_NORMAL; + spin_unlock(&c->erase_completion_lock); + } + } + + tn->fn = jffs2_alloc_full_dnode(); + if (!tn->fn) { + JFFS2_ERROR("alloc fn failed\n"); + ret = -ENOMEM; + goto free_out; + } + + tn->version = je32_to_cpu(rd->version); + tn->fn->ofs = je32_to_cpu(rd->offset); + tn->data_crc = je32_to_cpu(rd->data_crc); + tn->csize = csize; + tn->fn->raw = ref; + tn->overlapped = 0; + + if (tn->version > rii->highest_version) + rii->highest_version = tn->version; + + /* There was a bug where we wrote hole nodes out with + csize/dsize swapped. Deal with it */ + if (rd->compr == JFFS2_COMPR_ZERO && !je32_to_cpu(rd->dsize) && csize) + tn->fn->size = csize; + else // normal case... + tn->fn->size = je32_to_cpu(rd->dsize); + + dbg_readinode2("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n", + ref_offset(ref), je32_to_cpu(rd->version), + je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize); + + ret = jffs2_add_tn_to_tree(c, rii, tn); + + if (ret) { + jffs2_free_full_dnode(tn->fn); + free_out: + jffs2_free_tmp_dnode_info(tn); + return ret; + } +#ifdef JFFS2_DBG_READINODE2_MESSAGES + dbg_readinode2("After adding ver %d:\n", je32_to_cpu(rd->version)); + tn = tn_first(&rii->tn_root); + while (tn) { + dbg_readinode2("%p: v %d r 0x%x-0x%x ov %d\n", + tn, tn->version, tn->fn->ofs, + tn->fn->ofs+tn->fn->size, tn->overlapped); + tn = tn_next(tn); + } +#endif + return 0; +} + +/* + * Helper function for jffs2_get_inode_nodes(). + * It is called every time an unknown node is found. + * + * Returns: 0 on success; + * negative error code on failure. + */ +static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, struct jffs2_unknown_node *un) +{ + /* We don't mark unknown nodes as REF_UNCHECKED */ + if (ref_flags(ref) == REF_UNCHECKED) { + JFFS2_ERROR("REF_UNCHECKED but unknown node at %#08x\n", + ref_offset(ref)); + JFFS2_ERROR("Node is {%04x,%04x,%08x,%08x}. Please report this error.\n", + je16_to_cpu(un->magic), je16_to_cpu(un->nodetype), + je32_to_cpu(un->totlen), je32_to_cpu(un->hdr_crc)); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype)); + + switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) { + + case JFFS2_FEATURE_INCOMPAT: + JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + /* EEP */ + BUG(); + break; + + case JFFS2_FEATURE_ROCOMPAT: + JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO)); + break; + + case JFFS2_FEATURE_RWCOMPAT_COPY: + JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + break; + + case JFFS2_FEATURE_RWCOMPAT_DELETE: + JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n", + je16_to_cpu(un->nodetype), ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + return 0; + } + + return 0; +} + +/* + * Helper function for jffs2_get_inode_nodes(). 
+ * The function detects whether more data should be read and reads it if yes. + * + * Returns: 0 on success; + * negative error code on failure. + */ +static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, + int needed_len, int *rdlen, unsigned char *buf) +{ + int err, to_read = needed_len - *rdlen; + size_t retlen; + uint32_t offs; + + if (jffs2_is_writebuffered(c)) { + int rem = to_read % c->wbuf_pagesize; + + if (rem) + to_read += c->wbuf_pagesize - rem; + } + + /* We need to read more data */ + offs = ref_offset(ref) + *rdlen; + + dbg_readinode("read more %d bytes\n", to_read); + + err = jffs2_flash_read(c, offs, to_read, &retlen, buf + *rdlen); + if (err) { + JFFS2_ERROR("can not read %d bytes from 0x%08x, " + "error code: %d.\n", to_read, offs, err); + return err; + } + + if (retlen < to_read) { + JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", + offs, retlen, to_read); + return -EIO; + } + + *rdlen += to_read; + return 0; +} + +/* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated + with this ino. Perform a preliminary ordering on data nodes, throwing away + those which are completely obsoleted by newer ones. The naïve approach we + use to take of just returning them _all_ in version order will cause us to + run out of memory in certain degenerate cases. */ +static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_readinode_info *rii) +{ + struct jffs2_raw_node_ref *ref, *valid_ref; + unsigned char *buf = NULL; + union jffs2_node_union *node; + size_t retlen; + int len, err; + + rii->mctime_ver = 0; + + dbg_readinode("ino #%u\n", f->inocache->ino); + + /* FIXME: in case of NOR and available ->point() this + * needs to be fixed. */ + len = sizeof(union jffs2_node_union) + c->wbuf_pagesize; + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + spin_lock(&c->erase_completion_lock); + valid_ref = jffs2_first_valid_node(f->inocache->nodes); + if (!valid_ref && f->inocache->ino != 1) + JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino); + while (valid_ref) { + /* We can hold a pointer to a non-obsolete node without the spinlock, + but _obsolete_ nodes may disappear at any time, if the block + they're in gets erased. So if we mark 'ref' obsolete while we're + not holding the lock, it can go away immediately. For that reason, + we find the next valid node first, before processing 'ref'. + */ + ref = valid_ref; + valid_ref = jffs2_first_valid_node(ref->next_in_ino); + spin_unlock(&c->erase_completion_lock); + + cond_resched(); + + /* + * At this point we don't know the type of the node we're going + * to read, so we do not know the size of its header. In order + * to minimize the amount of flash IO we assume the header is + * of size = JFFS2_MIN_NODE_HEADER. + */ + len = JFFS2_MIN_NODE_HEADER; + if (jffs2_is_writebuffered(c)) { + int end, rem; + + /* + * We are about to read JFFS2_MIN_NODE_HEADER bytes, + * but this flash has some minimal I/O unit. It is + * possible that we'll need to read more soon, so read + * up to the next min. I/O unit, in order not to + * re-read the same min. I/O unit twice. 
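The rounding that read_more() applies, and that jffs2_get_inode_nodes() repeats for the initial header read, can be shown on its own: on write-buffered flash a read is padded out to the next multiple of the minimum I/O unit so that no unit has to be read twice. The helper name pad_to_io_unit() is invented; the only assumption is a non-zero page size.

#include <stdint.h>
#include <stdio.h>

static uint32_t pad_to_io_unit(uint32_t len, uint32_t wbuf_pagesize)
{
	uint32_t rem = len % wbuf_pagesize;

	if (rem)
		len += wbuf_pagesize - rem;
	return len;
}

int main(void)
{
	/* a 70-byte header read on flash with a 512-byte minimum I/O unit
	 * becomes one 512-byte read; exact multiples are left alone */
	printf("%u %u %u\n",
	       pad_to_io_unit(70, 512),     /* 512 */
	       pad_to_io_unit(512, 512),    /* 512 */
	       pad_to_io_unit(513, 512));   /* 1024 */
	return 0;
}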
+ */ + end = ref_offset(ref) + len; + rem = end % c->wbuf_pagesize; + if (rem) + end += c->wbuf_pagesize - rem; + len = end - ref_offset(ref); + } + + dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref)); + + /* FIXME: point() */ + err = jffs2_flash_read(c, ref_offset(ref), len, &retlen, buf); + if (err) { + JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err); + goto free_out; + } + + if (retlen < len) { + JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", ref_offset(ref), retlen, len); + err = -EIO; + goto free_out; + } + + node = (union jffs2_node_union *)buf; + + /* No need to mask in the valid bit; it shouldn't be invalid */ + if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) { + JFFS2_NOTICE("Node header CRC failed at %#08x. {%04x,%04x,%08x,%08x}\n", + ref_offset(ref), je16_to_cpu(node->u.magic), + je16_to_cpu(node->u.nodetype), + je32_to_cpu(node->u.totlen), + je32_to_cpu(node->u.hdr_crc)); + jffs2_dbg_dump_node(c, ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + goto cont; + } + if (je16_to_cpu(node->u.magic) != JFFS2_MAGIC_BITMASK) { + /* Not a JFFS2 node, whinge and move on */ + JFFS2_NOTICE("Wrong magic bitmask 0x%04x in node header at %#08x.\n", + je16_to_cpu(node->u.magic), ref_offset(ref)); + jffs2_mark_node_obsolete(c, ref); + goto cont; + } + + switch (je16_to_cpu(node->u.nodetype)) { + + case JFFS2_NODETYPE_DIRENT: + + if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent) && + len < sizeof(struct jffs2_raw_dirent)) { + err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf); + if (unlikely(err)) + goto free_out; + } + + err = read_direntry(c, ref, &node->d, retlen, rii); + if (unlikely(err)) + goto free_out; + + break; + + case JFFS2_NODETYPE_INODE: + + if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode) && + len < sizeof(struct jffs2_raw_inode)) { + err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf); + if (unlikely(err)) + goto free_out; + } + + err = read_dnode(c, ref, &node->i, len, rii); + if (unlikely(err)) + goto free_out; + + break; + + default: + if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node) && + len < sizeof(struct jffs2_unknown_node)) { + err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf); + if (unlikely(err)) + goto free_out; + } + + err = read_unknown(c, ref, &node->u); + if (unlikely(err)) + goto free_out; + + } + cont: + spin_lock(&c->erase_completion_lock); + } + + spin_unlock(&c->erase_completion_lock); + kfree(buf); + + f->highest_version = rii->highest_version; + + dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n", + f->inocache->ino, rii->highest_version, rii->latest_mctime, + rii->mctime_ver); + return 0; + + free_out: + jffs2_free_tmp_dnode_info_list(&rii->tn_root); + jffs2_free_full_dirent_list(rii->fds); + rii->fds = NULL; + kfree(buf); + return err; +} + +static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + struct jffs2_raw_inode *latest_node) +{ + struct jffs2_readinode_info rii; + uint32_t crc, new_size; + size_t retlen; + int ret; + + dbg_readinode("ino #%u pino/nlink is %d\n", f->inocache->ino, + f->inocache->pino_nlink); + + memset(&rii, 0, sizeof(rii)); + + /* Grab all nodes relevant to this ino */ + ret = jffs2_get_inode_nodes(c, f, &rii); + + if (ret) { + JFFS2_ERROR("cannot read nodes for ino %u, returned error is %d\n", f->inocache->ino, ret); + if 
(f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + return ret; + } + + ret = jffs2_build_inode_fragtree(c, f, &rii); + if (ret) { + JFFS2_ERROR("Failed to build final fragtree for inode #%u: error %d\n", + f->inocache->ino, ret); + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + jffs2_free_tmp_dnode_info_list(&rii.tn_root); + /* FIXME: We could at least crc-check them all */ + if (rii.mdata_tn) { + jffs2_free_full_dnode(rii.mdata_tn->fn); + jffs2_free_tmp_dnode_info(rii.mdata_tn); + rii.mdata_tn = NULL; + } + return ret; + } + + if (rii.mdata_tn) { + if (rii.mdata_tn->fn->raw == rii.latest_ref) { + f->metadata = rii.mdata_tn->fn; + jffs2_free_tmp_dnode_info(rii.mdata_tn); + } else { + jffs2_kill_tn(c, rii.mdata_tn); + } + rii.mdata_tn = NULL; + } + + f->dents = rii.fds; + + jffs2_dbg_fragtree_paranoia_check_nolock(f); + + if (unlikely(!rii.latest_ref)) { + /* No data nodes for this inode. */ + if (f->inocache->ino != 1) { + JFFS2_WARNING("no data nodes found for ino #%u\n", f->inocache->ino); + if (!rii.fds) { + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + return -EIO; + } + JFFS2_NOTICE("but it has children so we fake some modes for it\n"); + } + latest_node->mode = cpu_to_jemode(S_IFDIR|S_IRUGO|S_IWUSR|S_IXUGO); + latest_node->version = cpu_to_je32(0); + latest_node->atime = latest_node->ctime = latest_node->mtime = cpu_to_je32(0); + latest_node->isize = cpu_to_je32(0); + latest_node->gid = cpu_to_je16(0); + latest_node->uid = cpu_to_je16(0); + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_PRESENT); + return 0; + } + + ret = jffs2_flash_read(c, ref_offset(rii.latest_ref), sizeof(*latest_node), &retlen, (void *)latest_node); + if (ret || retlen != sizeof(*latest_node)) { + JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n", + ret, retlen, sizeof(*latest_node)); + /* FIXME: If this fails, there seems to be a memory leak. Find it. */ + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return ret?ret:-EIO; + } + + crc = crc32(0, latest_node, sizeof(*latest_node)-8); + if (crc != je32_to_cpu(latest_node->node_crc)) { + JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n", + f->inocache->ino, ref_offset(rii.latest_ref)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + + switch(jemode_to_cpu(latest_node->mode) & S_IFMT) { + case S_IFDIR: + if (rii.mctime_ver > je32_to_cpu(latest_node->version)) { + /* The times in the latest_node are actually older than + mctime in the latest dirent. Cheat. */ + latest_node->ctime = latest_node->mtime = cpu_to_je32(rii.latest_mctime); + } + break; + + + case S_IFREG: + /* If it was a regular file, truncate it to the latest node's isize */ + new_size = jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize)); + if (new_size != je32_to_cpu(latest_node->isize)) { + JFFS2_WARNING("Truncating ino #%u to %d bytes failed because it only had %d bytes to start with!\n", + f->inocache->ino, je32_to_cpu(latest_node->isize), new_size); + latest_node->isize = cpu_to_je32(new_size); + } + break; + + case S_IFLNK: + /* Hack to work around broken isize in old symlink code. + Remove this when dwmw2 comes to his senses and stops + symlinks from being an entirely gratuitous special + case. 
*/ + if (!je32_to_cpu(latest_node->isize)) + latest_node->isize = latest_node->dsize; + + if (f->inocache->state != INO_STATE_CHECKING) { + /* Symlink's inode data is the target path. Read it and + * keep in RAM to facilitate quick follow symlink + * operation. */ + f->target = kmalloc(je32_to_cpu(latest_node->csize) + 1, GFP_KERNEL); + if (!f->target) { + JFFS2_ERROR("can't allocate %d bytes of memory for the symlink target path cache\n", je32_to_cpu(latest_node->csize)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -ENOMEM; + } + + ret = jffs2_flash_read(c, ref_offset(rii.latest_ref) + sizeof(*latest_node), + je32_to_cpu(latest_node->csize), &retlen, (char *)f->target); + + if (ret || retlen != je32_to_cpu(latest_node->csize)) { + if (retlen != je32_to_cpu(latest_node->csize)) + ret = -EIO; + kfree(f->target); + f->target = NULL; + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return ret; + } + + f->target[je32_to_cpu(latest_node->csize)] = '\0'; + dbg_readinode("symlink's target '%s' cached\n", f->target); + } + + /* fall through... */ + + case S_IFBLK: + case S_IFCHR: + /* Certain inode types should have only one data node, and it's + kept as the metadata node */ + if (f->metadata) { + JFFS2_ERROR("Argh. Special inode #%u with mode 0%o had metadata node\n", + f->inocache->ino, jemode_to_cpu(latest_node->mode)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + if (!frag_first(&f->fragtree)) { + JFFS2_ERROR("Argh. Special inode #%u with mode 0%o has no fragments\n", + f->inocache->ino, jemode_to_cpu(latest_node->mode)); + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + /* ASSERT: f->fraglist != NULL */ + if (frag_next(frag_first(&f->fragtree))) { + JFFS2_ERROR("Argh. Special inode #%u with mode 0x%x had more than one node\n", + f->inocache->ino, jemode_to_cpu(latest_node->mode)); + /* FIXME: Deal with it - check crc32, check for duplicate node, check times and discard the older one */ + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + return -EIO; + } + /* OK. We're happy */ + f->metadata = frag_first(&f->fragtree)->node; + jffs2_free_node_frag(frag_first(&f->fragtree)); + f->fragtree = RB_ROOT; + break; + } + if (f->inocache->state == INO_STATE_READING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_PRESENT); + + return 0; +} + +/* Scan the list of all nodes present for this ino, build map of versions, etc. */ +int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t ino, struct jffs2_raw_inode *latest_node) +{ + dbg_readinode("read inode #%u\n", ino); + + retry_inocache: + spin_lock(&c->inocache_lock); + f->inocache = jffs2_get_ino_cache(c, ino); + + if (f->inocache) { + /* Check its state. We may need to wait before we can use it */ + switch(f->inocache->state) { + case INO_STATE_UNCHECKED: + case INO_STATE_CHECKEDABSENT: + f->inocache->state = INO_STATE_READING; + break; + + case INO_STATE_CHECKING: + case INO_STATE_GC: + /* If it's in either of these states, we need + to wait for whoever's got it to finish and + put it back. */ + dbg_readinode("waiting for ino #%u in state %d\n", ino, f->inocache->state); + sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); + goto retry_inocache; + + case INO_STATE_READING: + case INO_STATE_PRESENT: + /* Eep. This should never happen. It can + happen if Linux calls read_inode() again + before clear_inode() has finished though. */ + JFFS2_ERROR("Eep. 
Trying to read_inode #%u when it's already in state %d!\n", ino, f->inocache->state); + /* Fail. That's probably better than allowing it to succeed */ + f->inocache = NULL; + break; + + default: + BUG(); + } + } + spin_unlock(&c->inocache_lock); + + if (!f->inocache && ino == 1) { + /* Special case - no root inode on medium */ + f->inocache = jffs2_alloc_inode_cache(); + if (!f->inocache) { + JFFS2_ERROR("cannot allocate inocache for root inode\n"); + return -ENOMEM; + } + dbg_readinode("creating inocache for root inode\n"); + memset(f->inocache, 0, sizeof(struct jffs2_inode_cache)); + f->inocache->ino = f->inocache->pino_nlink = 1; + f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache; + f->inocache->state = INO_STATE_READING; + jffs2_add_ino_cache(c, f->inocache); + } + if (!f->inocache) { + JFFS2_ERROR("requestied to read an nonexistent ino %u\n", ino); + return -ENOENT; + } + + return jffs2_do_read_inode_internal(c, f, latest_node); +} + +int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + struct jffs2_raw_inode n; + struct jffs2_inode_info *f = kzalloc(sizeof(*f), GFP_KERNEL); + int ret; + + if (!f) + return -ENOMEM; + + mutex_init(&f->sem); + mutex_lock(&f->sem); + f->inocache = ic; + + ret = jffs2_do_read_inode_internal(c, f, &n); + if (!ret) { + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); + } + kfree (f); + return ret; +} + +void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) +{ + struct jffs2_full_dirent *fd, *fds; + int deleted; + + jffs2_xattr_delete_inode(c, f->inocache); + mutex_lock(&f->sem); + deleted = f->inocache && !f->inocache->pino_nlink; + + if (f->inocache && f->inocache->state != INO_STATE_CHECKING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CLEARING); + + if (f->metadata) { + if (deleted) + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + } + + jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL); + + if (f->target) { + kfree(f->target); + f->target = NULL; + } + + fds = f->dents; + while(fds) { + fd = fds; + fds = fd->next; + jffs2_free_full_dirent(fd); + } + + if (f->inocache && f->inocache->state != INO_STATE_CHECKING) { + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + if (f->inocache->nodes == (void *)f->inocache) + jffs2_del_ino_cache(c, f->inocache); + } + + mutex_unlock(&f->sem); +} diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c new file mode 100644 index 00000000..3e021cfb --- /dev/null +++ b/fs/jffs2/scan.c @@ -0,0 +1,1148 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "summary.h" +#include "debug.h" + +#define DEFAULT_EMPTY_SCAN_SIZE 256 + +#define noisy_printk(noise, args...) do { \ + if (*(noise)) { \ + printk(KERN_NOTICE args); \ + (*(noise))--; \ + if (!(*(noise))) { \ + printk(KERN_NOTICE "Further such events for this erase block will not be printed\n"); \ + } \ + } \ +} while(0) + +static uint32_t pseudo_random; + +static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s); + +/* These helper functions _must_ increase ofs and also do the dirty/used space accounting. + * Returning an error will abort the mount - bad checksums etc. 
should just mark the space + * as dirty. + */ +static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s); +static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s); + +static inline int min_free(struct jffs2_sb_info *c) +{ + uint32_t min = 2 * sizeof(struct jffs2_raw_inode); +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (!jffs2_can_mark_obsolete(c) && min < c->wbuf_pagesize) + return c->wbuf_pagesize; +#endif + return min; + +} + +static inline uint32_t EMPTY_SCAN_SIZE(uint32_t sector_size) { + if (sector_size < DEFAULT_EMPTY_SCAN_SIZE) + return sector_size; + else + return DEFAULT_EMPTY_SCAN_SIZE; +} + +static int file_dirty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + int ret; + + if ((ret = jffs2_prealloc_raw_node_refs(c, jeb, 1))) + return ret; + if ((ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size))) + return ret; + /* Turned wasted size into dirty, since we apparently + think it's recoverable now. */ + jeb->dirty_size += jeb->wasted_size; + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->wasted_size = 0; + if (VERYDIRTY(c, jeb->dirty_size)) { + list_add(&jeb->list, &c->very_dirty_list); + } else { + list_add(&jeb->list, &c->dirty_list); + } + return 0; +} + +int jffs2_scan_medium(struct jffs2_sb_info *c) +{ + int i, ret; + uint32_t empty_blocks = 0, bad_blocks = 0; + unsigned char *flashbuf = NULL; + uint32_t buf_size = 0; + struct jffs2_summary *s = NULL; /* summary info collected by the scan process */ +#ifndef __ECOS + size_t pointlen, try_size; + + if (c->mtd->point) { + ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen, + (void **)&flashbuf, NULL); + if (!ret && pointlen < c->mtd->size) { + /* Don't muck about if it won't let us point to the whole flash */ + D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen)); + c->mtd->unpoint(c->mtd, 0, pointlen); + flashbuf = NULL; + } + if (ret) + D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); + } +#endif + if (!flashbuf) { + /* For NAND it's quicker to read a whole eraseblock at a time, + apparently */ + if (jffs2_cleanmarker_oob(c)) + try_size = c->sector_size; + if (c->mtd->type == MTD_NANDFLASH) + buf_size = c->sector_size; + else + try_size = PAGE_SIZE; + + D1(printk(KERN_DEBUG "Trying to allocate readbuf of %zu " + "bytes\n", try_size)); + + flashbuf = mtd_kmalloc_up_to(c->mtd, &try_size); + if (!flashbuf) + return -ENOMEM; + + D1(printk(KERN_DEBUG "Allocated readbuf of %zu bytes\n", + try_size)); + + buf_size = (uint32_t)try_size; + } + + if (jffs2_sum_active()) { + s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); + if (!s) { + JFFS2_WARNING("Can't allocate memory for summary\n"); + ret = -ENOMEM; + goto out; + } + } + + for (i=0; inr_blocks; i++) { + struct jffs2_eraseblock *jeb = &c->blocks[i]; + + cond_resched(); + + /* reset summary info for next eraseblock scan */ + jffs2_sum_reset_collected(s); + + ret = jffs2_scan_eraseblock(c, jeb, buf_size?flashbuf:(flashbuf+jeb->offset), + buf_size, s); + + if (ret < 0) + goto out; + + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + /* Now decide which list to put it on */ + switch(ret) { + case BLK_STATE_ALLFF: + /* + * Empty block. Since we can't be sure it + * was entirely erased, we just queue it for erase + * again. It will be marked as such when the erase + * is complete. 
Meanwhile we still count it as empty + * for later checks. + */ + empty_blocks++; + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + break; + + case BLK_STATE_CLEANMARKER: + /* Only a CLEANMARKER node is valid */ + if (!jeb->dirty_size) { + /* It's actually free */ + list_add(&jeb->list, &c->free_list); + c->nr_free_blocks++; + } else { + /* Dirt */ + D1(printk(KERN_DEBUG "Adding all-dirty block at 0x%08x to erase_pending_list\n", jeb->offset)); + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + } + break; + + case BLK_STATE_CLEAN: + /* Full (or almost full) of clean data. Clean list */ + list_add(&jeb->list, &c->clean_list); + break; + + case BLK_STATE_PARTDIRTY: + /* Some data, but not full. Dirty list. */ + /* We want to remember the block with most free space + and stick it in the 'nextblock' position to start writing to it. */ + if (jeb->free_size > min_free(c) && + (!c->nextblock || c->nextblock->free_size < jeb->free_size)) { + /* Better candidate for the next writes to go to */ + if (c->nextblock) { + ret = file_dirty(c, c->nextblock); + if (ret) + goto out; + /* deleting summary information of the old nextblock */ + jffs2_sum_reset_collected(c->summary); + } + /* update collected summary information for the current nextblock */ + jffs2_sum_move_collected(c, s); + D1(printk(KERN_DEBUG "jffs2_scan_medium(): new nextblock = 0x%08x\n", jeb->offset)); + c->nextblock = jeb; + } else { + ret = file_dirty(c, jeb); + if (ret) + goto out; + } + break; + + case BLK_STATE_ALLDIRTY: + /* Nothing valid - not even a clean marker. Needs erasing. */ + /* For now we just put it on the erasing list. We'll start the erases later */ + D1(printk(KERN_NOTICE "JFFS2: Erase block at 0x%08x is not formatted. It will be erased\n", jeb->offset)); + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + break; + + case BLK_STATE_BADBLOCK: + D1(printk(KERN_NOTICE "JFFS2: Block at 0x%08x is bad\n", jeb->offset)); + list_add(&jeb->list, &c->bad_list); + c->bad_size += c->sector_size; + c->free_size -= c->sector_size; + bad_blocks++; + break; + default: + printk(KERN_WARNING "jffs2_scan_medium(): unknown block state\n"); + BUG(); + } + } + + /* Nextblock dirty is always seen as wasted, because we cannot recycle it now */ + if (c->nextblock && (c->nextblock->dirty_size)) { + c->nextblock->wasted_size += c->nextblock->dirty_size; + c->wasted_size += c->nextblock->dirty_size; + c->dirty_size -= c->nextblock->dirty_size; + c->nextblock->dirty_size = 0; + } +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { + /* If we're going to start writing into a block which already + contains data, and the end of the data isn't page-aligned, + skip a little and align it. 
*/ + + uint32_t skip = c->nextblock->free_size % c->wbuf_pagesize; + + D1(printk(KERN_DEBUG "jffs2_scan_medium(): Skipping %d bytes in nextblock to ensure page alignment\n", + skip)); + jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + jffs2_scan_dirty_space(c, c->nextblock, skip); + } +#endif + if (c->nr_erasing_blocks) { + if ( !c->used_size && ((c->nr_free_blocks+empty_blocks+bad_blocks)!= c->nr_blocks || bad_blocks == c->nr_blocks) ) { + printk(KERN_NOTICE "Cowardly refusing to erase blocks on filesystem with no valid JFFS2 nodes\n"); + printk(KERN_NOTICE "empty_blocks %d, bad_blocks %d, c->nr_blocks %d\n",empty_blocks,bad_blocks,c->nr_blocks); + ret = -EIO; + goto out; + } + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); + } + ret = 0; + out: + if (buf_size) + kfree(flashbuf); +#ifndef __ECOS + else + c->mtd->unpoint(c->mtd, 0, c->mtd->size); +#endif + if (s) + kfree(s); + + return ret; +} + +static int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf, + uint32_t ofs, uint32_t len) +{ + int ret; + size_t retlen; + + ret = jffs2_flash_read(c, ofs, len, &retlen, buf); + if (ret) { + D1(printk(KERN_WARNING "mtd->read(0x%x bytes from 0x%x) returned %d\n", len, ofs, ret)); + return ret; + } + if (retlen < len) { + D1(printk(KERN_WARNING "Read at 0x%x gave only 0x%zx bytes\n", ofs, retlen)); + return -EIO; + } + return 0; +} + +int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + if ((jeb->used_size + jeb->unchecked_size) == PAD(c->cleanmarker_size) && !jeb->dirty_size + && (!jeb->first_node || !ref_next(jeb->first_node)) ) + return BLK_STATE_CLEANMARKER; + + /* move blocks with max 4 byte dirty space to cleanlist */ + else if (!ISDIRTY(c->sector_size - (jeb->used_size + jeb->unchecked_size))) { + c->dirty_size -= jeb->dirty_size; + c->wasted_size += jeb->dirty_size; + jeb->wasted_size += jeb->dirty_size; + jeb->dirty_size = 0; + return BLK_STATE_CLEAN; + } else if (jeb->used_size || jeb->unchecked_size) + return BLK_STATE_PARTDIRTY; + else + return BLK_STATE_ALLDIRTY; +} + +#ifdef CONFIG_JFFS2_FS_XATTR +static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_xattr *rx, uint32_t ofs, + struct jffs2_summary *s) +{ + struct jffs2_xattr_datum *xd; + uint32_t xid, version, totlen, crc; + int err; + + crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4); + if (crc != je32_to_cpu(rx->node_crc)) { + JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + ofs, je32_to_cpu(rx->node_crc), crc); + if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen)))) + return err; + return 0; + } + + xid = je32_to_cpu(rx->xid); + version = je32_to_cpu(rx->version); + + totlen = PAD(sizeof(struct jffs2_raw_xattr) + + rx->name_len + 1 + je16_to_cpu(rx->value_len)); + if (totlen != je32_to_cpu(rx->totlen)) { + JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n", + ofs, je32_to_cpu(rx->totlen), totlen); + if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen)))) + return err; + return 0; + } + + xd = jffs2_setup_xattr_datum(c, xid, version); + if (IS_ERR(xd)) + return PTR_ERR(xd); + + if (xd->version > version) { + struct jffs2_raw_node_ref *raw + = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL); + raw->next_in_ino = xd->node->next_in_ino; + xd->node->next_in_ino = raw; + } else { + xd->version = version; + xd->xprefix = rx->xprefix; + xd->name_len = rx->name_len; + xd->value_len = 
je16_to_cpu(rx->value_len); + xd->data_crc = je32_to_cpu(rx->data_crc); + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, (void *)xd); + } + + if (jffs2_sum_active()) + jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset); + dbg_xattr("scaning xdatum at %#08x (xid=%u, version=%u)\n", + ofs, xd->xid, xd->version); + return 0; +} + +static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_xref *rr, uint32_t ofs, + struct jffs2_summary *s) +{ + struct jffs2_xattr_ref *ref; + uint32_t crc; + int err; + + crc = crc32(0, rr, sizeof(*rr) - 4); + if (crc != je32_to_cpu(rr->node_crc)) { + JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + ofs, je32_to_cpu(rr->node_crc), crc); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen))))) + return err; + return 0; + } + + if (PAD(sizeof(struct jffs2_raw_xref)) != je32_to_cpu(rr->totlen)) { + JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%zd\n", + ofs, je32_to_cpu(rr->totlen), + PAD(sizeof(struct jffs2_raw_xref))); + if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rr->totlen)))) + return err; + return 0; + } + + ref = jffs2_alloc_xattr_ref(); + if (!ref) + return -ENOMEM; + + /* BEFORE jffs2_build_xattr_subsystem() called, + * and AFTER xattr_ref is marked as a dead xref, + * ref->xid is used to store 32bit xid, xd is not used + * ref->ino is used to store 32bit inode-number, ic is not used + * Thoes variables are declared as union, thus using those + * are exclusive. In a similar way, ref->next is temporarily + * used to chain all xattr_ref object. It's re-chained to + * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly. + */ + ref->ino = je32_to_cpu(rr->ino); + ref->xid = je32_to_cpu(rr->xid); + ref->xseqno = je32_to_cpu(rr->xseqno); + if (ref->xseqno > c->highest_xseqno) + c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER); + ref->next = c->xref_temp; + c->xref_temp = ref; + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), (void *)ref); + + if (jffs2_sum_active()) + jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset); + dbg_xattr("scan xref at %#08x (xid=%u, ino=%u)\n", + ofs, ref->xid, ref->ino); + return 0; +} +#endif + +/* Called with 'buf_size == 0' if buf is in fact a pointer _directly_ into + the flash, XIP-style */ +static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) { + struct jffs2_unknown_node *node; + struct jffs2_unknown_node crcnode; + uint32_t ofs, prevofs, max_ofs; + uint32_t hdr_crc, buf_ofs, buf_len; + int err; + int noise = 0; + + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + int cleanmarkerfound = 0; +#endif + + ofs = jeb->offset; + prevofs = jeb->offset - 1; + + D1(printk(KERN_DEBUG "jffs2_scan_eraseblock(): Scanning block at 0x%x\n", ofs)); + +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (c->mtd->type == MTD_NANDFLASH) { + int ret; + + if (c->mtd->block_isbad(c->mtd, jeb->offset)) + return BLK_STATE_BADBLOCK; + + if (jffs2_cleanmarker_oob(c)) { + ret = jffs2_check_nand_cleanmarker(c, jeb); + D2(printk(KERN_NOTICE "jffs_check_nand_cleanmarker returned %d\n", ret)); + + /* Even if it's not found, we still scan to see + if the block is empty. We use this information + to decide whether to erase it or not. 
*/ + switch (ret) { + case 0: cleanmarkerfound = 1; break; + case 1: break; + default: return ret; + } + } + } +#endif + + if (jffs2_sum_active()) { + struct jffs2_sum_marker *sm; + void *sumptr = NULL; + uint32_t sumlen; + + if (!buf_size) { + /* XIP case. Just look, point at the summary if it's there */ + sm = (void *)buf + c->sector_size - sizeof(*sm); + if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) { + sumptr = buf + je32_to_cpu(sm->offset); + sumlen = c->sector_size - je32_to_cpu(sm->offset); + } + } else { + /* If NAND flash, read a whole page of it. Else just the end */ + if (c->wbuf_pagesize) + buf_len = c->wbuf_pagesize; + else + buf_len = sizeof(*sm); + + /* Read as much as we want into the _end_ of the preallocated buffer */ + err = jffs2_fill_scan_buf(c, buf + buf_size - buf_len, + jeb->offset + c->sector_size - buf_len, + buf_len); + if (err) + return err; + + sm = (void *)buf + buf_size - sizeof(*sm); + if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) { + sumlen = c->sector_size - je32_to_cpu(sm->offset); + sumptr = buf + buf_size - sumlen; + + /* Now, make sure the summary itself is available */ + if (sumlen > buf_size) { + /* Need to kmalloc for this. */ + sumptr = kmalloc(sumlen, GFP_KERNEL); + if (!sumptr) + return -ENOMEM; + memcpy(sumptr + sumlen - buf_len, buf + buf_size - buf_len, buf_len); + } + if (buf_len < sumlen) { + /* Need to read more so that the entire summary node is present */ + err = jffs2_fill_scan_buf(c, sumptr, + jeb->offset + c->sector_size - sumlen, + sumlen - buf_len); + if (err) + return err; + } + } + + } + + if (sumptr) { + err = jffs2_sum_scan_sumnode(c, jeb, sumptr, sumlen, &pseudo_random); + + if (buf_size && sumlen > buf_size) + kfree(sumptr); + /* If it returns with a real error, bail. + If it returns positive, that's a block classification + (i.e. BLK_STATE_xxx) so return that too. + If it returns zero, fall through to full scan. */ + if (err) + return err; + } + } + + buf_ofs = jeb->offset; + + if (!buf_size) { + /* This is the XIP case -- we're reading _directly_ from the flash chip */ + buf_len = c->sector_size; + } else { + buf_len = EMPTY_SCAN_SIZE(c->sector_size); + err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len); + if (err) + return err; + } + + /* We temporarily use 'ofs' as a pointer into the buffer/jeb */ + ofs = 0; + max_ofs = EMPTY_SCAN_SIZE(c->sector_size); + /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */ + while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) + ofs += 4; + + if (ofs == max_ofs) { +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + if (jffs2_cleanmarker_oob(c)) { + /* scan oob, take care of cleanmarker */ + int ret = jffs2_check_oob_empty(c, jeb, cleanmarkerfound); + D2(printk(KERN_NOTICE "jffs2_check_oob_empty returned %d\n",ret)); + switch (ret) { + case 0: return cleanmarkerfound ? BLK_STATE_CLEANMARKER : BLK_STATE_ALLFF; + case 1: return BLK_STATE_ALLDIRTY; + default: return ret; + } + } +#endif + D1(printk(KERN_DEBUG "Block at 0x%08x is empty (erased)\n", jeb->offset)); + if (c->cleanmarker_size == 0) + return BLK_STATE_CLEANMARKER; /* don't bother with re-erase */ + else + return BLK_STATE_ALLFF; /* OK to erase if all blocks are like this */ + } + if (ofs) { + D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset, + jeb->offset + ofs)); + if ((err = jffs2_prealloc_raw_node_refs(c, jeb, 1))) + return err; + if ((err = jffs2_scan_dirty_space(c, jeb, ofs))) + return err; + } + + /* Now ofs is a complete physical flash offset as it always was... 
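The emptiness test above walks the first EMPTY_SCAN_SIZE bytes of a block one 32-bit word at a time, counting on erased flash reading back as all 0xFF. A stand-alone version of that check, with an invented function name and a 256-byte window mirroring DEFAULT_EMPTY_SCAN_SIZE:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int looks_erased(const unsigned char *buf, uint32_t scan_size)
{
	uint32_t ofs, word;

	for (ofs = 0; ofs + 4 <= scan_size; ofs += 4) {
		memcpy(&word, buf + ofs, 4);     /* word-at-a-time, as in the scan loop */
		if (word != 0xFFFFFFFFu)
			return 0;
	}
	return 1;
}

int main(void)
{
	unsigned char block[256];

	memset(block, 0xFF, sizeof(block));
	printf("erased: %d\n", looks_erased(block, sizeof(block)));   /* 1 */

	block[100] = 0x00;                   /* one stray written byte */
	printf("erased: %d\n", looks_erased(block, sizeof(block)));   /* 0 */
	return 0;
}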
*/ + ofs += jeb->offset; + + noise = 10; + + dbg_summary("no summary found in jeb 0x%08x. Apply original scan.\n",jeb->offset); + +scan_more: + while(ofs < jeb->offset + c->sector_size) { + + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + /* Make sure there are node refs available for use */ + err = jffs2_prealloc_raw_node_refs(c, jeb, 2); + if (err) + return err; + + cond_resched(); + + if (ofs & 3) { + printk(KERN_WARNING "Eep. ofs 0x%08x not word-aligned!\n", ofs); + ofs = PAD(ofs); + continue; + } + if (ofs == prevofs) { + printk(KERN_WARNING "ofs 0x%08x has already been seen. Skipping\n", ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + prevofs = ofs; + + if (jeb->offset + c->sector_size < ofs + sizeof(*node)) { + D1(printk(KERN_DEBUG "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n", sizeof(struct jffs2_unknown_node), + jeb->offset, c->sector_size, ofs, sizeof(*node))); + if ((err = jffs2_scan_dirty_space(c, jeb, (jeb->offset + c->sector_size)-ofs))) + return err; + break; + } + + if (buf_ofs + buf_len < ofs + sizeof(*node)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %zd bytes (node header) left to end of buf. Reading 0x%x at 0x%08x\n", + sizeof(struct jffs2_unknown_node), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + } + + node = (struct jffs2_unknown_node *)&buf[ofs-buf_ofs]; + + if (*(uint32_t *)(&buf[ofs-buf_ofs]) == 0xffffffff) { + uint32_t inbuf_ofs; + uint32_t empty_start, scan_end; + + empty_start = ofs; + ofs += 4; + scan_end = min_t(uint32_t, EMPTY_SCAN_SIZE(c->sector_size)/8, buf_len); + + D1(printk(KERN_DEBUG "Found empty flash at 0x%08x\n", ofs)); + more_empty: + inbuf_ofs = ofs - buf_ofs; + while (inbuf_ofs < scan_end) { + if (unlikely(*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff)) { + printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n", + empty_start, ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start))) + return err; + goto scan_more; + } + + inbuf_ofs+=4; + ofs += 4; + } + /* Ran off end. */ + D1(printk(KERN_DEBUG "Empty flash to end of buffer at 0x%08x\n", ofs)); + + /* If we're only checking the beginning of a block with a cleanmarker, + bail now */ + if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) && + c->cleanmarker_size && !jeb->dirty_size && !ref_next(jeb->first_node)) { + D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size))); + return BLK_STATE_CLEANMARKER; + } + if (!buf_size && (scan_end != buf_len)) {/* XIP/point case */ + scan_end = buf_len; + goto more_empty; + } + + /* See how much more there is to read in this eraseblock... */ + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + if (!buf_len) { + /* No more to read. Break out of main loop without marking + this range of empty space as dirty (because it's not) */ + D1(printk(KERN_DEBUG "Empty flash at %08x runs to end of block. 
Treating as free_space\n", + empty_start)); + break; + } + /* point never reaches here */ + scan_end = buf_len; + D1(printk(KERN_DEBUG "Reading another 0x%x at 0x%08x\n", buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + goto more_empty; + } + + if (ofs == jeb->offset && je16_to_cpu(node->magic) == KSAMTIB_CIGAM_2SFFJ) { + printk(KERN_WARNING "Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n", ofs); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + if (je16_to_cpu(node->magic) == JFFS2_DIRTY_BITMASK) { + D1(printk(KERN_DEBUG "Dirty bitmask at 0x%08x\n", ofs)); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + if (je16_to_cpu(node->magic) == JFFS2_OLD_MAGIC_BITMASK) { + printk(KERN_WARNING "Old JFFS2 bitmask found at 0x%08x\n", ofs); + printk(KERN_WARNING "You cannot use older JFFS2 filesystems with newer kernels\n"); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + if (je16_to_cpu(node->magic) != JFFS2_MAGIC_BITMASK) { + /* OK. We're out of possibilities. Whinge and move on */ + noisy_printk(&noise, "jffs2_scan_eraseblock(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n", + JFFS2_MAGIC_BITMASK, ofs, + je16_to_cpu(node->magic)); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + /* We seem to have a node of sorts. Check the CRC */ + crcnode.magic = node->magic; + crcnode.nodetype = cpu_to_je16( je16_to_cpu(node->nodetype) | JFFS2_NODE_ACCURATE); + crcnode.totlen = node->totlen; + hdr_crc = crc32(0, &crcnode, sizeof(crcnode)-4); + + if (hdr_crc != je32_to_cpu(node->hdr_crc)) { + noisy_printk(&noise, "jffs2_scan_eraseblock(): Node at 0x%08x {0x%04x, 0x%04x, 0x%08x) has invalid CRC 0x%08x (calculated 0x%08x)\n", + ofs, je16_to_cpu(node->magic), + je16_to_cpu(node->nodetype), + je32_to_cpu(node->totlen), + je32_to_cpu(node->hdr_crc), + hdr_crc); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + + if (ofs + je32_to_cpu(node->totlen) > jeb->offset + c->sector_size) { + /* Eep. Node goes over the end of the erase block. */ + printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n", + ofs, je32_to_cpu(node->totlen)); + printk(KERN_WARNING "Perhaps the file system was created with the wrong erase size?\n"); + if ((err = jffs2_scan_dirty_space(c, jeb, 4))) + return err; + ofs += 4; + continue; + } + + if (!(je16_to_cpu(node->nodetype) & JFFS2_NODE_ACCURATE)) { + /* Wheee. This is an obsoleted node */ + D2(printk(KERN_DEBUG "Node at 0x%08x is obsolete. Skipping\n", ofs)); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + continue; + } + + switch(je16_to_cpu(node->nodetype)) { + case JFFS2_NODETYPE_INODE: + if (buf_ofs + buf_len < ofs + sizeof(struct jffs2_raw_inode)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %zd bytes (inode node) left to end of buf. 
Reading 0x%x at 0x%08x\n", + sizeof(struct jffs2_raw_inode), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_inode_node(c, jeb, (void *)node, ofs, s); + if (err) return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + case JFFS2_NODETYPE_DIRENT: + if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %d bytes (dirent node) left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_dirent_node(c, jeb, (void *)node, ofs, s); + if (err) return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: + if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %d bytes (xattr node)" + " left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_xattr_node(c, jeb, (void *)node, ofs, s); + if (err) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + case JFFS2_NODETYPE_XREF: + if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { + buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); + D1(printk(KERN_DEBUG "Fewer than %d bytes (xref node)" + " left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, ofs)); + err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); + if (err) + return err; + buf_ofs = ofs; + node = (void *)buf; + } + err = jffs2_scan_xref_node(c, jeb, (void *)node, ofs, s); + if (err) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; +#endif /* CONFIG_JFFS2_FS_XATTR */ + + case JFFS2_NODETYPE_CLEANMARKER: + D1(printk(KERN_DEBUG "CLEANMARKER node found at 0x%08x\n", ofs)); + if (je32_to_cpu(node->totlen) != c->cleanmarker_size) { + printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n", + ofs, je32_to_cpu(node->totlen), c->cleanmarker_size); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node))))) + return err; + ofs += PAD(sizeof(struct jffs2_unknown_node)); + } else if (jeb->first_node) { + printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n", ofs, jeb->offset); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node))))) + return err; + ofs += PAD(sizeof(struct jffs2_unknown_node)); + } else { + jffs2_link_node_ref(c, jeb, ofs | REF_NORMAL, c->cleanmarker_size, NULL); + + ofs += PAD(c->cleanmarker_size); + } + break; + + case JFFS2_NODETYPE_PADDING: + if (jffs2_sum_active()) + jffs2_sum_add_padding_mem(s, je32_to_cpu(node->totlen)); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + default: + switch (je16_to_cpu(node->nodetype) & JFFS2_COMPAT_MASK) { + case JFFS2_FEATURE_ROCOMPAT: + printk(KERN_NOTICE "Read-only compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs); + c->flags |= JFFS2_SB_FLAG_RO; + if 
(!(jffs2_is_readonly(c))) + return -EROFS; + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + case JFFS2_FEATURE_INCOMPAT: + printk(KERN_NOTICE "Incompatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs); + return -EINVAL; + + case JFFS2_FEATURE_RWCOMPAT_DELETE: + D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs)); + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) + return err; + ofs += PAD(je32_to_cpu(node->totlen)); + break; + + case JFFS2_FEATURE_RWCOMPAT_COPY: { + D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs)); + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(node->totlen)), NULL); + + /* We can't summarise nodes we don't grok */ + jffs2_sum_disable_collecting(s); + ofs += PAD(je32_to_cpu(node->totlen)); + break; + } + } + } + } + + if (jffs2_sum_active()) { + if (PAD(s->sum_size + JFFS2_SUMMARY_FRAME_SIZE) > jeb->free_size) { + dbg_summary("There is not enough space for " + "summary information, disabling for this jeb!\n"); + jffs2_sum_disable_collecting(s); + } + } + + D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n", + jeb->offset,jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size, jeb->wasted_size)); + + /* mark_node_obsolete can add to wasted !! */ + if (jeb->wasted_size) { + jeb->dirty_size += jeb->wasted_size; + c->dirty_size += jeb->wasted_size; + c->wasted_size -= jeb->wasted_size; + jeb->wasted_size = 0; + } + + return jffs2_scan_classify_jeb(c, jeb); +} + +struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inode_cache *ic; + + ic = jffs2_get_ino_cache(c, ino); + if (ic) + return ic; + + if (ino > c->highest_ino) + c->highest_ino = ino; + + ic = jffs2_alloc_inode_cache(); + if (!ic) { + printk(KERN_NOTICE "jffs2_scan_make_inode_cache(): allocation of inode cache failed\n"); + return NULL; + } + memset(ic, 0, sizeof(*ic)); + + ic->ino = ino; + ic->nodes = (void *)ic; + jffs2_add_ino_cache(c, ic); + if (ino == 1) + ic->pino_nlink = 1; + return ic; +} + +static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s) +{ + struct jffs2_inode_cache *ic; + uint32_t crc, ino = je32_to_cpu(ri->ino); + + D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs)); + + /* We do very little here now. Just check the ino# to which we should attribute + this node; we can do all the CRC checking etc. later. There's a tradeoff here -- + we used to scan the flash once only, reading everything we want from it into + memory, then building all our in-core data structures and freeing the extra + information. Now we allow the first part of the mount to complete a lot quicker, + but we have to go _back_ to the flash in order to finish the CRC checking, etc. + Which means that the _full_ amount of time to get to proper write mode with GC + operational may actually be _longer_ than before. Sucks to be me. */ + + /* Check the node CRC in any case. 
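The default case just above classifies node types the code does not understand by the two feature bits at the top of the 16-bit nodetype, which encode what a driver that cannot parse the node is allowed to do with it. A sketch of that classification; the numeric values follow linux/jffs2.h of this era, and the enum and helper are purely illustrative.

#include <stdint.h>
#include <stdio.h>

#define COMPAT_MASK             0xc000u
#define FEATURE_INCOMPAT        0xc000u /* refuse to mount at all */
#define FEATURE_ROCOMPAT        0x8000u /* mount read-only at most */
#define FEATURE_RWCOMPAT_COPY   0x4000u /* keep the node when copying / GCing */
#define FEATURE_RWCOMPAT_DELETE 0x0000u /* safe to treat as dirty space */

enum unknown_node_action { ACT_REFUSE, ACT_RO, ACT_COPY, ACT_DELETE };

static enum unknown_node_action classify(uint16_t nodetype)
{
        switch (nodetype & COMPAT_MASK) {
        case FEATURE_INCOMPAT:      return ACT_REFUSE;
        case FEATURE_ROCOMPAT:      return ACT_RO;
        case FEATURE_RWCOMPAT_COPY: return ACT_COPY;
        default:                    return ACT_DELETE;  /* RWCOMPAT_DELETE */
        }
}

int main(void)
{
        uint16_t unknown = 0x6004;      /* hypothetical nodetype, not a real JFFS2 one */

        printf("action for %#06x: %d\n", unknown, classify(unknown));
        return 0;
}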
*/ + crc = crc32(0, ri, sizeof(*ri)-8); + if (crc != je32_to_cpu(ri->node_crc)) { + printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on " + "node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ofs, je32_to_cpu(ri->node_crc), crc); + /* + * We believe totlen because the CRC on the node + * _header_ was OK, just the node itself failed. + */ + return jffs2_scan_dirty_space(c, jeb, + PAD(je32_to_cpu(ri->totlen))); + } + + ic = jffs2_get_ino_cache(c, ino); + if (!ic) { + ic = jffs2_scan_make_ino_cache(c, ino); + if (!ic) + return -ENOMEM; + } + + /* Wheee. It worked */ + jffs2_link_node_ref(c, jeb, ofs | REF_UNCHECKED, PAD(je32_to_cpu(ri->totlen)), ic); + + D1(printk(KERN_DEBUG "Node is ino #%u, version %d. Range 0x%x-0x%x\n", + je32_to_cpu(ri->ino), je32_to_cpu(ri->version), + je32_to_cpu(ri->offset), + je32_to_cpu(ri->offset)+je32_to_cpu(ri->dsize))); + + pseudo_random += je32_to_cpu(ri->version); + + if (jffs2_sum_active()) { + jffs2_sum_add_inode_mem(s, ri, ofs - jeb->offset); + } + + return 0; +} + +static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s) +{ + struct jffs2_full_dirent *fd; + struct jffs2_inode_cache *ic; + uint32_t checkedlen; + uint32_t crc; + int err; + + D1(printk(KERN_DEBUG "jffs2_scan_dirent_node(): Node at 0x%08x\n", ofs)); + + /* We don't get here unless the node is still valid, so we don't have to + mask in the ACCURATE bit any more. */ + crc = crc32(0, rd, sizeof(*rd)-8); + + if (crc != je32_to_cpu(rd->node_crc)) { + printk(KERN_NOTICE "jffs2_scan_dirent_node(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ofs, je32_to_cpu(rd->node_crc), crc); + /* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */ + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen))))) + return err; + return 0; + } + + pseudo_random += je32_to_cpu(rd->version); + + /* Should never happen. Did. (OLPC trac #4184)*/ + checkedlen = strnlen(rd->name, rd->nsize); + if (checkedlen < rd->nsize) { + printk(KERN_ERR "Dirent at %08x has zeroes in name. Truncating to %d chars\n", + ofs, checkedlen); + } + fd = jffs2_alloc_full_dirent(checkedlen+1); + if (!fd) { + return -ENOMEM; + } + memcpy(&fd->name, rd->name, checkedlen); + fd->name[checkedlen] = 0; + + crc = crc32(0, fd->name, rd->nsize); + if (crc != je32_to_cpu(rd->name_crc)) { + printk(KERN_NOTICE "jffs2_scan_dirent_node(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ofs, je32_to_cpu(rd->name_crc), crc); + D1(printk(KERN_NOTICE "Name for which CRC failed is (now) '%s', ino #%d\n", fd->name, je32_to_cpu(rd->ino))); + jffs2_free_full_dirent(fd); + /* FIXME: Why do we believe totlen? */ + /* We believe totlen because the CRC on the node _header_ was OK, just the name failed. 
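The dirent path above guards against names with embedded NULs (the OLPC trac #4184 case): the length read from flash is clamped with strnlen() before the name is copied, terminated and hashed. The same clamp in stand-alone form, with full_name_hash() replaced by a toy hash purely for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static uint32_t toy_name_hash(const char *name, size_t len)
{
        uint32_t h = 5381;

        while (len--)
                h = h * 33 + (uint8_t)*name++;
        return h;
}

static char *copy_dirent_name(const char *raw, size_t nsize, uint32_t *hash)
{
        size_t checkedlen = strnlen(raw, nsize);        /* stop at an embedded NUL */
        char *name = malloc(checkedlen + 1);

        if (!name)
                return NULL;
        memcpy(name, raw, checkedlen);
        name[checkedlen] = '\0';
        *hash = toy_name_hash(name, checkedlen);
        return name;
}

int main(void)
{
        const char raw[8] = { 'f', 'o', 'o', '\0', 'X', 'X', 'X', 'X' };
        uint32_t hash;
        char *name = copy_dirent_name(raw, sizeof(raw), &hash);

        printf("name='%s' hash=%08x\n", name, hash);    /* truncated to "foo" */
        free(name);
        return 0;
}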
*/ + if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen))))) + return err; + return 0; + } + ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(rd->pino)); + if (!ic) { + jffs2_free_full_dirent(fd); + return -ENOMEM; + } + + fd->raw = jffs2_link_node_ref(c, jeb, ofs | dirent_node_state(rd), + PAD(je32_to_cpu(rd->totlen)), ic); + + fd->next = NULL; + fd->version = je32_to_cpu(rd->version); + fd->ino = je32_to_cpu(rd->ino); + fd->nhash = full_name_hash(fd->name, checkedlen); + fd->type = rd->type; + jffs2_add_fd_to_list(c, fd, &ic->scan_dents); + + if (jffs2_sum_active()) { + jffs2_sum_add_dirent_mem(s, rd, ofs - jeb->offset); + } + + return 0; +} + +static int count_list(struct list_head *l) +{ + uint32_t count = 0; + struct list_head *tmp; + + list_for_each(tmp, l) { + count++; + } + return count; +} + +/* Note: This breaks if list_empty(head). I don't care. You + might, if you copy this code and use it elsewhere :) */ +static void rotate_list(struct list_head *head, uint32_t count) +{ + struct list_head *n = head->next; + + list_del(head); + while(count--) { + n = n->next; + } + list_add(head, n); +} + +void jffs2_rotate_lists(struct jffs2_sb_info *c) +{ + uint32_t x; + uint32_t rotateby; + + x = count_list(&c->clean_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->clean_list), rotateby); + } + + x = count_list(&c->very_dirty_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->very_dirty_list), rotateby); + } + + x = count_list(&c->dirty_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->dirty_list), rotateby); + } + + x = count_list(&c->erasable_list); + if (x) { + rotateby = pseudo_random % x; + rotate_list((&c->erasable_list), rotateby); + } + + if (c->nr_erasing_blocks) { + rotateby = pseudo_random % c->nr_erasing_blocks; + rotate_list((&c->erase_pending_list), rotateby); + } + + if (c->nr_free_blocks) { + rotateby = pseudo_random % c->nr_free_blocks; + rotate_list((&c->free_list), rotateby); + } +} diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c new file mode 100644 index 00000000..cfeb7164 --- /dev/null +++ b/fs/jffs2/security.c @@ -0,0 +1,86 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. 
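jffs2_rotate_lists() above rotates every block list by pseudo_random modulo its length, so that after each mount allocation and garbage collection do not always start on the same eraseblocks. An array-based stand-in for the circular list_head rotation (the real code re-inserts the list head in place rather than copying elements).

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void rotate(uint32_t *blocks, size_t n, uint32_t rotateby)
{
        uint32_t tmp[16];                       /* toy limit: n must be <= 16 here */

        if (!n || n > 16)
                return;
        rotateby %= n;                          /* same clamp as pseudo_random % x */
        memcpy(tmp, blocks, n * sizeof(*blocks));
        for (size_t i = 0; i < n; i++)
                blocks[i] = tmp[(i + rotateby) % n];
}

int main(void)
{
        uint32_t clean_list[] = { 0, 1, 2, 3, 4, 5 };   /* block numbers */
        uint32_t pseudo_random = 0xC0FFEE;              /* accumulated during the scan */

        rotate(clean_list, 6, pseudo_random);           /* 0xC0FFEE % 6 == 4 */
        for (size_t i = 0; i < 6; i++)
                printf("%u ", clean_list[i]);
        printf("\n");                                   /* prints "4 5 0 1 2 3" */
        return 0;
}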
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" + +/* ---- Initial Security Label Attachment -------------- */ +int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int rc; + size_t len; + void *value; + char *name; + + rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len); + if (rc) { + if (rc == -EOPNOTSUPP) + return 0; + return rc; + } + rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0); + + kfree(name); + kfree(value); + return rc; +} + +/* ---- XATTR Handler for "security.*" ----------------- */ +static int jffs2_security_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size); +} + +static int jffs2_security_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size, flags); +} + +static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; + + if (list && retlen <= list_size) { + strcpy(list, XATTR_SECURITY_PREFIX); + strcpy(list + XATTR_SECURITY_PREFIX_LEN, name); + } + + return retlen; +} + +const struct xattr_handler jffs2_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = jffs2_security_listxattr, + .set = jffs2_security_setxattr, + .get = jffs2_security_getxattr +}; diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c new file mode 100644 index 00000000..e537fb0e --- /dev/null +++ b/fs/jffs2/summary.c @@ -0,0 +1,869 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2004 Ferenc Havasi , + * Zoltan Sogor , + * Patrik Kluba , + * University of Szeged, Hungary + * 2006 KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. 
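The listxattr handler above follows the usual xattr listing contract: always report how much room the "security.<name>\0" entry needs, and only write it out when the caller's buffer is large enough. The same logic as a stand-alone sketch with a local prefix constant.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

#define SEC_PREFIX     "security."
#define SEC_PREFIX_LEN (sizeof(SEC_PREFIX) - 1)

static size_t list_security_entry(char *list, size_t list_size, const char *name)
{
        size_t name_len = strlen(name);
        size_t retlen = SEC_PREFIX_LEN + name_len + 1;  /* includes the trailing NUL */

        if (list && retlen <= list_size) {
                memcpy(list, SEC_PREFIX, SEC_PREFIX_LEN);
                memcpy(list + SEC_PREFIX_LEN, name, name_len + 1);
        }
        return retlen;                                  /* reported even if nothing written */
}

int main(void)
{
        char buf[64];
        size_t need = list_security_entry(NULL, 0, "selinux");  /* pure size query */

        printf("need %zu bytes\n", need);
        list_security_entry(buf, sizeof(buf), "selinux");
        printf("entry: %s\n", buf);                     /* "security.selinux" */
        return 0;
}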
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "debug.h" + +int jffs2_sum_init(struct jffs2_sb_info *c) +{ + uint32_t sum_size = min_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE); + + c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); + + if (!c->summary) { + JFFS2_WARNING("Can't allocate memory for summary information!\n"); + return -ENOMEM; + } + + c->summary->sum_buf = kmalloc(sum_size, GFP_KERNEL); + + if (!c->summary->sum_buf) { + JFFS2_WARNING("Can't allocate buffer for writing out summary information!\n"); + kfree(c->summary); + return -ENOMEM; + } + + dbg_summary("returned successfully\n"); + + return 0; +} + +void jffs2_sum_exit(struct jffs2_sb_info *c) +{ + dbg_summary("called\n"); + + jffs2_sum_disable_collecting(c->summary); + + kfree(c->summary->sum_buf); + c->summary->sum_buf = NULL; + + kfree(c->summary); + c->summary = NULL; +} + +static int jffs2_sum_add_mem(struct jffs2_summary *s, union jffs2_sum_mem *item) +{ + if (!s->sum_list_head) + s->sum_list_head = (union jffs2_sum_mem *) item; + if (s->sum_list_tail) + s->sum_list_tail->u.next = (union jffs2_sum_mem *) item; + s->sum_list_tail = (union jffs2_sum_mem *) item; + + switch (je16_to_cpu(item->u.nodetype)) { + case JFFS2_NODETYPE_INODE: + s->sum_size += JFFS2_SUMMARY_INODE_SIZE; + s->sum_num++; + dbg_summary("inode (%u) added to summary\n", + je32_to_cpu(item->i.inode)); + break; + case JFFS2_NODETYPE_DIRENT: + s->sum_size += JFFS2_SUMMARY_DIRENT_SIZE(item->d.nsize); + s->sum_num++; + dbg_summary("dirent (%u) added to summary\n", + je32_to_cpu(item->d.ino)); + break; +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: + s->sum_size += JFFS2_SUMMARY_XATTR_SIZE; + s->sum_num++; + dbg_summary("xattr (xid=%u, version=%u) added to summary\n", + je32_to_cpu(item->x.xid), je32_to_cpu(item->x.version)); + break; + case JFFS2_NODETYPE_XREF: + s->sum_size += JFFS2_SUMMARY_XREF_SIZE; + s->sum_num++; + dbg_summary("xref added to summary\n"); + break; +#endif + default: + JFFS2_WARNING("UNKNOWN node type %u\n", + je16_to_cpu(item->u.nodetype)); + return 1; + } + return 0; +} + + +/* The following 3 functions are called from scan.c to collect summary info for not closed jeb */ + +int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size) +{ + dbg_summary("called with %u\n", size); + s->sum_padded += size; + return 0; +} + +int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, + uint32_t ofs) +{ + struct jffs2_sum_inode_mem *temp = kmalloc(sizeof(struct jffs2_sum_inode_mem), GFP_KERNEL); + + if (!temp) + return -ENOMEM; + + temp->nodetype = ri->nodetype; + temp->inode = ri->ino; + temp->version = ri->version; + temp->offset = cpu_to_je32(ofs); /* relative offset from the beginning of the jeb */ + temp->totlen = ri->totlen; + temp->next = NULL; + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} + +int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, + uint32_t ofs) +{ + struct jffs2_sum_dirent_mem *temp = + kmalloc(sizeof(struct jffs2_sum_dirent_mem) + rd->nsize, GFP_KERNEL); + + if (!temp) + return -ENOMEM; + + temp->nodetype = rd->nodetype; + temp->totlen = rd->totlen; + temp->offset = cpu_to_je32(ofs); /* relative from the beginning of the jeb */ + temp->pino = rd->pino; + temp->version = rd->version; + temp->ino = rd->ino; + temp->nsize = rd->nsize; + temp->type = rd->type; + temp->next = NULL; + + memcpy(temp->name, rd->name, rd->nsize); + + return 
jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} + +#ifdef CONFIG_JFFS2_FS_XATTR +int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs) +{ + struct jffs2_sum_xattr_mem *temp; + + temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL); + if (!temp) + return -ENOMEM; + + temp->nodetype = rx->nodetype; + temp->xid = rx->xid; + temp->version = rx->version; + temp->offset = cpu_to_je32(ofs); + temp->totlen = rx->totlen; + temp->next = NULL; + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} + +int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs) +{ + struct jffs2_sum_xref_mem *temp; + + temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL); + if (!temp) + return -ENOMEM; + + temp->nodetype = rr->nodetype; + temp->offset = cpu_to_je32(ofs); + temp->next = NULL; + + return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp); +} +#endif +/* Cleanup every collected summary information */ + +static void jffs2_sum_clean_collected(struct jffs2_summary *s) +{ + union jffs2_sum_mem *temp; + + if (!s->sum_list_head) { + dbg_summary("already empty\n"); + } + while (s->sum_list_head) { + temp = s->sum_list_head; + s->sum_list_head = s->sum_list_head->u.next; + kfree(temp); + } + s->sum_list_tail = NULL; + s->sum_padded = 0; + s->sum_num = 0; +} + +void jffs2_sum_reset_collected(struct jffs2_summary *s) +{ + dbg_summary("called\n"); + jffs2_sum_clean_collected(s); + s->sum_size = 0; +} + +void jffs2_sum_disable_collecting(struct jffs2_summary *s) +{ + dbg_summary("called\n"); + jffs2_sum_clean_collected(s); + s->sum_size = JFFS2_SUMMARY_NOSUM_SIZE; +} + +int jffs2_sum_is_disabled(struct jffs2_summary *s) +{ + return (s->sum_size == JFFS2_SUMMARY_NOSUM_SIZE); +} + +/* Move the collected summary information into sb (called from scan.c) */ + +void jffs2_sum_move_collected(struct jffs2_sb_info *c, struct jffs2_summary *s) +{ + dbg_summary("oldsize=0x%x oldnum=%u => newsize=0x%x newnum=%u\n", + c->summary->sum_size, c->summary->sum_num, + s->sum_size, s->sum_num); + + c->summary->sum_size = s->sum_size; + c->summary->sum_num = s->sum_num; + c->summary->sum_padded = s->sum_padded; + c->summary->sum_list_head = s->sum_list_head; + c->summary->sum_list_tail = s->sum_list_tail; + + s->sum_list_head = s->sum_list_tail = NULL; +} + +/* Called from wbuf.c to collect writed node info */ + +int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs, + unsigned long count, uint32_t ofs) +{ + union jffs2_node_union *node; + struct jffs2_eraseblock *jeb; + + if (c->summary->sum_size == JFFS2_SUMMARY_NOSUM_SIZE) { + dbg_summary("Summary is disabled for this jeb! 
Skipping summary info!\n"); + return 0; + } + + node = invecs[0].iov_base; + jeb = &c->blocks[ofs / c->sector_size]; + ofs -= jeb->offset; + + switch (je16_to_cpu(node->u.nodetype)) { + case JFFS2_NODETYPE_INODE: { + struct jffs2_sum_inode_mem *temp = + kmalloc(sizeof(struct jffs2_sum_inode_mem), GFP_KERNEL); + + if (!temp) + goto no_mem; + + temp->nodetype = node->i.nodetype; + temp->inode = node->i.ino; + temp->version = node->i.version; + temp->offset = cpu_to_je32(ofs); + temp->totlen = node->i.totlen; + temp->next = NULL; + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } + + case JFFS2_NODETYPE_DIRENT: { + struct jffs2_sum_dirent_mem *temp = + kmalloc(sizeof(struct jffs2_sum_dirent_mem) + node->d.nsize, GFP_KERNEL); + + if (!temp) + goto no_mem; + + temp->nodetype = node->d.nodetype; + temp->totlen = node->d.totlen; + temp->offset = cpu_to_je32(ofs); + temp->pino = node->d.pino; + temp->version = node->d.version; + temp->ino = node->d.ino; + temp->nsize = node->d.nsize; + temp->type = node->d.type; + temp->next = NULL; + + switch (count) { + case 1: + memcpy(temp->name,node->d.name,node->d.nsize); + break; + + case 2: + memcpy(temp->name,invecs[1].iov_base,node->d.nsize); + break; + + default: + BUG(); /* impossible count value */ + break; + } + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: { + struct jffs2_sum_xattr_mem *temp; + temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL); + if (!temp) + goto no_mem; + + temp->nodetype = node->x.nodetype; + temp->xid = node->x.xid; + temp->version = node->x.version; + temp->totlen = node->x.totlen; + temp->offset = cpu_to_je32(ofs); + temp->next = NULL; + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } + case JFFS2_NODETYPE_XREF: { + struct jffs2_sum_xref_mem *temp; + temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL); + if (!temp) + goto no_mem; + temp->nodetype = node->r.nodetype; + temp->offset = cpu_to_je32(ofs); + temp->next = NULL; + + return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp); + } +#endif + case JFFS2_NODETYPE_PADDING: + dbg_summary("node PADDING\n"); + c->summary->sum_padded += je32_to_cpu(node->u.totlen); + break; + + case JFFS2_NODETYPE_CLEANMARKER: + dbg_summary("node CLEANMARKER\n"); + break; + + case JFFS2_NODETYPE_SUMMARY: + dbg_summary("node SUMMARY\n"); + break; + + default: + /* If you implement a new node type you should also implement + summary support for it or disable summary. + */ + BUG(); + break; + } + + return 0; + +no_mem: + JFFS2_WARNING("MEMORY ALLOCATION ERROR!"); + return -ENOMEM; +} + +static struct jffs2_raw_node_ref *sum_link_node_ref(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, + uint32_t ofs, uint32_t len, + struct jffs2_inode_cache *ic) +{ + /* If there was a gap, mark it dirty */ + if ((ofs & ~3) > c->sector_size - jeb->free_size) { + /* Ew. 
Summary doesn't actually tell us explicitly about dirty space */ + jffs2_scan_dirty_space(c, jeb, (ofs & ~3) - (c->sector_size - jeb->free_size)); + } + + return jffs2_link_node_ref(c, jeb, jeb->offset + ofs, len, ic); +} + +/* Process the stored summary information - helper function for jffs2_sum_scan_sumnode() */ + +static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_summary *summary, uint32_t *pseudo_random) +{ + struct jffs2_inode_cache *ic; + struct jffs2_full_dirent *fd; + void *sp; + int i, ino; + int err; + + sp = summary->sum; + + for (i=0; isum_num); i++) { + dbg_summary("processing summary index %d\n", i); + + cond_resched(); + + /* Make sure there's a spare ref for dirty space */ + err = jffs2_prealloc_raw_node_refs(c, jeb, 2); + if (err) + return err; + + switch (je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype)) { + case JFFS2_NODETYPE_INODE: { + struct jffs2_sum_inode_flash *spi; + spi = sp; + + ino = je32_to_cpu(spi->inode); + + dbg_summary("Inode at 0x%08x-0x%08x\n", + jeb->offset + je32_to_cpu(spi->offset), + jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spi->totlen)); + + ic = jffs2_scan_make_ino_cache(c, ino); + if (!ic) { + JFFS2_NOTICE("scan_make_ino_cache failed\n"); + return -ENOMEM; + } + + sum_link_node_ref(c, jeb, je32_to_cpu(spi->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spi->totlen)), ic); + + *pseudo_random += je32_to_cpu(spi->version); + + sp += JFFS2_SUMMARY_INODE_SIZE; + + break; + } + + case JFFS2_NODETYPE_DIRENT: { + struct jffs2_sum_dirent_flash *spd; + int checkedlen; + spd = sp; + + dbg_summary("Dirent at 0x%08x-0x%08x\n", + jeb->offset + je32_to_cpu(spd->offset), + jeb->offset + je32_to_cpu(spd->offset) + je32_to_cpu(spd->totlen)); + + + /* This should never happen, but https://dev.laptop.org/ticket/4184 */ + checkedlen = strnlen(spd->name, spd->nsize); + if (!checkedlen) { + printk(KERN_ERR "Dirent at %08x has zero at start of name. Aborting mount.\n", + jeb->offset + je32_to_cpu(spd->offset)); + return -EIO; + } + if (checkedlen < spd->nsize) { + printk(KERN_ERR "Dirent at %08x has zeroes in name. 
Truncating to %d chars\n", + jeb->offset + je32_to_cpu(spd->offset), checkedlen); + } + + + fd = jffs2_alloc_full_dirent(checkedlen+1); + if (!fd) + return -ENOMEM; + + memcpy(&fd->name, spd->name, checkedlen); + fd->name[checkedlen] = 0; + + ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(spd->pino)); + if (!ic) { + jffs2_free_full_dirent(fd); + return -ENOMEM; + } + + fd->raw = sum_link_node_ref(c, jeb, je32_to_cpu(spd->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spd->totlen)), ic); + + fd->next = NULL; + fd->version = je32_to_cpu(spd->version); + fd->ino = je32_to_cpu(spd->ino); + fd->nhash = full_name_hash(fd->name, checkedlen); + fd->type = spd->type; + + jffs2_add_fd_to_list(c, fd, &ic->scan_dents); + + *pseudo_random += je32_to_cpu(spd->version); + + sp += JFFS2_SUMMARY_DIRENT_SIZE(spd->nsize); + + break; + } +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: { + struct jffs2_xattr_datum *xd; + struct jffs2_sum_xattr_flash *spx; + + spx = (struct jffs2_sum_xattr_flash *)sp; + dbg_summary("xattr at %#08x-%#08x (xid=%u, version=%u)\n", + jeb->offset + je32_to_cpu(spx->offset), + jeb->offset + je32_to_cpu(spx->offset) + je32_to_cpu(spx->totlen), + je32_to_cpu(spx->xid), je32_to_cpu(spx->version)); + + xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid), + je32_to_cpu(spx->version)); + if (IS_ERR(xd)) + return PTR_ERR(xd); + if (xd->version > je32_to_cpu(spx->version)) { + /* node is not the newest one */ + struct jffs2_raw_node_ref *raw + = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spx->totlen)), NULL); + raw->next_in_ino = xd->node->next_in_ino; + xd->node->next_in_ino = raw; + } else { + xd->version = je32_to_cpu(spx->version); + sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spx->totlen)), (void *)xd); + } + *pseudo_random += je32_to_cpu(spx->xid); + sp += JFFS2_SUMMARY_XATTR_SIZE; + + break; + } + case JFFS2_NODETYPE_XREF: { + struct jffs2_xattr_ref *ref; + struct jffs2_sum_xref_flash *spr; + + spr = (struct jffs2_sum_xref_flash *)sp; + dbg_summary("xref at %#08x-%#08x\n", + jeb->offset + je32_to_cpu(spr->offset), + jeb->offset + je32_to_cpu(spr->offset) + + (uint32_t)PAD(sizeof(struct jffs2_raw_xref))); + + ref = jffs2_alloc_xattr_ref(); + if (!ref) { + JFFS2_NOTICE("allocation of xattr_datum failed\n"); + return -ENOMEM; + } + ref->next = c->xref_temp; + c->xref_temp = ref; + + sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED, + PAD(sizeof(struct jffs2_raw_xref)), (void *)ref); + + *pseudo_random += ref->node->flash_offset; + sp += JFFS2_SUMMARY_XREF_SIZE; + + break; + } +#endif + default : { + uint16_t nodetype = je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype); + JFFS2_WARNING("Unsupported node type %x found in summary! 
Exiting...\n", nodetype); + if ((nodetype & JFFS2_COMPAT_MASK) == JFFS2_FEATURE_INCOMPAT) + return -EIO; + + /* For compatible node types, just fall back to the full scan */ + c->wasted_size -= jeb->wasted_size; + c->free_size += c->sector_size - jeb->free_size; + c->used_size -= jeb->used_size; + c->dirty_size -= jeb->dirty_size; + jeb->wasted_size = jeb->used_size = jeb->dirty_size = 0; + jeb->free_size = c->sector_size; + + jffs2_free_jeb_node_refs(c, jeb); + return -ENOTRECOVERABLE; + } + } + } + return 0; +} + +/* Process the summary node - called from jffs2_scan_eraseblock() */ +int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_summary *summary, uint32_t sumsize, + uint32_t *pseudo_random) +{ + struct jffs2_unknown_node crcnode; + int ret, ofs; + uint32_t crc; + + ofs = c->sector_size - sumsize; + + dbg_summary("summary found for 0x%08x at 0x%08x (0x%x bytes)\n", + jeb->offset, jeb->offset + ofs, sumsize); + + /* OK, now check for node validity and CRC */ + crcnode.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + crcnode.nodetype = cpu_to_je16(JFFS2_NODETYPE_SUMMARY); + crcnode.totlen = summary->totlen; + crc = crc32(0, &crcnode, sizeof(crcnode)-4); + + if (je32_to_cpu(summary->hdr_crc) != crc) { + dbg_summary("Summary node header is corrupt (bad CRC or " + "no summary at all)\n"); + goto crc_err; + } + + if (je32_to_cpu(summary->totlen) != sumsize) { + dbg_summary("Summary node is corrupt (wrong erasesize?)\n"); + goto crc_err; + } + + crc = crc32(0, summary, sizeof(struct jffs2_raw_summary)-8); + + if (je32_to_cpu(summary->node_crc) != crc) { + dbg_summary("Summary node is corrupt (bad CRC)\n"); + goto crc_err; + } + + crc = crc32(0, summary->sum, sumsize - sizeof(struct jffs2_raw_summary)); + + if (je32_to_cpu(summary->sum_crc) != crc) { + dbg_summary("Summary node data is corrupt (bad CRC)\n"); + goto crc_err; + } + + if ( je32_to_cpu(summary->cln_mkr) ) { + + dbg_summary("Summary : CLEANMARKER node \n"); + + ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); + if (ret) + return ret; + + if (je32_to_cpu(summary->cln_mkr) != c->cleanmarker_size) { + dbg_summary("CLEANMARKER node has totlen 0x%x != normal 0x%x\n", + je32_to_cpu(summary->cln_mkr), c->cleanmarker_size); + if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr))))) + return ret; + } else if (jeb->first_node) { + dbg_summary("CLEANMARKER node not first node in block " + "(0x%08x)\n", jeb->offset); + if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr))))) + return ret; + } else { + jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, + je32_to_cpu(summary->cln_mkr), NULL); + } + } + + ret = jffs2_sum_process_sum_data(c, jeb, summary, pseudo_random); + /* -ENOTRECOVERABLE isn't a fatal error -- it means we should do a full + scan of this eraseblock. 
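A summary node is only believed after three CRCs pass in sequence: the common 12-byte header CRC (which the driver computes over a re-assembled header with the ACCURATE bit forced; that detail is dropped here), the summary node's own CRC over its fixed fields, and a CRC over the variable-length record area. A compact stand-alone model of that layering; the 12- and 32-byte lengths match the paranoia checks later in this patch, everything else is illustrative.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define HDR_LEN   12u   /* stand-in for sizeof(struct jffs2_unknown_node) */
#define FIXED_LEN 32u   /* stand-in for sizeof(struct jffs2_raw_summary)  */

static uint32_t crc32_sketch(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u : crc >> 1;
        }
        return crc;
}

struct stored_crcs { uint32_t hdr, node, sum; };        /* as read back from flash */

static int summary_trustworthy(const uint8_t *node, size_t sumsize,
                               const struct stored_crcs *c)
{
        if (crc32_sketch(0, node, HDR_LEN - 4) != c->hdr)
                return 0;       /* bad header: corrupt, or no summary at all */
        if (crc32_sketch(0, node, FIXED_LEN - 8) != c->node)
                return 0;       /* fixed part of the summary node is corrupt */
        if (crc32_sketch(0, node + FIXED_LEN, sumsize - FIXED_LEN) != c->sum)
                return 0;       /* record payload is corrupt */
        return 1;
}

int main(void)
{
        uint8_t node[64] = { 0x85, 0x19, 0x06, 0x20 };  /* arbitrary contents */
        struct stored_crcs c = {
                .hdr  = crc32_sketch(0, node, HDR_LEN - 4),
                .node = crc32_sketch(0, node, FIXED_LEN - 8),
                .sum  = crc32_sketch(0, node + FIXED_LEN, sizeof(node) - FIXED_LEN),
        };

        printf("trustworthy: %d\n", summary_trustworthy(node, sizeof(node), &c));
        return 0;
}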
So return zero */ + if (ret == -ENOTRECOVERABLE) + return 0; + if (ret) + return ret; /* real error */ + + /* for PARANOIA_CHECK */ + ret = jffs2_prealloc_raw_node_refs(c, jeb, 2); + if (ret) + return ret; + + sum_link_node_ref(c, jeb, ofs | REF_NORMAL, sumsize, NULL); + + if (unlikely(jeb->free_size)) { + JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n", + jeb->free_size, jeb->offset); + jeb->wasted_size += jeb->free_size; + c->wasted_size += jeb->free_size; + c->free_size -= jeb->free_size; + jeb->free_size = 0; + } + + return jffs2_scan_classify_jeb(c, jeb); + +crc_err: + JFFS2_WARNING("Summary node crc error, skipping summary information.\n"); + + return 0; +} + +/* Write summary data to flash - helper function for jffs2_sum_write_sumnode() */ + +static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + uint32_t infosize, uint32_t datasize, int padsize) +{ + struct jffs2_raw_summary isum; + union jffs2_sum_mem *temp; + struct jffs2_sum_marker *sm; + struct kvec vecs[2]; + uint32_t sum_ofs; + void *wpage; + int ret; + size_t retlen; + + if (padsize + datasize > MAX_SUMMARY_SIZE) { + /* It won't fit in the buffer. Abort summary for this jeb */ + jffs2_sum_disable_collecting(c->summary); + + JFFS2_WARNING("Summary too big (%d data, %d pad) in eraseblock at %08x\n", + datasize, padsize, jeb->offset); + /* Non-fatal */ + return 0; + } + /* Is there enough space for summary? */ + if (padsize < 0) { + /* don't try to write out summary for this jeb */ + jffs2_sum_disable_collecting(c->summary); + + JFFS2_WARNING("Not enough space for summary, padsize = %d\n", + padsize); + /* Non-fatal */ + return 0; + } + + memset(c->summary->sum_buf, 0xff, datasize); + memset(&isum, 0, sizeof(isum)); + + isum.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + isum.nodetype = cpu_to_je16(JFFS2_NODETYPE_SUMMARY); + isum.totlen = cpu_to_je32(infosize); + isum.hdr_crc = cpu_to_je32(crc32(0, &isum, sizeof(struct jffs2_unknown_node) - 4)); + isum.padded = cpu_to_je32(c->summary->sum_padded); + isum.cln_mkr = cpu_to_je32(c->cleanmarker_size); + isum.sum_num = cpu_to_je32(c->summary->sum_num); + wpage = c->summary->sum_buf; + + while (c->summary->sum_num) { + temp = c->summary->sum_list_head; + + switch (je16_to_cpu(temp->u.nodetype)) { + case JFFS2_NODETYPE_INODE: { + struct jffs2_sum_inode_flash *sino_ptr = wpage; + + sino_ptr->nodetype = temp->i.nodetype; + sino_ptr->inode = temp->i.inode; + sino_ptr->version = temp->i.version; + sino_ptr->offset = temp->i.offset; + sino_ptr->totlen = temp->i.totlen; + + wpage += JFFS2_SUMMARY_INODE_SIZE; + + break; + } + + case JFFS2_NODETYPE_DIRENT: { + struct jffs2_sum_dirent_flash *sdrnt_ptr = wpage; + + sdrnt_ptr->nodetype = temp->d.nodetype; + sdrnt_ptr->totlen = temp->d.totlen; + sdrnt_ptr->offset = temp->d.offset; + sdrnt_ptr->pino = temp->d.pino; + sdrnt_ptr->version = temp->d.version; + sdrnt_ptr->ino = temp->d.ino; + sdrnt_ptr->nsize = temp->d.nsize; + sdrnt_ptr->type = temp->d.type; + + memcpy(sdrnt_ptr->name, temp->d.name, + temp->d.nsize); + + wpage += JFFS2_SUMMARY_DIRENT_SIZE(temp->d.nsize); + + break; + } +#ifdef CONFIG_JFFS2_FS_XATTR + case JFFS2_NODETYPE_XATTR: { + struct jffs2_sum_xattr_flash *sxattr_ptr = wpage; + + temp = c->summary->sum_list_head; + sxattr_ptr->nodetype = temp->x.nodetype; + sxattr_ptr->xid = temp->x.xid; + sxattr_ptr->version = temp->x.version; + sxattr_ptr->offset = temp->x.offset; + sxattr_ptr->totlen = temp->x.totlen; + + wpage += JFFS2_SUMMARY_XATTR_SIZE; + break; + } + case 
JFFS2_NODETYPE_XREF: { + struct jffs2_sum_xref_flash *sxref_ptr = wpage; + + temp = c->summary->sum_list_head; + sxref_ptr->nodetype = temp->r.nodetype; + sxref_ptr->offset = temp->r.offset; + + wpage += JFFS2_SUMMARY_XREF_SIZE; + break; + } +#endif + default : { + if ((je16_to_cpu(temp->u.nodetype) & JFFS2_COMPAT_MASK) + == JFFS2_FEATURE_RWCOMPAT_COPY) { + dbg_summary("Writing unknown RWCOMPAT_COPY node type %x\n", + je16_to_cpu(temp->u.nodetype)); + jffs2_sum_disable_collecting(c->summary); + } else { + BUG(); /* unknown node in summary information */ + } + } + } + + c->summary->sum_list_head = temp->u.next; + kfree(temp); + + c->summary->sum_num--; + } + + jffs2_sum_reset_collected(c->summary); + + wpage += padsize; + + sm = wpage; + sm->offset = cpu_to_je32(c->sector_size - jeb->free_size); + sm->magic = cpu_to_je32(JFFS2_SUM_MAGIC); + + isum.sum_crc = cpu_to_je32(crc32(0, c->summary->sum_buf, datasize)); + isum.node_crc = cpu_to_je32(crc32(0, &isum, sizeof(isum) - 8)); + + vecs[0].iov_base = &isum; + vecs[0].iov_len = sizeof(isum); + vecs[1].iov_base = c->summary->sum_buf; + vecs[1].iov_len = datasize; + + sum_ofs = jeb->offset + c->sector_size - jeb->free_size; + + dbg_summary("JFFS2: writing out data to flash to pos : 0x%08x\n", + sum_ofs); + + ret = jffs2_flash_writev(c, vecs, 2, sum_ofs, &retlen, 0); + + if (ret || (retlen != infosize)) { + + JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n", + infosize, sum_ofs, ret, retlen); + + if (retlen) { + /* Waste remaining space */ + spin_lock(&c->erase_completion_lock); + jffs2_link_node_ref(c, jeb, sum_ofs | REF_OBSOLETE, infosize, NULL); + spin_unlock(&c->erase_completion_lock); + } + + c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE; + + return 0; + } + + spin_lock(&c->erase_completion_lock); + jffs2_link_node_ref(c, jeb, sum_ofs | REF_NORMAL, infosize, NULL); + spin_unlock(&c->erase_completion_lock); + + return 0; +} + +/* Write out summary information - called from jffs2_do_reserve_space */ + +int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) +{ + int datasize, infosize, padsize; + struct jffs2_eraseblock *jeb; + int ret = 0; + + dbg_summary("called\n"); + + spin_unlock(&c->erase_completion_lock); + + jeb = c->nextblock; + jffs2_prealloc_raw_node_refs(c, jeb, 1); + + if (!c->summary->sum_num || !c->summary->sum_list_head) { + JFFS2_WARNING("Empty summary info!!!\n"); + BUG(); + } + + datasize = c->summary->sum_size + sizeof(struct jffs2_sum_marker); + infosize = sizeof(struct jffs2_raw_summary) + datasize; + padsize = jeb->free_size - infosize; + infosize += padsize; + datasize += padsize; + + ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize); + spin_lock(&c->erase_completion_lock); + return ret; +} diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h new file mode 100644 index 00000000..60207a2a --- /dev/null +++ b/fs/jffs2/summary.h @@ -0,0 +1,213 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2004 Ferenc Havasi , + * Zoltan Sogor , + * Patrik Kluba , + * University of Szeged, Hungary + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef JFFS2_SUMMARY_H +#define JFFS2_SUMMARY_H + +/* Limit summary size to 64KiB so that we can kmalloc it. If the summary + is larger than that, we have to just ditch it and avoid using summary + for the eraseblock in question... and it probably doesn't hurt us much + anyway. 
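jffs2_sum_write_sumnode() above sizes the summary so that it ends flush with the eraseblock: whatever is still free is handed to the node as padding, and the 8-byte marker lands in the block's last bytes. A worked example of that arithmetic with made-up sizes; the real ones come from the structures declared in summary.h.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t raw_summary_sz = 32;   /* stand-in for sizeof(struct jffs2_raw_summary) */
        uint32_t sum_marker_sz  = 8;    /* stand-in for sizeof(struct jffs2_sum_marker)  */
        uint32_t sum_size       = 420;  /* bytes of collected records for this block     */
        uint32_t free_size      = 2048; /* what is left in the eraseblock                */

        uint32_t datasize = sum_size + sum_marker_sz;           /* records + marker      */
        uint32_t infosize = raw_summary_sz + datasize;          /* whole node on flash   */
        int32_t  padsize  = (int32_t)free_size - (int32_t)infosize;

        if (padsize < 0) {
                printf("summary does not fit, skip it for this block\n");
                return 0;
        }
        infosize += padsize;            /* node is stretched to consume all free space */
        datasize += padsize;

        printf("datasize=%u infosize=%u padsize=%d\n", datasize, infosize, padsize);
        /* infosize == free_size, so the summary ends exactly at the block boundary */
        return 0;
}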
*/ +#define MAX_SUMMARY_SIZE 65536 + +#include +#include + +#define BLK_STATE_ALLFF 0 +#define BLK_STATE_CLEAN 1 +#define BLK_STATE_PARTDIRTY 2 +#define BLK_STATE_CLEANMARKER 3 +#define BLK_STATE_ALLDIRTY 4 +#define BLK_STATE_BADBLOCK 5 + +#define JFFS2_SUMMARY_NOSUM_SIZE 0xffffffff +#define JFFS2_SUMMARY_INODE_SIZE (sizeof(struct jffs2_sum_inode_flash)) +#define JFFS2_SUMMARY_DIRENT_SIZE(x) (sizeof(struct jffs2_sum_dirent_flash) + (x)) +#define JFFS2_SUMMARY_XATTR_SIZE (sizeof(struct jffs2_sum_xattr_flash)) +#define JFFS2_SUMMARY_XREF_SIZE (sizeof(struct jffs2_sum_xref_flash)) + +/* Summary structures used on flash */ + +struct jffs2_sum_unknown_flash +{ + jint16_t nodetype; /* node type */ +}; + +struct jffs2_sum_inode_flash +{ + jint16_t nodetype; /* node type */ + jint32_t inode; /* inode number */ + jint32_t version; /* inode version */ + jint32_t offset; /* offset on jeb */ + jint32_t totlen; /* record length */ +} __attribute__((packed)); + +struct jffs2_sum_dirent_flash +{ + jint16_t nodetype; /* == JFFS_NODETYPE_DIRENT */ + jint32_t totlen; /* record length */ + jint32_t offset; /* offset on jeb */ + jint32_t pino; /* parent inode */ + jint32_t version; /* dirent version */ + jint32_t ino; /* == zero for unlink */ + uint8_t nsize; /* dirent name size */ + uint8_t type; /* dirent type */ + uint8_t name[0]; /* dirent name */ +} __attribute__((packed)); + +struct jffs2_sum_xattr_flash +{ + jint16_t nodetype; /* == JFFS2_NODETYPE_XATR */ + jint32_t xid; /* xattr identifier */ + jint32_t version; /* version number */ + jint32_t offset; /* offset on jeb */ + jint32_t totlen; /* node length */ +} __attribute__((packed)); + +struct jffs2_sum_xref_flash +{ + jint16_t nodetype; /* == JFFS2_NODETYPE_XREF */ + jint32_t offset; /* offset on jeb */ +} __attribute__((packed)); + +union jffs2_sum_flash +{ + struct jffs2_sum_unknown_flash u; + struct jffs2_sum_inode_flash i; + struct jffs2_sum_dirent_flash d; + struct jffs2_sum_xattr_flash x; + struct jffs2_sum_xref_flash r; +}; + +/* Summary structures used in the memory */ + +struct jffs2_sum_unknown_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; /* node type */ +}; + +struct jffs2_sum_inode_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; /* node type */ + jint32_t inode; /* inode number */ + jint32_t version; /* inode version */ + jint32_t offset; /* offset on jeb */ + jint32_t totlen; /* record length */ +} __attribute__((packed)); + +struct jffs2_sum_dirent_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; /* == JFFS_NODETYPE_DIRENT */ + jint32_t totlen; /* record length */ + jint32_t offset; /* ofset on jeb */ + jint32_t pino; /* parent inode */ + jint32_t version; /* dirent version */ + jint32_t ino; /* == zero for unlink */ + uint8_t nsize; /* dirent name size */ + uint8_t type; /* dirent type */ + uint8_t name[0]; /* dirent name */ +} __attribute__((packed)); + +struct jffs2_sum_xattr_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; + jint32_t xid; + jint32_t version; + jint32_t offset; + jint32_t totlen; +} __attribute__((packed)); + +struct jffs2_sum_xref_mem +{ + union jffs2_sum_mem *next; + jint16_t nodetype; + jint32_t offset; +} __attribute__((packed)); + +union jffs2_sum_mem +{ + struct jffs2_sum_unknown_mem u; + struct jffs2_sum_inode_mem i; + struct jffs2_sum_dirent_mem d; + struct jffs2_sum_xattr_mem x; + struct jffs2_sum_xref_mem r; +}; + +/* Summary related information stored in superblock */ + +struct jffs2_summary +{ + uint32_t sum_size; /* collected summary information for nextblock */ + 
uint32_t sum_num; + uint32_t sum_padded; + union jffs2_sum_mem *sum_list_head; + union jffs2_sum_mem *sum_list_tail; + + jint32_t *sum_buf; /* buffer for writing out summary */ +}; + +/* Summary marker is stored at the end of every sumarized erase block */ + +struct jffs2_sum_marker +{ + jint32_t offset; /* offset of the summary node in the jeb */ + jint32_t magic; /* == JFFS2_SUM_MAGIC */ +}; + +#define JFFS2_SUMMARY_FRAME_SIZE (sizeof(struct jffs2_raw_summary) + sizeof(struct jffs2_sum_marker)) + +#ifdef CONFIG_JFFS2_SUMMARY /* SUMMARY SUPPORT ENABLED */ + +#define jffs2_sum_active() (1) +int jffs2_sum_init(struct jffs2_sb_info *c); +void jffs2_sum_exit(struct jffs2_sb_info *c); +void jffs2_sum_disable_collecting(struct jffs2_summary *s); +int jffs2_sum_is_disabled(struct jffs2_summary *s); +void jffs2_sum_reset_collected(struct jffs2_summary *s); +void jffs2_sum_move_collected(struct jffs2_sb_info *c, struct jffs2_summary *s); +int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs, + unsigned long count, uint32_t to); +int jffs2_sum_write_sumnode(struct jffs2_sb_info *c); +int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size); +int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, uint32_t ofs); +int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, uint32_t ofs); +int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs); +int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs); +int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, + struct jffs2_raw_summary *summary, uint32_t sumlen, + uint32_t *pseudo_random); + +#else /* SUMMARY DISABLED */ + +#define jffs2_sum_active() (0) +#define jffs2_sum_init(a) (0) +#define jffs2_sum_exit(a) +#define jffs2_sum_disable_collecting(a) +#define jffs2_sum_is_disabled(a) (0) +#define jffs2_sum_reset_collected(a) +#define jffs2_sum_add_kvec(a,b,c,d) (0) +#define jffs2_sum_move_collected(a,b) +#define jffs2_sum_write_sumnode(a) (0) +#define jffs2_sum_add_padding_mem(a,b) +#define jffs2_sum_add_inode_mem(a,b,c) +#define jffs2_sum_add_dirent_mem(a,b,c) +#define jffs2_sum_add_xattr_mem(a,b,c) +#define jffs2_sum_add_xref_mem(a,b,c) +#define jffs2_sum_scan_sumnode(a,b,c,d,e) (0) + +#endif /* CONFIG_JFFS2_SUMMARY */ + +#endif /* JFFS2_SUMMARY_H */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c new file mode 100644 index 00000000..853b8e30 --- /dev/null +++ b/fs/jffs2/super.c @@ -0,0 +1,316 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. 
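The disabled-summary half of summary.h above turns jffs2_sum_active() into a constant 0 and every summary call into a no-op macro, so call sites still compile but the optimizer drops them and no stub code needs to be linked. The same pattern in isolation, with FEATURE_SUMMARY standing in for CONFIG_JFFS2_SUMMARY.

#include <stdio.h>

/* #define FEATURE_SUMMARY 1 */

#ifdef FEATURE_SUMMARY
#define feature_active()  (1)
static int feature_record(int x) { printf("recorded %d\n", x); return 0; }
#else
#define feature_active()  (0)
#define feature_record(x) (0)   /* call sites compile, but do nothing */
#endif

int main(void)
{
        if (feature_active())
                feature_record(42);     /* dead code when the feature is disabled */
        printf("done\n");
        return 0;
}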
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compr.h" +#include "nodelist.h" + +static void jffs2_put_super(struct super_block *); + +static struct kmem_cache *jffs2_inode_cachep; + +static struct inode *jffs2_alloc_inode(struct super_block *sb) +{ + struct jffs2_inode_info *f; + + f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL); + if (!f) + return NULL; + return &f->vfs_inode; +} + +static void jffs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); +} + +static void jffs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, jffs2_i_callback); +} + +static void jffs2_i_init_once(void *foo) +{ + struct jffs2_inode_info *f = foo; + + mutex_init(&f->sem); + inode_init_once(&f->vfs_inode); +} + +static void jffs2_write_super(struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + lock_super(sb); + sb->s_dirt = 0; + + if (!(sb->s_flags & MS_RDONLY)) { + D1(printk(KERN_DEBUG "jffs2_write_super()\n")); + jffs2_flush_wbuf_gc(c, 0); + } + + unlock_super(sb); +} + +static int jffs2_sync_fs(struct super_block *sb, int wait) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + jffs2_write_super(sb); + + mutex_lock(&c->alloc_sem); + jffs2_flush_wbuf_pad(c); + mutex_unlock(&c->alloc_sem); + return 0; +} + +static struct inode *jffs2_nfs_get_inode(struct super_block *sb, uint64_t ino, + uint32_t generation) +{ + /* We don't care about i_generation. We'll destroy the flash + before we start re-using inode numbers anyway. And even + if that wasn't true, we'd have other problems...*/ + return jffs2_iget(sb, ino); +} + +static struct dentry *jffs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + jffs2_nfs_get_inode); +} + +static struct dentry *jffs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + jffs2_nfs_get_inode); +} + +static struct dentry *jffs2_get_parent(struct dentry *child) +{ + struct jffs2_inode_info *f; + uint32_t pino; + + BUG_ON(!S_ISDIR(child->d_inode->i_mode)); + + f = JFFS2_INODE_INFO(child->d_inode); + + pino = f->inocache->pino_nlink; + + JFFS2_DEBUG("Parent of directory ino #%u is #%u\n", + f->inocache->ino, pino); + + return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); +} + +static const struct export_operations jffs2_export_ops = { + .get_parent = jffs2_get_parent, + .fh_to_dentry = jffs2_fh_to_dentry, + .fh_to_parent = jffs2_fh_to_parent, +}; + +static const struct super_operations jffs2_super_operations = +{ + .alloc_inode = jffs2_alloc_inode, + .destroy_inode =jffs2_destroy_inode, + .put_super = jffs2_put_super, + .write_super = jffs2_write_super, + .statfs = jffs2_statfs, + .remount_fs = jffs2_remount_fs, + .evict_inode = jffs2_evict_inode, + .dirty_inode = jffs2_dirty_inode, + .sync_fs = jffs2_sync_fs, +}; + +/* + * fill in the superblock + */ +static int jffs2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jffs2_sb_info *c; + int ret; + + D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():" + " New superblock for device %d (\"%s\")\n", + sb->s_mtd->index, sb->s_mtd->name)); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + + c->mtd = sb->s_mtd; + 
c->os_priv = sb; + sb->s_fs_info = c; + + /* Initialize JFFS2 superblock locks, the further initialization will + * be done later */ + mutex_init(&c->alloc_sem); + mutex_init(&c->erase_free_sem); + init_waitqueue_head(&c->erase_wait); + init_waitqueue_head(&c->inocache_wq); + spin_lock_init(&c->erase_completion_lock); + spin_lock_init(&c->inocache_lock); + + sb->s_op = &jffs2_super_operations; + sb->s_export_op = &jffs2_export_ops; + sb->s_flags = sb->s_flags | MS_NOATIME; + sb->s_xattr = jffs2_xattr_handlers; +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + ret = jffs2_do_fill_super(sb, data, silent); + return ret; +} + +static struct dentry *jffs2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super); +} + +static void jffs2_put_super (struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); + + if (sb->s_dirt) + jffs2_write_super(sb); + + mutex_lock(&c->alloc_sem); + jffs2_flush_wbuf_pad(c); + mutex_unlock(&c->alloc_sem); + + jffs2_sum_exit(c); + + jffs2_free_ino_caches(c); + jffs2_free_raw_node_refs(c); + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else + kfree(c->blocks); + jffs2_flash_cleanup(c); + kfree(c->inocache_list); + jffs2_clear_xattr_subsystem(c); + if (c->mtd->sync) + c->mtd->sync(c->mtd); + + D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); +} + +static void jffs2_kill_sb(struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + if (!(sb->s_flags & MS_RDONLY)) + jffs2_stop_garbage_collect_thread(c); + kill_mtd_super(sb); + kfree(c); +} + +static struct file_system_type jffs2_fs_type = { + .owner = THIS_MODULE, + .name = "jffs2", + .mount = jffs2_mount, + .kill_sb = jffs2_kill_sb, +}; + +static int __init init_jffs2_fs(void) +{ + int ret; + + /* Paranoia checks for on-medium structures. If we ask GCC + to pack them with __attribute__((packed)) then it _also_ + assumes that they're not aligned -- so it emits crappy + code on some architectures. Ideally we want an attribute + which means just 'no padding', without the alignment + thing. But GCC doesn't have that -- we have to just + hope the structs are the right sizes, instead. */ + BUILD_BUG_ON(sizeof(struct jffs2_unknown_node) != 12); + BUILD_BUG_ON(sizeof(struct jffs2_raw_dirent) != 40); + BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68); + BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32); + + printk(KERN_INFO "JFFS2 version 2.2." 
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER + " (NAND)" +#endif +#ifdef CONFIG_JFFS2_SUMMARY + " (SUMMARY) " +#endif + " © 2001-2006 Red Hat, Inc.\n"); + + jffs2_inode_cachep = kmem_cache_create("jffs2_i", + sizeof(struct jffs2_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + jffs2_i_init_once); + if (!jffs2_inode_cachep) { + printk(KERN_ERR "JFFS2 error: Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = jffs2_compressors_init(); + if (ret) { + printk(KERN_ERR "JFFS2 error: Failed to initialise compressors\n"); + goto out; + } + ret = jffs2_create_slab_caches(); + if (ret) { + printk(KERN_ERR "JFFS2 error: Failed to initialise slab caches\n"); + goto out_compressors; + } + ret = register_filesystem(&jffs2_fs_type); + if (ret) { + printk(KERN_ERR "JFFS2 error: Failed to register filesystem\n"); + goto out_slab; + } + return 0; + + out_slab: + jffs2_destroy_slab_caches(); + out_compressors: + jffs2_compressors_exit(); + out: + kmem_cache_destroy(jffs2_inode_cachep); + return ret; +} + +static void __exit exit_jffs2_fs(void) +{ + unregister_filesystem(&jffs2_fs_type); + jffs2_destroy_slab_caches(); + jffs2_compressors_exit(); + kmem_cache_destroy(jffs2_inode_cachep); +} + +module_init(init_jffs2_fs); +module_exit(exit_jffs2_fs); + +MODULE_DESCRIPTION("The Journalling Flash File System, v2"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); // Actually dual-licensed, but it doesn't matter for + // the sake of this tag. It's Free Software. diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c new file mode 100644 index 00000000..b9556260 --- /dev/null +++ b/fs/jffs2/symlink.c @@ -0,0 +1,64 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include "nodelist.h" + +static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd); + +const struct inode_operations jffs2_symlink_inode_operations = +{ + .readlink = generic_readlink, + .follow_link = jffs2_follow_link, + .check_acl = jffs2_check_acl, + .setattr = jffs2_setattr, + .setxattr = jffs2_setxattr, + .getxattr = jffs2_getxattr, + .listxattr = jffs2_listxattr, + .removexattr = jffs2_removexattr +}; + +static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + char *p = (char *)f->target; + + /* + * We don't acquire the f->sem mutex here since the only data we + * use is f->target. + * + * 1. If we are here the inode has already built and f->target has + * to point to the target path. + * 2. Nobody uses f->target (if the inode is symlink's inode). The + * exception is inode freeing function which frees f->target. But + * it can't be called while we are here and before VFS has + * stopped using our f->target string which we provide by means of + * nd_set_link() call. + */ + + if (!p) { + printk(KERN_ERR "jffs2_follow_link(): can't find symlink target\n"); + p = ERR_PTR(-EIO); + } + D1(printk(KERN_DEBUG "jffs2_follow_link(): target path is '%s'\n", (char *) f->target)); + + nd_set_link(nd, p); + + /* + * We will unlock the f->sem mutex but VFS will use the f->target string. This is safe + * since the only way that may cause f->target to be changed is iput() operation. + * But VFS will not use f->target after iput() has been called. 
+ */ + return NULL; +} + diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c new file mode 100644 index 00000000..4515bea0 --- /dev/null +++ b/fs/jffs2/wbuf.c @@ -0,0 +1,1310 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * Copyright © 2004 Thomas Gleixner + * + * Created by David Woodhouse + * Modified debugged and enhanced by Thomas Gleixner + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "nodelist.h" + +/* For testing write failures */ +#undef BREAKME +#undef BREAKMEHEADER + +#ifdef BREAKME +static unsigned char *brokenbuf; +#endif + +#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(c->wbuf_pagesize)) * (unsigned long)(c->wbuf_pagesize) ) +#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(c->wbuf_pagesize) ) + +/* max. erase failures before we mark a block bad */ +#define MAX_ERASE_FAILURES 2 + +struct jffs2_inodirty { + uint32_t ino; + struct jffs2_inodirty *next; +}; + +static struct jffs2_inodirty inodirty_nomem; + +static int jffs2_wbuf_pending_for_ino(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inodirty *this = c->wbuf_inodes; + + /* If a malloc failed, consider _everything_ dirty */ + if (this == &inodirty_nomem) + return 1; + + /* If ino == 0, _any_ non-GC writes mean 'yes' */ + if (this && !ino) + return 1; + + /* Look to see if the inode in question is pending in the wbuf */ + while (this) { + if (this->ino == ino) + return 1; + this = this->next; + } + return 0; +} + +static void jffs2_clear_wbuf_ino_list(struct jffs2_sb_info *c) +{ + struct jffs2_inodirty *this; + + this = c->wbuf_inodes; + + if (this != &inodirty_nomem) { + while (this) { + struct jffs2_inodirty *next = this->next; + kfree(this); + this = next; + } + } + c->wbuf_inodes = NULL; +} + +static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino) +{ + struct jffs2_inodirty *new; + + /* Mark the superblock dirty so that kupdated will flush... */ + jffs2_dirty_trigger(c); + + if (jffs2_wbuf_pending_for_ino(c, ino)) + return; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) { + D1(printk(KERN_DEBUG "No memory to allocate inodirty. Fallback to all considered dirty\n")); + jffs2_clear_wbuf_ino_list(c); + c->wbuf_inodes = &inodirty_nomem; + return; + } + new->ino = ino; + new->next = c->wbuf_inodes; + c->wbuf_inodes = new; + return; +} + +static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c) +{ + struct list_head *this, *next; + static int n; + + if (list_empty(&c->erasable_pending_wbuf_list)) + return; + + list_for_each_safe(this, next, &c->erasable_pending_wbuf_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + D1(printk(KERN_DEBUG "Removing eraseblock at 0x%08x from erasable_pending_wbuf_list...\n", jeb->offset)); + list_del(this); + if ((jiffies + (n++)) & 127) { + /* Most of the time, we just erase it immediately. Otherwise we + spend ages scanning it on mount, etc. */ + D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); + list_add_tail(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } else { + /* Sometimes, however, we leave it elsewhere so it doesn't get + immediately reused, and we spread the load a bit. 
*/ + D1(printk(KERN_DEBUG "...and adding to erasable_list\n")); + list_add_tail(&jeb->list, &c->erasable_list); + } + } +} + +#define REFILE_NOTEMPTY 0 +#define REFILE_ANYWAY 1 + +static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int allow_empty) +{ + D1(printk("About to refile bad block at %08x\n", jeb->offset)); + + /* File the existing block on the bad_used_list.... */ + if (c->nextblock == jeb) + c->nextblock = NULL; + else /* Not sure this should ever happen... need more coffee */ + list_del(&jeb->list); + if (jeb->first_node) { + D1(printk("Refiling block at %08x to bad_used_list\n", jeb->offset)); + list_add(&jeb->list, &c->bad_used_list); + } else { + BUG_ON(allow_empty == REFILE_NOTEMPTY); + /* It has to have had some nodes or we couldn't be here */ + D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); + list_add(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } + + if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { + uint32_t oldfree = jeb->free_size; + + jffs2_link_node_ref(c, jeb, + (jeb->offset+c->sector_size-oldfree) | REF_OBSOLETE, + oldfree, NULL); + /* convert to wasted */ + c->wasted_size += oldfree; + jeb->wasted_size += oldfree; + c->dirty_size -= oldfree; + jeb->dirty_size -= oldfree; + } + + jffs2_dbg_dump_block_lists_nolock(c); + jffs2_dbg_acct_sanity_check_nolock(c,jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); +} + +static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info *c, + struct jffs2_inode_info *f, + struct jffs2_raw_node_ref *raw, + union jffs2_node_union *node) +{ + struct jffs2_node_frag *frag; + struct jffs2_full_dirent *fd; + + dbg_noderef("incore_replace_raw: node at %p is {%04x,%04x}\n", + node, je16_to_cpu(node->u.magic), je16_to_cpu(node->u.nodetype)); + + BUG_ON(je16_to_cpu(node->u.magic) != 0x1985 && + je16_to_cpu(node->u.magic) != 0); + + switch (je16_to_cpu(node->u.nodetype)) { + case JFFS2_NODETYPE_INODE: + if (f->metadata && f->metadata->raw == raw) { + dbg_noderef("Will replace ->raw in f->metadata at %p\n", f->metadata); + return &f->metadata->raw; + } + frag = jffs2_lookup_node_frag(&f->fragtree, je32_to_cpu(node->i.offset)); + BUG_ON(!frag); + /* Find a frag which refers to the full_dnode we want to modify */ + while (!frag->node || frag->node->raw != raw) { + frag = frag_next(frag); + BUG_ON(!frag); + } + dbg_noderef("Will replace ->raw in full_dnode at %p\n", frag->node); + return &frag->node->raw; + + case JFFS2_NODETYPE_DIRENT: + for (fd = f->dents; fd; fd = fd->next) { + if (fd->raw == raw) { + dbg_noderef("Will replace ->raw in full_dirent at %p\n", fd); + return &fd->raw; + } + } + BUG(); + + default: + dbg_noderef("Don't care about replacing raw for nodetype %x\n", + je16_to_cpu(node->u.nodetype)); + break; + } + return NULL; +} + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY +static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf, + uint32_t ofs) +{ + int ret; + size_t retlen; + char *eccstr; + + ret = c->mtd->read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); + if (ret && ret != -EUCLEAN && ret != -EBADMSG) { + printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret); + return ret; + } else if (retlen != c->wbuf_pagesize) { + printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x gave short read: %zd not %d.\n", ofs, retlen, c->wbuf_pagesize); + return -EIO; + } + if (!memcmp(buf, c->wbuf_verify, c->wbuf_pagesize)) 
+ return 0; + + if (ret == -EUCLEAN) + eccstr = "corrected"; + else if (ret == -EBADMSG) + eccstr = "correction failed"; + else + eccstr = "OK or unused"; + + printk(KERN_WARNING "Write verify error (ECC %s) at %08x. Wrote:\n", + eccstr, c->wbuf_ofs); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, + c->wbuf, c->wbuf_pagesize, 0); + + printk(KERN_WARNING "Read back:\n"); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, + c->wbuf_verify, c->wbuf_pagesize, 0); + + return -EIO; +} +#else +#define jffs2_verify_write(c,b,o) (0) +#endif + +/* Recover from failure to write wbuf. Recover the nodes up to the + * wbuf, not the one which we were starting to try to write. */ + +static void jffs2_wbuf_recover(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *jeb, *new_jeb; + struct jffs2_raw_node_ref *raw, *next, *first_raw = NULL; + size_t retlen; + int ret; + int nr_refile = 0; + unsigned char *buf; + uint32_t start, end, ofs, len; + + jeb = &c->blocks[c->wbuf_ofs / c->sector_size]; + + spin_lock(&c->erase_completion_lock); + if (c->wbuf_ofs % c->mtd->erasesize) + jffs2_block_refile(c, jeb, REFILE_NOTEMPTY); + else + jffs2_block_refile(c, jeb, REFILE_ANYWAY); + spin_unlock(&c->erase_completion_lock); + + BUG_ON(!ref_obsolete(jeb->last_node)); + + /* Find the first node to be recovered, by skipping over every + node which ends before the wbuf starts, or which is obsolete. */ + for (next = raw = jeb->first_node; next; raw = next) { + next = ref_next(raw); + + if (ref_obsolete(raw) || + (next && ref_offset(next) <= c->wbuf_ofs)) { + dbg_noderef("Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n", + ref_offset(raw), ref_flags(raw), + (ref_offset(raw) + ref_totlen(c, jeb, raw)), + c->wbuf_ofs); + continue; + } + dbg_noderef("First node to be recovered is at 0x%08x(%d)-0x%08x\n", + ref_offset(raw), ref_flags(raw), + (ref_offset(raw) + ref_totlen(c, jeb, raw))); + + first_raw = raw; + break; + } + + if (!first_raw) { + /* All nodes were obsolete. Nothing to recover. */ + D1(printk(KERN_DEBUG "No non-obsolete nodes to be recovered. Just filing block bad\n")); + c->wbuf_len = 0; + return; + } + + start = ref_offset(first_raw); + end = ref_offset(jeb->last_node); + nr_refile = 1; + + /* Count the number of refs which need to be copied */ + while ((raw = ref_next(raw)) != jeb->last_node) + nr_refile++; + + dbg_noderef("wbuf recover %08x-%08x (%d bytes in %d nodes)\n", + start, end, end - start, nr_refile); + + buf = NULL; + if (start < c->wbuf_ofs) { + /* First affected node was already partially written. + * Attempt to reread the old data into our buffer. */ + + buf = kmalloc(end - start, GFP_KERNEL); + if (!buf) { + printk(KERN_CRIT "Malloc failure in wbuf recovery. Data loss ensues.\n"); + + goto read_failed; + } + + /* Do the read... */ + ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf); + + /* ECC recovered ? */ + if ((ret == -EUCLEAN || ret == -EBADMSG) && + (retlen == c->wbuf_ofs - start)) + ret = 0; + + if (ret || retlen != c->wbuf_ofs - start) { + printk(KERN_CRIT "Old data are already lost in wbuf recovery. Data loss ensues.\n"); + + kfree(buf); + buf = NULL; + read_failed: + first_raw = ref_next(first_raw); + nr_refile--; + while (first_raw && ref_obsolete(first_raw)) { + first_raw = ref_next(first_raw); + nr_refile--; + } + + /* If this was the only node to be recovered, give up */ + if (!first_raw) { + c->wbuf_len = 0; + return; + } + + /* It wasn't. 
Go on and try to recover nodes complete in the wbuf */ + start = ref_offset(first_raw); + dbg_noderef("wbuf now recover %08x-%08x (%d bytes in %d nodes)\n", + start, end, end - start, nr_refile); + + } else { + /* Read succeeded. Copy the remaining data from the wbuf */ + memcpy(buf + (c->wbuf_ofs - start), c->wbuf, end - c->wbuf_ofs); + } + } + /* OK... we're to rewrite (end-start) bytes of data from first_raw onwards. + Either 'buf' contains the data, or we find it in the wbuf */ + + /* ... and get an allocation of space from a shiny new block instead */ + ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE); + if (ret) { + printk(KERN_WARNING "Failed to allocate space for wbuf recovery. Data loss ensues.\n"); + kfree(buf); + return; + } + + /* The summary is not recovered, so it must be disabled for this erase block */ + jffs2_sum_disable_collecting(c->summary); + + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile); + if (ret) { + printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n"); + kfree(buf); + return; + } + + ofs = write_ofs(c); + + if (end-start >= c->wbuf_pagesize) { + /* Need to do another write immediately, but it's possible + that this is just because the wbuf itself is completely + full, and there's nothing earlier read back from the + flash. Hence 'buf' isn't necessarily what we're writing + from. */ + unsigned char *rewrite_buf = buf?:c->wbuf; + uint32_t towrite = (end-start) - ((end-start)%c->wbuf_pagesize); + + D1(printk(KERN_DEBUG "Write 0x%x bytes at 0x%08x in wbuf recover\n", + towrite, ofs)); + +#ifdef BREAKMEHEADER + static int breakme; + if (breakme++ == 20) { + printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs); + breakme = 0; + c->mtd->write(c->mtd, ofs, towrite, &retlen, + brokenbuf); + ret = -EIO; + } else +#endif + ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, + rewrite_buf); + + if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) { + /* Argh. We tried. Really we did. */ + printk(KERN_CRIT "Recovery of wbuf failed due to a second write error\n"); + kfree(buf); + + if (retlen) + jffs2_add_physical_node_ref(c, ofs | REF_OBSOLETE, ref_totlen(c, jeb, first_raw), NULL); + + return; + } + printk(KERN_NOTICE "Recovery of wbuf succeeded to %08x\n", ofs); + + c->wbuf_len = (end - start) - towrite; + c->wbuf_ofs = ofs + towrite; + memmove(c->wbuf, rewrite_buf + towrite, c->wbuf_len); + /* Don't muck about with c->wbuf_inodes. False positives are harmless. */ + } else { + /* OK, now we're left with the dregs in whichever buffer we're using */ + if (buf) { + memcpy(c->wbuf, buf, end-start); + } else { + memmove(c->wbuf, c->wbuf + (start - c->wbuf_ofs), end - start); + } + c->wbuf_ofs = ofs; + c->wbuf_len = end - start; + } + + /* Now sort out the jffs2_raw_node_refs, moving them from the old to the next block */ + new_jeb = &c->blocks[ofs / c->sector_size]; + + spin_lock(&c->erase_completion_lock); + for (raw = first_raw; raw != jeb->last_node; raw = ref_next(raw)) { + uint32_t rawlen = ref_totlen(c, jeb, raw); + struct jffs2_inode_cache *ic; + struct jffs2_raw_node_ref *new_ref; + struct jffs2_raw_node_ref **adjust_ref = NULL; + struct jffs2_inode_info *f = NULL; + + D1(printk(KERN_DEBUG "Refiling block of %08x at %08x(%d) to %08x\n", + rawlen, ref_offset(raw), ref_flags(raw), ofs)); + + ic = jffs2_raw_ref_to_ic(raw); + + /* Ick. This XATTR mess should be fixed shortly... 
*/ + if (ic && ic->class == RAWNODE_CLASS_XATTR_DATUM) { + struct jffs2_xattr_datum *xd = (void *)ic; + BUG_ON(xd->node != raw); + adjust_ref = &xd->node; + raw->next_in_ino = NULL; + ic = NULL; + } else if (ic && ic->class == RAWNODE_CLASS_XATTR_REF) { + struct jffs2_xattr_datum *xr = (void *)ic; + BUG_ON(xr->node != raw); + adjust_ref = &xr->node; + raw->next_in_ino = NULL; + ic = NULL; + } else if (ic && ic->class == RAWNODE_CLASS_INODE_CACHE) { + struct jffs2_raw_node_ref **p = &ic->nodes; + + /* Remove the old node from the per-inode list */ + while (*p && *p != (void *)ic) { + if (*p == raw) { + (*p) = (raw->next_in_ino); + raw->next_in_ino = NULL; + break; + } + p = &((*p)->next_in_ino); + } + + if (ic->state == INO_STATE_PRESENT && !ref_obsolete(raw)) { + /* If it's an in-core inode, then we have to adjust any + full_dirent or full_dnode structure to point to the + new version instead of the old */ + f = jffs2_gc_fetch_inode(c, ic->ino, !ic->pino_nlink); + if (IS_ERR(f)) { + /* Should never happen; it _must_ be present */ + JFFS2_ERROR("Failed to iget() ino #%u, err %ld\n", + ic->ino, PTR_ERR(f)); + BUG(); + } + /* We don't lock f->sem. There's a number of ways we could + end up in here with it already being locked, and nobody's + going to modify it on us anyway because we hold the + alloc_sem. We're only changing one ->raw pointer too, + which we can get away with without upsetting readers. */ + adjust_ref = jffs2_incore_replace_raw(c, f, raw, + (void *)(buf?:c->wbuf) + (ref_offset(raw) - start)); + } else if (unlikely(ic->state != INO_STATE_PRESENT && + ic->state != INO_STATE_CHECKEDABSENT && + ic->state != INO_STATE_GC)) { + JFFS2_ERROR("Inode #%u is in strange state %d!\n", ic->ino, ic->state); + BUG(); + } + } + + new_ref = jffs2_link_node_ref(c, new_jeb, ofs | ref_flags(raw), rawlen, ic); + + if (adjust_ref) { + BUG_ON(*adjust_ref != raw); + *adjust_ref = new_ref; + } + if (f) + jffs2_gc_release_inode(c, f); + + if (!ref_obsolete(raw)) { + jeb->dirty_size += rawlen; + jeb->used_size -= rawlen; + c->dirty_size += rawlen; + c->used_size -= rawlen; + raw->flash_offset = ref_offset(raw) | REF_OBSOLETE; + BUG_ON(raw->next_in_ino); + } + ofs += rawlen; + } + + kfree(buf); + + /* Fix up the original jeb now it's on the bad_list */ + if (first_raw == jeb->first_node) { + D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); + list_move(&jeb->list, &c->erase_pending_list); + c->nr_erasing_blocks++; + jffs2_garbage_collect_trigger(c); + } + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + + jffs2_dbg_acct_sanity_check_nolock(c, new_jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb); + + spin_unlock(&c->erase_completion_lock); + + D1(printk(KERN_DEBUG "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n", c->wbuf_ofs, c->wbuf_len)); + +} + +/* Meaning of pad argument: + 0: Do not pad. Probably pointless - we only ever use this when we can't pad anyway. + 1: Pad, do not adjust nextblock free_size + 2: Pad, adjust nextblock free_size +*/ +#define NOPAD 0 +#define PAD_NOACCOUNT 1 +#define PAD_ACCOUNTING 2 + +static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) +{ + struct jffs2_eraseblock *wbuf_jeb; + int ret; + size_t retlen; + + /* Nothing to do if not write-buffering the flash. In particular, we shouldn't + del_timer() the timer we never initialised. 
*/ + if (!jffs2_is_writebuffered(c)) + return 0; + + if (mutex_trylock(&c->alloc_sem)) { + mutex_unlock(&c->alloc_sem); + printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); + BUG(); + } + + if (!c->wbuf_len) /* already checked c->wbuf above */ + return 0; + + wbuf_jeb = &c->blocks[c->wbuf_ofs / c->sector_size]; + if (jffs2_prealloc_raw_node_refs(c, wbuf_jeb, c->nextblock->allocated_refs + 1)) + return -ENOMEM; + + /* claim remaining space on the page + this happens, if we have a change to a new block, + or if fsync forces us to flush the writebuffer. + if we have a switch to next page, we will not have + enough remaining space for this. + */ + if (pad ) { + c->wbuf_len = PAD(c->wbuf_len); + + /* Pad with JFFS2_DIRTY_BITMASK initially. this helps out ECC'd NOR + with 8 byte page size */ + memset(c->wbuf + c->wbuf_len, 0, c->wbuf_pagesize - c->wbuf_len); + + if ( c->wbuf_len + sizeof(struct jffs2_unknown_node) < c->wbuf_pagesize) { + struct jffs2_unknown_node *padnode = (void *)(c->wbuf + c->wbuf_len); + padnode->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + padnode->nodetype = cpu_to_je16(JFFS2_NODETYPE_PADDING); + padnode->totlen = cpu_to_je32(c->wbuf_pagesize - c->wbuf_len); + padnode->hdr_crc = cpu_to_je32(crc32(0, padnode, sizeof(*padnode)-4)); + } + } + /* else jffs2_flash_writev has actually filled in the rest of the + buffer for us, and will deal with the node refs etc. later. */ + +#ifdef BREAKME + static int breakme; + if (breakme++ == 20) { + printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs); + breakme = 0; + c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, + brokenbuf); + ret = -EIO; + } else +#endif + + ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf); + + if (ret) { + printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret); + goto wfail; + } else if (retlen != c->wbuf_pagesize) { + printk(KERN_WARNING "jffs2_flush_wbuf(): Write was short: %zd instead of %d\n", + retlen, c->wbuf_pagesize); + ret = -EIO; + goto wfail; + } else if ((ret = jffs2_verify_write(c, c->wbuf, c->wbuf_ofs))) { + wfail: + jffs2_wbuf_recover(c); + + return ret; + } + + /* Adjust free size of the block if we padded. */ + if (pad) { + uint32_t waste = c->wbuf_pagesize - c->wbuf_len; + + D1(printk(KERN_DEBUG "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n", + (wbuf_jeb==c->nextblock)?"next":"", wbuf_jeb->offset)); + + /* wbuf_pagesize - wbuf_len is the amount of space that's to be + padded. If there is less free space in the block than that, + something screwed up */ + if (wbuf_jeb->free_size < waste) { + printk(KERN_CRIT "jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n", + c->wbuf_ofs, c->wbuf_len, waste); + printk(KERN_CRIT "jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n", + wbuf_jeb->offset, wbuf_jeb->free_size); + BUG(); + } + + spin_lock(&c->erase_completion_lock); + + jffs2_link_node_ref(c, wbuf_jeb, (c->wbuf_ofs + c->wbuf_len) | REF_OBSOLETE, waste, NULL); + /* FIXME: that made it count as dirty. 
Convert to wasted */ + wbuf_jeb->dirty_size -= waste; + c->dirty_size -= waste; + wbuf_jeb->wasted_size += waste; + c->wasted_size += waste; + } else + spin_lock(&c->erase_completion_lock); + + /* Stick any now-obsoleted blocks on the erase_pending_list */ + jffs2_refile_wbuf_blocks(c); + jffs2_clear_wbuf_ino_list(c); + spin_unlock(&c->erase_completion_lock); + + memset(c->wbuf,0xff,c->wbuf_pagesize); + /* adjust write buffer offset, else we get a non contiguous write bug */ + c->wbuf_ofs += c->wbuf_pagesize; + c->wbuf_len = 0; + return 0; +} + +/* Trigger garbage collection to flush the write-buffer. + If ino arg is zero, do it if _any_ real (i.e. not GC) writes are + outstanding. If ino arg non-zero, do it only if a write for the + given inode is outstanding. */ +int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) +{ + uint32_t old_wbuf_ofs; + uint32_t old_wbuf_len; + int ret = 0; + + D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() called for ino #%u...\n", ino)); + + if (!c->wbuf) + return 0; + + mutex_lock(&c->alloc_sem); + if (!jffs2_wbuf_pending_for_ino(c, ino)) { + D1(printk(KERN_DEBUG "Ino #%d not pending in wbuf. Returning\n", ino)); + mutex_unlock(&c->alloc_sem); + return 0; + } + + old_wbuf_ofs = c->wbuf_ofs; + old_wbuf_len = c->wbuf_len; + + if (c->unchecked_size) { + /* GC won't make any progress for a while */ + D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() padding. Not finished checking\n")); + down_write(&c->wbuf_sem); + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + /* retry flushing wbuf in case jffs2_wbuf_recover + left some data in the wbuf */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + up_write(&c->wbuf_sem); + } else while (old_wbuf_len && + old_wbuf_ofs == c->wbuf_ofs) { + + mutex_unlock(&c->alloc_sem); + + D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() calls gc pass\n")); + + ret = jffs2_garbage_collect_pass(c); + if (ret) { + /* GC failed. Flush it with padding instead */ + mutex_lock(&c->alloc_sem); + down_write(&c->wbuf_sem); + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + /* retry flushing wbuf in case jffs2_wbuf_recover + left some data in the wbuf */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + up_write(&c->wbuf_sem); + break; + } + mutex_lock(&c->alloc_sem); + } + + D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() ends...\n")); + + mutex_unlock(&c->alloc_sem); + return ret; +} + +/* Pad write-buffer to end and write it, wasting space. */ +int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c) +{ + int ret; + + if (!c->wbuf) + return 0; + + down_write(&c->wbuf_sem); + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + /* retry - maybe wbuf recover left some data in wbuf. 
*/ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + up_write(&c->wbuf_sem); + + return ret; +} + +static size_t jffs2_fill_wbuf(struct jffs2_sb_info *c, const uint8_t *buf, + size_t len) +{ + if (len && !c->wbuf_len && (len >= c->wbuf_pagesize)) + return 0; + + if (len > (c->wbuf_pagesize - c->wbuf_len)) + len = c->wbuf_pagesize - c->wbuf_len; + memcpy(c->wbuf + c->wbuf_len, buf, len); + c->wbuf_len += (uint32_t) len; + return len; +} + +int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, + unsigned long count, loff_t to, size_t *retlen, + uint32_t ino) +{ + struct jffs2_eraseblock *jeb; + size_t wbuf_retlen, donelen = 0; + uint32_t outvec_to = to; + int ret, invec; + + /* If not writebuffered flash, don't bother */ + if (!jffs2_is_writebuffered(c)) + return jffs2_flash_direct_writev(c, invecs, count, to, retlen); + + down_write(&c->wbuf_sem); + + /* If wbuf_ofs is not initialized, set it to target address */ + if (c->wbuf_ofs == 0xFFFFFFFF) { + c->wbuf_ofs = PAGE_DIV(to); + c->wbuf_len = PAGE_MOD(to); + memset(c->wbuf,0xff,c->wbuf_pagesize); + } + + /* + * Sanity checks on target address. It's permitted to write + * at PAD(c->wbuf_len+c->wbuf_ofs), and it's permitted to + * write at the beginning of a new erase block. Anything else, + * and you die. New block starts at xxx000c (0-b = block + * header) + */ + if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) { + /* It's a write to a new block */ + if (c->wbuf_len) { + D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx " + "causes flush of wbuf at 0x%08x\n", + (unsigned long)to, c->wbuf_ofs)); + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + if (ret) + goto outerr; + } + /* set pointer to new block */ + c->wbuf_ofs = PAGE_DIV(to); + c->wbuf_len = PAGE_MOD(to); + } + + if (to != PAD(c->wbuf_ofs + c->wbuf_len)) { + /* We're not writing immediately after the writebuffer. Bad. 
*/ + printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write " + "to %08lx\n", (unsigned long)to); + if (c->wbuf_len) + printk(KERN_CRIT "wbuf was previously %08x-%08x\n", + c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len); + BUG(); + } + + /* adjust alignment offset */ + if (c->wbuf_len != PAGE_MOD(to)) { + c->wbuf_len = PAGE_MOD(to); + /* take care of alignment to next page */ + if (!c->wbuf_len) { + c->wbuf_len = c->wbuf_pagesize; + ret = __jffs2_flush_wbuf(c, NOPAD); + if (ret) + goto outerr; + } + } + + for (invec = 0; invec < count; invec++) { + int vlen = invecs[invec].iov_len; + uint8_t *v = invecs[invec].iov_base; + + wbuf_retlen = jffs2_fill_wbuf(c, v, vlen); + + if (c->wbuf_len == c->wbuf_pagesize) { + ret = __jffs2_flush_wbuf(c, NOPAD); + if (ret) + goto outerr; + } + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + donelen += wbuf_retlen; + v += wbuf_retlen; + + if (vlen >= c->wbuf_pagesize) { + ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen), + &wbuf_retlen, v); + if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen)) + goto outfile; + + vlen -= wbuf_retlen; + outvec_to += wbuf_retlen; + c->wbuf_ofs = outvec_to; + donelen += wbuf_retlen; + v += wbuf_retlen; + } + + wbuf_retlen = jffs2_fill_wbuf(c, v, vlen); + if (c->wbuf_len == c->wbuf_pagesize) { + ret = __jffs2_flush_wbuf(c, NOPAD); + if (ret) + goto outerr; + } + + outvec_to += wbuf_retlen; + donelen += wbuf_retlen; + } + + /* + * If there's a remainder in the wbuf and it's a non-GC write, + * remember that the wbuf affects this ino + */ + *retlen = donelen; + + if (jffs2_sum_active()) { + int res = jffs2_sum_add_kvec(c, invecs, count, (uint32_t) to); + if (res) + return res; + } + + if (c->wbuf_len && ino) + jffs2_wbuf_dirties_inode(c, ino); + + ret = 0; + up_write(&c->wbuf_sem); + return ret; + +outfile: + /* + * At this point we have no problem, c->wbuf is empty. However + * refile nextblock to avoid writing again to same address. + */ + + spin_lock(&c->erase_completion_lock); + + jeb = &c->blocks[outvec_to / c->sector_size]; + jffs2_block_refile(c, jeb, REFILE_ANYWAY); + + spin_unlock(&c->erase_completion_lock); + +outerr: + *retlen = 0; + up_write(&c->wbuf_sem); + return ret; +} + +/* + * This is the entry for flash write. + * Check, if we work on NAND FLASH, if so build an kvec and write it via vritev +*/ +int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, + size_t *retlen, const u_char *buf) +{ + struct kvec vecs[1]; + + if (!jffs2_is_writebuffered(c)) + return jffs2_flash_direct_write(c, ofs, len, retlen, buf); + + vecs[0].iov_base = (unsigned char *) buf; + vecs[0].iov_len = len; + return jffs2_flash_writev(c, vecs, 1, ofs, retlen, 0); +} + +/* + Handle readback from writebuffer and ECC failure return +*/ +int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, u_char *buf) +{ + loff_t orbf = 0, owbf = 0, lwbf = 0; + int ret; + + if (!jffs2_is_writebuffered(c)) + return c->mtd->read(c->mtd, ofs, len, retlen, buf); + + /* Read flash */ + down_read(&c->wbuf_sem); + ret = c->mtd->read(c->mtd, ofs, len, retlen, buf); + + if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) { + if (ret == -EBADMSG) + printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx)" + " returned ECC error\n", len, ofs); + /* + * We have the raw data without ECC correction in the buffer, + * maybe we are lucky and all data or parts are correct. We + * check the node. If data are corrupted node check will sort + * it out. 
We keep this block, it will fail on write or erase + * and the we mark it bad. Or should we do that now? But we + * should give him a chance. Maybe we had a system crash or + * power loss before the ecc write or a erase was completed. + * So we return success. :) + */ + ret = 0; + } + + /* if no writebuffer available or write buffer empty, return */ + if (!c->wbuf_pagesize || !c->wbuf_len) + goto exit; + + /* if we read in a different block, return */ + if (SECTOR_ADDR(ofs) != SECTOR_ADDR(c->wbuf_ofs)) + goto exit; + + if (ofs >= c->wbuf_ofs) { + owbf = (ofs - c->wbuf_ofs); /* offset in write buffer */ + if (owbf > c->wbuf_len) /* is read beyond write buffer ? */ + goto exit; + lwbf = c->wbuf_len - owbf; /* number of bytes to copy */ + if (lwbf > len) + lwbf = len; + } else { + orbf = (c->wbuf_ofs - ofs); /* offset in read buffer */ + if (orbf > len) /* is write beyond write buffer ? */ + goto exit; + lwbf = len - orbf; /* number of bytes to copy */ + if (lwbf > c->wbuf_len) + lwbf = c->wbuf_len; + } + if (lwbf > 0) + memcpy(buf+orbf,c->wbuf+owbf,lwbf); + +exit: + up_read(&c->wbuf_sem); + return ret; +} + +#define NR_OOB_SCAN_PAGES 4 + +/* For historical reasons we use only 8 bytes for OOB clean marker */ +#define OOB_CM_SIZE 8 + +static const struct jffs2_unknown_node oob_cleanmarker = +{ + .magic = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK), + .nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), + .totlen = constant_cpu_to_je32(8) +}; + +/* + * Check, if the out of band area is empty. This function knows about the clean + * marker and if it is present in OOB, treats the OOB as empty anyway. + */ +int jffs2_check_oob_empty(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb, int mode) +{ + int i, ret; + int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); + struct mtd_oob_ops ops; + + ops.mode = MTD_OOB_AUTO; + ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; + ops.oobbuf = c->oobbuf; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; + ops.datbuf = NULL; + + ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + if (ret || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; + return ret; + } + + for(i = 0; i < ops.ooblen; i++) { + if (mode && i < cmlen) + /* Yeah, we know about the cleanmarker */ + continue; + + if (ops.oobbuf[i] != 0xFF) { + D2(printk(KERN_DEBUG "Found %02x at %x in OOB for " + "%08x\n", ops.oobbuf[i], i, jeb->offset)); + return 1; + } + } + + return 0; +} + +/* + * Check for a valid cleanmarker. 
+ * Returns: 0 if a valid cleanmarker was found + * 1 if no cleanmarker was found + * negative error code if an error occurred + */ +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + struct mtd_oob_ops ops; + int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); + + ops.mode = MTD_OOB_AUTO; + ops.ooblen = cmlen; + ops.oobbuf = c->oobbuf; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; + ops.datbuf = NULL; + + ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + if (ret || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; + return ret; + } + + return !!memcmp(&oob_cleanmarker, c->oobbuf, cmlen); +} + +int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) +{ + int ret; + struct mtd_oob_ops ops; + int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); + + ops.mode = MTD_OOB_AUTO; + ops.ooblen = cmlen; + ops.oobbuf = (uint8_t *)&oob_cleanmarker; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; + ops.datbuf = NULL; + + ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops); + if (ret || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; + return ret; + } + + return 0; +} + +/* + * On NAND we try to mark this block bad. If the block was erased more + * than MAX_ERASE_FAILURES we mark it finally bad. + * Don't care about failures. This block remains on the erase-pending + * or badblock list as long as nobody manipulates the flash with + * a bootloader or something like that. + */ + +int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) +{ + int ret; + + /* if the count is < max, we try to write the counter to the 2nd page oob area */ + if( ++jeb->bad_count < MAX_ERASE_FAILURES) + return 0; + + if (!c->mtd->block_markbad) + return 1; // What else can we do? 
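/*
 * A small user-space sketch (not from the JFFS2 sources) of the decision
 * implemented by jffs2_check_oob_empty() above: an eraseblock's OOB area
 * counts as empty if every byte is 0xFF, except that the first cmlen bytes
 * may hold the cleanmarker and are skipped when the caller says it knows
 * about it.  Buffer contents and sizes below are illustrative stand-ins.
 */
#include <stdio.h>
#include <string.h>

#define OOB_CM_SIZE_SKETCH 8	/* same 8-byte OOB cleanmarker limit as above */

/*
 * Returns 1 if a non-0xFF byte is found (OOB not empty), 0 otherwise.
 * When skip_cleanmarker is set, the first cmlen bytes are ignored, just as
 * jffs2_check_oob_empty() ignores the cleanmarker it expects to be there.
 */
static int oob_is_dirty(const unsigned char *oob, size_t ooblen,
			size_t cmlen, int skip_cleanmarker)
{
	size_t i;

	for (i = 0; i < ooblen; i++) {
		if (skip_cleanmarker && i < cmlen)
			continue;
		if (oob[i] != 0xFF)
			return 1;
	}
	return 0;
}

int main(void)
{
	unsigned char oob[16];
	size_t cmlen = OOB_CM_SIZE_SKETCH;

	memset(oob, 0xFF, sizeof(oob));
	/* stand-in bytes where a cleanmarker would sit, not the real layout */
	memcpy(oob, "\x85\x19\x03\x20", 4);

	printf("ignoring cleanmarker: %s\n",
	       oob_is_dirty(oob, sizeof(oob), cmlen, 1) ? "dirty" : "empty");
	printf("raw check:            %s\n",
	       oob_is_dirty(oob, sizeof(oob), cmlen, 0) ? "dirty" : "empty");
	return 0;
}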
+ + printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset); + ret = c->mtd->block_markbad(c->mtd, bad_offset); + + if (ret) { + D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret)); + return ret; + } + return 1; +} + +int jffs2_nand_flash_setup(struct jffs2_sb_info *c) +{ + struct nand_ecclayout *oinfo = c->mtd->ecclayout; + + if (!c->mtd->oobsize) + return 0; + + /* Cleanmarker is out-of-band, so inline size zero */ + c->cleanmarker_size = 0; + + if (!oinfo || oinfo->oobavail == 0) { + printk(KERN_ERR "inconsistent device description\n"); + return -EINVAL; + } + + D1(printk(KERN_DEBUG "JFFS2 using OOB on NAND\n")); + + c->oobavail = oinfo->oobavail; + + /* Initialise write buffer */ + init_rwsem(&c->wbuf_sem); + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + + c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->oobavail, GFP_KERNEL); + if (!c->oobbuf) { + kfree(c->wbuf); + return -ENOMEM; + } + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->oobbuf); + kfree(c->wbuf); + return -ENOMEM; + } +#endif + return 0; +} + +void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c) +{ +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif + kfree(c->wbuf); + kfree(c->oobbuf); +} + +int jffs2_dataflash_setup(struct jffs2_sb_info *c) { + c->cleanmarker_size = 0; /* No cleanmarkers needed */ + + /* Initialize write buffer */ + init_rwsem(&c->wbuf_sem); + + + c->wbuf_pagesize = c->mtd->erasesize; + + /* Find a suitable c->sector_size + * - Not too much sectors + * - Sectors have to be at least 4 K + some bytes + * - All known dataflashes have erase sizes of 528 or 1056 + * - we take at least 8 eraseblocks and want to have at least 8K size + * - The concatenation should be a power of 2 + */ + + c->sector_size = 8 * c->mtd->erasesize; + + while (c->sector_size < 8192) { + c->sector_size *= 2; + } + + /* It may be necessary to adjust the flash size */ + c->flash_size = c->mtd->size; + + if ((c->flash_size % c->sector_size) != 0) { + c->flash_size = (c->flash_size / c->sector_size) * c->sector_size; + printk(KERN_WARNING "JFFS2 flash size adjusted to %dKiB\n", c->flash_size); + }; + + c->wbuf_ofs = 0xFFFFFFFF; + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->oobbuf); + kfree(c->wbuf); + return -ENOMEM; + } +#endif + + printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size); + + return 0; +} + +void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif + kfree(c->wbuf); +} + +int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { + /* Cleanmarker currently occupies whole programming regions, + * either one or 2 for 8Byte STMicro flashes. 
*/ + c->cleanmarker_size = max(16u, c->mtd->writesize); + + /* Initialize write buffer */ + init_rwsem(&c->wbuf_sem); + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->wbuf); + return -ENOMEM; + } +#endif + return 0; +} + +void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif + kfree(c->wbuf); +} + +int jffs2_ubivol_setup(struct jffs2_sb_info *c) { + c->cleanmarker_size = 0; + + if (c->mtd->writesize == 1) + /* We do not need write-buffer */ + return 0; + + init_rwsem(&c->wbuf_sem); + + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + + printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size); + + return 0; +} + +void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) { + kfree(c->wbuf); +} diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c new file mode 100644 index 00000000..30d175b6 --- /dev/null +++ b/fs/jffs2/write.c @@ -0,0 +1,707 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" +#include "compr.h" + + +int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t mode, struct jffs2_raw_inode *ri) +{ + struct jffs2_inode_cache *ic; + + ic = jffs2_alloc_inode_cache(); + if (!ic) { + return -ENOMEM; + } + + memset(ic, 0, sizeof(*ic)); + + f->inocache = ic; + f->inocache->pino_nlink = 1; /* Will be overwritten shortly for directories */ + f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache; + f->inocache->state = INO_STATE_PRESENT; + + jffs2_add_ino_cache(c, f->inocache); + D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino)); + ri->ino = cpu_to_je32(f->inocache->ino); + + ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri->totlen = cpu_to_je32(PAD(sizeof(*ri))); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + ri->mode = cpu_to_jemode(mode); + + f->highest_version = 1; + ri->version = cpu_to_je32(f->highest_version); + + return 0; +} + +/* jffs2_write_dnode - given a raw_inode, allocate a full_dnode for it, + write it to the flash, link it into the existing inode/fragment list */ + +struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, const unsigned char *data, + uint32_t datalen, int alloc_mode) + +{ + struct jffs2_full_dnode *fn; + size_t retlen; + uint32_t flash_ofs; + struct kvec vecs[2]; + int ret; + int retried = 0; + unsigned long cnt = 2; + + D1(if(je32_to_cpu(ri->hdr_crc) != crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)) { + printk(KERN_CRIT "Eep. 
CRC not correct in jffs2_write_dnode()\n"); + BUG(); + } + ); + vecs[0].iov_base = ri; + vecs[0].iov_len = sizeof(*ri); + vecs[1].iov_base = (unsigned char *)data; + vecs[1].iov_len = datalen; + + if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) { + printk(KERN_WARNING "jffs2_write_dnode: ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", je32_to_cpu(ri->totlen), sizeof(*ri), datalen); + } + + fn = jffs2_alloc_full_dnode(); + if (!fn) + return ERR_PTR(-ENOMEM); + + /* check number of valid vecs */ + if (!datalen || !data) + cnt = 1; + retry: + flash_ofs = write_ofs(c); + + jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len); + + if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) { + BUG_ON(!retried); + D1(printk(KERN_DEBUG "jffs2_write_dnode : dnode_version %d, " + "highest version %d -> updating dnode\n", + je32_to_cpu(ri->version), f->highest_version)); + ri->version = cpu_to_je32(++f->highest_version); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + } + + ret = jffs2_flash_writev(c, vecs, cnt, flash_ofs, &retlen, + (alloc_mode==ALLOC_GC)?0:f->inocache->ino); + + if (ret || (retlen != sizeof(*ri) + datalen)) { + printk(KERN_NOTICE "Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n", + sizeof(*ri)+datalen, flash_ofs, ret, retlen); + + /* Mark the space as dirtied */ + if (retlen) { + /* Don't change raw->size to match retlen. We may have + written the node header already, and only the data will + seem corrupted, in which case the scan would skip over + any node we write before the original intended end of + this node */ + jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*ri)+datalen), NULL); + } else { + printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs); + } + if (!retried && alloc_mode != ALLOC_NORETRY) { + /* Try to reallocate space and retry */ + uint32_t dummy; + struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size]; + + retried = 1; + + D1(printk(KERN_DEBUG "Retrying failed write.\n")); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + if (alloc_mode == ALLOC_GC) { + ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &dummy, + JFFS2_SUMMARY_INODE_SIZE); + } else { + /* Locking pain */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &dummy, + alloc_mode, JFFS2_SUMMARY_INODE_SIZE); + mutex_lock(&f->sem); + } + + if (!ret) { + flash_ofs = write_ofs(c); + D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs)); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + goto retry; + } + D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret)); + } + /* Release the full_dnode which is now useless, and return */ + jffs2_free_full_dnode(fn); + return ERR_PTR(ret?ret:-EIO); + } + /* Mark the space used */ + /* If node covers at least a whole page, or if it starts at the + beginning of a page and runs to the end of the file, or if + it's a hole node, mark it REF_PRISTINE, else REF_NORMAL. 
+ */ + if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) || + ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) && + (je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) == je32_to_cpu(ri->isize)))) { + flash_ofs |= REF_PRISTINE; + } else { + flash_ofs |= REF_NORMAL; + } + fn->raw = jffs2_add_physical_node_ref(c, flash_ofs, PAD(sizeof(*ri)+datalen), f->inocache); + if (IS_ERR(fn->raw)) { + void *hold_err = fn->raw; + /* Release the full_dnode which is now useless, and return */ + jffs2_free_full_dnode(fn); + return ERR_CAST(hold_err); + } + fn->ofs = je32_to_cpu(ri->offset); + fn->size = je32_to_cpu(ri->dsize); + fn->frags = 0; + + D1(printk(KERN_DEBUG "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n", + flash_ofs & ~3, flash_ofs & 3, je32_to_cpu(ri->dsize), + je32_to_cpu(ri->csize), je32_to_cpu(ri->node_crc), + je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen))); + + if (retried) { + jffs2_dbg_acct_sanity_check(c,NULL); + } + + return fn; +} + +struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_dirent *rd, const unsigned char *name, + uint32_t namelen, int alloc_mode) +{ + struct jffs2_full_dirent *fd; + size_t retlen; + struct kvec vecs[2]; + uint32_t flash_ofs; + int retried = 0; + int ret; + + D1(printk(KERN_DEBUG "jffs2_write_dirent(ino #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x)\n", + je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino), + je32_to_cpu(rd->name_crc))); + + D1(if(je32_to_cpu(rd->hdr_crc) != crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)) { + printk(KERN_CRIT "Eep. CRC not correct in jffs2_write_dirent()\n"); + BUG(); + }); + + if (strnlen(name, namelen) != namelen) { + /* This should never happen, but seems to have done on at least one + occasion: https://dev.laptop.org/ticket/4184 */ + printk(KERN_CRIT "Error in jffs2_write_dirent() -- name contains zero bytes!\n"); + printk(KERN_CRIT "Directory inode #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x\n", + je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino), + je32_to_cpu(rd->name_crc)); + WARN_ON(1); + return ERR_PTR(-EIO); + } + + vecs[0].iov_base = rd; + vecs[0].iov_len = sizeof(*rd); + vecs[1].iov_base = (unsigned char *)name; + vecs[1].iov_len = namelen; + + fd = jffs2_alloc_full_dirent(namelen+1); + if (!fd) + return ERR_PTR(-ENOMEM); + + fd->version = je32_to_cpu(rd->version); + fd->ino = je32_to_cpu(rd->ino); + fd->nhash = full_name_hash(name, namelen); + fd->type = rd->type; + memcpy(fd->name, name, namelen); + fd->name[namelen]=0; + + retry: + flash_ofs = write_ofs(c); + + jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len); + + if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) { + BUG_ON(!retried); + D1(printk(KERN_DEBUG "jffs2_write_dirent : dirent_version %d, " + "highest version %d -> updating dirent\n", + je32_to_cpu(rd->version), f->highest_version)); + rd->version = cpu_to_je32(++f->highest_version); + fd->version = je32_to_cpu(rd->version); + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + } + + ret = jffs2_flash_writev(c, vecs, 2, flash_ofs, &retlen, + (alloc_mode==ALLOC_GC)?0:je32_to_cpu(rd->pino)); + if (ret || (retlen != sizeof(*rd) + namelen)) { + printk(KERN_NOTICE "Write of %zd bytes at 0x%08x failed. 
returned %d, retlen %zd\n", + sizeof(*rd)+namelen, flash_ofs, ret, retlen); + /* Mark the space as dirtied */ + if (retlen) { + jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*rd)+namelen), NULL); + } else { + printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs); + } + if (!retried) { + /* Try to reallocate space and retry */ + uint32_t dummy; + struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size]; + + retried = 1; + + D1(printk(KERN_DEBUG "Retrying failed write.\n")); + + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + + if (alloc_mode == ALLOC_GC) { + ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &dummy, + JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + } else { + /* Locking pain */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &dummy, + alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + mutex_lock(&f->sem); + } + + if (!ret) { + flash_ofs = write_ofs(c); + D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs)); + jffs2_dbg_acct_sanity_check(c,jeb); + jffs2_dbg_acct_paranoia_check(c, jeb); + goto retry; + } + D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret)); + } + /* Release the full_dnode which is now useless, and return */ + jffs2_free_full_dirent(fd); + return ERR_PTR(ret?ret:-EIO); + } + /* Mark the space used */ + fd->raw = jffs2_add_physical_node_ref(c, flash_ofs | dirent_node_state(rd), + PAD(sizeof(*rd)+namelen), f->inocache); + if (IS_ERR(fd->raw)) { + void *hold_err = fd->raw; + /* Release the full_dirent which is now useless, and return */ + jffs2_free_full_dirent(fd); + return ERR_CAST(hold_err); + } + + if (retried) { + jffs2_dbg_acct_sanity_check(c,NULL); + } + + return fd; +} + +/* The OS-specific code fills in the metadata in the jffs2_raw_inode for us, so that + we don't have to go digging in struct inode or its equivalent. 
It should set: + mode, uid, gid, (starting)isize, atime, ctime, mtime */ +int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + struct jffs2_raw_inode *ri, unsigned char *buf, + uint32_t offset, uint32_t writelen, uint32_t *retlen) +{ + int ret = 0; + uint32_t writtenlen = 0; + + D1(printk(KERN_DEBUG "jffs2_write_inode_range(): Ino #%u, ofs 0x%x, len 0x%x\n", + f->inocache->ino, offset, writelen)); + + while(writelen) { + struct jffs2_full_dnode *fn; + unsigned char *comprbuf = NULL; + uint16_t comprtype = JFFS2_COMPR_NONE; + uint32_t alloclen; + uint32_t datalen, cdatalen; + int retried = 0; + + retry: + D2(printk(KERN_DEBUG "jffs2_commit_write() loop: 0x%x to write to 0x%x\n", writelen, offset)); + + ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN, + &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) { + D1(printk(KERN_DEBUG "jffs2_reserve_space returned %d\n", ret)); + break; + } + mutex_lock(&f->sem); + datalen = min_t(uint32_t, writelen, PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1))); + cdatalen = min_t(uint32_t, alloclen - sizeof(*ri), datalen); + + comprtype = jffs2_compress(c, f, buf, &comprbuf, &datalen, &cdatalen); + + ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); + ri->totlen = cpu_to_je32(sizeof(*ri) + cdatalen); + ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); + + ri->ino = cpu_to_je32(f->inocache->ino); + ri->version = cpu_to_je32(++f->highest_version); + ri->isize = cpu_to_je32(max(je32_to_cpu(ri->isize), offset + datalen)); + ri->offset = cpu_to_je32(offset); + ri->csize = cpu_to_je32(cdatalen); + ri->dsize = cpu_to_je32(datalen); + ri->compr = comprtype & 0xff; + ri->usercompr = (comprtype >> 8 ) & 0xff; + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + ri->data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen)); + + fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, ALLOC_NORETRY); + + jffs2_free_comprbuf(comprbuf, buf); + + if (IS_ERR(fn)) { + ret = PTR_ERR(fn); + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + if (!retried) { + /* Write error to be retried */ + retried = 1; + D1(printk(KERN_DEBUG "Retrying node write in jffs2_write_inode_range()\n")); + goto retry; + } + break; + } + ret = jffs2_add_full_dnode_to_inode(c, f, fn); + if (f->metadata) { + jffs2_mark_node_obsolete(c, f->metadata->raw); + jffs2_free_full_dnode(f->metadata); + f->metadata = NULL; + } + if (ret) { + /* Eep */ + D1(printk(KERN_DEBUG "Eep. add_full_dnode_to_inode() failed in commit_write, returned %d\n", ret)); + jffs2_mark_node_obsolete(c, fn->raw); + jffs2_free_full_dnode(fn); + + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + break; + } + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + if (!datalen) { + printk(KERN_WARNING "Eep. We didn't actually write any data in jffs2_write_inode_range()\n"); + ret = -EIO; + break; + } + D1(printk(KERN_DEBUG "increasing writtenlen by %d\n", datalen)); + writtenlen += datalen; + offset += datalen; + writelen -= datalen; + buf += datalen; + } + *retlen = writtenlen; + return ret; +} + +int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, + struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, + const struct qstr *qstr) +{ + struct jffs2_raw_dirent *rd; + struct jffs2_full_dnode *fn; + struct jffs2_full_dirent *fd; + uint32_t alloclen; + int ret; + + /* Try to reserve enough space for both node and dirent. 
+ * Just the node will do for now, though + */ + ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL, + JFFS2_SUMMARY_INODE_SIZE); + D1(printk(KERN_DEBUG "jffs2_do_create(): reserved 0x%x bytes\n", alloclen)); + if (ret) + return ret; + + mutex_lock(&f->sem); + + ri->data_crc = cpu_to_je32(0); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + + fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL); + + D1(printk(KERN_DEBUG "jffs2_do_create created file with mode 0x%x\n", + jemode_to_cpu(ri->mode))); + + if (IS_ERR(fn)) { + D1(printk(KERN_DEBUG "jffs2_write_dnode() failed\n")); + /* Eeek. Wave bye bye */ + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + return PTR_ERR(fn); + } + /* No data here. Only a metadata node, which will be + obsoleted by the first data write + */ + f->metadata = fn; + + mutex_unlock(&f->sem); + jffs2_complete_reservation(c); + + ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr); + if (ret) + return ret; + ret = jffs2_init_acl_post(&f->vfs_inode); + if (ret) + return ret; + + ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len)); + + if (ret) { + /* Eep. */ + D1(printk(KERN_DEBUG "jffs2_reserve_space() for dirent failed\n")); + return ret; + } + + rd = jffs2_alloc_raw_dirent(); + if (!rd) { + /* Argh. Now we treat it like a normal delete */ + jffs2_complete_reservation(c); + return -ENOMEM; + } + + mutex_lock(&dir_f->sem); + + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_f->inocache->ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = ri->ino; + rd->mctime = ri->ctime; + rd->nsize = qstr->len; + rd->type = DT_REG; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len)); + + fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL); + + jffs2_free_raw_dirent(rd); + + if (IS_ERR(fd)) { + /* dirent failed to write. Delete the inode normally + as if it were the final unlink() */ + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + return PTR_ERR(fd); + } + + /* Link the fd into the inode's list, obsoleting an old + one if necessary. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + + return 0; +} + + +int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, + const char *name, int namelen, struct jffs2_inode_info *dead_f, + uint32_t time) +{ + struct jffs2_raw_dirent *rd; + struct jffs2_full_dirent *fd; + uint32_t alloclen; + int ret; + + if (!jffs2_can_mark_obsolete(c)) { + /* We can't mark stuff obsolete on the medium. 
We need to write a deletion dirent */ + + rd = jffs2_alloc_raw_dirent(); + if (!rd) + return -ENOMEM; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_DELETION, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) { + jffs2_free_raw_dirent(rd); + return ret; + } + + mutex_lock(&dir_f->sem); + + /* Build a deletion node */ + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_f->inocache->ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(0); + rd->mctime = cpu_to_je32(time); + rd->nsize = namelen; + rd->type = DT_UNKNOWN; + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_DELETION); + + jffs2_free_raw_dirent(rd); + + if (IS_ERR(fd)) { + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + return PTR_ERR(fd); + } + + /* File it. This will mark the old one obsolete. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + mutex_unlock(&dir_f->sem); + } else { + uint32_t nhash = full_name_hash(name, namelen); + + fd = dir_f->dents; + /* We don't actually want to reserve any space, but we do + want to be holding the alloc_sem when we write to flash */ + mutex_lock(&c->alloc_sem); + mutex_lock(&dir_f->sem); + + for (fd = dir_f->dents; fd; fd = fd->next) { + if (fd->nhash == nhash && + !memcmp(fd->name, name, namelen) && + !fd->name[namelen]) { + + D1(printk(KERN_DEBUG "Marking old dirent node (ino #%u) @%08x obsolete\n", + fd->ino, ref_offset(fd->raw))); + jffs2_mark_node_obsolete(c, fd->raw); + /* We don't want to remove it from the list immediately, + because that screws up getdents()/seek() semantics even + more than they're screwed already. Turn it into a + node-less deletion dirent instead -- a placeholder */ + fd->raw = NULL; + fd->ino = 0; + break; + } + } + mutex_unlock(&dir_f->sem); + } + + /* dead_f is NULL if this was a rename not a real unlink */ + /* Also catch the !f->inocache case, where there was a dirent + pointing to an inode which didn't exist. 
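(Editor's aside -- not part of the patch: both branches above rely on the same on-flash rule: for a given directory and name, the dirent node with the highest version wins, and a winning dirent whose ino is 0 means the name has been deleted. A hedged sketch of that resolution rule; example_resolve_name is a hypothetical helper, not a function in this patch:

    /* Illustrative only. */
    static uint32_t example_resolve_name(struct jffs2_full_dirent *list,
                                         const char *name)
    {
            struct jffs2_full_dirent *fd, *best = NULL;

            for (fd = list; fd; fd = fd->next)
                    if (!strcmp((char *)fd->name, name) &&
                        (!best || fd->version > best->version))
                            best = fd;

            /* ino == 0: the newest node for this name is a deletion marker */
            return best ? best->ino : 0;
    }
)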
*/ + if (dead_f && dead_f->inocache) { + + mutex_lock(&dead_f->sem); + + if (S_ISDIR(OFNI_EDONI_2SFFJ(dead_f)->i_mode)) { + while (dead_f->dents) { + /* There can be only deleted ones */ + fd = dead_f->dents; + + dead_f->dents = fd->next; + + if (fd->ino) { + printk(KERN_WARNING "Deleting inode #%u with active dentry \"%s\"->ino #%u\n", + dead_f->inocache->ino, fd->name, fd->ino); + } else { + D1(printk(KERN_DEBUG "Removing deletion dirent for \"%s\" from dir ino #%u\n", + fd->name, dead_f->inocache->ino)); + } + if (fd->raw) + jffs2_mark_node_obsolete(c, fd->raw); + jffs2_free_full_dirent(fd); + } + dead_f->inocache->pino_nlink = 0; + } else + dead_f->inocache->pino_nlink--; + /* NB: Caller must set inode nlink if appropriate */ + mutex_unlock(&dead_f->sem); + } + + jffs2_complete_reservation(c); + + return 0; +} + + +int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, uint8_t type, const char *name, int namelen, uint32_t time) +{ + struct jffs2_raw_dirent *rd; + struct jffs2_full_dirent *fd; + uint32_t alloclen; + int ret; + + rd = jffs2_alloc_raw_dirent(); + if (!rd) + return -ENOMEM; + + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + if (ret) { + jffs2_free_raw_dirent(rd); + return ret; + } + + mutex_lock(&dir_f->sem); + + /* Build a deletion node */ + rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); + rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); + + rd->pino = cpu_to_je32(dir_f->inocache->ino); + rd->version = cpu_to_je32(++dir_f->highest_version); + rd->ino = cpu_to_je32(ino); + rd->mctime = cpu_to_je32(time); + rd->nsize = namelen; + + rd->type = type; + + rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); + + fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); + + jffs2_free_raw_dirent(rd); + + if (IS_ERR(fd)) { + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + return PTR_ERR(fd); + } + + /* File it. This will mark the old one obsolete. */ + jffs2_add_fd_to_list(c, fd, &dir_f->dents); + + jffs2_complete_reservation(c); + mutex_unlock(&dir_f->sem); + + return 0; +} diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c new file mode 100644 index 00000000..b9276b11 --- /dev/null +++ b/fs/jffs2/writev.c @@ -0,0 +1,79 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2001-2007 Red Hat, Inc. + * + * Created by David Woodhouse + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include "nodelist.h" + +/* This ought to be in core MTD code. All registered MTD devices + without writev should have this put in place. 
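(Editor's note: in the mtd_fake_writev() body just below, a run of characters was lost in extraction -- the loop header and the mtd->write() call were fused into "for (i=0; iwrite(...)". A minimal reconstruction of the intended per-vector emulation, hedged against the fragments that did survive:

    /* Reconstruction sketch -- write each kvec element with the driver's
     * single-buffer write method and accumulate the completed length. */
    for (i = 0; i < count; i++) {
            ret = mtd->write(mtd, to, vecs[i].iov_len, &thislen,
                             vecs[i].iov_base);
            totlen += thislen;
            if (ret || thislen != vecs[i].iov_len)
                    break;
            to += vecs[i].iov_len;
    }
)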
Bug the MTD + maintainer */ +static inline int mtd_fake_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) +{ + unsigned long i; + size_t totlen = 0, thislen; + int ret = 0; + + for (i=0; iwrite(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base); + totlen += thislen; + if (ret || thislen != vecs[i].iov_len) + break; + to += vecs[i].iov_len; + } + if (retlen) + *retlen = totlen; + return ret; +} + +int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) +{ + if (!jffs2_is_writebuffered(c)) { + if (jffs2_sum_active()) { + int res; + res = jffs2_sum_add_kvec(c, vecs, count, (uint32_t) to); + if (res) { + return res; + } + } + } + + if (c->mtd->writev) + return c->mtd->writev(c->mtd, vecs, count, to, retlen); + else { + return mtd_fake_writev(c->mtd, vecs, count, to, retlen); + } +} + +int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, + size_t *retlen, const u_char *buf) +{ + int ret; + ret = c->mtd->write(c->mtd, ofs, len, retlen, buf); + + if (jffs2_sum_active()) { + struct kvec vecs[1]; + int res; + + vecs[0].iov_base = (unsigned char *) buf; + vecs[0].iov_len = len; + + res = jffs2_sum_add_kvec(c, vecs, 1, (uint32_t) ofs); + if (res) { + return res; + } + } + return ret; +} diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c new file mode 100644 index 00000000..3e93cdd1 --- /dev/null +++ b/fs/jffs2/xattr.c @@ -0,0 +1,1330 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nodelist.h" +/* -------- xdatum related functions ---------------- + * xattr_datum_hashkey(xprefix, xname, xvalue, xsize) + * is used to calcurate xdatum hashkey. The reminder of hashkey into XATTRINDEX_HASHSIZE is + * the index of the xattr name/value pair cache (c->xattrindex). + * is_xattr_datum_unchecked(c, xd) + * returns 1, if xdatum contains any unchecked raw nodes. if all raw nodes are not + * unchecked, it returns 0. + * unload_xattr_datum(c, xd) + * is used to release xattr name/value pair and detach from c->xattrindex. + * reclaim_xattr_datum(c) + * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when + * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold + * is hard coded as 32KiB. + * do_verify_xattr_datum(c, xd) + * is used to load the xdatum informations without name/value pair from the medium. + * It's necessary once, because those informations are not collected during mounting + * process when EBS is enabled. + * 0 will be returned, if success. An negative return value means recoverable error, and + * positive return value means unrecoverable error. Thus, caller must remove this xdatum + * and xref when it returned positive value. + * do_load_xattr_datum(c, xd) + * is used to load name/value pair from the medium. + * The meanings of return value is same as do_verify_xattr_datum(). + * load_xattr_datum(c, xd) + * is used to be as a wrapper of do_verify_xattr_datum() and do_load_xattr_datum(). + * If xd need to call do_verify_xattr_datum() at first, it's called before calling + * do_load_xattr_datum(). The meanings of return value is same as do_verify_xattr_datum(). 
+ * save_xattr_datum(c, xd) + * is used to write xdatum to medium. xd->version will be incremented. + * create_xattr_datum(c, xprefix, xname, xvalue, xsize) + * is used to create new xdatum and write to medium. + * unrefer_xattr_datum(c, xd) + * is used to delete a xdatum. When nobody refers this xdatum, JFFS2_XFLAGS_DEAD + * is set on xd->flags and chained xattr_dead_list or release it immediately. + * In the first case, the garbage collector release it later. + * -------------------------------------------------- */ +static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize) +{ + int name_len = strlen(xname); + + return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize); +} + +static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + struct jffs2_raw_node_ref *raw; + int rc = 0; + + spin_lock(&c->erase_completion_lock); + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + if (ref_flags(raw) == REF_UNCHECKED) { + rc = 1; + break; + } + } + spin_unlock(&c->erase_completion_lock); + return rc; +} + +static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + D1(dbg_xattr("%s: xid=%u, version=%u\n", __func__, xd->xid, xd->version)); + if (xd->xname) { + c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len); + kfree(xd->xname); + } + + list_del_init(&xd->xindex); + xd->hashkey = 0; + xd->xname = NULL; + xd->xvalue = NULL; +} + +static void reclaim_xattr_datum(struct jffs2_sb_info *c) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd, *_xd; + uint32_t target, before; + static int index = 0; + int count; + + if (c->xdatum_mem_threshold > c->xdatum_mem_usage) + return; + + before = c->xdatum_mem_usage; + target = c->xdatum_mem_usage * 4 / 5; /* 20% reduction */ + for (count = 0; count < XATTRINDEX_HASHSIZE; count++) { + list_for_each_entry_safe(xd, _xd, &c->xattrindex[index], xindex) { + if (xd->flags & JFFS2_XFLAGS_HOT) { + xd->flags &= ~JFFS2_XFLAGS_HOT; + } else if (!(xd->flags & JFFS2_XFLAGS_BIND)) { + unload_xattr_datum(c, xd); + } + if (c->xdatum_mem_usage <= target) + goto out; + } + index = (index+1) % XATTRINDEX_HASHSIZE; + } + out: + JFFS2_NOTICE("xdatum_mem_usage from %u byte to %u byte (%u byte reclaimed)\n", + before, c->xdatum_mem_usage, before - c->xdatum_mem_usage); +} + +static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + struct jffs2_raw_xattr rx; + size_t readlen; + uint32_t crc, offset, totlen; + int rc; + + spin_lock(&c->erase_completion_lock); + offset = ref_offset(xd->node); + if (ref_flags(xd->node) == REF_PRISTINE) + goto complete; + spin_unlock(&c->erase_completion_lock); + + rc = jffs2_flash_read(c, offset, sizeof(rx), &readlen, (char *)&rx); + if (rc || readlen != sizeof(rx)) { + JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n", + rc, sizeof(rx), readlen, offset); + return rc ? 
rc : -EIO; + } + crc = crc32(0, &rx, sizeof(rx) - 4); + if (crc != je32_to_cpu(rx.node_crc)) { + JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + offset, je32_to_cpu(rx.hdr_crc), crc); + xd->flags |= JFFS2_XFLAGS_INVALID; + return -EIO; + } + totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); + if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK + || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR + || je32_to_cpu(rx.totlen) != totlen + || je32_to_cpu(rx.xid) != xd->xid + || je32_to_cpu(rx.version) != xd->version) { + JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, " + "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n", + offset, je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK, + je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR, + je32_to_cpu(rx.totlen), totlen, + je32_to_cpu(rx.xid), xd->xid, + je32_to_cpu(rx.version), xd->version); + xd->flags |= JFFS2_XFLAGS_INVALID; + return -EIO; + } + xd->xprefix = rx.xprefix; + xd->name_len = rx.name_len; + xd->value_len = je16_to_cpu(rx.value_len); + xd->data_crc = je32_to_cpu(rx.data_crc); + + spin_lock(&c->erase_completion_lock); + complete: + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + if (ref_flags(raw) == REF_UNCHECKED) { + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + } + raw->flash_offset = ref_offset(raw) | ((xd->node==raw) ? REF_PRISTINE : REF_NORMAL); + } + spin_unlock(&c->erase_completion_lock); + + /* unchecked xdatum is chained with c->xattr_unchecked */ + list_del_init(&xd->xindex); + + dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n", + xd->xid, xd->version); + + return 0; +} + +static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + char *data; + size_t readlen; + uint32_t crc, length; + int i, ret, retry = 0; + + BUG_ON(ref_flags(xd->node) != REF_PRISTINE); + BUG_ON(!list_empty(&xd->xindex)); + retry: + length = xd->name_len + 1 + xd->value_len; + data = kmalloc(length, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = jffs2_flash_read(c, ref_offset(xd->node)+sizeof(struct jffs2_raw_xattr), + length, &readlen, data); + + if (ret || length!=readlen) { + JFFS2_WARNING("jffs2_flash_read() returned %d, request=%d, readlen=%zu, at %#08x\n", + ret, length, readlen, ref_offset(xd->node)); + kfree(data); + return ret ? 
ret : -EIO; + } + + data[xd->name_len] = '\0'; + crc = crc32(0, data, length); + if (crc != xd->data_crc) { + JFFS2_WARNING("node CRC failed (JFFS2_NODETYPE_XREF)" + " at %#08x, read: 0x%08x calculated: 0x%08x\n", + ref_offset(xd->node), xd->data_crc, crc); + kfree(data); + xd->flags |= JFFS2_XFLAGS_INVALID; + return -EIO; + } + + xd->flags |= JFFS2_XFLAGS_HOT; + xd->xname = data; + xd->xvalue = data + xd->name_len+1; + + c->xdatum_mem_usage += length; + + xd->hashkey = xattr_datum_hashkey(xd->xprefix, xd->xname, xd->xvalue, xd->value_len); + i = xd->hashkey % XATTRINDEX_HASHSIZE; + list_add(&xd->xindex, &c->xattrindex[i]); + if (!retry) { + retry = 1; + reclaim_xattr_datum(c); + if (!xd->xname) + goto retry; + } + + dbg_xattr("success on loading xdatum (xid=%u, xprefix=%u, xname='%s')\n", + xd->xid, xd->xprefix, xd->xname); + + return 0; +} + +static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem); + * rc < 0 : recoverable error, try again + * rc = 0 : success + * rc > 0 : Unrecoverable error, this node should be deleted. + */ + int rc = 0; + + BUG_ON(xd->flags & JFFS2_XFLAGS_DEAD); + if (xd->xname) + return 0; + if (xd->flags & JFFS2_XFLAGS_INVALID) + return -EIO; + if (unlikely(is_xattr_datum_unchecked(c, xd))) + rc = do_verify_xattr_datum(c, xd); + if (!rc) + rc = do_load_xattr_datum(c, xd); + return rc; +} + +static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_raw_xattr rx; + struct kvec vecs[2]; + size_t length; + int rc, totlen; + uint32_t phys_ofs = write_ofs(c); + + BUG_ON(!xd->xname); + BUG_ON(xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID)); + + vecs[0].iov_base = ℞ + vecs[0].iov_len = sizeof(rx); + vecs[1].iov_base = xd->xname; + vecs[1].iov_len = xd->name_len + 1 + xd->value_len; + totlen = vecs[0].iov_len + vecs[1].iov_len; + + /* Setup raw-xattr */ + memset(&rx, 0, sizeof(rx)); + rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR); + rx.totlen = cpu_to_je32(PAD(totlen)); + rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4)); + + rx.xid = cpu_to_je32(xd->xid); + rx.version = cpu_to_je32(++xd->version); + rx.xprefix = xd->xprefix; + rx.name_len = xd->name_len; + rx.value_len = cpu_to_je16(xd->value_len); + rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len)); + rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4)); + + rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0); + if (rc || totlen != length) { + JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n", + rc, totlen, length, phys_ofs); + rc = rc ? 
rc : -EIO; + if (length) + jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(totlen), NULL); + + return rc; + } + /* success */ + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), (void *)xd); + + dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n", + xd->xid, xd->version, xd->xprefix, xd->xname); + + return 0; +} + +static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c, + int xprefix, const char *xname, + const char *xvalue, int xsize) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd; + uint32_t hashkey, name_len; + char *data; + int i, rc; + + /* Search xattr_datum has same xname/xvalue by index */ + hashkey = xattr_datum_hashkey(xprefix, xname, xvalue, xsize); + i = hashkey % XATTRINDEX_HASHSIZE; + list_for_each_entry(xd, &c->xattrindex[i], xindex) { + if (xd->hashkey==hashkey + && xd->xprefix==xprefix + && xd->value_len==xsize + && !strcmp(xd->xname, xname) + && !memcmp(xd->xvalue, xvalue, xsize)) { + atomic_inc(&xd->refcnt); + return xd; + } + } + + /* Not found, Create NEW XATTR-Cache */ + name_len = strlen(xname); + + xd = jffs2_alloc_xattr_datum(); + if (!xd) + return ERR_PTR(-ENOMEM); + + data = kmalloc(name_len + 1 + xsize, GFP_KERNEL); + if (!data) { + jffs2_free_xattr_datum(xd); + return ERR_PTR(-ENOMEM); + } + strcpy(data, xname); + memcpy(data + name_len + 1, xvalue, xsize); + + atomic_set(&xd->refcnt, 1); + xd->xid = ++c->highest_xid; + xd->flags |= JFFS2_XFLAGS_HOT; + xd->xprefix = xprefix; + + xd->hashkey = hashkey; + xd->xname = data; + xd->xvalue = data + name_len + 1; + xd->name_len = name_len; + xd->value_len = xsize; + xd->data_crc = crc32(0, data, xd->name_len + 1 + xd->value_len); + + rc = save_xattr_datum(c, xd); + if (rc) { + kfree(xd->xname); + jffs2_free_xattr_datum(xd); + return ERR_PTR(rc); + } + + /* Insert Hash Index */ + i = hashkey % XATTRINDEX_HASHSIZE; + list_add(&xd->xindex, &c->xattrindex[i]); + + c->xdatum_mem_usage += (xd->name_len + 1 + xd->value_len); + reclaim_xattr_datum(c); + + return xd; +} + +static void unrefer_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + if (atomic_dec_and_lock(&xd->refcnt, &c->erase_completion_lock)) { + unload_xattr_datum(c, xd); + xd->flags |= JFFS2_XFLAGS_DEAD; + if (xd->node == (void *)xd) { + BUG_ON(!(xd->flags & JFFS2_XFLAGS_INVALID)); + jffs2_free_xattr_datum(xd); + } else { + list_add(&xd->xindex, &c->xattr_dead_list); + } + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", + xd->xid, xd->version); + } +} + +/* -------- xref related functions ------------------ + * verify_xattr_ref(c, ref) + * is used to load xref information from medium. Because summary data does not + * contain xid/ino, it's necessary to verify once while mounting process. + * save_xattr_ref(c, ref) + * is used to write xref to medium. If delete marker is marked, it write + * a delete marker of xref into medium. + * create_xattr_ref(c, ic, xd) + * is used to create a new xref and write to medium. + * delete_xattr_ref(c, ref) + * is used to delete jffs2_xattr_ref. It marks xref XREF_DELETE_MARKER, + * and allows GC to reclaim those physical nodes. + * jffs2_xattr_delete_inode(c, ic) + * is called to remove xrefs related to obsolete inode when inode is unlinked. + * jffs2_xattr_free_inode(c, ic) + * is called to release xattr related objects when unmounting. 
+ * check_xattr_ref_inode(c, ic) + * is used to confirm inode does not have duplicate xattr name/value pair. + * -------------------------------------------------- */ +static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + struct jffs2_raw_xref rr; + size_t readlen; + uint32_t crc, offset, totlen; + int rc; + + spin_lock(&c->erase_completion_lock); + if (ref_flags(ref->node) != REF_UNCHECKED) + goto complete; + offset = ref_offset(ref->node); + spin_unlock(&c->erase_completion_lock); + + rc = jffs2_flash_read(c, offset, sizeof(rr), &readlen, (char *)&rr); + if (rc || sizeof(rr) != readlen) { + JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n", + rc, sizeof(rr), readlen, offset); + return rc ? rc : -EIO; + } + /* obsolete node */ + crc = crc32(0, &rr, sizeof(rr) - 4); + if (crc != je32_to_cpu(rr.node_crc)) { + JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + offset, je32_to_cpu(rr.node_crc), crc); + return -EIO; + } + if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK + || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF + || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) { + JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, " + "nodetype=%#04x/%#04x, totlen=%u/%zu\n", + offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, + je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, + je32_to_cpu(rr.totlen), PAD(sizeof(rr))); + return -EIO; + } + ref->ino = je32_to_cpu(rr.ino); + ref->xid = je32_to_cpu(rr.xid); + ref->xseqno = je32_to_cpu(rr.xseqno); + if (ref->xseqno > c->highest_xseqno) + c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER); + + spin_lock(&c->erase_completion_lock); + complete: + for (raw=ref->node; raw != (void *)ref; raw=raw->next_in_ino) { + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + if (ref_flags(raw) == REF_UNCHECKED) { + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + } + raw->flash_offset = ref_offset(raw) | ((ref->node==raw) ? REF_PRISTINE : REF_NORMAL); + } + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n", + ref->ino, ref->xid, ref_offset(ref->node)); + return 0; +} + +static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_raw_xref rr; + size_t length; + uint32_t xseqno, phys_ofs = write_ofs(c); + int ret; + + rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); + rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF); + rr.totlen = cpu_to_je32(PAD(sizeof(rr))); + rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4)); + + xseqno = (c->highest_xseqno += 2); + if (is_xattr_ref_dead(ref)) { + xseqno |= XREF_DELETE_MARKER; + rr.ino = cpu_to_je32(ref->ino); + rr.xid = cpu_to_je32(ref->xid); + } else { + rr.ino = cpu_to_je32(ref->ic->ino); + rr.xid = cpu_to_je32(ref->xd->xid); + } + rr.xseqno = cpu_to_je32(xseqno); + rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4)); + + ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr); + if (ret || sizeof(rr) != length) { + JFFS2_WARNING("jffs2_flash_write() returned %d, request=%zu, retlen=%zu, at %#08x\n", + ret, sizeof(rr), length, phys_ofs); + ret = ret ? 
ret : -EIO; + if (length) + jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(sizeof(rr)), NULL); + + return ret; + } + /* success */ + ref->xseqno = xseqno; + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), (void *)ref); + + dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid); + + return 0; +} + +static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, + struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_ref *ref; + int ret; + + ref = jffs2_alloc_xattr_ref(); + if (!ref) + return ERR_PTR(-ENOMEM); + ref->ic = ic; + ref->xd = xd; + + ret = save_xattr_ref(c, ref); + if (ret) { + jffs2_free_xattr_ref(ref); + return ERR_PTR(ret); + } + + /* Chain to inode */ + ref->next = ic->xref; + ic->xref = ref; + + return ref; /* success */ +} + +static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd; + + xd = ref->xd; + ref->xseqno |= XREF_DELETE_MARKER; + ref->ino = ref->ic->ino; + ref->xid = ref->xd->xid; + spin_lock(&c->erase_completion_lock); + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n", + ref->ino, ref->xid, ref->xseqno); + + unrefer_xattr_datum(c, xd); +} + +void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + /* It's called from jffs2_evict_inode() on inode removing. + When an inode with XATTR is removed, those XATTRs must be removed. */ + struct jffs2_xattr_ref *ref, *_ref; + + if (!ic || ic->pino_nlink > 0) + return; + + down_write(&c->xattr_sem); + for (ref = ic->xref; ref; ref = _ref) { + _ref = ref->next; + delete_xattr_ref(c, ref); + } + ic->xref = NULL; + up_write(&c->xattr_sem); +} + +void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + /* It's called from jffs2_free_ino_caches() until unmounting FS. */ + struct jffs2_xattr_datum *xd; + struct jffs2_xattr_ref *ref, *_ref; + + down_write(&c->xattr_sem); + for (ref = ic->xref; ref; ref = _ref) { + _ref = ref->next; + xd = ref->xd; + if (atomic_dec_and_test(&xd->refcnt)) { + unload_xattr_datum(c, xd); + jffs2_free_xattr_datum(xd); + } + jffs2_free_xattr_ref(ref); + } + ic->xref = NULL; + up_write(&c->xattr_sem); +} + +static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) +{ + /* success of check_xattr_ref_inode() means that inode (ic) dose not have + * duplicate name/value pairs. If duplicate name/value pair would be found, + * one will be removed. 
+ */ + struct jffs2_xattr_ref *ref, *cmp, **pref, **pcmp; + int rc = 0; + + if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED)) + return 0; + down_write(&c->xattr_sem); + retry: + rc = 0; + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + if (!ref->xd->xname) { + rc = load_xattr_datum(c, ref->xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + for (cmp=ref->next, pcmp=&ref->next; cmp; pcmp=&cmp->next, cmp=cmp->next) { + if (!cmp->xd->xname) { + ref->xd->flags |= JFFS2_XFLAGS_BIND; + rc = load_xattr_datum(c, cmp->xd); + ref->xd->flags &= ~JFFS2_XFLAGS_BIND; + if (unlikely(rc > 0)) { + *pcmp = cmp->next; + delete_xattr_ref(c, cmp); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + if (ref->xd->xprefix == cmp->xd->xprefix + && !strcmp(ref->xd->xname, cmp->xd->xname)) { + if (ref->xseqno > cmp->xseqno) { + *pcmp = cmp->next; + delete_xattr_ref(c, cmp); + } else { + *pref = ref->next; + delete_xattr_ref(c, ref); + } + goto retry; + } + } + } + ic->flags |= INO_FLAGS_XATTR_CHECKED; + out: + up_write(&c->xattr_sem); + + return rc; +} + +/* -------- xattr subsystem functions --------------- + * jffs2_init_xattr_subsystem(c) + * is used to initialize semaphore and list_head, and some variables. + * jffs2_find_xattr_datum(c, xid) + * is used to lookup xdatum while scanning process. + * jffs2_clear_xattr_subsystem(c) + * is used to release any xattr related objects. + * jffs2_build_xattr_subsystem(c) + * is used to associate xdatum and xref while super block building process. + * jffs2_setup_xattr_datum(c, xid, version) + * is used to insert xdatum while scanning process. + * -------------------------------------------------- */ +void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c) +{ + int i; + + for (i=0; i < XATTRINDEX_HASHSIZE; i++) + INIT_LIST_HEAD(&c->xattrindex[i]); + INIT_LIST_HEAD(&c->xattr_unchecked); + INIT_LIST_HEAD(&c->xattr_dead_list); + c->xref_dead_list = NULL; + c->xref_temp = NULL; + + init_rwsem(&c->xattr_sem); + c->highest_xid = 0; + c->highest_xseqno = 0; + c->xdatum_mem_usage = 0; + c->xdatum_mem_threshold = 32 * 1024; /* Default 32KB */ +} + +static struct jffs2_xattr_datum *jffs2_find_xattr_datum(struct jffs2_sb_info *c, uint32_t xid) +{ + struct jffs2_xattr_datum *xd; + int i = xid % XATTRINDEX_HASHSIZE; + + /* It's only used in scanning/building process. 
*/ + BUG_ON(!(c->flags & (JFFS2_SB_FLAG_SCANNING|JFFS2_SB_FLAG_BUILDING))); + + list_for_each_entry(xd, &c->xattrindex[i], xindex) { + if (xd->xid==xid) + return xd; + } + return NULL; +} + +void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) +{ + struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_xattr_ref *ref, *_ref; + int i; + + for (ref=c->xref_temp; ref; ref = _ref) { + _ref = ref->next; + jffs2_free_xattr_ref(ref); + } + + for (ref=c->xref_dead_list; ref; ref = _ref) { + _ref = ref->next; + jffs2_free_xattr_ref(ref); + } + + for (i=0; i < XATTRINDEX_HASHSIZE; i++) { + list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { + list_del(&xd->xindex); + if (xd->xname) + kfree(xd->xname); + jffs2_free_xattr_datum(xd); + } + } + + list_for_each_entry_safe(xd, _xd, &c->xattr_dead_list, xindex) { + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); + } + list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) { + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); + } +} + +#define XREF_TMPHASH_SIZE (128) +void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) +{ + struct jffs2_xattr_ref *ref, *_ref; + struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE]; + struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_inode_cache *ic; + struct jffs2_raw_node_ref *raw; + int i, xdatum_count = 0, xdatum_unchecked_count = 0, xref_count = 0; + int xdatum_orphan_count = 0, xref_orphan_count = 0, xref_dead_count = 0; + + BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING)); + + /* Phase.1 : Merge same xref */ + for (i=0; i < XREF_TMPHASH_SIZE; i++) + xref_tmphash[i] = NULL; + for (ref=c->xref_temp; ref; ref=_ref) { + struct jffs2_xattr_ref *tmp; + + _ref = ref->next; + if (ref_flags(ref->node) != REF_PRISTINE) { + if (verify_xattr_ref(c, ref)) { + BUG_ON(ref->node->next_in_ino != (void *)ref); + ref->node->next_in_ino = NULL; + jffs2_mark_node_obsolete(c, ref->node); + jffs2_free_xattr_ref(ref); + continue; + } + } + + i = (ref->ino ^ ref->xid) % XREF_TMPHASH_SIZE; + for (tmp=xref_tmphash[i]; tmp; tmp=tmp->next) { + if (tmp->ino == ref->ino && tmp->xid == ref->xid) + break; + } + if (tmp) { + raw = ref->node; + if (ref->xseqno > tmp->xseqno) { + tmp->xseqno = ref->xseqno; + raw->next_in_ino = tmp->node; + tmp->node = raw; + } else { + raw->next_in_ino = tmp->node->next_in_ino; + tmp->node->next_in_ino = raw; + } + jffs2_free_xattr_ref(ref); + continue; + } else { + ref->next = xref_tmphash[i]; + xref_tmphash[i] = ref; + } + } + c->xref_temp = NULL; + + /* Phase.2 : Bind xref with inode_cache and xattr_datum */ + for (i=0; i < XREF_TMPHASH_SIZE; i++) { + for (ref=xref_tmphash[i]; ref; ref=_ref) { + xref_count++; + _ref = ref->next; + if (is_xattr_ref_dead(ref)) { + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + xref_dead_count++; + continue; + } + /* At this point, ref->xid and ref->ino contain XID and inode number. + ref->xd and ref->ic are not valid yet. 
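(Editor's aside -- not part of the patch: the raw numbers and the bound pointers can share fields because struct jffs2_xattr_ref, in xattr.h further below, keeps {ic, ino} and {xd, xid} in unions; the scan-time values read from flash occupy the same storage that this binding phase overwrites with the inode-cache and xdatum pointers.)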
*/ + xd = jffs2_find_xattr_datum(c, ref->xid); + ic = jffs2_get_ino_cache(c, ref->ino); + if (!xd || !ic || !ic->pino_nlink) { + dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) is orphan.\n", + ref->ino, ref->xid, ref->xseqno); + ref->xseqno |= XREF_DELETE_MARKER; + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + xref_orphan_count++; + continue; + } + ref->xd = xd; + ref->ic = ic; + atomic_inc(&xd->refcnt); + ref->next = ic->xref; + ic->xref = ref; + } + } + + /* Phase.3 : Link unchecked xdatum to xattr_unchecked list */ + for (i=0; i < XATTRINDEX_HASHSIZE; i++) { + list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { + xdatum_count++; + list_del_init(&xd->xindex); + if (!atomic_read(&xd->refcnt)) { + dbg_xattr("xdatum(xid=%u, version=%u) is orphan.\n", + xd->xid, xd->version); + xd->flags |= JFFS2_XFLAGS_DEAD; + list_add(&xd->xindex, &c->xattr_unchecked); + xdatum_orphan_count++; + continue; + } + if (is_xattr_datum_unchecked(c, xd)) { + dbg_xattr("unchecked xdatum(xid=%u, version=%u)\n", + xd->xid, xd->version); + list_add(&xd->xindex, &c->xattr_unchecked); + xdatum_unchecked_count++; + } + } + } + /* build complete */ + JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum" + " (%u unchecked, %u orphan) and " + "%u of xref (%u dead, %u orphan) found.\n", + xdatum_count, xdatum_unchecked_count, xdatum_orphan_count, + xref_count, xref_dead_count, xref_orphan_count); +} + +struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, + uint32_t xid, uint32_t version) +{ + struct jffs2_xattr_datum *xd; + + xd = jffs2_find_xattr_datum(c, xid); + if (!xd) { + xd = jffs2_alloc_xattr_datum(); + if (!xd) + return ERR_PTR(-ENOMEM); + xd->xid = xid; + xd->version = version; + if (xd->xid > c->highest_xid) + c->highest_xid = xd->xid; + list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]); + } + return xd; +} + +/* -------- xattr subsystem functions --------------- + * xprefix_to_handler(xprefix) + * is used to translate xprefix into xattr_handler. + * jffs2_listxattr(dentry, buffer, size) + * is an implementation of listxattr handler on jffs2. + * do_jffs2_getxattr(inode, xprefix, xname, buffer, size) + * is an implementation of getxattr handler on jffs2. + * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) + * is an implementation of setxattr handler on jffs2. 
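(Editor's aside -- not part of the patch: a concrete call path may make the split clearer; "user.foo" and "bar" are made-up example names. Generic VFS code resolves the handler by its "user." prefix, passes the handler the suffix, and the handler forwards to the helpers above:

    /* Illustrative only. */
    static int example_user_xattr_roundtrip(struct dentry *dentry)
    {
            char buf[16];
            int rc;

            /* what jffs2_user_setxattr() ends up doing for "user.foo" */
            rc = do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
                                   "foo", "bar", 3, 0);
            if (rc)
                    return rc;

            /* and the matching read path used by jffs2_user_getxattr() */
            return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
                                     "foo", buf, sizeof(buf));
    }
)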
+ * -------------------------------------------------- */ +const struct xattr_handler *jffs2_xattr_handlers[] = { + &jffs2_user_xattr_handler, +#ifdef CONFIG_JFFS2_FS_SECURITY + &jffs2_security_xattr_handler, +#endif +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + &jffs2_acl_access_xattr_handler, + &jffs2_acl_default_xattr_handler, +#endif + &jffs2_trusted_xattr_handler, + NULL +}; + +static const struct xattr_handler *xprefix_to_handler(int xprefix) { + const struct xattr_handler *ret; + + switch (xprefix) { + case JFFS2_XPREFIX_USER: + ret = &jffs2_user_xattr_handler; + break; +#ifdef CONFIG_JFFS2_FS_SECURITY + case JFFS2_XPREFIX_SECURITY: + ret = &jffs2_security_xattr_handler; + break; +#endif +#ifdef CONFIG_JFFS2_FS_POSIX_ACL + case JFFS2_XPREFIX_ACL_ACCESS: + ret = &jffs2_acl_access_xattr_handler; + break; + case JFFS2_XPREFIX_ACL_DEFAULT: + ret = &jffs2_acl_default_xattr_handler; + break; +#endif + case JFFS2_XPREFIX_TRUSTED: + ret = &jffs2_trusted_xattr_handler; + break; + default: + ret = NULL; + break; + } + return ret; +} + +ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_cache *ic = f->inocache; + struct jffs2_xattr_ref *ref, **pref; + struct jffs2_xattr_datum *xd; + const struct xattr_handler *xhandle; + ssize_t len, rc; + int retry = 0; + + rc = check_xattr_ref_inode(c, ic); + if (unlikely(rc)) + return rc; + + down_read(&c->xattr_sem); + retry: + len = 0; + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + BUG_ON(ref->ic != ic); + xd = ref->xd; + if (!xd->xname) { + /* xdatum is unchached */ + if (!retry) { + retry = 1; + up_read(&c->xattr_sem); + down_write(&c->xattr_sem); + goto retry; + } else { + rc = load_xattr_datum(c, xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + } + xhandle = xprefix_to_handler(xd->xprefix); + if (!xhandle) + continue; + if (buffer) { + rc = xhandle->list(dentry, buffer+len, size-len, + xd->xname, xd->name_len, xd->flags); + } else { + rc = xhandle->list(dentry, NULL, 0, xd->xname, + xd->name_len, xd->flags); + } + if (rc < 0) + goto out; + len += rc; + } + rc = len; + out: + if (!retry) { + up_read(&c->xattr_sem); + } else { + up_write(&c->xattr_sem); + } + return rc; +} + +int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname, + char *buffer, size_t size) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_cache *ic = f->inocache; + struct jffs2_xattr_datum *xd; + struct jffs2_xattr_ref *ref, **pref; + int rc, retry = 0; + + rc = check_xattr_ref_inode(c, ic); + if (unlikely(rc)) + return rc; + + down_read(&c->xattr_sem); + retry: + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + BUG_ON(ref->ic!=ic); + + xd = ref->xd; + if (xd->xprefix != xprefix) + continue; + if (!xd->xname) { + /* xdatum is unchached */ + if (!retry) { + retry = 1; + up_read(&c->xattr_sem); + down_write(&c->xattr_sem); + goto retry; + } else { + rc = load_xattr_datum(c, xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) { + goto out; + } + } + } + if (!strcmp(xname, xd->xname)) { + rc = xd->value_len; + if (buffer) { + if (size < rc) { + rc = -ERANGE; + } else { + 
memcpy(buffer, xd->xvalue, rc); + } + } + goto out; + } + } + rc = -ENODATA; + out: + if (!retry) { + up_read(&c->xattr_sem); + } else { + up_write(&c->xattr_sem); + } + return rc; +} + +int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, + const char *buffer, size_t size, int flags) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_inode_cache *ic = f->inocache; + struct jffs2_xattr_datum *xd; + struct jffs2_xattr_ref *ref, *newref, **pref; + uint32_t length, request; + int rc; + + rc = check_xattr_ref_inode(c, ic); + if (unlikely(rc)) + return rc; + + request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size); + rc = jffs2_reserve_space(c, request, &length, + ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request); + return rc; + } + + /* Find existing xattr */ + down_write(&c->xattr_sem); + retry: + for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { + xd = ref->xd; + if (xd->xprefix != xprefix) + continue; + if (!xd->xname) { + rc = load_xattr_datum(c, xd); + if (unlikely(rc > 0)) { + *pref = ref->next; + delete_xattr_ref(c, ref); + goto retry; + } else if (unlikely(rc < 0)) + goto out; + } + if (!strcmp(xd->xname, xname)) { + if (flags & XATTR_CREATE) { + rc = -EEXIST; + goto out; + } + if (!buffer) { + ref->ino = ic->ino; + ref->xid = xd->xid; + ref->xseqno |= XREF_DELETE_MARKER; + rc = save_xattr_ref(c, ref); + if (!rc) { + *pref = ref->next; + spin_lock(&c->erase_completion_lock); + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + spin_unlock(&c->erase_completion_lock); + unrefer_xattr_datum(c, xd); + } else { + ref->ic = ic; + ref->xd = xd; + ref->xseqno &= ~XREF_DELETE_MARKER; + } + goto out; + } + goto found; + } + } + /* not found */ + if (flags & XATTR_REPLACE) { + rc = -ENODATA; + goto out; + } + if (!buffer) { + rc = -ENODATA; + goto out; + } + found: + xd = create_xattr_datum(c, xprefix, xname, buffer, size); + if (IS_ERR(xd)) { + rc = PTR_ERR(xd); + goto out; + } + up_write(&c->xattr_sem); + jffs2_complete_reservation(c); + + /* create xattr_ref */ + request = PAD(sizeof(struct jffs2_raw_xref)); + rc = jffs2_reserve_space(c, request, &length, + ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE); + down_write(&c->xattr_sem); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request); + unrefer_xattr_datum(c, xd); + up_write(&c->xattr_sem); + return rc; + } + if (ref) + *pref = ref->next; + newref = create_xattr_ref(c, ic, xd); + if (IS_ERR(newref)) { + if (ref) { + ref->next = ic->xref; + ic->xref = ref; + } + rc = PTR_ERR(newref); + unrefer_xattr_datum(c, xd); + } else if (ref) { + delete_xattr_ref(c, ref); + } + out: + up_write(&c->xattr_sem); + jffs2_complete_reservation(c); + return rc; +} + +/* -------- garbage collector functions ------------- + * jffs2_garbage_collect_xattr_datum(c, xd, raw) + * is used to move xdatum into new node. + * jffs2_garbage_collect_xattr_ref(c, ref, raw) + * is used to move xref into new node. + * jffs2_verify_xattr(c) + * is used to call do_verify_xattr_datum() before garbage collecting. + * jffs2_release_xattr_datum(c, xd) + * is used to release an in-memory object of xdatum. + * jffs2_release_xattr_ref(c, ref) + * is used to release an in-memory object of xref. 
+ * -------------------------------------------------- */ +int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, + struct jffs2_raw_node_ref *raw) +{ + uint32_t totlen, length, old_ofs; + int rc = 0; + + down_write(&c->xattr_sem); + if (xd->node != raw) + goto out; + if (xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID)) + goto out; + + rc = load_xattr_datum(c, xd); + if (unlikely(rc)) { + rc = (rc > 0) ? 0 : rc; + goto out; + } + old_ofs = ref_offset(xd->node); + totlen = PAD(sizeof(struct jffs2_raw_xattr) + + xd->name_len + 1 + xd->value_len); + rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen); + goto out; + } + rc = save_xattr_datum(c, xd); + if (!rc) + dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n", + xd->xid, xd->version, old_ofs, ref_offset(xd->node)); + out: + if (!rc) + jffs2_mark_node_obsolete(c, raw); + up_write(&c->xattr_sem); + return rc; +} + +int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, + struct jffs2_raw_node_ref *raw) +{ + uint32_t totlen, length, old_ofs; + int rc = 0; + + down_write(&c->xattr_sem); + BUG_ON(!ref->node); + + if (ref->node != raw) + goto out; + if (is_xattr_ref_dead(ref) && (raw->next_in_ino == (void *)ref)) + goto out; + + old_ofs = ref_offset(ref->node); + totlen = ref_totlen(c, c->gcblock, ref->node); + + rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE); + if (rc) { + JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", + __func__, rc, totlen); + rc = rc ? rc : -EBADFD; + goto out; + } + rc = save_xattr_ref(c, ref); + if (!rc) + dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n", + ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node)); + out: + if (!rc) + jffs2_mark_node_obsolete(c, raw); + up_write(&c->xattr_sem); + return rc; +} + +int jffs2_verify_xattr(struct jffs2_sb_info *c) +{ + struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + uint32_t totlen; + int rc; + + down_write(&c->xattr_sem); + list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) { + rc = do_verify_xattr_datum(c, xd); + if (rc < 0) + continue; + list_del_init(&xd->xindex); + spin_lock(&c->erase_completion_lock); + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + if (ref_flags(raw) != REF_UNCHECKED) + continue; + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + raw->flash_offset = ref_offset(raw) + | ((xd->node == (void *)raw) ? REF_PRISTINE : REF_NORMAL); + } + if (xd->flags & JFFS2_XFLAGS_DEAD) + list_add(&xd->xindex, &c->xattr_dead_list); + spin_unlock(&c->erase_completion_lock); + } + up_write(&c->xattr_sem); + return list_empty(&c->xattr_unchecked) ? 
1 : 0; +} + +void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under spin_lock(&c->erase_completion_lock) */ + if (atomic_read(&xd->refcnt) || xd->node != (void *)xd) + return; + + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); +} + +void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under spin_lock(&c->erase_completion_lock) */ + struct jffs2_xattr_ref *tmp, **ptmp; + + if (ref->node != (void *)ref) + return; + + for (tmp=c->xref_dead_list, ptmp=&c->xref_dead_list; tmp; ptmp=&tmp->next, tmp=tmp->next) { + if (ref == tmp) { + *ptmp = tmp->next; + break; + } + } + jffs2_free_xattr_ref(ref); +} diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h new file mode 100644 index 00000000..7be4beb3 --- /dev/null +++ b/fs/jffs2/xattr.h @@ -0,0 +1,131 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#ifndef _JFFS2_FS_XATTR_H_ +#define _JFFS2_FS_XATTR_H_ + +#include +#include + +#define JFFS2_XFLAGS_HOT (0x01) /* This datum is HOT */ +#define JFFS2_XFLAGS_BIND (0x02) /* This datum is not reclaimed */ +#define JFFS2_XFLAGS_DEAD (0x40) /* This datum is already dead */ +#define JFFS2_XFLAGS_INVALID (0x80) /* This datum contains crc error */ + +struct jffs2_xattr_datum +{ + void *always_null; + struct jffs2_raw_node_ref *node; + uint8_t class; + uint8_t flags; + uint16_t xprefix; /* see JFFS2_XATTR_PREFIX_* */ + + struct list_head xindex; /* chained from c->xattrindex[n] */ + atomic_t refcnt; /* # of xattr_ref refers this */ + uint32_t xid; + uint32_t version; + + uint32_t data_crc; + uint32_t hashkey; + char *xname; /* XATTR name without prefix */ + uint32_t name_len; /* length of xname */ + char *xvalue; /* XATTR value */ + uint32_t value_len; /* length of xvalue */ +}; + +struct jffs2_inode_cache; +struct jffs2_xattr_ref +{ + void *always_null; + struct jffs2_raw_node_ref *node; + uint8_t class; + uint8_t flags; /* Currently unused */ + u16 unused; + + uint32_t xseqno; + union { + struct jffs2_inode_cache *ic; /* reference to jffs2_inode_cache */ + uint32_t ino; /* only used in scanning/building */ + }; + union { + struct jffs2_xattr_datum *xd; /* reference to jffs2_xattr_datum */ + uint32_t xid; /* only used in sccanning/building */ + }; + struct jffs2_xattr_ref *next; /* chained from ic->xref_list */ +}; + +#define XREF_DELETE_MARKER (0x00000001) +static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref) +{ + return ((ref->xseqno & XREF_DELETE_MARKER) != 0); +} + +#ifdef CONFIG_JFFS2_FS_XATTR + +extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c); +extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c); +extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c); + +extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, + uint32_t xid, uint32_t version); + +extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); +extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); + +extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, + struct jffs2_raw_node_ref *raw); +extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, + struct jffs2_raw_node_ref *raw); +extern int jffs2_verify_xattr(struct 
jffs2_sb_info *c); +extern void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd); +extern void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref); + +extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname, + char *buffer, size_t size); +extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, + const char *buffer, size_t size, int flags); + +extern const struct xattr_handler *jffs2_xattr_handlers[]; +extern const struct xattr_handler jffs2_user_xattr_handler; +extern const struct xattr_handler jffs2_trusted_xattr_handler; + +extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); +#define jffs2_getxattr generic_getxattr +#define jffs2_setxattr generic_setxattr +#define jffs2_removexattr generic_removexattr + +#else + +#define jffs2_init_xattr_subsystem(c) +#define jffs2_build_xattr_subsystem(c) +#define jffs2_clear_xattr_subsystem(c) + +#define jffs2_xattr_delete_inode(c, ic) +#define jffs2_xattr_free_inode(c, ic) +#define jffs2_verify_xattr(c) (1) + +#define jffs2_xattr_handlers NULL +#define jffs2_listxattr NULL +#define jffs2_getxattr NULL +#define jffs2_setxattr NULL +#define jffs2_removexattr NULL + +#endif /* CONFIG_JFFS2_FS_XATTR */ + +#ifdef CONFIG_JFFS2_FS_SECURITY +extern int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); +extern const struct xattr_handler jffs2_security_xattr_handler; +#else +#define jffs2_init_security(inode,dir,qstr) (0) +#endif /* CONFIG_JFFS2_FS_SECURITY */ + +#endif /* _JFFS2_FS_XATTR_H_ */ diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c new file mode 100644 index 00000000..1c868194 --- /dev/null +++ b/fs/jffs2/xattr_trusted.c @@ -0,0 +1,55 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. + * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size); +} + +static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size, flags); +} + +static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; + + if (list && retlen<=list_size) { + strcpy(list, XATTR_TRUSTED_PREFIX); + strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name); + } + + return retlen; +} + +const struct xattr_handler jffs2_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = jffs2_trusted_listxattr, + .set = jffs2_trusted_setxattr, + .get = jffs2_trusted_getxattr +}; diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c new file mode 100644 index 00000000..916b5c96 --- /dev/null +++ b/fs/jffs2/xattr_user.c @@ -0,0 +1,55 @@ +/* + * JFFS2 -- Journalling Flash File System, Version 2. + * + * Copyright © 2006 NEC Corporation + * + * Created by KaiGai Kohei + * + * For licensing information, see the file 'LICENCE' in this directory. 
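(Editor's aside -- not part of the patch: the list handlers here follow the usual two-pass protocol -- calling jffs2_listxattr() with a NULL buffer returns the required size, and a second call fills the buffer with concatenated, NUL-terminated "prefix.name" strings. A minimal caller sketch, assuming kmalloc()/kfree() from the slab allocator:

    /* Illustrative only. */
    static ssize_t example_list_xattr_names(struct dentry *dentry, char **names)
    {
            ssize_t len = jffs2_listxattr(dentry, NULL, 0);   /* size query */
            char *buf;

            if (len <= 0)
                    return len;
            buf = kmalloc(len, GFP_KERNEL);
            if (!buf)
                    return -ENOMEM;
            len = jffs2_listxattr(dentry, buf, len);          /* fill pass */
            if (len < 0) {
                    kfree(buf);
                    return len;
            }
            *names = buf;   /* concatenated, NUL-terminated attribute names */
            return len;
    }
)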
+ * + */ + +#include +#include +#include +#include +#include +#include "nodelist.h" + +static int jffs2_user_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + name, buffer, size); +} + +static int jffs2_user_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + if (!strcmp(name, "")) + return -EINVAL; + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + name, buffer, size, flags); +} + +static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; + + if (list && retlen <= list_size) { + strcpy(list, XATTR_USER_PREFIX); + strcpy(list + XATTR_USER_PREFIX_LEN, name); + } + + return retlen; +} + +const struct xattr_handler jffs2_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .list = jffs2_user_listxattr, + .set = jffs2_user_setxattr, + .get = jffs2_user_getxattr +}; diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig new file mode 100644 index 00000000..57cef199 --- /dev/null +++ b/fs/jfs/Kconfig @@ -0,0 +1,50 @@ +config JFS_FS + tristate "JFS filesystem support" + select NLS + select CRC32 + help + This is a port of IBM's Journaled Filesystem . More information is + available in the file . + + If you do not intend to use the JFS filesystem, say N. + +config JFS_POSIX_ACL + bool "JFS POSIX Access Control Lists" + depends on JFS_FS + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config JFS_SECURITY + bool "JFS Security Labels" + depends on JFS_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jfs filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFS_DEBUG + bool "JFS debugging" + depends on JFS_FS + help + If you are experiencing any problems with the JFS filesystem, say + Y here. This will result in additional debugging messages to be + written to the system log. Under normal circumstances, this + results in very little overhead. + +config JFS_STATISTICS + bool "JFS statistics" + depends on JFS_FS + help + Enabling this option will cause statistics from the JFS file system + to be made available to the user in the /proc/fs/jfs/ directory. diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile new file mode 100644 index 00000000..a58fa72d --- /dev/null +++ b/fs/jfs/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for the Linux JFS filesystem routines. 
+# + +obj-$(CONFIG_JFS_FS) += jfs.o + +jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ + jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ + jfs_unicode.o jfs_dtree.o jfs_inode.o \ + jfs_extent.o symlink.o jfs_metapage.o \ + jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \ + resize.o xattr.o ioctl.o + +jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o + +ccflags-y := -D_JFS_4K diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c new file mode 100644 index 00000000..e5de9422 --- /dev/null +++ b/fs/jfs/acl.c @@ -0,0 +1,211 @@ +/* + * Copyright (C) International Business Machines Corp., 2002-2004 + * Copyright (C) Andreas Gruenbacher, 2001 + * Copyright (C) Linus Torvalds, 1991, 1992 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_txnmgr.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" + +static struct posix_acl *jfs_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *ea_name; + int size; + char *value = NULL; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch(type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + size = __jfs_getxattr(inode, ea_name, NULL, 0); + + if (size > 0) { + value = kmalloc(size, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = __jfs_getxattr(inode, ea_name, value, size); + } + + if (size < 0) { + if (size == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(size); + } else { + acl = posix_acl_from_xattr(value, size); + } + kfree(value); + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + return acl; +} + +static int jfs_set_acl(tid_t tid, struct inode *inode, int type, + struct posix_acl *acl) +{ + char *ea_name; + int rc; + int size = 0; + char *value = NULL; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch(type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + default: + return -EINVAL; + } + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + rc = posix_acl_to_xattr(acl, value, size); + if (rc < 0) + goto out; + } + rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0); +out: + kfree(value); + + if (!rc) + set_cached_acl(inode, type, acl); + + return rc; +} + +int jfs_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + struct posix_acl *clone; + mode_t mode; + int rc = 0; + + if (S_ISLNK(inode->i_mode)) + return 0; + + acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + if (S_ISDIR(inode->i_mode)) { + rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); + if (rc) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) { + rc = -ENOMEM; + goto cleanup; + } + mode = inode->i_mode; + rc = posix_acl_create_masq(clone, &mode); + if (rc >= 0) { + inode->i_mode = mode; + if (rc > 0) + rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, + clone); + } + posix_acl_release(clone); +cleanup: + posix_acl_release(acl); + } else + inode->i_mode &= ~current_umask(); + + JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | + inode->i_mode; + + return rc; +} + +int jfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int rc; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + rc = posix_acl_chmod_masq(clone, inode->i_mode); + if (!rc) { + tid_t tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, clone); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + } + + posix_acl_release(clone); + return rc; +} diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h new file mode 100644 index 00000000..fa92f7f1 --- /dev/null +++ b/fs/jfs/endian24.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) International Business Machines Corp., 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_ENDIAN24 +#define _H_ENDIAN24 + +/* + * endian24.h: + * + * Endian conversion for 24-byte data + * + */ +#define __swab24(x) \ +({ \ + __u32 __x = (x); \ + ((__u32)( \ + ((__x & (__u32)0x000000ffUL) << 16) | \ + (__x & (__u32)0x0000ff00UL) | \ + ((__x & (__u32)0x00ff0000UL) >> 16) )); \ +}) + +#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) + #define __cpu_to_le24(x) ((__u32)(x)) + #define __le24_to_cpu(x) ((__u32)(x)) +#else + #define __cpu_to_le24(x) __swab24(x) + #define __le24_to_cpu(x) __swab24(x) +#endif + +#ifdef __KERNEL__ + #define cpu_to_le24 __cpu_to_le24 + #define le24_to_cpu __le24_to_cpu +#endif + +#endif /* !_H_ENDIAN24 */ diff --git a/fs/jfs/file.c b/fs/jfs/file.c new file mode 100644 index 00000000..2f3f531f --- /dev/null +++ b/fs/jfs/file.c @@ -0,0 +1,154 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_dmap.h" +#include "jfs_txnmgr.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +int jfs_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + int rc = 0; + + if (!(inode->i_state & I_DIRTY) || + (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); + return rc; + } + + rc |= jfs_commit_inode(inode, 1); + + return rc ? -EIO : 0; +} + +static int jfs_open(struct inode *inode, struct file *file) +{ + int rc; + + if ((rc = dquot_file_open(inode, file))) + return rc; + + /* + * We attempt to allow only one "active" file open per aggregate + * group. Otherwise, appending to files in parallel can cause + * fragmentation within the files. + * + * If the file is empty, it was probably just created and going + * to be written to. If it has a size, we'll hold off until the + * file is actually grown. 
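+ * (The code below records this inode's allocation group in active_ag and
+ * bumps db_active[] for that group; jfs_release() drops the count again.)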
+ */ + if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE && + (inode->i_size == 0)) { + struct jfs_inode_info *ji = JFS_IP(inode); + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag == -1) { + struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); + ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); + atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); + } + spin_unlock_irq(&ji->ag_lock); + } + + return 0; +} +static int jfs_release(struct inode *inode, struct file *file) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag != -1) { + struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; + atomic_dec(&bmap->db_active[ji->active_ag]); + ji->active_ag = -1; + } + spin_unlock_irq(&ji->ag_lock); + + return 0; +} + +int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int rc; + + rc = inode_change_ok(inode, iattr); + if (rc) + return rc; + + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; + } + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + rc = vmtruncate(inode, iattr->ia_size); + if (rc) + return rc; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + if (iattr->ia_valid & ATTR_MODE) + rc = jfs_acl_chmod(inode); + return rc; +} + +const struct inode_operations jfs_file_inode_operations = { + .truncate = jfs_truncate, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, + .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL + .check_acl = jfs_check_acl, +#endif +}; + +const struct file_operations jfs_file_operations = { + .open = jfs_open, + .llseek = generic_file_llseek, + .write = do_sync_write, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .fsync = jfs_fsync, + .release = jfs_release, + .unlocked_ioctl = jfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = jfs_compat_ioctl, +#endif +}; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c new file mode 100644 index 00000000..10965590 --- /dev/null +++ b/fs/jfs/inode.c @@ -0,0 +1,414 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_extent.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + + +struct inode *jfs_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ret = diRead(inode); + if (ret < 0) { + iget_failed(inode); + return ERR_PTR(ret); + } + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &jfs_file_inode_operations; + inode->i_fop = &jfs_file_operations; + inode->i_mapping->a_ops = &jfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &jfs_dir_inode_operations; + inode->i_fop = &jfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (inode->i_size >= IDATASIZE) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &jfs_aops; + } else { + inode->i_op = &jfs_fast_symlink_inode_operations; + /* + * The inline data should be null-terminated, but + * don't let on-disk corruption crash the kernel + */ + JFS_IP(inode)->i_inline[inode->i_size] = '\0'; + } + } else { + inode->i_op = &jfs_file_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } + unlock_new_inode(inode); + return inode; +} + +/* + * Workhorse of both fsync & write_inode + */ +int jfs_commit_inode(struct inode *inode, int wait) +{ + int rc = 0; + tid_t tid; + static int noisy = 5; + + jfs_info("In jfs_commit_inode, inode = 0x%p", inode); + + /* + * Don't commit if inode has been committed since last being + * marked dirty, or if it has been deleted. + */ + if (inode->i_nlink == 0 || !test_cflag(COMMIT_Dirty, inode)) + return 0; + + if (isReadOnly(inode)) { + /* kernel allows writes to devices on read-only + * partitions and may think inode is dirty + */ + if (!special_file(inode->i_mode) && noisy) { + jfs_err("jfs_commit_inode(0x%p) called on " + "read-only volume", inode); + jfs_err("Is remount racy?"); + noisy--; + } + return 0; + } + + tid = txBegin(inode->i_sb, COMMIT_INODE); + mutex_lock(&JFS_IP(inode)->commit_mutex); + + /* + * Retest inode state after taking commit_mutex + */ + if (inode->i_nlink && test_cflag(COMMIT_Dirty, inode)) + rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0); + + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + +int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int wait = wbc->sync_mode == WB_SYNC_ALL; + + if (test_cflag(COMMIT_Nolink, inode)) + return 0; + /* + * If COMMIT_DIRTY is not set, the inode isn't really dirty. + * It has been committed since the last change, but was still + * on the dirty inode list. 
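+ * The jfs_flush_journal() call below only needs to push those earlier
+ * commits out to disk.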
+ */ + if (!test_cflag(COMMIT_Dirty, inode)) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); + return 0; + } + + if (jfs_commit_inode(inode, wait)) { + jfs_err("jfs_write_inode: jfs_commit_inode failed!"); + return -EIO; + } else + return 0; +} + +void jfs_evict_inode(struct inode *inode) +{ + jfs_info("In jfs_evict_inode, inode = 0x%p", inode); + + if (!inode->i_nlink && !is_bad_inode(inode)) { + dquot_initialize(inode); + + if (JFS_IP(inode)->fileset == FILESYSTEM_I) { + truncate_inode_pages(&inode->i_data, 0); + + if (test_cflag(COMMIT_Freewmap, inode)) + jfs_free_zero_link(inode); + + diFree(inode); + + /* + * Free the inode from the quota allocation. + */ + dquot_initialize(inode); + dquot_free_inode(inode); + } + } else { + truncate_inode_pages(&inode->i_data, 0); + } + end_writeback(inode); + dquot_drop(inode); +} + +void jfs_dirty_inode(struct inode *inode, int flags) +{ + static int noisy = 5; + + if (isReadOnly(inode)) { + if (!special_file(inode->i_mode) && noisy) { + /* kernel allows writes to devices on read-only + * partitions and may try to mark inode dirty + */ + jfs_err("jfs_dirty_inode called on read-only volume"); + jfs_err("Is remount racy?"); + noisy--; + } + return; + } + + set_cflag(COMMIT_Dirty, inode); +} + +int jfs_get_block(struct inode *ip, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + s64 lblock64 = lblock; + int rc = 0; + xad_t xad; + s64 xaddr; + int xflag; + s32 xlen = bh_result->b_size >> ip->i_blkbits; + + /* + * Take appropriate lock on inode + */ + if (create) + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + else + IREAD_LOCK(ip, RDWRLOCK_NORMAL); + + if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && + (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) && + xaddr) { + if (xflag & XAD_NOTRECORDED) { + if (!create) + /* + * Allocated but not recorded, read treats + * this as a hole + */ + goto unlock; +#ifdef _JFS_4K + XADoffset(&xad, lblock64); + XADlength(&xad, xlen); + XADaddress(&xad, xaddr); +#else /* _JFS_4K */ + /* + * As long as block size = 4K, this isn't a problem. + * We should mark the whole page not ABNR, but how + * will we know to mark the other blocks BH_New? 
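+ * The non-4K case is simply not implemented, hence the BUG() below.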
+ */ + BUG(); +#endif /* _JFS_4K */ + rc = extRecord(ip, &xad); + if (rc) + goto unlock; + set_buffer_new(bh_result); + } + + map_bh(bh_result, ip->i_sb, xaddr); + bh_result->b_size = xlen << ip->i_blkbits; + goto unlock; + } + if (!create) + goto unlock; + + /* + * Allocate a new block + */ +#ifdef _JFS_4K + if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad))) + goto unlock; + rc = extAlloc(ip, xlen, lblock64, &xad, false); + if (rc) + goto unlock; + + set_buffer_new(bh_result); + map_bh(bh_result, ip->i_sb, addressXAD(&xad)); + bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits; + +#else /* _JFS_4K */ + /* + * We need to do whatever it takes to keep all but the last buffers + * in 4K pages - see jfs_write.c + */ + BUG(); +#endif /* _JFS_4K */ + + unlock: + /* + * Release lock on inode + */ + if (create) + IWRITE_UNLOCK(ip); + else + IREAD_UNLOCK(ip); + return rc; +} + +static int jfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, jfs_get_block, wbc); +} + +static int jfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, jfs_get_block); +} + +static int jfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, jfs_get_block); +} + +static int jfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); +} + +static int jfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, + jfs_get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t jfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, jfs_get_block); +} + +static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, jfs_get_block, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return ret; +} + +const struct address_space_operations jfs_aops = { + .readpage = jfs_readpage, + .readpages = jfs_readpages, + .writepage = jfs_writepage, + .writepages = jfs_writepages, + .write_begin = jfs_write_begin, + .write_end = nobh_write_end, + .bmap = jfs_bmap, + .direct_IO = jfs_direct_IO, +}; + +/* + * Guts of jfs_truncate. Called with locks already held. Can be called + * with directory for truncating directory index table. + */ +void jfs_truncate_nolock(struct inode *ip, loff_t length) +{ + loff_t newsize; + tid_t tid; + + ASSERT(length >= 0); + + if (test_cflag(COMMIT_Nolink, ip)) { + xtTruncate(0, ip, length, COMMIT_WMAP); + return; + } + + do { + tid = txBegin(ip->i_sb, 0); + + /* + * The commit_mutex cannot be taken before txBegin. 
+ * txBegin may block and there is a chance the inode + * could be marked dirty and need to be committed + * before txBegin unblocks + */ + mutex_lock(&JFS_IP(ip)->commit_mutex); + + newsize = xtTruncate(tid, ip, length, + COMMIT_TRUNCATE | COMMIT_PWMAP); + if (newsize < 0) { + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + break; + } + + ip->i_mtime = ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(ip); + + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + } while (newsize > length); /* Truncate isn't always atomic */ +} + +void jfs_truncate(struct inode *ip) +{ + jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size); + + nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); + + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + jfs_truncate_nolock(ip, ip->i_size); + IWRITE_UNLOCK(ip); +} diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c new file mode 100644 index 00000000..6f98a186 --- /dev/null +++ b/fs/jfs/ioctl.c @@ -0,0 +1,148 @@ +/* + * linux/fs/jfs/ioctl.c + * + * Copyright (C) 2006 Herbert Poetzl + * adapted from Remy Card's ext2/ioctl.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "jfs_incore.h" +#include "jfs_dinode.h" +#include "jfs_inode.h" + + +static struct { + long jfs_flag; + long ext2_flag; +} jfs_map[] = { + {JFS_NOATIME_FL, FS_NOATIME_FL}, + {JFS_DIRSYNC_FL, FS_DIRSYNC_FL}, + {JFS_SYNC_FL, FS_SYNC_FL}, + {JFS_SECRM_FL, FS_SECRM_FL}, + {JFS_UNRM_FL, FS_UNRM_FL}, + {JFS_APPEND_FL, FS_APPEND_FL}, + {JFS_IMMUTABLE_FL, FS_IMMUTABLE_FL}, + {0, 0}, +}; + +static long jfs_map_ext2(unsigned long flags, int from) +{ + int index=0; + long mapped=0; + + while (jfs_map[index].jfs_flag) { + if (from) { + if (jfs_map[index].ext2_flag & flags) + mapped |= jfs_map[index].jfs_flag; + } else { + if (jfs_map[index].jfs_flag & flags) + mapped |= jfs_map[index].ext2_flag; + } + index++; + } + return mapped; +} + + +long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct jfs_inode_info *jfs_inode = JFS_IP(inode); + unsigned int flags; + + switch (cmd) { + case JFS_IOC_GETFLAGS: + jfs_get_inode_flags(jfs_inode); + flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; + flags = jfs_map_ext2(flags, 0); + return put_user(flags, (int __user *) arg); + case JFS_IOC_SETFLAGS: { + unsigned int oldflags; + int err; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto setflags_out; + } + if (get_user(flags, (int __user *) arg)) { + err = -EFAULT; + goto setflags_out; + } + + flags = jfs_map_ext2(flags, 1); + if (!S_ISDIR(inode->i_mode)) + flags &= ~JFS_DIRSYNC_FL; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } + + /* Lock against other parallel changes of flags */ + mutex_lock(&inode->i_mutex); + + jfs_get_inode_flags(jfs_inode); + oldflags = jfs_inode->mode2; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. 
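+ * (CAP_LINUX_IMMUTABLE, which is checked just below before either
+ * flag may be set or cleared.)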
+ */ + if ((oldflags & JFS_IMMUTABLE_FL) || + ((flags ^ oldflags) & + (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + err = -EPERM; + goto setflags_out; + } + } + + flags = flags & JFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~JFS_FL_USER_MODIFIABLE; + jfs_inode->mode2 = flags; + + jfs_set_inode_flags(inode); + mutex_unlock(&inode->i_mutex); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + /* While these ioctl numbers defined with 'long' and have different + * numbers than the 64bit ABI, + * the actual implementation only deals with ints and is compatible. + */ + switch (cmd) { + case JFS_IOC_GETFLAGS32: + cmd = JFS_IOC_GETFLAGS; + break; + case JFS_IOC_SETFLAGS32: + cmd = JFS_IOC_SETFLAGS; + break; + } + return jfs_ioctl(filp, cmd, arg); +} +#endif diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h new file mode 100644 index 00000000..f9285c49 --- /dev/null +++ b/fs/jfs/jfs_acl.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) International Business Machines Corp., 2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_ACL +#define _H_JFS_ACL + +#ifdef CONFIG_JFS_POSIX_ACL + +int jfs_check_acl(struct inode *, int, unsigned int flags); +int jfs_init_acl(tid_t, struct inode *, struct inode *); +int jfs_acl_chmod(struct inode *inode); + +#else + +static inline int jfs_init_acl(tid_t tid, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +static inline int jfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +#endif +#endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h new file mode 100644 index 00000000..79c61805 --- /dev/null +++ b/fs/jfs/jfs_btree.h @@ -0,0 +1,172 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_BTREE +#define _H_JFS_BTREE + +/* + * jfs_btree.h: B+-tree + * + * JFS B+-tree (dtree and xtree) common definitions + */ + +/* + * basic btree page - btpage + * +struct btpage { + s64 next; right sibling bn + s64 prev; left sibling bn + + u8 flag; + u8 rsrvd[7]; type specific + s64 self; self address + + u8 entry[4064]; +}; */ + +/* btpaget_t flag */ +#define BT_TYPE 0x07 /* B+-tree index */ +#define BT_ROOT 0x01 /* root page */ +#define BT_LEAF 0x02 /* leaf page */ +#define BT_INTERNAL 0x04 /* internal page */ +#define BT_RIGHTMOST 0x10 /* rightmost page */ +#define BT_LEFTMOST 0x20 /* leftmost page */ +#define BT_SWAPPED 0x80 /* used by fsck for endian swapping */ + +/* btorder (in inode) */ +#define BT_RANDOM 0x0000 +#define BT_SEQUENTIAL 0x0001 +#define BT_LOOKUP 0x0010 +#define BT_INSERT 0x0020 +#define BT_DELETE 0x0040 + +/* + * btree page buffer cache access + */ +#define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0) + +/* get page from buffer page */ +#define BT_PAGE(IP, MP, TYPE, ROOT)\ + (BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data) + +/* get the page buffer and the page for specified block address */ +#define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\ +{\ + if ((BN) == 0)\ + {\ + MP = (struct metapage *)&JFS_IP(IP)->bxflag;\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + RC = 0;\ + }\ + else\ + {\ + MP = read_metapage((IP), BN, SIZE, 1);\ + if (MP) {\ + RC = 0;\ + P = (MP)->data;\ + } else {\ + P = NULL;\ + jfs_err("bread failed!");\ + RC = -EIO;\ + }\ + }\ +} + +#define BT_MARK_DIRTY(MP, IP)\ +{\ + if (BT_IS_ROOT(MP))\ + mark_inode_dirty(IP);\ + else\ + mark_metapage_dirty(MP);\ +} + +/* put the page buffer */ +#define BT_PUTPAGE(MP)\ +{\ + if (! BT_IS_ROOT(MP)) \ + release_metapage(MP); \ +} + + +/* + * btree traversal stack + * + * record the path traversed during the search; + * top frame record the leaf page/entry selected. + */ +struct btframe { /* stack frame */ + s64 bn; /* 8: */ + s16 index; /* 2: */ + s16 lastindex; /* 2: unused */ + struct metapage *mp; /* 4/8: */ +}; /* (16/24) */ + +struct btstack { + struct btframe *top; + int nsplit; + struct btframe stack[MAXTREEHEIGHT]; +}; + +#define BT_CLR(btstack)\ + (btstack)->top = (btstack)->stack + +#define BT_STACK_FULL(btstack)\ + ( (btstack)->top == &((btstack)->stack[MAXTREEHEIGHT-1])) + +#define BT_PUSH(BTSTACK, BN, INDEX)\ +{\ + assert(!BT_STACK_FULL(BTSTACK));\ + (BTSTACK)->top->bn = BN;\ + (BTSTACK)->top->index = INDEX;\ + ++(BTSTACK)->top;\ +} + +#define BT_POP(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top ) + +#define BT_STACK(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top ) + +static inline void BT_STACK_DUMP(struct btstack *btstack) +{ + int i; + printk("btstack dump:\n"); + for (i = 0; i < MAXTREEHEIGHT; i++) + printk(KERN_ERR "bn = %Lx, index = %d\n", + (long long)btstack->stack[i].bn, + btstack->stack[i].index); +} + +/* retrieve search results */ +#define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\ +{\ + BN = (LEAF)->bn;\ + MP = (LEAF)->mp;\ + if (BN)\ + P = (TYPE *)MP->data;\ + else\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + INDEX = (LEAF)->index;\ +} + +/* put the page buffer of search */ +#define BT_PUTSEARCH(BTSTACK)\ +{\ + if (! 
BT_IS_ROOT((BTSTACK)->top->mp))\ + release_metapage((BTSTACK)->top->mp);\ +} +#endif /* _H_JFS_BTREE */ diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c new file mode 100644 index 00000000..dd824d9b --- /dev/null +++ b/fs/jfs/jfs_debug.c @@ -0,0 +1,109 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_debug.h" + +#ifdef PROC_FS_JFS /* see jfs_debug.h */ + +static struct proc_dir_entry *base; +#ifdef CONFIG_JFS_DEBUG +static int jfs_loglevel_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", jfsloglevel); + return 0; +} + +static int jfs_loglevel_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_loglevel_proc_show, NULL); +} + +static ssize_t jfs_loglevel_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + + if (get_user(c, buffer)) + return -EFAULT; + + /* yes, I know this is an ASCIIism. 
--hch */ + if (c < '0' || c > '9') + return -EINVAL; + jfsloglevel = c - '0'; + return count; +} + +static const struct file_operations jfs_loglevel_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_loglevel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = jfs_loglevel_proc_write, +}; +#endif + +static struct { + const char *name; + const struct file_operations *proc_fops; +} Entries[] = { +#ifdef CONFIG_JFS_STATISTICS + { "lmstats", &jfs_lmstats_proc_fops, }, + { "txstats", &jfs_txstats_proc_fops, }, + { "xtstat", &jfs_xtstat_proc_fops, }, + { "mpstat", &jfs_mpstat_proc_fops, }, +#endif +#ifdef CONFIG_JFS_DEBUG + { "TxAnchor", &jfs_txanchor_proc_fops, }, + { "loglevel", &jfs_loglevel_proc_fops } +#endif +}; +#define NPROCENT ARRAY_SIZE(Entries) + +void jfs_proc_init(void) +{ + int i; + + if (!(base = proc_mkdir("fs/jfs", NULL))) + return; + + for (i = 0; i < NPROCENT; i++) + proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); +} + +void jfs_proc_clean(void) +{ + int i; + + if (base) { + for (i = 0; i < NPROCENT; i++) + remove_proc_entry(Entries[i].name, base); + remove_proc_entry("fs/jfs", NULL); + } +} + +#endif /* PROC_FS_JFS */ diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h new file mode 100644 index 00000000..eafd1300 --- /dev/null +++ b/fs/jfs/jfs_debug.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DEBUG +#define _H_JFS_DEBUG + +/* + * jfs_debug.h + * + * global debug message, data structure/macro definitions + * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS; + */ + +/* + * Create /proc/fs/jfs if procfs is enabled andeither + * CONFIG_JFS_DEBUG or CONFIG_JFS_STATISTICS is defined + */ +#if defined(CONFIG_PROC_FS) && (defined(CONFIG_JFS_DEBUG) || defined(CONFIG_JFS_STATISTICS)) +#define PROC_FS_JFS +extern void jfs_proc_init(void); +extern void jfs_proc_clean(void); +#endif + +/* + * assert with traditional printf/panic + */ +#define assert(p) do { \ + if (!(p)) { \ + printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \ + __FILE__, __LINE__, #p); \ + BUG(); \ + } \ +} while (0) + +/* + * debug ON + * -------- + */ +#ifdef CONFIG_JFS_DEBUG +#define ASSERT(p) assert(p) + +/* printk verbosity */ +#define JFS_LOGLEVEL_ERR 1 +#define JFS_LOGLEVEL_WARN 2 +#define JFS_LOGLEVEL_DEBUG 3 +#define JFS_LOGLEVEL_INFO 4 + +extern int jfsloglevel; + +extern const struct file_operations jfs_txanchor_proc_fops; + +/* information message: e.g., configuration, major event */ +#define jfs_info(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_INFO) \ + printk(KERN_INFO fmt "\n", ## arg); \ +} while (0) + +/* debug message: ad hoc */ +#define jfs_debug(fmt, arg...) 
do { \ + if (jfsloglevel >= JFS_LOGLEVEL_DEBUG) \ + printk(KERN_DEBUG fmt "\n", ## arg); \ +} while (0) + +/* warn message: */ +#define jfs_warn(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_WARN) \ + printk(KERN_WARNING fmt "\n", ## arg); \ +} while (0) + +/* error event message: e.g., i/o error */ +#define jfs_err(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_ERR) \ + printk(KERN_ERR fmt "\n", ## arg); \ +} while (0) + +/* + * debug OFF + * --------- + */ +#else /* CONFIG_JFS_DEBUG */ +#define ASSERT(p) do {} while (0) +#define jfs_info(fmt, arg...) do {} while (0) +#define jfs_debug(fmt, arg...) do {} while (0) +#define jfs_warn(fmt, arg...) do {} while (0) +#define jfs_err(fmt, arg...) do {} while (0) +#endif /* CONFIG_JFS_DEBUG */ + +/* + * statistics + * ---------- + */ +#ifdef CONFIG_JFS_STATISTICS +extern const struct file_operations jfs_lmstats_proc_fops; +extern const struct file_operations jfs_txstats_proc_fops; +extern const struct file_operations jfs_mpstat_proc_fops; +extern const struct file_operations jfs_xtstat_proc_fops; + +#define INCREMENT(x) ((x)++) +#define DECREMENT(x) ((x)--) +#define HIGHWATERMARK(x,y) ((x) = max((x), (y))) +#else +#define INCREMENT(x) +#define DECREMENT(x) +#define HIGHWATERMARK(x,y) +#endif /* CONFIG_JFS_STATISTICS */ + +#endif /* _H_JFS_DEBUG */ diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h new file mode 100644 index 00000000..395c4c0d --- /dev/null +++ b/fs/jfs/jfs_dinode.h @@ -0,0 +1,176 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DINODE +#define _H_JFS_DINODE + +/* + * jfs_dinode.h: on-disk inode manager + */ + +#define INODESLOTSIZE 128 +#define L2INODESLOTSIZE 7 +#define log2INODESIZE 9 /* log2(bytes per dinode) */ + + +/* + * on-disk inode : 512 bytes + * + * note: align 64-bit fields on 8-byte boundary. + */ +struct dinode { + /* + * I. 
base area (128 bytes) + * ------------------------ + * + * define generic/POSIX attributes + */ + __le32 di_inostamp; /* 4: stamp to show inode belongs to fileset */ + __le32 di_fileset; /* 4: fileset number */ + __le32 di_number; /* 4: inode number, aka file serial number */ + __le32 di_gen; /* 4: inode generation number */ + + pxd_t di_ixpxd; /* 8: inode extent descriptor */ + + __le64 di_size; /* 8: size */ + __le64 di_nblocks; /* 8: number of blocks allocated */ + + __le32 di_nlink; /* 4: number of links to the object */ + + __le32 di_uid; /* 4: user id of owner */ + __le32 di_gid; /* 4: group id of owner */ + + __le32 di_mode; /* 4: attribute, format and permission */ + + struct timestruc_t di_atime; /* 8: time last data accessed */ + struct timestruc_t di_ctime; /* 8: time last status changed */ + struct timestruc_t di_mtime; /* 8: time last data modified */ + struct timestruc_t di_otime; /* 8: time created */ + + dxd_t di_acl; /* 16: acl descriptor */ + + dxd_t di_ea; /* 16: ea descriptor */ + + __le32 di_next_index; /* 4: Next available dir_table index */ + + __le32 di_acltype; /* 4: Type of ACL */ + + /* + * Extension Areas. + * + * Historically, the inode was partitioned into 4 128-byte areas, + * the last 3 being defined as unions which could have multiple + * uses. The first 96 bytes had been completely unused until + * an index table was added to the directory. It is now more + * useful to describe the last 3/4 of the inode as a single + * union. We would probably be better off redesigning the + * entire structure from scratch, but we don't want to break + * commonality with OS/2's JFS at this time. + */ + union { + struct { + /* + * This table contains the information needed to + * find a directory entry from a 32-bit index. + * If the index is small enough, the table is inline, + * otherwise, an x-tree root overlays this table + */ + struct dir_table_slot _table[12]; /* 96: inline */ + + dtroot_t _dtroot; /* 288: dtree root */ + } _dir; /* (384) */ +#define di_dirtable u._dir._table +#define di_dtroot u._dir._dtroot +#define di_parent di_dtroot.header.idotdot +#define di_DASD di_dtroot.header.DASD + + struct { + union { + u8 _data[96]; /* 96: unused */ + struct { + void *_imap; /* 4: unused */ + __le32 _gengen; /* 4: generator */ + } _imap; + } _u1; /* 96: */ +#define di_gengen u._file._u1._imap._gengen + + union { + xtpage_t _xtroot; + struct { + u8 unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + union { + __le32 _rdev; /* 4: */ + u8 _fastsymlink[128]; + } _u; + u8 _inlineea[128]; + } _special; + } _u2; + } _file; +#define di_xtroot u._file._u2._xtroot +#define di_dxd u._file._u2._special._dxd +#define di_btroot di_xtroot +#define di_inlinedata u._file._u2._special._u +#define di_rdev u._file._u2._special._u._rdev +#define di_fastsymlink u._file._u2._special._u._fastsymlink +#define di_inlineea u._file._u2._special._inlineea + } u; +}; + +/* extended mode bits (on-disk inode di_mode) */ +#define IFJOURNAL 0x00010000 /* journalled file */ +#define ISPARSE 0x00020000 /* sparse file enabled */ +#define INLINEEA 0x00040000 /* inline EA area free */ +#define ISWAPFILE 0x00800000 /* file open for pager swap space */ + +/* more extended mode bits: attributes for OS/2 */ +#define IREADONLY 0x02000000 /* no write access to file */ +#define IHIDDEN 0x04000000 /* hidden file */ +#define ISYSTEM 0x08000000 /* system file */ + +#define IDIRECTORY 0x20000000 /* directory (shadow of real bit) */ +#define IARCHIVE 0x40000000 /* file archive bit */ +#define INEWNAME 0x80000000 /* non-8.3 
filename format */ + +#define IRASH 0x4E000000 /* mask for changeable attributes */ +#define ATTRSHIFT 25 /* bits to shift to move attribute + specification to mode position */ + +/* extended attributes for Linux */ + +#define JFS_NOATIME_FL 0x00080000 /* do not update atime */ + +#define JFS_DIRSYNC_FL 0x00100000 /* dirsync behaviour */ +#define JFS_SYNC_FL 0x00200000 /* Synchronous updates */ +#define JFS_SECRM_FL 0x00400000 /* Secure deletion */ +#define JFS_UNRM_FL 0x00800000 /* allow for undelete */ + +#define JFS_APPEND_FL 0x01000000 /* writes to file may only append */ +#define JFS_IMMUTABLE_FL 0x02000000 /* Immutable file */ + +#define JFS_FL_USER_VISIBLE 0x03F80000 +#define JFS_FL_USER_MODIFIABLE 0x03F80000 +#define JFS_FL_INHERIT 0x03C80000 + +/* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */ +#define JFS_IOC_GETFLAGS _IOR('f', 1, long) +#define JFS_IOC_SETFLAGS _IOW('f', 2, long) + +#define JFS_IOC_GETFLAGS32 _IOR('f', 1, int) +#define JFS_IOC_SETFLAGS32 _IOW('f', 2, int) + +#endif /*_H_JFS_DINODE */ diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c new file mode 100644 index 00000000..4496872c --- /dev/null +++ b/fs/jfs/jfs_dmap.c @@ -0,0 +1,3992 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_lock.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +/* + * SERIALIZATION of the Block Allocation Map. + * + * the working state of the block allocation map is accessed in + * two directions: + * + * 1) allocation and free requests that start at the dmap + * level and move up through the dmap control pages (i.e. + * the vast majority of requests). + * + * 2) allocation requests that start at dmap control page + * level and work down towards the dmaps. + * + * the serialization scheme used here is as follows. + * + * requests which start at the bottom are serialized against each + * other through buffers and each requests holds onto its buffers + * as it works it way up from a single dmap to the required level + * of dmap control page. + * requests that start at the top are serialized against each other + * and request that start from the bottom by the multiple read/single + * write inode lock of the bmap inode. requests starting at the top + * take this lock in write mode while request starting at the bottom + * take the lock in read mode. a single top-down request may proceed + * exclusively while multiple bottoms-up requests may proceed + * simultaneously (under the protection of busy buffers). 
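+ *
+ * in short: requests that start at the dmaps take the bmap inode lock in
+ * read mode, requests that start at the dmap control pages take it in write
+ * mode; see the IREAD_LOCK/IWRITE_LOCK calls in dbAlloc() and dbFree().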
+ * + * in addition to information found in dmaps and dmap control pages, + * the working state of the block allocation map also includes read/ + * write information maintained in the bmap descriptor (i.e. total + * free block count, allocation group level free block counts). + * a single exclusive lock (BMAP_LOCK) is used to guard this information + * in the face of multiple-bottoms up requests. + * (lock ordering: IREAD_LOCK, BMAP_LOCK); + * + * accesses to the persistent state of the block allocation map (limited + * to the persistent bitmaps in dmaps) is guarded by (busy) buffers. + */ + +#define BMAP_LOCK_INIT(bmp) mutex_init(&bmp->db_bmaplock) +#define BMAP_LOCK(bmp) mutex_lock(&bmp->db_bmaplock) +#define BMAP_UNLOCK(bmp) mutex_unlock(&bmp->db_bmaplock) + +/* + * forward references + */ +static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); +static int dbBackSplit(dmtree_t * tp, int leafno); +static int dbJoin(dmtree_t * tp, int leafno, int newval); +static void dbAdjTree(dmtree_t * tp, int leafno, int newval); +static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, + int level); +static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); +static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks, + int l2nb, s64 * results); +static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks, + int l2nb, + s64 * results); +static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, + s64 * results); +static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, + s64 * results); +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); +static int dbFindBits(u32 word, int l2nb); +static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbMaxBud(u8 * cp); +s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +static int blkstol2(s64 nb); + +static int cntlz(u32 value); +static int cnttz(u32 word); + +static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks); +static int dbInitDmapTree(struct dmap * dp); +static int dbInitTree(struct dmaptree * dtp); +static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i); +static int dbGetL2AGSize(s64 nblocks); + +/* + * buddy table + * + * table used for determining buddy sizes within characters of + * dmap bitmap words. the characters themselves serve as indexes + * into the table, with the table elements yielding the maximum + * binary buddy of free bits within the character. 
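+ *
+ * e.g. budtab[0x00] = 3 (all eight bits free: the largest buddy covers 2^3
+ * blocks), budtab[0x0F] = 2 (an aligned run of four free bits: 2^2), and
+ * budtab[0xFF] = -1 (no free bits at all); a clear bit marks a free block.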
+ */ +static const s8 budtab[256] = { + 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 +}; + + +/* + * NAME: dbMount() + * + * FUNCTION: initializate the block allocation map. + * + * memory is allocated for the in-core bmap descriptor and + * the in-core descriptor is initialized from disk. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient memory + * -EIO - i/o error + */ +int dbMount(struct inode *ipbmap) +{ + struct bmap *bmp; + struct dbmap_disk *dbmp_le; + struct metapage *mp; + int i; + + /* + * allocate/initialize the in-memory bmap descriptor + */ + /* allocate memory for the in-memory bmap descriptor */ + bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL); + if (bmp == NULL) + return -ENOMEM; + + /* read the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + kfree(bmp); + return -EIO; + } + + /* copy the on-disk bmap descriptor to its in-memory version. */ + dbmp_le = (struct dbmap_disk *) mp->data; + bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); + bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); + bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); + bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); + bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); + bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); + bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); + bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); + bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); + bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); + bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + for (i = 0; i < MAXAG; i++) + bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); + bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); + bmp->db_maxfreebud = dbmp_le->dn_maxfreebud; + + /* release the buffer. */ + release_metapage(mp); + + /* bind the bmap inode and the bmap descriptor to each other. */ + bmp->db_ipbmap = ipbmap; + JFS_SBI(ipbmap->i_sb)->bmap = bmp; + + memset(bmp->db_active, 0, sizeof(bmp->db_active)); + + /* + * allocate/initialize the bmap lock + */ + BMAP_LOCK_INIT(bmp); + + return (0); +} + + +/* + * NAME: dbUnmount() + * + * FUNCTION: terminate the block allocation map in preparation for + * file system unmount. + * + * the in-core bmap descriptor is written to disk and + * the memory for this descriptor is freed. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. 
+ * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbUnmount(struct inode *ipbmap, int mounterror) +{ + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + if (!(mounterror || isReadOnly(ipbmap))) + dbSync(ipbmap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipbmap->i_mapping, 0); + + /* free the memory for the in-memory bmap. */ + kfree(bmp); + + return (0); +} + +/* + * dbSync() + */ +int dbSync(struct inode *ipbmap) +{ + struct dbmap_disk *dbmp_le; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + struct metapage *mp; + int i; + + /* + * write bmap global control page + */ + /* get the buffer for the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jfs_err("dbSync: read_metapage failed!"); + return -EIO; + } + /* copy the in-memory version of the bmap to the on-disk version */ + dbmp_le = (struct dbmap_disk *) mp->data; + dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize); + dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree); + dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage); + dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag); + dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel); + dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); + dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); + dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); + dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight); + dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); + dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); + dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); + for (i = 0; i < MAXAG; i++) + dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]); + dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize); + dbmp_le->dn_maxfreebud = bmp->db_maxfreebud; + + /* write the buffer */ + write_metapage(mp); + + /* + * write out dirty pages of bmap + */ + filemap_write_and_wait(ipbmap->i_mapping); + + diWriteSpecial(ipbmap, 0); + + return (0); +} + + +/* + * NAME: dbFree() + * + * FUNCTION: free the specified block range from the working block + * allocation map. + * + * the blocks will be free from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbFree(struct inode *ip, s64 blkno, s64 nblocks) +{ + struct metapage *mp; + struct dmap *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* block to be freed better be within the mapsize. */ + if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { + IREAD_UNLOCK(ipbmap); + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ip->i_sb, + "dbFree: block to be freed is outside the map"); + return -EIO; + } + + /* + * free the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the number of blocks to be freed from + * this dmap. 
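+ * (either all of the remaining blocks, or only those up to the
+ * end of this dmap, whichever is smaller.)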
+ */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + /* free the blocks. */ + if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { + jfs_error(ip->i_sb, "dbFree: error in block map\n"); + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +/* + * NAME: dbUpdatePMap() + * + * FUNCTION: update the allocation state (free or allocate) of the + * specified block range in the persistent block allocation map. + * + * the blocks will be updated in the persistent map one + * dmap at a time. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * free - 'true' if block range is to be freed from the persistent + * map; 'false' if it is to be allocated. + * blkno - starting block number of the range. + * nblocks - number of contiguous blocks in the range. + * tblk - transaction block; + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int +dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, struct tblock * tblk) +{ + int nblks, dbitno, wbitno, rbits; + int word, nbits, nwords; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + s64 lblkno, rem, lastlblkno; + u32 mask; + struct dmap *dp; + struct metapage *mp; + struct jfs_log *log; + int lsn, difft, diffp; + unsigned long flags; + + /* the blocks better be within the mapsize. */ + if (blkno + nblocks > bmp->db_mapsize) { + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ipbmap->i_sb, + "dbUpdatePMap: blocks are outside the map"); + return -EIO; + } + + /* compute delta of transaction lsn from log syncpt */ + lsn = tblk->lsn; + log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; + logdiff(difft, lsn, log); + + /* + * update the block state a dmap at a time. + */ + mp = NULL; + lastlblkno = 0; + for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) { + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + if (lblkno != lastlblkno) { + if (mp) { + write_metapage(mp); + } + + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, + 0); + if (mp == NULL) + return -EIO; + metapage_wait_for_io(mp); + } + dp = (struct dmap *) mp->data; + + /* determine the bit number and word within the dmap of + * the starting block. also determine how many blocks + * are to be updated within this dmap. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + nblks = min(rem, (s64)BPERDMAP - dbitno); + + /* update the bits of the dmap words. the first and last + * words may only have a subset of their bits updated. if + * this is the case, we'll work against that word (i.e. + * partial first and/or last) only in a single pass. a + * single pass will also be used to update all words that + * are to have all their bits updated. + */ + for (rbits = nblks; rbits > 0; + rbits -= nbits, dbitno += nbits) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nbits = min(rbits, DBWORD - wbitno); + + /* check if only part of the word is to be updated. */ + if (nbits < DBWORD) { + /* update (free or allocate) the bits + * in this word. + */ + mask = + (ONES << (DBWORD - nbits) >> wbitno); + if (free) + dp->pmap[word] &= + cpu_to_le32(~mask); + else + dp->pmap[word] |= + cpu_to_le32(mask); + + word += 1; + } else { + /* one or more words are to have all + * their bits updated. 
determine how + * many words and how many bits. + */ + nwords = rbits >> L2DBWORD; + nbits = nwords << L2DBWORD; + + /* update (free or allocate) the bits + * in these words. + */ + if (free) + memset(&dp->pmap[word], 0, + nwords * 4); + else + memset(&dp->pmap[word], (int) ONES, + nwords * 4); + + word += nwords; + } + } + + /* + * update dmap lsn + */ + if (lblkno == lastlblkno) + continue; + + lastlblkno = lblkno; + + LOGSYNC_LOCK(log, flags); + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + + /* move bp after tblock in logsync list */ + list_move(&mp->synclist, &tblk->synclist); + } + + /* inherit younger/larger clsn */ + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else { + mp->log = log; + mp->lsn = lsn; + + /* insert bp after tblock in logsync list */ + log->count++; + list_add(&mp->synclist, &tblk->synclist); + + mp->clsn = tblk->clsn; + } + LOGSYNC_UNLOCK(log, flags); + } + + /* write the last buffer. */ + if (mp) { + write_metapage(mp); + } + + return (0); +} + + +/* + * NAME: dbNextAG() + * + * FUNCTION: find the preferred allocation group for new allocations. + * + * Within the allocation groups, we maintain a preferred + * allocation group which consists of a group with at least + * average free space. It is the preferred group that we target + * new inode allocation towards. The tie-in between inode + * allocation and block allocation occurs as we allocate the + * first (data) block of an inode and specify the inode (block) + * as the allocation hint for this block. + * + * We try to avoid having more than one open file growing in + * an allocation group, as this will lead to fragmentation. + * This differs from the old OS/2 method of trying to keep + * empty ags around for large allocations. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * the preferred allocation group number. + */ +int dbNextAG(struct inode *ipbmap) +{ + s64 avgfree; + int agpref; + s64 hwm = 0; + int i; + int next_best = -1; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + BMAP_LOCK(bmp); + + /* determine the average number of free blocks within the ags. */ + avgfree = (u32)bmp->db_nfree / bmp->db_numag; + + /* + * if the current preferred ag does not have an active allocator + * and has at least average freespace, return it + */ + agpref = bmp->db_agpref; + if ((atomic_read(&bmp->db_active[agpref]) == 0) && + (bmp->db_agfree[agpref] >= avgfree)) + goto unlock; + + /* From the last preferred ag, find the next one with at least + * average free space. + */ + for (i = 0 ; i < bmp->db_numag; i++, agpref++) { + if (agpref == bmp->db_numag) + agpref = 0; + + if (atomic_read(&bmp->db_active[agpref])) + /* open file is currently growing in this ag */ + continue; + if (bmp->db_agfree[agpref] >= avgfree) { + /* Return this one */ + bmp->db_agpref = agpref; + goto unlock; + } else if (bmp->db_agfree[agpref] > hwm) { + /* Less than avg. freespace, but best so far */ + hwm = bmp->db_agfree[agpref]; + next_best = agpref; + } + } + + /* + * If no inactive ag was found with average freespace, use the + * next best + */ + if (next_best != -1) + bmp->db_agpref = next_best; + /* else leave db_agpref unchanged */ +unlock: + BMAP_UNLOCK(bmp); + + /* return the preferred group. 
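+ * (db_agpref is left as it was when no suitable candidate was
+ * found above.)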
+ */ + return (bmp->db_agpref); +} + +/* + * NAME: dbAlloc() + * + * FUNCTION: attempt to allocate a specified number of contiguous free + * blocks from the working allocation block map. + * + * the block allocation policy uses hints and a multi-step + * approach. + * + * for allocation requests smaller than the number of blocks + * per dmap, we first try to allocate the new blocks + * immediately following the hint. if these blocks are not + * available, we try to allocate blocks near the hint. if + * no blocks near the hint are available, we next try to + * allocate within the same dmap as contains the hint. + * + * if no blocks are available in the dmap or the allocation + * request is larger than the dmap size, we try to allocate + * within the same allocation group as contains the hint. if + * this does not succeed, we finally try to allocate anywhere + * within the aggregate. + * + * we also try to allocate anywhere within the aggregate for + * for allocation requests larger than the allocation group + * size or requests that specify no hint value. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * hint - allocation hint. + * nblocks - number of contiguous blocks in the range. + * results - on successful return, set to the starting block number + * of the newly allocated contiguous range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) +{ + int rc, agno; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp; + struct metapage *mp; + s64 lblkno, blkno; + struct dmap *dp; + int l2nb; + s64 mapSize; + int writers; + + /* assert that nblocks is valid */ + assert(nblocks > 0); + + /* get the log2 number of blocks to be allocated. + * if the number of blocks is not a log2 multiple, + * it will be rounded up to the next log2 multiple. + */ + l2nb = BLKSTOL2(nblocks); + + bmp = JFS_SBI(ip->i_sb)->bmap; + + mapSize = bmp->db_mapsize; + + /* the hint should be within the map */ + if (hint >= mapSize) { + jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); + return -EIO; + } + + /* if the number of blocks to be allocated is greater than the + * allocation group size, try to allocate anywhere. + */ + if (l2nb > bmp->db_agl2size) { + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + rc = dbAllocAny(bmp, nblocks, l2nb, results); + + goto write_unlock; + } + + /* + * If no hint, let dbNextAG recommend an allocation group + */ + if (hint == 0) + goto pref_ag; + + /* we would like to allocate close to the hint. adjust the + * hint to the block following the hint since the allocators + * will start looking for free space starting at this point. + */ + blkno = hint + 1; + + if (blkno >= bmp->db_mapsize) + goto pref_ag; + + agno = blkno >> bmp->db_agl2size; + + /* check if blkno crosses over into a new allocation group. + * if so, check if we should allow allocations within this + * allocation group. + */ + if ((blkno & (bmp->db_agsize - 1)) == 0) + /* check if the AG is currently being written to. + * if so, call dbNextAG() to find a non-busy + * AG with sufficient free space. + */ + if (atomic_read(&bmp->db_active[agno])) + goto pref_ag; + + /* check if the allocation request size can be satisfied from a + * single dmap. if so, try to allocate from the dmap containing + * the hint using a tiered strategy. + */ + if (nblocks <= BPERDMAP) { + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* get the buffer for the dmap containing the hint. 
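+		 * (BLKTODMAP converts the hint's block number to the logical
+		 * block number of the dmap page that describes it)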
+ */ + rc = -EIO; + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + goto read_unlock; + + dp = (struct dmap *) mp->data; + + /* first, try to satisfy the allocation request with the + * blocks beginning at the hint. + */ + if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks)) + != -ENOSPC) { + if (rc == 0) { + *results = blkno; + mark_metapage_dirty(mp); + } + + release_metapage(mp); + goto read_unlock; + } + + writers = atomic_read(&bmp->db_active[agno]); + if ((writers > 1) || + ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) { + /* + * Someone else is writing in this allocation + * group. To avoid fragmenting, try another ag + */ + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + goto pref_ag; + } + + /* next, try to satisfy the allocation request with blocks + * near the hint. + */ + if ((rc = + dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + goto read_unlock; + } + + /* try to satisfy the allocation request with blocks within + * the same dmap as the hint. + */ + if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + goto read_unlock; + } + + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + } + + /* try to satisfy the allocation request with blocks within + * the same allocation group as the hint. + */ + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC) + goto write_unlock; + + IWRITE_UNLOCK(ipbmap); + + + pref_ag: + /* + * Let dbNextAG recommend a preferred allocation group + */ + agno = dbNextAG(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* Try to allocate within this allocation group. if that fails, try to + * allocate anywhere in the map. + */ + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC) + rc = dbAllocAny(bmp, nblocks, l2nb, results); + + write_unlock: + IWRITE_UNLOCK(ipbmap); + + return (rc); + + read_unlock: + IREAD_UNLOCK(ipbmap); + + return (rc); +} + +#ifdef _NOTYET +/* + * NAME: dbAllocExact() + * + * FUNCTION: try to allocate the requested extent; + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - extent address; + * nblocks - extent length; + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) +{ + int rc; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct dmap *dp; + s64 lblkno; + struct metapage *mp; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* + * validate extent request: + * + * note: defragfs policy: + * max 64 blocks will be moved. + * allocation request size must be satisfied from a single dmap. 
+ */ + if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + return -EINVAL; + } + + if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { + /* the free space is no longer available */ + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* read in the dmap covering the extent */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* try to allocate the requested extent */ + rc = dbAllocNext(bmp, dp, blkno, nblocks); + + IREAD_UNLOCK(ipbmap); + + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + + return (rc); +} +#endif /* _NOTYET */ + +/* + * NAME: dbReAlloc() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. if these + * blocks are not available, this routine will attempt to + * allocate a new set of contiguous blocks large enough + * to cover the existing allocation plus the additional + * number of blocks required. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. + * results - on successful return, set to the starting block number + * of the existing allocation if the existing allocation + * was extended in place or to a newly allocated contiguous + * range if the existing allocation could not be extended + * in place. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int +dbReAlloc(struct inode *ip, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results) +{ + int rc; + + /* try to extend the allocation in place. + */ + if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) { + *results = blkno; + return (0); + } else { + if (rc != -ENOSPC) + return (rc); + } + + /* could not extend the allocation in place, so allocate a + * new set of blocks for the entire request (i.e. try to get + * a range of contiguous blocks large enough to cover the + * existing allocation plus the additional blocks.) + */ + return (dbAlloc + (ip, blkno + nblocks - 1, addnblocks + nblocks, results)); +} + + +/* + * NAME: dbExtend() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. 
+ * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 lblkno, lastblkno, extblkno; + uint rel_block; + struct metapage *mp; + struct dmap *dp; + int rc; + struct inode *ipbmap = sbi->ipbmap; + struct bmap *bmp; + + /* + * We don't want a non-aligned extent to cross a page boundary + */ + if (((rel_block = blkno & (sbi->nbperpage - 1))) && + (rel_block + nblocks + addnblocks > sbi->nbperpage)) + return -ENOSPC; + + /* get the last block of the current allocation */ + lastblkno = blkno + nblocks - 1; + + /* determine the block number of the block following + * the existing allocation. + */ + extblkno = lastblkno + 1; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* better be within the file system */ + bmp = sbi->bmap; + if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + jfs_error(ip->i_sb, + "dbExtend: the block is outside the filesystem"); + return -EIO; + } + + /* we'll attempt to extend the current allocation in place by + * allocating the additional blocks as the blocks immediately + * following the current allocation. we only try to extend the + * current allocation in place if the number of additional blocks + * can fit into a dmap, the last block of the current allocation + * is not the last block of the file system, and the start of the + * inplace extension is not on an allocation group boundary. + */ + if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize || + (extblkno & (bmp->db_agsize - 1)) == 0) { + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* get the buffer for the dmap containing the first block + * of the extension. + */ + lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + + dp = (struct dmap *) mp->data; + + /* try to allocate the blocks immediately following the + * current allocation. + */ + rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks); + + IREAD_UNLOCK(ipbmap); + + /* were we successful ? */ + if (rc == 0) + write_metapage(mp); + else + /* we were not successful */ + release_metapage(mp); + + + return (rc); +} + + +/* + * NAME: dbAllocNext() + * + * FUNCTION: attempt to allocate the blocks of the specified block + * range within a dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - starting block number of the range. + * nblocks - number of contiguous free blocks of the range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw; + int l2size; + s8 *leaf; + u32 mask; + + if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocNext: Corrupt dmap page"); + return -EIO; + } + + /* pick up a pointer to the leaves of the dmap tree. + */ + leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* check if the specified block range is contained within + * this dmap. 
+ */ + if (dbitno + nblocks > BPERDMAP) + return -ENOSPC; + + /* check if the starting leaf indicates that anything + * is free. + */ + if (leaf[word] == NOFREE) + return -ENOSPC; + + /* check the dmaps words corresponding to block range to see + * if the block range is free. not all bits of the first and + * last words may be contained within the block range. if this + * is the case, we'll work against those words (i.e. partial first + * and/or last) on an individual basis (a single pass) and examine + * the actual bits to determine if they are free. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the leaves of the dmap + * tree will be examined to determine if the blocks are free. a + * single leaf may describe the free space of multiple dmap + * words, so we may visit only a subset of the actual leaves + * corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of the word is to be examined. + */ + if (nb < DBWORD) { + /* check if the bits are free. + */ + mask = (ONES << (DBWORD - nb) >> wbitno); + if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask) + return -ENOSPC; + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and how many bits. + */ + nwords = rembits >> L2DBWORD; + nb = nwords << L2DBWORD; + + /* now examine the appropriate leaves to determine + * if the blocks are free. + */ + while (nwords > 0) { + /* does the leaf describe any free space ? + */ + if (leaf[word] < BUDMIN) + return -ENOSPC; + + /* determine the l2 number of bits provided + * by this leaf. + */ + l2size = + min((int)leaf[word], NLSTOL2BSZ(nwords)); + + /* determine how many words were handled. + */ + nw = BUDSIZE(l2size, BUDMIN); + + nwords -= nw; + word += nw; + } + } + } + + /* allocate the blocks. + */ + return (dbAllocDmap(bmp, dp, blkno, nblocks)); +} + + +/* + * NAME: dbAllocNear() + * + * FUNCTION: attempt to allocate a number of contiguous free blocks near + * a specified block (hint) within a dmap. + * + * starting with the dmap leaf that covers the hint, we'll + * check the next four contiguous leaves for sufficient free + * space. if sufficient free space is found, we'll allocate + * the desired free space. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - block number to allocate near. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocNear(struct bmap * bmp, + struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results) +{ + int word, lword, rc; + s8 *leaf; + + if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocNear: Corrupt dmap page"); + return -EIO; + } + + leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the word within the dmap that holds the hint + * (i.e. blkno). 
also, determine the last word in the dmap + * that we'll include in our examination. + */ + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + lword = min(word + 4, LPERDMAP); + + /* examine the leaves for sufficient free space. + */ + for (; word < lword; word++) { + /* does the leaf describe sufficient free space ? + */ + if (leaf[word] < l2nb) + continue; + + /* determine the block number within the file system + * of the first block described by this dmap word. + */ + blkno = le64_to_cpu(dp->start) + (word << L2DBWORD); + + /* if not all bits of the dmap word are free, get the + * starting bit number within the dmap word of the required + * string of free bits and adjust the block number with the + * value. + */ + if (leaf[word] < BUDMIN) + blkno += + dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb); + + /* allocate the blocks. + */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); + } + + return -ENOSPC; +} + + +/* + * NAME: dbAllocAG() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks within the specified allocation group. + * + * unless the allocation group size is equal to the number + * of blocks per dmap, the dmap control pages will be used to + * find the required free space, if available. we start the + * search at the highest dmap control page level which + * distinctly describes the allocation group's free space + * (i.e. the highest level at which the allocation group's + * free space is not mixed in with that of any other group). + * in addition, we start the search within this level at a + * height of the dmapctl dmtree at which the nodes distinctly + * describe the allocation group's free space. at this height, + * the allocation group's free space may be represented by 1 + * or two sub-trees, depending on the allocation group size. + * we search the top nodes of these subtrees left to right for + * sufficient free space. if sufficient free space is found, + * the subtree is searched to find the leftmost leaf that + * has free space. once we have made it to the leaf, we + * move the search to the next lower level dmap control page + * corresponding to this leaf. we continue down the dmap control + * pages until we find the dmap that contains or starts the + * sufficient free space and we allocate at this dmap. + * + * if the allocation group size is equal to the dmap size, + * we'll start at the dmap corresponding to the allocation + * group and attempt the allocation at this level. + * + * the dmap control page search is also not performed if the + * allocation group is completely free and we go to the first + * dmap of the allocation group to do the allocation. this is + * done because the allocation group may be part (not the first + * part) of a larger binary buddy system, causing the dmap + * control pages to indicate no free space (NOFREE) within + * the allocation group. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * agno - allocation group number. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. 
+ * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * note: IWRITE_LOCK(ipmap) held on entry/exit; + */ +static int +dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) +{ + struct metapage *mp; + struct dmapctl *dcp; + int rc, ti, i, k, m, n, agperlev; + s64 blkno, lblkno; + int budmin; + + /* allocation request should not be for more than the + * allocation group size. + */ + if (l2nb > bmp->db_agl2size) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: allocation request is larger than the " + "allocation group size"); + return -EIO; + } + + /* determine the starting block number of the allocation + * group. + */ + blkno = (s64) agno << bmp->db_agl2size; + + /* check if the allocation group size is the minimum allocation + * group size or if the allocation group is completely free. if + * the allocation group size is the minimum size of BPERDMAP (i.e. + * 1 dmap), there is no need to search the dmap control page (below) + * that fully describes the allocation group since the allocation + * group is already fully described by a dmap. in this case, we + * just call dbAllocCtl() to search the dmap tree and allocate the + * required space if available. + * + * if the allocation group is completely free, dbAllocCtl() is + * also called to allocate the required space. this is done for + * two reasons. first, it makes no sense searching the dmap control + * pages for free space when we know that free space exists. second, + * the dmap control pages may indicate that the allocation group + * has no free space if the allocation group is part (not the first + * part) of a larger binary buddy system. + */ + if (bmp->db_agsize == BPERDMAP + || bmp->db_agfree[agno] == bmp->db_agsize) { + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if ((rc == -ENOSPC) && + (bmp->db_agfree[agno] == bmp->db_agsize)) { + printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: dbAllocCtl failed in free AG"); + } + return (rc); + } + + /* the buffer for the dmap control page that fully describes the + * allocation group. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + budmin = dcp->budmin; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: Corrupt dmapctl page"); + release_metapage(mp); + return -EIO; + } + + /* search the subtree(s) of the dmap control page that describes + * the allocation group, looking for sufficient free space. to begin, + * determine how many allocation groups are represented in a dmap + * control page at the control page level (i.e. L0, L1, L2) that + * fully describes an allocation group. next, determine the starting + * tree index of this allocation group within the control page. + */ + agperlev = + (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth; + ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); + + /* dmap control page trees fan-out by 4 and a single allocation + * group may be described by 1 or 2 subtrees within the ag level + * dmap control page, depending upon the ag size. examine the ag's + * subtrees for sufficient free space, starting with the leftmost + * subtree. + */ + for (i = 0; i < bmp->db_agwidth; i++, ti++) { + /* is there sufficient free space ? 
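+		 * (each stree node records the maximum l2 free string described
+		 * by its subtree, so it can be compared directly against l2nb)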
+ */ + if (l2nb > dcp->stree[ti]) + continue; + + /* sufficient free space found in a subtree. now search down + * the subtree to find the leftmost leaf that describes this + * free space. + */ + for (k = bmp->db_agheight; k > 0; k--) { + for (n = 0, m = (ti << 2) + 1; n < 4; n++) { + if (l2nb <= dcp->stree[m + n]) { + ti = m + n; + break; + } + } + if (n == 4) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: failed descending stree"); + release_metapage(mp); + return -EIO; + } + } + + /* determine the block number within the file system + * that corresponds to this leaf. + */ + if (bmp->db_aglevel == 2) + blkno = 0; + else if (bmp->db_aglevel == 1) + blkno &= ~(MAXL1SIZE - 1); + else /* bmp->db_aglevel == 0 */ + blkno &= ~(MAXL0SIZE - 1); + + blkno += + ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin; + + /* release the buffer in preparation for going down + * the next level of dmap control pages. + */ + release_metapage(mp); + + /* check if we need to continue to search down the lower + * level dmap control pages. we need to if the number of + * blocks required is less than maximum number of blocks + * described at the next lower level. + */ + if (l2nb < budmin) { + + /* search the lower level dmap control pages to get + * the starting block number of the dmap that + * contains or starts off the free space. + */ + if ((rc = + dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1, + &blkno))) { + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: control page " + "inconsistent"); + return -EIO; + } + return (rc); + } + } + + /* allocate the blocks. + */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: unable to allocate blocks"); + rc = -EIO; + } + return (rc); + } + + /* no space in the allocation group. release the buffer and + * return -ENOSPC. + */ + release_metapage(mp); + + return -ENOSPC; +} + + +/* + * NAME: dbAllocAny() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks anywhere in the file system. + * + * dbAllocAny() attempts to find the sufficient free space by + * searching down the dmap control pages, starting with the + * highest level (i.e. L0, L1, L2) control page. if free space + * large enough to satisfy the desired free space is found, the + * desired free space is allocated. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) +{ + int rc; + s64 blkno = 0; + + /* starting with the top level dmap control page, search + * down the dmap control levels for sufficient free space. + * if free space is found, dbFindCtl() returns the starting + * block number of the dmap that contains or starts off the + * range of free space. + */ + if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno))) + return (rc); + + /* allocate the blocks. 
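+	 * (failure with -ENOSPC is unexpected here, since dbFindCtl() just
+	 * reported sufficient free space, and is treated as an error below)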
+ */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAny: unable to allocate blocks"); + return -EIO; + } + return (rc); +} + + +/* + * NAME: dbFindCtl() + * + * FUNCTION: starting at a specified dmap control page level and block + * number, search down the dmap control levels for a range of + * contiguous free blocks large enough to satisfy an allocation + * request for the specified number of free blocks. + * + * if sufficient contiguous free blocks are found, this routine + * returns the starting block number within a dmap page that + * contains or starts a range of contiqious free blocks that + * is sufficient in size. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * level - starting dmap control page level. + * l2nb - log2 number of contiguous free blocks desired. + * *blkno - on entry, starting block number for conducting the search. + * on successful return, the first block within a dmap page + * that contains or starts a range of contiguous free blocks. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) +{ + int rc, leafidx, lev; + s64 b, lblkno; + struct dmapctl *dcp; + int budmin; + struct metapage *mp; + + /* starting at the specified dmap control page level and block + * number, search down the dmap control levels for the starting + * block number of a dmap page that contains or starts off + * sufficient free blocks. + */ + for (lev = level, b = *blkno; lev >= 0; lev--) { + /* get the buffer of the dmap control page for the block + * number and level (i.e. L0, L1, L2). + */ + lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + budmin = dcp->budmin; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbFindCtl: Corrupt dmapctl page"); + release_metapage(mp); + return -EIO; + } + + /* search the tree within the dmap control page for + * sufficient free space. if sufficient free space is found, + * dbFindLeaf() returns the index of the leaf at which + * free space was found. + */ + rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx); + + /* release the buffer. + */ + release_metapage(mp); + + /* space found ? + */ + if (rc) { + if (lev != level) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbFindCtl: dmap inconsistent"); + return -EIO; + } + return -ENOSPC; + } + + /* adjust the block number to reflect the location within + * the dmap control page (i.e. the leaf) at which free + * space was found. + */ + b += (((s64) leafidx) << budmin); + + /* we stop the search at this dmap control page level if + * the number of blocks required is greater than or equal + * to the maximum number of blocks described at the next + * (lower) level. + */ + if (l2nb >= budmin) + break; + } + + *blkno = b; + return (0); +} + + +/* + * NAME: dbAllocCtl() + * + * FUNCTION: attempt to allocate a specified number of contiguous + * blocks starting within a specific dmap. + * + * this routine is called by higher level routines that search + * the dmap control pages above the actual dmaps for contiguous + * free space. 
the result of successful searches by these + * routines are the starting block numbers within dmaps, with + * the dmaps themselves containing the desired contiguous free + * space or starting a contiguous free space of desired size + * that is made up of the blocks of one or more dmaps. these + * calls should not fail due to insufficent resources. + * + * this routine is called in some cases where it is not known + * whether it will fail due to insufficient resources. more + * specifically, this occurs when allocating from an allocation + * group whose size is equal to the number of blocks per dmap. + * in this case, the dmap control pages are not examined prior + * to calling this routine (to save pathlength) and the call + * might fail. + * + * for a request size that fits within a dmap, this routine relies + * upon the dmap's dmtree to find the requested contiguous free + * space. for request sizes that are larger than a dmap, the + * requested free space will start at the first block of the + * first dmap (i.e. blkno). + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks to allocate. + * l2nb - log2 number of contiguous free blocks to allocate. + * blkno - starting block number of the dmap to start the allocation + * from. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) +{ + int rc, nb; + s64 b, lblkno, n; + struct metapage *mp; + struct dmap *dp; + + /* check if the allocation request is confined to a single dmap. + */ + if (l2nb <= L2BPERDMAP) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dp = (struct dmap *) mp->data; + + /* try to allocate the blocks. + */ + rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results); + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + + return (rc); + } + + /* allocation request involving multiple dmaps. it must start on + * a dmap boundary. + */ + assert((blkno & (BPERDMAP - 1)) == 0); + + /* allocate the blocks dmap by dmap. + */ + for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + rc = -EIO; + goto backout; + } + dp = (struct dmap *) mp->data; + + /* the dmap better be all free. + */ + if (dp->tree.stree[ROOT] != L2BPERDMAP) { + release_metapage(mp); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl: the dmap is not all free"); + rc = -EIO; + goto backout; + } + + /* determine how many blocks to allocate from this dmap. + */ + nb = min(n, (s64)BPERDMAP); + + /* allocate the blocks from the dmap. + */ + if ((rc = dbAllocDmap(bmp, dp, b, nb))) { + release_metapage(mp); + goto backout; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + /* set the results (starting block number) and return. + */ + *results = blkno; + return (0); + + /* something failed in handling an allocation request involving + * multiple dmaps. we'll try to clean up by backing out any + * allocation that has already happened for this request. 
if + * we fail in backing out the allocation, we'll mark the file + * system to indicate that blocks have been leaked. + */ + backout: + + /* try to backout the allocations dmap by dmap. + */ + for (n = nblocks - n, b = blkno; n > 0; + n -= BPERDMAP, b += BPERDMAP) { + /* get the buffer for this dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl: I/O Error: Block Leakage."); + continue; + } + dp = (struct dmap *) mp->data; + + /* free the blocks is this dmap. + */ + if (dbFreeDmap(bmp, dp, b, BPERDMAP)) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + release_metapage(mp); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl: Block Leakage."); + continue; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + return (rc); +} + + +/* + * NAME: dbAllocDmapLev() + * + * FUNCTION: attempt to allocate a specified number of contiguous blocks + * from a specified dmap. + * + * this routine checks if the contiguous blocks are available. + * if so, nblocks of blocks are allocated; otherwise, ENOSPC is + * returned. + * + * PARAMETERS: + * mp - pointer to bmap descriptor + * dp - pointer to dmap to attempt to allocate blocks from. + * l2nb - log2 number of contiguous block desired. + * nblocks - actual number of contiguous block desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or + * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; + */ +static int +dbAllocDmapLev(struct bmap * bmp, + struct dmap * dp, int nblocks, int l2nb, s64 * results) +{ + s64 blkno; + int leafidx, rc; + + /* can't be more than a dmaps worth of blocks */ + assert(l2nb <= L2BPERDMAP); + + /* search the tree within the dmap page for sufficient + * free space. if sufficient free space is found, dbFindLeaf() + * returns the index of the leaf at which free space was found. + */ + if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) + return -ENOSPC; + + /* determine the block number within the file system corresponding + * to the leaf at which free space was found. + */ + blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD); + + /* if not all bits of the dmap word are free, get the starting + * bit number within the dmap word of the required string of free + * bits and adjust the block number with this value. + */ + if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN) + blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb); + + /* allocate the blocks */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); +} + + +/* + * NAME: dbAllocDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine allocates the specified blocks from the dmap + * through a call to dbAllocBits(). if the allocation of the + * block range causes the maximum string of free blocks within + * the dmap to change (i.e. 
the value of the root of the dmap's + * dmtree), this routine will cause this change to be reflected + * up through the appropriate levels of the dmap control pages + * by a call to dbAdjCtl() for the L0 dmap control page that + * covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate the block range from. + * blkno - starting block number of the block to be allocated. + * nblocks - number of blocks to be allocated. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + s8 oldroot; + int rc; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* allocate the specified (blocks) bits */ + dbAllocBits(bmp, dp, blkno, nblocks); + + /* if the root has not changed, done. */ + if (dp->tree.stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbFreeDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine frees the specified blocks from the dmap through + * a call to dbFreeBits(). if the deallocation of the block range + * causes the maximum string of free blocks within the dmap to + * change (i.e. the value of the root of the dmap's dmtree), this + * routine will cause this change to be reflected up through the + * appropriate levels of the dmap control pages by a call to + * dbAdjCtl() for the L0 dmap control page that covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free the block range from. + * blkno - starting block number of the block to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + s8 oldroot; + int rc = 0, word; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* free the specified (blocks) bits */ + rc = dbFreeBits(bmp, dp, blkno, nblocks); + + /* if error or the root has not changed, done. */ + if (rc || (dp->tree.stree[ROOT] == oldroot)) + return (rc); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the deallocation. + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) { + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + + /* as part of backing out the deallocation, we will have + * to back split the dmap tree if the deallocation caused + * the freed blocks to become part of a larger binary buddy + * system. + */ + if (dp->tree.stree[word] == NOFREE) + dbBackSplit((dmtree_t *) & dp->tree, word); + + dbAllocBits(bmp, dp, blkno, nblocks); + } + + return (rc); +} + + +/* + * NAME: dbAllocBits() + * + * FUNCTION: allocate a specified block range from a dmap. 
+ * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits allocated. it also causes the + * dmap's dmtree, as a whole, to reflect the allocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate bits from. + * blkno - starting block number of the bits to be allocated. + * nblocks - number of bits to be allocated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int size; + s8 *leaf; + + /* pick up a pointer to the leaves of the dmap tree */ + leaf = dp->tree.stree + LEAFIND; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + /* update the leaf for this dmap word. in addition + * to setting the leaf value to the binary buddy max + * of the updated dmap word, dbSplit() will split + * the binary system of the leaves if need be. + */ + dbSplit(tp, word, BUDMIN, + dbMaxBud((u8 *) & dp->wmap[word])); + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the allocated words. 
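+			 * (a single leaf may describe several dmap words, so each
+			 * pass below handles the BUDSIZE(size, BUDMIN) words
+			 * covered by one leaf)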
+ */ + for (; nwords > 0; nwords -= nw) { + if (leaf[word] < BUDMIN) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocBits: leaf page " + "corrupt"); + break; + } + + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being allocated and the l2 number + * of bits currently described by this leaf. + */ + size = min((int)leaf[word], NLSTOL2BSZ(nwords)); + + /* update the leaf to reflect the allocation. + * in addition to setting the leaf value to + * NOFREE, dbSplit() will split the binary + * system of the leaves to reflect the current + * allocation (size). + */ + dbSplit(tp, word, size, NOFREE); + + /* get the number of dmap words handled */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap */ + le32_add_cpu(&dp->nfree, -nblocks); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the maximum allocation group number if this allocation + * group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); +} + + +/* + * NAME: dbFreeBits() + * + * FUNCTION: free a specified block range from a dmap. + * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits freed. it also causes the dmap's + * dmtree, as a whole, to reflect the deallocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free bits from. + * blkno - starting block number of the bits to be freed. + * nblocks - number of bits to be freed. + * + * RETURN VALUES: 0 for success + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int rc = 0; + int size; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap. + */ + assert(dbitno + nblocks <= BPERDMAP); + + /* free the bits of the dmaps words corresponding to the block range. + * not all bits of the first and last words may be contained within + * the block range. if this is the case, we'll work against those + * words (i.e. partial first and/or last) on an individual basis + * (a single pass), freeing the bits of interest by hand and updating + * the leaf corresponding to the dmap word. a single pass will be used + * for all dmap words fully contained within the specified range. + * within this pass, the bits of all fully contained dmap words will + * be marked as free in a single shot and the leaves will be updated. a + * single leaf may describe the free space of multiple dmap words, + * so we may update only a subset of the actual leaves corresponding + * to the dmap words of the block range. + * + * dbJoin() is used to update leaf values and will join the binary + * buddy system of the leaves if the new leaf values indicate this + * should be done. 
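+	 * (dbJoin() is the deallocation counterpart of the dbSplit() calls
+	 * made by dbAllocBits() on the allocation path)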
+ */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be freed. + */ + if (nb < DBWORD) { + /* free (zero) the appropriate bits within this + * dmap word. + */ + dp->wmap[word] &= + cpu_to_le32(~(ONES << (DBWORD - nb) + >> wbitno)); + + /* update the leaf for this dmap word. + */ + rc = dbJoin(tp, word, + dbMaxBud((u8 *) & dp->wmap[word])); + if (rc) + return rc; + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and free (zero) the bits of these words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], 0, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the freed words. + */ + for (; nwords > 0; nwords -= nw) { + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being freed and the l2 (max) number + * of bits that can be described by this leaf. + */ + size = + min(LITOL2BSZ + (word, L2LPERDMAP, BUDMIN), + NLSTOL2BSZ(nwords)); + + /* update the leaf. + */ + rc = dbJoin(tp, word, size); + if (rc) + return rc; + + /* get the number of dmap words handled. + */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap. + */ + le32_add_cpu(&dp->nfree, nblocks); + + BMAP_LOCK(bmp); + + /* update the free count for the allocation group and + * map. + */ + agno = blkno >> bmp->db_agl2size; + bmp->db_nfree += nblocks; + bmp->db_agfree[agno] += nblocks; + + /* check if this allocation group is not completely free and + * if it is currently the maximum (rightmost) allocation group. + * if so, establish the new maximum allocation group number by + * searching left for the first allocation group with allocation. + */ + if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) || + (agno == bmp->db_numag - 1 && + bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) { + while (bmp->db_maxag > 0) { + bmp->db_maxag -= 1; + if (bmp->db_agfree[bmp->db_maxag] != + bmp->db_agsize) + break; + } + + /* re-establish the allocation group preference if the + * current preference is right of the maximum allocation + * group. + */ + if (bmp->db_agpref > bmp->db_maxag) + bmp->db_agpref = bmp->db_maxag; + } + + BMAP_UNLOCK(bmp); + + return 0; +} + + +/* + * NAME: dbAdjCtl() + * + * FUNCTION: adjust a dmap control page at a specified level to reflect + * the change in a lower level dmap or dmap control page's + * maximum string of free blocks (i.e. a change in the root + * of the lower level object's dmtree) due to the allocation + * or deallocation of a range of blocks with a single dmap. + * + * on entry, this routine is provided with the new value of + * the lower level dmap or dmap control page root and the + * starting block number of the block range whose allocation + * or deallocation resulted in the root change. this range + * is respresented by a single leaf of the current dmapctl + * and the leaf will be updated with this value, possibly + * causing a binary buddy system within the leaves to be + * split or joined. the update may also cause the dmapctl's + * dmtree to be updated. 
+ * + * if the adjustment of the dmap control page, itself, causes its + * root to change, this change will be bubbled up to the next dmap + * control level by a recursive call to this routine, specifying + * the new root value and the next dmap control page level to + * be adjusted. + * PARAMETERS: + * bmp - pointer to bmap descriptor + * blkno - the first block of a block range within a dmap. it is + * the allocation or deallocation of this block range that + * requires the dmap control page to be adjusted. + * newval - the new value of the lower level dmap or dmap control + * page root. + * alloc - 'true' if adjustment is due to an allocation. + * level - current level of dmap control page (i.e. L0, L1, L2) to + * be adjusted. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) +{ + struct metapage *mp; + s8 oldroot; + int oldval; + s64 lblkno; + struct dmapctl *dcp; + int rc, leafno, ti; + + /* get the buffer for the dmap control page for the specified + * block number and control page level. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAdjCtl: Corrupt dmapctl page"); + release_metapage(mp); + return -EIO; + } + + /* determine the leaf number corresponding to the block and + * the index within the dmap control tree. + */ + leafno = BLKTOCTLLEAF(blkno, dcp->budmin); + ti = leafno + le32_to_cpu(dcp->leafidx); + + /* save the current leaf value and the current root level (i.e. + * maximum l2 free string described by this dmapctl). + */ + oldval = dcp->stree[ti]; + oldroot = dcp->stree[ROOT]; + + /* check if this is a control page update for an allocation. + * if so, update the leaf to reflect the new leaf value using + * dbSplit(); otherwise (deallocation), use dbJoin() to update + * the leaf with the new value. in addition to updating the + * leaf, dbSplit() will also split the binary buddy system of + * the leaves, if required, and bubble new values within the + * dmapctl tree, if required. similarly, dbJoin() will join + * the binary buddy system of leaves and bubble new values up + * the dmapctl tree as required by the new leaf value. + */ + if (alloc) { + /* check if we are in the middle of a binary buddy + * system. this happens when we are performing the + * first allocation out of an allocation group that + * is part (not the first part) of a larger binary + * buddy system. if we are in the middle, back split + * the system prior to calling dbSplit() which assumes + * that it is at the front of a binary buddy system. + */ + if (oldval == NOFREE) { + rc = dbBackSplit((dmtree_t *) dcp, leafno); + if (rc) + return rc; + oldval = dcp->stree[ti]; + } + dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); + } else { + rc = dbJoin((dmtree_t *) dcp, leafno, newval); + if (rc) + return rc; + } + + /* check if the root of the current dmap control page changed due + * to the update and if the current dmap control page is not at + * the current top level (i.e. L0, L1, L2) of the map. if so (i.e. 
+ * root changed and this is not the top level), call this routine + * again (recursion) for the next higher level of the mapping to + * reflect the change in root for the current dmap control page. + */ + if (dcp->stree[ROOT] != oldroot) { + /* are we below the top level of the map. if so, + * bubble the root up to the next higher level. + */ + if (level < bmp->db_maxlevel) { + /* bubble up the new root of this dmap control page to + * the next level. + */ + if ((rc = + dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc, + level + 1))) { + /* something went wrong in bubbling up the new + * root value, so backout the changes to the + * current dmap control page. + */ + if (alloc) { + dbJoin((dmtree_t *) dcp, leafno, + oldval); + } else { + /* the dbJoin() above might have + * caused a larger binary buddy system + * to form and we may now be in the + * middle of it. if this is the case, + * back split the buddies. + */ + if (dcp->stree[ti] == NOFREE) + dbBackSplit((dmtree_t *) + dcp, leafno); + dbSplit((dmtree_t *) dcp, leafno, + dcp->budmin, oldval); + } + + /* release the buffer and return the error. + */ + release_metapage(mp); + return (rc); + } + } else { + /* we're at the top level of the map. update + * the bmap control page to reflect the size + * of the maximum free buddy system. + */ + assert(level == bmp->db_maxlevel); + if (bmp->db_maxfreebud != oldroot) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAdjCtl: the maximum free buddy is " + "not the old root"); + } + bmp->db_maxfreebud = dcp->stree[ROOT]; + } + } + + /* write the buffer. + */ + write_metapage(mp); + + return (0); +} + + +/* + * NAME: dbSplit() + * + * FUNCTION: update the leaf of a dmtree with a new value, splitting + * the leaf from the binary buddy system of the dmtree's + * leaves, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * splitsz - the size the binary buddy system starting at the leaf + * must be split to, specified as the log2 number of blocks. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) +{ + int budsz; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* check if the leaf needs to be split. + */ + if (leaf[leafno] > tp->dmt_budmin) { + /* the split occurs by cutting the buddy system in half + * at the specified leaf until we reach the specified + * size. pick up the starting split size (current size + * - 1 in l2) and the corresponding buddy size. + */ + cursz = leaf[leafno] - 1; + budsz = BUDSIZE(cursz, tp->dmt_budmin); + + /* split until we reach the specified size. + */ + while (cursz >= splitsz) { + /* update the buddy's leaf with its new value. + */ + dbAdjTree(tp, leafno ^ budsz, cursz); + + /* on to the next size and buddy. + */ + cursz -= 1; + budsz >>= 1; + } + } + + /* adjust the dmap tree to reflect the specified leaf's new + * value. + */ + dbAdjTree(tp, leafno, newval); +} + + +/* + * NAME: dbBackSplit() + * + * FUNCTION: back split the binary buddy system of dmtree leaves + * that hold a specified leaf until the specified leaf + * starts its own binary buddy system. + * + * the allocators typically perform allocations at the start + * of binary buddy systems and dbSplit() is used to accomplish + * any required splits. 
in some cases, however, allocation + * may occur in the middle of a binary system and requires a + * back split, with the split proceeding out from the middle of + * the system (less efficient) rather than the start of the + * system (more efficient). the cases in which a back split + * is required are rare and are limited to the first allocation + * within an allocation group which is a part (not first part) + * of a larger binary buddy system and a few exception cases + * in which a previous join operation must be backed out. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbBackSplit(dmtree_t * tp, int leafno) +{ + int budsz, bud, w, bsz, size; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* leaf should be part (not first part) of a binary + * buddy system. + */ + assert(leaf[leafno] == NOFREE); + + /* the back split is accomplished by iteratively finding the leaf + * that starts the buddy system that contains the specified leaf and + * splitting that system in two. this iteration continues until + * the specified leaf becomes the start of a buddy system. + * + * determine maximum possible l2 size for the specified leaf. + */ + size = + LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs), + tp->dmt_budmin); + + /* determine the number of leaves covered by this size. this + * is the buddy size that we will start with as we search for + * the buddy system that contains the specified leaf. + */ + budsz = BUDSIZE(size, tp->dmt_budmin); + + /* back split. + */ + while (leaf[leafno] == NOFREE) { + /* find the leftmost buddy leaf. + */ + for (w = leafno, bsz = budsz;; bsz <<= 1, + w = (w < bud) ? w : bud) { + if (bsz >= le32_to_cpu(tp->dmt_nleafs)) { + jfs_err("JFS: block map error in dbBackSplit"); + return -EIO; + } + + /* determine the buddy. + */ + bud = w ^ bsz; + + /* check if this buddy is the start of the system. + */ + if (leaf[bud] != NOFREE) { + /* split the leaf at the start of the + * system in two. + */ + cursz = leaf[bud] - 1; + dbSplit(tp, bud, cursz, cursz); + break; + } + } + } + + if (leaf[leafno] != size) { + jfs_err("JFS: wrong leaf value in dbBackSplit"); + return -EIO; + } + return 0; +} + + +/* + * NAME: dbJoin() + * + * FUNCTION: update the leaf of a dmtree with a new value, joining + * the leaf with other leaves of the dmtree into a multi-leaf + * binary buddy system, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static int dbJoin(dmtree_t * tp, int leafno, int newval) +{ + int budsz, buddy; + s8 *leaf; + + /* can the new leaf value require a join with other leaves ? + */ + if (newval >= tp->dmt_budmin) { + /* pickup a pointer to the leaves of the tree. + */ + leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* try to join the specified leaf into a large binary + * buddy system. the join proceeds by attempting to join + * the specified leafno with its buddy (leaf) at new value. + * if the join occurs, we attempt to join the left leaf + * of the joined buddies with its buddy at new value + 1. 
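[Editor's aside] The join direction is the mirror image of the split: the buddy at each size is again `leafno ^ budsz`, only the left buddy survives, and its value grows by one per round. A minimal sketch with invented names and toy sizes; the corruption check that the real dbJoin() performs when the buddy value is unexpectedly small is reduced to a plain break here.

#include <stdio.h>

#define BUDMIN  5
#define NOFREE  (-1)
#define NLEAFS  4
#define BUDSIZE(s, m)  (1 << ((s) - (m)))

static void join(signed char *leaf, int leafno, int newval)
{
        int budsz, buddy;

        if (newval >= BUDMIN) {
                budsz = BUDSIZE(newval, BUDMIN);
                while (budsz < NLEAFS) {
                        buddy = leafno ^ budsz;
                        if (newval != leaf[buddy])      /* buddy not free at  */
                                break;                  /* the same size      */
                        if (leafno < buddy) {
                                leaf[buddy] = NOFREE;   /* left claims all    */
                        } else {
                                leaf[leafno] = NOFREE;
                                leafno = buddy;
                        }
                        newval += 1;
                        budsz <<= 1;
                }
        }
        leaf[leafno] = newval;
}

int main(void)
{
        /* state left behind by the split example: word 0 allocated */
        signed char leaf[NLEAFS] = { NOFREE, BUDMIN, BUDMIN + 1, NOFREE };

        join(leaf, 0, BUDMIN);          /* free the first word again */
        for (int i = 0; i < NLEAFS; i++)
                printf("leaf[%d] = %d\n", i, leaf[i]);
        /* prints 7 -1 -1 -1: all four words form one free buddy system */
        return 0;
}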
+ * we continue to join until we find a buddy that cannot be + * joined (does not have a value equal to the size of the + * last join) or until all leaves have been joined into a + * single system. + * + * get the buddy size (number of words covered) of + * the new value. + */ + budsz = BUDSIZE(newval, tp->dmt_budmin); + + /* try to join. + */ + while (budsz < le32_to_cpu(tp->dmt_nleafs)) { + /* get the buddy leaf. + */ + buddy = leafno ^ budsz; + + /* if the leaf's new value is greater than its + * buddy's value, we join no more. + */ + if (newval > leaf[buddy]) + break; + + /* It shouldn't be less */ + if (newval < leaf[buddy]) + return -EIO; + + /* check which (leafno or buddy) is the left buddy. + * the left buddy gets to claim the blocks resulting + * from the join while the right gets to claim none. + * the left buddy is also eligible to participate in + * a join at the next higher level while the right + * is not. + * + */ + if (leafno < buddy) { + /* leafno is the left buddy. + */ + dbAdjTree(tp, buddy, NOFREE); + } else { + /* buddy is the left buddy and becomes + * leafno. + */ + dbAdjTree(tp, leafno, NOFREE); + leafno = buddy; + } + + /* on to try the next join. + */ + newval += 1; + budsz <<= 1; + } + } + + /* update the leaf value. + */ + dbAdjTree(tp, leafno, newval); + + return 0; +} + + +/* + * NAME: dbAdjTree() + * + * FUNCTION: update a leaf of a dmtree with a new value, adjusting + * the dmtree, as required, to reflect the new leaf value. + * the combination of any buddies must already be done before + * this is called. + * + * PARAMETERS: + * tp - pointer to the tree to be adjusted. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static void dbAdjTree(dmtree_t * tp, int leafno, int newval) +{ + int lp, pp, k; + int max; + + /* pick up the index of the leaf for this leafno. + */ + lp = leafno + le32_to_cpu(tp->dmt_leafidx); + + /* is the current value the same as the old value ? if so, + * there is nothing to do. + */ + if (tp->dmt_stree[lp] == newval) + return; + + /* set the new value. + */ + tp->dmt_stree[lp] = newval; + + /* bubble the new value up the tree as required. + */ + for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) { + /* get the index of the first leaf of the 4 leaf + * group containing the specified leaf (leafno). + */ + lp = ((lp - 1) & ~0x03) + 1; + + /* get the index of the parent of this 4 leaf group. + */ + pp = (lp - 1) >> 2; + + /* determine the maximum of the 4 leaves. + */ + max = TREEMAX(&tp->dmt_stree[lp]); + + /* if the maximum of the 4 is the same as the + * parent's value, we're done. + */ + if (tp->dmt_stree[pp] == max) + break; + + /* parent gets new value. + */ + tp->dmt_stree[pp] = max; + + /* parent becomes leaf for next go-round. + */ + lp = pp; + } +} + + +/* + * NAME: dbFindLeaf() + * + * FUNCTION: search a dmtree_t for sufficient free blocks, returning + * the index of a leaf describing the free blocks if + * sufficient free blocks are found. + * + * the search starts at the top of the dmtree_t tree and + * proceeds down the tree to the leftmost leaf with sufficient + * free space. + * + * PARAMETERS: + * tp - pointer to the tree to be searched. + * l2nb - log2 number of free blocks to search for. + * leafidx - return pointer to be set to the index of the leaf + * describing at least l2nb free blocks if sufficient + * free blocks are found. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient free blocks. 
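[Editor's aside] A small self-contained sketch of a dmap-style 4-ary summary tree; build(), find_leaf() and the 16-leaf geometry are invented for the example. Every internal node holds the maximum of its four children, which is what lets dbFindLeaf() descend to the leftmost adequate leaf and what dbAdjTree() maintains when a leaf changes.

#include <stdio.h>

#define HEIGHT  2
#define NLEAFS  16
#define LEAFIDX 5                       /* 1 root + 4 internal nodes */

static signed char stree[LEAFIDX + NLEAFS];

static void build(void)
{
        /* bubble maxima upward: node i's children are 4*i+1 .. 4*i+4 */
        for (int i = LEAFIDX - 1; i >= 0; i--) {
                signed char max = stree[(i << 2) + 1];
                for (int j = 2; j <= 4; j++)
                        if (stree[(i << 2) + j] > max)
                                max = stree[(i << 2) + j];
                stree[i] = max;
        }
}

static int find_leaf(int l2nb)
{
        int k, ti, x = 0, n = 0;

        if (l2nb > stree[0])                    /* check the root first */
                return -1;
        for (k = HEIGHT, ti = 1; k > 0; k--, ti = ((ti + n) << 2) + 1)
                for (x = ti, n = 0; n < 4; n++)
                        if (l2nb <= stree[x + n])
                                break;          /* leftmost adequate child */
        return x + n - LEAFIDX;                 /* zero-origin leaf number */
}

int main(void)
{
        /* leaf 6 offers 2^3 contiguous free blocks, leaf 13 offers 2^4 */
        stree[LEAFIDX + 6] = 3;
        stree[LEAFIDX + 13] = 4;
        build();
        printf("l2nb=3 -> leaf %d\n", find_leaf(3));    /* leaf 6  */
        printf("l2nb=4 -> leaf %d\n", find_leaf(4));    /* leaf 13 */
        printf("l2nb=5 -> %d (no space)\n", find_leaf(5));
        return 0;
}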
+ */ +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) +{ + int ti, n = 0, k, x = 0; + + /* first check the root of the tree to see if there is + * sufficient free space. + */ + if (l2nb > tp->dmt_stree[ROOT]) + return -ENOSPC; + + /* sufficient free space available. now search down the tree + * starting at the next level for the leftmost leaf that + * describes sufficient free space. + */ + for (k = le32_to_cpu(tp->dmt_height), ti = 1; + k > 0; k--, ti = ((ti + n) << 2) + 1) { + /* search the four nodes at this level, starting from + * the left. + */ + for (x = ti, n = 0; n < 4; n++) { + /* sufficient free space found. move to the next + * level (or quit if this is the last level). + */ + if (l2nb <= tp->dmt_stree[x + n]) + break; + } + + /* better have found something since the higher + * levels of the tree said it was here. + */ + assert(n < 4); + } + + /* set the return to the leftmost leaf describing sufficient + * free space. + */ + *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx); + + return (0); +} + + +/* + * NAME: dbFindBits() + * + * FUNCTION: find a specified number of binary buddy free bits within a + * dmap bitmap word value. + * + * this routine searches the bitmap value for (1 << l2nb) free + * bits at (1 << l2nb) alignments within the value. + * + * PARAMETERS: + * word - dmap bitmap word value. + * l2nb - number of free bits specified as a log2 number. + * + * RETURN VALUES: + * starting bit number of free bits. + */ +static int dbFindBits(u32 word, int l2nb) +{ + int bitno, nb; + u32 mask; + + /* get the number of bits. + */ + nb = 1 << l2nb; + assert(nb <= DBWORD); + + /* complement the word so we can use a mask (i.e. 0s represent + * free bits) and compute the mask. + */ + word = ~word; + mask = ONES << (DBWORD - nb); + + /* scan the word for nb free bits at nb alignments. + */ + for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) { + if ((mask & word) == mask) + break; + } + + ASSERT(bitno < 32); + + /* return the bit number. + */ + return (bitno); +} + + +/* + * NAME: dbMaxBud(u8 *cp) + * + * FUNCTION: determine the largest binary buddy string of free + * bits within 32-bits of the map. + * + * PARAMETERS: + * cp - pointer to the 32-bit value. + * + * RETURN VALUES: + * largest binary buddy of free bits within a dmap word. + */ +static int dbMaxBud(u8 * cp) +{ + signed char tmp1, tmp2; + + /* check if the wmap word is all free. if so, the + * free buddy size is BUDMIN. + */ + if (*((uint *) cp) == 0) + return (BUDMIN); + + /* check if the wmap word is half free. if so, the + * free buddy size is BUDMIN-1. + */ + if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0) + return (BUDMIN - 1); + + /* not all free or half free. determine the free buddy + * size thru table lookup using quarters of the wmap word. + */ + tmp1 = max(budtab[cp[2]], budtab[cp[3]]); + tmp2 = max(budtab[cp[0]], budtab[cp[1]]); + return (max(tmp1, tmp2)); +} + + +/* + * NAME: cnttz(uint word) + * + * FUNCTION: determine the number of trailing zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. + * + * RETURN VALUES: + * count of trailing zeros + */ +static int cnttz(u32 word) +{ + int n; + + for (n = 0; n < 32; n++, word >>= 1) { + if (word & 0x01) + break; + } + + return (n); +} + + +/* + * NAME: cntlz(u32 value) + * + * FUNCTION: determine the number of leading zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. 
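[Editor's aside] The word-level scan performed by dbFindBits() above can be reproduced in a few lines of userspace C; find_bits() and the sample word are invented. Bit 0 is the most significant bit of the word, a set bit means "allocated", and the routine looks for 2^l2nb clear bits at a 2^l2nb-aligned position.

#include <stdio.h>
#include <stdint.h>

#define DBWORD 32
#define ONES   0xffffffffu

static int find_bits(uint32_t word, int l2nb)
{
        int bitno, nb = 1 << l2nb;
        uint32_t mask;

        word = ~word;                           /* now 1 = free           */
        mask = ONES << (DBWORD - nb);           /* nb bits at the top     */
        for (bitno = 0; mask != 0; bitno += nb, mask >>= nb)
                if ((mask & word) == mask)      /* all nb bits free here? */
                        break;
        return bitno;                           /* 32 would mean "none";
                                                 * the caller guarantees
                                                 * a fit in the real code */
}

int main(void)
{
        /* blocks 0-11 allocated, the rest of the word free */
        uint32_t word = ONES << (DBWORD - 12);

        printf("l2nb=2 -> bit %d\n", find_bits(word, 2));       /* 12 */
        printf("l2nb=3 -> bit %d\n", find_bits(word, 3));       /* 16 */
        return 0;
}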
+ * + * RETURN VALUES: + * count of leading zeros + */ +static int cntlz(u32 value) +{ + int n; + + for (n = 0; n < 32; n++, value <<= 1) { + if (value & HIGHORDER) + break; + } + return (n); +} + + +/* + * NAME: blkstol2(s64 nb) + * + * FUNCTION: convert a block count to its log2 value. if the block + * count is not a l2 multiple, it is rounded up to the next + * larger l2 multiple. + * + * PARAMETERS: + * nb - number of blocks + * + * RETURN VALUES: + * log2 number of blocks + */ +static int blkstol2(s64 nb) +{ + int l2nb; + s64 mask; /* meant to be signed */ + + mask = (s64) 1 << (64 - 1); + + /* count the leading bits. + */ + for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) { + /* leading bit found. + */ + if (nb & mask) { + /* determine the l2 value. + */ + l2nb = (64 - 1) - l2nb; + + /* check if we need to round up. + */ + if (~mask & nb) + l2nb++; + + return (l2nb); + } + } + assert(0); + return 0; /* fix compiler warning */ +} + + +/* + * NAME: dbAllocBottomUp() + * + * FUNCTION: alloc the specified block range from the working block + * allocation map. + * + * the blocks will be alloc from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) +{ + struct metapage *mp; + struct dmap *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* block to be allocated better be within the mapsize. */ + ASSERT(nblocks <= bmp->db_mapsize - blkno); + + /* + * allocate the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the number of blocks to be allocated from + * this dmap. + */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + /* allocate the blocks. */ + if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) { + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int rc; + int dbitno, word, rembits, nb, nwords, wbitno, agno; + s8 oldroot, *leaf; + struct dmaptree *tp = (struct dmaptree *) & dp->tree; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = tp->stree[ROOT]; + + /* pick up a pointer to the leaves of the dmap tree */ + leaf = tp->stree + LEAFIND; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. 
partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + word++; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits */ + nb = nwords << L2DBWORD; + word += nwords; + } + } + + /* update the free count for this dmap */ + le32_add_cpu(&dp->nfree, -nblocks); + + /* reconstruct summary tree */ + dbInitDmapTree(dp); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the highest active allocation group number + * if this allocation group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); + + /* if the root has not changed, done. */ + if (tp->stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbExtendFS() + * + * FUNCTION: extend bmap from blkno for nblocks; + * dbExtendFS() updates bmap ready for dbAllocBottomUp(); + * + * L2 + * | + * L1---------------------------------L1 + * | | + * L0---------L0---------L0 L0---------L0---------L0 + * | | | | | | + * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; + * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm + * + * <---old---><----------------------------extend-----------------------> + */ +int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb); + int nbperpage = sbi->nbperpage; + int i, i0 = true, j, j0 = true, k, n; + s64 newsize; + s64 p; + struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL; + struct dmapctl *l2dcp, *l1dcp, *l0dcp; + struct dmap *dp; + s8 *l0leaf, *l1leaf, *l2leaf; + struct bmap *bmp = sbi->bmap; + int agno, l2agsize, oldl2agsize; + s64 ag_rem; + + newsize = blkno + nblocks; + + jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld", + (long long) blkno, (long long) nblocks, (long long) newsize); + + /* + * initialize bmap control page. 
+ * + * all the data in bmap control page should exclude + * the mkfs hidden dmap page. + */ + + /* update mapsize */ + bmp->db_mapsize = newsize; + bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize); + + /* compute new AG size */ + l2agsize = dbGetL2AGSize(newsize); + oldl2agsize = bmp->db_agl2size; + + bmp->db_agl2size = l2agsize; + bmp->db_agsize = 1 << l2agsize; + + /* compute new number of AG */ + agno = bmp->db_numag; + bmp->db_numag = newsize >> l2agsize; + bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; + + /* + * reconfigure db_agfree[] + * from old AG configuration to new AG configuration; + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + if (l2agsize == oldl2agsize) + goto extend; + k = 1 << (l2agsize - oldl2agsize); + ag_rem = bmp->db_agfree[0]; /* save agfree[0] */ + for (i = 0, n = 0; i < agno; n++) { + bmp->db_agfree[n] = 0; /* init collection point */ + + /* coalesce contiguous k AGs; */ + for (j = 0; j < k && i < agno; j++, i++) { + /* merge AGi to AGn */ + bmp->db_agfree[n] += bmp->db_agfree[i]; + } + } + bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */ + + for (; n < MAXAG; n++) + bmp->db_agfree[n] = 0; + + /* + * update highest active ag number + */ + + bmp->db_maxag = bmp->db_maxag / k; + + /* + * extend bmap + * + * update bit maps and corresponding level control pages; + * global control page db_nfree, db_agfree[agno], db_maxfreebud; + */ + extend: + /* get L2 page */ + p = BMAPBLKNO + nbperpage; /* L2 page */ + l2mp = read_metapage(ipbmap, p, PSIZE, 0); + if (!l2mp) { + jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); + return -EIO; + } + l2dcp = (struct dmapctl *) l2mp->data; + + /* compute start L1 */ + k = blkno >> L2MAXL1SIZE; + l2leaf = l2dcp->stree + CTLLEAFIND + k; + p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */ + + /* + * extend each L1 in L2 + */ + for (; k < LPERCTL; k++, p += nbperpage) { + /* get L1 page */ + if (j0) { + /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */ + l1mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + l1dcp = (struct dmapctl *) l1mp->data; + + /* compute start L0 */ + j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE; + l1leaf = l1dcp->stree + CTLLEAFIND + j; + p = BLKTOL0(blkno, sbi->l2nbperpage); + j0 = false; + } else { + /* assign/init L1 page */ + l1mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + + l1dcp = (struct dmapctl *) l1mp->data; + + /* compute start L0 */ + j = 0; + l1leaf = l1dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st L0 of L1.k */ + } + + /* + * extend each L0 in L1 + */ + for (; j < LPERCTL; j++) { + /* get L0 page */ + if (i0) { + /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */ + + l0mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + l0dcp = (struct dmapctl *) l0mp->data; + + /* compute start dmap */ + i = (blkno & (MAXL0SIZE - 1)) >> + L2BPERDMAP; + l0leaf = l0dcp->stree + CTLLEAFIND + i; + p = BLKTODMAP(blkno, + sbi->l2nbperpage); + i0 = false; + } else { + /* assign/init L0 page */ + l0mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + + l0dcp = (struct dmapctl *) l0mp->data; + + /* compute start dmap */ + i = 0; + l0leaf = l0dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st dmap of L0.j */ + } + + /* + * extend each dmap in L0 + */ + for (; i < LPERCTL; i++) { + /* + * reconstruct the dmap page, and + * initialize corresponding parent 
L0 leaf + */ + if ((n = blkno & (BPERDMAP - 1))) { + /* read in dmap page: */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + n = min(nblocks, (s64)BPERDMAP - n); + } else { + /* assign/init dmap page */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + + n = min(nblocks, (s64)BPERDMAP); + } + + dp = (struct dmap *) mp->data; + *l0leaf = dbInitDmap(dp, blkno, n); + + bmp->db_nfree += n; + agno = le64_to_cpu(dp->start) >> l2agsize; + bmp->db_agfree[agno] += n; + + write_metapage(mp); + + l0leaf++; + p += nbperpage; + + blkno += n; + nblocks -= n; + if (nblocks == 0) + break; + } /* for each dmap in a L0 */ + + /* + * build current L0 page from its leaves, and + * initialize corresponding parent L1 leaf + */ + *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i); + write_metapage(l0mp); + l0mp = NULL; + + if (nblocks) + l1leaf++; /* continue for next L0 */ + else { + /* more than 1 L0 ? */ + if (j > 0) + break; /* build L1 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l1leaf; + release_metapage(l1mp); + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L0 in a L1 */ + + /* + * build current L1 page from its leaves, and + * initialize corresponding parent L2 leaf + */ + *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j); + write_metapage(l1mp); + l1mp = NULL; + + if (nblocks) + l2leaf++; /* continue for next L1 */ + else { + /* more than 1 L1 ? */ + if (k > 0) + break; /* build L2 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l2leaf; + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L1 in a L2 */ + + jfs_error(ipbmap->i_sb, + "dbExtendFS: function has not returned as expected"); +errout: + if (l0mp) + release_metapage(l0mp); + if (l1mp) + release_metapage(l1mp); + release_metapage(l2mp); + return -EIO; + + /* + * finalize bmap control page + */ +finalize: + + return 0; +} + + +/* + * dbFinalizeBmap() + */ +void dbFinalizeBmap(struct inode *ipbmap) +{ + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + int actags, inactags, l2nl; + s64 ag_rem, actfree, inactfree, avgfree; + int i, n; + + /* + * finalize bmap control page + */ +//finalize: + /* + * compute db_agpref: preferred ag to allocate from + * (the leftmost ag with average free space in it); + */ +//agpref: + /* get the number of active ags and inacitve ags */ + actags = bmp->db_maxag + 1; + inactags = bmp->db_numag - actags; + ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */ + + /* determine how many blocks are in the inactive allocation + * groups. in doing this, we must account for the fact that + * the rightmost group might be a partial group (i.e. file + * system size is not a multiple of the group size). + */ + inactfree = (inactags && ag_rem) ? + ((inactags - 1) << bmp->db_agl2size) + ag_rem + : inactags << bmp->db_agl2size; + + /* determine how many free blocks are in the active + * allocation groups plus the average number of free blocks + * within the active ags. + */ + actfree = bmp->db_nfree - inactfree; + avgfree = (u32) actfree / (u32) actags; + + /* if the preferred allocation group has not average free space. + * re-establish the preferred group as the leftmost + * group with average free space. 
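[Editor's aside] The preferred-allocation-group heuristic just described amounts to "pick the leftmost active group with at least average free space". A toy sketch with made-up counts, not JFS code:

#include <stdio.h>

int main(void)
{
        long long agfree[] = { 100, 4000, 900, 2500 };  /* active AGs */
        int actags = 4, agpref, i;
        long long actfree = 0, avgfree;

        for (i = 0; i < actags; i++)
                actfree += agfree[i];
        avgfree = actfree / actags;                     /* 1875 here */

        for (agpref = 0; agpref < actags; agpref++)
                if (agfree[agpref] >= avgfree)
                        break;                          /* AG 1 is chosen */

        printf("average free %lld, preferred AG %d\n", avgfree, agpref);
        return 0;
}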
+ */ + if (bmp->db_agfree[bmp->db_agpref] < avgfree) { + for (bmp->db_agpref = 0; bmp->db_agpref < actags; + bmp->db_agpref++) { + if (bmp->db_agfree[bmp->db_agpref] >= avgfree) + break; + } + if (bmp->db_agpref >= bmp->db_numag) { + jfs_error(ipbmap->i_sb, + "cannot find ag with average freespace"); + } + } + + /* + * compute db_aglevel, db_agheight, db_width, db_agstart: + * an ag is covered in aglevel dmapctl summary tree, + * at agheight level height (from leaf) with agwidth number of nodes + * each, which starts at agstart index node of the smmary tree node + * array; + */ + bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); + l2nl = + bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); + bmp->db_agheight = l2nl >> 1; + bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1)); + for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0; + i--) { + bmp->db_agstart += n; + n <<= 2; + } + +} + + +/* + * NAME: dbInitDmap()/ujfs_idmap_page() + * + * FUNCTION: initialize working/persistent bitmap of the dmap page + * for the specified number of blocks: + * + * at entry, the bitmaps had been initialized as free (ZEROS); + * The number of blocks will only account for the actually + * existing blocks. Blocks which don't actually exist in + * the aggregate will be marked as allocated (ONES); + * + * PARAMETERS: + * dp - pointer to page of map + * nblocks - number of blocks this page + * + * RETURNS: NONE + */ +static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) +{ + int blkno, w, b, r, nw, nb, i; + + /* starting block number within the dmap */ + blkno = Blkno & (BPERDMAP - 1); + + if (blkno == 0) { + dp->nblocks = dp->nfree = cpu_to_le32(nblocks); + dp->start = cpu_to_le64(Blkno); + + if (nblocks == BPERDMAP) { + memset(&dp->wmap[0], 0, LPERDMAP * 4); + memset(&dp->pmap[0], 0, LPERDMAP * 4); + goto initTree; + } + } else { + le32_add_cpu(&dp->nblocks, nblocks); + le32_add_cpu(&dp->nfree, nblocks); + } + + /* word number containing start block number */ + w = blkno >> L2DBWORD; + + /* + * free the bits corresponding to the block range (ZEROS): + * note: not all bits of the first and last words may be contained + * within the block range. + */ + for (r = nblocks; r > 0; r -= nb, blkno += nb) { + /* number of bits preceding range to be freed in the word */ + b = blkno & (DBWORD - 1); + /* number of bits to free in the word */ + nb = min(r, DBWORD - b); + + /* is partial word to be freed ? */ + if (nb < DBWORD) { + /* free (set to 0) from the bitmap word */ + dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + + /* skip the word freed */ + w++; + } else { + /* free (set to 0) contiguous bitmap words */ + nw = r >> L2DBWORD; + memset(&dp->wmap[w], 0, nw * 4); + memset(&dp->pmap[w], 0, nw * 4); + + /* skip the words freed */ + nb = nw << L2DBWORD; + w += nw; + } + } + + /* + * mark bits following the range to be freed (non-existing + * blocks) as allocated (ONES) + */ + + if (blkno == BPERDMAP) + goto initTree; + + /* the first word beyond the end of existing blocks */ + w = blkno >> L2DBWORD; + + /* does nblocks fall on a 32-bit boundary ? 
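[Editor's aside] The partial-word case relies on the mask idiom used throughout this file: `ONES << (DBWORD - nb) >> b` selects nb bits starting b bits below the most significant bit. A tiny sketch; the values of b and nb are arbitrary:

#include <stdio.h>
#include <stdint.h>

#define DBWORD 32
#define ONES   0xffffffffu

int main(void)
{
        int b = 4, nb = 8;              /* 8 blocks starting at block 4 */
        uint32_t set   = ONES << (DBWORD - nb) >> b;
        uint32_t clear = ~set;

        printf("allocate: wmap |= 0x%08x\n", set);      /* 0x0ff00000 */
        printf("free:     wmap &= 0x%08x\n", clear);    /* 0xf00fffff */
        return 0;
}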
*/ + b = blkno & (DBWORD - 1); + if (b) { + /* mark a partial word allocated */ + dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b); + w++; + } + + /* set the rest of the words in the page to allocated (ONES) */ + for (i = w; i < LPERDMAP; i++) + dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES); + + /* + * init tree + */ + initTree: + return (dbInitDmapTree(dp)); +} + + +/* + * NAME: dbInitDmapTree()/ujfs_complete_dmap() + * + * FUNCTION: initialize summary tree of the specified dmap: + * + * at entry, bitmap of the dmap has been initialized; + * + * PARAMETERS: + * dp - dmap to complete + * blkno - starting block number for this dmap + * treemax - will be filled in with max free for this dmap + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitDmapTree(struct dmap * dp) +{ + struct dmaptree *tp; + s8 *cp; + int i; + + /* init fixed info of tree */ + tp = &dp->tree; + tp->nleafs = cpu_to_le32(LPERDMAP); + tp->l2nleafs = cpu_to_le32(L2LPERDMAP); + tp->leafidx = cpu_to_le32(LEAFIND); + tp->height = cpu_to_le32(4); + tp->budmin = BUDMIN; + + /* init each leaf from corresponding wmap word: + * note: leaf is set to NOFREE(-1) if all blocks of corresponding + * bitmap word are allocated. + */ + cp = tp->stree + le32_to_cpu(tp->leafidx); + for (i = 0; i < LPERDMAP; i++) + *cp++ = dbMaxBud((u8 *) & dp->wmap[i]); + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree(tp)); +} + + +/* + * NAME: dbInitTree()/ujfs_adjtree() + * + * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl. + * + * at entry, the leaves of the tree has been initialized + * from corresponding bitmap word or root of summary tree + * of the child control page; + * configure binary buddy system at the leaf level, then + * bubble up the values of the leaf nodes up the tree. + * + * PARAMETERS: + * cp - Pointer to the root of the tree + * l2leaves- Number of leaf nodes as a power of 2 + * l2min - Number of blocks that can be covered by a leaf + * as a power of 2 + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitTree(struct dmaptree * dtp) +{ + int l2max, l2free, bsize, nextb, i; + int child, parent, nparent; + s8 *tp, *cp, *cp1; + + tp = dtp->stree; + + /* Determine the maximum free string possible for the leaves */ + l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin; + + /* + * configure the leaf levevl into binary buddy system + * + * Try to combine buddies starting with a buddy size of 1 + * (i.e. two leaves). At a buddy size of 1 two buddy leaves + * can be combined if both buddies have a maximum free of l2min; + * the combination will result in the left-most buddy leaf having + * a maximum free of l2min+1. + * After processing all buddies for a given size, process buddies + * at the next higher buddy size (i.e. current size * 2) and + * the next maximum free (current free + 1). + * This continues until the maximum possible buddy combination + * yields maximum free. 
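[Editor's aside] A compact userspace rendering of that coalescing pass; the 8-leaf array and constants are toy values, not JFS code. Starting from eight fully free map words, the rounds of pairwise merging leave a single leaf describing all 2^8 blocks:

#include <stdio.h>

#define BUDMIN    5
#define NLEAFS    8
#define L2NLEAFS  3

int main(void)
{
        signed char leaf[NLEAFS];
        int l2free, bsize, nextb, i;
        int l2max = L2NLEAFS + BUDMIN;

        for (i = 0; i < NLEAFS; i++)            /* every map word free */
                leaf[i] = BUDMIN;

        for (l2free = BUDMIN, bsize = 1; l2free < l2max;
             l2free++, bsize = nextb) {
                nextb = bsize << 1;
                for (i = 0; i < NLEAFS; i += nextb)
                        if (leaf[i] == l2free && leaf[i + bsize] == l2free) {
                                leaf[i] = l2free + 1;   /* left takes right */
                                leaf[i + bsize] = -1;   /* right gives left */
                        }
        }

        for (i = 0; i < NLEAFS; i++)
                printf("%d ", leaf[i]);
        printf("\n");                   /* 8 -1 -1 -1 -1 -1 -1 -1 */
        return 0;
}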
+ */ + for (l2free = dtp->budmin, bsize = 1; l2free < l2max; + l2free++, bsize = nextb) { + /* get next buddy size == current buddy pair size */ + nextb = bsize << 1; + + /* scan each adjacent buddy pair at current buddy size */ + for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx); + i < le32_to_cpu(dtp->nleafs); + i += nextb, cp += nextb) { + /* coalesce if both adjacent buddies are max free */ + if (*cp == l2free && *(cp + bsize) == l2free) { + *cp = l2free + 1; /* left take right */ + *(cp + bsize) = -1; /* right give left */ + } + } + } + + /* + * bubble summary information of leaves up the tree. + * + * Starting at the leaf node level, the four nodes described by + * the higher level parent node are compared for a maximum free and + * this maximum becomes the value of the parent node. + * when all lower level nodes are processed in this fashion then + * move up to the next level (parent becomes a lower level node) and + * continue the process for that level. + */ + for (child = le32_to_cpu(dtp->leafidx), + nparent = le32_to_cpu(dtp->nleafs) >> 2; + nparent > 0; nparent >>= 2, child = parent) { + /* get index of 1st node of parent level */ + parent = (child - 1) >> 2; + + /* set the value of the parent node as the maximum + * of the four nodes of the current level. + */ + for (i = 0, cp = tp + child, cp1 = tp + parent; + i < nparent; i++, cp += 4, cp1++) + *cp1 = TREEMAX(cp); + } + + return (*tp); +} + + +/* + * dbInitDmapCtl() + * + * function: initialize dmapctl page + */ +static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i) +{ /* start leaf index not covered by range */ + s8 *cp; + + dcp->nleafs = cpu_to_le32(LPERCTL); + dcp->l2nleafs = cpu_to_le32(L2LPERCTL); + dcp->leafidx = cpu_to_le32(CTLLEAFIND); + dcp->height = cpu_to_le32(5); + dcp->budmin = L2BPERDMAP + L2LPERCTL * level; + + /* + * initialize the leaves of current level that were not covered + * by the specified input block range (i.e. the leaves have no + * low level dmapctl or dmap). + */ + cp = &dcp->stree[CTLLEAFIND + i]; + for (; i < LPERCTL; i++) + *cp++ = NOFREE; + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree((struct dmaptree *) dcp)); +} + + +/* + * NAME: dbGetL2AGSize()/ujfs_getagl2size() + * + * FUNCTION: Determine log2(allocation group size) from aggregate size + * + * PARAMETERS: + * nblocks - Number of blocks in aggregate + * + * RETURNS: log2(allocation group size) in aggregate blocks + */ +static int dbGetL2AGSize(s64 nblocks) +{ + s64 sz; + s64 m; + int l2sz; + + if (nblocks < BPERDMAP * MAXAG) + return (L2BPERDMAP); + + /* round up aggregate size to power of 2 */ + m = ((u64) 1 << (64 - 1)); + for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) { + if (m & nblocks) + break; + } + + sz = (s64) 1 << l2sz; + if (sz < nblocks) + l2sz += 1; + + /* agsize = roundupSize/max_number_of_ag */ + return (l2sz - L2MAXAG); +} + + +/* + * NAME: dbMapFileSizeToMapSize() + * + * FUNCTION: compute number of blocks the block allocation map file + * can cover from the map file size; + * + * RETURNS: Number of blocks which can be covered by this block map file; + */ + +/* + * maximum number of map pages at each level including control pages + */ +#define MAXL0PAGES (1 + LPERCTL) +#define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) +#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) + +/* + * convert number of map pages to the zero origin top dmapctl level + */ +#define BMAPPGTOLEV(npages) \ + (((npages) <= 3 + MAXL0PAGES) ? 0 : \ + ((npages) <= 2 + MAXL1PAGES) ? 
1 : 2) + +s64 dbMapFileSizeToMapSize(struct inode * ipbmap) +{ + struct super_block *sb = ipbmap->i_sb; + s64 nblocks; + s64 npages, ndmaps; + int level, i; + int complete, factor; + + nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize; + npages = nblocks >> JFS_SBI(sb)->l2nbperpage; + level = BMAPPGTOLEV(npages); + + /* At each level, accumulate the number of dmap pages covered by + * the number of full child levels below it; + * repeat for the last incomplete child level. + */ + ndmaps = 0; + npages--; /* skip the first global control page */ + /* skip higher level control pages above top level covered by map */ + npages -= (2 - level); + npages--; /* skip top level's control page */ + for (i = level; i >= 0; i--) { + factor = + (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); + complete = (u32) npages / factor; + ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL : + ((i == 1) ? LPERCTL : 1)); + + /* pages in last/incomplete child */ + npages = (u32) npages % factor; + /* skip incomplete child's level control page */ + npages--; + } + + /* convert the number of dmaps into the number of blocks + * which can be covered by the dmaps; + */ + nblocks = ndmaps << L2BPERDMAP; + + return (nblocks); +} diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h new file mode 100644 index 00000000..6dcb906c --- /dev/null +++ b/fs/jfs/jfs_dmap.h @@ -0,0 +1,314 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DMAP +#define _H_JFS_DMAP + +#include "jfs_txnmgr.h" + +#define BMAPVERSION 1 /* version number */ +#define TREESIZE (256+64+16+4+1) /* size of a dmap tree */ +#define LEAFIND (64+16+4+1) /* index of 1st leaf of a dmap tree */ +#define LPERDMAP 256 /* num leaves per dmap tree */ +#define L2LPERDMAP 8 /* l2 number of leaves per dmap tree */ +#define DBWORD 32 /* # of blks covered by a map word */ +#define L2DBWORD 5 /* l2 # of blks covered by a mword */ +#define BUDMIN L2DBWORD /* max free string in a map word */ +#define BPERDMAP (LPERDMAP * DBWORD) /* num of blks per dmap */ +#define L2BPERDMAP 13 /* l2 num of blks per dmap */ +#define CTLTREESIZE (1024+256+64+16+4+1) /* size of a dmapctl tree */ +#define CTLLEAFIND (256+64+16+4+1) /* idx of 1st leaf of a dmapctl tree */ +#define LPERCTL 1024 /* num of leaves per dmapctl tree */ +#define L2LPERCTL 10 /* l2 num of leaves per dmapctl tree */ +#define ROOT 0 /* index of the root of a tree */ +#define NOFREE ((s8) -1) /* no blocks free */ +#define MAXAG 128 /* max number of allocation groups */ +#define L2MAXAG 7 /* l2 max num of AG */ +#define L2MINAGSZ 25 /* l2 of minimum AG size in bytes */ +#define BMAPBLKNO 0 /* lblkno of bmap within the map */ + +/* + * maximum l2 number of disk blocks at the various dmapctl levels. 
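[Editor's aside] For orientation, the level sizes defined just below follow directly from these constants: every dmapctl has 2^10 leaves, so each level multiplies the coverage of the one beneath it by 1024. A quick arithmetic check:

#include <stdio.h>

#define L2BPERDMAP 13
#define L2LPERCTL  10

int main(void)
{
        long long dmap = 1LL << L2BPERDMAP;                     /* 8 Ki blocks */
        long long l0   = 1LL << (L2BPERDMAP + 1 * L2LPERCTL);   /* 8 Mi blocks */
        long long l1   = 1LL << (L2BPERDMAP + 2 * L2LPERCTL);   /* 8 Gi blocks */
        long long l2   = 1LL << (L2BPERDMAP + 3 * L2LPERCTL);   /* 8 Ti blocks */

        printf("dmap %lld, L0 %lld, L1 %lld, L2 %lld blocks\n",
               dmap, l0, l1, l2);
        return 0;
}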
+ */ +#define L2MAXL0SIZE (L2BPERDMAP + 1 * L2LPERCTL) +#define L2MAXL1SIZE (L2BPERDMAP + 2 * L2LPERCTL) +#define L2MAXL2SIZE (L2BPERDMAP + 3 * L2LPERCTL) + +/* + * maximum number of disk blocks at the various dmapctl levels. + */ +#define MAXL0SIZE ((s64)1 << L2MAXL0SIZE) +#define MAXL1SIZE ((s64)1 << L2MAXL1SIZE) +#define MAXL2SIZE ((s64)1 << L2MAXL2SIZE) + +#define MAXMAPSIZE MAXL2SIZE /* maximum aggregate map size */ + +/* + * determine the maximum free string for four (lower level) nodes + * of the tree. + */ +static inline signed char TREEMAX(signed char *cp) +{ + signed char tmp1, tmp2; + + tmp1 = max(*(cp+2), *(cp+3)); + tmp2 = max(*(cp), *(cp+1)); + + return max(tmp1, tmp2); +} + +/* + * convert disk block number to the logical block number of the dmap + * describing the disk block. s is the log2(number of logical blocks per page) + * + * The calculation figures out how many logical pages are in front of the dmap. + * - the number of dmaps preceding it + * - the number of L0 pages preceding its L0 page + * - the number of L1 pages preceding its L1 page + * - 3 is added to account for the L2, L1, and L0 page for this dmap + * - 1 is added to account for the control page of the map. + */ +#define BLKTODMAP(b,s) \ + ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 0 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L0. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding its L1 page + * - 2 is added to account for the L2, and L1 page for this L0 + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL0(b,s) \ + (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 1 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L1. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding it + * - 1 is added to account for the L2 page + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL1(b,s) \ + (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the dmapctl + * at the specified level which describes the disk block. + */ +#define BLKTOCTL(b,s,l) \ + (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) + +/* + * convert aggregate map size to the zero origin dmapctl level of the + * top dmapctl. + */ +#define BMAPSZTOLEV(size) \ + (((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2) + +/* convert disk block number to allocation group number. + */ +#define BLKTOAG(b,sbi) ((b) >> ((sbi)->bmap->db_agl2size)) + +/* convert allocation group number to starting disk block + * number. + */ +#define AGTOBLK(a,ip) \ + ((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size)) + +/* + * dmap summary tree + * + * dmaptree must be consistent with dmapctl. 
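[Editor's aside] A small sketch of the BLKTODMAP() arithmetic above; the choice of s = 2 (four filesystem blocks per 4K metadata page) is hypothetical. The logical address of a dmap page is just the number of bmap-file pages that precede it, scaled to filesystem blocks:

#include <stdio.h>

#define BLKTODMAP(b, s) \
        ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))

int main(void)
{
        int s = 2;                      /* 4 fs blocks per 4K page */
        long long blocks[] = { 0, 8192, 3 * 8192 };
        int i;

        for (i = 0; i < 3; i++)
                printf("block %lld -> dmap page at logical block %lld\n",
                       blocks[i], (long long) BLKTODMAP(blocks[i], s));
        /* 0 -> 16, 8192 -> 20, 24576 -> 28: consecutive dmaps sit one
         * metadata page (4 blocks) apart */
        return 0;
}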
+ */ +struct dmaptree { + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of first tree leaf */ + __le32 height; /* 4: height of the tree */ + s8 budmin; /* 1: min l2 tree leaf value to combine */ + s8 stree[TREESIZE]; /* TREESIZE: tree */ + u8 pad[2]; /* 2: pad to word boundary */ +}; /* - 360 - */ + +/* + * dmap page per 8K blocks bitmap + */ +struct dmap { + __le32 nblocks; /* 4: num blks covered by this dmap */ + __le32 nfree; /* 4: num of free blks in this dmap */ + __le64 start; /* 8: starting blkno for this dmap */ + struct dmaptree tree; /* 360: dmap tree */ + u8 pad[1672]; /* 1672: pad to 2048 bytes */ + __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ + __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ +}; /* - 4096 - */ + +/* + * disk map control page per level. + * + * dmapctl must be consistent with dmaptree. + */ +struct dmapctl { + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of the first tree leaf */ + __le32 height; /* 4: height of tree */ + s8 budmin; /* 1: minimum l2 tree leaf value */ + s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ + u8 pad[2714]; /* 2714: pad to 4096 */ +}; /* - 4096 - */ + +/* + * common definition for dmaptree within dmap and dmapctl + */ +typedef union dmtree { + struct dmaptree t1; + struct dmapctl t2; +} dmtree_t; + +/* macros for accessing fields within dmtree */ +#define dmt_nleafs t1.nleafs +#define dmt_l2nleafs t1.l2nleafs +#define dmt_leafidx t1.leafidx +#define dmt_height t1.height +#define dmt_budmin t1.budmin +#define dmt_stree t1.stree + +/* + * on-disk aggregate disk allocation map descriptor. + */ +struct dbmap_disk { + __le64 dn_mapsize; /* 8: number of blocks in aggregate */ + __le64 dn_nfree; /* 8: num free blks in aggregate map */ + __le32 dn_l2nbperpage; /* 4: number of blks per page */ + __le32 dn_numag; /* 4: total number of ags */ + __le32 dn_maxlevel; /* 4: number of active ags */ + __le32 dn_maxag; /* 4: max active alloc group number */ + __le32 dn_agpref; /* 4: preferred alloc group (hint) */ + __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ + __le32 dn_agheight; /* 4: height in dmapctl of the AG */ + __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ + __le32 dn_agstart; /* 4: start tree index at AG height */ + __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ + __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ + __le64 dn_agsize; /* 8: num of blks per alloc group */ + s8 dn_maxfreebud; /* 1: max free buddy system */ + u8 pad[3007]; /* 3007: pad to 4096 */ +}; /* - 4096 - */ + +struct dbmap { + s64 dn_mapsize; /* number of blocks in aggregate */ + s64 dn_nfree; /* num free blks in aggregate map */ + int dn_l2nbperpage; /* number of blks per page */ + int dn_numag; /* total number of ags */ + int dn_maxlevel; /* number of active ags */ + int dn_maxag; /* max active alloc group number */ + int dn_agpref; /* preferred alloc group (hint) */ + int dn_aglevel; /* dmapctl level holding the AG */ + int dn_agheight; /* height in dmapctl of the AG */ + int dn_agwidth; /* width in dmapctl of the AG */ + int dn_agstart; /* start tree index at AG height */ + int dn_agl2size; /* l2 num of blks per alloc group */ + s64 dn_agfree[MAXAG]; /* per AG free count */ + s64 dn_agsize; /* num of blks per alloc group */ + signed char dn_maxfreebud; /* max free buddy system */ +}; /* - 4096 - */ +/* + * in-memory aggregate disk 
allocation map descriptor. + */ +struct bmap { + struct dbmap db_bmap; /* on-disk aggregate map descriptor */ + struct inode *db_ipbmap; /* ptr to aggregate map incore inode */ + struct mutex db_bmaplock; /* aggregate map lock */ + atomic_t db_active[MAXAG]; /* count of active, open files in AG */ + u32 *db_DBmap; +}; + +/* macros for accessing fields within in-memory aggregate map descriptor */ +#define db_mapsize db_bmap.dn_mapsize +#define db_nfree db_bmap.dn_nfree +#define db_agfree db_bmap.dn_agfree +#define db_agsize db_bmap.dn_agsize +#define db_agl2size db_bmap.dn_agl2size +#define db_agwidth db_bmap.dn_agwidth +#define db_agheight db_bmap.dn_agheight +#define db_agstart db_bmap.dn_agstart +#define db_numag db_bmap.dn_numag +#define db_maxlevel db_bmap.dn_maxlevel +#define db_aglevel db_bmap.dn_aglevel +#define db_agpref db_bmap.dn_agpref +#define db_maxag db_bmap.dn_maxag +#define db_maxfreebud db_bmap.dn_maxfreebud +#define db_l2nbperpage db_bmap.dn_l2nbperpage + +/* + * macros for various conversions needed by the allocators. + * blkstol2(), cntlz(), and cnttz() are operating system dependent functions. + */ +/* convert number of blocks to log2 number of blocks, rounding up to + * the next log2 value if blocks is not a l2 multiple. + */ +#define BLKSTOL2(d) (blkstol2(d)) + +/* convert number of leafs to log2 leaf value */ +#define NLSTOL2BSZ(n) (31 - cntlz((n)) + BUDMIN) + +/* convert leaf index to log2 leaf value */ +#define LITOL2BSZ(n,m,b) ((((n) == 0) ? (m) : cnttz((n))) + (b)) + +/* convert a block number to a dmap control leaf index */ +#define BLKTOCTLLEAF(b,m) \ + (((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m)) + +/* convert log2 leaf value to buddy size */ +#define BUDSIZE(s,m) (1 << ((s) - (m))) + +/* + * external references. + */ +extern int dbMount(struct inode *ipbmap); + +extern int dbUnmount(struct inode *ipbmap, int mounterror); + +extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks); + +extern int dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, struct tblock * tblk); + +extern int dbNextAG(struct inode *ipbmap); + +extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results); + +extern int dbReAlloc(struct inode *ipbmap, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results); + +extern int dbSync(struct inode *ipbmap); +extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks); +extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks); +extern void dbFinalizeBmap(struct inode *ipbmap); +extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +#endif /* _H_JFS_DMAP */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c new file mode 100644 index 00000000..9197a1b0 --- /dev/null +++ b/fs/jfs/jfs_dtree.c @@ -0,0 +1,4567 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_dtree.c: directory B+-tree manager + * + * B+-tree with variable length key directory: + * + * each directory page is structured as an array of 32-byte + * directory entry slots initialized as a freelist + * to avoid search/compaction of free space at insertion. + * when an entry is inserted, a number of slots are allocated + * from the freelist as required to store variable length data + * of the entry; when the entry is deleted, slots of the entry + * are returned to freelist. + * + * leaf entry stores full name as key and file serial number + * (aka inode number) as data. + * internal/router entry stores sufffix compressed name + * as key and simple extent descriptor as data. + * + * each directory page maintains a sorted entry index table + * which stores the start slot index of sorted entries + * to allow binary search on the table. + * + * directory starts as a root/leaf page in on-disk inode + * inline data area. + * when it becomes full, it starts a leaf of a external extent + * of length of 1 block. each time the first leaf becomes full, + * it is extended rather than split (its size is doubled), + * until its length becoms 4 KBytes, from then the extent is split + * with new 4 Kbyte extent when it becomes full + * to reduce external fragmentation of small directories. + * + * blah, blah, blah, for linear scan of directory in pieces by + * readdir(). + * + * + * case-insensitive directory file system + * + * names are stored in case-sensitive way in leaf entry. + * but stored, searched and compared in case-insensitive (uppercase) order + * (i.e., both search key and entry key are folded for search/compare): + * (note that case-sensitive order is BROKEN in storage, e.g., + * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad + * + * entries which folds to the same key makes up a equivalent class + * whose members are stored as contiguous cluster (may cross page boundary) + * but whose order is arbitrary and acts as duplicate, e.g., + * abc, Abc, aBc, abC) + * + * once match is found at leaf, requires scan forward/backward + * either for, in case-insensitive search, duplicate + * or for, in case-sensitive search, for exact match + * + * router entry must be created/stored in case-insensitive way + * in internal entry: + * (right most key of left page and left most key of right page + * are folded, and its suffix compression is propagated as router + * key in parent) + * (e.g., if split occurs and , trather than + * should be made the router key for the split) + * + * case-insensitive search: + * + * fold search key; + * + * case-insensitive search of B-tree: + * for internal entry, router key is already folded; + * for leaf entry, fold the entry key before comparison. + * + * if (leaf entry case-insensitive match found) + * if (next entry satisfies case-insensitive match) + * return EDUPLICATE; + * if (prev entry satisfies case-insensitive match) + * return EDUPLICATE; + * return match; + * else + * return no match; + * + * serialization: + * target directory inode lock is being held on entry/exit + * of all main directory service routines. 
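[Editor's aside] The duplicate test sketched earlier in this comment can be illustrated with plain strings; ci_lookup() and the sample entries are invented, and strcasecmp() stands in for the UniChar case folding. Because entries are ordered by their folded key, all spellings that fold alike are neighbours, so one extra comparison to each side is enough to flag an ambiguous match:

#include <stdio.h>
#include <string.h>
#include <strings.h>

static const char *entries[] = { "Alpha", "beta", "Beta", "gamma" };
#define NENTRIES 4

static int ci_lookup(const char *key)
{
        for (int i = 0; i < NENTRIES; i++) {
                if (strcasecmp(key, entries[i]) != 0)
                        continue;
                /* found a case-insensitive match; ambiguous if either
                 * neighbour folds to the same key as well */
                if (i + 1 < NENTRIES && strcasecmp(key, entries[i + 1]) == 0)
                        return -1;      /* duplicate */
                if (i > 0 && strcasecmp(key, entries[i - 1]) == 0)
                        return -1;      /* duplicate */
                return i;
        }
        return -2;                      /* no match */
}

int main(void)
{
        printf("ALPHA -> %d\n", ci_lookup("ALPHA"));    /*  0            */
        printf("BETA  -> %d\n", ci_lookup("BETA"));     /* -1, duplicate */
        printf("delta -> %d\n", ci_lookup("delta"));    /* -2, not found */
        return 0;
}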
+ * + * log based recovery: + */ + +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* dtree split parameter */ +struct dtsplit { + struct metapage *mp; + s16 index; + s16 nslot; + struct component_name *key; + ddata_t *data; + struct pxdlist *pxdlist; +}; + +#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) + +/* get page buffer for specified block address */ +#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ +{\ + BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ + if (!(RC))\ + {\ + if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ + ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ + {\ + BT_PUTPAGE(MP);\ + jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ + MP = NULL;\ + RC = -EIO;\ + }\ + }\ +} + +/* for consistency */ +#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) + +/* + * forward references + */ +static int dtSplitUp(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, + struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); + +static int dtExtendPage(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitRoot(tid_t tid, struct inode *ip, + struct dtsplit * split, struct metapage ** rmpp); + +static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + dtpage_t * fp, struct btstack * btstack); + +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); + +static int dtReadFirst(struct inode *ip, struct btstack * btstack); + +static int dtReadNext(struct inode *ip, + loff_t * offset, struct btstack * btstack); + +static int dtCompare(struct component_name * key, dtpage_t * p, int si); + +static int ciCompare(struct component_name * key, dtpage_t * p, int si, + int flag); + +static void dtGetKey(dtpage_t * p, int i, struct component_name * key, + int flag); + +static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, struct component_name * key, int flag); + +static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, + ddata_t * data, struct dt_lock **); + +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, + int do_index); + +static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); + +static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); + +static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); + +#define ciToUpper(c) UniStrupr((c)->name) + +/* + * read_index_page() + * + * Reads a page of a directory's index table. + * Having metadata mapped into the directory inode's address space + * presents a multitude of problems. 
We avoid this by mapping to + * the absolute address space outside of the *_metapage routines + */ +static struct metapage *read_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xaddr == 0)) + return NULL; + + return read_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * get_index_page() + * + * Same as get_index_page(), but get's a new page without reading + */ +static struct metapage *get_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xaddr == 0)) + return NULL; + + return get_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * find_index() + * + * Returns dtree page containing directory table entry for specified + * index and pointer to its entry. + * + * mp must be released by caller. + */ +static struct dir_table_slot *find_index(struct inode *ip, u32 index, + struct metapage ** mp, s64 *lblock) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + s64 blkno; + s64 offset; + int page_offset; + struct dir_table_slot *slot; + static int maxWarnings = 10; + + if (index < 2) { + if (maxWarnings) { + jfs_warn("find_entry called with index = %d", index); + maxWarnings--; + } + return NULL; + } + + if (index >= jfs_ip->next_index) { + jfs_warn("find_entry called with index >= next_index"); + return NULL; + } + + if (jfs_dirtable_inline(ip)) { + /* + * Inline directory table + */ + *mp = NULL; + slot = &jfs_ip->i_dirtable[index - 2]; + } else { + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << + JFS_SBI(ip->i_sb)->l2nbperpage; + + if (*mp && (*lblock != blkno)) { + release_metapage(*mp); + *mp = NULL; + } + if (!(*mp)) { + *lblock = blkno; + *mp = read_index_page(ip, blkno); + } + if (!(*mp)) { + jfs_err("free_index: error reading directory table"); + return NULL; + } + + slot = + (struct dir_table_slot *) ((char *) (*mp)->data + + page_offset); + } + return slot; +} + +static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, + u32 index) +{ + struct tlock *tlck; + struct linelock *llck; + struct lv *lv; + + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) tlck->lock; + + if (llck->index >= llck->maxcnt) + llck = txLinelock(llck); + lv = &llck->lv[llck->index]; + + /* + * Linelock slot size is twice the size of directory table + * slot size. 512 entries per page. + */ + lv->offset = ((index - 2) & 511) >> 1; + lv->length = 1; + llck->index++; +} + +/* + * add_index() + * + * Adds an entry to the directory index table. This is used to provide + * each directory entry with a persistent index in which to resume + * directory traversals + */ +static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) +{ + struct super_block *sb = ip->i_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + u64 blkno; + struct dir_table_slot *dirtab_slot; + u32 index; + struct linelock *llck; + struct lv *lv; + struct metapage *mp; + s64 offset; + uint page_offset; + struct tlock *tlck; + s64 xaddr; + + ASSERT(DO_INDEX(ip)); + + if (jfs_ip->next_index < 2) { + jfs_warn("add_index: next_index = %d. Resetting!", + jfs_ip->next_index); + jfs_ip->next_index = 2; + } + + index = jfs_ip->next_index++; + + if (index <= MAX_INLINE_DIRTABLE_ENTRY) { + /* + * i_size reflects size of index table, or 8 bytes per entry. 
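[Editor's aside] The addressing used by find_index() and add_index() is easy to check by hand; l2nbperpage = 2 is a hypothetical mount geometry. Indexes 0 and 1 are reserved, each slot is 8 bytes, and 512 slots fit in a 4K index page:

#include <stdio.h>

#define PSIZE    4096
#define L2PSIZE  12
#define SLOTSIZE 8                      /* bytes per dir_table_slot */

int main(void)
{
        unsigned int index[] = { 2, 513, 514 };
        int l2nbperpage = 2;            /* 4 fs blocks per 4K page */
        int i;

        for (i = 0; i < 3; i++) {
                long long offset = (long long) (index[i] - 2) * SLOTSIZE;
                int page_offset = (int) (offset & (PSIZE - 1));
                long long blkno = ((offset + 1) >> L2PSIZE) << l2nbperpage;

                printf("index %u -> index-table block %lld, byte %d\n",
                       index[i], blkno, page_offset);
        }
        /* index 2 and 513 land in the first index page; index 514 starts
         * the second one, 4 fs blocks further into the table */
        return 0;
}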
+ */ + ip->i_size = (loff_t) (index - 1) << 3; + + /* + * dir table fits inline within inode + */ + dirtab_slot = &jfs_ip->i_dirtable[index-2]; + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + set_cflag(COMMIT_Dirtable, ip); + + return index; + } + if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { + struct dir_table_slot temp_table[12]; + + /* + * It's time to move the inline table to an external + * page and begin to build the xtree + */ + if (dquot_alloc_block(ip, sbi->nbperpage)) + goto clean_up; + if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { + dquot_free_block(ip, sbi->nbperpage); + goto clean_up; + } + + /* + * Save the table, we're going to overwrite it with the + * xtree root + */ + memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); + + /* + * Initialize empty x-tree + */ + xtInitRoot(tid, ip); + + /* + * Add the first block to the xtree + */ + if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { + /* This really shouldn't fail */ + jfs_warn("add_index: xtInsert failed!"); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + dbFree(ip, xaddr, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); + goto clean_up; + } + ip->i_size = PSIZE; + + mp = get_index_page(ip, 0); + if (!mp) { + jfs_err("add_index: get_metapage failed!"); + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + goto clean_up; + } + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) & tlck->lock; + ASSERT(llck->index == 0); + lv = &llck->lv[0]; + + lv->offset = 0; + lv->length = 6; /* tlckDATA slot size is 16 bytes */ + llck->index++; + + memcpy(mp->data, temp_table, sizeof(temp_table)); + + mark_metapage_dirty(mp); + release_metapage(mp); + + /* + * Logging is now directed by xtree tlocks + */ + clear_cflag(COMMIT_Dirtable, ip); + } + + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; + if (page_offset == 0) { + /* + * This will be the beginning of a new page + */ + xaddr = 0; + if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { + jfs_warn("add_index: xtInsert failed!"); + goto clean_up; + } + ip->i_size += PSIZE; + + if ((mp = get_index_page(ip, blkno))) + memset(mp->data, 0, PSIZE); /* Just looks better */ + else + xtTruncate(tid, ip, offset, COMMIT_PWMAP); + } else + mp = read_index_page(ip, blkno); + + if (!mp) { + jfs_err("add_index: get/read_metapage failed!"); + goto clean_up; + } + + lock_index(tid, ip, mp, index); + + dirtab_slot = + (struct dir_table_slot *) ((char *) mp->data + page_offset); + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + mark_metapage_dirty(mp); + release_metapage(mp); + + return index; + + clean_up: + + jfs_ip->next_index--; + + return 0; +} + +/* + * free_index() + * + * Marks an entry to the directory index table as free. 
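add_index() above migrates the in-line table into an external page the first time it overflows, and rolls next_index back at clean_up if any allocation step fails. The sketch below replays only that shape in ordinary userspace C, with malloc/realloc standing in for dbAlloc/xtInsert; the struct names and INLINE_SLOTS value are invented for the illustration.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define INLINE_SLOTS 8		/* stand-in for the in-line table capacity */

struct slot { uint32_t addr; uint8_t flag; };

struct dirtab {
	uint32_t next_index;	/* next index to hand out (starts at 2) */
	struct slot inline_tab[INLINE_SLOTS];
	struct slot *ext;	/* external table once the inline one is full */
	size_t ext_cap;
};

/*
 * Hand out the next persistent index, migrating the inline table to an
 * external buffer the first time it overflows.  Mirrors add_index()'s
 * structure: take the index optimistically, and on any allocation
 * failure roll the counter back and report failure (0).
 */
static uint32_t dirtab_add(struct dirtab *t, struct slot s)
{
	if (t->next_index < 2)		/* same sanity reset as add_index() */
		t->next_index = 2;

	uint32_t index = t->next_index++;
	size_t pos = index - 2;		/* indices 0 and 1 are never stored */

	if (!t->ext && pos < INLINE_SLOTS) {
		t->inline_tab[pos] = s;
		return index;
	}

	if (!t->ext) {
		/* first overflow: move the inline table out, like the
		 * temp_table copy + xtInsert step in add_index() */
		t->ext_cap = INLINE_SLOTS * 2;
		t->ext = malloc(t->ext_cap * sizeof(*t->ext));
		if (!t->ext)
			goto clean_up;
		memcpy(t->ext, t->inline_tab, sizeof(t->inline_tab));
	} else if (pos >= t->ext_cap) {
		struct slot *bigger =
			realloc(t->ext, t->ext_cap * 2 * sizeof(*t->ext));
		if (!bigger)
			goto clean_up;
		t->ext = bigger;
		t->ext_cap *= 2;
	}

	t->ext[pos] = s;
	return index;

clean_up:
	t->next_index--;	/* undo the optimistic allocation */
	return 0;
}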
+ */ +static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) +{ + struct dir_table_slot *dirtab_slot; + s64 lblock; + struct metapage *mp = NULL; + + dirtab_slot = find_index(ip, index, &mp, &lblock); + + if (!dirtab_slot) + return; + + dirtab_slot->flag = DIR_INDEX_FREE; + dirtab_slot->slot = dirtab_slot->addr1 = 0; + dirtab_slot->addr2 = cpu_to_le32(next); + + if (mp) { + lock_index(tid, ip, mp, index); + mark_metapage_dirty(mp); + release_metapage(mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * modify_index() + * + * Changes an entry in the directory index table + */ +static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, + int slot, struct metapage ** mp, s64 *lblock) +{ + struct dir_table_slot *dirtab_slot; + + dirtab_slot = find_index(ip, index, mp, lblock); + + if (!dirtab_slot) + return; + + DTSaddress(dirtab_slot, bn); + dirtab_slot->slot = slot; + + if (*mp) { + lock_index(tid, ip, *mp, index); + mark_metapage_dirty(*mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * read_index() + * + * reads a directory table slot + */ +static int read_index(struct inode *ip, u32 index, + struct dir_table_slot * dirtab_slot) +{ + s64 lblock; + struct metapage *mp = NULL; + struct dir_table_slot *slot; + + slot = find_index(ip, index, &mp, &lblock); + if (!slot) { + return -EIO; + } + + memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); + + if (mp) + release_metapage(mp); + + return 0; +} + +/* + * dtSearch() + * + * function: + * Search for the entry with specified key + * + * parameter: + * + * return: 0 - search result on stack, leaf page pinned; + * errno - I/O error + */ +int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, + struct btstack * btstack, int flag) +{ + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; + struct metapage *mp; + dtpage_t *p; + s8 *stbl; + int base, index, lim; + struct btframe *btsp; + pxd_t *pxd; + int psize = 288; /* initial in-line directory */ + ino_t inumber; + struct component_name ciKey; + struct super_block *sb = ip->i_sb; + + ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS); + if (!ciKey.name) { + rc = -ENOMEM; + goto dtSearch_Exit2; + } + + + /* uppercase search key for c-i directory */ + UniStrcpy(ciKey.name, key->name); + ciKey.namlen = key->namlen; + + /* only uppercase if case-insensitive support is on */ + if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { + ciToUpper(&ciKey); + } + BT_CLR(btstack); /* reset stack */ + + /* init level count for max pages to split */ + btstack->nsplit = 1; + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + goto dtSearch_Exit1; + + /* get sorted entry table of the page */ + stbl = DT_GETSTBL(p); + + /* + * binary search with search key K on the current page. 
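dtSearch()'s per-page loop, which follows, is the classic binary search that on a miss leaves base at the smallest entry greater than the key, which is exactly the insertion point dtInsert() needs. The same loop over a plain sorted string array, as a self-contained sketch:

#include <string.h>

/*
 * Probe the middle of the remaining range; on a hit report the index,
 * on a miss leave 'base' pointing at the smallest entry greater than
 * the key, i.e. where a new entry would be inserted.
 */
static int bsearch_or_insertion_point(const char *const *tbl, int nent,
				      const char *key, int *where)
{
	int base = 0, index = 0, lim, cmp;

	for (lim = nent; lim; lim >>= 1) {
		index = base + (lim >> 1);
		cmp = strcmp(key, tbl[index]);
		if (cmp == 0) {
			*where = index;		/* search hit */
			return 1;
		}
		if (cmp > 0) {
			base = index + 1;	/* key sorts after tbl[index] */
			--lim;
		}
	}
	*where = base;				/* search miss: insertion point */
	return 0;
}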
+ */ + for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { + index = base + (lim >> 1); + + if (p->header.flag & BT_LEAF) { + /* uppercase leaf name to compare */ + cmp = + ciCompare(&ciKey, p, stbl[index], + JFS_SBI(sb)->mntflag); + } else { + /* router key is in uppercase */ + + cmp = dtCompare(&ciKey, p, stbl[index]); + + + } + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + inumber = le32_to_cpu( + ((struct ldtentry *) & p->slot[stbl[index]])->inumber); + + /* + * search for JFS_LOOKUP + */ + if (flag == JFS_LOOKUP) { + *data = inumber; + rc = 0; + goto out; + } + + /* + * search for JFS_CREATE + */ + if (flag == JFS_CREATE) { + *data = inumber; + rc = -EEXIST; + goto out; + } + + /* + * search for JFS_REMOVE or JFS_RENAME + */ + if ((flag == JFS_REMOVE || + flag == JFS_RENAME) && + *data != inumber) { + rc = -ESTALE; + goto out; + } + + /* + * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME + */ + /* save search result */ + *data = inumber; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* search hit - internal page: + * descend/search its child page + */ + goto getChild; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or (maxindex + 1) index. + */ + /* + * search miss - leaf page + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + /* + * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME + */ + if (flag == JFS_LOOKUP || flag == JFS_REMOVE || + flag == JFS_RENAME) { + rc = -ENOENT; + goto out; + } + + /* + * search for JFS_CREATE|JFS_FINDDIR: + * + * save search result + */ + *data = 0; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* + * search miss - internal page + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + getChild: + /* update max. number of pages to split */ + if (BT_STACK_FULL(btstack)) { + /* Something's corrupted, mark filesystem dirty so + * chkdsk will fix it. 
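The btstack that dtSearch() fills here is a bounded array of (block number, entry index) frames, one per internal level, with overflow treated as corruption. A minimal stand-alone version of the BT_CLR/BT_PUSH/BT_POP/BT_STACK_FULL idiom, with an illustrative depth rather than the kernel's constant:

#include <stdbool.h>
#include <stdint.h>

#define BTSTACK_DEPTH 8		/* illustrative; deep enough for any sane dtree */

struct btframe_s { int64_t bn; int index; };

struct btstack_s {
	int top;
	struct btframe_s frame[BTSTACK_DEPTH];
};

/* Reset the stack before a descent, as BT_CLR() does. */
static void bt_clr(struct btstack_s *s) { s->top = 0; }

static bool bt_stack_full(const struct btstack_s *s)
{
	return s->top == BTSTACK_DEPTH;
}

/* Remember the parent (page, entry) visited on the way down ... */
static bool bt_push(struct btstack_s *s, int64_t bn, int index)
{
	if (bt_stack_full(s))
		return false;	/* dtSearch treats this as corruption */
	s->frame[s->top].bn = bn;
	s->frame[s->top].index = index;
	s->top++;
	return true;
}

/* ... and replay them bottom-up when a split or delete propagates. */
static struct btframe_s *bt_pop(struct btstack_s *s)
{
	return s->top ? &s->frame[--s->top] : NULL;
}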
+ */ + jfs_error(sb, "stack overrun in dtSearch!"); + BT_STACK_DUMP(btstack); + rc = -EIO; + goto out; + } + btstack->nsplit++; + + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + pxd = (pxd_t *) & p->slot[stbl[index]]; + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + out: + DT_PUTPAGE(mp); + + dtSearch_Exit1: + + kfree(ciKey.name); + + dtSearch_Exit2: + + return rc; +} + + +/* + * dtInsert() + * + * function: insert an entry to directory tree + * + * parameter: + * + * return: 0 - success; + * errno - failure; + */ +int dtInsert(tid_t tid, struct inode *ip, + struct component_name * name, ino_t * fsn, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + dtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index; + struct dtsplit split; /* split information */ + ddata_t data; + struct dt_lock *dtlck; + int n; + struct tlock *tlck; + struct lv *lv; + + /* + * retrieve search result + * + * dtSearch() returns (leaf page pinned, index at which to insert). + * n.b. dtSearch() may return index of (maxindex + 1) of + * the full page. + */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* + * insert entry for new key + */ + if (DO_INDEX(ip)) { + if (JFS_IP(ip)->next_index == DIREND) { + DT_PUTPAGE(mp); + return -EMLINK; + } + n = NDTLEAF(name->namlen); + data.leaf.tid = tid; + data.leaf.ip = ip; + } else { + n = NDTLEAF_LEGACY(name->namlen); + data.leaf.ip = NULL; /* signifies legacy directory format */ + } + data.leaf.ino = *fsn; + + /* + * leaf page does not have enough room for new entry: + * + * extend/split the leaf page; + * + * dtSplitUp() will insert the entry and unpin the leaf page. 
+ */ + if (n > p->header.freecnt) { + split.mp = mp; + split.index = index; + split.nslot = n; + split.key = name; + split.data = &data; + rc = dtSplitUp(tid, ip, &split, btstack); + return rc; + } + + /* + * leaf page does have enough room for new entry: + * + * insert the new data entry into the leaf page; + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + dtInsertEntry(p, index, name, &data, &dtlck); + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + n = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + n; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} + + +/* + * dtSplitUp() + * + * function: propagate insertion bottom up; + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * leaf page unpinned; + */ +static int dtSplitUp(tid_t tid, + struct inode *ip, struct dtsplit * split, struct btstack * btstack) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int rc = 0; + struct metapage *smp; + dtpage_t *sp; /* split page */ + struct metapage *rmp; + dtpage_t *rp; /* new right page split from sp */ + pxd_t rpxd; /* new right page extent descriptor */ + struct metapage *lmp; + dtpage_t *lp; /* left child page */ + int skip; /* index of entry of insertion */ + struct btframe *parent; /* parent page entry on traverse stack */ + s64 xaddr, nxaddr; + int xlen, xsize; + struct pxdlist pxdlist; + pxd_t *pxd; + struct component_name key = { 0, NULL }; + ddata_t *data = split->data; + int n; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int quota_allocation = 0; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS); + if (!key.name) { + DT_PUTPAGE(smp); + rc = -ENOMEM; + goto dtSplitUp_Exit; + } + + /* + * split leaf page + * + * The split routines insert the new entry, and + * acquire txLock as appropriate. 
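Both dtInsert() above and the split routines below linelock only the slots of the sorted-entry table (stbl) that the insert actually shifted. A small helper showing that arithmetic, assuming the 32-byte slot size behind L2DTSLOTSIZE == 5 and a stand-in for struct lv:

/*
 * The stbl packs 32 one-byte entries per 32-byte slot.  Inserting at
 * 'index' shifts every stbl entry from 'index' to the end, so only the
 * slots spanning [index, nextindex - 1] need to be logged.  Same
 * arithmetic as the "linelock stbl" blocks in dtInsert()/dtSplitPage().
 */
#define L2DTSLOTSIZE 5

struct lock_range { int offset; int length; };	/* in 32-byte slots */

static struct lock_range
stbl_lock_range(int stblindex, int nextindex_after_insert, int index)
{
	int first = index >> L2DTSLOTSIZE;
	int last  = (nextindex_after_insert - 1) >> L2DTSLOTSIZE;
	struct lock_range r = {
		.offset = stblindex + first,
		.length = last - first + 1,
	};
	return r;
}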
+ */ + /* + * split root leaf page: + */ + if (sp->header.flag & BT_ROOT) { + /* + * allocate a single extent child page + */ + xlen = 1; + n = sbi->bsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ + if (n <= split->nslot) + xlen++; + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { + DT_PUTPAGE(smp); + goto freeKeyName; + } + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + split->pxdlist = &pxdlist; + rc = dtSplitRoot(tid, ip, split, &rmp); + + if (rc) + dbFree(ip, xaddr, xlen); + else + DT_PUTPAGE(rmp); + + DT_PUTPAGE(smp); + + if (!DO_INDEX(ip)) + ip->i_size = xlen << sbi->l2bsize; + + goto freeKeyName; + } + + /* + * extend first leaf page + * + * extend the 1st extent if less than buffer page size + * (dtExtendPage() reurns leaf page unpinned) + */ + pxd = &sp->header.self; + xlen = lengthPXD(pxd); + xsize = xlen << sbi->l2bsize; + if (xsize < PSIZE) { + xaddr = addressPXD(pxd); + n = xsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + if ((n + sp->header.freecnt) <= split->nslot) + n = xlen + (xlen << 1); + else + n = xlen; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, n); + if (rc) + goto extendOut; + quota_allocation += n; + + if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, + (s64) n, &nxaddr))) + goto extendOut; + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, nxaddr) + PXDlength(pxd, xlen + n); + split->pxdlist = &pxdlist; + if ((rc = dtExtendPage(tid, ip, split, btstack))) { + nxaddr = addressPXD(pxd); + if (xaddr != nxaddr) { + /* free relocated extent */ + xlen = lengthPXD(pxd); + dbFree(ip, nxaddr, (s64) xlen); + } else { + /* free extended delta */ + xlen = lengthPXD(pxd) - n; + xaddr = addressPXD(pxd) + xlen; + dbFree(ip, xaddr, (s64) n); + } + } else if (!DO_INDEX(ip)) + ip->i_size = lengthPXD(pxd) << sbi->l2bsize; + + + extendOut: + DT_PUTPAGE(smp); + goto freeKeyName; + } + + /* + * split leaf page into and a new right page . + * + * return pinned and its extent descriptor + */ + /* + * allocate new directory page extent and + * new index page(s) to cover page split(s) + * + * allocation hint: ? + */ + n = btstack->nsplit; + pxdlist.maxnpxd = pxdlist.npxd = 0; + xlen = sbi->nbperpage; + for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + pxdlist.maxnpxd++; + continue; + } + + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + split->pxdlist = &pxdlist; + if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + if (!DO_INDEX(ip)) + ip->i_size += PSIZE; + + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. 
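The propagation just described is a loop over the saved parent frames: each parent either absorbs the new router entry or must split in turn. A control-flow sketch with a toy page model (capacity in abstract units), not the real dtSplitUp() code:

#include <stddef.h>

/* Toy page: only track how full it is; a router entry costs entry_cost. */
struct page_model { int used; int cap; };

struct frame { struct page_model *page; };

/*
 * After the leaf split, each popped parent either absorbs the new
 * router entry (loop exits) or must itself be split, pushing the
 * problem one level further up.  Returns how many levels split.
 */
static int propagate_split(struct frame *stack, int depth, int entry_cost)
{
	int splits = 1;			/* the leaf already split */

	while (depth-- > 0) {
		struct page_model *parent = stack[depth].page;

		if (parent->used + entry_cost <= parent->cap) {
			parent->used += entry_cost;	/* router entry fits */
			break;				/* stop propagating */
		}
		/* parent is full: "split" it and keep walking up */
		parent->used = parent->cap / 2 + entry_cost;
		splits++;
	}
	return splits;
}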
+ * + * There are a maximum of 4 pages pinned at any time: + * two children, left parent and right parent (when the parent splits). + * keep the child pages pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame */ + + /* keep current child pages (, ) pinned */ + lmp = smp; + lp = sp; + + /* + * insert router entry in parent for new right child page + */ + /* get the parent page */ + DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + goto splitOut; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * compute the key for the router entry + * + * key suffix compression: + * for internal pages that have leaf pages as children, + * retain only what's needed to distinguish between + * the new entry and the entry on the page to its left. + * If the keys compare equal, retain the entire key. + * + * note that compression is performed only at computing + * router key at the lowest internal level. + * further compression of the key between pairs of higher + * level internal pages loses too much information and + * the search may fail. + * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} + * results in two adjacent parent entries (a)(xx). + * if split occurs between these two entries, and + * if compression is applied, the router key of parent entry + * of right page (x) will divert search for x into right + * subtree and miss x in the left subtree.) + * + * the entire key must be retained for the next-to-leftmost + * internal key at any level of the tree, or search may fail + * (e.g., ?) + */ + switch (rp->header.flag & BT_TYPE) { + case BT_LEAF: + /* + * compute the length of prefix for suffix compression + * between last entry of left page and first entry + * of right page + */ + if ((sp->header.flag & BT_ROOT && skip > 1) || + sp->header.prev != 0 || skip > 1) { + /* compute uppercase router prefix key */ + rc = ciGetLeafPrefixKey(lp, + lp->header.nextindex-1, + rp, 0, &key, + sbi->mntflag); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + DT_PUTPAGE(smp); + goto splitOut; + } + } else { + /* next to leftmost entry of + lowest internal level */ + + /* compute uppercase router key */ + dtGetKey(rp, 0, &key, sbi->mntflag); + key.name[key.namlen] = 0; + + if ((sbi->mntflag & JFS_OS2) == JFS_OS2) + ciToUpper(&key); + } + + n = NDTINTERNAL(key.namlen); + break; + + case BT_INTERNAL: + dtGetKey(rp, 0, &key, sbi->mntflag); + n = NDTINTERNAL(key.namlen); + break; + + default: + jfs_err("dtSplitUp(): UFO!"); + break; + } + + /* unpin left child page */ + DT_PUTPAGE(lmp); + + /* + * compute the data for the router entry + */ + data->xd = rpxd; /* child page xd */ + + /* + * parent page is full - split the parent page + */ + if (n > sp->header.freecnt) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->nslot = n; + split->key = &key; + /* split->data = data; */ + + /* unpin right child page */ + DT_PUTPAGE(rmp); + + /* The split routines insert the new entry, + * acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? 
+ dtSplitRoot(tid, ip, split, &rmp) : + dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); + if (rc) { + DT_PUTPAGE(smp); + goto splitOut; + } + + /* smp and rmp are pinned */ + } + /* + * parent page is not full - insert router entry in parent page + */ + else { + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the parent page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root parent page */ + if (!(sp->header.flag & BT_ROOT)) { + lv++; + n = skip >> L2DTSLOTSIZE; + lv->offset = sp->header.stblindex + n; + lv->length = + ((sp->header.nextindex - + 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + dtInsertEntry(sp, skip, &key, data, &dtlck); + + /* exit propagate up */ + break; + } + } + + /* unpin current split and its right page */ + DT_PUTPAGE(smp); + DT_PUTPAGE(rmp); + + /* + * free remaining extents allocated for split + */ + splitOut: + n = pxdlist.npxd; + pxd = &pxdlist.pxd[n]; + for (; n < pxdlist.maxnpxd; n++, pxd++) + dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); + + freeKeyName: + kfree(key.name); + + /* Rollback quota allocation */ + if (rc && quota_allocation) + dquot_free_block(ip, quota_allocation); + + dtSplitUp_Exit: + + return rc; +} + + +/* + * dtSplitPage() + * + * function: Split a non-root page of a btree. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return split and new page pinned; + */ +static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, + struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) +{ + int rc = 0; + struct metapage *smp; + dtpage_t *sp; + struct metapage *rmp; + dtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + struct metapage *mp; + dtpage_t *p; + s64 nextbn; + struct pxdlist *pxdlist; + pxd_t *pxd; + int skip, nextindex, half, left, nxt, off, si; + struct ldtentry *ldtentry; + struct idtentry *idtentry; + u8 *stbl; + struct dtslot *f; + int fsi, stblsize; + int n; + struct dt_lock *sdtlck, *rdtlck; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *slv, *rlv, *lv; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* + * allocate the new right page for the split + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return -EIO; + + /* Allocate blocks to quota. 
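dtSplitUp() pre-allocates one extent per level that might split (btstack->nsplit) and, at splitOut, hands back whatever the splits did not consume. The same reserve-then-release pattern as a self-contained sketch, with a toy allocator in place of dbAlloc()/dbFree() and a small fixed list standing in for the pxdlist:

#include <stdint.h>

#define MAX_PREALLOC 8		/* stand-in for the pxdlist capacity */

struct extent { int64_t addr; int len; };

struct extent_list {
	int maxnpxd;		/* how many were pre-allocated */
	int npxd;		/* how many the splits actually consumed */
	struct extent pxd[MAX_PREALLOC];
};

/* Toy allocator state standing in for the block map. */
static int64_t next_free_block = 1000;

static int blk_alloc(int len, int64_t *addr)
{
	*addr = next_free_block;
	next_free_block += len;
	return 0;
}

static void blk_free(int64_t addr, int len)
{
	(void)addr; (void)len;		/* nothing to do in the toy model */
}

/* Reserve one extent per level that might split, as dtSplitUp() does. */
static int prealloc_extents(struct extent_list *list, int nsplit, int len)
{
	list->maxnpxd = list->npxd = 0;
	for (int i = 0; i < nsplit && i < MAX_PREALLOC; i++) {
		if (blk_alloc(len, &list->pxd[i].addr))
			return -1;	/* caller frees what was obtained */
		list->pxd[i].len = len;
		list->maxnpxd++;
	}
	return 0;
}

/* Give back the extents no split consumed, like the splitOut loop. */
static void release_unused_extents(struct extent_list *list)
{
	for (int i = list->npxd; i < list->maxnpxd; i++)
		blk_free(list->pxd[i].addr, list->pxd[i].len);
}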
*/ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + rdtlck = (struct dt_lock *) & tlck->lock; + + rp = (dtpage_t *) rmp->data; + *rpp = rp; + rp->header.self = *pxd; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the split page + * + * action: + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + sdtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header of split page */ + ASSERT(sdtlck->index == 0); + slv = & sdtlck->lv[0]; + slv->offset = 0; + slv->length = 1; + sdtlck->index++; + + /* + * initialize/update sibling pointers between sp and rp + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + /* + * initialize new right page + */ + rp->header.flag = sp->header.flag; + + /* compute sorted entry table at start of extent data area */ + rp->header.nextindex = 0; + rp->header.stblindex = 1; + + n = PSIZE >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ + + /* init freelist */ + fsi = rp->header.stblindex + stblsize; + rp->header.freelist = fsi; + rp->header.freecnt = rp->header.maxslot - fsi; + + /* + * sequential append at tail: append without split + * + * If splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. Adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * If we're wrong it's no big deal, we'll just do the split the right + * way next time. + * (It may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, + * but it's not. Be my guest.) + */ + if (nextbn == 0 && split->index == sp->header.nextindex) { + /* linelock header + stbl (first slot) of new page */ + rlv = & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 2; + rdtlck->index++; + + /* + * initialize freelist of new right page + */ + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* insert entry at the first entry of the new right page */ + dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); + + goto out; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update prev pointer of previous right sibling page; + */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + discard_metapage(rmp); + return rc; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header of previous right sibling page */ + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(rbn); + + DT_PUTPAGE(mp); + } + + /* + * split the data between the split and right pages. 
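The sequential-append test that dtSplitPage() applies below is worth calling out: splitting the rightmost leaf at its end means the directory is being filled in order, so an empty right page beats a 50/50 split. A sketch of just that predicate, with the fields passed in explicitly:

#include <stdbool.h>
#include <stdint.h>

/*
 * True when the page being split is the rightmost leaf on its level
 * (no next sibling) and the new entry would land after its last
 * entry.  In that case dtSplitPage() puts only the new entry on the
 * fresh right page, keeping sequentially built directories nearly full.
 */
static bool split_is_sequential_append(int64_t next_sibling_bn,
				       int insert_index, int nextindex)
{
	return next_sibling_bn == 0 && insert_index == nextindex;
}

When the predicate does not hold, dtSplitPage() instead walks the sorted-entry table to find a roughly half-full split point, as the code that follows shows.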
+ */ + skip = split->index; + half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ + left = 0; + + /* + * compute fill factor for split pages + * + * traces the next entry to move to rp + * traces the next entry to stay in sp + */ + stbl = (u8 *) & sp->slot[sp->header.stblindex]; + nextindex = sp->header.nextindex; + for (nxt = off = 0; nxt < nextindex; ++off) { + if (off == skip) + /* check for fill factor with new entry size */ + n = split->nslot; + else { + si = stbl[nxt]; + switch (sp->header.flag & BT_TYPE) { + case BT_LEAF: + ldtentry = (struct ldtentry *) & sp->slot[si]; + if (DO_INDEX(ip)) + n = NDTLEAF(ldtentry->namlen); + else + n = NDTLEAF_LEGACY(ldtentry-> + namlen); + break; + + case BT_INTERNAL: + idtentry = (struct idtentry *) & sp->slot[si]; + n = NDTINTERNAL(idtentry->namlen); + break; + + default: + break; + } + + ++nxt; /* advance to next entry to move in sp */ + } + + left += n; + if (left >= half) + break; + } + + /* poins to the 1st entry to move */ + + /* + * move entries to right page + * + * dtMoveEntry() initializes rp and reserves entry for insertion + * + * split page moved out entries are linelocked; + * new/right page moved in entries are linelocked; + */ + /* linelock header + stbl of new right page */ + rlv = & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 5; + rdtlck->index++; + + dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); + + sp->header.nextindex = nxt; + + /* + * finalize freelist of new right page + */ + fsi = rp->header.freelist; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + s64 lblock; + + mp = NULL; + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + + /* + * the skipped index was on the left page, + */ + if (skip <= off) { + /* insert the new entry in the split page */ + dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); + + /* linelock stbl of split page */ + if (sdtlck->index >= sdtlck->maxcnt) + sdtlck = (struct dt_lock *) txLinelock(sdtlck); + slv = & sdtlck->lv[sdtlck->index]; + n = skip >> L2DTSLOTSIZE; + slv->offset = sp->header.stblindex + n; + slv->length = + ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + sdtlck->index++; + } + /* + * the skipped index was on the right page, + */ + else { + /* adjust the skip index to reflect the new position */ + skip -= nxt; + + /* insert the new entry in the right page */ + dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); + } + + out: + *rmpp = rmp; + *rpxdp = *pxd; + + return rc; +} + + +/* + * dtExtendPage() + * + * function: extend 1st/only directory leaf page + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return extended page pinned; + */ +static int dtExtendPage(tid_t tid, + struct inode *ip, struct dtsplit * split, struct btstack * btstack) +{ + struct super_block *sb = ip->i_sb; + int rc; + struct metapage *smp, *pmp, *mp; + dtpage_t *sp, *pp; + struct pxdlist *pxdlist; + pxd_t *pxd, *tpxd; + int xlen, xsize; + int newstblindex, newstblsize; + int oldstblindex, oldstblsize; + int fsi, last; + struct dtslot *f; + struct btframe *parent; + int n; + struct dt_lock *dtlck; + s64 xaddr, txaddr; + struct tlock *tlck; + struct pxd_lock *pxdlock; + 
struct lv *lv; + uint type; + struct ldtentry *ldtentry; + u8 *stbl; + + /* get page to extend */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* get parent/root page */ + parent = BT_POP(btstack); + DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); + if (rc) + return (rc); + + /* + * extend the extent + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + + xaddr = addressPXD(pxd); + tpxd = &sp->header.self; + txaddr = addressPXD(tpxd); + /* in-place extension */ + if (xaddr == txaddr) { + type = tlckEXTEND; + } + /* relocation */ + else { + type = tlckNEW; + + /* save moved extent descriptor for later free */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = sp->header.self; + pxdlock->index = 1; + + /* + * Update directory index table to reflect new page address + */ + if (DO_INDEX(ip)) { + s64 lblock; + + mp = NULL; + stbl = DT_GETSTBL(sp); + for (n = 0; n < sp->header.nextindex; n++) { + ldtentry = + (struct ldtentry *) & sp->slot[stbl[n]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + xaddr, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + } + + /* + * extend the page + */ + sp->header.self = *pxd; + + jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the extended/leaf page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | type); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[0]; + + /* update buffer extent descriptor of extended page */ + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; + + /* + * copy old stbl to new stbl at start of extended area + */ + oldstblindex = sp->header.stblindex; + oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; + newstblindex = sp->header.maxslot; + n = xsize >> L2DTSLOTSIZE; + newstblsize = (n + 31) >> L2DTSLOTSIZE; + memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], + sp->header.nextindex); + + /* + * in-line extension: linelock old area of extended page + */ + if (type == tlckEXTEND) { + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + lv++; + + /* linelock new stbl of extended page */ + lv->offset = newstblindex; + lv->length = newstblsize; + } + /* + * relocation: linelock whole relocated area + */ + else { + lv->offset = 0; + lv->length = sp->header.maxslot + newstblsize; + } + + dtlck->index++; + + sp->header.maxslot = n; + sp->header.stblindex = newstblindex; + /* sp->header.nextindex remains the same */ + + /* + * add old stbl region at head of freelist + */ + fsi = oldstblindex; + f = &sp->slot[fsi]; + last = sp->header.freelist; + for (n = 0; n < oldstblsize; n++, fsi++, f++) { + f->next = last; + last = fsi; + } + sp->header.freelist = last; + sp->header.freecnt += oldstblsize; + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = newstblindex + newstblsize; + f = &sp->slot[fsi]; + for (fsi++; fsi < sp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + sp->header.freelist = n; + else { + do { + f = &sp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + sp->header.freecnt += sp->header.maxslot - n; + + /* + * insert the new entry + */ + dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); + + 
BT_MARK_DIRTY(pmp, ip); + /* + * linelock any freeslots residing in old extent + */ + if (type == tlckEXTEND) { + n = sp->header.maxslot >> 2; + if (sp->header.freelist < n) + dtLinelockFreelist(sp, n, &dtlck); + } + + /* + * update parent entry on the parent/root page + */ + /* + * acquire a transaction lock on the parent/root page + */ + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[dtlck->index]; + + /* linelock parent entry - 1st slot */ + lv->offset = 1; + lv->length = 1; + dtlck->index++; + + /* update the parent pxd for page extension */ + tpxd = (pxd_t *) & pp->slot[1]; + *tpxd = *pxd; + + DT_PUTPAGE(pmp); + return 0; +} + + +/* + * dtSplitRoot() + * + * function: + * split the full root page into + * original/root/split page and new right page + * i.e., root remains fixed in tree anchor (inode) and + * the root is copied to a single new right child page + * since root page << non-root page, and + * the split root page contains a single entry for the + * new right child page. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return new page pinned; + */ +static int dtSplitRoot(tid_t tid, + struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) +{ + struct super_block *sb = ip->i_sb; + struct metapage *smp; + dtroot_t *sp; + struct metapage *rmp; + dtpage_t *rp; + s64 rbn; + int xlen; + int xsize; + struct dtslot *f; + s8 *stbl; + int fsi, stblsize, n; + struct idtentry *s; + pxd_t *ppxd; + struct pxdlist *pxdlist; + pxd_t *pxd; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int rc; + + /* get split root page */ + smp = split->mp; + sp = &JFS_IP(ip)->i_dtroot; + + /* + * allocate/initialize a single (right) child page + * + * N.B. at first split, a one (or two) block to fit new entry + * is allocated; at subsequent split, a full page is allocated; + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; + rmp = get_metapage(ip, rbn, xsize, 1); + if (!rmp) + return -EIO; + + rp = rmp->data; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + dtlck = (struct dt_lock *) & tlck->lock; + + rp->header.flag = + (sp->header.flag & BT_LEAF) ? 
BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * move in-line root page into new right page extent + */ + /* linelock header + copied entries + new stbl (1st slot) in new page */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 10; /* 1 + 8 + 1 */ + dtlck->index++; + + n = xsize >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; + + /* copy old stbl to new stbl at start of extended area */ + rp->header.stblindex = DTROOTMAXSLOT; + stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; + memcpy(stbl, sp->header.stbl, sp->header.nextindex); + rp->header.nextindex = sp->header.nextindex; + + /* copy old data area to start of new data area */ + memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = DTROOTMAXSLOT + stblsize; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + rp->header.freelist = n; + else { + rp->header.freelist = fsi; + + do { + f = &rp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + s64 lblock; + struct metapage *mp = NULL; + struct ldtentry *ldtentry; + + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); + + /* + * reset parent/root page + * + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. + * + * The btree comparison code guarantees that the left-most key on any + * level of the tree is never used, so it doesn't need to be filled in. + */ + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the root page (in-memory inode) + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + /* update page header of root */ + if (sp->header.flag & BT_LEAF) { + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + } + + /* init the first entry */ + s = (struct idtentry *) & sp->slot[DTENTRYSTART]; + ppxd = (pxd_t *) s; + *ppxd = *pxd; + s->next = -1; + s->namlen = 0; + + stbl = sp->header.stbl; + stbl[0] = DTENTRYSTART; + sp->header.nextindex = 1; + + /* init freelist */ + fsi = DTENTRYSTART + 1; + f = &sp->slot[fsi]; + + /* init free region of remaining area */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + sp->header.freelist = DTENTRYSTART + 1; + sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); + + *rmpp = rmp; + + return 0; +} + + +/* + * dtDelete() + * + * function: delete the entry(s) referenced by a key. 
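dtSplitRoot() above never moves the root: it copies the root's slots into a freshly allocated child and rebuilds the root as an internal page with a single router entry. The same shape on a toy in-memory node, with invented capacities:

#include <stdint.h>
#include <string.h>

#define ROOT_CAP  8		/* toy sizes; the real root is far smaller */
#define PAGE_CAP 64		/* than an external page                   */

struct toy_page {
	int      is_leaf;
	int      nkeys;
	uint32_t keys[PAGE_CAP];
};

struct toy_root {
	int      is_leaf;
	int      nkeys;
	uint32_t keys[ROOT_CAP];
	struct toy_page *child;	/* only child right after the split */
};

/*
 * The root lives inside the inode and cannot move, so its contents are
 * copied wholesale into a fresh child page and the root becomes an
 * internal node with one router entry pointing at that child.
 */
static void split_root(struct toy_root *root, struct toy_page *child)
{
	child->is_leaf = root->is_leaf;
	child->nkeys = root->nkeys;
	memcpy(child->keys, root->keys, root->nkeys * sizeof(root->keys[0]));

	root->is_leaf = 0;		/* root becomes internal */
	root->nkeys = 1;		/* single entry: the new child */
	root->keys[0] = 0;		/* leftmost key is never compared */
	root->child = child;
}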
+ * + * parameter: + * + * return: + */ +int dtDelete(tid_t tid, + struct inode *ip, struct component_name * key, ino_t * ino, int flag) +{ + int rc = 0; + s64 bn; + struct metapage *mp, *imp; + dtpage_t *p; + int index; + struct btstack btstack; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int i; + struct ldtentry *ldtentry; + u8 *stbl; + u32 table_index, next_index; + struct metapage *nmp; + dtpage_t *np; + + /* + * search for the entry to delete: + * + * dtSearch() returns (leaf page pinned, index at which to delete). + */ + if ((rc = dtSearch(ip, key, ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* + * We need to find put the index of the next entry into the + * directory index table in order to resume a readdir from this + * entry. + */ + if (DO_INDEX(ip)) { + stbl = DT_GETSTBL(p); + ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; + table_index = le32_to_cpu(ldtentry->index); + if (index == (p->header.nextindex - 1)) { + /* + * Last entry in this leaf page + */ + if ((p->header.flag & BT_ROOT) + || (p->header.next == 0)) + next_index = -1; + else { + /* Read next leaf page */ + DT_GETPAGE(ip, le64_to_cpu(p->header.next), + nmp, PSIZE, np, rc); + if (rc) + next_index = -1; + else { + stbl = DT_GETSTBL(np); + ldtentry = + (struct ldtentry *) & np-> + slot[stbl[0]]; + next_index = + le32_to_cpu(ldtentry->index); + DT_PUTPAGE(nmp); + } + } + } else { + ldtentry = + (struct ldtentry *) & p->slot[stbl[index + 1]]; + next_index = le32_to_cpu(ldtentry->index); + } + free_index(tid, ip, table_index, next_index); + } + /* + * the leaf page becomes empty, delete the page + */ + if (p->header.nextindex == 1) { + /* delete empty page */ + rc = dtDeleteUp(tid, ip, mp, p, &btstack); + } + /* + * the leaf page has other entries remaining: + * + * delete the entry from the leaf page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* + * Do not assume that dtlck->index will be zero. During a + * rename within a directory, this transaction may have + * modified this page already when adding the new entry. 
+ */ + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the leaf entry */ + dtDeleteEntry(p, index, &dtlck); + + /* + * Update directory index table for entries moved in stbl + */ + if (DO_INDEX(ip) && index < p->header.nextindex) { + s64 lblock; + + imp = NULL; + stbl = DT_GETSTBL(p); + for (i = index; i < p->header.nextindex; i++) { + ldtentry = + (struct ldtentry *) & p->slot[stbl[i]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + bn, i, &imp, &lblock); + } + if (imp) + release_metapage(imp); + } + + DT_PUTPAGE(mp); + } + + return rc; +} + + +/* + * dtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int dtDeleteUp(tid_t tid, struct inode *ip, + struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; + dtpage_t *p; + int index, nextindex; + int xlen; + struct btframe *parent; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + struct pxd_lock *pxdlock; + int i; + + /* + * keep the root leaf page which has become empty + */ + if (BT_IS_ROOT(fmp)) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(fmp); + + return 0; + } + + /* + * free the non-root leaf page + */ + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor, and + * the buffer page is freed; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = fp->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, fp))) { + BT_PUTPAGE(fmp); + return rc; + } + + xlen = lengthPXD(&fp->header.self); + + /* Free quota allocation. */ + dquot_free_block(ip, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the directory tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* pin the parent page */ + DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * free the extent of the child page deleted + */ + index = parent->index; + + /* + * delete the entry for the child page from parent + */ + nextindex = p->header.nextindex; + + /* + * the parent has the single entry being deleted: + * + * free the parent page which has become empty. 
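dtDeleteUp()'s walk over the saved parent frames mirrors dtSplitUp()'s, in reverse: free each parent that held only the deleted child, re-initialize the root if the emptiness reaches it, and stop at the first parent that keeps other entries. A control-flow sketch with a toy frame type:

#include <stdbool.h>

struct del_frame { int nentries; bool is_root; };

/*
 * After an empty leaf is freed, walk the parent frames saved during
 * the search.  A parent whose only entry was the freed child is freed
 * as well (or re-initialized if it is the root); the walk stops at the
 * first parent that keeps other entries, where just the router entry
 * is removed.  Returns how many pages were freed.
 */
static int propagate_delete(struct del_frame *stack, int depth)
{
	int freed = 1;			/* the empty leaf itself */

	while (depth-- > 0) {
		struct del_frame *parent = &stack[depth];

		if (parent->nentries == 1) {
			if (parent->is_root) {
				parent->nentries = 0;	/* dtInitRoot() */
				return freed;
			}
			freed++;			/* free this page too */
			continue;
		}
		parent->nentries--;	/* drop the router entry and stop */
		break;
	}
	return freed;
}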
+ */ + if (nextindex == 1) { + /* + * keep the root internal page which has become empty + */ + if (p->header.flag & BT_ROOT) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(mp); + + return 0; + } + /* + * free the parent page + */ + else { + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + */ + tlck = + txMaplock(tid, ip, + tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = p->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, p))) { + DT_PUTPAGE(mp); + return rc; + } + + xlen = lengthPXD(&p->header.self); + + /* Free quota allocation */ + dquot_free_block(ip, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + + /* + * the parent has other entries remaining: + * + * delete the router entry from the parent page. + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the page + * + * action: router entry deletion + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the router entry */ + dtDeleteEntry(p, index, &dtlck); + + /* reset key of new leftmost entry of level (for consistency) */ + if (index == 0 && + ((p->header.flag & BT_ROOT) || p->header.prev == 0)) + dtTruncateEntry(p, 0, &dtlck); + + /* unpin the parent page */ + DT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + if (!DO_INDEX(ip)) + ip->i_size -= PSIZE; + + return 0; +} + +#ifdef _NOTYET +/* + * NAME: dtRelocate() + * + * FUNCTION: relocate dtpage (internal or leaf) of directory; + * This function is mainly used by defragfs utility. + */ +int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, + s64 nxaddr) +{ + int rc = 0; + struct metapage *mp, *pmp, *lmp, *rmp; + dtpage_t *p, *pp, *rp = 0, *lp= 0; + s64 bn; + int index; + struct btstack btstack; + pxd_t *pxd; + s64 oxaddr, nextbn, prevbn; + int xlen, xsize; + struct tlock *tlck; + struct dt_lock *dtlck; + struct pxd_lock *pxdlock; + s8 *stbl; + struct lv *lv; + + oxaddr = addressPXD(opxd); + xlen = lengthPXD(opxd); + + jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d", + (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr, + xlen); + + /* + * 1. get the internal parent dtpage covering + * router entry for the tartget page to be relocated; + */ + rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); + if (rc) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jfs_info("dtRelocate: parent router entry validated."); + + /* + * 2. 
relocate the target dtpage + */ + /* read in the target page from src extent */ + DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + /* release the pinned parent page */ + DT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + if (rmp) + DT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling dtpages if any; + */ + if (lmp) { + tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + lp->header.next = cpu_to_le64(nxaddr); + DT_PUTPAGE(lmp); + } + + if (rmp) { + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + rp->header.prev = cpu_to_le64(nxaddr); + DT_PUTPAGE(rmp); + } + + /* + * update the target dtpage to be relocated + * + * write LOG_REDOPAGE of LOG_NEW type for dst page + * for the whole target page (logredo() will apply + * after image and update bmap for allocation of the + * dst extent), and update bmap for allocation of + * the dst extent; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* update the self address in the dtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* the dst page is the same as the src page, i.e., + * linelock for afterimage of the whole page; + */ + lv->offset = 0; + lv->length = p->header.maxslot; + dtlck->index++; + + /* update the buffer extent descriptor of the dtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the relocated page */ + DT_PUTPAGE(mp); + jfs_info("dtRelocate: target dtpage relocated."); + + /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec + * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec + * will also force a bmap update ). + */ + + /* + * 3. acquire maplock for the source extent to be freed; + */ + /* for dtpage relocation, write a LOG_NOREDOPAGE record + * for the source dtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * dtpage), and upadte bmap for free of the source dtpage; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. 
update the parent router entry for relocation; + * + * acquire tlck for the parent entry covering the target dtpage; + * write LOG_REDOPAGE to apply after image only; + */ + jfs_info("dtRelocate: update parent router entry."); + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[dtlck->index]; + + /* update the PXD with the new address */ + stbl = DT_GETSTBL(pp); + pxd = (pxd_t *) & pp->slot[stbl[index]]; + PXDaddress(pxd, nxaddr); + lv->offset = stbl[index]; + lv->length = 1; + dtlck->index++; + + /* unpin the parent dtpage */ + DT_PUTPAGE(pmp); + + return rc; +} + +/* + * NAME: dtSearchNode() + * + * FUNCTION: Search for an dtpage containing a specified address + * This function is mainly used by defragfs utility. + * + * NOTE: Search result on stack, the found page is pinned at exit. + * The result page must be an internal dtpage. + * lmxaddr give the address of the left most page of the + * dtree level, in which the required dtpage resides. + */ +static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, + struct btstack * btstack) +{ + int rc = 0; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int psize = 288; /* initial in-line directory */ + s8 *stbl; + int i; + pxd_t *pxd; + struct btframe *btsp; + + BT_CLR(btstack); /* reset stack */ + + /* + * descend tree to the level with specified leftmost page + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* does the xaddr of leftmost page of the levevl + * matches levevl search key ? + */ + if (p->header.flag & BT_ROOT) { + if (lmxaddr == 0) + break; + } else if (addressPXD(&p->header.self) == lmxaddr) + break; + + /* + * descend down to leftmost child page + */ + if (p->header.flag & BT_LEAF) { + DT_PUTPAGE(mp); + return -ESTALE; + } + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + pxd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + /* + * search each page at the current levevl + */ + loop: + stbl = DT_GETSTBL(p); + for (i = 0; i < p->header.nextindex; i++) { + pxd = (pxd_t *) & p->slot[stbl[i]]; + + /* found the specified router entry */ + if (addressPXD(pxd) == addressPXD(kpxd) && + lengthPXD(pxd) == lengthPXD(kpxd)) { + btsp = btstack->top; + btsp->bn = bn; + btsp->index = i; + btsp->mp = mp; + + return 0; + } + } + + /* get the right sibling page if any */ + if (p->header.next) + bn = le64_to_cpu(p->header.next); + else { + DT_PUTPAGE(mp); + return -ESTALE; + } + + /* unpin current page */ + DT_PUTPAGE(mp); + + /* get the right sibling page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + goto loop; +} +#endif /* _NOTYET */ + +/* + * dtRelink() + * + * function: + * link around a freed page. 
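dtRelink(), defined next, stitches a freed page's neighbours together so the sibling chain on its level stays intact. Reduced to in-memory pointers instead of block numbers and transaction locks, the operation is the familiar doubly linked list unlink:

#include <stddef.h>

struct sib_page {
	struct sib_page *next;	/* right sibling on the same level */
	struct sib_page *prev;	/* left sibling on the same level */
};

/*
 * Unhook a page from its level's sibling chain so readers walking the
 * level never see the freed page, as dtRelink() does with the on-disk
 * next/prev block numbers.
 */
static void relink_around(struct sib_page *p)
{
	if (p->next)
		p->next->prev = p->prev;
	if (p->prev)
		p->prev->next = p->next;
	p->next = p->prev = NULL;
}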
+ * + * parameter: + * fp: page to be freed + * + * return: + */ +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) +{ + int rc; + struct metapage *mp; + s64 nextbn, prevbn; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + * + * action: update prev pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(prevbn); + DT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the prev page + * + * action: update next pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.next = cpu_to_le64(nextbn); + DT_PUTPAGE(mp); + } + + return 0; +} + + +/* + * dtInitRoot() + * + * initialize directory root (inline in inode) + */ +void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + dtroot_t *p; + int fsi; + struct dtslot *f; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + u16 xflag_save; + + /* + * If this was previously an non-empty directory, we need to remove + * the old directory table. + */ + if (DO_INDEX(ip)) { + if (!jfs_dirtable_inline(ip)) { + struct tblock *tblk = tid_to_tblock(tid); + /* + * We're playing games with the tid's xflag. If + * we're removing a regular file, the file's xtree + * is committed with COMMIT_PMAP, but we always + * commit the directories xtree with COMMIT_PWMAP. + */ + xflag_save = tblk->xflag; + tblk->xflag = 0; + /* + * xtTruncate isn't guaranteed to fully truncate + * the xtree. The caller needs to check i_size + * after committing the transaction to see if + * additional truncation is needed. The + * COMMIT_Stale flag tells caller that we + * initiated the truncation. 
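dtInitRoot() below, like dtSplitPage(), dtSplitRoot() and dtExtendPage() earlier, rebuilds the per-page slot freelist as a simple intrusive chain: every free slot points at the next and the last holds -1. A toy version carrying only the link field, with an invented page size:

#include <stdint.h>

#define MAXSLOT 16		/* toy page size in slots */

struct toy_slot { int8_t next; };	/* only the freelist link matters here */

/*
 * Chain every slot from 'first' to the end of the page to its
 * successor and terminate the chain with -1, so allocation is just
 * "pop the head and decrement freecnt".
 */
static int8_t init_freelist(struct toy_slot *slot, int first, int maxslot)
{
	for (int fsi = first; fsi < maxslot - 1; fsi++)
		slot[fsi].next = (int8_t)(fsi + 1);
	slot[maxslot - 1].next = -1;
	return (int8_t)first;		/* new freelist head */
}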
+ */ + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + set_cflag(COMMIT_Stale, ip); + + tblk->xflag = xflag_save; + } else + ip->i_size = 1; + + jfs_ip->next_index = 2; + } else + ip->i_size = IDATASIZE; + + /* + * acquire a transaction lock on the root + * + * action: directory initialization; + */ + tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, + tlckDTREE | tlckENTRY | tlckBTROOT); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + p = &jfs_ip->i_dtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + + p->header.nextindex = 0; + + /* init freelist */ + fsi = 1; + f = &p->slot[fsi]; + + /* init data area of root */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + p->header.freelist = 1; + p->header.freecnt = 8; + + /* init '..' entry */ + p->header.idotdot = cpu_to_le32(idotdot); + + return; +} + +/* + * add_missing_indices() + * + * function: Fix dtree page in which one or more entries has an invalid index. + * fsck.jfs should really fix this, but it currently does not. + * Called from jfs_readdir when bad index is detected. + */ +static void add_missing_indices(struct inode *inode, s64 bn) +{ + struct ldtentry *d; + struct dt_lock *dtlck; + int i; + uint index; + struct lv *lv; + struct metapage *mp; + dtpage_t *p; + int rc; + s8 *stbl; + tid_t tid; + struct tlock *tlck; + + tid = txBegin(inode->i_sb, 0); + + DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); + + if (rc) { + printk(KERN_ERR "DT_GETPAGE failed!\n"); + goto end; + } + BT_MARK_DIRTY(mp, inode); + + ASSERT(p->header.flag & BT_LEAF); + + tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); + if (BT_IS_ROOT(mp)) + tlck->type |= tlckBTROOT; + + dtlck = (struct dt_lock *) &tlck->lock; + + stbl = DT_GETSTBL(p); + for (i = 0; i < p->header.nextindex; i++) { + d = (struct ldtentry *) &p->slot[stbl[i]]; + index = le32_to_cpu(d->index); + if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { + d->index = cpu_to_le32(add_index(tid, inode, bn, i)); + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = &dtlck->lv[dtlck->index]; + lv->offset = stbl[i]; + lv->length = 1; + dtlck->index++; + } + } + + DT_PUTPAGE(mp); + (void) txCommit(tid, 1, &inode, 0); +end: + txEnd(tid); +} + +/* + * Buffer to hold directory entry info while traversing a dtree page + * before being fed to the filldir function + */ +struct jfs_dirent { + loff_t position; + int ino; + u16 name_len; + char name[0]; +}; + +/* + * function to determine next variable-sized jfs_dirent in buffer + */ +static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) +{ + return (struct jfs_dirent *) + ((char *)dirent + + ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + + sizeof (loff_t) - 1) & + ~(sizeof (loff_t) - 1))); +} + +/* + * jfs_readdir() + * + * function: read directory entries sequentially + * from the specified entry offset + * + * parameter: + * + * return: offset = (pn, index) of start entry + * of next jfs_readdir()/dtRead() + */ +int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *ip = filp->f_path.dentry->d_inode; + struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; + int rc = 0; + loff_t dtpos; /* legacy OS/2 style position */ + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) &dtpos; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; 
+ s8 *stbl; + struct btstack btstack; + int i, next; + struct ldtentry *d; + struct dtslot *t; + int d_namleft, len, outlen; + unsigned long dirent_buf; + char *name_ptr; + u32 dir_index; + int do_index = 0; + uint loop_count = 0; + struct jfs_dirent *jfs_dirent; + int jfs_dirents; + int overflow, fix_page, page_fixed = 0; + static int unique_pos = 2; /* If we can't fix broken index */ + + if (filp->f_pos == DIREND) + return 0; + + if (DO_INDEX(ip)) { + /* + * persistent index is stored in directory entries. + * Special cases: 0 = . + * 1 = .. + * -1 = End of directory + */ + do_index = 1; + + dir_index = (u32) filp->f_pos; + + if (dir_index > 1) { + struct dir_table_slot dirtab_slot; + + if (dtEmpty(ip) || + (dir_index >= JFS_IP(ip)->next_index)) { + /* Stale position. Directory has shrunk */ + filp->f_pos = DIREND; + return 0; + } + repeat: + rc = read_index(ip, dir_index, &dirtab_slot); + if (rc) { + filp->f_pos = DIREND; + return rc; + } + if (dirtab_slot.flag == DIR_INDEX_FREE) { + if (loop_count++ > JFS_IP(ip)->next_index) { + jfs_err("jfs_readdir detected " + "infinite loop!"); + filp->f_pos = DIREND; + return 0; + } + dir_index = le32_to_cpu(dirtab_slot.addr2); + if (dir_index == -1) { + filp->f_pos = DIREND; + return 0; + } + goto repeat; + } + bn = addressDTS(&dirtab_slot); + index = dirtab_slot.slot; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + filp->f_pos = DIREND; + return 0; + } + if (p->header.flag & BT_INTERNAL) { + jfs_err("jfs_readdir: bad index table"); + DT_PUTPAGE(mp); + filp->f_pos = -1; + return 0; + } + } else { + if (dir_index == 0) { + /* + * self "." + */ + filp->f_pos = 0; + if (filldir(dirent, ".", 1, 0, ip->i_ino, + DT_DIR)) + return 0; + } + /* + * parent ".." + */ + filp->f_pos = 1; + if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR)) + return 0; + + /* + * Find first entry of left-most leaf + */ + if (dtEmpty(ip)) { + filp->f_pos = DIREND; + return 0; + } + + if ((rc = dtReadFirst(ip, &btstack))) + return rc; + + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + } + } else { + /* + * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 + * + * pn = index = 0: First entry "." + * pn = 0; index = 1: Second entry ".." + * pn > 0: Real entries, pn=1 -> leftmost page + * pn = index = -1: No more entries + */ + dtpos = filp->f_pos; + if (dtpos == 0) { + /* build "." entry */ + + if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino, + DT_DIR)) + return 0; + dtoffset->index = 1; + filp->f_pos = dtpos; + } + + if (dtoffset->pn == 0) { + if (dtoffset->index == 1) { + /* build ".." entry */ + + if (filldir(dirent, "..", 2, filp->f_pos, + PARENT(ip), DT_DIR)) + return 0; + } else { + jfs_err("jfs_readdir called with " + "invalid offset!"); + } + dtoffset->pn = 1; + dtoffset->index = 0; + filp->f_pos = dtpos; + } + + if (dtEmpty(ip)) { + filp->f_pos = DIREND; + return 0; + } + + if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) { + jfs_err("jfs_readdir: unexpected rc = %d " + "from dtReadNext", rc); + filp->f_pos = DIREND; + return 0; + } + /* get start leaf page and index */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* offset beyond directory eof ? 
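+		 * (dtReadNext() returns bn = -1 when the saved (pn:index) +		 * position lies past the last directory entry)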
*/ + if (bn < 0) { + filp->f_pos = DIREND; + return 0; + } + } + + dirent_buf = __get_free_page(GFP_KERNEL); + if (dirent_buf == 0) { + DT_PUTPAGE(mp); + jfs_warn("jfs_readdir: __get_free_page failed!"); + filp->f_pos = DIREND; + return -ENOMEM; + } + + while (1) { + jfs_dirent = (struct jfs_dirent *) dirent_buf; + jfs_dirents = 0; + overflow = fix_page = 0; + + stbl = DT_GETSTBL(p); + + for (i = index; i < p->header.nextindex; i++) { + d = (struct ldtentry *) & p->slot[stbl[i]]; + + if (((long) jfs_dirent + d->namlen + 1) > + (dirent_buf + PAGE_SIZE)) { + /* DBCS codepages could overrun dirent_buf */ + index = i; + overflow = 1; + break; + } + + d_namleft = d->namlen; + name_ptr = jfs_dirent->name; + jfs_dirent->ino = le32_to_cpu(d->inumber); + + if (do_index) { + len = min(d_namleft, DTLHDRDATALEN); + jfs_dirent->position = le32_to_cpu(d->index); + /* + * d->index should always be valid, but it + * isn't. fsck.jfs doesn't create the + * directory index for the lost+found + * directory. Rather than let it go, + * we can try to fix it. + */ + if ((jfs_dirent->position < 2) || + (jfs_dirent->position >= + JFS_IP(ip)->next_index)) { + if (!page_fixed && !isReadOnly(ip)) { + fix_page = 1; + /* + * setting overflow and setting + * index to i will cause the + * same page to be processed + * again starting here + */ + overflow = 1; + index = i; + break; + } + jfs_dirent->position = unique_pos++; + } + } else { + jfs_dirent->position = dtpos; + len = min(d_namleft, DTLHDRDATALEN_LEGACY); + } + + /* copy the name of head/only segment */ + outlen = jfs_strfromUCS_le(name_ptr, d->name, len, + codepage); + jfs_dirent->name_len = outlen; + + /* copy name in the additional segment(s) */ + next = d->next; + while (next >= 0) { + t = (struct dtslot *) & p->slot[next]; + name_ptr += outlen; + d_namleft -= len; + /* Sanity Check */ + if (d_namleft == 0) { + jfs_error(ip->i_sb, + "JFS:Dtree error: ino = " + "%ld, bn=%Ld, index = %d", + (long)ip->i_ino, + (long long)bn, + i); + goto skip_one; + } + len = min(d_namleft, DTSLOTDATALEN); + outlen = jfs_strfromUCS_le(name_ptr, t->name, + len, codepage); + jfs_dirent->name_len += outlen; + + next = t->next; + } + + jfs_dirents++; + jfs_dirent = next_jfs_dirent(jfs_dirent); +skip_one: + if (!do_index) + dtoffset->index++; + } + + if (!overflow) { + /* Point to next leaf page */ + if (p->header.flag & BT_ROOT) + bn = 0; + else { + bn = le64_to_cpu(p->header.next); + index = 0; + /* update offset (pn:index) for new page */ + if (!do_index) { + dtoffset->pn++; + dtoffset->index = 0; + } + } + page_fixed = 0; + } + + /* unpin previous leaf page */ + DT_PUTPAGE(mp); + + jfs_dirent = (struct jfs_dirent *) dirent_buf; + while (jfs_dirents--) { + filp->f_pos = jfs_dirent->position; + if (filldir(dirent, jfs_dirent->name, + jfs_dirent->name_len, filp->f_pos, + jfs_dirent->ino, DT_UNKNOWN)) + goto out; + jfs_dirent = next_jfs_dirent(jfs_dirent); + } + + if (fix_page) { + add_missing_indices(ip, bn); + page_fixed = 1; + } + + if (!overflow && (bn == 0)) { + filp->f_pos = DIREND; + break; + } + + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + free_page(dirent_buf); + return rc; + } + } + + out: + free_page(dirent_buf); + + return rc; +} + + +/* + * dtReadFirst() + * + * function: get the leftmost page of the directory + */ +static int dtReadFirst(struct inode *ip, struct btstack * btstack) +{ + int rc = 0; + s64 bn; + int psize = 288; /* initial in-line directory */ + struct metapage *mp; + dtpage_t *p; + s8 *stbl; + struct btframe *btsp; + pxd_t *xd; + + 
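+	/* + 	 * psize starts at the 288-byte in-line root and is recomputed + 	 * from each child's pxd length on the way down the leftmost path. + 	 */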
BT_CLR(btstack); /* reset stack */ + + /* + * descend leftmost path of the tree + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* + * leftmost leaf page + */ + if (p->header.flag & BT_LEAF) { + /* return leftmost entry */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = 0; + btsp->mp = mp; + + return 0; + } + + /* + * descend down to leftmost child page + */ + if (BT_STACK_FULL(btstack)) { + DT_PUTPAGE(mp); + jfs_error(ip->i_sb, "dtReadFirst: btstack overrun"); + BT_STACK_DUMP(btstack); + return -EIO; + } + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, 0); + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(xd); + psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } +} + + +/* + * dtReadNext() + * + * function: get the page of the specified offset (pn:index) + * + * return: if (offset > eof), bn = -1; + * + * note: if index > nextindex of the target leaf page, + * start with 1st entry of next leaf page; + */ +static int dtReadNext(struct inode *ip, loff_t * offset, + struct btstack * btstack) +{ + int rc = 0; + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) offset; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + int pn; + s8 *stbl; + struct btframe *btsp, *parent; + pxd_t *xd; + + /* + * get leftmost leaf page pinned + */ + if ((rc = dtReadFirst(ip, btstack))) + return rc; + + /* get leaf page */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* get the start offset (pn:index) */ + pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ + index = dtoffset->index; + + /* start at leftmost page ? */ + if (pn == 0) { + /* offset beyond eof ? */ + if (index < p->header.nextindex) + goto out; + + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = index = 0; + goto a; + } + + /* start at non-leftmost page: scan parent pages for large pn */ + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start after next leaf page ? */ + if (pn > 1) + goto b; + + /* get leaf page pn = 1 */ + a: + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + goto c; + + /* + * scan last internal page level to get target leaf page + */ + b: + /* unpin leftmost leaf page */ + DT_PUTPAGE(mp); + + /* get left most parent page */ + btsp = btstack->top; + parent = btsp - 1; + bn = parent->bn; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* scan parent pages at last internal page level */ + while (pn >= p->header.nextindex) { + pn -= p->header.nextindex; + + /* get next parent page address */ + bn = le64_to_cpu(p->header.next); + + /* unpin current parent page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? 
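+		 * (a next field of 0 means there is no right sibling page + 		 * at this level)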
*/ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next parent page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* update parent page stack frame */ + parent->bn = bn; + } + + /* get leaf page address */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[pn]]; + bn = addressPXD(xd); + + /* unpin parent page */ + DT_PUTPAGE(mp); + + /* + * get target leaf page + */ + c: + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * leaf page has been completed: + * start with 1st entry of next leaf page + */ + if (index >= p->header.nextindex) { + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next leaf page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = 0; + } + + out: + /* return target leaf page pinned */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = dtoffset->index; + btsp->mp = mp; + + return 0; +} + + +/* + * dtCompare() + * + * function: compare search key with an internal entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int dtCompare(struct component_name * key, /* search key */ + dtpage_t * p, /* directory page */ + int si) +{ /* entry slot index */ + wchar_t *kname; + __le16 *name; + int klen, namlen, len, rc; + struct idtentry *ih; + struct dtslot *t; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. + * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + + /* compare with head/only segment */ + len = min(klen, len); + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + kname += len; + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (struct dtslot *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + kname += len; + si = t->next; + } + + return (klen - namlen); +} + + + + +/* + * ciCompare() + * + * function: compare search key with an (leaf/internal) entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int ciCompare(struct component_name * key, /* search key */ + dtpage_t * p, /* directory page */ + int si, /* entry slot index */ + int flag) +{ + wchar_t *kname, x; + __le16 *name; + int klen, namlen, len, rc; + struct ldtentry *lh; + struct idtentry *ih; + struct dtslot *t; + int i; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. 
+ * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + /* + * leaf page entry + */ + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) & p->slot[si]; + si = lh->next; + name = lh->name; + namlen = lh->namlen; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } + /* + * internal page entry + */ + else { + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + } + + /* compare with head/only segment */ + len = min(klen, len); + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (struct dtslot *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + si = t->next; + } + + return (klen - namlen); +} + + +/* + * ciGetLeafPrefixKey() + * + * function: compute prefix of suffix compression + * from two adjacent leaf entries + * across page boundary + * + * return: non-zero on error + * + */ +static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, struct component_name * key, int flag) +{ + int klen, namlen; + wchar_t *pl, *pr, *kname; + struct component_name lkey; + struct component_name rkey; + + lkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + GFP_KERNEL); + if (lkey.name == NULL) + return -ENOMEM; + + rkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + GFP_KERNEL); + if (rkey.name == NULL) { + kfree(lkey.name); + return -ENOMEM; + } + + /* get left and right key */ + dtGetKey(lp, li, &lkey, flag); + lkey.name[lkey.namlen] = 0; + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&lkey); + + dtGetKey(rp, ri, &rkey, flag); + rkey.name[rkey.namlen] = 0; + + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&rkey); + + /* compute prefix */ + klen = 0; + kname = key->name; + namlen = min(lkey.namlen, rkey.namlen); + for (pl = lkey.name, pr = rkey.name; + namlen; pl++, pr++, namlen--, klen++, kname++) { + *kname = *pr; + if (*pl != *pr) { + key->namlen = klen + 1; + goto free_names; + } + } + + /* l->namlen <= r->namlen since l <= r */ + if (lkey.namlen < rkey.namlen) { + *kname = *pr; + key->namlen = klen + 1; + } else /* l->namelen == r->namelen */ + key->namlen = klen; + +free_names: + kfree(lkey.name); + kfree(rkey.name); + return 0; +} + + + +/* + * dtGetKey() + * + * function: get key of the entry + */ +static void dtGetKey(dtpage_t * p, int i, /* entry index */ + 
struct component_name * key, int flag) +{ + int si; + s8 *stbl; + struct ldtentry *lh; + struct idtentry *ih; + struct dtslot *t; + int namlen, len; + wchar_t *kname; + __le16 *name; + + /* get entry */ + stbl = DT_GETSTBL(p); + si = stbl[i]; + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) & p->slot[si]; + si = lh->next; + namlen = lh->namlen; + name = lh->name; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } else { + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + namlen = ih->namlen; + name = ih->name; + len = min(namlen, DTIHDRDATALEN); + } + + key->namlen = namlen; + kname = key->name; + + /* + * move head/only segment + */ + UniStrncpy_from_le(kname, name, len); + + /* + * move additional segment(s) + */ + while (si >= 0) { + /* get next segment */ + t = &p->slot[si]; + kname += len; + namlen -= len; + len = min(namlen, DTSLOTDATALEN); + UniStrncpy_from_le(kname, t->name, len); + + si = t->next; + } +} + + +/* + * dtInsertEntry() + * + * function: allocate free slot(s) and + * write a leaf/internal entry + * + * return: entry slot index + */ +static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, + ddata_t * data, struct dt_lock ** dtlock) +{ + struct dtslot *h, *t; + struct ldtentry *lh = NULL; + struct idtentry *ih = NULL; + int hsi, fsi, klen, len, nextindex; + wchar_t *kname; + __le16 *name; + s8 *stbl; + pxd_t *xd; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + s64 bn = 0; + struct metapage *mp = NULL; + + klen = key->namlen; + kname = key->name; + + /* allocate a free slot */ + hsi = fsi = p->header.freelist; + h = &p->slot[fsi]; + p->header.freelist = h->next; + --p->header.freecnt; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + + lv = & dtlck->lv[dtlck->index]; + lv->offset = hsi; + + /* write head/only segment */ + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) h; + lh->next = h->next; + lh->inumber = cpu_to_le32(data->leaf.ino); + lh->namlen = klen; + name = lh->name; + if (data->leaf.ip) { + len = min(klen, DTLHDRDATALEN); + if (!(p->header.flag & BT_ROOT)) + bn = addressPXD(&p->header.self); + lh->index = cpu_to_le32(add_index(data->leaf.tid, + data->leaf.ip, + bn, index)); + } else + len = min(klen, DTLHDRDATALEN_LEGACY); + } else { + ih = (struct idtentry *) h; + ih->next = h->next; + xd = (pxd_t *) ih; + *xd = data->xd; + ih->namlen = klen; + name = ih->name; + len = min(klen, DTIHDRDATALEN); + } + + UniStrncpy_to_le(name, kname, len); + + n = 1; + xsi = hsi; + + /* write additional segment(s) */ + t = h; + klen -= len; + while (klen) { + /* get free slot */ + fsi = p->header.freelist; + t = &p->slot[fsi]; + p->header.freelist = t->next; + --p->header.freecnt; + + /* is next slot contiguous ? 
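+		 * contiguous slots extend the current linelock vector; a gap + 		 * closes it and opens a new vector for the next run of slots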
*/ + if (fsi != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = fsi; + n = 0; + } + + kname += len; + len = min(klen, DTSLOTDATALEN); + UniStrncpy_to_le(t->name, kname, len); + + n++; + xsi = fsi; + klen -= len; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* terminate last/only segment */ + if (h == t) { + /* single segment entry */ + if (p->header.flag & BT_LEAF) + lh->next = -1; + else + ih->next = -1; + } else + /* multi-segment entry */ + t->next = -1; + + /* if insert into middle, shift right succeeding entries in stbl */ + stbl = DT_GETSTBL(p); + nextindex = p->header.nextindex; + if (index < nextindex) { + memmove(stbl + index + 1, stbl + index, nextindex - index); + + if ((p->header.flag & BT_LEAF) && data->leaf.ip) { + s64 lblock; + + /* + * Need to update slot number for entries that moved + * in the stbl + */ + mp = NULL; + for (n = index + 1; n <= nextindex; n++) { + lh = (struct ldtentry *) & (p->slot[stbl[n]]); + modify_index(data->leaf.tid, data->leaf.ip, + le32_to_cpu(lh->index), bn, n, + &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + } + + stbl[index] = hsi; + + /* advance next available entry index of stbl */ + ++p->header.nextindex; +} + + +/* + * dtMoveEntry() + * + * function: move entries from split/left page to new/right page + * + * nextindex of dst page and freelist/freecnt of both pages + * are updated. + */ +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, + int do_index) +{ + int ssi, next; /* src slot index */ + int di; /* dst entry index */ + int dsi; /* dst slot index */ + s8 *sstbl, *dstbl; /* sorted entry table */ + int snamlen, len; + struct ldtentry *slh, *dlh = NULL; + struct idtentry *sih, *dih = NULL; + struct dtslot *h, *s, *d; + struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock; + struct lv *slv, *dlv; + int xssi, ns, nd; + int sfsi; + + sstbl = (s8 *) & sp->slot[sp->header.stblindex]; + dstbl = (s8 *) & dp->slot[dp->header.stblindex]; + + dsi = dp->header.freelist; /* first (whole page) free slot */ + sfsi = sp->header.freelist; + + /* linelock destination entry slot */ + dlv = & ddtlck->lv[ddtlck->index]; + dlv->offset = dsi; + + /* linelock source entry slot */ + slv = & sdtlck->lv[sdtlck->index]; + slv->offset = sstbl[si]; + xssi = slv->offset - 1; + + /* + * move entries + */ + ns = nd = 0; + for (di = 0; si < sp->header.nextindex; si++, di++) { + ssi = sstbl[si]; + dstbl[di] = dsi; + + /* is next slot contiguous ? 
*/ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = (struct dt_lock *) txLinelock(sdtlck); + slv = & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* + * move head/only segment of an entry + */ + /* get dst slot */ + h = d = &dp->slot[dsi]; + + /* get src slot and move */ + s = &sp->slot[ssi]; + if (sp->header.flag & BT_LEAF) { + /* get source entry */ + slh = (struct ldtentry *) s; + dlh = (struct ldtentry *) h; + snamlen = slh->namlen; + + if (do_index) { + len = min(snamlen, DTLHDRDATALEN); + dlh->index = slh->index; /* little-endian */ + } else + len = min(snamlen, DTLHDRDATALEN_LEGACY); + + memcpy(dlh, slh, 6 + len * 2); + + next = slh->next; + + /* update dst head/only segment next field */ + dsi++; + dlh->next = dsi; + } else { + sih = (struct idtentry *) s; + snamlen = sih->namlen; + + len = min(snamlen, DTIHDRDATALEN); + dih = (struct idtentry *) h; + memcpy(dih, sih, 10 + len * 2); + next = sih->next; + + dsi++; + dih->next = dsi; + } + + /* free src head/only segment */ + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + ns++; + nd++; + xssi = ssi; + + /* + * move additional segment(s) of the entry + */ + snamlen -= len; + while ((ssi = next) >= 0) { + /* is next slot contiguous ? */ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = + (struct dt_lock *) + txLinelock(sdtlck); + slv = & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* get next source segment */ + s = &sp->slot[ssi]; + + /* get next destination free slot */ + d++; + + len = min(snamlen, DTSLOTDATALEN); + UniStrncpy_le(d->name, s->name, len); + + ns++; + nd++; + xssi = ssi; + + dsi++; + d->next = dsi; + + /* free source segment */ + next = s->next; + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + snamlen -= len; + } /* end while */ + + /* terminate dst last/only segment */ + if (h == d) { + /* single segment entry */ + if (dp->header.flag & BT_LEAF) + dlh->next = -1; + else + dih->next = -1; + } else + /* multi-segment entry */ + d->next = -1; + } /* end for */ + + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + *sdtlock = sdtlck; + + dlv->length = nd; + ddtlck->index++; + *ddtlock = ddtlck; + + /* update source header */ + sp->header.freelist = sfsi; + sp->header.freecnt += nd; + + /* update destination header */ + dp->header.nextindex = di; + + dp->header.freelist = dsi; + dp->header.freecnt -= nd; +} + + +/* + * dtDeleteEntry() + * + * function: free a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) +{ + int fsi; /* free entry slot index */ + s8 *stbl; + struct dtslot *t; + int si, freecnt; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + fsi = stbl[fi]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + /* get the head/only segment */ + t = &p->slot[fsi]; + if (p->header.flag & 
BT_LEAF) + si = ((struct ldtentry *) t)->next; + else + si = ((struct idtentry *) t)->next; + t->next = si; + t->cnt = 1; + + n = freecnt = 1; + xsi = fsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; + + /* if delete from middle, + * shift left the succedding entries in the stbl + */ + si = p->header.nextindex; + if (fi < si - 1) + memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); + + p->header.nextindex--; +} + + +/* + * dtTruncateEntry() + * + * function: truncate a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) +{ + int tsi; /* truncate entry slot index */ + s8 *stbl; + struct dtslot *t; + int si, freecnt; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int fsi, xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + tsi = stbl[ti]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = tsi; + + /* get the head/only segment */ + t = &p->slot[tsi]; + ASSERT(p->header.flag & BT_INTERNAL); + ((struct idtentry *) t)->namlen = 0; + si = ((struct idtentry *) t)->next; + ((struct idtentry *) t)->next = -1; + + n = 1; + freecnt = 0; + fsi = si; + xsi = tsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + if (freecnt == 0) + return; + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; +} + + +/* + * dtLinelockFreelist() + */ +static void dtLinelockFreelist(dtpage_t * p, /* directory page */ + int m, /* max slot index */ + struct dt_lock ** dtlock) +{ + int fsi; /* free entry slot index */ + struct dtslot *t; + int si; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + + /* get free entry slot index */ + fsi = p->header.freelist; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + n = 1; + xsi = fsi; + + t = &p->slot[fsi]; + si = t->next; + + /* find the last/only segment */ + while (si < m && si >= 0) { + /* is next slot contiguous ? 
*/ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + + t = &p->slot[si]; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; +} + + +/* + * NAME: dtModify + * + * FUNCTION: Modify the inode number part of a directory entry + * + * PARAMETERS: + * tid - Transaction id + * ip - Inode of parent directory + * key - Name of entry to be modified + * orig_ino - Original inode number expected in entry + * new_ino - New inode number to put into entry + * flag - JFS_RENAME + * + * RETURNS: + * -ESTALE - If entry found does not match orig_ino passed in + * -ENOENT - If no entry can be found to match key + * 0 - If successfully modified entry + */ +int dtModify(tid_t tid, struct inode *ip, + struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) +{ + int rc; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + struct btstack btstack; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + s8 *stbl; + int entry_si; /* entry slot index */ + struct ldtentry *entry; + + /* + * search for the entry to modify: + * + * dtSearch() returns (leaf page pinned, index at which to modify). + */ + if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page of named entry + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* get slot index of the entry */ + stbl = DT_GETSTBL(p); + entry_si = stbl[index]; + + /* linelock entry */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = entry_si; + lv->length = 1; + dtlck->index++; + + /* get the head/only segment */ + entry = (struct ldtentry *) & p->slot[entry_si]; + + /* substitute the inode number of the entry */ + entry->inumber = cpu_to_le32(new_ino); + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h new file mode 100644 index 00000000..2545bb31 --- /dev/null +++ b/fs/jfs/jfs_dtree.h @@ -0,0 +1,269 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DTREE +#define _H_JFS_DTREE + +/* + * jfs_dtree.h: directory B+-tree manager + */ + +#include "jfs_btree.h" + +typedef union { + struct { + tid_t tid; + struct inode *ip; + u32 ino; + } leaf; + pxd_t xd; +} ddata_t; + + +/* + * entry segment/slot + * + * an entry consists of type dependent head/only segment/slot and + * additional segments/slots linked vi next field; + * N.B. last/only segment of entry is terminated by next = -1; + */ +/* + * directory page slot + */ +struct dtslot { + s8 next; /* 1: */ + s8 cnt; /* 1: */ + __le16 name[15]; /* 30: */ +}; /* (32) */ + + +#define DATASLOTSIZE 16 +#define L2DATASLOTSIZE 4 +#define DTSLOTSIZE 32 +#define L2DTSLOTSIZE 5 +#define DTSLOTHDRSIZE 2 +#define DTSLOTDATASIZE 30 +#define DTSLOTDATALEN 15 + +/* + * internal node entry head/only segment + */ +struct idtentry { + pxd_t xd; /* 8: child extent descriptor */ + + s8 next; /* 1: */ + u8 namlen; /* 1: */ + __le16 name[11]; /* 22: 2-byte aligned */ +}; /* (32) */ + +#define DTIHDRSIZE 10 +#define DTIHDRDATALEN 11 + +/* compute number of slots for entry */ +#define NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15)) + + +/* + * leaf node entry head/only segment + * + * For legacy filesystems, name contains 13 wchars -- no index field + */ +struct ldtentry { + __le32 inumber; /* 4: 4-byte aligned */ + s8 next; /* 1: */ + u8 namlen; /* 1: */ + __le16 name[11]; /* 22: 2-byte aligned */ + __le32 index; /* 4: index into dir_table */ +}; /* (32) */ + +#define DTLHDRSIZE 6 +#define DTLHDRDATALEN_LEGACY 13 /* Old (OS/2) format */ +#define DTLHDRDATALEN 11 + +/* + * dir_table used for directory traversal during readdir + */ + +/* + * Keep persistent index for directory entries + */ +#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX) + +/* + * Maximum entry in inline directory table + */ +#define MAX_INLINE_DIRTABLE_ENTRY 13 + +struct dir_table_slot { + u8 rsrvd; /* 1: */ + u8 flag; /* 1: 0 if free */ + u8 slot; /* 1: slot within leaf page of entry */ + u8 addr1; /* 1: upper 8 bits of leaf page address */ + __le32 addr2; /* 4: lower 32 bits of leaf page address -OR- + index of next entry when this entry was deleted */ +}; /* (8) */ + +/* + * flag values + */ +#define DIR_INDEX_VALID 1 +#define DIR_INDEX_FREE 0 + +#define DTSaddress(dir_table_slot, address64)\ +{\ + (dir_table_slot)->addr1 = ((u64)address64) >> 32;\ + (dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} + +#define addressDTS(dts)\ + ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) ) + +/* compute number of slots for entry */ +#define NDTLEAF_LEGACY(klen) (DIV_ROUND_UP((2 + (klen)), 15)) +#define NDTLEAF NDTINTERNAL + + +/* + * directory root page (in-line in on-disk inode): + * + * cf. dtpage_t below. 
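+ * + * the root provides DTROOTMAXSLOT (9) 32-byte slots; slot[0] is + * overlaid by the header, leaving 8 slots for entries and their + * name segments.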
+ */ +typedef union { + struct { + struct dasd DASD; /* 16: DASD limit/usage info */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next free entry in stbl */ + s8 freecnt; /* 1: free count */ + s8 freelist; /* 1: freelist header */ + + __le32 idotdot; /* 4: parent inode number */ + + s8 stbl[8]; /* 8: sorted entry index table */ + } header; /* (32) */ + + struct dtslot slot[9]; +} dtroot_t; + +#define PARENT(IP) \ + (le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot)) + +#define DTROOTMAXSLOT 9 + +#define dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0) + + +/* + * directory regular page: + * + * entry slot array of 32 byte slot + * + * sorted entry slot index table (stbl): + * contiguous slots at slot specified by stblindex, + * 1-byte per entry + * 512 byte block: 16 entry tbl (1 slot) + * 1024 byte block: 32 entry tbl (1 slot) + * 2048 byte block: 64 entry tbl (2 slot) + * 4096 byte block: 128 entry tbl (4 slot) + * + * data area: + * 512 byte block: 16 - 2 = 14 slot + * 1024 byte block: 32 - 2 = 30 slot + * 2048 byte block: 64 - 3 = 61 slot + * 4096 byte block: 128 - 5 = 123 slot + * + * N.B. index is 0-based; index fields refer to slot index + * except nextindex which refers to entry index in stbl; + * end of entry stot list or freelist is marked with -1. + */ +typedef union { + struct { + __le64 next; /* 8: next sibling */ + __le64 prev; /* 8: previous sibling */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next entry index in stbl */ + s8 freecnt; /* 1: */ + s8 freelist; /* 1: slot index of head of freelist */ + + u8 maxslot; /* 1: number of slots in page slot[] */ + u8 stblindex; /* 1: slot index of start of stbl */ + u8 rsrvd[2]; /* 2: */ + + pxd_t self; /* 8: self pxd */ + } header; /* (32) */ + + struct dtslot slot[128]; +} dtpage_t; + +#define DTPAGEMAXSLOT 128 + +#define DT8THPGNODEBYTES 512 +#define DT8THPGNODETSLOTS 1 +#define DT8THPGNODESLOTS 16 + +#define DTQTRPGNODEBYTES 1024 +#define DTQTRPGNODETSLOTS 1 +#define DTQTRPGNODESLOTS 32 + +#define DTHALFPGNODEBYTES 2048 +#define DTHALFPGNODETSLOTS 2 +#define DTHALFPGNODESLOTS 64 + +#define DTFULLPGNODEBYTES 4096 +#define DTFULLPGNODETSLOTS 4 +#define DTFULLPGNODESLOTS 128 + +#define DTENTRYSTART 1 + +/* get sorted entry table of the page */ +#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\ + ((dtroot_t *)(p))->header.stbl : \ + (s8 *)&(p)->slot[(p)->header.stblindex] ) + +/* + * Flags for dtSearch + */ +#define JFS_CREATE 1 +#define JFS_LOOKUP 2 +#define JFS_REMOVE 3 +#define JFS_RENAME 4 + +/* + * Maximum file offset for directories. 
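+ * jfs_readdir() stores DIREND in filp->f_pos to mark end of directory.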
+ */ +#define DIREND INT_MAX + +/* + * external declarations + */ +extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot); + +extern int dtSearch(struct inode *ip, struct component_name * key, + ino_t * data, struct btstack * btstack, int flag); + +extern int dtInsert(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * ino, struct btstack * btstack); + +extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * data, int flag); + +extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * orig_ino, ino_t new_ino, int flag); + +extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); +#endif /* !_H_JFS_DTREE */ diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c new file mode 100644 index 00000000..e5fe8506 --- /dev/null +++ b/fs/jfs/jfs_extent.c @@ -0,0 +1,651 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_extent.h" +#include "jfs_debug.h" + +/* + * forward references + */ +static int extBalloc(struct inode *, s64, s64 *, s64 *); +#ifdef _NOTYET +static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); +#endif +static s64 extRoundDown(s64 nb); + +#define DPD(a) (printk("(a): %d\n",(a))) +#define DPC(a) (printk("(a): %c\n",(a))) +#define DPL1(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x ",(a)); \ + else \ + printk("(a): %x ",(a) << 32); \ +} +#define DPL(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x\n",(a)); \ + else \ + printk("(a): %x\n",(a) << 32); \ +} + +#define DPD1(a) (printk("(a): %d ",(a))) +#define DPX(a) (printk("(a): %08x\n",(a))) +#define DPX1(a) (printk("(a): %08x ",(a))) +#define DPS(a) (printk("%s\n",(a))) +#define DPE(a) (printk("\nENTERING: %s\n",(a))) +#define DPE1(a) (printk("\nENTERING: %s",(a))) +#define DPS1(a) (printk(" %s ",(a))) + + +/* + * NAME: extAlloc() + * + * FUNCTION: allocate an extent for a specified page range within a + * file. + * + * PARAMETERS: + * ip - the inode of the file. + * xlen - requested extent length. + * pno - the starting page number with the file. + * xp - pointer to an xad. on entry, xad describes an + * extent that is used as an allocation hint if the + * xaddr of the xad is non-zero. on successful exit, + * the xad describes the newly allocated extent. + * abnr - bool indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. 
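+ * + * serialization: allocation runs under the per-inode commit_mutex to + *	avoid racing with jfs_commit_inode().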
+ */ +int +extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 nxlen, nxaddr, xoff, hint, xaddr = 0; + int rc; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + /* Avoid race with jfs_commit_inode() */ + mutex_lock(&JFS_IP(ip)->commit_mutex); + + /* validate extent length */ + if (xlen > MAXXLEN) + xlen = MAXXLEN; + + /* get the page's starting extent offset */ + xoff = pno << sbi->l2nbperpage; + + /* check if an allocation hint was provided */ + if ((hint = addressXAD(xp))) { + /* get the size of the extent described by the hint */ + nxlen = lengthXAD(xp); + + /* check if the hint is for the portion of the file + * immediately previous to the current allocation + * request and if hint extent has the same abnr + * value as the current request. if so, we can + * extend the hint extent to include the current + * extent if we can allocate the blocks immediately + * following the hint extent. + */ + if (offsetXAD(xp) + nxlen == xoff && + abnr == ((xp->flag & XAD_NOTRECORDED) ? true : false)) + xaddr = hint + nxlen; + + /* adjust the hint to the last block of the extent */ + hint += (nxlen - 1); + } + + /* allocate the disk blocks for the extent. initially, extBalloc() + * will try to allocate disk blocks for the requested size (xlen). + * if this fails (xlen contiguous free blocks not available), it'll + * try to allocate a smaller number of blocks (producing a smaller + * extent), with this smaller number of blocks consisting of the + * requested number of blocks rounded down to the next smaller + * power of 2 number (i.e. 16 -> 8). it'll continue to round down + * and retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + */ + nxlen = xlen; + if ((rc = extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) { + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); + } + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, nxlen); + if (rc) { + dbFree(ip, nxaddr, (s64) nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; + } + + /* determine the value of the extent flag */ + xflag = abnr ? XAD_NOTRECORDED : 0; + + /* if we can extend the hint extent to cover the current request, + * extend it. otherwise, insert a new extent to + * cover the current request. + */ + if (xaddr && xaddr == nxaddr) + rc = xtExtend(0, ip, xoff, (int) nxlen, 0); + else + rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0); + + /* if the extend or insert failed, + * free the newly allocated blocks and return the error. + */ + if (rc) { + dbFree(ip, nxaddr, nxlen); + dquot_free_block(ip, nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); + } + + /* set the results of the extent allocation */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + /* + * COMMIT_SyncList flags an anonymous tlock on page that is on + * sync list. + * We need to commit the inode to get the page written disk. + */ + if (test_and_clear_cflag(COMMIT_Synclist,ip)) + jfs_commit_inode(ip, 0); + + return (0); +} + + +#ifdef _NOTYET +/* + * NAME: extRealloc() + * + * FUNCTION: extend the allocation of a file extent containing a + * partial back last page. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf for the partial backed last page. + * xlen - request size of the resulting extent. 
+ * xp - pointer to an xad. on successful exit, the xad + * describes the newly allocated extent. + * abnr - bool indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) +{ + struct super_block *sb = ip->i_sb; + s64 xaddr, xlen, nxaddr, delta, xoff; + s64 ntail, nextend, ninsert; + int rc, nbperpage = JFS_SBI(sb)->nbperpage; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + mutex_lock(&JFS_IP(ip)->commit_mutex); + /* validate extent length */ + if (nxlen > MAXXLEN) + nxlen = MAXXLEN; + + /* get the extend (partial) page's disk block address and + * number of blocks. + */ + xaddr = addressXAD(xp); + xlen = lengthXAD(xp); + xoff = offsetXAD(xp); + + /* if the extend page is abnr and if the request is for + * the extent to be allocated and recorded, + * make the page allocated and recorded. + */ + if ((xp->flag & XAD_NOTRECORDED) && !abnr) { + xp->flag = 0; + if ((rc = xtUpdate(0, ip, xp))) + goto exit; + } + + /* try to allocated the request number of blocks for the + * extent. dbRealloc() first tries to satisfy the request + * by extending the allocation in place. otherwise, it will + * try to allocate a new set of blocks large enough for the + * request. in satisfying a request, dbReAlloc() may allocate + * less than what was request but will always allocate enough + * space as to satisfy the extend page. + */ + if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr))) + goto exit; + + /* Allocat blocks to quota. */ + rc = dquot_alloc_block(ip, nxlen); + if (rc) { + dbFree(ip, nxaddr, (s64) nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; + } + + delta = nxlen - xlen; + + /* check if the extend page is not abnr but the request is abnr + * and the allocated disk space is for more than one page. if this + * is the case, there is a miss match of abnr between the extend page + * and the one or more pages following the extend page. as a result, + * two extents will have to be manipulated. the first will be that + * of the extent of the extend page and will be manipulated thru + * an xtExtend() or an xtTailgate(), depending upon whether the + * disk allocation occurred as an inplace extension. the second + * extent will be manipulated (created) through an xtInsert() and + * will be for the pages following the extend page. + */ + if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) { + ntail = nbperpage; + nextend = ntail - xlen; + ninsert = nxlen - nbperpage; + + xflag = XAD_NOTRECORDED; + } else { + ntail = nxlen; + nextend = delta; + ninsert = 0; + + xflag = xp->flag; + } + + /* if we were able to extend the disk allocation in place, + * extend the extent. otherwise, move the extent to a + * new disk location. + */ + if (xaddr == nxaddr) { + /* extend the extent */ + if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { + dbFree(ip, xaddr + xlen, delta); + dquot_free_block(ip, nxlen); + goto exit; + } + } else { + /* + * move the extent to a new location: + * + * xtTailgate() accounts for relocated tail extent; + */ + if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { + dbFree(ip, nxaddr, nxlen); + dquot_free_block(ip, nxlen); + goto exit; + } + } + + + /* check if we need to also insert a new extent */ + if (ninsert) { + /* perform the insert. 
if it fails, free the blocks + * to be inserted and make it appear that we only did + * the xtExtend() or xtTailgate() above. + */ + xaddr = nxaddr + ntail; + if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert, + &xaddr, 0)) { + dbFree(ip, xaddr, (s64) ninsert); + delta = nextend; + nxlen = ntail; + xflag = 0; + } + } + + /* set the return results */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); +exit: + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); +} +#endif /* _NOTYET */ + + +/* + * NAME: extHint() + * + * FUNCTION: produce an extent allocation hint for a file offset. + * + * PARAMETERS: + * ip - the inode of the file. + * offset - file offset for which the hint is needed. + * xp - pointer to the xad that is to be filled in with + * the hint. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + */ +int extHint(struct inode *ip, s64 offset, xad_t * xp) +{ + struct super_block *sb = ip->i_sb; + int nbperpage = JFS_SBI(sb)->nbperpage; + s64 prev; + int rc = 0; + s64 xaddr; + int xlen; + int xflag; + + /* init the hint as "no hint provided" */ + XADaddress(xp, 0); + + /* determine the starting extent offset of the page previous + * to the page containing the offset. + */ + prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage; + + /* if the offset is in the first page of the file, no hint provided. + */ + if (prev < 0) + goto out; + + rc = xtLookup(ip, prev, nbperpage, &xflag, &xaddr, &xlen, 0); + + if ((rc == 0) && xlen) { + if (xlen != nbperpage) { + jfs_error(ip->i_sb, "extHint: corrupt xtree"); + rc = -EIO; + } + XADaddress(xp, xaddr); + XADlength(xp, xlen); + XADoffset(xp, prev); + /* + * only preserve the abnr flag within the xad flags + * of the returned hint. + */ + xp->flag = xflag & XAD_NOTRECORDED; + } else + rc = 0; + +out: + return (rc); +} + + +/* + * NAME: extRecord() + * + * FUNCTION: change a page with a file from not recorded to recorded. + * + * PARAMETERS: + * ip - inode of the file. + * cp - cbuf of the file page. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extRecord(struct inode *ip, xad_t * xp) +{ + int rc; + + txBeginAnon(ip->i_sb); + + mutex_lock(&JFS_IP(ip)->commit_mutex); + + /* update the extent */ + rc = xtUpdate(0, ip, xp); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; +} + + +#ifdef _NOTYET +/* + * NAME: extFill() + * + * FUNCTION: allocate disk space for a file page that represents + * a file hole. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf of the file page represent the hole. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extFill(struct inode *ip, xad_t * xp) +{ + int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; + s64 blkno = offsetXAD(xp) >> ip->i_blkbits; + +// assert(ISSPARSE(ip)); + + /* initialize the extent allocation hint */ + XADaddress(xp, 0); + + /* allocate an extent to fill the hole */ + if ((rc = extAlloc(ip, nbperpage, blkno, xp, false))) + return (rc); + + assert(lengthPXD(xp) == nbperpage); + + return (0); +} +#endif /* _NOTYET */ + + +/* + * NAME: extBalloc() + * + * FUNCTION: allocate disk blocks to form an extent. + * + * initially, we will try to allocate disk blocks for the + * requested size (nblocks). 
if this fails (nblocks + * contiguous free blocks not available), we'll try to allocate + * a smaller number of blocks (producing a smaller extent), with + * this smaller number of blocks consisting of the requested + * number of blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). we'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * hint - disk block number to be used as an allocation hint. + * *nblocks - pointer to an s64 value. on entry, this value specifies + * the desired number of block to be allocated. on successful + * exit, this value is set to the number of blocks actually + * allocated. + * blkno - pointer to a block address that is filled in on successful + * return with the starting block number of the newly + * allocated block range. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +static int +extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 nb, nblks, daddr, max; + int rc, nbperpage = sbi->nbperpage; + struct bmap *bmp = sbi->bmap; + int ag; + + /* get the number of blocks to initially attempt to allocate. + * we'll first try the number of blocks requested unless this + * number is greater than the maximum number of contiguous free + * blocks in the map. in that case, we'll start off with the + * maximum free. + */ + max = (s64) 1 << bmp->db_maxfreebud; + if (*nblocks >= max && *nblocks > nbperpage) + nb = nblks = (max > nbperpage) ? max : nbperpage; + else + nb = nblks = *nblocks; + + /* try to allocate blocks */ + while ((rc = dbAlloc(ip, hint, nb, &daddr)) != 0) { + /* if something other than an out of space error, + * stop and return this error. + */ + if (rc != -ENOSPC) + return (rc); + + /* decrease the allocation request size */ + nb = min(nblks, extRoundDown(nb)); + + /* give up if we cannot cover a page */ + if (nb < nbperpage) + return (rc); + } + + *nblocks = nb; + *blkno = daddr; + + if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) { + ag = BLKTOAG(daddr, sbi); + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag == -1) { + atomic_inc(&bmp->db_active[ag]); + ji->active_ag = ag; + } else if (ji->active_ag != ag) { + atomic_dec(&bmp->db_active[ji->active_ag]); + atomic_inc(&bmp->db_active[ag]); + ji->active_ag = ag; + } + spin_unlock_irq(&ji->ag_lock); + } + + return (0); +} + + +#ifdef _NOTYET +/* + * NAME: extBrealloc() + * + * FUNCTION: attempt to extend an extent's allocation. + * + * Initially, we will try to extend the extent's allocation + * in place. If this fails, we'll try to move the extent + * to a new set of blocks. If moving the extent, we initially + * will try to allocate disk blocks for the requested size + * (newnblks). if this fails (new contiguous free blocks not + * available), we'll try to allocate a smaller number of + * blocks (producing a smaller extent), with this smaller + * number of blocks consisting of the requested number of + * blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). We'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * blkno - starting block number of the extents current allocation. 
+ * nblks - number of blocks within the extents current allocation. + * newnblks - pointer to a s64 value. on entry, this value is the + * the new desired extent size (number of blocks). on + * successful exit, this value is set to the extent's actual + * new size (new number of blocks). + * newblkno - the starting block number of the extents new allocation. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +static int +extBrealloc(struct inode *ip, + s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno) +{ + int rc; + + /* try to extend in place */ + if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) { + *newblkno = blkno; + return (0); + } else { + if (rc != -ENOSPC) + return (rc); + } + + /* in place extension not possible. + * try to move the extent to a new set of blocks. + */ + return (extBalloc(ip, blkno, newnblks, newblkno)); +} +#endif /* _NOTYET */ + + +/* + * NAME: extRoundDown() + * + * FUNCTION: round down a specified number of blocks to the next + * smallest power of 2 number. + * + * PARAMETERS: + * nb - the inode of the file. + * + * RETURN VALUES: + * next smallest power of 2 number. + */ +static s64 extRoundDown(s64 nb) +{ + int i; + u64 m, k; + + for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) { + if (m & nb) + break; + } + + i = 63 - i; + k = (u64) 1 << i; + k = ((k - 1) & nb) ? k : k >> 1; + + return (k); +} diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h new file mode 100644 index 00000000..b567e12c --- /dev/null +++ b/fs/jfs/jfs_extent.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_EXTENT +#define _H_JFS_EXTENT + +/* get block allocation allocation hint as location of disk inode */ +#define INOHINT(ip) \ + (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) + +extern int extAlloc(struct inode *, s64, s64, xad_t *, bool); +extern int extFill(struct inode *, xad_t *); +extern int extHint(struct inode *, s64, xad_t *); +extern int extRealloc(struct inode *, s64, xad_t *, bool); +extern int extRecord(struct inode *, xad_t *); + +#endif /* _H_JFS_EXTENT */ diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h new file mode 100644 index 00000000..b3f5463f --- /dev/null +++ b/fs/jfs/jfs_filsys.h @@ -0,0 +1,282 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_FILSYS +#define _H_JFS_FILSYS + +/* + * jfs_filsys.h + * + * file system (implementation-dependent) constants + * + * refer to for system wide implementation-dependent constants + */ + +/* + * file system option (superblock flag) + */ + +/* directory option */ +#define JFS_UNICODE 0x00000001 /* unicode name */ + +/* mount time flags for error handling */ +#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ +#define JFS_ERR_CONTINUE 0x00000004 /* continue */ +#define JFS_ERR_PANIC 0x00000008 /* panic */ + +/* Quota support */ +#define JFS_USRQUOTA 0x00000010 +#define JFS_GRPQUOTA 0x00000020 + +/* mount time flag to disable journaling to disk */ +#define JFS_NOINTEGRITY 0x00000040 + +/* commit option */ +#define JFS_COMMIT 0x00000f00 /* commit option mask */ +#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */ +#define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */ +#define JFS_TMPFS 0x00000400 /* temporary file system - + * do not log/commit: + * Never implemented + */ + +/* log logical volume option */ +#define JFS_INLINELOG 0x00000800 /* inline log within file system */ +#define JFS_INLINEMOVE 0x00001000 /* inline log being moved */ + +/* Secondary aggregate inode table */ +#define JFS_BAD_SAIT 0x00010000 /* current secondary ait is bad */ + +/* sparse regular file support */ +#define JFS_SPARSE 0x00020000 /* sparse regular file */ + +/* DASD Limits F226941 */ +#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */ +#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */ + +/* big endian flag */ +#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */ + +/* Directory index */ +#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */ + +/* platform options */ +#define JFS_LINUX 0x10000000 /* Linux support */ +#define JFS_DFS 0x20000000 /* DCE DFS LFS support */ +/* Never implemented */ + +#define JFS_OS2 0x40000000 /* OS/2 support */ +/* case-insensitive name/directory support */ + +#define JFS_AIX 0x80000000 /* AIX support */ + +/* + * buffer cache configuration + */ +/* page size */ +#ifdef PSIZE +#undef PSIZE +#endif +#define PSIZE 4096 /* page size (in byte) */ +#define L2PSIZE 12 /* log2(PSIZE) */ +#define POFFSET 4095 /* offset within page */ + +/* buffer page size */ +#define BPSIZE PSIZE + +/* + * fs fundamental size + * + * PSIZE >= file system block size >= PBSIZE >= DISIZE + */ +#define PBSIZE 512 /* physical block size (in byte) */ +#define L2PBSIZE 9 /* log2(PBSIZE) */ + +#define DISIZE 512 /* on-disk inode size (in byte) */ +#define L2DISIZE 9 /* log2(DISIZE) */ + +#define IDATASIZE 256 /* inode inline data size */ +#define IXATTRSIZE 128 /* inode inline extended attribute size */ + +#define XTPAGE_SIZE 4096 +#define log2_PAGESIZE 12 + +#define IAG_SIZE 4096 +#define IAG_EXTENT_SIZE 4096 +#define INOSPERIAG 4096 /* number of disk inodes per iag */ +#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ +#define INOSPEREXT 32 /* number of disk inode per extent */ +#define L2INOSPEREXT 5 /* l2 number of disk inode per extent */ +#define 
IXSIZE (DISIZE * INOSPEREXT) /* inode extent size */ +#define INOSPERPAGE 8 /* number of disk inodes per 4K page */ +#define L2INOSPERPAGE 3 /* log2(INOSPERPAGE) */ + +#define IAGFREELIST_LWM 64 + +#define INODE_EXTENT_SIZE IXSIZE /* inode extent size */ +#define NUM_INODE_PER_EXTENT INOSPEREXT +#define NUM_INODE_PER_IAG INOSPERIAG + +#define MINBLOCKSIZE 512 +#define MAXBLOCKSIZE 4096 +#define MAXFILESIZE ((s64)1 << 52) + +#define JFS_LINK_MAX 0xffffffff + +/* Minimum number of bytes supported for a JFS partition */ +#define MINJFS (0x1000000) +#define MINJFSTEXT "16" + +/* + * file system block size -> physical block size + */ +#define LBOFFSET(x) ((x) & (PBSIZE - 1)) +#define LBNUMBER(x) ((x) >> L2PBSIZE) +#define LBLK2PBLK(sb,b) ((b) << (sb->s_blocksize_bits - L2PBSIZE)) +#define PBLK2LBLK(sb,b) ((b) >> (sb->s_blocksize_bits - L2PBSIZE)) +/* size in byte -> last page number */ +#define SIZE2PN(size) ( ((s64)((size) - 1)) >> (L2PSIZE) ) +/* size in byte -> last file system block number */ +#define SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) ) + +/* + * fixed physical block address (physical block size = 512 byte) + * + * NOTE: since we can't guarantee a physical block size of 512 bytes the use of + * these macros should be removed and the byte offset macros used instead. + */ +#define SUPER1_B 64 /* primary superblock */ +#define AIMAP_B (SUPER1_B + 8) /* 1st extent of aggregate inode map */ +#define AITBL_B (AIMAP_B + 16) /* + * 1st extent of aggregate inode table + */ +#define SUPER2_B (AITBL_B + 32) /* 2ndary superblock pbn */ +#define BMAP_B (SUPER2_B + 8) /* block allocation map */ + +/* + * SIZE_OF_SUPER defines the total amount of space reserved on disk for the + * superblock. This is not the same as the superblock structure, since all of + * this space is not currently being used. + */ +#define SIZE_OF_SUPER PSIZE + +/* + * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table + */ +#define SIZE_OF_AG_TABLE PSIZE + +/* + * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of + * the inode allocation map (to hold iag) + */ +#define SIZE_OF_MAP_PAGE PSIZE + +/* + * fixed byte offset address + */ +#define SUPER1_OFF 0x8000 /* primary superblock */ +#define AIMAP_OFF (SUPER1_OFF + SIZE_OF_SUPER) + /* + * Control page of aggregate inode map + * followed by 1st extent of map + */ +#define AITBL_OFF (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1)) + /* + * 1st extent of aggregate inode table + */ +#define SUPER2_OFF (AITBL_OFF + INODE_EXTENT_SIZE) + /* + * secondary superblock + */ +#define BMAP_OFF (SUPER2_OFF + SIZE_OF_SUPER) + /* + * block allocation map + */ + +/* + * The following macro is used to indicate the number of reserved disk blocks at + * the front of an aggregate, in terms of physical blocks. This value is + * currently defined to be 32K. This turns out to be the same as the primary + * superblock's address, since it directly follows the reserved blocks. + */ +#define AGGR_RSVD_BLOCKS SUPER1_B + +/* + * The following macro is used to indicate the number of reserved bytes at the + * front of an aggregate. This value is currently defined to be 32K. This + * turns out to be the same as the primary superblock's byte offset, since it + * directly follows the reserved blocks. + */ +#define AGGR_RSVD_BYTES SUPER1_OFF + +/* + * The following macro defines the byte offset for the first inode extent in + * the aggregate inode table. This allows us to find the self inode to find the + * rest of the table. Currently this value is 44K. 
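+ * Worked through from the macros above: SUPER1_OFF = 0x8000 (32K),
+ * AIMAP_OFF = 32K + 4K = 36K, AITBL_OFF = 36K + (4K << 1) = 44K,
+ * SUPER2_OFF = 44K + 16K (INODE_EXTENT_SIZE = 512 * 32 bytes) = 60K
+ * and BMAP_OFF = 60K + 4K = 64K.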
+ */ +#define AGGR_INODE_TABLE_START AITBL_OFF + +/* + * fixed reserved inode number + */ +/* aggregate inode */ +#define AGGR_RESERVED_I 0 /* aggregate inode (reserved) */ +#define AGGREGATE_I 1 /* aggregate inode map inode */ +#define BMAP_I 2 /* aggregate block allocation map inode */ +#define LOG_I 3 /* aggregate inline log inode */ +#define BADBLOCK_I 4 /* aggregate bad block inode */ +#define FILESYSTEM_I 16 /* 1st/only fileset inode in ait: + * fileset inode map inode + */ + +/* per fileset inode */ +#define FILESET_RSVD_I 0 /* fileset inode (reserved) */ +#define FILESET_EXT_I 1 /* fileset inode extension */ +#define ROOT_I 2 /* fileset root inode */ +#define ACL_I 3 /* fileset ACL inode */ + +#define FILESET_OBJECT_I 4 /* the first fileset inode available for a file + * or directory or link... + */ +#define FIRST_FILESET_INO 16 /* the first aggregate inode which describes + * an inode. (To fsck this is also the first + * inode in part 2 of the agg inode table.) + */ + +/* + * directory configuration + */ +#define JFS_NAME_MAX 255 +#define JFS_PATH_MAX BPSIZE + + +/* + * file system state (superblock state) + */ +#define FM_CLEAN 0x00000000 /* file system is unmounted and clean */ +#define FM_MOUNT 0x00000001 /* file system is mounted cleanly */ +#define FM_DIRTY 0x00000002 /* file system was not unmounted and clean + * when mounted or + * commit failure occurred while being mounted: + * fsck() must be run to repair + */ +#define FM_LOGREDO 0x00000004 /* log based recovery (logredo()) failed: + * fsck() must be run to repair + */ +#define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */ + +#endif /* _H_JFS_FILSYS */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c new file mode 100644 index 00000000..b78b2f97 --- /dev/null +++ b/fs/jfs/jfs_imap.c @@ -0,0 +1,3187 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_imap.c: inode allocation map manager + * + * Serialization: + * Each AG has a simple lock which is used to control the serialization of + * the AG level lists. This lock should be taken first whenever an AG + * level list will be modified or accessed. + * + * Each IAG is locked by obtaining the buffer for the IAG page. + * + * There is also a inode lock for the inode map inode. A read lock needs to + * be taken whenever an IAG is read from the map or the global level + * information is read. A write lock needs to be taken whenever the global + * level information is modified or an atomic operation needs to be used. + * + * If more than one IAG is read at one time, the read lock may not + * be given up until all of the IAG's are read. Otherwise, a deadlock + * may occur when trying to obtain the read lock while another thread + * holding the read lock is waiting on the IAG already being held. 
+ *
+ * The control page of the inode map is read into memory by diMount().
+ * Thereafter it should only be modified in memory and then it will be
+ * written out when the filesystem is unmounted by diUnmount().
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_filsys.h"
+#include "jfs_dinode.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ * imap locks
+ */
+/* iag free list lock */
+#define IAGFREE_LOCK_INIT(imap) mutex_init(&imap->im_freelock)
+#define IAGFREE_LOCK(imap) mutex_lock(&imap->im_freelock)
+#define IAGFREE_UNLOCK(imap) mutex_unlock(&imap->im_freelock)
+
+/* per ag iag list locks */
+#define AG_LOCK_INIT(imap,index) mutex_init(&(imap->im_aglock[index]))
+#define AG_LOCK(imap,agno) mutex_lock(&imap->im_aglock[agno])
+#define AG_UNLOCK(imap,agno) mutex_unlock(&imap->im_aglock[agno])
+
+/*
+ * forward references
+ */
+static int diAllocAG(struct inomap *, int, bool, struct inode *);
+static int diAllocAny(struct inomap *, int, bool, struct inode *);
+static int diAllocBit(struct inomap *, struct iag *, int);
+static int diAllocExt(struct inomap *, int, struct inode *);
+static int diAllocIno(struct inomap *, int, struct inode *);
+static int diFindFree(u32, int);
+static int diNewExt(struct inomap *, struct iag *, int);
+static int diNewIAG(struct inomap *, int *, int, struct metapage **);
+static void duplicateIXtree(struct super_block *, s64, int, s64 *);
+
+static int diIAGRead(struct inomap * imap, int, struct metapage **);
+static int copy_from_dinode(struct dinode *, struct inode *);
+static void copy_to_dinode(struct dinode *, struct inode *);
+
+/*
+ * NAME: diMount()
+ *
+ * FUNCTION: initialize the incore inode map control structures for
+ * a fileset or aggregate init time.
+ *
+ * the inode map's control structure (dinomap) is
+ * brought in from disk and placed in virtual memory.
+ *
+ * PARAMETERS:
+ * ipimap - pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -ENOMEM - insufficient free virtual memory.
+ * -EIO - i/o error.
+ */
+int diMount(struct inode *ipimap)
+{
+ struct inomap *imap;
+ struct metapage *mp;
+ int index;
+ struct dinomap_disk *dinom_le;
+
+ /*
+ * allocate/initialize the in-memory inode map control structure
+ */
+ /* allocate the in-memory inode map control structure. */
+ imap = kmalloc(sizeof(struct inomap), GFP_KERNEL);
+ if (imap == NULL) {
+ jfs_err("diMount: kmalloc returned NULL!");
+ return -ENOMEM;
+ }
+
+ /* read the on-disk inode map control structure. */
+
+ mp = read_metapage(ipimap,
+ IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+ PSIZE, 0);
+ if (mp == NULL) {
+ kfree(imap);
+ return -EIO;
+ }
+
+ /* copy the on-disk version to the in-memory version.
*/ + dinom_le = (struct dinomap_disk *) mp->data; + imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag); + imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag); + atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos)); + atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree)); + imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext); + imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + imap->im_agctl[index].inofree = + le32_to_cpu(dinom_le->in_agctl[index].inofree); + imap->im_agctl[index].extfree = + le32_to_cpu(dinom_le->in_agctl[index].extfree); + imap->im_agctl[index].numinos = + le32_to_cpu(dinom_le->in_agctl[index].numinos); + imap->im_agctl[index].numfree = + le32_to_cpu(dinom_le->in_agctl[index].numfree); + } + + /* release the buffer. */ + release_metapage(mp); + + /* + * allocate/initialize inode allocation map locks + */ + /* allocate and init iag free list lock */ + IAGFREE_LOCK_INIT(imap); + + /* allocate and init ag list locks */ + for (index = 0; index < MAXAG; index++) { + AG_LOCK_INIT(imap, index); + } + + /* bind the inode map inode and inode map control structure + * to each other. + */ + imap->im_ipimap = ipimap; + JFS_IP(ipimap)->i_imap = imap; + + return (0); +} + + +/* + * NAME: diUnmount() + * + * FUNCTION: write to disk the incore inode map control structures for + * a fileset or aggregate at unmount time. + * + * PARAMETERS: + * ipimap - pointer to inode map inode for the aggregate or fileset. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. + */ +int diUnmount(struct inode *ipimap, int mounterror) +{ + struct inomap *imap = JFS_IP(ipimap)->i_imap; + + /* + * update the on-disk inode map control structure + */ + + if (!(mounterror || isReadOnly(ipimap))) + diSync(ipimap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipimap->i_mapping, 0); + + /* + * free in-memory control structure + */ + kfree(imap); + + return (0); +} + + +/* + * diSync() + */ +int diSync(struct inode *ipimap) +{ + struct dinomap_disk *dinom_le; + struct inomap *imp = JFS_IP(ipimap)->i_imap; + struct metapage *mp; + int index; + + /* + * write imap global conrol page + */ + /* read the on-disk inode map control structure */ + mp = get_metapage(ipimap, + IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jfs_err("diSync: get_metapage failed!"); + return -EIO; + } + + /* copy the in-memory version to the on-disk version */ + dinom_le = (struct dinomap_disk *) mp->data; + dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag); + dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag); + dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos)); + dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree)); + dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext); + dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + dinom_le->in_agctl[index].inofree = + cpu_to_le32(imp->im_agctl[index].inofree); + dinom_le->in_agctl[index].extfree = + cpu_to_le32(imp->im_agctl[index].extfree); + dinom_le->in_agctl[index].numinos = + cpu_to_le32(imp->im_agctl[index].numinos); + dinom_le->in_agctl[index].numfree = + cpu_to_le32(imp->im_agctl[index].numfree); + } + + /* write out the control structure */ + write_metapage(mp); + + /* + * write out dirty pages of imap + */ + filemap_write_and_wait(ipimap->i_mapping); + + diWriteSpecial(ipimap, 0); + + return 
(0); +} + + +/* + * NAME: diRead() + * + * FUNCTION: initialize an incore inode from disk. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * this routine handles incore inode initialization for + * both "special" and "regular" inodes. special inodes + * are those required early in the mount process and + * require special handling since much of the file system + * is not yet initialized. these "special" inodes are + * identified by a NULL inode map inode pointer and are + * actually initialized by a call to diReadSpecial(). + * + * for regular inodes, the iag describing the disk inode + * is read from disk to determine the inode extent address + * for the disk inode. with the inode extent address in + * hand, the page of the extent that contains the disk + * inode is read and the disk inode is copied to the + * incore inode. + * + * PARAMETERS: + * ip - pointer to incore inode to be initialized from disk. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOMEM - insufficient memory + * + */ +int diRead(struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int iagno, ino, extno, rc; + struct inode *ipimap; + struct dinode *dp; + struct iag *iagp; + struct metapage *mp; + s64 blkno, agstart; + struct inomap *imap; + int block_offset; + int inodes_left; + unsigned long pageno; + int rel_inode; + + jfs_info("diRead: ino = %ld", ip->i_ino); + + ipimap = sbi->ipimap; + JFS_IP(ip)->ipimap = ipimap; + + /* determine the iag number for this inode (number) */ + iagno = INOTOIAG(ip->i_ino); + + /* read the iag */ + imap = JFS_IP(ipimap)->i_imap; + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) { + jfs_err("diRead: diIAGRead returned %d", rc); + return (rc); + } + + iagp = (struct iag *) mp->data; + + /* determine inode extent that holds the disk inode */ + ino = ip->i_ino & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + + if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + release_metapage(mp); + return -ESTALE; + } + + /* get disk block number of the page within the inode extent + * that holds the disk inode. 
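+ * for example, with the 512-byte disk inodes and 4K pages defined in
+ * jfs_filsys.h (INOSPERPAGE = 8), inode 19 of an extent normally lives
+ * in the extent's third page at slot 19 & 7 = 3; the block_offset
+ * adjustment below only matters when an OS/2-created extent is not
+ * page aligned.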
+ */ + blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage); + + /* get the ag for the iag */ + agstart = le64_to_cpu(iagp->agstart); + + release_metapage(mp); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + + /* read the page of disk inode */ + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (!mp) { + jfs_err("diRead: read_metapage failed"); + return -EIO; + } + + /* locate the disk inode requested */ + dp = (struct dinode *) mp->data; + dp += rel_inode; + + if (ip->i_ino != le32_to_cpu(dp->di_number)) { + jfs_error(ip->i_sb, "diRead: i_ino != di_number"); + rc = -EIO; + } else if (le32_to_cpu(dp->di_nlink) == 0) + rc = -ESTALE; + else + /* copy the disk inode to the in-memory inode */ + rc = copy_from_dinode(dp, ip); + + release_metapage(mp); + + /* set the ag for the inode */ + JFS_IP(ip)->agstart = agstart; + JFS_IP(ip)->active_ag = -1; + + return (rc); +} + + +/* + * NAME: diReadSpecial() + * + * FUNCTION: initialize a 'special' inode from disk. + * + * this routines handles aggregate level inodes. The + * inode cache cannot differentiate between the + * aggregate inodes and the filesystem inodes, so we + * handle these here. We don't actually use the aggregate + * inode map, since these inodes are at a fixed location + * and in some cases the aggregate inode map isn't initialized + * yet. + * + * PARAMETERS: + * sb - filesystem superblock + * inum - aggregate inode number + * secondary - 1 if secondary aggregate inode table + * + * RETURN VALUES: + * new inode - success + * NULL - i/o error. 
+ */ +struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + uint address; + struct dinode *dp; + struct inode *ip; + struct metapage *mp; + + ip = new_inode(sb); + if (ip == NULL) { + jfs_err("diReadSpecial: new_inode returned NULL!"); + return ip; + } + + if (secondary) { + address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + JFS_IP(ip)->ipimap = sbi->ipaimap2; + } else { + address = AITBL_OFF >> L2PSIZE; + JFS_IP(ip)->ipimap = sbi->ipaimap; + } + + ASSERT(inum < INOSPEREXT); + + ip->i_ino = inum; + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + ip->i_nlink = 1; /* Don't want iput() deleting it */ + iput(ip); + return (NULL); + } + + /* get the pointer to the disk inode of interest */ + dp = (struct dinode *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + if ((copy_from_dinode(dp, ip)) != 0) { + /* handle bad return by returning NULL for ip */ + ip->i_nlink = 1; /* Don't want iput() deleting it */ + iput(ip); + /* release the page */ + release_metapage(mp); + return (NULL); + + } + + ip->i_mapping->a_ops = &jfs_metapage_aops; + mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS); + + /* Allocations to metadata inodes should not affect quotas */ + ip->i_flags |= S_NOQUOTA; + + if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) { + sbi->gengen = le32_to_cpu(dp->di_gengen); + sbi->inostamp = le32_to_cpu(dp->di_inostamp); + } + + /* release the page */ + release_metapage(mp); + + /* + * __mark_inode_dirty expects inodes to be hashed. Since we don't + * want special inodes in the fileset inode space, we make them + * appear hashed, but do not put on any lists. hlist_del() + * will work fine and require no locking. 
+ */ + hlist_add_fake(&ip->i_hash); + + return (ip); +} + +/* + * NAME: diWriteSpecial() + * + * FUNCTION: Write the special inode to disk + * + * PARAMETERS: + * ip - special inode + * secondary - 1 if secondary aggregate inode table + * + * RETURN VALUES: none + */ + +void diWriteSpecial(struct inode *ip, int secondary) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + uint address; + struct dinode *dp; + ino_t inum = ip->i_ino; + struct metapage *mp; + + if (secondary) + address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + else + address = AITBL_OFF >> L2PSIZE; + + ASSERT(inum < INOSPEREXT); + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + jfs_err("diWriteSpecial: failed to read aggregate inode " + "extent!"); + return; + } + + /* get the pointer to the disk inode of interest */ + dp = (struct dinode *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + copy_to_dinode(dp, ip); + memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288); + + if (inum == FILESYSTEM_I) + dp->di_gengen = cpu_to_le32(sbi->gengen); + + /* write the page */ + write_metapage(mp); +} + +/* + * NAME: diFreeSpecial() + * + * FUNCTION: Free allocated space for special inode + */ +void diFreeSpecial(struct inode *ip) +{ + if (ip == NULL) { + jfs_err("diFreeSpecial called with NULL ip!"); + return; + } + filemap_write_and_wait(ip->i_mapping); + truncate_inode_pages(ip->i_mapping, 0); + iput(ip); +} + + + +/* + * NAME: diWrite() + * + * FUNCTION: write the on-disk inode portion of the in-memory inode + * to its corresponding on-disk inode. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * the inode contains the inode extent address for the disk + * inode. with the inode extent address in hand, the + * page of the extent that contains the disk inode is + * read and the disk inode portion of the incore inode + * is copied to the disk inode. + * + * PARAMETERS: + * tid - transacation id + * ip - pointer to incore inode to be written to the inode extent. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. 
+ */ +int diWrite(tid_t tid, struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + s32 ino; + struct dinode *dp; + s64 blkno; + int block_offset; + int inodes_left; + struct metapage *mp; + unsigned long pageno; + int rel_inode; + int dioffset; + struct inode *ipimap; + uint type; + lid_t lid; + struct tlock *ditlck, *tlck; + struct linelock *dilinelock, *ilinelock; + struct lv *lv; + int n; + + ipimap = jfs_ip->ipimap; + + ino = ip->i_ino & (INOSPERIAG - 1); + + if (!addressPXD(&(jfs_ip->ixpxd)) || + (lengthPXD(&(jfs_ip->ixpxd)) != + JFS_IP(ipimap)->i_imap->im_nbperiext)) { + jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); + return -EIO; + } + + /* + * read the page of disk inode containing the specified inode: + */ + /* compute the block address of the page */ + blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + /* read the page of disk inode */ + retry: + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (!mp) + return -EIO; + + /* get the pointer to the disk inode */ + dp = (struct dinode *) mp->data; + dp += rel_inode; + + dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE; + + /* + * acquire transaction lock on the on-disk inode; + * N.B. tlock is acquired on ipimap not ip; + */ + if ((ditlck = + txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL) + goto retry; + dilinelock = (struct linelock *) & ditlck->lock; + + /* + * copy btree root from in-memory inode to on-disk inode + * + * (tlock is taken from inline B+-tree root in in-memory + * inode when the B+-tree root is updated, which is pointed + * by jfs_ip->blid as well as being on tx tlock list) + * + * further processing of btree root is based on the copy + * in in-memory inode, where txLog() will log from, and, + * for xtree root, txUpdateMap() will update map and reset + * XAD_NEW bit; + */ + + if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) { + /* + * This is the special xtree inside the directory for storing + * the directory table + */ + xtpage_t *p, *xp; + xad_t *xad; + + jfs_ip->xtlid = 0; + tlck = lid_to_tlock(lid); + assert(tlck->type & tlckXTREE); + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (struct linelock *) & tlck->lock; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = (xtpage_t *) &dp->di_dirtable; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + + if ((lid = jfs_ip->blid) == 0) + goto inlineData; + jfs_ip->blid = 0; + + tlck = lid_to_tlock(lid); + type = tlck->type; + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (struct linelock *) & tlck->lock; + + /* + * regular file: 16 byte (XAD slot) granularity + */ + if (type & tlckXTREE) { + xtpage_t *p, *xp; + xad_t 
*xad; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = &dp->di_xtroot; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + /* + * directory: 32 byte (directory entry slot) granularity + */ + else if (type & tlckDTREE) { + dtpage_t *p, *xp; + + /* + * copy dtree root from inode to dinode: + */ + p = (dtpage_t *) &jfs_ip->i_dtroot; + xp = (dtpage_t *) & dp->di_dtroot; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->slot[lv->offset], &p->slot[lv->offset], + lv->length << L2DTSLOTSIZE); + } + } else { + jfs_err("diWrite: UFO tlock"); + } + + inlineData: + /* + * copy inline symlink from in-memory inode to on-disk inode + */ + if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) { + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE; + lv->length = 2; + memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE); + dilinelock->index++; + } + /* + * copy inline data from in-memory inode to on-disk inode: + * 128 byte slot granularity + */ + if (test_cflag(COMMIT_Inlineea, ip)) { + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE; + lv->length = 1; + memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE); + dilinelock->index++; + + clear_cflag(COMMIT_Inlineea, ip); + } + + /* + * lock/copy inode base: 128 byte slot granularity + */ + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = dioffset >> L2INODESLOTSIZE; + copy_to_dinode(dp, ip); + if (test_and_clear_cflag(COMMIT_Dirtable, ip)) { + lv->length = 2; + memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96); + } else + lv->length = 1; + dilinelock->index++; + + /* release the buffer holding the updated on-disk inode. + * the buffer will be later written by commit processing. + */ + write_metapage(mp); + + return (rc); +} + + +/* + * NAME: diFree(ip) + * + * FUNCTION: free a specified inode from the inode working map + * for a fileset or aggregate. + * + * if the inode to be freed represents the first (only) + * free inode within the iag, the iag will be placed on + * the ag free inode list. + * + * freeing the inode will cause the inode extent to be + * freed if the inode is the only allocated inode within + * the extent. in this case all the disk resource backing + * up the inode extent will be freed. in addition, the iag + * will be placed on the ag extent free list if the extent + * is the first free extent in the iag. if freeing the + * extent also means that no free inodes will exist for + * the iag, the iag will also be removed from the ag free + * inode list. + * + * the iag describing the inode will be freed if the extent + * is to be freed and it is the only backed extent within + * the iag. in this case, the iag will be removed from the + * ag free extent list and ag free inode list and placed on + * the inode map's free iag list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. under this + * approach, all required buffers are obtained before making + * any updates and are held until all updates are complete. 
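+ * in this routine that can mean holding up to five metapages at
+ * once: the iag being freed from (mp below) and the iags at the head
+ * of, or adjacent to it on, the ag extent free list and ag inode free
+ * list (amp, bmp, cmp and dmp), none of which is written back until
+ * every list and map update has been made.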
+ *
+ * PARAMETERS:
+ * ip - inode to be freed.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -EIO - i/o error.
+ */
+int diFree(struct inode *ip)
+{
+ int rc;
+ ino_t inum = ip->i_ino;
+ struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp;
+ struct metapage *mp, *amp, *bmp, *cmp, *dmp;
+ int iagno, ino, extno, bitno, sword, agno;
+ int back, fwd;
+ u32 bitmap, mask;
+ struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
+ struct inomap *imap = JFS_IP(ipimap)->i_imap;
+ pxd_t freepxd;
+ tid_t tid;
+ struct inode *iplist[3];
+ struct tlock *tlck;
+ struct pxd_lock *pxdlock;
+
+ /*
+ * This is just to suppress compiler warnings. The same logic that
+ * references these variables is used to initialize them.
+ */
+ aiagp = biagp = ciagp = diagp = NULL;
+
+ /* get the iag number containing the inode.
+ */
+ iagno = INOTOIAG(inum);
+
+ /* make sure that the iag is contained within
+ * the map.
+ */
+ if (iagno >= imap->im_nextiag) {
+ print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
+ imap, 32, 0);
+ jfs_error(ip->i_sb,
+ "diFree: inum = %d, iagno = %d, nextiag = %d",
+ (uint) inum, iagno, imap->im_nextiag);
+ return -EIO;
+ }
+
+ /* get the allocation group for this ino.
+ */
+ agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb));
+
+ /* Lock the AG specific inode map information
+ */
+ AG_LOCK(imap, agno);
+
+ /* Obtain read lock in imap inode. Don't release it until we have
+ * read all of the IAG's that we are going to.
+ */
+ IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+
+ /* read the iag.
+ */
+ if ((rc = diIAGRead(imap, iagno, &mp))) {
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ return (rc);
+ }
+ iagp = (struct iag *) mp->data;
+
+ /* get the inode number and extent number of the inode within
+ * the iag and the inode number within the extent.
+ */
+ ino = inum & (INOSPERIAG - 1);
+ extno = ino >> L2INOSPEREXT;
+ bitno = ino & (INOSPEREXT - 1);
+ mask = HIGHORDER >> bitno;
+
+ if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+ jfs_error(ip->i_sb,
+ "diFree: wmap shows inode already free");
+ }
+
+ if (!addressPXD(&iagp->inoext[extno])) {
+ release_metapage(mp);
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ jfs_error(ip->i_sb, "diFree: invalid inoext");
+ return -EIO;
+ }
+
+ /* compute the bitmap for the extent reflecting the freed inode.
+ */
+ bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
+
+ if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
+ release_metapage(mp);
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ jfs_error(ip->i_sb, "diFree: numfree > numinos");
+ return -EIO;
+ }
+ /*
+ * inode extent still has some inodes or below low water mark:
+ * keep the inode extent;
+ */
+ if (bitmap ||
+ imap->im_agctl[agno].numfree < 96 ||
+ (imap->im_agctl[agno].numfree < 288 &&
+ (((imap->im_agctl[agno].numfree * 100) /
+ imap->im_agctl[agno].numinos) <= 25))) {
+ /* if the iag currently has no free inodes (i.e.,
+ * the inode being freed is the first free inode of iag),
+ * insert the iag at head of the inode free list for the ag.
+ */
+ if (iagp->nfreeinos == 0) {
+ /* check if there are any iags on the ag inode
+ * free list. if so, read the first one so that
+ * we can link the current iag onto the list at
+ * the head.
+ */
+ if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
+ /* read the iag that currently is the head
+ * of the list.
+ */
+ if ((rc = diIAGRead(imap, fwd, &amp))) {
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ release_metapage(mp);
+ return (rc);
+ }
+ aiagp = (struct iag *) amp->data;
+
+ /* make current head point back to the iag.
+ */
+ aiagp->inofreeback = cpu_to_le32(iagno);
+
+ write_metapage(amp);
+ }
+
+ /* iag points forward to current head and iag
+ * becomes the new head of the list.
+ */
+ iagp->inofreefwd =
+ cpu_to_le32(imap->im_agctl[agno].inofree);
+ iagp->inofreeback = cpu_to_le32(-1);
+ imap->im_agctl[agno].inofree = iagno;
+ }
+ IREAD_UNLOCK(ipimap);
+
+ /* update the free inode summary map for the extent if
+ * freeing the inode means the extent will now have free
+ * inodes (i.e., the inode being freed is the first free
+ * inode of extent),
+ */
+ if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
+ sword = extno >> L2EXTSPERSUM;
+ bitno = extno & (EXTSPERSUM - 1);
+ iagp->inosmap[sword] &=
+ cpu_to_le32(~(HIGHORDER >> bitno));
+ }
+
+ /* update the bitmap.
+ */
+ iagp->wmap[extno] = cpu_to_le32(bitmap);
+
+ /* update the free inode counts at the iag, ag and
+ * map level.
+ */
+ le32_add_cpu(&iagp->nfreeinos, 1);
+ imap->im_agctl[agno].numfree += 1;
+ atomic_inc(&imap->im_numfree);
+
+ /* release the AG inode map lock
+ */
+ AG_UNLOCK(imap, agno);
+
+ /* write the iag */
+ write_metapage(mp);
+
+ return (0);
+ }
+
+
+ /*
+ * inode extent has become free and above low water mark:
+ * free the inode extent;
+ */
+
+ /*
+ * prepare to update iag list(s) (careful update step 1)
+ */
+ amp = bmp = cmp = dmp = NULL;
+ fwd = back = -1;
+
+ /* check if the iag currently has no free extents. if so,
+ * it will be placed on the head of the ag extent free list.
+ */
+ if (iagp->nfreeexts == 0) {
+ /* check if the ag extent free list has any iags.
+ * if so, read the iag at the head of the list now.
+ * this (head) iag will be updated later to reflect
+ * the addition of the current iag at the head of
+ * the list.
+ */
+ if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ goto error_out;
+ aiagp = (struct iag *) amp->data;
+ }
+ } else {
+ /* iag has free extents. check if the addition of a free
+ * extent will cause all extents to be free within this
+ * iag. if so, the iag will be removed from the ag extent
+ * free list and placed on the inode map's free iag list.
+ */
+ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
+ /* in preparation for removing the iag from the
+ * ag extent free list, read the iags preceding
+ * and following the iag on the ag extent free
+ * list.
+ */
+ if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ goto error_out;
+ aiagp = (struct iag *) amp->data;
+ }
+
+ if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
+ if ((rc = diIAGRead(imap, back, &bmp)))
+ goto error_out;
+ biagp = (struct iag *) bmp->data;
+ }
+ }
+ }
+
+ /* remove the iag from the ag inode free list if freeing
+ * this extent causes the iag to have no free inodes.
+ */
+ if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
+ int inofreeback = le32_to_cpu(iagp->inofreeback);
+ int inofreefwd = le32_to_cpu(iagp->inofreefwd);
+
+ /* in preparation for removing the iag from the
+ * ag inode free list, read the iags preceding
+ * and following the iag on the ag inode free
+ * list. before reading these iags, we must make
+ * sure that we already don't have them in hand
+ * from up above, since re-reading an iag (buffer)
+ * we are currently holding would cause a deadlock.
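+ * that is why inofreefwd and inofreeback are first compared against
+ * the iags already read above (fwd and back) and, when they match,
+ * the buffers already in hand (amp, bmp) are reused rather than
+ * calling diIAGRead() again.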
+ */ + if (inofreefwd >= 0) { + + if (inofreefwd == fwd) + ciagp = (struct iag *) amp->data; + else if (inofreefwd == back) + ciagp = (struct iag *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreefwd, &cmp))) + goto error_out; + ciagp = (struct iag *) cmp->data; + } + assert(ciagp != NULL); + } + + if (inofreeback >= 0) { + if (inofreeback == fwd) + diagp = (struct iag *) amp->data; + else if (inofreeback == back) + diagp = (struct iag *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreeback, &dmp))) + goto error_out; + diagp = (struct iag *) dmp->data; + } + assert(diagp != NULL); + } + } + + IREAD_UNLOCK(ipimap); + + /* + * invalidate any page of the inode extent freed from buffer cache; + */ + freepxd = iagp->inoext[extno]; + invalidate_pxd_metapages(ip, freepxd); + + /* + * update iag list(s) (careful update step 2) + */ + /* add the iag to the ag extent free list if this is the + * first free extent for the iag. + */ + if (iagp->nfreeexts == 0) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = + cpu_to_le32(imap->im_agctl[agno].extfree); + iagp->extfreeback = cpu_to_le32(-1); + imap->im_agctl[agno].extfree = iagno; + } else { + /* remove the iag from the ag extent list if all extents + * are now free and place it on the inode map iag free list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + } + + /* remove the iag from the ag inode free list if freeing + * this extent causes the iag to have no free inodes. + */ + if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) + ciagp->inofreeback = iagp->inofreeback; + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) + diagp->inofreefwd = iagp->inofreefwd; + else + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + } + + /* update the inode extent address and working map + * to reflect the free extent. + * the permanent map should have been updated already + * for the inode being freed. + */ + if (iagp->pmap[extno] != 0) { + jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); + } + iagp->wmap[extno] = 0; + PXDlength(&iagp->inoext[extno], 0); + PXDaddress(&iagp->inoext[extno], 0); + + /* update the free extent and free inode summary maps + * to reflect the freed extent. + * the inode summary map is marked to indicate no inodes + * available for the freed extent. + */ + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + mask = HIGHORDER >> bitno; + iagp->inosmap[sword] |= cpu_to_le32(mask); + iagp->extsmap[sword] &= cpu_to_le32(~mask); + + /* update the number of free inodes and number of free extents + * for the iag. + */ + le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, 1); + + /* update the number of free inodes and backed inodes + * at the ag and inode map level. 
+ */ + imap->im_agctl[agno].numfree -= (INOSPEREXT - 1); + imap->im_agctl[agno].numinos -= INOSPEREXT; + atomic_sub(INOSPEREXT - 1, &imap->im_numfree); + atomic_sub(INOSPEREXT, &imap->im_numinos); + + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + if (dmp) + write_metapage(dmp); + + /* + * start transaction to update block allocation map + * for the inode extent freed; + * + * N.B. AG_LOCK is released and iag will be released below, and + * other thread may allocate inode from/reusing the ixad freed + * BUT with new/different backing inode extent from the extent + * to be freed by the transaction; + */ + tid = txBegin(ipimap->i_sb, COMMIT_FORCE); + mutex_lock(&JFS_IP(ipimap)->commit_mutex); + + /* acquire tlock of the iag page of the freed ixad + * to force the page NOHOMEOK (even though no data is + * logged from the iag page) until NOREDOPAGE|FREEXTENT log + * for the free of the extent is committed; + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor; + */ + tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = freepxd; + pxdlock->index = 1; + + write_metapage(mp); + + iplist[0] = ipimap; + + /* + * logredo needs the IAG number and IAG extent index in order + * to ensure that the IMap is consistent. The least disruptive + * way to pass these values through to the transaction manager + * is in the iplist array. + * + * It's not pretty, but it works. + */ + iplist[1] = (struct inode *) (size_t)iagno; + iplist[2] = (struct inode *) (size_t)extno; + + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + /* unlock the AG inode map information */ + AG_UNLOCK(imap, agno); + + return (0); + + error_out: + IREAD_UNLOCK(ipimap); + + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + if (dmp) + release_metapage(dmp); + + AG_UNLOCK(imap, agno); + + release_metapage(mp); + + return (rc); +} + +/* + * There are several places in the diAlloc* routines where we initialize + * the inode. + */ +static inline void +diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + + ip->i_ino = (iagno << L2INOSPERIAG) + ino; + jfs_ip->ixpxd = iagp->inoext[extno]; + jfs_ip->agstart = le64_to_cpu(iagp->agstart); + jfs_ip->active_ag = -1; +} + + +/* + * NAME: diAlloc(pip,dir,ip) + * + * FUNCTION: allocate a disk inode from the inode working map + * for a fileset or aggregate. + * + * PARAMETERS: + * pip - pointer to incore inode for the parent inode. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +int diAlloc(struct inode *pip, bool dir, struct inode *ip) +{ + int rc, ino, iagno, addext, extno, bitno, sword; + int nwords, rem, i, agno; + u32 mask, inosmap, extsmap; + struct inode *ipimap; + struct metapage *mp; + ino_t inum; + struct iag *iagp; + struct inomap *imap; + + /* get the pointers to the inode map inode and the + * corresponding imap control structure. 
+ */ + ipimap = JFS_SBI(pip->i_sb)->ipimap; + imap = JFS_IP(ipimap)->i_imap; + JFS_IP(ip)->ipimap = ipimap; + JFS_IP(ip)->fileset = FILESYSTEM_I; + + /* for a directory, the allocation policy is to start + * at the ag level using the preferred ag. + */ + if (dir) { + agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); + AG_LOCK(imap, agno); + goto tryag; + } + + /* for files, the policy starts off by trying to allocate from + * the same iag containing the parent disk inode: + * try to allocate the new disk inode close to the parent disk + * inode, using parent disk inode number + 1 as the allocation + * hint. (we use a left-to-right policy to attempt to avoid + * moving backward on the disk.) compute the hint within the + * file system and the iag. + */ + + /* get the ag number of this iag */ + agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); + + if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { + /* + * There is an open file actively growing. We want to + * allocate new inodes from a different ag to avoid + * fragmentation problems. + */ + agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); + AG_LOCK(imap, agno); + goto tryag; + } + + inum = pip->i_ino + 1; + ino = inum & (INOSPERIAG - 1); + + /* back off the hint if it is outside of the iag */ + if (ino == 0) + inum = pip->i_ino; + + /* lock the AG inode map information */ + AG_LOCK(imap, agno); + + /* Get read lock on imap inode */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* get the iag number and read the iag */ + iagno = INOTOIAG(inum); + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* determine if new inode extent is allowed to be added to the iag. + * new inode extent can be added to the iag if the ag + * has less than 32 free disk inodes and the iag has free extents. + */ + addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); + + /* + * try to allocate from the IAG + */ + /* check if the inode may be allocated from the iag + * (i.e. the inode has free inodes or new extent can be added). + */ + if (iagp->nfreeinos || addext) { + /* determine the extent number of the hint. + */ + extno = ino >> L2INOSPEREXT; + + /* check if the extent containing the hint has backed + * inodes. if so, try to allocate within this extent. + */ + if (addressPXD(&iagp->inoext[extno])) { + bitno = ino & (INOSPEREXT - 1); + if ((bitno = + diFindFree(le32_to_cpu(iagp->wmap[extno]), + bitno)) + < INOSPEREXT) { + ino = (extno << L2INOSPEREXT) + bitno; + + /* a free inode (bit) was found within this + * extent, so allocate it. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) { + assert(rc == -EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + } + + if (!addext) + extno = + (extno == + EXTSPERIAG - 1) ? 0 : extno + 1; + } + + /* + * no free inodes within the extent containing the hint. + * + * try to allocate from the backed extents following + * hint or, if appropriate (i.e. addext is true), allocate + * an extent of free inodes at or following the extent + * containing the hint. + * + * the free inode and free extent summary maps are used + * here, so determine the starting summary map position + * and the number of words we'll have to examine. 
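+ * (for instance, assuming the usual 32 extents per summary word, a
+ * hint in extent 37 starts the scan at summary word 1 with the bits
+ * for extents 32..36 masked off on the first pass and revisited only
+ * if the scan wraps around).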
again, + * the approach is to allocate following the hint, so we + * might have to initially ignore prior bits of the summary + * map that represent extents prior to the extent containing + * the hint and later revisit these bits. + */ + bitno = extno & (EXTSPERSUM - 1); + nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1; + sword = extno >> L2EXTSPERSUM; + + /* mask any prior bits for the starting words of the + * summary map. + */ + mask = ONES << (EXTSPERSUM - bitno); + inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; + extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; + + /* scan the free inode and free extent summary maps for + * free resources. + */ + for (i = 0; i < nwords; i++) { + /* check if this word of the free inode summary + * map describes an extent with free inodes. + */ + if (~inosmap) { + /* an extent with free inodes has been + * found. determine the extent number + * and the inode number within the extent. + */ + rem = diFindFree(inosmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), + 0); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(ipimap); + release_metapage(mp); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, + "diAlloc: can't find free bit " + "in wmap"); + return -EIO; + } + + /* determine the inode number within the + * iag and allocate the inode from the + * map. + */ + ino = (extno << L2INOSPEREXT) + rem; + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) + assert(rc == -EIO); + else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + + } + + /* check if we may allocate an extent of free + * inodes and whether this word of the free + * extents summary map describes a free extent. + */ + if (addext && ~extsmap) { + /* a free extent has been found. determine + * the extent number. + */ + rem = diFindFree(extsmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + + /* allocate an extent of free inodes. + */ + if ((rc = diNewExt(imap, iagp, extno))) { + /* if there is no disk space for a + * new extent, try to allocate the + * disk inode from somewhere else. + */ + if (rc == -ENOSPC) + break; + + assert(rc == -EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, + extno << L2INOSPEREXT, + extno, iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + /* free the imap inode & the AG lock & return. + */ + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + + /* move on to the next set of summary map words. + */ + sword = (sword == SMAPSZ - 1) ? 0 : sword + 1; + inosmap = le32_to_cpu(iagp->inosmap[sword]); + extsmap = le32_to_cpu(iagp->extsmap[sword]); + } + } + /* unlock imap inode */ + IREAD_UNLOCK(ipimap); + + /* nothing doing in this iag, so release it. */ + release_metapage(mp); + + tryag: + /* + * try to allocate anywhere within the same AG as the parent inode. + */ + rc = diAllocAG(imap, agno, dir, ip); + + AG_UNLOCK(imap, agno); + + if (rc != -ENOSPC) + return (rc); + + /* + * try to allocate in any AG. + */ + return (diAllocAny(imap, agno, dir, ip)); +} + + +/* + * NAME: diAllocAG(imap,agno,dir,ip) + * + * FUNCTION: allocate a disk inode from the allocation group. 
+ * + * this routine first determines if a new extent of free + * inodes should be added for the allocation group, with + * the current request satisfied from this extent. if this + * is the case, an attempt will be made to do just that. if + * this attempt fails or it has been determined that a new + * extent should not be added, an attempt is made to satisfy + * the request by allocating an existing (backed) free inode + * from the allocation group. + * + * PRE CONDITION: Already have the AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group to allocate from. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to the new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int +diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) +{ + int rc, addext, numfree, numinos; + + /* get the number of free and the number of backed disk + * inodes currently within the ag. + */ + numfree = imap->im_agctl[agno].numfree; + numinos = imap->im_agctl[agno].numinos; + + if (numfree > numinos) { + jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); + return -EIO; + } + + /* determine if we should allocate a new extent of free inodes + * within the ag: for directory inodes, add a new extent + * if there are a small number of free inodes or number of free + * inodes is a small percentage of the number of backed inodes. + */ + if (dir) + addext = (numfree < 64 || + (numfree < 256 + && ((numfree * 100) / numinos) <= 20)); + else + addext = (numfree == 0); + + /* + * try to allocate a new extent of free inodes. + */ + if (addext) { + /* if free space is not available for this new extent, try + * below to allocate a free and existing (already backed) + * inode from the ag. + */ + if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC) + return (rc); + } + + /* + * try to allocate an existing free inode from the ag. + */ + return (diAllocIno(imap, agno, ip)); +} + + +/* + * NAME: diAllocAny(imap,agno,dir,iap) + * + * FUNCTION: allocate a disk inode from any other allocation group. + * + * this routine is called when an allocation attempt within + * the primary allocation group has failed. if attempts to + * allocate an inode from any allocation group other than the + * specified primary group. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - primary allocation group (to avoid). + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int +diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) +{ + int ag, rc; + int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag; + + + /* try to allocate from the ags following agno up to + * the maximum ag number. + */ + for (ag = agno + 1; ag <= maxag; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != -ENOSPC) + return (rc); + } + + /* try to allocate from the ags in front of agno. 
+ */ + for (ag = 0; ag < agno; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != -ENOSPC) + return (rc); + } + + /* no free disk inodes. + */ + return -ENOSPC; +} + + +/* + * NAME: diAllocIno(imap,agno,ip) + * + * FUNCTION: allocate a disk inode from the allocation group's free + * inode list, returning an error if this free list is + * empty (i.e. no iags on the list). + * + * allocation occurs from the first iag on the list using + * the iag's free inode summary map to find the leftmost + * free inode in the iag. + * + * PRE CONDITION: Already have AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) +{ + int iagno, ino, rc, rem, extno, sword; + struct metapage *mp; + struct iag *iagp; + + /* check if there are iags on the ag's free inode list. + */ + if ((iagno = imap->im_agctl[agno].inofree) < 0) + return -ENOSPC; + + /* obtain read lock on imap inode */ + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); + + /* read the iag at the head of the list. + */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(imap->im_ipimap); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* better be free inodes in this iag if it is on the + * list. + */ + if (!iagp->nfreeinos) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, + "diAllocIno: nfreeinos = 0, but iag on freelist"); + return -EIO; + } + + /* scan the free inode summary map to find an extent + * with free inodes. + */ + for (sword = 0;; sword++) { + if (sword >= SMAPSZ) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, + "diAllocIno: free inode not found in summary map"); + return -EIO; + } + + if (~iagp->inosmap[sword]) + break; + } + + /* found a extent with free inodes. determine + * the extent number. + */ + rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0); + if (rem >= EXTSPERSUM) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "diAllocIno: no free extent found"); + return -EIO; + } + extno = (sword << L2EXTSPERSUM) + rem; + + /* find the first free inode in the extent. + */ + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "diAllocIno: free inode not found"); + return -EIO; + } + + /* compute the inode number within the iag. + */ + ino = (extno << L2INOSPEREXT) + rem; + + /* allocate the inode. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + release_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, ino, extno, iagp); + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocExt(imap,agno,ip) + * + * FUNCTION: add a new extent of free inodes to an iag, allocating + * an inode from this extent to satisfy the current allocation + * request. + * + * this routine first tries to find an existing iag with free + * extents through the ag free extent list. 
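The two-level lookup in diAllocIno() above goes summary word, then extent, then inode: a zero bit in inosmap picks the extent, and a zero bit in that extent's wmap word picks the inode. The standalone sketch below models only that index arithmetic; it assumes EXTSPERSUM = 32 and L2EXTSPERSUM = 5 from jfs_imap.h and the usual 32 inodes per extent (L2INOSPEREXT = 5), and it models diFindFree() as a leftmost-zero scan.

/* Standalone model of diAllocIno()'s summary-map arithmetic (not kernel code). */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define L2EXTSPERSUM  5      /* 32 extents per summary word   */
#define L2INOSPEREXT  5      /* assumed: 32 inodes per extent */
#define HIGHORDER     0x80000000u

/* leftmost zero bit, in the spirit of diFindFree() */
static int find_free(uint32_t word, int start)
{
	int bitno;
	for (word <<= start, bitno = start; bitno < 32; bitno++, word <<= 1)
		if ((word & HIGHORDER) == 0)
			break;
	return bitno;
}

int main(void)
{
	/* hypothetical maps: summary word 2 has its extent bit 2 clear,
	 * and that extent's working map has its inode bit 6 clear */
	int sword = 2;
	uint32_t inosmap_word = 0xdfffffffu;
	uint32_t wmap_word    = 0xfdffffffu;

	int rem    = find_free(inosmap_word, 0);
	int extno  = (sword << L2EXTSPERSUM) + rem;
	int inobit = find_free(wmap_word, 0);
	int ino    = (extno << L2INOSPEREXT) + inobit;

	assert(rem == 2 && extno == 66);
	assert(inobit == 6 && ino == 66 * 32 + 6);
	printf("extent %d, inode %d within the iag\n", extno, ino);
	return 0;
}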
if list is not + * empty, the head of the list will be selected as the home + * of the new extent of free inodes. otherwise (the list is + * empty), a new iag will be allocated for the ag to contain + * the extent. + * + * once an iag has been selected, the free extent summary map + * is used to locate a free extent within the iag and diNewExt() + * is called to initialize the extent, with initialization + * including the allocation of the first inode of the extent + * for the purpose of satisfying this request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group number. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) +{ + int rem, iagno, sword, extno, rc; + struct metapage *mp; + struct iag *iagp; + + /* check if the ag has any iags with free extents. if not, + * allocate a new iag for the ag. + */ + if ((iagno = imap->im_agctl[agno].extfree) < 0) { + /* If successful, diNewIAG will obtain the read lock on the + * imap inode. + */ + if ((rc = diNewIAG(imap, &iagno, agno, &mp))) { + return (rc); + } + iagp = (struct iag *) mp->data; + + /* set the ag number if this a brand new iag + */ + iagp->agstart = + cpu_to_le64(AGTOBLK(agno, imap->im_ipimap)); + } else { + /* read the iag. + */ + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "diAllocExt: error reading iag"); + return rc; + } + iagp = (struct iag *) mp->data; + } + + /* using the free extent summary map, find a free extent. + */ + for (sword = 0;; sword++) { + if (sword >= SMAPSZ) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, + "diAllocExt: free ext summary map not found"); + return -EIO; + } + if (~iagp->extsmap[sword]) + break; + } + + /* determine the extent number of the free extent. + */ + rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0); + if (rem >= EXTSPERSUM) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "diAllocExt: free extent not found"); + return -EIO; + } + extno = (sword << L2EXTSPERSUM) + rem; + + /* initialize the new extent. + */ + rc = diNewExt(imap, iagp, extno); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + /* something bad happened. if a new iag was allocated, + * place it back on the inode map's iag free list, and + * clear the ag number information. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + write_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp); + + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocBit(imap,iagp,ino) + * + * FUNCTION: allocate a backed inode from an iag. + * + * this routine performs the mechanics of allocating a + * specified inode from a backed extent. + * + * if the inode to be allocated represents the last free + * inode within the iag, the iag will be removed from the + * ag free inode list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. 
under this + * approach, all required buffers are obtained before making + * any updates and are held until all updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * ino - inode number to be allocated within the iag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) +{ + int extno, bitno, agno, sword, rc; + struct metapage *amp = NULL, *bmp = NULL; + struct iag *aiagp = NULL, *biagp = NULL; + u32 mask; + + /* check if this is the last free inode within the iag. + * if so, it will have to be removed from the ag free + * inode list, so get the iags preceding and following + * it on the list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) { + if ((rc = + diIAGRead(imap, le32_to_cpu(iagp->inofreefwd), + &amp))) + return (rc); + aiagp = (struct iag *) amp->data; + } + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) { + if ((rc = + diIAGRead(imap, + le32_to_cpu(iagp->inofreeback), + &bmp))) { + if (amp) + release_metapage(amp); + return (rc); + } + biagp = (struct iag *) bmp->data; + } + } + + /* get the ag number, extent number, inode number within + * the extent. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb)); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + + /* compute the mask for setting the map. + */ + mask = HIGHORDER >> bitno; + + /* the inode should be free and backed. + */ + if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) || + ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + + jfs_error(imap->im_ipimap->i_sb, + "diAllocBit: iag inconsistent"); + return -EIO; + } + + /* mark the inode as allocated in the working map. + */ + iagp->wmap[extno] |= cpu_to_le32(mask); + + /* check if all inodes within the extent are now + * allocated. if so, update the free inode summary + * map to reflect this. + */ + if (iagp->wmap[extno] == cpu_to_le32(ONES)) { + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno); + } + + /* if this was the last free inode in the iag, remove the + * iag from the ag free inode list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + if (amp) { + aiagp->inofreeback = iagp->inofreeback; + write_metapage(amp); + } + + if (bmp) { + biagp->inofreefwd = iagp->inofreefwd; + write_metapage(bmp); + } else { + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + } + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + } + + /* update the free inode count at the iag, ag, inode + * map levels. + */ + le32_add_cpu(&iagp->nfreeinos, -1); + imap->im_agctl[agno].numfree -= 1; + atomic_dec(&imap->im_numfree); + + return (0); +} + + +/* + * NAME: diNewExt(imap,iagp,extno) + * + * FUNCTION: initialize a new extent of inodes for an iag, allocating + * the first inode of the extent for use for the current + * allocation request. + * + * disk resources are allocated for the new extent of inodes + * and the inodes themselves are initialized to reflect their + * existence within the extent (i.e.
their inode numbers and + * inode extent addresses are set) and their initial state + * (mode and link count are set to zero). + * + * if the iag is new, it is not yet on an ag extent free list + * but will now be placed on this list. + * + * if the allocation of the new extent causes the iag to + * have no free extent, the iag will be removed from the + * ag extent free list. + * + * if the iag has no free backed inodes, it will be placed + * on the ag free inode list, since the addition of the new + * extent will now cause it to have free inodes. + * + * a careful update approach is used to provide consistency + * (i.e. list consistency) in the face of updates to multiple + * buffers. under this approach, all required buffers are + * obtained before making any updates and are held until all + * updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * extno - extent number. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) +{ + int agno, iagno, fwd, back, freei = 0, sword, rc; + struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL; + struct metapage *amp, *bmp, *cmp, *dmp; + struct inode *ipimap; + s64 blkno, hint; + int i, j; + u32 mask; + ino_t ino; + struct dinode *dp; + struct jfs_sb_info *sbi; + + /* better have free extents. + */ + if (!iagp->nfreeexts) { + jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); + return -EIO; + } + + /* get the inode map inode. + */ + ipimap = imap->im_ipimap; + sbi = JFS_SBI(ipimap->i_sb); + + amp = bmp = cmp = NULL; + + /* get the ag and iag numbers for this iag. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); + iagno = le32_to_cpu(iagp->iagnum); + + /* check if this is the last free extent within the + * iag. if so, the iag must be removed from the ag + * free extent list, so get the iags preceding and + * following the iag on this list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { + if ((rc = diIAGRead(imap, fwd, &amp))) + return (rc); + aiagp = (struct iag *) amp->data; + } + + if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { + if ((rc = diIAGRead(imap, back, &bmp))) + goto error_out; + biagp = (struct iag *) bmp->data; + } + } else { + /* the iag has free extents. if all extents are free + * (as is the case for a newly allocated iag), the iag + * must be added to the ag free extent list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. + */ + fwd = back = -1; + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if ((fwd = imap->im_agctl[agno].extfree) >= 0) { + if ((rc = diIAGRead(imap, fwd, &amp))) + goto error_out; + aiagp = (struct iag *) amp->data; + } + } + } + + /* check if the iag has no free inodes. if so, the iag + * will have to be added to the ag free inode list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. in doing this, we must + * check if we already have the iag at the head of + * the list in hand.
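Earlier, diAllocBit() commits a single inode to the working map with a HIGHORDER >> bitno mask, and once a wmap word reaches all ones the matching inosmap bit is set so later scans skip that extent. The sketch below is a minimal standalone model of that bit bookkeeping, assuming 32 inodes per extent (L2INOSPEREXT = 5) and the EXTSPERSUM = 32 summary-map geometry from jfs_imap.h.

/* Standalone model of diAllocBit()'s map updates (not kernel code). */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define INOSPEREXT   32            /* assumed inodes per extent */
#define L2INOSPEREXT 5
#define EXTSPERSUM   32
#define L2EXTSPERSUM 5
#define HIGHORDER    0x80000000u
#define ONES         0xffffffffu

int main(void)
{
	uint32_t wmap[128] = { 0 };    /* working map, one word per extent */
	uint32_t inosmap[4] = { 0 };   /* summary map, one bit per extent  */
	int ino = 97;                  /* hypothetical inode within the iag */

	int extno = ino >> L2INOSPEREXT;        /* 97 / 32 = 3 */
	int bitno = ino & (INOSPEREXT - 1);     /* 97 % 32 = 1 */
	uint32_t mask = HIGHORDER >> bitno;

	wmap[extno] = ONES & ~mask;    /* pretend every other inode is taken */
	wmap[extno] |= mask;           /* allocate inode 97 in the working map */

	/* extent now full: flag it in the free inode summary map */
	if (wmap[extno] == ONES) {
		int sword = extno >> L2EXTSPERSUM;
		int sbit = extno & (EXTSPERSUM - 1);
		inosmap[sword] |= HIGHORDER >> sbit;
	}

	assert(wmap[3] == ONES);
	assert(inosmap[0] == (HIGHORDER >> 3));
	printf("extent %d full, summary bit set\n", extno);
	return 0;
}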
+ */ + if (iagp->nfreeinos == 0) { + freei = imap->im_agctl[agno].inofree; + + if (freei >= 0) { + if (freei == fwd) { + ciagp = aiagp; + } else if (freei == back) { + ciagp = biagp; + } else { + if ((rc = diIAGRead(imap, freei, &cmp))) + goto error_out; + ciagp = (struct iag *) cmp->data; + } + if (ciagp == NULL) { + jfs_error(imap->im_ipimap->i_sb, + "diNewExt: ciagp == NULL"); + rc = -EIO; + goto error_out; + } + } + } + + /* allocate disk space for the inode extent. + */ + if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0)) + hint = ((s64) agno << sbi->bmap->db_agl2size) - 1; + else + hint = addressPXD(&iagp->inoext[extno - 1]) + + lengthPXD(&iagp->inoext[extno - 1]) - 1; + + if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno))) + goto error_out; + + /* compute the inode number of the first inode within the + * extent. + */ + ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT); + + /* initialize the inodes within the newly allocated extent a + * page at a time. + */ + for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) { + /* get a buffer for this page of disk inodes. + */ + dmp = get_metapage(ipimap, blkno + i, PSIZE, 1); + if (dmp == NULL) { + rc = -EIO; + goto error_out; + } + dp = (struct dinode *) dmp->data; + + /* initialize the inode number, mode, link count and + * inode extent address. + */ + for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) { + dp->di_inostamp = cpu_to_le32(sbi->inostamp); + dp->di_number = cpu_to_le32(ino); + dp->di_fileset = cpu_to_le32(FILESYSTEM_I); + dp->di_mode = 0; + dp->di_nlink = 0; + PXDaddress(&(dp->di_ixpxd), blkno); + PXDlength(&(dp->di_ixpxd), imap->im_nbperiext); + } + write_metapage(dmp); + } + + /* if this is the last free extent within the iag, remove the + * iag from the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + } else { + /* if the iag has all free extents (newly allocated iag), + * add the iag to the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = cpu_to_le32(fwd); + iagp->extfreeback = cpu_to_le32(-1); + imap->im_agctl[agno].extfree = iagno; + } + } + + /* if the iag has no free inodes, add the iag to the + * ag free inode list. + */ + if (iagp->nfreeinos == 0) { + if (freei >= 0) + ciagp->inofreeback = cpu_to_le32(iagno); + + iagp->inofreefwd = + cpu_to_le32(imap->im_agctl[agno].inofree); + iagp->inofreeback = cpu_to_le32(-1); + imap->im_agctl[agno].inofree = iagno; + } + + /* initialize the extent descriptor of the extent. */ + PXDlength(&iagp->inoext[extno], imap->im_nbperiext); + PXDaddress(&iagp->inoext[extno], blkno); + + /* initialize the working and persistent map of the extent. + * the working map will be initialized such that + * it indicates the first inode of the extent is allocated. + */ + iagp->wmap[extno] = cpu_to_le32(HIGHORDER); + iagp->pmap[extno] = 0; + + /* update the free inode and free extent summary maps + * for the extent to indicate the extent has free inodes + * and no longer represents a free extent. 
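The iag free lists manipulated above (inofreefwd/inofreeback and extfreefwd/extfreeback) are doubly linked lists threaded through iag numbers, with -1 as the nil value and the list head kept in im_agctl[agno]. Below is a small in-memory sketch of the unlink pattern used when an iag loses its last free extent or inode; field names are simplified and there is no disk I/O.

/* Toy model of the iag free lists (indices plus a -1 sentinel), not kernel code. */
#include <stdio.h>
#include <assert.h>

struct toy_iag {
	int freefwd;   /* next iag number on the list, -1 if none */
	int freeback;  /* previous iag number, -1 if none */
};

static struct toy_iag iags[8];
static int list_head = -1;            /* plays the role of im_agctl[agno] */

static void push_head(int iagno)
{
	iags[iagno].freefwd = list_head;
	iags[iagno].freeback = -1;
	if (list_head >= 0)
		iags[list_head].freeback = iagno;
	list_head = iagno;
}

static void unlink_iag(int iagno)
{
	int fwd = iags[iagno].freefwd, back = iags[iagno].freeback;
	if (fwd >= 0)
		iags[fwd].freeback = back;
	if (back >= 0)
		iags[back].freefwd = fwd;
	else
		list_head = fwd;        /* it was the head kept in the AG control */
	iags[iagno].freefwd = iags[iagno].freeback = -1;
}

int main(void)
{
	push_head(5);
	push_head(2);           /* list: 2 -> 5 */
	unlink_iag(2);          /* iag 2 used its last free extent */
	assert(list_head == 5 && iags[5].freeback == -1);
	printf("head is now iag %d\n", list_head);
	return 0;
}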
+ */ + sword = extno >> L2EXTSPERSUM; + mask = HIGHORDER >> (extno & (EXTSPERSUM - 1)); + iagp->extsmap[sword] |= cpu_to_le32(mask); + iagp->inosmap[sword] &= cpu_to_le32(~mask); + + /* update the free inode and free extent counts for the + * iag. + */ + le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, -1); + + /* update the free and backed inode counts for the ag. + */ + imap->im_agctl[agno].numfree += (INOSPEREXT - 1); + imap->im_agctl[agno].numinos += INOSPEREXT; + + /* update the free and backed inode counts for the inode map. + */ + atomic_add(INOSPEREXT - 1, &imap->im_numfree); + atomic_add(INOSPEREXT, &imap->im_numinos); + + /* write the iags. + */ + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + + return (0); + + error_out: + + /* release the iags. + */ + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + + return (rc); +} + + +/* + * NAME: diNewIAG(imap,iagnop,agno) + * + * FUNCTION: allocate a new iag for an allocation group. + * + * first tries to allocate the iag from the inode map + * iagfree list: + * if the list has free iags, the head of the list is removed + * and returned to satisfy the request. + * if the inode map's iag free list is empty, the inode map + * is extended to hold a new iag. this new iag is initialized + * and returned to satisfy the request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagnop - pointer to an iag number set with the number of the + * newly allocated iag upon successful return. + * agno - allocation group number. + * bpp - Buffer pointer to be filled in with new IAG's buffer + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + * + * serialization: + * AG lock held on entry/exit; + * write lock on the map is held inside; + * read lock on the map is held on successful completion; + * + * note: new iag transaction: + * . synchronously write iag; + * . write log of xtree and inode of imap; + * . commit; + * . synchronous write of xtree (right to left, bottom to top); + * . at start of logredo(): init in-memory imap with one additional iag page; + * . at end of logredo(): re-read imap inode to determine + * new imap size; + */ +static int +diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) +{ + int rc; + int iagno, i, xlen; + struct inode *ipimap; + struct super_block *sb; + struct jfs_sb_info *sbi; + struct metapage *mp; + struct iag *iagp; + s64 xaddr = 0; + s64 blkno; + tid_t tid; + struct inode *iplist[1]; + + /* pick up pointers to the inode map and mount inodes */ + ipimap = imap->im_ipimap; + sb = ipimap->i_sb; + sbi = JFS_SBI(sb); + + /* acquire the free iag lock */ + IAGFREE_LOCK(imap); + + /* if there are any iags on the inode map free iag list, + * allocate the iag from the head of the list. + */ + if (imap->im_freeiag >= 0) { + /* pick up the iag number at the head of the list */ + iagno = imap->im_freeiag; + + /* determine the logical block number of the iag */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + } else { + /* no free iags. the inode map will have to be extented + * to include a new iag. 
+ */ + + /* acquire inode map lock */ + IWRITE_LOCK(ipimap, RDWRLOCK_IMAP); + + if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { + IWRITE_UNLOCK(ipimap); + IAGFREE_UNLOCK(imap); + jfs_error(imap->im_ipimap->i_sb, + "diNewIAG: ipimap->i_size is wrong"); + return -EIO; + } + + + /* get the next available iag number */ + iagno = imap->im_nextiag; + + /* make sure that we have not exceeded the maximum inode + * number limit. + */ + if (iagno > (MAXIAGS - 1)) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = -ENOSPC; + goto out; + } + + /* + * synchronously append new iag page. + */ + /* determine the logical address of iag page to append */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + + /* Allocate extent for new iag page */ + xlen = sbi->nbperpage; + if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* + * start transaction of update of the inode map + * addressing structure pointing to the new iag page; + */ + tid = txBegin(sb, COMMIT_FORCE); + mutex_lock(&JFS_IP(ipimap)->commit_mutex); + + /* update the inode map addressing structure to point to it */ + if ((rc = + xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) { + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + /* Free the blocks allocated for the iag since it was + * not successfully added to the inode map + */ + dbFree(ipimap, xaddr, (s64) xlen); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* update the inode map's inode to reflect the extension */ + ipimap->i_size += PSIZE; + inode_add_bytes(ipimap, PSIZE); + + /* assign a buffer for the page */ + mp = get_metapage(ipimap, blkno, PSIZE, 0); + if (!mp) { + /* + * This is very unlikely since we just created the + * extent, but let's try to handle it correctly + */ + xtTruncate(tid, ipimap, ipimap->i_size - PSIZE, + COMMIT_PWMAP); + + txAbort(tid, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = -EIO; + goto out; + } + iagp = (struct iag *) mp->data; + + /* init the iag */ + memset(iagp, 0, sizeof(struct iag)); + iagp->iagnum = cpu_to_le32(iagno); + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + iagp->iagfree = cpu_to_le32(-1); + iagp->nfreeinos = 0; + iagp->nfreeexts = cpu_to_le32(EXTSPERIAG); + + /* initialize the free inode summary map (free extent + * summary map initialization handled by bzero). + */ + for (i = 0; i < SMAPSZ; i++) + iagp->inosmap[i] = cpu_to_le32(ONES); + + /* + * Write and sync the metapage + */ + flush_metapage(mp); + + /* + * txCommit(COMMIT_FORCE) will synchronously write address + * index pages and inode after commit in careful update order + * of address index pages (right to left, bottom up); + */ + iplist[0] = ipimap; + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + duplicateIXtree(sb, blkno, xlen, &xaddr); + + /* update the next available iag number */ + imap->im_nextiag += 1; + + /* Add the iag to the iag free list so we don't lose the iag + * if a failure happens now. + */ + imap->im_freeiag = iagno; + + /* Until we have logredo working, we want the imap inode & + * control page to be up to date. 
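When diNewIAG() has to grow the map (the extension path above), the new iag lands at a fixed logical position: page iagno + 1 of the imap file, since page 0 holds the dinomap control page, and the imap inode grows by exactly one PSIZE page. The sketch below only illustrates that geometry; it assumes the usual PSIZE = 4096 metadata page and, purely for illustration, 1 KiB filesystem blocks (l2nbperpage = 2). IAGTOLBLK mirrors the macro in jfs_imap.h further down.

/* Sketch of the imap geometry used by diNewIAG() (not kernel code). */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define PSIZE 4096                            /* metadata page size */
/* iag page iagno lives one page past the control page */
#define IAGTOLBLK(iagno, l2nbperpg) (((iagno) + 1) << (l2nbperpg))

int main(void)
{
	int l2nbperpage = 2;          /* assumed: 1 KiB blocks, 4 per page */
	int iagno = 7;                /* next iag to be appended */

	int64_t blkno = IAGTOLBLK(iagno, l2nbperpage);
	int64_t offset = (int64_t)(iagno + 1) * PSIZE;  /* byte offset in imap */

	/* appending the page grows the imap inode by exactly one page */
	int64_t old_isize = offset;
	int64_t new_isize = old_isize + PSIZE;

	assert(blkno == (iagno + 1) * 4);
	assert(new_isize >> 12 == iagno + 2);  /* i_size >> L2PSIZE == nextiag + 1 */
	printf("iag %d -> logical block %lld, new imap size %lld\n",
	       iagno, (long long) blkno, (long long) new_isize);
	return 0;
}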
+ */ + diSync(ipimap); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + } + + /* obtain read lock on map */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* read the iag */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + rc = -EIO; + goto out; + } + iagp = (struct iag *) mp->data; + + /* remove the iag from the iag free list */ + imap->im_freeiag = le32_to_cpu(iagp->iagfree); + iagp->iagfree = cpu_to_le32(-1); + + /* set the return iag number and buffer pointer */ + *iagnop = iagno; + *mpp = mp; + + out: + /* release the iag free lock */ + IAGFREE_UNLOCK(imap); + + return (rc); +} + +/* + * NAME: diIAGRead() + * + * FUNCTION: get the buffer for the specified iag within a fileset + * or aggregate inode map. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagno - iag number. + * bpp - point to buffer pointer to be filled in on successful + * exit. + * + * SERIALIZATION: + * must have read lock on imap inode + * (When called by diExtendFS, the filesystem is quiesced, therefore + * the read lock is unnecessary.) + * + * RETURN VALUES: + * 0 - success. + * -EIO - i/o error. + */ +static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) +{ + struct inode *ipimap = imap->im_ipimap; + s64 blkno; + + /* compute the logical block number of the iag. */ + blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage); + + /* read the iag. */ + *mpp = read_metapage(ipimap, blkno, PSIZE, 0); + if (*mpp == NULL) { + return -EIO; + } + + return (0); +} + +/* + * NAME: diFindFree() + * + * FUNCTION: find the first free bit in a word starting at + * the specified bit position. + * + * PARAMETERS: + * word - word to be examined. + * start - starting bit position. + * + * RETURN VALUES: + * bit position of first free bit in the word or 32 if + * no free bits were found. + */ +static int diFindFree(u32 word, int start) +{ + int bitno; + assert(start < 32); + /* scan the word for the first free bit. */ + for (word <<= start, bitno = start; bitno < 32; + bitno++, word <<= 1) { + if ((word & HIGHORDER) == 0) + break; + } + return (bitno); +} + +/* + * NAME: diUpdatePMap() + * + * FUNCTION: Update the persistent map in an IAG for the allocation or + * freeing of the specified inode. + * + * PRE CONDITIONS: Working map has already been updated for allocate. + * + * PARAMETERS: + * ipimap - Incore inode map inode + * inum - Number of inode to mark in permanent map + * is_free - If 'true' indicates inode should be marked freed, otherwise + * indicates inode should be marked allocated. + * + * RETURN VALUES: + * 0 for success + */ +int +diUpdatePMap(struct inode *ipimap, + unsigned long inum, bool is_free, struct tblock * tblk) +{ + int rc; + struct iag *iagp; + struct metapage *mp; + int iagno, ino, extno, bitno; + struct inomap *imap; + u32 mask; + struct jfs_log *log; + int lsn, difft, diffp; + unsigned long flags; + + imap = JFS_IP(ipimap)->i_imap; + /* get the iag number containing the inode */ + iagno = INOTOIAG(inum); + /* make sure that the iag is contained within the map */ + if (iagno >= imap->im_nextiag) { + jfs_error(ipimap->i_sb, + "diUpdatePMap: the iag is outside the map"); + return -EIO; + } + /* read the iag */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) + return (rc); + metapage_wait_for_io(mp); + iagp = (struct iag *) mp->data; + /* get the inode number and extent number of the inode within + * the iag and the inode number within the extent. 
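diUpdatePMap(), whose body continues below, enforces the two-map protocol: an allocation sets the working-map bit first (in diAllocBit) and the persistent-map bit only at commit time, while a free clears the persistent bit at commit and leaves the working bit until the last reference is released. The following is a tiny standalone model of those states and sanity checks, using plain uint32_t words and an assumed HIGHORDER of 0x80000000; the kernel reports violations with jfs_error rather than asserts.

/* Toy model of JFS's working (wmap) vs persistent (pmap) inode bits. */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define HIGHORDER 0x80000000u

static uint32_t wmap, pmap;              /* one extent's worth of inodes */

static void alloc_working(int bitno)     /* diAllocBit stage */
{
	wmap |= HIGHORDER >> bitno;
}

static void commit_alloc(int bitno)      /* diUpdatePMap(..., is_free == false) */
{
	uint32_t mask = HIGHORDER >> bitno;
	assert(wmap & mask);             /* must already be in the working map */
	assert(!(pmap & mask));          /* must still be free persistently */
	pmap |= mask;
}

static void commit_free(int bitno)       /* diUpdatePMap(..., is_free == true) */
{
	uint32_t mask = HIGHORDER >> bitno;
	assert(wmap & mask);             /* still allocated in the working map */
	assert(pmap & mask);             /* and allocated persistently */
	pmap &= ~mask;                   /* wmap is cleared later, at release */
}

int main(void)
{
	alloc_working(9);
	commit_alloc(9);
	commit_free(9);
	assert((pmap & (HIGHORDER >> 9)) == 0 && (wmap & (HIGHORDER >> 9)));
	printf("pmap cleared, wmap still set until last release\n");
	return 0;
}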
+ */ + ino = inum & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + mask = HIGHORDER >> bitno; + /* + * mark the inode free in persistent map: + */ + if (is_free) { + /* The inode should have been allocated both in working + * map and in persistent map; + * the inode will be freed from working map at the release + * of last reference release; + */ + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + jfs_error(ipimap->i_sb, + "diUpdatePMap: inode %ld not marked as " + "allocated in wmap!", inum); + } + if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { + jfs_error(ipimap->i_sb, + "diUpdatePMap: inode %ld not marked as " + "allocated in pmap!", inum); + } + /* update the bitmap for the extent of the freed inode */ + iagp->pmap[extno] &= cpu_to_le32(~mask); + } + /* + * mark the inode allocated in persistent map: + */ + else { + /* The inode should be already allocated in the working map + * and should be free in persistent map; + */ + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + release_metapage(mp); + jfs_error(ipimap->i_sb, + "diUpdatePMap: the inode is not allocated in " + "the working map"); + return -EIO; + } + if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { + release_metapage(mp); + jfs_error(ipimap->i_sb, + "diUpdatePMap: the inode is not free in the " + "persistent map"); + return -EIO; + } + /* update the bitmap for the extent of the allocated inode */ + iagp->pmap[extno] |= cpu_to_le32(mask); + } + /* + * update iag lsn + */ + lsn = tblk->lsn; + log = JFS_SBI(tblk->sb)->log; + LOGSYNC_LOCK(log, flags); + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(difft, lsn, log); + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + /* move mp after tblock in logsync list */ + list_move(&mp->synclist, &tblk->synclist); + } + /* inherit younger/larger clsn */ + assert(mp->clsn); + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else { + mp->log = log; + mp->lsn = lsn; + /* insert mp after tblock in logsync list */ + log->count++; + list_add(&mp->synclist, &tblk->synclist); + mp->clsn = tblk->clsn; + } + LOGSYNC_UNLOCK(log, flags); + write_metapage(mp); + return (0); +} + +/* + * diExtendFS() + * + * function: update imap for extendfs(); + * + * note: AG size has been increased s.t. each k old contiguous AGs are + * coalesced into a new AG; + */ +int diExtendFS(struct inode *ipimap, struct inode *ipbmap) +{ + int rc, rcx = 0; + struct inomap *imap = JFS_IP(ipimap)->i_imap; + struct iag *iagp = NULL, *hiagp = NULL; + struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap; + struct metapage *bp, *hbp; + int i, n, head; + int numinos, xnuminos = 0, xnumfree = 0; + s64 agstart; + + jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d", + imap->im_nextiag, atomic_read(&imap->im_numinos), + atomic_read(&imap->im_numfree)); + + /* + * reconstruct imap + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + + /* init per AG control information im_agctl[] */ + for (i = 0; i < MAXAG; i++) { + imap->im_agctl[i].inofree = -1; + imap->im_agctl[i].extfree = -1; + imap->im_agctl[i].numinos = 0; /* number of backed inodes */ + imap->im_agctl[i].numfree = 0; /* number of free backed inodes */ + } + + /* + * process each iag page of the map. 
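The rebuild loop that follows recomputes each iag's allocation group as n = agstart >> db_agl2size, using the log2 of the new, larger AG size, so that k contiguous old AGs collapse into one new AG and agstart is re-aligned to the new AG boundary. A small numeric sketch, with block counts chosen only for illustration:

/* Sketch of the AG renumbering done by diExtendFS() (not kernel code). */
#include <stdio.h>
#include <assert.h>

int main(void)
{
	int old_l2agsize = 13;               /* old AG size: 8192 blocks */
	int x = 2;                           /* AG size grew by 2**x     */
	int new_l2agsize = old_l2agsize + x;

	/* an iag whose inode extents start in old AG 11 */
	long long agstart = 11LL << old_l2agsize;

	int old_ag = (int)(agstart >> old_l2agsize);   /* 11 */
	int new_ag = (int)(agstart >> new_l2agsize);   /* 11 >> 2 = 2 */

	assert(old_ag == 11 && new_ag == (11 >> x));
	/* agstart is then re-aligned to the start of the new AG */
	long long new_agstart = (long long) new_ag << new_l2agsize;
	printf("old AG %d -> new AG %d, agstart %lld\n",
	       old_ag, new_ag, new_agstart);
	return 0;
}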
+ * + * rebuild AG Free Inode List, AG Free Inode Extent List; + */ + for (i = 0; i < imap->im_nextiag; i++) { + if ((rc = diIAGRead(imap, i, &bp))) { + rcx = rc; + continue; + } + iagp = (struct iag *) bp->data; + if (le32_to_cpu(iagp->iagnum) != i) { + release_metapage(bp); + jfs_error(ipimap->i_sb, + "diExtendFs: unexpected value of iagnum"); + return -EIO; + } + + /* leave free iag in the free iag list */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + release_metapage(bp); + continue; + } + + agstart = le64_to_cpu(iagp->agstart); + n = agstart >> mp->db_agl2size; + iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size); + + /* compute backed inodes */ + numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) + << L2INOSPEREXT; + if (numinos > 0) { + /* merge AG backed inodes */ + imap->im_agctl[n].numinos += numinos; + xnuminos += numinos; + } + + /* if any backed free inodes, insert at AG free inode list */ + if ((int) le32_to_cpu(iagp->nfreeinos) > 0) { + if ((head = imap->im_agctl[n].inofree) == -1) { + iagp->inofreefwd = cpu_to_le32(-1); + iagp->inofreeback = cpu_to_le32(-1); + } else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (struct iag *) hbp->data; + hiagp->inofreeback = iagp->iagnum; + iagp->inofreefwd = cpu_to_le32(head); + iagp->inofreeback = cpu_to_le32(-1); + write_metapage(hbp); + } + + imap->im_agctl[n].inofree = + le32_to_cpu(iagp->iagnum); + + /* merge AG backed free inodes */ + imap->im_agctl[n].numfree += + le32_to_cpu(iagp->nfreeinos); + xnumfree += le32_to_cpu(iagp->nfreeinos); + } + + /* if any free extents, insert at AG free extent list */ + if (le32_to_cpu(iagp->nfreeexts) > 0) { + if ((head = imap->im_agctl[n].extfree) == -1) { + iagp->extfreefwd = cpu_to_le32(-1); + iagp->extfreeback = cpu_to_le32(-1); + } else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (struct iag *) hbp->data; + hiagp->extfreeback = iagp->iagnum; + iagp->extfreefwd = cpu_to_le32(head); + iagp->extfreeback = cpu_to_le32(-1); + write_metapage(hbp); + } + + imap->im_agctl[n].extfree = + le32_to_cpu(iagp->iagnum); + } + + nextiag: + write_metapage(bp); + } + + if (xnuminos != atomic_read(&imap->im_numinos) || + xnumfree != atomic_read(&imap->im_numfree)) { + jfs_error(ipimap->i_sb, + "diExtendFs: numinos or numfree incorrect"); + return -EIO; + } + + return rcx; +} + + +/* + * duplicateIXtree() + * + * serialization: IWRITE_LOCK held on entry/exit + * + * note: shadow page with regular inode (rel.2); + */ +static void duplicateIXtree(struct super_block *sb, s64 blkno, + int xlen, s64 *xaddr) +{ + struct jfs_superblock *j_sb; + struct buffer_head *bh; + struct inode *ip; + tid_t tid; + + /* if AIT2 ipmap2 is bad, do not try to update it */ + if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT) /* s_flag */ + return; + ip = diReadSpecial(sb, FILESYSTEM_I, 1); + if (ip == NULL) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + if (readSuper(sb, &bh)) + return; + j_sb = (struct jfs_superblock *)bh->b_data; + j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + return; + } + + /* start transaction */ + tid = txBegin(sb, COMMIT_FORCE); + /* update the inode map addressing structure to point to it */ + if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + txAbort(tid, 1); + goto cleanup; + + } + /* update the inode map's inode to reflect the extension */ + ip->i_size += PSIZE; + inode_add_bytes(ip, PSIZE); + txCommit(tid, 1, &ip, 
COMMIT_FORCE); + cleanup: + txEnd(tid); + diFreeSpecial(ip); +} + +/* + * NAME: copy_from_dinode() + * + * FUNCTION: Copies inode info from disk inode to in-memory inode + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient memory + */ +static int copy_from_dinode(struct dinode * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + + jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->mode2 = le32_to_cpu(dip->di_mode); + jfs_set_inode_flags(ip); + + ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff; + if (sbi->umask != -1) { + ip->i_mode = (ip->i_mode & ~0777) | (0777 & ~sbi->umask); + /* For directories, add x permission if r is allowed by umask */ + if (S_ISDIR(ip->i_mode)) { + if (ip->i_mode & 0400) + ip->i_mode |= 0100; + if (ip->i_mode & 0040) + ip->i_mode |= 0010; + if (ip->i_mode & 0004) + ip->i_mode |= 0001; + } + } + ip->i_nlink = le32_to_cpu(dip->di_nlink); + + jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); + if (sbi->uid == -1) + ip->i_uid = jfs_ip->saved_uid; + else { + ip->i_uid = sbi->uid; + } + + jfs_ip->saved_gid = le32_to_cpu(dip->di_gid); + if (sbi->gid == -1) + ip->i_gid = jfs_ip->saved_gid; + else { + ip->i_gid = sbi->gid; + } + + ip->i_size = le64_to_cpu(dip->di_size); + ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec); + ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); + ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); + ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); + ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); + ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); + ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); + ip->i_generation = le32_to_cpu(dip->di_gen); + + jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */ + jfs_ip->acl = dip->di_acl; /* as are dxd's */ + jfs_ip->ea = dip->di_ea; + jfs_ip->next_index = le32_to_cpu(dip->di_next_index); + jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec); + jfs_ip->acltype = le32_to_cpu(dip->di_acltype); + + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) { + jfs_ip->dev = le32_to_cpu(dip->di_rdev); + ip->i_rdev = new_decode_dev(jfs_ip->dev); + } + + if (S_ISDIR(ip->i_mode)) { + memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384); + } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) { + memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288); + } else + memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128); + + /* Zero the in-memory-only stuff */ + jfs_ip->cflag = 0; + jfs_ip->btindex = 0; + jfs_ip->btorder = 0; + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + jfs_ip->atlhead = 0; + jfs_ip->atltail = 0; + jfs_ip->xtlid = 0; + return (0); +} + +/* + * NAME: copy_to_dinode() + * + * FUNCTION: Copies inode info from in-memory inode to disk inode + */ +static void copy_to_dinode(struct dinode * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + + dip->di_fileset = cpu_to_le32(jfs_ip->fileset); + dip->di_inostamp = cpu_to_le32(sbi->inostamp); + dip->di_number = cpu_to_le32(ip->i_ino); + dip->di_gen = cpu_to_le32(ip->i_generation); + dip->di_size = cpu_to_le64(ip->i_size); + dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); + dip->di_nlink = cpu_to_le32(ip->i_nlink); + if (sbi->uid == -1) + dip->di_uid = cpu_to_le32(ip->i_uid); + else + dip->di_uid = cpu_to_le32(jfs_ip->saved_uid); + if (sbi->gid == -1) + dip->di_gid = cpu_to_le32(ip->i_gid); + else + dip->di_gid = 
cpu_to_le32(jfs_ip->saved_gid); + jfs_get_inode_flags(jfs_ip); + /* + * mode2 is only needed for storing the higher order bits. + * Trust i_mode for the lower order ones + */ + if (sbi->umask == -1) + dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | + ip->i_mode); + else /* Leave the original permissions alone */ + dip->di_mode = cpu_to_le32(jfs_ip->mode2); + + dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); + dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); + dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); + dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); + dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); + dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); + dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ + dip->di_acl = jfs_ip->acl; /* as are dxd's */ + dip->di_ea = jfs_ip->ea; + dip->di_next_index = cpu_to_le32(jfs_ip->next_index); + dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime); + dip->di_otime.tv_nsec = 0; + dip->di_acltype = cpu_to_le32(jfs_ip->acltype); + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) + dip->di_rdev = cpu_to_le32(jfs_ip->dev); +} diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h new file mode 100644 index 00000000..610a0e9d --- /dev/null +++ b/fs/jfs/jfs_imap.h @@ -0,0 +1,175 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_IMAP +#define _H_JFS_IMAP + +#include "jfs_txnmgr.h" + +/* + * jfs_imap.h: disk inode manager + */ + +#define EXTSPERIAG 128 /* number of disk inode extent per iag */ +#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ +#define SMAPSZ 4 /* number of words per summary map */ +#define EXTSPERSUM 32 /* number of extents per summary map entry */ +#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ +#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ +#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ +#define MAXAG 128 /* maximum number of allocation groups */ + +#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ +#define SMAPSIZE 16 /* bytes in the IAG summary maps */ + +/* convert inode number to iag number */ +#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) + +/* convert iag number to logical block number of the iag page */ +#define IAGTOLBLK(iagno,l2nbperpg) (((iagno) + 1) << (l2nbperpg)) + +/* get the starting block number of the 4K page of an inode extent + * that contains ino. + */ +#define INOPBLK(pxd,ino,l2nbperpg) (addressPXD((pxd)) + \ + ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg))) + +/* + * inode allocation map: + * + * inode allocation map consists of + * . the inode map control page and + * . inode allocation group pages (per 4096 inodes) + * which are addressed by standard JFS xtree. 
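The struct iag defined just below is sized to exactly one 4 KiB metadata page: a 72-byte header, 1976 bytes of padding to the 2048-byte mark, then the 512-byte working and persistent allocation maps and the 1024-byte inode extent array. A quick standalone check of that arithmetic, assuming 4-byte __le32 words and 8-byte pxd_t descriptors as the field comments state:

/* Arithmetic check of the struct iag layout (values from jfs_imap.h). */
#include <stdio.h>
#include <assert.h>

#define EXTSPERIAG 128
#define SMAPSZ     4

int main(void)
{
	int header = 8                 /* agstart                      */
		   + 6 * 4             /* iagnum .. iagfree            */
		   + SMAPSZ * 4 * 2    /* inosmap + extsmap            */
		   + 2 * 4;            /* nfreeinos + nfreeexts        */
	int pad  = 1976;
	int wmap = EXTSPERIAG * 4;     /* working allocation map       */
	int pmap = EXTSPERIAG * 4;     /* persistent allocation map    */
	int ext  = EXTSPERIAG * 8;     /* pxd_t inode extent addresses */

	assert(header == 72);
	assert(header + pad == 2048);
	assert(header + pad + wmap + pmap + ext == 4096);
	printf("struct iag spans %d bytes\n", header + pad + wmap + pmap + ext);
	return 0;
}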
+ */ +/* + * inode allocation group page (per 4096 inodes of an AG) + */ +struct iag { + __le64 agstart; /* 8: starting block of ag */ + __le32 iagnum; /* 4: inode allocation group number */ + __le32 inofreefwd; /* 4: ag inode free list forward */ + __le32 inofreeback; /* 4: ag inode free list back */ + __le32 extfreefwd; /* 4: ag inode extent free list forward */ + __le32 extfreeback; /* 4: ag inode extent free list back */ + __le32 iagfree; /* 4: iag free list */ + + /* summary map: 1 bit per inode extent */ + __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; + * note: this indicates free and backed + * inodes, if the extent is not backed the + * value will be 1. if the extent is + * backed but all inodes are being used the + * value will be 1. if the extent is + * backed but at least one of the inodes is + * free the value will be 0. + */ + __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ + __le32 nfreeinos; /* 4: number of free inodes */ + __le32 nfreeexts; /* 4: number of free extents */ + /* (72) */ + u8 pad[1976]; /* 1976: pad to 2048 bytes */ + /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ + __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ + __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ + pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ +}; /* (4096) */ + +/* + * per AG control information (in inode map control page) + */ +struct iagctl_disk { + __le32 inofree; /* 4: free inode list anchor */ + __le32 extfree; /* 4: free extent list anchor */ + __le32 numinos; /* 4: number of backed inodes */ + __le32 numfree; /* 4: number of free inodes */ +}; /* (16) */ + +struct iagctl { + int inofree; /* free inode list anchor */ + int extfree; /* free extent list anchor */ + int numinos; /* number of backed inodes */ + int numfree; /* number of free inodes */ +}; + +/* + * per fileset/aggregate inode map control page + */ +struct dinomap_disk { + __le32 in_freeiag; /* 4: free iag list anchor */ + __le32 in_nextiag; /* 4: next free iag number */ + __le32 in_numinos; /* 4: num of backed inodes */ + __le32 in_numfree; /* 4: num of free backed inodes */ + __le32 in_nbperiext; /* 4: num of blocks per inode extent */ + __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ + __le32 in_diskblock; /* 4: for standalone test driver */ + __le32 in_maxag; /* 4: for standalone test driver */ + u8 pad[2016]; /* 2016: pad to 2048 */ + struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ +}; /* (4096) */ + +struct dinomap { + int in_freeiag; /* free iag list anchor */ + int in_nextiag; /* next free iag number */ + int in_numinos; /* num of backed inodes */ + int in_numfree; /* num of free backed inodes */ + int in_nbperiext; /* num of blocks per inode extent */ + int in_l2nbperiext; /* l2 of in_nbperiext */ + int in_diskblock; /* for standalone test driver */ + int in_maxag; /* for standalone test driver */ + struct iagctl in_agctl[MAXAG]; /* AG control information */ +}; + +/* + * In-core inode map control page + */ +struct inomap { + struct dinomap im_imap; /* 4096: inode allocation control */ + struct inode *im_ipimap; /* 4: ptr to inode for imap */ + struct mutex im_freelock; /* 4: iag free list lock */ + struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ + u32 *im_DBGdimap; + atomic_t im_numinos; /* num of backed inodes */ + atomic_t im_numfree; /* num of free backed inodes */ +}; + +#define im_freeiag im_imap.in_freeiag +#define im_nextiag im_imap.in_nextiag +#define im_agctl 
im_imap.in_agctl +#define im_nbperiext im_imap.in_nbperiext +#define im_l2nbperiext im_imap.in_l2nbperiext + +/* for standalone testdriver + */ +#define im_diskblock im_imap.in_diskblock +#define im_maxag im_imap.in_maxag + +extern int diFree(struct inode *); +extern int diAlloc(struct inode *, bool, struct inode *); +extern int diSync(struct inode *); +/* external references */ +extern int diUpdatePMap(struct inode *ipimap, unsigned long inum, + bool is_free, struct tblock * tblk); +extern int diExtendFS(struct inode *ipimap, struct inode *ipbmap); +extern int diMount(struct inode *); +extern int diUnmount(struct inode *, int); +extern int diRead(struct inode *); +extern struct inode *diReadSpecial(struct super_block *, ino_t, int); +extern void diWriteSpecial(struct inode *, int); +extern void diFreeSpecial(struct inode *); +extern int diWrite(tid_t tid, struct inode *); +#endif /* _H_JFS_IMAP */ diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h new file mode 100644 index 00000000..584a4a1a --- /dev/null +++ b/fs/jfs/jfs_incore.h @@ -0,0 +1,224 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_INCORE +#define _H_JFS_INCORE + +#include +#include +#include +#include +#include "jfs_types.h" +#include "jfs_xtree.h" +#include "jfs_dtree.h" + +/* + * JFS magic number + */ +#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */ + +/* + * JFS-private inode information + */ +struct jfs_inode_info { + int fileset; /* fileset number (always 16)*/ + uint mode2; /* jfs-specific mode */ + uint saved_uid; /* saved for uid mount option */ + uint saved_gid; /* saved for gid mount option */ + pxd_t ixpxd; /* inode extent descriptor */ + dxd_t acl; /* dxd describing acl */ + dxd_t ea; /* dxd describing ea */ + time_t otime; /* time created */ + uint next_index; /* next available directory entry index */ + int acltype; /* Type of ACL */ + short btorder; /* access order */ + short btindex; /* btpage entry index*/ + struct inode *ipimap; /* inode map */ + unsigned long cflag; /* commit flags */ + u64 agstart; /* agstart of the containing IAG */ + u16 bxflag; /* xflag of pseudo buffer? */ + unchar pad; + signed char active_ag; /* ag currently allocating from */ + lid_t blid; /* lid of pseudo buffer? */ + lid_t atlhead; /* anonymous tlock list head */ + lid_t atltail; /* anonymous tlock list tail */ + spinlock_t ag_lock; /* protects active_ag */ + struct list_head anon_inode_list; /* inodes having anonymous txns */ + /* + * rdwrlock serializes xtree between reads & writes and synchronizes + * changes to special inodes. It's use would be redundant on + * directories since the i_mutex taken in the VFS is sufficient. 
+ */ + struct rw_semaphore rdwrlock; + /* + * commit_mutex serializes transaction processing on an inode. + * It must be taken after beginning a transaction (txBegin), since + * dirty inodes may be committed while a new transaction on the + * inode is blocked in txBegin or TxBeginAnon + */ + struct mutex commit_mutex; + /* xattr_sem allows us to access the xattrs without taking i_mutex */ + struct rw_semaphore xattr_sem; + lid_t xtlid; /* lid of xtree lock on directory */ + union { + struct { + xtpage_t _xtroot; /* 288: xtree root */ + struct inomap *_imap; /* 4: inode map header */ + } file; + struct { + struct dir_table_slot _table[12]; /* 96: dir index */ + dtroot_t _dtroot; /* 288: dtree root */ + } dir; + struct { + unchar _unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + unchar _inline[128]; /* 128: inline symlink */ + /* _inline_ea may overlay the last part of + * file._xtroot if maxentry = XTROOTINITSLOT + */ + unchar _inline_ea[128]; /* 128: inline extended attr */ + } link; + } u; + u32 dev; /* will die when we get wide dev_t */ + struct inode vfs_inode; +}; +#define i_xtroot u.file._xtroot +#define i_imap u.file._imap +#define i_dirtable u.dir._table +#define i_dtroot u.dir._dtroot +#define i_inline u.link._inline +#define i_inline_ea u.link._inline_ea + +#define IREAD_LOCK(ip, subclass) \ + down_read_nested(&JFS_IP(ip)->rdwrlock, subclass) +#define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock) +#define IWRITE_LOCK(ip, subclass) \ + down_write_nested(&JFS_IP(ip)->rdwrlock, subclass) +#define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock) + +/* + * cflag + */ +enum cflags { + COMMIT_Nolink, /* inode committed with zero link count */ + COMMIT_Inlineea, /* commit inode inline EA */ + COMMIT_Freewmap, /* free WMAP at iClose() */ + COMMIT_Dirty, /* Inode is really dirty */ + COMMIT_Dirtable, /* commit changes to di_dirtable */ + COMMIT_Stale, /* data extent is no longer valid */ + COMMIT_Synclist, /* metadata pages on group commit synclist */ +}; + +/* + * commit_mutex nesting subclasses: + */ +enum commit_mutex_class +{ + COMMIT_MUTEX_PARENT, + COMMIT_MUTEX_CHILD, + COMMIT_MUTEX_SECOND_PARENT, /* Renaming */ + COMMIT_MUTEX_VICTIM /* Inode being unlinked due to rename */ +}; + +/* + * rdwrlock subclasses: + * The dmap inode may be locked while a normal inode or the imap inode are + * locked. + */ +enum rdwrlock_class +{ + RDWRLOCK_NORMAL, + RDWRLOCK_IMAP, + RDWRLOCK_DMAP +}; + +#define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) +#define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_and_clear_cflag(flag, ip) \ + test_and_clear_bit(flag, &(JFS_IP(ip)->cflag)) +/* + * JFS-private superblock information. 
+ */ +struct jfs_sb_info { + struct super_block *sb; /* Point back to vfs super block */ + unsigned long mntflag; /* aggregate attributes */ + struct inode *ipbmap; /* block map inode */ + struct inode *ipaimap; /* aggregate inode map inode */ + struct inode *ipaimap2; /* secondary aimap inode */ + struct inode *ipimap; /* aggregate inode map inode */ + struct jfs_log *log; /* log */ + struct list_head log_list; /* volumes associated with a journal */ + short bsize; /* logical block size */ + short l2bsize; /* log2 logical block size */ + short nbperpage; /* blocks per page */ + short l2nbperpage; /* log2 blocks per page */ + short l2niperblk; /* log2 inodes per page */ + dev_t logdev; /* external log device */ + uint aggregate; /* volume identifier in log record */ + pxd_t logpxd; /* pxd describing log */ + pxd_t fsckpxd; /* pxd describing fsck wkspc */ + pxd_t ait2; /* pxd describing AIT copy */ + char uuid[16]; /* 128-bit uuid for volume */ + char loguuid[16]; /* 128-bit uuid for log */ + /* + * commit_state is used for synchronization of the jfs_commit + * threads. It is protected by LAZY_LOCK(). + */ + int commit_state; /* commit state */ + /* Formerly in ipimap */ + uint gengen; /* inode generation generator*/ + uint inostamp; /* shows inode belongs to fileset*/ + + /* Formerly in ipbmap */ + struct bmap *bmap; /* incore bmap descriptor */ + struct nls_table *nls_tab; /* current codepage */ + struct inode *direct_inode; /* metadata inode */ + uint state; /* mount/recovery state */ + unsigned long flag; /* mount time flags */ + uint p_state; /* state prior to going no integrity */ + uint uid; /* uid to override on-disk uid */ + uint gid; /* gid to override on-disk gid */ + uint umask; /* umask to override on-disk umask */ +}; + +/* jfs_sb_info commit_state */ +#define IN_LAZYCOMMIT 1 + +static inline struct jfs_inode_info *JFS_IP(struct inode *inode) +{ + return list_entry(inode, struct jfs_inode_info, vfs_inode); +} + +static inline int jfs_dirtable_inline(struct inode *inode) +{ + return (JFS_IP(inode)->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1)); +} + +static inline struct jfs_sb_info *JFS_SBI(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline int isReadOnly(struct inode *inode) +{ + if (JFS_SBI(inode->i_sb)->log) + return 0; + return 1; +} +#endif /* _H_JFS_INCORE */ diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c new file mode 100644 index 00000000..2686531e --- /dev/null +++ b/fs/jfs/jfs_inode.c @@ -0,0 +1,166 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_dinode.h" +#include "jfs_debug.h" + + +void jfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = JFS_IP(inode)->mode2; + + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | + S_NOATIME | S_DIRSYNC | S_SYNC); + + if (flags & JFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & JFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & JFS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & JFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + if (flags & JFS_SYNC_FL) + inode->i_flags |= S_SYNC; +} + +void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) +{ + unsigned int flags = jfs_ip->vfs_inode.i_flags; + + jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL | + JFS_DIRSYNC_FL | JFS_SYNC_FL); + if (flags & S_IMMUTABLE) + jfs_ip->mode2 |= JFS_IMMUTABLE_FL; + if (flags & S_APPEND) + jfs_ip->mode2 |= JFS_APPEND_FL; + if (flags & S_NOATIME) + jfs_ip->mode2 |= JFS_NOATIME_FL; + if (flags & S_DIRSYNC) + jfs_ip->mode2 |= JFS_DIRSYNC_FL; + if (flags & S_SYNC) + jfs_ip->mode2 |= JFS_SYNC_FL; +} + +/* + * NAME: ialloc() + * + * FUNCTION: Allocate a new inode + * + */ +struct inode *ialloc(struct inode *parent, umode_t mode) +{ + struct super_block *sb = parent->i_sb; + struct inode *inode; + struct jfs_inode_info *jfs_inode; + int rc; + + inode = new_inode(sb); + if (!inode) { + jfs_warn("ialloc: new_inode returned NULL!"); + rc = -ENOMEM; + goto fail; + } + + jfs_inode = JFS_IP(inode); + + rc = diAlloc(parent, S_ISDIR(mode), inode); + if (rc) { + jfs_warn("ialloc: diAlloc returned %d!", rc); + if (rc == -EIO) + make_bad_inode(inode); + goto fail_put; + } + + if (insert_inode_locked(inode) < 0) { + rc = -EINVAL; + goto fail_unlock; + } + + inode_init_owner(inode, parent, mode); + /* + * New inodes need to save sane values on disk when + * uid & gid mount options are used + */ + jfs_inode->saved_uid = inode->i_uid; + jfs_inode->saved_gid = inode->i_gid; + + /* + * Allocate inode to quota. 
+ */ + dquot_initialize(inode); + rc = dquot_alloc_inode(inode); + if (rc) + goto fail_drop; + + /* inherit flags from parent */ + jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; + + if (S_ISDIR(mode)) { + jfs_inode->mode2 |= IDIRECTORY; + jfs_inode->mode2 &= ~JFS_DIRSYNC_FL; + } + else { + jfs_inode->mode2 |= INLINEEA | ISPARSE; + if (S_ISLNK(mode)) + jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); + } + jfs_inode->mode2 |= inode->i_mode; + + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + jfs_inode->otime = inode->i_ctime.tv_sec; + inode->i_generation = JFS_SBI(sb)->gengen++; + + jfs_inode->cflag = 0; + + /* Zero remaining fields */ + memset(&jfs_inode->acl, 0, sizeof(dxd_t)); + memset(&jfs_inode->ea, 0, sizeof(dxd_t)); + jfs_inode->next_index = 0; + jfs_inode->acltype = 0; + jfs_inode->btorder = 0; + jfs_inode->btindex = 0; + jfs_inode->bxflag = 0; + jfs_inode->blid = 0; + jfs_inode->atlhead = 0; + jfs_inode->atltail = 0; + jfs_inode->xtlid = 0; + jfs_set_inode_flags(inode); + + jfs_info("ialloc returns inode = 0x%p\n", inode); + + return inode; + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; +fail_unlock: + inode->i_nlink = 0; + unlock_new_inode(inode); +fail_put: + iput(inode); +fail: + return ERR_PTR(rc); +} diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h new file mode 100644 index 00000000..ec2fb8b9 --- /dev/null +++ b/fs/jfs/jfs_inode.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_INODE +#define _H_JFS_INODE + +struct fid; + +extern struct inode *ialloc(struct inode *, umode_t); +extern int jfs_fsync(struct file *, int); +extern long jfs_ioctl(struct file *, unsigned int, unsigned long); +extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); +extern struct inode *jfs_iget(struct super_block *, unsigned long); +extern int jfs_commit_inode(struct inode *, int); +extern int jfs_write_inode(struct inode *, struct writeback_control *); +extern void jfs_evict_inode(struct inode *); +extern void jfs_dirty_inode(struct inode *, int); +extern void jfs_truncate(struct inode *); +extern void jfs_truncate_nolock(struct inode *, loff_t); +extern void jfs_free_zero_link(struct inode *); +extern struct dentry *jfs_get_parent(struct dentry *dentry); +extern void jfs_get_inode_flags(struct jfs_inode_info *); +extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern void jfs_set_inode_flags(struct inode *); +extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int jfs_setattr(struct dentry *, struct iattr *); + +extern const struct address_space_operations jfs_aops; +extern const struct inode_operations jfs_dir_inode_operations; +extern const struct file_operations jfs_dir_operations; +extern const struct inode_operations jfs_file_inode_operations; +extern const struct file_operations jfs_file_operations; +extern const struct inode_operations jfs_symlink_inode_operations; +extern const struct inode_operations jfs_fast_symlink_inode_operations; +extern const struct dentry_operations jfs_ci_dentry_operations; +#endif /* _H_JFS_INODE */ diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h new file mode 100644 index 00000000..ecf04882 --- /dev/null +++ b/fs/jfs/jfs_lock.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_LOCK +#define _H_JFS_LOCK + +#include +#include +#include + +/* + * jfs_lock.h + */ + +/* + * Conditional sleep where condition is protected by spinlock + * + * lock_cmd and unlock_cmd take and release the spinlock + */ +#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd) \ +do { \ + DECLARE_WAITQUEUE(__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE);\ + if (cond) \ + break; \ + unlock_cmd; \ + io_schedule(); \ + lock_cmd; \ + } \ + __set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#endif /* _H_JFS_LOCK */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c new file mode 100644 index 00000000..583636f7 --- /dev/null +++ b/fs/jfs/jfs_logmgr.c @@ -0,0 +1,2529 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_logmgr.c: log manager + * + * for related information, see transaction manager (jfs_txnmgr.c), and + * recovery manager (jfs_logredo.c). + * + * note: for detail, RTFS. + * + * log buffer manager: + * special purpose buffer manager supporting log i/o requirements. + * per log serial pageout of logpage + * queuing i/o requests and redrive i/o at iodone + * maintain current logpage buffer + * no caching since append only + * appropriate jfs buffer cache buffers as needed + * + * group commit: + * transactions which wrote COMMIT records in the same in-memory + * log page during the pageout of previous/current log page(s) are + * committed together by the pageout of the page. + * + * TBD lazy commit: + * transactions are committed asynchronously when the log page + * containing it COMMIT is paged out when it becomes full; + * + * serialization: + * . a per log lock serialize log write. + * . a per log lock serialize group commit. + * . a per log lock serialize log open/close; + * + * TBD log integrity: + * careful-write (ping-pong) of last logpage to recover from crash + * in overwrite. + * detection of split (out-of-order) write of physical sectors + * of last logpage via timestamp at end of each sector + * with its mirror data array at trailer). + * + * alternatives: + * lsn - 64-bit monotonically increasing integer vs + * 32-bit lspn and page eor. 
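+ *
+ * as implemented here, an lsn encodes the log page number and the
+ * byte offset (eor) within that page; see lmWriteRecord():
+ * lsn = (log->page << L2LOGPSIZE) + dstoffset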
+ */ + +#include +#include +#include +#include +#include +#include /* for sync_blockdev() */ +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + + +/* + * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIO thread) + */ +static struct lbuf *log_redrive_list; +static DEFINE_SPINLOCK(log_redrive_lock); + + +/* + * log read/write serialization (per log) + */ +#define LOG_LOCK_INIT(log) mutex_init(&(log)->loglock) +#define LOG_LOCK(log) mutex_lock(&((log)->loglock)) +#define LOG_UNLOCK(log) mutex_unlock(&((log)->loglock)) + + +/* + * log group commit serialization (per log) + */ + +#define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock) +#define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock) +#define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock) +#define LOGGC_WAKEUP(tblk) wake_up_all(&(tblk)->gcwait) + +/* + * log sync serialization (per log) + */ +#define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/4) +/* +#define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/2) +*/ + + +/* + * log buffer cache synchronization + */ +static DEFINE_SPINLOCK(jfsLCacheLock); + +#define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags) +#define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags) + +/* + * See __SLEEP_COND in jfs_locks.h + */ +#define LCACHE_SLEEP_COND(wq, cond, flags) \ +do { \ + if (cond) \ + break; \ + __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \ +} while (0) + +#define LCACHE_WAKEUP(event) wake_up(event) + + +/* + * lbuf buffer cache (lCache) control + */ +/* log buffer manager pageout control (cumulative, inclusive) */ +#define lbmREAD 0x0001 +#define lbmWRITE 0x0002 /* enqueue at tail of write queue; + * init pageout if at head of queue; + */ +#define lbmRELEASE 0x0004 /* remove from write queue + * at completion of pageout; + * do not free/recycle it yet: + * caller will free it; + */ +#define lbmSYNC 0x0008 /* do not return to freelist + * when removed from write queue; + */ +#define lbmFREE 0x0010 /* return to freelist + * at completion of pageout; + * the buffer may be recycled; + */ +#define lbmDONE 0x0020 +#define lbmERROR 0x0040 +#define lbmGC 0x0080 /* lbmIODone to perform post-GC processing + * of log page + */ +#define lbmDIRECT 0x0100 + +/* + * Global list of active external journals + */ +static LIST_HEAD(jfs_external_logs); +static struct jfs_log *dummy_log = NULL; +static DEFINE_MUTEX(jfs_log_mutex); + +/* + * forward references + */ +static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, + struct lrd * lrd, struct tlock * tlck); + +static int lmNextPage(struct jfs_log * log); +static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, + int activate); + +static int open_inline_log(struct super_block *sb); +static int open_dummy_log(struct super_block *sb); +static int lbmLogInit(struct jfs_log * log); +static void lbmLogShutdown(struct jfs_log * log); +static struct lbuf *lbmAllocate(struct jfs_log * log, int); +static void lbmFree(struct lbuf * bp); +static void lbmfree(struct lbuf * bp); +static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp); +static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block); +static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, 
int flag); +static int lbmIOWait(struct lbuf * bp, int flag); +static bio_end_io_t lbmIODone; +static void lbmStartIO(struct lbuf * bp); +static void lmGCwrite(struct jfs_log * log, int cant_block); +static int lmLogSync(struct jfs_log * log, int hard_sync); + + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct lmStat { + uint commit; /* # of commit */ + uint pagedone; /* # of page written */ + uint submitted; /* # of pages submitted */ + uint full_page; /* # of full pages submitted */ + uint partial_page; /* # of partial pages submitted */ +} lmStat; +#endif + +static void write_special_inodes(struct jfs_log *log, + int (*writer)(struct address_space *)) +{ + struct jfs_sb_info *sbi; + + list_for_each_entry(sbi, &log->sb_list, log_list) { + writer(sbi->ipbmap->i_mapping); + writer(sbi->ipimap->i_mapping); + writer(sbi->direct_inode->i_mapping); + } +} + +/* + * NAME: lmLog() + * + * FUNCTION: write a log record; + * + * PARAMETER: + * + * RETURN: lsn - offset to the next log record to write (end-of-log); + * -1 - error; + * + * note: todo: log error handler + */ +int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + int lsn; + int diffp, difft; + struct metapage *mp = NULL; + unsigned long flags; + + jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p", + log, tblk, lrd, tlck); + + LOG_LOCK(log); + + /* log by (out-of-transaction) JFS ? */ + if (tblk == NULL) + goto writeRecord; + + /* log from page ? */ + if (tlck == NULL || + tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL) + goto writeRecord; + + /* + * initialize/update page/transaction recovery lsn + */ + lsn = log->lsn; + + LOGSYNC_LOCK(log, flags); + + /* + * initialize page lsn if first log write of the page + */ + if (mp->lsn == 0) { + mp->log = log; + mp->lsn = lsn; + log->count++; + + /* insert page at tail of logsynclist */ + list_add_tail(&mp->synclist, &log->synclist); + } + + /* + * initialize/update lsn of tblock of the page + * + * transaction inherits oldest lsn of pages associated + * with allocation/deallocation of resources (their + * log records are used to reconstruct allocation map + * at recovery time: inode for inode allocation map, + * B+-tree index of extent descriptors for block + * allocation map); + * allocation map pages inherit transaction lsn at + * commit time to allow forwarding log syncpt past log + * records associated with allocation/deallocation of + * resources only after persistent map of these map pages + * have been updated and propagated to home. 
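+ *
+ * in short: tblk->lsn tracks the oldest (smallest) lsn among the
+ * pages logged by the transaction, so the log sync point (see
+ * lmLogSync()) cannot advance past records recovery still needs.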
+ */ + /* + * initialize transaction lsn: + */ + if (tblk->lsn == 0) { + /* inherit lsn of its first page logged */ + tblk->lsn = mp->lsn; + log->count++; + + /* insert tblock after the page on logsynclist */ + list_add(&tblk->synclist, &mp->synclist); + } + /* + * update transaction lsn: + */ + else { + /* inherit oldest/smallest lsn of page */ + logdiff(diffp, mp->lsn, log); + logdiff(difft, tblk->lsn, log); + if (diffp < difft) { + /* update tblock lsn with page lsn */ + tblk->lsn = mp->lsn; + + /* move tblock after page on logsynclist */ + list_move(&tblk->synclist, &mp->synclist); + } + } + + LOGSYNC_UNLOCK(log, flags); + + /* + * write the log record + */ + writeRecord: + lsn = lmWriteRecord(log, tblk, lrd, tlck); + + /* + * forward log syncpt if log reached next syncpt trigger + */ + logdiff(diffp, lsn, log); + if (diffp >= log->nextsync) + lsn = lmLogSync(log, 0); + + /* update end-of-log lsn */ + log->lsn = lsn; + + LOG_UNLOCK(log); + + /* return end-of-log address */ + return lsn; +} + +/* + * NAME: lmWriteRecord() + * + * FUNCTION: move the log record to current log page + * + * PARAMETER: cd - commit descriptor + * + * RETURN: end-of-log address + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int +lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + int lsn = 0; /* end-of-log address */ + struct lbuf *bp; /* dst log page buffer */ + struct logpage *lp; /* dst log page */ + caddr_t dst; /* destination address in log page */ + int dstoffset; /* end-of-log offset in log page */ + int freespace; /* free space in log page */ + caddr_t p; /* src meta-data page */ + caddr_t src; + int srclen; + int nbytes; /* number of bytes to move */ + int i; + int len; + struct linelock *linelock; + struct lv *lv; + struct lvd *lvd; + int l2linesize; + + len = 0; + + /* retrieve destination log page to write */ + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = log->eor; + + /* any log data to write ? */ + if (tlck == NULL) + goto moveLrd; + + /* + * move log record data + */ + /* retrieve source meta-data page to log */ + if (tlck->flag & tlckPAGELOCK) { + p = (caddr_t) (tlck->mp->data); + linelock = (struct linelock *) & tlck->lock; + } + /* retrieve source in-memory inode to log */ + else if (tlck->flag & tlckINODELOCK) { + if (tlck->type & tlckDTREE) + p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot; + else + p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; + linelock = (struct linelock *) & tlck->lock; + } +#ifdef _JFS_WIP + else if (tlck->flag & tlckINLINELOCK) { + + inlinelock = (struct inlinelock *) & tlck; + p = (caddr_t) & inlinelock->pxd; + linelock = (struct linelock *) & tlck; + } +#endif /* _JFS_WIP */ + else { + jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck); + return 0; /* Probably should trap */ + } + l2linesize = linelock->l2linesize; + + moveData: + ASSERT(linelock->index <= linelock->maxcnt); + + lv = linelock->lv; + for (i = 0; i < linelock->index; i++, lv++) { + if (lv->length == 0) + continue; + + /* is page full ? 
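+ * (usable space in a log page ends at LOGPSIZE - LOGPTLRSIZE; once
+ * dstoffset reaches it, the record continues on the page allocated
+ * by lmNextPage())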
*/ + if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) { + /* page become full: move on to next page */ + lmNextPage(log); + + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + } + + /* + * move log vector data + */ + src = (u8 *) p + (lv->offset << l2linesize); + srclen = lv->length << l2linesize; + len += srclen; + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + dstoffset += nbytes; + + /* is page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + break; + + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + + srclen -= nbytes; + src += nbytes; + } + + /* + * move log vector descriptor + */ + len += 4; + lvd = (struct lvd *) ((caddr_t) lp + dstoffset); + lvd->offset = cpu_to_le16(lv->offset); + lvd->length = cpu_to_le16(lv->length); + dstoffset += 4; + jfs_info("lmWriteRecord: lv offset:%d length:%d", + lv->offset, lv->length); + } + + if ((i = linelock->next)) { + linelock = (struct linelock *) lid_to_tlock(i); + goto moveData; + } + + /* + * move log record descriptor + */ + moveLrd: + lrd->length = cpu_to_le16(len); + + src = (caddr_t) lrd; + srclen = LOGRDSIZE; + + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + + dstoffset += nbytes; + srclen -= nbytes; + + /* are there more to move than freespace of page ? */ + if (srclen) + goto pageFull; + + /* + * end of log record descriptor + */ + + /* update last log record eor */ + log->eor = dstoffset; + bp->l_eor = dstoffset; + lsn = (log->page << L2LOGPSIZE) + dstoffset; + + if (lrd->type & cpu_to_le16(LOG_COMMIT)) { + tblk->clsn = lsn; + jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn, + bp->l_eor); + + INCREMENT(lmStat.commit); /* # of commit */ + + /* + * enqueue tblock for group commit: + * + * enqueue tblock of non-trivial/synchronous COMMIT + * at tail of group commit queue + * (trivial/asynchronous COMMITs are ignored by + * group commit.) + */ + LOGGC_LOCK(log); + + /* init tblock gc state */ + tblk->flag = tblkGC_QUEUE; + tblk->bp = log->bp; + tblk->pn = log->page; + tblk->eor = log->eor; + + /* enqueue transaction to commit queue */ + list_add_tail(&tblk->cqueue, &log->cqueue); + + LOGGC_UNLOCK(log); + } + + jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x", + le16_to_cpu(lrd->type), log->bp, log->page, dstoffset); + + /* page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + return lsn; + + pageFull: + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + src += nbytes; + } + + return lsn; +} + + +/* + * NAME: lmNextPage() + * + * FUNCTION: write current page and allocate next page. 
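+ *
+ * if the full page is bound to an outstanding commit (tail tblk of
+ * the commit queue) it is handed to group commit; otherwise it is
+ * written out with lbmWRITE | lbmRELEASE | lbmFREE. data pages wrap
+ * around to page 2 (page 0 is reserved, page 1 is the superblock).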
+ * + * PARAMETER: log + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int lmNextPage(struct jfs_log * log) +{ + struct logpage *lp; + int lspn; /* log sequence page number */ + int pn; /* current page number */ + struct lbuf *bp; + struct lbuf *nextbp; + struct tblock *tblk; + + /* get current log page number and log sequence page number */ + pn = log->page; + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + lspn = le32_to_cpu(lp->h.page); + + LOGGC_LOCK(log); + + /* + * write or queue the full page at the tail of write queue + */ + /* get the tail tblk on commit queue */ + if (list_empty(&log->cqueue)) + tblk = NULL; + else + tblk = list_entry(log->cqueue.prev, struct tblock, cqueue); + + /* every tblk who has COMMIT record on the current page, + * and has not been committed, must be on commit queue + * since tblk is queued at commit queueu at the time + * of writing its COMMIT record on the page before + * page becomes full (even though the tblk thread + * who wrote COMMIT record may have been suspended + * currently); + */ + + /* is page bound with outstanding tail tblk ? */ + if (tblk && tblk->pn == pn) { + /* mark tblk for end-of-page */ + tblk->flag |= tblkGC_EOP; + + if (log->cflag & logGC_PAGEOUT) { + /* if page is not already on write queue, + * just enqueue (no lbmWRITE to prevent redrive) + * buffer to wqueue to ensure correct serial order + * of the pages since log pages will be added + * continuously + */ + if (bp->l_wqnext == NULL) + lbmWrite(log, bp, 0, 0); + } else { + /* + * No current GC leader, initiate group commit + */ + log->cflag |= logGC_PAGEOUT; + lmGCwrite(log, 0); + } + } + /* page is not bound with outstanding tblk: + * init write or mark it to be redriven (lbmWRITE) + */ + else { + /* finalize the page */ + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0); + } + LOGGC_UNLOCK(log); + + /* + * allocate/initialize next page + */ + /* if log wraps, the first data page of log is 2 + * (0 never used, 1 is superblock). + */ + log->page = (pn == log->size - 1) ? 2 : pn + 1; + log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */ + + /* allocate/initialize next log page buffer */ + nextbp = lbmAllocate(log, log->page); + nextbp->l_eor = log->eor; + log->bp = nextbp; + + /* initialize next log page */ + lp = (struct logpage *) nextbp->l_ldata; + lp->h.page = lp->t.page = cpu_to_le32(lspn + 1); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + return 0; +} + + +/* + * NAME: lmGroupCommit() + * + * FUNCTION: group commit + * initiate pageout of the pages with COMMIT in the order of + * page number - redrive pageout of the page at the head of + * pageout queue until full page has been written. + * + * RETURN: + * + * NOTE: + * LOGGC_LOCK serializes log group commit queue, and + * transaction blocks on the commit queue. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +int lmGroupCommit(struct jfs_log * log, struct tblock * tblk) +{ + int rc = 0; + + LOGGC_LOCK(log); + + /* group committed already ? 
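+ * (tblkGC_COMMITTED is set by lmPostGC() once the page carrying this
+ * transaction's COMMIT record has been written to disk)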
*/ + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; + } + jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc); + + if (tblk->xflag & COMMIT_LAZY) + tblk->flag |= tblkGC_LAZY; + + if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) && + (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag) + || jfs_tlocks_low)) { + /* + * No pageout in progress + * + * start group commit as its group leader. + */ + log->cflag |= logGC_PAGEOUT; + + lmGCwrite(log, 0); + } + + if (tblk->xflag & COMMIT_LAZY) { + /* + * Lazy transactions can leave now + */ + LOGGC_UNLOCK(log); + return 0; + } + + /* lmGCwrite gives up LOGGC_LOCK, check again */ + + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; + } + + /* upcount transaction waiting for completion + */ + log->gcrtc++; + tblk->flag |= tblkGC_READY; + + __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED), + LOGGC_LOCK(log), LOGGC_UNLOCK(log)); + + /* removed from commit queue */ + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; +} + +/* + * NAME: lmGCwrite() + * + * FUNCTION: group commit write + * initiate write of log page, building a group of all transactions + * with commit records on that page. + * + * RETURN: None + * + * NOTE: + * LOGGC_LOCK must be held by caller. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +static void lmGCwrite(struct jfs_log * log, int cant_write) +{ + struct lbuf *bp; + struct logpage *lp; + int gcpn; /* group commit page number */ + struct tblock *tblk; + struct tblock *xtblk = NULL; + + /* + * build the commit group of a log page + * + * scan commit queue and make a commit group of all + * transactions with COMMIT records on the same log page. + */ + /* get the head tblk on the commit queue */ + gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn; + + list_for_each_entry(tblk, &log->cqueue, cqueue) { + if (tblk->pn != gcpn) + break; + + xtblk = tblk; + + /* state transition: (QUEUE, READY) -> COMMIT */ + tblk->flag |= tblkGC_COMMIT; + } + tblk = xtblk; /* last tblk of the page */ + + /* + * pageout to commit transactions on the log page. + */ + bp = (struct lbuf *) tblk->bp; + lp = (struct logpage *) bp->l_ldata; + /* is page already full ? */ + if (tblk->flag & tblkGC_EOP) { + /* mark page to free at end of group commit of the page */ + tblk->flag &= ~tblkGC_EOP; + tblk->flag |= tblkGC_FREE; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC, + cant_write); + INCREMENT(lmStat.full_page); + } + /* page is not yet full */ + else { + bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */ + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write); + INCREMENT(lmStat.partial_page); + } +} + +/* + * NAME: lmPostGC() + * + * FUNCTION: group commit post-processing + * Processes transactions after their commit records have been written + * to disk, redriving log I/O if necessary. + * + * RETURN: None + * + * NOTE: + * This routine is called a interrupt time by lbmIODone + */ +static void lmPostGC(struct lbuf * bp) +{ + unsigned long flags; + struct jfs_log *log = bp->l_log; + struct logpage *lp; + struct tblock *tblk, *temp; + + //LOGGC_LOCK(log); + spin_lock_irqsave(&log->gclock, flags); + /* + * current pageout of group commit completed. 
+ * + * remove/wakeup transactions from commit queue who were + * group committed with the current log page + */ + list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) { + if (!(tblk->flag & tblkGC_COMMIT)) + break; + /* if transaction was marked GC_COMMIT then + * it has been shipped in the current pageout + * and made it to disk - it is committed. + */ + + if (bp->l_flag & lbmERROR) + tblk->flag |= tblkGC_ERROR; + + /* remove it from the commit queue */ + list_del(&tblk->cqueue); + tblk->flag &= ~tblkGC_QUEUE; + + if (tblk == log->flush_tblk) { + /* we can stop flushing the log now */ + clear_bit(log_FLUSH, &log->flag); + log->flush_tblk = NULL; + } + + jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk, + tblk->flag); + + if (!(tblk->xflag & COMMIT_FORCE)) + /* + * Hand tblk over to lazy commit thread + */ + txLazyUnlock(tblk); + else { + /* state transition: COMMIT -> COMMITTED */ + tblk->flag |= tblkGC_COMMITTED; + + if (tblk->flag & tblkGC_READY) + log->gcrtc--; + + LOGGC_WAKEUP(tblk); + } + + /* was page full before pageout ? + * (and this is the last tblk bound with the page) + */ + if (tblk->flag & tblkGC_FREE) + lbmFree(bp); + /* did page become full after pageout ? + * (and this is the last tblk bound with the page) + */ + else if (tblk->flag & tblkGC_EOP) { + /* finalize the page */ + lp = (struct logpage *) bp->l_ldata; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + jfs_info("lmPostGC: calling lbmWrite"); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, + 1); + } + + } + + /* are there any transactions who have entered lnGroupCommit() + * (whose COMMITs are after that of the last log page written. + * They are waiting for new group commit (above at (SLEEP 1)) + * or lazy transactions are on a full (queued) log page, + * select the latest ready transaction as new group leader and + * wake her up to lead her group. + */ + if ((!list_empty(&log->cqueue)) && + ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) || + test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low)) + /* + * Call lmGCwrite with new group leader + */ + lmGCwrite(log, 1); + + /* no transaction are ready yet (transactions are only just + * queued (GC_QUEUE) and not entered for group commit yet). + * the first transaction entering group commit + * will elect herself as new group leader. + */ + else + log->cflag &= ~logGC_PAGEOUT; + + //LOGGC_UNLOCK(log); + spin_unlock_irqrestore(&log->gclock, flags); + return; +} + +/* + * NAME: lmLogSync() + * + * FUNCTION: write log SYNCPT record for specified log + * if new sync address is available + * (normally the case if sync() is executed by back-ground + * process). + * calculate new value of i_nextsync which determines when + * this code is called again. 
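+ * (the next trigger is written + min(free/2, LOGSYNC_DELTA(logsize));
+ * if that grant comes to less than two log pages, the sync point is
+ * simply reset to the current lsn: see the log wrap handling below)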
+ * + * PARAMETERS: log - log structure + * hard_sync - 1 to force all metadata to be written + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int lmLogSync(struct jfs_log * log, int hard_sync) +{ + int logsize; + int written; /* written since last syncpt */ + int free; /* free space left available */ + int delta; /* additional delta to write normally */ + int more; /* additional write granted */ + struct lrd lrd; + int lsn; + struct logsyncblk *lp; + unsigned long flags; + + /* push dirty metapages out to disk */ + if (hard_sync) + write_special_inodes(log, filemap_fdatawrite); + else + write_special_inodes(log, filemap_flush); + + /* + * forward syncpt + */ + /* if last sync is same as last syncpt, + * invoke sync point forward processing to update sync. + */ + + if (log->sync == log->syncpt) { + LOGSYNC_LOCK(log, flags); + if (list_empty(&log->synclist)) + log->sync = log->lsn; + else { + lp = list_entry(log->synclist.next, + struct logsyncblk, synclist); + log->sync = lp->lsn; + } + LOGSYNC_UNLOCK(log, flags); + + } + + /* if sync is different from last syncpt, + * write a SYNCPT record with syncpt = sync. + * reset syncpt = sync + */ + if (log->sync != log->syncpt) { + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = cpu_to_le32(log->sync); + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + + log->syncpt = log->sync; + } else + lsn = log->lsn; + + /* + * setup next syncpt trigger (SWAG) + */ + logsize = log->logsize; + + logdiff(written, lsn, log); + free = logsize - written; + delta = LOGSYNC_DELTA(logsize); + more = min(free / 2, delta); + if (more < 2 * LOGPSIZE) { + jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); + /* + * log wrapping + * + * option 1 - panic ? No.! + * option 2 - shutdown file systems + * associated with log ? + * option 3 - extend log ? + * option 4 - second chance + * + * mark log wrapped, and continue. + * when all active transactions are completed, + * mark log valid for recovery. + * if crashed during invalid state, log state + * implies invalid log, forcing fsck(). + */ + /* mark log state log wrap in log superblock */ + /* log->state = LOGWRAP; */ + + /* reset sync point computation */ + log->syncpt = log->sync = lsn; + log->nextsync = delta; + } else + /* next syncpt trigger = written + more */ + log->nextsync = written + more; + + /* if number of bytes written from last sync point is more + * than 1/4 of the log size, stop new transactions from + * starting until all current transactions are completed + * by setting syncbarrier flag. + */ + if (!test_bit(log_SYNCBARRIER, &log->flag) && + (written > LOGSYNC_BARRIER(logsize)) && log->active) { + set_bit(log_SYNCBARRIER, &log->flag); + jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn, + log->syncpt); + /* + * We may have to initiate group commit + */ + jfs_flush_journal(log, 0); + } + + return lsn; +} + +/* + * NAME: jfs_syncpt + * + * FUNCTION: write log SYNCPT record for specified log + * + * PARAMETERS: log - log structure + * hard_sync - set to 1 to force metadata to be written + */ +void jfs_syncpt(struct jfs_log *log, int hard_sync) +{ LOG_LOCK(log); + lmLogSync(log, hard_sync); + LOG_UNLOCK(log); +} + +/* + * NAME: lmLogOpen() + * + * FUNCTION: open the log on first open; + * insert filesystem in the active list of the log. 
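+ *
+ * an external journal may be shared by several file systems: the
+ * jfs_external_logs list is searched for an already-open log on the
+ * same device (uuids must match); only if none is found is a new
+ * jfs_log allocated and initialized via lmLogInit().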
+ * + * PARAMETER: ipmnt - file system mount inode + * iplog - log inode (out) + * + * RETURN: + * + * serialization: + */ +int lmLogOpen(struct super_block *sb) +{ + int rc; + struct block_device *bdev; + struct jfs_log *log; + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (sbi->flag & JFS_NOINTEGRITY) + return open_dummy_log(sb); + + if (sbi->mntflag & JFS_INLINELOG) + return open_inline_log(sb); + + mutex_lock(&jfs_log_mutex); + list_for_each_entry(log, &jfs_external_logs, journal_list) { + if (log->bdev->bd_dev == sbi->logdev) { + if (memcmp(log->uuid, sbi->loguuid, + sizeof(log->uuid))) { + jfs_warn("wrong uuid on JFS journal\n"); + mutex_unlock(&jfs_log_mutex); + return -EINVAL; + } + /* + * add file system to log active file system list + */ + if ((rc = lmLogFileSystem(log, sbi, 1))) { + mutex_unlock(&jfs_log_mutex); + return rc; + } + goto journal_found; + } + } + + if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) { + mutex_unlock(&jfs_log_mutex); + return -ENOMEM; + } + INIT_LIST_HEAD(&log->sb_list); + init_waitqueue_head(&log->syncwait); + + /* + * external log as separate logical volume + * + * file systems to log may have n-to-1 relationship; + */ + + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); + if (IS_ERR(bdev)) { + rc = PTR_ERR(bdev); + goto free; + } + + log->bdev = bdev; + memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); + + /* + * initialize log: + */ + if ((rc = lmLogInit(log))) + goto close; + + list_add(&log->journal_list, &jfs_external_logs); + + /* + * add file system to log active file system list + */ + if ((rc = lmLogFileSystem(log, sbi, 1))) + goto shutdown; + +journal_found: + LOG_LOCK(log); + list_add(&sbi->log_list, &log->sb_list); + sbi->log = log; + LOG_UNLOCK(log); + + mutex_unlock(&jfs_log_mutex); + return 0; + + /* + * unwind on error + */ + shutdown: /* unwind lbmLogInit() */ + list_del(&log->journal_list); + lbmLogShutdown(log); + + close: /* close external log device */ + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + free: /* free log descriptor */ + mutex_unlock(&jfs_log_mutex); + kfree(log); + + jfs_warn("lmLogOpen: exit(%d)", rc); + return rc; +} + +static int open_inline_log(struct super_block *sb) +{ + struct jfs_log *log; + int rc; + + if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) + return -ENOMEM; + INIT_LIST_HEAD(&log->sb_list); + init_waitqueue_head(&log->syncwait); + + set_bit(log_INLINELOG, &log->flag); + log->bdev = sb->s_bdev; + log->base = addressPXD(&JFS_SBI(sb)->logpxd); + log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> + (L2LOGPSIZE - sb->s_blocksize_bits); + log->l2bsize = sb->s_blocksize_bits; + ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits); + + /* + * initialize log. 
+ */ + if ((rc = lmLogInit(log))) { + kfree(log); + jfs_warn("lmLogOpen: exit(%d)", rc); + return rc; + } + + list_add(&JFS_SBI(sb)->log_list, &log->sb_list); + JFS_SBI(sb)->log = log; + + return rc; +} + +static int open_dummy_log(struct super_block *sb) +{ + int rc; + + mutex_lock(&jfs_log_mutex); + if (!dummy_log) { + dummy_log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL); + if (!dummy_log) { + mutex_unlock(&jfs_log_mutex); + return -ENOMEM; + } + INIT_LIST_HEAD(&dummy_log->sb_list); + init_waitqueue_head(&dummy_log->syncwait); + dummy_log->no_integrity = 1; + /* Make up some stuff */ + dummy_log->base = 0; + dummy_log->size = 1024; + rc = lmLogInit(dummy_log); + if (rc) { + kfree(dummy_log); + dummy_log = NULL; + mutex_unlock(&jfs_log_mutex); + return rc; + } + } + + LOG_LOCK(dummy_log); + list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list); + JFS_SBI(sb)->log = dummy_log; + LOG_UNLOCK(dummy_log); + mutex_unlock(&jfs_log_mutex); + + return 0; +} + +/* + * NAME: lmLogInit() + * + * FUNCTION: log initialization at first log open. + * + * logredo() (or logformat()) should have been run previously. + * initialize the log from log superblock. + * set the log state in the superblock to LOGMOUNT and + * write SYNCPT log record. + * + * PARAMETER: log - log structure + * + * RETURN: 0 - if ok + * -EINVAL - bad log magic number or superblock dirty + * error returned from logwait() + * + * serialization: single first open thread + */ +int lmLogInit(struct jfs_log * log) +{ + int rc = 0; + struct lrd lrd; + struct logsuper *logsuper; + struct lbuf *bpsuper; + struct lbuf *bp; + struct logpage *lp; + int lsn = 0; + + jfs_info("lmLogInit: log:0x%p", log); + + /* initialize the group commit serialization lock */ + LOGGC_LOCK_INIT(log); + + /* allocate/initialize the log write serialization lock */ + LOG_LOCK_INIT(log); + + LOGSYNC_LOCK_INIT(log); + + INIT_LIST_HEAD(&log->synclist); + + INIT_LIST_HEAD(&log->cqueue); + log->flush_tblk = NULL; + + log->count = 0; + + /* + * initialize log i/o + */ + if ((rc = lbmLogInit(log))) + return rc; + + if (!test_bit(log_INLINELOG, &log->flag)) + log->l2bsize = L2LOGPSIZE; + + /* check for disabled journaling to disk */ + if (log->no_integrity) { + /* + * Journal pages will still be filled. When the time comes + * to actually do the I/O, the write is not done, and the + * endio routine is called directly. + */ + bp = lbmAllocate(log , 0); + log->bp = bp; + bp->l_pn = bp->l_eor = 0; + } else { + /* + * validate log superblock + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto errout10; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + + if (logsuper->magic != cpu_to_le32(LOGMAGIC)) { + jfs_warn("*** Log Format Error ! ***"); + rc = -EINVAL; + goto errout20; + } + + /* logredo() should have been run successfully. */ + if (logsuper->state != cpu_to_le32(LOGREDONE)) { + jfs_warn("*** Log Is Dirty ! 
***"); + rc = -EINVAL; + goto errout20; + } + + /* initialize log from log superblock */ + if (test_bit(log_INLINELOG,&log->flag)) { + if (log->size != le32_to_cpu(logsuper->size)) { + rc = -EINVAL; + goto errout20; + } + jfs_info("lmLogInit: inline log:0x%p base:0x%Lx " + "size:0x%x", log, + (unsigned long long) log->base, log->size); + } else { + if (memcmp(logsuper->uuid, log->uuid, 16)) { + jfs_warn("wrong uuid on JFS log device"); + goto errout20; + } + log->size = le32_to_cpu(logsuper->size); + log->l2bsize = le32_to_cpu(logsuper->l2bsize); + jfs_info("lmLogInit: external log:0x%p base:0x%Lx " + "size:0x%x", log, + (unsigned long long) log->base, log->size); + } + + log->page = le32_to_cpu(logsuper->end) / LOGPSIZE; + log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page); + + /* + * initialize for log append write mode + */ + /* establish current/end-of-log page/buffer */ + if ((rc = lbmRead(log, log->page, &bp))) + goto errout20; + + lp = (struct logpage *) bp->l_ldata; + + jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d", + le32_to_cpu(logsuper->end), log->page, log->eor, + le16_to_cpu(lp->h.eor)); + + log->bp = bp; + bp->l_pn = log->page; + bp->l_eor = log->eor; + + /* if current page is full, move on to next page */ + if (log->eor >= LOGPSIZE - LOGPTLRSIZE) + lmNextPage(log); + + /* + * initialize log syncpoint + */ + /* + * write the first SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !); + * remove current page from lbm write queue at end of pageout + * (to write log superblock update), but do not release to + * freelist; + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + bp->l_ceor = bp->l_eor; + lp = (struct logpage *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0); + if ((rc = lbmIOWait(bp, 0))) + goto errout30; + + /* + * update/write superblock + */ + logsuper->state = cpu_to_le32(LOGMOUNT); + log->serial = le32_to_cpu(logsuper->serial) + 1; + logsuper->serial = cpu_to_le32(log->serial); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + if ((rc = lbmIOWait(bpsuper, lbmFREE))) + goto errout30; + } + + /* initialize logsync parameters */ + log->logsize = (log->size - 2) << L2LOGPSIZE; + log->lsn = lsn; + log->syncpt = lsn; + log->sync = log->syncpt; + log->nextsync = LOGSYNC_DELTA(log->logsize); + + jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x", + log->lsn, log->syncpt, log->sync); + + /* + * initialize for lazy/group commit + */ + log->clsn = lsn; + + return 0; + + /* + * unwind on error + */ + errout30: /* release log page */ + log->wqueue = NULL; + bp->l_wqnext = NULL; + lbmFree(bp); + + errout20: /* release log superblock */ + lbmFree(bpsuper); + + errout10: /* unwind lbmLogInit() */ + lbmLogShutdown(log); + + jfs_warn("lmLogInit: exit(%d)", rc); + return rc; +} + + +/* + * NAME: lmLogClose() + * + * FUNCTION: remove file system from active list of log + * and close it on last close. 
+ * + * PARAMETER: sb - superblock + * + * RETURN: errors from subroutines + * + * serialization: + */ +int lmLogClose(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + struct block_device *bdev; + int rc = 0; + + jfs_info("lmLogClose: log:0x%p", log); + + mutex_lock(&jfs_log_mutex); + LOG_LOCK(log); + list_del(&sbi->log_list); + LOG_UNLOCK(log); + sbi->log = NULL; + + /* + * We need to make sure all of the "written" metapages + * actually make it to disk + */ + sync_blockdev(sb->s_bdev); + + if (test_bit(log_INLINELOG, &log->flag)) { + /* + * in-line log in host file system + */ + rc = lmLogShutdown(log); + kfree(log); + goto out; + } + + if (!log->no_integrity) + lmLogFileSystem(log, sbi, 0); + + if (!list_empty(&log->sb_list)) + goto out; + + /* + * TODO: ensure that the dummy_log is in a state to allow + * lbmLogShutdown to deallocate all the buffers and call + * kfree against dummy_log. For now, leave dummy_log & its + * buffers in memory, and resuse if another no-integrity mount + * is requested. + */ + if (log->no_integrity) + goto out; + + /* + * external log as separate logical volume + */ + list_del(&log->journal_list); + bdev = log->bdev; + rc = lmLogShutdown(log); + + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + kfree(log); + + out: + mutex_unlock(&jfs_log_mutex); + jfs_info("lmLogClose: exit(%d)", rc); + return rc; +} + + +/* + * NAME: jfs_flush_journal() + * + * FUNCTION: initiate write of any outstanding transactions to the journal + * and optionally wait until they are all written to disk + * + * wait == 0 flush until latest txn is committed, don't wait + * wait == 1 flush until latest txn is committed, wait + * wait > 1 flush until all txn's are complete, wait + */ +void jfs_flush_journal(struct jfs_log *log, int wait) +{ + int i; + struct tblock *target = NULL; + + /* jfs_write_inode may call us during read-only mount */ + if (!log) + return; + + jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait); + + LOGGC_LOCK(log); + + if (!list_empty(&log->cqueue)) { + /* + * This ensures that we will keep writing to the journal as long + * as there are unwritten commit records + */ + target = list_entry(log->cqueue.prev, struct tblock, cqueue); + + if (test_bit(log_FLUSH, &log->flag)) { + /* + * We're already flushing. + * if flush_tblk is NULL, we are flushing everything, + * so leave it that way. 
Otherwise, update it to the + * latest transaction + */ + if (log->flush_tblk) + log->flush_tblk = target; + } else { + /* Only flush until latest transaction is committed */ + log->flush_tblk = target; + set_bit(log_FLUSH, &log->flag); + + /* + * Initiate I/O on outstanding transactions + */ + if (!(log->cflag & logGC_PAGEOUT)) { + log->cflag |= logGC_PAGEOUT; + lmGCwrite(log, 0); + } + } + } + if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) { + /* Flush until all activity complete */ + set_bit(log_FLUSH, &log->flag); + log->flush_tblk = NULL; + } + + if (wait && target && !(target->flag & tblkGC_COMMITTED)) { + DECLARE_WAITQUEUE(__wait, current); + + add_wait_queue(&target->gcwait, &__wait); + set_current_state(TASK_UNINTERRUPTIBLE); + LOGGC_UNLOCK(log); + schedule(); + __set_current_state(TASK_RUNNING); + LOGGC_LOCK(log); + remove_wait_queue(&target->gcwait, &__wait); + } + LOGGC_UNLOCK(log); + + if (wait < 2) + return; + + write_special_inodes(log, filemap_fdatawrite); + + /* + * If there was recent activity, we may need to wait + * for the lazycommit thread to catch up + */ + if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { + for (i = 0; i < 200; i++) { /* Too much? */ + msleep(250); + write_special_inodes(log, filemap_fdatawrite); + if (list_empty(&log->cqueue) && + list_empty(&log->synclist)) + break; + } + } + assert(list_empty(&log->cqueue)); + +#ifdef CONFIG_JFS_DEBUG + if (!list_empty(&log->synclist)) { + struct logsyncblk *lp; + + printk(KERN_ERR "jfs_flush_journal: synclist not empty\n"); + list_for_each_entry(lp, &log->synclist, synclist) { + if (lp->xflag & COMMIT_PAGE) { + struct metapage *mp = (struct metapage *)lp; + print_hex_dump(KERN_ERR, "metapage: ", + DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(struct metapage), 0); + print_hex_dump(KERN_ERR, "page: ", + DUMP_PREFIX_ADDRESS, 16, + sizeof(long), mp->page, + sizeof(struct page), 0); + } else + print_hex_dump(KERN_ERR, "tblock:", + DUMP_PREFIX_ADDRESS, 16, 4, + lp, sizeof(struct tblock), 0); + } + } +#else + WARN_ON(!list_empty(&log->synclist)); +#endif + clear_bit(log_FLUSH, &log->flag); +} + +/* + * NAME: lmLogShutdown() + * + * FUNCTION: log shutdown at last LogClose(). + * + * write log syncpt record. + * update super block to set redone flag to 0. + * + * PARAMETER: log - log inode + * + * RETURN: 0 - success + * + * serialization: single last close thread + */ +int lmLogShutdown(struct jfs_log * log) +{ + int rc; + struct lrd lrd; + int lsn; + struct logsuper *logsuper; + struct lbuf *bpsuper; + struct lbuf *bp; + struct logpage *lp; + + jfs_info("lmLogShutdown: log:0x%p", log); + + jfs_flush_journal(log, 2); + + /* + * write the last SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !) + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0); + lbmIOWait(log->bp, lbmFREE); + log->bp = NULL; + + /* + * synchronous update log superblock + * mark log state as shutdown cleanly + * (i.e., Log does not need to be replayed). 
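+ *
+ * the superblock state is set to LOGREDONE and 'end' is pointed at
+ * the final SYNCPT record, so the next mount does not need logredo.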
+ */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto out; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->end = cpu_to_le32(lsn); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d", + lsn, log->page, log->eor); + + out: + /* + * shutdown per log i/o + */ + lbmLogShutdown(log); + + if (rc) { + jfs_warn("lmLogShutdown: exit(%d)", rc); + } + return rc; +} + + +/* + * NAME: lmLogFileSystem() + * + * FUNCTION: insert ( = true)/remove ( = false) + * file system into/from log active file system list. + * + * PARAMETE: log - pointer to logs inode. + * fsdev - kdev_t of filesystem. + * serial - pointer to returned log serial number + * activate - insert/remove device from active list. + * + * RETURN: 0 - success + * errors returned by vms_iowait(). + */ +static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, + int activate) +{ + int rc = 0; + int i; + struct logsuper *logsuper; + struct lbuf *bpsuper; + char *uuid = sbi->uuid; + + /* + * insert/remove file system device to log active file system list. + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + return rc; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + if (activate) { + for (i = 0; i < MAX_ACTIVE; i++) + if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) { + memcpy(logsuper->active[i].uuid, uuid, 16); + sbi->aggregate = i; + break; + } + if (i == MAX_ACTIVE) { + jfs_warn("Too many file systems sharing journal!"); + lbmFree(bpsuper); + return -EMFILE; /* Is there a better rc? */ + } + } else { + for (i = 0; i < MAX_ACTIVE; i++) + if (!memcmp(logsuper->active[i].uuid, uuid, 16)) { + memcpy(logsuper->active[i].uuid, NULL_UUID, 16); + break; + } + if (i == MAX_ACTIVE) { + jfs_warn("Somebody stomped on the journal!"); + lbmFree(bpsuper); + return -EIO; + } + + } + + /* + * synchronous write log superblock: + * + * write sidestream bypassing write queue: + * at file system mount, log super block is updated for + * activation of the file system before any log record + * (MOUNT record) of the file system, and at file system + * unmount, all meta data for the file system has been + * flushed before log super block is updated for deactivation + * of the file system. + */ + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + return rc; +} + +/* + * log buffer manager (lbm) + * ------------------------ + * + * special purpose buffer manager supporting log i/o requirements. + * + * per log write queue: + * log pageout occurs in serial order by fifo write queue and + * restricting to a single i/o in pregress at any one time. + * a circular singly-linked list + * (log->wrqueue points to the tail, and buffers are linked via + * bp->wrqueue field), and + * maintains log page in pageout ot waiting for pageout in serial pageout. + */ + +/* + * lbmLogInit() + * + * initialize per log I/O setup at lmLogInit() + */ +static int lbmLogInit(struct jfs_log * log) +{ /* log inode */ + int i; + struct lbuf *lbuf; + + jfs_info("lbmLogInit: log:0x%p", log); + + /* initialize current buffer cursor */ + log->bp = NULL; + + /* initialize log device write queue */ + log->wqueue = NULL; + + /* + * Each log has its own buffer pages allocated to it. These are + * not managed by the page cache. This ensures that a transaction + * writing to the log does not block trying to allocate a page from + * the page cache (for the log). 
This would be bad, since page + * allocation waits on the kswapd thread that may be committing inodes + * which would cause log activity. Was that clear? I'm trying to + * avoid deadlock here. + */ + init_waitqueue_head(&log->free_wait); + + log->lbuf_free = NULL; + + for (i = 0; i < LOGPAGES;) { + char *buffer; + uint offset; + struct page *page; + + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (buffer == NULL) + goto error; + page = virt_to_page(buffer); + for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) { + lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL); + if (lbuf == NULL) { + if (offset == 0) + free_page((unsigned long) buffer); + goto error; + } + if (offset) /* we already have one reference */ + get_page(page); + lbuf->l_offset = offset; + lbuf->l_ldata = buffer + offset; + lbuf->l_page = page; + lbuf->l_log = log; + init_waitqueue_head(&lbuf->l_ioevent); + + lbuf->l_freelist = log->lbuf_free; + log->lbuf_free = lbuf; + i++; + } + } + + return (0); + + error: + lbmLogShutdown(log); + return -ENOMEM; +} + + +/* + * lbmLogShutdown() + * + * finalize per log I/O setup at lmLogShutdown() + */ +static void lbmLogShutdown(struct jfs_log * log) +{ + struct lbuf *lbuf; + + jfs_info("lbmLogShutdown: log:0x%p", log); + + lbuf = log->lbuf_free; + while (lbuf) { + struct lbuf *next = lbuf->l_freelist; + __free_page(lbuf->l_page); + kfree(lbuf); + lbuf = next; + } +} + + +/* + * lbmAllocate() + * + * allocate an empty log buffer + */ +static struct lbuf *lbmAllocate(struct jfs_log * log, int pn) +{ + struct lbuf *bp; + unsigned long flags; + + /* + * recycle from log buffer freelist if any + */ + LCACHE_LOCK(flags); + LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags); + log->lbuf_free = bp->l_freelist; + LCACHE_UNLOCK(flags); + + bp->l_flag = 0; + + bp->l_wqnext = NULL; + bp->l_freelist = NULL; + + bp->l_pn = pn; + bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize)); + bp->l_ceor = 0; + + return bp; +} + + +/* + * lbmFree() + * + * release a log buffer to freelist + */ +static void lbmFree(struct lbuf * bp) +{ + unsigned long flags; + + LCACHE_LOCK(flags); + + lbmfree(bp); + + LCACHE_UNLOCK(flags); +} + +static void lbmfree(struct lbuf * bp) +{ + struct jfs_log *log = bp->l_log; + + assert(bp->l_wqnext == NULL); + + /* + * return the buffer to head of freelist + */ + bp->l_freelist = log->lbuf_free; + log->lbuf_free = bp; + + wake_up(&log->free_wait); + return; +} + + +/* + * NAME: lbmRedrive + * + * FUNCTION: add a log buffer to the log redrive list + * + * PARAMETER: + * bp - log buffer + * + * NOTES: + * Takes log_redrive_lock. 
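+ *
+ * the buffer is pushed onto the singly-linked log_redrive_list and
+ * jfsIOthread is woken; the actual lbmStartIO() is issued from
+ * process context in jfsIOWait(), since it cannot be done at
+ * interrupt time (see lbmIODone()).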
+ */ +static inline void lbmRedrive(struct lbuf *bp) +{ + unsigned long flags; + + spin_lock_irqsave(&log_redrive_lock, flags); + bp->l_redrive_next = log_redrive_list; + log_redrive_list = bp; + spin_unlock_irqrestore(&log_redrive_lock, flags); + + wake_up_process(jfsIOthread); +} + + +/* + * lbmRead() + */ +static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) +{ + struct bio *bio; + struct lbuf *bp; + + /* + * allocate a log buffer + */ + *bpp = bp = lbmAllocate(log, pn); + jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn); + + bp->l_flag |= lbmREAD; + + bio = bio_alloc(GFP_NOFS, 1); + + bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_bdev = log->bdev; + bio->bi_io_vec[0].bv_page = bp->l_page; + bio->bi_io_vec[0].bv_len = LOGPSIZE; + bio->bi_io_vec[0].bv_offset = bp->l_offset; + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = LOGPSIZE; + + bio->bi_end_io = lbmIODone; + bio->bi_private = bp; + submit_bio(READ_SYNC, bio); + + wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); + + return 0; +} + + +/* + * lbmWrite() + * + * buffer at head of pageout queue stays after completion of + * partial-page pageout and redriven by explicit initiation of + * pageout by caller until full-page pageout is completed and + * released. + * + * device driver i/o done redrives pageout of new buffer at + * head of pageout queue when current buffer at head of pageout + * queue is released at the completion of its full-page pageout. + * + * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit(). + * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone() + */ +static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, + int cant_block) +{ + struct lbuf *tail; + unsigned long flags; + + jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn); + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + LCACHE_LOCK(flags); /* disable+lock */ + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag; + + /* + * insert bp at tail of write queue associated with log + * + * (request is either for bp already/currently at head of queue + * or new bp to be inserted at tail) + */ + tail = log->wqueue; + + /* is buffer not already on write queue ? */ + if (bp->l_wqnext == NULL) { + /* insert at tail of wqueue */ + if (tail == NULL) { + log->wqueue = bp; + bp->l_wqnext = bp; + } else { + log->wqueue = bp; + bp->l_wqnext = tail->l_wqnext; + tail->l_wqnext = bp; + } + + tail = bp; + } + + /* is buffer at head of wqueue and for write ? 
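+ * (only the buffer at the head of the circular write queue is
+ * submitted here; later buffers stay queued until lbmIODone()
+ * redrives the next one when the head completes)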
*/ + if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + return; + } + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + if (cant_block) + lbmRedrive(bp); + else if (flag & lbmSYNC) + lbmStartIO(bp); + else { + LOGGC_UNLOCK(log); + lbmStartIO(bp); + LOGGC_LOCK(log); + } +} + + +/* + * lbmDirectWrite() + * + * initiate pageout bypassing write queue for sidestream + * (e.g., log superblock) write; + */ +static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) +{ + jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x", + bp, flag, bp->l_pn); + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag | lbmDIRECT; + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + /* + * initiate pageout of the page + */ + lbmStartIO(bp); +} + + +/* + * NAME: lbmStartIO() + * + * FUNCTION: Interface to DD strategy routine + * + * RETURN: none + * + * serialization: LCACHE_LOCK() is NOT held during log i/o; + */ +static void lbmStartIO(struct lbuf * bp) +{ + struct bio *bio; + struct jfs_log *log = bp->l_log; + + jfs_info("lbmStartIO\n"); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_bdev = log->bdev; + bio->bi_io_vec[0].bv_page = bp->l_page; + bio->bi_io_vec[0].bv_len = LOGPSIZE; + bio->bi_io_vec[0].bv_offset = bp->l_offset; + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = LOGPSIZE; + + bio->bi_end_io = lbmIODone; + bio->bi_private = bp; + + /* check if journaling to disk has been disabled */ + if (log->no_integrity) { + bio->bi_size = 0; + lbmIODone(bio, 0); + } else { + submit_bio(WRITE_SYNC, bio); + INCREMENT(lmStat.submitted); + } +} + + +/* + * lbmIOWait() + */ +static int lbmIOWait(struct lbuf * bp, int flag) +{ + unsigned long flags; + int rc = 0; + + jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); + + LCACHE_LOCK(flags); /* disable+lock */ + + LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags); + + rc = (bp->l_flag & lbmERROR) ? -EIO : 0; + + if (flag & lbmFREE) + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); + return rc; +} + +/* + * lbmIODone() + * + * executed at INTIODONE level + */ +static void lbmIODone(struct bio *bio, int error) +{ + struct lbuf *bp = bio->bi_private; + struct lbuf *nextbp, *tail; + struct jfs_log *log; + unsigned long flags; + + /* + * get back jfs buffer bound to the i/o buffer + */ + jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag); + + LCACHE_LOCK(flags); /* disable+lock */ + + bp->l_flag |= lbmDONE; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + bp->l_flag |= lbmERROR; + + jfs_err("lbmIODone: I/O error in JFS log"); + } + + bio_put(bio); + + /* + * pagein completion + */ + if (bp->l_flag & lbmREAD) { + bp->l_flag &= ~lbmREAD; + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + + return; + } + + /* + * pageout completion + * + * the bp at the head of write queue has completed pageout. + * + * if single-commit/full-page pageout, remove the current buffer + * from head of pageout queue, and redrive pageout with + * the new buffer at head of pageout queue; + * otherwise, the partial-page pageout buffer stays at + * the head of pageout queue to be redriven for pageout + * by lmGroupCommit() until full-page pageout is completed. 
+ */ + bp->l_flag &= ~lbmWRITE; + INCREMENT(lmStat.pagedone); + + /* update committed lsn */ + log = bp->l_log; + log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor; + + if (bp->l_flag & lbmDIRECT) { + LCACHE_WAKEUP(&bp->l_ioevent); + LCACHE_UNLOCK(flags); + return; + } + + tail = log->wqueue; + + /* single element queue */ + if (bp == tail) { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + log->wqueue = NULL; + bp->l_wqnext = NULL; + } + } + /* multi element queue */ + else { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + nextbp = tail->l_wqnext = bp->l_wqnext; + bp->l_wqnext = NULL; + + /* + * redrive pageout of next page at head of write queue: + * redrive next page without any bound tblk + * (i.e., page w/o any COMMIT records), or + * first page of new group commit which has been + * queued after current page (subsequent pageout + * is performed synchronously, except page without + * any COMMITs) by lmGroupCommit() as indicated + * by lbmWRITE flag; + */ + if (nextbp->l_flag & lbmWRITE) { + /* + * We can't do the I/O at interrupt time. + * The jfsIO thread can do it + */ + lbmRedrive(nextbp); + } + } + } + + /* + * synchronous pageout: + * + * buffer has not necessarily been removed from write queue + * (e.g., synchronous write of partial-page with COMMIT): + * leave buffer for i/o initiator to dispose + */ + if (bp->l_flag & lbmSYNC) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + } + + /* + * Group Commit pageout: + */ + else if (bp->l_flag & lbmGC) { + LCACHE_UNLOCK(flags); + lmPostGC(bp); + } + + /* + * asynchronous pageout: + * + * buffer must have been removed from write queue: + * insert buffer at head of freelist where it can be recycled + */ + else { + assert(bp->l_flag & lbmRELEASE); + assert(bp->l_flag & lbmFREE); + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + } +} + +int jfsIOWait(void *arg) +{ + struct lbuf *bp; + + do { + spin_lock_irq(&log_redrive_lock); + while ((bp = log_redrive_list)) { + log_redrive_list = bp->l_redrive_next; + bp->l_redrive_next = NULL; + spin_unlock_irq(&log_redrive_lock); + lbmStartIO(bp); + spin_lock_irq(&log_redrive_lock); + } + + if (freezing(current)) { + spin_unlock_irq(&log_redrive_lock); + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&log_redrive_lock); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + + jfs_info("jfsIOWait being killed!"); + return 0; +} + +/* + * NAME: lmLogFormat()/jfs_logform() + * + * FUNCTION: format file system log + * + * PARAMETERS: + * log - volume log + * logAddress - start address of log space in FS block + * logSize - length of log space in FS block; + * + * RETURN: 0 - success + * -EIO - i/o error + * + * XXX: We're synchronously writing one page at a time. This needs to + * be improved by writing multiple pages at once. 
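lmLogFormat() below lays the log out as page 0 reserved, page 1 the log superblock, and pages 2..npages-1 as data pages whose log sequence page numbers (lpsn) simulate a log that has already wrapped: the first data page gets the highest lpsn, so that logredo's binary search for the log end still works on a freshly formatted log. A small standalone sketch of that numbering, assuming only what the comments and code below state:

#include <stdio.h>

/*
 * For a freshly formatted log of 'npages' pages (page 0 reserved,
 * page 1 the log superblock), return the lpsn that would be written
 * into data page 'pn'.  With N = npages - 2 data pages, page 2 gets
 * N - 1 and pages 3..npages-1 get 0..N-2.
 */
static int format_lpsn(int npages, int pn)
{
	if (pn < 2 || pn >= npages)
		return -1;		/* not a data page */
	if (pn == 2)
		return npages - 3;	/* N - 1: simulated wrap point */
	return pn - 3;			/* 0, 1, ..., N - 2 */
}

int main(void)
{
	int npages = 8;			/* tiny example log, N = 6 data pages */
	int pn;

	for (pn = 2; pn < npages; pn++)
		printf("page %d -> lpsn %d\n", pn, format_lpsn(npages, pn));
	/* prints lpsn 5, 0, 1, 2, 3, 4 for pages 2..7 */
	return 0;
}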
+ */ +int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) +{ + int rc = -EIO; + struct jfs_sb_info *sbi; + struct logsuper *logsuper; + struct logpage *lp; + int lspn; /* log sequence page number */ + struct lrd *lrd_ptr; + int npages = 0; + struct lbuf *bp; + + jfs_info("lmLogFormat: logAddress:%Ld logSize:%d", + (long long)logAddress, logSize); + + sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list); + + /* allocate a log buffer */ + bp = lbmAllocate(log, 1); + + npages = logSize >> sbi->l2nbperpage; + + /* + * log space: + * + * page 0 - reserved; + * page 1 - log superblock; + * page 2 - log data page: A SYNC log record is written + * into this page at logform time; + * pages 3-N - log data page: set to empty log data pages; + */ + /* + * init log superblock: log page 1 + */ + logsuper = (struct logsuper *) bp->l_ldata; + + logsuper->magic = cpu_to_le32(LOGMAGIC); + logsuper->version = cpu_to_le32(LOGVERSION); + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->flag = cpu_to_le32(sbi->mntflag); /* ? */ + logsuper->size = cpu_to_le32(npages); + logsuper->bsize = cpu_to_le32(sbi->bsize); + logsuper->l2bsize = cpu_to_le32(sbi->l2bsize); + logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE); + + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + bp->l_blkno = logAddress + sbi->nbperpage; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + + /* + * init pages 2 to npages-1 as log data pages: + * + * log page sequence number (lpsn) initialization: + * + * pn: 0 1 2 3 n-1 + * +-----+-----+=====+=====+===.....===+=====+ + * lspn: N-1 0 1 N-2 + * <--- N page circular file ----> + * + * the N (= npages-2) data pages of the log is maintained as + * a circular file for the log records; + * lpsn grows by 1 monotonically as each log page is written + * to the circular file of the log; + * and setLogpage() will not reset the page number even if + * the eor is equal to LOGPHDRSIZE. In order for binary search + * still work in find log end process, we have to simulate the + * log wrap situation at the log format time. + * The 1st log page written will have the highest lpsn. Then + * the succeeding log pages will have ascending order of + * the lspn starting from 0, ... 
(N-2) + */ + lp = (struct logpage *) bp->l_ldata; + /* + * initialize 1st log page to be written: lpsn = N - 1, + * write a SYNCPT log record is written to this page + */ + lp->h.page = lp->t.page = cpu_to_le32(npages - 3); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE); + + lrd_ptr = (struct lrd *) &lp->data; + lrd_ptr->logtid = 0; + lrd_ptr->backchain = 0; + lrd_ptr->type = cpu_to_le16(LOG_SYNCPT); + lrd_ptr->length = 0; + lrd_ptr->log.syncpt.sync = 0; + + bp->l_blkno += sbi->nbperpage; + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + + /* + * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) + */ + for (lspn = 0; lspn < npages - 3; lspn++) { + lp->h.page = lp->t.page = cpu_to_le32(lspn); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + bp->l_blkno += sbi->nbperpage; + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + } + + rc = 0; +exit: + /* + * finalize log + */ + /* release the buffer */ + lbmFree(bp); + + return rc; +} + +#ifdef CONFIG_JFS_STATISTICS +static int jfs_lmstats_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Logmgr stats\n" + "================\n" + "commits = %d\n" + "writes submitted = %d\n" + "writes completed = %d\n" + "full pages submitted = %d\n" + "partial pages submitted = %d\n", + lmStat.commit, + lmStat.submitted, + lmStat.pagedone, + lmStat.full_page, + lmStat.partial_page); + return 0; +} + +static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_lmstats_proc_show, NULL); +} + +const struct file_operations jfs_lmstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_lmstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_JFS_STATISTICS */ diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h new file mode 100644 index 00000000..e38c2159 --- /dev/null +++ b/fs/jfs/jfs_logmgr.h @@ -0,0 +1,513 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_LOGMGR +#define _H_JFS_LOGMGR + +#include "jfs_filsys.h" +#include "jfs_lock.h" + +/* + * log manager configuration parameters + */ + +/* log page size */ +#define LOGPSIZE 4096 +#define L2LOGPSIZE 12 + +#define LOGPAGES 16 /* Log pages per mounted file system */ + +/* + * log logical volume + * + * a log is used to make the commit operation on journalled + * files within the same logical volume group atomic. + * a log is implemented with a logical volume. + * there is one log per logical volume group. + * + * block 0 of the log logical volume is not used (ipl etc). 
+ * block 1 contains a log "superblock" and is used by logFormat(), + * lmLogInit(), lmLogShutdown(), and logRedo() to record status + * of the log but is not otherwise used during normal processing. + * blocks 2 - (N-1) are used to contain log records. + * + * when a volume group is varied-on-line, logRedo() must have + * been executed before the file systems (logical volumes) in + * the volume group can be mounted. + */ +/* + * log superblock (block 1 of logical volume) + */ +#define LOGSUPER_B 1 +#define LOGSTART_B 2 + +#define LOGMAGIC 0x87654321 +#define LOGVERSION 1 + +#define MAX_ACTIVE 128 /* Max active file systems sharing log */ + +struct logsuper { + __le32 magic; /* 4: log lv identifier */ + __le32 version; /* 4: version number */ + __le32 serial; /* 4: log open/mount counter */ + __le32 size; /* 4: size in number of LOGPSIZE blocks */ + __le32 bsize; /* 4: logical block size in byte */ + __le32 l2bsize; /* 4: log2 of bsize */ + + __le32 flag; /* 4: option */ + __le32 state; /* 4: state - see below */ + + __le32 end; /* 4: addr of last log record set by logredo */ + char uuid[16]; /* 16: 128-bit journal uuid */ + char label[16]; /* 16: journal label */ + struct { + char uuid[16]; + } active[MAX_ACTIVE]; /* 2048: active file systems list */ +}; + +#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + +/* log flag: commit option (see jfs_filsys.h) */ + +/* log state */ +#define LOGMOUNT 0 /* log mounted by lmLogInit() */ +#define LOGREDONE 1 /* log shutdown by lmLogShutdown(). + * log redo completed by logredo(). + */ +#define LOGWRAP 2 /* log wrapped */ +#define LOGREADERR 3 /* log read error detected in logredo() */ + + +/* + * log logical page + * + * (this comment should be rewritten !) + * the header and trailer structures (h,t) will normally have + * the same page and eor value. + * An exception to this occurs when a complete page write is not + * accomplished on a power failure. Since the hardware may "split write" + * sectors in the page, any out of order sequence may occur during powerfail + * and needs to be recognized during log replay. The xor value is + * an "exclusive or" of all log words in the page up to eor. This + * 32 bit eor is stored with the top 16 bits in the header and the + * bottom 16 bits in the trailer. logredo can easily recognize pages + * that were not completed by reconstructing this eor and checking + * the log page. + * + * Previous versions of the operating system did not allow split + * writes and detected partially written records in logredo by + * ordering the updates to the header, trailer, and the move of data + * into the logdata area. The order: (1) data is moved (2) header + * is updated (3) trailer is updated. In logredo, when the header + * differed from the trailer, the header and trailer were reconciled + * as follows: if h.page != t.page they were set to the smaller of + * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) + * h.eor != t.eor they were set to the smaller of their two values. 
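The comment above describes both the current detection of incomplete ("split") page writes and the older rule that reconciled a mismatching header and trailer directly. A standalone sketch of that older rule, with illustrative field names (logredo itself is not reproduced here):

#include <stdint.h>
#include <stdio.h>

#define LOGPHDRSIZE 8	/* log page header size, as defined below */

struct logpage_ends {
	uint32_t h_page, t_page;	/* header/trailer page numbers */
	uint16_t h_eor, t_eor;		/* header/trailer end-of-record */
};

/*
 * Older reconciliation rule, as described in the comment above: if the
 * page numbers disagree the page is treated as empty; if only the eor
 * values disagree, the smaller one wins.
 */
static void reconcile(struct logpage_ends *p)
{
	if (p->h_page != p->t_page) {
		uint32_t pn = p->h_page < p->t_page ? p->h_page : p->t_page;

		p->h_page = p->t_page = pn;
		p->h_eor = p->t_eor = LOGPHDRSIZE;	/* empty page */
	} else if (p->h_eor != p->t_eor) {
		uint16_t eor = p->h_eor < p->t_eor ? p->h_eor : p->t_eor;

		p->h_eor = p->t_eor = eor;
	}
}

int main(void)
{
	struct logpage_ends torn = { 42, 41, 200, 96 };

	reconcile(&torn);
	printf("page %u eor %u\n", (unsigned)torn.h_page,
	       (unsigned)torn.h_eor);		/* page 41 eor 8 */
	return 0;
}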
+ */ +struct logpage { + struct { /* header */ + __le32 page; /* 4: log sequence page number */ + __le16 rsrvd; /* 2: */ + __le16 eor; /* 2: end-of-log offset of lasrt record write */ + } h; + + __le32 data[LOGPSIZE / 4 - 4]; /* log record area */ + + struct { /* trailer */ + __le32 page; /* 4: normally the same as h.page */ + __le16 rsrvd; /* 2: */ + __le16 eor; /* 2: normally the same as h.eor */ + } t; +}; + +#define LOGPHDRSIZE 8 /* log page header size */ +#define LOGPTLRSIZE 8 /* log page trailer size */ + + +/* + * log record + * + * (this comment should be rewritten !) + * jfs uses only "after" log records (only a single writer is allowed + * in a page, pages are written to temporary paging space if + * if they must be written to disk before commit, and i/o is + * scheduled for modified pages to their home location after + * the log records containing the after values and the commit + * record is written to the log on disk, undo discards the copy + * in main-memory.) + * + * a log record consists of a data area of variable length followed by + * a descriptor of fixed size LOGRDSIZE bytes. + * the data area is rounded up to an integral number of 4-bytes and + * must be no longer than LOGPSIZE. + * the descriptor is of size of multiple of 4-bytes and aligned on a + * 4-byte boundary. + * records are packed one after the other in the data area of log pages. + * (sometimes a DUMMY record is inserted so that at least one record ends + * on every page or the longest record is placed on at most two pages). + * the field eor in page header/trailer points to the byte following + * the last record on a page. + */ + +/* log record types */ +#define LOG_COMMIT 0x8000 +#define LOG_SYNCPT 0x4000 +#define LOG_MOUNT 0x2000 +#define LOG_REDOPAGE 0x0800 +#define LOG_NOREDOPAGE 0x0080 +#define LOG_NOREDOINOEXT 0x0040 +#define LOG_UPDATEMAP 0x0008 +#define LOG_NOREDOFILE 0x0001 + +/* REDOPAGE/NOREDOPAGE log record data type */ +#define LOG_INODE 0x0001 +#define LOG_XTREE 0x0002 +#define LOG_DTREE 0x0004 +#define LOG_BTROOT 0x0010 +#define LOG_EA 0x0020 +#define LOG_ACL 0x0040 +#define LOG_DATA 0x0080 +#define LOG_NEW 0x0100 +#define LOG_EXTEND 0x0200 +#define LOG_RELOCATE 0x0400 +#define LOG_DIR_XTREE 0x0800 /* Xtree is in directory inode */ + +/* UPDATEMAP log record descriptor type */ +#define LOG_ALLOCXADLIST 0x0080 +#define LOG_ALLOCPXDLIST 0x0040 +#define LOG_ALLOCXAD 0x0020 +#define LOG_ALLOCPXD 0x0010 +#define LOG_FREEXADLIST 0x0008 +#define LOG_FREEPXDLIST 0x0004 +#define LOG_FREEXAD 0x0002 +#define LOG_FREEPXD 0x0001 + + +struct lrd { + /* + * type independent area + */ + __le32 logtid; /* 4: log transaction identifier */ + __le32 backchain; /* 4: ptr to prev record of same transaction */ + __le16 type; /* 2: record type */ + __le16 length; /* 2: length of data in record (in byte) */ + __le32 aggregate; /* 4: file system lv/aggregate */ + /* (16) */ + + /* + * type dependent area (20) + */ + union { + + /* + * COMMIT: commit + * + * transaction commit: no type-dependent information; + */ + + /* + * REDOPAGE: after-image + * + * apply after-image; + * + * N.B. 
REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: REDOPAGE record type */ + __le16 l2linesize; /* 2: log2 of line size */ + pxd_t pxd; /* 8: on-disk page pxd */ + } redopage; /* (20) */ + + /* + * NOREDOPAGE: the page is freed + * + * do not apply after-image records which precede this record + * in the log with the same page block number to this page. + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: NOREDOPAGE record type */ + __le16 rsrvd; /* 2: reserved */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredopage; /* (20) */ + + /* + * UPDATEMAP: update block allocation map + * + * either in-line PXD, + * or out-of-line XADLIST; + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: UPDATEMAP record type */ + __le16 nxd; /* 2: number of extents */ + pxd_t pxd; /* 8: pxd */ + } updatemap; /* (20) */ + + /* + * NOREDOINOEXT: the inode extent is freed + * + * do not apply after-image records which precede this + * record in the log with the any of the 4 page block + * numbers in this inode extent. + * + * NOTE: The fileset and pxd fields MUST remain in + * the same fields in the REDOPAGE record format. + * + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 iagnum; /* 4: IAG number */ + __le32 inoext_idx; /* 4: inode extent index */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredoinoext; /* (20) */ + + /* + * SYNCPT: log sync point + * + * replay log up to syncpt address specified; + */ + struct { + __le32 sync; /* 4: syncpt address (0 = here) */ + } syncpt; + + /* + * MOUNT: file system mount + * + * file system mount: no type-dependent information; + */ + + /* + * ? FREEXTENT: free specified extent(s) + * + * free specified extent(s) from block allocation map + * N.B.: nextents should be length of data/sizeof(xad_t) + */ + struct { + __le32 type; /* 4: FREEXTENT record type */ + __le32 nextent; /* 4: number of extents */ + + /* data: PXD or XAD list */ + } freextent; + + /* + * ? NOREDOFILE: this file is freed + * + * do not apply records which precede this record in the log + * with the same inode number. + * + * NOREDOFILE must be the first to be written at commit + * (last to be read in logredo()) - it prevents + * replay of preceding updates of all preceding generations + * of the inumber esp. the on-disk inode itself. + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + } noredofile; + + /* + * ? NEWPAGE: + * + * metadata type dependent + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le32 type; /* 4: NEWPAGE record type */ + pxd_t pxd; /* 8: on-disk page pxd */ + } newpage; + + /* + * ? DUMMY: filler + * + * no type-dependent information + */ + } log; +}; /* (36) */ + +#define LOGRDSIZE (sizeof(struct lrd)) + +/* + * line vector descriptor + */ +struct lvd { + __le16 offset; + __le16 length; +}; + + +/* + * log logical volume + */ +struct jfs_log { + + struct list_head sb_list;/* This is used to sync metadata + * before writing syncpt. 
+ */ + struct list_head journal_list; /* Global list */ + struct block_device *bdev; /* 4: log lv pointer */ + int serial; /* 4: log mount serial number */ + + s64 base; /* @8: log extent address (inline log ) */ + int size; /* 4: log size in log page (in page) */ + int l2bsize; /* 4: log2 of bsize */ + + unsigned long flag; /* 4: flag */ + + struct lbuf *lbuf_free; /* 4: free lbufs */ + wait_queue_head_t free_wait; /* 4: */ + + /* log write */ + int logtid; /* 4: log tid */ + int page; /* 4: page number of eol page */ + int eor; /* 4: eor of last record in eol page */ + struct lbuf *bp; /* 4: current log page buffer */ + + struct mutex loglock; /* 4: log write serialization lock */ + + /* syncpt */ + int nextsync; /* 4: bytes to write before next syncpt */ + int active; /* 4: */ + wait_queue_head_t syncwait; /* 4: */ + + /* commit */ + uint cflag; /* 4: */ + struct list_head cqueue; /* FIFO commit queue */ + struct tblock *flush_tblk; /* tblk we're waiting on for flush */ + int gcrtc; /* 4: GC_READY transaction count */ + struct tblock *gclrt; /* 4: latest GC_READY transaction */ + spinlock_t gclock; /* 4: group commit lock */ + int logsize; /* 4: log data area size in byte */ + int lsn; /* 4: end-of-log */ + int clsn; /* 4: clsn */ + int syncpt; /* 4: addr of last syncpt record */ + int sync; /* 4: addr from last logsync() */ + struct list_head synclist; /* 8: logsynclist anchor */ + spinlock_t synclock; /* 4: synclist lock */ + struct lbuf *wqueue; /* 4: log pageout queue */ + int count; /* 4: count */ + char uuid[16]; /* 16: 128-bit uuid of log device */ + + int no_integrity; /* 3: flag to disable journaling to disk */ +}; + +/* + * Log flag + */ +#define log_INLINELOG 1 +#define log_SYNCBARRIER 2 +#define log_QUIESCE 3 +#define log_FLUSH 4 + +/* + * group commit flag + */ +/* jfs_log */ +#define logGC_PAGEOUT 0x00000001 + +/* tblock/lbuf */ +#define tblkGC_QUEUE 0x0001 +#define tblkGC_READY 0x0002 +#define tblkGC_COMMIT 0x0004 +#define tblkGC_COMMITTED 0x0008 +#define tblkGC_EOP 0x0010 +#define tblkGC_FREE 0x0020 +#define tblkGC_LEADER 0x0040 +#define tblkGC_ERROR 0x0080 +#define tblkGC_LAZY 0x0100 // D230860 +#define tblkGC_UNLOCKED 0x0200 // D230860 + +/* + * log cache buffer header + */ +struct lbuf { + struct jfs_log *l_log; /* 4: log associated with buffer */ + + /* + * data buffer base area + */ + uint l_flag; /* 4: pageout control flags */ + + struct lbuf *l_wqnext; /* 4: write queue link */ + struct lbuf *l_freelist; /* 4: freelistlink */ + + int l_pn; /* 4: log page number */ + int l_eor; /* 4: log record eor */ + int l_ceor; /* 4: committed log record eor */ + + s64 l_blkno; /* 8: log page block number */ + caddr_t l_ldata; /* 4: data page */ + struct page *l_page; /* The page itself */ + uint l_offset; /* Offset of l_ldata within the page */ + + wait_queue_head_t l_ioevent; /* 4: i/o done event */ +}; + +/* Reuse l_freelist for redrive list */ +#define l_redrive_next l_freelist + +/* + * logsynclist block + * + * common logsyncblk prefix for jbuf_t and tblock + */ +struct logsyncblk { + u16 xflag; /* flags */ + u16 flag; /* only meaninful in tblock */ + lid_t lid; /* lock id */ + s32 lsn; /* log sequence number */ + struct list_head synclist; /* log sync list link */ +}; + +/* + * logsynclist serialization (per log) + */ + +#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock) +#define LOGSYNC_LOCK(log, flags) spin_lock_irqsave(&(log)->synclock, flags) +#define LOGSYNC_UNLOCK(log, flags) \ + spin_unlock_irqrestore(&(log)->synclock, flags) + +/* compute the 
difference in bytes of lsn from sync point */ +#define logdiff(diff, lsn, log)\ +{\ + diff = (lsn) - (log)->syncpt;\ + if (diff < 0)\ + diff += (log)->logsize;\ +} + +extern int lmLogOpen(struct super_block *sb); +extern int lmLogClose(struct super_block *sb); +extern int lmLogShutdown(struct jfs_log * log); +extern int lmLogInit(struct jfs_log * log); +extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize); +extern int lmGroupCommit(struct jfs_log *, struct tblock *); +extern int jfsIOWait(void *); +extern void jfs_flush_journal(struct jfs_log * log, int wait); +extern void jfs_syncpt(struct jfs_log *log, int hard_sync); + +#endif /* _H_JFS_LOGMGR */ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c new file mode 100644 index 00000000..6740d34c --- /dev/null +++ b/fs/jfs/jfs_metapage.c @@ -0,0 +1,843 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint pagealloc; /* # of page allocations */ + uint pagefree; /* # of page frees */ + uint lockwait; /* # of sleeping lock_metapage() calls */ +} mpStat; +#endif + +#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag) +#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag) + +static inline void unlock_metapage(struct metapage *mp) +{ + clear_bit_unlock(META_locked, &mp->flag); + wake_up(&mp->wait); +} + +static inline void __lock_metapage(struct metapage *mp) +{ + DECLARE_WAITQUEUE(wait, current); + INCREMENT(mpStat.lockwait); + add_wait_queue_exclusive(&mp->wait, &wait); + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (metapage_locked(mp)) { + unlock_page(mp->page); + io_schedule(); + lock_page(mp->page); + } + } while (trylock_metapage(mp)); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&mp->wait, &wait); +} + +/* + * Must have mp->page locked + */ +static inline void lock_metapage(struct metapage *mp) +{ + if (trylock_metapage(mp)) + __lock_metapage(mp); +} + +#define METAPOOL_MIN_PAGES 32 +static struct kmem_cache *metapage_cache; +static mempool_t *metapage_mempool; + +#define MPS_PER_PAGE (PAGE_CACHE_SIZE >> L2PSIZE) + +#if MPS_PER_PAGE > 1 + +struct meta_anchor { + int mp_count; + atomic_t io_count; + struct metapage *mp[MPS_PER_PAGE]; +}; +#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) + +static inline struct metapage *page_to_mp(struct page *page, int offset) +{ + if (!PagePrivate(page)) + return NULL; + return mp_anchor(page)->mp[offset >> L2PSIZE]; 
+} + +static inline int insert_metapage(struct page *page, struct metapage *mp) +{ + struct meta_anchor *a; + int index; + int l2mp_blocks; /* log2 blocks per metapage */ + + if (PagePrivate(page)) + a = mp_anchor(page); + else { + a = kzalloc(sizeof(struct meta_anchor), GFP_NOFS); + if (!a) + return -ENOMEM; + set_page_private(page, (unsigned long)a); + SetPagePrivate(page); + kmap(page); + } + + if (mp) { + l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits; + index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); + a->mp_count++; + a->mp[index] = mp; + } + + return 0; +} + +static inline void remove_metapage(struct page *page, struct metapage *mp) +{ + struct meta_anchor *a = mp_anchor(page); + int l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits; + int index; + + index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); + + BUG_ON(a->mp[index] != mp); + + a->mp[index] = NULL; + if (--a->mp_count == 0) { + kfree(a); + set_page_private(page, 0); + ClearPagePrivate(page); + kunmap(page); + } +} + +static inline void inc_io(struct page *page) +{ + atomic_inc(&mp_anchor(page)->io_count); +} + +static inline void dec_io(struct page *page, void (*handler) (struct page *)) +{ + if (atomic_dec_and_test(&mp_anchor(page)->io_count)) + handler(page); +} + +#else +static inline struct metapage *page_to_mp(struct page *page, int offset) +{ + return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL; +} + +static inline int insert_metapage(struct page *page, struct metapage *mp) +{ + if (mp) { + set_page_private(page, (unsigned long)mp); + SetPagePrivate(page); + kmap(page); + } + return 0; +} + +static inline void remove_metapage(struct page *page, struct metapage *mp) +{ + set_page_private(page, 0); + ClearPagePrivate(page); + kunmap(page); +} + +#define inc_io(page) do {} while(0) +#define dec_io(page, handler) handler(page) + +#endif + +static void init_once(void *foo) +{ + struct metapage *mp = (struct metapage *)foo; + + mp->lid = 0; + mp->lsn = 0; + mp->flag = 0; + mp->data = NULL; + mp->clsn = 0; + mp->log = NULL; + set_bit(META_free, &mp->flag); + init_waitqueue_head(&mp->wait); +} + +static inline struct metapage *alloc_metapage(gfp_t gfp_mask) +{ + return mempool_alloc(metapage_mempool, gfp_mask); +} + +static inline void free_metapage(struct metapage *mp) +{ + mp->flag = 0; + set_bit(META_free, &mp->flag); + + mempool_free(mp, metapage_mempool); +} + +int __init metapage_init(void) +{ + /* + * Allocate the metapage structures + */ + metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage), + 0, 0, init_once); + if (metapage_cache == NULL) + return -ENOMEM; + + metapage_mempool = mempool_create_slab_pool(METAPOOL_MIN_PAGES, + metapage_cache); + + if (metapage_mempool == NULL) { + kmem_cache_destroy(metapage_cache); + return -ENOMEM; + } + + return 0; +} + +void metapage_exit(void) +{ + mempool_destroy(metapage_mempool); + kmem_cache_destroy(metapage_cache); +} + +static inline void drop_metapage(struct page *page, struct metapage *mp) +{ + if (mp->count || mp->nohomeok || test_bit(META_dirty, &mp->flag) || + test_bit(META_io, &mp->flag)) + return; + remove_metapage(page, mp); + INCREMENT(mpStat.pagefree); + free_metapage(mp); +} + +/* + * Metapage address space operations + */ + +static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock, + int *len) +{ + int rc = 0; + int xflag; + s64 xaddr; + sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_blkbits; + + if (lblock >= file_blocks) + return 0; + if 
(lblock + *len > file_blocks) + *len = file_blocks - lblock; + + if (inode->i_ino) { + rc = xtLookup(inode, (s64)lblock, *len, &xflag, &xaddr, len, 0); + if ((rc == 0) && *len) + lblock = (sector_t)xaddr; + else + lblock = 0; + } /* else no mapping */ + + return lblock; +} + +static void last_read_complete(struct page *page) +{ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); +} + +static void metapage_read_end_io(struct bio *bio, int err) +{ + struct page *page = bio->bi_private; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_ERR "metapage_read_end_io: I/O error\n"); + SetPageError(page); + } + + dec_io(page, last_read_complete); + bio_put(bio); +} + +static void remove_from_logsync(struct metapage *mp) +{ + struct jfs_log *log = mp->log; + unsigned long flags; +/* + * This can race. Recheck that log hasn't been set to null, and after + * acquiring logsync lock, recheck lsn + */ + if (!log) + return; + + LOGSYNC_LOCK(log, flags); + if (mp->lsn) { + mp->log = NULL; + mp->lsn = 0; + mp->clsn = 0; + log->count--; + list_del(&mp->synclist); + } + LOGSYNC_UNLOCK(log, flags); +} + +static void last_write_complete(struct page *page) +{ + struct metapage *mp; + unsigned int offset; + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + if (mp && test_bit(META_io, &mp->flag)) { + if (mp->lsn) + remove_from_logsync(mp); + clear_bit(META_io, &mp->flag); + } + /* + * I'd like to call drop_metapage here, but I don't think it's + * safe unless I have the page locked + */ + } + end_page_writeback(page); +} + +static void metapage_write_end_io(struct bio *bio, int err) +{ + struct page *page = bio->bi_private; + + BUG_ON(!PagePrivate(page)); + + if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_ERR "metapage_write_end_io: I/O error\n"); + SetPageError(page); + } + dec_io(page, last_write_complete); + bio_put(bio); +} + +static int metapage_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bio *bio = NULL; + int block_offset; /* block offset of mp within page */ + struct inode *inode = page->mapping->host; + int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage; + int len; + int xlen; + struct metapage *mp; + int redirty = 0; + sector_t lblock; + int nr_underway = 0; + sector_t pblock; + sector_t next_block = 0; + sector_t page_start; + unsigned long bio_bytes = 0; + unsigned long bio_offset = 0; + int offset; + int bad_blocks = 0; + + page_start = (sector_t)page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + + if (!mp || !test_bit(META_dirty, &mp->flag)) + continue; + + if (mp->nohomeok && !test_bit(META_forcewrite, &mp->flag)) { + redirty = 1; + /* + * Make sure this page isn't blocked indefinitely. 
+ * If the journal isn't undergoing I/O, push it + */ + if (mp->log && !(mp->log->cflag & logGC_PAGEOUT)) + jfs_flush_journal(mp->log, 0); + continue; + } + + clear_bit(META_dirty, &mp->flag); + set_bit(META_io, &mp->flag); + block_offset = offset >> inode->i_blkbits; + lblock = page_start + block_offset; + if (bio) { + if (xlen && lblock == next_block) { + /* Contiguous, in memory & on disk */ + len = min(xlen, blocks_per_mp); + xlen -= len; + bio_bytes += len << inode->i_blkbits; + continue; + } + /* Not contiguous */ + if (bio_add_page(bio, page, bio_bytes, bio_offset) < + bio_bytes) + goto add_failed; + /* + * Increment counter before submitting i/o to keep + * count from hitting zero before we're through + */ + inc_io(page); + if (!bio->bi_size) + goto dump_bio; + submit_bio(WRITE, bio); + nr_underway++; + bio = NULL; + } else + inc_io(page); + xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits; + pblock = metapage_get_blocks(inode, lblock, &xlen); + if (!pblock) { + printk(KERN_ERR "JFS: metapage_get_blocks failed\n"); + /* + * We already called inc_io(), but can't cancel it + * with dec_io() until we're done with the page + */ + bad_blocks++; + continue; + } + len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_end_io = metapage_write_end_io; + bio->bi_private = page; + + /* Don't call bio_add_page yet, we may add to this vec */ + bio_offset = offset; + bio_bytes = len << inode->i_blkbits; + + xlen -= len; + next_block = lblock + len; + } + if (bio) { + if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) + goto add_failed; + if (!bio->bi_size) + goto dump_bio; + + submit_bio(WRITE, bio); + nr_underway++; + } + if (redirty) + redirty_page_for_writepage(wbc, page); + + unlock_page(page); + + if (bad_blocks) + goto err_out; + + if (nr_underway == 0) + end_page_writeback(page); + + return 0; +add_failed: + /* We should never reach here, since we're only adding one vec */ + printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); + goto skip; +dump_bio: + print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16, + 4, bio, sizeof(*bio), 0); +skip: + bio_put(bio); + unlock_page(page); + dec_io(page, last_write_complete); +err_out: + while (bad_blocks--) + dec_io(page, last_write_complete); + return -EIO; +} + +static int metapage_readpage(struct file *fp, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct bio *bio = NULL; + int block_offset; + int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + sector_t page_start; /* address of page in fs blocks */ + sector_t pblock; + int xlen; + unsigned int len; + int offset; + + BUG_ON(!PageLocked(page)); + page_start = (sector_t)page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + block_offset = 0; + while (block_offset < blocks_per_page) { + xlen = blocks_per_page - block_offset; + pblock = metapage_get_blocks(inode, page_start + block_offset, + &xlen); + if (pblock) { + if (!PagePrivate(page)) + insert_metapage(page, NULL); + inc_io(page); + if (bio) + submit_bio(READ, bio); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_end_io = metapage_read_end_io; + bio->bi_private = page; + len = xlen << inode->i_blkbits; + offset = block_offset << inode->i_blkbits; + if (bio_add_page(bio, page, len, offset) < len) + goto add_failed; + block_offset += xlen; + } else 
+ block_offset++; + } + if (bio) + submit_bio(READ, bio); + else + unlock_page(page); + + return 0; + +add_failed: + printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); + bio_put(bio); + dec_io(page, last_read_complete); + return -EIO; +} + +static int metapage_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct metapage *mp; + int ret = 1; + int offset; + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + + if (!mp) + continue; + + jfs_info("metapage_releasepage: mp = 0x%p", mp); + if (mp->count || mp->nohomeok || + test_bit(META_dirty, &mp->flag)) { + jfs_info("count = %ld, nohomeok = %d", mp->count, + mp->nohomeok); + ret = 0; + continue; + } + if (mp->lsn) + remove_from_logsync(mp); + remove_metapage(page, mp); + INCREMENT(mpStat.pagefree); + free_metapage(mp); + } + return ret; +} + +static void metapage_invalidatepage(struct page *page, unsigned long offset) +{ + BUG_ON(offset); + + BUG_ON(PageWriteback(page)); + + metapage_releasepage(page, 0); +} + +const struct address_space_operations jfs_metapage_aops = { + .readpage = metapage_readpage, + .writepage = metapage_writepage, + .releasepage = metapage_releasepage, + .invalidatepage = metapage_invalidatepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, + unsigned int size, int absolute, + unsigned long new) +{ + int l2BlocksPerPage; + int l2bsize; + struct address_space *mapping; + struct metapage *mp = NULL; + struct page *page; + unsigned long page_index; + unsigned long page_offset; + + jfs_info("__get_metapage: ino = %ld, lblock = 0x%lx, abs=%d", + inode->i_ino, lblock, absolute); + + l2bsize = inode->i_blkbits; + l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize; + page_index = lblock >> l2BlocksPerPage; + page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize; + if ((page_offset + size) > PAGE_CACHE_SIZE) { + jfs_err("MetaData crosses page boundary!!"); + jfs_err("lblock = %lx, size = %d", lblock, size); + dump_stack(); + return NULL; + } + if (absolute) + mapping = JFS_SBI(inode->i_sb)->direct_inode->i_mapping; + else { + /* + * If an nfs client tries to read an inode that is larger + * than any existing inodes, we may try to read past the + * end of the inode map + */ + if ((lblock << inode->i_blkbits) >= inode->i_size) + return NULL; + mapping = inode->i_mapping; + } + + if (new && (PSIZE == PAGE_CACHE_SIZE)) { + page = grab_cache_page(mapping, page_index); + if (!page) { + jfs_err("grab_cache_page failed!"); + return NULL; + } + SetPageUptodate(page); + } else { + page = read_mapping_page(mapping, page_index, NULL); + if (IS_ERR(page) || !PageUptodate(page)) { + jfs_err("read_mapping_page failed!"); + return NULL; + } + lock_page(page); + } + + mp = page_to_mp(page, page_offset); + if (mp) { + if (mp->logical_size != size) { + jfs_error(inode->i_sb, + "__get_metapage: mp->logical_size != size"); + jfs_err("logical_size = %d, size = %d", + mp->logical_size, size); + dump_stack(); + goto unlock; + } + mp->count++; + lock_metapage(mp); + if (test_bit(META_discard, &mp->flag)) { + if (!new) { + jfs_error(inode->i_sb, + "__get_metapage: using a " + "discarded metapage"); + discard_metapage(mp); + goto unlock; + } + clear_bit(META_discard, &mp->flag); + } + } else { + INCREMENT(mpStat.pagealloc); + mp = alloc_metapage(GFP_NOFS); + mp->page = page; + mp->flag = 0; + mp->xflag = COMMIT_PAGE; + mp->count = 1; + mp->nohomeok = 0; + mp->logical_size = size; + mp->data = 
page_address(page) + page_offset; + mp->index = lblock; + if (unlikely(insert_metapage(page, mp))) { + free_metapage(mp); + goto unlock; + } + lock_metapage(mp); + } + + if (new) { + jfs_info("zeroing mp = 0x%p", mp); + memset(mp->data, 0, PSIZE); + } + + unlock_page(page); + jfs_info("__get_metapage: returning = 0x%p data = 0x%p", mp, mp->data); + return mp; + +unlock: + unlock_page(page); + return NULL; +} + +void grab_metapage(struct metapage * mp) +{ + jfs_info("grab_metapage: mp = 0x%p", mp); + page_cache_get(mp->page); + lock_page(mp->page); + mp->count++; + lock_metapage(mp); + unlock_page(mp->page); +} + +void force_metapage(struct metapage *mp) +{ + struct page *page = mp->page; + jfs_info("force_metapage: mp = 0x%p", mp); + set_bit(META_forcewrite, &mp->flag); + clear_bit(META_sync, &mp->flag); + page_cache_get(page); + lock_page(page); + set_page_dirty(page); + write_one_page(page, 1); + clear_bit(META_forcewrite, &mp->flag); + page_cache_release(page); +} + +void hold_metapage(struct metapage *mp) +{ + lock_page(mp->page); +} + +void put_metapage(struct metapage *mp) +{ + if (mp->count || mp->nohomeok) { + /* Someone else will release this */ + unlock_page(mp->page); + return; + } + page_cache_get(mp->page); + mp->count++; + lock_metapage(mp); + unlock_page(mp->page); + release_metapage(mp); +} + +void release_metapage(struct metapage * mp) +{ + struct page *page = mp->page; + jfs_info("release_metapage: mp = 0x%p, flag = 0x%lx", mp, mp->flag); + + BUG_ON(!page); + + lock_page(page); + unlock_metapage(mp); + + assert(mp->count); + if (--mp->count || mp->nohomeok) { + unlock_page(page); + page_cache_release(page); + return; + } + + if (test_bit(META_dirty, &mp->flag)) { + set_page_dirty(page); + if (test_bit(META_sync, &mp->flag)) { + clear_bit(META_sync, &mp->flag); + write_one_page(page, 1); + lock_page(page); /* write_one_page unlocks the page */ + } + } else if (mp->lsn) /* discard_metapage doesn't remove it */ + remove_from_logsync(mp); + + /* Try to keep metapages from using up too much memory */ + drop_metapage(page, mp); + + unlock_page(page); + page_cache_release(page); +} + +void __invalidate_metapages(struct inode *ip, s64 addr, int len) +{ + sector_t lblock; + int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits; + int BlocksPerPage = 1 << l2BlocksPerPage; + /* All callers are interested in block device's mapping */ + struct address_space *mapping = + JFS_SBI(ip->i_sb)->direct_inode->i_mapping; + struct metapage *mp; + struct page *page; + unsigned int offset; + + /* + * Mark metapages to discard. They will eventually be + * released, but should not be written. 
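The loop that follows rounds addr down to a page boundary and visits every page overlapping [addr, addr + len), discarding only the metapages whose index falls inside the range. A worked standalone example of that index arithmetic, assuming 4K pages and 1K filesystem blocks purely for illustration:

#include <stdio.h>

int main(void)
{
	int l2BlocksPerPage = 2;		/* 4 fs blocks per 4K page */
	long long BlocksPerPage = 1 << l2BlocksPerPage;
	long long addr = 5, len = 6;		/* invalidate blocks 5..10 */
	long long lblock;

	for (lblock = addr & ~(BlocksPerPage - 1);
	     lblock < addr + len; lblock += BlocksPerPage)
		printf("page index %lld covers blocks %lld..%lld\n",
		       lblock >> l2BlocksPerPage, lblock,
		       lblock + BlocksPerPage - 1);
	/*
	 * page index 1 covers blocks 4..7
	 * page index 2 covers blocks 8..11
	 * only metapages whose index lies in [5, 11) are discarded
	 */
	return 0;
}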
+ */ + for (lblock = addr & ~(BlocksPerPage - 1); lblock < addr + len; + lblock += BlocksPerPage) { + page = find_lock_page(mapping, lblock >> l2BlocksPerPage); + if (!page) + continue; + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + if (!mp) + continue; + if (mp->index < addr) + continue; + if (mp->index >= addr + len) + break; + + clear_bit(META_dirty, &mp->flag); + set_bit(META_discard, &mp->flag); + if (mp->lsn) + remove_from_logsync(mp); + } + unlock_page(page); + page_cache_release(page); + } +} + +#ifdef CONFIG_JFS_STATISTICS +static int jfs_mpstat_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Metapage statistics\n" + "=======================\n" + "page allocations = %d\n" + "page frees = %d\n" + "lock waits = %d\n", + mpStat.pagealloc, + mpStat.pagefree, + mpStat.lockwait); + return 0; +} + +static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_mpstat_proc_show, NULL); +} + +const struct file_operations jfs_mpstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_mpstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h new file mode 100644 index 00000000..a78beda8 --- /dev/null +++ b/fs/jfs/jfs_metapage.h @@ -0,0 +1,155 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_METAPAGE +#define _H_JFS_METAPAGE + +#include + +struct metapage { + /* Common logsyncblk prefix (see jfs_logmgr.h) */ + u16 xflag; + u16 unused; + lid_t lid; + int lsn; + struct list_head synclist; + /* End of logsyncblk prefix */ + + unsigned long flag; /* See Below */ + unsigned long count; /* Reference count */ + void *data; /* Data pointer */ + sector_t index; /* block address of page */ + wait_queue_head_t wait; + + /* implementation */ + struct page *page; + unsigned int logical_size; + + /* Journal management */ + int clsn; + int nohomeok; + struct jfs_log *log; +}; + +/* metapage flag */ +#define META_locked 0 +#define META_free 1 +#define META_dirty 2 +#define META_sync 3 +#define META_discard 4 +#define META_forcewrite 5 +#define META_io 6 + +#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag) + +/* function prototypes */ +extern int metapage_init(void); +extern void metapage_exit(void); +extern struct metapage *__get_metapage(struct inode *inode, + unsigned long lblock, unsigned int size, + int absolute, unsigned long new); + +#define read_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, false) + +#define get_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, true) + +extern void release_metapage(struct metapage *); +extern void grab_metapage(struct metapage *); +extern void force_metapage(struct metapage *); + +/* + * hold_metapage and put_metapage are used in conjunction. The page lock + * is not dropped between the two, so no other threads can get or release + * the metapage + */ +extern void hold_metapage(struct metapage *); +extern void put_metapage(struct metapage *); + +static inline void write_metapage(struct metapage *mp) +{ + set_bit(META_dirty, &mp->flag); + release_metapage(mp); +} + +static inline void flush_metapage(struct metapage *mp) +{ + set_bit(META_sync, &mp->flag); + write_metapage(mp); +} + +static inline void discard_metapage(struct metapage *mp) +{ + clear_bit(META_dirty, &mp->flag); + set_bit(META_discard, &mp->flag); + release_metapage(mp); +} + +static inline void metapage_nohomeok(struct metapage *mp) +{ + struct page *page = mp->page; + lock_page(page); + if (!mp->nohomeok++) { + mark_metapage_dirty(mp); + page_cache_get(page); + wait_on_page_writeback(page); + } + unlock_page(page); +} + +/* + * This serializes access to mp->lsn when metapages are added to logsynclist + * without setting nohomeok. i.e. updating imap & dmap + */ +static inline void metapage_wait_for_io(struct metapage *mp) +{ + if (test_bit(META_io, &mp->flag)) + wait_on_page_writeback(mp->page); +} + +/* + * This is called when already holding the metapage + */ +static inline void _metapage_homeok(struct metapage *mp) +{ + if (!--mp->nohomeok) + page_cache_release(mp->page); +} + +static inline void metapage_homeok(struct metapage *mp) +{ + hold_metapage(mp); + _metapage_homeok(mp); + put_metapage(mp); +} + +extern const struct address_space_operations jfs_metapage_aops; + +/* + * This routines invalidate all pages for an extent. 
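Looking back at the metapage_nohomeok()/_metapage_homeok() pair defined just above: the first pin takes an extra page-cache reference and only the last release drops it. A minimal standalone model of that pairing; the counter and reference fields are illustrative stand-ins, not the kernel structures:

#include <assert.h>
#include <stdio.h>

struct mp_model {
	int nohomeok;		/* pin count */
	int page_refs;		/* stands in for the page cache refcount */
};

static void model_nohomeok(struct mp_model *mp)
{
	if (!mp->nohomeok++)
		mp->page_refs++;	/* page_cache_get() on the 0 -> 1 pin */
}

static void model_homeok(struct mp_model *mp)
{
	assert(mp->nohomeok > 0);
	if (!--mp->nohomeok)
		mp->page_refs--;	/* page_cache_release() on the last release */
}

int main(void)
{
	struct mp_model mp = { 0, 1 };	/* one baseline page reference */

	model_nohomeok(&mp);		/* refs: 2 */
	model_nohomeok(&mp);		/* refs: still 2 */
	model_homeok(&mp);		/* refs: still 2 */
	model_homeok(&mp);		/* refs: back to 1 */
	printf("nohomeok=%d page_refs=%d\n", mp.nohomeok, mp.page_refs);
	return 0;
}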
+ */ +extern void __invalidate_metapages(struct inode *, s64, int); +#define invalidate_pxd_metapages(ip, pxd) \ + __invalidate_metapages((ip), addressPXD(&(pxd)), lengthPXD(&(pxd))) +#define invalidate_dxd_metapages(ip, dxd) \ + __invalidate_metapages((ip), addressDXD(&(dxd)), lengthDXD(&(dxd))) +#define invalidate_xad_metapages(ip, xad) \ + __invalidate_metapages((ip), addressXAD(&(xad)), lengthXAD(&(xad))) + +#endif /* _H_JFS_METAPAGE */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c new file mode 100644 index 00000000..9895595f --- /dev/null +++ b/fs/jfs/jfs_mount.c @@ -0,0 +1,507 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Module: jfs_mount.c + * + * note: file system in transition to aggregate/fileset: + * + * file system mount is interpreted as the mount of aggregate, + * if not already mounted, and mount of the single/only fileset in + * the aggregate; + * + * a file system/aggregate is represented by an internal inode + * (aka mount inode) initialized with aggregate superblock; + * each vfs represents a fileset, and points to its "fileset inode + * allocation map inode" (aka fileset inode): + * (an aggregate itself is structured recursively as a filset: + * an internal vfs is constructed and points to its "fileset inode + * allocation map inode" (aka aggregate inode) where each inode + * represents a fileset inode) so that inode number is mapped to + * on-disk inode in uniform way at both aggregate and fileset level; + * + * each vnode/inode of a fileset is linked to its vfs (to facilitate + * per fileset inode operations, e.g., unmount of a fileset, etc.); + * each inode points to the mount inode (to facilitate access to + * per aggregate information, e.g., block size, etc.) as well as + * its file set inode. + * + * aggregate + * ipmnt + * mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap; + * fileset vfs -> vp(1) <-> ... 
<-> vp(n) <->vproot; + */ + +#include +#include + +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + + +/* + * forward references + */ +static int chkSuper(struct super_block *); +static int logMOUNT(struct super_block *sb); + +/* + * NAME: jfs_mount(sb) + * + * FUNCTION: vfs_mount() + * + * PARAMETER: sb - super block + * + * RETURN: -EBUSY - device already mounted or open for write + * -EBUSY - cvrdvp already mounted; + * -EBUSY - mount table full + * -ENOTDIR- cvrdvp not directory on a device mount + * -ENXIO - device open failure + */ +int jfs_mount(struct super_block *sb) +{ + int rc = 0; /* Return code */ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipaimap = NULL; + struct inode *ipaimap2 = NULL; + struct inode *ipimap = NULL; + struct inode *ipbmap = NULL; + + /* + * read/validate superblock + * (initialize mount inode from the superblock) + */ + if ((rc = chkSuper(sb))) { + goto errout20; + } + + ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); + if (ipaimap == NULL) { + jfs_err("jfs_mount: Failed to read AGGREGATE_I"); + rc = -EIO; + goto errout20; + } + sbi->ipaimap = ipaimap; + + jfs_info("jfs_mount: ipaimap:0x%p", ipaimap); + + /* + * initialize aggregate inode allocation map + */ + if ((rc = diMount(ipaimap))) { + jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc); + goto errout21; + } + + /* + * open aggregate block allocation map + */ + ipbmap = diReadSpecial(sb, BMAP_I, 0); + if (ipbmap == NULL) { + rc = -EIO; + goto errout22; + } + + jfs_info("jfs_mount: ipbmap:0x%p", ipbmap); + + sbi->ipbmap = ipbmap; + + /* + * initialize aggregate block allocation map + */ + if ((rc = dbMount(ipbmap))) { + jfs_err("jfs_mount: dbMount failed w/rc = %d", rc); + goto errout22; + } + + /* + * open the secondary aggregate inode allocation map + * + * This is a duplicate of the aggregate inode allocation map. + * + * hand craft a vfs in the same fashion as we did to read ipaimap. + * By adding INOSPEREXT (32) to the inode number, we are telling + * diReadSpecial that we are reading from the secondary aggregate + * inode table. This also creates a unique entry in the inode hash + * table. 
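The errout labels that jfs_mount() uses further down follow the usual unwind pattern: each acquired resource gets a label, and a failure jumps to the label that releases everything acquired so far, in reverse order. A standalone sketch of that pattern with illustrative resources (the real function releases inode maps, not heap buffers):

#include <stdio.h>
#include <stdlib.h>

static int step_fails(int which, int fail_at)
{
	return which == fail_at;	/* simulate a failure at one step */
}

static int mount_like(int fail_at)
{
	int rc = 0;
	char *a = NULL, *b = NULL;

	a = malloc(16);			/* resource 1 (think: ipaimap) */
	if (!a || step_fails(1, fail_at)) {
		rc = -1;
		goto errout1;
	}

	b = malloc(16);			/* resource 2 (think: ipbmap) */
	if (!b || step_fails(2, fail_at)) {
		rc = -2;
		goto errout2;
	}

	printf("mounted\n");
	free(b);
	free(a);
	return 0;

errout2:
	free(b);			/* undo resource 2 */
errout1:
	free(a);			/* undo resource 1 */
	printf("unwound, rc=%d\n", rc);
	return rc;
}

int main(void)
{
	mount_like(0);	/* success path */
	mount_like(2);	/* fail at step 2: releases b, then a */
	return 0;
}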
+ */ + if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { + ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); + if (!ipaimap2) { + jfs_err("jfs_mount: Failed to read AGGREGATE_I"); + rc = -EIO; + goto errout35; + } + sbi->ipaimap2 = ipaimap2; + + jfs_info("jfs_mount: ipaimap2:0x%p", ipaimap2); + + /* + * initialize secondary aggregate inode allocation map + */ + if ((rc = diMount(ipaimap2))) { + jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d", + rc); + goto errout35; + } + } else + /* Secondary aggregate inode table is not valid */ + sbi->ipaimap2 = NULL; + + /* + * mount (the only/single) fileset + */ + /* + * open fileset inode allocation map (aka fileset inode) + */ + ipimap = diReadSpecial(sb, FILESYSTEM_I, 0); + if (ipimap == NULL) { + jfs_err("jfs_mount: Failed to read FILESYSTEM_I"); + /* open fileset secondary inode allocation map */ + rc = -EIO; + goto errout40; + } + jfs_info("jfs_mount: ipimap:0x%p", ipimap); + + /* map further access of per fileset inodes by the fileset inode */ + sbi->ipimap = ipimap; + + /* initialize fileset inode allocation map */ + if ((rc = diMount(ipimap))) { + jfs_err("jfs_mount: diMount failed w/rc = %d", rc); + goto errout41; + } + + goto out; + + /* + * unwind on error + */ + errout41: /* close fileset inode allocation map inode */ + diFreeSpecial(ipimap); + + errout40: /* fileset closed */ + + /* close secondary aggregate inode allocation map */ + if (ipaimap2) { + diUnmount(ipaimap2, 1); + diFreeSpecial(ipaimap2); + } + + errout35: + + /* close aggregate block allocation map */ + dbUnmount(ipbmap, 1); + diFreeSpecial(ipbmap); + + errout22: /* close aggregate inode allocation map */ + + diUnmount(ipaimap, 1); + + errout21: /* close aggregate inodes */ + diFreeSpecial(ipaimap); + errout20: /* aggregate closed */ + + out: + + if (rc) + jfs_err("Mount JFS Failure: %d", rc); + + return rc; +} + +/* + * NAME: jfs_mount_rw(sb, remount) + * + * FUNCTION: Completes read-write mount, or remounts read-only volume + * as read-write + */ +int jfs_mount_rw(struct super_block *sb, int remount) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + /* + * If we are re-mounting a previously read-only volume, we want to + * re-read the inode and block maps, since fsck.jfs may have updated + * them. + */ + if (remount) { + if (chkSuper(sb) || (sbi->state != FM_CLEAN)) + return -EINVAL; + + truncate_inode_pages(sbi->ipimap->i_mapping, 0); + truncate_inode_pages(sbi->ipbmap->i_mapping, 0); + diUnmount(sbi->ipimap, 1); + if ((rc = diMount(sbi->ipimap))) { + jfs_err("jfs_mount_rw: diMount failed!"); + return rc; + } + + dbUnmount(sbi->ipbmap, 1); + if ((rc = dbMount(sbi->ipbmap))) { + jfs_err("jfs_mount_rw: dbMount failed!"); + return rc; + } + } + + /* + * open/initialize log + */ + if ((rc = lmLogOpen(sb))) + return rc; + + /* + * update file system superblock; + */ + if ((rc = updateSuper(sb, FM_MOUNT))) { + jfs_err("jfs_mount: updateSuper failed w/rc = %d", rc); + lmLogClose(sb); + return rc; + } + + /* + * write MOUNT log record of the file system + */ + logMOUNT(sb); + + return rc; +} + +/* + * chkSuper() + * + * validate the superblock of the file system to be mounted and + * get the file system parameters. 
+ * + * returns + * 0 with fragsize set if check successful + * error code if not successful + */ +static int chkSuper(struct super_block *sb) +{ + int rc = 0; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_superblock *j_sb; + struct buffer_head *bh; + int AIM_bytesize, AIT_bytesize; + int expected_AIM_bytesize, expected_AIT_bytesize; + s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr; + s64 byte_addr_diff0, byte_addr_diff1; + s32 bsize; + + if ((rc = readSuper(sb, &bh))) + return rc; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* + * validate superblock + */ + /* validate fs signature */ + if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) || + le32_to_cpu(j_sb->s_version) > JFS_VERSION) { + rc = -EINVAL; + goto out; + } + + bsize = le32_to_cpu(j_sb->s_bsize); +#ifdef _JFS_4K + if (bsize != PSIZE) { + jfs_err("Currently only 4K block size supported!"); + rc = -EINVAL; + goto out; + } +#endif /* _JFS_4K */ + + jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx", + le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state), + (unsigned long long) le64_to_cpu(j_sb->s_size)); + + /* validate the descriptors for Secondary AIM and AIT */ + if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) != + cpu_to_le32(JFS_BAD_SAIT)) { + expected_AIM_bytesize = 2 * PSIZE; + AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize; + expected_AIT_bytesize = 4 * PSIZE; + AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize; + AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize; + AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize; + byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr; + fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize; + byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr; + if ((AIM_bytesize != expected_AIM_bytesize) || + (AIT_bytesize != expected_AIT_bytesize) || + (byte_addr_diff0 != AIM_bytesize) || + (byte_addr_diff1 <= AIT_bytesize)) + j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); + } + + if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) != + cpu_to_le32(JFS_GROUPCOMMIT)) + j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT); + + /* validate fs state */ + if (j_sb->s_state != cpu_to_le32(FM_CLEAN) && + !(sb->s_flags & MS_RDONLY)) { + jfs_err("jfs_mount: Mount Failure: File System Dirty."); + rc = -EINVAL; + goto out; + } + + sbi->state = le32_to_cpu(j_sb->s_state); + sbi->mntflag = le32_to_cpu(j_sb->s_flag); + + /* + * JFS always does I/O by 4K pages. Don't tell the buffer cache + * that we use anything else (leave s_blocksize alone). + */ + sbi->bsize = bsize; + sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize); + + /* + * For now, ignore s_pbsize, l2bfactor. All I/O going through buffer + * cache. + */ + sbi->nbperpage = PSIZE >> sbi->l2bsize; + sbi->l2nbperpage = L2PSIZE - sbi->l2bsize; + sbi->l2niperblk = sbi->l2bsize - L2DISIZE; + if (sbi->mntflag & JFS_INLINELOG) + sbi->logpxd = j_sb->s_logpxd; + else { + sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev)); + memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid)); + memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->uuid)); + } + sbi->fsckpxd = j_sb->s_fsckpxd; + sbi->ait2 = j_sb->s_ait2; + + out: + brelse(bh); + return rc; +} + + +/* + * updateSuper() + * + * update synchronously superblock if it is mounted read-write. 
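updateSuper() below special-cases the nointegrity mount flag: a dirty request is only remembered in p_state, a mount writes FM_DIRTY to disk (so a crash while running without a journal is still detectable), and the clean transition writes back whatever was stashed. A minimal standalone model of just that branch; the enum values and the harness are illustrative:

#include <stdio.h>

enum { FM_CLEAN, FM_MOUNT, FM_DIRTY };	/* illustrative values */

struct sb_model {
	int nointegrity;	/* JFS_NOINTEGRITY mount option */
	int state;		/* state currently recorded on disk */
	int p_state;		/* remembered state (nointegrity only) */
};

/*
 * Returns the state that would be written to the superblock, or -1 if
 * updateSuper() would return without touching the disk.  A sketch of
 * the branch logic below, not the kernel code itself.
 */
static int state_to_write(struct sb_model *sb, int requested)
{
	if (sb->nointegrity) {
		if (requested == FM_DIRTY) {
			sb->p_state = requested;
			return -1;		/* only remembered */
		}
		if (requested == FM_MOUNT) {
			sb->p_state = sb->state;
			return FM_DIRTY;	/* disk marked dirty while mounted */
		}
		if (requested == FM_CLEAN)
			return sb->p_state;	/* write back the stashed state */
		return -1;			/* bad state */
	}
	if (sb->state == FM_DIRTY)
		return -1;	/* on-disk state already FM_DIRTY: leave it alone */
	return requested;
}

int main(void)
{
	struct sb_model sb = { 1, FM_CLEAN, FM_CLEAN };

	printf("%d\n", state_to_write(&sb, FM_MOUNT));	/* 2: FM_DIRTY written */
	printf("%d\n", state_to_write(&sb, FM_DIRTY));	/* -1: nothing written */
	printf("%d\n", state_to_write(&sb, FM_CLEAN));	/* 2: remembered dirty state written */
	return 0;
}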
+ */ +int updateSuper(struct super_block *sb, uint state) +{ + struct jfs_superblock *j_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct buffer_head *bh; + int rc; + + if (sbi->flag & JFS_NOINTEGRITY) { + if (state == FM_DIRTY) { + sbi->p_state = state; + return 0; + } else if (state == FM_MOUNT) { + sbi->p_state = sbi->state; + state = FM_DIRTY; + } else if (state == FM_CLEAN) { + state = sbi->p_state; + } else + jfs_err("updateSuper: bad state"); + } else if (sbi->state == FM_DIRTY) + return 0; + + if ((rc = readSuper(sb, &bh))) + return rc; + + j_sb = (struct jfs_superblock *)bh->b_data; + + j_sb->s_state = cpu_to_le32(state); + sbi->state = state; + + if (state == FM_MOUNT) { + /* record log's dev_t and mount serial number */ + j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev)); + j_sb->s_logserial = cpu_to_le32(sbi->log->serial); + } else if (state == FM_CLEAN) { + /* + * If this volume is shared with OS/2, OS/2 will need to + * recalculate DASD usage, since we don't deal with it. + */ + if (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED)) + j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME); + } + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + return 0; +} + + +/* + * readSuper() + * + * read superblock by raw sector address + */ +int readSuper(struct super_block *sb, struct buffer_head **bpp) +{ + /* read in primary superblock */ + *bpp = sb_bread(sb, SUPER1_OFF >> sb->s_blocksize_bits); + if (*bpp) + return 0; + + /* read in secondary/replicated superblock */ + *bpp = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); + if (*bpp) + return 0; + + return -EIO; +} + + +/* + * logMOUNT() + * + * function: write a MOUNT log record for file system. + * + * MOUNT record keeps logredo() from processing log records + * for this file system past this point in log. + * it is harmless if mount fails. + * + * note: MOUNT record is at aggregate level, not at fileset level, + * since log records of previous mounts of a fileset + * (e.g., AFTER record of extent allocation) have to be processed + * to update block allocation map at aggregate level. + */ +static int logMOUNT(struct super_block *sb) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + struct lrd lrd; + + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_MOUNT); + lrd.length = 0; + lrd.aggregate = cpu_to_le32(new_encode_dev(sb->s_bdev->bd_dev)); + lmLog(log, NULL, &lrd, NULL); + + return 0; +} diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h new file mode 100644 index 00000000..884fc21a --- /dev/null +++ b/fs/jfs/jfs_superblock.h @@ -0,0 +1,121 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_SUPERBLOCK +#define _H_JFS_SUPERBLOCK + +/* + * make the magic number something a human could read + */ +#define JFS_MAGIC "JFS1" /* Magic word */ + +#define JFS_VERSION 2 /* Version number: Version 2 */ + +#define LV_NAME_SIZE 11 /* MUST BE 11 for OS/2 boot sector */ + +/* + * aggregate superblock + * + * The name superblock is too close to super_block, so the name has been + * changed to jfs_superblock. The utilities are still using the old name. + */ +struct jfs_superblock { + char s_magic[4]; /* 4: magic number */ + __le32 s_version; /* 4: version number */ + + __le64 s_size; /* 8: aggregate size in hardware/LVM blocks; + * VFS: number of blocks + */ + __le32 s_bsize; /* 4: aggregate block size in bytes; + * VFS: fragment size + */ + __le16 s_l2bsize; /* 2: log2 of s_bsize */ + __le16 s_l2bfactor; /* 2: log2(s_bsize/hardware block size) */ + __le32 s_pbsize; /* 4: hardware/LVM block size in bytes */ + __le16 s_l2pbsize; /* 2: log2 of s_pbsize */ + __le16 pad; /* 2: padding necessary for alignment */ + + __le32 s_agsize; /* 4: allocation group size in aggr. blocks */ + + __le32 s_flag; /* 4: aggregate attributes: + * see jfs_filsys.h + */ + __le32 s_state; /* 4: mount/unmount/recovery state: + * see jfs_filsys.h + */ + __le32 s_compress; /* 4: > 0 if data compression */ + + pxd_t s_ait2; /* 8: first extent of secondary + * aggregate inode table + */ + + pxd_t s_aim2; /* 8: first extent of secondary + * aggregate inode map + */ + __le32 s_logdev; /* 4: device address of log */ + __le32 s_logserial; /* 4: log serial number at aggregate mount */ + pxd_t s_logpxd; /* 8: inline log extent */ + + pxd_t s_fsckpxd; /* 8: inline fsck work space extent */ + + struct timestruc_t s_time; /* 8: time last updated */ + + __le32 s_fsckloglen; /* 4: Number of filesystem blocks reserved for + * the fsck service log. + * N.B. These blocks are divided among the + * versions kept. This is not a per + * version size. + * N.B. These blocks are included in the + * length field of s_fsckpxd. + */ + s8 s_fscklog; /* 1: which fsck service log is most recent + * 0 => no service log data yet + * 1 => the first one + * 2 => the 2nd one + */ + char s_fpack[11]; /* 11: file system volume name + * N.B. 
This must be 11 bytes to + * conform with the OS/2 BootSector + * requirements + * Only used when s_version is 1 + */ + + /* extendfs() parameter under s_state & FM_EXTENDFS */ + __le64 s_xsize; /* 8: extendfs s_size */ + pxd_t s_xfsckpxd; /* 8: extendfs fsckpxd */ + pxd_t s_xlogpxd; /* 8: extendfs logpxd */ + /* - 128 byte boundary - */ + + char s_uuid[16]; /* 16: 128-bit uuid for volume */ + char s_label[16]; /* 16: volume label */ + char s_loguuid[16]; /* 16: 128-bit uuid for log device */ + +}; + +extern int readSuper(struct super_block *, struct buffer_head **); +extern int updateSuper(struct super_block *, uint); +extern void jfs_error(struct super_block *, const char *, ...); +extern int jfs_mount(struct super_block *); +extern int jfs_mount_rw(struct super_block *, int); +extern int jfs_umount(struct super_block *); +extern int jfs_umount_rw(struct super_block *); +extern int jfs_extendfs(struct super_block *, s64, int); + +extern struct task_struct *jfsIOthread; +extern struct task_struct *jfsSyncThread; + +#endif /*_H_JFS_SUPERBLOCK */ diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c new file mode 100644 index 00000000..f6cc0c09 --- /dev/null +++ b/fs/jfs/jfs_txnmgr.c @@ -0,0 +1,3101 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_txnmgr.c: transaction manager + * + * notes: + * transaction starts with txBegin() and ends with txCommit() + * or txAbort(). + * + * tlock is acquired at the time of update; + * (obviate scan at commit time for xtree and dtree) + * tlock and mp points to each other; + * (no hashlist for mp -> tlock). + * + * special cases: + * tlock on in-memory inode: + * in-place tlock in the in-memory inode itself; + * converted to page lock by iWrite() at commit time. + * + * tlock during write()/mmap() under anonymous transaction (tid = 0): + * transferred (?) to transaction at commit time. 
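+ *
+ * typical use of the interfaces below, shown only as a simplified
+ * sketch (no single caller follows exactly this shape):
+ *
+ *	tid = txBegin(ip->i_sb, 0);
+ *	tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+ *	... modify the metapage under the tlock ...
+ *	rc = txCommit(tid, 1, &ip, 0);
+ *	txEnd(tid);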
+ *
+ *	use the page itself to update allocation maps
+ *	(obviate intermediate replication of allocation/deallocation data)
+ *	hold on to mp+lock thru update of maps
+ */
+
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/completion.h>
+#include <linux/freezer.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kthread.h>
+#include <linux/seq_file.h>
+#include "jfs_incore.h"
+#include "jfs_inode.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dinode.h"
+#include "jfs_imap.h"
+#include "jfs_dmap.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ * transaction management structures
+ */
+static struct {
+	int freetid;		/* index of a free tid structure */
+	int freelock;		/* index of the first free lock word */
+	wait_queue_head_t freewait;	/* eventlist of free tblock */
+	wait_queue_head_t freelockwait;	/* eventlist of free tlock */
+	wait_queue_head_t lowlockwait;	/* eventlist of ample tlocks */
+	int tlocksInUse;	/* Number of tlocks in use */
+	spinlock_t LazyLock;	/* synchronize sync_queue & unlock_queue */
+/*	struct tblock *sync_queue; * Transactions waiting for data sync */
+	struct list_head unlock_queue;	/* Txns waiting to be released */
+	struct list_head anon_list;	/* inodes having anonymous txns */
+	struct list_head anon_list2;	/* inodes having anonymous txns
+					   that couldn't be sync'ed */
+} TxAnchor;
+
+int jfs_tlocks_low;		/* Indicates low number of available tlocks */
+
+#ifdef CONFIG_JFS_STATISTICS
+static struct {
+	uint txBegin;
+	uint txBegin_barrier;
+	uint txBegin_lockslow;
+	uint txBegin_freetid;
+	uint txBeginAnon;
+	uint txBeginAnon_barrier;
+	uint txBeginAnon_lockslow;
+	uint txLockAlloc;
+	uint txLockAlloc_freelock;
+} TxStat;
+#endif
+
+static int nTxBlock = -1;	/* number of transaction blocks */
+module_param(nTxBlock, int, 0);
+MODULE_PARM_DESC(nTxBlock,
+		 "Number of transaction blocks (max:65536)");
+
+static int nTxLock = -1;	/* number of transaction locks */
+module_param(nTxLock, int, 0);
+MODULE_PARM_DESC(nTxLock,
+		 "Number of transaction locks (max:65536)");
+
+struct tblock *TxBlock;		/* transaction block table */
+static int TxLockLWM;		/* Low water mark for number of txLocks used */
+static int TxLockHWM;		/* High water mark for number of txLocks used */
+static int TxLockVHWM;		/* Very High water mark */
+struct tlock *TxLock;		/* transaction lock table */
+
+/*
+ * transaction management lock
+ */
+static DEFINE_SPINLOCK(jfsTxnLock);
+
+#define TXN_LOCK()		spin_lock(&jfsTxnLock)
+#define TXN_UNLOCK()		spin_unlock(&jfsTxnLock)
+
+#define LAZY_LOCK_INIT()	spin_lock_init(&TxAnchor.LazyLock);
+#define LAZY_LOCK(flags)	spin_lock_irqsave(&TxAnchor.LazyLock, flags)
+#define LAZY_UNLOCK(flags)	spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)
+
+static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
+static int jfs_commit_thread_waking;
+
+/*
+ * Retry logic exists outside these macros to protect from spurious wakeups.
+ */ +static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(event, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + TXN_UNLOCK(); + io_schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(event, &wait); +} + +#define TXN_SLEEP(event)\ +{\ + TXN_SLEEP_DROP_LOCK(event);\ + TXN_LOCK();\ +} + +#define TXN_WAKEUP(event) wake_up_all(event) + +/* + * statistics + */ +static struct { + tid_t maxtid; /* 4: biggest tid ever used */ + lid_t maxlid; /* 4: biggest lid ever used */ + int ntid; /* 4: # of transactions performed */ + int nlid; /* 4: # of tlocks acquired */ + int waitlock; /* 4: # of tlock wait */ +} stattx; + +/* + * forward references + */ +static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck, struct commit * cd); +static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void txAllocPMap(struct inode *ip, struct maplock * maplock, + struct tblock * tblk); +static void txForce(struct tblock * tblk); +static int txLog(struct jfs_log * log, struct tblock * tblk, + struct commit * cd); +static void txUpdateMap(struct tblock * tblk); +static void txRelease(struct tblock * tblk); +static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void LogSyncRelease(struct metapage * mp); + +/* + * transaction block/lock management + * --------------------------------- + */ + +/* + * Get a transaction lock from the free list. If the number in use is + * greater than the high water mark, wake up the sync daemon. This should + * free some anonymous transaction locks. (TXN_LOCK must be held.) 
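+ *
+ * The free list is threaded through the TxLock[] table itself: each
+ * free entry's ->next holds the index (lid) of the next free entry and
+ * lid 0 terminates the list, so allocation is a simple pop of
+ * TxAnchor.freelock, and txLockFree() pushes the lid back on the head.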
+ */ +static lid_t txLockAlloc(void) +{ + lid_t lid; + + INCREMENT(TxStat.txLockAlloc); + if (!TxAnchor.freelock) { + INCREMENT(TxStat.txLockAlloc_freelock); + } + + while (!(lid = TxAnchor.freelock)) + TXN_SLEEP(&TxAnchor.freelockwait); + TxAnchor.freelock = TxLock[lid].next; + HIGHWATERMARK(stattx.maxlid, lid); + if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) { + jfs_info("txLockAlloc tlocks low"); + jfs_tlocks_low = 1; + wake_up_process(jfsSyncThread); + } + + return lid; +} + +static void txLockFree(lid_t lid) +{ + TxLock[lid].tid = 0; + TxLock[lid].next = TxAnchor.freelock; + TxAnchor.freelock = lid; + TxAnchor.tlocksInUse--; + if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) { + jfs_info("txLockFree jfs_tlocks_low no more"); + jfs_tlocks_low = 0; + TXN_WAKEUP(&TxAnchor.lowlockwait); + } + TXN_WAKEUP(&TxAnchor.freelockwait); +} + +/* + * NAME: txInit() + * + * FUNCTION: initialize transaction management structures + * + * RETURN: + * + * serialization: single thread at jfs_init() + */ +int txInit(void) +{ + int k, size; + struct sysinfo si; + + /* Set defaults for nTxLock and nTxBlock if unset */ + + if (nTxLock == -1) { + if (nTxBlock == -1) { + /* Base default on memory size */ + si_meminfo(&si); + if (si.totalram > (256 * 1024)) /* 1 GB */ + nTxLock = 64 * 1024; + else + nTxLock = si.totalram >> 2; + } else if (nTxBlock > (8 * 1024)) + nTxLock = 64 * 1024; + else + nTxLock = nTxBlock << 3; + } + if (nTxBlock == -1) + nTxBlock = nTxLock >> 3; + + /* Verify tunable parameters */ + if (nTxBlock < 16) + nTxBlock = 16; /* No one should set it this low */ + if (nTxBlock > 65536) + nTxBlock = 65536; + if (nTxLock < 256) + nTxLock = 256; /* No one should set it this low */ + if (nTxLock > 65536) + nTxLock = 65536; + + printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n", + nTxBlock, nTxLock); + /* + * initialize transaction block (tblock) table + * + * transaction id (tid) = tblock index + * tid = 0 is reserved. + */ + TxLockLWM = (nTxLock * 4) / 10; + TxLockHWM = (nTxLock * 7) / 10; + TxLockVHWM = (nTxLock * 8) / 10; + + size = sizeof(struct tblock) * nTxBlock; + TxBlock = vmalloc(size); + if (TxBlock == NULL) + return -ENOMEM; + + for (k = 1; k < nTxBlock - 1; k++) { + TxBlock[k].next = k + 1; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + } + TxBlock[k].next = 0; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + + TxAnchor.freetid = 1; + init_waitqueue_head(&TxAnchor.freewait); + + stattx.maxtid = 1; /* statistics */ + + /* + * initialize transaction lock (tlock) table + * + * transaction lock id = tlock index + * tlock id = 0 is reserved. + */ + size = sizeof(struct tlock) * nTxLock; + TxLock = vmalloc(size); + if (TxLock == NULL) { + vfree(TxBlock); + return -ENOMEM; + } + + /* initialize tlock table */ + for (k = 1; k < nTxLock - 1; k++) + TxLock[k].next = k + 1; + TxLock[k].next = 0; + init_waitqueue_head(&TxAnchor.freelockwait); + init_waitqueue_head(&TxAnchor.lowlockwait); + + TxAnchor.freelock = 1; + TxAnchor.tlocksInUse = 0; + INIT_LIST_HEAD(&TxAnchor.anon_list); + INIT_LIST_HEAD(&TxAnchor.anon_list2); + + LAZY_LOCK_INIT(); + INIT_LIST_HEAD(&TxAnchor.unlock_queue); + + stattx.maxlid = 1; /* statistics */ + + return 0; +} + +/* + * NAME: txExit() + * + * FUNCTION: clean up when module is unloaded + */ +void txExit(void) +{ + vfree(TxLock); + TxLock = NULL; + vfree(TxBlock); + TxBlock = NULL; +} + +/* + * NAME: txBegin() + * + * FUNCTION: start a transaction. 
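+ *	(a tid is simply an index into the TxBlock[] table; tid 0 is
+ *	reserved and doubles as the anonymous-transaction id)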
+ * + * PARAMETER: sb - superblock + * flag - force for nested tx; + * + * RETURN: tid - transaction id + * + * note: flag force allows to start tx for nested tx + * to prevent deadlock on logsync barrier; + */ +tid_t txBegin(struct super_block *sb, int flag) +{ + tid_t t; + struct tblock *tblk; + struct jfs_log *log; + + jfs_info("txBegin: flag = 0x%x", flag); + log = JFS_SBI(sb)->log; + + TXN_LOCK(); + + INCREMENT(TxStat.txBegin); + + retry: + if (!(flag & COMMIT_FORCE)) { + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag) || + test_bit(log_QUIESCE, &log->flag)) { + INCREMENT(TxStat.txBegin_barrier); + TXN_SLEEP(&log->syncwait); + goto retry; + } + } + if (flag == 0) { + /* + * Don't begin transaction if we're getting starved for tlocks + * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately + * free tlocks) + */ + if (TxAnchor.tlocksInUse > TxLockVHWM) { + INCREMENT(TxStat.txBegin_lockslow); + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + } + + /* + * allocate transaction id/block + */ + if ((t = TxAnchor.freetid) == 0) { + jfs_info("txBegin: waiting for free tid"); + INCREMENT(TxStat.txBegin_freetid); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + tblk = tid_to_tblock(t); + + if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) { + /* Don't let a non-forced transaction take the last tblk */ + jfs_info("txBegin: waiting for free tid"); + INCREMENT(TxStat.txBegin_freetid); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + TxAnchor.freetid = tblk->next; + + /* + * initialize transaction + */ + + /* + * We can't zero the whole thing or we screw up another thread being + * awakened after sleeping on tblk->waitor + * + * memset(tblk, 0, sizeof(struct tblock)); + */ + tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0; + + tblk->sb = sb; + ++log->logtid; + tblk->logtid = log->logtid; + + ++log->active; + + HIGHWATERMARK(stattx.maxtid, t); /* statistics */ + INCREMENT(stattx.ntid); /* statistics */ + + TXN_UNLOCK(); + + jfs_info("txBegin: returning tid = %d", t); + + return t; +} + +/* + * NAME: txBeginAnon() + * + * FUNCTION: start an anonymous transaction. + * Blocks if logsync or available tlocks are low to prevent + * anonymous tlocks from depleting supply. + * + * PARAMETER: sb - superblock + * + * RETURN: none + */ +void txBeginAnon(struct super_block *sb) +{ + struct jfs_log *log; + + log = JFS_SBI(sb)->log; + + TXN_LOCK(); + INCREMENT(TxStat.txBeginAnon); + + retry: + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag) || + test_bit(log_QUIESCE, &log->flag)) { + INCREMENT(TxStat.txBeginAnon_barrier); + TXN_SLEEP(&log->syncwait); + goto retry; + } + + /* + * Don't begin transaction if we're getting starved for tlocks + */ + if (TxAnchor.tlocksInUse > TxLockVHWM) { + INCREMENT(TxStat.txBeginAnon_lockslow); + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + TXN_UNLOCK(); +} + +/* + * txEnd() + * + * function: free specified transaction block. + * + * logsync barrier processing: + * + * serialization: + */ +void txEnd(tid_t tid) +{ + struct tblock *tblk = tid_to_tblock(tid); + struct jfs_log *log; + + jfs_info("txEnd: tid = %d", tid); + TXN_LOCK(); + + /* + * wakeup transactions waiting on the page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + log = JFS_SBI(tblk->sb)->log; + + /* + * Lazy commit thread can't free this guy until we mark it UNLOCKED, + * otherwise, we would be left with a transaction that may have been + * reused. 
+ * + * Lazy commit thread will turn off tblkGC_LAZY before calling this + * routine. + */ + if (tblk->flag & tblkGC_LAZY) { + jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk); + TXN_UNLOCK(); + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + tblk->flag |= tblkGC_UNLOCKED; + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + return; + } + + jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk); + + assert(tblk->next == 0); + + /* + * insert tblock back on freelist + */ + tblk->next = TxAnchor.freetid; + TxAnchor.freetid = tid; + + /* + * mark the tblock not active + */ + if (--log->active == 0) { + clear_bit(log_FLUSH, &log->flag); + + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag)) { + TXN_UNLOCK(); + + /* write dirty metadata & forward log syncpt */ + jfs_syncpt(log, 1); + + jfs_info("log barrier off: 0x%x", log->lsn); + + /* enable new transactions start */ + clear_bit(log_SYNCBARRIER, &log->flag); + + /* wakeup all waitors for logsync barrier */ + TXN_WAKEUP(&log->syncwait); + + goto wakeup; + } + } + + TXN_UNLOCK(); +wakeup: + /* + * wakeup all waitors for a free tblock + */ + TXN_WAKEUP(&TxAnchor.freewait); +} + +/* + * txLock() + * + * function: acquire a transaction lock on the specified + * + * parameter: + * + * return: transaction lock id + * + * serialization: + */ +struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, + int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int dir_xtree = 0; + lid_t lid; + tid_t xtid; + struct tlock *tlck; + struct xtlock *xtlck; + struct linelock *linelock; + xtpage_t *p; + struct tblock *tblk; + + TXN_LOCK(); + + if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) && + !(mp->xflag & COMMIT_PAGE)) { + /* + * Directory inode is special. It can have both an xtree tlock + * and a dtree tlock associated with it. + */ + dir_xtree = 1; + lid = jfs_ip->xtlid; + } else + lid = mp->lid; + + /* is page not locked by a transaction ? */ + if (lid == 0) + goto allocateLock; + + jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid); + + /* is page locked by the requester transaction ? */ + tlck = lid_to_tlock(lid); + if ((xtid = tlck->tid) == tid) { + TXN_UNLOCK(); + goto grantLock; + } + + /* + * is page locked by anonymous transaction/lock ? + * + * (page update without transaction (i.e., file write) is + * locked under anonymous transaction tid = 0: + * anonymous tlocks maintained on anonymous tlock list of + * the inode of the page and available to all anonymous + * transactions until txCommit() time at which point + * they are transferred to the transaction tlock list of + * the committing transaction of the inode) + */ + if (xtid == 0) { + tlck->tid = tid; + TXN_UNLOCK(); + tblk = tid_to_tblock(tid); + /* + * The order of the tlocks in the transaction is important + * (during truncate, child xtree pages must be freed before + * parent's tlocks change the working map). + * Take tlock off anonymous list and add to tail of + * transaction list + * + * Note: We really need to get rid of the tid & lid and + * use list_head's. This code is getting UGLY! + */ + if (jfs_ip->atlhead == lid) { + if (jfs_ip->atltail == lid) { + /* only anonymous txn. 
+ * Remove from anon_list + */ + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } + jfs_ip->atlhead = tlck->next; + } else { + lid_t last; + for (last = jfs_ip->atlhead; + lid_to_tlock(last)->next != lid; + last = lid_to_tlock(last)->next) { + assert(last); + } + lid_to_tlock(last)->next = tlck->next; + if (jfs_ip->atltail == lid) + jfs_ip->atltail = last; + } + + /* insert the tlock at tail of transaction tlock list */ + + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + + goto grantLock; + } + + goto waitLock; + + /* + * allocate a tlock + */ + allocateLock: + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + TXN_UNLOCK(); + + /* mark tlock for meta-data page */ + if (mp->xflag & COMMIT_PAGE) { + + tlck->flag = tlckPAGELOCK; + + /* mark the page dirty and nohomeok */ + metapage_nohomeok(mp); + + jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p", + mp, mp->nohomeok, tid, tlck); + + /* if anonymous transaction, and buffer is on the group + * commit synclist, mark inode to show this. This will + * prevent the buffer from being marked nohomeok for too + * long a time. + */ + if ((tid == 0) && mp->lsn) + set_cflag(COMMIT_Synclist, ip); + } + /* mark tlock for in-memory inode */ + else + tlck->flag = tlckINODELOCK; + + if (S_ISDIR(ip->i_mode)) + tlck->flag |= tlckDIRECTORY; + + tlck->type = 0; + + /* bind the tlock and the page */ + tlck->ip = ip; + tlck->mp = mp; + if (dir_xtree) + jfs_ip->xtlid = lid; + else + mp->lid = lid; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + TXN_LOCK(); + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + TXN_UNLOCK(); + } + } + + /* initialize type dependent area for linelock */ + linelock = (struct linelock *) & tlck->lock; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKSHORT; + linelock->index = 0; + + switch (type & tlckTYPE) { + case tlckDTREE: + linelock->l2linesize = L2DTSLOTSIZE; + break; + + case tlckXTREE: + linelock->l2linesize = L2XTSLOTSIZE; + + xtlck = (struct xtlock *) linelock; + xtlck->header.offset = 0; + xtlck->header.length = 2; + + if (type & tlckNEW) { + xtlck->lwm.offset = XTENTRYSTART; + } else { + if (mp->xflag & COMMIT_PAGE) + p = (xtpage_t *) mp->data; + else + p = &jfs_ip->i_xtroot; + xtlck->lwm.offset = + le16_to_cpu(p->header.nextindex); + } + xtlck->lwm.length = 0; /* ! 
*/ + xtlck->twm.offset = 0; + xtlck->hwm.offset = 0; + + xtlck->index = 2; + break; + + case tlckINODE: + linelock->l2linesize = L2INODESLOTSIZE; + break; + + case tlckDATA: + linelock->l2linesize = L2DATASLOTSIZE; + break; + + default: + jfs_err("UFO tlock:0x%p", tlck); + } + + /* + * update tlock vector + */ + grantLock: + tlck->type |= type; + + return tlck; + + /* + * page is being locked by another transaction: + */ + waitLock: + /* Only locks on ipimap or ipaimap should reach here */ + /* assert(jfs_ip->fileset == AGGREGATE_I); */ + if (jfs_ip->fileset != AGGREGATE_I) { + printk(KERN_ERR "txLock: trying to lock locked page!"); + print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4, + ip, sizeof(*ip), 0); + print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(*mp), 0); + print_hex_dump(KERN_ERR, "Locker's tblock: ", + DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid), + sizeof(struct tblock), 0); + print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4, + tlck, sizeof(*tlck), 0); + BUG(); + } + INCREMENT(stattx.waitlock); /* statistics */ + TXN_UNLOCK(); + release_metapage(mp); + TXN_LOCK(); + xtid = tlck->tid; /* reacquire after dropping TXN_LOCK */ + + jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d", + tid, xtid, lid); + + /* Recheck everything since dropping TXN_LOCK */ + if (xtid && (tlck->mp == mp) && (mp->lid == lid)) + TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor); + else + TXN_UNLOCK(); + jfs_info("txLock: awakened tid = %d, lid = %d", tid, lid); + + return NULL; +} + +/* + * NAME: txRelease() + * + * FUNCTION: Release buffers associated with transaction locks, but don't + * mark homeok yet. The allows other transactions to modify + * buffers, but won't let them go to disk until commit record + * actually gets written. + * + * PARAMETER: + * tblk - + * + * RETURN: Errors from subroutines. + */ +static void txRelease(struct tblock * tblk) +{ + struct metapage *mp; + lid_t lid; + struct tlock *tlck; + + TXN_LOCK(); + + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + mp->lid = 0; + } + } + + /* + * wakeup transactions waiting on a page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + TXN_UNLOCK(); +} + +/* + * NAME: txUnlock() + * + * FUNCTION: Initiates pageout of pages modified by tid in journalled + * objects and frees their lockwords. 
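+ *
+ * Note on the clsn handling below: a page touched by several committed
+ * transactions keeps the larger (more recent) commit lsn, as measured
+ * by logdiff() against the current log, so that per the log-sync rules
+ * the page is not written home before the log record of its latest
+ * committer has been forced.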
+ */ +static void txUnlock(struct tblock * tblk) +{ + struct tlock *tlck; + struct linelock *linelock; + lid_t lid, next, llid, k; + struct metapage *mp; + struct jfs_log *log; + int difft, diffp; + unsigned long flags; + + jfs_info("txUnlock: tblk = 0x%p", tblk); + log = JFS_SBI(tblk->sb)->log; + + /* + * mark page under tlock homeok (its log has been written): + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck); + + /* unbind page from tlock */ + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + /* hold buffer + */ + hold_metapage(mp); + + assert(mp->nohomeok > 0); + _metapage_homeok(mp); + + /* inherit younger/larger clsn */ + LOGSYNC_LOCK(log, flags); + if (mp->clsn) { + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log, flags); + + assert(!(tlck->flag & tlckFREEPAGE)); + + put_metapage(mp); + } + + /* insert tlock, and linelock(s) of the tlock if any, + * at head of freelist + */ + TXN_LOCK(); + + llid = ((struct linelock *) & tlck->lock)->next; + while (llid) { + linelock = (struct linelock *) lid_to_tlock(llid); + k = linelock->next; + txLockFree(llid); + llid = k; + } + txLockFree(lid); + + TXN_UNLOCK(); + } + tblk->next = tblk->last = 0; + + /* + * remove tblock from logsynclist + * (allocation map pages inherited lsn of tblk and + * has been inserted in logsync list at txUpdateMap()) + */ + if (tblk->lsn) { + LOGSYNC_LOCK(log, flags); + log->count--; + list_del(&tblk->synclist); + LOGSYNC_UNLOCK(log, flags); + } +} + +/* + * txMaplock() + * + * function: allocate a transaction lock for freed page/entry; + * for freed page, maplock is used as xtlock/dtlock type; + */ +struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + lid_t lid; + struct tblock *tblk; + struct tlock *tlck; + struct maplock *maplock; + + TXN_LOCK(); + + /* + * allocate a tlock + */ + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + /* bind the tlock and the object */ + tlck->flag = tlckINODELOCK; + if (S_ISDIR(ip->i_mode)) + tlck->flag |= tlckDIRECTORY; + tlck->ip = ip; + tlck->mp = NULL; + + tlck->type = type; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + } + } + + TXN_UNLOCK(); + + /* initialize type dependent area for maplock */ + maplock = (struct maplock *) & tlck->lock; + maplock->next = 0; + maplock->maxcnt = 0; + maplock->index = 0; + + return tlck; +} + +/* + * txLinelock() + * + * function: allocate a transaction lock for log vector list + */ +struct linelock *txLinelock(struct linelock * tlock) +{ + lid_t lid; + struct tlock *tlck; + struct linelock *linelock; + + TXN_LOCK(); + + /* allocate a TxLock structure */ + lid = txLockAlloc(); + tlck = 
lid_to_tlock(lid); + + TXN_UNLOCK(); + + /* initialize linelock */ + linelock = (struct linelock *) tlck; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKLONG; + linelock->index = 0; + if (tlck->flag & tlckDIRECTORY) + linelock->flag |= tlckDIRECTORY; + + /* append linelock after tlock */ + linelock->next = tlock->next; + tlock->next = lid; + + return linelock; +} + +/* + * transaction commit management + * ----------------------------- + */ + +/* + * NAME: txCommit() + * + * FUNCTION: commit the changes to the objects specified in + * clist. For journalled segments only the + * changes of the caller are committed, ie by tid. + * for non-journalled segments the data are flushed to + * disk and then the change to the disk inode and indirect + * blocks committed (so blocks newly allocated to the + * segment will be made a part of the segment atomically). + * + * all of the segments specified in clist must be in + * one file system. no more than 6 segments are needed + * to handle all unix svcs. + * + * if the i_nlink field (i.e. disk inode link count) + * is zero, and the type of inode is a regular file or + * directory, or symbolic link , the inode is truncated + * to zero length. the truncation is committed but the + * VM resources are unaffected until it is closed (see + * iput and iclose). + * + * PARAMETER: + * + * RETURN: + * + * serialization: + * on entry the inode lock on each segment is assumed + * to be held. + * + * i/o error: + */ +int txCommit(tid_t tid, /* transaction identifier */ + int nip, /* number of inodes to commit */ + struct inode **iplist, /* list of inode to commit */ + int flag) +{ + int rc = 0; + struct commit cd; + struct jfs_log *log; + struct tblock *tblk; + struct lrd *lrd; + int lsn; + struct inode *ip; + struct jfs_inode_info *jfs_ip; + int k, n; + ino_t top; + struct super_block *sb; + + jfs_info("txCommit, tid = %d, flag = %d", tid, flag); + /* is read-only file system ? */ + if (isReadOnly(iplist[0])) { + rc = -EROFS; + goto TheEnd; + } + + sb = cd.sb = iplist[0]->i_sb; + cd.tid = tid; + + if (tid == 0) + tid = txBegin(sb, 0); + tblk = tid_to_tblock(tid); + + /* + * initialize commit structure + */ + log = JFS_SBI(sb)->log; + cd.log = log; + + /* initialize log record descriptor in commit */ + lrd = &cd.lrd; + lrd->logtid = cpu_to_le32(tblk->logtid); + lrd->backchain = 0; + + tblk->xflag |= flag; + + if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) + tblk->xflag |= COMMIT_LAZY; + /* + * prepare non-journaled objects for commit + * + * flush data pages of non-journaled file + * to prevent the file getting non-initialized disk blocks + * in case of crash. + * (new blocks - ) + */ + cd.iplist = iplist; + cd.nip = nip; + + /* + * acquire transaction lock on (on-disk) inodes + * + * update on-disk inode from in-memory inode + * acquiring transaction locks for AFTER records + * on the on-disk inode of file object + * + * sort the inodes array by inode number in descending order + * to prevent deadlock when acquiring transaction lock + * of on-disk inodes on multiple on-disk inode pages by + * multiple concurrent transactions + */ + for (k = 0; k < cd.nip; k++) { + top = (cd.iplist[k])->i_ino; + for (n = k + 1; n < cd.nip; n++) { + ip = cd.iplist[n]; + if (ip->i_ino > top) { + top = ip->i_ino; + cd.iplist[n] = cd.iplist[k]; + cd.iplist[k] = ip; + } + } + + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * BUGBUG - This code has temporarily been removed. 
The + * intent is to ensure that any file data is written before + * the metadata is committed to the journal. This prevents + * uninitialized data from appearing in a file after the + * journal has been replayed. (The uninitialized data + * could be sensitive data removed by another user.) + * + * The problem now is that we are holding the IWRITELOCK + * on the inode, and calling filemap_fdatawrite on an + * unmapped page will cause a deadlock in jfs_get_block. + * + * The long term solution is to pare down the use of + * IWRITELOCK. We are currently holding it too long. + * We could also be smarter about which data pages need + * to be written before the transaction is committed and + * when we don't need to worry about it at all. + * + * if ((!S_ISDIR(ip->i_mode)) + * && (tblk->flag & COMMIT_DELETE) == 0) + * filemap_write_and_wait(ip->i_mapping); + */ + + /* + * Mark inode as not dirty. It will still be on the dirty + * inode list, but we'll know not to commit it again unless + * it gets marked dirty again + */ + clear_cflag(COMMIT_Dirty, ip); + + /* inherit anonymous tlock(s) of inode */ + if (jfs_ip->atlhead) { + lid_to_tlock(jfs_ip->atltail)->next = tblk->next; + tblk->next = jfs_ip->atlhead; + if (!tblk->last) + tblk->last = jfs_ip->atltail; + jfs_ip->atlhead = jfs_ip->atltail = 0; + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } + + /* + * acquire transaction lock on on-disk inode page + * (become first tlock of the tblk's tlock list) + */ + if (((rc = diWrite(tid, ip)))) + goto out; + } + + /* + * write log records from transaction locks + * + * txUpdateMap() resets XAD_NEW in XAD. + */ + if ((rc = txLog(log, tblk, &cd))) + goto TheEnd; + + /* + * Ensure that inode isn't reused before + * lazy commit thread finishes processing + */ + if (tblk->xflag & COMMIT_DELETE) { + ihold(tblk->u.ip); + /* + * Avoid a rare deadlock + * + * If the inode is locked, we may be blocked in + * jfs_commit_inode. If so, we don't want the + * lazy_commit thread doing the last iput() on the inode + * since that may block on the locked inode. Instead, + * commit the transaction synchronously, so the last iput + * will be done by the calling thread (or later) + */ + /* + * I believe this code is no longer needed. Splitting I_LOCK + * into two bits, I_NEW and I_SYNC should prevent this + * deadlock as well. But since I don't have a JFS testload + * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. + * Joern + */ + if (tblk->u.ip->i_state & I_SYNC) + tblk->xflag &= ~COMMIT_LAZY; + } + + ASSERT((!(tblk->xflag & COMMIT_DELETE)) || + ((tblk->u.ip->i_nlink == 0) && + !test_cflag(COMMIT_Nolink, tblk->u.ip))); + + /* + * write COMMIT log record + */ + lrd->type = cpu_to_le16(LOG_COMMIT); + lrd->length = 0; + lsn = lmLog(log, tblk, lrd, NULL); + + lmGroupCommit(log, tblk); + + /* + * - transaction is now committed - + */ + + /* + * force pages in careful update + * (imap addressing structure update) + */ + if (flag & COMMIT_FORCE) + txForce(tblk); + + /* + * update allocation map. + * + * update inode allocation map and inode: + * free pager lock on memory object of inode if any. + * update block allocation map. + * + * txUpdateMap() resets XAD_NEW in XAD. 
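+ *
+ * (for a lazily committed transaction, i.e. COMMIT_LAZY set and
+ * COMMIT_FORCE clear, the map update and the final txUnlock() are not
+ * done here but are left to the lazy/group commit machinery)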
+ */ + if (tblk->xflag & COMMIT_FORCE) + txUpdateMap(tblk); + + /* + * free transaction locks and pageout/free pages + */ + txRelease(tblk); + + if ((tblk->flag & tblkGC_LAZY) == 0) + txUnlock(tblk); + + + /* + * reset in-memory object state + */ + for (k = 0; k < cd.nip; k++) { + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * reset in-memory inode state + */ + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + } + + out: + if (rc != 0) + txAbort(tid, 1); + + TheEnd: + jfs_info("txCommit: tid = %d, returning %d", tid, rc); + return rc; +} + +/* + * NAME: txLog() + * + * FUNCTION: Writes AFTER log records for all lines modified + * by tid for segments specified by inodes in comdata. + * Code assumes only WRITELOCKS are recorded in lockwords. + * + * PARAMETERS: + * + * RETURN : + */ +static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) +{ + int rc = 0; + struct inode *ip; + lid_t lid; + struct tlock *tlck; + struct lrd *lrd = &cd->lrd; + + /* + * write log record(s) for each tlock of transaction, + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + tlck->flag |= tlckLOG; + + /* initialize lrd common */ + ip = tlck->ip; + lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate); + lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset); + lrd->log.redopage.inode = cpu_to_le32(ip->i_ino); + + /* write log record of page from the tlock */ + switch (tlck->type & tlckTYPE) { + case tlckXTREE: + xtLog(log, tblk, lrd, tlck); + break; + + case tlckDTREE: + dtLog(log, tblk, lrd, tlck); + break; + + case tlckINODE: + diLog(log, tblk, lrd, tlck, cd); + break; + + case tlckMAP: + mapLog(log, tblk, lrd, tlck); + break; + + case tlckDATA: + dataLog(log, tblk, lrd, tlck); + break; + + default: + jfs_err("UFO tlock:0x%p", tlck); + } + } + + return rc; +} + +/* + * diLog() + * + * function: log inode tlock and format maplock to update bmap; + */ +static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck, struct commit * cd) +{ + int rc = 0; + struct metapage *mp; + pxd_t *pxd; + struct pxd_lock *pxdlock; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_INODE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* + * inode after image + */ + if (tlck->type & tlckENTRY) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else if (tlck->type & tlckFREE) { + /* + * free inode extent + * + * (pages of the freed inode extent have been invalidated and + * a maplock for free of the extent has been formatted at + * txLock() time); + * + * the tlock had been acquired on the inode allocation map page + * (iag) that specifies the freed extent, even though the map + * page is not itself logged, to prevent pageout of the map + * page before the log; + */ + + /* log LOG_NOREDOINOEXT of the freed inode extent for + * logredo() to start NoRedoPage filters, and to update + * imap and bmap for free of the extent; + */ + lrd->type = cpu_to_le16(LOG_NOREDOINOEXT); + /* + * For the LOG_NOREDOINOEXT record, we need + * to pass the IAG number and inode extent + * index (within that IAG) from which the + * the extent being released. 
These have been + * passed to us in the iplist[1] and iplist[2]. + */ + lrd->log.noredoinoext.iagnum = + cpu_to_le32((u32) (size_t) cd->iplist[1]); + lrd->log.noredoinoext.inoext_idx = + cpu_to_le32((u32) (size_t) cd->iplist[2]); + + pxdlock = (struct pxd_lock *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else + jfs_err("diLog: UFO type tlck:0x%p", tlck); +#ifdef _JFS_WIP + /* + * alloc/free external EA extent + * + * a maplock for txUpdateMap() to update bPWMAP for alloc/free + * of the extent has been formatted at txLock() time; + */ + else { + assert(tlck->type & tlckEA); + + /* log LOG_UPDATEMAP for logredo() to update bmap for + * alloc of new (and free of old) external EA extent; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (struct pxd_lock *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +#endif /* _JFS_WIP */ + + return rc; +} + +/* + * dataLog() + * + * function: log data tlock + */ +static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct metapage *mp; + pxd_t *pxd; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DATA); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + + if (jfs_dirtable_inline(tlck->ip)) { + /* + * The table has been truncated, we've must have deleted + * the last entry, so don't bother logging this + */ + mp->lid = 0; + grab_metapage(mp); + metapage_homeok(mp); + discard_metapage(mp); + tlck->mp = NULL; + return 0; + } + + PXDaddress(pxd, mp->index); + PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); + + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return 0; +} + +/* + * dtLog() + * + * function: log dtree tlock and format maplock to update bmap; + */ +static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct metapage *mp; + struct pxd_lock *pxdlock; + pxd_t *pxd; + + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + + /* + * page extension via relocation: entry insertion; + * page extension in-place: entry insertion; + * new right page from page split, reinitialized in-line + * root from root page split: entry insertion; + */ + if (tlck->type & (tlckNEW | tlckEXTEND)) { + /* log after-image of the new page for logredo(): + * mark log (LOG_NEW) for logredo() to initialize + * freelist and update bmap for alloc of the new page; + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + if (tlck->type & tlckEXTEND) + 
lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND); + else + lrd->log.redopage.type |= cpu_to_le16(LOG_NEW); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP for + * alloc of the new page; + */ + if (tlck->type & tlckBTROOT) + return; + tlck->flag |= tlckUPDATEMAP; + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckALLOCPXD; + pxdlock->pxd = *pxd; + + pxdlock->index = 1; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * entry insertion/deletion, + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckENTRY | tlckRELINK)) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * page deletion: page has been invalidated + * page relocation: source extent + * + * a maplock for free of the page has been formatted + * at txLock() time); + */ + if (tlck->type & (tlckFREE | tlckRELOCATE)) { + /* log LOG_NOREDOPAGE of the deleted page for logredo() + * to start NoRedoPage filter and to update bmap for free + * of the deletd page + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (struct pxd_lock *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + } + return; +} + +/* + * xtLog() + * + * function: log xtree tlock and format maplock to update bmap; + */ +static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct inode *ip; + struct metapage *mp; + xtpage_t *p; + struct xtlock *xtlck; + struct maplock *maplock; + struct xdlistlock *xadlock; + struct pxd_lock *pxdlock; + pxd_t *page_pxd; + int next, lwm, hwm; + + ip = tlck->ip; + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_XTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE); + + page_pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) { + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + p = &JFS_IP(ip)->i_xtroot; + if (S_ISDIR(ip->i_mode)) + lrd->log.redopage.type |= + cpu_to_le16(LOG_DIR_XTREE); + } else + p = (xtpage_t *) mp->data; + next = le16_to_cpu(p->header.nextindex); + + xtlck = (struct xtlock *) & tlck->lock; + + maplock = (struct maplock *) & tlck->lock; + xadlock = (struct xdlistlock *) maplock; + + /* + * entry insertion/extension; + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { + /* log after-image for logredo(): + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. 
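+	 *
+	 * (XAD[lwm:next) is the half-open range of slots recorded in the
+	 * linelock: lwm is the lowest slot modified by this transaction
+	 * and next is the page's nextindex after the update)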
+ */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. + */ + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + + if (lwm == next) + goto out; + if (lwm > next) { + jfs_err("xtLog: lwm > next\n"); + goto out; + } + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { + int i; + pxd_t *pxd; + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + * + * We can fit twice as may pxd's as xads in the lock + */ + xadlock->flag = mlckALLOCPXDLIST; + pxd = xadlock->xdlist = &xtlck->pxdlock; + for (i = 0; i < xadlock->count; i++) { + PXDaddress(pxd, addressXAD(&p->xad[lwm + i])); + PXDlength(pxd, lengthXAD(&p->xad[lwm + i])); + p->xad[lwm + i].flag &= + ~(XAD_NEW | XAD_EXTENDED); + pxd++; + } + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily. + */ + xadlock->flag = mlckALLOCXADLIST; + xadlock->xdlist = &p->xad[lwm]; + tblk->xflag &= ~COMMIT_LAZY; + } + jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d " + "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count); + + maplock->index = 1; + + out: + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return; + } + + /* + * page deletion: file deletion/truncation (ref. xtTruncate()) + * + * (page will be invalidated after log is written and bmap + * is updated from the page); + */ + if (tlck->type & tlckFREE) { + /* LOG_NOREDOPAGE log for NoRedoPage filter: + * if page free from file delete, NoRedoFile filter from + * inode image of zero link count will subsume NoRedoPage + * filters for each page; + * if page free from file truncattion, write NoRedoPage + * filter; + * + * upadte of block allocation map for the page itself: + * if page free from deletion and truncation, LOG_UPDATEMAP + * log for the page itself is generated from processing + * its parent page xad entries; + */ + /* if page free from file truncation, log LOG_NOREDOPAGE + * of the deleted page for logredo() to start NoRedoPage + * filter for the page; + */ + if (tblk->xflag & COMMIT_TRUNCATE) { + /* write NOREDOPAGE for the page */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb-> + s_blocksize_bits); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + if (tlck->type & tlckBTROOT) { + /* Empty xtree must be logged */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + } + + /* init LOG_UPDATEMAP of the freed extents + * XAD[XTENTRYSTART:hwm) from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); + xtlck = (struct xtlock *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - XTENTRYSTART + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = XTENTRYSTART; + xtlck->header.length = hwm - XTENTRYSTART + 1; + xtlck->index = 1; + lrd->backchain = 
cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[XTENTRYSTART:hwm) from the + * deleted page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->count = hwm - XTENTRYSTART + 1; + if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { + int i; + pxd_t *pxd; + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + * + * We can fit twice as may pxd's as xads in the lock + */ + xadlock->flag = mlckFREEPXDLIST; + pxd = xadlock->xdlist = &xtlck->pxdlock; + for (i = 0; i < xadlock->count; i++) { + PXDaddress(pxd, + addressXAD(&p->xad[XTENTRYSTART + i])); + PXDlength(pxd, + lengthXAD(&p->xad[XTENTRYSTART + i])); + pxd++; + } + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily. + */ + xadlock->flag = mlckFREEXADLIST; + xadlock->xdlist = &p->xad[XTENTRYSTART]; + tblk->xflag &= ~COMMIT_LAZY; + } + jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2", + tlck->ip, mp, xadlock->count); + + maplock->index = 1; + + /* mark page as invalid */ + if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode)) + && !(tlck->type & tlckBTROOT)) + tlck->flag |= tlckFREEPAGE; + /* + else (tblk->xflag & COMMIT_PMAP) + ? release the page; + */ + return; + } + + /* + * page/entry truncation: file truncation (ref. xtTruncate()) + * + * |----------+------+------+---------------| + * | | | + * | | hwm - hwm before truncation + * | next - truncation point + * lwm - lwm before truncation + * header ? + */ + if (tlck->type & tlckTRUNCATE) { + /* This odd declaration suppresses a bogus gcc warning */ + pxd_t pxd = pxd; /* truncated extent of xad */ + int twm; + + /* + * For truncation the entire linelock may be used, so it would + * be difficult to store xad list in linelock itself. + * Therefore, we'll just force transaction to be committed + * synchronously, so that xtree pages won't be changed before + * txUpdateMap runs. + */ + tblk->xflag &= ~COMMIT_LAZY; + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + hwm = xtlck->hwm.offset; + twm = xtlck->twm.offset; + + /* + * write log records + */ + /* log after-image for logredo(): + * + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. 
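+		 *
+		 * (in the truncation diagram above: XAD[lwm:next) covers
+		 * entries that were (re)allocated or extended,
+		 * XAD[twm == next - 1] is the entry whose tail extent was
+		 * cut back, and XAD[next:hwm] are the entries freed outright)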
+ */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* + * truncate entry XAD[twm == next - 1]: + */ + if (twm == next - 1) { + /* init LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated delta extent of the truncated + * entry XAD[next - 1]: + * (xtlck->pxdlock = truncated delta extent); + */ + pxdlock = (struct pxd_lock *) & xtlck->pxdlock; + /* assert(pxdlock->type & tlckTRUNCATE); */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + pxd = pxdlock->pxd; /* save to format maplock */ + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* init LOG_UPDATEMAP of the freed extents + * XAD[next:hwm] from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEXADLIST); + xtlck = (struct xtlock *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - next + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = next; + xtlck->header.length = hwm - next + 1; + xtlck->index = 1; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + + /* + * format maplock(s) for txUpdateMap() to update bmap + */ + maplock->index = 0; + + /* + * allocate entries XAD[lwm:next): + */ + if (lwm < next) { + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. 
+ */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + xadlock->xdlist = &p->xad[lwm]; + + jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d " + "lwm:%d next:%d", + tlck->ip, mp, xadlock->count, lwm, next); + maplock->index++; + xadlock++; + } + + /* + * truncate entry XAD[twm == next - 1]: + */ + if (twm == next - 1) { + /* format a maplock for txUpdateMap() to update bmap + * to free truncated delta extent of the truncated + * entry XAD[next - 1]; + * (xtlck->pxdlock = truncated delta extent); + */ + tlck->flag |= tlckUPDATEMAP; + pxdlock = (struct pxd_lock *) xadlock; + pxdlock->flag = mlckFREEPXD; + pxdlock->count = 1; + pxdlock->pxd = pxd; + + jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d " + "hwm:%d", ip, mp, pxdlock->count, hwm); + maplock->index++; + xadlock++; + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[next:hwm] from the deleted + * page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckFREEXADLIST; + xadlock->count = hwm - next + 1; + xadlock->xdlist = &p->xad[next]; + + jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d " + "next:%d hwm:%d", + tlck->ip, mp, xadlock->count, next, hwm); + maplock->index++; + } + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } + return; +} + +/* + * mapLog() + * + * function: log from maplock of freed data extents; + */ +static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct pxd_lock *pxdlock; + int i, nlock; + pxd_t *pxd; + + /* + * page relocation: free the source page extent + * + * a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time saving the src + * relocated page address; + */ + if (tlck->type & tlckRELOCATE) { + /* log LOG_NOREDOPAGE of the old relocated page + * for logredo() to start NoRedoPage filter; + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxd = &lrd->log.redopage.pxd; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* (N.B.
currently, logredo() does NOT update bmap + * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE); + * if page free from relocation, LOG_UPDATEMAP log is + * specifically generated now for logredo() + * to update bmap for free of src relocated page; + * (new flag LOG_RELOCATE may be introduced which will + * inform logredo() to start NORedoPage filter and also + * update block allocation map at the same time, thus + * avoiding an extra log write); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + return; + } + /* + + * Otherwise it's not a relocate request + * + */ + else { + /* log LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated/relocated delta extent of the data; + * e.g.: external EA extent, relocated/truncated extent + * from xtTailgate(); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (struct pxd_lock *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + jfs_info("mapLog: xaddr:0x%lx xlen:0x%x", + (ulong) addressPXD(&pxdlock->pxd), + lengthPXD(&pxdlock->pxd)); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +} + +/* + * txEA() + * + * function: acquire maplock for EA/ACL extents or + * set COMMIT_INLINE flag; + */ +void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) +{ + struct tlock *tlck = NULL; + struct pxd_lock *maplock = NULL, *pxdlock = NULL; + + /* + * format maplock for alloc of new EA extent + */ + if (newea) { + /* Since the newea could be a completely zeroed entry we need to + * check for the two flags which indicate we should actually + * commit new EA data + */ + if (newea->flag & DXD_EXTENT) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (struct pxd_lock *) & tlck->lock; + pxdlock = (struct pxd_lock *) maplock; + pxdlock->flag = mlckALLOCPXD; + PXDaddress(&pxdlock->pxd, addressDXD(newea)); + PXDlength(&pxdlock->pxd, lengthDXD(newea)); + pxdlock++; + maplock->index = 1; + } else if (newea->flag & DXD_INLINE) { + tlck = NULL; + + set_cflag(COMMIT_Inlineea, ip); + } + } + + /* + * format maplock for free of old EA extent + */ + if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) { + if (tlck == NULL) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (struct pxd_lock *) & tlck->lock; + pxdlock = (struct pxd_lock *) maplock; + maplock->index = 0; + } + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressDXD(oldea)); + PXDlength(&pxdlock->pxd, lengthDXD(oldea)); + maplock->index++; + } +} + +/* + * txForce() + * + * function: synchronously write pages locked by transaction + * after txLog() but before txUpdateMap(); + */ +static void txForce(struct tblock * tblk) +{ + struct tlock *tlck; + lid_t lid, next; + struct metapage *mp; + + /* + * reverse the order of transaction tlocks in + * careful update order of address index pages + * (right to left, bottom up) + */ + tlck = lid_to_tlock(tblk->next); + lid = tlck->next; + 
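	/*
	 * (Illustrative note: the walk below is an in-place reversal of
	 *  the singly linked lid list by head insertion.  With purely
	 *  hypothetical ids, tblk->next -> A -> B -> C becomes
	 *  tblk->next -> C -> B -> A, so the pages are then forced out
	 *  in the reverse of the order in which they were locked.)
	 */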
tlck->next = 0; + while (lid) { + tlck = lid_to_tlock(lid); + next = tlck->next; + tlck->next = tblk->next; + tblk->next = lid; + lid = next; + } + + /* + * synchronously write the page, and + * hold the page for txUpdateMap(); + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + if (tlck->flag & tlckWRITEPAGE) { + tlck->flag &= ~tlckWRITEPAGE; + + /* do not release page to freelist */ + force_metapage(mp); +#if 0 + /* + * The "right" thing to do here is to + * synchronously write the metadata. + * With the current implementation this + * is hard since write_metapage requires + * us to kunmap & remap the page. If we + * have tlocks pointing into the metadata + * pages, we don't want to do this. I think + * we can get by with synchronously writing + * the pages when they are released. + */ + assert(mp->nohomeok); + set_bit(META_dirty, &mp->flag); + set_bit(META_sync, &mp->flag); +#endif + } + } + } +} + +/* + * txUpdateMap() + * + * function: update persistent allocation map (and working map + * if appropriate); + * + * parameter: + */ +static void txUpdateMap(struct tblock * tblk) +{ + struct inode *ip; + struct inode *ipimap; + lid_t lid; + struct tlock *tlck; + struct maplock *maplock; + struct pxd_lock pxdlock; + int maptype; + int k, nlock; + struct metapage *mp = NULL; + + ipimap = JFS_SBI(tblk->sb)->ipimap; + + maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP; + + + /* + * update block allocation map + * + * update allocation state in pmap (and wmap) and + * update lsn of the pmap page; + */ + /* + * scan each tlock/page of transaction for block allocation/free: + * + * for each tlock/page of transaction, update map. + * ? are there tlock for pmap and pwmap at the same time ? + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + if ((tlck->flag & tlckUPDATEMAP) == 0) + continue; + + if (tlck->flag & tlckFREEPAGE) { + /* + * Another thread may attempt to reuse freed space + * immediately, so we want to get rid of the metapage + * before anyone else has a chance to get it. + * Lock metapage, update maps, then invalidate + * the metapage. + */ + mp = tlck->mp; + ASSERT(mp->xflag & COMMIT_PAGE); + grab_metapage(mp); + } + + /* + * extent list: + * . in-line PXD list: + * . out-of-line XAD list: + */ + maplock = (struct maplock *) & tlck->lock; + nlock = maplock->index; + + for (k = 0; k < nlock; k++, maplock++) { + /* + * allocate blocks in persistent map: + * + * blocks have been allocated from wmap at alloc time; + */ + if (maplock->flag & mlckALLOC) { + txAllocPMap(ipimap, maplock, tblk); + } + /* + * free blocks in persistent and working map: + * blocks will be freed in pmap and then in wmap; + * + * ? 
tblock specifies the PMAP/PWMAP based upon + * transaction + * + * free blocks in persistent map: + * blocks will be freed from wmap at last reference + * release of the object for regular files; + * + * Alway free blocks from both persistent & working + * maps for directories + */ + else { /* (maplock->flag & mlckFREE) */ + + if (tlck->flag & tlckDIRECTORY) + txFreeMap(ipimap, maplock, + tblk, COMMIT_PWMAP); + else + txFreeMap(ipimap, maplock, + tblk, maptype); + } + } + if (tlck->flag & tlckFREEPAGE) { + if (!(tblk->flag & tblkGC_LAZY)) { + /* This is equivalent to txRelease */ + ASSERT(mp->lid == lid); + tlck->mp->lid = 0; + } + assert(mp->nohomeok == 1); + metapage_homeok(mp); + discard_metapage(mp); + tlck->mp = NULL; + } + } + /* + * update inode allocation map + * + * update allocation state in pmap and + * update lsn of the pmap page; + * update in-memory inode flag/state + * + * unlock mapper/write lock + */ + if (tblk->xflag & COMMIT_CREATE) { + diUpdatePMap(ipimap, tblk->ino, false, tblk); + /* update persistent block allocation map + * for the allocation of inode extent; + */ + pxdlock.flag = mlckALLOCPXD; + pxdlock.pxd = tblk->u.ixpxd; + pxdlock.index = 1; + txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk); + } else if (tblk->xflag & COMMIT_DELETE) { + ip = tblk->u.ip; + diUpdatePMap(ipimap, ip->i_ino, true, tblk); + iput(ip); + } +} + +/* + * txAllocPMap() + * + * function: allocate from persistent map; + * + * parameter: + * ipbmap - + * malock - + * xad list: + * pxd: + * + * maptype - + * allocate from persistent map; + * free from persistent map; + * (e.g., tmp file - free from working map at releae + * of last reference); + * free from persistent and working map; + * + * lsn - log sequence number; + */ +static void txAllocPMap(struct inode *ip, struct maplock * maplock, + struct tblock * tblk) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct xdlistlock *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + struct pxd_lock *pxdlock; + struct xdlistlock *pxdlistlock; + pxd_t *pxd; + int n; + + /* + * allocate from persistent map; + */ + if (maplock->flag & mlckALLOCXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, false, xaddr, + (s64) xlen, tblk); + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } else if (maplock->flag & mlckALLOCPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, + tblk); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } +} + +/* + * txFreeMap() + * + * function: free from persistent and/or working map; + * + * todo: optimization + */ +void txFreeMap(struct inode *ip, + struct maplock * maplock, struct tblock * tblk, int maptype) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct xdlistlock *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + struct 
pxd_lock *pxdlock; + struct xdlistlock *pxdlistlock; + pxd_t *pxd; + int n; + + jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x", + tblk, maplock, maptype); + + /* + * free from persistent map; + */ + if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (!(xad->flag & XAD_NEW)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, true, xaddr, + (s64) xlen, tblk); + jfs_info("freePMap: xaddr:0x%lx " + "xlen:%d", + (ulong) xaddr, xlen); + } + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, + tblk); + jfs_info("freePMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, true, xaddr, + (s64) xlen, tblk); + jfs_info("freePMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } + + /* + * free from working map; + */ + if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbFree(ip, xaddr, (s64) xlen); + xad->flag = 0; + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbFree(ip, xaddr, (s64) xlen); + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckFREEPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbFree(ip, xaddr, (s64) xlen); + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } +} + +/* + * txFreelock() + * + * function: remove tlock from inode anonymous locklist + */ +void txFreelock(struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct tlock *xtlck, *tlck; + lid_t xlid = 0, lid; + + if (!jfs_ip->atlhead) + return; + + TXN_LOCK(); + xtlck = (struct tlock *) &jfs_ip->atlhead; + + while ((lid = xtlck->next) != 0) { + tlck = lid_to_tlock(lid); + if (tlck->flag & tlckFREELOCK) { + xtlck->next = tlck->next; + txLockFree(lid); + } else { + xtlck = tlck; + xlid = lid; + } + } + + if (jfs_ip->atlhead) + jfs_ip->atltail = xlid; + else { + jfs_ip->atltail = 0; + /* + * If inode was on anon_list, remove it + */ + list_del_init(&jfs_ip->anon_inode_list); + } + TXN_UNLOCK(); +} + +/* + * txAbort() + * + * function: abort tx before commit; + * + * frees line-locks and segment locks for all + * segments in comdata structure. + * Optionally sets state of file-system to FM_DIRTY in super-block. + * log age of page-frames in memory for which caller has + * are reset to 0 (to avoid logwarap). 
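 *
 * (Roughly: for any metapage that still carries a log sequence
 * number from an earlier commit, the abort path below calls
 * LogSyncRelease() so the page no longer holds up the log sync
 * point; otherwise an aborted transaction could pin the log tail
 * and eventually cause log wrap.)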
+ */ +void txAbort(tid_t tid, int dirty) +{ + lid_t lid, next; + struct metapage *mp; + struct tblock *tblk = tid_to_tblock(tid); + struct tlock *tlck; + + /* + * free tlocks of the transaction + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + mp = tlck->mp; + JFS_IP(tlck->ip)->xtlid = 0; + + if (mp) { + mp->lid = 0; + + /* + * reset lsn of page to avoid logwarap: + * + * (page may have been previously committed by another + * transaction(s) but has not been paged, i.e., + * it may be on logsync list even though it has not + * been logged for the current tx.) + */ + if (mp->xflag & COMMIT_PAGE && mp->lsn) + LogSyncRelease(mp); + } + /* insert tlock at head of freelist */ + TXN_LOCK(); + txLockFree(lid); + TXN_UNLOCK(); + } + + /* caller will free the transaction block */ + + tblk->next = tblk->last = 0; + + /* + * mark filesystem dirty + */ + if (dirty) + jfs_error(tblk->sb, "txAbort"); + + return; +} + +/* + * txLazyCommit(void) + * + * All transactions except those changing ipimap (COMMIT_FORCE) are + * processed by this routine. This insures that the inode and block + * allocation maps are updated in order. For synchronous transactions, + * let the user thread finish processing after txUpdateMap() is called. + */ +static void txLazyCommit(struct tblock * tblk) +{ + struct jfs_log *log; + + while (((tblk->flag & tblkGC_READY) == 0) && + ((tblk->flag & tblkGC_UNLOCKED) == 0)) { + /* We must have gotten ahead of the user thread + */ + jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk); + yield(); + } + + jfs_info("txLazyCommit: processing tblk 0x%p", tblk); + + txUpdateMap(tblk); + + log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + + tblk->flag |= tblkGC_COMMITTED; + + if (tblk->flag & tblkGC_READY) + log->gcrtc--; + + wake_up_all(&tblk->gcwait); // LOGGC_WAKEUP + + /* + * Can't release log->gclock until we've tested tblk->flag + */ + if (tblk->flag & tblkGC_LAZY) { + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + txUnlock(tblk); + tblk->flag &= ~tblkGC_LAZY; + txEnd(tblk - TxBlock); /* Convert back to tid */ + } else + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + + jfs_info("txLazyCommit: done: tblk = 0x%p", tblk); +} + +/* + * jfs_lazycommit(void) + * + * To be run as a kernel daemon. If lbmIODone is called in an interrupt + * context, or where blocking is not wanted, this routine will process + * committed transactions from the unlock queue. + */ +int jfs_lazycommit(void *arg) +{ + int WorkDone; + struct tblock *tblk; + unsigned long flags; + struct jfs_sb_info *sbi; + + do { + LAZY_LOCK(flags); + jfs_commit_thread_waking = 0; /* OK to wake another thread */ + while (!list_empty(&TxAnchor.unlock_queue)) { + WorkDone = 0; + list_for_each_entry(tblk, &TxAnchor.unlock_queue, + cqueue) { + + sbi = JFS_SBI(tblk->sb); + /* + * For each volume, the transactions must be + * handled in order. If another commit thread + * is handling a tblk for this superblock, + * skip it + */ + if (sbi->commit_state & IN_LAZYCOMMIT) + continue; + + sbi->commit_state |= IN_LAZYCOMMIT; + WorkDone = 1; + + /* + * Remove transaction from queue + */ + list_del(&tblk->cqueue); + + LAZY_UNLOCK(flags); + txLazyCommit(tblk); + LAZY_LOCK(flags); + + sbi->commit_state &= ~IN_LAZYCOMMIT; + /* + * Don't continue in the for loop. (We can't + * anyway, it's unsafe!) We want to go back to + * the beginning of the list. 
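 *
 * (Why it is unsafe: list_del() above removed this tblk from
 * unlock_queue, and the LAZY lock was dropped around
 * txLazyCommit(), so other threads may have reshaped the queue;
 * the list_for_each_entry() cursor can no longer be trusted, and
 * the scan must restart from the head of the list.)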
+ */ + break; + } + + /* If there was nothing to do, don't continue */ + if (!WorkDone) + break; + } + /* In case a wakeup came while all threads were active */ + jfs_commit_thread_waking = 0; + + if (freezing(current)) { + LAZY_UNLOCK(flags); + refrigerator(); + } else { + DECLARE_WAITQUEUE(wq, current); + + add_wait_queue(&jfs_commit_thread_wait, &wq); + set_current_state(TASK_INTERRUPTIBLE); + LAZY_UNLOCK(flags); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&jfs_commit_thread_wait, &wq); + } + } while (!kthread_should_stop()); + + if (!list_empty(&TxAnchor.unlock_queue)) + jfs_err("jfs_lazycommit being killed w/pending transactions!"); + else + jfs_info("jfs_lazycommit being killed\n"); + return 0; +} + +void txLazyUnlock(struct tblock * tblk) +{ + unsigned long flags; + + LAZY_LOCK(flags); + + list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue); + /* + * Don't wake up a commit thread if there is already one servicing + * this superblock, or if the last one we woke up hasn't started yet. + */ + if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) && + !jfs_commit_thread_waking) { + jfs_commit_thread_waking = 1; + wake_up(&jfs_commit_thread_wait); + } + LAZY_UNLOCK(flags); +} + +static void LogSyncRelease(struct metapage * mp) +{ + struct jfs_log *log = mp->log; + + assert(mp->nohomeok); + assert(log); + metapage_homeok(mp); +} + +/* + * txQuiesce + * + * Block all new transactions and push anonymous transactions to + * completion + * + * This does almost the same thing as jfs_sync below. We don't + * worry about deadlocking when jfs_tlocks_low is set, since we would + * expect jfs_sync to get us out of that jam. + */ +void txQuiesce(struct super_block *sb) +{ + struct inode *ip; + struct jfs_inode_info *jfs_ip; + struct jfs_log *log = JFS_SBI(sb)->log; + tid_t tid; + + set_bit(log_QUIESCE, &log->flag); + + TXN_LOCK(); +restart: + while (!list_empty(&TxAnchor.anon_list)) { + jfs_ip = list_entry(TxAnchor.anon_list.next, + struct jfs_inode_info, + anon_inode_list); + ip = &jfs_ip->vfs_inode; + + /* + * inode will be removed from anonymous list + * when it is committed + */ + TXN_UNLOCK(); + tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE); + mutex_lock(&jfs_ip->commit_mutex); + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&jfs_ip->commit_mutex); + /* + * Just to be safe. I don't know how + * long we can run without blocking + */ + cond_resched(); + TXN_LOCK(); + } + + /* + * If jfs_sync is running in parallel, there could be some inodes + * on anon_list2. Let's check. + */ + if (!list_empty(&TxAnchor.anon_list2)) { + list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list); + INIT_LIST_HEAD(&TxAnchor.anon_list2); + goto restart; + } + TXN_UNLOCK(); + + /* + * We may need to kick off the group commit + */ + jfs_flush_journal(log, 0); +} + +/* + * txResume() + * + * Allows transactions to start again following txQuiesce + */ +void txResume(struct super_block *sb) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + + clear_bit(log_QUIESCE, &log->flag); + TXN_WAKEUP(&log->syncwait); +} + +/* + * jfs_sync(void) + * + * To be run as a kernel daemon. This is awakened when tlocks run low. + * We write any inodes that have anonymous tlocks so they will become + * available. 
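 *
 * (Sketch of the loop below: while jfs_tlocks_low, take the first
 * inode off anon_list; if igrab() fails the inode is being freed
 * and is simply unlinked; if the commit mutex can be taken without
 * blocking, the inode is committed, which releases its anonymous
 * tlocks; otherwise it is parked on anon_list2 and spliced back
 * once the scan finishes.)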
+ */ +int jfs_sync(void *arg) +{ + struct inode *ip; + struct jfs_inode_info *jfs_ip; + int rc; + tid_t tid; + + do { + /* + * write each inode on the anonymous inode list + */ + TXN_LOCK(); + while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) { + jfs_ip = list_entry(TxAnchor.anon_list.next, + struct jfs_inode_info, + anon_inode_list); + ip = &jfs_ip->vfs_inode; + + if (! igrab(ip)) { + /* + * Inode is being freed + */ + list_del_init(&jfs_ip->anon_inode_list); + } else if (mutex_trylock(&jfs_ip->commit_mutex)) { + /* + * inode will be removed from anonymous list + * when it is committed + */ + TXN_UNLOCK(); + tid = txBegin(ip->i_sb, COMMIT_INODE); + rc = txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&jfs_ip->commit_mutex); + + iput(ip); + /* + * Just to be safe. I don't know how + * long we can run without blocking + */ + cond_resched(); + TXN_LOCK(); + } else { + /* We can't get the commit mutex. It may + * be held by a thread waiting for tlock's + * so let's not block here. Save it to + * put back on the anon_list. + */ + + /* Take off anon_list */ + list_del(&jfs_ip->anon_inode_list); + + /* Put on anon_list2 */ + list_add(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list2); + + TXN_UNLOCK(); + iput(ip); + TXN_LOCK(); + } + } + /* Add anon_list2 back to anon_list */ + list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); + + if (freezing(current)) { + TXN_UNLOCK(); + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + TXN_UNLOCK(); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + + jfs_info("jfs_sync being killed"); + return 0; +} + +#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) +static int jfs_txanchor_proc_show(struct seq_file *m, void *v) +{ + char *freewait; + char *freelockwait; + char *lowlockwait; + + freewait = + waitqueue_active(&TxAnchor.freewait) ? "active" : "empty"; + freelockwait = + waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty"; + lowlockwait = + waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; + + seq_printf(m, + "JFS TxAnchor\n" + "============\n" + "freetid = %d\n" + "freewait = %s\n" + "freelock = %d\n" + "freelockwait = %s\n" + "lowlockwait = %s\n" + "tlocksInUse = %d\n" + "jfs_tlocks_low = %d\n" + "unlock_queue is %sempty\n", + TxAnchor.freetid, + freewait, + TxAnchor.freelock, + freelockwait, + lowlockwait, + TxAnchor.tlocksInUse, + jfs_tlocks_low, + list_empty(&TxAnchor.unlock_queue) ? 
"" : "not "); + return 0; +} + +static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txanchor_proc_show, NULL); +} + +const struct file_operations jfs_txanchor_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txanchor_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) +static int jfs_txstats_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS TxStats\n" + "===========\n" + "calls to txBegin = %d\n" + "txBegin blocked by sync barrier = %d\n" + "txBegin blocked by tlocks low = %d\n" + "txBegin blocked by no free tid = %d\n" + "calls to txBeginAnon = %d\n" + "txBeginAnon blocked by sync barrier = %d\n" + "txBeginAnon blocked by tlocks low = %d\n" + "calls to txLockAlloc = %d\n" + "tLockAlloc blocked by no free lock = %d\n", + TxStat.txBegin, + TxStat.txBegin_barrier, + TxStat.txBegin_lockslow, + TxStat.txBegin_freetid, + TxStat.txBeginAnon, + TxStat.txBeginAnon_barrier, + TxStat.txBeginAnon_lockslow, + TxStat.txLockAlloc, + TxStat.txLockAlloc_freelock); + return 0; +} + +static int jfs_txstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txstats_proc_show, NULL); +} + +const struct file_operations jfs_txstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h new file mode 100644 index 00000000..ab728893 --- /dev/null +++ b/fs/jfs/jfs_txnmgr.h @@ -0,0 +1,311 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_TXNMGR +#define _H_JFS_TXNMGR + +#include "jfs_logmgr.h" + +/* + * Hide implementation of TxBlock and TxLock + */ +#define tid_to_tblock(tid) (&TxBlock[tid]) + +#define lid_to_tlock(lid) (&TxLock[lid]) + +/* + * transaction block + */ +struct tblock { + /* + * tblock and jbuf_t common area: struct logsyncblk + * + * the following 5 fields are the same as struct logsyncblk + * which is common to tblock and jbuf to form logsynclist + */ + u16 xflag; /* tx commit type */ + u16 flag; /* tx commit state */ + lid_t dummy; /* Must keep structures common */ + s32 lsn; /* recovery lsn */ + struct list_head synclist; /* logsynclist link */ + + /* lock management */ + struct super_block *sb; /* super block */ + lid_t next; /* index of first tlock of tid */ + lid_t last; /* index of last tlock of tid */ + wait_queue_head_t waitor; /* tids waiting on this tid */ + + /* log management */ + u32 logtid; /* log transaction id */ + + /* commit management */ + struct list_head cqueue; /* commit queue list */ + s32 clsn; /* commit lsn */ + struct lbuf *bp; + s32 pn; /* commit record log page number */ + s32 eor; /* commit record eor */ + wait_queue_head_t gcwait; /* group commit event list: + * ready transactions wait on this + * event for group commit completion. + */ + union { + struct inode *ip; /* inode being deleted */ + pxd_t ixpxd; /* pxd of inode extent for created inode */ + } u; + u32 ino; /* inode number being created */ +}; + +extern struct tblock *TxBlock; /* transaction block table */ + +/* commit flags: tblk->xflag */ +#define COMMIT_SYNC 0x0001 /* synchronous commit */ +#define COMMIT_FORCE 0x0002 /* force pageout at end of commit */ +#define COMMIT_FLUSH 0x0004 /* init flush at end of commit */ +#define COMMIT_MAP 0x00f0 +#define COMMIT_PMAP 0x0010 /* update pmap */ +#define COMMIT_WMAP 0x0020 /* update wmap */ +#define COMMIT_PWMAP 0x0040 /* update pwmap */ +#define COMMIT_FREE 0x0f00 +#define COMMIT_DELETE 0x0100 /* inode delete */ +#define COMMIT_TRUNCATE 0x0200 /* file truncation */ +#define COMMIT_CREATE 0x0400 /* inode create */ +#define COMMIT_LAZY 0x0800 /* lazy commit */ +#define COMMIT_PAGE 0x1000 /* Identifies element as metapage */ +#define COMMIT_INODE 0x2000 /* Identifies element as inode */ + +/* group commit flags tblk->flag: see jfs_logmgr.h */ + +/* + * transaction lock + */ +struct tlock { + lid_t next; /* 2: index next lockword on tid locklist + * next lockword on freelist + */ + tid_t tid; /* 2: transaction id holding lock */ + + u16 flag; /* 2: lock control */ + u16 type; /* 2: log type */ + + struct metapage *mp; /* 4/8: object page buffer locked */ + struct inode *ip; /* 4/8: object */ + /* (16) */ + + s16 lock[24]; /* 48: overlay area */ +}; /* (64) */ + +extern struct tlock *TxLock; /* transaction lock table */ + +/* + * tlock flag + */ +/* txLock state */ +#define tlckPAGELOCK 0x8000 +#define tlckINODELOCK 0x4000 +#define tlckLINELOCK 0x2000 +#define tlckINLINELOCK 0x1000 +/* lmLog state */ +#define tlckLOG 0x0800 +/* updateMap state */ +#define tlckUPDATEMAP 0x0080 +#define tlckDIRECTORY 0x0040 +/* freeLock state */ +#define tlckFREELOCK 0x0008 +#define tlckWRITEPAGE 0x0004 +#define tlckFREEPAGE 0x0002 + +/* + * tlock type + */ +#define tlckTYPE 0xfe00 +#define tlckINODE 0x8000 +#define tlckXTREE 0x4000 +#define tlckDTREE 0x2000 
+#define tlckMAP 0x1000 +#define tlckEA 0x0800 +#define tlckACL 0x0400 +#define tlckDATA 0x0200 +#define tlckBTROOT 0x0100 + +#define tlckOPERATION 0x00ff +#define tlckGROW 0x0001 /* file grow */ +#define tlckREMOVE 0x0002 /* file delete */ +#define tlckTRUNCATE 0x0004 /* file truncate */ +#define tlckRELOCATE 0x0008 /* file/directory relocate */ +#define tlckENTRY 0x0001 /* directory insert/delete */ +#define tlckEXTEND 0x0002 /* directory extend in-line */ +#define tlckSPLIT 0x0010 /* splited page */ +#define tlckNEW 0x0020 /* new page from split */ +#define tlckFREE 0x0040 /* free page */ +#define tlckRELINK 0x0080 /* update sibling pointer */ + +/* + * linelock for lmLog() + * + * note: linelock and its variations are overlaid + * at tlock.lock: watch for alignment; + */ +struct lv { + u8 offset; /* 1: */ + u8 length; /* 1: */ +}; /* (2) */ + +#define TLOCKSHORT 20 +#define TLOCKLONG 28 + +struct linelock { + lid_t next; /* 2: next linelock */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + struct lv lv[20]; /* 40: */ +}; /* (48) */ + +#define dt_lock linelock + +struct xtlock { + lid_t next; /* 2: */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + struct lv header; /* 2: */ + struct lv lwm; /* 2: low water mark */ + struct lv hwm; /* 2: high water mark */ + struct lv twm; /* 2: */ + /* (16) */ + + s32 pxdlock[8]; /* 32: */ +}; /* (48) */ + + +/* + * maplock for txUpdateMap() + * + * note: maplock and its variations are overlaid + * at tlock.lock/linelock: watch for alignment; + * N.B. next field may be set by linelock, and should not + * be modified by maplock; + * N.B. 
index of the first pxdlock specifies index of next + * free maplock (i.e., number of maplock) in the tlock; + */ +struct maplock { + lid_t next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: next free maplock index */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + pxd_t pxd; /* 8: */ +}; /* (16): */ + +/* maplock flag */ +#define mlckALLOC 0x00f0 +#define mlckALLOCXADLIST 0x0080 +#define mlckALLOCPXDLIST 0x0040 +#define mlckALLOCXAD 0x0020 +#define mlckALLOCPXD 0x0010 +#define mlckFREE 0x000f +#define mlckFREEXADLIST 0x0008 +#define mlckFREEPXDLIST 0x0004 +#define mlckFREEXAD 0x0002 +#define mlckFREEPXD 0x0001 + +#define pxd_lock maplock + +struct xdlistlock { + lid_t next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + /* + * We need xdlist to be 64 bits (8 bytes), regardless of + * whether void * is 32 or 64 bits + */ + union { + void *_xdlist; /* pxd/xad list */ + s64 pad; /* 8: Force 64-bit xdlist size */ + } union64; +}; /* (16): */ + +#define xdlist union64._xdlist + +/* + * commit + * + * parameter to the commit manager routines + */ +struct commit { + tid_t tid; /* tid = index of tblock */ + int flag; /* flags */ + struct jfs_log *log; /* log */ + struct super_block *sb; /* superblock */ + + int nip; /* number of entries in iplist */ + struct inode **iplist; /* list of pointers to inodes */ + + /* log record descriptor on 64-bit boundary */ + struct lrd lrd; /* : log record descriptor */ +}; + +/* + * external declarations + */ +extern int jfs_tlocks_low; + +extern int txInit(void); +extern void txExit(void); +extern struct tlock *txLock(tid_t, struct inode *, struct metapage *, int); +extern struct tlock *txMaplock(tid_t, struct inode *, int); +extern int txCommit(tid_t, int, struct inode **, int); +extern tid_t txBegin(struct super_block *, int); +extern void txBeginAnon(struct super_block *); +extern void txEnd(tid_t); +extern void txAbort(tid_t, int); +extern struct linelock *txLinelock(struct linelock *); +extern void txFreeMap(struct inode *, struct maplock *, struct tblock *, int); +extern void txEA(tid_t, struct inode *, dxd_t *, dxd_t *); +extern void txFreelock(struct inode *); +extern int lmLog(struct jfs_log *, struct tblock *, struct lrd *, + struct tlock *); +extern void txQuiesce(struct super_block *); +extern void txResume(struct super_block *); +extern void txLazyUnlock(struct tblock *); +extern int jfs_lazycommit(void *); +extern int jfs_sync(void *); +#endif /* _H_JFS_TXNMGR */ diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h new file mode 100644 index 00000000..43ea3713 --- /dev/null +++ b/fs/jfs/jfs_types.h @@ -0,0 +1,159 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_TYPES +#define _H_JFS_TYPES + +/* + * jfs_types.h: + * + * basic type/utility definitions + * + * note: this header file must be the 1st include file + * of JFS include list in all JFS .c file. + */ + +#include +#include + +#include "endian24.h" + +/* + * transaction and lock id's + * + * Don't change these without carefully considering the impact on the + * size and alignment of all of the linelock variants + */ +typedef u16 tid_t; +typedef u16 lid_t; + +/* + * Almost identical to Linux's timespec, but not quite + */ +struct timestruc_t { + __le32 tv_sec; + __le32 tv_nsec; +}; + +/* + * handy + */ + +#define LEFTMOSTONE 0x80000000 +#define HIGHORDER 0x80000000u /* high order bit on */ +#define ONES 0xffffffffu /* all bit on */ + +/* + * physical xd (pxd) + */ +typedef struct { + unsigned len:24; + unsigned addr1:8; + __le32 addr2; +} pxd_t; + +/* xd_t field construction */ + +#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32)) +#define PXDaddress(pxd, address64)\ +{\ + (pxd)->addr1 = ((s64)address64) >> 32;\ + (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} + +/* xd_t field extraction */ +#define lengthPXD(pxd) __le24_to_cpu((pxd)->len) +#define addressPXD(pxd)\ + ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2)) + +#define MAXTREEHEIGHT 8 +/* pxd list */ +struct pxdlist { + s16 maxnpxd; + s16 npxd; + pxd_t pxd[MAXTREEHEIGHT]; +}; + + +/* + * data extent descriptor (dxd) + */ +typedef struct { + unsigned flag:8; /* 1: flags */ + unsigned rsrvd:24; + __le32 size; /* 4: size in byte */ + unsigned len:24; /* 3: length in unit of fsblksize */ + unsigned addr1:8; /* 1: address in unit of fsblksize */ + __le32 addr2; /* 4: address in unit of fsblksize */ +} dxd_t; /* - 16 - */ + +/* dxd_t flags */ +#define DXD_INDEX 0x80 /* B+-tree index */ +#define DXD_INLINE 0x40 /* in-line data extent */ +#define DXD_EXTENT 0x20 /* out-of-line single extent */ +#define DXD_FILE 0x10 /* out-of-line file (inode) */ +#define DXD_CORRUPT 0x08 /* Inconsistency detected */ + +/* dxd_t field construction + * Conveniently, the PXD macros work for DXD + */ +#define DXDlength PXDlength +#define DXDaddress PXDaddress +#define lengthDXD lengthPXD +#define addressDXD addressPXD +#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) +#define sizeDXD(dxd) le32_to_cpu((dxd)->size) + +/* + * directory entry argument + */ +struct component_name { + int namlen; + wchar_t *name; +}; + + +/* + * DASD limit information - stored in directory inode + */ +struct dasd { + u8 thresh; /* Alert Threshold (in percent) */ + u8 delta; /* Alert Threshold delta (in percent) */ + u8 rsrvd1; + u8 limit_hi; /* DASD limit (in logical blocks) */ + __le32 limit_lo; /* DASD limit (in logical blocks) */ + u8 rsrvd2[3]; + u8 used_hi; /* DASD usage (in logical blocks) */ + __le32 used_lo; /* DASD usage (in logical blocks) */ +}; + +#define DASDLIMIT(dasdp) \ + (((u64)((dasdp)->limit_hi) << 32) + __le32_to_cpu((dasdp)->limit_lo)) +#define setDASDLIMIT(dasdp, limit)\ +{\ + (dasdp)->limit_hi = ((u64)limit) >> 32;\ + (dasdp)->limit_lo = __cpu_to_le32(limit);\ +} +#define DASDUSED(dasdp) \ + (((u64)((dasdp)->used_hi) << 32) + __le32_to_cpu((dasdp)->used_lo)) +#define setDASDUSED(dasdp, used)\ +{\ + (dasdp)->used_hi = ((u64)used) >> 32;\ + (dasdp)->used_lo = 
__cpu_to_le32(used);\ +} + +#endif /* !_H_JFS_TYPES */ diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c new file mode 100644 index 00000000..adcf92d3 --- /dev/null +++ b/fs/jfs/jfs_umount.c @@ -0,0 +1,168 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_umount.c + * + * note: file system in transition to aggregate/fileset: + * (ref. jfs_mount.c) + * + * file system unmount is interpreted as mount of the single/only + * fileset in the aggregate and, if unmount of the last fileset, + * as unmount of the aggerate; + */ + +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_umount(vfsp, flags, crp) + * + * FUNCTION: vfs_umount() + * + * PARAMETERS: vfsp - virtual file system pointer + * flags - unmount for shutdown + * crp - credential + * + * RETURN : EBUSY - device has open files + */ +int jfs_umount(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipbmap = sbi->ipbmap; + struct inode *ipimap = sbi->ipimap; + struct inode *ipaimap = sbi->ipaimap; + struct inode *ipaimap2 = sbi->ipaimap2; + struct jfs_log *log; + int rc = 0; + + jfs_info("UnMount JFS: sb:0x%p", sb); + + /* + * update superblock and close log + * + * if mounted read-write and log based recovery was enabled + */ + if ((log = sbi->log)) + /* + * Wait for outstanding transactions to be written to log: + */ + jfs_flush_journal(log, 1); + + /* + * close fileset inode allocation map (aka fileset inode) + */ + diUnmount(ipimap, 0); + + diFreeSpecial(ipimap); + sbi->ipimap = NULL; + + /* + * close secondary aggregate inode allocation map + */ + ipaimap2 = sbi->ipaimap2; + if (ipaimap2) { + diUnmount(ipaimap2, 0); + diFreeSpecial(ipaimap2); + sbi->ipaimap2 = NULL; + } + + /* + * close aggregate inode allocation map + */ + ipaimap = sbi->ipaimap; + diUnmount(ipaimap, 0); + diFreeSpecial(ipaimap); + sbi->ipaimap = NULL; + + /* + * close aggregate block allocation map + */ + dbUnmount(ipbmap, 0); + + diFreeSpecial(ipbmap); + sbi->ipimap = NULL; + + /* + * Make sure all metadata makes it to disk before we mark + * the superblock as clean + */ + filemap_write_and_wait(sbi->direct_inode->i_mapping); + + /* + * ensure all file system file pages are propagated to their + * home blocks on disk (and their in-memory buffer pages are + * invalidated) BEFORE updating file system superblock state + * (to signify file system is unmounted cleanly, and thus in + * consistent state) and log superblock active file system + * list (to signify skip logredo()). 
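 *
 * (The ordering is the point: if FM_CLEAN were written, or the
 * volume dropped from the log's active file system list, before
 * the filemap_write_and_wait() above completed, a crash in that
 * window could leave stale metadata on disk while logredo() is
 * skipped on the next mount.)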
+ */ + if (log) { /* log = NULL if read-only mount */ + updateSuper(sb, FM_CLEAN); + + /* + * close log: + * + * remove file system from log active file system list. + */ + rc = lmLogClose(sb); + } + jfs_info("UnMount JFS Complete: rc = %d", rc); + return rc; +} + + +int jfs_umount_rw(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + + if (!log) + return 0; + + /* + * close log: + * + * remove file system from log active file system list. + */ + jfs_flush_journal(log, 1); + + /* + * Make sure all metadata makes it to disk + */ + dbSync(sbi->ipbmap); + diSync(sbi->ipimap); + + /* + * Note that we have to do this even if sync_blockdev() will + * do exactly the same a few instructions later: We can't + * mark the superblock clean before everything is flushed to + * disk. + */ + filemap_write_and_wait(sbi->direct_inode->i_mapping); + + updateSuper(sb, FM_CLEAN); + + return lmLogClose(sb); +} diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c new file mode 100644 index 00000000..c7de6f5b --- /dev/null +++ b/fs/jfs/jfs_unicode.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_strfromUCS() + * + * FUNCTION: Convert little-endian unicode string to character string + * + */ +int jfs_strfromUCS_le(char *to, const __le16 * from, + int len, struct nls_table *codepage) +{ + int i; + int outlen = 0; + static int warn_again = 5; /* Only warn up to 5 times total */ + int warn = !!warn_again; /* once per string */ + + if (codepage) { + for (i = 0; (i < len) && from[i]; i++) { + int charlen; + charlen = + codepage->uni2char(le16_to_cpu(from[i]), + &to[outlen], + NLS_MAX_CHARSET_SIZE); + if (charlen > 0) + outlen += charlen; + else + to[outlen++] = '?'; + } + } else { + for (i = 0; (i < len) && from[i]; i++) { + if (unlikely(le16_to_cpu(from[i]) & 0xff00)) { + to[i] = '?'; + if (unlikely(warn)) { + warn--; + warn_again--; + printk(KERN_ERR + "non-latin1 character 0x%x found in JFS file name\n", + le16_to_cpu(from[i])); + printk(KERN_ERR + "mount with iocharset=utf8 to access\n"); + } + + } + else + to[i] = (char) (le16_to_cpu(from[i])); + } + outlen = i; + } + to[outlen] = 0; + return outlen; +} + +/* + * NAME: jfs_strtoUCS() + * + * FUNCTION: Convert character string to unicode string + * + */ +static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len, + struct nls_table *codepage) +{ + int charlen; + int i; + + if (codepage) { + for (i = 0; len && *from; i++, from += charlen, len -= charlen) + { + charlen = codepage->char2uni(from, len, &to[i]); + if (charlen < 1) { + jfs_err("jfs_strtoUCS: char2uni returned %d.", + charlen); + 
jfs_err("charset = %s, char = 0x%x", + codepage->charset, *from); + return charlen; + } + } + } else { + for (i = 0; (i < len) && from[i]; i++) + to[i] = (wchar_t) from[i]; + } + + to[i] = 0; + return i; +} + +/* + * NAME: get_UCSname() + * + * FUNCTION: Allocate and translate to unicode string + * + */ +int get_UCSname(struct component_name * uniName, struct dentry *dentry) +{ + struct nls_table *nls_tab = JFS_SBI(dentry->d_sb)->nls_tab; + int length = dentry->d_name.len; + + if (length > JFS_NAME_MAX) + return -ENAMETOOLONG; + + uniName->name = + kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS); + + if (uniName->name == NULL) + return -ENOMEM; + + uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name, + length, nls_tab); + + if (uniName->namlen < 0) { + kfree(uniName->name); + return uniName->namlen; + } + + return 0; +} diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h new file mode 100644 index 00000000..8f0f02cb --- /dev/null +++ b/fs/jfs/jfs_unicode.h @@ -0,0 +1,156 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_UNICODE +#define _H_JFS_UNICODE + +#include +#include +#include "jfs_types.h" + +typedef struct { + wchar_t start; + wchar_t end; + signed char *table; +} UNICASERANGE; + +extern signed char UniUpperTable[512]; +extern UNICASERANGE UniUpperRange[]; +extern int get_UCSname(struct component_name *, struct dentry *); +extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *); + +#define free_UCSname(COMP) kfree((COMP)->name) + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t *UniStrcpy(wchar_t * ucs1, const wchar_t * ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)); + return anchor; +} + + + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline __le16 *UniStrncpy_le(__le16 * ucs1, const __le16 * ucs2, + size_t n) +{ + __le16 *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int UniStrncmp_le(const wchar_t * ucs1, const __le16 * ucs2, + size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy_to_le: Copy length limited string with pad to little-endian + */ +static inline __le16 *UniStrncpy_to_le(__le16 * ucs1, const wchar_t * ucs2, + size_t n) +{ + __le16 *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = cpu_to_le16(*ucs2++); + + n++; + 
while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_from_le: Copy length limited string with pad from little-endian + */ +static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2, + size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t UniToupper(wchar_t uc) +{ + UNICASERANGE *rp; + + if (uc < sizeof(UniUpperTable)) { /* Latin characters */ + return uc + UniUpperTable[uc]; /* Use base tables */ + } else { + rp = UniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + + +/* + * UniStrupr: Upper case a unicode string + */ +static inline wchar_t *UniStrupr(wchar_t * upin) +{ + wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniToupper(*up); + up++; + } + return upin; /* Return input pointer */ +} + +#endif /* !_H_JFS_UNICODE */ diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c new file mode 100644 index 00000000..cfe50666 --- /dev/null +++ b/fs/jfs/jfs_uniupr.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include "jfs_unicode.h" + +/* + * Latin upper case + */ +signed char UniUpperTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */ + -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ + 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ + 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ + 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ + -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ + 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ +}; + +/* Upper case range - Greek */ +static signed char UniCaseRangeU03a0[47] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */ + -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63, +}; + +/* Upper case range - Cyrillic */ +static signed char UniCaseRangeU0430[48] = { + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */ + 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */ +}; + +/* Upper case range - Extended cyrillic */ +static signed char UniCaseRangeU0490[61] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ + 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, +}; + +/* Upper case range - 
Extended latin and greek */ +static signed char UniCaseRangeU1e00[509] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ + 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ + 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ + 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ + 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Upper case range - Wide latin */ +static signed char UniCaseRangeUff40[27] = { + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, +}; + +/* + * Upper Case Range + */ +UNICASERANGE UniUpperRange[] = { + { 0x03a0, 0x03ce, UniCaseRangeU03a0 }, + { 0x0430, 0x045f, UniCaseRangeU0430 }, + { 0x0490, 0x04cc, UniCaseRangeU0490 }, + { 0x1e00, 0x1ffc, UniCaseRangeU1e00 }, + { 0xff40, 0xff5a, UniCaseRangeUff40 }, + { 0 } +}; diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h new file mode 100644 index 00000000..e9e100fd --- /dev/null +++ b/fs/jfs/jfs_xattr.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef H_JFS_XATTR +#define H_JFS_XATTR + +/* + * jfs_ea_list describe the on-disk format of the extended attributes. + * I know the null-terminator is redundant since namelen is stored, but + * I am maintaining compatibility with OS/2 where possible. + */ +struct jfs_ea { + u8 flag; /* Unused? */ + u8 namelen; /* Length of name */ + __le16 valuelen; /* Length of value */ + char name[0]; /* Attribute name (includes null-terminator) */ +}; /* Value immediately follows name */ + +struct jfs_ea_list { + __le32 size; /* overall size */ + struct jfs_ea ea[0]; /* Variable length list */ +}; + +/* Macros for defining maxiumum number of bytes supported for EAs */ +#define MAXEASIZE 65535 +#define MAXEALISTSIZE MAXEASIZE + +/* + * some macros for dealing with variable length EA lists. + */ +#define EA_SIZE(ea) \ + (sizeof (struct jfs_ea) + (ea)->namelen + 1 + \ + le16_to_cpu((ea)->valuelen)) +#define NEXT_EA(ea) ((struct jfs_ea *) (((char *) (ea)) + (EA_SIZE (ea)))) +#define FIRST_EA(ealist) ((ealist)->ea) +#define EALIST_SIZE(ealist) le32_to_cpu((ealist)->size) +#define END_EALIST(ealist) \ + ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist))) + +extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *, + size_t, int); +extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t, + int); +extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); +extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); +extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); +extern int jfs_removexattr(struct dentry *, const char *); + +#ifdef CONFIG_JFS_SECURITY +extern int jfs_init_security(tid_t, struct inode *, struct inode *, + const struct qstr *); +#else +static inline int jfs_init_security(tid_t tid, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif + +#endif /* H_JFS_XATTR */ diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c new file mode 100644 index 00000000..6c50871e --- /dev/null +++ b/fs/jfs/jfs_xtree.c @@ -0,0 +1,3905 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * jfs_xtree.c: extent allocation descriptor B+-tree manager + */ + +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_dinode.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * xtree local flag + */ +#define XT_INSERT 0x00000001 + +/* + * xtree key/entry comparison: extent offset + * + * return: + * -1: k < start of extent + * 0: start_of_extent <= k <= end_of_extent + * 1: k > end_of_extent + */ +#define XT_CMP(CMP, K, X, OFFSET64)\ +{\ + OFFSET64 = offsetXAD(X);\ + (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ + ((K) < OFFSET64) ? -1 : 0;\ +} + +/* write a xad entry */ +#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ +{\ + (XAD)->flag = (FLAG);\ + XADoffset((XAD), (OFF));\ + XADlength((XAD), (LEN));\ + XADaddress((XAD), (ADDR));\ +} + +#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) + +/* get page buffer for specified block address */ +/* ToDo: Replace this ugly macro with a function */ +#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ +{\ + BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ + if (!(RC))\ + {\ + if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ + (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ + (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ + {\ + jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ + BT_PUTPAGE(MP);\ + MP = NULL;\ + RC = -EIO;\ + }\ + }\ +} + +/* for consistency */ +#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) +/* xtree entry parameter descriptor */ +struct xtsplit { + struct metapage *mp; + s16 index; + u8 flag; + s64 off; + s64 addr; + int len; + struct pxdlist *pxdlist; +}; + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint search; + uint fastSearch; + uint split; +} xtStat; +#endif + + +/* + * forward references + */ +static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp, + struct btstack * btstack, int flag); + +static int xtSplitUp(tid_t tid, + struct inode *ip, + struct xtsplit * split, struct btstack * btstack); + +static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, + struct metapage ** rmpp, s64 * rbnp); + +static int xtSplitRoot(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp); + +#ifdef _STILL_TO_PORT +static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + xtpage_t * fp, struct btstack * btstack); + +static int xtSearchNode(struct inode *ip, + xad_t * xad, + int *cmpp, struct btstack * btstack, int flag); + +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); +#endif /* _STILL_TO_PORT */ + +/* + * xtLookup() + * + * function: map a single page into a physical extent; + */ +int xtLookup(struct inode *ip, s64 lstart, + s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index; + xad_t *xad; + s64 next, size, xoff, xend; + int xlen; + s64 xaddr; + + *paddr = 0; + *plen = llen; + + if (!no_check) { + /* is lookup offset beyond eof ? 
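+	 * (size below is i_size rounded up to file-system blocks;
+	 * offsets at or past it are not searched and the caller gets
+	 * rc 0 with *paddr still 0, i.e. a hole)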
*/ + size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + if (lstart >= size) + return 0; + } + + /* + * search for the xad entry covering the logical extent + */ +//search: + if ((rc = xtSearch(ip, lstart, &next, &cmp, &btstack, 0))) { + jfs_err("xtLookup: xtSearch returned %d", rc); + return rc; + } + + /* + * compute the physical extent covering logical extent + * + * N.B. search may have failed (e.g., hole in sparse file), + * and returned the index of the next entry. + */ + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* is xad found covering start of logical extent ? + * lstart is a page start address, + * i.e., lstart cannot start in a hole; + */ + if (cmp) { + if (next) + *plen = min(next - lstart, llen); + goto out; + } + + /* + * lxd covered by xad + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xend = xoff + xlen; + xaddr = addressXAD(xad); + + /* initialize new pxd */ + *pflag = xad->flag; + *paddr = xaddr + (lstart - xoff); + /* a page must be fully covered by an xad */ + *plen = min(xend - lstart, llen); + + out: + XT_PUTPAGE(mp); + + return rc; +} + +/* + * xtSearch() + * + * function: search for the xad entry covering specified offset. + * + * parameters: + * ip - file object; + * xoff - extent offset; + * nextp - address of next extent (if any) for search miss + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag (XT_INSERT); + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, + int *cmpp, struct btstack * btstack, int flag) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* page buffer */ + xtpage_t *p; /* page */ + xad_t *xad; + int base, index, lim, btindex; + struct btframe *btsp; + int nsplit = 0; /* number of pages to split */ + s64 t64; + s64 next = 0; + + INCREMENT(xtStat.search); + + BT_CLR(btstack); + + btstack->nsplit = 0; + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. 
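+ *
+ * for illustration: with leaf entries at offsets {0, 8, 24}, each
+ * 8 blocks long, a search for xoff 10 hits the entry at offset 8
+ * (cmp 0); a search for xoff 20 misses (nonzero cmp) and reports
+ * next = 24, the offset of the following entry, so the caller can
+ * size the hole.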
+ */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* try sequential access heuristics with the previous + * access entry in target leaf page: + * once search narrowed down into the target leaf, + * key must either match an entry in the leaf or + * key entry does not exist in the tree; + */ +//fastSearch: + if ((jfs_ip->btorder & BT_SEQUENTIAL) && + (p->header.flag & BT_LEAF) && + (index = jfs_ip->btindex) < + le16_to_cpu(p->header.nextindex)) { + xad = &p->xad[index]; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* stop sequential access heuristics */ + goto binarySearch; + } else { /* (t64 + lengthXAD(xad)) <= xoff */ + + /* try next sequential entry */ + index++; + if (index < + le16_to_cpu(p->header.nextindex)) { + xad++; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* miss: key falls between + * previous and this entry + */ + *cmpp = 1; + next = t64; + goto out; + } + + /* (xoff >= t64 + lengthXAD(xad)); + * matching entry may be further out: + * stop heuristic search + */ + /* stop sequential access heuristics */ + goto binarySearch; + } + + /* (index == p->header.nextindex); + * miss: key entry does not exist in + * the target leaf/tree + */ + *cmpp = 1; + goto out; + } + + /* + * if hit, return index of the entry found, and + * if miss, where new entry with search key is + * to be inserted; + */ + out: + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == /* little-endian */ + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* update sequential access heuristics */ + jfs_ip->btindex = index; + + if (nextp) + *nextp = next; + + INCREMENT(xtStat.fastSearch); + return 0; + } + + /* well, ... full search now */ + binarySearch: + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (index == btindex || + index == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = index; + + return 0; + } + /* search hit - internal page: + * descend/search its child page + */ + if (index < le16_to_cpu(p->header.nextindex)-1) + next = offsetXAD(&p->xad[index + 1]); + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. 
+ */ + if (base < le16_to_cpu(p->header.nextindex)) + next = offsetXAD(&p->xad[base]); + /* + * search miss - leaf page: + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (base == btindex || base == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = base; + + if (nextp) + *nextp = next; + + return 0; + } + + /* + * search miss - non-leaf page: + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* update number of pages to split */ + if (p->header.nextindex == p->header.maxentry) + nsplit++; + else + nsplit = 0; + + /* push (bn, index) of the parent page/entry */ + if (BT_STACK_FULL(btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtSearch!"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + +/* + * xtInsert() + * + * function: + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag (XAD_NOTRECORDED): + * xoff - extent offset; + * xlen - extent length; + * xaddrp - extent address pointer (in/out): + * if (*xaddrp) + * caller allocated data extent at *xaddrp; + * else + * allocate data extent and return its xaddr; + * flag - + * + * return: + */ +int xtInsert(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp, + int flag) +{ + int rc = 0; + s64 xaddr, hint; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + s64 next; + struct tlock *tlck; + struct xtlock *xtlck; + + jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. 
+ */ + if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* This test must follow XT_GETSEARCH since mp must be valid if + * we branch to out: */ + if ((cmp == 0) || (next && (xlen > next - xoff))) { + rc = -EEXIST; + goto out; + } + + /* + * allocate data extent requested + * + * allocation hint: last xad + */ + if ((xaddr = *xaddrp) == 0) { + if (index > XTENTRYSTART) { + xad = &p->xad[index - 1]; + hint = addressXAD(xad) + lengthXAD(xad) - 1; + } else + hint = 0; + if ((rc = dquot_alloc_block(ip, xlen))) + goto out; + if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { + dquot_free_block(ip, xlen); + goto out; + } + } + + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex == le16_to_cpu(p->header.maxentry)) { + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + if (*xaddrp == 0) { + dbFree(ip, xaddr, (s64) xlen); + dquot_free_block(ip, xlen); + } + return rc; + } + + *xaddrp = xaddr; + return 0; + } + + /* + * insert the new entry into the leaf page + */ + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + + /* if insert into middle, shift right remaining entries. */ + if (index < nextindex) + memmove(&p->xad[index + 1], &p->xad[index], + (nextindex - index) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? 
min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + *xaddrp = xaddr; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtSplitUp() + * + * function: + * split full pages as propagating insertion up the tree + * + * parameter: + * tid - transaction id; + * ip - file object; + * split - entry parameter descriptor; + * btstack - traverse stack from xtSearch() + * + * return: + */ +static int +xtSplitUp(tid_t tid, + struct inode *ip, struct xtsplit * split, struct btstack * btstack) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; /* split page */ + struct metapage *rmp; + s64 rbn; /* new right page block number */ + struct metapage *rcmp; + xtpage_t *rcp; /* right child page */ + s64 rcbn; /* right child page block number */ + int skip; /* index of entry of insertion */ + int nextindex; /* next available entry index of p */ + struct btframe *parent; /* parent page entry on traverse stack */ + xad_t *xad; + s64 xaddr; + int xlen; + int nsplit; /* number of pages split */ + struct pxdlist pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *xtlck; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + /* is inode xtree root extension/inline EA area free ? */ + if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) && + (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) && + (JFS_IP(ip)->mode2 & INLINEEA)) { + sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT); + JFS_IP(ip)->mode2 &= ~INLINEEA; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + + /* if insert into middle, shift right remaining entries. */ + skip = split->index; + nextindex = le16_to_cpu(sp->header.nextindex); + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* advance next available entry index */ + le16_add_cpu(&sp->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + return 0; + } + + /* + * allocate new index blocks to cover index page split(s) + * + * allocation hint: ? + */ + if (split->pxdlist == NULL) { + nsplit = btstack->nsplit; + split->pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + xlen = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++) { + if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr)) + == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + XT_PUTPAGE(smp); + return rc; + } + } + + /* + * Split leaf page into and a new right page . + * + * The split routines insert the new entry into the leaf page, + * and acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? 
+ xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + + XT_PUTPAGE(smp); + + if (rc) + return -EIO; + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 3 pages pinned at any time: + * right child, left parent and right parent (when the parent splits) + * to keep the child page pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame */ + + /* keep current child pages pinned */ + rcmp = rmp; + rcbn = rbn; + rcp = XT_PAGE(ip, rcmp); + + /* + * insert router entry in parent for new right child page + */ + /* get/pin the parent page */ + XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + XT_PUTPAGE(rcmp); + return rc; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * split or shift right remaining entries of the parent page + */ + nextindex = le16_to_cpu(sp->header.nextindex); + /* + * parent page is full - split the parent page + */ + if (nextindex == le16_to_cpu(sp->header.maxentry)) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->flag = XAD_NEW; + split->off = offsetXAD(&rcp->xad[XTENTRYSTART]); + split->len = JFS_SBI(ip->i_sb)->nbperpage; + split->addr = rcbn; + + /* unpin previous right child page */ + XT_PUTPAGE(rcmp); + + /* The split routines insert the new entry, + * and acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + if (rc) { + XT_PUTPAGE(smp); + return rc; + } + + XT_PUTPAGE(smp); + /* keep new child page pinned */ + } + /* + * parent page is not full - insert in parent page + */ + else { + /* + * insert router entry in parent for the right child + * page from the first entry of the right child page: + */ + /* + * acquire a transaction lock on the parent page; + * + * action: router xad insertion; + */ + BT_MARK_DIRTY(smp, ip); + + /* + * if insert into middle, shift right remaining entries + */ + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - + skip) << L2XTSLOTSIZE); + + /* insert the router entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, XAD_NEW, + offsetXAD(&rcp->xad[XTENTRYSTART]), + JFS_SBI(ip->i_sb)->nbperpage, rcbn); + + /* advance next available entry index. */ + le16_add_cpu(&sp->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, + tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? 
+ min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin parent page */ + XT_PUTPAGE(smp); + + /* exit propagate up */ + break; + } + } + + /* unpin current right page */ + XT_PUTPAGE(rmp); + + return 0; +} + + +/* + * xtSplitPage() + * + * function: + * split a full non-root page into + * original/split/left page and new right page + * i.e., the original/split page remains as left page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp, + * u64 *rbnp, + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitPage(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + struct metapage *mp; + xtpage_t *p; + s64 nextbn; + int skip, maxentry, middle, righthalf, n; + xad_t *xad; + struct pxdlist *pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *sxtlck = NULL, *rxtlck = NULL; + int quota_allocation = 0; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + INCREMENT(xtStat.split); + + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) + goto clean_up; + + quota_allocation += lengthPXD(pxd); + + /* + * allocate the new right page for the split + */ + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) { + rc = -EIO; + goto clean_up; + } + + jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); + + BT_MARK_DIRTY(rmp, ip); + /* + * action: new page; + */ + + rp = (xtpage_t *) rmp->data; + rp->header.self = *pxd; + rp->header.flag = sp->header.flag & BT_TYPE; + rp->header.maxentry = sp->header.maxentry; /* little-endian */ + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + BT_MARK_DIRTY(smp, ip); + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + /* + * acquire a transaction lock on the new right page; + */ + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + rxtlck = (struct xtlock *) & tlck->lock; + rxtlck->lwm.offset = XTENTRYSTART; + /* + * acquire a transaction lock on the split page + */ + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + sxtlck = (struct xtlock *) & tlck->lock; + } + + /* + * initialize/update sibling pointers of and + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + skip = split->index; + + /* + * sequential append at tail (after last entry of last page) + * + * if splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * if we're wrong it's no big deal - we will do the split the right + * way next time. + * (it may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, but it's not. + * Be my guest.) 
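+ *
+ * for illustration: appending to a full rightmost leaf
+ *	[ e0 ... e(max-1) ]
+ * leaves that page untouched and links a new right sibling holding
+ * only the appended entry
+ *	[ e0 ... e(max-1) ] -> [ new ]
+ * so a strictly sequential writer ends up with completely full
+ * leaves instead of the half-empty pages a 50/50 split would leave.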
+ */ + if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) { + /* + * acquire a transaction lock on the new/right page; + * + * action: xad insertion; + */ + /* insert entry at the first entry of the new right page */ + xad = &rp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = 1; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return 0; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update previous pointer of old next/right page of + */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(rmp); + goto clean_up; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page; + * + * action:sibling pointer update; + */ + if (!test_cflag(COMMIT_Nolink, ip)) + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + p->header.prev = cpu_to_le64(rbn); + + /* sibling page may have been updated previously, or + * it may be updated later; + */ + + XT_PUTPAGE(mp); + } + + /* + * split the data between the split and new/right pages + */ + maxentry = le16_to_cpu(sp->header.maxentry); + middle = maxentry >> 1; + righthalf = maxentry - middle; + + /* + * skip index in old split/left page - insert into left page: + */ + if (skip <= middle) { + /* move right half of split page to the new right page */ + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + righthalf << L2XTSLOTSIZE); + + /* shift right tail of left half to make room for new entry */ + if (skip < middle) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (middle - skip) << L2XTSLOTSIZE); + + /* insert new entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle + 1); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(skip, (int)sxtlck->lwm.offset) : skip; + } + + rp->header.nextindex = + cpu_to_le16(XTENTRYSTART + righthalf); + } + /* + * skip index in new right page - insert into right page: + */ + else { + /* move left head of right half to right page */ + n = skip - middle; + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + n << L2XTSLOTSIZE); + + /* insert new entry */ + n += XTENTRYSTART; + xad = &rp->xad[n]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* move right tail of right half to right page */ + if (skip < maxentry) + memmove(&rp->xad[n + 1], &sp->xad[skip], + (maxentry - skip) << L2XTSLOTSIZE); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(middle, (int)sxtlck->lwm.offset) : middle; + } + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + + righthalf + 1); + } + + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - + sxtlck->lwm.offset; + + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return rc; + + clean_up: + + /* Rollback quota allocation. 
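+	 * (return the blocks charged to the inode's quota earlier in
+	 * this function, since the split is being abandoned)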
*/ + if (quota_allocation) + dquot_free_block(ip, quota_allocation); + + return (rc); +} + + +/* + * xtSplitRoot() + * + * function: + * split the full root page into original/root/split page and new + * right page + * i.e., root remains fixed in tree anchor (inode) and the root is + * copied to a single new right child page since root page << + * non-root page, and the split root page contains a single entry + * for the new right child page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp) + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitRoot(tid_t tid, + struct inode *ip, struct xtsplit * split, struct metapage ** rmpp) +{ + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; + s64 rbn; + int skip, nextindex; + xad_t *xad; + pxd_t *pxd; + struct pxdlist *pxdlist; + struct tlock *tlck; + struct xtlock *xtlck; + int rc; + + sp = &JFS_IP(ip)->i_xtroot; + + INCREMENT(xtStat.split); + + /* + * allocate a single (right) child page + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return -EIO; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); + + /* + * acquire a transaction lock on the new right page; + * + * action: new page; + */ + BT_MARK_DIRTY(rmp, ip); + + rp = (xtpage_t *) rmp->data; + rp->header.flag = + (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE); + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * copy the in-line root page into new right page extent + */ + nextindex = le16_to_cpu(sp->header.maxentry); + memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART], + (nextindex - XTENTRYSTART) << L2XTSLOTSIZE); + + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + skip = split->index; + /* if insert into middle, shift right remaining entries */ + if (skip != nextindex) + memmove(&rp->xad[skip + 1], &rp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + xad = &rp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); + + /* update page header */ + rp->header.nextindex = cpu_to_le16(nextindex + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + /* + * reset the root + * + * init root with the single entry for the new right page + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. 
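+ *
+ * after this step the in-inode root holds the single router entry
+ *	root: [ offset 0 -> rbn ]
+ * while the new right child page holds all entries copied from the
+ * old root plus the entry being inserted.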
+ */ + /* + * acquire a transaction lock on the root page (in-memory inode); + * + * action: root split; + */ + BT_MARK_DIRTY(split->mp, ip); + + xad = &sp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn); + + /* update page header of root */ + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + + sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = 1; + } + + *rmpp = rmp; + + jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp); + return 0; +} + + +/* + * xtExtend() + * + * function: extend in-place; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: alloc whole extended extent; + */ +int xtExtend(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* delta extent offset */ + s32 xlen, /* delta extent length */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, len; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + s64 xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + + jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* there must exist extent to be extended */ + if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent"); + return -EIO; + } + + /* extension must be contiguous */ + xad = &p->xad[index]; + if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtExtend: extension is not contiguous"); + return -EIO; + } + + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* extend will overflow extent ? */ + xlen = lengthXAD(xad) + xlen; + if ((len = xlen - MAXXLEN) <= 0) + goto extendOld; + + /* + * extent overflow: insert entry for new extent + */ +//insertNew: + xoff = offsetXAD(xad) + MAXXLEN; + xaddr = addressXAD(xad) + MAXXLEN; + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. 
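+ *
+ * (this insert path is only reached when the extended length would
+ * exceed MAXXLEN: the existing xad is capped at MAXXLEN at
+ * extendOld below, and the overflow portion becomes a new XAD_NEW
+ * entry starting MAXXLEN blocks into the extent)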
+ */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = len; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + } + + /* get back old entry */ + xad = &p->xad[index]; + xlen = MAXXLEN; + + /* + * extend old extent + */ + extendOld: + XADlength(xad, xlen); + if (!(xad->flag & XAD_NEW)) + xad->flag |= XAD_EXTENDED; + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + +#ifdef _NOTYET +/* + * xtTailgate() + * + * function: split existing 'tail' extent + * (split offset >= start offset of tail extent), and + * relocate and extend the split tail half; + * + * note: existing extent may or may not have been committed. 
+ * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: free old split tail extent, alloc new extent; + */ +int xtTailgate(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* split/new extent offset */ + s32 xlen, /* new extent length */ + s64 xaddr, /* new extent address */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, llen, rlen; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + struct tlock *tlck; + struct xtlock *xtlck = 0; + struct tlock *mtlck; + struct maplock *pxdlock; + +/* +printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", + (ulong)xoff, xlen, (ulong)xaddr); +*/ + + /* there must exist extent to be tailgated */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtTailgate: couldn't find extent"); + return -EIO; + } + + /* entry found must be last entry */ + nextindex = le16_to_cpu(p->header.nextindex); + if (index != nextindex - 1) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtTailgate: the entry found is not the last entry"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* completely replace extent ? */ + xad = &p->xad[index]; +/* +printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", + (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); +*/ + if ((llen = xoff - offsetXAD(xad)) == 0) + goto updateOld; + + /* + * partially replace extent: insert entry for new extent + */ +//insertNew: + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. 
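+ *
+ * (as in xtExtend(): if xtSplitUp() ends up splitting the leaf
+ * root, the original entries now live on a new child page, so the
+ * code below re-reads the page and descends to that child before
+ * touching the old xad again)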
+ */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + } + + /* get back old XAD */ + xad = &p->xad[index]; + + /* + * truncate/relocate old extent at split offset + */ + updateOld: + /* update dmap for old/committed/truncated extent */ + rlen = lengthXAD(xad) - llen; + if (!(xad->flag & XAD_NEW)) { + /* free from PWMAP at commit */ + if (!test_cflag(COMMIT_Nolink, ip)) { + mtlck = txMaplock(tid, ip, tlckMAP); + pxdlock = (struct maplock *) & mtlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); + PXDlength(&pxdlock->pxd, rlen); + pxdlock->index = 1; + } + } else + /* free from WMAP */ + dbFree(ip, addressXAD(xad) + llen, (s64) rlen); + + if (llen) + /* truncate */ + XADlength(xad, llen); + else + /* replace */ + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? 
+ min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#endif /* _NOTYET */ + +/* + * xtUpdate() + * + * function: update XAD; + * + * update extent for allocated_but_not_recorded or + * compressed extent; + * + * parameter: + * nxad - new XAD; + * logical extent of the specified XAD must be completely + * contained by an existing XAD; + */ +int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) +{ /* new XAD */ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index0, index, newindex, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad, *lxad, *rxad; + int xflag; + s64 nxoff, xoff; + int nxlen, xlen, lxlen, rxlen; + s64 nxaddr, xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + int newpage = 0; + + /* there must exist extent to be tailgated */ + nxoff = offsetXAD(nxad); + nxlen = lengthXAD(nxad); + nxaddr = addressXAD(nxad); + + if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: Could not find extent"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + xad = &p->xad[index0]; + xflag = xad->flag; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* nXAD must be completely contained within XAD */ + if ((xoff > nxoff) || + (nxoff + nxlen > xoff + xlen)) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtUpdate: nXAD in not completely contained within XAD"); + return -EIO; + } + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + +#ifdef _JFS_WIP_NOCOALESCE + if (xoff < nxoff) + goto updateRight; + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto out; + } else /* (nxlen < xlen) */ + goto updateLeft; +#endif /* _JFS_WIP_NOCOALESCE */ + +/* #ifdef _JFS_WIP_COALESCE */ + if (xoff < nxoff) + goto coalesceRight; + + /* + * coalesce with left XAD + */ +//coalesceLeft: /* (xoff == nxoff) */ + /* is XAD first entry of page ? */ + if (index == XTENTRYSTART) + goto replace; + + /* is nXAD logically and physically contiguous with lXAD ? */ + lxad = &p->xad[index - 1]; + lxlen = lengthXAD(lxad); + if (!(lxad->flag & XAD_NOTRECORDED) && + (nxoff == offsetXAD(lxad) + lxlen) && + (nxaddr == addressXAD(lxad) + lxlen) && + (lxlen + nxlen < MAXXLEN)) { + /* extend right lXAD */ + index0 = index - 1; + XADlength(lxad, lxlen + nxlen); + + /* If we just merged two extents together, need to make sure the + * right extent gets logged. If the left one is marked XAD_NEW, + * then we know it will be logged. 
Otherwise, mark as + * XAD_EXTENDED + */ + if (!(lxad->flag & XAD_NEW)) + lxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) { + /* truncate XAD */ + XADoffset(xad, xoff + nxlen); + XADlength(xad, xlen - nxlen); + XADaddress(xad, xaddr + nxlen); + goto out; + } else { /* (xlen == nxlen) */ + + /* remove XAD */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xoff = nxoff = offsetXAD(lxad); + xlen = nxlen = lxlen + nxlen; + xaddr = nxaddr = addressXAD(lxad); + goto coalesceRight; + } + } + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto coalesceRight; + } else /* (nxlen < xlen) */ + goto updateLeft; + + /* + * coalesce with right XAD + */ + coalesceRight: /* (xoff <= nxoff) */ + /* is XAD last entry of page ? */ + if (newindex == nextindex) { + if (xoff == nxoff) + goto out; + goto updateRight; + } + + /* is nXAD logically and physically contiguous with rXAD ? */ + rxad = &p->xad[index + 1]; + rxlen = lengthXAD(rxad); + if (!(rxad->flag & XAD_NOTRECORDED) && + (nxoff + nxlen == offsetXAD(rxad)) && + (nxaddr + nxlen == addressXAD(rxad)) && + (rxlen + nxlen < MAXXLEN)) { + /* extend left rXAD */ + XADoffset(rxad, nxoff); + XADlength(rxad, rxlen + nxlen); + XADaddress(rxad, nxaddr); + + /* If we just merged two extents together, need to make sure + * the left extent gets logged. If the right one is marked + * XAD_NEW, then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(rxad->flag & XAD_NEW)) + rxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) + /* truncate XAD */ + XADlength(xad, xlen - nxlen); + else { /* (xlen == nxlen) */ + + /* remove XAD */ + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + } + + goto out; + } else if (xoff == nxoff) + goto out; + + if (xoff >= nxoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff"); + return -EIO; + } +/* #endif _JFS_WIP_COALESCE */ + + /* + * split XAD into (lXAD, nXAD): + * + * |---nXAD---> + * --|----------XAD----------|-- + * |-lXAD-| + */ + updateRight: /* (xoff < nxoff) */ + /* truncate old XAD as lXAD:not_recorded */ + xad = &p->xad[index]; + XADlength(xad, nxoff - xoff); + + /* insert nXAD:recorded */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag & ~XAD_NOTRECORDED; + split.off = nxoff; + split.len = nxlen; + split.addr = nxaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, 
ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } else { + /* is nXAD on new page ? */ + if (newindex > + (le16_to_cpu(p->header.maxentry) >> 1)) { + newindex = + newindex - + le16_to_cpu(p->header.nextindex) + + XTENTRYSTART; + newpage = 1; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* + * does nXAD force 3-way split ? + * + * |---nXAD--->| + * --|----------XAD-------------|-- + * |-lXAD-| |-rXAD -| + */ + if (nxoff + nxlen == xoff + xlen) + goto out; + + /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */ + if (newpage) { + /* close out old page */ + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + bn = le64_to_cpu(p->header.next); + XT_PUTPAGE(mp); + + /* get new right page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + index0 = index = newindex; + } else + index++; + + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xlen = xlen - (nxoff - xoff); + xoff = nxoff; + xaddr = nxaddr; + + /* recompute split pages */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + XT_PUTPAGE(mp); + + if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: xtSearch failed"); + return -EIO; + } + + if (index0 != index) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtUpdate: unexpected value of index"); + return -EIO; + } + } + + /* + * split XAD into (nXAD, rXAD) + * + * ---nXAD---| + * --|----------XAD----------|-- + * |-rXAD-| + */ + updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */ + /* update old XAD with nXAD:recorded */ + xad = &p->xad[index]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* insert rXAD:not_recorded */ + xoff = xoff + nxlen; + xlen = xlen - nxlen; + xaddr = xaddr + nxlen; + if (nextindex == le16_to_cpu(p->header.maxentry)) { +/* +printf("xtUpdate.updateLeft.split p:0x%p\n", p); +*/ + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + 
BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + out: + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtAppend() + * + * function: grow in append mode from contiguous region specified ; + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag: + * xoff - extent offset; + * maxblocks - max extent length; + * xlen - extent length (in/out); + * xaddrp - extent address pointer (in/out): + * flag - + * + * return: + */ +int xtAppend(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 maxblocks, + s32 * xlenp, /* (in/out) */ + s64 * xaddrp, /* (in/out) */ + int flag) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn, xaddr; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + struct tlock *tlck; + struct xtlock *xtlck; + int nsplit, nblocks, xlen; + struct pxdlist pxdlist; + pxd_t *pxd; + s64 next; + + xaddr = *xaddrp; + xlen = *xlenp; + jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx", + (ulong) xoff, maxblocks, xlen, (ulong) xaddr); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp == 0) { + rc = -EEXIST; + goto out; + } + + if (next) + xlen = min(xlen, (int)(next - xoff)); +//insert: + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. 
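+ *
+ * (unlike xtInsert(), the split path below does not ask dbAlloc()
+ * to find space: both the new index pages and the data extent are
+ * carved out of the caller-supplied contiguous region starting at
+ * xaddr and marked allocated with dbAllocBottomUp())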
+ */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex < le16_to_cpu(p->header.maxentry)) + goto insertLeaf; + + /* + * allocate new index blocks to cover index page split(s) + */ + nsplit = btstack.nsplit; + split.pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + nblocks = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, nblocks); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + goto out; + } + + xlen = min(xlen, maxblocks); + + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + dbFree(ip, *xaddrp, (s64) * xlenp); + + return rc; + } + + *xaddrp = xaddr; + *xlenp = xlen; + return 0; + + /* + * insert the new entry into the leaf page + */ + insertLeaf: + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + + *xaddrp = xaddr; + *xlenp = xlen; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#ifdef _STILL_TO_PORT + +/* - TBD for defragmentaion/reorganization - + * + * xtDelete() + * + * function: + * delete the entry with the specified key. + * + * N.B.: whole extent of the entry is assumed to be deleted. + * + * parameter: + * + * return: + * ENOENT: if the entry is not found. + * + * exception: + */ +int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * find the matching entry; xtSearch() pins the page + */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + if (cmp) { + /* unpin the leaf page */ + XT_PUTPAGE(mp); + return -ENOENT; + } + + /* + * delete the entry from the leaf page + */ + nextindex = le16_to_cpu(p->header.nextindex); + le16_add_cpu(&p->header.nextindex, -1); + + /* + * if the leaf page bocome empty, free the page + */ + if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) + return (xtDeleteUp(tid, ip, mp, p, &btstack)); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? 
min(index, xtlck->lwm.offset) : index; + + /* if delete from middle, shift left/compact the remaining entries */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) * sizeof(xad_t)); + + XT_PUTPAGE(mp); + + return 0; +} + + +/* - TBD for defragmentaion/reorganization - + * + * xtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int +xtDeleteUp(tid_t tid, struct inode *ip, + struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + s64 xaddr; + int xlen; + struct btframe *parent; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * keep root leaf page which has become empty + */ + if (fp->header.flag & BT_ROOT) { + /* keep the root page */ + fp->header.flag &= ~BT_INTERNAL; + fp->header.flag |= BT_LEAF; + fp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(fmp); */ + + return 0; + } + + /* + * free non-root leaf page + */ + if ((rc = xtRelink(tid, ip, fp))) { + XT_PUTPAGE(fmp); + return rc; + } + + xaddr = addressPXD(&fp->header.self); + xlen = lengthPXD(&fp->header.self); + /* free the page extent */ + dbFree(ip, xaddr, (s64) xlen); + + /* free the buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the index tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* get/pin the parent page */ + XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* delete the entry for the freed child page from parent. + */ + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * the parent has the single entry being deleted: + * free the parent page which has become empty. + */ + if (nextindex == 1) { + if (p->header.flag & BT_ROOT) { + /* keep the root page */ + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = + cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(mp); */ + + break; + } else { + /* free the parent page */ + if ((rc = xtRelink(tid, ip, p))) + return rc; + + xaddr = addressPXD(&p->header.self); + /* free the page extent */ + dbFree(ip, xaddr, + (s64) JFS_SBI(ip->i_sb)->nbperpage); + + /* unpin/free the buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + /* + * the parent has other entries remaining: + * delete the router entry from the parent page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + xtlck->lwm. 
+ offset) : index; + + /* if delete from middle, + * shift left/compact the remaining entries in the page + */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + le16_add_cpu(&p->header.nextindex, -1); + jfs_info("xtDeleteUp(entry): 0x%lx[%d]", + (ulong) parent->bn, index); + } + + /* unpin the parent page */ + XT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + return 0; +} + + +/* + * NAME: xtRelocate() + * + * FUNCTION: relocate xtpage or data extent of regular file; + * This function is mainly used by defragfs utility. + * + * NOTE: This routine does not have the logic to handle + * uncommitted allocated extent. The caller should call + * txCommit() to commit all the allocation before call + * this routine. + */ +int +xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ + s64 nxaddr, /* new xaddr */ + int xtype) +{ /* extent type: XTPAGE or DATAEXT */ + int rc = 0; + struct tblock *tblk; + struct tlock *tlck; + struct xtlock *xtlck; + struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ + xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ + xad_t *xad; + pxd_t *pxd; + s64 xoff, xsize; + int xlen; + s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; + cbuf_t *cp; + s64 offset, nbytes, nbrd, pno; + int nb, npages, nblks; + s64 bn; + int cmp; + int index; + struct pxd_lock *pxdlock; + struct btstack btstack; /* traverse stack */ + + xtype = xtype & EXTENT_TYPE; + + xoff = offsetXAD(oxad); + oxaddr = addressXAD(oxad); + xlen = lengthXAD(oxad); + + /* validate extent offset */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + if (offset >= ip->i_size) + return -ESTALE; /* stale extent */ + + jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", + xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); + + /* + * 1. get and validate the parent xtpage/xad entry + * covering the source extent to be relocated; + */ + if (xtype == DATAEXT) { + /* search in leaf entry */ + rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* validate for exact match with a single entry */ + xad = &pp->xad[index]; + if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + } else { /* (xtype == XTPAGE) */ + + /* search in internal entry */ + rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* xtSearchNode() validated for exact match with a single entry + */ + xad = &pp->xad[index]; + } + jfs_info("xtRelocate: parent xad entry validated."); + + /* + * 2. relocate the extent + */ + if (xtype == DATAEXT) { + /* if the extent is allocated-but-not-recorded + * there is no real data to be moved in this extent, + */ + if (xad->flag & XAD_NOTRECORDED) + goto out; + else + /* release xtpage for cmRead()/xtLookup() */ + XT_PUTPAGE(pmp); + + /* + * cmRelocate() + * + * copy target data pages to be relocated; + * + * data extent must start at page boundary and + * multiple of page size (except the last data extent); + * read in each page of the source data extent into cbuf, + * update the cbuf extent descriptor of the page to be + * homeward bound to new dst data extent + * copy the data from the old extent to new extent. 
+ * copy is essential for compressed files to avoid problems + * that can arise if there was a change in compression + * algorithms. + * it is a good strategy because it may disrupt cache + * policy to keep the pages in memory afterwards. + */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + assert((offset & CM_OFFSET) == 0); + nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; + pno = offset >> CM_L2BSIZE; + npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; +/* + npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - + (offset >> CM_L2BSIZE) + 1; +*/ + sxaddr = oxaddr; + dxaddr = nxaddr; + + /* process the request one cache buffer at a time */ + for (nbrd = 0; nbrd < nbytes; nbrd += nb, + offset += nb, pno++, npages--) { + /* compute page size */ + nb = min(nbytes - nbrd, CM_BSIZE); + + /* get the cache buffer of the page */ + if (rc = cmRead(ip, offset, npages, &cp)) + break; + + assert(addressPXD(&cp->cm_pxd) == sxaddr); + assert(!cp->cm_modified); + + /* bind buffer with the new extent address */ + nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; + cmSetXD(ip, cp, pno, dxaddr, nblks); + + /* release the cbuf, mark it as modified */ + cmPut(cp, true); + + dxaddr += nblks; + sxaddr += nblks; + } + + /* get back parent page */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jfs_info("xtRelocate: target data extent relocated."); + } else { /* (xtype == XTPAGE) */ + + /* + * read in the target xtpage from the source extent; + */ + XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + if (rmp) + XT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling xtpages if any; + */ + if (lmp) { + BT_MARK_DIRTY(lmp, ip); + tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); + lp->header.next = cpu_to_le64(nxaddr); + XT_PUTPAGE(lmp); + } + + if (rmp) { + BT_MARK_DIRTY(rmp, ip); + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); + rp->header.prev = cpu_to_le64(nxaddr); + XT_PUTPAGE(rmp); + } + + /* + * update the target xtpage to be relocated + * + * update the self address of the target page + * and write to destination extent; + * redo image covers the whole xtpage since it is new page + * to the destination extent; + * update of bmap for the free of source extent + * of the target xtpage itself: + * update of bmap for the allocation of destination extent + * of the target xtpage itself: + * update of bmap for the extents covered by xad entries in + * the target xtpage is not necessary since they are not + * updated; + * if not committed before this relocation, + * target page may contain XAD_NEW entries which must + * be scanned for bmap update (logredo() always + * scan xtpage REDOPAGE image for bmap update); + * if committed before this relocation (tlckRELOCATE), + * scan may be skipped by commit() and logredo(); + */ + BT_MARK_DIRTY(mp, ip); + /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock 
*) & tlck->lock; + + /* update the self address in the xtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* linelock for the after image of the whole page */ + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + + /* update the buffer extent descriptor of target xtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + bmSetXD(mp, nxaddr, xsize); + + /* unpin the target page to new homeward bound */ + XT_PUTPAGE(mp); + jfs_info("xtRelocate: target xtpage relocated."); + } + + /* + * 3. acquire maplock for the source extent to be freed; + * + * acquire a maplock saving the src relocated extent address; + * to free of the extent at commit time; + */ + out: + /* if DATAEXT relocation, write a LOG_UPDATEMAP record for + * free PXD of the source data extent (logredo() will update + * bmap for free of source data extent), and update bmap for + * free of the source data extent; + */ + if (xtype == DATAEXT) + tlck = txMaplock(tid, ip, tlckMAP); + /* if XTPAGE relocation, write a LOG_NOREDOPAGE record + * for the source xtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * xtpage), and update bmap for free of the source xtpage; + * N.B. We use tlckMAP instead of tlkcXTREE because there + * is no buffer associated with this lock since the buffer + * has been redirected to the target location. + */ + else /* (xtype == XTPAGE) */ + tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); + + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. update the parent xad entry for relocation; + * + * acquire tlck for the parent entry with XAD_NEW as entry + * update which will write LOG_REDOPAGE and update bmap for + * allocation of XAD_NEW destination extent; + */ + jfs_info("xtRelocate: update parent xad entry."); + BT_MARK_DIRTY(pmp, ip); + tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* update the XAD with the new destination extent; */ + xad = &pp->xad[index]; + xad->flag |= XAD_NEW; + XADaddress(xad, nxaddr); + + xtlck->lwm.offset = min(index, xtlck->lwm.offset); + xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - + xtlck->lwm.offset; + + /* unpin the parent xtpage */ + XT_PUTPAGE(pmp); + + return rc; +} + + +/* + * xtSearchNode() + * + * function: search for the internal xad entry covering specified extent. + * This function is mainly used by defragfs utility. + * + * parameters: + * ip - file object; + * xad - extent to find; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag; + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. 
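+ * note: only internal (router) entries are examined; descending to a
+ * leaf page means the target entry was not found and -ESTALE is
+ * returned.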
+ */ +static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ + int *cmpp, struct btstack * btstack, int flag) +{ + int rc = 0; + s64 xoff, xaddr; + int xlen; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* page */ + int base, index, lim; + struct btframe *btsp; + s64 t64; + + BT_CLR(btstack); + + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + if (p->header.flag & BT_LEAF) { + XT_PUTPAGE(mp); + return -ESTALE; + } + + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + * + * verify for exact match; + */ + if (xaddr == addressXAD(&p->xad[index]) && + xoff == offsetXAD(&p->xad[index])) { + *cmpp = cmp; + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + return 0; + } + + /* descend/search its child page */ + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss - non-leaf page: + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + + +/* + * xtRelink() + * + * function: + * link around a freed page. 
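+ * (the prev pointer of the next sibling and the next pointer of the
+ * previous sibling are updated to bypass the page being freed)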
+ * + * Parameter: + * int tid, + * struct inode *ip, + * xtpage_t *p) + * + * returns: + */ +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) +{ + int rc = 0; + struct metapage *mp; + s64 nextbn, prevbn; + struct tlock *tlck; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update prev pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.prev = cpu_to_le64(prevbn); + + XT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update next pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.next = le64_to_cpu(nextbn); + + XT_PUTPAGE(mp); + } + + return 0; +} +#endif /* _STILL_TO_PORT */ + + +/* + * xtInitRoot() + * + * initialize file root (inline in inode) + */ +void xtInitRoot(tid_t tid, struct inode *ip) +{ + xtpage_t *p; + + /* + * acquire a transaction lock on the root + * + * action: + */ + txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag, + tlckXTREE | tlckNEW); + p = &JFS_IP(ip)->i_xtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + if (S_ISDIR(ip->i_mode)) + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR); + else { + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); + ip->i_size = 0; + } + + + return; +} + + +/* + * We can run into a deadlock truncating a file with a large number of + * xtree pages (large fragmented file). A robust fix would entail a + * reservation system where we would reserve a number of metadata pages + * and tlocks which we would be guaranteed without a deadlock. Without + * this, a partial fix is to limit number of metadata pages we will lock + * in a single transaction. Currently we will truncate the file so that + * no more than 50 leaf pages will be locked. The caller of xtTruncate + * will be responsible for ensuring that the current transaction gets + * committed, and that subsequent transactions are created to truncate + * the file further if needed. + */ +#define MAX_TRUNCATE_LEAVES 50 + +/* + * xtTruncate() + * + * function: + * traverse for truncation logging backward bottom up; + * terminate at the last extent entry at the current subtree + * root page covering new down size. + * truncation may occur within the last extent entry. + * + * parameter: + * int tid, + * struct inode *ip, + * s64 newsize, + * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} + * + * return: + * + * note: + * PWMAP: + * 1. truncate (non-COMMIT_NOLINK file) + * by jfs_truncate() or jfs_open(O_TRUNC): + * xtree is updated; + * 2. truncate index table of directory when last entry removed + * map update via tlock at commit time; + * PMAP: + * Call xtTruncate_pmap instead + * WMAP: + * 1. remove (free zero link count) on last reference release + * (pmap has been freed at commit zero link count); + * 2. 
truncate (COMMIT_NOLINK file, i.e., tmp file): + * xtree is updated; + * map update directly at truncation time; + * + * if (DELETE) + * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); + * else if (TRUNCATE) + * must write LOG_NOREDOPAGE for deleted index page; + * + * pages may already have been tlocked by anonymous transactions + * during file growth (i.e., write) before truncation; + * + * except last truncated entry, deleted entries remains as is + * in the page (nextindex is updated) for other use + * (e.g., log/update allocation map): this avoid copying the page + * info but delay free of pages; + * + */ +s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) +{ + int rc = 0; + s64 teof; + struct metapage *mp; + xtpage_t *p; + s64 bn; + int index, nextindex; + xad_t *xad; + s64 xoff, xaddr; + int xlen, len, freexlen; + struct btstack btstack; + struct btframe *parent; + struct tblock *tblk = NULL; + struct tlock *tlck = NULL; + struct xtlock *xtlck = NULL; + struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + s64 nfreed; + int freed, log; + int locked_leaves = 0; + + /* save object truncation type */ + if (tid) { + tblk = tid_to_tblock(tid); + tblk->xflag |= flag; + } + + nfreed = 0; + + flag &= COMMIT_MAP; + assert(flag != COMMIT_PMAP); + + if (flag == COMMIT_PWMAP) + log = 1; + else { + log = 0; + xadlock.flag = mlckFREEXADLIST; + xadlock.index = 1; + } + + /* + * if the newsize is not an integral number of pages, + * the file between newsize and next page boundary will + * be cleared. + * if truncating into a file hole, it will cause + * a full block to be allocated for the logical block. + */ + + /* + * release page blocks of truncated region + * + * free the data blocks from the leaf index blocks. + * delete the parent index entries corresponding to + * the freed child data/index blocks. + * free the index blocks themselves which aren't needed + * in new sized file. + * + * index blocks are updated only if the blocks are to be + * retained in the new sized file. + * if type is PMAP, the data and index pages are NOT + * freed, and the data and index blocks are NOT freed + * from working map. + * (this will allow continued access of data/index of + * temporary file (zerolink count file truncated to zero-length)). + */ + teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + + /* clear stack */ + BT_CLR(&btstack); + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + + /* Since this is the rightmost page at this level, and we may have + * already freed a page that was formerly to the right, let's make + * sure that the next pointer is zero. + */ + if (p->header.next) { + if (log) + /* + * Make sure this change to the header is logged. + * If we really truncate this leaf, the flag + * will be changed to tlckTRUNCATE + */ + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + BT_MARK_DIRTY(mp, ip); + p->header.next = 0; + } + + if (p->header.flag & BT_INTERNAL) + goto getChild; + + /* + * leaf page + */ + freed = 0; + + /* does region covered by leaf page precede Teof ? 
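+ * (if the last extent on this leaf ends at or below teof, nothing on
+ * this page needs freeing and the scan resumes at the parent)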
*/ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + if (teof >= xoff + xlen) { + XT_PUTPAGE(mp); + goto getParent; + } + + /* (re)acquire tlock of the leaf page */ + if (log) { + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + XT_PUTPAGE(mp); + newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + goto getParent; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckTRUNCATE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + } + BT_MARK_DIRTY(mp, ip); + + /* + * scan backward leaf page entries + */ + for (; index >= XTENTRYSTART; index--) { + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * The "data" for a directory is indexed by the block + * device's address space. This metadata must be invalidated + * here + */ + if (S_ISDIR(ip->i_mode) && (teof == 0)) + invalidate_xad_metapages(ip, *xad); + /* + * entry beyond eof: continue scan of current page + * xad + * ---|---=======-------> + * eof + */ + if (teof < xoff) { + nfreed += xlen; + continue; + } + + /* + * (xoff <= teof): last entry to be deleted from page; + * If other entries remain in page: keep and update the page. + */ + + /* + * eof == entry_start: delete the entry + * xad + * -------|=======-------> + * eof + * + */ + if (teof == xoff) { + nfreed += xlen; + + if (index == XTENTRYSTART) + break; + + nextindex = index; + } + /* + * eof within the entry: truncate the entry. + * xad + * -------===|===-------> + * eof + */ + else if (teof < xoff + xlen) { + /* update truncated entry */ + len = teof - xoff; + freexlen = xlen - len; + XADlength(xad, len); + + /* save pxd of truncated extent in tlck */ + xaddr += len; + if (log) { /* COMMIT_PWMAP */ + xtlck->lwm.offset = (xtlck->lwm.offset) ? 
+ min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = index + 1 - + xtlck->lwm.offset; + xtlck->twm.offset = index; + pxdlock = (struct pxd_lock *) & xtlck->pxdlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + } + /* free truncated extent */ + else { /* COMMIT_WMAP */ + + pxdlock = (struct pxd_lock *) & xadlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + + /* reset map lock */ + xadlock.flag = mlckFREEXADLIST; + } + + /* current entry is new last entry; */ + nextindex = index + 1; + + nfreed += freexlen; + } + /* + * eof beyond the entry: + * xad + * -------=======---|---> + * eof + */ + else { /* (xoff + xlen < teof) */ + + nextindex = index + 1; + } + + if (nextindex < le16_to_cpu(p->header.nextindex)) { + if (!log) { /* COMMIT_WAMP */ + xadlock.xdlist = &p->xad[nextindex]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + nextindex; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + p->header.nextindex = cpu_to_le16(nextindex); + } + + XT_PUTPAGE(mp); + + /* assert(freed == 0); */ + goto getParent; + } /* end scan of leaf page entries */ + + freed = 1; + + /* + * leaf page become empty: free the page if type != PMAP + */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free data extents covered by leaf [XTENTRYSTART:hwm); + * invalidate leaf if COMMIT_PWMAP; + * if (TRUNCATE), will write LOG_NOREDOPAGE; + */ + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WAMP */ + + /* free data extents covered by leaf */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); + } + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; + + /* invalidate empty leaf page */ + discard_metapage(mp); + } + } + + /* + * the leaf page become empty: delete the parent entry + * for the leaf page if the parent page is to be kept + * in the new sized file. + */ + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * child page was not empty: + */ + if (freed == 0) { + /* has any entry deleted from parent ? */ + if (index < le16_to_cpu(p->header.nextindex) - 1) { + /* (re)acquire tlock on the parent page */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckTRUNCATE: + * free child extents covered by parent [); + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + if (!(tlck->type & tlckTRUNCATE)) { + xtlck->hwm.offset = + le16_to_cpu(p->header. 
+ nextindex) - 1; + tlck->type = + tlckXTREE | tlckTRUNCATE; + } + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[index + 1]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + index - 1; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + p->header.nextindex = cpu_to_le16(index + 1); + } + XT_PUTPAGE(mp); + goto getParent; + } + + /* + * child page was empty: + */ + nfreed += lengthXAD(&p->xad[index]); + + /* + * During working map update, child page's tlock must be handled + * before parent's. This is because the parent's tlock will cause + * the child's disk space to be marked available in the wmap, so + * it's important that the child page be released by that time. + * + * ToDo: tlocks should be on doubly-linked list, so we can + * quickly remove it and add it to the end. + */ + + /* + * Move parent page's tlock to the end of the tid's tlock list + */ + if (log && mp->lid && (tblk->last != mp->lid) && + lid_to_tlock(mp->lid)->tid) { + lid_t lid = mp->lid; + struct tlock *prev; + + tlck = lid_to_tlock(lid); + + if (tblk->next == lid) + tblk->next = tlck->next; + else { + for (prev = lid_to_tlock(tblk->next); + prev->next != lid; + prev = lid_to_tlock(prev->next)) { + assert(prev->next); + } + prev->next = tlck->next; + } + lid_to_tlock(tblk->last)->next = lid; + tlck->next = 0; + tblk->last = lid; + } + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, + COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) { + /* + * Shrink root down to allow inline + * EA (otherwise fsck complains) + */ + p->header.maxentry = + cpu_to_le16(XTROOTINITSLOT); + JFS_IP(ip)->mode2 |= INLINEEA; + } + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= + tlckFREELOCK; + + /* invalidate parent page */ + discard_metapage(mp); + } + + /* parent has become empty and freed: + * go back up to its parent page + */ + /* freed = 1; */ + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else { + /* try truncate region covered by preceding entry + * (process backward) + */ + index--; + + /* go back down to the child page corresponding + * to the entry + */ + goto getChild; + } + + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtTruncate!"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = 
&p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + /* + * update file resource stat + */ + /* set size + */ + if (S_ISDIR(ip->i_mode) && !newsize) + ip->i_size = 1; /* fsck hates zero-length directories */ + else + ip->i_size = newsize; + + /* update quota allocation to reflect freed blocks */ + dquot_free_block(ip, nfreed); + + /* + * free tlock of invalidated pages + */ + if (flag == COMMIT_WMAP) + txFreelock(ip); + + return newsize; +} + + +/* + * xtTruncate_pmap() + * + * function: + * Perform truncate to zero length for deleted file, leaving the + * the xtree and working map untouched. This allows the file to + * be accessed via open file handles, while the delete of the file + * is committed to disk. + * + * parameter: + * tid_t tid, + * struct inode *ip, + * s64 committed_size) + * + * return: new committed size + * + * note: + * + * To avoid deadlock by holding too many transaction locks, the + * truncation may be broken up into multiple transactions. + * The committed_size keeps track of part of the file has been + * freed from the pmaps. + */ +s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) +{ + s64 bn; + struct btstack btstack; + int cmp; + int index; + int locked_leaves = 0; + struct metapage *mp; + xtpage_t *p; + struct btframe *parent; + int rc; + struct tblock *tblk; + struct tlock *tlck = NULL; + xad_t *xad; + int xlen; + s64 xoff; + struct xtlock *xtlck = NULL; + + /* save object truncation type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* clear stack */ + BT_CLR(&btstack); + + if (committed_size) { + xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1; + rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); + if (rc) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtTruncate_pmap: did not find extent"); + return -EIO; + } + } else { + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) + goto getChild; + } + + /* + * leaf page + */ + + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + XT_PUTPAGE(mp); + return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckFREE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = index; + + + XT_PUTPAGE(mp); + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + 
xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + + XT_PUTPAGE(mp); + + if (p->header.flag & BT_ROOT) { + + goto out; + } else { + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else + index--; + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + + return 0; +} + +#ifdef CONFIG_JFS_STATISTICS +static int jfs_xtstat_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Xtree statistics\n" + "====================\n" + "searches = %d\n" + "fast searches = %d\n" + "splits = %d\n", + xtStat.search, + xtStat.fastSearch, + xtStat.split); + return 0; +} + +static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_xtstat_proc_show, NULL); +} + +const struct file_operations jfs_xtstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_xtstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h new file mode 100644 index 00000000..08c0c749 --- /dev/null +++ b/fs/jfs/jfs_xtree.h @@ -0,0 +1,132 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_XTREE +#define _H_JFS_XTREE + +/* + * jfs_xtree.h: extent allocation descriptor B+-tree manager + */ + +#include "jfs_btree.h" + + +/* + * extent allocation descriptor (xad) + */ +typedef struct xad { + unsigned flag:8; /* 1: flag */ + unsigned rsvrd:16; /* 2: reserved */ + unsigned off1:8; /* 1: offset in unit of fsblksize */ + __le32 off2; /* 4: offset in unit of fsblksize */ + unsigned len:24; /* 3: length in unit of fsblksize */ + unsigned addr1:8; /* 1: address in unit of fsblksize */ + __le32 addr2; /* 4: address in unit of fsblksize */ +} xad_t; /* (16) */ + +#define MAXXLEN ((1 << 24) - 1) + +#define XTSLOTSIZE 16 +#define L2XTSLOTSIZE 4 + +/* xad_t field construction */ +#define XADoffset(xad, offset64)\ +{\ + (xad)->off1 = ((u64)offset64) >> 32;\ + (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ +} +#define XADaddress(xad, address64)\ +{\ + (xad)->addr1 = ((u64)address64) >> 32;\ + (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} +#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) + +/* xad_t field extraction */ +#define offsetXAD(xad)\ + ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) +#define addressXAD(xad)\ + ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) +#define lengthXAD(xad) __le24_to_cpu((xad)->len) + +/* xad list */ +struct xadlist { + s16 maxnxad; + s16 nxad; + xad_t *xad; +}; + +/* xad_t flags */ +#define XAD_NEW 0x01 /* new */ +#define XAD_EXTENDED 0x02 /* extended */ +#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ +#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ +#define XAD_COW 0x10 /* copy-on-write */ + + +/* possible values for maxentry */ +#define XTROOTINITSLOT_DIR 6 +#define XTROOTINITSLOT 10 +#define XTROOTMAXSLOT 18 +#define XTPAGEMAXSLOT 256 +#define XTENTRYSTART 2 + +/* + * xtree page: + */ +typedef union { + struct xtheader { + __le64 next; /* 8: */ + __le64 prev; /* 8: */ + + u8 flag; /* 1: */ + u8 rsrvd1; /* 1: */ + __le16 nextindex; /* 2: next index = number of entries */ + __le16 maxentry; /* 2: max number of entries */ + __le16 rsrvd2; /* 2: */ + + pxd_t self; /* 8: self */ + } header; /* (32) */ + + xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */ +} xtpage_t; + +/* + * external declaration + */ +extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, + int *pflag, s64 * paddr, int *plen, int flag); +extern void xtInitRoot(tid_t tid, struct inode *ip); +extern int xtInsert(tid_t tid, struct inode *ip, + int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag); +extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +#ifdef _NOTYET +extern int xtTailgate(tid_t tid, struct inode *ip, + s64 xoff, int xlen, s64 xaddr, int flag); +#endif +extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); +extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); +extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); +extern int xtRelocate(tid_t tid, struct inode *ip, + xad_t * oxad, s64 nxaddr, int xtype); +extern int xtAppend(tid_t tid, + struct inode *ip, int xflag, s64 xoff, int maxblocks, + int *xlenp, s64 * xaddrp, int flag); +#endif /* !_H_JFS_XTREE */ diff --git 
a/fs/jfs/namei.c b/fs/jfs/namei.c new file mode 100644 index 00000000..eaaf2b51 --- /dev/null +++ b/fs/jfs/namei.c @@ -0,0 +1,1639 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_inode.h" +#include "jfs_dinode.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_metapage.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +/* + * forward references + */ +const struct dentry_operations jfs_ci_dentry_operations; + +static s64 commitZeroLink(tid_t, struct inode *); + +/* + * NAME: free_ea_wmap(inode) + * + * FUNCTION: free uncommitted extended attributes from working map + * + */ +static inline void free_ea_wmap(struct inode *inode) +{ + dxd_t *ea = &JFS_IP(inode)->ea; + + if (ea->flag & DXD_EXTENT) { + /* free EA pages from cache */ + invalidate_dxd_metapages(inode, *ea); + dbFree(inode, addressDXD(ea), lengthDXD(ea)); + } + ea->flag = 0; +} + +/* + * NAME: jfs_create(dip, dentry, mode) + * + * FUNCTION: create a regular file in the parent directory + * with name = and mode = + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of new file + * mode - create mode (rwxrwxrwx). + * nd- nd struct + * + * RETURN: Errors from subroutines + * + */ +static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + struct component_name dname; /* child directory name */ + struct btstack btstack; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); + + dquot_initialize(dip); + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. 
+ */ + ip = ialloc(dip, mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dip); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jfs_err("jfs_create: dtSearch returned %d", rc); + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child XAD tree root in-line in inode + */ + xtInitRoot(tid, ip); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + if (rc == -EIO) { + jfs_err("jfs_create: dtInsert returned -EIO"); + txAbort(tid, 1); /* Marks Filesystem dirty */ + } else + txAbort(tid, 0); /* Filesystem full */ + goto out3; + } + + ip->i_op = &jfs_file_inode_operations; + ip->i_fop = &jfs_file_operations; + ip->i_mapping->a_ops = &jfs_aops; + + mark_inode_dirty(ip); + + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + ip->i_nlink = 0; + unlock_new_inode(ip); + iput(ip); + } else { + d_instantiate(dentry, ip); + unlock_new_inode(ip); + } + + out2: + free_UCSname(&dname); + + out1: + + jfs_info("jfs_create: rc:%d", rc); + return rc; +} + + +/* + * NAME: jfs_mkdir(dip, dentry, mode) + * + * FUNCTION: create a child directory in the parent directory + * with name = and mode = + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of child directory + * mode - create mode (rwxrwxrwx). + * + * RETURN: Errors from subroutines + * + * note: + * EACCESS: user needs search+write permission on the parent directory + */ +static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + struct component_name dname; /* child directory name */ + struct btstack btstack; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); + + dquot_initialize(dip); + + /* link count overflow on parent directory ? */ + if (dip->i_nlink == JFS_LINK_MAX) { + rc = -EMLINK; + goto out1; + } + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. 
+ */ + ip = ialloc(dip, S_IFDIR | mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dip); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jfs_err("jfs_mkdir: dtSearch returned %d", rc); + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child directory in-line in inode + */ + dtInitRoot(tid, ip, dip->i_ino); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + if (rc == -EIO) { + jfs_err("jfs_mkdir: dtInsert returned -EIO"); + txAbort(tid, 1); /* Marks Filesystem dirty */ + } else + txAbort(tid, 0); /* Filesystem full */ + goto out3; + } + + ip->i_nlink = 2; /* for '.' */ + ip->i_op = &jfs_dir_inode_operations; + ip->i_fop = &jfs_dir_operations; + + mark_inode_dirty(ip); + + /* update parent directory inode */ + inc_nlink(dip); /* for '..' from child directory */ + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + ip->i_nlink = 0; + unlock_new_inode(ip); + iput(ip); + } else { + d_instantiate(dentry, ip); + unlock_new_inode(ip); + } + + out2: + free_UCSname(&dname); + + + out1: + + jfs_info("jfs_mkdir: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_rmdir(dip, dentry) + * + * FUNCTION: remove a link to child directory + * + * PARAMETER: dip - parent inode + * dentry - child directory dentry + * + * RETURN: -EINVAL - if name is . or .. + * -EINVAL - if . or .. exist but are invalid. + * errors from subroutines + * + * note: + * if other threads have the directory open when the last link + * is removed, the "." and ".." entries, if present, are removed before + * rmdir() returns and no new entries may be created in the directory, + * but the directory is not removed until the last reference to + * the directory is released (cf.unlink() of regular file). + */ +static int jfs_rmdir(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = dentry->d_inode; + ino_t ino; + struct component_name dname; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); + + /* Init inode for quota operations. 
*/ + dquot_initialize(dip); + dquot_initialize(ip); + + /* directory must be empty to be removed */ + if (!dtEmpty(ip)) { + rc = -ENOTEMPTY; + goto out; + } + + if ((rc = get_UCSname(&dname, dentry))) { + goto out; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + iplist[0] = dip; + iplist[1] = ip; + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = ip; + + /* + * delete the entry of target directory from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jfs_err("jfs_rmdir: dtDelete returned %d", rc); + if (rc == -EIO) + txAbort(tid, 1); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + goto out2; + } + + /* update parent directory's link count corresponding + * to ".." entry of the target directory deleted + */ + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + inode_dec_link_count(dip); + + /* + * OS/2 could have created EA and/or ACL + */ + /* free EA from both persistent and working map */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + /* free EA pages */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + } + JFS_IP(ip)->ea.flag = 0; + + /* free ACL from both persistent and working map */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + /* free ACL pages */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + } + JFS_IP(ip)->acl.flag = 0; + + /* mark the target directory as deleted */ + clear_nlink(ip); + mark_inode_dirty(ip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + out2: + free_UCSname(&dname); + + out: + jfs_info("jfs_rmdir: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_unlink(dip, dentry) + * + * FUNCTION: remove a link to object named by + * from parent directory + * + * PARAMETER: dip - inode of parent directory + * dentry - dentry of object to be removed + * + * RETURN: errors from subroutines + * + * note: + * temporary file: if one or more processes have the file open + * when the last link is removed, the link will be removed before + * unlink() returns, but the removal of the file contents will be + * postponed until all references to the files are closed. + * + * JFS does NOT support unlink() on directories. + * + */ +static int jfs_unlink(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = dentry->d_inode; + ino_t ino; + struct component_name dname; /* object name */ + struct inode *iplist[2]; + struct tblock *tblk; + s64 new_size = 0; + int commit_flag; + + jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); + + /* Init inode for quota operations. 
*/ + dquot_initialize(dip); + dquot_initialize(ip); + + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + iplist[0] = dip; + iplist[1] = ip; + + /* + * delete the entry of target file from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jfs_err("jfs_unlink: dtDelete returned %d", rc); + if (rc == -EIO) + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + IWRITE_UNLOCK(ip); + goto out1; + } + + ASSERT(ip->i_nlink); + + ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + + /* update target's inode */ + inode_dec_link_count(ip); + + /* + * commit zero link count object + */ + if (ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + IWRITE_UNLOCK(ip); + rc = new_size; + goto out1; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = ip; + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. + */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + /* + * If xtTruncate was incomplete, commit synchronously to avoid + * timing complications + */ + rc = txCommit(tid, 2, &iplist[0], commit_flag); + + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + while (new_size && (rc == 0)) { + tid = txBegin(dip->i_sb, 0); + mutex_lock(&JFS_IP(ip)->commit_mutex); + new_size = xtTruncate_pmap(tid, ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = new_size; + } else + rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + } + + if (ip->i_nlink == 0) + set_cflag(COMMIT_Nolink, ip); + + IWRITE_UNLOCK(ip); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + out1: + free_UCSname(&dname); + out: + jfs_info("jfs_unlink: rc:%d", rc); + return rc; +} + +/* + * NAME: commitZeroLink() + * + * FUNCTION: for non-directory, called by jfs_remove(), + * truncate a regular file, directory or symbolic + * link to zero length. return 0 if type is not + * one of these. + * + * if the file is currently associated with a VM segment + * only permanent disk and inode map resources are freed, + * and neither the inode nor indirect blocks are modified + * so that the resources can be later freed in the work + * map by ctrunc1. + * if there is no VM segment on entry, the resources are + * freed in both work and permanent map. + * (? for temporary file - memory object is cached even + * after no reference: + * reference count > 0 - ) + * + * PARAMETERS: cd - pointer to commit data structure. + * current inode is the one to truncate. 
+ * + * RETURN: Errors from subroutines + */ +static s64 commitZeroLink(tid_t tid, struct inode *ip) +{ + int filetype; + struct tblock *tblk; + + jfs_info("commitZeroLink: tid = %d, ip = 0x%p", tid, ip); + + filetype = ip->i_mode & S_IFMT; + switch (filetype) { + case S_IFREG: + break; + case S_IFLNK: + /* fast symbolic link */ + if (ip->i_size < IDATASIZE) { + ip->i_size = 0; + return 0; + } + break; + default: + assert(filetype != S_IFDIR); + return 0; + } + + set_cflag(COMMIT_Freewmap, ip); + + /* mark transaction of block map update type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache if COMMIT_PWMAP, + * free xtree/data blocks from persistent block map, and + * free xtree/data blocks from working block map if COMMIT_PWMAP; + */ + if (ip->i_size) + return xtTruncate_pmap(tid, ip, 0); + + return 0; +} + + +/* + * NAME: jfs_free_zero_link() + * + * FUNCTION: for non-directory, called by iClose(), + * free resources of a file from cache and WORKING map + * for a file previously committed with zero link count + * while associated with a pager object, + * + * PARAMETER: ip - pointer to inode of file. + */ +void jfs_free_zero_link(struct inode *ip) +{ + int type; + + jfs_info("jfs_free_zero_link: ip = 0x%p", ip); + + /* return if not reg or symbolic link or if size is + * already ok. + */ + type = ip->i_mode & S_IFMT; + + switch (type) { + case S_IFREG: + break; + case S_IFLNK: + /* if its contained in inode nothing to do */ + if (ip->i_size < IDATASIZE) + return; + break; + default: + return; + } + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + s64 xaddr = addressDXD(&JFS_IP(ip)->ea); + int xlen = lengthDXD(&JFS_IP(ip)->ea); + struct maplock maplock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + + /* free EA pages from cache */ + invalidate_dxd_metapages(ip, JFS_IP(ip)->ea); + + /* free EA extent from working block map */ + maplock.index = 1; + pxdlock = (struct pxd_lock *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + } + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + s64 xaddr = addressDXD(&JFS_IP(ip)->acl); + int xlen = lengthDXD(&JFS_IP(ip)->acl); + struct maplock maplock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + + invalidate_dxd_metapages(ip, JFS_IP(ip)->acl); + + /* free ACL extent from working block map */ + maplock.index = 1; + pxdlock = (struct pxd_lock *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + } + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache, and + * free xtree/data blocks from working block map; + */ + if (ip->i_size) + xtTruncate(0, ip, 0, COMMIT_WMAP); +} + +/* + * NAME: jfs_link(vp, dvp, name, crp) + * + * FUNCTION: create a link to by the name = + * in the parent directory + * + * PARAMETER: vp - target object + * dvp - parent directory of new link + 
* name - name of new link to target object + * crp - credential + * + * RETURN: Errors from subroutines + * + * note: + * JFS does NOT support link() on directories (to prevent circular + * path in the directory hierarchy); + * EPERM: the target object is a directory, and either the caller + * does not have appropriate privileges or the implementation prohibits + * using link() on directories [XPG4.2]. + * + * JFS does NOT support links between file systems: + * EXDEV: target object and new link are on different file systems and + * implementation does not support links between file systems [XPG4.2]. + */ +static int jfs_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + int rc; + tid_t tid; + struct inode *ip = old_dentry->d_inode; + ino_t ino; + struct component_name dname; + struct btstack btstack; + struct inode *iplist[2]; + + jfs_info("jfs_link: %s %s", old_dentry->d_name.name, + dentry->d_name.name); + + if (ip->i_nlink == JFS_LINK_MAX) + return -EMLINK; + + dquot_initialize(dir); + + tid = txBegin(ip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + /* + * scan parent directory for entry/freespace + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) + goto free_dname; + + /* + * create entry for new link in parent directory + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) + goto free_dname; + + /* update object inode */ + inc_nlink(ip); /* for new link */ + ip->i_ctime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + mark_inode_dirty(dir); + ihold(ip); + + iplist[0] = ip; + iplist[1] = dir; + rc = txCommit(tid, 2, &iplist[0], 0); + + if (rc) { + ip->i_nlink--; /* never instantiated */ + iput(ip); + } else + d_instantiate(dentry, ip); + + free_dname: + free_UCSname(&dname); + + out: + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); + + jfs_info("jfs_link: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_symlink(dip, dentry, name) + * + * FUNCTION: creates a symbolic link to by name + * in directory + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of symbolic link + * name - the path name of the existing object + * that will be the source of the link + * + * RETURN: errors from subroutines + * + * note: + * ENAMETOOLONG: pathname resolution of a symbolic link produced + * an intermediate result whose length exceeds PATH_MAX [XPG4.2] +*/ + +static int jfs_symlink(struct inode *dip, struct dentry *dentry, + const char *name) +{ + int rc; + tid_t tid; + ino_t ino = 0; + struct component_name dname; + int ssize; /* source pathname size */ + struct btstack btstack; + struct inode *ip = dentry->d_inode; + unchar *i_fastsymlink; + s64 xlen = 0; + int bmask = 0, xsize; + s64 extent = 0, xaddr; + struct metapage *mp; + struct super_block *sb; + struct tblock *tblk; + + struct inode *iplist[2]; + + jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); + + dquot_initialize(dip); + + ssize = strlen(name) + 1; + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * allocate on-disk/in-memory inode for symbolic link: + * (iAlloc() returns new, locked inode) + */ + ip = ialloc(dip, S_IFLNK | 0777); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } 
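+	/*
+	 * From here the new symlink is built under a single transaction:
+	 * take the commit mutexes on parent and child, initialize
+	 * security attributes, write the target path (inline in the
+	 * on-disk inode for short paths, in an allocated extent
+	 * otherwise), insert the directory entry, and commit both
+	 * inodes together.  On failure the new inode is discarded.
+	 */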
+ + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) + goto out3; + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + /* fix symlink access permission + * (dir_create() ANDs in the u.u_cmask, + * but symlinks really need to be 777 access) + */ + ip->i_mode |= 0777; + + /* + * write symbolic link target path name + */ + xtInitRoot(tid, ip); + + /* + * write source path name inline in on-disk inode (fast symbolic link) + */ + + if (ssize <= IDATASIZE) { + ip->i_op = &jfs_fast_symlink_inode_operations; + + i_fastsymlink = JFS_IP(ip)->i_inline; + memcpy(i_fastsymlink, name, ssize); + ip->i_size = ssize - 1; + + /* + * if symlink is > 128 bytes, we don't have the space to + * store inline extended attributes + */ + if (ssize > sizeof (JFS_IP(ip)->i_inline)) + JFS_IP(ip)->mode2 &= ~INLINEEA; + + jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ", + ssize, name); + } + /* + * write source path name in a single extent + */ + else { + jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); + + ip->i_op = &jfs_symlink_inode_operations; + ip->i_mapping->a_ops = &jfs_aops; + + /* + * even though the data of symlink object (source + * path name) is treated as non-journaled user data, + * it is read/written thru buffer cache for performance. + */ + sb = ip->i_sb; + bmask = JFS_SBI(sb)->bsize - 1; + xsize = (ssize + bmask) & ~bmask; + xaddr = 0; + xlen = xsize >> JFS_SBI(sb)->l2bsize; + if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) { + txAbort(tid, 0); + goto out3; + } + extent = xaddr; + ip->i_size = ssize - 1; + while (ssize) { + /* This is kind of silly since PATH_MAX == 4K */ + int copy_size = min(ssize, PSIZE); + + mp = get_metapage(ip, xaddr, PSIZE, 1); + + if (mp == NULL) { + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + rc = -EIO; + txAbort(tid, 0); + goto out3; + } + memcpy(mp->data, name, copy_size); + flush_metapage(mp); + ssize -= copy_size; + name += copy_size; + xaddr += JFS_SBI(sb)->nbperpage; + } + } + + /* + * create entry for symbolic link in parent directory + */ + rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE); + if (rc == 0) { + ino = ip->i_ino; + rc = dtInsert(tid, dip, &dname, &ino, &btstack); + } + if (rc) { + if (xlen) + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + txAbort(tid, 0); + /* discard new inode */ + goto out3; + } + + mark_inode_dirty(ip); + + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + /* + * commit update of parent directory and link object + */ + + iplist[0] = dip; + iplist[1] = ip; + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + ip->i_nlink = 0; + unlock_new_inode(ip); + iput(ip); + } else { + d_instantiate(dentry, ip); + unlock_new_inode(ip); + } + + out2: + free_UCSname(&dname); + + out1: + jfs_info("jfs_symlink: rc:%d", rc); + return rc; +} + + +/* + * NAME: jfs_rename + * + * FUNCTION: rename a file or directory + */ +static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btstack btstack; + ino_t ino; + struct component_name new_dname; + struct inode *new_ip; + struct component_name old_dname; + struct inode *old_ip; + int rc; + tid_t tid; + struct 
tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + int ipcount; + struct inode *iplist[4]; + struct tblock *tblk; + s64 new_size = 0; + int commit_flag; + + + jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, + new_dentry->d_name.name); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_ip = old_dentry->d_inode; + new_ip = new_dentry->d_inode; + + if ((rc = get_UCSname(&old_dname, old_dentry))) + goto out1; + + if ((rc = get_UCSname(&new_dname, new_dentry))) + goto out2; + + /* + * Make sure source inode number is what we think it is + */ + rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP); + if (rc || (ino != old_ip->i_ino)) { + rc = -ENOENT; + goto out3; + } + + /* + * Make sure dest inode number (if any) is what we think it is + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); + if (!rc) { + if ((!new_ip) || (ino != new_ip->i_ino)) { + rc = -ESTALE; + goto out3; + } + } else if (rc != -ENOENT) + goto out3; + else if (new_ip) { + /* no entry exists, but one was expected */ + rc = -ESTALE; + goto out3; + } + + if (S_ISDIR(old_ip->i_mode)) { + if (new_ip) { + if (!dtEmpty(new_ip)) { + rc = -ENOTEMPTY; + goto out3; + } + } else if ((new_dir != old_dir) && + (new_dir->i_nlink == JFS_LINK_MAX)) { + rc = -EMLINK; + goto out3; + } + } else if (new_ip) { + IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); + /* Init inode for quota operations. */ + dquot_initialize(new_ip); + } + + /* + * The real work starts here + */ + tid = txBegin(new_dir->i_sb, 0); + + /* + * How do we know the locking is safe from deadlocks? + * The vfs does the hard part for us. Any time we are taking nested + * commit_mutexes, the vfs already has i_mutex held on the parent. + * Here, the vfs has already taken i_mutex on both old_dir and new_dir. 
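+	 * The lock classes passed to mutex_lock_nested() below
+	 * (COMMIT_MUTEX_PARENT, _CHILD, _SECOND_PARENT and _VICTIM)
+	 * encode that ordering for lockdep.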
+ */ + mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD); + if (old_dir != new_dir) + mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex, + COMMIT_MUTEX_SECOND_PARENT); + + if (new_ip) { + mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex, + COMMIT_MUTEX_VICTIM); + /* + * Change existing directory entry to new inode number + */ + ino = new_ip->i_ino; + rc = dtModify(tid, new_dir, &new_dname, &ino, + old_ip->i_ino, JFS_RENAME); + if (rc) + goto out4; + drop_nlink(new_ip); + if (S_ISDIR(new_ip->i_mode)) { + drop_nlink(new_ip); + if (new_ip->i_nlink) { + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); + if (!S_ISDIR(old_ip->i_mode) && new_ip) + IWRITE_UNLOCK(new_ip); + jfs_error(new_ip->i_sb, + "jfs_rename: new_ip->i_nlink != 0"); + return -EIO; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = new_ip; + } else if (new_ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, new_ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, new_ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = new_size; + goto out4; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = new_ip; + } else { + new_ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(new_ip); + } + } else { + /* + * Add new directory entry + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, + JFS_CREATE); + if (rc) { + jfs_err("jfs_rename didn't expect dtSearch to fail " + "w/rc = %d", rc); + goto out4; + } + + ino = old_ip->i_ino; + rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack); + if (rc) { + if (rc == -EIO) + jfs_err("jfs_rename: dtInsert returned -EIO"); + goto out4; + } + if (S_ISDIR(old_ip->i_mode)) + inc_nlink(new_dir); + } + /* + * Remove old directory entry + */ + + ino = old_ip->i_ino; + rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE); + if (rc) { + jfs_err("jfs_rename did not expect dtDelete to return rc = %d", + rc); + txAbort(tid, 1); /* Marks Filesystem dirty */ + goto out4; + } + if (S_ISDIR(old_ip->i_mode)) { + drop_nlink(old_dir); + if (old_dir != new_dir) { + /* + * Change inode number of parent for moved directory + */ + + JFS_IP(old_ip)->i_dtroot.header.idotdot = + cpu_to_le32(new_dir->i_ino); + + /* Linelock header of dtree */ + tlck = txLock(tid, old_ip, + (struct metapage *) &JFS_IP(old_ip)->bxflag, + tlckDTREE | tlckBTROOT | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + } + } + + /* + * Update ctime on changed/moved inodes & mark dirty + */ + old_ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_ip); + + new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb); + mark_inode_dirty(new_dir); + + /* Build list of inodes modified by this transaction */ + ipcount = 0; + iplist[ipcount++] = old_ip; + if (new_ip) + iplist[ipcount++] = new_ip; + iplist[ipcount++] = old_dir; + + if (old_dir != new_dir) { + iplist[ipcount++] = new_dir; + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + mark_inode_dirty(old_dir); + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. 
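+	 * The truncation begun by commitZeroLink() is finished further
+	 * below, after the commit mutexes are released, in additional
+	 * synchronous transactions driven by the returned new_size.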
+ */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + rc = txCommit(tid, ipcount, iplist, commit_flag); + + out4: + txEnd(tid); + if (new_ip) + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); + + while (new_size && (rc == 0)) { + tid = txBegin(new_ip->i_sb, 0); + mutex_lock(&JFS_IP(new_ip)->commit_mutex); + new_size = xtTruncate_pmap(tid, new_ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); + rc = new_size; + } else + rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC); + txEnd(tid); + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + } + if (new_ip && (new_ip->i_nlink == 0)) + set_cflag(COMMIT_Nolink, new_ip); + out3: + free_UCSname(&new_dname); + out2: + free_UCSname(&old_dname); + out1: + if (new_ip && !S_ISDIR(new_ip->i_mode)) + IWRITE_UNLOCK(new_ip); + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, old_dir)) { + if (old_dir->i_size > 1) + jfs_truncate_nolock(old_dir, 0); + + clear_cflag(COMMIT_Stale, old_dir); + } + + jfs_info("jfs_rename: returning %d", rc); + return rc; +} + + +/* + * NAME: jfs_mknod + * + * FUNCTION: Create a special file (device) + */ +static int jfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct jfs_inode_info *jfs_ip; + struct btstack btstack; + struct component_name dname; + ino_t ino; + struct inode *ip; + struct inode *iplist[2]; + int rc; + tid_t tid; + struct tblock *tblk; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + jfs_info("jfs_mknod: %s", dentry->d_name.name); + + dquot_initialize(dir); + + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + ip = ialloc(dir, mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out1; + } + jfs_ip = JFS_IP(ip); + + tid = txBegin(dir->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dir); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dir, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) { + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) { + txAbort(tid, 0); + goto out3; + } + + ip->i_op = &jfs_file_inode_operations; + jfs_ip->dev = new_encode_dev(rdev); + init_special_inode(ip, ip->i_mode, rdev); + + mark_inode_dirty(ip); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + mark_inode_dirty(dir); + + iplist[0] = dir; + iplist[1] = ip; + rc = txCommit(tid, 2, iplist, 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + ip->i_nlink = 0; + unlock_new_inode(ip); + iput(ip); + } else { + d_instantiate(dentry, ip); + unlock_new_inode(ip); + } + + out1: + free_UCSname(&dname); + + out: + jfs_info("jfs_mknod: returning %d", rc); + return rc; +} + +static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd) +{ + struct btstack btstack; + ino_t inum; + struct inode *ip; + struct component_name key; + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; 
+ int rc; + + jfs_info("jfs_lookup: name = %s", name); + + if ((name[0] == '.') && (len == 1)) + inum = dip->i_ino; + else if (strcmp(name, "..") == 0) + inum = PARENT(dip); + else { + if ((rc = get_UCSname(&key, dentry))) + return ERR_PTR(rc); + rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); + free_UCSname(&key); + if (rc == -ENOENT) { + d_add(dentry, NULL); + return NULL; + } else if (rc) { + jfs_err("jfs_lookup: dtSearch returned %d", rc); + return ERR_PTR(rc); + } + } + + ip = jfs_iget(dip->i_sb, inum); + if (IS_ERR(ip)) { + jfs_err("jfs_lookup: iget failed on inum %d", (uint) inum); + return ERR_CAST(ip); + } + + return d_splice_alias(ip, dentry); +} + +static struct inode *jfs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino == 0) + return ERR_PTR(-ESTALE); + inode = jfs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + jfs_nfs_get_inode); +} + +struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + jfs_nfs_get_inode); +} + +struct dentry *jfs_get_parent(struct dentry *dentry) +{ + unsigned long parent_ino; + + parent_ino = + le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot); + + return d_obtain_alias(jfs_iget(dentry->d_inode->i_sb, parent_ino)); +} + +const struct inode_operations jfs_dir_inode_operations = { + .create = jfs_create, + .lookup = jfs_lookup, + .link = jfs_link, + .unlink = jfs_unlink, + .symlink = jfs_symlink, + .mkdir = jfs_mkdir, + .rmdir = jfs_rmdir, + .mknod = jfs_mknod, + .rename = jfs_rename, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, + .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL + .check_acl = jfs_check_acl, +#endif +}; + +const struct file_operations jfs_dir_operations = { + .read = generic_read_dir, + .readdir = jfs_readdir, + .fsync = jfs_fsync, + .unlocked_ioctl = jfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = jfs_compat_ioctl, +#endif + .llseek = generic_file_llseek, +}; + +static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, + struct qstr *this) +{ + unsigned long hash; + int i; + + hash = init_name_hash(); + for (i=0; i < this->len; i++) + hash = partial_name_hash(tolower(this->name[i]), hash); + this->hash = end_name_hash(hash); + + return 0; +} + +static int jfs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + int i, result = 1; + + if (len != name->len) + goto out; + for (i=0; i < len; i++) { + if (tolower(str[i]) != tolower(name->name[i])) + goto out; + } + result = 0; +out: + return result; +} + +static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + /* + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, + * and will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. 
So it's unsupported like + * rename("filename", "FILENAME") for now. + */ + if (dentry->d_inode) + return 1; + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!nd) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. + */ + if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + } + return 1; +} + +const struct dentry_operations jfs_ci_dentry_operations = +{ + .d_hash = jfs_ci_hash, + .d_compare = jfs_ci_compare, + .d_revalidate = jfs_ci_revalidate, +}; diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c new file mode 100644 index 00000000..8d0c1c7c --- /dev/null +++ b/fs/jfs/resize.c @@ -0,0 +1,543 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dinode.h" +#include "jfs_imap.h" +#include "jfs_dmap.h" +#include "jfs_superblock.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + +#define BITSPERPAGE (PSIZE << 3) +#define L2MEGABYTE 20 +#define MEGABYTE (1 << L2MEGABYTE) +#define MEGABYTE32 (MEGABYTE << 5) + +/* convert block number to bmap file page number */ +#define BLKTODMAPN(b)\ + (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) + +/* + * jfs_extendfs() + * + * function: extend file system; + * + * |-------------------------------|----------|----------| + * file system space fsck inline log + * workspace space + * + * input: + * new LVSize: in LV blocks (required) + * new LogSize: in LV blocks (optional) + * new FSSize: in LV blocks (optional) + * + * new configuration: + * 1. set new LogSize as specified or default from new LVSize; + * 2. compute new FSCKSize from new LVSize; + * 3. 
set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where + * assert(new FSSize >= old FSSize), + * i.e., file system must not be shrunk; + */ +int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) +{ + int rc = 0; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipbmap = sbi->ipbmap; + struct inode *ipbmap2; + struct inode *ipimap = sbi->ipimap; + struct jfs_log *log = sbi->log; + struct bmap *bmp = sbi->bmap; + s64 newLogAddress, newFSCKAddress; + int newFSCKSize; + s64 newMapSize = 0, mapSize; + s64 XAddress, XSize, nblocks, xoff, xaddr, t64; + s64 oldLVSize; + s64 newFSSize; + s64 VolumeSize; + int newNpages = 0, nPages, newPage, xlen, t32; + int tid; + int log_formatted = 0; + struct inode *iplist[1]; + struct jfs_superblock *j_sb, *j_sb2; + s64 old_agsize; + int agsizechanged = 0; + struct buffer_head *bh, *bh2; + + /* If the volume hasn't grown, get out now */ + + if (sbi->mntflag & JFS_INLINELOG) + oldLVSize = addressPXD(&sbi->logpxd) + lengthPXD(&sbi->logpxd); + else + oldLVSize = addressPXD(&sbi->fsckpxd) + + lengthPXD(&sbi->fsckpxd); + + if (oldLVSize >= newLVSize) { + printk(KERN_WARNING + "jfs_extendfs: volume hasn't grown, returning\n"); + goto out; + } + + VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + + if (VolumeSize) { + if (newLVSize > VolumeSize) { + printk(KERN_WARNING "jfs_extendfs: invalid size\n"); + rc = -EINVAL; + goto out; + } + } else { + /* check the device */ + bh = sb_bread(sb, newLVSize - 1); + if (!bh) { + printk(KERN_WARNING "jfs_extendfs: invalid size\n"); + rc = -EINVAL; + goto out; + } + bforget(bh); + } + + /* Can't extend write-protected drive */ + + if (isReadOnly(ipbmap)) { + printk(KERN_WARNING "jfs_extendfs: read-only file system\n"); + rc = -EROFS; + goto out; + } + + /* + * reconfigure LV spaces + * --------------------- + * + * validate new size, or, if not specified, determine new size + */ + + /* + * reconfigure inline log space: + */ + if ((sbi->mntflag & JFS_INLINELOG)) { + if (newLogSize == 0) { + /* + * no size specified: default to 1/256 of aggregate + * size; rounded up to a megabyte boundary; + */ + newLogSize = newLVSize >> 8; + t32 = (1 << (20 - sbi->l2bsize)) - 1; + newLogSize = (newLogSize + t32) & ~t32; + newLogSize = + min(newLogSize, MEGABYTE32 >> sbi->l2bsize); + } else { + /* + * convert the newLogSize to fs blocks. + * + * Since this is given in megabytes, it will always be + * an even number of pages. 
+ */ + newLogSize = (newLogSize * MEGABYTE) >> sbi->l2bsize; + } + + } else + newLogSize = 0; + + newLogAddress = newLVSize - newLogSize; + + /* + * reconfigure fsck work space: + * + * configure it to the end of the logical volume regardless of + * whether file system extends to the end of the aggregate; + * Need enough 4k pages to cover: + * - 1 bit per block in aggregate rounded up to BPERDMAP boundary + * - 1 extra page to handle control page and intermediate level pages + * - 50 extra pages for the chkdsk service log + */ + t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP) + << L2BPERDMAP; + t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50; + newFSCKSize = t32 << sbi->l2nbperpage; + newFSCKAddress = newLogAddress - newFSCKSize; + + /* + * compute new file system space; + */ + newFSSize = newLVSize - newLogSize - newFSCKSize; + + /* file system cannot be shrunk */ + if (newFSSize < bmp->db_mapsize) { + rc = -EINVAL; + goto out; + } + + /* + * If we're expanding enough that the inline log does not overlap + * the old one, we can format the new log before we quiesce the + * filesystem. + */ + if ((sbi->mntflag & JFS_INLINELOG) && (newLogAddress > oldLVSize)) { + if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) + goto out; + log_formatted = 1; + } + /* + * quiesce file system + * + * (prepare to move the inline log and to prevent map update) + * + * block any new transactions and wait for completion of + * all wip transactions and flush modified pages s.t. + * on-disk file system is in consistent state and + * log is not required for recovery. + */ + txQuiesce(sb); + + /* Reset size of direct inode */ + sbi->direct_inode->i_size = sb->s_bdev->bd_inode->i_size; + + if (sbi->mntflag & JFS_INLINELOG) { + /* + * deactivate old inline log + */ + lmLogShutdown(log); + + /* + * mark on-disk super block for fs in transition; + * + * update on-disk superblock for the new space configuration + * of inline log space and fsck work space descriptors: + * N.B. 
FS descriptor is NOT updated; + * + * crash recovery: + * logredo(): if FM_EXTENDFS, return to fsck() for cleanup; + * fsck(): if FM_EXTENDFS, reformat inline log and fsck + * workspace from superblock inline log descriptor and fsck + * workspace descriptor; + */ + + /* read in superblock */ + if ((rc = readSuper(sb, &bh))) + goto error_out; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* mark extendfs() in progress */ + j_sb->s_state |= cpu_to_le32(FM_EXTENDFS); + j_sb->s_xsize = cpu_to_le64(newFSSize); + PXDaddress(&j_sb->s_xfsckpxd, newFSCKAddress); + PXDlength(&j_sb->s_xfsckpxd, newFSCKSize); + PXDaddress(&j_sb->s_xlogpxd, newLogAddress); + PXDlength(&j_sb->s_xlogpxd, newLogSize); + + /* synchronously update superblock */ + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + /* + * format new inline log synchronously; + * + * crash recovery: if log move in progress, + * reformat log and exit success; + */ + if (!log_formatted) + if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) + goto error_out; + + /* + * activate new log + */ + log->base = newLogAddress; + log->size = newLogSize >> (L2LOGPSIZE - sb->s_blocksize_bits); + if ((rc = lmLogInit(log))) + goto error_out; + } + + /* + * extend block allocation map + * --------------------------- + * + * extendfs() for new extension, retry after crash recovery; + * + * note: both logredo() and fsck() rebuild map from + * the bitmap and configuration parameter from superblock + * (disregarding all other control information in the map); + * + * superblock: + * s_size: aggregate size in physical blocks; + */ + /* + * compute the new block allocation map configuration + * + * map dinode: + * di_size: map file size in byte; + * di_nblocks: number of blocks allocated for map file; + * di_mapsize: number of blocks in aggregate (covered by map); + * map control page: + * db_mapsize: number of blocks in aggregate (covered by map); + */ + newMapSize = newFSSize; + /* number of data pages of new bmap file: + * roundup new size to full dmap page boundary and + * add 1 extra dmap page for next extendfs() + */ + t64 = (newMapSize - 1) + BPERDMAP; + newNpages = BLKTODMAPN(t64) + 1; + + /* + * extend map from current map (WITHOUT growing mapfile) + * + * map new extension with unmapped part of the last partial + * dmap page, if applicable, and extra page(s) allocated + * at end of bmap by mkfs() or previous extendfs(); + */ + extendBmap: + /* compute number of blocks requested to extend */ + mapSize = bmp->db_mapsize; + XAddress = mapSize; /* eXtension Address */ + XSize = newMapSize - mapSize; /* eXtension Size */ + old_agsize = bmp->db_agsize; /* We need to know if this changes */ + + /* compute number of blocks that can be extended by current mapfile */ + t64 = dbMapFileSizeToMapSize(ipbmap); + if (mapSize > t64) { + printk(KERN_ERR "jfs_extendfs: mapSize (0x%Lx) > t64 (0x%Lx)\n", + (long long) mapSize, (long long) t64); + rc = -EIO; + goto error_out; + } + nblocks = min(t64 - mapSize, XSize); + + /* + * update map pages for new extension: + * + * update/init dmap and bubble up the control hierarchy + * incrementally fold up dmaps into upper levels; + * update bmap control page; + */ + if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) + goto error_out; + + agsizechanged |= (bmp->db_agsize != old_agsize); + + /* + * the map now has extended to cover additional nblocks: + * dn_mapsize = oldMapsize + nblocks; + */ + /* ipbmap->i_mapsize += nblocks; */ + XSize -= nblocks; + + /* + * grow map file to cover remaining extension + * 
and/or one extra dmap page for next extendfs(); + * + * allocate new map pages and its backing blocks, and + * update map file xtree + */ + /* compute number of data pages of current bmap file */ + nPages = ipbmap->i_size >> L2PSIZE; + + /* need to grow map file ? */ + if (nPages == newNpages) + goto finalizeBmap; + + /* + * grow bmap file for the new map pages required: + * + * allocate growth at the start of newly extended region; + * bmap file only grows sequentially, i.e., both data pages + * and possibly xtree index pages may grow in append mode, + * s.t. logredo() can reconstruct pre-extension state + * by washing away bmap file of pages outside s_size boundary; + */ + /* + * journal map file growth as if a regular file growth: + * (note: bmap is created with di_mode = IFJOURNAL|IFREG); + * + * journaling of bmap file growth is not required since + * logredo() do/can not use log records of bmap file growth + * but it provides careful write semantics, pmap update, etc.; + */ + /* synchronous write of data pages: bmap data pages are + * cached in meta-data cache, and not written out + * by txCommit(); + */ + filemap_fdatawait(ipbmap->i_mapping); + filemap_write_and_wait(ipbmap->i_mapping); + diWriteSpecial(ipbmap, 0); + + newPage = nPages; /* first new page number */ + xoff = newPage << sbi->l2nbperpage; + xlen = (newNpages - nPages) << sbi->l2nbperpage; + xlen = min(xlen, (int) nblocks) & ~(sbi->nbperpage - 1); + xaddr = XAddress; + + tid = txBegin(sb, COMMIT_FORCE); + + if ((rc = xtAppend(tid, ipbmap, 0, xoff, nblocks, &xlen, &xaddr, 0))) { + txEnd(tid); + goto error_out; + } + /* update bmap file size */ + ipbmap->i_size += xlen << sbi->l2bsize; + inode_add_bytes(ipbmap, xlen << sbi->l2bsize); + + iplist[0] = ipbmap; + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + + if (rc) + goto error_out; + + /* + * map file has been grown now to cover extension to further out; + * di_size = new map file size; + * + * if huge extension, the previous extension based on previous + * map file size may not have been sufficient to cover whole extension + * (it could have been used up for new map pages), + * but the newly grown map file now covers lot bigger new free space + * available for further extension of map; + */ + /* any more blocks to extend ? */ + if (XSize) + goto extendBmap; + + finalizeBmap: + /* finalize bmap */ + dbFinalizeBmap(ipbmap); + + /* + * update inode allocation map + * --------------------------- + * + * move iag lists from old to new iag; + * agstart field is not updated for logredo() to reconstruct + * iag lists if system crash occurs. + * (computation of ag number from agstart based on agsize + * will correctly identify the new ag); + */ + /* if new AG size the same as old AG size, done! 
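+	 * Otherwise the inode allocation map is brought in line with the
+	 * new AG geometry by diExtendFS() and synced to disk.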
*/ + if (agsizechanged) { + if ((rc = diExtendFS(ipimap, ipbmap))) + goto error_out; + + /* finalize imap */ + if ((rc = diSync(ipimap))) + goto error_out; + } + + /* + * finalize + * -------- + * + * extension is committed when on-disk super block is + * updated with new descriptors: logredo will recover + * crash before it to pre-extension state; + */ + + /* sync log to skip log replay of bmap file growth transaction; */ + /* lmLogSync(log, 1); */ + + /* + * synchronous write bmap global control page; + * for crash before completion of write + * logredo() will recover to pre-extendfs state; + * for crash after completion of write, + * logredo() will recover post-extendfs state; + */ + if ((rc = dbSync(ipbmap))) + goto error_out; + + /* + * copy primary bmap inode to secondary bmap inode + */ + + ipbmap2 = diReadSpecial(sb, BMAP_I, 1); + if (ipbmap2 == NULL) { + printk(KERN_ERR "jfs_extendfs: diReadSpecial(bmap) failed\n"); + goto error_out; + } + memcpy(&JFS_IP(ipbmap2)->i_xtroot, &JFS_IP(ipbmap)->i_xtroot, 288); + ipbmap2->i_size = ipbmap->i_size; + ipbmap2->i_blocks = ipbmap->i_blocks; + + diWriteSpecial(ipbmap2, 1); + diFreeSpecial(ipbmap2); + + /* + * update superblock + */ + if ((rc = readSuper(sb, &bh))) + goto error_out; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* mark extendfs() completion */ + j_sb->s_state &= cpu_to_le32(~FM_EXTENDFS); + j_sb->s_size = cpu_to_le64(bmp->db_mapsize << + le16_to_cpu(j_sb->s_l2bfactor)); + j_sb->s_agsize = cpu_to_le32(bmp->db_agsize); + + /* update inline log space descriptor */ + if (sbi->mntflag & JFS_INLINELOG) { + PXDaddress(&(j_sb->s_logpxd), newLogAddress); + PXDlength(&(j_sb->s_logpxd), newLogSize); + } + + /* record log's mount serial number */ + j_sb->s_logserial = cpu_to_le32(log->serial); + + /* update fsck work space descriptor */ + PXDaddress(&(j_sb->s_fsckpxd), newFSCKAddress); + PXDlength(&(j_sb->s_fsckpxd), newFSCKSize); + j_sb->s_fscklog = 1; + /* sb->s_fsckloglen remains the same */ + + /* Update secondary superblock */ + bh2 = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); + if (bh2) { + j_sb2 = (struct jfs_superblock *)bh2->b_data; + memcpy(j_sb2, j_sb, sizeof (struct jfs_superblock)); + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh2); + brelse(bh2); + } + + /* write primary superblock */ + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + goto resume; + + error_out: + jfs_error(sb, "jfs_extendfs"); + + resume: + /* + * resume file system transactions + */ + txResume(sb); + + out: + return rc; +} diff --git a/fs/jfs/super.c b/fs/jfs/super.c new file mode 100644 index 00000000..06c8a67c --- /dev/null +++ b/fs/jfs/super.c @@ -0,0 +1,901 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_inode.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); +MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache * jfs_inode_cachep; + +static const struct super_operations jfs_super_operations; +static const struct export_operations jfs_export_operations; +static struct file_system_type jfs_fs_type; + +#define MAX_COMMIT_THREADS 64 +static int commit_threads = 0; +module_param(commit_threads, int, 0); +MODULE_PARM_DESC(commit_threads, "Number of commit threads"); + +static struct task_struct *jfsCommitThread[MAX_COMMIT_THREADS]; +struct task_struct *jfsIOthread; +struct task_struct *jfsSyncThread; + +#ifdef CONFIG_JFS_DEBUG +int jfsloglevel = JFS_LOGLEVEL_WARN; +module_param(jfsloglevel, int, 0644); +MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); +#endif + +static void jfs_handle_error(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (sb->s_flags & MS_RDONLY) + return; + + updateSuper(sb, FM_DIRTY); + + if (sbi->flag & JFS_ERR_PANIC) + panic("JFS (device %s): panic forced after error\n", + sb->s_id); + else if (sbi->flag & JFS_ERR_REMOUNT_RO) { + jfs_err("ERROR: (device %s): remounting filesystem " + "as read-only\n", + sb->s_id); + sb->s_flags |= MS_RDONLY; + } + + /* nothing is done for continue beyond marking the superblock dirty */ +} + +void jfs_error(struct super_block *sb, const char * function, ...) 
+{ + static char error_buf[256]; + va_list args; + + va_start(args, function); + vsnprintf(error_buf, sizeof(error_buf), function, args); + va_end(args); + + printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf); + + jfs_handle_error(sb); +} + +static struct inode *jfs_alloc_inode(struct super_block *sb) +{ + struct jfs_inode_info *jfs_inode; + + jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS); + if (!jfs_inode) + return NULL; + return &jfs_inode->vfs_inode; +} + +static void jfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct jfs_inode_info *ji = JFS_IP(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(jfs_inode_cachep, ji); +} + +static void jfs_destroy_inode(struct inode *inode) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + + BUG_ON(!list_empty(&ji->anon_inode_list)); + + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag != -1) { + struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; + atomic_dec(&bmap->db_active[ji->active_ag]); + ji->active_ag = -1; + } + spin_unlock_irq(&ji->ag_lock); + call_rcu(&inode->i_rcu, jfs_i_callback); +} + +static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); + s64 maxinodes; + struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; + + jfs_info("In jfs_statfs"); + buf->f_type = JFS_SUPER_MAGIC; + buf->f_bsize = sbi->bsize; + buf->f_blocks = sbi->bmap->db_mapsize; + buf->f_bfree = sbi->bmap->db_nfree; + buf->f_bavail = sbi->bmap->db_nfree; + /* + * If we really return the number of allocated & free inodes, some + * applications will fail because they won't see enough free inodes. + * We'll try to calculate some guess as to how may inodes we can + * really allocate + * + * buf->f_files = atomic_read(&imap->im_numinos); + * buf->f_ffree = atomic_read(&imap->im_numfree); + */ + maxinodes = min((s64) atomic_read(&imap->im_numinos) + + ((sbi->bmap->db_nfree >> imap->im_l2nbperiext) + << L2INOSPEREXT), (s64) 0xffffffffLL); + buf->f_files = maxinodes; + buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - + atomic_read(&imap->im_numfree)); + buf->f_fsid.val[0] = (u32)crc32_le(0, sbi->uuid, sizeof(sbi->uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, sbi->uuid + sizeof(sbi->uuid)/2, + sizeof(sbi->uuid)/2); + + buf->f_namelen = JFS_NAME_MAX; + return 0; +} + +static void jfs_put_super(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + jfs_info("In jfs_put_super"); + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + rc = jfs_umount(sb); + if (rc) + jfs_err("jfs_umount failed with return code %d", rc); + + unload_nls(sbi->nls_tab); + + truncate_inode_pages(sbi->direct_inode->i_mapping, 0); + iput(sbi->direct_inode); + + kfree(sbi); +} + +enum { + Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, + Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, + Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask +}; + +static const match_table_t tokens = { + {Opt_integrity, "integrity"}, + {Opt_nointegrity, "nointegrity"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_resize, "resize=%u"}, + {Opt_resize_nosize, "resize"}, + {Opt_errors, "errors=%s"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%u"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, + int 
*flag) +{ + void *nls_map = (void *)-1; /* -1: no change; NULL: none */ + char *p; + struct jfs_sb_info *sbi = JFS_SBI(sb); + + *newLVSize = 0; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_integrity: + *flag &= ~JFS_NOINTEGRITY; + break; + case Opt_nointegrity: + *flag |= JFS_NOINTEGRITY; + break; + case Opt_ignore: + /* Silently ignore the quota options */ + /* Don't do anything ;-) */ + break; + case Opt_iocharset: + if (nls_map && nls_map != (void *) -1) + unload_nls(nls_map); + if (!strcmp(args[0].from, "none")) + nls_map = NULL; + else { + nls_map = load_nls(args[0].from); + if (!nls_map) { + printk(KERN_ERR + "JFS: charset not found\n"); + goto cleanup; + } + } + break; + case Opt_resize: + { + char *resize = args[0].from; + *newLVSize = simple_strtoull(resize, &resize, 0); + break; + } + case Opt_resize_nosize: + { + *newLVSize = sb->s_bdev->bd_inode->i_size >> + sb->s_blocksize_bits; + if (*newLVSize == 0) + printk(KERN_ERR + "JFS: Cannot determine volume size\n"); + break; + } + case Opt_errors: + { + char *errors = args[0].from; + if (!errors || !*errors) + goto cleanup; + if (!strcmp(errors, "continue")) { + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_CONTINUE; + } else if (!strcmp(errors, "remount-ro")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_REMOUNT_RO; + } else if (!strcmp(errors, "panic")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag |= JFS_ERR_PANIC; + } else { + printk(KERN_ERR + "JFS: %s is an invalid error handler\n", + errors); + goto cleanup; + } + break; + } + +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + *flag |= JFS_USRQUOTA; + break; + case Opt_grpquota: + *flag |= JFS_GRPQUOTA; + break; +#else + case Opt_usrquota: + case Opt_grpquota: + case Opt_quota: + printk(KERN_ERR + "JFS: quota operations not supported\n"); + break; +#endif + case Opt_uid: + { + char *uid = args[0].from; + sbi->uid = simple_strtoul(uid, &uid, 0); + break; + } + case Opt_gid: + { + char *gid = args[0].from; + sbi->gid = simple_strtoul(gid, &gid, 0); + break; + } + case Opt_umask: + { + char *umask = args[0].from; + sbi->umask = simple_strtoul(umask, &umask, 8); + if (sbi->umask & ~0777) { + printk(KERN_ERR + "JFS: Invalid value of umask\n"); + goto cleanup; + } + break; + } + default: + printk("jfs: Unrecognized mount option \"%s\" " + " or missing value\n", p); + goto cleanup; + } + } + + if (nls_map != (void *) -1) { + /* Discard old (if remount) */ + unload_nls(sbi->nls_tab); + sbi->nls_tab = nls_map; + } + return 1; + +cleanup: + if (nls_map && nls_map != (void *) -1) + unload_nls(nls_map); + return 0; +} + +static int jfs_remount(struct super_block *sb, int *flags, char *data) +{ + s64 newLVSize = 0; + int rc = 0; + int flag = JFS_SBI(sb)->flag; + int ret; + + if (!parse_options(data, sb, &newLVSize, &flag)) { + return -EINVAL; + } + + if (newLVSize) { + if (sb->s_flags & MS_RDONLY) { + printk(KERN_ERR + "JFS: resize requires volume to be mounted read-write\n"); + return -EROFS; + } + rc = jfs_extendfs(sb, newLVSize, 0); + if (rc) + return rc; + } + + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + /* + * Invalidate any previously read metadata. 
fsck may have + * changed the on-disk data since we mounted r/o + */ + truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); + + JFS_SBI(sb)->flag = flag; + ret = jfs_mount_rw(sb, 1); + + /* mark the fs r/w for quota activity */ + sb->s_flags &= ~MS_RDONLY; + + dquot_resume(sb, -1); + return ret; + } + if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { + rc = dquot_suspend(sb, -1); + if (rc < 0) { + return rc; + } + rc = jfs_umount_rw(sb); + JFS_SBI(sb)->flag = flag; + return rc; + } + if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) + if (!(sb->s_flags & MS_RDONLY)) { + rc = jfs_umount_rw(sb); + if (rc) + return rc; + + JFS_SBI(sb)->flag = flag; + ret = jfs_mount_rw(sb, 1); + return ret; + } + JFS_SBI(sb)->flag = flag; + + return 0; +} + +static int jfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jfs_sb_info *sbi; + struct inode *inode; + int rc; + s64 newLVSize = 0; + int flag, ret = -EINVAL; + + jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); + + if (!new_valid_dev(sb->s_bdev->bd_dev)) + return -EOVERFLOW; + + sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + sbi->sb = sb; + sbi->uid = sbi->gid = sbi->umask = -1; + + /* initialize the mount flag and determine the default error handler */ + flag = JFS_ERR_REMOUNT_RO; + + if (!parse_options((char *) data, sb, &newLVSize, &flag)) + goto out_kfree; + sbi->flag = flag; + +#ifdef CONFIG_JFS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; +#endif + + if (newLVSize) { + printk(KERN_ERR "resize option for remount only\n"); + goto out_kfree; + } + + /* + * Initialize blocksize to 4K. + */ + sb_set_blocksize(sb, PSIZE); + + /* + * Set method vectors. + */ + sb->s_op = &jfs_super_operations; + sb->s_export_op = &jfs_export_operations; +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; +#endif + + /* + * Initialize direct-mapping inode/address-space + */ + inode = new_inode(sb); + if (inode == NULL) { + ret = -ENOMEM; + goto out_unload; + } + inode->i_ino = 0; + inode->i_nlink = 1; + inode->i_size = sb->s_bdev->bd_inode->i_size; + inode->i_mapping->a_ops = &jfs_metapage_aops; + insert_inode_hash(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + sbi->direct_inode = inode; + + rc = jfs_mount(sb); + if (rc) { + if (!silent) { + jfs_err("jfs_mount failed w/return code = %d", rc); + } + goto out_mount_failed; + } + if (sb->s_flags & MS_RDONLY) + sbi->log = NULL; + else { + rc = jfs_mount_rw(sb, 0); + if (rc) { + if (!silent) { + jfs_err("jfs_mount_rw failed, return code = %d", + rc); + } + goto out_no_rw; + } + } + + sb->s_magic = JFS_SUPER_MAGIC; + + if (sbi->mntflag & JFS_OS2) + sb->s_d_op = &jfs_ci_dentry_operations; + + inode = jfs_iget(sb, ROOT_I); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out_no_rw; + } + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) + goto out_no_root; + + /* logical blocks are represented by 40 bits in pxd_t, etc. */ + sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; +#if BITS_PER_LONG == 32 + /* + * Page cache is indexed by long. 
+ * I would use MAX_LFS_FILESIZE, but it's only half as big + */ + sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes); +#endif + sb->s_time_gran = 1; + return 0; + +out_no_root: + jfs_err("jfs_read_super: get root dentry failed"); + iput(inode); + +out_no_rw: + rc = jfs_umount(sb); + if (rc) { + jfs_err("jfs_umount failed with return code %d", rc); + } +out_mount_failed: + filemap_write_and_wait(sbi->direct_inode->i_mapping); + truncate_inode_pages(sbi->direct_inode->i_mapping, 0); + make_bad_inode(sbi->direct_inode); + iput(sbi->direct_inode); + sbi->direct_inode = NULL; +out_unload: + if (sbi->nls_tab) + unload_nls(sbi->nls_tab); +out_kfree: + kfree(sbi); + return ret; +} + +static int jfs_freeze(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + + if (!(sb->s_flags & MS_RDONLY)) { + txQuiesce(sb); + lmLogShutdown(log); + updateSuper(sb, FM_CLEAN); + } + return 0; +} + +static int jfs_unfreeze(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + int rc = 0; + + if (!(sb->s_flags & MS_RDONLY)) { + updateSuper(sb, FM_MOUNT); + if ((rc = lmLogInit(log))) + jfs_err("jfs_unlock failed with return code %d", rc); + else + txResume(sb); + } + return 0; +} + +static struct dentry *jfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); +} + +static int jfs_sync_fs(struct super_block *sb, int wait) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + + /* log == NULL indicates read-only mount */ + if (log) { + jfs_flush_journal(log, wait); + jfs_syncpt(log, 0); + } + + return 0; +} + +static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct jfs_sb_info *sbi = JFS_SBI(vfs->mnt_sb); + + if (sbi->uid != -1) + seq_printf(seq, ",uid=%d", sbi->uid); + if (sbi->gid != -1) + seq_printf(seq, ",gid=%d", sbi->gid); + if (sbi->umask != -1) + seq_printf(seq, ",umask=%03o", sbi->umask); + if (sbi->flag & JFS_NOINTEGRITY) + seq_puts(seq, ",nointegrity"); + if (sbi->nls_tab) + seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); + if (sbi->flag & JFS_ERR_CONTINUE) + seq_printf(seq, ",errors=continue"); + if (sbi->flag & JFS_ERR_PANIC) + seq_printf(seq, ",errors=panic"); + +#ifdef CONFIG_QUOTA + if (sbi->flag & JFS_USRQUOTA) + seq_puts(seq, ",usrquota"); + + if (sbi->flag & JFS_GRPQUOTA) + seq_puts(seq, ",grpquota"); +#endif + + return 0; +} + +#ifdef CONFIG_QUOTA + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head tmp_bh; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? 
+ sb->s_blocksize - offset : toread; + + tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; + err = jfs_get_block(inode, blk, &tmp_bh, 0); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t jfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t towrite = len; + struct buffer_head tmp_bh; + struct buffer_head *bh; + + mutex_lock(&inode->i_mutex); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + + tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; + err = jfs_get_block(inode, blk, &tmp_bh, 1); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + +static const struct super_operations jfs_super_operations = { + .alloc_inode = jfs_alloc_inode, + .destroy_inode = jfs_destroy_inode, + .dirty_inode = jfs_dirty_inode, + .write_inode = jfs_write_inode, + .evict_inode = jfs_evict_inode, + .put_super = jfs_put_super, + .sync_fs = jfs_sync_fs, + .freeze_fs = jfs_freeze, + .unfreeze_fs = jfs_unfreeze, + .statfs = jfs_statfs, + .remount_fs = jfs_remount, + .show_options = jfs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = jfs_quota_read, + .quota_write = jfs_quota_write, +#endif +}; + +static const struct export_operations jfs_export_operations = { + .fh_to_dentry = jfs_fh_to_dentry, + .fh_to_parent = jfs_fh_to_parent, + .get_parent = jfs_get_parent, +}; + +static struct file_system_type jfs_fs_type = { + .owner = THIS_MODULE, + .name = "jfs", + .mount = jfs_do_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void init_once(void *foo) +{ + struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; + + memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); + INIT_LIST_HEAD(&jfs_ip->anon_inode_list); + init_rwsem(&jfs_ip->rdwrlock); + mutex_init(&jfs_ip->commit_mutex); + init_rwsem(&jfs_ip->xattr_sem); + spin_lock_init(&jfs_ip->ag_lock); + jfs_ip->active_ag = -1; + inode_init_once(&jfs_ip->vfs_inode); +} + +static int __init init_jfs_fs(void) +{ + int i; + int rc; + + jfs_inode_cachep = + kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (jfs_inode_cachep == NULL) + return -ENOMEM; + + /* + * Metapage initialization + */ + rc = metapage_init(); + if (rc) { + jfs_err("metapage_init failed w/rc = 
%d", rc); + goto free_slab; + } + + /* + * Transaction Manager initialization + */ + rc = txInit(); + if (rc) { + jfs_err("txInit failed w/rc = %d", rc); + goto free_metapage; + } + + /* + * I/O completion thread (endio) + */ + jfsIOthread = kthread_run(jfsIOWait, NULL, "jfsIO"); + if (IS_ERR(jfsIOthread)) { + rc = PTR_ERR(jfsIOthread); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + goto end_txmngr; + } + + if (commit_threads < 1) + commit_threads = num_online_cpus(); + if (commit_threads > MAX_COMMIT_THREADS) + commit_threads = MAX_COMMIT_THREADS; + + for (i = 0; i < commit_threads; i++) { + jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, "jfsCommit"); + if (IS_ERR(jfsCommitThread[i])) { + rc = PTR_ERR(jfsCommitThread[i]); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + commit_threads = i; + goto kill_committask; + } + } + + jfsSyncThread = kthread_run(jfs_sync, NULL, "jfsSync"); + if (IS_ERR(jfsSyncThread)) { + rc = PTR_ERR(jfsSyncThread); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + goto kill_committask; + } + +#ifdef PROC_FS_JFS + jfs_proc_init(); +#endif + + return register_filesystem(&jfs_fs_type); + +kill_committask: + for (i = 0; i < commit_threads; i++) + kthread_stop(jfsCommitThread[i]); + kthread_stop(jfsIOthread); +end_txmngr: + txExit(); +free_metapage: + metapage_exit(); +free_slab: + kmem_cache_destroy(jfs_inode_cachep); + return rc; +} + +static void __exit exit_jfs_fs(void) +{ + int i; + + jfs_info("exit_jfs_fs called"); + + txExit(); + metapage_exit(); + + kthread_stop(jfsIOthread); + for (i = 0; i < commit_threads; i++) + kthread_stop(jfsCommitThread[i]); + kthread_stop(jfsSyncThread); +#ifdef PROC_FS_JFS + jfs_proc_clean(); +#endif + unregister_filesystem(&jfs_fs_type); + kmem_cache_destroy(jfs_inode_cachep); +} + +module_init(init_jfs_fs) +module_exit(exit_jfs_fs) diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c new file mode 100644 index 00000000..205b946d --- /dev/null +++ b/fs/jfs/symlink.c @@ -0,0 +1,52 @@ +/* + * Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_xattr.h" + +static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *s = JFS_IP(dentry->d_inode)->i_inline; + nd_set_link(nd, s); + return NULL; +} + +const struct inode_operations jfs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = jfs_follow_link, + .setattr = jfs_setattr, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, +}; + +const struct inode_operations jfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = jfs_setattr, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, +}; + diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c new file mode 100644 index 00000000..24838f1e --- /dev/null +++ b/fs/jfs/xattr.c @@ -0,0 +1,1128 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Copyright (C) Christoph Hellwig, 2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_debug.h" +#include "jfs_dinode.h" +#include "jfs_extent.h" +#include "jfs_metapage.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" + +/* + * jfs_xattr.c: extended attribute service + * + * Overall design -- + * + * Format: + * + * Extended attribute lists (jfs_ea_list) consist of an overall size (32 bit + * value) and a variable (0 or more) number of extended attribute + * entries. Each extended attribute entry (jfs_ea) is a double + * where is constructed from a null-terminated ascii string + * (1 ... 255 bytes in the name) and is arbitrary 8 bit data + * (1 ... 65535 bytes). The in-memory format is + * + * 0 1 2 4 4 + namelen + 1 + * +-------+--------+--------+----------------+-------------------+ + * | Flags | Name | Value | Name String \0 | Data . . . . | + * | | Length | Length | | | + * +-------+--------+--------+----------------+-------------------+ + * + * A jfs_ea_list then is structured as + * + * 0 4 4 + EA_SIZE(ea1) + * +------------+-------------------+--------------------+----- + * | Overall EA | First FEA Element | Second FEA Element | ..... + * | List Size | | | + * +------------+-------------------+--------------------+----- + * + * On-disk: + * + * FEALISTs are stored on disk using blocks allocated by dbAlloc() and + * written directly. 
An EA list may be in-lined in the inode if there is + * sufficient room available. + */ + +struct ea_buffer { + int flag; /* Indicates what storage xattr points to */ + int max_size; /* largest xattr that fits in current buffer */ + dxd_t new_ea; /* dxd to replace ea when modifying xattr */ + struct metapage *mp; /* metapage containing ea list */ + struct jfs_ea_list *xattr; /* buffer containing ea list */ +}; + +/* + * ea_buffer.flag values + */ +#define EA_INLINE 0x0001 +#define EA_EXTENT 0x0002 +#define EA_NEW 0x0004 +#define EA_MALLOC 0x0008 + + +static int is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + +/* + * These three routines are used to recognize on-disk extended attributes + * that are in a recognized namespace. If the attribute is not recognized, + * "os2." is prepended to the name + */ +static int is_os2_xattr(struct jfs_ea *ea) +{ + return !is_known_namespace(ea->name); +} + +static inline int name_size(struct jfs_ea *ea) +{ + if (is_os2_xattr(ea)) + return ea->namelen + XATTR_OS2_PREFIX_LEN; + else + return ea->namelen; +} + +static inline int copy_name(char *buffer, struct jfs_ea *ea) +{ + int len = ea->namelen; + + if (is_os2_xattr(ea)) { + memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN); + buffer += XATTR_OS2_PREFIX_LEN; + len += XATTR_OS2_PREFIX_LEN; + } + memcpy(buffer, ea->name, ea->namelen); + buffer[ea->namelen] = 0; + + return len; +} + +/* Forward references */ +static void ea_release(struct inode *inode, struct ea_buffer *ea_buf); + +/* + * NAME: ea_write_inline + * + * FUNCTION: Attempt to write an EA inline if area is available + * + * PRE CONDITIONS: + * Already verified that the specified EA is small enough to fit inline + * + * PARAMETERS: + * ip - Inode pointer + * ealist - EA list pointer + * size - size of ealist in bytes + * ea - dxd_t structure to be filled in with necessary EA information + * if we successfully copy the EA inline + * + * NOTES: + * Checks if the inode's inline area is available. If so, copies EA inline + * and sets fields appropriately. Otherwise, returns failure, EA will + * have to be put into an extent. + * + * RETURNS: 0 for successful copy to inline area; -1 if area not available + */ +static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist, + int size, dxd_t * ea) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + + /* + * Make sure we have an EA -- the NULL EA list is valid, but you + * can't copy it! + */ + if (ealist && size > sizeof (struct jfs_ea_list)) { + assert(size <= sizeof (ji->i_inline_ea)); + + /* + * See if the space is available or if it is already being + * used for an inline EA. 
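The layout in the diagram above can be walked with plain pointer arithmetic: each entry occupies its fixed header plus the name (with its terminating NUL) plus the value, and the next entry starts immediately after. A minimal sketch of that traversal, using a hypothetical ea_record struct that mirrors the on-disk record (the real jfs_ea / jfs_ea_list definitions live in jfs_xattr.h and are not part of this file):

/* Field widths follow the diagram above; illustrative only. */
struct ea_record {
	u8	flag;
	u8	namelen;
	__le16	valuelen;
	char	name[0];	/* namelen bytes + NUL, then the value bytes */
};

static inline int ea_record_size(const struct ea_record *ea)
{
	return sizeof(struct ea_record) + ea->namelen + 1 +
		le16_to_cpu(ea->valuelen);
}

/* 'size' is the 32-bit total stored at the start of the list (converted
 * to CPU order); it counts itself, so the first record sits at offset 4.
 */
static void ea_list_walk(void *list, u32 size)
{
	char *end = (char *)list + size;
	struct ea_record *ea = (struct ea_record *)((char *)list + 4);

	while ((char *)ea < end) {
		/* ea->name and its value are valid here */
		ea = (struct ea_record *)((char *)ea + ea_record_size(ea));
	}
}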
+ */ + if (!(ji->mode2 & INLINEEA) && !(ji->ea.flag & DXD_INLINE)) + return -EPERM; + + DXDsize(ea, size); + DXDlength(ea, 0); + DXDaddress(ea, 0); + memcpy(ji->i_inline_ea, ealist, size); + ea->flag = DXD_INLINE; + ji->mode2 &= ~INLINEEA; + } else { + ea->flag = 0; + DXDsize(ea, 0); + DXDlength(ea, 0); + DXDaddress(ea, 0); + + /* Free up INLINE area */ + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + } + + return 0; +} + +/* + * NAME: ea_write + * + * FUNCTION: Write an EA for an inode + * + * PRE CONDITIONS: EA has been verified + * + * PARAMETERS: + * ip - Inode pointer + * ealist - EA list pointer + * size - size of ealist in bytes + * ea - dxd_t structure to be filled in appropriately with where the + * EA was copied + * + * NOTES: Will write EA inline if able to, otherwise allocates blocks for an + * extent and synchronously writes it to those blocks. + * + * RETURNS: 0 for success; Anything else indicates failure + */ +static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, + dxd_t * ea) +{ + struct super_block *sb = ip->i_sb; + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(sb); + int nblocks; + s64 blkno; + int rc = 0, i; + char *cp; + s32 nbytes, nb; + s32 bytes_to_write; + struct metapage *mp; + + /* + * Quick check to see if this is an in-linable EA. Short EAs + * and empty EAs are all in-linable, provided the space exists. + */ + if (!ealist || size <= sizeof (ji->i_inline_ea)) { + if (!ea_write_inline(ip, ealist, size, ea)) + return 0; + } + + /* figure out how many blocks we need */ + nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; + + /* Allocate new blocks to quota. */ + rc = dquot_alloc_block(ip, nblocks); + if (rc) + return rc; + + rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); + if (rc) { + /*Rollback quota allocation. */ + dquot_free_block(ip, nblocks); + return rc; + } + + /* + * Now have nblocks worth of storage to stuff into the FEALIST. + * loop over the FEALIST copying data into the buffer one page at + * a time. + */ + cp = (char *) ealist; + nbytes = size; + for (i = 0; i < nblocks; i += sbi->nbperpage) { + /* + * Determine how many bytes for this request, and round up to + * the nearest aggregate block size + */ + nb = min(PSIZE, nbytes); + bytes_to_write = + ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) + << sb->s_blocksize_bits; + + if (!(mp = get_metapage(ip, blkno + i, bytes_to_write, 1))) { + rc = -EIO; + goto failed; + } + + memcpy(mp->data, cp, nb); + + /* + * We really need a way to propagate errors for + * forced writes like this one. --hch + * + * (__write_metapage => release_metapage => flush_metapage) + */ +#ifdef _JFS_FIXME + if ((rc = flush_metapage(mp))) { + /* + * the write failed -- this means that the buffer + * is still assigned and the blocks are not being + * used. this seems like the best error recovery + * we can get ... + */ + goto failed; + } +#else + flush_metapage(mp); +#endif + + cp += PSIZE; + nbytes -= nb; + } + + ea->flag = DXD_EXTENT; + DXDsize(ea, le32_to_cpu(ealist->size)); + DXDlength(ea, nblocks); + DXDaddress(ea, blkno); + + /* Free up INLINE area */ + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + + return 0; + + failed: + /* Rollback quota allocation. 
*/ + dquot_free_block(ip, nblocks); + + dbFree(ip, blkno, nblocks); + return rc; +} + +/* + * NAME: ea_read_inline + * + * FUNCTION: Read an inlined EA into user's buffer + * + * PARAMETERS: + * ip - Inode pointer + * ealist - Pointer to buffer to fill in with EA + * + * RETURNS: 0 + */ +static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + int ea_size = sizeDXD(&ji->ea); + + if (ea_size == 0) { + ealist->size = 0; + return 0; + } + + /* Sanity Check */ + if ((sizeDXD(&ji->ea) > sizeof (ji->i_inline_ea))) + return -EIO; + if (le32_to_cpu(((struct jfs_ea_list *) &ji->i_inline_ea)->size) + != ea_size) + return -EIO; + + memcpy(ealist, ji->i_inline_ea, ea_size); + return 0; +} + +/* + * NAME: ea_read + * + * FUNCTION: copy EA data into user's buffer + * + * PARAMETERS: + * ip - Inode pointer + * ealist - Pointer to buffer to fill in with EA + * + * NOTES: If EA is inline calls ea_read_inline() to copy EA. + * + * RETURNS: 0 for success; other indicates failure + */ +static int ea_read(struct inode *ip, struct jfs_ea_list *ealist) +{ + struct super_block *sb = ip->i_sb; + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(sb); + int nblocks; + s64 blkno; + char *cp = (char *) ealist; + int i; + int nbytes, nb; + s32 bytes_to_read; + struct metapage *mp; + + /* quick check for in-line EA */ + if (ji->ea.flag & DXD_INLINE) + return ea_read_inline(ip, ealist); + + nbytes = sizeDXD(&ji->ea); + if (!nbytes) { + jfs_error(sb, "ea_read: nbytes is 0"); + return -EIO; + } + + /* + * Figure out how many blocks were allocated when this EA list was + * originally written to disk. + */ + nblocks = lengthDXD(&ji->ea) << sbi->l2nbperpage; + blkno = addressDXD(&ji->ea) << sbi->l2nbperpage; + + /* + * I have found the disk blocks which were originally used to store + * the FEALIST. now i loop over each contiguous block copying the + * data into the buffer. + */ + for (i = 0; i < nblocks; i += sbi->nbperpage) { + /* + * Determine how many bytes for this request, and round up to + * the nearest aggregate block size + */ + nb = min(PSIZE, nbytes); + bytes_to_read = + ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) + << sb->s_blocksize_bits; + + if (!(mp = read_metapage(ip, blkno + i, bytes_to_read, 1))) + return -EIO; + + memcpy(cp, mp->data, nb); + release_metapage(mp); + + cp += PSIZE; + nbytes -= nb; + } + + return 0; +} + +/* + * NAME: ea_get + * + * FUNCTION: Returns buffer containing existing extended attributes. + * The size of the buffer will be the larger of the existing + * attributes size, or min_size. 
+ * + * The buffer, which may be inlined in the inode or in the + * page cache must be release by calling ea_release or ea_put + * + * PARAMETERS: + * inode - Inode pointer + * ea_buf - Structure to be populated with ealist and its metadata + * min_size- minimum size of buffer to be returned + * + * RETURNS: 0 for success; Other indicates failure + */ +static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + struct super_block *sb = inode->i_sb; + int size; + int ea_size = sizeDXD(&ji->ea); + int blocks_needed, current_blocks; + s64 blkno; + int rc; + int quota_allocation = 0; + + /* When fsck.jfs clears a bad ea, it doesn't clear the size */ + if (ji->ea.flag == 0) + ea_size = 0; + + if (ea_size == 0) { + if (min_size == 0) { + ea_buf->flag = 0; + ea_buf->max_size = 0; + ea_buf->xattr = NULL; + return 0; + } + if ((min_size <= sizeof (ji->i_inline_ea)) && + (ji->mode2 & INLINEEA)) { + ea_buf->flag = EA_INLINE | EA_NEW; + ea_buf->max_size = sizeof (ji->i_inline_ea); + ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; + DXDlength(&ea_buf->new_ea, 0); + DXDaddress(&ea_buf->new_ea, 0); + ea_buf->new_ea.flag = DXD_INLINE; + DXDsize(&ea_buf->new_ea, min_size); + return 0; + } + current_blocks = 0; + } else if (ji->ea.flag & DXD_INLINE) { + if (min_size <= sizeof (ji->i_inline_ea)) { + ea_buf->flag = EA_INLINE; + ea_buf->max_size = sizeof (ji->i_inline_ea); + ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; + goto size_check; + } + current_blocks = 0; + } else { + if (!(ji->ea.flag & DXD_EXTENT)) { + jfs_error(sb, "ea_get: invalid ea.flag)"); + return -EIO; + } + current_blocks = (ea_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + } + size = max(min_size, ea_size); + + if (size > PSIZE) { + /* + * To keep the rest of the code simple. Allocate a + * contiguous buffer to work with + */ + ea_buf->xattr = kmalloc(size, GFP_KERNEL); + if (ea_buf->xattr == NULL) + return -ENOMEM; + + ea_buf->flag = EA_MALLOC; + ea_buf->max_size = (size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + if (ea_size == 0) + return 0; + + if ((rc = ea_read(inode, ea_buf->xattr))) { + kfree(ea_buf->xattr); + ea_buf->xattr = NULL; + return rc; + } + goto size_check; + } + blocks_needed = (min_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + if (blocks_needed > current_blocks) { + /* Allocate new blocks to quota. 
*/ + rc = dquot_alloc_block(inode, blocks_needed); + if (rc) + return -EDQUOT; + + quota_allocation = blocks_needed; + + rc = dbAlloc(inode, INOHINT(inode), (s64) blocks_needed, + &blkno); + if (rc) + goto clean_up; + + DXDlength(&ea_buf->new_ea, blocks_needed); + DXDaddress(&ea_buf->new_ea, blkno); + ea_buf->new_ea.flag = DXD_EXTENT; + DXDsize(&ea_buf->new_ea, min_size); + + ea_buf->flag = EA_EXTENT | EA_NEW; + + ea_buf->mp = get_metapage(inode, blkno, + blocks_needed << sb->s_blocksize_bits, + 1); + if (ea_buf->mp == NULL) { + dbFree(inode, blkno, (s64) blocks_needed); + rc = -EIO; + goto clean_up; + } + ea_buf->xattr = ea_buf->mp->data; + ea_buf->max_size = (min_size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + if (ea_size == 0) + return 0; + if ((rc = ea_read(inode, ea_buf->xattr))) { + discard_metapage(ea_buf->mp); + dbFree(inode, blkno, (s64) blocks_needed); + goto clean_up; + } + goto size_check; + } + ea_buf->flag = EA_EXTENT; + ea_buf->mp = read_metapage(inode, addressDXD(&ji->ea), + lengthDXD(&ji->ea) << sb->s_blocksize_bits, + 1); + if (ea_buf->mp == NULL) { + rc = -EIO; + goto clean_up; + } + ea_buf->xattr = ea_buf->mp->data; + ea_buf->max_size = (ea_size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + size_check: + if (EALIST_SIZE(ea_buf->xattr) != ea_size) { + printk(KERN_ERR "ea_get: invalid extended attribute\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, + ea_buf->xattr, ea_size, 1); + ea_release(inode, ea_buf); + rc = -EIO; + goto clean_up; + } + + return ea_size; + + clean_up: + /* Rollback quota allocation */ + if (quota_allocation) + dquot_free_block(inode, quota_allocation); + + return (rc); +} + +static void ea_release(struct inode *inode, struct ea_buffer *ea_buf) +{ + if (ea_buf->flag & EA_MALLOC) + kfree(ea_buf->xattr); + else if (ea_buf->flag & EA_EXTENT) { + assert(ea_buf->mp); + release_metapage(ea_buf->mp); + + if (ea_buf->flag & EA_NEW) + dbFree(inode, addressDXD(&ea_buf->new_ea), + lengthDXD(&ea_buf->new_ea)); + } +} + +static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, + int new_size) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + unsigned long old_blocks, new_blocks; + int rc = 0; + + if (new_size == 0) { + ea_release(inode, ea_buf); + ea_buf = NULL; + } else if (ea_buf->flag & EA_INLINE) { + assert(new_size <= sizeof (ji->i_inline_ea)); + ji->mode2 &= ~INLINEEA; + ea_buf->new_ea.flag = DXD_INLINE; + DXDsize(&ea_buf->new_ea, new_size); + DXDaddress(&ea_buf->new_ea, 0); + DXDlength(&ea_buf->new_ea, 0); + } else if (ea_buf->flag & EA_MALLOC) { + rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); + kfree(ea_buf->xattr); + } else if (ea_buf->flag & EA_NEW) { + /* We have already allocated a new dxd */ + flush_metapage(ea_buf->mp); + } else { + /* ->xattr must point to original ea's metapage */ + rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); + discard_metapage(ea_buf->mp); + } + if (rc) + return rc; + + old_blocks = new_blocks = 0; + + if (ji->ea.flag & DXD_EXTENT) { + invalidate_dxd_metapages(inode, ji->ea); + old_blocks = lengthDXD(&ji->ea); + } + + if (ea_buf) { + txEA(tid, inode, &ji->ea, &ea_buf->new_ea); + if (ea_buf->new_ea.flag & DXD_EXTENT) { + new_blocks = lengthDXD(&ea_buf->new_ea); + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + } + ji->ea = ea_buf->new_ea; + } else { + txEA(tid, inode, &ji->ea, NULL); + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + ji->ea.flag = 0; + ji->ea.size = 0; + } + + /* If old blocks exist, they must be removed 
from quota allocation. */ + if (old_blocks) + dquot_free_block(inode, old_blocks); + + inode->i_ctime = CURRENT_TIME; + + return 0; +} + +/* + * can_set_system_xattr + * + * This code is specific to the system.* namespace. It contains policy + * which doesn't belong in the main xattr codepath. + */ +static int can_set_system_xattr(struct inode *inode, const char *name, + const void *value, size_t value_len) +{ +#ifdef CONFIG_JFS_POSIX_ACL + struct posix_acl *acl; + int rc; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + /* + * POSIX_ACL_XATTR_ACCESS is tied to i_mode + */ + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { + acl = posix_acl_from_xattr(value, value_len); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + printk(KERN_ERR "posix_acl_from_xattr returned %d\n", + rc); + return rc; + } + if (acl) { + mode_t mode = inode->i_mode; + rc = posix_acl_equiv_mode(acl, &mode); + posix_acl_release(acl); + if (rc < 0) { + printk(KERN_ERR + "posix_acl_equiv_mode returned %d\n", + rc); + return rc; + } + inode->i_mode = mode; + mark_inode_dirty(inode); + } + /* + * We're changing the ACL. Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_ACCESS); + + return 0; + } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { + acl = posix_acl_from_xattr(value, value_len); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + printk(KERN_ERR "posix_acl_from_xattr returned %d\n", + rc); + return rc; + } + posix_acl_release(acl); + + /* + * We're changing the default ACL. Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + return 0; + } +#endif /* CONFIG_JFS_POSIX_ACL */ + return -EOPNOTSUPP; +} + +/* + * Most of the permission checking is done by xattr_permission in the vfs. + * The local file system is responsible for handling the system.* namespace. + * We also need to verify that this is a namespace that we recognize. + */ +static int can_set_xattr(struct inode *inode, const char *name, + const void *value, size_t value_len) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return can_set_system_xattr(inode, name, value, value_len); + + if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { + /* + * This makes sure that we aren't trying to set an + * attribute in a different namespace by prefixing it + * with "os2." + */ + if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) + return -EOPNOTSUPP; + return 0; + } + + /* + * Don't allow setting an attribute in an unknown namespace. 
+ */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EOPNOTSUPP; + + return 0; +} + +int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, + const void *value, size_t value_len, int flags) +{ + struct jfs_ea_list *ealist; + struct jfs_ea *ea, *old_ea = NULL, *next_ea = NULL; + struct ea_buffer ea_buf; + int old_ea_size = 0; + int xattr_size; + int new_size; + int namelen = strlen(name); + char *os2name = NULL; + int found = 0; + int rc; + int length; + + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, + GFP_KERNEL); + if (!os2name) + return -ENOMEM; + strcpy(os2name, name + XATTR_OS2_PREFIX_LEN); + name = os2name; + namelen -= XATTR_OS2_PREFIX_LEN; + } + + down_write(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + if (xattr_size < 0) { + rc = xattr_size; + goto out; + } + + again: + ealist = (struct jfs_ea_list *) ea_buf.xattr; + new_size = sizeof (struct jfs_ea_list); + + if (xattr_size) { + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); + ea = NEXT_EA(ea)) { + if ((namelen == ea->namelen) && + (memcmp(name, ea->name, namelen) == 0)) { + found = 1; + if (flags & XATTR_CREATE) { + rc = -EEXIST; + goto release; + } + old_ea = ea; + old_ea_size = EA_SIZE(ea); + next_ea = NEXT_EA(ea); + } else + new_size += EA_SIZE(ea); + } + } + + if (!found) { + if (flags & XATTR_REPLACE) { + rc = -ENODATA; + goto release; + } + if (value == NULL) { + rc = 0; + goto release; + } + } + if (value) + new_size += sizeof (struct jfs_ea) + namelen + 1 + value_len; + + if (new_size > ea_buf.max_size) { + /* + * We need to allocate more space for merged ea list. + * We should only have loop to again: once. 
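From user space, the namespace policy above is visible through the standard xattr syscalls: user.*, trusted.* and security.* names pass straight through, while names carrying the "os2." prefix are stored without the prefix and are rejected if the remainder would alias a known namespace. A hedged example using the glibc wrappers (the path is invented):

#include <sys/xattr.h>
#include <stdio.h>

int main(void)
{
	const char *path = "/mnt/jfs/file";	/* hypothetical JFS mount */

	/* Stored as an ordinary user-namespace attribute. */
	if (setxattr(path, "user.comment", "hello", 5, 0) != 0)
		perror("setxattr user.comment");

	/* Stored on disk without the "os2." prefix (legacy OS/2 EA). */
	if (setxattr(path, "os2.LONGNAME", "file.txt", 8, 0) != 0)
		perror("setxattr os2.LONGNAME");

	/* Rejected with EOPNOTSUPP: would alias the user.* namespace. */
	if (setxattr(path, "os2.user.comment", "x", 1, 0) != 0)
		perror("setxattr os2.user.comment");

	return 0;
}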
+ */ + ea_release(inode, &ea_buf); + xattr_size = ea_get(inode, &ea_buf, new_size); + if (xattr_size < 0) { + rc = xattr_size; + goto out; + } + goto again; + } + + /* Remove old ea of the same name */ + if (found) { + /* number of bytes following target EA */ + length = (char *) END_EALIST(ealist) - (char *) next_ea; + if (length > 0) + memmove(old_ea, next_ea, length); + xattr_size -= old_ea_size; + } + + /* Add new entry to the end */ + if (value) { + if (xattr_size == 0) + /* Completely new ea list */ + xattr_size = sizeof (struct jfs_ea_list); + + ea = (struct jfs_ea *) ((char *) ealist + xattr_size); + ea->flag = 0; + ea->namelen = namelen; + ea->valuelen = (cpu_to_le16(value_len)); + memcpy(ea->name, name, namelen); + ea->name[namelen] = 0; + if (value_len) + memcpy(&ea->name[namelen + 1], value, value_len); + xattr_size += EA_SIZE(ea); + } + + /* DEBUG - If we did this right, these number match */ + if (xattr_size != new_size) { + printk(KERN_ERR + "jfs_xsetattr: xattr_size = %d, new_size = %d\n", + xattr_size, new_size); + + rc = -EINVAL; + goto release; + } + + /* + * If we're left with an empty list, there's no ea + */ + if (new_size == sizeof (struct jfs_ea_list)) + new_size = 0; + + ealist->size = cpu_to_le32(new_size); + + rc = ea_put(tid, inode, &ea_buf, new_size); + + goto out; + release: + ea_release(inode, &ea_buf); + out: + up_write(&JFS_IP(inode)->xattr_sem); + + kfree(os2name); + + return rc; +} + +int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t value_len, int flags) +{ + struct inode *inode = dentry->d_inode; + struct jfs_inode_info *ji = JFS_IP(inode); + int rc; + tid_t tid; + + if ((rc = can_set_xattr(inode, name, value, value_len))) + return rc; + + if (value == NULL) { /* empty EA, do not remove */ + value = ""; + value_len = 0; + } + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&ji->commit_mutex); + rc = __jfs_setxattr(tid, dentry->d_inode, name, value, value_len, + flags); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&ji->commit_mutex); + + return rc; +} + +ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, + size_t buf_size) +{ + struct jfs_ea_list *ealist; + struct jfs_ea *ea; + struct ea_buffer ea_buf; + int xattr_size; + ssize_t size; + int namelen = strlen(name); + char *value; + + down_read(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + + if (xattr_size < 0) { + size = xattr_size; + goto out; + } + + if (xattr_size == 0) + goto not_found; + + ealist = (struct jfs_ea_list *) ea_buf.xattr; + + /* Find the named attribute */ + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) + if ((namelen == ea->namelen) && + memcmp(name, ea->name, namelen) == 0) { + /* Found it */ + size = le16_to_cpu(ea->valuelen); + if (!data) + goto release; + else if (size > buf_size) { + size = -ERANGE; + goto release; + } + value = ((char *) &ea->name) + ea->namelen + 1; + memcpy(data, value, size); + goto release; + } + not_found: + size = -ENODATA; + release: + ea_release(inode, &ea_buf); + out: + up_read(&JFS_IP(inode)->xattr_sem); + + return size; +} + +ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, + size_t buf_size) +{ + int err; + + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + /* + * skip past "os2." prefix + */ + name += XATTR_OS2_PREFIX_LEN; + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "os2." 
+ */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + } + + err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); + + return err; +} + +/* + * No special permissions are needed to list attributes except for trusted.* + */ +static inline int can_list(struct jfs_ea *ea) +{ + return (strncmp(ea->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) || + capable(CAP_SYS_ADMIN)); +} + +ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) +{ + struct inode *inode = dentry->d_inode; + char *buffer; + ssize_t size = 0; + int xattr_size; + struct jfs_ea_list *ealist; + struct jfs_ea *ea; + struct ea_buffer ea_buf; + + down_read(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + if (xattr_size < 0) { + size = xattr_size; + goto out; + } + + if (xattr_size == 0) + goto release; + + ealist = (struct jfs_ea_list *) ea_buf.xattr; + + /* compute required size of list */ + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { + if (can_list(ea)) + size += name_size(ea) + 1; + } + + if (!data) + goto release; + + if (size > buf_size) { + size = -ERANGE; + goto release; + } + + /* Copy attribute names to buffer */ + buffer = data; + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { + if (can_list(ea)) { + int namelen = copy_name(buffer, ea); + buffer += namelen + 1; + } + } + + release: + ea_release(inode, &ea_buf); + out: + up_read(&JFS_IP(inode)->xattr_sem); + return size; +} + +int jfs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct jfs_inode_info *ji = JFS_IP(inode); + int rc; + tid_t tid; + + if ((rc = can_set_xattr(inode, name, NULL, 0))) + return rc; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&ji->commit_mutex); + rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&ji->commit_mutex); + + return rc; +} + +#ifdef CONFIG_JFS_SECURITY +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + int rc; + size_t len; + void *value; + char *suffix; + char *name; + + rc = security_inode_init_security(inode, dir, qstr, &suffix, &value, + &len); + if (rc) { + if (rc == -EOPNOTSUPP) + return 0; + return rc; + } + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix), + GFP_NOFS); + if (!name) { + rc = -ENOMEM; + goto kmalloc_failed; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); + + rc = __jfs_setxattr(tid, inode, name, value, len, 0); + + kfree(name); +kmalloc_failed: + kfree(suffix); + kfree(value); + + return rc; +} +#endif diff --git a/fs/libfs.c b/fs/libfs.c new file mode 100644 index 00000000..275ca474 --- /dev/null +++ b/fs/libfs.c @@ -0,0 +1,997 @@ +/* + * fs/libfs.c + * Library for filesystems writers. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static inline int simple_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9); + return 0; +} + +int simple_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_namelen = NAME_MAX; + return 0; +} + +/* + * Retaining negative dentries for an in-memory filesystem just wastes + * memory and lookup time: arrange for them to be deleted immediately. + */ +static int simple_delete_dentry(const struct dentry *dentry) +{ + return 1; +} + +/* + * Lookup the data. This is trivial - if the dentry didn't already + * exist, we know it is negative. Set d_op to delete negative dentries. + */ +struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + static const struct dentry_operations simple_dentry_operations = { + .d_delete = simple_delete_dentry, + }; + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_set_d_op(dentry, &simple_dentry_operations); + d_add(dentry, NULL); + return NULL; +} + +int dcache_dir_open(struct inode *inode, struct file *file) +{ + static struct qstr cursor_name = {.len = 1, .name = "."}; + + file->private_data = d_alloc(file->f_path.dentry, &cursor_name); + + return file->private_data ? 0 : -ENOMEM; +} + +int dcache_dir_close(struct inode *inode, struct file *file) +{ + dput(file->private_data); + return 0; +} + +loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) +{ + struct dentry *dentry = file->f_path.dentry; + mutex_lock(&dentry->d_inode->i_mutex); + switch (origin) { + case 1: + offset += file->f_pos; + case 0: + if (offset >= 0) + break; + default: + mutex_unlock(&dentry->d_inode->i_mutex); + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + if (file->f_pos >= 2) { + struct list_head *p; + struct dentry *cursor = file->private_data; + loff_t n = file->f_pos - 2; + + spin_lock(&dentry->d_lock); + /* d_lock not required for cursor */ + list_del(&cursor->d_u.d_child); + p = dentry->d_subdirs.next; + while (n && p != &dentry->d_subdirs) { + struct dentry *next; + next = list_entry(p, struct dentry, d_u.d_child); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(next)) + n--; + spin_unlock(&next->d_lock); + p = p->next; + } + list_add_tail(&cursor->d_u.d_child, p); + spin_unlock(&dentry->d_lock); + } + } + mutex_unlock(&dentry->d_inode->i_mutex); + return offset; +} + +/* Relationship between i_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +/* + * Directory is locked and all positive dentries in it are safe, since + * for ramfs-type trees they can't go away without unlink() or rmdir(), + * both impossible due to the lock on directory. 
+ */ + +int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct dentry *cursor = filp->private_data; + struct list_head *p, *q = &cursor->d_u.d_child; + ino_t ino; + int i = filp->f_pos; + + switch (i) { + case 0: + ino = dentry->d_inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + default: + spin_lock(&dentry->d_lock); + if (filp->f_pos == 2) + list_move(q, &dentry->d_subdirs); + + for (p=q->next; p != &dentry->d_subdirs; p=p->next) { + struct dentry *next; + next = list_entry(p, struct dentry, d_u.d_child); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (!simple_positive(next)) { + spin_unlock(&next->d_lock); + continue; + } + + spin_unlock(&next->d_lock); + spin_unlock(&dentry->d_lock); + if (filldir(dirent, next->d_name.name, + next->d_name.len, filp->f_pos, + next->d_inode->i_ino, + dt_type(next->d_inode)) < 0) + return 0; + spin_lock(&dentry->d_lock); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + /* next is still alive */ + list_move(q, p); + spin_unlock(&next->d_lock); + p = q; + filp->f_pos++; + } + spin_unlock(&dentry->d_lock); + } + return 0; +} + +ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) +{ + return -EISDIR; +} + +const struct file_operations simple_dir_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .llseek = dcache_dir_lseek, + .read = generic_read_dir, + .readdir = dcache_readdir, + .fsync = noop_fsync, +}; + +const struct inode_operations simple_dir_inode_operations = { + .lookup = simple_lookup, +}; + +static const struct super_operations simple_super_operations = { + .statfs = simple_statfs, +}; + +/* + * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that + * will never be mountable) + */ +struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, + const struct super_operations *ops, + const struct dentry_operations *dops, unsigned long magic) +{ + struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + struct dentry *dentry; + struct inode *root; + struct qstr d_name = {.name = name, .len = strlen(name)}; + + if (IS_ERR(s)) + return ERR_CAST(s); + + s->s_flags = MS_NOUSER; + s->s_maxbytes = MAX_LFS_FILESIZE; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; + s->s_magic = magic; + s->s_op = ops ? ops : &simple_super_operations; + s->s_time_gran = 1; + root = new_inode(s); + if (!root) + goto Enomem; + /* + * since this is the first inode, make it number 1. New inodes created + * after this must take care not to collide with it (by passing + * max_reserved of 1 to iunique). 
+ */ + root->i_ino = 1; + root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; + root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; + dentry = d_alloc(NULL, &d_name); + if (!dentry) { + iput(root); + goto Enomem; + } + dentry->d_sb = s; + dentry->d_parent = dentry; + d_instantiate(dentry, root); + s->s_root = dentry; + s->s_d_op = dops; + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); + +Enomem: + deactivate_locked_super(s); + return ERR_PTR(-ENOMEM); +} + +int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inc_nlink(inode); + ihold(inode); + dget(dentry); + d_instantiate(dentry, inode); + return 0; +} + +int simple_empty(struct dentry *dentry) +{ + struct dentry *child; + int ret = 0; + + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) { + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); + goto out; + } + spin_unlock(&child->d_lock); + } + ret = 1; +out: + spin_unlock(&dentry->d_lock); + return ret; +} + +int simple_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + drop_nlink(inode); + dput(dentry); + return 0; +} + +int simple_rmdir(struct inode *dir, struct dentry *dentry) +{ + if (!simple_empty(dentry)) + return -ENOTEMPTY; + + drop_nlink(dentry->d_inode); + simple_unlink(dir, dentry); + drop_nlink(dir); + return 0; +} + +int simple_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *inode = old_dentry->d_inode; + int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode); + + if (!simple_empty(new_dentry)) + return -ENOTEMPTY; + + if (new_dentry->d_inode) { + simple_unlink(new_dir, new_dentry); + if (they_are_dirs) + drop_nlink(old_dir); + } else if (they_are_dirs) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } + + old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = + new_dir->i_mtime = inode->i_ctime = CURRENT_TIME; + + return 0; +} + +/** + * simple_setattr - setattr for simple filesystem + * @dentry: dentry + * @iattr: iattr structure + * + * Returns 0 on success, -error on failure. + * + * simple_setattr is a simple ->setattr implementation without a proper + * implementation of size changes. + * + * It can either be used for in-memory filesystems or special files + * on simple regular filesystems. Anything that needs to change on-disk + * or wire state on size changes needs its own setattr method. 
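A pseudo filesystem built on the helper above needs very little glue: its .mount callback forwards to mount_pseudo() with a name and a magic number, and kill_anon_super() tears the superblock down. A minimal sketch under assumed names (examplefs and its magic value are invented for illustration):

#define EXAMPLEFS_MAGIC	0x4578614d	/* arbitrary, for illustration */

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	return mount_pseudo(fs_type, "examplefs:", NULL, NULL,
			    EXAMPLEFS_MAGIC);
}

static struct file_system_type examplefs_type = {
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_anon_super,
};

Kernel-internal users would then obtain a mount with kern_mount(&examplefs_type) and create inodes against it; nothing is ever reachable through mount(2) because mount_pseudo() marks the superblock MS_NOUSER.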
+ */ +int simple_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + WARN_ON_ONCE(inode->i_op->truncate); + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) + truncate_setsize(inode, iattr->ia_size); + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + return 0; +} +EXPORT_SYMBOL(simple_setattr); + +int simple_readpage(struct file *file, struct page *page) +{ + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + return 0; +} + +int simple_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *page; + pgoff_t index; + + index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + *pagep = page; + + if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE); + } + return 0; +} + +/** + * simple_write_end - .write_end helper for non-block-device FSes + * @available: See .write_end of address_space_operations + * @file: " + * @mapping: " + * @pos: " + * @len: " + * @copied: " + * @page: " + * @fsdata: " + * + * simple_write_end does the minimum needed for updating a page after writing is + * done. It has the same API signature as the .write_end of + * address_space_operations vector. So it can just be set onto .write_end for + * FSes that don't need any other processing. i_mutex is assumed to be held. + * Block based filesystems should use generic_write_end(). + * NOTE: Even though i_size might get updated by this function, mark_inode_dirty + * is not called, so a filesystem that actually does store data in .write_inode + * should extend on what's done here with a call to mark_inode_dirty() in the + * case that i_size has changed. + */ +int simple_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + + /* zero the stale part of the page if we did a short copy */ + if (copied < len) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + +/* + * the inodes created here are not hashed. If you use iunique to generate + * unique inode values later for this filesystem, then you must take care + * to pass it an appropriate max_reserved value to avoid collisions. 
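Taken together, simple_readpage, simple_write_begin and simple_write_end are enough to back a page-cache-only file: a RAM-backed filesystem can point its address_space_operations straight at them, roughly as sketched below (the struct name is illustrative, not taken from this patch):

static const struct address_space_operations examplefs_aops = {
	.readpage	= simple_readpage,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
};

The table would be installed on inode->i_mapping->a_ops when the inode is created, so reads see zero-filled pages and writes stay in the page cache.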
+ */ +int simple_fill_super(struct super_block *s, unsigned long magic, + struct tree_descr *files) +{ + struct inode *inode; + struct dentry *root; + struct dentry *dentry; + int i; + + s->s_blocksize = PAGE_CACHE_SIZE; + s->s_blocksize_bits = PAGE_CACHE_SHIFT; + s->s_magic = magic; + s->s_op = &simple_super_operations; + s->s_time_gran = 1; + + inode = new_inode(s); + if (!inode) + return -ENOMEM; + /* + * because the root inode is 1, the files array must not contain an + * entry at index 1 + */ + inode->i_ino = 1; + inode->i_mode = S_IFDIR | 0755; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_nlink = 2; + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + for (i = 0; !files->name || files->name[0]; i++, files++) { + if (!files->name) + continue; + + /* warn if it tries to conflict with the root inode */ + if (unlikely(i == 1)) + printk(KERN_WARNING "%s: %s passed in a files array" + "with an index of 1!\n", __func__, + s->s_type->name); + + dentry = d_alloc_name(root, files->name); + if (!dentry) + goto out; + inode = new_inode(s); + if (!inode) + goto out; + inode->i_mode = S_IFREG | files->mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_fop = files->ops; + inode->i_ino = i; + d_add(dentry, inode); + } + s->s_root = root; + return 0; +out: + d_genocide(root); + dput(root); + return -ENOMEM; +} + +static DEFINE_SPINLOCK(pin_fs_lock); + +int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) +{ + struct vfsmount *mnt = NULL; + spin_lock(&pin_fs_lock); + if (unlikely(!*mount)) { + spin_unlock(&pin_fs_lock); + mnt = vfs_kern_mount(type, 0, type->name, NULL); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + spin_lock(&pin_fs_lock); + if (!*mount) + *mount = mnt; + } + mntget(*mount); + ++*count; + spin_unlock(&pin_fs_lock); + mntput(mnt); + return 0; +} + +void simple_release_fs(struct vfsmount **mount, int *count) +{ + struct vfsmount *mnt; + spin_lock(&pin_fs_lock); + mnt = *mount; + if (!--*count) + *mount = NULL; + spin_unlock(&pin_fs_lock); + mntput(mnt); +} + +/** + * simple_read_from_buffer - copy data from the buffer to user space + * @to: the user space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The simple_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the user space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. 
+ **/ +ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + size_t ret; + + if (pos < 0) + return -EINVAL; + if (pos >= available || !count) + return 0; + if (count > available - pos) + count = available - pos; + ret = copy_to_user(to, from + pos, count); + if (ret == count) + return -EFAULT; + count -= ret; + *ppos = pos + count; + return count; +} + +/** + * simple_write_to_buffer - copy data from user space to the buffer + * @to: the buffer to write to + * @available: the size of the buffer + * @ppos: the current position in the buffer + * @from: the user space buffer to read from + * @count: the maximum number of bytes to read + * + * The simple_write_to_buffer() function reads up to @count bytes from the user + * space address starting at @from into the buffer @to at offset @ppos. + * + * On success, the number of bytes written is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, + const void __user *from, size_t count) +{ + loff_t pos = *ppos; + size_t res; + + if (pos < 0) + return -EINVAL; + if (pos >= available || !count) + return 0; + if (count > available - pos) + count = available - pos; + res = copy_from_user(to + pos, from, count); + if (res == count) + return -EFAULT; + count -= res; + *ppos = pos + count; + return count; +} + +/** + * memory_read_from_buffer - copy data from the buffer + * @to: the kernel space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The memory_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the kernel space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + + if (pos < 0) + return -EINVAL; + if (pos >= available) + return 0; + if (count > available - pos) + count = available - pos; + memcpy(to, from + pos, count); + *ppos = pos + count; + + return count; +} + +/* + * Transaction based IO. + * The file expects a single write which triggers the transaction, and then + * possibly a read which collects the result - which is stored in a + * file-local buffer. + */ + +void simple_transaction_set(struct file *file, size_t n) +{ + struct simple_transaction_argresp *ar = file->private_data; + + BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); + + /* + * The barrier ensures that ar->size will really remain zero until + * ar->data is ready for reading. 
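These buffer helpers do all of the offset and short-copy bookkeeping, so a trivial read-only attribute file (debugfs- or procfs-style) reduces to a one-line ->read handler. A minimal sketch using the helper defined above:

static const char example_msg[] = "hello from examplefs\n";

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	return simple_read_from_buffer(buf, count, ppos,
				       example_msg, sizeof(example_msg) - 1);
}

static const struct file_operations example_fops = {
	.read	= example_read,
	.llseek	= default_llseek,
};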
+ */ + smp_mb(); + ar->size = n; +} + +char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) +{ + struct simple_transaction_argresp *ar; + static DEFINE_SPINLOCK(simple_transaction_lock); + + if (size > SIMPLE_TRANSACTION_LIMIT - 1) + return ERR_PTR(-EFBIG); + + ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); + if (!ar) + return ERR_PTR(-ENOMEM); + + spin_lock(&simple_transaction_lock); + + /* only one write allowed per open */ + if (file->private_data) { + spin_unlock(&simple_transaction_lock); + free_page((unsigned long)ar); + return ERR_PTR(-EBUSY); + } + + file->private_data = ar; + + spin_unlock(&simple_transaction_lock); + + if (copy_from_user(ar->data, buf, size)) + return ERR_PTR(-EFAULT); + + return ar->data; +} + +ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) +{ + struct simple_transaction_argresp *ar = file->private_data; + + if (!ar) + return 0; + return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); +} + +int simple_transaction_release(struct inode *inode, struct file *file) +{ + free_page((unsigned long)file->private_data); + return 0; +} + +/* Simple attribute files */ + +struct simple_attr { + int (*get)(void *, u64 *); + int (*set)(void *, u64); + char get_buf[24]; /* enough to store a u64 and "\n\0" */ + char set_buf[24]; + void *data; + const char *fmt; /* format for read operation */ + struct mutex mutex; /* protects access to these buffers */ +}; + +/* simple_attr_open is called by an actual attribute open file operation + * to set the attribute specific access operations. */ +int simple_attr_open(struct inode *inode, struct file *file, + int (*get)(void *, u64 *), int (*set)(void *, u64), + const char *fmt) +{ + struct simple_attr *attr; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + attr->get = get; + attr->set = set; + attr->data = inode->i_private; + attr->fmt = fmt; + mutex_init(&attr->mutex); + + file->private_data = attr; + + return nonseekable_open(inode, file); +} + +int simple_attr_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +/* read from the buffer that is filled with the get function */ +ssize_t simple_attr_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct simple_attr *attr; + size_t size; + ssize_t ret; + + attr = file->private_data; + + if (!attr->get) + return -EACCES; + + ret = mutex_lock_interruptible(&attr->mutex); + if (ret) + return ret; + + if (*ppos) { /* continued read */ + size = strlen(attr->get_buf); + } else { /* first read */ + u64 val; + ret = attr->get(attr->data, &val); + if (ret) + goto out; + + size = scnprintf(attr->get_buf, sizeof(attr->get_buf), + attr->fmt, (unsigned long long)val); + } + + ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); +out: + mutex_unlock(&attr->mutex); + return ret; +} + +/* interpret the buffer as a number to call the set function with */ +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct simple_attr *attr; + u64 val; + size_t size; + ssize_t ret; + + attr = file->private_data; + if (!attr->set) + return -EACCES; + + ret = mutex_lock_interruptible(&attr->mutex); + if (ret) + return ret; + + ret = -EFAULT; + size = min(sizeof(attr->set_buf) - 1, len); + if (copy_from_user(attr->set_buf, buf, size)) + goto out; + + attr->set_buf[size] = '\0'; + val = simple_strtoll(attr->set_buf, NULL, 0); + ret = 
attr->set(attr->data, val); + if (ret == 0) + ret = len; /* on success, claim we got the whole input */ +out: + mutex_unlock(&attr->mutex); + return ret; +} + +/** + * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation + * @sb: filesystem to do the file handle conversion on + * @fid: file handle to convert + * @fh_len: length of the file handle in bytes + * @fh_type: type of file handle + * @get_inode: filesystem callback to retrieve inode + * + * This function decodes @fid as long as it has one of the well-known + * Linux filehandle types and calls @get_inode on it to retrieve the + * inode for the object specified in the file handle. + */ +struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type, struct inode *(*get_inode) + (struct super_block *sb, u64 ino, u32 gen)) +{ + struct inode *inode = NULL; + + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_INO32_GEN: + case FILEID_INO32_GEN_PARENT: + inode = get_inode(sb, fid->i32.ino, fid->i32.gen); + break; + } + + return d_obtain_alias(inode); +} +EXPORT_SYMBOL_GPL(generic_fh_to_dentry); + +/** + * generic_fh_to_dentry - generic helper for the fh_to_parent export operation + * @sb: filesystem to do the file handle conversion on + * @fid: file handle to convert + * @fh_len: length of the file handle in bytes + * @fh_type: type of file handle + * @get_inode: filesystem callback to retrieve inode + * + * This function decodes @fid as long as it has one of the well-known + * Linux filehandle types and calls @get_inode on it to retrieve the + * inode for the _parent_ object specified in the file handle if it + * is specified in the file handle, or NULL otherwise. + */ +struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type, struct inode *(*get_inode) + (struct super_block *sb, u64 ino, u32 gen)) +{ + struct inode *inode = NULL; + + if (fh_len <= 2) + return NULL; + + switch (fh_type) { + case FILEID_INO32_GEN_PARENT: + inode = get_inode(sb, fid->i32.parent_ino, + (fh_len > 3 ? fid->i32.parent_gen : 0)); + break; + } + + return d_obtain_alias(inode); +} +EXPORT_SYMBOL_GPL(generic_fh_to_parent); + +/** + * generic_file_fsync - generic fsync implementation for simple filesystems + * @file: file to synchronize + * @datasync: only synchronize essential metadata if true + * + * This is a generic implementation of the fsync method for simple + * filesystems which track all non-inode metadata in the buffers list + * hanging off the address_space structure. + */ +int generic_file_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode_metadata(inode, 1); + if (ret == 0) + ret = err; + return ret; +} +EXPORT_SYMBOL(generic_file_fsync); + +/** + * generic_check_addressable - Check addressability of file system + * @blocksize_bits: log of file system block size + * @num_blocks: number of blocks in file system + * + * Determine whether a file system with @num_blocks blocks (and a + * block size of 2**@blocksize_bits) is addressable by the sector_t + * and page cache of the system. Return 0 if so and -EFBIG otherwise. 
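Filesystems whose file handles carry a 32-bit inode number and generation (as the jfs_export_operations earlier in this patch do) can delegate handle decoding to these generic helpers and supply only the inode lookup. A hedged sketch of the wiring; the examplefs_* names, including the iget helper, are invented:

static struct inode *examplefs_nfs_get_inode(struct super_block *sb,
					     u64 ino, u32 generation)
{
	struct inode *inode = examplefs_iget(sb, ino);	/* hypothetical iget helper */

	if (IS_ERR(inode))
		return inode;
	if (generation && inode->i_generation != generation) {
		/* stale handle: the inode number was reused */
		iput(inode);
		return ERR_PTR(-ESTALE);
	}
	return inode;
}

static struct dentry *examplefs_fh_to_dentry(struct super_block *sb,
					     struct fid *fid,
					     int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    examplefs_nfs_get_inode);
}

static const struct export_operations examplefs_export_ops = {
	.fh_to_dentry	= examplefs_fh_to_dentry,
	/* .fh_to_parent could be wired to generic_fh_to_parent the same way */
};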
+ */ +int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) +{ + u64 last_fs_block = num_blocks - 1; + u64 last_fs_page = + last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); + + if (unlikely(num_blocks == 0)) + return 0; + + if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) + return -EINVAL; + + if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || + (last_fs_page > (pgoff_t)(~0ULL))) { + return -EFBIG; + } + return 0; +} +EXPORT_SYMBOL(generic_check_addressable); + +/* + * No-op implementation of ->fsync for in-memory filesystems. + */ +int noop_fsync(struct file *file, int datasync) +{ + return 0; +} + +EXPORT_SYMBOL(dcache_dir_close); +EXPORT_SYMBOL(dcache_dir_lseek); +EXPORT_SYMBOL(dcache_dir_open); +EXPORT_SYMBOL(dcache_readdir); +EXPORT_SYMBOL(generic_read_dir); +EXPORT_SYMBOL(mount_pseudo); +EXPORT_SYMBOL(simple_write_begin); +EXPORT_SYMBOL(simple_write_end); +EXPORT_SYMBOL(simple_dir_inode_operations); +EXPORT_SYMBOL(simple_dir_operations); +EXPORT_SYMBOL(simple_empty); +EXPORT_SYMBOL(simple_fill_super); +EXPORT_SYMBOL(simple_getattr); +EXPORT_SYMBOL(simple_link); +EXPORT_SYMBOL(simple_lookup); +EXPORT_SYMBOL(simple_pin_fs); +EXPORT_SYMBOL(simple_readpage); +EXPORT_SYMBOL(simple_release_fs); +EXPORT_SYMBOL(simple_rename); +EXPORT_SYMBOL(simple_rmdir); +EXPORT_SYMBOL(simple_statfs); +EXPORT_SYMBOL(noop_fsync); +EXPORT_SYMBOL(simple_unlink); +EXPORT_SYMBOL(simple_read_from_buffer); +EXPORT_SYMBOL(simple_write_to_buffer); +EXPORT_SYMBOL(memory_read_from_buffer); +EXPORT_SYMBOL(simple_transaction_set); +EXPORT_SYMBOL(simple_transaction_get); +EXPORT_SYMBOL(simple_transaction_read); +EXPORT_SYMBOL(simple_transaction_release); +EXPORT_SYMBOL_GPL(simple_attr_open); +EXPORT_SYMBOL_GPL(simple_attr_release); +EXPORT_SYMBOL_GPL(simple_attr_read); +EXPORT_SYMBOL_GPL(simple_attr_write); diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile new file mode 100644 index 00000000..ca58d643 --- /dev/null +++ b/fs/lockd/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the linux lock manager stuff +# + +obj-$(CONFIG_LOCKD) += lockd.o + +lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ + svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o +lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o +lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c new file mode 100644 index 00000000..046bb77c --- /dev/null +++ b/fs/lockd/clnt4xdr.c @@ -0,0 +1,605 @@ +/* + * linux/fs/lockd/clnt4xdr.c + * + * XDR functions to encode/decode NLM version 4 RPC arguments and results. + * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN) +# error "NLM host name cannot be larger than NLM's maximum string length!" 
+#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM4_void_sz (0) +#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2)) +#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz) +#define NLM4_holder_sz (6+NLM4_owner_sz) + +#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz) +#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz) +#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz) +#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz) + +#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz) +#define NLM4_res_sz (NLM4_cookie_sz+1) +#define NLM4_norep_sz (0) + + +static s64 loff_t_to_s64(loff_t offset) +{ + s64 res; + + if (offset >= NLM4_OFFSET_MAX) + res = NLM4_OFFSET_MAX; + else if (offset <= -NLM4_OFFSET_MAX) + res = -NLM4_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm4_compute_offsets(const struct nlm_lock *lock, + u64 *l_offset, u64 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + BUG_ON(fl->fl_start > NLM4_OFFSET_MAX); + BUG_ON(fl->fl_end > NLM4_OFFSET_MAX && + fl->fl_end != OFFSET_MAX); + + *l_offset = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv4 basic data types + * + * Basic NLMv4 data types are defined in Appendix II, section 6.1.4 + * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter + * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? 
xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + BUG_ON(length > XDR_MAX_NETOBJ); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + BUG_ON(cookie->len > NLM_MAXCOOKIELEN); + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + BUG_ON(fh->size > NFS3_FHSIZE); + encode_netobj(xdr, (u8 *)&fh->data, fh->size); +} + +/* + * enum nlm4_stats { + * NLM4_GRANTED = 0, + * NLM4_DENIED = 1, + * NLM4_DENIED_NOLOCKS = 2, + * NLM4_BLOCKED = 3, + * NLM4_DENIED_GRACE_PERIOD = 4, + * NLM4_DEADLCK = 5, + * NLM4_ROFS = 6, + * NLM4_STALE_FH = 7, + * NLM4_FBIG = 8, + * NLM4_FAILED = 9 + * }; + * + * struct nlm4_stat { + * nlm4_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. 
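+ * In practice this means the helpers below treat the status as a raw
+ * __be32: encode_nlm4_stat() stores it untouched, and
+ * decode_nlm4_stat() byte-swaps only transiently in order to
+ * range-check the value against the enum above.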
+ */ +static void encode_nlm4_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + BUG_ON(be32_to_cpu(stat) > NLM_FAILED); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(ntohl(*p) > ntohl(nlm4_failed))) + goto out_bad_xdr; + *stat = *p; + return 0; +out_bad_xdr: + dprintk("%s: server returned invalid nlm4_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm4_holder { + * bool exclusive; + * int32 svid; + * netobj oh; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u64 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + +static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u64 l_offset, l_len; + u32 exclusive; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 8 + 8); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + p = xdr_decode_hyper(p, &l_offset); + xdr_decode_hyper(p, &l_len); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + BUG_ON(length > NLM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm4_lock { + * string caller_name; + * netobj fh; + * netobj oh; + * int32 svid; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u64 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 8 + 8); + *p++ = cpu_to_be32(lock->svid); + + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + + +/* + * NLMv4 XDR encode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". 
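+ *
+ * Each encoder below simply writes out the fields of the corresponding
+ * argument structure in declaration order, using the basic data type
+ * encoders defined earlier in this file.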
+ */ + +/* + * struct nlm4_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm4_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_unlockargs { + * netobj cookie; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static void nlm4_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); +} + +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static void nlm4_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); + if (result->status == nlm_lck_denied) + encode_nlm4_holder(xdr, result); +} + + +/* + * NLMv4 XDR decode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". 
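+ *
+ * Being client-side only, this file decodes result types exclusively:
+ * nlm4_xdr_dec_testres() and nlm4_xdr_dec_res(), plus the NULL
+ * "norep" decoder used by the _MSG and _RES procedures in the table
+ * at the end of the file.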
+ */ + +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static int decode_nlm4_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm4_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm4_holder(xdr, result); +out: + return error; +} + +static int nlm4_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static int nlm4_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ +#define nlm4_xdr_dec_norep NULL + +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \ + .p_arglen = NLM4_##argtype##_sz, \ + .p_replen = NLM4_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm4_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +struct rpc_version nlm_version4 = { + .number = 4, + .nrprocs = ARRAY_SIZE(nlm4_procedures), + .procs = nlm4_procedures, +}; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c new file mode 100644 index 00000000..8d4ea835 --- /dev/null +++ b/fs/lockd/clntlock.c @@ -0,0 +1,279 @@ +/* + * linux/fs/lockd/clntlock.c + * + * Lock handling for the client side NLM implementation + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT + +/* + * Local function prototypes + */ +static int reclaimer(void *ptr); + +/* + * The following functions handle blocking and granting from the + * client perspective. + */ + +/* + * This is the representation of a blocked client lock. + */ +struct nlm_wait { + struct list_head b_list; /* linked list */ + wait_queue_head_t b_wait; /* where to wait on */ + struct nlm_host * b_host; + struct file_lock * b_lock; /* local file lock */ + unsigned short b_reclaim; /* got to reclaim lock */ + __be32 b_status; /* grant callback status */ +}; + +static LIST_HEAD(nlm_blocked); +static DEFINE_SPINLOCK(nlm_blocked_lock); + +/** + * nlmclnt_init - Set up per-NFS mount point lockd data structures + * @nlm_init: pointer to arguments structure + * + * Returns pointer to an appropriate nlm_host struct, + * or an ERR_PTR value. 
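+ *
+ * Illustrative (not prescriptive) call sequence for a caller that has
+ * filled in a struct nlmclnt_initdata named nlm_init:
+ *
+ *	host = nlmclnt_init(&nlm_init);
+ *	if (IS_ERR(host))
+ *		return PTR_ERR(host);
+ *	...
+ *	nlmclnt_done(host);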
+ */ +struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) +{ + struct nlm_host *host; + u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; + int status; + + status = lockd_up(); + if (status < 0) + return ERR_PTR(status); + + host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, + nlm_init->protocol, nlm_version, + nlm_init->hostname, nlm_init->noresvport); + if (host == NULL) { + lockd_down(); + return ERR_PTR(-ENOLCK); + } + + return host; +} +EXPORT_SYMBOL_GPL(nlmclnt_init); + +/** + * nlmclnt_done - Release resources allocated by nlmclnt_init() + * @host: nlm_host structure reserved by nlmclnt_init() + * + */ +void nlmclnt_done(struct nlm_host *host) +{ + nlmclnt_release_host(host); + lockd_down(); +} +EXPORT_SYMBOL_GPL(nlmclnt_done); + +/* + * Queue up a lock for blocking so that the GRANTED request can see it + */ +struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl) +{ + struct nlm_wait *block; + + block = kmalloc(sizeof(*block), GFP_KERNEL); + if (block != NULL) { + block->b_host = host; + block->b_lock = fl; + init_waitqueue_head(&block->b_wait); + block->b_status = nlm_lck_blocked; + + spin_lock(&nlm_blocked_lock); + list_add(&block->b_list, &nlm_blocked); + spin_unlock(&nlm_blocked_lock); + } + return block; +} + +void nlmclnt_finish_block(struct nlm_wait *block) +{ + if (block == NULL) + return; + spin_lock(&nlm_blocked_lock); + list_del(&block->b_list); + spin_unlock(&nlm_blocked_lock); + kfree(block); +} + +/* + * Block on a lock + */ +int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) +{ + long ret; + + /* A borken server might ask us to block even if we didn't + * request it. Just say no! + */ + if (block == NULL) + return -EAGAIN; + + /* Go to sleep waiting for GRANT callback. Some servers seem + * to lose callbacks, however, so we're going to poll from + * time to time just to make sure. + * + * For now, the retry frequency is pretty high; normally + * a 1 minute timeout would do. See the comment before + * nlmclnt_lock for an explanation. + */ + ret = wait_event_interruptible_timeout(block->b_wait, + block->b_status != nlm_lck_blocked, + timeout); + if (ret < 0) + return -ERESTARTSYS; + req->a_res.status = block->b_status; + return 0; +} + +/* + * The server lockd has called us back to tell us the lock was granted + */ +__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) +{ + const struct file_lock *fl = &lock->fl; + const struct nfs_fh *fh = &lock->fh; + struct nlm_wait *block; + __be32 res = nlm_lck_denied; + + /* + * Look up blocked request based on arguments. + * Warning: must not use cookie to match it! + */ + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + struct file_lock *fl_blocked = block->b_lock; + + if (fl_blocked->fl_start != fl->fl_start) + continue; + if (fl_blocked->fl_end != fl->fl_end) + continue; + /* + * Careful! The NLM server will return the 32-bit "pid" that + * we put on the wire: in this case the lockowner "pid". + */ + if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) + continue; + if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) + continue; + if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) + continue; + /* Alright, we found a lock. 
Set the return status + * and wake up the caller + */ + block->b_status = nlm_granted; + wake_up(&block->b_wait); + res = nlm_granted; + } + spin_unlock(&nlm_blocked_lock); + return res; +} + +/* + * The following procedures deal with the recovery of locks after a + * server crash. + */ + +/* + * Reclaim all locks on server host. We do this by spawning a separate + * reclaimer thread. + */ +void +nlmclnt_recovery(struct nlm_host *host) +{ + struct task_struct *task; + + if (!host->h_reclaiming++) { + nlm_get_host(host); + task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name); + if (IS_ERR(task)) + printk(KERN_ERR "lockd: unable to spawn reclaimer " + "thread. Locks for %s won't be reclaimed! " + "(%ld)\n", host->h_name, PTR_ERR(task)); + } +} + +static int +reclaimer(void *ptr) +{ + struct nlm_host *host = (struct nlm_host *) ptr; + struct nlm_wait *block; + struct file_lock *fl, *next; + u32 nsmstate; + + allow_signal(SIGKILL); + + down_write(&host->h_rwsem); + lockd_up(); /* note: this cannot fail as lockd is already running */ + + dprintk("lockd: reclaiming locks for host %s\n", host->h_name); + +restart: + nsmstate = host->h_nsmstate; + + /* Force a portmap getport - the peer's lockd will + * most likely end up on a different port. + */ + host->h_nextrebind = jiffies; + nlm_rebind_host(host); + + /* First, reclaim all locks that have been granted. */ + list_splice_init(&host->h_granted, &host->h_reclaim); + list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { + list_del_init(&fl->fl_u.nfs_fl.list); + + /* + * sending this thread a SIGKILL will result in any unreclaimed + * locks being removed from the h_granted list. This means that + * the kernel will not attempt to reclaim them again if a new + * reclaimer thread is spawned for this host. + */ + if (signalled()) + continue; + if (nlmclnt_reclaim(host, fl) != 0) + continue; + list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); + if (host->h_nsmstate != nsmstate) { + /* Argh! The server rebooted again! 
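+ * A changed h_nsmstate means another reboot occurred while we
+ * were still reclaiming, so the whole pass is restarted.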
*/ + goto restart; + } + } + + host->h_reclaiming = 0; + up_write(&host->h_rwsem); + dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); + + /* Now, wake up all processes that sleep on a blocked lock */ + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + if (block->b_host == host) { + block->b_status = nlm_lck_denied_grace_period; + wake_up(&block->b_wait); + } + } + spin_unlock(&nlm_blocked_lock); + + /* Release host handle after use */ + nlmclnt_release_host(host); + lockd_down(); + return 0; +} diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c new file mode 100644 index 00000000..e374050a --- /dev/null +++ b/fs/lockd/clntproc.c @@ -0,0 +1,848 @@ +/* + * linux/fs/lockd/clntproc.c + * + * RPC procedures for the client side NLM implementation + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT +#define NLMCLNT_GRACE_WAIT (5*HZ) +#define NLMCLNT_POLL_TIMEOUT (30*HZ) +#define NLMCLNT_MAX_RETRIES 3 + +static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); +static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); +static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); +static int nlm_stat_to_errno(__be32 stat); +static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); +static int nlmclnt_cancel(struct nlm_host *, int , struct file_lock *); + +static const struct rpc_call_ops nlmclnt_unlock_ops; +static const struct rpc_call_ops nlmclnt_cancel_ops; + +/* + * Cookie counter for NLM requests + */ +static atomic_t nlm_cookie = ATOMIC_INIT(0x1234); + +void nlmclnt_next_cookie(struct nlm_cookie *c) +{ + u32 cookie = atomic_inc_return(&nlm_cookie); + + memcpy(c->data, &cookie, 4); + c->len=4; +} + +static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner) +{ + atomic_inc(&lockowner->count); + return lockowner; +} + +static void nlm_put_lockowner(struct nlm_lockowner *lockowner) +{ + if (!atomic_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) + return; + list_del(&lockowner->list); + spin_unlock(&lockowner->host->h_lock); + nlmclnt_release_host(lockowner->host); + kfree(lockowner); +} + +static inline int nlm_pidbusy(struct nlm_host *host, uint32_t pid) +{ + struct nlm_lockowner *lockowner; + list_for_each_entry(lockowner, &host->h_lockowners, list) { + if (lockowner->pid == pid) + return -EBUSY; + } + return 0; +} + +static inline uint32_t __nlm_alloc_pid(struct nlm_host *host) +{ + uint32_t res; + do { + res = host->h_pidcount++; + } while (nlm_pidbusy(host, res) < 0); + return res; +} + +static struct nlm_lockowner *__nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner) +{ + struct nlm_lockowner *lockowner; + list_for_each_entry(lockowner, &host->h_lockowners, list) { + if (lockowner->owner != owner) + continue; + return nlm_get_lockowner(lockowner); + } + return NULL; +} + +static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_t owner) +{ + struct nlm_lockowner *res, *new = NULL; + + spin_lock(&host->h_lock); + res = __nlm_find_lockowner(host, owner); + if (res == NULL) { + spin_unlock(&host->h_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + spin_lock(&host->h_lock); + res = __nlm_find_lockowner(host, owner); + if (res == NULL && new != NULL) { + res = new; + atomic_set(&new->count, 1); + new->owner = owner; + new->pid = __nlm_alloc_pid(host); + new->host = nlm_get_host(host); + 
list_add(&new->list, &host->h_lockowners); + new = NULL; + } + } + spin_unlock(&host->h_lock); + kfree(new); + return res; +} + +/* + * Initialize arguments for TEST/LOCK/UNLOCK/CANCEL calls + */ +static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) +{ + struct nlm_args *argp = &req->a_args; + struct nlm_lock *lock = &argp->lock; + + nlmclnt_next_cookie(&argp->cookie); + memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh)); + lock->caller = utsname()->nodename; + lock->oh.data = req->a_owner; + lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", + (unsigned int)fl->fl_u.nfs_fl.owner->pid, + utsname()->nodename); + lock->svid = fl->fl_u.nfs_fl.owner->pid; + lock->fl.fl_start = fl->fl_start; + lock->fl.fl_end = fl->fl_end; + lock->fl.fl_type = fl->fl_type; +} + +static void nlmclnt_release_lockargs(struct nlm_rqst *req) +{ + BUG_ON(req->a_args.lock.fl.fl_ops != NULL); +} + +/** + * nlmclnt_proc - Perform a single client-side lock request + * @host: address of a valid nlm_host context representing the NLM server + * @cmd: fcntl-style file lock operation to perform + * @fl: address of arguments for the lock operation + * + */ +int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) +{ + struct nlm_rqst *call; + int status; + + nlm_get_host(host); + call = nlm_alloc_call(host); + if (call == NULL) + return -ENOMEM; + + nlmclnt_locks_init_private(fl, host); + /* Set up the argument struct */ + nlmclnt_setlockargs(call, fl); + + if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { + if (fl->fl_type != F_UNLCK) { + call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; + status = nlmclnt_lock(call, fl); + } else + status = nlmclnt_unlock(call, fl); + } else if (IS_GETLK(cmd)) + status = nlmclnt_test(call, fl); + else + status = -EINVAL; + fl->fl_ops->fl_release_private(fl); + fl->fl_ops = NULL; + + dprintk("lockd: clnt proc returns %d\n", status); + return status; +} +EXPORT_SYMBOL_GPL(nlmclnt_proc); + +/* + * Allocate an NLM RPC call struct + * + * Note: the caller must hold a reference to host. In case of failure, + * this reference will be released. 
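+ *
+ * Allocation failures are retried every five seconds until either the
+ * allocation succeeds or the calling process is signalled; only in the
+ * signalled case is the host reference dropped and NULL returned.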
+ */ +struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) +{ + struct nlm_rqst *call; + + for(;;) { + call = kzalloc(sizeof(*call), GFP_KERNEL); + if (call != NULL) { + atomic_set(&call->a_count, 1); + locks_init_lock(&call->a_args.lock.fl); + locks_init_lock(&call->a_res.lock.fl); + call->a_host = host; + return call; + } + if (signalled()) + break; + printk("nlm_alloc_call: failed, waiting for memory\n"); + schedule_timeout_interruptible(5*HZ); + } + nlmclnt_release_host(host); + return NULL; +} + +void nlmclnt_release_call(struct nlm_rqst *call) +{ + if (!atomic_dec_and_test(&call->a_count)) + return; + nlmclnt_release_host(call->a_host); + nlmclnt_release_lockargs(call); + kfree(call); +} + +static void nlmclnt_rpc_release(void *data) +{ + nlmclnt_release_call(data); +} + +static int nlm_wait_on_grace(wait_queue_head_t *queue) +{ + DEFINE_WAIT(wait); + int status = -EINTR; + + prepare_to_wait(queue, &wait, TASK_INTERRUPTIBLE); + if (!signalled ()) { + schedule_timeout(NLMCLNT_GRACE_WAIT); + try_to_freeze(); + if (!signalled ()) + status = 0; + } + finish_wait(queue, &wait); + return status; +} + +/* + * Generic NLM call + */ +static int +nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc) +{ + struct nlm_host *host = req->a_host; + struct rpc_clnt *clnt; + struct nlm_args *argp = &req->a_args; + struct nlm_res *resp = &req->a_res; + struct rpc_message msg = { + .rpc_argp = argp, + .rpc_resp = resp, + .rpc_cred = cred, + }; + int status; + + dprintk("lockd: call procedure %d on %s\n", + (int)proc, host->h_name); + + do { + if (host->h_reclaiming && !argp->reclaim) + goto in_grace_period; + + /* If we have no RPC client yet, create one. */ + if ((clnt = nlm_bind_host(host)) == NULL) + return -ENOLCK; + msg.rpc_proc = &clnt->cl_procinfo[proc]; + + /* Perform the RPC call. If an error occurs, try again */ + if ((status = rpc_call_sync(clnt, &msg, 0)) < 0) { + dprintk("lockd: rpc_call returned error %d\n", -status); + switch (status) { + case -EPROTONOSUPPORT: + status = -EINVAL; + break; + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ENOTCONN: + nlm_rebind_host(host); + status = -EAGAIN; + break; + case -ERESTARTSYS: + return signalled () ? -EINTR : status; + default: + break; + } + break; + } else + if (resp->status == nlm_lck_denied_grace_period) { + dprintk("lockd: server in grace period\n"); + if (argp->reclaim) { + printk(KERN_WARNING + "lockd: spurious grace period reject?!\n"); + return -ENOLCK; + } + } else { + if (!argp->reclaim) { + /* We appear to be out of the grace period */ + wake_up_all(&host->h_gracewait); + } + dprintk("lockd: server returns status %d\n", resp->status); + return 0; /* Okay, call complete */ + } + +in_grace_period: + /* + * The server has rebooted and appears to be in the grace + * period during which locks are only allowed to be + * reclaimed. + * We can only back off and try again later. + */ + status = nlm_wait_on_grace(&host->h_gracewait); + } while (status == 0); + + return status; +} + +/* + * Generic NLM call, async version. + */ +static struct rpc_task *__nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +{ + struct nlm_host *host = req->a_host; + struct rpc_clnt *clnt; + struct rpc_task_setup task_setup_data = { + .rpc_message = msg, + .callback_ops = tk_ops, + .callback_data = req, + .flags = RPC_TASK_ASYNC, + }; + + dprintk("lockd: call procedure %d on %s (async)\n", + (int)proc, host->h_name); + + /* If we have no RPC client yet, create one. 
*/ + clnt = nlm_bind_host(host); + if (clnt == NULL) + goto out_err; + msg->rpc_proc = &clnt->cl_procinfo[proc]; + task_setup_data.rpc_client = clnt; + + /* bootstrap and kick off the async RPC call */ + return rpc_run_task(&task_setup_data); +out_err: + tk_ops->rpc_release(req); + return ERR_PTR(-ENOLCK); +} + +static int nlm_do_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +{ + struct rpc_task *task; + + task = __nlm_async_call(req, proc, msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} + +/* + * NLM asynchronous call. + */ +int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_args, + .rpc_resp = &req->a_res, + }; + return nlm_do_async_call(req, proc, &msg, tk_ops); +} + +int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_res, + }; + return nlm_do_async_call(req, proc, &msg, tk_ops); +} + +/* + * NLM client asynchronous call. + * + * Note that although the calls are asynchronous, and are therefore + * guaranteed to complete, we still always attempt to wait for + * completion in order to be able to correctly track the lock + * state. + */ +static int nlmclnt_async_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_args, + .rpc_resp = &req->a_res, + .rpc_cred = cred, + }; + struct rpc_task *task; + int err; + + task = __nlm_async_call(req, proc, &msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + err = rpc_wait_for_completion_task(task); + rpc_put_task(task); + return err; +} + +/* + * TEST for the presence of a conflicting lock + */ +static int +nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) +{ + int status; + + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST); + if (status < 0) + goto out; + + switch (req->a_res.status) { + case nlm_granted: + fl->fl_type = F_UNLCK; + break; + case nlm_lck_denied: + /* + * Report the conflicting lock back to the application. 
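+ * The conflicting holder is remote, so its pid has no local
+ * meaning; fl_pid is cleared accordingly.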
+ */ + fl->fl_start = req->a_res.lock.fl.fl_start; + fl->fl_end = req->a_res.lock.fl.fl_end; + fl->fl_type = req->a_res.lock.fl.fl_type; + fl->fl_pid = 0; + break; + default: + status = nlm_stat_to_errno(req->a_res.status); + } +out: + nlmclnt_release_call(req); + return status; +} + +static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); + new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; + new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); + list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); +} + +static void nlmclnt_locks_release_private(struct file_lock *fl) +{ + spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock); + list_del(&fl->fl_u.nfs_fl.list); + spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock); + nlm_put_lockowner(fl->fl_u.nfs_fl.owner); +} + +static const struct file_lock_operations nlmclnt_lock_ops = { + .fl_copy_lock = nlmclnt_locks_copy_lock, + .fl_release_private = nlmclnt_locks_release_private, +}; + +static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host) +{ + BUG_ON(fl->fl_ops != NULL); + fl->fl_u.nfs_fl.state = 0; + fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner); + INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); + fl->fl_ops = &nlmclnt_lock_ops; +} + +static int do_vfs_lock(struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(fl->fl_file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(fl->fl_file, fl); + break; + default: + BUG(); + } + return res; +} + +/* + * LOCK: Try to create a lock + * + * Programmer Harassment Alert + * + * When given a blocking lock request in a sync RPC call, the HPUX lockd + * will faithfully return LCK_BLOCKED but never cares to notify us when + * the lock could be granted. This way, our local process could hang + * around forever waiting for the callback. + * + * Solution A: Implement busy-waiting + * Solution B: Use the async version of the call (NLM_LOCK_{MSG,RES}) + * + * For now I am implementing solution A, because I hate the idea of + * re-implementing lockd for a third time in two months. The async + * calls shouldn't be too hard to do, however. + * + * This is one of the lovely things about standards in the NFS area: + * they're so soft and squishy you can't really blame HP for doing this. + */ +static int +nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) +{ + struct rpc_cred *cred = nfs_file_cred(fl->fl_file); + struct nlm_host *host = req->a_host; + struct nlm_res *resp = &req->a_res; + struct nlm_wait *block = NULL; + unsigned char fl_flags = fl->fl_flags; + unsigned char fl_type; + int status = -ENOLCK; + + if (nsm_monitor(host) < 0) + goto out; + req->a_args.state = nsm_local_state; + + fl->fl_flags |= FL_ACCESS; + status = do_vfs_lock(fl); + fl->fl_flags = fl_flags; + if (status < 0) + goto out; + + block = nlmclnt_prepare_block(host, fl); +again: + /* + * Initialise resp->status to a valid non-zero value, + * since 0 == nlm_lck_granted + */ + resp->status = nlm_lck_blocked; + for(;;) { + /* Reboot protection */ + fl->fl_u.nfs_fl.state = host->h_state; + status = nlmclnt_call(cred, req, NLMPROC_LOCK); + if (status < 0) + break; + /* Did a reclaimer thread notify us of a server reboot? 
*/ + if (resp->status == nlm_lck_denied_grace_period) + continue; + if (resp->status != nlm_lck_blocked) + break; + /* Wait on an NLM blocking lock */ + status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); + if (status < 0) + break; + if (resp->status != nlm_lck_blocked) + break; + } + + /* if we were interrupted while blocking, then cancel the lock request + * and exit + */ + if (resp->status == nlm_lck_blocked) { + if (!req->a_args.block) + goto out_unlock; + if (nlmclnt_cancel(host, req->a_args.block, fl) == 0) + goto out_unblock; + } + + if (resp->status == nlm_granted) { + down_read(&host->h_rwsem); + /* Check whether or not the server has rebooted */ + if (fl->fl_u.nfs_fl.state != host->h_state) { + up_read(&host->h_rwsem); + goto again; + } + /* Ensure the resulting lock will get added to granted list */ + fl->fl_flags |= FL_SLEEP; + if (do_vfs_lock(fl) < 0) + printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); + up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; + status = 0; + } + if (status < 0) + goto out_unlock; + /* + * EAGAIN doesn't make sense for sleeping locks, and in some + * cases NLM_LCK_DENIED is returned for a permanent error. So + * turn it into an ENOLCK. + */ + if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP)) + status = -ENOLCK; + else + status = nlm_stat_to_errno(resp->status); +out_unblock: + nlmclnt_finish_block(block); +out: + nlmclnt_release_call(req); + return status; +out_unlock: + /* Fatal error: ensure that we remove the lock altogether */ + dprintk("lockd: lock attempt ended in fatal error.\n" + " Attempting to unlock.\n"); + nlmclnt_finish_block(block); + fl_type = fl->fl_type; + fl->fl_type = F_UNLCK; + down_read(&host->h_rwsem); + do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_type = fl_type; + fl->fl_flags = fl_flags; + nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + return status; +} + +/* + * RECLAIM: Try to reclaim a lock + */ +int +nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl) +{ + struct nlm_rqst reqst, *req; + int status; + + req = &reqst; + memset(req, 0, sizeof(*req)); + locks_init_lock(&req->a_args.lock.fl); + locks_init_lock(&req->a_res.lock.fl); + req->a_host = host; + req->a_flags = 0; + + /* Set up the argument struct */ + nlmclnt_setlockargs(req, fl); + req->a_args.reclaim = 1; + + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK); + if (status >= 0 && req->a_res.status == nlm_granted) + return 0; + + printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d " + "(errno %d, status %d)\n", fl->fl_pid, + status, ntohl(req->a_res.status)); + + /* + * FIXME: This is a serious failure. We can + * + * a. Ignore the problem + * b. Send the owning process some signal (Linux doesn't have + * SIGLOST, though...) + * c. Retry the operation + * + * Until someone comes up with a simple implementation + * for b or c, I'll choose option a. + */ + + return -ENOLCK; +} + +/* + * UNLOCK: remove an existing lock + */ +static int +nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) +{ + struct nlm_host *host = req->a_host; + struct nlm_res *resp = &req->a_res; + int status; + unsigned char fl_flags = fl->fl_flags; + + /* + * Note: the server is supposed to either grant us the unlock + * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either + * case, we want to unlock. 
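+ *
+ * FL_EXISTS is set below so that the local unlock reports -ENOENT when
+ * no matching lock is found; in that case the UNLOCK RPC to the server
+ * is skipped entirely.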
+ */ + fl->fl_flags |= FL_EXISTS; + down_read(&host->h_rwsem); + status = do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; + if (status == -ENOENT) { + status = 0; + goto out; + } + + atomic_inc(&req->a_count); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_UNLOCK, &nlmclnt_unlock_ops); + if (status < 0) + goto out; + + if (resp->status == nlm_granted) + goto out; + + if (resp->status != nlm_lck_denied_nolocks) + printk("lockd: unexpected unlock status: %d\n", resp->status); + /* What to do now? I'm out of my depth... */ + status = -ENOLCK; +out: + nlmclnt_release_call(req); + return status; +} + +static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + u32 status = ntohl(req->a_res.status); + + if (RPC_ASSASSINATED(task)) + goto die; + + if (task->tk_status < 0) { + dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status); + switch (task->tk_status) { + case -EACCES: + case -EIO: + goto die; + default: + goto retry_rebind; + } + } + if (status == NLM_LCK_DENIED_GRACE_PERIOD) { + rpc_delay(task, NLMCLNT_GRACE_WAIT); + goto retry_unlock; + } + if (status != NLM_LCK_GRANTED) + printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status); +die: + return; + retry_rebind: + nlm_rebind_host(req->a_host); + retry_unlock: + rpc_restart_call(task); +} + +static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_done = nlmclnt_unlock_callback, + .rpc_release = nlmclnt_rpc_release, +}; + +/* + * Cancel a blocked lock request. + * We always use an async RPC call for this in order not to hang a + * process that has been Ctrl-C'ed. + */ +static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl) +{ + struct nlm_rqst *req; + int status; + + dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" + " Attempting to cancel lock.\n"); + + req = nlm_alloc_call(nlm_get_host(host)); + if (!req) + return -ENOMEM; + req->a_flags = RPC_TASK_ASYNC; + + nlmclnt_setlockargs(req, fl); + req->a_args.block = block; + + atomic_inc(&req->a_count); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_CANCEL, &nlmclnt_cancel_ops); + if (status == 0 && req->a_res.status == nlm_lck_denied) + status = -ENOLCK; + nlmclnt_release_call(req); + return status; +} + +static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + u32 status = ntohl(req->a_res.status); + + if (RPC_ASSASSINATED(task)) + goto die; + + if (task->tk_status < 0) { + dprintk("lockd: CANCEL call error %d, retrying.\n", + task->tk_status); + goto retry_cancel; + } + + dprintk("lockd: cancel status %u (task %u)\n", + status, task->tk_pid); + + switch (status) { + case NLM_LCK_GRANTED: + case NLM_LCK_DENIED_GRACE_PERIOD: + case NLM_LCK_DENIED: + /* Everything's good */ + break; + case NLM_LCK_DENIED_NOLOCKS: + dprintk("lockd: CANCEL failed (server has no locks)\n"); + goto retry_cancel; + default: + printk(KERN_NOTICE "lockd: weird return %d for CANCEL call\n", + status); + } + +die: + return; + +retry_cancel: + /* Don't ever retry more than 3 times */ + if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) + goto die; + nlm_rebind_host(req->a_host); + rpc_restart_call(task); + rpc_delay(task, 30 * HZ); +} + +static const struct rpc_call_ops nlmclnt_cancel_ops = { + .rpc_call_done = nlmclnt_cancel_callback, + .rpc_release = nlmclnt_rpc_release, +}; + +/* + * Convert an NLM status code to a generic kernel errno + */ +static int 
+nlm_stat_to_errno(__be32 status) +{ + switch(ntohl(status)) { + case NLM_LCK_GRANTED: + return 0; + case NLM_LCK_DENIED: + return -EAGAIN; + case NLM_LCK_DENIED_NOLOCKS: + case NLM_LCK_DENIED_GRACE_PERIOD: + return -ENOLCK; + case NLM_LCK_BLOCKED: + printk(KERN_NOTICE "lockd: unexpected status NLM_BLOCKED\n"); + return -ENOLCK; +#ifdef CONFIG_LOCKD_V4 + case NLM_DEADLCK: + return -EDEADLK; + case NLM_ROFS: + return -EROFS; + case NLM_STALE_FH: + return -ESTALE; + case NLM_FBIG: + return -EOVERFLOW; + case NLM_FAILED: + return -ENOLCK; +#endif + } + printk(KERN_NOTICE "lockd: unexpected server status %d\n", status); + return -ENOLCK; +} diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c new file mode 100644 index 00000000..36057ced --- /dev/null +++ b/fs/lockd/clntxdr.c @@ -0,0 +1,627 @@ +/* + * linux/fs/lockd/clntxdr.c + * + * XDR functions to encode/decode NLM version 3 RPC arguments and results. + * NLM version 3 is backwards compatible with NLM versions 1 and 2. + * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2)) +#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz) +#define NLM_holder_sz (4+NLM_owner_sz) + +#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz) +#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz) +#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz) +#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz) + +#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz) +#define NLM_res_sz (NLM_cookie_sz+1) +#define NLM_norep_sz (0) + + +static s32 loff_t_to_s32(loff_t offset) +{ + s32 res; + + if (offset >= NLM_OFFSET_MAX) + res = NLM_OFFSET_MAX; + else if (offset <= -NLM_OFFSET_MAX) + res = -NLM_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm_compute_offsets(const struct nlm_lock *lock, + u32 *l_offset, u32 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + BUG_ON(fl->fl_start > NLM_OFFSET_MAX); + BUG_ON(fl->fl_end > NLM_OFFSET_MAX && + fl->fl_end != OFFSET_MAX); + + *l_offset = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv3 basic data types + * + * Basic NLMv3 data types are not defined in an IETF standards + * document. X/Open has a description of these data types that + * is useful. See Chapter 10 of "Protocols for Interworking: + * XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? 
xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + BUG_ON(length > XDR_MAX_NETOBJ); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + BUG_ON(cookie->len > NLM_MAXCOOKIELEN); + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + BUG_ON(fh->size != NFS2_FHSIZE); + encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE); +} + +/* + * enum nlm_stats { + * LCK_GRANTED = 0, + * LCK_DENIED = 1, + * LCK_DENIED_NOLOCKS = 2, + * LCK_BLOCKED = 3, + * LCK_DENIED_GRACE_PERIOD = 4 + * }; + * + * + * struct nlm_stat { + * nlm_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. 
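+ * Note that NLM versions 1-3 define only these five status values; the
+ * extended codes (NLM4_DEADLCK through NLM4_FAILED) exist only in the
+ * v4 protocol handled by clnt4xdr.c.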
+ */ + +static void encode_nlm_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm_stat(struct xdr_stream *xdr, + __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(ntohl(*p) > ntohl(nlm_lck_denied_grace_period))) + goto out_enum; + *stat = *p; + return 0; +out_enum: + dprintk("%s: server returned invalid nlm_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm_holder { + * bool exclusive; + * int uppid; + * netobj oh; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u32 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + +static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u32 exclusive, l_offset, l_len; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK; + l_offset = be32_to_cpup(p++); + l_len = be32_to_cpup(p); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + BUG_ON(length > NLM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm_lock { + * string caller_name; + * netobj fh; + * netobj oh; + * int uppid; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u32 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(lock->svid); + + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + + +/* + * NLMv3 XDR encode functions + * + * NLMv3 argument types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". 
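+ *
+ * The structure of this file mirrors clnt4xdr.c; an important
+ * difference is that file offsets and lengths are 32-bit quantities
+ * here, clamped by loff_t_to_s32() above, rather than the 64-bit
+ * values used by NLMv4.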
+ */ + +/* + * struct nlm_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_unlockargs { + * netobj cookie; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static void nlm_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); +} + +/* + * union nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static void encode_nlm_testrply(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + if (result->status == nlm_lck_denied) + encode_nlm_holder(xdr, result); +} + +static void nlm_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); + encode_nlm_testrply(xdr, result); +} + + +/* + * NLMv3 XDR decode functions + * + * NLMv3 result types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". 
+ */ + +/* + * union nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static int decode_nlm_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm_holder(xdr, result); +out: + return error; +} + +static int nlm_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static int nlm_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ +#define nlm_xdr_dec_norep NULL + +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \ + .p_arglen = NLM_##argtype##_sz, \ + .p_replen = NLM_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +static struct rpc_version nlm_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static struct rpc_version nlm_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static struct rpc_version *nlm_versions[] = { + [1] = &nlm_version1, + [3] = &nlm_version3, +#ifdef CONFIG_LOCKD_V4 + [4] = &nlm_version4, +#endif +}; + +static struct rpc_stat nlm_rpc_stats; + +struct rpc_program nlm_program = { + .name = "lockd", + .number = NLM_PROGRAM, + .nrvers = ARRAY_SIZE(nlm_versions), + .version = nlm_versions, + .stats = &nlm_rpc_stats, +}; diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c new file mode 100644 index 00000000..183cc1f0 --- /dev/null +++ b/fs/lockd/grace.c @@ -0,0 +1,59 @@ +/* + * Common code for control of lockd and nfsv4 grace periods. + */ + +#include +#include + +static LIST_HEAD(grace_list); +static DEFINE_SPINLOCK(grace_lock); + +/** + * locks_start_grace + * @lm: who this grace period is for + * + * A grace period is a period during which locks should not be given + * out. Currently grace periods are only enforced by the two lock + * managers (lockd and nfsd), using the locks_in_grace() function to + * check when they are in a grace period. + * + * This function is called to start a grace period. 
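+ *
+ * An illustrative (not prescriptive) usage pattern for a lock manager
+ * with a struct lock_manager named lm:
+ *
+ *	locks_start_grace(&lm);
+ *	... answer only reclaim requests while locks_in_grace() ...
+ *	locks_end_grace(&lm);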
+ */ +void locks_start_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_add(&lm->list, &grace_list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_start_grace); + +/** + * locks_end_grace + * @lm: who this grace period is for + * + * Call this function to state that the given lock manager is ready to + * resume regular locking. The grace period will not end until all lock + * managers that called locks_start_grace() also call locks_end_grace(). + * Note that callers count on it being safe to call this more than once, + * and the second call should be a no-op. + */ +void locks_end_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_del_init(&lm->list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_end_grace); + +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +int locks_in_grace(void) +{ + return !list_empty(&grace_list); +} +EXPORT_SYMBOL_GPL(locks_in_grace); diff --git a/fs/lockd/host.c b/fs/lockd/host.c new file mode 100644 index 00000000..b7c99bfb --- /dev/null +++ b/fs/lockd/host.c @@ -0,0 +1,649 @@ +/* + * linux/fs/lockd/host.c + * + * Management for NLM peer hosts. The nlm_host struct is shared + * between client and server implementation. The only reason to + * do so is to reduce code bloat. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NLMDBG_FACILITY NLMDBG_HOSTCACHE +#define NLM_HOST_NRHASH 32 +#define NLM_HOST_REBIND (60 * HZ) +#define NLM_HOST_EXPIRE (300 * HZ) +#define NLM_HOST_COLLECT (120 * HZ) + +static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH]; +static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; + +#define for_each_host(host, pos, chain, table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry((host), (pos), (chain), h_hash) + +#define for_each_host_safe(host, pos, next, chain, table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry_safe((host), (pos), (next), \ + (chain), h_hash) + +static unsigned long next_gc; +static unsigned long nrhosts; +static DEFINE_MUTEX(nlm_host_mutex); + +static void nlm_gc_hosts(void); + +struct nlm_lookup_host_info { + const int server; /* search for server|client */ + const struct sockaddr *sap; /* address to search for */ + const size_t salen; /* it's length */ + const unsigned short protocol; /* transport to search for*/ + const u32 version; /* NLM version to search for */ + const char *hostname; /* remote's hostname */ + const size_t hostname_len; /* it's length */ + const int noresvport; /* use non-priv port */ +}; + +/* + * Hash function must work well on big- and little-endian platforms + */ +static unsigned int __nlm_hash32(const __be32 n) +{ + unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16); + return hash ^ (hash >> 8); +} + +static unsigned int __nlm_hash_addr4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + return __nlm_hash32(sin->sin_addr.s_addr); +} + +static unsigned int __nlm_hash_addr6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr addr = sin6->sin6_addr; + return __nlm_hash32(addr.s6_addr32[0]) ^ + __nlm_hash32(addr.s6_addr32[1]) ^ + __nlm_hash32(addr.s6_addr32[2]) ^ + 
__nlm_hash32(addr.s6_addr32[3]); +} + +static unsigned int nlm_hash_address(const struct sockaddr *sap) +{ + unsigned int hash; + + switch (sap->sa_family) { + case AF_INET: + hash = __nlm_hash_addr4(sap); + break; + case AF_INET6: + hash = __nlm_hash_addr6(sap); + break; + default: + hash = 0; + } + return hash & (NLM_HOST_NRHASH - 1); +} + +/* + * Allocate and initialize an nlm_host. Common to both client and server. + */ +static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, + struct nsm_handle *nsm) +{ + struct nlm_host *host = NULL; + unsigned long now = jiffies; + + if (nsm != NULL) + atomic_inc(&nsm->sm_count); + else { + host = NULL; + nsm = nsm_get_handle(ni->sap, ni->salen, + ni->hostname, ni->hostname_len); + if (unlikely(nsm == NULL)) { + dprintk("lockd: %s failed; no nsm handle\n", + __func__); + goto out; + } + } + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (unlikely(host == NULL)) { + dprintk("lockd: %s failed; no memory\n", __func__); + nsm_release(nsm); + goto out; + } + + memcpy(nlm_addr(host), ni->sap, ni->salen); + host->h_addrlen = ni->salen; + rpc_set_port(nlm_addr(host), 0); + host->h_srcaddrlen = 0; + + host->h_rpcclnt = NULL; + host->h_name = nsm->sm_name; + host->h_version = ni->version; + host->h_proto = ni->protocol; + host->h_reclaiming = 0; + host->h_server = ni->server; + host->h_noresvport = ni->noresvport; + host->h_inuse = 0; + init_waitqueue_head(&host->h_gracewait); + init_rwsem(&host->h_rwsem); + host->h_state = 0; + host->h_nsmstate = 0; + host->h_pidcount = 0; + atomic_set(&host->h_count, 1); + mutex_init(&host->h_mutex); + host->h_nextrebind = now + NLM_HOST_REBIND; + host->h_expires = now + NLM_HOST_EXPIRE; + INIT_LIST_HEAD(&host->h_lockowners); + spin_lock_init(&host->h_lock); + INIT_LIST_HEAD(&host->h_granted); + INIT_LIST_HEAD(&host->h_reclaim); + host->h_nsmhandle = nsm; + host->h_addrbuf = nsm->sm_addrbuf; + +out: + return host; +} + +/* + * Destroy an nlm_host and free associated resources + * + * Caller must hold nlm_host_mutex. + */ +static void nlm_destroy_host_locked(struct nlm_host *host) +{ + struct rpc_clnt *clnt; + + dprintk("lockd: destroy host %s\n", host->h_name); + + BUG_ON(!list_empty(&host->h_lockowners)); + BUG_ON(atomic_read(&host->h_count)); + + hlist_del_init(&host->h_hash); + + nsm_unmonitor(host); + nsm_release(host->h_nsmhandle); + + clnt = host->h_rpcclnt; + if (clnt != NULL) + rpc_shutdown_client(clnt); + kfree(host); + + nrhosts--; +} + +/** + * nlmclnt_lookup_host - Find an NLM host handle matching a remote server + * @sap: network address of server + * @salen: length of server address + * @protocol: transport protocol to use + * @version: NLM protocol version + * @hostname: '\0'-terminated hostname of server + * @noresvport: 1 if non-privileged port should be used + * + * Returns an nlm_host structure that matches the passed-in + * [server address, transport protocol, NLM version, server hostname]. + * If one doesn't already exist in the host cache, a new handle is + * created and returned. 
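+ *
+ * Minimal caller sketch (the address and server name are placeholders,
+ * error handling is elided):
+ *
+ *	host = nlmclnt_lookup_host(sap, salen, IPPROTO_TCP, 4,
+ *				   "server.example.com", 0);
+ *	if (host != NULL) {
+ *		   ... issue NLM calls against host ...
+ *		nlmclnt_release_host(host);
+ *	}
+ *
+ * The hash chain is picked by nlm_hash_address(), and dropping the last
+ * reference via nlmclnt_release_host() destroys the host immediately.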
+ */ +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, + const char *hostname, + int noresvport) +{ + struct nlm_lookup_host_info ni = { + .server = 0, + .sap = sap, + .salen = salen, + .protocol = protocol, + .version = version, + .hostname = hostname, + .hostname_len = strlen(hostname), + .noresvport = noresvport, + }; + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + struct nsm_handle *nsm = NULL; + + dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, + (hostname ? hostname : ""), version, + (protocol == IPPROTO_UDP ? "udp" : "tcp")); + + mutex_lock(&nlm_host_mutex); + + chain = &nlm_client_hosts[nlm_hash_address(sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), sap)) + continue; + + /* Same address. Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != protocol) + continue; + if (host->h_version != version) + continue; + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmclnt_release_host - release client nlm_host + * @host: nlm_host to release + * + */ +void nlmclnt_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release client host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(host->h_server); + + if (atomic_dec_and_test(&host->h_count)) { + BUG_ON(!list_empty(&host->h_lockowners)); + BUG_ON(!list_empty(&host->h_granted)); + BUG_ON(!list_empty(&host->h_reclaim)); + + mutex_lock(&nlm_host_mutex); + nlm_destroy_host_locked(host); + mutex_unlock(&nlm_host_mutex); + } +} + +/** + * nlmsvc_lookup_host - Find an NLM host handle matching a remote client + * @rqstp: incoming NLM request + * @hostname: name of client host + * @hostname_len: length of client hostname + * + * Returns an nlm_host structure that matches the [client address, + * transport protocol, NLM version, client hostname] of the passed-in + * NLM request. If one doesn't already exist in the host cache, a + * new handle is created and returned. + * + * Before possibly creating a new nlm_host, construct a sockaddr + * for a specific source address in case the local system has + * multiple network addresses. The family of the address in + * rq_daddr is guaranteed to be the same as the family of the + * address in rq_addr, so it's safe to use the same family for + * the source address. 
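+ *
+ * Worked example (address illustrative): if an AF_INET request arrived
+ * on the local interface 192.0.2.5, the code below does
+ *
+ *	sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;	(192.0.2.5)
+ *	src_sap = (struct sockaddr *)&sin;			(port left 0)
+ *
+ * so that later callbacks such as NLMPROC_GRANTED are sent from the same
+ * local address the client originally talked to.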
+ */ +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len) +{ + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host = NULL; + struct nsm_handle *nsm = NULL; + struct sockaddr_in sin = { + .sin_family = AF_INET, + }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + }; + struct sockaddr *src_sap; + size_t src_len = rqstp->rq_addrlen; + struct nlm_lookup_host_info ni = { + .server = 1, + .sap = svc_addr(rqstp), + .salen = rqstp->rq_addrlen, + .protocol = rqstp->rq_prot, + .version = rqstp->rq_vers, + .hostname = hostname, + .hostname_len = hostname_len, + }; + + dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + (int)hostname_len, hostname, rqstp->rq_vers, + (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + + mutex_lock(&nlm_host_mutex); + + switch (ni.sap->sa_family) { + case AF_INET: + sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; + src_sap = (struct sockaddr *)&sin; + break; + case AF_INET6: + ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); + src_sap = (struct sockaddr *)&sin6; + break; + default: + dprintk("lockd: %s failed; unrecognized address family\n", + __func__); + goto out; + } + + if (time_after_eq(jiffies, next_gc)) + nlm_gc_hosts(); + + chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) + continue; + + /* Same address. Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != ni.protocol) + continue; + if (host->h_version != ni.version) + continue; + if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap)) + continue; + + /* Move to head of hash chain. */ + hlist_del(&host->h_hash); + hlist_add_head(&host->h_hash, chain); + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + memcpy(nlm_srcaddr(host), src_sap, src_len); + host->h_srcaddrlen = src_len; + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmsvc_release_host - release server nlm_host + * @host: nlm_host to release + * + * Host is destroyed later in nlm_gc_host(). 
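+ *
+ * Typical pairing in a server-side procedure (sketch; svc4proc.c later
+ * in this patch follows this pattern):
+ *
+ *	host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
+ *	   ... handle the request ...
+ *	nlmsvc_release_host(host);
+ *
+ * Unlike nlmclnt_release_host(), the release side only drops the
+ * reference; the cached host lives on until nlm_gc_hosts() finds it
+ * unreferenced, unused and expired.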
+ */ +void nlmsvc_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release server host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(!host->h_server); + atomic_dec(&host->h_count); +} + +/* + * Create the NLM RPC client for an NLM peer + */ +struct rpc_clnt * +nlm_bind_host(struct nlm_host *host) +{ + struct rpc_clnt *clnt; + + dprintk("lockd: nlm_bind_host %s (%s)\n", + host->h_name, host->h_addrbuf); + + /* Lock host handle */ + mutex_lock(&host->h_mutex); + + /* If we've already created an RPC client, check whether + * RPC rebind is required + */ + if ((clnt = host->h_rpcclnt) != NULL) { + if (time_after_eq(jiffies, host->h_nextrebind)) { + rpc_force_rebind(clnt); + host->h_nextrebind = jiffies + NLM_HOST_REBIND; + dprintk("lockd: next rebind in %lu jiffies\n", + host->h_nextrebind - jiffies); + } + } else { + unsigned long increment = nlmsvc_timeout; + struct rpc_timeout timeparms = { + .to_initval = increment, + .to_increment = increment, + .to_maxval = increment * 6UL, + .to_retries = 5U, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = host->h_proto, + .address = nlm_addr(host), + .addrsize = host->h_addrlen, + .timeout = &timeparms, + .servername = host->h_name, + .program = &nlm_program, + .version = host->h_version, + .authflavor = RPC_AUTH_UNIX, + .flags = (RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_AUTOBIND), + }; + + /* + * lockd retries server side blocks automatically so we want + * those to be soft RPC calls. Client side calls need to be + * hard RPC tasks. + */ + if (!host->h_server) + args.flags |= RPC_CLNT_CREATE_HARDRTRY; + if (host->h_noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (host->h_srcaddrlen) + args.saddress = nlm_srcaddr(host); + + clnt = rpc_create(&args); + if (!IS_ERR(clnt)) + host->h_rpcclnt = clnt; + else { + printk("lockd: couldn't create RPC handle for %s\n", host->h_name); + clnt = NULL; + } + } + + mutex_unlock(&host->h_mutex); + return clnt; +} + +/* + * Force a portmap lookup of the remote lockd port + */ +void +nlm_rebind_host(struct nlm_host *host) +{ + dprintk("lockd: rebind host %s\n", host->h_name); + if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) { + rpc_force_rebind(host->h_rpcclnt); + host->h_nextrebind = jiffies + NLM_HOST_REBIND; + } +} + +/* + * Increment NLM host count + */ +struct nlm_host * nlm_get_host(struct nlm_host *host) +{ + if (host) { + dprintk("lockd: get host %s\n", host->h_name); + atomic_inc(&host->h_count); + host->h_expires = jiffies + NLM_HOST_EXPIRE; + } + return host; +} + +static struct nlm_host *next_host_state(struct hlist_head *cache, + struct nsm_handle *nsm, + const struct nlm_reboot *info) +{ + struct nlm_host *host; + struct hlist_head *chain; + struct hlist_node *pos; + + mutex_lock(&nlm_host_mutex); + for_each_host(host, pos, chain, cache) { + if (host->h_nsmhandle == nsm + && host->h_nsmstate != info->state) { + host->h_nsmstate = info->state; + host->h_state++; + + nlm_get_host(host); + mutex_unlock(&nlm_host_mutex); + return host; + } + } + + mutex_unlock(&nlm_host_mutex); + return NULL; +} + +/** + * nlm_host_rebooted - Release all resources held by rebooted host + * @info: pointer to decoded results of NLM_SM_NOTIFY call + * + * We were notified that the specified host has rebooted. Release + * all resources held by that peer. 
+ */ +void nlm_host_rebooted(const struct nlm_reboot *info) +{ + struct nsm_handle *nsm; + struct nlm_host *host; + + nsm = nsm_reboot_lookup(info); + if (unlikely(nsm == NULL)) + return; + + /* Mark all hosts tied to this NSM state as having rebooted. + * We run the loop repeatedly, because we drop the host table + * lock for this. + * To avoid processing a host several times, we match the nsmstate. + */ + while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) { + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + } + while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) { + nlmclnt_recovery(host); + nlmclnt_release_host(host); + } + + nsm_release(nsm); +} + +/* + * Shut down the hosts module. + * Note that this routine is called only at server shutdown time. + */ +void +nlm_shutdown_hosts(void) +{ + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + + dprintk("lockd: shutting down host module\n"); + mutex_lock(&nlm_host_mutex); + + /* First, make all hosts eligible for gc */ + dprintk("lockd: nuking all hosts...\n"); + for_each_host(host, pos, chain, nlm_server_hosts) { + host->h_expires = jiffies - 1; + if (host->h_rpcclnt) { + rpc_shutdown_client(host->h_rpcclnt); + host->h_rpcclnt = NULL; + } + } + + /* Then, perform a garbage collection pass */ + nlm_gc_hosts(); + mutex_unlock(&nlm_host_mutex); + + /* complain if any hosts are left */ + if (nrhosts != 0) { + printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); + dprintk("lockd: %lu hosts left:\n", nrhosts); + for_each_host(host, pos, chain, nlm_server_hosts) { + dprintk(" %s (cnt %d use %d exp %ld)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires); + } + } +} + +/* + * Garbage collect any unused NLM hosts. + * This GC combines reference counting for async operations with + * mark & sweep for resources held by remote clients. + */ +static void +nlm_gc_hosts(void) +{ + struct hlist_head *chain; + struct hlist_node *pos, *next; + struct nlm_host *host; + + dprintk("lockd: host garbage collection\n"); + for_each_host(host, pos, chain, nlm_server_hosts) + host->h_inuse = 0; + + /* Mark all hosts that hold locks, blocks or shares */ + nlmsvc_mark_resources(); + + for_each_host_safe(host, pos, next, chain, nlm_server_hosts) { + if (atomic_read(&host->h_count) || host->h_inuse + || time_before(jiffies, host->h_expires)) { + dprintk("nlm_gc_hosts skipping %s " + "(cnt %d use %d exp %ld)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires); + continue; + } + nlm_destroy_host_locked(host); + } + + next_gc = jiffies + NLM_HOST_COLLECT; +} diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c new file mode 100644 index 00000000..23d7451b --- /dev/null +++ b/fs/lockd/mon.c @@ -0,0 +1,555 @@ +/* + * linux/fs/lockd/mon.c + * + * The kernel statd client. 
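+ *
+ * Rough picture of the exchange this file drives (the statd side is
+ * user space and outside this patch):
+ *
+ *  1. nsm_monitor() sends NSMPROC_MON to the local rpc.statd, recording
+ *     the peer's mon_name, our NLM callback id and a "priv" cookie.
+ *  2. When that peer reboots, its statd notifies ours, and ours calls
+ *     back into lockd (the SM_NOTIFY callback) carrying the cookie;
+ *     nsm_reboot_lookup() below maps it back to the right nsm_handle.
+ *  3. nsm_unmonitor() sends NSMPROC_UNMON once the peer is forgotten.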
+ * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define NLMDBG_FACILITY NLMDBG_MONITOR +#define NSM_PROGRAM 100024 +#define NSM_VERSION 1 + +enum { + NSMPROC_NULL, + NSMPROC_STAT, + NSMPROC_MON, + NSMPROC_UNMON, + NSMPROC_UNMON_ALL, + NSMPROC_SIMU_CRASH, + NSMPROC_NOTIFY, +}; + +struct nsm_args { + struct nsm_private *priv; + u32 prog; /* RPC callback info */ + u32 vers; + u32 proc; + + char *mon_name; +}; + +struct nsm_res { + u32 status; + u32 state; +}; + +static struct rpc_program nsm_program; +static LIST_HEAD(nsm_handles); +static DEFINE_SPINLOCK(nsm_lock); + +/* + * Local NSM state + */ +u32 __read_mostly nsm_local_state; +int __read_mostly nsm_use_hostnames; + +static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) +{ + return (struct sockaddr *)&nsm->sm_addr; +} + +static struct rpc_clnt *nsm_create(void) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = XPRT_TRANSPORT_UDP, + .address = (struct sockaddr *)&sin, + .addrsize = sizeof(sin), + .servername = "rpc.statd", + .program = &nsm_program, + .version = NSM_VERSION, + .authflavor = RPC_AUTH_NULL, + .flags = RPC_CLNT_CREATE_NOPING, + }; + + return rpc_create(&args); +} + +static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) +{ + struct rpc_clnt *clnt; + int status; + struct nsm_args args = { + .priv = &nsm->sm_priv, + .prog = NLM_PROGRAM, + .vers = 3, + .proc = NLMPROC_NSM_NOTIFY, + .mon_name = nsm->sm_mon_name, + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = res, + }; + + clnt = nsm_create(); + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); + dprintk("lockd: failed to create NSM upcall transport, " + "status=%d\n", status); + goto out; + } + + memset(res, 0, sizeof(*res)); + + msg.rpc_proc = &clnt->cl_procinfo[proc]; + status = rpc_call_sync(clnt, &msg, 0); + if (status < 0) + dprintk("lockd: NSM upcall RPC failed, status=%d\n", + status); + else + status = 0; + rpc_shutdown_client(clnt); + out: + return status; +} + +/** + * nsm_monitor - Notify a peer in case we reboot + * @host: pointer to nlm_host of peer to notify + * + * If this peer is not already monitored, this function sends an + * upcall to the local rpc.statd to record the name/address of + * the peer to notify in case we reboot. + * + * Returns zero if the peer is monitored by the local rpc.statd; + * otherwise a negative errno value is returned. + */ +int nsm_monitor(const struct nlm_host *host) +{ + struct nsm_handle *nsm = host->h_nsmhandle; + struct nsm_res res; + int status; + + dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); + + if (nsm->sm_monitored) + return 0; + + /* + * Choose whether to record the caller_name or IP address of + * this peer in the local rpc.statd's database. + */ + nsm->sm_mon_name = nsm_use_hostnames ? 
nsm->sm_name : nsm->sm_addrbuf; + + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); + if (unlikely(res.status != 0)) + status = -EIO; + if (unlikely(status < 0)) { + printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); + return status; + } + + nsm->sm_monitored = 1; + if (unlikely(nsm_local_state != res.state)) { + nsm_local_state = res.state; + dprintk("lockd: NSM state changed to %d\n", nsm_local_state); + } + return 0; +} + +/** + * nsm_unmonitor - Unregister peer notification + * @host: pointer to nlm_host of peer to stop monitoring + * + * If this peer is monitored, this function sends an upcall to + * tell the local rpc.statd not to send this peer a notification + * when we reboot. + */ +void nsm_unmonitor(const struct nlm_host *host) +{ + struct nsm_handle *nsm = host->h_nsmhandle; + struct nsm_res res; + int status; + + if (atomic_read(&nsm->sm_count) == 1 + && nsm->sm_monitored && !nsm->sm_sticky) { + dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); + + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); + if (res.status != 0) + status = -EIO; + if (status < 0) + printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", + nsm->sm_name); + else + nsm->sm_monitored = 0; + } +} + +static struct nsm_handle *nsm_lookup_hostname(const char *hostname, + const size_t len) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (strlen(nsm->sm_name) == len && + memcmp(nsm->sm_name, hostname, len) == 0) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (rpc_cmp_addr(nsm_addr(nsm), sap)) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (memcmp(nsm->sm_priv.data, priv->data, + sizeof(priv->data)) == 0) + return nsm; + return NULL; +} + +/* + * Construct a unique cookie to match this nsm_handle to this monitored + * host. It is passed to the local rpc.statd via NSMPROC_MON, and + * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these + * requests. + * + * The NSM protocol requires that these cookies be unique while the + * system is running. We prefer a stronger requirement of making them + * unique across reboots. If user space bugs cause a stale cookie to + * be sent to the kernel, it could cause the wrong host to lose its + * lock state if cookies were not unique across reboots. + * + * The cookies are exposed only to local user space via loopback. They + * do not appear on the physical network. If we want greater security + * for some reason, nsm_init_private() could perform a one-way hash to + * obscure the contents of the cookie. 
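+ *
+ * Round trip of the cookie, as implemented in this patch (sketch):
+ *
+ *	nsm_get_handle()  -> nsm_init_private()	   cookie generated
+ *	nsm_monitor()     -> encode_priv()	   sent in NSMPROC_MON
+ *	   ... peer reboots, rpc.statd calls back into lockd ...
+ *	nlm_host_rebooted() -> nsm_reboot_lookup() -> nsm_lookup_priv()
+ *						   cookie matched again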
+ */ +static void nsm_init_private(struct nsm_handle *nsm) +{ + u64 *p = (u64 *)&nsm->sm_priv.data; + struct timespec ts; + s64 ns; + + ktime_get_ts(&ts); + ns = timespec_to_ns(&ts); + put_unaligned(ns, p); + put_unaligned((unsigned long)nsm, p + 1); +} + +static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *new; + + new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); + if (unlikely(new == NULL)) + return NULL; + + atomic_set(&new->sm_count, 1); + new->sm_name = (char *)(new + 1); + memcpy(nsm_addr(new), sap, salen); + new->sm_addrlen = salen; + nsm_init_private(new); + + if (rpc_ntop(nsm_addr(new), new->sm_addrbuf, + sizeof(new->sm_addrbuf)) == 0) + (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf), + "unsupported address family"); + memcpy(new->sm_name, hostname, hostname_len); + new->sm_name[hostname_len] = '\0'; + + return new; +} + +/** + * nsm_get_handle - Find or create a cached nsm_handle + * @sap: pointer to socket address of handle to find + * @salen: length of socket address + * @hostname: pointer to C string containing hostname to find + * @hostname_len: length of C string + * + * Behavior is modulated by the global nsm_use_hostnames variable. + * + * Returns a cached nsm_handle after bumping its ref count, or + * returns a fresh nsm_handle if a handle that matches @sap and/or + * @hostname cannot be found in the handle cache. Returns NULL if + * an error occurs. + */ +struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, + const size_t salen, const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *cached, *new = NULL; + + if (hostname && memchr(hostname, '/', hostname_len) != NULL) { + if (printk_ratelimit()) { + printk(KERN_WARNING "Invalid hostname \"%.*s\" " + "in NFS lock request\n", + (int)hostname_len, hostname); + } + return NULL; + } + +retry: + spin_lock(&nsm_lock); + + if (nsm_use_hostnames && hostname != NULL) + cached = nsm_lookup_hostname(hostname, hostname_len); + else + cached = nsm_lookup_addr(sap); + + if (cached != NULL) { + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + kfree(new); + dprintk("lockd: found nsm_handle for %s (%s), " + "cnt %d\n", cached->sm_name, + cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; + } + + if (new != NULL) { + list_add(&new->sm_link, &nsm_handles); + spin_unlock(&nsm_lock); + dprintk("lockd: created nsm_handle for %s (%s)\n", + new->sm_name, new->sm_addrbuf); + return new; + } + + spin_unlock(&nsm_lock); + + new = nsm_create_handle(sap, salen, hostname, hostname_len); + if (unlikely(new == NULL)) + return NULL; + goto retry; +} + +/** + * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle + * @info: pointer to NLMPROC_SM_NOTIFY arguments + * + * Returns a matching nsm_handle if found in the nsm cache. The returned + * nsm_handle's reference count is bumped. Otherwise returns NULL if some + * error occurred. 
+ */ +struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) +{ + struct nsm_handle *cached; + + spin_lock(&nsm_lock); + + cached = nsm_lookup_priv(&info->priv); + if (unlikely(cached == NULL)) { + spin_unlock(&nsm_lock); + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + info->len, info->mon); + return cached; + } + + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + + dprintk("lockd: host %s (%s) rebooted, cnt %d\n", + cached->sm_name, cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; +} + +/** + * nsm_release - Release an NSM handle + * @nsm: pointer to handle to be released + * + */ +void nsm_release(struct nsm_handle *nsm) +{ + if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { + list_del(&nsm->sm_link); + spin_unlock(&nsm_lock); + dprintk("lockd: destroyed nsm_handle for %s (%s)\n", + nsm->sm_name, nsm->sm_addrbuf); + kfree(nsm); + } +} + +/* + * XDR functions for NSM. + * + * See http://www.opengroup.org/ for details on the Network + * Status Monitor wire protocol. + */ + +static void encode_nsm_string(struct xdr_stream *xdr, const char *string) +{ + const u32 len = strlen(string); + __be32 *p; + + BUG_ON(len > SM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + len); + xdr_encode_opaque(p, string, len); +} + +/* + * "mon_name" specifies the host to be monitored. + */ +static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + encode_nsm_string(xdr, argp->mon_name); +} + +/* + * The "my_id" argument specifies the hostname and RPC procedure + * to be called when the status manager receives notification + * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" + * has changed. + */ +static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + __be32 *p; + + encode_nsm_string(xdr, utsname()->nodename); + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(argp->prog); + *p++ = cpu_to_be32(argp->vers); + *p = cpu_to_be32(argp->proc); +} + +/* + * The "mon_id" argument specifies the non-private arguments + * of an NSMPROC_MON or NSMPROC_UNMON call. + */ +static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + encode_mon_name(xdr, argp); + encode_my_id(xdr, argp); +} + +/* + * The "priv" argument may contain private information required + * by the NSMPROC_MON call. This information will be supplied in the + * NLMPROC_SM_NOTIFY call. 
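+ *
+ * For reference, the complete NSMPROC_MON argument as built by
+ * nsm_xdr_enc_mon() below is roughly:
+ *
+ *	struct mon {
+ *		string mon_name<SM_MAXSTRLEN>;		peer to monitor
+ *		string my_name<SM_MAXSTRLEN>;		utsname()->nodename
+ *		int my_prog, my_vers, my_proc;		NLM callback id
+ *		opaque priv[SM_PRIV_SIZE];		this cookie
+ *	};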
+ */ +static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, SM_PRIV_SIZE); + xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); +} + +static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) +{ + encode_mon_id(xdr, argp); + encode_priv(xdr, argp); +} + +static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) +{ + encode_mon_id(xdr, argp); +} + +static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + return -EIO; + resp->status = be32_to_cpup(p++); + resp->state = be32_to_cpup(p); + + dprintk("lockd: %s status %d state %d\n", + __func__, resp->status, resp->state); + return 0; +} + +static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + resp->state = be32_to_cpup(p); + + dprintk("lockd: %s state %d\n", __func__, resp->state); + return 0; +} + +#define SM_my_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) +#define SM_my_id_sz (SM_my_name_sz+3) +#define SM_mon_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) +#define SM_mon_id_sz (SM_mon_name_sz+SM_my_id_sz) +#define SM_priv_sz (XDR_QUADLEN(SM_PRIV_SIZE)) +#define SM_mon_sz (SM_mon_id_sz+SM_priv_sz) +#define SM_monres_sz 2 +#define SM_unmonres_sz 1 + +static struct rpc_procinfo nsm_procedures[] = { +[NSMPROC_MON] = { + .p_proc = NSMPROC_MON, + .p_encode = (kxdreproc_t)nsm_xdr_enc_mon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res, + .p_arglen = SM_mon_sz, + .p_replen = SM_monres_sz, + .p_statidx = NSMPROC_MON, + .p_name = "MONITOR", + }, +[NSMPROC_UNMON] = { + .p_proc = NSMPROC_UNMON, + .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat, + .p_arglen = SM_mon_id_sz, + .p_replen = SM_unmonres_sz, + .p_statidx = NSMPROC_UNMON, + .p_name = "UNMONITOR", + }, +}; + +static struct rpc_version nsm_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nsm_procedures), + .procs = nsm_procedures +}; + +static struct rpc_version * nsm_version[] = { + [1] = &nsm_version1, +}; + +static struct rpc_stat nsm_stats; + +static struct rpc_program nsm_program = { + .name = "statd", + .number = NSM_PROGRAM, + .nrvers = ARRAY_SIZE(nsm_version), + .version = nsm_version, + .stats = &nsm_stats +}; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c new file mode 100644 index 00000000..1743064c --- /dev/null +++ b/fs/lockd/svc.c @@ -0,0 +1,568 @@ +/* + * linux/fs/lockd/svc.c + * + * This is the central lockd service. + * + * FIXME: Separate the lockd NFS server functionality from the lockd NFS + * client functionality. Oh why didn't Sun create two separate + * services in the first place? 
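+ *
+ * Usage elsewhere in the tree (sketch): both the NFS client and nfsd
+ * call lockd_up() when they first need locking and lockd_down() when
+ * they are done, so the kernel thread exists only while it has users.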
+ * + * Authors: Olaf Kirch (okir@monad.swb.de) + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_SVC +#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) +#define ALLOWED_SIGS (sigmask(SIGKILL)) + +static struct svc_program nlmsvc_program; + +struct nlmsvc_binding * nlmsvc_ops; +EXPORT_SYMBOL_GPL(nlmsvc_ops); + +static DEFINE_MUTEX(nlmsvc_mutex); +static unsigned int nlmsvc_users; +static struct task_struct *nlmsvc_task; +static struct svc_rqst *nlmsvc_rqst; +unsigned long nlmsvc_timeout; + +/* + * These can be set at insmod time (useful for NFS as root filesystem), + * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 + */ +static unsigned long nlm_grace_period; +static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; +static int nlm_udpport, nlm_tcpport; + +/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ +static unsigned int nlm_max_connections = 1024; + +/* + * Constants needed for the sysctl interface. + */ +static const unsigned long nlm_grace_period_min = 0; +static const unsigned long nlm_grace_period_max = 240; +static const unsigned long nlm_timeout_min = 3; +static const unsigned long nlm_timeout_max = 20; +static const int nlm_port_min = 0, nlm_port_max = 65535; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header * nlm_sysctl_table; +#endif + +static unsigned long get_lockd_grace_period(void) +{ + /* Note: nlm_timeout should always be nonzero */ + if (nlm_grace_period) + return roundup(nlm_grace_period, nlm_timeout) * HZ; + else + return nlm_timeout * 5 * HZ; +} + +static struct lock_manager lockd_manager = { +}; + +static void grace_ender(struct work_struct *not_used) +{ + locks_end_grace(&lockd_manager); +} + +static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); + +static void set_grace_period(void) +{ + unsigned long grace_period = get_lockd_grace_period(); + + locks_start_grace(&lockd_manager); + cancel_delayed_work_sync(&grace_period_end); + schedule_delayed_work(&grace_period_end, grace_period); +} + +static void restart_grace(void) +{ + if (nlmsvc_ops) { + cancel_delayed_work_sync(&grace_period_end); + locks_end_grace(&lockd_manager); + nlmsvc_invalidate_all(); + set_grace_period(); + } +} + +/* + * This is the lockd kernel thread + */ +static int +lockd(void *vrqstp) +{ + int err = 0, preverr = 0; + struct svc_rqst *rqstp = vrqstp; + + /* try_to_freeze() is called from svc_recv() */ + set_freezable(); + + /* Allow SIGKILL to tell lockd to drop all of its locks */ + allow_signal(SIGKILL); + + dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); + + if (!nlm_timeout) + nlm_timeout = LOCKD_DFLT_TIMEO; + nlmsvc_timeout = nlm_timeout * HZ; + + set_grace_period(); + + /* + * The main request loop. We don't terminate until the last + * NFS mount or NFS daemon has gone away. + */ + while (!kthread_should_stop()) { + long timeout = MAX_SCHEDULE_TIMEOUT; + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + + /* update sv_maxconn if it has changed */ + rqstp->rq_server->sv_maxconn = nlm_max_connections; + + if (signalled()) { + flush_signals(current); + restart_grace(); + continue; + } + + timeout = nlmsvc_retry_blocked(); + + /* + * Find a socket with data available and call its + * recvfrom routine. 
+ */ + err = svc_recv(rqstp, timeout); + if (err == -EAGAIN || err == -EINTR) { + preverr = err; + continue; + } + if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, err); + preverr = err; + } + schedule_timeout_interruptible(HZ); + continue; + } + preverr = err; + + dprintk("lockd: request from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + + svc_process(rqstp); + } + flush_signals(current); + cancel_delayed_work_sync(&grace_period_end); + locks_end_grace(&lockd_manager); + if (nlmsvc_ops) + nlmsvc_invalidate_all(); + nlm_shutdown_hosts(); + return 0; +} + +static int create_lockd_listener(struct svc_serv *serv, const char *name, + const int family, const unsigned short port) +{ + struct svc_xprt *xprt; + + xprt = svc_find_xprt(serv, name, family, 0); + if (xprt == NULL) + return svc_create_xprt(serv, name, &init_net, family, port, + SVC_SOCK_DEFAULTS); + svc_xprt_put(xprt); + return 0; +} + +static int create_lockd_family(struct svc_serv *serv, const int family) +{ + int err; + + err = create_lockd_listener(serv, "udp", family, nlm_udpport); + if (err < 0) + return err; + + return create_lockd_listener(serv, "tcp", family, nlm_tcpport); +} + +/* + * Ensure there are active UDP and TCP listeners for lockd. + * + * Even if we have only TCP NFS mounts and/or TCP NFSDs, some + * local services (such as rpc.statd) still require UDP, and + * some NFS servers do not yet support NLM over TCP. + * + * Returns zero if all listeners are available; otherwise a + * negative errno value is returned. + */ +static int make_socks(struct svc_serv *serv) +{ + static int warned; + int err; + + err = create_lockd_family(serv, PF_INET); + if (err < 0) + goto out_err; + + err = create_lockd_family(serv, PF_INET6); + if (err < 0 && err != -EAFNOSUPPORT) + goto out_err; + + warned = 0; + return 0; + +out_err: + if (warned++ == 0) + printk(KERN_WARNING + "lockd_up: makesock failed, error=%d\n", err); + return err; +} + +/* + * Bring up the lockd process if it's not already up. + */ +int lockd_up(void) +{ + struct svc_serv *serv; + int error = 0; + + mutex_lock(&nlmsvc_mutex); + /* + * Check whether we're already up and running. + */ + if (nlmsvc_rqst) + goto out; + + /* + * Sanity check: if there's no pid, + * we should be the first user ... + */ + if (nlmsvc_users) + printk(KERN_WARNING + "lockd_up: no pid, %d users??\n", nlmsvc_users); + + error = -ENOMEM; + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + if (!serv) { + printk(KERN_WARNING "lockd_up: create service failed\n"); + goto out; + } + + error = make_socks(serv); + if (error < 0) + goto destroy_and_out; + + /* + * Create the kernel thread and wait for it to start. + */ + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(nlmsvc_rqst)) { + error = PTR_ERR(nlmsvc_rqst); + nlmsvc_rqst = NULL; + printk(KERN_WARNING + "lockd_up: svc_rqst allocation failed, error=%d\n", + error); + goto destroy_and_out; + } + + svc_sock_update_bufs(serv); + serv->sv_maxconn = nlm_max_connections; + + nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); + if (IS_ERR(nlmsvc_task)) { + error = PTR_ERR(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); + nlmsvc_task = NULL; + nlmsvc_rqst = NULL; + printk(KERN_WARNING + "lockd_up: kthread_run failed, error=%d\n", error); + goto destroy_and_out; + } + + /* + * Note: svc_serv structures have an initial use count of 1, + * so we exit through here on both success and failure. 
+ */ +destroy_and_out: + svc_destroy(serv); +out: + if (!error) + nlmsvc_users++; + mutex_unlock(&nlmsvc_mutex); + return error; +} +EXPORT_SYMBOL_GPL(lockd_up); + +/* + * Decrement the user count and bring down lockd if we're the last. + */ +void +lockd_down(void) +{ + mutex_lock(&nlmsvc_mutex); + if (nlmsvc_users) { + if (--nlmsvc_users) + goto out; + } else { + printk(KERN_ERR "lockd_down: no users! task=%p\n", + nlmsvc_task); + BUG(); + } + + if (!nlmsvc_task) { + printk(KERN_ERR "lockd_down: no lockd running.\n"); + BUG(); + } + kthread_stop(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); + nlmsvc_task = NULL; + nlmsvc_rqst = NULL; +out: + mutex_unlock(&nlmsvc_mutex); +} +EXPORT_SYMBOL_GPL(lockd_down); + +#ifdef CONFIG_SYSCTL + +/* + * Sysctl parameters (same as module parameters, different interface). + */ + +static ctl_table nlm_sysctls[] = { + { + .procname = "nlm_grace_period", + .data = &nlm_grace_period, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (unsigned long *) &nlm_grace_period_min, + .extra2 = (unsigned long *) &nlm_grace_period_max, + }, + { + .procname = "nlm_timeout", + .data = &nlm_timeout, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (unsigned long *) &nlm_timeout_min, + .extra2 = (unsigned long *) &nlm_timeout_max, + }, + { + .procname = "nlm_udpport", + .data = &nlm_udpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *) &nlm_port_min, + .extra2 = (int *) &nlm_port_max, + }, + { + .procname = "nlm_tcpport", + .data = &nlm_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *) &nlm_port_min, + .extra2 = (int *) &nlm_port_max, + }, + { + .procname = "nsm_use_hostnames", + .data = &nsm_use_hostnames, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nsm_local_state", + .data = &nsm_local_state, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static ctl_table nlm_sysctl_dir[] = { + { + .procname = "nfs", + .mode = 0555, + .child = nlm_sysctls, + }, + { } +}; + +static ctl_table nlm_sysctl_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = nlm_sysctl_dir, + }, + { } +}; + +#endif /* CONFIG_SYSCTL */ + +/* + * Module (and sysfs) parameters. 
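+ *
+ * The param_set_min_max() macro below generates a range-checked setter
+ * for each parameter.  For example, param_set_min_max(port, int,
+ * simple_strtol, 0, 65535) expands to roughly:
+ *
+ *	static int param_set_port(const char *val, struct kernel_param *kp)
+ *	{
+ *		char *endp;
+ *		int num = simple_strtol(val, &endp, 0);
+ *		if (endp == val || *endp || num < 0 || num > 65535)
+ *			return -EINVAL;
+ *		*((int *) kp->arg) = num;
+ *		return 0;
+ *	}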
+ */ + +#define param_set_min_max(name, type, which_strtol, min, max) \ +static int param_set_##name(const char *val, struct kernel_param *kp) \ +{ \ + char *endp; \ + __typeof__(type) num = which_strtol(val, &endp, 0); \ + if (endp == val || *endp || num < (min) || num > (max)) \ + return -EINVAL; \ + *((type *) kp->arg) = num; \ + return 0; \ +} + +static inline int is_callback(u32 proc) +{ + return proc == NLMPROC_GRANTED + || proc == NLMPROC_GRANTED_MSG + || proc == NLMPROC_TEST_RES + || proc == NLMPROC_LOCK_RES + || proc == NLMPROC_CANCEL_RES + || proc == NLMPROC_UNLOCK_RES + || proc == NLMPROC_NSM_NOTIFY; +} + + +static int lockd_authenticate(struct svc_rqst *rqstp) +{ + rqstp->rq_client = NULL; + switch (rqstp->rq_authop->flavour) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + if (rqstp->rq_proc == 0) + return SVC_OK; + if (is_callback(rqstp->rq_proc)) { + /* Leave it to individual procedures to + * call nlmsvc_lookup_host(rqstp) + */ + return SVC_OK; + } + return svc_set_client(rqstp); + } + return SVC_DENIED; +} + + +param_set_min_max(port, int, simple_strtol, 0, 65535) +param_set_min_max(grace_period, unsigned long, simple_strtoul, + nlm_grace_period_min, nlm_grace_period_max) +param_set_min_max(timeout, unsigned long, simple_strtoul, + nlm_timeout_min, nlm_timeout_max) + +MODULE_AUTHOR("Olaf Kirch "); +MODULE_DESCRIPTION("NFS file locking service version " LOCKD_VERSION "."); +MODULE_LICENSE("GPL"); + +module_param_call(nlm_grace_period, param_set_grace_period, param_get_ulong, + &nlm_grace_period, 0644); +module_param_call(nlm_timeout, param_set_timeout, param_get_ulong, + &nlm_timeout, 0644); +module_param_call(nlm_udpport, param_set_port, param_get_int, + &nlm_udpport, 0644); +module_param_call(nlm_tcpport, param_set_port, param_get_int, + &nlm_tcpport, 0644); +module_param(nsm_use_hostnames, bool, 0644); +module_param(nlm_max_connections, uint, 0644); + +/* + * Initialising and terminating the module. + */ + +static int __init init_nlm(void) +{ +#ifdef CONFIG_SYSCTL + nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); + return nlm_sysctl_table ? 
0 : -ENOMEM; +#else + return 0; +#endif +} + +static void __exit exit_nlm(void) +{ + /* FIXME: delete all NLM clients */ + nlm_shutdown_hosts(); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nlm_sysctl_table); +#endif +} + +module_init(init_nlm); +module_exit(exit_nlm); + +/* + * Define NLM program and procedures + */ +static struct svc_version nlmsvc_version1 = { + .vs_vers = 1, + .vs_nproc = 17, + .vs_proc = nlmsvc_procedures, + .vs_xdrsize = NLMSVC_XDRSIZE, +}; +static struct svc_version nlmsvc_version3 = { + .vs_vers = 3, + .vs_nproc = 24, + .vs_proc = nlmsvc_procedures, + .vs_xdrsize = NLMSVC_XDRSIZE, +}; +#ifdef CONFIG_LOCKD_V4 +static struct svc_version nlmsvc_version4 = { + .vs_vers = 4, + .vs_nproc = 24, + .vs_proc = nlmsvc_procedures4, + .vs_xdrsize = NLMSVC_XDRSIZE, +}; +#endif +static struct svc_version * nlmsvc_version[] = { + [1] = &nlmsvc_version1, + [3] = &nlmsvc_version3, +#ifdef CONFIG_LOCKD_V4 + [4] = &nlmsvc_version4, +#endif +}; + +static struct svc_stat nlmsvc_stats; + +#define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) +static struct svc_program nlmsvc_program = { + .pg_prog = NLM_PROGRAM, /* program number */ + .pg_nvers = NLM_NRVERS, /* number of entries in nlmsvc_version */ + .pg_vers = nlmsvc_version, /* version table */ + .pg_name = "lockd", /* service name */ + .pg_class = "nfsd", /* share authentication with nfsd */ + .pg_stats = &nlmsvc_stats, /* stats table */ + .pg_authenticate = &lockd_authenticate /* export authentication */ +}; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c new file mode 100644 index 00000000..9a41fdc1 --- /dev/null +++ b/fs/lockd/svc4proc.c @@ -0,0 +1,503 @@ +/* + * linux/fs/lockd/svc4proc.c + * + * Lockd server procedures. We don't implement the NLM_*_RES + * procedures because we don't use the async procedures. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT + +/* + * Obtain client and file from arguments + */ +static __be32 +nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_host **hostp, struct nlm_file **filp) +{ + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + struct nlm_lock *lock = &argp->lock; + __be32 error = 0; + + /* nfsd callbacks must have been installed for this procedure */ + if (!nlmsvc_ops) + return nlm_lck_denied_nolocks; + + /* Obtain host handle */ + if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len)) + || (argp->monitor && nsm_monitor(host) < 0)) + goto no_locks; + *hostp = host; + + /* Obtain file pointer. Not used by FREE_ALL call. 
*/ + if (filp != NULL) { + if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) + goto no_locks; + *filp = file; + + /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_file = file->f_file; + lock->fl.fl_owner = (fl_owner_t) host; + lock->fl.fl_lmops = &nlmsvc_lock_operations; + } + + return 0; + +no_locks: + nlmsvc_release_host(host); + if (error) + return error; + return nlm_lck_denied_nolocks; +} + +/* + * NULL: Test for presence of service + */ +static __be32 +nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + dprintk("lockd: NULL called\n"); + return rpc_success; +} + +/* + * TEST: Check for conflicting lock + */ +static __be32 +nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: TEST4 called\n"); + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now check for conflicting locks */ + resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: LOCK called\n"); + + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + +#if 0 + /* If supplied state doesn't match current state, we assume it's + * an old request that time-warped somehow. Any error return would + * do in this case because it's irrelevant anyway. + * + * NB: We don't retrieve the remote host's state yet. + */ + if (host->h_nsmstate && host->h_nsmstate != argp->state) { + resp->status = nlm_lck_denied_nolocks; + } else +#endif + + /* Now try to lock the file */ + resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->block, &argp->cookie, + argp->reclaim); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: CANCEL called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Try to cancel request. 
*/ + resp->status = nlmsvc_cancel_blocked(file, &argp->lock); + + dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNLOCK: release a lock + */ +static __be32 +nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNLOCK called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to remove the lock */ + resp->status = nlmsvc_unlock(file, &argp->lock); + + dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * GRANTED: A server calls us to tell that a process' lock request + * was granted + */ +static __be32 +nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + resp->cookie = argp->cookie; + + dprintk("lockd: GRANTED called\n"); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); + return rpc_success; +} + +/* + * This is the generic lockd callback for async RPC calls + */ +static void nlm4svc_callback_exit(struct rpc_task *task, void *data) +{ + dprintk("lockd: %5u callback returned %d\n", task->tk_pid, + -task->tk_status); +} + +static void nlm4svc_callback_release(void *data) +{ + nlmsvc_release_call(data); +} + +static const struct rpc_call_ops nlm4svc_callback_ops = { + .rpc_call_done = nlm4svc_callback_exit, + .rpc_release = nlm4svc_callback_release, +}; + +/* + * `Async' versions of the above service routines. They aren't really, + * because we send the callback before the reply proper. I hope this + * doesn't break any clients. 
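+ *
+ * Flow for one of them (sketch): a TEST_MSG request carries no result
+ * in its RPC reply; the handler runs the ordinary TEST logic and then
+ * sends the result back in a separate, asynchronous NLM call:
+ *
+ *	nlm4svc_proc_test_msg()
+ *	  -> nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp,
+ *			      nlm4svc_proc_test)
+ *	       -> nlm_async_reply(call, NLMPROC_TEST_RES, ...)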
+ */ +static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, + __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) +{ + struct nlm_host *host; + struct nlm_rqst *call; + __be32 stat; + + host = nlmsvc_lookup_host(rqstp, + argp->lock.caller, + argp->lock.len); + if (host == NULL) + return rpc_system_err; + + call = nlm_alloc_call(host); + if (call == NULL) + return rpc_system_err; + + stat = func(rqstp, argp, &call->a_res); + if (stat != 0) { + nlmsvc_release_call(call); + return stat; + } + + call->a_flags = RPC_TASK_ASYNC; + if (nlm_async_reply(call, proc, &nlm4svc_callback_ops) < 0) + return rpc_system_err; + return rpc_success; +} + +static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: TEST_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test); +} + +static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: LOCK_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock); +} + +static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: CANCEL_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel); +} + +static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: UNLOCK_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock); +} + +static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: GRANTED_MSG called\n"); + return nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlm4svc_proc_granted); +} + +/* + * SHARE: create a DOS share or alter existing share. + */ +static __be32 +nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: SHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace() && !argp->reclaim) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to create the share */ + resp->status = nlmsvc_share_file(host, file, argp); + + dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNSHARE: Release a DOS share. + */ +static __be32 +nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNSHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; + + /* Now try to lock the file */ + resp->status = nlmsvc_unshare_file(host, file, argp); + + dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * NM_LOCK: Create an unmonitored lock + */ +static __be32 +nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + dprintk("lockd: NM_LOCK called\n"); + + argp->monitor = 0; /* just clean the monitor flag */ + return nlm4svc_proc_lock(rqstp, argp, resp); +} + +/* + * FREE_ALL: Release all locks and shares held by client + */ +static __be32 +nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + struct nlm_host *host; + + /* Obtain client */ + if (nlm4svc_retrieve_args(rqstp, argp, &host, NULL)) + return rpc_success; + + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + return rpc_success; +} + +/* + * SM_NOTIFY: private callback from statd (not part of official NLM proto) + */ +static __be32 +nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, + void *resp) +{ + dprintk("lockd: SM_NOTIFY called\n"); + + if (!nlm_privileged_requester(rqstp)) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return rpc_system_err; + } + + nlm_host_rebooted(argp); + return rpc_success; +} + +/* + * client sent a GRANTED_RES, let's remove the associated block + */ +static __be32 +nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(&argp->cookie, argp->status); + return rpc_success; +} + + +/* + * NLM Server procedures. 
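+ *
+ * Each PROC() entry below expands to one svc_procedure.  For example,
+ * PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg) becomes,
+ * roughly:
+ *
+ *	{ .pc_func       = (svc_procfunc) nlm4svc_proc_test,
+ *	  .pc_decode     = (kxdrproc_t) nlm4svc_decode_testargs,
+ *	  .pc_encode     = (kxdrproc_t) nlm4svc_encode_testres,
+ *	  .pc_release    = NULL,
+ *	  .pc_argsize    = sizeof(struct nlm_args),
+ *	  .pc_ressize    = sizeof(struct nlm_res),
+ *	  .pc_xdrressize = Ck+St+2+No+Rg }	maximum reply size in words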
+ */ + +#define nlm4svc_encode_norep nlm4svc_encode_void +#define nlm4svc_decode_norep nlm4svc_decode_void +#define nlm4svc_decode_testres nlm4svc_decode_void +#define nlm4svc_decode_lockres nlm4svc_decode_void +#define nlm4svc_decode_unlockres nlm4svc_decode_void +#define nlm4svc_decode_cancelres nlm4svc_decode_void +#define nlm4svc_decode_grantedres nlm4svc_decode_void + +#define nlm4svc_proc_none nlm4svc_proc_null +#define nlm4svc_proc_test_res nlm4svc_proc_null +#define nlm4svc_proc_lock_res nlm4svc_proc_null +#define nlm4svc_proc_cancel_res nlm4svc_proc_null +#define nlm4svc_proc_unlock_res nlm4svc_proc_null + +struct nlm_void { int dummy; }; + +#define PROC(name, xargt, xrest, argt, rest, respsize) \ + { .pc_func = (svc_procfunc) nlm4svc_proc_##name, \ + .pc_decode = (kxdrproc_t) nlm4svc_decode_##xargt, \ + .pc_encode = (kxdrproc_t) nlm4svc_encode_##xrest, \ + .pc_release = NULL, \ + .pc_argsize = sizeof(struct nlm_##argt), \ + .pc_ressize = sizeof(struct nlm_##rest), \ + .pc_xdrressize = respsize, \ + } +#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ +#define No (1+1024/4) /* netobj */ +#define St 1 /* status */ +#define Rg 4 /* range (offset + length) */ +struct svc_procedure nlmsvc_procedures4[] = { + PROC(null, void, void, void, void, 1), + PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg), + PROC(lock, lockargs, res, args, res, Ck+St), + PROC(cancel, cancargs, res, args, res, Ck+St), + PROC(unlock, unlockargs, res, args, res, Ck+St), + PROC(granted, testargs, res, args, res, Ck+St), + PROC(test_msg, testargs, norep, args, void, 1), + PROC(lock_msg, lockargs, norep, args, void, 1), + PROC(cancel_msg, cancargs, norep, args, void, 1), + PROC(unlock_msg, unlockargs, norep, args, void, 1), + PROC(granted_msg, testargs, norep, args, void, 1), + PROC(test_res, testres, norep, res, void, 1), + PROC(lock_res, lockres, norep, res, void, 1), + PROC(cancel_res, cancelres, norep, res, void, 1), + PROC(unlock_res, unlockres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), + /* statd callback */ + PROC(sm_notify, reboot, void, reboot, void, 1), + PROC(none, void, void, void, void, 0), + PROC(none, void, void, void, void, 0), + PROC(none, void, void, void, void, 0), + PROC(share, shareargs, shareres, args, res, Ck+St+1), + PROC(unshare, shareargs, shareres, args, res, Ck+St+1), + PROC(nm_lock, lockargs, res, args, res, Ck+St), + PROC(free_all, notify, void, args, void, 1), + +}; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c new file mode 100644 index 00000000..6e31695d --- /dev/null +++ b/fs/lockd/svclock.c @@ -0,0 +1,966 @@ +/* + * linux/fs/lockd/svclock.c + * + * Handling of server-side locks, mostly of the blocked variety. + * This is the ugliest part of lockd because we tread on very thin ice. + * GRANT and CANCEL calls may get stuck, meet in mid-flight, etc. + * IMNSHO introducing the grant callback into the NLM protocol was one + * of the worst ideas Sun ever had. Except maybe for the idea of doing + * NFS file locking at all. + * + * I'm trying hard to avoid race conditions by protecting most accesses + * to a file's list of blocked locks through a semaphore. The global + * list of blocked locks is not protected in this fashion however. + * Therefore, some functions (such as the RPC callback for the async grant + * call) move blocked locks towards the head of the list *while some other + * process might be traversing it*. 
This should not be a problem in + * practice, because this will only cause functions traversing the list + * to visit some blocks twice. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_SVCLOCK + +#ifdef CONFIG_LOCKD_V4 +#define nlm_deadlock nlm4_deadlock +#else +#define nlm_deadlock nlm_lck_denied +#endif + +static void nlmsvc_release_block(struct nlm_block *block); +static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); +static void nlmsvc_remove_block(struct nlm_block *block); + +static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); +static void nlmsvc_freegrantargs(struct nlm_rqst *call); +static const struct rpc_call_ops nlmsvc_grant_ops; +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie); + +/* + * The list of blocked locks to retry + */ +static LIST_HEAD(nlm_blocked); +static DEFINE_SPINLOCK(nlm_blocked_lock); + +/* + * Insert a blocked lock into the global list + */ +static void +nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when) +{ + struct nlm_block *b; + struct list_head *pos; + + dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when); + if (list_empty(&block->b_list)) { + kref_get(&block->b_count); + } else { + list_del_init(&block->b_list); + } + + pos = &nlm_blocked; + if (when != NLM_NEVER) { + if ((when += jiffies) == NLM_NEVER) + when ++; + list_for_each(pos, &nlm_blocked) { + b = list_entry(pos, struct nlm_block, b_list); + if (time_after(b->b_when,when) || b->b_when == NLM_NEVER) + break; + } + /* On normal exit from the loop, pos == &nlm_blocked, + * so we will be adding to the end of the list - good + */ + } + + list_add_tail(&block->b_list, pos); + block->b_when = when; +} + +static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when) +{ + spin_lock(&nlm_blocked_lock); + nlmsvc_insert_block_locked(block, when); + spin_unlock(&nlm_blocked_lock); +} + +/* + * Remove a block from the global list + */ +static inline void +nlmsvc_remove_block(struct nlm_block *block) +{ + if (!list_empty(&block->b_list)) { + spin_lock(&nlm_blocked_lock); + list_del_init(&block->b_list); + spin_unlock(&nlm_blocked_lock); + nlmsvc_release_block(block); + } +} + +/* + * Find a block for a given lock + */ +static struct nlm_block * +nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) +{ + struct nlm_block *block; + struct file_lock *fl; + + dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n", + file, lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end, lock->fl.fl_type); + list_for_each_entry(block, &nlm_blocked, b_list) { + fl = &block->b_call->a_args.lock.fl; + dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", + block->b_file, fl->fl_pid, + (long long)fl->fl_start, + (long long)fl->fl_end, fl->fl_type, + nlmdbg_cookie2a(&block->b_call->a_args.cookie)); + if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) { + kref_get(&block->b_count); + return block; + } + } + + return NULL; +} + +static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b) +{ + if (a->len != b->len) + return 0; + if (memcmp(a->data, b->data, a->len)) + return 0; + return 1; +} + +/* + * Find a block with a given NLM cookie. 
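+ *
+ * This is how an asynchronous GRANTED_RES from a client finds its way
+ * back to the right block (sketch):
+ *
+ *	nlm4svc_proc_granted_res()
+ *	  -> nlmsvc_grant_reply(&argp->cookie, argp->status)
+ *	       -> nlmsvc_find_block(cookie)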
+ */ +static inline struct nlm_block * +nlmsvc_find_block(struct nlm_cookie *cookie) +{ + struct nlm_block *block; + + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)) + goto found; + } + + return NULL; + +found: + dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block); + kref_get(&block->b_count); + return block; +} + +/* + * Create a block and initialize it. + * + * Note: we explicitly set the cookie of the grant reply to that of + * the blocked lock request. The spec explicitly mentions that the client + * should _not_ rely on the callback containing the same cookie as the + * request, but (as I found out later) that's because some implementations + * do just this. Never mind the standards comittees, they support our + * logging industries. + * + * 10 years later: I hope we can safely ignore these old and broken + * clients by now. Let's fix this so we can uniquely identify an incoming + * GRANTED_RES message by cookie, without having to rely on the client's IP + * address. --okir + */ +static struct nlm_block * +nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, + struct nlm_file *file, struct nlm_lock *lock, + struct nlm_cookie *cookie) +{ + struct nlm_block *block; + struct nlm_rqst *call = NULL; + + nlm_get_host(host); + call = nlm_alloc_call(host); + if (call == NULL) + return NULL; + + /* Allocate memory for block, and initialize arguments */ + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (block == NULL) + goto failed; + kref_init(&block->b_count); + INIT_LIST_HEAD(&block->b_list); + INIT_LIST_HEAD(&block->b_flist); + + if (!nlmsvc_setgrantargs(call, lock)) + goto failed_free; + + /* Set notifier function for VFS, and init args */ + call->a_args.lock.fl.fl_flags |= FL_SLEEP; + call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; + nlmclnt_next_cookie(&call->a_args.cookie); + + dprintk("lockd: created block %p...\n", block); + + /* Create and initialize the block */ + block->b_daemon = rqstp->rq_server; + block->b_host = host; + block->b_file = file; + block->b_fl = NULL; + file->f_count++; + + /* Add to file's list of blocks */ + list_add(&block->b_flist, &file->f_blocks); + + /* Set up RPC arguments for callback */ + block->b_call = call; + call->a_flags = RPC_TASK_ASYNC; + call->a_block = block; + + return block; + +failed_free: + kfree(block); +failed: + nlmsvc_release_call(call); + return NULL; +} + +/* + * Delete a block. + * It is the caller's responsibility to check whether the file + * can be closed hereafter. 
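+ * Unlinking cancels the pending VFS wait via posix_unblock_lock()
+ * and takes the block off the global nlm_blocked list; the block
+ * itself is freed only when its refcount drops to zero in
+ * nlmsvc_release_block().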
+ */ +static int nlmsvc_unlink_block(struct nlm_block *block) +{ + int status; + dprintk("lockd: unlinking block %p...\n", block); + + /* Remove block from list */ + status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl); + nlmsvc_remove_block(block); + return status; +} + +static void nlmsvc_free_block(struct kref *kref) +{ + struct nlm_block *block = container_of(kref, struct nlm_block, b_count); + struct nlm_file *file = block->b_file; + + dprintk("lockd: freeing block %p...\n", block); + + /* Remove block from file's list of blocks */ + mutex_lock(&file->f_mutex); + list_del_init(&block->b_flist); + mutex_unlock(&file->f_mutex); + + nlmsvc_freegrantargs(block->b_call); + nlmsvc_release_call(block->b_call); + nlm_release_file(block->b_file); + kfree(block->b_fl); + kfree(block); +} + +static void nlmsvc_release_block(struct nlm_block *block) +{ + if (block != NULL) + kref_put(&block->b_count, nlmsvc_free_block); +} + +/* + * Loop over all blocks and delete blocks held by + * a matching host. + */ +void nlmsvc_traverse_blocks(struct nlm_host *host, + struct nlm_file *file, + nlm_host_match_fn_t match) +{ + struct nlm_block *block, *next; + +restart: + mutex_lock(&file->f_mutex); + list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) { + if (!match(block->b_host, host)) + continue; + /* Do not destroy blocks that are not on + * the global retry list - why? */ + if (list_empty(&block->b_list)) + continue; + kref_get(&block->b_count); + mutex_unlock(&file->f_mutex); + nlmsvc_unlink_block(block); + nlmsvc_release_block(block); + goto restart; + } + mutex_unlock(&file->f_mutex); +} + +/* + * Initialize arguments for GRANTED call. The nlm_rqst structure + * has been cleared already. + */ +static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock) +{ + locks_copy_lock(&call->a_args.lock.fl, &lock->fl); + memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); + call->a_args.lock.caller = utsname()->nodename; + call->a_args.lock.oh.len = lock->oh.len; + + /* set default data area */ + call->a_args.lock.oh.data = call->a_owner; + call->a_args.lock.svid = lock->fl.fl_pid; + + if (lock->oh.len > NLMCLNT_OHSIZE) { + void *data = kmalloc(lock->oh.len, GFP_KERNEL); + if (!data) + return 0; + call->a_args.lock.oh.data = (u8 *) data; + } + + memcpy(call->a_args.lock.oh.data, lock->oh.data, lock->oh.len); + return 1; +} + +static void nlmsvc_freegrantargs(struct nlm_rqst *call) +{ + if (call->a_args.lock.oh.data != call->a_owner) + kfree(call->a_args.lock.oh.data); + + locks_release_private(&call->a_args.lock.fl); +} + +/* + * Deferred lock request handling for non-blocking lock + */ +static __be32 +nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) +{ + __be32 status = nlm_lck_denied_nolocks; + + block->b_flags |= B_QUEUED; + + nlmsvc_insert_block(block, NLM_TIMEOUT); + + block->b_cache_req = &rqstp->rq_chandle; + if (rqstp->rq_chandle.defer) { + block->b_deferred_req = + rqstp->rq_chandle.defer(block->b_cache_req); + if (block->b_deferred_req != NULL) + status = nlm_drop_reply; + } + dprintk("lockd: nlmsvc_defer_lock_rqst block %p flags %d status %d\n", + block, block->b_flags, ntohl(status)); + + return status; +} + +/* + * Attempt to establish a lock, and if it can't be granted, block it + * if required. 
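+ * Returns nlm_granted on success, nlm_lck_blocked once the request
+ * has been queued as a blocking lock, nlm_drop_reply when the reply
+ * is deferred, or a denial status (nlm_lck_denied, nlm_deadlock,
+ * nlm_lck_denied_nolocks, nlm_lck_denied_grace_period) otherwise.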
+ */ +__be32 +nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, + struct nlm_host *host, struct nlm_lock *lock, int wait, + struct nlm_cookie *cookie, int reclaim) +{ + struct nlm_block *block = NULL; + int error; + __be32 ret; + + dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, + lock->fl.fl_type, lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end, + wait); + + /* Lock file against concurrent access */ + mutex_lock(&file->f_mutex); + /* Get existing block (in case client is busy-waiting) + * or create new block + */ + block = nlmsvc_lookup_block(file, lock); + if (block == NULL) { + block = nlmsvc_create_block(rqstp, host, file, lock, cookie); + ret = nlm_lck_denied_nolocks; + if (block == NULL) + goto out; + lock = &block->b_call->a_args.lock; + } else + lock->fl.fl_flags &= ~FL_SLEEP; + + if (block->b_flags & B_QUEUED) { + dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n", + block, block->b_flags); + if (block->b_granted) { + nlmsvc_unlink_block(block); + ret = nlm_granted; + goto out; + } + if (block->b_flags & B_TIMED_OUT) { + nlmsvc_unlink_block(block); + ret = nlm_lck_denied; + goto out; + } + ret = nlm_drop_reply; + goto out; + } + + if (locks_in_grace() && !reclaim) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (reclaim && !locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } + + if (!wait) + lock->fl.fl_flags &= ~FL_SLEEP; + error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + lock->fl.fl_flags &= ~FL_SLEEP; + + dprintk("lockd: vfs_lock_file returned %d\n", error); + switch (error) { + case 0: + ret = nlm_granted; + goto out; + case -EAGAIN: + /* + * If this is a blocking request for an + * already pending lock request then we need + * to put it back on lockd's block list + */ + if (wait) + break; + ret = nlm_lck_denied; + goto out; + case FILE_LOCK_DEFERRED: + if (wait) + break; + /* Filesystem lock operation is in progress + Add it to the queue waiting for callback */ + ret = nlmsvc_defer_lock_rqst(rqstp, block); + goto out; + case -EDEADLK: + ret = nlm_deadlock; + goto out; + default: /* includes ENOLCK */ + ret = nlm_lck_denied_nolocks; + goto out; + } + + ret = nlm_lck_blocked; + + /* Append to list of blocked */ + nlmsvc_insert_block(block, NLM_NEVER); +out: + mutex_unlock(&file->f_mutex); + nlmsvc_release_block(block); + dprintk("lockd: nlmsvc_lock returned %u\n", ret); + return ret; +} + +/* + * Test for presence of a conflicting lock. 
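+ * On conflict the conflicting lock's range, type and svid are copied
+ * into @conflock and nlm_lck_denied is returned; otherwise the result
+ * is nlm_granted, or nlm_drop_reply if the filesystem deferred the
+ * test.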
+ */ +__be32 +nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, + struct nlm_host *host, struct nlm_lock *lock, + struct nlm_lock *conflock, struct nlm_cookie *cookie) +{ + struct nlm_block *block = NULL; + int error; + __be32 ret; + + dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, + lock->fl.fl_type, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + + /* Get existing block (in case client is busy-waiting) */ + block = nlmsvc_lookup_block(file, lock); + + if (block == NULL) { + struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + + if (conf == NULL) + return nlm_granted; + block = nlmsvc_create_block(rqstp, host, file, lock, cookie); + if (block == NULL) { + kfree(conf); + return nlm_granted; + } + block->b_fl = conf; + } + if (block->b_flags & B_QUEUED) { + dprintk("lockd: nlmsvc_testlock deferred block %p flags %d fl %p\n", + block, block->b_flags, block->b_fl); + if (block->b_flags & B_TIMED_OUT) { + nlmsvc_unlink_block(block); + ret = nlm_lck_denied; + goto out; + } + if (block->b_flags & B_GOT_CALLBACK) { + nlmsvc_unlink_block(block); + if (block->b_fl != NULL + && block->b_fl->fl_type != F_UNLCK) { + lock->fl = *block->b_fl; + goto conf_lock; + } else { + ret = nlm_granted; + goto out; + } + } + ret = nlm_drop_reply; + goto out; + } + + if (locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } + error = vfs_test_lock(file->f_file, &lock->fl); + if (error == FILE_LOCK_DEFERRED) { + ret = nlmsvc_defer_lock_rqst(rqstp, block); + goto out; + } + if (error) { + ret = nlm_lck_denied_nolocks; + goto out; + } + if (lock->fl.fl_type == F_UNLCK) { + ret = nlm_granted; + goto out; + } + +conf_lock: + dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n", + lock->fl.fl_type, (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + conflock->caller = "somehost"; /* FIXME */ + conflock->len = strlen(conflock->caller); + conflock->oh.len = 0; /* don't return OH info */ + conflock->svid = lock->fl.fl_pid; + conflock->fl.fl_type = lock->fl.fl_type; + conflock->fl.fl_start = lock->fl.fl_start; + conflock->fl.fl_end = lock->fl.fl_end; + ret = nlm_lck_denied; +out: + if (block) + nlmsvc_release_block(block); + return ret; +} + +/* + * Remove a lock. + * This implies a CANCEL call: We send a GRANT_MSG, the client replies + * with a GRANT_RES call which gets lost, and calls UNLOCK immediately + * afterwards. In this case the block will still be there, and hence + * must be removed. + */ +__be32 +nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) +{ + int error; + + dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, + lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + + /* First, cancel any lock that might be there */ + nlmsvc_cancel_blocked(file, lock); + + lock->fl.fl_type = F_UNLCK; + error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + + return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; +} + +/* + * Cancel a previously blocked request. + * + * A cancel request always overrides any grant that may currently + * be in progress. + * The calling procedure must check whether the file can be closed. 
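+ * The cancel is also passed down to the filesystem through
+ * vfs_cancel_lock() before the block is unlinked, so that a lock
+ * request deferred by the filesystem is withdrawn as well.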
+ */ +__be32 +nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) +{ + struct nlm_block *block; + int status = 0; + + dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", + file->f_file->f_path.dentry->d_inode->i_sb->s_id, + file->f_file->f_path.dentry->d_inode->i_ino, + lock->fl.fl_pid, + (long long)lock->fl.fl_start, + (long long)lock->fl.fl_end); + + if (locks_in_grace()) + return nlm_lck_denied_grace_period; + + mutex_lock(&file->f_mutex); + block = nlmsvc_lookup_block(file, lock); + mutex_unlock(&file->f_mutex); + if (block != NULL) { + vfs_cancel_lock(block->b_file->f_file, + &block->b_call->a_args.lock.fl); + status = nlmsvc_unlink_block(block); + nlmsvc_release_block(block); + } + return status ? nlm_lck_denied : nlm_granted; +} + +/* + * This is a callback from the filesystem for VFS file lock requests. + * It will be used if fl_grant is defined and the filesystem can not + * respond to the request immediately. + * For GETLK request it will copy the reply to the nlm_block. + * For SETLK or SETLKW request it will get the local posix lock. + * In all cases it will move the block to the head of nlm_blocked q where + * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the + * deferred rpc for GETLK and SETLK. + */ +static void +nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf, + int result) +{ + block->b_flags |= B_GOT_CALLBACK; + if (result == 0) + block->b_granted = 1; + else + block->b_flags |= B_TIMED_OUT; + if (conf) { + if (block->b_fl) + __locks_copy_lock(block->b_fl, conf); + } +} + +static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, + int result) +{ + struct nlm_block *block; + int rc = -ENOENT; + + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { + dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", + block, block->b_flags); + if (block->b_flags & B_QUEUED) { + if (block->b_flags & B_TIMED_OUT) { + rc = -ENOLCK; + break; + } + nlmsvc_update_deferred_block(block, conf, result); + } else if (result == 0) + block->b_granted = 1; + + nlmsvc_insert_block_locked(block, 0); + svc_wake_up(block->b_daemon); + rc = 0; + break; + } + } + spin_unlock(&nlm_blocked_lock); + if (rc == -ENOENT) + printk(KERN_WARNING "lockd: grant for unknown block\n"); + return rc; +} + +/* + * Unblock a blocked lock request. This is a callback invoked from the + * VFS layer when a lock on which we blocked is removed. + * + * This function doesn't grant the blocked lock instantly, but rather moves + * the block to the head of nlm_blocked where it can be picked up by lockd. 
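+ * "Moving to the head" is done by re-inserting the block with a
+ * timeout of 0 and calling svc_wake_up(), so nlmsvc_retry_blocked()
+ * handles it on the next pass of the lockd main loop.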
+ */ +static void +nlmsvc_notify_blocked(struct file_lock *fl) +{ + struct nlm_block *block; + + dprintk("lockd: VFS unblock notification for block %p\n", fl); + spin_lock(&nlm_blocked_lock); + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { + nlmsvc_insert_block_locked(block, 0); + spin_unlock(&nlm_blocked_lock); + svc_wake_up(block->b_daemon); + return; + } + } + spin_unlock(&nlm_blocked_lock); + printk(KERN_WARNING "lockd: notification for unknown block!\n"); +} + +static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) +{ + return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; +} + +const struct lock_manager_operations nlmsvc_lock_operations = { + .fl_compare_owner = nlmsvc_same_owner, + .fl_notify = nlmsvc_notify_blocked, + .fl_grant = nlmsvc_grant_deferred, +}; + +/* + * Try to claim a lock that was previously blocked. + * + * Note that we use both the RPC_GRANTED_MSG call _and_ an async + * RPC thread when notifying the client. This seems like overkill... + * Here's why: + * - we don't want to use a synchronous RPC thread, otherwise + * we might find ourselves hanging on a dead portmapper. + * - Some lockd implementations (e.g. HP) don't react to + * RPC_GRANTED calls; they seem to insist on RPC_GRANTED_MSG calls. + */ +static void +nlmsvc_grant_blocked(struct nlm_block *block) +{ + struct nlm_file *file = block->b_file; + struct nlm_lock *lock = &block->b_call->a_args.lock; + int error; + + dprintk("lockd: grant blocked lock %p\n", block); + + kref_get(&block->b_count); + + /* Unlink block request from list */ + nlmsvc_unlink_block(block); + + /* If b_granted is true this means we've been here before. + * Just retry the grant callback, possibly refreshing the RPC + * binding */ + if (block->b_granted) { + nlm_rebind_host(block->b_host); + goto callback; + } + + /* Try the lock operation again */ + lock->fl.fl_flags |= FL_SLEEP; + error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + lock->fl.fl_flags &= ~FL_SLEEP; + + switch (error) { + case 0: + break; + case FILE_LOCK_DEFERRED: + dprintk("lockd: lock still blocked error %d\n", error); + nlmsvc_insert_block(block, NLM_NEVER); + nlmsvc_release_block(block); + return; + default: + printk(KERN_WARNING "lockd: unexpected error %d in %s!\n", + -error, __func__); + nlmsvc_insert_block(block, 10 * HZ); + nlmsvc_release_block(block); + return; + } + +callback: + /* Lock was granted by VFS. */ + dprintk("lockd: GRANTing blocked lock.\n"); + block->b_granted = 1; + + /* keep block on the list, but don't reattempt until the RPC + * completes or the submission fails + */ + nlmsvc_insert_block(block, NLM_NEVER); + + /* Call the client -- use a soft RPC task since nlmsvc_retry_blocked + * will queue up a new one if this one times out + */ + error = nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG, + &nlmsvc_grant_ops); + + /* RPC submission failed, wait a bit and retry */ + if (error < 0) + nlmsvc_insert_block(block, 10 * HZ); +} + +/* + * This is the callback from the RPC layer when the NLM_GRANTED_MSG + * RPC call has succeeded or timed out. + * Like all RPC callbacks, it is invoked by the rpciod process, so it + * better not sleep. Therefore, we put the blocked lock on the nlm_blocked + * chain once more in order to have it removed by lockd itself (which can + * then sleep on the file semaphore without disrupting e.g. the nfs client). 
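+ * The requeue interval depends on the RPC outcome: after an RPC
+ * error the block is retried in 10 seconds, after a successful send
+ * we allow the client up to 60 seconds to answer with GRANTED_RES
+ * before the grant is retransmitted.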
+ */ +static void nlmsvc_grant_callback(struct rpc_task *task, void *data) +{ + struct nlm_rqst *call = data; + struct nlm_block *block = call->a_block; + unsigned long timeout; + + dprintk("lockd: GRANT_MSG RPC callback\n"); + + spin_lock(&nlm_blocked_lock); + /* if the block is not on a list at this point then it has + * been invalidated. Don't try to requeue it. + * + * FIXME: it's possible that the block is removed from the list + * after this check but before the nlmsvc_insert_block. In that + * case it will be added back. Perhaps we need better locking + * for nlm_blocked? + */ + if (list_empty(&block->b_list)) + goto out; + + /* Technically, we should down the file semaphore here. Since we + * move the block towards the head of the queue only, no harm + * can be done, though. */ + if (task->tk_status < 0) { + /* RPC error: Re-insert for retransmission */ + timeout = 10 * HZ; + } else { + /* Call was successful, now wait for client callback */ + timeout = 60 * HZ; + } + nlmsvc_insert_block_locked(block, timeout); + svc_wake_up(block->b_daemon); +out: + spin_unlock(&nlm_blocked_lock); +} + +/* + * FIXME: nlmsvc_release_block() grabs a mutex. This is not allowed for an + * .rpc_release rpc_call_op + */ +static void nlmsvc_grant_release(void *data) +{ + struct nlm_rqst *call = data; + nlmsvc_release_block(call->a_block); +} + +static const struct rpc_call_ops nlmsvc_grant_ops = { + .rpc_call_done = nlmsvc_grant_callback, + .rpc_release = nlmsvc_grant_release, +}; + +/* + * We received a GRANT_RES callback. Try to find the corresponding + * block. + */ +void +nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) +{ + struct nlm_block *block; + + dprintk("grant_reply: looking for cookie %x, s=%d \n", + *(unsigned int *)(cookie->data), status); + if (!(block = nlmsvc_find_block(cookie))) + return; + + if (block) { + if (status == nlm_lck_denied_grace_period) { + /* Try again in a couple of seconds */ + nlmsvc_insert_block(block, 10 * HZ); + } else { + /* Lock is now held by client, or has been rejected. + * In both cases, the block should be removed. */ + nlmsvc_unlink_block(block); + } + } + nlmsvc_release_block(block); +} + +/* Helper function to handle retry of a deferred block. + * If it is a blocking lock, call grant_blocked. + * For a non-blocking lock or test lock, revisit the request. + */ +static void +retry_deferred_block(struct nlm_block *block) +{ + if (!(block->b_flags & B_GOT_CALLBACK)) + block->b_flags |= B_TIMED_OUT; + nlmsvc_insert_block(block, NLM_TIMEOUT); + dprintk("revisit block %p flags %d\n", block, block->b_flags); + if (block->b_deferred_req) { + block->b_deferred_req->revisit(block->b_deferred_req, 0); + block->b_deferred_req = NULL; + } +} + +/* + * Retry all blocked locks that have been notified. This is where lockd + * picks up locks that can be granted, or grant notifications that must + * be retransmitted. 
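+ * Returns the number of jiffies until the next block becomes due,
+ * or MAX_SCHEDULE_TIMEOUT if nothing is pending, so the caller knows
+ * how long it may sleep.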
+ */ +unsigned long +nlmsvc_retry_blocked(void) +{ + unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct nlm_block *block; + + while (!list_empty(&nlm_blocked) && !kthread_should_stop()) { + block = list_entry(nlm_blocked.next, struct nlm_block, b_list); + + if (block->b_when == NLM_NEVER) + break; + if (time_after(block->b_when, jiffies)) { + timeout = block->b_when - jiffies; + break; + } + + dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n", + block, block->b_when); + if (block->b_flags & B_QUEUED) { + dprintk("nlmsvc_retry_blocked delete block (%p, granted=%d, flags=%d)\n", + block, block->b_granted, block->b_flags); + retry_deferred_block(block); + } else + nlmsvc_grant_blocked(block); + } + + return timeout; +} + +#ifdef RPC_DEBUG +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + /* + * We can get away with a static buffer because we're only + * called with BKL held. + */ + static char buf[2*NLM_MAXCOOKIELEN+1]; + unsigned int i, len = sizeof(buf); + char *p = buf; + + len--; /* allow for trailing \0 */ + if (len < 3) + return "???"; + for (i = 0 ; i < cookie->len ; i++) { + if (len < 2) { + strcpy(p-3, "..."); + break; + } + sprintf(p, "%02x", cookie->data[i]); + p += 2; + len -= 2; + } + *p = '\0'; + + return buf; +} +#endif diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c new file mode 100644 index 00000000..d27aab11 --- /dev/null +++ b/fs/lockd/svcproc.c @@ -0,0 +1,544 @@ +/* + * linux/fs/lockd/svcproc.c + * + * Lockd server procedures. We don't implement the NLM_*_RES + * procedures because we don't use the async procedures. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_CLIENT + +#ifdef CONFIG_LOCKD_V4 +static __be32 +cast_to_nlm(__be32 status, u32 vers) +{ + /* Note: status is assumed to be in network byte order !!! */ + if (vers != 4){ + switch (status) { + case nlm_granted: + case nlm_lck_denied: + case nlm_lck_denied_nolocks: + case nlm_lck_blocked: + case nlm_lck_denied_grace_period: + case nlm_drop_reply: + break; + case nlm4_deadlock: + status = nlm_lck_denied; + break; + default: + status = nlm_lck_denied_nolocks; + } + } + + return (status); +} +#define cast_status(status) (cast_to_nlm(status, rqstp->rq_vers)) +#else +#define cast_status(status) (status) +#endif + +/* + * Obtain client and file from arguments + */ +static __be32 +nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_host **hostp, struct nlm_file **filp) +{ + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + struct nlm_lock *lock = &argp->lock; + __be32 error = 0; + + /* nfsd callbacks must have been installed for this procedure */ + if (!nlmsvc_ops) + return nlm_lck_denied_nolocks; + + /* Obtain host handle */ + if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len)) + || (argp->monitor && nsm_monitor(host) < 0)) + goto no_locks; + *hostp = host; + + /* Obtain file pointer. Not used by FREE_ALL call. 
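+ * When a file is looked up we also fill in fl_file, fl_owner and
+ * fl_lmops of the client's file_lock, so the VFS lock code can call
+ * back into lockd (nlmsvc_lock_operations) for blocking locks.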
*/ + if (filp != NULL) { + if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) + goto no_locks; + *filp = file; + + /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_file = file->f_file; + lock->fl.fl_owner = (fl_owner_t) host; + lock->fl.fl_lmops = &nlmsvc_lock_operations; + } + + return 0; + +no_locks: + nlmsvc_release_host(host); + if (error) + return error; + return nlm_lck_denied_nolocks; +} + +/* + * NULL: Test for presence of service + */ +static __be32 +nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + dprintk("lockd: NULL called\n"); + return rpc_success; +} + +/* + * TEST: Check for conflicting lock + */ +static __be32 +nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: TEST called\n"); + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now check for conflicting locks */ + resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: TEST status %d vers %d\n", + ntohl(resp->status), rqstp->rq_vers); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + __be32 rc = rpc_success; + + dprintk("lockd: LOCK called\n"); + + resp->cookie = argp->cookie; + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + +#if 0 + /* If supplied state doesn't match current state, we assume it's + * an old request that time-warped somehow. Any error return would + * do in this case because it's irrelevant anyway. + * + * NB: We don't retrieve the remote host's state yet. + */ + if (host->h_nsmstate && host->h_nsmstate != argp->state) { + resp->status = nlm_lck_denied_nolocks; + } else +#endif + + /* Now try to lock the file */ + resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->block, &argp->cookie, + argp->reclaim)); + if (resp->status == nlm_drop_reply) + rc = rpc_drop_reply; + else + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); + + nlmsvc_release_host(host); + nlm_release_file(file); + return rc; +} + +static __be32 +nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: CANCEL called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Try to cancel request. 
*/ + resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); + + dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNLOCK: release a lock + */ +static __be32 +nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNLOCK called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to remove the lock */ + resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); + + dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * GRANTED: A server calls us to tell that a process' lock request + * was granted + */ +static __be32 +nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + resp->cookie = argp->cookie; + + dprintk("lockd: GRANTED called\n"); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); + return rpc_success; +} + +/* + * This is the generic lockd callback for async RPC calls + */ +static void nlmsvc_callback_exit(struct rpc_task *task, void *data) +{ + dprintk("lockd: %5u callback returned %d\n", task->tk_pid, + -task->tk_status); +} + +void nlmsvc_release_call(struct nlm_rqst *call) +{ + if (!atomic_dec_and_test(&call->a_count)) + return; + nlmsvc_release_host(call->a_host); + kfree(call); +} + +static void nlmsvc_callback_release(void *data) +{ + nlmsvc_release_call(data); +} + +static const struct rpc_call_ops nlmsvc_callback_ops = { + .rpc_call_done = nlmsvc_callback_exit, + .rpc_release = nlmsvc_callback_release, +}; + +/* + * `Async' versions of the above service routines. They aren't really, + * because we send the callback before the reply proper. I hope this + * doesn't break any clients. 
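+ * The helper below looks up the host, allocates an nlm_rqst, runs
+ * the real procedure into call->a_res, and then transmits that
+ * result as an asynchronous *_RES call via nlm_async_reply().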
+ */ +static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, + __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) +{ + struct nlm_host *host; + struct nlm_rqst *call; + __be32 stat; + + host = nlmsvc_lookup_host(rqstp, + argp->lock.caller, + argp->lock.len); + if (host == NULL) + return rpc_system_err; + + call = nlm_alloc_call(host); + if (call == NULL) + return rpc_system_err; + + stat = func(rqstp, argp, &call->a_res); + if (stat != 0) { + nlmsvc_release_call(call); + return stat; + } + + call->a_flags = RPC_TASK_ASYNC; + if (nlm_async_reply(call, proc, &nlmsvc_callback_ops) < 0) + return rpc_system_err; + return rpc_success; +} + +static __be32 nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: TEST_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test); +} + +static __be32 nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: LOCK_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock); +} + +static __be32 nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: CANCEL_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel); +} + +static __be32 +nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: UNLOCK_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock); +} + +static __be32 +nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + dprintk("lockd: GRANTED_MSG called\n"); + return nlmsvc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlmsvc_proc_granted); +} + +/* + * SHARE: create a DOS share or alter existing share. + */ +static __be32 +nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: SHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept new lock requests during grace period */ + if (locks_in_grace() && !argp->reclaim) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + + /* Now try to create the share */ + resp->status = cast_status(nlmsvc_share_file(host, file, argp)); + + dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * UNSHARE: Release a DOS share. + */ +static __be32 +nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + struct nlm_host *host; + struct nlm_file *file; + + dprintk("lockd: UNSHARE called\n"); + + resp->cookie = argp->cookie; + + /* Don't accept requests during grace period */ + if (locks_in_grace()) { + resp->status = nlm_lck_denied_grace_period; + return rpc_success; + } + + /* Obtain client and file */ + if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) + return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; + + /* Now try to unshare the file */ + resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); + + dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); + nlmsvc_release_host(host); + nlm_release_file(file); + return rpc_success; +} + +/* + * NM_LOCK: Create an unmonitored lock + */ +static __be32 +nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, + struct nlm_res *resp) +{ + dprintk("lockd: NM_LOCK called\n"); + + argp->monitor = 0; /* just clean the monitor flag */ + return nlmsvc_proc_lock(rqstp, argp, resp); +} + +/* + * FREE_ALL: Release all locks and shares held by client + */ +static __be32 +nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, + void *resp) +{ + struct nlm_host *host; + + /* Obtain client */ + if (nlmsvc_retrieve_args(rqstp, argp, &host, NULL)) + return rpc_success; + + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); + return rpc_success; +} + +/* + * SM_NOTIFY: private callback from statd (not part of official NLM proto) + */ +static __be32 +nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, + void *resp) +{ + dprintk("lockd: SM_NOTIFY called\n"); + + if (!nlm_privileged_requester(rqstp)) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return rpc_system_err; + } + + nlm_host_rebooted(argp); + return rpc_success; +} + +/* + * client sent a GRANTED_RES, let's remove the associated block + */ +static __be32 +nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(&argp->cookie, argp->status); + return rpc_success; +} + +/* + * NLM Server procedures. 
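+ *
+ * The table below maps each NLM procedure number to its handler and
+ * XDR routines.  The Ck/St/No/Rg constants estimate the reply size
+ * in 32-bit XDR words (pc_xdrressize) so the RPC layer can reserve
+ * enough buffer space for the response.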
+ */ + +#define nlmsvc_encode_norep nlmsvc_encode_void +#define nlmsvc_decode_norep nlmsvc_decode_void +#define nlmsvc_decode_testres nlmsvc_decode_void +#define nlmsvc_decode_lockres nlmsvc_decode_void +#define nlmsvc_decode_unlockres nlmsvc_decode_void +#define nlmsvc_decode_cancelres nlmsvc_decode_void +#define nlmsvc_decode_grantedres nlmsvc_decode_void + +#define nlmsvc_proc_none nlmsvc_proc_null +#define nlmsvc_proc_test_res nlmsvc_proc_null +#define nlmsvc_proc_lock_res nlmsvc_proc_null +#define nlmsvc_proc_cancel_res nlmsvc_proc_null +#define nlmsvc_proc_unlock_res nlmsvc_proc_null + +struct nlm_void { int dummy; }; + +#define PROC(name, xargt, xrest, argt, rest, respsize) \ + { .pc_func = (svc_procfunc) nlmsvc_proc_##name, \ + .pc_decode = (kxdrproc_t) nlmsvc_decode_##xargt, \ + .pc_encode = (kxdrproc_t) nlmsvc_encode_##xrest, \ + .pc_release = NULL, \ + .pc_argsize = sizeof(struct nlm_##argt), \ + .pc_ressize = sizeof(struct nlm_##rest), \ + .pc_xdrressize = respsize, \ + } + +#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ +#define St 1 /* status */ +#define No (1+1024/4) /* Net Obj */ +#define Rg 2 /* range - offset + size */ + +struct svc_procedure nlmsvc_procedures[] = { + PROC(null, void, void, void, void, 1), + PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg), + PROC(lock, lockargs, res, args, res, Ck+St), + PROC(cancel, cancargs, res, args, res, Ck+St), + PROC(unlock, unlockargs, res, args, res, Ck+St), + PROC(granted, testargs, res, args, res, Ck+St), + PROC(test_msg, testargs, norep, args, void, 1), + PROC(lock_msg, lockargs, norep, args, void, 1), + PROC(cancel_msg, cancargs, norep, args, void, 1), + PROC(unlock_msg, unlockargs, norep, args, void, 1), + PROC(granted_msg, testargs, norep, args, void, 1), + PROC(test_res, testres, norep, res, void, 1), + PROC(lock_res, lockres, norep, res, void, 1), + PROC(cancel_res, cancelres, norep, res, void, 1), + PROC(unlock_res, unlockres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), + /* statd callback */ + PROC(sm_notify, reboot, void, reboot, void, 1), + PROC(none, void, void, void, void, 1), + PROC(none, void, void, void, void, 1), + PROC(none, void, void, void, void, 1), + PROC(share, shareargs, shareres, args, res, Ck+St+1), + PROC(unshare, shareargs, shareres, args, res, Ck+St+1), + PROC(nm_lock, lockargs, res, args, res, Ck+St), + PROC(free_all, notify, void, args, void, 0), + +}; diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c new file mode 100644 index 00000000..b0ae0700 --- /dev/null +++ b/fs/lockd/svcshare.c @@ -0,0 +1,106 @@ +/* + * linux/fs/lockd/svcshare.c + * + * Management of DOS shares. 
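+ * Shares created by NLM SHARE requests are kept on a simple singly
+ * linked list hanging off each nlm_file; a new share is refused when
+ * its access mode collides with an existing share's deny mode (and
+ * vice versa).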
+ * + * Copyright (C) 1996 Olaf Kirch + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +static inline int +nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh) +{ + return share->s_owner.len == oh->len + && !memcmp(share->s_owner.data, oh->data, oh->len); +} + +__be32 +nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, + struct nlm_args *argp) +{ + struct nlm_share *share; + struct xdr_netobj *oh = &argp->lock.oh; + u8 *ohdata; + + for (share = file->f_shares; share; share = share->s_next) { + if (share->s_host == host && nlm_cmp_owner(share, oh)) + goto update; + if ((argp->fsm_access & share->s_mode) + || (argp->fsm_mode & share->s_access )) + return nlm_lck_denied; + } + + share = kmalloc(sizeof(*share) + oh->len, + GFP_KERNEL); + if (share == NULL) + return nlm_lck_denied_nolocks; + + /* Copy owner handle */ + ohdata = (u8 *) (share + 1); + memcpy(ohdata, oh->data, oh->len); + + share->s_file = file; + share->s_host = host; + share->s_owner.data = ohdata; + share->s_owner.len = oh->len; + share->s_next = file->f_shares; + file->f_shares = share; + +update: + share->s_access = argp->fsm_access; + share->s_mode = argp->fsm_mode; + return nlm_granted; +} + +/* + * Delete a share. + */ +__be32 +nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, + struct nlm_args *argp) +{ + struct nlm_share *share, **shpp; + struct xdr_netobj *oh = &argp->lock.oh; + + for (shpp = &file->f_shares; (share = *shpp) != NULL; + shpp = &share->s_next) { + if (share->s_host == host && nlm_cmp_owner(share, oh)) { + *shpp = share->s_next; + kfree(share); + return nlm_granted; + } + } + + /* X/Open spec says return success even if there was no + * corresponding share. */ + return nlm_granted; +} + +/* + * Traverse all shares for a given file, and delete + * those owned by the given (type of) host + */ +void nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, + nlm_host_match_fn_t match) +{ + struct nlm_share *share, **shpp; + + shpp = &file->f_shares; + while ((share = *shpp) != NULL) { + if (match(share->s_host, host)) { + *shpp = share->s_next; + kfree(share); + continue; + } + shpp = &share->s_next; + } +} diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c new file mode 100644 index 00000000..1ca0679c --- /dev/null +++ b/fs/lockd/svcsubs.c @@ -0,0 +1,446 @@ +/* + * linux/fs/lockd/svcsubs.c + * + * Various support routines for the NLM server. 
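+ * The central data structure here is a small hash table of nlm_file
+ * entries keyed by NFS file handle; lookups, traversal and teardown
+ * all run under nlm_file_mutex.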
+ * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_SVCSUBS + + +/* + * Global file hash table + */ +#define FILE_HASH_BITS 7 +#define FILE_NRHASH (1<data; + + /* print the first 32 bytes of the fh */ + dprintk("lockd: %s (%08x %08x %08x %08x %08x %08x %08x %08x)\n", + msg, fhp[0], fhp[1], fhp[2], fhp[3], + fhp[4], fhp[5], fhp[6], fhp[7]); +} + +static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) +{ + struct inode *inode = file->f_file->f_path.dentry->d_inode; + + dprintk("lockd: %s %s/%ld\n", + msg, inode->i_sb->s_id, inode->i_ino); +} +#else +static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) +{ + return; +} + +static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) +{ + return; +} +#endif + +static inline unsigned int file_hash(struct nfs_fh *f) +{ + unsigned int tmp=0; + int i; + for (i=0; idata[i]; + return tmp & (FILE_NRHASH - 1); +} + +/* + * Lookup file info. If it doesn't exist, create a file info struct + * and open a (VFS) file for the given inode. + * + * FIXME: + * Note that we open the file O_RDONLY even when creating write locks. + * This is not quite right, but for now, we assume the client performs + * the proper R/W checking. + */ +__be32 +nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, + struct nfs_fh *f) +{ + struct hlist_node *pos; + struct nlm_file *file; + unsigned int hash; + __be32 nfserr; + + nlm_debug_print_fh("nlm_lookup_file", f); + + hash = file_hash(f); + + /* Lock file table */ + mutex_lock(&nlm_file_mutex); + + hlist_for_each_entry(file, pos, &nlm_files[hash], f_list) + if (!nfs_compare_fh(&file->f_handle, f)) + goto found; + + nlm_debug_print_fh("creating file for", f); + + nfserr = nlm_lck_denied_nolocks; + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) + goto out_unlock; + + memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); + mutex_init(&file->f_mutex); + INIT_HLIST_NODE(&file->f_list); + INIT_LIST_HEAD(&file->f_blocks); + + /* Open the file. Note that this must not sleep for too long, else + * we would lock up lockd:-) So no NFS re-exports, folks. + * + * We have to make sure we have the right credential to open + * the file. + */ + if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { + dprintk("lockd: open failed (error %d)\n", nfserr); + goto out_free; + } + + hlist_add_head(&file->f_list, &nlm_files[hash]); + +found: + dprintk("lockd: found file %p (count %d)\n", file, file->f_count); + *result = file; + file->f_count++; + nfserr = 0; + +out_unlock: + mutex_unlock(&nlm_file_mutex); + return nfserr; + +out_free: + kfree(file); + goto out_unlock; +} + +/* + * Delete a file after having released all locks, blocks and shares + */ +static inline void +nlm_delete_file(struct nlm_file *file) +{ + nlm_debug_print_file("closing file", file); + if (!hlist_unhashed(&file->f_list)) { + hlist_del(&file->f_list); + nlmsvc_ops->fclose(file->f_file); + kfree(file); + } else { + printk(KERN_WARNING "lockd: attempt to release unknown file!\n"); + } +} + +/* + * Loop over all locks on the given file and perform the specified + * action. 
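+ * Whenever the match function selects a lock's owner, that lock is
+ * removed with an F_UNLCK covering the whole file and the scan is
+ * restarted from the beginning, because dropping lock_flocks()
+ * invalidates the iteration over i_flock.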
+ */ +static int +nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, + nlm_host_match_fn_t match) +{ + struct inode *inode = nlmsvc_file_inode(file); + struct file_lock *fl; + struct nlm_host *lockhost; + +again: + file->f_locks = 0; + lock_flocks(); /* protects i_flock list */ + for (fl = inode->i_flock; fl; fl = fl->fl_next) { + if (fl->fl_lmops != &nlmsvc_lock_operations) + continue; + + /* update current lock count */ + file->f_locks++; + + lockhost = (struct nlm_host *) fl->fl_owner; + if (match(lockhost, host)) { + struct file_lock lock = *fl; + + unlock_flocks(); + lock.fl_type = F_UNLCK; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) { + printk("lockd: unlock failure in %s:%d\n", + __FILE__, __LINE__); + return 1; + } + goto again; + } + } + unlock_flocks(); + + return 0; +} + +static int +nlmsvc_always_match(void *dummy1, struct nlm_host *dummy2) +{ + return 1; +} + +/* + * Inspect a single file + */ +static inline int +nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match) +{ + nlmsvc_traverse_blocks(host, file, match); + nlmsvc_traverse_shares(host, file, match); + return nlm_traverse_locks(host, file, match); +} + +/* + * Quick check whether there are still any locks, blocks or + * shares on a given file. + */ +static inline int +nlm_file_inuse(struct nlm_file *file) +{ + struct inode *inode = nlmsvc_file_inode(file); + struct file_lock *fl; + + if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) + return 1; + + lock_flocks(); + for (fl = inode->i_flock; fl; fl = fl->fl_next) { + if (fl->fl_lmops == &nlmsvc_lock_operations) { + unlock_flocks(); + return 1; + } + } + unlock_flocks(); + file->f_locks = 0; + return 0; +} + +/* + * Loop over all files in the file table. + */ +static int +nlm_traverse_files(void *data, nlm_host_match_fn_t match, + int (*is_failover_file)(void *data, struct nlm_file *file)) +{ + struct hlist_node *pos, *next; + struct nlm_file *file; + int i, ret = 0; + + mutex_lock(&nlm_file_mutex); + for (i = 0; i < FILE_NRHASH; i++) { + hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) { + if (is_failover_file && !is_failover_file(data, file)) + continue; + file->f_count++; + mutex_unlock(&nlm_file_mutex); + + /* Traverse locks, blocks and shares of this file + * and update file->f_locks count */ + if (nlm_inspect_file(data, file, match)) + ret = 1; + + mutex_lock(&nlm_file_mutex); + file->f_count--; + /* No more references to this file. Let go of it. */ + if (list_empty(&file->f_blocks) && !file->f_locks + && !file->f_shares && !file->f_count) { + hlist_del(&file->f_list); + nlmsvc_ops->fclose(file->f_file); + kfree(file); + } + } + } + mutex_unlock(&nlm_file_mutex); + return ret; +} + +/* + * Release file. If there are no more remote locks on this file, + * close it and free the handle. + * + * Note that we can't do proper reference counting without major + * contortions because the code in fs/locks.c creates, deletes and + * splits locks without notification. Our only way is to walk the + * entire lock list each time we remove a lock. 
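+ * That walk is done by nlm_file_inuse() above, which reports the
+ * file as busy if any reference, pending block, share or lockd-owned
+ * posix lock remains.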
+ */ +void +nlm_release_file(struct nlm_file *file) +{ + dprintk("lockd: nlm_release_file(%p, ct = %d)\n", + file, file->f_count); + + /* Lock file table */ + mutex_lock(&nlm_file_mutex); + + /* If there are no more locks etc, delete the file */ + if (--file->f_count == 0 && !nlm_file_inuse(file)) + nlm_delete_file(file); + + mutex_unlock(&nlm_file_mutex); +} + +/* + * Helpers function for resource traversal + * + * nlmsvc_mark_host: + * used by the garbage collector; simply sets h_inuse. + * Always returns 0. + * + * nlmsvc_same_host: + * returns 1 iff the two hosts match. Used to release + * all resources bound to a specific host. + * + * nlmsvc_is_client: + * returns 1 iff the host is a client. + * Used by nlmsvc_invalidate_all + */ +static int +nlmsvc_mark_host(void *data, struct nlm_host *dummy) +{ + struct nlm_host *host = data; + + host->h_inuse = 1; + return 0; +} + +static int +nlmsvc_same_host(void *data, struct nlm_host *other) +{ + struct nlm_host *host = data; + + return host == other; +} + +static int +nlmsvc_is_client(void *data, struct nlm_host *dummy) +{ + struct nlm_host *host = data; + + if (host->h_server) { + /* we are destroying locks even though the client + * hasn't asked us too, so don't unmonitor the + * client + */ + if (host->h_nsmhandle) + host->h_nsmhandle->sm_sticky = 1; + return 1; + } else + return 0; +} + +/* + * Mark all hosts that still hold resources + */ +void +nlmsvc_mark_resources(void) +{ + dprintk("lockd: nlmsvc_mark_resources\n"); + nlm_traverse_files(NULL, nlmsvc_mark_host, NULL); +} + +/* + * Release all resources held by the given client + */ +void +nlmsvc_free_host_resources(struct nlm_host *host) +{ + dprintk("lockd: nlmsvc_free_host_resources\n"); + + if (nlm_traverse_files(host, nlmsvc_same_host, NULL)) { + printk(KERN_WARNING + "lockd: couldn't remove all locks held by %s\n", + host->h_name); + BUG(); + } +} + +/** + * nlmsvc_invalidate_all - remove all locks held for clients + * + * Release all locks held by NFS clients. + * + */ +void +nlmsvc_invalidate_all(void) +{ + /* + * Previously, the code would call + * nlmsvc_free_host_resources for each client in + * turn, which is about as inefficient as it gets. + * Now we just do it once in nlm_traverse_files. + */ + nlm_traverse_files(NULL, nlmsvc_is_client, NULL); +} + +static int +nlmsvc_match_sb(void *datap, struct nlm_file *file) +{ + struct super_block *sb = datap; + + return sb == file->f_file->f_path.mnt->mnt_sb; +} + +/** + * nlmsvc_unlock_all_by_sb - release locks held on this file system + * @sb: super block + * + * Release all locks held by clients accessing this file system. + */ +int +nlmsvc_unlock_all_by_sb(struct super_block *sb) +{ + int ret; + + ret = nlm_traverse_files(sb, nlmsvc_always_match, nlmsvc_match_sb); + return ret ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); + +static int +nlmsvc_match_ip(void *datap, struct nlm_host *host) +{ + return rpc_cmp_addr(nlm_srcaddr(host), datap); +} + +/** + * nlmsvc_unlock_all_by_ip - release local locks by IP address + * @server_addr: server's IP address as seen by clients + * + * Release all locks held by clients accessing this host + * via the passed in IP address. + */ +int +nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr) +{ + int ret; + + ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL); + return ret ? 
-EIO : 0; +} +EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip); diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c new file mode 100644 index 00000000..964666c6 --- /dev/null +++ b/fs/lockd/xdr.c @@ -0,0 +1,343 @@ +/* + * linux/fs/lockd/xdr.c + * + * XDR support for lockd and the lock client. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + + +static inline loff_t +s32_to_loff_t(__s32 offset) +{ + return (loff_t)offset; +} + +static inline __s32 +loff_t_to_s32(loff_t offset) +{ + __s32 res; + if (offset >= NLM_OFFSET_MAX) + res = NLM_OFFSET_MAX; + else if (offset <= -NLM_OFFSET_MAX) + res = -NLM_OFFSET_MAX; + else + res = offset; + return res; +} + +/* + * XDR functions for basic NLM types + */ +static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c) +{ + unsigned int len; + + len = ntohl(*p++); + + if(len==0) + { + c->len=4; + memset(c->data, 0, 4); /* hockeypux brain damage */ + } + else if(len<=NLM_MAXCOOKIELEN) + { + c->len=len; + memcpy(c->data, p, len); + p+=XDR_QUADLEN(len); + } + else + { + dprintk("lockd: bad cookie size %d (only cookies under " + "%d bytes are supported.)\n", + len, NLM_MAXCOOKIELEN); + return NULL; + } + return p; +} + +static inline __be32 * +nlm_encode_cookie(__be32 *p, struct nlm_cookie *c) +{ + *p++ = htonl(c->len); + memcpy(p, c->data, c->len); + p+=XDR_QUADLEN(c->len); + return p; +} + +static __be32 * +nlm_decode_fh(__be32 *p, struct nfs_fh *f) +{ + unsigned int len; + + if ((len = ntohl(*p++)) != NFS2_FHSIZE) { + dprintk("lockd: bad fhandle size %d (should be %d)\n", + len, NFS2_FHSIZE); + return NULL; + } + f->size = NFS2_FHSIZE; + memset(f->data, 0, sizeof(f->data)); + memcpy(f->data, p, NFS2_FHSIZE); + return p + XDR_QUADLEN(NFS2_FHSIZE); +} + +static inline __be32 * +nlm_encode_fh(__be32 *p, struct nfs_fh *f) +{ + *p++ = htonl(NFS2_FHSIZE); + memcpy(p, f->data, NFS2_FHSIZE); + return p + XDR_QUADLEN(NFS2_FHSIZE); +} + +/* + * Encode and decode owner handle + */ +static inline __be32 * +nlm_decode_oh(__be32 *p, struct xdr_netobj *oh) +{ + return xdr_decode_netobj(p, oh); +} + +static inline __be32 * +nlm_encode_oh(__be32 *p, struct xdr_netobj *oh) +{ + return xdr_encode_netobj(p, oh); +} + +static __be32 * +nlm_decode_lock(__be32 *p, struct nlm_lock *lock) +{ + struct file_lock *fl = &lock->fl; + s32 start, len, end; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, + NLM_MAXSTRLEN)) + || !(p = nlm_decode_fh(p, &lock->fh)) + || !(p = nlm_decode_oh(p, &lock->oh))) + return NULL; + lock->svid = ntohl(*p++); + + locks_init_lock(fl); + fl->fl_owner = current->files; + fl->fl_pid = (pid_t)lock->svid; + fl->fl_flags = FL_POSIX; + fl->fl_type = F_RDLCK; /* as good as anything else */ + start = ntohl(*p++); + len = ntohl(*p++); + end = start + len - 1; + + fl->fl_start = s32_to_loff_t(start); + + if (len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s32_to_loff_t(end); + return p; +} + +/* + * Encode result of a TEST/TEST_MSG call + */ +static __be32 * +nlm_encode_testres(__be32 *p, struct nlm_res *resp) +{ + s32 start, len; + + if (!(p = nlm_encode_cookie(p, &resp->cookie))) + return NULL; + *p++ = resp->status; + + if (resp->status == nlm_lck_denied) { + struct file_lock *fl = &resp->lock.fl; + + *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; + *p++ = htonl(resp->lock.svid); + + /* Encode owner handle. 
*/ + if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) + return NULL; + + start = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); + + *p++ = htonl(start); + *p++ = htonl(len); + } + + return p; +} + + +/* + * First, the server side XDR functions + */ +int +nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm_decode_cookie(p, &argp->cookie))) + return 0; + + exclusive = ntohl(*p++); + if (!(p = nlm_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_encode_testres(p, resp))) + return 0; + return xdr_ressize_check(rqstp, p); +} + +int +nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + argp->reclaim = ntohl(*p++); + argp->state = ntohl(*p++); + argp->monitor = 1; /* monitor client by default */ + + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + if (!(p = nlm_decode_cookie(p, &argp->cookie)) + || !(p = nlm_decode_lock(p, &argp->lock))) + return 0; + argp->lock.fl.fl_type = F_UNLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32) 0; + lock->fl.fl_pid = (pid_t)lock->svid; + + if (!(p = nlm_decode_cookie(p, &argp->cookie)) + || !(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN)) + || !(p = nlm_decode_fh(p, &lock->fh)) + || !(p = nlm_decode_oh(p, &lock->oh))) + return 0; + argp->fsm_mode = ntohl(*p++); + argp->fsm_access = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + *p++ = xdr_zero; /* sequence argument */ + return xdr_ressize_check(rqstp, p); +} + +int +nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + return xdr_ressize_check(rqstp, p); +} + +int +nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) +{ + if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) + return 0; + 
argp->state = ntohl(*p++); + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm_decode_cookie(p, &resp->cookie))) + return 0; + resp->status = *p++; + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c new file mode 100644 index 00000000..dfa4789c --- /dev/null +++ b/fs/lockd/xdr4.c @@ -0,0 +1,334 @@ +/* + * linux/fs/lockd/xdr4.c + * + * XDR support for lockd and the lock client. + * + * Copyright (C) 1995, 1996 Olaf Kirch + * Copyright (C) 1999, Trond Myklebust + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#define NLMDBG_FACILITY NLMDBG_XDR + +static inline loff_t +s64_to_loff_t(__s64 offset) +{ + return (loff_t)offset; +} + + +static inline s64 +loff_t_to_s64(loff_t offset) +{ + s64 res; + if (offset > NLM4_OFFSET_MAX) + res = NLM4_OFFSET_MAX; + else if (offset < -NLM4_OFFSET_MAX) + res = -NLM4_OFFSET_MAX; + else + res = offset; + return res; +} + +/* + * XDR functions for basic NLM types + */ +static __be32 * +nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c) +{ + unsigned int len; + + len = ntohl(*p++); + + if(len==0) + { + c->len=4; + memset(c->data, 0, 4); /* hockeypux brain damage */ + } + else if(len<=NLM_MAXCOOKIELEN) + { + c->len=len; + memcpy(c->data, p, len); + p+=XDR_QUADLEN(len); + } + else + { + dprintk("lockd: bad cookie size %d (only cookies under " + "%d bytes are supported.)\n", + len, NLM_MAXCOOKIELEN); + return NULL; + } + return p; +} + +static __be32 * +nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c) +{ + *p++ = htonl(c->len); + memcpy(p, c->data, c->len); + p+=XDR_QUADLEN(c->len); + return p; +} + +static __be32 * +nlm4_decode_fh(__be32 *p, struct nfs_fh *f) +{ + memset(f->data, 0, sizeof(f->data)); + f->size = ntohl(*p++); + if (f->size > NFS_MAXFHSIZE) { + dprintk("lockd: bad fhandle size %d (should be <=%d)\n", + f->size, NFS_MAXFHSIZE); + return NULL; + } + memcpy(f->data, p, f->size); + return p + XDR_QUADLEN(f->size); +} + +/* + * Encode and decode owner handle + */ +static __be32 * +nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh) +{ + return xdr_decode_netobj(p, oh); +} + +static __be32 * +nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) +{ + struct file_lock *fl = &lock->fl; + __u64 len, start; + __s64 end; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN)) + || !(p = nlm4_decode_fh(p, &lock->fh)) + || !(p = nlm4_decode_oh(p, &lock->oh))) + return NULL; + lock->svid = ntohl(*p++); + + locks_init_lock(fl); + fl->fl_owner = current->files; + fl->fl_pid = (pid_t)lock->svid; + fl->fl_flags = FL_POSIX; + fl->fl_type = F_RDLCK; /* as good as anything else */ + p = xdr_decode_hyper(p, &start); + p = xdr_decode_hyper(p, &len); + end = start + len - 1; + + fl->fl_start = s64_to_loff_t(start); + + if (len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = s64_to_loff_t(end); + return p; +} + +/* + * Encode result of a TEST/TEST_MSG call + */ +static __be32 * +nlm4_encode_testres(__be32 *p, struct nlm_res *resp) +{ + s64 start, len; + + dprintk("xdr: before encode_testres (p %p resp %p)\n", p, resp); + if (!(p = 
nlm4_encode_cookie(p, &resp->cookie))) + return NULL; + *p++ = resp->status; + + if (resp->status == nlm_lck_denied) { + struct file_lock *fl = &resp->lock.fl; + + *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; + *p++ = htonl(resp->lock.svid); + + /* Encode owner handle. */ + if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) + return NULL; + + start = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); + + p = xdr_encode_hyper(p, start); + p = xdr_encode_hyper(p, len); + dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n", + resp->status, (int)resp->lock.svid, fl->fl_type, + (long long)fl->fl_start, (long long)fl->fl_end); + } + + dprintk("xdr: after encode_testres (p %p resp %p)\n", p, resp); + return p; +} + + +/* + * First, the server side XDR functions + */ +int +nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + return 0; + + exclusive = ntohl(*p++); + if (!(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_encode_testres(p, resp))) + return 0; + return xdr_ressize_check(rqstp, p); +} + +int +nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + argp->reclaim = ntohl(*p++); + argp->state = ntohl(*p++); + argp->monitor = 1; /* monitor client by default */ + + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + u32 exclusive; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + return 0; + argp->block = ntohl(*p++); + exclusive = ntohl(*p++); + if (!(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + if (exclusive) + argp->lock.fl.fl_type = F_WRLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + if (!(p = nlm4_decode_cookie(p, &argp->cookie)) + || !(p = nlm4_decode_lock(p, &argp->lock))) + return 0; + argp->lock.fl.fl_type = F_UNLCK; + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32) 0; + lock->fl.fl_pid = (pid_t)lock->svid; + + if (!(p = nlm4_decode_cookie(p, &argp->cookie)) + || !(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN)) + || !(p = nlm4_decode_fh(p, &lock->fh)) + || !(p = nlm4_decode_oh(p, &lock->oh))) + return 0; + argp->fsm_mode = ntohl(*p++); + argp->fsm_access = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + return 0; + *p++ = resp->status; + *p++ = xdr_zero; /* sequence argument */ + return xdr_ressize_check(rqstp, p); +} + +int +nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + 
return 0; + *p++ = resp->status; + return xdr_ressize_check(rqstp, p); +} + +int +nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp) +{ + struct nlm_lock *lock = &argp->lock; + + if (!(p = xdr_decode_string_inplace(p, &lock->caller, + &lock->len, NLM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) +{ + if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) + return 0; + argp->state = ntohl(*p++); + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp) +{ + if (!(p = nlm4_decode_cookie(p, &resp->cookie))) + return 0; + resp->status = *p++; + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} diff --git a/fs/locks.c b/fs/locks.c new file mode 100644 index 00000000..b286539d --- /dev/null +++ b/fs/locks.c @@ -0,0 +1,2341 @@ +/* + * linux/fs/locks.c + * + * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls. + * Doug Evans (dje@spiff.uucp), August 07, 1992 + * + * Deadlock detection added. + * FIXME: one thing isn't handled yet: + * - mandatory locks (requires lots of changes elsewhere) + * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994. + * + * Miscellaneous edits, and a total rewrite of posix_lock_file() code. + * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994 + * + * Converted file_lock_table to a linked list from an array, which eliminates + * the limits on how many active file locks are open. + * Chad Page (pageone@netcom.com), November 27, 1994 + * + * Removed dependency on file descriptors. dup()'ed file descriptors now + * get the same locks as the original file descriptors, and a close() on + * any file descriptor removes ALL the locks on the file for the current + * process. Since locks still depend on the process id, locks are inherited + * after an exec() but not after a fork(). This agrees with POSIX, and both + * BSD and SVR4 practice. + * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995 + * + * Scrapped free list which is redundant now that we allocate locks + * dynamically with kmalloc()/kfree(). + * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995 + * + * Implemented two lock personalities - FL_FLOCK and FL_POSIX. + * + * FL_POSIX locks are created with calls to fcntl() and lockf() through the + * fcntl() system call. They have the semantics described above. + * + * FL_FLOCK locks are created with calls to flock(), through the flock() + * system call, which is new. Old C libraries implement flock() via fcntl() + * and will continue to use the old, broken implementation. + * + * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated + * with a file pointer (filp). As a result they can be shared by a parent + * process and its children after a fork(). They are removed when the last + * file descriptor referring to the file pointer is closed (unless explicitly + * unlocked). + * + * FL_FLOCK locks never deadlock, an existing lock is always removed before + * upgrading from shared to exclusive (or vice versa). 
When this happens + * any processes blocked by the current lock are woken up and allowed to + * run before the new lock is applied. + * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995 + * + * Removed some race conditions in flock_lock_file(), marked other possible + * races. Just grep for FIXME to see them. + * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996. + * + * Addressed Dmitry's concerns. Deadlock checking no longer recursive. + * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep + * once we've checked for blocking and deadlocking. + * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996. + * + * Initial implementation of mandatory locks. SunOS turned out to be + * a rotten model, so I implemented the "obvious" semantics. + * See 'Documentation/mandatory.txt' for details. + * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. + * + * Don't allow mandatory locks on mmap()'ed files. Added simple functions to + * check if a file has mandatory locks, used by mmap(), open() and creat() to + * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference + * Manual, Section 2. + * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996. + * + * Tidied up block list handling. Added '/proc/locks' interface. + * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996. + * + * Fixed deadlock condition for pathological code that mixes calls to + * flock() and fcntl(). + * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996. + * + * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use + * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to + * guarantee sensible behaviour in the case where file system modules might + * be compiled with different options than the kernel itself. + * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. + * + * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel + * (Thomas.Meckel@mni.fh-giessen.de) for spotting this. + * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. + * + * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK + * locks. Changed process synchronisation to avoid dereferencing locks that + * have already been freed. + * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996. + * + * Made the block list a circular list to minimise searching in the list. + * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996. + * + * Made mandatory locking a mount option. Default is not to allow mandatory + * locking. + * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996. + * + * Some adaptations for NFS support. + * Olaf Kirch (okir@monad.swb.de), Dec 1996, + * + * Fixed /proc/locks interface so that we can't overrun the buffer we are handed. + * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997. + * + * Use slab allocator instead of kmalloc/kfree. + * Use generic list implementation from . + * Sped up posix_locks_deadlock by only considering blocked locks. + * Matthew Wilcox , March, 2000. + * + * Leases and LOCK_MAND + * Matthew Wilcox , June, 2000. + * Stephen Rothwell , June, 2000. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) +#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) +#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) + +int leases_enable = 1; +int lease_break_time = 45; + +#define for_each_lock(inode, lockp) \ + for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) + +static LIST_HEAD(file_lock_list); +static LIST_HEAD(blocked_list); +static DEFINE_SPINLOCK(file_lock_lock); + +/* + * Protects the two list heads above, plus the inode->i_flock list + */ +void lock_flocks(void) +{ + spin_lock(&file_lock_lock); +} +EXPORT_SYMBOL_GPL(lock_flocks); + +void unlock_flocks(void) +{ + spin_unlock(&file_lock_lock); +} +EXPORT_SYMBOL_GPL(unlock_flocks); + +static struct kmem_cache *filelock_cache __read_mostly; + +static void locks_init_lock_always(struct file_lock *fl) +{ + fl->fl_next = NULL; + fl->fl_fasync = NULL; + fl->fl_owner = NULL; + fl->fl_pid = 0; + fl->fl_nspid = NULL; + fl->fl_file = NULL; + fl->fl_flags = 0; + fl->fl_type = 0; + fl->fl_start = fl->fl_end = 0; +} + +/* Allocate an empty lock structure. */ +struct file_lock *locks_alloc_lock(void) +{ + struct file_lock *fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); + + if (fl) + locks_init_lock_always(fl); + + return fl; +} +EXPORT_SYMBOL_GPL(locks_alloc_lock); + +void locks_release_private(struct file_lock *fl) +{ + if (fl->fl_ops) { + if (fl->fl_ops->fl_release_private) + fl->fl_ops->fl_release_private(fl); + fl->fl_ops = NULL; + } + if (fl->fl_lmops) { + if (fl->fl_lmops->fl_release_private) + fl->fl_lmops->fl_release_private(fl); + fl->fl_lmops = NULL; + } + +} +EXPORT_SYMBOL_GPL(locks_release_private); + +/* Free a lock which is not in use. */ +void locks_free_lock(struct file_lock *fl) +{ + BUG_ON(waitqueue_active(&fl->fl_wait)); + BUG_ON(!list_empty(&fl->fl_block)); + BUG_ON(!list_empty(&fl->fl_link)); + + locks_release_private(fl); + kmem_cache_free(filelock_cache, fl); +} +EXPORT_SYMBOL(locks_free_lock); + +void locks_init_lock(struct file_lock *fl) +{ + INIT_LIST_HEAD(&fl->fl_link); + INIT_LIST_HEAD(&fl->fl_block); + init_waitqueue_head(&fl->fl_wait); + fl->fl_ops = NULL; + fl->fl_lmops = NULL; + locks_init_lock_always(fl); +} + +EXPORT_SYMBOL(locks_init_lock); + +/* + * Initialises the fields of the file lock which are invariant for + * free file_locks. + */ +static void init_once(void *foo) +{ + struct file_lock *lock = (struct file_lock *) foo; + + locks_init_lock(lock); +} + +static void locks_copy_private(struct file_lock *new, struct file_lock *fl) +{ + if (fl->fl_ops) { + if (fl->fl_ops->fl_copy_lock) + fl->fl_ops->fl_copy_lock(new, fl); + new->fl_ops = fl->fl_ops; + } + if (fl->fl_lmops) + new->fl_lmops = fl->fl_lmops; +} + +/* + * Initialize a new lock from an existing file_lock structure. 
+ */ +void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) +{ + new->fl_owner = fl->fl_owner; + new->fl_pid = fl->fl_pid; + new->fl_file = NULL; + new->fl_flags = fl->fl_flags; + new->fl_type = fl->fl_type; + new->fl_start = fl->fl_start; + new->fl_end = fl->fl_end; + new->fl_ops = NULL; + new->fl_lmops = NULL; +} +EXPORT_SYMBOL(__locks_copy_lock); + +void locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + locks_release_private(new); + + __locks_copy_lock(new, fl); + new->fl_file = fl->fl_file; + new->fl_ops = fl->fl_ops; + new->fl_lmops = fl->fl_lmops; + + locks_copy_private(new, fl); +} + +EXPORT_SYMBOL(locks_copy_lock); + +static inline int flock_translate_cmd(int cmd) { + if (cmd & LOCK_MAND) + return cmd & (LOCK_MAND | LOCK_RW); + switch (cmd) { + case LOCK_SH: + return F_RDLCK; + case LOCK_EX: + return F_WRLCK; + case LOCK_UN: + return F_UNLCK; + } + return -EINVAL; +} + +/* Fill in a file_lock structure with an appropriate FLOCK lock. */ +static int flock_make_lock(struct file *filp, struct file_lock **lock, + unsigned int cmd) +{ + struct file_lock *fl; + int type = flock_translate_cmd(cmd); + if (type < 0) + return type; + + fl = locks_alloc_lock(); + if (fl == NULL) + return -ENOMEM; + + fl->fl_file = filp; + fl->fl_pid = current->tgid; + fl->fl_flags = FL_FLOCK; + fl->fl_type = type; + fl->fl_end = OFFSET_MAX; + + *lock = fl; + return 0; +} + +static int assign_type(struct file_lock *fl, int type) +{ + switch (type) { + case F_RDLCK: + case F_WRLCK: + case F_UNLCK: + fl->fl_type = type; + break; + default: + return -EINVAL; + } + return 0; +} + +/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX + * style lock. + */ +static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock *l) +{ + off_t start, end; + + switch (l->l_whence) { + case SEEK_SET: + start = 0; + break; + case SEEK_CUR: + start = filp->f_pos; + break; + case SEEK_END: + start = i_size_read(filp->f_path.dentry->d_inode); + break; + default: + return -EINVAL; + } + + /* POSIX-1996 leaves the case l->l_len < 0 undefined; + POSIX-2001 defines it. 
*/ + start += l->l_start; + if (start < 0) + return -EINVAL; + fl->fl_end = OFFSET_MAX; + if (l->l_len > 0) { + end = start + l->l_len - 1; + fl->fl_end = end; + } else if (l->l_len < 0) { + end = start - 1; + fl->fl_end = end; + start += l->l_len; + if (start < 0) + return -EINVAL; + } + fl->fl_start = start; /* we record the absolute position */ + if (fl->fl_end < fl->fl_start) + return -EOVERFLOW; + + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; + fl->fl_file = filp; + fl->fl_flags = FL_POSIX; + fl->fl_ops = NULL; + fl->fl_lmops = NULL; + + return assign_type(fl, l->l_type); +} + +#if BITS_PER_LONG == 32 +static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock64 *l) +{ + loff_t start; + + switch (l->l_whence) { + case SEEK_SET: + start = 0; + break; + case SEEK_CUR: + start = filp->f_pos; + break; + case SEEK_END: + start = i_size_read(filp->f_path.dentry->d_inode); + break; + default: + return -EINVAL; + } + + start += l->l_start; + if (start < 0) + return -EINVAL; + fl->fl_end = OFFSET_MAX; + if (l->l_len > 0) { + fl->fl_end = start + l->l_len - 1; + } else if (l->l_len < 0) { + fl->fl_end = start - 1; + start += l->l_len; + if (start < 0) + return -EINVAL; + } + fl->fl_start = start; /* we record the absolute position */ + if (fl->fl_end < fl->fl_start) + return -EOVERFLOW; + + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; + fl->fl_file = filp; + fl->fl_flags = FL_POSIX; + fl->fl_ops = NULL; + fl->fl_lmops = NULL; + + return assign_type(fl, l->l_type); +} +#endif + +/* default lease lock manager operations */ +static void lease_break_callback(struct file_lock *fl) +{ + kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); +} + +static void lease_release_private_callback(struct file_lock *fl) +{ + if (!fl->fl_file) + return; + + f_delown(fl->fl_file); + fl->fl_file->f_owner.signum = 0; +} + +static const struct lock_manager_operations lease_manager_ops = { + .fl_break = lease_break_callback, + .fl_release_private = lease_release_private_callback, + .fl_change = lease_modify, +}; + +/* + * Initialize a lease, use the default lock manager operations + */ +static int lease_init(struct file *filp, int type, struct file_lock *fl) + { + if (assign_type(fl, type) != 0) + return -EINVAL; + + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; + + fl->fl_file = filp; + fl->fl_flags = FL_LEASE; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + fl->fl_ops = NULL; + fl->fl_lmops = &lease_manager_ops; + return 0; +} + +/* Allocate a file_lock initialised to this type of lease */ +static struct file_lock *lease_alloc(struct file *filp, int type) +{ + struct file_lock *fl = locks_alloc_lock(); + int error = -ENOMEM; + + if (fl == NULL) + return ERR_PTR(error); + + error = lease_init(filp, type, fl); + if (error) { + locks_free_lock(fl); + return ERR_PTR(error); + } + return fl; +} + +/* Check if two locks overlap each other. + */ +static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) +{ + return ((fl1->fl_end >= fl2->fl_start) && + (fl2->fl_end >= fl1->fl_start)); +} + +/* + * Check whether two locks have the same owner. + */ +static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) +{ + if (fl1->fl_lmops && fl1->fl_lmops->fl_compare_owner) + return fl2->fl_lmops == fl1->fl_lmops && + fl1->fl_lmops->fl_compare_owner(fl1, fl2); + return fl1->fl_owner == fl2->fl_owner; +} + +/* Remove waiter from blocker's block list. + * When blocker ends up pointing to itself then the list is empty. 
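flock_to_posix_lock()/flock64_to_posix_lock() above are the kernel side of the userspace struct flock convention: l_whence and l_start pick the origin, l_len > 0 extends forward, l_len == 0 means "to end of file", and a negative l_len (POSIX.1-2001) covers the bytes just before l_start. A small userspace sketch of the three cases; the file name is hypothetical.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("example.dat", O_RDWR | O_CREAT, 0644);   /* hypothetical */
        struct flock fl = {
                .l_type   = F_WRLCK,
                .l_whence = SEEK_SET,
                .l_start  = 100,
                .l_len    = 50,         /* bytes 100..149 -> fl_start=100, fl_end=149 */
        };

        if (fd < 0)
                return 1;
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");

        fl.l_start = 200;
        fl.l_len   = 0;                 /* "to end of file": fl_end becomes OFFSET_MAX */
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");

        fl.l_start = 500;
        fl.l_len   = -100;              /* bytes 400..499, i.e. the 100 bytes before l_start */
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");

        close(fd);
        return 0;
}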
+ */ +static void __locks_delete_block(struct file_lock *waiter) +{ + list_del_init(&waiter->fl_block); + list_del_init(&waiter->fl_link); + waiter->fl_next = NULL; +} + +/* + */ +static void locks_delete_block(struct file_lock *waiter) +{ + lock_flocks(); + __locks_delete_block(waiter); + unlock_flocks(); +} + +/* Insert waiter into blocker's block list. + * We use a circular list so that processes can be easily woken up in + * the order they blocked. The documentation doesn't require this but + * it seems like the reasonable thing to do. + */ +static void locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) +{ + BUG_ON(!list_empty(&waiter->fl_block)); + list_add_tail(&waiter->fl_block, &blocker->fl_block); + waiter->fl_next = blocker; + if (IS_POSIX(blocker)) + list_add(&waiter->fl_link, &blocked_list); +} + +/* Wake up processes blocked waiting for blocker. + * If told to wait then schedule the processes until the block list + * is empty, otherwise empty the block list ourselves. + */ +static void locks_wake_up_blocks(struct file_lock *blocker) +{ + while (!list_empty(&blocker->fl_block)) { + struct file_lock *waiter; + + waiter = list_first_entry(&blocker->fl_block, + struct file_lock, fl_block); + __locks_delete_block(waiter); + if (waiter->fl_lmops && waiter->fl_lmops->fl_notify) + waiter->fl_lmops->fl_notify(waiter); + else + wake_up(&waiter->fl_wait); + } +} + +/* Insert file lock fl into an inode's lock list at the position indicated + * by pos. At the same time add the lock to the global file lock list. + */ +static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) +{ + list_add(&fl->fl_link, &file_lock_list); + + fl->fl_nspid = get_pid(task_tgid(current)); + + /* insert into file's list */ + fl->fl_next = *pos; + *pos = fl; +} + +/* + * Delete a lock and then free it. + * Wake up processes that are blocked waiting for this lock, + * notify the FS that the lock has been cleared and + * finally free the lock. + */ +static void locks_delete_lock(struct file_lock **thisfl_p) +{ + struct file_lock *fl = *thisfl_p; + + *thisfl_p = fl->fl_next; + fl->fl_next = NULL; + list_del_init(&fl->fl_link); + + fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); + if (fl->fl_fasync != NULL) { + printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); + fl->fl_fasync = NULL; + } + + if (fl->fl_nspid) { + put_pid(fl->fl_nspid); + fl->fl_nspid = NULL; + } + + locks_wake_up_blocks(fl); + locks_free_lock(fl); +} + +/* Determine if lock sys_fl blocks lock caller_fl. Common functionality + * checks for shared/exclusive status of overlapping locks. + */ +static int locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + if (sys_fl->fl_type == F_WRLCK) + return 1; + if (caller_fl->fl_type == F_WRLCK) + return 1; + return 0; +} + +/* Determine if lock sys_fl blocks lock caller_fl. POSIX specific + * checking before calling the locks_conflict(). + */ +static int posix_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + /* POSIX locks owned by the same process do not conflict with + * each other. + */ + if (!IS_POSIX(sys_fl) || posix_same_owner(caller_fl, sys_fl)) + return (0); + + /* Check whether they overlap */ + if (!locks_overlap(caller_fl, sys_fl)) + return 0; + + return (locks_conflict(caller_fl, sys_fl)); +} + +/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific + * checking before calling the locks_conflict(). 
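posix_locks_conflict()/locks_conflict() above reduce to three rules: locks with the same owner never conflict, non-overlapping ranges never conflict, and otherwise there is a conflict unless both locks are read locks. A simplified userspace model of that predicate; the struct and function names are hypothetical, not the kernel's.

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>

struct plock { int owner; short type; int64_t start, end; };

static bool overlap(const struct plock *a, const struct plock *b)
{
        return a->end >= b->start && b->end >= a->start;
}

static bool conflicts(const struct plock *a, const struct plock *b)
{
        if (a->owner == b->owner)       /* same owner never conflicts */
                return false;
        if (!overlap(a, b))             /* disjoint ranges never conflict */
                return false;
        return a->type == F_WRLCK || b->type == F_WRLCK;
}

int main(void)
{
        struct plock r1 = { 1, F_RDLCK,  0,  99 };
        struct plock r2 = { 2, F_RDLCK, 50, 149 };
        struct plock w1 = { 3, F_WRLCK, 90,  99 };

        assert(!conflicts(&r1, &r2));   /* overlapping read locks coexist */
        assert(conflicts(&r1, &w1));    /* write vs read over 90..99 conflicts */
        assert(conflicts(&r2, &w1));
        return 0;
}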
+ */ +static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +{ + /* FLOCK locks referring to the same filp do not conflict with + * each other. + */ + if (!IS_FLOCK(sys_fl) || (caller_fl->fl_file == sys_fl->fl_file)) + return (0); + if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND)) + return 0; + + return (locks_conflict(caller_fl, sys_fl)); +} + +void +posix_test_lock(struct file *filp, struct file_lock *fl) +{ + struct file_lock *cfl; + + lock_flocks(); + for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { + if (!IS_POSIX(cfl)) + continue; + if (posix_locks_conflict(fl, cfl)) + break; + } + if (cfl) { + __locks_copy_lock(fl, cfl); + if (cfl->fl_nspid) + fl->fl_pid = pid_vnr(cfl->fl_nspid); + } else + fl->fl_type = F_UNLCK; + unlock_flocks(); + return; +} +EXPORT_SYMBOL(posix_test_lock); + +/* + * Deadlock detection: + * + * We attempt to detect deadlocks that are due purely to posix file + * locks. + * + * We assume that a task can be waiting for at most one lock at a time. + * So for any acquired lock, the process holding that lock may be + * waiting on at most one other lock. That lock in turns may be held by + * someone waiting for at most one other lock. Given a requested lock + * caller_fl which is about to wait for a conflicting lock block_fl, we + * follow this chain of waiters to ensure we are not about to create a + * cycle. + * + * Since we do this before we ever put a process to sleep on a lock, we + * are ensured that there is never a cycle; that is what guarantees that + * the while() loop in posix_locks_deadlock() eventually completes. + * + * Note: the above assumption may not be true when handling lock + * requests from a broken NFS client. It may also fail in the presence + * of tasks (such as posix threads) sharing the same open file table. + * + * To handle those cases, we just bail out after a few iterations. + */ + +#define MAX_DEADLK_ITERATIONS 10 + +/* Find a lock that the owner of the given block_fl is blocking on. */ +static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) +{ + struct file_lock *fl; + + list_for_each_entry(fl, &blocked_list, fl_link) { + if (posix_same_owner(fl, block_fl)) + return fl->fl_next; + } + return NULL; +} + +static int posix_locks_deadlock(struct file_lock *caller_fl, + struct file_lock *block_fl) +{ + int i = 0; + + while ((block_fl = what_owner_is_waiting_for(block_fl))) { + if (i++ > MAX_DEADLK_ITERATIONS) + return 0; + if (posix_same_owner(caller_fl, block_fl)) + return 1; + } + return 0; +} + +/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks + * after any leases, but before any posix locks. + * + * Note that if called with an FL_EXISTS argument, the caller may determine + * whether or not a lock was successfully freed by testing the return + * value for -ENOENT. 
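The deadlock walk described above is what turns a classic ABBA locking pattern into an EDEADLK error for the second waiter. A rough two-process demonstration; the file name is hypothetical and the sleep()-based synchronisation is deliberately crude.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static int lock_byte(int fd, off_t off, int cmd)
{
        struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET,
                            .l_start = off, .l_len = 1 };
        return fcntl(fd, cmd, &fl);
}

int main(void)
{
        int fd = open("deadlock.dat", O_RDWR | O_CREAT, 0644);  /* hypothetical */

        if (fd < 0)
                return 1;
        if (fork() == 0) {              /* child: byte 1 first, then byte 0 */
                lock_byte(fd, 1, F_SETLK);
                sleep(1);               /* crude: let both sides grab their first byte */
                if (lock_byte(fd, 0, F_SETLKW) == -1)
                        printf("child: %s\n", strerror(errno));
                _exit(0);               /* exiting drops the child's locks */
        }
        lock_byte(fd, 0, F_SETLK);      /* parent: byte 0 first, then byte 1 */
        sleep(1);
        if (lock_byte(fd, 1, F_SETLKW) == -1)
                printf("parent: %s\n", strerror(errno));  /* one side sees EDEADLK */
        close(fd);                      /* drop our locks so the other side can finish */
        wait(NULL);
        return 0;
}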
+ */ +static int flock_lock_file(struct file *filp, struct file_lock *request) +{ + struct file_lock *new_fl = NULL; + struct file_lock **before; + struct inode * inode = filp->f_path.dentry->d_inode; + int error = 0; + int found = 0; + + if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { + new_fl = locks_alloc_lock(); + if (!new_fl) + return -ENOMEM; + } + + lock_flocks(); + if (request->fl_flags & FL_ACCESS) + goto find_conflict; + + for_each_lock(inode, before) { + struct file_lock *fl = *before; + if (IS_POSIX(fl)) + break; + if (IS_LEASE(fl)) + continue; + if (filp != fl->fl_file) + continue; + if (request->fl_type == fl->fl_type) + goto out; + found = 1; + locks_delete_lock(before); + break; + } + + if (request->fl_type == F_UNLCK) { + if ((request->fl_flags & FL_EXISTS) && !found) + error = -ENOENT; + goto out; + } + + /* + * If a higher-priority process was blocked on the old file lock, + * give it the opportunity to lock the file. + */ + if (found) { + unlock_flocks(); + cond_resched(); + lock_flocks(); + } + +find_conflict: + for_each_lock(inode, before) { + struct file_lock *fl = *before; + if (IS_POSIX(fl)) + break; + if (IS_LEASE(fl)) + continue; + if (!flock_locks_conflict(request, fl)) + continue; + error = -EAGAIN; + if (!(request->fl_flags & FL_SLEEP)) + goto out; + error = FILE_LOCK_DEFERRED; + locks_insert_block(fl, request); + goto out; + } + if (request->fl_flags & FL_ACCESS) + goto out; + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); + new_fl = NULL; + error = 0; + +out: + unlock_flocks(); + if (new_fl) + locks_free_lock(new_fl); + return error; +} + +static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) +{ + struct file_lock *fl; + struct file_lock *new_fl = NULL; + struct file_lock *new_fl2 = NULL; + struct file_lock *left = NULL; + struct file_lock *right = NULL; + struct file_lock **before; + int error, added = 0; + + /* + * We may need two file_lock structures for this operation, + * so we get them in advance to avoid races. + * + * In some cases we can be sure, that no new locks will be needed + */ + if (!(request->fl_flags & FL_ACCESS) && + (request->fl_type != F_UNLCK || + request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { + new_fl = locks_alloc_lock(); + new_fl2 = locks_alloc_lock(); + } + + lock_flocks(); + if (request->fl_type != F_UNLCK) { + for_each_lock(inode, before) { + fl = *before; + if (!IS_POSIX(fl)) + continue; + if (!posix_locks_conflict(request, fl)) + continue; + if (conflock) + __locks_copy_lock(conflock, fl); + error = -EAGAIN; + if (!(request->fl_flags & FL_SLEEP)) + goto out; + error = -EDEADLK; + if (posix_locks_deadlock(request, fl)) + goto out; + error = FILE_LOCK_DEFERRED; + locks_insert_block(fl, request); + goto out; + } + } + + /* If we're just looking for a conflict, we're done. */ + error = 0; + if (request->fl_flags & FL_ACCESS) + goto out; + + /* + * Find the first old lock with the same owner as the new lock. + */ + + before = &inode->i_flock; + + /* First skip locks owned by other processes. */ + while ((fl = *before) && (!IS_POSIX(fl) || + !posix_same_owner(request, fl))) { + before = &fl->fl_next; + } + + /* Process locks with this owner. */ + while ((fl = *before) && posix_same_owner(request, fl)) { + /* Detect adjacent or overlapping regions (if same lock type) + */ + if (request->fl_type == fl->fl_type) { + /* In all comparisons of start vs end, use + * "start - 1" rather than "end + 1". 
If end + * is OFFSET_MAX, end + 1 will become negative. + */ + if (fl->fl_end < request->fl_start - 1) + goto next_lock; + /* If the next lock in the list has entirely bigger + * addresses than the new one, insert the lock here. + */ + if (fl->fl_start - 1 > request->fl_end) + break; + + /* If we come here, the new and old lock are of the + * same type and adjacent or overlapping. Make one + * lock yielding from the lower start address of both + * locks to the higher end address. + */ + if (fl->fl_start > request->fl_start) + fl->fl_start = request->fl_start; + else + request->fl_start = fl->fl_start; + if (fl->fl_end < request->fl_end) + fl->fl_end = request->fl_end; + else + request->fl_end = fl->fl_end; + if (added) { + locks_delete_lock(before); + continue; + } + request = fl; + added = 1; + } + else { + /* Processing for different lock types is a bit + * more complex. + */ + if (fl->fl_end < request->fl_start) + goto next_lock; + if (fl->fl_start > request->fl_end) + break; + if (request->fl_type == F_UNLCK) + added = 1; + if (fl->fl_start < request->fl_start) + left = fl; + /* If the next lock in the list has a higher end + * address than the new one, insert the new one here. + */ + if (fl->fl_end > request->fl_end) { + right = fl; + break; + } + if (fl->fl_start >= request->fl_start) { + /* The new lock completely replaces an old + * one (This may happen several times). + */ + if (added) { + locks_delete_lock(before); + continue; + } + /* Replace the old lock with the new one. + * Wake up anybody waiting for the old one, + * as the change in lock type might satisfy + * their needs. + */ + locks_wake_up_blocks(fl); + fl->fl_start = request->fl_start; + fl->fl_end = request->fl_end; + fl->fl_type = request->fl_type; + locks_release_private(fl); + locks_copy_private(fl, request); + request = fl; + added = 1; + } + } + /* Go on to next lock. + */ + next_lock: + before = &fl->fl_next; + } + + /* + * The above code only modifies existing locks in case of + * merging or replacing. If new lock(s) need to be inserted + * all modifications are done bellow this, so it's safe yet to + * bail out. + */ + error = -ENOLCK; /* "no luck" */ + if (right && left == right && !new_fl2) + goto out; + + error = 0; + if (!added) { + if (request->fl_type == F_UNLCK) { + if (request->fl_flags & FL_EXISTS) + error = -ENOENT; + goto out; + } + + if (!new_fl) { + error = -ENOLCK; + goto out; + } + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); + new_fl = NULL; + } + if (right) { + if (left == right) { + /* The new lock breaks the old one in two pieces, + * so we have to use the second new lock. + */ + left = new_fl2; + new_fl2 = NULL; + locks_copy_lock(left, right); + locks_insert_lock(before, left); + } + right->fl_start = request->fl_end + 1; + locks_wake_up_blocks(right); + } + if (left) { + left->fl_end = request->fl_start - 1; + locks_wake_up_blocks(left); + } + out: + unlock_flocks(); + /* + * Free any unused locks. + */ + if (new_fl) + locks_free_lock(new_fl); + if (new_fl2) + locks_free_lock(new_fl2); + return error; +} + +/** + * posix_lock_file - Apply a POSIX-style lock to a file + * @filp: The file to apply the lock to + * @fl: The lock to be applied + * @conflock: Place to return a copy of the conflicting lock, if found. + * + * Add a POSIX style lock to a file. + * We merge adjacent & overlapping locks whenever possible. 
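The left/right handling above is why __posix_lock_file() pre-allocates a second file_lock (new_fl2): unlocking the middle of an existing lock has to split it in two. A userspace sketch that triggers the split; the file name is hypothetical.

#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

static int set_lock(int fd, short type, off_t start, off_t len)
{
        struct flock fl = { .l_type = type, .l_whence = SEEK_SET,
                            .l_start = start, .l_len = len };
        return fcntl(fd, F_SETLK, &fl);
}

int main(void)
{
        int fd = open("split.dat", O_RDWR | O_CREAT, 0644);     /* hypothetical */

        if (fd < 0)
                return 1;
        set_lock(fd, F_WRLCK, 0, 100);  /* one lock covering bytes 0..99   */
        set_lock(fd, F_UNLCK, 40, 20);  /* unlock the middle, bytes 40..59 */

        /* The kernel now keeps two separate POSIX locks for this process,
         * 0..39 and 60..99, the "left"/"right" split that needs the second
         * pre-allocated file_lock above.  They appear as two lines for this
         * inode in /proc/locks. */
        pause();                        /* keep the process (and its locks) around */
        return 0;
}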
+ * POSIX locks are sorted by owner task, then by starting address + * + * Note that if called with an FL_EXISTS argument, the caller may determine + * whether or not a lock was successfully freed by testing the return + * value for -ENOENT. + */ +int posix_lock_file(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) +{ + return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock); +} +EXPORT_SYMBOL(posix_lock_file); + +/** + * posix_lock_file_wait - Apply a POSIX-style lock to a file + * @filp: The file to apply the lock to + * @fl: The lock to be applied + * + * Add a POSIX style lock to a file. + * We merge adjacent & overlapping locks whenever possible. + * POSIX locks are sorted by owner task, then by starting address + */ +int posix_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + int error; + might_sleep (); + for (;;) { + error = posix_lock_file(filp, fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + return error; +} +EXPORT_SYMBOL(posix_lock_file_wait); + +/** + * locks_mandatory_locked - Check for an active lock + * @inode: the file to check + * + * Searches the inode's list of locks to find any POSIX locks which conflict. + * This function is called from locks_verify_locked() only. + */ +int locks_mandatory_locked(struct inode *inode) +{ + fl_owner_t owner = current->files; + struct file_lock *fl; + + /* + * Search the lock list for this inode for any POSIX locks. + */ + lock_flocks(); + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (!IS_POSIX(fl)) + continue; + if (fl->fl_owner != owner) + break; + } + unlock_flocks(); + return fl ? -EAGAIN : 0; +} + +/** + * locks_mandatory_area - Check for a conflicting lock + * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ + * for shared + * @inode: the file to check + * @filp: how the file was opened (if it was) + * @offset: start of area to check + * @count: length of area to check + * + * Searches the inode's list of locks to find any POSIX locks which conflict. + * This function is called from rw_verify_area() and + * locks_verify_truncate(). + */ +int locks_mandatory_area(int read_write, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) +{ + struct file_lock fl; + int error; + + locks_init_lock(&fl); + fl.fl_owner = current->files; + fl.fl_pid = current->tgid; + fl.fl_file = filp; + fl.fl_flags = FL_POSIX | FL_ACCESS; + if (filp && !(filp->f_flags & O_NONBLOCK)) + fl.fl_flags |= FL_SLEEP; + fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; + fl.fl_start = offset; + fl.fl_end = offset + count - 1; + + for (;;) { + error = __posix_lock_file(inode, &fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); + if (!error) { + /* + * If we've been sleeping someone might have + * changed the permissions behind our back. 
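locks_mandatory_area() above only comes into play for files marked for mandatory locking: the filesystem must be mounted with the "mand" option and the file must have the set-group-ID bit set with group execute cleared. A sketch of flipping those mode bits from userspace; the path is hypothetical and the mount option is assumed to be set elsewhere.

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        struct stat st;
        int fd = open("/mnt/mand/data", O_RDWR);        /* hypothetical path */

        if (fd < 0 || fstat(fd, &st) < 0)
                return 1;

        /* set-group-ID on, group execute off: the mandatory-locking marker */
        if (fchmod(fd, (st.st_mode & ~(mode_t)S_IXGRP) | S_ISGID) < 0)
                return 1;

        /* From now on, read()/write() calls by other processes that cross a
         * conflicting POSIX lock are checked via locks_mandatory_area() and
         * fail or block instead of ignoring the lock. */
        close(fd);
        return 0;
}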
+ */ + if (__mandatory_lock(inode)) + continue; + } + + locks_delete_block(&fl); + break; + } + + return error; +} + +EXPORT_SYMBOL(locks_mandatory_area); + +/* We already had a lease on this file; just change its type */ +int lease_modify(struct file_lock **before, int arg) +{ + struct file_lock *fl = *before; + int error = assign_type(fl, arg); + + if (error) + return error; + locks_wake_up_blocks(fl); + if (arg == F_UNLCK) + locks_delete_lock(before); + return 0; +} + +EXPORT_SYMBOL(lease_modify); + +static void time_out_leases(struct inode *inode) +{ + struct file_lock **before; + struct file_lock *fl; + + before = &inode->i_flock; + while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) { + if ((fl->fl_break_time == 0) + || time_before(jiffies, fl->fl_break_time)) { + before = &fl->fl_next; + continue; + } + lease_modify(before, fl->fl_type & ~F_INPROGRESS); + if (fl == *before) /* lease_modify may have freed fl */ + before = &fl->fl_next; + } +} + +/** + * __break_lease - revoke all outstanding leases on file + * @inode: the inode of the file to return + * @mode: the open mode (read or write) + * + * break_lease (inlined for speed) has checked there already is at least + * some kind of lock (maybe a lease) on this file. Leases are broken on + * a call to open() or truncate(). This function can sleep unless you + * specified %O_NONBLOCK to your open(). + */ +int __break_lease(struct inode *inode, unsigned int mode) +{ + int error = 0, future; + struct file_lock *new_fl, *flock; + struct file_lock *fl; + unsigned long break_time; + int i_have_this_lease = 0; + int want_write = (mode & O_ACCMODE) != O_RDONLY; + + new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); + + lock_flocks(); + + time_out_leases(inode); + + flock = inode->i_flock; + if ((flock == NULL) || !IS_LEASE(flock)) + goto out; + + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) + if (fl->fl_owner == current->files) + i_have_this_lease = 1; + + if (want_write) { + /* If we want write access, we have to revoke any lease. */ + future = F_UNLCK | F_INPROGRESS; + } else if (flock->fl_type & F_INPROGRESS) { + /* If the lease is already being broken, we just leave it */ + future = flock->fl_type; + } else if (flock->fl_type & F_WRLCK) { + /* Downgrade the exclusive lease to a read-only lease. */ + future = F_RDLCK | F_INPROGRESS; + } else { + /* the existing lease was read-only, so we can read too. 
*/ + goto out; + } + + if (IS_ERR(new_fl) && !i_have_this_lease + && ((mode & O_NONBLOCK) == 0)) { + error = PTR_ERR(new_fl); + goto out; + } + + break_time = 0; + if (lease_break_time > 0) { + break_time = jiffies + lease_break_time * HZ; + if (break_time == 0) + break_time++; /* so that 0 means no break time */ + } + + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { + if (fl->fl_type != future) { + fl->fl_type = future; + fl->fl_break_time = break_time; + /* lease must have lmops break callback */ + fl->fl_lmops->fl_break(fl); + } + } + + if (i_have_this_lease || (mode & O_NONBLOCK)) { + error = -EWOULDBLOCK; + goto out; + } + +restart: + break_time = flock->fl_break_time; + if (break_time != 0) { + break_time -= jiffies; + if (break_time == 0) + break_time++; + } + locks_insert_block(flock, new_fl); + unlock_flocks(); + error = wait_event_interruptible_timeout(new_fl->fl_wait, + !new_fl->fl_next, break_time); + lock_flocks(); + __locks_delete_block(new_fl); + if (error >= 0) { + if (error == 0) + time_out_leases(inode); + /* Wait for the next lease that has not been broken yet */ + for (flock = inode->i_flock; flock && IS_LEASE(flock); + flock = flock->fl_next) { + if (flock->fl_type & F_INPROGRESS) + goto restart; + } + error = 0; + } + +out: + unlock_flocks(); + if (!IS_ERR(new_fl)) + locks_free_lock(new_fl); + return error; +} + +EXPORT_SYMBOL(__break_lease); + +/** + * lease_get_mtime - get the last modified time of an inode + * @inode: the inode + * @time: pointer to a timespec which will contain the last modified time + * + * This is to force NFS clients to flush their caches for files with + * exclusive leases. The justification is that if someone has an + * exclusive lease, then they could be modifying it. + */ +void lease_get_mtime(struct inode *inode, struct timespec *time) +{ + struct file_lock *flock = inode->i_flock; + if (flock && IS_LEASE(flock) && (flock->fl_type & F_WRLCK)) + *time = current_fs_time(inode->i_sb); + else + *time = inode->i_mtime; +} + +EXPORT_SYMBOL(lease_get_mtime); + +/** + * fcntl_getlease - Enquire what lease is currently active + * @filp: the file + * + * The value returned by this function will be one of + * (if no lease break is pending): + * + * %F_RDLCK to indicate a shared lease is held. + * + * %F_WRLCK to indicate an exclusive lease is held. + * + * %F_UNLCK to indicate no lease is held. + * + * (if a lease break is pending): + * + * %F_RDLCK to indicate an exclusive lease needs to be + * changed to a shared lease (or removed). + * + * %F_UNLCK to indicate the lease needs to be removed. + * + * XXX: sfr & willy disagree over whether F_INPROGRESS + * should be returned to userspace. + */ +int fcntl_getlease(struct file *filp) +{ + struct file_lock *fl; + int type = F_UNLCK; + + lock_flocks(); + time_out_leases(filp->f_path.dentry->d_inode); + for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); + fl = fl->fl_next) { + if (fl->fl_file == filp) { + type = fl->fl_type & ~F_INPROGRESS; + break; + } + } + unlock_flocks(); + return type; +} + +/** + * generic_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * + * The (input) flp->fl_lmops->fl_break function is required + * by break_lease(). + * + * Called with file_lock_lock held. 
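fcntl_getlease() above backs the userspace F_GETLEASE query; because it masks off F_INPROGRESS, a pending break is reported as the type the lease is being downgraded to, as the comment block before it describes. A minimal query; the file name is hypothetical, and F_GETLEASE needs _GNU_SOURCE with glibc.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("leased.dat", O_RDONLY);  /* hypothetical */

        if (fd < 0)
                return 1;
        switch (fcntl(fd, F_GETLEASE)) {
        case F_RDLCK: puts("shared lease held (or downgrade pending)"); break;
        case F_WRLCK: puts("exclusive lease held"); break;
        case F_UNLCK: puts("no lease (or removal pending)"); break;
        default:      perror("F_GETLEASE");
        }
        close(fd);
        return 0;
}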
+ */ +int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +{ + struct file_lock *fl, **before, **my_before = NULL, *lease; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + int error, rdlease_count = 0, wrlease_count = 0; + + lease = *flp; + + error = -EACCES; + if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) + goto out; + error = -EINVAL; + if (!S_ISREG(inode->i_mode)) + goto out; + error = security_file_lock(filp, arg); + if (error) + goto out; + + time_out_leases(inode); + + BUG_ON(!(*flp)->fl_lmops->fl_break); + + if (arg != F_UNLCK) { + error = -EAGAIN; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + goto out; + if ((arg == F_WRLCK) + && ((dentry->d_count > 1) + || (atomic_read(&inode->i_count) > 1))) + goto out; + } + + /* + * At this point, we know that if there is an exclusive + * lease on this file, then we hold it on this filp + * (otherwise our open of this file would have blocked). + * And if we are trying to acquire an exclusive lease, + * then the file is not open by anyone (including us) + * except for this filp. + */ + for (before = &inode->i_flock; + ((fl = *before) != NULL) && IS_LEASE(fl); + before = &fl->fl_next) { + if (fl->fl_file == filp) + my_before = before; + else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) + /* + * Someone is in the process of opening this + * file for writing so we may not take an + * exclusive lease on it. + */ + wrlease_count++; + else + rdlease_count++; + } + + error = -EAGAIN; + if ((arg == F_RDLCK && (wrlease_count > 0)) || + (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0))) + goto out; + + if (my_before != NULL) { + error = lease->fl_lmops->fl_change(my_before, arg); + if (!error) + *flp = *my_before; + goto out; + } + + if (arg == F_UNLCK) + goto out; + + error = -EINVAL; + if (!leases_enable) + goto out; + + locks_insert_lock(before, lease); + return 0; + +out: + return error; +} +EXPORT_SYMBOL(generic_setlease); + +static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) +{ + if (filp->f_op && filp->f_op->setlease) + return filp->f_op->setlease(filp, arg, lease); + else + return generic_setlease(filp, arg, lease); +} + +/** + * vfs_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @lease: file_lock to use + * + * Call this to establish a lease on the file. + * The (*lease)->fl_lmops->fl_break operation must be set; if not, + * break_lease will oops! + * + * This will call the filesystem's setlease file method, if + * defined. Note that there is no getlease method; instead, the + * filesystem setlease method should call back to setlease() to + * add a lease to the inode's lease list, where fcntl_getlease() can + * find it. Since fcntl_getlease() only reports whether the current + * task holds a lease, a cluster filesystem need only do this for + * leases held by processes on this node. + * + * There is also no break_lease method; filesystems that + * handle their own leases should break leases themselves from the + * filesystem's open, create, and (on truncate) setattr methods. + * + * Warning: the only current setlease methods exist only to disable + * leases in certain cases. More vfs changes may be required to + * allow a full filesystem lease implementation. 
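The setlease path above is driven from userspace with fcntl(F_SETLEASE); the holder is notified with SIGIO (or whatever signal F_SETSIG selects) when someone opens or truncates the file, and then has lease_break_time (45 seconds by default here) to give the lease up. A sketch of a read-lease holder; the file name is hypothetical.

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void lease_broken(int sig)
{
        (void)sig;      /* flush or re-validate cached state here */
}

int main(void)
{
        int fd = open("cache.dat", O_RDONLY);   /* hypothetical */

        if (fd < 0)
                return 1;
        signal(SIGIO, lease_broken);            /* default lease-break signal */

        /* Take a read lease (needs to own the file or CAP_LEASE). */
        if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1) {
                perror("F_SETLEASE");
                return 1;
        }

        pause();                                /* wait for a lease break */
        fcntl(fd, F_SETLEASE, F_UNLCK);         /* release before the break times out */
        close(fd);
        return 0;
}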
+ */ + +int vfs_setlease(struct file *filp, long arg, struct file_lock **lease) +{ + int error; + + lock_flocks(); + error = __vfs_setlease(filp, arg, lease); + unlock_flocks(); + + return error; +} +EXPORT_SYMBOL_GPL(vfs_setlease); + +static int do_fcntl_delete_lease(struct file *filp) +{ + struct file_lock fl, *flp = &fl; + + lease_init(filp, F_UNLCK, flp); + + return vfs_setlease(filp, F_UNLCK, &flp); +} + +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) +{ + struct file_lock *fl, *ret; + struct fasync_struct *new; + int error; + + fl = lease_alloc(filp, arg); + if (IS_ERR(fl)) + return PTR_ERR(fl); + + new = fasync_alloc(); + if (!new) { + locks_free_lock(fl); + return -ENOMEM; + } + ret = fl; + lock_flocks(); + error = __vfs_setlease(filp, arg, &ret); + if (error) { + unlock_flocks(); + locks_free_lock(fl); + goto out_free_fasync; + } + if (ret != fl) + locks_free_lock(fl); + + /* + * fasync_insert_entry() returns the old entry if any. + * If there was no old entry, then it used 'new' and + * inserted it into the fasync list. Clear new so that + * we don't release it here. + */ + if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) + new = NULL; + + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + unlock_flocks(); + +out_free_fasync: + if (new) + fasync_free(new); + return error; +} + +/** + * fcntl_setlease - sets a lease on an open file + * @fd: open file descriptor + * @filp: file pointer + * @arg: type of lease to obtain + * + * Call this fcntl to establish a lease on the file. + * Note that you also need to call %F_SETSIG to + * receive a signal when the lease is broken. + */ +int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +{ + if (arg == F_UNLCK) + return do_fcntl_delete_lease(filp); + return do_fcntl_add_lease(fd, filp, arg); +} + +/** + * flock_lock_file_wait - Apply a FLOCK-style lock to a file + * @filp: The file to apply the lock to + * @fl: The lock to be applied + * + * Add a FLOCK style lock to a file. + */ +int flock_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + int error; + might_sleep(); + for (;;) { + error = flock_lock_file(filp, fl); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + return error; +} + +EXPORT_SYMBOL(flock_lock_file_wait); + +/** + * sys_flock: - flock() system call. + * @fd: the file descriptor to lock. + * @cmd: the type of lock to apply. + * + * Apply a %FL_FLOCK style lock to an open file descriptor. + * The @cmd can be one of + * + * %LOCK_SH -- a shared lock. + * + * %LOCK_EX -- an exclusive lock. + * + * %LOCK_UN -- remove an existing lock. + * + * %LOCK_MAND -- a `mandatory' flock. This exists to emulate Windows Share Modes. + * + * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other + * processes read and write access respectively. 
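A minimal userspace counterpart of the flock() call documented above, using LOCK_NB to avoid sleeping; the lock-file name is hypothetical.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
        int fd = open("spool.lock", O_RDWR | O_CREAT, 0644);    /* hypothetical */

        if (fd < 0)
                return 1;

        /* Try for an exclusive FL_FLOCK lock without sleeping. */
        if (flock(fd, LOCK_EX | LOCK_NB) == -1) {
                if (errno == EWOULDBLOCK)
                        fprintf(stderr, "someone else holds the lock\n");
                return 1;
        }

        /* ... critical section ... */

        flock(fd, LOCK_UN);     /* or simply close(fd): the last close drops it */
        close(fd);
        return 0;
}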
+ */ +SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) +{ + struct file *filp; + struct file_lock *lock; + int can_sleep, unlock; + int error; + + error = -EBADF; + filp = fget(fd); + if (!filp) + goto out; + + can_sleep = !(cmd & LOCK_NB); + cmd &= ~LOCK_NB; + unlock = (cmd == LOCK_UN); + + if (!unlock && !(cmd & LOCK_MAND) && + !(filp->f_mode & (FMODE_READ|FMODE_WRITE))) + goto out_putf; + + error = flock_make_lock(filp, &lock, cmd); + if (error) + goto out_putf; + if (can_sleep) + lock->fl_flags |= FL_SLEEP; + + error = security_file_lock(filp, lock->fl_type); + if (error) + goto out_free; + + if (filp->f_op && filp->f_op->flock) + error = filp->f_op->flock(filp, + (can_sleep) ? F_SETLKW : F_SETLK, + lock); + else + error = flock_lock_file_wait(filp, lock); + + out_free: + locks_free_lock(lock); + + out_putf: + fput(filp); + out: + return error; +} + +/** + * vfs_test_lock - test file byte range lock + * @filp: The file to test lock for + * @fl: The lock to test; also used to hold result + * + * Returns -ERRNO on failure. Indicates presence of conflicting lock by + * setting conf->fl_type to something other than F_UNLCK. + */ +int vfs_test_lock(struct file *filp, struct file_lock *fl) +{ + if (filp->f_op && filp->f_op->lock) + return filp->f_op->lock(filp, F_GETLK, fl); + posix_test_lock(filp, fl); + return 0; +} +EXPORT_SYMBOL_GPL(vfs_test_lock); + +static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) +{ + flock->l_pid = fl->fl_pid; +#if BITS_PER_LONG == 32 + /* + * Make sure we can represent the posix lock via + * legacy 32bit flock. + */ + if (fl->fl_start > OFFT_OFFSET_MAX) + return -EOVERFLOW; + if (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX) + return -EOVERFLOW; +#endif + flock->l_start = fl->fl_start; + flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : + fl->fl_end - fl->fl_start + 1; + flock->l_whence = 0; + flock->l_type = fl->fl_type; + return 0; +} + +#if BITS_PER_LONG == 32 +static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) +{ + flock->l_pid = fl->fl_pid; + flock->l_start = fl->fl_start; + flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : + fl->fl_end - fl->fl_start + 1; + flock->l_whence = 0; + flock->l_type = fl->fl_type; +} +#endif + +/* Report the first existing lock that would conflict with l. + * This implements the F_GETLK command of fcntl(). + */ +int fcntl_getlk(struct file *filp, struct flock __user *l) +{ + struct file_lock file_lock; + struct flock flock; + int error; + + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + error = -EINVAL; + if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + goto out; + + error = flock_to_posix_lock(filp, &file_lock, &flock); + if (error) + goto out; + + error = vfs_test_lock(filp, &file_lock); + if (error) + goto out; + + flock.l_type = file_lock.fl_type; + if (file_lock.fl_type != F_UNLCK) { + error = posix_lock_to_flock(&flock, &file_lock); + if (error) + goto out; + } + error = -EFAULT; + if (!copy_to_user(l, &flock, sizeof(flock))) + error = 0; +out: + return error; +} + +/** + * vfs_lock_file - file byte range lock + * @filp: The file to apply the lock to + * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.) + * @fl: The lock to be applied + * @conf: Place to return a copy of the conflicting lock, if found. + * + * A caller that doesn't care about the conflicting lock may pass NULL + * as the final argument. 
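fcntl_getlk()/posix_lock_to_flock() above implement the F_GETLK probe: the struct flock passed in describes the lock the caller would like to take, and it comes back either with l_type set to F_UNLCK or overwritten by the first conflicting lock. A small userspace sketch; the file name is hypothetical.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("shared.db", O_RDWR);     /* hypothetical */
        struct flock fl = {
                .l_type   = F_WRLCK,            /* "could I write-lock ...   */
                .l_whence = SEEK_SET,
                .l_start  = 0,
                .l_len    = 0,                  /*  ... the whole file?"     */
        };

        if (fd < 0 || fcntl(fd, F_GETLK, &fl) == -1)
                return 1;

        if (fl.l_type == F_UNLCK)
                printf("no conflicting lock\n");
        else                                    /* first conflicting lock found */
                printf("pid %ld holds a %s lock at %ld (len %ld)\n",
                       (long)fl.l_pid,
                       fl.l_type == F_WRLCK ? "write" : "read",
                       (long)fl.l_start, (long)fl.l_len);
        close(fd);
        return 0;
}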
+ * + * If the filesystem defines a private ->lock() method, then @conf will + * be left unchanged; so a caller that cares should initialize it to + * some acceptable default. + * + * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX + * locks, the ->lock() interface may return asynchronously, before the lock has + * been granted or denied by the underlying filesystem, if (and only if) + * fl_grant is set. Callers expecting ->lock() to return asynchronously + * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) + * the request is for a blocking lock. When ->lock() does return asynchronously, + * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock + * request completes. + * If the request is for non-blocking lock the file system should return + * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine + * with the result. If the request timed out the callback routine will return a + * nonzero return code and the file system should release the lock. The file + * system is also responsible to keep a corresponding posix lock when it + * grants a lock so the VFS can find out which locks are locally held and do + * the correct lock cleanup when required. + * The underlying filesystem must not drop the kernel lock or call + * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED + * return code. + */ +int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) +{ + if (filp->f_op && filp->f_op->lock) + return filp->f_op->lock(filp, cmd, fl); + else + return posix_lock_file(filp, fl, conf); +} +EXPORT_SYMBOL_GPL(vfs_lock_file); + +static int do_lock_file_wait(struct file *filp, unsigned int cmd, + struct file_lock *fl) +{ + int error; + + error = security_file_lock(filp, fl->fl_type); + if (error) + return error; + + for (;;) { + error = vfs_lock_file(filp, cmd, fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + + return error; +} + +/* Apply the lock described by l to an open file descriptor. + * This implements both the F_SETLK and F_SETLKW commands of fcntl(). + */ +int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock __user *l) +{ + struct file_lock *file_lock = locks_alloc_lock(); + struct flock flock; + struct inode *inode; + struct file *f; + int error; + + if (file_lock == NULL) + return -ENOLCK; + + /* + * This might block, so we do it before checking the inode. + */ + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + + inode = filp->f_path.dentry->d_inode; + + /* Don't allow mandatory locks on files that may be memory mapped + * and shared. + */ + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { + error = -EAGAIN; + goto out; + } + +again: + error = flock_to_posix_lock(filp, file_lock, &flock); + if (error) + goto out; + if (cmd == F_SETLKW) { + file_lock->fl_flags |= FL_SLEEP; + } + + error = -EBADF; + switch (flock.l_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + goto out; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + goto out; + break; + case F_UNLCK: + break; + default: + error = -EINVAL; + goto out; + } + + error = do_lock_file_wait(filp, cmd, file_lock); + + /* + * Attempt to detect a close/fcntl race and recover by + * releasing the lock that was just acquired. 
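From userspace the fcntl_setlk() path above is reached through F_SETLK (fail immediately on conflict) or F_SETLKW (sleep in do_lock_file_wait() until the lock is granted, a signal arrives, or a deadlock is detected). A brief sketch; the file name is hypothetical.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("journal.dat", O_RDWR | O_CREAT, 0644);   /* hypothetical */
        struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET,
                            .l_start = 0, .l_len = 0 };

        if (fd < 0)
                return 1;

        /* Non-blocking attempt: a conflict is reported immediately. */
        if (fcntl(fd, F_SETLK, &fl) == -1 &&
            (errno == EAGAIN || errno == EACCES))
                fprintf(stderr, "busy, waiting instead...\n");

        /* Blocking attempt: sleeps until granted; may fail with EINTR
         * (signal) or EDEADLK (cycle detected by the code above). */
        if (fcntl(fd, F_SETLKW, &fl) == -1)
                perror("F_SETLKW");

        close(fd);
        return 0;
}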
+ */ + /* + * we need that spin_lock here - it prevents reordering between + * update of inode->i_flock and check for it done in close(). + * rcu_read_lock() wouldn't do. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { + flock.l_type = F_UNLCK; + goto again; + } + +out: + locks_free_lock(file_lock); + return error; +} + +#if BITS_PER_LONG == 32 +/* Report the first existing lock that would conflict with l. + * This implements the F_GETLK command of fcntl(). + */ +int fcntl_getlk64(struct file *filp, struct flock64 __user *l) +{ + struct file_lock file_lock; + struct flock64 flock; + int error; + + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + error = -EINVAL; + if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + goto out; + + error = flock64_to_posix_lock(filp, &file_lock, &flock); + if (error) + goto out; + + error = vfs_test_lock(filp, &file_lock); + if (error) + goto out; + + flock.l_type = file_lock.fl_type; + if (file_lock.fl_type != F_UNLCK) + posix_lock_to_flock64(&flock, &file_lock); + + error = -EFAULT; + if (!copy_to_user(l, &flock, sizeof(flock))) + error = 0; + +out: + return error; +} + +/* Apply the lock described by l to an open file descriptor. + * This implements both the F_SETLK and F_SETLKW commands of fcntl(). + */ +int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock64 __user *l) +{ + struct file_lock *file_lock = locks_alloc_lock(); + struct flock64 flock; + struct inode *inode; + struct file *f; + int error; + + if (file_lock == NULL) + return -ENOLCK; + + /* + * This might block, so we do it before checking the inode. + */ + error = -EFAULT; + if (copy_from_user(&flock, l, sizeof(flock))) + goto out; + + inode = filp->f_path.dentry->d_inode; + + /* Don't allow mandatory locks on files that may be memory mapped + * and shared. + */ + if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { + error = -EAGAIN; + goto out; + } + +again: + error = flock64_to_posix_lock(filp, file_lock, &flock); + if (error) + goto out; + if (cmd == F_SETLKW64) { + file_lock->fl_flags |= FL_SLEEP; + } + + error = -EBADF; + switch (flock.l_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + goto out; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + goto out; + break; + case F_UNLCK: + break; + default: + error = -EINVAL; + goto out; + } + + error = do_lock_file_wait(filp, cmd, file_lock); + + /* + * Attempt to detect a close/fcntl race and recover by + * releasing the lock that was just acquired. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { + flock.l_type = F_UNLCK; + goto again; + } + +out: + locks_free_lock(file_lock); + return error; +} +#endif /* BITS_PER_LONG == 32 */ + +/* + * This function is called when the file is being removed + * from the task's fd array. POSIX locks belonging to this task + * are deleted at this time. + */ +void locks_remove_posix(struct file *filp, fl_owner_t owner) +{ + struct file_lock lock; + + /* + * If there are no locks held on this file, we don't need to call + * posix_lock_file(). Another process could be setting a lock on this + * file at the same time, but we wouldn't remove that lock anyway. 
+ */ + if (!filp->f_path.dentry->d_inode->i_flock) + return; + + lock.fl_type = F_UNLCK; + lock.fl_flags = FL_POSIX | FL_CLOSE; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + lock.fl_owner = owner; + lock.fl_pid = current->tgid; + lock.fl_file = filp; + lock.fl_ops = NULL; + lock.fl_lmops = NULL; + + vfs_lock_file(filp, F_SETLK, &lock, NULL); + + if (lock.fl_ops && lock.fl_ops->fl_release_private) + lock.fl_ops->fl_release_private(&lock); +} + +EXPORT_SYMBOL(locks_remove_posix); + +/* + * This function is called on the last close of an open file. + */ +void locks_remove_flock(struct file *filp) +{ + struct inode * inode = filp->f_path.dentry->d_inode; + struct file_lock *fl; + struct file_lock **before; + + if (!inode->i_flock) + return; + + if (filp->f_op && filp->f_op->flock) { + struct file_lock fl = { + .fl_pid = current->tgid, + .fl_file = filp, + .fl_flags = FL_FLOCK, + .fl_type = F_UNLCK, + .fl_end = OFFSET_MAX, + }; + filp->f_op->flock(filp, F_SETLKW, &fl); + if (fl.fl_ops && fl.fl_ops->fl_release_private) + fl.fl_ops->fl_release_private(&fl); + } + + lock_flocks(); + before = &inode->i_flock; + + while ((fl = *before) != NULL) { + if (fl->fl_file == filp) { + if (IS_FLOCK(fl)) { + locks_delete_lock(before); + continue; + } + if (IS_LEASE(fl)) { + lease_modify(before, F_UNLCK); + continue; + } + /* What? */ + BUG(); + } + before = &fl->fl_next; + } + unlock_flocks(); +} + +/** + * posix_unblock_lock - stop waiting for a file lock + * @filp: how the file was opened + * @waiter: the lock which was waiting + * + * lockd needs to block waiting for locks. + */ +int +posix_unblock_lock(struct file *filp, struct file_lock *waiter) +{ + int status = 0; + + lock_flocks(); + if (waiter->fl_next) + __locks_delete_block(waiter); + else + status = -ENOENT; + unlock_flocks(); + return status; +} + +EXPORT_SYMBOL(posix_unblock_lock); + +/** + * vfs_cancel_lock - file byte range unblock lock + * @filp: The file to apply the unblock to + * @fl: The lock to be unblocked + * + * Used by lock managers to cancel blocked requests + */ +int vfs_cancel_lock(struct file *filp, struct file_lock *fl) +{ + if (filp->f_op && filp->f_op->lock) + return filp->f_op->lock(filp, F_CANCELLK, fl); + return 0; +} + +EXPORT_SYMBOL_GPL(vfs_cancel_lock); + +#ifdef CONFIG_PROC_FS +#include +#include + +static void lock_get_status(struct seq_file *f, struct file_lock *fl, + loff_t id, char *pfx) +{ + struct inode *inode = NULL; + unsigned int fl_pid; + + if (fl->fl_nspid) + fl_pid = pid_vnr(fl->fl_nspid); + else + fl_pid = fl->fl_pid; + + if (fl->fl_file != NULL) + inode = fl->fl_file->f_path.dentry->d_inode; + + seq_printf(f, "%lld:%s ", id, pfx); + if (IS_POSIX(fl)) { + seq_printf(f, "%6s %s ", + (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", + (inode == NULL) ? "*NOINODE*" : + mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); + } else if (IS_FLOCK(fl)) { + if (fl->fl_type & LOCK_MAND) { + seq_printf(f, "FLOCK MSNFS "); + } else { + seq_printf(f, "FLOCK ADVISORY "); + } + } else if (IS_LEASE(fl)) { + seq_printf(f, "LEASE "); + if (fl->fl_type & F_INPROGRESS) + seq_printf(f, "BREAKING "); + else if (fl->fl_file) + seq_printf(f, "ACTIVE "); + else + seq_printf(f, "BREAKER "); + } else { + seq_printf(f, "UNKNOWN UNKNOWN "); + } + if (fl->fl_type & LOCK_MAND) { + seq_printf(f, "%s ", + (fl->fl_type & LOCK_READ) + ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " + : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); + } else { + seq_printf(f, "%s ", + (fl->fl_type & F_INPROGRESS) + ? (fl->fl_type & F_UNLCK) ? 
"UNLCK" : "READ " + : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); + } + if (inode) { +#ifdef WE_CAN_BREAK_LSLK_NOW + seq_printf(f, "%d %s:%ld ", fl_pid, + inode->i_sb->s_id, inode->i_ino); +#else + /* userspace relies on this representation of dev_t ;-( */ + seq_printf(f, "%d %02x:%02x:%ld ", fl_pid, + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino); +#endif + } else { + seq_printf(f, "%d :0 ", fl_pid); + } + if (IS_POSIX(fl)) { + if (fl->fl_end == OFFSET_MAX) + seq_printf(f, "%Ld EOF\n", fl->fl_start); + else + seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end); + } else { + seq_printf(f, "0 EOF\n"); + } +} + +static int locks_show(struct seq_file *f, void *v) +{ + struct file_lock *fl, *bfl; + + fl = list_entry(v, struct file_lock, fl_link); + + lock_get_status(f, fl, *((loff_t *)f->private), ""); + + list_for_each_entry(bfl, &fl->fl_block, fl_block) + lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); + + return 0; +} + +static void *locks_start(struct seq_file *f, loff_t *pos) +{ + loff_t *p = f->private; + + lock_flocks(); + *p = (*pos + 1); + return seq_list_start(&file_lock_list, *pos); +} + +static void *locks_next(struct seq_file *f, void *v, loff_t *pos) +{ + loff_t *p = f->private; + ++*p; + return seq_list_next(v, &file_lock_list, pos); +} + +static void locks_stop(struct seq_file *f, void *v) +{ + unlock_flocks(); +} + +static const struct seq_operations locks_seq_operations = { + .start = locks_start, + .next = locks_next, + .stop = locks_stop, + .show = locks_show, +}; + +static int locks_open(struct inode *inode, struct file *filp) +{ + return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); +} + +static const struct file_operations proc_locks_operations = { + .open = locks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int __init proc_locks_init(void) +{ + proc_create("locks", 0, NULL, &proc_locks_operations); + return 0; +} +module_init(proc_locks_init); +#endif + +/** + * lock_may_read - checks that the region is free of locks + * @inode: the inode that is being read + * @start: the first byte to read + * @len: the number of bytes to read + * + * Emulates Windows locking requirements. Whole-file + * mandatory locks (share modes) can prohibit a read and + * byte-range POSIX locks can prohibit a read if they overlap. + * + * N.B. this function is only ever called + * from knfsd and ownership of locks is never checked. + */ +int lock_may_read(struct inode *inode, loff_t start, unsigned long len) +{ + struct file_lock *fl; + int result = 1; + lock_flocks(); + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (IS_POSIX(fl)) { + if (fl->fl_type == F_RDLCK) + continue; + if ((fl->fl_end < start) || (fl->fl_start > (start + len))) + continue; + } else if (IS_FLOCK(fl)) { + if (!(fl->fl_type & LOCK_MAND)) + continue; + if (fl->fl_type & LOCK_READ) + continue; + } else + continue; + result = 0; + break; + } + unlock_flocks(); + return result; +} + +EXPORT_SYMBOL(lock_may_read); + +/** + * lock_may_write - checks that the region is free of locks + * @inode: the inode that is being written + * @start: the first byte to write + * @len: the number of bytes to write + * + * Emulates Windows locking requirements. Whole-file + * mandatory locks (share modes) can prohibit a write and + * byte-range POSIX locks can prohibit a write if they overlap. + * + * N.B. this function is only ever called + * from knfsd and ownership of locks is never checked. 
+ */ +int lock_may_write(struct inode *inode, loff_t start, unsigned long len) +{ + struct file_lock *fl; + int result = 1; + lock_flocks(); + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (IS_POSIX(fl)) { + if ((fl->fl_end < start) || (fl->fl_start > (start + len))) + continue; + } else if (IS_FLOCK(fl)) { + if (!(fl->fl_type & LOCK_MAND)) + continue; + if (fl->fl_type & LOCK_WRITE) + continue; + } else + continue; + result = 0; + break; + } + unlock_flocks(); + return result; +} + +EXPORT_SYMBOL(lock_may_write); + +static int __init filelock_init(void) +{ + filelock_cache = kmem_cache_create("file_lock_cache", + sizeof(struct file_lock), 0, SLAB_PANIC, + init_once); + return 0; +} + +core_initcall(filelock_init); diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig new file mode 100644 index 00000000..daf9a9b3 --- /dev/null +++ b/fs/logfs/Kconfig @@ -0,0 +1,17 @@ +config LOGFS + tristate "LogFS file system (EXPERIMENTAL)" + depends on (MTD || BLOCK) && EXPERIMENTAL + select ZLIB_INFLATE + select ZLIB_DEFLATE + select CRC32 + select BTREE + help + Flash filesystem aimed to scale efficiently to large devices. + In comparison to JFFS2 it offers significantly faster mount + times and potentially less RAM usage, although the latter has + not been measured yet. + + In its current state it is still very experimental and should + not be used for other than testing purposes. + + If unsure, say N. diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile new file mode 100644 index 00000000..48200277 --- /dev/null +++ b/fs/logfs/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LOGFS) += logfs.o + +logfs-y += compr.o +logfs-y += dir.o +logfs-y += file.o +logfs-y += gc.o +logfs-y += inode.o +logfs-y += journal.o +logfs-y += readwrite.o +logfs-y += segment.o +logfs-y += super.o +logfs-$(CONFIG_BLOCK) += dev_bdev.o +logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c new file mode 100644 index 00000000..961f02b8 --- /dev/null +++ b/fs/logfs/compr.c @@ -0,0 +1,95 @@ +/* + * fs/logfs/compr.c - compression routines + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +#define COMPR_LEVEL 3 + +static DEFINE_MUTEX(compr_mutex); +static struct z_stream_s stream; + +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_deflateInit(&stream, COMPR_LEVEL); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = 0; +error: + mutex_unlock(&compr_mutex); + return 
ret; +} + +int __init logfs_compr_init(void) +{ + size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + stream.workspace = vmalloc(size); + if (!stream.workspace) + return -ENOMEM; + return 0; +} + +void logfs_compr_exit(void) +{ + vfree(stream.workspace); +} diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c new file mode 100644 index 00000000..df0de27c --- /dev/null +++ b/fs/logfs/dev_bdev.c @@ -0,0 +1,342 @@ +/* + * fs/logfs/dev_bdev.c - Device access methods for block devices + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static void request_complete(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static int sync_request(struct page *page, struct block_device *bdev, int rw) +{ + struct bio bio; + struct bio_vec bio_vec; + struct completion complete; + + bio_init(&bio); + bio.bi_io_vec = &bio_vec; + bio_vec.bv_page = page; + bio_vec.bv_len = PAGE_SIZE; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = PAGE_SIZE; + bio.bi_bdev = bdev; + bio.bi_sector = page->index * (PAGE_SIZE >> 9); + init_completion(&complete); + bio.bi_private = &complete; + bio.bi_end_io = request_complete; + + submit_bio(rw, &bio); + wait_for_completion(&complete); + return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; +} + +static int bdev_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + struct block_device *bdev = logfs_super(sb)->s_bdev; + int err; + + err = sync_request(page, bdev, READ); + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static DECLARE_WAIT_QUEUE_HEAD(wq); + +static void writeseg_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + struct page *page; + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + do { + page = bvec->bv_page; + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + end_page_writeback(page); + page_cache_release(page); + } while (bvec >= bio->bi_io_vec); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct bio *bio; + struct page *page; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + if (max_pages > BIO_MAX_PAGES) + max_pages = BIO_MAX_PAGES; + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + 
i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + bio->bi_io_vec[i].bv_page = page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + + +static void erase_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct bio *bio; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + if (max_pages > BIO_MAX_PAGES) + max_pages = BIO_MAX_PAGES; + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + bio->bi_io_vec[i].bv_page = super->s_erase_page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len, + int ensure_write) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(to & (PAGE_SIZE - 1)); + BUG_ON(len & (PAGE_SIZE - 1)); + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + if (ensure_write) { + /* + * Object store doesn't care whether erases happen or not. + * But for the journal they are required. Otherwise a scan + * can find an old commit entry and assume it is the current + * one, travelling back in time. 
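+ *
+ * (Worked example, not in the original comment: if segment S held
+ * journal commit N and is reused without a physical erase, a crash
+ * after commit N+1 was written elsewhere could let the mount-time
+ * scan rediscover the stale commit N in S and replay old state.)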
+ */ + do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT); + } + + return 0; +} + +static void bdev_sync(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + wait_event(wq, atomic_read(&super->s_pending_writes) == 0); +} + +static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + + *ofs = 0; + return read_cache_page(mapping, 0, filler, sb); +} + +static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; + pgoff_t index = pos >> PAGE_SHIFT; + + *ofs = pos; + return read_cache_page(mapping, index, filler, sb); +} + +static int bdev_write_sb(struct super_block *sb, struct page *page) +{ + struct block_device *bdev = logfs_super(sb)->s_bdev; + + /* Nothing special to do for block devices. */ + return sync_request(page, bdev, WRITE); +} + +static void bdev_put_device(struct logfs_super *s) +{ + blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +static int bdev_can_write_buf(struct super_block *sb, u64 ofs) +{ + return 0; +} + +static const struct logfs_device_ops bd_devops = { + .find_first_sb = bdev_find_first_sb, + .find_last_sb = bdev_find_last_sb, + .write_sb = bdev_write_sb, + .readpage = bdev_readpage, + .writeseg = bdev_writeseg, + .erase = bdev_erase, + .can_write_buf = bdev_can_write_buf, + .sync = bdev_sync, + .put_device = bdev_put_device, +}; + +int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, + const char *devname) +{ + struct block_device *bdev; + + bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { + int mtdnr = MINOR(bdev->bd_dev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + return logfs_get_sb_mtd(p, mtdnr); + } + + p->s_bdev = bdev; + p->s_mtd = NULL; + p->s_devops = &bd_devops; + return 0; +} diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c new file mode 100644 index 00000000..339e17e9 --- /dev/null +++ b/fs/logfs/dev_mtd.c @@ -0,0 +1,278 @@ +/* + * fs/logfs/dev_mtd.c - Device access methods for MTD + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + size_t retlen; + int ret; + + ret = mtd->read(mtd, ofs, len, &retlen, buf); + BUG_ON(ret == -EINVAL); + if (ret) + return ret; + + /* Not sure if we should loop instead. 
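+ * For now a short read is simply reported back to the caller as -EIO
+ * (see the check below) rather than retried.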
*/ + if (retlen != len) + return -EIO; + + return 0; +} + +static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct logfs_super *super = logfs_super(sb); + struct mtd_info *mtd = super->s_mtd; + size_t retlen; + loff_t page_start, page_end; + int ret; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); + BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); + BUG_ON(len > PAGE_CACHE_SIZE); + page_start = ofs & PAGE_CACHE_MASK; + page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + ret = mtd->write(mtd, ofs, len, &retlen, buf); + if (ret || (retlen != len)) + return -EIO; + + return 0; +} + +/* + * For as long as I can remember (since about 2001) mtd->erase has been an + * asynchronous interface lacking the first driver to actually use the + * asynchronous properties. So just to prevent the first implementor of such + * a thing from breaking logfs in 2350, we do the usual pointless dance to + * declare a completion variable and wait for completion before returning + * from mtd_erase(). What an exercise in futility! + */ +static void logfs_erase_callback(struct erase_info *ei) +{ + complete((struct completion *)ei->priv); +} + +static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = ofs >> PAGE_SHIFT; + + for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { + page = find_get_page(mapping, index); + if (!page) + continue; + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + return 0; +} + +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + struct erase_info ei; + DECLARE_COMPLETION_ONSTACK(complete); + int ret; + + BUG_ON(len % mtd->erasesize); + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + memset(&ei, 0, sizeof(ei)); + ei.mtd = mtd; + ei.addr = ofs; + ei.len = len; + ei.callback = logfs_erase_callback; + ei.priv = (long)&complete; + ret = mtd->erase(mtd, &ei); + if (ret) + return -EIO; + + wait_for_completion(&complete); + if (ei.state != MTD_ERASE_DONE) + return -EIO; + return mtd_erase_mapping(sb, ofs, len); +} + +static void mtd_sync(struct super_block *sb) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + + if (mtd->sync) + mtd->sync(mtd); +} + +static int mtd_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + int err; + + err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + if (err == -EUCLEAN || err == -EBADMSG) { + /* -EBADMSG happens regularly on power failures */ + err = 0; + /* FIXME: force GC this segment */ + } + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = 0; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> 
PAGE_SHIFT, filler, sb); +} + +static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = mtd->size - mtd->erasesize; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } + *ofs = *ofs + mtd->erasesize - 0x1000; + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + int i, err; + + for (i = 0; i < nr_pages; i++) { + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + + err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + unlock_page(page); + page_cache_release(page); + if (err) + return err; + } + return 0; +} + +static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return; + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + +static void mtd_put_device(struct logfs_super *s) +{ + put_mtd_device(s->s_mtd); +} + +static int mtd_can_write_buf(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + void *buf; + int err; + + buf = kmalloc(super->s_writesize, GFP_KERNEL); + if (!buf) + return -ENOMEM; + err = mtd_read(sb, ofs, super->s_writesize, buf); + if (err) + goto out; + if (memchr_inv(buf, 0xff, super->s_writesize)) + err = -EIO; + kfree(buf); +out: + return err; +} + +static const struct logfs_device_ops mtd_devops = { + .find_first_sb = mtd_find_first_sb, + .find_last_sb = mtd_find_last_sb, + .readpage = mtd_readpage, + .writeseg = mtd_writeseg, + .erase = mtd_erase, + .can_write_buf = mtd_can_write_buf, + .sync = mtd_sync, + .put_device = mtd_put_device, +}; + +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) +{ + struct mtd_info *mtd = get_mtd_device(NULL, mtdnr); + if (IS_ERR(mtd)) + return PTR_ERR(mtd); + + s->s_bdev = NULL; + s->s_mtd = mtd; + s->s_devops = &mtd_devops; + return 0; +} diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c new file mode 100644 index 00000000..1afae26c --- /dev/null +++ b/fs/logfs/dir.c @@ -0,0 +1,825 @@ +/* + * fs/logfs/dir.c - directory-related code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include + +/* + * Atomic dir operations + * + * Directory operations are by default not atomic. Dentries and Inodes are + * created/removed/altered in separate operations. Therefore we need to do + * a small amount of journaling. + * + * Create, link, mkdir, mknod and symlink all share the same function to do + * the work: __logfs_create. This function works in two atomic steps: + * 1. allocate inode (remember in journal) + * 2. 
allocate dentry (clear journal) + * + * As we can only get interrupted between the two, when the inode we just + * created is simply stored in the anchor. On next mount, if we were + * interrupted, we delete the inode. From a users point of view the + * operation never happened. + * + * Unlink and rmdir also share the same function: unlink. Again, this + * function works in two atomic steps + * 1. remove dentry (remember inode in journal) + * 2. unlink inode (clear journal) + * + * And again, on the next mount, if we were interrupted, we delete the inode. + * From a users point of view the operation succeeded. + * + * Rename is the real pain to deal with, harder than all the other methods + * combined. Depending on the circumstances we can run into three cases. + * A "target rename" where the target dentry already existed, a "local + * rename" where both parent directories are identical or a "cross-directory + * rename" in the remaining case. + * + * Local rename is atomic, as the old dentry is simply rewritten with a new + * name. + * + * Cross-directory rename works in two steps, similar to __logfs_create and + * logfs_unlink: + * 1. Write new dentry (remember old dentry in journal) + * 2. Remove old dentry (clear journal) + * + * Here we remember a dentry instead of an inode. On next mount, if we were + * interrupted, we delete the dentry. From a users point of view, the + * operation succeeded. + * + * Target rename works in three atomic steps: + * 1. Attach old inode to new dentry (remember old dentry and new inode) + * 2. Remove old dentry (still remember the new inode) + * 3. Remove victim inode + * + * Here we remember both an inode an a dentry. If we get interrupted + * between steps 1 and 2, we delete both the dentry and the inode. If + * we get interrupted between steps 2 and 3, we delete just the inode. + * In either case, the remaining objects are deleted on next mount. From + * a users point of view, the operation succeeded. + */ + +static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, + loff_t pos) +{ + return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); +} + +static int write_inode(struct inode *inode) +{ + return __logfs_write_inode(inode, WF_LOCK); +} + +static s64 dir_seek_data(struct inode *inode, s64 pos) +{ + s64 new_pos = logfs_seek_data(inode, pos); + + return max(pos, new_pos - 1); +} + +static int beyond_eof(struct inode *inode, loff_t bix) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + return pos >= i_size_read(inode); +} + +/* + * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, + * so short names (len <= 9) don't even occupy the complete 32bit name + * space. A prime >256 ensures short names quickly spread the 32bit + * name space. Add about 26 for the estimated amount of information + * of each character and pick a prime nearby, preferably a bit-sparse + * one. + */ +static u32 hash_32(const char *s, int len, u32 seed) +{ + u32 hash = seed; + int i; + + for (i = 0; i < len; i++) + hash = hash * 293 + s[i]; + return hash; +} + +/* + * We have to satisfy several conflicting requirements here. Small + * directories should stay fairly compact and not require too many + * indirect blocks. The number of possible locations for a given hash + * should be small to make lookup() fast. And we should try hard not + * to overflow the 32bit name space or nfs and 32bit host systems will + * be unhappy. + * + * So we use the following scheme. First we reduce the hash to 0..15 + * and try a direct block. 
If that is occupied we reduce the hash to + * 16..255 and try an indirect block. Same for 2x and 3x indirect + * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff, + * but use buckets containing eight entries instead of a single one. + * + * Using 16 entries should allow for a reasonable amount of hash + * collisions, so the 32bit name space can be packed fairly tight + * before overflowing. Oh and currently we don't overflow but return + * and error. + * + * How likely are collisions? Doing the appropriate math is beyond me + * and the Bronstein textbook. But running a test program to brute + * force collisions for a couple of days showed that on average the + * first collision occurs after 598M entries, with 290M being the + * smallest result. Obviously 21 entries could already cause a + * collision if all entries are carefully chosen. + */ +static pgoff_t hash_index(u32 hash, int round) +{ + u32 i0_blocks = I0_BLOCKS; + u32 i1_blocks = I1_BLOCKS; + u32 i2_blocks = I2_BLOCKS; + u32 i3_blocks = I3_BLOCKS; + + switch (round) { + case 0: + return hash % i0_blocks; + case 1: + return i0_blocks + hash % (i1_blocks - i0_blocks); + case 2: + return i1_blocks + hash % (i2_blocks - i1_blocks); + case 3: + return i2_blocks + hash % (i3_blocks - i2_blocks); + case 4 ... 19: + return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16)) + + round - 4; + } + BUG(); +} + +static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) +{ + struct qstr *name = &dentry->d_name; + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(name->name, name->len, 0); + pgoff_t index; + int round; + + if (name->len > LOGFS_MAX_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (beyond_eof(dir, index)) + return NULL; + if (!logfs_exist_block(dir, index)) + continue; + page = read_cache_page(dir->i_mapping, index, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return page; + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + if (name->len != be16_to_cpu(dd->namelen) || + memcmp(name->name, dd->name, name->len)) { + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + continue; + } + + kunmap_atomic(dd, KM_USER0); + return page; + } + return NULL; +} + +static int logfs_remove_inode(struct inode *inode) +{ + int ret; + + inode->i_nlink--; + ret = write_inode(inode); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + if (logfs_inode(inode)->li_block) + logfs_inode(inode)->li_block->ta = NULL; + kfree(ta); +} + +static int logfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct inode *inode = dentry->d_inode; + struct logfs_transaction *ta; + struct page *page; + pgoff_t index; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = UNLINK_1; + ta->ino = inode->i_ino; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + page = logfs_get_dd_page(dir, dentry); + if (!page) { + kfree(ta); + return -ENOENT; + } + if (IS_ERR(page)) { + kfree(ta); + return PTR_ERR(page); + } + index = page->index; + page_cache_release(page); + + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(dir, ta); + + ret = logfs_delete(dir, index, NULL); + if (!ret) + ret = write_inode(dir); + + if (ret) { + abort_transaction(dir, ta); + printk(KERN_ERR"LOGFS: unable to delete 
inode\n"); + goto out; + } + + ta->state = UNLINK_2; + logfs_add_transaction(inode, ta); + ret = logfs_remove_inode(inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static inline int logfs_empty_dir(struct inode *dir) +{ + u64 data; + + data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; + return data >= i_size_read(dir); +} + +static int logfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (!logfs_empty_dir(inode)) + return -ENOTEMPTY; + + return logfs_unlink(dir, dentry); +} + +/* FIXME: readdir currently has it's own dir_walk code. I don't see a good + * way to combine the two copies */ +#define IMPLICIT_NODES 2 +static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *dir = file->f_dentry->d_inode; + loff_t pos = file->f_pos - IMPLICIT_NODES; + struct page *page; + struct logfs_disk_dentry *dd; + int full; + + BUG_ON(pos < 0); + for (;; pos++) { + if (beyond_eof(dir, pos)) + break; + if (!logfs_exist_block(dir, pos)) { + /* deleted dentry */ + pos = dir_seek_data(dir, pos); + continue; + } + page = read_cache_page(dir->i_mapping, pos, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + dd = kmap(page); + BUG_ON(dd->namelen == 0); + + full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), + pos, be64_to_cpu(dd->ino), dd->type); + kunmap(page); + page_cache_release(page); + if (full) + break; + } + + file->f_pos = pos + IMPLICIT_NODES; + return 0; +} + +static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + ino_t pino = parent_ino(file->f_dentry); + int err; + + if (file->f_pos < 0) + return -EINVAL; + + if (file->f_pos == 0) { + if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + if (file->f_pos == 1) { + if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + + err = __logfs_readdir(file, buf, filldir); + return err; +} + +static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) +{ + dd->namelen = cpu_to_be16(name->len); + memcpy(dd->name, name->name, name->len); +} + +static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct page *page; + struct logfs_disk_dentry *dd; + pgoff_t index; + u64 ino = 0; + struct inode *inode; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return ERR_CAST(page); + if (!page) { + d_add(dentry, NULL); + return NULL; + } + index = page->index; + dd = kmap_atomic(page, KM_USER0); + ino = be64_to_cpu(dd->ino); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + + inode = logfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n", + ino, dir->i_ino, index); + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static void grow_dir(struct inode *dir, loff_t index) +{ + index = (index + 1) << dir->i_sb->s_blocksize_bits; + if (i_size_read(dir) < index) + i_size_write(dir, index); +} + +static int logfs_write_dir(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + pgoff_t index; + int round, err; + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (logfs_exist_block(dir, index)) + continue; + page = 
find_or_create_page(dir->i_mapping, index, GFP_KERNEL); + if (!page) + return -ENOMEM; + + dd = kmap_atomic(page, KM_USER0); + memset(dd, 0, sizeof(*dd)); + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + logfs_set_name(dd, &dentry->d_name); + kunmap_atomic(dd, KM_USER0); + + err = logfs_write_buf(dir, page, WF_LOCK); + unlock_page(page); + page_cache_release(page); + if (!err) + grow_dir(dir, index); + return err; + } + /* FIXME: Is there a better return value? In most cases neither + * the filesystem nor the directory are full. But we have had + * too many collisions for this particular hash and no fallback. + */ + return -ENOSPC; +} + +static int __logfs_create(struct inode *dir, struct dentry *dentry, + struct inode *inode, const char *dest, long destlen) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct logfs_inode *li = logfs_inode(inode); + struct logfs_transaction *ta; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) { + inode->i_nlink--; + iput(inode); + return -ENOMEM; + } + + ta->state = CREATE_1; + ta->ino = inode->i_ino; + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(inode, ta); + + if (dest) { + /* symlink */ + ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); + if (!ret) + ret = write_inode(inode); + } else { + /* creat/mkdir/mknod */ + ret = write_inode(inode); + } + if (ret) { + abort_transaction(inode, ta); + li->li_flags |= LOGFS_IF_STILLBORN; + /* FIXME: truncate symlink */ + inode->i_nlink--; + iput(inode); + goto out; + } + + ta->state = CREATE_2; + logfs_add_transaction(dir, ta); + ret = logfs_write_dir(dir, dentry, inode); + /* sync directory */ + if (!ret) + ret = write_inode(dir); + + if (ret) { + logfs_del_transaction(dir, ta); + ta->state = CREATE_2; + logfs_add_transaction(inode, ta); + logfs_remove_inode(inode); + iput(inode); + goto out; + } + d_instantiate(dentry, inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + + /* + * FIXME: why do we have to fill in S_IFDIR, while the mode is + * correct for mknod, creat, etc.? Smells like the vfs *should* + * do it for us but for some reason fails to do so. 
+ */ + inode = logfs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + + if (dentry->d_name.len > LOGFS_MAX_NAMELEN) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, mode, rdev); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + struct inode *inode; + size_t destlen = strlen(target) + 1; + + if (destlen > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, S_IFLNK | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, target, destlen); +} + +static int logfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= LOGFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ihold(inode); + inode->i_nlink++; + mark_inode_dirty_sync(inode); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_get_dd(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, loff_t *pos) +{ + struct page *page; + void *map; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return PTR_ERR(page); + *pos = page->index; + map = kmap_atomic(page, KM_USER0); + memcpy(dd, map, sizeof(*dd)); + kunmap_atomic(map, KM_USER0); + page_cache_release(page); + return 0; +} + +static int logfs_delete_dd(struct inode *dir, loff_t pos) +{ + /* + * Getting called with pos somewhere beyond eof is either a goofup + * within this file or means someone maliciously edited the + * (crc-protected) journal. + */ + BUG_ON(beyond_eof(dir, pos)); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); + return logfs_delete(dir, pos, NULL); +} + +/* + * Cross-directory rename, target does not exist. Just a little nasty. + * Create a new dentry in the target dir, then remove the old dentry, + * all the while taking care to remember our operation in the journal. + */ +static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CROSS_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + + /* 2. 
write target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + if (!err) + err = write_inode(new_dir); + + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = CROSS_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, struct inode *inode) +{ + loff_t pos; + int err; + + err = logfs_get_dd(dir, dentry, dd, &pos); + if (err) + return err; + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + + err = write_dir(dir, dd, pos); + if (err) + return err; + log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, + dd->name, be64_to_cpu(dd->ino)); + return write_inode(dir); +} + +/* Target dentry exists - the worst case. We need to attach the source + * inode to the target dentry, then remove the orphaned target inode and + * source dentry. + */ +static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + int isdir = S_ISDIR(old_inode->i_mode); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); + if (isdir) { + if (!logfs_empty_dir(new_inode)) + return -ENOTEMPTY; + } + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = TARGET_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + ta->ino = new_inode->i_ino; + + /* 2. attach source inode to target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + super->s_victim_ino = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = TARGET_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); + + /* 4. remove target inode */ + ta->state = TARGET_RENAME_3; + logfs_add_transaction(new_inode, ta); + err = logfs_remove_inode(new_inode); + +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (new_dentry->d_inode) + return logfs_rename_target(old_dir, old_dentry, + new_dir, new_dentry); + return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); +} + +/* No locking done here, as this is called before .get_sb() returns. 
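+ *
+ * (Summary derived from the code below, not in the original comment:
+ * a non-zero s_victim_ino names an inode left over from an interrupted
+ * create/unlink/rename and is deleted here; non-zero s_rename_dir and
+ * s_rename_pos name a stale dentry from an interrupted rename, which is
+ * deleted as well. The journal fields are cleared once cleanup succeeds.)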
*/ +int logfs_replay_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + u64 ino, pos; + int err; + + if (super->s_victim_ino) { + /* delete victim inode */ + ino = super->s_victim_ino; + printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + LOGFS_BUG_ON(i_size_read(inode) > 0, sb); + super->s_victim_ino = 0; + err = logfs_remove_inode(inode); + iput(inode); + if (err) { + super->s_victim_ino = ino; + goto fail; + } + } + if (super->s_rename_dir) { + /* delete old dd from rename */ + ino = super->s_rename_dir; + pos = super->s_rename_pos; + printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", + ino, pos); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + super->s_rename_dir = 0; + super->s_rename_pos = 0; + err = logfs_delete_dd(inode, pos); + iput(inode); + if (err) { + super->s_rename_dir = ino; + super->s_rename_pos = pos; + goto fail; + } + } + return 0; +fail: + LOGFS_BUG(sb); + return -EIO; +} + +const struct inode_operations logfs_symlink_iops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, +}; + +const struct inode_operations logfs_dir_iops = { + .create = logfs_create, + .link = logfs_link, + .lookup = logfs_lookup, + .mkdir = logfs_mkdir, + .mknod = logfs_mknod, + .rename = logfs_rename, + .rmdir = logfs_rmdir, + .symlink = logfs_symlink, + .unlink = logfs_unlink, +}; +const struct file_operations logfs_dir_fops = { + .fsync = logfs_fsync, + .unlocked_ioctl = logfs_ioctl, + .readdir = logfs_readdir, + .read = generic_read_dir, + .llseek = default_llseek, +}; diff --git a/fs/logfs/file.c b/fs/logfs/file.c new file mode 100644 index 00000000..c2ad7028 --- /dev/null +++ b/fs/logfs/file.c @@ -0,0 +1,275 @@ +/* + * fs/logfs/file.c - prepare_write, commit_write and friends + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +static int logfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + return 0; + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + len; + + /* Reading beyond i_size is simple: memset to zero */ + zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + return 0; + } + return logfs_readpage_nolock(page); +} + +static int logfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + struct inode *inode = mapping->host; + pgoff_t index = page->index; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + int ret = 0; + + BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize); + BUG_ON(page->index > I3_BLOCKS); + + if (copied < len) { + /* + * Short write of a non-initialized paged. Just tell userspace + * to retry the entire page. + */ + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + if (copied == 0) + goto out; /* FIXME: do we need to update inode? 
*/ + + if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) { + i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end); + mark_inode_dirty_sync(inode); + } + + SetPageUptodate(page); + if (!PageDirty(page)) { + if (!get_page_reserve(inode, page)) + __set_page_dirty_nobuffers(page); + else + ret = logfs_write_buf(inode, page, WF_LOCK); + } +out: + unlock_page(page); + page_cache_release(page); + return ret ? ret : copied; +} + +int logfs_readpage(struct file *file, struct page *page) +{ + int ret; + + ret = logfs_readpage_nolock(page); + unlock_page(page); + return ret; +} + +/* Clear the page's dirty flag in the radix tree. */ +/* TODO: mucking with PageWriteback is silly. Add a generic function to clear + * the dirty bit from the radix tree for filesystems that don't have to wait + * for page writeback to finish (i.e. any compressing filesystem). + */ +static void clear_radix_tree_dirty(struct page *page) +{ + BUG_ON(PagePrivate(page) || page->private); + set_page_writeback(page); + end_page_writeback(page); +} + +static int __logfs_writepage(struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = logfs_write_buf(inode, page, WF_LOCK); + if (err) + set_page_dirty(page); + else + clear_radix_tree_dirty(page); + unlock_page(page); + return err; +} + +static int logfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + u64 bix; + level_t level; + + log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index, + page); + + logfs_unpack_index(page->index, &bix, &level); + + /* Indirect blocks are never truncated */ + if (level != 0) + return __logfs_writepage(page); + + /* + * TODO: everything below is a near-verbatim copy of nobh_writepage(). + * The relevant bits should be factored out after logfs is merged. + */ + + /* Is the page fully inside i_size? */ + if (bix < end_index) + return __logfs_writepage(page); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (bix > end_index || offset == 0) { + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invokation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
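+ *
+ * (Worked example, assuming PAGE_CACHE_SIZE == 4096: for i_size == 10000,
+ * end_index == 2 and offset == 1808, so the page at index 2 straddles
+ * i_size and bytes 1808..4095 are zeroed below before it is written out.)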
+ */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __logfs_writepage(page); +} + +static void logfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct logfs_block *block = logfs_block(page); + + if (block->reserved_bytes) { + struct super_block *sb = page->mapping->host->i_sb; + struct logfs_super *super = logfs_super(sb); + + super->s_dirty_pages -= block->reserved_bytes; + block->ops->free_block(sb, block); + BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR)); + } else + move_page_to_btree(page); + BUG_ON(PagePrivate(page) || page->private); +} + +static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) +{ + return 0; /* None of these are easy to release */ +} + + +long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct logfs_inode *li = logfs_inode(inode); + unsigned int oldflags, flags; + int err; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = li->li_flags & LOGFS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); + case FS_IOC_SETFLAGS: + if (IS_RDONLY(inode)) + return -EROFS; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = get_user(flags, (int __user *)arg); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + oldflags = li->li_flags; + flags &= LOGFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; + li->li_flags = flags; + mutex_unlock(&inode->i_mutex); + + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + return 0; + + default: + return -ENOTTY; + } +} + +int logfs_fsync(struct file *file, int datasync) +{ + struct super_block *sb = file->f_mapping->host->i_sb; + + logfs_write_anchor(sb); + return 0; +} + +static int logfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + err = logfs_truncate(inode, attr->ia_size); + if (err) + return err; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations logfs_reg_iops = { + .setattr = logfs_setattr, +}; + +const struct file_operations logfs_reg_fops = { + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .fsync = logfs_fsync, + .unlocked_ioctl = logfs_ioctl, + .llseek = generic_file_llseek, + .mmap = generic_file_readonly_mmap, + .open = generic_file_open, + .read = do_sync_read, + .write = do_sync_write, +}; + +const struct address_space_operations logfs_reg_aops = { + .invalidatepage = logfs_invalidatepage, + .readpage = logfs_readpage, + .releasepage = logfs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = logfs_writepage, + .writepages = generic_writepages, + .write_begin = logfs_write_begin, + .write_end = logfs_write_end, +}; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c new file mode 100644 index 00000000..caa44192 --- /dev/null +++ b/fs/logfs/gc.c @@ -0,0 +1,732 @@ +/* + * fs/logfs/gc.c - garbage collection code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +/* + * Wear leveling needs to kick in when the difference between low erase + * counts and high erase counts gets too big. A good value for "too big" + * may be somewhat below 10% of maximum erase count for the device. 
+ * Why not 397, to pick a nice round number with no specific meaning? :) + * + * WL_RATELIMIT is the minimum time between two wear level events. A huge + * number of segments may fulfil the requirements for wear leveling at the + * same time. If that happens we don't want to cause a latency from hell, + * but just gently pick one segment every so often and minimize overhead. + */ +#define WL_DELTA 397 +#define WL_RATELIMIT 100 +#define MAX_OBJ_ALIASES 2600 +#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ +#define LIST_SIZE 64 /* base size of candidate lists */ +#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ +#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ + +static int no_free_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + return super->s_free_list.count; +} + +/* journal has distance -1, top-most ifile layer distance 0 */ +static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) +{ + struct logfs_super *super = logfs_super(sb); + u8 gc_level = (__force u8)__gc_level; + + switch (gc_level) { + case 0: /* fall through */ + case 1: /* fall through */ + case 2: /* fall through */ + case 3: + /* file data or indirect blocks */ + return super->s_ifile_levels + super->s_iblock_levels - gc_level; + case 6: /* fall through */ + case 7: /* fall through */ + case 8: /* fall through */ + case 9: + /* inode file data or indirect blocks */ + return super->s_ifile_levels - (gc_level - 6); + default: + printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", + gc_level); + WARN_ON(1); + return super->s_ifile_levels + super->s_iblock_levels; + } +} + +static int segment_is_reserved(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area; + void *reserved; + int i; + + /* Some segments are reserved. Just pretend they were all valid */ + reserved = btree_lookup32(&super->s_reserved_segments, segno); + if (reserved) + return 1; + + /* Currently open segments */ + for_each_area(i) { + area = super->s_area[i]; + if (area->a_is_open && area->a_segno == segno) + return 1; + } + + return 0; +} + +static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + BUG(); +} + +/* + * Returns the bytes consumed by valid objects in this segment. Object headers + * are counted, the segment header is not. 
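+ *
+ * (Note derived from the code below: *ec and *gc_level are filled in from
+ * the segment entry as well, and the special value RESERVED is returned
+ * for bad or reserved segments.)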
+ */ +static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, + gc_level_t *gc_level) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(sb, segno, &se); + if (se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)) + return RESERVED; + + ec_level = be32_to_cpu(se.ec_level); + *ec = ec_level >> 4; + *gc_level = GC_LEVEL(ec_level & 0xf); + return be32_to_cpu(se.valid); +} + +static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, + u64 bix, gc_level_t gc_level) +{ + struct inode *inode; + int err, cookie; + + inode = logfs_safe_iget(sb, ino, &cookie); + err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); + BUG_ON(err); + logfs_safe_iput(inode, cookie); +} + +static u32 logfs_gc_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header sh; + struct logfs_object_header oh; + u64 ofs, ino, bix; + u32 seg_ofs, logical_segno, cleaned = 0; + int err, len, valid; + gc_level_t gc_level; + + LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); + + btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + BUG_ON(err); + gc_level = GC_LEVEL(sh.level); + logical_segno = be32_to_cpu(sh.segno); + if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = -1; + goto out; + } + + for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; + seg_ofs + sizeof(oh) < super->s_segsize; ) { + ofs = dev_ofs(sb, logical_segno, seg_ofs); + err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), + &oh); + BUG_ON(err); + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = super->s_segsize - 1; + goto out; + } + + ino = be64_to_cpu(oh.ino); + bix = be64_to_cpu(oh.bix); + len = sizeof(oh) + be16_to_cpu(oh.len); + valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); + if (valid == 1) { + logfs_cleanse_block(sb, ofs, ino, bix, gc_level); + cleaned += len; + } else if (valid == 2) { + /* Will be invalid upon journal commit */ + cleaned += len; + } + seg_ofs += len; + } +out: + btree_remove32(&super->s_reserved_segments, segno); + return cleaned; +} + +static struct gc_candidate *add_list(struct gc_candidate *cand, + struct candidate_list *list) +{ + struct rb_node **p = &list->rb_tree.rb_node; + struct rb_node *parent = NULL; + struct gc_candidate *cur; + int comp; + + cand->list = list; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct gc_candidate, rb_node); + + if (list->sort_by_ec) + comp = cand->erase_count < cur->erase_count; + else + comp = cand->valid < cur->valid; + + if (comp) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&cand->rb_node, parent, p); + rb_insert_color(&cand->rb_node, &list->rb_tree); + + if (list->count <= list->maxcount) { + list->count++; + return NULL; + } + cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); + rb_erase(&cand->rb_node, &list->rb_tree); + cand->list = NULL; + return cand; +} + +static void remove_from_list(struct gc_candidate *cand) +{ + struct candidate_list *list = cand->list; + + rb_erase(&cand->rb_node, &list->rb_tree); + list->count--; +} + +static void free_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + + btree_remove32(&super->s_cand_tree, cand->segno); + kfree(cand); +} + +u32 
get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) +{ + struct gc_candidate *cand; + u32 segno; + + BUG_ON(list->count == 0); + + cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); + remove_from_list(cand); + segno = cand->segno; + if (ec) + *ec = cand->erase_count; + free_candidate(sb, cand); + return segno; +} + +/* + * We have several lists to manage segments with. The reserve_list is used to + * deal with bad blocks. We try to keep the best (lowest ec) segments on this + * list. + * The free_list contains free segments for normal usage. It usually gets the + * second pick after the reserve_list. But when the free_list is running short + * it is more important to keep the free_list full than to keep a reserve. + * + * Segments that are not free are put onto a per-level low_list. If we have + * to run garbage collection, we pick a candidate from there. All segments on + * those lists should have at least some free space so GC will make progress. + * + * And last we have the ec_list, which is used to pick segments for wear + * leveling. + * + * If all appropriate lists are full, we simply free the candidate and forget + * about that segment for a while. We have better candidates for each purpose. + */ +static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; + + if (cand->valid == 0) { + /* 100% free segments */ + log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_reserve_list); + if (cand) { + log_gc_noisy("add free segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_free_list); + } + } else { + /* good candidates for Garbage Collection */ + if (cand->valid < full) + cand = add_list(cand, &super->s_low_list[cand->dist]); + /* good candidates for wear leveling, + * segments that were recently written get ignored */ + if (cand) + cand = add_list(cand, &super->s_ec_list); + } + if (cand) + free_candidate(sb, cand); +} + +static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, + u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = kmalloc(sizeof(*cand), GFP_NOFS); + if (!cand) + return -ENOMEM; + + cand->segno = segno; + cand->valid = valid; + cand->erase_count = ec; + cand->dist = dist; + + btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); + __add_candidate(sb, cand); + return 0; +} + +static void remove_segment_from_lists(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = btree_lookup32(&super->s_cand_tree, segno); + if (cand) { + remove_from_list(cand); + free_candidate(sb, cand); + } +} + +static void scan_segment(struct super_block *sb, u32 segno) +{ + u32 valid, ec = 0; + gc_level_t gc_level = 0; + u8 dist; + + if (segment_is_reserved(sb, segno)) + return; + + remove_segment_from_lists(sb, segno); + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + if (valid == RESERVED) + return; + + dist = root_distance(sb, gc_level); + add_candidate(sb, segno, valid, ec, dist); +} + +static struct gc_candidate *first_in_list(struct candidate_list *list) +{ + if (list->count == 0) + return NULL; + return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); +} + +/* + * Find the 
best segment for garbage collection. Main criterion is + * the segment requiring the least effort to clean. Secondary + * criterion is to GC on the lowest level available. + * + * So we search the least effort segment on the lowest level first, + * then move up and pick another segment iff is requires significantly + * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. + */ +static struct gc_candidate *get_candidate(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i, max_dist; + struct gc_candidate *cand = NULL, *this; + + max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); + + for (i = max_dist; i >= 0; i--) { + this = first_in_list(&super->s_low_list[i]); + if (!this) + continue; + if (!cand) + cand = this; + if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) + cand = this; + } + return cand; +} + +static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + gc_level_t gc_level; + u32 cleaned, valid, segno, ec; + u8 dist; + + if (!cand) { + log_gc("GC attempted, but no candidate found\n"); + return 0; + } + + segno = cand->segno; + dist = cand->dist; + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + free_candidate(sb, cand); + log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", + segno, (u64)segno << super->s_segshift, + dist, no_free_segments(sb), valid, + super->s_free_bytes); + cleaned = logfs_gc_segment(sb, segno); + log_gc("GC segment #%02x complete - now %x valid\n", segno, + valid - cleaned); + BUG_ON(cleaned != valid); + return 1; +} + +static int logfs_gc_once(struct super_block *sb) +{ + struct gc_candidate *cand; + + cand = get_candidate(sb); + if (cand) + remove_from_list(cand); + return __logfs_gc_once(sb, cand); +} + +/* returns 1 if a wrap occurs, 0 otherwise */ +static int logfs_scan_some(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno; + int i, ret = 0; + + segno = super->s_sweeper; + for (i = SCAN_RATIO; i > 0; i--) { + segno++; + if (segno >= super->s_no_segs) { + segno = 0; + ret = 1; + /* Break out of the loop. We want to read a single + * block from the segment size on next invocation if + * SCAN_RATIO is set to match block size + */ + break; + } + + scan_segment(sb, segno); + } + super->s_sweeper = segno; + return ret; +} + +/* + * In principle, this function should loop forever, looking for GC candidates + * and moving data. LogFS is designed in such a way that this loop is + * guaranteed to terminate. + * + * Limiting the loop to some iterations serves purely to catch cases when + * these guarantees have failed. An actual endless loop is an obvious bug + * and should be reported as such. + */ +static void __logfs_gc_pass(struct super_block *sb, int target) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int round, progress, last_progress = 0; + + /* + * Doing too many changes to the segfile at once would result + * in a large number of aliases. Write the journal before + * things get out of hand. 
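+ * Concretely: with MAX_OBJ_ALIASES at 2600, the check below forces a
+ * journal write once 2600 segments contain not-yet-committed shadowed
+ * blocks.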
+ */ + if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES) + logfs_write_anchor(sb); + + if (no_free_segments(sb) >= target && + super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + + log_gc("__logfs_gc_pass(%x)\n", target); + for (round = 0; round < SCAN_ROUNDS; ) { + if (no_free_segments(sb) >= target) + goto write_alias; + + /* Sync in-memory state with on-medium state in case they + * diverged */ + logfs_write_anchor(sb); + round += logfs_scan_some(sb); + if (no_free_segments(sb) >= target) + goto write_alias; + progress = logfs_gc_once(sb); + if (progress) + last_progress = round; + else if (round - last_progress > 2) + break; + continue; + + /* + * The goto logic is nasty, I just don't know a better way to + * code it. GC is supposed to ensure two things: + * 1. Enough free segments are available. + * 2. The number of aliases is bounded. + * When 1. is achieved, we take a look at 2. and write back + * some alias-containing blocks, if necessary. However, after + * each such write we need to go back to 1., as writes can + * consume free segments. + */ +write_alias: + if (super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + if (list_empty(&super->s_object_alias)) { + /* All aliases are still in btree */ + return; + } + log_gc("Write back one alias\n"); + block = list_entry(super->s_object_alias.next, + struct logfs_block, alias_list); + block->ops->write_block(block); + /* + * To round off the nasty goto logic, we reset round here. It + * is a safety-net for GC not making any progress and limited + * to something reasonably small. If incremented it for every + * single alias, the loop could terminate rather quickly. + */ + round = 0; + } + LOGFS_BUG(sb); +} + +static int wl_ratelimit(struct super_block *sb, u64 *next_event) +{ + struct logfs_super *super = logfs_super(sb); + + if (*next_event < super->s_gec) { + *next_event = super->s_gec + WL_RATELIMIT; + return 0; + } + return 1; +} + +static void logfs_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *wl_cand, *free_cand; + + if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) + return; + + wl_cand = first_in_list(&super->s_ec_list); + if (!wl_cand) + return; + free_cand = first_in_list(&super->s_free_list); + if (!free_cand) + return; + + if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { + remove_from_list(wl_cand); + __logfs_gc_once(sb, wl_cand); + } +} + +/* + * The journal needs wear leveling as well. But moving the journal is an + * expensive operation so we try to avoid it as much as possible. And if we + * have to do it, we move the whole journal, not individual segments. + * + * Ratelimiting is not strictly necessary here, it mainly serves to avoid the + * calculations. First we check whether moving the journal would be a + * significant improvement. That means that a) the current journal segments + * have more wear than the future journal segments and b) the current journal + * segments have more wear than normal ostore segments. + * Rationale for b) is that we don't have to move the journal if it is aging + * less than the ostore, even if the reserve segments age even less (they are + * excluded from wear leveling, after all). + * Next we check that the superblocks have less wear than the journal. Since + * moving the journal requires writing the superblocks, we have to protect the + * superblocks even more than the journal. + * + * Also we double the acceptable wear difference, compared to ostore wear + * leveling. 
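+ * (With WL_DELTA at 397 that means the journal is only relocated once
+ * min_journal_ec exceeds max_reserve_ec by more than 2 * 397 = 794.)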
Journal data is read and rewritten rapidly, comparatively. So + * soft errors have much less time to accumulate and we allow the journal to + * be a bit worse than the ostore. + */ +static void logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + u32 min_journal_ec = -1, max_reserve_ec = 0; + int i; + + if (wl_ratelimit(sb, &super->s_wl_gec_journal)) + return; + + if (super->s_reserve_list.count < super->s_no_journal_segs) { + /* Reserve is not full enough to move complete journal */ + return; + } + + journal_for_each(i) + if (super->s_journal_seg[i]) + min_journal_ec = min(min_journal_ec, + super->s_journal_ec[i]); + cand = rb_entry(rb_first(&super->s_free_list.rb_tree), + struct gc_candidate, rb_node); + max_reserve_ec = cand->erase_count; + for (i = 0; i < 2; i++) { + struct logfs_segment_entry se; + u32 segno = seg_no(sb, super->s_sb_ofs[i]); + u32 ec; + + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + max_reserve_ec = max(max_reserve_ec, ec); + } + + if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { + do_logfs_journal_wl_pass(sb); + } +} + +void logfs_gc_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); + /* Write journal before free space is getting saturated with dirty + * objects. + */ + if (super->s_dirty_used_bytes + super->s_dirty_free_bytes + + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) + logfs_write_anchor(sb); + __logfs_gc_pass(sb, super->s_total_levels); + logfs_wl_pass(sb); + logfs_journal_wl_pass(sb); +} + +static int check_area(struct super_block *sb, int i) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[i]; + gc_level_t gc_level; + u32 cleaned, valid, ec; + u32 segno = area->a_segno; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + + if (!area->a_is_open) + return 0; + + if (super->s_devops->can_write_buf(sb, ofs) == 0) + return 0; + + printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs); + /* + * The device cannot write back the write buffer. Most likely the + * wbuf was already written out and the system crashed at some point + * before the journal commit happened. In that case we wouldn't have + * to do anything. But if the crash happened before the wbuf was + * written out correctly, we must GC this segment. So assume the + * worst and always do the GC run. 
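+ * Should that run then clean a different number of bytes than the segment
+ * entry recorded as valid, check_area() returns -EIO and
+ * logfs_check_areas() passes the error up.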
+ */ + area->a_is_open = 0; + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + cleaned = logfs_gc_segment(sb, segno); + if (cleaned != valid) + return -EIO; + return 0; +} + +int logfs_check_areas(struct super_block *sb) +{ + int i, err; + + for_each_area(i) { + err = check_area(sb, i); + if (err) + return err; + } + return 0; +} + +static void logfs_init_candlist(struct candidate_list *list, int maxcount, + int sort_by_ec) +{ + list->count = 0; + list->maxcount = maxcount; + list->sort_by_ec = sort_by_ec; + list->rb_tree = RB_ROOT; +} + +int logfs_init_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); + logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); + logfs_init_candlist(&super->s_reserve_list, + super->s_bad_seg_reserve, 1); + for_each_area(i) + logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); + logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); + return 0; +} + +static void logfs_cleanup_list(struct super_block *sb, + struct candidate_list *list) +{ + struct gc_candidate *cand; + + while (list->count) { + cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, + rb_node); + remove_from_list(cand); + free_candidate(sb, cand); + } + BUG_ON(list->rb_tree.rb_node); +} + +void logfs_cleanup_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + if (!super->s_free_list.count) + return; + + /* + * FIXME: The btree may still contain a single empty node. So we + * call the grim visitor to clean up that mess. Btree code should + * do it for us, really. + */ + btree_grim_visitor32(&super->s_cand_tree, 0, NULL); + logfs_cleanup_list(sb, &super->s_free_list); + logfs_cleanup_list(sb, &super->s_reserve_list); + for_each_area(i) + logfs_cleanup_list(sb, &super->s_low_list[i]); + logfs_cleanup_list(sb, &super->s_ec_list); +} diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c new file mode 100644 index 00000000..edfea7a3 --- /dev/null +++ b/fs/logfs/inode.c @@ -0,0 +1,405 @@ +/* + * fs/logfs/inode.c - inode handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include + +/* + * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes + * on the medium. It therefore also lacks a method to store the previous + * generation number for deleted inodes. Instead a single generation number + * is stored which will be used for new inodes. Being just a 32bit counter, + * this can obvious wrap relatively quickly. So we only reuse inodes if we + * know that a fair number of inodes can be created before we have to increment + * the generation again - effectively adding some bits to the counter. + * But being too aggressive here means we keep a very large and very sparse + * inode file, wasting space on indirect blocks. + * So what is a good value? Beats me. 64k seems moderately bad on both + * fronts, so let's use that for now... + * + * NFS sucks, as everyone already knows. + */ +#define INOS_PER_WRAP (0x10000) + +/* + * Logfs' requirement to read inodes for garbage collection makes life a bit + * harder. GC may have to read inodes that are in I_FREEING state, when they + * are being written out - and waiting for GC to make progress, naturally. + * + * So we cannot just call iget() or some variant of it, but first have to check + * wether the inode in question might be in I_FREEING state. 
Therefore we + * maintain our own per-sb list of "almost deleted" inodes and check against + * that list first. Normally this should be at most 1-2 entries long. + * + * Also, inodes have logfs-specific reference counting on top of what the vfs + * does. When .destroy_inode is called, normally the reference count will drop + * to zero and the inode gets deleted. But if GC accessed the inode, its + * refcount will remain nonzero and final deletion will have to wait. + * + * As a result we have two sets of functions to get/put inodes: + * logfs_safe_iget/logfs_safe_iput - safe to call from GC context + * logfs_iget/iput - normal version + */ +static struct kmem_cache *logfs_inode_cache; + +static DEFINE_SPINLOCK(logfs_inode_lock); + +static void logfs_inode_setops(struct inode *inode) +{ + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFREG: + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFLNK: + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + default: + BUG(); + } +} + +static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode = iget_locked(sb, ino); + int err; + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = logfs_read_inode(inode); + if (err || inode->i_nlink == 0) { + /* inode->i_nlink == 0 can be true when called from + * block validator */ + /* set i_nlink to 0 to prevent caching */ + inode->i_nlink = 0; + logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; + iget_failed(inode); + if (!err) + err = -ENOENT; + return ERR_PTR(err); + } + + logfs_inode_setops(inode); + unlock_new_inode(inode); + return inode; +} + +struct inode *logfs_iget(struct super_block *sb, ino_t ino) +{ + BUG_ON(ino == LOGFS_INO_MASTER); + BUG_ON(ino == LOGFS_INO_SEGFILE); + return __logfs_iget(sb, ino); +} + +/* + * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. 
+ * this allows logfs_iput to do the right thing later + */ +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_inode *li; + + if (ino == LOGFS_INO_MASTER) + return super->s_master_inode; + if (ino == LOGFS_INO_SEGFILE) + return super->s_segfile_inode; + + spin_lock(&logfs_inode_lock); + list_for_each_entry(li, &super->s_freeing_list, li_freeing_list) + if (li->vfs_inode.i_ino == ino) { + li->li_refcount++; + spin_unlock(&logfs_inode_lock); + *is_cached = 1; + return &li->vfs_inode; + } + spin_unlock(&logfs_inode_lock); + + *is_cached = 0; + return __logfs_iget(sb, ino); +} + +static void logfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); +} + +static void __logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(li->li_block); + list_del(&li->li_freeing_list); + call_rcu(&inode->i_rcu, logfs_i_callback); +} + +static void logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(list_empty(&li->li_freeing_list)); + spin_lock(&logfs_inode_lock); + li->li_refcount--; + if (li->li_refcount == 0) + __logfs_destroy_inode(inode); + spin_unlock(&logfs_inode_lock); +} + +void logfs_safe_iput(struct inode *inode, int is_cached) +{ + if (inode->i_ino == LOGFS_INO_MASTER) + return; + if (inode->i_ino == LOGFS_INO_SEGFILE) + return; + + if (is_cached) { + logfs_destroy_inode(inode); + return; + } + + iput(inode); +} + +static void logfs_init_inode(struct super_block *sb, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + li->li_flags = 0; + li->li_height = 0; + li->li_used_bytes = 0; + li->li_block = NULL; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + inode->i_nlink = 1; + li->li_refcount = 1; + INIT_LIST_HEAD(&li->li_freeing_list); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + + return; +} + +static struct inode *logfs_alloc_inode(struct super_block *sb) +{ + struct logfs_inode *li; + + li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS); + if (!li) + return NULL; + logfs_init_inode(sb, &li->vfs_inode); + return &li->vfs_inode; +} + +/* + * In logfs inodes are written to an inode file. The inode file, like any + * other file, is managed with a inode. The inode file's inode, aka master + * inode, requires special handling in several respects. First, it cannot be + * written to the inode file, so it is stored in the journal instead. + * + * Secondly, this inode cannot be written back and destroyed before all other + * inodes have been written. The ordering is important. Linux' VFS is happily + * unaware of the ordering constraint and would ordinarily destroy the master + * inode at umount time while other inodes are still in use and dirty. Not + * good. + * + * So logfs makes sure the master inode is not written until all other inodes + * have been destroyed. Sadly, this method has another side-effect. The VFS + * will notice one remaining inode and print a frightening warning message. + * Worse, it is impossible to judge whether such a warning was caused by the + * master inode or any other inodes have leaked as well. + * + * Our attempt of solving this is with logfs_new_meta_inode() below. 
Its + * purpose is to create a new inode that will not trigger the warning if such + * an inode is still in use. An ugly hack, no doubt. Suggections for + * improvement are welcome. + * + * AV: that's what ->put_super() is for... + */ +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_mode = S_IFREG; + inode->i_ino = ino; + inode->i_data.a_ops = &logfs_reg_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_NOFS); + + return inode; +} + +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + int err; + + inode = logfs_new_meta_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + + err = logfs_read_inode(inode); + if (err) { + iput(inode); + return ERR_PTR(err); + } + logfs_inode_setops(inode); + return inode; +} + +static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + long flags = WF_LOCK; + + /* Can only happen if creat() failed. Safe to skip. */ + if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) + return 0; + + ret = __logfs_write_inode(inode, flags); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +/* called with inode->i_lock held */ +static int logfs_drop_inode(struct inode *inode) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_inode *li = logfs_inode(inode); + + spin_lock(&logfs_inode_lock); + list_move(&li->li_freeing_list, &super->s_freeing_list); + spin_unlock(&logfs_inode_lock); + return generic_drop_inode(inode); +} + +static void logfs_set_ino_generation(struct super_block *sb, + struct inode *inode) +{ + struct logfs_super *super = logfs_super(sb); + u64 ino; + + mutex_lock(&super->s_journal_mutex); + ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1); + super->s_last_ino = ino; + super->s_inos_till_wrap--; + if (super->s_inos_till_wrap < 0) { + super->s_last_ino = LOGFS_RESERVED_INOS; + super->s_generation++; + super->s_inos_till_wrap = INOS_PER_WRAP; + } + inode->i_ino = ino; + inode->i_generation = super->s_generation; + mutex_unlock(&super->s_journal_mutex); +} + +struct inode *logfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + logfs_init_inode(sb, inode); + + /* inherit parent flags */ + logfs_inode(inode)->li_flags |= + logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; + + inode->i_mode = mode; + logfs_set_ino_generation(sb, inode); + + inode_init_owner(inode, dir, mode); + logfs_inode_setops(inode); + insert_inode_hash(inode); + + return inode; +} + +static void logfs_init_once(void *_li) +{ + struct logfs_inode *li = _li; + int i; + + li->li_flags = 0; + li->li_used_bytes = 0; + li->li_refcount = 1; + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + inode_init_once(&li->vfs_inode); +} + +static int logfs_sync_fs(struct super_block *sb, int wait) +{ + logfs_write_anchor(sb); + return 0; +} + +static void logfs_put_super(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + /* kill the meta-inodes */ + iput(super->s_master_inode); + iput(super->s_segfile_inode); + iput(super->s_mapping_inode); +} + +const struct super_operations logfs_super_operations = { + .alloc_inode = logfs_alloc_inode, + .destroy_inode = logfs_destroy_inode, + .evict_inode = logfs_evict_inode, + .drop_inode = logfs_drop_inode, + .put_super = logfs_put_super, + .write_inode 
= logfs_write_inode, + .statfs = logfs_statfs, + .sync_fs = logfs_sync_fs, +}; + +int logfs_init_inode_cache(void) +{ + logfs_inode_cache = kmem_cache_create("logfs_inode_cache", + sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + logfs_init_once); + if (!logfs_inode_cache) + return -ENOMEM; + return 0; +} + +void logfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(logfs_inode_cache); +} diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c new file mode 100644 index 00000000..9da29706 --- /dev/null +++ b/fs/logfs/journal.c @@ -0,0 +1,895 @@ +/* + * fs/logfs/journal.c - journal handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include + +static void logfs_calc_free(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 reserve, no_segs = super->s_no_segs; + s64 free; + int i; + + /* superblock segments */ + no_segs -= 2; + super->s_no_journal_segs = 0; + /* journal */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + no_segs--; + super->s_no_journal_segs++; + } + + /* open segments plus one extra per level for GC */ + no_segs -= 2 * super->s_total_levels; + + free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); + free -= super->s_used_bytes; + /* just a bit extra */ + free -= super->s_total_levels * 4096; + + /* Bad blocks are 'paid' for with speed reserve - the filesystem + * simply gets slower as bad blocks accumulate. Until the bad blocks + * exceed the speed reserve - then the filesystem gets smaller. + */ + reserve = super->s_bad_segments + super->s_bad_seg_reserve; + reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; + reserve = max(reserve, super->s_speed_reserve); + free -= reserve; + if (free < 0) + free = 0; + + super->s_free_bytes = free; +} + +static void reserve_sb_and_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int i, err; + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + journal_for_each(i) { + if (!super->s_journal_seg[i]) + continue; + err = btree_insert32(head, super->s_journal_seg[i], (void *)1, + GFP_KERNEL); + BUG_ON(err); + } +} + +static void read_dynsb(struct super_block *sb, + struct logfs_je_dynsb *dynsb) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec = be64_to_cpu(dynsb->ds_gec); + super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); + super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); + super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); + super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); + super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); + super->s_generation = be32_to_cpu(dynsb->ds_generation); +} + +static void read_anchor(struct super_block *sb, + struct logfs_je_anchor *da) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + super->s_last_ino = be64_to_cpu(da->da_last_ino); + li->li_flags = 0; + li->li_height = da->da_height; + i_size_write(inode, be64_to_cpu(da->da_size)); + li->li_used_bytes = be64_to_cpu(da->da_used_bytes); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(da->da_data[i]); +} + +static void read_erasecount(struct super_block *sb, + struct 
logfs_je_journal_ec *ec) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + journal_for_each(i) + super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); +} + +static int read_area(struct super_block *sb, struct logfs_je_area *a) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[a->gc_level]; + u64 ofs; + u32 writemask = ~(super->s_writesize - 1); + + if (a->gc_level >= LOGFS_NO_AREAS) + return -EIO; + if (a->vim != VIM_DEFAULT) + return -EIO; /* TODO: close area and continue */ + + area->a_used_bytes = be32_to_cpu(a->used_bytes); + area->a_written_bytes = area->a_used_bytes & writemask; + area->a_segno = be32_to_cpu(a->segno); + if (area->a_segno) + area->a_is_open = 1; + + ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + if (super->s_writesize > 1) + return logfs_buf_recover(area, ofs, a + 1, super->s_writesize); + else + return logfs_buf_recover(area, ofs, NULL, 0); +} + +static void *unpack(void *from, void *to) +{ + struct logfs_journal_header *jh = from; + void *data = from + sizeof(struct logfs_journal_header); + int err; + size_t inlen, outlen; + + inlen = be16_to_cpu(jh->h_len); + outlen = be16_to_cpu(jh->h_datalen); + + if (jh->h_compr == COMPR_NONE) + memcpy(to, data, inlen); + else { + err = logfs_uncompress(data, to, inlen, outlen); + BUG_ON(err); + } + return to; +} + +static int __read_je_header(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + u16 type, len, datalen; + int err; + + /* read header only */ + err = wbuf_read(sb, ofs, sizeof(*jh), jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if (len > sb->s_blocksize) + return -EIO; + if ((type < JE_FIRST) || (type > JE_LAST)) + return -EIO; + if (datalen > bufsize) + return -EIO; + return 0; +} + +static int __read_je_payload(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + u16 len; + int err; + + len = be16_to_cpu(jh->h_len); + err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); + if (err) + return err; + if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { + /* Old code was confused. It forgot about the header length + * and stopped calculating the crc 16 bytes before the end + * of data - ick! + * FIXME: Remove this hack once the old code is fixed. 
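+ * The fallback below therefore re-checks the CRC the old, short way: a
+ * match merely triggers WARN_ON_ONCE(), while an entry failing both
+ * variants is still rejected with -EIO.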
+ */ + if (jh->h_crc == logfs_crc32(jh, len, 4)) + WARN_ON_ONCE(1); + else + return -EIO; + } + return 0; +} + +/* + * jh needs to be large enough to hold the complete entry, not just the header + */ +static int __read_je(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + int err; + + err = __read_je_header(sb, ofs, jh); + if (err) + return err; + return __read_je_payload(sb, ofs, jh); +} + +static int read_je(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + void *scratch = super->s_je; + u16 type, datalen; + int err; + + err = __read_je(sb, ofs, jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + datalen = be16_to_cpu(jh->h_datalen); + + switch (type) { + case JE_DYNSB: + read_dynsb(sb, unpack(jh, scratch)); + break; + case JE_ANCHOR: + read_anchor(sb, unpack(jh, scratch)); + break; + case JE_ERASECOUNT: + read_erasecount(sb, unpack(jh, scratch)); + break; + case JE_AREA: + err = read_area(sb, unpack(jh, scratch)); + break; + case JE_OBJ_ALIAS: + err = logfs_load_object_aliases(sb, unpack(jh, scratch), + datalen); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return err; +} + +static int logfs_read_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); + u32 h_ofs, last_ofs = 0; + u16 len, datalen, last_len = 0; + int i, err; + + /* search for most recent commit */ + for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { + ofs = seg_ofs + h_ofs; + err = __read_je_header(sb, ofs, jh); + if (err) + continue; + if (jh->h_type != cpu_to_be16(JE_COMMIT)) + continue; + err = __read_je_payload(sb, ofs, jh); + if (err) + continue; + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if ((datalen > sizeof(super->s_je_array)) || + (datalen % sizeof(__be64))) + continue; + last_ofs = h_ofs; + last_len = datalen; + h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); + } + /* read commit */ + if (last_ofs == 0) + return -ENOENT; + ofs = seg_ofs + last_ofs; + log_journal("Read commit from %llx\n", ofs); + err = __read_je(sb, ofs, jh); + BUG_ON(err); /* We should have caught it in the scan loop already */ + if (err) + return err; + /* uncompress */ + unpack(jh, super->s_je_array); + super->s_no_je = last_len / sizeof(__be64); + /* iterate over array */ + for (i = 0; i < super->s_no_je; i++) { + err = read_je(sb, be64_to_cpu(super->s_je_array[i])); + if (err) + return err; + } + super->s_journal_area->a_segno = segno; + return 0; +} + +static u64 read_gec(struct super_block *sb, u32 segno) +{ + struct logfs_segment_header sh; + __be32 crc; + int err; + + if (!segno) + return 0; + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + if (err) + return 0; + crc = logfs_crc32(&sh, sizeof(sh), 4); + if (crc != sh.crc) { + WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); + /* Most likely it was just erased */ + return 0; + } + return be64_to_cpu(sh.gec); +} + +static int logfs_read_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 gec[LOGFS_JOURNAL_SEGS], max; + u32 segno; + int i, max_i; + + max = 0; + max_i = -1; + journal_for_each(i) { + segno = super->s_journal_seg[i]; + gec[i] = read_gec(sb, super->s_journal_seg[i]); + if (gec[i] > max) { + max = gec[i]; + max_i = i; + } + } + if (max_i == -1) + return -EIO; + /* FIXME: Try older segments in 
case of error */ + return logfs_read_segment(sb, super->s_journal_seg[max_i]); +} + +/* + * First search the current segment (outer loop), then pick the next segment + * in the array, skipping any zero entries (inner loop). + */ +static void journal_get_free_segment(struct logfs_area *area) +{ + struct logfs_super *super = logfs_super(area->a_sb); + int i; + + journal_for_each(i) { + if (area->a_segno != super->s_journal_seg[i]) + continue; + + do { + i++; + if (i == LOGFS_JOURNAL_SEGS) + i = 0; + } while (!super->s_journal_seg[i]); + + area->a_segno = super->s_journal_seg[i]; + area->a_erase_count = ++(super->s_journal_ec[i]); + log_journal("Journal now at %x (ec %x)\n", area->a_segno, + area->a_erase_count); + return; + } + BUG(); +} + +static void journal_get_erase_count(struct logfs_area *area) +{ + /* erase count is stored globally and incremented in + * journal_get_free_segment() - nothing to do here */ +} + +static int journal_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + union { + struct logfs_segment_header sh; + unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)]; + } u; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno, 1); + if (err) + return err; + + memset(&u, 0, sizeof(u)); + u.sh.pad = 0; + u.sh.type = SEG_JOURNAL; + u.sh.level = 0; + u.sh.segno = cpu_to_be32(area->a_segno); + u.sh.ec = cpu_to_be32(area->a_erase_count); + u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4); + + /* This causes a bug in segment.c. Not yet. */ + //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(u); + logfs_buf_write(area, ofs, &u, sizeof(u)); + return 0; +} + +static size_t __logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t len, size_t datalen, + u16 type, u8 compr) +{ + jh->h_len = cpu_to_be16(len); + jh->h_type = cpu_to_be16(type); + jh->h_datalen = cpu_to_be16(datalen); + jh->h_compr = compr; + jh->h_pad[0] = 'H'; + jh->h_pad[1] = 'E'; + jh->h_pad[2] = 'A'; + jh->h_pad[3] = 'D'; + jh->h_pad[4] = 'R'; + jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); + return ALIGN(len, 16) + sizeof(*jh); +} + +static size_t logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t datalen, u16 type) +{ + size_t len = datalen; + + return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE); +} + +static inline size_t logfs_journal_erasecount_size(struct logfs_super *super) +{ + return LOGFS_JOURNAL_SEGS * sizeof(__be32); +} + +static void *logfs_write_erasecount(struct super_block *sb, void *_ec, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_journal_ec *ec = _ec; + int i; + + journal_for_each(i) + ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]); + *type = JE_ERASECOUNT; + *len = logfs_journal_erasecount_size(super); + return ec; +} + +static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore, + size_t ignore2) +{ + struct logfs_shadow *shadow = _shadow; + struct super_block *sb = (void *)_sb; + struct logfs_super *super = logfs_super(sb); + + /* consume new space */ + super->s_free_bytes -= shadow->new_len; + super->s_used_bytes += shadow->new_len; + super->s_dirty_used_bytes -= shadow->new_len; + + /* free up old space */ + super->s_free_bytes += shadow->old_len; + super->s_used_bytes -= shadow->old_len; + super->s_dirty_free_bytes -= shadow->old_len; + + 
logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len); + logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len); + + log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + mempool_free(shadow, super->s_shadow_pool); +} + +static void account_shadows(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + struct shadow_tree *tree = &super->s_shadow_tree; + + btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); + btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); + btree_grim_visitor32(&tree->segment_map, 0, NULL); + tree->no_shadowed_segments = 0; + + if (li->li_block) { + /* + * We never actually use the structure, when attached to the + * master inode. But it is easier to always free it here than + * to have checks in several places elsewhere when allocating + * it. + */ + li->li_block->ops->free_block(sb, li->li_block); + } + BUG_ON((s64)li->li_used_bytes < 0); +} + +static void *__logfs_write_anchor(struct super_block *sb, void *_da, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_anchor *da = _da; + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + da->da_height = li->li_height; + da->da_last_ino = cpu_to_be64(super->s_last_ino); + da->da_size = cpu_to_be64(i_size_read(inode)); + da->da_used_bytes = cpu_to_be64(li->li_used_bytes); + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + da->da_data[i] = cpu_to_be64(li->li_data[i]); + *type = JE_ANCHOR; + *len = sizeof(*da); + return da; +} + +static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_dynsb *dynsb = _dynsb; + + dynsb->ds_gec = cpu_to_be64(super->s_gec); + dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper); + dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); + dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); + dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); + dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); + dynsb->ds_generation = cpu_to_be32(super->s_generation); + *type = JE_DYNSB; + *len = sizeof(*dynsb); + return dynsb; +} + +static void write_wbuf(struct super_block *sb, struct logfs_area *area, + void *wbuf) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + u64 ofs; + pgoff_t index; + int page_ofs; + struct page *page; + + ofs = dev_ofs(sb, area->a_segno, + area->a_used_bytes & ~(super->s_writesize - 1)); + index = ofs >> PAGE_SHIFT; + page_ofs = ofs & (PAGE_SIZE - 1); + + page = find_lock_page(mapping, index); + BUG_ON(!page); + memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); + unlock_page(page); +} + +static void *logfs_write_area(struct super_block *sb, void *_a, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[super->s_sum_index]; + struct logfs_je_area *a = _a; + + a->vim = VIM_DEFAULT; + a->gc_level = super->s_sum_index; + a->used_bytes = cpu_to_be32(area->a_used_bytes); + a->segno = cpu_to_be32(area->a_segno); + if (super->s_writesize > 1) + write_wbuf(sb, area, a + 1); + + *type = JE_AREA; + *len = sizeof(*a) + super->s_writesize; + return 
a; +} + +static void *logfs_write_commit(struct super_block *sb, void *h, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + + *type = JE_COMMIT; + *len = super->s_no_je * sizeof(__be64); + return super->s_je_array; +} + +static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, + size_t len) +{ + struct logfs_super *super = logfs_super(sb); + void *header = super->s_compressed_je; + void *data = header + sizeof(struct logfs_journal_header); + ssize_t compr_len, pad_len; + u8 compr = COMPR_ZLIB; + + if (len == 0) + return logfs_write_header(super, header, 0, type); + + BUG_ON(len > sb->s_blocksize); + compr_len = logfs_compress(buf, data, len, sb->s_blocksize); + if (compr_len < 0 || type == JE_ANCHOR) { + memcpy(data, buf, len); + compr_len = len; + compr = COMPR_NONE; + } + + pad_len = ALIGN(compr_len, 16); + memset(data + compr_len, 0, pad_len - compr_len); + + return __logfs_write_header(super, header, compr_len, len, type, compr); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, + int must_pad) +{ + u32 writesize = logfs_super(area->a_sb)->s_writesize; + s32 ofs; + int ret; + + ret = logfs_open_area(area, *bytes); + if (ret) + return -EAGAIN; + + ofs = area->a_used_bytes; + area->a_used_bytes += *bytes; + + if (must_pad) { + area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); + *bytes = area->a_used_bytes - ofs; + } + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, + size_t buf_len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct logfs_journal_header *jh = super->s_compressed_je; + size_t len; + int must_pad = 0; + s64 ofs; + + len = __logfs_write_je(sb, buf, type, buf_len); + if (jh->h_type == cpu_to_be16(JE_COMMIT)) + must_pad = 1; + + ofs = logfs_get_free_bytes(area, &len, must_pad); + if (ofs < 0) + return ofs; + logfs_buf_write(area, ofs, super->s_compressed_je, len); + BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES); + super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); + return 0; +} + +static int logfs_write_je(struct super_block *sb, + void* (*write)(struct super_block *sb, void *scratch, + u16 *type, size_t *len)) +{ + void *buf; + size_t len; + u16 type; + + buf = write(sb, logfs_super(sb)->s_je, &type, &len); + return logfs_write_je_buf(sb, buf, type, len); +} + +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_obj_alias *oa = super->s_je; + int err = 0, fill = super->s_je_fill; + + log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", + fill, ino, bix, level, child_no, be64_to_cpu(val)); + oa[fill].ino = cpu_to_be64(ino); + oa[fill].bix = cpu_to_be64(bix); + oa[fill].val = val; + oa[fill].level = (__force u8)level; + oa[fill].child_no = cpu_to_be16(child_no); + fill++; + if (fill >= sb->s_blocksize / sizeof(*oa)) { + err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); + fill = 0; + } + + super->s_je_fill = fill; + return err; +} + +static int logfs_write_obj_aliases(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + log_journal("logfs_write_obj_aliases: %d aliases to write\n", + super->s_no_object_aliases); + super->s_je_fill = 0; + err = logfs_write_obj_aliases_pagecache(sb); + if (err) + return err; + + if (super->s_je_fill) + err = logfs_write_je_buf(sb, 
super->s_je, JE_OBJ_ALIAS, + super->s_je_fill + * sizeof(struct logfs_obj_alias)); + return err; +} + +/* + * Write all journal entries. The goto logic ensures that all journal entries + * are written whenever a new segment is used. It is ugly and potentially a + * bit wasteful, but robustness is more important. With this we can *always* + * erase all journal segments except the one containing the most recent commit. + */ +void logfs_write_anchor(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + int i, err; + + if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY)) + return; + super->s_flags &= ~LOGFS_SB_FLAG_DIRTY; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + mutex_lock(&super->s_journal_mutex); + + /* Do this first or suffer corruption */ + logfs_sync_segments(sb); + account_shadows(sb); + +again: + super->s_no_je = 0; + for_each_area(i) { + if (!super->s_area[i]->a_is_open) + continue; + super->s_sum_index = i; + err = logfs_write_je(sb, logfs_write_area); + if (err) + goto again; + } + err = logfs_write_obj_aliases(sb); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_erasecount); + if (err) + goto again; + err = logfs_write_je(sb, __logfs_write_anchor); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_dynsb); + if (err) + goto again; + /* + * Order is imperative. First we sync all writes, including the + * non-committed journal writes. Then we write the final commit and + * sync the current journal segment. + * There is a theoretical bug here. Syncing the journal segment will + * write a number of journal entries and the final commit. All these + * are written in a single operation. If the device layer writes the + * data back-to-front, the commit will precede the other journal + * entries, leaving a race window. + * Two fixes are possible. Preferred is to fix the device layer to + * ensure writes happen front-to-back. Alternatively we can insert + * another logfs_sync_area() super->s_devops->sync() combo before + * writing the commit. + */ + /* + * On another subject, super->s_devops->sync is usually not necessary. + * Unless called from sys_sync or friends, a barrier would suffice. 
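+ * The resulting order below is:
+ *   s_devops->sync()                    flush all entries written so far
+ *   logfs_write_je(logfs_write_commit)  append the JE_COMMIT array
+ *   logfs_sync_area(area)               flush the commit record itself
+ *   s_devops->sync()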
+ */ + super->s_devops->sync(sb); + err = logfs_write_je(sb, logfs_write_commit); + if (err) + goto again; + log_journal("Write commit to %llx\n", + be64_to_cpu(super->s_je_array[super->s_no_je - 1])); + logfs_sync_area(area); + BUG_ON(area->a_used_bytes != area->a_written_bytes); + super->s_devops->sync(sb); + + mutex_unlock(&super->s_journal_mutex); + return; +} + +void do_logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct btree_head32 *head = &super->s_reserved_segments; + u32 segno, ec; + int i, err; + + log_journal("Journal requires wear-leveling.\n"); + /* Drop old segments */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + btree_remove32(head, super->s_journal_seg[i]); + logfs_set_segment_unreserved(sb, + super->s_journal_seg[i], + super->s_journal_ec[i]); + super->s_journal_seg[i] = 0; + super->s_journal_ec[i] = 0; + } + /* Get new segments */ + for (i = 0; i < super->s_no_journal_segs; i++) { + segno = get_best_cand(sb, &super->s_reserve_list, &ec); + super->s_journal_seg[i] = segno; + super->s_journal_ec[i] = ec; + logfs_set_segment_reserved(sb, segno); + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + BUG_ON(err); /* mempool should prevent this */ + err = logfs_erase_segment(sb, segno, 1); + BUG_ON(err); /* FIXME: remount-ro would be nicer */ + } + /* Manually move journal_area */ + freeseg(sb, area->a_segno); + area->a_segno = super->s_journal_seg[0]; + area->a_is_open = 0; + area->a_used_bytes = 0; + /* Write journal */ + logfs_write_anchor(sb); + /* Write superblocks */ + err = logfs_write_sb(sb); + BUG_ON(err); +} + +static const struct logfs_area_ops journal_area_ops = { + .get_free_segment = journal_get_free_segment, + .get_erase_count = journal_get_erase_count, + .erase_segment = journal_erase_segment, +}; + +int logfs_init_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + int ret = -ENOMEM; + + mutex_init(&super->s_journal_mutex); + btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool); + + super->s_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_je) + return ret; + + super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_compressed_je) + return ret; + + super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER); + if (IS_ERR(super->s_master_inode)) + return PTR_ERR(super->s_master_inode); + + ret = logfs_read_journal(sb); + if (ret) + return -EIO; + + reserve_sb_and_journal(sb); + logfs_calc_free(sb); + + super->s_journal_area->a_ops = &journal_area_ops; + return 0; +} + +void logfs_cleanup_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + btree_grim_visitor32(&super->s_reserved_segments, 0, NULL); + + kfree(super->s_compressed_je); + kfree(super->s_je); +} diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h new file mode 100644 index 00000000..57afd4a6 --- /dev/null +++ b/fs/logfs/logfs.h @@ -0,0 +1,734 @@ +/* + * fs/logfs/logfs.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Private header for logfs. 
+ */ +#ifndef FS_LOGFS_LOGFS_H +#define FS_LOGFS_LOGFS_H + +#undef __CHECK_ENDIAN__ +#define __CHECK_ENDIAN__ + +#include +#include +#include +#include +#include +#include +#include +#include "logfs_abi.h" + +#define LOGFS_DEBUG_SUPER (0x0001) +#define LOGFS_DEBUG_SEGMENT (0x0002) +#define LOGFS_DEBUG_JOURNAL (0x0004) +#define LOGFS_DEBUG_DIR (0x0008) +#define LOGFS_DEBUG_FILE (0x0010) +#define LOGFS_DEBUG_INODE (0x0020) +#define LOGFS_DEBUG_READWRITE (0x0040) +#define LOGFS_DEBUG_GC (0x0080) +#define LOGFS_DEBUG_GC_NOISY (0x0100) +#define LOGFS_DEBUG_ALIASES (0x0200) +#define LOGFS_DEBUG_BLOCKMOVE (0x0400) +#define LOGFS_DEBUG_ALL (0xffffffff) + +#define LOGFS_DEBUG (0x01) +/* + * To enable specific log messages, simply define LOGFS_DEBUG to match any + * or all of the above. + */ +#ifndef LOGFS_DEBUG +#define LOGFS_DEBUG (0) +#endif + +#define log_cond(cond, fmt, arg...) do { \ + if (cond) \ + printk(KERN_DEBUG fmt, ##arg); \ +} while (0) + +#define log_super(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg) +#define log_segment(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg) +#define log_journal(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg) +#define log_dir(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg) +#define log_file(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg) +#define log_inode(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg) +#define log_readwrite(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg) +#define log_gc(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg) +#define log_gc_noisy(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg) +#define log_aliases(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg) +#define log_blockmove(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg) + +#define PG_pre_locked PG_owner_priv_1 +#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags) +#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags) +#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags) + +/* FIXME: This should really be somewhere in the 64bit area. 
*/ +#define LOGFS_LINK_MAX (1<<30) + +/* Read-only filesystem */ +#define LOGFS_SB_FLAG_RO 0x0001 +#define LOGFS_SB_FLAG_DIRTY 0x0002 +#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004 +#define LOGFS_SB_FLAG_SHUTDOWN 0x0008 + +/* Write Control Flags */ +#define WF_LOCK 0x01 /* take write lock */ +#define WF_WRITE 0x02 /* write block */ +#define WF_DELETE 0x04 /* delete old block */ + +typedef u8 __bitwise level_t; +typedef u8 __bitwise gc_level_t; + +#define LEVEL(level) ((__force level_t)(level)) +#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level)) + +#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \ + (__force level_t)((__force u8)(level) - 1) ) + +/** + * struct logfs_area - area management information + * + * @a_sb: the superblock this area belongs to + * @a_is_open: 1 if the area is currently open, else 0 + * @a_segno: segment number of area + * @a_written_bytes: number of bytes already written back + * @a_used_bytes: number of used bytes + * @a_ops: area operations (either journal or ostore) + * @a_erase_count: erase count + * @a_level: GC level + */ +struct logfs_area { /* a segment open for writing */ + struct super_block *a_sb; + int a_is_open; + u32 a_segno; + u32 a_written_bytes; + u32 a_used_bytes; + const struct logfs_area_ops *a_ops; + u32 a_erase_count; + gc_level_t a_level; +}; + +/** + * struct logfs_area_ops - area operations + * + * @get_free_segment: fill area->ofs with the offset of a free segment + * @get_erase_count: fill area->erase_count (needs area->ofs) + * @erase_segment: erase and setup segment + */ +struct logfs_area_ops { + void (*get_free_segment)(struct logfs_area *area); + void (*get_erase_count)(struct logfs_area *area); + int (*erase_segment)(struct logfs_area *area); +}; + +struct logfs_super; /* forward */ +/** + * struct logfs_device_ops - device access operations + * + * @readpage: read one page (mm page) + * @writeseg: write one segment. may be a partial segment + * @erase: erase one segment + * @read: read from the device + * @erase: erase part of the device + * @can_write_buf: decide whether wbuf can be written to ofs + */ +struct logfs_device_ops { + struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs); + struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs); + int (*write_sb)(struct super_block *sb, struct page *page); + int (*readpage)(void *_sb, struct page *page); + void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); + int (*erase)(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write); + int (*can_write_buf)(struct super_block *sb, u64 ofs); + void (*sync)(struct super_block *sb); + void (*put_device)(struct logfs_super *s); +}; + +/** + * struct candidate_list - list of similar candidates + */ +struct candidate_list { + struct rb_root rb_tree; + int count; + int maxcount; + int sort_by_ec; +}; + +/** + * struct gc_candidate - "candidate" segment to be garbage collected next + * + * @list: list (either free of low) + * @segno: segment number + * @valid: number of valid bytes + * @erase_count: erase count of segment + * @dist: distance from tree root + * + * Candidates can be on two lists. The free list contains electees rather + * than candidates - segments that no longer contain any valid data. The + * low list contains candidates to be picked for GC. It should be kept + * short. It is not required to always pick a perfect candidate. In the + * worst case GC will have to move more data than absolutely necessary. 
+ */ +struct gc_candidate { + struct rb_node rb_node; + struct candidate_list *list; + u32 segno; + u32 valid; + u32 erase_count; + u8 dist; +}; + +/** + * struct logfs_journal_entry - temporary structure used during journal scan + * + * @used: + * @version: normalized version + * @len: length + * @offset: offset + */ +struct logfs_journal_entry { + int used; + s16 version; + u16 len; + u16 datalen; + u64 offset; +}; + +enum transaction_state { + CREATE_1 = 1, + CREATE_2, + UNLINK_1, + UNLINK_2, + CROSS_RENAME_1, + CROSS_RENAME_2, + TARGET_RENAME_1, + TARGET_RENAME_2, + TARGET_RENAME_3 +}; + +/** + * struct logfs_transaction - essential fields to support atomic dirops + * + * @ino: target inode + * @dir: inode of directory containing dentry + * @pos: pos of dentry in directory + */ +struct logfs_transaction { + enum transaction_state state; + u64 ino; + u64 dir; + u64 pos; +}; + +/** + * struct logfs_shadow - old block in the shadow of a not-yet-committed new one + * @old_ofs: offset of old block on medium + * @new_ofs: offset of new block on medium + * @ino: inode number + * @bix: block index + * @old_len: size of old block, including header + * @new_len: size of new block, including header + * @level: block level + */ +struct logfs_shadow { + u64 old_ofs; + u64 new_ofs; + u64 ino; + u64 bix; + int old_len; + int new_len; + gc_level_t gc_level; +}; + +/** + * struct shadow_tree + * @new: shadows where old_ofs==0, indexed by new_ofs + * @old: shadows where old_ofs!=0, indexed by old_ofs + * @segment_map: bitfield of segments containing shadows + * @no_shadowed_segment: number of segments containing shadows + */ +struct shadow_tree { + struct btree_head64 new; + struct btree_head64 old; + struct btree_head32 segment_map; + int no_shadowed_segments; +}; + +struct object_alias_item { + struct list_head list; + __be64 val; + int child_no; +}; + +/** + * struct logfs_block - contains any block state + * @type: indirect block or inode + * @full: number of fully populated children + * @partial: number of partially populated children + * + * Most blocks are directly represented by page cache pages. But when a block + * becomes dirty, is part of a transaction, contains aliases or is otherwise + * special, a struct logfs_block is allocated to track the additional state. + * Inodes are very similar to indirect blocks, so they can also get one of + * these structures added when appropriate. 
+ */ +#define BLOCK_INDIRECT 1 /* Indirect block */ +#define BLOCK_INODE 2 /* Inode */ +struct logfs_block_ops; +struct logfs_block { + struct list_head alias_list; + struct list_head item_list; + struct super_block *sb; + u64 ino; + u64 bix; + level_t level; + struct page *page; + struct inode *inode; + struct logfs_transaction *ta; + unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG]; + struct logfs_block_ops *ops; + int full; + int partial; + int reserved_bytes; +}; + +typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +struct logfs_block_ops { + void (*write_block)(struct logfs_block *block); + void (*free_block)(struct super_block *sb, struct logfs_block*block); + int (*write_alias)(struct super_block *sb, + struct logfs_block *block, + write_alias_t *write_one_alias); +}; + +#define MAX_JOURNAL_ENTRIES 256 + +struct logfs_super { + struct mtd_info *s_mtd; /* underlying device */ + struct block_device *s_bdev; /* underlying device */ + const struct logfs_device_ops *s_devops;/* device access */ + struct inode *s_master_inode; /* inode file */ + struct inode *s_segfile_inode; /* segment file */ + struct inode *s_mapping_inode; /* device mapping */ + atomic_t s_pending_writes; /* outstanting bios */ + long s_flags; + mempool_t *s_btree_pool; /* for btree nodes */ + mempool_t *s_alias_pool; /* aliases in segment.c */ + u64 s_feature_incompat; + u64 s_feature_ro_compat; + u64 s_feature_compat; + u64 s_feature_flags; + u64 s_sb_ofs[2]; + struct page *s_erase_page; /* for dev_bdev.c */ + /* alias.c fields */ + struct btree_head32 s_segment_alias; /* remapped segments */ + int s_no_object_aliases; + struct list_head s_object_alias; /* remapped objects */ + struct btree_head128 s_object_alias_tree; /* remapped objects */ + struct mutex s_object_alias_mutex; + /* dir.c fields */ + struct mutex s_dirop_mutex; /* for creat/unlink/rename */ + u64 s_victim_ino; /* used for atomic dir-ops */ + u64 s_rename_dir; /* source directory ino */ + u64 s_rename_pos; /* position of source dd */ + /* gc.c fields */ + long s_segsize; /* size of a segment */ + int s_segshift; /* log2 of segment size */ + long s_segmask; /* 1 << s_segshift - 1 */ + long s_no_segs; /* segments on device */ + long s_no_journal_segs; /* segments used for journal */ + long s_no_blocks; /* blocks per segment */ + long s_writesize; /* minimum write size */ + int s_writeshift; /* log2 of write size */ + u64 s_size; /* filesystem size */ + struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */ + u64 s_gec; /* global erase count */ + u64 s_wl_gec_ostore; /* time of last wl event */ + u64 s_wl_gec_journal; /* time of last wl event */ + u64 s_sweeper; /* current sweeper pos */ + u8 s_ifile_levels; /* max level of ifile */ + u8 s_iblock_levels; /* max level of regular files */ + u8 s_data_levels; /* # of segments to leaf block*/ + u8 s_total_levels; /* sum of above three */ + struct btree_head32 s_cand_tree; /* all candidates */ + struct candidate_list s_free_list; /* 100% free segments */ + struct candidate_list s_reserve_list; /* Bad segment reserve */ + struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */ + struct candidate_list s_ec_list; /* wear level candidates */ + struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. 
*/ + /* inode.c fields */ + u64 s_last_ino; /* highest ino used */ + long s_inos_till_wrap; + u32 s_generation; /* i_generation for new files */ + struct list_head s_freeing_list; /* inodes being freed */ + /* journal.c fields */ + struct mutex s_journal_mutex; + void *s_je; /* journal entry to compress */ + void *s_compressed_je; /* block to write to journal */ + u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ + u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ + u64 s_last_version; + struct logfs_area *s_journal_area; /* open journal segment */ + __be64 s_je_array[MAX_JOURNAL_ENTRIES]; + int s_no_je; + + int s_sum_index; /* for the 12 summaries */ + struct shadow_tree s_shadow_tree; + int s_je_fill; /* index of current je */ + /* readwrite.c fields */ + struct mutex s_write_mutex; + int s_lock_count; + mempool_t *s_block_pool; /* struct logfs_block pool */ + mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ + struct list_head s_writeback_list; /* writeback pages */ + /* + * Space accounting: + * - s_used_bytes specifies space used to store valid data objects. + * - s_dirty_used_bytes is space used to store non-committed data + * objects. Those objects have already been written themselves, + * but they don't become valid until all indirect blocks up to the + * journal have been written as well. + * - s_dirty_free_bytes is space used to store the old copy of a + * replaced object, as long as the replacement is non-committed. + * In other words, it is the amount of space freed when all dirty + * blocks are written back. + * - s_free_bytes is the amount of free space available for any + * purpose. + * - s_root_reserve is the amount of free space available only to + * the root user. Non-privileged users can no longer write once + * this watermark has been reached. + * - s_speed_reserve is space which remains unused to speed up + * garbage collection performance. + * - s_dirty_pages is the space reserved for currently dirty pages. + * It is a pessimistic estimate, so some/most will get freed on + * page writeback. + * + * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size + */ + u64 s_free_bytes; + u64 s_used_bytes; + u64 s_dirty_free_bytes; + u64 s_dirty_used_bytes; + u64 s_root_reserve; + u64 s_speed_reserve; + u64 s_dirty_pages; + /* Bad block handling: + * - s_bad_seg_reserve is a number of segments usually kept + * free. When encountering bad blocks, the affected segment's data + * is _temporarily_ moved to a reserved segment. + * - s_bad_segments is the number of known bad segments. 
+ */ + u32 s_bad_seg_reserve; + u32 s_bad_segments; +}; + +/** + * struct logfs_inode - in-memory inode + * + * @vfs_inode: struct inode + * @li_data: data pointers + * @li_used_bytes: number of used bytes + * @li_freeing_list: used to track inodes currently being freed + * @li_flags: inode flags + * @li_refcount: number of internal (GC-induced) references + */ +struct logfs_inode { + struct inode vfs_inode; + u64 li_data[LOGFS_EMBEDDED_FIELDS]; + u64 li_used_bytes; + struct list_head li_freeing_list; + struct logfs_block *li_block; + u32 li_flags; + u8 li_height; + int li_refcount; +}; + +#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) +#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) +#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) + +/* compr.c */ +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen); +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen); +int __init logfs_compr_init(void); +void logfs_compr_exit(void); + +/* dev_bdev.c */ +#ifdef CONFIG_BLOCK +int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname); +#else +static inline int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname) +{ + return -ENODEV; +} +#endif + +/* dev_mtd.c */ +#ifdef CONFIG_MTD +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); +#else +static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) +{ + return -ENODEV; +} +#endif + +/* dir.c */ +extern const struct inode_operations logfs_symlink_iops; +extern const struct inode_operations logfs_dir_iops; +extern const struct file_operations logfs_dir_fops; +int logfs_replay_journal(struct super_block *sb); + +/* file.c */ +extern const struct inode_operations logfs_reg_iops; +extern const struct file_operations logfs_reg_fops; +extern const struct address_space_operations logfs_reg_aops; +int logfs_readpage(struct file *file, struct page *page); +long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int logfs_fsync(struct file *file, int datasync); + +/* gc.c */ +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); +void logfs_gc_pass(struct super_block *sb); +int logfs_check_areas(struct super_block *sb); +int logfs_init_gc(struct super_block *sb); +void logfs_cleanup_gc(struct super_block *sb); + +/* inode.c */ +extern const struct super_operations logfs_super_operations; +struct inode *logfs_iget(struct super_block *sb, ino_t ino); +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); +void logfs_safe_iput(struct inode *inode, int cookie); +struct inode *logfs_new_inode(struct inode *dir, int mode); +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); +int logfs_init_inode_cache(void); +void logfs_destroy_inode_cache(void); +void logfs_set_blocks(struct inode *inode, u64 no); +/* these logically belong into inode.c but actually reside in readwrite.c */ +int logfs_read_inode(struct inode *inode); +int __logfs_write_inode(struct inode *inode, long flags); +void logfs_evict_inode(struct inode *inode); + +/* journal.c */ +void logfs_write_anchor(struct super_block *sb); +int logfs_init_journal(struct super_block *sb); +void logfs_cleanup_journal(struct super_block *sb); +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); 
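+/*
+ * Editor's note (illustrative sketch, not part of the original patch): the
+ * journal entry points declared above are driven from the mount, commit and
+ * unmount paths.  The sketch below shows the expected ordering, inferred from
+ * journal.c earlier in this patch; example_journal_lifecycle() and the
+ * journal_needs_wl condition are hypothetical names used only for
+ * illustration.
+ */
+#if 0	/* sketch only, never compiled */
+static int example_journal_lifecycle(struct super_block *sb)
+{
+	int err;
+
+	err = logfs_init_journal(sb);	/* mount: read journal, set area ops */
+	if (err)
+		return err;
+	logfs_write_anchor(sb);		/* commit: write anchor and commit JE */
+	if (journal_needs_wl)		/* assumed condition, see wl pass */
+		do_logfs_journal_wl_pass(sb);
+	logfs_cleanup_journal(sb);	/* unmount: free journal buffers */
+	return 0;
+}
+#endif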
+void do_logfs_journal_wl_pass(struct super_block *sb); + +/* readwrite.c */ +pgoff_t logfs_pack_index(u64 bix, level_t level); +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level); +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree); +int logfs_readpage_nolock(struct page *page); +int logfs_write_buf(struct inode *inode, struct page *page, long flags); +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree); +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags); +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level); +int logfs_truncate(struct inode *inode, u64 size); +u64 logfs_seek_hole(struct inode *inode, u64 bix); +u64 logfs_seek_data(struct inode *inode, u64 bix); +int logfs_open_segfile(struct super_block *sb); +int logfs_init_rw(struct super_block *sb); +void logfs_cleanup_rw(struct super_block *sb); +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_write_block(struct logfs_block *block, long flags); +int logfs_write_obj_aliases_pagecache(struct super_block *sb); +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se); +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment); +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level); +void logfs_set_segment_reserved(struct super_block *sb, u32 segno); +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec); +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level); +void __free_block(struct super_block *sb, struct logfs_block *block); +void btree_write_block(struct logfs_block *block); +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty); +int logfs_exist_block(struct inode *inode, u64 bix); +int get_page_reserve(struct inode *inode, struct page *page); +extern struct logfs_block_ops indirect_block_ops; + +/* segment.c */ +int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); +int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, + level_t level); +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow); +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow); +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count); +void move_page_to_btree(struct page *page); +int logfs_init_mapping(struct super_block *sb); +void logfs_sync_area(struct logfs_area *area); +void logfs_sync_segments(struct super_block *sb); +void freeseg(struct super_block *sb, u32 segno); + +/* area handling */ +int logfs_init_areas(struct super_block *sb); +void logfs_cleanup_areas(struct super_block *sb); +int logfs_open_area(struct logfs_area *area, size_t bytes); +int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler); + +static inline int logfs_buf_write(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + return __logfs_buf_write(area, ofs, buf, len, 0); +} + +static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs, + void *buf, 
size_t len) +{ + return __logfs_buf_write(area, ofs, buf, len, 1); +} + +/* super.c */ +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); +void emergency_read_end(struct page *page); +void logfs_crash_dump(struct super_block *sb); +void *memchr_inv(const void *s, int c, size_t n); +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); +int logfs_check_ds(struct logfs_disk_super *ds); +int logfs_write_sb(struct super_block *sb); + +static inline struct logfs_super *logfs_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct logfs_inode *logfs_inode(struct inode *inode) +{ + return container_of(inode, struct logfs_inode, vfs_inode); +} + +static inline void logfs_set_ro(struct super_block *sb) +{ + logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO; +} + +#define LOGFS_BUG(sb) do { \ + struct super_block *__sb = sb; \ + logfs_crash_dump(__sb); \ + logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \ + BUG(); \ +} while (0) + +#define LOGFS_BUG_ON(condition, sb) \ + do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0) + +static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) +{ + return cpu_to_be32(crc32(~0, data+skip, len-skip)); +} + +static inline u8 logfs_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +static inline pgoff_t logfs_index(struct super_block *sb, u64 pos) +{ + return pos >> sb->s_blocksize_bits; +} + +static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs) +{ + return ((u64)segno << logfs_super(sb)->s_segshift) + ofs; +} + +static inline u32 seg_no(struct super_block *sb, u64 ofs) +{ + return ofs >> logfs_super(sb)->s_segshift; +} + +static inline u32 seg_ofs(struct super_block *sb, u64 ofs) +{ + return ofs & logfs_super(sb)->s_segmask; +} + +static inline u64 seg_align(struct super_block *sb, u64 ofs) +{ + return ofs & ~logfs_super(sb)->s_segmask; +} + +static inline struct logfs_block *logfs_block(struct page *page) +{ + return (void *)page->private; +} + +static inline level_t shrink_level(gc_level_t __level) +{ + u8 level = (__force u8)__level; + + if (level >= LOGFS_MAX_LEVELS) + level -= LOGFS_MAX_LEVELS; + return (__force level_t)level; +} + +static inline gc_level_t expand_level(u64 ino, level_t __level) +{ + u8 level = (__force u8)__level; + + if (ino == LOGFS_INO_MASTER) { + /* ifile has separate areas */ + level += LOGFS_MAX_LEVELS; + } + return (__force gc_level_t)level; +} + +static inline int logfs_block_shift(struct super_block *sb, level_t level) +{ + level = shrink_level((__force gc_level_t)level); + return (__force int)level * (sb->s_blocksize_bits - 3); +} + +static inline u64 logfs_block_mask(struct super_block *sb, level_t level) +{ + return ~0ull << logfs_block_shift(sb, level); +} + +static inline struct logfs_area *get_area(struct super_block *sb, + gc_level_t gc_level) +{ + return logfs_super(sb)->s_area[(__force u8)gc_level]; +} + +static inline void logfs_mempool_destroy(mempool_t *pool) +{ + if (pool) + mempool_destroy(pool); +} + +#endif diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h new file mode 100644 index 00000000..ae960519 --- /dev/null +++ b/fs/logfs/logfs_abi.h @@ -0,0 +1,629 @@ +/* + * fs/logfs/logfs_abi.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Public header for logfs. 
+ */ +#ifndef FS_LOGFS_LOGFS_ABI_H +#define FS_LOGFS_LOGFS_ABI_H + +/* For out-of-kernel compiles */ +#ifndef BUILD_BUG_ON +#define BUILD_BUG_ON(condition) /**/ +#endif + +#define SIZE_CHECK(type, size) \ +static inline void check_##type(void) \ +{ \ + BUILD_BUG_ON(sizeof(struct type) != (size)); \ +} + +/* + * Throughout the logfs code, we're constantly dealing with blocks at + * various positions or offsets. To remove confusion, we stricly + * distinguish between a "position" - the logical position within a + * file and an "offset" - the physical location within the device. + * + * Any usage of the term offset for a logical location or position for + * a physical one is a bug and should get fixed. + */ + +/* + * Block are allocated in one of several segments depending on their + * level. The following levels are used: + * 0 - regular data block + * 1 - i1 indirect blocks + * 2 - i2 indirect blocks + * 3 - i3 indirect blocks + * 4 - i4 indirect blocks + * 5 - i5 indirect blocks + * 6 - ifile data blocks + * 7 - ifile i1 indirect blocks + * 8 - ifile i2 indirect blocks + * 9 - ifile i3 indirect blocks + * 10 - ifile i4 indirect blocks + * 11 - ifile i5 indirect blocks + * Potential levels to be used in the future: + * 12 - gc recycled blocks, long-lived data + * 13 - replacement blocks, short-lived data + * + * Levels 1-11 are necessary for robust gc operations and help separate + * short-lived metadata from longer-lived file data. In the future, + * file data should get separated into several segments based on simple + * heuristics. Old data recycled during gc operation is expected to be + * long-lived. New data is of uncertain life expectancy. New data + * used to replace older blocks in existing files is expected to be + * short-lived. + */ + + +/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */ +#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull +#define LOGFS_MAGIC_U32 0xc97e8168u + +/* + * Various blocksize related macros. Blocksize is currently fixed at 4KiB. + * Sooner or later that should become configurable and the macros replaced + * by something superblock-dependent. Pointers in indirect blocks are and + * will remain 64bit. + * + * LOGFS_BLOCKSIZE - self-explaining + * LOGFS_BLOCK_FACTOR - number of pointers per indirect block + * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts + */ +#define LOGFS_BLOCKSIZE (4096ull) +#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64)) +#define LOGFS_BLOCK_BITS (9) + +/* + * Number of blocks at various levels of indirection. There are 16 direct + * block pointers plus a single indirect pointer. + */ +#define I0_BLOCKS (16) +#define I1_BLOCKS LOGFS_BLOCK_FACTOR +#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS) +#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS) +#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS) +#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS) + +#define INDIRECT_INDEX I0_BLOCKS +#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1) + +/* + * Sizes at which files require another level of indirection. Files smaller + * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself, + * similar like ext2 fast symlinks. + * + * Data at a position smaller than LOGFS_I0_SIZE is accessed through the + * direct pointers, else through the 1x indirect pointer and so forth. 
+ */ +#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64)) +#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE) + +/* + * Each indirect block pointer must have this flag set, if all block pointers + * behind it are set, i.e. there is no hole hidden in the shadow of this + * indirect block pointer. + */ +#define LOGFS_FULLY_POPULATED (1ULL << 63) +#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) + +/* + * LogFS needs to separate data into levels. Each level is defined as the + * maximal possible distance from the master inode (inode of the inode file). + * Data blocks reside on level 0, 1x indirect block on level 1, etc. + * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. + * This effort is necessary to guarantee garbage collection to always make + * progress. + * + * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks, + * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is + * the maximal number of levels for one file. + * LOGFS_NO_AREAS is twice that, as the inode file and regular files are + * effectively stacked on top of each other. + */ +#define LOGFS_MAX_INDIRECT (5) +#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1) +#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS) + +/* Maximum size of filenames */ +#define LOGFS_MAX_NAMELEN (255) + +/* Number of segments in the primary journal. */ +#define LOGFS_JOURNAL_SEGS (16) + +/* Maximum number of free/erased/etc. segments in journal entries */ +#define MAX_CACHED_SEGS (64) + + +/* + * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store, + * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including + * its header, + * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for + * its segment header and the padded space at the end when no further objects + * fit. 
+ */ +#define LOGFS_OBJECT_HEADERSIZE (0x1c) +#define LOGFS_SEGMENT_HEADERSIZE (0x18) +#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE) +#define LOGFS_SEGMENT_RESERVE \ + (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1) + +/* + * Segment types: + * SEG_SUPER - Data or indirect block + * SEG_JOURNAL - Inode + * SEG_OSTORE - Dentry + */ +enum { + SEG_SUPER = 0x01, + SEG_JOURNAL = 0x02, + SEG_OSTORE = 0x03, +}; + +/** + * struct logfs_segment_header - per-segment header in the ostore + * + * @crc: crc32 of header (there is no data) + * @pad: unused, must be 0 + * @type: segment type, see above + * @level: GC level for all objects in this segment + * @segno: segment number + * @ec: erase count for this segment + * @gec: global erase count at time of writing + */ +struct logfs_segment_header { + __be32 crc; + __be16 pad; + __u8 type; + __u8 level; + __be32 segno; + __be32 ec; + __be64 gec; +}; + +SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); + +#define LOGFS_FEATURES_INCOMPAT (0ull) +#define LOGFS_FEATURES_RO_COMPAT (0ull) +#define LOGFS_FEATURES_COMPAT (0ull) + +/** + * struct logfs_disk_super - on-medium superblock + * + * @ds_magic: magic number, must equal LOGFS_MAGIC + * @ds_crc: crc32 of structure starting with the next field + * @ds_ifile_levels: maximum number of levels for ifile + * @ds_iblock_levels: maximum number of levels for regular files + * @ds_data_levels: number of separate levels for data + * @pad0: reserved, must be 0 + * @ds_feature_incompat: incompatible filesystem features + * @ds_feature_ro_compat: read-only compatible filesystem features + * @ds_feature_compat: compatible filesystem features + * @ds_flags: flags + * @ds_segment_shift: log2 of segment size + * @ds_block_shift: log2 of block size + * @ds_write_shift: log2 of write size + * @pad1: reserved, must be 0 + * @ds_journal_seg: segments used by primary journal + * @ds_root_reserve: bytes reserved for the superuser + * @ds_speed_reserve: bytes reserved to speed up GC + * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks + * @pad2: reserved, must be 0 + * @pad3: reserved, must be 0 + * + * Contains only read-only fields. Read-write fields like the amount of used + * space is tracked in the dynamic superblock, which is stored in the journal. 
+ */ +struct logfs_disk_super { + struct logfs_segment_header ds_sh; + __be64 ds_magic; + + __be32 ds_crc; + __u8 ds_ifile_levels; + __u8 ds_iblock_levels; + __u8 ds_data_levels; + __u8 ds_segment_shift; + __u8 ds_block_shift; + __u8 ds_write_shift; + __u8 pad0[6]; + + __be64 ds_filesystem_size; + __be32 ds_segment_size; + __be32 ds_bad_seg_reserve; + + __be64 ds_feature_incompat; + __be64 ds_feature_ro_compat; + + __be64 ds_feature_compat; + __be64 ds_feature_flags; + + __be64 ds_root_reserve; + __be64 ds_speed_reserve; + + __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; + + __be64 ds_super_ofs[2]; + __be64 pad3[8]; +}; + +SIZE_CHECK(logfs_disk_super, 256); + +/* + * Object types: + * OBJ_BLOCK - Data or indirect block + * OBJ_INODE - Inode + * OBJ_DENTRY - Dentry + */ +enum { + OBJ_BLOCK = 0x04, + OBJ_INODE = 0x05, + OBJ_DENTRY = 0x06, +}; + +/** + * struct logfs_object_header - per-object header in the ostore + * + * @crc: crc32 of header, excluding data_crc + * @len: length of data + * @type: object type, see above + * @compr: compression type + * @ino: inode number + * @bix: block index + * @data_crc: crc32 of payload + */ +struct logfs_object_header { + __be32 crc; + __be16 len; + __u8 type; + __u8 compr; + __be64 ino; + __be64 bix; + __be32 data_crc; +} __attribute__((packed)); + +SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); + +/* + * Reserved inode numbers: + * LOGFS_INO_MASTER - master inode (for inode file) + * LOGFS_INO_ROOT - root directory + * LOGFS_INO_SEGFILE - per-segment used bytes and erase count + */ +enum { + LOGFS_INO_MAPPING = 0x00, + LOGFS_INO_MASTER = 0x01, + LOGFS_INO_ROOT = 0x02, + LOGFS_INO_SEGFILE = 0x03, + LOGFS_RESERVED_INOS = 0x10, +}; + +/* + * Inode flags. High bits should never be written to the medium. They are + * reserved for in-memory usage. + * Low bits should either remain in sync with the corresponding FS_*_FL or + * reuse slots that obviously don't make sense for logfs. 
+ * + * LOGFS_IF_DIRTY Inode must be written back + * LOGFS_IF_ZOMBIE Inode has been deleted + * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode + */ +#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */ +#define LOGFS_IF_DIRTY 0x20000000 +#define LOGFS_IF_ZOMBIE 0x40000000 +#define LOGFS_IF_STILLBORN 0x80000000 + +/* Flags available to chattr */ +#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED) +#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED) +/* Flags inherited from parent directory on file/directory creation */ +#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED) + +/** + * struct logfs_disk_inode - on-medium inode + * + * @di_mode: file mode + * @di_pad: reserved, must be 0 + * @di_flags: inode flags, see above + * @di_uid: user id + * @di_gid: group id + * @di_ctime: change time + * @di_mtime: modify time + * @di_refcount: reference count (aka nlink or link count) + * @di_generation: inode generation, for nfs + * @di_used_bytes: number of bytes used + * @di_size: file size + * @di_data: data pointers + */ +struct logfs_disk_inode { + __be16 di_mode; + __u8 di_height; + __u8 di_pad; + __be32 di_flags; + __be32 di_uid; + __be32 di_gid; + + __be64 di_ctime; + __be64 di_mtime; + + __be64 di_atime; + __be32 di_refcount; + __be32 di_generation; + + __be64 di_used_bytes; + __be64 di_size; + + __be64 di_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_disk_inode, 200); + +#define INODE_POINTER_OFS \ + (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64)) +#define INODE_USED_OFS \ + (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64)) +#define INODE_SIZE_OFS \ + (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64)) +#define INODE_HEIGHT_OFS (0) + +/** + * struct logfs_disk_dentry - on-medium dentry structure + * + * @ino: inode number + * @namelen: length of file name + * @type: file type, identical to bits 12..15 of mode + * @name: file name + */ +/* FIXME: add 6 bytes of padding to remove the __packed */ +struct logfs_disk_dentry { + __be64 ino; + __be16 namelen; + __u8 type; + __u8 name[LOGFS_MAX_NAMELEN]; +} __attribute__((packed)); + +SIZE_CHECK(logfs_disk_dentry, 266); + +#define RESERVED 0xffffffff +#define BADSEG 0xffffffff +/** + * struct logfs_segment_entry - segment file entry + * + * @ec_level: erase count and level + * @valid: number of valid bytes + * + * Segment file contains one entry for every segment. ec_level contains the + * erasecount in the upper 28 bits and the level in the lower 4 bits. An + * ec_level of BADSEG (-1) identifies bad segments. valid contains the number + * of valid bytes or RESERVED (-1 again) if the segment is used for either the + * superblock or the journal, or when the segment is bad. + */ +struct logfs_segment_entry { + __be32 ec_level; + __be32 valid; +}; + +SIZE_CHECK(logfs_segment_entry, 8); + +/** + * struct logfs_journal_header - header for journal entries (JEs) + * + * @h_crc: crc32 of journal entry + * @h_len: length of compressed journal entry, + * not including header + * @h_datalen: length of uncompressed data + * @h_type: JE type + * @h_compr: compression type + * @h_pad: reserved + */ +struct logfs_journal_header { + __be32 h_crc; + __be16 h_len; + __be16 h_datalen; + __be16 h_type; + __u8 h_compr; + __u8 h_pad[5]; +}; + +SIZE_CHECK(logfs_journal_header, 16); + +/* + * Life expectency of data. 
+ * VIM_DEFAULT - default vim + * VIM_SEGFILE - for segment file only - very short-living + * VIM_GC - GC'd data - likely long-living + */ +enum logfs_vim { + VIM_DEFAULT = 0, + VIM_SEGFILE = 1, +}; + +/** + * struct logfs_je_area - wbuf header + * + * @segno: segment number of area + * @used_bytes: number of bytes already used + * @gc_level: GC level + * @vim: life expectancy of data + * + * "Areas" are segments currently being used for writing. There is at least + * one area per GC level. Several may be used to separate long-living from + * short-living data. If an area with unknown vim is encountered, it can + * simply be closed. + * The write buffer immediately follow this header. + */ +struct logfs_je_area { + __be32 segno; + __be32 used_bytes; + __u8 gc_level; + __u8 vim; +} __attribute__((packed)); + +SIZE_CHECK(logfs_je_area, 10); + +#define MAX_JOURNAL_HEADER \ + (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area)) + +/** + * struct logfs_je_dynsb - dynamic superblock + * + * @ds_gec: global erase count + * @ds_sweeper: current position of GC "sweeper" + * @ds_rename_dir: source directory ino (see dir.c documentation) + * @ds_rename_pos: position of source dd (see dir.c documentation) + * @ds_victim_ino: victims of incomplete dir operation (see dir.c) + * @ds_victim_ino: parent inode of victim (see dir.c) + * @ds_used_bytes: number of used bytes + */ +struct logfs_je_dynsb { + __be64 ds_gec; + __be64 ds_sweeper; + + __be64 ds_rename_dir; + __be64 ds_rename_pos; + + __be64 ds_victim_ino; + __be64 ds_victim_parent; /* XXX */ + + __be64 ds_used_bytes; + __be32 ds_generation; + __be32 pad; +}; + +SIZE_CHECK(logfs_je_dynsb, 64); + +/** + * struct logfs_je_anchor - anchor of filesystem tree, aka master inode + * + * @da_size: size of inode file + * @da_last_ino: last created inode + * @da_used_bytes: number of bytes used + * @da_data: data pointers + */ +struct logfs_je_anchor { + __be64 da_size; + __be64 da_last_ino; + + __be64 da_used_bytes; + u8 da_height; + u8 pad[7]; + + __be64 da_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_je_anchor, 168); + +/** + * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal) + * + * @so_segment: segments used for 2nd journal + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_spillout { + __be64 so_segment[0]; +}; + +SIZE_CHECK(logfs_je_spillout, 0); + +/** + * struct logfs_je_journal_ec - erase counts for all journal segments + * + * @ec: erase count + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_journal_ec { + __be32 ec[0]; +}; + +SIZE_CHECK(logfs_je_journal_ec, 0); + +/** + * struct logfs_je_free_segments - list of free segmetns with erase count + */ +struct logfs_je_free_segments { + __be32 segno; + __be32 ec; +}; + +SIZE_CHECK(logfs_je_free_segments, 8); + +/** + * struct logfs_seg_alias - list of segment aliases + */ +struct logfs_seg_alias { + __be32 old_segno; + __be32 new_segno; +}; + +SIZE_CHECK(logfs_seg_alias, 8); + +/** + * struct logfs_obj_alias - list of object aliases + */ +struct logfs_obj_alias { + __be64 ino; + __be64 bix; + __be64 val; + u8 level; + u8 pad[5]; + __be16 child_no; +}; + +SIZE_CHECK(logfs_obj_alias, 32); + +/** + * Compression types. + * + * COMPR_NONE - uncompressed + * COMPR_ZLIB - compressed with zlib + */ +enum { + COMPR_NONE = 0, + COMPR_ZLIB = 1, +}; + +/* + * Journal entries come in groups of 16. 
First group contains unique + * entries, next groups contain one entry per level + * + * JE_FIRST - smallest possible journal entry number + * + * JEG_BASE - base group, containing unique entries + * JE_COMMIT - commit entry, validates all previous entries + * JE_DYNSB - dynamic superblock, anything that ought to be in the + * superblock but cannot because it is read-write data + * JE_ANCHOR - anchor aka master inode aka inode file's inode + * JE_ERASECOUNT erasecounts for all journal segments + * JE_SPILLOUT - unused + * JE_SEG_ALIAS - aliases segments + * JE_AREA - area description + * + * JE_LAST - largest possible journal entry number + */ +enum { + JE_FIRST = 0x01, + + JEG_BASE = 0x00, + JE_COMMIT = 0x02, + JE_DYNSB = 0x03, + JE_ANCHOR = 0x04, + JE_ERASECOUNT = 0x05, + JE_SPILLOUT = 0x06, + JE_OBJ_ALIAS = 0x0d, + JE_AREA = 0x0e, + + JE_LAST = 0x0e, +}; + +#endif diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c new file mode 100644 index 00000000..d8d09380 --- /dev/null +++ b/fs/logfs/readwrite.c @@ -0,0 +1,2277 @@ +/* + * fs/logfs/readwrite.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * + * Actually contains five sets of very similar functions: + * read read blocks from a file + * seek_hole find next hole + * seek_data find next data block + * valid check whether a block still belongs to a file + * write write blocks to a file + * delete delete a block (for directories and ifile) + * rewrite move existing blocks of a file to a new location (gc helper) + * truncate truncate a file + */ +#include "logfs.h" +#include +#include + +static u64 adjust_bix(u64 bix, level_t level) +{ + switch (level) { + case 0: + return bix; + case LEVEL(1): + return max_t(u64, bix, I0_BLOCKS); + case LEVEL(2): + return max_t(u64, bix, I1_BLOCKS); + case LEVEL(3): + return max_t(u64, bix, I2_BLOCKS); + case LEVEL(4): + return max_t(u64, bix, I3_BLOCKS); + case LEVEL(5): + return max_t(u64, bix, I4_BLOCKS); + default: + WARN_ON(1); + return bix; + } +} + +static inline u64 maxbix(u8 height) +{ + return 1ULL << (LOGFS_BLOCK_BITS * height); +} + +/** + * The inode address space is cut in two halves. Lower half belongs to data + * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is + * set, the actual block index (bix) and level can be derived from the page + * index. + * + * The lowest three bits of the block index are set to 0 after packing and + * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored + * anyway this is harmless. 
+ */ +#define ARCH_SHIFT (BITS_PER_LONG - 32) +#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT) +#define LEVEL_SHIFT (28 + ARCH_SHIFT) +static inline pgoff_t first_indirect_block(void) +{ + return INDIRECT_BIT | (1ULL << LEVEL_SHIFT); +} + +pgoff_t logfs_pack_index(u64 bix, level_t level) +{ + pgoff_t index; + + BUG_ON(bix >= INDIRECT_BIT); + if (level == 0) + return bix; + + index = INDIRECT_BIT; + index |= (__force long)level << LEVEL_SHIFT; + index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS); + return index; +} + +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level) +{ + u8 __level; + + if (!(index & INDIRECT_BIT)) { + *bix = index; + *level = 0; + return; + } + + __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT; + *level = LEVEL(__level); + *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT; + *bix = adjust_bix(*bix, *level); + return; +} +#undef ARCH_SHIFT +#undef INDIRECT_BIT +#undef LEVEL_SHIFT + +/* + * Time is stored as nanoseconds since the epoch. + */ +static struct timespec be64_to_timespec(__be64 betime) +{ + return ns_to_timespec(be64_to_cpu(betime)); +} + +static __be64 timespec_to_be64(struct timespec tsp) +{ + return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec); +} + +static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + inode->i_mode = be16_to_cpu(di->di_mode); + li->li_height = di->di_height; + li->li_flags = be32_to_cpu(di->di_flags); + inode->i_uid = be32_to_cpu(di->di_uid); + inode->i_gid = be32_to_cpu(di->di_gid); + inode->i_size = be64_to_cpu(di->di_size); + logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); + inode->i_atime = be64_to_timespec(di->di_atime); + inode->i_ctime = be64_to_timespec(di->di_ctime); + inode->i_mtime = be64_to_timespec(di->di_mtime); + inode->i_nlink = be32_to_cpu(di->di_refcount); + inode->i_generation = be32_to_cpu(di->di_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + inode->i_rdev = be64_to_cpu(di->di_data[0]); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(di->di_data[i]); + break; + default: + BUG(); + } +} + +static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + di->di_mode = cpu_to_be16(inode->i_mode); + di->di_height = li->li_height; + di->di_pad = 0; + di->di_flags = cpu_to_be32(li->li_flags); + di->di_uid = cpu_to_be32(inode->i_uid); + di->di_gid = cpu_to_be32(inode->i_gid); + di->di_size = cpu_to_be64(i_size_read(inode)); + di->di_used_bytes = cpu_to_be64(li->li_used_bytes); + di->di_atime = timespec_to_be64(inode->i_atime); + di->di_ctime = timespec_to_be64(inode->i_ctime); + di->di_mtime = timespec_to_be64(inode->i_mtime); + di->di_refcount = cpu_to_be32(inode->i_nlink); + di->di_generation = cpu_to_be32(inode->i_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + di->di_data[0] = cpu_to_be64(inode->i_rdev); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + di->di_data[i] = cpu_to_be64(li->li_data[i]); + break; + default: + BUG(); + } +} + 
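+/*
+ * Editor's note (worked example, not part of the original patch): with the
+ * 4KiB blocksize (LOGFS_BLOCK_BITS == 9), a level-2 indirect block covering
+ * data block index 0x40000 packs and unpacks as shown below.  The lowest
+ * level*9 bits of bix are dropped by packing, which is harmless as explained
+ * in the comment above logfs_pack_index().  example_pack_unpack() is a
+ * hypothetical name used only for illustration.
+ */
+#if 0	/* sketch only, never compiled */
+static void example_pack_unpack(void)
+{
+	u64 bix = 0x40000;		/* 262144th data block */
+	level_t level = LEVEL(2);
+	pgoff_t index = logfs_pack_index(bix, level);
+	u64 bix2;
+	level_t level2;
+
+	/* index == INDIRECT_BIT | (2 << LEVEL_SHIFT) | (0x40000 >> 18) */
+	logfs_unpack_index(index, &bix2, &level2);
+	/* bix2 == 0x40000, level2 == LEVEL(2) */
+}
+#endif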
+static void __logfs_set_blocks(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + + inode->i_blocks = ULONG_MAX; + if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) + inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; +} + +void logfs_set_blocks(struct inode *inode, u64 bytes) +{ + struct logfs_inode *li = logfs_inode(inode); + + li->li_used_bytes = bytes; + __logfs_set_blocks(inode); +} + +static void prelock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) { + BUG_ON(PagePreLocked(page)); + SetPagePreLocked(page); + } else { + /* We are in GC path. */ + if (PagePreLocked(page)) + super->s_lock_count++; + else + SetPagePreLocked(page); + } +} + +static void preunlock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) + ClearPagePreLocked(page); + else { + /* We are in GC path. */ + BUG_ON(!PagePreLocked(page)); + if (super->s_lock_count) + super->s_lock_count--; + else + ClearPagePreLocked(page); + } +} + +/* + * Logfs is prone to an AB-BA deadlock where one task tries to acquire + * s_write_mutex with a locked page and GC tries to get that page while holding + * s_write_mutex. + * To solve this issue logfs will ignore the page lock iff the page in question + * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked + * in addition to PG_locked. + */ +static void logfs_get_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + prelock_page(sb, page, lock); + + if (lock) { + mutex_lock(&super->s_write_mutex); + logfs_gc_pass(sb); + /* FIXME: We also have to check for shadowed space + * and mempool fill grade */ + } +} + +static void logfs_put_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + preunlock_page(sb, page, lock); + /* Order matters - we must clear PG_pre_locked before releasing + * s_write_mutex or we could race against another task. */ + if (lock) + mutex_unlock(&super->s_write_mutex); +} + +static struct page *logfs_get_read_page(struct inode *inode, u64 bix, + level_t level) +{ + return find_or_create_page(inode->i_mapping, + logfs_pack_index(bix, level), GFP_NOFS); +} + +static void logfs_put_read_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + +static void logfs_lock_write_page(struct page *page) +{ + int loop = 0; + + while (unlikely(!trylock_page(page))) { + if (loop++ > 0x1000) { + /* Has been observed once so far... */ + printk(KERN_ERR "stack at %p\n", &loop); + BUG(); + } + if (PagePreLocked(page)) { + /* Holder of page lock is waiting for us, it + * is safe to use this page. */ + break; + } + /* Some other process has this page locked and has + * nothing to do with us. Wait for it to finish. 
+ */ + schedule(); + } + BUG_ON(!PageLocked(page)); +} + +static struct page *logfs_get_write_page(struct inode *inode, u64 bix, + level_t level) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index = logfs_pack_index(bix, level); + struct page *page; + int err; + +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + return NULL; + } + } else logfs_lock_write_page(page); + BUG_ON(!PageLocked(page)); + return page; +} + +static void logfs_unlock_write_page(struct page *page) +{ + if (!PagePreLocked(page)) + unlock_page(page); +} + +static void logfs_put_write_page(struct page *page) +{ + logfs_unlock_write_page(page); + page_cache_release(page); +} + +static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level, + int rw) +{ + if (rw == READ) + return logfs_get_read_page(inode, bix, level); + else + return logfs_get_write_page(inode, bix, level); +} + +static void logfs_put_page(struct page *page, int rw) +{ + if (rw == READ) + logfs_put_read_page(page); + else + logfs_put_write_page(page); +} + +static unsigned long __get_bits(u64 val, int skip, int no) +{ + u64 ret = val; + + ret >>= skip * no; + ret <<= 64 - no; + ret >>= 64 - no; + return ret; +} + +static unsigned long get_bits(u64 val, level_t skip) +{ + return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS); +} + +static inline void init_shadow_tree(struct super_block *sb, + struct shadow_tree *tree) +{ + struct logfs_super *super = logfs_super(sb); + + btree_init_mempool64(&tree->new, super->s_btree_pool); + btree_init_mempool64(&tree->old, super->s_btree_pool); +} + +static void indirect_write_block(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + int ret; + + page = block->page; + inode = page->mapping->host; + logfs_lock_write_page(page); + ret = logfs_write_buf(inode, page, 0); + logfs_unlock_write_page(page); + /* + * This needs some rework. Unless you want your filesystem to run + * completely synchronously (you don't), the filesystem will always + * report writes as 'successful' before the actual work has been + * done. The actual work gets done here and this is where any errors + * will show up. And there isn't much we can do about it, really. + * + * Some attempts to fix the errors (move from bad blocks, retry io,...) + * have already been done, so anything left should be either a broken + * device or a bug somewhere in logfs itself. Being relatively new, + * the odds currently favor a bug, so for now the line below isn't + * entirely tasteles. + */ + BUG_ON(ret); +} + +static void inode_write_block(struct logfs_block *block) +{ + struct inode *inode; + int ret; + + inode = block->inode; + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode->i_sb); + else { + ret = __logfs_write_inode(inode, 0); + /* see indirect_write_block comment */ + BUG_ON(ret); + } +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. 
+ */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static __be64 inode_val0(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 val; + + /* + * Explicit shifting generates good code, but must match the format + * of the structure. Add some paranoia just in case. + */ + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4); + + val = (u64)inode->i_mode << 48 | + (u64)li->li_height << 40 | + (u64)li->li_flags; + return cpu_to_be64(val); +} + +static int inode_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + struct inode *inode = block->inode; + struct logfs_inode *li = logfs_inode(inode); + unsigned long pos; + u64 ino , bix; + __be64 val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS) + return 0; + + switch (pos) { + case INODE_HEIGHT_OFS: + val = inode_val0(inode); + break; + case INODE_USED_OFS: + val = cpu_to_be64(li->li_used_bytes); + break; + case INODE_SIZE_OFS: + val = cpu_to_be64(i_size_read(inode)); + break; + case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1: + val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]); + break; + default: + BUG(); + } + + ino = LOGFS_INO_MASTER; + bix = inode->i_ino; + level = LEVEL(0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +static int indirect_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + unsigned long pos; + struct page *page = block->page; + u64 ino , bix; + __be64 *child, val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + return 0; + + ino = page->mapping->host->i_ino; + logfs_unpack_index(page->index, &bix, &level); + child = kmap_atomic(page, KM_USER0); + val = child[pos]; + kunmap_atomic(child, KM_USER0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +int logfs_write_obj_aliases_pagecache(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int err; + + list_for_each_entry(block, &super->s_object_alias, alias_list) { + err = block->ops->write_alias(sb, block, write_alias_journal); + if (err) + return err; + } + return 0; +} + +void __free_block(struct super_block *sb, struct logfs_block *block) +{ + BUG_ON(!list_empty(&block->item_list)); + list_del(&block->alias_list); + mempool_free(block, logfs_super(sb)->s_block_pool); +} + +static void inode_free_block(struct super_block *sb, struct logfs_block *block) +{ + struct inode *inode = block->inode; + + logfs_inode(inode)->li_block = NULL; + __free_block(sb, block); +} + +static void indirect_free_block(struct super_block *sb, + struct logfs_block *block) +{ + ClearPagePrivate(block->page); + block->page->private = 0; + __free_block(sb, block); +} + + +static struct logfs_block_ops inode_block_ops = { + .write_block = inode_write_block, + .free_block = inode_free_block, + .write_alias = inode_write_alias, +}; + +struct logfs_block_ops indirect_block_ops = { + .write_block = indirect_write_block, + .free_block = indirect_free_block, + 
.write_alias = indirect_write_alias, +}; + +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + + block = mempool_alloc(super->s_block_pool, GFP_NOFS); + memset(block, 0, sizeof(*block)); + INIT_LIST_HEAD(&block->alias_list); + INIT_LIST_HEAD(&block->item_list); + block->sb = sb; + block->ino = ino; + block->bix = bix; + block->level = level; + return block; +} + +static void alloc_inode_block(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block; + + if (li->li_block) + return; + + block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0); + block->inode = inode; + li->li_block = block; + block->ops = &inode_block_ops; +} + +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty) +{ + u64 ptr; + int i, start; + + block->partial = 0; + block->full = 0; + start = 0; + if (page->index < first_indirect_block()) { + /* Counters are pointless on level 0 */ + return; + } + if (page->index == first_indirect_block()) { + /* Skip unused pointers */ + start = I0_BLOCKS; + block->full = I0_BLOCKS; + } + if (!page_is_empty) { + for (i = start; i < LOGFS_BLOCK_FACTOR; i++) { + ptr = be64_to_cpu(array[i]); + if (ptr) + block->partial++; + if (ptr & LOGFS_FULLY_POPULATED) + block->full++; + } + } +} + +static void alloc_data_block(struct inode *inode, struct page *page) +{ + struct logfs_block *block; + u64 bix; + level_t level; + + if (PagePrivate(page)) + return; + + logfs_unpack_index(page->index, &bix, &level); + block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; +} + +static void alloc_indirect_block(struct inode *inode, struct page *page, + int page_is_empty) +{ + struct logfs_block *block; + __be64 *array; + + if (PagePrivate(page)) + return; + + alloc_data_block(inode, page); + + block = logfs_block(page); + array = kmap_atomic(page, KM_USER0); + initialize_block_counters(page, block, array, page_is_empty); + kunmap_atomic(array, KM_USER0); +} + +static void block_set_pointer(struct page *page, int index, u64 ptr) +{ + struct logfs_block *block = logfs_block(page); + __be64 *array; + u64 oldptr; + + BUG_ON(!block); + array = kmap_atomic(page, KM_USER0); + oldptr = be64_to_cpu(array[index]); + array[index] = cpu_to_be64(ptr); + kunmap_atomic(array, KM_USER0); + SetPageUptodate(page); + + block->full += !!(ptr & LOGFS_FULLY_POPULATED) + - !!(oldptr & LOGFS_FULLY_POPULATED); + block->partial += !!ptr - !!oldptr; +} + +static u64 block_get_pointer(struct page *page, int index) +{ + __be64 *block; + u64 ptr; + + block = kmap_atomic(page, KM_USER0); + ptr = be64_to_cpu(block[index]); + kunmap_atomic(block, KM_USER0); + return ptr; +} + +static int logfs_read_empty(struct page *page) +{ + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + return 0; +} + +static int logfs_read_direct(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + pgoff_t index = page->index; + u64 block; + + block = li->li_data[index]; + if (!block) + return logfs_read_empty(page); + + return logfs_segment_read(inode, page, block, index, 0); +} + +static int logfs_read_loop(struct inode *inode, struct page *page, + int rw_context) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bix, bofs = li->li_data[INDIRECT_INDEX]; + level_t 
level, target_level; + int ret; + struct page *ipage; + + logfs_unpack_index(page->index, &bix, &target_level); + if (!bofs) + return logfs_read_empty(page); + + if (bix >= maxbix(li->li_height)) + return logfs_read_empty(page); + + for (level = LEVEL(li->li_height); + (__force u8)level > (__force u8)target_level; + level = SUBLEVEL(level)){ + ipage = logfs_get_page(inode, bix, level, rw_context); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_page(ipage, rw_context); + if (!bofs) + return logfs_read_empty(page); + } + + return logfs_segment_read(inode, page, bofs, bix, 0); +} + +static int logfs_read_block(struct inode *inode, struct page *page, + int rw_context) +{ + pgoff_t index = page->index; + + if (index < I0_BLOCKS) + return logfs_read_direct(inode, page); + return logfs_read_loop(inode, page, rw_context); +} + +static int logfs_exist_loop(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret; + struct page *ipage; + + if (!bofs) + return 0; + if (bix >= maxbix(li->li_height)) + return 0; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + ipage = logfs_get_read_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_read_page(ipage); + if (!bofs) + return 0; + } + + return 1; +} + +int logfs_exist_block(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) + return !!li->li_data[bix]; + return logfs_exist_loop(inode, bix); +} + +static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + + for (; bix < I0_BLOCKS; bix++) + if (data ^ (li->li_data[bix] == 0)) + return bix; + return I0_BLOCKS; +} + +static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + __be64 *rblock; + u64 increment, bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret, slot; + struct page *page; + + BUG_ON(!bofs); + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1)); + page = logfs_get_read_page(inode, bix, level); + if (!page) + return bix; + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_read_page(page); + return bix; + } + + slot = get_bits(bix, SUBLEVEL(level)); + rblock = kmap_atomic(page, KM_USER0); + while (slot < LOGFS_BLOCK_FACTOR) { + if (data && (rblock[slot] != 0)) + break; + if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) + break; + slot++; + bix += increment; + bix &= ~(increment - 1); + } + if (slot >= LOGFS_BLOCK_FACTOR) { + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + return bix; + } + bofs = be64_to_cpu(rblock[slot]); + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + if (!bofs) { + BUG_ON(data); + return bix; + } + } + return bix; +} + +/** + * logfs_seek_hole - find next hole starting at a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next hole. 
If the file doesn't contain any further holes, the + * block address next to eof is returned instead. + */ +u64 logfs_seek_hole(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 0); + if (bix < I0_BLOCKS) + return bix; + } + + if (!li->li_data[INDIRECT_INDEX]) + return bix; + else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) + bix = maxbix(li->li_height); + else if (bix >= maxbix(li->li_height)) + return bix; + else { + bix = seek_holedata_loop(inode, bix, 0); + if (bix < maxbix(li->li_height)) + return bix; + /* Should not happen anymore. But if some port writes semi- + * corrupt images (as this one used to) we might run into it. + */ + WARN_ON_ONCE(bix == maxbix(li->li_height)); + } + + return bix; +} + +static u64 __logfs_seek_data(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 1); + if (bix < I0_BLOCKS) + return bix; + } + + if (bix < maxbix(li->li_height)) { + if (!li->li_data[INDIRECT_INDEX]) + bix = maxbix(li->li_height); + else + return seek_holedata_loop(inode, bix, 1); + } + + return bix; +} + +/** + * logfs_seek_data - find next data block after a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next data block. If the file doesn't contain any further data + * blocks, the last block in the file is returned instead. + */ +u64 logfs_seek_data(struct inode *inode, u64 bix) +{ + struct super_block *sb = inode->i_sb; + u64 ret, end; + + ret = __logfs_seek_data(inode, bix); + end = i_size_read(inode) >> sb->s_blocksize_bits; + if (ret >= end) + ret = max(bix, end); + return ret; +} + +static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs) +{ + return pure_ofs(li->li_data[bix]) == ofs; +} + +static int __logfs_is_valid_loop(struct inode *inode, u64 bix, + u64 ofs, u64 bofs) +{ + struct logfs_inode *li = logfs_inode(inode); + level_t level; + int ret; + struct page *page; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){ + page = logfs_get_write_page(inode, bix, level); + BUG_ON(!page); + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_write_page(page); + return 0; + } + + bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level))); + logfs_put_write_page(page); + if (!bofs) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + } + return 0; +} + +static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + + if (!bofs) + return 0; + + if (bix >= maxbix(li->li_height)) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + + return __logfs_is_valid_loop(inode, bix, ofs, bofs); +} + +static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + + if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1) + return 0; + + if (bix < I0_BLOCKS) + return logfs_is_valid_direct(li, bix, ofs); + return logfs_is_valid_loop(inode, bix, ofs); +} + +/** + * logfs_is_valid_block - check whether this block is still valid + * + * @sb - superblock + * @ofs - block physical offset + * @ino - block inode number + * @bix - block index + * @level - block level + * + * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will + * become invalid once the journal is written. 
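+ *
+ * For illustration: an object that was rewritten since the last journal
+ * commit still has its old offset recorded in the shadow tree, so a
+ * lookup for that offset returns 2 and its space must not be reused
+ * yet; once the journal has been committed the shadow is dropped and
+ * the same lookup returns 0.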
+ */ +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + int ret, cookie; + + /* Umount closes a segment with free blocks remaining. Those + * blocks are by definition invalid. */ + if (ino == -1) + return 0; + + LOGFS_BUG_ON((u64)(u_long)ino != ino, sb); + + inode = logfs_safe_iget(sb, ino, &cookie); + if (IS_ERR(inode)) + goto invalid; + + ret = __logfs_is_valid_block(inode, bix, ofs); + logfs_safe_iput(inode, cookie); + if (ret) + return ret; + +invalid: + /* Block is nominally invalid, but may still sit in the shadow tree, + * waiting for a journal commit. + */ + if (btree_lookup64(&super->s_shadow_tree.old, ofs)) + return 2; + return 0; +} + +int logfs_readpage_nolock(struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EIO; + + ret = logfs_read_block(inode, page, READ); + + if (ret) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + flush_dcache_page(page); + + return ret; +} + +static int logfs_reserve_bytes(struct inode *inode, int bytes) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + u64 available = super->s_free_bytes + super->s_dirty_free_bytes + - super->s_dirty_used_bytes - super->s_dirty_pages; + + if (!bytes) + return 0; + + if (available < bytes) + return -ENOSPC; + + if (available < bytes + super->s_root_reserve && + !capable(CAP_SYS_RESOURCE)) + return -ENOSPC; + + return 0; +} + +int get_page_reserve(struct inode *inode, struct page *page) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_block *block = logfs_block(page); + int ret; + + if (block && block->reserved_bytes) + return 0; + + logfs_get_wblocks(inode->i_sb, page, WF_LOCK); + while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) && + !list_empty(&super->s_writeback_list)) { + block = list_entry(super->s_writeback_list.next, + struct logfs_block, alias_list); + block->ops->write_block(block); + } + if (!ret) { + alloc_data_block(inode, page); + block = logfs_block(page); + block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; + super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; + list_move_tail(&block->alias_list, &super->s_writeback_list); + } + logfs_put_wblocks(inode->i_sb, page, WF_LOCK); + return ret; +} + +/* + * We are protected by write lock. Push victims up to superblock level + * and release transaction when appropriate. + */ +/* FIXME: This is currently called from the wrong spots. 
*/ +static void logfs_handle_transaction(struct inode *inode, + struct logfs_transaction *ta) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + if (!ta) + return; + logfs_inode(inode)->li_block->ta = NULL; + + if (inode->i_ino != LOGFS_INO_MASTER) { + BUG(); /* FIXME: Yes, this needs more thought */ + /* just remember the transaction until inode is written */ + //BUG_ON(logfs_inode(inode)->li_transaction); + //logfs_inode(inode)->li_transaction = ta; + return; + } + + switch (ta->state) { + case CREATE_1: /* fall through */ + case UNLINK_1: + BUG_ON(super->s_victim_ino); + super->s_victim_ino = ta->ino; + break; + case CREATE_2: /* fall through */ + case UNLINK_2: + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + /* transaction ends here - free it */ + kfree(ta); + break; + case CROSS_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + break; + case CROSS_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + kfree(ta); + break; + case TARGET_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + super->s_victim_ino = ta->ino; + break; + case TARGET_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + break; + case TARGET_RENAME_3: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + kfree(ta); + break; + default: + BUG(); + } +} + +/* + * Not strictly a reservation, but rather a check that we still have enough + * space to satisfy the write. + */ +static int logfs_reserve_blocks(struct inode *inode, int blocks) +{ + return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE); +} + +struct write_control { + u64 ofs; + long flags; +}; + +static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, + level_t level, u64 old_ofs) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_shadow *shadow; + + shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS); + memset(shadow, 0, sizeof(*shadow)); + shadow->ino = inode->i_ino; + shadow->bix = bix; + shadow->gc_level = expand_level(inode->i_ino, level); + shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED; + return shadow; +} + +static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + mempool_free(shadow, super->s_shadow_pool); +} + +static void mark_segment(struct shadow_tree *tree, u32 segno) +{ + int err; + + if (!btree_lookup32(&tree->segment_map, segno)) { + err = btree_insert32(&tree->segment_map, segno, (void *)1, + GFP_NOFS); + BUG_ON(err); + tree->no_shadowed_segments++; + } +} + +/** + * fill_shadow_tree - Propagate shadow tree changes due to a write + * @inode: Inode owning the page + * @page: Struct page that was written + * @shadow: Shadow for the current write + * + * Writes in logfs can result in two semi-valid objects. The old object + * is still valid as long as it can be reached by following pointers on + * the medium. Only when writes propagate all the way up to the journal + * has the new object safely replaced the old one. 
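+ *
+ * As a rough example: when a data block is rewritten, the new object
+ * already exists at its new segment offset while the old object is
+ * still reachable through the not-yet-rewritten indirect block above
+ * it, so both are semi-valid until the pointer change has propagated
+ * up to the journal.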
+ * + * To handle this problem, a struct logfs_shadow is used to represent + * every single write. It is attached to the indirect block, which is + * marked dirty. When the indirect block is written, its shadows are + * handed up to the next indirect block (or inode). Ultimately they + * will reach the master inode and be freed upon journal commit. + * + * This function handles a single step in the propagation. It adds the + * shadow for the current write to the tree, along with any shadows in + * the page's tree, in case it was an indirect block. If a page is + * written, the inode parameter is left NULL; if an inode is written, + * the page parameter is left NULL. + */ +static void fill_shadow_tree(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_block *block = logfs_block(page); + struct shadow_tree *tree = &super->s_shadow_tree; + + if (PagePrivate(page)) { + if (block->alias_map) + super->s_no_object_aliases -= bitmap_weight( + block->alias_map, LOGFS_BLOCK_FACTOR); + logfs_handle_transaction(inode, block->ta); + block->ops->free_block(inode->i_sb, block); + } + if (shadow) { + if (shadow->old_ofs) + btree_insert64(&tree->old, shadow->old_ofs, shadow, + GFP_NOFS); + else + btree_insert64(&tree->new, shadow->new_ofs, shadow, + GFP_NOFS); + + super->s_dirty_used_bytes += shadow->new_len; + super->s_dirty_free_bytes += shadow->old_len; + mark_segment(tree, shadow->old_ofs >> super->s_segshift); + mark_segment(tree, shadow->new_ofs >> super->s_segshift); + } +} + +static void logfs_set_alias(struct super_block *sb, struct logfs_block *block, + long child_no) +{ + struct logfs_super *super = logfs_super(sb); + + if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) { + /* Aliases in the master inode are pointless. */ + return; + } + + if (!test_bit(child_no, block->alias_map)) { + set_bit(child_no, block->alias_map); + super->s_no_object_aliases++; + } + list_move_tail(&block->alias_list, &super->s_object_alias); +} + +/* + * Object aliases can and often do change the size and occupied space of a + * file. So not only do we have to change the pointers, we also have to + * change inode->i_size and li->li_used_bytes. This is done by setting + * another two object aliases for the inode itself. 
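+ * (INODE_SIZE_OFS and INODE_USED_OFS, as used by set_iused() below,
+ * are exactly those two aliases.)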
+ */ +static void set_iused(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (shadow->new_len == shadow->old_len) + return; + + alloc_inode_block(inode); + li->li_used_bytes += shadow->new_len - shadow->old_len; + __logfs_set_blocks(inode); + logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS); + logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS); +} + +static int logfs_write_i0(struct inode *inode, struct page *page, + struct write_control *wc) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int full, err = 0; + + logfs_unpack_index(page->index, &bix, &level); + if (wc->ofs == 0) + if (logfs_reserve_blocks(inode, 1)) + return -ENOSPC; + + shadow = alloc_shadow(inode, bix, level, wc->ofs); + if (wc->flags & WF_WRITE) + err = logfs_segment_write(inode, page, shadow); + if (wc->flags & WF_DELETE) + logfs_segment_delete(inode, shadow); + if (err) { + free_shadow(inode, shadow); + return err; + } + + set_iused(inode, shadow); + full = 1; + if (level != 0) { + alloc_indirect_block(inode, page, 0); + full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR; + } + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + if (wc->ofs && full) + wc->ofs |= LOGFS_FULLY_POPULATED; + return 0; +} + +static int logfs_write_direct(struct inode *inode, struct page *page, + long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[page->index], + .flags = flags, + }; + int err; + + alloc_inode_block(inode); + + err = logfs_write_i0(inode, page, &wc); + if (err) + return err; + + li->li_data[page->index] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + page->index + INODE_POINTER_OFS); + return 0; +} + +static int ptr_change(u64 ofs, struct page *page) +{ + struct logfs_block *block = logfs_block(page); + int empty0, empty1, full0, full1; + + empty0 = ofs == 0; + empty1 = block->partial == 0; + if (empty0 != empty1) + return 1; + + /* The !! 
is necessary to shrink result to int */ + full0 = !!(ofs & LOGFS_FULLY_POPULATED); + full1 = block->full == LOGFS_BLOCK_FACTOR; + if (full0 != full1) + return 1; + return 0; +} + +static int __logfs_write_rec(struct inode *inode, struct page *page, + struct write_control *this_wc, + pgoff_t bix, level_t target_level, level_t level) +{ + int ret, page_empty = 0; + int child_no = get_bits(bix, SUBLEVEL(level)); + struct page *ipage; + struct write_control child_wc = { + .flags = this_wc->flags, + }; + + ipage = logfs_get_write_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + if (this_wc->ofs) { + ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (ret) + goto out; + } else if (!PageUptodate(ipage)) { + page_empty = 1; + logfs_read_empty(ipage); + } + + child_wc.ofs = block_get_pointer(ipage, child_no); + + if ((__force u8)level-1 > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &child_wc, bix, + target_level, SUBLEVEL(level)); + else + ret = logfs_write_i0(inode, page, &child_wc); + + if (ret) + goto out; + + alloc_indirect_block(inode, ipage, page_empty); + block_set_pointer(ipage, child_no, child_wc.ofs); + /* FIXME: first condition seems superfluous */ + if (child_wc.ofs || logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + /* the condition on this_wc->ofs ensures that we won't consume extra + * space for indirect blocks in the future, which we cannot reserve */ + if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage)) + ret = logfs_write_i0(inode, ipage, this_wc); + else + logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no); +out: + logfs_put_write_page(ipage); + return ret; +} + +static int logfs_write_rec(struct inode *inode, struct page *page, + pgoff_t bix, level_t target_level, long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + .flags = flags, + }; + int ret; + + alloc_inode_block(inode); + + if (li->li_height > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &wc, bix, target_level, + LEVEL(li->li_height)); + else + ret = logfs_write_i0(inode, page, &wc); + if (ret) + return ret; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) { + li->li_data[INDIRECT_INDEX] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + INDIRECT_INDEX + INODE_POINTER_OFS); + } + return ret; +} + +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + alloc_inode_block(inode); + logfs_inode(inode)->li_block->ta = ta; +} + +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + struct logfs_block *block = logfs_inode(inode)->li_block; + + if (block && block->ta) + block->ta = NULL; +} + +static int grow_inode(struct inode *inode, u64 bix, level_t level) +{ + struct logfs_inode *li = logfs_inode(inode); + u8 height = (__force u8)level; + struct page *page; + struct write_control wc = { + .flags = WF_WRITE, + }; + int err; + + BUG_ON(height > 5 || li->li_height > 5); + while (height > li->li_height || bix >= maxbix(li->li_height)) { + page = logfs_get_write_page(inode, I0_BLOCKS + 1, + LEVEL(li->li_height + 1)); + if (!page) + return -ENOMEM; + logfs_read_empty(page); + alloc_indirect_block(inode, page, 1); + block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]); + err = logfs_write_i0(inode, page, &wc); + logfs_put_write_page(page); + if (err) + return err; + li->li_data[INDIRECT_INDEX] = wc.ofs; + wc.ofs = 0; + li->li_height++; + logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS); 
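+ /* Each pass adds one level of indirection: the fresh top-level
+ * block's slot 0 points at the old tree root, li_height grows by one
+ * and block indices up to the new maxbix(li->li_height) become
+ * addressable. */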
+ } + return 0; +} + +static int __logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + pgoff_t index = page->index; + u64 bix; + level_t level; + int err; + + flags |= WF_WRITE | WF_DELETE; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + logfs_unpack_index(index, &bix, &level); + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + super->s_dirty_pages -= logfs_block(page)->reserved_bytes; + + if (index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + + bix = adjust_bix(bix, level); + err = grow_inode(inode, bix, level); + if (err) + return err; + return logfs_write_rec(inode, page, bix, level, flags); +} + +int logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, page, flags & WF_LOCK); + ret = __logfs_write_buf(inode, page, flags); + logfs_put_wblocks(sb, page, flags & WF_LOCK); + return ret; +} + +static int __logfs_delete(struct inode *inode, struct page *page) +{ + long flags = WF_DELETE; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + if (page->index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + return logfs_write_rec(inode, page, page->index, 0, flags); +} + +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree) +{ + struct super_block *sb = inode->i_sb; + struct page *page; + int ret; + + page = logfs_get_read_page(inode, index, 0); + if (!page) + return -ENOMEM; + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_read_page(page); + + return ret; +} + +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags) +{ + level_t level = shrink_level(gc_level); + struct page *page; + int err; + + page = logfs_get_write_page(inode, bix, level); + if (!page) + return -ENOMEM; + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (!err) { + if (level != 0) + alloc_indirect_block(inode, page, 0); + err = logfs_write_buf(inode, page, flags); + if (!err && shrink_level(gc_level) == 0) { + /* Rewrite cannot mark the inode dirty but has to + * write it immediately. + * Q: Can't we just create an alias for the inode + * instead? And if not, why not? + */ + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode->i_sb); + else { + err = __logfs_write_inode(inode, flags); + } + } + } + logfs_put_write_page(page); + return err; +} + +static int truncate_data_block(struct inode *inode, struct page *page, + u64 ofs, struct logfs_shadow *shadow, u64 size) +{ + loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits; + u64 bix; + level_t level; + int err; + + /* Does truncation happen within this page? 
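+ * Only when pageofs < size < pageofs + PAGE_SIZE does the block need
+ * to be read, zeroed from the new end of file onwards and written out
+ * again; in all other cases there is nothing to do here.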
*/ + if (size <= pageofs || size - pageofs >= PAGE_SIZE) + return 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (err) + return err; + + zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE); + return logfs_segment_write(inode, page, shadow); +} + +static int logfs_truncate_i0(struct inode *inode, struct page *page, + struct write_control *wc, u64 size) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + shadow = alloc_shadow(inode, bix, level, wc->ofs); + + err = truncate_data_block(inode, page, wc->ofs, shadow, size); + if (err) { + free_shadow(inode, shadow); + return err; + } + + logfs_segment_delete(inode, shadow); + set_iused(inode, shadow); + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + return 0; +} + +static int logfs_truncate_direct(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc; + struct page *page; + int e; + int err; + + alloc_inode_block(inode); + + for (e = I0_BLOCKS - 1; e >= 0; e--) { + if (size > (e+1) * LOGFS_BLOCKSIZE) + break; + + wc.ofs = li->li_data[e]; + if (!wc.ofs) + continue; + + page = logfs_get_write_page(inode, e, 0); + if (!page) + return -ENOMEM; + err = logfs_segment_read(inode, page, wc.ofs, e, 0); + if (err) { + logfs_put_write_page(page); + return err; + } + err = logfs_truncate_i0(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + li->li_data[e] = wc.ofs; + } + return 0; +} + +/* FIXME: these need to become per-sb once we support different blocksizes */ +static u64 __logfs_step[] = { + 1, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS, +}; + +static u64 __logfs_start_index[] = { + I0_BLOCKS, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS +}; + +static inline u64 logfs_step(level_t level) +{ + return __logfs_step[(__force u8)level]; +} + +static inline u64 logfs_factor(u8 level) +{ + return __logfs_step[level] * LOGFS_BLOCKSIZE; +} + +static inline u64 logfs_start_index(level_t level) +{ + return __logfs_start_index[(__force u8)level]; +} + +static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) +{ + logfs_unpack_index(index, bix, level); + if (*bix <= logfs_start_index(SUBLEVEL(*level))) + *bix = 0; +} + +static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, + struct write_control *this_wc, u64 size) +{ + int truncate_happened = 0; + int e, err = 0; + u64 bix, child_bix, next_bix; + level_t level; + struct page *page; + struct write_control child_wc = { /* FIXME: flags */ }; + + logfs_unpack_raw_index(ipage->index, &bix, &level); + err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (err) + return err; + + for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { + child_bix = bix + e * logfs_step(SUBLEVEL(level)); + next_bix = child_bix + logfs_step(SUBLEVEL(level)); + if (size > next_bix * LOGFS_BLOCKSIZE) + break; + + child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); + if (!child_wc.ofs) + continue; + + page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); + if (!page) + return -ENOMEM; + + if ((__force u8)level > 1) + err = __logfs_truncate_rec(inode, page, &child_wc, size); + else + err = logfs_truncate_i0(inode, page, &child_wc, size); + logfs_put_write_page(page); + if (err) + return err; + + truncate_happened = 1; + alloc_indirect_block(inode, ipage, 0); + block_set_pointer(ipage, e, 
child_wc.ofs); + } + + if (!truncate_happened) { + printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size); + return 0; + } + + this_wc->flags = WF_DELETE; + if (logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + + return logfs_write_i0(inode, ipage, this_wc); +} + +static int logfs_truncate_rec(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + }; + struct page *page; + int err; + + alloc_inode_block(inode); + + if (!wc.ofs) + return 0; + + page = logfs_get_write_page(inode, 0, LEVEL(li->li_height)); + if (!page) + return -ENOMEM; + + err = __logfs_truncate_rec(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) + li->li_data[INDIRECT_INDEX] = wc.ofs; + return 0; +} + +static int __logfs_truncate(struct inode *inode, u64 size) +{ + int ret; + + if (size >= logfs_factor(logfs_inode(inode)->li_height)) + return 0; + + ret = logfs_truncate_rec(inode, size); + if (ret) + return ret; + + return logfs_truncate_direct(inode, size); +} + +/* + * Truncate, by changing the segment file, can consume a fair amount + * of resources. So back off from time to time and do some GC. + * 8 or 2048 blocks should be well within safety limits even if + * every single block resided in a different segment. + */ +#define TRUNCATE_STEP (8 * 1024 * 1024) +int logfs_truncate(struct inode *inode, u64 target) +{ + struct super_block *sb = inode->i_sb; + u64 size = i_size_read(inode); + int err = 0; + + size = ALIGN(size, TRUNCATE_STEP); + while (size > target) { + if (size > TRUNCATE_STEP) + size -= TRUNCATE_STEP; + else + size = 0; + if (size < target) + size = target; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, size); + if (!err) + err = __logfs_write_inode(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + } + + if (!err) + err = vmtruncate(inode, target); + + /* I don't trust error recovery yet. 
*/ + WARN_ON(err); + return err; +} + +static void move_page_to_inode(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = logfs_block(page); + + if (!block) + return; + + log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(li->li_block); + block->ops = &inode_block_ops; + block->inode = inode; + li->li_block = block; + + block->page = NULL; + page->private = 0; + ClearPagePrivate(page); +} + +static void move_inode_to_page(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + + if (!block) + return; + + log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(PagePrivate(page)); + block->ops = &indirect_block_ops; + block->page = page; + page->private = (unsigned long)block; + SetPagePrivate(page); + + block->inode = NULL; + li->li_block = NULL; +} + +int logfs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct inode *master_inode = super->s_master_inode; + struct page *page; + struct logfs_disk_inode *di; + u64 ino = inode->i_ino; + + if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) + return -ENODATA; + if (!logfs_exist_block(master_inode, ino)) + return -ENODATA; + + page = read_cache_page(master_inode->i_mapping, ino, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + di = kmap_atomic(page, KM_USER0); + logfs_disk_to_inode(di, inode); + kunmap_atomic(di, KM_USER0); + move_page_to_inode(inode, page); + page_cache_release(page); + return 0; +} + +/* Caller must logfs_put_write_page(page); */ +static struct page *inode_to_page(struct inode *inode) +{ + struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; + struct logfs_disk_inode *di; + struct page *page; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return NULL; + + di = kmap_atomic(page, KM_USER0); + logfs_inode_to_disk(inode, di); + kunmap_atomic(di, KM_USER0); + move_inode_to_page(page, inode); + return page; +} + +static int do_write_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; + struct page *page; + int err; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + /* FIXME: lock inode */ + + if (i_size_read(master_inode) < size) + i_size_write(master_inode, size); + + /* TODO: Tell vfs this inode is clean now */ + + page = inode_to_page(inode); + if (!page) + return -ENOMEM; + + /* FIXME: transaction is part of logfs_block now. Is that enough? 
*/ + err = logfs_write_buf(master_inode, page, 0); + if (err) + move_page_to_inode(inode, page); + + logfs_put_write_page(page); + return err; +} + +static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, + int write, + void (*change_se)(struct logfs_segment_entry *, long), + long arg) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + struct page *page; + struct logfs_segment_entry *se; + pgoff_t page_no; + int child_no; + + page_no = segno >> (sb->s_blocksize_bits - 3); + child_no = segno & ((sb->s_blocksize >> 3) - 1); + + inode = super->s_segfile_inode; + page = logfs_get_write_page(inode, page_no, 0); + BUG_ON(!page); /* FIXME: We need some reserve page for this case */ + if (!PageUptodate(page)) + logfs_read_block(inode, page, WRITE); + + if (write) + alloc_indirect_block(inode, page, 0); + se = kmap_atomic(page, KM_USER0); + change_se(se + child_no, arg); + if (write) { + logfs_set_alias(sb, logfs_block(page), child_no); + BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); + } + kunmap_atomic(se, KM_USER0); + + logfs_put_write_page(page); +} + +static void __get_segment_entry(struct logfs_segment_entry *se, long _target) +{ + struct logfs_segment_entry *target = (void *)_target; + + *target = *se; +} + +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se) +{ + logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); +} + +static void __set_segment_used(struct logfs_segment_entry *se, long increment) +{ + u32 valid; + + valid = be32_to_cpu(se->valid); + valid += increment; + se->valid = cpu_to_be32(valid); +} + +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno = ofs >> super->s_segshift; + + if (!increment) + return; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); +} + +static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) +{ + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level) +{ + u32 ec_level = ec << 4 | (__force u8)gc_level; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level); +} + +static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore) +{ + se->valid = cpu_to_be32(RESERVED); +} + +void logfs_set_segment_reserved(struct super_block *sb, u32 segno) +{ + logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0); +} + +static void __set_segment_unreserved(struct logfs_segment_entry *se, + long ec_level) +{ + se->valid = 0; + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) +{ + u32 ec_level = ec << 4; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved, + ec_level); +} + +int __logfs_write_inode(struct inode *inode, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, NULL, flags & WF_LOCK); + ret = do_write_inode(inode); + logfs_put_wblocks(sb, NULL, flags & WF_LOCK); + return ret; +} + +static int do_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + struct page *page; + int ret; + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return -ENOMEM; + + move_inode_to_page(page, inode); + + logfs_get_wblocks(sb, page, 1); + ret = 
__logfs_delete(master_inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_write_page(page); + return ret; +} + +/* + * ZOMBIE inodes have already been deleted before and should remain dead, + * if it weren't for valid checking. No need to kill them again here. + */ +void logfs_evict_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + struct page *page; + + if (!inode->i_nlink) { + if (!(li->li_flags & LOGFS_IF_ZOMBIE)) { + li->li_flags |= LOGFS_IF_ZOMBIE; + if (i_size_read(inode) > 0) + logfs_truncate(inode, 0); + do_delete_inode(inode); + } + } + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + /* Cheaper version of write_inode. All changes are concealed in + * aliases, which are moved back. No write to the medium happens. + */ + /* Only deleted files may be dirty at this point */ + BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); + if (!block) + return; + if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { + block->ops->free_block(inode->i_sb, block); + return; + } + + BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS); + page = inode_to_page(inode); + BUG_ON(!page); /* FIXME: Use emergency page */ + logfs_put_write_page(page); +} + +void btree_write_block(struct logfs_block *block) +{ + struct inode *inode; + struct page *page; + int err, cookie; + + inode = logfs_safe_iget(block->sb, block->ino, &cookie); + page = logfs_get_write_page(inode, block->bix, block->level); + + err = logfs_readpage_nolock(page); + BUG_ON(err); + BUG_ON(!PagePrivate(page)); + BUG_ON(logfs_block(page) != block); + err = __logfs_write_buf(inode, page, 0); + BUG_ON(err); + BUG_ON(PagePrivate(page) || page->private); + + logfs_put_write_page(page); + logfs_safe_iput(inode, cookie); +} + +/** + * logfs_inode_write - write inode or dentry objects + * + * @inode: parent inode (ifile or directory) + * @buf: object to write (inode or dentry) + * @n: object size + * @_pos: object number (file position in blocks/objects) + * @flags: write flags + * @lock: 0 if write lock is already taken, 1 otherwise + * @shadow_tree: shadow below this inode + * + * FIXME: All callers of this put a 200-300 byte variable on the stack, + * only to call here and do a memcpy from that stack variable. A good + * example of wasted performance and stack space. 
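+ *
+ * The object number is mapped to a byte position as
+ * pos = bix << inode->i_sb->s_blocksize_bits, i.e. object number n
+ * occupies exactly one block at byte offset n * blocksize of its
+ * parent inode.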
+ */ +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + int err; + struct page *page; + void *pagebuf; + + BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); + BUG_ON(count > LOGFS_BLOCKSIZE); + page = logfs_get_write_page(inode, bix, 0); + if (!page) + return -ENOMEM; + + pagebuf = kmap_atomic(page, KM_USER0); + memcpy(pagebuf, buf, count); + flush_dcache_page(page); + kunmap_atomic(pagebuf, KM_USER0); + + if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) + i_size_write(inode, pos + LOGFS_BLOCKSIZE); + + err = logfs_write_buf(inode, page, flags); + logfs_put_write_page(page); + return err; +} + +int logfs_open_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + + inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_segfile_inode = inode; + return 0; +} + +int logfs_init_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int min_fill = 3 * super->s_no_blocks; + + INIT_LIST_HEAD(&super->s_object_alias); + INIT_LIST_HEAD(&super->s_writeback_list); + mutex_init(&super->s_write_mutex); + super->s_block_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_block)); + super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_shadow)); + return 0; +} + +void logfs_cleanup_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + logfs_mempool_destroy(super->s_block_pool); + logfs_mempool_destroy(super->s_shadow_pool); +} diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c new file mode 100644 index 00000000..9d518735 --- /dev/null +++ b/fs/logfs/segment.c @@ -0,0 +1,932 @@ +/* + * fs/logfs/segment.c - Handling the Object Store + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Object store or ostore makes up the complete device with exception of + * the superblock and journal areas. Apart from its own metadata it stores + * three kinds of objects: inodes, dentries and blocks, both data and indirect. 
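+ *
+ * On the medium each object is stored as a struct logfs_object_header
+ * followed by its (possibly compressed) payload; see
+ * __logfs_segment_write() below.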
+ */ +#include "logfs.h" +#include <linux/slab.h> + +static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int err; + + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + if (err) + return err; + logfs_super(sb)->s_bad_segments++; + /* FIXME: write to journal */ + return 0; +} + +int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec++; + + return super->s_devops->erase(sb, (u64)segno << super->s_segshift, + super->s_segsize, ensure_erase); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) +{ + s32 ofs; + + logfs_open_area(area, bytes); + + ofs = area->a_used_bytes; + area->a_used_bytes += bytes; + BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize); + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, + int use_filler) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = super->s_devops->readpage; + struct page *page; + + BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + if (use_filler) + page = read_cache_page(mapping, index, filler, sb); + else { + page = find_or_create_page(mapping, index, GFP_NOFS); + unlock_page(page); + } + return page; +} + +int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + /* Only logfs_wbuf_recover may use len==0 */ + BUG_ON(!len && !use_filler); + do { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(area->a_sb, index, use_filler); + if (IS_ERR(page)) + return PTR_ERR(page); + BUG_ON(!page); /* FIXME: reserve a pool */ + SetPageUptodate(page); + memcpy(page_address(page) + offset, buf, copylen); + SetPagePrivate(page); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } while (len); + return 0; +} + +static void pad_partial_page(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct page *page; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + pgoff_t index = ofs >> PAGE_SHIFT; + long offset = ofs & (PAGE_SIZE-1); + u32 len = PAGE_SIZE - offset; + + if (len % PAGE_SIZE) { + page = get_mapping_page(sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page) + offset, 0xff, len); + SetPagePrivate(page); + page_cache_release(page); + } +} + +static void pad_full_pages(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + u32 len = super->s_segsize - area->a_used_bytes; + pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT; + pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT; + struct page *page; + + while (no_indizes) { + page = get_mapping_page(sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + SetPageUptodate(page); + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); + SetPagePrivate(page); + page_cache_release(page); + index++; + no_indizes--; + } +} + +/* + * bdev_writeseg will write full pages. Memset the tail to prevent data leaks. + * Also make sure we allocate (and memset) all pages for final writeout. 
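+ * pad_partial_page() takes care of the tail of the current page, while
+ * pad_full_pages() (final writeout only) fills the remaining whole
+ * pages of the segment with 0xff.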
+ */ +static void pad_wbuf(struct logfs_area *area, int final) +{ + pad_partial_page(area); + if (final) + pad_full_pages(area); +} + +/* + * We have to be careful with the alias tree. Since lookup is done by bix, + * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with + * indirect blocks. So always use it through accessor functions. + */ +static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, + level_t level) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_lookup128(head, ino, index); +} + +static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, + level_t level, void *val) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_insert128(head, ino, index, val, GFP_NOFS); +} + +static int btree_write_alias(struct super_block *sb, struct logfs_block *block, + write_alias_t *write_one_alias) +{ + struct object_alias_item *item; + int err; + + list_for_each_entry(item, &block->item_list, list) { + err = write_alias_journal(sb, block->ino, block->bix, + block->level, item->child_no, item->val); + if (err) + return err; + } + return 0; +} + +static struct logfs_block_ops btree_block_ops = { + .write_block = btree_write_block, + .free_block = __free_block, + .write_alias = btree_write_alias, +}; + +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + struct object_alias_item *item; + u64 ino, bix; + level_t level; + int i, err; + + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + count /= sizeof(*oa); + for (i = 0; i < count; i++) { + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + if (!item) + return -ENOMEM; + memset(item, 0, sizeof(*item)); + + super->s_no_object_aliases++; + item->val = oa[i].val; + item->child_no = be16_to_cpu(oa[i].child_no); + + ino = be64_to_cpu(oa[i].ino); + bix = be64_to_cpu(oa[i].bix); + level = LEVEL(oa[i].level); + + log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", + ino, bix, level, item->child_no, + be64_to_cpu(item->val)); + block = alias_tree_lookup(sb, ino, bix, level); + if (!block) { + block = __alloc_block(sb, ino, bix, level); + block->ops = &btree_block_ops; + err = alias_tree_insert(sb, ino, bix, level, block); + BUG_ON(err); /* mempool empty */ + } + if (test_and_set_bit(item->child_no, block->alias_map)) { + printk(KERN_ERR"LogFS: Alias collision detected\n"); + return -EIO; + } + list_move_tail(&block->alias_list, &super->s_object_alias); + list_add(&item->list, &block->item_list); + } + return 0; +} + +static void kill_alias(void *_block, unsigned long ignore0, + u64 ignore1, u64 ignore2, size_t ignore3) +{ + struct logfs_block *block = _block; + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + + while (!list_empty(&block->item_list)) { + item = list_entry(block->item_list.next, typeof(*item), list); + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->ops->free_block(sb, block); +} + +static int obj_type(struct inode *inode, level_t level) +{ + if (level == 0) { + if (S_ISDIR(inode->i_mode)) + return OBJ_DENTRY; + if (inode->i_ino == LOGFS_INO_MASTER) + return OBJ_INODE; + } + return OBJ_BLOCK; +} + +static int obj_len(struct super_block *sb, int obj_type) +{ + switch (obj_type) { + case 
OBJ_DENTRY: + return sizeof(struct logfs_disk_dentry); + case OBJ_INODE: + return sizeof(struct logfs_disk_inode); + case OBJ_BLOCK: + return sb->s_blocksize; + default: + BUG(); + } +} + +static int __logfs_segment_write(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len, int compr) +{ + struct logfs_area *area; + struct super_block *sb = inode->i_sb; + s64 ofs; + struct logfs_object_header h; + int acc_len; + + if (shadow->gc_level == 0) + acc_len = len; + else + acc_len = obj_len(sb, type); + + area = get_area(sb, shadow->gc_level); + ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); + LOGFS_BUG_ON(ofs <= 0, sb); + /* + * Order is important. logfs_get_free_bytes(), by modifying the + * segment file, may modify the content of the very page we're about + * to write now. Which is fine, as long as the calculated crc and + * written data still match. So do the modifications _before_ + * calculating the crc. + */ + + h.len = cpu_to_be16(len); + h.type = type; + h.compr = compr; + h.ino = cpu_to_be64(inode->i_ino); + h.bix = cpu_to_be64(shadow->bix); + h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); + h.data_crc = logfs_crc32(buf, len, 0); + + logfs_buf_write(area, ofs, &h, sizeof(h)); + logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); + + shadow->new_ofs = ofs; + shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; + + return 0; +} + +static s64 logfs_segment_write_compress(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + ssize_t compr_len; + int ret; + + mutex_lock(&logfs_super(sb)->s_journal_mutex); + compr_len = logfs_compress(buf, compressor_buf, len, len); + + if (compr_len >= 0) { + ret = __logfs_segment_write(inode, compressor_buf, shadow, + type, compr_len, COMPR_ZLIB); + } else { + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + } + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + return ret; +} + +/** + * logfs_segment_write - write data block to object store + * @inode: inode containing data + * + * Returns an errno or zero. + */ +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + int do_compress, type, len; + int ret; + void *buf; + + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; + if (shadow->gc_level != 0) { + /* temporarily disable compression for indirect blocks */ + do_compress = 0; + } + + type = obj_type(inode, shrink_level(shadow->gc_level)); + len = obj_len(sb, type); + buf = kmap(page); + if (do_compress) + ret = logfs_segment_write_compress(inode, buf, shadow, type, + len); + else + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + kunmap(page); + + log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + /* this BUG_ON did catch a locking bug. 
useful */ + BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); + return ret; +} + +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + while (len) { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(sb, index, 1); + if (IS_ERR(page)) + return PTR_ERR(page); + memcpy(buf, page_address(page) + offset, copylen); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } + return 0; +} + +/* + * The "position" of indirect blocks is ambiguous. It can be the position + * of any data block somewhere behind this indirect block. So we need to + * normalize the positions through logfs_block_mask() before comparing. + */ +static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) +{ + return (pos1 & logfs_block_mask(sb, level)) != + (pos2 & logfs_block_mask(sb, level)); +} + +#if 0 +static int read_seg_header(struct super_block *sb, u64 ofs, + struct logfs_segment_header *sh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*sh), sh); + if (err) + return err; + crc = logfs_crc32(sh, sizeof(*sh), 4); + if (crc != sh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(sh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} +#endif + +static int read_obj_header(struct super_block *sb, u64 ofs, + struct logfs_object_header *oh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*oh), oh); + if (err) + return err; + crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); + if (crc != oh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(oh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} + +static void move_btree_to_page(struct inode *inode, struct page *page, + __be64 *data) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct btree_head128 *head = &super->s_object_alias_tree; + struct logfs_block *block; + struct object_alias_item *item, *next; + + if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) + return; + + block = btree_remove128(head, inode->i_ino, page->index); + if (!block) + return; + + log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + list_for_each_entry_safe(item, next, &block->item_list, list) { + data[item->child_no] = item->val; + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; + initialize_block_counters(page, block, data, 0); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. 
+ */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +void move_page_to_btree(struct page *page) +{ + struct logfs_block *block = logfs_block(page); + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + unsigned long pos; + __be64 *child; + int err; + + if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { + block->ops->free_block(sb, block); + return; + } + log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + break; + + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + BUG_ON(!item); /* mempool empty */ + memset(item, 0, sizeof(*item)); + + child = kmap_atomic(page, KM_USER0); + item->val = child[pos]; + kunmap_atomic(child, KM_USER0); + item->child_no = pos; + list_add(&item->list, &block->item_list); + } + block->page = NULL; + ClearPagePrivate(page); + page->private = 0; + block->ops = &btree_block_ops; + err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, + block); + BUG_ON(err); /* mempool empty */ + ClearPageUptodate(page); +} + +static int __logfs_segment_read(struct inode *inode, void *buf, + u64 ofs, u64 bix, level_t level) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + struct logfs_object_header oh; + __be32 crc; + u16 len; + int err, block_len; + + block_len = obj_len(sb, obj_type(inode, level)); + err = read_obj_header(sb, ofs, &oh); + if (err) + goto out_err; + + err = -EIO; + if (be64_to_cpu(oh.ino) != inode->i_ino + || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) { + printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " + "expected (%lx, %llx), got (%llx, %llx)\n", + ofs, inode->i_ino, bix, + be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); + goto out_err; + } + + len = be16_to_cpu(oh.len); + + switch (oh.compr) { + case COMPR_NONE: + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); + if (err) + goto out_err; + crc = logfs_crc32(buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: uncompressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + goto out_err; + } + break; + case COMPR_ZLIB: + mutex_lock(&logfs_super(sb)->s_journal_mutex); + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, + compressor_buf); + if (err) { + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + crc = logfs_crc32(compressor_buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: compressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + err = logfs_uncompress(compressor_buf, buf, len, block_len); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + if (err) { + printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); + goto out_err; + } + break; + default: + LOGFS_BUG(sb); + err = -EIO; + goto out_err; + } + return 0; + +out_err: + logfs_set_ro(sb); + printk(KERN_ERR"LOGFS: device is read-only now\n"); + LOGFS_BUG(sb); + return err; +} + +/** + * logfs_segment_read - read data block from object store + * @inode: inode containing data + * @buf: data buffer + * @ofs: physical data offset + * @bix: block 
index + * @level: block level + * + * Returns 0 on success or a negative errno. + */ +int logfs_segment_read(struct inode *inode, struct page *page, + u64 ofs, u64 bix, level_t level) +{ + int err; + void *buf; + + if (PageUptodate(page)) + return 0; + + ofs &= ~LOGFS_FULLY_POPULATED; + + buf = kmap(page); + err = __logfs_segment_read(inode, buf, ofs, bix, level); + if (!err) { + move_btree_to_page(inode, page, buf); + SetPageUptodate(page); + } + kunmap(page); + log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", + inode->i_ino, bix, level, ofs, err); + return err; +} + +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct logfs_object_header h; + u16 len; + int err; + + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); + if (!shadow->old_ofs) + return 0; + + log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + err = read_obj_header(sb, shadow->old_ofs, &h); + LOGFS_BUG_ON(err, sb); + LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); + LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), + shrink_level(shadow->gc_level)), sb); + + if (shadow->gc_level == 0) + len = be16_to_cpu(h.len); + else + len = obj_len(sb, h.type); + shadow->old_len = len + sizeof(h); + return 0; +} + +void freeseg(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + u64 ofs, start, end; + + start = dev_ofs(sb, segno, 0); + end = dev_ofs(sb, segno + 1, 0); + for (ofs = start; ofs < end; ofs += PAGE_SIZE) { + page = find_get_page(mapping, ofs >> PAGE_SHIFT); + if (!page) + continue; + ClearPagePrivate(page); + page_cache_release(page); + } +} + +int logfs_open_area(struct logfs_area *area, size_t bytes) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + int err, closed = 0; + + if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) + return 0; + + if (area->a_is_open) { + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = super->s_segsize - area->a_written_bytes; + + log_gc("logfs_close_area(%x)\n", area->a_segno); + pad_wbuf(area, 1); + super->s_devops->writeseg(area->a_sb, ofs, len); + freeseg(sb, area->a_segno); + closed = 1; + } + + area->a_used_bytes = 0; + area->a_written_bytes = 0; +again: + area->a_ops->get_free_segment(area); + area->a_ops->get_erase_count(area); + + log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); + err = area->a_ops->erase_segment(area); + if (err) { + printk(KERN_WARNING "LogFS: Error erasing segment %x\n", + area->a_segno); + logfs_mark_segment_bad(sb, area->a_segno); + goto again; + } + area->a_is_open = 1; + return closed; +} + +void logfs_sync_area(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = (area->a_used_bytes - area->a_written_bytes); + + if (super->s_writesize) + len &= ~(super->s_writesize - 1); + if (len == 0) + return; + pad_wbuf(area, 0); + super->s_devops->writeseg(sb, ofs, len); + area->a_written_bytes += len; +} + +void logfs_sync_segments(struct 
super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + for_each_area(i) + logfs_sync_area(super->s_area[i]); +} + +/* + * Pick a free segment to be used for this area. Effectively takes a + * candidate from the free list (not really a candidate anymore). + */ +static void ostore_get_free_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + + if (super->s_free_list.count == 0) { + printk(KERN_ERR"LOGFS: ran out of free segments\n"); + LOGFS_BUG(sb); + } + + area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); +} + +static void ostore_get_erase_count(struct logfs_area *area) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(area->a_sb, area->a_segno, &se); + BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)); + + ec_level = be32_to_cpu(se.ec_level); + area->a_erase_count = (ec_level >> 4) + 1; +} + +static int ostore_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno, 0); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_OSTORE; + sh.level = (__force u8)area->a_level; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, + area->a_level); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(sh); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static const struct logfs_area_ops ostore_area_ops = { + .get_free_segment = ostore_get_free_segment, + .get_erase_count = ostore_get_erase_count, + .erase_segment = ostore_erase_segment, +}; + +static void free_area(struct logfs_area *area) +{ + if (area) + freeseg(area->a_sb, area->a_segno); + kfree(area); +} + +static struct logfs_area *alloc_area(struct super_block *sb) +{ + struct logfs_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + area->a_sb = sb; + return area; +} + +static void map_invalidatepage(struct page *page, unsigned long l) +{ + BUG(); +} + +static int map_releasepage(struct page *page, gfp_t g) +{ + /* Don't release these pages */ + return 0; +} + +static const struct address_space_operations mapping_aops = { + .invalidatepage = map_invalidatepage, + .releasepage = map_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +int logfs_init_mapping(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping; + struct inode *inode; + + inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_mapping_inode = inode; + mapping = inode->i_mapping; + mapping->a_ops = &mapping_aops; + /* Would it be possible to use __GFP_HIGHMEM as well? 
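(Probably only with a kmap() around every page_address() user such as
+	   wbuf_read(); lowmem pages are kept for now.  The GFP_NOFS set here
+	   is the part that matters: allocations against this mapping must
+	   never recurse back into the filesystem under memory pressure.)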
*/ + mapping_set_gfp_mask(mapping, GFP_NOFS); + return 0; +} + +int logfs_init_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i = -1; + + super->s_alias_pool = mempool_create_kmalloc_pool(600, + sizeof(struct object_alias_item)); + if (!super->s_alias_pool) + return -ENOMEM; + + super->s_journal_area = alloc_area(sb); + if (!super->s_journal_area) + goto err; + + for_each_area(i) { + super->s_area[i] = alloc_area(sb); + if (!super->s_area[i]) + goto err; + super->s_area[i]->a_level = GC_LEVEL(i); + super->s_area[i]->a_ops = &ostore_area_ops; + } + btree_init_mempool128(&super->s_object_alias_tree, + super->s_btree_pool); + return 0; + +err: + for (i--; i >= 0; i--) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + logfs_mempool_destroy(super->s_alias_pool); + return -ENOMEM; +} + +void logfs_cleanup_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); + for_each_area(i) + free_area(super->s_area[i]); + free_area(super->s_journal_area); +} diff --git a/fs/logfs/super.c b/fs/logfs/super.c new file mode 100644 index 00000000..ce03a182 --- /dev/null +++ b/fs/logfs/super.c @@ -0,0 +1,671 @@ +/* + * fs/logfs/super.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Generally contains mount/umount code and also serves as a dump area for + * any functions that don't fit elsewhere and neither justify a file of their + * own. + */ +#include "logfs.h" +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(emergency_mutex); +static struct page *emergency_page; + +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + struct page *page; + int err; + + page = read_cache_page(mapping, index, filler, NULL); + if (page) + return page; + + /* No more pages available, switch to emergency page */ + printk(KERN_INFO"Logfs: Using emergency page\n"); + mutex_lock(&emergency_mutex); + err = filler(NULL, emergency_page); + if (err) { + mutex_unlock(&emergency_mutex); + printk(KERN_EMERG"Logfs: Error reading emergency page\n"); + return ERR_PTR(err); + } + return emergency_page; +} + +void emergency_read_end(struct page *page) +{ + if (page == emergency_page) + mutex_unlock(&emergency_mutex); + else + page_cache_release(page); +} + +static void dump_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_entry se; + u32 segno; + + for (segno = 0; segno < super->s_no_segs; segno++) { + logfs_get_segment_entry(sb, segno, &se); + printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + printk("\n"); + } +} + +/* + * logfs_crash_dump - dump debug information to device + * + * The LogFS superblock only occupies part of a segment. This function will + * write as much debug information as it can gather into the spare space. 
+ */ +void logfs_crash_dump(struct super_block *sb) +{ + dump_segfile(sb); +} + +/* + * TODO: move to lib/string.c + */ +/** + * memchr_inv - Find a character in an area of memory. + * @s: The memory area + * @c: The byte to search for + * @n: The size of the area. + * + * returns the address of the first character other than @c, or %NULL + * if the whole buffer contains just @c. + */ +void *memchr_inv(const void *s, int c, size_t n) +{ + const unsigned char *p = s; + while (n-- != 0) + if ((unsigned char)c != *p++) + return (void *)(p - 1); + + return NULL; +} + +/* + * FIXME: There should be a reserve for root, similar to ext2. + */ +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) +{ + struct super_block *sb = dentry->d_sb; + struct logfs_super *super = logfs_super(sb); + + stats->f_type = LOGFS_MAGIC_U32; + stats->f_bsize = sb->s_blocksize; + stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; + stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_files = 0; + stats->f_ffree = 0; + stats->f_namelen = LOGFS_MAX_NAMELEN; + return 0; +} + +static int logfs_sb_set(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + + sb->s_fs_info = super; + sb->s_mtd = super->s_mtd; + sb->s_bdev = super->s_bdev; +#ifdef CONFIG_BLOCK + if (sb->s_bdev) + sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; +#endif +#ifdef CONFIG_MTD + if (sb->s_mtd) + sb->s_bdi = sb->s_mtd->backing_dev_info; +#endif + return 0; +} + +static int logfs_sb_test(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + struct mtd_info *mtd = super->s_mtd; + + if (mtd && sb->s_mtd == mtd) + return 1; + if (super->s_bdev && sb->s_bdev == super->s_bdev) + return 1; + return 0; +} + +static void set_segment_header(struct logfs_segment_header *sh, u8 type, + u8 level, u32 segno, u32 ec) +{ + sh->pad = 0; + sh->type = type; + sh->level = level; + sh->segno = cpu_to_be32(segno); + sh->ec = cpu_to_be32(ec); + sh->gec = cpu_to_be64(segno); + sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); +} + +static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, + u32 segno, u32 ec) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header *sh = &ds->ds_sh; + int i; + + memset(ds, 0, sizeof(*ds)); + set_segment_header(sh, SEG_SUPER, 0, segno, ec); + + ds->ds_ifile_levels = super->s_ifile_levels; + ds->ds_iblock_levels = super->s_iblock_levels; + ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ + ds->ds_segment_shift = super->s_segshift; + ds->ds_block_shift = sb->s_blocksize_bits; + ds->ds_write_shift = super->s_writeshift; + ds->ds_filesystem_size = cpu_to_be64(super->s_size); + ds->ds_segment_size = cpu_to_be32(super->s_segsize); + ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); + ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); + ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); + ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); + ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); + ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); + ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); + journal_for_each(i) + ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); + ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); + ds->ds_crc = logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12); +} + +static int write_one_sb(struct super_block 
*sb, + struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super *ds; + struct logfs_segment_entry se; + struct page *page; + u64 ofs; + u32 ec, segno; + int err; + + page = find_sb(sb, &ofs); + if (!page) + return -EIO; + ds = page_address(page); + segno = seg_no(sb, ofs); + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + ec++; + logfs_set_segment_erased(sb, segno, ec, 0); + logfs_write_ds(sb, ds, segno, ec); + err = super->s_devops->write_sb(sb, page); + page_cache_release(page); + return err; +} + +int logfs_write_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + /* First superblock */ + err = write_one_sb(sb, super->s_devops->find_first_sb); + if (err) + return err; + + /* Last superblock */ + err = write_one_sb(sb, super->s_devops->find_last_sb); + if (err) + return err; + return 0; +} + +static int ds_cmp(const void *ds0, const void *ds1) +{ + size_t len = sizeof(struct logfs_disk_super); + + /* We know the segment headers differ, so ignore them */ + len -= LOGFS_SEGMENT_HEADERSIZE; + ds0 += LOGFS_SEGMENT_HEADERSIZE; + ds1 += LOGFS_SEGMENT_HEADERSIZE; + return memcmp(ds0, ds1, len); +} + +static int logfs_recover_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super _ds0, *ds0 = &_ds0; + struct logfs_disk_super _ds1, *ds1 = &_ds1; + int err, valid0, valid1; + + /* read first superblock */ + err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); + if (err) + return err; + /* read last superblock */ + err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); + if (err) + return err; + valid0 = logfs_check_ds(ds0) == 0; + valid1 = logfs_check_ds(ds1) == 0; + + if (!valid0 && valid1) { + printk(KERN_INFO"First superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_first_sb); + } + if (valid0 && !valid1) { + printk(KERN_INFO"Last superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + if (valid0 && valid1 && ds_cmp(ds0, ds1)) { + printk(KERN_INFO"Superblocks don't match - fixing.\n"); + return logfs_write_sb(sb); + } + /* If neither is valid now, something's wrong. Didn't we properly + * check them before?!? 
*/ + BUG_ON(!valid0 && !valid1); + return 0; +} + +static int logfs_make_writeable(struct super_block *sb) +{ + int err; + + err = logfs_open_segfile(sb); + if (err) + return err; + + /* Repair any broken superblock copies */ + err = logfs_recover_sb(sb); + if (err) + return err; + + /* Check areas for trailing unaccounted data */ + err = logfs_check_areas(sb); + if (err) + return err; + + /* Do one GC pass before any data gets dirtied */ + logfs_gc_pass(sb); + + /* after all initializations are done, replay the journal + * for rw-mounts, if necessary */ + err = logfs_replay_journal(sb); + if (err) + return err; + + return 0; +} + +static int logfs_get_sb_final(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *rootdir; + int err; + + /* root dir */ + rootdir = logfs_iget(sb, LOGFS_INO_ROOT); + if (IS_ERR(rootdir)) + goto fail; + + sb->s_root = d_alloc_root(rootdir); + if (!sb->s_root) { + iput(rootdir); + goto fail; + } + + /* at that point we know that ->put_super() will be called */ + super->s_erase_page = alloc_pages(GFP_KERNEL, 0); + if (!super->s_erase_page) + return -ENOMEM; + memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); + + /* FIXME: check for read-only mounts */ + err = logfs_make_writeable(sb); + if (err) { + __free_page(super->s_erase_page); + return err; + } + + log_super("LogFS: Finished mounting\n"); + return 0; + +fail: + iput(super->s_master_inode); + iput(super->s_segfile_inode); + iput(super->s_mapping_inode); + return -EIO; +} + +int logfs_check_ds(struct logfs_disk_super *ds) +{ + struct logfs_segment_header *sh = &ds->ds_sh; + + if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) + return -EINVAL; + if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) + return -EINVAL; + if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12)) + return -EINVAL; + return 0; +} + +static struct page *find_super_block(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *first, *last; + + first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); + if (!first || IS_ERR(first)) + return NULL; + last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); + if (!last || IS_ERR(last)) { + page_cache_release(first); + return NULL; + } + + if (!logfs_check_ds(page_address(first))) { + page_cache_release(last); + return first; + } + + /* First one didn't work, try the second superblock */ + if (!logfs_check_ds(page_address(last))) { + page_cache_release(first); + return last; + } + + /* Neither worked, sorry folks */ + page_cache_release(first); + page_cache_release(last); + return NULL; +} + +static int __logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *page; + struct logfs_disk_super *ds; + int i; + + page = find_super_block(sb); + if (!page) + return -EINVAL; + + ds = page_address(page); + super->s_size = be64_to_cpu(ds->ds_filesystem_size); + super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); + super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); + super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); + super->s_segsize = 1 << ds->ds_segment_shift; + super->s_segmask = (1 << ds->ds_segment_shift) - 1; + super->s_segshift = ds->ds_segment_shift; + sb->s_blocksize = 1 << ds->ds_block_shift; + sb->s_blocksize_bits = ds->ds_block_shift; + super->s_writesize = 1 << ds->ds_write_shift; + super->s_writeshift = ds->ds_write_shift; + super->s_no_segs = super->s_size >> super->s_segshift; + 
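	/*
+	 * Illustrative numbers only: with ds_segment_shift == 17 (128KB
+	 * segments), ds_block_shift == 12 (4KB blocks) and an 8GB device,
+	 * the derived counts come out as s_no_segs = 2^33 >> 17 = 65536
+	 * segments and s_no_blocks = 2^17 >> 12 = 32 blocks per segment.
+	 */ +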
super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; + super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); + super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); + super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); + super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); + + journal_for_each(i) + super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); + + super->s_ifile_levels = ds->ds_ifile_levels; + super->s_iblock_levels = ds->ds_iblock_levels; + super->s_data_levels = ds->ds_data_levels; + super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels + + super->s_data_levels; + page_cache_release(page); + return 0; +} + +static int logfs_read_sb(struct super_block *sb, int read_only) +{ + struct logfs_super *super = logfs_super(sb); + int ret; + + super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); + if (!super->s_btree_pool) + return -ENOMEM; + + btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); + btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + btree_init_mempool32(&super->s_shadow_tree.segment_map, + super->s_btree_pool); + + ret = logfs_init_mapping(sb); + if (ret) + return ret; + + ret = __logfs_read_sb(sb); + if (ret) + return ret; + + if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT) + return -EIO; + if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) && + !read_only) + return -EIO; + + ret = logfs_init_rw(sb); + if (ret) + return ret; + + ret = logfs_init_areas(sb); + if (ret) + return ret; + + ret = logfs_init_gc(sb); + if (ret) + return ret; + + ret = logfs_init_journal(sb); + if (ret) + return ret; + + return 0; +} + +static void logfs_kill_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + log_super("LogFS: Start unmounting\n"); + /* Alias entries slow down mount, so evict as many as possible */ + sync_filesystem(sb); + logfs_write_anchor(sb); + + /* + * From this point on alias entries are simply dropped - and any + * writes to the object store are considered bugs. + */ + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; + log_super("LogFS: Now in shutdown\n"); + generic_shutdown_super(sb); + + BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); + + logfs_cleanup_gc(sb); + logfs_cleanup_journal(sb); + logfs_cleanup_areas(sb); + logfs_cleanup_rw(sb); + if (super->s_erase_page) + __free_page(super->s_erase_page); + super->s_devops->put_device(super); + logfs_mempool_destroy(super->s_btree_pool); + logfs_mempool_destroy(super->s_alias_pool); + kfree(super); + log_super("LogFS: Finished unmounting\n"); +} + +static struct dentry *logfs_get_sb_device(struct logfs_super *super, + struct file_system_type *type, int flags) +{ + struct super_block *sb; + int err = -ENOMEM; + static int mount_count; + + log_super("LogFS: Start mount %x\n", mount_count++); + + err = -EINVAL; + sb = sget(type, logfs_sb_test, logfs_sb_set, super); + if (IS_ERR(sb)) { + super->s_devops->put_device(super); + kfree(super); + return ERR_CAST(sb); + } + + if (sb->s_root) { + /* Device is already in use */ + super->s_devops->put_device(super); + kfree(super); + return dget(sb->s_root); + } + + /* + * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache + * only covers 16TB and the upper 8TB are used for indirect blocks. + * On 64bit system we could bump up the limit, but that would make + * the filesystem incompatible with 32bit systems. 
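+	 *
+	 * In numbers (assuming 4KB pages): a 32bit page index covers
+	 * 2^32 * 2^12 = 2^44 bytes = 16TB of page cache, and the data
+	 * limit set below is (1ull << 43) - 1, i.e. just short of 8TB.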
+ */ + sb->s_maxbytes = (1ull << 43) - 1; + sb->s_op = &logfs_super_operations; + sb->s_flags = flags | MS_NOATIME; + + err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY); + if (err) + goto err1; + + sb->s_flags |= MS_ACTIVE; + err = logfs_get_sb_final(sb); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + return dget(sb->s_root); + +err1: + /* no ->s_root, no ->put_super() */ + iput(super->s_master_inode); + iput(super->s_segfile_inode); + iput(super->s_mapping_inode); + deactivate_locked_super(sb); + return ERR_PTR(err); +} + +static struct dentry *logfs_mount(struct file_system_type *type, int flags, + const char *devname, void *data) +{ + ulong mtdnr; + struct logfs_super *super; + int err; + + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return ERR_PTR(-ENOMEM); + + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + + if (!devname) + err = logfs_get_sb_bdev(super, type, devname); + else if (strncmp(devname, "mtd", 3)) + err = logfs_get_sb_bdev(super, type, devname); + else { + char *garbage; + mtdnr = simple_strtoul(devname+3, &garbage, 0); + if (*garbage) + err = -EINVAL; + else + err = logfs_get_sb_mtd(super, mtdnr); + } + + if (err) { + kfree(super); + return ERR_PTR(err); + } + + return logfs_get_sb_device(super, type, flags); +} + +static struct file_system_type logfs_fs_type = { + .owner = THIS_MODULE, + .name = "logfs", + .mount = logfs_mount, + .kill_sb = logfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + +}; + +static int __init logfs_init(void) +{ + int ret; + + emergency_page = alloc_pages(GFP_KERNEL, 0); + if (!emergency_page) + return -ENOMEM; + + ret = logfs_compr_init(); + if (ret) + goto out1; + + ret = logfs_init_inode_cache(); + if (ret) + goto out2; + + return register_filesystem(&logfs_fs_type); +out2: + logfs_compr_exit(); +out1: + __free_pages(emergency_page, 0); + return ret; +} + +static void __exit logfs_exit(void) +{ + unregister_filesystem(&logfs_fs_type); + logfs_destroy_inode_cache(); + logfs_compr_exit(); + __free_pages(emergency_page, 0); +} + +module_init(logfs_init); +module_exit(logfs_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joern Engel "); +MODULE_DESCRIPTION("scalable flash filesystem"); diff --git a/fs/mbcache.c b/fs/mbcache.c new file mode 100644 index 00000000..8c32ef3b --- /dev/null +++ b/fs/mbcache.c @@ -0,0 +1,620 @@ +/* + * linux/fs/mbcache.c + * (C) 2001-2002 Andreas Gruenbacher, + */ + +/* + * Filesystem Meta Information Block Cache (mbcache) + * + * The mbcache caches blocks of block devices that need to be located + * by their device/block number, as well as by other criteria (such + * as the block's contents). + * + * There can only be one cache entry in a cache per device and block number. + * Additional indexes need not be unique in this sense. The number of + * additional indexes (=other criteria) can be hardwired at compile time + * or specified at cache create time. + * + * Each cache entry is of fixed size. An entry may be `valid' or `invalid' + * in the cache. A valid entry is in the main hash tables of the cache, + * and may also be in the lru list. An invalid entry is not in any hashes + * or lists. + * + * A valid cache entry is only in the lru list if no handles refer to it. + * Invalid cache entries will be freed when the last handle to the cache + * entry is released. Entries that cannot be freed immediately are put + * back on the lru list. 
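+ *
+ * A rough usage sketch (variable names purely illustrative); -EBUSY
+ * from the insert only means that a concurrent insert won the race:
+ *
+ *	ce = mb_cache_entry_alloc(cache, GFP_NOFS);
+ *	if (ce) {
+ *		err = mb_cache_entry_insert(ce, bdev, block, key);
+ *		mb_cache_entry_release(ce);
+ *	}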
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +#ifdef MB_CACHE_DEBUG +# define mb_debug(f...) do { \ + printk(KERN_DEBUG f); \ + printk("\n"); \ + } while (0) +#define mb_assert(c) do { if (!(c)) \ + printk(KERN_ERR "assertion " #c " failed\n"); \ + } while(0) +#else +# define mb_debug(f...) do { } while(0) +# define mb_assert(c) do { } while(0) +#endif +#define mb_error(f...) do { \ + printk(KERN_ERR f); \ + printk("\n"); \ + } while(0) + +#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) + +static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); + +MODULE_AUTHOR("Andreas Gruenbacher "); +MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL(mb_cache_create); +EXPORT_SYMBOL(mb_cache_shrink); +EXPORT_SYMBOL(mb_cache_destroy); +EXPORT_SYMBOL(mb_cache_entry_alloc); +EXPORT_SYMBOL(mb_cache_entry_insert); +EXPORT_SYMBOL(mb_cache_entry_release); +EXPORT_SYMBOL(mb_cache_entry_free); +EXPORT_SYMBOL(mb_cache_entry_get); +#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) +EXPORT_SYMBOL(mb_cache_entry_find_first); +EXPORT_SYMBOL(mb_cache_entry_find_next); +#endif + +/* + * Global data: list of all mbcache's, lru list, and a spinlock for + * accessing cache data structures on SMP machines. The lru list is + * global across all mbcaches. + */ + +static LIST_HEAD(mb_cache_list); +static LIST_HEAD(mb_cache_lru_list); +static DEFINE_SPINLOCK(mb_cache_spinlock); + +/* + * What the mbcache registers as to get shrunk dynamically. + */ + +static int mb_cache_shrink_fn(struct shrinker *shrink, + struct shrink_control *sc); + +static struct shrinker mb_cache_shrinker = { + .shrink = mb_cache_shrink_fn, + .seeks = DEFAULT_SEEKS, +}; + +static inline int +__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) +{ + return !list_empty(&ce->e_block_list); +} + + +static void +__mb_cache_entry_unhash(struct mb_cache_entry *ce) +{ + if (__mb_cache_entry_is_hashed(ce)) { + list_del_init(&ce->e_block_list); + list_del(&ce->e_index.o_list); + } +} + + +static void +__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) +{ + struct mb_cache *cache = ce->e_cache; + + mb_assert(!(ce->e_used || ce->e_queued)); + kmem_cache_free(cache->c_entry_cache, ce); + atomic_dec(&cache->c_entry_count); +} + + +static void +__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) + __releases(mb_cache_spinlock) +{ + /* Wake up all processes queuing for this cache entry. */ + if (ce->e_queued) + wake_up_all(&mb_cache_queue); + if (ce->e_used >= MB_CACHE_WRITER) + ce->e_used -= MB_CACHE_WRITER; + ce->e_used--; + if (!(ce->e_used || ce->e_queued)) { + if (!__mb_cache_entry_is_hashed(ce)) + goto forget; + mb_assert(list_empty(&ce->e_lru_list)); + list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + } + spin_unlock(&mb_cache_spinlock); + return; +forget: + spin_unlock(&mb_cache_spinlock); + __mb_cache_entry_forget(ce, GFP_KERNEL); +} + + +/* + * mb_cache_shrink_fn() memory pressure callback + * + * This function is called by the kernel memory management when memory + * gets low. + * + * @shrink: (ignored) + * @sc: shrink_control passed from reclaim + * + * Returns the number of objects which are present in the cache. 
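+ * For example, with 250 cached entries and the default
+ * sysctl_vfs_cache_pressure of 100 this reports (250 / 100) * 100 = 200
+ * objects; the integer division makes the figure approximate, which is
+ * good enough for reclaim.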
+ */ +static int +mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(free_list); + struct mb_cache *cache; + struct mb_cache_entry *entry, *tmp; + int count = 0; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + mb_debug("trying to free %d entries", nr_to_scan); + spin_lock(&mb_cache_spinlock); + while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { + struct mb_cache_entry *ce = + list_entry(mb_cache_lru_list.next, + struct mb_cache_entry, e_lru_list); + list_move_tail(&ce->e_lru_list, &free_list); + __mb_cache_entry_unhash(ce); + } + list_for_each_entry(cache, &mb_cache_list, c_cache_list) { + mb_debug("cache %s (%d)", cache->c_name, + atomic_read(&cache->c_entry_count)); + count += atomic_read(&cache->c_entry_count); + } + spin_unlock(&mb_cache_spinlock); + list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(entry, gfp_mask); + } + return (count / 100) * sysctl_vfs_cache_pressure; +} + + +/* + * mb_cache_create() create a new cache + * + * All entries in one cache are equal size. Cache entries may be from + * multiple devices. If this is the first mbcache created, registers + * the cache with kernel memory management. Returns NULL if no more + * memory was available. + * + * @name: name of the cache (informal) + * @bucket_bits: log2(number of hash buckets) + */ +struct mb_cache * +mb_cache_create(const char *name, int bucket_bits) +{ + int n, bucket_count = 1 << bucket_bits; + struct mb_cache *cache = NULL; + + cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); + if (!cache) + return NULL; + cache->c_name = name; + atomic_set(&cache->c_entry_count, 0); + cache->c_bucket_bits = bucket_bits; + cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), + GFP_KERNEL); + if (!cache->c_block_hash) + goto fail; + for (n=0; nc_block_hash[n]); + cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), + GFP_KERNEL); + if (!cache->c_index_hash) + goto fail; + for (n=0; nc_index_hash[n]); + cache->c_entry_cache = kmem_cache_create(name, + sizeof(struct mb_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!cache->c_entry_cache) + goto fail2; + + /* + * Set an upper limit on the number of cache entries so that the hash + * chains won't grow too long. + */ + cache->c_max_entries = bucket_count << 4; + + spin_lock(&mb_cache_spinlock); + list_add(&cache->c_cache_list, &mb_cache_list); + spin_unlock(&mb_cache_spinlock); + return cache; + +fail2: + kfree(cache->c_index_hash); + +fail: + kfree(cache->c_block_hash); + kfree(cache); + return NULL; +} + + +/* + * mb_cache_shrink() + * + * Removes all cache entries of a device from the cache. All cache entries + * currently in use cannot be freed, and thus remain in the cache. All others + * are freed. 
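+ * Typically used when a block device is about to go away (for instance
+ * from a filesystem's unmount path), so that stale entries do not
+ * linger for a device that no longer exists.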
+ * + * @bdev: which device's cache entries to shrink + */ +void +mb_cache_shrink(struct block_device *bdev) +{ + LIST_HEAD(free_list); + struct list_head *l, *ltmp; + + spin_lock(&mb_cache_spinlock); + list_for_each_safe(l, ltmp, &mb_cache_lru_list) { + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_bdev == bdev) { + list_move_tail(&ce->e_lru_list, &free_list); + __mb_cache_entry_unhash(ce); + } + } + spin_unlock(&mb_cache_spinlock); + list_for_each_safe(l, ltmp, &free_list) { + __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, + e_lru_list), GFP_KERNEL); + } +} + + +/* + * mb_cache_destroy() + * + * Shrinks the cache to its minimum possible size (hopefully 0 entries), + * and then destroys it. If this was the last mbcache, un-registers the + * mbcache from kernel memory management. + */ +void +mb_cache_destroy(struct mb_cache *cache) +{ + LIST_HEAD(free_list); + struct list_head *l, *ltmp; + + spin_lock(&mb_cache_spinlock); + list_for_each_safe(l, ltmp, &mb_cache_lru_list) { + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_cache == cache) { + list_move_tail(&ce->e_lru_list, &free_list); + __mb_cache_entry_unhash(ce); + } + } + list_del(&cache->c_cache_list); + spin_unlock(&mb_cache_spinlock); + + list_for_each_safe(l, ltmp, &free_list) { + __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, + e_lru_list), GFP_KERNEL); + } + + if (atomic_read(&cache->c_entry_count) > 0) { + mb_error("cache %s: %d orphaned entries", + cache->c_name, + atomic_read(&cache->c_entry_count)); + } + + kmem_cache_destroy(cache->c_entry_cache); + + kfree(cache->c_index_hash); + kfree(cache->c_block_hash); + kfree(cache); +} + +/* + * mb_cache_entry_alloc() + * + * Allocates a new cache entry. The new entry will not be valid initially, + * and thus cannot be looked up yet. It should be filled with data, and + * then inserted into the cache using mb_cache_entry_insert(). Returns NULL + * if no more memory was available. + */ +struct mb_cache_entry * +mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) +{ + struct mb_cache_entry *ce = NULL; + + if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { + spin_lock(&mb_cache_spinlock); + if (!list_empty(&mb_cache_lru_list)) { + ce = list_entry(mb_cache_lru_list.next, + struct mb_cache_entry, e_lru_list); + list_del_init(&ce->e_lru_list); + __mb_cache_entry_unhash(ce); + } + spin_unlock(&mb_cache_spinlock); + } + if (!ce) { + ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); + if (!ce) + return NULL; + atomic_inc(&cache->c_entry_count); + INIT_LIST_HEAD(&ce->e_lru_list); + INIT_LIST_HEAD(&ce->e_block_list); + ce->e_cache = cache; + ce->e_queued = 0; + } + ce->e_used = 1 + MB_CACHE_WRITER; + return ce; +} + + +/* + * mb_cache_entry_insert() + * + * Inserts an entry that was allocated using mb_cache_entry_alloc() into + * the cache. After this, the cache entry can be looked up, but is not yet + * in the lru list as the caller still holds a handle to it. Returns 0 on + * success, or -EBUSY if a cache entry for that device + inode exists + * already (this may happen after a failed lookup, but when another process + * has inserted the same cache entry in the meantime). 
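+ * On -EBUSY the caller still holds its handle to the (never hashed)
+ * entry; dropping that handle with mb_cache_entry_release() frees the
+ * entry again, so no separate cleanup is needed.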
+ * + * @bdev: device the cache entry belongs to + * @block: block number + * @key: lookup key + */ +int +mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, + sector_t block, unsigned int key) +{ + struct mb_cache *cache = ce->e_cache; + unsigned int bucket; + struct list_head *l; + int error = -EBUSY; + + bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), + cache->c_bucket_bits); + spin_lock(&mb_cache_spinlock); + list_for_each_prev(l, &cache->c_block_hash[bucket]) { + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_block_list); + if (ce->e_bdev == bdev && ce->e_block == block) + goto out; + } + __mb_cache_entry_unhash(ce); + ce->e_bdev = bdev; + ce->e_block = block; + list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); + ce->e_index.o_key = key; + bucket = hash_long(key, cache->c_bucket_bits); + list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); + error = 0; +out: + spin_unlock(&mb_cache_spinlock); + return error; +} + + +/* + * mb_cache_entry_release() + * + * Release a handle to a cache entry. When the last handle to a cache entry + * is released it is either freed (if it is invalid) or otherwise inserted + * in to the lru list. + */ +void +mb_cache_entry_release(struct mb_cache_entry *ce) +{ + spin_lock(&mb_cache_spinlock); + __mb_cache_entry_release_unlock(ce); +} + + +/* + * mb_cache_entry_free() + * + * This is equivalent to the sequence mb_cache_entry_takeout() -- + * mb_cache_entry_release(). + */ +void +mb_cache_entry_free(struct mb_cache_entry *ce) +{ + spin_lock(&mb_cache_spinlock); + mb_assert(list_empty(&ce->e_lru_list)); + __mb_cache_entry_unhash(ce); + __mb_cache_entry_release_unlock(ce); +} + + +/* + * mb_cache_entry_get() + * + * Get a cache entry by device / block number. (There can only be one entry + * in the cache per device and block.) Returns NULL if no such cache entry + * exists. The returned cache entry is locked for exclusive access ("single + * writer"). 
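+ * The lookup waits (uninterruptibly) until every other holder has
+ * dropped the entry, and the returned handle counts as the single
+ * writer, so it must be dropped again with mb_cache_entry_release()
+ * or mb_cache_entry_free() before anyone else can obtain the entry.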
+ */ +struct mb_cache_entry * +mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, + sector_t block) +{ + unsigned int bucket; + struct list_head *l; + struct mb_cache_entry *ce; + + bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), + cache->c_bucket_bits); + spin_lock(&mb_cache_spinlock); + list_for_each(l, &cache->c_block_hash[bucket]) { + ce = list_entry(l, struct mb_cache_entry, e_block_list); + if (ce->e_bdev == bdev && ce->e_block == block) { + DEFINE_WAIT(wait); + + if (!list_empty(&ce->e_lru_list)) + list_del_init(&ce->e_lru_list); + + while (ce->e_used > 0) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&mb_cache_spinlock); + schedule(); + spin_lock(&mb_cache_spinlock); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + ce->e_used += 1 + MB_CACHE_WRITER; + + if (!__mb_cache_entry_is_hashed(ce)) { + __mb_cache_entry_release_unlock(ce); + return NULL; + } + goto cleanup; + } + } + ce = NULL; + +cleanup: + spin_unlock(&mb_cache_spinlock); + return ce; +} + +#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) + +static struct mb_cache_entry * +__mb_cache_entry_find(struct list_head *l, struct list_head *head, + struct block_device *bdev, unsigned int key) +{ + while (l != head) { + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_index.o_list); + if (ce->e_bdev == bdev && ce->e_index.o_key == key) { + DEFINE_WAIT(wait); + + if (!list_empty(&ce->e_lru_list)) + list_del_init(&ce->e_lru_list); + + /* Incrementing before holding the lock gives readers + priority over writers. */ + ce->e_used++; + while (ce->e_used >= MB_CACHE_WRITER) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&mb_cache_spinlock); + schedule(); + spin_lock(&mb_cache_spinlock); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + + if (!__mb_cache_entry_is_hashed(ce)) { + __mb_cache_entry_release_unlock(ce); + spin_lock(&mb_cache_spinlock); + return ERR_PTR(-EAGAIN); + } + return ce; + } + l = l->next; + } + return NULL; +} + + +/* + * mb_cache_entry_find_first() + * + * Find the first cache entry on a given device with a certain key in + * an additional index. Additional matches can be found with + * mb_cache_entry_find_next(). Returns NULL if no match was found. The + * returned cache entry is locked for shared access ("multiple readers"). + * + * @cache: the cache to search + * @bdev: the device the cache entry should belong to + * @key: the key in the index + */ +struct mb_cache_entry * +mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, + unsigned int key) +{ + unsigned int bucket = hash_long(key, cache->c_bucket_bits); + struct list_head *l; + struct mb_cache_entry *ce; + + spin_lock(&mb_cache_spinlock); + l = cache->c_index_hash[bucket].next; + ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); + spin_unlock(&mb_cache_spinlock); + return ce; +} + + +/* + * mb_cache_entry_find_next() + * + * Find the next cache entry on a given device with a certain key in an + * additional index. Returns NULL if no match could be found. The previous + * entry is atomatically released, so that mb_cache_entry_find_next() can + * be called like this: + * + * entry = mb_cache_entry_find_first(); + * while (entry) { + * ... 
+ * entry = mb_cache_entry_find_next(entry, ...); + * } + * + * @prev: The previous match + * @bdev: the device the cache entry should belong to + * @key: the key in the index + */ +struct mb_cache_entry * +mb_cache_entry_find_next(struct mb_cache_entry *prev, + struct block_device *bdev, unsigned int key) +{ + struct mb_cache *cache = prev->e_cache; + unsigned int bucket = hash_long(key, cache->c_bucket_bits); + struct list_head *l; + struct mb_cache_entry *ce; + + spin_lock(&mb_cache_spinlock); + l = prev->e_index.o_list.next; + ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); + __mb_cache_entry_release_unlock(prev); + return ce; +} + +#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ + +static int __init init_mbcache(void) +{ + register_shrinker(&mb_cache_shrinker); + return 0; +} + +static void __exit exit_mbcache(void) +{ + unregister_shrinker(&mb_cache_shrinker); +} + +module_init(init_mbcache) +module_exit(exit_mbcache) + diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig new file mode 100644 index 00000000..6624684d --- /dev/null +++ b/fs/minix/Kconfig @@ -0,0 +1,25 @@ +config MINIX_FS + tristate "Minix file system support" + depends on BLOCK + help + Minix is a simple operating system used in many classes about OS's. + The minix file system (method to organize files on a hard disk + partition or a floppy disk) was the original file system for Linux, + but has been superseded by the second extended file system ext2fs. + You don't want to use the minix file system on your hard disk + because of certain built-in restrictions, but it is sometimes found + on older Linux floppy disks. This option will enlarge your kernel + by about 28 KB. If unsure, say N. + + To compile this file system support as a module, choose M here: the + module will be called minix. Note that the file system of your root + partition (the one containing the directory /) cannot be compiled as + a module. + +config MINIX_FS_NATIVE_ENDIAN + def_bool MINIX_FS + depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) + +config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED + def_bool MINIX_FS + depends on M68K && MMU diff --git a/fs/minix/Makefile b/fs/minix/Makefile new file mode 100644 index 00000000..3063015a --- /dev/null +++ b/fs/minix/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux minix filesystem routines. 
+# + +obj-$(CONFIG_MINIX_FS) += minix.o + +minix-objs := bitmap.o itree_v1.o itree_v2.o namei.o inode.o file.o dir.o diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c new file mode 100644 index 00000000..3f32bcb0 --- /dev/null +++ b/fs/minix/bitmap.c @@ -0,0 +1,279 @@ +/* + * linux/fs/minix/bitmap.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * Modified for 680x0 by Hamish Macdonald + * Fixed for 680x0 by Andreas Schwab + */ + +/* bitmap.c contains the code that handles the inode and block bitmaps */ + +#include "minix.h" +#include +#include +#include + +static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; + +static DEFINE_SPINLOCK(bitmap_lock); + +static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) +{ + unsigned i, j, sum = 0; + struct buffer_head *bh; + + for (i=0; ib_size; j++) + sum += nibblemap[bh->b_data[j] & 0xf] + + nibblemap[(bh->b_data[j]>>4) & 0xf]; + } + + if (numblocks==0 || !(bh=map[numblocks-1])) + return(0); + i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; + for (j=0; jb_data[j] & 0xf] + + nibblemap[(bh->b_data[j]>>4) & 0xf]; + } + + i = numbits%16; + if (i!=0) { + i = *(__u16 *)(&bh->b_data[j]) | ~((1<>4) & 0xf]; + sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf]; + } + return(sum); +} + +void minix_free_block(struct inode *inode, unsigned long block) +{ + struct super_block *sb = inode->i_sb; + struct minix_sb_info *sbi = minix_sb(sb); + struct buffer_head *bh; + int k = sb->s_blocksize_bits + 3; + unsigned long bit, zone; + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { + printk("Trying to free block not in datazone\n"); + return; + } + zone = block - sbi->s_firstdatazone + 1; + bit = zone & ((1<>= k; + if (zone >= sbi->s_zmap_blocks) { + printk("minix_free_block: nonexistent bitmap buffer\n"); + return; + } + bh = sbi->s_zmap[zone]; + spin_lock(&bitmap_lock); + if (!minix_test_and_clear_bit(bit, bh->b_data)) + printk("minix_free_block (%s:%lu): bit already cleared\n", + sb->s_id, block); + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); + return; +} + +int minix_new_block(struct inode * inode) +{ + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + int bits_per_zone = 8 * inode->i_sb->s_blocksize; + int i; + + for (i = 0; i < sbi->s_zmap_blocks; i++) { + struct buffer_head *bh = sbi->s_zmap[i]; + int j; + + spin_lock(&bitmap_lock); + j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); + if (j < bits_per_zone) { + minix_set_bit(j, bh->b_data); + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); + j += i * bits_per_zone + sbi->s_firstdatazone-1; + if (j < sbi->s_firstdatazone || j >= sbi->s_nzones) + break; + return j; + } + spin_unlock(&bitmap_lock); + } + return 0; +} + +unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) +{ + return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, + sbi->s_nzones - sbi->s_firstdatazone + 1) + << sbi->s_log_zone_size); +} + +struct minix_inode * +minix_V1_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh) +{ + int block; + struct minix_sb_info *sbi = minix_sb(sb); + struct minix_inode *p; + + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %ld is out of range\n", + sb->s_id, (long)ino); + return NULL; + } + ino--; + block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks + + ino / MINIX_INODES_PER_BLOCK; + *bh = sb_bread(sb, block); + if (!*bh) { + printk("Unable to read inode block\n"); + return NULL; + } + p = (void *)(*bh)->b_data; + return p + ino % 
MINIX_INODES_PER_BLOCK; +} + +struct minix2_inode * +minix_V2_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh) +{ + int block; + struct minix_sb_info *sbi = minix_sb(sb); + struct minix2_inode *p; + int minix2_inodes_per_block = sb->s_blocksize / sizeof(struct minix2_inode); + + *bh = NULL; + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %ld is out of range\n", + sb->s_id, (long)ino); + return NULL; + } + ino--; + block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks + + ino / minix2_inodes_per_block; + *bh = sb_bread(sb, block); + if (!*bh) { + printk("Unable to read inode block\n"); + return NULL; + } + p = (void *)(*bh)->b_data; + return p + ino % minix2_inodes_per_block; +} + +/* Clear the link count and mode of a deleted inode on disk. */ + +static void minix_clear_inode(struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (INODE_VERSION(inode) == MINIX_V1) { + struct minix_inode *raw_inode; + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (raw_inode) { + raw_inode->i_nlinks = 0; + raw_inode->i_mode = 0; + } + } else { + struct minix2_inode *raw_inode; + raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (raw_inode) { + raw_inode->i_nlinks = 0; + raw_inode->i_mode = 0; + } + } + if (bh) { + mark_buffer_dirty(bh); + brelse (bh); + } +} + +void minix_free_inode(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + struct buffer_head *bh; + int k = sb->s_blocksize_bits + 3; + unsigned long ino, bit; + + ino = inode->i_ino; + if (ino < 1 || ino > sbi->s_ninodes) { + printk("minix_free_inode: inode 0 or nonexistent inode\n"); + return; + } + bit = ino & ((1<>= k; + if (ino >= sbi->s_imap_blocks) { + printk("minix_free_inode: nonexistent imap in superblock\n"); + return; + } + + minix_clear_inode(inode); /* clear on-disk copy */ + + bh = sbi->s_imap[ino]; + spin_lock(&bitmap_lock); + if (!minix_test_and_clear_bit(bit, bh->b_data)) + printk("minix_free_inode: bit %lu already cleared\n", bit); + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); +} + +struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) +{ + struct super_block *sb = dir->i_sb; + struct minix_sb_info *sbi = minix_sb(sb); + struct inode *inode = new_inode(sb); + struct buffer_head * bh; + int bits_per_zone = 8 * sb->s_blocksize; + unsigned long j; + int i; + + if (!inode) { + *error = -ENOMEM; + return NULL; + } + j = bits_per_zone; + bh = NULL; + *error = -ENOSPC; + spin_lock(&bitmap_lock); + for (i = 0; i < sbi->s_imap_blocks; i++) { + bh = sbi->s_imap[i]; + j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); + if (j < bits_per_zone) + break; + } + if (!bh || j >= bits_per_zone) { + spin_unlock(&bitmap_lock); + iput(inode); + return NULL; + } + if (minix_test_and_set_bit(j, bh->b_data)) { /* shouldn't happen */ + spin_unlock(&bitmap_lock); + printk("minix_new_inode: bit already set\n"); + iput(inode); + return NULL; + } + spin_unlock(&bitmap_lock); + mark_buffer_dirty(bh); + j += i * bits_per_zone; + if (!j || j > sbi->s_ninodes) { + iput(inode); + return NULL; + } + inode_init_owner(inode, dir, mode); + inode->i_ino = j; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_blocks = 0; + memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); + insert_inode_hash(inode); + mark_inode_dirty(inode); + + *error = 0; + return inode; +} + +unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) +{ + 
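	/*
+	 * count_free() above sums zero bits a nibble at a time through the
+	 * nibblemap[] table - e.g. nibblemap[0x5] == 2 since 0101 has two
+	 * clear bits - and the "+ 1" passed below covers the reserved bit
+	 * for inode 0 at the start of the inode map.
+	 */ +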
return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); +} diff --git a/fs/minix/dir.c b/fs/minix/dir.c new file mode 100644 index 00000000..085a9262 --- /dev/null +++ b/fs/minix/dir.c @@ -0,0 +1,477 @@ +/* + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * minix directory handling functions + * + * Updated to filesystem version 3 by Daniel Aragones + */ + +#include "minix.h" +#include +#include +#include + +typedef struct minix_dir_entry minix_dirent; +typedef struct minix3_dir_entry minix3_dirent; + +static int minix_readdir(struct file *, void *, filldir_t); + +const struct file_operations minix_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = minix_readdir, + .fsync = generic_file_fsync, +}; + +static inline void dir_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned +minix_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = PAGE_CACHE_SIZE; + + if (page_nr == (inode->i_size >> PAGE_CACHE_SHIFT)) + last_byte = inode->i_size & (PAGE_CACHE_SIZE - 1); + return last_byte; +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + block_write_end(NULL, mapping, pos, len, len, page, NULL); + + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} + +static struct page * dir_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) + kmap(page); + return page; +} + +static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) +{ + return (void*)((char*)de + sbi->s_dirsize); +} + +static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + unsigned long pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + struct minix_sb_info *sbi = minix_sb(sb); + unsigned chunk_size = sbi->s_dirsize; + char *name; + __u32 inumber; + + pos = (pos + chunk_size-1) & ~(chunk_size-1); + if (pos >= inode->i_size) + goto done; + + for ( ; n < npages; n++, offset = 0) { + char *p, *kaddr, *limit; + struct page *page = dir_get_page(inode, n); + + if (IS_ERR(page)) + continue; + kaddr = (char *)page_address(page); + p = kaddr+offset; + limit = kaddr + minix_last_byte(inode, n) - chunk_size; + for ( ; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + name = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + name = de->name; + inumber = de->inode; + } + if (inumber) { + int over; + + unsigned l = strnlen(name, sbi->s_namelen); + offset = p - kaddr; + over = filldir(dirent, name, l, + (n << PAGE_CACHE_SHIFT) | offset, + inumber, DT_UNKNOWN); + if (over) { + dir_put_page(page); + goto done; + } + } + } + dir_put_page(page); 
+ } + +done: + filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; + return 0; +} + +static inline int namecompare(int len, int maxlen, + const char * name, const char * buffer) +{ + if (len < maxlen && buffer[len]) + return 0; + return !memcmp(name, buffer, len); +} + +/* + * minix_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + */ +minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) +{ + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct inode * dir = dentry->d_parent->d_inode; + struct super_block * sb = dir->i_sb; + struct minix_sb_info * sbi = minix_sb(sb); + unsigned long n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + char *p; + + char *namx; + __u32 inumber; + *res_page = NULL; + + for (n = 0; n < npages; n++) { + char *kaddr, *limit; + + page = dir_get_page(dir, n); + if (IS_ERR(page)) + continue; + + kaddr = (char*)page_address(page); + limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + namx = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + namx = de->name; + inumber = de->inode; + } + if (!inumber) + continue; + if (namecompare(namelen, sbi->s_namelen, name, namx)) + goto found; + } + dir_put_page(page); + } + return NULL; + +found: + *res_page = page; + return (minix_dirent *)p; +} + +int minix_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct super_block * sb = dir->i_sb; + struct minix_sb_info * sbi = minix_sb(sb); + struct page *page = NULL; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr, *p; + minix_dirent *de; + minix3_dirent *de3; + loff_t pos; + int err; + char *namx = NULL; + __u32 inumber; + + /* + * We take care of directory expansion in the same loop + * This code plays outside i_size, so it locks the page + * to protect that region. 
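+	 * Note that the loop below runs to n == npages inclusive: the page
+	 * just past the current end of the directory is where a new entry
+	 * lands once every existing slot is in use.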
+ */ + for (n = 0; n <= npages; n++) { + char *limit, *dir_end; + + page = dir_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = (char*)page_address(page); + dir_end = kaddr + minix_last_byte(dir, n); + limit = kaddr + PAGE_CACHE_SIZE - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + de = (minix_dirent *)p; + de3 = (minix3_dirent *)p; + if (sbi->s_version == MINIX_V3) { + namx = de3->name; + inumber = de3->inode; + } else { + namx = de->name; + inumber = de->inode; + } + if (p == dir_end) { + /* We hit i_size */ + if (sbi->s_version == MINIX_V3) + de3->inode = 0; + else + de->inode = 0; + goto got_it; + } + if (!inumber) + goto got_it; + err = -EEXIST; + if (namecompare(namelen, sbi->s_namelen, name, namx)) + goto out_unlock; + } + unlock_page(page); + dir_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + p - (char *)page_address(page); + err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + if (err) + goto out_unlock; + memcpy (namx, name, namelen); + if (sbi->s_version == MINIX_V3) { + memset (namx + namelen, 0, sbi->s_dirsize - namelen - 4); + de3->inode = inode->i_ino; + } else { + memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); + de->inode = inode->i_ino; + } + err = dir_commit_chunk(page, pos, sbi->s_dirsize); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +out_put: + dir_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +int minix_delete_entry(struct minix_dir_entry *de, struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr = page_address(page); + loff_t pos = page_offset(page) + (char*)de - kaddr; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + unsigned len = sbi->s_dirsize; + int err; + + lock_page(page); + err = minix_prepare_chunk(page, pos, len); + if (err == 0) { + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *) de)->inode = 0; + else + de->inode = 0; + err = dir_commit_chunk(page, pos, len); + } else { + unlock_page(page); + } + dir_put_page(page); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return err; +} + +int minix_make_empty(struct inode *inode, struct inode *dir) +{ + struct page *page = grab_cache_page(inode->i_mapping, 0); + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + char *kaddr; + int err; + + if (!page) + return -ENOMEM; + err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize); + if (err) { + unlock_page(page); + goto fail; + } + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, PAGE_CACHE_SIZE); + + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)kaddr; + + de3->inode = inode->i_ino; + strcpy(de3->name, "."); + de3 = minix_next_entry(de3, sbi); + de3->inode = dir->i_ino; + strcpy(de3->name, ".."); + } else { + minix_dirent *de = (minix_dirent *)kaddr; + + de->inode = inode->i_ino; + strcpy(de->name, "."); + de = minix_next_entry(de, sbi); + de->inode = dir->i_ino; + strcpy(de->name, ".."); + } + kunmap_atomic(kaddr, KM_USER0); + + err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int minix_empty_dir(struct inode * inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + char *name; + __u32 inumber; + + for (i = 0; i < npages; i++) { 
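+		/*
+		 * An in-use slot is only tolerated if it is "." pointing back
+		 * at this inode, or a ".." entry (matched by name alone);
+		 * anything else means the directory is not empty.
+		 */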
+ char *p, *kaddr, *limit; + + page = dir_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = (char *)page_address(page); + limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + name = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + name = de->name; + inumber = de->inode; + } + + if (inumber != 0) { + /* check for . and .. */ + if (name[0] != '.') + goto not_empty; + if (!name[1]) { + if (inumber != inode->i_ino) + goto not_empty; + } else if (name[1] != '.') + goto not_empty; + else if (name[2]) + goto not_empty; + } + } + dir_put_page(page); + } + return 1; + +not_empty: + dir_put_page(page); + return 0; +} + +/* Releases the page */ +void minix_set_link(struct minix_dir_entry *de, struct page *page, + struct inode *inode) +{ + struct inode *dir = page->mapping->host; + struct minix_sb_info *sbi = minix_sb(dir->i_sb); + loff_t pos = page_offset(page) + + (char *)de-(char*)page_address(page); + int err; + + lock_page(page); + + err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + if (err == 0) { + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *) de)->inode = inode->i_ino; + else + de->inode = inode->i_ino; + err = dir_commit_chunk(page, pos, sbi->s_dirsize); + } else { + unlock_page(page); + } + dir_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} + +struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) +{ + struct page *page = dir_get_page(dir, 0); + struct minix_sb_info *sbi = minix_sb(dir->i_sb); + struct minix_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = minix_next_entry(page_address(page), sbi); + *p = page; + } + return de; +} + +ino_t minix_inode_by_name(struct dentry *dentry) +{ + struct page *page; + struct minix_dir_entry *de = minix_find_entry(dentry, &page); + ino_t res = 0; + + if (de) { + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + + if (sbi->s_version == MINIX_V3) + res = ((minix3_dirent *) de)->inode; + else + res = de->inode; + dir_put_page(page); + } + return res; +} diff --git a/fs/minix/file.c b/fs/minix/file.c new file mode 100644 index 00000000..4493ce69 --- /dev/null +++ b/fs/minix/file.c @@ -0,0 +1,51 @@ +/* + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * minix regular file handling primitives + */ + +#include "minix.h" + +/* + * We have mostly NULLs here: the current defaults are OK for + * the minix filesystem. 
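[Editorial note on the record format the directory code above walks: the scan advances in fixed steps of sbi->s_dirsize bytes, and minix_add_link() clears the name tail with s_dirsize - namelen - 2 for V1/V2 but - 4 for V3, i.e. the inode number is 16 bits in the old formats and 32 bits in V3. A sketch of the implied on-disk records; the struct names here are illustrative, not the kernel's own definitions:]

struct sketch_dirent_v12 {              /* s_dirsize = 16 or 32 bytes */
        unsigned short inode;           /* 0 marks a free slot */
        char name[14];                  /* or 30; NUL-padded to the record end */
};

struct sketch_dirent_v3 {               /* s_dirsize = 64 bytes */
        unsigned int inode;
        char name[60];                  /* s_namelen = 60, NUL-padded */
};

[minix_make_empty() above simply drops two such records, "." and "..", at the start of the first page of a new directory.]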
+ */ +const struct file_operations minix_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int minix_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations minix_file_inode_operations = { + .truncate = minix_truncate, + .setattr = minix_setattr, + .getattr = minix_getattr, +}; diff --git a/fs/minix/inode.c b/fs/minix/inode.c new file mode 100644 index 00000000..adcdc0a4 --- /dev/null +++ b/fs/minix/inode.c @@ -0,0 +1,661 @@ +/* + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Copyright (C) 1996 Gertjan van Wingerde + * Minix V2 fs support. + * + * Modified for 680x0 by Andreas Schwab + * Updated to filesystem version 3 by Daniel Aragones + */ + +#include +#include "minix.h" +#include +#include +#include +#include +#include +#include + +static int minix_write_inode(struct inode *inode, + struct writeback_control *wbc); +static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); +static int minix_remount (struct super_block * sb, int * flags, char * data); + +static void minix_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + if (!inode->i_nlink) { + inode->i_size = 0; + minix_truncate(inode); + } + invalidate_inode_buffers(inode); + end_writeback(inode); + if (!inode->i_nlink) + minix_free_inode(inode); +} + +static void minix_put_super(struct super_block *sb) +{ + int i; + struct minix_sb_info *sbi = minix_sb(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ + sbi->s_ms->s_state = sbi->s_mount_state; + mark_buffer_dirty(sbi->s_sbh); + } + for (i = 0; i < sbi->s_imap_blocks; i++) + brelse(sbi->s_imap[i]); + for (i = 0; i < sbi->s_zmap_blocks; i++) + brelse(sbi->s_zmap[i]); + brelse (sbi->s_sbh); + kfree(sbi->s_imap); + sb->s_fs_info = NULL; + kfree(sbi); +} + +static struct kmem_cache * minix_inode_cachep; + +static struct inode *minix_alloc_inode(struct super_block *sb) +{ + struct minix_inode_info *ei; + ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void minix_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(minix_inode_cachep, minix_i(inode)); +} + +static void minix_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, minix_i_callback); +} + +static void init_once(void *foo) +{ + struct minix_inode_info *ei = (struct minix_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + minix_inode_cachep = kmem_cache_create("minix_inode_cache", + sizeof(struct minix_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (minix_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + 
kmem_cache_destroy(minix_inode_cachep); +} + +static const struct super_operations minix_sops = { + .alloc_inode = minix_alloc_inode, + .destroy_inode = minix_destroy_inode, + .write_inode = minix_write_inode, + .evict_inode = minix_evict_inode, + .put_super = minix_put_super, + .statfs = minix_statfs, + .remount_fs = minix_remount, +}; + +static int minix_remount (struct super_block * sb, int * flags, char * data) +{ + struct minix_sb_info * sbi = minix_sb(sb); + struct minix_super_block * ms; + + ms = sbi->s_ms; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + if (*flags & MS_RDONLY) { + if (ms->s_state & MINIX_VALID_FS || + !(sbi->s_mount_state & MINIX_VALID_FS)) + return 0; + /* Mounting a rw partition read-only. */ + if (sbi->s_version != MINIX_V3) + ms->s_state = sbi->s_mount_state; + mark_buffer_dirty(sbi->s_sbh); + } else { + /* Mount a partition which is read-only, read-write. */ + if (sbi->s_version != MINIX_V3) { + sbi->s_mount_state = ms->s_state; + ms->s_state &= ~MINIX_VALID_FS; + } else { + sbi->s_mount_state = MINIX_VALID_FS; + } + mark_buffer_dirty(sbi->s_sbh); + + if (!(sbi->s_mount_state & MINIX_VALID_FS)) + printk("MINIX-fs warning: remounting unchecked fs, " + "running fsck is recommended\n"); + else if ((sbi->s_mount_state & MINIX_ERROR_FS)) + printk("MINIX-fs warning: remounting fs with errors, " + "running fsck is recommended\n"); + } + return 0; +} + +static int minix_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh; + struct buffer_head **map; + struct minix_super_block *ms; + struct minix3_super_block *m3s = NULL; + unsigned long i, block; + struct inode *root_inode; + struct minix_sb_info *sbi; + int ret = -EINVAL; + + sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + + BUILD_BUG_ON(32 != sizeof (struct minix_inode)); + BUILD_BUG_ON(64 != sizeof(struct minix2_inode)); + + if (!sb_set_blocksize(s, BLOCK_SIZE)) + goto out_bad_hblock; + + if (!(bh = sb_bread(s, 1))) + goto out_bad_sb; + + ms = (struct minix_super_block *) bh->b_data; + sbi->s_ms = ms; + sbi->s_sbh = bh; + sbi->s_mount_state = ms->s_state; + sbi->s_ninodes = ms->s_ninodes; + sbi->s_nzones = ms->s_nzones; + sbi->s_imap_blocks = ms->s_imap_blocks; + sbi->s_zmap_blocks = ms->s_zmap_blocks; + sbi->s_firstdatazone = ms->s_firstdatazone; + sbi->s_log_zone_size = ms->s_log_zone_size; + sbi->s_max_size = ms->s_max_size; + s->s_magic = ms->s_magic; + if (s->s_magic == MINIX_SUPER_MAGIC) { + sbi->s_version = MINIX_V1; + sbi->s_dirsize = 16; + sbi->s_namelen = 14; + sbi->s_link_max = MINIX_LINK_MAX; + } else if (s->s_magic == MINIX_SUPER_MAGIC2) { + sbi->s_version = MINIX_V1; + sbi->s_dirsize = 32; + sbi->s_namelen = 30; + sbi->s_link_max = MINIX_LINK_MAX; + } else if (s->s_magic == MINIX2_SUPER_MAGIC) { + sbi->s_version = MINIX_V2; + sbi->s_nzones = ms->s_zones; + sbi->s_dirsize = 16; + sbi->s_namelen = 14; + sbi->s_link_max = MINIX2_LINK_MAX; + } else if (s->s_magic == MINIX2_SUPER_MAGIC2) { + sbi->s_version = MINIX_V2; + sbi->s_nzones = ms->s_zones; + sbi->s_dirsize = 32; + sbi->s_namelen = 30; + sbi->s_link_max = MINIX2_LINK_MAX; + } else if ( *(__u16 *)(bh->b_data + 24) == MINIX3_SUPER_MAGIC) { + m3s = (struct minix3_super_block *) bh->b_data; + s->s_magic = m3s->s_magic; + sbi->s_imap_blocks = m3s->s_imap_blocks; + sbi->s_zmap_blocks = m3s->s_zmap_blocks; + sbi->s_firstdatazone = m3s->s_firstdatazone; + sbi->s_log_zone_size = m3s->s_log_zone_size; + sbi->s_max_size = m3s->s_max_size; + 
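[The magic-number dispatch above turns the superblock read from 1 KB block 1 (byte offset 1024) into per-version parameters. A standalone userspace probe doing the same classification can be handy for checking an image before mounting; the magic values and the byte offset 16 of s_magic come from <linux/minix_fs.h> and the classic V1/V2 superblock layout, so treat them as assumptions of this sketch, while the offset 24 for the V3 magic matches the test above. Little-endian host assumed.]

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define MINIX_SUPER_MAGIC    0x137F     /* V1, 14-char names */
#define MINIX_SUPER_MAGIC2   0x138F     /* V1, 30-char names */
#define MINIX2_SUPER_MAGIC   0x2468     /* V2, 14-char names */
#define MINIX2_SUPER_MAGIC2  0x2478     /* V2, 30-char names */
#define MINIX3_SUPER_MAGIC   0x4d5a     /* V3 */

/* 'sb' is the 1024-byte block at byte offset 1024 of the device,
 * i.e. what sb_bread(s, 1) returns in the code above. */
static const char *minix_flavour(const unsigned char *sb)
{
        uint16_t magic, magic3;

        memcpy(&magic, sb + 16, sizeof(magic));   /* s_magic, V1/V2 layout */
        memcpy(&magic3, sb + 24, sizeof(magic3)); /* where fill_super checks V3 */

        switch (magic) {
        case MINIX_SUPER_MAGIC:   return "V1, 14-char names";
        case MINIX_SUPER_MAGIC2:  return "V1, 30-char names";
        case MINIX2_SUPER_MAGIC:  return "V2, 14-char names";
        case MINIX2_SUPER_MAGIC2: return "V2, 30-char names";
        }
        return magic3 == MINIX3_SUPER_MAGIC ? "V3" : "not a Minix superblock";
}

int main(void)
{
        unsigned char sb[1024] = { 0 };

        sb[24] = 0x5a;          /* fake a V3 magic for the demo */
        sb[25] = 0x4d;
        puts(minix_flavour(sb));
        return 0;
}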
sbi->s_ninodes = m3s->s_ninodes; + sbi->s_nzones = m3s->s_zones; + sbi->s_dirsize = 64; + sbi->s_namelen = 60; + sbi->s_version = MINIX_V3; + sbi->s_link_max = MINIX2_LINK_MAX; + sbi->s_mount_state = MINIX_VALID_FS; + sb_set_blocksize(s, m3s->s_blocksize); + } else + goto out_no_fs; + + /* + * Allocate the buffer map to keep the superblock small. + */ + if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) + goto out_illegal_sb; + i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); + map = kzalloc(i, GFP_KERNEL); + if (!map) + goto out_no_map; + sbi->s_imap = &map[0]; + sbi->s_zmap = &map[sbi->s_imap_blocks]; + + block=2; + for (i=0 ; i < sbi->s_imap_blocks ; i++) { + if (!(sbi->s_imap[i]=sb_bread(s, block))) + goto out_no_bitmap; + block++; + } + for (i=0 ; i < sbi->s_zmap_blocks ; i++) { + if (!(sbi->s_zmap[i]=sb_bread(s, block))) + goto out_no_bitmap; + block++; + } + + minix_set_bit(0,sbi->s_imap[0]->b_data); + minix_set_bit(0,sbi->s_zmap[0]->b_data); + + /* set up enough so that it can read an inode */ + s->s_op = &minix_sops; + root_inode = minix_iget(s, MINIX_ROOT_INO); + if (IS_ERR(root_inode)) { + ret = PTR_ERR(root_inode); + goto out_no_root; + } + + ret = -ENOMEM; + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) + goto out_iput; + + if (!(s->s_flags & MS_RDONLY)) { + if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ + ms->s_state &= ~MINIX_VALID_FS; + mark_buffer_dirty(bh); + } + if (!(sbi->s_mount_state & MINIX_VALID_FS)) + printk("MINIX-fs: mounting unchecked file system, " + "running fsck is recommended\n"); + else if (sbi->s_mount_state & MINIX_ERROR_FS) + printk("MINIX-fs: mounting file system with errors, " + "running fsck is recommended\n"); + return 0; + +out_iput: + iput(root_inode); + goto out_freemap; + +out_no_root: + if (!silent) + printk("MINIX-fs: get root inode failed\n"); + goto out_freemap; + +out_no_bitmap: + printk("MINIX-fs: bad superblock or unable to read bitmaps\n"); +out_freemap: + for (i = 0; i < sbi->s_imap_blocks; i++) + brelse(sbi->s_imap[i]); + for (i = 0; i < sbi->s_zmap_blocks; i++) + brelse(sbi->s_zmap[i]); + kfree(sbi->s_imap); + goto out_release; + +out_no_map: + ret = -ENOMEM; + if (!silent) + printk("MINIX-fs: can't allocate map\n"); + goto out_release; + +out_illegal_sb: + if (!silent) + printk("MINIX-fs: bad superblock\n"); + goto out_release; + +out_no_fs: + if (!silent) + printk("VFS: Can't find a Minix filesystem V1 | V2 | V3 " + "on device %s.\n", s->s_id); +out_release: + brelse(bh); + goto out; + +out_bad_hblock: + printk("MINIX-fs: blocksize too small for device\n"); + goto out; + +out_bad_sb: + printk("MINIX-fs: unable to read superblock\n"); +out: + s->s_fs_info = NULL; + kfree(sbi); + return ret; +} + +static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct minix_sb_info *sbi = minix_sb(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; + buf->f_bfree = minix_count_free_blocks(sbi); + buf->f_bavail = buf->f_bfree; + buf->f_files = sbi->s_ninodes; + buf->f_ffree = minix_count_free_inodes(sbi); + buf->f_namelen = sbi->s_namelen; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static int minix_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + if (INODE_VERSION(inode) == MINIX_V1) + return 
V1_minix_get_block(inode, block, bh_result, create); + else + return V2_minix_get_block(inode, block, bh_result, create); +} + +static int minix_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, minix_get_block, wbc); +} + +static int minix_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,minix_get_block); +} + +int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, minix_get_block); +} + +static int minix_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + minix_get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t minix_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,minix_get_block); +} + +static const struct address_space_operations minix_aops = { + .readpage = minix_readpage, + .writepage = minix_writepage, + .write_begin = minix_write_begin, + .write_end = generic_write_end, + .bmap = minix_bmap +}; + +static const struct inode_operations minix_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = minix_getattr, +}; + +void minix_set_inode(struct inode *inode, dev_t rdev) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &minix_file_inode_operations; + inode->i_fop = &minix_file_operations; + inode->i_mapping->a_ops = &minix_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &minix_dir_inode_operations; + inode->i_fop = &minix_dir_operations; + inode->i_mapping->a_ops = &minix_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &minix_symlink_inode_operations; + inode->i_mapping->a_ops = &minix_aops; + } else + init_special_inode(inode, inode->i_mode, rdev); +} + +/* + * The minix V1 function to read an inode. + */ +static struct inode *V1_minix_iget(struct inode *inode) +{ + struct buffer_head * bh; + struct minix_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) { + iget_failed(inode); + return ERR_PTR(-EIO); + } + inode->i_mode = raw_inode->i_mode; + inode->i_uid = (uid_t)raw_inode->i_uid; + inode->i_gid = (gid_t)raw_inode->i_gid; + inode->i_nlink = raw_inode->i_nlinks; + inode->i_size = raw_inode->i_size; + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = 0; + for (i = 0; i < 9; i++) + minix_inode->u.i1_data[i] = raw_inode->i_zone[i]; + minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); + brelse(bh); + unlock_new_inode(inode); + return inode; +} + +/* + * The minix V2 function to read an inode. 
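[For reference while reading the two iget routines: the raw inode sizes asserted by the BUILD_BUG_ON()s in minix_fill_super() (32 and 64 bytes) correspond to the layouts below. Field order is taken from <linux/minix_fs.h>, so read it as an assumption of this note; what the code above shows directly is that V1 keeps one timestamp and nine zone pointers, while V2 (and, per the version checks, V3 as well) keeps three timestamps and ten.]

#include <stdint.h>

struct raw_minix_inode_v1 {     /* 32 bytes */
        uint16_t i_mode;
        uint16_t i_uid;
        uint32_t i_size;
        uint32_t i_time;        /* single timestamp, copied to a/c/mtime above */
        uint8_t  i_gid;
        uint8_t  i_nlinks;
        uint16_t i_zone[9];     /* 7 direct + single + double indirect */
};

struct raw_minix_inode_v2 {     /* 64 bytes, shared by V2 and V3 */
        uint16_t i_mode;
        uint16_t i_nlinks;
        uint16_t i_uid;
        uint16_t i_gid;
        uint32_t i_size;
        uint32_t i_atime;
        uint32_t i_mtime;
        uint32_t i_ctime;
        uint32_t i_zone[10];    /* 7 direct + single + double + triple */
};

[The 9 versus 10 zone slots line up with DEPTH = 3 in itree_v1.c and DEPTH = 4 in itree_v2.c later in this patch.]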
+ */ +static struct inode *V2_minix_iget(struct inode *inode) +{ + struct buffer_head * bh; + struct minix2_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) { + iget_failed(inode); + return ERR_PTR(-EIO); + } + inode->i_mode = raw_inode->i_mode; + inode->i_uid = (uid_t)raw_inode->i_uid; + inode->i_gid = (gid_t)raw_inode->i_gid; + inode->i_nlink = raw_inode->i_nlinks; + inode->i_size = raw_inode->i_size; + inode->i_mtime.tv_sec = raw_inode->i_mtime; + inode->i_atime.tv_sec = raw_inode->i_atime; + inode->i_ctime.tv_sec = raw_inode->i_ctime; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = 0; + for (i = 0; i < 10; i++) + minix_inode->u.i2_data[i] = raw_inode->i_zone[i]; + minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); + brelse(bh); + unlock_new_inode(inode); + return inode; +} + +/* + * The global function to read an inode. + */ +struct inode *minix_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + if (INODE_VERSION(inode) == MINIX_V1) + return V1_minix_iget(inode); + else + return V2_minix_iget(inode); +} + +/* + * The minix V1 function to synchronize an inode. + */ +static struct buffer_head * V1_minix_update_inode(struct inode * inode) +{ + struct buffer_head * bh; + struct minix_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) + return NULL; + raw_inode->i_mode = inode->i_mode; + raw_inode->i_uid = fs_high2lowuid(inode->i_uid); + raw_inode->i_gid = fs_high2lowgid(inode->i_gid); + raw_inode->i_nlinks = inode->i_nlink; + raw_inode->i_size = inode->i_size; + raw_inode->i_time = inode->i_mtime.tv_sec; + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); + else for (i = 0; i < 9; i++) + raw_inode->i_zone[i] = minix_inode->u.i1_data[i]; + mark_buffer_dirty(bh); + return bh; +} + +/* + * The minix V2 function to synchronize an inode. 
+ */ +static struct buffer_head * V2_minix_update_inode(struct inode * inode) +{ + struct buffer_head * bh; + struct minix2_inode * raw_inode; + struct minix_inode_info *minix_inode = minix_i(inode); + int i; + + raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); + if (!raw_inode) + return NULL; + raw_inode->i_mode = inode->i_mode; + raw_inode->i_uid = fs_high2lowuid(inode->i_uid); + raw_inode->i_gid = fs_high2lowgid(inode->i_gid); + raw_inode->i_nlinks = inode->i_nlink; + raw_inode->i_size = inode->i_size; + raw_inode->i_mtime = inode->i_mtime.tv_sec; + raw_inode->i_atime = inode->i_atime.tv_sec; + raw_inode->i_ctime = inode->i_ctime.tv_sec; + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); + else for (i = 0; i < 10; i++) + raw_inode->i_zone[i] = minix_inode->u.i2_data[i]; + mark_buffer_dirty(bh); + return bh; +} + +static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err = 0; + struct buffer_head *bh; + + if (INODE_VERSION(inode) == MINIX_V1) + bh = V1_minix_update_inode(inode); + else + bh = V2_minix_update_inode(inode); + if (!bh) + return -EIO; + if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk("IO error syncing minix inode [%s:%08lx]\n", + inode->i_sb->s_id, inode->i_ino); + err = -EIO; + } + } + brelse (bh); + return err; +} + +int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + generic_fillattr(dentry->d_inode, stat); + if (INODE_VERSION(dentry->d_inode) == MINIX_V1) + stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); + else + stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb); + stat->blksize = sb->s_blocksize; + return 0; +} + +/* + * The function that is called for file truncation. 
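[minix_getattr() above derives stat->blocks from V1_minix_blocks()/V2_minix_blocks(), which wrap nblocks() in itree_common.c further down: the count is data blocks plus the indirect blocks needed to reach them, scaled to 512-byte sectors. A standalone restatement of the V1 arithmetic, assuming 1 KB blocks, 7 direct zones and 512 16-bit pointers per indirect block (the itree_v1.c parameters):]

#include <stdio.h>

static unsigned v1_nblocks(unsigned long long size)
{
        unsigned blocks = (size + 1023) / 1024;         /* data blocks */
        unsigned res = blocks, direct = 7, depth = 3, ptrs = 512;

        while (--depth && blocks > direct) {
                blocks -= direct;
                blocks = (blocks + ptrs - 1) / ptrs;    /* indirect blocks added */
                res += blocks;
                direct = 1;
        }
        return res;
}

int main(void)
{
        /* A 100000-byte file needs 98 data blocks plus 1 single-indirect
         * block: 99 in total, which minix_getattr() reports as
         * 99 * (1024 / 512) = 198 sectors. */
        printf("%u\n", v1_nblocks(100000));
        return 0;
}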
+ */ +void minix_truncate(struct inode * inode) +{ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) + return; + if (INODE_VERSION(inode) == MINIX_V1) + V1_minix_truncate(inode); + else + V2_minix_truncate(inode); +} + +static struct dentry *minix_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super); +} + +static struct file_system_type minix_fs_type = { + .owner = THIS_MODULE, + .name = "minix", + .mount = minix_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_minix_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&minix_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_minix_fs(void) +{ + unregister_filesystem(&minix_fs_type); + destroy_inodecache(); +} + +module_init(init_minix_fs) +module_exit(exit_minix_fs) +MODULE_LICENSE("GPL"); + diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c new file mode 100644 index 00000000..a731cabf --- /dev/null +++ b/fs/minix/itree_common.c @@ -0,0 +1,364 @@ +/* Generic part */ + +typedef struct { + block_t *p; + block_t key; + struct buffer_head *bh; +} Indirect; + +static DEFINE_RWLOCK(pointers_lock); + +static inline void add_chain(Indirect *p, struct buffer_head *bh, block_t *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +static inline block_t *block_end(struct buffer_head *bh) +{ + return (block_t *)((char*)bh->b_data + bh->b_size); +} + +static inline Indirect *get_branch(struct inode *inode, + int depth, + int *offsets, + Indirect chain[DEPTH], + int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, i_data(inode) + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, block_to_cpu(p->key)); + if (!bh) + goto failure; + read_lock(&pointers_lock); + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (block_t *)bh->b_data + *++offsets); + read_unlock(&pointers_lock); + if (!p->key) + goto no_block; + } + return NULL; + +changed: + read_unlock(&pointers_lock); + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +static int alloc_branch(struct inode *inode, + int num, + int *offsets, + Indirect *branch) +{ + int n = 0; + int i; + int parent = minix_new_block(inode); + + branch[0].key = cpu_to_block(parent); + if (parent) for (n = 1; n < num; n++) { + struct buffer_head *bh; + /* Allocate the next block */ + int nr = minix_new_block(inode); + if (!nr) + break; + branch[n].key = cpu_to_block(nr); + bh = sb_getblk(inode->i_sb, parent); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + branch[n].bh = bh; + branch[n].p = (block_t*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + parent = nr; + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < n; i++) + minix_free_block(inode, block_to_cpu(branch[i].key)); + return -ENOSPC; +} + +static inline int 
splice_branch(struct inode *inode, + Indirect chain[DEPTH], + Indirect *where, + int num) +{ + int i; + + write_lock(&pointers_lock); + + /* Verify that place we are splicing to is still there and vacant */ + if (!verify_chain(chain, where-1) || *where->p) + goto changed; + + *where->p = where->key; + + write_unlock(&pointers_lock); + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME_SEC; + + /* had we spliced it onto indirect block? */ + if (where->bh) + mark_buffer_dirty_inode(where->bh, inode); + + mark_inode_dirty(inode); + return 0; + +changed: + write_unlock(&pointers_lock); + for (i = 1; i < num; i++) + bforget(where[i].bh); + for (i = 0; i < num; i++) + minix_free_block(inode, block_to_cpu(where[i].key)); + return -EAGAIN; +} + +static inline int get_block(struct inode * inode, sector_t block, + struct buffer_head *bh, int create) +{ + int err = -EIO; + int offsets[DEPTH]; + Indirect chain[DEPTH]; + Indirect *partial; + int left; + int depth = block_to_path(inode, block, offsets); + + if (depth == 0) + goto out; + +reread: + partial = get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { +got_it: + map_bh(bh, inode->i_sb, block_to_cpu(chain[depth-1].key)); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. 
+ */ + if (err == -EAGAIN) + goto changed; + + left = (chain + depth) - partial; + err = alloc_branch(inode, left, offsets+(partial-chain), partial); + if (err) + goto cleanup; + + if (splice_branch(inode, chain, partial, left) < 0) + goto changed; + + set_buffer_new(bh); + goto got_it; + +changed: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + goto reread; +} + +static inline int all_zeroes(block_t *p, block_t *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +static Indirect *find_shared(struct inode *inode, + int depth, + int offsets[DEPTH], + Indirect chain[DEPTH], + block_t *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = get_branch(inode, k, offsets, chain, &err); + + write_lock(&pointers_lock); + if (!partial) + partial = chain + k-1; + if (!partial->key && *partial->p) { + write_unlock(&pointers_lock); + goto no_top; + } + for (p=partial;p>chain && all_zeroes((block_t*)p->bh->b_data,p->p);p--) + ; + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + *p->p = 0; + } + write_unlock(&pointers_lock); + + while(partial > p) + { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +static inline void free_data(struct inode *inode, block_t *p, block_t *q) +{ + unsigned long nr; + + for ( ; p < q ; p++) { + nr = block_to_cpu(*p); + if (nr) { + *p = 0; + minix_free_block(inode, nr); + } + } +} + +static void free_branches(struct inode *inode, block_t *p, block_t *q, int depth) +{ + struct buffer_head * bh; + unsigned long nr; + + if (depth--) { + for ( ; p < q ; p++) { + nr = block_to_cpu(*p); + if (!nr) + continue; + *p = 0; + bh = sb_bread(inode->i_sb, nr); + if (!bh) + continue; + free_branches(inode, (block_t*)bh->b_data, + block_end(bh), depth); + bforget(bh); + minix_free_block(inode, nr); + mark_inode_dirty(inode); + } + } else + free_data(inode, p, q); +} + +static inline void truncate (struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + block_t *idata = i_data(inode); + int offsets[DEPTH]; + Indirect chain[DEPTH]; + Indirect *partial; + block_t nr = 0; + int n; + int first_whole; + long iblock; + + iblock = (inode->i_size + sb->s_blocksize -1) >> sb->s_blocksize_bits; + block_truncate_page(inode->i_mapping, inode->i_size, get_block); + + n = block_to_path(inode, iblock, offsets); + if (!n) + return; + + if (n == 1) { + free_data(inode, idata+offsets[0], idata + DIRECT); + first_whole = 0; + goto do_indirects; + } + + first_whole = offsets[0] + 1 - DIRECT; + partial = find_shared(inode, n, offsets, chain, &nr); + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + mark_buffer_dirty_inode(partial->bh, inode); + free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + free_branches(inode, partial->p + 1, block_end(partial->bh), + (chain+n-1) - partial); + mark_buffer_dirty_inode(partial->bh, inode); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + while (first_whole < DEPTH-1) { + nr = idata[DIRECT+first_whole]; + if (nr) { + idata[DIRECT+first_whole] = 0; + mark_inode_dirty(inode); + free_branches(inode, &nr, &nr+1, first_whole+1); + } + first_whole++; + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +static inline unsigned nblocks(loff_t size, struct super_block *sb) +{ + int k = sb->s_blocksize_bits - 10; + 
unsigned blocks, res, direct = DIRECT, i = DEPTH; + blocks = (size + sb->s_blocksize - 1) >> (BLOCK_SIZE_BITS + k); + res = blocks; + while (--i && blocks > direct) { + blocks -= direct; + blocks += sb->s_blocksize/sizeof(block_t) - 1; + blocks /= sb->s_blocksize/sizeof(block_t); + res += blocks; + direct = 1; + } + return res; +} diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c new file mode 100644 index 00000000..282e15ad --- /dev/null +++ b/fs/minix/itree_v1.c @@ -0,0 +1,67 @@ +#include +#include +#include "minix.h" + +enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ + +typedef u16 block_t; /* 16 bit, host order */ + +static inline unsigned long block_to_cpu(block_t n) +{ + return n; +} + +static inline block_t cpu_to_block(unsigned long n) +{ + return n; +} + +static inline block_t *i_data(struct inode *inode) +{ + return (block_t *)minix_i(inode)->u.i1_data; +} + +static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) +{ + int n = 0; + char b[BDEVNAME_SIZE]; + + if (block < 0) { + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", + block, bdevname(inode->i_sb->s_bdev, b)); + } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) { + if (printk_ratelimit()) + printk("MINIX-fs: block_to_path: " + "block %ld too big on dev %s\n", + block, bdevname(inode->i_sb->s_bdev, b)); + } else if (block < 7) { + offsets[n++] = block; + } else if ((block -= 7) < 512) { + offsets[n++] = 7; + offsets[n++] = block; + } else { + block -= 512; + offsets[n++] = 8; + offsets[n++] = block>>9; + offsets[n++] = block & 511; + } + return n; +} + +#include "itree_common.c" + +int V1_minix_get_block(struct inode * inode, long block, + struct buffer_head *bh_result, int create) +{ + return get_block(inode, block, bh_result, create); +} + +void V1_minix_truncate(struct inode * inode) +{ + truncate(inode); +} + +unsigned V1_minix_blocks(loff_t size, struct super_block *sb) +{ + return nblocks(size, sb); +} diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c new file mode 100644 index 00000000..13487ad1 --- /dev/null +++ b/fs/minix/itree_v2.c @@ -0,0 +1,75 @@ +#include +#include "minix.h" + +enum {DIRECT = 7, DEPTH = 4}; /* Have triple indirect */ + +typedef u32 block_t; /* 32 bit, host order */ + +static inline unsigned long block_to_cpu(block_t n) +{ + return n; +} + +static inline block_t cpu_to_block(unsigned long n) +{ + return n; +} + +static inline block_t *i_data(struct inode *inode) +{ + return (block_t *)minix_i(inode)->u.i2_data; +} + +#define DIRCOUNT 7 +#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2)) + +static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) +{ + int n = 0; + char b[BDEVNAME_SIZE]; + struct super_block *sb = inode->i_sb; + + if (block < 0) { + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", + block, bdevname(sb->s_bdev, b)); + } else if (block >= (minix_sb(inode->i_sb)->s_max_size/sb->s_blocksize)) { + if (printk_ratelimit()) + printk("MINIX-fs: block_to_path: " + "block %ld too big on dev %s\n", + block, bdevname(sb->s_bdev, b)); + } else if (block < DIRCOUNT) { + offsets[n++] = block; + } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT; + offsets[n++] = block; + } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT + 1; + offsets[n++] = block / INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); + } else { + block -= INDIRCOUNT(sb) * INDIRCOUNT(sb); + offsets[n++] = DIRCOUNT + 2; + offsets[n++] = 
(block / INDIRCOUNT(sb)) / INDIRCOUNT(sb); + offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); + } + return n; +} + +#include "itree_common.c" + +int V2_minix_get_block(struct inode * inode, long block, + struct buffer_head *bh_result, int create) +{ + return get_block(inode, block, bh_result, create); +} + +void V2_minix_truncate(struct inode * inode) +{ + truncate(inode); +} + +unsigned V2_minix_blocks(loff_t size, struct super_block *sb) +{ + return nblocks(size, sb); +} diff --git a/fs/minix/minix.h b/fs/minix/minix.h new file mode 100644 index 00000000..341e2122 --- /dev/null +++ b/fs/minix/minix.h @@ -0,0 +1,165 @@ +#ifndef FS_MINIX_H +#define FS_MINIX_H + +#include +#include +#include + +#define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version +#define MINIX_V1 0x0001 /* original minix fs */ +#define MINIX_V2 0x0002 /* minix V2 fs */ +#define MINIX_V3 0x0003 /* minix V3 fs */ + +/* + * minix fs inode data in memory + */ +struct minix_inode_info { + union { + __u16 i1_data[16]; + __u32 i2_data[16]; + } u; + struct inode vfs_inode; +}; + +/* + * minix super-block data in memory + */ +struct minix_sb_info { + unsigned long s_ninodes; + unsigned long s_nzones; + unsigned long s_imap_blocks; + unsigned long s_zmap_blocks; + unsigned long s_firstdatazone; + unsigned long s_log_zone_size; + unsigned long s_max_size; + int s_dirsize; + int s_namelen; + int s_link_max; + struct buffer_head ** s_imap; + struct buffer_head ** s_zmap; + struct buffer_head * s_sbh; + struct minix_super_block * s_ms; + unsigned short s_mount_state; + unsigned short s_version; +}; + +extern struct inode *minix_iget(struct super_block *, unsigned long); +extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); +extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); +extern struct inode * minix_new_inode(const struct inode *, int, int *); +extern void minix_free_inode(struct inode * inode); +extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); +extern int minix_new_block(struct inode * inode); +extern void minix_free_block(struct inode *inode, unsigned long block); +extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); +extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); + +extern void V1_minix_truncate(struct inode *); +extern void V2_minix_truncate(struct inode *); +extern void minix_truncate(struct inode *); +extern void minix_set_inode(struct inode *, dev_t); +extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int); +extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); +extern unsigned V1_minix_blocks(loff_t, struct super_block *); +extern unsigned V2_minix_blocks(loff_t, struct super_block *); + +extern struct minix_dir_entry *minix_find_entry(struct dentry*, struct page**); +extern int minix_add_link(struct dentry*, struct inode*); +extern int minix_delete_entry(struct minix_dir_entry*, struct page*); +extern int minix_make_empty(struct inode*, struct inode*); +extern int minix_empty_dir(struct inode*); +extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*); +extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); +extern ino_t minix_inode_by_name(struct dentry*); + +extern const struct inode_operations minix_file_inode_operations; 
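[Looking back at block_to_path() in itree_v2.c above: with a 1 KB block size, INDIRCOUNT(sb) is 1024 / 4 = 256, so a file block is reached through at most 7 direct slots, then a 256-entry single indirect, double indirect and triple indirect tree, capped by the superblock's s_max_size. A standalone sketch of the same mapping under that 1 KB assumption:]

#include <stdio.h>

#define DIRCOUNT        7
#define INDIR           256     /* 1 KB blocks, 32-bit zone pointers */

static int v2_block_to_path(long block, int offsets[4])
{
        int n = 0;

        if (block < DIRCOUNT) {
                offsets[n++] = block;
        } else if ((block -= DIRCOUNT) < INDIR) {
                offsets[n++] = DIRCOUNT;
                offsets[n++] = block;
        } else if ((block -= INDIR) < INDIR * INDIR) {
                offsets[n++] = DIRCOUNT + 1;
                offsets[n++] = block / INDIR;
                offsets[n++] = block % INDIR;
        } else {
                block -= INDIR * INDIR;
                offsets[n++] = DIRCOUNT + 2;
                offsets[n++] = (block / INDIR) / INDIR;
                offsets[n++] = (block / INDIR) % INDIR;
                offsets[n++] = block % INDIR;
        }
        return n;
}

int main(void)
{
        long blocks[] = { 3, 200, 70000 };
        int offsets[4], i, j, n;

        for (i = 0; i < 3; i++) {
                n = v2_block_to_path(blocks[i], offsets);
                printf("file block %ld ->", blocks[i]);
                for (j = 0; j < n; j++)
                        printf(" %d", offsets[j]);
                printf("\n");
        }
        return 0;
}

[The first offset indexes i_zone[] (slots 0-6 direct, 7 single, 8 double, 9 triple indirect); the remaining offsets index successive indirect blocks, which is exactly how get_branch() consumes the array.]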
+extern const struct inode_operations minix_dir_inode_operations; +extern const struct file_operations minix_file_operations; +extern const struct file_operations minix_dir_operations; + +static inline struct minix_sb_info *minix_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct minix_inode_info *minix_i(struct inode *inode) +{ + return list_entry(inode, struct minix_inode_info, vfs_inode); +} + +#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ + defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) + +#error Minix file system byte order broken + +#elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) + +/* + * big-endian 32 or 64 bit indexed bitmaps on big-endian system or + * little-endian bitmaps on little-endian system + */ + +#define minix_test_and_set_bit(nr, addr) \ + __test_and_set_bit((nr), (unsigned long *)(addr)) +#define minix_set_bit(nr, addr) \ + __set_bit((nr), (unsigned long *)(addr)) +#define minix_test_and_clear_bit(nr, addr) \ + __test_and_clear_bit((nr), (unsigned long *)(addr)) +#define minix_test_bit(nr, addr) \ + test_bit((nr), (unsigned long *)(addr)) +#define minix_find_first_zero_bit(addr, size) \ + find_first_zero_bit((unsigned long *)(addr), (size)) + +#elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) + +/* + * big-endian 16bit indexed bitmaps + */ + +static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) +{ + const unsigned short *p = vaddr, *addr = vaddr; + unsigned short num; + + if (!size) + return 0; + + size = (size >> 4) + ((size & 15) > 0); + while (*p++ == 0xffff) { + if (--size == 0) + return (p - addr) << 4; + } + + num = *--p; + return ((p - addr) << 4) + ffz(num); +} + +#define minix_test_and_set_bit(nr, addr) \ + __test_and_set_bit((nr) ^ 16, (unsigned long *)(addr)) +#define minix_set_bit(nr, addr) \ + __set_bit((nr) ^ 16, (unsigned long *)(addr)) +#define minix_test_and_clear_bit(nr, addr) \ + __test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr)) + +static inline int minix_test_bit(int nr, const void *vaddr) +{ + const unsigned short *p = vaddr; + return (p[nr >> 4] & (1U << (nr & 15))) != 0; +} + +#else + +/* + * little-endian bitmaps + */ + +#define minix_test_and_set_bit __test_and_set_bit_le +#define minix_set_bit __set_bit_le +#define minix_test_and_clear_bit __test_and_clear_bit_le +#define minix_test_bit test_bit_le +#define minix_find_first_zero_bit find_first_zero_bit_le + +#endif + +#endif /* FS_MINIX_H */ diff --git a/fs/minix/namei.c b/fs/minix/namei.c new file mode 100644 index 00000000..6e6777f1 --- /dev/null +++ b/fs/minix/namei.c @@ -0,0 +1,269 @@ +/* + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include "minix.h" + +static int add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = minix_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode * inode = NULL; + ino_t ino; + + if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) + return ERR_PTR(-ENAMETOOLONG); + + ino = minix_inode_by_name(dentry); + if (ino) { + inode = minix_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +{ + int error; + struct inode *inode; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + 
inode = minix_new_inode(dir, mode, &error); + + if (inode) { + minix_set_inode(inode, rdev); + mark_inode_dirty(inode); + error = add_nondir(dentry, inode); + } + return error; +} + +static int minix_create(struct inode * dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return minix_mknod(dir, dentry, mode, 0); +} + +static int minix_symlink(struct inode * dir, struct dentry *dentry, + const char * symname) +{ + int err = -ENAMETOOLONG; + int i = strlen(symname)+1; + struct inode * inode; + + if (i > dir->i_sb->s_blocksize) + goto out; + + inode = minix_new_inode(dir, S_IFLNK | 0777, &err); + if (!inode) + goto out; + + minix_set_inode(inode, 0); + err = page_symlink(inode, symname, i); + if (err) + goto out_fail; + + err = add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int minix_link(struct dentry * old_dentry, struct inode * dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= minix_sb(inode->i_sb)->s_link_max) + return -EMLINK; + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + return add_nondir(dentry, inode); +} + +static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) +{ + struct inode * inode; + int err = -EMLINK; + + if (dir->i_nlink >= minix_sb(dir->i_sb)->s_link_max) + goto out; + + inode_inc_link_count(dir); + + inode = minix_new_inode(dir, S_IFDIR | mode, &err); + if (!inode) + goto out_dir; + + minix_set_inode(inode, 0); + + inode_inc_link_count(inode); + + err = minix_make_empty(inode, dir); + if (err) + goto out_fail; + + err = minix_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int minix_unlink(struct inode * dir, struct dentry *dentry) +{ + int err = -ENOENT; + struct inode * inode = dentry->d_inode; + struct page * page; + struct minix_dir_entry * de; + + de = minix_find_entry(dentry, &page); + if (!de) + goto end_unlink; + + err = minix_delete_entry(de, page); + if (err) + goto end_unlink; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); +end_unlink: + return err; +} + +static int minix_rmdir(struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (minix_empty_dir(inode)) { + err = minix_unlink(dir, dentry); + if (!err) { + inode_dec_link_count(dir); + inode_dec_link_count(inode); + } + } + return err; +} + +static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir, struct dentry *new_dentry) +{ + struct minix_sb_info * info = minix_sb(old_dir->i_sb); + struct inode * old_inode = old_dentry->d_inode; + struct inode * new_inode = new_dentry->d_inode; + struct page * dir_page = NULL; + struct minix_dir_entry * dir_de = NULL; + struct page * old_page; + struct minix_dir_entry * old_de; + int err = -ENOENT; + + old_de = minix_find_entry(old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = minix_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page * new_page; + struct minix_dir_entry * new_de; + + err = -ENOTEMPTY; + if (dir_de && !minix_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = minix_find_entry(new_dentry, 
&new_page); + if (!new_de) + goto out_dir; + minix_set_link(new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= info->s_link_max) + goto out_dir; + } + err = minix_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + minix_delete_entry(old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + minix_set_link(dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + return 0; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations minix_dir_inode_operations = { + .create = minix_create, + .lookup = minix_lookup, + .link = minix_link, + .unlink = minix_unlink, + .symlink = minix_symlink, + .mkdir = minix_mkdir, + .rmdir = minix_rmdir, + .mknod = minix_mknod, + .rename = minix_rename, + .getattr = minix_getattr, +}; diff --git a/fs/mpage.c b/fs/mpage.c new file mode 100644 index 00000000..fdfae9fa --- /dev/null +++ b/fs/mpage.c @@ -0,0 +1,718 @@ +/* + * fs/mpage.c + * + * Copyright (C) 2002, Linus Torvalds. + * + * Contains functions related to preparing and submitting BIOs which contain + * multiple pagecache pages. + * + * 15May2002 Andrew Morton + * Initial version + * 27Jun2002 axboe@suse.de + * use bio_add_page() to build bio's just the right size + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * I/O completion handler for multipage BIOs. + * + * The mpage code never puts partial pages into a BIO (except for end-of-file). + * If a page does not map to a contiguous run of blocks then it simply falls + * back to block_read_full_page(). + * + * Why is this? If a page's completion depends on a number of different BIOs + * which can complete in any order (or at the same time) then determining the + * status of that page is hard. See end_buffer_async_read() for the details. + * There is no point in duplicating all that complexity. 
+ */ +static void mpage_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (bio_data_dir(bio) == READ) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* bio_data_dir(bio) == WRITE */ + if (!uptodate) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + end_page_writeback(page); + } + } while (bvec >= bio->bi_io_vec); + bio_put(bio); +} + +static struct bio *mpage_bio_submit(int rw, struct bio *bio) +{ + bio->bi_end_io = mpage_end_io; + submit_bio(rw, bio); + return NULL; +} + +static struct bio * +mpage_alloc(struct block_device *bdev, + sector_t first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +/* + * support function for mpage_readpages. The fs supplied get_block might + * return an up to date buffer. This is used to map that buffer into + * the page, which allows readpage to avoid triggering a duplicate call + * to get_block. + * + * The idea is to avoid adding buffers to pages that don't already have + * them. So when the buffer is up to date and the page size == block size, + * this marks the page up to date instead of adding new buffers. + */ +static void +map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *page_bh, *head; + int block = 0; + + if (!page_has_buffers(page)) { + /* + * don't make any buffers if there is only one buffer on + * the page and the page just needs to be set up to date + */ + if (inode->i_blkbits == PAGE_CACHE_SHIFT && + buffer_uptodate(bh)) { + SetPageUptodate(page); + return; + } + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + } + head = page_buffers(page); + page_bh = head; + do { + if (block == page_block) { + page_bh->b_state = bh->b_state; + page_bh->b_bdev = bh->b_bdev; + page_bh->b_blocknr = bh->b_blocknr; + break; + } + page_bh = page_bh->b_this_page; + block++; + } while (page_bh != head); +} + +/* + * This is the worker routine which does all the work of mapping the disk + * blocks and constructs largest possible bios, submits them for IO if the + * blocks are not contiguous on the disk. + * + * We pass a buffer_head back and forth and use its buffer_mapped() flag to + * represent the validity of its disk mapping and to decide when to do the next + * get_block() call. 
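[For orientation: a filesystem uses this machinery by handing its get_block to the mpage_readpage()/mpage_readpages() helpers exported further down. A hedged wiring sketch for a hypothetical "myfs"; myfs_get_block is assumed to be that filesystem's get_block_t and is not defined in this patch:]

static int myfs_readpage(struct file *file, struct page *page)
{
        return mpage_readpage(page, myfs_get_block);
}

static int myfs_readpages(struct file *file, struct address_space *mapping,
                          struct list_head *pages, unsigned nr_pages)
{
        return mpage_readpages(mapping, pages, nr_pages, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
        .readpage       = myfs_readpage,
        .readpages      = myfs_readpages,
        /* write-side hooks omitted in this sketch */
};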
+ */ +static struct bio * +do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, + sector_t *last_block_in_bio, struct buffer_head *map_bh, + unsigned long *first_logical_block, get_block_t get_block) +{ + struct inode *inode = page->mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + unsigned first_hole = blocks_per_page; + struct block_device *bdev = NULL; + int length; + int fully_mapped = 1; + unsigned nblocks; + unsigned relative_block; + + if (page_has_buffers(page)) + goto confused; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = block_in_file + nr_pages * blocks_per_page; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + page_block = 0; + + /* + * Map blocks using the result from the previous get_blocks call first. + */ + nblocks = map_bh->b_size >> blkbits; + if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && + block_in_file < (*first_logical_block + nblocks)) { + unsigned map_offset = block_in_file - *first_logical_block; + unsigned last = nblocks - map_offset; + + for (relative_block = 0; ; relative_block++) { + if (relative_block == last) { + clear_buffer_mapped(map_bh); + break; + } + if (page_block == blocks_per_page) + break; + blocks[page_block] = map_bh->b_blocknr + map_offset + + relative_block; + page_block++; + block_in_file++; + } + bdev = map_bh->b_bdev; + } + + /* + * Then do more get_blocks calls until we are done with this page. + */ + map_bh->b_page = page; + while (page_block < blocks_per_page) { + map_bh->b_state = 0; + map_bh->b_size = 0; + + if (block_in_file < last_block) { + map_bh->b_size = (last_block-block_in_file) << blkbits; + if (get_block(inode, block_in_file, map_bh, 0)) + goto confused; + *first_logical_block = block_in_file; + } + + if (!buffer_mapped(map_bh)) { + fully_mapped = 0; + if (first_hole == blocks_per_page) + first_hole = page_block; + page_block++; + block_in_file++; + continue; + } + + /* some filesystems will copy data into the page during + * the get_block call, in which case we don't want to + * read it again. map_buffer_to_page copies the data + * we just collected from get_block into the page's buffers + * so readpage doesn't have to repeat the get_block call + */ + if (buffer_uptodate(map_bh)) { + map_buffer_to_page(page, map_bh, page_block); + goto confused; + } + + if (first_hole != blocks_per_page) + goto confused; /* hole -> non-hole */ + + /* Contiguous blocks? 
*/ + if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) + goto confused; + nblocks = map_bh->b_size >> blkbits; + for (relative_block = 0; ; relative_block++) { + if (relative_block == nblocks) { + clear_buffer_mapped(map_bh); + break; + } else if (page_block == blocks_per_page) + break; + blocks[page_block] = map_bh->b_blocknr+relative_block; + page_block++; + block_in_file++; + } + bdev = map_bh->b_bdev; + } + + if (first_hole != blocks_per_page) { + zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); + if (first_hole == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } + } else if (fully_mapped) { + SetPageMappedToDisk(page); + } + + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && + cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + + /* + * This page will go to BIO. Do we need to send this BIO off first? + */ + if (bio && (*last_block_in_bio != blocks[0] - 1)) + bio = mpage_bio_submit(READ, bio); + +alloc_new: + if (bio == NULL) { + bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), + min_t(int, nr_pages, bio_get_nr_vecs(bdev)), + GFP_KERNEL); + if (bio == NULL) + goto confused; + } + + length = first_hole << blkbits; + if (bio_add_page(bio, page, length, 0) < length) { + bio = mpage_bio_submit(READ, bio); + goto alloc_new; + } + + relative_block = block_in_file - *first_logical_block; + nblocks = map_bh->b_size >> blkbits; + if ((buffer_boundary(map_bh) && relative_block == nblocks) || + (first_hole != blocks_per_page)) + bio = mpage_bio_submit(READ, bio); + else + *last_block_in_bio = blocks[blocks_per_page - 1]; +out: + return bio; + +confused: + if (bio) + bio = mpage_bio_submit(READ, bio); + if (!PageUptodate(page)) + block_read_full_page(page, get_block); + else + unlock_page(page); + goto out; +} + +/** + * mpage_readpages - populate an address space with some pages & start reads against them + * @mapping: the address_space + * @pages: The address of a list_head which contains the target pages. These + * pages have their ->index populated and are otherwise uninitialised. + * The page at @pages->prev has the lowest file offset, and reads should be + * issued in @pages->prev to @pages->next order. + * @nr_pages: The number of pages at *@pages + * @get_block: The filesystem's block mapper function. + * + * This function walks the pages and the blocks within each page, building and + * emitting large BIOs. + * + * If anything unusual happens, such as: + * + * - encountering a page which has buffers + * - encountering a page which has a non-hole after a hole + * - encountering a page with non-contiguous blocks + * + * then this code just gives up and calls the buffer_head-based read function. + * It does handle a page which has holes at the end - that is a common case: + * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. + * + * BH_Boundary explanation: + * + * There is a problem. The mpage read code assembles several pages, gets all + * their disk mappings, and then submits them all. That's fine, but obtaining + * the disk mappings may require I/O. Reads of indirect blocks, for example. + * + * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be + * submitted in the following order: + * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 + * + * because the indirect block has to be read to get the mappings of blocks + * 13,14,15,16. Obviously, this impacts performance. 
+ * + * So what we do it to allow the filesystem's get_block() function to set + * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block + * after this one will require I/O against a block which is probably close to + * this one. So you should push what I/O you have currently accumulated. + * + * This all causes the disk requests to be issued in the correct order. + */ +int +mpage_readpages(struct address_space *mapping, struct list_head *pages, + unsigned nr_pages, get_block_t get_block) +{ + struct bio *bio = NULL; + unsigned page_idx; + sector_t last_block_in_bio = 0; + struct buffer_head map_bh; + unsigned long first_logical_block = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + map_bh.b_state = 0; + map_bh.b_size = 0; + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + if (!add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) { + bio = do_mpage_readpage(bio, page, + nr_pages - page_idx, + &last_block_in_bio, &map_bh, + &first_logical_block, + get_block); + } + page_cache_release(page); + } + BUG_ON(!list_empty(pages)); + if (bio) + mpage_bio_submit(READ, bio); + blk_finish_plug(&plug); + return 0; +} +EXPORT_SYMBOL(mpage_readpages); + +/* + * This isn't called much at all + */ +int mpage_readpage(struct page *page, get_block_t get_block) +{ + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + struct buffer_head map_bh; + unsigned long first_logical_block = 0; + + map_bh.b_state = 0; + map_bh.b_size = 0; + bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, + &map_bh, &first_logical_block, get_block); + if (bio) + mpage_bio_submit(READ, bio); + return 0; +} +EXPORT_SYMBOL(mpage_readpage); + +/* + * Writing is not so simple. + * + * If the page has buffers then they will be used for obtaining the disk + * mapping. We only support pages which are fully mapped-and-dirty, with a + * special case for pages which are unmapped at the end: end-of-file. + * + * If the page has no buffers (preferred) then the page is mapped here. + * + * If all blocks are found to be contiguous then the page can go into the + * BIO. Otherwise fall back to the mapping's writepage(). + * + * FIXME: This code wants an estimate of how many pages are still to be + * written, so it can intelligently allocate a suitably-sized BIO. For now, + * just allocate full-size (16-page) BIOs. 
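[Tying the BH_Boundary discussion above to a get_block: the filesystem marks the buffer for the last block before a metadata read, so the BIO accumulated so far gets submitted. A toy sketch only, for a made-up layout in which one indirect block follows every 256 data blocks; set_buffer_boundary() is the usual buffer-flag helper and is an assumption of this sketch rather than something defined in this patch:]

#define TOY_DATA_START          1024    /* first data block of the toy layout */
#define TOY_PTRS_PER_BLOCK      256

static int toy_get_block(struct inode *inode, sector_t block,
                         struct buffer_head *bh_result, int create)
{
        map_bh(bh_result, inode->i_sb, TOY_DATA_START + block);
        if ((block + 1) % TOY_PTRS_PER_BLOCK == 0)
                /* the next mapping would need metadata I/O: push what we have */
                set_buffer_boundary(bh_result);
        return 0;
}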
+ */ + +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + +static int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct mpage_data *mpd = data; + struct bio *bio = mpd->bio; + struct address_space *mapping = page->mapping; + struct inode *inode = page->mapping->host; + const unsigned blkbits = inode->i_blkbits; + unsigned long end_index; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + sector_t last_block; + sector_t block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + unsigned first_unmapped = blocks_per_page; + struct block_device *bdev = NULL; + int boundary = 0; + sector_t boundary_block = 0; + struct block_device *boundary_bdev = NULL; + int length; + struct buffer_head map_bh; + loff_t i_size = i_size_read(inode); + int ret = 0; + + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + /* If they're all mapped and dirty, do it */ + page_block = 0; + do { + BUG_ON(buffer_locked(bh)); + if (!buffer_mapped(bh)) { + /* + * unmapped dirty buffers are created by + * __set_page_dirty_buffers -> mmapped data + */ + if (buffer_dirty(bh)) + goto confused; + if (first_unmapped == blocks_per_page) + first_unmapped = page_block; + continue; + } + + if (first_unmapped != blocks_per_page) + goto confused; /* hole -> non-hole */ + + if (!buffer_dirty(bh) || !buffer_uptodate(bh)) + goto confused; + if (page_block) { + if (bh->b_blocknr != blocks[page_block-1] + 1) + goto confused; + } + blocks[page_block++] = bh->b_blocknr; + boundary = buffer_boundary(bh); + if (boundary) { + boundary_block = bh->b_blocknr; + boundary_bdev = bh->b_bdev; + } + bdev = bh->b_bdev; + } while ((bh = bh->b_this_page) != head); + + if (first_unmapped) + goto page_is_mapped; + + /* + * Page has buffers, but they are all unmapped. The page was + * created by pagein or read over a hole which was handled by + * block_read_full_page(). If this address_space is also + * using mpage_readpages then this can rarely happen. + */ + goto confused; + } + + /* + * The page has no buffers: map it to disk + */ + BUG_ON(!PageUptodate(page)); + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = (i_size - 1) >> blkbits; + map_bh.b_page = page; + for (page_block = 0; page_block < blocks_per_page; ) { + + map_bh.b_state = 0; + map_bh.b_size = 1 << blkbits; + if (mpd->get_block(inode, block_in_file, &map_bh, 1)) + goto confused; + if (buffer_new(&map_bh)) + unmap_underlying_metadata(map_bh.b_bdev, + map_bh.b_blocknr); + if (buffer_boundary(&map_bh)) { + boundary_block = map_bh.b_blocknr; + boundary_bdev = map_bh.b_bdev; + } + if (page_block) { + if (map_bh.b_blocknr != blocks[page_block-1] + 1) + goto confused; + } + blocks[page_block++] = map_bh.b_blocknr; + boundary = buffer_boundary(&map_bh); + bdev = map_bh.b_bdev; + if (block_in_file == last_block) + break; + block_in_file++; + } + BUG_ON(page_block == 0); + + first_unmapped = page_block; + +page_is_mapped: + end_index = i_size >> PAGE_CACHE_SHIFT; + if (page->index >= end_index) { + /* + * The page straddles i_size. It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining memory + * is zeroed when mapped, and writes to that region are not + * written out to the file." 
+ */ + unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); + + if (page->index > end_index || !offset) + goto confused; + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + } + + /* + * This page will go to BIO. Do we need to send this BIO off first? + */ + if (bio && mpd->last_block_in_bio != blocks[0] - 1) + bio = mpage_bio_submit(WRITE, bio); + +alloc_new: + if (bio == NULL) { + bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), + bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); + if (bio == NULL) + goto confused; + } + + /* + * Must try to add the page before marking the buffer clean or + * the confused fail path above (OOM) will be very confused when + * it finds all bh marked clean (i.e. it will not write anything) + */ + length = first_unmapped << blkbits; + if (bio_add_page(bio, page, length, 0) < length) { + bio = mpage_bio_submit(WRITE, bio); + goto alloc_new; + } + + /* + * OK, we have our BIO, so we can now mark the buffers clean. Make + * sure to only clean buffers which we know we'll be writing. + */ + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + unsigned buffer_counter = 0; + + do { + if (buffer_counter++ == first_unmapped) + break; + clear_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + + /* + * we cannot drop the bh if the page is not uptodate + * or a concurrent readpage would fail to serialize with the bh + * and it would read from disk before we reach the platter. + */ + if (buffer_heads_over_limit && PageUptodate(page)) + try_to_free_buffers(page); + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + if (boundary || (first_unmapped != blocks_per_page)) { + bio = mpage_bio_submit(WRITE, bio); + if (boundary_block) { + write_boundary_block(boundary_bdev, + boundary_block, 1 << blkbits); + } + } else { + mpd->last_block_in_bio = blocks[blocks_per_page - 1]; + } + goto out; + +confused: + if (bio) + bio = mpage_bio_submit(WRITE, bio); + + if (mpd->use_writepage) { + ret = mapping->a_ops->writepage(page, wbc); + } else { + ret = -EAGAIN; + goto out; + } + /* + * The caller has a ref on the inode, so *mapping is stable + */ + mapping_set_error(mapping, ret); +out: + mpd->bio = bio; + return ret; +} + +/** + * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function. + * If this is NULL then use a_ops->writepage. Otherwise, go + * direct-to-BIO. + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * If a page is already under I/O, generic_writepages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. 
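Continuing the hypothetical "myfs" sketch from the read side, the corresponding write-side wiring might look like the following; again, only the mpage_writepage()/mpage_writepages() signatures (defined just below) come from this file.

#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/writeback.h>

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
        return mpage_writepage(page, myfs_get_block, wbc);
}

static int myfs_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        /* A non-NULL get_block goes direct-to-BIO; passing NULL would make
         * mpage_writepages() fall back to generic_writepages(). */
        return mpage_writepages(mapping, wbc, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
        .readpage       = myfs_readpage,
        .readpages      = myfs_readpages,
        .writepage      = myfs_writepage,
        .writepages     = myfs_writepages,
};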
+ */ +int +mpage_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block) +{ + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + + if (!get_block) + ret = generic_writepages(mapping, wbc); + else { + struct mpage_data mpd = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = get_block, + .use_writepage = 1, + }; + + ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); + if (mpd.bio) + mpage_bio_submit(WRITE, mpd.bio); + } + blk_finish_plug(&plug); + return ret; +} +EXPORT_SYMBOL(mpage_writepages); + +int mpage_writepage(struct page *page, get_block_t get_block, + struct writeback_control *wbc) +{ + struct mpage_data mpd = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = get_block, + .use_writepage = 0, + }; + int ret = __mpage_writepage(page, wbc, &mpd); + if (mpd.bio) + mpage_bio_submit(WRITE, mpd.bio); + return ret; +} +EXPORT_SYMBOL(mpage_writepage); diff --git a/fs/namei.c b/fs/namei.c new file mode 100644 index 00000000..16bda6cd --- /dev/null +++ b/fs/namei.c @@ -0,0 +1,3402 @@ +/* + * linux/fs/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * Some corrections by tytso. + */ + +/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname + * lookup logic. + */ +/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* [Feb-1997 T. Schoebel-Theuer] + * Fundamental changes in the pathname lookup mechanisms (namei) + * were necessary because of omirr. The reason is that omirr needs + * to know the _real_ pathname, not the user-supplied one, in case + * of symlinks (and also when transname replacements occur). + * + * The new code replaces the old recursive symlink resolution with + * an iterative one (in case of non-nested symlink chains). It does + * this with calls to _follow_link(). + * As a side effect, dir_namei(), _namei() and follow_link() are now + * replaced with a single function lookup_dentry() that can handle all + * the special cases of the former code. + * + * With the new dcache, the pathname is stored at each inode, at least as + * long as the refcount of the inode is positive. As a side effect, the + * size of the dcache depends on the inode cache and thus is dynamic. + * + * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink + * resolution to correspond with current state of the code. + * + * Note that the symlink resolution is not *completely* iterative. + * There is still a significant amount of tail- and mid- recursion in + * the algorithm. Also, note that _readlink() is not used in + * lookup_dentry(): lookup_dentry() on the result of _readlink() + * may return different results than _follow_link(). Many virtual + * filesystems (including /proc) exhibit this behavior. + */ + +/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: + * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL + * and the name already exists in form of a symlink, try to create the new + * name indicated by the symlink. The old code always complained that the + * name already exists, due to not following the symlink even if its target + * is nonexistent. The new semantics affects also mknod() and link() when + * the name is a symlink pointing to a non-existent name. 
+ * + * I don't know which semantics is the right one, since I have no access + * to standards. But I found by trial that HP-UX 9.0 has the full "new" + * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the + * "old" one. Personally, I think the new semantics is much more logical. + * Note that "ln old new" where "new" is a symlink pointing to a non-existing + * file does succeed in both HP-UX and SunOs, but not in Solaris + * and in the old Linux semantics. + */ + +/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink + * semantics. See the comments in "open_namei" and "do_link" below. + * + * [10-Sep-98 Alan Modra] Another symlink change. + */ + +/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: + * inside the path - always follow. + * in the last component in creation/removal/renaming - never follow. + * if LOOKUP_FOLLOW passed - follow. + * if the pathname has trailing slashes - follow. + * otherwise - don't follow. + * (applied in that order). + * + * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT + * restored for 2.4. This is the last surviving part of old 4.2BSD bug. + * During the 2.4 we need to fix the userland stuff depending on it - + * hopefully we will be able to get rid of that wart in 2.5. So far only + * XEmacs seems to be relying on it... + */ +/* + * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) + * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives + * any extra contention... + */ + +/* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. + * + * POSIX.1 2.4: an empty pathname is invalid (ENOENT). + * PATH_MAX includes the nul terminator --RR. 
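To make the last-component symlink rules above concrete, here is a small user-space illustration in plain C, meant to be run in an empty scratch directory; the file names are made up and only standard POSIX calls are used.

/* Trailing-symlink behaviour: creation with O_EXCL and removal never follow,
 * plain O_CREAT does. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int fd;

        if (symlink("no-such-file", "dangling") < 0)
                return 1;

        /* O_CREAT|O_EXCL never follows the trailing symlink: the name itself
         * already exists, so this fails with EEXIST. */
        fd = open("dangling", O_CREAT | O_EXCL | O_WRONLY, 0644);
        printf("O_CREAT|O_EXCL: %s\n", fd < 0 ? strerror(errno) : "ok");

        /* Plain O_CREAT follows the link and creates its target instead. */
        fd = open("dangling", O_CREAT | O_WRONLY, 0644);
        printf("O_CREAT: %s\n", fd < 0 ? strerror(errno) : "created target");
        if (fd >= 0)
                close(fd);

        /* Removal never follows: this deletes the symlink, not the target. */
        unlink("dangling");
        printf("target survives: %d\n", access("no-such-file", F_OK) == 0);
        unlink("no-such-file");
        return 0;
}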
+ */ +static int do_getname(const char __user *filename, char *page) +{ + int retval; + unsigned long len = PATH_MAX; + + if (!segment_eq(get_fs(), KERNEL_DS)) { + if ((unsigned long) filename >= TASK_SIZE) + return -EFAULT; + if (TASK_SIZE - (unsigned long) filename < PATH_MAX) + len = TASK_SIZE - (unsigned long) filename; + } + + retval = strncpy_from_user(page, filename, len); + if (retval > 0) { + if (retval < len) + return 0; + return -ENAMETOOLONG; + } else if (!retval) + retval = -ENOENT; + return retval; +} + +static char *getname_flags(const char __user *filename, int flags, int *empty) +{ + char *tmp, *result; + + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { + int retval = do_getname(filename, tmp); + + result = tmp; + if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(tmp); + result = ERR_PTR(retval); + } + } + } + audit_getname(result); + return result; +} + +char *getname(const char __user * filename) +{ + return getname_flags(filename, 0, 0); +} + +#ifdef CONFIG_AUDITSYSCALL +void putname(const char *name) +{ + if (unlikely(!audit_dummy_context())) + audit_putname(name); + else + __putname(name); +} +EXPORT_SYMBOL(putname); +#endif + +/* + * This does basic POSIX ACL permission checking + */ +static int acl_permission_check(struct inode *inode, int mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) +{ + unsigned int mode = inode->i_mode; + + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + + if (current_user_ns() != inode_userns(inode)) + goto other_perms; + + if (current_fsuid() == inode->i_uid) + mode >>= 6; + else { + if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { + int error = check_acl(inode, mask, flags); + if (error != -EAGAIN) + return error; + } + + if (in_group_p(inode->i_gid)) + mode >>= 3; + } + +other_perms: + /* + * If the DACs are ok we don't need any capability check. + */ + if ((mask & ~mode) == 0) + return 0; + return -EACCES; +} + +/** + * generic_permission - check for access rights on a Posix-like filesystem + * @inode: inode to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @check_acl: optional callback to check for Posix ACLs + * @flags: IPERM_FLAG_ flags. + * + * Used to check for read/write/execute permissions on a file. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things. + * + * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk + * request cannot be satisfied (eg. requires blocking or too much complexity). + * It would then be called again in ref-walk mode. + */ +int generic_permission(struct inode *inode, int mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) +{ + int ret; + + /* + * Do the basic POSIX ACL permission checks. + */ + ret = acl_permission_check(inode, mask, flags, check_acl); + if (ret != -EACCES) + return ret; + + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable for all directories and + * for non-directories that have least one exec bit set. + */ + if (!(mask & MAY_EXEC) || execute_ok(inode)) + if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) + return 0; + + /* + * Searching includes executable on directories, else just read. 
+ */ + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) + if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) + return 0; + + return -EACCES; +} + +/** + * inode_permission - check for access rights to a given inode + * @inode: inode to check permission on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Used to check for read/write/execute permissions on an inode. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things. + */ +int inode_permission(struct inode *inode, int mask) +{ + int retval; + + if (mask & MAY_WRITE) { + umode_t mode = inode->i_mode; + + /* + * Nobody gets write access to a read-only fs. + */ + if (IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; + + /* + * Nobody gets write access to an immutable file. + */ + if (IS_IMMUTABLE(inode)) + return -EACCES; + } + + if (inode->i_op->permission) + retval = inode->i_op->permission(inode, mask, 0); + else + retval = generic_permission(inode, mask, 0, + inode->i_op->check_acl); + + if (retval) + return retval; + + retval = devcgroup_inode_permission(inode, mask); + if (retval) + return retval; + + return security_inode_permission(inode, mask); +} + +/** + * file_permission - check for additional access rights to a given file + * @file: file to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Used to check for read/write/execute permissions on an already opened + * file. + * + * Note: + * Do not use this function in new code. All access checks should + * be done using inode_permission(). + */ +int file_permission(struct file *file, int mask) +{ + return inode_permission(file->f_path.dentry->d_inode, mask); +} + +/* + * get_write_access() gets write permission for a file. + * put_write_access() releases this write permission. + * This is used for regular files. + * We cannot support write (and maybe mmap read-write shared) accesses and + * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode + * can have the following values: + * 0: no writers, no VM_DENYWRITE mappings + * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist + * > 0: (i_writecount) users are writing to the file. + * + * Normally we operate on that counter with atomic_{inc,dec} and it's safe + * except for the cases where we don't hold i_writecount yet. Then we need to + * use {get,deny}_write_access() - these functions check the sign and refuse + * to do the change if sign is wrong. Exclusion between them is provided by + * the inode->i_lock spinlock. + */ + +int get_write_access(struct inode * inode) +{ + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_writecount) < 0) { + spin_unlock(&inode->i_lock); + return -ETXTBSY; + } + atomic_inc(&inode->i_writecount); + spin_unlock(&inode->i_lock); + + return 0; +} + +int deny_write_access(struct file * file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_writecount) > 0) { + spin_unlock(&inode->i_lock); + return -ETXTBSY; + } + atomic_dec(&inode->i_writecount); + spin_unlock(&inode->i_lock); + + return 0; +} + +/** + * path_get - get a reference to a path + * @path: path to get the reference to + * + * Given a path increment the reference count to the dentry and the vfsmount. 
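The i_writecount exclusion described above is what user space observes as ETXTBSY. A minimal illustration, assuming /proc is mounted so that /proc/self/exe names the binary currently being executed:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* The running executable holds i_writecount negative (via
         * deny_write_access()), so opening it for writing must fail. */
        int fd = open("/proc/self/exe", O_WRONLY);

        if (fd < 0)
                printf("open for write: %s\n", strerror(errno)); /* ETXTBSY */
        else
                close(fd);
        return 0;
}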
+ */ +void path_get(struct path *path) +{ + mntget(path->mnt); + dget(path->dentry); +} +EXPORT_SYMBOL(path_get); + +/** + * path_put - put a reference to a path + * @path: path to put the reference to + * + * Given a path decrement the reference count to the dentry and the vfsmount. + */ +void path_put(struct path *path) +{ + dput(path->dentry); + mntput(path->mnt); +} +EXPORT_SYMBOL(path_put); + +/* + * Path walking has 2 modes, rcu-walk and ref-walk (see + * Documentation/filesystems/path-lookup.txt). In situations when we can't + * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab + * normal reference counts on dentries and vfsmounts to transition to rcu-walk + * mode. Refcounts are grabbed at the last known good point before rcu-walk + * got stuck, so ref-walk may continue from there. If this is not successful + * (eg. a seqcount has changed), then failure is returned and it's up to caller + * to restart the path walk from the beginning in ref-walk mode. + */ + +/** + * unlazy_walk - try to switch to ref-walk mode. + * @nd: nameidata pathwalk data + * @dentry: child of nd->path.dentry or NULL + * Returns: 0 on success, -ECHILD on failure + * + * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry + * for ref-walk mode. @dentry must be a path found by a do_lookup call on + * @nd or NULL. Must be called from rcu-walk context. + */ +static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) +{ + struct fs_struct *fs = current->fs; + struct dentry *parent = nd->path.dentry; + int want_root = 0; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&parent->d_lock); + if (!dentry) { + if (!__d_rcu_to_refcount(parent, nd->seq)) + goto err_parent; + BUG_ON(nd->inode != parent->d_inode); + } else { + if (dentry->d_parent != parent) + goto err_parent; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err_child; + /* + * If the sequence check on the child dentry passed, then + * the child has not been removed from its parent. This + * means the parent dentry must be valid and able to take + * a reference at this point. + */ + BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); + BUG_ON(!parent->d_count); + parent->d_count++; + spin_unlock(&dentry->d_lock); + } + spin_unlock(&parent->d_lock); + if (want_root) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; + +err_child: + spin_unlock(&dentry->d_lock); +err_parent: + spin_unlock(&parent->d_lock); +err_root: + if (want_root) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/** + * release_open_intent - free up open intent resources + * @nd: pointer to nameidata + */ +void release_open_intent(struct nameidata *nd) +{ + struct file *file = nd->intent.open.file; + + if (file && !IS_ERR(file)) { + if (file->f_path.dentry == NULL) + put_filp(file); + else + fput(file); + } +} + +static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + return dentry->d_op->d_revalidate(dentry, nd); +} + +static struct dentry * +do_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + int status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + /* + * The dentry failed validation. 
+ * If d_revalidate returned 0 attempt to invalidate + * the dentry otherwise d_revalidate is asking us + * to return a fail status. + */ + if (status < 0) { + dput(dentry); + dentry = ERR_PTR(status); + } else if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + } + } + return dentry; +} + +/** + * complete_walk - successful completion of path walk + * @nd: pointer nameidata + * + * If we had been in RCU mode, drop out of it and legitimize nd->path. + * Revalidate the final result, unless we'd already done that during + * the path walk or the filesystem doesn't ask for it. Return 0 on + * success, -error on failure. In case of failure caller does not + * need to drop nd->path. + */ +static int complete_walk(struct nameidata *nd) +{ + struct dentry *dentry = nd->path.dentry; + int status; + + if (nd->flags & LOOKUP_RCU) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + spin_lock(&dentry->d_lock); + if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + mntget(nd->path.mnt); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } + + if (likely(!(nd->flags & LOOKUP_JUMPED))) + return 0; + + if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) + return 0; + + if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) + return 0; + + /* Note: we do not d_invalidate() */ + status = d_revalidate(dentry, nd); + if (status > 0) + return 0; + + if (!status) + status = -ESTALE; + + path_put(&nd->path); + return status; +} + +/* + * Short-cut version of permission(), for calling on directories + * during pathname resolution. Combines parts of permission() + * and generic_permission(), and tests ONLY for MAY_EXEC permission. + * + * If appropriate, check DAC only. If not appropriate, or + * short-cut DAC fails, then call ->permission() to do more + * complete permission check. 
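The MAY_EXEC (search) check that exec_permission() below applies to every directory crossed during a walk is easy to observe from user space. A sketch with made-up paths; it must run as an unprivileged user, since CAP_DAC_READ_SEARCH would otherwise mask the failure:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        struct stat st;
        int fd;

        mkdir("outer", 0700);
        mkdir("outer/inner", 0755);
        fd = creat("outer/inner/file", 0644);
        if (fd >= 0)
                close(fd);

        chmod("outer", 0600);           /* readable, but no search bit */
        if (stat("outer/inner/file", &st) < 0)
                printf("lookup: %s\n", strerror(errno));        /* EACCES */

        chmod("outer", 0700);           /* restore so cleanup works */
        unlink("outer/inner/file");
        rmdir("outer/inner");
        rmdir("outer");
        return 0;
}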
+ */ +static inline int exec_permission(struct inode *inode, unsigned int flags) +{ + int ret; + struct user_namespace *ns = inode_userns(inode); + + if (inode->i_op->permission) { + ret = inode->i_op->permission(inode, MAY_EXEC, flags); + } else { + ret = acl_permission_check(inode, MAY_EXEC, flags, + inode->i_op->check_acl); + } + if (likely(!ret)) + goto ok; + if (ret == -ECHILD) + return ret; + + if (ns_capable(ns, CAP_DAC_OVERRIDE) || + ns_capable(ns, CAP_DAC_READ_SEARCH)) + goto ok; + + return ret; +ok: + return security_inode_exec_permission(inode, flags); +} + +static __always_inline void set_root(struct nameidata *nd) +{ + if (!nd->root.mnt) + get_fs_root(current->fs, &nd->root); +} + +static int link_path_walk(const char *, struct nameidata *); + +static __always_inline void set_root_rcu(struct nameidata *nd) +{ + if (!nd->root.mnt) { + struct fs_struct *fs = current->fs; + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } +} + +static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) +{ + int ret; + + if (IS_ERR(link)) + goto fail; + + if (*link == '/') { + set_root(nd); + path_put(&nd->path); + nd->path = nd->root; + path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; + } + nd->inode = nd->path.dentry->d_inode; + + ret = link_path_walk(link, nd); + return ret; +fail: + path_put(&nd->path); + return PTR_ERR(link); +} + +static void path_put_conditional(struct path *path, struct nameidata *nd) +{ + dput(path->dentry); + if (path->mnt != nd->path.mnt) + mntput(path->mnt); +} + +static inline void path_to_nameidata(const struct path *path, + struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + dput(nd->path.dentry); + if (nd->path.mnt != path->mnt) + mntput(nd->path.mnt); + } + nd->path.mnt = path->mnt; + nd->path.dentry = path->dentry; +} + +static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +{ + struct inode *inode = link->dentry->d_inode; + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(link->dentry, nd, cookie); + path_put(link); +} + +static __always_inline int +follow_link(struct path *link, struct nameidata *nd, void **p) +{ + int error; + struct dentry *dentry = link->dentry; + + BUG_ON(nd->flags & LOOKUP_RCU); + + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + + if (unlikely(current->total_link_count >= 40)) { + *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ + path_put(&nd->path); + return -ELOOP; + } + cond_resched(); + current->total_link_count++; + + touch_atime(link->mnt, dentry); + nd_set_link(nd, NULL); + + error = security_inode_follow_link(link->dentry, nd); + if (error) { + *p = ERR_PTR(error); /* no ->put_link(), please */ + path_put(&nd->path); + return error; + } + + nd->last_type = LAST_BIND; + *p = dentry->d_inode->i_op->follow_link(dentry, nd); + error = PTR_ERR(*p); + if (!IS_ERR(*p)) { + char *s = nd_get_link(nd); + error = 0; + if (s) + error = __vfs_follow_link(nd, s); + else if (nd->last_type == LAST_BIND) { + nd->flags |= LOOKUP_JUMPED; + nd->inode = nd->path.dentry->d_inode; + if (nd->inode->i_op->follow_link) { + /* stepped on a _really_ weird one */ + path_put(&nd->path); + error = -ELOOP; + } + } + } + return error; +} + +static int follow_up_rcu(struct path *path) +{ + struct vfsmount *parent; + struct dentry *mountpoint; + + parent = path->mnt->mnt_parent; + if (parent == path->mnt) + return 0; + 
mountpoint = path->mnt->mnt_mountpoint; + path->dentry = mountpoint; + path->mnt = parent; + return 1; +} + +int follow_up(struct path *path) +{ + struct vfsmount *parent; + struct dentry *mountpoint; + + br_read_lock(vfsmount_lock); + parent = path->mnt->mnt_parent; + if (parent == path->mnt) { + br_read_unlock(vfsmount_lock); + return 0; + } + mntget(parent); + mountpoint = dget(path->mnt->mnt_mountpoint); + br_read_unlock(vfsmount_lock); + dput(path->dentry); + path->dentry = mountpoint; + mntput(path->mnt); + path->mnt = parent; + return 1; +} + +/* + * Perform an automount + * - return -EISDIR to tell follow_managed() to stop and return the path we + * were called with. + */ +static int follow_automount(struct path *path, unsigned flags, + bool *need_mntput) +{ + struct vfsmount *mnt; + int err; + + if (!path->dentry->d_op || !path->dentry->d_op->d_automount) + return -EREMOTE; + + /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT + * and this is the terminal part of the path. + */ + if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE)) + return -EISDIR; /* we actually want to stop here */ + + /* We don't want to mount if someone's just doing a stat - + * unless they're stat'ing a directory and appended a '/' to + * the name. + * + * We do, however, want to mount if someone wants to open or + * create a file of any type under the mountpoint, wants to + * traverse through the mountpoint or wants to open the + * mounted directory. Also, autofs may mark negative dentries + * as being automount points. These will need the attentions + * of the daemon to instantiate them before they can be used. + */ + if (!(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; + + current->total_link_count++; + if (current->total_link_count >= 40) + return -ELOOP; + + mnt = path->dentry->d_op->d_automount(path); + if (IS_ERR(mnt)) { + /* + * The filesystem is allowed to return -EISDIR here to indicate + * it doesn't want to automount. For instance, autofs would do + * this so that its userspace daemon can mount on this dentry. + * + * However, we can only permit this if it's a terminal point in + * the path being looked up; if it wasn't then the remainder of + * the path is inaccessible and we should say so. + */ + if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE)) + return -EREMOTE; + return PTR_ERR(mnt); + } + + if (!mnt) /* mount collision */ + return 0; + + if (!*need_mntput) { + /* lock_mount() may release path->mnt on error */ + mntget(path->mnt); + *need_mntput = true; + } + err = finish_automount(mnt, path); + + switch (err) { + case -EBUSY: + /* Someone else made a mount here whilst we were busy */ + return 0; + case 0: + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + return 0; + default: + return err; + } + +} + +/* + * Handle a dentry that is managed in some way. + * - Flagged for transit management (autofs) + * - Flagged as mountpoint + * - Flagged as automount point + * + * This may only be called in refwalk mode. 
+ * + * Serialization is taken care of in namespace.c + */ +static int follow_managed(struct path *path, unsigned flags) +{ + struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ + unsigned managed; + bool need_mntput = false; + int ret = 0; + + /* Given that we're not holding a lock here, we retain the value in a + * local variable for each dentry as we look at it so that we don't see + * the components of that value change under us */ + while (managed = ACCESS_ONCE(path->dentry->d_flags), + managed &= DCACHE_MANAGED_DENTRY, + unlikely(managed != 0)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, false); + if (ret < 0) + break; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + if (need_mntput) + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + need_mntput = true; + continue; + } + + /* Something is mounted on this dentry in another + * namespace and/or whatever was mounted there in this + * namespace got unmounted before we managed to get the + * vfsmount_lock */ + } + + /* Handle an automount point */ + if (managed & DCACHE_NEED_AUTOMOUNT) { + ret = follow_automount(path, flags, &need_mntput); + if (ret < 0) + break; + continue; + } + + /* We didn't change the current path point */ + break; + } + + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (ret == -EISDIR) + ret = 0; + return ret < 0 ? ret : need_mntput; +} + +int follow_down_one(struct path *path) +{ + struct vfsmount *mounted; + + mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + return 1; + } + return 0; +} + +static inline bool managed_dentry_might_block(struct dentry *dentry) +{ + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && + dentry->d_op->d_manage(dentry, true) < 0); +} + +/* + * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if + * we meet a managed dentry that would need blocking. + */ +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode) +{ + for (;;) { + struct vfsmount *mounted; + /* + * Don't forget we might have a non-mountpoint managed dentry + * that wants to block transit. + */ + if (unlikely(managed_dentry_might_block(path->dentry))) + return false; + + if (!d_mountpoint(path->dentry)) + break; + + mounted = __lookup_mnt(path->mnt, path->dentry, 1); + if (!mounted) + break; + path->mnt = mounted; + path->dentry = mounted->mnt_root; + nd->flags |= LOOKUP_JUMPED; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + /* + * Update the inode too. We don't need to re-check the + * dentry sequence number here after this d_inode read, + * because a mount-point is always pinned. 
+ */ + *inode = path->dentry->d_inode; + } + return true; +} + +static void follow_mount_rcu(struct nameidata *nd) +{ + while (d_mountpoint(nd->path.dentry)) { + struct vfsmount *mounted; + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); + if (!mounted) + break; + nd->path.mnt = mounted; + nd->path.dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + } +} + +static int follow_dotdot_rcu(struct nameidata *nd) +{ + set_root_rcu(nd); + + while (1) { + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { + break; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + struct dentry *old = nd->path.dentry; + struct dentry *parent = old->d_parent; + unsigned seq; + + seq = read_seqcount_begin(&parent->d_seq); + if (read_seqcount_retry(&old->d_seq, nd->seq)) + goto failed; + nd->path.dentry = parent; + nd->seq = seq; + break; + } + if (!follow_up_rcu(&nd->path)) + break; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + } + follow_mount_rcu(nd); + nd->inode = nd->path.dentry->d_inode; + return 0; + +failed: + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; +} + +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + */ +int follow_down(struct path *path) +{ + unsigned managed; + int ret; + + while (managed = ACCESS_ONCE(path->dentry->d_flags), + unlikely(managed & DCACHE_MANAGED_DENTRY)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. + * + * We indicate to the filesystem if someone is trying to mount + * something here. This gives autofs the chance to deny anyone + * other than its daemon the right to mount on its + * superstructure. + * + * The filesystem may sleep at this point. + */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage( + path->dentry, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + continue; + } + + /* Don't handle automount points here */ + break; + } + return 0; +} + +/* + * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() + */ +static void follow_mount(struct path *path) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + } +} + +static void follow_dotdot(struct nameidata *nd) +{ + set_root(nd); + + while(1) { + struct dentry *old = nd->path.dentry; + + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { + break; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + /* rare case of legitimate dget_parent()... */ + nd->path.dentry = dget_parent(nd->path.dentry); + dput(old); + break; + } + if (!follow_up(&nd->path)) + break; + } + follow_mount(&nd->path); + nd->inode = nd->path.dentry->d_inode; +} + +/* + * Allocate a dentry with name and parent, and perform a parent + * directory ->lookup on it. 
Returns the new dentry, or ERR_PTR + * on error. parent->d_inode->i_mutex must be held. d_lookup must + * have verified that no child exists while under i_mutex. + */ +static struct dentry *d_alloc_and_lookup(struct dentry *parent, + struct qstr *name, struct nameidata *nd) +{ + struct inode *inode = parent->d_inode; + struct dentry *dentry; + struct dentry *old; + + /* Don't create child dentry for a dead directory. */ + if (unlikely(IS_DEADDIR(inode))) + return ERR_PTR(-ENOENT); + + dentry = d_alloc(parent, name); + if (unlikely(!dentry)) + return ERR_PTR(-ENOMEM); + + old = inode->i_op->lookup(inode, dentry, nd); + if (unlikely(old)) { + dput(dentry); + dentry = old; + } + return dentry; +} + +/* + * It's more convoluted than I'd like it to be, but... it's still fairly + * small and for now I'd prefer to have fast path as straight as possible. + * It _is_ time-critical. + */ +static int do_lookup(struct nameidata *nd, struct qstr *name, + struct path *path, struct inode **inode) +{ + struct vfsmount *mnt = nd->path.mnt; + struct dentry *dentry, *parent = nd->path.dentry; + int need_reval = 1; + int status = 1; + int err; + + /* + * Rename seqlock is not required here because in the off chance + * of a false negative due to a concurrent rename, we're going to + * do the non-racy lookup, below. + */ + if (nd->flags & LOOKUP_RCU) { + unsigned seq; + *inode = nd->inode; + dentry = __d_lookup_rcu(parent, name, &seq, inode); + if (!dentry) + goto unlazy; + + /* Memory barrier in read_seqcount_begin of child is enough */ + if (__read_seqcount_retry(&parent->d_seq, nd->seq)) + return -ECHILD; + nd->seq = seq; + + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status != -ECHILD) + need_reval = 0; + goto unlazy; + } + } + path->mnt = mnt; + path->dentry = dentry; + if (unlikely(!__follow_mount_rcu(nd, path, inode))) + goto unlazy; + if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + goto unlazy; + return 0; +unlazy: + if (unlazy_walk(nd, dentry)) + return -ECHILD; + } else { + dentry = __d_lookup(parent, name); + } + +retry: + if (unlikely(!dentry)) { + struct inode *dir = parent->d_inode; + BUG_ON(nd->inode != dir); + + mutex_lock(&dir->i_mutex); + dentry = d_lookup(parent, name); + if (likely(!dentry)) { + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + /* known good */ + need_reval = 0; + status = 1; + } + mutex_unlock(&dir->i_mutex); + } + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status < 0) { + dput(dentry); + return status; + } + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + need_reval = 1; + goto retry; + } + } + + path->mnt = mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) { + path_put_conditional(path, nd); + return err; + } + if (err) + nd->flags |= LOOKUP_JUMPED; + *inode = path->dentry->d_inode; + return 0; +} + +static inline int may_lookup(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + int err = exec_permission(nd->inode, IPERM_FLAG_RCU); + if (err != -ECHILD) + return err; + if (unlazy_walk(nd, NULL)) + return -ECHILD; + } + return exec_permission(nd->inode, 0); +} + +static inline int handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return 
-ECHILD; + } else + follow_dotdot(nd); + } + return 0; +} + +static void terminate_walk(struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + path_put(&nd->path); + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } +} + +static inline int walk_component(struct nameidata *nd, struct path *path, + struct qstr *name, int type, int follow) +{ + struct inode *inode; + int err; + /* + * "." and ".." are special - ".." especially so because it has + * to be able to know about the current root directory and + * parent relationships. + */ + if (unlikely(type != LAST_NORM)) + return handle_dots(nd, type); + err = do_lookup(nd, name, path, &inode); + if (unlikely(err)) { + terminate_walk(nd); + return err; + } + if (!inode) { + path_to_nameidata(path, nd); + terminate_walk(nd); + return -ENOENT; + } + if (unlikely(inode->i_op->follow_link) && follow) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(unlazy_walk(nd, path->dentry))) { + terminate_walk(nd); + return -ECHILD; + } + } + BUG_ON(inode != path->dentry->d_inode); + return 1; + } + path_to_nameidata(path, nd); + nd->inode = inode; + return 0; +} + +/* + * This limits recursive symlink follows to 8, while + * limiting consecutive symlinks to 40. + * + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +static inline int nested_symlink(struct path *path, struct nameidata *nd) +{ + int res; + + if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { + path_put_conditional(path, nd); + path_put(&nd->path); + return -ELOOP; + } + BUG_ON(nd->depth >= MAX_NESTED_LINKS); + + nd->depth++; + current->link_count++; + + do { + struct path link = *path; + void *cookie; + + res = follow_link(&link, nd, &cookie); + if (!res) + res = walk_component(nd, path, &nd->last, + nd->last_type, LOOKUP_FOLLOW); + put_link(nd, &link, cookie); + } while (res > 0); + + current->link_count--; + nd->depth--; + return res; +} + +/* + * Name resolution. + * This is the basic name resolution function, turning a pathname into + * the final dentry. We expect 'base' to be positive and a directory. + * + * Returns 0 and nd will have valid dentry and mnt on success. + * Returns error and drops reference to input namei data on failure. + */ +static int link_path_walk(const char *name, struct nameidata *nd) +{ + struct path next; + int err; + unsigned int lookup_flags = nd->flags; + + while (*name=='/') + name++; + if (!*name) + return 0; + + /* At this point we know we have a real path component. 
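The limit of 40 consecutive symlink follows enforced above can be seen from user space. A sketch, assuming a throw-away working directory since it leaves the chain behind:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        char name[32], target[32];
        int i, fd;

        if (creat("link0", 0644) < 0)   /* regular file at the chain's end */
                return 1;
        for (i = 1; i <= 41; i++) {
                snprintf(name, sizeof(name), "link%d", i);
                snprintf(target, sizeof(target), "link%d", i - 1);
                if (symlink(target, name) < 0)
                        return 1;
        }
        fd = open("link40", O_RDONLY);  /* 40 consecutive follows: resolves */
        printf("link40: %s\n", fd < 0 ? strerror(errno) : "ok");
        fd = open("link41", O_RDONLY);  /* 41 follows: ELOOP */
        printf("link41: %s\n", fd < 0 ? strerror(errno) : "ok");
        return 0;
}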
*/ + for(;;) { + unsigned long hash; + struct qstr this; + unsigned int c; + int type; + + nd->flags |= LOOKUP_CONTINUE; + + err = may_lookup(nd); + if (err) + break; + + this.name = name; + c = *(const unsigned char *)name; + + hash = init_name_hash(); + do { + name++; + hash = partial_name_hash(c, hash); + c = *(const unsigned char *)name; + } while (c && (c != '/')); + this.len = name - (const char *) this.name; + this.hash = end_name_hash(hash); + + type = LAST_NORM; + if (this.name[0] == '.') switch (this.len) { + case 2: + if (this.name[1] == '.') { + type = LAST_DOTDOT; + nd->flags |= LOOKUP_JUMPED; + } + break; + case 1: + type = LAST_DOT; + } + if (likely(type == LAST_NORM)) { + struct dentry *parent = nd->path.dentry; + nd->flags &= ~LOOKUP_JUMPED; + if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + err = parent->d_op->d_hash(parent, nd->inode, + &this); + if (err < 0) + break; + } + } + + /* remove trailing slashes? */ + if (!c) + goto last_component; + while (*++name == '/'); + if (!*name) + goto last_component; + + err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); + if (err < 0) + return err; + + if (err) { + err = nested_symlink(&next, nd); + if (err) + return err; + } + err = -ENOTDIR; + if (!nd->inode->i_op->lookup) + break; + continue; + /* here ends the main loop */ + +last_component: + /* Clear LOOKUP_CONTINUE iff it was previously unset */ + nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; + nd->last = this; + nd->last_type = type; + return 0; + } + terminate_walk(nd); + return err; +} + +static int path_init(int dfd, const char *name, unsigned int flags, + struct nameidata *nd, struct file **fp) +{ + int retval = 0; + int fput_needed; + struct file *file; + + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags | LOOKUP_JUMPED; + nd->depth = 0; + if (flags & LOOKUP_ROOT) { + struct inode *inode = nd->root.dentry->d_inode; + if (*name) { + if (!inode->i_op->lookup) + return -ENOTDIR; + retval = inode_permission(inode, MAY_EXEC); + if (retval) + return retval; + } + nd->path = nd->root; + nd->inode = inode; + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } else { + path_get(&nd->path); + } + return 0; + } + + nd->root.mnt = NULL; + + if (*name=='/') { + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + set_root_rcu(nd); + } else { + set_root(nd); + path_get(&nd->root); + } + nd->path = nd->root; + } else if (dfd == AT_FDCWD) { + if (flags & LOOKUP_RCU) { + struct fs_struct *fs = current->fs; + unsigned seq; + + br_read_lock(vfsmount_lock); + rcu_read_lock(); + + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_pwd(current->fs, &nd->path); + } + } else { + struct dentry *dentry; + + file = fget_raw_light(dfd, &fput_needed); + retval = -EBADF; + if (!file) + goto out_fail; + + dentry = file->f_path.dentry; + + if (*name) { + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; + + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + } + + nd->path = file->f_path; + if (flags & LOOKUP_RCU) { + if (fput_needed) + *fp = file; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } else { + path_get(&file->f_path); + fput_light(file, fput_needed); + } + } + + nd->inode = 
nd->path.dentry->d_inode; + return 0; + +fput_fail: + fput_light(file, fput_needed); +out_fail: + return retval; +} + +static inline int lookup_last(struct nameidata *nd, struct path *path) +{ + if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + + nd->flags &= ~LOOKUP_PARENT; + return walk_component(nd, path, &nd->last, nd->last_type, + nd->flags & LOOKUP_FOLLOW); +} + +/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +static int path_lookupat(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + struct file *base = NULL; + struct path path; + int err; + + /* + * Path walking is largely split up into 2 different synchronisation + * schemes, rcu-walk and ref-walk (explained in + * Documentation/filesystems/path-lookup.txt). These share much of the + * path walk code, but some things particularly setup, cleanup, and + * following mounts are sufficiently divergent that functions are + * duplicated. Typically there is a function foo(), and its RCU + * analogue, foo_rcu(). + * + * -ECHILD is the error number of choice (just to avoid clashes) that + * is returned if some aspect of an rcu-walk fails. Such an error must + * be handled by restarting a traditional ref-walk (which will always + * be able to complete). + */ + err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); + + if (unlikely(err)) + return err; + + current->total_link_count = 0; + err = link_path_walk(name, nd); + + if (!err && !(flags & LOOKUP_PARENT)) { + err = lookup_last(nd, &path); + while (err > 0) { + void *cookie; + struct path link = path; + nd->flags |= LOOKUP_PARENT; + err = follow_link(&link, nd, &cookie); + if (!err) + err = lookup_last(nd, &path); + put_link(nd, &link, cookie); + } + } + + if (!err) + err = complete_walk(nd); + + if (!err && nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) { + path_put(&nd->path); + err = -ENOTDIR; + } + } + + if (base) + fput(base); + + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + return err; +} + +static int do_path_lookup(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + if (unlikely(retval == -ECHILD)) + retval = path_lookupat(dfd, name, flags, nd); + if (unlikely(retval == -ESTALE)) + retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); + + if (likely(!retval)) { + if (unlikely(!audit_dummy_context())) { + if (nd->path.dentry && nd->inode) + audit_inode(name, nd->path.dentry); + } + } + return retval; +} + +int kern_path_parent(const char *name, struct nameidata *nd) +{ + return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); +} + +int kern_path(const char *name, unsigned int flags, struct path *path) +{ + struct nameidata nd; + int res = do_path_lookup(AT_FDCWD, name, flags, &nd); + if (!res) + *path = nd.path; + return res; +} + +/** + * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair + * @dentry: pointer to dentry of the base directory + * @mnt: pointer to vfs mount of the base directory + * @name: pointer to file name + * @flags: lookup flags + * @nd: pointer to nameidata + */ +int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, + const char *name, unsigned int flags, + struct nameidata *nd) +{ + nd->root.dentry = dentry; + nd->root.mnt = mnt; + /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ + return do_path_lookup(AT_FDCWD, 
name, flags | LOOKUP_ROOT, nd); +} + +static struct dentry *__lookup_hash(struct qstr *name, + struct dentry *base, struct nameidata *nd) +{ + struct inode *inode = base->d_inode; + struct dentry *dentry; + int err; + + err = exec_permission(inode, 0); + if (err) + return ERR_PTR(err); + + /* + * Don't bother with __d_lookup: callers are for creat as + * well as unlink, so a lot of the time it would cost + * a double lookup. + */ + dentry = d_lookup(base, name); + + if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) + dentry = do_revalidate(dentry, nd); + + if (!dentry) + dentry = d_alloc_and_lookup(base, name, nd); + + return dentry; +} + +/* + * Restricted form of lookup. Doesn't follow links, single-component only, + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +static struct dentry *lookup_hash(struct nameidata *nd) +{ + return __lookup_hash(&nd->last, nd->path.dentry, nd); +} + +/** + * lookup_one_len - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. Also note that by using this function the + * nameidata argument is passed to the filesystem methods and a filesystem + * using this helper needs to be prepared for that. + */ +struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +{ + struct qstr this; + unsigned long hash; + unsigned int c; + + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + + this.name = name; + this.len = len; + if (!len) + return ERR_PTR(-EACCES); + + hash = init_name_hash(); + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + hash = partial_name_hash(c, hash); + } + this.hash = end_name_hash(hash); + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, base->d_inode, &this); + if (err < 0) + return ERR_PTR(err); + } + + return __lookup_hash(&this, base, NULL); +} + +int user_path_at_empty(int dfd, const char __user *name, unsigned flags, + struct path *path, int *empty) +{ + struct nameidata nd; + char *tmp = getname_flags(name, flags, empty); + int err = PTR_ERR(tmp); + if (!IS_ERR(tmp)) { + + BUG_ON(flags & LOOKUP_PARENT); + + err = do_path_lookup(dfd, tmp, flags, &nd); + putname(tmp); + if (!err) + *path = nd.path; + } + return err; +} + +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) +{ + return user_path_at_empty(dfd, name, flags, path, 0); +} + +static int user_path_parent(int dfd, const char __user *path, + struct nameidata *nd, char **name) +{ + char *s = getname(path); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); + if (error) + putname(s); + else + *name = s; + + return error; +} + +/* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. 
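A minimal sketch of the calling convention that the lookup_one_len() kerneldoc above describes, as filesystem code might use it to find a well-known child of a directory it controls; the myfs_find_child() name is hypothetical.

#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/string.h>

/* Returns a referenced, positive dentry or ERR_PTR(). */
static struct dentry *myfs_find_child(struct dentry *dir, const char *name)
{
        struct dentry *child;

        /* lookup_one_len() requires the parent's i_mutex to be held. */
        mutex_lock(&dir->d_inode->i_mutex);
        child = lookup_one_len(name, dir, strlen(name));
        mutex_unlock(&dir->d_inode->i_mutex);

        if (IS_ERR(child))
                return child;
        if (!child->d_inode) {          /* negative dentry: no such entry */
                dput(child);
                return ERR_PTR(-ENOENT);
        }
        return child;                   /* caller must dput() when done */
}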
+ */
+static inline int check_sticky(struct inode *dir, struct inode *inode)
+{
+        uid_t fsuid = current_fsuid();
+
+        if (!(dir->i_mode & S_ISVTX))
+                return 0;
+        if (current_user_ns() != inode_userns(inode))
+                goto other_userns;
+        if (inode->i_uid == fsuid)
+                return 0;
+        if (dir->i_uid == fsuid)
+                return 0;
+
+other_userns:
+        return !ns_capable(inode_userns(inode), CAP_FOWNER);
+}
+
+/*
+ * Check whether we can remove a link victim from directory dir, check
+ * whether the type of victim is right.
+ * 1. We can't do it if dir is read-only (done in permission())
+ * 2. We should have write and exec permissions on dir
+ * 3. We can't remove anything from append-only dir
+ * 4. We can't do anything with immutable dir (done in permission())
+ * 5. If the sticky bit on dir is set we should either
+ *     a. be owner of dir, or
+ *     b. be owner of victim, or
+ *     c. have CAP_FOWNER capability
+ * 6. If the victim is append-only or immutable we can't do anything with
+ *    links pointing to it.
+ * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 9. We can't remove a root or mountpoint.
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *     nfs_async_unlink().
+ */
+static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
+{
+        int error;
+
+        if (!victim->d_inode)
+                return -ENOENT;
+
+        BUG_ON(victim->d_parent->d_inode != dir);
+        audit_inode_child(victim, dir);
+
+        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+        if (error)
+                return error;
+        if (IS_APPEND(dir))
+                return -EPERM;
+        if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
+            IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+                return -EPERM;
+        if (isdir) {
+                if (!S_ISDIR(victim->d_inode->i_mode))
+                        return -ENOTDIR;
+                if (IS_ROOT(victim))
+                        return -EBUSY;
+        } else if (S_ISDIR(victim->d_inode->i_mode))
+                return -EISDIR;
+        if (IS_DEADDIR(dir))
+                return -ENOENT;
+        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+                return -EBUSY;
+        return 0;
+}
+
+/* Check whether we can create an object with dentry child in directory
+ * dir.
+ * 1. We can't do it if child already exists (open has special treatment for
+ *    this case, but since we are inlined it's OK)
+ * 2. We can't do it if dir is read-only (done in permission())
+ * 3. We should have write and exec permissions on dir
+ * 4. We can't do it if dir is immutable (done in permission())
+ */
+static inline int may_create(struct inode *dir, struct dentry *child)
+{
+        if (child->d_inode)
+                return -EEXIST;
+        if (IS_DEADDIR(dir))
+                return -ENOENT;
+        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * p1 and p2 should be directories on the same fs.
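Rules 7 and 8 of the removal checks above map directly to what user space sees; a small illustration with made-up names:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd;

        mkdir("adir", 0755);
        fd = creat("afile", 0644);
        if (fd >= 0)
                close(fd);

        if (unlink("adir") < 0)         /* victim is a directory */
                printf("unlink(adir): %s\n", strerror(errno));  /* EISDIR */
        if (rmdir("afile") < 0)         /* victim is not a directory */
                printf("rmdir(afile): %s\n", strerror(errno));  /* ENOTDIR */

        rmdir("adir");
        unlink("afile");
        return 0;
}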
+ */ +struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) +{ + struct dentry *p; + + if (p1 == p2) { + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + return NULL; + } + + mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + + p = d_ancestor(p2, p1); + if (p) { + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + return p; + } + + p = d_ancestor(p1, p2); + if (p) { + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + return p; + } + + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + return NULL; +} + +void unlock_rename(struct dentry *p1, struct dentry *p2) +{ + mutex_unlock(&p1->d_inode->i_mutex); + if (p1 != p2) { + mutex_unlock(&p2->d_inode->i_mutex); + mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + } +} + +int vfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if (!dir->i_op->create) + return -EACCES; /* shouldn't it be ENOSYS? */ + mode &= S_IALLUGO; + mode |= S_IFREG; + error = security_inode_create(dir, dentry, mode); + if (error) + return error; + error = dir->i_op->create(dir, dentry, mode, nd); + if (!error) + fsnotify_create(dir, dentry); + return error; +} + +static int may_open(struct path *path, int acc_mode, int flag) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = dentry->d_inode; + int error; + + /* O_PATH? */ + if (!acc_mode) + return 0; + + if (!inode) + return -ENOENT; + + switch (inode->i_mode & S_IFMT) { + case S_IFLNK: + return -ELOOP; + case S_IFDIR: + if (acc_mode & MAY_WRITE) + return -EISDIR; + break; + case S_IFBLK: + case S_IFCHR: + if (path->mnt->mnt_flags & MNT_NODEV) + return -EACCES; + /*FALLTHRU*/ + case S_IFIFO: + case S_IFSOCK: + flag &= ~O_TRUNC; + break; + } + + error = inode_permission(inode, acc_mode); + if (error) + return error; + + /* + * An append-only file must be opened in append mode for writing. + */ + if (IS_APPEND(inode)) { + if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) + return -EPERM; + if (flag & O_TRUNC) + return -EPERM; + } + + /* O_NOATIME can only be set by the owner or superuser */ + if (flag & O_NOATIME && !inode_owner_or_capable(inode)) + return -EPERM; + + /* + * Ensure there are no outstanding leases on the file. + */ + return break_lease(inode, flag); +} + +static int handle_truncate(struct file *filp) +{ + struct path *path = &filp->f_path; + struct inode *inode = path->dentry->d_inode; + int error = get_write_access(inode); + if (error) + return error; + /* + * Refuse to truncate files with mandatory locks held on them. + */ + error = locks_verify_locked(inode); + if (!error) + error = security_path_truncate(path); + if (!error) { + error = do_truncate(path->dentry, 0, + ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, + filp); + } + put_write_access(inode); + return error; +} + +/* + * Note that while the flag value (low two bits) for sys_open means: + * 00 - read-only + * 01 - write-only + * 10 - read-write + * 11 - special + * it is changed into + * 00 - no permissions needed + * 01 - read-permission + * 10 - write-permission + * 11 - read-write + * for the internal routines (ie open_namei()/follow_link() etc) + * This is more logical, and also allows the 00 "no perm needed" + * to be used for symlinks (where the permissions are checked + * later). 
+ * +*/ +static inline int open_to_namei_flags(int flag) +{ + if ((flag+1) & O_ACCMODE) + flag++; + return flag; +} + +/* + * Handle the last step of open() + */ +static struct file *do_last(struct nameidata *nd, struct path *path, + const struct open_flags *op, const char *pathname) +{ + struct dentry *dir = nd->path.dentry; + struct dentry *dentry; + int open_flag = op->open_flag; + int will_truncate = open_flag & O_TRUNC; + int want_write = 0; + int acc_mode = op->acc_mode; + struct file *filp; + int error; + + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= op->intent; + + switch (nd->last_type) { + case LAST_DOTDOT: + case LAST_DOT: + error = handle_dots(nd, nd->last_type); + if (error) + return ERR_PTR(error); + /* fallthrough */ + case LAST_ROOT: + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + audit_inode(pathname, nd->path.dentry); + if (open_flag & O_CREAT) { + error = -EISDIR; + goto exit; + } + goto ok; + case LAST_BIND: + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + audit_inode(pathname, dir); + goto ok; + } + + if (!(open_flag & O_CREAT)) { + int symlink_ok = 0; + if (nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) + symlink_ok = 1; + /* we _can_ be in RCU mode here */ + error = walk_component(nd, path, &nd->last, LAST_NORM, + !symlink_ok); + if (error < 0) + return ERR_PTR(error); + if (error) /* symlink */ + return NULL; + /* sayonara */ + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + + error = -ENOTDIR; + if (nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) + goto exit; + } + audit_inode(pathname, nd->path.dentry); + goto ok; + } + + /* create side of things */ + /* + * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been + * cleared when we got to the last component we are about to look up + */ + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + + audit_inode(pathname, dir); + error = -EISDIR; + /* trailing slashes? */ + if (nd->last.name[nd->last.len]) + goto exit; + + mutex_lock(&dir->d_inode->i_mutex); + + dentry = lookup_hash(nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->d_inode->i_mutex); + goto exit; + } + + path->dentry = dentry; + path->mnt = nd->path.mnt; + + /* Negative dentry, just create the file */ + if (!dentry->d_inode) { + int mode = op->mode; + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); + /* + * This write is needed to ensure that a + * rw->ro transition does not occur between + * the time when the file is created and when + * a permanent write count is taken through + * the 'struct file' in nameidata_to_filp(). + */ + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit_mutex_unlock; + want_write = 1; + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + will_truncate = 0; + acc_mode = MAY_OPEN; + error = security_path_mknod(&nd->path, dentry, mode, 0); + if (error) + goto exit_mutex_unlock; + error = vfs_create(dir->d_inode, dentry, mode, nd); + if (error) + goto exit_mutex_unlock; + mutex_unlock(&dir->d_inode->i_mutex); + dput(nd->path.dentry); + nd->path.dentry = dentry; + goto common; + } + + /* + * It already exists. 
+ */ + mutex_unlock(&dir->d_inode->i_mutex); + audit_inode(pathname, path->dentry); + + error = -EEXIST; + if (open_flag & O_EXCL) + goto exit_dput; + + error = follow_managed(path, nd->flags); + if (error < 0) + goto exit_dput; + + if (error) + nd->flags |= LOOKUP_JUMPED; + + error = -ENOENT; + if (!path->dentry->d_inode) + goto exit_dput; + + if (path->dentry->d_inode->i_op->follow_link) + return NULL; + + path_to_nameidata(path, nd); + nd->inode = path->dentry->d_inode; + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ + error = complete_walk(nd); + if (error) + return ERR_PTR(error); + error = -EISDIR; + if (S_ISDIR(nd->inode->i_mode)) + goto exit; +ok: + if (!S_ISREG(nd->inode->i_mode)) + will_truncate = 0; + + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + want_write = 1; + } +common: + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto exit; + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, op->acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (will_truncate) { + error = handle_truncate(filp); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } +out: + if (want_write) + mnt_drop_write(nd->path.mnt); + path_put(&nd->path); + return filp; + +exit_mutex_unlock: + mutex_unlock(&dir->d_inode->i_mutex); +exit_dput: + path_put_conditional(path, nd); +exit: + filp = ERR_PTR(error); + goto out; +} + +static struct file *path_openat(int dfd, const char *pathname, + struct nameidata *nd, const struct open_flags *op, int flags) +{ + struct file *base = NULL; + struct file *filp; + struct path path; + int error; + + filp = get_empty_filp(); + if (!filp) + return ERR_PTR(-ENFILE); + + filp->f_flags = op->open_flag; + nd->intent.open.file = filp; + nd->intent.open.flags = open_to_namei_flags(op->open_flag); + nd->intent.open.create_mode = op->mode; + + error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); + if (unlikely(error)) + goto out_filp; + + current->total_link_count = 0; + error = link_path_walk(pathname, nd); + if (unlikely(error)) + goto out_filp; + + filp = do_last(nd, &path, op, pathname); + while (unlikely(!filp)) { /* trailing symlink */ + struct path link = path; + void *cookie; + if (!(nd->flags & LOOKUP_FOLLOW)) { + path_put_conditional(&path, nd); + path_put(&nd->path); + filp = ERR_PTR(-ELOOP); + break; + } + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + error = follow_link(&link, nd, &cookie); + if (unlikely(error)) + filp = ERR_PTR(error); + else + filp = do_last(nd, &path, op, pathname); + put_link(nd, &link, cookie); + } +out: + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) + path_put(&nd->root); + if (base) + fput(base); + release_open_intent(nd); + return filp; + +out_filp: + filp = ERR_PTR(error); + goto out; +} + +struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int flags) +{ + struct nameidata nd; + struct file *filp; + + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + if (unlikely(filp == ERR_PTR(-ECHILD))) + filp = path_openat(dfd, pathname, &nd, op, flags); + if (unlikely(filp == ERR_PTR(-ESTALE))) + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + return filp; +} + +struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *name, const struct open_flags *op, int flags) +{ + struct nameidata nd; + struct file *file; + + 
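/* + * Like do_filp_open() above, this tries an RCU-walk first (LOOKUP_RCU), + * falls back to ref-walk on -ECHILD, and retries with LOOKUP_REVAL if the + * cached dentries turn out to be stale (-ESTALE). The walk is pinned to + * the caller-supplied root via LOOKUP_ROOT. + */ + 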
nd.root.mnt = mnt; + nd.root.dentry = dentry; + + flags |= LOOKUP_ROOT; + + if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + return ERR_PTR(-ELOOP); + + file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); + if (unlikely(file == ERR_PTR(-ECHILD))) + file = path_openat(-1, name, &nd, op, flags); + if (unlikely(file == ERR_PTR(-ESTALE))) + file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); + return file; +} + +/** + * lookup_create - lookup a dentry, creating it if it doesn't exist + * @nd: nameidata info + * @is_dir: directory flag + * + * Simple function to lookup and return a dentry and create it + * if it doesn't exist. Is SMP-safe. + * + * Returns with nd->path.dentry->d_inode->i_mutex locked. + */ +struct dentry *lookup_create(struct nameidata *nd, int is_dir) +{ + struct dentry *dentry = ERR_PTR(-EEXIST); + + mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + /* + * Yucky last component or no last component at all? + * (foo/., foo/.., /////) + */ + if (nd->last_type != LAST_NORM) + goto fail; + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL; + nd->intent.open.flags = O_EXCL; + + /* + * Do the final lookup. + */ + dentry = lookup_hash(nd); + if (IS_ERR(dentry)) + goto fail; + + if (dentry->d_inode) + goto eexist; + /* + * Special case - lookup gave negative, but... we had foo/bar/ + * From the vfs_mknod() POV we just have a negative dentry - + * all is fine. Let's be bastards - you had / on the end, you've + * been asking for (non-existent) directory. -ENOENT for you. + */ + if (unlikely(!is_dir && nd->last.name[nd->last.len])) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + } + return dentry; +eexist: + dput(dentry); + dentry = ERR_PTR(-EEXIST); +fail: + return dentry; +} +EXPORT_SYMBOL_GPL(lookup_create); + +int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if ((S_ISCHR(mode) || S_ISBLK(mode)) && + !ns_capable(inode_userns(dir), CAP_MKNOD)) + return -EPERM; + + if (!dir->i_op->mknod) + return -EPERM; + + error = devcgroup_inode_mknod(mode, dev); + if (error) + return error; + + error = security_inode_mknod(dir, dentry, mode, dev); + if (error) + return error; + + error = dir->i_op->mknod(dir, dentry, mode, dev); + if (!error) + fsnotify_create(dir, dentry); + return error; +} + +static int may_mknod(mode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + case 0: /* zero mode translates to S_IFREG */ + return 0; + case S_IFDIR: + return -EPERM; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, + unsigned, dev) +{ + int error; + char *tmp; + struct dentry *dentry; + struct nameidata nd; + + if (S_ISDIR(mode)) + return -EPERM; + + error = user_path_parent(dfd, filename, &nd, &tmp); + if (error) + return error; + + dentry = lookup_create(&nd, 0); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_unlock; + } + if (!IS_POSIXACL(nd.path.dentry->d_inode)) + mode &= ~current_umask(); + error = may_mknod(mode); + if (error) + goto out_dput; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + error = security_path_mknod(&nd.path, dentry, mode, dev); + if (error) + goto out_drop_write; + switch (mode & S_IFMT) { + case 0: case S_IFREG: + error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); + break; + case S_IFCHR: case S_IFBLK: + error = 
vfs_mknod(nd.path.dentry->d_inode,dentry,mode, + new_decode_dev(dev)); + break; + case S_IFIFO: case S_IFSOCK: + error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); + break; + } +out_drop_write: + mnt_drop_write(nd.path.mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + putname(tmp); + + return error; +} + +SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev) +{ + return sys_mknodat(AT_FDCWD, filename, mode, dev); +} + +int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if (!dir->i_op->mkdir) + return -EPERM; + + mode &= (S_IRWXUGO|S_ISVTX); + error = security_inode_mkdir(dir, dentry, mode); + if (error) + return error; + + error = dir->i_op->mkdir(dir, dentry, mode); + if (!error) + fsnotify_mkdir(dir, dentry); + return error; +} + +SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) +{ + int error = 0; + char * tmp; + struct dentry *dentry; + struct nameidata nd; + + error = user_path_parent(dfd, pathname, &nd, &tmp); + if (error) + goto out_err; + + dentry = lookup_create(&nd, 1); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + if (!IS_POSIXACL(nd.path.dentry->d_inode)) + mode &= ~current_umask(); + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + error = security_path_mkdir(&nd.path, dentry, mode); + if (error) + goto out_drop_write; + error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); +out_drop_write: + mnt_drop_write(nd.path.mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + putname(tmp); +out_err: + return error; +} + +SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) +{ + return sys_mkdirat(AT_FDCWD, pathname, mode); +} + +/* + * The dentry_unhash() helper will try to drop the dentry early: we + * should have a usage count of 2 if we're the only user of this + * dentry, and if that is true (possibly after pruning the dcache), + * then we drop the dentry now. + * + * A low-level filesystem can, if it chooses, legally + * do a + * + * if (!d_unhashed(dentry)) + * return -EBUSY; + * + * if it cannot handle the case of removing a directory + * that is still in use by something else.
+ */ +void dentry_unhash(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + spin_lock(&dentry->d_lock); + if (dentry->d_count == 1) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} + +int vfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error = may_delete(dir, dentry, 1); + + if (error) + return error; + + if (!dir->i_op->rmdir) + return -EPERM; + + dget(dentry); + mutex_lock(&dentry->d_inode->i_mutex); + + error = -EBUSY; + if (d_mountpoint(dentry)) + goto out; + + error = security_inode_rmdir(dir, dentry); + if (error) + goto out; + + shrink_dcache_parent(dentry); + error = dir->i_op->rmdir(dir, dentry); + if (error) + goto out; + + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + +out: + mutex_unlock(&dentry->d_inode->i_mutex); + dput(dentry); + if (!error) + d_delete(dentry); + return error; +} + +static long do_rmdir(int dfd, const char __user *pathname) +{ + int error = 0; + char * name; + struct dentry *dentry; + struct nameidata nd; + + error = user_path_parent(dfd, pathname, &nd, &name); + if (error) + return error; + + switch(nd.last_type) { + case LAST_DOTDOT: + error = -ENOTEMPTY; + goto exit1; + case LAST_DOT: + error = -EINVAL; + goto exit1; + case LAST_ROOT: + error = -EBUSY; + goto exit1; + } + + nd.flags &= ~LOOKUP_PARENT; + + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto exit2; + if (!dentry->d_inode) { + error = -ENOENT; + goto exit3; + } + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit3; + error = security_path_rmdir(&nd.path, dentry); + if (error) + goto exit4; + error = vfs_rmdir(nd.path.dentry->d_inode, dentry); +exit4: + mnt_drop_write(nd.path.mnt); +exit3: + dput(dentry); +exit2: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +exit1: + path_put(&nd.path); + putname(name); + return error; +} + +SYSCALL_DEFINE1(rmdir, const char __user *, pathname) +{ + return do_rmdir(AT_FDCWD, pathname); +} + +int vfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error = may_delete(dir, dentry, 0); + + if (error) + return error; + + if (!dir->i_op->unlink) + return -EPERM; + + mutex_lock(&dentry->d_inode->i_mutex); + if (d_mountpoint(dentry)) + error = -EBUSY; + else { + error = security_inode_unlink(dir, dentry); + if (!error) { + error = dir->i_op->unlink(dir, dentry); + if (!error) + dont_mount(dentry); + } + } + mutex_unlock(&dentry->d_inode->i_mutex); + + /* We don't d_delete() NFS sillyrenamed files--they still exist. */ + if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { + fsnotify_link_count(dentry->d_inode); + d_delete(dentry); + } + + return error; +} + +/* + * Make sure that the actual truncation of the file will occur outside its + * directory's i_mutex. Truncate can take a long time if there is a lot of + * writeout happening, and we don't want to prevent access to the directory + * while waiting on the I/O. + */ +static long do_unlinkat(int dfd, const char __user *pathname) +{ + int error; + char *name; + struct dentry *dentry; + struct nameidata nd; + struct inode *inode = NULL; + + error = user_path_parent(dfd, pathname, &nd, &name); + if (error) + return error; + + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; + + nd.flags &= ~LOOKUP_PARENT; + + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? 
Because we want correct error value */ + if (nd.last.name[nd.last.len]) + goto slashes; + inode = dentry->d_inode; + if (!inode) + goto slashes; + ihold(inode); + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit2; + error = security_path_unlink(&nd.path, dentry); + if (error) + goto exit3; + error = vfs_unlink(nd.path.dentry->d_inode, dentry); +exit3: + mnt_drop_write(nd.path.mnt); + exit2: + dput(dentry); + } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + if (inode) + iput(inode); /* truncate the inode here */ +exit1: + path_put(&nd.path); + putname(name); + return error; + +slashes: + error = !dentry->d_inode ? -ENOENT : + S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; + goto exit2; +} + +SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) +{ + if ((flag & ~AT_REMOVEDIR) != 0) + return -EINVAL; + + if (flag & AT_REMOVEDIR) + return do_rmdir(dfd, pathname); + + return do_unlinkat(dfd, pathname); +} + +SYSCALL_DEFINE1(unlink, const char __user *, pathname) +{ + return do_unlinkat(AT_FDCWD, pathname); +} + +int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +{ + int error = may_create(dir, dentry); + + if (error) + return error; + + if (!dir->i_op->symlink) + return -EPERM; + + error = security_inode_symlink(dir, dentry, oldname); + if (error) + return error; + + error = dir->i_op->symlink(dir, dentry, oldname); + if (!error) + fsnotify_create(dir, dentry); + return error; +} + +SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + int error; + char *from; + char *to; + struct dentry *dentry; + struct nameidata nd; + + from = getname(oldname); + if (IS_ERR(from)) + return PTR_ERR(from); + + error = user_path_parent(newdfd, newname, &nd, &to); + if (error) + goto out_putname; + + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + error = security_path_symlink(&nd.path, dentry, from); + if (error) + goto out_drop_write; + error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); +out_drop_write: + mnt_drop_write(nd.path.mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + putname(to); +out_putname: + putname(from); + return error; +} + +SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) +{ + return sys_symlinkat(oldname, AT_FDCWD, newname); +} + +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + if (!inode) + return -ENOENT; + + error = may_create(dir, new_dentry); + if (error) + return error; + + if (dir->i_sb != inode->i_sb) + return -EXDEV; + + /* + * A link to an append-only or immutable file cannot be created. + */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + if (!dir->i_op->link) + return -EPERM; + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + error = security_inode_link(old_dentry, dir, new_dentry); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + /* Make sure we don't allow creating hardlink to an unlinked file */ + if (inode->i_nlink == 0) + error = -ENOENT; + else + error = dir->i_op->link(old_dentry, dir, new_dentry); + mutex_unlock(&inode->i_mutex); + if (!error) + fsnotify_link(dir, inode, new_dentry); + return error; +} + +/* + * Hardlinks are often used in delicate situations. 
We avoid + * security-related surprises by not following symlinks on the + * newname. --KAB + * + * We don't follow them on the oldname either to be compatible + * with linux 2.0, and to avoid hard-linking to directories + * and other special files. --ADM + */ +SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, int, flags) +{ + struct dentry *new_dentry; + struct nameidata nd; + struct path old_path; + int how = 0; + int error; + char *to; + + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + /* + * To use null names we require CAP_DAC_READ_SEARCH. + * This ensures that not everyone will be able to create + * a hard link using the passed file descriptor. + */ + if (flags & AT_EMPTY_PATH) { + if (!capable(CAP_DAC_READ_SEARCH)) + return -ENOENT; + how = LOOKUP_EMPTY; + } + + if (flags & AT_SYMLINK_FOLLOW) + how |= LOOKUP_FOLLOW; + + error = user_path_at(olddfd, oldname, how, &old_path); + if (error) + return error; + + error = user_path_parent(newdfd, newname, &nd, &to); + if (error) + goto out; + error = -EXDEV; + if (old_path.mnt != nd.path.mnt) + goto out_release; + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto out_unlock; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + error = security_path_link(old_path.dentry, &nd.path, new_dentry); + if (error) + goto out_drop_write; + error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); +out_drop_write: + mnt_drop_write(nd.path.mnt); +out_dput: + dput(new_dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +out_release: + path_put(&nd.path); + putname(to); +out: + path_put(&old_path); + + return error; +} + +SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) +{ + return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); +} + +/* + * The worst of all namespace operations - renaming directory. "Perverted" + * doesn't even start to describe it. Somebody in UCB had a heck of a trip... + * Problems: + * a) we can get into loop creation. Check is done in is_subdir(). + * b) race potential - two innocent renames can create a loop together. + * That's where 4.4 screws up. Current fix: serialization on + * sb->s_vfs_rename_mutex. We might be more accurate, but that's another + * story. + * c) we have to lock _three_ objects - parents and victim (if it exists). + * And that - after we got ->i_mutex on parents (until then we don't know + * whether the target exists). Solution: try to be smart with locking + * order for inodes. We rely on the fact that tree topology may change + * only under ->s_vfs_rename_mutex _and_ that parent of the object we + * move will be locked. Thus we can rank directories by the tree + * (ancestors first) and rank all non-directories after them. + * That works since everybody except rename does "lock parent, lookup, + * lock child" and rename is under ->s_vfs_rename_mutex. + * HOWEVER, it relies on the assumption that any object with ->lookup() + * has no more than 1 dentry. If "hybrid" objects will ever appear, + * we'd better make sure that there's no link(2) for them. + * d) conversion from fhandle to dentry may come in the wrong moment - when + * we are removing the target. Solution: we will have to grab ->i_mutex + * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on + * ->i_mutex on parents, which works but leads to some truly excessive + * locking].
+ */ +static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int error = 0; + struct inode *target = new_dentry->d_inode; + + /* + * If we are going to change the parent - check write permissions, + * we'll need to flip '..'. + */ + if (new_dir != old_dir) { + error = inode_permission(old_dentry->d_inode, MAY_WRITE); + if (error) + return error; + } + + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + return error; + + dget(new_dentry); + if (target) + mutex_lock(&target->i_mutex); + + error = -EBUSY; + if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) + goto out; + + if (target) + shrink_dcache_parent(new_dentry); + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + goto out; + + if (target) { + target->i_flags |= S_DEAD; + dont_mount(new_dentry); + } +out: + if (target) + mutex_unlock(&target->i_mutex); + dput(new_dentry); + if (!error) + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry,new_dentry); + return error; +} + +static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *target = new_dentry->d_inode; + int error; + + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + return error; + + dget(new_dentry); + if (target) + mutex_lock(&target->i_mutex); + + error = -EBUSY; + if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) + goto out; + + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + goto out; + + if (target) + dont_mount(new_dentry); + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry, new_dentry); +out: + if (target) + mutex_unlock(&target->i_mutex); + dput(new_dentry); + return error; +} + +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int error; + int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + const unsigned char *old_name; + + if (old_dentry->d_inode == new_dentry->d_inode) + return 0; + + error = may_delete(old_dir, old_dentry, is_dir); + if (error) + return error; + + if (!new_dentry->d_inode) + error = may_create(new_dir, new_dentry); + else + error = may_delete(new_dir, new_dentry, is_dir); + if (error) + return error; + + if (!old_dir->i_op->rename) + return -EPERM; + + old_name = fsnotify_oldname_init(old_dentry->d_name.name); + + if (is_dir) + error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); + else + error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); + if (!error) + fsnotify_move(old_dir, new_dir, old_name, is_dir, + new_dentry->d_inode, old_dentry); + fsnotify_oldname_free(old_name); + + return error; +} + +SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + struct dentry *old_dir, *new_dir; + struct dentry *old_dentry, *new_dentry; + struct dentry *trap; + struct nameidata oldnd, newnd; + char *from; + char *to; + int error; + + error = user_path_parent(olddfd, oldname, &oldnd, &from); + if (error) + goto exit; + + error = user_path_parent(newdfd, newname, &newnd, &to); + if (error) + goto exit1; + + error = -EXDEV; + if (oldnd.path.mnt != newnd.path.mnt) + goto exit2; + + old_dir = oldnd.path.dentry; + error = -EBUSY; + if (oldnd.last_type != LAST_NORM) + goto exit2; + + new_dir = 
newnd.path.dentry; + if (newnd.last_type != LAST_NORM) + goto exit2; + + oldnd.flags &= ~LOOKUP_PARENT; + newnd.flags &= ~LOOKUP_PARENT; + newnd.flags |= LOOKUP_RENAME_TARGET; + + trap = lock_rename(new_dir, old_dir); + + old_dentry = lookup_hash(&oldnd); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; + /* source must exist */ + error = -ENOENT; + if (!old_dentry->d_inode) + goto exit4; + /* unless the source is a directory trailing slashes give -ENOTDIR */ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) { + error = -ENOTDIR; + if (oldnd.last.name[oldnd.last.len]) + goto exit4; + if (newnd.last.name[newnd.last.len]) + goto exit4; + } + /* source should not be ancestor of target */ + error = -EINVAL; + if (old_dentry == trap) + goto exit4; + new_dentry = lookup_hash(&newnd); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + /* target should not be an ancestor of source */ + error = -ENOTEMPTY; + if (new_dentry == trap) + goto exit5; + + error = mnt_want_write(oldnd.path.mnt); + if (error) + goto exit5; + error = security_path_rename(&oldnd.path, old_dentry, + &newnd.path, new_dentry); + if (error) + goto exit6; + error = vfs_rename(old_dir->d_inode, old_dentry, + new_dir->d_inode, new_dentry); +exit6: + mnt_drop_write(oldnd.path.mnt); +exit5: + dput(new_dentry); +exit4: + dput(old_dentry); +exit3: + unlock_rename(new_dir, old_dir); +exit2: + path_put(&newnd.path); + putname(to); +exit1: + path_put(&oldnd.path); + putname(from); +exit: + return error; +} + +SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) +{ + return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); +} + +int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +{ + int len; + + len = PTR_ERR(link); + if (IS_ERR(link)) + goto out; + + len = strlen(link); + if (len > (unsigned) buflen) + len = buflen; + if (copy_to_user(buffer, link, len)) + len = -EFAULT; +out: + return len; +} + +/* + * A helper for ->readlink(). This should be used *ONLY* for symlinks that + * have ->follow_link() touching nd only in nd_set_link(). Using (or not + * using) it for any given inode is up to filesystem. 
+ */ +int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct nameidata nd; + void *cookie; + int res; + + nd.depth = 0; + cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); + if (IS_ERR(cookie)) + return PTR_ERR(cookie); + + res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + if (dentry->d_inode->i_op->put_link) + dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + return res; +} + +int vfs_follow_link(struct nameidata *nd, const char *link) +{ + return __vfs_follow_link(nd, link); +} + +/* get the link contents into pagecache */ +static char *page_getlink(struct dentry * dentry, struct page **ppage) +{ + char *kaddr; + struct page *page; + struct address_space *mapping = dentry->d_inode->i_mapping; + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + return (char*)page; + *ppage = page; + kaddr = kmap(page); + nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); + return kaddr; +} + +int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct page *page = NULL; + char *s = page_getlink(dentry, &page); + int res = vfs_readlink(dentry,buffer,buflen,s); + if (page) { + kunmap(page); + page_cache_release(page); + } + return res; +} + +void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) +{ + struct page *page = NULL; + nd_set_link(nd, page_getlink(dentry, &page)); + return page; +} + +void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + struct page *page = cookie; + + if (page) { + kunmap(page); + page_cache_release(page); + } +} + +/* + * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS + */ +int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + int err; + char *kaddr; + unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; + if (nofs) + flags |= AOP_FLAG_NOFS; + +retry: + err = pagecache_write_begin(NULL, mapping, 0, len-1, + flags, &page, &fsdata); + if (err) + goto fail; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, symname, len-1); + kunmap_atomic(kaddr, KM_USER0); + + err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, + page, fsdata); + if (err < 0) + goto fail; + if (err < len-1) + goto retry; + + mark_inode_dirty(inode); + return 0; +fail: + return err; +} + +int page_symlink(struct inode *inode, const char *symname, int len) +{ + return __page_symlink(inode, symname, len, + !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); +} + +const struct inode_operations page_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; + +EXPORT_SYMBOL(user_path_at); +EXPORT_SYMBOL(follow_down_one); +EXPORT_SYMBOL(follow_down); +EXPORT_SYMBOL(follow_up); +EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ +EXPORT_SYMBOL(getname); +EXPORT_SYMBOL(lock_rename); +EXPORT_SYMBOL(lookup_one_len); +EXPORT_SYMBOL(page_follow_link_light); +EXPORT_SYMBOL(page_put_link); +EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(__page_symlink); +EXPORT_SYMBOL(page_symlink); +EXPORT_SYMBOL(page_symlink_inode_operations); +EXPORT_SYMBOL(kern_path_parent); +EXPORT_SYMBOL(kern_path); +EXPORT_SYMBOL(vfs_path_lookup); +EXPORT_SYMBOL(inode_permission); +EXPORT_SYMBOL(file_permission); +EXPORT_SYMBOL(unlock_rename); +EXPORT_SYMBOL(vfs_create); +EXPORT_SYMBOL(vfs_follow_link); +EXPORT_SYMBOL(vfs_link); +EXPORT_SYMBOL(vfs_mkdir); 
+EXPORT_SYMBOL(vfs_mknod); +EXPORT_SYMBOL(generic_permission); +EXPORT_SYMBOL(vfs_readlink); +EXPORT_SYMBOL(vfs_rename); +EXPORT_SYMBOL(vfs_rmdir); +EXPORT_SYMBOL(vfs_symlink); +EXPORT_SYMBOL(vfs_unlink); +EXPORT_SYMBOL(dentry_unhash); +EXPORT_SYMBOL(generic_readlink); diff --git a/fs/namespace.c b/fs/namespace.c new file mode 100644 index 00000000..b3d8f51c --- /dev/null +++ b/fs/namespace.c @@ -0,0 +1,2730 @@ +/* + * linux/fs/namespace.c + * + * (C) Copyright Al Viro 2000, 2001 + * Released under GPL v2. + * + * Based on code from fs/super.c, copyright Linus Torvalds and others. + * Heavily rewritten. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pnode.h" +#include "internal.h" + +#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) +#define HASH_SIZE (1UL << HASH_SHIFT) + +static int event; +static DEFINE_IDA(mnt_id_ida); +static DEFINE_IDA(mnt_group_ida); +static DEFINE_SPINLOCK(mnt_id_lock); +static int mnt_id_start = 0; +static int mnt_group_start = 1; + +static struct list_head *mount_hashtable __read_mostly; +static struct kmem_cache *mnt_cache __read_mostly; +static struct rw_semaphore namespace_sem; + +/* /sys/fs */ +struct kobject *fs_kobj; +EXPORT_SYMBOL_GPL(fs_kobj); + +/* + * vfsmount lock may be taken for read to prevent changes to the + * vfsmount hash, ie. during mountpoint lookups or walking back + * up the tree. + * + * It should be taken for write in all cases where the vfsmount + * tree or hash is modified or when a vfsmount structure is modified. + */ +DEFINE_BRLOCK(vfsmount_lock); + +static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); + tmp += ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> HASH_SHIFT); + return tmp & (HASH_SIZE - 1); +} + +#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) + +/* + * allocation is serialized by namespace_sem, but we need the spinlock to + * serialize with freeing. 
+ */ +static int mnt_alloc_id(struct vfsmount *mnt) +{ + int res; + +retry: + ida_pre_get(&mnt_id_ida, GFP_KERNEL); + spin_lock(&mnt_id_lock); + res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); + if (!res) + mnt_id_start = mnt->mnt_id + 1; + spin_unlock(&mnt_id_lock); + if (res == -EAGAIN) + goto retry; + + return res; +} + +static void mnt_free_id(struct vfsmount *mnt) +{ + int id = mnt->mnt_id; + spin_lock(&mnt_id_lock); + ida_remove(&mnt_id_ida, id); + if (mnt_id_start > id) + mnt_id_start = id; + spin_unlock(&mnt_id_lock); +} + +/* + * Allocate a new peer group ID + * + * mnt_group_ida is protected by namespace_sem + */ +static int mnt_alloc_group_id(struct vfsmount *mnt) +{ + int res; + + if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) + return -ENOMEM; + + res = ida_get_new_above(&mnt_group_ida, + mnt_group_start, + &mnt->mnt_group_id); + if (!res) + mnt_group_start = mnt->mnt_group_id + 1; + + return res; +} + +/* + * Release a peer group ID + */ +void mnt_release_group_id(struct vfsmount *mnt) +{ + int id = mnt->mnt_group_id; + ida_remove(&mnt_group_ida, id); + if (mnt_group_start > id) + mnt_group_start = id; + mnt->mnt_group_id = 0; +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_add_count(struct vfsmount *mnt, int n) +{ +#ifdef CONFIG_SMP + this_cpu_add(mnt->mnt_pcp->mnt_count, n); +#else + preempt_disable(); + mnt->mnt_count += n; + preempt_enable(); +#endif +} + +static inline void mnt_set_count(struct vfsmount *mnt, int n) +{ +#ifdef CONFIG_SMP + this_cpu_write(mnt->mnt_pcp->mnt_count, n); +#else + mnt->mnt_count = n; +#endif +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_inc_count(struct vfsmount *mnt) +{ + mnt_add_count(mnt, 1); +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_dec_count(struct vfsmount *mnt) +{ + mnt_add_count(mnt, -1); +} + +/* + * vfsmount lock must be held for write + */ +unsigned int mnt_get_count(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; + } + + return count; +#else + return mnt->mnt_count; +#endif +} + +static struct vfsmount *alloc_vfsmnt(const char *name) +{ + struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); + if (mnt) { + int err; + + err = mnt_alloc_id(mnt); + if (err) + goto out_free_cache; + + if (name) { + mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + if (!mnt->mnt_devname) + goto out_free_id; + } + +#ifdef CONFIG_SMP + mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); + if (!mnt->mnt_pcp) + goto out_free_devname; + + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); +#else + mnt->mnt_count = 1; + mnt->mnt_writers = 0; +#endif + + INIT_LIST_HEAD(&mnt->mnt_hash); + INIT_LIST_HEAD(&mnt->mnt_child); + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_expire); + INIT_LIST_HEAD(&mnt->mnt_share); + INIT_LIST_HEAD(&mnt->mnt_slave_list); + INIT_LIST_HEAD(&mnt->mnt_slave); +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); +#endif + } + return mnt; + +#ifdef CONFIG_SMP +out_free_devname: + kfree(mnt->mnt_devname); +#endif +out_free_id: + mnt_free_id(mnt); +out_free_cache: + kmem_cache_free(mnt_cache, mnt); + return NULL; +} + +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). 
+ * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/* + * __mnt_is_readonly: check whether a mount is read-only + * @mnt: the mount to check for its write status + * + * This shouldn't be used directly outside of the VFS. + * It does not guarantee that the filesystem will stay + * r/w, just that it is right *now*. This cannot and + * should not be used in place of IS_RDONLY(inode). + * mnt_want/drop_write() will _keep_ the filesystem + * r/w. + */ +int __mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_flags & MNT_READONLY) + return 1; + if (mnt->mnt_sb->s_flags & MS_RDONLY) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(__mnt_is_readonly); + +static inline void mnt_inc_writers(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_inc(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers++; +#endif +} + +static inline void mnt_dec_writers(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_dec(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers--; +#endif +} + +static unsigned int mnt_get_writers(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; + } + + return count; +#else + return mnt->mnt_writers; +#endif +} + +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/** + * mnt_want_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This tells the low-level filesystem that a write is + * about to be performed to it, and makes sure that + * writes are allowed before returning success. When + * the write operation is finished, mnt_drop_write() + * must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *mnt) +{ + int ret = 0; + + preempt_disable(); + mnt_inc_writers(mnt); + /* + * The store to mnt_inc_writers must be visible before we pass + * MNT_WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); + while (mnt->mnt_flags & MNT_WRITE_HOLD) + cpu_relax(); + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until + * MNT_WRITE_HOLD is cleared. + */ + smp_rmb(); + if (__mnt_is_readonly(mnt)) { + mnt_dec_writers(mnt); + ret = -EROFS; + goto out; + } +out: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(mnt_want_write); + +/** + * mnt_clone_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This is effectively like mnt_want_write, except + * it must only be used to take an extra write reference + * on a mountpoint that we already know has a write reference + * on it. This allows some optimisation. + * + * After finished, mnt_drop_write must be called as usual to + * drop the reference.
+ */ +int mnt_clone_write(struct vfsmount *mnt) +{ + /* superblock may be r/o */ + if (__mnt_is_readonly(mnt)) + return -EROFS; + preempt_disable(); + mnt_inc_writers(mnt); + preempt_enable(); + return 0; +} +EXPORT_SYMBOL_GPL(mnt_clone_write); + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file whose mount is to be written to + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + struct inode *inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) + return mnt_want_write(file->f_path.mnt); + else + return mnt_clone_write(file->f_path.mnt); +} +EXPORT_SYMBOL_GPL(mnt_want_write_file); + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done + * performing writes to it. Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ + preempt_disable(); + mnt_dec_writers(mnt); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(mnt_drop_write); + +static int mnt_make_readonly(struct vfsmount *mnt) +{ + int ret = 0; + + br_write_lock(vfsmount_lock); + mnt->mnt_flags |= MNT_WRITE_HOLD; + /* + * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * should be visible before we do. + */ + smp_mb(); + + /* + * With writers on hold, if this value is zero, then there are + * definitely no active writers (although held writers may subsequently + * increment the count, they'll have to wait, and decrement it after + * seeing MNT_READONLY). + * + * It is OK to have counter incremented on one CPU and decremented on + * another: the sum will add up correctly. The danger would be when we + * sum up each counter, if we read a counter before it is incremented, + * but then read another CPU's count which it has been subsequently + * decremented from -- we would see more decrements than we should. + * MNT_WRITE_HOLD protects against this scenario, because + * mnt_want_write first increments count, then smp_mb, then spins on + * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * we're counting up here. + */ + if (mnt_get_writers(mnt) > 0) + ret = -EBUSY; + else + mnt->mnt_flags |= MNT_READONLY; + /* + * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * that become unheld will see MNT_READONLY. + */ + smp_wmb(); + mnt->mnt_flags &= ~MNT_WRITE_HOLD; + br_write_unlock(vfsmount_lock); + return ret; +} + +static void __mnt_unmake_readonly(struct vfsmount *mnt) +{ + br_write_lock(vfsmount_lock); + mnt->mnt_flags &= ~MNT_READONLY; + br_write_unlock(vfsmount_lock); +} + +static void free_vfsmnt(struct vfsmount *mnt) +{ + kfree(mnt->mnt_devname); + mnt_free_id(mnt); +#ifdef CONFIG_SMP + free_percpu(mnt->mnt_pcp); +#endif + kmem_cache_free(mnt_cache, mnt); +} + +/* + * find the first or last mount at @dentry on vfsmount @mnt depending on + * @dir. If @dir is set return the first mount else return the last mount. + * vfsmount_lock must be held for read or write. + */ +struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, + int dir) +{ + struct list_head *head = mount_hashtable + hash(mnt, dentry); + struct list_head *tmp = head; + struct vfsmount *p, *found = NULL; + + for (;;) { + tmp = dir ?
tmp->next : tmp->prev; + p = NULL; + if (tmp == head) + break; + p = list_entry(tmp, struct vfsmount, mnt_hash); + if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { + found = p; + break; + } + } + return found; +} + +/* + * lookup_mnt increments the ref count before returning + * the vfsmount struct. + */ +struct vfsmount *lookup_mnt(struct path *path) +{ + struct vfsmount *child_mnt; + + br_read_lock(vfsmount_lock); + if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) + mntget(child_mnt); + br_read_unlock(vfsmount_lock); + return child_mnt; +} + +static inline int check_mnt(struct vfsmount *mnt) +{ + return mnt->mnt_ns == current->nsproxy->mnt_ns; +} + +/* + * vfsmount lock must be held for write + */ +static void touch_mnt_namespace(struct mnt_namespace *ns) +{ + if (ns) { + ns->event = ++event; + wake_up_interruptible(&ns->poll); + } +} + +/* + * vfsmount lock must be held for write + */ +static void __touch_mnt_namespace(struct mnt_namespace *ns) +{ + if (ns && ns->event != event) { + ns->event = event; + wake_up_interruptible(&ns->poll); + } +} + +/* + * Clear dentry's mounted state if it has no remaining mounts. + * vfsmount_lock must be held for write. + */ +static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry) +{ + unsigned u; + + for (u = 0; u < HASH_SIZE; u++) { + struct vfsmount *p; + + list_for_each_entry(p, &mount_hashtable[u], mnt_hash) { + if (p->mnt_mountpoint == dentry) + return; + } + } + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); +} + +/* + * vfsmount lock must be held for write + */ +static void detach_mnt(struct vfsmount *mnt, struct path *old_path) +{ + old_path->dentry = mnt->mnt_mountpoint; + old_path->mnt = mnt->mnt_parent; + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); + list_del_init(&mnt->mnt_hash); + dentry_reset_mounted(old_path->mnt, old_path->dentry); +} + +/* + * vfsmount lock must be held for write + */ +void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, + struct vfsmount *child_mnt) +{ + child_mnt->mnt_parent = mntget(mnt); + child_mnt->mnt_mountpoint = dget(dentry); + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); +} + +/* + * vfsmount lock must be held for write + */ +static void attach_mnt(struct vfsmount *mnt, struct path *path) +{ + mnt_set_mountpoint(path->mnt, path->dentry, mnt); + list_add_tail(&mnt->mnt_hash, mount_hashtable + + hash(path->mnt, path->dentry)); + list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); +} + +static inline void __mnt_make_longterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_inc(&mnt->mnt_longterm); +#endif +} + +/* needs vfsmount lock for write */ +static inline void __mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_dec(&mnt->mnt_longterm); +#endif +} + +/* + * vfsmount lock must be held for write + */ +static void commit_tree(struct vfsmount *mnt) +{ + struct vfsmount *parent = mnt->mnt_parent; + struct vfsmount *m; + LIST_HEAD(head); + struct mnt_namespace *n = parent->mnt_ns; + + BUG_ON(parent == mnt); + + list_add_tail(&head, &mnt->mnt_list); + list_for_each_entry(m, &head, mnt_list) { + m->mnt_ns = n; + __mnt_make_longterm(m); + } + + list_splice(&head, n->list.prev); + + list_add_tail(&mnt->mnt_hash, mount_hashtable + + hash(parent, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + touch_mnt_namespace(n); +} + +static struct vfsmount 
*next_mnt(struct vfsmount *p, struct vfsmount *root) +{ + struct list_head *next = p->mnt_mounts.next; + if (next == &p->mnt_mounts) { + while (1) { + if (p == root) + return NULL; + next = p->mnt_child.next; + if (next != &p->mnt_parent->mnt_mounts) + break; + p = p->mnt_parent; + } + } + return list_entry(next, struct vfsmount, mnt_child); +} + +static struct vfsmount *skip_mnt_tree(struct vfsmount *p) +{ + struct list_head *prev = p->mnt_mounts.prev; + while (prev != &p->mnt_mounts) { + p = list_entry(prev, struct vfsmount, mnt_child); + prev = p->mnt_mounts.prev; + } + return p; +} + +struct vfsmount * +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +{ + struct vfsmount *mnt; + struct dentry *root; + + if (!type) + return ERR_PTR(-ENODEV); + + mnt = alloc_vfsmnt(name); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (flags & MS_KERNMOUNT) + mnt->mnt_flags = MNT_INTERNAL; + + root = mount_fs(type, flags, name, data); + if (IS_ERR(root)) { + free_vfsmnt(mnt); + return ERR_CAST(root); + } + + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; + mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_parent = mnt; + return mnt; +} +EXPORT_SYMBOL_GPL(vfs_kern_mount); + +static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, + int flag) +{ + struct super_block *sb = old->mnt_sb; + struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); + + if (mnt) { + if (flag & (CL_SLAVE | CL_PRIVATE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; + + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + int err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; + } + + mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD; + atomic_inc(&sb->s_active); + mnt->mnt_sb = sb; + mnt->mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_parent = mnt; + + if (flag & CL_SLAVE) { + list_add(&mnt->mnt_slave, &old->mnt_slave_list); + mnt->mnt_master = old; + CLEAR_MNT_SHARED(mnt); + } else if (!(flag & CL_PRIVATE)) { + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) + list_add(&mnt->mnt_share, &old->mnt_share); + if (IS_MNT_SLAVE(old)) + list_add(&mnt->mnt_slave, &old->mnt_slave); + mnt->mnt_master = old->mnt_master; + } + if (flag & CL_MAKE_SHARED) + set_mnt_shared(mnt); + + /* stick the duplicate mount on the same expiry list + * as the original if that was on one */ + if (flag & CL_EXPIRE) { + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); + } + } + return mnt; + + out_free: + free_vfsmnt(mnt); + return NULL; +} + +static inline void mntfree(struct vfsmount *mnt) +{ + struct super_block *sb = mnt->mnt_sb; + + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + /* + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. 
+ */ + WARN_ON(mnt_get_writers(mnt)); + fsnotify_vfsmount_delete(mnt); + dput(mnt->mnt_root); + free_vfsmnt(mnt); + deactivate_super(sb); +} + +static void mntput_no_expire(struct vfsmount *mnt) +{ +put_again: +#ifdef CONFIG_SMP + br_read_lock(vfsmount_lock); + if (likely(atomic_read(&mnt->mnt_longterm))) { + mnt_dec_count(mnt); + br_read_unlock(vfsmount_lock); + return; + } + br_read_unlock(vfsmount_lock); + + br_write_lock(vfsmount_lock); + mnt_dec_count(mnt); + if (mnt_get_count(mnt)) { + br_write_unlock(vfsmount_lock); + return; + } +#else + mnt_dec_count(mnt); + if (likely(mnt_get_count(mnt))) + return; + br_write_lock(vfsmount_lock); +#endif + if (unlikely(mnt->mnt_pinned)) { + mnt_add_count(mnt, mnt->mnt_pinned + 1); + mnt->mnt_pinned = 0; + br_write_unlock(vfsmount_lock); + acct_auto_close_mnt(mnt); + goto put_again; + } + br_write_unlock(vfsmount_lock); + mntfree(mnt); +} + +void mntput(struct vfsmount *mnt) +{ + if (mnt) { + /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ + if (unlikely(mnt->mnt_expiry_mark)) + mnt->mnt_expiry_mark = 0; + mntput_no_expire(mnt); + } +} +EXPORT_SYMBOL(mntput); + +struct vfsmount *mntget(struct vfsmount *mnt) +{ + if (mnt) + mnt_inc_count(mnt); + return mnt; +} +EXPORT_SYMBOL(mntget); + +void mnt_pin(struct vfsmount *mnt) +{ + br_write_lock(vfsmount_lock); + mnt->mnt_pinned++; + br_write_unlock(vfsmount_lock); +} +EXPORT_SYMBOL(mnt_pin); + +void mnt_unpin(struct vfsmount *mnt) +{ + br_write_lock(vfsmount_lock); + if (mnt->mnt_pinned) { + mnt_inc_count(mnt); + mnt->mnt_pinned--; + } + br_write_unlock(vfsmount_lock); +} +EXPORT_SYMBOL(mnt_unpin); + +static inline void mangle(struct seq_file *m, const char *s) +{ + seq_escape(m, s, " \t\n\\"); +} + +/* + * Simple .show_options callback for filesystems which don't want to + * implement more complex mount option showing. + * + * See also save_mount_options(). + */ +int generic_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + const char *options; + + rcu_read_lock(); + options = rcu_dereference(mnt->mnt_sb->s_options); + + if (options != NULL && options[0]) { + seq_putc(m, ','); + mangle(m, options); + } + rcu_read_unlock(); + + return 0; +} +EXPORT_SYMBOL(generic_show_options); + +/* + * If filesystem uses generic_show_options(), this function should be + * called from the fill_super() callback. + * + * The .remount_fs callback usually needs to be handled in a special + * way, to make sure, that previous options are not overwritten if the + * remount fails. + * + * Also note, that if the filesystem's .remount_fs function doesn't + * reset all options to their default value, but changes only newly + * given options, then the displayed options will not reflect reality + * any more. 
+ */ +void save_mount_options(struct super_block *sb, char *options) +{ + BUG_ON(sb->s_options); + rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); +} +EXPORT_SYMBOL(save_mount_options); + +void replace_mount_options(struct super_block *sb, char *options) +{ + char *old = sb->s_options; + rcu_assign_pointer(sb->s_options, options); + if (old) { + synchronize_rcu(); + kfree(old); + } +} +EXPORT_SYMBOL(replace_mount_options); + +#ifdef CONFIG_PROC_FS +/* iterator */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_mounts *p = m->private; + + down_read(&namespace_sem); + return seq_list_start(&p->ns->list, *pos); +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_mounts *p = m->private; + + return seq_list_next(v, &p->ns->list, pos); +} + +static void m_stop(struct seq_file *m, void *v) +{ + up_read(&namespace_sem); +} + +int mnt_had_events(struct proc_mounts *p) +{ + struct mnt_namespace *ns = p->ns; + int res = 0; + + br_read_lock(vfsmount_lock); + if (p->event != ns->event) { + p->event = ns->event; + res = 1; + } + br_read_unlock(vfsmount_lock); + + return res; +} + +struct proc_fs_info { + int flag; + const char *str; +}; + +static int show_sb_opts(struct seq_file *m, struct super_block *sb) +{ + static const struct proc_fs_info fs_info[] = { + { MS_SYNCHRONOUS, ",sync" }, + { MS_DIRSYNC, ",dirsync" }, + { MS_MANDLOCK, ",mand" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { + if (sb->s_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } + + return security_sb_show_options(m, sb); +} + +static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) +{ + static const struct proc_fs_info mnt_info[] = { + { MNT_NOSUID, ",nosuid" }, + { MNT_NODEV, ",nodev" }, + { MNT_NOEXEC, ",noexec" }, + { MNT_NOATIME, ",noatime" }, + { MNT_NODIRATIME, ",nodiratime" }, + { MNT_RELATIME, ",relatime" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { + if (mnt->mnt_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } +} + +static void show_type(struct seq_file *m, struct super_block *sb) +{ + mangle(m, sb->s_type->name); + if (sb->s_subtype && sb->s_subtype[0]) { + seq_putc(m, '.'); + mangle(m, sb->s_subtype); + } +} + +static int show_vfsmnt(struct seq_file *m, void *v) +{ + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + int err = 0; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + + if (mnt->mnt_sb->s_op->show_devname) { + err = mnt->mnt_sb->s_op->show_devname(m, mnt); + if (err) + goto out; + } else { + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + } + seq_putc(m, ' '); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); + show_type(m, mnt->mnt_sb); + seq_puts(m, __mnt_is_readonly(mnt) ? 
" ro" : " rw"); + err = show_sb_opts(m, mnt->mnt_sb); + if (err) + goto out; + show_mnt_opts(m, mnt); + if (mnt->mnt_sb->s_op->show_options) + err = mnt->mnt_sb->s_op->show_options(m, mnt); + seq_puts(m, " 0 0\n"); +out: + return err; +} + +const struct seq_operations mounts_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_vfsmnt +}; + +static int show_mountinfo(struct seq_file *m, void *v) +{ + struct proc_mounts *p = m->private; + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + struct super_block *sb = mnt->mnt_sb; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct path root = p->root; + int err = 0; + + seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + if (sb->s_op->show_path) + err = sb->s_op->show_path(m, mnt); + else + seq_dentry(m, mnt->mnt_root, " \t\n\\"); + if (err) + goto out; + seq_putc(m, ' '); + + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); + if (err) + goto out; + + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); + show_mnt_opts(m, mnt); + + /* Tagged fields ("foo:X" or "bar") */ + if (IS_MNT_SHARED(mnt)) + seq_printf(m, " shared:%i", mnt->mnt_group_id); + if (IS_MNT_SLAVE(mnt)) { + int master = mnt->mnt_master->mnt_group_id; + int dom = get_dominating_id(mnt, &p->root); + seq_printf(m, " master:%i", master); + if (dom && dom != master) + seq_printf(m, " propagate_from:%i", dom); + } + if (IS_MNT_UNBINDABLE(mnt)) + seq_puts(m, " unbindable"); + + /* Filesystem specific data */ + seq_puts(m, " - "); + show_type(m, sb); + seq_putc(m, ' '); + if (sb->s_op->show_devname) + err = sb->s_op->show_devname(m, mnt); + else + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + if (err) + goto out; + seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt); + seq_putc(m, '\n'); +out: + return err; +} + +const struct seq_operations mountinfo_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_mountinfo, +}; + +static int show_vfsstat(struct seq_file *m, void *v) +{ + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + int err = 0; + + /* device */ + if (mnt->mnt_sb->s_op->show_devname) { + seq_puts(m, "device "); + err = mnt->mnt_sb->s_op->show_devname(m, mnt); + } else { + if (mnt->mnt_devname) { + seq_puts(m, "device "); + mangle(m, mnt->mnt_devname); + } else + seq_puts(m, "no device"); + } + + /* mount point */ + seq_puts(m, " mounted on "); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); + + /* file system type */ + seq_puts(m, "with fstype "); + show_type(m, mnt->mnt_sb); + + /* optional statistics */ + if (mnt->mnt_sb->s_op->show_stats) { + seq_putc(m, ' '); + if (!err) + err = mnt->mnt_sb->s_op->show_stats(m, mnt); + } + + seq_putc(m, '\n'); + return err; +} + +const struct seq_operations mountstats_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_vfsstat, +}; +#endif /* CONFIG_PROC_FS */ + +/** + * may_umount_tree - check if a mount tree is busy + * @mnt: root of mount tree + * + * This is called to check if a tree of mounts has any + * open files, pwds, chroots or sub mounts that are + * busy. 
+ */ +int may_umount_tree(struct vfsmount *mnt) +{ + int actual_refs = 0; + int minimum_refs = 0; + struct vfsmount *p; + + /* write lock needed for mnt_get_count */ + br_write_lock(vfsmount_lock); + for (p = mnt; p; p = next_mnt(p, mnt)) { + actual_refs += mnt_get_count(p); + minimum_refs += 2; + } + br_write_unlock(vfsmount_lock); + + if (actual_refs > minimum_refs) + return 0; + + return 1; +} + +EXPORT_SYMBOL(may_umount_tree); + +/** + * may_umount - check if a mount point is busy + * @mnt: root of mount + * + * This is called to check if a mount point has any + * open files, pwds, chroots or sub mounts. If the + * mount has sub mounts this will return busy + * regardless of whether the sub mounts are busy. + * + * Doesn't take quota and stuff into account. IOW, in some cases it will + * give false negatives. The main reason why it's here is that we need + * a non-destructive way to look for easily umountable filesystems. + */ +int may_umount(struct vfsmount *mnt) +{ + int ret = 1; + down_read(&namespace_sem); + br_write_lock(vfsmount_lock); + if (propagate_mount_busy(mnt, 2)) + ret = 0; + br_write_unlock(vfsmount_lock); + up_read(&namespace_sem); + return ret; +} + +EXPORT_SYMBOL(may_umount); + +void release_mounts(struct list_head *head) +{ + struct vfsmount *mnt; + while (!list_empty(head)) { + mnt = list_first_entry(head, struct vfsmount, mnt_hash); + list_del_init(&mnt->mnt_hash); + if (mnt->mnt_parent != mnt) { + struct dentry *dentry; + struct vfsmount *m; + + br_write_lock(vfsmount_lock); + dentry = mnt->mnt_mountpoint; + m = mnt->mnt_parent; + mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_parent = mnt; + m->mnt_ghosts--; + br_write_unlock(vfsmount_lock); + dput(dentry); + mntput(m); + } + mntput(mnt); + } +} + +/* + * vfsmount lock must be held for write + * namespace_sem must be held for write + */ +void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) +{ + LIST_HEAD(tmp_list); + struct vfsmount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) + list_move(&p->mnt_hash, &tmp_list); + + if (propagate) + propagate_umount(&tmp_list); + + list_for_each_entry(p, &tmp_list, mnt_hash) { + list_del_init(&p->mnt_expire); + list_del_init(&p->mnt_list); + __touch_mnt_namespace(p->mnt_ns); + if (p->mnt_ns) + __mnt_make_shortterm(p); + p->mnt_ns = NULL; + list_del_init(&p->mnt_child); + if (p->mnt_parent != p) { + p->mnt_parent->mnt_ghosts++; + dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint); + } + change_mnt_propagation(p, MS_PRIVATE); + } + list_splice(&tmp_list, kill); +} + +static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); + +static int do_umount(struct vfsmount *mnt, int flags) +{ + struct super_block *sb = mnt->mnt_sb; + int retval; + LIST_HEAD(umount_list); + + retval = security_sb_umount(mnt, flags); + if (retval) + return retval; + + /* + * Allow userspace to request a mountpoint be expired rather than + * unmounting unconditionally. Unmount only happens if: + * (1) the mark is already set (the mark is cleared by mntput()) + * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] + */ + if (flags & MNT_EXPIRE) { + if (mnt == current->fs->root.mnt || + flags & (MNT_FORCE | MNT_DETACH)) + return -EINVAL; + + /* + * probably don't strictly need the lock here if we examined + * all race cases, but it's a slowpath. 
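+ *
+ * Editor's note: illustrative sketch, not part of this patch; the path is
+ * hypothetical. Userspace drives this expiry protocol with two
+ * umount2() calls:
+ *
+ *	#include <sys/mount.h>
+ *	#include <errno.h>
+ *
+ *	if (umount2("/mnt/auto", MNT_EXPIRE) == -1 && errno == EAGAIN) {
+ *		// the first call only set the expiry mark; a second call
+ *		// unmounts the mount if it has stayed unused in between
+ *		umount2("/mnt/auto", MNT_EXPIRE);
+ *	}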
+ */ + br_write_lock(vfsmount_lock); + if (mnt_get_count(mnt) != 2) { + br_write_unlock(vfsmount_lock); + return -EBUSY; + } + br_write_unlock(vfsmount_lock); + + if (!xchg(&mnt->mnt_expiry_mark, 1)) + return -EAGAIN; + } + + /* + * If we may have to abort operations to get out of this + * mount, and they will themselves hold resources we must + * allow the fs to do things. In the Unix tradition of + * 'Gee thats tricky lets do it in userspace' the umount_begin + * might fail to complete on the first run through as other tasks + * must return, and the like. Thats for the mount program to worry + * about for the moment. + */ + + if (flags & MNT_FORCE && sb->s_op->umount_begin) { + sb->s_op->umount_begin(sb); + } + + /* + * No sense to grab the lock for this test, but test itself looks + * somewhat bogus. Suggestions for better replacement? + * Ho-hum... In principle, we might treat that as umount + switch + * to rootfs. GC would eventually take care of the old vfsmount. + * Actually it makes sense, especially if rootfs would contain a + * /reboot - static binary that would close all descriptors and + * call reboot(9). Then init(8) could umount root and exec /reboot. + */ + if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { + /* + * Special case for "unmounting" root ... + * we just try to remount it readonly. + */ + down_write(&sb->s_umount); + if (!(sb->s_flags & MS_RDONLY)) + retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); + up_write(&sb->s_umount); + return retval; + } + + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + event++; + + if (!(flags & MNT_DETACH)) + shrink_submounts(mnt, &umount_list); + + retval = -EBUSY; + if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, 1, &umount_list); + retval = 0; + } + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); + return retval; +} + +/* + * Now umount can handle mount points as well as block devices. + * This is important for filesystems which use unnamed block devices. + * + * We now support a flag for forced unmount like the other 'big iron' + * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD + */ + +SYSCALL_DEFINE2(umount, char __user *, name, int, flags) +{ + struct path path; + int retval; + int lookup_flags = 0; + + if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) + return -EINVAL; + + if (!(flags & UMOUNT_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); + if (retval) + goto out; + retval = -EINVAL; + if (path.dentry != path.mnt->mnt_root) + goto dput_and_out; + if (!check_mnt(path.mnt)) + goto dput_and_out; + + retval = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto dput_and_out; + + retval = do_umount(path.mnt, flags); +dput_and_out: + /* we mustn't call path_put() as that would clear mnt_expiry_mark */ + dput(path.dentry); + mntput_no_expire(path.mnt); +out: + return retval; +} + +#ifdef __ARCH_WANT_SYS_OLDUMOUNT + +/* + * The 2.0 compatible umount. No flags. 
+ */ +SYSCALL_DEFINE1(oldumount, char __user *, name) +{ + return sys_umount(name, 0); +} + +#endif + +static int mount_is_safe(struct path *path) +{ + if (capable(CAP_SYS_ADMIN)) + return 0; + return -EPERM; +#ifdef notyet + if (S_ISLNK(path->dentry->d_inode->i_mode)) + return -EPERM; + if (path->dentry->d_inode->i_mode & S_ISVTX) { + if (current_uid() != path->dentry->d_inode->i_uid) + return -EPERM; + } + if (inode_permission(path->dentry->d_inode, MAY_WRITE)) + return -EPERM; + return 0; +#endif +} + +struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, + int flag) +{ + struct vfsmount *res, *p, *q, *r, *s; + struct path path; + + if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) + return NULL; + + res = q = clone_mnt(mnt, dentry, flag); + if (!q) + goto Enomem; + q->mnt_mountpoint = mnt->mnt_mountpoint; + + p = mnt; + list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(r->mnt_mountpoint, dentry)) + continue; + + for (s = r; s; s = next_mnt(s, r)) { + if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { + s = skip_mnt_tree(s); + continue; + } + while (p != s->mnt_parent) { + p = p->mnt_parent; + q = q->mnt_parent; + } + p = s; + path.mnt = q; + path.dentry = p->mnt_mountpoint; + q = clone_mnt(p, p->mnt_root, flag); + if (!q) + goto Enomem; + br_write_lock(vfsmount_lock); + list_add_tail(&q->mnt_list, &res->mnt_list); + attach_mnt(q, &path); + br_write_unlock(vfsmount_lock); + } + } + return res; +Enomem: + if (res) { + LIST_HEAD(umount_list); + br_write_lock(vfsmount_lock); + umount_tree(res, 0, &umount_list); + br_write_unlock(vfsmount_lock); + release_mounts(&umount_list); + } + return NULL; +} + +struct vfsmount *collect_mounts(struct path *path) +{ + struct vfsmount *tree; + down_write(&namespace_sem); + tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); + up_write(&namespace_sem); + return tree; +} + +void drop_collected_mounts(struct vfsmount *mnt) +{ + LIST_HEAD(umount_list); + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + umount_tree(mnt, 0, &umount_list); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); +} + +int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + struct vfsmount *root) +{ + struct vfsmount *mnt; + int res = f(root, arg); + if (res) + return res; + list_for_each_entry(mnt, &root->mnt_list, mnt_list) { + res = f(mnt, arg); + if (res) + return res; + } + return 0; +} + +static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) +{ + struct vfsmount *p; + + for (p = mnt; p != end; p = next_mnt(p, mnt)) { + if (p->mnt_group_id && !IS_MNT_SHARED(p)) + mnt_release_group_id(p); + } +} + +static int invent_group_ids(struct vfsmount *mnt, bool recurse) +{ + struct vfsmount *p; + + for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { + if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + int err = mnt_alloc_group_id(p); + if (err) { + cleanup_group_ids(mnt, p); + return err; + } + } + } + + return 0; +} + +/* + * @source_mnt : mount tree to be attached + * @nd : place the mount tree @source_mnt is attached + * @parent_nd : if non-null, detach the source_mnt from its parent and + * store the parent mount and mountpoint dentry. + * (done when source_mnt is moved) + * + * NOTE: in the table below explains the semantics when a source mount + * of a given type is attached to a destination mount of a given type. 
+ * --------------------------------------------------------------------------- + * | BIND MOUNT OPERATION | + * |************************************************************************** + * | source-->| shared | private | slave | unbindable | + * | dest | | | | | + * | | | | | | | + * | v | | | | | + * |************************************************************************** + * | shared | shared (++) | shared (+) | shared(+++)| invalid | + * | | | | | | + * |non-shared| shared (+) | private | slave (*) | invalid | + * *************************************************************************** + * A bind operation clones the source mount and mounts the clone on the + * destination mount. + * + * (++) the cloned mount is propagated to all the mounts in the propagation + * tree of the destination mount and the cloned mount is added to + * the peer group of the source mount. + * (+) the cloned mount is created under the destination mount and is marked + * as shared. The cloned mount is added to the peer group of the source + * mount. + * (+++) the mount is propagated to all the mounts in the propagation tree + * of the destination mount and the cloned mount is made slave + * of the same master as that of the source mount. The cloned mount + * is marked as 'shared and slave'. + * (*) the cloned mount is made a slave of the same master as that of the + * source mount. + * + * --------------------------------------------------------------------------- + * | MOVE MOUNT OPERATION | + * |************************************************************************** + * | source-->| shared | private | slave | unbindable | + * | dest | | | | | + * | | | | | | | + * | v | | | | | + * |************************************************************************** + * | shared | shared (+) | shared (+) | shared(+++) | invalid | + * | | | | | | + * |non-shared| shared (+*) | private | slave (*) | unbindable | + * *************************************************************************** + * + * (+) the mount is moved to the destination. And is then propagated to + * all the mounts in the propagation tree of the destination mount. + * (+*) the mount is moved to the destination. + * (+++) the mount is moved to the destination and is then propagated to + * all the mounts belonging to the destination mount's propagation tree. + * the mount is marked as 'shared and slave'. + * (*) the mount continues to be a slave at the new location. + * + * if the source mount is a tree, the operations explained above is + * applied to each mount in the tree. + * Must be called without spinlocks held, since this function can sleep + * in allocations. 
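+ *
+ * Editor's note: illustrative sketch, not part of this patch; the paths
+ * are hypothetical and each call is an independent example, not a
+ * sequence. The source/destination types in the tables above are what
+ * userspace establishes through the propagation flags of mount(2):
+ *
+ *	#include <sys/mount.h>
+ *
+ *	mount(NULL, "/dst", NULL, MS_SHARED, NULL);      // destination: shared
+ *	mount(NULL, "/dst", NULL, MS_PRIVATE, NULL);     // destination: private
+ *	mount(NULL, "/src", NULL, MS_SLAVE, NULL);       // source: slave
+ *	mount(NULL, "/src", NULL, MS_UNBINDABLE, NULL);  // source: unbindable
+ *	mount("/src", "/dst", NULL, MS_BIND, NULL);      // BIND MOUNT OPERATION
+ *	mount("/src", "/dst", NULL, MS_MOVE, NULL);      // MOVE MOUNT OPERATION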
+ */ +static int attach_recursive_mnt(struct vfsmount *source_mnt, + struct path *path, struct path *parent_path) +{ + LIST_HEAD(tree_list); + struct vfsmount *dest_mnt = path->mnt; + struct dentry *dest_dentry = path->dentry; + struct vfsmount *child, *p; + int err; + + if (IS_MNT_SHARED(dest_mnt)) { + err = invent_group_ids(source_mnt, true); + if (err) + goto out; + } + err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); + if (err) + goto out_cleanup_ids; + + br_write_lock(vfsmount_lock); + + if (IS_MNT_SHARED(dest_mnt)) { + for (p = source_mnt; p; p = next_mnt(p, source_mnt)) + set_mnt_shared(p); + } + if (parent_path) { + detach_mnt(source_mnt, parent_path); + attach_mnt(source_mnt, path); + touch_mnt_namespace(parent_path->mnt->mnt_ns); + } else { + mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); + commit_tree(source_mnt); + } + + list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { + list_del_init(&child->mnt_hash); + commit_tree(child); + } + br_write_unlock(vfsmount_lock); + + return 0; + + out_cleanup_ids: + if (IS_MNT_SHARED(dest_mnt)) + cleanup_group_ids(source_mnt, NULL); + out: + return err; +} + +static int lock_mount(struct path *path) +{ + struct vfsmount *mnt; +retry: + mutex_lock(&path->dentry->d_inode->i_mutex); + if (unlikely(cant_mount(path->dentry))) { + mutex_unlock(&path->dentry->d_inode->i_mutex); + return -ENOENT; + } + down_write(&namespace_sem); + mnt = lookup_mnt(path); + if (likely(!mnt)) + return 0; + up_write(&namespace_sem); + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + goto retry; +} + +static void unlock_mount(struct path *path) +{ + up_write(&namespace_sem); + mutex_unlock(&path->dentry->d_inode->i_mutex); +} + +static int graft_tree(struct vfsmount *mnt, struct path *path) +{ + if (mnt->mnt_sb->s_flags & MS_NOUSER) + return -EINVAL; + + if (S_ISDIR(path->dentry->d_inode->i_mode) != + S_ISDIR(mnt->mnt_root->d_inode->i_mode)) + return -ENOTDIR; + + if (d_unlinked(path->dentry)) + return -ENOENT; + + return attach_recursive_mnt(mnt, path, NULL); +} + +/* + * Sanity check the flags to change_mnt_propagation. + */ + +static int flags_to_propagation_type(int flags) +{ + int type = flags & ~(MS_REC | MS_SILENT); + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; +} + +/* + * recursively change the type of the mountpoint. + */ +static int do_change_type(struct path *path, int flag) +{ + struct vfsmount *m, *mnt = path->mnt; + int recurse = flag & MS_REC; + int type; + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + + type = flags_to_propagation_type(flag); + if (!type) + return -EINVAL; + + down_write(&namespace_sem); + if (type == MS_SHARED) { + err = invent_group_ids(mnt, recurse); + if (err) + goto out_unlock; + } + + br_write_lock(vfsmount_lock); + for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) + change_mnt_propagation(m, type); + br_write_unlock(vfsmount_lock); + + out_unlock: + up_write(&namespace_sem); + return err; +} + +/* + * do loopback mount. 
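+ *
+ * Editor's note: illustrative sketch, not part of this patch; paths are
+ * hypothetical. This is the kernel side of a bind mount, i.e. of the
+ * userspace requests
+ *
+ *	mount("/olddir", "/newdir", NULL, MS_BIND, NULL);           // one mount
+ *	mount("/olddir", "/newdir", NULL, MS_BIND | MS_REC, NULL);  // whole tree
+ *
+ * where MS_REC selects the recursive copy_tree() case handled below.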
+ */ +static int do_loopback(struct path *path, char *old_name, + int recurse) +{ + LIST_HEAD(umount_list); + struct path old_path; + struct vfsmount *mnt = NULL; + int err = mount_is_safe(path); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; + err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); + if (err) + return err; + + err = lock_mount(path); + if (err) + goto out; + + err = -EINVAL; + if (IS_MNT_UNBINDABLE(old_path.mnt)) + goto out2; + + if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + goto out2; + + err = -ENOMEM; + if (recurse) + mnt = copy_tree(old_path.mnt, old_path.dentry, 0); + else + mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); + + if (!mnt) + goto out2; + + err = graft_tree(mnt, path); + if (err) { + br_write_lock(vfsmount_lock); + umount_tree(mnt, 0, &umount_list); + br_write_unlock(vfsmount_lock); + } +out2: + unlock_mount(path); + release_mounts(&umount_list); +out: + path_put(&old_path); + return err; +} + +static int change_mount_flags(struct vfsmount *mnt, int ms_flags) +{ + int error = 0; + int readonly_request = 0; + + if (ms_flags & MS_RDONLY) + readonly_request = 1; + if (readonly_request == __mnt_is_readonly(mnt)) + return 0; + + if (readonly_request) + error = mnt_make_readonly(mnt); + else + __mnt_unmake_readonly(mnt); + return error; +} + +/* + * change filesystem flags. dir should be a physical root of filesystem. + * If you've mounted a non-root directory somewhere and want to do remount + * on it - tough luck. + */ +static int do_remount(struct path *path, int flags, int mnt_flags, + void *data) +{ + int err; + struct super_block *sb = path->mnt->mnt_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!check_mnt(path->mnt)) + return -EINVAL; + + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + + err = security_sb_remount(sb, data); + if (err) + return err; + + down_write(&sb->s_umount); + if (flags & MS_BIND) + err = change_mount_flags(path->mnt, flags); + else + err = do_remount_sb(sb, flags, data, 0); + if (!err) { + br_write_lock(vfsmount_lock); + mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; + path->mnt->mnt_flags = mnt_flags; + br_write_unlock(vfsmount_lock); + } + up_write(&sb->s_umount); + if (!err) { + br_write_lock(vfsmount_lock); + touch_mnt_namespace(path->mnt->mnt_ns); + br_write_unlock(vfsmount_lock); + } + return err; +} + +static inline int tree_contains_unbindable(struct vfsmount *mnt) +{ + struct vfsmount *p; + for (p = mnt; p; p = next_mnt(p, mnt)) { + if (IS_MNT_UNBINDABLE(p)) + return 1; + } + return 0; +} + +static int do_move_mount(struct path *path, char *old_name) +{ + struct path old_path, parent_path; + struct vfsmount *p; + int err = 0; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); + if (err) + return err; + + err = lock_mount(path); + if (err < 0) + goto out; + + err = -EINVAL; + if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + goto out1; + + if (d_unlinked(path->dentry)) + goto out1; + + err = -EINVAL; + if (old_path.dentry != old_path.mnt->mnt_root) + goto out1; + + if (old_path.mnt == old_path.mnt->mnt_parent) + goto out1; + + if (S_ISDIR(path->dentry->d_inode->i_mode) != + S_ISDIR(old_path.dentry->d_inode->i_mode)) + goto out1; + /* + * Don't move a mount residing in a shared parent. 
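+ *
+ * Editor's note: illustrative sketch, not part of this patch; paths are
+ * hypothetical. The request being handled here is
+ *
+ *	mount("/old/mountpoint", "/new/mountpoint", NULL, MS_MOVE, NULL);
+ *
+ * Roughly speaking, moving out of a shared parent is refused because
+ * copies of this mount may already have propagated to the parent's
+ * peers and could not be moved along with it.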
+ */ + if (old_path.mnt->mnt_parent && + IS_MNT_SHARED(old_path.mnt->mnt_parent)) + goto out1; + /* + * Don't move a mount tree containing unbindable mounts to a destination + * mount which is shared. + */ + if (IS_MNT_SHARED(path->mnt) && + tree_contains_unbindable(old_path.mnt)) + goto out1; + err = -ELOOP; + for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent) + if (p == old_path.mnt) + goto out1; + + err = attach_recursive_mnt(old_path.mnt, path, &parent_path); + if (err) + goto out1; + + /* if the mount is moved, it should no longer be expire + * automatically */ + list_del_init(&old_path.mnt->mnt_expire); +out1: + unlock_mount(path); +out: + if (!err) + path_put(&parent_path); + path_put(&old_path); + return err; +} + +static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) +{ + int err; + const char *subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + err = -EINVAL; + if (!subtype[0]) + goto err; + } else + subtype = ""; + + mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); + err = -ENOMEM; + if (!mnt->mnt_sb->s_subtype) + goto err; + return mnt; + + err: + mntput(mnt); + return ERR_PTR(err); +} + +struct vfsmount * +do_kern_mount(const char *fstype, int flags, const char *name, void *data) +{ + struct file_system_type *type = get_fs_type(fstype); + struct vfsmount *mnt; + if (!type) + return ERR_PTR(-ENODEV); + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + put_filesystem(type); + return mnt; +} +EXPORT_SYMBOL_GPL(do_kern_mount); + +/* + * add a mount into a namespace's mount tree + */ +static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) +{ + int err; + + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); + + err = lock_mount(path); + if (err) + return err; + + err = -EINVAL; + if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) + goto unlock; + + /* Refuse the same filesystem on the same mount point */ + err = -EBUSY; + if (path->mnt->mnt_sb == newmnt->mnt_sb && + path->mnt->mnt_root == path->dentry) + goto unlock; + + err = -EINVAL; + if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) + goto unlock; + + newmnt->mnt_flags = mnt_flags; + err = graft_tree(newmnt, path); + +unlock: + unlock_mount(path); + return err; +} + +/* + * create a new mount for userspace and request it to be added into the + * namespace's tree + */ +static int do_new_mount(struct path *path, char *type, int flags, + int mnt_flags, char *name, void *data) +{ + struct vfsmount *mnt; + int err; + + if (!type) + return -EINVAL; + + /* we need capabilities... 
*/ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mnt = do_kern_mount(type, flags, name, data); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + err = do_add_mount(mnt, path, mnt_flags); + if (err) + mntput(mnt); + return err; +} + +int finish_automount(struct vfsmount *m, struct path *path) +{ + int err; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(m) < 2); + + if (m->mnt_sb == path->mnt->mnt_sb && + m->mnt_root == path->dentry) { + err = -ELOOP; + goto fail; + } + + err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (!err) + return 0; +fail: + /* remove m from any expiration list it may be on */ + if (!list_empty(&m->mnt_expire)) { + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + list_del_init(&m->mnt_expire); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + } + mntput(m); + mntput(m); + return err; +} + +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. + */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + + list_add_tail(&mnt->mnt_expire, expiry_list); + + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); +} +EXPORT_SYMBOL(mnt_set_expiry); + +/* + * process a list of expirable mountpoints with the intent of discarding any + * mountpoints that aren't in use and haven't been touched since last we came + * here + */ +void mark_mounts_for_expiry(struct list_head *mounts) +{ + struct vfsmount *mnt, *next; + LIST_HEAD(graveyard); + LIST_HEAD(umounts); + + if (list_empty(mounts)) + return; + + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + + /* extract from the expiration list every vfsmount that matches the + * following criteria: + * - only referenced by its parent vfsmount + * - still marked for expiry (marked on the last call here; marks are + * cleared by mntput()) + */ + list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { + if (!xchg(&mnt->mnt_expiry_mark, 1) || + propagate_mount_busy(mnt, 1)) + continue; + list_move(&mnt->mnt_expire, &graveyard); + } + while (!list_empty(&graveyard)) { + mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire); + touch_mnt_namespace(mnt->mnt_ns); + umount_tree(mnt, 1, &umounts); + } + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + + release_mounts(&umounts); +} + +EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); + +/* + * Ripoff of 'select_parent()' + * + * search the list of submounts for a given mountpoint, and move any + * shrinkable submounts to the 'graveyard' list. + */ +static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) +{ + struct vfsmount *this_parent = parent; + struct list_head *next; + int found = 0; + +repeat: + next = this_parent->mnt_mounts.next; +resume: + while (next != &this_parent->mnt_mounts) { + struct list_head *tmp = next; + struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); + + next = tmp->next; + if (!(mnt->mnt_flags & MNT_SHRINKABLE)) + continue; + /* + * Descend a level if the d_mounts list is non-empty. + */ + if (!list_empty(&mnt->mnt_mounts)) { + this_parent = mnt; + goto repeat; + } + + if (!propagate_mount_busy(mnt, 1)) { + list_move_tail(&mnt->mnt_expire, graveyard); + found++; + } + } + /* + * All done at this level ... 
ascend and resume the search + */ + if (this_parent != parent) { + next = this_parent->mnt_child.next; + this_parent = this_parent->mnt_parent; + goto resume; + } + return found; +} + +/* + * process a list of expirable mountpoints with the intent of discarding any + * submounts of a specific parent mountpoint + * + * vfsmount_lock must be held for write + */ +static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) +{ + LIST_HEAD(graveyard); + struct vfsmount *m; + + /* extract submounts of 'mountpoint' from the expiration list */ + while (select_submounts(mnt, &graveyard)) { + while (!list_empty(&graveyard)) { + m = list_first_entry(&graveyard, struct vfsmount, + mnt_expire); + touch_mnt_namespace(m->mnt_ns); + umount_tree(m, 1, umounts); + } + } +} + +/* + * Some copy_from_user() implementations do not return the exact number of + * bytes remaining to copy on a fault. But copy_mount_options() requires that. + * Note that this function differs from copy_from_user() in that it will oops + * on bad values of `to', rather than returning a short copy. + */ +static long exact_copy_from_user(void *to, const void __user * from, + unsigned long n) +{ + char *t = to; + const char __user *f = from; + char c; + + if (!access_ok(VERIFY_READ, from, n)) + return n; + + while (n) { + if (__get_user(c, f)) { + memset(t, 0, n); + break; + } + *t++ = c; + f++; + n--; + } + return n; +} + +int copy_mount_options(const void __user * data, unsigned long *where) +{ + int i; + unsigned long page; + unsigned long size; + + *where = 0; + if (!data) + return 0; + + if (!(page = __get_free_page(GFP_KERNEL))) + return -ENOMEM; + + /* We only care that *some* data at the address the user + * gave us is valid. Just in case, we'll zero + * the remainder of the page. + */ + /* copy_from_user cannot cross TASK_SIZE ! */ + size = TASK_SIZE - (unsigned long)data; + if (size > PAGE_SIZE) + size = PAGE_SIZE; + + i = size - exact_copy_from_user((void *)page, data, size); + if (!i) { + free_page(page); + return -EFAULT; + } + if (i != PAGE_SIZE) + memset((char *)page + i, 0, PAGE_SIZE - i); + *where = page; + return 0; +} + +int copy_mount_string(const void __user *data, char **where) +{ + char *tmp; + + if (!data) { + *where = NULL; + return 0; + } + + tmp = strndup_user(data, PAGE_SIZE); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + *where = tmp; + return 0; +} + +/* + * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to + * be given to the mount() call (ie: read-only, no-dev, no-suid etc). + * + * data is a (void *) that can point to any structure up to + * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent + * information (or be NULL). + * + * Pre-0.97 versions of mount() didn't have a flags word. + * When the flags word was introduced its top half was required + * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. + * Therefore, if this magic number is present, it carries no information + * and must be discarded. + */ +long do_mount(char *dev_name, char *dir_name, char *type_page, + unsigned long flags, void *data_page) +{ + struct path path; + int retval = 0; + int mnt_flags = 0; + + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; + + /* Basic sanity checks */ + + if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) + return -EINVAL; + + if (data_page) + ((char *)data_page)[PAGE_SIZE - 1] = 0; + + /* ... 
and get the mountpoint */ + retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); + if (retval) + return retval; + + retval = security_sb_mount(dev_name, &path, + type_page, flags, data_page); + if (retval) + goto dput_out; + + /* Default to relatime unless overriden */ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; + + /* Separate the per-mountpoint flags */ + if (flags & MS_NOSUID) + mnt_flags |= MNT_NOSUID; + if (flags & MS_NODEV) + mnt_flags |= MNT_NODEV; + if (flags & MS_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (flags & MS_NOATIME) + mnt_flags |= MNT_NOATIME; + if (flags & MS_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + if (flags & MS_STRICTATIME) + mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; + + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); + + if (flags & MS_REMOUNT) + retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, + data_page); + else if (flags & MS_BIND) + retval = do_loopback(&path, dev_name, flags & MS_REC); + else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + retval = do_change_type(&path, flags); + else if (flags & MS_MOVE) + retval = do_move_mount(&path, dev_name); + else + retval = do_new_mount(&path, type_page, flags, mnt_flags, + dev_name, data_page); +dput_out: + path_put(&path); + return retval; +} + +static struct mnt_namespace *alloc_mnt_ns(void) +{ + struct mnt_namespace *new_ns; + + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + atomic_set(&new_ns->count, 1); + new_ns->root = NULL; + INIT_LIST_HEAD(&new_ns->list); + init_waitqueue_head(&new_ns->poll); + new_ns->event = 0; + return new_ns; +} + +void mnt_make_longterm(struct vfsmount *mnt) +{ + __mnt_make_longterm(mnt); +} + +void mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) + return; + br_write_lock(vfsmount_lock); + atomic_dec(&mnt->mnt_longterm); + br_write_unlock(vfsmount_lock); +#endif +} + +/* + * Allocate a new namespace structure and populate it with contents + * copied from the namespace of the passed in task structure. + */ +static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, + struct fs_struct *fs) +{ + struct mnt_namespace *new_ns; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct vfsmount *p, *q; + + new_ns = alloc_mnt_ns(); + if (IS_ERR(new_ns)) + return new_ns; + + down_write(&namespace_sem); + /* First pass: copy the tree topology */ + new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root, + CL_COPY_ALL | CL_EXPIRE); + if (!new_ns->root) { + up_write(&namespace_sem); + kfree(new_ns); + return ERR_PTR(-ENOMEM); + } + br_write_lock(vfsmount_lock); + list_add_tail(&new_ns->list, &new_ns->root->mnt_list); + br_write_unlock(vfsmount_lock); + + /* + * Second pass: switch the tsk->fs->* elements and mark new vfsmounts + * as belonging to new namespace. We have already acquired a private + * fs_struct, so tsk->fs->lock is not needed. 
+ */ + p = mnt_ns->root; + q = new_ns->root; + while (p) { + q->mnt_ns = new_ns; + __mnt_make_longterm(q); + if (fs) { + if (p == fs->root.mnt) { + fs->root.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); + rootmnt = p; + } + if (p == fs->pwd.mnt) { + fs->pwd.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); + pwdmnt = p; + } + } + p = next_mnt(p, mnt_ns->root); + q = next_mnt(q, new_ns->root); + } + up_write(&namespace_sem); + + if (rootmnt) + mntput(rootmnt); + if (pwdmnt) + mntput(pwdmnt); + + return new_ns; +} + +struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, + struct fs_struct *new_fs) +{ + struct mnt_namespace *new_ns; + + BUG_ON(!ns); + get_mnt_ns(ns); + + if (!(flags & CLONE_NEWNS)) + return ns; + + new_ns = dup_mnt_ns(ns, new_fs); + + put_mnt_ns(ns); + return new_ns; +} + +/** + * create_mnt_ns - creates a private namespace and adds a root filesystem + * @mnt: pointer to the new root filesystem mountpoint + */ +struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) +{ + struct mnt_namespace *new_ns; + + new_ns = alloc_mnt_ns(); + if (!IS_ERR(new_ns)) { + mnt->mnt_ns = new_ns; + __mnt_make_longterm(mnt); + new_ns->root = mnt; + list_add(&new_ns->list, &new_ns->root->mnt_list); + } + return new_ns; +} +EXPORT_SYMBOL(create_mnt_ns); + +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) +{ + int ret; + char *kernel_type; + char *kernel_dir; + char *kernel_dev; + unsigned long data_page; + + ret = copy_mount_string(type, &kernel_type); + if (ret < 0) + goto out_type; + + kernel_dir = getname(dir_name); + if (IS_ERR(kernel_dir)) { + ret = PTR_ERR(kernel_dir); + goto out_dir; + } + + ret = copy_mount_string(dev_name, &kernel_dev); + if (ret < 0) + goto out_dev; + + ret = copy_mount_options(data, &data_page); + if (ret < 0) + goto out_data; + + ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, + (void *) data_page); + + free_page(data_page); +out_data: + kfree(kernel_dev); +out_dev: + putname(kernel_dir); +out_dir: + kfree(kernel_type); +out_type: + return ret; +} + +/* + * pivot_root Semantics: + * Moves the root file system of the current process to the directory put_old, + * makes new_root as the new root file system of the current process, and sets + * root/cwd of all processes which had them on the current root to new_root. + * + * Restrictions: + * The new_root and put_old must be directories, and must not be on the + * same file system as the current process root. The put_old must be + * underneath new_root, i.e. adding a non-zero number of /.. to the string + * pointed to by put_old must yield the same directory as new_root. No other + * file system may be mounted on put_old. After all, new_root is a mountpoint. + * + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. + * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * in this situation. + * + * Notes: + * - we don't move root/cwd if they are not at the root (reason: if something + * cared enough to change them, it's probably wrong to force them elsewhere) + * - it's okay to pick a root that isn't the root of a file system, e.g. + * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, + * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root + * first. 
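+ *
+ * Editor's note: illustrative sketch, not part of this patch; the paths
+ * are hypothetical, ./oldroot is assumed to exist, and error handling is
+ * omitted. A minimal caller that satisfies the restrictions above:
+ *
+ *	#include <unistd.h>
+ *	#include <sys/mount.h>
+ *	#include <sys/syscall.h>
+ *
+ *	// /mnt/newroot must itself be a mount point, e.g. after
+ *	//   mount("/mnt/newroot", "/mnt/newroot", NULL, MS_BIND, NULL);
+ *	chdir("/mnt/newroot");
+ *	syscall(SYS_pivot_root, ".", "./oldroot");  // put_old underneath new_root
+ *	chroot(".");
+ *	umount2("/oldroot", MNT_DETACH);
+ *
+ * (glibc provides no pivot_root() wrapper, hence the raw syscall()).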
+ */ +SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, + const char __user *, put_old) +{ + struct vfsmount *tmp; + struct path new, old, parent_path, root_parent, root; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = user_path_dir(new_root, &new); + if (error) + goto out0; + + error = user_path_dir(put_old, &old); + if (error) + goto out1; + + error = security_sb_pivotroot(&old, &new); + if (error) + goto out2; + + get_fs_root(current->fs, &root); + error = lock_mount(&old); + if (error) + goto out3; + + error = -EINVAL; + if (IS_MNT_SHARED(old.mnt) || + IS_MNT_SHARED(new.mnt->mnt_parent) || + IS_MNT_SHARED(root.mnt->mnt_parent)) + goto out4; + if (!check_mnt(root.mnt) || !check_mnt(new.mnt)) + goto out4; + error = -ENOENT; + if (d_unlinked(new.dentry)) + goto out4; + if (d_unlinked(old.dentry)) + goto out4; + error = -EBUSY; + if (new.mnt == root.mnt || + old.mnt == root.mnt) + goto out4; /* loop, on the same file system */ + error = -EINVAL; + if (root.mnt->mnt_root != root.dentry) + goto out4; /* not a mountpoint */ + if (root.mnt->mnt_parent == root.mnt) + goto out4; /* not attached */ + if (new.mnt->mnt_root != new.dentry) + goto out4; /* not a mountpoint */ + if (new.mnt->mnt_parent == new.mnt) + goto out4; /* not attached */ + /* make sure we can reach put_old from new_root */ + tmp = old.mnt; + if (tmp != new.mnt) { + for (;;) { + if (tmp->mnt_parent == tmp) + goto out4; /* already mounted on put_old */ + if (tmp->mnt_parent == new.mnt) + break; + tmp = tmp->mnt_parent; + } + if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) + goto out4; + } else if (!is_subdir(old.dentry, new.dentry)) + goto out4; + br_write_lock(vfsmount_lock); + detach_mnt(new.mnt, &parent_path); + detach_mnt(root.mnt, &root_parent); + /* mount old root on put_old */ + attach_mnt(root.mnt, &old); + /* mount new_root on / */ + attach_mnt(new.mnt, &root_parent); + touch_mnt_namespace(current->nsproxy->mnt_ns); + br_write_unlock(vfsmount_lock); + chroot_fs_refs(&root, &new); + error = 0; +out4: + unlock_mount(&old); + if (!error) { + path_put(&root_parent); + path_put(&parent_path); + } +out3: + path_put(&root); +out2: + path_put(&old); +out1: + path_put(&new); +out0: + return error; +} + +static void __init init_mount_tree(void) +{ + struct vfsmount *mnt; + struct mnt_namespace *ns; + struct path root; + + mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) + panic("Can't allocate initial namespace"); + + init_task.nsproxy->mnt_ns = ns; + get_mnt_ns(ns); + + root.mnt = ns->root; + root.dentry = ns->root->mnt_root; + + set_fs_pwd(current->fs, &root); + set_fs_root(current->fs, &root); +} + +void __init mnt_init(void) +{ + unsigned u; + int err; + + init_rwsem(&namespace_sem); + + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + + if (!mount_hashtable) + panic("Failed to allocate mount hash table\n"); + + printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE); + + for (u = 0; u < HASH_SIZE; u++) + INIT_LIST_HEAD(&mount_hashtable[u]); + + br_lock_init(vfsmount_lock); + + err = sysfs_init(); + if (err) + printk(KERN_WARNING "%s: sysfs_init error: %d\n", + __func__, err); + fs_kobj = kobject_create_and_add("fs", NULL); + if (!fs_kobj) + printk(KERN_WARNING "%s: kobj create error\n", __func__); + init_rootfs(); + init_mount_tree(); +} + 
+void put_mnt_ns(struct mnt_namespace *ns) +{ + LIST_HEAD(umount_list); + + if (!atomic_dec_and_test(&ns->count)) + return; + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + umount_tree(ns->root, 0, &umount_list); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); + kfree(ns); +} +EXPORT_SYMBOL(put_mnt_ns); + +struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +{ + return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); +} +EXPORT_SYMBOL_GPL(kern_mount_data); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(mnt); +} diff --git a/fs/ncpfs/Kconfig b/fs/ncpfs/Kconfig new file mode 100644 index 00000000..c931cf22 --- /dev/null +++ b/fs/ncpfs/Kconfig @@ -0,0 +1,108 @@ +# +# NCP Filesystem configuration +# +config NCP_FS + tristate "NCP file system support (to mount NetWare volumes)" + depends on IPX!=n || INET + help + NCP (NetWare Core Protocol) is a protocol that runs over IPX and is + used by Novell NetWare clients to talk to file servers. It is to + IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you + to mount NetWare file server volumes and to access them just like + any other Unix directory. For details, please read the file + in the kernel source and + the IPX-HOWTO from . + + You do not have to say Y here if you want your Linux box to act as a + file *server* for Novell NetWare clients. + + General information about how to connect Linux, Windows machines and + Macs is on the WWW at . + + To compile this as a module, choose M here: the module will be called + ncpfs. Say N unless you are connected to a Novell network. + +config NCPFS_PACKET_SIGNING + bool "Packet signatures" + depends on NCP_FS + help + NCP allows packets to be signed for stronger security. If you want + security, say Y. Normal users can leave it off. To be able to use + packet signing you must use ncpfs > 2.0.12. + +config NCPFS_IOCTL_LOCKING + bool "Proprietary file locking" + depends on NCP_FS + help + Allows locking of records on remote volumes. Say N unless you have + special applications which are able to utilize this locking scheme. + +config NCPFS_STRONG + bool "Clear remove/delete inhibit when needed" + depends on NCP_FS + help + Allows manipulation of files flagged as Delete or Rename Inhibit. + To use this feature you must mount volumes with the ncpmount + parameter "-s" (ncpfs-2.0.12 and newer). Say Y unless you are not + mounting volumes with -f 444. + +config NCPFS_NFS_NS + bool "Use NFS namespace if available" + depends on NCP_FS + help + Allows you to utilize NFS namespace on NetWare servers. It brings + you case sensitive filenames. Say Y. You can disable it at + mount-time with the `-N nfs' parameter of ncpmount. + +config NCPFS_OS2_NS + bool "Use LONG (OS/2) namespace if available" + depends on NCP_FS + help + Allows you to utilize OS2/LONG namespace on NetWare servers. + Filenames in this namespace are limited to 255 characters, they are + case insensitive, and case in names is preserved. Say Y. You can + disable it at mount time with the -N os2 parameter of ncpmount. + +config NCPFS_SMALLDOS + bool "Lowercase DOS filenames" + depends on NCP_FS + ---help--- + If you say Y here, every filename on a NetWare server volume using + the OS2/LONG namespace and created under DOS or on a volume using + DOS namespace will be converted to lowercase characters. + Saying N here will give you these filenames in uppercase. 
+ + This is only a cosmetic option since the OS2/LONG namespace is case + insensitive. The only major reason for this option is backward + compatibility when moving from DOS to OS2/LONG namespace support. + Long filenames (created by Win95) will not be affected. + + This option does not solve the problem that filenames appear + differently under Linux and under Windows, since Windows does an + additional conversions on the client side. You can achieve similar + effects by saying Y to "Allow using of Native Language Support" + below. + +config NCPFS_NLS + bool "Use Native Language Support" + depends on NCP_FS + select NLS + help + Allows you to use codepages and I/O charsets for file name + translation between the server file system and input/output. This + may be useful, if you want to access the server with other operating + systems, e.g. Windows 95. See also NLS for more Information. + + To select codepages and I/O charsets use ncpfs-2.2.0.13 or newer. + +config NCPFS_EXTRAS + bool "Enable symbolic links and execute flags" + depends on NCP_FS + help + This enables the use of symbolic links and an execute permission + bit on NCPFS. The file server need not have long name space or NFS + name space loaded for these to work. + + To use the new attributes, it is recommended to use the flags + '-f 600 -d 755' on the ncpmount command line. + diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile new file mode 100644 index 00000000..c66af563 --- /dev/null +++ b/fs/ncpfs/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for the linux ncp filesystem routines. +# + +obj-$(CONFIG_NCP_FS) += ncpfs.o + +ncpfs-y := dir.o file.o inode.o ioctl.o mmap.o ncplib_kernel.o sock.o \ + ncpsign_kernel.o getopt.o + +ncpfs-$(CONFIG_NCPFS_EXTRAS) += symlink.o +ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o + +# If you want debugging output, please uncomment the following line +# ccflags-y := -DDEBUG_NCP=1 + +CFLAGS_ncplib_kernel.o := -finline-functions diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c new file mode 100644 index 00000000..9c51f621 --- /dev/null +++ b/fs/ncpfs/dir.c @@ -0,0 +1,1280 @@ +/* + * dir.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. 
Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998, 1999 Wolfram Pienkoss for NLS + * Modified 1999 Wolfram Pienkoss for directory caching + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncp_fs.h" + +static void ncp_read_volume_list(struct file *, void *, filldir_t, + struct ncp_cache_control *); +static void ncp_do_readdir(struct file *, void *, filldir_t, + struct ncp_cache_control *); + +static int ncp_readdir(struct file *, void *, filldir_t); + +static int ncp_create(struct inode *, struct dentry *, int, struct nameidata *); +static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *); +static int ncp_unlink(struct inode *, struct dentry *); +static int ncp_mkdir(struct inode *, struct dentry *, int); +static int ncp_rmdir(struct inode *, struct dentry *); +static int ncp_rename(struct inode *, struct dentry *, + struct inode *, struct dentry *); +static int ncp_mknod(struct inode * dir, struct dentry *dentry, + int mode, dev_t rdev); +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +extern int ncp_symlink(struct inode *, struct dentry *, const char *); +#else +#define ncp_symlink NULL +#endif + +const struct file_operations ncp_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ncp_readdir, + .unlocked_ioctl = ncp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ncp_compat_ioctl, +#endif +}; + +const struct inode_operations ncp_dir_inode_operations = +{ + .create = ncp_create, + .lookup = ncp_lookup, + .unlink = ncp_unlink, + .symlink = ncp_symlink, + .mkdir = ncp_mkdir, + .rmdir = ncp_rmdir, + .mknod = ncp_mknod, + .rename = ncp_rename, + .setattr = ncp_notify_change, +}; + +/* + * Dentry operations routines + */ +static int ncp_lookup_validate(struct dentry *, struct nameidata *); +static int ncp_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); +static int ncp_compare_dentry(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); +static int ncp_delete_dentry(const struct dentry *); + +const struct dentry_operations ncp_dentry_operations = +{ + .d_revalidate = ncp_lookup_validate, + .d_hash = ncp_hash_dentry, + .d_compare = ncp_compare_dentry, + .d_delete = ncp_delete_dentry, +}; + +#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) + +static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) +{ +#ifdef CONFIG_NCPFS_SMALLDOS + int ns = ncp_namespace(i); + + if ((ns == NW_NS_DOS) +#ifdef CONFIG_NCPFS_OS2_NS + || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS)) +#endif /* CONFIG_NCPFS_OS2_NS */ + ) + return 0; +#endif /* CONFIG_NCPFS_SMALLDOS */ + return 1; +} + +#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS) + +static inline int ncp_case_sensitive(const struct inode *i) +{ +#ifdef CONFIG_NCPFS_NFS_NS + return ncp_namespace(i) == NW_NS_NFS; +#else + return 0; +#endif /* CONFIG_NCPFS_NFS_NS */ +} + +/* + * Note: leave the hash unchanged if the directory + * is case-sensitive. 
+ */ +static int +ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) +{ + if (!ncp_case_sensitive(inode)) { + struct super_block *sb = dentry->d_sb; + struct nls_table *t; + unsigned long hash; + int i; + + t = NCP_IO_TABLE(sb); + hash = init_name_hash(); + for (i=0; ilen ; i++) + hash = partial_name_hash(ncp_tolower(t, this->name[i]), + hash); + this->hash = end_name_hash(hash); + } + return 0; +} + +static int +ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + if (len != name->len) + return 1; + + if (ncp_case_sensitive(pinode)) + return strncmp(str, name->name, len); + + return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len); +} + +/* + * This is the callback from dput() when d_count is going to 0. + * We use this to unhash dentries with bad inodes. + * Closing files can be safely postponed until iput() - it's done there anyway. + */ +static int +ncp_delete_dentry(const struct dentry * dentry) +{ + struct inode *inode = dentry->d_inode; + + if (inode) { + if (is_bad_inode(inode)) + return 1; + } else + { + /* N.B. Unhash negative dentries? */ + } + return 0; +} + +static inline int +ncp_single_volume(struct ncp_server *server) +{ + return (server->m.mounted_vol[0] != '\0'); +} + +static inline int ncp_is_server_root(struct inode *inode) +{ + return (!ncp_single_volume(NCP_SERVER(inode)) && + inode == inode->i_sb->s_root->d_inode); +} + + +/* + * This is the callback when the dcache has a lookup hit. + */ + + +#ifdef CONFIG_NCPFS_STRONG +/* try to delete a readonly file (NW R bit set) */ + +static int +ncp_force_unlink(struct inode *dir, struct dentry* dentry) +{ + int res=0x9c,res2; + struct nw_modify_dos_info info; + __le32 old_nwattr; + struct inode *inode; + + memset(&info, 0, sizeof(info)); + + /* remove the Read-Only flag on the NW server */ + inode = dentry->d_inode; + + old_nwattr = NCP_FINFO(inode)->nwattr; + info.attributes = old_nwattr & ~(aRONLY|aDELETEINHIBIT|aRENAMEINHIBIT); + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info); + if (res2) + goto leave_me; + + /* now try again the delete operation */ + res = ncp_del_file_or_subdir2(NCP_SERVER(dir), dentry); + + if (res) /* delete failed, set R bit again */ + { + info.attributes = old_nwattr; + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(inode), inode, NULL, DM_ATTRIBUTES, &info); + if (res2) + goto leave_me; + } +leave_me: + return(res); +} +#endif /* CONFIG_NCPFS_STRONG */ + +#ifdef CONFIG_NCPFS_STRONG +static int +ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_name, + struct inode *new_dir, struct dentry* new_dentry, char *_new_name) +{ + struct nw_modify_dos_info info; + int res=0x90,res2; + struct inode *old_inode = old_dentry->d_inode; + __le32 old_nwattr = NCP_FINFO(old_inode)->nwattr; + __le32 new_nwattr = 0; /* shut compiler warning */ + int old_nwattr_changed = 0; + int new_nwattr_changed = 0; + + memset(&info, 0, sizeof(info)); + + /* remove the Read-Only flag on the NW server */ + + info.attributes = old_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); + if (!res2) + old_nwattr_changed = 1; + if (new_dentry && new_dentry->d_inode) { + new_nwattr = NCP_FINFO(new_dentry->d_inode)->nwattr; + info.attributes 
= new_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); + if (!res2) + new_nwattr_changed = 1; + } + /* now try again the rename operation */ + /* but only if something really happened */ + if (new_nwattr_changed || old_nwattr_changed) { + res = ncp_ren_or_mov_file_or_subdir(NCP_SERVER(old_dir), + old_dir, _old_name, + new_dir, _new_name); + } + if (res) + goto leave_me; + /* file was successfully renamed, so: + do not set attributes on old file - it no longer exists + copy attributes from old file to new */ + new_nwattr_changed = old_nwattr_changed; + new_nwattr = old_nwattr; + old_nwattr_changed = 0; + +leave_me:; + if (old_nwattr_changed) { + info.attributes = old_nwattr; + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); + /* ignore errors */ + } + if (new_nwattr_changed) { + info.attributes = new_nwattr; + res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); + /* ignore errors */ + } + return(res); +} +#endif /* CONFIG_NCPFS_STRONG */ + + +static int +ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd) +{ + struct ncp_server *server; + struct dentry *parent; + struct inode *dir; + struct ncp_entry_info finfo; + int res, val = 0, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + if (dentry == dentry->d_sb->s_root) + return 1; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + dir = parent->d_inode; + + if (!dentry->d_inode) + goto finished; + + server = NCP_SERVER(dir); + + /* + * Inspired by smbfs: + * The default validation is based on dentry age: + * We set the max age at mount time. (But each + * successful server lookup renews the timestamp.) + */ + val = NCP_TEST_AGE(server, dentry); + if (val) + goto finished; + + DDPRINTK("ncp_lookup_validate: %s/%s not valid, age=%ld, server lookup\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + NCP_GET_AGE(dentry)); + + len = sizeof(__name); + if (ncp_is_server_root(dir)) { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, 1); + if (!res) { + res = ncp_lookup_volume(server, __name, &(finfo.i)); + if (!res) + ncp_update_known_namespace(server, finfo.i.volNumber, NULL); + } + } else { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (!res) + res = ncp_obtain_info(server, dir, __name, &(finfo.i)); + } + finfo.volume = finfo.i.volNumber; + DDPRINTK("ncp_lookup_validate: looked for %s/%s, res=%d\n", + dentry->d_parent->d_name.name, __name, res); + /* + * If we didn't find it, or if it has a different dirEntNum to + * what we remember, it's not valid any more. 
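+ * On a successful lookup the cached inode is refreshed from the reply + * (ncp_update_inode2); the dentry age is only reset (ncp_new_dentry) when + * the dirEntNum still matches.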
+ */ + if (!res) { + struct inode *inode = dentry->d_inode; + + mutex_lock(&inode->i_mutex); + if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { + ncp_new_dentry(dentry); + val=1; + } else + DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n"); + + ncp_update_inode2(inode, &finfo); + mutex_unlock(&inode->i_mutex); + } + +finished: + DDPRINTK("ncp_lookup_validate: result=%d\n", val); + dput(parent); + return val; +} + +static struct dentry * +ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) +{ + struct dentry *dent = dentry; + struct list_head *next; + + if (d_validate(dent, parent)) { + if (dent->d_name.len <= NCP_MAXPATHLEN && + (unsigned long)dent->d_fsdata == fpos) { + if (!dent->d_inode) { + dput(dent); + dent = NULL; + } + return dent; + } + dput(dent); + } + + /* If a pointer is invalid, we search the dentry. */ + spin_lock(&parent->d_lock); + next = parent->d_subdirs.next; + while (next != &parent->d_subdirs) { + dent = list_entry(next, struct dentry, d_u.d_child); + if ((unsigned long)dent->d_fsdata == fpos) { + if (dent->d_inode) + dget(dent); + else + dent = NULL; + spin_unlock(&parent->d_lock); + goto out; + } + next = next->next; + } + spin_unlock(&parent->d_lock); + return NULL; + +out: + return dent; +} + +static time_t ncp_obtain_mtime(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct ncp_server *server = NCP_SERVER(inode); + struct nw_info_struct i; + + if (!ncp_conn_valid(server) || ncp_is_server_root(inode)) + return 0; + + if (ncp_obtain_info(server, inode, NULL, &i)) + return 0; + + return ncp_date_dos2unix(i.modifyTime, i.modifyDate); +} + +static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct page *page = NULL; + struct ncp_server *server = NCP_SERVER(inode); + union ncp_dir_cache *cache = NULL; + struct ncp_cache_control ctl; + int result, mtime_valid = 0; + time_t mtime = 0; + + ctl.page = NULL; + ctl.cache = NULL; + + DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (int) filp->f_pos); + + result = -EIO; + /* Do not generate '.' and '..' when server is dead. 
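+ In that case readdir fails with -EIO without emitting any entries.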
*/ + if (!ncp_conn_valid(server)) + goto out; + + result = 0; + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + goto out; + filp->f_pos = 1; + } + if (filp->f_pos == 1) { + if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR)) + goto out; + filp->f_pos = 2; + } + + page = grab_cache_page(&inode->i_data, 0); + if (!page) + goto read_really; + + ctl.cache = cache = kmap(page); + ctl.head = cache->head; + + if (!PageUptodate(page) || !ctl.head.eof) + goto init_cache; + + if (filp->f_pos == 2) { + if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) + goto init_cache; + + mtime = ncp_obtain_mtime(dentry); + mtime_valid = 1; + if ((!mtime) || (mtime != ctl.head.mtime)) + goto init_cache; + } + + if (filp->f_pos > ctl.head.end) + goto finished; + + ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2); + ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; + ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; + + for (;;) { + if (ctl.ofs != 0) { + ctl.page = find_lock_page(&inode->i_data, ctl.ofs); + if (!ctl.page) + goto invalid_cache; + ctl.cache = kmap(ctl.page); + if (!PageUptodate(ctl.page)) + goto invalid_cache; + } + while (ctl.idx < NCP_DIRCACHE_SIZE) { + struct dentry *dent; + int res; + + dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], + dentry, filp->f_pos); + if (!dent) + goto invalid_cache; + res = filldir(dirent, dent->d_name.name, + dent->d_name.len, filp->f_pos, + dent->d_inode->i_ino, DT_UNKNOWN); + dput(dent); + if (res) + goto finished; + filp->f_pos += 1; + ctl.idx += 1; + if (filp->f_pos > ctl.head.end) + goto finished; + } + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + ctl.page = NULL; + } + ctl.idx = 0; + ctl.ofs += 1; + } +invalid_cache: + if (ctl.page) { + kunmap(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + ctl.page = NULL; + } + ctl.cache = cache; +init_cache: + ncp_invalidate_dircache_entries(dentry); + if (!mtime_valid) { + mtime = ncp_obtain_mtime(dentry); + mtime_valid = 1; + } + ctl.head.mtime = mtime; + ctl.head.time = jiffies; + ctl.head.eof = 0; + ctl.fpos = 2; + ctl.ofs = 0; + ctl.idx = NCP_DIRCACHE_START; + ctl.filled = 0; + ctl.valid = 1; +read_really: + if (ncp_is_server_root(inode)) { + ncp_read_volume_list(filp, dirent, filldir, &ctl); + } else { + ncp_do_readdir(filp, dirent, filldir, &ctl); + } + ctl.head.end = ctl.fpos - 1; + ctl.head.eof = ctl.valid; +finished: + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + } + if (page) { + cache->head = ctl.head; + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } +out: + return result; +} + +static int +ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, + int inval_childs) +{ + struct dentry *newdent, *dentry = filp->f_path.dentry; + struct inode *dir = dentry->d_inode; + struct ncp_cache_control ctl = *ctrl; + struct qstr qname; + int valid = 0; + int hashed = 0; + ino_t ino = 0; + __u8 __name[NCP_MAXPATHLEN + 1]; + + qname.len = sizeof(__name); + if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len, + entry->i.entryName, entry->i.nameLen, + !ncp_preserve_entry_case(dir, entry->i.NSCreator))) + return 1; /* I'm not sure */ + + qname.name = __name; + qname.hash = full_name_hash(qname.name, qname.len); + + if (dentry->d_op && dentry->d_op->d_hash) + if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) 
!= 0) + goto end_advance; + + newdent = d_lookup(dentry, &qname); + + if (!newdent) { + newdent = d_alloc(dentry, &qname); + if (!newdent) + goto end_advance; + } else { + hashed = 1; + + /* If case sensitivity changed for this volume, all entries below this one + should be thrown away. This entry itself is not affected, as its case + sensitivity is controlled by its own parent. */ + if (inval_childs) + shrink_dcache_parent(newdent); + + /* + * NetWare's OS2 namespace is case preserving yet case + * insensitive. So we update dentry's name as received from + * server. Parent dir's i_mutex is locked because we're in + * readdir. + */ + dentry_update_name_case(newdent, &qname); + } + + if (!newdent->d_inode) { + struct inode *inode; + + entry->opened = 0; + entry->ino = iunique(dir->i_sb, 2); + inode = ncp_iget(dir->i_sb, entry); + if (inode) { + d_instantiate(newdent, inode); + if (!hashed) + d_rehash(newdent); + } + } else { + struct inode *inode = newdent->d_inode; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ncp_update_inode2(inode, entry); + mutex_unlock(&inode->i_mutex); + } + + if (newdent->d_inode) { + ino = newdent->d_inode->i_ino; + newdent->d_fsdata = (void *) ctl.fpos; + ncp_new_dentry(newdent); + } + + if (ctl.idx >= NCP_DIRCACHE_SIZE) { + if (ctl.page) { + kunmap(ctl.page); + SetPageUptodate(ctl.page); + unlock_page(ctl.page); + page_cache_release(ctl.page); + } + ctl.cache = NULL; + ctl.idx -= NCP_DIRCACHE_SIZE; + ctl.ofs += 1; + ctl.page = grab_cache_page(&dir->i_data, ctl.ofs); + if (ctl.page) + ctl.cache = kmap(ctl.page); + } + if (ctl.cache) { + ctl.cache->dentry[ctl.idx] = newdent; + valid = 1; + } + dput(newdent); +end_advance: + if (!valid) + ctl.valid = 0; + if (!ctl.filled && (ctl.fpos == filp->f_pos)) { + if (!ino) + ino = find_inode_number(dentry, &qname); + if (!ino) + ino = iunique(dir->i_sb, 2); + ctl.filled = filldir(dirent, qname.name, qname.len, + filp->f_pos, ino, DT_UNKNOWN); + if (!ctl.filled) + filp->f_pos += 1; + } + ctl.fpos += 1; + ctl.idx += 1; + *ctrl = ctl; + return (ctl.valid || !ctl.filled); +} + +static void +ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, + struct ncp_cache_control *ctl) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct ncp_server *server = NCP_SERVER(inode); + struct ncp_volume_info info; + struct ncp_entry_info entry; + int i; + + DPRINTK("ncp_read_volume_list: pos=%ld\n", + (unsigned long) filp->f_pos); + + for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { + int inval_dentry; + + if (ncp_get_volume_info_with_number(server, i, &info) != 0) + return; + if (!strlen(info.volume_name)) + continue; + + DPRINTK("ncp_read_volume_list: found vol: %s\n", + info.volume_name); + + if (ncp_lookup_volume(server, info.volume_name, + &entry.i)) { + DPRINTK("ncpfs: could not lookup vol %s\n", + info.volume_name); + continue; + } + inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); + entry.volume = entry.i.volNumber; + if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry)) + return; + } +} + +static void +ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, + struct ncp_cache_control *ctl) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *dir = dentry->d_inode; + struct ncp_server *server = NCP_SERVER(dir); + struct nw_search_sequence seq; + struct ncp_entry_info entry; + int err; + void* buf; + int more; + size_t bufsize; + + DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n", + 
dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) filp->f_pos); + PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n", + dentry->d_name.name, NCP_FINFO(dir)->volNumber, + NCP_FINFO(dir)->dirEntNum); + + err = ncp_initialize_search(server, dir, &seq); + if (err) { + DPRINTK("ncp_do_readdir: init failed, err=%d\n", err); + return; + } + /* We MUST NOT use server->buffer_size handshaked with server if we are + using UDP, as for UDP server uses max. buffer size determined by + MTU, and for TCP server uses hardwired value 65KB (== 66560 bytes). + So we use 128KB, just to be sure, as there is no way how to know + this value in advance. */ + bufsize = 131072; + buf = vmalloc(bufsize); + if (!buf) + return; + do { + int cnt; + char* rpl; + size_t rpls; + + err = ncp_search_for_fileset(server, &seq, &more, &cnt, buf, bufsize, &rpl, &rpls); + if (err) /* Error */ + break; + if (!cnt) /* prevent endless loop */ + break; + while (cnt--) { + size_t onerpl; + + if (rpls < offsetof(struct nw_info_struct, entryName)) + break; /* short packet */ + ncp_extract_file_info(rpl, &entry.i); + onerpl = offsetof(struct nw_info_struct, entryName) + entry.i.nameLen; + if (rpls < onerpl) + break; /* short packet */ + (void)ncp_obtain_nfs_info(server, &entry.i); + rpl += onerpl; + rpls -= onerpl; + entry.volume = entry.i.volNumber; + if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0)) + break; + } + } while (more); + vfree(buf); + return; +} + +int ncp_conn_logged_in(struct super_block *sb) +{ + struct ncp_server* server = NCP_SBP(sb); + int result; + + if (ncp_single_volume(server)) { + int len; + struct dentry* dent; + __u32 volNumber; + __le32 dirEntNum; + __le32 DosDirNum; + __u8 __name[NCP_MAXPATHLEN + 1]; + + len = sizeof(__name); + result = ncp_io2vol(server, __name, &len, server->m.mounted_vol, + strlen(server->m.mounted_vol), 1); + if (result) + goto out; + result = -ENOENT; + if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) { + PPRINTK("ncp_conn_logged_in: %s not found\n", + server->m.mounted_vol); + goto out; + } + dent = sb->s_root; + if (dent) { + struct inode* ino = dent->d_inode; + if (ino) { + ncp_update_known_namespace(server, volNumber, NULL); + NCP_FINFO(ino)->volNumber = volNumber; + NCP_FINFO(ino)->dirEntNum = dirEntNum; + NCP_FINFO(ino)->DosDirNum = DosDirNum; + result = 0; + } else { + DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n"); + } + } else { + DPRINTK("ncpfs: sb->s_root == NULL!\n"); + } + } else + result = 0; + +out: + return result; +} + +static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct ncp_server *server = NCP_SERVER(dir); + struct inode *inode = NULL; + struct ncp_entry_info finfo; + int error, res, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + error = -EIO; + if (!ncp_conn_valid(server)) + goto finished; + + PPRINTK("ncp_lookup: server lookup for %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + len = sizeof(__name); + if (ncp_is_server_root(dir)) { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, 1); + if (!res) + res = ncp_lookup_volume(server, __name, &(finfo.i)); + if (!res) + ncp_update_known_namespace(server, finfo.i.volNumber, NULL); + } else { + res = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (!res) + res = ncp_obtain_info(server, dir, __name, &(finfo.i)); + } + PPRINTK("ncp_lookup: looked for %s/%s, res=%d\n", + 
dentry->d_parent->d_name.name, __name, res); + /* + * If we didn't find an entry, make a negative dentry. + */ + if (res) + goto add_entry; + + /* + * Create an inode for the entry. + */ + finfo.opened = 0; + finfo.ino = iunique(dir->i_sb, 2); + finfo.volume = finfo.i.volNumber; + error = -EACCES; + inode = ncp_iget(dir->i_sb, &finfo); + + if (inode) { + ncp_new_dentry(dentry); +add_entry: + d_add(dentry, inode); + error = 0; + } + +finished: + PPRINTK("ncp_lookup: result=%d\n", error); + return ERR_PTR(error); +} + +/* + * This code is common to create, mkdir, and mknod. + */ +static int ncp_instantiate(struct inode *dir, struct dentry *dentry, + struct ncp_entry_info *finfo) +{ + struct inode *inode; + int error = -EINVAL; + + finfo->ino = iunique(dir->i_sb, 2); + inode = ncp_iget(dir->i_sb, finfo); + if (!inode) + goto out_close; + d_instantiate(dentry,inode); + error = 0; +out: + return error; + +out_close: + PPRINTK("ncp_instantiate: %s/%s failed, closing file\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + ncp_close_file(NCP_SERVER(dir), finfo->file_handle); + goto out; +} + +int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev, __le32 attributes) +{ + struct ncp_server *server = NCP_SERVER(dir); + struct ncp_entry_info finfo; + int error, result, len; + int opmode; + __u8 __name[NCP_MAXPATHLEN + 1]; + + PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n", + dentry->d_parent->d_name.name, dentry->d_name.name, mode); + + ncp_age_dentry(server, dentry); + len = sizeof(__name); + error = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (error) + goto out; + + error = -EACCES; + + if (S_ISREG(mode) && + (server->m.flags & NCP_MOUNT_EXTRAS) && + (mode & S_IXUGO)) + attributes |= aSYSTEM | aSHARED; + + result = ncp_open_create_file_or_subdir(server, dir, __name, + OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE, + attributes, AR_READ | AR_WRITE, &finfo); + opmode = O_RDWR; + if (result) { + result = ncp_open_create_file_or_subdir(server, dir, __name, + OC_MODE_CREATE | OC_MODE_OPEN | OC_MODE_REPLACE, + attributes, AR_WRITE, &finfo); + if (result) { + if (result == 0x87) + error = -ENAMETOOLONG; + else if (result < 0) + error = result; + DPRINTK("ncp_create: %s/%s failed\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + goto out; + } + opmode = O_WRONLY; + } + finfo.access = opmode; + if (ncp_is_nfs_extras(server, finfo.volume)) { + finfo.i.nfs.mode = mode; + finfo.i.nfs.rdev = new_encode_dev(rdev); + if (ncp_modify_nfs_info(server, finfo.volume, + finfo.i.dirEntNum, + mode, new_encode_dev(rdev)) != 0) + goto out; + } + + error = ncp_instantiate(dir, dentry, &finfo); +out: + return error; +} + +static int ncp_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return ncp_create_new(dir, dentry, mode, 0, 0); +} + +static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct ncp_entry_info finfo; + struct ncp_server *server = NCP_SERVER(dir); + int error, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + DPRINTK("ncp_mkdir: making %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + ncp_age_dentry(server, dentry); + len = sizeof(__name); + error = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (error) + goto out; + + error = ncp_open_create_file_or_subdir(server, dir, __name, + OC_MODE_CREATE, aDIR, + cpu_to_le16(0xffff), + &finfo); + if (error == 0) { + 
if (ncp_is_nfs_extras(server, finfo.volume)) { + mode |= S_IFDIR; + finfo.i.nfs.mode = mode; + if (ncp_modify_nfs_info(server, + finfo.volume, + finfo.i.dirEntNum, + mode, 0) != 0) + goto out; + } + error = ncp_instantiate(dir, dentry, &finfo); + } else if (error > 0) { + error = -EACCES; + } +out: + return error; +} + +static int ncp_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ncp_server *server = NCP_SERVER(dir); + int error, result, len; + __u8 __name[NCP_MAXPATHLEN + 1]; + + DPRINTK("ncp_rmdir: removing %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* + * fail with EBUSY if there are still references to this + * directory. + */ + dentry_unhash(dentry); + error = -EBUSY; + if (!d_unhashed(dentry)) + goto out; + + len = sizeof(__name); + error = ncp_io2vol(server, __name, &len, dentry->d_name.name, + dentry->d_name.len, !ncp_preserve_case(dir)); + if (error) + goto out; + + result = ncp_del_file_or_subdir(server, dir, __name); + switch (result) { + case 0x00: + error = 0; + break; + case 0x85: /* unauthorized to delete file */ + case 0x8A: /* unauthorized to delete file */ + error = -EACCES; + break; + case 0x8F: + case 0x90: /* read only */ + error = -EPERM; + break; + case 0x9F: /* in use by another client */ + error = -EBUSY; + break; + case 0xA0: /* directory not empty */ + error = -ENOTEMPTY; + break; + case 0xFF: /* someone deleted file */ + error = -ENOENT; + break; + default: + error = result < 0 ? result : -EACCES; + break; + } +out: + return error; +} + +static int ncp_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct ncp_server *server; + int error; + + server = NCP_SERVER(dir); + DPRINTK("ncp_unlink: unlinking %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* + * Check whether to close the file ... + */ + if (inode) { + PPRINTK("ncp_unlink: closing file\n"); + ncp_make_closed(inode); + } + + error = ncp_del_file_or_subdir2(server, dentry); +#ifdef CONFIG_NCPFS_STRONG + /* 9C is Invalid path.. It should be 8F, 90 - read only, but + it is not :-( */ + if ((error == 0x9C || error == 0x90) && server->m.flags & NCP_MOUNT_STRONG) { /* R/O */ + error = ncp_force_unlink(dir, dentry); + } +#endif + switch (error) { + case 0x00: + DPRINTK("ncp: removed %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + break; + case 0x85: + case 0x8A: + error = -EACCES; + break; + case 0x8D: /* some files in use */ + case 0x8E: /* all files in use */ + error = -EBUSY; + break; + case 0x8F: /* some read only */ + case 0x90: /* all read only */ + case 0x9C: /* !!! returned when in-use or read-only by NW4 */ + error = -EPERM; + break; + case 0xFF: + error = -ENOENT; + break; + default: + error = error < 0 ? error : -EACCES; + break; + } + return error; +} + +static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ncp_server *server = NCP_SERVER(old_dir); + int error; + int old_len, new_len; + __u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1]; + + DPRINTK("ncp_rename: %s/%s to %s/%s\n", + old_dentry->d_parent->d_name.name, old_dentry->d_name.name, + new_dentry->d_parent->d_name.name, new_dentry->d_name.name); + + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) { + /* + * fail with EBUSY if there are still references to this + * directory. 
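+ * If the target is still hashed after dentry_unhash() it is still in use, + * and the rename fails with -EBUSY before anything is sent to the server.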
+ */ + dentry_unhash(new_dentry); + error = -EBUSY; + if (!d_unhashed(new_dentry)) + goto out; + } + + ncp_age_dentry(server, old_dentry); + ncp_age_dentry(server, new_dentry); + + old_len = sizeof(__old_name); + error = ncp_io2vol(server, __old_name, &old_len, + old_dentry->d_name.name, old_dentry->d_name.len, + !ncp_preserve_case(old_dir)); + if (error) + goto out; + + new_len = sizeof(__new_name); + error = ncp_io2vol(server, __new_name, &new_len, + new_dentry->d_name.name, new_dentry->d_name.len, + !ncp_preserve_case(new_dir)); + if (error) + goto out; + + error = ncp_ren_or_mov_file_or_subdir(server, old_dir, __old_name, + new_dir, __new_name); +#ifdef CONFIG_NCPFS_STRONG + if ((error == 0x90 || error == 0x8B || error == -EACCES) && + server->m.flags & NCP_MOUNT_STRONG) { /* RO */ + error = ncp_force_rename(old_dir, old_dentry, __old_name, + new_dir, new_dentry, __new_name); + } +#endif + switch (error) { + case 0x00: + DPRINTK("ncp renamed %s -> %s.\n", + old_dentry->d_name.name,new_dentry->d_name.name); + break; + case 0x9E: + error = -ENAMETOOLONG; + break; + case 0xFF: + error = -ENOENT; + break; + default: + error = error < 0 ? error : -EACCES; + break; + } +out: + return error; +} + +static int ncp_mknod(struct inode * dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + if (!new_valid_dev(rdev)) + return -EINVAL; + if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { + DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%o\n", mode); + return ncp_create_new(dir, dentry, mode, rdev, 0); + } + return -EPERM; /* Strange, but true */ +} + +/* The following routines are taken directly from msdos-fs */ + +/* Linear day numbers of the respective 1sts in non-leap years. */ + +static int day_n[] = +{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0}; +/* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ + + +extern struct timezone sys_tz; + +static int utc2local(int time) +{ + return time - sys_tz.tz_minuteswest * 60; +} + +static int local2utc(int time) +{ + return time + sys_tz.tz_minuteswest * 60; +} + +/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ +int +ncp_date_dos2unix(__le16 t, __le16 d) +{ + unsigned short time = le16_to_cpu(t), date = le16_to_cpu(d); + int month, year, secs; + + /* first subtract and mask after that... Otherwise, if + date == 0, bad things happen */ + month = ((date >> 5) - 1) & 15; + year = date >> 9; + secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + + 86400 * ((date & 31) - 1 + day_n[month] + (year / 4) + + year * 365 - ((year & 3) == 0 && month < 2 ? 1 : 0) + 3653); + /* days since 1.1.70 plus 80's leap day */ + return local2utc(secs); +} + + +/* Convert linear UNIX date to a MS-DOS time/date pair. */ +void +ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date) +{ + int day, year, nl_day, month; + + unix_date = utc2local(unix_date); + *time = cpu_to_le16( + (unix_date % 60) / 2 + (((unix_date / 60) % 60) << 5) + + (((unix_date / 3600) % 24) << 11)); + day = unix_date / 86400 - 3652; + year = day / 365; + if ((year + 3) / 4 + 365 * year > day) + year--; + day -= (year + 3) / 4 + 365 * year; + if (day == 59 && !(year & 3)) { + nl_day = day; + month = 2; + } else { + nl_day = (year & 3) || day <= 59 ? 
day : day - 1; + for (month = 1; month < 12; month++) + if (day_n[month] > nl_day) + break; + } + *date = cpu_to_le16(nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9)); +} diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c new file mode 100644 index 00000000..0ed65e0c --- /dev/null +++ b/fs/ncpfs/file.c @@ -0,0 +1,297 @@ +/* + * file.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncp_fs.h" + +static int ncp_fsync(struct file *file, int datasync) +{ + return 0; +} + +/* + * Open a file with the specified read/write mode. + */ +int ncp_make_open(struct inode *inode, int right) +{ + int error; + int access; + + error = -EINVAL; + if (!inode) { + printk(KERN_ERR "ncp_make_open: got NULL inode\n"); + goto out; + } + + DPRINTK("ncp_make_open: opened=%d, volume # %u, dir entry # %u\n", + atomic_read(&NCP_FINFO(inode)->opened), + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum); + error = -EACCES; + mutex_lock(&NCP_FINFO(inode)->open_mutex); + if (!atomic_read(&NCP_FINFO(inode)->opened)) { + struct ncp_entry_info finfo; + int result; + + /* tries max. rights */ + finfo.access = O_RDWR; + result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), + inode, NULL, OC_MODE_OPEN, + 0, AR_READ | AR_WRITE, &finfo); + if (!result) + goto update; + /* RDWR did not succeeded, try readonly or writeonly as requested */ + switch (right) { + case O_RDONLY: + finfo.access = O_RDONLY; + result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), + inode, NULL, OC_MODE_OPEN, + 0, AR_READ, &finfo); + break; + case O_WRONLY: + finfo.access = O_WRONLY; + result = ncp_open_create_file_or_subdir(NCP_SERVER(inode), + inode, NULL, OC_MODE_OPEN, + 0, AR_WRITE, &finfo); + break; + } + if (result) { + PPRINTK("ncp_make_open: failed, result=%d\n", result); + goto out_unlock; + } + /* + * Update the inode information. + */ + update: + ncp_update_inode(inode, &finfo); + atomic_set(&NCP_FINFO(inode)->opened, 1); + } + + access = NCP_FINFO(inode)->access; + PPRINTK("ncp_make_open: file open, access=%x\n", access); + if (access == right || access == O_RDWR) { + atomic_inc(&NCP_FINFO(inode)->opened); + error = 0; + } + +out_unlock: + mutex_unlock(&NCP_FINFO(inode)->open_mutex); +out: + return error; +} + +static ssize_t +ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + size_t already_read = 0; + off_t pos; + size_t bufsize; + int error; + void* freepage; + size_t freelen; + + DPRINTK("ncp_file_read: enter %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + pos = *ppos; + + if ((ssize_t) count < 0) { + return -EINVAL; + } + if (!count) + return 0; + if (pos > inode->i_sb->s_maxbytes) + return 0; + if (pos + count > inode->i_sb->s_maxbytes) { + count = inode->i_sb->s_maxbytes - pos; + } + + error = ncp_make_open(inode, O_RDONLY); + if (error) { + DPRINTK(KERN_ERR "ncp_file_read: open failed, error=%d\n", error); + return error; + } + + bufsize = NCP_SERVER(inode)->buffer_size; + + error = -EIO; + freelen = ncp_read_bounce_size(bufsize); + freepage = vmalloc(freelen); + if (!freepage) + goto outrel; + error = 0; + /* First read in as much as possible for each bufsize. 
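+ Each pass reads at most one server buffer (bufsize) worth of data through the bounce page into the user buffer; a short read ends the loop early.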
*/ + while (already_read < count) { + int read_this_time; + size_t to_read = min_t(unsigned int, + bufsize - (pos % bufsize), + count - already_read); + + error = ncp_read_bounce(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + pos, to_read, buf, &read_this_time, + freepage, freelen); + if (error) { + error = -EIO; /* NW errno -> Linux errno */ + break; + } + pos += read_this_time; + buf += read_this_time; + already_read += read_this_time; + + if (read_this_time != to_read) { + break; + } + } + vfree(freepage); + + *ppos = pos; + + file_accessed(file); + + DPRINTK("ncp_file_read: exit %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); +outrel: + ncp_inode_close(inode); + return already_read ? already_read : error; +} + +static ssize_t +ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + size_t already_written = 0; + off_t pos; + size_t bufsize; + int errno; + void* bouncebuffer; + + DPRINTK("ncp_file_write: enter %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + if ((ssize_t) count < 0) + return -EINVAL; + pos = *ppos; + if (file->f_flags & O_APPEND) { + pos = i_size_read(inode); + } + + if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { + if (pos >= MAX_NON_LFS) { + return -EFBIG; + } + if (count > MAX_NON_LFS - (u32)pos) { + count = MAX_NON_LFS - (u32)pos; + } + } + if (pos >= inode->i_sb->s_maxbytes) { + if (count || pos > inode->i_sb->s_maxbytes) { + return -EFBIG; + } + } + if (pos + count > inode->i_sb->s_maxbytes) { + count = inode->i_sb->s_maxbytes - pos; + } + + if (!count) + return 0; + errno = ncp_make_open(inode, O_WRONLY); + if (errno) { + DPRINTK(KERN_ERR "ncp_file_write: open failed, error=%d\n", errno); + return errno; + } + bufsize = NCP_SERVER(inode)->buffer_size; + + already_written = 0; + + bouncebuffer = vmalloc(bufsize); + if (!bouncebuffer) { + errno = -EIO; /* -ENOMEM */ + goto outrel; + } + while (already_written < count) { + int written_this_time; + size_t to_write = min_t(unsigned int, + bufsize - (pos % bufsize), + count - already_written); + + if (copy_from_user(bouncebuffer, buf, to_write)) { + errno = -EFAULT; + break; + } + if (ncp_write_kernel(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + pos, to_write, bouncebuffer, &written_this_time) != 0) { + errno = -EIO; + break; + } + pos += written_this_time; + buf += written_this_time; + already_written += written_this_time; + + if (written_this_time != to_write) { + break; + } + } + vfree(bouncebuffer); + + file_update_time(file); + + *ppos = pos; + + if (pos > i_size_read(inode)) { + mutex_lock(&inode->i_mutex); + if (pos > i_size_read(inode)) + i_size_write(inode, pos); + mutex_unlock(&inode->i_mutex); + } + DPRINTK("ncp_file_write: exit %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); +outrel: + ncp_inode_close(inode); + return already_written ? 
already_written : errno; +} + +static int ncp_release(struct inode *inode, struct file *file) { + if (ncp_make_closed(inode)) { + DPRINTK("ncp_release: failed to close\n"); + } + return 0; +} + +const struct file_operations ncp_file_operations = +{ + .llseek = generic_file_llseek, + .read = ncp_file_read, + .write = ncp_file_write, + .unlocked_ioctl = ncp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ncp_compat_ioctl, +#endif + .mmap = ncp_mmap, + .release = ncp_release, + .fsync = ncp_fsync, +}; + +const struct inode_operations ncp_file_inode_operations = +{ + .setattr = ncp_notify_change, +}; diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c new file mode 100644 index 00000000..0af3349d --- /dev/null +++ b/fs/ncpfs/getopt.c @@ -0,0 +1,74 @@ +/* + * getopt.c + */ + +#include +#include + +#include + +#include "getopt.h" + +/** + * ncp_getopt - option parser + * @caller: name of the caller, for error messages + * @options: the options string + * @opts: an array of &struct option entries controlling parser operations + * @optopt: output; will contain the current option + * @optarg: output; will contain the value (if one exists) + * @value: output; may be NULL; will be overwritten with the integer value + * of the current argument. + * + * Helper to parse options on the format used by mount ("a=b,c=d,e,f"). + * Returns opts->val if a matching entry in the 'opts' array is found, + * 0 when no more tokens are found, -1 if an error is encountered. + */ +int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts, + char **optopt, char **optarg, unsigned long *value) +{ + char *token; + char *val; + + do { + if ((token = strsep(options, ",")) == NULL) + return 0; + } while (*token == '\0'); + if (optopt) + *optopt = token; + + if ((val = strchr (token, '=')) != NULL) { + *val++ = 0; + } + *optarg = val; + for (; opts->name; opts++) { + if (!strcmp(opts->name, token)) { + if (!val) { + if (opts->has_arg & OPT_NOPARAM) { + return opts->val; + } + printk(KERN_INFO "%s: the %s option requires an argument\n", + caller, token); + return -EINVAL; + } + if (opts->has_arg & OPT_INT) { + char* v; + + *value = simple_strtoul(val, &v, 0); + if (!*v) { + return opts->val; + } + printk(KERN_INFO "%s: invalid numeric value in %s=%s\n", + caller, token, val); + return -EDOM; + } + if (opts->has_arg & OPT_STRING) { + return opts->val; + } + printk(KERN_INFO "%s: unexpected argument %s to the %s option\n", + caller, val, token); + return -EINVAL; + } + } + printk(KERN_INFO "%s: Unrecognized mount option %s\n", caller, token); + return -EOPNOTSUPP; +} diff --git a/fs/ncpfs/getopt.h b/fs/ncpfs/getopt.h new file mode 100644 index 00000000..cccc007d --- /dev/null +++ b/fs/ncpfs/getopt.h @@ -0,0 +1,16 @@ +#ifndef _LINUX_GETOPT_H +#define _LINUX_GETOPT_H + +#define OPT_NOPARAM 1 +#define OPT_INT 2 +#define OPT_STRING 4 +struct ncp_option { + const char *name; + unsigned int has_arg; + int val; +}; + +extern int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts, + char **optopt, char **optarg, unsigned long *value); + +#endif /* _LINUX_GETOPT_H */ diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c new file mode 100644 index 00000000..202f3705 --- /dev/null +++ b/fs/ncpfs/inode.c @@ -0,0 +1,1072 @@ +/* + * inode.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. 
Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998 Wolfram Pienkoss for NLS + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ncp_fs.h" +#include "getopt.h" + +#define NCP_DEFAULT_FILE_MODE 0600 +#define NCP_DEFAULT_DIR_MODE 0700 +#define NCP_DEFAULT_TIME_OUT 10 +#define NCP_DEFAULT_RETRY_COUNT 20 + +static void ncp_evict_inode(struct inode *); +static void ncp_put_super(struct super_block *); +static int ncp_statfs(struct dentry *, struct kstatfs *); +static int ncp_show_options(struct seq_file *, struct vfsmount *); + +static struct kmem_cache * ncp_inode_cachep; + +static struct inode *ncp_alloc_inode(struct super_block *sb) +{ + struct ncp_inode_info *ei; + ei = (struct ncp_inode_info *)kmem_cache_alloc(ncp_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void ncp_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); +} + +static void ncp_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ncp_i_callback); +} + +static void init_once(void *foo) +{ + struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; + + mutex_init(&ei->open_mutex); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", + sizeof(struct ncp_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ncp_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ncp_inode_cachep); +} + +static int ncp_remount(struct super_block *sb, int *flags, char* data) +{ + *flags |= MS_NODIRATIME; + return 0; +} + +static const struct super_operations ncp_sops = +{ + .alloc_inode = ncp_alloc_inode, + .destroy_inode = ncp_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = ncp_evict_inode, + .put_super = ncp_put_super, + .statfs = ncp_statfs, + .remount_fs = ncp_remount, + .show_options = ncp_show_options, +}; + +/* + * Fill in the ncpfs-specific information in the inode. + */ +static void ncp_update_dirent(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + NCP_FINFO(inode)->DosDirNum = nwinfo->i.DosDirNum; + NCP_FINFO(inode)->dirEntNum = nwinfo->i.dirEntNum; + NCP_FINFO(inode)->volNumber = nwinfo->volume; +} + +void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + ncp_update_dirent(inode, nwinfo); + NCP_FINFO(inode)->nwattr = nwinfo->i.attributes; + NCP_FINFO(inode)->access = nwinfo->access; + memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle, + sizeof(nwinfo->file_handle)); + DPRINTK("ncp_update_inode: updated %s, volnum=%d, dirent=%u\n", + nwinfo->i.entryName, NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum); +} + +static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi) +{ + /* NFS namespace mode overrides others if it's set. */ + DPRINTK(KERN_DEBUG "ncp_update_dates_and_mode: (%s) nfs.mode=0%o\n", + nwi->entryName, nwi->nfs.mode); + if (nwi->nfs.mode) { + /* XXX Security? 
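+ The server-supplied mode replaces the file_mode/dir_mode defaults applied by ncp_update_attrs().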
*/ + inode->i_mode = nwi->nfs.mode; + } + + inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT; + + inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate); + inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate); + inode->i_atime.tv_sec = ncp_date_dos2unix(0, nwi->lastAccessDate); + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; +} + +static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + struct nw_info_struct *nwi = &nwinfo->i; + struct ncp_server *server = NCP_SERVER(inode); + + if (nwi->attributes & aDIR) { + inode->i_mode = server->m.dir_mode; + /* for directories dataStreamSize seems to be some + Object ID ??? */ + i_size_write(inode, NCP_BLOCK_SIZE); + } else { + u32 size; + + inode->i_mode = server->m.file_mode; + size = le32_to_cpu(nwi->dataStreamSize); + i_size_write(inode, size); +#ifdef CONFIG_NCPFS_EXTRAS + if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) + && (nwi->attributes & aSHARED)) { + switch (nwi->attributes & (aHIDDEN|aSYSTEM)) { + case aHIDDEN: + if (server->m.flags & NCP_MOUNT_SYMLINKS) { + if (/* (size >= NCP_MIN_SYMLINK_SIZE) + && */ (size <= NCP_MAX_SYMLINK_SIZE)) { + inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK; + NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK; + break; + } + } + /* FALLTHROUGH */ + case 0: + if (server->m.flags & NCP_MOUNT_EXTRAS) + inode->i_mode |= S_IRUGO; + break; + case aSYSTEM: + if (server->m.flags & NCP_MOUNT_EXTRAS) + inode->i_mode |= (inode->i_mode >> 2) & S_IXUGO; + break; + /* case aSYSTEM|aHIDDEN: */ + default: + /* reserved combination */ + break; + } + } +#endif + } + if (nwi->attributes & aRONLY) inode->i_mode &= ~S_IWUGO; +} + +void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo) +{ + NCP_FINFO(inode)->flags = 0; + if (!atomic_read(&NCP_FINFO(inode)->opened)) { + NCP_FINFO(inode)->nwattr = nwinfo->i.attributes; + ncp_update_attrs(inode, nwinfo); + } + + ncp_update_dates(inode, &nwinfo->i); + ncp_update_dirent(inode, nwinfo); +} + +/* + * Fill in the inode based on the ncp_entry_info structure. Used only for brand new inodes. + */ +static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) +{ + struct ncp_server *server = NCP_SERVER(inode); + + NCP_FINFO(inode)->flags = 0; + + ncp_update_attrs(inode, nwinfo); + + DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); + + inode->i_nlink = 1; + inode->i_uid = server->m.uid; + inode->i_gid = server->m.gid; + + ncp_update_dates(inode, &nwinfo->i); + ncp_update_inode(inode, nwinfo); +} + +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +static const struct inode_operations ncp_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ncp_notify_change, +}; +#endif + +/* + * Get a new inode. 
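+ * Allocates a fresh VFS inode, fills it from the ncp_entry_info and picks + * i_op/i_fop by file type (regular, directory, special node or symlink); + * anything else is marked as a bad inode.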
+ */ +struct inode * +ncp_iget(struct super_block *sb, struct ncp_entry_info *info) +{ + struct inode *inode; + + if (info == NULL) { + printk(KERN_ERR "ncp_iget: info is NULL\n"); + return NULL; + } + + inode = new_inode(sb); + if (inode) { + atomic_set(&NCP_FINFO(inode)->opened, info->opened); + + inode->i_mapping->backing_dev_info = sb->s_bdi; + inode->i_ino = info->ino; + ncp_set_attr(inode, info); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ncp_file_inode_operations; + inode->i_fop = &ncp_file_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ncp_dir_inode_operations; + inode->i_fop = &ncp_dir_operations; +#ifdef CONFIG_NCPFS_NFS_NS + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, + new_decode_dev(info->i.nfs.rdev)); +#endif +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ncp_symlink_inode_operations; + inode->i_data.a_ops = &ncp_symlink_aops; +#endif + } else { + make_bad_inode(inode); + } + insert_inode_hash(inode); + } else + printk(KERN_ERR "ncp_iget: iget failed!\n"); + return inode; +} + +static void +ncp_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + if (S_ISDIR(inode->i_mode)) { + DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino); + } + + if (ncp_make_closed(inode) != 0) { + /* We can't do anything but complain. */ + printk(KERN_ERR "ncp_evict_inode: could not close\n"); + } +} + +static void ncp_stop_tasks(struct ncp_server *server) { + struct sock* sk = server->ncp_sock->sk; + + lock_sock(sk); + sk->sk_error_report = server->error_report; + sk->sk_data_ready = server->data_ready; + sk->sk_write_space = server->write_space; + release_sock(sk); + del_timer_sync(&server->timeout_tm); + + flush_work_sync(&server->rcv.tq); + if (sk->sk_socket->type == SOCK_STREAM) + flush_work_sync(&server->tx.tq); + else + flush_work_sync(&server->timeout_tq); +} + +static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct ncp_server *server = NCP_SBP(mnt->mnt_sb); + unsigned int tmp; + + if (server->m.uid != 0) + seq_printf(seq, ",uid=%u", server->m.uid); + if (server->m.gid != 0) + seq_printf(seq, ",gid=%u", server->m.gid); + if (server->m.mounted_uid != 0) + seq_printf(seq, ",owner=%u", server->m.mounted_uid); + tmp = server->m.file_mode & S_IALLUGO; + if (tmp != NCP_DEFAULT_FILE_MODE) + seq_printf(seq, ",mode=0%o", tmp); + tmp = server->m.dir_mode & S_IALLUGO; + if (tmp != NCP_DEFAULT_DIR_MODE) + seq_printf(seq, ",dirmode=0%o", tmp); + if (server->m.time_out != NCP_DEFAULT_TIME_OUT * HZ / 100) { + tmp = server->m.time_out * 100 / HZ; + seq_printf(seq, ",timeout=%u", tmp); + } + if (server->m.retry_count != NCP_DEFAULT_RETRY_COUNT) + seq_printf(seq, ",retry=%u", server->m.retry_count); + if (server->m.flags != 0) + seq_printf(seq, ",flags=%lu", server->m.flags); + if (server->m.wdog_pid != NULL) + seq_printf(seq, ",wdogpid=%u", pid_vnr(server->m.wdog_pid)); + + return 0; +} + +static const struct ncp_option ncp_opts[] = { + { "uid", OPT_INT, 'u' }, + { "gid", OPT_INT, 'g' }, + { "owner", OPT_INT, 'o' }, + { "mode", OPT_INT, 'm' }, + { "dirmode", OPT_INT, 'd' }, + { "timeout", OPT_INT, 't' }, + { "retry", OPT_INT, 'r' }, + { "flags", OPT_INT, 'f' }, + { "wdogpid", OPT_INT, 'w' }, + { "ncpfd", OPT_INT, 'n' }, + { "infofd", OPT_INT, 'i' }, /* v5 */ + { "version", OPT_INT, 'v' }, + { NULL, 0, 
0 } }; + +static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) { + int optval; + char *optarg; + unsigned long optint; + int version = 0; + int ret; + + data->flags = 0; + data->int_flags = 0; + data->mounted_uid = 0; + data->wdog_pid = NULL; + data->ncp_fd = ~0; + data->time_out = NCP_DEFAULT_TIME_OUT; + data->retry_count = NCP_DEFAULT_RETRY_COUNT; + data->uid = 0; + data->gid = 0; + data->file_mode = NCP_DEFAULT_FILE_MODE; + data->dir_mode = NCP_DEFAULT_DIR_MODE; + data->info_fd = -1; + data->mounted_vol[0] = 0; + + while ((optval = ncp_getopt("ncpfs", &options, ncp_opts, NULL, &optarg, &optint)) != 0) { + ret = optval; + if (ret < 0) + goto err; + switch (optval) { + case 'u': + data->uid = optint; + break; + case 'g': + data->gid = optint; + break; + case 'o': + data->mounted_uid = optint; + break; + case 'm': + data->file_mode = optint; + break; + case 'd': + data->dir_mode = optint; + break; + case 't': + data->time_out = optint; + break; + case 'r': + data->retry_count = optint; + break; + case 'f': + data->flags = optint; + break; + case 'w': + data->wdog_pid = find_get_pid(optint); + break; + case 'n': + data->ncp_fd = optint; + break; + case 'i': + data->info_fd = optint; + break; + case 'v': + ret = -ECHRNG; + if (optint < NCP_MOUNT_VERSION_V4) + goto err; + if (optint > NCP_MOUNT_VERSION_V5) + goto err; + version = optint; + break; + + } + } + return 0; +err: + put_pid(data->wdog_pid); + data->wdog_pid = NULL; + return ret; +} + +static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) +{ + struct ncp_mount_data_kernel data; + struct ncp_server *server; + struct file *ncp_filp; + struct inode *root_inode; + struct inode *sock_inode; + struct socket *sock; + int error; + int default_bufsize; +#ifdef CONFIG_NCPFS_PACKET_SIGNING + int options; +#endif + struct ncp_entry_info finfo; + + memset(&data, 0, sizeof(data)); + server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); + if (!server) + return -ENOMEM; + sb->s_fs_info = server; + + error = -EFAULT; + if (raw_data == NULL) + goto out; + switch (*(int*)raw_data) { + case NCP_MOUNT_VERSION: + { + struct ncp_mount_data* md = (struct ncp_mount_data*)raw_data; + + data.flags = md->flags; + data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE; + data.mounted_uid = md->mounted_uid; + data.wdog_pid = find_get_pid(md->wdog_pid); + data.ncp_fd = md->ncp_fd; + data.time_out = md->time_out; + data.retry_count = md->retry_count; + data.uid = md->uid; + data.gid = md->gid; + data.file_mode = md->file_mode; + data.dir_mode = md->dir_mode; + data.info_fd = -1; + memcpy(data.mounted_vol, md->mounted_vol, + NCP_VOLNAME_LEN+1); + } + break; + case NCP_MOUNT_VERSION_V4: + { + struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; + + data.flags = md->flags; + data.mounted_uid = md->mounted_uid; + data.wdog_pid = find_get_pid(md->wdog_pid); + data.ncp_fd = md->ncp_fd; + data.time_out = md->time_out; + data.retry_count = md->retry_count; + data.uid = md->uid; + data.gid = md->gid; + data.file_mode = md->file_mode; + data.dir_mode = md->dir_mode; + data.info_fd = -1; + } + break; + default: + error = -ECHRNG; + if (memcmp(raw_data, "vers", 4) == 0) { + error = ncp_parse_options(&data, raw_data); + } + if (error) + goto out; + break; + } + error = -EBADF; + ncp_filp = fget(data.ncp_fd); + if (!ncp_filp) + goto out; + error = -ENOTSOCK; + sock_inode = ncp_filp->f_path.dentry->d_inode; + if (!S_ISSOCK(sock_inode->i_mode)) + goto out_fput; + sock = SOCKET_I(sock_inode); + if (!sock) + goto out_fput; 
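+ /* Starting point for the buffer-size negotiation below: 0xF000 bytes on stream (TCP) sockets, 1024 bytes on datagram sockets. */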
+ + if (sock->type == SOCK_STREAM) + default_bufsize = 0xF000; + else + default_bufsize = 1024; + + sb->s_flags |= MS_NODIRATIME; /* probably even noatime */ + sb->s_maxbytes = 0xFFFFFFFFU; + sb->s_blocksize = 1024; /* Eh... Is this correct? */ + sb->s_blocksize_bits = 10; + sb->s_magic = NCP_SUPER_MAGIC; + sb->s_op = &ncp_sops; + sb->s_d_op = &ncp_dentry_operations; + sb->s_bdi = &server->bdi; + + server = NCP_SBP(sb); + memset(server, 0, sizeof(*server)); + + error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); + if (error) + goto out_bdi; + + server->ncp_filp = ncp_filp; + server->ncp_sock = sock; + + if (data.info_fd != -1) { + struct socket *info_sock; + + error = -EBADF; + server->info_filp = fget(data.info_fd); + if (!server->info_filp) + goto out_fput; + error = -ENOTSOCK; + sock_inode = server->info_filp->f_path.dentry->d_inode; + if (!S_ISSOCK(sock_inode->i_mode)) + goto out_fput2; + info_sock = SOCKET_I(sock_inode); + if (!info_sock) + goto out_fput2; + error = -EBADFD; + if (info_sock->type != SOCK_STREAM) + goto out_fput2; + server->info_sock = info_sock; + } + +/* server->lock = 0; */ + mutex_init(&server->mutex); + server->packet = NULL; +/* server->buffer_size = 0; */ +/* server->conn_status = 0; */ +/* server->root_dentry = NULL; */ +/* server->root_setuped = 0; */ + mutex_init(&server->root_setup_lock); +#ifdef CONFIG_NCPFS_PACKET_SIGNING +/* server->sign_wanted = 0; */ +/* server->sign_active = 0; */ +#endif + init_rwsem(&server->auth_rwsem); + server->auth.auth_type = NCP_AUTH_NONE; +/* server->auth.object_name_len = 0; */ +/* server->auth.object_name = NULL; */ +/* server->auth.object_type = 0; */ +/* server->priv.len = 0; */ +/* server->priv.data = NULL; */ + + server->m = data; + /* Although anything producing this is buggy, it happens + now because of PATH_MAX changes.. 
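+ The timeout option is given in 1/100ths of a second; values below 1 are clamped to 10 before the conversion to jiffies below.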
*/ + if (server->m.time_out < 1) { + server->m.time_out = 10; + printk(KERN_INFO "You need to recompile your ncpfs utils..\n"); + } + server->m.time_out = server->m.time_out * HZ / 100; + server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG; + server->m.dir_mode = (server->m.dir_mode & S_IRWXUGO) | S_IFDIR; + +#ifdef CONFIG_NCPFS_NLS + /* load the default NLS charsets */ + server->nls_vol = load_nls_default(); + server->nls_io = load_nls_default(); +#endif /* CONFIG_NCPFS_NLS */ + + atomic_set(&server->dentry_ttl, 0); /* no caching */ + + INIT_LIST_HEAD(&server->tx.requests); + mutex_init(&server->rcv.creq_mutex); + server->tx.creq = NULL; + server->rcv.creq = NULL; + + init_timer(&server->timeout_tm); +#undef NCP_PACKET_SIZE +#define NCP_PACKET_SIZE 131072 + error = -ENOMEM; + server->packet_size = NCP_PACKET_SIZE; + server->packet = vmalloc(NCP_PACKET_SIZE); + if (server->packet == NULL) + goto out_nls; + server->txbuf = vmalloc(NCP_PACKET_SIZE); + if (server->txbuf == NULL) + goto out_packet; + server->rxbuf = vmalloc(NCP_PACKET_SIZE); + if (server->rxbuf == NULL) + goto out_txbuf; + + lock_sock(sock->sk); + server->data_ready = sock->sk->sk_data_ready; + server->write_space = sock->sk->sk_write_space; + server->error_report = sock->sk->sk_error_report; + sock->sk->sk_user_data = server; + sock->sk->sk_data_ready = ncp_tcp_data_ready; + sock->sk->sk_error_report = ncp_tcp_error_report; + if (sock->type == SOCK_STREAM) { + server->rcv.ptr = (unsigned char*)&server->rcv.buf; + server->rcv.len = 10; + server->rcv.state = 0; + INIT_WORK(&server->rcv.tq, ncp_tcp_rcv_proc); + INIT_WORK(&server->tx.tq, ncp_tcp_tx_proc); + sock->sk->sk_write_space = ncp_tcp_write_space; + } else { + INIT_WORK(&server->rcv.tq, ncpdgram_rcv_proc); + INIT_WORK(&server->timeout_tq, ncpdgram_timeout_proc); + server->timeout_tm.data = (unsigned long)server; + server->timeout_tm.function = ncpdgram_timeout_call; + } + release_sock(sock->sk); + + ncp_lock_server(server); + error = ncp_connect(server); + ncp_unlock_server(server); + if (error < 0) + goto out_rxbuf; + DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb)); + + error = -EMSGSIZE; /* -EREMOTESIDEINCOMPATIBLE */ +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (ncp_negotiate_size_and_options(server, default_bufsize, + NCP_DEFAULT_OPTIONS, &(server->buffer_size), &options) == 0) + { + if (options != NCP_DEFAULT_OPTIONS) + { + if (ncp_negotiate_size_and_options(server, + default_bufsize, + options & 2, + &(server->buffer_size), &options) != 0) + + { + goto out_disconnect; + } + } + ncp_lock_server(server); + if (options & 2) + server->sign_wanted = 1; + ncp_unlock_server(server); + } + else +#endif /* CONFIG_NCPFS_PACKET_SIGNING */ + if (ncp_negotiate_buffersize(server, default_bufsize, + &(server->buffer_size)) != 0) + goto out_disconnect; + DPRINTK("ncpfs: bufsize = %d\n", server->buffer_size); + + memset(&finfo, 0, sizeof(finfo)); + finfo.i.attributes = aDIR; + finfo.i.dataStreamSize = 0; /* ignored */ + finfo.i.dirEntNum = 0; + finfo.i.DosDirNum = 0; +#ifdef CONFIG_NCPFS_SMALLDOS + finfo.i.NSCreator = NW_NS_DOS; +#endif + finfo.volume = NCP_NUMBER_OF_VOLUMES; + /* set dates of mountpoint to Jan 1, 1986; 00:00 */ + finfo.i.creationTime = finfo.i.modifyTime + = cpu_to_le16(0x0000); + finfo.i.creationDate = finfo.i.modifyDate + = finfo.i.lastAccessDate + = cpu_to_le16(0x0C21); + finfo.i.nameLen = 0; + finfo.i.entryName[0] = '\0'; + + finfo.opened = 0; + finfo.ino = 2; /* tradition */ + + server->name_space[finfo.volume] = NW_NS_DOS; + + error 
= -ENOMEM; + root_inode = ncp_iget(sb, &finfo); + if (!root_inode) + goto out_disconnect; + DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber); + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto out_no_root; + return 0; + +out_no_root: + iput(root_inode); +out_disconnect: + ncp_lock_server(server); + ncp_disconnect(server); + ncp_unlock_server(server); +out_rxbuf: + ncp_stop_tasks(server); + vfree(server->rxbuf); +out_txbuf: + vfree(server->txbuf); +out_packet: + vfree(server->packet); +out_nls: +#ifdef CONFIG_NCPFS_NLS + unload_nls(server->nls_io); + unload_nls(server->nls_vol); +#endif + mutex_destroy(&server->rcv.creq_mutex); + mutex_destroy(&server->root_setup_lock); + mutex_destroy(&server->mutex); +out_fput2: + if (server->info_filp) + fput(server->info_filp); +out_fput: + bdi_destroy(&server->bdi); +out_bdi: + /* 23/12/1998 Marcin Dalecki : + * + * The previously used put_filp(ncp_filp); was bogus, since + * it doesn't perform proper unlocking. + */ + fput(ncp_filp); +out: + put_pid(data.wdog_pid); + sb->s_fs_info = NULL; + kfree(server); + return error; +} + +static void ncp_put_super(struct super_block *sb) +{ + struct ncp_server *server = NCP_SBP(sb); + + ncp_lock_server(server); + ncp_disconnect(server); + ncp_unlock_server(server); + + ncp_stop_tasks(server); + +#ifdef CONFIG_NCPFS_NLS + /* unload the NLS charsets */ + unload_nls(server->nls_vol); + unload_nls(server->nls_io); +#endif /* CONFIG_NCPFS_NLS */ + mutex_destroy(&server->rcv.creq_mutex); + mutex_destroy(&server->root_setup_lock); + mutex_destroy(&server->mutex); + + if (server->info_filp) + fput(server->info_filp); + fput(server->ncp_filp); + kill_pid(server->m.wdog_pid, SIGTERM, 1); + put_pid(server->m.wdog_pid); + + bdi_destroy(&server->bdi); + kfree(server->priv.data); + kfree(server->auth.object_name); + vfree(server->rxbuf); + vfree(server->txbuf); + vfree(server->packet); + sb->s_fs_info = NULL; + kfree(server); +} + +static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct dentry* d; + struct inode* i; + struct ncp_inode_info* ni; + struct ncp_server* s; + struct ncp_volume_info vi; + struct super_block *sb = dentry->d_sb; + int err; + __u8 dh; + + d = sb->s_root; + if (!d) { + goto dflt; + } + i = d->d_inode; + if (!i) { + goto dflt; + } + ni = NCP_FINFO(i); + if (!ni) { + goto dflt; + } + s = NCP_SBP(sb); + if (!s) { + goto dflt; + } + if (!s->m.mounted_vol[0]) { + goto dflt; + } + + err = ncp_dirhandle_alloc(s, ni->volNumber, ni->DosDirNum, &dh); + if (err) { + goto dflt; + } + err = ncp_get_directory_info(s, dh, &vi); + ncp_dirhandle_free(s, dh); + if (err) { + goto dflt; + } + buf->f_type = NCP_SUPER_MAGIC; + buf->f_bsize = vi.sectors_per_block * 512; + buf->f_blocks = vi.total_blocks; + buf->f_bfree = vi.free_blocks; + buf->f_bavail = vi.free_blocks; + buf->f_files = vi.total_dir_entries; + buf->f_ffree = vi.available_dir_entries; + buf->f_namelen = 12; + return 0; + + /* We cannot say how much disk space is left on a mounted + NetWare Server, because free space is distributed over + volumes, and the current user might have disk quotas. So + free space is not that simple to determine. Our decision + here is to err conservatively. 
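+ The fallback below therefore reports zero blocks; only the magic, block size and name length remain meaningful.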
*/ + +dflt:; + buf->f_type = NCP_SUPER_MAGIC; + buf->f_bsize = NCP_BLOCK_SIZE; + buf->f_blocks = 0; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_namelen = 12; + return 0; +} + +int ncp_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int result = 0; + __le32 info_mask; + struct nw_modify_dos_info info; + struct ncp_server *server; + + result = -EIO; + + server = NCP_SERVER(inode); + if (!server) /* How this could happen? */ + goto out; + + /* ageing the dentry to force validation */ + ncp_age_dentry(server, dentry); + + result = inode_change_ok(inode, attr); + if (result < 0) + goto out; + + result = -EPERM; + if (((attr->ia_valid & ATTR_UID) && + (attr->ia_uid != server->m.uid))) + goto out; + + if (((attr->ia_valid & ATTR_GID) && + (attr->ia_gid != server->m.gid))) + goto out; + + if (((attr->ia_valid & ATTR_MODE) && + (attr->ia_mode & + ~(S_IFREG | S_IFDIR | S_IRWXUGO)))) + goto out; + + info_mask = 0; + memset(&info, 0, sizeof(info)); + +#if 1 + if ((attr->ia_valid & ATTR_MODE) != 0) + { + umode_t newmode = attr->ia_mode; + + info_mask |= DM_ATTRIBUTES; + + if (S_ISDIR(inode->i_mode)) { + newmode &= server->m.dir_mode; + } else { +#ifdef CONFIG_NCPFS_EXTRAS + if (server->m.flags & NCP_MOUNT_EXTRAS) { + /* any non-default execute bit set */ + if (newmode & ~server->m.file_mode & S_IXUGO) + info.attributes |= aSHARED | aSYSTEM; + /* read for group/world and not in default file_mode */ + else if (newmode & ~server->m.file_mode & S_IRUGO) + info.attributes |= aSHARED; + } else +#endif + newmode &= server->m.file_mode; + } + if (newmode & S_IWUGO) + info.attributes &= ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + else + info.attributes |= (aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); + +#ifdef CONFIG_NCPFS_NFS_NS + if (ncp_is_nfs_extras(server, NCP_FINFO(inode)->volNumber)) { + result = ncp_modify_nfs_info(server, + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum, + attr->ia_mode, 0); + if (result != 0) + goto out; + info.attributes &= ~(aSHARED | aSYSTEM); + { + /* mark partial success */ + struct iattr tmpattr; + + tmpattr.ia_valid = ATTR_MODE; + tmpattr.ia_mode = attr->ia_mode; + + setattr_copy(inode, &tmpattr); + mark_inode_dirty(inode); + } + } +#endif + } +#endif + + /* Do SIZE before attributes, otherwise mtime together with size does not work... 
+ */ + if ((attr->ia_valid & ATTR_SIZE) != 0) { + int written; + + DPRINTK("ncpfs: trying to change size to %ld\n", + attr->ia_size); + + if ((result = ncp_make_open(inode, O_WRONLY)) < 0) { + result = -EACCES; + goto out; + } + ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, + attr->ia_size, 0, "", &written); + + /* According to ndir, the changes only take effect after + closing the file */ + ncp_inode_close(inode); + result = ncp_make_closed(inode); + if (result) + goto out; + + if (attr->ia_size != i_size_read(inode)) { + result = vmtruncate(inode, attr->ia_size); + if (result) + goto out; + mark_inode_dirty(inode); + } + } + if ((attr->ia_valid & ATTR_CTIME) != 0) { + info_mask |= (DM_CREATE_TIME | DM_CREATE_DATE); + ncp_date_unix2dos(attr->ia_ctime.tv_sec, + &info.creationTime, &info.creationDate); + } + if ((attr->ia_valid & ATTR_MTIME) != 0) { + info_mask |= (DM_MODIFY_TIME | DM_MODIFY_DATE); + ncp_date_unix2dos(attr->ia_mtime.tv_sec, + &info.modifyTime, &info.modifyDate); + } + if ((attr->ia_valid & ATTR_ATIME) != 0) { + __le16 dummy; + info_mask |= (DM_LAST_ACCESS_DATE); + ncp_date_unix2dos(attr->ia_atime.tv_sec, + &dummy, &info.lastAccessDate); + } + if (info_mask != 0) { + result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode), + inode, info_mask, &info); + if (result != 0) { + if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) { + /* NetWare seems not to allow this. I + do not know why. So, just tell the + user everything went fine. This is + a terrible hack, but I do not know + how to do this correctly. */ + result = 0; + } else + goto out; + } +#ifdef CONFIG_NCPFS_STRONG + if ((!result) && (info_mask & DM_ATTRIBUTES)) + NCP_FINFO(inode)->nwattr = info.attributes; +#endif + } + if (result) + goto out; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + +out: + if (result > 0) + result = -EACCES; + return result; +} + +static struct dentry *ncp_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, ncp_fill_super); +} + +static struct file_system_type ncp_fs_type = { + .owner = THIS_MODULE, + .name = "ncpfs", + .mount = ncp_mount, + .kill_sb = kill_anon_super, + .fs_flags = FS_BINARY_MOUNTDATA, +}; + +static int __init init_ncp_fs(void) +{ + int err; + DPRINTK("ncpfs: init_ncp_fs called\n"); + + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ncp_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_ncp_fs(void) +{ + DPRINTK("ncpfs: exit_ncp_fs called\n"); + unregister_filesystem(&ncp_fs_type); + destroy_inodecache(); +} + +module_init(init_ncp_fs) +module_exit(exit_ncp_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c new file mode 100644 index 00000000..790e92a9 --- /dev/null +++ b/fs/ncpfs/ioctl.c @@ -0,0 +1,918 @@ +/* + * ioctl.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998, 1999 Wolfram Pienkoss for NLS + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ncp_fs.h" + +/* maximum limit for ncp_objectname_ioctl */ +#define NCP_OBJECT_NAME_MAX_LEN 4096 +/* maximum limit for ncp_privatedata_ioctl */ +#define NCP_PRIVATE_DATA_MAX_LEN 8192 +/* maximum negotiable packet size */ +#define NCP_PACKET_SIZE_INTERNAL 65536 + +static int 
+ncp_get_fs_info(struct ncp_server * server, struct inode *inode, + struct ncp_fs_info __user *arg) +{ + struct ncp_fs_info info; + + if (copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + if (info.version != NCP_GET_FS_INFO_VERSION) { + DPRINTK("info.version invalid: %d\n", info.version); + return -EINVAL; + } + /* TODO: info.addr = server->m.serv_addr; */ + SET_UID(info.mounted_uid, server->m.mounted_uid); + info.connection = server->connection; + info.buffer_size = server->buffer_size; + info.volume_number = NCP_FINFO(inode)->volNumber; + info.directory_id = NCP_FINFO(inode)->DosDirNum; + + if (copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static int +ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode, + struct ncp_fs_info_v2 __user * arg) +{ + struct ncp_fs_info_v2 info2; + + if (copy_from_user(&info2, arg, sizeof(info2))) + return -EFAULT; + + if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { + DPRINTK("info.version invalid: %d\n", info2.version); + return -EINVAL; + } + info2.mounted_uid = server->m.mounted_uid; + info2.connection = server->connection; + info2.buffer_size = server->buffer_size; + info2.volume_number = NCP_FINFO(inode)->volNumber; + info2.directory_id = NCP_FINFO(inode)->DosDirNum; + info2.dummy1 = info2.dummy2 = info2.dummy3 = 0; + + if (copy_to_user(arg, &info2, sizeof(info2))) + return -EFAULT; + return 0; +} + +#ifdef CONFIG_COMPAT +struct compat_ncp_objectname_ioctl +{ + s32 auth_type; + u32 object_name_len; + compat_caddr_t object_name; /* a userspace data, in most cases user name */ +}; + +struct compat_ncp_fs_info_v2 { + s32 version; + u32 mounted_uid; + u32 connection; + u32 buffer_size; + + u32 volume_number; + u32 directory_id; + + u32 dummy1; + u32 dummy2; + u32 dummy3; +}; + +struct compat_ncp_ioctl_request { + u32 function; + u32 size; + compat_caddr_t data; +}; + +struct compat_ncp_privatedata_ioctl +{ + u32 len; + compat_caddr_t data; /* ~1000 for NDS */ +}; + +#define NCP_IOC_GET_FS_INFO_V2_32 _IOWR('n', 4, struct compat_ncp_fs_info_v2) +#define NCP_IOC_NCPREQUEST_32 _IOR('n', 1, struct compat_ncp_ioctl_request) +#define NCP_IOC_GETOBJECTNAME_32 _IOWR('n', 9, struct compat_ncp_objectname_ioctl) +#define NCP_IOC_SETOBJECTNAME_32 _IOR('n', 9, struct compat_ncp_objectname_ioctl) +#define NCP_IOC_GETPRIVATEDATA_32 _IOWR('n', 10, struct compat_ncp_privatedata_ioctl) +#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl) + +static int +ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode, + struct compat_ncp_fs_info_v2 __user * arg) +{ + struct compat_ncp_fs_info_v2 info2; + + if (copy_from_user(&info2, arg, sizeof(info2))) + return -EFAULT; + + if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { + DPRINTK("info.version invalid: %d\n", info2.version); + return -EINVAL; + } + info2.mounted_uid = server->m.mounted_uid; + info2.connection = server->connection; + info2.buffer_size = server->buffer_size; + info2.volume_number = NCP_FINFO(inode)->volNumber; + info2.directory_id = NCP_FINFO(inode)->DosDirNum; + info2.dummy1 = info2.dummy2 = info2.dummy3 = 0; + + if (copy_to_user(arg, &info2, sizeof(info2))) + return -EFAULT; + return 0; +} +#endif + +#define NCP_IOC_GETMOUNTUID16 _IOW('n', 2, u16) +#define NCP_IOC_GETMOUNTUID32 _IOW('n', 2, u32) +#define NCP_IOC_GETMOUNTUID64 _IOW('n', 2, u64) + +#ifdef CONFIG_NCPFS_NLS +/* Here we are select the iocharset and the codepage for NLS. + * Thanks Petr Vandrovec for idea and many hints. 
+ */ +static int +ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) +{ + struct ncp_nls_ioctl user; + struct nls_table *codepage; + struct nls_table *iocharset; + struct nls_table *oldset_io; + struct nls_table *oldset_cp; + int utf8; + int err; + + if (copy_from_user(&user, arg, sizeof(user))) + return -EFAULT; + + codepage = NULL; + user.codepage[NCP_IOCSNAME_LEN] = 0; + if (!user.codepage[0] || !strcmp(user.codepage, "default")) + codepage = load_nls_default(); + else { + codepage = load_nls(user.codepage); + if (!codepage) { + return -EBADRQC; + } + } + + iocharset = NULL; + user.iocharset[NCP_IOCSNAME_LEN] = 0; + if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) { + iocharset = load_nls_default(); + utf8 = 0; + } else if (!strcmp(user.iocharset, "utf8")) { + iocharset = load_nls_default(); + utf8 = 1; + } else { + iocharset = load_nls(user.iocharset); + if (!iocharset) { + unload_nls(codepage); + return -EBADRQC; + } + utf8 = 0; + } + + mutex_lock(&server->root_setup_lock); + if (server->root_setuped) { + oldset_cp = codepage; + oldset_io = iocharset; + err = -EBUSY; + } else { + if (utf8) + NCP_SET_FLAG(server, NCP_FLAG_UTF8); + else + NCP_CLR_FLAG(server, NCP_FLAG_UTF8); + oldset_cp = server->nls_vol; + server->nls_vol = codepage; + oldset_io = server->nls_io; + server->nls_io = iocharset; + err = 0; + } + mutex_unlock(&server->root_setup_lock); + unload_nls(oldset_cp); + unload_nls(oldset_io); + + return err; +} + +static int +ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) +{ + struct ncp_nls_ioctl user; + int len; + + memset(&user, 0, sizeof(user)); + mutex_lock(&server->root_setup_lock); + if (server->nls_vol && server->nls_vol->charset) { + len = strlen(server->nls_vol->charset); + if (len > NCP_IOCSNAME_LEN) + len = NCP_IOCSNAME_LEN; + strncpy(user.codepage, server->nls_vol->charset, len); + user.codepage[len] = 0; + } + + if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) + strcpy(user.iocharset, "utf8"); + else if (server->nls_io && server->nls_io->charset) { + len = strlen(server->nls_io->charset); + if (len > NCP_IOCSNAME_LEN) + len = NCP_IOCSNAME_LEN; + strncpy(user.iocharset, server->nls_io->charset, len); + user.iocharset[len] = 0; + } + mutex_unlock(&server->root_setup_lock); + + if (copy_to_user(arg, &user, sizeof(user))) + return -EFAULT; + return 0; +} +#endif /* CONFIG_NCPFS_NLS */ + +static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ncp_server *server = NCP_SERVER(inode); + int result; + struct ncp_ioctl_request request; + char* bouncebuffer; + void __user *argp = (void __user *)arg; + + switch (cmd) { +#ifdef CONFIG_COMPAT + case NCP_IOC_NCPREQUEST_32: +#endif + case NCP_IOC_NCPREQUEST: +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_NCPREQUEST_32) { + struct compat_ncp_ioctl_request request32; + if (copy_from_user(&request32, argp, sizeof(request32))) + return -EFAULT; + request.function = request32.function; + request.size = request32.size; + request.data = compat_ptr(request32.data); + } else +#endif + if (copy_from_user(&request, argp, sizeof(request))) + return -EFAULT; + + if ((request.function > 255) + || (request.size > + NCP_PACKET_SIZE - sizeof(struct ncp_request_header))) { + return -EINVAL; + } + bouncebuffer = vmalloc(NCP_PACKET_SIZE_INTERNAL); + if (!bouncebuffer) + return -ENOMEM; + if (copy_from_user(bouncebuffer, request.data, request.size)) { + vfree(bouncebuffer); + return -EFAULT; + } + ncp_lock_server(server); + + /* FIXME: We hack around in the 
server's structures + here to be able to use ncp_request */ + + server->has_subfunction = 0; + server->current_size = request.size; + memcpy(server->packet, bouncebuffer, request.size); + + result = ncp_request2(server, request.function, + bouncebuffer, NCP_PACKET_SIZE_INTERNAL); + if (result < 0) + result = -EIO; + else + result = server->reply_size; + ncp_unlock_server(server); + DPRINTK("ncp_ioctl: copy %d bytes\n", + result); + if (result >= 0) + if (copy_to_user(request.data, bouncebuffer, result)) + result = -EFAULT; + vfree(bouncebuffer); + return result; + + case NCP_IOC_CONN_LOGGED_IN: + + if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE)) + return -EINVAL; + mutex_lock(&server->root_setup_lock); + if (server->root_setuped) + result = -EBUSY; + else { + result = ncp_conn_logged_in(inode->i_sb); + if (result == 0) + server->root_setuped = 1; + } + mutex_unlock(&server->root_setup_lock); + return result; + + case NCP_IOC_GET_FS_INFO: + return ncp_get_fs_info(server, inode, argp); + + case NCP_IOC_GET_FS_INFO_V2: + return ncp_get_fs_info_v2(server, inode, argp); + +#ifdef CONFIG_COMPAT + case NCP_IOC_GET_FS_INFO_V2_32: + return ncp_get_compat_fs_info_v2(server, inode, argp); +#endif + /* we have too many combinations of CONFIG_COMPAT, + * CONFIG_64BIT and CONFIG_UID16, so just handle + * any of the possible ioctls */ + case NCP_IOC_GETMOUNTUID16: + { + u16 uid; + + SET_UID(uid, server->m.mounted_uid); + if (put_user(uid, (u16 __user *)argp)) + return -EFAULT; + return 0; + } + case NCP_IOC_GETMOUNTUID32: + if (put_user(server->m.mounted_uid, + (u32 __user *)argp)) + return -EFAULT; + return 0; + case NCP_IOC_GETMOUNTUID64: + if (put_user(server->m.mounted_uid, + (u64 __user *)argp)) + return -EFAULT; + return 0; + + case NCP_IOC_GETROOT: + { + struct ncp_setroot_ioctl sr; + + result = -EACCES; + mutex_lock(&server->root_setup_lock); + if (server->m.mounted_vol[0]) { + struct dentry* dentry = inode->i_sb->s_root; + + if (dentry) { + struct inode* s_inode = dentry->d_inode; + + if (s_inode) { + sr.volNumber = NCP_FINFO(s_inode)->volNumber; + sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum; + sr.namespace = server->name_space[sr.volNumber]; + result = 0; + } else + DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + } else + DPRINTK("ncpfs: s_root==NULL\n"); + } else { + sr.volNumber = -1; + sr.namespace = 0; + sr.dirEntNum = 0; + result = 0; + } + mutex_unlock(&server->root_setup_lock); + if (!result && copy_to_user(argp, &sr, sizeof(sr))) + result = -EFAULT; + return result; + } + + case NCP_IOC_SETROOT: + { + struct ncp_setroot_ioctl sr; + __u32 vnum; + __le32 de; + __le32 dosde; + struct dentry* dentry; + + if (copy_from_user(&sr, argp, sizeof(sr))) + return -EFAULT; + mutex_lock(&server->root_setup_lock); + if (server->root_setuped) + result = -EBUSY; + else { + if (sr.volNumber < 0) { + server->m.mounted_vol[0] = 0; + vnum = NCP_NUMBER_OF_VOLUMES; + de = 0; + dosde = 0; + result = 0; + } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) { + result = -EINVAL; + } else if (ncp_mount_subdir(server, sr.volNumber, + sr.namespace, sr.dirEntNum, + &vnum, &de, &dosde)) { + result = -ENOENT; + } else + result = 0; + + if (result == 0) { + dentry = inode->i_sb->s_root; + if (dentry) { + struct inode* s_inode = dentry->d_inode; + + if (s_inode) { + NCP_FINFO(s_inode)->volNumber = vnum; + NCP_FINFO(s_inode)->dirEntNum = de; + NCP_FINFO(s_inode)->DosDirNum = dosde; + server->root_setuped = 1; + } else { + DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + result = -EIO; + } + } else { + DPRINTK("ncpfs: 
s_root==NULL\n"); + result = -EIO; + } + } + result = 0; + } + mutex_unlock(&server->root_setup_lock); + + return result; + } + +#ifdef CONFIG_NCPFS_PACKET_SIGNING + case NCP_IOC_SIGN_INIT: + { + struct ncp_sign_init sign; + + if (argp) + if (copy_from_user(&sign, argp, sizeof(sign))) + return -EFAULT; + ncp_lock_server(server); + mutex_lock(&server->rcv.creq_mutex); + if (argp) { + if (server->sign_wanted) { + memcpy(server->sign_root,sign.sign_root,8); + memcpy(server->sign_last,sign.sign_last,16); + server->sign_active = 1; + } + /* ignore when signatures not wanted */ + } else { + server->sign_active = 0; + } + mutex_unlock(&server->rcv.creq_mutex); + ncp_unlock_server(server); + return 0; + } + + case NCP_IOC_SIGN_WANTED: + { + int state; + + ncp_lock_server(server); + state = server->sign_wanted; + ncp_unlock_server(server); + if (put_user(state, (int __user *)argp)) + return -EFAULT; + return 0; + } + + case NCP_IOC_SET_SIGN_WANTED: + { + int newstate; + + /* get only low 8 bits... */ + if (get_user(newstate, (unsigned char __user *)argp)) + return -EFAULT; + result = 0; + ncp_lock_server(server); + if (server->sign_active) { + /* cannot turn signatures OFF when active */ + if (!newstate) + result = -EINVAL; + } else { + server->sign_wanted = newstate != 0; + } + ncp_unlock_server(server); + return result; + } + +#endif /* CONFIG_NCPFS_PACKET_SIGNING */ + +#ifdef CONFIG_NCPFS_IOCTL_LOCKING + case NCP_IOC_LOCKUNLOCK: + { + struct ncp_lock_ioctl rqdata; + + if (copy_from_user(&rqdata, argp, sizeof(rqdata))) + return -EFAULT; + if (rqdata.origin != 0) + return -EINVAL; + /* check for cmd */ + switch (rqdata.cmd) { + case NCP_LOCK_EX: + case NCP_LOCK_SH: + if (rqdata.timeout == 0) + rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT; + else if (rqdata.timeout > NCP_LOCK_MAX_TIMEOUT) + rqdata.timeout = NCP_LOCK_MAX_TIMEOUT; + break; + case NCP_LOCK_LOG: + rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT; /* has no effect */ + case NCP_LOCK_CLEAR: + break; + default: + return -EINVAL; + } + /* locking needs both read and write access */ + if ((result = ncp_make_open(inode, O_RDWR)) != 0) + { + return result; + } + result = -EISDIR; + if (!S_ISREG(inode->i_mode)) + goto outrel; + if (rqdata.cmd == NCP_LOCK_CLEAR) + { + result = ncp_ClearPhysicalRecord(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + rqdata.offset, + rqdata.length); + if (result > 0) result = 0; /* no such lock */ + } + else + { + int lockcmd; + + switch (rqdata.cmd) + { + case NCP_LOCK_EX: lockcmd=1; break; + case NCP_LOCK_SH: lockcmd=3; break; + default: lockcmd=0; break; + } + result = ncp_LogPhysicalRecord(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + lockcmd, + rqdata.offset, + rqdata.length, + rqdata.timeout); + if (result > 0) result = -EAGAIN; + } +outrel: + ncp_inode_close(inode); + return result; + } +#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ + +#ifdef CONFIG_COMPAT + case NCP_IOC_GETOBJECTNAME_32: + { + struct compat_ncp_objectname_ioctl user; + size_t outl; + + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + down_read(&server->auth_rwsem); + user.auth_type = server->auth.auth_type; + outl = user.object_name_len; + user.object_name_len = server->auth.object_name_len; + if (outl > user.object_name_len) + outl = user.object_name_len; + result = 0; + if (outl) { + if (copy_to_user(compat_ptr(user.object_name), + server->auth.object_name, + outl)) + result = -EFAULT; + } + up_read(&server->auth_rwsem); + if (!result && copy_to_user(argp, &user, sizeof(user))) + result = -EFAULT; + return result; + } 
+#endif + + case NCP_IOC_GETOBJECTNAME: + { + struct ncp_objectname_ioctl user; + size_t outl; + + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + down_read(&server->auth_rwsem); + user.auth_type = server->auth.auth_type; + outl = user.object_name_len; + user.object_name_len = server->auth.object_name_len; + if (outl > user.object_name_len) + outl = user.object_name_len; + result = 0; + if (outl) { + if (copy_to_user(user.object_name, + server->auth.object_name, + outl)) + result = -EFAULT; + } + up_read(&server->auth_rwsem); + if (!result && copy_to_user(argp, &user, sizeof(user))) + result = -EFAULT; + return result; + } + +#ifdef CONFIG_COMPAT + case NCP_IOC_SETOBJECTNAME_32: +#endif + case NCP_IOC_SETOBJECTNAME: + { + struct ncp_objectname_ioctl user; + void* newname; + void* oldname; + size_t oldnamelen; + void* oldprivate; + size_t oldprivatelen; + +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_SETOBJECTNAME_32) { + struct compat_ncp_objectname_ioctl user32; + if (copy_from_user(&user32, argp, sizeof(user32))) + return -EFAULT; + user.auth_type = user32.auth_type; + user.object_name_len = user32.object_name_len; + user.object_name = compat_ptr(user32.object_name); + } else +#endif + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + + if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) + return -ENOMEM; + if (user.object_name_len) { + newname = memdup_user(user.object_name, + user.object_name_len); + if (IS_ERR(newname)) + return PTR_ERR(newname); + } else { + newname = NULL; + } + down_write(&server->auth_rwsem); + oldname = server->auth.object_name; + oldnamelen = server->auth.object_name_len; + oldprivate = server->priv.data; + oldprivatelen = server->priv.len; + server->auth.auth_type = user.auth_type; + server->auth.object_name_len = user.object_name_len; + server->auth.object_name = newname; + server->priv.len = 0; + server->priv.data = NULL; + up_write(&server->auth_rwsem); + kfree(oldprivate); + kfree(oldname); + return 0; + } + +#ifdef CONFIG_COMPAT + case NCP_IOC_GETPRIVATEDATA_32: +#endif + case NCP_IOC_GETPRIVATEDATA: + { + struct ncp_privatedata_ioctl user; + size_t outl; + +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_GETPRIVATEDATA_32) { + struct compat_ncp_privatedata_ioctl user32; + if (copy_from_user(&user32, argp, sizeof(user32))) + return -EFAULT; + user.len = user32.len; + user.data = compat_ptr(user32.data); + } else +#endif + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + + down_read(&server->auth_rwsem); + outl = user.len; + user.len = server->priv.len; + if (outl > user.len) outl = user.len; + result = 0; + if (outl) { + if (copy_to_user(user.data, + server->priv.data, + outl)) + result = -EFAULT; + } + up_read(&server->auth_rwsem); + if (result) + return result; +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_GETPRIVATEDATA_32) { + struct compat_ncp_privatedata_ioctl user32; + user32.len = user.len; + user32.data = (unsigned long) user.data; + if (copy_to_user(argp, &user32, sizeof(user32))) + return -EFAULT; + } else +#endif + if (copy_to_user(argp, &user, sizeof(user))) + return -EFAULT; + + return 0; + } + +#ifdef CONFIG_COMPAT + case NCP_IOC_SETPRIVATEDATA_32: +#endif + case NCP_IOC_SETPRIVATEDATA: + { + struct ncp_privatedata_ioctl user; + void* new; + void* old; + size_t oldlen; + +#ifdef CONFIG_COMPAT + if (cmd == NCP_IOC_SETPRIVATEDATA_32) { + struct compat_ncp_privatedata_ioctl user32; + if (copy_from_user(&user32, argp, sizeof(user32))) + return -EFAULT; + user.len = user32.len; + user.data = 
compat_ptr(user32.data); + } else +#endif + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + + if (user.len > NCP_PRIVATE_DATA_MAX_LEN) + return -ENOMEM; + if (user.len) { + new = memdup_user(user.data, user.len); + if (IS_ERR(new)) + return PTR_ERR(new); + } else { + new = NULL; + } + down_write(&server->auth_rwsem); + old = server->priv.data; + oldlen = server->priv.len; + server->priv.len = user.len; + server->priv.data = new; + up_write(&server->auth_rwsem); + kfree(old); + return 0; + } + +#ifdef CONFIG_NCPFS_NLS + case NCP_IOC_SETCHARSETS: + return ncp_set_charsets(server, argp); + + case NCP_IOC_GETCHARSETS: + return ncp_get_charsets(server, argp); + +#endif /* CONFIG_NCPFS_NLS */ + + case NCP_IOC_SETDENTRYTTL: + { + u_int32_t user; + + if (copy_from_user(&user, argp, sizeof(user))) + return -EFAULT; + /* 20 secs at most... */ + if (user > 20000) + return -EINVAL; + user = (user * HZ) / 1000; + atomic_set(&server->dentry_ttl, user); + return 0; + } + + case NCP_IOC_GETDENTRYTTL: + { + u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ; + if (copy_to_user(argp, &user, sizeof(user))) + return -EFAULT; + return 0; + } + + } + return -EINVAL; +} + +long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ncp_server *server = NCP_SERVER(inode); + uid_t uid = current_uid(); + int need_drop_write = 0; + long ret; + + switch (cmd) { + case NCP_IOC_SETCHARSETS: + case NCP_IOC_CONN_LOGGED_IN: + case NCP_IOC_SETROOT: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EACCES; + goto out; + } + break; + } + if (server->m.mounted_uid != uid) { + switch (cmd) { + /* + * Only mount owner can issue these ioctls. Information + * necessary to authenticate to other NDS servers are + * stored here. + */ + case NCP_IOC_GETOBJECTNAME: + case NCP_IOC_SETOBJECTNAME: + case NCP_IOC_GETPRIVATEDATA: + case NCP_IOC_SETPRIVATEDATA: +#ifdef CONFIG_COMPAT + case NCP_IOC_GETOBJECTNAME_32: + case NCP_IOC_SETOBJECTNAME_32: + case NCP_IOC_GETPRIVATEDATA_32: + case NCP_IOC_SETPRIVATEDATA_32: +#endif + ret = -EACCES; + goto out; + /* + * These require write access on the inode if user id + * does not match. Note that they do not write to the + * file... But old code did mnt_want_write, so I keep + * it as is. Of course not for mountpoint owner, as + * that breaks read-only mounts altogether as ncpmount + * needs working NCP_IOC_NCPREQUEST and + * NCP_IOC_GET_FS_INFO. Some of these codes (setdentryttl, + * signinit, setsignwanted) should be probably restricted + * to owner only, or even more to CAP_SYS_ADMIN). + */ + case NCP_IOC_GET_FS_INFO: + case NCP_IOC_GET_FS_INFO_V2: + case NCP_IOC_NCPREQUEST: + case NCP_IOC_SETDENTRYTTL: + case NCP_IOC_SIGN_INIT: + case NCP_IOC_LOCKUNLOCK: + case NCP_IOC_SET_SIGN_WANTED: +#ifdef CONFIG_COMPAT + case NCP_IOC_GET_FS_INFO_V2_32: + case NCP_IOC_NCPREQUEST_32: +#endif + ret = mnt_want_write_file(filp); + if (ret) + goto out; + need_drop_write = 1; + ret = inode_permission(inode, MAY_WRITE); + if (ret) + goto outDropWrite; + break; + /* + * Read access required. + */ + case NCP_IOC_GETMOUNTUID16: + case NCP_IOC_GETMOUNTUID32: + case NCP_IOC_GETMOUNTUID64: + case NCP_IOC_GETROOT: + case NCP_IOC_SIGN_WANTED: + ret = inode_permission(inode, MAY_READ); + if (ret) + goto out; + break; + /* + * Anybody can read these. + */ + case NCP_IOC_GETCHARSETS: + case NCP_IOC_GETDENTRYTTL: + default: + /* Three codes below are protected by CAP_SYS_ADMIN above. 
*/ + case NCP_IOC_SETCHARSETS: + case NCP_IOC_CONN_LOGGED_IN: + case NCP_IOC_SETROOT: + break; + } + } + ret = __ncp_ioctl(inode, cmd, arg); +outDropWrite: + if (need_drop_write) + mnt_drop_write(filp->f_path.mnt); +out: + return ret; +} + +#ifdef CONFIG_COMPAT +long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long ret; + + arg = (unsigned long) compat_ptr(arg); + ret = ncp_ioctl(file, cmd, arg); + return ret; +} +#endif diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c new file mode 100644 index 00000000..e5d71b27 --- /dev/null +++ b/fs/ncpfs/mmap.c @@ -0,0 +1,128 @@ +/* + * mmap.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ncp_fs.h" + +/* + * Fill in the supplied page for mmap + * XXX: how are we excluding truncate/invalidate here? Maybe need to lock + * page? + */ +static int ncp_file_mmap_fault(struct vm_area_struct *area, + struct vm_fault *vmf) +{ + struct file *file = area->vm_file; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + char *pg_addr; + unsigned int already_read; + unsigned int count; + int bufsize; + int pos; /* XXX: loff_t ? */ + + /* + * ncpfs has nothing against high pages as long + * as recvmsg and memset works on it + */ + vmf->page = alloc_page(GFP_HIGHUSER); + if (!vmf->page) + return VM_FAULT_OOM; + pg_addr = kmap(vmf->page); + pos = vmf->pgoff << PAGE_SHIFT; + + count = PAGE_SIZE; + /* what we can read in one go */ + bufsize = NCP_SERVER(inode)->buffer_size; + + already_read = 0; + if (ncp_make_open(inode, O_RDONLY) >= 0) { + while (already_read < count) { + int read_this_time; + int to_read; + + to_read = bufsize - (pos % bufsize); + + to_read = min_t(unsigned int, to_read, count - already_read); + + if (ncp_read_kernel(NCP_SERVER(inode), + NCP_FINFO(inode)->file_handle, + pos, to_read, + pg_addr + already_read, + &read_this_time) != 0) { + read_this_time = 0; + } + pos += read_this_time; + already_read += read_this_time; + + if (read_this_time < to_read) { + break; + } + } + ncp_inode_close(inode); + + } + + if (already_read < PAGE_SIZE) + memset(pg_addr + already_read, 0, PAGE_SIZE - already_read); + flush_dcache_page(vmf->page); + kunmap(vmf->page); + + /* + * If I understand ncp_read_kernel() properly, the above always + * fetches from the network, here the analogue of disk. + * -- wli + */ + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); + return VM_FAULT_MAJOR; +} + +static const struct vm_operations_struct ncp_file_mmap = +{ + .fault = ncp_file_mmap_fault, +}; + + +/* This is used for a general mmap of a ncp file */ +int ncp_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + DPRINTK("ncp_mmap: called\n"); + + if (!ncp_conn_valid(NCP_SERVER(inode))) + return -EIO; + + /* only PAGE_COW or read-only supported now */ + if (vma->vm_flags & VM_SHARED) + return -EINVAL; + /* we do not support files bigger than 4GB... We eventually + supports just 4GB... 
*/ + if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff + > (1U << (32 - PAGE_SHIFT))) + return -EFBIG; + + vma->vm_ops = &ncp_file_mmap; + file_accessed(file); + return 0; +} diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h new file mode 100644 index 00000000..31831afe --- /dev/null +++ b/fs/ncpfs/ncp_fs.h @@ -0,0 +1,98 @@ +#include +#include "ncp_fs_i.h" +#include "ncp_fs_sb.h" + +/* define because it is easy to change PRINTK to {*}PRINTK */ +#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args) + +#undef NCPFS_PARANOIA +#ifdef NCPFS_PARANOIA +#define PPRINTK(format, args...) PRINTK(format , ## args) +#else +#define PPRINTK(format, args...) +#endif + +#ifndef DEBUG_NCP +#define DEBUG_NCP 0 +#endif +#if DEBUG_NCP > 0 +#define DPRINTK(format, args...) PRINTK(format , ## args) +#else +#define DPRINTK(format, args...) +#endif +#if DEBUG_NCP > 1 +#define DDPRINTK(format, args...) PRINTK(format , ## args) +#else +#define DDPRINTK(format, args...) +#endif + +#define NCP_MAX_RPC_TIMEOUT (6*HZ) + + +struct ncp_entry_info { + struct nw_info_struct i; + ino_t ino; + int opened; + int access; + unsigned int volume; + __u8 file_handle[6]; +}; + +static inline struct ncp_server *NCP_SBP(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) +static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) +{ + return container_of(inode, struct ncp_inode_info, vfs_inode); +} + +/* linux/fs/ncpfs/inode.c */ +int ncp_notify_change(struct dentry *, struct iattr *); +struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *); +void ncp_update_inode(struct inode *, struct ncp_entry_info *); +void ncp_update_inode2(struct inode *, struct ncp_entry_info *); + +/* linux/fs/ncpfs/dir.c */ +extern const struct inode_operations ncp_dir_inode_operations; +extern const struct file_operations ncp_dir_operations; +extern const struct dentry_operations ncp_dentry_operations; +int ncp_conn_logged_in(struct super_block *); +int ncp_date_dos2unix(__le16 time, __le16 date); +void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); + +/* linux/fs/ncpfs/ioctl.c */ +long ncp_ioctl(struct file *, unsigned int, unsigned long); +long ncp_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* linux/fs/ncpfs/sock.c */ +int ncp_request2(struct ncp_server *server, int function, + void* reply, int max_reply_size); +static inline int ncp_request(struct ncp_server *server, int function) { + return ncp_request2(server, function, server->packet, server->packet_size); +} +int ncp_connect(struct ncp_server *server); +int ncp_disconnect(struct ncp_server *server); +void ncp_lock_server(struct ncp_server *server); +void ncp_unlock_server(struct ncp_server *server); + +/* linux/fs/ncpfs/symlink.c */ +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +extern const struct address_space_operations ncp_symlink_aops; +int ncp_symlink(struct inode*, struct dentry*, const char*); +#endif + +/* linux/fs/ncpfs/file.c */ +extern const struct inode_operations ncp_file_inode_operations; +extern const struct file_operations ncp_file_operations; +int ncp_make_open(struct inode *, int); + +/* linux/fs/ncpfs/mmap.c */ +int ncp_mmap(struct file *, struct vm_area_struct *); + +/* linux/fs/ncpfs/ncplib_kernel.c */ +int ncp_make_closed(struct inode *); + +#include "ncplib_kernel.h" diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h new file mode 100644 index 00000000..4b0bec47 --- /dev/null +++ b/fs/ncpfs/ncp_fs_i.h 
@@ -0,0 +1,29 @@ +/* + * ncp_fs_i.h + * + * Copyright (C) 1995 Volker Lendecke + * + */ + +#ifndef _LINUX_NCP_FS_I +#define _LINUX_NCP_FS_I + +/* + * This is the ncpfs part of the inode structure. This must contain + * all the information we need to work with an inode after creation. + */ +struct ncp_inode_info { + __le32 dirEntNum; + __le32 DosDirNum; + __u8 volNumber; + __le32 nwattr; + struct mutex open_mutex; + atomic_t opened; + int access; + int flags; +#define NCPI_KLUDGE_SYMLINK 0x0001 + __u8 file_handle[6]; + struct inode vfs_inode; +}; + +#endif /* _LINUX_NCP_FS_I */ diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h new file mode 100644 index 00000000..4af803f1 --- /dev/null +++ b/fs/ncpfs/ncp_fs_sb.h @@ -0,0 +1,176 @@ +/* + * ncp_fs_sb.h + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * + */ + +#ifndef _NCP_FS_SB +#define _NCP_FS_SB + +#include +#include +#include +#include +#include +#include + +#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */ + +struct sock; + +struct ncp_mount_data_kernel { + unsigned long flags; /* NCP_MOUNT_* flags */ + unsigned int int_flags; /* internal flags */ +#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 + __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */ + struct pid *wdog_pid; /* Who cares for our watchdog packets? */ + unsigned int ncp_fd; /* The socket to the ncp port */ + unsigned int time_out; /* How long should I wait after + sending a NCP request? */ + unsigned int retry_count; /* And how often should I retry? */ + unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; + __kernel_uid32_t uid; + __kernel_gid32_t gid; + __kernel_mode_t file_mode; + __kernel_mode_t dir_mode; + int info_fd; +}; + +struct ncp_server { + + struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of + interest for us later, so we store + it completely. */ + + __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; + + struct file *ncp_filp; /* File pointer to ncp socket */ + struct socket *ncp_sock;/* ncp socket */ + struct file *info_filp; + struct socket *info_sock; + + u8 sequence; + u8 task; + u16 connection; /* Remote connection number */ + + u8 completion; /* Status message from server */ + u8 conn_status; /* Bit 4 = 1 ==> Server going down, no + requests allowed anymore. + Bit 0 = 1 ==> Server is down. */ + + int buffer_size; /* Negotiated bufsize */ + + int reply_size; /* Size of last reply */ + + int packet_size; + unsigned char *packet; /* Here we prepare requests and + receive replies */ + unsigned char *txbuf; /* Storage for current request */ + unsigned char *rxbuf; /* Storage for reply to current request */ + + int lock; /* To prevent mismatch in protocols. */ + struct mutex mutex; + + int current_size; /* for packet preparation */ + int has_subfunction; + int ncp_reply_size; + + int root_setuped; + struct mutex root_setup_lock; + + /* info for packet signing */ + int sign_wanted; /* 1=Server needs signed packets */ + int sign_active; /* 0=don't do signing, 1=do */ + char sign_root[8]; /* generated from password and encr. 
key */ + char sign_last[16]; + + /* Authentication info: NDS or BINDERY, username */ + struct { + int auth_type; + size_t object_name_len; + void* object_name; + int object_type; + } auth; + /* Password info */ + struct { + size_t len; + void* data; + } priv; + struct rw_semaphore auth_rwsem; + + /* nls info: codepage for volume and charset for I/O */ + struct nls_table *nls_vol; + struct nls_table *nls_io; + + /* maximum age in jiffies */ + atomic_t dentry_ttl; + + /* miscellaneous */ + unsigned int flags; + + spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ + + void (*data_ready)(struct sock* sk, int len); + void (*error_report)(struct sock* sk); + void (*write_space)(struct sock* sk); /* STREAM mode only */ + struct { + struct work_struct tq; /* STREAM/DGRAM: data/error ready */ + struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */ + struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */ + + unsigned int state; /* STREAM only: receiver state */ + struct { + __u32 magic __packed; + __u32 len __packed; + __u16 type __packed; + __u16 p1 __packed; + __u16 p2 __packed; + __u16 p3 __packed; + __u16 type2 __packed; + } buf; /* STREAM only: temporary buffer */ + unsigned char* ptr; /* STREAM only: pointer to data */ + size_t len; /* STREAM only: length of data to receive */ + } rcv; + struct { + struct list_head requests; /* STREAM only: queued requests */ + struct work_struct tq; /* STREAM only: transmitter ready */ + struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */ + } tx; + struct timer_list timeout_tm; /* DGRAM only: timeout timer */ + struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ + int timeout_last; /* DGRAM only: current timeout length */ + int timeout_retries; /* DGRAM only: retries left */ + struct { + size_t len; + __u8 data[128]; + } unexpected_packet; + struct backing_dev_info bdi; +}; + +extern void ncp_tcp_rcv_proc(struct work_struct *work); +extern void ncp_tcp_tx_proc(struct work_struct *work); +extern void ncpdgram_rcv_proc(struct work_struct *work); +extern void ncpdgram_timeout_proc(struct work_struct *work); +extern void ncpdgram_timeout_call(unsigned long server); +extern void ncp_tcp_data_ready(struct sock* sk, int len); +extern void ncp_tcp_write_space(struct sock* sk); +extern void ncp_tcp_error_report(struct sock* sk); + +#define NCP_FLAG_UTF8 1 + +#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag)) +#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag)) +#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag)) + +static inline int ncp_conn_valid(struct ncp_server *server) +{ + return ((server->conn_status & 0x11) == 0); +} + +static inline void ncp_invalidate_conn(struct ncp_server *server) +{ + server->conn_status |= 0x01; +} + +#endif diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c new file mode 100644 index 00000000..981a9561 --- /dev/null +++ b/fs/ncpfs/ncplib_kernel.c @@ -0,0 +1,1323 @@ +/* + * ncplib_kernel.c + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. 
Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1999 Wolfram Pienkoss for NLS + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + + + +#include "ncp_fs.h" + +static inline void assert_server_locked(struct ncp_server *server) +{ + if (server->lock == 0) { + DPRINTK("ncpfs: server not locked!\n"); + } +} + +static void ncp_add_byte(struct ncp_server *server, __u8 x) +{ + assert_server_locked(server); + *(__u8 *) (&(server->packet[server->current_size])) = x; + server->current_size += 1; + return; +} + +static void ncp_add_word(struct ncp_server *server, __le16 x) +{ + assert_server_locked(server); + put_unaligned(x, (__le16 *) (&(server->packet[server->current_size]))); + server->current_size += 2; + return; +} + +static void ncp_add_be16(struct ncp_server *server, __u16 x) +{ + assert_server_locked(server); + put_unaligned(cpu_to_be16(x), (__be16 *) (&(server->packet[server->current_size]))); + server->current_size += 2; +} + +static void ncp_add_dword(struct ncp_server *server, __le32 x) +{ + assert_server_locked(server); + put_unaligned(x, (__le32 *) (&(server->packet[server->current_size]))); + server->current_size += 4; + return; +} + +static void ncp_add_be32(struct ncp_server *server, __u32 x) +{ + assert_server_locked(server); + put_unaligned(cpu_to_be32(x), (__be32 *)(&(server->packet[server->current_size]))); + server->current_size += 4; +} + +static inline void ncp_add_dword_lh(struct ncp_server *server, __u32 x) { + ncp_add_dword(server, cpu_to_le32(x)); +} + +static void ncp_add_mem(struct ncp_server *server, const void *source, int size) +{ + assert_server_locked(server); + memcpy(&(server->packet[server->current_size]), source, size); + server->current_size += size; + return; +} + +static void ncp_add_pstring(struct ncp_server *server, const char *s) +{ + int len = strlen(s); + assert_server_locked(server); + if (len > 255) { + DPRINTK("ncpfs: string too long: %s\n", s); + len = 255; + } + ncp_add_byte(server, len); + ncp_add_mem(server, s, len); + return; +} + +static inline void ncp_init_request(struct ncp_server *server) +{ + ncp_lock_server(server); + + server->current_size = sizeof(struct ncp_request_header); + server->has_subfunction = 0; +} + +static inline void ncp_init_request_s(struct ncp_server *server, int subfunction) +{ + ncp_lock_server(server); + + server->current_size = sizeof(struct ncp_request_header) + 2; + ncp_add_byte(server, subfunction); + + server->has_subfunction = 1; +} + +static inline char * +ncp_reply_data(struct ncp_server *server, int offset) +{ + return &(server->packet[sizeof(struct ncp_reply_header) + offset]); +} + +static inline u8 BVAL(const void *data) +{ + return *(const u8 *)data; +} + +static u8 ncp_reply_byte(struct ncp_server *server, int offset) +{ + return *(const u8 *)ncp_reply_data(server, offset); +} + +static inline u16 WVAL_LH(const void *data) +{ + return get_unaligned_le16(data); +} + +static u16 +ncp_reply_le16(struct ncp_server *server, int offset) +{ + return get_unaligned_le16(ncp_reply_data(server, offset)); +} + +static u16 +ncp_reply_be16(struct ncp_server *server, int offset) +{ + return get_unaligned_be16(ncp_reply_data(server, offset)); +} + +static inline u32 DVAL_LH(const void *data) +{ + return get_unaligned_le32(data); +} + +static __le32 +ncp_reply_dword(struct ncp_server *server, int offset) +{ + return get_unaligned((__le32 *)ncp_reply_data(server, offset)); +} + +static inline __u32 ncp_reply_dword_lh(struct ncp_server* 
server, int offset) { + return le32_to_cpu(ncp_reply_dword(server, offset)); +} + +int +ncp_negotiate_buffersize(struct ncp_server *server, int size, int *target) +{ + int result; + + ncp_init_request(server); + ncp_add_be16(server, size); + + if ((result = ncp_request(server, 33)) != 0) { + ncp_unlock_server(server); + return result; + } + *target = min_t(unsigned int, ncp_reply_be16(server, 0), size); + + ncp_unlock_server(server); + return 0; +} + + +/* options: + * bit 0 ipx checksum + * bit 1 packet signing + */ +int +ncp_negotiate_size_and_options(struct ncp_server *server, + int size, int options, int *ret_size, int *ret_options) { + int result; + + /* there is minimum */ + if (size < NCP_BLOCK_SIZE) size = NCP_BLOCK_SIZE; + + ncp_init_request(server); + ncp_add_be16(server, size); + ncp_add_byte(server, options); + + if ((result = ncp_request(server, 0x61)) != 0) + { + ncp_unlock_server(server); + return result; + } + + /* NCP over UDP returns 0 (!!!) */ + result = ncp_reply_be16(server, 0); + if (result >= NCP_BLOCK_SIZE) + size = min(result, size); + *ret_size = size; + *ret_options = ncp_reply_byte(server, 4); + + ncp_unlock_server(server); + return 0; +} + +int ncp_get_volume_info_with_number(struct ncp_server* server, + int n, struct ncp_volume_info* target) { + int result; + int len; + + ncp_init_request_s(server, 44); + ncp_add_byte(server, n); + + if ((result = ncp_request(server, 22)) != 0) { + goto out; + } + target->total_blocks = ncp_reply_dword_lh(server, 0); + target->free_blocks = ncp_reply_dword_lh(server, 4); + target->purgeable_blocks = ncp_reply_dword_lh(server, 8); + target->not_yet_purgeable_blocks = ncp_reply_dword_lh(server, 12); + target->total_dir_entries = ncp_reply_dword_lh(server, 16); + target->available_dir_entries = ncp_reply_dword_lh(server, 20); + target->sectors_per_block = ncp_reply_byte(server, 28); + + memset(&(target->volume_name), 0, sizeof(target->volume_name)); + + result = -EIO; + len = ncp_reply_byte(server, 29); + if (len > NCP_VOLNAME_LEN) { + DPRINTK("ncpfs: volume name too long: %d\n", len); + goto out; + } + memcpy(&(target->volume_name), ncp_reply_data(server, 30), len); + result = 0; +out: + ncp_unlock_server(server); + return result; +} + +int ncp_get_directory_info(struct ncp_server* server, __u8 n, + struct ncp_volume_info* target) { + int result; + int len; + + ncp_init_request_s(server, 45); + ncp_add_byte(server, n); + + if ((result = ncp_request(server, 22)) != 0) { + goto out; + } + target->total_blocks = ncp_reply_dword_lh(server, 0); + target->free_blocks = ncp_reply_dword_lh(server, 4); + target->purgeable_blocks = 0; + target->not_yet_purgeable_blocks = 0; + target->total_dir_entries = ncp_reply_dword_lh(server, 8); + target->available_dir_entries = ncp_reply_dword_lh(server, 12); + target->sectors_per_block = ncp_reply_byte(server, 20); + + memset(&(target->volume_name), 0, sizeof(target->volume_name)); + + result = -EIO; + len = ncp_reply_byte(server, 21); + if (len > NCP_VOLNAME_LEN) { + DPRINTK("ncpfs: volume name too long: %d\n", len); + goto out; + } + memcpy(&(target->volume_name), ncp_reply_data(server, 22), len); + result = 0; +out: + ncp_unlock_server(server); + return result; +} + +int +ncp_close_file(struct ncp_server *server, const char *file_id) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + + result = ncp_request(server, 66); + ncp_unlock_server(server); + return result; +} + +int +ncp_make_closed(struct inode *inode) +{ + int err; + + err = 
0; + mutex_lock(&NCP_FINFO(inode)->open_mutex); + if (atomic_read(&NCP_FINFO(inode)->opened) == 1) { + atomic_set(&NCP_FINFO(inode)->opened, 0); + err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle); + + if (!err) + PPRINTK("ncp_make_closed: volnum=%d, dirent=%u, error=%d\n", + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum, err); + } + mutex_unlock(&NCP_FINFO(inode)->open_mutex); + return err; +} + +static void ncp_add_handle_path(struct ncp_server *server, __u8 vol_num, + __le32 dir_base, int have_dir_base, + const char *path) +{ + ncp_add_byte(server, vol_num); + ncp_add_dword(server, dir_base); + if (have_dir_base != 0) { + ncp_add_byte(server, 1); /* dir_base */ + } else { + ncp_add_byte(server, 0xff); /* no handle */ + } + if (path != NULL) { + ncp_add_byte(server, 1); /* 1 component */ + ncp_add_pstring(server, path); + } else { + ncp_add_byte(server, 0); + } +} + +int ncp_dirhandle_alloc(struct ncp_server* server, __u8 volnum, __le32 dirent, + __u8* dirhandle) { + int result; + + ncp_init_request(server); + ncp_add_byte(server, 12); /* subfunction */ + ncp_add_byte(server, NW_NS_DOS); + ncp_add_byte(server, 0); + ncp_add_word(server, 0); + ncp_add_handle_path(server, volnum, dirent, 1, NULL); + if ((result = ncp_request(server, 87)) == 0) { + *dirhandle = ncp_reply_byte(server, 0); + } + ncp_unlock_server(server); + return result; +} + +int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) { + int result; + + ncp_init_request_s(server, 20); + ncp_add_byte(server, dirhandle); + result = ncp_request(server, 22); + ncp_unlock_server(server); + return result; +} + +void ncp_extract_file_info(const void *structure, struct nw_info_struct *target) +{ + const __u8 *name_len; + const int info_struct_size = offsetof(struct nw_info_struct, nameLen); + + memcpy(target, structure, info_struct_size); + name_len = structure + info_struct_size; + target->nameLen = *name_len; + memcpy(target->entryName, name_len + 1, *name_len); + target->entryName[*name_len] = '\0'; + target->volNumber = le32_to_cpu(target->volNumber); + return; +} + +#ifdef CONFIG_NCPFS_NFS_NS +static inline void ncp_extract_nfs_info(const unsigned char *structure, + struct nw_nfs_info *target) +{ + target->mode = DVAL_LH(structure); + target->rdev = DVAL_LH(structure + 8); +} +#endif + +int ncp_obtain_nfs_info(struct ncp_server *server, + struct nw_info_struct *target) + +{ + int result = 0; +#ifdef CONFIG_NCPFS_NFS_NS + __u32 volnum = target->volNumber; + + if (ncp_is_nfs_extras(server, volnum)) { + ncp_init_request(server); + ncp_add_byte(server, 19); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, NW_NS_NFS); + ncp_add_byte(server, 0); + ncp_add_byte(server, volnum); + ncp_add_dword(server, target->dirEntNum); + /* We must retrieve both nlinks and rdev, otherwise some server versions + report zeroes instead of valid data */ + ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV); + + if ((result = ncp_request(server, 87)) == 0) { + ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs); + DPRINTK(KERN_DEBUG + "ncp_obtain_nfs_info: (%s) mode=0%o, rdev=0x%x\n", + target->entryName, target->nfs.mode, + target->nfs.rdev); + } else { + target->nfs.mode = 0; + target->nfs.rdev = 0; + } + ncp_unlock_server(server); + + } else +#endif + { + target->nfs.mode = 0; + target->nfs.rdev = 0; + } + return result; +} + +/* + * Returns information for a (one-component) name relative to + * the specified directory. 
+ */ +int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path, + struct nw_info_struct *target) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int result; + + if (target == NULL) { + printk(KERN_ERR "ncp_obtain_info: invalid call\n"); + return -EINVAL; + } + ncp_init_request(server); + ncp_add_byte(server, 6); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, server->name_space[volnum]); /* N.B. twice ?? */ + ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ + ncp_add_dword(server, RIM_ALL); + ncp_add_handle_path(server, volnum, dirent, 1, path); + + if ((result = ncp_request(server, 87)) != 0) + goto out; + ncp_extract_file_info(ncp_reply_data(server, 0), target); + ncp_unlock_server(server); + + result = ncp_obtain_nfs_info(server, target); + return result; + +out: + ncp_unlock_server(server); + return result; +} + +#ifdef CONFIG_NCPFS_NFS_NS +static int +ncp_obtain_DOS_dir_base(struct ncp_server *server, + __u8 ns, __u8 volnum, __le32 dirent, + const char *path, /* At most 1 component */ + __le32 *DOS_dir_base) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 6); /* subfunction */ + ncp_add_byte(server, ns); + ncp_add_byte(server, ns); + ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ + ncp_add_dword(server, RIM_DIRECTORY); + ncp_add_handle_path(server, volnum, dirent, 1, path); + + if ((result = ncp_request(server, 87)) == 0) + { + if (DOS_dir_base) *DOS_dir_base=ncp_reply_dword(server, 0x34); + } + ncp_unlock_server(server); + return result; +} +#endif /* CONFIG_NCPFS_NFS_NS */ + +static inline int +ncp_get_known_namespace(struct ncp_server *server, __u8 volume) +{ +#if defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) + int result; + __u8 *namespace; + __u16 no_namespaces; + + ncp_init_request(server); + ncp_add_byte(server, 24); /* Subfunction: Get Name Spaces Loaded */ + ncp_add_word(server, 0); + ncp_add_byte(server, volume); + + if ((result = ncp_request(server, 87)) != 0) { + ncp_unlock_server(server); + return NW_NS_DOS; /* not result ?? 
*/ + } + + result = NW_NS_DOS; + no_namespaces = ncp_reply_le16(server, 0); + namespace = ncp_reply_data(server, 2); + + while (no_namespaces > 0) { + DPRINTK("get_namespaces: found %d on %d\n", *namespace, volume); + +#ifdef CONFIG_NCPFS_NFS_NS + if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS)) + { + result = NW_NS_NFS; + break; + } +#endif /* CONFIG_NCPFS_NFS_NS */ +#ifdef CONFIG_NCPFS_OS2_NS + if ((*namespace == NW_NS_OS2) && !(server->m.flags&NCP_MOUNT_NO_OS2)) + { + result = NW_NS_OS2; + } +#endif /* CONFIG_NCPFS_OS2_NS */ + namespace += 1; + no_namespaces -= 1; + } + ncp_unlock_server(server); + return result; +#else /* neither OS2 nor NFS - only DOS */ + return NW_NS_DOS; +#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */ +} + +int +ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns) +{ + int ns = ncp_get_known_namespace(server, volume); + + if (ret_ns) + *ret_ns = ns; + + DPRINTK("lookup_vol: namespace[%d] = %d\n", + volume, server->name_space[volume]); + + if (server->name_space[volume] == ns) + return 0; + server->name_space[volume] = ns; + return 1; +} + +static int +ncp_ObtainSpecificDirBase(struct ncp_server *server, + __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base, + const char *path, /* At most 1 component */ + __le32 *dirEntNum, __le32 *DosDirNum) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 6); /* subfunction */ + ncp_add_byte(server, nsSrc); + ncp_add_byte(server, nsDst); + ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ + ncp_add_dword(server, RIM_ALL); + ncp_add_handle_path(server, vol_num, dir_base, 1, path); + + if ((result = ncp_request(server, 87)) != 0) + { + ncp_unlock_server(server); + return result; + } + + if (dirEntNum) + *dirEntNum = ncp_reply_dword(server, 0x30); + if (DosDirNum) + *DosDirNum = ncp_reply_dword(server, 0x34); + ncp_unlock_server(server); + return 0; +} + +int +ncp_mount_subdir(struct ncp_server *server, + __u8 volNumber, __u8 srcNS, __le32 dirEntNum, + __u32* volume, __le32* newDirEnt, __le32* newDosEnt) +{ + int dstNS; + int result; + + ncp_update_known_namespace(server, volNumber, &dstNS); + if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, + dirEntNum, NULL, newDirEnt, newDosEnt)) != 0) + { + return result; + } + *volume = volNumber; + server->m.mounted_vol[1] = 0; + server->m.mounted_vol[0] = 'X'; + return 0; +} + +int +ncp_get_volume_root(struct ncp_server *server, + const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent) +{ + int result; + + DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); + + ncp_init_request(server); + ncp_add_byte(server, 22); /* Subfunction: Generate dir handle */ + ncp_add_byte(server, 0); /* DOS namespace */ + ncp_add_byte(server, 0); /* reserved */ + ncp_add_byte(server, 0); /* reserved */ + ncp_add_byte(server, 0); /* reserved */ + + ncp_add_byte(server, 0); /* faked volume number */ + ncp_add_dword(server, 0); /* faked dir_base */ + ncp_add_byte(server, 0xff); /* Don't have a dir_base */ + ncp_add_byte(server, 1); /* 1 path component */ + ncp_add_pstring(server, volname); + + if ((result = ncp_request(server, 87)) != 0) { + ncp_unlock_server(server); + return result; + } + *dirent = *dosdirent = ncp_reply_dword(server, 4); + *volume = ncp_reply_byte(server, 8); + ncp_unlock_server(server); + return 0; +} + +int +ncp_lookup_volume(struct ncp_server *server, + const char *volname, struct nw_info_struct *target) +{ + int result; + + memset(target, 0, 
sizeof(*target)); + result = ncp_get_volume_root(server, volname, + &target->volNumber, &target->dirEntNum, &target->DosDirNum); + if (result) { + return result; + } + ncp_update_known_namespace(server, target->volNumber, NULL); + target->nameLen = strlen(volname); + memcpy(target->entryName, volname, target->nameLen+1); + target->attributes = aDIR; + /* set dates to Jan 1, 1986 00:00 */ + target->creationTime = target->modifyTime = cpu_to_le16(0x0000); + target->creationDate = target->modifyDate = target->lastAccessDate = cpu_to_le16(0x0C21); + target->nfs.mode = 0; + return 0; +} + +int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *server, + struct inode *dir, + const char *path, + __le32 info_mask, + const struct nw_modify_dos_info *info) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int result; + + ncp_init_request(server); + ncp_add_byte(server, 7); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, 0); /* reserved */ + ncp_add_word(server, cpu_to_le16(0x8006)); /* search attribs: all */ + + ncp_add_dword(server, info_mask); + ncp_add_mem(server, info, sizeof(*info)); + ncp_add_handle_path(server, volnum, dirent, 1, path); + + result = ncp_request(server, 87); + ncp_unlock_server(server); + return result; +} + +int ncp_modify_file_or_subdir_dos_info(struct ncp_server *server, + struct inode *dir, + __le32 info_mask, + const struct nw_modify_dos_info *info) +{ + return ncp_modify_file_or_subdir_dos_info_path(server, dir, NULL, + info_mask, info); +} + +#ifdef CONFIG_NCPFS_NFS_NS +int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent, + __u32 mode, __u32 rdev) + +{ + int result = 0; + + ncp_init_request(server); + if (server->name_space[volnum] == NW_NS_NFS) { + ncp_add_byte(server, 25); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, NW_NS_NFS); + ncp_add_byte(server, volnum); + ncp_add_dword(server, dirent); + /* we must always operate on both nlinks and rdev, otherwise + rdev is not set */ + ncp_add_dword_lh(server, NSIBM_NFS_MODE | NSIBM_NFS_NLINKS | NSIBM_NFS_RDEV); + ncp_add_dword_lh(server, mode); + ncp_add_dword_lh(server, 1); /* nlinks */ + ncp_add_dword_lh(server, rdev); + result = ncp_request(server, 87); + } + ncp_unlock_server(server); + return result; +} +#endif + + +static int +ncp_DeleteNSEntry(struct ncp_server *server, + __u8 have_dir_base, __u8 volnum, __le32 dirent, + const char* name, __u8 ns, __le16 attr) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 8); /* subfunction */ + ncp_add_byte(server, ns); + ncp_add_byte(server, 0); /* reserved */ + ncp_add_word(server, attr); /* search attribs: all */ + ncp_add_handle_path(server, volnum, dirent, have_dir_base, name); + + result = ncp_request(server, 87); + ncp_unlock_server(server); + return result; +} + +int +ncp_del_file_or_subdir2(struct ncp_server *server, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + __u8 volnum; + __le32 dirent; + + if (!inode) { + return 0xFF; /* Any error */ + } + volnum = NCP_FINFO(inode)->volNumber; + dirent = NCP_FINFO(inode)->DosDirNum; + return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006)); +} + +int +ncp_del_file_or_subdir(struct ncp_server *server, + struct inode *dir, const char *name) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int name_space; + + name_space = 
server->name_space[volnum]; +#ifdef CONFIG_NCPFS_NFS_NS + if (name_space == NW_NS_NFS) + { + int result; + + result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent); + if (result) return result; + name = NULL; + name_space = NW_NS_DOS; + } +#endif /* CONFIG_NCPFS_NFS_NS */ + return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006)); +} + +static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6]) +{ + __le16 *dest = (__le16 *) ret; + dest[1] = cpu_to_le16(v0); + dest[2] = cpu_to_le16(v1); + dest[0] = cpu_to_le16(v0 + 1); + return; +} + +/* If both dir and name are NULL, then in target there's already a + looked-up entry that wants to be opened. */ +int ncp_open_create_file_or_subdir(struct ncp_server *server, + struct inode *dir, const char *name, + int open_create_mode, + __le32 create_attributes, + __le16 desired_acc_rights, + struct ncp_entry_info *target) +{ + __le16 search_attribs = cpu_to_le16(0x0006); + __u8 volnum; + __le32 dirent; + int result; + + volnum = NCP_FINFO(dir)->volNumber; + dirent = NCP_FINFO(dir)->dirEntNum; + + if ((create_attributes & aDIR) != 0) { + search_attribs |= cpu_to_le16(0x8000); + } + ncp_init_request(server); + ncp_add_byte(server, 1); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, open_create_mode); + ncp_add_word(server, search_attribs); + ncp_add_dword(server, RIM_ALL); + ncp_add_dword(server, create_attributes); + /* The desired acc rights seem to be the inherited rights mask + for directories */ + ncp_add_word(server, desired_acc_rights); + ncp_add_handle_path(server, volnum, dirent, 1, name); + + if ((result = ncp_request(server, 87)) != 0) + goto out; + if (!(create_attributes & aDIR)) + target->opened = 1; + + /* in target there's a new finfo to fill */ + ncp_extract_file_info(ncp_reply_data(server, 6), &(target->i)); + target->volume = target->i.volNumber; + ConvertToNWfromDWORD(ncp_reply_le16(server, 0), + ncp_reply_le16(server, 2), + target->file_handle); + + ncp_unlock_server(server); + + (void)ncp_obtain_nfs_info(server, &(target->i)); + return 0; + +out: + ncp_unlock_server(server); + return result; +} + +int +ncp_initialize_search(struct ncp_server *server, struct inode *dir, + struct nw_search_sequence *target) +{ + __u8 volnum = NCP_FINFO(dir)->volNumber; + __le32 dirent = NCP_FINFO(dir)->dirEntNum; + int result; + + ncp_init_request(server); + ncp_add_byte(server, 2); /* subfunction */ + ncp_add_byte(server, server->name_space[volnum]); + ncp_add_byte(server, 0); /* reserved */ + ncp_add_handle_path(server, volnum, dirent, 1, NULL); + + result = ncp_request(server, 87); + if (result) + goto out; + memcpy(target, ncp_reply_data(server, 0), sizeof(*target)); + +out: + ncp_unlock_server(server); + return result; +} + +int ncp_search_for_fileset(struct ncp_server *server, + struct nw_search_sequence *seq, + int* more, + int* cnt, + char* buffer, + size_t bufsize, + char** rbuf, + size_t* rsize) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 20); + ncp_add_byte(server, server->name_space[seq->volNumber]); + ncp_add_byte(server, 0); /* datastream */ + ncp_add_word(server, cpu_to_le16(0x8006)); + ncp_add_dword(server, RIM_ALL); + ncp_add_word(server, cpu_to_le16(32767)); /* max returned items */ + ncp_add_mem(server, seq, 9); +#ifdef CONFIG_NCPFS_NFS_NS + if (server->name_space[seq->volNumber] == NW_NS_NFS) { + ncp_add_byte(server, 0); /* 0 byte pattern */ + } else +#endif + { + ncp_add_byte(server, 2); 
/* 2 byte pattern */ + ncp_add_byte(server, 0xff); /* following is a wildcard */ + ncp_add_byte(server, '*'); + } + result = ncp_request2(server, 87, buffer, bufsize); + if (result) { + ncp_unlock_server(server); + return result; + } + if (server->ncp_reply_size < 12) { + ncp_unlock_server(server); + return 0xFF; + } + *rsize = server->ncp_reply_size - 12; + ncp_unlock_server(server); + buffer = buffer + sizeof(struct ncp_reply_header); + *rbuf = buffer + 12; + *cnt = WVAL_LH(buffer + 10); + *more = BVAL(buffer + 9); + memcpy(seq, buffer, 9); + return 0; +} + +static int +ncp_RenameNSEntry(struct ncp_server *server, + struct inode *old_dir, const char *old_name, __le16 old_type, + struct inode *new_dir, const char *new_name) +{ + int result = -EINVAL; + + if ((old_dir == NULL) || (old_name == NULL) || + (new_dir == NULL) || (new_name == NULL)) + goto out; + + ncp_init_request(server); + ncp_add_byte(server, 4); /* subfunction */ + ncp_add_byte(server, server->name_space[NCP_FINFO(old_dir)->volNumber]); + ncp_add_byte(server, 1); /* rename flag */ + ncp_add_word(server, old_type); /* search attributes */ + + /* source Handle Path */ + ncp_add_byte(server, NCP_FINFO(old_dir)->volNumber); + ncp_add_dword(server, NCP_FINFO(old_dir)->dirEntNum); + ncp_add_byte(server, 1); + ncp_add_byte(server, 1); /* 1 source component */ + + /* dest Handle Path */ + ncp_add_byte(server, NCP_FINFO(new_dir)->volNumber); + ncp_add_dword(server, NCP_FINFO(new_dir)->dirEntNum); + ncp_add_byte(server, 1); + ncp_add_byte(server, 1); /* 1 destination component */ + + /* source path string */ + ncp_add_pstring(server, old_name); + /* dest path string */ + ncp_add_pstring(server, new_name); + + result = ncp_request(server, 87); + ncp_unlock_server(server); +out: + return result; +} + +int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, + struct inode *old_dir, const char *old_name, + struct inode *new_dir, const char *new_name) +{ + int result; + __le16 old_type = cpu_to_le16(0x06); + +/* If somebody can do it atomic, call me... vandrove@vc.cvut.cz */ + result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, + new_dir, new_name); + if (result == 0xFF) /* File Not Found, try directory */ + { + old_type = cpu_to_le16(0x16); + result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, + new_dir, new_name); + } + if (result != 0x92) return result; /* All except NO_FILES_RENAMED */ + result = ncp_del_file_or_subdir(server, new_dir, new_name); + if (result != 0) return -EACCES; + result = ncp_RenameNSEntry(server, old_dir, old_name, old_type, + new_dir, new_name); + return result; +} + + +/* We have to transfer to/from user space */ +int +ncp_read_kernel(struct ncp_server *server, const char *file_id, + __u32 offset, __u16 to_read, char *target, int *bytes_read) +{ + const char *source; + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be16(server, to_read); + + if ((result = ncp_request(server, 72)) != 0) { + goto out; + } + *bytes_read = ncp_reply_be16(server, 0); + source = ncp_reply_data(server, 2 + (offset & 1)); + + memcpy(target, source, *bytes_read); +out: + ncp_unlock_server(server); + return result; +} + +/* There is a problem... egrep and some other silly tools do: + x = mmap(NULL, MAP_PRIVATE, PROT_READ|PROT_WRITE, , 32768); + read(, x, 32768); + Now copying read result by copy_to_user causes pagefault. This pagefault + could not be handled because of server was locked due to read. 
So we have + to use temporary buffer. So ncp_unlock_server must be done before + copy_to_user (and for write, copy_from_user must be done before + ncp_init_request... same applies for send raw packet ioctl). Because of + file is normally read in bigger chunks, caller provides kmalloced + (vmalloced) chunk of memory with size >= to_read... + */ +int +ncp_read_bounce(struct ncp_server *server, const char *file_id, + __u32 offset, __u16 to_read, char __user *target, int *bytes_read, + void* bounce, __u32 bufsize) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be16(server, to_read); + result = ncp_request2(server, 72, bounce, bufsize); + ncp_unlock_server(server); + if (!result) { + int len = get_unaligned_be16((char *)bounce + + sizeof(struct ncp_reply_header)); + result = -EIO; + if (len <= to_read) { + char* source; + + source = (char*)bounce + + sizeof(struct ncp_reply_header) + 2 + + (offset & 1); + *bytes_read = len; + result = 0; + if (copy_to_user(target, source, len)) + result = -EFAULT; + } + } + return result; +} + +int +ncp_write_kernel(struct ncp_server *server, const char *file_id, + __u32 offset, __u16 to_write, + const char *source, int *bytes_written) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be16(server, to_write); + ncp_add_mem(server, source, to_write); + + if ((result = ncp_request(server, 73)) == 0) + *bytes_written = to_write; + ncp_unlock_server(server); + return result; +} + +#ifdef CONFIG_NCPFS_IOCTL_LOCKING +int +ncp_LogPhysicalRecord(struct ncp_server *server, const char *file_id, + __u8 locktype, __u32 offset, __u32 length, __u16 timeout) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, locktype); + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be32(server, length); + ncp_add_be16(server, timeout); + + if ((result = ncp_request(server, 0x1A)) != 0) + { + ncp_unlock_server(server); + return result; + } + ncp_unlock_server(server); + return 0; +} + +int +ncp_ClearPhysicalRecord(struct ncp_server *server, const char *file_id, + __u32 offset, __u32 length) +{ + int result; + + ncp_init_request(server); + ncp_add_byte(server, 0); /* who knows... lanalyzer says that */ + ncp_add_mem(server, file_id, 6); + ncp_add_be32(server, offset); + ncp_add_be32(server, length); + + if ((result = ncp_request(server, 0x1E)) != 0) + { + ncp_unlock_server(server); + return result; + } + ncp_unlock_server(server); + return 0; +} +#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ + +#ifdef CONFIG_NCPFS_NLS +/* This are the NLS conversion routines with inspirations and code parts + * from the vfat file system and hints from Petr Vandrovec. 
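+ * ncp__io2vol() converts a name from the local io character set
+ * (server->nls_io) to the volume character set (server->nls_vol), and
+ * ncp__vol2io() converts the other way; cc requests case conversion
+ * (upper-case towards the volume, lower-case towards io).  When not in
+ * UTF-8 mode, a code point that the io table cannot represent is carried
+ * in the io-side name as NCP_ESC (':') followed by four upper-case hex
+ * digits -- U+00E9 would show up as ":00E9", for example -- and
+ * ncp__io2vol() turns such an escape back into the original code point.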
+ */ + +int +ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen, + const unsigned char *iname, unsigned int ilen, int cc) +{ + struct nls_table *in = server->nls_io; + struct nls_table *out = server->nls_vol; + unsigned char *vname_start; + unsigned char *vname_end; + const unsigned char *iname_end; + + iname_end = iname + ilen; + vname_start = vname; + vname_end = vname + *vlen - 1; + + while (iname < iname_end) { + int chl; + wchar_t ec; + + if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { + int k; + unicode_t u; + + k = utf8_to_utf32(iname, iname_end - iname, &u); + if (k < 0 || u > MAX_WCHAR_T) + return -EINVAL; + iname += k; + ec = u; + } else { + if (*iname == NCP_ESC) { + int k; + + if (iname_end - iname < 5) + goto nospec; + + ec = 0; + for (k = 1; k < 5; k++) { + unsigned char nc; + + nc = iname[k] - '0'; + if (nc >= 10) { + nc -= 'A' - '0' - 10; + if ((nc < 10) || (nc > 15)) { + goto nospec; + } + } + ec = (ec << 4) | nc; + } + iname += 5; + } else { +nospec:; + if ( (chl = in->char2uni(iname, iname_end - iname, &ec)) < 0) + return chl; + iname += chl; + } + } + + /* unitoupper should be here! */ + + chl = out->uni2char(ec, vname, vname_end - vname); + if (chl < 0) + return chl; + + /* this is wrong... */ + if (cc) { + int chi; + + for (chi = 0; chi < chl; chi++){ + vname[chi] = ncp_toupper(out, vname[chi]); + } + } + vname += chl; + } + + *vname = 0; + *vlen = vname - vname_start; + return 0; +} + +int +ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen, + const unsigned char *vname, unsigned int vlen, int cc) +{ + struct nls_table *in = server->nls_vol; + struct nls_table *out = server->nls_io; + const unsigned char *vname_end; + unsigned char *iname_start; + unsigned char *iname_end; + unsigned char *vname_cc; + int err; + + vname_cc = NULL; + + if (cc) { + int i; + + /* this is wrong! */ + vname_cc = kmalloc(vlen, GFP_KERNEL); + if (!vname_cc) + return -ENOMEM; + for (i = 0; i < vlen; i++) + vname_cc[i] = ncp_tolower(in, vname[i]); + vname = vname_cc; + } + + iname_start = iname; + iname_end = iname + *ilen - 1; + vname_end = vname + vlen; + + while (vname < vname_end) { + wchar_t ec; + int chl; + + if ( (chl = in->char2uni(vname, vname_end - vname, &ec)) < 0) { + err = chl; + goto quit; + } + vname += chl; + + /* unitolower should be here! 
*/ + + if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { + int k; + + k = utf32_to_utf8(ec, iname, iname_end - iname); + if (k < 0) { + err = -ENAMETOOLONG; + goto quit; + } + iname += k; + } else { + if ( (chl = out->uni2char(ec, iname, iname_end - iname)) >= 0) { + iname += chl; + } else { + int k; + + if (iname_end - iname < 5) { + err = -ENAMETOOLONG; + goto quit; + } + *iname = NCP_ESC; + for (k = 4; k > 0; k--) { + unsigned char v; + + v = (ec & 0xF) + '0'; + if (v > '9') { + v += 'A' - '9' - 1; + } + iname[k] = v; + ec >>= 4; + } + iname += 5; + } + } + } + + *iname = 0; + *ilen = iname - iname_start; + err = 0; +quit:; + if (cc) + kfree(vname_cc); + return err; +} + +#else + +int +ncp__io2vol(unsigned char *vname, unsigned int *vlen, + const unsigned char *iname, unsigned int ilen, int cc) +{ + int i; + + if (*vlen <= ilen) + return -ENAMETOOLONG; + + if (cc) + for (i = 0; i < ilen; i++) { + *vname = toupper(*iname); + vname++; + iname++; + } + else { + memmove(vname, iname, ilen); + vname += ilen; + } + + *vlen = ilen; + *vname = 0; + return 0; +} + +int +ncp__vol2io(unsigned char *iname, unsigned int *ilen, + const unsigned char *vname, unsigned int vlen, int cc) +{ + int i; + + if (*ilen <= vlen) + return -ENAMETOOLONG; + + if (cc) + for (i = 0; i < vlen; i++) { + *iname = tolower(*vname); + iname++; + vname++; + } + else { + memmove(iname, vname, vlen); + iname += vlen; + } + + *ilen = vlen; + *iname = 0; + return 0; +} + +#endif diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h new file mode 100644 index 00000000..09881e6a --- /dev/null +++ b/fs/ncpfs/ncplib_kernel.h @@ -0,0 +1,254 @@ +/* + * ncplib_kernel.h + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * Modified for big endian by J.F. Chadima and David S. Miller + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * Modified 1998, 1999 Wolfram Pienkoss for NLS + * Modified 1999 Wolfram Pienkoss for directory caching + * + */ + +#ifndef _NCPLIB_H +#define _NCPLIB_H + + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_NCPFS_NLS +#include +#else +#include +#endif /* CONFIG_NCPFS_NLS */ + +#define NCP_MIN_SYMLINK_SIZE 8 +#define NCP_MAX_SYMLINK_SIZE 512 + +#define NCP_BLOCK_SHIFT 9 +#define NCP_BLOCK_SIZE (1 << (NCP_BLOCK_SHIFT)) + +int ncp_negotiate_buffersize(struct ncp_server *, int, int *); +int ncp_negotiate_size_and_options(struct ncp_server *server, int size, + int options, int *ret_size, int *ret_options); + +int ncp_get_volume_info_with_number(struct ncp_server* server, int n, + struct ncp_volume_info *target); + +int ncp_get_directory_info(struct ncp_server* server, __u8 dirhandle, + struct ncp_volume_info* target); + +int ncp_close_file(struct ncp_server *, const char *); +static inline int ncp_read_bounce_size(__u32 size) { + return sizeof(struct ncp_reply_header) + 2 + 2 + size + 8; +}; +int ncp_read_bounce(struct ncp_server *, const char *, __u32, __u16, + char __user *, int *, void* bounce, __u32 bouncelen); +int ncp_read_kernel(struct ncp_server *, const char *, __u32, __u16, + char *, int *); +int ncp_write_kernel(struct ncp_server *, const char *, __u32, __u16, + const char *, int *); + +static inline void ncp_inode_close(struct inode *inode) { + atomic_dec(&NCP_FINFO(inode)->opened); +} + +void ncp_extract_file_info(const void* src, struct nw_info_struct* target); +int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *, + struct nw_info_struct *target); +int 
ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target); +int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns); +int ncp_get_volume_root(struct ncp_server *server, const char *volname, + __u32 *volume, __le32 *dirent, __le32 *dosdirent); +int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *); +int ncp_modify_file_or_subdir_dos_info(struct ncp_server *, struct inode *, + __le32, const struct nw_modify_dos_info *info); +int ncp_modify_file_or_subdir_dos_info_path(struct ncp_server *, struct inode *, + const char* path, __le32, const struct nw_modify_dos_info *info); +int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent, + __u32 mode, __u32 rdev); + +int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*); +int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *); +int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *, + int, __le32, __le16, struct ncp_entry_info *); + +int ncp_initialize_search(struct ncp_server *, struct inode *, + struct nw_search_sequence *target); +int ncp_search_for_fileset(struct ncp_server *server, + struct nw_search_sequence *seq, + int* more, int* cnt, + char* buffer, size_t bufsize, + char** rbuf, size_t* rsize); + +int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, + struct inode *, const char *, struct inode *, const char *); + + +int +ncp_LogPhysicalRecord(struct ncp_server *server, + const char *file_id, __u8 locktype, + __u32 offset, __u32 length, __u16 timeout); + +#ifdef CONFIG_NCPFS_IOCTL_LOCKING +int +ncp_ClearPhysicalRecord(struct ncp_server *server, + const char *file_id, + __u32 offset, __u32 length); +#endif /* CONFIG_NCPFS_IOCTL_LOCKING */ + +int +ncp_mount_subdir(struct ncp_server *, __u8, __u8, __le32, + __u32* volume, __le32* dirent, __le32* dosdirent); +int ncp_dirhandle_alloc(struct ncp_server *, __u8 vol, __le32 dirent, __u8 *dirhandle); +int ncp_dirhandle_free(struct ncp_server *, __u8 dirhandle); + +int ncp_create_new(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev, __le32 attributes); + +static inline int ncp_is_nfs_extras(struct ncp_server* server, unsigned int volnum) { +#ifdef CONFIG_NCPFS_NFS_NS + return (server->m.flags & NCP_MOUNT_NFS_EXTRAS) && + (server->name_space[volnum] == NW_NS_NFS); +#else + return 0; +#endif +} + +#ifdef CONFIG_NCPFS_NLS + +int ncp__io2vol(struct ncp_server *, unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); +int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); + +#define NCP_ESC ':' +#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io) +#define ncp_tolower(t, c) nls_tolower(t, c) +#define ncp_toupper(t, c) nls_toupper(t, c) +#define ncp_strnicmp(t, s1, s2, len) \ + nls_strnicmp(t, s1, s2, len) +#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(S,m,i,n,k,U) +#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(S,m,i,n,k,U) + +#else + +int ncp__io2vol(unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); +int ncp__vol2io(unsigned char *, unsigned int *, + const unsigned char *, unsigned int, int); + +#define NCP_IO_TABLE(sb) NULL +#define ncp_tolower(t, c) tolower(c) +#define ncp_toupper(t, c) toupper(c) +#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) +#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) + + +static inline int ncp_strnicmp(const struct nls_table *t, + const unsigned char *s1, const unsigned char *s2, int len) 
+{ + while (len--) { + if (tolower(*s1++) != tolower(*s2++)) + return 1; + } + + return 0; +} + +#endif /* CONFIG_NCPFS_NLS */ + +#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time) +#define NCP_MAX_AGE(server) atomic_read(&(server)->dentry_ttl) +#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server)) + +static inline void +ncp_age_dentry(struct ncp_server* server, struct dentry* dentry) +{ + dentry->d_time = jiffies - NCP_MAX_AGE(server); +} + +static inline void +ncp_new_dentry(struct dentry* dentry) +{ + dentry->d_time = jiffies; +} + +static inline void +ncp_renew_dentries(struct dentry *parent) +{ + struct ncp_server *server = NCP_SERVER(parent->d_inode); + struct list_head *next; + struct dentry *dentry; + + spin_lock(&parent->d_lock); + next = parent->d_subdirs.next; + while (next != &parent->d_subdirs) { + dentry = list_entry(next, struct dentry, d_u.d_child); + + if (dentry->d_fsdata == NULL) + ncp_age_dentry(server, dentry); + else + ncp_new_dentry(dentry); + + next = next->next; + } + spin_unlock(&parent->d_lock); +} + +static inline void +ncp_invalidate_dircache_entries(struct dentry *parent) +{ + struct ncp_server *server = NCP_SERVER(parent->d_inode); + struct list_head *next; + struct dentry *dentry; + + spin_lock(&parent->d_lock); + next = parent->d_subdirs.next; + while (next != &parent->d_subdirs) { + dentry = list_entry(next, struct dentry, d_u.d_child); + dentry->d_fsdata = NULL; + ncp_age_dentry(server, dentry); + next = next->next; + } + spin_unlock(&parent->d_lock); +} + +struct ncp_cache_head { + time_t mtime; + unsigned long time; /* cache age */ + unsigned long end; /* last valid fpos in cache */ + int eof; +}; + +#define NCP_DIRCACHE_SIZE ((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *))) +union ncp_dir_cache { + struct ncp_cache_head head; + struct dentry *dentry[NCP_DIRCACHE_SIZE]; +}; + +#define NCP_FIRSTCACHE_SIZE ((int)((NCP_DIRCACHE_SIZE * \ + sizeof(struct dentry *) - sizeof(struct ncp_cache_head)) / \ + sizeof(struct dentry *))) + +#define NCP_DIRCACHE_START (NCP_DIRCACHE_SIZE - NCP_FIRSTCACHE_SIZE) + +struct ncp_cache_control { + struct ncp_cache_head head; + struct page *page; + union ncp_dir_cache *cache; + unsigned long fpos, ofs; + int filled, valid, idx; +}; + +#endif /* _NCPLIB_H */ diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c new file mode 100644 index 00000000..08907599 --- /dev/null +++ b/fs/ncpfs/ncpsign_kernel.c @@ -0,0 +1,127 @@ +/* + * ncpsign_kernel.c + * + * Arne de Bruijn (arne@knoware.nl), 1997 + * + */ + + +#ifdef CONFIG_NCPFS_PACKET_SIGNING + +#include +#include +#include +#include "ncp_fs.h" +#include "ncpsign_kernel.h" + +/* i386: 32-bit, little endian, handles mis-alignment */ +#ifdef __i386__ +#define GET_LE32(p) (*(const int *)(p)) +#define PUT_LE32(p,v) { *(int *)(p)=v; } +#else +/* from include/ncplib.h */ +#define BVAL(buf,pos) (((const __u8 *)(buf))[pos]) +#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos)) +#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val)) + +static inline __u16 +WVAL_LH(const __u8 * buf, int pos) +{ + return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8; +} +static inline __u32 +DVAL_LH(const __u8 * buf, int pos) +{ + return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16; +} +static inline void +WSET_LH(__u8 * buf, int pos, __u16 val) +{ + BSET(buf, pos, val & 0xff); + BSET(buf, pos + 1, val >> 8); +} +static inline void +DSET_LH(__u8 * buf, int pos, __u32 val) +{ + WSET_LH(buf, pos, val & 0xffff); + WSET_LH(buf, pos + 2, val >> 16); +} + +#define GET_LE32(p) 
DVAL_LH(p,0) +#define PUT_LE32(p,v) DSET_LH(p,0,v) +#endif + +static void nwsign(char *r_data1, char *r_data2, char *outdata) { + int i; + unsigned int w0,w1,w2,w3; + static int rbit[4]={0, 2, 1, 3}; +#ifdef __i386__ + unsigned int *data2=(unsigned int *)r_data2; +#else + unsigned int data2[16]; + for (i=0;i<16;i++) + data2[i]=GET_LE32(r_data2+(i<<2)); +#endif + w0=GET_LE32(r_data1); + w1=GET_LE32(r_data1+4); + w2=GET_LE32(r_data1+8); + w3=GET_LE32(r_data1+12); + for (i=0;i<16;i+=4) { + w0=rol32(w0 + ((w1 & w2) | ((~w1) & w3)) + data2[i+0],3); + w3=rol32(w3 + ((w0 & w1) | ((~w0) & w2)) + data2[i+1],7); + w2=rol32(w2 + ((w3 & w0) | ((~w3) & w1)) + data2[i+2],11); + w1=rol32(w1 + ((w2 & w3) | ((~w2) & w0)) + data2[i+3],19); + } + for (i=0;i<4;i++) { + w0=rol32(w0 + (((w2 | w3) & w1) | (w2 & w3)) + 0x5a827999 + data2[i+0],3); + w3=rol32(w3 + (((w1 | w2) & w0) | (w1 & w2)) + 0x5a827999 + data2[i+4],5); + w2=rol32(w2 + (((w0 | w1) & w3) | (w0 & w1)) + 0x5a827999 + data2[i+8],9); + w1=rol32(w1 + (((w3 | w0) & w2) | (w3 & w0)) + 0x5a827999 + data2[i+12],13); + } + for (i=0;i<4;i++) { + w0=rol32(w0 + ((w1 ^ w2) ^ w3) + 0x6ed9eba1 + data2[rbit[i]+0],3); + w3=rol32(w3 + ((w0 ^ w1) ^ w2) + 0x6ed9eba1 + data2[rbit[i]+8],9); + w2=rol32(w2 + ((w3 ^ w0) ^ w1) + 0x6ed9eba1 + data2[rbit[i]+4],11); + w1=rol32(w1 + ((w2 ^ w3) ^ w0) + 0x6ed9eba1 + data2[rbit[i]+12],15); + } + PUT_LE32(outdata,(w0+GET_LE32(r_data1)) & 0xffffffff); + PUT_LE32(outdata+4,(w1+GET_LE32(r_data1+4)) & 0xffffffff); + PUT_LE32(outdata+8,(w2+GET_LE32(r_data1+8)) & 0xffffffff); + PUT_LE32(outdata+12,(w3+GET_LE32(r_data1+12)) & 0xffffffff); +} + +/* Make a signature for the current packet and add it at the end of the */ +/* packet. */ +void __sign_packet(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, void *sign_buff) { + unsigned char data[64]; + + memcpy(data, server->sign_root, 8); + *(__u32*)(data + 8) = totalsize; + if (size < 52) { + memcpy(data + 12, packet, size); + memset(data + 12 + size, 0, 52 - size); + } else { + memcpy(data + 12, packet, 52); + } + nwsign(server->sign_last, data, server->sign_last); + memcpy(sign_buff, server->sign_last, 8); +} + +int sign_verify_reply(struct ncp_server *server, const char *packet, size_t size, __u32 totalsize, const void *sign_buff) { + unsigned char data[64]; + unsigned char hash[16]; + + memcpy(data, server->sign_root, 8); + *(__u32*)(data + 8) = totalsize; + if (size < 52) { + memcpy(data + 12, packet, size); + memset(data + 12 + size, 0, 52 - size); + } else { + memcpy(data + 12, packet, 52); + } + nwsign(server->sign_last, data, hash); + return memcmp(sign_buff, hash, 8); +} + +#endif /* CONFIG_NCPFS_PACKET_SIGNING */ + diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h new file mode 100644 index 00000000..d9a1438b --- /dev/null +++ b/fs/ncpfs/ncpsign_kernel.h @@ -0,0 +1,26 @@ +/* + * ncpsign_kernel.h + * + * Arne de Bruijn (arne@knoware.nl), 1997 + * + */ + +#ifndef _NCPSIGN_KERNEL_H +#define _NCPSIGN_KERNEL_H + +#ifdef CONFIG_NCPFS_PACKET_SIGNING +void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); +int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); +#endif + +static inline size_t sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff) { +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (server->sign_active) { + __sign_packet(server, data, size, totalsize, sign_buff); + 
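+		/* __sign_packet() has stored an 8-byte signature in sign_buff */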
return 8; + } +#endif + return 0; +} + +#endif diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c new file mode 100644 index 00000000..3a158722 --- /dev/null +++ b/fs/ncpfs/sock.c @@ -0,0 +1,880 @@ +/* + * linux/fs/ncpfs/sock.c + * + * Copyright (C) 1992, 1993 Rick Sladkey + * + * Modified 1995, 1996 by Volker Lendecke to be usable for ncp + * Modified 1997 Peter Waltenberg, Bill Hawes, David Woodhouse for 2.1 dcache + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncp_fs.h" + +#include "ncpsign_kernel.h" + +static int _recv(struct socket *sock, void *buf, int size, unsigned flags) +{ + struct msghdr msg = {NULL, }; + struct kvec iov = {buf, size}; + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); +} + +static inline int do_send(struct socket *sock, struct kvec *vec, int count, + int len, unsigned flags) +{ + struct msghdr msg = { .msg_flags = flags }; + return kernel_sendmsg(sock, &msg, vec, count, len); +} + +static int _send(struct socket *sock, const void *buff, int len) +{ + struct kvec vec; + vec.iov_base = (void *) buff; + vec.iov_len = len; + return do_send(sock, &vec, 1, len, 0); +} + +struct ncp_request_reply { + struct list_head req; + wait_queue_head_t wq; + atomic_t refs; + unsigned char* reply_buf; + size_t datalen; + int result; + enum { RQ_DONE, RQ_INPROGRESS, RQ_QUEUED, RQ_IDLE, RQ_ABANDONED } status; + struct kvec* tx_ciov; + size_t tx_totallen; + size_t tx_iovlen; + struct kvec tx_iov[3]; + u_int16_t tx_type; + u_int32_t sign[6]; +}; + +static inline struct ncp_request_reply* ncp_alloc_req(void) +{ + struct ncp_request_reply *req; + + req = kmalloc(sizeof(struct ncp_request_reply), GFP_KERNEL); + if (!req) + return NULL; + + init_waitqueue_head(&req->wq); + atomic_set(&req->refs, (1)); + req->status = RQ_IDLE; + + return req; +} + +static void ncp_req_get(struct ncp_request_reply *req) +{ + atomic_inc(&req->refs); +} + +static void ncp_req_put(struct ncp_request_reply *req) +{ + if (atomic_dec_and_test(&req->refs)) + kfree(req); +} + +void ncp_tcp_data_ready(struct sock *sk, int len) +{ + struct ncp_server *server = sk->sk_user_data; + + server->data_ready(sk, len); + schedule_work(&server->rcv.tq); +} + +void ncp_tcp_error_report(struct sock *sk) +{ + struct ncp_server *server = sk->sk_user_data; + + server->error_report(sk); + schedule_work(&server->rcv.tq); +} + +void ncp_tcp_write_space(struct sock *sk) +{ + struct ncp_server *server = sk->sk_user_data; + + /* We do not need any locking: we first set tx.creq, and then we do sendmsg, + not vice versa... 
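+	   A wake-up when nothing is queued is harmless: ncp_tcp_tx_proc() takes
+	   rcv.creq_mutex and __ncptcp_try_send() simply returns if tx.creq is NULL.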
*/ + server->write_space(sk); + if (server->tx.creq) + schedule_work(&server->tx.tq); +} + +void ncpdgram_timeout_call(unsigned long v) +{ + struct ncp_server *server = (void*)v; + + schedule_work(&server->timeout_tq); +} + +static inline void ncp_finish_request(struct ncp_server *server, struct ncp_request_reply *req, int result) +{ + req->result = result; + if (req->status != RQ_ABANDONED) + memcpy(req->reply_buf, server->rxbuf, req->datalen); + req->status = RQ_DONE; + wake_up_all(&req->wq); + ncp_req_put(req); +} + +static void __abort_ncp_connection(struct ncp_server *server) +{ + struct ncp_request_reply *req; + + ncp_invalidate_conn(server); + del_timer(&server->timeout_tm); + while (!list_empty(&server->tx.requests)) { + req = list_entry(server->tx.requests.next, struct ncp_request_reply, req); + + list_del_init(&req->req); + ncp_finish_request(server, req, -EIO); + } + req = server->rcv.creq; + if (req) { + server->rcv.creq = NULL; + ncp_finish_request(server, req, -EIO); + server->rcv.ptr = NULL; + server->rcv.state = 0; + } + req = server->tx.creq; + if (req) { + server->tx.creq = NULL; + ncp_finish_request(server, req, -EIO); + } +} + +static inline int get_conn_number(struct ncp_reply_header *rp) +{ + return rp->conn_low | (rp->conn_high << 8); +} + +static inline void __ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err) +{ + /* If req is done, we got signal, but we also received answer... */ + switch (req->status) { + case RQ_IDLE: + case RQ_DONE: + break; + case RQ_QUEUED: + list_del_init(&req->req); + ncp_finish_request(server, req, err); + break; + case RQ_INPROGRESS: + req->status = RQ_ABANDONED; + break; + case RQ_ABANDONED: + break; + } +} + +static inline void ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err) +{ + mutex_lock(&server->rcv.creq_mutex); + __ncp_abort_request(server, req, err); + mutex_unlock(&server->rcv.creq_mutex); +} + +static inline void __ncptcp_abort(struct ncp_server *server) +{ + __abort_ncp_connection(server); +} + +static int ncpdgram_send(struct socket *sock, struct ncp_request_reply *req) +{ + struct kvec vec[3]; + /* sock_sendmsg updates iov pointers for us :-( */ + memcpy(vec, req->tx_ciov, req->tx_iovlen * sizeof(vec[0])); + return do_send(sock, vec, req->tx_iovlen, + req->tx_totallen, MSG_DONTWAIT); +} + +static void __ncptcp_try_send(struct ncp_server *server) +{ + struct ncp_request_reply *rq; + struct kvec *iov; + struct kvec iovc[3]; + int result; + + rq = server->tx.creq; + if (!rq) + return; + + /* sock_sendmsg updates iov pointers for us :-( */ + memcpy(iovc, rq->tx_ciov, rq->tx_iovlen * sizeof(iov[0])); + result = do_send(server->ncp_sock, iovc, rq->tx_iovlen, + rq->tx_totallen, MSG_NOSIGNAL | MSG_DONTWAIT); + + if (result == -EAGAIN) + return; + + if (result < 0) { + printk(KERN_ERR "ncpfs: tcp: Send failed: %d\n", result); + __ncp_abort_request(server, rq, result); + return; + } + if (result >= rq->tx_totallen) { + server->rcv.creq = rq; + server->tx.creq = NULL; + return; + } + rq->tx_totallen -= result; + iov = rq->tx_ciov; + while (iov->iov_len <= result) { + result -= iov->iov_len; + iov++; + rq->tx_iovlen--; + } + iov->iov_base += result; + iov->iov_len -= result; + rq->tx_ciov = iov; +} + +static inline void ncp_init_header(struct ncp_server *server, struct ncp_request_reply *req, struct ncp_request_header *h) +{ + req->status = RQ_INPROGRESS; + h->conn_low = server->connection; + h->conn_high = server->connection >> 8; + h->sequence = ++server->sequence; 
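+	/* the reply is later matched back to this request by sequence and
+	   connection number, see ncpdgram_rcv_proc() and __ncptcp_rcv_proc() */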
+} + +static void ncpdgram_start_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + size_t signlen; + struct ncp_request_header* h; + + req->tx_ciov = req->tx_iov + 1; + + h = req->tx_iov[1].iov_base; + ncp_init_header(server, req, h); + signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, + req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1, + cpu_to_le32(req->tx_totallen), req->sign); + if (signlen) { + req->tx_ciov[1].iov_base = req->sign; + req->tx_ciov[1].iov_len = signlen; + req->tx_iovlen += 1; + req->tx_totallen += signlen; + } + server->rcv.creq = req; + server->timeout_last = server->m.time_out; + server->timeout_retries = server->m.retry_count; + ncpdgram_send(server->ncp_sock, req); + mod_timer(&server->timeout_tm, jiffies + server->m.time_out); +} + +#define NCP_TCP_XMIT_MAGIC (0x446D6454) +#define NCP_TCP_XMIT_VERSION (1) +#define NCP_TCP_RCVD_MAGIC (0x744E6350) + +static void ncptcp_start_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + size_t signlen; + struct ncp_request_header* h; + + req->tx_ciov = req->tx_iov; + h = req->tx_iov[1].iov_base; + ncp_init_header(server, req, h); + signlen = sign_packet(server, req->tx_iov[1].iov_base + sizeof(struct ncp_request_header) - 1, + req->tx_iov[1].iov_len - sizeof(struct ncp_request_header) + 1, + cpu_to_be32(req->tx_totallen + 24), req->sign + 4) + 16; + + req->sign[0] = htonl(NCP_TCP_XMIT_MAGIC); + req->sign[1] = htonl(req->tx_totallen + signlen); + req->sign[2] = htonl(NCP_TCP_XMIT_VERSION); + req->sign[3] = htonl(req->datalen + 8); + req->tx_iov[0].iov_base = req->sign; + req->tx_iov[0].iov_len = signlen; + req->tx_iovlen += 1; + req->tx_totallen += signlen; + + server->tx.creq = req; + __ncptcp_try_send(server); +} + +static inline void __ncp_start_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + /* we copy the data so that we do not depend on the caller + staying alive */ + memcpy(server->txbuf, req->tx_iov[1].iov_base, req->tx_iov[1].iov_len); + req->tx_iov[1].iov_base = server->txbuf; + + if (server->ncp_sock->type == SOCK_STREAM) + ncptcp_start_request(server, req); + else + ncpdgram_start_request(server, req); +} + +static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *req) +{ + mutex_lock(&server->rcv.creq_mutex); + if (!ncp_conn_valid(server)) { + mutex_unlock(&server->rcv.creq_mutex); + printk(KERN_ERR "ncpfs: tcp: Server died\n"); + return -EIO; + } + ncp_req_get(req); + if (server->tx.creq || server->rcv.creq) { + req->status = RQ_QUEUED; + list_add_tail(&req->req, &server->tx.requests); + mutex_unlock(&server->rcv.creq_mutex); + return 0; + } + __ncp_start_request(server, req); + mutex_unlock(&server->rcv.creq_mutex); + return 0; +} + +static void __ncp_next_request(struct ncp_server *server) +{ + struct ncp_request_reply *req; + + server->rcv.creq = NULL; + if (list_empty(&server->tx.requests)) { + return; + } + req = list_entry(server->tx.requests.next, struct ncp_request_reply, req); + list_del_init(&req->req); + __ncp_start_request(server, req); +} + +static void info_server(struct ncp_server *server, unsigned int id, const void * data, size_t len) +{ + if (server->info_sock) { + struct kvec iov[2]; + __be32 hdr[2]; + + hdr[0] = cpu_to_be32(len + 8); + hdr[1] = cpu_to_be32(id); + + iov[0].iov_base = hdr; + iov[0].iov_len = 8; + iov[1].iov_base = (void *) data; + iov[1].iov_len = len; + + do_send(server->info_sock, iov, 2, len + 8, MSG_NOSIGNAL); + } +} + +void 
ncpdgram_rcv_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, rcv.tq); + struct socket* sock; + + sock = server->ncp_sock; + + while (1) { + struct ncp_reply_header reply; + int result; + + result = _recv(sock, &reply, sizeof(reply), MSG_PEEK | MSG_DONTWAIT); + if (result < 0) { + break; + } + if (result >= sizeof(reply)) { + struct ncp_request_reply *req; + + if (reply.type == NCP_WATCHDOG) { + unsigned char buf[10]; + + if (server->connection != get_conn_number(&reply)) { + goto drop; + } + result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT); + if (result < 0) { + DPRINTK("recv failed with %d\n", result); + continue; + } + if (result < 10) { + DPRINTK("too short (%u) watchdog packet\n", result); + continue; + } + if (buf[9] != '?') { + DPRINTK("bad signature (%02X) in watchdog packet\n", buf[9]); + continue; + } + buf[9] = 'Y'; + _send(sock, buf, sizeof(buf)); + continue; + } + if (reply.type != NCP_POSITIVE_ACK && reply.type != NCP_REPLY) { + result = _recv(sock, server->unexpected_packet.data, sizeof(server->unexpected_packet.data), MSG_DONTWAIT); + if (result < 0) { + continue; + } + info_server(server, 0, server->unexpected_packet.data, result); + continue; + } + mutex_lock(&server->rcv.creq_mutex); + req = server->rcv.creq; + if (req && (req->tx_type == NCP_ALLOC_SLOT_REQUEST || (server->sequence == reply.sequence && + server->connection == get_conn_number(&reply)))) { + if (reply.type == NCP_POSITIVE_ACK) { + server->timeout_retries = server->m.retry_count; + server->timeout_last = NCP_MAX_RPC_TIMEOUT; + mod_timer(&server->timeout_tm, jiffies + NCP_MAX_RPC_TIMEOUT); + } else if (reply.type == NCP_REPLY) { + result = _recv(sock, server->rxbuf, req->datalen, MSG_DONTWAIT); +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (result >= 0 && server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { + if (result < 8 + 8) { + result = -EIO; + } else { + unsigned int hdrl; + + result -= 8; + hdrl = sock->sk->sk_family == AF_INET ? 8 : 6; + if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) { + printk(KERN_INFO "ncpfs: Signature violation\n"); + result = -EIO; + } + } + } +#endif + del_timer(&server->timeout_tm); + server->rcv.creq = NULL; + ncp_finish_request(server, req, result); + __ncp_next_request(server); + mutex_unlock(&server->rcv.creq_mutex); + continue; + } + } + mutex_unlock(&server->rcv.creq_mutex); + } +drop:; + _recv(sock, &reply, sizeof(reply), MSG_DONTWAIT); + } +} + +static void __ncpdgram_timeout_proc(struct ncp_server *server) +{ + /* If timer is pending, we are processing another request... 
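+	   Otherwise the pending request is simply retransmitted and the timeout
+	   is doubled, capped at NCP_MAX_RPC_TIMEOUT; on a soft mount
+	   (NCP_MOUNT_SOFT) the request is aborted with -ETIMEDOUT once
+	   timeout_retries is exhausted.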
*/ + if (!timer_pending(&server->timeout_tm)) { + struct ncp_request_reply* req; + + req = server->rcv.creq; + if (req) { + int timeout; + + if (server->m.flags & NCP_MOUNT_SOFT) { + if (server->timeout_retries-- == 0) { + __ncp_abort_request(server, req, -ETIMEDOUT); + return; + } + } + /* Ignore errors */ + ncpdgram_send(server->ncp_sock, req); + timeout = server->timeout_last << 1; + if (timeout > NCP_MAX_RPC_TIMEOUT) { + timeout = NCP_MAX_RPC_TIMEOUT; + } + server->timeout_last = timeout; + mod_timer(&server->timeout_tm, jiffies + timeout); + } + } +} + +void ncpdgram_timeout_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, timeout_tq); + mutex_lock(&server->rcv.creq_mutex); + __ncpdgram_timeout_proc(server); + mutex_unlock(&server->rcv.creq_mutex); +} + +static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len) +{ + int result; + + if (buffer) { + result = _recv(server->ncp_sock, buffer, len, MSG_DONTWAIT); + } else { + static unsigned char dummy[1024]; + + if (len > sizeof(dummy)) { + len = sizeof(dummy); + } + result = _recv(server->ncp_sock, dummy, len, MSG_DONTWAIT); + } + if (result < 0) { + return result; + } + if (result > len) { + printk(KERN_ERR "ncpfs: tcp: bug in recvmsg (%u > %Zu)\n", result, len); + return -EIO; + } + return result; +} + +static int __ncptcp_rcv_proc(struct ncp_server *server) +{ + /* We have to check the result, so store the complete header */ + while (1) { + int result; + struct ncp_request_reply *req; + int datalen; + int type; + + while (server->rcv.len) { + result = do_tcp_rcv(server, server->rcv.ptr, server->rcv.len); + if (result == -EAGAIN) { + return 0; + } + if (result <= 0) { + req = server->rcv.creq; + if (req) { + __ncp_abort_request(server, req, -EIO); + } else { + __ncptcp_abort(server); + } + if (result < 0) { + printk(KERN_ERR "ncpfs: tcp: error in recvmsg: %d\n", result); + } else { + DPRINTK(KERN_ERR "ncpfs: tcp: EOF\n"); + } + return -EIO; + } + if (server->rcv.ptr) { + server->rcv.ptr += result; + } + server->rcv.len -= result; + } + switch (server->rcv.state) { + case 0: + if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) { + printk(KERN_ERR "ncpfs: tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); + __ncptcp_abort(server); + return -EIO; + } + datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF; + if (datalen < 10) { + printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + __ncptcp_abort(server); + return -EIO; + } +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (server->sign_active) { + if (datalen < 18) { + printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + __ncptcp_abort(server); + return -EIO; + } + server->rcv.buf.len = datalen - 8; + server->rcv.ptr = (unsigned char*)&server->rcv.buf.p1; + server->rcv.len = 8; + server->rcv.state = 4; + break; + } +#endif + type = ntohs(server->rcv.buf.type); +#ifdef CONFIG_NCPFS_PACKET_SIGNING +cont:; +#endif + if (type != NCP_REPLY) { + if (datalen - 8 <= sizeof(server->unexpected_packet.data)) { + *(__u16*)(server->unexpected_packet.data) = htons(type); + server->unexpected_packet.len = datalen - 8; + + server->rcv.state = 5; + server->rcv.ptr = server->unexpected_packet.data + 2; + server->rcv.len = datalen - 10; + break; + } + DPRINTK("ncpfs: tcp: Unexpected NCP type %02X\n", type); +skipdata2:; + server->rcv.state = 2; +skipdata:; + server->rcv.ptr = NULL; + server->rcv.len = datalen - 10; + break; + } + req = server->rcv.creq; + if (!req) { + DPRINTK(KERN_ERR "ncpfs: 
Reply without appropriate request\n"); + goto skipdata2; + } + if (datalen > req->datalen + 8) { + printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); + server->rcv.state = 3; + goto skipdata; + } + req->datalen = datalen - 8; + ((struct ncp_reply_header*)server->rxbuf)->type = NCP_REPLY; + server->rcv.ptr = server->rxbuf + 2; + server->rcv.len = datalen - 10; + server->rcv.state = 1; + break; +#ifdef CONFIG_NCPFS_PACKET_SIGNING + case 4: + datalen = server->rcv.buf.len; + type = ntohs(server->rcv.buf.type2); + goto cont; +#endif + case 1: + req = server->rcv.creq; + if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) { + if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) { + printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n"); + __ncp_abort_request(server, req, -EIO); + return -EIO; + } + if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) { + printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n"); + __ncp_abort_request(server, req, -EIO); + return -EIO; + } + } +#ifdef CONFIG_NCPFS_PACKET_SIGNING + if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { + if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) { + printk(KERN_ERR "ncpfs: tcp: Signature violation\n"); + __ncp_abort_request(server, req, -EIO); + return -EIO; + } + } +#endif + ncp_finish_request(server, req, req->datalen); + nextreq:; + __ncp_next_request(server); + case 2: + next:; + server->rcv.ptr = (unsigned char*)&server->rcv.buf; + server->rcv.len = 10; + server->rcv.state = 0; + break; + case 3: + ncp_finish_request(server, server->rcv.creq, -EIO); + goto nextreq; + case 5: + info_server(server, 0, server->unexpected_packet.data, server->unexpected_packet.len); + goto next; + } + } +} + +void ncp_tcp_rcv_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, rcv.tq); + + mutex_lock(&server->rcv.creq_mutex); + __ncptcp_rcv_proc(server); + mutex_unlock(&server->rcv.creq_mutex); +} + +void ncp_tcp_tx_proc(struct work_struct *work) +{ + struct ncp_server *server = + container_of(work, struct ncp_server, tx.tq); + + mutex_lock(&server->rcv.creq_mutex); + __ncptcp_try_send(server); + mutex_unlock(&server->rcv.creq_mutex); +} + +static int do_ncp_rpc_call(struct ncp_server *server, int size, + unsigned char* reply_buf, int max_reply_size) +{ + int result; + struct ncp_request_reply *req; + + req = ncp_alloc_req(); + if (!req) + return -ENOMEM; + + req->reply_buf = reply_buf; + req->datalen = max_reply_size; + req->tx_iov[1].iov_base = server->packet; + req->tx_iov[1].iov_len = size; + req->tx_iovlen = 1; + req->tx_totallen = size; + req->tx_type = *(u_int16_t*)server->packet; + + result = ncp_add_request(server, req); + if (result < 0) + goto out; + + if (wait_event_interruptible(req->wq, req->status == RQ_DONE)) { + ncp_abort_request(server, req, -EINTR); + result = -EINTR; + goto out; + } + + result = req->result; + +out: + ncp_req_put(req); + + return result; +} + +/* + * We need the server to be locked here, so check! 
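+ *
+ * The calling convention used all over ncplib_kernel.c is roughly
+ * (a sketch only; ncp_init_request() is expected to take the lock that
+ * is checked below):
+ *
+ *	ncp_init_request(server);
+ *	ncp_add_byte(server, ...);		build the request in server->packet
+ *	result = ncp_request(server, function);
+ *	... ncp_reply_byte()/ncp_reply_dword() to pick the reply apart ...
+ *	ncp_unlock_server(server);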
+ */ + +static int ncp_do_request(struct ncp_server *server, int size, + void* reply, int max_reply_size) +{ + int result; + + if (server->lock == 0) { + printk(KERN_ERR "ncpfs: Server not locked!\n"); + return -EIO; + } + if (!ncp_conn_valid(server)) { + return -EIO; + } + { + sigset_t old_set; + unsigned long mask, flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + old_set = current->blocked; + if (current->flags & PF_EXITING) + mask = 0; + else + mask = sigmask(SIGKILL); + if (server->m.flags & NCP_MOUNT_INTR) { + /* FIXME: This doesn't seem right at all. So, like, + we can't handle SIGINT and get whatever to stop? + What if we've blocked it ourselves? What about + alarms? Why, in fact, are we mucking with the + sigmask at all? -- r~ */ + if (current->sighand->action[SIGINT - 1].sa.sa_handler == SIG_DFL) + mask |= sigmask(SIGINT); + if (current->sighand->action[SIGQUIT - 1].sa.sa_handler == SIG_DFL) + mask |= sigmask(SIGQUIT); + } + siginitsetinv(¤t->blocked, mask); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + result = do_ncp_rpc_call(server, size, reply, max_reply_size); + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->blocked = old_set; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + + DDPRINTK("do_ncp_rpc_call returned %d\n", result); + + return result; +} + +/* ncp_do_request assures that at least a complete reply header is + * received. It assumes that server->current_size contains the ncp + * request size + */ +int ncp_request2(struct ncp_server *server, int function, + void* rpl, int size) +{ + struct ncp_request_header *h; + struct ncp_reply_header* reply = rpl; + int result; + + h = (struct ncp_request_header *) (server->packet); + if (server->has_subfunction != 0) { + *(__u16 *) & (h->data[0]) = htons(server->current_size - sizeof(*h) - 2); + } + h->type = NCP_REQUEST; + /* + * The server shouldn't know or care what task is making a + * request, so we always use the same task number. 
+ */ + h->task = 2; /* (current->pid) & 0xff; */ + h->function = function; + + result = ncp_do_request(server, server->current_size, reply, size); + if (result < 0) { + DPRINTK("ncp_request_error: %d\n", result); + goto out; + } + server->completion = reply->completion_code; + server->conn_status = reply->connection_state; + server->reply_size = result; + server->ncp_reply_size = result - sizeof(struct ncp_reply_header); + + result = reply->completion_code; + + if (result != 0) + PPRINTK("ncp_request: completion code=%x\n", result); +out: + return result; +} + +int ncp_connect(struct ncp_server *server) +{ + struct ncp_request_header *h; + int result; + + server->connection = 0xFFFF; + server->sequence = 255; + + h = (struct ncp_request_header *) (server->packet); + h->type = NCP_ALLOC_SLOT_REQUEST; + h->task = 2; /* see above */ + h->function = 0; + + result = ncp_do_request(server, sizeof(*h), server->packet, server->packet_size); + if (result < 0) + goto out; + server->connection = h->conn_low + (h->conn_high * 256); + result = 0; +out: + return result; +} + +int ncp_disconnect(struct ncp_server *server) +{ + struct ncp_request_header *h; + + h = (struct ncp_request_header *) (server->packet); + h->type = NCP_DEALLOC_SLOT_REQUEST; + h->task = 2; /* see above */ + h->function = 0; + + return ncp_do_request(server, sizeof(*h), server->packet, server->packet_size); +} + +void ncp_lock_server(struct ncp_server *server) +{ + mutex_lock(&server->mutex); + if (server->lock) + printk(KERN_WARNING "ncp_lock_server: was locked!\n"); + server->lock = 1; +} + +void ncp_unlock_server(struct ncp_server *server) +{ + if (!server->lock) { + printk(KERN_WARNING "ncp_unlock_server: was not locked!\n"); + return; + } + server->lock = 0; + mutex_unlock(&server->mutex); +} diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c new file mode 100644 index 00000000..661f861d --- /dev/null +++ b/fs/ncpfs/symlink.c @@ -0,0 +1,181 @@ +/* + * linux/fs/ncpfs/symlink.c + * + * Code for allowing symbolic links on NCPFS (i.e. NetWare) + * Symbolic links are not supported on native NetWare, so we use an + * infrequently-used flag (Sh) and store a two-word magic header in + * the file to make sure we don't accidentally use a non-link file + * as a link. + * + * When using the NFS namespace, we set the mode to indicate a symlink and + * don't bother with the magic numbers. + * + * from linux/fs/ext2/symlink.c + * + * Copyright (C) 1998-99, Frank A. Vorstenbosch + * + * ncpfs symlink handling code + * NLS support (c) 1999 Petr Vandrovec + * Modified 2000 Ben Harris, University of Cambridge for NFS NS meta-info + * + */ + + +#include + +#include +#include +#include +#include +#include +#include +#include "ncp_fs.h" + +/* these magic numbers must appear in the symlink file -- this makes it a bit + more resilient against the magic attributes being set on random files. 
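+
+   A kludge-mode link file (see the create path below, where hdr is 8) thus
+   begins with the eight bytes
+
+	73 79 6d 6c 6e 6b 2d 3e		"symlnk->", i.e. NCP_SYMLINK_MAGIC0 and
+					NCP_SYMLINK_MAGIC1 in little-endian byte order,
+
+   followed by the link target converted to the volume character set.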
*/ + +#define NCP_SYMLINK_MAGIC0 cpu_to_le32(0x6c6d7973) /* "symlnk->" */ +#define NCP_SYMLINK_MAGIC1 cpu_to_le32(0x3e2d6b6e) + +/* ----- read a symbolic link ------------------------------------------ */ + +static int ncp_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int error, length, len; + char *link, *rawlink; + char *buf = kmap(page); + + error = -ENOMEM; + rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); + if (!rawlink) + goto fail; + + if (ncp_make_open(inode,O_RDONLY)) + goto failEIO; + + error=ncp_read_kernel(NCP_SERVER(inode),NCP_FINFO(inode)->file_handle, + 0,NCP_MAX_SYMLINK_SIZE,rawlink,&length); + + ncp_inode_close(inode); + /* Close file handle if no other users... */ + ncp_make_closed(inode); + if (error) + goto failEIO; + + if (NCP_FINFO(inode)->flags & NCPI_KLUDGE_SYMLINK) { + if (lengthvolNumber)) + kludge = 0; + else +#ifdef CONFIG_NCPFS_EXTRAS + if (NCP_SERVER(dir)->m.flags & NCP_MOUNT_SYMLINKS) + kludge = 1; + else +#endif + /* EPERM is returned by VFS if symlink procedure does not exist */ + return -EPERM; + + rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); + if (!rawlink) + return -ENOMEM; + + if (kludge) { + mode = 0; + attr = aSHARED | aHIDDEN; + ((__le32 *)rawlink)[0]=NCP_SYMLINK_MAGIC0; + ((__le32 *)rawlink)[1]=NCP_SYMLINK_MAGIC1; + hdr = 8; + } else { + mode = S_IFLNK | S_IRWXUGO; + attr = 0; + hdr = 0; + } + + length = strlen(symname); + /* map to/from server charset, do not touch upper/lower case as + symlink can point out of ncp filesystem */ + outlen = NCP_MAX_SYMLINK_SIZE - hdr; + err = ncp_io2vol(NCP_SERVER(dir), rawlink + hdr, &outlen, symname, length, 0); + if (err) + goto failfree; + + outlen += hdr; + + err = -EIO; + if (ncp_create_new(dir,dentry,mode,0,attr)) { + goto failfree; + } + + inode=dentry->d_inode; + + if (ncp_make_open(inode, O_WRONLY)) + goto failfree; + + if (ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, + 0, outlen, rawlink, &i) || i!=outlen) { + goto fail; + } + + ncp_inode_close(inode); + ncp_make_closed(inode); + kfree(rawlink); + return 0; +fail:; + ncp_inode_close(inode); + ncp_make_closed(inode); +failfree:; + kfree(rawlink); + return err; +} + +/* ----- EOF ----- */ diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig new file mode 100644 index 00000000..81515545 --- /dev/null +++ b/fs/nfs/Kconfig @@ -0,0 +1,144 @@ +config NFS_FS + tristate "NFS client support" + depends on INET && FILE_LOCKING + select LOCKD + select SUNRPC + select NFS_ACL_SUPPORT if NFS_V3_ACL + help + Choose Y here if you want to access files residing on other + computers using Sun's Network File System protocol. To compile + this file system support as a module, choose M here: the module + will be called nfs. + + To mount file systems exported by NFS servers, you also need to + install the user space mount.nfs command which can be found in + the Linux nfs-utils package, available from http://linux-nfs.org/. + Information about using the mount command is available in the + mount(8) man page. More detail about the Linux NFS client + implementation is available via the nfs(5) man page. + + Below you can choose which versions of the NFS protocol are + available in the kernel to mount NFS servers. Support for NFS + version 2 (RFC 1094) is always available when NFS_FS is selected. + + To configure a system which mounts its root file system via NFS + at boot time, say Y here, select "Kernel level IP + autoconfiguration" in the NETWORK menu, and select "Root file + system on NFS" below. 
You cannot compile this file system as a + module in this case. + + If unsure, say N. + +config NFS_V3 + bool "NFS client support for NFS version 3" + depends on NFS_FS + help + This option enables support for version 3 of the NFS protocol + (RFC 1813) in the kernel's NFS client. + + If unsure, say Y. + +config NFS_V3_ACL + bool "NFS client support for the NFSv3 ACL protocol extension" + depends on NFS_V3 + help + Some NFS servers support an auxiliary NFSv3 ACL protocol that + Sun added to Solaris but never became an official part of the + NFS version 3 protocol. This protocol extension allows + applications on NFS clients to manipulate POSIX Access Control + Lists on files residing on NFS servers. NFS servers enforce + ACLs on local files whether this protocol is available or not. + + Choose Y here if your NFS server supports the Solaris NFSv3 ACL + protocol extension and you want your NFS client to allow + applications to access and modify ACLs on files on the server. + + Most NFS servers don't support the Solaris NFSv3 ACL protocol + extension. You can choose N here or specify the "noacl" mount + option to prevent your NFS client from trying to use the NFSv3 + ACL protocol. + + If unsure, say N. + +config NFS_V4 + bool "NFS client support for NFS version 4" + depends on NFS_FS + select SUNRPC_GSS + help + This option enables support for version 4 of the NFS protocol + (RFC 3530) in the kernel's NFS client. + + To mount NFS servers using NFSv4, you also need to install user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say Y. + +config NFS_V4_1 + bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" + depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select PNFS_FILE_LAYOUT + help + This option enables support for minor version 1 of the NFSv4 protocol + (RFC 5661) in the kernel's NFS client. + + If unsure, say N. + +config PNFS_FILE_LAYOUT + tristate + +config PNFS_OBJLAYOUT + tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" + depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD + help + Say M here if you want your pNFS client to support the Objects Layout Driver. + Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and + upper level driver (SCSI_OSD_ULD). + + If unsure, say N. + +config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP + help + If you want your system to mount its root file system via NFS, + choose Y here. This is common practice for managing systems + without local permanent storage. For details, read + . + + Most people say N here. + +config NFS_FSCACHE + bool "Provide NFS client caching support" + depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + help + Say Y here if you want NFS data to be cached locally on disc through + the general filesystem cache manager + +config NFS_USE_LEGACY_DNS + bool "Use the legacy NFS DNS resolver" + depends on NFS_V4 + help + The kernel now provides a method for translating a host name into an + IP address. Select Y here if you would rather use your own DNS + resolver script. + + If unsure, say N + +config NFS_USE_KERNEL_DNS + bool + depends on NFS_V4 && !NFS_USE_LEGACY_DNS + select DNS_RESOLVER + select KEYS + default y + +config NFS_USE_NEW_IDMAPPER + bool "Use the new idmapper upcall routine" + depends on NFS_V4 && KEYS + help + Say Y here if you want NFS to use the new idmapper upcall functions. + You will need /sbin/request-key (usually provided by the keyutils + package). 
For details, read + . + + If you are unsure, say N. diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile new file mode 100644 index 00000000..6a34f7dd --- /dev/null +++ b/fs/nfs/Makefile @@ -0,0 +1,25 @@ +# +# Makefile for the Linux nfs filesystem routines. +# + +obj-$(CONFIG_NFS_FS) += nfs.o + +nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ + direct.o pagelist.o proc.o read.o symlink.o unlink.o \ + write.o namespace.o mount_clnt.o \ + dns_resolve.o cache_lib.o +nfs-$(CONFIG_ROOT_NFS) += nfsroot.o +nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o +nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o +nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o +nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o +nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o + +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c new file mode 100644 index 00000000..84690319 --- /dev/null +++ b/fs/nfs/cache_lib.c @@ -0,0 +1,141 @@ +/* + * linux/fs/nfs/cache_lib.c + * + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache_lib.h" + +#define NFS_CACHE_UPCALL_PATHLEN 256 +#define NFS_CACHE_UPCALL_TIMEOUT 15 + +static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = + "/sbin/nfs_cache_getent"; +static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; + +module_param_string(cache_getent, nfs_cache_getent_prog, + sizeof(nfs_cache_getent_prog), 0600); +MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); +module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); +MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " + "the cache upcall is assumed to have failed"); + +int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[] = { + nfs_cache_getent_prog, + cd->name, + entry_name, + NULL + }; + int ret = -EACCES; + + if (nfs_cache_getent_prog[0] == '\0') + goto out; + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the 'cache_getent' parameter once the problem + * has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) + nfs_cache_getent_prog[0] = '\0'; +out: + return ret > 0 ? 
0 : ret; +} + +/* + * Deferred request handling + */ +void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) +{ + if (atomic_dec_and_test(&dreq->count)) + kfree(dreq); +} + +static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); + + complete_all(&dreq->completion); + nfs_cache_defer_req_put(dreq); +} + +static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(req, struct nfs_cache_defer_req, req); + dreq->deferred_req.revisit = nfs_dns_cache_revisit; + atomic_inc(&dreq->count); + + return &dreq->deferred_req; +} + +struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) +{ + struct nfs_cache_defer_req *dreq; + + dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); + if (dreq) { + init_completion(&dreq->completion); + atomic_set(&dreq->count, 1); + dreq->req.defer = nfs_dns_cache_defer; + } + return dreq; +} + +int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) +{ + if (wait_for_completion_timeout(&dreq->completion, + nfs_cache_getent_timeout * HZ) == 0) + return -ETIMEDOUT; + return 0; +} + +int nfs_cache_register(struct cache_detail *cd) +{ + struct nameidata nd; + struct vfsmount *mnt; + int ret; + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); + if (ret) + goto err; + ret = sunrpc_cache_register_pipefs(nd.path.dentry, + cd->name, 0600, cd); + path_put(&nd.path); + if (!ret) + return ret; +err: + rpc_put_mount(); + return ret; +} + +void nfs_cache_unregister(struct cache_detail *cd) +{ + sunrpc_cache_unregister_pipefs(cd); + rpc_put_mount(); +} + diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h new file mode 100644 index 00000000..76f856e2 --- /dev/null +++ b/fs/nfs/cache_lib.h @@ -0,0 +1,27 @@ +/* + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust + */ + +#include +#include +#include + +/* + * Deferred request handling + */ +struct nfs_cache_defer_req { + struct cache_req req; + struct cache_deferred_req deferred_req; + struct completion completion; + atomic_t count; +}; + +extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); +extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); +extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); +extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); + +extern int nfs_cache_register(struct cache_detail *cd); +extern void nfs_cache_unregister(struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c new file mode 100644 index 00000000..e3d29426 --- /dev/null +++ b/fs/nfs/callback.c @@ -0,0 +1,403 @@ +/* + * linux/fs/nfs/callback.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback handling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CALLBACK + +struct nfs_callback_data { + unsigned int users; + struct svc_serv *serv; + struct svc_rqst *rqst; + struct task_struct *task; +}; + +static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; +static DEFINE_MUTEX(nfs_callback_mutex); +static struct svc_program nfs4_callback_program; + +unsigned int nfs_callback_set_tcpport; +unsigned short 
nfs_callback_tcpport; +unsigned short nfs_callback_tcpport6; +#define NFS_CALLBACK_MAXPORTNR (65535U) + +static int param_set_portnr(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + int ret; + + if (!val) + return -EINVAL; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +static struct kernel_param_ops param_ops_portnr = { + .set = param_set_portnr, + .get = param_get_uint, +}; +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); + +/* + * This is the NFSv4 callback kernel thread. + */ +static int +nfs4_callback_svc(void *vrqstp) +{ + int err, preverr = 0; + struct svc_rqst *rqstp = vrqstp; + + set_freezable(); + + while (!kthread_should_stop()) { + /* + * Listen for a request on the socket + */ + err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); + if (err == -EAGAIN || err == -EINTR) { + preverr = err; + continue; + } + if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; + } + preverr = err; + svc_process(rqstp); + } + return 0; +} + +/* + * Prepare to bring up the NFSv4 callback service + */ +struct svc_rqst * +nfs4_callback_up(struct svc_serv *serv) +{ + int ret; + + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret <= 0) + goto out_err; + nfs_callback_tcpport = ret; + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport, PF_INET); + + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret > 0) { + nfs_callback_tcpport6 = ret; + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport6, PF_INET6); + } else if (ret == -EAFNOSUPPORT) + ret = 0; + else + goto out_err; + + return svc_prepare_thread(serv, &serv->sv_pools[0]); + +out_err: + if (ret == 0) + ret = -ENOMEM; + return ERR_PTR(ret); +} + +#if defined(CONFIG_NFS_V4_1) +/* + * The callback service for NFSv4.1 callbacks + */ +static int +nfs41_callback_svc(void *vrqstp) +{ + struct svc_rqst *rqstp = vrqstp; + struct svc_serv *serv = rqstp->rq_server; + struct rpc_rqst *req; + int error; + DEFINE_WAIT(wq); + + set_freezable(); + + while (!kthread_should_stop()) { + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + spin_lock_bh(&serv->sv_cb_lock); + if (!list_empty(&serv->sv_cb_list)) { + req = list_first_entry(&serv->sv_cb_list, + struct rpc_rqst, rq_bc_list); + list_del(&req->rq_bc_list); + spin_unlock_bh(&serv->sv_cb_lock); + dprintk("Invoking bc_svc_process()\n"); + error = bc_svc_process(serv, req, rqstp); + dprintk("bc_svc_process() returned w/ error code= %d\n", + error); + } else { + spin_unlock_bh(&serv->sv_cb_lock); + schedule(); + } + finish_wait(&serv->sv_cb_waitq, &wq); + } + return 0; +} + +/* + * Bring up the NFSv4.1 callback service + */ +struct svc_rqst * +nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) +{ + struct svc_rqst *rqstp; + int ret; + + /* + * Create an svc_sock for the back channel service that shares the + * fore channel connection. 
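 * (Editor's note, illustrative, not part of the original patch: unlike
 * NFSv4.0, which listens on a separate client-side socket for callbacks
 * (see nfs4_callback_up() above), an NFSv4.1 server sends CB_COMPOUND
 * requests back over the TCP connection the client already established.
 * That is why the transport class below is "tcp-bc" and the port is 0.)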
+ * Returns the input port (0) and sets the svc_serv bc_xprt on success + */ + ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); + if (ret < 0) { + rqstp = ERR_PTR(ret); + goto out; + } + + /* + * Save the svc_serv in the transport so that it can + * be referenced when the session backchannel is initialized + */ + xprt->bc_serv = serv; + + INIT_LIST_HEAD(&serv->sv_cb_list); + spin_lock_init(&serv->sv_cb_lock); + init_waitqueue_head(&serv->sv_cb_waitq); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(rqstp)) { + svc_xprt_put(serv->sv_bc_xprt); + serv->sv_bc_xprt = NULL; + } +out: + dprintk("--> %s return %ld\n", __func__, + IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); + return rqstp; +} + +static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, + struct svc_serv *serv, struct rpc_xprt *xprt, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + if (minorversion) { + *rqstpp = nfs41_callback_up(serv, xprt); + *callback_svc = nfs41_callback_svc; + } + return minorversion; +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct nfs_callback_data *cb_info) +{ + if (minorversion) + xprt->bc_serv = cb_info->serv; +} +#else +static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, + struct svc_serv *serv, struct rpc_xprt *xprt, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + return 0; +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct nfs_callback_data *cb_info) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Bring up the callback thread if it is not already up. + */ +int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) +{ + struct svc_serv *serv = NULL; + struct svc_rqst *rqstp; + int (*callback_svc)(void *vrqstp); + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + char svc_name[12]; + int ret = 0; + int minorversion_setup; + + mutex_lock(&nfs_callback_mutex); + if (cb_info->users++ || cb_info->task != NULL) { + nfs_callback_bc_serv(minorversion, xprt, cb_info); + goto out; + } + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + if (!serv) { + ret = -ENOMEM; + goto out_err; + } + + minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, + serv, xprt, &rqstp, &callback_svc); + if (!minorversion_setup) { + /* v4.0 callback setup */ + rqstp = nfs4_callback_up(serv); + callback_svc = nfs4_callback_svc; + } + + if (IS_ERR(rqstp)) { + ret = PTR_ERR(rqstp); + goto out_err; + } + + svc_sock_update_bufs(serv); + + sprintf(svc_name, "nfsv4.%u-svc", minorversion); + cb_info->serv = serv; + cb_info->rqst = rqstp; + cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); + if (IS_ERR(cb_info->task)) { + ret = PTR_ERR(cb_info->task); + svc_exit_thread(cb_info->rqst); + cb_info->rqst = NULL; + cb_info->task = NULL; + goto out_err; + } +out: + /* + * svc_create creates the svc_serv with sv_nrthreads == 1, and then + * svc_prepare_thread increments that. So we need to call svc_destroy + * on both success and failure so that the refcount is 1 when the + * thread exits. + */ + if (serv) + svc_destroy(serv); + mutex_unlock(&nfs_callback_mutex); + return ret; +out_err: + dprintk("NFS: Couldn't create callback socket or server thread; " + "err = %d\n", ret); + cb_info->users--; + goto out; +} + +/* + * Kill the callback thread if it's no longer being used. 
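 * (Editor's note, illustrative, not part of the original patch: this is
 * the teardown half of nfs_callback_up().  cb_info->users is a simple
 * reference count protected by nfs_callback_mutex; the svc thread and
 * svc_serv are only destroyed once the last user for this minor version
 * has dropped its reference.)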
+ */ +void nfs_callback_down(int minorversion) +{ + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + + mutex_lock(&nfs_callback_mutex); + cb_info->users--; + if (cb_info->users == 0 && cb_info->task != NULL) { + kthread_stop(cb_info->task); + svc_exit_thread(cb_info->rqst); + cb_info->serv = NULL; + cb_info->rqst = NULL; + cb_info->task = NULL; + } + mutex_unlock(&nfs_callback_mutex); +} + +/* Boolean check of RPC_AUTH_GSS principal */ +int +check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) +{ + struct rpc_clnt *r = clp->cl_rpcclient; + char *p = svc_gss_principal(rqstp); + + if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) + return 1; + + /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ + if (clp->cl_minorversion != 0) + return 0; + /* + * It might just be a normal user principal, in which case + * userspace won't bother to tell us the name at all. + */ + if (p == NULL) + return 0; + + /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ + + if (memcmp(p, "nfs@", 4) != 0) + return 0; + p += 4; + if (strcmp(p, r->cl_server) != 0) + return 0; + return 1; +} + +/* + * pg_authenticate method for nfsv4 callback threads. + * + * The authflavor has been negotiated, so an incorrect flavor is a server + * bug. Drop packets with incorrect authflavor. + * + * All other checking done after NFS decoding where the nfs_client can be + * found in nfs4_callback_compound + */ +static int nfs_callback_authenticate(struct svc_rqst *rqstp) +{ + switch (rqstp->rq_authop->flavour) { + case RPC_AUTH_NULL: + if (rqstp->rq_proc != CB_NULL) + return SVC_DROP; + break; + case RPC_AUTH_GSS: + /* No RPC_AUTH_GSS support yet in NFSv4.1 */ + if (svc_is_backchannel(rqstp)) + return SVC_DROP; + } + return SVC_OK; +} + +/* + * Define NFS4 callback program + */ +static struct svc_version *nfs4_callback_version[] = { + [1] = &nfs4_callback_version1, + [4] = &nfs4_callback_version4, +}; + +static struct svc_stat nfs4_callback_stats; + +static struct svc_program nfs4_callback_program = { + .pg_prog = NFS4_CALLBACK, /* RPC service number */ + .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ + .pg_vers = nfs4_callback_version, /* version table */ + .pg_name = "NFSv4 callback", /* service name */ + .pg_class = "nfs", /* authentication class */ + .pg_stats = &nfs4_callback_stats, + .pg_authenticate = nfs_callback_authenticate, +}; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h new file mode 100644 index 00000000..07df5f1d --- /dev/null +++ b/fs/nfs/callback.h @@ -0,0 +1,213 @@ +/* + * linux/fs/nfs/callback.h + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback definitions + */ +#ifndef __LINUX_FS_NFS_CALLBACK_H +#define __LINUX_FS_NFS_CALLBACK_H +#include + +#define NFS4_CALLBACK 0x40000000 +#define NFS4_CALLBACK_XDRSIZE 2048 +#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE) + +enum nfs4_callback_procnum { + CB_NULL = 0, + CB_COMPOUND = 1, +}; + +enum nfs4_callback_opnum { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, +/* Callback operations new to NFSv4.1 */ + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_ILLEGAL = 10044, +}; + +struct cb_process_state { + __be32 drc_status; + struct nfs_client *clp; + int slotid; +}; + +struct cb_compound_hdr_arg { + unsigned int taglen; + const char *tag; + unsigned int 
minorversion; + unsigned int cb_ident; /* v4.0 callback identifier */ + unsigned nops; +}; + +struct cb_compound_hdr_res { + __be32 *status; + unsigned int taglen; + const char *tag; + __be32 *nops; +}; + +struct cb_getattrargs { + struct sockaddr *addr; + struct nfs_fh fh; + uint32_t bitmap[2]; +}; + +struct cb_getattrres { + __be32 status; + uint32_t bitmap[2]; + uint64_t size; + uint64_t change_attr; + struct timespec ctime; + struct timespec mtime; +}; + +struct cb_recallargs { + struct sockaddr *addr; + struct nfs_fh fh; + nfs4_stateid stateid; + uint32_t truncate; +}; + +#if defined(CONFIG_NFS_V4_1) + +struct referring_call { + uint32_t rc_sequenceid; + uint32_t rc_slotid; +}; + +struct referring_call_list { + struct nfs4_sessionid rcl_sessionid; + uint32_t rcl_nrefcalls; + struct referring_call *rcl_refcalls; +}; + +struct cb_sequenceargs { + struct sockaddr *csa_addr; + struct nfs4_sessionid csa_sessionid; + uint32_t csa_sequenceid; + uint32_t csa_slotid; + uint32_t csa_highestslotid; + uint32_t csa_cachethis; + uint32_t csa_nrclists; + struct referring_call_list *csa_rclists; +}; + +struct cb_sequenceres { + __be32 csr_status; + struct nfs4_sessionid csr_sessionid; + uint32_t csr_sequenceid; + uint32_t csr_slotid; + uint32_t csr_highestslotid; + uint32_t csr_target_highestslotid; +}; + +extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps); + +extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); + +#define RCA4_TYPE_MASK_RDATA_DLG 0 +#define RCA4_TYPE_MASK_WDATA_DLG 1 +#define RCA4_TYPE_MASK_DIR_DLG 2 +#define RCA4_TYPE_MASK_FILE_LAYOUT 3 +#define RCA4_TYPE_MASK_BLK_LAYOUT 4 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 +#define RCA4_TYPE_MASK_ALL 0xf31f + +struct cb_recallanyargs { + struct sockaddr *craa_addr; + uint32_t craa_objs_to_keep; + uint32_t craa_type_mask; +}; + +extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_recallslotargs { + struct sockaddr *crsa_addr; + uint32_t crsa_target_max_slots; +}; +extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_layoutrecallargs { + struct sockaddr *cbl_addr; + uint32_t cbl_recall_type; + uint32_t cbl_layout_type; + uint32_t cbl_layoutchanged; + union { + struct { + struct nfs_fh cbl_fh; + struct pnfs_layout_range cbl_range; + nfs4_stateid cbl_stateid; + }; + struct nfs_fsid cbl_fsid; + }; +}; + +extern unsigned nfs4_callback_layoutrecall( + struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps); + +extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); + +struct cb_devicenotifyitem { + uint32_t cbd_notify_type; + uint32_t cbd_layout_type; + struct nfs4_deviceid cbd_dev_id; + uint32_t cbd_immediate; +}; + +struct cb_devicenotifyargs { + int ndevs; + struct cb_devicenotifyitem *devs; +}; + +extern __be32 nfs4_callback_devicenotify( + struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps); + +#endif /* CONFIG_NFS_V4_1 */ +extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); +extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps); +extern __be32 
nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps); +#ifdef CONFIG_NFS_V4 +extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); +extern void nfs_callback_down(int minorversion); +extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); +extern int nfs4_set_callback_sessionid(struct nfs_client *clp); +#endif /* CONFIG_NFS_V4 */ +/* + * nfs41: Callbacks are expected to not cause substantial latency, + * so we limit their concurrency to 1 by setting up the maximum number + * of slots for the backchannel. + */ +#define NFS41_BC_MIN_CALLBACKS 1 +#define NFS41_BC_MAX_CALLBACKS 1 + +extern unsigned int nfs_callback_set_tcpport; +extern unsigned short nfs_callback_tcpport; +extern unsigned short nfs_callback_tcpport6; + +#endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c new file mode 100644 index 00000000..b5c826e1 --- /dev/null +++ b/fs/nfs/callback_proc.c @@ -0,0 +1,561 @@ +/* + * linux/fs/nfs/callback_proc.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback procedures + */ +#include +#include +#include +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "internal.h" +#include "pnfs.h" + +#ifdef NFS_DEBUG +#define NFSDBG_FACILITY NFSDBG_CALLBACK +#endif + +__be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps) +{ + struct nfs_delegation *delegation; + struct nfs_inode *nfsi; + struct inode *inode; + + res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ + goto out; + + res->bitmap[0] = res->bitmap[1] = 0; + res->status = htonl(NFS4ERR_BADHANDLE); + + dprintk("NFS: GETATTR callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + nfsi = NFS_I(inode); + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) + goto out_iput; + res->size = i_size_read(inode); + res->change_attr = delegation->change_attr; + if (nfsi->npages != 0) + res->change_attr++; + res->ctime = inode->i_ctime; + res->mtime = inode->i_mtime; + res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & + args->bitmap[0]; + res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & + args->bitmap[1]; + res->status = 0; +out_iput: + rcu_read_unlock(); + iput(inode); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); + return res->status; +} + +__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps) +{ + struct inode *inode; + __be32 res; + + res = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. 
Set in cb_sequence for v4.1 */ + goto out; + + dprintk("NFS: RECALL callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + res = htonl(NFS4ERR_BADHANDLE); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + /* Set up a helper thread to actually return the delegation */ + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; + case -ENOENT: + if (res != 0) + res = htonl(NFS4ERR_BAD_STATEID); + break; + default: + res = htonl(NFS4ERR_RESOURCE); + } + iput(inode); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); + return res; +} + +int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, + sizeof(delegation->stateid.data)) != 0) + return 0; + return 1; +} + +#if defined(CONFIG_NFS_V4_1) + +static u32 initiate_file_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct pnfs_layout_hdr *lo; + struct inode *ino; + bool found = false; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + LIST_HEAD(free_me_list); + + spin_lock(&clp->cl_lock); + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + if (nfs_compare_fh(&args->cbl_fh, + &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + if (!ino) + continue; + found = true; + /* Without this, layout can be freed as soon + * as we release cl_lock. + */ + get_layout_hdr(lo); + break; + } + spin_unlock(&clp->cl_lock); + if (!found) + return NFS4ERR_NOMATCHING_LAYOUT; + + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + mark_matching_lsegs_invalid(lo, &free_me_list, + &args->cbl_range)) + rv = NFS4ERR_DELAY; + else + rv = NFS4ERR_NOMATCHING_LAYOUT; + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + put_layout_hdr(lo); + iput(ino); + return rv; +} + +static u32 initiate_bulk_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct pnfs_layout_hdr *lo; + struct inode *ino; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + struct pnfs_layout_hdr *tmp; + LIST_HEAD(recall_list); + LIST_HEAD(free_me_list); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + spin_lock(&clp->cl_lock); + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + if ((args->cbl_recall_type == RETURN_FSID) && + memcmp(&NFS_SERVER(lo->plh_inode)->fsid, + &args->cbl_fsid, sizeof(struct nfs_fsid))) + continue; + if (!igrab(lo->plh_inode)) + continue; + get_layout_hdr(lo); + BUG_ON(!list_empty(&lo->plh_bulk_recall)); + list_add(&lo->plh_bulk_recall, &recall_list); + } + spin_unlock(&clp->cl_lock); + list_for_each_entry_safe(lo, tmp, + &recall_list, plh_bulk_recall) { + ino = lo->plh_inode; + spin_lock(&ino->i_lock); + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (mark_matching_lsegs_invalid(lo, &free_me_list, &range)) + rv = NFS4ERR_DELAY; + list_del_init(&lo->plh_bulk_recall); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + put_layout_hdr(lo); + iput(ino); + } + return rv; +} + +static u32 do_callback_layoutrecall(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + u32 res = NFS4ERR_DELAY; + + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); + if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) + goto out; + if (args->cbl_recall_type 
== RETURN_FILE) + res = initiate_file_draining(clp, args); + else + res = initiate_bulk_draining(clp, args); + clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state); +out: + dprintk("%s returning %i\n", __func__, res); + return res; + +} + +__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps) +{ + u32 res; + + dprintk("%s: -->\n", __func__); + + if (cps->clp) + res = do_callback_layoutrecall(cps->clp, args); + else + res = NFS4ERR_OP_NOT_IN_SESSION; + + dprintk("%s: exit with status = %d\n", __func__, res); + return cpu_to_be32(res); +} + +static void pnfs_recall_all_layouts(struct nfs_client *clp) +{ + struct cb_layoutrecallargs args; + + /* Pretend we got a CB_LAYOUTRECALL(ALL) */ + memset(&args, 0, sizeof(args)); + args.cbl_recall_type = RETURN_ALL; + /* FIXME we ignore errors, what should we do? */ + do_callback_layoutrecall(clp, &args); +} + +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps) +{ + int i; + __be32 res = 0; + struct nfs_client *clp = cps->clp; + struct nfs_server *server = NULL; + + dprintk("%s: -->\n", __func__); + + if (!clp) { + res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + goto out; + } + + for (i = 0; i < args->ndevs; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + if (!server || + server->pnfs_curr_ld->id != dev->cbd_layout_type) { + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (server->pnfs_curr_ld && + server->pnfs_curr_ld->id == dev->cbd_layout_type) { + rcu_read_unlock(); + goto found; + } + rcu_read_unlock(); + dprintk("%s: layout type %u not found\n", + __func__, dev->cbd_layout_type); + continue; + } + + found: + if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) + dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, " + "deleting instead\n", __func__); + nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + } + +out: + kfree(args->devs); + dprintk("%s: exit with status = %u\n", + __func__, be32_to_cpu(res)); + return res; +} + +int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL) + return 0; + + if (stateid->stateid.seqid != 0) + return 0; + if (memcmp(&delegation->stateid.stateid.other, + &stateid->stateid.other, + NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +} + +/* + * Validate the sequenceID sent by the server. + * Return success if the sequenceID is one more than what we last saw on + * this slot, accounting for wraparound. Increments the slot's sequence. + * + * We don't yet implement a duplicate request cache, instead we set the + * back channel ca_maxresponsesize_cached to zero. This is OK for now + * since we only currently implement idempotent callbacks anyway. + * + * We have a single slot backchannel at this time, so we don't bother + * checking the used_slots bit array on the table. The lower layer guarantees + * a single outstanding callback request at a time. + */ +static __be32 +validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) +{ + struct nfs4_slot *slot; + + dprintk("%s enter. 
slotid %d seqid %d\n", + __func__, args->csa_slotid, args->csa_sequenceid); + + if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) + return htonl(NFS4ERR_BADSLOT); + + slot = tbl->slots + args->csa_slotid; + dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); + + /* Normal */ + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { + slot->seq_nr++; + goto out_ok; + } + + /* Replay */ + if (args->csa_sequenceid == slot->seq_nr) { + dprintk("%s seqid %d is a replay\n", + __func__, args->csa_sequenceid); + /* Signal process_op to set this error on next op */ + if (args->csa_cachethis == 0) + return htonl(NFS4ERR_RETRY_UNCACHED_REP); + + /* The ca_maxresponsesize_cached is 0 with no DRC */ + else if (args->csa_cachethis == 1) + return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + } + + /* Wraparound */ + if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { + slot->seq_nr = 1; + goto out_ok; + } + + /* Misordered request */ + return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: + tbl->highest_used_slotid = args->csa_slotid; + return htonl(NFS4_OK); +} + +/* + * For each referring call triple, check the session's slot table for + * a match. If the slot is in use and the sequence numbers match, the + * client is still waiting for a response to the original request. + */ +static bool referring_call_exists(struct nfs_client *clp, + uint32_t nrclists, + struct referring_call_list *rclists) +{ + bool status = 0; + int i, j; + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + struct referring_call_list *rclist; + struct referring_call *ref; + + /* + * XXX When client trunking is implemented, this becomes + * a session lookup from within the loop + */ + session = clp->cl_session; + tbl = &session->fc_slot_table; + + for (i = 0; i < nrclists; i++) { + rclist = &rclists[i]; + if (memcmp(session->sess_id.data, + rclist->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + for (j = 0; j < rclist->rcl_nrefcalls; j++) { + ref = &rclist->rcl_refcalls[j]; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " + "slotid %u\n", __func__, + ((u32 *)&rclist->rcl_sessionid.data)[0], + ((u32 *)&rclist->rcl_sessionid.data)[1], + ((u32 *)&rclist->rcl_sessionid.data)[2], + ((u32 *)&rclist->rcl_sessionid.data)[3], + ref->rc_sequenceid, ref->rc_slotid); + + spin_lock(&tbl->slot_tbl_lock); + status = (test_bit(ref->rc_slotid, tbl->used_slots) && + tbl->slots[ref->rc_slotid].seq_nr == + ref->rc_sequenceid); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + } + } + +out: + return status; +} + +__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *tbl; + struct nfs_client *clp; + int i; + __be32 status = htonl(NFS4ERR_BADSESSION); + + clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); + if (clp == NULL) + goto out; + + tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* state manager is resetting the session */ + if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + spin_unlock(&tbl->slot_tbl_lock); + status = htonl(NFS4ERR_DELAY); + /* Return NFS4ERR_BADSESSION if we're draining the session + * in order to reset it. 
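 * (Editor's note, illustrative, not part of the original patch:
 * NFS4ERR_DELAY asks the server to retry the callback later, which is the
 * right answer while the state manager is merely draining the session;
 * if the session is actually being reset, NFS4ERR_BADSESSION tells the
 * server to stop using it altogether.)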
+ */ + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + status = htonl(NFS4ERR_BADSESSION); + goto out; + } + + status = validate_seqid(&clp->cl_session->bc_slot_table, args); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + + cps->slotid = args->csa_slotid; + + /* + * Check for pending referring calls. If a match is found, a + * related callback was received before the response to the original + * call. + */ + if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + memcpy(&res->csr_sessionid, &args->csa_sessionid, + sizeof(res->csr_sessionid)); + res->csr_sequenceid = args->csa_sequenceid; + res->csr_slotid = args->csa_slotid; + res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + +out: + cps->clp = clp; /* put in nfs4_callback_compound */ + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + cps->drc_status = status; + status = 0; + } else + res->csr_status = status; + + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, + ntohl(status), ntohl(res->csr_status)); + return status; +} + +static bool +validate_bitmap_values(unsigned long mask) +{ + return (mask & ~RCA4_TYPE_MASK_ALL) == 0; +} + +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, + struct cb_process_state *cps) +{ + __be32 status; + fmode_t flags = 0; + + status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + status = cpu_to_be32(NFS4ERR_INVAL); + if (!validate_bitmap_values(args->craa_type_mask)) + goto out; + + status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; + if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) + &args->craa_type_mask)) + pnfs_recall_all_layouts(cps->clp); + if (flags) + nfs_expire_all_delegation_types(cps->clp, flags); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +/* Reduce the fore channel's max_slots to the target value */ +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *fc_tbl; + __be32 status; + + status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), + args->crsa_target_max_slots); + + fc_tbl = &cps->clp->cl_session->fc_slot_table; + + status = htonl(NFS4ERR_BAD_HIGH_SLOT); + if (args->crsa_target_max_slots > fc_tbl->max_slots || + args->crsa_target_max_slots < 1) + goto out; + + status = htonl(NFS4_OK); + if (args->crsa_target_max_slots == fc_tbl->max_slots) + goto out; + + fc_tbl->target_max_slots = args->crsa_target_max_slots; + nfs41_handle_recall_slot(cps->clp); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c new file mode 100644 index 00000000..918ad647 
--- /dev/null +++ b/fs/nfs/callback_xdr.c @@ -0,0 +1,989 @@ +/* + * linux/fs/nfs/callback_xdr.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback encode/decode procedures + */ +#include +#include +#include +#include +#include +#include +#include "nfs4_fs.h" +#include "callback.h" +#include "internal.h" + +#define CB_OP_TAGLEN_MAXSZ (512) +#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) +#define CB_OP_GETATTR_BITMAP_MAXSZ (4) +#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + CB_OP_GETATTR_BITMAP_MAXSZ + \ + 2 + 2 + 3 + 3) +#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + +#if defined(CONFIG_NFS_V4_1) +#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) +#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#endif /* CONFIG_NFS_V4_1 */ + +#define NFSDBG_FACILITY NFSDBG_CALLBACK + +/* Internal error code */ +#define NFS4ERR_RESOURCE_HDR 11050 + +typedef __be32 (*callback_process_op_t)(void *, void *, + struct cb_process_state *); +typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); +typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); + + +struct callback_op { + callback_process_op_t process_op; + callback_decode_arg_t decode_args; + callback_encode_res_t encode_res; + long res_maxsize; +}; + +static struct callback_op callback_ops[]; + +static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return htonl(NFS4_OK); +} + +static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, nbytes); + if (unlikely(p == NULL)) + printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); + return p; +} + +static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str) +{ + __be32 *p; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *len = ntohl(*p); + + if (*len != 0) { + p = read_buf(xdr, *len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *str = (const char *)p; + } else + *str = NULL; + + return 0; +} + +static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + fh->size = ntohl(*p); + if (fh->size > NFS4_FHSIZE) + return htonl(NFS4ERR_BADHANDLE); + p = read_buf(xdr, fh->size); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + memcpy(&fh->data[0], p, fh->size); + memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size); + return 0; +} + +static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + __be32 *p; + unsigned int attrlen; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + attrlen = ntohl(*p); + p = read_buf(xdr, attrlen << 2); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + if (likely(attrlen > 0)) + bitmap[0] = ntohl(*p++); + if (attrlen > 1) + bitmap[1] = ntohl(*p); + return 0; +} + +static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + __be32 *p; + + p = 
read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + memcpy(stateid->data, p, 16); + return 0; +} + +static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) +{ + __be32 *p; + __be32 status; + + status = decode_string(xdr, &hdr->taglen, &hdr->tag); + if (unlikely(status != 0)) + return status; + /* We do not like overly long tags! */ + if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { + printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", + __func__, hdr->taglen); + return htonl(NFS4ERR_RESOURCE); + } + p = read_buf(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + hdr->minorversion = ntohl(*p++); + /* Check minor version is zero or one. */ + if (hdr->minorversion <= 1) { + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ + } else { + printk(KERN_WARNING "%s: NFSv4 server callback with " + "illegal minor version %u!\n", + __func__, hdr->minorversion); + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); + } + hdr->nops = ntohl(*p); + dprintk("%s: minorversion %d nops %d\n", __func__, + hdr->minorversion, hdr->nops); + return 0; +} + +static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) +{ + __be32 *p; + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE_HDR); + *op = ntohl(*p); + return 0; +} + +static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args) +{ + __be32 status; + + status = decode_fh(xdr, &args->fh); + if (unlikely(status != 0)) + goto out; + args->addr = svc_addr(rqstp); + status = decode_bitmap(xdr, args->bitmap); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) +{ + __be32 *p; + __be32 status; + + args->addr = svc_addr(rqstp); + status = decode_stateid(xdr, &args->stateid); + if (unlikely(status != 0)) + goto out; + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_RESOURCE); + goto out; + } + args->truncate = ntohl(*p); + status = decode_fh(xdr, &args->fh); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_layoutrecallargs *args) +{ + __be32 *p; + __be32 status = 0; + uint32_t iomode; + + args->cbl_addr = svc_addr(rqstp); + p = read_buf(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->cbl_layout_type = ntohl(*p++); + /* Depite the spec's xdr, iomode really belongs in the FILE switch, + * as it is unusable and ignored with the other types. 
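 * (Editor's note, illustrative, not part of the original patch: the
 * decoded argument order on the wire is layout type, iomode, the
 * layoutchanged flag and the recall type, followed by a recall-type
 * specific body: RETURN_FILE carries a filehandle, an offset/length
 * range and a stateid, RETURN_FSID carries an fsid, and RETURN_ALL
 * carries nothing further.)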
+ */ + iomode = ntohl(*p++); + args->cbl_layoutchanged = ntohl(*p++); + args->cbl_recall_type = ntohl(*p++); + + if (args->cbl_recall_type == RETURN_FILE) { + args->cbl_range.iomode = iomode; + status = decode_fh(xdr, &args->cbl_fh); + if (unlikely(status != 0)) + goto out; + + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_range.offset); + p = xdr_decode_hyper(p, &args->cbl_range.length); + status = decode_stateid(xdr, &args->cbl_stateid); + if (unlikely(status != 0)) + goto out; + } else if (args->cbl_recall_type == RETURN_FSID) { + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_fsid.major); + p = xdr_decode_hyper(p, &args->cbl_fsid.minor); + } else if (args->cbl_recall_type != RETURN_ALL) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", + __func__, + args->cbl_layout_type, iomode, + args->cbl_layoutchanged, args->cbl_recall_type); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static +__be32 decode_devicenotify_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_devicenotifyargs *args) +{ + __be32 *p; + __be32 status = 0; + u32 tmp; + int n, i; + args->ndevs = 0; + + /* Num of device notifications */ + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + n = ntohl(*p++); + if (n <= 0) + goto out; + + args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); + if (!args->devs) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + /* Decode each dev notification */ + for (i = 0; i < n; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + + tmp = ntohl(*p++); /* bitmap size */ + if (tmp != 1) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_notify_type = ntohl(*p++); + if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && + dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + + tmp = ntohl(*p++); /* opaque size */ + if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && + (tmp != NFS4_DEVICEID4_SIZE + 8)) || + ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && + (tmp != NFS4_DEVICEID4_SIZE + 4))) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_layout_type = ntohl(*p++); + memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + dev->cbd_immediate = ntohl(*p++); + } else { + dev->cbd_immediate = 0; + } + + args->ndevs++; + + dprintk("%s: type %d layout 0x%x immediate %d\n", + __func__, dev->cbd_notify_type, dev->cbd_layout_type, + dev->cbd_immediate); + } +out: + dprintk("%s: status %d ndevs %d\n", + __func__, ntohl(status), args->ndevs); + return status; +err: + kfree(args->devs); + goto out; +} + +static __be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = read_buf(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + 
memcpy(sid->data, p, len); + return 0; +} + +static __be32 decode_rc_list(struct xdr_stream *xdr, + struct referring_call_list *rc_list) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &rc_list->rcl_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + rc_list->rcl_nrefcalls = ntohl(*p++); + if (rc_list->rcl_nrefcalls) { + p = read_buf(xdr, + rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls * + sizeof(*rc_list->rcl_refcalls), + GFP_KERNEL); + if (unlikely(rc_list->rcl_refcalls == NULL)) + goto out; + for (i = 0; i < rc_list->rcl_nrefcalls; i++) { + rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++); + rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++); + } + } + status = 0; + +out: + return status; +} + +static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_sequenceargs *args) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &args->csa_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, 5 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + args->csa_addr = svc_addr(rqstp); + args->csa_sequenceid = ntohl(*p++); + args->csa_slotid = ntohl(*p++); + args->csa_highestslotid = ntohl(*p++); + args->csa_cachethis = ntohl(*p++); + args->csa_nrclists = ntohl(*p++); + args->csa_rclists = NULL; + if (args->csa_nrclists) { + args->csa_rclists = kmalloc(args->csa_nrclists * + sizeof(*args->csa_rclists), + GFP_KERNEL); + if (unlikely(args->csa_rclists == NULL)) + goto out; + + for (i = 0; i < args->csa_nrclists; i++) { + status = decode_rc_list(xdr, &args->csa_rclists[i]); + if (status) + goto out_free; + } + } + status = 0; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u " + "highestslotid %u cachethis %d nrclists %u\n", + __func__, + ((u32 *)&args->csa_sessionid)[0], + ((u32 *)&args->csa_sessionid)[1], + ((u32 *)&args->csa_sessionid)[2], + ((u32 *)&args->csa_sessionid)[3], + args->csa_sequenceid, args->csa_slotid, + args->csa_highestslotid, args->csa_cachethis, + args->csa_nrclists); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; + +out_free: + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + goto out; +} + +static __be32 decode_recallany_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallanyargs *args) +{ + __be32 *p; + + args->craa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_objs_to_keep = ntohl(*p++); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_type_mask = ntohl(*p); + + return 0; +} + +static __be32 decode_recallslot_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallslotargs *args) +{ + __be32 *p; + + args->crsa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->crsa_target_max_slots = ntohl(*p++); + return 0; +} + +#endif /* CONFIG_NFS_V4_1 */ + +static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + xdr_encode_opaque(p, str, len); + return 0; 
+} + +#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) +#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) +static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep) +{ + __be32 bm[2]; + __be32 *p; + + bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); + bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); + if (bm[1] != 0) { + p = xdr_reserve_space(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(2); + *p++ = bm[0]; + *p++ = bm[1]; + } else if (bm[0] != 0) { + p = xdr_reserve_space(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(1); + *p++ = bm[0]; + } else { + p = xdr_reserve_space(xdr, 8); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(0); + } + *savep = p; + return 0; +} + +static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change) +{ + __be32 *p; + + if (!(bitmap[0] & FATTR4_WORD0_CHANGE)) + return 0; + p = xdr_reserve_space(xdr, 8); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, change); + return 0; +} + +static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size) +{ + __be32 *p; + + if (!(bitmap[0] & FATTR4_WORD0_SIZE)) + return 0; + p = xdr_reserve_space(xdr, 8); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, size); + return 0; +} + +static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 12); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, time->tv_sec); + *p = htonl(time->tv_nsec); + return 0; +} + +static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr) +{ + __be32 status; + + hdr->status = xdr_reserve_space(xdr, 4); + if (unlikely(hdr->status == NULL)) + return htonl(NFS4ERR_RESOURCE); + status = encode_string(xdr, hdr->taglen, hdr->tag); + if (unlikely(status != 0)) + return status; + hdr->nops = xdr_reserve_space(xdr, 4); + if (unlikely(hdr->nops == NULL)) + return htonl(NFS4ERR_RESOURCE); + return 0; +} + +static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE_HDR); + *p++ = htonl(op); + *p = res; + return 0; +} + +static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res) +{ + __be32 *savep = NULL; + __be32 status = res->status; + + if (unlikely(status != 0)) + goto out; + status = encode_attr_bitmap(xdr, res->bitmap, &savep); + if (unlikely(status != 0)) + goto out; + status = encode_attr_change(xdr, res->bitmap, res->change_attr); + if (unlikely(status != 0)) + goto out; + status = encode_attr_size(xdr, res->bitmap, res->size); + if (unlikely(status != 0)) + goto out; + status = encode_attr_ctime(xdr, res->bitmap, &res->ctime); + if (unlikely(status != 0)) + goto out; + status = encode_attr_mtime(xdr, 
res->bitmap, &res->mtime); + *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 encode_sessionid(struct xdr_stream *xdr, + const struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = xdr_reserve_space(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + memcpy(p, sid, len); + return 0; +} + +static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + const struct cb_sequenceres *res) +{ + __be32 *p; + unsigned status = res->csr_status; + + if (unlikely(status != 0)) + goto out; + + encode_sessionid(xdr, &res->csr_sessionid); + + p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + *p++ = htonl(res->csr_sequenceid); + *p++ = htonl(res->csr_slotid); + *p++ = htonl(res->csr_highestslotid); + *p++ = htonl(res->csr_target_highestslotid); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + if (op_nr == OP_CB_SEQUENCE) { + if (nop != 0) + return htonl(NFS4ERR_SEQUENCE_POS); + } else { + if (nop == 0) + return htonl(NFS4ERR_OP_NOT_IN_SESSION); + } + + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: + case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: + case OP_CB_WANTS_CANCELLED: + case OP_CB_NOTIFY_LOCK: + return htonl(NFS4ERR_NOTSUPP); + + default: + return htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static void nfs4_callback_free_slot(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl = &session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* + * Let the state manager know callback processing done. + * A single slot, so highest used slotid is either 0 or -1 + */ + tbl->highest_used_slotid = -1; + nfs4_check_drain_bc_complete(session); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ + if (cps->slotid != -1) + nfs4_callback_free_slot(cps->clp->cl_session); +} + +#else /* CONFIG_NFS_V4_1 */ + +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +static __be32 +preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) +{ + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + *op = &callback_ops[op_nr]; + break; + default: + return htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static __be32 process_op(uint32_t minorversion, int nop, + struct svc_rqst *rqstp, + struct xdr_stream *xdr_in, void *argp, + struct xdr_stream *xdr_out, void *resp, + struct cb_process_state *cps) +{ + struct callback_op *op = &callback_ops[0]; + unsigned int op_nr; + __be32 status; + long maxlen; + __be32 res; + + dprintk("%s: start\n", __func__); + status = decode_op_hdr(xdr_in, &op_nr); + if (unlikely(status)) + return status; + + dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", + __func__, minorversion, nop, op_nr); + + status = minorversion ? 
preprocess_nfs41_op(nop, op_nr, &op) : + preprocess_nfs4_op(op_nr, &op); + if (status == htonl(NFS4ERR_OP_ILLEGAL)) + op_nr = OP_CB_ILLEGAL; + if (status) + goto encode_hdr; + + if (cps->drc_status) { + status = cps->drc_status; + goto encode_hdr; + } + + maxlen = xdr_out->end - xdr_out->p; + if (maxlen > 0 && maxlen < PAGE_SIZE) { + status = op->decode_args(rqstp, xdr_in, argp); + if (likely(status == 0)) + status = op->process_op(argp, resp, cps); + } else + status = htonl(NFS4ERR_RESOURCE); + +encode_hdr: + res = encode_op_hdr(xdr_out, op_nr, status); + if (unlikely(res)) + return res; + if (op->encode_res != NULL && status == 0) + status = op->encode_res(rqstp, xdr_out, resp); + dprintk("%s: done, status = %d\n", __func__, ntohl(status)); + return status; +} + +/* + * Decode, process and encode a COMPOUND + */ +static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp) +{ + struct cb_compound_hdr_arg hdr_arg = { 0 }; + struct cb_compound_hdr_res hdr_res = { NULL }; + struct xdr_stream xdr_in, xdr_out; + __be32 *p, status; + struct cb_process_state cps = { + .drc_status = 0, + .clp = NULL, + .slotid = -1, + }; + unsigned int nops = 0; + + dprintk("%s: start\n", __func__); + + xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); + + p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); + xdr_init_encode(&xdr_out, &rqstp->rq_res, p); + + status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); + if (status == __constant_htonl(NFS4ERR_RESOURCE)) + return rpc_garbage_args; + + if (hdr_arg.minorversion == 0) { + cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); + if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) + return rpc_drop_reply; + } + + hdr_res.taglen = hdr_arg.taglen; + hdr_res.tag = hdr_arg.tag; + if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) + return rpc_system_err; + + while (status == 0 && nops != hdr_arg.nops) { + status = process_op(hdr_arg.minorversion, nops, rqstp, + &xdr_in, argp, &xdr_out, resp, &cps); + nops++; + } + + /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return + * resource error in cb_compound status without returning op */ + if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) { + status = htonl(NFS4ERR_RESOURCE); + nops--; + } + + *hdr_res.status = status; + *hdr_res.nops = htonl(nops); + nfs4_cb_free_slot(&cps); + nfs_put_client(cps.clp); + dprintk("%s: done, status = %u\n", __func__, ntohl(status)); + return rpc_success; +} + +/* + * Define NFS4 callback COMPOUND ops. 
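+ * Each entry supplies the op's argument decoder, its processing routine, an
+ * optional result encoder, and the maximum size of its encoded reply.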
+ */ +static struct callback_op callback_ops[] = { + [0] = { + .res_maxsize = CB_OP_HDR_RES_MAXSZ, + }, + [OP_CB_GETATTR] = { + .process_op = (callback_process_op_t)nfs4_callback_getattr, + .decode_args = (callback_decode_arg_t)decode_getattr_args, + .encode_res = (callback_encode_res_t)encode_getattr_res, + .res_maxsize = CB_OP_GETATTR_RES_MAXSZ, + }, + [OP_CB_RECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_recall, + .decode_args = (callback_decode_arg_t)decode_recall_args, + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, +#if defined(CONFIG_NFS_V4_1) + [OP_CB_LAYOUTRECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, + .decode_args = + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, + [OP_CB_NOTIFY_DEVICEID] = { + .process_op = (callback_process_op_t)nfs4_callback_devicenotify, + .decode_args = + (callback_decode_arg_t)decode_devicenotify_args, + .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, + }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, + .encode_res = (callback_encode_res_t)encode_cb_sequence_res, + .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ, + }, + [OP_CB_RECALL_ANY] = { + .process_op = (callback_process_op_t)nfs4_callback_recallany, + .decode_args = (callback_decode_arg_t)decode_recallany_args, + .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, + }, + [OP_CB_RECALL_SLOT] = { + .process_op = (callback_process_op_t)nfs4_callback_recallslot, + .decode_args = (callback_decode_arg_t)decode_recallslot_args, + .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, + }, +#endif /* CONFIG_NFS_V4_1 */ +}; + +/* + * Define NFS4 callback procedures + */ +static struct svc_procedure nfs4_callback_procedures1[] = { + [CB_NULL] = { + .pc_func = nfs4_callback_null, + .pc_decode = (kxdrproc_t)nfs4_decode_void, + .pc_encode = (kxdrproc_t)nfs4_encode_void, + .pc_xdrressize = 1, + }, + [CB_COMPOUND] = { + .pc_func = nfs4_callback_compound, + .pc_encode = (kxdrproc_t)nfs4_encode_void, + .pc_argsize = 256, + .pc_ressize = 256, + .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, + } +}; + +struct svc_version nfs4_callback_version1 = { + .vs_vers = 1, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, + .vs_hidden = 1, +}; + +struct svc_version nfs4_callback_version4 = { + .vs_vers = 4, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, +}; diff --git a/fs/nfs/client.c b/fs/nfs/client.c new file mode 100644 index 00000000..b3dc2b88 --- /dev/null +++ b/fs/nfs/client.c @@ -0,0 +1,2004 @@ +/* client.c: NFS client sharing and management code + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +static DEFINE_SPINLOCK(nfs_client_lock); +static LIST_HEAD(nfs_client_list); +static LIST_HEAD(nfs_volume_list); +static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); +#ifdef CONFIG_NFS_V4 +static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */ + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ + int ret = 0; + + if (clp->rpc_ops->version != 4 || minorversion != 0) + return ret; +retry: + if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) + return -ENOMEM; + spin_lock(&nfs_client_lock); + ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); + spin_unlock(&nfs_client_lock); + if (ret == -EAGAIN) + goto retry; + return ret; +} +#endif /* CONFIG_NFS_V4 */ + +/* + * Turn off NFSv4 uid/gid mapping when using AUTH_SYS + */ +static int nfs4_disable_idmapping = 0; + +/* + * RPC cruft for NFS + */ +static struct rpc_version *nfs_version[5] = { + [2] = &nfs_version2, +#ifdef CONFIG_NFS_V3 + [3] = &nfs_version3, +#endif +#ifdef CONFIG_NFS_V4 + [4] = &nfs_version4, +#endif +}; + +struct rpc_program nfs_program = { + .name = "nfs", + .number = NFS_PROGRAM, + .nrvers = ARRAY_SIZE(nfs_version), + .version = nfs_version, + .stats = &nfs_rpcstat, + .pipe_dir_name = "/nfs", +}; + +struct rpc_stat nfs_rpcstat = { + .program = &nfs_program +}; + + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; +static struct rpc_version * nfsacl_version[] = { + [3] = &nfsacl_version3, +}; + +struct rpc_program nfsacl_program = { + .name = "nfsacl", + .number = NFS_ACL_PROGRAM, + .nrvers = ARRAY_SIZE(nfsacl_version), + .version = nfsacl_version, + .stats = &nfsacl_rpcstat, +}; +#endif /* CONFIG_NFS_V3_ACL */ + +struct nfs_client_initdata { + const char *hostname; + const struct sockaddr *addr; + size_t addrlen; + const struct nfs_rpc_ops *rpc_ops; + int proto; + u32 minorversion; +}; + +/* + * Allocate a shared client record + * + * Since these are allocated/deallocated very rarely, we don't + * bother putting them in a slab cache... 
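+ * (a plain kzalloc()/kfree() pair is used instead).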
+ */ +static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) +{ + struct nfs_client *clp; + struct rpc_cred *cred; + int err = -ENOMEM; + + if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) + goto error_0; + + clp->rpc_ops = cl_init->rpc_ops; + + atomic_set(&clp->cl_count, 1); + clp->cl_cons_state = NFS_CS_INITING; + + memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); + clp->cl_addrlen = cl_init->addrlen; + + if (cl_init->hostname) { + err = -ENOMEM; + clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); + if (!clp->cl_hostname) + goto error_cleanup; + } + + INIT_LIST_HEAD(&clp->cl_superblocks); + clp->cl_rpcclient = ERR_PTR(-EINVAL); + + clp->cl_proto = cl_init->proto; + +#ifdef CONFIG_NFS_V4 + err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); + if (err) + goto error_cleanup; + + spin_lock_init(&clp->cl_lock); + INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); + rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; +#endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; +#if defined(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&clp->cl_layouts); +#endif + nfs_fscache_get_client_cookie(clp); + + return clp; + +error_cleanup: + kfree(clp); +error_0: + return ERR_PTR(err); +} + +#ifdef CONFIG_NFS_V4 +#ifdef CONFIG_NFS_V4_1 +static void nfs4_shutdown_session(struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + nfs4_destroy_session(clp->cl_session); +} +#else /* CONFIG_NFS_V4_1 */ +static void nfs4_shutdown_session(struct nfs_client *clp) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Destroy the NFS4 callback service + */ +static void nfs4_destroy_callback(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(clp->cl_mvops->minor_version); +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) + nfs4_kill_renewd(clp); + nfs4_shutdown_session(clp); + nfs4_destroy_callback(clp); + if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) + nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); +} + +/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ +void nfs_cleanup_cb_ident_idr(void) +{ + idr_destroy(&cb_ident_idr); +} + +/* nfs_client_lock held */ +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ + if (clp->cl_cb_ident) + idr_remove(&cb_ident_idr, clp->cl_cb_ident); +} + +static void pnfs_init_server(struct nfs_server *server) +{ + rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); +} + +#else +static void nfs4_shutdown_client(struct nfs_client *clp) +{ +} + +void nfs_cleanup_cb_ident_idr(void) +{ +} + +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ +} + +static void pnfs_init_server(struct nfs_server *server) +{ +} + +#endif /* CONFIG_NFS_V4 */ + +/* + * Destroy a shared client record + */ +static void nfs_free_client(struct nfs_client *clp) +{ + dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); + + nfs4_shutdown_client(clp); + + nfs_fscache_release_client_cookie(clp); + + /* -EIO all pending I/O */ + if (!IS_ERR(clp->cl_rpcclient)) + rpc_shutdown_client(clp->cl_rpcclient); + + if (clp->cl_machine_cred != NULL) + put_rpccred(clp->cl_machine_cred); + + nfs4_deviceid_purge_client(clp); + + 
kfree(clp->cl_hostname); + kfree(clp); + + dprintk("<-- nfs_free_client()\n"); +} + +/* + * Release a reference to a shared client record + */ +void nfs_put_client(struct nfs_client *clp) +{ + if (!clp) + return; + + dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); + + if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { + list_del(&clp->cl_share_link); + nfs_cb_idr_remove_locked(clp); + spin_unlock(&nfs_client_lock); + + BUG_ON(!list_empty(&clp->cl_superblocks)); + + nfs_free_client(clp); + } +} +EXPORT_SYMBOL_GPL(nfs_put_client); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + sin1->sin6_scope_id != sin2->sin6_scope_id) + return 0; + + return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); +} +#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + return 0; +} +#endif + +/* + * Test if two ip4 socket addresses refer to the same socket, by + * comparing relevant fields. The padding bytes specifically, are + * not compared. + * + * The caller should ensure both socket addresses are AF_INET. + */ +static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + return nfs_sockaddr_match_ipaddr6(sa1, sa2) && + (sin1->sin6_port == sin2->sin6_port); +} + +static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return nfs_sockaddr_match_ipaddr4(sa1, sa2) && + (sin1->sin_port == sin2->sin_port); +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, excluding the port number. 
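+ * Addresses of different families never match; otherwise the comparison is
+ * handed off to the family-specific helpers above.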
+ */ +static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_match_ipaddr4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_match_ipaddr6(sa1, sa2); + } + return 0; +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. + */ +static int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_cmp_ip4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_cmp_ip6(sa1, sa2); + } + return 0; +} + +/* Common match routine for v4.0 and v4.1 callback services */ +bool +nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, + u32 minorversion) +{ + struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + + /* Don't match clients that failed to initialise */ + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) + return false; + + /* Match the version and minorversion */ + if (clp->rpc_ops->version != 4 || + clp->cl_minorversion != minorversion) + return false; + + /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(addr, clap)) + return false; + + return true; +} + +/* + * Find an nfs_client on the list that matches the initialisation data + * that is supplied. + */ +static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) +{ + struct nfs_client *clp; + const struct sockaddr *sap = data->addr; + + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + /* Don't match clients that failed to initialise properly */ + if (clp->cl_cons_state < 0) + continue; + + /* Different NFS versions cannot share the same nfs_client */ + if (clp->rpc_ops != data->rpc_ops) + continue; + + if (clp->cl_proto != data->proto) + continue; + /* Match nfsv4 minorversion */ + if (clp->cl_minorversion != data->minorversion) + continue; + /* Match the full socket address */ + if (!nfs_sockaddr_cmp(sap, clap)) + continue; + + atomic_inc(&clp->cl_count); + return clp; + } + return NULL; +} + +/* + * Look up a client by IP address and protocol version + * - creates a new record if one doesn't yet exist + */ +static struct nfs_client * +nfs_get_client(const struct nfs_client_initdata *cl_init, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) +{ + struct nfs_client *clp, *new = NULL; + int error; + + dprintk("--> nfs_get_client(%s,v%u)\n", + cl_init->hostname ?: "", cl_init->rpc_ops->version); + + /* see if the client already exists */ + do { + spin_lock(&nfs_client_lock); + + clp = nfs_match_client(cl_init); + if (clp) + goto found_client; + if (new) + goto install_client; + + spin_unlock(&nfs_client_lock); + + new = nfs_alloc_client(cl_init); + } while (!IS_ERR(new)); + + dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new)); + return new; + + /* install a new client and return with it unready */ +install_client: + clp = new; + list_add(&clp->cl_share_link, &nfs_client_list); + spin_unlock(&nfs_client_lock); + + error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, + authflavour, noresvport); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(error); + } + 
dprintk("--> nfs_get_client() = %p [new]\n", clp); + return clp; + + /* found an existing client + * - make sure it's ready before returning + */ +found_client: + spin_unlock(&nfs_client_lock); + + if (new) + nfs_free_client(new); + + error = wait_event_killable(nfs_client_active_wq, + clp->cl_cons_state < NFS_CS_INITING); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(-ERESTARTSYS); + } + + if (clp->cl_cons_state < NFS_CS_READY) { + error = clp->cl_cons_state; + nfs_put_client(clp); + return ERR_PTR(error); + } + + BUG_ON(clp->cl_cons_state != NFS_CS_READY); + + dprintk("--> nfs_get_client() = %p [share]\n", clp); + return clp; +} + +/* + * Mark a server as ready or failed + */ +void nfs_mark_client_ready(struct nfs_client *clp, int state) +{ + clp->cl_cons_state = state; + wake_up_all(&nfs_client_active_wq); +} + +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. + */ +int nfs4_check_client_ready(struct nfs_client *clp) +{ + if (!nfs4_has_session(clp)) + return 0; + if (clp->cl_cons_state < NFS_CS_READY) + return -EPROTONOSUPPORT; + return 0; +} + +/* + * Initialise the timeout values for a connection + */ +static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, + unsigned int timeo, unsigned int retrans) +{ + to->to_initval = timeo * HZ / 10; + to->to_retries = retrans; + + switch (proto) { + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_TCP_RETRANS; + if (to->to_initval == 0) + to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; + if (to->to_initval > NFS_MAX_TCP_TIMEOUT) + to->to_initval = NFS_MAX_TCP_TIMEOUT; + to->to_increment = to->to_initval; + to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); + if (to->to_maxval > NFS_MAX_TCP_TIMEOUT) + to->to_maxval = NFS_MAX_TCP_TIMEOUT; + if (to->to_maxval < to->to_initval) + to->to_maxval = to->to_initval; + to->to_exponential = 0; + break; + case XPRT_TRANSPORT_UDP: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_UDP_RETRANS; + if (!to->to_initval) + to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; + if (to->to_initval > NFS_MAX_UDP_TIMEOUT) + to->to_initval = NFS_MAX_UDP_TIMEOUT; + to->to_maxval = NFS_MAX_UDP_TIMEOUT; + to->to_exponential = 1; + break; + default: + BUG(); + } +} + +/* + * Create an RPC client handle + */ +static int nfs_create_rpc_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + rpc_authflavor_t flavor, + int discrtry, int noresvport) +{ + struct rpc_clnt *clnt = NULL; + struct rpc_create_args args = { + .net = &init_net, + .protocol = clp->cl_proto, + .address = (struct sockaddr *)&clp->cl_addr, + .addrsize = clp->cl_addrlen, + .timeout = timeparms, + .servername = clp->cl_hostname, + .program = &nfs_program, + .version = clp->rpc_ops->version, + .authflavor = flavor, + }; + + if (discrtry) + args.flags |= RPC_CLNT_CREATE_DISCRTRY; + if (noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + if (!IS_ERR(clp->cl_rpcclient)) + return 0; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("%s: cannot create RPC client. 
Error = %ld\n", + __func__, PTR_ERR(clnt)); + return PTR_ERR(clnt); + } + + clp->cl_rpcclient = clnt; + return 0; +} + +/* + * Version 2 or 3 client destruction + */ +static void nfs_destroy_server(struct nfs_server *server) +{ + if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || + !(server->flags & NFS_MOUNT_LOCAL_FCNTL)) + nlmclnt_done(server->nlm_host); +} + +/* + * Version 2 or 3 lockd setup + */ +static int nfs_start_lockd(struct nfs_server *server) +{ + struct nlm_host *host; + struct nfs_client *clp = server->nfs_client; + struct nlmclnt_initdata nlm_init = { + .hostname = clp->cl_hostname, + .address = (struct sockaddr *)&clp->cl_addr, + .addrlen = clp->cl_addrlen, + .nfs_version = clp->rpc_ops->version, + .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? + 1 : 0, + }; + + if (nlm_init.nfs_version > 3) + return 0; + if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && + (server->flags & NFS_MOUNT_LOCAL_FCNTL)) + return 0; + + switch (clp->cl_proto) { + default: + nlm_init.protocol = IPPROTO_TCP; + break; + case XPRT_TRANSPORT_UDP: + nlm_init.protocol = IPPROTO_UDP; + } + + host = nlmclnt_init(&nlm_init); + if (IS_ERR(host)) + return PTR_ERR(host); + + server->nlm_host = host; + server->destroy = nfs_destroy_server; + return 0; +} + +/* + * Initialise an NFSv3 ACL client connection + */ +#ifdef CONFIG_NFS_V3_ACL +static void nfs_init_server_aclclient(struct nfs_server *server) +{ + if (server->nfs_client->rpc_ops->version != 3) + goto out_noacl; + if (server->flags & NFS_MOUNT_NOACL) + goto out_noacl; + + server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); + if (IS_ERR(server->client_acl)) + goto out_noacl; + + /* No errors! Assume that Sun nfsacls are supported */ + server->caps |= NFS_CAP_ACLS; + return; + +out_noacl: + server->caps &= ~NFS_CAP_ACLS; +} +#else +static inline void nfs_init_server_aclclient(struct nfs_server *server) +{ + server->flags &= ~NFS_MOUNT_NOACL; + server->caps &= ~NFS_CAP_ACLS; +} +#endif + +/* + * Create a general RPC client + */ +static int nfs_init_server_rpcclient(struct nfs_server *server, + const struct rpc_timeout *timeo, + rpc_authflavor_t pseudoflavour) +{ + struct nfs_client *clp = server->nfs_client; + + server->client = rpc_clone_client(clp->cl_rpcclient); + if (IS_ERR(server->client)) { + dprintk("%s: couldn't create rpc_client!\n", __func__); + return PTR_ERR(server->client); + } + + memcpy(&server->client->cl_timeout_default, + timeo, + sizeof(server->client->cl_timeout_default)); + server->client->cl_timeout = &server->client->cl_timeout_default; + + if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { + struct rpc_auth *auth; + + auth = rpcauth_create(pseudoflavour, server->client); + if (IS_ERR(auth)) { + dprintk("%s: couldn't create credcache!\n", __func__); + return PTR_ERR(auth); + } + } + server->client->cl_softrtry = 0; + if (server->flags & NFS_MOUNT_SOFT) + server->client->cl_softrtry = 1; + + return 0; +} + +/* + * Initialise an NFS2 or NFS3 client + */ +int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport) +{ + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is already initialised */ + dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); + return 0; + } + + /* + * Create a client RPC handle for doing FSSTAT with UNIX auth only + * - RFC 2623, sec 2.3.2 + */ + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, + 0, noresvport); + if (error < 0) + goto error; + 
nfs_mark_client_ready(clp, NFS_CS_READY); + return 0; + +error: + nfs_mark_client_ready(clp, error); + dprintk("<-- nfs_init_client() = xerror %d\n", error); + return error; +} + +/* + * Create a version 2 or 3 client + */ +static int nfs_init_server(struct nfs_server *server, + const struct nfs_parsed_mount_data *data) +{ + struct nfs_client_initdata cl_init = { + .hostname = data->nfs_server.hostname, + .addr = (const struct sockaddr *)&data->nfs_server.address, + .addrlen = data->nfs_server.addrlen, + .rpc_ops = &nfs_v2_clientops, + .proto = data->nfs_server.protocol, + }; + struct rpc_timeout timeparms; + struct nfs_client *clp; + int error; + + dprintk("--> nfs_init_server()\n"); + +#ifdef CONFIG_NFS_V3 + if (data->version == 3) + cl_init.rpc_ops = &nfs_v3_clientops; +#endif + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX, + data->flags & NFS_MOUNT_NORESVPORT); + if (IS_ERR(clp)) { + dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); + return PTR_ERR(clp); + } + + server->nfs_client = clp; + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->options = data->options; + server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + /* Start lockd here, before we might error out */ + error = nfs_start_lockd(server); + if (error < 0) + goto error; + + server->port = data->nfs_server.port; + + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + if (error < 0) + goto error; + + /* Preserve the values of mount_server-related mount options */ + if (data->mount_server.addrlen) { + memcpy(&server->mountd_address, &data->mount_server.address, + data->mount_server.addrlen); + server->mountd_addrlen = data->mount_server.addrlen; + } + server->mountd_version = data->mount_server.version; + server->mountd_port = data->mount_server.port; + server->mountd_protocol = data->mount_server.protocol; + + server->namelen = data->namlen; + /* Create a client RPC handle for the NFSv3 ACL management interface */ + nfs_init_server_aclclient(server); + dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); + return 0; + +error: + server->nfs_client = NULL; + nfs_put_client(clp); + dprintk("<-- nfs_init_server() = xerror %d\n", error); + return error; +} + +/* + * Load up the server record from information gained in an fsinfo record + */ +static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) +{ + unsigned long max_rpc_payload; + + /* Work out a lot of parameters */ + if (server->rsize == 0) + server->rsize = nfs_block_size(fsinfo->rtpref, NULL); + if (server->wsize == 0) + server->wsize = nfs_block_size(fsinfo->wtpref, NULL); + + if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) + server->rsize = nfs_block_size(fsinfo->rtmax, NULL); + if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) + server->wsize = nfs_block_size(fsinfo->wtmax, NULL); + + max_rpc_payload = 
nfs_block_size(rpc_max_payload(server->client), NULL); + if (server->rsize > max_rpc_payload) + server->rsize = max_rpc_payload; + if (server->rsize > NFS_MAX_FILE_IO_SIZE) + server->rsize = NFS_MAX_FILE_IO_SIZE; + server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + server->backing_dev_info.name = "nfs"; + server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + + if (server->wsize > max_rpc_payload) + server->wsize = max_rpc_payload; + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + set_pnfs_layoutdriver(server, fsinfo->layouttype); + + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); + if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES) + server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > server->rsize) + server->dtsize = server->rsize; + + if (server->flags & NFS_MOUNT_NOAC) { + server->acregmin = server->acregmax = 0; + server->acdirmin = server->acdirmax = 0; + } + + server->maxfilesize = fsinfo->maxfilesize; + + server->time_delta = fsinfo->time_delta; + + /* We're airborne Set socket buffersize */ + rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); +} + +/* + * Probe filesystem information, including the FSID on v2/v3 + */ +static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) +{ + struct nfs_fsinfo fsinfo; + struct nfs_client *clp = server->nfs_client; + int error; + + dprintk("--> nfs_probe_fsinfo()\n"); + + if (clp->rpc_ops->set_capabilities != NULL) { + error = clp->rpc_ops->set_capabilities(server, mntfh); + if (error < 0) + goto out_error; + } + + fsinfo.fattr = fattr; + fsinfo.layouttype = 0; + error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); + if (error < 0) + goto out_error; + + nfs_server_set_fsinfo(server, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { + struct nfs_pathconf pathinfo; + + pathinfo.fattr = fattr; + nfs_fattr_init(fattr); + + if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) + server->namelen = pathinfo.max_namelen; + } + + dprintk("<-- nfs_probe_fsinfo() = 0\n"); + return 0; + +out_error: + dprintk("nfs_probe_fsinfo: error = %d\n", -error); + return error; +} + +/* + * Copy useful information when duplicating a server record + */ +static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) +{ + target->flags = source->flags; + target->rsize = source->rsize; + target->wsize = source->wsize; + target->acregmin = source->acregmin; + target->acregmax = source->acregmax; + target->acdirmin = source->acdirmin; + target->acdirmax = source->acdirmax; + target->caps = source->caps; + target->options = source->options; +} + +static void nfs_server_insert_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&nfs_client_lock); + list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + spin_unlock(&nfs_client_lock); + +} + +static void nfs_server_remove_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&nfs_client_lock); + list_del_rcu(&server->client_link); + if (clp && list_empty(&clp->cl_superblocks)) + set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + 
list_del(&server->master_link); + spin_unlock(&nfs_client_lock); + + synchronize_rcu(); +} + +/* + * Allocate and initialise a server record + */ +static struct nfs_server *nfs_alloc_server(void) +{ + struct nfs_server *server; + + server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (!server) + return NULL; + + server->client = server->client_acl = ERR_PTR(-EINVAL); + + /* Zero out the NFS state stuff */ + INIT_LIST_HEAD(&server->client_link); + INIT_LIST_HEAD(&server->master_link); + INIT_LIST_HEAD(&server->delegations); + + atomic_set(&server->active, 0); + + server->io_stats = nfs_alloc_iostats(); + if (!server->io_stats) { + kfree(server); + return NULL; + } + + if (bdi_init(&server->backing_dev_info)) { + nfs_free_iostats(server->io_stats); + kfree(server); + return NULL; + } + + pnfs_init_server(server); + + return server; +} + +/* + * Free up a server record + */ +void nfs_free_server(struct nfs_server *server) +{ + dprintk("--> nfs_free_server()\n"); + + nfs_server_remove_lists(server); + unset_pnfs_layoutdriver(server); + + if (server->destroy != NULL) + server->destroy(server); + + if (!IS_ERR(server->client_acl)) + rpc_shutdown_client(server->client_acl); + if (!IS_ERR(server->client)) + rpc_shutdown_client(server->client); + + nfs_put_client(server->nfs_client); + + nfs_free_iostats(server->io_stats); + bdi_destroy(&server->backing_dev_info); + kfree(server); + nfs_release_automount_timer(); + dprintk("<-- nfs_free_server()\n"); +} + +/* + * Create a version 2 or 3 volume record + * - keyed on server and FSID + */ +struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh) +{ + struct nfs_server *server; + struct nfs_fattr *fattr; + int error; + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto error; + + /* Get a client representation */ + error = nfs_init_server(server, data); + if (error < 0) + goto error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID */ + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + if (server->nfs_client->rpc_ops->version == 3) { + if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) + server->namelen = NFS3_MAXNAMLEN; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + } else { + if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) + server->namelen = NFS2_MAXNAMLEN; + } + + if (!(fattr->valid & NFS_ATTR_FATTR)) { + error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_create_server: getattr error = %d\n", -error); + goto error; + } + } + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + nfs_free_fattr(fattr); + return server; + +error: + nfs_free_fattr(fattr); + nfs_free_server(server); + return ERR_PTR(error); +} + +#ifdef CONFIG_NFS_V4 +/* + * NFSv4.0 callback thread helper + * + * Find a client by IP address, protocol version, and minorversion + * + * Called from the pg_authenticate method. The callback identifier + * is not used as it has not been decoded. 
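+ * Instead, the nfs_client is matched on the callback's source address alone
+ * (minor version 0 clients only).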
+ * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_no_ident(const struct sockaddr *addr) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 0) == false) + continue; + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(int cb_ident) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + clp = idr_find(&cb_ident_idr, cb_ident); + if (clp) + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 1) == false) + continue; + + if (!nfs4_has_session(clp)) + continue; + + /* Match sessionid*/ + if (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid) +{ + return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Initialize the NFS4 callback service + */ +static int nfs4_init_callback(struct nfs_client *clp) +{ + int error; + + if (clp->rpc_ops->version == 4) { + if (nfs4_has_session(clp)) { + error = xprt_setup_backchannel( + clp->cl_rpcclient->cl_xprt, + NFS41_BC_MIN_CALLBACKS); + if (error < 0) + return error; + } + + error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", + __func__, error); + return error; + } + __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); + } + return 0; +} + +/* + * Initialize the minor version specific parts of an NFS4 client record + */ +static int nfs4_init_client_minor_version(struct nfs_client *clp) +{ +#if defined(CONFIG_NFS_V4_1) + if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. + * When a SEQUENCE operation encounters the expired session + * it will do session recovery to initialize it. + */ + session = nfs4_alloc_session(clp); + if (!session) + return -ENOMEM; + + clp->cl_session = session; + /* + * The create session reply races with the server back + * channel probe. 
Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + clp->cl_cons_state = NFS_CS_SESSION_INITING; + } +#endif /* CONFIG_NFS_V4_1 */ + + return nfs4_init_callback(clp); +} + +/* + * Initialise an NFS4 client record + */ +int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) +{ + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is initialised already */ + dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); + return 0; + } + + /* Check NFS protocol revision and initialize RPC op vector */ + clp->rpc_ops = &nfs_v4_clientops; + + error = nfs_create_rpc_client(clp, timeparms, authflavour, + 1, noresvport); + if (error < 0) + goto error; + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + error = nfs_idmap_new(clp); + if (error < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __func__, error); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); + + error = nfs4_init_client_minor_version(clp); + if (error < 0) + goto error; + + if (!nfs4_has_session(clp)) + nfs_mark_client_ready(clp, NFS_CS_READY); + return 0; + +error: + nfs_mark_client_ready(clp, error); + dprintk("<-- nfs4_init_client() = xerror %d\n", error); + return error; +} + +/* + * Set up an NFS4 client + */ +static int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, + const char *ip_addr, + rpc_authflavor_t authflavour, + int proto, const struct rpc_timeout *timeparms, + u32 minorversion) +{ + struct nfs_client_initdata cl_init = { + .hostname = hostname, + .addr = addr, + .addrlen = addrlen, + .rpc_ops = &nfs_v4_clientops, + .proto = proto, + .minorversion = minorversion, + }; + struct nfs_client *clp; + int error; + + dprintk("--> nfs4_set_client()\n"); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour, + server->flags & NFS_MOUNT_NORESVPORT); + if (IS_ERR(clp)) { + error = PTR_ERR(clp); + goto error; + } + + /* + * Query for the lease time on clientid setup or renewal + * + * Note that this will be set on nfs_clients that were created + * only for the DS role and did not set this bit, but now will + * serve a dual role. + */ + set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); + + server->nfs_client = clp; + dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); + return 0; +error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; +} + +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .rpc_ops = &nfs_v4_clientops, + .proto = ds_proto, + .minorversion = mds_clp->cl_minorversion, + }; + struct rpc_timeout ds_timeout = { + .to_initval = 15 * HZ, + .to_maxval = 15 * HZ, + .to_retries = 1, + .to_exponential = 1, + }; + struct nfs_client *clp; + + /* + * Set an authflavor equual to the MDS value. 
Use the MDS nfs_client + * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS + * (section 13.1 RFC 5661). + */ + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + mds_clp->cl_rpcclient->cl_auth->au_flavor, 0); + + dprintk("<-- %s %p\n", __func__, clp); + return clp; +} +EXPORT_SYMBOL(nfs4_set_ds_client); + +/* + * Session has been established, and the client marked ready. + * Set the mount rsize and wsize with negotiated fore channel + * attributes which will be bound checked in nfs_server_set_fsinfo. + */ +static void nfs4_session_set_rwsize(struct nfs_server *server) +{ +#ifdef CONFIG_NFS_V4_1 + struct nfs4_session *sess; + u32 server_resp_sz; + u32 server_rqst_sz; + + if (!nfs4_has_session(server->nfs_client)) + return; + sess = server->nfs_client->cl_session; + server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; + server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; + + if (server->rsize > server_resp_sz) + server->rsize = server_resp_sz; + if (server->wsize > server_rqst_sz) + server->wsize = server_rqst_sz; +#endif /* CONFIG_NFS_V4_1 */ +} + +static int nfs4_server_common_setup(struct nfs_server *server, + struct nfs_fh *mntfh) +{ + struct nfs_fattr *fattr; + int error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* data servers support only a subset of NFSv4.1 */ + if (is_ds_only_client(server->nfs_client)) + return -EPROTONOSUPPORT; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* We must ensure the session is initialised first */ + error = nfs4_init_session(server); + if (error < 0) + goto out; + + /* Probe the root fh to retrieve its FSID and filehandle */ + error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto out; + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + dprintk("Mount FH: %d\n", mntfh->size); + + nfs4_session_set_rwsize(server); + + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto out; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; +out: + nfs_free_fattr(fattr); + return error; +} + +/* + * Create a version 4 volume record + */ +static int nfs4_init_server(struct nfs_server *server, + const struct nfs_parsed_mount_data *data) +{ + struct rpc_timeout timeparms; + int error; + + dprintk("--> nfs4_init_server()\n"); + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + server->options = data->options; + + /* Get a client record */ + error = nfs4_set_client(server, + data->nfs_server.hostname, + (const struct sockaddr *)&data->nfs_server.address, + data->nfs_server.addrlen, + data->client_address, + data->auth_flavors[0], + data->nfs_server.protocol, + &timeparms, + data->minorversion); + if (error < 0) + goto error; + + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. 
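+ * This behaviour is gated by the nfs4_disable_idmapping module parameter.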
+ */ + if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + server->port = data->nfs_server.port; + + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + +error: + /* Done */ + dprintk("<-- nfs4_init_server() = %d\n", error); + return error; +} + +/* + * Create a version 4 volume record + * - keyed on server and FSID + */ +struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh) +{ + struct nfs_server *server; + int error; + + dprintk("--> nfs4_create_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + /* set up the general RPC client */ + error = nfs4_init_server(server, data); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mntfh); + if (error < 0) + goto error; + + dprintk("<-- nfs4_create_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +} + +/* + * Create an NFS4 referral server record + */ +struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, + struct nfs_fh *mntfh) +{ + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + + /* Initialise the client representation from the parent server */ + nfs_server_copy_userdata(server, parent_server); + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; + + /* Get a client representation. 
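+ * The parent server's transport protocol and timeout values are reused.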
+ * Note: NFSv4 always uses TCP, */ + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + + error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mntfh); + if (error < 0) + goto error; + + dprintk("<-- nfs_create_referral_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +} + +#endif /* CONFIG_NFS_V4 */ + +/* + * Clone an NFS2, NFS3 or NFS4 server record + */ +struct nfs_server *nfs_clone_server(struct nfs_server *source, + struct nfs_fh *fh, + struct nfs_fattr *fattr) +{ + struct nfs_server *server; + struct nfs_fattr *fattr_fsinfo; + int error; + + dprintk("--> nfs_clone_server(,%llx:%llx,)\n", + (unsigned long long) fattr->fsid.major, + (unsigned long long) fattr->fsid.minor); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + fattr_fsinfo = nfs_alloc_fattr(); + if (fattr_fsinfo == NULL) + goto out_free_server; + + /* Copy data from the source */ + server->nfs_client = source->nfs_client; + atomic_inc(&server->nfs_client->cl_count); + nfs_server_copy_userdata(server, source); + + server->fsid = fattr->fsid; + + error = nfs_init_server_rpcclient(server, + source->client->cl_timeout, + source->client->cl_auth->au_flavor); + if (error < 0) + goto out_free_server; + if (!IS_ERR(source->client_acl)) + nfs_init_server_aclclient(server); + + /* probe the filesystem info for this server filesystem */ + error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); + if (error < 0) + goto out_free_server; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + dprintk("Cloned FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + error = nfs_start_lockd(server); + if (error < 0) + goto out_free_server; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + + nfs_free_fattr(fattr_fsinfo); + dprintk("<-- nfs_clone_server() = %p\n", server); + return server; + +out_free_server: + nfs_free_fattr(fattr_fsinfo); + nfs_free_server(server); + dprintk("<-- nfs_clone_server() = error %d\n", error); + return ERR_PTR(error); +} + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_fs_nfs; + +static int nfs_server_list_open(struct inode *inode, struct file *file); +static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_server_list_stop(struct seq_file *p, void *v); +static int nfs_server_list_show(struct seq_file *m, void *v); + +static const struct seq_operations nfs_server_list_ops = { + .start = nfs_server_list_start, + .next = nfs_server_list_next, + .stop = nfs_server_list_stop, + .show = nfs_server_list_show, +}; + +static const struct file_operations nfs_server_list_fops = { + .open = nfs_server_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int nfs_volume_list_open(struct inode *inode, struct file *file); +static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); +static void 
*nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_volume_list_stop(struct seq_file *p, void *v); +static int nfs_volume_list_show(struct seq_file *m, void *v); + +static const struct seq_operations nfs_volume_list_ops = { + .start = nfs_volume_list_start, + .next = nfs_volume_list_next, + .stop = nfs_volume_list_stop, + .show = nfs_volume_list_show, +}; + +static const struct file_operations nfs_volume_list_fops = { + .open = nfs_volume_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +/* + * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which + * we're dealing + */ +static int nfs_server_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &nfs_server_list_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the server list and return the first item + */ +static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) +{ + /* lock the list against modification */ + spin_lock(&nfs_client_lock); + return seq_list_start_head(&nfs_client_list, *_pos); +} + +/* + * move to next server + */ +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + return seq_list_next(v, &nfs_client_list, pos); +} + +/* + * clean up after reading from the transports list + */ +static void nfs_server_list_stop(struct seq_file *p, void *v) +{ + spin_unlock(&nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_server_list_show(struct seq_file *m, void *v) +{ + struct nfs_client *clp; + + /* display header on line 1 */ + if (v == &nfs_client_list) { + seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); + return 0; + } + + /* display one transport per line on subsequent lines */ + clp = list_entry(v, struct nfs_client, cl_share_link); + + seq_printf(m, "v%u %s %s %3d %s\n", + clp->rpc_ops->version, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), + atomic_read(&clp->cl_count), + clp->cl_hostname); + + return 0; +} + +/* + * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes + */ +static int nfs_volume_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &nfs_volume_list_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the volume list and return the first item + */ +static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) +{ + /* lock the list against modification */ + spin_lock(&nfs_client_lock); + return seq_list_start_head(&nfs_volume_list, *_pos); +} + +/* + * move to next volume + */ +static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + return seq_list_next(v, &nfs_volume_list, pos); +} + +/* + * clean up after reading from the transports list + */ +static void nfs_volume_list_stop(struct seq_file *p, void *v) +{ + spin_unlock(&nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_volume_list_show(struct seq_file *m, void *v) +{ + struct nfs_server *server; + struct nfs_client *clp; + char dev[8], fsid[17]; + + /* display header on line 1 */ + if (v == &nfs_volume_list) { + seq_puts(m, "NV SERVER 
PORT DEV FSID FSC\n"); + return 0; + } + /* display one transport per line on subsequent lines */ + server = list_entry(v, struct nfs_server, master_link); + clp = server->nfs_client; + + snprintf(dev, 8, "%u:%u", + MAJOR(server->s_dev), MINOR(server->s_dev)); + + snprintf(fsid, 17, "%llx:%llx", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + seq_printf(m, "v%u %s %s %-7s %-17s %s\n", + clp->rpc_ops->version, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), + dev, + fsid, + nfs_server_fscache_state(server)); + + return 0; +} + +/* + * initialise the /proc/fs/nfsfs/ directory + */ +int __init nfs_fs_proc_init(void) +{ + struct proc_dir_entry *p; + + proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); + if (!proc_fs_nfs) + goto error_0; + + /* a file of servers with which we're dealing */ + p = proc_create("servers", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_server_list_fops); + if (!p) + goto error_1; + + /* a file of volumes that we have mounted */ + p = proc_create("volumes", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_volume_list_fops); + if (!p) + goto error_2; + return 0; + +error_2: + remove_proc_entry("servers", proc_fs_nfs); +error_1: + remove_proc_entry("fs/nfsfs", NULL); +error_0: + return -ENOMEM; +} + +/* + * clean up the /proc/fs/nfsfs/ directory + */ +void nfs_fs_proc_exit(void) +{ + remove_proc_entry("volumes", proc_fs_nfs); + remove_proc_entry("servers", proc_fs_nfs); + remove_proc_entry("fs/nfsfs", NULL); +} + +#endif /* CONFIG_PROC_FS */ + +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off NFSv4 idmapping when using 'sec=sys'"); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c new file mode 100644 index 00000000..ecabbd8f --- /dev/null +++ b/fs/nfs/delegation.c @@ -0,0 +1,716 @@ +/* + * linux/fs/nfs/delegation.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFS file delegation management + * + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" + +static void nfs_free_delegation(struct nfs_delegation *delegation) +{ + if (delegation->cred) { + put_rpccred(delegation->cred); + delegation->cred = NULL; + } + kfree_rcu(delegation, rcu); +} + +/** + * nfs_mark_delegation_referenced - set delegation's REFERENCED flag + * @delegation: delegation to process + * + */ +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); +} + +/** + * nfs_have_delegation - check if inode has a delegation + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. 
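+ * A successful check also marks the delegation as recently referenced.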
+ */ +int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + struct nfs_delegation *delegation; + int ret = 0; + + flags &= FMODE_READ|FMODE_WRITE; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && (delegation->type & flags) == flags) { + nfs_mark_delegation_referenced(delegation); + ret = 1; + } + rcu_read_unlock(); + return ret; +} + +static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct file_lock *fl; + int status = 0; + + if (inode->i_flock == NULL) + goto out; + + /* Protect inode->i_flock using the file locks lock */ + lock_flocks(); + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) + continue; + if (nfs_file_open_context(fl->fl_file) != ctx) + continue; + unlock_flocks(); + status = nfs4_lock_delegation_recall(state, fl); + if (status < 0) + goto out; + lock_flocks(); + } + unlock_flocks(); +out: + return status; +} + +static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + int err; + +again: + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); + err = nfs4_open_delegation_recall(ctx, state, stateid); + if (err >= 0) + err = nfs_delegation_claim_locks(ctx, state); + put_nfs_open_context(ctx); + if (err != 0) + return err; + goto again; + } + spin_unlock(&inode->i_lock); + return 0; +} + +/** + * nfs_inode_reclaim_delegation - process a delegation reclaim request + * @inode: inode to process + * @cred: credential to use for request + * @res: new delegation state from server + * + */ +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + struct nfs_openres *res) +{ + struct nfs_delegation *delegation; + struct rpc_cred *oldcred = NULL; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { + memcpy(delegation->stateid.data, res->delegation.data, + sizeof(delegation->stateid.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; + delegation->cred = get_rpccred(cred); + clear_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags); + NFS_I(inode)->delegation_state = delegation->type; + spin_unlock(&delegation->lock); + put_rpccred(oldcred); + rcu_read_unlock(); + } else { + /* We appear to have raced with a delegation return. 
+ */ + spin_unlock(&delegation->lock); + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); + } else { + rcu_read_unlock(); + } +} + +static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +{ + int res = 0; + + res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync); + nfs_free_delegation(delegation); + return res; +} + +static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation) +{ + struct inode *inode = NULL; + + spin_lock(&delegation->lock); + if (delegation->inode != NULL) + inode = igrab(delegation->inode); + spin_unlock(&delegation->lock); + return inode; +} + +static struct nfs_delegation * +nfs_detach_delegation_locked(struct nfs_inode *nfsi, + struct nfs_server *server) +{ + struct nfs_delegation *delegation = + rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&server->nfs_client->cl_lock)); + + if (delegation == NULL) + goto nomatch; + + spin_lock(&delegation->lock); + list_del_rcu(&delegation->super_list); + delegation->inode = NULL; + nfsi->delegation_state = 0; + rcu_assign_pointer(nfsi->delegation, NULL); + spin_unlock(&delegation->lock); + return delegation; +nomatch: + return NULL; +} + +static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, + struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(nfsi, server); + spin_unlock(&clp->cl_lock); + return delegation; +} + +/** + * nfs_inode_set_delegation - set up a delegation on an inode + * @inode: inode to which delegation applies + * @cred: cred to use for subsequent delegation processing + * @res: new delegation state from server + * + * Returns zero on success, or a negative errno value. + */ +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation, *old_delegation; + struct nfs_delegation *freeme = NULL; + int status = 0; + + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; + memcpy(delegation->stateid.data, res->delegation.data, + sizeof(delegation->stateid.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; + delegation->cred = get_rpccred(cred); + delegation->inode = inode; + delegation->flags = 1<<NFS_DELEGATION_REFERENCED; + spin_lock_init(&delegation->lock); + + spin_lock(&clp->cl_lock); + old_delegation = rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&clp->cl_lock)); + if (old_delegation != NULL) { + if (memcmp(&delegation->stateid, &old_delegation->stateid, + sizeof(old_delegation->stateid)) == 0 && + delegation->type == old_delegation->type) { + goto out; + } + /* + * Deal with broken servers that hand out two + * delegations for the same file.
+ */ + dfprintk(FILE, "%s: server %s handed out " + "a duplicate delegation!\n", + __func__, clp->cl_hostname); + if (delegation->type <= old_delegation->type) { + freeme = delegation; + delegation = NULL; + goto out; + } + freeme = nfs_detach_delegation_locked(nfsi, server); + } + list_add_rcu(&delegation->super_list, &server->delegations); + nfsi->delegation_state = delegation->type; + rcu_assign_pointer(nfsi->delegation, delegation); + delegation = NULL; + + /* Ensure we revalidate the attributes and page cache! */ + spin_lock(&inode->i_lock); + nfsi->cache_validity |= NFS_INO_REVAL_FORCED; + spin_unlock(&inode->i_lock); + +out: + spin_unlock(&clp->cl_lock); + if (delegation != NULL) + nfs_free_delegation(delegation); + if (freeme != NULL) + nfs_do_return_delegation(inode, freeme, 0); + return status; +} + +/* + * Basic procedure for returning a delegation to the server + */ +static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int err; + + /* + * Guard against new delegated open/lock/unlock calls and against + * state recovery + */ + down_write(&nfsi->rwsem); + err = nfs_delegation_claim_opens(inode, &delegation->stateid); + up_write(&nfsi->rwsem); + if (err) + goto out; + + err = nfs_do_return_delegation(inode, delegation, issync); +out: + return err; +} + +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Returns zero on success, or a negative errno value. + */ +int nfs_client_return_marked_delegations(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + int err = 0; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (!test_and_clear_bit(NFS_DELEGATION_RETURN, + &delegation->flags)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) { + filemap_flush(inode->i_mapping); + err = __nfs_inode_return_delegation(inode, + delegation, 0); + } + iput(inode); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; + } + } + rcu_read_unlock(); + return 0; +} + +/** + * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * @inode: inode to process + * + * Does not protect against delegation reclaims, therefore really only safe + * to be called from nfs4_clear_inode(). + */ +void nfs_inode_return_delegation_noreclaim(struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + + if (rcu_access_pointer(nfsi->delegation) != NULL) { + delegation = nfs_detach_delegation(nfsi, server); + if (delegation != NULL) + nfs_do_return_delegation(inode, delegation, 0); + } +} + +/** + * nfs_inode_return_delegation - synchronously return a delegation + * @inode: inode to process + * + * Returns zero on success, or a negative errno value. 
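/*
 * Illustrative only, not part of this patch: a hypothetical caller showing the
 * intended use of the helper below.  Before an operation whose result must come
 * from the server rather than from locally delegated state, the client is
 * expected to hand the delegation back first.
 */
static int demo_return_delegation_before_server_op(struct inode *inode)
{
	int err;

	/* flushes dirty pages and sends DELEGRETURN synchronously */
	err = nfs_inode_return_delegation(inode);
	if (err)
		return err;

	/* ... proceed with the RPC that conflicts with holding a delegation ... */
	return 0;
}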
+ */ +int nfs_inode_return_delegation(struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + int err = 0; + + if (rcu_access_pointer(nfsi->delegation) != NULL) { + delegation = nfs_detach_delegation(nfsi, server); + if (delegation != NULL) { + nfs_wb_all(inode); + err = __nfs_inode_return_delegation(inode, delegation, 1); + } + } + return err; +} + +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + +/** + * nfs_super_return_all_delegations - return delegations for one superblock + * @sb: sb to process + * + */ +void nfs_super_return_all_delegations(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + if (clp == NULL) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + spin_lock(&delegation->lock); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); + } + rcu_read_unlock(); + + if (nfs_client_return_marked_delegations(clp) != 0) + nfs4_schedule_state_manager(clp); +} + +static void nfs_mark_return_all_delegation_types(struct nfs_server *server, + fmode_t flags) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) + continue; + if (delegation->type & flags) + nfs_mark_return_delegation(server, delegation); + } +} + +static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, + fmode_t flags) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_all_delegation_types(server, flags); + rcu_read_unlock(); +} + +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + +static void nfs_delegation_run_state_manager(struct nfs_client *clp) +{ + if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) + nfs4_schedule_state_manager(clp); +} + +void nfs_remove_bad_delegation(struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); + if (delegation) { + nfs_inode_find_state_and_recover(inode, &delegation->stateid); + nfs_free_delegation(delegation); + } +} + +/** + * nfs_expire_all_delegation_types + * @clp: client to process + * @flags: delegation types to expire + * + */ +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) +{ + nfs_client_mark_return_all_delegation_types(clp, flags); + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + +/** + * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * + */ +void nfs_handle_cb_pathdown(struct nfs_client *clp) +{ + if (clp == NULL) + return; + nfs_client_mark_return_all_delegations(clp); +} + +static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + 
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) + continue; + nfs_mark_return_delegation(server, delegation); + } +} + +/** + * nfs_expire_unreferenced_delegations - Eliminate unused delegations + * @clp: nfs_client to process + * + */ +void nfs_expire_unreferenced_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unreferenced_delegations(server); + rcu_read_unlock(); + + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_async_inode_return_delegation - asynchronously return a delegation + * @inode: inode to process + * @stateid: state ID information from CB_RECALL arguments + * + * Returns zero on success, or a negative errno value. + */ +int nfs_async_inode_return_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + + if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } + nfs_mark_return_delegation(server, delegation); + rcu_read_unlock(); + + nfs_delegation_run_state_manager(clp); + return 0; +} + +static struct inode * +nfs_delegation_find_inode_server(struct nfs_server *server, + const struct nfs_fh *fhandle) +{ + struct nfs_delegation *delegation; + struct inode *res = NULL; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL && + nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { + res = igrab(delegation->inode); + } + spin_unlock(&delegation->lock); + if (res != NULL) + break; + } + return res; +} + +/** + * nfs_delegation_find_inode - retrieve the inode associated with a delegation + * @clp: client state handle + * @fhandle: filehandle from a delegation recall + * + * Returns pointer to inode matching "fhandle," or NULL if a matching inode + * cannot be found. 
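/*
 * Illustrative sketch, not part of this patch: roughly how a CB_RECALL handler
 * is expected to combine nfs_delegation_find_inode() (below) with
 * nfs_async_inode_return_delegation().  The function name, argument list and
 * error mapping here are hypothetical and heavily simplified.
 */
static __be32 demo_handle_cb_recall(struct nfs_client *clp,
				    const struct nfs_fh *fh,
				    const nfs4_stateid *stateid)
{
	struct inode *inode;
	__be32 res;

	inode = nfs_delegation_find_inode(clp, fh);	/* holds a reference via igrab() */
	if (inode == NULL)
		return htonl(NFS4ERR_BADHANDLE);

	if (nfs_async_inode_return_delegation(inode, stateid) == 0)
		res = 0;				/* NFS4_OK: return has been queued */
	else
		res = htonl(NFS4ERR_BAD_STATEID);	/* stateid did not match our delegation */

	iput(inode);					/* drop the reference taken above */
	return res;
}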
+ */ +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, + const struct nfs_fh *fhandle) +{ + struct nfs_server *server; + struct inode *res = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + res = nfs_delegation_find_inode_server(server, fhandle); + if (res != NULL) + break; + } + rcu_read_unlock(); + return res; +} + +static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +} + +/** + * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed + * @clp: nfs_client to process + * + */ +void nfs_delegation_mark_reclaim(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_reclaim_server(server); + rcu_read_unlock(); +} + +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * + */ +void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) + nfs_free_delegation(delegation); + iput(inode); + goto restart; + } + } + rcu_read_unlock(); +} + +/** + * nfs_delegations_present - check for existence of delegations + * @clp: client state handle + * + * Returns one if there are any nfs_delegation structures attached + * to this nfs_client. + */ +int nfs_delegations_present(struct nfs_client *clp) +{ + struct nfs_server *server; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (!list_empty(&server->delegations)) { + ret = 1; + break; + } + rcu_read_unlock(); + return ret; +} + +/** + * nfs4_copy_delegation_stateid - Copy inode's state ID information + * @dst: stateid data structure to fill in + * @inode: inode to check + * + * Returns one and fills in "dst->data" * if inode had a delegation, + * otherwise zero is returned. 
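/*
 * Illustrative only, not part of this patch: a hypothetical caller of the
 * helper below.  I/O paths prefer the delegation stateid when one is held and
 * fall back to a stateid they already own (e.g. the open stateid) otherwise.
 */
static void demo_select_stateid(nfs4_stateid *dst, struct inode *inode,
				const nfs4_stateid *open_stateid)
{
	if (nfs4_copy_delegation_stateid(dst, inode))
		return;					/* delegation stateid copied into dst */
	memcpy(dst->data, open_stateid->data, sizeof(dst->data));
}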
+ */ +int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + int ret = 0; + + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { + memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); + ret = 1; + } + rcu_read_unlock(); + return ret; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h new file mode 100644 index 00000000..691a7960 --- /dev/null +++ b/fs/nfs/delegation.h @@ -0,0 +1,80 @@ +/* + * linux/fs/nfs/delegation.h + * + * Copyright (c) Trond Myklebust + * + * Definitions pertaining to NFS delegated files + */ +#ifndef FS_NFS_DELEGATION_H +#define FS_NFS_DELEGATION_H + +#if defined(CONFIG_NFS_V4) +/* + * NFSv4 delegation + */ +struct nfs_delegation { + struct list_head super_list; + struct rpc_cred *cred; + struct inode *inode; + nfs4_stateid stateid; + fmode_t type; + loff_t maxsize; + __u64 change_attr; + unsigned long flags; + spinlock_t lock; + struct rcu_head rcu; +}; + +enum { + NFS_DELEGATION_NEED_RECLAIM = 0, + NFS_DELEGATION_RETURN, + NFS_DELEGATION_REFERENCED, +}; + +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +int nfs_inode_return_delegation(struct inode *inode); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); +void nfs_inode_return_delegation_noreclaim(struct inode *inode); + +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +void nfs_super_return_all_delegations(struct super_block *sb); +void nfs_expire_all_delegations(struct nfs_client *clp); +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); +void nfs_expire_unreferenced_delegations(struct nfs_client *clp); +void nfs_handle_cb_pathdown(struct nfs_client *clp); +int nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_delegations_present(struct nfs_client *clp); +void nfs_remove_bad_delegation(struct inode *inode); + +void nfs_delegation_mark_reclaim(struct nfs_client *clp); +void nfs_delegation_reap_unclaimed(struct nfs_client *clp); + +/* NFSv4 delegation-related procedures */ +int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); +int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); +int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); + +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); +int nfs_have_delegation(struct inode *inode, fmode_t flags); + +#else +static inline int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + return 0; +} + +static inline int nfs_inode_return_delegation(struct inode *inode) +{ + return 0; +} +#endif + +static inline int nfs_have_delegated_attributes(struct inode *inode) +{ + return nfs_have_delegation(inode, FMODE_READ) && + !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); +} + +#endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c new file mode 100644 index 00000000..462a0060 --- /dev/null +++ b/fs/nfs/dir.c @@ -0,0 +1,2350 @@ +/* + * linux/fs/nfs/dir.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs directory handling functions + * + * 10 Apr 1996 Added silly rename for unlink 
--okir + * 28 Sep 1996 Improved directory cache --okir + * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de + * Re-implemented silly rename for unlink, newly implemented + * silly rename for nfs_rename() following the suggestions + * of Olaf Kirch (okir) found in this file. + * Following Linus comments on my original hack, this version + * depends only on the dcache stuff and doesn't touch the inode + * layer (iput() and friends). + * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" + +/* #define NFS_DEBUG_VERBOSE 1 */ + +static int nfs_opendir(struct inode *, struct file *); +static int nfs_closedir(struct inode *, struct file *); +static int nfs_readdir(struct file *, void *, filldir_t); +static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); +static int nfs_mkdir(struct inode *, struct dentry *, int); +static int nfs_rmdir(struct inode *, struct dentry *); +static int nfs_unlink(struct inode *, struct dentry *); +static int nfs_symlink(struct inode *, struct dentry *, const char *); +static int nfs_link(struct dentry *, struct inode *, struct dentry *); +static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); +static int nfs_rename(struct inode *, struct dentry *, + struct inode *, struct dentry *); +static int nfs_fsync_dir(struct file *, int); +static loff_t nfs_llseek_dir(struct file *, loff_t, int); +static void nfs_readdir_clear_array(struct page*); + +const struct file_operations nfs_dir_operations = { + .llseek = nfs_llseek_dir, + .read = generic_read_dir, + .readdir = nfs_readdir, + .open = nfs_opendir, + .release = nfs_closedir, + .fsync = nfs_fsync_dir, +}; + +const struct inode_operations nfs_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +const struct address_space_operations nfs_dir_aops = { + .freepage = nfs_readdir_clear_array, +}; + +#ifdef CONFIG_NFS_V3 +const struct inode_operations nfs3_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; +#endif /* CONFIG_NFS_V3 */ + +#ifdef CONFIG_NFS_V4 + +static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd); +const struct inode_operations nfs4_dir_inode_operations = { + .create = nfs_open_create, + .lookup = nfs_atomic_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + 
.getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +}; + +#endif /* CONFIG_NFS_V4 */ + +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) +{ + struct nfs_open_dir_context *ctx; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->duped = 0; + ctx->attr_gencount = NFS_I(dir)->attr_gencount; + ctx->dir_cookie = 0; + ctx->dup_cookie = 0; + ctx->cred = get_rpccred(cred); + return ctx; + } + return ERR_PTR(-ENOMEM); +} + +static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +{ + put_rpccred(ctx->cred); + kfree(ctx); +} + +/* + * Open file + */ +static int +nfs_opendir(struct inode *inode, struct file *filp) +{ + int res = 0; + struct nfs_open_dir_context *ctx; + struct rpc_cred *cred; + + dfprintk(FILE, "NFS: open dir(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_dir_context(inode, cred); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + filp->private_data = ctx; + if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { + /* This is a mountpoint, so d_revalidate will never + * have been called, so we need to refresh the + * inode (for close-open consistency) ourselves. + */ + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + } +out: + put_rpccred(cred); + return res; +} + +static int +nfs_closedir(struct inode *inode, struct file *filp) +{ + put_nfs_open_dir_context(filp->private_data); + return 0; +} + +struct nfs_cache_array_entry { + u64 cookie; + u64 ino; + struct qstr string; + unsigned char d_type; +}; + +struct nfs_cache_array { + unsigned int size; + int eof_index; + u64 last_cookie; + struct nfs_cache_array_entry array[0]; +}; + +typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); +typedef struct { + struct file *file; + struct page *page; + unsigned long page_index; + u64 *dir_cookie; + u64 last_cookie; + loff_t current_index; + decode_dirent_t decode; + + unsigned long timestamp; + unsigned long gencount; + unsigned int cache_entry_index; + unsigned int plus:1; + unsigned int eof:1; +} nfs_readdir_descriptor_t; + +/* + * The caller is responsible for calling nfs_readdir_release_array(page) + */ +static +struct nfs_cache_array *nfs_readdir_get_array(struct page *page) +{ + void *ptr; + if (page == NULL) + return ERR_PTR(-EIO); + ptr = kmap(page); + if (ptr == NULL) + return ERR_PTR(-ENOMEM); + return ptr; +} + +static +void nfs_readdir_release_array(struct page *page) +{ + kunmap(page); +} + +/* + * we are freeing strings created by nfs_add_to_readdir_array() + */ +static +void nfs_readdir_clear_array(struct page *page) +{ + struct nfs_cache_array *array; + int i; + + array = kmap_atomic(page, KM_USER0); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); + kunmap_atomic(array, KM_USER0); +} + +/* + * the caller is responsible for freeing qstr.name + * when called by nfs_readdir_add_to_array, the strings will be freed in + * nfs_clear_readdir_array() + */ +static +int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +{ + string->len = len; + string->name = kmemdup(name, len, GFP_KERNEL); + if (string->name == NULL) + return -ENOMEM; + /* + * Avoid a kmemleak false positive. 
The pointer to the name is stored + * in a page cache page which kmemleak does not scan. + */ + kmemleak_not_leak(string->name); + string->hash = full_name_hash(name, len); + return 0; +} + +static +int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +{ + struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array_entry *cache_entry; + int ret; + + if (IS_ERR(array)) + return PTR_ERR(array); + + cache_entry = &array->array[array->size]; + + /* Check that this entry lies within the page bounds */ + ret = -ENOSPC; + if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + goto out; + + cache_entry->cookie = entry->prev_cookie; + cache_entry->ino = entry->ino; + cache_entry->d_type = entry->d_type; + ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); + if (ret) + goto out; + array->last_cookie = entry->cookie; + array->size++; + if (entry->eof != 0) + array->eof_index = array->size; +out: + nfs_readdir_release_array(page); + return ret; +} + +static +int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + loff_t diff = desc->file->f_pos - desc->current_index; + unsigned int index; + + if (diff < 0) + goto out_eof; + if (diff >= array->size) { + if (array->eof_index >= 0) + goto out_eof; + return -EAGAIN; + } + + index = (unsigned int)diff; + *desc->dir_cookie = array->array[index].cookie; + desc->cache_entry_index = index; + return 0; +out_eof: + desc->eof = 1; + return -EBADCOOKIE; +} + +static +int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + int i; + loff_t new_pos; + int status = -EAGAIN; + + for (i = 0; i < array->size; i++) { + if (array->array[i].cookie == *desc->dir_cookie) { + struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); + struct nfs_open_dir_context *ctx = desc->file->private_data; + + new_pos = desc->current_index + i; + if (ctx->attr_gencount != nfsi->attr_gencount + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { + ctx->duped = 0; + ctx->attr_gencount = nfsi->attr_gencount; + } else if (new_pos < desc->file->f_pos) { + if (ctx->duped > 0 + && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %s/%s contains a readdir loop." + "Please contact your server vendor. 
" + "Offending cookie: %llu\n", + desc->file->f_dentry->d_parent->d_name.name, + desc->file->f_dentry->d_name.name, + *desc->dir_cookie); + } + status = -ELOOP; + goto out; + } + ctx->dup_cookie = *desc->dir_cookie; + ctx->duped = -1; + } + desc->file->f_pos = new_pos; + desc->cache_entry_index = i; + return 0; + } + } + if (array->eof_index >= 0) { + status = -EBADCOOKIE; + if (*desc->dir_cookie == array->last_cookie) + desc->eof = 1; + } +out: + return status; +} + +static +int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int status; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + + if (*desc->dir_cookie == 0) + status = nfs_readdir_search_for_pos(array, desc); + else + status = nfs_readdir_search_for_cookie(array, desc); + + if (status == -EAGAIN) { + desc->last_cookie = array->last_cookie; + desc->current_index += array->size; + desc->page_index++; + } + nfs_readdir_release_array(desc->page); +out: + return status; +} + +/* Fill a page with xdr information before transferring to the cache page */ +static +int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct file *file, struct inode *inode) +{ + struct nfs_open_dir_context *ctx = file->private_data; + struct rpc_cred *cred = ctx->cred; + unsigned long timestamp, gencount; + int error; + + again: + timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); + error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, + NFS_SERVER(inode)->dtsize, desc->plus); + if (error < 0) { + /* We requested READDIRPLUS, but the server doesn't grok it */ + if (error == -ENOTSUPP && desc->plus) { + NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + desc->plus = 0; + goto again; + } + goto error; + } + desc->timestamp = timestamp; + desc->gencount = gencount; +error: + return error; +} + +static int xdr_decode(nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct xdr_stream *xdr) +{ + int error; + + error = desc->decode(xdr, entry, desc->plus); + if (error) + return error; + entry->fattr->time_start = desc->timestamp; + entry->fattr->gencount = desc->gencount; + return 0; +} + +static +int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) +{ + if (dentry->d_inode == NULL) + goto different; + if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) + goto different; + return 1; +different: + return 0; +} + +static +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) +{ + struct qstr filename = { + .len = entry->len, + .name = entry->name, + }; + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct inode *inode; + + if (filename.name[0] == '.') { + if (filename.len == 1) + return; + if (filename.len == 2 && filename.name[1] == '.') + return; + } + filename.hash = full_name_hash(filename.name, filename.len); + + dentry = d_lookup(parent, &filename); + if (dentry != NULL) { + if (nfs_same_file(dentry, entry)) { + nfs_refresh_inode(dentry->d_inode, entry->fattr); + goto out; + } else { + d_drop(dentry); + dput(dentry); + } + } + + dentry = d_alloc(parent, &filename); + if (dentry == NULL) + return; + + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); + if (IS_ERR(inode)) + goto out; + + alias = d_materialise_unique(dentry, inode); + if (IS_ERR(alias)) + goto out; + else if (alias) { + 
nfs_set_verifier(alias, nfs_save_change_attribute(dir)); + dput(alias); + } else + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + +out: + dput(dentry); +} + +/* Perform conversion from xdr to cache array */ +static +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, + struct page **xdr_pages, struct page *page, unsigned int buflen) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct nfs_cache_array *array; + unsigned int count = 0; + int status; + + scratch = alloc_page(GFP_KERNEL); + if (scratch == NULL) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + do { + status = xdr_decode(desc, entry, &stream); + if (status != 0) { + if (status == -EAGAIN) + status = 0; + break; + } + + count++; + + if (desc->plus != 0) + nfs_prime_dcache(desc->file->f_path.dentry, entry); + + status = nfs_readdir_add_to_array(entry, page); + if (status != 0) + break; + } while (!entry->eof); + + if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { + array = nfs_readdir_get_array(page); + if (!IS_ERR(array)) { + array->eof_index = array->size; + status = 0; + nfs_readdir_release_array(page); + } else + status = PTR_ERR(array); + } + + put_page(scratch); + return status; +} + +static +void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + put_page(pages[i]); +} + +static +void nfs_readdir_free_large_page(void *ptr, struct page **pages, + unsigned int npages) +{ + nfs_readdir_free_pagearray(pages, npages); +} + +/* + * nfs_readdir_large_page will allocate pages that must be freed with a call + * to nfs_readdir_free_large_page + */ +static +int nfs_readdir_large_page(struct page **pages, unsigned int npages) +{ + unsigned int i; + + for (i = 0; i < npages; i++) { + struct page *page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out_freepages; + pages[i] = page; + } + return 0; + +out_freepages: + nfs_readdir_free_pagearray(pages, i); + return -ENOMEM; +} + +static +int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +{ + struct page *pages[NFS_MAX_READDIR_PAGES]; + void *pages_ptr = NULL; + struct nfs_entry entry; + struct file *file = desc->file; + struct nfs_cache_array *array; + int status = -ENOMEM; + unsigned int array_size = ARRAY_SIZE(pages); + + entry.prev_cookie = 0; + entry.cookie = desc->last_cookie; + entry.eof = 0; + entry.fh = nfs_alloc_fhandle(); + entry.fattr = nfs_alloc_fattr(); + entry.server = NFS_SERVER(inode); + if (entry.fh == NULL || entry.fattr == NULL) + goto out; + + array = nfs_readdir_get_array(page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; + + status = nfs_readdir_large_page(pages, array_size); + if (status < 0) + goto out_release_array; + do { + unsigned int pglen; + status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); + + if (status < 0) + break; + pglen = status; + status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); + if (status < 0) { + if (status == -ENOSPC) + status = 0; + break; + } + } while (array->eof_index < 0); + + nfs_readdir_free_large_page(pages_ptr, pages, array_size); +out_release_array: + nfs_readdir_release_array(page); +out: + nfs_free_fattr(entry.fattr); + nfs_free_fhandle(entry.fh); + return status; +} + +/* + * Now we 
cache directories properly, by converting xdr information + * to an array that can be used for lookups later. This results in + * fewer cache pages, since we can store more information on each page. + * We only need to convert from xdr once so future lookups are much simpler + */ +static +int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) +{ + struct inode *inode = desc->file->f_path.dentry->d_inode; + int ret; + + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) + goto error; + SetPageUptodate(page); + + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { + /* Should never happen */ + nfs_zap_mapping(inode, inode->i_mapping); + } + unlock_page(page); + return 0; + error: + unlock_page(page); + return ret; +} + +static +void cache_page_release(nfs_readdir_descriptor_t *desc) +{ + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); + page_cache_release(desc->page); + desc->page = NULL; +} + +static +struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +{ + return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + desc->page_index, (filler_t *)nfs_readdir_filler, desc); +} + +/* + * Returns 0 if desc->dir_cookie was found on page desc->page_index + */ +static +int find_cache_page(nfs_readdir_descriptor_t *desc) +{ + int res; + + desc->page = get_cache_page(desc); + if (IS_ERR(desc->page)) + return PTR_ERR(desc->page); + + res = nfs_readdir_search_array(desc); + if (res != 0) + cache_page_release(desc); + return res; +} + +/* Search for desc->dir_cookie from the beginning of the page cache */ +static inline +int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +{ + int res; + + if (desc->page_index == 0) { + desc->current_index = 0; + desc->last_cookie = 0; + } + do { + res = find_cache_page(desc); + } while (res == -EAGAIN); + return res; +} + +/* + * Once we've found the start of the dirent within a page: fill 'er up... + */ +static +int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, + filldir_t filldir) +{ + struct file *file = desc->file; + int i = 0; + int res = 0; + struct nfs_cache_array *array = NULL; + struct nfs_open_dir_context *ctx = file->private_data; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + res = PTR_ERR(array); + goto out; + } + + for (i = desc->cache_entry_index; i < array->size; i++) { + struct nfs_cache_array_entry *ent; + + ent = &array->array[i]; + if (filldir(dirent, ent->string.name, ent->string.len, + file->f_pos, nfs_compat_user_ino64(ent->ino), + ent->d_type) < 0) { + desc->eof = 1; + break; + } + file->f_pos++; + if (i < (array->size-1)) + *desc->dir_cookie = array->array[i+1].cookie; + else + *desc->dir_cookie = array->last_cookie; + if (ctx->duped != 0) + ctx->duped = 1; + } + if (array->eof_index >= 0) + desc->eof = 1; + + nfs_readdir_release_array(desc->page); +out: + cache_page_release(desc); + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", + (unsigned long long)*desc->dir_cookie, res); + return res; +} + +/* + * If we cannot find a cookie in our cache, we suspect that this is + * because it points to a deleted file, so we ask the server to return + * whatever it thinks is the next entry. We then feed this to filldir. + * If all goes well, we should then be able to find our way round the + * cache on the next call to readdir_search_pagecache(); + * + * NOTE: we cannot add the anonymous page to the pagecache because + * the data it contains might not be page aligned. 
Besides, + * we should already have a complete representation of the + * directory in the page cache by the time we get here. + */ +static inline +int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, + filldir_t filldir) +{ + struct page *page = NULL; + int status; + struct inode *inode = desc->file->f_path.dentry->d_inode; + struct nfs_open_dir_context *ctx = desc->file->private_data; + + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", + (unsigned long long)*desc->dir_cookie); + + page = alloc_page(GFP_HIGHUSER); + if (!page) { + status = -ENOMEM; + goto out; + } + + desc->page_index = 0; + desc->last_cookie = *desc->dir_cookie; + desc->page = page; + ctx->duped = 0; + + status = nfs_readdir_xdr_to_array(desc, page, inode); + if (status < 0) + goto out_release; + + status = nfs_do_filldir(desc, dirent, filldir); + + out: + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", + __func__, status); + return status; + out_release: + cache_page_release(desc); + goto out; +} + +/* The file offset position represents the dirent entry number. A + last cookie cache takes care of the common case of reading the + whole directory. + */ +static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + nfs_readdir_descriptor_t my_desc, + *desc = &my_desc; + struct nfs_open_dir_context *dir_ctx = filp->private_data; + int res; + + dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (long long)filp->f_pos); + nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); + + /* + * filp->f_pos points to the dirent entry number. + * *desc->dir_cookie has the cookie for the next entry. We have + * to either find the entry with the appropriate number or + * revalidate the cookie. 
+ */ + memset(desc, 0, sizeof(*desc)); + + desc->file = filp; + desc->dir_cookie = &dir_ctx->dir_cookie; + desc->decode = NFS_PROTO(inode)->decode_dirent; + desc->plus = NFS_USE_READDIRPLUS(inode); + + nfs_block_sillyrename(dentry); + res = nfs_revalidate_mapping(inode, filp->f_mapping); + if (res < 0) + goto out; + + do { + res = readdir_search_pagecache(desc); + + if (res == -EBADCOOKIE) { + res = 0; + /* This means either end of directory */ + if (*desc->dir_cookie && desc->eof == 0) { + /* Or that the server has 'lost' a cookie */ + res = uncached_readdir(desc, dirent, filldir); + if (res == 0) + continue; + } + break; + } + if (res == -ETOOSMALL && desc->plus) { + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + nfs_zap_caches(inode); + desc->page_index = 0; + desc->plus = 0; + desc->eof = 0; + continue; + } + if (res < 0) + break; + + res = nfs_do_filldir(desc, dirent, filldir); + if (res < 0) + break; + } while (!desc->eof); +out: + nfs_unblock_sillyrename(dentry); + if (res > 0) + res = 0; + dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + res); + return res; +} + +static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct nfs_open_dir_context *dir_ctx = filp->private_data; + + dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, + offset, origin); + + mutex_lock(&inode->i_mutex); + switch (origin) { + case 1: + offset += filp->f_pos; + case 0: + if (offset >= 0) + break; + default: + offset = -EINVAL; + goto out; + } + if (offset != filp->f_pos) { + filp->f_pos = offset; + dir_ctx->dir_cookie = 0; + dir_ctx->duped = 0; + } +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +/* + * All directory operations under NFS are synchronous, so fsync() + * is a dummy operation. + */ +static int nfs_fsync_dir(struct file *filp, int datasync) +{ + struct dentry *dentry = filp->f_path.dentry; + + dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); + + nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); + return 0; +} + +/** + * nfs_force_lookup_revalidate - Mark the directory as having changed + * @dir - pointer to directory inode + * + * This forces the revalidation code in nfs_lookup_revalidate() to do a + * full lookup on all child dentries of 'dir' whenever a change occurs + * on the server that might have invalidated our dcache. + * + * The caller should be holding dir->i_lock + */ +void nfs_force_lookup_revalidate(struct inode *dir) +{ + NFS_I(dir)->cache_change_attribute++; +} + +/* + * A check for whether or not the parent directory has changed. + * In the case it has, we assume that the dentries are untrustworthy + * and may need to be looked up again. 
+ */ +static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) +{ + if (IS_ROOT(dentry)) + return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; + if (!nfs_verify_change_attribute(dir, dentry->d_time)) + return 0; + /* Revalidate nfsi->cache_change_attribute before we declare a match */ + if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) + return 0; + if (!nfs_verify_change_attribute(dir, dentry->d_time)) + return 0; + return 1; +} + +/* + * Return the intent data that applies to this particular path component + * + * Note that the current set of intents only apply to the very last + * component of the path. + * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. + */ +static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, + unsigned int mask) +{ + if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) + return 0; + return nd->flags & mask; +} + +/* + * Use intent information to check whether or not we're going to do + * an O_EXCL create using this path component. + */ +static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) +{ + if (NFS_PROTO(dir)->version == 2) + return 0; + return nd && nfs_lookup_check_intent(nd, LOOKUP_EXCL); +} + +/* + * Inode and filehandle revalidation for lookups. + * + * We force revalidation in the cases where the VFS sets LOOKUP_REVAL, + * or if the intent information indicates that we're about to open this + * particular file and the "nocto" mount flag is not set. + * + */ +static inline +int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) +{ + struct nfs_server *server = NFS_SERVER(inode); + + if (IS_AUTOMOUNT(inode)) + return 0; + if (nd != NULL) { + /* VFS wants an on-the-wire revalidation */ + if (nd->flags & LOOKUP_REVAL) + goto out_force; + /* This is an open(2) */ + if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 && + !(server->flags & NFS_MOUNT_NOCTO) && + (S_ISREG(inode->i_mode) || + S_ISDIR(inode->i_mode))) + goto out_force; + return 0; + } + return nfs_revalidate_inode(server, inode); +out_force: + return __nfs_revalidate_inode(server, inode); +} + +/* + * We judge how long we want to trust negative + * dentries by looking at the parent inode mtime. + * + * If parent mtime has changed, we revalidate, else we wait for a + * period corresponding to the parent's attribute cache timeout value. + */ +static inline +int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + /* Don't revalidate a negative dentry if we're creating a new file */ + if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) + return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; + return !nfs_check_verifier(dir, dentry); +} + +/* + * This is called every time the dcache has a lookup hit, + * and we should check whether we can really trust that + * lookup. + * + * NOTE! The hit can be a negative hit too, don't assume + * we have an inode! + * + * If the parent directory is seen to have changed, we throw out the + * cached dentry and do a new lookup. 
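/*
 * Illustrative sketch, not part of this patch, of the ->d_revalidate() contract
 * that nfs_lookup_revalidate() below implements: return 1 to keep the cached
 * dentry, 0 to make the VFS drop it and repeat the lookup, and -ECHILD to punt
 * out of RCU-walk mode.  This body only shows the verifier comparison; the real
 * routine additionally revalidates the inode and filehandle.
 */
static int demo_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	struct dentry *parent;
	struct inode *dir;
	int valid;

	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;			/* cannot take references or sleep here */

	parent = dget_parent(dentry);
	dir = parent->d_inode;

	/* trust the dentry only while the parent's change attribute still matches
	 * the verifier stored in dentry->d_time by nfs_set_verifier() */
	valid = nfs_verify_change_attribute(dir, dentry->d_time);

	dput(parent);
	return valid ? 1 : 0;
}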
+ */ +static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *dir; + struct inode *inode; + struct dentry *parent; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + int error; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + dir = parent->d_inode; + nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); + inode = dentry->d_inode; + + if (!inode) { + if (nfs_neg_need_reval(dir, dentry, nd)) + goto out_bad; + goto out_valid; + } + + if (is_bad_inode(inode)) { + dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + goto out_bad; + } + + if (nfs_have_delegation(inode, FMODE_READ)) + goto out_set_verifier; + + /* Force a full look up iff the parent directory has changed */ + if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) { + if (nfs_lookup_verify_inode(inode, nd)) + goto out_zap_parent; + goto out_valid; + } + + if (NFS_STALE(inode)) + goto out_bad; + + error = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out_error; + + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + if (error) + goto out_bad; + if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out_bad; + if ((error = nfs_refresh_inode(inode, fattr)) != 0) + goto out_bad; + + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); +out_set_verifier: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out_valid: + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 1; +out_zap_parent: + nfs_zap_caches(dir); + out_bad: + nfs_mark_for_revalidate(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. */ + nfs_zap_caches(inode); + /* If we have submounts, don't unhash ! */ + if (have_submounts(dentry)) + goto out_valid; + if (dentry->d_flags & DCACHE_DISCONNECTED) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 0; +out_error: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name, error); + return error; +} + +/* + * This is called from dput() when d_count is going to 0. + */ +static int nfs_dentry_delete(const struct dentry *dentry) +{ + dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + dentry->d_flags); + + /* Unhash any dentry with a stale inode */ + if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) + return 1; + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + /* Unhash it, so that ->d_iput() would be called */ + return 1; + } + if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + /* Unhash it, so that ancestors of killed async unlink + * files will be cleaned up during umount */ + return 1; + } + return 0; + +} + +static void nfs_drop_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&inode->i_lock); +} + +/* + * Called when the dentry loses inode. + * We use it to clean up silly-renamed files. 
+ */ +static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + /* drop any readdir cache as it could easily be old */ + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + drop_nlink(inode); + nfs_complete_unlink(dentry, inode); + } + iput(inode); +} + +static void nfs_d_release(struct dentry *dentry) +{ + /* free cached devname value, if it survived that far */ + if (unlikely(dentry->d_fsdata)) { + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + WARN_ON(1); + else + kfree(dentry->d_fsdata); + } +} + +const struct dentry_operations nfs_dentry_operations = { + .d_revalidate = nfs_lookup_revalidate, + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, +}; + +static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *res; + struct dentry *parent; + struct inode *inode = NULL; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + int error; + + dfprintk(VFS, "NFS: lookup(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); + + res = ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + goto out; + + /* + * If we're doing an exclusive create, optimize away the lookup + * but don't hash the dentry. + */ + if (nfs_is_exclusive_create(dir, nd)) { + d_instantiate(dentry, NULL); + res = NULL; + goto out; + } + + res = ERR_PTR(-ENOMEM); + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out; + + parent = dentry->d_parent; + /* Protect against concurrent sillydeletes */ + nfs_block_sillyrename(parent); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + if (error == -ENOENT) + goto no_entry; + if (error < 0) { + res = ERR_PTR(error); + goto out_unblock_sillyrename; + } + inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + res = ERR_CAST(inode); + if (IS_ERR(res)) + goto out_unblock_sillyrename; + +no_entry: + res = d_materialise_unique(dentry, inode); + if (res != NULL) { + if (IS_ERR(res)) + goto out_unblock_sillyrename; + dentry = res; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +out_unblock_sillyrename: + nfs_unblock_sillyrename(parent); +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + return res; +} + +#ifdef CONFIG_NFS_V4 +static int nfs_open_revalidate(struct dentry *, struct nameidata *); + +const struct dentry_operations nfs4_dentry_operations = { + .d_revalidate = nfs_open_revalidate, + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, +}; + +/* + * Use intent information to determine whether we need to substitute + * the NFSv4-style stateful OPEN for the LOOKUP call + */ +static int is_atomic_open(struct nameidata *nd) +{ + if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0) + return 0; + /* NFS does not (yet) have a stateful open for directories */ + if (nd->flags & LOOKUP_DIRECTORY) + return 0; + /* Are we trying to write to a read only partition? 
*/ + if (__mnt_is_readonly(nd->path.mnt) && + (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) + return 0; + return 1; +} + +static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd) +{ + struct path path = { + .mnt = nd->path.mnt, + .dentry = dentry, + }; + struct nfs_open_context *ctx; + struct rpc_cred *cred; + fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return ERR_CAST(cred); + ctx = alloc_nfs_open_context(&path, cred, fmode); + put_rpccred(cred); + if (ctx == NULL) + return ERR_PTR(-ENOMEM); + return ctx; +} + +static int do_open(struct inode *inode, struct file *filp) +{ + nfs_fscache_set_inode_cookie(inode, filp); + return 0; +} + +static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx) +{ + struct file *filp; + int ret = 0; + + /* If the open_intent is for execute, we have an extra check to make */ + if (ctx->mode & FMODE_EXEC) { + ret = nfs_may_open(ctx->path.dentry->d_inode, + ctx->cred, + nd->intent.open.flags); + if (ret < 0) + goto out; + } + filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open); + if (IS_ERR(filp)) + ret = PTR_ERR(filp); + else + nfs_file_set_open_context(filp, ctx); +out: + put_nfs_open_context(ctx); + return ret; +} + +static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct nfs_open_context *ctx; + struct iattr attr; + struct dentry *res = NULL; + struct inode *inode; + int open_flags; + int err; + + dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + /* Check that we are indeed trying to open this file */ + if (!is_atomic_open(nd)) + goto no_open; + + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { + res = ERR_PTR(-ENAMETOOLONG); + goto out; + } + + /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash + * the dentry. 
*/ + if (nd->flags & LOOKUP_EXCL) { + d_instantiate(dentry, NULL); + goto out; + } + + ctx = nameidata_to_nfs_open_context(dentry, nd); + res = ERR_CAST(ctx); + if (IS_ERR(ctx)) + goto out; + + open_flags = nd->intent.open.flags; + if (nd->flags & LOOKUP_CREATE) { + attr.ia_mode = nd->intent.open.create_mode; + attr.ia_valid = ATTR_MODE; + attr.ia_mode &= ~current_umask(); + } else { + open_flags &= ~(O_EXCL | O_CREAT); + attr.ia_valid = 0; + } + + /* Open the file on the server */ + nfs_block_sillyrename(dentry->d_parent); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); + if (IS_ERR(inode)) { + nfs_unblock_sillyrename(dentry->d_parent); + put_nfs_open_context(ctx); + switch (PTR_ERR(inode)) { + /* Make a negative dentry */ + case -ENOENT: + d_add(dentry, NULL); + res = NULL; + goto out; + /* This turned out not to be a regular file */ + case -EISDIR: + case -ENOTDIR: + goto no_open; + case -ELOOP: + if (!(nd->intent.open.flags & O_NOFOLLOW)) + goto no_open; + /* case -EINVAL: */ + default: + res = ERR_CAST(inode); + goto out; + } + } + res = d_add_unique(dentry, inode); + nfs_unblock_sillyrename(dentry->d_parent); + if (res != NULL) { + dput(ctx->path.dentry); + ctx->path.dentry = dget(res); + dentry = res; + } + err = nfs_intent_set_file(nd, ctx); + if (err < 0) { + if (res != NULL) + dput(res); + return ERR_PTR(err); + } +out: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + return res; +no_open: + return nfs_lookup(dir, dentry, nd); +} + +static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *parent = NULL; + struct inode *inode; + struct inode *dir; + struct nfs_open_context *ctx; + int openflags, ret = 0; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + if (!is_atomic_open(nd) || d_mountpoint(dentry)) + goto no_open; + + parent = dget_parent(dentry); + dir = parent->d_inode; + + /* We can't create new files in nfs_open_revalidate(), so we + * optimize away revalidation of negative dentries. + */ + if (inode == NULL) { + if (!nfs_neg_need_reval(dir, dentry, nd)) + ret = 1; + goto out; + } + + /* NFS only supports OPEN on regular files */ + if (!S_ISREG(inode->i_mode)) + goto no_open_dput; + openflags = nd->intent.open.flags; + /* We cannot do exclusive creation on a positive dentry */ + if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) + goto no_open_dput; + /* We can't create new files, or truncate existing ones here */ + openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); + + ctx = nameidata_to_nfs_open_context(dentry, nd); + ret = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + /* + * Note: we're not holding inode->i_mutex and so may be racing with + * operations that change the directory. We therefore save the + * change attribute *before* we do the RPC call. 
+ */ + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + switch (ret) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + goto out_put_ctx; + default: + goto out_drop; + } + } + iput(inode); + if (inode != dentry->d_inode) + goto out_drop; + + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + ret = nfs_intent_set_file(nd, ctx); + if (ret >= 0) + ret = 1; +out: + dput(parent); + return ret; +out_drop: + d_drop(dentry); + ret = 0; +out_put_ctx: + put_nfs_open_context(ctx); + goto out; + +no_open_dput: + dput(parent); +no_open: + return nfs_lookup_revalidate(dentry, nd); +} + +static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct nfs_open_context *ctx = NULL; + struct iattr attr; + int error; + int open_flags = 0; + + dfprintk(VFS, "NFS: create(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + if ((nd->flags & LOOKUP_CREATE) != 0) { + open_flags = nd->intent.open.flags; + + ctx = nameidata_to_nfs_open_context(dentry, nd); + error = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out_err_drop; + } + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); + if (error != 0) + goto out_put_ctx; + if (ctx != NULL) { + error = nfs_intent_set_file(nd, ctx); + if (error < 0) + goto out_err; + } + return 0; +out_put_ctx: + if (ctx != NULL) + put_nfs_open_context(ctx); +out_err_drop: + d_drop(dentry); +out_err: + return error; +} + +#endif /* CONFIG_NFSV4 */ + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct dentry *parent = dget_parent(dentry); + struct inode *dir = parent->d_inode; + struct inode *inode; + int error = -EACCES; + + d_drop(dentry); + + /* We may have been initialized further down */ + if (dentry->d_inode) + goto out; + if (fhandle->size == 0) { + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + if (error) + goto out_error; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (!(fattr->valid & NFS_ATTR_FATTR)) { + struct nfs_server *server = NFS_SB(dentry->d_sb); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); + if (error < 0) + goto out_error; + } + inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + error = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_error; + d_add(dentry, inode); +out: + dput(parent); + return 0; +out_error: + nfs_mark_for_revalidate(dir); + dput(parent); + return error; +} + +/* + * Following a failed create operation, we drop the dentry rather + * than retain a negative dentry. This avoids a problem in the event + * that the operation succeeded on the server, but an error in the + * reply path made it appear to have failed. 
+ */ +static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct iattr attr; + int error; + int open_flags = 0; + + dfprintk(VFS, "NFS: create(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + if ((nd->flags & LOOKUP_CREATE) != 0) + open_flags = nd->intent.open.flags; + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); + if (error != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return error; +} + +/* + * See comments for nfs_proc_create regarding failed operations. + */ +static int +nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct iattr attr; + int status; + + dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); + if (status != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return status; +} + +/* + * See comments for nfs_proc_create regarding failed operations. + */ +static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct iattr attr; + int error; + + dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_valid = ATTR_MODE; + attr.ia_mode = mode | S_IFDIR; + + error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + if (error != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return error; +} + +static void nfs_dentry_handle_enoent(struct dentry *dentry) +{ + if (dentry->d_inode != NULL && !d_unhashed(dentry)) + d_delete(dentry); +} + +static int nfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + + dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + /* Ensure the VFS deletes this inode */ + if (error == 0 && dentry->d_inode != NULL) + clear_nlink(dentry->d_inode); + else if (error == -ENOENT) + nfs_dentry_handle_enoent(dentry); + + return error; +} + +/* + * Remove a file after making sure there are no pending writes, + * and after checking that the file has only one user. + * + * We invalidate the attribute cache and free the inode prior to the operation + * to avoid possible races if the server reuses the inode. + */ +static int nfs_safe_remove(struct dentry *dentry) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct inode *inode = dentry->d_inode; + int error = -EBUSY; + + dfprintk(VFS, "NFS: safe_remove(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* If the dentry was sillyrenamed, we simply call d_delete() */ + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + error = 0; + goto out; + } + + if (inode != NULL) { + nfs_inode_return_delegation(inode); + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + /* The VFS may want to delete this inode */ + if (error == 0) + nfs_drop_nlink(inode); + nfs_mark_for_revalidate(inode); + } else + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + if (error == -ENOENT) + nfs_dentry_handle_enoent(dentry); +out: + return error; +} + +/* We do silly rename. In case sillyrename() returns -EBUSY, the inode + * belongs to an active ".nfs..." file and we return -EBUSY. + * + * If sillyrename() returns 0, we do nothing, otherwise we unlink. 
+ */ +static int nfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + int need_rehash = 0; + + dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, + dir->i_ino, dentry->d_name.name); + + spin_lock(&dentry->d_lock); + if (dentry->d_count > 1) { + spin_unlock(&dentry->d_lock); + /* Start asynchronous writeout of the inode */ + write_inode_now(dentry->d_inode, 0); + error = nfs_sillyrename(dir, dentry); + return error; + } + if (!d_unhashed(dentry)) { + __d_drop(dentry); + need_rehash = 1; + } + spin_unlock(&dentry->d_lock); + error = nfs_safe_remove(dentry); + if (!error || error == -ENOENT) { + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + } else if (need_rehash) + d_rehash(dentry); + return error; +} + +/* + * To create a symbolic link, most file systems instantiate a new inode, + * add a page to it containing the path, then write it out to the disk + * using prepare_write/commit_write. + * + * Unfortunately the NFS client can't create the in-core inode first + * because it needs a file handle to create an in-core inode (see + * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the + * symlink request has completed on the server. + * + * So instead we allocate a raw page, copy the symname into it, then do + * the SYMLINK request with the page as the buffer. If it succeeds, we + * now have a new file handle and can instantiate an in-core NFS inode + * and move the raw page into its mapping. + */ +static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + struct pagevec lru_pvec; + struct page *page; + char *kaddr; + struct iattr attr; + unsigned int pathlen = strlen(symname); + int error; + + dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, + dir->i_ino, dentry->d_name.name, symname); + + if (pathlen > PAGE_SIZE) + return -ENAMETOOLONG; + + attr.ia_mode = S_IFLNK | S_IRWXUGO; + attr.ia_valid = ATTR_MODE; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, symname, pathlen); + if (pathlen < PAGE_SIZE) + memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); + kunmap_atomic(kaddr, KM_USER0); + + error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); + if (error != 0) { + dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", + dir->i_sb->s_id, dir->i_ino, + dentry->d_name.name, symname, error); + d_drop(dentry); + __free_page(page); + return error; + } + + /* + * No big deal if we can't add this page to the page cache here. + * READLINK will get the missing page from the server if needed. 
+ */ + pagevec_init(&lru_pvec, 0); + if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, + GFP_KERNEL)) { + pagevec_add(&lru_pvec, page); + pagevec_lru_add_file(&lru_pvec); + SetPageUptodate(page); + unlock_page(page); + } else + __free_page(page); + + return 0; +} + +static int +nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n", + old_dentry->d_parent->d_name.name, old_dentry->d_name.name, + dentry->d_parent->d_name.name, dentry->d_name.name); + + nfs_inode_return_delegation(inode); + + d_drop(dentry); + error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); + if (error == 0) { + ihold(inode); + d_add(dentry, inode); + } + return error; +} + +/* + * RENAME + * FIXME: Some nfsds, like the Linux user space nfsd, may generate a + * different file handle for the same inode after a rename (e.g. when + * moving to a different directory). A fail-safe method to do so would + * be to look up old_dir/old_name, create a link to new_dir/new_name and + * rename the old file using the sillyrename stuff. This way, the original + * file in old_dir will go away when the last process iput()s the inode. + * + * FIXED. + * + * It actually works quite well. One needs to have the possibility for + * at least one ".nfs..." file in each directory the file ever gets + * moved or linked to which happens automagically with the new + * implementation that only depends on the dcache stuff instead of + * using the inode layer + * + * Unfortunately, things are a little more complicated than indicated + * above. For a cross-directory move, we want to make sure we can get + * rid of the old inode after the operation. This means there must be + * no pending writes (if it's a file), and the use count must be 1. + * If these conditions are met, we can drop the dentries before doing + * the rename. + */ +static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct dentry *dentry = NULL, *rehash = NULL; + int error = -EBUSY; + + dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", + old_dentry->d_parent->d_name.name, old_dentry->d_name.name, + new_dentry->d_parent->d_name.name, new_dentry->d_name.name, + new_dentry->d_count); + + /* + * For non-directories, check whether the target is busy and if so, + * make a copy of the dentry and then do a silly-rename. If the + * silly-rename succeeds, the copied dentry is hashed and becomes + * the new target. + */ + if (new_inode && !S_ISDIR(new_inode->i_mode)) { + /* + * To prevent any new references to the target during the + * rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + rehash = new_dentry; + } + + if (new_dentry->d_count > 2) { + int err; + + /* copy the target dentry's name */ + dentry = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!dentry) + goto out; + + /* silly-rename the existing target ... 
*/ + err = nfs_sillyrename(new_dir, new_dentry); + if (err) + goto out; + + new_dentry = dentry; + rehash = NULL; + new_inode = NULL; + } + } + + nfs_inode_return_delegation(old_inode); + if (new_inode != NULL) + nfs_inode_return_delegation(new_inode); + + error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name); + nfs_mark_for_revalidate(old_inode); +out: + if (rehash) + d_rehash(rehash); + if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); + } else if (error == -ENOENT) + nfs_dentry_handle_enoent(old_dentry); + + /* new dentry created? */ + if (dentry) + dput(dentry); + return error; +} + +static DEFINE_SPINLOCK(nfs_access_lru_lock); +static LIST_HEAD(nfs_access_lru_list); +static atomic_long_t nfs_access_nr_entries; + +static void nfs_access_free_entry(struct nfs_access_entry *entry) +{ + put_rpccred(entry->cred); + kfree(entry); + smp_mb__before_atomic_dec(); + atomic_long_dec(&nfs_access_nr_entries); + smp_mb__after_atomic_dec(); +} + +static void nfs_access_free_list(struct list_head *head) +{ + struct nfs_access_entry *cache; + + while (!list_empty(head)) { + cache = list_entry(head->next, struct nfs_access_entry, lru); + list_del(&cache->lru); + nfs_access_free_entry(cache); + } +} + +int nfs_access_cache_shrinker(struct shrinker *shrink, + struct shrink_control *sc) +{ + LIST_HEAD(head); + struct nfs_inode *nfsi, *next; + struct nfs_access_entry *cache; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) + return (nr_to_scan == 0) ? 0 : -1; + + spin_lock(&nfs_access_lru_lock); + list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { + struct inode *inode; + + if (nr_to_scan-- == 0) + break; + inode = &nfsi->vfs_inode; + spin_lock(&inode->i_lock); + if (list_empty(&nfsi->access_cache_entry_lru)) + goto remove_lru_entry; + cache = list_entry(nfsi->access_cache_entry_lru.next, + struct nfs_access_entry, lru); + list_move(&cache->lru, &head); + rb_erase(&cache->rb_node, &nfsi->access_cache); + if (!list_empty(&nfsi->access_cache_entry_lru)) + list_move_tail(&nfsi->access_cache_inode_lru, + &nfs_access_lru_list); + else { +remove_lru_entry: + list_del_init(&nfsi->access_cache_inode_lru); + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); + smp_mb__after_clear_bit(); + } + spin_unlock(&inode->i_lock); + } + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); + return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; +} + +static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) +{ + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node *n; + struct nfs_access_entry *entry; + + /* Unhook entries from the cache */ + while ((n = rb_first(root_node)) != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + rb_erase(n, root_node); + list_move(&entry->lru, head); + } + nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; +} + +void nfs_access_zap_cache(struct inode *inode) +{ + LIST_HEAD(head); + + if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) + return; + /* Remove from global LRU init */ + spin_lock(&nfs_access_lru_lock); + if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_del_init(&NFS_I(inode)->access_cache_inode_lru); + + spin_lock(&inode->i_lock); + __nfs_access_zap_cache(NFS_I(inode), 
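/*
 * Illustrative sketch, not part of the patch: nfs_access_zap_cache() and the
 * shrinker above both unhook cache entries onto a private list while the
 * locks are held and only free them after the locks are dropped.  The
 * standalone userspace fragment below shows that same "collect under the
 * lock, free outside it" pattern with a pthread mutex; struct entry,
 * cache_head and zap_cache() are stand-ins invented for the example.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	/* payload would live here */
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache_head;

static void zap_cache(void)
{
	struct entry *victims, *e;

	/* unhook everything while the lock is held ... */
	pthread_mutex_lock(&cache_lock);
	victims = cache_head;
	cache_head = NULL;
	pthread_mutex_unlock(&cache_lock);

	/* ... and do the potentially slow freeing without it */
	while ((e = victims) != NULL) {
		victims = e->next;
		free(e);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct entry *e = calloc(1, sizeof(*e));
		if (!e)
			break;
		e->next = cache_head;
		cache_head = e;
	}
	zap_cache();
	return 0;
}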
&head); + spin_unlock(&inode->i_lock); + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); +} + +static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) +{ + struct rb_node *n = NFS_I(inode)->access_cache.rb_node; + struct nfs_access_entry *entry; + + while (n != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + + if (cred < entry->cred) + n = n->rb_left; + else if (cred > entry->cred) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_access_entry *cache; + int err = -ENOENT; + + spin_lock(&inode->i_lock); + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + if (cache == NULL) + goto out; + if (!nfs_have_delegated_attributes(inode) && + !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + goto out_stale; + res->jiffies = cache->jiffies; + res->cred = cache->cred; + res->mask = cache->mask; + list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); + err = 0; +out: + spin_unlock(&inode->i_lock); + return err; +out_stale: + rb_erase(&cache->rb_node, &nfsi->access_cache); + list_del(&cache->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(cache); + return -ENOENT; +out_zap: + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + return -ENOENT; +} + +static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node **p = &root_node->rb_node; + struct rb_node *parent = NULL; + struct nfs_access_entry *entry; + + spin_lock(&inode->i_lock); + while (*p != NULL) { + parent = *p; + entry = rb_entry(parent, struct nfs_access_entry, rb_node); + + if (set->cred < entry->cred) + p = &parent->rb_left; + else if (set->cred > entry->cred) + p = &parent->rb_right; + else + goto found; + } + rb_link_node(&set->rb_node, parent, p); + rb_insert_color(&set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); + spin_unlock(&inode->i_lock); + return; +found: + rb_replace_node(parent, &set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); + list_del(&entry->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(entry); +} + +static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +{ + struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); + if (cache == NULL) + return; + RB_CLEAR_NODE(&cache->rb_node); + cache->jiffies = set->jiffies; + cache->cred = get_rpccred(set->cred); + cache->mask = set->mask; + + nfs_access_add_rbtree(inode, cache); + + /* Update accounting */ + smp_mb__before_atomic_inc(); + atomic_long_inc(&nfs_access_nr_entries); + smp_mb__after_atomic_inc(); + + /* Add inode to global LRU list */ + if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { + spin_lock(&nfs_access_lru_lock); + if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_add_tail(&NFS_I(inode)->access_cache_inode_lru, + &nfs_access_lru_list); + spin_unlock(&nfs_access_lru_lock); + } +} + +static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) +{ + struct nfs_access_entry cache; + int status; + + status = nfs_access_get_cached(inode, cred, &cache); + if (status == 0) + goto out; 
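/*
 * Illustrative sketch, not part of the patch: nfs_do_access() here asks the
 * server for every right at once and then answers later permission checks
 * from the cached mask.  The userspace fragment below shows only that bit
 * test; the MAY_* values mirror the kernel's and nfs_check_cached_access()
 * is a name invented for the example.
 */
#include <errno.h>
#include <stdio.h>

#define MAY_EXEC	0x1
#define MAY_WRITE	0x2
#define MAY_READ	0x4

/* granted: mask the server returned; requested: what the caller wants now */
static int nfs_check_cached_access(int granted, int requested)
{
	/* every requested bit must already be present in the granted mask */
	if ((requested & ~granted & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
		return 0;
	return -EACCES;
}

int main(void)
{
	int granted = MAY_READ | MAY_EXEC;	/* e.g. the server said "r-x" */

	printf("read:  %d\n", nfs_check_cached_access(granted, MAY_READ));
	printf("write: %d\n", nfs_check_cached_access(granted, MAY_WRITE));
	return 0;
}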
+ + /* Be clever: ask server to check for all possible rights */ + cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; + cache.cred = cred; + cache.jiffies = jiffies; + status = NFS_PROTO(inode)->access(inode, &cache); + if (status != 0) { + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } + return status; + } + nfs_access_add_cache(inode, &cache); +out: + if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int nfs_open_permission_mask(int openflags) +{ + int mask = 0; + + if (openflags & FMODE_READ) + mask |= MAY_READ; + if (openflags & FMODE_WRITE) + mask |= MAY_WRITE; + if (openflags & FMODE_EXEC) + mask |= MAY_EXEC; + return mask; +} + +int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) +{ + return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); +} + +int nfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct rpc_cred *cred; + int res = 0; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + nfs_inc_stats(inode, NFSIOS_VFSACCESS); + + if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + goto out; + /* Is this sys_access() ? */ + if (mask & (MAY_ACCESS | MAY_CHDIR)) + goto force_lookup; + + switch (inode->i_mode & S_IFMT) { + case S_IFLNK: + goto out; + case S_IFREG: + /* NFSv4 has atomic_open... */ + if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) + && (mask & MAY_OPEN) + && !(mask & MAY_EXEC)) + goto out; + break; + case S_IFDIR: + /* + * Optimize away all write operations, since the server + * will check permissions when we perform the op. + */ + if ((mask & MAY_WRITE) && !(mask & MAY_READ)) + goto out; + } + +force_lookup: + if (!NFS_PROTO(inode)->access) + goto out_notsup; + + cred = rpc_lookup_cred(); + if (!IS_ERR(cred)) { + res = nfs_do_access(inode, cred, mask); + put_rpccred(cred); + } else + res = PTR_ERR(cred); +out: + if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) + res = -EACCES; + + dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", + inode->i_sb->s_id, inode->i_ino, mask, res); + return res; +out_notsup: + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (res == 0) + res = generic_permission(inode, mask, flags, NULL); + goto out; +} + +/* + * Local variables: + * version-control: t + * kept-new-versions: 5 + * End: + */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c new file mode 100644 index 00000000..8eea2536 --- /dev/null +++ b/fs/nfs/direct.c @@ -0,0 +1,1039 @@ +/* + * linux/fs/nfs/direct.c + * + * Copyright (C) 2003 by Chuck Lever + * + * High-performance uncached I/O for the Linux NFS client + * + * There are important applications whose performance or correctness + * depends on uncached access to file data. Database clusters + * (multiple copies of the same instance running on separate hosts) + * implement their own cache coherency protocol that subsumes file + * system cache protocols. Applications that process datasets + * considerably larger than the client's memory do not always benefit + * from a local cache. A streaming video server, for instance, has no + * need to cache the contents of a file. + * + * When an application requests uncached I/O, all read and write requests + * are made directly to the server; data stored or fetched via these + * requests is not cached in the Linux page cache. The client does not + * correct unaligned requests from applications. 
All requested bytes are + * held on permanent storage before a direct write system call returns to + * an application. + * + * Solaris implements an uncached I/O facility called directio() that + * is used for backups and sequential I/O to very large files. Solaris + * also supports uncaching whole NFS partitions with "-o forcedirectio," + * an undocumented mount option. + * + * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with + * help from Andrew Morton. + * + * 18 Dec 2001 Initial implementation for 2.4 --cel + * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy + * 08 Jun 2003 Port to 2.5 APIs --cel + * 31 Mar 2004 Handle direct I/O without VFS support --cel + * 15 Sep 2004 Parallel async reads --cel + * 04 May 2005 support O_DIRECT with aio --cel + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "iostat.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static struct kmem_cache *nfs_direct_cachep; + +/* + * This represents a set of asynchronous requests that we're waiting on + */ +struct nfs_direct_req { + struct kref kref; /* release manager */ + + /* I/O parameters */ + struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ + struct kiocb * iocb; /* controlling i/o request */ + struct inode * inode; /* target file of i/o */ + + /* completion state */ + atomic_t io_count; /* i/os we're waiting for */ + spinlock_t lock; /* protect completion state */ + ssize_t count, /* bytes actually processed */ + error; /* any reported error */ + struct completion completion; /* wait for i/o completion */ + + /* commit state */ + struct list_head rewrite_list; /* saved nfs_write_data structs */ + struct nfs_write_data * commit_data; /* special write_data for commits */ + int flags; +#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ +#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + struct nfs_writeverf verf; /* unstable write verifier */ +}; + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); +static const struct rpc_call_ops nfs_write_direct_ops; + +static inline void get_dreq(struct nfs_direct_req *dreq) +{ + atomic_inc(&dreq->io_count); +} + +static inline int put_dreq(struct nfs_direct_req *dreq) +{ + return atomic_dec_and_test(&dreq->io_count); +} + +/** + * nfs_direct_IO - NFS address space operation for direct I/O + * @rw: direction (read or write) + * @iocb: target I/O control block + * @iov: array of vectors that define I/O buffer + * @pos: offset in file to begin the operation + * @nr_segs: size of iovec array + * + * The presence of this routine in the address space ops vector means + * the NFS client supports direct I/O. However, we shunt off direct + * read and write requests before the VFS gets them, so this method + * should never be called. 
+ */ +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +{ + dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long) pos, nr_segs); + + return -EINVAL; +} + +static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count) +{ + unsigned int npages; + unsigned int i; + + if (count == 0) + return; + pages += (pgbase >> PAGE_SHIFT); + npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + struct page *page = pages[i]; + if (!PageCompound(page)) + set_page_dirty(page); + } +} + +static void nfs_direct_release_pages(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + page_cache_release(pages[i]); +} + +static inline struct nfs_direct_req *nfs_direct_req_alloc(void) +{ + struct nfs_direct_req *dreq; + + dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL); + if (!dreq) + return NULL; + + kref_init(&dreq->kref); + kref_get(&dreq->kref); + init_completion(&dreq->completion); + INIT_LIST_HEAD(&dreq->rewrite_list); + dreq->iocb = NULL; + dreq->ctx = NULL; + dreq->l_ctx = NULL; + spin_lock_init(&dreq->lock); + atomic_set(&dreq->io_count, 0); + dreq->count = 0; + dreq->error = 0; + dreq->flags = 0; + + return dreq; +} + +static void nfs_direct_req_free(struct kref *kref) +{ + struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + + if (dreq->l_ctx != NULL) + nfs_put_lock_context(dreq->l_ctx); + if (dreq->ctx != NULL) + put_nfs_open_context(dreq->ctx); + kmem_cache_free(nfs_direct_cachep, dreq); +} + +static void nfs_direct_req_release(struct nfs_direct_req *dreq) +{ + kref_put(&dreq->kref, nfs_direct_req_free); +} + +/* + * Collects and returns the final error value/byte-count. + */ +static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) +{ + ssize_t result = -EIOCBQUEUED; + + /* Async requests don't wait here */ + if (dreq->iocb) + goto out; + + result = wait_for_completion_killable(&dreq->completion); + + if (!result) + result = dreq->error; + if (!result) + result = dreq->count; + +out: + return (ssize_t) result; +} + +/* + * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust + * the iocb is still valid here if this is a synchronous request. + */ +static void nfs_direct_complete(struct nfs_direct_req *dreq) +{ + if (dreq->iocb) { + long res = (long) dreq->error; + if (!res) + res = (long) dreq->count; + aio_complete(dreq->iocb, res, 0); + } + complete_all(&dreq->completion); + + nfs_direct_req_release(dreq); +} + +/* + * We must hold a reference to all the pages in this direct read request + * until the RPCs complete. This could be long *after* we are woken up in + * nfs_direct_wait (for instance, if someone hits ^C on a slow server). 
+ */ +static void nfs_direct_read_result(struct rpc_task *task, void *calldata) +{ + struct nfs_read_data *data = calldata; + + nfs_readpage_result(task, data); +} + +static void nfs_direct_read_release(void *calldata) +{ + + struct nfs_read_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + spin_lock(&dreq->lock); + if (unlikely(status < 0)) { + dreq->error = status; + spin_unlock(&dreq->lock); + } else { + dreq->count += data->res.count; + spin_unlock(&dreq->lock); + nfs_direct_dirty_pages(data->pagevec, + data->args.pgbase, + data->res.count); + } + nfs_direct_release_pages(data->pagevec, data->npages); + + if (put_dreq(dreq)) + nfs_direct_complete(dreq); + nfs_readdata_free(data); +} + +static const struct rpc_call_ops nfs_read_direct_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_read_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_direct_read_result, + .rpc_release = nfs_direct_read_release, +}; + +/* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. If nfs_readdata_alloc() or get_user_pages() fails, + * bail and stop sending more reads. Read length accounting is + * handled automatically by nfs_direct_read_result(). Otherwise, if + * no requests have been sent, just return an error. + */ +static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, + const struct iovec *iov, + loff_t pos) +{ + struct nfs_open_context *ctx = dreq->ctx; + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, + .callback_ops = &nfs_read_direct_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + unsigned int pgbase; + int result; + ssize_t started = 0; + + do { + struct nfs_read_data *data; + size_t bytes; + + pgbase = user_addr & ~PAGE_MASK; + bytes = min(rsize,count); + + result = -ENOMEM; + data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes)); + if (unlikely(!data)) + break; + + down_read(&current->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + data->npages, 1, 0, data->pagevec, NULL); + up_read(&current->mm->mmap_sem); + if (result < 0) { + nfs_readdata_free(data); + break; + } + if ((unsigned)result < data->npages) { + bytes = result * PAGE_SIZE; + if (bytes <= pgbase) { + nfs_direct_release_pages(data->pagevec, result); + nfs_readdata_free(data); + break; + } + bytes -= pgbase; + data->npages = result; + } + + get_dreq(dreq); + + data->req = (struct nfs_page *) dreq; + data->inode = inode; + data->cred = msg.rpc_cred; + data->args.fh = NFS_FH(inode); + data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; + data->args.offset = pos; + data->args.pgbase = pgbase; + data->args.pages = data->pagevec; + data->args.count = bytes; + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; + nfs_fattr_init(&data->fattr); + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + + task_setup_data.task = &data->task; + task_setup_data.callback_data = data; + NFS_PROTO(inode)->read_setup(data, &msg); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + break; + rpc_put_task(task); + + dprintk("NFS: %5u initiated direct read call " + "(req %s/%Ld, 
%zu bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + bytes, + (unsigned long long)data->args.offset); + + started += bytes; + user_addr += bytes; + pos += bytes; + /* FIXME: Remove this unnecessary math from final patch */ + pgbase += bytes; + pgbase &= ~PAGE_MASK; + BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); + + count -= bytes; + } while (count != 0); + + if (started) + return started; + return result < 0 ? (ssize_t) result : -EFAULT; +} + +static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + ssize_t result = -EINVAL; + size_t requested_bytes = 0; + unsigned long seg; + + get_dreq(dreq); + + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *vec = &iov[seg]; + result = nfs_direct_read_schedule_segment(dreq, vec, pos); + if (result < 0) + break; + requested_bytes += result; + if ((size_t)result < vec->iov_len) + break; + pos += vec->iov_len; + } + + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + + if (put_dreq(dreq)) + nfs_direct_complete(dreq); + return 0; +} + +static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t result = -ENOMEM; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct nfs_direct_req *dreq; + + dreq = nfs_direct_req_alloc(); + if (dreq == NULL) + goto out; + + dreq->inode = inode; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); + if (!result) + result = nfs_direct_wait(dreq); +out_release: + nfs_direct_req_release(dreq); +out: + return result; +} + +static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) +{ + while (!list_empty(&dreq->rewrite_list)) { + struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); + list_del(&data->pages); + nfs_direct_release_pages(data->pagevec, data->npages); + nfs_writedata_free(data); + } +} + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) +{ + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, + .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + + dreq->count = 0; + get_dreq(dreq); + + list_for_each(p, &dreq->rewrite_list) { + data = list_entry(p, struct nfs_write_data, pages); + + get_dreq(dreq); + + /* Use stable writes */ + data->args.stable = NFS_FILE_SYNC; + + /* + * Reset data->res. + */ + nfs_fattr_init(&data->fattr); + data->res.count = data->args.count; + memset(&data->verf, 0, sizeof(data->verf)); + + /* + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ + task_setup_data.task = &data->task; + task_setup_data.callback_data = data; + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + NFS_PROTO(inode)->write_setup(data, &msg); + + /* + * We're called via an RPC callback, so BKL is already held. + */ + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task(task); + + dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + } + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, inode); +} + +static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + /* Call the NFS version-specific code */ + NFS_PROTO(data->inode)->commit_done(task, data); +} + +static void nfs_direct_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + if (status < 0) { + dprintk("NFS: %5u commit failed with error %d.\n", + data->task.tk_pid, status); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { + dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } + + dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); + nfs_direct_write_complete(dreq, data->inode); + nfs_commit_free(data); +} + +static const struct rpc_call_ops nfs_commit_direct_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_direct_commit_result, + .rpc_release = nfs_direct_commit_release, +}; + +static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) +{ + struct nfs_write_data *data = dreq->commit_data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = dreq->ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(dreq->inode), + .rpc_message = &msg, + .callback_ops = &nfs_commit_direct_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + + data->inode = dreq->inode; + data->cred = msg.rpc_cred; + + data->args.fh = NFS_FH(data->inode); + data->args.offset = 0; + data->args.count = 0; + data->args.context = dreq->ctx; + data->args.lock_context = dreq->l_ctx; + data->res.count = 0; + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + + NFS_PROTO(data->inode)->commit_setup(data, &msg); + + /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ + dreq->commit_data = NULL; + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task(task); +} + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +{ + int flags = dreq->flags; + + dreq->flags = 0; + switch (flags) { + case NFS_ODIRECT_DO_COMMIT: + nfs_direct_commit_schedule(dreq); + break; + case NFS_ODIRECT_RESCHED_WRITES: + nfs_direct_write_reschedule(dreq); + break; + default: + if (dreq->commit_data != NULL) + nfs_commit_free(dreq->commit_data); + nfs_direct_free_writedata(dreq); + nfs_zap_mapping(inode, inode->i_mapping); + nfs_direct_complete(dreq); + } +} + +static void 
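/*
 * Illustrative sketch, not part of the patch: nfs_direct_commit_release()
 * above decides between "data is safe" and "reschedule the writes" by
 * comparing the verifier saved from the UNSTABLE write replies with the one
 * returned by COMMIT.  The fragment below isolates that decision; the
 * write_verifier type, check_commit() and the enum are simplified stand-ins
 * made up for the example.
 */
#include <string.h>

struct write_verifier {
	unsigned char data[8];	/* opaque cookie; changes when the server reboots */
};

enum commit_result { COMMIT_DONE, COMMIT_RESCHED_WRITES };

static enum commit_result check_commit(const struct write_verifier *saved,
					const struct write_verifier *from_commit,
					int commit_status)
{
	/* a failed COMMIT or a changed verifier means the data may be lost */
	if (commit_status < 0 ||
	    memcmp(saved->data, from_commit->data, sizeof(saved->data)) != 0)
		return COMMIT_RESCHED_WRITES;
	return COMMIT_DONE;
}

int main(void)
{
	struct write_verifier a = { { 1, 2, 3, 4, 5, 6, 7, 8 } };
	struct write_verifier b = a;

	return check_commit(&a, &b, 0) == COMMIT_DONE ? 0 : 1;
}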
nfs_alloc_commit_data(struct nfs_direct_req *dreq) +{ + dreq->commit_data = nfs_commitdata_alloc(); + if (dreq->commit_data != NULL) + dreq->commit_data->req = (struct nfs_page *) dreq; +} +#else +static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +{ + dreq->commit_data = NULL; +} + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +{ + nfs_direct_free_writedata(dreq); + nfs_zap_mapping(inode, inode->i_mapping); + nfs_direct_complete(dreq); +} +#endif + +static void nfs_direct_write_result(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_writeback_done(task, data); +} + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ +static void nfs_direct_write_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + spin_lock(&dreq->lock); + + if (unlikely(status < 0)) { + /* An error has occurred, so we should not commit */ + dreq->flags = 0; + dreq->error = status; + } + if (unlikely(dreq->error != 0)) + goto out_unlock; + + dreq->count += data->res.count; + + if (data->res.verf->committed != NFS_FILE_SYNC) { + switch (dreq->flags) { + case 0: + memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf)); + dreq->flags = NFS_ODIRECT_DO_COMMIT; + break; + case NFS_ODIRECT_DO_COMMIT: + if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { + dprintk("NFS: %5u write verify failed\n", data->task.tk_pid); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } + } + } +out_unlock: + spin_unlock(&dreq->lock); + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, data->inode); +} + +static const struct rpc_call_ops nfs_write_direct_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_direct_write_result, + .rpc_release = nfs_direct_write_release, +}; + +/* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. If nfs_writedata_alloc() or get_user_pages() fails, + * bail and stop sending more writes. Write length accounting is + * handled automatically by nfs_direct_write_result(). Otherwise, if + * no requests have been sent, just return an error. 
+ */ +static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, + const struct iovec *iov, + loff_t pos, int sync) +{ + struct nfs_open_context *ctx = dreq->ctx; + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, + .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + size_t wsize = NFS_SERVER(inode)->wsize; + unsigned int pgbase; + int result; + ssize_t started = 0; + + do { + struct nfs_write_data *data; + size_t bytes; + + pgbase = user_addr & ~PAGE_MASK; + bytes = min(wsize,count); + + result = -ENOMEM; + data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes)); + if (unlikely(!data)) + break; + + down_read(&current->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + data->npages, 0, 0, data->pagevec, NULL); + up_read(&current->mm->mmap_sem); + if (result < 0) { + nfs_writedata_free(data); + break; + } + if ((unsigned)result < data->npages) { + bytes = result * PAGE_SIZE; + if (bytes <= pgbase) { + nfs_direct_release_pages(data->pagevec, result); + nfs_writedata_free(data); + break; + } + bytes -= pgbase; + data->npages = result; + } + + get_dreq(dreq); + + list_move_tail(&data->pages, &dreq->rewrite_list); + + data->req = (struct nfs_page *) dreq; + data->inode = inode; + data->cred = msg.rpc_cred; + data->args.fh = NFS_FH(inode); + data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; + data->args.offset = pos; + data->args.pgbase = pgbase; + data->args.pages = data->pagevec; + data->args.count = bytes; + data->args.stable = sync; + data->res.fattr = &data->fattr; + data->res.count = bytes; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + + task_setup_data.task = &data->task; + task_setup_data.callback_data = data; + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + NFS_PROTO(inode)->write_setup(data, &msg); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + break; + rpc_put_task(task); + + dprintk("NFS: %5u initiated direct write call " + "(req %s/%Ld, %zu bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + bytes, + (unsigned long long)data->args.offset); + + started += bytes; + user_addr += bytes; + pos += bytes; + + /* FIXME: Remove this useless math from the final patch */ + pgbase += bytes; + pgbase &= ~PAGE_MASK; + BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); + + count -= bytes; + } while (count != 0); + + if (started) + return started; + return result < 0 ? (ssize_t) result : -EFAULT; +} + +static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos, int sync) +{ + ssize_t result = 0; + size_t requested_bytes = 0; + unsigned long seg; + + get_dreq(dreq); + + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *vec = &iov[seg]; + result = nfs_direct_write_schedule_segment(dreq, vec, + pos, sync); + if (result < 0) + break; + requested_bytes += result; + if ((size_t)result < vec->iov_len) + break; + pos += vec->iov_len; + } + + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? 
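/*
 * Illustrative sketch, not part of the patch: nfs_direct_write_schedule_segment()
 * above carves each iovec into wsize'd chunks and sizes the page array that
 * get_user_pages() must pin, where the offset into the first page counts
 * against the total.  The standalone program below reproduces only that
 * arithmetic; PAGE_SIZE, page_array_len() and the sample numbers are
 * assumptions made for the example.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* pages needed for a chunk starting 'pgbase' bytes into a page, 'bytes' long */
static unsigned long page_array_len(unsigned long pgbase, unsigned long bytes)
{
	return (pgbase + bytes + PAGE_SIZE - 1) / PAGE_SIZE;
}

int main(void)
{
	unsigned long user_addr = 0x7f0000001234UL;	/* arbitrary, unaligned */
	unsigned long count = 100000;			/* bytes left in this iovec */
	unsigned long wsize = 32768;			/* server's preferred write size */

	while (count != 0) {
		unsigned long pgbase = user_addr & ~PAGE_MASK;
		unsigned long bytes = count < wsize ? count : wsize;

		printf("chunk: %6lu bytes, pgbase %4lu, %lu pages pinned\n",
		       bytes, pgbase, page_array_len(pgbase, bytes));
		user_addr += bytes;
		count -= bytes;
	}
	return 0;
}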
result : -EIO; + } + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, dreq->inode); + return 0; +} + +static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, + size_t count) +{ + ssize_t result = -ENOMEM; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct nfs_direct_req *dreq; + size_t wsize = NFS_SERVER(inode)->wsize; + int sync = NFS_UNSTABLE; + + dreq = nfs_direct_req_alloc(); + if (!dreq) + goto out; + nfs_alloc_commit_data(dreq); + + if (dreq->commit_data == NULL || count <= wsize) + sync = NFS_FILE_SYNC; + + dreq->inode = inode; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); + if (!result) + result = nfs_direct_wait(dreq); +out_release: + nfs_direct_req_release(dreq); +out: + return result; +} + +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iov: vector of user buffers into which to read data + * @nr_segs: size of iov vector + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t retval = -EINVAL; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count; + + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + count, (long long) pos); + + retval = 0; + if (!count) + goto out; + + retval = nfs_sync_mapping(mapping); + if (retval) + goto out; + + task_io_account_read(count); + + retval = nfs_direct_read(iocb, iov, nr_segs, pos); + if (retval > 0) + iocb->ki_pos = pos + retval; + +out: + return retval; +} + +/** + * nfs_file_direct_write - file direct write operation for NFS files + * @iocb: target I/O control block + * @iov: vector of user buffers from which to write data + * @nr_segs: size of iov vector + * @pos: byte offset in file where writing starts + * + * We use this function for direct writes instead of calling + * generic_file_aio_write() in order to avoid taking the inode + * semaphore and updating the i_size. The NFS server will set + * the new i_size and this client must read the updated size + * back into its cache. We let the server do generic write + * parameter checking and report problems. + * + * We eliminate local atime updates, see direct read above. 
+ * + * We avoid unnecessary page cache invalidations for normal cached + * readers of this file. + * + * Note that O_APPEND is not supported for NFS direct writes, as there + * is no atomic O_APPEND write facility in the NFS protocol. + */ +ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t retval = -EINVAL; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count; + + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); + + dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + count, (long long) pos); + + retval = generic_write_checks(file, &pos, &count, 0); + if (retval) + goto out; + + retval = -EINVAL; + if ((ssize_t) count < 0) + goto out; + retval = 0; + if (!count) + goto out; + + retval = nfs_sync_mapping(mapping); + if (retval) + goto out; + + task_io_account_write(count); + + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); + + if (retval > 0) + iocb->ki_pos = pos + retval; + +out: + return retval; +} + +/** + * nfs_init_directcache - create a slab cache for nfs_direct_req structures + * + */ +int __init nfs_init_directcache(void) +{ + nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", + sizeof(struct nfs_direct_req), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + NULL); + if (nfs_direct_cachep == NULL) + return -ENOMEM; + + return 0; +} + +/** + * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures + * + */ +void nfs_destroy_directcache(void) +{ + kmem_cache_destroy(nfs_direct_cachep); +} diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c new file mode 100644 index 00000000..a6e711ad --- /dev/null +++ b/fs/nfs/dns_resolve.c @@ -0,0 +1,372 @@ +/* + * linux/fs/nfs/dns_resolve.c + * + * Copyright (c) 2009 Trond Myklebust + * + * Resolves DNS hostnames into valid ip addresses + */ + +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include +#include + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + char *ip_addr = NULL; + int ip_len; + + ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL); + if (ip_len > 0) + ret = rpc_pton(ip_addr, ip_len, sa, salen); + else + ret = -ESRCH; + kfree(ip_addr); + return ret; +} + +#else + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dns_resolve.h" +#include "cache_lib.h" + +#define NFS_DNS_HASHBITS 4 +#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) + +static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; + +struct nfs_dns_ent { + struct cache_head h; + + char *hostname; + size_t namelen; + + struct sockaddr_storage addr; + size_t addrlen; +}; + + +static void nfs_dns_ent_update(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; +} + +static void nfs_dns_ent_init(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + kfree(new->hostname); + new->hostname = kstrndup(key->hostname, key->namelen, 
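/*
 * Illustrative sketch, not part of the patch: nfs_dns_resolve_name() above
 * turns a hostname into a ready-to-use sockaddr, returning the address
 * length on success and -ESRCH when the name does not resolve.  The
 * userspace helper below mirrors that contract with getaddrinfo() instead
 * of the kernel DNS resolver or the sunrpc cache upcall; resolve_name() is
 * a name invented for the example.
 */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>

/* returns the address length on success, -1 on any failure */
static ssize_t resolve_name(const char *name, struct sockaddr *sa, size_t salen)
{
	struct addrinfo hints = { .ai_family = AF_UNSPEC };
	struct addrinfo *res;
	ssize_t ret = -1;

	if (getaddrinfo(name, NULL, &hints, &res) != 0)
		return -1;			/* analogous to -ESRCH */
	if ((size_t)res->ai_addrlen <= salen) {
		memcpy(sa, res->ai_addr, res->ai_addrlen);
		ret = res->ai_addrlen;		/* caller's buffer was big enough */
	}
	freeaddrinfo(res);
	return ret;
}

int main(void)
{
	struct sockaddr_storage ss;

	return resolve_name("localhost", (struct sockaddr *)&ss, sizeof(ss)) > 0 ? 0 : 1;
}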
GFP_KERNEL); + if (new->hostname) { + new->namelen = key->namelen; + nfs_dns_ent_update(cnew, ckey); + } else { + new->namelen = 0; + new->addrlen = 0; + } +} + +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + kfree(item->hostname); + kfree(item); +} + +static struct cache_head *nfs_dns_ent_alloc(void) +{ + struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); + + if (item != NULL) { + item->hostname = NULL; + item->namelen = 0; + item->addrlen = 0; + return &item->h; + } + return NULL; +}; + +static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) +{ + return hash_str(key->hostname, NFS_DNS_HASHBITS); +} + +static void nfs_dns_request(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + + qword_add(bpp, blen, key->hostname); + (*bpp)[-1] = '\n'; +} + +static int nfs_dns_upcall(struct cache_detail *cd, + struct cache_head *ch) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + int ret; + + ret = nfs_cache_upcall(cd, key->hostname); + if (ret) + ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); + return ret; +} + +static int nfs_dns_match(struct cache_head *ca, + struct cache_head *cb) +{ + struct nfs_dns_ent *a; + struct nfs_dns_ent *b; + + a = container_of(ca, struct nfs_dns_ent, h); + b = container_of(cb, struct nfs_dns_ent, h); + + if (a->namelen == 0 || a->namelen != b->namelen) + return 0; + return memcmp(a->hostname, b->hostname, a->namelen) == 0; +} + +static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) +{ + struct nfs_dns_ent *item; + long ttl; + + if (h == NULL) { + seq_puts(m, "# ip address hostname ttl\n"); + return 0; + } + item = container_of(h, struct nfs_dns_ent, h); + ttl = item->h.expiry_time - seconds_since_boot(); + if (ttl < 0) + ttl = 0; + + if (!test_bit(CACHE_NEGATIVE, &h->flags)) { + char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; + + rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); + seq_printf(m, "%15s ", buf); + } else + seq_puts(m, " "); + seq_printf(m, "%15s %ld\n", item->hostname, ttl); + return 0; +} + +static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_lookup(cd, + &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, + struct nfs_dns_ent *new, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_update(cd, + &new->h, &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) +{ + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; + unsigned long ttl; + ssize_t len; + int ret = -EINVAL; + + if (buf[buflen-1] != '\n') + goto out; + buf[buflen-1] = '\0'; + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + key.addrlen = rpc_pton(buf1, len, + (struct sockaddr *)&key.addr, + sizeof(key.addr)); + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + + key.hostname = buf1; + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + + ttl = get_expiry(&buf); + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + seconds_since_boot(); + + ret = -ENOMEM; + item = 
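/*
 * Illustrative sketch, not part of the patch: nfs_dns_parse() here consumes
 * one "<ip address> <hostname> <expiry>" line that the userspace daemon
 * writes into the cache channel, and nfs_dns_show() prints the same layout
 * back out.  The toy parser below shows that word layout with sscanf(); it
 * ignores the %-escaping that qword_get()/qword_add() handle, and the buffer
 * sizes simply follow NFS_DNS_HOSTNAME_MAXLEN.
 */
#include <stdio.h>

int main(void)
{
	const char *line = "192.0.2.7 server.example.net 1700000000\n";
	char ip[48], host[129];
	unsigned long expiry;

	if (sscanf(line, "%47s %128s %lu", ip, host, &expiry) == 3)
		printf("host %s -> %s (valid until %lu)\n", host, ip, expiry);
	return 0;
}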
nfs_dns_lookup(cd, &key); + if (item == NULL) + goto out; + + if (key.addrlen == 0) + set_bit(CACHE_NEGATIVE, &key.h.flags); + + item = nfs_dns_update(cd, &key, item); + if (item == NULL) + goto out; + + ret = 0; + cache_put(&item->h, cd); +out: + return ret; +} + +static struct cache_detail nfs_dns_resolve = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .hash_table = nfs_dns_table, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_update, + .alloc = nfs_dns_ent_alloc, +}; + +static int do_cache_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item, + struct nfs_cache_defer_req *dreq) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (*item) { + ret = cache_check(cd, &(*item)->h, &dreq->req); + if (ret) + *item = NULL; + } + return ret; +} + +static int do_cache_lookup_nowait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (!*item) + goto out_err; + ret = -ETIMEDOUT; + if (!test_bit(CACHE_VALID, &(*item)->h.flags) + || (*item)->h.expiry_time < seconds_since_boot() + || cd->flush_time > (*item)->h.last_refresh) + goto out_put; + ret = -ENOENT; + if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) + goto out_put; + return 0; +out_put: + cache_put(&(*item)->h, cd); +out_err: + *item = NULL; + return ret; +} + +static int do_cache_lookup_wait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + struct nfs_cache_defer_req *dreq; + int ret = -ENOMEM; + + dreq = nfs_cache_defer_req_alloc(); + if (!dreq) + goto out; + ret = do_cache_lookup(cd, key, item, dreq); + if (ret == -EAGAIN) { + ret = nfs_cache_wait_for_upcall(dreq); + if (!ret) + ret = do_cache_lookup_nowait(cd, key, item); + } + nfs_cache_defer_req_put(dreq); +out: + return ret; +} + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + struct nfs_dns_ent key = { + .hostname = name, + .namelen = namelen, + }; + struct nfs_dns_ent *item = NULL; + ssize_t ret; + + ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); + if (ret == 0) { + if (salen >= item->addrlen) { + memcpy(sa, &item->addr, item->addrlen); + ret = item->addrlen; + } else + ret = -EOVERFLOW; + cache_put(&item->h, &nfs_dns_resolve); + } else if (ret == -ENOENT) + ret = -ESRCH; + return ret; +} + +int nfs_dns_resolver_init(void) +{ + return nfs_cache_register(&nfs_dns_resolve); +} + +void nfs_dns_resolver_destroy(void) +{ + nfs_cache_unregister(&nfs_dns_resolve); +} + +#endif diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h new file mode 100644 index 00000000..199bb554 --- /dev/null +++ b/fs/nfs/dns_resolve.h @@ -0,0 +1,26 @@ +/* + * Resolve DNS hostnames into valid ip addresses + */ +#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H +#define __LINUX_FS_NFS_DNS_RESOLVE_H + +#define NFS_DNS_HOSTNAME_MAXLEN (128) + + +#ifdef CONFIG_NFS_USE_KERNEL_DNS +static inline int nfs_dns_resolver_init(void) +{ + return 0; +} + +static inline void nfs_dns_resolver_destroy(void) +{} +#else +extern int nfs_dns_resolver_init(void); +extern void nfs_dns_resolver_destroy(void); +#endif + +extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen); + +#endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c new file mode 100644 index 
00000000..dd2f1307 --- /dev/null +++ b/fs/nfs/file.c @@ -0,0 +1,921 @@ +/* + * linux/fs/nfs/file.c + * + * Copyright (C) 1992 Rick Sladkey + * + * Changes Copyright (C) 1994 by Florian La Roche + * - Do not copy data too often around in the kernel. + * - In nfs_file_read the return value of kmalloc wasn't checked. + * - Put in a better version of read look-ahead buffering. Original idea + * and implementation by Wai S Kok elekokws@ee.nus.sg. + * + * Expire cache on write to a file by Wai S Kok (Oct 1994). + * + * Total rewrite of read side for new NFS buffer cache.. Linus. + * + * nfs regular file handling functions + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_FILE + +static int nfs_file_open(struct inode *, struct file *); +static int nfs_file_release(struct inode *, struct file *); +static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); +static int nfs_file_mmap(struct file *, struct vm_area_struct *); +static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, unsigned int flags); +static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, + struct file *filp, loff_t *ppos, + size_t count, unsigned int flags); +static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +static int nfs_file_flush(struct file *, fl_owner_t id); +static int nfs_file_fsync(struct file *, int datasync); +static int nfs_check_flags(int flags); +static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); +static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); +static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); + +static const struct vm_operations_struct nfs_file_vm_ops; + +const struct file_operations nfs_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; + +const struct inode_operations nfs_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +#ifdef CONFIG_NFS_V3 +const struct inode_operations nfs3_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; +#endif /* CONFIG_NFS_v3 */ + +/* Hack for future NFS swap support */ +#ifndef IS_SWAPFILE +# define IS_SWAPFILE(inode) (0) +#endif + +static int nfs_check_flags(int flags) +{ + if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) + return -EINVAL; + + return 0; +} + +/* + * Open file + */ +static int +nfs_file_open(struct inode *inode, struct file *filp) +{ + int res; + + dprintk("NFS: open file(%s/%s)\n", + 
filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); + res = nfs_check_flags(filp->f_flags); + if (res) + return res; + + res = nfs_open(inode, filp); + return res; +} + +static int +nfs_file_release(struct inode *inode, struct file *filp) +{ + struct dentry *dentry = filp->f_path.dentry; + + dprintk("NFS: release(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSRELEASE); + return nfs_release(inode, filp); +} + +/** + * nfs_revalidate_size - Revalidate the file size + * @inode - pointer to inode struct + * @file - pointer to struct file + * + * Revalidates the file length. This is basically a wrapper around + * nfs_revalidate_inode() that takes into account the fact that we may + * have cached writes (in which case we don't care about the server's + * idea of what the file length is), or O_DIRECT (in which case we + * shouldn't trust the cache). + */ +static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfs_have_delegated_attributes(inode)) + goto out_noreval; + + if (filp->f_flags & O_DIRECT) + goto force_reval; + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + goto force_reval; + if (nfs_attribute_timeout(inode)) + goto force_reval; +out_noreval: + return 0; +force_reval: + return __nfs_revalidate_inode(server, inode); +} + +static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) +{ + loff_t loff; + + dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + offset, origin); + + /* origin == SEEK_END => we must revalidate the cached file length */ + if (origin == SEEK_END) { + struct inode *inode = filp->f_mapping->host; + + int retval = nfs_revalidate_file_size(inode, filp); + if (retval < 0) + return (loff_t)retval; + + spin_lock(&inode->i_lock); + loff = generic_file_llseek_unlocked(filp, offset, origin); + spin_unlock(&inode->i_lock); + } else + loff = generic_file_llseek_unlocked(filp, offset, origin); + return loff; +} + +/* + * Flush all dirty pages, and check for write errors. 
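+ * Only files opened for writing have anything to do here: the flush is + * implemented with vfs_fsync(), so any write or commit error is reported + * back to the caller of close().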
+ */ +static int +nfs_file_flush(struct file *file, fl_owner_t id) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + + dprintk("NFS: flush(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; + + /* Flush writes to the server and return any errors */ + return vfs_fsync(file, 0); +} + +static ssize_t +nfs_file_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct dentry * dentry = iocb->ki_filp->f_path.dentry; + struct inode * inode = dentry->d_inode; + ssize_t result; + size_t count = iov_length(iov, nr_segs); + + if (iocb->ki_filp->f_flags & O_DIRECT) + return nfs_file_direct_read(iocb, iov, nr_segs, pos); + + dprintk("NFS: read(%s/%s, %lu@%lu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (unsigned long) pos); + + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); + if (!result) { + result = generic_file_aio_read(iocb, iov, nr_segs, pos); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); + } + return result; +} + +static ssize_t +nfs_file_splice_read(struct file *filp, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + ssize_t res; + + dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (unsigned long long) *ppos); + + res = nfs_revalidate_mapping(inode, filp->f_mapping); + if (!res) { + res = generic_file_splice_read(filp, ppos, pipe, count, flags); + if (res > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); + } + return res; +} + +static int +nfs_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + int status; + + dprintk("NFS: mmap(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* Note: generic_file_mmap() returns ENOSYS on nommu systems + * so we call that before revalidating the mapping + */ + status = generic_file_mmap(file, vma); + if (!status) { + vma->vm_ops = &nfs_file_vm_ops; + status = nfs_revalidate_mapping(inode, file->f_mapping); + } + return status; +} + +/* + * Flush any dirty pages for this process, and check for write errors. + * The return status from this call provides a reliable indication of + * whether any write errors occurred for this process. + * + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to + * disk, but it retrieves and clears ctx->error after synching, despite + * the two being set at the same time in nfs_context_set_write_error(). + * This is because the former is used to notify the _next_ call to + * nfs_file_write() that a write error occurred, and hence cause it to + * fall back to doing a synchronous write. 
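+ * + * For a full fsync (datasync == 0) any outstanding pNFS layout is also + * committed, so that pNFS metadata updates reach the server as well.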
+ */ +static int +nfs_file_fsync(struct file *file, int datasync) +{ + struct dentry *dentry = file->f_path.dentry; + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = dentry->d_inode; + int have_error, status; + int ret = 0; + + + dprintk("NFS: fsync file(%s/%s) datasync %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); + + nfs_inc_stats(inode, NFSIOS_VFSFSYNC); + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + status = nfs_commit_inode(inode, FLUSH_SYNC); + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (have_error) + ret = xchg(&ctx->error, 0); + if (!ret && status < 0) + ret = status; + if (!ret && !datasync) + /* application has asked for meta-data sync */ + ret = pnfs_layoutcommit_inode(inode, true); + return ret; +} + +/* + * Decide whether a read/modify/write cycle may be more efficient + * than a modify/write/read cycle when writing to a page in the + * page cache. + * + * The modify/write/read cycle may occur if a page is read before + * being completely filled by the writer. In this situation, the + * page must be completely written to stable storage on the server + * before it can be refilled by reading in the page from the server. + * This can lead to expensive, small, FILE_SYNC mode writes being + * done. + * + * It may be more efficient to read the page first if the file is + * open for reading in addition to writing, the page is not marked + * as Uptodate, it is not dirty or waiting to be committed, + * indicating that it was previously allocated and then modified, + * that there were valid bytes of data in that range of the file, + * and that the new data won't completely replace the old data in + * that range of the file. + */ +static int nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned len) +{ + unsigned int pglen = nfs_page_length(page); + unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int end = offset + len; + + if ((file->f_mode & FMODE_READ) && /* open for read? */ + !PageUptodate(page) && /* Uptodate? */ + !PagePrivate(page) && /* i/o request already? */ + pglen && /* valid bytes of file? */ + (end < pglen || offset)) /* replace all valid bytes? */ + return 1; + return 0; +} + +/* + * This does the "real" work of the write. We must allocate and lock the + * page to be sent back to the generic routine, which then copies the + * data from user space. + * + * If the writer ends up delaying the write, the writer needs to + * increment the page use counts until he is done with the page. 
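+ * + * As an illustrative example (numbers assumed): for a 4096-byte page that + * already holds valid data, a 100-byte write at page offset 200 from a file + * that is also open for reading leaves most of the page untouched, so + * nfs_want_read_modify_write() asks for the page to be read in first; a + * write that covers every valid byte of the page skips the read.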
+ */ +static int nfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + +start: + /* + * Prevent starvation issues if someone is doing a consistency + * sync-to-disk + */ + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + return ret; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + ret = nfs_flush_incompatible(file, page); + if (ret) { + unlock_page(page); + page_cache_release(page); + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; + ret = nfs_readpage(file, page); + page_cache_release(page); + if (!ret) + goto start; + } + return ret; +} + +static int nfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + + /* + * Zero any uninitialised parts of the page, and then mark the page + * as up to date if it turns out that we're extending the file. + */ + if (!PageUptodate(page)) { + unsigned pglen = nfs_page_length(page); + unsigned end = offset + len; + + if (pglen == 0) { + zero_user_segments(page, 0, offset, + end, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else if (end >= pglen) { + zero_user_segment(page, end, PAGE_CACHE_SIZE); + if (offset == 0) + SetPageUptodate(page); + } else + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + + status = nfs_updatepage(file, page, offset, copied); + + unlock_page(page); + page_cache_release(page); + + if (status < 0) + return status; + return copied; +} + +/* + * Partially or wholly invalidate a page + * - Release the private state associated with a page if undergoing complete + * page invalidation + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + */ +static void nfs_invalidate_page(struct page *page, unsigned long offset) +{ + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); + + if (offset != 0) + return; + /* Cancel any unstarted writes on this page */ + nfs_wb_page_cancel(page->mapping->host, page); + + nfs_fscache_invalidate_page(page, page->mapping->host); +} + +/* + * Attempt to release the private state associated with a page + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + * - Return true (may release page) or false (may not) + */ +static int nfs_release_page(struct page *page, gfp_t gfp) +{ + struct address_space *mapping = page->mapping; + + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + + /* Only do I/O if gfp is a superset of GFP_KERNEL */ + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ + if (current_is_kswapd()) + how = 0; + nfs_commit_inode(mapping->host, how); + } + /* If PagePrivate() is set, then the 
page is not freeable */ + if (PagePrivate(page)) + return 0; + return nfs_fscache_release_page(page, gfp); +} + +/* + * Attempt to clear the private state associated with a page when an error + * occurs that requires the cached contents of an inode to be written back or + * destroyed + * - Called if either PG_private or fscache is set on the page + * - Caller holds page lock + * - Return 0 if successful, -error otherwise + */ +static int nfs_launder_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", + inode->i_ino, (long long)page_offset(page)); + + nfs_fscache_wait_on_page_write(nfsi, page); + return nfs_wb_page(inode, page); +} + +const struct address_space_operations nfs_file_aops = { + .readpage = nfs_readpage, + .readpages = nfs_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = nfs_writepage, + .writepages = nfs_writepages, + .write_begin = nfs_write_begin, + .write_end = nfs_write_end, + .invalidatepage = nfs_invalidate_page, + .releasepage = nfs_release_page, + .direct_IO = nfs_direct_IO, + .migratepage = nfs_migrate_page, + .launder_page = nfs_launder_page, + .error_remove_page = generic_error_remove_page, +}; + +/* + * Notification that a PTE pointing to an NFS page is about to be made + * writable, implying that someone is about to modify the page through a + * shared-writable mapping + */ +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct file *filp = vma->vm_file; + struct dentry *dentry = filp->f_path.dentry; + unsigned pagelen; + int ret = VM_FAULT_NOPAGE; + struct address_space *mapping; + + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + filp->f_mapping->host->i_ino, + (long long)page_offset(page)); + + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) + goto out_unlock; + + pagelen = nfs_page_length(page); + if (pagelen == 0) + goto out_unlock; + + ret = VM_FAULT_LOCKED; + if (nfs_flush_incompatible(filp, page) == 0 && + nfs_updatepage(filp, page, 0, pagelen) == 0) + goto out; + + ret = VM_FAULT_SIGBUS; +out_unlock: + unlock_page(page); +out: + return ret; +} + +static const struct vm_operations_struct nfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nfs_vm_page_mkwrite, +}; + +static int nfs_need_sync_write(struct file *filp, struct inode *inode) +{ + struct nfs_open_context *ctx; + + if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) + return 1; + ctx = nfs_file_open_context(filp); + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) + return 1; + return 0; +} + +static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct dentry * dentry = iocb->ki_filp->f_path.dentry; + struct inode * inode = dentry->d_inode; + unsigned long written = 0; + ssize_t result; + size_t count = iov_length(iov, nr_segs); + + if (iocb->ki_filp->f_flags & O_DIRECT) + return nfs_file_direct_write(iocb, iov, nr_segs, pos); + + dprintk("NFS: write(%s/%s, %lu@%Ld)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (long long) pos); + + result = -EBUSY; + if (IS_SWAPFILE(inode)) + goto out_swapfile; + /* + * O_APPEND implies that we must 
revalidate the file length. + */ + if (iocb->ki_filp->f_flags & O_APPEND) { + result = nfs_revalidate_file_size(inode, iocb->ki_filp); + if (result) + goto out; + } + + result = count; + if (!count) + goto out; + + result = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (result > 0) + written = result; + + /* Return error values for O_DSYNC and IS_SYNC() */ + if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { + int err = vfs_fsync(iocb->ki_filp, 0); + if (err < 0) + result = err; + } + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); +out: + return result; + +out_swapfile: + printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); + goto out; +} + +static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, + struct file *filp, loff_t *ppos, + size_t count, unsigned int flags) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + unsigned long written = 0; + ssize_t ret; + + dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (unsigned long long) *ppos); + + /* + * The combination of splice and an O_APPEND destination is disallowed. + */ + + ret = generic_file_splice_write(pipe, filp, ppos, count, flags); + if (ret > 0) + written = ret; + + if (ret >= 0 && nfs_need_sync_write(filp, inode)) { + int err = vfs_fsync(filp, 0); + if (err < 0) + ret = err; + } + if (ret > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); + return ret; +} + +static int +do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status = 0; + unsigned int saved_type = fl->fl_type; + + /* Try local locking first */ + posix_test_lock(filp, fl); + if (fl->fl_type != F_UNLCK) { + /* found a conflict */ + goto out; + } + fl->fl_type = saved_type; + + if (nfs_have_delegation(inode, FMODE_READ)) + goto out_noconflict; + + if (is_local) + goto out_noconflict; + + status = NFS_PROTO(inode)->lock(filp, cmd, fl); +out: + return status; +out_noconflict: + fl->fl_type = F_UNLCK; + goto out; +} + +static int do_vfs_lock(struct file *file, struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(file, fl); + break; + default: + BUG(); + } + return res; +} + +static int +do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status; + + /* + * Flush all pending writes before doing anything + * with locks.. + */ + nfs_sync_mapping(filp->f_mapping); + + /* NOTE: special case + * If we're signalled while cleaning up locks on process exit, we + * still need to complete the unlock. + */ + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else + status = do_vfs_lock(filp, fl); + return status; +} + +static int +is_time_granular(struct timespec *ts) { + return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); +} + +static int +do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status; + + /* + * Flush all pending writes before doing anything + * with locks.. 
+ */ + status = nfs_sync_mapping(filp->f_mapping); + if (status != 0) + goto out; + + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else + status = do_vfs_lock(filp, fl); + if (status < 0) + goto out; + + /* + * Revalidate the cache if the server has time stamps granular + * enough to detect subsecond changes. Otherwise, clear the + * cache to prevent missing any changes. + * + * This makes locking act as a cache coherency point. + */ + nfs_sync_mapping(filp->f_mapping); + if (!nfs_have_delegation(inode, FMODE_READ)) { + if (is_time_granular(&NFS_SERVER(inode)->time_delta)) + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + nfs_zap_caches(inode); + } +out: + return status; +} + +/* + * Lock a (portion of) a file + */ +static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_mapping->host; + int ret = -ENOLCK; + int is_local = 0; + + dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + fl->fl_type, fl->fl_flags, + (long long)fl->fl_start, (long long)fl->fl_end); + + nfs_inc_stats(inode, NFSIOS_VFSLOCK); + + /* No mandatory locks over NFS */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) + is_local = 1; + + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { + ret = NFS_PROTO(inode)->lock_check_bounds(fl); + if (ret < 0) + goto out_err; + } + + if (IS_GETLK(cmd)) + ret = do_getlk(filp, cmd, fl, is_local); + else if (fl->fl_type == F_UNLCK) + ret = do_unlk(filp, cmd, fl, is_local); + else + ret = do_setlk(filp, cmd, fl, is_local); +out_err: + return ret; +} + +/* + * Lock a (portion of) a file + */ +static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_mapping->host; + int is_local = 0; + + dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + fl->fl_type, fl->fl_flags); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) + is_local = 1; + + /* We're simulating flock() locks using posix locks on the server */ + fl->fl_owner = (fl_owner_t)filp; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + + if (fl->fl_type == F_UNLCK) + return do_unlk(filp, cmd, fl, is_local); + return do_setlk(filp, cmd, fl, is_local); +} + +/* + * There is no protocol support for leases, so we have no way to implement + * them correctly in the face of opens by other clients. + */ +static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) +{ + dprintk("NFS: setlease(%s/%s, arg=%ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, arg); + return -EINVAL; +} + +#ifdef CONFIG_NFS_V4 +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + /* + * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to + * this point, then something is very wrong + */ + dprintk("NFS: %s called! 
inode=%p filp=%p\n", __func__, inode, filp); + return -ENOTDIR; +} + +const struct file_operations nfs4_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c new file mode 100644 index 00000000..5b100648 --- /dev/null +++ b/fs/nfs/fscache-index.c @@ -0,0 +1,337 @@ +/* NFS FS-Cache index structure definition + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +/* + * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks + * the cookie for the top-level index object for NFS into here. The top-level + * index can then have other cache objects inserted into it. + */ +struct fscache_netfs nfs_fscache_netfs = { + .name = "nfs", + .version = 0, +}; + +/* + * Register NFS for caching + */ +int nfs_fscache_register(void) +{ + return fscache_register_netfs(&nfs_fscache_netfs); +} + +/* + * Unregister NFS for caching + */ +void nfs_fscache_unregister(void) +{ + fscache_unregister_netfs(&nfs_fscache_netfs); +} + +/* + * Layout of the key for an NFS server cache object. + */ +struct nfs_server_key { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + } addr[0]; +}; + +/* + * Generate a key to describe a server in the main NFS index + * - We return the length of the key, or 0 if we can't generate one + */ +static uint16_t nfs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_client *clp = cookie_netfs_data; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key *key = buffer; + uint16_t len = sizeof(struct nfs_server_key); + + /* zero the whole key first so that the version and family fields + * set below are not wiped out again */ + memset(key, 0, len); + + key->nfsversion = clp->rpc_ops->version; + key->family = clp->cl_addr.ss_family; + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key->port = sin->sin_port; + key->addr[0].ipv4_addr = sin->sin_addr; + len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->port = sin6->sin6_port; + key->addr[0].ipv6_addr = sin6->sin6_addr; + len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + len = 0; + break; + } + + return len; +} + +/* + * Define the server object for FS-Cache. This is used to describe a server + * object to fscache_acquire_cookie(). 
It is keyed by the NFS protocol and + * server address parameters. + */ +const struct fscache_cookie_def nfs_fscache_server_index_def = { + .name = "NFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_server_get_key, +}; + +/* + * Generate a key to describe a superblock key in the main NFS index + */ +static uint16_t nfs_super_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_fscache_key *key; + const struct nfs_server *nfss = cookie_netfs_data; + uint16_t len; + + key = nfss->fscache_key; + len = sizeof(key->key) + key->key.uniq_len; + if (len > bufmax) { + len = 0; + } else { + memcpy(buffer, &key->key, sizeof(key->key)); + memcpy(buffer + sizeof(key->key), + key->key.uniquifier, key->key.uniq_len); + } + + return len; +} + +/* + * Define the superblock object for FS-Cache. This is used to describe a + * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS + * parameters that might cause a separate superblock. + */ +const struct fscache_cookie_def nfs_fscache_super_index_def = { + .name = "NFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_super_get_key, +}; + +/* + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. + * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. + */ +struct nfs_fscache_inode_auxdata { + struct timespec mtime; + struct timespec ctime; + loff_t size; + u64 change_attr; +}; + +/* + * Generate a key to describe an NFS inode in an NFS server's index + */ +static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + uint16_t nsize; + + /* use the inode's NFS filehandle as the key */ + nsize = nfsi->fh.size; + memcpy(buffer, nfsi->fh.data, nsize); + return nsize; +} + +/* + * Get certain file attributes from the netfs data + * - This function can be absent for an index + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + + *size = nfsi->vfs_inode.i_size; +} + +/* + * Get the auxiliary data from netfs data + * - This function can be absent if the index carries no state data + * - Should store the auxiliary data in the buffer + * - Should return the amount of data stored + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct nfs_fscache_inode_auxdata auxdata; + const struct nfs_inode *nfsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->change_attr; + + if (bufmax > sizeof(auxdata)) + bufmax = sizeof(auxdata); + + memcpy(buffer, &auxdata, bufmax); + return bufmax; +} + +/* + * Consult the netfs about the state of an object + * - This function can be absent if the index carries no state data + * - The 
netfs data from the cookie being used as the target is + * presented, as is the auxiliary data + */ +static +enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->change_attr; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Indication from FS-Cache that the cookie is no longer cached + * - This function is called when the backing store currently caching a cookie + * is removed + * - The netfs should use this to clean up any markers indicating cached pages + * - This is mandatory for any object that may have data + */ +static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct nfs_inode *nfsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); + + for (;;) { + /* grab a bunch of pages to unmark */ + nr_pages = pagevec_lookup(&pvec, + nfsi->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Get an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + * - The read context is passed back to NFS in the event that a data read on the + * cache fails with EIO - in which case the server must be contacted to + * retrieve the data, which requires the read context for security. + */ +static void nfs_fh_get_context(void *cookie_netfs_data, void *context) +{ + get_nfs_open_context(context); +} + +/* + * Release an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + */ +static void nfs_fh_put_context(void *cookie_netfs_data, void *context) +{ + if (context) + put_nfs_open_context(context); +} + +/* + * Define the inode object for FS-Cache. This is used to describe an inode + * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for + * an inode. + * + * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime + * held in the cache auxiliary data for the data storage object with those in + * the inode struct in memory. 
+ */ +const struct fscache_cookie_def nfs_fscache_inode_object_def = { + .name = "NFS.fh", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = nfs_fscache_inode_get_key, + .get_attr = nfs_fscache_inode_get_attr, + .get_aux = nfs_fscache_inode_get_aux, + .check_aux = nfs_fscache_inode_check_aux, + .now_uncached = nfs_fscache_inode_now_uncached, + .get_context = nfs_fh_get_context, + .put_context = nfs_fh_put_context, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c new file mode 100644 index 00000000..419119c3 --- /dev/null +++ b/fs/nfs/fscache.c @@ -0,0 +1,535 @@ +/* NFS filesystem cache interface + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "iostat.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +static struct rb_root nfs_fscache_keys = RB_ROOT; +static DEFINE_SPINLOCK(nfs_fscache_keys_lock); + +/* + * Get the per-client index cookie for an NFS client if the appropriate mount + * flag was set + * - We always try and get an index cookie for the client, but get filehandle + * cookies on a per-superblock basis, depending on the mount flags + */ +void nfs_fscache_get_client_cookie(struct nfs_client *clp) +{ + /* create a cache index for looking up filehandles */ + clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, + &nfs_fscache_server_index_def, + clp); + dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", + clp, clp->fscache); +} + +/* + * Dispose of a per-client cookie + */ +void nfs_fscache_release_client_cookie(struct nfs_client *clp) +{ + dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", + clp, clp->fscache); + + fscache_relinquish_cookie(clp->fscache, 0); + clp->fscache = NULL; +} + +/* + * Get the cache cookie for an NFS superblock. We have to handle + * uniquification here because the cache doesn't do it for us. + * + * The default uniquifier is just an empty string, but it may be overridden + * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent + * superblock across an automount point of some nature. 
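+ * + * For example (illustrative): mounting the same export twice with different + * 'fsc=' uniquifiers gives each superblock its own key and cache cookie, + * whereas two mounts whose keys (including the uniquifier) compare equal hit + * the non-unique case below and the second mount has its caching disabled.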
+ */ +void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, + struct nfs_clone_mount *mntdata) +{ + struct nfs_fscache_key *key, *xkey; + struct nfs_server *nfss = NFS_SB(sb); + struct rb_node **p, *parent; + int diff, ulen; + + if (uniq) { + ulen = strlen(uniq); + } else if (mntdata) { + struct nfs_server *mnt_s = NFS_SB(mntdata->sb); + if (mnt_s->fscache_key) { + uniq = mnt_s->fscache_key->key.uniquifier; + ulen = mnt_s->fscache_key->key.uniq_len; + } + } + + if (!uniq) { + uniq = ""; + ulen = 1; + } + + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + if (!key) + return; + + key->nfs_client = nfss->nfs_client; + key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; + key->key.nfs_server.flags = nfss->flags; + key->key.nfs_server.rsize = nfss->rsize; + key->key.nfs_server.wsize = nfss->wsize; + key->key.nfs_server.acregmin = nfss->acregmin; + key->key.nfs_server.acregmax = nfss->acregmax; + key->key.nfs_server.acdirmin = nfss->acdirmin; + key->key.nfs_server.acdirmax = nfss->acdirmax; + key->key.nfs_server.fsid = nfss->fsid; + key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; + + key->key.uniq_len = ulen; + memcpy(key->key.uniquifier, uniq, ulen); + + spin_lock(&nfs_fscache_keys_lock); + p = &nfs_fscache_keys.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xkey = rb_entry(parent, struct nfs_fscache_key, node); + + if (key->nfs_client < xkey->nfs_client) + goto go_left; + if (key->nfs_client > xkey->nfs_client) + goto go_right; + + diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + + if (key->key.uniq_len == 0) + goto non_unique; + diff = memcmp(key->key.uniquifier, + xkey->key.uniquifier, + key->key.uniq_len); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + goto non_unique; + + go_left: + p = &(*p)->rb_left; + continue; + go_right: + p = &(*p)->rb_right; + } + + rb_link_node(&key->node, parent, p); + rb_insert_color(&key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + nfss->fscache_key = key; + + /* create a cache index for looking up filehandles */ + nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, + &nfs_fscache_super_index_def, + nfss); + dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + return; + +non_unique: + spin_unlock(&nfs_fscache_keys_lock); + kfree(key); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + printk(KERN_WARNING "NFS:" + " Cache request denied due to non-unique superblock keys\n"); +} + +/* + * release a per-superblock cookie + */ +void nfs_fscache_release_super_cookie(struct super_block *sb) +{ + struct nfs_server *nfss = NFS_SB(sb); + + dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + + fscache_relinquish_cookie(nfss->fscache, 0); + nfss->fscache = NULL; + + if (nfss->fscache_key) { + spin_lock(&nfs_fscache_keys_lock); + rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + kfree(nfss->fscache_key); + nfss->fscache_key = NULL; + } +} + +/* + * Initialise the per-inode cache cookie pointer for an NFS inode. + */ +void nfs_fscache_init_inode_cookie(struct inode *inode) +{ + NFS_I(inode)->fscache = NULL; + if (S_ISREG(inode->i_mode)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +} + +/* + * Get the per-inode cache cookie for an NFS inode. 
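+ * A cookie is only acquired if the inode was marked cacheable when it was + * initialised and the superblock was mounted with caching enabled + * (NFS_OPTION_FSCACHE).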
+ */ +static void nfs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->fscache || !NFS_FSCACHE(inode)) + return; + + if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { + nfsi->fscache = fscache_acquire_cookie( + NFS_SB(sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", + sb, nfsi, nfsi->fscache); + } +} + +/* + * Release a per-inode cookie. + */ +void nfs_fscache_release_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 0); + nfsi->fscache = NULL; +} + +/* + * Retire a per-inode cookie, destroying the data attached to it. + */ +void nfs_fscache_zap_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 1); + nfsi->fscache = NULL; +} + +/* + * Turn off the cache with regard to a per-inode cookie if opened for writing, + * invalidating all the pages in the page cache relating to the associated + * inode to clear the per-page caching. + */ +static void nfs_fscache_disable_inode_cookie(struct inode *inode) +{ + clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + + if (NFS_I(inode)->fscache) { + dfprintk(FSCACHE, + "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); + + /* Need to uncache any pages attached to this inode that + * fscache knows about before turning off the cache. + */ + fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); + nfs_fscache_zap_inode_cookie(inode); + } +} + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +static int nfs_fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} + +/* + * Lock against someone else trying to also acquire or relinquish a cookie + */ +static inline void nfs_fscache_inode_lock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) + wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, + nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); +} + +/* + * Unlock cookie management lock + */ +static inline void nfs_fscache_inode_unlock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); +} + +/* + * Decide if we should enable or disable local caching for this inode. + * - For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * - May be invoked multiple times in parallel by parallel nfs_open() functions. + */ +void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if (NFS_FSCACHE(inode)) { + nfs_fscache_inode_lock(inode); + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + nfs_fscache_disable_inode_cookie(inode); + else + nfs_fscache_enable_inode_cookie(inode); + nfs_fscache_inode_unlock(inode); + } +} + +/* + * Replace a per-inode cookie due to revalidation detecting a file having + * changed on the server. 
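+ * + * The old cookie is retired (relinquished with the retire flag set) so that + * the data cached under it is discarded, and a fresh cookie is acquired for + * the new version of the file.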
+ */ +void nfs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *nfss = NFS_SERVER(inode); + struct fscache_cookie *old = nfsi->fscache; + + nfs_fscache_inode_lock(inode); + if (nfsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(nfsi->fscache, 1); + + nfsi->fscache = fscache_acquire_cookie( + nfss->nfs_client->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, + "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", + nfss, nfsi, old, nfsi->fscache); + } + nfs_fscache_inode_unlock(inode); +} + +/* + * Release the caching state associated with a page, if the page isn't busy + * interacting with the cache. + * - Returns true (can release page) or false (page busy). + */ +int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct nfs_inode *nfsi = NFS_I(page->mapping->host); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + if (!fscache_maybe_release_page(cookie, page, gfp)) + return 0; + + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } + + return 1; +} + +/* + * Release the caching state associated with a page if undergoing complete page + * invalidation. + */ +void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + + dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + fscache_wait_on_page_write(cookie, page); + + BUG_ON(!PageLocked(page)); + fscache_uncache_page(cookie, page); + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); +} + +/* + * Handle completion of a page being read from the cache. + * - Called in process (keventd) context. 
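+ * - On success the page is marked up to date and unlocked; on error the read + * falls back to nfs_readpage_async() to fetch the page from the server, and + * the page is only unlocked (for the VM to retry) if that fallback cannot + * be started.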
+ */ +static void nfs_readpage_from_fscache_complete(struct page *page, + void *context, + int error) +{ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", + page, context, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) { + SetPageUptodate(page); + unlock_page(page); + } else { + error = nfs_readpage_async(context, page->mapping->host, page); + if (error) + unlock_page(page); + } +} + +/* + * Retrieve a page from fscache + */ +int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, struct page *page) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, inode); + + ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, + page, + nfs_readpage_from_fscache_complete, + ctx, + GFP_KERNEL); + + switch (ret) { + case 0: /* read BIO submitted (page in fscache) */ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache: BIO submitted\n"); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1); + return ret; + + case -ENOBUFS: /* inode not in cache */ + case -ENODATA: /* page not in cache */ + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + dfprintk(FSCACHE, + "NFS: readpage_from_fscache %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + } + return ret; +} + +/* + * Retrieve a set of pages from fscache + */ +int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + unsigned npages = *nr_pages; + int ret; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + NFS_I(inode)->fscache, npages, inode); + + ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, + mapping, pages, nr_pages, + nfs_readpage_from_fscache_complete, + ctx, + mapping_gfp_mask(mapping)); + if (*nr_pages < npages) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, + npages); + if (*nr_pages > 0) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, + *nr_pages); + + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: submitted\n"); + + return ret; + + case -ENOBUFS: /* some pages aren't cached and can't be */ + case -ENODATA: /* some pages aren't cached */ + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: ret %d\n", ret); + } + + return ret; +} + +/* + * Store a newly fetched page in fscache + * - PG_fscache must be set on the page + */ +void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, sync); + + ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); + dfprintk(FSCACHE, + "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", + page, page->index, page->flags, ret); + + if (ret != 0) { + fscache_uncache_page(NFS_I(inode)->fscache, page); + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 
1); + } else { + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1); + } +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h new file mode 100644 index 00000000..b9c572d0 --- /dev/null +++ b/fs/nfs/fscache.h @@ -0,0 +1,222 @@ +/* NFS filesystem cache interface definitions + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _NFS_FSCACHE_H +#define _NFS_FSCACHE_H + +#include +#include +#include +#include + +#ifdef CONFIG_NFS_FSCACHE + +/* + * set of NFS FS-Cache objects that form a superblock key + */ +struct nfs_fscache_key { + struct rb_node node; + struct nfs_client *nfs_client; /* the server */ + + /* the elements of the unique key - as used by nfs_compare_super() and + * nfs_compare_mount_options() to distinguish superblocks */ + struct { + struct { + unsigned long s_flags; /* various flags + * (& NFS_MS_MASK) */ + } super; + + struct { + struct nfs_fsid fsid; + int flags; + unsigned int rsize; /* read size */ + unsigned int wsize; /* write size */ + unsigned int acregmin; /* attr cache timeouts */ + unsigned int acregmax; + unsigned int acdirmin; + unsigned int acdirmax; + } nfs_server; + + struct { + rpc_authflavor_t au_flavor; + } rpc_auth; + + /* uniquifier - can be used if nfs_server.flags includes + * NFS_MOUNT_UNSHARED */ + u8 uniq_len; + char uniquifier[0]; + } key; +}; + +/* + * fscache-index.c + */ +extern struct fscache_netfs nfs_fscache_netfs; +extern const struct fscache_cookie_def nfs_fscache_server_index_def; +extern const struct fscache_cookie_def nfs_fscache_super_index_def; +extern const struct fscache_cookie_def nfs_fscache_inode_object_def; + +extern int nfs_fscache_register(void); +extern void nfs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void nfs_fscache_get_client_cookie(struct nfs_client *); +extern void nfs_fscache_release_client_cookie(struct nfs_client *); + +extern void nfs_fscache_get_super_cookie(struct super_block *, + const char *, + struct nfs_clone_mount *); +extern void nfs_fscache_release_super_cookie(struct super_block *); + +extern void nfs_fscache_init_inode_cookie(struct inode *); +extern void nfs_fscache_release_inode_cookie(struct inode *); +extern void nfs_fscache_zap_inode_cookie(struct inode *); +extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void nfs_fscache_reset_inode_cookie(struct inode *); + +extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); +extern int nfs_fscache_release_page(struct page *, gfp_t); + +extern int __nfs_readpage_from_fscache(struct nfs_open_context *, + struct inode *, struct page *); +extern int __nfs_readpages_from_fscache(struct nfs_open_context *, + struct inode *, struct address_space *, + struct list_head *, unsigned *); +extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); + +/* + * wait for a page to complete writing to the cache + */ +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) +{ + if (PageFsCache(page)) + fscache_wait_on_page_write(nfsi->fscache, page); +} + +/* + * release the caching state associated with a page if undergoing complete page + * invalidation + */ +static inline void nfs_fscache_invalidate_page(struct 
page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __nfs_fscache_invalidate_page(page, inode); +} + +/* + * Retrieve a page from an inode data storage object. + */ +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpage_from_fscache(ctx, inode, page); + return -ENOBUFS; +} + +/* + * Retrieve a set of pages from an inode data storage object. + */ +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +/* + * Store a page newly fetched from the server in an inode data storage object + * in the cache. + */ +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, + int sync) +{ + if (PageFsCache(page)) + __nfs_readpage_to_fscache(inode, page, sync); +} + +/* + * indicate the client caching state as readable text + */ +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + if (server->fscache && (server->options & NFS_OPTION_FSCACHE)) + return "yes"; + return "no "; +} + + +#else /* CONFIG_NFS_FSCACHE */ +static inline int nfs_fscache_register(void) { return 0; } +static inline void nfs_fscache_unregister(void) {} + +static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} + +static inline void nfs_fscache_get_super_cookie( + struct super_block *sb, + const char *uniq, + struct nfs_clone_mount *mntdata) +{ +} +static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} + +static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} + +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* True: may release page */ +} +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) {} + +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, int sync) {} + +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + return "no "; +} + +#endif /* CONFIG_NFS_FSCACHE */ +#endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c new file mode 100644 index 00000000..dcb61548 --- /dev/null +++ b/fs/nfs/getroot.c @@ -0,0 +1,267 @@ +/* getroot.c: get the root dentry for an NFS mount + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +/* + * Set the superblock root dentry. + * Note that this function frees the inode in case of error. + */ +static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode) +{ + /* The mntroot acts as the dummy root dentry for this superblock */ + if (sb->s_root == NULL) { + sb->s_root = d_alloc_root(inode); + if (sb->s_root == NULL) { + iput(inode); + return -ENOMEM; + } + ihold(inode); + /* + * Ensure that this dentry is invisible to d_find_alias(). + * Otherwise, it may be spliced into the tree by + * d_materialise_unique if a parent directory from the same + * filesystem gets mounted at a later time. + * This again causes shrink_dcache_for_umount_subtree() to + * Oops, since the test for IS_ROOT() will fail. + */ + spin_lock(&sb->s_root->d_inode->i_lock); + spin_lock(&sb->s_root->d_lock); + list_del_init(&sb->s_root->d_alias); + spin_unlock(&sb->s_root->d_lock); + spin_unlock(&sb->s_root->d_inode->i_lock); + } + return 0; +} + +/* + * get an NFS2/NFS3 root dentry from the root filehandle + */ +struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_fsinfo fsinfo; + struct dentry *ret; + struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); + int error; + + if (!name) + return ERR_PTR(-ENOMEM); + + /* get the actual root for this mount */ + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) { + kfree(name); + return ERR_PTR(-ENOMEM); + } + + error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + ret = ERR_PTR(error); + goto out; + } + + inode = nfs_fhget(sb, mntfh, fsinfo.fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); + ret = ERR_CAST(inode); + goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ + ret = d_obtain_alias(inode); + if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); + goto out; + } + + security_d_instantiate(ret, inode); + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); +out: + if (name) + kfree(name); + nfs_free_fattr(fsinfo.fattr); + return ret; +} + +#ifdef CONFIG_NFS_V4 + +int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) +{ + struct nfs_fsinfo fsinfo; + int ret = -ENOMEM; + + dprintk("--> nfs4_get_rootfh()\n"); + + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) + goto out; + + /* Start by getting the 
root filehandle from the server */ + ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (ret < 0) { + dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); + goto out; + } + + if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) + || !S_ISDIR(fsinfo.fattr->mode)) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot encountered non-directory\n"); + ret = -ENOTDIR; + goto out; + } + + if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot obtained referral\n"); + ret = -EREMOTE; + goto out; + } + + memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); +out: + nfs_free_fattr(fsinfo.fattr); + dprintk("<-- nfs4_get_rootfh() = %d\n", ret); + return ret; +} + +/* + * get an NFS4 root dentry from the root filehandle + */ +struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_fattr *fattr = NULL; + struct dentry *ret; + struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); + int error; + + dprintk("--> nfs4_get_root()\n"); + + if (!name) + return ERR_PTR(-ENOMEM); + + /* get the info about the server and filesystem */ + error = nfs4_server_capabilities(server, mntfh); + if (error < 0) { + dprintk("nfs_get_root: getcaps error = %d\n", + -error); + kfree(name); + return ERR_PTR(error); + } + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) { + kfree(name); + return ERR_PTR(-ENOMEM); + } + + /* get the actual root for this mount */ + error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + ret = ERR_PTR(error); + goto out; + } + + if (fattr->valid & NFS_ATTR_FATTR_FSID && + !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + inode = nfs_fhget(sb, mntfh, fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); + ret = ERR_CAST(inode); + goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ + ret = d_obtain_alias(inode); + if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); + goto out; + } + + security_d_instantiate(ret, inode); + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); +out: + if (name) + kfree(name); + nfs_free_fattr(fattr); + dprintk("<-- nfs4_get_root()\n"); + return ret; +} + +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c new file mode 100644 index 00000000..79664a10 --- /dev/null +++ b/fs/nfs/idmap.c @@ -0,0 +1,779 @@ +/* + * fs/nfs/idmap.c + * + * UID and GID to name mapping for clients. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include + +static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ + unsigned long val; + char buf[16]; + + if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) + return 0; + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + if (strict_strtoul(buf, 0, &val) != 0) + return 0; + *res = val; + return 1; +} + +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ + return snprintf(buf, buflen, "%u", id); +} + +#ifdef CONFIG_NFS_USE_NEW_IDMAPPER + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NFS_UINT_MAXLEN 11 + +const struct cred *id_resolver_cache; + +struct key_type key_type_id_resolver = { + .name = "id_resolver", + .instantiate = user_instantiate, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, +}; + +int nfs_idmap_init(void) +{ + struct cred *cred; + struct key *keyring; + int ret = 0; + + printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_id_resolver); + if (ret < 0) + goto failed_put_key; + + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + id_resolver_cache = cred; + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void nfs_idmap_quit(void) +{ + key_revoke(id_resolver_cache->thread_keyring); + unregister_key_type(&key_type_id_resolver); + put_cred(id_resolver_cache); +} + +/* + * Assemble the description to pass to request_key() + * This function will allocate a new string and update dest to point + * at it. The caller is responsible for freeing dest. + * + * On error 0 is returned. Otherwise, the length of dest is returned. 
+ */ +static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, + const char *type, size_t typelen, char **desc) +{ + char *cp; + size_t desclen = typelen + namelen + 2; + + *desc = kmalloc(desclen, GFP_KERNEL); + if (!*desc) + return -ENOMEM; + + cp = *desc; + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + return desclen; +} + +static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, void *data, size_t data_size) +{ + const struct cred *saved_cred; + struct key *rkey; + char *desc; + struct user_key_payload *payload; + ssize_t ret; + + ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); + if (ret <= 0) + goto out; + + saved_cred = override_creds(id_resolver_cache); + rkey = request_key(&key_type_id_resolver, desc, ""); + revert_creds(saved_cred); + kfree(desc); + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto out_up; + + payload = rcu_dereference(rkey->payload.data); + if (IS_ERR_OR_NULL(payload)) { + ret = PTR_ERR(payload); + goto out_up; + } + + ret = payload->datalen; + if (ret > 0 && ret <= data_size) + memcpy(data, payload->data, ret); + else + ret = -EINVAL; + +out_up: + rcu_read_unlock(); + key_put(rkey); +out: + return ret; +} + + +/* ID -> Name */ +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) +{ + char id_str[NFS_UINT_MAXLEN]; + int id_len; + ssize_t ret; + + id_len = snprintf(id_str, sizeof(id_str), "%u", id); + ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); + if (ret < 0) + return -EINVAL; + return ret; +} + +/* Name -> ID */ +static int nfs_idmap_lookup_id(const char *name, size_t namelen, + const char *type, __u32 *id) +{ + char id_str[NFS_UINT_MAXLEN]; + long id_long; + ssize_t data_size; + int ret = 0; + + data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); + if (data_size <= 0) { + ret = -EINVAL; + } else { + ret = strict_strtol(id_str, 10, &id_long); + *id = (__u32)id_long; + } + return ret; +} + +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +{ + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; + return nfs_idmap_lookup_id(name, namelen, "uid", uid); +} + +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) +{ + if (nfs_map_string_to_numeric(name, namelen, gid)) + return 0; + return nfs_idmap_lookup_id(name, namelen, "gid", gid); +} + +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +{ + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; +} +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) +{ + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(gid, buf, buflen); + return ret; +} + +#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include "nfs4_fs.h" + +#define IDMAP_HASH_SZ 128 + +/* Default cache timeout is 10 minutes 
*/ +unsigned int nfs_idmap_cache_timeout = 600 * HZ; + +static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + int jif = num * HZ; + if (endp == val || *endp || num < 0 || jif < num) + return -EINVAL; + *((int *)kp->arg) = jif; + return 0; +} + +module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, + &nfs_idmap_cache_timeout, 0644); + +struct idmap_hashent { + unsigned long ih_expires; + __u32 ih_id; + size_t ih_namelen; + char ih_name[IDMAP_NAMESZ]; +}; + +struct idmap_hashtable { + __u8 h_type; + struct idmap_hashent h_entries[IDMAP_HASH_SZ]; +}; + +struct idmap { + struct dentry *idmap_dentry; + wait_queue_head_t idmap_wq; + struct idmap_msg idmap_im; + struct mutex idmap_lock; /* Serializes upcalls */ + struct mutex idmap_im_lock; /* Protects the hashtable */ + struct idmap_hashtable idmap_user_hash; + struct idmap_hashtable idmap_group_hash; +}; + +static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, + char __user *, size_t); +static ssize_t idmap_pipe_downcall(struct file *, const char __user *, + size_t); +static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); + +static unsigned int fnvhash32(const void *, size_t); + +static const struct rpc_pipe_ops idmap_upcall_ops = { + .upcall = idmap_pipe_upcall, + .downcall = idmap_pipe_downcall, + .destroy_msg = idmap_pipe_destroy_msg, +}; + +int +nfs_idmap_new(struct nfs_client *clp) +{ + struct idmap *idmap; + int error; + + BUG_ON(clp->cl_idmap != NULL); + + idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); + if (idmap == NULL) + return -ENOMEM; + + idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, + "idmap", idmap, &idmap_upcall_ops, 0); + if (IS_ERR(idmap->idmap_dentry)) { + error = PTR_ERR(idmap->idmap_dentry); + kfree(idmap); + return error; + } + + mutex_init(&idmap->idmap_lock); + mutex_init(&idmap->idmap_im_lock); + init_waitqueue_head(&idmap->idmap_wq); + idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; + idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; + + clp->cl_idmap = idmap; + return 0; +} + +void +nfs_idmap_delete(struct nfs_client *clp) +{ + struct idmap *idmap = clp->cl_idmap; + + if (!idmap) + return; + rpc_unlink(idmap->idmap_dentry); + clp->cl_idmap = NULL; + kfree(idmap); +} + +/* + * Helper routines for manipulating the hashtable + */ +static inline struct idmap_hashent * +idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len) +{ + return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ]; +} + +static struct idmap_hashent * +idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) +{ + struct idmap_hashent *he = idmap_name_hash(h, name, len); + + if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) + return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; + return he; +} + +static inline struct idmap_hashent * +idmap_id_hash(struct idmap_hashtable* h, __u32 id) +{ + return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; +} + +static struct idmap_hashent * +idmap_lookup_id(struct idmap_hashtable *h, __u32 id) +{ + struct idmap_hashent *he = idmap_id_hash(h, id); + if (he->ih_id != id || he->ih_namelen == 0) + return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; + return he; +} + +/* + * Routines for allocating new entries in the hashtable. + * For now, we just have 1 entry per bucket, so it's all + * pretty trivial. 
+ */ +static inline struct idmap_hashent * +idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len) +{ + return idmap_name_hash(h, name, len); +} + +static inline struct idmap_hashent * +idmap_alloc_id(struct idmap_hashtable *h, __u32 id) +{ + return idmap_id_hash(h, id); +} + +static void +idmap_update_entry(struct idmap_hashent *he, const char *name, + size_t namelen, __u32 id) +{ + he->ih_id = id; + memcpy(he->ih_name, name, namelen); + he->ih_name[namelen] = '\0'; + he->ih_namelen = namelen; + he->ih_expires = jiffies + nfs_idmap_cache_timeout; +} + +/* + * Name -> ID + */ +static int +nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h, + const char *name, size_t namelen, __u32 *id) +{ + struct rpc_pipe_msg msg; + struct idmap_msg *im; + struct idmap_hashent *he; + DECLARE_WAITQUEUE(wq, current); + int ret = -EIO; + + im = &idmap->idmap_im; + + /* + * String sanity checks + * Note that the userland daemon expects NUL terminated strings + */ + for (;;) { + if (namelen == 0) + return -EINVAL; + if (name[namelen-1] != '\0') + break; + namelen--; + } + if (namelen >= IDMAP_NAMESZ) + return -EINVAL; + + mutex_lock(&idmap->idmap_lock); + mutex_lock(&idmap->idmap_im_lock); + + he = idmap_lookup_name(h, name, namelen); + if (he != NULL) { + *id = he->ih_id; + ret = 0; + goto out; + } + + memset(im, 0, sizeof(*im)); + memcpy(im->im_name, name, namelen); + + im->im_type = h->h_type; + im->im_conv = IDMAP_CONV_NAMETOID; + + memset(&msg, 0, sizeof(msg)); + msg.data = im; + msg.len = sizeof(*im); + + add_wait_queue(&idmap->idmap_wq, &wq); + if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { + remove_wait_queue(&idmap->idmap_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&idmap->idmap_im_lock); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&idmap->idmap_wq, &wq); + mutex_lock(&idmap->idmap_im_lock); + + if (im->im_status & IDMAP_STATUS_SUCCESS) { + *id = im->im_id; + ret = 0; + } + + out: + memset(im, 0, sizeof(*im)); + mutex_unlock(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_lock); + return ret; +} + +/* + * ID -> Name + */ +static int +nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, + __u32 id, char *name) +{ + struct rpc_pipe_msg msg; + struct idmap_msg *im; + struct idmap_hashent *he; + DECLARE_WAITQUEUE(wq, current); + int ret = -EIO; + unsigned int len; + + im = &idmap->idmap_im; + + mutex_lock(&idmap->idmap_lock); + mutex_lock(&idmap->idmap_im_lock); + + he = idmap_lookup_id(h, id); + if (he) { + memcpy(name, he->ih_name, he->ih_namelen); + ret = he->ih_namelen; + goto out; + } + + memset(im, 0, sizeof(*im)); + im->im_type = h->h_type; + im->im_conv = IDMAP_CONV_IDTONAME; + im->im_id = id; + + memset(&msg, 0, sizeof(msg)); + msg.data = im; + msg.len = sizeof(*im); + + add_wait_queue(&idmap->idmap_wq, &wq); + + if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { + remove_wait_queue(&idmap->idmap_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&idmap->idmap_im_lock); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&idmap->idmap_wq, &wq); + mutex_lock(&idmap->idmap_im_lock); + + if (im->im_status & IDMAP_STATUS_SUCCESS) { + if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) + goto out; + memcpy(name, im->im_name, len); + ret = len; + } + + out: + memset(im, 0, sizeof(*im)); + mutex_unlock(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_lock); + return ret; +} + +/* RPC pipefs upcall/downcall 
routines */ +static ssize_t +idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char __user *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + size_t mlen = min(msg->len, buflen); + unsigned long left; + + left = copy_to_user(dst, data, mlen); + if (left == mlen) { + msg->errno = -EFAULT; + return -EFAULT; + } + + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + return mlen; +} + +static ssize_t +idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); + struct idmap *idmap = (struct idmap *)rpci->private; + struct idmap_msg im_in, *im = &idmap->idmap_im; + struct idmap_hashtable *h; + struct idmap_hashent *he = NULL; + size_t namelen_in; + int ret; + + if (mlen != sizeof(im_in)) + return -ENOSPC; + + if (copy_from_user(&im_in, src, mlen) != 0) + return -EFAULT; + + mutex_lock(&idmap->idmap_im_lock); + + ret = mlen; + im->im_status = im_in.im_status; + /* If we got an error, terminate now, and wake up pending upcalls */ + if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) { + wake_up(&idmap->idmap_wq); + goto out; + } + + /* Sanity checking of strings */ + ret = -EINVAL; + namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ); + if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) + goto out; + + switch (im_in.im_type) { + case IDMAP_TYPE_USER: + h = &idmap->idmap_user_hash; + break; + case IDMAP_TYPE_GROUP: + h = &idmap->idmap_group_hash; + break; + default: + goto out; + } + + switch (im_in.im_conv) { + case IDMAP_CONV_IDTONAME: + /* Did we match the current upcall? */ + if (im->im_conv == IDMAP_CONV_IDTONAME + && im->im_type == im_in.im_type + && im->im_id == im_in.im_id) { + /* Yes: copy string, including the terminating '\0' */ + memcpy(im->im_name, im_in.im_name, namelen_in); + im->im_name[namelen_in] = '\0'; + wake_up(&idmap->idmap_wq); + } + he = idmap_alloc_id(h, im_in.im_id); + break; + case IDMAP_CONV_NAMETOID: + /* Did we match the current upcall? 
*/ + if (im->im_conv == IDMAP_CONV_NAMETOID + && im->im_type == im_in.im_type + && strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in + && memcmp(im->im_name, im_in.im_name, namelen_in) == 0) { + im->im_id = im_in.im_id; + wake_up(&idmap->idmap_wq); + } + he = idmap_alloc_name(h, im_in.im_name, namelen_in); + break; + default: + goto out; + } + + /* If the entry is valid, also copy it to the cache */ + if (he != NULL) + idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); + ret = mlen; +out: + mutex_unlock(&idmap->idmap_im_lock); + return ret; +} + +static void +idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct idmap_msg *im = msg->data; + struct idmap *idmap = container_of(im, struct idmap, idmap_im); + + if (msg->errno >= 0) + return; + mutex_lock(&idmap->idmap_im_lock); + im->im_status = IDMAP_STATUS_LOOKUPFAIL; + wake_up(&idmap->idmap_wq); + mutex_unlock(&idmap->idmap_im_lock); +} + +/* + * Fowler/Noll/Vo hash + * http://www.isthe.com/chongo/tech/comp/fnv/ + */ + +#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */ +#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */ + +static unsigned int fnvhash32(const void *buf, size_t buflen) +{ + const unsigned char *p, *end = (const unsigned char *)buf + buflen; + unsigned int hash = FNV_1_32; + + for (p = buf; p < end; p++) { + hash *= FNV_P_32; + hash ^= (unsigned int)*p; + } + + return hash; +} + +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; + return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); +} + +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; + return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); +} + +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; +} +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; +} + +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c new file mode 100644 index 00000000..c48f9f6a --- /dev/null +++ b/fs/nfs/inode.c @@ -0,0 +1,1656 @@ +/* + * linux/fs/nfs/inode.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs inode and superblock handling functions + * + * Modularised by Alan Cox , while hacking some + * experimental NFS changes. Modularisation taken straight from SYS5 fs. + * + * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. 
+ * J.S.Peatfield@damtp.cam.ac.uk + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "dns_resolve.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 + +/* Default is to see 64-bit inode numbers */ +static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; + +static void nfs_invalidate_inode(struct inode *); +static int nfs_update_inode(struct inode *, struct nfs_fattr *); + +static struct kmem_cache * nfs_inode_cachep; + +static inline unsigned long +nfs_fattr_to_ino_t(struct nfs_fattr *fattr) +{ + return nfs_fileid_to_ino_t(fattr->fileid); +} + +/** + * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +int nfs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + +/** + * nfs_compat_user_ino64 - returns the user-visible inode number + * @fileid: 64-bit fileid + * + * This function returns a 32-bit inode number if the boot parameter + * nfs.enable_ino64 is zero. + */ +u64 nfs_compat_user_ino64(u64 fileid) +{ +#ifdef CONFIG_COMPAT + compat_ulong_t ino; +#else + unsigned long ino; +#endif + + if (enable_ino64) + return fileid; + ino = fileid; + if (sizeof(ino) < sizeof(fileid)) + ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; + return ino; +} + +static void nfs_clear_inode(struct inode *inode) +{ + /* + * The following should never happen... 
+ */ + BUG_ON(nfs_have_writebacks(inode)); + BUG_ON(!list_empty(&NFS_I(inode)->open_files)); + nfs_zap_acl_cache(inode); + nfs_access_zap_cache(inode); + nfs_fscache_release_inode_cookie(inode); +} + +void nfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + nfs_clear_inode(inode); +} + +/** + * nfs_sync_mapping - helper to flush all mmapped dirty data to disk + */ +int nfs_sync_mapping(struct address_space *mapping) +{ + int ret = 0; + + if (mapping->nrpages != 0) { + unmap_mapping_range(mapping, 0, 0, 0); + ret = nfs_wb_all(mapping->host); + } + return ret; +} + +/* + * Invalidate the local caches + */ +static void nfs_zap_caches_locked(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int mode = inode->i_mode; + + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = jiffies; + + memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; +} + +void nfs_zap_caches(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_zap_caches_locked(inode); + spin_unlock(&inode->i_lock); +} + +void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) +{ + if (mapping->nrpages != 0) { + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); + } +} + +void nfs_zap_acl_cache(struct inode *inode) +{ + void (*clear_acl_cache)(struct inode *); + + clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache; + if (clear_acl_cache != NULL) + clear_acl_cache(inode); + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); +} + +void nfs_invalidate_atime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + spin_unlock(&inode->i_lock); +} + +/* + * Invalidate, but do not unhash, the inode. + * NB: must be called with inode->i_lock held! + */ +static void nfs_invalidate_inode(struct inode *inode) +{ + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_zap_caches_locked(inode); +} + +struct nfs_find_desc { + struct nfs_fh *fh; + struct nfs_fattr *fattr; +}; + +/* + * In NFSv3 we can have 64bit inode numbers. In order to support + * this, and re-exported directories (also seen in NFSv2) + * we are forced to allow 2 different inodes to have the same + * i_ino. + */ +static int +nfs_find_actor(struct inode *inode, void *opaque) +{ + struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_fh *fh = desc->fh; + struct nfs_fattr *fattr = desc->fattr; + + if (NFS_FILEID(inode) != fattr->fileid) + return 0; + if (nfs_compare_fh(NFS_FH(inode), fh)) + return 0; + if (is_bad_inode(inode) || NFS_STALE(inode)) + return 0; + return 1; +} + +static int +nfs_init_locked(struct inode *inode, void *opaque) +{ + struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_fattr *fattr = desc->fattr; + + set_nfs_fileid(inode, fattr->fileid); + nfs_copy_fh(NFS_FH(inode), desc->fh); + return 0; +} + +/* + * This is our front-end to iget that looks up inodes by file handle + * instead of inode number. 
+ */ +struct inode * +nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + struct nfs_find_desc desc = { + .fh = fh, + .fattr = fattr + }; + struct inode *inode = ERR_PTR(-ENOENT); + unsigned long hash; + + nfs_attr_check_mountpoint(sb, fattr); + + if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) && + !nfs_attr_use_mounted_on_fileid(fattr)) + goto out_no_inode; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) + goto out_no_inode; + + hash = nfs_fattr_to_ino_t(fattr); + + inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc); + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + goto out_no_inode; + } + + if (inode->i_state & I_NEW) { + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long now = jiffies; + + /* We set i_ino for the few things that still rely on it, + * such as stat(2) */ + inode->i_ino = hash; + + /* We can't support update_atime(), since the server will reset it */ + inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_mode = fattr->mode; + if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 + && nfs_server_capable(inode, NFS_CAP_MODE)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + /* Why so? Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; + inode->i_fop = &nfs_dir_operations; + inode->i_data.a_ops = &nfs_dir_aops; + if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) + set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + /* Deal with crossing mountpoints */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || + fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + inode->i_op = &nfs_referral_inode_operations; + else + inode->i_op = &nfs_mountpoint_inode_operations; + inode->i_fop = NULL; + inode->i_flags |= S_AUTOMOUNT; + } + } else if (S_ISLNK(inode->i_mode)) + inode->i_op = &nfs_symlink_inode_operations; + else + init_special_inode(inode, inode->i_mode, fattr->rdev); + + memset(&inode->i_atime, 0, sizeof(inode->i_atime)); + memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + nfsi->change_attr = 0; + inode->i_size = 0; + inode->i_nlink = 0; + inode->i_uid = -2; + inode->i_gid = -2; + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + + nfsi->read_cache_jiffies = fattr->time_start; + nfsi->attr_gencount = fattr->gencount; + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = fattr->atime; + else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = fattr->mtime; + else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = fattr->ctime; + else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) + nfsi->change_attr = fattr->change_attr; + else if 
(nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) + inode->i_size = nfs_size_to_loff_t(fattr->size); + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE; + if (fattr->valid & NFS_ATTR_FATTR_NLINK) + inode->i_nlink = fattr->nlink; + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (fattr->valid & NFS_ATTR_FATTR_OWNER) + inode->i_uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_GROUP) + inode->i_gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + /* + * report the blocks in 512byte units + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + nfsi->access_cache = RB_ROOT; + + nfs_fscache_init_inode_cookie(inode); + + unlock_new_inode(inode); + } else + nfs_refresh_inode(inode, fattr); + dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + atomic_read(&inode->i_count)); + +out: + return inode; + +out_no_inode: + dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode)); + goto out; +} + +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) + +int +nfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct nfs_fattr *fattr; + int error = -ENOMEM; + + nfs_inc_stats(inode, NFSIOS_VFSSETATTR); + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + + if (attr->ia_valid & ATTR_SIZE) { + if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) + attr->ia_valid &= ~ATTR_SIZE; + } + + /* Optimization: if the end result is no change, don't RPC */ + attr->ia_valid &= NFS_VALID_ATTRS; + if ((attr->ia_valid & ~ATTR_FILE) == 0) + return 0; + + /* Write all dirty data */ + if (S_ISREG(inode->i_mode)) + nfs_wb_all(inode); + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + /* + * Return any delegations if we're going to change ACLs + */ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs_inode_return_delegation(inode); + error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); + if (error == 0) + nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: + return error; +} + +/** + * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * This is a copy of the common vmtruncate, but with the locking + * corrected to take into account the fact that NFS requires + * inode->i_size to be updated under the inode->i_lock. 
+ */ +static int nfs_vmtruncate(struct inode * inode, loff_t offset) +{ + loff_t oldsize; + int err; + + err = inode_newsize_ok(inode, offset); + if (err) + goto out; + + spin_lock(&inode->i_lock); + oldsize = inode->i_size; + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + truncate_pagecache(inode, oldsize, offset); +out: + return err; +} + +/** + * nfs_setattr_update_inode - Update inode metadata after a setattr call. + * @inode: pointer to struct inode + * @attr: pointer to struct iattr + * + * Note: we do this in the *proc.c in order to ensure that + * it works for things like exclusive creates too. + */ +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) +{ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + spin_lock(&inode->i_lock); + if ((attr->ia_valid & ATTR_MODE) != 0) { + int mode = attr->ia_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + } + if ((attr->ia_valid & ATTR_UID) != 0) + inode->i_uid = attr->ia_uid; + if ((attr->ia_valid & ATTR_GID) != 0) + inode->i_gid = attr->ia_gid; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); + } + if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); + nfs_vmtruncate(inode, attr->ia_size); + } +} + +int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; + int err; + + /* Flush out writes to the server in order to update c/mtime. */ + if (S_ISREG(inode->i_mode)) { + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto out; + } + + /* + * We may force a getattr if the user cares about atime. + * + * Note that we only have to check the vfsmount flags here: + * - NFS always sets S_NOATIME by so checking it would give a + * bogus result + * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * no point in checking those. 
+ */ + if ((mnt->mnt_flags & MNT_NOATIME) || + ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + need_atime = 0; + + if (need_atime) + err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + err = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!err) { + generic_fillattr(inode, stat); + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + } +out: + return err; +} + +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner = current->files; + l_ctx->pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *pos; + + list_for_each_entry(pos, &ctx->lock_context.list, list) { + if (pos->lockowner != current->files) + continue; + if (pos->pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = ctx->path.dentry->d_inode; + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + nfs_init_lock_context(new); + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + list_add_tail(&new->list, &ctx->lock_context.list); + new->open_context = ctx; + res = new; + new = NULL; + } + } + spin_unlock(&inode->i_lock); + kfree(new); + return res; +} + +void nfs_put_lock_context(struct nfs_lock_context *l_ctx) +{ + struct nfs_open_context *ctx = l_ctx->open_context; + struct inode *inode = ctx->path.dentry->d_inode; + + if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + return; + list_del(&l_ctx->list); + spin_unlock(&inode->i_lock); + kfree(l_ctx); +} + +/** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context + * @is_sync: is this a synchronous close + * + * always ensure that the attributes are up to date if we're mounted + * with close-to-open semantics + */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode; + struct nfs_server *server; + + if (!(ctx->mode & FMODE_WRITE)) + return; + if (!is_sync) + return; + inode = ctx->path.dentry->d_inode; + if (!list_empty(&NFS_I(inode)->open_files)) + return; + server = NFS_SERVER(inode); + if (server->flags & NFS_MOUNT_NOCTO) + return; + nfs_revalidate_inode(server, inode); +} + +struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode) +{ + struct nfs_open_context *ctx; + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->path = *path; + path_get(&ctx->path); + ctx->cred = get_rpccred(cred); + ctx->state = NULL; + ctx->mode = f_mode; + ctx->flags = 0; + ctx->error = 0; + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; + INIT_LIST_HEAD(&ctx->list); + } + return ctx; +} + +struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) +{ + if (ctx != NULL) + atomic_inc(&ctx->lock_context.count); + return ctx; +} + +static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode = ctx->path.dentry->d_inode; + + if (!list_empty(&ctx->list)) { + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; + list_del(&ctx->list); + 
spin_unlock(&inode->i_lock); + } else if (!atomic_dec_and_test(&ctx->lock_context.count)) + return; + if (inode != NULL) + NFS_PROTO(inode)->close_context(ctx, is_sync); + if (ctx->cred != NULL) + put_rpccred(ctx->cred); + path_put(&ctx->path); + kfree(ctx); +} + +void put_nfs_open_context(struct nfs_open_context *ctx) +{ + __put_nfs_open_context(ctx, 0); +} + +/* + * Ensure that mmap has a recent RPC credential for use when writing out + * shared pages + */ +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + filp->private_data = get_nfs_open_context(ctx); + spin_lock(&inode->i_lock); + list_add(&ctx->list, &nfsi->open_files); + spin_unlock(&inode->i_lock); +} + +/* + * Given an inode, search for an open context with the desired characteristics + */ +struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *pos, *ctx = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(pos, &nfsi->open_files, list) { + if (cred != NULL && pos->cred != cred) + continue; + if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) + continue; + ctx = get_nfs_open_context(pos); + break; + } + spin_unlock(&inode->i_lock); + return ctx; +} + +static void nfs_file_clear_open_context(struct file *filp) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct nfs_open_context *ctx = nfs_file_open_context(filp); + + if (ctx) { + filp->private_data = NULL; + spin_lock(&inode->i_lock); + list_move_tail(&ctx->list, &NFS_I(inode)->open_files); + spin_unlock(&inode->i_lock); + __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); + } +} + +/* + * These allocate and release file read/write context information. + */ +int nfs_open(struct inode *inode, struct file *filp) +{ + struct nfs_open_context *ctx; + struct rpc_cred *cred; + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode); + put_rpccred(cred); + if (ctx == NULL) + return -ENOMEM; + nfs_file_set_open_context(filp, ctx); + put_nfs_open_context(ctx); + nfs_fscache_set_inode_cookie(inode, filp); + return 0; +} + +int nfs_release(struct inode *inode, struct file *filp) +{ + nfs_file_clear_open_context(filp); + return 0; +} + +/* + * This function is called whenever some part of NFS notices that + * the cached attributes have to be refreshed. 
+ */ +int +__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +{ + int status = -ESTALE; + struct nfs_fattr *fattr = NULL; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", + inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + + if (is_bad_inode(inode)) + goto out; + if (NFS_STALE(inode)) + goto out; + + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); + if (status != 0) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), status); + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } + goto out; + } + + status = nfs_refresh_inode(inode, fattr); + if (status) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), status); + goto out; + } + + if (nfsi->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); + + dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode)); + + out: + nfs_free_fattr(fattr); + return status; +} + +int nfs_attribute_timeout(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); +} + +static int nfs_attribute_cache_expired(struct inode *inode) +{ + if (nfs_have_delegated_attributes(inode)) + return 0; + return nfs_attribute_timeout(inode); +} + +/** + * nfs_revalidate_inode - Revalidate the inode attributes + * @server - pointer to nfs_server struct + * @inode - pointer to inode struct + * + * Updates inode attribute information by retrieving the data from the server. + */ +int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +{ + if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) + && !nfs_attribute_cache_expired(inode)) + return NFS_STALE(inode) ? 
-ESTALE : 0; + return __nfs_revalidate_inode(server, inode); +} + +static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (mapping->nrpages != 0) { + int ret = invalidate_inode_pages2(mapping); + if (ret < 0) + return ret; + } + spin_lock(&inode->i_lock); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + if (S_ISDIR(inode->i_mode)) + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + spin_unlock(&inode->i_lock); + nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); + nfs_fscache_reset_inode_cookie(inode); + dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", + inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + return 0; +} + +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int ret = 0; + + if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + || nfs_attribute_cache_expired(inode) + || NFS_STALE(inode)) { + ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + goto out; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + ret = nfs_invalidate_mapping(inode, mapping); +out: + return ret; +} + +static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long ret = 0; + + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) + && (fattr->valid & NFS_ATTR_FATTR_CHANGE) + && nfsi->change_attr == fattr->pre_change_attr) { + nfsi->change_attr = fattr->change_attr; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; + } + /* If we have atomic WCC data, we may update some attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) + && (fattr->valid & NFS_ATTR_FATTR_CTIME) + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + ret |= NFS_INO_INVALID_ATTR; + } + + if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) + && (fattr->valid & NFS_ATTR_FATTR_MTIME) + && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; + } + if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) + && (fattr->valid & NFS_ATTR_FATTR_SIZE) + && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) + && nfsi->npages == 0) { + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + ret |= NFS_INO_INVALID_ATTR; + } + return ret; +} + +/** + * nfs_check_inode_attributes - verify consistency of the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * Verifies the attribute cache. If we have just changed the attributes, + * so that fattr carries weak cache consistency data, then it may + * also update the ctime/mtime/change_attribute. + */ +static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_size, new_isize; + unsigned long invalid = 0; + + + /* Has the inode gone and changed behind our back? 
*/ + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + return -EIO; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + return -EIO; + + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + nfsi->change_attr != fattr->change_attr) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + + /* Verify a few of the more important attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } + + /* Have any file permissions changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + + /* Has the link count changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) + invalid |= NFS_INO_INVALID_ATIME; + + if (invalid != 0) + nfsi->cache_validity |= invalid; + + nfsi->read_cache_jiffies = fattr->time_start; + return 0; +} + +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) + return 0; + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return 0; + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); +} + +static atomic_long_t nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + return atomic_long_read(&nfs_attr_generation_counter); +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + return atomic_long_inc_return(&nfs_attr_generation_counter); +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); +} + +struct nfs_fattr *nfs_alloc_fattr(void) +{ + struct nfs_fattr *fattr; + + fattr = kmalloc(sizeof(*fattr), GFP_NOFS); + if (fattr != NULL) + nfs_fattr_init(fattr); + return fattr; +} + +struct nfs_fh *nfs_alloc_fhandle(void) +{ + struct nfs_fh *fh; + + fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); + if (fh != NULL) + fh->size = 0; + return fh; +} + +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. 
+ * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for wraparound of 'attr_gencount' + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. + * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || + nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + if (nfs_inode_attrs_need_update(inode, fattr)) + return nfs_update_inode(inode, fattr); + return nfs_check_inode_attributes(inode, fattr); +} + +/** + * nfs_refresh_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * Check that an RPC call that returned attributes has not overlapped with + * other recent updates of the inode metadata, then decide whether it is + * safe to do a full update of the inode attributes, or whether just to + * call nfs_check_inode_attributes. + */ +int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + spin_lock(&inode->i_lock); + status = nfs_refresh_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + + return status; +} + +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} + +/** + * nfs_post_op_update_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. + * + * NB: if the server didn't return any post op attributes, this + * function will force the retrieval of attributes before the next + * NFS request. Thus it should be used only for operations that + * are expected to change one or more attributes, to avoid + * unnecessary NFS requests and trips through nfs_update_inode(). + */ +int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; +} + +/** + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. 
+ */ +int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME); + goto out_noforce; + } + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { + fattr->pre_change_attr = NFS_I(inode)->change_attr; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { + memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->valid |= NFS_ATTR_FATTR_PRECTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { + memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->valid |= NFS_ATTR_FATTR_PREMTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { + fattr->pre_size = i_size_read(inode); + fattr->valid |= NFS_ATTR_FATTR_PRESIZE; + } +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; +} + +/* + * Many nfs protocol calls return the new file attributes after + * an operation. Here we update the inode to reflect the state + * of the server's inode. + * + * This is a bit tricky because we have to make sure all dirty pages + * have been sent off to the server before calling invalidate_inode_pages. + * To make sure no other process adds more write requests while we try + * our best to flush them, we make them sleep during the attribute refresh. + * + * A very similar scenario holds for the dir cache. + */ +static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_server *server; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_isize, new_isize; + unsigned long invalid = 0; + unsigned long now = jiffies; + unsigned long save_cache_validity; + + dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", + __func__, inode->i_sb->s_id, inode->i_ino, + atomic_read(&inode->i_count), fattr->valid); + + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + goto out_fileid; + + /* + * Make sure the inode's type hasn't changed. + */ + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + goto out_changed; + + server = NFS_SERVER(inode); + /* Update the fsid? */ + if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && + !nfs_fsid_equal(&server->fsid, &fattr->fsid) && + !IS_AUTOMOUNT(inode)) + server->fsid = fattr->fsid; + + /* + * Update the read time so we don't revalidate too often. 
+ */ + nfsi->read_cache_jiffies = fattr->time_start; + + save_cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED + | NFS_INO_REVAL_PAGECACHE); + + /* Do atomic weak cache consistency updates */ + invalid |= nfs_wcc_update_inode(inode, fattr); + + /* More cache consistency checks */ + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + nfsi->change_attr = fattr->change_attr; + } + } else if (server->caps & NFS_CAP_CHANGE_ATTR) + invalid |= save_cache_validity; + + if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + /* NFSv2/v3: Check if the mtime agrees */ + if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { + dprintk("NFS: mtime change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + } + } else if (server->caps & NFS_CAP_MTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { + /* If ctime has changed we should definitely clear access+acl caches */ + if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + /* and probably clear data for a directory too as utimes can cause + * havoc with our cache. + */ + if (S_ISDIR(inode->i_mode)) { + invalid |= NFS_INO_INVALID_DATA; + nfs_force_lookup_revalidate(inode); + } + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + } + } else if (server->caps & NFS_CAP_CTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + /* Check if our cached file size is stale */ + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? 
*/ + if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || + new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + } + dprintk("NFS: isize change on server for file %s/%ld " + "(%Ld to %Ld)\n", + inode->i_sb->s_id, + inode->i_ino, + (long long)cur_isize, + (long long)new_isize); + } + } else + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + + + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + else if (server->caps & NFS_CAP_ATIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_MODE) { + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + umode_t newmode = inode->i_mode & S_IFMT; + newmode |= fattr->mode & S_IALLUGO; + inode->i_mode = newmode; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + } + } else if (server->caps & NFS_CAP_MODE) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { + if (inode->i_uid != fattr->uid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_uid = fattr->uid; + } + } else if (server->caps & NFS_CAP_OWNER) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { + if (inode->i_gid != fattr->gid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_gid = fattr->gid; + } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + inode->i_nlink = fattr->nlink; + } + } else if (server->caps & NFS_CAP_NLINK) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + /* + * report the blocks in 512byte units + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + + /* Update attrtimeo value if we're out of the unstable period */ + if (invalid & NFS_INO_INVALID_ATTR) { + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); + } else { + if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { + if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) + nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + } + } + invalid &= ~NFS_INO_INVALID_ATTR; + /* Don't invalidate the data if we were to blame */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + invalid &= ~NFS_INO_INVALID_DATA; + if (!nfs_have_delegation(inode, FMODE_READ) || + (save_cache_validity & NFS_INO_REVAL_FORCED)) + nfsi->cache_validity |= invalid; + + return 0; + out_changed: + /* + * Big trouble! 
The inode has become a different object. + */ + printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", + __func__, inode->i_ino, inode->i_mode, fattr->mode); + out_err: + /* + * No need to worry about unhashing the dentry, as the + * lookup validation will know that the inode is bad. + * (But we fall through to invalidate the caches.) + */ + nfs_invalidate_inode(inode); + return -ESTALE; + + out_fileid: + printk(KERN_ERR "NFS: server %s error: fileid changed\n" + "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, + (long long)nfsi->fileid, (long long)fattr->fileid); + goto out_err; +} + + +#ifdef CONFIG_NFS_V4 + +/* + * Clean out any remaining NFSv4 state that might be left over due + * to open() calls that passed nfs_atomic_lookup, but failed to call + * nfs_open(). + */ +void nfs4_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); + /* If we are holding a delegation, return it! */ + nfs_inode_return_delegation_noreclaim(inode); + /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); +} +#endif + +struct inode *nfs_alloc_inode(struct super_block *sb) +{ + struct nfs_inode *nfsi; + nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + if (!nfsi) + return NULL; + nfsi->flags = 0UL; + nfsi->cache_validity = 0UL; +#ifdef CONFIG_NFS_V3_ACL + nfsi->acl_access = ERR_PTR(-EAGAIN); + nfsi->acl_default = ERR_PTR(-EAGAIN); +#endif +#ifdef CONFIG_NFS_V4 + nfsi->nfs4_acl = NULL; +#endif /* CONFIG_NFS_V4 */ + return &nfsi->vfs_inode; +} + +static void nfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); +} + +void nfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nfs_i_callback); +} + +static inline void nfs4_init_once(struct nfs_inode *nfsi) +{ +#ifdef CONFIG_NFS_V4 + INIT_LIST_HEAD(&nfsi->open_states); + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); + nfsi->layout = NULL; + atomic_set(&nfsi->commits_outstanding, 0); +#endif +} + +static void init_once(void *foo) +{ + struct nfs_inode *nfsi = (struct nfs_inode *) foo; + + inode_init_once(&nfsi->vfs_inode); + INIT_LIST_HEAD(&nfsi->open_files); + INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); + INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); + INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); + nfsi->npages = 0; + nfsi->ncommit = 0; + atomic_set(&nfsi->silly_count, 1); + INIT_HLIST_HEAD(&nfsi->silly_list); + init_waitqueue_head(&nfsi->waitqueue); + nfs4_init_once(nfsi); +} + +static int __init nfs_init_inodecache(void) +{ + nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", + sizeof(struct nfs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (nfs_inode_cachep == NULL) + return -ENOMEM; + + return 0; +} + +static void nfs_destroy_inodecache(void) +{ + kmem_cache_destroy(nfs_inode_cachep); +} + +struct workqueue_struct *nfsiod_workqueue; + +/* + * start up the nfsiod workqueue + */ +static int nfsiod_start(void) +{ + struct workqueue_struct *wq; + dprintk("RPC: creating workqueue nfsiod\n"); + wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); + if (wq == NULL) + return -ENOMEM; + nfsiod_workqueue = wq; + return 0; +} + +/* + * Destroy the nfsiod workqueue + */ +static void nfsiod_stop(void) +{ + struct 
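/*
 * A minimal, hypothetical sketch of the slab-cache lifecycle used by
 * nfs_init_inodecache()/nfs_alloc_inode() above (the "example_" names are
 * made up for illustration; only the kmem_cache_* calls are real API):
 *
 *	cache = kmem_cache_create("example_cache", sizeof(struct example_obj),
 *				  0, SLAB_RECLAIM_ACCOUNT, example_init_once);
 *	obj = kmem_cache_alloc(cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(cache, obj);
 *	kmem_cache_destroy(cache);
 *
 * The constructor (init_once() here) runs when a slab object is first set
 * up, not on every allocation, which is why per-allocation state such as
 * flags, cache_validity and the ACL pointers is reset in nfs_alloc_inode()
 * rather than in init_once().
 */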
workqueue_struct *wq; + + wq = nfsiod_workqueue; + if (wq == NULL) + return; + nfsiod_workqueue = NULL; + destroy_workqueue(wq); +} + +/* + * Initialize NFS + */ +static int __init init_nfs_fs(void) +{ + int err; + + err = nfs_idmap_init(); + if (err < 0) + goto out9; + + err = nfs_dns_resolver_init(); + if (err < 0) + goto out8; + + err = nfs_fscache_register(); + if (err < 0) + goto out7; + + err = nfsiod_start(); + if (err) + goto out6; + + err = nfs_fs_proc_init(); + if (err) + goto out5; + + err = nfs_init_nfspagecache(); + if (err) + goto out4; + + err = nfs_init_inodecache(); + if (err) + goto out3; + + err = nfs_init_readpagecache(); + if (err) + goto out2; + + err = nfs_init_writepagecache(); + if (err) + goto out1; + + err = nfs_init_directcache(); + if (err) + goto out0; + +#ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); +#endif + if ((err = register_nfs_fs()) != 0) + goto out; + return 0; +out: +#ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); +#endif + nfs_destroy_directcache(); +out0: + nfs_destroy_writepagecache(); +out1: + nfs_destroy_readpagecache(); +out2: + nfs_destroy_inodecache(); +out3: + nfs_destroy_nfspagecache(); +out4: + nfs_fs_proc_exit(); +out5: + nfsiod_stop(); +out6: + nfs_fscache_unregister(); +out7: + nfs_dns_resolver_destroy(); +out8: + nfs_idmap_quit(); +out9: + return err; +} + +static void __exit exit_nfs_fs(void) +{ + nfs_destroy_directcache(); + nfs_destroy_writepagecache(); + nfs_destroy_readpagecache(); + nfs_destroy_inodecache(); + nfs_destroy_nfspagecache(); + nfs_fscache_unregister(); + nfs_dns_resolver_destroy(); + nfs_idmap_quit(); +#ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); +#endif + nfs_cleanup_cb_ident_idr(); + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +} + +/* Not quite true; I just maintain it */ +MODULE_AUTHOR("Olaf Kirch "); +MODULE_LICENSE("GPL"); +module_param(enable_ino64, bool, 0644); + +module_init(init_nfs_fs) +module_exit(exit_nfs_fs) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h new file mode 100644 index 00000000..2a55347a --- /dev/null +++ b/fs/nfs/internal.h @@ -0,0 +1,456 @@ +/* + * NFS internal definitions + */ + +#include "nfs4_fs.h" +#include +#include + +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) + +struct nfs_string; + +/* Maximum number of readahead requests + * FIXME: this should really be a sysctl so that users may tune it to suit + * their needs. People that do NFS over a slow network, might for + * instance want to reduce it to something closer to 1 for improved + * interactive response. + */ +#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) + +/* + * Determine if sessions are in use. 
+ */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (clp->cl_session) + return 1; +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + +static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) +{ + if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) + fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; +} + +static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) +{ + if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || + (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && + ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) + return 0; + + fattr->fileid = fattr->mounted_on_fileid; + return 1; +} + +struct nfs_clone_mount { + const struct super_block *sb; + const struct dentry *dentry; + struct nfs_fh *fh; + struct nfs_fattr *fattr; + char *hostname; + char *mnt_path; + struct sockaddr *addr; + size_t addrlen; + rpc_authflavor_t authflavor; +}; + +/* + * Note: RFC 1813 doesn't limit the number of auth flavors that + * a server can return, so make something up. + */ +#define NFS_MAX_SECFLAVORS (12) + +/* + * Value used if the user did not specify a port value. + */ +#define NFS_UNSPEC_PORT (-1) + +/* + * Maximum number of pages that readdir can use for creating + * a vmapped array of pages. + */ +#define NFS_MAX_READDIR_PAGES 8 + +/* + * In-kernel mount arguments + */ +struct nfs_parsed_mount_data { + int flags; + int rsize, wsize; + int timeo, retrans; + int acregmin, acregmax, + acdirmin, acdirmax; + int namlen; + unsigned int options; + unsigned int bsize; + unsigned int auth_flavor_len; + rpc_authflavor_t auth_flavors[1]; + char *client_address; + unsigned int version; + unsigned int minorversion; + char *fscache_uniq; + + struct { + struct sockaddr_storage address; + size_t addrlen; + char *hostname; + u32 version; + int port; + unsigned short protocol; + } mount_server; + + struct { + struct sockaddr_storage address; + size_t addrlen; + char *hostname; + char *export_path; + int port; + unsigned short protocol; + } nfs_server; + + struct security_mnt_opts lsm_opts; +}; + +/* mount_clnt.c */ +struct nfs_mount_request { + struct sockaddr *sap; + size_t salen; + char *hostname; + char *dirpath; + u32 version; + unsigned short protocol; + struct nfs_fh *fh; + int noresvport; + unsigned int *auth_flav_len; + rpc_authflavor_t *auth_flavs; +}; + +extern int nfs_mount(struct nfs_mount_request *info); +extern void nfs_umount(const struct nfs_mount_request *info); + +/* client.c */ +extern struct rpc_program nfs_program; + +extern void nfs_cleanup_cb_ident_idr(void); +extern void nfs_put_client(struct nfs_client *); +extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); +extern struct nfs_client *nfs4_find_client_ident(int); +extern struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *); +extern struct nfs_server *nfs_create_server( + const struct nfs_parsed_mount_data *, + struct nfs_fh *); +extern struct nfs_server *nfs4_create_server( + const struct nfs_parsed_mount_data *, + struct nfs_fh *); +extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, + struct nfs_fh *); +extern void nfs_free_server(struct nfs_server *server); +extern struct nfs_server 
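/*
 * Worked example for nfs_attr_use_mounted_on_fileid() above: the fileid
 * substitution happens only when the server supplied a mounted-on fileid
 * AND the object was flagged as a mountpoint or an NFSv4 referral
 * (illustrative truth table, not new behaviour):
 *
 *	MOUNTED_ON_FILEID	MOUNTPOINT or V4_REFERRAL	result
 *	not set			(any)				0, fattr->fileid kept
 *	set			neither				0, fattr->fileid kept
 *	set			at least one			1, fileid = mounted_on_fileid
 */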
*nfs_clone_server(struct nfs_server *, + struct nfs_fh *, + struct nfs_fattr *); +extern void nfs_mark_client_ready(struct nfs_client *clp, int state); +extern int nfs4_check_client_ready(struct nfs_client *clp); +extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto); +#ifdef CONFIG_PROC_FS +extern int __init nfs_fs_proc_init(void); +extern void nfs_fs_proc_exit(void); +#else +static inline int nfs_fs_proc_init(void) +{ + return 0; +} +static inline void nfs_fs_proc_exit(void) +{ +} +#endif + +/* nfs4namespace.c */ +#ifdef CONFIG_NFS_V4 +extern struct vfsmount *nfs_do_refmount(struct dentry *dentry); +#else +static inline +struct vfsmount *nfs_do_refmount(struct dentry *dentry) +{ + return ERR_PTR(-ENOENT); +} +#endif + +/* callback_xdr.c */ +extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; + +/* pagelist.c */ +extern int __init nfs_init_nfspagecache(void); +extern void nfs_destroy_nfspagecache(void); +extern int __init nfs_init_readpagecache(void); +extern void nfs_destroy_readpagecache(void); +extern int __init nfs_init_writepagecache(void); +extern void nfs_destroy_writepagecache(void); + +extern int __init nfs_init_directcache(void); +extern void nfs_destroy_directcache(void); + +/* nfs2xdr.c */ +extern int nfs_stat_to_errno(enum nfs_stat); +extern struct rpc_procinfo nfs_procedures[]; +extern int nfs2_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); + +/* nfs3xdr.c */ +extern struct rpc_procinfo nfs3_procedures[]; +extern int nfs3_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); + +/* nfs4xdr.c */ +#ifdef CONFIG_NFS_V4 +extern int nfs4_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); +#endif +#ifdef CONFIG_NFS_V4_1 +extern const u32 nfs41_maxread_overhead; +extern const u32 nfs41_maxwrite_overhead; +#endif + +/* nfs4proc.c */ +#ifdef CONFIG_NFS_V4 +extern struct rpc_procinfo nfs4_procedures[]; +void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *); +#endif + +extern int nfs4_init_ds_session(struct nfs_client *clp); + +/* proc.c */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync); +extern int nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport); + +/* dir.c */ +extern int nfs_access_cache_shrinker(struct shrinker *shrink, + struct shrink_control *sc); + +/* inode.c */ +extern struct workqueue_struct *nfsiod_workqueue; +extern struct inode *nfs_alloc_inode(struct super_block *sb); +extern void nfs_destroy_inode(struct inode *); +extern int nfs_write_inode(struct inode *, struct writeback_control *); +extern void nfs_evict_inode(struct inode *); +#ifdef CONFIG_NFS_V4 +extern void nfs4_evict_inode(struct inode *); +#endif +void nfs_zap_acl_cache(struct inode *inode); +extern int nfs_wait_bit_killable(void *word); + +/* super.c */ +extern struct file_system_type nfs_xdev_fs_type; +#ifdef CONFIG_NFS_V4 +extern struct file_system_type nfs4_xdev_fs_type; +extern struct file_system_type nfs4_referral_fs_type; +#endif + +extern struct rpc_stat nfs_rpcstat; + +extern int __init register_nfs_fs(void); +extern void __exit unregister_nfs_fs(void); +extern void nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); + +/* namespace.c */ +extern char *nfs_path(char **p, struct dentry *dentry, + char *buffer, ssize_t buflen); +extern struct vfsmount 
*nfs_d_automount(struct path *path); + +/* getroot.c */ +extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, + const char *); +#ifdef CONFIG_NFS_V4 +extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, + const char *); + +extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); +#endif + +/* read.c */ +extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops); +extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + +/* write.c */ +extern void nfs_commit_free(struct nfs_write_data *p); +extern int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern int nfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg); +void nfs_commit_clear_lock(struct nfs_inode *nfsi); +void nfs_commitdata_release(void *data); +void nfs_commit_release_pages(struct nfs_write_data *data); + +#ifdef CONFIG_MIGRATION +extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *); +#else +#define nfs_migrate_page NULL +#endif + +/* nfs4proc.c */ +extern void __nfs4_read_done_cb(struct nfs_read_data *); +extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); +extern int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport); +extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); +extern int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); +extern int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); + +/* + * Determine the device name as a string + */ +static inline char *nfs_devname(struct dentry *dentry, + char *buffer, ssize_t buflen) +{ + char *dummy; + return nfs_path(&dummy, dentry, buffer, buflen); +} + +/* + * Determine the actual block size (and log2 thereof) + */ +static inline +unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) +{ + /* make sure blocksize is a power of two */ + if ((bsize & (bsize - 1)) || nrbitsp) { + unsigned char nrbits; + + for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) + ; + bsize = 1 << nrbits; + if (nrbitsp) + *nrbitsp = nrbits; + } + + return bsize; +} + +/* + * Calculate the number of 512byte blocks used. + */ +static inline blkcnt_t nfs_calc_block_size(u64 tsize) +{ + blkcnt_t used = (tsize + 511) >> 9; + return (used > ULONG_MAX) ? 
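/*
 * Worked example for the size helpers above, which account space in
 * 512-byte units (the numbers are illustrative, not taken from the code):
 * tsize = 1000 bytes gives (1000 + 511) >> 9 = 2 blocks, and tsize = 0
 * gives 0 blocks. nfs_block_bits() rounds a non-power-of-two blocksize
 * down, so a server-advertised bsize of 12000 becomes 8192 (nrbits = 13).
 */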
ULONG_MAX : used; +} + +/* + * Compute and set NFS server blocksize + */ +static inline +unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) +{ + if (bsize < NFS_MIN_FILE_IO_SIZE) + bsize = NFS_DEF_FILE_IO_SIZE; + else if (bsize >= NFS_MAX_FILE_IO_SIZE) + bsize = NFS_MAX_FILE_IO_SIZE; + + return nfs_block_bits(bsize, nrbitsp); +} + +/* + * Determine the maximum file size for a superblock + */ +static inline +void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) +{ + sb->s_maxbytes = (loff_t)maxfilesize; + if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) + sb->s_maxbytes = MAX_LFS_FILESIZE; +} + +/* + * Determine the number of bytes of data the page contains + */ +static inline +unsigned int nfs_page_length(struct page *page) +{ + loff_t i_size = i_size_read(page->mapping->host); + + if (i_size > 0) { + pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (page->index < end_index) + return PAGE_CACHE_SIZE; + if (page->index == end_index) + return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; + } + return 0; +} + +/* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* + * Determine the number of pages in an array of length 'len' and + * with a base offset of 'base' + */ +static inline +unsigned int nfs_page_array_len(unsigned int base, size_t len) +{ + return ((unsigned long)len + (unsigned long)base + + PAGE_SIZE - 1) >> PAGE_SHIFT; +} + +/* + * Helper for restarting RPC calls in the possible presence of NFSv4.1 + * sessions. + */ +static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + return rpc_restart_call_prepare(task); + return rpc_restart_call(task); +} diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h new file mode 100644 index 00000000..c5832487 --- /dev/null +++ b/fs/nfs/iostat.h @@ -0,0 +1,71 @@ +/* + * linux/fs/nfs/iostat.h + * + * Declarations for NFS client per-mount statistics + * + * Copyright (C) 2005, 2006 Chuck Lever + * + */ + +#ifndef _NFS_IOSTAT +#define _NFS_IOSTAT + +#include +#include +#include + +struct nfs_iostats { + unsigned long long bytes[__NFSIOS_BYTESMAX]; +#ifdef CONFIG_NFS_FSCACHE + unsigned long long fscache[__NFSIOS_FSCACHEMAX]; +#endif + unsigned long events[__NFSIOS_COUNTSMAX]; +} ____cacheline_aligned; + +static inline void nfs_inc_server_stats(const struct nfs_server *server, + enum nfs_stat_eventcounters stat) +{ + this_cpu_inc(server->io_stats->events[stat]); +} + +static inline void nfs_inc_stats(const struct inode *inode, + enum nfs_stat_eventcounters stat) +{ + nfs_inc_server_stats(NFS_SERVER(inode), stat); +} + +static inline void nfs_add_server_stats(const struct nfs_server *server, + enum nfs_stat_bytecounters stat, + long addend) +{ + this_cpu_add(server->io_stats->bytes[stat], addend); +} + +static inline void nfs_add_stats(const struct inode *inode, + enum nfs_stat_bytecounters stat, + long addend) +{ + nfs_add_server_stats(NFS_SERVER(inode), stat, addend); +} + +#ifdef CONFIG_NFS_FSCACHE +static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, + long addend) +{ + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); +} +#endif + +static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) +{ + return alloc_percpu(struct nfs_iostats); +} + +static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) +{ + if (stats != NULL) + free_percpu(stats); +} + +#endif /* 
_NFS_IOSTAT */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c new file mode 100644 index 00000000..d4c2d6b7 --- /dev/null +++ b/fs/nfs/mount_clnt.c @@ -0,0 +1,518 @@ +/* + * In-kernel MOUNT protocol client + * + * Copyright (C) 1997, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#ifdef RPC_DEBUG +# define NFSDBG_FACILITY NFSDBG_MOUNT +#endif + +/* + * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4 + */ +#define MNTPATHLEN (1024) + +/* + * XDR data type sizes + */ +#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN)) +#define MNT_status_sz (1) +#define MNT_fhs_status_sz (1) +#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE) +#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE)) +#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS) + +/* + * XDR argument and result sizes + */ +#define MNT_enc_dirpath_sz encode_dirpath_sz +#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz) +#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \ + MNT_authflav3_sz) + +/* + * Defined by RFC 1094, section A.5 + */ +enum { + MOUNTPROC_NULL = 0, + MOUNTPROC_MNT = 1, + MOUNTPROC_DUMP = 2, + MOUNTPROC_UMNT = 3, + MOUNTPROC_UMNTALL = 4, + MOUNTPROC_EXPORT = 5, +}; + +/* + * Defined by RFC 1813, section 5.2 + */ +enum { + MOUNTPROC3_NULL = 0, + MOUNTPROC3_MNT = 1, + MOUNTPROC3_DUMP = 2, + MOUNTPROC3_UMNT = 3, + MOUNTPROC3_UMNTALL = 4, + MOUNTPROC3_EXPORT = 5, +}; + +static struct rpc_program mnt_program; + +/* + * Defined by OpenGroup XNFS Version 3W, chapter 8 + */ +enum mountstat { + MNT_OK = 0, + MNT_EPERM = 1, + MNT_ENOENT = 2, + MNT_EACCES = 13, + MNT_EINVAL = 22, +}; + +static struct { + u32 status; + int errno; +} mnt_errtbl[] = { + { .status = MNT_OK, .errno = 0, }, + { .status = MNT_EPERM, .errno = -EPERM, }, + { .status = MNT_ENOENT, .errno = -ENOENT, }, + { .status = MNT_EACCES, .errno = -EACCES, }, + { .status = MNT_EINVAL, .errno = -EINVAL, }, +}; + +/* + * Defined by RFC 1813, section 5.1.5 + */ +enum mountstat3 { + MNT3_OK = 0, /* no error */ + MNT3ERR_PERM = 1, /* Not owner */ + MNT3ERR_NOENT = 2, /* No such file or directory */ + MNT3ERR_IO = 5, /* I/O error */ + MNT3ERR_ACCES = 13, /* Permission denied */ + MNT3ERR_NOTDIR = 20, /* Not a directory */ + MNT3ERR_INVAL = 22, /* Invalid argument */ + MNT3ERR_NAMETOOLONG = 63, /* Filename too long */ + MNT3ERR_NOTSUPP = 10004, /* Operation not supported */ + MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */ +}; + +static struct { + u32 status; + int errno; +} mnt3_errtbl[] = { + { .status = MNT3_OK, .errno = 0, }, + { .status = MNT3ERR_PERM, .errno = -EPERM, }, + { .status = MNT3ERR_NOENT, .errno = -ENOENT, }, + { .status = MNT3ERR_IO, .errno = -EIO, }, + { .status = MNT3ERR_ACCES, .errno = -EACCES, }, + { .status = MNT3ERR_NOTDIR, .errno = -ENOTDIR, }, + { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, + { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, + { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, + { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, }, +}; + +struct mountres { + int errno; + struct nfs_fh *fh; + unsigned int *auth_count; + rpc_authflavor_t *auth_flavors; +}; + +struct mnt_fhstatus { + u32 status; + struct nfs_fh *fh; +}; + +/** + * nfs_mount - Obtain an NFS file handle for the given host and path + * @info: pointer to mount request arguments + * + * Uses default timeout parameters specified by underlying transport. 
+ */ +int nfs_mount(struct nfs_mount_request *info) +{ + struct mountres result = { + .fh = info->fh, + .auth_count = info->auth_flav_len, + .auth_flavors = info->auth_flavs, + }; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + .rpc_resp = &result, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = info->protocol, + .address = info->sap, + .addrsize = info->salen, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + }; + struct rpc_clnt *mnt_clnt; + int status; + + dprintk("NFS: sending MNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), + info->dirpath); + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + mnt_clnt = rpc_create(&args); + if (IS_ERR(mnt_clnt)) + goto out_clnt_err; + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; + else + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; + + status = rpc_call_sync(mnt_clnt, &msg, 0); + rpc_shutdown_client(mnt_clnt); + + if (status < 0) + goto out_call_err; + if (result.errno != 0) + goto out_mnt_err; + + dprintk("NFS: MNT request succeeded\n"); + status = 0; + +out: + return status; + +out_clnt_err: + status = PTR_ERR(mnt_clnt); + dprintk("NFS: failed to create MNT RPC client, status=%d\n", status); + goto out; + +out_call_err: + dprintk("NFS: MNT request failed, status=%d\n", status); + goto out; + +out_mnt_err: + dprintk("NFS: MNT server returned result %d\n", result.errno); + status = result.errno; + goto out; +} + +/** + * nfs_umount - Notify a server that we have unmounted this export + * @info: pointer to umount request arguments + * + * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always + * use UDP. + */ +void nfs_umount(const struct nfs_mount_request *info) +{ + static const struct rpc_timeout nfs_umnt_timeout = { + .to_initval = 1 * HZ, + .to_maxval = 3 * HZ, + .to_retries = 2, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = IPPROTO_UDP, + .address = info->sap, + .addrsize = info->salen, + .timeout = &nfs_umnt_timeout, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + }; + struct rpc_clnt *clnt; + int status; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) + goto out_clnt_err; + + dprintk("NFS: sending UMNT request for %s:%s\n", + (info->hostname ? 
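/*
 * A hypothetical sketch of how a caller might fill in struct
 * nfs_mount_request before calling nfs_mount() above. The "data", "mntfh",
 * "server_authlist" and "server_authlist_len" variables are made up for
 * illustration; the field names come from the definitions in internal.h:
 *
 *	struct nfs_mount_request request = {
 *		.sap		= (struct sockaddr *)&data->mount_server.address,
 *		.salen		= data->mount_server.addrlen,
 *		.dirpath	= data->nfs_server.export_path,
 *		.hostname	= data->mount_server.hostname,
 *		.protocol	= data->mount_server.protocol,
 *		.version	= data->mount_server.version,
 *		.fh		= mntfh,
 *		.auth_flav_len	= &server_authlist_len,
 *		.auth_flavs	= server_authlist,
 *	};
 *	status = nfs_mount(&request);
 *
 * On success the server's file handle is returned through request.fh and,
 * for MNT version 3, the advertised auth flavors through request.auth_flavs.
 */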
info->hostname : "server"), info->dirpath); + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; + else + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; + + status = rpc_call_sync(clnt, &msg, 0); + rpc_shutdown_client(clnt); + + if (unlikely(status < 0)) + goto out_call_err; + + return; + +out_clnt_err: + dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", + PTR_ERR(clnt)); + return; + +out_call_err: + dprintk("NFS: UMNT request failed, status=%d\n", status); +} + +/* + * XDR encode/decode functions for MOUNT + */ + +static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) +{ + const u32 pathname_len = strlen(pathname); + __be32 *p; + + BUG_ON(pathname_len > MNTPATHLEN); + p = xdr_reserve_space(xdr, 4 + pathname_len); + xdr_encode_opaque(p, pathname, pathname_len); +} + +static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr, + const char *dirpath) +{ + encode_mntdirpath(xdr, dirpath); +} + +/* + * RFC 1094: "A non-zero status indicates some sort of error. In this + * case, the status is a UNIX error number." This can be problematic + * if the server and client use different errno values for the same + * error. + * + * However, the OpenGroup XNFS spec provides a simple mapping that is + * independent of local errno values on the server and the client. + */ +static int decode_status(struct xdr_stream *xdr, struct mountres *res) +{ + unsigned int i; + u32 status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { + if (mnt_errtbl[i].status == status) { + res->errno = mnt_errtbl[i].errno; + return 0; + } + } + + dprintk("NFS: unrecognized MNT status code: %u\n", status); + res->errno = -EACCES; + return 0; +} + +static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res) +{ + struct nfs_fh *fh = res->fh; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +} + +static int mnt_xdr_dec_mountres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + return decode_fhandle(xdr, res); +} + +static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) +{ + unsigned int i; + u32 status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { + if (mnt3_errtbl[i].status == status) { + res->errno = mnt3_errtbl[i].errno; + return 0; + } + } + + dprintk("NFS: unrecognized MNT3 status code: %u\n", status); + res->errno = -EACCES; + return 0; +} + +static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res) +{ + struct nfs_fh *fh = res->fh; + u32 size; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + + size = be32_to_cpup(p); + if (size > NFS3_FHSIZE || size == 0) + return -EIO; + + p = xdr_inline_decode(xdr, size); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = size; + memcpy(fh->data, p, size); + return 0; +} + +static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) +{ + rpc_authflavor_t *flavors = res->auth_flavors; + unsigned int *count = res->auth_count; + u32 entries, i; + __be32 *p; + + if (*count == 0) 
+ return 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + entries = be32_to_cpup(p); + dprintk("NFS: received %u auth flavors\n", entries); + if (entries > NFS_MAX_SECFLAVORS) + entries = NFS_MAX_SECFLAVORS; + + p = xdr_inline_decode(xdr, 4 * entries); + if (unlikely(p == NULL)) + return -EIO; + + if (entries > *count) + entries = *count; + + for (i = 0; i < entries; i++) { + flavors[i] = be32_to_cpup(p++); + dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); + } + *count = i; + + return 0; +} + +static int mnt_xdr_dec_mountres3(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_fhs_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + status = decode_fhandle3(xdr, res); + if (unlikely(status != 0)) { + res->errno = -EBADHANDLE; + return 0; + } + return decode_auth_flavors(xdr, res); +} + +static struct rpc_procinfo mnt_procedures[] = { + [MOUNTPROC_MNT] = { + .p_proc = MOUNTPROC_MNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres_sz, + .p_statidx = MOUNTPROC_MNT, + .p_name = "MOUNT", + }, + [MOUNTPROC_UMNT] = { + .p_proc = MOUNTPROC_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC_UMNT, + .p_name = "UMOUNT", + }, +}; + +static struct rpc_procinfo mnt3_procedures[] = { + [MOUNTPROC3_MNT] = { + .p_proc = MOUNTPROC3_MNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres3_sz, + .p_statidx = MOUNTPROC3_MNT, + .p_name = "MOUNT", + }, + [MOUNTPROC3_UMNT] = { + .p_proc = MOUNTPROC3_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC3_UMNT, + .p_name = "UMOUNT", + }, +}; + + +static struct rpc_version mnt_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(mnt_procedures), + .procs = mnt_procedures, +}; + +static struct rpc_version mnt_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(mnt3_procedures), + .procs = mnt3_procedures, +}; + +static struct rpc_version *mnt_version[] = { + NULL, + &mnt_version1, + NULL, + &mnt_version3, +}; + +static struct rpc_stat mnt_stats; + +static struct rpc_program mnt_program = { + .name = "mount", + .number = NFS_MNT_PROGRAM, + .nrvers = ARRAY_SIZE(mnt_version), + .version = mnt_version, + .stats = &mnt_stats, +}; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c new file mode 100644 index 00000000..1f063bac --- /dev/null +++ b/fs/nfs/namespace.c @@ -0,0 +1,371 @@ +/* + * linux/fs/nfs/namespace.c + * + * Copyright (C) 2005 Trond Myklebust + * - Modified by David Howells + * + * NFS namespace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static void nfs_expire_automounts(struct work_struct *work); + +static LIST_HEAD(nfs_automount_list); +static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); +int nfs_mountpoint_expiry_timeout = 500 * HZ; + +static struct vfsmount *nfs_do_submount(struct dentry *dentry, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor); + +/* + * nfs_path - reconstruct the path given an arbitrary dentry + * @base - used to return pointer to the end of devname part of path + * @dentry - pointer to dentry + * @buffer 
- result buffer + * @buflen - length of buffer + * + * Helper function for constructing the server pathname + * by arbitrary hashed dentry. + * + * This is mainly for use in figuring out the path on the + * server side when automounting on top of an existing partition + * and in generating /proc/mounts and friends. + */ +char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen) +{ + char *end; + int namelen; + unsigned seq; + const char *base; + +rename_retry: + end = buffer+buflen; + *--end = '\0'; + buflen--; + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + while (1) { + spin_lock(&dentry->d_lock); + if (IS_ROOT(dentry)) + break; + namelen = dentry->d_name.len; + buflen -= namelen + 1; + if (buflen < 0) + goto Elong_unlock; + end -= namelen; + memcpy(end, dentry->d_name.name, namelen); + *--end = '/'; + spin_unlock(&dentry->d_lock); + dentry = dentry->d_parent; + } + if (read_seqretry(&rename_lock, seq)) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + if (*end != '/') { + if (--buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto Elong; + } + *--end = '/'; + } + *p = end; + base = dentry->d_fsdata; + if (!base) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + WARN_ON(1); + return end; + } + namelen = strlen(base); + /* Strip off excess slashes in base string */ + while (namelen > 0 && base[namelen - 1] == '/') + namelen--; + buflen -= namelen; + if (buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto Elong; + } + end -= namelen; + memcpy(end, base, namelen); + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + return end; +Elong_unlock: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +#ifdef CONFIG_NFS_V4 +static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +{ + struct gss_api_mech *mech; + struct xdr_netobj oid; + int i; + rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + + for (i = 0; i < flavors->num_flavors; i++) { + struct nfs4_secinfo_flavor *flavor; + flavor = &flavors->flavors[i]; + + if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { + pseudoflavor = flavor->flavor; + break; + } else if (flavor->flavor == RPC_AUTH_GSS) { + oid.len = flavor->gss.sec_oid4.len; + oid.data = flavor->gss.sec_oid4.data; + mech = gss_mech_get_by_OID(&oid); + if (!mech) + continue; + pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); + gss_mech_put(mech); + break; + } + } + + return pseudoflavor; +} + +static int nfs_negotiate_security(const struct dentry *parent, + const struct dentry *dentry, + rpc_authflavor_t *flavor) +{ + struct page *page; + struct nfs4_secinfo_flavors *flavors; + int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); + int ret = -EPERM; + + secinfo = NFS_PROTO(parent->d_inode)->secinfo; + if (secinfo != NULL) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + flavors = page_address(page); + ret = secinfo(parent->d_inode, &dentry->d_name, flavors); + *flavor = nfs_find_best_sec(flavors); + put_page(page); + } + +out: + return ret; +} + +static int nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent, + struct dentry *dentry, struct path *path, + struct nfs_fh *fh, struct nfs_fattr *fattr, + rpc_authflavor_t *flavor) +{ + struct rpc_clnt *clone; + struct rpc_auth *auth; + int err; + + err = 
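/*
 * Note on nfs_negotiate_security()/nfs_find_best_sec() above: despite the
 * name, the selection is first-match in the server's own SECINFO ordering,
 * not a ranking by strength. Hypothetical replies, for illustration:
 *
 *	server returns { AUTH_UNIX, krb5 }	-> AUTH_UNIX is chosen
 *	server returns { krb5, AUTH_UNIX }	-> krb5 is chosen (if the GSS
 *						   mechanism is available)
 *	server returns { }			-> falls back to AUTH_UNIX
 */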
nfs_negotiate_security(parent, path->dentry, flavor); + if (err < 0) + goto out; + clone = rpc_clone_client(server->client); + auth = rpcauth_create(*flavor, clone); + if (!auth) { + err = -EIO; + goto out_shutdown; + } + err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode, + &path->dentry->d_name, + fh, fattr); +out_shutdown: + rpc_shutdown_client(clone); +out: + return err; +} +#else /* CONFIG_NFS_V4 */ +static inline int nfs_lookup_with_sec(struct nfs_server *server, + struct dentry *parent, struct dentry *dentry, + struct path *path, struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t *flavor) +{ + return -EPERM; +} +#endif /* CONFIG_NFS_V4 */ + +/* + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint + * + * When we encounter a mountpoint on the server, we want to set up + * a mountpoint on the client too, to prevent inode numbers from + * colliding, and to allow "df" to work properly. + * On NFSv4, we also want to allow for the fact that different + * filesystems may be migrated to different servers in a failover + * situation, and that different filesystems may want to use + * different security flavours. + */ +struct vfsmount *nfs_d_automount(struct path *path) +{ + struct vfsmount *mnt; + struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); + struct dentry *parent; + struct nfs_fh *fh = NULL; + struct nfs_fattr *fattr = NULL; + int err; + rpc_authflavor_t flavor = RPC_AUTH_UNIX; + + dprintk("--> nfs_d_automount()\n"); + + mnt = ERR_PTR(-ESTALE); + if (IS_ROOT(path->dentry)) + goto out_nofree; + + mnt = ERR_PTR(-ENOMEM); + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fh == NULL || fattr == NULL) + goto out; + + dprintk("%s: enter\n", __func__); + + /* Look it up again to get its attributes */ + parent = dget_parent(path->dentry); + err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode, + &path->dentry->d_name, + fh, fattr); + if (err == -EPERM && NFS_PROTO(parent->d_inode)->secinfo != NULL) + err = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr, &flavor); + dput(parent); + if (err != 0) { + mnt = ERR_PTR(err); + goto out; + } + + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + mnt = nfs_do_refmount(path->dentry); + else + mnt = nfs_do_submount(path->dentry, fh, fattr, flavor); + if (IS_ERR(mnt)) + goto out; + + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, &nfs_automount_list); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out_nofree: + dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); + return mnt; +} + +const struct inode_operations nfs_mountpoint_inode_operations = { + .getattr = nfs_getattr, +}; + +const struct inode_operations nfs_referral_inode_operations = { +}; + +static void nfs_expire_automounts(struct work_struct *work) +{ + struct list_head *list = &nfs_automount_list; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); +} + +void nfs_release_automount_timer(void) +{ + if (list_empty(&nfs_automount_list)) + cancel_delayed_work(&nfs_automount_task); +} + +/* + * Clone a mountpoint of the appropriate type + */ +static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, + const char *devname, + struct nfs_clone_mount *mountdata) +{ +#ifdef CONFIG_NFS_V4 + struct vfsmount *mnt = ERR_PTR(-EINVAL); 
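/*
 * Putting the pieces above together, crossing a server-side mountpoint
 * roughly follows this path (a summary of nfs_d_automount() above, not new
 * code):
 *
 *	nfs_d_automount(path)
 *	    -> look the covered dentry up again to fetch fh/fattr,
 *	       retrying via nfs_lookup_with_sec() on -EPERM when the
 *	       server supports SECINFO
 *	    -> NFS_ATTR_FATTR_V4_REFERRAL set?	nfs_do_refmount()
 *	       otherwise			nfs_do_submount()
 *	    -> arm expiry with mnt_set_expiry() and
 *	       schedule_delayed_work(&nfs_automount_task, ...)
 *
 * so unused submounts are eventually torn down by nfs_expire_automounts().
 */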
+ switch (server->nfs_client->rpc_ops->version) { + case 2: + case 3: + mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + break; + case 4: + mnt = vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata); + } + return mnt; +#else + return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); +#endif +} + +/** + * nfs_do_submount - set up mountpoint when crossing a filesystem boundary + * @dentry - parent directory + * @fh - filehandle for new root dentry + * @fattr - attributes for new root inode + * @authflavor - security flavor to use when performing the mount + * + */ +static struct vfsmount *nfs_do_submount(struct dentry *dentry, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor) +{ + struct nfs_clone_mount mountdata = { + .sb = dentry->d_sb, + .dentry = dentry, + .fh = fh, + .fattr = fattr, + .authflavor = authflavor, + }; + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + char *page = (char *) __get_free_page(GFP_USER); + char *devname; + + dprintk("--> nfs_do_submount()\n"); + + dprintk("%s: submounting on %s/%s\n", __func__, + dentry->d_parent->d_name.name, + dentry->d_name.name); + if (page == NULL) + goto out; + devname = nfs_devname(dentry, page, PAGE_SIZE); + mnt = (struct vfsmount *)devname; + if (IS_ERR(devname)) + goto free_page; + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); +free_page: + free_page((unsigned long)page); +out: + dprintk("%s: done\n", __func__); + + dprintk("<-- nfs_do_submount() = %p\n", mnt); + return mnt; +} diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c new file mode 100644 index 00000000..792cb13a --- /dev/null +++ b/fs/nfs/nfs2xdr.c @@ -0,0 +1,1157 @@ +/* + * linux/fs/nfs/nfs2xdr.c + * + * XDR functions to encode/decode NFS RPC arguments and results. + * + * Copyright (C) 1992, 1993, 1994 Rick Sladkey + * Copyright (C) 1996 Olaf Kirch + * 04 Aug 1998 Ion Badulescu + * FIFO's need special handling in NFSv2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. 
*/ +#define errno_NFSERR_IO EIO + +/* + * Declare the space requirements for NFS arguments and replies as + * number of 32bit-words + */ +#define NFS_fhandle_sz (8) +#define NFS_sattr_sz (8) +#define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2)) +#define NFS_path_sz (1+(NFS2_MAXPATHLEN>>2)) +#define NFS_fattr_sz (17) +#define NFS_info_sz (5) +#define NFS_entry_sz (NFS_filename_sz+3) + +#define NFS_diropargs_sz (NFS_fhandle_sz+NFS_filename_sz) +#define NFS_removeargs_sz (NFS_fhandle_sz+NFS_filename_sz) +#define NFS_sattrargs_sz (NFS_fhandle_sz+NFS_sattr_sz) +#define NFS_readlinkargs_sz (NFS_fhandle_sz) +#define NFS_readargs_sz (NFS_fhandle_sz+3) +#define NFS_writeargs_sz (NFS_fhandle_sz+4) +#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz) +#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz) +#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz) +#define NFS_symlinkargs_sz (NFS_diropargs_sz+1+NFS_sattr_sz) +#define NFS_readdirargs_sz (NFS_fhandle_sz+2) + +#define NFS_attrstat_sz (1+NFS_fattr_sz) +#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) +#define NFS_readlinkres_sz (2) +#define NFS_readres_sz (1+NFS_fattr_sz+1) +#define NFS_writeres_sz (NFS_attrstat_sz) +#define NFS_stat_sz (1) +#define NFS_readdirres_sz (1) +#define NFS_statfsres_sz (1+NFS_info_sz) + + +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. + */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NFSv2 basic data types + * + * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: + * "NFS: Network File System Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +/* + * typedef opaque nfsdata<>; + */ +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, count); + result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. 
*/ + result->count = count; + return count; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * enum stat { + * NFS_OK = 0, + * NFSERR_PERM = 1, + * NFSERR_NOENT = 2, + * NFSERR_IO = 5, + * NFSERR_NXIO = 6, + * NFSERR_ACCES = 13, + * NFSERR_EXIST = 17, + * NFSERR_NODEV = 19, + * NFSERR_NOTDIR = 20, + * NFSERR_ISDIR = 21, + * NFSERR_FBIG = 27, + * NFSERR_NOSPC = 28, + * NFSERR_ROFS = 30, + * NFSERR_NAMETOOLONG = 63, + * NFSERR_NOTEMPTY = 66, + * NFSERR_DQUOT = 69, + * NFSERR_STALE = 70, + * NFSERR_WFLUSH = 99 + * }; + */ +static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.2. ftype + * + * enum ftype { + * NFNON = 0, + * NFREG = 1, + * NFDIR = 2, + * NFBLK = 3, + * NFCHR = 4, + * NFLNK = 5 + * }; + * + */ +static __be32 *xdr_decode_ftype(__be32 *p, u32 *type) +{ + *type = be32_to_cpup(p++); + if (unlikely(*type > NF2FIFO)) + *type = NFBAD; + return p; +} + +/* + * 2.3.3. fhandle + * + * typedef opaque fhandle[FHSIZE]; + */ +static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + BUG_ON(fh->size != NFS2_FHSIZE); + p = xdr_reserve_space(xdr, NFS2_FHSIZE); + memcpy(p, fh->data, NFS2_FHSIZE); +} + +static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.4. timeval + * + * struct timeval { + * unsigned int seconds; + * unsigned int useconds; + * }; + */ +static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + if (timep->tv_nsec != 0) + *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); + else + *p++ = cpu_to_be32(0); + return p; +} + +/* + * Passing the invalid value useconds=1000000 is a Sun convention for + * "set to current server time". It's needed to make permissions checks + * for the "touch" program across v2 mounts to Solaris and Irix servers + * work correctly. See description of sattr in section 6.1 of "NFS + * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. + */ +static __be32 *xdr_encode_current_server_time(__be32 *p, + const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(1000000); + return p; +} + +static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; + return p; +} + +/* + * 2.3.5. 
fattr + * + * struct fattr { + * ftype type; + * unsigned int mode; + * unsigned int nlink; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * unsigned int blocksize; + * unsigned int rdev; + * unsigned int blocks; + * unsigned int fsid; + * unsigned int fileid; + * timeval atime; + * timeval mtime; + * timeval ctime; + * }; + * + */ +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + u32 rdev, type; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + fattr->valid |= NFS_ATTR_FATTR_V2; + + p = xdr_decode_ftype(p, &type); + + fattr->mode = be32_to_cpup(p++); + fattr->nlink = be32_to_cpup(p++); + fattr->uid = be32_to_cpup(p++); + fattr->gid = be32_to_cpup(p++); + fattr->size = be32_to_cpup(p++); + fattr->du.nfs2.blocksize = be32_to_cpup(p++); + + rdev = be32_to_cpup(p++); + fattr->rdev = new_decode_dev(rdev); + if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) { + fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; + fattr->rdev = 0; + } + + fattr->du.nfs2.blocks = be32_to_cpup(p++); + fattr->fsid.major = be32_to_cpup(p++); + fattr->fsid.minor = 0; + fattr->fileid = be32_to_cpup(p++); + + p = xdr_decode_time(p, &fattr->atime); + p = xdr_decode_time(p, &fattr->mtime); + xdr_decode_time(p, &fattr->ctime); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.6. sattr + * + * struct sattr { + * unsigned int mode; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * timeval atime; + * timeval mtime; + * }; + */ + +#define NFS2_SATTR_NOT_SET (0xffffffff) + +static __be32 *xdr_time_not_set(__be32 *p) +{ + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + return p; +} + +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); + + if (attr->ia_valid & ATTR_MODE) + *p++ = cpu_to_be32(attr->ia_mode); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_UID) + *p++ = cpu_to_be32(attr->ia_uid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_GID) + *p++ = cpu_to_be32(attr->ia_gid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_SIZE) + *p++ = cpu_to_be32((u32)attr->ia_size); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + + if (attr->ia_valid & ATTR_ATIME_SET) + p = xdr_encode_time(p, &attr->ia_atime); + else if (attr->ia_valid & ATTR_ATIME) + p = xdr_encode_current_server_time(p, &attr->ia_atime); + else + p = xdr_time_not_set(p); + if (attr->ia_valid & ATTR_MTIME_SET) + xdr_encode_time(p, &attr->ia_mtime); + else if (attr->ia_valid & ATTR_MTIME) + xdr_encode_current_server_time(p, &attr->ia_mtime); + else + xdr_time_not_set(p); +} + +/* + * 2.3.7. 
filename + * + * typedef string filename; + */ +static void encode_filename(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS2_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +static int decode_filename_inline(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.8. path + * + * typedef string path; + */ +static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS2_MAXPATHLEN); + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(length); + xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_path(struct xdr_stream *xdr) +{ + u32 length, recvd; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p); + if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) + goto out_size; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(length > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, length); + xdr_terminate_string(xdr->buf, length); + return 0; +out_size: + dprintk("NFS: returned pathname too long: %u\n", length); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "length %u > received %u\n", length, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.9. attrstat + * + * union attrstat switch (stat status) { + * case NFS_OK: + * fattr attributes; + * default: + * void; + * }; + */ +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.3.10. diropargs + * + * struct diropargs { + * fhandle dir; + * filename name; + * }; + */ +static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) +{ + encode_fhandle(xdr, fh); + encode_filename(xdr, name, length); +} + +/* + * 2.3.11. 
diropres + * + * union diropres switch (stat status) { + * case NFS_OK: + * struct { + * fhandle file; + * fattr attributes; + * } diropok; + * default: + * void; + * }; + */ +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + int error; + + error = decode_fhandle(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_fattr(xdr, result->fattr); +out: + return error; +} + +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_diropok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + + +/* + * NFSv2 XDR encode functions + * + * NFSv2 argument types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". + */ + +static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) +{ + encode_fhandle(xdr, fh); +} + +/* + * 2.2.3. sattrargs + * + * struct sattrargs { + * fhandle file; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_sattrargs *args) +{ + encode_fhandle(xdr, args->fh); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_diropargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); +} + +static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readlinkargs *args) +{ + encode_fhandle(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); +} + +/* + * 2.2.7. readargs + * + * struct readargs { + * fhandle file; + * unsigned offset; + * unsigned count; + * unsigned totalcount; + * }; + */ +static void encode_readargs(struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + *p = cpu_to_be32(count); +} + +static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + encode_readargs(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; +} + +/* + * 2.2.9. writeargs + * + * struct writeargs { + * fhandle file; + * unsigned beginoffset; + * unsigned offset; + * unsigned totalcount; + * nfsdata data; + * }; + */ +static void encode_writeargs(struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + + /* nfsdata */ + *p = cpu_to_be32(count); + xdr_write_pages(xdr, args->pages, args->pgbase, count); +} + +static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + encode_writeargs(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; +} + +/* + * 2.2.10. 
createargs + * + * struct createargs { + * diropargs where; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_createargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs(xdr, args->fh, args->name.name, args->name.len); +} + +/* + * 2.2.12. renameargs + * + * struct renameargs { + * diropargs from; + * diropargs to; + * }; + */ +static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs(xdr, args->old_dir, old->name, old->len); + encode_diropargs(xdr, args->new_dir, new->name, new->len); +} + +/* + * 2.2.13. linkargs + * + * struct linkargs { + * fhandle from; + * diropargs to; + * }; + */ +static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_linkargs *args) +{ + encode_fhandle(xdr, args->fromfh); + encode_diropargs(xdr, args->tofh, args->toname, args->tolen); +} + +/* + * 2.2.14. symlinkargs + * + * struct symlinkargs { + * diropargs from; + * path to; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_symlinkargs *args) +{ + encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); + encode_path(xdr, args->pages, args->pathlen); + encode_sattr(xdr, args->sattr); +} + +/* + * 2.2.17. readdirargs + * + * struct readdirargs { + * fhandle dir; + * nfscookie cookie; + * unsigned count; + * }; + */ +static void encode_readdirargs(struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(args->cookie); + *p = cpu_to_be32(args->count); +} + +static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + encode_readdirargs(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS_readdirres_sz); +} + +/* + * NFSv2 XDR decode functions + * + * NFSv2 result types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". + */ + +static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + return decode_attrstat(xdr, result); +} + +static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_diropok *result) +{ + return decode_diropres(xdr, result); +} + +/* + * 2.2.6. 
readlinkres + * + * union readlinkres switch (stat status) { + * case NFS_OK: + * path data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_path(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.2.7. readres + * + * union readres switch (stat status) { + * case NFS_OK: + * fattr attributes; + * nfsdata data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readres *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_nfsdata(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + /* All NFSv2 writes are "file sync" writes */ + result->verf->committed = NFS_FILE_SYNC; + return decode_attrstat(xdr, result->fattr); +} + +/** + * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 2.2.17. entry + * + * struct entry { + * unsigned fileid; + * filename name; + * nfscookie cookie; + * entry *nextentry; + * }; + */ +int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->ino = be32_to_cpup(p); + + error = decode_filename_inline(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; + + /* + * The type (size and byte order) of nfscookie isn't defined in + * RFC 1094. This implementation assumes that it's an XDR uint32. + */ + entry->prev_cookie = entry->cookie; + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->cookie = be32_to_cpup(p); + + entry->d_type = DT_UNKNOWN; + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +} + +/* + * 2.2.17. readdirres + * + * union readdirres switch (stat status) { + * case NFS_OK: + * struct { + * entry *entries; + * bool eof; + * } readdirok; + * default: + * void; + * }; + * + * Read the directory contents into the page cache, but don't + * touch them. The actual decoding is done by nfs2_decode_dirent() + * during subsequent nfs_readdir() calls. 
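[Editor's note] nfs2_decode_dirent() is driven by getdents(2) against pages already filled by a READDIR reply: each call consumes one value-follows flag, a fileid, a name and a cookie, returning -EAGAIN at the end of the entry list and -EBADCOOKIE with entry->eof set at end-of-directory. The userspace sketch below walks the same wire layout; the buffer construction and the name-length guard are simplifications assumed for the demo.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* decode one NFSv2 entry from a big-endian word stream; returns NULL when
 * the entry list ends (the following word is the readdirok eof flag) */
static const uint32_t *decode_one(const uint32_t *p, uint32_t *eof)
{
	char name[64];
	uint32_t fileid, len, cookie;

	if (ntohl(*p++) == 0) {            /* value_follows == FALSE */
		*eof = ntohl(*p);
		return NULL;
	}
	fileid = ntohl(*p++);
	len = ntohl(*p++);                 /* filename length in bytes */
	if (len >= sizeof(name))
		return NULL;               /* sketch only: give up on long names */
	memcpy(name, p, len);
	name[len] = '\0';
	p += (len + 3) >> 2;               /* names are padded to a 4-byte word */
	cookie = ntohl(*p++);
	printf("fileid %u  name \"%s\"  cookie %u\n", fileid, name, cookie);
	return p;
}

int main(void)
{
	/* one entry ("a", fileid 2, cookie 7), then end-of-list with eof = 1 */
	const uint32_t buf[] = {
		htonl(1), htonl(2), htonl(1), htonl(0x61000000),
		htonl(7), htonl(0), htonl(1),
	};
	const uint32_t *p = buf;
	uint32_t eof = 0;

	while (p != NULL)
		p = decode_one(p, &eof);
	printf("eof = %u\n", eof);
	return 0;
}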
+ */ +static int decode_readdirok(struct xdr_stream *xdr) +{ + u32 recvd, pglen; + size_t hdrlen; + + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; +} + +static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_readdirok(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.2.18. statfsres + * + * union statfsres (stat status) { + * case NFS_OK: + * struct { + * unsigned tsize; + * unsigned bsize; + * unsigned blocks; + * unsigned bfree; + * unsigned bavail; + * } info; + * default: + * void; + * }; + */ +static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_info_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + result->tsize = be32_to_cpup(p++); + result->bsize = be32_to_cpup(p++); + result->blocks = be32_to_cpup(p++); + result->bfree = be32_to_cpup(p++); + result->bavail = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs2_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_info(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + + +/* + * We need to translate between nfs status return values and + * the local errno values which may not be the same. + */ +static const struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS_OK, 0 }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, +#ifdef EWFLUSH + { NFSERR_WFLUSH, -EWFLUSH }, +#endif + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, -EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } +}; + +/** + * nfs_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is used jointly by NFSv2 and NFSv3. 
+ */ +int nfs_stat_to_errno(enum nfs_stat status) +{ + int i; + + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == (int)status) + return nfs_errtbl[i].errno; + } + dprintk("NFS: Unrecognized nfs status value: %u\n", status); + return nfs_errtbl[i].errno; +} + +#define PROC(proc, argtype, restype, timer) \ +[NFSPROC_##proc] = { \ + .p_proc = NFSPROC_##proc, \ + .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \ + .p_arglen = NFS_##argtype##_sz, \ + .p_replen = NFS_##restype##_sz, \ + .p_timer = timer, \ + .p_statidx = NFSPROC_##proc, \ + .p_name = #proc, \ + } +struct rpc_procinfo nfs_procedures[] = { + PROC(GETATTR, fhandle, attrstat, 1), + PROC(SETATTR, sattrargs, attrstat, 0), + PROC(LOOKUP, diropargs, diropres, 2), + PROC(READLINK, readlinkargs, readlinkres, 3), + PROC(READ, readargs, readres, 3), + PROC(WRITE, writeargs, writeres, 4), + PROC(CREATE, createargs, diropres, 0), + PROC(REMOVE, removeargs, stat, 0), + PROC(RENAME, renameargs, stat, 0), + PROC(LINK, linkargs, stat, 0), + PROC(SYMLINK, symlinkargs, stat, 0), + PROC(MKDIR, createargs, diropres, 0), + PROC(RMDIR, diropargs, stat, 0), + PROC(READDIR, readdirargs, readdirres, 3), + PROC(STATFS, fhandle, statfsres, 0), +}; + +struct rpc_version nfs_version2 = { + .number = 2, + .nrprocs = ARRAY_SIZE(nfs_procedures), + .procs = nfs_procedures +}; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c new file mode 100644 index 00000000..27434277 --- /dev/null +++ b/fs/nfs/nfs3acl.c @@ -0,0 +1,444 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int pos=0, len=0; + +# define output(s) do { \ + if (pos + sizeof(s) <= size) { \ + memcpy(buffer + pos, s, sizeof(s)); \ + pos += sizeof(s); \ + } \ + len += sizeof(s); \ + } while(0) + + acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + output("system.posix_acl_access"); + posix_acl_release(acl); + } + + if (S_ISDIR(inode->i_mode)) { + acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + output("system.posix_acl_default"); + posix_acl_release(acl); + } + } + +# undef output + + if (!buffer || len <= size) + return len; + return -ERANGE; +} + +ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int type, error = 0; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + acl = nfs3_proc_getacl(inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + if (type == ACL_TYPE_ACCESS && acl->a_count == 0) + error = -ENODATA; + else + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + } else + error = -ENODATA; + + return error; +} + +int nfs3_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int type, error; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + 
+ acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + error = nfs3_proc_setacl(inode, type, acl); + posix_acl_release(acl); + + return error; +} + +int nfs3_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + int type; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + return nfs3_proc_setacl(inode, type, NULL); +} + +static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) +{ + if (!IS_ERR(nfsi->acl_access)) { + posix_acl_release(nfsi->acl_access); + nfsi->acl_access = ERR_PTR(-EAGAIN); + } + if (!IS_ERR(nfsi->acl_default)) { + posix_acl_release(nfsi->acl_default); + nfsi->acl_default = ERR_PTR(-EAGAIN); + } +} + +void nfs3_forget_cached_acls(struct inode *inode) +{ + dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, + inode->i_ino); + spin_lock(&inode->i_lock); + __nfs3_forget_cached_acls(NFS_I(inode)); + spin_unlock(&inode->i_lock); +} + +static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct posix_acl *acl = ERR_PTR(-EINVAL); + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + acl = nfsi->acl_access; + break; + + case ACL_TYPE_DEFAULT: + acl = nfsi->acl_default; + break; + + default: + goto out; + } + if (IS_ERR(acl)) + acl = ERR_PTR(-EAGAIN); + else + acl = posix_acl_dup(acl); +out: + spin_unlock(&inode->i_lock); + dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, + inode->i_ino, type, acl); + return acl; +} + +static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, + inode->i_ino, acl, dfacl); + spin_lock(&inode->i_lock); + __nfs3_forget_cached_acls(NFS_I(inode)); + if (!IS_ERR(acl)) + nfsi->acl_access = posix_acl_dup(acl); + if (!IS_ERR(dfacl)) + nfsi->acl_default = posix_acl_dup(dfacl); + spin_unlock(&inode->i_lock); +} + +struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs3_getaclargs args = { + .fh = NFS_FH(inode), + /* The xdr layer may allocate pages here. */ + .pages = pages, + }; + struct nfs3_getaclres res = { + 0 + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct posix_acl *acl; + int status, count; + + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + return ERR_PTR(-EOPNOTSUPP); + + status = nfs_revalidate_inode(server, inode); + if (status < 0) + return ERR_PTR(status); + acl = nfs3_get_cached_acl(inode, type); + if (acl != ERR_PTR(-EAGAIN)) + return acl; + acl = NULL; + + /* + * Only get the access acl when explicitly requested: We don't + * need it for access decisions, and only some applications use + * it. Applications which request the access acl first are not + * penalized from this optimization. 
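[Editor's note] nfs3_get_cached_acl() and nfs3_cache_acls() above rely on a small idiom: an error-valued pointer (ERR_PTR(-EAGAIN)) stored in the per-inode slot means "nothing cached, ask the server", while a real pointer is duplicated and handed back under the inode lock. A userspace sketch of that sentinel pattern, with stand-ins for the kernel's ERR_PTR()/IS_ERR() helpers and reference counting reduced to a bare integer, is shown below.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct acl { int refcount; };

/* userspace stand-ins for the kernel's ERR_PTR()/IS_ERR() helpers */
#define ERR_PTR(err)  ((struct acl *)(long)(err))
#define IS_ERR(ptr)   ((unsigned long)(ptr) >= (unsigned long)-4095L)

static struct acl *cached_access;      /* one slot of the per-inode cache */

static void forget_cached(void)
{
	if (cached_access && !IS_ERR(cached_access))
		free(cached_access);
	cached_access = ERR_PTR(-EAGAIN); /* sentinel: must ask the server */
}

static struct acl *get_cached(void)
{
	if (IS_ERR(cached_access))
		return ERR_PTR(-EAGAIN);   /* nothing usable is cached */
	cached_access->refcount++;         /* hand out an extra reference */
	return cached_access;
}

int main(void)
{
	forget_cached();
	printf("after forget: cached? %s\n", IS_ERR(get_cached()) ? "no" : "yes");

	cached_access = calloc(1, sizeof(*cached_access)); /* reply cached */
	cached_access->refcount = 1;
	printf("after fill:   cached? %s\n", IS_ERR(get_cached()) ? "no" : "yes");

	forget_cached();
	return 0;
}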
+ */ + if (type == ACL_TYPE_ACCESS) + args.mask |= NFS_ACLCNT|NFS_ACL; + if (S_ISDIR(inode->i_mode)) + args.mask |= NFS_DFACLCNT|NFS_DFACL; + if (args.mask == 0) + return NULL; + + dprintk("NFS call getacl\n"); + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return ERR_PTR(-ENOMEM); + + status = rpc_call_sync(server->client_acl, &msg, 0); + dprintk("NFS reply getacl: %d\n", status); + + /* pages may have been allocated at the xdr layer. */ + for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) + __free_page(args.pages[count]); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, res.fattr); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL extension not supported; disabling\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + default: + goto getout; + } + if ((args.mask & res.mask) != args.mask) { + status = -EIO; + goto getout; + } + + if (res.acl_access != NULL) { + if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { + posix_acl_release(res.acl_access); + res.acl_access = NULL; + } + } + nfs3_cache_acls(inode, + (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL), + (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL)); + + switch(type) { + case ACL_TYPE_ACCESS: + acl = res.acl_access; + res.acl_access = NULL; + break; + + case ACL_TYPE_DEFAULT: + acl = res.acl_default; + res.acl_default = NULL; + } + +getout: + posix_acl_release(res.acl_access); + posix_acl_release(res.acl_default); + nfs_free_fattr(res.fattr); + + if (status != 0) { + posix_acl_release(acl); + acl = ERR_PTR(status); + } + return acl; +} + +static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr *fattr; + struct page *pages[NFSACL_MAXPAGES]; + struct nfs3_setaclargs args = { + .inode = inode, + .mask = NFS_ACL, + .acl_access = acl, + .pages = pages, + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &fattr, + }; + int status; + + status = -EOPNOTSUPP; + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + goto out; + + /* We are doing this here because XDR marshalling does not + * return any results, it BUGs. 
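[Editor's note] nfs3_proc_getacl() asks only for what it needs — the access ACL counts and entries when requested, plus the default ACL for directories — and treats a reply whose mask does not cover the request as a protocol error. The sketch below reproduces just that mask arithmetic; the bit values are placeholders rather than the nfsacl.h constants.

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

/* placeholder mask bits for the demo */
#define EX_NFS_ACL      0x0001
#define EX_NFS_ACLCNT   0x0002
#define EX_NFS_DFACL    0x0004
#define EX_NFS_DFACLCNT 0x0008

static int check_getacl_reply(bool want_access, bool is_dir, unsigned res_mask)
{
	unsigned args_mask = 0;

	if (want_access)
		args_mask |= EX_NFS_ACLCNT | EX_NFS_ACL;
	if (is_dir)
		args_mask |= EX_NFS_DFACLCNT | EX_NFS_DFACL;

	/* the server must answer at least what was requested */
	if ((args_mask & res_mask) != args_mask)
		return -EIO;
	return 0;
}

int main(void)
{
	/* directory: access ACL requested, server answered only that part */
	printf("%d\n", check_getacl_reply(true, true, EX_NFS_ACLCNT | EX_NFS_ACL));
	/* same request, complete answer */
	printf("%d\n", check_getacl_reply(true, true, 0x000f));
	return 0;
}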
*/ + status = -ENOSPC; + if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (S_ISDIR(inode->i_mode)) { + args.mask |= NFS_DFACL; + args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); + } + + dprintk("NFS call setacl\n"); + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out_freepages; + + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + msg.rpc_resp = fattr; + status = rpc_call_sync(server->client_acl, &msg, 0); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); + dprintk("NFS reply setacl: %d\n", status); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, fattr); + nfs3_cache_acls(inode, acl, dfacl); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL SETACL RPC not supported" + "(will not retry)\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + } + nfs_free_fattr(fattr); +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } +out: + return status; +} + +int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl *alloc = NULL, *dfacl = NULL; + int status; + + if (S_ISDIR(inode->i_mode)) { + switch(type) { + case ACL_TYPE_ACCESS: + alloc = dfacl = nfs3_proc_getacl(inode, + ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = nfs3_proc_getacl(inode, + ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; + + default: + return -EINVAL; + } + } else if (type != ACL_TYPE_ACCESS) + return -EINVAL; + + if (acl == NULL) { + alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(alloc)) + goto fail; + } + status = nfs3_proc_setacls(inode, acl, dfacl); + posix_acl_release(alloc); + return status; + +fail: + return PTR_ERR(alloc); +} + +int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, + mode_t mode) +{ + struct posix_acl *dfacl, *acl; + int error = 0; + + dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(dfacl)) { + error = PTR_ERR(dfacl); + return (error == -EOPNOTSUPP) ? 0 : error; + } + if (!dfacl) + return 0; + acl = posix_acl_clone(dfacl, GFP_KERNEL); + error = -ENOMEM; + if (!acl) + goto out_release_dfacl; + error = posix_acl_create_masq(acl, &mode); + if (error < 0) + goto out_release_acl; + error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? + dfacl : NULL); +out_release_acl: + posix_acl_release(acl); +out_release_dfacl: + posix_acl_release(dfacl); + return error; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c new file mode 100644 index 00000000..771741f1 --- /dev/null +++ b/fs/nfs/nfs3proc.c @@ -0,0 +1,890 @@ +/* + * linux/fs/nfs/nfs3proc.c + * + * Client-side NFSv3 procedures stubs. 
+ * + * Copyright (C) 1997, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iostat.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ +static int +nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + int res; + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EJUKEBOX && res != -EKEYEXPIRED) + break; + schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!fatal_signal_pending(current)); + return res; +} + +#define rpc_call_sync(clnt, msg, flags) nfs3_rpc_wrapper(clnt, msg, flags) + +static int +nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) +{ + if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) + return 0; + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(inode, NFSIOS_DELAY); + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + +static int +do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("%s: call fsinfo\n", __func__); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(client, &msg, 0); + dprintk("%s: reply fsinfo: %d\n", __func__, status); + if (!(info->fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_resp = info->fattr; + status = rpc_call_sync(client, &msg, 0); + dprintk("%s: reply getattr: %d\n", __func__, status); + } + return status; +} + +/* + * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb + */ +static int +nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_get_root(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + +/* + * One function for each procedure in the NFS protocol. 
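[Editor's note] nfs3_rpc_wrapper() plus the local rpc_call_sync macro above is a small trick: by redefining the name after the wrapper is written, every synchronous call later in nfs3proc.c transparently gains retry-on-"jukebox" behaviour. A userspace sketch of the same pattern, with a stub transport and a placeholder error code, follows.

#include <stdio.h>

#define EX_EJUKEBOX 10008   /* placeholder for the "try again later" error */

static int calls;

static int rpc_call_sync(const char *proc)     /* stand-in transport */
{
	calls++;
	printf("%s attempt %d\n", proc, calls);
	return calls < 3 ? -EX_EJUKEBOX : 0;   /* busy twice, then succeed */
}

static int nfs3_rpc_wrapper(const char *proc)
{
	int res;

	do {
		res = rpc_call_sync(proc);
		if (res != -EX_EJUKEBOX)
			break;
		/* the real code sleeps NFS_JUKEBOX_RETRY_TIME here and also
		 * gives up early when a fatal signal is pending */
	} while (1);
	return res;
}

/* from here on, plain rpc_call_sync() calls pick up the retry behaviour */
#define rpc_call_sync(proc) nfs3_rpc_wrapper(proc)

int main(void)
{
	printf("status %d after %d attempts\n", rpc_call_sync("GETATTR"), calls);
	return 0;
}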
+ */ +static int +nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call getattr\n"); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply getattr: %d\n", status); + return status; +} + +static int +nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = dentry->d_inode; + struct nfs3_sattrargs arg = { + .fh = NFS_FH(inode), + .sattr = sattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_SETATTR], + .rpc_argp = &arg, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (status == 0) + nfs_setattr_update_inode(inode, sattr); + dprintk("NFS reply setattr: %d\n", status); + return status; +} + +static int +nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs3_diropres res = { + .fh = fhandle, + .fattr = fattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LOOKUP], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; + + dprintk("NFS call lookup %s\n", name->name); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + return -ENOMEM; + + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_refresh_inode(dir, res.dir_attr); + if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_argp = fhandle; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + nfs_free_fattr(res.dir_attr); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs3_accessargs arg = { + .fh = NFS_FH(inode), + }; + struct nfs3_accessres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = entry->cred, + }; + int mode = entry->mask; + int status = -ENOMEM; + + dprintk("NFS call access\n"); + + if (mode & MAY_READ) + arg.access |= NFS3_ACCESS_READ; + if (S_ISDIR(inode->i_mode)) { + if (mode & MAY_WRITE) + arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE; + if (mode & MAY_EXEC) + arg.access |= NFS3_ACCESS_LOOKUP; + } else { + if (mode & MAY_WRITE) + arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND; + if (mode & MAY_EXEC) + arg.access |= NFS3_ACCESS_EXECUTE; + } + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, res.fattr); + if (status == 0) { + entry->mask = 0; + if (res.access & NFS3_ACCESS_READ) + entry->mask |= MAY_READ; + if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + } + nfs_free_fattr(res.fattr); +out: + dprintk("NFS reply access: %d\n", status); + return status; +} + +static int 
nfs3_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs_fattr *fattr; + struct nfs3_readlinkargs args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, + .pglen = pglen, + .pages = &page + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], + .rpc_argp = &args, + }; + int status = -ENOMEM; + + dprintk("NFS call readlink\n"); + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + msg.rpc_resp = fattr; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: + dprintk("NFS reply readlink: %d\n", status); + return status; +} + +struct nfs3_createdata { + struct rpc_message msg; + union { + struct nfs3_createargs create; + struct nfs3_mkdirargs mkdir; + struct nfs3_symlinkargs symlink; + struct nfs3_mknodargs mknod; + } arg; + struct nfs3_diropres res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_attr; +}; + +static struct nfs3_createdata *nfs3_alloc_createdata(void) +{ + struct nfs3_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_attr = &data->dir_attr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_attr); + } + return data; +} + +static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +{ + int status; + + status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + nfs_post_op_update_inode(dir, data->res.dir_attr); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + return status; +} + +static void nfs3_free_createdata(struct nfs3_createdata *data) +{ + kfree(data); +} + +/* + * Create a regular file. + */ +static int +nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags, struct nfs_open_context *ctx) +{ + struct nfs3_createdata *data; + mode_t mode = sattr->ia_mode; + int status = -ENOMEM; + + dprintk("NFS call create %s\n", dentry->d_name.name); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE]; + data->arg.create.fh = NFS_FH(dir); + data->arg.create.name = dentry->d_name.name; + data->arg.create.len = dentry->d_name.len; + data->arg.create.sattr = sattr; + + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; + if (flags & O_EXCL) { + data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; + data->arg.create.verifier[0] = jiffies; + data->arg.create.verifier[1] = current->pid; + } + + sattr->ia_mode &= ~current_umask(); + + for (;;) { + status = nfs3_do_create(dir, dentry, data); + + if (status != -ENOTSUPP) + break; + /* If the server doesn't support the exclusive creation + * semantics, try again with simple 'guarded' mode. */ + switch (data->arg.create.createmode) { + case NFS3_CREATE_EXCLUSIVE: + data->arg.create.createmode = NFS3_CREATE_GUARDED; + break; + + case NFS3_CREATE_GUARDED: + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; + break; + + case NFS3_CREATE_UNCHECKED: + goto out; + } + nfs_fattr_init(data->res.dir_attr); + nfs_fattr_init(data->res.fattr); + } + + if (status != 0) + goto out; + + /* When we created the file with exclusive semantics, make + * sure we set the attributes afterwards. 
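[Editor's note] When the server answers -ENOTSUPP, nfs3_proc_create() above retries with a progressively weaker createmode — exclusive, then guarded, then unchecked — and finally gives up. The downgrade ladder in isolation looks like the sketch below; the names and the pretend failure are local to the example.

#include <stdio.h>

enum ex_createmode { EX_EXCLUSIVE, EX_GUARDED, EX_UNCHECKED, EX_GIVE_UP };

static enum ex_createmode downgrade(enum ex_createmode mode)
{
	switch (mode) {
	case EX_EXCLUSIVE: return EX_GUARDED;   /* no exclusive-create support */
	case EX_GUARDED:   return EX_UNCHECKED; /* no guarded-create support   */
	default:           return EX_GIVE_UP;   /* nothing weaker left to try  */
	}
}

int main(void)
{
	enum ex_createmode mode = EX_EXCLUSIVE;

	while (mode != EX_GIVE_UP) {
		printf("trying createmode %d\n", mode);
		/* pretend every attempt fails with -ENOTSUPP */
		mode = downgrade(mode);
	}
	return 0;
}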
*/ + if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { + dprintk("NFS call setattr (post-create)\n"); + + if (!(sattr->ia_valid & ATTR_ATIME_SET)) + sattr->ia_valid |= ATTR_ATIME; + if (!(sattr->ia_valid & ATTR_MTIME_SET)) + sattr->ia_valid |= ATTR_MTIME; + + /* Note: we could use a guarded setattr here, but I'm + * not sure this buys us anything (and I'd have + * to revamp the NFSv3 XDR code) */ + status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); + nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); + dprintk("NFS reply setattr (post-create): %d\n", status); + if (status != 0) + goto out; + } + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +out: + nfs3_free_createdata(data); + dprintk("NFS reply create: %d\n", status); + return status; +} + +static int +nfs3_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs_removeargs arg = { + .fh = NFS_FH(dir), + .name.len = name->len, + .name.name = name->name, + }; + struct nfs_removeres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + dprintk("NFS call remove %s\n", name->name); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_free_fattr(res.dir_attr); +out: + dprintk("NFS reply remove: %d\n", status); + return status; +} + +static void +nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; +} + +static int +nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) +{ + struct nfs_removeres *res; + if (nfs3_async_handle_jukebox(task, dir)) + return 0; + res = task->tk_msg.rpc_resp; + nfs_post_op_update_inode(dir, res->dir_attr); + return 1; +} + +static void +nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; +} + +static int +nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res; + + if (nfs3_async_handle_jukebox(task, old_dir)) + return 0; + res = task->tk_msg.rpc_resp; + + nfs_post_op_update_inode(old_dir, res->old_fattr); + nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; +} + +static int +nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) +{ + struct nfs_renameargs arg = { + .old_dir = NFS_FH(old_dir), + .old_name = old_name, + .new_dir = NFS_FH(new_dir), + .new_name = new_name, + }; + struct nfs_renameres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); + + res.old_fattr = nfs_alloc_fattr(); + res.new_fattr = nfs_alloc_fattr(); + if (res.old_fattr == NULL || res.new_fattr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); + nfs_post_op_update_inode(old_dir, res.old_fattr); + nfs_post_op_update_inode(new_dir, res.new_fattr); +out: + nfs_free_fattr(res.old_fattr); + nfs_free_fattr(res.new_fattr); + dprintk("NFS reply rename: %d\n", status); + return status; +} + +static int +nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs3_linkargs arg = { + .fromfh = NFS_FH(inode), + .tofh = NFS_FH(dir), + .toname = name->name, + .tolen 
= name->len + }; + struct nfs3_linkres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + dprintk("NFS call link %s\n", name->name); + res.fattr = nfs_alloc_fattr(); + res.dir_attr = nfs_alloc_fattr(); + if (res.fattr == NULL || res.dir_attr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); +out: + nfs_free_fattr(res.dir_attr); + nfs_free_fattr(res.fattr); + dprintk("NFS reply link: %d\n", status); + return status; +} + +static int +nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) +{ + struct nfs3_createdata *data; + int status = -ENOMEM; + + if (len > NFS3_MAXPATHLEN) + return -ENAMETOOLONG; + + dprintk("NFS call symlink %s\n", dentry->d_name.name); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK]; + data->arg.symlink.fromfh = NFS_FH(dir); + data->arg.symlink.fromname = dentry->d_name.name; + data->arg.symlink.fromlen = dentry->d_name.len; + data->arg.symlink.pages = &page; + data->arg.symlink.pathlen = len; + data->arg.symlink.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + + nfs3_free_createdata(data); +out: + dprintk("NFS reply symlink: %d\n", status); + return status; +} + +static int +nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) +{ + struct nfs3_createdata *data; + int mode = sattr->ia_mode; + int status = -ENOMEM; + + dprintk("NFS call mkdir %s\n", dentry->d_name.name); + + sattr->ia_mode &= ~current_umask(); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; + data->arg.mkdir.fh = NFS_FH(dir); + data->arg.mkdir.name = dentry->d_name.name; + data->arg.mkdir.len = dentry->d_name.len; + data->arg.mkdir.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + if (status != 0) + goto out; + + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +out: + nfs3_free_createdata(data); + dprintk("NFS reply mkdir: %d\n", status); + return status; +} + +static int +nfs3_proc_rmdir(struct inode *dir, struct qstr *name) +{ + struct nfs_fattr *dir_attr; + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], + .rpc_argp = &arg, + }; + int status = -ENOMEM; + + dprintk("NFS call rmdir %s\n", name->name); + dir_attr = nfs_alloc_fattr(); + if (dir_attr == NULL) + goto out; + + msg.rpc_resp = dir_attr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_post_op_update_inode(dir, dir_attr); + nfs_free_fattr(dir_attr); +out: + dprintk("NFS reply rmdir: %d\n", status); + return status; +} + +/* + * The READDIR implementation is somewhat hackish - we pass the user buffer + * to the encode function, which installs it in the receive iovec. + * The decode function itself doesn't perform any decoding, it just makes + * sure the reply is syntactically correct. + * + * Also note that this implementation handles both plain readdir and + * readdirplus. 
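[Editor's note] nfs3_proc_readdir(), which follows, keeps one cookie verifier per directory (NFS_COOKIEVERF(dir)): it is sent with each request, and the reply decode writes the server's verifier back into the same storage, so the next getdents() continuation echoes whatever the server last returned. A stub-server sketch of that hand-off (the state layout and verifier values are invented for the demo):

#include <stdio.h>
#include <stdint.h>

/* per-directory state the client keeps between READDIR calls */
struct dir_state {
	uint64_t cookie;
	uint32_t verf[2];
};

/* stub server: advances the cookie and stamps a verifier that the client
 * must echo back verbatim with its next request */
static void server_readdir(struct dir_state *dir)
{
	dir->verf[0] = 0xfeedface;
	dir->verf[1] = (uint32_t)dir->cookie;
	dir->cookie += 1;
}

int main(void)
{
	struct dir_state dir = { 0, { 0, 0 } };

	for (int i = 0; i < 3; i++) {
		printf("call %d: cookie %llu, verf %08x:%08x\n", i,
		       (unsigned long long)dir.cookie, dir.verf[0], dir.verf[1]);
		server_readdir(&dir);  /* reply updates cookie and verifier */
	}
	return 0;
}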
+ */ +static int +nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct inode *dir = dentry->d_inode; + __be32 *verf = NFS_COOKIEVERF(dir); + struct nfs3_readdirargs arg = { + .fh = NFS_FH(dir), + .cookie = cookie, + .verf = {verf[0], verf[1]}, + .plus = plus, + .count = count, + .pages = pages + }; + struct nfs3_readdirres res = { + .verf = verf, + .plus = plus + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred + }; + int status = -ENOMEM; + + if (plus) + msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + + dprintk("NFS call readdir%s %d\n", + plus? "plus" : "", (unsigned int) cookie); + + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + + nfs_invalidate_atime(dir); + nfs_refresh_inode(dir, res.dir_attr); + + nfs_free_fattr(res.dir_attr); +out: + dprintk("NFS reply readdir%s: %d\n", + plus? "plus" : "", status); + return status; +} + +static int +nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + dev_t rdev) +{ + struct nfs3_createdata *data; + mode_t mode = sattr->ia_mode; + int status = -ENOMEM; + + dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, + MAJOR(rdev), MINOR(rdev)); + + sattr->ia_mode &= ~current_umask(); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; + data->arg.mknod.fh = NFS_FH(dir); + data->arg.mknod.name = dentry->d_name.name; + data->arg.mknod.len = dentry->d_name.len; + data->arg.mknod.sattr = sattr; + data->arg.mknod.rdev = rdev; + + switch (sattr->ia_mode & S_IFMT) { + case S_IFBLK: + data->arg.mknod.type = NF3BLK; + break; + case S_IFCHR: + data->arg.mknod.type = NF3CHR; + break; + case S_IFIFO: + data->arg.mknod.type = NF3FIFO; + break; + case S_IFSOCK: + data->arg.mknod.type = NF3SOCK; + break; + default: + status = -EINVAL; + goto out; + } + + status = nfs3_do_create(dir, dentry, data); + if (status != 0) + goto out; + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +out: + nfs3_free_createdata(data); + dprintk("NFS reply mknod: %d\n", status); + return status; +} + +static int +nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *stat) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSSTAT], + .rpc_argp = fhandle, + .rpc_resp = stat, + }; + int status; + + dprintk("NFS call fsstat\n"); + nfs_fattr_init(stat->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply fsstat: %d\n", status); + return status; +} + +static int +do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("NFS call fsinfo\n"); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(client, &msg, 0); + dprintk("NFS reply fsinfo: %d\n", status); + return status; +} + +/* + * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via + * nfs_create_server + */ +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_fsinfo(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = 
do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + +static int +nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_PATHCONF], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("NFS call pathconf\n"); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply pathconf: %d\n", status); + return status; +} + +static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; + + nfs_invalidate_atime(data->inode); + nfs_refresh_inode(data->inode, &data->fattr); + return 0; +} + +static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; +} + +static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; + if (task->tk_status >= 0) + nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); + return 0; +} + +static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; +} + +static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (nfs3_async_handle_jukebox(task, data->inode)) + return -EAGAIN; + nfs_refresh_inode(data->inode, data->res.fattr); + return 0; +} + +static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; +} + +static int +nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); +} + +const struct nfs_rpc_ops nfs_v3_clientops = { + .version = 3, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, + .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, + .lookup = nfs3_proc_lookup, + .access = nfs3_proc_access, + .readlink = nfs3_proc_readlink, + .create = nfs3_proc_create, + .remove = nfs3_proc_remove, + .unlink_setup = nfs3_proc_unlink_setup, + .unlink_done = nfs3_proc_unlink_done, + .rename = nfs3_proc_rename, + .rename_setup = nfs3_proc_rename_setup, + .rename_done = nfs3_proc_rename_done, + .link = nfs3_proc_link, + .symlink = nfs3_proc_symlink, + .mkdir = nfs3_proc_mkdir, + .rmdir = nfs3_proc_rmdir, + .readdir = nfs3_proc_readdir, + .mknod = nfs3_proc_mknod, + .statfs = nfs3_proc_statfs, + .fsinfo = nfs3_proc_fsinfo, + .pathconf = nfs3_proc_pathconf, + .decode_dirent = nfs3_decode_dirent, + .read_setup = nfs3_proc_read_setup, + .read_done = nfs3_read_done, + .write_setup = nfs3_proc_write_setup, + .write_done = nfs3_write_done, + .commit_setup = nfs3_proc_commit_setup, + .commit_done = nfs3_commit_done, + .lock = nfs3_proc_lock, + .clear_acl_cache = nfs3_forget_cached_acls, + .close_context = nfs_close_context, + .init_client = nfs_init_client, +}; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c new file mode 100644 index 00000000..183c6b12 --- /dev/null +++ b/fs/nfs/nfs3xdr.c @@ -0,0 +1,2498 @@ +/* + * linux/fs/nfs/nfs3xdr.c + * + * XDR functions to encode/decode NFSv3 RPC arguments and results. 
+ * + * Copyright (C) 1996, 1997 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. */ +#define errno_NFSERR_IO EIO + +/* + * Declare the space requirements for NFS arguments and replies as + * number of 32bit-words + */ +#define NFS3_fhandle_sz (1+16) +#define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */ +#define NFS3_sattr_sz (15) +#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) +#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) +#define NFS3_fattr_sz (21) +#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2) +#define NFS3_wcc_attr_sz (6) +#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) +#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) +#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) +#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) + +#define NFS3_getattrargs_sz (NFS3_fh_sz) +#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz) +#define NFS3_accessargs_sz (NFS3_fh_sz+1) +#define NFS3_readlinkargs_sz (NFS3_fh_sz) +#define NFS3_readargs_sz (NFS3_fh_sz+3) +#define NFS3_writeargs_sz (NFS3_fh_sz+5) +#define NFS3_createargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) +#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) +#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) +#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) +#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) +#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) +#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) +#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3) +#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4) +#define NFS3_commitargs_sz (NFS3_fh_sz+3) + +#define NFS3_getattrres_sz (1+NFS3_fattr_sz) +#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz) +#define NFS3_removeres_sz (NFS3_setattrres_sz) +#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) +#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3) +#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) +#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) +#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2) +#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) +#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) +#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) +#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) + +#define ACL3_getaclargs_sz (NFS3_fh_sz+1) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) + +/* + * Map file type to S_IFMT bits + */ +static const umode_t nfs_type2fmt[] = { + [NF3BAD] = 0, + [NF3REG] = S_IFREG, + [NF3DIR] = S_IFDIR, + [NF3BLK] = S_IFBLK, + [NF3CHR] = S_IFCHR, + [NF3LNK] = S_IFLNK, + [NF3SOCK] = S_IFSOCK, + [NF3FIFO] = S_IFIFO, +}; + +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. 
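+ * The expected reply length (RPC reply header, credential slack, and the
+ * fixed portion of the result) is computed in 32-bit words and shifted
+ * left by two to give xdr_inline_pages() the byte offset at which the
+ * page data begins in the receive buffer.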
+ */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NFSv3 basic data types + * + * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: + * "NFS Version 3 Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_uint32(struct xdr_stream *xdr, u32 value) +{ + __be32 *p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +static int decode_uint32(struct xdr_stream *xdr, u32 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *value = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_uint64(struct xdr_stream *xdr, u64 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(p == NULL)) + goto out_overflow; + xdr_decode_hyper(p, value); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * fileid3 + * + * typedef uint64 fileid3; + */ +static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid) +{ + return xdr_decode_hyper(p, fileid); +} + +static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid) +{ + return decode_uint64(xdr, fileid); +} + +/* + * filename3 + * + * typedef string filename3<>; + */ +static void encode_filename3(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS3_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +static int decode_inline_filename3(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; + +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * nfspath3 + * + * typedef string nfspath3<>; + */ +static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, + const u32 length) +{ + BUG_ON(length > NFS3_MAXPATHLEN); + encode_uint32(xdr, length); + xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_nfspath3(struct xdr_stream *xdr) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) + goto out_nametoolong; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, 
count); + xdr_terminate_string(xdr->buf, count); + return 0; + +out_nametoolong: + dprintk("NFS: returned pathname too long: %u\n", count); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "count %u > recvd %u\n", count, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * cookie3 + * + * typedef uint64 cookie3 + */ +static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie) +{ + return xdr_encode_hyper(p, cookie); +} + +static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie) +{ + return decode_uint64(xdr, cookie); +} + +/* + * cookieverf3 + * + * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE]; + */ +static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier) +{ + memcpy(p, verifier, NFS3_COOKIEVERFSIZE); + return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE); +} + +static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_COOKIEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * createverf3 + * + * typedef opaque createverf3[NFS3_CREATEVERFSIZE]; + */ +static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE); + memcpy(p, verifier, NFS3_CREATEVERFSIZE); +} + +static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_WRITEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * size3 + * + * typedef uint64 size3; + */ +static __be32 *xdr_decode_size3(__be32 *p, u64 *size) +{ + return xdr_decode_hyper(p, size); +} + +/* + * nfsstat3 + * + * enum nfsstat3 { + * NFS3_OK = 0, + * ... 
+ * } + */ +#define NFS3_OK NFS_OK + +static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * ftype3 + * + * enum ftype3 { + * NF3REG = 1, + * NF3DIR = 2, + * NF3BLK = 3, + * NF3CHR = 4, + * NF3LNK = 5, + * NF3SOCK = 6, + * NF3FIFO = 7 + * }; + */ +static void encode_ftype3(struct xdr_stream *xdr, const u32 type) +{ + BUG_ON(type > NF3FIFO); + encode_uint32(xdr, type); +} + +static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode) +{ + u32 type; + + type = be32_to_cpup(p++); + if (type > NF3FIFO) + type = NF3NON; + *mode = nfs_type2fmt[type]; + return p; +} + +/* + * specdata3 + * + * struct specdata3 { + * uint32 specdata1; + * uint32 specdata2; + * }; + */ +static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(MAJOR(rdev)); + *p = cpu_to_be32(MINOR(rdev)); +} + +static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev) +{ + unsigned int major, minor; + + major = be32_to_cpup(p++); + minor = be32_to_cpup(p++); + *rdev = MKDEV(major, minor); + if (MAJOR(*rdev) != major || MINOR(*rdev) != minor) + *rdev = 0; + return p; +} + +/* + * nfs_fh3 + * + * struct nfs_fh3 { + * opaque data; + * }; + */ +static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + BUG_ON(fh->size > NFS3_FHSIZE); + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); +} + +static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > NFS3_FHSIZE)) + goto out_toobig; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = length; + memcpy(fh->data, p, length); + return 0; +out_toobig: + dprintk("NFS: file handle size (%u) too big\n", length); + return -E2BIG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static void zero_nfs_fh3(struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(*fh)); +} + +/* + * nfstime3 + * + * struct nfstime3 { + * uint32 seconds; + * uint32 nseconds; + * }; + */ +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(timep->tv_nsec); + return p; +} + +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++); + return p; +} + +/* + * sattr3 + * + * enum time_how { + * DONT_CHANGE = 0, + * SET_TO_SERVER_TIME = 1, + * SET_TO_CLIENT_TIME = 2 + * }; + * + * union set_mode3 switch (bool set_it) { + * case TRUE: + * mode3 mode; + * default: + * void; + * }; + * + * union set_uid3 switch (bool set_it) { + * case TRUE: + * uid3 uid; + * default: + * void; + * }; + * + * union set_gid3 switch (bool set_it) { + * case TRUE: + * gid3 gid; + * default: + * void; + * }; + * + * union set_size3 switch (bool set_it) { + * case TRUE: + * size3 size; + * default: + * void; + * }; + * + * union set_atime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 atime; + * default: + * void; + * }; + * + * union set_mtime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 mtime; + * default: + 
* void; + * }; + * + * struct sattr3 { + * set_mode3 mode; + * set_uid3 uid; + * set_gid3 gid; + * set_size3 size; + * set_atime atime; + * set_mtime mtime; + * }; + */ +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) +{ + u32 nbytes; + __be32 *p; + + /* + * In order to make only a single xdr_reserve_space() call, + * pre-compute the total number of bytes to be reserved. + * Six boolean values, one for each set_foo field, are always + * present in the encoded result, so start there. + */ + nbytes = 6 * 4; + if (attr->ia_valid & ATTR_MODE) + nbytes += 4; + if (attr->ia_valid & ATTR_UID) + nbytes += 4; + if (attr->ia_valid & ATTR_GID) + nbytes += 4; + if (attr->ia_valid & ATTR_SIZE) + nbytes += 8; + if (attr->ia_valid & ATTR_ATIME_SET) + nbytes += 8; + if (attr->ia_valid & ATTR_MTIME_SET) + nbytes += 8; + p = xdr_reserve_space(xdr, nbytes); + + if (attr->ia_valid & ATTR_MODE) { + *p++ = xdr_one; + *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_UID) { + *p++ = xdr_one; + *p++ = cpu_to_be32(attr->ia_uid); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_GID) { + *p++ = xdr_one; + *p++ = cpu_to_be32(attr->ia_gid); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_SIZE) { + *p++ = xdr_one; + p = xdr_encode_hyper(p, (u64)attr->ia_size); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_ATIME_SET) { + *p++ = xdr_two; + p = xdr_encode_nfstime3(p, &attr->ia_atime); + } else if (attr->ia_valid & ATTR_ATIME) { + *p++ = xdr_one; + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_MTIME_SET) { + *p++ = xdr_two; + xdr_encode_nfstime3(p, &attr->ia_mtime); + } else if (attr->ia_valid & ATTR_MTIME) { + *p = xdr_one; + } else + *p = xdr_zero; +} + +/* + * fattr3 + * + * struct fattr3 { + * ftype3 type; + * mode3 mode; + * uint32 nlink; + * uid3 uid; + * gid3 gid; + * size3 size; + * size3 used; + * specdata3 rdev; + * uint64 fsid; + * fileid3 fileid; + * nfstime3 atime; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + umode_t fmode; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + p = xdr_decode_ftype3(p, &fmode); + + fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; + fattr->nlink = be32_to_cpup(p++); + fattr->uid = be32_to_cpup(p++); + fattr->gid = be32_to_cpup(p++); + + p = xdr_decode_size3(p, &fattr->size); + p = xdr_decode_size3(p, &fattr->du.nfs3.used); + p = xdr_decode_specdata3(p, &fattr->rdev); + + p = xdr_decode_hyper(p, &fattr->fsid.major); + fattr->fsid.minor = 0; + + p = xdr_decode_fileid3(p, &fattr->fileid); + p = xdr_decode_nfstime3(p, &fattr->atime); + p = xdr_decode_nfstime3(p, &fattr->mtime); + xdr_decode_nfstime3(p, &fattr->ctime); + + fattr->valid |= NFS_ATTR_FATTR_V3; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * post_op_attr + * + * union post_op_attr switch (bool attributes_follow) { + * case TRUE: + * fattr3 attributes; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_fattr3(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * wcc_attr + * struct wcc_attr { + * size3 size; + * nfstime3 mtime; + * nfstime3 ctime; + * }; 
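+ *
+ * These are the object's size and timestamps as they were immediately
+ * before the operation; the client compares them against its cached
+ * attributes to decide whether cached data is still valid (weak cache
+ * consistency).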
+ */ +static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + fattr->valid |= NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME; + + p = xdr_decode_size3(p, &fattr->pre_size); + p = xdr_decode_nfstime3(p, &fattr->pre_mtime); + xdr_decode_nfstime3(p, &fattr->pre_ctime); + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * pre_op_attr + * union pre_op_attr switch (bool attributes_follow) { + * case TRUE: + * wcc_attr attributes; + * case FALSE: + * void; + * }; + * + * wcc_data + * + * struct wcc_data { + * pre_op_attr before; + * post_op_attr after; + * }; + */ +static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_wcc_attr(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + int error; + + error = decode_pre_op_attr(xdr, fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, fattr); +out: + return error; +} + +/* + * post_op_fh3 + * + * union post_op_fh3 switch (bool handle_follows) { + * case TRUE: + * nfs_fh3 handle; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_nfs_fh3(xdr, fh); + zero_nfs_fh3(fh); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * diropargs3 + * + * struct diropargs3 { + * nfs_fh3 dir; + * filename3 name; + * }; + */ +static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) +{ + encode_nfs_fh3(xdr, fh); + encode_filename3(xdr, name, length); +} + + +/* + * NFSv3 XDR encode functions + * + * NFSv3 argument types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". 
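+ *
+ * Each encoder below is wired into the nfs3_procedures[] table at the
+ * end of this file through the PROC() macro, which casts it to a
+ * kxdreproc_t.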
+ */ + +/* + * 3.3.1 GETATTR3args + * + * struct GETATTR3args { + * nfs_fh3 object; + * }; + */ +static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) +{ + encode_nfs_fh3(xdr, fh); +} + +/* + * 3.3.2 SETATTR3args + * + * union sattrguard3 switch (bool check) { + * case TRUE: + * nfstime3 obj_ctime; + * case FALSE: + * void; + * }; + * + * struct SETATTR3args { + * nfs_fh3 object; + * sattr3 new_attributes; + * sattrguard3 guard; + * }; + */ +static void encode_sattrguard3(struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + __be32 *p; + + if (args->guard) { + p = xdr_reserve_space(xdr, 4 + 8); + *p++ = xdr_one; + xdr_encode_nfstime3(p, &args->guardtime); + } else { + p = xdr_reserve_space(xdr, 4); + *p = xdr_zero; + } +} + +static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_sattr3(xdr, args->sattr); + encode_sattrguard3(xdr, args); +} + +/* + * 3.3.3 LOOKUP3args + * + * struct LOOKUP3args { + * diropargs3 what; + * }; + */ +static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_diropargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); +} + +/* + * 3.3.4 ACCESS3args + * + * struct ACCESS3args { + * nfs_fh3 object; + * uint32 access; + * }; + */ +static void encode_access3args(struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->access); +} + +static void nfs3_xdr_enc_access3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_access3args(xdr, args); +} + +/* + * 3.3.5 READLINK3args + * + * struct READLINK3args { + * nfs_fh3 symlink; + * }; + */ +static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readlinkargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); +} + +/* + * 3.3.6 READ3args + * + * struct READ3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; + */ +static void encode_read3args(struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + encode_read3args(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS3_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; +} + +/* + * 3.3.7 WRITE3args + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * stable_how stable; + * opaque data<>; + * }; + */ +static void encode_write3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->count); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); +} + +static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + 
encode_write3args(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; +} + +/* + * 3.3.8 CREATE3args + * + * enum createmode3 { + * UNCHECKED = 0, + * GUARDED = 1, + * EXCLUSIVE = 2 + * }; + * + * union createhow3 switch (createmode3 mode) { + * case UNCHECKED: + * case GUARDED: + * sattr3 obj_attributes; + * case EXCLUSIVE: + * createverf3 verf; + * }; + * + * struct CREATE3args { + * diropargs3 where; + * createhow3 how; + * }; + */ +static void encode_createhow3(struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_uint32(xdr, args->createmode); + switch (args->createmode) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + encode_sattr3(xdr, args->sattr); + break; + case NFS3_CREATE_EXCLUSIVE: + encode_createverf3(xdr, args->verifier); + break; + default: + BUG(); + } +} + +static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_createhow3(xdr, args); +} + +/* + * 3.3.9 MKDIR3args + * + * struct MKDIR3args { + * diropargs3 where; + * sattr3 attributes; + * }; + */ +static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mkdirargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_sattr3(xdr, args->sattr); +} + +/* + * 3.3.10 SYMLINK3args + * + * struct symlinkdata3 { + * sattr3 symlink_attributes; + * nfspath3 symlink_data; + * }; + * + * struct SYMLINK3args { + * diropargs3 where; + * symlinkdata3 symlink; + * }; + */ +static void encode_symlinkdata3(struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_nfspath3(xdr, args->pages, args->pathlen); +} + +static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); + encode_symlinkdata3(xdr, args); +} + +/* + * 3.3.11 MKNOD3args + * + * struct devicedata3 { + * sattr3 dev_attributes; + * specdata3 spec; + * }; + * + * union mknoddata3 switch (ftype3 type) { + * case NF3CHR: + * case NF3BLK: + * devicedata3 device; + * case NF3SOCK: + * case NF3FIFO: + * sattr3 pipe_attributes; + * default: + * void; + * }; + * + * struct MKNOD3args { + * diropargs3 where; + * mknoddata3 what; + * }; + */ +static void encode_devicedata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_specdata3(xdr, args->rdev); +} + +static void encode_mknoddata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_ftype3(xdr, args->type); + switch (args->type) { + case NF3CHR: + case NF3BLK: + encode_devicedata3(xdr, args); + break; + case NF3SOCK: + case NF3FIFO: + encode_sattr3(xdr, args->sattr); + break; + case NF3REG: + case NF3DIR: + break; + default: + BUG(); + } +} + +static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_mknoddata3(xdr, args); +} + +/* + * 3.3.12 REMOVE3args + * + * struct REMOVE3args { + * diropargs3 object; + * }; + */ +static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name.name, args->name.len); +} + +/* + * 3.3.14 RENAME3args + * + * struct RENAME3args { + * diropargs3 from; + * diropargs3 to; 
+ * }; + */ +static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs3(xdr, args->old_dir, old->name, old->len); + encode_diropargs3(xdr, args->new_dir, new->name, new->len); +} + +/* + * 3.3.15 LINK3args + * + * struct LINK3args { + * nfs_fh3 file; + * diropargs3 link; + * }; + */ +static void nfs3_xdr_enc_link3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_linkargs *args) +{ + encode_nfs_fh3(xdr, args->fromfh); + encode_diropargs3(xdr, args->tofh, args->toname, args->tolen); +} + +/* + * 3.3.16 READDIR3args + * + * struct READDIR3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 count; + * }; + */ +static void encode_readdir3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdir3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); +} + +/* + * 3.3.17 READDIRPLUS3args + * + * struct READDIRPLUS3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 dircount; + * count3 maxcount; + * }; + */ +static void encode_readdirplus3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + + /* + * readdirplus: need dircount + buffer size. 
+ * We just make sure we make dircount big enough + */ + *p++ = cpu_to_be32(args->count >> 3); + + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdirplus3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); +} + +/* + * 3.3.21 COMMIT3args + * + * struct COMMIT3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; + */ +static void encode_commit3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + encode_commit3args(xdr, args); +} + +#ifdef CONFIG_NFS_V3_ACL + +static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_getaclargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->mask); + if (args->mask & (NFS_ACL | NFS_DFACL)) + prepare_reply_buffer(req, args->pages, 0, + NFSACL_MAXPAGES << PAGE_SHIFT, + ACL3_getaclres_sz); +} + +static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_setaclargs *args) +{ + unsigned int base; + int error; + + encode_nfs_fh3(xdr, NFS_FH(args->inode)); + encode_uint32(xdr, args->mask); + + base = req->rq_slen; + if (args->npages != 0) + xdr_write_pages(xdr, args->pages, 0, args->len); + else + xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE); + + error = nfsacl_encode(xdr->buf, base, args->inode, + (args->mask & NFS_ACL) ? + args->acl_access : NULL, 1, 0); + BUG_ON(error < 0); + error = nfsacl_encode(xdr->buf, base + error, args->inode, + (args->mask & NFS_DFACL) ? + args->acl_default : NULL, 1, + NFS_ACL_DEFAULT); + BUG_ON(error < 0); +} + +#endif /* CONFIG_NFS_V3_ACL */ + +/* + * NFSv3 XDR decode functions + * + * NFSv3 result types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". 
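+ *
+ * Each decoder first reads the nfsstat3 discriminator and then decodes
+ * the matching union arm; the failure arms still consume any post_op_attr
+ * or wcc_data present, so cached attributes are refreshed before the
+ * status is mapped to a local errno via nfs_stat_to_errno().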
+ */ + +/* + * 3.3.1 GETATTR3res + * + * struct GETATTR3resok { + * fattr3 obj_attributes; + * }; + * + * union GETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * GETATTR3resok resok; + * default: + * void; + * }; + */ +static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_fattr3(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.2 SETATTR3res + * + * struct SETATTR3resok { + * wcc_data obj_wcc; + * }; + * + * struct SETATTR3resfail { + * wcc_data obj_wcc; + * }; + * + * union SETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * SETATTR3resok resok; + * default: + * SETATTR3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.3 LOOKUP3res + * + * struct LOOKUP3resok { + * nfs_fh3 object; + * post_op_attr obj_attributes; + * post_op_attr dir_attributes; + * }; + * + * struct LOOKUP3resfail { + * post_op_attr dir_attributes; + * }; + * + * union LOOKUP3res switch (nfsstat3 status) { + * case NFS3_OK: + * LOOKUP3resok resok; + * default: + * LOOKUP3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfs_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->dir_attr); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} + +/* + * 3.3.4 ACCESS3res + * + * struct ACCESS3resok { + * post_op_attr obj_attributes; + * uint32 access; + * }; + * + * struct ACCESS3resfail { + * post_op_attr obj_attributes; + * }; + * + * union ACCESS3res switch (nfsstat3 status) { + * case NFS3_OK: + * ACCESS3resok resok; + * default: + * ACCESS3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_accessres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_uint32(xdr, &result->access); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.5 READLINK3res + * + * struct READLINK3resok { + * post_op_attr symlink_attributes; + * nfspath3 data; + * }; + * + * struct READLINK3resfail { + * post_op_attr symlink_attributes; + * }; + * + * union READLINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * READLINK3resok resok; + * default: + * READLINK3resfail resfail; + * }; + */ 
+static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfspath3(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.6 READ3res + * + * struct READ3resok { + * post_op_attr file_attributes; + * count3 count; + * bool eof; + * opaque data<>; + * }; + * + * struct READ3resfail { + * post_op_attr file_attributes; + * }; + * + * union READ3res switch (nfsstat3 status) { + * case NFS3_OK: + * READ3resok resok; + * default: + * READ3resfail resfail; + * }; + */ +static int decode_read3resok(struct xdr_stream *xdr, + struct nfs_readres *result) +{ + u32 eof, count, ocount, recvd; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p++); + eof = be32_to_cpup(p++); + ocount = be32_to_cpup(p++); + if (unlikely(ocount != count)) + goto out_mismatch; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + +out: + xdr_read_pages(xdr, count); + result->eof = eof; + result->count = count; + return count; +out_mismatch: + dprintk("NFS: READ count doesn't match length of opaque: " + "count %u != ocount %u\n", count, ocount); + return -EIO; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_read3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.7 WRITE3res + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3resok { + * wcc_data file_wcc; + * count3 count; + * stable_how committed; + * writeverf3 verf; + * }; + * + * struct WRITE3resfail { + * wcc_data file_wcc; + * }; + * + * union WRITE3res switch (nfsstat3 status) { + * case NFS3_OK: + * WRITE3resok resok; + * default: + * WRITE3resfail resfail; + * }; + */ +static int decode_write3resok(struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + result->count = be32_to_cpup(p++); + result->verf->committed = be32_to_cpup(p++); + if (unlikely(result->verf->committed > NFS_FILE_SYNC)) + goto out_badvalue; + memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE); + return result->count; +out_badvalue: + dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) 
+ goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_write3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.8 CREATE3res + * + * struct CREATE3resok { + * post_op_fh3 obj; + * post_op_attr obj_attributes; + * wcc_data dir_wcc; + * }; + * + * struct CREATE3resfail { + * wcc_data dir_wcc; + * }; + * + * union CREATE3res switch (nfsstat3 status) { + * case NFS3_OK: + * CREATE3resok resok; + * default: + * CREATE3resfail resfail; + * }; + */ +static int decode_create3resok(struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + int error; + + error = decode_post_op_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + /* The server isn't required to return a file handle. + * If it didn't, force the client to perform a LOOKUP + * to determine the correct file handle and attribute + * values for the new object. */ + if (result->fh->size == 0) + result->fattr->valid = 0; + error = decode_wcc_data(xdr, result->dir_attr); +out: + return error; +} + +static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_create3resok(xdr, result); +out: + return error; +out_default: + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} + +/* + * 3.3.12 REMOVE3res + * + * struct REMOVE3resok { + * wcc_data dir_wcc; + * }; + * + * struct REMOVE3resfail { + * wcc_data dir_wcc; + * }; + * + * union REMOVE3res switch (nfsstat3 status) { + * case NFS3_OK: + * REMOVE3resok resok; + * default: + * REMOVE3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_removeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.14 RENAME3res + * + * struct RENAME3resok { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * struct RENAME3resfail { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * union RENAME3res switch (nfsstat3 status) { + * case NFS3_OK: + * RENAME3resok resok; + * default: + * RENAME3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_renameres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->old_fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->new_fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.15 LINK3res + * + * struct LINK3resok { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * struct LINK3resfail { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * union LINK3res switch 
(nfsstat3 status) { + * case NFS3_OK: + * LINK3resok resok; + * default: + * LINK3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs3_linkres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/** + * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in + * the local page cache + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 3.3.16 entry3 + * + * struct entry3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * fhandle3 filehandle; + * post_op_attr3 attributes; + * entry3 *nextentry; + * }; + * + * 3.3.17 entryplus3 + * struct entryplus3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * post_op_attr name_attributes; + * post_op_fh3 name_handle; + * entryplus3 *nextentry; + * }; + */ +int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + struct nfs_entry old = *entry; + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + error = decode_fileid3(xdr, &entry->ino); + if (unlikely(error)) + return error; + + error = decode_inline_filename3(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; + + entry->prev_cookie = entry->cookie; + error = decode_cookie3(xdr, &entry->cookie); + if (unlikely(error)) + return error; + + entry->d_type = DT_UNKNOWN; + + if (plus) { + entry->fattr->valid = 0; + error = decode_post_op_attr(xdr, entry->fattr); + if (unlikely(error)) + return error; + if (entry->fattr->valid & NFS_ATTR_FATTR_V3) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + + /* In fact, a post_op_fh3: */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) { + error = decode_nfs_fh3(xdr, entry->fh); + if (unlikely(error)) { + if (error == -E2BIG) + goto out_truncated; + return error; + } + } else + zero_nfs_fh3(entry->fh); + } + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +out_truncated: + dprintk("NFS: directory entry contains invalid file handle\n"); + *entry = old; + return -EAGAIN; +} + +/* + * 3.3.16 READDIR3res + * + * struct dirlist3 { + * entry3 *entries; + * bool eof; + * }; + * + * struct READDIR3resok { + * post_op_attr dir_attributes; + * cookieverf3 cookieverf; + * dirlist3 reply; + * }; + * + * struct READDIR3resfail { + * post_op_attr dir_attributes; + * }; + * + * union READDIR3res switch (nfsstat3 status) { + * case NFS3_OK: + * READDIR3resok resok; + * default: + * READDIR3resfail resfail; + * }; + * + * Read the directory 
contents into the page cache, but otherwise + * don't touch them. The actual decoding is done by nfs3_decode_entry() + * during subsequent nfs_readdir() calls. + */ +static int decode_dirlist3(struct xdr_stream *xdr) +{ + u32 recvd, pglen; + size_t hdrlen; + + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; +} + +static int decode_readdir3resok(struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + int error; + + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + /* XXX: do we need to check if result->verf != NULL ? */ + error = decode_cookieverf3(xdr, result->verf); + if (unlikely(error)) + goto out; + error = decode_dirlist3(xdr); +out: + return error; +} + +static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_readdir3resok(xdr, result); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} + +/* + * 3.3.18 FSSTAT3res + * + * struct FSSTAT3resok { + * post_op_attr obj_attributes; + * size3 tbytes; + * size3 fbytes; + * size3 abytes; + * size3 tfiles; + * size3 ffiles; + * size3 afiles; + * uint32 invarsec; + * }; + * + * struct FSSTAT3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSSTAT3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSSTAT3resok resok; + * default: + * FSSTAT3resfail resfail; + * }; + */ +static int decode_fsstat3resok(struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8 * 6 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + p = xdr_decode_size3(p, &result->tbytes); + p = xdr_decode_size3(p, &result->fbytes); + p = xdr_decode_size3(p, &result->abytes); + p = xdr_decode_size3(p, &result->tfiles); + p = xdr_decode_size3(p, &result->ffiles); + xdr_decode_size3(p, &result->afiles); + /* ignore invarsec */ + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsstat3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.19 FSINFO3res + * + * struct FSINFO3resok { + * post_op_attr obj_attributes; + * uint32 rtmax; + * uint32 rtpref; + * uint32 rtmult; + * uint32 wtmax; + * uint32 wtpref; + * uint32 wtmult; + * uint32 dtpref; + * size3 maxfilesize; + * nfstime3 time_delta; + * uint32 properties; + * }; + * + * struct FSINFO3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSINFO3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSINFO3resok resok; + * default: + * FSINFO3resfail resfail; + * }; + */ +static int 
decode_fsinfo3resok(struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + result->rtmax = be32_to_cpup(p++); + result->rtpref = be32_to_cpup(p++); + result->rtmult = be32_to_cpup(p++); + result->wtmax = be32_to_cpup(p++); + result->wtpref = be32_to_cpup(p++); + result->wtmult = be32_to_cpup(p++); + result->dtpref = be32_to_cpup(p++); + p = xdr_decode_size3(p, &result->maxfilesize); + xdr_decode_nfstime3(p, &result->time_delta); + + /* ignore properties */ + result->lease_time = 0; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsinfo3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.20 PATHCONF3res + * + * struct PATHCONF3resok { + * post_op_attr obj_attributes; + * uint32 linkmax; + * uint32 name_max; + * bool no_trunc; + * bool chown_restricted; + * bool case_insensitive; + * bool case_preserving; + * }; + * + * struct PATHCONF3resfail { + * post_op_attr obj_attributes; + * }; + * + * union PATHCONF3res switch (nfsstat3 status) { + * case NFS3_OK: + * PATHCONF3resok resok; + * default: + * PATHCONF3resfail resfail; + * }; + */ +static int decode_pathconf3resok(struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * 6); + if (unlikely(p == NULL)) + goto out_overflow; + result->max_link = be32_to_cpup(p++); + result->max_namelen = be32_to_cpup(p); + /* ignore remaining fields */ + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_pathconf3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +/* + * 3.3.21 COMMIT3res + * + * struct COMMIT3resok { + * wcc_data file_wcc; + * writeverf3 verf; + * }; + * + * struct COMMIT3resfail { + * wcc_data file_wcc; + * }; + * + * union COMMIT3res switch (nfsstat3 status) { + * case NFS3_OK: + * COMMIT3resok resok; + * default: + * COMMIT3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_writeverf3(xdr, result->verf->verifier); +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} + +#ifdef CONFIG_NFS_V3_ACL + +static inline int decode_getacl3resok(struct xdr_stream *xdr, + struct nfs3_getaclres *result) +{ + struct posix_acl **acl; + unsigned int *aclcnt; + size_t hdrlen; + int error; + + error = 
decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_uint32(xdr, &result->mask); + if (unlikely(error)) + goto out; + error = -EINVAL; + if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + goto out; + + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + + acl = NULL; + if (result->mask & NFS_ACL) + acl = &result->acl_access; + aclcnt = NULL; + if (result->mask & NFS_ACLCNT) + aclcnt = &result->acl_access_count; + error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl); + if (unlikely(error <= 0)) + goto out; + + acl = NULL; + if (result->mask & NFS_DFACL) + acl = &result->acl_default; + aclcnt = NULL; + if (result->mask & NFS_DFACLCNT) + aclcnt = &result->acl_default_count; + error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl); + if (unlikely(error <= 0)) + return error; + error = 0; +out: + return error; +} + +static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_getaclres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_getacl3resok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_post_op_attr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +#endif /* CONFIG_NFS_V3_ACL */ + +#define PROC(proc, argtype, restype, timer) \ +[NFS3PROC_##proc] = { \ + .p_proc = NFS3PROC_##proc, \ + .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \ + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \ + .p_arglen = NFS3_##argtype##args_sz, \ + .p_replen = NFS3_##restype##res_sz, \ + .p_timer = timer, \ + .p_statidx = NFS3PROC_##proc, \ + .p_name = #proc, \ + } + +struct rpc_procinfo nfs3_procedures[] = { + PROC(GETATTR, getattr, getattr, 1), + PROC(SETATTR, setattr, setattr, 0), + PROC(LOOKUP, lookup, lookup, 2), + PROC(ACCESS, access, access, 1), + PROC(READLINK, readlink, readlink, 3), + PROC(READ, read, read, 3), + PROC(WRITE, write, write, 4), + PROC(CREATE, create, create, 0), + PROC(MKDIR, mkdir, create, 0), + PROC(SYMLINK, symlink, create, 0), + PROC(MKNOD, mknod, create, 0), + PROC(REMOVE, remove, remove, 0), + PROC(RMDIR, lookup, setattr, 0), + PROC(RENAME, rename, rename, 0), + PROC(LINK, link, link, 0), + PROC(READDIR, readdir, readdir, 3), + PROC(READDIRPLUS, readdirplus, readdir, 3), + PROC(FSSTAT, getattr, fsstat, 0), + PROC(FSINFO, getattr, fsinfo, 0), + PROC(PATHCONF, getattr, pathconf, 0), + PROC(COMMIT, commit, commit, 5), +}; + +struct rpc_version nfs_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(nfs3_procedures), + .procs = nfs3_procedures +}; + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_procinfo nfs3_acl_procedures[] = { + [ACLPROC3_GETACL] = { + .p_proc = ACLPROC3_GETACL, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res, + .p_arglen = ACL3_getaclargs_sz, + .p_replen = ACL3_getaclres_sz, + .p_timer = 1, + .p_name = "GETACL", + }, + [ACLPROC3_SETACL] = { + .p_proc = ACLPROC3_SETACL, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res, + .p_arglen = 
ACL3_setaclargs_sz, + .p_replen = ACL3_setaclres_sz, + .p_timer = 0, + .p_name = "SETACL", + }, +}; + +struct rpc_version nfsacl_version3 = { + .number = 3, + .nrprocs = sizeof(nfs3_acl_procedures)/ + sizeof(nfs3_acl_procedures[0]), + .procs = nfs3_acl_procedures, +}; +#endif /* CONFIG_NFS_V3_ACL */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h new file mode 100644 index 00000000..e1c1365b --- /dev/null +++ b/fs/nfs/nfs4_fs.h @@ -0,0 +1,383 @@ +/* + * linux/fs/nfs/nfs4_fs.h + * + * Copyright (C) 2005 Trond Myklebust + * + * NFSv4-specific filesystem definitions and declarations + */ + +#ifndef __LINUX_FS_NFS_NFS4_FS_H +#define __LINUX_FS_NFS_NFS4_FS_H + +#ifdef CONFIG_NFS_V4 + +struct idmap; + +/* + * In a seqid-mutating op, this macro controls which error return + * values trigger incrementation of the seqid. + * + * from rfc 3010: + * The client MUST monotonically increment the sequence number for the + * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE + * operations. This is true even in the event that the previous + * operation that used the sequence number received an error. The only + * exception to this rule is if the previous operation received one of + * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, + * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, + * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. + * + */ +#define seqid_mutating_err(err) \ +(((err) != NFSERR_STALE_CLIENTID) && \ + ((err) != NFSERR_STALE_STATEID) && \ + ((err) != NFSERR_BAD_STATEID) && \ + ((err) != NFSERR_BAD_SEQID) && \ + ((err) != NFSERR_BAD_XDR) && \ + ((err) != NFSERR_RESOURCE) && \ + ((err) != NFSERR_NOFILEHANDLE)) + +enum nfs4_client_state { + NFS4CLNT_MANAGER_RUNNING = 0, + NFS4CLNT_CHECK_LEASE, + NFS4CLNT_LEASE_EXPIRED, + NFS4CLNT_RECLAIM_REBOOT, + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_LAYOUTRECALL, + NFS4CLNT_SESSION_RESET, + NFS4CLNT_RECALL_SLOT, + NFS4CLNT_LEASE_CONFIRM, +}; + +enum nfs4_session_state { + NFS4_SESSION_INITING, + NFS4_SESSION_DRAINING, +}; + +struct nfs4_minor_version_ops { + u32 minor_version; + + int (*call_sync)(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); + int (*validate_stateid)(struct nfs_delegation *, + const nfs4_stateid *); + const struct nfs4_state_recovery_ops *reboot_recovery_ops; + const struct nfs4_state_recovery_ops *nograce_recovery_ops; + const struct nfs4_state_maintenance_ops *state_renewal_ops; +}; + +/* + * struct rpc_sequence ensures that RPC calls are sent in the exact + * order that they appear on the list. + */ +struct rpc_sequence { + struct rpc_wait_queue wait; /* RPC call delay queue */ + spinlock_t lock; /* Protects the list */ + struct list_head list; /* Defines sequence of RPC calls */ +}; + +#define NFS_SEQID_CONFIRMED 1 +struct nfs_seqid_counter { + struct rpc_sequence *sequence; + int flags; + u32 counter; +}; + +struct nfs_seqid { + struct nfs_seqid_counter *sequence; + struct list_head list; +}; + +static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) +{ + if (seqid_mutating_err(-status)) + seqid->flags |= NFS_SEQID_CONFIRMED; +} + +struct nfs_unique_id { + struct rb_node rb_node; + __u64 id; +}; + +/* + * NFS4 state_owners and lock_owners are simply labels for ordered + * sequences of RPC calls. Their sole purpose is to provide once-only + * semantics by allowing the server to identify replayed requests. 
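+ * To that end each owner embeds its own nfs_seqid_counter and
+ * rpc_sequence, so seqid-mutating operations issued on its behalf are
+ * strictly ordered.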
+ */ +struct nfs4_state_owner { + struct nfs_unique_id so_owner_id; + struct nfs_server *so_server; + struct rb_node so_server_node; + + struct rpc_cred *so_cred; /* Associated cred */ + + spinlock_t so_lock; + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; +}; + +enum { + NFS_OWNER_RECLAIM_REBOOT, + NFS_OWNER_RECLAIM_NOGRACE +}; + +#define NFS_LOCK_NEW 0 +#define NFS_LOCK_RECLAIM 1 +#define NFS_LOCK_EXPIRED 2 + +/* + * struct nfs4_state maintains the client-side state for a given + * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). + * + * OPEN: + * In order to know when to OPEN_DOWNGRADE or CLOSE the state on the server, + * we need to know how many files are open for reading or writing on a + * given inode. This information too is stored here. + * + * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) + */ + +struct nfs4_lock_owner { + unsigned int lo_type; +#define NFS4_ANY_LOCK_TYPE (0U) +#define NFS4_FLOCK_LOCK_TYPE (1U << 0) +#define NFS4_POSIX_LOCK_TYPE (1U << 1) + union { + fl_owner_t posix_owner; + pid_t flock_owner; + } lo_u; +}; + +struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +#define NFS_LOCK_INITIALIZED 1 + int ls_flags; + struct nfs_seqid_counter ls_seqid; + struct rpc_sequence ls_sequence; + struct nfs_unique_id ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; + struct nfs4_lock_owner ls_owner; +}; + +/* bits for nfs4_state->flags */ +enum { + LK_STATE_IN_USE, + NFS_DELEGATED_STATE, /* Current stateid is delegation */ + NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ + NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ + NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ + NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ + NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ + NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ +}; + +struct nfs4_state { + struct list_head open_states; /* List of states for the same state_owner */ + struct list_head inode_states; /* List of states for the same inode */ + struct list_head lock_states; /* List of subservient lock stateids */ + + struct nfs4_state_owner *owner; /* Pointer to the open owner */ + struct inode *inode; /* Pointer to the inode */ + + unsigned long flags; /* Do we hold any locks? 
*/ + spinlock_t state_lock; /* Protects the lock_states list */ + + seqlock_t seqlock; /* Protects the stateid/open_stateid */ + nfs4_stateid stateid; /* Current stateid: may be delegation */ + nfs4_stateid open_stateid; /* OPEN stateid */ + + /* The following 3 fields are protected by owner->so_lock */ + unsigned int n_rdonly; /* Number of read-only references */ + unsigned int n_wronly; /* Number of write-only references */ + unsigned int n_rdwr; /* Number of read/write references */ + fmode_t state; /* State on the server (R,W, or RW) */ + atomic_t count; +}; + + +struct nfs4_exception { + long timeout; + int retry; + struct nfs4_state *state; + struct inode *inode; +}; + +struct nfs4_state_recovery_ops { + int owner_flag_bit; + int state_flag_bit; + int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); + int (*recover_lock)(struct nfs4_state *, struct file_lock *); + int (*establish_clid)(struct nfs_client *, struct rpc_cred *); + struct rpc_cred * (*get_clid_cred)(struct nfs_client *); + int (*reclaim_complete)(struct nfs_client *); +}; + +struct nfs4_state_maintenance_ops { + int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *); + struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); + int (*renew_lease)(struct nfs_client *, struct rpc_cred *); +}; + +extern const struct dentry_operations nfs4_dentry_operations; +extern const struct inode_operations nfs4_dir_inode_operations; + +/* nfs4proc.c */ +extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); +extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); +extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); +extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); +extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); +extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); +extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page); +extern void nfs4_release_lockowner(const struct nfs4_lock_state *); +extern const struct xattr_handler *nfs4_xattr_handlers[]; + +#if defined(CONFIG_NFS_V4_1) +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return server->nfs_client->cl_session; +} + +extern int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); +extern int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); +extern void nfs4_destroy_session(struct nfs4_session *session); +extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern int nfs4_proc_create_session(struct nfs_client *); +extern int nfs4_proc_destroy_session(struct nfs4_session *); +extern int nfs4_init_session(struct nfs_server *server); +extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); +extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, + bool 
sync); + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) == + EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; +} +#else /* CONFIG_NFS_v4_1 */ +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return NULL; +} + +static inline int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) +{ + return 0; +} + +static inline int nfs4_init_session(struct nfs_server *server) +{ + return 0; +} + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return false; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return false; +} +#endif /* CONFIG_NFS_V4_1 */ + +extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + +extern const u32 nfs4_fattr_bitmap[2]; +extern const u32 nfs4_statfs_bitmap[2]; +extern const u32 nfs4_pathconf_bitmap[2]; +extern const u32 nfs4_fsinfo_bitmap[2]; +extern const u32 nfs4_fs_locations_bitmap[2]; + +/* nfs4renewd.c */ +extern void nfs4_schedule_state_renewal(struct nfs_client *); +extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); +extern void nfs4_kill_renewd(struct nfs_client *); +extern void nfs4_renew_state(struct work_struct *); + +/* nfs4state.c */ +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); +#if defined(CONFIG_NFS_V4_1) +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); +struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); +extern void nfs4_schedule_session_recovery(struct nfs4_session *); +#else +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); +extern void nfs4_put_state_owner(struct nfs4_state_owner *); +extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); +extern void nfs4_put_open_state(struct nfs4_state *); +extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); +extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); +extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); +extern void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); +extern void nfs4_schedule_lease_recovery(struct nfs_client *); +extern void nfs4_schedule_state_manager(struct nfs_client *); +extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_recall_slot(struct nfs_client *clp); +extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); +extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); + +extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); +extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); +extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +extern void 
nfs_release_seqid(struct nfs_seqid *seqid); +extern void nfs_free_seqid(struct nfs_seqid *seqid); + +extern const nfs4_stateid zero_stateid; + +/* nfs4xdr.c */ +extern struct rpc_procinfo nfs4_procedures[]; + +struct nfs4_mount_data; + +/* callback_xdr.c */ +extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; + +#else + +#define nfs4_close_state(a, b, c) do { } while (0) +#define nfs4_close_sync(a, b, c) do { } while (0) + +#endif /* CONFIG_NFS_V4 */ +#endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c new file mode 100644 index 00000000..75af8121 --- /dev/null +++ b/fs/nfs/nfs4filelayout.c @@ -0,0 +1,914 @@ +/* + * Module for the pnfs nfs4 file layout driver. + * Defines all I/O and Policy interface operations, plus code + * to register itself with the pNFS client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. 
+ */ + +#include +#include + +#include "internal.h" +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand "); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +#define FILELAYOUT_POLL_RETRY_MAX (15*HZ) + +static loff_t +filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, + loff_t offset) +{ + u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; + u64 tmp; + + offset -= flseg->pattern_offset; + tmp = offset; + do_div(tmp, stripe_width); + + return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); +} + +/* This function is used by the layout driver to calculate the + * offset of the file on the dserver based on whether the + * layout type is STRIPE_DENSE or STRIPE_SPARSE + */ +static loff_t +filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + switch (flseg->stripe_type) { + case STRIPE_SPARSE: + return offset; + + case STRIPE_DENSE: + return filelayout_get_dense_offset(flseg, offset); + } + + BUG(); +} + +/* For data server errors we don't recover from */ +static void +filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} + +static int filelayout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + int *reset) +{ + if (task->tk_status >= 0) + return 0; + + *reset = 0; + + switch (task->tk_status) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + break; + default: + dprintk("%s DS error. Retry through MDS %d\n", __func__, + task->tk_status); + *reset = 1; + break; + } + task->tk_status = 0; + return -EAGAIN; +} + +/* NFS_PROTO call done callback routines */ + +static int filelayout_read_done_cb(struct rpc_task *task, + struct nfs_read_data *data) +{ + struct nfs_client *clp = data->ds_clp; + int reset = 0; + + dprintk("%s DS read\n", __func__); + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_read(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } + nfs_restart_rpc(task, clp); + return -EAGAIN; + } + + return 0; +} + +/* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. 
+ */ +static void +filelayout_set_layoutcommit(struct nfs_write_data *wdata) +{ + if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds || + wdata->res.verf->committed == NFS_FILE_SYNC) + return; + + pnfs_set_layoutcommit(wdata); + dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, + (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb); +} + +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void filelayout_read_prepare(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rdata->read_done_cb = filelayout_read_done_cb; + + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, + &rdata->args.seq_args, &rdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + /* Note this may cause RPC to be resent */ + rdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_read_release(void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rdata->mds_ops->rpc_release(data); +} + +static int filelayout_write_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + struct nfs_client *clp; + + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_write(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } else + clp = data->ds_clp; + nfs_restart_rpc(task, clp); + return -EAGAIN; + } + + filelayout_set_layoutcommit(data); + return 0; +} + +/* Fake up some data that will cause nfs_commit_release to retry the writes. 
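+ * The verifier copied from the first request is then deliberately bumped in
+ * its first byte, so the verifier comparison in the commit completion path
+ * can never succeed and every page on the list is queued for resending.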
*/ +static void prepare_to_resend_writes(struct nfs_write_data *data) +{ + struct nfs_page *first = nfs_list_entry(data->pages.next); + + data->task.tk_status = 0; + memcpy(data->verf.verifier, first->wb_verf.verifier, + sizeof(first->wb_verf.verifier)); + data->verf.verifier[0]++; /* ensure verifier mismatch */ +} + +static int filelayout_commit_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + prepare_to_resend_writes(data); + filelayout_set_lo_fail(data->lseg); + } else + nfs_restart_rpc(task, data->ds_clp); + return -EAGAIN; + } + + return 0; +} + +static void filelayout_write_prepare(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, &wdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_write_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + wdata->mds_ops->rpc_release(data); +} + +static void filelayout_commit_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + nfs_commit_release_pages(wdata); + if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) + nfs_commit_clear_lock(NFS_I(wdata->inode)); + nfs_commitdata_release(wdata); +} + +struct rpc_call_ops filelayout_read_call_ops = { + .rpc_call_prepare = filelayout_read_prepare, + .rpc_call_done = filelayout_read_call_done, + .rpc_release = filelayout_read_release, +}; + +struct rpc_call_ops filelayout_write_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_write_release, +}; + +struct rpc_call_ops filelayout_commit_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_commit_release, +}; + +static enum pnfs_try_status +filelayout_read_pagelist(struct nfs_read_data *data) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, data->inode->i_ino, + data->args.pgbase, (size_t)data->args.count, offset); + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + /* Either layout fh index faulty, or ds connect failed */ + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s USE DS:ip %x %hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + /* No multipath support. 
Use first DS */ + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, + &filelayout_read_call_ops); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; +} + +/* Perform async writes. */ +static enum pnfs_try_status +filelayout_write_pagelist(struct nfs_write_data *data, int sync) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, + data->inode->i_ino, sync, (size_t) data->args.count, offset, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + data->write_done_cb = filelayout_write_done_cb; + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + /* + * Get the file offset on the dserver. Set the write offset to + * this offset and save the original offset. + */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + + /* Perform an asynchronous write */ + status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, + &filelayout_write_call_ops, sync); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; +} + +/* + * filelayout_check_layout() + * + * Make sure layout segment parameters are sane WRT the device. + * At this point no generic layer initialization of the lseg has occurred, + * and nothing has been added to the layout_hdr cache. + * + */ +static int +filelayout_check_layout(struct pnfs_layout_hdr *lo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; + struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); + + dprintk("--> %s\n", __func__); + + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. 
Use MDS i/o\n", + __func__); + goto out; + } + + if (fl->pattern_offset > lgr->range.offset) { + dprintk("%s pattern_offset %lld too large\n", + __func__, fl->pattern_offset); + goto out; + } + + if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { + dprintk("%s Invalid stripe unit (%u)\n", + __func__, fl->stripe_unit); + goto out; + } + + /* find and reference the deviceid */ + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, + NFS_SERVER(lo->plh_inode)->nfs_client, id); + if (d == NULL) { + dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); + if (dsaddr == NULL) + goto out; + } else + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + fl->dsaddr = dsaddr; + + if (fl->first_stripe_index < 0 || + fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %d\n", + __func__, fl->first_stripe_index); + goto out_put; + } + + if ((fl->stripe_type == STRIPE_SPARSE && + fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || + (fl->stripe_type == STRIPE_DENSE && + fl->num_fh != dsaddr->stripe_count)) { + dprintk("%s num_fh %u not valid for given packing\n", + __func__, fl->num_fh); + goto out_put; + } + + if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { + dprintk("%s Stripe unit (%u) not aligned with rsize %u " + "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, + nfss->wsize); + } + + status = 0; +out: + dprintk("--> %s returns %d\n", __func__, status); + return status; +out_put: + nfs4_fl_put_deviceid(dsaddr); + goto out; +} + +static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +{ + int i; + + for (i = 0; i < fl->num_fh; i++) { + if (!fl->fh_array[i]) + break; + kfree(fl->fh_array[i]); + } + kfree(fl->fh_array); + fl->fh_array = NULL; +} + +static void +_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) +{ + filelayout_free_fh_array(fl); + kfree(fl); +} + +static int +filelayout_decode_layout(struct pnfs_layout_hdr *flo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id, + gfp_t gfp_flags) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + __be32 *p; + uint32_t nfl_util; + int i; + + dprintk("%s: set_layout_map Begin\n", __func__); + + scratch = alloc_page(gfp_flags); + if (!scratch) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), + * num_fh (4) */ + p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20); + if (unlikely(!p)) + goto out_err; + + memcpy(id, p, sizeof(*id)); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + nfs4_print_deviceid(id); + + nfl_util = be32_to_cpup(p++); + if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) + fl->commit_through_mds = 1; + if (nfl_util & NFL4_UFLG_DENSE) + fl->stripe_type = STRIPE_DENSE; + else + fl->stripe_type = STRIPE_SPARSE; + fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; + + fl->first_stripe_index = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &fl->pattern_offset); + fl->num_fh = be32_to_cpup(p++); + + dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", + __func__, nfl_util, fl->num_fh, fl->first_stripe_index, + fl->pattern_offset); + + /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 
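+ * A zero num_fh simply means every data-server I/O reuses the MDS open
+ * file handle (see nfs4_fl_select_ds_fh() and select_ds_fh_from_commit()).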
+ * Futher checking is done in filelayout_check_layout */ + if (fl->num_fh < 0 || fl->num_fh > + max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) + goto out_err; + + if (fl->num_fh > 0) { + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), + gfp_flags); + if (!fl->fh_array) + goto out_err; + } + + for (i = 0; i < fl->num_fh; i++) { + /* Do we want to use a mempool here? */ + fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags); + if (!fl->fh_array[i]) + goto out_err_free; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free; + fl->fh_array[i]->size = be32_to_cpup(p++); + if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + printk(KERN_ERR "Too big fh %d received %d\n", + i, fl->fh_array[i]->size); + goto out_err_free; + } + + p = xdr_inline_decode(&stream, fl->fh_array[i]->size); + if (unlikely(!p)) + goto out_err_free; + memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); + dprintk("DEBUG: %s: fh len %d\n", __func__, + fl->fh_array[i]->size); + } + + __free_page(scratch); + return 0; + +out_err_free: + filelayout_free_fh_array(fl); +out_err: + __free_page(scratch); + return -EIO; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); + nfs4_fl_put_deviceid(fl->dsaddr); + kfree(fl->commit_buckets); + _filelayout_free_lseg(fl); +} + +static struct pnfs_layout_segment * +filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct nfs4_filelayout_segment *fl; + int rc; + struct nfs4_deviceid id; + + dprintk("--> %s\n", __func__); + fl = kzalloc(sizeof(*fl), gfp_flags); + if (!fl) + return NULL; + + rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags); + if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) { + _filelayout_free_lseg(fl); + return NULL; + } + + /* This assumes there is only one IOMODE_RW lseg. What + * we really want to do is have a layout_hdr level + * dictionary of keys, each + * associated with a struct list_head, populated by calls + * to filelayout_write_pagelist(). + * */ + if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) { + int i; + int size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + fl->commit_buckets = kcalloc(size, sizeof(struct list_head), gfp_flags); + if (!fl->commit_buckets) { + filelayout_free_lseg(&fl->generic_hdr); + return NULL; + } + fl->number_of_buckets = size; + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&fl->commit_buckets[i]); + } + return &fl->generic_hdr; +} + +/* + * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() + * + * return true : coalesce page + * return false : don't coalesce page + */ +bool +filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + u64 p_stripe, r_stripe; + u32 stripe_unit; + + if (!pnfs_generic_pg_test(pgio, prev, req) || + !nfs_generic_pg_test(pgio, prev, req)) + return false; + + if (!pgio->pg_lseg) + return 1; + p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; + r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; + stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + return (p_stripe == r_stripe); +} + +static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) +{ + return !FILELAYOUT_LSEG(lseg)->commit_through_mds; +} + +static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) +{ + if (fl->stripe_type == STRIPE_SPARSE) + return nfs4_fl_calc_ds_index(&fl->generic_hdr, j); + else + return j; +} + +struct list_head *filelayout_choose_commit_list(struct nfs_page *req) +{ + struct pnfs_layout_segment *lseg = req->wb_commit_lseg; + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + u32 i, j; + struct list_head *list; + + /* Note that we are calling nfs4_fl_calc_j_index on each page + * that ends up being committed to a data server. An attractive + * alternative is to add a field to nfs_write_data and nfs_page + * to store the value calculated in filelayout_write_pagelist + * and just use that here. + */ + j = nfs4_fl_calc_j_index(lseg, + (loff_t)req->wb_index << PAGE_CACHE_SHIFT); + i = select_bucket_index(fl, j); + list = &fl->commit_buckets[i]; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg */ + get_lseg(lseg); + } + return list; +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) + return i; + else + return nfs4_fl_calc_ds_index(lseg, i); +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + } + return flseg->fh_array[i]; +} + +static int filelayout_initiate_commit(struct nfs_write_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + u32 idx; + struct nfs_fh *fh; + + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + prepare_to_resend_writes(data); + data->mds_ops->rpc_release(data); + return -EAGAIN; + } + dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how); + data->write_done_cb = filelayout_commit_done_cb; + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient, + &filelayout_commit_call_ops, how); +} + +/* + * This is only useful while we are using whole file layouts. 
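+ * With whole-file layouts there is at most one IOMODE_RW segment per inode
+ * (the same assumption noted in filelayout_alloc_lseg()), so returning the
+ * first RW segment found on plh_segs is sufficient.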
+ */ +static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) +{ + struct pnfs_layout_segment *lseg, *rv = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + rv = get_lseg(lseg); + spin_unlock(&inode->i_lock); + return rv; +} + +static int alloc_ds_commits(struct inode *inode, struct list_head *list) +{ + struct pnfs_layout_segment *lseg; + struct nfs4_filelayout_segment *fl; + struct nfs_write_data *data; + int i, j; + + /* Won't need this when non-whole file layout segments are supported + * instead we will use a pnfs_layout_hdr structure */ + lseg = find_only_write_lseg(inode); + if (!lseg) + return 0; + fl = FILELAYOUT_LSEG(lseg); + for (i = 0; i < fl->number_of_buckets; i++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->ds_commit_index = i; + data->lseg = lseg; + list_add(&data->pages, list); + } + put_lseg(lseg); + return 0; + +out_bad: + for (j = i; j < fl->number_of_buckets; j++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + nfs_retry_commit(&fl->commit_buckets[i], lseg); + put_lseg(lseg); /* associated with emptying bucket */ + } + put_lseg(lseg); + /* Caller will clean up entries put on list */ + return -ENOMEM; +} + +/* This follows nfs_commit_list pretty closely */ +static int +filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how) +{ + struct nfs_write_data *data, *tmp; + LIST_HEAD(list); + + if (!list_empty(mds_pages)) { + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->lseg = NULL; + list_add(&data->pages, &list); + } + + if (alloc_ds_commits(inode, &list)) + goto out_bad; + + list_for_each_entry_safe(data, tmp, &list, pages) { + list_del_init(&data->pages); + atomic_inc(&NFS_I(inode)->commits_outstanding); + if (!data->lseg) { + nfs_init_commit(data, mds_pages, NULL); + nfs_initiate_commit(data, NFS_CLIENT(inode), + data->mds_ops, how); + } else { + nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg); + filelayout_initiate_commit(data, how); + } + } + return 0; + out_bad: + list_for_each_entry_safe(data, tmp, &list, pages) { + nfs_retry_commit(&data->pages, data->lseg); + list_del_init(&data->pages); + nfs_commit_free(data); + } + nfs_retry_commit(mds_pages, NULL); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; +} + +static void +filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +{ + nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); +} + +static struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, + .pg_test = filelayout_pg_test, + .mark_pnfs_commit = filelayout_mark_pnfs_commit, + .choose_commit_list = filelayout_choose_commit_list, + .commit_pagelist = filelayout_commit_pagelist, + .read_pagelist = filelayout_read_pagelist, + .write_pagelist = filelayout_write_pagelist, + .free_deviceid_node = filelayout_free_deveiceid_node, +}; + +static int __init nfs4filelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&filelayout_type); +} + +static void __exit nfs4filelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", + __func__); + 
pnfs_unregister_layoutdriver(&filelayout_type); +} + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h new file mode 100644 index 00000000..cebe01e3 --- /dev/null +++ b/fs/nfs/nfs4filelayout.h @@ -0,0 +1,105 @@ +/* + * NFSv4 file layout driver data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_NFS4FILELAYOUT_H +#define FS_NFS_NFS4FILELAYOUT_H + +#include "pnfs.h" + +/* + * Field testing shows we need to support up to 4096 stripe indices. + * We store each index as a u8 (u32 on the wire) to keep the memory footprint + * reasonable. This in turn means we support a maximum of 256 + * RFC 5661 multipath_list4 structures. 
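+ * As a concrete example, a maximal 4096-entry stripe_indices array then
+ * occupies 4KB of kernel memory rather than the 16KB needed to keep the
+ * on-the-wire u32 representation.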
+ */ +#define NFS4_PNFS_MAX_STRIPE_CNT 4096 +#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ + +enum stripetype4 { + STRIPE_SPARSE = 1, + STRIPE_DENSE = 2 +}; + +/* Individual ip address */ +struct nfs4_pnfs_ds { + struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + u32 ds_ip_addr; + u32 ds_port; + struct nfs_client *ds_clp; + atomic_t ds_count; +}; + +/* nfs4_file_layout_dsaddr flags */ +#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 + +struct nfs4_file_layout_dsaddr { + struct nfs4_deviceid_node id_node; + unsigned long flags; + u32 stripe_count; + u8 *stripe_indices; + u32 ds_num; + struct nfs4_pnfs_ds *ds_list[1]; +}; + +struct nfs4_filelayout_segment { + struct pnfs_layout_segment generic_hdr; + u32 stripe_type; + u32 commit_through_mds; + u32 stripe_unit; + u32 first_stripe_index; + u64 pattern_offset; + struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ + unsigned int num_fh; + struct nfs_fh **fh_array; + struct list_head *commit_buckets; /* Sort commits to ds */ + int number_of_buckets; +}; + +static inline struct nfs4_filelayout_segment * +FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_filelayout_segment, + generic_hdr); +} + +extern struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); + +extern void print_ds(struct nfs4_pnfs_ds *ds); +u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); +u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); +struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, + u32 ds_idx); +extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); +extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c new file mode 100644 index 00000000..3b7bf137 --- /dev/null +++ b/fs/nfs/nfs4filelayoutdev.c @@ -0,0 +1,636 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * Garth Goodson + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. 
+ */ + +#include +#include + +#include "internal.h" +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * Data server cache + * + * Data servers can be mapped to different device ids. + * nfs4_pnfs_ds reference counting + * - set to 1 on allocation + * - incremented when a device id maps a data server already in the cache. + * - decremented when deviceid is removed from the cache. + */ +DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +/* Debug routines */ +void +print_ds(struct nfs4_pnfs_ds *ds) +{ + if (ds == NULL) { + printk("%s NULL device\n", __func__); + return; + } + printk(" ip_addr %x port %hu\n" + " ref count %d\n" + " client %p\n" + " cl_exchange_flags %x\n", + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + atomic_read(&ds->ds_count), ds->ds_clp, + ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); +} + +/* nfs4_ds_cache_lock is held */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(u32 ip_addr, u32 port) +{ + struct nfs4_pnfs_ds *ds; + + dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", + ntohl(ip_addr), ntohs(port)); + + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { + if (ds->ds_ip_addr == ip_addr && + ds->ds_port == port) { + return ds; + } + } + return NULL; +} + +/* + * Create an rpc connection to the nfs4_pnfs_ds data server + * Currently only support IPv4 + */ +static int +nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) +{ + struct nfs_client *clp; + struct sockaddr_in sin; + int status = 0; + + dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ds->ds_ip_addr; + sin.sin_port = ds->ds_port; + + clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin, + sizeof(sin), IPPROTO_TCP); + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) { + if (!is_ds_client(clp)) { + status = -ENODEV; + goto out_put; + } + ds->ds_clp = clp; + dprintk("%s [existing] ip=%x, port=%hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + goto out; + } + + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to + * be equal to the MDS lease. Renewal is scheduled in create_session. 
+ */ + spin_lock(&mds_srv->nfs_client->cl_lock); + clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; + spin_unlock(&mds_srv->nfs_client->cl_lock); + clp->cl_last_renewal = jiffies; + + /* New nfs_client */ + status = nfs4_init_ds_session(clp); + if (status) + goto out_put; + + ds->ds_clp = clp; + dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), + ntohs(ds->ds_port)); +out: + return status; +out_put: + nfs_put_client(clp); + goto out; +} + +static void +destroy_ds(struct nfs4_pnfs_ds *ds) +{ + dprintk("--> %s\n", __func__); + ifdebug(FACILITY) + print_ds(ds); + + if (ds->ds_clp) + nfs_put_client(ds->ds_clp); + kfree(ds); +} + +void +nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + struct nfs4_pnfs_ds *ds; + int i; + + nfs4_print_deviceid(&dsaddr->id_node.deviceid); + + for (i = 0; i < dsaddr->ds_num; i++) { + ds = dsaddr->ds_list[i]; + if (ds != NULL) { + if (atomic_dec_and_lock(&ds->ds_count, + &nfs4_ds_cache_lock)) { + list_del_init(&ds->ds_node); + spin_unlock(&nfs4_ds_cache_lock); + destroy_ds(ds); + } + } + } + kfree(dsaddr->stripe_indices); + kfree(dsaddr); +} + +static struct nfs4_pnfs_ds * +nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds *tmp_ds, *ds; + + ds = kzalloc(sizeof(*tmp_ds), gfp_flags); + if (!ds) + goto out; + + spin_lock(&nfs4_ds_cache_lock); + tmp_ds = _data_server_lookup_locked(ip_addr, port); + if (tmp_ds == NULL) { + ds->ds_ip_addr = ip_addr; + ds->ds_port = port; + atomic_set(&ds->ds_count, 1); + INIT_LIST_HEAD(&ds->ds_node); + ds->ds_clp = NULL; + list_add(&ds->ds_node, &nfs4_data_server_cache); + dprintk("%s add new data server ip 0x%x\n", __func__, + ds->ds_ip_addr); + } else { + kfree(ds); + atomic_inc(&tmp_ds->ds_count); + dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_ip_addr, + atomic_read(&tmp_ds->ds_count)); + ds = tmp_ds; + } + spin_unlock(&nfs4_ds_cache_lock); +out: + return ds; +} + +/* + * Currently only support ipv4, and one multi-path address. 
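+ * The r_addr string is the usual universal-address form h1.h2.h3.h4.p1.p2;
+ * for example (illustrative value only) "10.1.2.3.8.1" decodes to the IPv4
+ * address 10.1.2.3 with port (8 << 8) | 1 = 2049.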
+ */ +static struct nfs4_pnfs_ds * +decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds *ds = NULL; + char *buf; + const char *ipend, *pstr; + u32 ip_addr, port; + int nlen, rlen, i; + int tmp[2]; + __be32 *p; + + /* r_netid */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; + nlen = be32_to_cpup(p++); + + p = xdr_inline_decode(streamp, nlen); + if (unlikely(!p)) + goto out_err; + + /* Check that netid is "tcp" */ + if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { + dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); + goto out_err; + } + + /* r_addr */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; + rlen = be32_to_cpup(p); + + p = xdr_inline_decode(streamp, rlen); + if (unlikely(!p)) + goto out_err; + + /* ipv6 length plus port is legal */ + if (rlen > INET6_ADDRSTRLEN + 8) { + dprintk("%s: Invalid address, length %d\n", __func__, + rlen); + goto out_err; + } + buf = kmalloc(rlen + 1, gfp_flags); + if (!buf) { + dprintk("%s: Not enough memory\n", __func__); + goto out_err; + } + buf[rlen] = '\0'; + memcpy(buf, p, rlen); + + /* replace the port dots with dashes for the in4_pton() delimiter*/ + for (i = 0; i < 2; i++) { + char *res = strrchr(buf, '.'); + if (!res) { + dprintk("%s: Failed finding expected dots in port\n", + __func__); + goto out_free; + } + *res = '-'; + } + + /* Currently only support ipv4 address */ + if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { + dprintk("%s: Only ipv4 addresses supported\n", __func__); + goto out_free; + } + + /* port */ + pstr = ipend; + sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); + port = htons((tmp[0] << 8) | (tmp[1])); + + ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); + dprintk("%s: Decoded address and port %s\n", __func__, buf); +out_free: + kfree(buf); +out_err: + return ds; +} + +/* Decode opaque device data and return the result */ +static struct nfs4_file_layout_dsaddr* +decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) +{ + int i; + u32 cnt, num; + u8 *indexp; + __be32 *p; + u8 *stripe_indices; + u8 max_stripe_index; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + + /* set up xdr stream */ + scratch = alloc_page(gfp_flags); + if (!scratch) + goto out_err; + + xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* Get the stripe count (number of stripe index) */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_scratch; + + cnt = be32_to_cpup(p); + dprintk("%s stripe count %d\n", __func__, cnt); + if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { + printk(KERN_WARNING "%s: stripe count %d greater than " + "supported maximum %d\n", __func__, + cnt, NFS4_PNFS_MAX_STRIPE_CNT); + goto out_err_free_scratch; + } + + /* read stripe indices */ + stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags); + if (!stripe_indices) + goto out_err_free_scratch; + + p = xdr_inline_decode(&stream, cnt << 2); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + indexp = &stripe_indices[0]; + max_stripe_index = 0; + for (i = 0; i < cnt; i++) { + *indexp = be32_to_cpup(p++); + max_stripe_index = max(max_stripe_index, *indexp); + indexp++; + } + + /* Check the multipath list count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + num = be32_to_cpup(p); + dprintk("%s ds_num %u\n", 
__func__, num); + if (num > NFS4_PNFS_MAX_MULTI_CNT) { + printk(KERN_WARNING "%s: multipath count %d greater than " + "supported maximum %d\n", __func__, + num, NFS4_PNFS_MAX_MULTI_CNT); + goto out_err_free_stripe_indices; + } + + /* validate stripe indices are all < num */ + if (max_stripe_index >= num) { + printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n", + __func__, max_stripe_index, num); + goto out_err_free_stripe_indices; + } + + dsaddr = kzalloc(sizeof(*dsaddr) + + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), + gfp_flags); + if (!dsaddr) + goto out_err_free_stripe_indices; + + dsaddr->stripe_count = cnt; + dsaddr->stripe_indices = stripe_indices; + stripe_indices = NULL; + dsaddr->ds_num = num; + nfs4_init_deviceid_node(&dsaddr->id_node, + NFS_SERVER(ino)->pnfs_curr_ld, + NFS_SERVER(ino)->nfs_client, + &pdev->dev_id); + + for (i = 0; i < dsaddr->ds_num; i++) { + int j; + u32 mp_count; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + + mp_count = be32_to_cpup(p); /* multipath count */ + if (mp_count > 1) { + printk(KERN_WARNING + "%s: Multipath count %d not supported, " + "skipping all greater than 1\n", __func__, + mp_count); + } + for (j = 0; j < mp_count; j++) { + if (j == 0) { + dsaddr->ds_list[i] = decode_and_add_ds(&stream, + ino, gfp_flags); + if (dsaddr->ds_list[i] == NULL) + goto out_err_free_deviceid; + } else { + u32 len; + /* skip extra multipath */ + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; + } + } + } + + __free_page(scratch); + return dsaddr; + +out_err_free_deviceid: + nfs4_fl_free_deviceid(dsaddr); + /* stripe_indicies was part of dsaddr */ + goto out_err_free_scratch; +out_err_free_stripe_indices: + kfree(stripe_indices); +out_err_free_scratch: + __free_page(scratch); +out_err: + dprintk("%s ERROR: returning NULL\n", __func__); + return NULL; +} + +/* + * Decode the opaque device specified in 'dev' and add it to the cache of + * available devices. + */ +static struct nfs4_file_layout_dsaddr * +decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct nfs4_file_layout_dsaddr *n, *new; + + new = decode_device(inode, dev, gfp_flags); + if (!new) { + printk(KERN_WARNING "%s: Could not decode or add device\n", + __func__); + return NULL; + } + + d = nfs4_insert_deviceid_node(&new->id_node); + n = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + if (n != new) { + nfs4_fl_free_deviceid(new); + return n; + } + + return new; +} + +/* + * Retrieve the information for dev_id, add it to the list + * of available devices, and return it. 
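+ * Callers release the returned dsaddr with nfs4_fl_put_deviceid(); in this
+ * driver that happens on filelayout_check_layout()'s error path or when the
+ * layout segment is freed in filelayout_free_lseg().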
+ */ +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) +{ + struct pnfs_device *pdev = NULL; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + int rc, i; + struct nfs_server *server = NFS_SERVER(inode); + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + dprintk("%s inode %p max_resp_sz %u max_pages %d\n", + __func__, inode, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags); + if (pdev == NULL) + return NULL; + + pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); + if (pages == NULL) { + kfree(pdev); + return NULL; + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) + goto out_free; + } + + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = LAYOUT_NFSV4_1_FILES; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = PAGE_SIZE * max_pages; + pdev->mincount = 0; + + rc = nfs4_proc_getdeviceinfo(server, pdev); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. + */ + dsaddr = decode_and_add_device(inode, pdev, gfp_flags); +out_free: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(pdev); + dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); + return dsaddr; +} + +void +nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + nfs4_put_deviceid_node(&dsaddr->id_node); +} + +/* + * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit + * Then: ((res + fsi) % dsaddr->stripe_count) + */ +u32 +nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u64 tmp; + + tmp = offset - flseg->pattern_offset; + do_div(tmp, flseg->stripe_unit); + tmp += flseg->first_stripe_index; + return do_div(tmp, flseg->dsaddr->stripe_count); +} + +u32 +nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j) +{ + return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; +} + +struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u32 i; + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + else + i = nfs4_fl_calc_ds_index(lseg, j); + } else + i = j; + return flseg->fh_array[i]; +} + +static void +filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, + int err, u32 ds_addr) +{ + u32 *p = (u32 *)&dsaddr->id_node.deviceid; + + printk(KERN_ERR "NFS: data server %x connection error %d." 
+ " Deviceid [%x%x%x%x] marked out of use.\n", + ds_addr, err, p[0], p[1], p[2], p[3]); + + spin_lock(&nfs4_ds_cache_lock); + dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; + spin_unlock(&nfs4_ds_cache_lock); +} + +struct nfs4_pnfs_ds * +nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; + struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; + + if (ds == NULL) { + printk(KERN_ERR "%s: No data server for offset index %d\n", + __func__, ds_idx); + return NULL; + } + + if (!ds->ds_clp) { + struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); + int err; + + if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) { + /* Already tried to connect, don't try again */ + dprintk("%s Deviceid marked out of use\n", __func__); + return NULL; + } + err = nfs4_ds_connect(s, ds); + if (err) { + filelayout_mark_devid_negative(dsaddr, err, + ntohl(ds->ds_ip_addr)); + return NULL; + } + } + return ds; +} diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c new file mode 100644 index 00000000..bb80c49b --- /dev/null +++ b/fs/nfs/nfs4namespace.c @@ -0,0 +1,265 @@ +/* + * linux/fs/nfs/nfs4namespace.c + * + * Copyright (C) 2005 Trond Myklebust + * - Modified by David Howells + * + * NFSv4 namespace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "nfs4_fs.h" +#include "dns_resolve.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +/* + * Convert the NFSv4 pathname components into a standard posix path. + * + * Note that the resulting string will be placed at the end of the buffer + */ +static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, + char *buffer, ssize_t buflen) +{ + char *end = buffer + buflen; + int n; + + *--end = '\0'; + buflen--; + + n = pathname->ncomponents; + while (--n >= 0) { + const struct nfs4_string *component = &pathname->components[n]; + buflen -= component->len + 1; + if (buflen < 0) + goto Elong; + end -= component->len; + memcpy(end, component->data, component->len); + *--end = '/'; + } + return end; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +/* + * Determine the mount path as a string + */ +static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) +{ + char *limit; + char *path = nfs_path(&limit, dentry, buffer, buflen); + if (!IS_ERR(path)) { + char *colon = strchr(path, ':'); + if (colon && colon < limit) + path = colon + 1; + } + return path; +} + +/* + * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we + * believe to be the server path to this dentry + */ +static int nfs4_validate_fspath(struct dentry *dentry, + const struct nfs4_fs_locations *locations, + char *page, char *page2) +{ + const char *path, *fs_path; + + path = nfs4_path(dentry, page, PAGE_SIZE); + if (IS_ERR(path)) + return PTR_ERR(path); + + fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE); + if (IS_ERR(fs_path)) + return PTR_ERR(fs_path); + + if (strncmp(path, fs_path, strlen(fs_path)) != 0) { + dprintk("%s: path %s does not begin with fsroot %s\n", + __func__, path, fs_path); + return -ENOENT; + } + + return 0; +} + +static size_t nfs_parse_server_name(char *string, size_t len, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + + ret = rpc_pton(string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(string, len, sa, salen); + if (ret < 0) + ret = 0; + } + return ret; +} + +static struct vfsmount *try_location(struct nfs_clone_mount 
*mountdata, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + unsigned int maxbuflen; + unsigned int s; + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) + return ERR_CAST(mnt_path); + mountdata->mnt_path = mnt_path; + maxbuflen = mnt_path - 1 - page2; + + mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL); + if (mountdata->addr == NULL) + return ERR_PTR(-ENOMEM); + + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + + if (buf->len <= 0 || buf->len >= maxbuflen) + continue; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) + continue; + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + mountdata->addr, addr_bufsize); + if (mountdata->addrlen == 0) + continue; + + rpc_set_port(mountdata->addr, NFS_PORT); + + memcpy(page2, buf->data, buf->len); + page2[buf->len] = '\0'; + mountdata->hostname = page2; + + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata->hostname, + mountdata->mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + if (!IS_ERR(mnt)) + break; + } + kfree(mountdata->addr); + return mnt; +} + +/** + * nfs_follow_referral - set up mountpoint when hitting a referral on moved error + * @dentry - parent directory + * @locations - array of NFSv4 server location information + * + */ +static struct vfsmount *nfs_follow_referral(struct dentry *dentry, + const struct nfs4_fs_locations *locations) +{ + struct vfsmount *mnt = ERR_PTR(-ENOENT); + struct nfs_clone_mount mountdata = { + .sb = dentry->d_sb, + .dentry = dentry, + .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor, + }; + char *page = NULL, *page2 = NULL; + int loc, error; + + if (locations == NULL || locations->nlocations <= 0) + goto out; + + dprintk("%s: referral at %s/%s\n", __func__, + dentry->d_parent->d_name.name, dentry->d_name.name); + + page = (char *) __get_free_page(GFP_USER); + if (!page) + goto out; + + page2 = (char *) __get_free_page(GFP_USER); + if (!page2) + goto out; + + /* Ensure fs path is a prefix of current dentry path */ + error = nfs4_validate_fspath(dentry, locations, page, page2); + if (error < 0) { + mnt = ERR_PTR(error); + goto out; + } + + for (loc = 0; loc < locations->nlocations; loc++) { + const struct nfs4_fs_location *location = &locations->locations[loc]; + + if (location == NULL || location->nservers <= 0 || + location->rootpath.ncomponents == 0) + continue; + + mnt = try_location(&mountdata, page, page2, location); + if (!IS_ERR(mnt)) + break; + } + +out: + free_page((unsigned long) page); + free_page((unsigned long) page2); + dprintk("%s: done\n", __func__); + return mnt; +} + +/* + * nfs_do_refmount - handle crossing a referral on server + * @dentry - dentry of referral + * + */ +struct vfsmount *nfs_do_refmount(struct dentry *dentry) +{ + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + struct dentry *parent; + struct nfs4_fs_locations *fs_locations = NULL; + struct page *page; + int err; + + /* BUG_ON(IS_ROOT(dentry)); */ + dprintk("%s: enter\n", __func__); + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out; + + fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (fs_locations == NULL) + goto out_free; + + /* Get locations */ + mnt = ERR_PTR(-ENOENT); + + parent = dget_parent(dentry); + dprintk("%s: getting locations for %s/%s\n", + __func__, 
parent->d_name.name, dentry->d_name.name); + + err = nfs4_proc_fs_locations(parent->d_inode, &dentry->d_name, fs_locations, page); + dput(parent); + if (err != 0 || + fs_locations->nlocations <= 0 || + fs_locations->fs_path.ncomponents <= 0) + goto out_free; + + mnt = nfs_follow_referral(dentry, fs_locations); +out_free: + __free_page(page); + kfree(fs_locations); +out: + dprintk("%s: done\n", __func__); + return mnt; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c new file mode 100644 index 00000000..b7a7e5fe --- /dev/null +++ b/fs/nfs/nfs4proc.c @@ -0,0 +1,6114 @@ +/* + * fs/nfs/nfs4proc.c + * + * Client-side procedure declarations for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "callback.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +#define NFS4_POLL_RETRY_MIN (HZ/10) +#define NFS4_POLL_RETRY_MAX (15*HZ) + +#define NFS4_MAX_LOOP_ON_RECOVER (10) + +struct nfs4_opendata; +static int _nfs4_proc_open(struct nfs4_opendata *data); +static int _nfs4_recover_proc_open(struct nfs4_opendata *data); +static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); +static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr); +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state); + +/* Prevent leaks of NFSv4 errors into userland */ +static int nfs4_map_errors(int err) +{ + if (err >= -1000) + return err; + switch (err) { + case -NFS4ERR_RESOURCE: + return -EREMOTEIO; + case -NFS4ERR_WRONGSEC: + return -EPERM; + case -NFS4ERR_BADOWNER: + case -NFS4ERR_BADNAME: + return -EINVAL; + case -NFS4ERR_SHARE_DENIED: + return -EACCES; + default: + dprintk("%s could not handle NFSv4 error %d\n", + __func__, -err); + break; + } + return -EIO; +} + +/* + * This is our standard bitmap for GETATTR requests. + */ +const u32 nfs4_fattr_bitmap[2] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY +}; + +const u32 nfs4_statfs_bitmap[2] = { + FATTR4_WORD0_FILES_AVAIL + | FATTR4_WORD0_FILES_FREE + | FATTR4_WORD0_FILES_TOTAL, + FATTR4_WORD1_SPACE_AVAIL + | FATTR4_WORD1_SPACE_FREE + | FATTR4_WORD1_SPACE_TOTAL +}; + +const u32 nfs4_pathconf_bitmap[2] = { + FATTR4_WORD0_MAXLINK + | FATTR4_WORD0_MAXNAME, + 0 +}; + +const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, + FATTR4_WORD1_TIME_DELTA + | FATTR4_WORD1_FS_LAYOUT_TYPES +}; + +const u32 nfs4_fs_locations_bitmap[2] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID + | FATTR4_WORD0_FS_LOCATIONS, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID +}; + +static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, + struct nfs4_readdir_arg *readdir) +{ + __be32 *start, *p; + + BUG_ON(readdir->count < 80); + if (cookie > 2) { + readdir->cookie = cookie; + memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); + return; + } + + readdir->cookie = 0; + memset(&readdir->verifier, 0, sizeof(readdir->verifier)); + if (cookie == 2) + return; + + 
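+ /* Size sketch: each fabricated entry below is 10 XDR words (next, 2-word cookie, name length, name, bitmap length, bitmap word, attr buffer length, 2-word fileid), i.e. 40 bytes; pgbase therefore ends up at 40 bytes when only ".." is emitted, or 80 bytes when "." is included as well. */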
/* + * NFSv4 servers do not return entries for '.' and '..' + * Therefore, we fake these entries here. We let '.' + * have cookie 0 and '..' have cookie 1. Note that + * when talking to the server, we always send cookie 0 + * instead of 1 or 2. + */ + start = p = kmap_atomic(*readdir->pages, KM_USER0); + + if (cookie == 0) { + *p++ = xdr_one; /* next */ + *p++ = xdr_zero; /* cookie, first word */ + *p++ = xdr_one; /* cookie, second word */ + *p++ = xdr_one; /* entry len */ + memcpy(p, ".\0\0\0", 4); /* entry */ + p++; + *p++ = xdr_one; /* bitmap length */ + *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ + *p++ = htonl(8); /* attribute buffer length */ + p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode)); + } + + *p++ = xdr_one; /* next */ + *p++ = xdr_zero; /* cookie, first word */ + *p++ = xdr_two; /* cookie, second word */ + *p++ = xdr_two; /* entry len */ + memcpy(p, "..\0\0", 4); /* entry */ + p++; + *p++ = xdr_one; /* bitmap length */ + *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ + *p++ = htonl(8); /* attribute buffer length */ + p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode)); + + readdir->pgbase = (char *)p - (char *)start; + readdir->count -= readdir->pgbase; + kunmap_atomic(start, KM_USER0); +} + +static int nfs4_wait_clnt_recover(struct nfs_client *clp) +{ + int res; + + might_sleep(); + + res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); + return res; +} + +static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) +{ + int res = 0; + + might_sleep(); + + if (*timeout <= 0) + *timeout = NFS4_POLL_RETRY_MIN; + if (*timeout > NFS4_POLL_RETRY_MAX) + *timeout = NFS4_POLL_RETRY_MAX; + schedule_timeout_killable(*timeout); + if (fatal_signal_pending(current)) + res = -ERESTARTSYS; + *timeout <<= 1; + return res; +} + +/* This is the error handling routine for processes that are allowed + * to sleep. 
+ */ +static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state *state = exception->state; + struct inode *inode = exception->inode; + int ret = errorcode; + + exception->retry = 0; + switch(errorcode) { + case 0: + return 0; + case -NFS4ERR_OPENMODE: + if (nfs_have_delegation(inode, FMODE_READ)) { + nfs_inode_return_delegation(inode); + exception->retry = 1; + return 0; + } + if (state == NULL) + break; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state != NULL) + nfs_remove_bad_delegation(state->inode); + if (state == NULL) + break; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) + nfs4_schedule_stateid_recovery(server, state); + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_STALE_CLIENTID: + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; +#if defined(CONFIG_NFS_V4_1) + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR: %d Reset session\n", __func__, + errorcode); + nfs4_schedule_session_recovery(clp->cl_session); + exception->retry = 1; + break; +#endif /* defined(CONFIG_NFS_V4_1) */ + case -NFS4ERR_FILE_OPEN: + if (exception->timeout > HZ) { + /* We have retried a decent amount, time to + * fail + */ + ret = -EBUSY; + break; + } + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + case -EKEYEXPIRED: + ret = nfs4_delay(server->client, &exception->timeout); + if (ret != 0) + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + case -NFS4ERR_OLD_STATEID: + exception->retry = 1; + break; + case -NFS4ERR_BADOWNER: + /* The following works around a Linux server bug! */ + case -NFS4ERR_BADNAME: + if (server->caps & NFS_CAP_UIDGID_NOMAP) { + server->caps &= ~NFS_CAP_UIDGID_NOMAP; + exception->retry = 1; + printk(KERN_WARNING "NFS: v4 server %s " + "does not accept raw " + "uid/gids. " + "Reenabling the idmapper.\n", + server->nfs_client->cl_hostname); + } + } + /* We failed to handle the error */ + return nfs4_map_errors(ret); +wait_on_recovery: + ret = nfs4_wait_clnt_recover(clp); + if (ret == 0) + exception->retry = 1; + return ret; +} + + +static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) +{ + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal,timestamp)) + clp->cl_last_renewal = timestamp; + spin_unlock(&clp->cl_lock); +} + +static void renew_lease(const struct nfs_server *server, unsigned long timestamp) +{ + do_renew_lease(server->nfs_client, timestamp); +} + +#if defined(CONFIG_NFS_V4_1) + +/* + * nfs4_free_slot - free a slot and efficiently update slot table. + * + * freeing a slot is trivially done by clearing its respective bit + * in the bitmap. + * If the freed slotid equals highest_used_slotid we want to update it + * so that the server would be able to size down the slot table if needed, + * otherwise we know that the highest_used_slotid is still in use. + * When updating highest_used_slotid there may be "holes" in the bitmap + * so we need to scan down from highest_used_slotid to 0 looking for the now + * highest slotid in use. + * If none found, highest_used_slotid is set to -1. 
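+ * For example (illustrative values): if used_slots is 0b1011 (slots 0, 1 and 3 in use) and slot 3 is freed, find_last_bit() locates bit 1, so highest_used_slotid drops from 3 to 1.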
+ * + * Must be called while holding tbl->slot_tbl_lock + */ +static void +nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) +{ + int free_slotid = free_slot - tbl->slots; + int slotid = free_slotid; + + BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); + /* clear used bit in bitmap */ + __clear_bit(slotid, tbl->used_slots); + + /* update highest_used_slotid when it is freed */ + if (slotid == tbl->highest_used_slotid) { + slotid = find_last_bit(tbl->used_slots, tbl->max_slots); + if (slotid < tbl->max_slots) + tbl->highest_used_slotid = slotid; + else + tbl->highest_used_slotid = -1; + } + dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__, + free_slotid, tbl->highest_used_slotid); +} + +/* + * Signal state manager thread if session fore channel is drained + */ +static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) +{ + struct rpc_task *task; + + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + return; + } + + if (ses->fc_slot_table.highest_used_slotid != -1) + return; + + dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); + complete(&ses->fc_slot_table.complete); +} + +/* + * Signal state manager thread if session back channel is drained + */ +void nfs4_check_drain_bc_complete(struct nfs4_session *ses) +{ + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || + ses->bc_slot_table.highest_used_slotid != -1) + return; + dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__); + complete(&ses->bc_slot_table.complete); +} + +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +{ + struct nfs4_slot_table *tbl; + + tbl = &res->sr_session->fc_slot_table; + if (!res->sr_slot) { + /* just wake up the next guy waiting since + * we may have not consumed a slot after all */ + dprintk("%s: No slot\n", __func__); + return; + } + + spin_lock(&tbl->slot_tbl_lock); + nfs4_free_slot(tbl, res->sr_slot); + nfs4_check_drain_fc_complete(res->sr_session); + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slot = NULL; +} + +static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + unsigned long timestamp; + struct nfs_client *clp; + + /* + * sr_status remains 1 if an RPC level error occurred. The server + * may or may not have processed the sequence operation.. + * Proceed as if the server received and processed the sequence + * operation. + */ + if (res->sr_status == 1) + res->sr_status = NFS_OK; + + /* don't increment the sequence number if the task wasn't sent */ + if (!RPC_WAS_SENT(task)) + goto out; + + /* Check the SEQUENCE operation status */ + switch (res->sr_status) { + case 0: + /* Update the slot's sequence and clientid lease timer */ + ++res->sr_slot->seq_nr; + timestamp = res->sr_renewal_time; + clp = res->sr_session->clp; + do_renew_lease(clp, timestamp); + /* Check sequence flags */ + if (res->sr_status_flags != 0) + nfs4_schedule_lease_recovery(clp); + break; + case -NFS4ERR_DELAY: + /* The server detected a resend of the RPC call and + * returned NFS4ERR_DELAY as per Section 2.10.6.2 + * of RFC5661. + */ + dprintk("%s: slot=%td seq=%d: Operation in progress\n", + __func__, + res->sr_slot - res->sr_session->fc_slot_table.slots, + res->sr_slot->seq_nr); + goto out_retry; + default: + /* Just update the slot sequence no. */ + ++res->sr_slot->seq_nr; + } +out: + /* The session may be reset by one of the error handlers. 
*/ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); + nfs41_sequence_free_slot(res); + return 1; +out_retry: + if (!rpc_restart_call(task)) + goto out; + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return 0; +} + +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_session == NULL) + return 1; + return nfs41_sequence_done(task, res); +} + +/* + * nfs4_find_slot - efficiently look for a free slot + * + * nfs4_find_slot looks for an unset bit in the used_slots bitmap. + * If found, we mark the slot as used, update the highest_used_slotid, + * and respectively set up the sequence operation args. + * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise. + * + * Note: must be called with under the slot_tbl_lock. + */ +static u8 +nfs4_find_slot(struct nfs4_slot_table *tbl) +{ + int slotid; + u8 ret_id = NFS4_MAX_SLOT_TABLE; + BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE); + + dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + tbl->max_slots); + slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots); + if (slotid >= tbl->max_slots) + goto out; + __set_bit(slotid, tbl->used_slots); + if (slotid > tbl->highest_used_slotid) + tbl->highest_used_slotid = slotid; + ret_id = slotid; +out: + dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id); + return ret_id; +} + +int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) +{ + struct nfs4_slot *slot; + struct nfs4_slot_table *tbl; + u8 slotid; + + dprintk("--> %s\n", __func__); + /* slot already allocated? */ + if (res->sr_slot != NULL) + return 0; + + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. + * Schedule the reset thread + */ + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s Schedule Session Reset\n", __func__); + return -EAGAIN; + } + + if (!rpc_queue_empty(&tbl->slot_tbl_waitq) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s enforce FIFO order\n", __func__); + return -EAGAIN; + } + + slotid = nfs4_find_slot(tbl); + if (slotid == NFS4_MAX_SLOT_TABLE) { + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + dprintk("<-- %s: no free slots\n", __func__); + return -EAGAIN; + } + spin_unlock(&tbl->slot_tbl_lock); + + rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); + slot = tbl->slots + slotid; + args->sa_session = session; + args->sa_slotid = slotid; + args->sa_cache_this = cache_reply; + + dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); + + res->sr_session = session; + res->sr_slot = slot; + res->sr_renewal_time = jiffies; + res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. 
+ */ + res->sr_status = 1; + return 0; +} +EXPORT_SYMBOL_GPL(nfs41_setup_sequence); + +int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) +{ + struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + + if (session == NULL) { + args->sa_session = NULL; + res->sr_session = NULL; + goto out; + } + + dprintk("--> %s clp %p session %p sr_slot %td\n", + __func__, session->clp, session, res->sr_slot ? + res->sr_slot - session->fc_slot_table.slots : -1); + + ret = nfs41_setup_sequence(session, args, res, cache_reply, + task); +out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +struct nfs41_call_sync_data { + const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; + int cache_reply; +}; + +static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs41_call_sync_data *data = calldata; + + dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); + + if (nfs4_setup_sequence(data->seq_server, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +} + +static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs41_call_sync_prepare(task, calldata); +} + +static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) +{ + struct nfs41_call_sync_data *data = calldata; + + nfs41_sequence_done(task, data->seq_res); +} + +struct rpc_call_ops nfs41_call_sync_ops = { + .rpc_call_prepare = nfs41_call_sync_prepare, + .rpc_call_done = nfs41_call_sync_done, +}; + +struct rpc_call_ops nfs41_call_priv_sync_ops = { + .rpc_call_prepare = nfs41_call_priv_sync_prepare, + .rpc_call_done = nfs41_call_sync_done, +}; + +static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + int privileged) +{ + int ret; + struct rpc_task *task; + struct nfs41_call_sync_data data = { + .seq_server = server, + .seq_args = args, + .seq_res = res, + .cache_reply = cache_reply, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = msg, + .callback_ops = &nfs41_call_sync_ops, + .callback_data = &data + }; + + res->sr_slot = NULL; + if (privileged) + task_setup.callback_ops = &nfs41_call_priv_sync_ops; + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else { + ret = task->tk_status; + rpc_put_task(task); + } + return ret; +} + +int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0); +} + +#else +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + return 1; +} +#endif /* CONFIG_NFS_V4_1 */ + +int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + args->sa_session = res->sr_session = NULL; + return rpc_call_sync(clnt, msg, 0); +} + +static inline +int nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args 
*args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, + args, res, cache_reply); +} + +static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) +{ + struct nfs_inode *nfsi = NFS_I(dir); + + spin_lock(&dir->i_lock); + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; + if (!cinfo->atomic || cinfo->before != nfsi->change_attr) + nfs_force_lookup_revalidate(dir); + nfsi->change_attr = cinfo->after; + spin_unlock(&dir->i_lock); +} + +struct nfs4_opendata { + struct kref kref; + struct nfs_openargs o_arg; + struct nfs_openres o_res; + struct nfs_open_confirmargs c_arg; + struct nfs_open_confirmres c_res; + struct nfs_fattr f_attr; + struct nfs_fattr dir_attr; + struct path path; + struct dentry *dir; + struct nfs4_state_owner *owner; + struct nfs4_state *state; + struct iattr attrs; + unsigned long timestamp; + unsigned int rpc_done : 1; + int rpc_status; + int cancelled; +}; + + +static void nfs4_init_opendata_res(struct nfs4_opendata *p) +{ + p->o_res.f_attr = &p->f_attr; + p->o_res.dir_attr = &p->dir_attr; + p->o_res.seqid = p->o_arg.seqid; + p->c_res.seqid = p->c_arg.seqid; + p->o_res.server = p->o_arg.server; + nfs_fattr_init(&p->f_attr); + nfs_fattr_init(&p->dir_attr); +} + +static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, + struct nfs4_state_owner *sp, fmode_t fmode, int flags, + const struct iattr *attrs, + gfp_t gfp_mask) +{ + struct dentry *parent = dget_parent(path->dentry); + struct inode *dir = parent->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *p; + + p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + goto err; + p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); + if (p->o_arg.seqid == NULL) + goto err_free; + path_get(path); + p->path = *path; + p->dir = parent; + p->owner = sp; + atomic_inc(&sp->so_count); + p->o_arg.fh = NFS_FH(dir); + p->o_arg.open_flags = flags; + p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + p->o_arg.clientid = server->nfs_client->cl_clientid; + p->o_arg.id = sp->so_owner_id.id; + p->o_arg.name = &p->path.dentry->d_name; + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; + if (flags & O_CREAT) { + u32 *s; + + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); + s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; + p->c_arg.seqid = p->o_arg.seqid; + nfs4_init_opendata_res(p); + kref_init(&p->kref); + return p; +err_free: + kfree(p); +err: + dput(parent); + return NULL; +} + +static void nfs4_opendata_free(struct kref *kref) +{ + struct nfs4_opendata *p = container_of(kref, + struct nfs4_opendata, kref); + + nfs_free_seqid(p->o_arg.seqid); + if (p->state != NULL) + nfs4_put_open_state(p->state); + nfs4_put_state_owner(p->owner); + dput(p->dir); + path_put(&p->path); + kfree(p); +} + +static void nfs4_opendata_put(struct nfs4_opendata *p) +{ + if (p != NULL) + kref_put(&p->kref, nfs4_opendata_free); +} + +static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) +{ + int ret; + + ret = rpc_wait_for_completion_task(task); + return ret; +} + +static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) +{ + int ret = 0; + + if (open_mode & O_EXCL) + goto out; + switch (mode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ: + ret |= 
test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 + && state->n_rdonly != 0; + break; + case FMODE_WRITE: + ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 + && state->n_wronly != 0; + break; + case FMODE_READ|FMODE_WRITE: + ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 + && state->n_rdwr != 0; + } +out: + return ret; +} + +static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) +{ + if ((delegation->type & fmode) != fmode) + return 0; + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) + return 0; + nfs_mark_delegation_referenced(delegation); + return 1; +} + +static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) +{ + switch (fmode) { + case FMODE_WRITE: + state->n_wronly++; + break; + case FMODE_READ: + state->n_rdonly++; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr++; + } + nfs4_state_set_mode_locked(state, state->state | fmode); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); + memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_WRITE: + set_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case FMODE_READ|FMODE_WRITE: + set_bit(NFS_O_RDWR_STATE, &state->flags); + } +} + +static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + write_seqlock(&state->seqlock); + nfs_set_open_stateid_locked(state, stateid, fmode); + write_sequnlock(&state->seqlock); +} + +static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) +{ + /* + * Protect the call to nfs4_state_set_mode_locked and + * serialise the stateid update + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { + memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) + nfs_set_open_stateid_locked(state, open_stateid, fmode); + write_sequnlock(&state->seqlock); + spin_lock(&state->owner->so_lock); + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); +} + +static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *deleg_cur; + int ret = 0; + + fmode &= (FMODE_READ|FMODE_WRITE); + + rcu_read_lock(); + deleg_cur = rcu_dereference(nfsi->delegation); + if (deleg_cur == NULL) + goto no_delegation; + + spin_lock(&deleg_cur->lock); + if (nfsi->delegation != deleg_cur || + (deleg_cur->type & fmode) != fmode) + goto no_delegation_unlock; + + if (delegation == NULL) + delegation = &deleg_cur->stateid; + else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); + __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode); + ret = 1; +no_delegation_unlock: + spin_unlock(&deleg_cur->lock); +no_delegation: + rcu_read_unlock(); + + if (!ret && open_stateid != NULL) { + __update_open_stateid(state, open_stateid, NULL, fmode); + ret = 1; + } + + return ret; +} + + +static void nfs4_return_incompatible_delegation(struct inode 
*inode, fmode_t fmode) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation == NULL || (delegation->type & fmode) == fmode) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + nfs_inode_return_delegation(inode); +} + +static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) +{ + struct nfs4_state *state = opendata->state; + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *delegation; + int open_mode = opendata->o_arg.open_flags & O_EXCL; + fmode_t fmode = opendata->o_arg.fmode; + nfs4_stateid stateid; + int ret = -EAGAIN; + + for (;;) { + if (can_open_cached(state, fmode, open_mode)) { + spin_lock(&state->owner->so_lock); + if (can_open_cached(state, fmode, open_mode)) { + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); + goto out_return_state; + } + spin_unlock(&state->owner->so_lock); + } + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || + !can_open_delegated(delegation, fmode)) { + rcu_read_unlock(); + break; + } + /* Save the delegation */ + memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); + rcu_read_unlock(); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) + goto out; + ret = -EAGAIN; + + /* Try to update the stateid using the delegation */ + if (update_open_stateid(state, NULL, &stateid, fmode)) + goto out_return_state; + } +out: + return ERR_PTR(ret); +out_return_state: + atomic_inc(&state->count); + return state; +} + +static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + struct inode *inode; + struct nfs4_state *state = NULL; + struct nfs_delegation *delegation; + int ret; + + if (!data->rpc_done) { + state = nfs4_try_open_cached(data); + goto out; + } + + ret = -EAGAIN; + if (!(data->f_attr.valid & NFS_ATTR_FATTR)) + goto err; + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); + ret = PTR_ERR(inode); + if (IS_ERR(inode)) + goto err; + ret = -ENOMEM; + state = nfs4_get_open_state(inode, data->owner); + if (state == NULL) + goto err_put_inode; + if (data->o_res.delegation_type != 0) { + int delegation_flags = 0; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation) + delegation_flags = delegation->flags; + rcu_read_unlock(); + if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) + nfs_inode_set_delegation(state->inode, + data->owner->so_cred, + &data->o_res); + else + nfs_inode_reclaim_delegation(state->inode, + data->owner->so_cred, + &data->o_res); + } + + update_open_stateid(state, &data->o_res.stateid, NULL, + data->o_arg.fmode); + iput(inode); +out: + return state; +err_put_inode: + iput(inode); +err: + return ERR_PTR(ret); +} + +static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_open_context *ctx; + + spin_lock(&state->inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + get_nfs_open_context(ctx); + spin_unlock(&state->inode->i_lock); + return ctx; + } + spin_unlock(&state->inode->i_lock); + return ERR_PTR(-ENOENT); +} + +static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs4_opendata *opendata; + + opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS); + if (opendata == NULL) + return ERR_PTR(-ENOMEM); + opendata->state =
state; + atomic_inc(&state->count); + return opendata; +} + +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res) +{ + struct nfs4_state *newstate; + int ret; + + opendata->o_arg.open_flags = 0; + opendata->o_arg.fmode = fmode; + memset(&opendata->o_res, 0, sizeof(opendata->o_res)); + memset(&opendata->c_res, 0, sizeof(opendata->c_res)); + nfs4_init_opendata_res(opendata); + ret = _nfs4_recover_proc_open(opendata); + if (ret != 0) + return ret; + newstate = nfs4_opendata_to_nfs4_state(opendata); + if (IS_ERR(newstate)) + return PTR_ERR(newstate); + nfs4_close_state(&opendata->path, newstate, fmode); + *res = newstate; + return 0; +} + +static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state) +{ + struct nfs4_state *newstate; + int ret; + + /* memory barrier prior to reading state->n_* */ + clear_bit(NFS_DELEGATED_STATE, &state->flags); + smp_rmb(); + if (state->n_rdwr != 0) { + clear_bit(NFS_O_RDWR_STATE, &state->flags); + ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); + if (ret != 0) + return ret; + if (newstate != state) + return -ESTALE; + } + if (state->n_wronly != 0) { + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); + if (ret != 0) + return ret; + if (newstate != state) + return -ESTALE; + } + if (state->n_rdonly != 0) { + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); + if (ret != 0) + return ret; + if (newstate != state) + return -ESTALE; + } + /* + * We may have performed cached opens for all three recoveries. + * Check if we need to update the current stateid. + */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && + memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); + write_sequnlock(&state->seqlock); + } + return 0; +} + +/* + * OPEN_RECLAIM: + * reclaim state on the server after a reboot. 
+ */ +static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs_delegation *delegation; + struct nfs4_opendata *opendata; + fmode_t delegation_type = 0; + int status; + + opendata = nfs4_open_recoverdata_alloc(ctx, state); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; + opendata->o_arg.fh = NFS_FH(state->inode); + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) + delegation_type = delegation->type; + rcu_read_unlock(); + opendata->o_arg.u.delegation_type = delegation_type; + status = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return status; +} + +static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_do_open_reclaim(ctx, state); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + int ret; + + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = nfs4_do_open_reclaim(ctx, state); + put_nfs_open_context(ctx); + return ret; +} + +static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +{ + struct nfs4_opendata *opendata; + int ret; + + opendata = nfs4_open_recoverdata_alloc(ctx, state); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; + memcpy(opendata->o_arg.u.delegation.data, stateid->data, + sizeof(opendata->o_arg.u.delegation.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return ret; +} + +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +{ + struct nfs4_exception exception = { }; + struct nfs_server *server = NFS_SERVER(state->inode); + int err; + do { + err = _nfs4_open_delegation_recall(ctx, state, stateid); + switch (err) { + case 0: + case -ENOENT: + case -ESTALE: + goto out; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_schedule_session_recovery(server->nfs_client->cl_session); + goto out; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + /* Don't recall a delegation if it was lost */ + nfs4_schedule_lease_recovery(server->nfs_client); + goto out; + case -ERESTARTSYS: + /* + * The show must go on: exit, but mark the + * stateid as needing recovery. + */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + nfs_inode_find_state_and_recover(state->inode, + stateid); + nfs4_schedule_stateid_recovery(server, state); + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. 
+ */ + case -ENOMEM: + err = 0; + goto out; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); +out: + return err; +} + +static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + data->rpc_status = task->tk_status; + if (data->rpc_status == 0) { + memcpy(data->o_res.stateid.data, data->c_res.stateid.data, + sizeof(data->o_res.stateid.data)); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; + } +} + +static void nfs4_open_confirm_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; + + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! */ + if (!data->rpc_done) + goto out_free; + state = nfs4_opendata_to_nfs4_state(data); + if (!IS_ERR(state)) + nfs4_close_state(&data->path, state, data->o_arg.fmode); +out_free: + nfs4_opendata_put(data); +} + +static const struct rpc_call_ops nfs4_open_confirm_ops = { + .rpc_call_done = nfs4_open_confirm_done, + .rpc_release = nfs4_open_confirm_release, +}; + +/* + * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata + */ +static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) +{ + struct nfs_server *server = NFS_SERVER(data->dir->d_inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], + .rpc_argp = &data->c_arg, + .rpc_resp = &data->c_res, + .rpc_cred = data->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_open_confirm_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int status; + + kref_get(&data->kref); + data->rpc_done = 0; + data->rpc_status = 0; + data->timestamp = jiffies; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); + return status; +} + +static void nfs4_open_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state_owner *sp = data->owner; + + if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) + return; + /* + * Check if we still need to send an OPEN call, or if we can use + * a delegation instead. + */ + if (data->state != NULL) { + struct nfs_delegation *delegation; + + if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) + goto out_no_action; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); + if (delegation != NULL && + test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) { + rcu_read_unlock(); + goto out_no_action; + } + rcu_read_unlock(); + } + /* Update sequence id. 
*/ + data->o_arg.id = sp->so_owner_id.id; + data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; + if (nfs4_setup_sequence(data->o_arg.server, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; + rpc_call_start(task); + return; +out_no_action: + task->tk_action = NULL; + +} + +static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs4_open_prepare(task, calldata); +} + +static void nfs4_open_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + data->rpc_status = task->tk_status; + + if (!nfs4_sequence_done(task, &data->o_res.seq_res)) + return; + + if (task->tk_status == 0) { + switch (data->o_res.f_attr->mode & S_IFMT) { + case S_IFREG: + break; + case S_IFLNK: + data->rpc_status = -ELOOP; + break; + case S_IFDIR: + data->rpc_status = -EISDIR; + break; + default: + data->rpc_status = -ENOTDIR; + } + renew_lease(data->o_res.server, data->timestamp); + if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) + nfs_confirm_seqid(&data->owner->so_seqid, 0); + } + data->rpc_done = 1; +} + +static void nfs4_open_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; + + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0 || !data->rpc_done) + goto out_free; + /* In case we need an open_confirm, no cleanup! */ + if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) + goto out_free; + state = nfs4_opendata_to_nfs4_state(data); + if (!IS_ERR(state)) + nfs4_close_state(&data->path, state, data->o_arg.fmode); +out_free: + nfs4_opendata_put(data); +} + +static const struct rpc_call_ops nfs4_open_ops = { + .rpc_call_prepare = nfs4_open_prepare, + .rpc_call_done = nfs4_open_done, + .rpc_release = nfs4_open_release, +}; + +static const struct rpc_call_ops nfs4_recover_open_ops = { + .rpc_call_prepare = nfs4_recover_open_prepare, + .rpc_call_done = nfs4_open_done, + .rpc_release = nfs4_open_release, +}; + +static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN], + .rpc_argp = o_arg, + .rpc_resp = o_res, + .rpc_cred = data->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_open_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int status; + + kref_get(&data->kref); + data->rpc_done = 0; + data->rpc_status = 0; + data->cancelled = 0; + if (isrecover) + task_setup_data.callback_ops = &nfs4_recover_open_ops; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); + + return status; +} + +static int _nfs4_recover_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = 
data->dir->d_inode; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 1); + if (status != 0 || !data->rpc_done) + return status; + + nfs_refresh_inode(dir, o_res->dir_attr); + + if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(data); + if (status != 0) + return status; + } + + return status; +} + +/* + * Note: On error, nfs4_proc_open will free the struct nfs4_opendata + */ +static int _nfs4_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 0); + if (status != 0 || !data->rpc_done) + return status; + + if (o_arg->open_flags & O_CREAT) { + update_changeattr(dir, &o_res->cinfo); + nfs_post_op_update_inode(dir, o_res->dir_attr); + } else + nfs_refresh_inode(dir, o_res->dir_attr); + if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) + server->caps &= ~NFS_CAP_POSIX_LOCK; + if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(data); + if (status != 0) + return status; + } + if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) + _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr); + return 0; +} + +static int nfs4_client_recover_expired_lease(struct nfs_client *clp) +{ + unsigned int loop; + int ret; + + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = nfs4_wait_clnt_recover(clp); + if (ret != 0) + break; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) + break; + nfs4_schedule_state_manager(clp); + ret = -EIO; + } + return ret; +} + +static int nfs4_recover_expired_lease(struct nfs_server *server) +{ + return nfs4_client_recover_expired_lease(server->nfs_client); +} + +/* + * OPEN_EXPIRED: + * reclaim state on the server after a network partition. + * Assumes caller holds the appropriate lock + */ +static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs4_opendata *opendata; + int ret; + + opendata = nfs4_open_recoverdata_alloc(ctx, state); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + ret = nfs4_open_recover(opendata, state); + if (ret == -ESTALE) + d_drop(ctx->path.dentry); + nfs4_opendata_put(opendata); + return ret; +} + +static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs4_open_expired(ctx, state); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } + } while (exception.retry); +out: + return err; +} + +static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + int ret; + + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = nfs4_do_open_expired(ctx, state); + put_nfs_open_context(ctx); + return ret; +} + +/* + * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* + * fields corresponding to attributes that were used to store the verifier. 
+ * Make sure we clobber those fields in the later setattr call + */ +static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr) +{ + if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) && + !(sattr->ia_valid & ATTR_ATIME_SET)) + sattr->ia_valid |= ATTR_ATIME; + + if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) && + !(sattr->ia_valid & ATTR_MTIME_SET)) + sattr->ia_valid |= ATTR_MTIME; +} + +/* + * Returns a referenced nfs4_state + */ +static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) +{ + struct nfs4_state_owner *sp; + struct nfs4_state *state = NULL; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *opendata; + int status; + + /* Protect against reboot recovery conflicts */ + status = -ENOMEM; + if (!(sp = nfs4_get_state_owner(server, cred))) { + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } + status = nfs4_recover_expired_lease(server); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) + nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); + status = -ENOMEM; + opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL); + if (opendata == NULL) + goto err_put_state_owner; + + if (path->dentry->d_inode != NULL) + opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); + + status = _nfs4_proc_open(opendata); + if (status != 0) + goto err_opendata_put; + + state = nfs4_opendata_to_nfs4_state(opendata); + status = PTR_ERR(state); + if (IS_ERR(state)) + goto err_opendata_put; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + + if (opendata->o_arg.open_flags & O_EXCL) { + nfs4_exclusive_attrset(opendata, sattr); + + nfs_fattr_init(opendata->o_res.f_attr); + status = nfs4_do_setattr(state->inode, cred, + opendata->o_res.f_attr, sattr, + state); + if (status == 0) + nfs_setattr_update_inode(state->inode, sattr); + nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + } + nfs_revalidate_inode(server, state->inode); + nfs4_opendata_put(opendata); + nfs4_put_state_owner(sp); + *res = state; + return 0; +err_opendata_put: + nfs4_opendata_put(opendata); +err_put_state_owner: + nfs4_put_state_owner(sp); +out_err: + *res = NULL; + return status; +} + + +static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) +{ + struct nfs4_exception exception = { }; + struct nfs4_state *res; + int status; + + do { + status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res); + if (status == 0) + break; + /* NOTE: BAD_SEQID means the server and client disagree about the + * book-keeping w.r.t. state-changing operations + * (OPEN/CLOSE/LOCK/LOCKU...) + * It is actually a sign of a bug on the client or on the server. + * + * If we receive a BAD_SEQID error in the particular case of + * doing an OPEN, we assume that nfs_increment_open_seqid() will + * have unhashed the old state_owner for us, and that we can + * therefore safely retry using a new one. We should still warn + * the user though... 
+ */ + if (status == -NFS4ERR_BAD_SEQID) { + printk(KERN_WARNING "NFS: v4 server %s " + " returned a bad sequence-id error!\n", + NFS_SERVER(dir)->nfs_client->cl_hostname); + exception.retry = 1; + continue; + } + /* + * BAD_STATEID on OPEN means that the server cancelled our + * state before it received the OPEN_CONFIRM. + * Recover by retrying the request as per the discussion + * on Page 181 of RFC3530. + */ + if (status == -NFS4ERR_BAD_STATEID) { + exception.retry = 1; + continue; + } + if (status == -EAGAIN) { + /* We must have found a delegation */ + exception.retry = 1; + continue; + } + res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + status, &exception)); + } while (exception.retry); + return res; +} + +static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = sattr, + .server = server, + .bitmask = server->attr_bitmask, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long timestamp = jiffies; + int status; + + nfs_fattr_init(fattr); + + if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Use that stateid */ + } else if (state != NULL) { + nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); + } else + memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (status == 0 && state != NULL) + renew_lease(server, timestamp); + return status; +} + +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_exception exception = { + .state = state, + .inode = inode, + }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_do_setattr(inode, cred, fattr, sattr, state), + &exception); + } while (exception.retry); + return err; +} + +struct nfs4_closedata { + struct path path; + struct inode *inode; + struct nfs4_state *state; + struct nfs_closeargs arg; + struct nfs_closeres res; + struct nfs_fattr fattr; + unsigned long timestamp; + bool roc; + u32 roc_barrier; +}; + +static void nfs4_free_closedata(void *data) +{ + struct nfs4_closedata *calldata = data; + struct nfs4_state_owner *sp = calldata->state->owner; + + if (calldata->roc) + pnfs_roc_release(calldata->state->inode); + nfs4_put_open_state(calldata->state); + nfs_free_seqid(calldata->arg.seqid); + nfs4_put_state_owner(sp); + path_put(&calldata->path); + kfree(calldata); +} + +static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, + fmode_t fmode) +{ + spin_lock(&state->owner->so_lock); + if (!(fmode & FMODE_READ)) + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + if (!(fmode & FMODE_WRITE)) + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + spin_unlock(&state->owner->so_lock); +} + +static void nfs4_close_done(struct rpc_task *task, void *data) +{ + struct nfs4_closedata *calldata = data; + struct nfs4_state *state = calldata->state; + struct nfs_server *server = NFS_SERVER(calldata->inode); + + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) + return; + /* hmm. 
we are done with the inode, and in the process of freeing + * the state_owner. we keep this around to process errors + */ + switch (task->tk_status) { + case 0: + if (calldata->roc) + pnfs_roc_set_barrier(state->inode, + calldata->roc_barrier); + nfs_set_open_stateid(state, &calldata->res.stateid, 0); + renew_lease(server, calldata->timestamp); + nfs4_close_clear_stateid_flags(state, + calldata->arg.fmode); + break; + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + if (calldata->arg.fmode == 0) + break; + default: + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); + nfs_refresh_inode(calldata->inode, calldata->res.fattr); +} + +static void nfs4_close_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_closedata *calldata = data; + struct nfs4_state *state = calldata->state; + int call_close = 0; + + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) + return; + + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; + calldata->arg.fmode = FMODE_READ|FMODE_WRITE; + spin_lock(&state->owner->so_lock); + /* Calculate the change in open mode */ + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) { + call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + calldata->arg.fmode &= ~FMODE_READ; + } + if (state->n_wronly == 0) { + call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + calldata->arg.fmode &= ~FMODE_WRITE; + } + } + spin_unlock(&state->owner->so_lock); + + if (!call_close) { + /* Note: exit _without_ calling nfs4_close_done */ + task->tk_action = NULL; + return; + } + + if (calldata->arg.fmode == 0) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + if (calldata->roc && + pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { + rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq, + task, NULL); + return; + } + } + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; + if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; + rpc_call_start(task); +} + +static const struct rpc_call_ops nfs4_close_ops = { + .rpc_call_prepare = nfs4_close_prepare, + .rpc_call_done = nfs4_close_done, + .rpc_release = nfs4_free_closedata, +}; + +/* + * It is possible for data to be read/written from a mem-mapped file + * after the sys_close call (which hits the vfs layer as a flush). + * This means that we can't safely call nfsv4 close on a file until + * the inode is cleared. This in turn means that we are not good + * NFSv4 citizens - we do not indicate to the server to update the file's + * share state even when we are done with one of the three share + * stateid's in the inode. + * + * NOTE: Caller must be holding the sp->so_owner semaphore! 
+ */ +int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_closedata *calldata; + struct nfs4_state_owner *sp = state->owner; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], + .rpc_cred = state->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_close_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int status = -ENOMEM; + + calldata = kzalloc(sizeof(*calldata), gfp_mask); + if (calldata == NULL) + goto out; + calldata->inode = state->inode; + calldata->state = state; + calldata->arg.fh = NFS_FH(state->inode); + calldata->arg.stateid = &state->open_stateid; + /* Serialization for the sequence id */ + calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); + if (calldata->arg.seqid == NULL) + goto out_free_calldata; + calldata->arg.fmode = 0; + calldata->arg.bitmask = server->cache_consistency_bitmask; + calldata->res.fattr = &calldata->fattr; + calldata->res.seqid = calldata->arg.seqid; + calldata->res.server = server; + calldata->roc = roc; + path_get(path); + calldata->path = *path; + + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = 0; + if (wait) + status = rpc_wait_for_completion_task(task); + rpc_put_task(task); + return status; +out_free_calldata: + kfree(calldata); +out: + if (roc) + pnfs_roc_release(state->inode); + nfs4_put_open_state(state); + nfs4_put_state_owner(sp); + return status; +} + +static struct inode * +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) +{ + struct nfs4_state *state; + + /* Protect against concurrent sillydeletes */ + state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred); + if (IS_ERR(state)) + return ERR_CAST(state); + ctx->state = state; + return igrab(state->inode); +} + +static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) +{ + if (ctx->state == NULL) + return; + if (is_sync) + nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); + else + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); +} + +static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) +{ + struct nfs4_server_caps_arg args = { + .fhandle = fhandle, + }; + struct nfs4_server_caps_res res = {}; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (status == 0) { + memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| + NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| + NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| + NFS_CAP_CTIME|NFS_CAP_MTIME); + if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) + server->caps |= NFS_CAP_ACLS; + if (res.has_links != 0) + server->caps |= NFS_CAP_HARDLINKS; + if (res.has_symlinks != 0) + server->caps |= NFS_CAP_SYMLINKS; + if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) + server->caps |= NFS_CAP_FILEID; + if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) + server->caps |= NFS_CAP_MODE; + if 
(res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) + server->caps |= NFS_CAP_NLINK; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) + server->caps |= NFS_CAP_OWNER; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) + server->caps |= NFS_CAP_OWNER_GROUP; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) + server->caps |= NFS_CAP_ATIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) + server->caps |= NFS_CAP_CTIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) + server->caps |= NFS_CAP_MTIME; + + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); + server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; + server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + server->acl_bitmask = res.acl_bitmask; + } + + return status; +} + +int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_server_capabilities(server, fhandle), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs4_lookup_root_arg args = { + .bitmask = nfs4_fattr_bitmap, + }; + struct nfs4_lookup_res res = { + .server = server, + .fattr = info->fattr, + .fh = fhandle, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP_ROOT], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(info->fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_lookup_root(server, fhandle, info); + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + break; + default: + err = nfs4_handle_exception(server, err, &exception); + } + } while (exception.retry); + return err; +} + +static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, rpc_authflavor_t flavor) +{ + struct rpc_auth *auth; + int ret; + + auth = rpcauth_create(flavor, server->client); + if (!auth) { + ret = -EIO; + goto out; + } + ret = nfs4_lookup_root(server, fhandle, info); +out: + return ret; +} + +static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int i, len, status = 0; + rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS]; + + len = gss_mech_list_pseudoflavors(&flav_array[0]); + flav_array[len] = RPC_AUTH_NULL; + len += 1; + + for (i = 0; i < len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } + /* + * -EACCES could mean that the user doesn't have correct permissions + * to access the mount. It could also mean that we tried to mount + * with a gss auth flavor, but rpc.gssd isn't running. Either way, + * existing mount programs don't handle -EACCES very well so it should + * be mapped to -EPERM instead.
+ */ + if (status == -EACCES) + status = -EPERM; + return status; +} + +/* + * get the file handle for the "/" directory on the server + */ +static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status = nfs4_lookup_root(server, fhandle, info); + if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) + /* + * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM + * by nfs4_map_errors() as this function exits. + */ + status = nfs4_find_root_sec(server, fhandle, info); + if (status == 0) + status = nfs4_server_capabilities(server, fhandle); + if (status == 0) + status = nfs4_do_fsinfo(server, fhandle, info); + return nfs4_map_errors(status); +} + +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); +/* + * Get locations and (maybe) other attributes of a referral. + * Note that we'll actually follow the referral later when + * we detect fsid mismatch in inode revalidation + */ +static int nfs4_get_referral(struct inode *dir, const struct qstr *name, + struct nfs_fattr *fattr, struct nfs_fh *fhandle) +{ + int status = -ENOMEM; + struct page *page = NULL; + struct nfs4_fs_locations *locations = NULL; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out; + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (locations == NULL) + goto out; + + status = nfs4_proc_fs_locations(dir, name, locations, page); + if (status != 0) + goto out; + /* Make sure server returned a different fsid for the referral */ + if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { + dprintk("%s: server did not return a different fsid for" + " a referral at %s\n", __func__, name->name); + status = -EIO; + goto out; + } + /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ + nfs_fixup_referral_attributes(&locations->fattr); + + /* replace the lookup nfs_fattr with the locations nfs_fattr */ + memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); + memset(fhandle, 0, sizeof(struct nfs_fh)); +out: + if (page) + __free_page(page); + kfree(locations); + return status; +} + +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs4_getattr_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_getattr_res res = { + .fattr = fattr, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_proc_getattr(server, fhandle, fattr), + &exception); + } while (exception.retry); + return err; +} + +/* + * The file is not closed if it is opened due to a request to change + * the size of the file. The open call will not be needed once the + * VFS layer lookup-intents are implemented. + * + * Close is called when the inode is destroyed. + * If we haven't opened the file for O_WRONLY, we + * need to in the size_change case to obtain a stateid. + * + * Got race? + * Because OPEN is always done by name in nfsv4, it is + * possible that we opened a different file by the same + * name.
We can recognize this race condition, but we + * can't do anything about it besides returning an error. + * + * This will be fixed with VFS changes (lookup-intent). + */ +static int +nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = dentry->d_inode; + struct rpc_cred *cred = NULL; + struct nfs4_state *state = NULL; + int status; + + if (pnfs_ld_layoutret_on_setattr(inode)) + pnfs_return_layout(inode); + + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file */ + if (sattr->ia_valid & ATTR_FILE) { + struct nfs_open_context *ctx; + + ctx = nfs_file_open_context(sattr->ia_file); + if (ctx) { + cred = ctx->cred; + state = ctx->state; + } + } + + status = nfs4_do_setattr(inode, cred, fattr, sattr, state); + if (status == 0) + nfs_setattr_update_inode(inode, sattr); + return status; +} + +static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, + const struct nfs_fh *dirfh, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + int status; + struct nfs4_lookup_arg args = { + .bitmask = server->attr_bitmask, + .dir_fh = dirfh, + .name = name, + }; + struct nfs4_lookup_res res = { + .server = server, + .fattr = fattr, + .fh = fhandle, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(fattr); + + dprintk("NFS call lookupfh %s\n", name->name); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS reply lookupfh: %d\n", status); + return status; +} + +static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, + struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); + /* FIXME: !!!! 
*/ + if (err == -NFS4ERR_MOVED) { + err = -EREMOTE; + break; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + int status; + + dprintk("NFS call lookup %s\n", name->name); + status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); + if (status == -NFS4ERR_MOVED) + status = nfs4_get_referral(dir, name, fattr, fhandle); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr, struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(struct nfs_fh)); + fattr->fsid.major = 1; + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_FSID | NFS_ATTR_FATTR_MOUNTPOINT; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), + &exception); + if (err == -EPERM) + nfs_fixup_secinfo_attributes(fattr, fhandle); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_accessargs args = { + .fh = NFS_FH(inode), + .bitmask = server->attr_bitmask, + }; + struct nfs4_accessres res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = entry->cred, + }; + int mode = entry->mask; + int status; + + /* + * Determine which access bits we want to ask for... + */ + if (mode & MAY_READ) + args.access |= NFS4_ACCESS_READ; + if (S_ISDIR(inode->i_mode)) { + if (mode & MAY_WRITE) + args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE; + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_LOOKUP; + } else { + if (mode & MAY_WRITE) + args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND; + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_EXECUTE; + } + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (!status) { + entry->mask = 0; + if (res.access & NFS4_ACCESS_READ) + entry->mask |= MAY_READ; + if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + nfs_refresh_inode(inode, res.fattr); + } + nfs_free_fattr(res.fattr); + return status; +} + +static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_proc_access(inode, entry), + &exception); + } while (exception.retry); + return err; +} + +/* + * TODO: For the time being, we don't try to get any attributes + * along with any of the zero-copy operations READ, READDIR, + * READLINK, WRITE. 
+ * + * In the case of the first three, we want to put the GETATTR + * after the read-type operation -- this is because it is hard + * to predict the length of a GETATTR response in v4, and thus + * align the READ data correctly. This means that the GETATTR + * may end up partially falling into the page cache, and we should + * shift it into the 'tail' of the xdr_buf before processing. + * To do this efficiently, we need to know the total length + * of data received, which doesn't seem to be available outside + * of the RPC layer. + * + * In the case of WRITE, we also want to put the GETATTR after + * the operation -- in this case because we want to make sure + * we get the post-operation mtime and size. This means that + * we can't use xdr_encode_pages() as written: we need a variant + * of it which would leave room in the 'tail' iovec. + * + * Both of these changes to the XDR layer would in fact be quite + * minor, but I decided to leave them for a subsequent patch. + */ +static int _nfs4_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs4_readlink args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, + .pglen = pglen, + .pages = &page, + }; + struct nfs4_readlink_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_proc_readlink(inode, page, pgbase, pglen), + &exception); + } while (exception.retry); + return err; +} + +/* + * Got race? + * We will need to arrange for the VFS layer to provide an atomic open. + * Until then, this create/open method is prone to inefficiency and race + * conditions due to the lookup, create, and open VFS calls from sys_open() + * placed on the wire. + * + * Given the above sorry state of affairs, I'm simply sending an OPEN. + * The file will be opened again in the subsequent VFS open call + * (nfs4_proc_file_open). + * + * The open for read will just hang around to be used by any process that + * opens the file O_RDONLY. This will all be resolved with the VFS changes. 
+ */ + +static int +nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags, struct nfs_open_context *ctx) +{ + struct path my_path = { + .dentry = dentry, + }; + struct path *path = &my_path; + struct nfs4_state *state; + struct rpc_cred *cred = NULL; + fmode_t fmode = 0; + int status = 0; + + if (ctx != NULL) { + cred = ctx->cred; + path = &ctx->path; + fmode = ctx->mode; + } + sattr->ia_mode &= ~current_umask(); + state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); + d_drop(dentry); + if (IS_ERR(state)) { + status = PTR_ERR(state); + goto out; + } + d_add(dentry, igrab(state->inode)); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (ctx != NULL) + ctx->state = state; + else + nfs4_close_sync(path, state, fmode); +out: + return status; +} + +static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_removeargs args = { + .fh = NFS_FH(dir), + .name.len = name->len, + .name.name = name->name, + .bitmask = server->attr_bitmask, + }; + struct nfs_removeres res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + if (status == 0) { + update_changeattr(dir, &res.cinfo); + nfs_post_op_update_inode(dir, res.dir_attr); + } + nfs_free_fattr(res.dir_attr); +out: + return status; +} + +static int nfs4_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_remove(dir, name), + &exception); + } while (exception.retry); + return err; +} + +static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_removeargs *args = msg->rpc_argp; + struct nfs_removeres *res = msg->rpc_resp; + + args->bitmask = server->cache_consistency_bitmask; + res->server = server; + res->seq_res.sr_slot = NULL; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; +} + +static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) +{ + struct nfs_removeres *res = task->tk_msg.rpc_resp; + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); + return 1; +} + +static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_renameargs *arg = msg->rpc_argp; + struct nfs_renameres *res = msg->rpc_resp; + + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; + arg->bitmask = server->attr_bitmask; + res->server = server; +} + +static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res = task->tk_msg.rpc_resp; + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + + update_changeattr(old_dir, &res->old_cinfo); + nfs_post_op_update_inode(old_dir, res->old_fattr); + update_changeattr(new_dir, &res->new_cinfo); + nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; +} + +static int 
_nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) +{ + struct nfs_server *server = NFS_SERVER(old_dir); + struct nfs_renameargs arg = { + .old_dir = NFS_FH(old_dir), + .new_dir = NFS_FH(new_dir), + .old_name = old_name, + .new_name = new_name, + .bitmask = server->attr_bitmask, + }; + struct nfs_renameres res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + res.old_fattr = nfs_alloc_fattr(); + res.new_fattr = nfs_alloc_fattr(); + if (res.old_fattr == NULL || res.new_fattr == NULL) + goto out; + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (!status) { + update_changeattr(old_dir, &res.old_cinfo); + nfs_post_op_update_inode(old_dir, res.old_fattr); + update_changeattr(new_dir, &res.new_cinfo); + nfs_post_op_update_inode(new_dir, res.new_fattr); + } +out: + nfs_free_fattr(res.new_fattr); + nfs_free_fattr(res.old_fattr); + return status; +} + +static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(old_dir), + _nfs4_proc_rename(old_dir, old_name, + new_dir, new_name), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_link_arg arg = { + .fh = NFS_FH(inode), + .dir_fh = NFS_FH(dir), + .name = name, + .bitmask = server->attr_bitmask, + }; + struct nfs4_link_res res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status = -ENOMEM; + + res.fattr = nfs_alloc_fattr(); + res.dir_attr = nfs_alloc_fattr(); + if (res.fattr == NULL || res.dir_attr == NULL) + goto out; + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (!status) { + update_changeattr(dir, &res.cinfo); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); + } +out: + nfs_free_fattr(res.dir_attr); + nfs_free_fattr(res.fattr); + return status; +} + +static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_proc_link(inode, dir, name), + &exception); + } while (exception.retry); + return err; +} + +struct nfs4_createdata { + struct rpc_message msg; + struct nfs4_create_arg arg; + struct nfs4_create_res res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_fattr; +}; + +static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, + struct qstr *name, struct iattr *sattr, u32 ftype) +{ + struct nfs4_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + struct nfs_server *server = NFS_SERVER(dir); + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->arg.dir_fh = NFS_FH(dir); + data->arg.server = server; + data->arg.name = name; + data->arg.attrs = sattr; + data->arg.ftype = ftype; + data->arg.bitmask = server->attr_bitmask; + data->res.server = server; + data->res.fh = &data->fh; + 
data->res.fattr = &data->fattr; + data->res.dir_fattr = &data->dir_fattr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_fattr); + } + return data; +} + +static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) +{ + int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); + if (status == 0) { + update_changeattr(dir, &data->res.dir_cinfo); + nfs_post_op_update_inode(dir, data->res.dir_fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + } + return status; +} + +static void nfs4_free_createdata(struct nfs4_createdata *data) +{ + kfree(data); +} + +static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, + struct page *page, unsigned int len, struct iattr *sattr) +{ + struct nfs4_createdata *data; + int status = -ENAMETOOLONG; + + if (len > NFS4_MAXPATHLEN) + goto out; + + status = -ENOMEM; + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; + data->arg.u.symlink.pages = &page; + data->arg.u.symlink.len = len; + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: + return status; +} + +static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, + struct page *page, unsigned int len, struct iattr *sattr) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_symlink(dir, dentry, page, + len, sattr), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr) +{ + struct nfs4_createdata *data; + int status = -ENOMEM; + + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); + if (data == NULL) + goto out; + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: + return status; +} + +static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr) +{ + struct nfs4_exception exception = { }; + int err; + + sattr->ia_mode &= ~current_umask(); + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_mkdir(dir, dentry, sattr), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct inode *dir = dentry->d_inode; + struct nfs4_readdir_arg args = { + .fh = NFS_FH(dir), + .pages = pages, + .pgbase = 0, + .count = count, + .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .plus = plus, + }; + struct nfs4_readdir_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __func__, + dentry->d_parent->d_name.name, + dentry->d_name.name, + (unsigned long long)cookie); + nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); + res.pgbase = args.pgbase; + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + if (status >= 0) { + memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + status += args.pgbase; + } + + nfs_invalidate_atime(dir); + + dprintk("%s: returns %d\n", __func__, status); + return status; +} + +static int 
nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), + _nfs4_proc_readdir(dentry, cred, cookie, + pages, count, plus), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, dev_t rdev) +{ + struct nfs4_createdata *data; + int mode = sattr->ia_mode; + int status = -ENOMEM; + + BUG_ON(!(sattr->ia_valid & ATTR_MODE)); + BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); + + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); + if (data == NULL) + goto out; + + if (S_ISFIFO(mode)) + data->arg.ftype = NF4FIFO; + else if (S_ISBLK(mode)) { + data->arg.ftype = NF4BLK; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); + } + else if (S_ISCHR(mode)) { + data->arg.ftype = NF4CHR; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); + } + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: + return status; +} + +static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, dev_t rdev) +{ + struct nfs4_exception exception = { }; + int err; + + sattr->ia_mode &= ~current_umask(); + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_mknod(dir, dentry, sattr, rdev), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *fsstat) +{ + struct nfs4_statfs_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_statfs_res res = { + .fsstat = fsstat, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(fsstat->fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_proc_statfs(server, fhandle, fsstat), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *fsinfo) +{ + struct nfs4_fsinfo_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_fsinfo_res res = { + .fsinfo = fsinfo, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_do_fsinfo(server, fhandle, fsinfo), + &exception); + } while (exception.retry); + return err; +} + +static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) +{ + nfs_fattr_init(fsinfo->fattr); + return nfs4_do_fsinfo(server, fhandle, fsinfo); +} + +static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct 
nfs_pathconf *pathconf) +{ + struct nfs4_pathconf_arg args = { + .fh = fhandle, + .bitmask = server->attr_bitmask, + }; + struct nfs4_pathconf_res res = { + .pathconf = pathconf, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + /* None of the pathconf attributes are mandatory to implement */ + if ((args.bitmask[0] & nfs4_pathconf_bitmap[0]) == 0) { + memset(pathconf, 0, sizeof(*pathconf)); + return 0; + } + + nfs_fattr_init(pathconf->fattr); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *pathconf) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_pathconf(server, fhandle, pathconf), + &exception); + } while (exception.retry); + return err; +} + +void __nfs4_read_done_cb(struct nfs_read_data *data) +{ + nfs_invalidate_atime(data->inode); +} + +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) +{ + struct nfs_server *server = NFS_SERVER(data->inode); + + if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { + nfs_restart_rpc(task, server->nfs_client); + return -EAGAIN; + } + + __nfs4_read_done_cb(data); + if (task->tk_status > 0) + renew_lease(server, data->timestamp); + return 0; +} + +static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + + return data->read_done_cb ? data->read_done_cb(task, data) : + nfs4_read_done_cb(task, data); +} + +static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +{ + data->timestamp = jiffies; + data->read_done_cb = nfs4_read_done_cb; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; +} + +/* Reset the nfs_read_data to send the read to the MDS. */ +void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + /* offsets will differ in the dense stripe case */ + data->args.offset = data->mds_offset; + data->ds_clp = NULL; + data->args.fh = NFS_FH(data->inode); + data->read_done_cb = nfs4_read_done_cb; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_read); + +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) +{ + struct inode *inode = data->inode; + + if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } + if (task->tk_status >= 0) { + renew_lease(NFS_SERVER(inode), data->timestamp); + nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); + } + return 0; +} + +static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->write_done_cb ? data->write_done_cb(task, data) : + nfs4_write_done_cb(task, data); +} + +/* Reset the nfs_write_data to send the write to the MDS.
*/ +void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + data->ds_clp = NULL; + data->write_done_cb = nfs4_write_done_cb; + data->args.fh = NFS_FH(data->inode); + data->args.bitmask = data->res.server->cache_consistency_bitmask; + data->args.offset = data->mds_offset; + data->res.fattr = &data->fattr; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_write); + +static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + struct nfs_server *server = NFS_SERVER(data->inode); + + if (data->lseg) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) + data->write_done_cb = nfs4_write_done_cb; + data->res.server = server; + data->timestamp = jiffies; + + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; +} + +static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data) +{ + struct inode *inode = data->inode; + + if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } + nfs_refresh_inode(inode, data->res.fattr); + return 0; +} + +static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->write_done_cb(task, data); +} + +static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + struct nfs_server *server = NFS_SERVER(data->inode); + + if (data->lseg) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) + data->write_done_cb = nfs4_commit_done_cb; + data->res.server = server; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; +} + +struct nfs4_renewdata { + struct nfs_client *client; + unsigned long timestamp; +}; + +/* + * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special + * standalone procedure for queueing an asynchronous RENEW. + */ +static void nfs4_renew_release(void *calldata) +{ + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); + kfree(data); +} + +static void nfs4_renew_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; + unsigned long timestamp = data->timestamp; + + if (task->tk_status < 0) { + /* Unless we're shutting down, schedule state recovery! 
*/ + if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) + nfs4_schedule_lease_recovery(clp); + return; + } + do_renew_lease(clp, timestamp); +} + +static const struct rpc_call_ops nfs4_renew_ops = { + .rpc_call_done = nfs4_renew_done, + .rpc_release = nfs4_renew_release, +}; + +int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], + .rpc_argp = clp, + .rpc_cred = cred, + }; + struct nfs4_renewdata *data; + + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + data->client = clp; + data->timestamp = jiffies; + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, + &nfs4_renew_ops, data); +} + +int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], + .rpc_argp = clp, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + if (status < 0) + return status; + do_renew_lease(clp, now); + return 0; +} + +static inline int nfs4_server_supports_acls(struct nfs_server *server) +{ + return (server->caps & NFS_CAP_ACLS) + && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) + && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); +} + +/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE, and that + * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE) bytes on + * the stack. + */ +#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) + +static void buf_to_pages(const void *buf, size_t buflen, + struct page **pages, unsigned int *pgbase) +{ + const void *p = buf; + + *pgbase = offset_in_page(buf); + p -= *pgbase; + while (p < buf + buflen) { + *(pages++) = virt_to_page(p); + p += PAGE_CACHE_SIZE; + } +} + +static int buf_to_pages_noslab(const void *buf, size_t buflen, + struct page **pages, unsigned int *pgbase) +{ + struct page *newpage, **spages; + int rc = 0; + size_t len; + spages = pages; + + do { + len = min_t(size_t, PAGE_CACHE_SIZE, buflen); + newpage = alloc_page(GFP_KERNEL); + + if (newpage == NULL) + goto unwind; + memcpy(page_address(newpage), buf, len); + buf += len; + buflen -= len; + *pages++ = newpage; + rc++; + } while (buflen != 0); + + return rc; + +unwind: + for(; rc > 0; rc--) + __free_page(spages[rc-1]); + return -ENOMEM; +} + +struct nfs4_cached_acl { + int cached; + size_t len; + char data[0]; +}; + +static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + kfree(nfsi->nfs4_acl); + nfsi->nfs4_acl = acl; + spin_unlock(&inode->i_lock); +} + +static void nfs4_zap_acl_attr(struct inode *inode) +{ + nfs4_set_cached_acl(inode, NULL); +} + +static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs4_cached_acl *acl; + int ret = -ENOENT; + + spin_lock(&inode->i_lock); + acl = nfsi->nfs4_acl; + if (acl == NULL) + goto out; + if (buf == NULL) /* user is just asking for length */ + goto out_len; + if (acl->cached == 0) + goto out; + ret = -ERANGE; /* see getxattr(2) man page */ + if (acl->len > buflen) + goto out; + memcpy(buf, acl->data, acl->len); +out_len: + ret = acl->len; +out: + spin_unlock(&inode->i_lock); + return ret; +} + +static void nfs4_write_cached_acl(struct inode *inode, const char *buf, size_t 
acl_len) +{ + struct nfs4_cached_acl *acl; + + if (buf && acl_len <= PAGE_SIZE) { + acl = kmalloc(sizeof(*acl) + acl_len, GFP_KERNEL); + if (acl == NULL) + goto out; + acl->cached = 1; + memcpy(acl->data, buf, acl_len); + } else { + acl = kmalloc(sizeof(*acl), GFP_KERNEL); + if (acl == NULL) + goto out; + acl->cached = 0; + } + acl->len = acl_len; +out: + nfs4_set_cached_acl(inode, acl); +} + +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct page *pages[NFS4ACL_MAXPAGES]; + struct nfs_getaclargs args = { + .fh = NFS_FH(inode), + .acl_pages = pages, + .acl_len = buflen, + }; + struct nfs_getaclres res = { + .acl_len = buflen, + }; + void *resp_buf; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct page *localpage = NULL; + int ret; + + if (buflen < PAGE_SIZE) { + /* As long as we're doing a round trip to the server anyway, + * let's be prepared for a page of acl data. */ + localpage = alloc_page(GFP_KERNEL); + resp_buf = page_address(localpage); + if (localpage == NULL) + return -ENOMEM; + args.acl_pages[0] = localpage; + args.acl_pgbase = 0; + args.acl_len = PAGE_SIZE; + } else { + resp_buf = buf; + buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); + } + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); + if (ret) + goto out_free; + if (res.acl_len > args.acl_len) + nfs4_write_cached_acl(inode, NULL, res.acl_len); + else + nfs4_write_cached_acl(inode, resp_buf, res.acl_len); + if (buf) { + ret = -ERANGE; + if (res.acl_len > buflen) + goto out_free; + if (localpage) + memcpy(buf, resp_buf, res.acl_len); + } + ret = res.acl_len; +out_free: + if (localpage) + __free_page(localpage); + return ret; +} + +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + ssize_t ret; + do { + ret = __nfs4_get_acl_uncached(inode, buf, buflen); + if (ret >= 0) + break; + ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); + } while (exception.retry); + return ret; +} + +static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (!nfs4_server_supports_acls(server)) + return -EOPNOTSUPP; + ret = nfs_revalidate_inode(server, inode); + if (ret < 0) + return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); + ret = nfs4_read_cached_acl(inode, buf, buflen); + if (ret != -ENOENT) + return ret; + return nfs4_get_acl_uncached(inode, buf, buflen); +} + +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFS4ACL_MAXPAGES]; + struct nfs_setaclargs arg = { + .fh = NFS_FH(inode), + .acl_pages = pages, + .acl_len = buflen, + }; + struct nfs_setaclres res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int ret, i; + + if (!nfs4_server_supports_acls(server)) + return -EOPNOTSUPP; + i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); + if (i < 0) + return i; + nfs_inode_return_delegation(inode); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + + /* + * Free each page after tx, so the only ref left is + * held by the network stack + */ + for (; i > 0; i--) + 
put_page(pages[i-1]); + + /* + * Acl update can result in inode attribute update. + * so mark the attribute cache invalid. + */ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); + return ret; +} + +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + __nfs4_proc_set_acl(inode, buf, buflen), + &exception); + } while (exception.retry); + return err; +} + +static int +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state != NULL) + nfs_remove_bad_delegation(state->inode); + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) + nfs4_schedule_stateid_recovery(server, state); + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_STALE_CLIENTID: + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; +#if defined(CONFIG_NFS_V4_1) + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session\n", __func__, + task->tk_status); + nfs4_schedule_session_recovery(clp->cl_session); + task->tk_status = 0; + return -EAGAIN; +#endif /* CONFIG_NFS_V4_1 */ + case -NFS4ERR_DELAY: + nfs_inc_server_stats(server, NFSIOS_DELAY); + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + task->tk_status = 0; + return -EAGAIN; + case -NFS4ERR_RETRY_UNCACHED_REP: + case -NFS4ERR_OLD_STATEID: + task->tk_status = 0; + return -EAGAIN; + } + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; +wait_on_recovery: + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + task->tk_status = 0; + return -EAGAIN; +} + +int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) +{ + nfs4_verifier sc_verifier; + struct nfs4_setclientid setclientid = { + .sc_verifier = &sc_verifier, + .sc_prog = program, + .sc_cb_ident = clp->cl_cb_ident, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], + .rpc_argp = &setclientid, + .rpc_resp = res, + .rpc_cred = cred, + }; + __be32 *p; + int loop = 0; + int status; + + p = (__be32*)sc_verifier.data; + *p++ = htonl((u32)clp->cl_boot_time.tv_sec); + *p = htonl((u32)clp->cl_boot_time.tv_nsec); + + for(;;) { + setclientid.sc_name_len = scnprintf(setclientid.sc_name, + sizeof(setclientid.sc_name), "%s/%s %s %s %u", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_PROTO), + clp->cl_rpcclient->cl_auth->au_ops->au_name, + clp->cl_id_uniquifier); + setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, + sizeof(setclientid.sc_netid), + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_NETID)); + 
setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, + sizeof(setclientid.sc_uaddr), "%s.%u.%u", + clp->cl_ipaddr, port >> 8, port & 255); + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + if (status != -NFS4ERR_CLID_INUSE) + break; + if (loop != 0) { + ++clp->cl_id_uniquifier; + break; + } + ++loop; + ssleep(clp->cl_lease_time / HZ + 1); + } + return status; +} + +int nfs4_proc_setclientid_confirm(struct nfs_client *clp, + struct nfs4_setclientid_res *arg, + struct rpc_cred *cred) +{ + struct nfs_fsinfo fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], + .rpc_argp = arg, + .rpc_resp = &fsinfo, + .rpc_cred = cred, + }; + unsigned long now; + int status; + + now = jiffies; + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + if (status == 0) { + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = now; + spin_unlock(&clp->cl_lock); + } + return status; +} + +struct nfs4_delegreturndata { + struct nfs4_delegreturnargs args; + struct nfs4_delegreturnres res; + struct nfs_fh fh; + nfs4_stateid stateid; + unsigned long timestamp; + struct nfs_fattr fattr; + int rpc_status; +}; + +static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_delegreturndata *data = calldata; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + case 0: + renew_lease(data->res.server, data->timestamp); + break; + default: + if (nfs4_async_handle_error(task, data->res.server, NULL) == + -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } + } + data->rpc_status = task->tk_status; +} + +static void nfs4_delegreturn_release(void *calldata) +{ + kfree(calldata); +} + +#if defined(CONFIG_NFS_V4_1) +static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_delegreturndata *d_data; + + d_data = (struct nfs4_delegreturndata *)data; + + if (nfs4_setup_sequence(d_data->res.server, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs4_delegreturn_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs4_delegreturn_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs4_delegreturn_done, + .rpc_release = nfs4_delegreturn_release, +}; + +static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) +{ + struct nfs4_delegreturndata *data; + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_delegreturn_ops, + .flags = RPC_TASK_ASYNC, + }; + int status = 0; + + data = kzalloc(sizeof(*data), GFP_NOFS); + if (data == NULL) + return -ENOMEM; + data->args.fhandle = &data->fh; + data->args.stateid = &data->stateid; + data->args.bitmask = server->attr_bitmask; + nfs_copy_fh(&data->fh, NFS_FH(inode)); + memcpy(&data->stateid, stateid, sizeof(data->stateid)); + data->res.fattr = &data->fattr; + data->res.server = server; + nfs_fattr_init(data->res.fattr); + data->timestamp = jiffies; + data->rpc_status = 0; + + task_setup_data.callback_data = data; + msg.rpc_argp = 
&data->args; + msg.rpc_resp = &data->res; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (!issync) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = data->rpc_status; + if (status != 0) + goto out; + nfs_refresh_inode(inode, &data->fattr); +out: + rpc_put_task(task); + return status; +} + +int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); + switch (err) { + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + case 0: + return 0; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +#define NFS4_LOCK_MINTIMEOUT (1 * HZ) +#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) + +/* + * sleep, with exponential backoff, and retry the LOCK operation. + */ +static unsigned long +nfs4_set_lock_task_retry(unsigned long timeout) +{ + schedule_timeout_killable(timeout); + timeout <<= 1; + if (timeout > NFS4_LOCK_MAXTIMEOUT) + return NFS4_LOCK_MAXTIMEOUT; + return timeout; +} + +static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct inode *inode = state->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_lockt_args arg = { + .fh = NFS_FH(inode), + .fl = request, + }; + struct nfs_lockt_res res = { + .denied = request, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = state->owner->so_cred, + }; + struct nfs4_lock_state *lsp; + int status; + + arg.lock_owner.clientid = clp->cl_clientid; + status = nfs4_set_lock_state(state, request); + if (status != 0) + goto out; + lsp = request->fl_u.nfs4_fl.owner; + arg.lock_owner.id = lsp->ls_id.id; + arg.lock_owner.s_dev = server->s_dev; + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + switch (status) { + case 0: + request->fl_type = F_UNLCK; + break; + case -NFS4ERR_DENIED: + status = 0; + } + request->fl_ops->fl_release_private(request); +out: + return status; +} + +static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(NFS_SERVER(state->inode), + _nfs4_proc_getlk(state, cmd, request), + &exception); + } while (exception.retry); + return err; +} + +static int do_vfs_lock(struct file *file, struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(file, fl); + break; + default: + BUG(); + } + return res; +} + +struct nfs4_unlockdata { + struct nfs_locku_args arg; + struct nfs_locku_res res; + struct nfs4_lock_state *lsp; + struct nfs_open_context *ctx; + struct file_lock fl; + const struct nfs_server *server; + unsigned long timestamp; +}; + +static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *p; + struct inode *inode = lsp->ls_state->inode; + + p = kzalloc(sizeof(*p), GFP_NOFS); + if (p == NULL) + return NULL; + p->arg.fh = NFS_FH(inode); + 
p->arg.fl = &p->fl; + p->arg.seqid = seqid; + p->res.seqid = seqid; + p->arg.stateid = &lsp->ls_stateid; + p->lsp = lsp; + atomic_inc(&lsp->ls_count); + /* Ensure we don't close file until we're done freeing locks! */ + p->ctx = get_nfs_open_context(ctx); + memcpy(&p->fl, fl, sizeof(p->fl)); + p->server = NFS_SERVER(inode); + return p; +} + +static void nfs4_locku_release_calldata(void *data) +{ + struct nfs4_unlockdata *calldata = data; + nfs_free_seqid(calldata->arg.seqid); + nfs4_put_lock_state(calldata->lsp); + put_nfs_open_context(calldata->ctx); + kfree(calldata); +} + +static void nfs4_locku_done(struct rpc_task *task, void *data) +{ + struct nfs4_unlockdata *calldata = data; + + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) + return; + switch (task->tk_status) { + case 0: + memcpy(calldata->lsp->ls_stateid.data, + calldata->res.stateid.data, + sizeof(calldata->lsp->ls_stateid.data)); + renew_lease(calldata->server, calldata->timestamp); + break; + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + break; + default: + if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +} + +static void nfs4_locku_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_unlockdata *calldata = data; + + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) + return; + if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { + /* Note: exit _without_ running nfs4_locku_done */ + task->tk_action = NULL; + return; + } + calldata->timestamp = jiffies; + if (nfs4_setup_sequence(calldata->server, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} + +static const struct rpc_call_ops nfs4_locku_ops = { + .rpc_call_prepare = nfs4_locku_prepare, + .rpc_call_done = nfs4_locku_done, + .rpc_release = nfs4_locku_release_calldata, +}; + +static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(lsp->ls_state->inode), + .rpc_message = &msg, + .callback_ops = &nfs4_locku_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + + /* Ensure this is an unlock - when canceling a lock, the + * canceled lock is passed in, and it won't be an unlock. + */ + fl->fl_type = F_UNLCK; + + data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); + if (data == NULL) { + nfs_free_seqid(seqid); + return ERR_PTR(-ENOMEM); + } + + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + task_setup_data.callback_data = data; + return rpc_run_task(&task_setup_data); +} + +static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_seqid *seqid; + struct nfs4_lock_state *lsp; + struct rpc_task *task; + int status = 0; + unsigned char fl_flags = request->fl_flags; + + status = nfs4_set_lock_state(state, request); + /* Unlock _before_ we do the RPC call */ + request->fl_flags |= FL_EXISTS; + down_read(&nfsi->rwsem); + if (do_vfs_lock(request->fl_file, request) == -ENOENT) { + up_read(&nfsi->rwsem); + goto out; + } + up_read(&nfsi->rwsem); + if (status != 0) + goto out; + /* Is this a delegated lock? 
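If so, the lock was only ever recorded locally under the delegation, so there is no lock state on the server to release.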
*/ + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) + goto out; + lsp = request->fl_u.nfs4_fl.owner; + seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); + status = -ENOMEM; + if (seqid == NULL) + goto out; + task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); + status = PTR_ERR(task); + if (IS_ERR(task)) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + rpc_put_task(task); +out: + request->fl_flags = fl_flags; + return status; +} + +struct nfs4_lockdata { + struct nfs_lock_args arg; + struct nfs_lock_res res; + struct nfs4_lock_state *lsp; + struct nfs_open_context *ctx; + struct file_lock fl; + unsigned long timestamp; + int rpc_status; + int cancelled; + struct nfs_server *server; +}; + +static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, + struct nfs_open_context *ctx, struct nfs4_lock_state *lsp, + gfp_t gfp_mask) +{ + struct nfs4_lockdata *p; + struct inode *inode = lsp->ls_state->inode; + struct nfs_server *server = NFS_SERVER(inode); + + p = kzalloc(sizeof(*p), gfp_mask); + if (p == NULL) + return NULL; + + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; + p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); + if (p->arg.open_seqid == NULL) + goto out_free; + p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); + if (p->arg.lock_seqid == NULL) + goto out_free_seqid; + p->arg.lock_stateid = &lsp->ls_stateid; + p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; + p->arg.lock_owner.id = lsp->ls_id.id; + p->arg.lock_owner.s_dev = server->s_dev; + p->res.lock_seqid = p->arg.lock_seqid; + p->lsp = lsp; + p->server = server; + atomic_inc(&lsp->ls_count); + p->ctx = get_nfs_open_context(ctx); + memcpy(&p->fl, fl, sizeof(p->fl)); + return p; +out_free_seqid: + nfs_free_seqid(p->arg.open_seqid); +out_free: + kfree(p); + return NULL; +} + +static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + struct nfs4_state *state = data->lsp->ls_state; + + dprintk("%s: begin!\n", __func__); + if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) + return; + /* Do we need to do an open_to_lock_owner? 
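That is, is this the first LOCK request for this lock owner? If so, the request must also carry the open stateid and an open seqid.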
*/ + if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { + if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) + return; + data->arg.open_stateid = &state->stateid; + data->arg.new_lock_owner = 1; + data->res.open_seqid = data->arg.open_seqid; + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; + if (nfs4_setup_sequence(data->server, + &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); +} + +static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs4_lock_prepare(task, calldata); +} + +static void nfs4_lock_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + + dprintk("%s: begin!\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + data->rpc_status = task->tk_status; + if (data->arg.new_lock_owner != 0) { + if (data->rpc_status == 0) + nfs_confirm_seqid(&data->lsp->ls_seqid, 0); + else + goto out; + } + if (data->rpc_status == 0) { + memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, + sizeof(data->lsp->ls_stateid.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + } +out: + dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); +} + +static void nfs4_lock_release(void *calldata) +{ + struct nfs4_lockdata *data = calldata; + + dprintk("%s: begin!\n", __func__); + nfs_free_seqid(data->arg.open_seqid); + if (data->cancelled != 0) { + struct rpc_task *task; + task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, + data->arg.lock_seqid); + if (!IS_ERR(task)) + rpc_put_task_async(task); + dprintk("%s: cancelling lock!\n", __func__); + } else + nfs_free_seqid(data->arg.lock_seqid); + nfs4_put_lock_state(data->lsp); + put_nfs_open_context(data->ctx); + kfree(data); + dprintk("%s: done!\n", __func__); +} + +static const struct rpc_call_ops nfs4_lock_ops = { + .rpc_call_prepare = nfs4_lock_prepare, + .rpc_call_done = nfs4_lock_done, + .rpc_release = nfs4_lock_release, +}; + +static const struct rpc_call_ops nfs4_recover_lock_ops = { + .rpc_call_prepare = nfs4_recover_lock_prepare, + .rpc_call_done = nfs4_lock_done, + .rpc_release = nfs4_lock_release, +}; + +static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) +{ + switch (error) { + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + if (new_lock_owner != 0 || + (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + nfs4_schedule_stateid_recovery(server, lsp->ls_state); + break; + case -NFS4ERR_STALE_STATEID: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); + }; +} + +static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type) +{ + struct nfs4_lockdata *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK], + .rpc_cred = state->owner->so_cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(state->inode), + .rpc_message = &msg, + .callback_ops = &nfs4_lock_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + int ret; + + dprintk("%s: begin!\n", __func__); + data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), + 
fl->fl_u.nfs4_fl.owner, + recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS); + if (data == NULL) + return -ENOMEM; + if (IS_SETLKW(cmd)) + data->arg.block = 1; + if (recovery_type > NFS_LOCK_NEW) { + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + task_setup_data.callback_ops = &nfs4_recover_lock_ops; + } + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + task_setup_data.callback_data = data; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + ret = nfs4_wait_for_completion_rpc_task(task); + if (ret == 0) { + ret = data->rpc_status; + if (ret) + nfs4_handle_setlk_error(data->server, data->lsp, + data->arg.new_lock_owner, ret); + } else + data->cancelled = 1; + rpc_put_task(task); + dprintk("%s: done, ret = %d!\n", __func__, ret); + return ret; +} + +static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { + .inode = state->inode, + }; + int err; + + do { + /* Cache the lock if possible... */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) + return 0; + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { + .inode = state->inode, + }; + int err; + + err = nfs4_set_lock_state(state, request); + if (err != 0) + return err; + do { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) + return 0; + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } + } while (exception.retry); +out: + return err; +} + +static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + unsigned char fl_flags = request->fl_flags; + int status = -ENOLCK; + + if ((fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + goto out; + /* Is this a delegated open? */ + status = nfs4_set_lock_state(state, request); + if (status != 0) + goto out; + request->fl_flags |= FL_ACCESS; + status = do_vfs_lock(request->fl_file, request); + if (status < 0) + goto out; + down_read(&nfsi->rwsem); + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { + /* Yes: cache locks! */ + /* ...but avoid races with delegation recall... */ + request->fl_flags = fl_flags & ~FL_SLEEP; + status = do_vfs_lock(request->fl_file, request); + goto out_unlock; + } + status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); + if (status != 0) + goto out_unlock; + /* Note: we always want to sleep here! 
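The server has already granted the lock at this point, so the matching local VFS lock must be recorded even if we have to wait for it.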
*/ + request->fl_flags = fl_flags | FL_SLEEP; + if (do_vfs_lock(request->fl_file, request) < 0) + printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); +out_unlock: + up_read(&nfsi->rwsem); +out: + request->fl_flags = fl_flags; + return status; +} + +static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + struct nfs4_exception exception = { + .state = state, + .inode = state->inode, + }; + int err; + + do { + err = _nfs4_proc_setlk(state, cmd, request); + if (err == -NFS4ERR_DENIED) + err = -EAGAIN; + err = nfs4_handle_exception(NFS_SERVER(state->inode), + err, &exception); + } while (exception.retry); + return err; +} + +static int +nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) +{ + struct nfs_open_context *ctx; + struct nfs4_state *state; + unsigned long timeout = NFS4_LOCK_MINTIMEOUT; + int status; + + /* verify open state */ + ctx = nfs_file_open_context(filp); + state = ctx->state; + + if (request->fl_start < 0 || request->fl_end < 0) + return -EINVAL; + + if (IS_GETLK(cmd)) { + if (state != NULL) + return nfs4_proc_getlk(state, F_GETLK, request); + return 0; + } + + if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) + return -EINVAL; + + if (request->fl_type == F_UNLCK) { + if (state != NULL) + return nfs4_proc_unlck(state, cmd, request); + return 0; + } + + if (state == NULL) + return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + + do { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + timeout = nfs4_set_lock_task_retry(timeout); + status = -ERESTARTSYS; + if (signalled()) + break; + } while(status < 0); + return status; +} + +int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + + err = nfs4_set_lock_state(state, fl); + if (err != 0) + goto out; + do { + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + switch (err) { + default: + printk(KERN_ERR "%s: unhandled error %d.\n", + __func__, err); + case 0: + case -ESTALE: + goto out; + case -NFS4ERR_EXPIRED: + nfs4_schedule_stateid_recovery(server, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + nfs4_schedule_lease_recovery(server->nfs_client); + goto out; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_schedule_session_recovery(server->nfs_client->cl_session); + goto out; + case -ERESTARTSYS: + /* + * The show must go on: exit, but mark the + * stateid as needing recovery. + */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + nfs4_schedule_stateid_recovery(server, state); + err = 0; + goto out; + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. 
+ */ + err = 0; + goto out; + case -ENOMEM: + case -NFS4ERR_DENIED: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + err = 0; + goto out; + case -NFS4ERR_DELAY: + break; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); +out: + return err; +} + +static void nfs4_release_lockowner_release(void *calldata) +{ + kfree(calldata); +} + +const struct rpc_call_ops nfs4_release_lockowner_ops = { + .rpc_release = nfs4_release_lockowner_release, +}; + +void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) +{ + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_release_lockowner_args *args; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], + }; + + if (server->nfs_client->cl_mvops->minor_version != 0) + return; + args = kmalloc(sizeof(*args), GFP_NOFS); + if (!args) + return; + args->lock_owner.clientid = server->nfs_client->cl_clientid; + args->lock_owner.id = lsp->ls_id.id; + args->lock_owner.s_dev = server->s_dev; + msg.rpc_argp = args; + rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); +} + +#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + +static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) +{ + if (strcmp(key, "") != 0) + return -EINVAL; + + return nfs4_proc_set_acl(dentry->d_inode, buf, buflen); +} + +static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) +{ + if (strcmp(key, "") != 0) + return -EINVAL; + + return nfs4_proc_get_acl(dentry->d_inode, buf, buflen); +} + +static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) +{ + size_t len = sizeof(XATTR_NAME_NFSV4_ACL); + + if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) + return 0; + + if (list && len <= list_len) + memcpy(list, XATTR_NAME_NFSV4_ACL, len); + return len; +} + +/* + * nfs_fhget will use either the mounted_on_fileid or the fileid + */ +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) +{ + if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) || + (fattr->valid & NFS_ATTR_FATTR_FILEID)) && + (fattr->valid & NFS_ATTR_FATTR_FSID) && + (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) + return; + + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page) +{ + struct nfs_server *server = NFS_SERVER(dir); + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + .page = page, + .bitmask = bitmask, + }; + struct nfs4_fs_locations_res res = { + .fs_locations = fs_locations, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("%s: start\n", __func__); + + /* Ask for the fileid of the absent filesystem if mounted_on_fileid + * is not supported */ + if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + else + bitmask[0] |= FATTR4_WORD0_FILEID; + + nfs_fattr_init(&fs_locations->fattr); + fs_locations->server = server; + fs_locations->nlocations = 0; + 
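+ /* Note (added annotation): the caller-supplied page is used as the receive buffer for the locations data returned by the server. */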
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("%s: returned status = %d\n", __func__, status); + return status; +} + +static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + int status; + struct nfs4_secinfo_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + dprintk("NFS call secinfo %s\n", name->name); + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS reply secinfo: %d\n", status); + return status; +} + +int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_secinfo(dir, name, flavors), + &exception); + } while (exception.retry); + return err; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Check the exchange flags returned by the server for invalid flags, having + * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or + * DS flags set. + */ +static int nfs4_check_cl_exchange_flags(u32 flags) +{ + if (flags & ~EXCHGID4_FLAG_MASK_R) + goto out_inval; + if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && + (flags & EXCHGID4_FLAG_USE_NON_PNFS)) + goto out_inval; + if (!(flags & (EXCHGID4_FLAG_MASK_PNFS))) + goto out_inval; + return NFS_OK; +out_inval: + return -NFS4ERR_INVAL; +} + +/* + * nfs4_proc_exchange_id() + * + * Since the clientid has expired, all compounds using sessions + * associated with the stale clientid will be returning + * NFS4ERR_BADSESSION in the sequence operation, and will therefore + * be in some phase of session reset. 
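+ * Note that EXCHANGE_ID itself is not sent over a session, so it can be issued while the session is being re-established.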
+ */ +int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +{ + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .client = clp, + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, + }; + struct nfs41_exchange_id_res res = { + .client = clp, + }; + int status; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + __be32 *p; + + dprintk("--> %s\n", __func__); + BUG_ON(clp == NULL); + + p = (u32 *)verifier.data; + *p++ = htonl((u32)clp->cl_boot_time.tv_sec); + *p = htonl((u32)clp->cl_boot_time.tv_nsec); + args.verifier = &verifier; + + args.id_len = scnprintf(args.id, sizeof(args.id), + "%s/%s.%s/%u", + clp->cl_ipaddr, + init_utsname()->nodename, + init_utsname()->domainname, + clp->cl_rpcclient->cl_auth->au_flavor); + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + if (!status) + status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); + dprintk("<-- %s status= %d\n", __func__, status); + return status; +} + +struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; + struct nfs4_get_lease_time_res *res; + struct nfs_client *clp; +}; + +static void nfs4_get_lease_time_prepare(struct rpc_task *task, + void *calldata) +{ + int ret; + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + /* just setup sequence, do not trigger session recovery + since we're invoked within one */ + ret = nfs41_setup_sequence(data->clp->cl_session, + &data->args->la_seq_args, + &data->res->lr_seq_res, 0, task); + + BUG_ON(ret == -EAGAIN); + rpc_call_start(task); + dprintk("<-- %s\n", __func__); +} + +/* + * Called from nfs4_state_manager thread for session setup, so don't recover + * from sequence operation or clientid errors. 
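+ * Only NFS4ERR_DELAY, NFS4ERR_GRACE and uncached replay retries are handled here; all other errors are passed back to the caller.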
+ */ +static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) + return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); + rpc_delay(task, NFS4_POLL_RETRY_MIN); + task->tk_status = 0; + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: + nfs_restart_rpc(task, data->clp); + return; + } + dprintk("<-- %s\n", __func__); +} + +struct rpc_call_ops nfs4_get_lease_time_ops = { + .rpc_call_prepare = nfs4_get_lease_time_prepare, + .rpc_call_done = nfs4_get_lease_time_done, +}; + +int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) +{ + struct rpc_task *task; + struct nfs4_get_lease_time_args args; + struct nfs4_get_lease_time_res res = { + .lr_fsinfo = fsinfo, + }; + struct nfs4_get_lease_time_data data = { + .args = &args, + .res = &res, + .clp = clp, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_get_lease_time_ops, + .callback_data = &data, + .flags = RPC_TASK_TIMEOUT, + }; + int status; + + dprintk("--> %s\n", __func__); + task = rpc_run_task(&task_setup); + + if (IS_ERR(task)) + status = PTR_ERR(task); + else { + status = task->tk_status; + rpc_put_task(task); + } + dprintk("<-- %s return %d\n", __func__, status); + + return status; +} + +/* + * Reset a slot table + */ +static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, + int ivalue) +{ + struct nfs4_slot *new = NULL; + int i; + int ret = 0; + + dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, + max_reqs, tbl->max_slots); + + /* Does the newly negotiated max_reqs match the existing slot table? 
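If not, allocate a replacement slot array outside the lock and swap it in below.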
*/ + if (max_reqs != tbl->max_slots) { + ret = -ENOMEM; + new = kmalloc(max_reqs * sizeof(struct nfs4_slot), + GFP_NOFS); + if (!new) + goto out; + ret = 0; + kfree(tbl->slots); + } + spin_lock(&tbl->slot_tbl_lock); + if (new) { + tbl->slots = new; + tbl->max_slots = max_reqs; + } + for (i = 0; i < tbl->max_slots; ++i) + tbl->slots[i].seq_nr = ivalue; + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +/* + * Reset the forechannel and backchannel slot tables + */ +static int nfs4_reset_slot_tables(struct nfs4_session *session) +{ + int status; + + status = nfs4_reset_slot_table(&session->fc_slot_table, + session->fc_attrs.max_reqs, 1); + if (status) + return status; + + status = nfs4_reset_slot_table(&session->bc_slot_table, + session->bc_attrs.max_reqs, 0); + return status; +} + +/* Destroy the slot table */ +static void nfs4_destroy_slot_tables(struct nfs4_session *session) +{ + if (session->fc_slot_table.slots != NULL) { + kfree(session->fc_slot_table.slots); + session->fc_slot_table.slots = NULL; + } + if (session->bc_slot_table.slots != NULL) { + kfree(session->bc_slot_table.slots); + session->bc_slot_table.slots = NULL; + } + return; +} + +/* + * Initialize slot table + */ +static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, + int max_slots, int ivalue) +{ + struct nfs4_slot *slot; + int ret = -ENOMEM; + + BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE); + + dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); + + slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); + if (!slot) + goto out; + ret = 0; + + spin_lock(&tbl->slot_tbl_lock); + tbl->max_slots = max_slots; + tbl->slots = slot; + tbl->highest_used_slotid = -1; /* no slot is currently used */ + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +/* + * Initialize the forechannel and backchannel tables + */ +static int nfs4_init_slot_tables(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl; + int status = 0; + + tbl = &session->fc_slot_table; + if (tbl->slots == NULL) { + status = nfs4_init_slot_table(tbl, + session->fc_attrs.max_reqs, 1); + if (status) + return status; + } + + tbl = &session->bc_slot_table; + if (tbl->slots == NULL) { + status = nfs4_init_slot_table(tbl, + session->bc_attrs.max_reqs, 0); + if (status) + nfs4_destroy_slot_tables(session); + } + + return status; +} + +struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) +{ + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + + session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + + tbl = &session->fc_slot_table; + tbl->highest_used_slotid = -1; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + init_completion(&tbl->complete); + + tbl = &session->bc_slot_table; + tbl->highest_used_slotid = -1; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + init_completion(&tbl->complete); + + session->session_state = 1<<NFS4_SESSION_INITING; + + session->clp = clp; + return session; +} + +void nfs4_destroy_session(struct nfs4_session *session) +{ + nfs4_proc_destroy_session(session); + dprintk("%s Destroy backchannel for xprt %p\n", + __func__, session->clp->cl_rpcclient->cl_xprt); + 
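+ /* Note (added annotation): tear down the preallocated backchannel on the transport, then free both slot tables and the session itself. */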
xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt, + NFS41_BC_MIN_CALLBACKS); + nfs4_destroy_slot_tables(session); + kfree(session); +} + +/* + * Initialize the values to be used by the client in CREATE_SESSION + * If nfs4_init_session set the fore channel request and response sizes, + * use them. + * + * Set the back channel max_resp_sz_cached to zero to force the client to + * always set csa_cachethis to FALSE because the current implementation + * of the back channel DRC only supports caching the CB_SEQUENCE operation. + */ +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +{ + struct nfs4_session *session = args->client->cl_session; + unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz, + mxresp_sz = session->fc_attrs.max_resp_sz; + + if (mxrqst_sz == 0) + mxrqst_sz = NFS_MAX_FILE_IO_SIZE; + if (mxresp_sz == 0) + mxresp_sz = NFS_MAX_FILE_IO_SIZE; + /* Fore channel attributes */ + args->fc_attrs.max_rqst_sz = mxrqst_sz; + args->fc_attrs.max_resp_sz = mxresp_sz; + args->fc_attrs.max_ops = NFS4_MAX_OPS; + args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; + + dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_ops=%u max_reqs=%u\n", + __func__, + args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, + args->fc_attrs.max_ops, args->fc_attrs.max_reqs); + + /* Back channel attributes */ + args->bc_attrs.max_rqst_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz_cached = 0; + args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; + args->bc_attrs.max_reqs = 1; + + dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", + __func__, + args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz, + args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops, + args->bc_attrs.max_reqs); +} + +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +{ + struct nfs4_channel_attrs *sent = &args->fc_attrs; + struct nfs4_channel_attrs *rcvd = &session->fc_attrs; + + if (rcvd->max_resp_sz > sent->max_resp_sz) + return -EINVAL; + /* + * Our requested max_ops is the minimum we need; we're not + * prepared to break up compounds into smaller pieces than that. 
+ * So, no point even trying to continue if the server won't + * cooperate: + */ + if (rcvd->max_ops < sent->max_ops) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; +} + +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +{ + struct nfs4_channel_attrs *sent = &args->bc_attrs; + struct nfs4_channel_attrs *rcvd = &session->bc_attrs; + + if (rcvd->max_rqst_sz > sent->max_rqst_sz) + return -EINVAL; + if (rcvd->max_resp_sz < sent->max_resp_sz) + return -EINVAL; + if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) + return -EINVAL; + /* These would render the backchannel useless: */ + if (rcvd->max_ops == 0) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + return 0; +} + +static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, + struct nfs4_session *session) +{ + int ret; + + ret = nfs4_verify_fore_channel_attrs(args, session); + if (ret) + return ret; + return nfs4_verify_back_channel_attrs(args, session); +} + +static int _nfs4_proc_create_session(struct nfs_client *clp) +{ + struct nfs4_session *session = clp->cl_session; + struct nfs41_create_session_args args = { + .client = clp, + .cb_program = NFS4_CALLBACK, + }; + struct nfs41_create_session_res res = { + .client = clp, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + nfs4_init_channel_attrs(&args); + args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); + + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + + if (!status) + /* Verify the session's negotiated channel_attrs values */ + status = nfs4_verify_channel_attrs(&args, session); + if (!status) { + /* Increment the clientid slot sequence id */ + clp->cl_seqid++; + } + + return status; +} + +/* + * Issues a CREATE_SESSION operation to the server. + * It is the responsibility of the caller to verify the session is + * expired before calling this routine. + */ +int nfs4_proc_create_session(struct nfs_client *clp) +{ + int status; + unsigned *ptr; + struct nfs4_session *session = clp->cl_session; + + dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); + + status = _nfs4_proc_create_session(clp); + if (status) + goto out; + + /* Init and reset the fore channel */ + status = nfs4_init_slot_tables(session); + dprintk("slot table initialization returned %d\n", status); + if (status) + goto out; + status = nfs4_reset_slot_tables(session); + dprintk("slot table reset returned %d\n", status); + if (status) + goto out; + + ptr = (unsigned *)&session->sess_id.data[0]; + dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, + clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); +out: + dprintk("<-- %s\n", __func__); + return status; +} + +/* + * Issue the over-the-wire RPC DESTROY_SESSION. + * The caller must serialize access to this routine. + */ +int nfs4_proc_destroy_session(struct nfs4_session *session) +{ + int status = 0; + struct rpc_message msg; + + dprintk("--> nfs4_proc_destroy_session\n"); + + /* session is still being setup */ + if (session->clp->cl_cons_state != NFS_CS_READY) + return status; + + msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION]; + msg.rpc_argp = session; + msg.rpc_resp = NULL; + msg.rpc_cred = NULL; + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + + if (status) + printk(KERN_WARNING + "Got error %d from the server on DESTROY_SESSION. 
" + "Session has been destroyed regardless...\n", status); + + dprintk("<-- nfs4_proc_destroy_session\n"); + return status; +} + +int nfs4_init_session(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_session *session; + unsigned int rsize, wsize; + int ret; + + if (!nfs4_has_session(clp)) + return 0; + + session = clp->cl_session; + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; + wsize = server->wsize; + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + + ret = nfs4_recover_expired_lease(server); + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; +} + +int nfs4_init_ds_session(struct nfs_client *clp) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + + ret = nfs4_client_recover_expired_lease(clp); + if (!ret) + /* Test for the DS role */ + if (!is_ds_client(clp)) + ret = -ENODEV; + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; + +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + + +/* + * Renew the cl_session lease. + */ +struct nfs4_sequence_data { + struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +}; + +static void nfs41_sequence_release(void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); + kfree(calldata); +} + +static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return -EAGAIN; + default: + nfs4_schedule_lease_recovery(clp); + } + return 0; +} + +static void nfs41_sequence_call_done(struct rpc_task *task, void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + + if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) + return; + + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + + if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); +out: + dprintk("<-- %s\n", __func__); +} + +static void nfs41_sequence_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + + if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) + return; + rpc_call_start(task); +} + +static const struct rpc_call_ops nfs41_sequence_ops = { + .rpc_call_done = nfs41_sequence_call_done, + .rpc_call_prepare = nfs41_sequence_prepare, + .rpc_release = nfs41_sequence_release, +}; + +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + 
.callback_ops = &nfs41_sequence_ops, + .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, + }; + + if (!atomic_inc_not_zero(&clp->cl_count)) + return ERR_PTR(-EIO); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) { + nfs_put_client(clp); + return ERR_PTR(-ENOMEM); + } + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + calldata->clp = clp; + task_setup_data.callback_data = calldata; + + return rpc_run_task(&task_setup_data); +} + +static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret = 0; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else + rpc_put_task_async(task); + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + ret = rpc_wait_for_completion_task(task); + if (!ret) { + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + + if (task->tk_status == 0) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + ret = task->tk_status; + } + rpc_put_task(task); +out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +struct nfs4_reclaim_complete_data { + struct nfs_client *clp; + struct nfs41_reclaim_complete_args arg; + struct nfs41_reclaim_complete_res res; +}; + +static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + if (nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); +} + +static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case 0: + case -NFS4ERR_COMPLETE_ALREADY: + case -NFS4ERR_WRONG_CRED: /* What to do here? */ + break; + case -NFS4ERR_DELAY: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: + return -EAGAIN; + default: + nfs4_schedule_lease_recovery(clp); + } + return 0; +} + +static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); + if (!nfs41_sequence_done(task, res)) + return; + + if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + dprintk("<-- %s\n", __func__); +} + +static void nfs4_free_reclaim_complete_data(void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + kfree(calldata); +} + +static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = { + .rpc_call_prepare = nfs4_reclaim_complete_prepare, + .rpc_call_done = nfs4_reclaim_complete_done, + .rpc_release = nfs4_free_reclaim_complete_data, +}; + +/* + * Issue a global reclaim complete. 
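+ * RECLAIM_COMPLETE tells the server that reboot recovery is finished for the whole client (one_fs is set to zero below).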
+ */ +static int nfs41_proc_reclaim_complete(struct nfs_client *clp) +{ + struct nfs4_reclaim_complete_data *calldata; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_reclaim_complete_call_ops, + .flags = RPC_TASK_ASYNC, + }; + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) + goto out; + calldata->clp = clp; + calldata->arg.one_fs = 0; + + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out; + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; + rpc_put_task(task); + return 0; +out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +static void +nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + + dprintk("--> %s\n", __func__); + /* Note the is a race here, where a CB_LAYOUTRECALL can come in + * right now covering the LAYOUTGET we are about to send. + * However, that is not so catastrophic, and there seems + * to be no way to prevent it completely. + */ + if (nfs4_setup_sequence(server, &lgp->args.seq_args, + &lgp->res.seq_res, 0, task)) + return; + if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + NFS_I(lgp->args.inode)->layout, + lgp->args.ctx->state)) { + rpc_exit(task, NFS4_OK); + return; + } + rpc_call_start(task); +} + +static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &lgp->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: + task->tk_status = -NFS4ERR_DELAY; + /* Fall through */ + default: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutget_release(void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + put_nfs_open_context(lgp->args.ctx); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutget_call_ops = { + .rpc_call_prepare = nfs4_layoutget_prepare, + .rpc_call_done = nfs4_layoutget_done, + .rpc_release = nfs4_layoutget_release, +}; + +int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) +{ + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], + .rpc_argp = &lgp->args, + .rpc_resp = &lgp->res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutget_call_ops, + .callback_data = lgp, + .flags = RPC_TASK_ASYNC, + }; + int status = 0; + + dprintk("--> %s\n", __func__); + + lgp->res.layoutp = &lgp->args.layout; + lgp->res.seq_res.sr_slot = NULL; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = 
nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; + if (status == 0) + status = pnfs_layout_process(lgp); + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +static void +nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + + dprintk("--> %s\n", __func__); + if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, + &lrp->res.seq_res, 0, task)) + return; + rpc_call_start(task); +} + +static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + struct nfs_server *server; + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &lrp->res.seq_res)) + return; + + server = NFS_SERVER(lrp->args.inode); + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + nfs_restart_rpc(task, lrp->clp); + return; + } + spin_lock(&lo->plh_inode->i_lock); + if (task->tk_status == 0) { + if (lrp->res.lrs_present) { + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + } else + BUG_ON(!list_empty(&lo->plh_segs)); + } + lo->plh_block_lgets--; + spin_unlock(&lo->plh_inode->i_lock); + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutreturn_release(void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + + dprintk("--> %s\n", __func__); + put_layout_hdr(NFS_I(lrp->args.inode)->layout); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { + .rpc_call_prepare = nfs4_layoutreturn_prepare, + .rpc_call_done = nfs4_layoutreturn_done, + .rpc_release = nfs4_layoutreturn_release, +}; + +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], + .rpc_argp = &lrp->args, + .rpc_resp = &lrp->res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = lrp->clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutreturn_call_ops, + .callback_data = lrp, + }; + int status; + + dprintk("--> %s\n", __func__); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + dprintk("<-- %s status=%d\n", __func__, status); + rpc_put_task(task); + return status; +} + +static int +_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_getdeviceinfo_args args = { + .pdev = pdev, + }; + struct nfs4_getdeviceinfo_res res = { + .pdev = pdev, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + + return status; +} + +int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_getdeviceinfo(server, pdev), + &exception); + } while (exception.retry); + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); + +static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if 
(nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} + +static void +nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { /* Just ignore these failures */ + case NFS4ERR_DELEG_REVOKED: /* layout was recalled */ + case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */ + case NFS4ERR_BADLAYOUT: /* no layout */ + case NFS4ERR_GRACE: /* loca_recalim always false */ + task->tk_status = 0; + } + + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + nfs_restart_rpc(task, server->nfs_client); + return; + } + + if (task->tk_status == 0) + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); +} + +static void nfs4_layoutcommit_release(void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct pnfs_layout_segment *lseg, *tmp; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, + &lseg->pls_flags)) + put_lseg(lseg); + } + put_rpccred(data->cred); + kfree(data); +} + +static const struct rpc_call_ops nfs4_layoutcommit_ops = { + .rpc_call_prepare = nfs4_layoutcommit_prepare, + .rpc_call_done = nfs4_layoutcommit_done, + .rpc_release = nfs4_layoutcommit_release, +}; + +int +nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(data->args.inode), + .rpc_message = &msg, + .callback_ops = &nfs4_layoutcommit_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + int status = 0; + + dprintk("NFS: %4d initiating layoutcommit call. 
sync %d " + "lbw: %llu inode %lu\n", + data->task.tk_pid, sync, + data->args.lastbytewritten, + data->args.inode->i_ino); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (sync == false) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = task->tk_status; +out: + dprintk("%s: status %d\n", __func__, status); + rpc_put_task(task); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ + +struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, + .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, + .recover_open = nfs4_open_reclaim, + .recover_lock = nfs4_lock_reclaim, + .establish_clid = nfs4_init_clientid, + .get_clid_cred = nfs4_get_setclientid_cred, +}; + +#if defined(CONFIG_NFS_V4_1) +struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, + .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, + .recover_open = nfs4_open_reclaim, + .recover_lock = nfs4_lock_reclaim, + .establish_clid = nfs41_init_clientid, + .get_clid_cred = nfs4_get_exchange_id_cred, + .reclaim_complete = nfs41_proc_reclaim_complete, +}; +#endif /* CONFIG_NFS_V4_1 */ + +struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, + .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, + .recover_open = nfs4_open_expired, + .recover_lock = nfs4_lock_expired, + .establish_clid = nfs4_init_clientid, + .get_clid_cred = nfs4_get_setclientid_cred, +}; + +#if defined(CONFIG_NFS_V4_1) +struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, + .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, + .recover_open = nfs4_open_expired, + .recover_lock = nfs4_lock_expired, + .establish_clid = nfs41_init_clientid, + .get_clid_cred = nfs4_get_exchange_id_cred, +}; +#endif /* CONFIG_NFS_V4_1 */ + +struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = { + .sched_state_renewal = nfs4_proc_async_renew, + .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked, + .renew_lease = nfs4_proc_renew, +}; + +#if defined(CONFIG_NFS_V4_1) +struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { + .sched_state_renewal = nfs41_proc_async_sequence, + .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked, + .renew_lease = nfs4_proc_sequence, +}; +#endif + +static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { + .minor_version = 0, + .call_sync = _nfs4_call_sync, + .validate_stateid = nfs4_validate_delegation_stateid, + .reboot_recovery_ops = &nfs40_reboot_recovery_ops, + .nograce_recovery_ops = &nfs40_nograce_recovery_ops, + .state_renewal_ops = &nfs40_state_renewal_ops, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { + .minor_version = 1, + .call_sync = _nfs4_call_sync_session, + .validate_stateid = nfs41_validate_delegation_stateid, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, +}; +#endif + +const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { + [0] = &nfs_v4_0_minor_ops, +#if defined(CONFIG_NFS_V4_1) + [1] = &nfs_v4_1_minor_ops, +#endif +}; + +static const struct inode_operations nfs4_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = 
generic_listxattr, + .removexattr = generic_removexattr, +}; + +const struct nfs_rpc_ops nfs_v4_clientops = { + .version = 4, /* protocol version */ + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, + .file_ops = &nfs4_file_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, + .lookupfh = nfs4_proc_lookupfh, + .lookup = nfs4_proc_lookup, + .access = nfs4_proc_access, + .readlink = nfs4_proc_readlink, + .create = nfs4_proc_create, + .remove = nfs4_proc_remove, + .unlink_setup = nfs4_proc_unlink_setup, + .unlink_done = nfs4_proc_unlink_done, + .rename = nfs4_proc_rename, + .rename_setup = nfs4_proc_rename_setup, + .rename_done = nfs4_proc_rename_done, + .link = nfs4_proc_link, + .symlink = nfs4_proc_symlink, + .mkdir = nfs4_proc_mkdir, + .rmdir = nfs4_proc_remove, + .readdir = nfs4_proc_readdir, + .mknod = nfs4_proc_mknod, + .statfs = nfs4_proc_statfs, + .fsinfo = nfs4_proc_fsinfo, + .pathconf = nfs4_proc_pathconf, + .set_capabilities = nfs4_server_capabilities, + .decode_dirent = nfs4_decode_dirent, + .read_setup = nfs4_proc_read_setup, + .read_done = nfs4_read_done, + .write_setup = nfs4_proc_write_setup, + .write_done = nfs4_write_done, + .commit_setup = nfs4_proc_commit_setup, + .commit_done = nfs4_commit_done, + .lock = nfs4_proc_lock, + .clear_acl_cache = nfs4_zap_acl_attr, + .close_context = nfs4_close_context, + .open_context = nfs4_atomic_open, + .init_client = nfs4_init_client, + .secinfo = nfs4_proc_secinfo, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { + .prefix = XATTR_NAME_NFSV4_ACL, + .list = nfs4_xattr_list_nfs4_acl, + .get = nfs4_xattr_get_nfs4_acl, + .set = nfs4_xattr_set_nfs4_acl, +}; + +const struct xattr_handler *nfs4_xattr_handlers[] = { + &nfs4_xattr_nfs4_acl_handler, + NULL +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c new file mode 100644 index 00000000..df8e7f3c --- /dev/null +++ b/fs/nfs/nfs4renewd.c @@ -0,0 +1,130 @@ +/* + * fs/nfs/nfs4renewd.c + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Implementation of the NFSv4 "renew daemon", which wakes up periodically to + * send a RENEW, to keep state alive on the server. The daemon is implemented + * as an rpc_task, not a real kernel thread, so it always runs in rpciod's + * context. There is one renewd per nfs_server. + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include "nfs4_fs.h" +#include "delegation.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +void +nfs4_renew_state(struct work_struct *work) +{ + const struct nfs4_state_maintenance_ops *ops; + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_renewd.work); + struct rpc_cred *cred; + long lease; + unsigned long last, now; + + ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + + if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state)) + goto out; + + spin_lock(&clp->cl_lock); + lease = clp->cl_lease_time; + last = clp->cl_last_renewal; + now = jiffies; + /* Are we close to a lease timeout? */ + if (time_after(now, last + lease/3)) { + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) { + if (!nfs_delegations_present(clp)) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + goto out; + } + nfs_expire_all_delegations(clp); + } else { + /* Queue an asynchronous RENEW. */ + ops->sched_state_renewal(clp, cred); + put_rpccred(cred); + goto out_exp; + } + } else { + dprintk("%s: failed to call renewd. Reason: lease not expired \n", + __func__); + spin_unlock(&clp->cl_lock); + } + nfs4_schedule_state_renewal(clp); +out_exp: + nfs_expire_unreferenced_delegations(clp); +out: + dprintk("%s: done\n", __func__); +} + +void +nfs4_schedule_state_renewal(struct nfs_client *clp) +{ + long timeout; + + spin_lock(&clp->cl_lock); + timeout = (2 * clp->cl_lease_time) / 3 + (long)clp->cl_last_renewal + - (long)jiffies; + if (timeout < 5 * HZ) + timeout = 5 * HZ; + dprintk("%s: requeueing work. Lease period = %ld\n", + __func__, (timeout + HZ - 1) / HZ); + cancel_delayed_work(&clp->cl_renewd); + schedule_delayed_work(&clp->cl_renewd, timeout); + set_bit(NFS_CS_RENEWD, &clp->cl_res_state); + spin_unlock(&clp->cl_lock); +} + +void +nfs4_kill_renewd(struct nfs_client *clp) +{ + cancel_delayed_work_sync(&clp->cl_renewd); +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c new file mode 100644 index 00000000..c6e2769f --- /dev/null +++ b/fs/nfs/nfs4state.c @@ -0,0 +1,1766 @@ +/* + * fs/nfs/nfs4state.c + * + * Client-side XDR for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
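[Editor's note] nfs4_renew_state() above only issues a renewal once more than a third of the lease has elapsed, and nfs4_schedule_state_renewal() re-arms the delayed work so the next wakeup lands two thirds of the way through the lease, never sooner than a 5 second floor. A standalone sketch of that arithmetic in plain seconds rather than jiffies (helper names are made up; the kernel additionally uses time_after() to cope with jiffies wraparound):

#include <stdio.h>

/* Mirror of the renewd re-arm rule, in seconds rather than jiffies. */
static long next_renew_delay(long lease, long last_renewal, long now)
{
	long timeout = (2 * lease) / 3 + last_renewal - now;

	if (timeout < 5)	/* 5 second floor, as in the kernel code */
		timeout = 5;
	return timeout;
}

/* Mirror of the "are we close to a lease timeout?" test in nfs4_renew_state(). */
static int renew_due(long lease, long last_renewal, long now)
{
	return now > last_renewal + lease / 3;
}

int main(void)
{
	long lease = 90, last = 1000;

	printf("t=1020: due=%d, next wakeup in %lds\n",
	       renew_due(lease, last, 1020), next_renew_delay(lease, last, 1020));
	printf("t=1040: due=%d, next wakeup in %lds\n",
	       renew_due(lease, last, 1040), next_renew_delay(lease, last, 1040));
	return 0;
}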
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Implementation of the NFSv4 state model. For the time being, + * this is minimal, but will be made much more complex in a + * subsequent patch. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "internal.h" +#include "pnfs.h" + +#define OPENOWNER_POOL_SIZE 8 + +const nfs4_stateid zero_stateid; + +static LIST_HEAD(nfs4_clientid_list); + +int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct nfs4_setclientid_res clid = { + .clientid = clp->cl_clientid, + .confirm = clp->cl_confirm, + }; + unsigned short port; + int status; + + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; + port = nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); + if (status != 0) + goto out; + clp->cl_clientid = clid.clientid; + clp->cl_confirm = clid.confirm; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: + status = nfs4_proc_setclientid_confirm(clp, &clid, cred); + if (status != 0) + goto out; + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_schedule_state_renewal(clp); +out: + return status; +} + +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + + if (clp->cl_machine_cred != NULL) + cred = get_rpccred(clp->cl_machine_cred); + return cred; +} + +static void nfs4_clear_machine_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = clp->cl_machine_cred; + clp->cl_machine_cred = NULL; + spin_unlock(&clp->cl_lock); + if (cred != NULL) + put_rpccred(cred); +} + +static struct rpc_cred * +nfs4_get_renew_cred_server_locked(struct nfs_server *server) +{ + struct rpc_cred *cred = NULL; + struct nfs4_state_owner *sp; + struct rb_node *pos; + + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + if (list_empty(&sp->so_states)) + continue; + cred = get_rpccred(sp->so_cred); + break; + } + return cred; +} + +/** + * nfs4_get_renew_cred_locked - Acquire credential for a renew operation + * @clp: client state handle + * + * Returns an rpc_cred with 
reference count bumped, or NULL. + * Caller must hold clp->cl_lock. + */ +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_renew_cred_server_locked(server); + if (cred != NULL) + break; + } + rcu_read_unlock(); + return cred; +} + +#if defined(CONFIG_NFS_V4_1) + +static int nfs41_setup_state_renewal(struct nfs_client *clp) +{ + int status; + struct nfs_fsinfo fsinfo; + + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = jiffies; + spin_unlock(&clp->cl_lock); + + nfs4_schedule_state_renewal(clp); + } + + return status; +} + +/* + * Back channel returns NFS4ERR_DELAY for new requests when + * NFS4_SESSION_DRAINING is set so there is no work to be done when draining + * is ended. + */ +static void nfs4_end_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int max_slots; + + if (ses == NULL) + return; + if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { + struct rpc_task *task; + + task = rpc_wake_up_next(&ses->fc_slot_table. + slot_tbl_waitq); + if (!task) + break; + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + } + spin_unlock(&ses->fc_slot_table.slot_tbl_lock); + } +} + +static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) +{ + spin_lock(&tbl->slot_tbl_lock); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(tbl->complete); + spin_unlock(&tbl->slot_tbl_lock); + return wait_for_completion_interruptible(&tbl->complete); + } + spin_unlock(&tbl->slot_tbl_lock); + return 0; +} + +static int nfs4_begin_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int ret = 0; + + set_bit(NFS4_SESSION_DRAINING, &ses->session_state); + /* back channel */ + ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); + if (ret) + return ret; + /* fore channel */ + return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); +} + +int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; + nfs4_begin_drain_session(clp); + status = nfs4_proc_exchange_id(clp, cred); + if (status != 0) + goto out; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: + status = nfs4_proc_create_session(clp); + if (status != 0) + goto out; + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs41_setup_state_renewal(clp); + nfs_mark_client_ready(clp, NFS_CS_READY); +out: + return status; +} + +struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = nfs4_get_machine_cred_locked(clp); + spin_unlock(&clp->cl_lock); + return cred; +} + +#endif /* CONFIG_NFS_V4_1 */ + +static struct rpc_cred * +nfs4_get_setclientid_cred_server(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct rpc_cred *cred = NULL; + struct nfs4_state_owner *sp; + struct rb_node *pos; + + spin_lock(&clp->cl_lock); + pos = rb_first(&server->state_owners); + 
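[Editor's note] Both nfs4_init_clientid() and nfs41_init_clientid() above share the same two-step shape: one call that proposes a client ID (SETCLIENTID or EXCHANGE_ID) and one that confirms it (SETCLIENTID_CONFIRM or CREATE_SESSION), with the NFS4CLNT_LEASE_CONFIRM bit letting a restarted recovery skip straight to the confirmation step. A simplified sketch of that control flow (the RPC stubs below are placeholders, not real kernel calls):

#include <stdio.h>

#define LEASE_CONFIRM_BIT 0x1

/* Stand-ins for the two RPCs; a real client talks to the server here. */
static int propose_clientid(void) { puts("SETCLIENTID / EXCHANGE_ID"); return 0; }
static int confirm_clientid(void) { puts("SETCLIENTID_CONFIRM / CREATE_SESSION"); return 0; }

static int establish_lease(unsigned long *state)
{
	int status;

	if (*state & LEASE_CONFIRM_BIT)
		goto do_confirm;	/* proposal already made, only confirm */

	status = propose_clientid();
	if (status != 0)
		return status;
	*state |= LEASE_CONFIRM_BIT;
do_confirm:
	status = confirm_clientid();
	if (status != 0)
		return status;	/* bit stays set: the next attempt resumes here */
	*state &= ~LEASE_CONFIRM_BIT;
	return 0;
}

int main(void)
{
	unsigned long state = 0;
	return establish_lease(&state);
}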
if (pos != NULL) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + cred = get_rpccred(sp->so_cred); + } + spin_unlock(&clp->cl_lock); + return cred; +} + +/** + * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + */ +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +{ + struct nfs_server *server; + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = nfs4_get_machine_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred != NULL) + goto out; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_setclientid_cred_server(server); + if (cred != NULL) + break; + } + rcu_read_unlock(); + +out: + return cred; +} + +static void nfs_alloc_unique_id_locked(struct rb_root *root, + struct nfs_unique_id *new, + __u64 minval, int maxbits) +{ + struct rb_node **p, *parent; + struct nfs_unique_id *pos; + __u64 mask = ~0ULL; + + if (maxbits < 64) + mask = (1ULL << maxbits) - 1ULL; + + /* Ensure distribution is more or less flat */ + get_random_bytes(&new->id, sizeof(new->id)); + new->id &= mask; + if (new->id < minval) + new->id += minval; +retry: + p = &root->rb_node; + parent = NULL; + + while (*p != NULL) { + parent = *p; + pos = rb_entry(parent, struct nfs_unique_id, rb_node); + + if (new->id < pos->id) + p = &(*p)->rb_left; + else if (new->id > pos->id) + p = &(*p)->rb_right; + else + goto id_exists; + } + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, root); + return; +id_exists: + for (;;) { + new->id++; + if (new->id < minval || (new->id & mask) != new->id) { + new->id = minval; + break; + } + parent = rb_next(parent); + if (parent == NULL) + break; + pos = rb_entry(parent, struct nfs_unique_id, rb_node); + if (new->id < pos->id) + break; + } + goto retry; +} + +static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) +{ + rb_erase(&id->rb_node, root); +} + +static struct nfs4_state_owner * +nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) +{ + struct rb_node **p = &server->state_owners.rb_node, + *parent = NULL; + struct nfs4_state_owner *sp, *res = NULL; + + while (*p != NULL) { + parent = *p; + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); + + if (server < sp->so_server) { + p = &parent->rb_left; + continue; + } + if (server > sp->so_server) { + p = &parent->rb_right; + continue; + } + if (cred < sp->so_cred) + p = &parent->rb_left; + else if (cred > sp->so_cred) + p = &parent->rb_right; + else { + atomic_inc(&sp->so_count); + res = sp; + break; + } + } + return res; +} + +static struct nfs4_state_owner * +nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) +{ + struct nfs_server *server = new->so_server; + struct rb_node **p = &server->state_owners.rb_node, + *parent = NULL; + struct nfs4_state_owner *sp; + + while (*p != NULL) { + parent = *p; + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); + + if (new->so_cred < sp->so_cred) + p = &parent->rb_left; + else if (new->so_cred > sp->so_cred) + p = &parent->rb_right; + else { + atomic_inc(&sp->so_count); + return sp; + } + } + nfs_alloc_unique_id_locked(&server->openowner_id, + &new->so_owner_id, 1, 64); + rb_link_node(&new->so_server_node, parent, p); + rb_insert_color(&new->so_server_node, &server->state_owners); + return new; +} + +static void +nfs4_remove_state_owner_locked(struct nfs4_state_owner 
*sp) +{ + struct nfs_server *server = sp->so_server; + + if (!RB_EMPTY_NODE(&sp->so_server_node)) + rb_erase(&sp->so_server_node, &server->state_owners); + nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id); +} + +/* + * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to + * create a new state_owner. + * + */ +static struct nfs4_state_owner * +nfs4_alloc_state_owner(void) +{ + struct nfs4_state_owner *sp; + + sp = kzalloc(sizeof(*sp),GFP_NOFS); + if (!sp) + return NULL; + spin_lock_init(&sp->so_lock); + INIT_LIST_HEAD(&sp->so_states); + rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); + sp->so_seqid.sequence = &sp->so_sequence; + spin_lock_init(&sp->so_sequence.lock); + INIT_LIST_HEAD(&sp->so_sequence.list); + atomic_set(&sp->so_count, 1); + return sp; +} + +static void +nfs4_drop_state_owner(struct nfs4_state_owner *sp) +{ + if (!RB_EMPTY_NODE(&sp->so_server_node)) { + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + rb_erase(&sp->so_server_node, &server->state_owners); + RB_CLEAR_NODE(&sp->so_server_node); + spin_unlock(&clp->cl_lock); + } +} + +/** + * nfs4_get_state_owner - Look up a state owner given a credential + * @server: nfs_server to search + * @cred: RPC credential to match + * + * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. + */ +struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, + struct rpc_cred *cred) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *new; + + spin_lock(&clp->cl_lock); + sp = nfs4_find_state_owner_locked(server, cred); + spin_unlock(&clp->cl_lock); + if (sp != NULL) + return sp; + new = nfs4_alloc_state_owner(); + if (new == NULL) + return NULL; + new->so_server = server; + new->so_cred = cred; + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner_locked(new); + spin_unlock(&clp->cl_lock); + if (sp == new) + get_rpccred(cred); + else { + rpc_destroy_wait_queue(&new->so_sequence.wait); + kfree(new); + } + return sp; +} + +/** + * nfs4_put_state_owner - Release a nfs4_state_owner + * @sp: state owner data to release + * + */ +void nfs4_put_state_owner(struct nfs4_state_owner *sp) +{ + struct nfs_client *clp = sp->so_server->nfs_client; + struct rpc_cred *cred = sp->so_cred; + + if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) + return; + nfs4_remove_state_owner_locked(sp); + spin_unlock(&clp->cl_lock); + rpc_destroy_wait_queue(&sp->so_sequence.wait); + put_rpccred(cred); + kfree(sp); +} + +static struct nfs4_state * +nfs4_alloc_open_state(void) +{ + struct nfs4_state *state; + + state = kzalloc(sizeof(*state), GFP_NOFS); + if (!state) + return NULL; + atomic_set(&state->count, 1); + INIT_LIST_HEAD(&state->lock_states); + spin_lock_init(&state->state_lock); + seqlock_init(&state->seqlock); + return state; +} + +void +nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode) +{ + if (state->state == fmode) + return; + /* NB! List reordering - see the reclaim code for why. 
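[Editor's note] nfs_alloc_unique_id_locked() above seeds each open/lock owner id with random bytes and then, on a collision, probes forward until it finds a free value, wrapping back to the minimum when it runs past the mask. A compact userspace sketch of the same probing idea, using a flat array instead of an rbtree (helper names are invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t used[] = { 3, 4, 5, 9 };	/* ids already handed out */
static int nused = 4;

static int id_in_use(uint64_t id)
{
	for (int i = 0; i < nused; i++)
		if (used[i] == id)
			return 1;
	return 0;
}

/* Pick a random id >= minval within maskbits bits, probing forward on collisions. */
static uint64_t alloc_unique_id(uint64_t minval, int maskbits)
{
	uint64_t mask = maskbits < 64 ? (1ULL << maskbits) - 1 : ~0ULL;
	uint64_t id = (uint64_t)rand() & mask;

	if (id < minval)
		id += minval;
	while (id_in_use(id)) {
		id++;
		if (id < minval || (id & mask) != id)
			id = minval;	/* wrap around, like the rbtree version */
	}
	return id;
}

int main(void)
{
	srand(42);
	printf("allocated id %llu\n", (unsigned long long)alloc_unique_id(1, 4));
	return 0;
}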
*/ + if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) { + if (fmode & FMODE_WRITE) + list_move(&state->open_states, &state->owner->so_states); + else + list_move_tail(&state->open_states, &state->owner->so_states); + } + state->state = fmode; +} + +static struct nfs4_state * +__nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs4_state *state; + + list_for_each_entry(state, &nfsi->open_states, inode_states) { + if (state->owner != owner) + continue; + if (atomic_inc_not_zero(&state->count)) + return state; + } + return NULL; +} + +static void +nfs4_free_open_state(struct nfs4_state *state) +{ + kfree(state); +} + +struct nfs4_state * +nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) +{ + struct nfs4_state *state, *new; + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + state = __nfs4_find_state_byowner(inode, owner); + spin_unlock(&inode->i_lock); + if (state) + goto out; + new = nfs4_alloc_open_state(); + spin_lock(&owner->so_lock); + spin_lock(&inode->i_lock); + state = __nfs4_find_state_byowner(inode, owner); + if (state == NULL && new != NULL) { + state = new; + state->owner = owner; + atomic_inc(&owner->so_count); + list_add(&state->inode_states, &nfsi->open_states); + ihold(inode); + state->inode = inode; + spin_unlock(&inode->i_lock); + /* Note: The reclaim code dictates that we add stateless + * and read-only stateids to the end of the list */ + list_add_tail(&state->open_states, &owner->so_states); + spin_unlock(&owner->so_lock); + } else { + spin_unlock(&inode->i_lock); + spin_unlock(&owner->so_lock); + if (new) + nfs4_free_open_state(new); + } +out: + return state; +} + +void nfs4_put_open_state(struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct nfs4_state_owner *owner = state->owner; + + if (!atomic_dec_and_lock(&state->count, &owner->so_lock)) + return; + spin_lock(&inode->i_lock); + list_del(&state->inode_states); + list_del(&state->open_states); + spin_unlock(&inode->i_lock); + spin_unlock(&owner->so_lock); + iput(inode); + nfs4_free_open_state(state); + nfs4_put_state_owner(owner); +} + +/* + * Close the current file. 
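[Editor's note] nfs4_get_open_state() above uses the usual optimistic get-or-create dance: look up under the lock, allocate outside it (allocation may sleep), then re-check under the lock before either installing the new object or discarding it because another opener won the race. The skeleton of that pattern as a standalone sketch (the list and locking are simplified stand-ins, not the kernel data structures):

#include <pthread.h>
#include <stdlib.h>

struct obj { int key; int refcount; struct obj *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table;	/* toy list standing in for nfsi->open_states */

static struct obj *find_locked(int key)
{
	for (struct obj *o = table; o; o = o->next)
		if (o->key == key) {
			o->refcount++;
			return o;
		}
	return NULL;
}

static struct obj *get_or_create(int key)
{
	struct obj *o, *new;

	pthread_mutex_lock(&lock);
	o = find_locked(key);
	pthread_mutex_unlock(&lock);
	if (o)
		return o;

	new = calloc(1, sizeof(*new));	/* may sleep; done outside the lock */
	if (!new)
		return NULL;
	new->key = key;
	new->refcount = 1;

	pthread_mutex_lock(&lock);
	o = find_locked(key);		/* re-check: did we lose the race? */
	if (!o) {
		new->next = table;
		table = new;
		o = new;
		new = NULL;
	}
	pthread_mutex_unlock(&lock);
	free(new);			/* free(NULL) is a no-op */
	return o;
}

int main(void) { return get_or_create(7) ? 0 : 1; }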
+ */ +static void __nfs4_close(struct path *path, struct nfs4_state *state, + fmode_t fmode, gfp_t gfp_mask, int wait) +{ + struct nfs4_state_owner *owner = state->owner; + int call_close = 0; + fmode_t newstate; + + atomic_inc(&owner->so_count); + /* Protect against nfs4_find_state() */ + spin_lock(&owner->so_lock); + switch (fmode & (FMODE_READ | FMODE_WRITE)) { + case FMODE_READ: + state->n_rdonly--; + break; + case FMODE_WRITE: + state->n_wronly--; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr--; + } + newstate = FMODE_READ|FMODE_WRITE; + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) { + newstate &= ~FMODE_READ; + call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (state->n_wronly == 0) { + newstate &= ~FMODE_WRITE; + call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + } + if (newstate == 0) + clear_bit(NFS_DELEGATED_STATE, &state->flags); + } + nfs4_state_set_mode_locked(state, newstate); + spin_unlock(&owner->so_lock); + + if (!call_close) { + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); + } else { + bool roc = pnfs_roc(state->inode); + + nfs4_do_close(path, state, gfp_mask, wait, roc); + } +} + +void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +{ + __nfs4_close(path, state, fmode, GFP_NOFS, 0); +} + +void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) +{ + __nfs4_close(path, state, fmode, GFP_KERNEL, 1); +} + +/* + * Search the state->lock_states for an existing lock_owner + * that is compatible with current->files + */ +static struct nfs4_lock_state * +__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) +{ + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { + if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) + continue; + switch (pos->ls_owner.lo_type) { + case NFS4_POSIX_LOCK_TYPE: + if (pos->ls_owner.lo_u.posix_owner != fl_owner) + continue; + break; + case NFS4_FLOCK_LOCK_TYPE: + if (pos->ls_owner.lo_u.flock_owner != fl_pid) + continue; + } + atomic_inc(&pos->ls_count); + return pos; + } + return NULL; +} + +/* + * Return a compatible lock_state. If no initialized lock_state structure + * exists, return an uninitialized one. 
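[Editor's note] __nfs4_close() above derives both the remaining open mode and whether a CLOSE or OPEN_DOWNGRADE must go on the wire purely from the n_rdonly/n_wronly/n_rdwr counters and the state flags. The mode calculation, pulled out into a small standalone sketch (field and flag names are simplified):

#include <stdio.h>

#define F_READ  0x1
#define F_WRITE 0x2

struct open_counts { int n_rdonly, n_wronly, n_rdwr; };

/* Return the mode still held after dropping one opener of mode 'fmode'. */
static int remaining_mode(struct open_counts *c, int fmode)
{
	int newstate = F_READ | F_WRITE;

	switch (fmode & (F_READ | F_WRITE)) {
	case F_READ:		c->n_rdonly--; break;
	case F_WRITE:		c->n_wronly--; break;
	case F_READ | F_WRITE:	c->n_rdwr--;   break;
	}
	/* Only once no read/write openers remain can either bit be dropped. */
	if (c->n_rdwr == 0) {
		if (c->n_rdonly == 0)
			newstate &= ~F_READ;
		if (c->n_wronly == 0)
			newstate &= ~F_WRITE;
	}
	return newstate;
}

int main(void)
{
	/* one read/write opener and one read-only opener; close the r/w one */
	struct open_counts c = { .n_rdonly = 1, .n_wronly = 0, .n_rdwr = 1 };
	int mode = remaining_mode(&c, F_READ | F_WRITE);

	printf("still open for: %s%s\n",
	       mode & F_READ ? "read " : "", mode & F_WRITE ? "write" : "");
	return 0;
}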
+ * + */ +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) +{ + struct nfs4_lock_state *lsp; + struct nfs_server *server = state->owner->so_server; + struct nfs_client *clp = server->nfs_client; + + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + if (lsp == NULL) + return NULL; + rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); + spin_lock_init(&lsp->ls_sequence.lock); + INIT_LIST_HEAD(&lsp->ls_sequence.list); + lsp->ls_seqid.sequence = &lsp->ls_sequence; + atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; + lsp->ls_owner.lo_type = type; + switch (lsp->ls_owner.lo_type) { + case NFS4_FLOCK_LOCK_TYPE: + lsp->ls_owner.lo_u.flock_owner = fl_pid; + break; + case NFS4_POSIX_LOCK_TYPE: + lsp->ls_owner.lo_u.posix_owner = fl_owner; + break; + default: + kfree(lsp); + return NULL; + } + spin_lock(&clp->cl_lock); + nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64); + spin_unlock(&clp->cl_lock); + INIT_LIST_HEAD(&lsp->ls_locks); + return lsp; +} + +static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) +{ + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id); + spin_unlock(&clp->cl_lock); + rpc_destroy_wait_queue(&lsp->ls_sequence.wait); + kfree(lsp); +} + +/* + * Return a compatible lock_state. If no initialized lock_state structure + * exists, return an uninitialized one. + * + */ +static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) +{ + struct nfs4_lock_state *lsp, *new = NULL; + + for(;;) { + spin_lock(&state->state_lock); + lsp = __nfs4_find_lock_state(state, owner, pid, type); + if (lsp != NULL) + break; + if (new != NULL) { + list_add(&new->ls_locks, &state->lock_states); + set_bit(LK_STATE_IN_USE, &state->flags); + lsp = new; + new = NULL; + break; + } + spin_unlock(&state->state_lock); + new = nfs4_alloc_lock_state(state, owner, pid, type); + if (new == NULL) + return NULL; + } + spin_unlock(&state->state_lock); + if (new != NULL) + nfs4_free_lock_state(new); + return lsp; +} + +/* + * Release reference to lock_state, and free it if we see that + * it is no longer in use + */ +void nfs4_put_lock_state(struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state; + + if (lsp == NULL) + return; + state = lsp->ls_state; + if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock)) + return; + list_del(&lsp->ls_locks); + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); + if (lsp->ls_flags & NFS_LOCK_INITIALIZED) + nfs4_release_lockowner(lsp); + nfs4_free_lock_state(lsp); +} + +static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) +{ + struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner; + + dst->fl_u.nfs4_fl.owner = lsp; + atomic_inc(&lsp->ls_count); +} + +static void nfs4_fl_release_lock(struct file_lock *fl) +{ + nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner); +} + +static const struct file_lock_operations nfs4_fl_lock_ops = { + .fl_copy_lock = nfs4_fl_copy_lock, + .fl_release_private = nfs4_fl_release_lock, +}; + +int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) +{ + struct nfs4_lock_state *lsp; + + if (fl->fl_ops != NULL) + return 0; + if (fl->fl_flags & FL_POSIX) + lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); + 
else if (fl->fl_flags & FL_FLOCK) + lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); + else + return -EINVAL; + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; + fl->fl_ops = &nfs4_fl_lock_ops; + return 0; +} + +/* + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) +{ + struct nfs4_lock_state *lsp; + int seq; + + do { + seq = read_seqbegin(&state->seqlock); + memcpy(dst, &state->stateid, sizeof(*dst)); + } while (read_seqretry(&state->seqlock, seq)); + if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) + return; + + spin_lock(&state->state_lock); + lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + spin_unlock(&state->state_lock); + nfs4_put_lock_state(lsp); +} + +struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) +{ + struct nfs_seqid *new; + + new = kmalloc(sizeof(*new), gfp_mask); + if (new != NULL) { + new->sequence = counter; + INIT_LIST_HEAD(&new->list); + } + return new; +} + +void nfs_release_seqid(struct nfs_seqid *seqid) +{ + if (!list_empty(&seqid->list)) { + struct rpc_sequence *sequence = seqid->sequence->sequence; + + spin_lock(&sequence->lock); + list_del_init(&seqid->list); + spin_unlock(&sequence->lock); + rpc_wake_up(&sequence->wait); + } +} + +void nfs_free_seqid(struct nfs_seqid *seqid) +{ + nfs_release_seqid(seqid); + kfree(seqid); +} + +/* + * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or + * failed with a seqid incrementing error - + * see comments nfs_fs.h:seqid_mutating_error() + */ +static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) +{ + BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid); + switch (status) { + case 0: + break; + case -NFS4ERR_BAD_SEQID: + if (seqid->sequence->flags & NFS_SEQID_CONFIRMED) + return; + printk(KERN_WARNING "NFS: v4 server returned a bad" + " sequence-id error on an" + " unconfirmed sequence %p!\n", + seqid->sequence); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_BADXDR: + case -NFS4ERR_RESOURCE: + case -NFS4ERR_NOFILEHANDLE: + /* Non-seqid mutating errors */ + return; + }; + /* + * Note: no locking needed as we are guaranteed to be first + * on the sequence list + */ + seqid->sequence->counter++; +} + +void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) +{ + struct nfs4_state_owner *sp = container_of(seqid->sequence, + struct nfs4_state_owner, so_seqid); + struct nfs_server *server = sp->so_server; + + if (status == -NFS4ERR_BAD_SEQID) + nfs4_drop_state_owner(sp); + if (!nfs4_has_session(server->nfs_client)) + nfs_increment_seqid(status, seqid); +} + +/* + * Increment the seqid if the LOCK/LOCKU succeeded, or + * failed with a seqid incrementing error - + * see comments nfs_fs.h:seqid_mutating_error() + */ +void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) +{ + nfs_increment_seqid(status, seqid); +} + +int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) +{ + struct rpc_sequence *sequence = seqid->sequence->sequence; + int status = 0; + + spin_lock(&sequence->lock); + if (list_empty(&seqid->list)) + list_add_tail(&seqid->list, &sequence->list); + if (list_first_entry(&sequence->list, struct 
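[Editor's note] nfs_increment_seqid() above only bumps the owner's sequence id when the OPEN/CLOSE-class operation actually reached the server and mutated state there; errors such as NFS4ERR_STALE_CLIENTID or NFS4ERR_BADXDR mean the request was never executed, so the seqid must not advance. A tiny classifier sketch of that rule (the numeric values are the standard NFSv4 error codes; this is an illustration, not kernel code):

#include <stdio.h>

#define NFS4ERR_RESOURCE	10018
#define NFS4ERR_NOFILEHANDLE	10020
#define NFS4ERR_STALE_CLIENTID	10022
#define NFS4ERR_STALE_STATEID	10023
#define NFS4ERR_BAD_STATEID	10025
#define NFS4ERR_BADXDR		10036

/* Does this status leave the server-side seqid untouched? */
static int seqid_non_mutating(int status)
{
	switch (status) {
	case -NFS4ERR_STALE_CLIENTID:
	case -NFS4ERR_STALE_STATEID:
	case -NFS4ERR_BAD_STATEID:
	case -NFS4ERR_BADXDR:
	case -NFS4ERR_RESOURCE:
	case -NFS4ERR_NOFILEHANDLE:
		return 1;	/* request never executed: do not bump */
	default:
		return 0;	/* success and most other failures advance it */
	}
}

int main(void)
{
	printf("STALE_CLIENTID mutates seqid: %s\n",
	       seqid_non_mutating(-NFS4ERR_STALE_CLIENTID) ? "no" : "yes");
	printf("0 (success) mutates seqid: %s\n",
	       seqid_non_mutating(0) ? "no" : "yes");
	return 0;
}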
nfs_seqid, list) == seqid) + goto unlock; + rpc_sleep_on(&sequence->wait, task, NULL); + status = -EAGAIN; +unlock: + spin_unlock(&sequence->lock); + return status; +} + +static int nfs4_run_state_manager(void *); + +static void nfs4_clear_state_manager_bit(struct nfs_client *clp) +{ + smp_mb__before_clear_bit(); + clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + smp_mb__after_clear_bit(); + wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); + rpc_wake_up(&clp->cl_rpcwaitq); +} + +/* + * Schedule the nfs_client asynchronous state management routine + */ +void nfs4_schedule_state_manager(struct nfs_client *clp) +{ + struct task_struct *task; + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); + atomic_inc(&clp->cl_count); + task = kthread_run(nfs4_run_state_manager, clp, "%s-manager", + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)); + if (!IS_ERR(task)) + return; + nfs4_clear_state_manager_bit(clp); + nfs_put_client(clp); + module_put(THIS_MODULE); +} + +/* + * Schedule a lease recovery attempt + */ +void nfs4_schedule_lease_recovery(struct nfs_client *clp) +{ + if (!clp) + return; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} + +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +{ + + set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + /* Don't recover state that expired before the reboot */ + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) { + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + return 0; + } + set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + return 1; +} + +static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +{ + set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); + return 1; +} + +void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + nfs4_state_mark_reclaim_nograce(clp, state); + nfs4_schedule_state_manager(clp); +} + +void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + bool found = false; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + found = true; + } + spin_unlock(&inode->i_lock); + if (found) + nfs4_schedule_state_manager(clp); +} + + +static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) +{ + struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); + struct file_lock *fl; + int status = 0; + + if (inode->i_flock == NULL) + return 0; + + /* Guard against delegation returns and new lock/unlock calls */ + down_write(&nfsi->rwsem); + /* Protect inode->i_flock using the BKL */ + lock_flocks(); + for (fl = inode->i_flock; 
fl != NULL; fl = fl->fl_next) { + if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) + continue; + if (nfs_file_open_context(fl->fl_file)->state != state) + continue; + unlock_flocks(); + status = ops->recover_lock(state, fl); + switch (status) { + case 0: + break; + case -ESTALE: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out; + default: + printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", + __func__, status); + case -ENOMEM: + case -NFS4ERR_DENIED: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + status = 0; + } + lock_flocks(); + } + unlock_flocks(); +out: + up_write(&nfsi->rwsem); + return status; +} + +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_state *state; + struct nfs4_lock_state *lock; + int status = 0; + + /* Note: we rely on the sp->so_states list being ordered + * so that we always reclaim open(O_RDWR) and/or open(O_WRITE) + * states first. + * This is needed to ensure that the server won't give us any + * read delegations that we have to return if, say, we are + * recovering after a network partition or a reboot from a + * server that doesn't support a grace period. + */ +restart: + spin_lock(&sp->so_lock); + list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) + continue; + if (state->state == 0) + continue; + atomic_inc(&state->count); + spin_unlock(&sp->so_lock); + status = ops->recover_open(sp, state); + if (status >= 0) { + status = nfs4_reclaim_locks(state, ops); + if (status >= 0) { + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) + printk("%s: Lock reclaim failed!\n", + __func__); + } + nfs4_put_open_state(state); + goto restart; + } + } + switch (status) { + default: + printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", + __func__, status); + case -ENOENT: + case -ENOMEM: + case -ESTALE: + /* + * Open state on this file cannot be recovered + * All we can do is revert to using the zero stateid. + */ + memset(state->stateid.data, 0, + sizeof(state->stateid.data)); + /* Mark the file as being 'closed' */ + state->state = 0; + break; + case -EKEYEXPIRED: + /* + * User RPCSEC_GSS context has expired. + * We cannot recover this stateid now, so + * skip it and allow recovery thread to + * proceed. 
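[Editor's note] nfs4_reclaim_open_state() above cannot hold sp->so_lock across the recovery RPCs, so it pins each state with a reference, drops the lock, does the blocking work, and then restarts the list walk from scratch because the list may have changed in the meantime. The shape of that pattern in isolation (a sketch only; the pin counter stands in for the kernel refcount helpers):

#include <pthread.h>
#include <stdio.h>

struct item { int needs_work; int pinned; struct item *next; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void blocking_work(struct item *it) { it->needs_work = 0; /* e.g. an RPC */ }

static void reclaim_all(struct item *head)
{
restart:
	pthread_mutex_lock(&list_lock);
	for (struct item *it = head; it; it = it->next) {
		if (!it->needs_work)
			continue;
		it->pinned++;			/* pin so it cannot vanish */
		pthread_mutex_unlock(&list_lock);

		blocking_work(it);		/* may sleep: lock is dropped */

		pthread_mutex_lock(&list_lock);
		it->pinned--;
		pthread_mutex_unlock(&list_lock);
		goto restart;			/* list may have changed */
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct item b = { 1, 0, NULL }, a = { 1, 0, &b };

	reclaim_all(&a);
	printf("remaining work: %d %d\n", a.needs_work, b.needs_work);
	return 0;
}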
+ */ + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out_err; + } + nfs4_put_open_state(state); + goto restart; + } + spin_unlock(&sp->so_lock); + return 0; +out_err: + nfs4_put_open_state(state); + return status; +} + +static void nfs4_clear_open_state(struct nfs4_state *state) +{ + struct nfs4_lock_state *lock; + + clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + lock->ls_seqid.flags = 0; + lock->ls_flags &= ~NFS_LOCK_INITIALIZED; + } +} + +static void nfs4_reset_seqids(struct nfs_server *server, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp; + struct rb_node *pos; + struct nfs4_state *state; + + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + sp->so_seqid.flags = 0; + spin_lock(&sp->so_lock); + list_for_each_entry(state, &sp->so_states, open_states) { + if (mark_reclaim(clp, state)) + nfs4_clear_open_state(state); + } + spin_unlock(&sp->so_lock); + } + spin_unlock(&clp->cl_lock); +} + +static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_reset_seqids(server, mark_reclaim); + rcu_read_unlock(); +} + +static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) +{ + /* Mark all delegations for reclaim */ + nfs_delegation_mark_reclaim(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); +} + +static void nfs4_reclaim_complete(struct nfs_client *clp, + const struct nfs4_state_recovery_ops *ops) +{ + /* Notify the server we're done reclaiming our state */ + if (ops->reclaim_complete) + (void)ops->reclaim_complete(clp); +} + +static void nfs4_clear_reclaim_server(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp; + struct rb_node *pos; + struct nfs4_state *state; + + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + spin_lock(&sp->so_lock); + list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, + &state->flags)) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + } + spin_unlock(&sp->so_lock); + } + spin_unlock(&clp->cl_lock); +} + +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +{ + struct nfs_server *server; + + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + 
nfs4_clear_reclaim_server(server); + rcu_read_unlock(); + + nfs_delegation_reap_unclaimed(clp); + return 1; +} + +static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +{ + if (!nfs4_state_clear_reclaim_reboot(clp)) + return; + nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); +} + +static void nfs_delegation_clear_all(struct nfs_client *clp) +{ + nfs_delegation_mark_reclaim(clp); + nfs_delegation_reap_unclaimed(clp); +} + +static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) +{ + nfs_delegation_clear_all(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); +} + +static void nfs4_warn_keyexpired(const char *s) +{ + printk_ratelimited(KERN_WARNING "Error: state manager" + " encountered RPCSEC_GSS session" + " expired against NFSv4 server %s.\n", + s); +} + +static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) +{ + switch (error) { + case -NFS4ERR_CB_PATH_DOWN: + nfs_handle_cb_pathdown(clp); + return 0; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + return 0; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_LEASE_MOVED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_clear_reclaim_reboot(clp); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_EXPIRED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + return 0; + case -EKEYEXPIRED: + /* Nothing we can do */ + nfs4_warn_keyexpired(clp->cl_hostname); + return 0; + } + return error; +} + +static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_state_owner *sp; + struct nfs_server *server; + struct rb_node *pos; + int status = 0; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, + struct nfs4_state_owner, so_server_node); + if (!test_and_clear_bit(ops->owner_flag_bit, + &sp->so_flags)) + continue; + atomic_inc(&sp->so_count); + spin_unlock(&clp->cl_lock); + rcu_read_unlock(); + + status = nfs4_reclaim_open_state(sp, ops); + if (status < 0) { + set_bit(ops->owner_flag_bit, &sp->so_flags); + nfs4_put_state_owner(sp); + return nfs4_recovery_handle_error(clp, status); + } + + nfs4_put_state_owner(sp); + goto restart; + } + spin_unlock(&clp->cl_lock); + } + rcu_read_unlock(); + return status; +} + +static int nfs4_check_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + int status = -NFS4ERR_EXPIRED; + + /* Is the client already known to have an expired lease? 
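[Editor's note] nfs4_recovery_handle_error() above folds the many NFSv4 error codes into a handful of recovery actions by setting bits in clp->cl_state, which the state manager thread then acts on. A reduced sketch of that classification (the action names are invented; the error constants carry the standard NFSv4 values and mirror a few of the cases above):

#include <stdio.h>

#define NFS4ERR_EXPIRED		10011
#define NFS4ERR_STALE_CLIENTID	10022
#define NFS4ERR_BADSESSION	10052

enum recovery_action { RECOVER_NOTHING, RECOVER_REBOOT, RECOVER_NOGRACE, RESET_SESSION };

static enum recovery_action classify(int error)
{
	switch (error) {
	case -NFS4ERR_STALE_CLIENTID:
		return RECOVER_REBOOT;	/* server forgot us: reclaim under grace */
	case -NFS4ERR_EXPIRED:
		return RECOVER_NOGRACE;	/* lease ran out: re-open without grace */
	case -NFS4ERR_BADSESSION:
		return RESET_SESSION;	/* v4.1 session is gone: create a new one */
	default:
		return RECOVER_NOTHING;
	}
}

int main(void)
{
	printf("EXPIRED -> action %d\n", classify(-NFS4ERR_EXPIRED));
	return 0;
}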
*/ + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + return 0; + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) { + cred = nfs4_get_setclientid_cred(clp); + if (cred == NULL) + goto out; + } + status = ops->renew_lease(clp, cred); + put_rpccred(cred); +out: + return nfs4_recovery_handle_error(clp, status); +} + +static int nfs4_reclaim_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; + int status = -ENOENT; + + cred = ops->get_clid_cred(clp); + if (cred != NULL) { + status = ops->establish_clid(clp, cred); + put_rpccred(cred); + /* Handle case where the user hasn't set up machine creds */ + if (status == -EACCES && cred == clp->cl_machine_cred) { + nfs4_clear_machine_cred(clp); + status = -EAGAIN; + } + if (status == -NFS4ERR_MINOR_VERS_MISMATCH) + status = -EPROTONOSUPPORT; + } + return status; +} + +#ifdef CONFIG_NFS_V4_1 +void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ + struct nfs_client *clp = session->clp; + + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + nfs4_schedule_lease_recovery(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); + +void nfs41_handle_recall_slot(struct nfs_client *clp) +{ + set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} + +static void nfs4_reset_all_state(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + clp->cl_boot_time = CURRENT_TIME; + nfs4_state_start_reclaim_nograce(clp); + nfs4_schedule_state_manager(clp); + } +} + +static void nfs41_handle_server_reboot(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + nfs4_state_start_reclaim_reboot(clp); + nfs4_schedule_state_manager(clp); + } +} + +static void nfs41_handle_state_revoked(struct nfs_client *clp) +{ + /* Temporary */ + nfs4_reset_all_state(clp); +} + +static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) +{ + /* This will need to handle layouts too */ + nfs_expire_all_delegations(clp); +} + +static void nfs41_handle_cb_path_down(struct nfs_client *clp) +{ + nfs_expire_all_delegations(clp); + if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) + nfs4_schedule_state_manager(clp); +} + +void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) +{ + if (!flags) + return; + if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + nfs41_handle_server_reboot(clp); + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + SEQ4_STATUS_ADMIN_STATE_REVOKED | + SEQ4_STATUS_LEASE_MOVED)) + nfs41_handle_state_revoked(clp); + if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + nfs41_handle_recallable_state_revoked(clp); + if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + SEQ4_STATUS_BACKCHANNEL_FAULT | + SEQ4_STATUS_CB_PATH_DOWN_SESSION)) + nfs41_handle_cb_path_down(clp); +} + +static int nfs4_reset_session(struct nfs_client *clp) +{ + int status; + + nfs4_begin_drain_session(clp); + status = nfs4_proc_destroy_session(clp->cl_session); + if (status && status != -NFS4ERR_BADSESSION && + status != -NFS4ERR_DEADSESSION) { + status = nfs4_recovery_handle_error(clp, status); + goto out; + } + + memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); + status = nfs4_proc_create_session(clp); + if (status) { + status = nfs4_recovery_handle_error(clp, status); + goto out; + } + 
clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* create_session negotiated new slot table */ + clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); + + /* Let the state manager reestablish state */ + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + nfs41_setup_state_renewal(clp); +out: + return status; +} + +static int nfs4_recall_slot(struct nfs_client *clp) +{ + struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table; + struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs; + struct nfs4_slot *new, *old; + int i; + + nfs4_begin_drain_session(clp); + new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), + GFP_NOFS); + if (!new) + return -ENOMEM; + + spin_lock(&fc_tbl->slot_tbl_lock); + for (i = 0; i < fc_tbl->target_max_slots; i++) + new[i].seq_nr = fc_tbl->slots[i].seq_nr; + old = fc_tbl->slots; + fc_tbl->slots = new; + fc_tbl->max_slots = fc_tbl->target_max_slots; + fc_tbl->target_max_slots = 0; + fc_attrs->max_reqs = fc_tbl->max_slots; + spin_unlock(&fc_tbl->slot_tbl_lock); + + kfree(old); + nfs4_end_drain_session(clp); + return 0; +} + +#else /* CONFIG_NFS_V4_1 */ +static int nfs4_reset_session(struct nfs_client *clp) { return 0; } +static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } +static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } +#endif /* CONFIG_NFS_V4_1 */ + +/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors + * on EXCHANGE_ID for v4.1 + */ +static void nfs4_set_lease_expired(struct nfs_client *clp, int status) +{ + switch (status) { + case -NFS4ERR_CLID_INUSE: + case -NFS4ERR_STALE_CLIENTID: + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + break; + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EAGAIN: + ssleep(1); + break; + + case -EKEYEXPIRED: + nfs4_warn_keyexpired(clp->cl_hostname); + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + default: + return; + } + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); +} + +static void nfs4_state_manager(struct nfs_client *clp) +{ + int status = 0; + + /* Ensure exclusive access to NFSv4 state */ + do { + if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { + /* We're going to have to re-establish a clientid */ + status = nfs4_reclaim_lease(clp); + if (status) { + nfs4_set_lease_expired(clp, status); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, + &clp->cl_state)) + continue; + if (clp->cl_cons_state == + NFS_CS_SESSION_INITING) + nfs_mark_client_ready(clp, status); + goto out_error; + } + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + pnfs_destroy_all_layouts(clp); + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { + status = nfs4_check_lease(clp); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + continue; + if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN) + goto out_error; + } + + /* Initialize or reset the session */ + if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) + && nfs4_has_session(clp)) { + status = nfs4_reset_session(clp); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + /* First recover reboot state... 
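[Editor's note] nfs4_recall_slot() above shrinks the forechannel slot table to the server-requested target by allocating a new array, carrying each slot's sequence number over, and swapping the arrays under the table lock. The same copy-and-swap in a standalone sketch (plain malloc/free instead of the kernel allocators; like the recall case it assumes the target is no larger than the current table):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct slot { unsigned int seq_nr; };

struct slot_table {
	struct slot *slots;
	unsigned int max_slots;
	unsigned int target_max_slots;
};

static int resize_slot_table(struct slot_table *tbl)
{
	struct slot *new, *old;

	new = calloc(tbl->target_max_slots, sizeof(*new));
	if (!new)
		return -1;

	/* carry each surviving slot's sequence number over to the new array */
	for (unsigned int i = 0; i < tbl->target_max_slots; i++)
		new[i].seq_nr = tbl->slots[i].seq_nr;

	old = tbl->slots;
	tbl->slots = new;
	tbl->max_slots = tbl->target_max_slots;
	tbl->target_max_slots = 0;
	free(old);
	return 0;
}

int main(void)
{
	struct slot init[4] = { {5}, {8}, {2}, {7} };
	struct slot_table tbl = { malloc(sizeof(init)), 4, 2 };

	memcpy(tbl.slots, init, sizeof(init));
	if (resize_slot_table(&tbl))
		return 1;
	printf("now %u slots, seq_nr[1] = %u\n", tbl.max_slots, tbl.slots[1].seq_nr);
	free(tbl.slots);
	return 0;
}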
*/ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, + clp->cl_mvops->reboot_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + continue; + nfs4_state_end_reclaim_reboot(clp); + if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + /* Now recover expired state... */ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, + clp->cl_mvops->nograce_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + nfs4_end_drain_session(clp); + if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { + nfs_client_return_marked_delegations(clp); + continue; + } + /* Recall session slots */ + if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state) + && nfs4_has_session(clp)) { + status = nfs4_recall_slot(clp); + if (status < 0) + goto out_error; + continue; + } + + + nfs4_clear_state_manager_bit(clp); + /* Did we race with an attempt to give us more work? */ + if (clp->cl_state == 0) + break; + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + break; + } while (atomic_read(&clp->cl_count) > 1); + return; +out_error: + printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" + " with error %d\n", clp->cl_hostname, -status); + nfs4_end_drain_session(clp); + nfs4_clear_state_manager_bit(clp); +} + +static int nfs4_run_state_manager(void *ptr) +{ + struct nfs_client *clp = ptr; + + allow_signal(SIGKILL); + nfs4_state_manager(clp); + nfs_put_client(clp); + module_put_and_exit(0); + return 0; +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c new file mode 100644 index 00000000..fc97fd53 --- /dev/null +++ b/fs/nfs/nfs4xdr.c @@ -0,0 +1,6679 @@ +/* + * fs/nfs/nfs4xdr.c + * + * Client-side XDR for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
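[Editor's note] The nfs4_state_manager() loop above works through the cl_state bits in a strict order — re-establish the lease, check it, reset the session, reclaim reboot state, then no-grace state — and jumps back to the top whenever an earlier condition reappears. A compressed sketch of that priority loop (bit names abbreviated, handlers replaced by prints):

#include <stdio.h>

enum { LEASE_EXPIRED, CHECK_LEASE, SESSION_RESET, RECLAIM_REBOOT, RECLAIM_NOGRACE, NSTEPS };

static const char *step_name[NSTEPS] = {
	"reclaim lease", "check lease", "reset session",
	"reboot reclaim", "nograce reclaim"
};

static int test_and_clear(unsigned long *state, int bit)
{
	int was_set = (*state >> bit) & 1;

	*state &= ~(1UL << bit);
	return was_set;
}

static void state_manager(unsigned long state)
{
	/*
	 * The kernel loop re-evaluates from the top whenever a handler sets
	 * an earlier bit again; this sketch only shows the priority order
	 * of a single pass.
	 */
	for (int bit = 0; bit < NSTEPS; bit++)
		if (test_and_clear(&state, bit))
			printf("-> %s\n", step_name[bit]);
}

int main(void)
{
	state_manager((1UL << RECLAIM_NOGRACE) | (1UL << LEASE_EXPIRED));
	return 0;
}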
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nfs4_fs.h" +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. */ +#define errno_NFSERR_IO EIO + +static int nfs4_stat_to_errno(int); + +/* NFSv4 COMPOUND tags are only wanted for debugging purposes */ +#ifdef DEBUG +#define NFS4_MAXTAGLEN 20 +#else +#define NFS4_MAXTAGLEN 0 +#endif + +/* lock,open owner id: + * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) + */ +#define open_owner_id_maxsz (1 + 1 + 4) +#define lock_owner_id_maxsz (1 + 1 + 4) +#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) +#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) +#define op_encode_hdr_maxsz (1) +#define op_decode_hdr_maxsz (2) +#define encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define decode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define encode_putfh_maxsz (op_encode_hdr_maxsz + 1 + \ + (NFS4_FHSIZE >> 2)) +#define decode_putfh_maxsz (op_decode_hdr_maxsz) +#define encode_putrootfh_maxsz (op_encode_hdr_maxsz) +#define decode_putrootfh_maxsz (op_decode_hdr_maxsz) +#define encode_getfh_maxsz (op_encode_hdr_maxsz) +#define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +#define nfs4_fattr_bitmap_maxsz 4 +#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) +#define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) +#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +/* This is based on getfattr, which uses the most attributes: */ +#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ + 3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz)) +#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ + nfs4_fattr_value_maxsz) +#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) +#define encode_attrs_maxsz (nfs4_fattr_bitmap_maxsz + \ + 1 + 2 + 1 + \ + nfs4_owner_maxsz + \ + nfs4_group_maxsz + \ + 4 + 4) +#define encode_savefh_maxsz (op_encode_hdr_maxsz) +#define decode_savefh_maxsz (op_decode_hdr_maxsz) +#define encode_restorefh_maxsz (op_encode_hdr_maxsz) +#define decode_restorefh_maxsz (op_decode_hdr_maxsz) +#define encode_fsinfo_maxsz (encode_getattr_maxsz) +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) +#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) +#define decode_renew_maxsz (op_decode_hdr_maxsz) +#define encode_setclientid_maxsz \ + (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ + XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \ + 1 /* sc_prog */ + \ + 
XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \ + 1) /* sc_cb_ident */ +#define decode_setclientid_maxsz \ + (op_decode_hdr_maxsz + \ + 2 + \ + 1024) /* large value for CLID_INUSE */ +#define encode_setclientid_confirm_maxsz \ + (op_encode_hdr_maxsz + \ + 3 + (NFS4_VERIFIER_SIZE >> 2)) +#define decode_setclientid_confirm_maxsz \ + (op_decode_hdr_maxsz) +#define encode_lookup_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_lookup_maxsz (op_decode_hdr_maxsz) +#define encode_share_access_maxsz \ + (2) +#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz) +#define encode_opentype_maxsz (1 + encode_createmode_maxsz) +#define encode_claim_null_maxsz (1 + nfs4_name_maxsz) +#define encode_open_maxsz (op_encode_hdr_maxsz + \ + 2 + encode_share_access_maxsz + 2 + \ + open_owner_id_maxsz + \ + encode_opentype_maxsz + \ + encode_claim_null_maxsz) +#define decode_ace_maxsz (3 + nfs4_owner_maxsz) +#define decode_delegation_maxsz (1 + decode_stateid_maxsz + 1 + \ + decode_ace_maxsz) +#define decode_change_info_maxsz (5) +#define decode_open_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz + \ + decode_change_info_maxsz + 1 + \ + nfs4_fattr_bitmap_maxsz + \ + decode_delegation_maxsz) +#define encode_open_confirm_maxsz \ + (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 1) +#define decode_open_confirm_maxsz \ + (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_open_downgrade_maxsz \ + (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 1 + \ + encode_share_access_maxsz) +#define decode_open_downgrade_maxsz \ + (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_close_maxsz (op_encode_hdr_maxsz + \ + 1 + encode_stateid_maxsz) +#define decode_close_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_setattr_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + encode_attrs_maxsz) +#define decode_setattr_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz) +#define encode_read_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 3) +#define decode_read_maxsz (op_decode_hdr_maxsz + 2) +#define encode_readdir_maxsz (op_encode_hdr_maxsz + \ + 2 + encode_verifier_maxsz + 5) +#define decode_readdir_maxsz (op_decode_hdr_maxsz + \ + decode_verifier_maxsz) +#define encode_readlink_maxsz (op_encode_hdr_maxsz) +#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) +#define encode_write_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 4) +#define decode_write_maxsz (op_decode_hdr_maxsz + \ + 2 + decode_verifier_maxsz) +#define encode_commit_maxsz (op_encode_hdr_maxsz + 3) +#define decode_commit_maxsz (op_decode_hdr_maxsz + \ + decode_verifier_maxsz) +#define encode_remove_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) +#define decode_remove_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz) +#define encode_rename_maxsz (op_encode_hdr_maxsz + \ + 2 * nfs4_name_maxsz) +#define decode_rename_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz + \ + decode_change_info_maxsz) +#define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) +#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_lockowner_maxsz (7) +#define encode_lock_maxsz (op_encode_hdr_maxsz + \ + 7 + \ + 1 + encode_stateid_maxsz + 1 + \ + encode_lockowner_maxsz) +#define decode_lock_denied_maxsz \ + (8 + decode_lockowner_maxsz) +#define decode_lock_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +#define encode_lockt_maxsz (op_encode_hdr_maxsz 
+ 5 + \ + encode_lockowner_maxsz) +#define decode_lockt_maxsz (op_decode_hdr_maxsz + \ + decode_lock_denied_maxsz) +#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ + encode_stateid_maxsz + \ + 4) +#define decode_locku_maxsz (op_decode_hdr_maxsz + \ + decode_stateid_maxsz) +#define encode_release_lockowner_maxsz \ + (op_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define decode_release_lockowner_maxsz \ + (op_decode_hdr_maxsz) +#define encode_access_maxsz (op_encode_hdr_maxsz + 1) +#define decode_access_maxsz (op_decode_hdr_maxsz + 2) +#define encode_symlink_maxsz (op_encode_hdr_maxsz + \ + 1 + nfs4_name_maxsz + \ + 1 + \ + nfs4_fattr_maxsz) +#define decode_symlink_maxsz (op_decode_hdr_maxsz + 8) +#define encode_create_maxsz (op_encode_hdr_maxsz + \ + 1 + 2 + nfs4_name_maxsz + \ + encode_attrs_maxsz) +#define decode_create_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz + \ + nfs4_fattr_bitmap_maxsz) +#define encode_statfs_maxsz (encode_getattr_maxsz) +#define decode_statfs_maxsz (decode_getattr_maxsz) +#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) +#define decode_delegreturn_maxsz (op_decode_hdr_maxsz) +#define encode_getacl_maxsz (encode_getattr_maxsz) +#define decode_getacl_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz + 1) +#define encode_setacl_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 3) +#define decode_setacl_maxsz (decode_setattr_maxsz) +#define encode_fs_locations_maxsz \ + (encode_getattr_maxsz) +#define decode_fs_locations_maxsz \ + (0) +#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) + +#if defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MACHINE_NAME_LEN (64) + +#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ + encode_verifier_maxsz + \ + 1 /* co_ownerid.len */ + \ + XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ + 1 /* flags */ + \ + 1 /* spa_how */ + \ + 0 /* SP4_NONE (for now) */ + \ + 1 /* zero implemetation id array */) +#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ + 2 /* eir_clientid */ + \ + 1 /* eir_sequenceid */ + \ + 1 /* eir_flags */ + \ + 1 /* spr_how */ + \ + 0 /* SP4_NONE (for now) */ + \ + 2 /* eir_server_owner.so_minor_id */ + \ + /* eir_server_owner.so_major_id<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + /* eir_server_scope<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + 1 /* eir_server_impl_id array length */ + \ + 0 /* ignored eir_server_impl_id contents */) +#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */) +#define decode_channel_attrs_maxsz (6 + \ + 1 /* ca_rdma_ird.len */ + \ + 1 /* ca_rdma_ird */) +#define encode_create_session_maxsz (op_encode_hdr_maxsz + \ + 2 /* csa_clientid */ + \ + 1 /* csa_sequence */ + \ + 1 /* csa_flags */ + \ + encode_channel_attrs_maxsz + \ + encode_channel_attrs_maxsz + \ + 1 /* csa_cb_program */ + \ + 1 /* csa_sec_parms.len (1) */ + \ + 1 /* cb_secflavor (AUTH_SYS) */ + \ + 1 /* stamp */ + \ + 1 /* machinename.len */ + \ + XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \ + 1 /* uid */ + \ + 1 /* gid */ + \ + 1 /* gids.len (0) */) +#define decode_create_session_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* csr_sequence */ + \ + 1 /* csr_flags */ + \ + decode_channel_attrs_maxsz + \ + decode_channel_attrs_maxsz) +#define encode_destroy_session_maxsz (op_encode_hdr_maxsz + 4) +#define decode_destroy_session_maxsz (op_decode_hdr_maxsz) +#define encode_sequence_maxsz 
(op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4) +#define decode_sequence_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) +#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) +#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) +#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ + 1 /* layout type */ + \ + 1 /* opaque devaddr4 length */ + \ + /* devaddr4 payload is read into page */ \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap */) +#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ + encode_stateid_maxsz) +#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) +#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + 1 /* reclaim */ + \ + encode_stateid_maxsz + \ + 1 /* new offset (true) */ + \ + 2 /* last byte written */ + \ + 1 /* nt_timechanged (false) */ + \ + 1 /* layoutupdate4 layout type */ + \ + 1 /* NULL filelayout layoutupdate4 payload */) +#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + 1 /* FIXME: opaque lrf_body always empty at the moment */) +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ + 1 + decode_stateid_maxsz) +#else /* CONFIG_NFS_V4_1 */ +#define encode_sequence_maxsz 0 +#define decode_sequence_maxsz 0 +#endif /* CONFIG_NFS_V4_1 */ + +#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ +#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */ +#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_read_maxsz) +#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_read_maxsz) +#define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_readlink_maxsz) +#define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_readlink_maxsz) +#define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_readdir_maxsz) +#define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_readdir_maxsz) +#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_write_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_write_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_commit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_commit_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_open_maxsz + \ + encode_getfh_maxsz + \ + encode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + 
decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_open_maxsz + \ + decode_getfh_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_open_confirm_sz \ + (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_open_confirm_maxsz) +#define NFS4_dec_open_confirm_sz \ + (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_open_confirm_maxsz) +#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_open_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_open_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_open_downgrade_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_open_downgrade_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_downgrade_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_open_downgrade_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_close_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_close_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_setattr_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_setattr_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_renew_sz (compound_decode_hdr_maxsz + \ + decode_renew_maxsz) +#define NFS4_enc_setclientid_sz (compound_encode_hdr_maxsz + \ + encode_setclientid_maxsz) +#define NFS4_dec_setclientid_sz (compound_decode_hdr_maxsz + \ + decode_setclientid_maxsz) +#define NFS4_enc_setclientid_confirm_sz \ + (compound_encode_hdr_maxsz + \ + encode_setclientid_confirm_maxsz + \ + encode_putrootfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_setclientid_confirm_sz \ + (compound_decode_hdr_maxsz + \ + decode_setclientid_confirm_maxsz + \ + decode_putrootfh_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lock_maxsz) +#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lock_maxsz) +#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lockt_maxsz) +#define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lockt_maxsz) +#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_locku_maxsz) +#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_locku_maxsz) +#define NFS4_enc_release_lockowner_sz \ + 
(compound_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define NFS4_dec_release_lockowner_sz \ + (compound_decode_hdr_maxsz + \ + decode_lockowner_maxsz) +#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_access_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_access_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lookup_maxsz + \ + encode_getattr_maxsz + \ + encode_getfh_maxsz) +#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lookup_maxsz + \ + decode_getattr_maxsz + \ + decode_getfh_maxsz) +#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz + \ + encode_getattr_maxsz + \ + encode_getfh_maxsz) +#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_getattr_maxsz + \ + decode_getfh_maxsz) +#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_remove_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_remove_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_rename_maxsz + \ + encode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_rename_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_link_maxsz + \ + decode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_link_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_symlink_maxsz + \ + encode_getattr_maxsz + \ + encode_getfh_maxsz) +#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_symlink_maxsz + \ + decode_getattr_maxsz + \ + decode_getfh_maxsz) +#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_create_maxsz + \ + encode_getfh_maxsz + \ + encode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_create_sz 
(compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_create_maxsz + \ + decode_getfh_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_statfs_maxsz) +#define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_statfs_maxsz) +#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_delegreturn_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_delegreturn_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getacl_maxsz) +#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getacl_maxsz) +#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_setacl_maxsz) +#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_setacl_maxsz) +#define NFS4_enc_fs_locations_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_lookup_maxsz + \ + encode_fs_locations_maxsz) +#define NFS4_dec_fs_locations_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_lookup_maxsz + \ + decode_fs_locations_maxsz) +#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_secinfo_maxsz) +#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_secinfo_maxsz) +#if defined(CONFIG_NFS_V4_1) +#define NFS4_enc_exchange_id_sz \ + (compound_encode_hdr_maxsz + \ + encode_exchange_id_maxsz) +#define NFS4_dec_exchange_id_sz \ + (compound_decode_hdr_maxsz + \ + decode_exchange_id_maxsz) +#define NFS4_enc_create_session_sz \ + (compound_encode_hdr_maxsz + \ + encode_create_session_maxsz) +#define NFS4_dec_create_session_sz \ + (compound_decode_hdr_maxsz + \ + decode_create_session_maxsz) +#define NFS4_enc_destroy_session_sz (compound_encode_hdr_maxsz + \ + encode_destroy_session_maxsz) +#define NFS4_dec_destroy_session_sz (compound_decode_hdr_maxsz + \ + decode_destroy_session_maxsz) +#define NFS4_enc_sequence_sz \ + (compound_decode_hdr_maxsz + \ + encode_sequence_maxsz) +#define NFS4_dec_sequence_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz) +#define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_get_lease_time_sz 
(compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_reclaim_complete_maxsz) +#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) +#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_getdeviceinfo_maxsz) +#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutget_maxsz) +#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) +#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_putfh_maxsz + \ + encode_layoutcommit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutcommit_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutreturn_maxsz) +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutreturn_maxsz) + +const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + + encode_sequence_maxsz + + encode_putfh_maxsz + + encode_getattr_maxsz) * + XDR_UNIT); + +const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz) * + XDR_UNIT); +#endif /* CONFIG_NFS_V4_1 */ + +static const umode_t nfs_type2fmt[] = { + [NF4BAD] = 0, + [NF4REG] = S_IFREG, + [NF4DIR] = S_IFDIR, + [NF4BLK] = S_IFBLK, + [NF4CHR] = S_IFCHR, + [NF4LNK] = S_IFLNK, + [NF4SOCK] = S_IFSOCK, + [NF4FIFO] = S_IFIFO, + [NF4ATTRDIR] = 0, + [NF4NAMEDATTR] = 0, +}; + +struct compound_hdr { + int32_t status; + uint32_t nops; + __be32 * nops_p; + uint32_t taglen; + char * tag; + uint32_t replen; /* expected reply words */ + u32 minorversion; +}; + +static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr_reserve_space(xdr, nbytes); + BUG_ON(!p); + return p; +} + +static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + len); + BUG_ON(p == NULL); + xdr_encode_opaque(p, str, len); +} + +static void encode_compound_hdr(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct compound_hdr *hdr) +{ + __be32 *p; + struct rpc_auth *auth = req->rq_cred->cr_auth; + + /* initialize running count of expected bytes in reply. + * NOTE: the replied tag SHOULD be the same is the one sent, + * but this is not required as a MUST for the server to do so. 
*/ + hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; + + dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); + BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); + p = reserve_space(xdr, 4 + hdr->taglen + 8); + p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); + *p++ = cpu_to_be32(hdr->minorversion); + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); +} + +static void encode_nops(struct compound_hdr *hdr) +{ + BUG_ON(hdr->nops > NFS4_MAX_OPS); + *hdr->nops_p = htonl(hdr->nops); +} + +static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + BUG_ON(p == NULL); + xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); +} + +static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) +{ + char owner_name[IDMAP_NAMESZ]; + char owner_group[IDMAP_NAMESZ]; + int owner_namelen = 0; + int owner_grouplen = 0; + __be32 *p; + __be32 *q; + int len; + uint32_t bmval0 = 0; + uint32_t bmval1 = 0; + + /* + * We reserve enough space to write the entire attribute buffer at once. + * In the worst-case, this would be + * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) + * = 36 bytes, plus any contribution from variable-length fields + * such as owner/group. + */ + len = 16; + + /* Sigh */ + if (iap->ia_valid & ATTR_SIZE) + len += 8; + if (iap->ia_valid & ATTR_MODE) + len += 4; + if (iap->ia_valid & ATTR_UID) { + owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); + if (owner_namelen < 0) { + dprintk("nfs: couldn't resolve uid %d to string\n", + iap->ia_uid); + /* XXX */ + strcpy(owner_name, "nobody"); + owner_namelen = sizeof("nobody") - 1; + /* goto out; */ + } + len += 4 + (XDR_QUADLEN(owner_namelen) << 2); + } + if (iap->ia_valid & ATTR_GID) { + owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); + if (owner_grouplen < 0) { + dprintk("nfs: couldn't resolve gid %d to string\n", + iap->ia_gid); + strcpy(owner_group, "nobody"); + owner_grouplen = sizeof("nobody") - 1; + /* goto out; */ + } + len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); + } + if (iap->ia_valid & ATTR_ATIME_SET) + len += 16; + else if (iap->ia_valid & ATTR_ATIME) + len += 4; + if (iap->ia_valid & ATTR_MTIME_SET) + len += 16; + else if (iap->ia_valid & ATTR_MTIME) + len += 4; + p = reserve_space(xdr, len); + + /* + * We write the bitmap length now, but leave the bitmap and the attribute + * buffer length to be backfilled at the end of this routine. 
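+ * (q below marks the three reserved words, i.e. the two bitmap words and the attribute length, which are backfilled once the values are known.)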
+ */ + *p++ = cpu_to_be32(2); + q = p; + p += 3; + + if (iap->ia_valid & ATTR_SIZE) { + bmval0 |= FATTR4_WORD0_SIZE; + p = xdr_encode_hyper(p, iap->ia_size); + } + if (iap->ia_valid & ATTR_MODE) { + bmval1 |= FATTR4_WORD1_MODE; + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); + } + if (iap->ia_valid & ATTR_UID) { + bmval1 |= FATTR4_WORD1_OWNER; + p = xdr_encode_opaque(p, owner_name, owner_namelen); + } + if (iap->ia_valid & ATTR_GID) { + bmval1 |= FATTR4_WORD1_OWNER_GROUP; + p = xdr_encode_opaque(p, owner_group, owner_grouplen); + } + if (iap->ia_valid & ATTR_ATIME_SET) { + bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_atime.tv_sec); + *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + } + else if (iap->ia_valid & ATTR_ATIME) { + bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); + } + if (iap->ia_valid & ATTR_MTIME_SET) { + bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + } + else if (iap->ia_valid & ATTR_MTIME) { + bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); + } + + /* + * Now we backfill the bitmap and the attribute buffer length. + */ + if (len != ((char *)p - (char *)q) + 4) { + printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", + len, ((char *)p - (char *)q) + 4); + BUG(); + } + len = (char *)p - (char *)q - 12; + *q++ = htonl(bmval0); + *q++ = htonl(bmval1); + *q = htonl(len); + +/* out: */ +} + +static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_ACCESS); + *p = cpu_to_be32(access); + hdr->nops++; + hdr->replen += decode_access_maxsz; +} + +static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); + xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_close_maxsz; +} + +static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_COMMIT); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); + hdr->nops++; + hdr->replen += decode_commit_maxsz; +} + +static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_CREATE); + *p = cpu_to_be32(create->ftype); + + switch (create->ftype) { + case NF4LNK: + p = reserve_space(xdr, 4); + *p = cpu_to_be32(create->u.symlink.len); + xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); + break; + + case NF4BLK: case NF4CHR: + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(create->u.device.specdata1); + *p = cpu_to_be32(create->u.device.specdata2); + break; + + default: + break; + } + + encode_string(xdr, create->name->len, create->name->name); + hdr->nops++; + hdr->replen += decode_create_maxsz; + + encode_attrs(xdr, create->attrs, create->server); +} + +static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 
12); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bitmap); + hdr->nops++; + hdr->replen += decode_getattr_maxsz; +} + +static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); + hdr->nops++; + hdr->replen += decode_getattr_maxsz; +} + +static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], + bitmask[1] & nfs4_fattr_bitmap[1], hdr); +} + +static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], + bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); +} + +static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0], + bitmask[1] & nfs4_fs_locations_bitmap[1], hdr); +} + +static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_GETFH); + hdr->nops++; + hdr->replen += decode_getfh_maxsz; +} + +static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_LINK); + xdr_encode_opaque(p, name->name, name->len); + hdr->nops++; + hdr->replen += decode_link_maxsz; +} + +static inline int nfs4_lock_type(struct file_lock *fl, int block) +{ + if ((fl->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) == F_RDLCK) + return block ? NFS4_READW_LT : NFS4_READ_LT; + return block ? 
NFS4_WRITEW_LT : NFS4_WRITE_LT; +} + +static inline uint64_t nfs4_lock_length(struct file_lock *fl) +{ + if (fl->fl_end == OFFSET_MAX) + return ~(uint64_t)0; + return fl->fl_end - fl->fl_start + 1; +} + +static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) +{ + __be32 *p; + + p = reserve_space(xdr, 32); + p = xdr_encode_hyper(p, lowner->clientid); + *p++ = cpu_to_be32(20); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + *p++ = cpu_to_be32(lowner->s_dev); + xdr_encode_hyper(p, lowner->id); +} + +/* + * opcode,type,reclaim,offset,length,new_lock_owner = 32 + * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 + */ +static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 32); + *p++ = cpu_to_be32(OP_LOCK); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); + *p++ = cpu_to_be32(args->reclaim); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); + if (args->new_lock_owner){ + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); + encode_lockowner(xdr, &args->lock_owner); + } + else { + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); + p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); + } + hdr->nops++; + hdr->replen += decode_lock_maxsz; +} + +static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 24); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + encode_lockowner(xdr, &args->lock_owner); + hdr->nops++; + hdr->replen += decode_lockt_maxsz; +} + +static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + hdr->nops++; + hdr->replen += decode_locku_maxsz; +} + +static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); + encode_lockowner(xdr, lowner); + hdr->nops++; + hdr->replen += decode_release_lockowner_maxsz; +} + +static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + int len = name->len; + __be32 *p; + + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_LOOKUP); + xdr_encode_opaque(p, name->name, len); + hdr->nops++; + hdr->replen += decode_lookup_maxsz; +} + +static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ: + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); + break; + case 
FMODE_WRITE: + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); + break; + case FMODE_READ|FMODE_WRITE: + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); + break; + default: + *p++ = cpu_to_be32(0); + } + *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ +} + +static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) +{ + __be32 *p; + /* + * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, + * owner 4 = 32 + */ + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_OPEN); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + p = reserve_space(xdr, 32); + p = xdr_encode_hyper(p, arg->clientid); + *p++ = cpu_to_be32(20); + p = xdr_encode_opaque_fixed(p, "open id:", 8); + *p++ = cpu_to_be32(arg->server->s_dev); + xdr_encode_hyper(p, arg->id); +} + +static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) +{ + __be32 *p; + struct nfs_client *clp; + + p = reserve_space(xdr, 4); + switch(arg->open_flags & O_EXCL) { + case 0: + *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); + encode_attrs(xdr, arg->u.attrs, arg->server); + break; + default: + clp = arg->server->nfs_client; + if (clp->cl_mvops->minor_version > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); + } else { + struct iattr dummy; + + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); + encode_nfs4_verifier(xdr, &arg->u.verifier); + dummy.ia_valid = 0; + encode_attrs(xdr, &dummy, arg->server); + } + } else { + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, &arg->u.verifier); + } + } +} + +static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + switch (arg->open_flags & O_CREAT) { + case 0: + *p = cpu_to_be32(NFS4_OPEN_NOCREATE); + break; + default: + BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); + *p = cpu_to_be32(NFS4_OPEN_CREATE); + encode_createmode(xdr, arg); + } +} + +static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + switch (delegation_type) { + case 0: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); + break; + case FMODE_READ: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); + break; + case FMODE_WRITE|FMODE_READ: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); + break; + default: + BUG(); + } +} + +static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); + encode_string(xdr, name->len, name->name); +} + +static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); + encode_delegation_type(xdr, type); +} + +static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); + encode_string(xdr, name->len, name->name); +} + +static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) +{ + encode_openhdr(xdr, arg); + encode_opentype(xdr, arg); + switch (arg->claim) { + case NFS4_OPEN_CLAIM_NULL: + encode_claim_null(xdr, arg->name); + break; + case 
NFS4_OPEN_CLAIM_PREVIOUS: + encode_claim_previous(xdr, arg->u.delegation_type); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); + break; + default: + BUG(); + } + hdr->nops++; + hdr->replen += decode_open_maxsz; +} + +static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + hdr->nops++; + hdr->replen += decode_open_confirm_maxsz; +} + +static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); + encode_share_access(xdr, arg->fmode); + hdr->nops++; + hdr->replen += decode_open_downgrade_maxsz; +} + +static void +encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr) +{ + int len = fh->size; + __be32 *p; + + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_PUTFH); + xdr_encode_opaque(p, fh->data, len); + hdr->nops++; + hdr->replen += decode_putfh_maxsz; +} + +static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_PUTROOTFH); + hdr->nops++; + hdr->replen += decode_putrootfh_maxsz; +} + +static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid) +{ + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, NFS4_STATEID_SIZE); + if (ctx->state != NULL) { + nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); + if (zero_seqid) + stateid.stateid.seqid = 0; + xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); + } else + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); +} + +static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); + + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); + + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); + hdr->nops++; + hdr->replen += decode_read_maxsz; +} + +static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) +{ + uint32_t attrs[2] = { + FATTR4_WORD0_RDATTR_ERROR, + FATTR4_WORD1_MOUNTED_ON_FILEID, + }; + uint32_t dircount = readdir->count >> 1; + __be32 *p; + + if (readdir->plus) { + attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| + FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| + FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| + FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| + FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + dircount >>= 1; + } + /* Use mounted_on_fileid only if the server supports it */ + if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) + attrs[0] |= FATTR4_WORD0_FILEID; + + p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); + *p++ = 
cpu_to_be32(OP_READDIR); + p = xdr_encode_hyper(p, readdir->cookie); + p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(dircount); + *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(2); + + *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); + *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); + hdr->nops++; + hdr->replen += decode_readdir_maxsz; + dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", + __func__, + (unsigned long long)readdir->cookie, + ((u32 *)readdir->verifier.data)[0], + ((u32 *)readdir->verifier.data)[1], + attrs[0] & readdir->bitmask[0], + attrs[1] & readdir->bitmask[1]); +} + +static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READLINK); + hdr->nops++; + hdr->replen += decode_readlink_maxsz; +} + +static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_REMOVE); + xdr_encode_opaque(p, name->name, name->len); + hdr->nops++; + hdr->replen += decode_remove_maxsz; +} + +static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RENAME); + encode_string(xdr, oldname->len, oldname->name); + encode_string(xdr, newname->len, newname->name); + hdr->nops++; + hdr->replen += decode_rename_maxsz; +} + +static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_RENEW); + xdr_encode_hyper(p, client_stateid->cl_clientid); + hdr->nops++; + hdr->replen += decode_renew_maxsz; +} + +static void +encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RESTOREFH); + hdr->nops++; + hdr->replen += decode_restorefh_maxsz; +} + +static void +encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); + BUG_ON(arg->acl_len % 4); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->acl_len); + xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); + hdr->nops++; + hdr->replen += decode_setacl_maxsz; +} + +static void +encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_SAVEFH); + hdr->nops++; + hdr->replen += decode_savefh_maxsz; +} + +static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_setattr_maxsz; + encode_attrs(xdr, arg->iap, server); +} + +static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4 + 
NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID); + xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); + + encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_prog); + encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); + encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_cb_ident); + hdr->nops++; + hdr->replen += decode_setclientid_maxsz; +} + +static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); + p = xdr_encode_hyper(p, arg->clientid); + xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE); + hdr->nops++; + hdr->replen += decode_setclientid_confirm_maxsz; +} + +static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); + + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); + + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); + hdr->nops++; + hdr->replen += decode_write_maxsz; +} + +static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + + *p++ = cpu_to_be32(OP_DELEGRETURN); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_delegreturn_maxsz; +} + +static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + int len = name->len; + __be32 *p; + + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_SECINFO); + xdr_encode_opaque(p, name->name, len); + hdr->nops++; + hdr->replen += decode_secinfo_maxsz; +} + +#if defined(CONFIG_NFS_V4_1) +/* NFSv4.1 operations */ +static void encode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); + *p++ = cpu_to_be32(OP_EXCHANGE_ID); + xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); + + encode_string(xdr, args->id_len, args->id); + + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(args->flags); + *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ + *p = cpu_to_be32(0); /* zero length implementation id array */ + hdr->nops++; + hdr->replen += decode_exchange_id_maxsz; +} + +static void encode_create_session(struct xdr_stream *xdr, + struct nfs41_create_session_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; + uint32_t len; + struct nfs_client *clp = args->client; + u32 max_resp_sz_cached; + + /* + * Assumes OPEN is the biggest non-idempotent compound. + * 2 is the verifier. 
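+ * The maxsz values are counted in XDR words, so the sum is multiplied by XDR_UNIT to get the cached reply size in bytes.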
+ */ + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + + RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; + + len = scnprintf(machine_name, sizeof(machine_name), "%s", + clp->cl_ipaddr); + + p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); + *p++ = cpu_to_be32(OP_CREATE_SESSION); + p = xdr_encode_hyper(p, clp->cl_clientid); + *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + *p++ = cpu_to_be32(args->flags); /*flags */ + + /* Fore Channel */ + *p++ = cpu_to_be32(0); /* header padding size */ + *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + /* Back Channel */ + *p++ = cpu_to_be32(0); /* header padding size */ + *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + *p++ = cpu_to_be32(args->cb_program); /* cb_program */ + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ + + /* authsys_parms rfc1831 */ + *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ + p = xdr_encode_opaque(p, machine_name, len); + *p++ = cpu_to_be32(0); /* UID */ + *p++ = cpu_to_be32(0); /* GID */ + *p = cpu_to_be32(0); /* No more gids */ + hdr->nops++; + hdr->replen += decode_create_session_maxsz; +} + +static void encode_destroy_session(struct xdr_stream *xdr, + struct nfs4_session *session, + struct compound_hdr *hdr) +{ + __be32 *p; + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(OP_DESTROY_SESSION); + xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + hdr->nops++; + hdr->replen += decode_destroy_session_maxsz; +} + +static void encode_reclaim_complete(struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE); + *p++ = cpu_to_be32(args->one_fs); + hdr->nops++; + hdr->replen += decode_reclaim_complete_maxsz; +} +#endif /* CONFIG_NFS_V4_1 */ + +static void encode_sequence(struct xdr_stream *xdr, + const struct nfs4_sequence_args *args, + struct compound_hdr *hdr) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session = args->sa_session; + struct nfs4_slot_table *tp; + struct nfs4_slot *slot; + __be32 *p; + + if (!session) + return; + + tp = &session->fc_slot_table; + + WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); + slot = tp->slots + args->sa_slotid; + + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); + *p++ = cpu_to_be32(OP_SEQUENCE); + + /* + * Sessionid + seqid + slotid + max slotid + cache_this + */ + dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d " + "max_slotid=%d cache_this=%d\n", + __func__, + ((u32 *)session->sess_id.data)[0], + ((u32 *)session->sess_id.data)[1], + ((u32 *)session->sess_id.data)[2], + ((u32 *)session->sess_id.data)[3], + slot->seq_nr, args->sa_slotid, + tp->highest_used_slotid, args->sa_cache_this); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, 
NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(slot->seq_nr); + *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(tp->highest_used_slotid); + *p = cpu_to_be32(args->sa_cache_this); + hdr->nops++; + hdr->replen += decode_sequence_maxsz; +#endif /* CONFIG_NFS_V4_1 */ +} + +#ifdef CONFIG_NFS_V4_1 +static void +encode_getdeviceinfo(struct xdr_stream *xdr, + const struct nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(OP_GETDEVICEINFO); + p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(args->pdev->layout_type); + *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ + *p++ = cpu_to_be32(0); /* bitmap length 0 */ + hdr->nops++; + hdr->replen += decode_getdeviceinfo_maxsz; +} + +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTGET); + *p++ = cpu_to_be32(0); /* Signal layout available */ + *p++ = cpu_to_be32(args->type); + *p++ = cpu_to_be32(args->range.iomode); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + p = xdr_encode_hyper(p, args->minlength); + p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->maxcount); + + dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", + __func__, + args->type, + args->range.iomode, + (unsigned long)args->range.offset, + (unsigned long)args->range.length, + args->maxcount); + hdr->nops++; + hdr->replen += decode_layoutget_maxsz; +} + +static int +encode_layoutcommit(struct xdr_stream *xdr, + struct inode *inode, + const struct nfs4_layoutcommit_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, + NFS_SERVER(args->inode)->pnfs_curr_ld->id); + + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); + /* Only whole file layouts */ + p = xdr_encode_hyper(p, 0); /* offset */ + p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ + *p++ = cpu_to_be32(0); /* reclaim */ + p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ + *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ + + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) + NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( + NFS_I(inode)->layout, xdr, args); + else { + p = reserve_space(xdr, 4); + *p = cpu_to_be32(0); /* no layout-type payload */ + } + + hdr->nops++; + hdr->replen += decode_layoutcommit_maxsz; + return 0; +} + +static void +encode_layoutreturn(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(OP_LAYOUTRETURN); + *p++ = cpu_to_be32(0); /* reclaim. 
always 0 for now */ + *p++ = cpu_to_be32(args->layout_type); + *p++ = cpu_to_be32(IOMODE_ANY); + *p = cpu_to_be32(RETURN_FILE); + p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, 0); + p = xdr_encode_hyper(p, NFS4_MAX_UINT64); + spin_lock(&args->inode->i_lock); + xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); + spin_unlock(&args->inode->i_lock); + if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { + NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( + NFS_I(args->inode)->layout, xdr, args); + } else { + p = reserve_space(xdr, 4); + *p = cpu_to_be32(0); + } + hdr->nops++; + hdr->replen += decode_layoutreturn_maxsz; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * END OF "GENERIC" ENCODE ROUTINES. + */ + +static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) +{ +#if defined(CONFIG_NFS_V4_1) + if (args->sa_session) + return args->sa_session->clp->cl_mvops->minor_version; +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + +/* + * Encode an ACCESS request + */ +static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_accessargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_access(xdr, args->access, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LOOKUP request + */ +static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_lookup_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LOOKUP_ROOT request + */ +static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_lookup_root_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode REMOVE request + */ +static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_remove(xdr, &args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode RENAME request + */ +static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->old_dir, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->new_dir, &hdr); + encode_rename(xdr, args->old_name, args->new_name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + 
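+ /* switch back to the saved source-directory filehandle so its post-operation attributes can be fetched as well */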
encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LINK request + */ +static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_link_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_link(xdr, args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode CREATE request + */ +static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_create(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode SYMLINK request + */ +static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) +{ + nfs4_xdr_enc_create(req, xdr, args); +} + +/* + * Encode GETATTR request + */ +static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_getattr_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a CLOSE request + */ +static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_closeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_close(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN request + */ +static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_openargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_open(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN_CONFIRM request + */ +static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_open_confirmargs *args) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_confirm(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN request with no attributes. 
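+ * (unlike nfs4_xdr_enc_open above, no SAVEFH, GETFH or RESTOREFH operations are included in this compound)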
+ */ +static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_openargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode an OPEN_DOWNGRADE request + */ +static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_closeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_downgrade(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a LOCK request + */ +static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lock_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lock(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a LOCKT request + */ +static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lockt_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lockt(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a LOCKU request + */ +static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_locku_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_locku(xdr, args, &hdr); + encode_nops(&hdr); +} + +static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_release_lockowner_args *args) +{ + struct compound_hdr hdr = { + .minorversion = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_release_lockowner(xdr, &args->lock_owner, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a READLINK request + */ +static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readlink *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readlink(xdr, args, req, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, + args->pgbase, args->pglen); + encode_nops(&hdr); +} + +/* + * Encode a READDIR request + */ +static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readdir_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readdir(xdr, args, req, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, 
args->pages, + args->pgbase, args->count); + dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", + __func__, hdr.replen << 2, args->pages, + args->pgbase, args->count); + encode_nops(&hdr); +} + +/* + * Encode a READ request + */ +static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_read(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->pages, args->pgbase, args->count); + req->rq_rcv_buf.flags |= XDRBUF_READ; + encode_nops(&hdr); +} + +/* + * Encode an SETATTR request + */ +static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setattrargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setattr(xdr, args, args->server, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode a GETACL request + */ +static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_getaclargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; + encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, + args->acl_pages, args->acl_pgbase, args->acl_len); + encode_nops(&hdr); +} + +/* + * Encode a WRITE request + */ +static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_write(xdr, args, &hdr); + req->rq_snd_buf.flags |= XDRBUF_WRITE; + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * a COMMIT request + */ +static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_commit(xdr, args, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * FSINFO request + */ +static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_fsinfo(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * a PATHCONF request + */ +static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_pathconf_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = 
nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], + &hdr); + encode_nops(&hdr); +} + +/* + * a STATFS request + */ +static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_statfs_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], + args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); + encode_nops(&hdr); +} + +/* + * GETATTR_BITMAP request + */ +static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| + FATTR4_WORD0_LINK_SUPPORT| + FATTR4_WORD0_SYMLINK_SUPPORT| + FATTR4_WORD0_ACLSUPPORT, &hdr); + encode_nops(&hdr); +} + +/* + * a RENEW request + */ +static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_client *clp) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_renew(xdr, clp, &hdr); + encode_nops(&hdr); +} + +/* + * a SETCLIENTID request + */ +static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid *sc) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid(xdr, sc, &hdr); + encode_nops(&hdr); +} + +/* + * a SETCLIENTID_CONFIRM request + */ +static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *arg) +{ + struct compound_hdr hdr = { + .nops = 0, + }; + const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid_confirm(xdr, arg, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); + encode_nops(&hdr); +} + +/* + * DELEGRETURN request + */ +static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_delegreturnargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_delegreturn(xdr, args->stateid, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode FS_LOCATIONS request + */ +static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + replen = hdr.replen; /* get the attribute into args->page */ + encode_fs_locations(xdr, args->bitmask, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, + 0, 
PAGE_SIZE); + encode_nops(&hdr); +} + +/* + * Encode SECINFO request + */ +static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_secinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_secinfo(xdr, args->name, &hdr); + encode_nops(&hdr); +} + +#if defined(CONFIG_NFS_V4_1) +/* + * EXCHANGE_ID request + */ +static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args) +{ + struct compound_hdr hdr = { + .minorversion = args->client->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_exchange_id(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a CREATE_SESSION request + */ +static void nfs4_xdr_enc_create_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_create_session_args *args) +{ + struct compound_hdr hdr = { + .minorversion = args->client->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_create_session(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a DESTROY_SESSION request + */ +static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_session *session) +{ + struct compound_hdr hdr = { + .minorversion = session->clp->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_destroy_session(xdr, session, &hdr); + encode_nops(&hdr); +} + +/* + * a SEQUENCE request + */ +static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_sequence_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a GET_LEASE_TIME request + */ +static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_get_lease_time_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; + const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->la_seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); + encode_nops(&hdr); +} + +/* + * a RECLAIM_COMPLETE request + */ +static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args) + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_reclaim_complete(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode GETDEVICEINFO request + */ +static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(xdr, args, &hdr); + + /* set up reply kvec. 
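The reply's opaque device address data is received directly into args->pdev->pages.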
Subtract notification bitmap max size (2) + * so that notification bitmap is put in xdr_buf tail */ + xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, + args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen); + + encode_nops(&hdr); +} + +/* + * Encode LAYOUTGET request + */ +static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutget_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->layout.pages, 0, args->layout.pglen); + + encode_nops(&hdr); +} + +/* + * Encode LAYOUTCOMMIT request + */ +static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_args *args) +{ + struct nfs4_layoutcommit_data *data = + container_of(args, struct nfs4_layoutcommit_data, args); + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutcommit(xdr, data->args.inode, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LAYOUTRETURN request + */ +static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutreturn(xdr, args, &hdr); + encode_nops(&hdr); +} +#endif /* CONFIG_NFS_V4_1 */ + +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. 
" + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + +static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, *len); + if (unlikely(!p)) + goto out_overflow; + *string = (char *)p; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, hdr->taglen + 4); + if (unlikely(!p)) + goto out_overflow; + hdr->tag = (char *)p; + p += XDR_QUADLEN(hdr->taglen); + hdr->nops = be32_to_cpup(p); + if (unlikely(hdr->nops < 1)) + return nfs4_stat_to_errno(hdr->status); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + __be32 *p; + uint32_t opnum; + int32_t nfserr; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); + if (opnum != expected) { + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + return -EIO; + } + nfserr = be32_to_cpup(p); + if (nfserr != NFS_OK) + return nfs4_stat_to_errno(nfserr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* Dummy routine */ +static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) +{ + __be32 *p; + unsigned int strlen; + char *str; + + p = xdr_inline_decode(xdr, 12); + if (likely(p)) + return decode_opaque_inline(xdr, &strlen, &str); + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + uint32_t bmlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + + bitmap[0] = bitmap[1] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); + if (bmlen > 1) + bitmap[1] = be32_to_cpup(p); + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *attrlen = be32_to_cpup(p); + *savep = xdr->p; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) +{ + if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) { + int ret; + ret = decode_attr_bitmap(xdr, bitmask); + if (unlikely(ret < 0)) + return ret; + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else + bitmask[0] = bitmask[1] = 0; + dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); + return 0; +} + +static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) +{ + __be32 *p; + int ret = 0; + + *type = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); + if (*type < NF4REG 
|| *type > NF4NAMEDATTR) { + dprintk("%s: bad type %d\n", __func__, *type); + return -EIO; + } + bitmap[0] &= ~FATTR4_WORD0_TYPE; + ret = NFS_ATTR_FATTR_TYPE; + } + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) +{ + __be32 *p; + int ret = 0; + + *change = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, change); + bitmap[0] &= ~FATTR4_WORD0_CHANGE; + ret = NFS_ATTR_FATTR_CHANGE; + } + dprintk("%s: change attribute=%Lu\n", __func__, + (unsigned long long)*change); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) +{ + __be32 *p; + int ret = 0; + + *size = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, size); + bitmap[0] &= ~FATTR4_WORD0_SIZE; + ret = NFS_ATTR_FATTR_SIZE; + } + dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; + } + dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; + } + dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? 
"false" : "true"); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) +{ + __be32 *p; + int ret = 0; + + fsid->major = 0; + fsid->minor = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &fsid->major); + xdr_decode_hyper(p, &fsid->minor); + bitmap[0] &= ~FATTR4_WORD0_FSID; + ret = NFS_ATTR_FATTR_FSID; + } + dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, + (unsigned long long)fsid->major, + (unsigned long long)fsid->minor); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 60; + if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; + } + dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) +{ + __be32 *p; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + *res = -be32_to_cpup(p); + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) +{ + __be32 *p; + int len; + + if (fh != NULL) + memset(fh, 0, sizeof(*fh)); + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE) + return -EIO; + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (fh != NULL) { + memcpy(fh->data, p, len); + fh->size = len; + } + bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; + } + dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) +{ + __be32 *p; + int ret = 0; + + *fileid = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); + bitmap[0] &= ~FATTR4_WORD0_FILEID; + ret = 
NFS_ATTR_FATTR_FILEID; + } + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) +{ + __be32 *p; + int ret = 0; + + *fileid = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); + bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID; + } + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; + } + dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; + } + dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; + } + dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) +{ + u32 n; + __be32 *p; + int status = 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); + if (n == 0) + goto root_path; + dprintk("path "); + path->ncomponents = 0; + while (path->ncomponents < n) { + struct nfs4_string *component = &path->components[path->ncomponents]; + status = decode_opaque_inline(xdr, &component->len, &component->data); + if (unlikely(status != 0)) + goto out_eio; + if (path->ncomponents != n) + dprintk("/"); + dprintk("%s", component->data); + if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) + path->ncomponents++; + else { + dprintk("cannot parse %d components in path\n", n); + goto out_eio; + } + } +out: + dprintk("\n"); + return status; +root_path: +/* a root pathname is sent as a zero component4 */ + path->ncomponents = 1; + path->components[0].len=0; + path->components[0].data=NULL; + dprintk("path /\n"); + goto 
out; +out_eio: + dprintk(" status %d", status); + status = -EIO; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) +{ + int n; + __be32 *p; + int status = -EIO; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U))) + goto out; + status = 0; + if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) + goto out; + dprintk("%s: fsroot ", __func__); + status = decode_pathname(xdr, &res->fs_path); + if (unlikely(status != 0)) + goto out; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); + if (n <= 0) + goto out_eio; + res->nlocations = 0; + while (res->nlocations < n) { + u32 m; + struct nfs4_fs_location *loc = &res->locations[res->nlocations]; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + m = be32_to_cpup(p); + + loc->nservers = 0; + dprintk("%s: servers ", __func__); + while (loc->nservers < m) { + struct nfs4_string *server = &loc->servers[loc->nservers]; + status = decode_opaque_inline(xdr, &server->len, &server->data); + if (unlikely(status != 0)) + goto out_eio; + dprintk("%s ", server->data); + if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) + loc->nservers++; + else { + unsigned int i; + dprintk("%s: using first %u of %u servers " + "returned for location %u\n", + __func__, + NFS4_FS_LOCATION_MAXSERVERS, + m, res->nlocations); + for (i = loc->nservers; i < m; i++) { + unsigned int len; + char *data; + status = decode_opaque_inline(xdr, &len, &data); + if (unlikely(status != 0)) + goto out_eio; + } + } + } + status = decode_pathname(xdr, &loc->rootpath); + if (unlikely(status != 0)) + goto out_eio; + if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) + res->nlocations++; + } + if (res->nlocations != 0) + status = NFS_ATTR_FATTR_V4_REFERRAL; +out: + dprintk("%s: fs_locations done, error = %d\n", __func__, status); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); +out_eio: + status = -EIO; + goto out; +} + +static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; + } + dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) +{ + __be32 *p; + int status = 0; + + *maxlink = 1; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxlink = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_MAXLINK; + } + dprintk("%s: maxlink=%u\n", __func__, *maxlink); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) +{ + __be32 *p; + int status = 0; + + *maxname = 1024; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto 
out_overflow; + *maxname = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_MAXNAME; + } + dprintk("%s: maxname=%u\n", __func__, *maxname); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + int status = 0; + + *res = 1024; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXREAD - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { + uint64_t maxread; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxread); + if (maxread > 0x7FFFFFFF) + maxread = 0x7FFFFFFF; + *res = (uint32_t)maxread; + bitmap[0] &= ~FATTR4_WORD0_MAXREAD; + } + dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + int status = 0; + + *res = 1024; + if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXWRITE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { + uint64_t maxwrite; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxwrite); + if (maxwrite > 0x7FFFFFFF) + maxwrite = 0x7FFFFFFF; + *res = (uint32_t)maxwrite; + bitmap[0] &= ~FATTR4_WORD0_MAXWRITE; + } + dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) +{ + uint32_t tmp; + __be32 *p; + int ret = 0; + + *mode = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + tmp = be32_to_cpup(p); + *mode = tmp & ~S_IFMT; + bitmap[1] &= ~FATTR4_WORD1_MODE; + ret = NFS_ATTR_FATTR_MODE; + } + dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) +{ + __be32 *p; + int ret = 0; + + *nlink = 1; + if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *nlink = be32_to_cpup(p); + bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; + ret = NFS_ATTR_FATTR_NLINK; + } + dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, + const struct nfs_server *server, uint32_t *uid, int may_sleep) +{ + uint32_t len; + __be32 *p; + int ret = 0; + + *uid = -2; + if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { + if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) + ret = NFS_ATTR_FATTR_OWNER; + else + dprintk("%s: nfs_map_name_to_uid failed!\n", + __func__); + } else + dprintk("%s: name too long (%u)!\n", + __func__, len); + bitmap[1] 
&= ~FATTR4_WORD1_OWNER; + } + dprintk("%s: uid=%d\n", __func__, (int)*uid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, + const struct nfs_server *server, uint32_t *gid, int may_sleep) +{ + uint32_t len; + __be32 *p; + int ret = 0; + + *gid = -2; + if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { + if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) + ret = NFS_ATTR_FATTR_GROUP; + else + dprintk("%s: nfs_map_group_to_gid failed!\n", + __func__); + } else + dprintk("%s: name too long (%u)!\n", + __func__, len); + bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; + } + dprintk("%s: gid=%d\n", __func__, (int)*gid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) +{ + uint32_t major = 0, minor = 0; + __be32 *p; + int ret = 0; + + *rdev = MKDEV(0,0); + if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { + dev_t tmp; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + major = be32_to_cpup(p++); + minor = be32_to_cpup(p); + tmp = MKDEV(major, minor); + if (MAJOR(tmp) == major && MINOR(tmp) == minor) + *rdev = tmp; + bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; + ret = NFS_ATTR_FATTR_RDEV; + } + dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; + } + dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; + } + dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) +{ + __be32 *p; + int status = 0; + + *res = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; + } + dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); + return 
status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) +{ + __be32 *p; + int ret = 0; + + *used = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, used); + bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; + ret = NFS_ATTR_FATTR_SPACE_USED; + } + dprintk("%s: space used=%Lu\n", __func__, + (unsigned long long)*used); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) +{ + __be32 *p; + uint64_t sec; + uint32_t nsec; + + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &sec); + nsec = be32_to_cpup(p); + time->tv_sec = (time_t)sec; + time->tv_nsec = (long)nsec; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_ACCESS - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_ATIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; + } + dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); + return status; +} + +static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_METADATA - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_CTIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; + } + dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); + return status; +} + +static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, + struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) { + status = decode_attr_time(xdr, time); + bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA; + } + dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec, + (long)time->tv_nsec); + return status; +} + +static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_MODIFY - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_MTIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; + } + dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); + return status; +} + +static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrlen) +{ + unsigned int attrwords = XDR_QUADLEN(attrlen); + unsigned int nwords = xdr->p - savep; + + if (unlikely(attrwords != nwords)) { + dprintk("%s: server returned incorrect attribute length: " + "%u %c %u\n", + __func__, + attrwords << 2, + (attrwords < nwords) ? 
'<' : '>', + nwords << 2); + return -EIO; + } + return 0; +} + +static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + cinfo->atomic = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &cinfo->before); + xdr_decode_hyper(p, &cinfo->after); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) +{ + __be32 *p; + uint32_t supp, acc; + int status; + + status = decode_op_hdr(xdr, OP_ACCESS); + if (status) + return status; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + supp = be32_to_cpup(p++); + acc = be32_to_cpup(p); + access->supported = supp; + access->access = acc; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, len); + if (likely(p)) { + memcpy(buf, p, len); + return 0; + } + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); +} + +static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_CLOSE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_verifier(struct xdr_stream *xdr, void *verifier) +{ + return decode_opaque_fixed(xdr, verifier, 8); +} + +static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_COMMIT); + if (!status) + status = decode_verifier(xdr, res->verf->verifier); + return status; +} + +static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + __be32 *p; + uint32_t bmlen; + int status; + + status = decode_op_hdr(xdr, OP_CREATE); + if (status) + return status; + if ((status = decode_change_info(xdr, cinfo))) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) +{ + __be32 *savep; + uint32_t attrlen, bitmap[2] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0) + goto xdr_error; + if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0) + goto xdr_error; + if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0) + goto xdr_error; + if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) + goto xdr_error; + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) +{ + __be32 *savep; + uint32_t attrlen, bitmap[2] = {0}; + int status; + + if ((status = 
decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + + if ((status = decode_attr_files_avail(xdr, bitmap, &fsstat->afiles)) != 0) + goto xdr_error; + if ((status = decode_attr_files_free(xdr, bitmap, &fsstat->ffiles)) != 0) + goto xdr_error; + if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0) + goto xdr_error; + if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0) + goto xdr_error; + if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0) + goto xdr_error; + if ((status = decode_attr_space_total(xdr, bitmap, &fsstat->tbytes)) != 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) +{ + __be32 *savep; + uint32_t attrlen, bitmap[2] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + + if ((status = decode_attr_maxlink(xdr, bitmap, &pathconf->max_link)) != 0) + goto xdr_error; + if ((status = decode_attr_maxname(xdr, bitmap, &pathconf->max_namelen)) != 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_fattr *fattr, struct nfs_fh *fh, + const struct nfs_server *server, int may_sleep) +{ + int status; + umode_t fmode = 0; + uint32_t type; + int32_t err; + + status = decode_attr_type(xdr, bitmap, &type); + if (status < 0) + goto xdr_error; + fattr->mode = 0; + if (status != 0) { + fattr->mode |= nfs_type2fmt[type]; + fattr->valid |= status; + } + + status = decode_attr_change(xdr, bitmap, &fattr->change_attr); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_size(xdr, bitmap, &fattr->size); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_fsid(xdr, bitmap, &fattr->fsid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + err = 0; + status = decode_attr_error(xdr, bitmap, &err); + if (status < 0) + goto xdr_error; + if (err == -NFS4ERR_WRONGSEC) + nfs_fixup_secinfo_attributes(fattr, fh); + + status = decode_attr_filehandle(xdr, bitmap, fh); + if (status < 0) + goto xdr_error; + + status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, + struct nfs4_fs_locations, + fattr)); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_mode(xdr, bitmap, &fmode); + if (status < 0) + goto xdr_error; + if (status != 0) { + fattr->mode |= fmode; + fattr->valid |= status; + } + + status = decode_attr_nlink(xdr, bitmap, &fattr->nlink); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); + if (status < 0) + goto xdr_error; + 
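/* Each decode_attr_* helper above and below returns an NFS_ATTR_FATTR_* flag (or 0) on success, which is OR'ed into fattr->valid. */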
fattr->valid |= status; + + status = decode_attr_rdev(xdr, bitmap, &fattr->rdev); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_time_access(xdr, bitmap, &fattr->atime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_mounted_on_fileid(xdr, bitmap, &fattr->mounted_on_fileid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + +xdr_error: + dprintk("%s: xdr returned %d\n", __func__, -status); + return status; +} + +static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) +{ + __be32 *savep; + uint32_t attrlen, + bitmap[2] = {0}; + int status; + + status = decode_op_hdr(xdr, OP_GETATTR); + if (status < 0) + goto xdr_error; + + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) + goto xdr_error; + + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) + goto xdr_error; + + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); + if (status < 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d\n", __func__, -status); + return status; +} + +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server, int may_sleep) +{ + return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); +} + +/* + * Decode potentially multiple layout types. Currently we only support + * one layout driver per file system. + */ +static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, + uint32_t *layouttype) +{ + uint32_t *p; + int num; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + + /* pNFS is not supported by the underlying file system */ + if (num == 0) { + *layouttype = 0; + return 0; + } + if (num > 1) + printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " + "per filesystem not supported\n", __func__); + + /* Decode and set first layout type, move xdr->p past unused types */ + p = xdr_inline_decode(xdr, num * 4); + if (unlikely(!p)) + goto out_overflow; + *layouttype = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * The type of file system exported. + * Note we must ensure that layouttype is set in any non-error case. 
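+ * A layouttype of zero means no pNFS layout driver is available for this export.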
+ */ +static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *layouttype) +{ + int status = 0; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); + if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) + return -EIO; + if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = decode_first_pnfs_layout_type(xdr, layouttype); + bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; + } else + *layouttype = 0; + return status; +} + +static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) +{ + __be32 *savep; + uint32_t attrlen, bitmap[2]; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto xdr_error; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto xdr_error; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto xdr_error; + + fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ + + if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) + goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) + goto xdr_error; + fsinfo->rtpref = fsinfo->dtpref = fsinfo->rtmax; + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; + status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); + if (status != 0) + goto xdr_error; + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + if (status != 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); +xdr_error: + dprintk("%s: xdr returned %d!\n", __func__, -status); + return status; +} + +static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + uint32_t len; + int status; + + /* Zero handle first to allow comparisons */ + memset(fh, 0, sizeof(*fh)); + + status = decode_op_hdr(xdr, OP_GETFH); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE) + return -EIO; + fh->size = len; + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, len); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_LINK); + if (status) + return status; + return decode_change_info(xdr, cinfo); +} + +/* + * We create the owner, so we know a proper owner.id length is 4. 
+ */ +static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) +{ + uint64_t offset, length, clientid; + __be32 *p; + uint32_t namelen, type; + + p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ + p = xdr_decode_hyper(p, &length); + type = be32_to_cpup(p++); /* 4 byte read */ + if (fl != NULL) { /* manipulate file lock */ + fl->fl_start = (loff_t)offset; + fl->fl_end = fl->fl_start + (loff_t)length - 1; + if (length == ~(uint64_t)0) + fl->fl_end = OFFSET_MAX; + fl->fl_type = F_WRLCK; + if (type & 1) + fl->fl_type = F_RDLCK; + fl->fl_pid = 0; + } + p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ + namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ + p = xdr_inline_decode(xdr, namelen); /* variable size field */ + if (likely(p)) + return -NFS4ERR_DENIED; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_LOCK); + if (status == -EIO) + goto out; + if (status == 0) { + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + goto out; + } else if (status == -NFS4ERR_DENIED) + status = decode_lock_denied(xdr, NULL); + if (res->open_seqid != NULL) + nfs_increment_open_seqid(status, res->open_seqid); + nfs_increment_lock_seqid(status, res->lock_seqid); +out: + return status; +} + +static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) +{ + int status; + status = decode_op_hdr(xdr, OP_LOCKT); + if (status == -NFS4ERR_DENIED) + return decode_lock_denied(xdr, res->denied); + return status; +} + +static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_LOCKU); + if (status != -EIO) + nfs_increment_lock_seqid(status, res->seqid); + if (status == 0) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_release_lockowner(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); +} + +static int decode_lookup(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_LOOKUP); +} + +/* This is too sick! 
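The space-limit union either carries the file size directly (limit type 1) or a block count and block size (type 2) that are multiplied together below.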
*/ +static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) +{ + __be32 *p; + uint32_t limit_type, nblocks, blocksize; + + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + limit_type = be32_to_cpup(p++); + switch (limit_type) { + case 1: + xdr_decode_hyper(p, maxsize); + break; + case 2: + nblocks = be32_to_cpup(p++); + blocksize = be32_to_cpup(p); + *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t delegation_type; + int status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); + if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { + res->delegation_type = 0; + return 0; + } + status = decode_stateid(xdr, &res->delegation); + if (unlikely(status)) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->do_recall = be32_to_cpup(p); + + switch (delegation_type) { + case NFS4_OPEN_DELEGATE_READ: + res->delegation_type = FMODE_READ; + break; + case NFS4_OPEN_DELEGATE_WRITE: + res->delegation_type = FMODE_WRITE|FMODE_READ; + if (decode_space_limit(xdr, &res->maxsize) < 0) + return -EIO; + } + return decode_ace(xdr, NULL, res->server->nfs_client); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t savewords, bmlen, i; + int status; + + status = decode_op_hdr(xdr, OP_OPEN); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + return status; + + decode_change_info(xdr, &res->cinfo); + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->rflags = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); + if (bmlen > 10) + goto xdr_error; + + p = xdr_inline_decode(xdr, bmlen << 2); + if (unlikely(!p)) + goto out_overflow; + savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); + for (i = 0; i < savewords; ++i) + res->attrset[i] = be32_to_cpup(p++); + for (; i < NFS4_BITMAP_SIZE; i++) + res->attrset[i] = 0; + + return decode_delegation(xdr, res); +xdr_error: + dprintk("%s: Bitmap too large! 
Length = %u\n", __func__, bmlen); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_putfh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_PUTFH); +} + +static int decode_putrootfh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_PUTROOTFH); +} + +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res) +{ + struct kvec *iov = req->rq_rcv_buf.head; + __be32 *p; + uint32_t count, eof, recvd, hdrlen; + int status; + + status = decode_op_hdr(xdr, OP_READ); + if (status) + return status; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + eof = be32_to_cpup(p++); + count = be32_to_cpup(p); + hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (count > recvd) { + dprintk("NFS: server cheating in read reply: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + } + xdr_read_pages(xdr, count); + res->eof = eof; + res->count = count; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) +{ + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct kvec *iov = rcvbuf->head; + size_t hdrlen; + u32 recvd, pglen = rcvbuf->page_len; + int status; + + status = decode_op_hdr(xdr, OP_READDIR); + if (!status) + status = decode_verifier(xdr, readdir->verifier.data); + if (unlikely(status)) + return status; + dprintk("%s: verifier = %08x:%08x\n", + __func__, + ((u32 *)readdir->verifier.data)[0], + ((u32 *)readdir->verifier.data)[1]); + + + hdrlen = (char *) xdr->p - (char *) iov->iov_base; + recvd = rcvbuf->len - hdrlen; + if (pglen > recvd) + pglen = recvd; + xdr_read_pages(xdr, pglen); + + + return pglen; +} + +static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) +{ + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct kvec *iov = rcvbuf->head; + size_t hdrlen; + u32 len, recvd; + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_READLINK); + if (status) + return status; + + /* Convert length of symlink */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len >= rcvbuf->page_len || len <= 0) { + dprintk("nfs: server returned giant symlink!\n"); + return -ENAMETOOLONG; + } + hdrlen = (char *) xdr->p - (char *) iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (recvd < len) { + dprintk("NFS: server cheating in readlink reply: " + "count %u > recvd %u\n", len, recvd); + return -EIO; + } + xdr_read_pages(xdr, len); + /* + * The XDR encode routine has set things up so that + * the link text will be copied directly into the + * buffer. We just have to do overflow-checking, + * and and null-terminate the text (the VFS expects + * null-termination). 
+ */ + xdr_terminate_string(rcvbuf, len); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_REMOVE); + if (status) + goto out; + status = decode_change_info(xdr, cinfo); +out: + return status; +} + +static int decode_rename(struct xdr_stream *xdr, struct nfs4_change_info *old_cinfo, + struct nfs4_change_info *new_cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_RENAME); + if (status) + goto out; + if ((status = decode_change_info(xdr, old_cinfo))) + goto out; + status = decode_change_info(xdr, new_cinfo); +out: + return status; +} + +static int decode_renew(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RENEW); +} + +static int +decode_restorefh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RESTOREFH); +} + +static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, + size_t *acl_len) +{ + __be32 *savep; + uint32_t attrlen, + bitmap[2] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + + *acl_len = 0; + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto out; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto out; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto out; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { + size_t hdrlen; + u32 recvd; + + /* We ignore &savep and don't do consistency checks on + * the attr length. Let userspace figure it out.... */ + hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (attrlen > recvd) { + dprintk("NFS: server cheating in getattr" + " acl reply: attrlen %u > recvd %u\n", + attrlen, recvd); + return -EINVAL; + } + xdr_read_pages(xdr, attrlen); + *acl_len = attrlen; + } else + status = -EOPNOTSUPP; + +out: + return status; +} + +static int +decode_savefh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_SAVEFH); +} + +static int decode_setattr(struct xdr_stream *xdr) +{ + __be32 *p; + uint32_t bmlen; + int status; + + status = decode_op_hdr(xdr, OP_SETATTR); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) +{ + __be32 *p; + uint32_t opnum; + int32_t nfserr; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); + if (opnum != OP_SETCLIENTID) { + dprintk("nfs: decode_setclientid: Server returned operation" + " %d\n", opnum); + return -EIO; + } + nfserr = be32_to_cpup(p); + if (nfserr == NFS_OK) { + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->clientid); + memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); + } else if (nfserr == NFSERR_CLID_INUSE) { + uint32_t len; + + /* skip netid string */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + + /* skip uaddr string */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if 
(unlikely(!p)) + goto out_overflow; + return -NFSERR_CLID_INUSE; + } else + return nfs4_stat_to_errno(nfserr); + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_setclientid_confirm(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM); +} + +static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_WRITE); + if (status) + return status; + + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + res->count = be32_to_cpup(p++); + res->verf->committed = be32_to_cpup(p++); + memcpy(res->verf->verifier, p, 8); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_delegreturn(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_DELEGRETURN); +} + +static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.sec_oid4.len = be32_to_cpup(p); + if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN) + goto out_err; + + p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len); + if (unlikely(!p)) + goto out_overflow; + memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len); + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.qop4 = be32_to_cpup(p++); + flavor->gss.service = be32_to_cpup(p); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_err: + return -EINVAL; +} + +static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + struct nfs4_secinfo_flavor *sec_flavor; + int status; + __be32 *p; + int i, num_flavors; + + status = decode_op_hdr(xdr, OP_SECINFO); + if (status) + goto out; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + + res->flavors->num_flavors = 0; + num_flavors = be32_to_cpup(p); + + for (i = 0; i < num_flavors; i++) { + sec_flavor = &res->flavors->flavors[i]; + if ((char *)&sec_flavor[1] - (char *)res->flavors > PAGE_SIZE) + break; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sec_flavor->flavor = be32_to_cpup(p); + + if (sec_flavor->flavor == RPC_AUTH_GSS) { + status = decode_secinfo_gss(xdr, sec_flavor); + if (status) + goto out; + } + res->flavors->num_flavors++; + } + +out: + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +#if defined(CONFIG_NFS_V4_1) +static int decode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_res *res) +{ + __be32 *p; + uint32_t dummy; + char *dummy_str; + int status; + struct nfs_client *clp = res->client; + + status = decode_op_hdr(xdr, OP_EXCHANGE_ID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &clp->cl_clientid); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + clp->cl_exchange_flags = be32_to_cpup(p++); + + /* We ask for SP4_NONE */ + dummy = be32_to_cpup(p); + if (dummy != SP4_NONE) + return -EIO; + + /* Throw away minor_id */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + + /* Throw away Major id */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + + /* Throw away server_scope */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if 
(unlikely(status)) + return status; + + /* Throw away Implementation id array */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_chan_attrs(struct xdr_stream *xdr, + struct nfs4_channel_attrs *attrs) +{ + __be32 *p; + u32 nr_attrs, val; + + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + val = be32_to_cpup(p++); /* headerpadsz */ + if (val) + return -EINVAL; /* no support for header padding yet */ + attrs->max_rqst_sz = be32_to_cpup(p++); + attrs->max_resp_sz = be32_to_cpup(p++); + attrs->max_resp_sz_cached = be32_to_cpup(p++); + attrs->max_ops = be32_to_cpup(p++); + attrs->max_reqs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p); + if (unlikely(nr_attrs > 1)) { + printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", + __func__, nr_attrs); + return -EINVAL; + } + if (nr_attrs == 1) { + p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) +{ + return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); +} + +static int decode_create_session(struct xdr_stream *xdr, + struct nfs41_create_session_res *res) +{ + __be32 *p; + int status; + struct nfs_client *clp = res->client; + struct nfs4_session *session = clp->cl_session; + + status = decode_op_hdr(xdr, OP_CREATE_SESSION); + if (!status) + status = decode_sessionid(xdr, &session->sess_id); + if (unlikely(status)) + return status; + + /* seqid, flags */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + session->flags = be32_to_cpup(p); + + /* Channel attributes */ + status = decode_chan_attrs(xdr, &session->fc_attrs); + if (!status) + status = decode_chan_attrs(xdr, &session->bc_attrs); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_DESTROY_SESSION); +} + +static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE); +} +#endif /* CONFIG_NFS_V4_1 */ + +static int decode_sequence(struct xdr_stream *xdr, + struct nfs4_sequence_res *res, + struct rpc_rqst *rqstp) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_sessionid id; + u32 dummy; + int status; + __be32 *p; + + if (!res->sr_session) + return 0; + + status = decode_op_hdr(xdr, OP_SEQUENCE); + if (!status) + status = decode_sessionid(xdr, &id); + if (unlikely(status)) + goto out_err; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. 
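+ * Each mismatch below is therefore treated as a fatal -EREMOTEIO for
+ * this reply rather than an excuse to use a slot or sequence number
+ * the client never issued.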
+ */ + status = -EREMOTEIO; + + if (memcmp(id.data, res->sr_session->sess_id.data, + NFS4_MAX_SESSIONID_LEN)) { + dprintk("%s Invalid session id\n", __func__); + goto out_err; + } + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + + /* seqid */ + dummy = be32_to_cpup(p++); + if (dummy != res->sr_slot->seq_nr) { + dprintk("%s Invalid sequence number\n", __func__); + goto out_err; + } + /* slot id */ + dummy = be32_to_cpup(p++); + if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { + dprintk("%s Invalid slot id\n", __func__); + goto out_err; + } + /* highest slot id - currently not processed */ + dummy = be32_to_cpup(p++); + /* target highest slot id - currently not processed */ + dummy = be32_to_cpup(p++); + /* result flags */ + res->sr_status_flags = be32_to_cpup(p); + status = 0; +out_err: + res->sr_status = status; + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + status = -EIO; + goto out_err; +#else /* CONFIG_NFS_V4_1 */ + return 0; +#endif /* CONFIG_NFS_V4_1 */ +} + +#if defined(CONFIG_NFS_V4_1) + +static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct pnfs_device *pdev) +{ + __be32 *p; + uint32_t len, type; + int status; + + status = decode_op_hdr(xdr, OP_GETDEVICEINFO); + if (status) { + if (status == -ETOOSMALL) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pdev->mincount = be32_to_cpup(p); + dprintk("%s: Min count too small. mincnt = %u\n", + __func__, pdev->mincount); + } + return status; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + type = be32_to_cpup(p++); + if (type != pdev->layout_type) { + dprintk("%s: layout mismatch req: %u pdev: %u\n", + __func__, pdev->layout_type, type); + return -EINVAL; + } + /* + * Get the length of the opaque device_addr4. xdr_read_pages places + * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) + * and places the remaining xdr data in xdr_buf->tail + */ + pdev->mincount = be32_to_cpup(p); + xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ + + /* Parse notification bitmap, verifying that it is zero. 
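+ * The client does not implement layout notifications, so a reply in
+ * which the server sets any notification bit is rejected with -EIO.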
*/ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len) { + uint32_t i; + + p = xdr_inline_decode(xdr, 4 * len); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < len; i++, p++) { + if (be32_to_cpup(p)) { + dprintk("%s: notifications not supported\n", + __func__); + return -EIO; + } + } + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res) +{ + __be32 *p; + int status; + u32 layout_count; + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct kvec *iov = rcvbuf->head; + u32 hdrlen, recvd; + + status = decode_op_hdr(xdr, OP_LAYOUTGET); + if (status) + return status; + p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); + if (unlikely(!p)) + goto out_overflow; + res->return_on_close = be32_to_cpup(p++); + p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE); + layout_count = be32_to_cpup(p); + if (!layout_count) { + dprintk("%s: server responded with empty layout array\n", + __func__); + return -EINVAL; + } + + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->range.offset); + p = xdr_decode_hyper(p, &res->range.length); + res->range.iomode = be32_to_cpup(p++); + res->type = be32_to_cpup(p++); + res->layoutp->len = be32_to_cpup(p); + + dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", + __func__, + (unsigned long)res->range.offset, + (unsigned long)res->range.length, + res->range.iomode, + res->type, + res->layoutp->len); + + hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (res->layoutp->len > recvd) { + dprintk("NFS: server cheating in layoutget reply: " + "layout len %u > recvd %u\n", + res->layoutp->len, recvd); + return -EINVAL; + } + + xdr_read_pages(xdr, res->layoutp->len); + + if (layout_count > 1) { + /* We only handle a length one array at the moment. Any + * further entries are just ignored. Note that this means + * the client may see a response that is less than the + * minimum it requested. + */ + dprintk("%s: server responded with %d layouts, dropping tail\n", + __func__, layout_count); + } + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutreturn(struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTRETURN); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->lrs_present = be32_to_cpup(p); + if (res->lrs_present) + status = decode_stateid(xdr, &res->stateid); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutcommit(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct nfs4_layoutcommit_res *res) +{ + __be32 *p; + __u32 sizechanged; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sizechanged = be32_to_cpup(p); + + if (sizechanged) { + /* throw away new size */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * END OF "GENERIC" DECODE ROUTINES. 
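+ *
+ * Each nfs4_xdr_dec_* routine below decodes one COMPOUND reply in the
+ * same shape: the compound header, then SEQUENCE (a no-op when there
+ * is no v4.1 session), then typically PUTFH or PUTROOTFH, then the
+ * operations of interest, often ending with a trailing GETATTR whose
+ * failure is deliberately ignored.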
+ */ + +/* + * Decode OPEN_DOWNGRADE response + */ +static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_closeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open_downgrade(xdr, res); + if (status != 0) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode ACCESS response + */ +static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_accessres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_access(xdr, res); + if (status != 0) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode LOOKUP response + */ +static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_lookup_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lookup(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + status = decode_getfattr(xdr, res->fattr, res->server + ,!RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode LOOKUP_ROOT response + */ +static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_lookup_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putrootfh(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status == 0) + status = decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode REMOVE response + */ +static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_removeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_remove(xdr, &res->cinfo); + if (status) + goto out; + decode_getfattr(xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode RENAME response + */ +static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_renameres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_rename(xdr, &res->old_cinfo, 
&res->new_cinfo); + if (status) + goto out; + /* Current FH is target directory */ + if (decode_getfattr(xdr, res->new_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + goto out; + status = decode_restorefh(xdr); + if (status) + goto out; + decode_getfattr(xdr, res->old_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode LINK response + */ +static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_link_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_link(xdr, &res->cinfo); + if (status) + goto out; + /* + * Note order: OP_LINK leaves the directory as the current + * filehandle. + */ + if (decode_getfattr(xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + goto out; + status = decode_restorefh(xdr); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode CREATE response + */ +static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_create(xdr, &res->dir_cinfo); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + if (decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + goto out; + status = decode_restorefh(xdr); + if (status) + goto out; + decode_getfattr(xdr, res->dir_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode SYMLINK response + */ +static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) +{ + return nfs4_xdr_dec_create(rqstp, xdr, res); +} + +/* + * Decode GETATTR response + */ +static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_getattr_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Encode an SETACL request + */ +static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setaclargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setacl(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Decode SETACL response + */ +static int +nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_setaclres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, 
&hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_setattr(xdr); +out: + return status; +} + +/* + * Decode GETACL response + */ +static int +nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_getaclres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getacl(xdr, rqstp, &res->acl_len); + +out: + return status; +} + +/* + * Decode CLOSE response + */ +static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_closeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_close(xdr, res); + if (status != 0) + goto out; + /* + * Note: Server may do delete on close for this file + * in which case the getattr call will fail with + * an ESTALE error. Shouldn't be a problem, + * though, since fattr->valid will remain unset. + */ + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode OPEN response + */ +static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_openres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_open(xdr, res); + if (status) + goto out; + if (decode_getfh(xdr, &res->fh) != 0) + goto out; + if (decode_getfattr(xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + goto out; + if (decode_restorefh(xdr) != 0) + goto out; + decode_getfattr(xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode OPEN_CONFIRM response + */ +static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_open_confirmres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open_confirm(xdr, res); +out: + return status; +} + +/* + * Decode OPEN response + */ +static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_openres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode SETATTR response + */ +static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_setattrres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; 
+ status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_setattr(xdr); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode LOCK response + */ +static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lock_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lock(xdr, res); +out: + return status; +} + +/* + * Decode LOCKT response + */ +static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lockt_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lockt(xdr, res); +out: + return status; +} + +/* + * Decode LOCKU response + */ +static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_locku_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_locku(xdr, res); +out: + return status; +} + +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *dummy) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_release_lockowner(xdr); + return status; +} + +/* + * Decode READLINK response + */ +static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_readlink_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_readlink(xdr, rqstp); +out: + return status; +} + +/* + * Decode READDIR response + */ +static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_readdir_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_readdir(xdr, rqstp, res); +out: + return status; +} + +/* + * Decode Read response + */ +static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_readres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_read(xdr, rqstp, res); + if (!status) + status = res->count; +out: + return status; +} + +/* + * Decode WRITE response + */ +static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_writeres *res) +{ + struct 
compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_write(xdr, res); + if (status) + goto out; + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); + if (!status) + status = res->count; +out: + return status; +} + +/* + * Decode COMMIT response + */ +static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_writeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_commit(xdr, res); + if (status) + goto out; + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode FSINFO response + */ +static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); + if (!status) + status = decode_putfh(xdr); + if (!status) + status = decode_fsinfo(xdr, res->fsinfo); + return status; +} + +/* + * Decode PATHCONF response + */ +static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_pathconf_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); + if (!status) + status = decode_putfh(xdr); + if (!status) + status = decode_pathconf(xdr, res->pathconf); + return status; +} + +/* + * Decode STATFS response + */ +static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_statfs_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); + if (!status) + status = decode_putfh(xdr); + if (!status) + status = decode_statfs(xdr, res->fsstat); + return status; +} + +/* + * Decode GETATTR_BITMAP response + */ +static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_server_caps(xdr, res); +out: + return status; +} + +/* + * Decode RENEW response + */ +static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + void *__unused) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_renew(xdr); + return status; +} + +/* + * Decode SETCLIENTID response + */ +static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_setclientid(xdr, res); + return status; +} + +/* + * Decode SETCLIENTID_CONFIRM response + */ +static int nfs4_xdr_dec_setclientid_confirm(struct 
rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *fsinfo) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_setclientid_confirm(xdr); + if (!status) + status = decode_putrootfh(xdr); + if (!status) + status = decode_fsinfo(xdr, fsinfo); + return status; +} + +/* + * Decode DELEGRETURN response + */ +static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_delegreturnres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_delegreturn(xdr); + if (status != 0) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode FS_LOCATIONS response + */ +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_lookup(xdr); + if (status) + goto out; + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr(xdr, &res->fs_locations->fattr, + res->fs_locations->server, + !RPC_IS_ASYNC(req->rq_task)); +out: + return status; +} + +/* + * Decode SECINFO response + */ +static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); + if (status) + goto out; +out: + return status; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * Decode EXCHANGE_ID response + */ +static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_exchange_id(xdr, res); + return status; +} + +/* + * Decode CREATE_SESSION response + */ +static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_create_session_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_create_session(xdr, res); + return status; +} + +/* + * Decode DESTROY_SESSION response + */ +static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_destroy_session(xdr, res); + return status; +} + +/* + * Decode SEQUENCE response + */ +static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_sequence_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, res, rqstp); + return status; +} + +/* + * Decode GET_LEASE_TIME response + */ +static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + 
struct nfs4_get_lease_time_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->lr_seq_res, rqstp); + if (!status) + status = decode_putrootfh(xdr); + if (!status) + status = decode_fsinfo(xdr, res->lr_fsinfo); + return status; +} + +/* + * Decode RECLAIM_COMPLETE response + */ +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (!status) + status = decode_reclaim_complete(xdr, (void *)NULL); + return status; +} + +/* + * Decode GETDEVINFO response + */ +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_getdeviceinfo(xdr, res->pdev); +out: + return status; +} + +/* + * Decode LAYOUTGET response + */ +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutget_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutget(xdr, rqstp, res); +out: + return status; +} + +/* + * Decode LAYOUTRETURN response + */ +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutreturn(xdr, res); +out: + return status; +} + +/* + * Decode LAYOUTCOMMIT response + */ +static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutcommit(xdr, rqstp, res); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. 
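+ *
+ * Each cached entry begins with a "value follows" word; when that word
+ * is zero the following word is the EOF flag. A present entry then
+ * carries the cookie (8 bytes), the name length and name, and the
+ * attribute bitmap and attributes, which is the order decoded below.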
+ */ +int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + uint32_t bitmap[2] = {0}; + uint32_t len; + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + entry->prev_cookie = entry->cookie; + p = xdr_decode_hyper(p, &entry->cookie); + entry->len = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, entry->len); + if (unlikely(!p)) + goto out_overflow; + entry->name = (const char *) p; + + /* + * In case the server doesn't return an inode number, + * we fake one here. (We don't use inode number 0, + * since glibc seems to choke on it...) + */ + entry->ino = 1; + entry->fattr->valid = 0; + + if (decode_attr_bitmap(xdr, bitmap) < 0) + goto out_overflow; + + if (decode_attr_length(xdr, &len, &p) < 0) + goto out_overflow; + + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, + entry->server, 1) < 0) + goto out_overflow; + if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + entry->ino = entry->fattr->mounted_on_fileid; + else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) + entry->ino = entry->fattr->fileid; + + entry->d_type = DT_UNKNOWN; + if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +} + +/* + * We need to translate between nfs status return values and + * the local errno values which may not be the same. + */ +static struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS4_OK, 0 }, + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -errno_NFSERR_IO}, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -EREMOTEIO }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } +}; + +/* + * Convert an NFS error code to a local one. + * This one is used by NFSv4. + */ +static int +nfs4_stat_to_errno(int stat) +{ + int i; + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == stat) + return nfs_errtbl[i].errno; + } + if (stat <= 10000 || stat > 10100) { + /* The server is looney tunes. */ + return -EREMOTEIO; + } + /* If we cannot translate the error, the recovery routines should + * handle it. + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes.
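+ * For example, NFS4ERR_DELAY (10008) has no entry in nfs_errtbl above,
+ * so it is passed through as -10008 for the caller's recovery logic
+ * to act on.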
+ */ + return -stat; +} + +#define PROC(proc, argtype, restype) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_COMPOUND, \ + .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \ + .p_arglen = NFS4_##argtype##_sz, \ + .p_replen = NFS4_##restype##_sz, \ + .p_statidx = NFSPROC4_CLNT_##proc, \ + .p_name = #proc, \ +} + +struct rpc_procinfo nfs4_procedures[] = { + PROC(READ, enc_read, dec_read), + PROC(WRITE, enc_write, dec_write), + PROC(COMMIT, enc_commit, dec_commit), + PROC(OPEN, enc_open, dec_open), + PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), + PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), + PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), + PROC(CLOSE, enc_close, dec_close), + PROC(SETATTR, enc_setattr, dec_setattr), + PROC(FSINFO, enc_fsinfo, dec_fsinfo), + PROC(RENEW, enc_renew, dec_renew), + PROC(SETCLIENTID, enc_setclientid, dec_setclientid), + PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), + PROC(LOCK, enc_lock, dec_lock), + PROC(LOCKT, enc_lockt, dec_lockt), + PROC(LOCKU, enc_locku, dec_locku), + PROC(ACCESS, enc_access, dec_access), + PROC(GETATTR, enc_getattr, dec_getattr), + PROC(LOOKUP, enc_lookup, dec_lookup), + PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), + PROC(REMOVE, enc_remove, dec_remove), + PROC(RENAME, enc_rename, dec_rename), + PROC(LINK, enc_link, dec_link), + PROC(SYMLINK, enc_symlink, dec_symlink), + PROC(CREATE, enc_create, dec_create), + PROC(PATHCONF, enc_pathconf, dec_pathconf), + PROC(STATFS, enc_statfs, dec_statfs), + PROC(READLINK, enc_readlink, dec_readlink), + PROC(READDIR, enc_readdir, dec_readdir), + PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), + PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + PROC(SECINFO, enc_secinfo, dec_secinfo), +#if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), + PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), +#endif /* CONFIG_NFS_V4_1 */ +}; + +struct rpc_version nfs_version4 = { + .number = 4, + .nrprocs = ARRAY_SIZE(nfs4_procedures), + .procs = nfs4_procedures +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c new file mode 100644 index 00000000..c4744e1d --- /dev/null +++ b/fs/nfs/nfsroot.c @@ -0,0 +1,309 @@ +/* + * Copyright (C) 1995, 1996 Gero Kuhlmann + * + * Allow an NFS filesystem to be mounted as root. The way this works is: + * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. + * (2) Construct the device string and the options string using DHCP + * option 17 and/or kernel command line options. + * (3) When mount_root() sets up the root file system, pass these strings + * to the NFS client's regular mount interface via sys_mount(). 
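+ *
+ * For example (addresses purely illustrative), booting with
+ *     nfsroot=10.0.0.1:/export/client,tcp,vers=3
+ * makes step (2) build the device string "10.0.0.1:/export/client" and
+ * append "tcp,vers=3" to the default mount options, which step (3)
+ * then hands to sys_mount().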
+ * + * + * Changes: + * + * Alan Cox : Removed get_address name clash with FPU. + * Alan Cox : Reformatted a bit. + * Gero Kuhlmann : Code cleanup + * Michael Rausch : Fixed recognition of an incoming RARP answer. + * Martin Mares : (2.0) Auto-configuration via BOOTP supported. + * Martin Mares : Manual selection of interface & BOOTP/RARP. + * Martin Mares : Using network routes instead of host routes, + * allowing the default configuration to be used + * for normal operation of the host. + * Martin Mares : Randomized timer with exponential backoff + * installed to minimize network congestion. + * Martin Mares : Code cleanup. + * Martin Mares : (2.1) BOOTP and RARP made configuration options. + * Martin Mares : Server hostname generation fixed. + * Gerd Knorr : Fixed wired inode handling + * Martin Mares : (2.2) "0.0.0.0" addresses from command line ignored. + * Martin Mares : RARP replies not tested for server address. + * Gero Kuhlmann : (2.3) Some bug fixes and code cleanup again (please + * send me your new patches _before_ bothering + * Linus so that I don' always have to cleanup + * _afterwards_ - thanks) + * Gero Kuhlmann : Last changes of Martin Mares undone. + * Gero Kuhlmann : RARP replies are tested for specified server + * again. However, it's now possible to have + * different RARP and NFS servers. + * Gero Kuhlmann : "0.0.0.0" addresses from command line are + * now mapped to INADDR_NONE. + * Gero Kuhlmann : Fixed a bug which prevented BOOTP path name + * from being used (thanks to Leo Spiekman) + * Andy Walker : Allow to specify the NFS server in nfs_root + * without giving a path name + * Swen Thümmler : Allow to specify the NFS options in nfs_root + * without giving a path name. Fix BOOTP request + * for domainname (domainname is NIS domain, not + * DNS domain!). Skip dummy devices for BOOTP. + * Jacek Zapala : Fixed a bug which prevented server-ip address + * from nfsroot parameter from being used. + * Olaf Kirch : Adapted to new NFS code. + * Jakub Jelinek : Free used code segment. + * Marko Kohtala : Fixed some bugs. + * Martin Mares : Debug message cleanup + * Martin Mares : Changed to use the new generic IP layer autoconfig + * code. BOOTP and RARP moved there. + * Martin Mares : Default path now contains host name instead of + * host IP address (but host name defaults to IP + * address anyway). + * Martin Mares : Use root_server_addr appropriately during setup. + * Martin Mares : Rewrote parameter parsing, now hopefully giving + * correct overriding. + * Trond Myklebust : Add in preliminary support for NFSv3 and TCP. + * Fix bug in root_nfs_addr(). nfs_data.namlen + * is NOT for the length of the hostname. + * Hua Qin : Support for mounting root file system via + * NFS over TCP. + * Fabian Frederick: Option parser rebuilt (using parser lib) + * Chuck Lever : Use super.c's text-based mount option parsing + * Chuck Lever : Add "nfsrootdebug". + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_ROOT + +/* Default path we try to mount. "%s" gets replaced by our IP address */ +#define NFS_ROOT "/tftpboot/%s" + +/* Default NFSROOT mount options. 
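+ * These are extended by whatever arrives via DHCP option 17 or the
+ * nfsroot= command line, and the mandatory "nolock,addr=" options are
+ * appended last so that they take precedence.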
*/ +#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096" + +/* Parameters passed from the kernel command line */ +static char nfs_root_parms[256] __initdata = ""; + +/* Text-based mount options passed to super.c */ +static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS; + +/* Address of NFS server */ +static __be32 servaddr __initdata = htonl(INADDR_NONE); + +/* Name of directory to mount */ +static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; + +/* server:export path string passed to super.c */ +static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; + +#ifdef RPC_DEBUG +/* + * When the "nfsrootdebug" kernel command line option is specified, + * enable debugging messages for NFSROOT. + */ +static int __init nfs_root_debug(char *__unused) +{ + nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; + return 1; +} + +__setup("nfsrootdebug", nfs_root_debug); +#endif + +/* + * Parse NFS server and directory information passed on the kernel + * command line. + * + * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>] + * + * If there is a "%s" token in the <root-dir> string, it is replaced + * by the ASCII-representation of the client's IP address. + */ +static int __init nfs_root_setup(char *line) +{ + ROOT_DEV = Root_NFS; + + if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { + strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); + } else { + size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; + if (n >= sizeof(nfs_root_parms)) + line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0'; + sprintf(nfs_root_parms, NFS_ROOT, line); + } + + /* + * Extract the IP address of the NFS server containing our + * root file system, if one was specified. + * + * Note: root_nfs_parse_addr() removes the server-ip from + * nfs_root_parms, if it exists. + */ + root_server_addr = root_nfs_parse_addr(nfs_root_parms); + + return 1; +} + +__setup("nfsroot=", nfs_root_setup); + +static int __init root_nfs_copy(char *dest, const char *src, + const size_t destlen) +{ + if (strlcpy(dest, src, destlen) > destlen) + return -1; + return 0; +} + +static int __init root_nfs_cat(char *dest, const char *src, + const size_t destlen) +{ + size_t len = strlen(dest); + + if (len && dest[len - 1] != ',') + if (strlcat(dest, ",", destlen) > destlen) + return -1; + + if (strlcat(dest, src, destlen) > destlen) + return -1; + return 0; +} + +/* + * Parse out root export path and mount options from + * passed-in string @incoming. + * + * Copy the export path into @exppath. + */ +static int __init root_nfs_parse_options(char *incoming, char *exppath, + const size_t exppathlen) +{ + char *p; + + /* + * Set the NFS remote path + */ + p = strsep(&incoming, ","); + if (*p != '\0' && strcmp(p, "default") != 0) + if (root_nfs_copy(exppath, p, exppathlen)) + return -1; + + /* + * @incoming now points to the rest of the string; if it + * contains something, append it to our root options buffer + */ + if (incoming != NULL && *incoming != '\0') + if (root_nfs_cat(nfs_root_options, incoming, + sizeof(nfs_root_options))) + return -1; + return 0; +} + +/* + * Decode the export directory path name and NFS options from + * the kernel command line. This has to be done late in order to + * use a dynamically acquired client IP address for the remote + * root directory path. + * + * Returns zero if successful; otherwise -1 is returned.
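+ *
+ * As an illustration (values hypothetical): with a server address of
+ * 10.0.0.1, no nfsroot= path on the command line and a client nodename
+ * of "10.0.0.7", the default NFS_ROOT template ends up producing
+ *     nfs_root_device  = "10.0.0.1:/tftpboot/10.0.0.7"
+ *     nfs_root_options = "vers=2,udp,rsize=4096,wsize=4096,nolock,addr=10.0.0.1"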
+ */ +static int __init root_nfs_data(char *cmdline) +{ + char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + int len, retval = -1; + char *tmp = NULL; + const size_t tmplen = sizeof(nfs_export_path); + + tmp = kzalloc(tmplen, GFP_KERNEL); + if (tmp == NULL) + goto out_nomem; + strcpy(tmp, NFS_ROOT); + + if (root_server_path[0] != '\0') { + dprintk("Root-NFS: DHCPv4 option 17: %s\n", + root_server_path); + if (root_nfs_parse_options(root_server_path, tmp, tmplen)) + goto out_optionstoolong; + } + + if (cmdline[0] != '\0') { + dprintk("Root-NFS: nfsroot=%s\n", cmdline); + if (root_nfs_parse_options(cmdline, tmp, tmplen)) + goto out_optionstoolong; + } + + /* + * Append mandatory options for nfsroot so they override + * what has come before + */ + snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4", + &servaddr); + if (root_nfs_cat(nfs_root_options, mand_options, + sizeof(nfs_root_options))) + goto out_optionstoolong; + + /* + * Set up nfs_root_device. For NFS mounts, this looks like + * + * server:/path + * + * At this point, utsname()->nodename contains our local + * IP address or hostname, set by ipconfig. If "%s" exists + * in tmp, substitute the nodename, then shovel the whole + * mess into nfs_root_device. + */ + len = snprintf(nfs_export_path, sizeof(nfs_export_path), + tmp, utsname()->nodename); + if (len > (int)sizeof(nfs_export_path)) + goto out_devnametoolong; + len = snprintf(nfs_root_device, sizeof(nfs_root_device), + "%pI4:%s", &servaddr, nfs_export_path); + if (len > (int)sizeof(nfs_root_device)) + goto out_devnametoolong; + + retval = 0; + +out: + kfree(tmp); + return retval; +out_nomem: + printk(KERN_ERR "Root-NFS: could not allocate memory\n"); + goto out; +out_optionstoolong: + printk(KERN_ERR "Root-NFS: mount options string too long\n"); + goto out; +out_devnametoolong: + printk(KERN_ERR "Root-NFS: root device name too long.\n"); + goto out; +} + +/** + * nfs_root_data - Return prepared 'data' for NFSROOT mount + * @root_device: OUT: address of string containing NFSROOT device + * @root_data: OUT: address of string containing NFSROOT mount options + * + * Returns zero and sets @root_device and @root_data if successful, + * otherwise -1 is returned. + */ +int __init nfs_root_data(char **root_device, char **root_data) +{ + servaddr = root_server_addr; + if (servaddr == htonl(INADDR_NONE)) { + printk(KERN_ERR "Root-NFS: no NFS server address\n"); + return -1; + } + + if (root_nfs_data(nfs_root_parms) < 0) + return -1; + + *root_device = nfs_root_device; + *root_data = nfs_root_options; + return 0; +} diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild new file mode 100644 index 00000000..ed30ea07 --- /dev/null +++ b/fs/nfs/objlayout/Kbuild @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module +# +objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c new file mode 100644 index 00000000..75fe694d --- /dev/null +++ b/fs/nfs/objlayout/objio_osd.c @@ -0,0 +1,1056 @@ +/* + * pNFS Objects layout implementation over open-osd initiator library + * + * Copyright (C) 2009 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +#define _LLU(x) ((unsigned long long)x) + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), +}; + +struct objio_dev_ent { + struct nfs4_deviceid_node id_node; + struct osd_dev *od; +}; + +static void +objio_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); + + dprintk("%s: free od=%p\n", __func__, de->od); + osduld_put_device(de->od); + kfree(de); +} + +static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de; + + d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); + if (!d) + return NULL; + + de = container_of(d, struct objio_dev_ent, id_node); + return de; +} + +static struct objio_dev_ent * +_dev_list_add(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id, struct osd_dev *od, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); + struct objio_dev_ent *n; + + if (!de) { + dprintk("%s: -ENOMEM od=%p\n", __func__, od); + return NULL; + } + + dprintk("%s: Adding od=%p\n", __func__, od); + nfs4_init_deviceid_node(&de->id_node, + nfss->pnfs_curr_ld, + nfss->nfs_client, + d_id); + de->od = od; + + d = nfs4_insert_deviceid_node(&de->id_node); + n = container_of(d, struct objio_dev_ent, id_node); + if (n != de) { + dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + objio_free_deviceid_node(&de->id_node); + de = n; + } + + return de; +} + +struct caps_buffers { + u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; + u8 creds[OSD_CAP_LEN]; +}; + +struct objio_segment { + struct pnfs_layout_segment lseg; + + struct pnfs_osd_object_cred *comps; + + unsigned mirrors_p1; + unsigned stripe_unit; + unsigned group_width; /* Data stripe_units without integrity comps */ + u64 group_depth; + unsigned group_count; + + unsigned max_io_size; + 
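+	/* Window of the device table covered by this layout segment:
+	 * ods[] holds num_comps entries, the first of which corresponds
+	 * to component index comps_index. */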
+ unsigned comps_index; + unsigned num_comps; + /* variable length */ + struct objio_dev_ent *ods[]; +}; + +static inline struct objio_segment * +OBJIO_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, struct objio_segment, lseg); +} + +struct objio_state; +typedef ssize_t (*objio_done_fn)(struct objio_state *ios); + +struct objio_state { + /* Generic layer */ + struct objlayout_io_state ol_state; + + struct objio_segment *layout; + + struct kref kref; + objio_done_fn done; + void *private; + + unsigned long length; + unsigned numdevs; /* Actually used devs in this IO */ + /* A per-device variable array of size numdevs */ + struct _objio_per_comp { + struct bio *bio; + struct osd_request *or; + unsigned long length; + u64 offset; + unsigned dev; + } per_dev[]; +}; + +/* Send and wait for a get_device_info of devices in the layout, + then look them up with the osd_initiator library */ +static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned comp, + gfp_t gfp_flags) +{ + struct pnfs_osd_deviceaddr *deviceaddr; + struct nfs4_deviceid *d_id; + struct objio_dev_ent *ode; + struct osd_dev *od; + struct osd_dev_info odi; + int err; + + d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; + + ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); + if (ode) + return ode; + + err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); + if (unlikely(err)) { + dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", + __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); + return ERR_PTR(err); + } + + odi.systemid_len = deviceaddr->oda_systemid.len; + if (odi.systemid_len > sizeof(odi.systemid)) { + err = -EINVAL; + goto out; + } else if (odi.systemid_len) + memcpy(odi.systemid, deviceaddr->oda_systemid.data, + odi.systemid_len); + odi.osdname_len = deviceaddr->oda_osdname.len; + odi.osdname = (u8 *)deviceaddr->oda_osdname.data; + + if (!odi.osdname_len && !odi.systemid_len) { + dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", + __func__); + err = -ENODEV; + goto out; + } + + od = osduld_info_lookup(&odi); + if (unlikely(IS_ERR(od))) { + err = PTR_ERR(od); + dprintk("%s: osduld_info_lookup => %d\n", __func__, err); + goto out; + } + + ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, + gfp_flags); + +out: + dprintk("%s: return=%d\n", __func__, err); + objlayout_put_deviceinfo(deviceaddr); + return err ? ERR_PTR(err) : ode; +} + +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, + gfp_t gfp_flags) +{ + unsigned i; + int err; + + /* lookup all devices */ + for (i = 0; i < objio_seg->num_comps; i++) { + struct objio_dev_ent *ode; + + ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); + if (unlikely(IS_ERR(ode))) { + err = PTR_ERR(ode); + goto out; + } + objio_seg->ods[i] = ode; + } + err = 0; + +out: + dprintk("%s: return=%d\n", __func__, err); + return err; +} + +static int _verify_data_map(struct pnfs_osd_layout *layout) +{ + struct pnfs_osd_data_map *data_map = &layout->olo_map; + u64 stripe_length; + u32 group_width; + +/* FIXME: Only raid0 for now. 
if not go through MDS */ + if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { + printk(KERN_ERR "Only RAID_0 for now\n"); + return -ENOTSUPP; + } + if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { + printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", + data_map->odm_num_comps, data_map->odm_mirror_cnt); + return -EINVAL; + } + + if (data_map->odm_group_width) + group_width = data_map->odm_group_width; + else + group_width = data_map->odm_num_comps / + (data_map->odm_mirror_cnt + 1); + + stripe_length = (u64)data_map->odm_stripe_unit * group_width; + if (stripe_length >= (1ULL << 32)) { + printk(KERN_ERR "Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -ENOTSUPP; + } + + if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { + printk(KERN_ERR "Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(data_map->odm_stripe_unit), PAGE_SIZE); + return -ENOTSUPP; + } + + return 0; +} + +static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, + struct pnfs_osd_object_cred *src_comp, + struct caps_buffers *caps_p) +{ + WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); + WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); + + *cur_comp = *src_comp; + + memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, + sizeof(caps_p->caps_key)); + cur_comp->oc_cap_key.cred = caps_p->caps_key; + + memcpy(caps_p->creds, src_comp->oc_cap.cred, + sizeof(caps_p->creds)); + cur_comp->oc_cap.cred = caps_p->creds; +} + +int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags) +{ + struct objio_segment *objio_seg; + struct pnfs_osd_xdr_decode_layout_iter iter; + struct pnfs_osd_layout layout; + struct pnfs_osd_object_cred *cur_comp, src_comp; + struct caps_buffers *caps_p; + int err; + + err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); + if (unlikely(err)) + return err; + + err = _verify_data_map(&layout); + if (unlikely(err)) + return err; + + objio_seg = kzalloc(sizeof(*objio_seg) + + sizeof(objio_seg->ods[0]) * layout.olo_num_comps + + sizeof(*objio_seg->comps) * layout.olo_num_comps + + sizeof(struct caps_buffers) * layout.olo_num_comps, + gfp_flags); + if (!objio_seg) + return -ENOMEM; + + objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); + cur_comp = objio_seg->comps; + caps_p = (void *)(cur_comp + layout.olo_num_comps); + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) + copy_single_comp(cur_comp++, &src_comp, caps_p++); + if (unlikely(err)) + goto err; + + objio_seg->num_comps = layout.olo_num_comps; + objio_seg->comps_index = layout.olo_comps_index; + err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); + if (err) + goto err; + + objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; + if (layout.olo_map.odm_group_width) { + objio_seg->group_width = layout.olo_map.odm_group_width; + objio_seg->group_depth = layout.olo_map.odm_group_depth; + objio_seg->group_count = layout.olo_map.odm_num_comps / + objio_seg->mirrors_p1 / + objio_seg->group_width; + } else { + objio_seg->group_width = layout.olo_map.odm_num_comps / + objio_seg->mirrors_p1; + objio_seg->group_depth = -1; + objio_seg->group_count = 1; + } + + /* Cache this calculation it will hit for every page */ + objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - + objio_seg->stripe_unit) * + 
objio_seg->group_width; + + *outp = &objio_seg->lseg; + return 0; + +err: + kfree(objio_seg); + dprintk("%s: Error: return %d\n", __func__, err); + *outp = NULL; + return err; +} + +void objio_free_lseg(struct pnfs_layout_segment *lseg) +{ + int i; + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + + for (i = 0; i < objio_seg->num_comps; i++) { + if (!objio_seg->ods[i]) + break; + nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + } + kfree(objio_seg); +} + +int objio_alloc_io_state(struct pnfs_layout_segment *lseg, + struct objlayout_io_state **outp, + gfp_t gfp_flags) +{ + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + struct objio_state *ios; + const unsigned first_size = sizeof(*ios) + + objio_seg->num_comps * sizeof(ios->per_dev[0]); + const unsigned sec_size = objio_seg->num_comps * + sizeof(ios->ol_state.ioerrs[0]); + + ios = kzalloc(first_size + sec_size, gfp_flags); + if (unlikely(!ios)) + return -ENOMEM; + + ios->layout = objio_seg; + ios->ol_state.ioerrs = ((void *)ios) + first_size; + ios->ol_state.num_comps = objio_seg->num_comps; + + *outp = &ios->ol_state; + return 0; +} + +void objio_free_io_state(struct objlayout_io_state *ol_state) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + + kfree(ios); +} + +enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) +{ + switch (oep) { + case OSD_ERR_PRI_NO_ERROR: + return (enum pnfs_osd_errno)0; + + case OSD_ERR_PRI_CLEAR_PAGES: + BUG_ON(1); + return 0; + + case OSD_ERR_PRI_RESOURCE: + return PNFS_OSD_ERR_RESOURCE; + case OSD_ERR_PRI_BAD_CRED: + return PNFS_OSD_ERR_BAD_CRED; + case OSD_ERR_PRI_NO_ACCESS: + return PNFS_OSD_ERR_NO_ACCESS; + case OSD_ERR_PRI_UNREACHABLE: + return PNFS_OSD_ERR_UNREACHABLE; + case OSD_ERR_PRI_NOT_FOUND: + return PNFS_OSD_ERR_NOT_FOUND; + case OSD_ERR_PRI_NO_SPACE: + return PNFS_OSD_ERR_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case OSD_ERR_PRI_EIO: + return PNFS_OSD_ERR_EIO; + } +} + +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + __bio_for_each_segment(bv, bio, i, 0) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + +static int _io_check(struct objio_state *ios, bool is_write) +{ + enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; + int lin_ret = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + struct osd_request *or = ios->per_dev[i].or; + int ret; + + if (!or) + continue; + + ret = osd_req_decode_sense(or, &osi); + if (likely(!ret)) + continue; + + if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { + /* start read offset passed endof file */ + BUG_ON(is_write); + _clear_bio(ios->per_dev[i].bio); + dprintk("%s: start read offset passed end of file " + "offset=0x%llx, length=0x%lx\n", __func__, + _LLU(ios->per_dev[i].offset), + ios->per_dev[i].length); + + continue; /* we recovered */ + } + objlayout_io_set_result(&ios->ol_state, i, + &ios->layout->comps[i].oc_object_id, + osd_pri_2_pnfs_err(osi.osd_err_pri), + ios->per_dev[i].offset, + ios->per_dev[i].length, + is_write); + + if (osi.osd_err_pri >= oep) { + oep = osi.osd_err_pri; + lin_ret = ret; + } + } + + return lin_ret; +} + +/* + * Common IO state helpers. 
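+ *
+ * _io_free() releases the per-device osd_requests and bios built up for a
+ * single I/O, and _io_od() maps an absolute component index back to the
+ * osd_dev that was looked up when the layout segment was allocated.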
+ */ +static void _io_free(struct objio_state *ios) +{ + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct _objio_per_comp *per_dev = &ios->per_dev[i]; + + if (per_dev->or) { + osd_end_request(per_dev->or); + per_dev->or = NULL; + } + + if (per_dev->bio) { + bio_put(per_dev->bio); + per_dev->bio = NULL; + } + } +} + +struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) +{ + unsigned min_dev = ios->layout->comps_index; + unsigned max_dev = min_dev + ios->layout->num_comps; + + BUG_ON(dev < min_dev || max_dev <= dev); + return ios->layout->ods[dev - min_dev]->od; +} + +struct _striping_info { + u64 obj_offset; + u64 group_length; + unsigned dev; + unsigned unit_off; +}; + +static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, + struct _striping_info *si) +{ + u32 stripe_unit = ios->layout->stripe_unit; + u32 group_width = ios->layout->group_width; + u64 group_depth = ios->layout->group_depth; + u32 U = stripe_unit * group_width; + + u64 T = U * group_depth; + u64 S = T * ios->layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodU = file_offset - M * S; + u32 G = div64_u64(LmodU, T); + u64 H = LmodU - G * T; + + u32 N = div_u64(H, U); + + div_u64_rem(file_offset, stripe_unit, &si->unit_off); + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; + si->dev *= ios->layout->mirrors_p1; + + si->group_length = T - H; +} + +static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, + unsigned pgbase, struct _objio_per_comp *per_dev, int len, + gfp_t gfp_flags) +{ + unsigned pg = *cur_pg; + int cur_len = len; + struct request_queue *q = + osd_request_queue(_io_od(ios, per_dev->dev)); + + if (per_dev->bio == NULL) { + unsigned pages_in_stripe = ios->layout->group_width * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / + ios->layout->group_width; + + if (BIO_MAX_PAGES_KMALLOC < bio_size) + bio_size = BIO_MAX_PAGES_KMALLOC; + + per_dev->bio = bio_kmalloc(gfp_flags, bio_size); + if (unlikely(!per_dev->bio)) { + dprintk("Faild to allocate BIO size=%u\n", bio_size); + return -ENOMEM; + } + } + + while (cur_len > 0) { + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; + + BUG_ON(ios->ol_state.nr_pages <= pg); + cur_len -= pglen; + + added_len = bio_add_pc_page(q, per_dev->bio, + ios->ol_state.pages[pg], pglen, pgbase); + if (unlikely(pglen != added_len)) + return -ENOMEM; + pgbase = 0; + ++pg; + } + BUG_ON(cur_len); + + per_dev->length += len; + *cur_pg = pg; + return 0; +} + +static int _prepare_one_group(struct objio_state *ios, u64 length, + struct _striping_info *si, unsigned *last_pg, + gfp_t gfp_flags) +{ + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned dev = si->dev; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; + unsigned cur_pg = *last_pg; + int ret = 0; + + while (length) { + struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; + unsigned cur_len, page_off = 0; + + if (!per_dev->length) { + per_dev->dev = dev; + if (dev < si->dev) { + per_dev->offset = si->obj_offset + stripe_unit - + si->unit_off; + cur_len = stripe_unit; + } else if (dev == si->dev) { + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && + (page_off != ios->ol_state.pgbase)); + } else { /* dev > si->dev */ + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } + + if (max_comp < dev - first_dev) + max_comp = dev - first_dev; + } else { + cur_len = stripe_unit; + } + if (cur_len >= length) + cur_len = length; + + ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, + cur_len, gfp_flags); + if (unlikely(ret)) + goto out; + + dev += mirrors_p1; + dev = (dev % devs_in_group) + first_dev; + + length -= cur_len; + ios->length += cur_len; + } +out: + ios->numdevs = max_comp + mirrors_p1; + *last_pg = cur_pg; + return ret; +} + +static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) +{ + u64 length = ios->ol_state.count; + u64 offset = ios->ol_state.offset; + struct _striping_info si; + unsigned last_pg = 0; + int ret = 0; + + while (length) { + _calc_stripe_info(ios, offset, &si); + + if (length < si.group_length) + si.group_length = length; + + ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); + if (unlikely(ret)) + goto out; + + offset += si.group_length; + length -= si.group_length; + } + +out: + if (!ios->length) + return ret; + + return 0; +} + +static ssize_t _sync_done(struct objio_state *ios) +{ + struct completion *waiting = ios->private; + + complete(waiting); + return 0; +} + +static void _last_io(struct kref *kref) +{ + struct objio_state *ios = container_of(kref, struct objio_state, kref); + + ios->done(ios); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct objio_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +static ssize_t _io_exec(struct objio_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + ssize_t status = 0; /* sync status */ + unsigned i; + objio_done_fn saved_done_fn = ios->done; + bool sync = ios->ol_state.sync; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + + if (!or) + continue; + + kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + + if (sync) { + wait_for_completion(&wait); + status = saved_done_fn(ios); + } + + return status; +} + +/* + * read + */ +static ssize_t _read_done(struct objio_state *ios) +{ + ssize_t status; + int ret = _io_check(ios, false); + + _io_free(ios); + + if (likely(!ret)) + status = ios->length; + else + status = ret; + + objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); + return status; +} + +static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) +{ + struct osd_request *or = NULL; + struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; + unsigned dev = per_dev->dev; + struct pnfs_osd_object_cred *cred = + &ios->layout->comps[cur_comp]; + struct osd_obj_id obj = { + .partition = cred->oc_object_id.oid_partition_id, + .id = cred->oc_object_id.oid_object_id, + }; + int ret; + + or = osd_start_request(_io_od(ios, dev), 
GFP_KERNEL); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + per_dev->or = or; + + osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); + + ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + if (ret) { + dprintk("%s: Faild to osd_finalize_request() => %d\n", + __func__, ret); + goto err; + } + + dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", + __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), + per_dev->length); + +err: + return ret; +} + +static ssize_t _read_exec(struct objio_state *ios) +{ + unsigned i; + int ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + if (!ios->per_dev[i].length) + continue; + ret = _read_mirrors(ios, i); + if (unlikely(ret)) + goto err; + } + + ios->done = _read_done; + return _io_exec(ios); /* In sync mode exec returns the io status */ + +err: + _io_free(ios); + return ret; +} + +ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + int ret; + + ret = _io_rw_pagelist(ios, GFP_KERNEL); + if (unlikely(ret)) + return ret; + + return _read_exec(ios); +} + +/* + * write + */ +static ssize_t _write_done(struct objio_state *ios) +{ + ssize_t status; + int ret = _io_check(ios, true); + + _io_free(ios); + + if (likely(!ret)) { + /* FIXME: should be based on the OSD's persistence model + * See OSD2r05 Section 4.13 Data persistence model */ + ios->ol_state.committed = NFS_FILE_SYNC; + status = ios->length; + } else { + status = ret; + } + + objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); + return status; +} + +static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) +{ + struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret; + + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct osd_request *or = NULL; + struct pnfs_osd_object_cred *cred = + &ios->layout->comps[cur_comp]; + struct osd_obj_id obj = { + .partition = cred->oc_object_id.oid_partition_id, + .id = cred->oc_object_id.oid_object_id, + }; + struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; + struct bio *bio; + + or = osd_start_request(_io_od(ios, dev), GFP_NOFS); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + per_dev->or = or; + + if (per_dev != master_dev) { + bio = bio_kmalloc(GFP_NOFS, + master_dev->bio->bi_max_vecs); + if (unlikely(!bio)) { + dprintk("Faild to allocate BIO size=%u\n", + master_dev->bio->bi_max_vecs); + ret = -ENOMEM; + goto err; + } + + __bio_clone(bio, master_dev->bio); + bio->bi_bdev = NULL; + bio->bi_next = NULL; + per_dev->bio = bio; + per_dev->dev = dev; + per_dev->length = master_dev->length; + per_dev->offset = master_dev->offset; + } else { + bio = master_dev->bio; + bio->bi_rw |= REQ_WRITE; + } + + osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); + + ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + if (ret) { + dprintk("%s: Faild to osd_finalize_request() => %d\n", + __func__, ret); + goto err; + } + + dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", + __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), + per_dev->length); + } + +err: + return ret; +} + +static ssize_t _write_exec(struct objio_state *ios) +{ + unsigned i; + int ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + if (!ios->per_dev[i].length) + continue; + ret = _write_mirrors(ios, i); + if 
(unlikely(ret)) + goto err; + } + + ios->done = _write_done; + return _io_exec(ios); /* In sync mode exec returns the io->status */ + +err: + _io_free(ios); + return ret; +} + +ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + int ret; + + /* TODO: ios->stable = stable; */ + ret = _io_rw_pagelist(ios, GFP_NOFS); + if (unlikely(ret)) + return ret; + + return _write_exec(ios); +} + +static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) +{ + if (!pnfs_generic_pg_test(pgio, prev, req)) + return false; + + if (pgio->pg_lseg == NULL) + return true; + + return pgio->pg_count + req->wb_bytes <= + OBJIO_LSEG(pgio->pg_lseg)->max_io_size; +} + +static struct pnfs_layoutdriver_type objlayout_type = { + .id = LAYOUT_OSD2_OBJECTS, + .name = "LAYOUT_OSD2_OBJECTS", + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR, + + .alloc_layout_hdr = objlayout_alloc_layout_hdr, + .free_layout_hdr = objlayout_free_layout_hdr, + + .alloc_lseg = objlayout_alloc_lseg, + .free_lseg = objlayout_free_lseg, + + .read_pagelist = objlayout_read_pagelist, + .write_pagelist = objlayout_write_pagelist, + .pg_test = objio_pg_test, + + .free_deviceid_node = objio_free_deviceid_node, + + .encode_layoutcommit = objlayout_encode_layoutcommit, + .encode_layoutreturn = objlayout_encode_layoutreturn, +}; + +MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); +MODULE_AUTHOR("Benny Halevy "); +MODULE_LICENSE("GPL"); + +static int __init +objlayout_init(void) +{ + int ret = pnfs_register_layoutdriver(&objlayout_type); + + if (ret) + printk(KERN_INFO + "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", + __func__, ret); + else + printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", + __func__); + return ret; +} + +static void __exit +objlayout_exit(void) +{ + pnfs_unregister_layoutdriver(&objlayout_type); + printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", + __func__); +} + +module_init(objlayout_init); +module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c new file mode 100644 index 00000000..fefa1224 --- /dev/null +++ b/fs/nfs/objlayout/objlayout.c @@ -0,0 +1,716 @@ +/* + * pNFS Objects layout driver high level definitions + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD +/* + * Create a objlayout layout structure for the given inode and return it. + */ +struct pnfs_layout_hdr * +objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct objlayout *objlay; + + objlay = kzalloc(sizeof(struct objlayout), gfp_flags); + if (objlay) { + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); + } + dprintk("%s: Return %p\n", __func__, objlay); + return &objlay->pnfs_layout; +} + +/* + * Free an objlayout layout structure + */ +void +objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct objlayout *objlay = OBJLAYOUT(lo); + + dprintk("%s: objlay %p\n", __func__, objlay); + + WARN_ON(!list_empty(&objlay->err_list)); + kfree(objlay); +} + +/* + * Unmarshall layout and store it in pnfslay. + */ +struct pnfs_layout_segment * +objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + int status = -ENOMEM; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = lgr->layoutp->pages, + .page_len = lgr->layoutp->len, + .buflen = lgr->layoutp->len, + .len = lgr->layoutp->len, + }; + struct page *scratch; + struct pnfs_layout_segment *lseg; + + dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); + + scratch = alloc_page(gfp_flags); + if (!scratch) + goto err_nofree; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); + if (unlikely(status)) { + dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, + status); + goto err; + } + + __free_page(scratch); + + dprintk("%s: Return %p\n", __func__, lseg); + return lseg; + +err: + __free_page(scratch); +err_nofree: + dprintk("%s: Err Return=>%d\n", __func__, status); + return ERR_PTR(status); +} + +/* + * Free a layout segement + */ +void +objlayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s: freeing layout segment %p\n", __func__, lseg); + + if (unlikely(!lseg)) + return; + + objio_free_lseg(lseg); +} + +/* + * I/O Operations + */ +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? 
end - 1 : NFS4_MAX_UINT64; +} + +static struct objlayout_io_state * +objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, + struct page **pages, + unsigned pgbase, + loff_t offset, + size_t count, + struct pnfs_layout_segment *lseg, + void *rpcdata, + gfp_t gfp_flags) +{ + struct objlayout_io_state *state; + u64 lseg_end_offset; + + dprintk("%s: allocating io_state\n", __func__); + if (objio_alloc_io_state(lseg, &state, gfp_flags)) + return NULL; + + BUG_ON(offset < lseg->pls_range.offset); + lseg_end_offset = end_offset(lseg->pls_range.offset, + lseg->pls_range.length); + BUG_ON(offset >= lseg_end_offset); + if (offset + count > lseg_end_offset) { + count = lseg->pls_range.length - + (offset - lseg->pls_range.offset); + dprintk("%s: truncated count %Zd\n", __func__, count); + } + + if (pgbase > PAGE_SIZE) { + pages += pgbase >> PAGE_SHIFT; + pgbase &= ~PAGE_MASK; + } + + INIT_LIST_HEAD(&state->err_list); + state->lseg = lseg; + state->rpcdata = rpcdata; + state->pages = pages; + state->pgbase = pgbase; + state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; + state->offset = offset; + state->count = count; + state->sync = 0; + + return state; +} + +static void +objlayout_free_io_state(struct objlayout_io_state *state) +{ + dprintk("%s: freeing io_state\n", __func__); + if (unlikely(!state)) + return; + + objio_free_io_state(state); +} + +/* + * I/O done common code + */ +static void +objlayout_iodone(struct objlayout_io_state *state) +{ + dprintk("%s: state %p status\n", __func__, state); + + if (likely(state->status >= 0)) { + objlayout_free_io_state(state); + } else { + struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + + spin_lock(&objlay->lock); + objlay->delta_space_valid = OBJ_DSU_INVALID; + list_add(&objlay->err_list, &state->err_list); + spin_unlock(&objlay->lock); + } +} + +/* + * objlayout_io_set_result - Set an osd_error code on a specific osd comp. + * + * The @index component IO failed (error returned from target). Register + * the error for later reporting at layout-return. + */ +void +objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, + struct pnfs_osd_objid *pooid, int osd_error, + u64 offset, u64 length, bool is_write) +{ + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + + BUG_ON(index >= state->num_comps); + if (osd_error) { + ioerr->oer_component = *pooid; + ioerr->oer_comp_offset = offset; + ioerr->oer_comp_length = length; + ioerr->oer_iswrite = is_write; + ioerr->oer_errno = osd_error; + + dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " + "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", + __func__, index, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + } else { + /* User need not call if no error is reported */ + ioerr->oer_errno = 0; + } +} + +/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). 
+ * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_read_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_read_data *rdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_read_data, task); + + pnfs_ld_read_done(rdata); +} + +void +objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +{ + int eof = state->eof; + struct nfs_read_data *rdata; + + state->status = status; + dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); + rdata = state->rpcdata; + rdata->task.tk_status = status; + if (likely(status >= 0)) { + rdata->res.count = status; + rdata->res.eof = eof; + } else { + rdata->pnfs_error = status; + } + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_ld_read_done(rdata); + else { + INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); + schedule_work(&rdata->task.u.tk_work); + } +} + +/* + * Perform sync or async reads. + */ +enum pnfs_try_status +objlayout_read_pagelist(struct nfs_read_data *rdata) +{ + loff_t offset = rdata->args.offset; + size_t count = rdata->args.count; + struct objlayout_io_state *state; + ssize_t status = 0; + loff_t eof; + + dprintk("%s: Begin inode %p offset %llu count %d\n", + __func__, rdata->inode, offset, (int)count); + + eof = i_size_read(rdata->inode); + if (unlikely(offset + count > eof)) { + if (offset >= eof) { + status = 0; + rdata->res.count = 0; + rdata->res.eof = 1; + goto out; + } + count = eof - offset; + } + + state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, + rdata->args.pages, rdata->args.pgbase, + offset, count, + rdata->lseg, rdata, + GFP_KERNEL); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->eof = state->offset + state->count >= eof; + + status = objio_read_pagelist(state); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + rdata->pnfs_error = status; + return PNFS_ATTEMPTED; +} + +/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_write_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + + pnfs_ld_write_done(wdata); +} + +void +objlayout_write_done(struct objlayout_io_state *state, ssize_t status, + bool sync) +{ + struct nfs_write_data *wdata; + + dprintk("%s: Begin\n", __func__); + wdata = state->rpcdata; + state->status = status; + wdata->task.tk_status = status; + if (likely(status >= 0)) { + wdata->res.count = status; + wdata->verf.committed = state->committed; + dprintk("%s: Return status %d committed %d\n", + __func__, wdata->task.tk_status, + wdata->verf.committed); + } else { + wdata->pnfs_error = status; + dprintk("%s: Return status %d\n", + __func__, wdata->task.tk_status); + } + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_ld_write_done(wdata); + else { + INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); + schedule_work(&wdata->task.u.tk_work); + } +} + +/* + * Perform sync or async writes. 
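+ *
+ * Whether we wait for completion here is decided by FLUSH_SYNC in @how;
+ * FLUSH_STABLE is passed down to the io engine as the "stable" argument
+ * of objio_write_pagelist().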
+ */ +enum pnfs_try_status +objlayout_write_pagelist(struct nfs_write_data *wdata, + int how) +{ + struct objlayout_io_state *state; + ssize_t status; + + dprintk("%s: Begin inode %p offset %llu count %u\n", + __func__, wdata->inode, wdata->args.offset, wdata->args.count); + + state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, + wdata->args.pages, + wdata->args.pgbase, + wdata->args.offset, + wdata->args.count, + wdata->lseg, wdata, + GFP_NOFS); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->sync = how & FLUSH_SYNC; + + status = objio_write_pagelist(state, how & FLUSH_STABLE); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + wdata->pnfs_error = status; + return PNFS_ATTEMPTED; +} + +void +objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct pnfs_osd_layoutupdate lou; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + + spin_lock(&objlay->lock); + lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); + lou.dsu_delta = objlay->delta_space_used; + objlay->delta_space_used = 0; + objlay->delta_space_valid = OBJ_DSU_INIT; + lou.olu_ioerr_flag = !list_empty(&objlay->err_list); + spin_unlock(&objlay->lock); + + start = xdr_reserve_space(xdr, 4); + + BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + + dprintk("%s: Return delta_space_used %lld err %d\n", __func__, + lou.dsu_delta, lou.olu_ioerr_flag); +} + +static int +err_prio(u32 oer_errno) +{ + switch (oer_errno) { + case 0: + return 0; + + case PNFS_OSD_ERR_RESOURCE: + return OSD_ERR_PRI_RESOURCE; + case PNFS_OSD_ERR_BAD_CRED: + return OSD_ERR_PRI_BAD_CRED; + case PNFS_OSD_ERR_NO_ACCESS: + return OSD_ERR_PRI_NO_ACCESS; + case PNFS_OSD_ERR_UNREACHABLE: + return OSD_ERR_PRI_UNREACHABLE; + case PNFS_OSD_ERR_NOT_FOUND: + return OSD_ERR_PRI_NOT_FOUND; + case PNFS_OSD_ERR_NO_SPACE: + return OSD_ERR_PRI_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case PNFS_OSD_ERR_EIO: + return OSD_ERR_PRI_EIO; + } +} + +static void +merge_ioerr(struct pnfs_osd_ioerr *dest_err, + const struct pnfs_osd_ioerr *src_err) +{ + u64 dest_end, src_end; + + if (!dest_err->oer_errno) { + *dest_err = *src_err; + /* accumulated device must be blank */ + memset(&dest_err->oer_component.oid_device_id, 0, + sizeof(dest_err->oer_component.oid_device_id)); + + return; + } + + if (dest_err->oer_component.oid_partition_id != + src_err->oer_component.oid_partition_id) + dest_err->oer_component.oid_partition_id = 0; + + if (dest_err->oer_component.oid_object_id != + src_err->oer_component.oid_object_id) + dest_err->oer_component.oid_object_id = 0; + + if (dest_err->oer_comp_offset > src_err->oer_comp_offset) + dest_err->oer_comp_offset = src_err->oer_comp_offset; + + dest_end = end_offset(dest_err->oer_comp_offset, + dest_err->oer_comp_length); + src_end = end_offset(src_err->oer_comp_offset, + src_err->oer_comp_length); + if (dest_end < src_end) + dest_end = src_end; + + dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; + + if ((src_err->oer_iswrite == dest_err->oer_iswrite) && + (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { + dest_err->oer_errno = src_err->oer_errno; + } else if (src_err->oer_iswrite) { + dest_err->oer_iswrite = true; + dest_err->oer_errno = src_err->oer_errno; + } +} + +static void +encode_accumulated_error(struct objlayout *objlay, __be32 *p) +{ + struct 
objlayout_io_state *state, *tmp; + struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + unsigned i; + + for (i = 0; i < state->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + merge_ioerr(&accumulated_err, ioerr); + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } + + pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); +} + +void +objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct objlayout_io_state *state, *tmp; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + spin_lock(&objlay->lock); + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + __be32 *last_xdr = NULL, *p; + unsigned i; + int res = 0; + + for (i = 0; i < state->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + dprintk("%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + p = pnfs_osd_xdr_ioerr_reserve_space(xdr); + if (unlikely(!p)) { + res = -E2BIG; + break; /* accumulated_error */ + } + + last_xdr = p; + pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + } + + /* TODO: use xdr_write_pages */ + if (unlikely(res)) { + /* no space for even one error descriptor */ + BUG_ON(!last_xdr); + + /* we've encountered a situation with lots and lots of + * errors and no space to encode them all. Use the last + * available slot to report the union of all the + * remaining errors. + */ + encode_accumulated_error(objlay, last_xdr); + goto loop_done; + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } +loop_done: + spin_unlock(&objlay->lock); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + + +/* + * Get Device Info API for io engines + */ +struct objlayout_deviceinfo { + struct page *page; + struct pnfs_osd_deviceaddr da; /* This must be last */ +}; + +/* Initialize and call nfs_getdeviceinfo, then decode and return a + * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() + * should be called. 
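+ * The decoded strings still point into the page holding the GETDEVICEINFO
+ * reply, so that page (and the wrapping objlayout_deviceinfo) is only
+ * released by objlayout_put_deviceinfo().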
+ */ +int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags) +{ + struct objlayout_deviceinfo *odi; + struct pnfs_device pd; + struct super_block *sb; + struct page *page, **pages; + u32 *p; + int err; + + page = alloc_page(gfp_flags); + if (!page) + return -ENOMEM; + + pages = &page; + pd.pages = pages; + + memcpy(&pd.dev_id, d_id, sizeof(*d_id)); + pd.layout_type = LAYOUT_OSD2_OBJECTS; + pd.pages = &page; + pd.pgbase = 0; + pd.pglen = PAGE_SIZE; + pd.mincount = 0; + + sb = pnfslay->plh_inode->i_sb; + err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd); + dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); + if (err) + goto err_out; + + p = page_address(page); + odi = kzalloc(sizeof(*odi), gfp_flags); + if (!odi) { + err = -ENOMEM; + goto err_out; + } + pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); + odi->page = page; + *deviceaddr = &odi->da; + return 0; + +err_out: + __free_page(page); + return err; +} + +void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) +{ + struct objlayout_deviceinfo *odi = container_of(deviceaddr, + struct objlayout_deviceinfo, + da); + + __free_page(odi->page); + kfree(odi); +} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h new file mode 100644 index 00000000..a8244c8e --- /dev/null +++ b/fs/nfs/objlayout/objlayout.h @@ -0,0 +1,187 @@ +/* + * Data types and function declerations for interfacing with the + * pNFS standard object layout driver. + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _OBJLAYOUT_H +#define _OBJLAYOUT_H + +#include +#include +#include "../pnfs.h" + +/* + * per-inode layout + */ +struct objlayout { + struct pnfs_layout_hdr pnfs_layout; + + /* for layout_commit */ + enum osd_delta_space_valid_enum { + OBJ_DSU_INIT = 0, + OBJ_DSU_VALID, + OBJ_DSU_INVALID, + } delta_space_valid; + s64 delta_space_used; /* consumed by write ops */ + + /* for layout_return */ + spinlock_t lock; + struct list_head err_list; +}; + +static inline struct objlayout * +OBJLAYOUT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct objlayout, pnfs_layout); +} + +/* + * per-I/O operation state + * embedded in objects provider io_state data structure + */ +struct objlayout_io_state { + struct pnfs_layout_segment *lseg; + + struct page **pages; + unsigned pgbase; + unsigned nr_pages; + unsigned long count; + loff_t offset; + bool sync; + + void *rpcdata; + int status; /* res */ + int eof; /* res */ + int committed; /* res */ + + /* Error reporting (layout_return) */ + struct list_head err_list; + unsigned num_comps; + /* Pointer to array of error descriptors of size num_comps. + * It should contain as many entries as devices in the osd_layout + * that participate in the I/O. It is up to the io_engine to allocate + * needed space and set num_comps. + */ + struct pnfs_osd_ioerr *ioerrs; +}; + +/* + * Raid engine I/O API + */ +extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags); +extern void objio_free_lseg(struct pnfs_layout_segment *lseg); + +extern int objio_alloc_io_state( + struct pnfs_layout_segment *lseg, + struct objlayout_io_state **outp, + gfp_t gfp_flags); +extern void objio_free_io_state(struct objlayout_io_state *state); + +extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); +extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, + bool stable); + +/* + * callback API + */ +extern void objlayout_io_set_result(struct objlayout_io_state *state, + unsigned index, struct pnfs_osd_objid *pooid, + int osd_error, u64 offset, u64 length, bool is_write); + +static inline void +objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +{ + struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + + /* If one of the I/Os errored out and the delta_space_used was + * invalid we render the complete report as invalid. Protocol mandate + * the DSU be accurate or not reported. 
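+ * OBJ_DSU_INVALID is set by objlayout_iodone() when an I/O fails and is
+ * only reset to OBJ_DSU_INIT when the next layoutcommit is encoded.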
+ */ + spin_lock(&objlay->lock); + if (objlay->delta_space_valid != OBJ_DSU_INVALID) { + objlay->delta_space_valid = OBJ_DSU_VALID; + objlay->delta_space_used += space_used; + } + spin_unlock(&objlay->lock); +} + +extern void objlayout_read_done(struct objlayout_io_state *state, + ssize_t status, bool sync); +extern void objlayout_write_done(struct objlayout_io_state *state, + ssize_t status, bool sync); + +extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags); +extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); + +/* + * exported generic objects function vectors + */ + +extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); +extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); + +extern struct pnfs_layout_segment *objlayout_alloc_lseg( + struct pnfs_layout_hdr *, + struct nfs4_layoutget_res *, + gfp_t gfp_flags); +extern void objlayout_free_lseg(struct pnfs_layout_segment *); + +extern enum pnfs_try_status objlayout_read_pagelist( + struct nfs_read_data *); + +extern enum pnfs_try_status objlayout_write_pagelist( + struct nfs_write_data *, + int how); + +extern void objlayout_encode_layoutcommit( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutcommit_args *); + +extern void objlayout_encode_layoutreturn( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutreturn_args *); + +#endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c new file mode 100644 index 00000000..b3918f7a --- /dev/null +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -0,0 +1,415 @@ +/* + * Object-Based pNFS Layout XDR layer + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * The following implementation is based on RFC5664 + */ + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static __be32 * +_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) +{ + p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, + sizeof(objid->oid_device_id.data)); + + p = xdr_decode_hyper(p, &objid->oid_partition_id); + p = xdr_decode_hyper(p, &objid->oid_object_id); + return p; +} +/* + * struct pnfs_osd_opaque_cred { + * u32 cred_len; + * void *cred; + * }; // xdr size [variable] + * The return pointers are from the xdr buffer + */ +static int +_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 1); + + if (!p) + return -EINVAL; + + opaque_cred->cred_len = be32_to_cpu(*p++); + + p = xdr_inline_decode(xdr, opaque_cred->cred_len); + if (!p) + return -EINVAL; + + opaque_cred->cred = p; + return 0; +} + +/* + * struct pnfs_osd_object_cred { + * struct pnfs_osd_objid oc_object_id; + * u32 oc_osd_version; + * u32 oc_cap_key_sec; + * struct pnfs_osd_opaque_cred oc_cap_key + * struct pnfs_osd_opaque_cred oc_cap; + * }; // xdr size 32 + 4 + 4 + [variable] + [variable] + */ +static int +_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); + int ret; + + if (!p) + return -EIO; + + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p); + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); + if (unlikely(ret)) + return ret; + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); + return ret; +} + +/* + * struct pnfs_osd_data_map { + * u32 odm_num_comps; + * u64 odm_stripe_unit; + * u32 odm_group_width; + * u32 odm_group_depth; + * u32 odm_mirror_cnt; + * u32 odm_raid_algorithm; + * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 + */ +static inline int +_osd_data_map_xdr_sz(void) +{ + return 4 + 8 + 4 + 4 + 4 + 4; +} + +static __be32 * +_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) +{ + data_map->odm_num_comps = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); + data_map->odm_group_width = be32_to_cpup(p++); + data_map->odm_group_depth = be32_to_cpup(p++); + data_map->odm_mirror_cnt = be32_to_cpup(p++); + data_map->odm_raid_algorithm = be32_to_cpup(p++); + dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " + "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", + __func__, + data_map->odm_num_comps, + (unsigned long long)data_map->odm_stripe_unit, + data_map->odm_group_width, + data_map->odm_group_depth, + data_map->odm_mirror_cnt, + data_map->odm_raid_algorithm); + return p; +} + +int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, + 
struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) +{ + __be32 *p; + + memset(iter, 0, sizeof(*iter)); + + p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); + if (unlikely(!p)) + return -EINVAL; + + p = _osd_xdr_decode_data_map(p, &layout->olo_map); + layout->olo_comps_index = be32_to_cpup(p++); + layout->olo_num_comps = be32_to_cpup(p++); + dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, + layout->olo_comps_index, layout->olo_num_comps); + + iter->total_comps = layout->olo_num_comps; + return 0; +} + +bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, + int *err) +{ + BUG_ON(iter->decoded_comps > iter->total_comps); + if (iter->decoded_comps == iter->total_comps) + return false; + + *err = _osd_xdr_decode_object_cred(comp, xdr); + if (unlikely(*err)) { + dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " + "total_comps=%d\n", __func__, *err, + iter->decoded_comps, iter->total_comps); + return false; /* stop the loop */ + } + dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " + "key_len=%u cap_len=%u\n", + __func__, + _DEVID_LO(&comp->oc_object_id.oid_device_id), + _DEVID_HI(&comp->oc_object_id.oid_device_id), + comp->oc_object_id.oid_partition_id, + comp->oc_object_id.oid_object_id, + comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); + + iter->decoded_comps++; + return true; +} + +/* + * Get Device Information Decoding + * + * Note: since Device Information is currently done synchronously, all + * variable strings fields are left inside the rpc buffer and are only + * pointed to by the pnfs_osd_deviceaddr members. So the read buffer + * should not be freed while the returned information is in use. + */ +/* + *struct nfs4_string { + * unsigned int len; + * char *data; + *}; // size [variable] + * NOTE: Returned string points to inside the XDR buffer + */ +static __be32 * +__read_u8_opaque(__be32 *p, struct nfs4_string *str) +{ + str->len = be32_to_cpup(p++); + str->data = (char *)p; + + p += XDR_QUADLEN(str->len); + return p; +} + +/* + * struct pnfs_osd_targetid { + * u32 oti_type; + * struct nfs4_string oti_scsi_device_id; + * };// size 4 + [variable] + */ +static __be32 * +__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) +{ + u32 oti_type; + + oti_type = be32_to_cpup(p++); + targetid->oti_type = oti_type; + + switch (oti_type) { + case OBJ_TARGET_SCSI_NAME: + case OBJ_TARGET_SCSI_DEVICE_ID: + p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); + } + + return p; +} + +/* + * struct pnfs_osd_net_addr { + * struct nfs4_string r_netid; + * struct nfs4_string r_addr; + * }; + */ +static __be32 * +__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) +{ + p = __read_u8_opaque(p, &netaddr->r_netid); + p = __read_u8_opaque(p, &netaddr->r_addr); + + return p; +} + +/* + * struct pnfs_osd_targetaddr { + * u32 ota_available; + * struct pnfs_osd_net_addr ota_netaddr; + * }; + */ +static __be32 * +__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) +{ + u32 ota_available; + + ota_available = be32_to_cpup(p++); + targetaddr->ota_available = ota_available; + + if (ota_available) + p = __read_net_addr(p, &targetaddr->ota_netaddr); + + + return p; +} + +/* + * struct pnfs_osd_deviceaddr { + * struct pnfs_osd_targetid oda_targetid; + * struct pnfs_osd_targetaddr oda_targetaddr; + * u8 oda_lun[8]; + * struct nfs4_string oda_systemid; + * struct pnfs_osd_object_cred oda_root_obj_cred; + * struct nfs4_string 
oda_osdname; + * }; + */ + +/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does + * not have an xdr_stream + */ +static __be32 * +__read_opaque_cred(__be32 *p, + struct pnfs_osd_opaque_cred *opaque_cred) +{ + opaque_cred->cred_len = be32_to_cpu(*p++); + opaque_cred->cred = p; + return p + XDR_QUADLEN(opaque_cred->cred_len); +} + +static __be32 * +__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) +{ + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p++); + + p = __read_opaque_cred(p, &comp->oc_cap_key); + p = __read_opaque_cred(p, &comp->oc_cap); + return p; +} + +void pnfs_osd_xdr_decode_deviceaddr( + struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) +{ + p = __read_targetid(p, &deviceaddr->oda_targetid); + + p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); + + p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, + sizeof(deviceaddr->oda_lun)); + + p = __read_u8_opaque(p, &deviceaddr->oda_systemid); + + p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); + + p = __read_u8_opaque(p, &deviceaddr->oda_osdname); + + /* libosd likes this terminated in dbg. It's last, so no problems */ + deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; +} + +/* + * struct pnfs_osd_layoutupdate { + * u32 dsu_valid; + * s64 dsu_delta; + * u32 olu_ioerr_flag; + * }; xdr size 4 + 8 + 4 + */ +int +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, + struct pnfs_osd_layoutupdate *lou) +{ + __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); + + if (!p) + return -E2BIG; + + *p++ = cpu_to_be32(lou->dsu_valid); + if (lou->dsu_valid) + p = xdr_encode_hyper(p, lou->dsu_delta); + *p++ = cpu_to_be32(lou->olu_ioerr_flag); + return 0; +} + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static inline __be32 * +pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) +{ + p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, + sizeof(object_id->oid_device_id.data)); + p = xdr_encode_hyper(p, object_id->oid_partition_id); + p = xdr_encode_hyper(p, object_id->oid_object_id); + + return p; +} + +/* + * struct pnfs_osd_ioerr { + * struct pnfs_osd_objid oer_component; + * u64 oer_comp_offset; + * u64 oer_comp_length; + * u32 oer_iswrite; + * u32 oer_errno; + * }; // xdr size 32 + 24 bytes + */ +void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) +{ + p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); + p = xdr_encode_hyper(p, ioerr->oer_comp_offset); + p = xdr_encode_hyper(p, ioerr->oer_comp_length); + *p++ = cpu_to_be32(ioerr->oer_iswrite); + *p = cpu_to_be32(ioerr->oer_errno); +} + +__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 32 + 24); + if (unlikely(!p)) + dprintk("%s: out of xdr space\n", __func__); + + return p; +} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c new file mode 100644 index 00000000..00985571 --- /dev/null +++ b/fs/nfs/pagelist.c @@ -0,0 +1,453 @@ +/* + * linux/fs/nfs/pagelist.c + * + * A set of helper functions for managing NFS read and write requests. + * The main purpose of these routines is to provide support for the + * coalescing of several requests into a single RPC call. 
+ * + * Copyright 2000, 2001 (c) Trond Myklebust + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "pnfs.h" + +static struct kmem_cache *nfs_page_cachep; + +static inline struct nfs_page * +nfs_page_alloc(void) +{ + struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + if (p) + INIT_LIST_HEAD(&p->wb_list); + return p; +} + +static inline void +nfs_page_free(struct nfs_page *p) +{ + kmem_cache_free(nfs_page_cachep, p); +} + +/** + * nfs_create_request - Create an NFS read/write request. + * @file: file descriptor to use + * @inode: inode to which the request is attached + * @page: page to write + * @offset: starting offset within the page for the write + * @count: number of bytes to read/write + * + * The page must be locked by the caller. This makes sure we never + * create two different requests for the same page. + * User should ensure it is safe to sleep in this function. + */ +struct nfs_page * +nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + + /* try to allocate the request struct */ + req = nfs_page_alloc(); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + /* get lock context early so we can deal with alloc failures */ + req->wb_lock_context = nfs_get_lock_context(ctx); + if (req->wb_lock_context == NULL) { + nfs_page_free(req); + return ERR_PTR(-ENOMEM); + } + + /* Initialize the request struct. Initially, we assume a + * long write-back delay. This will be adjusted in + * update_nfs_request below if the region is not locked. */ + req->wb_page = page; + atomic_set(&req->wb_complete, 0); + req->wb_index = page->index; + page_cache_get(page); + BUG_ON(PagePrivate(page)); + BUG_ON(!PageLocked(page)); + BUG_ON(page->mapping->host != inode); + req->wb_offset = offset; + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); + kref_init(&req->wb_kref); + return req; +} + +/** + * nfs_unlock_request - Unlock request and wake up sleepers. + * @req: + */ +void nfs_unlock_request(struct nfs_page *req) +{ + if (!NFS_WBACK_BUSY(req)) { + printk(KERN_ERR "NFS: Invalid unlock attempted\n"); + BUG(); + } + smp_mb__before_clear_bit(); + clear_bit(PG_BUSY, &req->wb_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&req->wb_flags, PG_BUSY); + nfs_release_request(req); +} + +/** + * nfs_set_page_tag_locked - Tag a request as locked + * @req: + */ +int nfs_set_page_tag_locked(struct nfs_page *req) +{ + if (!nfs_lock_request_dontget(req)) + return 0; + if (test_bit(PG_MAPPED, &req->wb_flags)) + radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); + return 1; +} + +/** + * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers + */ +void nfs_clear_page_tag_locked(struct nfs_page *req) +{ + if (test_bit(PG_MAPPED, &req->wb_flags)) { + struct inode *inode = req->wb_context->path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); + nfs_unlock_request(req); + spin_unlock(&inode->i_lock); + } else + nfs_unlock_request(req); +} + +/* + * nfs_clear_request - Free up all resources allocated to the request + * @req: + * + * Release page and open context resources associated with a read/write + * request after it has completed. 
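+ * Called from nfs_free_request() once the last kref on the request is
+ * dropped; each resource pointer is cleared after it is released.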
+ */ +static void nfs_clear_request(struct nfs_page *req) +{ + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; + struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } + if (l_ctx != NULL) { + nfs_put_lock_context(l_ctx); + req->wb_lock_context = NULL; + } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } +} + + +/** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release + * + * Note: Should never be called with the spinlock held! + */ +static void nfs_free_request(struct kref *kref) +{ + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + + /* Release struct file and open context */ + nfs_clear_request(req); + nfs_page_free(req); +} + +void nfs_release_request(struct nfs_page *req) +{ + kref_put(&req->wb_kref, nfs_free_request); +} + +static int nfs_wait_bit_uninterruptible(void *word) +{ + io_schedule(); + return 0; +} + +/** + * nfs_wait_on_request - Wait for a request to complete. + * @req: request to wait upon. + * + * Interruptible by fatal signals only. + * The user is responsible for holding a count on the request. + */ +int +nfs_wait_on_request(struct nfs_page *req) +{ + return wait_on_bit(&req->wb_flags, PG_BUSY, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); +} + +bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +{ + /* + * FIXME: ideally we should be able to coalesce all requests + * that are not block boundary aligned, but currently this + * is problematic for the case of bsize < PAGE_CACHE_SIZE, + * since nfs_flush_multi and nfs_pagein_multi assume you + * can have only one struct nfs_page. + */ + if (desc->pg_bsize < PAGE_SIZE) + return 0; + + return desc->pg_count + req->wb_bytes <= desc->pg_bsize; +} +EXPORT_SYMBOL_GPL(nfs_generic_pg_test); + +/** + * nfs_pageio_init - initialise a page io descriptor + * @desc: pointer to descriptor + * @inode: pointer to inode + * @doio: pointer to io function + * @bsize: io block size + * @io_flags: extra parameters for the io function + */ +void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct nfs_pageio_descriptor *), + size_t bsize, + int io_flags) +{ + INIT_LIST_HEAD(&desc->pg_list); + desc->pg_bytes_written = 0; + desc->pg_count = 0; + desc->pg_bsize = bsize; + desc->pg_base = 0; + desc->pg_moreio = 0; + desc->pg_inode = inode; + desc->pg_doio = doio; + desc->pg_ioflags = io_flags; + desc->pg_error = 0; + desc->pg_lseg = NULL; + desc->pg_test = nfs_generic_pg_test; + pnfs_pageio_init(desc, inode); +} + +/** + * nfs_can_coalesce_requests - test two requests for compatibility + * @prev: pointer to nfs_page + * @req: pointer to nfs_page + * + * The nfs_page structures 'prev' and 'req' are compared to ensure that the + * page data area they describe is contiguous, and that their RPC + * credentials, NFSv4 open state, and lockowners are the same. + * + * Return 'true' if this is the case, else return 'false'. 
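+ * For example, two requests covering whole pages at consecutive page
+ * indices from the same open context can be coalesced; a request from
+ * a different lock owner, or one that does not start at offset 0
+ * within its page, cannot.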
+ */ +static bool nfs_can_coalesce_requests(struct nfs_page *prev, + struct nfs_page *req, + struct nfs_pageio_descriptor *pgio) +{ + if (req->wb_context->cred != prev->wb_context->cred) + return false; + if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + return false; + if (req->wb_context->state != prev->wb_context->state) + return false; + if (req->wb_index != (prev->wb_index + 1)) + return false; + if (req->wb_pgbase != 0) + return false; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return false; + return pgio->pg_test(pgio, prev, req); +} + +/** + * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list. + * @desc: destination io descriptor + * @req: request + * + * Returns true if the request 'req' was successfully coalesced into the + * existing list of pages 'desc'. + */ +static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + if (desc->pg_count != 0) { + struct nfs_page *prev; + + prev = nfs_list_entry(desc->pg_list.prev); + if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else { + desc->pg_base = req->wb_pgbase; + } + nfs_list_remove_request(req); + nfs_list_add_request(req, &desc->pg_list); + desc->pg_count += req->wb_bytes; + return 1; +} + +/* + * Helper for nfs_pageio_add_request and nfs_pageio_complete + */ +static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) +{ + if (!list_empty(&desc->pg_list)) { + int error = desc->pg_doio(desc); + if (error < 0) + desc->pg_error = error; + else + desc->pg_bytes_written += desc->pg_count; + } + if (list_empty(&desc->pg_list)) { + desc->pg_count = 0; + desc->pg_base = 0; + } +} + +/** + * nfs_pageio_add_request - Attempt to coalesce a request into a page list. + * @desc: destination io descriptor + * @req: request + * + * Returns true if the request 'req' was successfully coalesced into the + * existing list of pages 'desc'. + */ +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + while (!nfs_pageio_do_add_request(desc, req)) { + desc->pg_moreio = 1; + nfs_pageio_doio(desc); + if (desc->pg_error < 0) + return 0; + desc->pg_moreio = 0; + } + return 1; +} + +/** + * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor + * @desc: pointer to io descriptor + */ +void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) +{ + nfs_pageio_doio(desc); +} + +/** + * nfs_pageio_cond_complete - Conditional I/O completion + * @desc: pointer to io descriptor + * @index: page index + * + * It is important to ensure that processes don't try to take locks + * on non-contiguous ranges of pages as that might deadlock. This + * function should be called before attempting to wait on a locked + * nfs_page. It will complete the I/O if the page index 'index' + * is not contiguous with the existing list of pages in 'desc'. + */ +void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) +{ + if (!list_empty(&desc->pg_list)) { + struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); + if (index != prev->wb_index + 1) + nfs_pageio_doio(desc); + } +} + +#define NFS_SCAN_MAXENTRIES 16 +/** + * nfs_scan_list - Scan a list for matching requests + * @nfsi: NFS inode + * @dst: Destination list + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space + * starting at index idx_start, is scanned. + * The requests are *not* checked to ensure that they form a contiguous set. + * You must be holding the inode's i_lock when calling this function + */ +int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, + unsigned int npages, int tag) +{ + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; + pgoff_t idx_end; + int found, i; + int res; + struct list_head *list; + + res = 0; + if (npages == 0) + idx_end = ~0; + else + idx_end = idx_start + npages - 1; + + for (;;) { + found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, + (void **)&pgvec[0], idx_start, + NFS_SCAN_MAXENTRIES, tag); + if (found <= 0) + break; + for (i = 0; i < found; i++) { + req = pgvec[i]; + if (req->wb_index > idx_end) + goto out; + idx_start = req->wb_index + 1; + if (nfs_set_page_tag_locked(req)) { + kref_get(&req->wb_kref); + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + list = pnfs_choose_commit_list(req, dst); + nfs_list_add_request(req, list); + res++; + if (res == INT_MAX) + goto out; + } + } + /* for latency reduction */ + cond_resched_lock(&nfsi->vfs_inode.i_lock); + } +out: + return res; +} + +int __init nfs_init_nfspagecache(void) +{ + nfs_page_cachep = kmem_cache_create("nfs_page", + sizeof(struct nfs_page), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_page_cachep == NULL) + return -ENOMEM; + + return 0; +} + +void nfs_destroy_nfspagecache(void) +{ + kmem_cache_destroy(nfs_page_cachep); +} + diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c new file mode 100644 index 00000000..99518872 --- /dev/null +++ b/fs/nfs/pnfs.c @@ -0,0 +1,1319 @@ +/* + * pNFS functions to call and manage layout drivers. + * + * Copyright (c) 2002 [year of first publication] + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include +#include "internal.h" +#include "pnfs.h" +#include "iostat.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* Locking: + * + * pnfs_spinlock: + * protects pnfs_modules_tbl. 
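+ *
+ * Per-inode layout state (the pnfs_layout_hdr and its segments) is
+ * protected by the owning inode's i_lock; the clp->cl_layouts list is
+ * protected by the client's cl_lock.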
+ */ +static DEFINE_SPINLOCK(pnfs_spinlock); + +/* + * pnfs_modules_tbl holds all pnfs modules + */ +static LIST_HEAD(pnfs_modules_tbl); + +/* Return the registered pnfs layout driver module matching given id */ +static struct pnfs_layoutdriver_type * +find_pnfs_driver_locked(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) + if (local->id == id) + goto out; + local = NULL; +out: + dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); + return local; +} + +static struct pnfs_layoutdriver_type * +find_pnfs_driver(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + spin_lock(&pnfs_spinlock); + local = find_pnfs_driver_locked(id); + spin_unlock(&pnfs_spinlock); + return local; +} + +void +unset_pnfs_layoutdriver(struct nfs_server *nfss) +{ + if (nfss->pnfs_curr_ld) + module_put(nfss->pnfs_curr_ld->owner); + nfss->pnfs_curr_ld = NULL; +} + +/* + * Try to set the server's pnfs module to the pnfs layout type specified by id. + * Currently only one pNFS layout driver per filesystem is supported. + * + * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + */ +void +set_pnfs_layoutdriver(struct nfs_server *server, u32 id) +{ + struct pnfs_layoutdriver_type *ld_type = NULL; + + if (id == 0) + goto out_no_driver; + if (!(server->nfs_client->cl_exchange_flags & + (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { + printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, + id, server->nfs_client->cl_exchange_flags); + goto out_no_driver; + } + ld_type = find_pnfs_driver(id); + if (!ld_type) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + ld_type = find_pnfs_driver(id); + if (!ld_type) { + dprintk("%s: No pNFS module found for %u.\n", + __func__, id); + goto out_no_driver; + } + } + if (!try_module_get(ld_type->owner)) { + dprintk("%s: Could not grab reference on module\n", __func__); + goto out_no_driver; + } + server->pnfs_curr_ld = ld_type; + + dprintk("%s: pNFS module for %u set\n", __func__, id); + return; + +out_no_driver: + dprintk("%s: Using NFSv4 I/O\n", __func__); + server->pnfs_curr_ld = NULL; +} + +int +pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + int status = -EINVAL; + struct pnfs_layoutdriver_type *tmp; + + if (ld_type->id == 0) { + printk(KERN_ERR "%s id 0 is reserved\n", __func__); + return status; + } + if (!ld_type->alloc_lseg || !ld_type->free_lseg) { + printk(KERN_ERR "%s Layout driver must provide " + "alloc_lseg and free_lseg.\n", __func__); + return status; + } + + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { + list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); + status = 0; + dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, + ld_type->name); + } else { + printk(KERN_ERR "%s Module with id %d already loaded!\n", + __func__, ld_type->id); + } + spin_unlock(&pnfs_spinlock); + + return status; +} +EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); + +void +pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); + spin_lock(&pnfs_spinlock); + list_del(&ld_type->pnfs_tblid); + spin_unlock(&pnfs_spinlock); +} +EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); + +/* + * pNFS client layout cache + */ + +/* Need to hold i_lock if caller does not already hold reference */ +void +get_layout_hdr(struct pnfs_layout_hdr *lo) +{ + atomic_inc(&lo->plh_refcount); +} + +static struct pnfs_layout_hdr * 
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; + return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) : + kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); +} + +static void +pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; + put_rpccred(lo->plh_lc_cred); + return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); +} + +static void +destroy_layout_hdr(struct pnfs_layout_hdr *lo) +{ + dprintk("%s: freeing layout cache %p\n", __func__, lo); + BUG_ON(!list_empty(&lo->plh_layouts)); + NFS_I(lo->plh_inode)->layout = NULL; + pnfs_free_layout_hdr(lo); +} + +static void +put_layout_hdr_locked(struct pnfs_layout_hdr *lo) +{ + if (atomic_dec_and_test(&lo->plh_refcount)) + destroy_layout_hdr(lo); +} + +void +put_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = lo->plh_inode; + + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + destroy_layout_hdr(lo); + spin_unlock(&inode->i_lock); + } +} + +static void +init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +{ + INIT_LIST_HEAD(&lseg->pls_list); + INIT_LIST_HEAD(&lseg->pls_lc_list); + atomic_set(&lseg->pls_refcount, 1); + smp_mb(); + set_bit(NFS_LSEG_VALID, &lseg->pls_flags); + lseg->pls_layout = lo; +} + +static void free_lseg(struct pnfs_layout_segment *lseg) +{ + struct inode *ino = lseg->pls_layout->plh_inode; + + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + /* Matched by get_layout_hdr in pnfs_insert_layout */ + put_layout_hdr(NFS_I(ino)->layout); +} + +static void +put_lseg_common(struct pnfs_layout_segment *lseg) +{ + struct inode *inode = lseg->pls_layout->plh_inode; + + WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + list_del_init(&lseg->pls_list); + if (list_empty(&lseg->pls_layout->plh_segs)) { + set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); + /* Matched by initial refcount set in alloc_init_layout_hdr */ + put_layout_hdr_locked(lseg->pls_layout); + } + rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +void +put_lseg(struct pnfs_layout_segment *lseg) +{ + struct inode *inode; + + if (!lseg) + return; + + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + inode = lseg->pls_layout->plh_inode; + if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + LIST_HEAD(free_me); + + put_lseg_common(lseg); + list_add(&lseg->pls_list, &free_me); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&free_me); + } +} +EXPORT_SYMBOL_GPL(put_lseg); + +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1 : NFS4_MAX_UINT64; +} + +/* + * is l2 fully contained in l1? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline int +lo_seg_contained(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (start1 <= start2) && (end1 >= end2); +} + +/* + * is l1 and l2 intersecting? 
+ * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline int +lo_seg_intersecting(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (end1 == NFS4_MAX_UINT64 || end1 > start2) && + (end2 == NFS4_MAX_UINT64 || end2 > start1); +} + +static bool +should_free_lseg(struct pnfs_layout_range *lseg_range, + struct pnfs_layout_range *recall_range) +{ + return (recall_range->iomode == IOMODE_ANY || + lseg_range->iomode == recall_range->iomode) && + lo_seg_intersecting(lseg_range, recall_range); +} + +/* Returns 1 if lseg is removed from list, 0 otherwise */ +static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + int rv = 0; + + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + /* Remove the reference keeping the lseg in the + * list. It will now be removed when all + * outstanding io is finished. + */ + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + put_lseg_common(lseg); + list_add(&lseg->pls_list, tmp_list); + rv = 1; + } + } + return rv; +} + +/* Returns count of number of matching invalid lsegs remaining in list + * after call. + */ +int +mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *recall_range) +{ + struct pnfs_layout_segment *lseg, *next; + int invalid = 0, removed = 0; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + if (list_empty(&lo->plh_segs)) { + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) + put_layout_hdr_locked(lo); + return 0; + } + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + if (!recall_range || + should_free_lseg(&lseg->pls_range, recall_range)) { + dprintk("%s: freeing lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->pls_range.iomode, lseg->pls_range.offset, + lseg->pls_range.length); + invalid++; + removed += mark_lseg_invalid(lseg, tmp_list); + } + dprintk("%s:Return %i\n", __func__, invalid - removed); + return invalid - removed; +} + +/* note free_me must contain lsegs from a single layout_hdr */ +void +pnfs_free_lseg_list(struct list_head *free_me) +{ + struct pnfs_layout_segment *lseg, *tmp; + struct pnfs_layout_hdr *lo; + + if (list_empty(free_me)) + return; + + lo = list_first_entry(free_me, struct pnfs_layout_segment, + pls_list)->pls_layout; + + if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { + struct nfs_client *clp; + + clp = NFS_SERVER(lo->plh_inode)->nfs_client; + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { + list_del(&lseg->pls_list); + free_lseg(lseg); + } +} + +void +pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + } + spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. 
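+ * The layout headers are spliced off clp->cl_layouts under cl_lock and
+ * then torn down one inode at a time via pnfs_destroy_layout().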
+ */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&clp->cl_lock); + list_splice_init(&clp->cl_layouts, &tmp_list); + spin_unlock(&clp->cl_lock); + + while (!list_empty(&tmp_list)) { + lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, + plh_layouts); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->plh_inode->i_ino); + list_del_init(&lo->plh_layouts); + pnfs_destroy_layout(NFS_I(lo->plh_inode)); + } +} + +/* update lo->plh_stateid with new if is more recent */ +void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + bool update_barrier) +{ + u32 oldseq, newseq; + + oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); + newseq = be32_to_cpu(new->stateid.seqid); + if ((int)(newseq - oldseq) > 0) { + memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); + if (update_barrier) { + u32 new_barrier = be32_to_cpu(new->stateid.seqid); + + if ((int)(new_barrier - lo->plh_barrier)) + lo->plh_barrier = new_barrier; + } else { + /* Because of wraparound, we want to keep the barrier + * "close" to the current seqids. It needs to be + * within 2**31 to count as "behind", so if it + * gets too near that limit, give us a litle leeway + * and bring it to within 2**30. + * NOTE - and yes, this is all unsigned arithmetic. + */ + if (unlikely((newseq - lo->plh_barrier) > (3 << 29))) + lo->plh_barrier = newseq - (1 << 30); + } + } +} + +/* lget is set to 1 if called from inside send_layoutget call chain */ +static bool +pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, + int lget) +{ + if ((stateid) && + (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) + return true; + return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + (list_empty(&lo->plh_segs) && + (atomic_read(&lo->plh_outstanding) > lget)); +} + +int +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state) +{ + int status = 0; + + dprintk("--> %s\n", __func__); + spin_lock(&lo->plh_inode->i_lock); + if (pnfs_layoutgets_blocked(lo, NULL, 1)) { + status = -EAGAIN; + } else if (list_empty(&lo->plh_segs)) { + int seq; + + do { + seq = read_seqbegin(&open_state->seqlock); + memcpy(dst->data, open_state->stateid.data, + sizeof(open_state->stateid.data)); + } while (read_seqretry(&open_state->seqlock, seq)); + } else + memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data)); + spin_unlock(&lo->plh_inode->i_lock); + dprintk("<-- %s\n", __func__); + return status; +} + +/* +* Get layout from server. +* for now, assume that whole file layouts are requested. 
+* arg->offset: 0 +* arg->length: all ones +*/ +static struct pnfs_layout_segment * +send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_open_context *ctx, + struct pnfs_layout_range *range, + gfp_t gfp_flags) +{ + struct inode *ino = lo->plh_inode; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget *lgp; + struct pnfs_layout_segment *lseg = NULL; + struct page **pages = NULL; + int i; + u32 max_resp_sz, max_pages; + + dprintk("--> %s\n", __func__); + + BUG_ON(ctx == NULL); + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return NULL; + + /* allocate pages for xdr post processing */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + + pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); + if (!pages) + goto out_err_free; + + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) + goto out_err_free; + } + + lgp->args.minlength = PAGE_CACHE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + lgp->args.range = *range; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + lgp->args.layout.pages = pages; + lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->lsegpp = &lseg; + lgp->gfp_flags = gfp_flags; + + /* Synchronously retrieve layout information from server and + * store in lseg. + */ + nfs4_proc_layoutget(lgp); + if (!lseg) { + /* remember that LAYOUTGET failed and suspend trying */ + set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); + } + + /* free xdr pages */ + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + + return lseg; + +out_err_free: + /* free any allocated xdr pages, lgp as it's not used */ + if (pages) { + for (i = 0; i < max_pages; i++) { + if (!pages[i]) + break; + __free_page(pages[i]); + } + kfree(pages); + } + kfree(lgp); + return NULL; +} + +/* Initiates a LAYOUTRETURN(FILE) */ +int +_pnfs_return_layout(struct inode *ino) +{ + struct pnfs_layout_hdr *lo = NULL; + struct nfs_inode *nfsi = NFS_I(ino); + LIST_HEAD(tmp_list); + struct nfs4_layoutreturn *lrp; + nfs4_stateid stateid; + int status = 0; + + dprintk("--> %s\n", __func__); + + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (!lo) { + spin_unlock(&ino->i_lock); + dprintk("%s: no layout to return\n", __func__); + return status; + } + stateid = nfsi->layout->plh_stateid; + /* Reference matched in nfs4_layoutreturn_release */ + get_layout_hdr(lo); + mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + lo->plh_block_lgets++; + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + + WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); + + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); + if (unlikely(lrp == NULL)) { + status = -ENOMEM; + set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); + set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); + put_layout_hdr(lo); + goto out; + } + + lrp->args.stateid = stateid; + lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; + lrp->args.inode = ino; + lrp->clp = NFS_SERVER(ino)->nfs_client; + + status = nfs4_proc_layoutreturn(lrp); +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +} + +bool pnfs_roc(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg, *tmp; + LIST_HEAD(tmp_list); + bool found = false; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, 
&lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + goto out_nolayout; + list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + mark_lseg_invalid(lseg, &tmp_list); + found = true; + } + if (!found) + goto out_nolayout; + lo->plh_block_lgets++; + get_layout_hdr(lo); /* matched in pnfs_roc_release */ + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + return true; + +out_nolayout: + spin_unlock(&ino->i_lock); + return false; +} + +void pnfs_roc_release(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + lo->plh_block_lgets--; + put_layout_hdr_locked(lo); + spin_unlock(&ino->i_lock); +} + +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if ((int)(barrier - lo->plh_barrier) > 0) + lo->plh_barrier = barrier; + spin_unlock(&ino->i_lock); +} + +bool pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_segment *lseg; + bool found = false; + + spin_lock(&ino->i_lock); + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + found = true; + break; + } + if (!found) { + struct pnfs_layout_hdr *lo = nfsi->layout; + u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid); + + /* Since close does not return a layout stateid for use as + * a barrier, we choose the worst-case barrier. + */ + *barrier = current_seqid + atomic_read(&lo->plh_outstanding); + } + spin_unlock(&ino->i_lock); + return found; +} + +/* + * Compare two layout segments for sorting into layout cache. + * We want to preferentially return RW over RO layouts, so ensure those + * are seen first. 
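+ * The resulting order is: ascending offset, then longer segments before
+ * shorter ones, then RW before READ; pnfs_insert_layout() walks the list
+ * and places a new segment in front of the first entry it does not sort
+ * after.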
+ */ +static s64 +cmp_layout(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + s64 d; + + /* high offset > low offset */ + d = l1->offset - l2->offset; + if (d) + return d; + + /* short length > long length */ + d = l2->length - l1->length; + if (d) + return d; + + /* read > read/write */ + return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); +} + +static void +pnfs_insert_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *lp; + + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lp, &lo->plh_segs, pls_list) { + if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) + continue; + list_add_tail(&lseg->pls_list, &lp->pls_list); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu before " + "lp %p iomode %d offset %llu length %llu\n", + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length, + lp, lp->pls_range.iomode, lp->pls_range.offset, + lp->pls_range.length); + goto out; + } + list_add_tail(&lseg->pls_list, &lo->plh_segs); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length); +out: + get_layout_hdr(lo); + + dprintk("%s:Return\n", __func__); +} + +static struct pnfs_layout_hdr * +alloc_init_layout_hdr(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) +{ + struct pnfs_layout_hdr *lo; + + lo = pnfs_alloc_layout_hdr(ino, gfp_flags); + if (!lo) + return NULL; + atomic_set(&lo->plh_refcount, 1); + INIT_LIST_HEAD(&lo->plh_layouts); + INIT_LIST_HEAD(&lo->plh_segs); + INIT_LIST_HEAD(&lo->plh_bulk_recall); + lo->plh_inode = ino; + lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); + return lo; +} + +static struct pnfs_layout_hdr * +pnfs_find_alloc_layout(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *new = NULL; + + dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); + + assert_spin_locked(&ino->i_lock); + if (nfsi->layout) { + if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) + return NULL; + else + return nfsi->layout; + } + spin_unlock(&ino->i_lock); + new = alloc_init_layout_hdr(ino, ctx, gfp_flags); + spin_lock(&ino->i_lock); + + if (likely(nfsi->layout == NULL)) /* Won the race? 
*/ + nfsi->layout = new; + else + pnfs_free_layout_hdr(new); + return nfsi->layout; +} + +/* + * iomode matching rules: + * iomode lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true + * RW READ false + * RW RW true + * READ READ true + * READ RW true + */ +static int +is_matching_lseg(struct pnfs_layout_range *ls_range, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_range range1; + + if ((range->iomode == IOMODE_RW && + ls_range->iomode != IOMODE_RW) || + !lo_seg_intersecting(ls_range, range)) + return 0; + + /* range1 covers only the first byte in the range */ + range1 = *range; + range1.length = 1; + return lo_seg_contained(ls_range, &range1); +} + +/* + * lookup range in layout + */ +static struct pnfs_layout_segment * +pnfs_find_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_segment *lseg, *ret = NULL; + + dprintk("%s:Begin\n", __func__); + + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && + is_matching_lseg(&lseg->pls_range, range)) { + ret = get_lseg(lseg); + break; + } + if (lseg->pls_range.offset > range->offset) + break; + } + + dprintk("%s:Return lseg %p ref %d\n", + __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); + return ret; +} + +/* + * Layout segment is retreived from the server if not cached. + * The appropriate layout segment is referenced and returned to the caller. + */ +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + gfp_t gfp_flags) +{ + struct pnfs_layout_range arg = { + .iomode = iomode, + .offset = pos, + .length = count, + }; + unsigned pg_offset; + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg = NULL; + bool first = false; + + if (!pnfs_enabled_sb(NFS_SERVER(ino))) + return NULL; + spin_lock(&ino->i_lock); + lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); + if (lo == NULL) { + dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); + goto out_unlock; + } + + /* Do we even need to bother with this? */ + if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s matches recall, use MDS\n", __func__); + goto out_unlock; + } + + /* if LAYOUTGET already failed once we don't try again */ + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) + goto out_unlock; + + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_find_lseg(lo, &arg); + if (lseg) + goto out_unlock; + + if (pnfs_layoutgets_blocked(lo, NULL, 0)) + goto out_unlock; + atomic_inc(&lo->plh_outstanding); + + get_layout_hdr(lo); + if (list_empty(&lo->plh_segs)) + first = true; + spin_unlock(&ino->i_lock); + if (first) { + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. 
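+ * If the subsequent LAYOUTGET fails, the header is removed from the
+ * list again further below.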
+ */ + spin_lock(&clp->cl_lock); + BUG_ON(!list_empty(&lo->plh_layouts)); + list_add_tail(&lo->plh_layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } + + pg_offset = arg.offset & ~PAGE_CACHE_MASK; + if (pg_offset) { + arg.offset -= pg_offset; + arg.length += pg_offset; + } + if (arg.length != NFS4_MAX_UINT64) + arg.length = PAGE_CACHE_ALIGN(arg.length); + + lseg = send_layoutget(lo, ctx, &arg, gfp_flags); + if (!lseg && first) { + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } + atomic_dec(&lo->plh_outstanding); + put_layout_hdr(lo); +out: + dprintk("%s end, state 0x%lx lseg %p\n", __func__, + nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); + return lseg; +out_unlock: + spin_unlock(&ino->i_lock); + goto out; +} + +int +pnfs_layout_process(struct nfs4_layoutget *lgp) +{ + struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct nfs4_layoutget_res *res = &lgp->res; + struct pnfs_layout_segment *lseg; + struct inode *ino = lo->plh_inode; + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + int status = 0; + + /* Inject layout blob into I/O device driver */ + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); + if (!lseg || IS_ERR(lseg)) { + if (!lseg) + status = -ENOMEM; + else + status = PTR_ERR(lseg); + dprintk("%s: Could not allocate layout: error %d\n", + __func__, status); + goto out; + } + + spin_lock(&ino->i_lock); + if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s forget reply due to recall\n", __func__); + goto out_forget_reply; + } + + if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { + dprintk("%s forget reply due to state\n", __func__); + goto out_forget_reply; + } + init_lseg(lo, lseg); + lseg->pls_range = res->range; + *lgp->lsegpp = get_lseg(lseg); + pnfs_insert_layout(lo, lseg); + + if (res->return_on_close) { + set_bit(NFS_LSEG_ROC, &lseg->pls_flags); + set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); + } + + /* Done processing layoutget. Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid, false); + spin_unlock(&ino->i_lock); +out: + return status; + +out_forget_reply: + spin_unlock(&ino->i_lock); + lseg->pls_layout = lo; + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + goto out; +} + +bool +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + enum pnfs_iomode access_type; + gfp_t gfp_flags; + + /* We assume that pg_ioflags == 0 iff we're reading a page */ + if (pgio->pg_ioflags == 0) { + access_type = IOMODE_READ; + gfp_flags = GFP_KERNEL; + } else { + access_type = IOMODE_RW; + gfp_flags = GFP_NOFS; + } + + if (pgio->pg_lseg == NULL) { + if (pgio->pg_count != prev->wb_bytes) + return true; + /* This is first coelesce call for a series of nfs_pages */ + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + prev->wb_context, + req_offset(prev), + pgio->pg_count, + access_type, + gfp_flags); + if (pgio->pg_lseg == NULL) + return true; + } + + /* + * Test if a nfs_page is fully contained in the pnfs_layout_range. + * Note that this test makes several assumptions: + * - that the previous nfs_page in the struct nfs_pageio_descriptor + * is known to lie within the range. + * - that the nfs_page being tested is known to be contiguous with the + * previous nfs_page. + * - Layout ranges are page aligned, so we only have to test the + * start offset of the request. 
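+ * For example (with purely illustrative numbers): given a layout range
+ * covering [0, 1MB), a request starting at offset 512K passes this
+ * test, while one starting at offset 1MB does not.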
+ * + * Please also note that 'end_offset' is actually the offset of the + * first byte that lies outside the pnfs_layout_range. FIXME? + * + */ + return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); + +/* + * Called by non rpc-based layout drivers + */ +int +pnfs_ld_write_done(struct nfs_write_data *data) +{ + int status; + + if (!data->pnfs_error) { + pnfs_set_layoutcommit(data); + data->mds_ops->rpc_call_done(&data->task, data); + data->mds_ops->rpc_release(data); + return 0; + } + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + /* Don't lo_commit on error, Server will needs to + * preform a file recovery. + */ + clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(data->inode)->flags); + pnfs_return_layout(data->inode); + } + + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, + data->pnfs_error); + status = nfs_initiate_write(data, NFS_CLIENT(data->inode), + data->mds_ops, NFS_FILE_SYNC); + return status ? : -EAGAIN; +} +EXPORT_SYMBOL_GPL(pnfs_ld_write_done); + +enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *wdata, + const struct rpc_call_ops *call_ops, int how) +{ + struct inode *inode = wdata->inode; + enum pnfs_try_status trypnfs; + struct nfs_server *nfss = NFS_SERVER(inode); + + wdata->mds_ops = call_ops; + + dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + inode->i_ino, wdata->args.count, wdata->args.offset, how); + + trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(wdata->lseg); + wdata->lseg = NULL; + } else + nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); + + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +/* + * Called by non rpc-based layout drivers + */ +int +pnfs_ld_read_done(struct nfs_read_data *data) +{ + int status; + + if (!data->pnfs_error) { + __nfs4_read_done_cb(data); + data->mds_ops->rpc_call_done(&data->task, data); + data->mds_ops->rpc_release(data); + return 0; + } + + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) + pnfs_return_layout(data->inode); + + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, + data->pnfs_error); + status = nfs_initiate_read(data, NFS_CLIENT(data->inode), + data->mds_ops); + return status ? : -EAGAIN; +} +EXPORT_SYMBOL_GPL(pnfs_ld_read_done); + +/* + * Call the appropriate parallel I/O subsystem read function. + */ +enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *rdata, + const struct rpc_call_ops *call_ops) +{ + struct inode *inode = rdata->inode; + struct nfs_server *nfss = NFS_SERVER(inode); + enum pnfs_try_status trypnfs; + + rdata->mds_ops = call_ops; + + dprintk("%s: Reading ino:%lu %u@%llu\n", + __func__, inode->i_ino, rdata->args.count, rdata->args.offset); + + trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(rdata->lseg); + rdata->lseg = NULL; + } else { + nfs_inc_stats(inode, NFSIOS_PNFS_READ); + } + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +/* + * There can be multiple RW segments. 
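+ * Collect every RW segment marked NFS_LSEG_LAYOUTCOMMIT onto @listp;
+ * the references taken in pnfs_set_layoutcommit() are dropped again in
+ * nfs4_layoutcommit_release().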
+ */ +static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { + if (lseg->pls_range.iomode == IOMODE_RW && + test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + list_add(&lseg->pls_lc_list, listp); + } +} + +void +pnfs_set_layoutcommit(struct nfs_write_data *wdata) +{ + struct nfs_inode *nfsi = NFS_I(wdata->inode); + loff_t end_pos = wdata->mds_offset + wdata->res.count; + bool mark_as_dirty = false; + + spin_lock(&nfsi->vfs_inode.i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + mark_as_dirty = true; + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, wdata->inode->i_ino); + } + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) { + /* references matched in nfs4_layoutcommit_release */ + get_lseg(wdata->lseg); + } + if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + spin_unlock(&nfsi->vfs_inode.i_lock); + dprintk("%s: lseg %p end_pos %llu\n", + __func__, wdata->lseg, nfsi->layout->plh_lwb); + + /* if pnfs_layoutcommit_inode() runs between inode locks, the next one + * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ + if (mark_as_dirty) + mark_inode_dirty_sync(wdata->inode); +} +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); + +/* + * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and + * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough + * data to disk to allow the server to recover the data if it crashes. + * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag + * is off, and a COMMIT is sent to a data server, or + * if WRITEs to a data server return NFS_DATA_SYNC. + */ +int +pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + struct nfs4_layoutcommit_data *data; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t end_pos; + int status = 0; + + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return 0; + + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + mark_inode_dirty_sync(inode); + status = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&data->lseg_list); + spin_lock(&inode->i_lock); + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + spin_unlock(&inode->i_lock); + kfree(data); + goto out; + } + + pnfs_list_write_lseg(inode, &data->lseg_list); + + end_pos = nfsi->layout->plh_lwb; + nfsi->layout->plh_lwb = 0; + + memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, + sizeof(nfsi->layout->plh_stateid.data)); + spin_unlock(&inode->i_lock); + + data->args.inode = inode; + data->cred = get_rpccred(nfsi->layout->plh_lc_cred); + nfs_fattr_init(&data->fattr); + data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + data->res.fattr = &data->fattr; + data->args.lastbytewritten = end_pos - 1; + data->res.server = NFS_SERVER(inode); + + status = nfs4_proc_layoutcommit(data, sync); +out: + dprintk("<-- %s status %d\n", __func__, status); + return status; +} diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h new file mode 100644 index 00000000..bb8b3247 --- /dev/null +++ b/fs/nfs/pnfs.h @@ -0,0 +1,427 @@ +/* + * pNFS client data structures. 
+ * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_PNFS_H +#define FS_NFS_PNFS_H + +#include +#include + +enum { + NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ + NFS_LSEG_ROC, /* roc bit received from server */ + NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ +}; + +struct pnfs_layout_segment { + struct list_head pls_list; + struct list_head pls_lc_list; + struct pnfs_layout_range pls_range; + atomic_t pls_refcount; + unsigned long pls_flags; + struct pnfs_layout_hdr *pls_layout; +}; + +enum pnfs_try_status { + PNFS_ATTEMPTED = 0, + PNFS_NOT_ATTEMPTED = 1, +}; + +#ifdef CONFIG_NFS_V4_1 + +#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" + +enum { + NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ + NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ + NFS_LAYOUT_ROC, /* some lseg had roc bit set */ + NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ +}; + +enum layoutdriver_policy_flags { + /* Should the pNFS client commit and return the layout upon a setattr */ + PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, + PNFS_LAYOUTRET_ON_ERROR = 1 << 1, +}; + +struct nfs4_deviceid_node; + +/* Per-layout driver specific registration structure */ +struct pnfs_layoutdriver_type { + struct list_head pnfs_tblid; + const u32 id; + const char *name; + struct module *owner; + unsigned flags; + + struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); + void (*free_layout_hdr) (struct pnfs_layout_hdr *); + + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + void (*free_lseg) (struct pnfs_layout_segment *lseg); + + /* test for nfs page cache coalescing */ + bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + + /* Returns true if layoutdriver wants to divert this request to + * driver's commit routine. 
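+ * If it does, choose_commit_list() and commit_pagelist() below are
+ * then used to queue and submit those requests instead of the common
+ * MDS commit list.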
+ */ + bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg); + struct list_head * (*choose_commit_list) (struct nfs_page *req); + int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how); + + /* + * Return PNFS_ATTEMPTED to indicate the layout code has attempted + * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS + */ + enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); + + void (*free_deviceid_node) (struct nfs4_deviceid_node *); + + void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args); + + void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args); +}; + +struct pnfs_layout_hdr { + atomic_t plh_refcount; + struct list_head plh_layouts; /* other client layouts */ + struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ + struct list_head plh_segs; /* layout segments list */ + nfs4_stateid plh_stateid; + atomic_t plh_outstanding; /* number of RPCs out */ + unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ + u32 plh_barrier; /* ignore lower seqids */ + unsigned long plh_flags; + loff_t plh_lwb; /* last write byte for layoutcommit */ + struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ + struct inode *plh_inode; +}; + +struct pnfs_device { + struct nfs4_deviceid dev_id; + unsigned int layout_type; + unsigned int mincount; + struct page **pages; + unsigned int pgbase; + unsigned int pglen; +}; + +extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); +extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + +/* nfs4proc.c */ +extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev); +extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); + +/* pnfs.c */ +void get_layout_hdr(struct pnfs_layout_hdr *lo); +void put_lseg(struct pnfs_layout_segment *lseg); +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + loff_t pos, u64 count, enum pnfs_iomode access_type, + gfp_t gfp_flags); +void set_pnfs_layoutdriver(struct nfs_server *, u32 id); +void unset_pnfs_layoutdriver(struct nfs_server *); +enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, + const struct rpc_call_ops *, int); +enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, + const struct rpc_call_ops *); +bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +int pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_free_lseg_list(struct list_head *tmp_list); +void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_all_layouts(struct nfs_client *); +void put_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + bool update_barrier); +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, + struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); +int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *recall_range); +bool pnfs_roc(struct inode *ino); +void pnfs_roc_release(struct inode *ino); +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); +bool pnfs_roc_drain(struct inode *ino, u32 
*barrier); +void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int _pnfs_return_layout(struct inode *); +int pnfs_ld_write_done(struct nfs_write_data *); +int pnfs_ld_read_done(struct nfs_read_data *); + +/* pnfs_dev.c */ +struct nfs4_deviceid_node { + struct hlist_node node; + struct hlist_node tmpnode; + const struct pnfs_layoutdriver_type *ld; + const struct nfs_client *nfs_client; + struct nfs4_deviceid deviceid; + atomic_t ref; +}; + +void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); +struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, + const struct pnfs_layoutdriver_type *, + const struct nfs_client *, + const struct nfs4_deviceid *); +struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); +bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_deviceid_purge_client(const struct nfs_client *); + +static inline int lo_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? + NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +} + +static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + if (lseg) { + atomic_inc(&lseg->pls_refcount); + smp_mb__after_atomic_inc(); + } + return lseg; +} + +/* Return true if a layout driver is being used for this mountpoint */ +static inline int pnfs_enabled_sb(struct nfs_server *nfss) +{ + return nfss->pnfs_curr_ld != NULL; +} + +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ + if (lseg) { + struct pnfs_layoutdriver_type *ld; + + ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld; + if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) { + set_bit(PG_PNFS_COMMIT, &req->wb_flags); + req->wb_commit_lseg = get_lseg(lseg); + } + } +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags)) + return PNFS_NOT_ATTEMPTED; + return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how); +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + struct list_head *rv; + + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) { + struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode; + + set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); + rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req); + /* matched by ref taken when PG_PNFS_COMMIT is set */ + put_lseg(req->wb_commit_lseg); + } else + rv = mds; + return rv; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) + put_lseg(req->wb_commit_lseg); +} + +/* Should the pNFS client commit and return the layout upon a setattr */ +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_SETATTR; +} + +static inline int pnfs_return_layout(struct inode *ino) +{ 
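+ /* Issue a LAYOUTRETURN only if a layout driver is active and the inode actually holds a layout */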
+ struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_server *nfss = NFS_SERVER(ino); + + if (pnfs_enabled_sb(nfss) && nfsi->layout) + return _pnfs_return_layout(ino); + + return 0; +} + +static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld) + pgio->pg_test = ld->pg_test; +} + +#else /* CONFIG_NFS_V4_1 */ + +static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +{ +} + +static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ +} + +static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + return NULL; +} + +static inline void put_lseg(struct pnfs_layout_segment *lseg) +{ +} + +static inline struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + loff_t pos, u64 count, enum pnfs_iomode access_type, + gfp_t gfp_flags) +{ + return NULL; +} + +static inline enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *data, + const struct rpc_call_ops *call_ops) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline int pnfs_return_layout(struct inode *ino) +{ + return 0; +} + +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + return false; +} + +static inline bool +pnfs_roc(struct inode *ino) +{ + return false; +} + +static inline void +pnfs_roc_release(struct inode *ino) +{ +} + +static inline void +pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ +} + +static inline bool +pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + return false; +} + +static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) +{ +} + +static inline void unset_pnfs_layoutdriver(struct nfs_server *s) +{ +} + +static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ +} + +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + return mds; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ +} + +static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + return 0; +} + +static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c new file mode 100644 index 00000000..f0f8e1e2 --- /dev/null +++ b/fs/nfs/pnfs_dev.c @@ -0,0 +1,277 @@ +/* + * Device operations for the pnfs client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand + * Garth Goodson + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. 
If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* + * Device ID RCU cache. A device ID is unique per server and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS 5 +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) + +static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfs4_deviceid_lock); + +void +nfs4_print_deviceid(const struct nfs4_deviceid *id) +{ + u32 *p = (u32 *)id; + + dprintk("%s: device id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); +} +EXPORT_SYMBOL_GPL(nfs4_print_deviceid); + +static inline u32 +nfs4_deviceid_hash(const struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_DEVICE_ID_HASH_MASK; +} + +static struct nfs4_deviceid_node * +_lookup_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id, + long hash) +{ + struct nfs4_deviceid_node *d; + struct hlist_node *n; + + hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + if (d->ld == ld && d->nfs_client == clp && + !memcmp(&d->deviceid, id, sizeof(*id))) { + if (atomic_read(&d->ref)) + return d; + else + continue; + } + return NULL; +} + +/* + * Lookup a deviceid in cache and get a reference count on it if found + * + * @clp nfs_client associated with deviceid + * @id deviceid to look up + */ +struct nfs4_deviceid_node * +_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id, + long hash) +{ + struct nfs4_deviceid_node *d; + + rcu_read_lock(); + d = _lookup_deviceid(ld, clp, id, hash); + if (d && !atomic_inc_not_zero(&d->ref)) + d = NULL; + rcu_read_unlock(); + return d; +} + +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); +} +EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); + +/* + * Unhash and put deviceid + * + * @clp nfs_client associated with deviceid + * @id the deviceid to unhash + * + * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. 
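+ * If a node is returned, the caller must free it via the layout driver's ->free_deviceid_node().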
+ */ +struct nfs4_deviceid_node * +nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + struct nfs4_deviceid_node *d; + + spin_lock(&nfs4_deviceid_lock); + rcu_read_lock(); + d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); + rcu_read_unlock(); + if (!d) { + spin_unlock(&nfs4_deviceid_lock); + return NULL; + } + hlist_del_init_rcu(&d->node); + spin_unlock(&nfs4_deviceid_lock); + synchronize_rcu(); + + /* balance the initial ref set in pnfs_insert_deviceid */ + if (atomic_dec_and_test(&d->ref)) + return d; + + return NULL; +} +EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid); + +/* + * Delete a deviceid from cache + * + * @clp struct nfs_client qualifying the deviceid + * @id deviceid to delete + */ +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + struct nfs4_deviceid_node *d; + + d = nfs4_unhash_put_deviceid(ld, clp, id); + if (!d) + return; + d->ld->free_deviceid_node(d); +} +EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); + +void +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, + const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *nfs_client, + const struct nfs4_deviceid *id) +{ + INIT_HLIST_NODE(&d->node); + INIT_HLIST_NODE(&d->tmpnode); + d->ld = ld; + d->nfs_client = nfs_client; + d->deviceid = *id; + atomic_set(&d->ref, 1); +} +EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); + +/* + * Uniquely initialize and insert a deviceid node into cache + * + * @new new deviceid node + * Note that the caller must set up the following members: + * new->ld + * new->nfs_client + * new->deviceid + * + * @ret the inserted node, if none found, otherwise, the found entry. + */ +struct nfs4_deviceid_node * +nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) +{ + struct nfs4_deviceid_node *d; + long hash; + + spin_lock(&nfs4_deviceid_lock); + hash = nfs4_deviceid_hash(&new->deviceid); + d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash); + if (d) { + spin_unlock(&nfs4_deviceid_lock); + return d; + } + + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + spin_unlock(&nfs4_deviceid_lock); + atomic_inc(&new->ref); + + return new; +} +EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); + +/* + * Dereference a deviceid node and delete it when its reference count drops + * to zero. 
+ * + * @d deviceid node to put + * + * @ret true iff the node was deleted + */ +bool +nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) +{ + if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) + return false; + hlist_del_init_rcu(&d->node); + spin_unlock(&nfs4_deviceid_lock); + synchronize_rcu(); + d->ld->free_deviceid_node(d); + return true; +} +EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); + +static void +_deviceid_purge_client(const struct nfs_client *clp, long hash) +{ + struct nfs4_deviceid_node *d; + struct hlist_node *n; + HLIST_HEAD(tmp); + + spin_lock(&nfs4_deviceid_lock); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + if (d->nfs_client == clp && atomic_read(&d->ref)) { + hlist_del_init_rcu(&d->node); + hlist_add_head(&d->tmpnode, &tmp); + } + rcu_read_unlock(); + spin_unlock(&nfs4_deviceid_lock); + + if (hlist_empty(&tmp)) + return; + + synchronize_rcu(); + while (!hlist_empty(&tmp)) { + d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); + hlist_del(&d->tmpnode); + if (atomic_dec_and_test(&d->ref)) + d->ld->free_deviceid_node(d); + } +} + +void +nfs4_deviceid_purge_client(const struct nfs_client *clp) +{ + long h; + + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) + return; + for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) + _deviceid_purge_client(clp, h); +} diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c new file mode 100644 index 00000000..f48125da --- /dev/null +++ b/fs/nfs/proc.c @@ -0,0 +1,746 @@ +/* + * linux/fs/nfs/proc.c + * + * Copyright (C) 1992, 1993, 1994 Rick Sladkey + * + * OS-independent nfs remote procedure call functions + * + * Tuned by Alan Cox for >3K buffers + * so at last we can have decent(ish) throughput off a + * Sun server. + * + * Coding optimized and cleaned up by Florian La Roche. + * Note: Error returns are optimized for NFS_OK, which isn't translated via + * nfs_stat_to_errno(), but happens to be already the right return code. + * + * Also, the code currently doesn't check the size of the packet, when + * it decodes the packet. + * + * Feel free to fix it and mail me the diffs if it worries you. + * + * Completely rewritten to support the new RPC call interface; + * rewrote and moved the entire XDR stuff to xdr.c + * --Olaf Kirch June 1996 + * + * The code below initializes all auto variables explicitly, otherwise + * it will fail to work as a module (gcc generates a memset call for an + * incomplete struct). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +/* + * wrapper to handle the -EKEYEXPIRED error message. This should generally + * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't + * support the NFSERR_JUKEBOX error code, but we handle this situation in the + * same way that we handle that error with NFSv3. 
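+ * The call is retried, with a delay of NFS_JUKEBOX_RETRY_TIME between attempts, until it stops failing with -EKEYEXPIRED or a fatal signal is pending.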
+ */ +static int +nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + int res; + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EKEYEXPIRED) + break; + schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!fatal_signal_pending(current)); + return res; +} + +#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags) + +static int +nfs_async_handle_expired_key(struct rpc_task *task) +{ + if (task->tk_status != -EKEYEXPIRED) + return 0; + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + +/* + * Bare-bones access to getattr: this is for nfs_read_super. + */ +static int +nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs_fattr *fattr = info->fattr; + struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("%s: call getattr\n", __func__); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + dprintk("%s: reply getattr: %d\n", __func__, status); + if (status) + return status; + dprintk("%s: call statfs\n", __func__); + msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; + msg.rpc_resp = &fsinfo; + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + dprintk("%s: reply statfs: %d\n", __func__, status); + if (status) + return status; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->lease_time = 0; + return 0; +} + +/* + * One function for each procedure in the NFS protocol. 
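+ * Each helper below fills in a struct rpc_message and issues it with rpc_call_sync(), which is redefined above to retry on -EKEYEXPIRED.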
+ */ +static int +nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call getattr\n"); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply getattr: %d\n", status); + return status; +} + +static int +nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = dentry->d_inode; + struct nfs_sattrargs arg = { + .fh = NFS_FH(inode), + .sattr = sattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_SETATTR], + .rpc_argp = &arg, + .rpc_resp = fattr, + }; + int status; + + /* Mask out the non-modebit related stuff from attr->ia_mode */ + sattr->ia_mode &= S_IALLUGO; + + dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (status == 0) + nfs_setattr_update_inode(inode, sattr); + dprintk("NFS reply setattr: %d\n", status); + return status; +} + +static int +nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs_diropok res = { + .fh = fhandle, + .fattr = fattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_LOOKUP], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; + + dprintk("NFS call lookup %s\n", name->name); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +static int nfs_proc_readlink(struct inode *inode, struct page *page, + unsigned int pgbase, unsigned int pglen) +{ + struct nfs_readlinkargs args = { + .fh = NFS_FH(inode), + .pgbase = pgbase, + .pglen = pglen, + .pages = &page + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_READLINK], + .rpc_argp = &args, + }; + int status; + + dprintk("NFS call readlink\n"); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + dprintk("NFS reply readlink: %d\n", status); + return status; +} + +struct nfs_createdata { + struct nfs_createargs arg; + struct nfs_diropok res; + struct nfs_fh fhandle; + struct nfs_fattr fattr; +}; + +static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir, + struct dentry *dentry, struct iattr *sattr) +{ + struct nfs_createdata *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data != NULL) { + data->arg.fh = NFS_FH(dir); + data->arg.name = dentry->d_name.name; + data->arg.len = dentry->d_name.len; + data->arg.sattr = sattr; + nfs_fattr_init(&data->fattr); + data->fhandle.size = 0; + data->res.fh = &data->fhandle; + data->res.fattr = &data->fattr; + } + return data; +}; + +static void nfs_free_createdata(const struct nfs_createdata *data) +{ + kfree(data); +} + +static int +nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags, struct nfs_open_context *ctx) +{ + struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], + }; + int status = -ENOMEM; + + dprintk("NFS call create %s\n", dentry->d_name.name); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + 
msg.rpc_resp = &data->res; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + nfs_free_createdata(data); +out: + dprintk("NFS reply create: %d\n", status); + return status; +} + +/* + * In NFSv2, mknod is grafted onto the create call. + */ +static int +nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + dev_t rdev) +{ + struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_CREATE], + }; + umode_t mode; + int status = -ENOMEM; + + dprintk("NFS call mknod %s\n", dentry->d_name.name); + + mode = sattr->ia_mode; + if (S_ISFIFO(mode)) { + sattr->ia_mode = (mode & ~S_IFMT) | S_IFCHR; + sattr->ia_valid &= ~ATTR_SIZE; + } else if (S_ISCHR(mode) || S_ISBLK(mode)) { + sattr->ia_valid |= ATTR_SIZE; + sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ + } + + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + if (status == -EINVAL && S_ISFIFO(mode)) { + sattr->ia_mode = mode; + nfs_fattr_init(data->res.fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + nfs_free_createdata(data); +out: + dprintk("NFS reply mknod: %d\n", status); + return status; +} + +static int +nfs_proc_remove(struct inode *dir, struct qstr *name) +{ + struct nfs_removeargs arg = { + .fh = NFS_FH(dir), + .name.len = name->len, + .name.name = name->name, + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call remove %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + dprintk("NFS reply remove: %d\n", status); + return status; +} + +static void +nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; +} + +static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) +{ + if (nfs_async_handle_expired_key(task)) + return 0; + nfs_mark_for_revalidate(dir); + return 1; +} + +static void +nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; +} + +static int +nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + if (nfs_async_handle_expired_key(task)) + return 0; + nfs_mark_for_revalidate(old_dir); + nfs_mark_for_revalidate(new_dir); + return 1; +} + +static int +nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) +{ + struct nfs_renameargs arg = { + .old_dir = NFS_FH(old_dir), + .old_name = old_name, + .new_dir = NFS_FH(new_dir), + .new_name = new_name, + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_RENAME], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); + nfs_mark_for_revalidate(old_dir); + nfs_mark_for_revalidate(new_dir); + dprintk("NFS reply rename: %d\n", status); + return status; +} + +static int +nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +{ + struct nfs_linkargs arg = { + .fromfh = NFS_FH(inode), + 
.tofh = NFS_FH(dir), + .toname = name->name, + .tolen = name->len + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_LINK], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call link %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_mark_for_revalidate(inode); + nfs_mark_for_revalidate(dir); + dprintk("NFS reply link: %d\n", status); + return status; +} + +static int +nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) +{ + struct nfs_fh *fh; + struct nfs_fattr *fattr; + struct nfs_symlinkargs arg = { + .fromfh = NFS_FH(dir), + .fromname = dentry->d_name.name, + .fromlen = dentry->d_name.len, + .pages = &page, + .pathlen = len, + .sattr = sattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], + .rpc_argp = &arg, + }; + int status = -ENAMETOOLONG; + + dprintk("NFS call symlink %s\n", dentry->d_name.name); + + if (len > NFS2_MAXPATHLEN) + goto out; + + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) + goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + + /* + * V2 SYMLINK requests don't return any attributes. Setting the + * filehandle size to zero indicates to nfs_instantiate that it + * should fill in the data with a LOOKUP call on the wire. + */ + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr); + +out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out: + dprintk("NFS reply symlink: %d\n", status); + return status; +} + +static int +nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) +{ + struct nfs_createdata *data; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], + }; + int status = -ENOMEM; + + dprintk("NFS call mkdir %s\n", dentry->d_name.name); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + nfs_free_createdata(data); +out: + dprintk("NFS reply mkdir: %d\n", status); + return status; +} + +static int +nfs_proc_rmdir(struct inode *dir, struct qstr *name) +{ + struct nfs_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_RMDIR], + .rpc_argp = &arg, + }; + int status; + + dprintk("NFS call rmdir %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); + dprintk("NFS reply rmdir: %d\n", status); + return status; +} + +/* + * The READDIR implementation is somewhat hackish - we pass a temporary + * buffer to the encode function, which installs it in the receive + * the receive iovec. The decode function just parses the reply to make + * sure it is syntactically correct; the entries itself are decoded + * from nfs_readdir by calling the decode_entry function directly. 
+ */ +static int +nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) +{ + struct inode *dir = dentry->d_inode; + struct nfs_readdirargs arg = { + .fh = NFS_FH(dir), + .cookie = cookie, + .count = count, + .pages = pages, + }; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_READDIR], + .rpc_argp = &arg, + .rpc_cred = cred, + }; + int status; + + dprintk("NFS call readdir %d\n", (unsigned int)cookie); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + + nfs_invalidate_atime(dir); + + dprintk("NFS reply readdir: %d\n", status); + return status; +} + +static int +nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *stat) +{ + struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_STATFS], + .rpc_argp = fhandle, + .rpc_resp = &fsinfo, + }; + int status; + + dprintk("NFS call statfs\n"); + nfs_fattr_init(stat->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply statfs: %d\n", status); + if (status) + goto out; + stat->tbytes = (u64)fsinfo.blocks * fsinfo.bsize; + stat->fbytes = (u64)fsinfo.bfree * fsinfo.bsize; + stat->abytes = (u64)fsinfo.bavail * fsinfo.bsize; + stat->tfiles = 0; + stat->ffiles = 0; + stat->afiles = 0; +out: + return status; +} + +static int +nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct nfs2_fsstat fsinfo; + struct rpc_message msg = { + .rpc_proc = &nfs_procedures[NFSPROC_STATFS], + .rpc_argp = fhandle, + .rpc_resp = &fsinfo, + }; + int status; + + dprintk("NFS call fsinfo\n"); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply fsinfo: %d\n", status); + if (status) + goto out; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->lease_time = 0; +out: + return status; +} + +static int +nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + info->max_link = 0; + info->max_namelen = NFS2_MAXNAMLEN; + return 0; +} + +static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + if (nfs_async_handle_expired_key(task)) + return -EAGAIN; + + nfs_invalidate_atime(data->inode); + if (task->tk_status >= 0) { + nfs_refresh_inode(data->inode, data->res.fattr); + /* Emulate the eof flag, which isn't normally needed in NFSv2 + * as it is guaranteed to always return the file attributes + */ + if (data->args.offset + data->args.count >= data->res.fattr->size) + data->res.eof = 1; + } + return 0; +} + +static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; +} + +static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (nfs_async_handle_expired_key(task)) + return -EAGAIN; + + if (task->tk_status >= 0) + nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); + return 0; +} + +static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +{ + /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ + data->args.stable = NFS_FILE_SYNC; + msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; +} + +static void +nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message 
*msg) +{ + BUG(); +} + +static int +nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); +} + +/* Helper functions for NFS lock bounds checking */ +#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL) +static int nfs_lock_check_bounds(const struct file_lock *fl) +{ + __s32 start, end; + + start = (__s32)fl->fl_start; + if ((loff_t)start != fl->fl_start) + goto out_einval; + + if (fl->fl_end != OFFSET_MAX) { + end = (__s32)fl->fl_end; + if ((loff_t)end != fl->fl_end) + goto out_einval; + } else + end = NFS_LOCK32_OFFSET_MAX; + + if (start < 0 || start > end) + goto out_einval; + return 0; +out_einval: + return -EINVAL; +} + +const struct nfs_rpc_ops nfs_v2_clientops = { + .version = 2, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, + .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, + .lookup = nfs_proc_lookup, + .access = NULL, /* access */ + .readlink = nfs_proc_readlink, + .create = nfs_proc_create, + .remove = nfs_proc_remove, + .unlink_setup = nfs_proc_unlink_setup, + .unlink_done = nfs_proc_unlink_done, + .rename = nfs_proc_rename, + .rename_setup = nfs_proc_rename_setup, + .rename_done = nfs_proc_rename_done, + .link = nfs_proc_link, + .symlink = nfs_proc_symlink, + .mkdir = nfs_proc_mkdir, + .rmdir = nfs_proc_rmdir, + .readdir = nfs_proc_readdir, + .mknod = nfs_proc_mknod, + .statfs = nfs_proc_statfs, + .fsinfo = nfs_proc_fsinfo, + .pathconf = nfs_proc_pathconf, + .decode_dirent = nfs2_decode_dirent, + .read_setup = nfs_proc_read_setup, + .read_done = nfs_read_done, + .write_setup = nfs_proc_write_setup, + .write_done = nfs_write_done, + .commit_setup = nfs_proc_commit_setup, + .lock = nfs_proc_lock, + .lock_check_bounds = nfs_lock_check_bounds, + .close_context = nfs_close_context, + .init_client = nfs_init_client, +}; diff --git a/fs/nfs/read.c b/fs/nfs/read.c new file mode 100644 index 00000000..20a7f952 --- /dev/null +++ b/fs/nfs/read.c @@ -0,0 +1,704 @@ +/* + * linux/fs/nfs/read.c + * + * Block I/O for NFS + * + * Partial copy of Linus' read cache modifications to fs/nfs/file.c + * modified for async RPC by okir@monad.swb.de + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "pnfs.h" + +#include "nfs4_fs.h" +#include "internal.h" +#include "iostat.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); +static const struct rpc_call_ops nfs_read_partial_ops; +static const struct rpc_call_ops nfs_read_full_ops; + +static struct kmem_cache *nfs_rdata_cachep; +static mempool_t *nfs_rdata_mempool; + +#define MIN_POOL_READ (32) + +struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) +{ + struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + p->npages = pagecount; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + if (!p->pagevec) { + mempool_free(p, nfs_rdata_mempool); + p = NULL; + } + } + } + return p; +} + +void 
nfs_readdata_free(struct nfs_read_data *p) +{ + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); + mempool_free(p, nfs_rdata_mempool); +} + +static void nfs_readdata_release(struct nfs_read_data *rdata) +{ + put_lseg(rdata->lseg); + put_nfs_open_context(rdata->args.context); + nfs_readdata_free(rdata); +} + +static +int nfs_return_empty_page(struct page *page) +{ + zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + unlock_page(page); + return 0; +} + +static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) +{ + unsigned int remainder = data->args.count - data->res.count; + unsigned int base = data->args.pgbase + data->res.count; + unsigned int pglen; + struct page **pages; + + if (data->res.eof == 0 || remainder == 0) + return; + /* + * Note: "remainder" can never be negative, since we check for + * this in the XDR code. + */ + pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; + base &= ~PAGE_CACHE_MASK; + pglen = PAGE_CACHE_SIZE - base; + for (;;) { + if (remainder <= pglen) { + zero_user(*pages, base, remainder); + break; + } + zero_user(*pages, base, pglen); + pages++; + remainder -= pglen; + pglen = PAGE_CACHE_SIZE; + base = 0; + } +} + +int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, + struct page *page) +{ + struct nfs_page *new; + unsigned int len; + struct nfs_pageio_descriptor pgio; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); + new = nfs_create_request(ctx, inode, page, 0, len); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); + } + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + + nfs_pageio_init(&pgio, inode, NULL, 0, 0); + nfs_list_add_request(new, &pgio.pg_list); + pgio.pg_count = len; + + if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) + nfs_pagein_multi(&pgio); + else + nfs_pagein_one(&pgio); + return 0; +} + +static void nfs_readpage_release(struct nfs_page *req) +{ + struct inode *d_inode = req->wb_context->path.dentry->d_inode; + + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + + unlock_page(req->wb_page); + + dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + nfs_release_request(req); +} + +int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops) +{ + struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | swap_flags, + }; + + /* Set up the initial task struct. 
*/ + NFS_PROTO(inode)->read_setup(data, &msg); + + dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " + "offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_read); + +/* + * Set up the NFS read request struct + */ +static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + + data->req = req; + data->inode = inode; + data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + + if (data->lseg && + (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) + return 0; + + return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); +} + +static void +nfs_async_read_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + SetPageError(req->wb_page); + nfs_readpage_release(req); + } +} + +/* + * Generate multiple requests to fill a single page. + * + * We optimize to reduce the number of read operations on the wire. If we + * detect that we're reading a page, or an area of a page, that is past the + * end of file, we do not generate NFS read operations but just clear the + * parts of the page that would have come back zero from the server anyway. + * + * We rely on the cached value of i_size to make this determination; another + * client can fill pages on the server past our cached end-of-file, but we + * won't see the new data until our attribute cache is updated. This is more + * or less conventional NFS client behavior. 
+ */ +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) +{ + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); + struct page *page = req->wb_page; + struct nfs_read_data *data; + size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; + unsigned int offset; + int requests = 0; + int ret = 0; + struct pnfs_layout_segment *lseg; + LIST_HEAD(list); + + nfs_list_remove_request(req); + + nbytes = desc->pg_count; + do { + size_t len = min(nbytes,rsize); + + data = nfs_readdata_alloc(1); + if (!data) + goto out_bad; + list_add(&data->pages, &list); + requests++; + nbytes -= len; + } while(nbytes != 0); + atomic_set(&req->wb_complete, requests); + + BUG_ON(desc->pg_lseg != NULL); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_READ, GFP_KERNEL); + ClearPageError(page); + offset = 0; + nbytes = desc->pg_count; + do { + int ret2; + + data = list_entry(list.next, struct nfs_read_data, pages); + list_del_init(&data->pages); + + data->pagevec[0] = page; + + if (nbytes < rsize) + rsize = nbytes; + ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, + rsize, offset, lseg); + if (ret == 0) + ret = ret2; + offset += rsize; + nbytes -= rsize; + } while (nbytes != 0); + put_lseg(lseg); + desc->pg_lseg = NULL; + + return ret; + +out_bad: + while (!list_empty(&list)) { + data = list_entry(list.next, struct nfs_read_data, pages); + list_del(&data->pages); + nfs_readdata_free(data); + } + SetPageError(page); + nfs_readpage_release(req); + return -ENOMEM; +} + +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) +{ + struct nfs_page *req; + struct page **pages; + struct nfs_read_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + int ret = -ENOMEM; + + data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) { + nfs_async_read_error(head); + goto out; + } + + pages = data->pagevec; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &data->pages); + ClearPageError(req->wb_page); + *pages++ = req->wb_page; + } + req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_READ, GFP_KERNEL); + + ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, + 0, lseg); +out: + put_lseg(lseg); + desc->pg_lseg = NULL; + return ret; +} + +/* + * This is the callback from RPC telling us whether a reply was + * received or some error occurred (timeout or socket shutdown). + */ +int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) +{ + int status; + + dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, + task->tk_status); + + status = NFS_PROTO(data->inode)->read_done(task, data); + if (status != 0) + return status; + + nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count); + + if (task->tk_status == -ESTALE) { + set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags); + nfs_mark_for_revalidate(data->inode); + } + return 0; +} + +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +{ + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; + + if (resp->eof || resp->count == argp->count) + return; + + /* This is a short read! */ + nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); + /* Has the server at least made some progress? 
*/ + if (resp->count == 0) + return; + + /* Yes, so retry the read at the end of the data */ + data->mds_offset += resp->count; + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); +} + +/* + * Handle a read reply that fills part of a page. + */ +static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) +{ + struct nfs_read_data *data = calldata; + + if (nfs_readpage_result(task, data) != 0) + return; + if (task->tk_status < 0) + return; + + nfs_readpage_truncate_uninitialised_page(data); + nfs_readpage_retry(task, data); +} + +static void nfs_readpage_release_partial(void *calldata) +{ + struct nfs_read_data *data = calldata; + struct nfs_page *req = data->req; + struct page *page = req->wb_page; + int status = data->task.tk_status; + + if (status < 0) + SetPageError(page); + + if (atomic_dec_and_test(&req->wb_complete)) { + if (!PageError(page)) + SetPageUptodate(page); + nfs_readpage_release(req); + } + nfs_readdata_release(calldata); +} + +#if defined(CONFIG_NFS_V4_1) +void nfs_read_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_read_data *data = calldata; + + if (nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_read_partial_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_read_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_readpage_result_partial, + .rpc_release = nfs_readpage_release_partial, +}; + +static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) +{ + unsigned int count = data->res.count; + unsigned int base = data->args.pgbase; + struct page **pages; + + if (data->res.eof) + count = data->args.count; + if (unlikely(count == 0)) + return; + pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; + base &= ~PAGE_CACHE_MASK; + count += base; + for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) + SetPageUptodate(*pages); + if (count == 0) + return; + /* Was this a short read? */ + if (data->res.eof || data->res.count == data->args.count) + SetPageUptodate(*pages); +} + +/* + * This is the callback from RPC telling us whether a reply was + * received or some error occurred (timeout or socket shutdown). + */ +static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) +{ + struct nfs_read_data *data = calldata; + + if (nfs_readpage_result(task, data) != 0) + return; + if (task->tk_status < 0) + return; + /* + * Note: nfs_readpage_retry may change the values of + * data->args. In the multi-page case, we therefore need + * to ensure that we call nfs_readpage_set_pages_uptodate() + * first. + */ + nfs_readpage_truncate_uninitialised_page(data); + nfs_readpage_set_pages_uptodate(data); + nfs_readpage_retry(task, data); +} + +static void nfs_readpage_release_full(void *calldata) +{ + struct nfs_read_data *data = calldata; + + while (!list_empty(&data->pages)) { + struct nfs_page *req = nfs_list_entry(data->pages.next); + + nfs_list_remove_request(req); + nfs_readpage_release(req); + } + nfs_readdata_release(calldata); +} + +static const struct rpc_call_ops nfs_read_full_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_read_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_readpage_result_full, + .rpc_release = nfs_readpage_release_full, +}; + +/* + * Read a page over NFS. 
+ * We read the page synchronously in the following case: + * - The error flag is set for this page. This happens only when a + * previous async read operation failed. + */ +int nfs_readpage(struct file *file, struct page *page) +{ + struct nfs_open_context *ctx; + struct inode *inode = page->mapping->host; + int error; + + dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", + page, PAGE_CACHE_SIZE, page->index); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); + nfs_add_stats(inode, NFSIOS_READPAGES, 1); + + /* + * Try to flush any pending writes to the file.. + * + * NOTE! Because we own the page lock, there cannot + * be any new pending writes generated at this point + * for this page (other pages can be written to). + */ + error = nfs_wb_page(inode, page); + if (error) + goto out_unlock; + if (PageUptodate(page)) + goto out_unlock; + + error = -ESTALE; + if (NFS_STALE(inode)) + goto out_unlock; + + if (file == NULL) { + error = -EBADF; + ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (ctx == NULL) + goto out_unlock; + } else + ctx = get_nfs_open_context(nfs_file_open_context(file)); + + if (!IS_SYNC(inode)) { + error = nfs_readpage_from_fscache(ctx, inode, page); + if (error == 0) + goto out; + } + + error = nfs_readpage_async(ctx, inode, page); + +out: + put_nfs_open_context(ctx); + return error; +out_unlock: + unlock_page(page); + return error; +} + +struct nfs_readdesc { + struct nfs_pageio_descriptor *pgio; + struct nfs_open_context *ctx; +}; + +static int +readpage_async_filler(void *data, struct page *page) +{ + struct nfs_readdesc *desc = (struct nfs_readdesc *)data; + struct inode *inode = page->mapping->host; + struct nfs_page *new; + unsigned int len; + int error; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); + + new = nfs_create_request(desc->ctx, inode, page, 0, len); + if (IS_ERR(new)) + goto out_error; + + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + if (!nfs_pageio_add_request(desc->pgio, new)) { + error = desc->pgio->pg_error; + goto out_unlock; + } + return 0; +out_error: + error = PTR_ERR(new); + SetPageError(page); +out_unlock: + unlock_page(page); + return error; +} + +int nfs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct nfs_pageio_descriptor pgio; + struct nfs_readdesc desc = { + .pgio = &pgio, + }; + struct inode *inode = mapping->host; + struct nfs_server *server = NFS_SERVER(inode); + size_t rsize = server->rsize; + unsigned long npages; + int ret = -ESTALE; + + dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + nr_pages); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); + + if (NFS_STALE(inode)) + goto out; + + if (filp == NULL) { + desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (desc.ctx == NULL) + return -EBADF; + } else + desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + + /* attempt to read as many of the pages as possible from the cache + * - this returns -ENOBUFS immediately if the cookie is negative + */ + ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping, + pages, &nr_pages); + if (ret == 0) + goto read_complete; /* all pages were read */ + + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else + nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0); + + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); + npages = 
(pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); +read_complete: + put_nfs_open_context(desc.ctx); +out: + return ret; +} + +int __init nfs_init_readpagecache(void) +{ + nfs_rdata_cachep = kmem_cache_create("nfs_read_data", + sizeof(struct nfs_read_data), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_rdata_cachep == NULL) + return -ENOMEM; + + nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, + nfs_rdata_cachep); + if (nfs_rdata_mempool == NULL) + return -ENOMEM; + + return 0; +} + +void nfs_destroy_readpagecache(void) +{ + mempool_destroy(nfs_rdata_mempool); + kmem_cache_destroy(nfs_rdata_cachep); +} diff --git a/fs/nfs/super.c b/fs/nfs/super.c new file mode 100644 index 00000000..8e7b61d5 --- /dev/null +++ b/fs/nfs/super.c @@ -0,0 +1,3099 @@ +/* + * linux/fs/nfs/super.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs superblock handling functions + * + * Modularised by Alan Cox , while hacking some + * experimental NFS changes. Modularisation taken straight from SYS5 fs. + * + * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. + * J.S.Peatfield@damtp.cam.ac.uk + * + * Split from inode.c by David Howells + * + * - superblocks are indexed on server only - all inodes, dentries, etc. associated with a + * particular server are held in the same superblock + * - NFS superblocks can have several effective roots to the dentry tree + * - directory type roots are spliced into the tree when a path from one root reaches the root + * of another (see nfs_lookup()) + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +#ifdef CONFIG_NFS_V3 +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif + +enum { + /* Mount options that take no arguments */ + Opt_soft, Opt_hard, + Opt_posix, Opt_noposix, + Opt_cto, Opt_nocto, + Opt_ac, Opt_noac, + Opt_lock, Opt_nolock, + Opt_v2, Opt_v3, Opt_v4, + Opt_udp, Opt_tcp, Opt_rdma, + Opt_acl, Opt_noacl, + Opt_rdirplus, Opt_nordirplus, + Opt_sharecache, Opt_nosharecache, + Opt_resvport, Opt_noresvport, + Opt_fscache, Opt_nofscache, + + /* Mount options that take integer arguments */ + Opt_port, + Opt_rsize, Opt_wsize, Opt_bsize, + Opt_timeo, Opt_retrans, + Opt_acregmin, Opt_acregmax, + Opt_acdirmin, Opt_acdirmax, + Opt_actimeo, + Opt_namelen, + Opt_mountport, + Opt_mountvers, + Opt_nfsvers, + Opt_minorversion, + + /* Mount options that take string arguments */ + Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, + Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_lookupcache, + Opt_fscache_uniq, + Opt_local_lock, + + /* Special mount options */ + Opt_userspace, Opt_deprecated, Opt_sloppy, + + Opt_err +}; + +static const match_table_t nfs_mount_option_tokens = { + { Opt_userspace, "bg" }, + { Opt_userspace, "fg" }, + { Opt_userspace, "retry=%s" }, + + { Opt_sloppy, "sloppy" }, + + { Opt_soft, "soft" }, + { Opt_hard, "hard" }, + { Opt_deprecated, "intr" }, + { Opt_deprecated, "nointr" }, + { Opt_posix, "posix" }, + { Opt_noposix, "noposix" }, + { Opt_cto, "cto" }, + { 
Opt_nocto, "nocto" }, + { Opt_ac, "ac" }, + { Opt_noac, "noac" }, + { Opt_lock, "lock" }, + { Opt_nolock, "nolock" }, + { Opt_v2, "v2" }, + { Opt_v3, "v3" }, + { Opt_v4, "v4" }, + { Opt_udp, "udp" }, + { Opt_tcp, "tcp" }, + { Opt_rdma, "rdma" }, + { Opt_acl, "acl" }, + { Opt_noacl, "noacl" }, + { Opt_rdirplus, "rdirplus" }, + { Opt_nordirplus, "nordirplus" }, + { Opt_sharecache, "sharecache" }, + { Opt_nosharecache, "nosharecache" }, + { Opt_resvport, "resvport" }, + { Opt_noresvport, "noresvport" }, + { Opt_fscache, "fsc" }, + { Opt_nofscache, "nofsc" }, + + { Opt_port, "port=%s" }, + { Opt_rsize, "rsize=%s" }, + { Opt_wsize, "wsize=%s" }, + { Opt_bsize, "bsize=%s" }, + { Opt_timeo, "timeo=%s" }, + { Opt_retrans, "retrans=%s" }, + { Opt_acregmin, "acregmin=%s" }, + { Opt_acregmax, "acregmax=%s" }, + { Opt_acdirmin, "acdirmin=%s" }, + { Opt_acdirmax, "acdirmax=%s" }, + { Opt_actimeo, "actimeo=%s" }, + { Opt_namelen, "namlen=%s" }, + { Opt_mountport, "mountport=%s" }, + { Opt_mountvers, "mountvers=%s" }, + { Opt_nfsvers, "nfsvers=%s" }, + { Opt_nfsvers, "vers=%s" }, + { Opt_minorversion, "minorversion=%s" }, + + { Opt_sec, "sec=%s" }, + { Opt_proto, "proto=%s" }, + { Opt_mountproto, "mountproto=%s" }, + { Opt_addr, "addr=%s" }, + { Opt_clientaddr, "clientaddr=%s" }, + { Opt_mounthost, "mounthost=%s" }, + { Opt_mountaddr, "mountaddr=%s" }, + + { Opt_lookupcache, "lookupcache=%s" }, + { Opt_fscache_uniq, "fsc=%s" }, + { Opt_local_lock, "local_lock=%s" }, + + { Opt_err, NULL } +}; + +enum { + Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, + + Opt_xprt_err +}; + +static const match_table_t nfs_xprt_protocol_tokens = { + { Opt_xprt_udp, "udp" }, + { Opt_xprt_udp6, "udp6" }, + { Opt_xprt_tcp, "tcp" }, + { Opt_xprt_tcp6, "tcp6" }, + { Opt_xprt_rdma, "rdma" }, + + { Opt_xprt_err, NULL } +}; + +enum { + Opt_sec_none, Opt_sec_sys, + Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, + Opt_sec_lkey, Opt_sec_lkeyi, Opt_sec_lkeyp, + Opt_sec_spkm, Opt_sec_spkmi, Opt_sec_spkmp, + + Opt_sec_err +}; + +static const match_table_t nfs_secflavor_tokens = { + { Opt_sec_none, "none" }, + { Opt_sec_none, "null" }, + { Opt_sec_sys, "sys" }, + + { Opt_sec_krb5, "krb5" }, + { Opt_sec_krb5i, "krb5i" }, + { Opt_sec_krb5p, "krb5p" }, + + { Opt_sec_lkey, "lkey" }, + { Opt_sec_lkeyi, "lkeyi" }, + { Opt_sec_lkeyp, "lkeyp" }, + + { Opt_sec_spkm, "spkm3" }, + { Opt_sec_spkmi, "spkm3i" }, + { Opt_sec_spkmp, "spkm3p" }, + + { Opt_sec_err, NULL } +}; + +enum { + Opt_lookupcache_all, Opt_lookupcache_positive, + Opt_lookupcache_none, + + Opt_lookupcache_err +}; + +static match_table_t nfs_lookupcache_tokens = { + { Opt_lookupcache_all, "all" }, + { Opt_lookupcache_positive, "pos" }, + { Opt_lookupcache_positive, "positive" }, + { Opt_lookupcache_none, "none" }, + + { Opt_lookupcache_err, NULL } +}; + +enum { + Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix, + Opt_local_lock_none, + + Opt_local_lock_err +}; + +static match_table_t nfs_local_lock_tokens = { + { Opt_local_lock_all, "all" }, + { Opt_local_lock_flock, "flock" }, + { Opt_local_lock_posix, "posix" }, + { Opt_local_lock_none, "none" }, + + { Opt_local_lock_err, NULL } +}; + + +static void nfs_umount_begin(struct super_block *); +static int nfs_statfs(struct dentry *, struct kstatfs *); +static int nfs_show_options(struct seq_file *, struct vfsmount *); +static int nfs_show_devname(struct seq_file *, struct vfsmount *); +static int nfs_show_path(struct seq_file *, struct vfsmount *); +static int nfs_show_stats(struct seq_file *, struct 
vfsmount *); +static struct dentry *nfs_fs_mount(struct file_system_type *, + int, const char *, void *); +static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static void nfs_put_super(struct super_block *); +static void nfs_kill_super(struct super_block *); +static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); + +static struct file_system_type nfs_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .mount = nfs_fs_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs_xdev_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .mount = nfs_xdev_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static const struct super_operations nfs_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = nfs_write_inode, + .put_super = nfs_put_super, + .statfs = nfs_statfs, + .evict_inode = nfs_evict_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, + .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, +}; + +#ifdef CONFIG_NFS_V4 +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, const char *dev_name); +static struct dentry *nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data); +static struct dentry *nfs4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static void nfs4_kill_super(struct super_block *sb); + +static struct file_system_type nfs4_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type nfs4_remote_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs4_xdev_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_xdev_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type nfs4_remote_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_referral_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs4_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_referral_mount, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static const struct super_operations nfs4_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = 
nfs_write_inode, + .put_super = nfs_put_super, + .statfs = nfs_statfs, + .evict_inode = nfs4_evict_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, + .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, +}; +#endif + +static struct shrinker acl_shrinker = { + .shrink = nfs_access_cache_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +/* + * Register the NFS filesystems + */ +int __init register_nfs_fs(void) +{ + int ret; + + ret = register_filesystem(&nfs_fs_type); + if (ret < 0) + goto error_0; + + ret = nfs_register_sysctl(); + if (ret < 0) + goto error_1; +#ifdef CONFIG_NFS_V4 + ret = register_filesystem(&nfs4_fs_type); + if (ret < 0) + goto error_2; +#endif + register_shrinker(&acl_shrinker); + return 0; + +#ifdef CONFIG_NFS_V4 +error_2: + nfs_unregister_sysctl(); +#endif +error_1: + unregister_filesystem(&nfs_fs_type); +error_0: + return ret; +} + +/* + * Unregister the NFS filesystems + */ +void __exit unregister_nfs_fs(void) +{ + unregister_shrinker(&acl_shrinker); +#ifdef CONFIG_NFS_V4 + unregister_filesystem(&nfs4_fs_type); +#endif + nfs_unregister_sysctl(); + unregister_filesystem(&nfs_fs_type); +} + +void nfs_sb_active(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void nfs_sb_deactive(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + +/* + * Deliver file system statistics to userspace + */ +static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct nfs_server *server = NFS_SB(dentry->d_sb); + unsigned char blockbits; + unsigned long blockres; + struct nfs_fh *fh = NFS_FH(dentry->d_inode); + struct nfs_fsstat res; + int error = -ENOMEM; + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out_err; + + error = server->nfs_client->rpc_ops->statfs(server, fh, &res); + if (unlikely(error == -ESTALE)) { + struct dentry *pd_dentry; + + pd_dentry = dget_parent(dentry); + if (pd_dentry != NULL) { + nfs_zap_caches(pd_dentry->d_inode); + dput(pd_dentry); + } + } + nfs_free_fattr(res.fattr); + if (error < 0) + goto out_err; + + buf->f_type = NFS_SUPER_MAGIC; + + /* + * Current versions of glibc do not correctly handle the + * case where f_frsize != f_bsize. Eventually we want to + * report the value of wtmult in this field. + */ + buf->f_frsize = dentry->d_sb->s_blocksize; + + /* + * On most *nix systems, f_blocks, f_bfree, and f_bavail + * are reported in units of f_frsize. Linux hasn't had + * an f_frsize field in its statfs struct until recently, + * thus historically Linux's sys_statfs reports these + * fields in units of f_bsize. 
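+ *
+ * The conversion just below rounds the byte totals returned by FSSTAT up
+ * to whole f_bsize units: blocks = (bytes + blockres) >> blockbits. With
+ * a 4096-byte block size, blockbits is 12 and blockres is 4095, so for
+ * example res.tbytes = 10000 becomes (10000 + 4095) >> 12 = 3 blocks.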
+ */ + buf->f_bsize = dentry->d_sb->s_blocksize; + blockbits = dentry->d_sb->s_blocksize_bits; + blockres = (1 << blockbits) - 1; + buf->f_blocks = (res.tbytes + blockres) >> blockbits; + buf->f_bfree = (res.fbytes + blockres) >> blockbits; + buf->f_bavail = (res.abytes + blockres) >> blockbits; + + buf->f_files = res.tfiles; + buf->f_ffree = res.afiles; + + buf->f_namelen = server->namelen; + + return 0; + + out_err: + dprintk("%s: statfs error = %d\n", __func__, -error); + return error; +} + +/* + * Map the security flavour number to a name + */ +static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) +{ + static const struct { + rpc_authflavor_t flavour; + const char *str; + } sec_flavours[] = { + { RPC_AUTH_NULL, "null" }, + { RPC_AUTH_UNIX, "sys" }, + { RPC_AUTH_GSS_KRB5, "krb5" }, + { RPC_AUTH_GSS_KRB5I, "krb5i" }, + { RPC_AUTH_GSS_KRB5P, "krb5p" }, + { RPC_AUTH_GSS_LKEY, "lkey" }, + { RPC_AUTH_GSS_LKEYI, "lkeyi" }, + { RPC_AUTH_GSS_LKEYP, "lkeyp" }, + { RPC_AUTH_GSS_SPKM, "spkm" }, + { RPC_AUTH_GSS_SPKMI, "spkmi" }, + { RPC_AUTH_GSS_SPKMP, "spkmp" }, + { UINT_MAX, "unknown" } + }; + int i; + + for (i = 0; sec_flavours[i].flavour != UINT_MAX; i++) { + if (sec_flavours[i].flavour == flavour) + break; + } + return sec_flavours[i].str; +} + +static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address; + + seq_printf(m, ",mountproto="); + switch (sap->sa_family) { + case AF_INET: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + case AF_INET6: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP6); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP6); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } +} + +static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; + + if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE) + return; + + switch (sap->sa_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + seq_printf(m, ",mountaddr=%pI4", &sin->sin_addr.s_addr); + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr); + break; + } + default: + if (showdefaults) + seq_printf(m, ",mountaddr=unspecified"); + } + + if (nfss->mountd_version || showdefaults) + seq_printf(m, ",mountvers=%u", nfss->mountd_version); + if ((nfss->mountd_port && + nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) || + showdefaults) + seq_printf(m, ",mountport=%u", nfss->mountd_port); + + nfs_show_mountd_netid(m, nfss, showdefaults); +} + +#ifdef CONFIG_NFS_V4 +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct nfs_client *clp = nfss->nfs_client; + + seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); + seq_printf(m, ",minorversion=%u", clp->cl_minorversion); +} +#else +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ +} +#endif + +/* + * Describe the mount options in force on this server representation + */ +static void 
nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + static const struct proc_nfs_info { + int flag; + const char *str; + const char *nostr; + } nfs_info[] = { + { NFS_MOUNT_SOFT, ",soft", ",hard" }, + { NFS_MOUNT_POSIX, ",posix", "" }, + { NFS_MOUNT_NOCTO, ",nocto", "" }, + { NFS_MOUNT_NOAC, ",noac", "" }, + { NFS_MOUNT_NONLM, ",nolock", "" }, + { NFS_MOUNT_NOACL, ",noacl", "" }, + { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, + { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, + { 0, NULL, NULL } + }; + const struct proc_nfs_info *nfs_infop; + struct nfs_client *clp = nfss->nfs_client; + u32 version = clp->rpc_ops->version; + int local_flock, local_fcntl; + + seq_printf(m, ",vers=%u", version); + seq_printf(m, ",rsize=%u", nfss->rsize); + seq_printf(m, ",wsize=%u", nfss->wsize); + if (nfss->bsize != 0) + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); + if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults) + seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); + if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults) + seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); + if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults) + seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); + if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) + seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); + for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { + if (nfss->flags & nfs_infop->flag) + seq_puts(m, nfs_infop->str); + else + seq_puts(m, nfs_infop->nostr); + } + seq_printf(m, ",proto=%s", + rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); + if (version == 4) { + if (nfss->port != NFS_PORT) + seq_printf(m, ",port=%u", nfss->port); + } else + if (nfss->port) + seq_printf(m, ",port=%u", nfss->port); + + seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ); + seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries); + seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); + + if (version != 4) + nfs_show_mountd_options(m, nfss, showdefaults); + else + nfs_show_nfsv4_options(m, nfss, showdefaults); + + if (nfss->options & NFS_OPTION_FSCACHE) + seq_printf(m, ",fsc"); + + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + seq_printf(m, ",lookupcache=none"); + else + seq_printf(m, ",lookupcache=pos"); + } + + local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK; + local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL; + + if (!local_flock && !local_fcntl) + seq_printf(m, ",local_lock=none"); + else if (local_flock && local_fcntl) + seq_printf(m, ",local_lock=all"); + else if (local_flock) + seq_printf(m, ",local_lock=flock"); + else + seq_printf(m, ",local_lock=posix"); +} + +/* + * Describe the mount options on this VFS mountpoint + */ +static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + + nfs_show_mount_options(m, nfss, 0); + + seq_printf(m, ",addr=%s", + rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient, + RPC_DISPLAY_ADDR)); + + return 0; +} +#ifdef CONFIG_NFS_V4_1 +void show_sessions(struct seq_file *m, struct nfs_server *server) +{ + if (nfs4_has_session(server->nfs_client)) + seq_printf(m, ",sessions"); +} +#else +void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif + +#ifdef CONFIG_NFS_V4_1 +void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ + 
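+	/*
+	 * Emitted in the "nfsv4:" section of the per-mount statistics
+	 * (nfs_show_stats): either ",pnfs=<layout driver name>" when a
+	 * pNFS layout driver is attached to this server, or
+	 * ",pnfs=not configured" otherwise.
+	 */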
seq_printf(m, ",pnfs="); + if (server->pnfs_curr_ld) + seq_printf(m, "%s", server->pnfs_curr_ld->name); + else + seq_printf(m, "not configured"); +} +#else /* CONFIG_NFS_V4_1 */ +void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#endif /* CONFIG_NFS_V4_1 */ + +static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) +{ + char *page = (char *) __get_free_page(GFP_KERNEL); + char *devname, *dummy; + int err = 0; + if (!page) + return -ENOMEM; + devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE); + if (IS_ERR(devname)) + err = PTR_ERR(devname); + else + seq_escape(m, devname, " \t\n\\"); + free_page((unsigned long)page); + return err; +} + +static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt) +{ + seq_puts(m, "/"); + return 0; +} + +/* + * Present statistical information for this VFS mountpoint + */ +static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) +{ + int i, cpu; + struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct rpc_auth *auth = nfss->client->cl_auth; + struct nfs_iostats totals = { }; + + seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS); + + /* + * Display all mount option settings + */ + seq_printf(m, "\n\topts:\t"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : ""); + seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + nfs_show_mount_options(m, nfss, 1); + + seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); + + seq_printf(m, "\n\tcaps:\t"); + seq_printf(m, "caps=0x%x", nfss->caps); + seq_printf(m, ",wtmult=%u", nfss->wtmult); + seq_printf(m, ",dtsize=%u", nfss->dtsize); + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); + +#ifdef CONFIG_NFS_V4 + if (nfss->nfs_client->rpc_ops->version == 4) { + seq_printf(m, "\n\tnfsv4:\t"); + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); + show_sessions(m, nfss); + show_pnfs(m, nfss); + } +#endif + + /* + * Display security flavor in effect for this mount + */ + seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor); + if (auth->au_flavor) + seq_printf(m, ",pseudoflavor=%u", auth->au_flavor); + + /* + * Display superblock I/O counters + */ + for_each_possible_cpu(cpu) { + struct nfs_iostats *stats; + + preempt_disable(); + stats = per_cpu_ptr(nfss->io_stats, cpu); + + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + totals.events[i] += stats->events[i]; + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + totals.bytes[i] += stats->bytes[i]; +#ifdef CONFIG_NFS_FSCACHE + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + totals.fscache[i] += stats->fscache[i]; +#endif + + preempt_enable(); + } + + seq_printf(m, "\n\tevents:\t"); + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + seq_printf(m, "%lu ", totals.events[i]); + seq_printf(m, "\n\tbytes:\t"); + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); +#ifdef CONFIG_NFS_FSCACHE + if (nfss->options & NFS_OPTION_FSCACHE) { + seq_printf(m, "\n\tfsc:\t"); + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + } +#endif + seq_printf(m, "\n"); + + rpc_print_iostats(m, nfss->client); + + return 0; +} + +/* + * Begin unmount by attempting to remove all automounted mountpoints we added + * in response to xdev traversals and referrals + */ +static void 
nfs_umount_begin(struct super_block *sb) +{ + struct nfs_server *server; + struct rpc_clnt *rpc; + + server = NFS_SB(sb); + /* -EIO all pending I/O */ + rpc = server->client_acl; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); + rpc = server->client; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); +} + +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) +{ + struct nfs_parsed_mount_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data) { + data->acregmin = NFS_DEF_ACREGMIN; + data->acregmax = NFS_DEF_ACREGMAX; + data->acdirmin = NFS_DEF_ACDIRMIN; + data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; + data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; + data->auth_flavors[0] = RPC_AUTH_UNIX; + data->auth_flavor_len = 1; + data->version = version; + data->minorversion = 0; + security_init_mnt_opts(&data->lsm_opts); + } + return data; +} + +static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) +{ + if (data) { + kfree(data->client_address); + kfree(data->mount_server.hostname); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + kfree(data); + } +} + +/* + * Sanity-check a server address provided by the mount command. + * + * Address family must be initialized, and address must not be + * the ANY address for that family. + */ +static int nfs_verify_server_address(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *sa = (struct sockaddr_in *)addr; + return sa->sin_addr.s_addr != htonl(INADDR_ANY); + } + case AF_INET6: { + struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr; + return !ipv6_addr_any(sa); + } + } + + dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); + return 0; +} + +/* + * Select between a default port value and a user-specified port value. + * If a zero value is set, then autobind will be used. + */ +static void nfs_set_port(struct sockaddr *sap, int *port, + const unsigned short default_port) +{ + if (*port == NFS_UNSPEC_PORT) + *port = default_port; + + rpc_set_port(sap, *port); +} + +/* + * Sanity check the NFS transport protocol. + * + */ +static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + break; + default: + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * For text based NFSv2/v3 mounts, the mount protocol transport default + * settings should depend upon the specified NFS transport. + */ +static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + nfs_validate_transport_protocol(mnt); + + if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP || + mnt->mount_server.protocol == XPRT_TRANSPORT_TCP) + return; + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * Parse the value of the 'sec=' option. 
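+ *
+ * Each value maps to exactly one RPC pseudoflavor, e.g. sec=krb5i selects
+ * RPC_AUTH_GSS_KRB5I. A value matching none of nfs_secflavor_tokens makes
+ * this helper return 0, which fails the whole mount option parse; the
+ * "sloppy" option does not rescue an unrecognized sec= value.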
+ */ +static int nfs_parse_security_flavors(char *value, + struct nfs_parsed_mount_data *mnt) +{ + substring_t args[MAX_OPT_ARGS]; + + dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); + + switch (match_token(value, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + mnt->auth_flavors[0] = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + mnt->auth_flavors[0] = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; + break; + default: + return 0; + } + + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + return 1; +} + +static int nfs_get_option_str(substring_t args[], char **option) +{ + kfree(*option); + *option = match_strdup(args); + return !option; +} + +static int nfs_get_option_ul(substring_t args[], unsigned long *option) +{ + int rc; + char *string; + + string = match_strdup(args); + if (string == NULL) + return -ENOMEM; + rc = strict_strtoul(string, 10, option); + kfree(string); + + return rc; +} + +/* + * Error-check and convert a string of mount options from user space into + * a data structure. The whole mount string is processed; bad options are + * skipped as they are encountered. If there were no errors, return 1; + * otherwise return 0 (zero). 
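+ *
+ * "Skipped" means that an option whose name is not recognized only sets
+ * invalid_option; the parse then fails at the end unless "sloppy" was
+ * given. A recognized option with a malformed value (out_invalid_value)
+ * always fails. Note that nfs_get_option_str() above returns !option,
+ * and since 'option' is the address of the caller's pointer it is never
+ * NULL, so an allocation failure in match_strdup() is reported as
+ * success; this looks like it was meant to be !*option.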
+ */ +static int nfs_parse_mount_options(char *raw, + struct nfs_parsed_mount_data *mnt) +{ + char *p, *string, *secdata; + int rc, sloppy = 0, invalid_option = 0; + unsigned short protofamily = AF_UNSPEC; + unsigned short mountfamily = AF_UNSPEC; + + if (!raw) { + dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); + return 1; + } + dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw); + + secdata = alloc_secdata(); + if (!secdata) + goto out_nomem; + + rc = security_sb_copy_data(raw, secdata); + if (rc) + goto out_security_failure; + + rc = security_sb_parse_opts_str(secdata, &mnt->lsm_opts); + if (rc) + goto out_security_failure; + + free_secdata(secdata); + + while ((p = strsep(&raw, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + unsigned long option; + int token; + + if (!*p) + continue; + + dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", p); + + token = match_token(p, nfs_mount_option_tokens, args); + switch (token) { + + /* + * boolean options: foo/nofoo + */ + case Opt_soft: + mnt->flags |= NFS_MOUNT_SOFT; + break; + case Opt_hard: + mnt->flags &= ~NFS_MOUNT_SOFT; + break; + case Opt_posix: + mnt->flags |= NFS_MOUNT_POSIX; + break; + case Opt_noposix: + mnt->flags &= ~NFS_MOUNT_POSIX; + break; + case Opt_cto: + mnt->flags &= ~NFS_MOUNT_NOCTO; + break; + case Opt_nocto: + mnt->flags |= NFS_MOUNT_NOCTO; + break; + case Opt_ac: + mnt->flags &= ~NFS_MOUNT_NOAC; + break; + case Opt_noac: + mnt->flags |= NFS_MOUNT_NOAC; + break; + case Opt_lock: + mnt->flags &= ~NFS_MOUNT_NONLM; + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_nolock: + mnt->flags |= NFS_MOUNT_NONLM; + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_v2: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; + break; + case Opt_v3: + mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; + break; + case Opt_v4: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; + case Opt_udp: + mnt->flags &= ~NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_tcp: + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_rdma: + mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(p); + break; + case Opt_acl: + mnt->flags &= ~NFS_MOUNT_NOACL; + break; + case Opt_noacl: + mnt->flags |= NFS_MOUNT_NOACL; + break; + case Opt_rdirplus: + mnt->flags &= ~NFS_MOUNT_NORDIRPLUS; + break; + case Opt_nordirplus: + mnt->flags |= NFS_MOUNT_NORDIRPLUS; + break; + case Opt_sharecache: + mnt->flags &= ~NFS_MOUNT_UNSHARED; + break; + case Opt_nosharecache: + mnt->flags |= NFS_MOUNT_UNSHARED; + break; + case Opt_resvport: + mnt->flags &= ~NFS_MOUNT_NORESVPORT; + break; + case Opt_noresvport: + mnt->flags |= NFS_MOUNT_NORESVPORT; + break; + case Opt_fscache: + mnt->options |= NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_nofscache: + mnt->options &= ~NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + + /* + * options that take numeric values + */ + case Opt_port: + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) + goto out_invalid_value; + mnt->nfs_server.port = option; + break; + case Opt_rsize: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->rsize = option; + break; + case Opt_wsize: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->wsize = option; + break; + case 
Opt_bsize: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->bsize = option; + break; + case Opt_timeo: + if (nfs_get_option_ul(args, &option) || option == 0) + goto out_invalid_value; + mnt->timeo = option; + break; + case Opt_retrans: + if (nfs_get_option_ul(args, &option) || option == 0) + goto out_invalid_value; + mnt->retrans = option; + break; + case Opt_acregmin: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmin = option; + break; + case Opt_acregmax: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmax = option; + break; + case Opt_acdirmin: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acdirmin = option; + break; + case Opt_acdirmax: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acdirmax = option; + break; + case Opt_actimeo: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmin = mnt->acregmax = + mnt->acdirmin = mnt->acdirmax = option; + break; + case Opt_namelen: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->namlen = option; + break; + case Opt_mountport: + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) + goto out_invalid_value; + mnt->mount_server.port = option; + break; + case Opt_mountvers: + if (nfs_get_option_ul(args, &option) || + option < NFS_MNT_VERSION || + option > NFS_MNT3_VERSION) + goto out_invalid_value; + mnt->mount_server.version = option; + break; + case Opt_nfsvers: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + switch (option) { + case NFS2_VERSION: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; + break; + case NFS3_VERSION: + mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; + break; + case NFS4_VERSION: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; + default: + goto out_invalid_value; + } + break; + case Opt_minorversion: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + if (option > NFS4_MAX_MINOR_VERSION) + goto out_invalid_value; + mnt->minorversion = option; + break; + + /* + * options that take text values + */ + case Opt_sec: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = nfs_parse_security_flavors(string, mnt); + kfree(string); + if (!rc) { + dfprintk(MOUNT, "NFS: unrecognized " + "security flavor\n"); + return 0; + } + break; + case Opt_proto: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_xprt_protocol_tokens, args); + + protofamily = AF_INET; + switch (token) { + case Opt_xprt_udp6: + protofamily = AF_INET6; + case Opt_xprt_udp: + mnt->flags &= ~NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_xprt_tcp6: + protofamily = AF_INET6; + case Opt_xprt_tcp: + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_xprt_rdma: + /* vector side protocols to TCP */ + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; + xprt_load_transport(string); + break; + default: + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); + kfree(string); + return 0; + } + kfree(string); + break; + case Opt_mountproto: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_xprt_protocol_tokens, args); + kfree(string); + + mountfamily = AF_INET; + switch (token) { + case Opt_xprt_udp6: + mountfamily = AF_INET6; + case Opt_xprt_udp: + 
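+			/*
+			 * Deliberate fall-through: the *6 cases above only
+			 * record the IPv6 address family and then share the
+			 * transport assignment with their IPv4 counterparts,
+			 * the same pattern as in the Opt_proto handling.
+			 */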
mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_xprt_tcp6: + mountfamily = AF_INET6; + case Opt_xprt_tcp: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_xprt_rdma: /* not used for side protocols */ + default: + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); + return 0; + } + break; + case Opt_addr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + mnt->nfs_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + sizeof(mnt->nfs_server.address)); + kfree(string); + if (mnt->nfs_server.addrlen == 0) + goto out_invalid_address; + break; + case Opt_clientaddr: + if (nfs_get_option_str(args, &mnt->client_address)) + goto out_nomem; + break; + case Opt_mounthost: + if (nfs_get_option_str(args, + &mnt->mount_server.hostname)) + goto out_nomem; + break; + case Opt_mountaddr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + mnt->mount_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + sizeof(mnt->mount_server.address)); + kfree(string); + if (mnt->mount_server.addrlen == 0) + goto out_invalid_address; + break; + case Opt_lookupcache: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_lookupcache_tokens, args); + kfree(string); + switch (token) { + case Opt_lookupcache_all: + mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "lookupcache argument\n"); + return 0; + }; + break; + case Opt_fscache_uniq: + if (nfs_get_option_str(args, &mnt->fscache_uniq)) + goto out_nomem; + mnt->options |= NFS_OPTION_FSCACHE; + break; + case Opt_local_lock: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, nfs_local_lock_tokens, + args); + kfree(string); + switch (token) { + case Opt_local_lock_all: + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_local_lock_flock: + mnt->flags |= NFS_MOUNT_LOCAL_FLOCK; + break; + case Opt_local_lock_posix: + mnt->flags |= NFS_MOUNT_LOCAL_FCNTL; + break; + case Opt_local_lock_none: + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "local_lock argument\n"); + return 0; + }; + break; + + /* + * Special options + */ + case Opt_sloppy: + sloppy = 1; + dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); + break; + case Opt_userspace: + case Opt_deprecated: + dfprintk(MOUNT, "NFS: ignoring mount option " + "'%s'\n", p); + break; + + default: + invalid_option = 1; + dfprintk(MOUNT, "NFS: unrecognized mount option " + "'%s'\n", p); + } + } + + if (!sloppy && invalid_option) + return 0; + + /* + * verify that any proto=/mountproto= options match the address + * familiies in the addr=/mountaddr= options. 
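+ *
+ * For example, proto=tcp6 combined with an addr= that holds an IPv4
+ * server address fails with "server address does not match proto=
+ * option". mountproto= is checked against mountaddr= when one was
+ * given, otherwise against addr=.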
+ */ + if (protofamily != AF_UNSPEC && + protofamily != mnt->nfs_server.address.ss_family) + goto out_proto_mismatch; + + if (mountfamily != AF_UNSPEC) { + if (mnt->mount_server.addrlen) { + if (mountfamily != mnt->mount_server.address.ss_family) + goto out_mountproto_mismatch; + } else { + if (mountfamily != mnt->nfs_server.address.ss_family) + goto out_mountproto_mismatch; + } + } + + return 1; + +out_mountproto_mismatch: + printk(KERN_INFO "NFS: mount server address does not match mountproto= " + "option\n"); + return 0; +out_proto_mismatch: + printk(KERN_INFO "NFS: server address does not match proto= option\n"); + return 0; +out_invalid_address: + printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); + return 0; +out_invalid_value: + printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); + return 0; +out_nomem: + printk(KERN_INFO "NFS: not enough memory to parse option\n"); + return 0; +out_security_failure: + free_secdata(secdata); + printk(KERN_INFO "NFS: security options invalid: %d\n", rc); + return 0; +} + +/* + * Match the requested auth flavors with the list returned by + * the server. Returns zero and sets the mount's authentication + * flavor on success; returns -EACCES if server does not support + * the requested flavor. + */ +static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, + struct nfs_mount_request *request) +{ + unsigned int i, j, server_authlist_len = *(request->auth_flav_len); + + /* + * Certain releases of Linux's mountd return an empty + * flavor list. To prevent behavioral regression with + * these servers (ie. rejecting mounts that used to + * succeed), revert to pre-2.6.32 behavior (no checking) + * if the returned flavor list is empty. + */ + if (server_authlist_len == 0) + return 0; + + /* + * We avoid sophisticated negotiating here, as there are + * plenty of cases where we can get it wrong, providing + * either too little or too much security. + * + * RFC 2623, section 2.7 suggests we SHOULD prefer the + * flavor listed first. However, some servers list + * AUTH_NULL first. Our caller plants AUTH_SYS, the + * preferred default, in args->auth_flavors[0] if user + * didn't specify sec= mount option. + */ + for (i = 0; i < args->auth_flavor_len; i++) + for (j = 0; j < server_authlist_len; j++) + if (args->auth_flavors[i] == request->auth_flavs[j]) { + dfprintk(MOUNT, "NFS: using auth flavor %d\n", + request->auth_flavs[j]); + args->auth_flavors[0] = request->auth_flavs[j]; + return 0; + } + + dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); + nfs_umount(request); + return -EACCES; +} + +/* + * Use the remote server's MOUNT service to request the NFS file handle + * corresponding to the provided path. 
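+ *
+ * In outline: pick the MOUNT protocol version from the NFS version when
+ * mountvers= was not given (v2 -> MNTv1, otherwise MNTv3), fall back to
+ * the NFS server's address when no mountaddr= was supplied, issue the
+ * MNT call, and for MNTv3 check the returned auth flavor list against
+ * the flavor requested on the command line.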
+ */ +static int nfs_try_mount(struct nfs_parsed_mount_data *args, + struct nfs_fh *root_fh) +{ + rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; + unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); + struct nfs_mount_request request = { + .sap = (struct sockaddr *) + &args->mount_server.address, + .dirpath = args->nfs_server.export_path, + .protocol = args->mount_server.protocol, + .fh = root_fh, + .noresvport = args->flags & NFS_MOUNT_NORESVPORT, + .auth_flav_len = &server_authlist_len, + .auth_flavs = server_authlist, + }; + int status; + + if (args->mount_server.version == 0) { + switch (args->version) { + default: + args->mount_server.version = NFS_MNT3_VERSION; + break; + case 2: + args->mount_server.version = NFS_MNT_VERSION; + } + } + request.version = args->mount_server.version; + + if (args->mount_server.hostname) + request.hostname = args->mount_server.hostname; + else + request.hostname = args->nfs_server.hostname; + + /* + * Construct the mount server's address. + */ + if (args->mount_server.address.ss_family == AF_UNSPEC) { + memcpy(request.sap, &args->nfs_server.address, + args->nfs_server.addrlen); + args->mount_server.addrlen = args->nfs_server.addrlen; + } + request.salen = args->mount_server.addrlen; + nfs_set_port(request.sap, &args->mount_server.port, 0); + + /* + * Now ask the mount server to map our export path + * to a file handle. + */ + status = nfs_mount(&request); + if (status != 0) { + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", + request.hostname, status); + return status; + } + + /* + * MNTv1 (NFSv2) does not support auth flavor negotiation. + */ + if (args->mount_server.version != NFS_MNT3_VERSION) + return 0; + return nfs_walk_authlist(args, &request); +} + +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *end; + + /* Is the host name protected with square brakcets? */ + if (*dev_name == '[') { + end = strchr(++dev_name, ']'); + if (end == NULL || end[1] != ':') + goto out_bad_devname; + + len = end - dev_name; + end++; + } else { + char *comma; + + end = strchr(dev_name, ':'); + if (end == NULL) + goto out_bad_devname; + len = end - dev_name; + + /* kill possible hostname list: not supported */ + comma = strchr(dev_name, ','); + if (comma != NULL && comma < end) + *comma = 0; + } + + if (len > maxnamlen) + goto out_hostname; + + /* N.B. 
caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (*hostname == NULL) + goto out_nomem; + len = strlen(++end); + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(end, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; +} + +/* + * Validate the NFS2/NFS3 mount data + * - fills in the mount root filehandle + * + * For option strings, user space handles the following behaviors: + * + * + DNS: mapping server host name to IP address ("addr=" option) + * + * + failure mode: how to behave if a mount request can't be handled + * immediately ("fg/bg" option) + * + * + retry: how often to retry a mount request ("retry=" option) + * + * + breaking back: trying proto=udp after proto=tcp, v2 after v3, + * mountproto=tcp after mountproto=udp, and so on + */ +static int nfs_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) +{ + struct nfs_mount_data *data = (struct nfs_mount_data *)options; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + if (data == NULL) + goto out_no_data; + + switch (data->version) { + case 1: + data->namlen = 0; + case 2: + data->bsize = 0; + case 3: + if (data->flags & NFS_MOUNT_VER3) + goto out_no_v3; + data->root.size = NFS2_FHSIZE; + memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + case 4: + if (data->flags & NFS_MOUNT_SECFLAVOUR) + goto out_no_sec; + case 5: + memset(data->context, 0, sizeof(data->context)); + case 6: + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; + mntfh->size = data->root.size; + args->version = 3; + } else { + mntfh->size = NFS2_FHSIZE; + args->version = 2; + } + + + memcpy(mntfh->data, data->root.data, mntfh->size); + if (mntfh->size < sizeof(mntfh->data)) + memset(mntfh->data + mntfh->size, 0, + sizeof(mntfh->data) - mntfh->size); + + /* + * Translate to nfs_parsed_mount_data, which nfs_fill_super + * can deal with. + */ + args->flags = data->flags & NFS_MOUNT_FLAGMASK; + args->flags |= NFS_MOUNT_LEGACY_INTERFACE; + args->rsize = data->rsize; + args->wsize = data->wsize; + args->timeo = data->timeo; + args->retrans = data->retrans; + args->acregmin = data->acregmin; + args->acregmax = data->acregmax; + args->acdirmin = data->acdirmin; + args->acdirmax = data->acdirmax; + + memcpy(sap, &data->addr, sizeof(data->addr)); + args->nfs_server.addrlen = sizeof(data->addr); + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if (!(data->flags & NFS_MOUNT_TCP)) + args->nfs_server.protocol = XPRT_TRANSPORT_UDP; + /* N.B. 
caller will free nfs_server.hostname in all cases */ + args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); + args->namlen = data->namlen; + args->bsize = data->bsize; + + if (data->flags & NFS_MOUNT_SECFLAVOUR) + args->auth_flavors[0] = data->pseudoflavor; + if (!args->nfs_server.hostname) + goto out_nomem; + + if (!(data->flags & NFS_MOUNT_NONLM)) + args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + else + args->flags |= (NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + /* + * The legacy version 6 binary mount data from userspace has a + * field used only to transport selinux information into the + * the kernel. To continue to support that functionality we + * have a touch of selinux knowledge here in the NFS code. The + * userspace code converted context=blah to just blah so we are + * converting back to the full string selinux understands. + */ + if (data->context[0]){ +#ifdef CONFIG_SECURITY_SELINUX + int rc; + char *opts_str = kmalloc(sizeof(data->context) + 8, GFP_KERNEL); + if (!opts_str) + return -ENOMEM; + strcpy(opts_str, "context="); + data->context[NFS_MAX_CONTEXT_LEN] = '\0'; + strcat(opts_str, &data->context[0]); + rc = security_sb_parse_opts_str(opts_str, &args->lsm_opts); + kfree(opts_str); + if (rc) + return rc; +#else + return -EINVAL; +#endif + } + + break; + default: { + int status; + + if (nfs_parse_mount_options((char *)options, args) == 0) + return -EINVAL; + + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if (args->version == 4) +#ifdef CONFIG_NFS_V4 + return nfs4_validate_text_mount_data(options, + args, dev_name); +#else + goto out_v4_not_compiled; +#endif + + nfs_set_port(sap, &args->nfs_server.port, 0); + + nfs_set_mount_transport_protocol(args); + + status = nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + PAGE_SIZE, + &args->nfs_server.export_path, + NFS_MAXPATHLEN); + if (!status) + status = nfs_try_mount(args, mntfh); + + kfree(args->nfs_server.export_path); + args->nfs_server.export_path = NULL; + + if (status) + return status; + + break; + } + } + +#ifndef CONFIG_NFS_V3 + if (args->version == 3) + goto out_v3_not_compiled; +#endif /* !CONFIG_NFS_V3 */ + + return 0; + +out_no_data: + dfprintk(MOUNT, "NFS: mount program didn't pass any mount data\n"); + return -EINVAL; + +out_no_v3: + dfprintk(MOUNT, "NFS: nfs_mount_data version %d does not support v3\n", + data->version); + return -EINVAL; + +out_no_sec: + dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); + return -EINVAL; + +#ifndef CONFIG_NFS_V3 +out_v3_not_compiled: + dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V3 */ + +#ifndef CONFIG_NFS_V4 +out_v4_not_compiled: + dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V4 */ + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); + return -ENOMEM; + +out_no_address: + dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); + return -EINVAL; + +out_invalid_fh: + dfprintk(MOUNT, "NFS: invalid root filehandle\n"); + return -EINVAL; +} + +static int +nfs_compare_remount_data(struct nfs_server *nfss, + struct nfs_parsed_mount_data *data) +{ + if (data->flags != nfss->flags || + data->rsize != nfss->rsize || + data->wsize != nfss->wsize || + data->retrans != nfss->client->cl_timeout->to_retries || + data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || + data->acregmin != nfss->acregmin / HZ || + 
data->acregmax != nfss->acregmax / HZ || + data->acdirmin != nfss->acdirmin / HZ || + data->acdirmax != nfss->acdirmax / HZ || + data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || + data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || + !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, + (struct sockaddr *)&nfss->nfs_client->cl_addr)) + return -EINVAL; + + return 0; +} + +static int +nfs_remount(struct super_block *sb, int *flags, char *raw_data) +{ + int error; + struct nfs_server *nfss = sb->s_fs_info; + struct nfs_parsed_mount_data *data; + struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; + struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; + u32 nfsvers = nfss->nfs_client->rpc_ops->version; + + /* + * Userspace mount programs that send binary options generally send + * them populated with default values. We have no way to know which + * ones were explicitly specified. Fall back to legacy behavior and + * just return success. + */ + if ((nfsvers == 4 && (!options4 || options4->version == 1)) || + (nfsvers <= 3 && (!options || (options->version >= 1 && + options->version <= 6)))) + return 0; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + /* fill out struct with values from existing mount */ + data->flags = nfss->flags; + data->rsize = nfss->rsize; + data->wsize = nfss->wsize; + data->retrans = nfss->client->cl_timeout->to_retries; + data->auth_flavors[0] = nfss->client->cl_auth->au_flavor; + data->acregmin = nfss->acregmin / HZ; + data->acregmax = nfss->acregmax / HZ; + data->acdirmin = nfss->acdirmin / HZ; + data->acdirmax = nfss->acdirmax / HZ; + data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; + data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen); + + /* overwrite those values with any that were specified */ + error = nfs_parse_mount_options((char *)options, data); + if (error < 0) + goto out; + + /* + * noac is a special case. It implies -o sync, but that's not + * necessarily reflected in the mtab options. do_remount_sb + * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * remount options, so we have to explicitly reset it. 
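+ *
+ * In other words, "mount -o remount,noac" must end up with
+ * MS_SYNCHRONOUS set on the superblock even though "sync" never
+ * appeared in the remount string, so it is forced back on below.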
+ */ + if (data->flags & NFS_MOUNT_NOAC) + *flags |= MS_SYNCHRONOUS; + + /* compare new mount options with old ones */ + error = nfs_compare_remount_data(nfss, data); +out: + kfree(data); + return error; +} + +/* + * Initialise the common bits of the superblock + */ +static inline void nfs_initialise_sb(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + sb->s_magic = NFS_SUPER_MAGIC; + + /* We probably want something more informative here */ + snprintf(sb->s_id, sizeof(sb->s_id), + "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); + + if (sb->s_blocksize == 0) + sb->s_blocksize = nfs_block_bits(server->wsize, + &sb->s_blocksize_bits); + + if (server->flags & NFS_MOUNT_NOAC) + sb->s_flags |= MS_SYNCHRONOUS; + + sb->s_bdi = &server->backing_dev_info; + + nfs_super_set_maxbytes(sb, server->maxfilesize); +} + +/* + * Finish setting up an NFS2/3 superblock + */ +static void nfs_fill_super(struct super_block *sb, + struct nfs_parsed_mount_data *data) +{ + struct nfs_server *server = NFS_SB(sb); + + sb->s_blocksize_bits = 0; + sb->s_blocksize = 0; + if (data->bsize) + sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); + + if (server->nfs_client->rpc_ops->version == 3) { + /* The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_time_gran = 1; + } + + sb->s_op = &nfs_sops; + nfs_initialise_sb(sb); +} + +/* + * Finish setting up a cloned NFS2/3 superblock + */ +static void nfs_clone_super(struct super_block *sb, + const struct super_block *old_sb) +{ + struct nfs_server *server = NFS_SB(sb); + + sb->s_blocksize_bits = old_sb->s_blocksize_bits; + sb->s_blocksize = old_sb->s_blocksize; + sb->s_maxbytes = old_sb->s_maxbytes; + + if (server->nfs_client->rpc_ops->version == 3) { + /* The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. 
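+	 * Setting MS_POSIXACL below is what makes the VFS skip applying
+	 * the umask at create time; the NFS code applies it itself when
+	 * necessary instead.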
+ */ + sb->s_flags |= MS_POSIXACL; + sb->s_time_gran = 1; + } + + sb->s_op = old_sb->s_op; + nfs_initialise_sb(sb); +} + +static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) +{ + const struct nfs_server *a = s->s_fs_info; + const struct rpc_clnt *clnt_a = a->client; + const struct rpc_clnt *clnt_b = b->client; + + if ((s->s_flags & NFS_MS_MASK) != (flags & NFS_MS_MASK)) + goto Ebusy; + if (a->nfs_client != b->nfs_client) + goto Ebusy; + if (a->flags != b->flags) + goto Ebusy; + if (a->wsize != b->wsize) + goto Ebusy; + if (a->rsize != b->rsize) + goto Ebusy; + if (a->acregmin != b->acregmin) + goto Ebusy; + if (a->acregmax != b->acregmax) + goto Ebusy; + if (a->acdirmin != b->acdirmin) + goto Ebusy; + if (a->acdirmax != b->acdirmax) + goto Ebusy; + if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + goto Ebusy; + return 1; +Ebusy: + return 0; +} + +struct nfs_sb_mountdata { + struct nfs_server *server; + int mntflags; +}; + +static int nfs_set_super(struct super_block *s, void *data) +{ + struct nfs_sb_mountdata *sb_mntdata = data; + struct nfs_server *server = sb_mntdata->server; + int ret; + + s->s_flags = sb_mntdata->mntflags; + s->s_fs_info = server; + s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; + ret = set_anon_super(s, server); + if (ret == 0) + server->s_dev = s->s_dev; + return ret; +} + +static int nfs_compare_super_address(struct nfs_server *server1, + struct nfs_server *server2) +{ + struct sockaddr *sap1, *sap2; + + sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; + sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; + + if (sap1->sa_family != sap2->sa_family) + return 0; + + switch (sap1->sa_family) { + case AF_INET: { + struct sockaddr_in *sin1 = (struct sockaddr_in *)sap1; + struct sockaddr_in *sin2 = (struct sockaddr_in *)sap2; + if (sin1->sin_addr.s_addr != sin2->sin_addr.s_addr) + return 0; + if (sin1->sin_port != sin2->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin1 = (struct sockaddr_in6 *)sap1; + struct sockaddr_in6 *sin2 = (struct sockaddr_in6 *)sap2; + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) + return 0; + if (sin1->sin6_port != sin2->sin6_port) + return 0; + break; + } + default: + return 0; + } + + return 1; +} + +static int nfs_compare_super(struct super_block *sb, void *data) +{ + struct nfs_sb_mountdata *sb_mntdata = data; + struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb); + int mntflags = sb_mntdata->mntflags; + + if (!nfs_compare_super_address(old, server)) + return 0; + /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */ + if (old->flags & NFS_MOUNT_UNSHARED) + return 0; + if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0) + return 0; + return nfs_compare_mount_options(sb, server, mntflags); +} + +static int nfs_bdi_register(struct nfs_server *server) +{ + return bdi_register_dev(&server->backing_dev_info, server->s_dev); +} + +static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_server *server = NULL; + struct super_block *s; + struct nfs_parsed_mount_data *data; + struct nfs_fh *mntfh; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error; + + data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); + mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + 
goto out; + + /* Validate the mount data */ + error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); + if (error < 0) { + mntroot = ERR_PTR(error); + goto out; + } + +#ifdef CONFIG_NFS_V4 + if (data->version == 4) { + mntroot = nfs4_try_mount(flags, dev_name, data); + goto out; + } +#endif /* CONFIG_NFS_V4 */ + + /* Get a volume representation */ + server = nfs_create_server(data, mntfh); + if (IS_ERR(server)) { + mntroot = ERR_CAST(server); + goto out; + } + sb_mntdata.server = server; + + if (server->flags & NFS_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + mntroot = ERR_CAST(s); + goto out_err_nosb; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) { + mntroot = ERR_PTR(error); + goto error_splat_bdi; + } + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs_fill_super(s, data); + nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL); + } + + mntroot = nfs_get_root(s, mntfh, dev_name); + if (IS_ERR(mntroot)) + goto error_splat_super; + + error = security_sb_set_mnt_opts(s, &data->lsm_opts); + if (error) + goto error_splat_root; + + s->s_flags |= MS_ACTIVE; + +out: + nfs_free_parsed_mount_data(data); + nfs_free_fhandle(mntfh); + return mntroot; + +out_err_nosb: + nfs_free_server(server); + goto out; + +error_splat_root: + dput(mntroot); + mntroot = ERR_PTR(error); +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + goto out; +} + +/* + * Ensure that we unregister the bdi before kill_anon_super + * releases the device name + */ +static void nfs_put_super(struct super_block *s) +{ + struct nfs_server *server = NFS_SB(s); + + bdi_unregister(&server->backing_dev_info); +} + +/* + * Destroy an NFS2/3 superblock + */ +static void nfs_kill_super(struct super_block *s) +{ + struct nfs_server *server = NFS_SB(s); + + kill_anon_super(s); + nfs_fscache_release_super_cookie(s); + nfs_free_server(server); +} + +/* + * Clone an NFS2/3 server record on xdev traversal (FSID-change) + */ +static struct dentry * +nfs_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error; + + dprintk("--> nfs_xdev_mount()\n"); + + /* create a new volume representation */ + server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; + } + sb_mntdata.server = server; + + if (server->flags & NFS_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_nosb; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_bdi; + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs_clone_super(s, data->sb); + nfs_fscache_get_super_cookie(s, NULL, data); + 
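+		/*
+		 * If sget() instead returned an already-active superblock
+		 * (s->s_root set), this clone and cache-cookie setup is
+		 * skipped and the freshly created server record was freed
+		 * further up.
+		 */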
} + + mntroot = nfs_get_root(s, data->fh, dev_name); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } + if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { + dput(mntroot); + error = -ESTALE; + goto error_splat_super; + } + + s->s_flags |= MS_ACTIVE; + + /* clone any lsm security options from the parent to the new sb */ + security_sb_clone_mnt_opts(data->sb, s); + + dprintk("<-- nfs_xdev_mount() = 0\n"); + return mntroot; + +out_err_nosb: + nfs_free_server(server); +out_err_noserver: + dprintk("<-- nfs_xdev_mount() = %d [error]\n", error); + return ERR_PTR(error); + +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error); + return ERR_PTR(error); +} + +#ifdef CONFIG_NFS_V4 + +/* + * Finish setting up a cloned NFS4 superblock + */ +static void nfs4_clone_super(struct super_block *sb, + const struct super_block *old_sb) +{ + sb->s_blocksize_bits = old_sb->s_blocksize_bits; + sb->s_blocksize = old_sb->s_blocksize; + sb->s_maxbytes = old_sb->s_maxbytes; + sb->s_time_gran = 1; + sb->s_op = old_sb->s_op; + /* + * The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_xattr = old_sb->s_xattr; + nfs_initialise_sb(sb); +} + +/* + * Set up an NFS4 superblock + */ +static void nfs4_fill_super(struct super_block *sb) +{ + sb->s_time_gran = 1; + sb->s_op = &nfs4_sops; + /* + * The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_xattr = nfs4_xattr_handlers; + nfs_initialise_sb(sb); +} + +static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) +{ + args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3| + NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL); +} + +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); + + nfs_validate_transport_protocol(args); + + nfs4_validate_mount_flags(args); + + if (args->version != 4) { + dfprintk(MOUNT, + "NFS4: Illegal mount version\n"); + return -EINVAL; + } + + if (args->auth_flavor_len > 1) { + dfprintk(MOUNT, + "NFS4: Too many RPC auth flavours specified\n"); + return -EINVAL; + } + + if (args->client_address == NULL) { + dfprintk(MOUNT, + "NFS4: mount program didn't pass callback address\n"); + return -EINVAL; + } + + return nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + NFS4_MAXNAMLEN, + &args->nfs_server.export_path, + NFS4_MAXPATHLEN); +} + +/* + * Validate NFSv4 mount options + */ +static int nfs4_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; + char *c; + + if (data == NULL) + goto out_no_data; + + switch (data->version) { + case 1: + if (data->host_addrlen > sizeof(args->nfs_server.address)) + goto out_no_address; + if (data->host_addrlen == 0) + goto out_no_address; + args->nfs_server.addrlen = data->host_addrlen; + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) + return -EFAULT; + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if 
(data->auth_flavourlen) { + if (data->auth_flavourlen > 1) + goto out_inval_auth; + if (copy_from_user(&args->auth_flavors[0], + data->auth_flavours, + sizeof(args->auth_flavors[0]))) + return -EFAULT; + } + + c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + args->nfs_server.hostname = c; + + c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + args->nfs_server.export_path = c; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + + c = strndup_user(data->client_addr.data, 16); + if (IS_ERR(c)) + return PTR_ERR(c); + args->client_address = c; + + /* + * Translate to nfs_parsed_mount_data, which nfs4_fill_super + * can deal with. + */ + + args->flags = data->flags & NFS4_MOUNT_FLAGMASK; + args->rsize = data->rsize; + args->wsize = data->wsize; + args->timeo = data->timeo; + args->retrans = data->retrans; + args->acregmin = data->acregmin; + args->acregmax = data->acregmax; + args->acdirmin = data->acdirmin; + args->acdirmax = data->acdirmax; + args->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(args); + + break; + default: + if (nfs_parse_mount_options((char *)options, args) == 0) + return -EINVAL; + + if (!nfs_verify_server_address(sap)) + return -EINVAL; + + return nfs4_validate_text_mount_data(options, args, dev_name); + } + + return 0; + +out_no_data: + dfprintk(MOUNT, "NFS4: mount program didn't pass any mount data\n"); + return -EINVAL; + +out_inval_auth: + dfprintk(MOUNT, "NFS4: Invalid number of RPC auth flavours %d\n", + data->auth_flavourlen); + return -EINVAL; + +out_no_address: + dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); + return -EINVAL; +} + +/* + * Get the superblock for the NFS4 root partition + */ +static struct dentry * +nfs4_remote_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_parsed_mount_data *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct nfs_fh *mntfh; + struct dentry *mntroot; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error = -ENOMEM; + + mntfh = nfs_alloc_fhandle(); + if (data == NULL || mntfh == NULL) + goto out; + + /* Get a volume representation */ + server = nfs4_create_server(data, mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out; + } + sb_mntdata.server = server; + + if (server->flags & NFS4_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_free; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_bdi; + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs4_fill_super(s); + nfs_fscache_get_super_cookie( + s, data ? 
data->fscache_uniq : NULL, NULL); + } + + mntroot = nfs4_get_root(s, mntfh, dev_name); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } + + error = security_sb_set_mnt_opts(s, &data->lsm_opts); + if (error) + goto error_splat_root; + + s->s_flags |= MS_ACTIVE; + + nfs_free_fhandle(mntfh); + return mntroot; + +out: + nfs_free_fhandle(mntfh); + return ERR_PTR(error); + +out_free: + nfs_free_server(server); + goto out; + +error_splat_root: + dput(mntroot); +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + goto out; +} + +static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, + int flags, void *data, const char *hostname) +{ + struct vfsmount *root_mnt; + char *root_devname; + size_t len; + + len = strlen(hostname) + 5; + root_devname = kmalloc(len, GFP_KERNEL); + if (root_devname == NULL) + return ERR_PTR(-ENOMEM); + /* Does hostname needs to be enclosed in brackets? */ + if (strchr(hostname, ':')) + snprintf(root_devname, len, "[%s]:/", hostname); + else + snprintf(root_devname, len, "%s:/", hostname); + root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); + kfree(root_devname); + return root_mnt; +} + +struct nfs_referral_count { + struct list_head list; + const struct task_struct *task; + unsigned int referral_count; +}; + +static LIST_HEAD(nfs_referral_count_list); +static DEFINE_SPINLOCK(nfs_referral_count_list_lock); + +static struct nfs_referral_count *nfs_find_referral_count(void) +{ + struct nfs_referral_count *p; + + list_for_each_entry(p, &nfs_referral_count_list, list) { + if (p->task == current) + return p; + } + return NULL; +} + +#define NFS_MAX_NESTED_REFERRALS 2 + +static int nfs_referral_loop_protect(void) +{ + struct nfs_referral_count *p, *new; + int ret = -ENOMEM; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out; + new->task = current; + new->referral_count = 1; + + ret = 0; + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + if (p != NULL) { + if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) + ret = -ELOOP; + else + p->referral_count++; + } else { + list_add(&new->list, &nfs_referral_count_list); + new = NULL; + } + spin_unlock(&nfs_referral_count_list_lock); + kfree(new); +out: + return ret; +} + +static void nfs_referral_loop_unprotect(void) +{ + struct nfs_referral_count *p; + + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + p->referral_count--; + if (p->referral_count == 0) + list_del(&p->list); + else + p = NULL; + spin_unlock(&nfs_referral_count_list_lock); + kfree(p); +} + +static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, + const char *export_path) +{ + struct nameidata *nd = NULL; + struct mnt_namespace *ns_private; + struct super_block *s; + struct dentry *dentry; + int ret; + + nd = kmalloc(sizeof(*nd), GFP_KERNEL); + if (nd == NULL) + return ERR_PTR(-ENOMEM); + + ns_private = create_mnt_ns(root_mnt); + ret = PTR_ERR(ns_private); + if (IS_ERR(ns_private)) + goto out_mntput; + + ret = nfs_referral_loop_protect(); + if (ret != 0) + goto out_put_mnt_ns; + + ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, + export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, nd); + + nfs_referral_loop_unprotect(); + put_mnt_ns(ns_private); + + if (ret != 0) + goto out_err; + + s = nd->path.mnt->mnt_sb; + atomic_inc(&s->s_active); + dentry = dget(nd->path.dentry); + + path_put(&nd->path); + kfree(nd); + down_write(&s->s_umount); + 
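/* return with s_umount held: the VFS drops it once ->mount() completes */ +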
return dentry; +out_put_mnt_ns: + put_mnt_ns(ns_private); +out_mntput: + mntput(root_mnt); +out_err: + kfree(nd); + return ERR_PTR(ret); +} + +static struct dentry *nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data) +{ + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + + dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + + export_path = data->nfs_server.export_path; + data->nfs_server.export_path = "/"; + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, + data->nfs_server.hostname); + data->nfs_server.export_path = export_path; + + res = ERR_CAST(root_mnt); + if (!IS_ERR(root_mnt)) + res = nfs_follow_remote_path(root_mnt, export_path); + + dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", + IS_ERR(res) ? PTR_ERR(res) : 0, + IS_ERR(res) ? " [error]" : ""); + return res; +} + +/* + * Get the superblock for an NFS4 mountpoint + */ +static struct dentry *nfs4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_parsed_mount_data *data; + int error = -ENOMEM; + struct dentry *res = ERR_PTR(-ENOMEM); + + data = nfs_alloc_parsed_mount_data(4); + if (data == NULL) + goto out; + + /* Validate the mount data */ + error = nfs4_validate_mount_data(raw_data, data, dev_name); + if (error < 0) { + res = ERR_PTR(error); + goto out; + } + + res = nfs4_try_mount(flags, dev_name, data); + if (IS_ERR(res)) + error = PTR_ERR(res); + +out: + nfs_free_parsed_mount_data(data); + dprintk("<-- nfs4_mount() = %d%s\n", error, + error != 0 ? " [error]" : ""); + return res; +} + +static void nfs4_kill_super(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + dprintk("--> %s\n", __func__); + nfs_super_return_all_delegations(sb); + kill_anon_super(sb); + nfs_fscache_release_super_cookie(sb); + nfs_free_server(server); + dprintk("<-- %s\n", __func__); +} + +/* + * Clone an NFS4 server record on xdev traversal (FSID-change) + */ +static struct dentry * +nfs4_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error; + + dprintk("--> nfs4_xdev_mount()\n"); + + /* create a new volume representation */ + server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; + } + sb_mntdata.server = server; + + if (server->flags & NFS4_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_nosb; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_bdi; + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs4_clone_super(s, data->sb); + nfs_fscache_get_super_cookie(s, NULL, data); + } + + mntroot = nfs4_get_root(s, data->fh, dev_name); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } + if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { + dput(mntroot); + error = -ESTALE; + goto error_splat_super; + } + + s->s_flags |= MS_ACTIVE; 
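+ /* clone any lsm security options from the parent to the new sb */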
+ + security_sb_clone_mnt_opts(data->sb, s); + + dprintk("<-- nfs4_xdev_mount() = 0\n"); + return mntroot; + +out_err_nosb: + nfs_free_server(server); +out_err_noserver: + dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error); + return ERR_PTR(error); + +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error); + return ERR_PTR(error); +} + +static struct dentry * +nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; + struct nfs_fh *mntfh; + int (*compare_super)(struct super_block *, void *) = nfs_compare_super; + struct nfs_sb_mountdata sb_mntdata = { + .mntflags = flags, + }; + int error = -ENOMEM; + + dprintk("--> nfs4_referral_get_sb()\n"); + + mntfh = nfs_alloc_fhandle(); + if (mntfh == NULL) + goto out_err_nofh; + + /* create a new volume representation */ + server = nfs4_create_referral_server(data, mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; + } + sb_mntdata.server = server; + + if (server->flags & NFS4_MOUNT_UNSHARED) + compare_super = NULL; + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_nosb; + } + + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_bdi; + } + + if (!s->s_root) { + /* initial superblock/root creation */ + nfs4_fill_super(s); + nfs_fscache_get_super_cookie(s, NULL, data); + } + + mntroot = nfs4_get_root(s, mntfh, dev_name); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } + if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { + dput(mntroot); + error = -ESTALE; + goto error_splat_super; + } + + s->s_flags |= MS_ACTIVE; + + security_sb_clone_mnt_opts(data->sb, s); + + nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = 0\n"); + return mntroot; + +out_err_nosb: + nfs_free_server(server); +out_err_noserver: + nfs_free_fhandle(mntfh); +out_err_nofh: + dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); + return ERR_PTR(error); + +error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); + nfs_free_fhandle(mntfh); + dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); + return ERR_PTR(error); +} + +/* + * Create an NFS4 server record on referral traversal + */ +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + + dprintk("--> nfs4_referral_mount()\n"); + + export_path = data->mnt_path; + data->mnt_path = "/"; + + root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type, + flags, data, data->hostname); + data->mnt_path = export_path; + + res = ERR_CAST(root_mnt); + if (!IS_ERR(root_mnt)) + res = nfs_follow_remote_path(root_mnt, export_path); + dprintk("<-- nfs4_referral_mount() = %ld%s\n", + IS_ERR(res) ? PTR_ERR(res) : 0, + IS_ERR(res) ? 
" [error]" : ""); + return res; +} + +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c new file mode 100644 index 00000000..05c9e02f --- /dev/null +++ b/fs/nfs/symlink.c @@ -0,0 +1,78 @@ +/* + * linux/fs/nfs/symlink.c + * + * Copyright (C) 1992 Rick Sladkey + * + * Optimization changes Copyright (C) 1994 Florian La Roche + * + * Jun 7 1999, cache symlink lookups in the page cache. -DaveM + * + * nfs symlink handling code + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Symlink caching in the page cache is even more simplistic + * and straight-forward than readdir caching. + */ + +static int nfs_symlink_filler(struct inode *inode, struct page *page) +{ + int error; + + error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE); + if (error < 0) + goto error; + SetPageUptodate(page); + unlock_page(page); + return 0; + +error: + SetPageError(page); + unlock_page(page); + return -EIO; +} + +static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct page *page; + void *err; + + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); + if (err) + goto read_failed; + page = read_cache_page(&inode->i_data, 0, + (filler_t *)nfs_symlink_filler, inode); + if (IS_ERR(page)) { + err = page; + goto read_failed; + } + nd_set_link(nd, kmap(page)); + return page; + +read_failed: + nd_set_link(nd, err); + return NULL; +} + +/* + * symlinks can't do much... + */ +const struct inode_operations nfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = nfs_follow_link, + .put_link = page_put_link, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c new file mode 100644 index 00000000..978aaeb8 --- /dev/null +++ b/fs/nfs/sysctl.c @@ -0,0 +1,92 @@ +/* + * linux/fs/nfs/sysctl.c + * + * Sysctl interface to NFS parameters + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "callback.h" + +#ifdef CONFIG_NFS_V4 +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +#endif +static struct ctl_table_header *nfs_callback_sysctl_table; + +static ctl_table nfs_cb_sysctls[] = { +#ifdef CONFIG_NFS_V4 + { + .procname = "nfs_callback_tcpport", + .data = &nfs_callback_set_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&nfs_set_port_min, + .extra2 = (int *)&nfs_set_port_max, + }, +#ifndef CONFIG_NFS_USE_NEW_IDMAPPER + { + .procname = "idmap_cache_timeout", + .data = &nfs_idmap_cache_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ +#endif + { + .procname = "nfs_mountpoint_timeout", + .data = &nfs_mountpoint_expiry_timeout, + .maxlen = sizeof(nfs_mountpoint_expiry_timeout), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nfs_congestion_kb", + .data = &nfs_congestion_kb, + .maxlen = sizeof(nfs_congestion_kb), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static ctl_table nfs_cb_sysctl_dir[] = { + { + .procname = "nfs", + .mode = 0555, + .child = nfs_cb_sysctls, + }, + { } +}; + +static ctl_table nfs_cb_sysctl_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = nfs_cb_sysctl_dir, + }, + { } +}; + +int nfs_register_sysctl(void) +{ + nfs_callback_sysctl_table = 
register_sysctl_table(nfs_cb_sysctl_root); + if (nfs_callback_sysctl_table == NULL) + return -ENOMEM; + return 0; +} + +void nfs_unregister_sysctl(void) +{ + unregister_sysctl_table(nfs_callback_sysctl_table); + nfs_callback_sysctl_table = NULL; +} diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c new file mode 100644 index 00000000..8d6864c2 --- /dev/null +++ b/fs/nfs/unlink.c @@ -0,0 +1,580 @@ +/* + * linux/fs/nfs/unlink.c + * + * nfs sillydelete handling + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "nfs4_fs.h" +#include "iostat.h" +#include "delegation.h" + +struct nfs_unlinkdata { + struct hlist_node list; + struct nfs_removeargs args; + struct nfs_removeres res; + struct inode *dir; + struct rpc_cred *cred; + struct nfs_fattr dir_attr; +}; + +/** + * nfs_free_unlinkdata - release data from a sillydelete operation. + * @data: pointer to unlink structure. + */ +static void +nfs_free_unlinkdata(struct nfs_unlinkdata *data) +{ + iput(data->dir); + put_rpccred(data->cred); + kfree(data->args.name.name); + kfree(data); +} + +#define NAME_ALLOC_LEN(len) ((len+16) & ~15) +/** + * nfs_copy_dname - copy dentry name to data structure + * @dentry: pointer to dentry + * @data: nfs_unlinkdata + */ +static int nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data) +{ + char *str; + int len = dentry->d_name.len; + + str = kmemdup(dentry->d_name.name, NAME_ALLOC_LEN(len), GFP_KERNEL); + if (!str) + return -ENOMEM; + data->args.name.len = len; + data->args.name.name = str; + return 0; +} + +static void nfs_free_dname(struct nfs_unlinkdata *data) +{ + kfree(data->args.name.name); + data->args.name.name = NULL; + data->args.name.len = 0; +} + +static void nfs_dec_sillycount(struct inode *dir) +{ + struct nfs_inode *nfsi = NFS_I(dir); + if (atomic_dec_return(&nfsi->silly_count) == 1) + wake_up(&nfsi->waitqueue); +} + +/** + * nfs_async_unlink_done - Sillydelete post-processing + * @task: rpc_task of the sillydelete + * + * Do the directory attribute update. + */ +static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + struct inode *dir = data->dir; + + if (!NFS_PROTO(dir)->unlink_done(task, dir)) + nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); +} + +/** + * nfs_async_unlink_release - Release the sillydelete data. + * @task: rpc_task of the sillydelete + * + * We need to call nfs_put_unlinkdata as a 'tk_release' task since the + * rpc_task would be freed too. 
+ */ +static void nfs_async_unlink_release(void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + struct super_block *sb = data->dir->i_sb; + + nfs_dec_sillycount(data->dir); + nfs_free_unlinkdata(data); + nfs_sb_deactive(sb); +} + +#if defined(CONFIG_NFS_V4_1) +void nfs_unlink_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + + if (nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_unlink_ops = { + .rpc_call_done = nfs_async_unlink_done, + .rpc_release = nfs_async_unlink_release, +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_unlink_prepare, +#endif /* CONFIG_NFS_V4_1 */ +}; + +static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) +{ + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_message = &msg, + .callback_ops = &nfs_unlink_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + struct dentry *alias; + + alias = d_lookup(parent, &data->args.name); + if (alias != NULL) { + int ret = 0; + void *devname_garbage = NULL; + + /* + * Hey, we raced with lookup... See if we need to transfer + * the sillyrename information to the aliased dentry. + */ + nfs_free_dname(data); + spin_lock(&alias->d_lock); + if (alias->d_inode != NULL && + !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { + devname_garbage = alias->d_fsdata; + alias->d_fsdata = data; + alias->d_flags |= DCACHE_NFSFS_RENAMED; + ret = 1; + } + spin_unlock(&alias->d_lock); + nfs_dec_sillycount(dir); + dput(alias); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. 
+ */ + if (devname_garbage) + kfree(devname_garbage); + return ret; + } + data->dir = igrab(dir); + if (!data->dir) { + nfs_dec_sillycount(dir); + return 0; + } + nfs_sb_active(dir->i_sb); + data->args.fh = NFS_FH(dir); + nfs_fattr_init(data->res.dir_attr); + + NFS_PROTO(dir)->unlink_setup(&msg, dir); + + task_setup_data.rpc_client = NFS_CLIENT(dir); + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task_async(task); + return 1; +} + +static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) +{ + struct dentry *parent; + struct inode *dir; + int ret = 0; + + + parent = dget_parent(dentry); + if (parent == NULL) + goto out_free; + dir = parent->d_inode; + if (nfs_copy_dname(dentry, data) != 0) + goto out_dput; + /* Non-exclusive lock protects against concurrent lookup() calls */ + spin_lock(&dir->i_lock); + if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { + /* Deferred delete */ + hlist_add_head(&data->list, &NFS_I(dir)->silly_list); + spin_unlock(&dir->i_lock); + ret = 1; + goto out_dput; + } + spin_unlock(&dir->i_lock); + ret = nfs_do_call_unlink(parent, dir, data); +out_dput: + dput(parent); +out_free: + return ret; +} + +void nfs_block_sillyrename(struct dentry *dentry) +{ + struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + + wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1); +} + +void nfs_unblock_sillyrename(struct dentry *dentry) +{ + struct inode *dir = dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_unlinkdata *data; + + atomic_inc(&nfsi->silly_count); + spin_lock(&dir->i_lock); + while (!hlist_empty(&nfsi->silly_list)) { + if (!atomic_inc_not_zero(&nfsi->silly_count)) + break; + data = hlist_entry(nfsi->silly_list.first, struct nfs_unlinkdata, list); + hlist_del(&data->list); + spin_unlock(&dir->i_lock); + if (nfs_do_call_unlink(dentry, dir, data) == 0) + nfs_free_unlinkdata(data); + spin_lock(&dir->i_lock); + } + spin_unlock(&dir->i_lock); +} + +/** + * nfs_async_unlink - asynchronous unlinking of a file + * @dir: parent directory of dentry + * @dentry: dentry to unlink + */ +static int +nfs_async_unlink(struct inode *dir, struct dentry *dentry) +{ + struct nfs_unlinkdata *data; + int status = -ENOMEM; + void *devname_garbage = NULL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + goto out; + + data->cred = rpc_lookup_cred(); + if (IS_ERR(data->cred)) { + status = PTR_ERR(data->cred); + goto out_free; + } + data->res.dir_attr = &data->dir_attr; + + status = -EBUSY; + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto out_unlock; + dentry->d_flags |= DCACHE_NFSFS_RENAMED; + devname_garbage = dentry->d_fsdata; + dentry->d_fsdata = data; + spin_unlock(&dentry->d_lock); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. + */ + if (devname_garbage) + kfree(devname_garbage); + return 0; +out_unlock: + spin_unlock(&dentry->d_lock); + put_rpccred(data->cred); +out_free: + kfree(data); +out: + return status; +} + +/** + * nfs_complete_unlink - Initialize completion of the sillydelete + * @dentry: dentry to delete + * @inode: inode + * + * Since we're most likely to be called by dentry_iput(), we + * only use the dentry to find the sillydelete. We then copy the name + * into the qstr. 
+ */ +void +nfs_complete_unlink(struct dentry *dentry, struct inode *inode) +{ + struct nfs_unlinkdata *data = NULL; + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + data = dentry->d_fsdata; + dentry->d_fsdata = NULL; + } + spin_unlock(&dentry->d_lock); + + if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) + nfs_free_unlinkdata(data); +} + +/* Cancel a queued async unlink. Called when a sillyrename run fails. */ +static void +nfs_cancel_async_unlink(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + struct nfs_unlinkdata *data = dentry->d_fsdata; + + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + dentry->d_fsdata = NULL; + spin_unlock(&dentry->d_lock); + nfs_free_unlinkdata(data); + return; + } + spin_unlock(&dentry->d_lock); +} + +struct nfs_renamedata { + struct nfs_renameargs args; + struct nfs_renameres res; + struct rpc_cred *cred; + struct inode *old_dir; + struct dentry *old_dentry; + struct nfs_fattr old_fattr; + struct inode *new_dir; + struct dentry *new_dentry; + struct nfs_fattr new_fattr; +}; + +/** + * nfs_async_rename_done - Sillyrename post-processing + * @task: rpc_task of the sillyrename + * @calldata: nfs_renamedata for the sillyrename + * + * Do the directory attribute updates and the d_move + */ +static void nfs_async_rename_done(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct inode *old_dir = data->old_dir; + struct inode *new_dir = data->new_dir; + + if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { + nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); + return; + } + + if (task->tk_status != 0) { + nfs_cancel_async_unlink(data->old_dentry); + return; + } + + nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); + d_move(data->old_dentry, data->new_dentry); +} + +/** + * nfs_async_rename_release - Release the sillyrename data. 
+ * @calldata: the struct nfs_renamedata to be released + */ +static void nfs_async_rename_release(void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct super_block *sb = data->old_dir->i_sb; + + if (data->old_dentry->d_inode) + nfs_mark_for_revalidate(data->old_dentry->d_inode); + + dput(data->old_dentry); + dput(data->new_dentry); + iput(data->old_dir); + iput(data->new_dir); + nfs_sb_deactive(sb); + put_rpccred(data->cred); + kfree(data); +} + +#if defined(CONFIG_NFS_V4_1) +static void nfs_rename_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->old_dir); + + if (nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_rename_ops = { + .rpc_call_done = nfs_async_rename_done, + .rpc_release = nfs_async_rename_release, +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_rename_prepare, +#endif /* CONFIG_NFS_V4_1 */ +}; + +/** + * nfs_async_rename - perform an asynchronous rename operation + * @old_dir: directory that currently holds the dentry to be renamed + * @new_dir: target directory for the rename + * @old_dentry: original dentry to be renamed + * @new_dentry: dentry to which the old_dentry should be renamed + * + * It's expected that valid references to the dentries and inodes are held + */ +static struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct nfs_renamedata *data; + struct rpc_message msg = { }; + struct rpc_task_setup task_setup_data = { + .rpc_message = &msg, + .callback_ops = &nfs_rename_ops, + .workqueue = nfsiod_workqueue, + .rpc_client = NFS_CLIENT(old_dir), + .flags = RPC_TASK_ASYNC, + }; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return ERR_PTR(-ENOMEM); + task_setup_data.callback_data = data; + + data->cred = rpc_lookup_cred(); + if (IS_ERR(data->cred)) { + struct rpc_task *task = ERR_CAST(data->cred); + kfree(data); + return task; + } + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + msg.rpc_cred = data->cred; + + /* set up nfs_renamedata */ + data->old_dir = old_dir; + ihold(old_dir); + data->new_dir = new_dir; + ihold(new_dir); + data->old_dentry = dget(old_dentry); + data->new_dentry = dget(new_dentry); + nfs_fattr_init(&data->old_fattr); + nfs_fattr_init(&data->new_fattr); + + /* set up nfs_renameargs */ + data->args.old_dir = NFS_FH(old_dir); + data->args.old_name = &old_dentry->d_name; + data->args.new_dir = NFS_FH(new_dir); + data->args.new_name = &new_dentry->d_name; + + /* set up nfs_renameres */ + data->res.old_fattr = &data->old_fattr; + data->res.new_fattr = &data->new_fattr; + + nfs_sb_active(old_dir->i_sb); + + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + + return rpc_run_task(&task_setup_data); +} + +/** + * nfs_sillyrename - Perform a silly-rename of a dentry + * @dir: inode of directory that contains dentry + * @dentry: dentry to be sillyrenamed + * + * NFSv2/3 is stateless and the server doesn't know when the client is + * holding a file open. To prevent application problems when a file is + * unlinked while it's still open, the client performs a "silly-rename". + * That is, it renames the file to a hidden file in the same directory, + * and only performs the unlink once the last reference to it is put. + * + * The final cleanup is done during dentry_iput. 
+ */ +int +nfs_sillyrename(struct inode *dir, struct dentry *dentry) +{ + static unsigned int sillycounter; + const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; + const int countersize = sizeof(sillycounter)*2; + const int slen = sizeof(".nfs")+fileidsize+countersize-1; + char silly[slen+1]; + struct dentry *sdentry; + struct rpc_task *task; + int error = -EIO; + + dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + dentry->d_count); + nfs_inc_stats(dir, NFSIOS_SILLYRENAME); + + /* + * We don't allow a dentry to be silly-renamed twice. + */ + error = -EBUSY; + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto out; + + sprintf(silly, ".nfs%*.*Lx", + fileidsize, fileidsize, + (unsigned long long)NFS_FILEID(dentry->d_inode)); + + /* Return delegation in anticipation of the rename */ + nfs_inode_return_delegation(dentry->d_inode); + + sdentry = NULL; + do { + char *suffix = silly + slen - countersize; + + dput(sdentry); + sillycounter++; + sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); + + dfprintk(VFS, "NFS: trying to rename %s to %s\n", + dentry->d_name.name, silly); + + sdentry = lookup_one_len(silly, dentry->d_parent, slen); + /* + * N.B. Better to return EBUSY here ... it could be + * dangerous to delete the file while it's in use. + */ + if (IS_ERR(sdentry)) + goto out; + } while (sdentry->d_inode != NULL); /* need negative lookup */ + + /* queue unlink first. Can't do this from rpc_release as it + * has to allocate memory + */ + error = nfs_async_unlink(dir, dentry); + if (error) + goto out_dput; + + /* run the rename task, undo unlink if it fails */ + task = nfs_async_rename(dir, dir, dentry, sdentry); + if (IS_ERR(task)) { + error = -EBUSY; + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + + /* wait for the RPC task to complete, unless a SIGKILL intervenes */ + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + rpc_put_task(task); +out_dput: + dput(sdentry); +out: + return error; +} diff --git a/fs/nfs/write.c b/fs/nfs/write.c new file mode 100644 index 00000000..f2f80c00 --- /dev/null +++ b/fs/nfs/write.c @@ -0,0 +1,1732 @@ +/* + * linux/fs/nfs/write.c + * + * Write file data over NFS. 
+ * + * Copyright (C) 1996, 1997, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "nfs4_fs.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + +#define MIN_POOL_WRITE (32) +#define MIN_POOL_COMMIT (4) + +/* + * Local function declarations + */ +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, + struct inode *inode, int ioflags); +static void nfs_redirty_request(struct nfs_page *req); +static const struct rpc_call_ops nfs_write_partial_ops; +static const struct rpc_call_ops nfs_write_full_ops; +static const struct rpc_call_ops nfs_commit_ops; + +static struct kmem_cache *nfs_wdata_cachep; +static mempool_t *nfs_wdata_mempool; +static mempool_t *nfs_commit_mempool; + +struct nfs_write_data *nfs_commitdata_alloc(void) +{ + struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); + + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + } + return p; +} +EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); + +void nfs_commit_free(struct nfs_write_data *p) +{ + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); +} +EXPORT_SYMBOL_GPL(nfs_commit_free); + +struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) +{ + struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); + + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + p->npages = pagecount; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); + if (!p->pagevec) { + mempool_free(p, nfs_wdata_mempool); + p = NULL; + } + } + } + return p; +} + +void nfs_writedata_free(struct nfs_write_data *p) +{ + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); + mempool_free(p, nfs_wdata_mempool); +} + +static void nfs_writedata_release(struct nfs_write_data *wdata) +{ + put_lseg(wdata->lseg); + put_nfs_open_context(wdata->args.context); + nfs_writedata_free(wdata); +} + +static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) +{ + ctx->error = error; + smp_wmb(); + set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); +} + +static struct nfs_page *nfs_page_find_request_locked(struct page *page) +{ + struct nfs_page *req = NULL; + + if (PagePrivate(page)) { + req = (struct nfs_page *)page_private(page); + if (req != NULL) + kref_get(&req->wb_kref); + } + return req; +} + +static struct nfs_page *nfs_page_find_request(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct nfs_page *req = NULL; + + spin_lock(&inode->i_lock); + req = nfs_page_find_request_locked(page); + spin_unlock(&inode->i_lock); + return req; +} + +/* Adjust the file length if we're writing beyond the end */ +static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) +{ + struct inode *inode = page->mapping->host; + loff_t end, i_size; + pgoff_t end_index; + + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (i_size > 0 && page->index < end_index) + goto out; + end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); + if (i_size >= end) + goto out; + i_size_write(inode, end); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); +out: + spin_unlock(&inode->i_lock); +} + +/* A 
writeback failed: mark the page as bad, and invalidate the page cache */ +static void nfs_set_pageerror(struct page *page) +{ + SetPageError(page); + nfs_zap_mapping(page->mapping->host, page->mapping); +} + +/* We can set the PG_uptodate flag if we see that a write request + * covers the full page. + */ +static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) +{ + if (PageUptodate(page)) + return; + if (base != 0) + return; + if (count != nfs_page_length(page)) + return; + SetPageUptodate(page); +} + +static int wb_priority(struct writeback_control *wbc) +{ + if (wbc->for_reclaim) + return FLUSH_HIGHPRI | FLUSH_STABLE; + if (wbc->for_kupdate || wbc->for_background) + return FLUSH_LOWPRI | FLUSH_COND_STABLE; + return FLUSH_COND_STABLE; +} + +/* + * NFS congestion control + */ + +int nfs_congestion_kb; + +#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) +#define NFS_CONGESTION_OFF_THRESH \ + (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) + +static int nfs_set_page_writeback(struct page *page) +{ + int ret = test_set_page_writeback(page); + + if (!ret) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); + + page_cache_get(page); + if (atomic_long_inc_return(&nfss->writeback) > + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } + } + return ret; +} + +static void nfs_end_page_writeback(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); + + end_page_writeback(page); + page_cache_release(page); + if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); +} + +static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) +{ + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int ret; + + spin_lock(&inode->i_lock); + for (;;) { + req = nfs_page_find_request_locked(page); + if (req == NULL) + break; + if (nfs_set_page_tag_locked(req)) + break; + /* Note: If we hold the page lock, as is the case in nfs_writepage, + * then the call to nfs_set_page_tag_locked() will always + * succeed provided that someone hasn't already marked the + * request as dirty (in which case we don't care). + */ + spin_unlock(&inode->i_lock); + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; + nfs_release_request(req); + if (ret != 0) + return ERR_PTR(ret); + spin_lock(&inode->i_lock); + } + spin_unlock(&inode->i_lock); + return req; +} + +/* + * Find an associated nfs write request, and prepare to flush it out + * May return an error if the user signalled nfs_wait_on_request(). 
+ */ +static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, + struct page *page, bool nonblock) +{ + struct nfs_page *req; + int ret = 0; + + req = nfs_find_and_lock_request(page, nonblock); + if (!req) + goto out; + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = nfs_set_page_writeback(page); + BUG_ON(ret != 0); + BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); + + if (!nfs_pageio_add_request(pgio, req)) { + nfs_redirty_request(req); + ret = pgio->pg_error; + } +out: + return ret; +} + +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) +{ + struct inode *inode = page->mapping->host; + int ret; + + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); + + nfs_pageio_cond_complete(pgio, page->index); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + ret = 0; + } + return ret; +} + +/* + * Write an mmapped page to the server. + */ +static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) +{ + struct nfs_pageio_descriptor pgio; + int err; + + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); + err = nfs_do_writepage(page, wbc, &pgio); + nfs_pageio_complete(&pgio); + if (err < 0) + return err; + if (pgio.pg_error < 0) + return pgio.pg_error; + return 0; +} + +int nfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + + ret = nfs_writepage_locked(page, wbc); + unlock_page(page); + return ret; +} + +static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +{ + int ret; + + ret = nfs_do_writepage(page, wbc, data); + unlock_page(page); + return ret; +} + +int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + unsigned long *bitlock = &NFS_I(inode)->flags; + struct nfs_pageio_descriptor pgio; + int err; + + /* Stop dirtying of new pages while we sync */ + err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (err) + goto out_err; + + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); + + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); + err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); + nfs_pageio_complete(&pgio); + + clear_bit_unlock(NFS_INO_FLUSHING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_FLUSHING); + + if (err < 0) + goto out_err; + err = pgio.pg_error; + if (err < 0) + goto out_err; + return 0; +out_err: + return err; +} + +/* + * Insert a write request into an inode + */ +static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int error; + + error = radix_tree_preload(GFP_NOFS); + if (error != 0) + goto out; + + /* Lock the request! 
*/ + nfs_lock_request_dontget(req); + + spin_lock(&inode->i_lock); + error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); + BUG_ON(error); + if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) + nfsi->change_attr++; + set_bit(PG_MAPPED, &req->wb_flags); + SetPagePrivate(req->wb_page); + set_page_private(req->wb_page, (unsigned long)req); + nfsi->npages++; + kref_get(&req->wb_kref); + radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, + NFS_PAGE_TAG_LOCKED); + spin_unlock(&inode->i_lock); + radix_tree_preload_end(); +out: + return error; +} + +/* + * Remove a write request from an inode + */ +static void nfs_inode_remove_request(struct nfs_page *req) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + BUG_ON (!NFS_WBACK_BUSY(req)); + + spin_lock(&inode->i_lock); + set_page_private(req->wb_page, 0); + ClearPagePrivate(req->wb_page); + clear_bit(PG_MAPPED, &req->wb_flags); + radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); + nfsi->npages--; + spin_unlock(&inode->i_lock); + nfs_release_request(req); +} + +static void +nfs_mark_request_dirty(struct nfs_page *req) +{ + __set_page_dirty_nobuffers(req->wb_page); +} + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +/* + * Add a request to the inode's commit list. + */ +static void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + set_bit(PG_CLEAN, &(req)->wb_flags); + radix_tree_tag_set(&nfsi->nfs_page_tree, + req->wb_index, + NFS_PAGE_TAG_COMMIT); + nfsi->ncommit++; + spin_unlock(&inode->i_lock); + pnfs_mark_request_commit(req, lseg); + inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); +} + +static int +nfs_clear_request_commit(struct nfs_page *req) +{ + struct page *page = req->wb_page; + + if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { + dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + return 1; + } + return 0; +} + +static inline +int nfs_write_need_commit(struct nfs_write_data *data) +{ + if (data->verf.committed == NFS_DATA_SYNC) + return data->lseg == NULL; + else + return data->verf.committed != NFS_FILE_SYNC; +} + +static inline +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) +{ + if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { + nfs_mark_request_commit(req, data->lseg); + return 1; + } + if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { + nfs_mark_request_dirty(req); + return 1; + } + return 0; +} +#else +static inline void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ +} + +static inline int +nfs_clear_request_commit(struct nfs_page *req) +{ + return 0; +} + +static inline +int nfs_write_need_commit(struct nfs_write_data *data) +{ + return 0; +} + +static inline +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) +{ + return 0; +} +#endif + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static int +nfs_need_commit(struct nfs_inode *nfsi) +{ + return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); +} + +/* + * nfs_scan_commit - Scan an inode for commit requests + * @inode: NFS inode to scan + * @dst: destination list + * 
@idx_start: lower bound of page->index to scan. + * @npages: idx_start + npages sets the upper bound to scan. + * + * Moves requests from the inode's 'commit' request list. + * The requests are *not* checked to ensure that they form a contiguous set. + */ +static int +nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int ret; + + if (!nfs_need_commit(nfsi)) + return 0; + + spin_lock(&inode->i_lock); + ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); + if (ret > 0) + nfsi->ncommit -= ret; + spin_unlock(&inode->i_lock); + + if (nfs_need_commit(NFS_I(inode))) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + + return ret; +} +#else +static inline int nfs_need_commit(struct nfs_inode *nfsi) +{ + return 0; +} + +static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +{ + return 0; +} +#endif + +/* + * Search for an existing write request, and attempt to update + * it to reflect a new dirty region on a given page. + * + * If the attempt fails, then the existing request is flushed out + * to disk. + */ +static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, + unsigned int bytes) +{ + struct nfs_page *req; + unsigned int rqend; + unsigned int end; + int error; + + if (!PagePrivate(page)) + return NULL; + + end = offset + bytes; + spin_lock(&inode->i_lock); + + for (;;) { + req = nfs_page_find_request_locked(page); + if (req == NULL) + goto out_unlock; + + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ + if (offset > rqend + || end < req->wb_offset) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) + break; + + /* The request is locked, so wait and then retry */ + spin_unlock(&inode->i_lock); + error = nfs_wait_on_request(req); + nfs_release_request(req); + if (error != 0) + goto out_err; + spin_lock(&inode->i_lock); + } + + if (nfs_clear_request_commit(req) && + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { + NFS_I(inode)->ncommit--; + pnfs_clear_request_commit(req); + } + + /* Okay, the request matches. Update the region */ + if (offset < req->wb_offset) { + req->wb_offset = offset; + req->wb_pgbase = offset; + } + if (end > rqend) + req->wb_bytes = end - req->wb_offset; + else + req->wb_bytes = rqend - req->wb_offset; +out_unlock: + spin_unlock(&inode->i_lock); + return req; +out_flushme: + spin_unlock(&inode->i_lock); + nfs_release_request(req); + error = nfs_wb_page(inode, page); +out_err: + return ERR_PTR(error); +} + +/* + * Try to update an existing write request, or create one if there is none. + * + * Note: Should always be called with the Page Lock held to prevent races + * if we have to add a new request. Also assumes that the caller has + * already called nfs_flush_incompatible() if necessary. 
+ */ +static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, + struct page *page, unsigned int offset, unsigned int bytes) +{ + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + + req = nfs_try_to_update_request(inode, page, offset, bytes); + if (req != NULL) + goto out; + req = nfs_create_request(ctx, inode, page, offset, bytes); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); + if (error != 0) { + nfs_release_request(req); + req = ERR_PTR(error); + } +out: + return req; +} + +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + + req = nfs_setup_write_request(ctx, page, offset, count); + if (IS_ERR(req)) + return PTR_ERR(req); + /* Update file length */ + nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; +} + +int nfs_flush_incompatible(struct file *file, struct page *page) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_page *req; + int do_flush, status; + /* + * Look for a request corresponding to this page. If there + * is one, and it belongs to another file, we flush it out + * before we try to copy anything into the page. Do this + * due to the lack of an ACCESS-type call in NFSv2. + * Also do the same if we find a request from an existing + * dropped page. + */ + do { + req = nfs_page_find_request(page); + if (req == NULL) + return 0; + do_flush = req->wb_page != page || req->wb_context != ctx || + req->wb_lock_context->lockowner != current->files || + req->wb_lock_context->pid != current->tgid; + nfs_release_request(req); + if (!do_flush) + return 0; + status = nfs_wb_page(page->mapping->host, page); + } while (status == 0); + return status; +} + +/* + * If the page cache is marked as unsafe or invalid, then we can't rely on + * the PageUptodate() flag. In this case, we will need to turn off + * write optimisations that depend on the page contents being correct. + */ +static int nfs_write_pageuptodate(struct page *page, struct inode *inode) +{ + return PageUptodate(page) && + !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA)); +} + +/* + * Update and possibly write a cached page of an NFS file. + * + * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ +int nfs_updatepage(struct file *file, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; + int status = 0; + + nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); + + dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, count, + (long long)(page_offset(page) + offset)); + + /* If we're not using byte range locks, and we know the page + * is up to date, it may be more efficient to extend the write + * to cover the entire page in order to avoid fragmentation + * inefficiencies. 
+ */ + if (nfs_write_pageuptodate(page, inode) && + inode->i_flock == NULL && + !(file->f_flags & O_DSYNC)) { + count = max(count + offset, nfs_page_length(page)); + offset = 0; + } + + status = nfs_writepage_setup(ctx, page, offset, count); + if (status < 0) + nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); + + dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", + status, (long long)i_size_read(inode)); + return status; +} + +static void nfs_writepage_release(struct nfs_page *req, + struct nfs_write_data *data) +{ + struct page *page = req->wb_page; + + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) + nfs_inode_remove_request(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +} + +static int flush_task_priority(int how) +{ + switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { + case FLUSH_HIGHPRI: + return RPC_PRIORITY_HIGH; + case FLUSH_LOWPRI: + return RPC_PRIORITY_LOW; + } + return RPC_PRIORITY_NORMAL; +} + +int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) +{ + struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + int ret = 0; + + /* Set up the initial task struct. */ + NFS_PROTO(inode)->write_setup(data, &msg); + + dprintk("NFS: %5u initiated write call " + "(req %s/%lld, %u bytes @ offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_write); + +/* + * Set up the argument/result storage required for the RPC call. + */ +static int nfs_write_rpcsetup(struct nfs_page *req, + struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg, + int how) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. 
*/ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; + data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; + /* pnfs_set_layoutcommit needs this */ + data->mds_offset = data->args.offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + data->args.stable = NFS_DATA_SYNC; + if (!nfs_need_commit(NFS_I(inode))) + data->args.stable = NFS_FILE_SYNC; + } + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + + if (data->lseg && + (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) + return 0; + + return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); +} + +/* If a nfs_flush_* function fails, it should remove reqs from @head and + * call this on each, which will prepare them to be retried on next + * writeback using standard nfs. + */ +static void nfs_redirty_request(struct nfs_page *req) +{ + struct page *page = req->wb_page; + + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +} + +/* + * Generate multiple small requests to write out a single + * contiguous dirty area on one page. + */ +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) +{ + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); + struct page *page = req->wb_page; + struct nfs_write_data *data; + size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; + unsigned int offset; + int requests = 0; + int ret = 0; + struct pnfs_layout_segment *lseg; + LIST_HEAD(list); + + nfs_list_remove_request(req); + + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || + desc->pg_count > wsize)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + + nbytes = desc->pg_count; + do { + size_t len = min(nbytes, wsize); + + data = nfs_writedata_alloc(1); + if (!data) + goto out_bad; + list_add(&data->pages, &list); + requests++; + nbytes -= len; + } while (nbytes != 0); + atomic_set(&req->wb_complete, requests); + + BUG_ON(desc->pg_lseg); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_RW, GFP_NOFS); + ClearPageError(page); + offset = 0; + nbytes = desc->pg_count; + do { + int ret2; + + data = list_entry(list.next, struct nfs_write_data, pages); + list_del_init(&data->pages); + + data->pagevec[0] = page; + + if (nbytes < wsize) + wsize = nbytes; + ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, + wsize, offset, lseg, desc->pg_ioflags); + if (ret == 0) + ret = ret2; + offset += wsize; + nbytes -= wsize; + } while (nbytes != 0); + + put_lseg(lseg); + desc->pg_lseg = NULL; + return ret; + +out_bad: + while (!list_empty(&list)) { + data = list_entry(list.next, struct nfs_write_data, pages); + list_del(&data->pages); + nfs_writedata_free(data); + } + nfs_redirty_request(req); + return -ENOMEM; +} + +/* + * Create an RPC task for the given write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. 
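+ *
+ * Unlike nfs_flush_multi(), which splits a single page into several
+ * wsize-sized WRITE requests, this path gathers every request on
+ * desc->pg_list into one RPC covering desc->pg_count bytes.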
+ */ +static int nfs_flush_one(struct nfs_pageio_descriptor *desc) +{ + struct nfs_page *req; + struct page **pages; + struct nfs_write_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + int ret; + + data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) { + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_redirty_request(req); + } + ret = -ENOMEM; + goto out; + } + pages = data->pagevec; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &data->pages); + ClearPageError(req->wb_page); + *pages++ = req->wb_page; + } + req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_RW, GFP_NOFS); + + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + /* Set up the argument struct */ + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); +out: + put_lseg(lseg); /* Cleans any gotten in ->pg_test */ + desc->pg_lseg = NULL; + return ret; +} + +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags) +{ + size_t wsize = NFS_SERVER(inode)->wsize; + + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else + nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); +} + +/* + * Handle a write reply that flushed part of a page. + */ +static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + dprintk("NFS: %5u write(%s/%lld %d@%lld)", + task->tk_pid, + data->req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long) + NFS_FILEID(data->req->wb_context->path.dentry->d_inode), + data->req->wb_bytes, (long long)req_offset(data->req)); + + nfs_writeback_done(task, data); +} + +static void nfs_writeback_release_partial(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_page *req = data->req; + struct page *page = req->wb_page; + int status = data->task.tk_status; + + if (status < 0) { + nfs_set_pageerror(page); + nfs_context_set_write_error(req->wb_context, status); + dprintk(", error = %d\n", status); + goto out; + } + + if (nfs_write_need_commit(data)) { + struct inode *inode = page->mapping->host; + + spin_lock(&inode->i_lock); + if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { + /* Do nothing we need to resend the writes */ + } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { + memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); + dprintk(" defer commit\n"); + } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) { + set_bit(PG_NEED_RESCHED, &req->wb_flags); + clear_bit(PG_NEED_COMMIT, &req->wb_flags); + dprintk(" server reboot detected\n"); + } + spin_unlock(&inode->i_lock); + } else + dprintk(" OK\n"); + +out: + if (atomic_dec_and_test(&req->wb_complete)) + nfs_writepage_release(req, data); + nfs_writedata_release(calldata); +} + +#if defined(CONFIG_NFS_V4_1) +void nfs_write_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + if (nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + 
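+ /* The sequence slot is set up (or not needed); let the task proceed. */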
rpc_call_start(task); +} +#endif /* CONFIG_NFS_V4_1 */ + +static const struct rpc_call_ops nfs_write_partial_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_writeback_done_partial, + .rpc_release = nfs_writeback_release_partial, +}; + +/* + * Handle a write reply that flushes a whole page. + * + * FIXME: There is an inherent race with invalidate_inode_pages and + * writebacks since the page->count is kept > 1 for as long + * as the page has a write request pending. + */ +static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_writeback_done(task, data); +} + +static void nfs_writeback_release_full(void *calldata) +{ + struct nfs_write_data *data = calldata; + int status = data->task.tk_status; + + /* Update attributes as result of writeback. */ + while (!list_empty(&data->pages)) { + struct nfs_page *req = nfs_list_entry(data->pages.next); + struct page *page = req->wb_page; + + nfs_list_remove_request(req); + + dprintk("NFS: %5u write (%s/%lld %d@%lld)", + data->task.tk_pid, + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + + if (status < 0) { + nfs_set_pageerror(page); + nfs_context_set_write_error(req->wb_context, status); + dprintk(", error = %d\n", status); + goto remove_request; + } + + if (nfs_write_need_commit(data)) { + memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); + nfs_mark_request_commit(req, data->lseg); + dprintk(" marked for commit\n"); + goto next; + } + dprintk(" OK\n"); +remove_request: + nfs_inode_remove_request(req); + next: + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); + } + nfs_writedata_release(calldata); +} + +static const struct rpc_call_ops nfs_write_full_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_writeback_done_full, + .rpc_release = nfs_writeback_release_full, +}; + + +/* + * This function is called when the WRITE call is complete. + */ +void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +{ + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); + int status; + + dprintk("NFS: %5u nfs_writeback_done (status %d)\n", + task->tk_pid, task->tk_status); + + /* + * ->write_done will attempt to use post-op attributes to detect + * conflicting writes by other clients. A strict interpretation + * of close-to-open would allow us to continue caching even if + * another writer had changed the file, but some applications + * depend on tighter cache coherency when writing. + */ + status = NFS_PROTO(data->inode)->write_done(task, data); + if (status != 0) + return; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { + /* We tried a write call, but the server did not + * commit data to stable storage even though we + * requested it. + * Note: There is a known bug in Tru64 < 5.0 in which + * the server reports NFS_DATA_SYNC, but performs + * NFS_FILE_SYNC. We therefore implement this checking + * as a dprintk() in order to avoid filling syslog. 
+ */ + static unsigned long complain; + + /* Note this will print the MDS for a DS write */ + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", + server->nfs_client->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } + } +#endif + /* Is this a short write? */ + if (task->tk_status >= 0 && resp->count < argp->count) { + static unsigned long complain; + + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ + if (resp->count != 0) { + /* Was this an NFSv2 write or an NFSv3 stable write? */ + if (resp->verf->committed != NFS_UNSTABLE) { + /* Resend from where the server left off */ + data->mds_offset += resp->count; + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + } else { + /* Resend as a stable write in order to avoid + * headaches in the case of a server crash. + */ + argp->stable = NFS_FILE_SYNC; + } + nfs_restart_rpc(task, server->nfs_client); + return; + } + if (time_before(complain, jiffies)) { + printk(KERN_WARNING + "NFS: Server wrote zero bytes, expected %u.\n", + argp->count); + complain = jiffies + 300 * HZ; + } + /* Can't do anything about it except throw an error. */ + task->tk_status = -EIO; + } + return; +} + + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +{ + int ret; + + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) + return 1; + if (!may_wait) + return 0; + ret = out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + return (ret < 0) ? ret : 1; +} + +void nfs_commit_clear_lock(struct nfs_inode *nfsi) +{ + clear_bit(NFS_INO_COMMIT, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); +} +EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); + +void nfs_commitdata_release(void *data) +{ + struct nfs_write_data *wdata = data; + + put_lseg(wdata->lseg); + put_nfs_open_context(wdata->args.context); + nfs_commit_free(wdata); +} +EXPORT_SYMBOL_GPL(nfs_commitdata_release); + +int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) +{ + struct rpc_task *task; + int priority = flush_task_priority(how); + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + /* Set up the initial task struct. */ + NFS_PROTO(data->inode)->commit_setup(data, &msg); + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_commit); + +/* + * Set up the argument/result storage required for the RPC call. + */ +void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg) +{ + struct nfs_page *first = nfs_list_entry(head->next); + struct inode *inode = first->wb_context->path.dentry->d_inode; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. 
*/ + + list_splice_init(head, &data->pages); + + data->inode = inode; + data->cred = first->wb_context->cred; + data->lseg = lseg; /* reference transferred */ + data->mds_ops = &nfs_commit_ops; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ + data->args.offset = 0; + data->args.count = 0; + data->args.context = get_nfs_open_context(first->wb_context); + data->res.count = 0; + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); +} +EXPORT_SYMBOL_GPL(nfs_init_commit); + +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg) +{ + struct nfs_page *req; + + while (!list_empty(page_list)) { + req = nfs_list_entry(page_list->next); + nfs_list_remove_request(req); + nfs_mark_request_commit(req, lseg); + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); + nfs_clear_page_tag_locked(req); + } +} +EXPORT_SYMBOL_GPL(nfs_retry_commit); + +/* + * Commit dirty pages + */ +static int +nfs_commit_list(struct inode *inode, struct list_head *head, int how) +{ + struct nfs_write_data *data; + + data = nfs_commitdata_alloc(); + + if (!data) + goto out_bad; + + /* Set up the argument struct */ + nfs_init_commit(data, head, NULL); + return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); + out_bad: + nfs_retry_commit(head, NULL); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; +} + +/* + * COMMIT call returned + */ +static void nfs_commit_done(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + dprintk("NFS: %5u nfs_commit_done (status %d)\n", + task->tk_pid, task->tk_status); + + /* Call the NFS version-specific code */ + NFS_PROTO(data->inode)->commit_done(task, data); +} + +void nfs_commit_release_pages(struct nfs_write_data *data) +{ + struct nfs_page *req; + int status = data->task.tk_status; + + while (!list_empty(&data->pages)) { + req = nfs_list_entry(data->pages.next); + nfs_list_remove_request(req); + nfs_clear_request_commit(req); + + dprintk("NFS: commit (%s/%lld %d@%lld)", + req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); + goto next; + } + + /* Okay, COMMIT succeeded, apparently. Check the verifier + * returned by the server against all stored verfs. */ + if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { + /* We have a match */ + nfs_inode_remove_request(req); + dprintk(" OK\n"); + goto next; + } + /* We have a mismatch. 
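+		 * The verifier returned by the COMMIT differs from the one saved
+		 * when the page was written, so the server has probably rebooted
+		 * and the unstable data may have been lost.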
Write the page again */ + dprintk(" mismatch\n"); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +} +EXPORT_SYMBOL_GPL(nfs_commit_release_pages); + +static void nfs_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_commit_release_pages(data); + nfs_commit_clear_lock(NFS_I(data->inode)); + nfs_commitdata_release(calldata); +} + +static const struct rpc_call_ops nfs_commit_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_commit_done, + .rpc_release = nfs_commit_release, +}; + +int nfs_commit_inode(struct inode *inode, int how) +{ + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res; + + res = nfs_commit_set_lock(NFS_I(inode), may_wait); + if (res <= 0) + goto out_mark_dirty; + res = nfs_scan_commit(inode, &head, 0, 0); + if (res) { + int error; + + error = pnfs_commit_list(inode, &head, how); + if (error == PNFS_NOT_ATTEMPTED) + error = nfs_commit_list(inode, &head, how); + if (error < 0) + return error; + if (!may_wait) + goto out_mark_dirty; + error = wait_on_bit(&NFS_I(inode)->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + if (error < 0) + return error; + } else + nfs_commit_clear_lock(NFS_I(inode)); + return res; + /* Note: If we exit without ensuring that the commit is complete, + * we must mark the inode as dirty. Otherwise, future calls to + * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure + * that the data is on the disk. + */ +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return res; +} + +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int flags = FLUSH_SYNC; + int ret = 0; + + /* no commits means nothing needs to be done */ + if (!nfsi->ncommit) + return ret; + + if (wbc->sync_mode == WB_SYNC_NONE) { + /* Don't commit yet if this is a non-blocking flush and there + * are a lot of outstanding writes for this mapping. + */ + if (nfsi->ncommit <= (nfsi->npages >> 1)) + goto out_mark_dirty; + + /* don't wait for the COMMIT response */ + flags = 0; + } + + ret = nfs_commit_inode(inode, flags); + if (ret >= 0) { + if (wbc->sync_mode == WB_SYNC_NONE) { + if (ret < wbc->nr_to_write) + wbc->nr_to_write -= ret; + else + wbc->nr_to_write = 0; + } + return 0; + } +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; +} +#else +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +{ + return 0; +} +#endif + +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + + ret = nfs_commit_unstable_pages(inode, wbc); + if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { + int status; + bool sync = true; + + if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || + wbc->for_background) + sync = false; + + status = pnfs_layoutcommit_inode(inode, sync); + if (status < 0) + return status; + } + return ret; +} + +/* + * flush the inode to disk. 
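+ *
+ * Write back every dirty page of the inode and wait for the writes (and
+ * any required commits) to complete: WB_SYNC_ALL over the whole range.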
+ */ +int nfs_wb_all(struct inode *inode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + return sync_inode(inode, &wbc); +} + +int nfs_wb_page_cancel(struct inode *inode, struct page *page) +{ + struct nfs_page *req; + int ret = 0; + + BUG_ON(!PageLocked(page)); + for (;;) { + wait_on_page_writeback(page); + req = nfs_page_find_request(page); + if (req == NULL) + break; + if (nfs_lock_request_dontget(req)) { + nfs_inode_remove_request(req); + /* + * In case nfs_inode_remove_request has marked the + * page as being dirty + */ + cancel_dirty_page(page, PAGE_CACHE_SIZE); + nfs_unlock_request(req); + break; + } + ret = nfs_wait_on_request(req); + nfs_release_request(req); + if (ret < 0) + break; + } + return ret; +} + +/* + * Write back all requests on one page - we do this before reading it. + */ +int nfs_wb_page(struct inode *inode, struct page *page) +{ + loff_t range_start = page_offset(page); + loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, + .range_end = range_end, + }; + int ret; + + for (;;) { + wait_on_page_writeback(page); + if (clear_page_dirty_for_io(page)) { + ret = nfs_writepage_locked(page, &wbc); + if (ret < 0) + goto out_error; + continue; + } + if (!PagePrivate(page)) + break; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out_error; + } + return 0; +out_error: + return ret; +} + +#ifdef CONFIG_MIGRATION +int nfs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page) +{ + /* + * If PagePrivate is set, then the page is currently associated with + * an in-progress read or write request. Don't try to migrate it. + * + * FIXME: we could do this in principle, but we'll need a way to ensure + * that we can safely release the inode reference while holding + * the page lock. + */ + if (PagePrivate(page)) + return -EBUSY; + + nfs_fscache_release_page(page, GFP_KERNEL); + + return migrate_page(mapping, newpage, page); +} +#endif + +int __init nfs_init_writepagecache(void) +{ + nfs_wdata_cachep = kmem_cache_create("nfs_write_data", + sizeof(struct nfs_write_data), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_wdata_cachep == NULL) + return -ENOMEM; + + nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, + nfs_wdata_cachep); + if (nfs_wdata_mempool == NULL) + return -ENOMEM; + + nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, + nfs_wdata_cachep); + if (nfs_commit_mempool == NULL) + return -ENOMEM; + + /* + * NFS congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (nfs_congestion_kb > 256*1024) + nfs_congestion_kb = 256*1024; + + return 0; +} + +void nfs_destroy_writepagecache(void) +{ + mempool_destroy(nfs_commit_mempool); + mempool_destroy(nfs_wdata_mempool); + kmem_cache_destroy(nfs_wdata_cachep); +} + diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile new file mode 100644 index 00000000..f689ed82 --- /dev/null +++ b/fs/nfs_common/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for Linux filesystem routines that are shared by client and server. 
+# + +obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o + +nfs_acl-objs := nfsacl.o diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c new file mode 100644 index 00000000..6940439b --- /dev/null +++ b/fs/nfs_common/nfsacl.c @@ -0,0 +1,286 @@ +/* + * fs/nfs_common/nfsacl.c + * + * Copyright (C) 2002-2003 Andreas Gruenbacher + */ + +/* + * The Solaris nfsacl protocol represents some ACLs slightly differently + * than POSIX 1003.1e draft 17 does (and we do): + * + * - Minimal ACLs always have an ACL_MASK entry, so they have + * four instead of three entries. + * - The ACL_MASK entry in such minimal ACLs always has the same + * permissions as the ACL_GROUP_OBJ entry. (In extended ACLs + * the ACL_MASK and ACL_GROUP_OBJ entries may differ.) + * - The identifier fields of the ACL_USER_OBJ and ACL_GROUP_OBJ + * entries contain the identifiers of the owner and owning group. + * (In POSIX ACLs we always set them to ACL_UNDEFINED_ID). + * - ACL entries in the kernel are kept sorted in ascending order + * of (e_tag, e_id). Solaris ACLs are unsorted. + */ + +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL_GPL(nfsacl_encode); +EXPORT_SYMBOL_GPL(nfsacl_decode); + +struct nfsacl_encode_desc { + struct xdr_array2_desc desc; + unsigned int count; + struct posix_acl *acl; + int typeflag; + uid_t uid; + gid_t gid; +}; + +struct nfsacl_simple_acl { + struct posix_acl acl; + struct posix_acl_entry ace[4]; +}; + +static int +xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) +{ + struct nfsacl_encode_desc *nfsacl_desc = + (struct nfsacl_encode_desc *) desc; + __be32 *p = elem; + + struct posix_acl_entry *entry = + &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; + + *p++ = htonl(entry->e_tag | nfsacl_desc->typeflag); + switch(entry->e_tag) { + case ACL_USER_OBJ: + *p++ = htonl(nfsacl_desc->uid); + break; + case ACL_GROUP_OBJ: + *p++ = htonl(nfsacl_desc->gid); + break; + case ACL_USER: + case ACL_GROUP: + *p++ = htonl(entry->e_id); + break; + default: /* Solaris depends on that! */ + *p++ = 0; + break; + } + *p++ = htonl(entry->e_perm & S_IRWXO); + return 0; +} + +/** + * nfsacl_encode - Encode an NFSv3 ACL + * + * @buf: destination xdr_buf to contain XDR encoded ACL + * @base: byte offset in xdr_buf where XDR'd ACL begins + * @inode: inode of file whose ACL this is + * @acl: posix_acl to encode + * @encode_entries: whether to encode ACEs as well + * @typeflag: ACL type: NFS_ACL_DEFAULT or zero + * + * Returns size of encoded ACL in bytes or a negative errno value. + */ +int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, + struct posix_acl *acl, int encode_entries, int typeflag) +{ + int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; + struct nfsacl_encode_desc nfsacl_desc = { + .desc = { + .elem_size = 12, + .array_len = encode_entries ? entries : 0, + .xcode = xdr_nfsace_encode, + }, + .acl = acl, + .typeflag = typeflag, + .uid = inode->i_uid, + .gid = inode->i_gid, + }; + struct nfsacl_simple_acl aclbuf; + int err; + + if (entries > NFS_ACL_MAX_ENTRIES || + xdr_encode_word(buf, base, entries)) + return -EINVAL; + if (encode_entries && acl && acl->a_count == 3) { + struct posix_acl *acl2 = &aclbuf.acl; + + /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is + * invoked in contexts where a memory allocation failure is + * fatal. Fortunately this fake ACL is small enough to + * construct on the stack. 
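+	 *
+	 * The three-entry POSIX ACL (USER_OBJ, GROUP_OBJ, OTHER) becomes
+	 * the four-entry Solaris minimal form by inserting an ACL_MASK
+	 * entry that duplicates ACL_GROUP_OBJ, as described at the top of
+	 * this file.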
*/ + posix_acl_init(acl2, 4); + + /* Insert entries in canonical order: other orders seem + to confuse Solaris VxFS. */ + acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ + acl2->a_entries[1] = acl->a_entries[1]; /* ACL_GROUP_OBJ */ + acl2->a_entries[2] = acl->a_entries[1]; /* ACL_MASK */ + acl2->a_entries[2].e_tag = ACL_MASK; + acl2->a_entries[3] = acl->a_entries[2]; /* ACL_OTHER */ + nfsacl_desc.acl = acl2; + } + err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); + if (!err) + err = 8 + nfsacl_desc.desc.elem_size * + nfsacl_desc.desc.array_len; + return err; +} + +struct nfsacl_decode_desc { + struct xdr_array2_desc desc; + unsigned int count; + struct posix_acl *acl; +}; + +static int +xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem) +{ + struct nfsacl_decode_desc *nfsacl_desc = + (struct nfsacl_decode_desc *) desc; + __be32 *p = elem; + struct posix_acl_entry *entry; + + if (!nfsacl_desc->acl) { + if (desc->array_len > NFS_ACL_MAX_ENTRIES) + return -EINVAL; + nfsacl_desc->acl = posix_acl_alloc(desc->array_len, GFP_KERNEL); + if (!nfsacl_desc->acl) + return -ENOMEM; + nfsacl_desc->count = 0; + } + + entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; + entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT; + entry->e_id = ntohl(*p++); + entry->e_perm = ntohl(*p++); + + switch(entry->e_tag) { + case ACL_USER_OBJ: + case ACL_USER: + case ACL_GROUP_OBJ: + case ACL_GROUP: + case ACL_OTHER: + if (entry->e_perm & ~S_IRWXO) + return -EINVAL; + break; + case ACL_MASK: + /* Solaris sometimes sets additional bits in the mask */ + entry->e_perm &= S_IRWXO; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +cmp_acl_entry(const void *x, const void *y) +{ + const struct posix_acl_entry *a = x, *b = y; + + if (a->e_tag != b->e_tag) + return a->e_tag - b->e_tag; + else if (a->e_id > b->e_id) + return 1; + else if (a->e_id < b->e_id) + return -1; + else + return 0; +} + +/* + * Convert from a Solaris ACL to a POSIX 1003.1e draft 17 ACL. + */ +static int +posix_acl_from_nfsacl(struct posix_acl *acl) +{ + struct posix_acl_entry *pa, *pe, + *group_obj = NULL, *mask = NULL; + + if (!acl) + return 0; + + sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry), + cmp_acl_entry, NULL); + + /* Clear undefined identifier fields and find the ACL_GROUP_OBJ + and ACL_MASK entries. */ + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_id = ACL_UNDEFINED_ID; + break; + case ACL_GROUP_OBJ: + pa->e_id = ACL_UNDEFINED_ID; + group_obj = pa; + break; + case ACL_MASK: + mask = pa; + /* fall through */ + case ACL_OTHER: + pa->e_id = ACL_UNDEFINED_ID; + break; + } + } + if (acl->a_count == 4 && group_obj && mask && + mask->e_perm == group_obj->e_perm) { + /* remove bogus ACL_MASK entry */ + memmove(mask, mask+1, (3 - (mask - acl->a_entries)) * + sizeof(struct posix_acl_entry)); + acl->a_count = 3; + } + return 0; +} + +/** + * nfsacl_decode - Decode an NFSv3 ACL + * + * @buf: xdr_buf containing XDR'd ACL data to decode + * @base: byte offset in xdr_buf where XDR'd ACL begins + * @aclcnt: count of ACEs in decoded posix_acl + * @pacl: buffer in which to place decoded posix_acl + * + * Returns the length of the decoded ACL in bytes, or a negative errno value. + */ +int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, + struct posix_acl **pacl) +{ + struct nfsacl_decode_desc nfsacl_desc = { + .desc = { + .elem_size = 12, + .xcode = pacl ? 
xdr_nfsace_decode : NULL, + }, + }; + u32 entries; + int err; + + if (xdr_decode_word(buf, base, &entries) || + entries > NFS_ACL_MAX_ENTRIES) + return -EINVAL; + nfsacl_desc.desc.array_maxlen = entries; + err = xdr_decode_array2(buf, base + 4, &nfsacl_desc.desc); + if (err) + return err; + if (pacl) { + if (entries != nfsacl_desc.desc.array_len || + posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) { + posix_acl_release(nfsacl_desc.acl); + return -EINVAL; + } + *pacl = nfsacl_desc.acl; + } + if (aclcnt) + *aclcnt = entries; + return 8 + nfsacl_desc.desc.elem_size * + nfsacl_desc.desc.array_len; +} diff --git a/fs/nfsctl.c b/fs/nfsctl.c new file mode 100644 index 00000000..124e8fcb --- /dev/null +++ b/fs/nfsctl.c @@ -0,0 +1,100 @@ +/* + * fs/nfsctl.c + * + * This should eventually move to userland. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * open a file on nfsd fs + */ + +static struct file *do_open(char *name, int flags) +{ + struct vfsmount *mnt; + struct file *file; + + mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); + if (IS_ERR(mnt)) + return (struct file *)mnt; + + file = file_open_root(mnt->mnt_root, mnt, name, flags); + + mntput(mnt); /* drop do_kern_mount reference */ + return file; +} + +static struct { + char *name; int wsize; int rsize; +} map[] = { + [NFSCTL_SVC] = { + .name = ".svc", + .wsize = sizeof(struct nfsctl_svc) + }, + [NFSCTL_ADDCLIENT] = { + .name = ".add", + .wsize = sizeof(struct nfsctl_client) + }, + [NFSCTL_DELCLIENT] = { + .name = ".del", + .wsize = sizeof(struct nfsctl_client) + }, + [NFSCTL_EXPORT] = { + .name = ".export", + .wsize = sizeof(struct nfsctl_export) + }, + [NFSCTL_UNEXPORT] = { + .name = ".unexport", + .wsize = sizeof(struct nfsctl_export) + }, + [NFSCTL_GETFD] = { + .name = ".getfd", + .wsize = sizeof(struct nfsctl_fdparm), + .rsize = NFS_FHSIZE + }, + [NFSCTL_GETFS] = { + .name = ".getfs", + .wsize = sizeof(struct nfsctl_fsparm), + .rsize = sizeof(struct knfsd_fh) + }, +}; + +SYSCALL_DEFINE3(nfsservctl, int, cmd, struct nfsctl_arg __user *, arg, + void __user *, res) +{ + struct file *file; + void __user *p = &arg->u; + int version; + int err; + + if (copy_from_user(&version, &arg->ca_version, sizeof(int))) + return -EFAULT; + + if (version != NFSCTL_VERSION) + return -EINVAL; + + if (cmd < 0 || cmd >= ARRAY_SIZE(map) || !map[cmd].name) + return -EINVAL; + + file = do_open(map[cmd].name, map[cmd].rsize ? O_RDWR : O_WRONLY); + if (IS_ERR(file)) + return PTR_ERR(file); + err = file->f_op->write(file, p, map[cmd].wsize, &file->f_pos); + if (err >= 0 && map[cmd].rsize) + err = file->f_op->read(file, res, map[cmd].rsize, &file->f_pos); + if (err >= 0) + err = 0; + fput(file); + return err; +} diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig new file mode 100644 index 00000000..fbb2a5ef --- /dev/null +++ b/fs/nfsd/Kconfig @@ -0,0 +1,94 @@ +config NFSD + tristate "NFS server support" + depends on INET + depends on FILE_LOCKING + select LOCKD + select SUNRPC + select EXPORTFS + select NFS_ACL_SUPPORT if NFSD_V2_ACL + help + Choose Y here if you want to allow other computers to access + files residing on this system using Sun's Network File System + protocol. To compile the NFS server support as a module, + choose M here: the module will be called nfsd. + + You may choose to use a user-space NFS server instead, in which + case you can choose N here. 
+ + To export local file systems using NFS, you also need to install + user space programs which can be found in the Linux nfs-utils + package, available from http://linux-nfs.org/. More detail about + the Linux NFS server implementation is available via the + exports(5) man page. + + Below you can choose which versions of the NFS protocol are + available to clients mounting the NFS server on this system. + Support for NFS version 2 (RFC 1094) is always available when + CONFIG_NFSD is selected. + + If unsure, say N. + +config NFSD_DEPRECATED + bool "Include support for deprecated syscall interface to NFSD" + depends on NFSD + default y + help + The syscall interface to nfsd was obsoleted in 2.6.0 by a new + filesystem-based interface. The old interface is due for removal + in 2.6.40. If you wish to remove the interface before then, + say N. + + If unsure, say Y. + +config NFSD_V2_ACL + bool + depends on NFSD + +config NFSD_V3 + bool "NFS server support for NFS version 3" + depends on NFSD + help + This option enables support in your system's NFS server for + version 3 of the NFS protocol (RFC 1813). + + If unsure, say Y. + +config NFSD_V3_ACL + bool "NFS server support for the NFSv3 ACL protocol extension" + depends on NFSD_V3 + select NFSD_V2_ACL + help + Solaris NFS servers support an auxiliary NFSv3 ACL protocol that + never became an official part of the NFS version 3 protocol. + This protocol extension allows applications on NFS clients to + manipulate POSIX Access Control Lists on files residing on NFS + servers. NFS servers enforce POSIX ACLs on local files whether + this protocol is available or not. + + This option enables support in your system's NFS server for the + NFSv3 ACL protocol extension allowing NFS clients to manipulate + POSIX ACLs on files exported by your system's NFS server. NFS + clients which support the Solaris NFSv3 ACL protocol can then + access and modify ACLs on your NFS server. + + To store ACLs on your NFS server, you also need to enable ACL- + related CONFIG options for your local file systems of choice. + + If unsure, say N. + +config NFSD_V4 + bool "NFS server support for NFS version 4 (EXPERIMENTAL)" + depends on NFSD && PROC_FS && EXPERIMENTAL + select NFSD_V3 + select FS_POSIX_ACL + select SUNRPC_GSS + select CRYPTO + help + This option enables support in your system's NFS server for + version 4 of the NFS protocol (RFC 3530). + + To export files using NFSv4, you need to install additional user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say N. diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile new file mode 100644 index 00000000..9b118ee2 --- /dev/null +++ b/fs/nfsd/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the Linux nfs server +# + +obj-$(CONFIG_NFSD) += nfsd.o + +nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o +nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o +nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o +nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o +nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h new file mode 100644 index 00000000..34e5c40a --- /dev/null +++ b/fs/nfsd/acl.h @@ -0,0 +1,59 @@ +/* + * Common NFSv4 ACL handling definitions. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved.
+ * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFS4_ACL_H +#define LINUX_NFS4_ACL_H + +#include + +/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to + * fit in a page: */ +#define NFS4_ACL_MAX 170 + +struct nfs4_acl *nfs4_acl_new(int); +int nfs4_acl_get_whotype(char *, u32); +int nfs4_acl_write_who(int who, char *p); +int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, + uid_t who, u32 mask); + +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 + +struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, + struct posix_acl *, unsigned int flags); +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, + struct posix_acl **, unsigned int flags); + +#endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c new file mode 100644 index 00000000..79717a40 --- /dev/null +++ b/fs/nfsd/auth.c @@ -0,0 +1,95 @@ +/* Copyright (C) 1995, 1996 Olaf Kirch */ + +#include +#include "nfsd.h" +#include "auth.h" + +int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_flavor) + return f->flags; + } + return exp->ex_flags; + +} + +int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct group_info *rqgi; + struct group_info *gi; + struct cred *new; + int i; + int flags = nfsexp_flags(rqstp, exp); + int ret; + + validate_process_creds(); + + /* discard any old override before preparing the new set */ + revert_creds(get_cred(current->real_cred)); + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = rqstp->rq_cred.cr_uid; + new->fsgid = rqstp->rq_cred.cr_gid; + + rqgi = rqstp->rq_cred.cr_group_info; + + if (flags & NFSEXP_ALLSQUASH) { + new->fsuid = exp->ex_anon_uid; + new->fsgid = exp->ex_anon_gid; + gi = groups_alloc(0); + if (!gi) + goto oom; + } else if (flags & NFSEXP_ROOTSQUASH) { + if (!new->fsuid) + new->fsuid = 
exp->ex_anon_uid; + if (!new->fsgid) + new->fsgid = exp->ex_anon_gid; + + gi = groups_alloc(rqgi->ngroups); + if (!gi) + goto oom; + + for (i = 0; i < rqgi->ngroups; i++) { + if (!GROUP_AT(rqgi, i)) + GROUP_AT(gi, i) = exp->ex_anon_gid; + else + GROUP_AT(gi, i) = GROUP_AT(rqgi, i); + } + } else { + gi = get_group_info(rqgi); + } + + if (new->fsuid == (uid_t) -1) + new->fsuid = exp->ex_anon_uid; + if (new->fsgid == (gid_t) -1) + new->fsgid = exp->ex_anon_gid; + + ret = set_groups(new, gi); + put_group_info(gi); + if (ret < 0) + goto error; + + if (new->fsuid) + new->cap_effective = cap_drop_nfsd_set(new->cap_effective); + else + new->cap_effective = cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + validate_process_creds(); + put_cred(override_creds(new)); + put_cred(new); + validate_process_creds(); + return 0; + +oom: + ret = -ENOMEM; +error: + abort_creds(new); + return ret; +} + diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h new file mode 100644 index 00000000..78b3c0e9 --- /dev/null +++ b/fs/nfsd/auth.h @@ -0,0 +1,22 @@ +/* + * nfsd-specific authentication stuff. + * uid/gid mapping not yet implemented. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef LINUX_NFSD_AUTH_H +#define LINUX_NFSD_AUTH_H + +#define nfsd_luid(rq, uid) ((u32)(uid)) +#define nfsd_lgid(rq, gid) ((u32)(gid)) +#define nfsd_ruid(rq, uid) ((u32)(uid)) +#define nfsd_rgid(rq, gid) ((u32)(gid)) + +/* + * Set the current process's fsuid/fsgid etc to those of the NFS + * client user + */ +int nfsd_setuser(struct svc_rqst *, struct svc_export *); + +#endif /* LINUX_NFSD_AUTH_H */ diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h new file mode 100644 index 00000000..d892be61 --- /dev/null +++ b/fs/nfsd/cache.h @@ -0,0 +1,83 @@ +/* + * Request reply cache. This was heavily inspired by the + * implementation in 4.3BSD/4.4BSD. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef NFSCACHE_H +#define NFSCACHE_H + +#include + +/* + * Representation of a reply cache entry. + */ +struct svc_cacherep { + struct hlist_node c_hash; + struct list_head c_lru; + + unsigned char c_state, /* unused, inprog, done */ + c_type, /* status, buffer */ + c_secure : 1; /* req came from port < 1024 */ + struct sockaddr_in c_addr; + __be32 c_xid; + u32 c_prot; + u32 c_proc; + u32 c_vers; + unsigned long c_timestamp; + union { + struct kvec u_vec; + __be32 u_status; + } c_u; +}; + +#define c_replvec c_u.u_vec +#define c_replstat c_u.u_status + +/* cache entry states */ +enum { + RC_UNUSED, + RC_INPROG, + RC_DONE +}; + +/* return values */ +enum { + RC_DROPIT, + RC_REPLY, + RC_DOIT, + RC_INTR +}; + +/* + * Cache types. + * We may want to add more types one day, e.g. for diropres and + * attrstat replies. Using cache entries with fixed length instead + * of buffer pointers may be more efficient. + */ +enum { + RC_NOCACHE, + RC_REPLSTAT, + RC_REPLBUFF, +}; + +/* + * If requests are retransmitted within this interval, they're dropped. 
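+ * RC_DELAY below is expressed in jiffies; HZ/5 corresponds to 200ms
+ * regardless of the kernel's HZ setting.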
+ */ +#define RC_DELAY (HZ/5) + +int nfsd_reply_cache_init(void); +void nfsd_reply_cache_shutdown(void); +int nfsd_cache_lookup(struct svc_rqst *, int); +void nfsd_cache_update(struct svc_rqst *, int, __be32 *); + +#ifdef CONFIG_NFSD_V4 +void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); +#else /* CONFIG_NFSD_V4 */ +static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) +{ +} +#endif /* CONFIG_NFSD_V4 */ + +#endif /* NFSCACHE_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c new file mode 100644 index 00000000..4b470f60 --- /dev/null +++ b/fs/nfsd/export.c @@ -0,0 +1,1693 @@ +/* + * NFS exporting and validation. + * + * We maintain a list of clients, each of which has a list of + * exports. To export an fs to a given client, you first have + * to create the client entry with NFSCTL_ADDCLIENT, which + * creates a client control block and adds it to the hash + * table. Then, you call NFSCTL_EXPORT for each fs. + * + * + * Copyright (C) 1995, 1996 Olaf Kirch, + */ + +#include +#include +#include +#include + +#include +#include + +#include "nfsd.h" +#include "nfsfh.h" + +#define NFSDDBG_FACILITY NFSDDBG_EXPORT + +typedef struct auth_domain svc_client; +typedef struct svc_export svc_export; + +/* + * We have two caches. + * One maps client+vfsmnt+dentry to export options - the export map + * The other maps client+filehandle-fragment to export options. - the expkey map + * + * The export options are actually stored in the first map, and the + * second map contains a reference to the entry in the first map. + */ + +#define EXPKEY_HASHBITS 8 +#define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) +#define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) +static struct cache_head *expkey_table[EXPKEY_HASHMAX]; + +static void expkey_put(struct kref *ref) +{ + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); + + if (test_bit(CACHE_VALID, &key->h.flags) && + !test_bit(CACHE_NEGATIVE, &key->h.flags)) + path_put(&key->ek_path); + auth_domain_put(key->ek_client); + kfree(key); +} + +static void expkey_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + /* client fsidtype \xfsid */ + struct svc_expkey *ek = container_of(h, struct svc_expkey, h); + char type[5]; + + qword_add(bpp, blen, ek->ek_client->name); + snprintf(type, 5, "%d", ek->ek_fsidtype); + qword_add(bpp, blen, type); + qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype)); + (*bpp)[-1] = '\n'; +} + +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, expkey_request); +} + +static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); +static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); +static struct cache_detail svc_expkey_cache; + +static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + /* client fsidtype fsid [path] */ + char *buf; + int len; + struct auth_domain *dom = NULL; + int err; + int fsidtype; + char *ep; + struct svc_expkey key; + struct svc_expkey *ek = NULL; + + if (mlen < 1 || mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + err = -ENOMEM; + if (!buf) + goto out; + + err = -EINVAL; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + + err = -ENOENT; + dom = auth_domain_find(buf); + if (!dom) + goto out; + dprintk("found domain %s\n", buf); + + err = -EINVAL; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + fsidtype = simple_strtoul(buf, 
&ep, 10); + if (*ep) + goto out; + dprintk("found fsidtype %d\n", fsidtype); + if (key_len(fsidtype)==0) /* invalid type */ + goto out; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + dprintk("found fsid length %d\n", len); + if (len != key_len(fsidtype)) + goto out; + + /* OK, we seem to have a valid key */ + key.h.flags = 0; + key.h.expiry_time = get_expiry(&mesg); + if (key.h.expiry_time == 0) + goto out; + + key.ek_client = dom; + key.ek_fsidtype = fsidtype; + memcpy(key.ek_fsid, buf, len); + + ek = svc_expkey_lookup(&key); + err = -ENOMEM; + if (!ek) + goto out; + + /* now we want a pathname, or empty meaning NEGATIVE */ + err = -EINVAL; + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len < 0) + goto out; + dprintk("Path seems to be <%s>\n", buf); + err = 0; + if (len == 0) { + set_bit(CACHE_NEGATIVE, &key.h.flags); + ek = svc_expkey_update(&key, ek); + if (!ek) + err = -ENOMEM; + } else { + err = kern_path(buf, 0, &key.ek_path); + if (err) + goto out; + + dprintk("Found the path %s\n", buf); + + ek = svc_expkey_update(&key, ek); + if (!ek) + err = -ENOMEM; + path_put(&key.ek_path); + } + cache_flush(); + out: + if (ek) + cache_put(&ek->h, &svc_expkey_cache); + if (dom) + auth_domain_put(dom); + kfree(buf); + return err; +} + +static int expkey_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct svc_expkey *ek ; + int i; + + if (h ==NULL) { + seq_puts(m, "#domain fsidtype fsid [path]\n"); + return 0; + } + ek = container_of(h, struct svc_expkey, h); + seq_printf(m, "%s %d 0x", ek->ek_client->name, + ek->ek_fsidtype); + for (i=0; i < key_len(ek->ek_fsidtype)/4; i++) + seq_printf(m, "%08x", ek->ek_fsid[i]); + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) { + seq_printf(m, " "); + seq_path(m, &ek->ek_path, "\\ \t\n"); + } + seq_printf(m, "\n"); + return 0; +} + +static inline int expkey_match (struct cache_head *a, struct cache_head *b) +{ + struct svc_expkey *orig = container_of(a, struct svc_expkey, h); + struct svc_expkey *new = container_of(b, struct svc_expkey, h); + + if (orig->ek_fsidtype != new->ek_fsidtype || + orig->ek_client != new->ek_client || + memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0) + return 0; + return 1; +} + +static inline void expkey_init(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + kref_get(&item->ek_client->ref); + new->ek_client = item->ek_client; + new->ek_fsidtype = item->ek_fsidtype; + + memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid)); +} + +static inline void expkey_update(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + new->ek_path = item->ek_path; + path_get(&item->ek_path); +} + +static struct cache_head *expkey_alloc(void) +{ + struct svc_expkey *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + +static struct cache_detail svc_expkey_cache = { + .owner = THIS_MODULE, + .hash_size = EXPKEY_HASHMAX, + .hash_table = expkey_table, + .name = "nfsd.fh", + .cache_put = expkey_put, + .cache_upcall = expkey_upcall, + .cache_parse = expkey_parse, + .cache_show = expkey_show, + .match = expkey_match, + .init = expkey_init, + .update = expkey_update, + .alloc = expkey_alloc, +}; + +static int 
+svc_expkey_hash(struct svc_expkey *item) +{ + int hash = item->ek_fsidtype; + char * cp = (char*)item->ek_fsid; + int len = key_len(item->ek_fsidtype); + + hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); + hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); + hash &= EXPKEY_HASHMASK; + return hash; +} + +static struct svc_expkey * +svc_expkey_lookup(struct svc_expkey *item) +{ + struct cache_head *ch; + int hash = svc_expkey_hash(item); + + ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, + hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; +} + +static struct svc_expkey * +svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) +{ + struct cache_head *ch; + int hash = svc_expkey_hash(new); + + ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, + &old->h, hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; +} + + +#define EXPORT_HASHBITS 8 +#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) + +static struct cache_head *export_table[EXPORT_HASHMAX]; + +static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) +{ + int i; + + for (i = 0; i < fsloc->locations_count; i++) { + kfree(fsloc->locations[i].path); + kfree(fsloc->locations[i].hosts); + } + kfree(fsloc->locations); +} + +static void svc_export_put(struct kref *ref) +{ + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); + path_put(&exp->ex_path); + auth_domain_put(exp->ex_client); + kfree(exp->ex_pathname); + nfsd4_fslocs_free(&exp->ex_fslocs); + kfree(exp); +} + +static void svc_export_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + /* client path */ + struct svc_export *exp = container_of(h, struct svc_export, h); + char *pth; + + qword_add(bpp, blen, exp->ex_client->name); + pth = d_path(&exp->ex_path, *bpp, *blen); + if (IS_ERR(pth)) { + /* is this correct? */ + (*bpp)[0] = '\n'; + return; + } + qword_add(bpp, blen, pth); + (*bpp)[-1] = '\n'; +} + +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); +} + +static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); +static struct svc_export *svc_export_lookup(struct svc_export *); + +static int check_export(struct inode *inode, int *flags, unsigned char *uuid) +{ + + /* + * We currently export only dirs, regular files, and (for v4 + * pseudoroot) symlinks. + */ + if (!S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode) && + !S_ISREG(inode->i_mode)) + return -ENOTDIR; + + /* + * Mountd should never pass down a writeable V4ROOT export, but, + * just to make sure: + */ + if (*flags & NFSEXP_V4ROOT) + *flags |= NFSEXP_READONLY; + + /* There are two requirements on a filesystem to be exportable. + * 1: We must be able to identify the filesystem from a number. + * either a device number (so FS_REQUIRES_DEV needed) + * or an FSID number (so NFSEXP_FSID or ->uuid is needed). + * 2: We must be able to find an inode from a filehandle. + * This means that s_export_op must be set. 
+ */ + if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && + !(*flags & NFSEXP_FSID) && + uuid == NULL) { + dprintk("exp_export: export of non-dev fs without fsid\n"); + return -EINVAL; + } + + if (!inode->i_sb->s_export_op || + !inode->i_sb->s_export_op->fh_to_dentry) { + dprintk("exp_export: export of invalid fs type.\n"); + return -EINVAL; + } + + return 0; + +} + +#ifdef CONFIG_NFSD_V4 + +static int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) +{ + int len; + int migrated, i, err; + + /* listsize */ + err = get_int(mesg, &fsloc->locations_count); + if (err) + return err; + if (fsloc->locations_count > MAX_FS_LOCATIONS) + return -EINVAL; + if (fsloc->locations_count == 0) + return 0; + + fsloc->locations = kzalloc(fsloc->locations_count + * sizeof(struct nfsd4_fs_location), GFP_KERNEL); + if (!fsloc->locations) + return -ENOMEM; + for (i=0; i < fsloc->locations_count; i++) { + /* colon separated host list */ + err = -EINVAL; + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].hosts) + goto out_free_all; + err = -EINVAL; + /* slash separated path component list */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].path) + goto out_free_all; + } + /* migrated */ + err = get_int(mesg, &migrated); + if (err) + goto out_free_all; + err = -EINVAL; + if (migrated < 0 || migrated > 1) + goto out_free_all; + fsloc->migrated = migrated; + return 0; +out_free_all: + nfsd4_fslocs_free(fsloc); + return err; +} + +static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) +{ + int listsize, err; + struct exp_flavor_info *f; + + err = get_int(mesg, &listsize); + if (err) + return err; + if (listsize < 0 || listsize > MAX_SECINFO_LIST) + return -EINVAL; + + for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { + err = get_int(mesg, &f->pseudoflavor); + if (err) + return err; + /* + * XXX: It would be nice to also check whether this + * pseudoflavor is supported, so we can discover the + * problem at export time instead of when a client fails + * to authenticate. 
+ */ + err = get_int(mesg, &f->flags); + if (err) + return err; + /* Only some flags are allowed to differ between flavors: */ + if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags)) + return -EINVAL; + } + exp->ex_nflavors = listsize; + return 0; +} + +#else /* CONFIG_NFSD_V4 */ +static inline int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;} +static inline int +secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } +#endif + +static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + /* client path expiry [flags anonuid anongid fsid] */ + char *buf; + int len; + int err; + struct auth_domain *dom = NULL; + struct svc_export exp = {}, *expp; + int an_int; + + if (mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* client */ + err = -EINVAL; + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out; + + err = -ENOENT; + dom = auth_domain_find(buf); + if (!dom) + goto out; + + /* path */ + err = -EINVAL; + if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out1; + + err = kern_path(buf, 0, &exp.ex_path); + if (err) + goto out1; + + exp.ex_client = dom; + + err = -ENOMEM; + exp.ex_pathname = kstrdup(buf, GFP_KERNEL); + if (!exp.ex_pathname) + goto out2; + + /* expiry */ + err = -EINVAL; + exp.h.expiry_time = get_expiry(&mesg); + if (exp.h.expiry_time == 0) + goto out3; + + /* flags */ + err = get_int(&mesg, &an_int); + if (err == -ENOENT) { + err = 0; + set_bit(CACHE_NEGATIVE, &exp.h.flags); + } else { + if (err || an_int < 0) + goto out3; + exp.ex_flags= an_int; + + /* anon uid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_anon_uid= an_int; + + /* anon gid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_anon_gid= an_int; + + /* fsid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_fsid = an_int; + + while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { + if (strcmp(buf, "fsloc") == 0) + err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); + else if (strcmp(buf, "uuid") == 0) { + /* expect a 16 byte uuid encoded as \xXXXX... */ + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len != 16) + err = -EINVAL; + else { + exp.ex_uuid = + kmemdup(buf, 16, GFP_KERNEL); + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else + /* quietly ignore unknown words and anything + * following. Newer user-space can try to set + * new values, then see what the result was. 
+ */ + break; + if (err) + goto out4; + } + + err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags, + exp.ex_uuid); + if (err) + goto out4; + } + + expp = svc_export_lookup(&exp); + if (expp) + expp = svc_export_update(&exp, expp); + else + err = -ENOMEM; + cache_flush(); + if (expp == NULL) + err = -ENOMEM; + else + exp_put(expp); +out4: + nfsd4_fslocs_free(&exp.ex_fslocs); + kfree(exp.ex_uuid); +out3: + kfree(exp.ex_pathname); +out2: + path_put(&exp.ex_path); +out1: + auth_domain_put(dom); +out: + kfree(buf); + return err; +} + +static void exp_flags(struct seq_file *m, int flag, int fsid, + uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs); +static void show_secinfo(struct seq_file *m, struct svc_export *exp); + +static int svc_export_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct svc_export *exp ; + + if (h ==NULL) { + seq_puts(m, "#path domain(flags)\n"); + return 0; + } + exp = container_of(h, struct svc_export, h); + seq_path(m, &exp->ex_path, " \t\n\\"); + seq_putc(m, '\t'); + seq_escape(m, exp->ex_client->name, " \t\n\\"); + seq_putc(m, '('); + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) { + exp_flags(m, exp->ex_flags, exp->ex_fsid, + exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); + if (exp->ex_uuid) { + int i; + seq_puts(m, ",uuid="); + for (i=0; i<16; i++) { + if ((i&3) == 0 && i) + seq_putc(m, ':'); + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); + return 0; +} +static int svc_export_match(struct cache_head *a, struct cache_head *b) +{ + struct svc_export *orig = container_of(a, struct svc_export, h); + struct svc_export *new = container_of(b, struct svc_export, h); + return orig->ex_client == new->ex_client && + orig->ex_path.dentry == new->ex_path.dentry && + orig->ex_path.mnt == new->ex_path.mnt; +} + +static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + + kref_get(&item->ex_client->ref); + new->ex_client = item->ex_client; + new->ex_path.dentry = dget(item->ex_path.dentry); + new->ex_path.mnt = mntget(item->ex_path.mnt); + new->ex_pathname = NULL; + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; +} + +static void export_update(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + int i; + + new->ex_flags = item->ex_flags; + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; + item->ex_pathname = NULL; + new->ex_fslocs.locations = item->ex_fslocs.locations; + item->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; + item->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = item->ex_fslocs.migrated; + item->ex_fslocs.migrated = 0; + new->ex_nflavors = item->ex_nflavors; + for (i = 0; i < MAX_SECINFO_LIST; i++) { + new->ex_flavors[i] = item->ex_flavors[i]; + } +} + +static struct cache_head *svc_export_alloc(void) +{ + struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + +struct cache_detail 
svc_export_cache = { + .owner = THIS_MODULE, + .hash_size = EXPORT_HASHMAX, + .hash_table = export_table, + .name = "nfsd.export", + .cache_put = svc_export_put, + .cache_upcall = svc_export_upcall, + .cache_parse = svc_export_parse, + .cache_show = svc_export_show, + .match = svc_export_match, + .init = svc_export_init, + .update = export_update, + .alloc = svc_export_alloc, +}; + +static int +svc_export_hash(struct svc_export *exp) +{ + int hash; + + hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); + return hash; +} + +static struct svc_export * +svc_export_lookup(struct svc_export *exp) +{ + struct cache_head *ch; + int hash = svc_export_hash(exp); + + ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, + hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; +} + +static struct svc_export * +svc_export_update(struct svc_export *new, struct svc_export *old) +{ + struct cache_head *ch; + int hash = svc_export_hash(old); + + ch = sunrpc_cache_update(&svc_export_cache, &new->h, + &old->h, + hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; +} + + +static struct svc_expkey * +exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) +{ + struct svc_expkey key, *ek; + int err; + + if (!clp) + return ERR_PTR(-ENOENT); + + key.ek_client = clp; + key.ek_fsidtype = fsid_type; + memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); + + ek = svc_expkey_lookup(&key); + if (ek == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(&svc_expkey_cache, &ek->h, reqp); + if (err) + return ERR_PTR(err); + return ek; +} + +#ifdef CONFIG_NFSD_DEPRECATED +static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, + struct svc_export *exp) +{ + struct svc_expkey key, *ek; + + key.ek_client = clp; + key.ek_fsidtype = fsid_type; + memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); + key.ek_path = exp->ex_path; + key.h.expiry_time = NEVER; + key.h.flags = 0; + + ek = svc_expkey_lookup(&key); + if (ek) + ek = svc_expkey_update(&key,ek); + if (ek) { + cache_put(&ek->h, &svc_expkey_cache); + return 0; + } + return -ENOMEM; +} + +/* + * Find the client's export entry matching xdev/xino. + */ +static inline struct svc_expkey * +exp_get_key(svc_client *clp, dev_t dev, ino_t ino) +{ + u32 fsidv[3]; + + if (old_valid_dev(dev)) { + mk_fsid(FSID_DEV, fsidv, dev, ino, 0, NULL); + return exp_find_key(clp, FSID_DEV, fsidv, NULL); + } + mk_fsid(FSID_ENCODE_DEV, fsidv, dev, ino, 0, NULL); + return exp_find_key(clp, FSID_ENCODE_DEV, fsidv, NULL); +} + +/* + * Find the client's export entry matching fsid + */ +static inline struct svc_expkey * +exp_get_fsid_key(svc_client *clp, int fsid) +{ + u32 fsidv[2]; + + mk_fsid(FSID_NUM, fsidv, 0, 0, fsid, NULL); + + return exp_find_key(clp, FSID_NUM, fsidv, NULL); +} +#endif + +static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, + struct cache_req *reqp) +{ + struct svc_export *exp, key; + int err; + + if (!clp) + return ERR_PTR(-ENOENT); + + key.ex_client = clp; + key.ex_path = *path; + + exp = svc_export_lookup(&key); + if (exp == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(&svc_export_cache, &exp->h, reqp); + if (err) + return ERR_PTR(err); + return exp; +} + +/* + * Find the export entry for a given dentry. 
+ */ +static struct svc_export *exp_parent(svc_client *clp, struct path *path) +{ + struct dentry *saved = dget(path->dentry); + svc_export *exp = exp_get_by_name(clp, path, NULL); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = exp_get_by_name(clp, path, NULL); + } + dput(path->dentry); + path->dentry = saved; + return exp; +} + +#ifdef CONFIG_NFSD_DEPRECATED +/* + * Hashtable locking. Write locks are placed only by user processes + * wanting to modify export information. + * Write locking only done in this file. Read locking + * needed externally. + */ + +static DECLARE_RWSEM(hash_sem); + +void +exp_readlock(void) +{ + down_read(&hash_sem); +} + +static inline void +exp_writelock(void) +{ + down_write(&hash_sem); +} + +void +exp_readunlock(void) +{ + up_read(&hash_sem); +} + +static inline void +exp_writeunlock(void) +{ + up_write(&hash_sem); +} +#else + +/* hash_sem not needed once deprecated interface is removed */ +void exp_readlock(void) {} +static inline void exp_writelock(void){} +void exp_readunlock(void) {} +static inline void exp_writeunlock(void){} + +#endif + +#ifdef CONFIG_NFSD_DEPRECATED +static void exp_do_unexport(svc_export *unexp); +static int exp_verify_string(char *cp, int max); + +static void exp_fsid_unhash(struct svc_export *exp) +{ + struct svc_expkey *ek; + + if ((exp->ex_flags & NFSEXP_FSID) == 0) + return; + + ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); + if (!IS_ERR(ek)) { + sunrpc_invalidate(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); + } +} + +static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) +{ + u32 fsid[2]; + + if ((exp->ex_flags & NFSEXP_FSID) == 0) + return 0; + + mk_fsid(FSID_NUM, fsid, 0, 0, exp->ex_fsid, NULL); + return exp_set_key(clp, FSID_NUM, fsid, exp); +} + +static int exp_hash(struct auth_domain *clp, struct svc_export *exp) +{ + u32 fsid[2]; + struct inode *inode = exp->ex_path.dentry->d_inode; + dev_t dev = inode->i_sb->s_dev; + + if (old_valid_dev(dev)) { + mk_fsid(FSID_DEV, fsid, dev, inode->i_ino, 0, NULL); + return exp_set_key(clp, FSID_DEV, fsid, exp); + } + mk_fsid(FSID_ENCODE_DEV, fsid, dev, inode->i_ino, 0, NULL); + return exp_set_key(clp, FSID_ENCODE_DEV, fsid, exp); +} + +static void exp_unhash(struct svc_export *exp) +{ + struct svc_expkey *ek; + struct inode *inode = exp->ex_path.dentry->d_inode; + + ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); + if (!IS_ERR(ek)) { + sunrpc_invalidate(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); + } +} + +/* + * Export a file system. 
+ */ +int +exp_export(struct nfsctl_export *nxp) +{ + svc_client *clp; + struct svc_export *exp = NULL; + struct svc_export new; + struct svc_expkey *fsid_key = NULL; + struct path path; + int err; + + /* Consistency check */ + err = -EINVAL; + if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) || + !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX)) + goto out; + + dprintk("exp_export called for %s:%s (%x/%ld fl %x).\n", + nxp->ex_client, nxp->ex_path, + (unsigned)nxp->ex_dev, (long)nxp->ex_ino, + nxp->ex_flags); + + /* Try to lock the export table for update */ + exp_writelock(); + + /* Look up client info */ + if (!(clp = auth_domain_find(nxp->ex_client))) + goto out_unlock; + + + /* Look up the dentry */ + err = kern_path(nxp->ex_path, 0, &path); + if (err) + goto out_put_clp; + err = -EINVAL; + + exp = exp_get_by_name(clp, &path, NULL); + + memset(&new, 0, sizeof(new)); + + /* must make sure there won't be an ex_fsid clash */ + if ((nxp->ex_flags & NFSEXP_FSID) && + (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) && + fsid_key->ek_path.mnt && + (fsid_key->ek_path.mnt != path.mnt || + fsid_key->ek_path.dentry != path.dentry)) + goto finish; + + if (!IS_ERR(exp)) { + /* just a flags/id/fsid update */ + + exp_fsid_unhash(exp); + exp->ex_flags = nxp->ex_flags; + exp->ex_anon_uid = nxp->ex_anon_uid; + exp->ex_anon_gid = nxp->ex_anon_gid; + exp->ex_fsid = nxp->ex_dev; + + err = exp_fsid_hash(clp, exp); + goto finish; + } + + err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL); + if (err) goto finish; + + err = -ENOMEM; + + dprintk("nfsd: creating export entry %p for client %p\n", exp, clp); + + new.h.expiry_time = NEVER; + new.h.flags = 0; + new.ex_pathname = kstrdup(nxp->ex_path, GFP_KERNEL); + if (!new.ex_pathname) + goto finish; + new.ex_client = clp; + new.ex_path = path; + new.ex_flags = nxp->ex_flags; + new.ex_anon_uid = nxp->ex_anon_uid; + new.ex_anon_gid = nxp->ex_anon_gid; + new.ex_fsid = nxp->ex_dev; + + exp = svc_export_lookup(&new); + if (exp) + exp = svc_export_update(&new, exp); + + if (!exp) + goto finish; + + if (exp_hash(clp, exp) || + exp_fsid_hash(clp, exp)) { + /* failed to create at least one index */ + exp_do_unexport(exp); + cache_flush(); + } else + err = 0; +finish: + kfree(new.ex_pathname); + if (!IS_ERR_OR_NULL(exp)) + exp_put(exp); + if (!IS_ERR_OR_NULL(fsid_key)) + cache_put(&fsid_key->h, &svc_expkey_cache); + path_put(&path); +out_put_clp: + auth_domain_put(clp); +out_unlock: + exp_writeunlock(); +out: + return err; +} + +/* + * Unexport a file system. The export entry has already + * been removed from the client's list of exported fs's. + */ +static void +exp_do_unexport(svc_export *unexp) +{ + sunrpc_invalidate(&unexp->h, &svc_export_cache); + exp_unhash(unexp); + exp_fsid_unhash(unexp); +} + + +/* + * unexport syscall. 
+ */ +int +exp_unexport(struct nfsctl_export *nxp) +{ + struct auth_domain *dom; + svc_export *exp; + struct path path; + int err; + + /* Consistency check */ + if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) || + !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX)) + return -EINVAL; + + exp_writelock(); + + err = -EINVAL; + dom = auth_domain_find(nxp->ex_client); + if (!dom) { + dprintk("nfsd: unexport couldn't find %s\n", nxp->ex_client); + goto out_unlock; + } + + err = kern_path(nxp->ex_path, 0, &path); + if (err) + goto out_domain; + + err = -EINVAL; + exp = exp_get_by_name(dom, &path, NULL); + path_put(&path); + if (IS_ERR(exp)) + goto out_domain; + + exp_do_unexport(exp); + exp_put(exp); + err = 0; + +out_domain: + auth_domain_put(dom); + cache_flush(); +out_unlock: + exp_writeunlock(); + return err; +} +#endif /* CONFIG_NFSD_DEPRECATED */ + +/* + * Obtain the root fh on behalf of a client. + * This could be done in user space, but I feel that it adds some safety + * since its harder to fool a kernel module than a user space program. + */ +int +exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize) +{ + struct svc_export *exp; + struct path path; + struct inode *inode; + struct svc_fh fh; + int err; + + err = -EPERM; + /* NB: we probably ought to check that it's NUL-terminated */ + if (kern_path(name, 0, &path)) { + printk("nfsd: exp_rootfh path not found %s", name); + return err; + } + inode = path.dentry->d_inode; + + dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", + name, path.dentry, clp->name, + inode->i_sb->s_id, inode->i_ino); + exp = exp_parent(clp, &path); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto out; + } + + /* + * fh must be initialized before calling fh_compose + */ + fh_init(&fh, maxsize); + if (fh_compose(&fh, exp, path.dentry, NULL)) + err = -EINVAL; + else + err = 0; + memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh)); + fh_put(&fh); + exp_put(exp); +out: + path_put(&path); + return err; +} + +static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type, + u32 *fsidv, struct cache_req *reqp) +{ + struct svc_export *exp; + struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp); + if (IS_ERR(ek)) + return ERR_CAST(ek); + + exp = exp_get_by_name(clp, &ek->ek_path, reqp); + cache_put(&ek->h, &svc_expkey_cache); + + if (IS_ERR(exp)) + return ERR_CAST(exp); + return exp; +} + +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + /* legacy gss-only clients are always OK: */ + if (exp->ex_client == rqstp->rq_gssclient) + return 0; + /* ip-address based client; check sec= export option: */ + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_flavor) + return 0; + } + /* defaults in absence of sec= options: */ + if (exp->ex_nflavors == 0) { + if (rqstp->rq_flavor == RPC_AUTH_NULL || + rqstp->rq_flavor == RPC_AUTH_UNIX) + return 0; + } + return nfserr_wrongsec; +} + +/* + * Uses rq_client and rq_gssclient to find an export; uses rq_client (an + * auth_unix client) if it's available and has secinfo information; + * otherwise, will try to use rq_gssclient. + * + * Called from functions that handle requests; functions that do work on + * behalf of mountd are passed a single client name to use, and should + * use exp_get_by_name() or exp_find(). 
+ */ +struct svc_export * +rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +{ + struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); + + if (rqstp->rq_client == NULL) + goto gss; + + /* First try the auth_unix client: */ + exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (!IS_ERR(exp)) + exp_put(exp); + return gssexp; +} + +struct svc_export * +rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) +{ + struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); + + if (rqstp->rq_client == NULL) + goto gss; + + /* First try the auth_unix client: */ + exp = exp_find(rqstp->rq_client, fsid_type, fsidv, &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_find(rqstp->rq_gssclient, fsid_type, fsidv, + &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (!IS_ERR(exp)) + exp_put(exp); + return gssexp; +} + +struct svc_export * +rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) +{ + struct dentry *saved = dget(path->dentry); + struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = rqst_exp_get_by_name(rqstp, path); + } + dput(path->dentry); + path->dentry = saved; + return exp; +} + +static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) +{ + u32 fsidv[2]; + + mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); + + return rqst_exp_find(rqstp, FSID_NUM, fsidv); +} + +/* + * Called when we need the filehandle for the root of the pseudofs, + * for a given NFSv4 client. 
The root is defined to be the + * export point with fsid==0 + */ +__be32 +exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) +{ + struct svc_export *exp; + __be32 rv; + + exp = find_fsidzero_export(rqstp); + if (IS_ERR(exp)) + return nfserrno(PTR_ERR(exp)); + rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); + exp_put(exp); + return rv; +} + +/* Iterator */ + +static void *e_start(struct seq_file *m, loff_t *pos) + __acquires(svc_export_cache.hash_lock) +{ + loff_t n = *pos; + unsigned hash, export; + struct cache_head *ch; + + exp_readlock(); + read_lock(&svc_export_cache.hash_lock); + if (!n--) + return SEQ_START_TOKEN; + hash = n >> 32; + export = n & ((1LL<<32) - 1); + + + for (ch=export_table[hash]; ch; ch=ch->next) + if (!export--) + return ch; + n &= ~((1LL<<32) - 1); + do { + hash++; + n += 1LL<<32; + } while(hash < EXPORT_HASHMAX && export_table[hash]==NULL); + if (hash >= EXPORT_HASHMAX) + return NULL; + *pos = n+1; + return export_table[hash]; +} + +static void *e_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cache_head *ch = p; + int hash = (*pos >> 32); + + if (p == SEQ_START_TOKEN) + hash = 0; + else if (ch->next == NULL) { + hash++; + *pos += 1LL<<32; + } else { + ++*pos; + return ch->next; + } + *pos &= ~((1LL<<32) - 1); + while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) { + hash++; + *pos += 1LL<<32; + } + if (hash >= EXPORT_HASHMAX) + return NULL; + ++*pos; + return export_table[hash]; +} + +static void e_stop(struct seq_file *m, void *p) + __releases(svc_export_cache.hash_lock) +{ + read_unlock(&svc_export_cache.hash_lock); + exp_readunlock(); +} + +static struct flags { + int flag; + char *name[2]; +} expflags[] = { + { NFSEXP_READONLY, {"ro", "rw"}}, + { NFSEXP_INSECURE_PORT, {"insecure", ""}}, + { NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}}, + { NFSEXP_ALLSQUASH, {"all_squash", ""}}, + { NFSEXP_ASYNC, {"async", "sync"}}, + { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, + { NFSEXP_NOHIDE, {"nohide", ""}}, + { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, + { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, + { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_V4ROOT, {"v4root", ""}}, + { 0, {"", ""}} +}; + +static void show_expflags(struct seq_file *m, int flags, int mask) +{ + struct flags *flg; + int state, first = 0; + + for (flg = expflags; flg->flag; flg++) { + if (flg->flag & ~mask) + continue; + state = (flg->flag & flags) ? 
0 : 1; + if (*flg->name[state]) + seq_printf(m, "%s%s", first++?",":"", flg->name[state]); + } +} + +static void show_secinfo_flags(struct seq_file *m, int flags) +{ + seq_printf(m, ","); + show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); +} + +static bool secinfo_flags_equal(int f, int g) +{ + f &= NFSEXP_SECINFO_FLAGS; + g &= NFSEXP_SECINFO_FLAGS; + return f == g; +} + +static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end) +{ + int flags; + + flags = (*fp)->flags; + seq_printf(m, ",sec=%d", (*fp)->pseudoflavor); + (*fp)++; + while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) { + seq_printf(m, ":%d", (*fp)->pseudoflavor); + (*fp)++; + } + return flags; +} + +static void show_secinfo(struct seq_file *m, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + int flags; + + if (exp->ex_nflavors == 0) + return; + f = exp->ex_flavors; + flags = show_secinfo_run(m, &f, end); + if (!secinfo_flags_equal(flags, exp->ex_flags)) + show_secinfo_flags(m, flags); + while (f != end) { + flags = show_secinfo_run(m, &f, end); + show_secinfo_flags(m, flags); + } +} + +static void exp_flags(struct seq_file *m, int flag, int fsid, + uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc) +{ + show_expflags(m, flag, NFSEXP_ALLFLAGS); + if (flag & NFSEXP_FSID) + seq_printf(m, ",fsid=%d", fsid); + if (anonu != (uid_t)-2 && anonu != (0x10000-2)) + seq_printf(m, ",anonuid=%u", anonu); + if (anong != (gid_t)-2 && anong != (0x10000-2)) + seq_printf(m, ",anongid=%u", anong); + if (fsloc && fsloc->locations_count > 0) { + char *loctype = (fsloc->migrated) ? "refer" : "replicas"; + int i; + + seq_printf(m, ",%s=", loctype); + seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); + for (i = 1; i < fsloc->locations_count; i++) { + seq_putc(m, ';'); + seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\"); + } + } +} + +static int e_show(struct seq_file *m, void *p) +{ + struct cache_head *cp = p; + struct svc_export *exp = container_of(cp, struct svc_export, h); + + if (p == SEQ_START_TOKEN) { + seq_puts(m, "# Version 1.1\n"); + seq_puts(m, "# Path Client(Flags) # IPs\n"); + return 0; + } + + cache_get(&exp->h); + if (cache_check(&svc_export_cache, &exp->h, NULL)) + return 0; + cache_put(&exp->h, &svc_export_cache); + return svc_export_show(m, &svc_export_cache, cp); +} + +const struct seq_operations nfs_exports_op = { + .start = e_start, + .next = e_next, + .stop = e_stop, + .show = e_show, +}; + +#ifdef CONFIG_NFSD_DEPRECATED +/* + * Add or modify a client. + * Change requests may involve the list of host addresses. The list of + * exports and possibly existing uid maps are left untouched. + */ +int +exp_addclient(struct nfsctl_client *ncp) +{ + struct auth_domain *dom; + int i, err; + struct in6_addr addr6; + + /* First, consistency check. */ + err = -EINVAL; + if (! exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX)) + goto out; + if (ncp->cl_naddr > NFSCLNT_ADDRMAX) + goto out; + + /* Lock the hashtable */ + exp_writelock(); + + dom = unix_domain_find(ncp->cl_ident); + + err = -ENOMEM; + if (!dom) + goto out_unlock; + + /* Insert client into hashtable. 
*/ + for (i = 0; i < ncp->cl_naddr; i++) { + ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); + auth_unix_add_addr(&init_net, &addr6, dom); + } + auth_unix_forget_old(dom); + auth_domain_put(dom); + + err = 0; + +out_unlock: + exp_writeunlock(); +out: + return err; +} + +/* + * Delete a client given an identifier. + */ +int +exp_delclient(struct nfsctl_client *ncp) +{ + int err; + struct auth_domain *dom; + + err = -EINVAL; + if (!exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX)) + goto out; + + /* Lock the hashtable */ + exp_writelock(); + + dom = auth_domain_find(ncp->cl_ident); + /* just make sure that no addresses work + * and that it will expire soon + */ + if (dom) { + err = auth_unix_forget_old(dom); + auth_domain_put(dom); + } + + exp_writeunlock(); +out: + return err; +} + +/* + * Verify that string is non-empty and does not exceed max length. + */ +static int +exp_verify_string(char *cp, int max) +{ + int i; + + for (i = 0; i < max; i++) + if (!cp[i]) + return i; + cp[i] = 0; + printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); + return 0; +} +#endif /* CONFIG_NFSD_DEPRECATED */ + +/* + * Initialize the exports module. + */ +int +nfsd_export_init(void) +{ + int rv; + dprintk("nfsd: initializing export module.\n"); + + rv = cache_register(&svc_export_cache); + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); + if (rv) + cache_unregister(&svc_export_cache); + return rv; + +} + +/* + * Flush exports table - called when last nfsd thread is killed + */ +void +nfsd_export_flush(void) +{ + exp_writelock(); + cache_purge(&svc_expkey_cache); + cache_purge(&svc_export_cache); + exp_writeunlock(); +} + +/* + * Shutdown the exports module. + */ +void +nfsd_export_shutdown(void) +{ + + dprintk("nfsd: shutting down export module.\n"); + + exp_writelock(); + + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); + + exp_writeunlock(); + dprintk("nfsd: export shutdown complete.\n"); +} diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h new file mode 100644 index 00000000..2f3be132 --- /dev/null +++ b/fs/nfsd/idmap.h @@ -0,0 +1,62 @@ +/* + * Mapping of UID to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. +> * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFSD_IDMAP_H
+#define LINUX_NFSD_IDMAP_H
+
+#include <linux/in.h>
+#include <linux/sunrpc/svc.h>
+
+/* XXX from linux/nfs_idmap.h */
+#define IDMAP_NAMESZ 128
+
+#ifdef CONFIG_NFSD_V4
+int nfsd_idmap_init(void);
+void nfsd_idmap_shutdown(void);
+#else
+static inline int nfsd_idmap_init(void)
+{
+	return 0;
+}
+static inline void nfsd_idmap_shutdown(void)
+{
+}
+#endif
+
+__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
+__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
+int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
+int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
+
+#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
new file mode 100644
index 00000000..7c831a27
--- /dev/null
+++ b/fs/nfsd/lockd.c
@@ -0,0 +1,79 @@
+/*
+ * This file contains all the stubs needed when communicating with lockd.
+ * This level of indirection is necessary so we can run nfsd+lockd without
+ * requiring the nfs client to be compiled in/loaded, and vice versa.
+ *
+ * Copyright (C) 1996, Olaf Kirch
+ */
+
+#include <linux/file.h>
+#include <linux/lockd/bind.h>
+#include "nfsd.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_LOCKD
+
+#ifdef CONFIG_LOCKD_V4
+#define nlm_stale_fh	nlm4_stale_fh
+#define nlm_failed	nlm4_failed
+#else
+#define nlm_stale_fh	nlm_lck_denied_nolocks
+#define nlm_failed	nlm_lck_denied_nolocks
+#endif
+/*
+ * Note: we hold the dentry use count while the file is open.
+ */
+static __be32
+nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
+{
+	__be32 nfserr;
+	struct svc_fh fh;
+
+	/* must initialize before using! but maxsize doesn't matter */
+	fh_init(&fh,0);
+	fh.fh_handle.fh_size = f->size;
+	memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
+	fh.fh_export = NULL;
+
+	exp_readlock();
+	nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
+	fh_put(&fh);
+	exp_readunlock();
+	/* We return nlm error codes as nlm doesn't know
+	 * about nfsd, but nfsd does know about nlm..
+	 */
+	switch (nfserr) {
+	case nfs_ok:
+		return 0;
+	case nfserr_dropit:
+		return nlm_drop_reply;
+	case nfserr_stale:
+		return nlm_stale_fh;
+	default:
+		return nlm_failed;
+	}
+}
+
+static void
+nlm_fclose(struct file *filp)
+{
+	fput(filp);
+}
+
+static struct nlmsvc_binding nfsd_nlm_ops = {
+	.fopen		= nlm_fopen,		/* open file for locking */
+	.fclose		= nlm_fclose,		/* close file */
+};
+
+void
+nfsd_lockd_init(void)
+{
+	dprintk("nfsd: initializing lockd\n");
+	nlmsvc_ops = &nfsd_nlm_ops;
+}
+
+void
+nfsd_lockd_shutdown(void)
+{
+	nlmsvc_ops = NULL;
+}
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
new file mode 100644
index 00000000..6aa5590c
--- /dev/null
+++ b/fs/nfsd/nfs2acl.c
@@ -0,0 +1,356 @@
+/*
+ * Process version 2 NFSACL requests.
+ * + * Copyright (C) 2002-2003 Andreas Gruenbacher + */ + +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ +#include +#include +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +/* + * NULL call. + */ +static __be32 +nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get the Access and/or Default ACL of a file. + */ +static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, + struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) +{ + svc_fh *fh; + struct posix_acl *acl; + __be32 nfserr = 0; + + dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + RETURN_STATUS(nfserr_inval); + resp->mask = argp->mask; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + + struct inode *inode = fh->fh_dentry->d_inode; + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + + acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfssvc_release_getacl. */ + RETURN_STATUS(0); + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * Set the Access and/or Default ACL of a file. + */ +static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, + struct nfsd3_setaclargs *argp, + struct nfsd_attrstat *resp) +{ + svc_fh *fh; + __be32 nfserr = 0; + + dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_ACCESS, argp->acl_access) ); + } + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + } + + /* argp->acl_{access,default} may have been allocated in + nfssvc_decode_setaclargs. 
*/ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + return nfserr; +} + +/* + * Check file attributes + */ +static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp, + struct nfsd_fhandle *argp, struct nfsd_attrstat *resp) +{ + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); +} + +/* + * Check file access + */ +static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp, + struct nfsd3_accessres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: ACCESS(2acl) %s 0x%x\n", + SVCFH_fmt(&argp->fh), + argp->access); + + fh_copy(&resp->fh, &argp->fh); + resp->access = argp->access; + nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + return nfserr; +} + +/* + * XDR decode functions + */ +static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclargs *argp) +{ + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + argp->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_setaclargs *argp) +{ + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + argp->mask = ntohl(*p++); + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL); + return (n > 0); +} + +static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_fhandle *argp) +{ + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + return xdr_argsize_check(rqstp, p); +} + +static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessargs *argp) +{ + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + argp->access = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ + +/* + * There must be an encoding function for void results so svc_process + * will work properly. + */ +int +nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +/* GETACL */ +static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + struct dentry *dentry = resp->fh.fh_dentry; + struct inode *inode; + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + int w; + + /* + * Since this is version 2, the check for nfserr in + * nfsd_dispatch actually ensures the following cannot happen. + * However, it seems fragile to depend on that. + */ + if (dentry == NULL || dentry->d_inode == NULL) + return 0; + inode = dentry->d_inode; + + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? 
resp->acl_default : NULL); + while (w > 0) { + if (!rqstp->rq_respages[rqstp->rq_resused++]) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + if (n <= 0) + return 0; + return 1; +} + +static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) +{ + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + return xdr_ressize_check(rqstp, p); +} + +/* ACCESS */ +static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + *p++ = htonl(resp->access); + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + return 1; +} + +static int nfsaclsvc_release_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) +{ + fh_put(&resp->fh); + return 1; +} + +static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + fh_put(&resp->fh); + return 1; +} + +#define nfsaclsvc_decode_voidargs NULL +#define nfsaclsvc_release_void NULL +#define nfsd3_fhandleargs nfsd_fhandle +#define nfsd3_attrstatres nfsd_attrstat +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsacld_proc_##name, \ + (kxdrproc_t) nfsaclsvc_decode_##argt##args, \ + (kxdrproc_t) nfsaclsvc_encode_##rest##res, \ + (kxdrproc_t) nfsaclsvc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static struct svc_procedure nfsd_acl_procedures2[] = { + PROC(null, void, void, void, RC_NOCACHE, ST), + PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), + PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(access, access, access, access, RC_NOCACHE, ST+AT+1), +}; + +struct svc_version nfsd_acl_version2 = { + .vs_vers = 2, + .vs_nproc = 5, + .vs_proc = nfsd_acl_procedures2, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, + .vs_hidden = 0, +}; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c new file mode 100644 index 00000000..a596e9d9 --- /dev/null +++ b/fs/nfsd/nfs3acl.c @@ -0,0 +1,267 @@ +/* + * Process version 3 NFSACL requests. + * + * Copyright (C) 2002-2003 Andreas Gruenbacher + */ + +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ +#include +#include +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +/* + * NULL call. + */ +static __be32 +nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get the Access and/or Default ACL of a file. 
+ */ +static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, + struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) +{ + svc_fh *fh; + struct posix_acl *acl; + __be32 nfserr = 0; + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + RETURN_STATUS(nfserr_inval); + resp->mask = argp->mask; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + + struct inode *inode = fh->fh_dentry->d_inode; + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + + acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfs3svc_release_getacl. */ + RETURN_STATUS(0); + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * Set the Access and/or Default ACL of a file. + */ +static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, + struct nfsd3_setaclargs *argp, + struct nfsd3_attrstat *resp) +{ + svc_fh *fh; + __be32 nfserr = 0; + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_ACCESS, argp->acl_access) ); + } + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + } + + /* argp->acl_{access,default} may have been allocated in + nfs3svc_decode_setaclargs. */ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * XDR decode functions + */ +static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclargs *args) +{ + if (!(p = nfs3svc_decode_fh(p, &args->fh))) + return 0; + args->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_setaclargs *args) +{ + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + if (!(p = nfs3svc_decode_fh(p, &args->fh))) + return 0; + args->mask = ntohl(*p++); + if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (args->mask & NFS_ACL) ? + &args->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (args->mask & NFS_DFACL) ? 
+ &args->acl_default : NULL); + return (n > 0); +} + +/* + * XDR encode functions + */ + +/* GETACL */ +static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + struct dentry *dentry = resp->fh.fh_dentry; + + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0 && dentry && dentry->d_inode) { + struct inode *inode = dentry->d_inode; + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + int w; + + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); + while (w > 0) { + if (!rqstp->rq_respages[rqstp->rq_resused++]) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + if (n <= 0) + return 0; + } else + if (!xdr_ressize_check(rqstp, p)) + return 0; + + return 1; +} + +/* SETACL */ +static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static int nfs3svc_release_getacl(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_getaclres *resp) +{ + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + return 1; +} + +#define nfs3svc_decode_voidargs NULL +#define nfs3svc_release_void NULL +#define nfsd3_setaclres nfsd3_attrstat +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsd3_proc_##name, \ + (kxdrproc_t) nfs3svc_decode_##argt##args, \ + (kxdrproc_t) nfs3svc_encode_##rest##res, \ + (kxdrproc_t) nfs3svc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static struct svc_procedure nfsd_acl_procedures3[] = { + PROC(null, void, void, void, RC_NOCACHE, ST), + PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), + PROC(setacl, setacl, setacl, fhandle, RC_NOCACHE, ST+pAT), +}; + +struct svc_version nfsd_acl_version3 = { + .vs_vers = 3, + .vs_nproc = 3, + .vs_proc = nfsd_acl_procedures3, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, + .vs_hidden = 0, +}; + diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c new file mode 100644 index 00000000..9095f3c2 --- /dev/null +++ b/fs/nfsd/nfs3proc.c @@ -0,0 +1,896 @@ +/* + * Process version 3 NFS requests. + * + * Copyright (C) 1996, 1997, 1998 Olaf Kirch + */ + +#include +#include +#include + +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +static int nfs3_ftypes[] = { + 0, /* NF3NON */ + S_IFREG, /* NF3REG */ + S_IFDIR, /* NF3DIR */ + S_IFBLK, /* NF3BLK */ + S_IFCHR, /* NF3CHR */ + S_IFLNK, /* NF3LNK */ + S_IFSOCK, /* NF3SOCK */ + S_IFIFO, /* NF3FIFO */ +}; + +/* + * NULL call. 
+ */ +static __be32 +nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get a file's attributes + */ +static __be32 +nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, + struct nfsd3_attrstat *resp) +{ + int err; + __be32 nfserr; + + dprintk("nfsd: GETATTR(3) %s\n", + SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + if (nfserr) + RETURN_STATUS(nfserr); + + err = vfs_getattr(resp->fh.fh_export->ex_path.mnt, + resp->fh.fh_dentry, &resp->stat); + nfserr = nfserrno(err); + + RETURN_STATUS(nfserr); +} + +/* + * Set a file's attributes + */ +static __be32 +nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: SETATTR(3) %s\n", + SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs, + argp->check_guard, argp->guardtime); + RETURN_STATUS(nfserr); +} + +/* + * Look up a path name component + */ +static __be32 +nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LOOKUP(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + + nfserr = nfsd_lookup(rqstp, &resp->dirfh, + argp->name, + argp->len, + &resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Check file access + */ +static __be32 +nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp, + struct nfsd3_accessres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: ACCESS(3) %s 0x%x\n", + SVCFH_fmt(&argp->fh), + argp->access); + + fh_copy(&resp->fh, &argp->fh); + resp->access = argp->access; + nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + RETURN_STATUS(nfserr); +} + +/* + * Read a symlink. + */ +static __be32 +nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp, + struct nfsd3_readlinkres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); + + /* Read the symlink. */ + fh_copy(&resp->fh, &argp->fh); + resp->len = NFS3_MAXPATHLEN; + nfserr = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); + RETURN_STATUS(nfserr); +} + +/* + * Read a portion of a file. + */ +static __be32 +nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, + struct nfsd3_readres *resp) +{ + __be32 nfserr; + u32 max_blocksize = svc_max_payload(rqstp); + + dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", + SVCFH_fmt(&argp->fh), + (unsigned long) argp->count, + (unsigned long long) argp->offset); + + /* Obtain buffer pointer for payload. 
+ * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) + * + 1 (xdr opaque byte count) = 26 + */ + + resp->count = argp->count; + if (max_blocksize < resp->count) + resp->count = max_blocksize; + + svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_read(rqstp, &resp->fh, + argp->offset, + rqstp->rq_vec, argp->vlen, + &resp->count); + if (nfserr == 0) { + struct inode *inode = resp->fh.fh_dentry->d_inode; + + resp->eof = (argp->offset + resp->count) >= inode->i_size; + } + + RETURN_STATUS(nfserr); +} + +/* + * Write data to a file + */ +static __be32 +nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, + struct nfsd3_writeres *resp) +{ + __be32 nfserr; + unsigned long cnt = argp->len; + + dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", + SVCFH_fmt(&argp->fh), + argp->len, + (unsigned long long) argp->offset, + argp->stable? " stable" : ""); + + fh_copy(&resp->fh, &argp->fh); + resp->committed = argp->stable; + nfserr = nfsd_write(rqstp, &resp->fh, NULL, + argp->offset, + rqstp->rq_vec, argp->vlen, + &cnt, + &resp->committed); + resp->count = cnt; + RETURN_STATUS(nfserr); +} + +/* + * With NFSv3, CREATE processing is a lot easier than with NFSv2. + * At least in theory; we'll see how it fares in practice when the + * first reports about SunOS compatibility problems start to pour in... + */ +static __be32 +nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, + struct nfsd3_diropres *resp) +{ + svc_fh *dirfhp, *newfhp = NULL; + struct iattr *attr; + __be32 nfserr; + + dprintk("nfsd: CREATE(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + dirfhp = fh_copy(&resp->dirfh, &argp->fh); + newfhp = fh_init(&resp->fh, NFS3_FHSIZE); + attr = &argp->attrs; + + /* Get the directory inode */ + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE); + if (nfserr) + RETURN_STATUS(nfserr); + + /* Unfudge the mode bits */ + attr->ia_mode &= ~S_IFMT; + if (!(attr->ia_valid & ATTR_MODE)) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode = S_IFREG; + } else { + attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG; + } + + /* Now create the file and set attributes */ + nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, + attr, newfhp, + argp->createmode, argp->verf, NULL, NULL); + + RETURN_STATUS(nfserr); +} + +/* + * Make directory. This operation is not idempotent. + */ +static __be32 +nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: MKDIR(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + argp->attrs.ia_valid &= ~ATTR_SIZE; + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); + fh_unlock(&resp->dirfh); + RETURN_STATUS(nfserr); +} + +static __be32 +nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", + SVCFH_fmt(&argp->ffh), + argp->flen, argp->fname, + argp->tlen, argp->tname); + + fh_copy(&resp->dirfh, &argp->ffh); + fh_init(&resp->fh, NFS3_FHSIZE); + nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, + argp->tname, argp->tlen, + &resp->fh, &argp->attrs); + RETURN_STATUS(nfserr); +} + +/* + * Make socket/fifo/device. 
+ */ +static __be32 +nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp, + struct nfsd3_diropres *resp) +{ + __be32 nfserr; + int type; + dev_t rdev = 0; + + dprintk("nfsd: MKNOD(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + + if (argp->ftype == 0 || argp->ftype >= NF3BAD) + RETURN_STATUS(nfserr_inval); + if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) { + rdev = MKDEV(argp->major, argp->minor); + if (MAJOR(rdev) != argp->major || + MINOR(rdev) != argp->minor) + RETURN_STATUS(nfserr_inval); + } else + if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) + RETURN_STATUS(nfserr_inval); + + type = nfs3_ftypes[argp->ftype]; + nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, type, rdev, &resp->fh); + fh_unlock(&resp->dirfh); + RETURN_STATUS(nfserr); +} + +/* + * Remove file/fifo/socket etc. + */ +static __be32 +nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: REMOVE(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + /* Unlink. -S_IFDIR means file must not be a directory */ + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Remove a directory + */ +static __be32 +nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, + struct nfsd3_attrstat *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RMDIR(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); + RETURN_STATUS(nfserr); +} + +static __be32 +nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp, + struct nfsd3_renameres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RENAME(3) %s %.*s ->\n", + SVCFH_fmt(&argp->ffh), + argp->flen, + argp->fname); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + fh_copy(&resp->ffh, &argp->ffh); + fh_copy(&resp->tfh, &argp->tfh); + nfserr = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, + &resp->tfh, argp->tname, argp->tlen); + RETURN_STATUS(nfserr); +} + +static __be32 +nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp, + struct nfsd3_linkres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LINK(3) %s ->\n", + SVCFH_fmt(&argp->ffh)); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + fh_copy(&resp->fh, &argp->ffh); + fh_copy(&resp->tfh, &argp->tfh); + nfserr = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, + &resp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Read a portion of a directory. 
+ */
+static __be32
+nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
+		   struct nfsd3_readdirres *resp)
+{
+	__be32 nfserr;
+	int count;
+
+	dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
+		SVCFH_fmt(&argp->fh),
+		argp->count, (u32) argp->cookie);
+
+	/* Make sure we've room for the NULL ptr & eof flag, and shrink to
+	 * client read size */
+	count = (argp->count >> 2) - 2;
+
+	/* Read directory and encode entries on the fly */
+	fh_copy(&resp->fh, &argp->fh);
+
+	resp->buflen = count;
+	resp->common.err = nfs_ok;
+	resp->buffer = argp->buffer;
+	resp->rqstp = rqstp;
+	nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie,
+			      &resp->common, nfs3svc_encode_entry);
+	memcpy(resp->verf, argp->verf, 8);
+	resp->count = resp->buffer - argp->buffer;
+	if (resp->offset)
+		xdr_encode_hyper(resp->offset, argp->cookie);
+
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Read a portion of a directory, including file handles and attrs.
+ * For now, we choose to ignore the dircount parameter.
+ */
+static __be32
+nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
+		       struct nfsd3_readdirres *resp)
+{
+	__be32 nfserr;
+	int count = 0;
+	loff_t offset;
+	int i;
+	caddr_t page_addr = NULL;
+
+	dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
+		SVCFH_fmt(&argp->fh),
+		argp->count, (u32) argp->cookie);
+
+	/* Convert byte count to number of words (i.e. >> 2),
+	 * and reserve room for the NULL ptr & eof flag (-2 words) */
+	resp->count = (argp->count >> 2) - 2;
+
+	/* Read directory and encode entries on the fly */
+	fh_copy(&resp->fh, &argp->fh);
+
+	resp->common.err = nfs_ok;
+	resp->buffer = argp->buffer;
+	resp->buflen = resp->count;
+	resp->rqstp = rqstp;
+	offset = argp->cookie;
+	nfserr = nfsd_readdir(rqstp, &resp->fh,
+			      &offset,
+			      &resp->common,
+			      nfs3svc_encode_entry_plus);
+	memcpy(resp->verf, argp->verf, 8);
+	for (i = 1; i < rqstp->rq_resused; i++) {
+		page_addr = page_address(rqstp->rq_respages[i]);
+
+		if (((caddr_t)resp->buffer >= page_addr) &&
+		    ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+			count += (caddr_t)resp->buffer - page_addr;
+			break;
+		}
+		count += PAGE_SIZE;
+	}
+	resp->count = count >> 2;
+	if (resp->offset) {
+		if (unlikely(resp->offset1)) {
+			/* we ended up with offset on a page boundary */
+			*resp->offset = htonl(offset >> 32);
+			*resp->offset1 = htonl(offset & 0xffffffff);
+			resp->offset1 = NULL;
+		} else {
+			xdr_encode_hyper(resp->offset, offset);
+		}
+	}
+
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Get file system stats
+ */
+static __be32
+nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
+		  struct nfsd3_fsstatres *resp)
+{
+	__be32 nfserr;
+
+	dprintk("nfsd: FSSTAT(3) %s\n",
+		SVCFH_fmt(&argp->fh));
+
+	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
+	fh_put(&argp->fh);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Get file system info
+ */
+static __be32
+nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
+		  struct nfsd3_fsinfores *resp)
+{
+	__be32 nfserr;
+	u32 max_blocksize = svc_max_payload(rqstp);
+
+	dprintk("nfsd: FSINFO(3) %s\n",
+		SVCFH_fmt(&argp->fh));
+
+	resp->f_rtmax = max_blocksize;
+	resp->f_rtpref = max_blocksize;
+	resp->f_rtmult = PAGE_SIZE;
+	resp->f_wtmax = max_blocksize;
+	resp->f_wtpref = max_blocksize;
+	resp->f_wtmult = PAGE_SIZE;
+	resp->f_dtpref = PAGE_SIZE;
+	resp->f_maxfilesize = ~(u32) 0;
+	resp->f_properties = NFS3_FSF_DEFAULT;
+
+	nfserr = fh_verify(rqstp, &argp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+
+	/* Check special features of the file
system. May request + * different read/write sizes for file systems known to have + * problems with large blocks */ + if (nfserr == 0) { + struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + + /* Note that we don't care for remote fs's here */ + if (sb->s_magic == MSDOS_SUPER_MAGIC) { + resp->f_properties = NFS3_FSF_BILLYBOY; + } + resp->f_maxfilesize = sb->s_maxbytes; + } + + fh_put(&argp->fh); + RETURN_STATUS(nfserr); +} + +/* + * Get pathconf info for the specified file + */ +static __be32 +nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, + struct nfsd3_pathconfres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: PATHCONF(3) %s\n", + SVCFH_fmt(&argp->fh)); + + /* Set default pathconf */ + resp->p_link_max = 255; /* at least */ + resp->p_name_max = 255; /* at least */ + resp->p_no_trunc = 0; + resp->p_chown_restricted = 1; + resp->p_case_insensitive = 0; + resp->p_case_preserving = 1; + + nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + + if (nfserr == 0) { + struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + + /* Note that we don't care for remote fs's here */ + switch (sb->s_magic) { + case EXT2_SUPER_MAGIC: + resp->p_link_max = EXT2_LINK_MAX; + resp->p_name_max = EXT2_NAME_LEN; + break; + case MSDOS_SUPER_MAGIC: + resp->p_case_insensitive = 1; + resp->p_case_preserving = 0; + break; + } + } + + fh_put(&argp->fh); + RETURN_STATUS(nfserr); +} + + +/* + * Commit a file (range) to stable storage. + */ +static __be32 +nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp, + struct nfsd3_commitres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: COMMIT(3) %s %u@%Lu\n", + SVCFH_fmt(&argp->fh), + argp->count, + (unsigned long long) argp->offset); + + if (argp->offset > NFS_OFFSET_MAX) + RETURN_STATUS(nfserr_inval); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count); + + RETURN_STATUS(nfserr); +} + + +/* + * NFSv3 Server procedures. + * Only the results of non-idempotent operations are cached. 
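+ *
+ * Non-idempotent procedures (SETATTR, WRITE, CREATE and friends) are
+ * marked RC_REPLBUFF below so a retransmitted request can be answered
+ * from the duplicate reply cache; idempotent ones use RC_NOCACHE.
+ * The pc_xdrressize values are reply-size estimates in XDR words built
+ * from the ST/FH/AT/pAT/WC constants; for LOOKUP, for instance, the
+ * estimate works out to ST+FH+pAT+pAT = 1+17+22+22 = 62 words
+ * (illustrative arithmetic only).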
+ */ +#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle +#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat +#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat +#define nfsd3_mkdirargs nfsd3_createargs +#define nfsd3_readdirplusargs nfsd3_readdirargs +#define nfsd3_fhandleargs nfsd_fhandle +#define nfsd3_fhandleres nfsd3_attrstat +#define nfsd3_attrstatres nfsd3_attrstat +#define nfsd3_wccstatres nfsd3_attrstat +#define nfsd3_createres nfsd3_diropres +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsd3_proc_##name, \ + (kxdrproc_t) nfs3svc_decode_##argt##args, \ + (kxdrproc_t) nfs3svc_encode_##rest##res, \ + (kxdrproc_t) nfs3svc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define FH 17 /* filehandle with length */ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define WC (7+pAT) /* WCC attributes */ + +static struct svc_procedure nfsd_procedures3[22] = { + [NFS3PROC_NULL] = { + .pc_func = (svc_procfunc) nfsd3_proc_null, + .pc_encode = (kxdrproc_t) nfs3svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFS3PROC_GETATTR] = { + .pc_func = (svc_procfunc) nfsd3_proc_getattr, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_attrstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_attrstatres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [NFS3PROC_SETATTR] = { + .pc_func = (svc_procfunc) nfsd3_proc_setattr, + .pc_decode = (kxdrproc_t) nfs3svc_decode_sattrargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_sattrargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_LOOKUP] = { + .pc_func = (svc_procfunc) nfsd3_proc_lookup, + .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_diropres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_diropres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+FH+pAT+pAT, + }, + [NFS3PROC_ACCESS] = { + .pc_func = (svc_procfunc) nfsd3_proc_access, + .pc_decode = (kxdrproc_t) nfs3svc_decode_accessargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_accessres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_ressize = sizeof(struct nfsd3_accessres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+1, + }, + [NFS3PROC_READLINK] = { + .pc_func = (svc_procfunc) nfsd3_proc_readlink, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readlinkargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readlinkres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readlinkargs), + .pc_ressize = sizeof(struct nfsd3_readlinkres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, + }, + [NFS3PROC_READ] = { + .pc_func = (svc_procfunc) nfsd3_proc_read, + .pc_decode = (kxdrproc_t) 
nfs3svc_decode_readargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readargs), + .pc_ressize = sizeof(struct nfsd3_readres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, + }, + [NFS3PROC_WRITE] = { + .pc_func = (svc_procfunc) nfsd3_proc_write, + .pc_decode = (kxdrproc_t) nfs3svc_decode_writeargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_writeres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_writeargs), + .pc_ressize = sizeof(struct nfsd3_writeres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC+4, + }, + [NFS3PROC_CREATE] = { + .pc_func = (svc_procfunc) nfsd3_proc_create, + .pc_decode = (kxdrproc_t) nfs3svc_decode_createargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_createargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_MKDIR] = { + .pc_func = (svc_procfunc) nfsd3_proc_mkdir, + .pc_decode = (kxdrproc_t) nfs3svc_decode_mkdirargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_mkdirargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_SYMLINK] = { + .pc_func = (svc_procfunc) nfsd3_proc_symlink, + .pc_decode = (kxdrproc_t) nfs3svc_decode_symlinkargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_symlinkargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_MKNOD] = { + .pc_func = (svc_procfunc) nfsd3_proc_mknod, + .pc_decode = (kxdrproc_t) nfs3svc_decode_mknodargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_createres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_mknodargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_REMOVE] = { + .pc_func = (svc_procfunc) nfsd3_proc_remove, + .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_RMDIR] = { + .pc_func = (svc_procfunc) nfsd3_proc_rmdir, + .pc_decode = (kxdrproc_t) nfs3svc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_wccstatres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_RENAME] = { + .pc_func = (svc_procfunc) nfsd3_proc_rename, + .pc_decode = (kxdrproc_t) nfs3svc_decode_renameargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_renameres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_renameargs), + .pc_ressize = sizeof(struct nfsd3_renameres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC+WC, + }, + [NFS3PROC_LINK] = { + .pc_func = (svc_procfunc) 
nfsd3_proc_link, + .pc_decode = (kxdrproc_t) nfs3svc_decode_linkargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_linkres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_linkargs), + .pc_ressize = sizeof(struct nfsd3_linkres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+pAT+WC, + }, + [NFS3PROC_READDIR] = { + .pc_func = (svc_procfunc) nfsd3_proc_readdir, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readdirargs), + .pc_ressize = sizeof(struct nfsd3_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFS3PROC_READDIRPLUS] = { + .pc_func = (svc_procfunc) nfsd3_proc_readdirplus, + .pc_decode = (kxdrproc_t) nfs3svc_decode_readdirplusargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_readdirres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readdirplusargs), + .pc_ressize = sizeof(struct nfsd3_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFS3PROC_FSSTAT] = { + .pc_func = (svc_procfunc) nfsd3_proc_fsstat, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_fsstatres, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_fsstatres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+2*6+1, + }, + [NFS3PROC_FSINFO] = { + .pc_func = (svc_procfunc) nfsd3_proc_fsinfo, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_fsinfores, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_fsinfores), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+12, + }, + [NFS3PROC_PATHCONF] = { + .pc_func = (svc_procfunc) nfsd3_proc_pathconf, + .pc_decode = (kxdrproc_t) nfs3svc_decode_fhandleargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_pathconfres, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_pathconfres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+6, + }, + [NFS3PROC_COMMIT] = { + .pc_func = (svc_procfunc) nfsd3_proc_commit, + .pc_decode = (kxdrproc_t) nfs3svc_decode_commitargs, + .pc_encode = (kxdrproc_t) nfs3svc_encode_commitres, + .pc_release = (kxdrproc_t) nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_commitargs), + .pc_ressize = sizeof(struct nfsd3_commitres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+WC+2, + }, +}; + +struct svc_version nfsd_version3 = { + .vs_vers = 3, + .vs_nproc = 22, + .vs_proc = nfsd_procedures3, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, +}; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c new file mode 100644 index 00000000..43f46cd9 --- /dev/null +++ b/fs/nfsd/nfs3xdr.c @@ -0,0 +1,1112 @@ +/* + * XDR support for nfsd/protocol version 3. + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + * + * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()! 
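+ *
+ * (Nanosecond values run up to 999,999,999 and so need a full 32-bit
+ * byte-swap; the 16 bits handled by htons() cannot hold them.)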
+ */ + +#include +#include "xdr3.h" +#include "auth.h" + +#define NFSDDBG_FACILITY NFSDDBG_XDR + + +/* + * Mapping of S_IF* types to NFS file types + */ +static u32 nfs3_ftypes[] = { + NF3NON, NF3FIFO, NF3CHR, NF3BAD, + NF3DIR, NF3BAD, NF3BLK, NF3BAD, + NF3REG, NF3BAD, NF3LNK, NF3BAD, + NF3SOCK, NF3BAD, NF3LNK, NF3BAD, +}; + +/* + * XDR functions for basic NFS types + */ +static __be32 * +encode_time3(__be32 *p, struct timespec *time) +{ + *p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec); + return p; +} + +static __be32 * +decode_time3(__be32 *p, struct timespec *time) +{ + time->tv_sec = ntohl(*p++); + time->tv_nsec = ntohl(*p++); + return p; +} + +static __be32 * +decode_fh(__be32 *p, struct svc_fh *fhp) +{ + unsigned int size; + fh_init(fhp, NFS3_FHSIZE); + size = ntohl(*p++); + if (size > NFS3_FHSIZE) + return NULL; + + memcpy(&fhp->fh_handle.fh_base, p, size); + fhp->fh_handle.fh_size = size; + return p + XDR_QUADLEN(size); +} + +/* Helper function for NFSv3 ACL code */ +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + +static __be32 * +encode_fh(__be32 *p, struct svc_fh *fhp) +{ + unsigned int size = fhp->fh_handle.fh_size; + *p++ = htonl(size); + if (size) p[XDR_QUADLEN(size)-1]=0; + memcpy(p, &fhp->fh_handle.fh_base, size); + return p + XDR_QUADLEN(size); +} + +/* + * Decode a file name and make sure that the path contains + * no slashes or null bytes. + */ +static __be32 * +decode_filename(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0' || *name == '/') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_sattr3(__be32 *p, struct iattr *iap) +{ + u32 tmp; + + iap->ia_valid = 0; + + if (*p++) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = ntohl(*p++); + } + if (*p++) { + iap->ia_valid |= ATTR_UID; + iap->ia_uid = ntohl(*p++); + } + if (*p++) { + iap->ia_valid |= ATTR_GID; + iap->ia_gid = ntohl(*p++); + } + if (*p++) { + u64 newsize; + + iap->ia_valid |= ATTR_SIZE; + p = xdr_decode_hyper(p, &newsize); + if (newsize <= NFS_OFFSET_MAX) + iap->ia_size = newsize; + else + iap->ia_size = NFS_OFFSET_MAX; + } + if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + iap->ia_valid |= ATTR_ATIME; + } else if (tmp == 2) { /* set to client time */ + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + iap->ia_atime.tv_sec = ntohl(*p++); + iap->ia_atime.tv_nsec = ntohl(*p++); + } + if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + iap->ia_valid |= ATTR_MTIME; + } else if (tmp == 2) { /* set to client time */ + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + iap->ia_mtime.tv_sec = ntohl(*p++); + iap->ia_mtime.tv_nsec = ntohl(*p++); + } + return p; +} + +static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) +{ + u64 f; + switch(fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: + p = xdr_encode_hyper(p, (u64)huge_encode_dev + (fhp->fh_dentry->d_inode->i_sb->s_dev)); + break; + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u64*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u64*)fhp->fh_export->ex_uuid)[1]; + p = xdr_encode_hyper(p, f); + break; + } + return p; +} + +static __be32 * +encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, + struct kstat *stat) +{ + *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = htonl((u32) 
stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); + *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { + p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); + } else { + p = xdr_encode_hyper(p, (u64) stat->size); + } + p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); + *p++ = htonl((u32) MAJOR(stat->rdev)); + *p++ = htonl((u32) MINOR(stat->rdev)); + p = encode_fsid(p, fhp); + p = xdr_encode_hyper(p, stat->ino); + p = encode_time3(p, &stat->atime); + p = encode_time3(p, &stat->mtime); + p = encode_time3(p, &stat->ctime); + + return p; +} + +static __be32 * +encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + /* Attributes to follow */ + *p++ = xdr_one; + return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr); +} + +/* + * Encode post-operation attributes. + * The inode may be NULL if the call failed because of a stale file + * handle. In this case, no attributes are returned. + */ +static __be32 * +encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct dentry *dentry = fhp->fh_dentry; + if (dentry && dentry->d_inode) { + int err; + struct kstat stat; + + err = vfs_getattr(fhp->fh_export->ex_path.mnt, dentry, &stat); + if (!err) { + *p++ = xdr_one; /* attributes follow */ + lease_get_mtime(dentry->d_inode, &stat.mtime); + return encode_fattr3(rqstp, p, fhp, &stat); + } + } + *p++ = xdr_zero; + return p; +} + +/* Helper for NFSv3 ACLs */ +__be32 * +nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + return encode_post_op_attr(rqstp, p, fhp); +} + +/* + * Enocde weak cache consistency data + */ +static __be32 * +encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct dentry *dentry = fhp->fh_dentry; + + if (dentry && dentry->d_inode && fhp->fh_post_saved) { + if (fhp->fh_pre_saved) { + *p++ = xdr_one; + p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); + p = encode_time3(p, &fhp->fh_pre_mtime); + p = encode_time3(p, &fhp->fh_pre_ctime); + } else { + *p++ = xdr_zero; + } + return encode_saved_post_attr(rqstp, p, fhp); + } + /* no pre- or post-attrs */ + *p++ = xdr_zero; + return encode_post_op_attr(rqstp, p, fhp); +} + +/* + * Fill in the post_op attr for the wcc data + */ +void fill_post_wcc(struct svc_fh *fhp) +{ + int err; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, + &fhp->fh_post_attr); + fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; + if (err) { + fhp->fh_post_saved = 0; + /* Grab the ctime anyway - set_change_info might use it */ + fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime; + } else + fhp->fh_post_saved = 1; +} + +/* + * XDR decode functions + */ +int +nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_sattrargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = decode_sattr3(p, &args->attrs); + + if ((args->check_guard = ntohl(*p++)) != 0) { + struct timespec time; + p = decode_time3(p, &time); + args->guardtime = time.tv_sec; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_diropargs *args) +{ + if (!(p = decode_fh(p, 
&args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + args->access = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readargs *args) +{ + unsigned int len; + int v,pn; + u32 max_blocksize = svc_max_payload(rqstp); + + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->offset); + + len = args->count = ntohl(*p++); + + if (len > max_blocksize) + len = max_blocksize; + + /* set up the kvec */ + v=0; + while (len > 0) { + pn = rqstp->rq_resused++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; + len -= rqstp->rq_vec[v].iov_len; + v++; + } + args->vlen = v; + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_writeargs *args) +{ + unsigned int len, v, hdr, dlen; + u32 max_blocksize = svc_max_payload(rqstp); + + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->offset); + + args->count = ntohl(*p++); + args->stable = ntohl(*p++); + len = args->len = ntohl(*p++); + /* + * The count must equal the amount of data passed. + */ + if (args->count != args->len) + return 0; + + /* + * Check to make sure that we got the right number of + * bytes. + */ + hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; + dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + - hdr; + /* + * Round the length of the data which was specified up to + * the next multiple of XDR units and then compare that + * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. 
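+ *
+ * As a small worked example: a 10-byte payload occupies
+ * XDR_QUADLEN(10) = 3 quad-words, i.e. 12 bytes on the wire, so a
+ * dlen of 12 or more (extra is possible with GSS padding) passes the
+ * check below, while anything under 12 is rejected.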
+ */ + if (dlen < XDR_QUADLEN(len)*4) + return 0; + + if (args->count > max_blocksize) { + args->count = max_blocksize; + len = args->len = max_blocksize; + } + rqstp->rq_vec[0].iov_base = (void*)p; + rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + v = 0; + while (len > rqstp->rq_vec[v].iov_len) { + len -= rqstp->rq_vec[v].iov_len; + v++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); + rqstp->rq_vec[v].iov_len = PAGE_SIZE; + } + rqstp->rq_vec[v].iov_len = len; + args->vlen = v + 1; + return 1; +} + +int +nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_createargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + switch (args->createmode = ntohl(*p++)) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + p = decode_sattr3(p, &args->attrs); + break; + case NFS3_CREATE_EXCLUSIVE: + args->verf = p; + p += 2; + break; + default: + return 0; + } + + return xdr_argsize_check(rqstp, p); +} +int +nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_createargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) || + !(p = decode_filename(p, &args->name, &args->len))) + return 0; + p = decode_sattr3(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_symlinkargs *args) +{ + unsigned int len, avail; + char *old, *new; + struct kvec *vec; + + if (!(p = decode_fh(p, &args->ffh)) || + !(p = decode_filename(p, &args->fname, &args->flen)) + ) + return 0; + p = decode_sattr3(p, &args->attrs); + + /* now decode the pathname, which might be larger than the first page. + * As we have to check for nul's anyway, we copy it into a new page + * This page appears in the rq_res.pages list, but as pages_len is always + * 0, it won't get in the way + */ + len = ntohl(*p++); + if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) + return 0; + args->tname = new = + page_address(rqstp->rq_respages[rqstp->rq_resused++]); + args->tlen = len; + /* first copy and check from the first page */ + old = (char*)p; + vec = &rqstp->rq_arg.head[0]; + avail = vec->iov_len - (old - (char*)vec->iov_base); + while (len && avail && *old) { + *new++ = *old++; + len--; + avail--; + } + /* now copy next page if there is one */ + if (len && !avail && rqstp->rq_arg.page_len) { + avail = rqstp->rq_arg.page_len; + if (avail > PAGE_SIZE) + avail = PAGE_SIZE; + old = page_address(rqstp->rq_arg.pages[0]); + } + while (len && avail && *old) { + *new++ = *old++; + len--; + avail--; + } + *new = '\0'; + if (len) + return 0; + + return 1; +} + +int +nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_mknodargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + args->ftype = ntohl(*p++); + + if (args->ftype == NF3BLK || args->ftype == NF3CHR + || args->ftype == NF3SOCK || args->ftype == NF3FIFO) + p = decode_sattr3(p, &args->attrs); + + if (args->ftype == NF3BLK || args->ftype == NF3CHR) { + args->major = ntohl(*p++); + args->minor = ntohl(*p++); + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_renameargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return 
xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readlinkargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + args->buffer = + page_address(rqstp->rq_respages[rqstp->rq_resused++]); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_linkargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readdirargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->cookie); + args->verf = p; p += 2; + args->dircount = ~0; + args->count = ntohl(*p++); + + if (args->count > PAGE_SIZE) + args->count = PAGE_SIZE; + + args->buffer = + page_address(rqstp->rq_respages[rqstp->rq_resused++]); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readdirargs *args) +{ + int len, pn; + u32 max_blocksize = svc_max_payload(rqstp); + + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->cookie); + args->verf = p; p += 2; + args->dircount = ntohl(*p++); + args->count = ntohl(*p++); + + len = (args->count > max_blocksize) ? max_blocksize : + args->count; + args->count = len; + + while (len > 0) { + pn = rqstp->rq_resused++; + if (!args->buffer) + args->buffer = page_address(rqstp->rq_respages[pn]); + len -= PAGE_SIZE; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_commitargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + p = xdr_decode_hyper(p, &args->offset); + args->count = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ +/* + * There must be an encoding function for void results so svc_process + * will work properly. 
+ */ +int +nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +/* GETATTR */ +int +nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + if (resp->status == 0) { + lease_get_mtime(resp->fh.fh_dentry->d_inode, + &resp->stat.mtime); + p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); + } + return xdr_ressize_check(rqstp, p); +} + +/* SETATTR, REMOVE, RMDIR */ +int +nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->fh); + return xdr_ressize_check(rqstp, p); +} + +/* LOOKUP */ +int +nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_diropres *resp) +{ + if (resp->status == 0) { + p = encode_fh(p, &resp->fh); + p = encode_post_op_attr(rqstp, p, &resp->fh); + } + p = encode_post_op_attr(rqstp, p, &resp->dirfh); + return xdr_ressize_check(rqstp, p); +} + +/* ACCESS */ +int +nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) + *p++ = htonl(resp->access); + return xdr_ressize_check(rqstp, p); +} + +/* READLINK */ +int +nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readlinkres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->len); + xdr_ressize_check(rqstp, p); + rqstp->rq_res.page_len = resp->len; + if (resp->len & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); + } + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +/* READ */ +int +nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->count); + *p++ = htonl(resp->eof); + *p++ = htonl(resp->count); /* xdr opaque count */ + xdr_ressize_check(rqstp, p); + /* now update rqstp->rq_res to reflect data as well */ + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); + } + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +/* WRITE */ +int +nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_writeres *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->count); + *p++ = htonl(resp->committed); + *p++ = htonl(nfssvc_boot.tv_sec); + *p++ = htonl(nfssvc_boot.tv_usec); + } + return xdr_ressize_check(rqstp, p); +} + +/* CREATE, MKDIR, SYMLINK, MKNOD */ +int +nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_diropres *resp) +{ + if (resp->status == 0) { + *p++ = xdr_one; + p = encode_fh(p, &resp->fh); + p = encode_post_op_attr(rqstp, p, &resp->fh); + } + p = encode_wcc_data(rqstp, p, &resp->dirfh); + return xdr_ressize_check(rqstp, p); +} + +/* RENAME */ +int +nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_renameres *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->ffh); + p = encode_wcc_data(rqstp, p, &resp->tfh); + return xdr_ressize_check(rqstp, p); +} + +/* LINK */ +int +nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_linkres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + p = encode_wcc_data(rqstp, p, 
&resp->tfh); + return xdr_ressize_check(rqstp, p); +} + +/* READDIR */ +int +nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_readdirres *resp) +{ + p = encode_post_op_attr(rqstp, p, &resp->fh); + + if (resp->status == 0) { + /* stupid readdir cookie */ + memcpy(p, resp->verf, 8); p += 2; + xdr_ressize_check(rqstp, p); + if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE) + return 1; /*No room for trailer */ + rqstp->rq_res.page_len = (resp->count) << 2; + + /* add the 'tail' to the end of the 'head' page - page 0. */ + rqstp->rq_res.tail[0].iov_base = p; + *p++ = 0; /* no more entries */ + *p++ = htonl(resp->common.err == nfserr_eof); + rqstp->rq_res.tail[0].iov_len = 2<<2; + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +static __be32 * +encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, + int namlen, u64 ino) +{ + *p++ = xdr_one; /* mark entry present */ + p = xdr_encode_hyper(p, ino); /* file id */ + p = xdr_encode_array(p, name, namlen);/* name length & name */ + + cd->offset = p; /* remember pointer */ + p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */ + + return p; +} + +static __be32 +compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, + const char *name, int namlen) +{ + struct svc_export *exp; + struct dentry *dparent, *dchild; + __be32 rv = nfserr_noent; + + dparent = cd->fh.fh_dentry; + exp = cd->fh.fh_export; + + if (isdotent(name, namlen)) { + if (namlen == 2) { + dchild = dget_parent(dparent); + /* filesystem root - cannot return filehandle for ".." */ + if (dchild == dparent) + goto out; + } else + dchild = dget(dparent); + } else + dchild = lookup_one_len(name, dparent, namlen); + if (IS_ERR(dchild)) + return rv; + if (d_mountpoint(dchild)) + goto out; + if (!dchild->d_inode) + goto out; + rv = fh_compose(fhp, exp, dchild, &cd->fh); +out: + dput(dchild); + return rv; +} + +static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) +{ + struct svc_fh fh; + __be32 err; + + fh_init(&fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, &fh, name, namlen); + if (err) { + *p++ = 0; + *p++ = 0; + goto out; + } + p = encode_post_op_attr(cd->rqstp, p, &fh); + *p++ = xdr_one; /* yes, a file handle follows */ + p = encode_fh(p, &fh); +out: + fh_put(&fh); + return p; +} + +/* + * Encode a directory entry. This one works for both normal readdir + * and readdirplus. + * The normal readdir reply requires 2 (fileid) + 1 (stringlen) + * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen. + * + * The readdirplus baggage is 1+21 words for post_op_attr, plus the + * file handle. 
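+ *
+ * As a rough worked example (assuming the usual 64-byte NFS3_FHSIZE):
+ * NFS3_ENTRY_BAGGAGE below is 2+1+2+1 = 6 words, while
+ * NFS3_ENTRYPLUS_BAGGAGE is 1+21+1+(64>>2) = 39 words, so a
+ * readdirplus entry can cost roughly 156 bytes before the name itself.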
+ */ + +#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1) +#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) +static int +encode_entry(struct readdir_cd *ccd, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type, int plus) +{ + struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, + common); + __be32 *p = cd->buffer; + caddr_t curr_page_addr = NULL; + int pn; /* current page number */ + int slen; /* string (name) length */ + int elen; /* estimated entry length in words */ + int num_entry_words = 0; /* actual number of words */ + + if (cd->offset) { + u64 offset64 = offset; + + if (unlikely(cd->offset1)) { + /* we ended up with offset on a page boundary */ + *cd->offset = htonl(offset64 >> 32); + *cd->offset1 = htonl(offset64 & 0xffffffff); + cd->offset1 = NULL; + } else { + xdr_encode_hyper(cd->offset, offset64); + } + } + + /* + dprintk("encode_entry(%.*s @%ld%s)\n", + namlen, name, (long) offset, plus? " plus" : ""); + */ + + /* truncate filename if too long */ + if (namlen > NFS3_MAXNAMLEN) + namlen = NFS3_MAXNAMLEN; + + slen = XDR_QUADLEN(namlen); + elen = slen + NFS3_ENTRY_BAGGAGE + + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0); + + if (cd->buflen < elen) { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + + /* determine which page in rq_respages[] we are currently filling */ + for (pn=1; pn < cd->rqstp->rq_resused; pn++) { + curr_page_addr = page_address(cd->rqstp->rq_respages[pn]); + + if (((caddr_t)cd->buffer >= curr_page_addr) && + ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) + break; + } + + if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) { + /* encode entry in current page */ + + p = encode_entry_baggage(cd, p, name, namlen, ino); + + if (plus) + p = encode_entryplus_baggage(cd, p, name, namlen); + num_entry_words = p - cd->buffer; + } else if (cd->rqstp->rq_respages[pn+1] != NULL) { + /* temporarily encode entry into next page, then move back to + * current and next page in rq_respages[] */ + __be32 *p1, *tmp; + int len1, len2; + + /* grab next page for temporary storage of entry */ + p1 = tmp = page_address(cd->rqstp->rq_respages[pn+1]); + + p1 = encode_entry_baggage(cd, p1, name, namlen, ino); + + if (plus) + p1 = encode_entryplus_baggage(cd, p1, name, namlen); + + /* determine entry word length and lengths to go in pages */ + num_entry_words = p1 - tmp; + len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer; + if ((num_entry_words << 2) < len1) { + /* the actual number of words in the entry is less + * than elen and can still fit in the current page + */ + memmove(p, tmp, num_entry_words << 2); + p += num_entry_words; + + /* update offset */ + cd->offset = cd->buffer + (cd->offset - tmp); + } else { + unsigned int offset_r = (cd->offset - tmp) << 2; + + /* update pointer to offset location. 
+ * This is a 64bit quantity, so we need to + * deal with 3 cases: + * - entirely in first page + * - entirely in second page + * - 4 bytes in each page + */ + if (offset_r + 8 <= len1) { + cd->offset = p + (cd->offset - tmp); + } else if (offset_r >= len1) { + cd->offset -= len1 >> 2; + } else { + /* sitting on the fence */ + BUG_ON(offset_r != len1 - 4); + cd->offset = p + (cd->offset - tmp); + cd->offset1 = tmp; + } + + len2 = (num_entry_words << 2) - len1; + + /* move from temp page to current and next pages */ + memmove(p, tmp, len1); + memmove(tmp, (caddr_t)tmp+len1, len2); + + p = tmp + (len2 >> 2); + } + } + else { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + + cd->buflen -= num_entry_words; + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; + +} + +int +nfs3svc_encode_entry(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) +{ + return encode_entry(cd, name, namlen, offset, ino, d_type, 0); +} + +int +nfs3svc_encode_entry_plus(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + return encode_entry(cd, name, namlen, offset, ino, d_type, 1); +} + +/* FSSTAT */ +int +nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_fsstatres *resp) +{ + struct kstatfs *s = &resp->stats; + u64 bs = s->f_bsize; + + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ + p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ + p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ + p = xdr_encode_hyper(p, s->f_files); /* total inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ + *p++ = htonl(resp->invarsec); /* mean unchanged time */ + } + return xdr_ressize_check(rqstp, p); +} + +/* FSINFO */ +int +nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_fsinfores *resp) +{ + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + *p++ = htonl(resp->f_rtmax); + *p++ = htonl(resp->f_rtpref); + *p++ = htonl(resp->f_rtmult); + *p++ = htonl(resp->f_wtmax); + *p++ = htonl(resp->f_wtpref); + *p++ = htonl(resp->f_wtmult); + *p++ = htonl(resp->f_dtpref); + p = xdr_encode_hyper(p, resp->f_maxfilesize); + *p++ = xdr_one; + *p++ = xdr_zero; + *p++ = htonl(resp->f_properties); + } + + return xdr_ressize_check(rqstp, p); +} + +/* PATHCONF */ +int +nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_pathconfres *resp) +{ + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + *p++ = htonl(resp->p_link_max); + *p++ = htonl(resp->p_name_max); + *p++ = htonl(resp->p_no_trunc); + *p++ = htonl(resp->p_chown_restricted); + *p++ = htonl(resp->p_case_insensitive); + *p++ = htonl(resp->p_case_preserving); + } + + return xdr_ressize_check(rqstp, p); +} + +/* COMMIT */ +int +nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_commitres *resp) +{ + p = encode_wcc_data(rqstp, p, &resp->fh); + /* Write verifier */ + if (resp->status == 0) { + *p++ = htonl(nfssvc_boot.tv_sec); + *p++ = htonl(nfssvc_boot.tv_usec); + } + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +int +nfs3svc_release_fhandle(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_attrstat *resp) +{ + fh_put(&resp->fh); + return 1; +} + +int +nfs3svc_release_fhandle2(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_fhandle_pair *resp) +{ + 
fh_put(&resp->fh1); + fh_put(&resp->fh2); + return 1; +} diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c new file mode 100644 index 00000000..ad88f1c0 --- /dev/null +++ b/fs/nfsd/nfs4acl.c @@ -0,0 +1,838 @@ +/* + * Common NFSv4 ACL handling code. + * + * Copyright (c) 2002, 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * Jeff Sedlak + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include "acl.h" + + +/* mode bit translations: */ +#define NFS4_READ_MODE (NFS4_ACE_READ_DATA) +#define NFS4_WRITE_MODE (NFS4_ACE_WRITE_DATA | NFS4_ACE_APPEND_DATA) +#define NFS4_EXECUTE_MODE NFS4_ACE_EXECUTE +#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE) +#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL) + +/* We don't support these bits; insist they be neither allowed nor denied */ +#define NFS4_MASK_UNSUPP (NFS4_ACE_DELETE | NFS4_ACE_WRITE_OWNER \ + | NFS4_ACE_READ_NAMED_ATTRS | NFS4_ACE_WRITE_NAMED_ATTRS) + +/* flags used to simulate posix default ACLs */ +#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ + | NFS4_ACE_DIRECTORY_INHERIT_ACE) + +#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS \ + | NFS4_ACE_INHERIT_ONLY_ACE \ + | NFS4_ACE_IDENTIFIER_GROUP) + +#define MASK_EQUAL(mask1, mask2) \ + ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) + +static u32 +mask_from_posix(unsigned short perm, unsigned int flags) +{ + int mask = NFS4_ANYONE_MODE; + + if (flags & NFS4_ACL_OWNER) + mask |= NFS4_OWNER_MODE; + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; +} + +static u32 +deny_mask_from_posix(unsigned short perm, u32 flags) +{ + u32 mask = 0; + + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; +} + +/* XXX: modify functions to return NFS errors; they're only ever + * used by nfs code, after all.... */ + +/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the + * side of being more restrictive, so the mode bit mapping below is + * pessimistic. An optimistic version would be needed to handle DENY's, + * but we espect to coalesce all ALLOWs and DENYs before mapping to mode + * bits. 
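+ *
+ * For instance, in the mapping below an ACE mask must contain the
+ * whole NFS4_WRITE_MODE pair (WRITE_DATA and APPEND_DATA) before
+ * ACL_WRITE is granted; a mask carrying only NFS4_ACE_WRITE_DATA maps
+ * to no write bit at all, erring on the restrictive side as described
+ * above.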
*/ + +static void +low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) +{ + u32 write_mode = NFS4_WRITE_MODE; + + if (flags & NFS4_ACL_DIR) + write_mode |= NFS4_ACE_DELETE_CHILD; + *mode = 0; + if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE) + *mode |= ACL_READ; + if ((perm & write_mode) == write_mode) + *mode |= ACL_WRITE; + if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE) + *mode |= ACL_EXECUTE; +} + +struct ace_container { + struct nfs4_ace *ace; + struct list_head ace_l; +}; + +static short ace2type(struct nfs4_ace *); +static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, + unsigned int); + +struct nfs4_acl * +nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, + unsigned int flags) +{ + struct nfs4_acl *acl; + int size = 0; + + if (pacl) { + if (posix_acl_valid(pacl) < 0) + return ERR_PTR(-EINVAL); + size += 2*pacl->a_count; + } + if (dpacl) { + if (posix_acl_valid(dpacl) < 0) + return ERR_PTR(-EINVAL); + size += 2*dpacl->a_count; + } + + /* Allocate for worst case: one (deny, allow) pair each: */ + acl = nfs4_acl_new(size); + if (acl == NULL) + return ERR_PTR(-ENOMEM); + + if (pacl) + _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + + if (dpacl) + _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT); + + return acl; +} + +struct posix_acl_summary { + unsigned short owner; + unsigned short users; + unsigned short group; + unsigned short groups; + unsigned short other; + unsigned short mask; +}; + +static void +summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) +{ + struct posix_acl_entry *pa, *pe; + + /* + * Only pas.users and pas.groups need initialization; previous + * posix_acl_valid() calls ensure that the other fields will be + * initialized in the following loop. But, just to placate gcc: + */ + memset(pas, 0, sizeof(*pas)); + pas->mask = 07; + + pe = acl->a_entries + acl->a_count; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + pas->owner = pa->e_perm; + break; + case ACL_GROUP_OBJ: + pas->group = pa->e_perm; + break; + case ACL_USER: + pas->users |= pa->e_perm; + break; + case ACL_GROUP: + pas->groups |= pa->e_perm; + break; + case ACL_OTHER: + pas->other = pa->e_perm; + break; + case ACL_MASK: + pas->mask = pa->e_perm; + break; + } + } + /* We'll only care about effective permissions: */ + pas->users &= pas->mask; + pas->group &= pas->mask; + pas->groups &= pas->mask; +} + +/* We assume the acl has been verified with posix_acl_valid. */ +static void +_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, + unsigned int flags) +{ + struct posix_acl_entry *pa, *group_owner_entry; + struct nfs4_ace *ace; + struct posix_acl_summary pas; + unsigned short deny; + int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ? 
+ NFS4_INHERITANCE_FLAGS | NFS4_ACE_INHERIT_ONLY_ACE : 0); + + BUG_ON(pacl->a_count < 3); + summarize_posix_acl(pacl, &pas); + + pa = pacl->a_entries; + ace = acl->aces + acl->naces; + + /* We could deny everything not granted by the owner: */ + deny = ~pas.owner; + /* + * but it is equivalent (and simpler) to deny only what is not + * granted by later entries: + */ + deny &= pas.users | pas.group | pas.groups | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + } + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + pa++; + + while (pa->e_tag == ACL_USER) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.groups | pas.group | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + } + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + pa++; + } + + /* In the case of groups, we apply allow ACEs first, then deny ACEs, + * since a user can be in more than one group. */ + + /* allow ACEs */ + + group_owner_entry = pa; + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pas.group, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + pa++; + + while (pa->e_tag == ACL_GROUP) { + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + pa++; + } + + /* deny ACEs */ + + pa = group_owner_entry; + + deny = ~pas.group & pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + } + pa++; + + while (pa->e_tag == ACL_GROUP) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + } + pa++; + } + + if (pa->e_tag == ACL_MASK) + pa++; + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags); + ace->whotype = NFS4_ACL_WHO_EVERYONE; + acl->naces++; +} + +static void +sort_pacl_range(struct posix_acl *pacl, int start, int end) { + int sorted = 0, i; + struct posix_acl_entry tmp; + + /* We just do a bubble sort; easy to do in place, and we're not + * expecting acl's to be long enough to justify anything more. 
*/ + while (!sorted) { + sorted = 1; + for (i = start; i < end; i++) { + if (pacl->a_entries[i].e_id + > pacl->a_entries[i+1].e_id) { + sorted = 0; + tmp = pacl->a_entries[i]; + pacl->a_entries[i] = pacl->a_entries[i+1]; + pacl->a_entries[i+1] = tmp; + } + } + } +} + +static void +sort_pacl(struct posix_acl *pacl) +{ + /* posix_acl_valid requires that users and groups be in order + * by uid/gid. */ + int i, j; + + if (pacl->a_count <= 4) + return; /* no users or groups */ + i = 1; + while (pacl->a_entries[i].e_tag == ACL_USER) + i++; + sort_pacl_range(pacl, 1, i-1); + + BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); + j = ++i; + while (pacl->a_entries[j].e_tag == ACL_GROUP) + j++; + sort_pacl_range(pacl, i, j-1); + return; +} + +/* + * While processing the NFSv4 ACE, this maintains bitmasks representing + * which permission bits have been allowed and which denied to a given + * entity: */ +struct posix_ace_state { + u32 allow; + u32 deny; +}; + +struct posix_user_ace_state { + uid_t uid; + struct posix_ace_state perms; +}; + +struct posix_ace_state_array { + int n; + struct posix_user_ace_state aces[]; +}; + +/* + * While processing the NFSv4 ACE, this maintains the partial permissions + * calculated so far: */ + +struct posix_acl_state { + int empty; + struct posix_ace_state owner; + struct posix_ace_state group; + struct posix_ace_state other; + struct posix_ace_state everyone; + struct posix_ace_state mask; /* Deny unused in this case */ + struct posix_ace_state_array *users; + struct posix_ace_state_array *groups; +}; + +static int +init_state(struct posix_acl_state *state, int cnt) +{ + int alloc; + + memset(state, 0, sizeof(struct posix_acl_state)); + state->empty = 1; + /* + * In the worst case, each individual acl could be for a distinct + * named user or group, but we don't no which, so we allocate + * enough space for either: + */ + alloc = sizeof(struct posix_ace_state_array) + + cnt*sizeof(struct posix_user_ace_state); + state->users = kzalloc(alloc, GFP_KERNEL); + if (!state->users) + return -ENOMEM; + state->groups = kzalloc(alloc, GFP_KERNEL); + if (!state->groups) { + kfree(state->users); + return -ENOMEM; + } + return 0; +} + +static void +free_state(struct posix_acl_state *state) { + kfree(state->users); + kfree(state->groups); +} + +static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate) +{ + state->mask.allow |= astate->allow; +} + +/* + * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS, + * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate + * to traditional read/write/execute permissions. + * + * It's problematic to reject acls that use certain mode bits, because it + * places the burden on users to learn the rules about which bits one + * particular server sets, without giving the user a lot of help--we return an + * error that could mean any number of different things. To make matters + * worse, the problematic bits might be introduced by some application that's + * automatically mapping from some other acl model. + * + * So wherever possible we accept anything, possibly erring on the side of + * denying more permissions than necessary. 
+ * + * However we do reject *explicit* DENY's of a few bits representing + * permissions we could never deny: + */ + +static inline int check_deny(u32 mask, int isowner) +{ + if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL)) + return -EINVAL; + if (!isowner) + return 0; + if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)) + return -EINVAL; + return 0; +} + +static struct posix_acl * +posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) +{ + struct posix_acl_entry *pace; + struct posix_acl *pacl; + int nace; + int i, error = 0; + + /* + * ACLs with no ACEs are treated differently in the inheritable + * and effective cases: when there are no inheritable ACEs, we + * set a zero-length default posix acl: + */ + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { + pacl = posix_acl_alloc(0, GFP_KERNEL); + return pacl ? pacl : ERR_PTR(-ENOMEM); + } + /* + * When there are no effective ACEs, the following will end + * up setting a 3-element effective posix ACL with all + * permissions zero. + */ + nace = 4 + state->users->n + state->groups->n; + pacl = posix_acl_alloc(nace, GFP_KERNEL); + if (!pacl) + return ERR_PTR(-ENOMEM); + + pace = pacl->a_entries; + pace->e_tag = ACL_USER_OBJ; + error = check_deny(state->owner.deny, 1); + if (error) + goto out_err; + low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + + for (i=0; i < state->users->n; i++) { + pace++; + pace->e_tag = ACL_USER; + error = check_deny(state->users->aces[i].perms.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->users->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_id = state->users->aces[i].uid; + add_to_mask(state, &state->users->aces[i].perms); + } + + pace++; + pace->e_tag = ACL_GROUP_OBJ; + error = check_deny(state->group.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + add_to_mask(state, &state->group); + + for (i=0; i < state->groups->n; i++) { + pace++; + pace->e_tag = ACL_GROUP; + error = check_deny(state->groups->aces[i].perms.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->groups->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_id = state->groups->aces[i].uid; + add_to_mask(state, &state->groups->aces[i].perms); + } + + pace++; + pace->e_tag = ACL_MASK; + low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + + pace++; + pace->e_tag = ACL_OTHER; + error = check_deny(state->other.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + + return pacl; +out_err: + posix_acl_release(pacl); + return ERR_PTR(error); +} + +static inline void allow_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Allow all bits in the mask not already denied: */ + astate->allow |= mask & ~astate->deny; +} + +static inline void deny_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Deny all bits in the mask not already allowed: */ + astate->deny |= mask & ~astate->allow; +} + +static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid) +{ + int i; + + for (i = 0; i < a->n; i++) + if (a->aces[i].uid == uid) + return i; + /* Not found: */ + a->n++; + a->aces[i].uid = uid; + a->aces[i].perms.allow = state->everyone.allow; + a->aces[i].perms.deny = state->everyone.deny; + + return i; +} + +static void deny_bits_array(struct posix_ace_state_array *a, u32 mask) 
+{ + int i; + + for (i=0; i < a->n; i++) + deny_bits(&a->aces[i].perms, mask); +} + +static void allow_bits_array(struct posix_ace_state_array *a, u32 mask) +{ + int i; + + for (i=0; i < a->n; i++) + allow_bits(&a->aces[i].perms, mask); +} + +static void process_one_v4_ace(struct posix_acl_state *state, + struct nfs4_ace *ace) +{ + u32 mask = ace->access_mask; + int i; + + state->empty = 0; + + switch (ace2type(ace)) { + case ACL_USER_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + } else { + deny_bits(&state->owner, mask); + } + break; + case ACL_USER: + i = find_uid(state, state->users, ace->who); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->users->aces[i].perms, mask); + } else { + deny_bits(&state->users->aces[i].perms, mask); + mask = state->users->aces[i].perms.deny; + deny_bits(&state->owner, mask); + } + break; + case ACL_GROUP_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->group, mask); + } else { + deny_bits(&state->group, mask); + mask = state->group.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_GROUP: + i = find_uid(state, state->groups, ace->who); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->groups->aces[i].perms, mask); + } else { + deny_bits(&state->groups->aces[i].perms, mask); + mask = state->groups->aces[i].perms.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_OTHER: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + allow_bits(&state->group, mask); + allow_bits(&state->other, mask); + allow_bits(&state->everyone, mask); + allow_bits_array(state->users, mask); + allow_bits_array(state->groups, mask); + } else { + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->other, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + } +} + +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, + struct posix_acl **dpacl, unsigned int flags) +{ + struct posix_acl_state effective_acl_state, default_acl_state; + struct nfs4_ace *ace; + int ret; + + ret = init_state(&effective_acl_state, acl->naces); + if (ret) + return ret; + ret = init_state(&default_acl_state, acl->naces); + if (ret) + goto out_estate; + ret = -EINVAL; + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && + ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) + goto out_dstate; + if (ace->flag & ~NFS4_SUPPORTED_FLAGS) + goto out_dstate; + if ((ace->flag & NFS4_INHERITANCE_FLAGS) == 0) { + process_one_v4_ace(&effective_acl_state, ace); + continue; + } + if (!(flags & NFS4_ACL_DIR)) + goto out_dstate; + /* + * Note that when only one of FILE_INHERIT or DIRECTORY_INHERIT + * is set, we're effectively turning on the other. That's OK, + * according to rfc 3530. 
+ */ + process_one_v4_ace(&default_acl_state, ace); + + if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) + process_one_v4_ace(&effective_acl_state, ace); + } + *pacl = posix_state_to_acl(&effective_acl_state, flags); + if (IS_ERR(*pacl)) { + ret = PTR_ERR(*pacl); + *pacl = NULL; + goto out_dstate; + } + *dpacl = posix_state_to_acl(&default_acl_state, + flags | NFS4_ACL_TYPE_DEFAULT); + if (IS_ERR(*dpacl)) { + ret = PTR_ERR(*dpacl); + *dpacl = NULL; + posix_acl_release(*pacl); + *pacl = NULL; + goto out_dstate; + } + sort_pacl(*pacl); + sort_pacl(*dpacl); + ret = 0; +out_dstate: + free_state(&default_acl_state); +out_estate: + free_state(&effective_acl_state); + return ret; +} + +static short +ace2type(struct nfs4_ace *ace) +{ + switch (ace->whotype) { + case NFS4_ACL_WHO_NAMED: + return (ace->flag & NFS4_ACE_IDENTIFIER_GROUP ? + ACL_GROUP : ACL_USER); + case NFS4_ACL_WHO_OWNER: + return ACL_USER_OBJ; + case NFS4_ACL_WHO_GROUP: + return ACL_GROUP_OBJ; + case NFS4_ACL_WHO_EVERYONE: + return ACL_OTHER; + } + BUG(); + return -1; +} + +EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4); +EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); + +struct nfs4_acl * +nfs4_acl_new(int n) +{ + struct nfs4_acl *acl; + + acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL); + if (acl == NULL) + return NULL; + acl->naces = 0; + return acl; +} + +static struct { + char *string; + int stringlen; + int type; +} s2t_map[] = { + { + .string = "OWNER@", + .stringlen = sizeof("OWNER@") - 1, + .type = NFS4_ACL_WHO_OWNER, + }, + { + .string = "GROUP@", + .stringlen = sizeof("GROUP@") - 1, + .type = NFS4_ACL_WHO_GROUP, + }, + { + .string = "EVERYONE@", + .stringlen = sizeof("EVERYONE@") - 1, + .type = NFS4_ACL_WHO_EVERYONE, + }, +}; + +int +nfs4_acl_get_whotype(char *p, u32 len) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { + if (s2t_map[i].stringlen == len && + 0 == memcmp(s2t_map[i].string, p, len)) + return s2t_map[i].type; + } + return NFS4_ACL_WHO_NAMED; +} + +int +nfs4_acl_write_who(int who, char *p) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { + if (s2t_map[i].type == who) { + memcpy(p, s2t_map[i].string, s2t_map[i].stringlen); + return s2t_map[i].stringlen; + } + } + BUG(); + return -1; +} + +EXPORT_SYMBOL(nfs4_acl_new); +EXPORT_SYMBOL(nfs4_acl_get_whotype); +EXPORT_SYMBOL(nfs4_acl_write_who); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c new file mode 100644 index 00000000..02eb4edf --- /dev/null +++ b/fs/nfsd/nfs4callback.c @@ -0,0 +1,1026 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include "nfsd.h" +#include "state.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +#define NFSPROC4_CB_NULL 0 +#define NFSPROC4_CB_COMPOUND 1 + +/* Index of predefined Linux callback client operations */ + +enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, +}; + +#define NFS4_MAXTAGLEN 20 + +#define NFS4_enc_cb_null_sz 0 +#define NFS4_dec_cb_null_sz 0 +#define cb_compound_enc_hdr_sz 4 +#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) +#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define cb_sequence_enc_sz (sessionid_sz + 4 + \ + 1 /* no referring calls list yet */) +#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) + +#define op_enc_sz 1 +#define op_dec_sz 2 +#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) +#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) +#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_stateid_sz + \ + enc_nfs4_fh_sz) + +#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) + +struct nfs4_cb_compound_hdr { + /* args */ + u32 ident; /* minorversion 0 only */ + u32 nops; + __be32 *nops_p; + u32 minorversion; + /* res */ + int status; +}; + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + +static __be32 *xdr_encode_empty_array(__be32 *p) +{ + *p++ = xdr_zero; + return p; +} + +/* + * Encode/decode NFSv4 CB basic data types + * + * Basic NFSv4 callback data types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section + * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version + * 1 Protocol" + */ + +/* + * nfs_cb_opnum4 + * + * enum nfs_cb_opnum4 { + * OP_CB_GETATTR = 3, + * ... 
+ * }; + */ +enum nfs_cb_opnum4 { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_ILLEGAL = 10044 +}; + +static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(op); +} + +/* + * nfs_fh4 + * + * typedef opaque nfs_fh4; + */ +static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh) +{ + u32 length = fh->fh_size; + __be32 *p; + + BUG_ON(length > NFS4_FHSIZE); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, &fh->fh_base, length); +} + +/* + * stateid4 + * + * struct stateid4 { + * uint32_t seqid; + * opaque other[12]; + * }; + */ +static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(sid->si_generation); + xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE); +} + +/* + * sessionid4 + * + * typedef opaque sessionid4[NFS4_SESSIONID_SIZE]; + */ +static void encode_sessionid4(struct xdr_stream *xdr, + const struct nfsd4_session *session) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); +} + +/* + * nfsstat4 + */ +static const struct { + int stat; + int errno; +} nfs_cb_errtbl[] = { + { NFS4_OK, 0 }, + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -EIO }, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_RESOURCE, -EREMOTEIO }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } +}; + +/* + * If we cannot translate the error, the recovery routines should + * handle it. + * + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes. 
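/*
 * A user-space sketch of the XDR sizing convention behind the *_sz macros
 * and encode_nfs_fh4()/encode_stateid4() above: a variable-length opaque
 * costs one 32-bit length word plus its data rounded up to a 4-byte
 * boundary, while a fixed-length opaque (such as stateid4's "other" field)
 * carries no length word.  This is plain RFC 4506 XDR and does not use the
 * kernel's xdr_stream API.
 */
#include <assert.h>
#include <stddef.h>

static size_t xdr_opaque_bytes(size_t len)
{
        return 4 + ((len + 3) & ~(size_t)3);    /* length word + padded data */
}

int main(void)
{
        assert(xdr_opaque_bytes(0)   == 4);     /* empty array: just the count */
        assert(xdr_opaque_bytes(5)   == 12);    /* 5 data bytes pad out to 8   */
        assert(xdr_opaque_bytes(128) == 132);   /* a maximum-size file handle  */
        return 0;
}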
+ */ +static int nfs_cb_stat_to_errno(int status) +{ + int i; + + for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { + if (nfs_cb_errtbl[i].stat == status) + return nfs_cb_errtbl[i].errno; + } + + dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status); + return -status; +} + +static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, + enum nfsstat4 *status) +{ + __be32 *p; + u32 op; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + op = be32_to_cpup(p++); + if (unlikely(op != expected)) + goto out_unexpected; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_unexpected: + dprintk("NFSD: Callback server returned operation %d but " + "we issued a request for %d\n", op, expected); + return -EIO; +} + +/* + * CB_COMPOUND4args + * + * struct CB_COMPOUND4args { + * utf8str_cs tag; + * uint32_t minorversion; + * uint32_t callback_ident; + * nfs_cb_argop4 argarray<>; + * }; +*/ +static void encode_cb_compound4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 * p; + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + p = xdr_encode_empty_array(p); /* empty tag */ + *p++ = cpu_to_be32(hdr->minorversion); + *p++ = cpu_to_be32(hdr->ident); + + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); /* argarray element count */ +} + +/* + * Update argarray element count + */ +static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) +{ + BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS); + *hdr->nops_p = cpu_to_be32(hdr->nops); +} + +/* + * CB_COMPOUND4res + * + * struct CB_COMPOUND4res { + * nfsstat4 status; + * utf8str_cs tag; + * nfs_cb_resop4 resarray<>; + * }; + */ +static int decode_cb_compound4res(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + /* Ignore the tag */ + length = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, length + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->nops = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * CB_RECALL4args + * + * struct CB_RECALL4args { + * stateid4 stateid; + * bool truncate; + * nfs_fh4 fh; + * }; + */ +static void encode_cb_recall4args(struct xdr_stream *xdr, + const struct nfs4_delegation *dp, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); + encode_stateid4(xdr, &dp->dl_stateid); + + p = xdr_reserve_space(xdr, 4); + *p++ = xdr_zero; /* truncate */ + + encode_nfs_fh4(xdr, &dp->dl_fh); + + hdr->nops++; +} + +/* + * CB_SEQUENCE4args + * + * struct CB_SEQUENCE4args { + * sessionid4 csa_sessionid; + * sequenceid4 csa_sequenceid; + * slotid4 csa_slotid; + * slotid4 csa_highest_slotid; + * bool csa_cachethis; + * referring_call_list4 csa_referring_call_lists<>; + * }; + */ +static void encode_cb_sequence4args(struct xdr_stream *xdr, + const struct nfsd4_callback *cb, + struct nfs4_cb_compound_hdr *hdr) +{ + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + __be32 *p; + + if (hdr->minorversion == 0) + return; + + encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); + encode_sessionid4(xdr, session); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */ + *p++ = xdr_zero; /* csa_slotid */ + *p++ = xdr_zero; /* csa_highest_slotid */ + *p++ = xdr_zero; /* csa_cachethis */ + 
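/*
 * Sketch of the two-step op-count encoding used by
 * encode_cb_compound4args() and encode_cb_nops() above: the compound
 * header leaves a slot for the argarray count and remembers where it is,
 * each encoded operation bumps a counter, and the count is patched in once
 * encoding is finished.  Host-order stores into a flat array stand in for
 * the kernel's xdr_reserve_space() and cpu_to_be32() here.
 */
#include <assert.h>
#include <stdint.h>

struct cb_enc_sketch {
        uint32_t buf[16];
        unsigned int next;      /* next free word                         */
        unsigned int nops;      /* operations encoded so far              */
        unsigned int nops_slot; /* index of the count word to patch later */
};

static void sketch_enc_header(struct cb_enc_sketch *e)
{
        e->nops_slot = e->next++;               /* reserve the count slot */
}

static void sketch_enc_op(struct cb_enc_sketch *e, uint32_t opnum)
{
        e->buf[e->next++] = opnum;
        e->nops++;
}

static void sketch_enc_finish(struct cb_enc_sketch *e)
{
        e->buf[e->nops_slot] = e->nops;         /* backpatch the count */
}

int main(void)
{
        struct cb_enc_sketch e = { .next = 0 };

        sketch_enc_header(&e);
        sketch_enc_op(&e, 11);                  /* OP_CB_SEQUENCE */
        sketch_enc_op(&e, 4);                   /* OP_CB_RECALL   */
        sketch_enc_finish(&e);
        assert(e.buf[0] == 2);
        return 0;
}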
xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + hdr->nops++; +} + +/* + * CB_SEQUENCE4resok + * + * struct CB_SEQUENCE4resok { + * sessionid4 csr_sessionid; + * sequenceid4 csr_sequenceid; + * slotid4 csr_slotid; + * slotid4 csr_highest_slotid; + * slotid4 csr_target_highest_slotid; + * }; + * + * union CB_SEQUENCE4res switch (nfsstat4 csr_status) { + * case NFS4_OK: + * CB_SEQUENCE4resok csr_resok4; + * default: + * void; + * }; + * + * Our current back channel implmentation supports a single backchannel + * with a single slot. + */ +static int decode_cb_sequence4resok(struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + struct nfs4_sessionid id; + int status; + __be32 *p; + u32 dummy; + + status = -ESERVERFAULT; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. + */ + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); + if (memcmp(id.data, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) { + dprintk("NFS: %s Invalid session id\n", __func__); + goto out; + } + p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); + + dummy = be32_to_cpup(p++); + if (dummy != session->se_cb_seq_nr) { + dprintk("NFS: %s Invalid sequence number\n", __func__); + goto out; + } + + dummy = be32_to_cpup(p++); + if (dummy != 0) { + dprintk("NFS: %s Invalid slotid\n", __func__); + goto out; + } + + /* + * FIXME: process highest slotid and target highest slotid + */ + status = 0; +out: + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_cb_sequence4res(struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + enum nfsstat4 nfserr; + int status; + + if (cb->cb_minorversion == 0) + return 0; + + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + goto out_default; + status = decode_cb_sequence4resok(xdr, cb); +out: + return status; +out_default: + return nfs_cb_stat_to_errno(nfserr); +} + +/* + * NFSv4.0 and NFSv4.1 XDR encode functions + * + * NFSv4.0 callback argument types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +/* + * NB: Without this zero space reservation, callbacks over krb5p fail + */ +static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + xdr_reserve_space(xdr, 0); +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfs4_delegation *args = cb->cb_op; + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recall4args(xdr, args, &hdr); + encode_cb_nops(&hdr); +} + + +/* + * NFSv4.0 and NFSv4.1 XDR decode functions + * + * NFSv4.0 callback result types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". 
+ */ + +static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + return 0; +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + enum nfsstat4 nfserr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + goto out; + + if (cb != NULL) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status)) + goto out; + } + + status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + status = nfs_cb_stat_to_errno(nfserr); +out: + return status; +} + +/* + * RPC procedure tables + */ +#define PROC(proc, call, argtype, restype) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_CB_##call, \ + .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \ + .p_arglen = NFS4_enc_##argtype##_sz, \ + .p_replen = NFS4_dec_##restype##_sz, \ + .p_statidx = NFSPROC4_CB_##call, \ + .p_name = #proc, \ +} + +static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, cb_null, cb_null), + PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), +}; + +static struct rpc_version nfs_cb_version4 = { +/* + * Note on the callback rpc program version number: despite language in rfc + * 5661 section 18.36.3 requiring servers to use 4 in this field, the + * official xdr descriptions for both 4.0 and 4.1 specify version 1, and + * in practice that appears to be what implementations use. The section + * 18.36.3 language is expected to be fixed in an erratum. + */ + .number = 1, + .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), + .procs = nfs4_cb_procedures +}; + +static struct rpc_version *nfs_cb_version[] = { + &nfs_cb_version4, +}; + +static struct rpc_program cb_program; + +static struct rpc_stat cb_stats = { + .program = &cb_program +}; + +#define NFS4_CALLBACK 0x40000000 +static struct rpc_program cb_program = { + .name = "nfs4_cb", + .number = NFS4_CALLBACK, + .nrvers = ARRAY_SIZE(nfs_cb_version), + .version = nfs_cb_version, + .stats = &cb_stats, + .pipe_dir_name = "/nfsd4_cb", +}; + +static int max_cb_time(void) +{ + return max(nfsd4_lease/10, (time_t)1) * HZ; +} + + +static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) +{ + struct rpc_timeout timeparms = { + .to_initval = max_cb_time(), + .to_retries = 0, + }; + struct rpc_create_args args = { + .net = &init_net, + .address = (struct sockaddr *) &conn->cb_addr, + .addrsize = conn->cb_addrlen, + .saddress = (struct sockaddr *) &conn->cb_saddr, + .timeout = &timeparms, + .program = &cb_program, + .version = 0, + .authflavor = clp->cl_flavor, + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + }; + struct rpc_clnt *client; + + if (clp->cl_minorversion == 0) { + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + return -EINVAL; + args.client_name = clp->cl_principal; + args.prognumber = conn->cb_prog, + args.protocol = XPRT_TRANSPORT_TCP; + clp->cl_cb_ident = conn->cb_ident; + } else { + if (!conn->cb_xprt) + return -EINVAL; + clp->cl_cb_conn.cb_xprt = conn->cb_xprt; + clp->cl_cb_session = ses; + args.bc_xprt = conn->cb_xprt; + args.prognumber = clp->cl_cb_session->se_cb_prog; + args.protocol = XPRT_TRANSPORT_BC_TCP; + } + /* Create RPC client */ + client = rpc_create(&args); + if (IS_ERR(client)) { + dprintk("NFSD: 
couldn't create callback client: %ld\n", + PTR_ERR(client)); + return PTR_ERR(client); + } + clp->cl_cb_client = client; + return 0; + +} + +static void warn_no_callback_path(struct nfs4_client *clp, int reason) +{ + dprintk("NFSD: warning: no callback path to client %.*s: error %d\n", + (int)clp->cl_name.len, clp->cl_name.data, reason); +} + +static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_DOWN; + warn_no_callback_path(clp, reason); +} + +static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); + + if (task->tk_status) + nfsd4_mark_cb_down(clp, task->tk_status); + else + clp->cl_cb_state = NFSD4_CB_UP; +} + +static const struct rpc_call_ops nfsd4_cb_probe_ops = { + /* XXX: release method to ensure we set the cb channel down if + * necessary on early failure? */ + .rpc_call_done = nfsd4_cb_probe_done, +}; + +static struct rpc_cred *callback_cred; + +int set_callback_cred(void) +{ + if (callback_cred) + return 0; + callback_cred = rpc_lookup_machine_cred(); + if (!callback_cred) + return -ENOMEM; + return 0; +} + +static struct workqueue_struct *callback_wq; + +static void run_nfsd4_cb(struct nfsd4_callback *cb) +{ + queue_work(callback_wq, &cb->cb_work); +} + +static void do_probe_callback(struct nfs4_client *clp) +{ + struct nfsd4_callback *cb = &clp->cl_cb_null; + + cb->cb_op = NULL; + cb->cb_clp = clp; + + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL]; + cb->cb_msg.rpc_argp = NULL; + cb->cb_msg.rpc_resp = NULL; + cb->cb_msg.rpc_cred = callback_cred; + + cb->cb_ops = &nfsd4_cb_probe_ops; + + run_nfsd4_cb(cb); +} + +/* + * Poke the callback thread to process any updates to the callback + * parameters, and send a null probe. + */ +void nfsd4_probe_callback(struct nfs4_client *clp) +{ + /* XXX: atomicity? Also, should we be using cl_cb_flags? */ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); + do_probe_callback(clp); +} + +void nfsd4_probe_callback_sync(struct nfs4_client *clp) +{ + nfsd4_probe_callback(clp); + flush_workqueue(callback_wq); +} + +void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +{ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + spin_lock(&clp->cl_lock); + memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); + spin_unlock(&clp->cl_lock); +} + +/* + * There's currently a single callback channel slot. + * If the slot is available, then mark it busy. Otherwise, set the + * thread for sleeping on the callback RPC wait queue. + */ +static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) +{ + if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); + dprintk("%s slot is busy\n", __func__); + return false; + } + return true; +} + +/* + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
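/*
 * Sketch of the one-slot callback slot table taken in nfsd41_cb_get_slot()
 * above and released again in nfsd4_cb_done(): a single busy bit is
 * claimed with an atomic test-and-set before CB_SEQUENCE is sent, and
 * cleared (waking the next waiter) once the reply has been handled.  The
 * C11 atomic flag below is a stand-in for the kernel's test_and_set_bit()
 * plus the RPC wait queue, so the wake-up side is only described in the
 * comment, not implemented.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_flag cb_slot_busy_sketch = ATOMIC_FLAG_INIT;

static bool sketch_get_slot(void)
{
        /* false means: go to sleep and retry once the slot is released */
        return !atomic_flag_test_and_set(&cb_slot_busy_sketch);
}

static void sketch_put_slot(void)
{
        atomic_flag_clear(&cb_slot_busy_sketch); /* ...then wake the next waiter */
}

int main(void)
{
        if (sketch_get_slot()) {
                /* ...encode and send CB_SEQUENCE + CB_RECALL here... */
                sketch_put_slot();
        }
        return 0;
}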
+ */ +static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); + struct nfs4_client *clp = dp->dl_client; + u32 minorversion = clp->cl_minorversion; + + cb->cb_minorversion = minorversion; + if (minorversion) { + if (!nfsd41_cb_get_slot(clp, task)) + return; + } + spin_lock(&clp->cl_lock); + if (list_empty(&cb->cb_per_client)) { + /* This is the first call, not a restart */ + cb->cb_done = false; + list_add(&cb->cb_per_client, &clp->cl_callbacks); + } + spin_unlock(&clp->cl_lock); + rpc_call_start(task); +} + +static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); + struct nfs4_client *clp = dp->dl_client; + + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_minorversion); + + if (clp->cl_minorversion) { + /* No need for lock, access serialized in nfsd4_cb_prepare */ + ++clp->cl_cb_session->se_cb_seq_nr; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + dprintk("%s: freed slot, new seqid=%d\n", __func__, + clp->cl_cb_session->se_cb_seq_nr); + + /* We're done looking into the sequence information */ + task->tk_msg.rpc_resp = NULL; + } +} + + +static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + + nfsd4_cb_done(task, calldata); + + if (current_rpc_client != task->tk_client) { + /* We're shutting down or changing cl_cb_client; leave + * it to nfsd4_process_cb_update to restart the call if + * necessary. */ + return; + } + + if (cb->cb_done) + return; + switch (task->tk_status) { + case 0: + cb->cb_done = true; + return; + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* Race: client probably got cb_recall + * before open reply granting delegation */ + break; + default: + /* Network partition? 
*/ + nfsd4_mark_cb_down(clp, task->tk_status); + } + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); + task->tk_status = 0; + rpc_restart_call_prepare(task); + return; + } + nfsd4_mark_cb_down(clp, task->tk_status); + cb->cb_done = true; +} + +static void nfsd4_cb_recall_release(void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); + + if (cb->cb_done) { + spin_lock(&clp->cl_lock); + list_del(&cb->cb_per_client); + spin_unlock(&clp->cl_lock); + nfs4_put_delegation(dp); + } +} + +static const struct rpc_call_ops nfsd4_cb_recall_ops = { + .rpc_call_prepare = nfsd4_cb_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, +}; + +int nfsd4_create_callback_queue(void) +{ + callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); + if (!callback_wq) + return -ENOMEM; + return 0; +} + +void nfsd4_destroy_callback_queue(void) +{ + destroy_workqueue(callback_wq); +} + +/* must be called under the state lock */ +void nfsd4_shutdown_callback(struct nfs4_client *clp) +{ + set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags); + /* + * Note this won't actually result in a null callback; + * instead, nfsd4_do_callback_rpc() will detect the killed + * client, destroy the rpc client, and stop: + */ + do_probe_callback(clp); + flush_workqueue(callback_wq); +} + +static void nfsd4_release_cb(struct nfsd4_callback *cb) +{ + if (cb->cb_ops->rpc_release) + cb->cb_ops->rpc_release(cb); +} + +/* requires cl_lock: */ +static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) +{ + struct nfsd4_session *s; + struct nfsd4_conn *c; + + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_flags & NFS4_CDFC4_BACK) + return c; + } + } + return NULL; +} + +static void nfsd4_process_cb_update(struct nfsd4_callback *cb) +{ + struct nfs4_cb_conn conn; + struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = NULL; + struct nfsd4_conn *c; + int err; + + /* + * This is either an update, or the client dying; in either case, + * kill the old client: + */ + if (clp->cl_cb_client) { + rpc_shutdown_client(clp->cl_cb_client); + clp->cl_cb_client = NULL; + } + if (clp->cl_cb_conn.cb_xprt) { + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + clp->cl_cb_conn.cb_xprt = NULL; + } + if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) + return; + spin_lock(&clp->cl_lock); + /* + * Only serialized callback code is allowed to clear these + * flags; main nfsd code can only set them: + */ + BUG_ON(!clp->cl_cb_flags); + clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); + c = __nfsd4_find_backchannel(clp); + if (c) { + svc_xprt_get(c->cn_xprt); + conn.cb_xprt = c->cn_xprt; + ses = c->cn_session; + } + spin_unlock(&clp->cl_lock); + + err = setup_callback_client(clp, &conn, ses); + if (err) { + warn_no_callback_path(clp, err); + return; + } + /* Yay, the callback channel's back! 
Restart any callbacks: */ + list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) + run_nfsd4_cb(cb); +} + +void nfsd4_do_callback_rpc(struct work_struct *w) +{ + struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work); + struct nfs4_client *clp = cb->cb_clp; + struct rpc_clnt *clnt; + + if (clp->cl_cb_flags) + nfsd4_process_cb_update(cb); + + clnt = clp->cl_cb_client; + if (!clnt) { + /* Callback channel broken, or client killed; give up: */ + nfsd4_release_cb(cb); + return; + } + rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + cb->cb_ops, cb); +} + +void nfsd4_cb_recall(struct nfs4_delegation *dp) +{ + struct nfsd4_callback *cb = &dp->dl_recall; + struct nfs4_client *clp = dp->dl_client; + + dp->dl_retries = 1; + cb->cb_op = dp; + cb->cb_clp = clp; + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; + cb->cb_msg.rpc_argp = cb; + cb->cb_msg.rpc_resp = cb; + cb->cb_msg.rpc_cred = callback_cred; + + cb->cb_ops = &nfsd4_cb_recall_ops; + dp->dl_retries = 1; + + INIT_LIST_HEAD(&cb->cb_per_client); + cb->cb_done = true; + + run_nfsd4_cb(&dp->dl_recall); +} diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c new file mode 100644 index 00000000..55780a22 --- /dev/null +++ b/fs/nfsd/nfs4idmap.c @@ -0,0 +1,587 @@ +/* + * Mapping of UID/GIDs to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include "idmap.h" +#include "nfsd.h" + +/* + * Cache entry + */ + +/* + * XXX we know that IDMAP_NAMESZ < PAGE_SIZE, but it's ugly to rely on + * that. 
+ */ + +#define IDMAP_TYPE_USER 0 +#define IDMAP_TYPE_GROUP 1 + +struct ent { + struct cache_head h; + int type; /* User / Group */ + uid_t id; + char name[IDMAP_NAMESZ]; + char authname[IDMAP_NAMESZ]; +}; + +/* Common entry handling */ + +#define ENT_HASHBITS 8 +#define ENT_HASHMAX (1 << ENT_HASHBITS) + +static void +ent_init(struct cache_head *cnew, struct cache_head *citm) +{ + struct ent *new = container_of(cnew, struct ent, h); + struct ent *itm = container_of(citm, struct ent, h); + + new->id = itm->id; + new->type = itm->type; + + strlcpy(new->name, itm->name, sizeof(new->name)); + strlcpy(new->authname, itm->authname, sizeof(new->name)); +} + +static void +ent_put(struct kref *ref) +{ + struct ent *map = container_of(ref, struct ent, h.ref); + kfree(map); +} + +static struct cache_head * +ent_alloc(void) +{ + struct ent *e = kmalloc(sizeof(*e), GFP_KERNEL); + if (e) + return &e->h; + else + return NULL; +} + +/* + * ID -> Name cache + */ + +static struct cache_head *idtoname_table[ENT_HASHMAX]; + +static uint32_t +idtoname_hash(struct ent *ent) +{ + uint32_t hash; + + hash = hash_str(ent->authname, ENT_HASHBITS); + hash = hash_long(hash ^ ent->id, ENT_HASHBITS); + + /* Flip LSB for user/group */ + if (ent->type == IDMAP_TYPE_GROUP) + hash ^= 1; + + return hash; +} + +static void +idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, + int *blen) +{ + struct ent *ent = container_of(ch, struct ent, h); + char idstr[11]; + + qword_add(bpp, blen, ent->authname); + snprintf(idstr, sizeof(idstr), "%u", ent->id); + qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); + qword_add(bpp, blen, idstr); + + (*bpp)[-1] = '\n'; +} + +static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); +} + +static int +idtoname_match(struct cache_head *ca, struct cache_head *cb) +{ + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + + return (a->id == b->id && a->type == b->type && + strcmp(a->authname, b->authname) == 0); +} + +static int +idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) +{ + struct ent *ent; + + if (h == NULL) { + seq_puts(m, "#domain type id [name]\n"); + return 0; + } + ent = container_of(h, struct ent, h); + seq_printf(m, "%s %s %u", ent->authname, + ent->type == IDMAP_TYPE_GROUP ? "group" : "user", + ent->id); + if (test_bit(CACHE_VALID, &h->flags)) + seq_printf(m, " %s", ent->name); + seq_printf(m, "\n"); + return 0; +} + +static void +warn_no_idmapd(struct cache_detail *detail, int has_died) +{ + printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", + has_died ? 
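/*
 * Sketch of the request line that idtoname_request() above writes to the
 * "nfs4.idtoname" cache channel for user-space idmapd: the authentication
 * domain, "user" or "group", and the numeric id, space-separated and
 * newline-terminated.  The real qword_add() also escapes embedded spaces
 * and non-printable bytes; this simplified version does not.
 */
#include <stdio.h>

static int sketch_idtoname_request(char *buf, size_t len,
                                   const char *authname, int is_group,
                                   unsigned int id)
{
        return snprintf(buf, len, "%s %s %u\n",
                        authname, is_group ? "group" : "user", id);
}

int main(void)
{
        char line[128];

        sketch_idtoname_request(line, sizeof(line), "example.org", 0, 1000);
        fputs(line, stdout);            /* "example.org user 1000" */
        return 0;
}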
"died" : "not been started"); +} + + +static int idtoname_parse(struct cache_detail *, char *, int); +static struct ent *idtoname_lookup(struct ent *); +static struct ent *idtoname_update(struct ent *, struct ent *); + +static struct cache_detail idtoname_cache = { + .owner = THIS_MODULE, + .hash_size = ENT_HASHMAX, + .hash_table = idtoname_table, + .name = "nfs4.idtoname", + .cache_put = ent_put, + .cache_upcall = idtoname_upcall, + .cache_parse = idtoname_parse, + .cache_show = idtoname_show, + .warn_no_listener = warn_no_idmapd, + .match = idtoname_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, +}; + +static int +idtoname_parse(struct cache_detail *cd, char *buf, int buflen) +{ + struct ent ent, *res; + char *buf1, *bp; + int len; + int error = -EINVAL; + + if (buf[buflen - 1] != '\n') + return (-EINVAL); + buf[buflen - 1]= '\0'; + + buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (buf1 == NULL) + return (-ENOMEM); + + memset(&ent, 0, sizeof(ent)); + + /* Authentication name */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + memcpy(ent.authname, buf1, sizeof(ent.authname)); + + /* Type */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.type = strcmp(buf1, "user") == 0 ? + IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; + + /* ID */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.id = simple_strtoul(buf1, &bp, 10); + if (bp == buf1) + goto out; + + /* expiry */ + ent.h.expiry_time = get_expiry(&buf); + if (ent.h.expiry_time == 0) + goto out; + + error = -ENOMEM; + res = idtoname_lookup(&ent); + if (!res) + goto out; + + /* Name */ + error = -EINVAL; + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len < 0) + goto out; + if (len == 0) + set_bit(CACHE_NEGATIVE, &ent.h.flags); + else if (len >= IDMAP_NAMESZ) + goto out; + else + memcpy(ent.name, buf1, sizeof(ent.name)); + error = -ENOMEM; + res = idtoname_update(&ent, res); + if (res == NULL) + goto out; + + cache_put(&res->h, &idtoname_cache); + + error = 0; +out: + kfree(buf1); + + return error; +} + + +static struct ent * +idtoname_lookup(struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup(&idtoname_cache, + &item->h, + idtoname_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +idtoname_update(struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(&idtoname_cache, + &new->h, &old->h, + idtoname_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + + +/* + * Name -> ID cache + */ + +static struct cache_head *nametoid_table[ENT_HASHMAX]; + +static inline int +nametoid_hash(struct ent *ent) +{ + return hash_str(ent->name, ENT_HASHBITS); +} + +static void +nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, + int *blen) +{ + struct ent *ent = container_of(ch, struct ent, h); + + qword_add(bpp, blen, ent->authname); + qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? 
"group" : "user"); + qword_add(bpp, blen, ent->name); + + (*bpp)[-1] = '\n'; +} + +static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); +} + +static int +nametoid_match(struct cache_head *ca, struct cache_head *cb) +{ + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + + return (a->type == b->type && strcmp(a->name, b->name) == 0 && + strcmp(a->authname, b->authname) == 0); +} + +static int +nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) +{ + struct ent *ent; + + if (h == NULL) { + seq_puts(m, "#domain type name [id]\n"); + return 0; + } + ent = container_of(h, struct ent, h); + seq_printf(m, "%s %s %s", ent->authname, + ent->type == IDMAP_TYPE_GROUP ? "group" : "user", + ent->name); + if (test_bit(CACHE_VALID, &h->flags)) + seq_printf(m, " %u", ent->id); + seq_printf(m, "\n"); + return 0; +} + +static struct ent *nametoid_lookup(struct ent *); +static struct ent *nametoid_update(struct ent *, struct ent *); +static int nametoid_parse(struct cache_detail *, char *, int); + +static struct cache_detail nametoid_cache = { + .owner = THIS_MODULE, + .hash_size = ENT_HASHMAX, + .hash_table = nametoid_table, + .name = "nfs4.nametoid", + .cache_put = ent_put, + .cache_upcall = nametoid_upcall, + .cache_parse = nametoid_parse, + .cache_show = nametoid_show, + .warn_no_listener = warn_no_idmapd, + .match = nametoid_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, +}; + +static int +nametoid_parse(struct cache_detail *cd, char *buf, int buflen) +{ + struct ent ent, *res; + char *buf1; + int error = -EINVAL; + + if (buf[buflen - 1] != '\n') + return (-EINVAL); + buf[buflen - 1]= '\0'; + + buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (buf1 == NULL) + return (-ENOMEM); + + memset(&ent, 0, sizeof(ent)); + + /* Authentication name */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + memcpy(ent.authname, buf1, sizeof(ent.authname)); + + /* Type */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.type = strcmp(buf1, "user") == 0 ? 
+ IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; + + /* Name */ + error = qword_get(&buf, buf1, PAGE_SIZE); + if (error <= 0 || error >= IDMAP_NAMESZ) + goto out; + memcpy(ent.name, buf1, sizeof(ent.name)); + + /* expiry */ + ent.h.expiry_time = get_expiry(&buf); + if (ent.h.expiry_time == 0) + goto out; + + /* ID */ + error = get_int(&buf, &ent.id); + if (error == -EINVAL) + goto out; + if (error == -ENOENT) + set_bit(CACHE_NEGATIVE, &ent.h.flags); + + error = -ENOMEM; + res = nametoid_lookup(&ent); + if (res == NULL) + goto out; + res = nametoid_update(&ent, res); + if (res == NULL) + goto out; + + cache_put(&res->h, &nametoid_cache); + error = 0; +out: + kfree(buf1); + + return (error); +} + + +static struct ent * +nametoid_lookup(struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup(&nametoid_cache, + &item->h, + nametoid_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +nametoid_update(struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(&nametoid_cache, + &new->h, &old->h, + nametoid_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +/* + * Exported API + */ + +int +nfsd_idmap_init(void) +{ + int rv; + + rv = cache_register(&idtoname_cache); + if (rv) + return rv; + rv = cache_register(&nametoid_cache); + if (rv) + cache_unregister(&idtoname_cache); + return rv; +} + +void +nfsd_idmap_shutdown(void) +{ + cache_unregister(&idtoname_cache); + cache_unregister(&nametoid_cache); +} + +static int +idmap_lookup(struct svc_rqst *rqstp, + struct ent *(*lookup_fn)(struct ent *), struct ent *key, + struct cache_detail *detail, struct ent **item) +{ + int ret; + + *item = lookup_fn(key); + if (!*item) + return -ENOMEM; + retry: + ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle); + + if (ret == -ETIMEDOUT) { + struct ent *prev_item = *item; + *item = lookup_fn(key); + if (*item != prev_item) + goto retry; + cache_put(&(*item)->h, detail); + } + return ret; +} + +static char * +rqst_authname(struct svc_rqst *rqstp) +{ + struct auth_domain *clp; + + clp = rqstp->rq_gssclient ? 
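/*
 * Sketch of the retry loop in idmap_lookup() above: when cache_check()
 * reports -ETIMEDOUT, a newer entry for the same key may have been
 * installed in the meantime, so the key is looked up again and the check
 * is repeated only if a different entry comes back.  The cache_* helpers
 * and the entry type are hypothetical stand-ins for the sunrpc cache API.
 */
#include <errno.h>
#include <stddef.h>

struct entry_sketch;                                       /* stand-in for struct ent */
extern struct entry_sketch *cache_find(const void *key);   /* hypothetical */
extern int cache_validate(struct entry_sketch *e);         /* hypothetical */
extern void cache_release(struct entry_sketch *e);         /* hypothetical */

static int sketch_lookup(const void *key, struct entry_sketch **item)
{
        int ret;

        *item = cache_find(key);
        if (!*item)
                return -ENOMEM;
retry:
        ret = cache_validate(*item);
        if (ret == -ETIMEDOUT) {
                struct entry_sketch *prev = *item;

                *item = cache_find(key);        /* a newer entry may exist now */
                if (*item != prev)
                        goto retry;             /* it does: validate that one  */
                cache_release(*item);           /* it does not: give up        */
        }
        return ret;
}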
rqstp->rq_gssclient : rqstp->rq_client; + return clp->name; +} + +static __be32 +idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, + uid_t *id) +{ + struct ent *item, key = { + .type = type, + }; + int ret; + + if (namelen + 1 > sizeof(key.name)) + return nfserr_badowner; + memcpy(key.name, name, namelen); + key.name[namelen] = '\0'; + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); + if (ret == -ENOENT) + return nfserr_badowner; + if (ret) + return nfserrno(ret); + *id = item->id; + cache_put(&item->h, &nametoid_cache); + return 0; +} + +static int +idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) +{ + struct ent *item, key = { + .id = id, + .type = type, + }; + int ret; + + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + ret = idmap_lookup(rqstp, idtoname_lookup, &key, &idtoname_cache, &item); + if (ret == -ENOENT) + return sprintf(name, "%u", id); + if (ret) + return ret; + ret = strlen(item->name); + BUG_ON(ret > IDMAP_NAMESZ); + memcpy(name, item->name, ret); + cache_put(&item->h, &idtoname_cache); + return ret; +} + +__be32 +nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, + __u32 *id) +{ + return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); +} + +__be32 +nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, + __u32 *id) +{ + return idmap_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id); +} + +int +nfsd_map_uid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +{ + return idmap_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); +} + +int +nfsd_map_gid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +{ + return idmap_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); +} diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c new file mode 100644 index 00000000..d06a02c1 --- /dev/null +++ b/fs/nfsd/nfs4proc.c @@ -0,0 +1,1472 @@ +/* + * Server-side procedures for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include + +#include "cache.h" +#include "xdr4.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static u32 nfsd_attrmask[] = { + NFSD_WRITEABLE_ATTRS_WORD0, + NFSD_WRITEABLE_ATTRS_WORD1, + NFSD_WRITEABLE_ATTRS_WORD2 +}; + +static u32 nfsd41_ex_attrmask[] = { + NFSD_SUPPATTR_EXCLCREAT_WORD0, + NFSD_SUPPATTR_EXCLCREAT_WORD1, + NFSD_SUPPATTR_EXCLCREAT_WORD2 +}; + +static __be32 +check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + u32 *bmval, u32 *writable) +{ + struct dentry *dentry = cstate->current_fh.fh_dentry; + + /* + * Check about attributes are supported by the NFSv4 server or not. + * According to spec, unsupported attributes return ERR_ATTRNOTSUPP. + */ + if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) || + (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) || + (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) + return nfserr_attrnotsupp; + + /* + * Check FATTR4_WORD0_ACL can be supported + * in current environment or not. + */ + if (bmval[0] & FATTR4_WORD0_ACL) { + if (!IS_POSIXACL(dentry->d_inode)) + return nfserr_attrnotsupp; + } + + /* + * According to spec, read-only attributes return ERR_INVAL. + */ + if (writable) { + if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) || + (bmval[2] & ~writable[2])) + return nfserr_inval; + } + + return nfs_ok; +} + +static __be32 +nfsd4_check_open_attributes(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +{ + __be32 status = nfs_ok; + + if (open->op_create == NFS4_OPEN_CREATE) { + if (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED) + status = check_attr_support(rqstp, cstate, + open->op_bmval, nfsd_attrmask); + else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1) + status = check_attr_support(rqstp, cstate, + open->op_bmval, nfsd41_ex_attrmask); + } + + return status; +} + +static int +is_create_with_attrs(struct nfsd4_open *open) +{ + return open->op_create == NFS4_OPEN_CREATE + && (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED + || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1); +} + +/* + * if error occurs when setting the acl, just clear the acl bit + * in the returned attr bitmap. 
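/*
 * A compact sketch of the bitmap test in check_attr_support() above: any
 * requested attribute word containing bits outside the supported mask is
 * rejected outright, and, when a writable mask is supplied, bits outside
 * it are rejected as invalid.  The masks and return codes are illustrative
 * values, not the real nfsd_suppattrs or NFSD_WRITEABLE_ATTRS constants.
 */
#include <assert.h>
#include <stdint.h>

enum { SK_OK = 0, SK_ATTRNOTSUPP = 1, SK_INVAL = 2 };

static int sketch_check_attrs(const uint32_t bm[3], const uint32_t supp[3],
                              const uint32_t writable[3] /* may be NULL */)
{
        for (int i = 0; i < 3; i++)
                if (bm[i] & ~supp[i])
                        return SK_ATTRNOTSUPP;
        if (writable)
                for (int i = 0; i < 3; i++)
                        if (bm[i] & ~writable[i])
                                return SK_INVAL;
        return SK_OK;
}

int main(void)
{
        uint32_t supp[3]     = { 0x00ff, 0x00ff, 0 };
        uint32_t writable[3] = { 0x000f, 0x00ff, 0 };
        uint32_t ask[3]      = { 0x0030, 0x0000, 0 };

        assert(sketch_check_attrs(ask, supp, writable) == SK_INVAL);
        return 0;
}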
+ */ +static void +do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl, u32 *bmval) +{ + __be32 status; + + status = nfsd4_set_nfs4_acl(rqstp, fhp, acl); + if (status) + /* + * We should probably fail the whole open at this point, + * but we've already created the file, so it's too late; + * So this seems the least of evils: + */ + bmval[0] &= ~FATTR4_WORD0_ACL; +} + +static inline void +fh_dup2(struct svc_fh *dst, struct svc_fh *src) +{ + fh_put(dst); + dget(src->fh_dentry); + if (src->fh_export) + cache_get(&src->fh_export->h); + *dst = *src; +} + +static __be32 +do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode) +{ + __be32 status; + + if (open->op_truncate && + !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) + return nfserr_inval; + + accmode |= NFSD_MAY_READ_IF_EXEC; + + if (open->op_share_access & NFS4_SHARE_ACCESS_READ) + accmode |= NFSD_MAY_READ; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); + if (open->op_share_deny & NFS4_SHARE_DENY_READ) + accmode |= NFSD_MAY_WRITE; + + status = fh_verify(rqstp, current_fh, S_IFREG, accmode); + + return status; +} + +static __be32 +do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +{ + struct svc_fh resfh; + __be32 status; + int created = 0; + + fh_init(&resfh, NFS4_FHSIZE); + open->op_truncate = 0; + + if (open->op_create) { + /* FIXME: check session persistence and pnfs flags. + * The nfsv4.1 spec requires the following semantics: + * + * Persistent | pNFS | Server REQUIRED | Client Allowed + * Reply Cache | server | | + * -------------+--------+-----------------+-------------------- + * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * | | | (SHOULD) + * | | and EXCLUSIVE4 | or EXCLUSIVE4 + * | | | (SHOULD NOT) + * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * yes | no | GUARDED4 | GUARDED4 + * yes | yes | GUARDED4 | GUARDED4 + */ + + /* + * Note: create modes (UNCHECKED,GUARDED...) are the same + * in NFSv4 as in v3 except EXCLUSIVE4_1. 
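/*
 * Sketch of the verifier handling behind the EXCLUSIVE create modes
 * discussed above: the client supplies an 8-byte create verifier, and the
 * bitmask logic just below reports that it was stored in the new file's
 * time attributes so a retransmitted create can be recognized.  Splitting
 * the verifier across the timestamps as two 32-bit words (and which word
 * lands in which field) is an assumption about do_nfsd_create()'s
 * internals; the types here are simplified stand-ins.
 */
#include <stdint.h>
#include <string.h>

struct sketch_times { int64_t mtime_sec, atime_sec; };

static void sketch_store_verifier(struct sketch_times *t,
                                  const unsigned char verf[8])
{
        uint32_t v[2];

        memcpy(v, verf, sizeof(v));
        t->mtime_sec = v[0];            /* first verifier word  */
        t->atime_sec = v[1];            /* second verifier word */
}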
+ */ + status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, + open->op_fname.len, &open->op_iattr, + &resfh, open->op_createmode, + (u32 *)open->op_verf.data, + &open->op_truncate, &created); + + /* + * Following rfc 3530 14.2.16, use the returned bitmask + * to indicate which attributes we used to store the + * verifier: + */ + if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) + open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_MODIFY); + } else { + status = nfsd_lookup(rqstp, current_fh, + open->op_fname.data, open->op_fname.len, &resfh); + fh_unlock(current_fh); + } + if (status) + goto out; + + if (is_create_with_attrs(open) && open->op_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval); + + set_change_info(&open->op_cinfo, current_fh); + fh_dup2(current_fh, &resfh); + + /* set reply cache */ + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + &resfh.fh_handle); + if (!created) + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_NOP); + +out: + fh_put(&resfh); + return status; +} + +static __be32 +do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +{ + __be32 status; + + /* Only reclaims from previously confirmed clients are valid */ + if ((status = nfs4_check_open_reclaim(&open->op_clientid))) + return status; + + /* We don't know the target directory, and therefore can not + * set the change info + */ + + memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); + + /* set replay cache */ + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + ¤t_fh->fh_handle); + + open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && + (open->op_iattr.ia_size == 0); + + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_OWNER_OVERRIDE); + + return status; +} + +static void +copy_clientid(clientid_t *clid, struct nfsd4_session *session) +{ + struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)session->se_sessionid.data; + + clid->cl_boot = sid->clientid.cl_boot; + clid->cl_id = sid->clientid.cl_id; +} + +static __be32 +nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) +{ + __be32 status; + struct nfsd4_compoundres *resp; + + dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", + (int)open->op_fname.len, open->op_fname.data, + open->op_stateowner); + + /* This check required by spec. */ + if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) + return nfserr_inval; + + if (nfsd4_has_session(cstate)) + copy_clientid(&open->op_clientid, cstate->session); + + nfs4_lock_state(); + + /* check seqid for replay. set nfs4_owner */ + resp = rqstp->rq_resp; + status = nfsd4_process_open1(&resp->cstate, open); + if (status == nfserr_replay_me) { + struct nfs4_replay *rp = &open->op_stateowner->so_replay; + fh_put(&cstate->current_fh); + fh_copy_shallow(&cstate->current_fh.fh_handle, + &rp->rp_openfh); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + dprintk("nfsd4_open: replay failed" + " restoring previous filehandle\n"); + else + status = nfserr_replay_me; + } + if (status) + goto out; + + status = nfsd4_check_open_attributes(rqstp, cstate, open); + if (status) + goto out; + + /* Openowner is now set, so sequence id will get bumped. 
Now we need + * these checks before we do any creates: */ + status = nfserr_grace; + if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + goto out; + status = nfserr_no_grace; + if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + goto out; + + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_NULL: + /* + * (1) set CURRENT_FH to the file being opened, + * creating it if necessary, (2) set open->op_cinfo, + * (3) set open->op_truncate if the file is to be + * truncated after opening, (4) do permission checking. + */ + status = do_open_lookup(rqstp, &cstate->current_fh, + open); + if (status) + goto out; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + open->op_stateowner->so_confirmed = 1; + /* + * The CURRENT_FH is already set to the file being + * opened. (1) set open->op_cinfo, (2) set + * open->op_truncate if the file is to be truncated + * after opening, (3) do permission checking. + */ + status = do_open_fhandle(rqstp, &cstate->current_fh, + open); + if (status) + goto out; + break; + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + open->op_stateowner->so_confirmed = 1; + dprintk("NFSD: unsupported OPEN claim type %d\n", + open->op_claim_type); + status = nfserr_notsupp; + goto out; + default: + dprintk("NFSD: Invalid OPEN claim type %d\n", + open->op_claim_type); + status = nfserr_inval; + goto out; + } + /* + * nfsd4_process_open2() does the actual opening of the file. If + * successful, it (1) truncates the file if open->op_truncate was + * set, (2) sets open->op_stateid, (3) sets open->op_delegation. + */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); +out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); + cstate->replay_owner = open->op_stateowner; + } + nfs4_unlock_state(); + return status; +} + +/* + * filehandle-manipulating ops. 
+ */ +static __be32 +nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct svc_fh **getfh) +{ + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + + *getfh = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_putfh *putfh) +{ + fh_put(&cstate->current_fh); + cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; + memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, + putfh->pf_fhlen); + return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS); +} + +static __be32 +nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + __be32 status; + + fh_put(&cstate->current_fh); + status = exp_pseudoroot(rqstp, &cstate->current_fh); + return status; +} + +static __be32 +nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + if (!cstate->save_fh.fh_dentry) + return nfserr_restorefh; + + fh_dup2(&cstate->current_fh, &cstate->save_fh); + return nfs_ok; +} + +static __be32 +nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + + fh_dup2(&cstate->save_fh, &cstate->current_fh); + return nfs_ok; +} + +/* + * misc nfsv4 ops + */ +static __be32 +nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_access *access) +{ + if (access->ac_req_access & ~NFS3_ACCESS_FULL) + return nfserr_inval; + + access->ac_resp_access = access->ac_req_access; + return nfsd_access(rqstp, &cstate->current_fh, &access->ac_resp_access, + &access->ac_supported); +} + +static __be32 +nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) +{ + __be32 status; + + u32 *p = (u32 *)commit->co_verf.data; + *p++ = nfssvc_boot.tv_sec; + *p++ = nfssvc_boot.tv_usec; + + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) + status = nfserr_inval; + return status; +} + +static __be32 +nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_create *create) +{ + struct svc_fh resfh; + __be32 status; + dev_t rdev; + + fh_init(&resfh, NFS4_FHSIZE); + + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, + NFSD_MAY_CREATE); + if (status == nfserr_symlink) + status = nfserr_notdir; + if (status) + return status; + + status = check_attr_support(rqstp, cstate, create->cr_bmval, + nfsd_attrmask); + if (status) + return status; + + switch (create->cr_type) { + case NF4LNK: + /* ugh! we have to null-terminate the linktext, or + * vfs_symlink() will choke. it is always safe to + * null-terminate by brute force, since at worst we + * will overwrite the first byte of the create namelen + * in the XDR buffer, which has already been extracted + * during XDR decode. 
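/*
 * Sketch of the write/commit verifier convention used by nfsd4_commit()
 * above: the 8-byte verifier is simply the server boot time (seconds and
 * microseconds), so a client that sees the verifier change between WRITE
 * and COMMIT knows the server rebooted and unstable data may have been
 * lost.  A plain struct timeval stands in for nfssvc_boot here.
 */
#include <stdint.h>
#include <string.h>
#include <sys/time.h>

static void sketch_fill_verifier(unsigned char verf[8],
                                 const struct timeval *boot)
{
        uint32_t words[2] = { (uint32_t)boot->tv_sec, (uint32_t)boot->tv_usec };

        memcpy(verf, words, sizeof(words));
}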
+ */ + create->cr_linkname[create->cr_linklen] = 0; + + status = nfsd_symlink(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + create->cr_linkname, create->cr_linklen, + &resfh, &create->cr_iattr); + break; + + case NF4BLK: + rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); + if (MAJOR(rdev) != create->cr_specdata1 || + MINOR(rdev) != create->cr_specdata2) + return nfserr_inval; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFBLK, rdev, &resfh); + break; + + case NF4CHR: + rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); + if (MAJOR(rdev) != create->cr_specdata1 || + MINOR(rdev) != create->cr_specdata2) + return nfserr_inval; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr,S_IFCHR, rdev, &resfh); + break; + + case NF4SOCK: + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFSOCK, 0, &resfh); + break; + + case NF4FIFO: + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFIFO, 0, &resfh); + break; + + case NF4DIR: + create->cr_iattr.ia_valid &= ~ATTR_SIZE; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFDIR, 0, &resfh); + break; + + default: + status = nfserr_badtype; + } + + if (status) + goto out; + + if (create->cr_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, + create->cr_bmval); + + fh_unlock(&cstate->current_fh); + set_change_info(&create->cr_cinfo, &cstate->current_fh); + fh_dup2(&cstate->current_fh, &resfh); +out: + fh_put(&resfh); + return status; +} + +static __be32 +nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_getattr *getattr) +{ + __be32 status; + + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + return status; + + if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) + return nfserr_inval; + + getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + + getattr->ga_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_link *link) +{ + __be32 status = nfserr_nofilehandle; + + if (!cstate->save_fh.fh_dentry) + return status; + status = nfsd_link(rqstp, &cstate->current_fh, + link->li_name, link->li_namelen, &cstate->save_fh); + if (!status) + set_change_info(&link->li_cinfo, &cstate->current_fh); + return status; +} + +static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) +{ + struct svc_fh tmp_fh; + __be32 ret; + + fh_init(&tmp_fh, NFS4_FHSIZE); + ret = exp_pseudoroot(rqstp, &tmp_fh); + if (ret) + return ret; + if (tmp_fh.fh_dentry == fh->fh_dentry) { + fh_put(&tmp_fh); + return nfserr_noent; + } + fh_put(&tmp_fh); + return nfsd_lookup(rqstp, fh, "..", 2, fh); +} + +static __be32 +nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + return nfsd4_do_lookupp(rqstp, &cstate->current_fh); +} + +static __be32 +nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lookup *lookup) +{ + return nfsd_lookup(rqstp, &cstate->current_fh, + lookup->lo_name, lookup->lo_len, + &cstate->current_fh); +} + +static __be32 
+nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_read *read) +{ + __be32 status; + + /* no need to check permission - this will be done in nfsd_read() */ + + read->rd_filp = NULL; + if (read->rd_offset >= OFFSET_MAX) + return nfserr_inval; + + nfs4_lock_state(); + /* check stateid */ + if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid, + RD_STATE, &read->rd_filp))) { + dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); + goto out; + } + if (read->rd_filp) + get_file(read->rd_filp); + status = nfs_ok; +out: + nfs4_unlock_state(); + read->rd_rqstp = rqstp; + read->rd_fhp = &cstate->current_fh; + return status; +} + +static __be32 +nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_readdir *readdir) +{ + u64 cookie = readdir->rd_cookie; + static const nfs4_verifier zeroverf; + + /* no need to check permission - this will be done in nfsd_readdir() */ + + if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) + return nfserr_inval; + + readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + + if ((cookie == 1) || (cookie == 2) || + (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) + return nfserr_bad_cookie; + + readdir->rd_rqstp = rqstp; + readdir->rd_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_readlink(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_readlink *readlink) +{ + readlink->rl_rqstp = rqstp; + readlink->rl_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_remove *remove) +{ + __be32 status; + + if (locks_in_grace()) + return nfserr_grace; + status = nfsd_unlink(rqstp, &cstate->current_fh, 0, + remove->rm_name, remove->rm_namelen); + if (status == nfserr_symlink) + return nfserr_notdir; + if (!status) { + fh_unlock(&cstate->current_fh); + set_change_info(&remove->rm_cinfo, &cstate->current_fh); + } + return status; +} + +static __be32 +nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_rename *rename) +{ + __be32 status = nfserr_nofilehandle; + + if (!cstate->save_fh.fh_dentry) + return status; + if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags + & NFSEXP_NOSUBTREECHECK)) + return nfserr_grace; + status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, + rename->rn_snamelen, &cstate->current_fh, + rename->rn_tname, rename->rn_tnamelen); + + /* the underlying filesystem returns different error's than required + * by NFSv4. both save_fh and current_fh have been verified.. 
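+ * (isdir becomes exist; notdir becomes exist when both filehandles are
+ * directories; symlink becomes notdir.)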
*/ + if (status == nfserr_isdir) + status = nfserr_exist; + else if ((status == nfserr_notdir) && + (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && + S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) + status = nfserr_exist; + else if (status == nfserr_symlink) + status = nfserr_notdir; + + if (!status) { + set_change_info(&rename->rn_sinfo, &cstate->current_fh); + set_change_info(&rename->rn_tinfo, &cstate->save_fh); + } + return status; +} + +static __be32 +nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo *secinfo) +{ + struct svc_fh resfh; + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + fh_init(&resfh, NFS4_FHSIZE); + err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; + err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, + secinfo->si_name, secinfo->si_namelen, + &exp, &dentry); + if (err) + return err; + if (dentry->d_inode == NULL) { + exp_put(exp); + err = nfserr_noent; + } else + secinfo->si_exp = exp; + dput(dentry); + if (cstate->minorversion) + /* See rfc 5661 section 2.6.3.1.1.8 */ + fh_put(&cstate->current_fh); + return err; +} + +static __be32 +nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo_no_name *sin) +{ + __be32 err; + + switch (sin->sin_style) { + case NFS4_SECINFO_STYLE4_CURRENT_FH: + break; + case NFS4_SECINFO_STYLE4_PARENT: + err = nfsd4_do_lookupp(rqstp, &cstate->current_fh); + if (err) + return err; + break; + default: + return nfserr_inval; + } + exp_get(cstate->current_fh.fh_export); + sin->sin_exp = cstate->current_fh.fh_export; + fh_put(&cstate->current_fh); + return nfs_ok; +} + +static __be32 +nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setattr *setattr) +{ + __be32 status = nfs_ok; + int err; + + if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { + nfs4_lock_state(); + status = nfs4_preprocess_stateid_op(cstate, + &setattr->sa_stateid, WR_STATE, NULL); + nfs4_unlock_state(); + if (status) { + dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + return status; + } + } + err = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt); + if (err) + return nfserrno(err); + status = nfs_ok; + + status = check_attr_support(rqstp, cstate, setattr->sa_bmval, + nfsd_attrmask); + if (status) + goto out; + + if (setattr->sa_acl != NULL) + status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, + setattr->sa_acl); + if (status) + goto out; + status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, + 0, (time_t)0); +out: + mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt); + return status; +} + +static __be32 +nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_write *write) +{ + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; + u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + + /* no need to check permission - this will be done in nfsd_write() */ + + if (write->wr_offset >= OFFSET_MAX) + return nfserr_inval; + + nfs4_lock_state(); + status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp); + if (filp) + get_file(filp); + nfs4_unlock_state(); + + if (status) { + dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + return status; + } + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; + p = (u32 *)write->wr_verifier.data; + *p++ = nfssvc_boot.tv_sec; + *p++ = nfssvc_boot.tv_usec; + + status = nfsd_write(rqstp, 
&cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); + if (filp) + fput(filp); + + write->wr_bytes_written = cnt; + + if (status == nfserr_symlink) + status = nfserr_inval; + return status; +} + +/* This routine never returns NFS_OK! If there are no other errors, it + * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the + * attributes matched. VERIFY is implemented by mapping NFSERR_SAME + * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK. + */ +static __be32 +_nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 *buf, *p; + int count; + __be32 status; + + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + return status; + + status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL); + if (status) + return status; + + if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) + || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) + return nfserr_inval; + if (verify->ve_attrlen & 3) + return nfserr_inval; + + /* count in words: + * bitmap_len(1) + bitmap(2) + attr_len(1) = 4 + */ + count = 4 + (verify->ve_attrlen >> 2); + buf = kmalloc(count << 2, GFP_KERNEL); + if (!buf) + return nfserr_jukebox; + + status = nfsd4_encode_fattr(&cstate->current_fh, + cstate->current_fh.fh_export, + cstate->current_fh.fh_dentry, buf, + &count, verify->ve_bmval, + rqstp, 0); + + /* this means that nfsd4_encode_fattr() ran out of space */ + if (status == nfserr_resource && count == 0) + status = nfserr_not_same; + if (status) + goto out_kfree; + + /* skip bitmap */ + p = buf + 1 + ntohl(buf[0]); + status = nfserr_not_same; + if (ntohl(*p++) != verify->ve_attrlen) + goto out_kfree; + if (!memcmp(p, verify->ve_attrval, verify->ve_attrlen)) + status = nfserr_same; + +out_kfree: + kfree(buf); + return status; +} + +static __be32 +nfsd4_nverify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, verify); + return status == nfserr_not_same ? nfs_ok : status; +} + +static __be32 +nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, verify); + return status == nfserr_same ? nfs_ok : status; +} + +/* + * NULL call. + */ +static __be32 +nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +static inline void nfsd4_increment_op_stats(u32 opnum) +{ + if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) + nfsdstats.nfs4_opcount[opnum]++; +} + +typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, + void *); +enum nfsd4_op_flags { + ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ + ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ + ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ + /* For rfc 5661 section 2.6.3.1.1: */ + OP_HANDLES_WRONGSEC = 1 << 3, + OP_IS_PUTFH_LIKE = 1 << 4, +}; + +struct nfsd4_operation { + nfsd4op_func op_func; + u32 op_flags; + char *op_name; +}; + +static struct nfsd4_operation nfsd4_ops[]; + +static const char *nfsd4_op_name(unsigned opnum); + +/* + * Enforce NFSv4.1 COMPOUND ordering rules: + * + * Also note, enforced elsewhere: + * - SEQUENCE other than as first op results in + * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) 
+ * - BIND_CONN_TO_SESSION must be the only op in its compound. + * (Enforced in nfsd4_bind_conn_to_session().) + * - DESTROY_SESSION must be the final operation in a compound, if + * sessionid's in SEQUENCE and DESTROY_SESSION are the same. + * (Enforced in nfsd4_destroy_session().) + */ +static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) +{ + struct nfsd4_op *op = &args->ops[0]; + + /* These ordering requirements don't apply to NFSv4.0: */ + if (args->minorversion == 0) + return nfs_ok; + /* This is weird, but OK, not our problem: */ + if (args->opcnt == 0) + return nfs_ok; + if (op->status == nfserr_op_illegal) + return nfs_ok; + if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) + return nfserr_op_not_in_session; + if (op->opnum == OP_SEQUENCE) + return nfs_ok; + if (args->opcnt != 1) + return nfserr_not_only_op; + return nfs_ok; +} + +static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) +{ + return &nfsd4_ops[op->opnum]; +} + +static bool need_wrongsec_check(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_op *next = &argp->ops[resp->opcnt]; + struct nfsd4_operation *thisd; + struct nfsd4_operation *nextd; + + thisd = OPDESC(this); + /* + * Most ops check wronsec on our own; only the putfh-like ops + * have special rules. + */ + if (!(thisd->op_flags & OP_IS_PUTFH_LIKE)) + return false; + /* + * rfc 5661 2.6.3.1.1.6: don't bother erroring out a + * put-filehandle operation if we're not going to use the + * result: + */ + if (argp->opcnt == resp->opcnt) + return false; + + nextd = OPDESC(next); + /* + * Rest of 2.6.3.1.1: certain operations will return WRONGSEC + * errors themselves as necessary; others should check for them + * now: + */ + return !(nextd->op_flags & OP_HANDLES_WRONGSEC); +} + +/* + * COMPOUND call. + */ +static __be32 +nfsd4_proc_compound(struct svc_rqst *rqstp, + struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + struct nfsd4_operation *opdesc; + struct nfsd4_compound_state *cstate = &resp->cstate; + int slack_bytes; + __be32 status; + + resp->xbuf = &rqstp->rq_res; + resp->p = rqstp->rq_res.head[0].iov_base + + rqstp->rq_res.head[0].iov_len; + resp->tagp = resp->p; + /* reserve space for: taglen, tag, and opcnt */ + resp->p += 2 + XDR_QUADLEN(args->taglen); + resp->end = rqstp->rq_res.head[0].iov_base + PAGE_SIZE; + resp->taglen = args->taglen; + resp->tag = args->tag; + resp->opcnt = 0; + resp->rqstp = rqstp; + resp->cstate.minorversion = args->minorversion; + resp->cstate.replay_owner = NULL; + resp->cstate.session = NULL; + fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); + fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + /* + * Don't use the deferral mechanism for NFSv4; compounds make it + * too hard to avoid non-idempotency problems. + */ + rqstp->rq_usedeferral = 0; + + /* + * According to RFC3010, this takes precedence over all other errors. 
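+ * (RFC 3010 was the original NFSv4 specification, since obsoleted by
+ * RFC 3530; either way, the minorversion check below happens before any
+ * op in the compound is processed.)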
+ */ + status = nfserr_minor_vers_mismatch; + if (args->minorversion > nfsd_supported_minorversion) + goto out; + + status = nfs41_check_op_ordering(args); + if (status) { + op = &args->ops[0]; + op->status = status; + goto encode_op; + } + + while (!status && resp->opcnt < args->opcnt) { + op = &args->ops[resp->opcnt++]; + + dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", + resp->opcnt, args->opcnt, op->opnum, + nfsd4_op_name(op->opnum)); + /* + * The XDR decode routines may have pre-set op->status; + * for example, if there is a miscellaneous XDR error + * it will be set to nfserr_bad_xdr. + */ + if (op->status) + goto encode_op; + + /* We must be able to encode a successful response to + * this operation, with enough room left over to encode a + * failed response to the next operation. If we don't + * have enough room, fail with ERR_RESOURCE. + */ + slack_bytes = (char *)resp->end - (char *)resp->p; + if (slack_bytes < COMPOUND_SLACK_SPACE + + COMPOUND_ERR_SLACK_SPACE) { + BUG_ON(slack_bytes < COMPOUND_ERR_SLACK_SPACE); + op->status = nfserr_resource; + goto encode_op; + } + + opdesc = OPDESC(op); + + if (!cstate->current_fh.fh_dentry) { + if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { + op->status = nfserr_nofilehandle; + goto encode_op; + } + } else if (cstate->current_fh.fh_export->ex_fslocs.migrated && + !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { + op->status = nfserr_moved; + goto encode_op; + } + + if (opdesc->op_func) + op->status = opdesc->op_func(rqstp, cstate, &op->u); + else + BUG_ON(op->status == nfs_ok); + + if (!op->status && need_wrongsec_check(rqstp)) + op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp); + +encode_op: + /* Only from SEQUENCE */ + if (resp->cstate.status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } + if (op->status == nfserr_replay_me) { + op->replay = &cstate->replay_owner->so_replay; + nfsd4_encode_replay(resp, op); + status = op->status = op->replay->rp_status; + } else { + nfsd4_encode_operation(resp, op); + status = op->status; + } + + dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n", + args->ops, args->opcnt, resp->opcnt, op->opnum, + be32_to_cpu(status)); + + if (cstate->replay_owner) { + nfs4_put_stateowner(cstate->replay_owner); + cstate->replay_owner = NULL; + } + /* XXX Ugh, we need to get rid of this kind of special case: */ + if (op->opnum == OP_READ && op->u.read.rd_filp) + fput(op->u.read.rd_filp); + + nfsd4_increment_op_stats(op->opnum); + } + + resp->cstate.status = status; + fh_put(&resp->cstate.current_fh); + fh_put(&resp->cstate.save_fh); + BUG_ON(resp->cstate.replay_owner); +out: + nfsd4_release_compoundargs(args); + /* Reset deferral mechanism for RPC deferrals */ + rqstp->rq_usedeferral = 1; + dprintk("nfsv4 compound returned %d\n", ntohl(status)); + return status; +} + +static struct nfsd4_operation nfsd4_ops[] = { + [OP_ACCESS] = { + .op_func = (nfsd4op_func)nfsd4_access, + .op_name = "OP_ACCESS", + }, + [OP_CLOSE] = { + .op_func = (nfsd4op_func)nfsd4_close, + .op_name = "OP_CLOSE", + }, + [OP_COMMIT] = { + .op_func = (nfsd4op_func)nfsd4_commit, + .op_name = "OP_COMMIT", + }, + [OP_CREATE] = { + .op_func = (nfsd4op_func)nfsd4_create, + .op_name = "OP_CREATE", + }, + [OP_DELEGRETURN] = { + .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_name = "OP_DELEGRETURN", + }, + [OP_GETATTR] = { + .op_func = (nfsd4op_func)nfsd4_getattr, + .op_flags = ALLOWED_ON_ABSENT_FS, + .op_name = "OP_GETATTR", + }, + [OP_GETFH] = { + .op_func = 
(nfsd4op_func)nfsd4_getfh, + .op_name = "OP_GETFH", + }, + [OP_LINK] = { + .op_func = (nfsd4op_func)nfsd4_link, + .op_name = "OP_LINK", + }, + [OP_LOCK] = { + .op_func = (nfsd4op_func)nfsd4_lock, + .op_name = "OP_LOCK", + }, + [OP_LOCKT] = { + .op_func = (nfsd4op_func)nfsd4_lockt, + .op_name = "OP_LOCKT", + }, + [OP_LOCKU] = { + .op_func = (nfsd4op_func)nfsd4_locku, + .op_name = "OP_LOCKU", + }, + [OP_LOOKUP] = { + .op_func = (nfsd4op_func)nfsd4_lookup, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_LOOKUP", + }, + [OP_LOOKUPP] = { + .op_func = (nfsd4op_func)nfsd4_lookupp, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_LOOKUPP", + }, + [OP_NVERIFY] = { + .op_func = (nfsd4op_func)nfsd4_nverify, + .op_name = "OP_NVERIFY", + }, + [OP_OPEN] = { + .op_func = (nfsd4op_func)nfsd4_open, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_OPEN", + }, + [OP_OPEN_CONFIRM] = { + .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_name = "OP_OPEN_CONFIRM", + }, + [OP_OPEN_DOWNGRADE] = { + .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_name = "OP_OPEN_DOWNGRADE", + }, + [OP_PUTFH] = { + .op_func = (nfsd4op_func)nfsd4_putfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, + .op_name = "OP_PUTFH", + }, + [OP_PUTPUBFH] = { + .op_func = (nfsd4op_func)nfsd4_putrootfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, + .op_name = "OP_PUTPUBFH", + }, + [OP_PUTROOTFH] = { + .op_func = (nfsd4op_func)nfsd4_putrootfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, + .op_name = "OP_PUTROOTFH", + }, + [OP_READ] = { + .op_func = (nfsd4op_func)nfsd4_read, + .op_name = "OP_READ", + }, + [OP_READDIR] = { + .op_func = (nfsd4op_func)nfsd4_readdir, + .op_name = "OP_READDIR", + }, + [OP_READLINK] = { + .op_func = (nfsd4op_func)nfsd4_readlink, + .op_name = "OP_READLINK", + }, + [OP_REMOVE] = { + .op_func = (nfsd4op_func)nfsd4_remove, + .op_name = "OP_REMOVE", + }, + [OP_RENAME] = { + .op_name = "OP_RENAME", + .op_func = (nfsd4op_func)nfsd4_rename, + }, + [OP_RENEW] = { + .op_func = (nfsd4op_func)nfsd4_renew, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RENEW", + }, + [OP_RESTOREFH] = { + .op_func = (nfsd4op_func)nfsd4_restorefh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, + .op_name = "OP_RESTOREFH", + }, + [OP_SAVEFH] = { + .op_func = (nfsd4op_func)nfsd4_savefh, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SAVEFH", + }, + [OP_SECINFO] = { + .op_func = (nfsd4op_func)nfsd4_secinfo, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SECINFO", + }, + [OP_SETATTR] = { + .op_func = (nfsd4op_func)nfsd4_setattr, + .op_name = "OP_SETATTR", + }, + [OP_SETCLIENTID] = { + .op_func = (nfsd4op_func)nfsd4_setclientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_SETCLIENTID", + }, + [OP_SETCLIENTID_CONFIRM] = { + .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_SETCLIENTID_CONFIRM", + }, + [OP_VERIFY] = { + .op_func = (nfsd4op_func)nfsd4_verify, + .op_name = "OP_VERIFY", + }, + [OP_WRITE] = { + .op_func = (nfsd4op_func)nfsd4_write, + .op_name = "OP_WRITE", + }, + [OP_RELEASE_LOCKOWNER] = { + .op_func = (nfsd4op_func)nfsd4_release_lockowner, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RELEASE_LOCKOWNER", + }, + + /* NFSv4.1 operations */ + [OP_EXCHANGE_ID] = { + .op_func = (nfsd4op_func)nfsd4_exchange_id, + .op_flags = 
ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_EXCHANGE_ID", + }, + [OP_BIND_CONN_TO_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_BIND_CONN_TO_SESSION", + }, + [OP_CREATE_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_create_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_CREATE_SESSION", + }, + [OP_DESTROY_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_destroy_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_DESTROY_SESSION", + }, + [OP_SEQUENCE] = { + .op_func = (nfsd4op_func)nfsd4_sequence, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + }, + [OP_RECLAIM_COMPLETE] = { + .op_func = (nfsd4op_func)nfsd4_reclaim_complete, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, + [OP_SECINFO_NO_NAME] = { + .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SECINFO_NO_NAME", + }, +}; + +static const char *nfsd4_op_name(unsigned opnum) +{ + if (opnum < ARRAY_SIZE(nfsd4_ops)) + return nfsd4_ops[opnum].op_name; + return "unknown_operation"; +} + +#define nfsd4_voidres nfsd4_voidargs +struct nfsd4_voidargs { int dummy; }; + +/* + * TODO: At the present time, the NFSv4 server does not do XID caching + * of requests. Implementing XID caching would not be a serious problem, + * although it would require a mild change in interfaces since one + * doesn't know whether an NFSv4 request is idempotent until after the + * XDR decode. However, XID caching totally confuses pynfs (Peter + * Astrand's regression testsuite for NFSv4 servers), which reuses + * XID's liberally, so I've left it unimplemented until pynfs generates + * better XID's. + */ +static struct svc_procedure nfsd_procedures4[2] = { + [NFSPROC4_NULL] = { + .pc_func = (svc_procfunc) nfsd4_proc_null, + .pc_encode = (kxdrproc_t) nfs4svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd4_voidargs), + .pc_ressize = sizeof(struct nfsd4_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = 1, + }, + [NFSPROC4_COMPOUND] = { + .pc_func = (svc_procfunc) nfsd4_proc_compound, + .pc_decode = (kxdrproc_t) nfs4svc_decode_compoundargs, + .pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres, + .pc_argsize = sizeof(struct nfsd4_compoundargs), + .pc_ressize = sizeof(struct nfsd4_compoundres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = NFSD_BUFSIZE/4, + }, +}; + +struct svc_version nfsd_version4 = { + .vs_vers = 4, + .vs_nproc = 2, + .vs_proc = nfsd_procedures4, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS4_SVC_XDRSIZE, +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c new file mode 100644 index 00000000..be268148 --- /dev/null +++ b/fs/nfsd/nfs4recover.c @@ -0,0 +1,402 @@ +/* +* Copyright (c) 2004 The Regents of the University of Michigan. +* All rights reserved. +* +* Andy Adamson +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. 
Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include +#include +#include +#include +#include + +#include "nfsd.h" +#include "state.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* Globals */ +static struct file *rec_file; + +static int +nfs4_save_creds(const struct cred **original_creds) +{ + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = 0; + new->fsgid = 0; + *original_creds = override_creds(new); + put_cred(new); + return 0; +} + +static void +nfs4_reset_creds(const struct cred *original) +{ + revert_creds(original); +} + +static void +md5_to_hex(char *out, char *md5) +{ + int i; + + for (i=0; i<16; i++) { + unsigned char c = md5[i]; + + *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); + *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + } + *out = '\0'; +} + +__be32 +nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) +{ + struct xdr_netobj cksum; + struct hash_desc desc; + struct scatterlist sg; + __be32 status = nfserr_jukebox; + + dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", + clname->len, clname->data); + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) + goto out_no_tfm; + cksum.len = crypto_hash_digestsize(desc.tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) + goto out; + + sg_init_one(&sg, clname->data, clname->len); + + if (crypto_hash_digest(&desc, &sg, sg.length, cksum.data)) + goto out; + + md5_to_hex(dname, cksum.data); + + status = nfs_ok; +out: + kfree(cksum.data); + crypto_free_hash(desc.tfm); +out_no_tfm: + return status; +} + +int +nfsd4_create_clid_dir(struct nfs4_client *clp) +{ + const struct cred *original_cred; + char *dname = clp->cl_recdir; + struct dentry *dir, *dentry; + int status; + + dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); + + if (!rec_file || clp->cl_firststate) + return 0; + + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; + + dir = rec_file->f_path.dentry; + /* lock the parent */ + mutex_lock(&dir->d_inode->i_mutex); + + dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out_unlock; + } + status = -EEXIST; + if (dentry->d_inode) { + dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); + goto out_put; + } + status = mnt_want_write(rec_file->f_path.mnt); + if (status) + goto out_put; + status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); + mnt_drop_write(rec_file->f_path.mnt); +out_put: + dput(dentry); +out_unlock: + mutex_unlock(&dir->d_inode->i_mutex); + if (status == 
0) { + clp->cl_firststate = 1; + vfs_fsync(rec_file, 0); + } + nfs4_reset_creds(original_cred); + dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); + return status; +} + +typedef int (recdir_func)(struct dentry *, struct dentry *); + +struct name_list { + char name[HEXDIR_LEN]; + struct list_head list; +}; + +static int +nfsd4_build_namelist(void *arg, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct list_head *names = arg; + struct name_list *entry; + + if (namlen != HEXDIR_LEN - 1) + return 0; + entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); + if (entry == NULL) + return -ENOMEM; + memcpy(entry->name, name, HEXDIR_LEN - 1); + entry->name[HEXDIR_LEN - 1] = '\0'; + list_add(&entry->list, names); + return 0; +} + +static int +nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) +{ + const struct cred *original_cred; + struct file *filp; + LIST_HEAD(names); + struct name_list *entry; + struct dentry *dentry; + int status; + + if (!rec_file) + return 0; + + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; + + filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY, + current_cred()); + status = PTR_ERR(filp); + if (IS_ERR(filp)) + goto out; + status = vfs_readdir(filp, nfsd4_build_namelist, &names); + fput(filp); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + + dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + break; + } + status = f(dir, dentry); + dput(dentry); + if (status) + break; + list_del(&entry->list); + kfree(entry); + } + mutex_unlock(&dir->d_inode->i_mutex); +out: + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + list_del(&entry->list); + kfree(entry); + } + nfs4_reset_creds(original_cred); + return status; +} + +static int +nfsd4_unlink_clid_dir(char *name, int namlen) +{ + struct dentry *dir, *dentry; + int status; + + dprintk("NFSD: nfsd4_unlink_clid_dir. 
name %.*s\n", namlen, name); + + dir = rec_file->f_path.dentry; + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(name, dir, namlen); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out_unlock; + } + status = -ENOENT; + if (!dentry->d_inode) + goto out; + status = vfs_rmdir(dir->d_inode, dentry); +out: + dput(dentry); +out_unlock: + mutex_unlock(&dir->d_inode->i_mutex); + return status; +} + +void +nfsd4_remove_clid_dir(struct nfs4_client *clp) +{ + const struct cred *original_cred; + int status; + + if (!rec_file || !clp->cl_firststate) + return; + + status = mnt_want_write(rec_file->f_path.mnt); + if (status) + goto out; + clp->cl_firststate = 0; + + status = nfs4_save_creds(&original_cred); + if (status < 0) + goto out; + + status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); + nfs4_reset_creds(original_cred); + if (status == 0) + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); +out: + if (status) + printk("NFSD: Failed to remove expired client state directory" + " %.*s\n", HEXDIR_LEN, clp->cl_recdir); + return; +} + +static int +purge_old(struct dentry *parent, struct dentry *child) +{ + int status; + + if (nfs4_has_reclaimed_state(child->d_name.name, false)) + return 0; + + status = vfs_rmdir(parent->d_inode, child); + if (status) + printk("failed to remove client recovery directory %s\n", + child->d_name.name); + /* Keep trying, success or failure: */ + return 0; +} + +void +nfsd4_recdir_purge_old(void) { + int status; + + if (!rec_file) + return; + status = mnt_want_write(rec_file->f_path.mnt); + if (status) + goto out; + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old); + if (status == 0) + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); +out: + if (status) + printk("nfsd4: failed to purge old clients from recovery" + " directory %s\n", rec_file->f_path.dentry->d_name.name); +} + +static int +load_recdir(struct dentry *parent, struct dentry *child) +{ + if (child->d_name.len != HEXDIR_LEN - 1) { + printk("nfsd4: illegal name %s in recovery directory\n", + child->d_name.name); + /* Keep trying; maybe the others are OK: */ + return 0; + } + nfs4_client_to_reclaim(child->d_name.name); + return 0; +} + +int +nfsd4_recdir_load(void) { + int status; + + if (!rec_file) + return 0; + + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir); + if (status) + printk("nfsd4: failed loading clients from recovery" + " directory %s\n", rec_file->f_path.dentry->d_name.name); + return status; +} + +/* + * Hold reference to the recovery directory. + */ + +void +nfsd4_init_recdir(char *rec_dirname) +{ + const struct cred *original_cred; + int status; + + printk("NFSD: Using %s as the NFSv4 state recovery directory\n", + rec_dirname); + + BUG_ON(rec_file); + + status = nfs4_save_creds(&original_cred); + if (status < 0) { + printk("NFSD: Unable to change credentials to find recovery" + " directory: error %d\n", + status); + return; + } + + rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + if (IS_ERR(rec_file)) { + printk("NFSD: unable to find recovery directory %s\n", + rec_dirname); + rec_file = NULL; + } + + nfs4_reset_creds(original_cred); +} + +void +nfsd4_shutdown_recdir(void) +{ + if (!rec_file) + return; + fput(rec_file); + rec_file = NULL; +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c new file mode 100644 index 00000000..92f7eb7c --- /dev/null +++ b/fs/nfsd/nfs4state.c @@ -0,0 +1,4485 @@ +/* +* Copyright (c) 2001 The Regents of the University of Michigan. 
+* All rights reserved. +* +* Kendrick Smith +* Andy Adamson +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "xdr4.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* Globals */ +time_t nfsd4_lease = 90; /* default lease time */ +time_t nfsd4_grace = 90; +static time_t boot_time; +static u32 current_ownerid = 1; +static u32 current_fileid = 1; +static u32 current_delegid = 1; +static stateid_t zerostateid; /* bits all 0 */ +static stateid_t onestateid; /* bits all 1 */ +static u64 current_sessionid = 1; + +#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) +#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + +/* forward declarations */ +static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; +static void nfs4_set_recdir(char *recdir); + +/* Locking: */ + +/* Currently used for almost all code touching nfsv4 state: */ +static DEFINE_MUTEX(client_mutex); + +/* + * Currently used for the del_recall_lru and file hash table. 
In an + * effort to decrease the scope of the client_mutex, this spinlock may + * eventually cover more: + */ +static DEFINE_SPINLOCK(recall_lock); + +static struct kmem_cache *stateowner_slab = NULL; +static struct kmem_cache *file_slab = NULL; +static struct kmem_cache *stateid_slab = NULL; +static struct kmem_cache *deleg_slab = NULL; + +void +nfs4_lock_state(void) +{ + mutex_lock(&client_mutex); +} + +void +nfs4_unlock_state(void) +{ + mutex_unlock(&client_mutex); +} + +static inline u32 +opaque_hashval(const void *ptr, int nbytes) +{ + unsigned char *cptr = (unsigned char *) ptr; + + u32 x = 0; + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x; +} + +static struct list_head del_recall_lru; + +static inline void +put_nfs4_file(struct nfs4_file *fi) +{ + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { + list_del(&fi->fi_hash); + spin_unlock(&recall_lock); + iput(fi->fi_inode); + kmem_cache_free(file_slab, fi); + } +} + +static inline void +get_nfs4_file(struct nfs4_file *fi) +{ + atomic_inc(&fi->fi_ref); +} + +static int num_delegations; +unsigned int max_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for nfs4_stateowner */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +#define ownerid_hashval(id) \ + ((id) & OWNER_HASH_MASK) +#define ownerstr_hashval(clientid, ownername) \ + (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) + +static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) + +/* hash table for (open)nfs4_stateid */ +#define STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define file_hashval(x) \ + hash_ptr(x, FILE_HASH_BITS) +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; + +static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) +{ + BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); + atomic_inc(&fp->fi_access[oflag]); +} + +static void nfs4_file_get_access(struct nfs4_file *fp, int oflag) +{ + if (oflag == O_RDWR) { + __nfs4_file_get_access(fp, O_RDONLY); + __nfs4_file_get_access(fp, O_WRONLY); + } else + __nfs4_file_get_access(fp, oflag); +} + +static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) +{ + if (fp->fi_fds[oflag]) { + fput(fp->fi_fds[oflag]); + fp->fi_fds[oflag] = NULL; + } +} + +static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + if (atomic_dec_and_test(&fp->fi_access[oflag])) { + nfs4_file_put_fd(fp, oflag); + /* + * It's also safe to get rid of the RDWR open *if* + * we no longer have need of the other kind of access + * or if we already have the other kind of open: + */ + if (fp->fi_fds[1-oflag] + || atomic_read(&fp->fi_access[1 - oflag]) == 0) + nfs4_file_put_fd(fp, O_RDWR); + } +} + +static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + if (oflag == O_RDWR) { + __nfs4_file_put_access(fp, O_RDONLY); + __nfs4_file_put_access(fp, O_WRONLY); + } else + __nfs4_file_put_access(fp, oflag); +} + +static struct nfs4_delegation * +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) +{ + 
struct nfs4_delegation *dp; + struct nfs4_file *fp = stp->st_file; + + dprintk("NFSD alloc_init_deleg\n"); + /* + * Major work on the lease subsystem (for example, to support + * calbacks on stat) will be required before we can support + * write delegations properly. + */ + if (type != NFS4_OPEN_DELEGATE_READ) + return NULL; + if (fp->fi_had_conflict) + return NULL; + if (num_delegations > max_delegations) + return NULL; + dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); + if (dp == NULL) + return dp; + num_delegations++; + INIT_LIST_HEAD(&dp->dl_perfile); + INIT_LIST_HEAD(&dp->dl_perclnt); + INIT_LIST_HEAD(&dp->dl_recall_lru); + dp->dl_client = clp; + get_nfs4_file(fp); + dp->dl_file = fp; + dp->dl_type = type; + dp->dl_stateid.si_boot = boot_time; + dp->dl_stateid.si_stateownerid = current_delegid++; + dp->dl_stateid.si_fileid = 0; + dp->dl_stateid.si_generation = 0; + fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); + dp->dl_time = 0; + atomic_set(&dp->dl_count, 1); + INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); + return dp; +} + +void +nfs4_put_delegation(struct nfs4_delegation *dp) +{ + if (atomic_dec_and_test(&dp->dl_count)) { + dprintk("NFSD: freeing dp %p\n",dp); + put_nfs4_file(dp->dl_file); + kmem_cache_free(deleg_slab, dp); + num_delegations--; + } +} + +static void nfs4_put_deleg_lease(struct nfs4_file *fp) +{ + if (atomic_dec_and_test(&fp->fi_delegees)) { + vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); + fp->fi_lease = NULL; + fput(fp->fi_deleg_file); + fp->fi_deleg_file = NULL; + } +} + +/* Called under the state lock. */ +static void +unhash_delegation(struct nfs4_delegation *dp) +{ + list_del_init(&dp->dl_perclnt); + spin_lock(&recall_lock); + list_del_init(&dp->dl_perfile); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&recall_lock); + nfs4_put_deleg_lease(dp->dl_file); + nfs4_put_delegation(dp); +} + +/* + * SETCLIENTID state + */ + +/* client_lock protects the client lru list and session hash table */ +static DEFINE_SPINLOCK(client_lock); + +/* Hash tables for nfs4_clientid state */ +#define CLIENT_HASH_BITS 4 +#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) +#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) + +#define clientid_hashval(id) \ + ((id) & CLIENT_HASH_MASK) +#define clientstr_hashval(name) \ + (opaque_hashval((name), 8) & CLIENT_HASH_MASK) +/* + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * + * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed + * setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. + * + * client_lru holds client queue ordered by nfs4_client.cl_time + * for lease renewal. + * + * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time + * for last close replay. + */ +static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE]; +static int reclaim_str_hashtbl_size = 0; +static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE]; +static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE]; +static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE]; +static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; +static struct list_head client_lru; +static struct list_head close_lru; + +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. 
This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. + * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + */ +static void +set_access(unsigned int *access, unsigned long bmap) { + int i; + + *access = 0; + for (i = 1; i < 4; i++) { + if (test_bit(i, &bmap)) + *access |= i; + } +} + +static void +set_deny(unsigned int *deny, unsigned long bmap) { + int i; + + *deny = 0; + for (i = 0; i < 4; i++) { + if (test_bit(i, &bmap)) + *deny |= i ; + } +} + +static int +test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { + unsigned int access, deny; + + set_access(&access, stp->st_access_bmap); + set_deny(&deny, stp->st_deny_bmap); + if ((access & open->op_share_deny) || (deny & open->op_share_access)) + return 0; + return 1; +} + +static int nfs4_access_to_omode(u32 access) +{ + switch (access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_READ: + return O_RDONLY; + case NFS4_SHARE_ACCESS_WRITE: + return O_WRONLY; + case NFS4_SHARE_ACCESS_BOTH: + return O_RDWR; + } + BUG(); +} + +static void unhash_generic_stateid(struct nfs4_stateid *stp) +{ + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); +} + +static void free_generic_stateid(struct nfs4_stateid *stp) +{ + int i; + + if (stp->st_access_bmap) { + for (i = 1; i < 4; i++) { + if (test_bit(i, &stp->st_access_bmap)) + nfs4_file_put_access(stp->st_file, + nfs4_access_to_omode(i)); + } + } + put_nfs4_file(stp->st_file); + kmem_cache_free(stateid_slab, stp); +} + +static void release_lock_stateid(struct nfs4_stateid *stp) +{ + struct file *file; + + unhash_generic_stateid(stp); + file = find_any_file(stp->st_file); + if (file) + locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); + free_generic_stateid(stp); +} + +static void unhash_lockowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + list_del(&sop->so_perstateid); + while (!list_empty(&sop->so_stateids)) { + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_lock_stateid(stp); + } +} + +static void release_lockowner(struct nfs4_stateowner *sop) +{ + unhash_lockowner(sop); + nfs4_put_stateowner(sop); +} + +static void +release_stateid_lockowners(struct nfs4_stateid *open_stp) +{ + struct nfs4_stateowner *lock_sop; + + while (!list_empty(&open_stp->st_lockowners)) { + lock_sop = list_entry(open_stp->st_lockowners.next, + struct nfs4_stateowner, so_perstateid); + /* list_del(&open_stp->st_lockowners); */ + BUG_ON(lock_sop->so_is_open_owner); + release_lockowner(lock_sop); + } +} + +static void release_open_stateid(struct nfs4_stateid *stp) +{ + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); + free_generic_stateid(stp); +} + +static void unhash_openowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + list_del(&sop->so_perclient); + list_del(&sop->so_perstateid); /* XXX: necessary? 
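+ * (probably not: so_perstateid only ever gets linked for lock owners,
+ * so for an open owner this should effectively be a no-op)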
*/ + while (!list_empty(&sop->so_stateids)) { + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_open_stateid(stp); + } +} + +static void release_openowner(struct nfs4_stateowner *sop) +{ + unhash_openowner(sop); + list_del(&sop->so_close_lru); + nfs4_put_stateowner(sop); +} + +#define SESSION_HASH_SIZE 512 +static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; + +static inline int +hash_sessionid(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid; + + return sid->sequence % SESSION_HASH_SIZE; +} + +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ + u32 *ptr = (u32 *)(&sessionid->data[0]); + dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); +} + +static void +gen_sessionid(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_sessionid *sid; + + sid = (struct nfsd4_sessionid *)ses->se_sessionid.data; + sid->clientid = clp->cl_clientid; + sid->sequence = current_sessionid++; + sid->reserved = 0; +} + +/* + * The protocol defines ca_maxresponssize_cached to include the size of + * the rpc header, but all we need to cache is the data starting after + * the end of the initial SEQUENCE operation--the rest we regenerate + * each time. Therefore we can advertise a ca_maxresponssize_cached + * value that is the number of bytes in our cache plus a few additional + * bytes. In order to stay on the safe side, and not promise more than + * we can cache, those additional bytes must be the minimum possible: 24 + * bytes of rpc header (xid through accept state, with AUTH_NULL + * verifier), 12 for the compound header (with zero-length tag), and 44 + * for the SEQUENCE op response: + */ +#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) + +static void +free_session_slots(struct nfsd4_session *ses) +{ + int i; + + for (i = 0; i < ses->se_fchannel.maxreqs; i++) + kfree(ses->se_slots[i]); +} + +/* + * We don't actually need to cache the rpc and session headers, so we + * can allocate a little less for each slot: + */ +static inline int slot_bytes(struct nfsd4_channel_attrs *ca) +{ + return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; +} + +static int nfsd4_sanitize_slot_size(u32 size) +{ + size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */ + size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE); + + return size; +} + +/* + * XXX: If we run out of reserved DRC memory we could (up to a point) + * re-negotiate active sessions and reduce their slot usage to make + * rooom for new connections. For now we just fail the create session. 
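+ * (nfsd4_get_drc_mem() below clamps the requested slot count to whatever
+ * still fits in the global reservation; alloc_init_session() gives up
+ * only if not even one slot fits.)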
+ */ +static int nfsd4_get_drc_mem(int slotsize, u32 num) +{ + int avail; + + num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); + + spin_lock(&nfsd_drc_lock); + avail = min_t(int, NFSD_MAX_MEM_PER_SESSION, + nfsd_drc_max_mem - nfsd_drc_mem_used); + num = min_t(int, num, avail / slotsize); + nfsd_drc_mem_used += num * slotsize; + spin_unlock(&nfsd_drc_lock); + + return num; +} + +static void nfsd4_put_drc_mem(int slotsize, int num) +{ + spin_lock(&nfsd_drc_lock); + nfsd_drc_mem_used -= slotsize * num; + spin_unlock(&nfsd_drc_lock); +} + +static struct nfsd4_session *alloc_session(int slotsize, int numslots) +{ + struct nfsd4_session *new; + int mem, i; + + BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) + + sizeof(struct nfsd4_session) > PAGE_SIZE); + mem = numslots * sizeof(struct nfsd4_slot *); + + new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + if (!new) + return NULL; + /* allocate each struct nfsd4_slot and data cache in one piece */ + for (i = 0; i < numslots; i++) { + mem = sizeof(struct nfsd4_slot) + slotsize; + new->se_slots[i] = kzalloc(mem, GFP_KERNEL); + if (!new->se_slots[i]) + goto out_free; + } + return new; +out_free: + while (i--) + kfree(new->se_slots[i]); + kfree(new); + return NULL; +} + +static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize) +{ + u32 maxrpc = nfsd_serv->sv_max_mesg; + + new->maxreqs = numslots; + new->maxresp_cached = min_t(u32, req->maxresp_cached, + slotsize + NFSD_MIN_HDR_SEQ_SZ); + new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc); + new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc); + new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND); +} + +static void free_conn(struct nfsd4_conn *c) +{ + svc_xprt_put(c->cn_xprt); + kfree(c); +} + +static void nfsd4_conn_lost(struct svc_xpt_user *u) +{ + struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); + struct nfs4_client *clp = c->cn_session->se_client; + + spin_lock(&clp->cl_lock); + if (!list_empty(&c->cn_persession)) { + list_del(&c->cn_persession); + free_conn(c); + } + spin_unlock(&clp->cl_lock); + nfsd4_probe_callback(clp); +} + +static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) +{ + struct nfsd4_conn *conn; + + conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL); + if (!conn) + return NULL; + svc_xprt_get(rqstp->rq_xprt); + conn->cn_xprt = rqstp->rq_xprt; + conn->cn_flags = flags; + INIT_LIST_HEAD(&conn->cn_xpt_user.list); + return conn; +} + +static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + conn->cn_session = ses; + list_add(&conn->cn_persession, &ses->se_conns); +} + +static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + + spin_lock(&clp->cl_lock); + __nfsd4_hash_conn(conn, ses); + spin_unlock(&clp->cl_lock); +} + +static int nfsd4_register_conn(struct nfsd4_conn *conn) +{ + conn->cn_xpt_user.callback = nfsd4_conn_lost; + return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); +} + +static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir) +{ + struct nfsd4_conn *conn; + int ret; + + conn = alloc_conn(rqstp, dir); + if (!conn) + return nfserr_jukebox; + nfsd4_hash_conn(conn, ses); + ret = nfsd4_register_conn(conn); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&conn->cn_xpt_user); + return nfs_ok; +} + +static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct 
nfsd4_session *ses) +{ + u32 dir = NFS4_CDFC4_FORE; + + if (ses->se_flags & SESSION4_BACK_CHAN) + dir |= NFS4_CDFC4_BACK; + + return nfsd4_new_conn(rqstp, ses, dir); +} + +/* must be called under client_lock */ +static void nfsd4_del_conns(struct nfsd4_session *s) +{ + struct nfs4_client *clp = s->se_client; + struct nfsd4_conn *c; + + spin_lock(&clp->cl_lock); + while (!list_empty(&s->se_conns)) { + c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession); + list_del_init(&c->cn_persession); + spin_unlock(&clp->cl_lock); + + unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user); + free_conn(c); + + spin_lock(&clp->cl_lock); + } + spin_unlock(&clp->cl_lock); +} + +void free_session(struct kref *kref) +{ + struct nfsd4_session *ses; + int mem; + + ses = container_of(kref, struct nfsd4_session, se_ref); + nfsd4_del_conns(ses); + spin_lock(&nfsd_drc_lock); + mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel); + nfsd_drc_mem_used -= mem; + spin_unlock(&nfsd_drc_lock); + free_session_slots(ses); + kfree(ses); +} + +static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) +{ + struct nfsd4_session *new; + struct nfsd4_channel_attrs *fchan = &cses->fore_channel; + int numslots, slotsize; + int status; + int idx; + + /* + * Note decreasing slot size below client's request may + * make it difficult for client to function correctly, whereas + * decreasing the number of slots will (just?) affect + * performance. When short on memory we therefore prefer to + * decrease number of slots instead of their size. + */ + slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); + numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); + if (numslots < 1) + return NULL; + + new = alloc_session(slotsize, numslots); + if (!new) { + nfsd4_put_drc_mem(slotsize, fchan->maxreqs); + return NULL; + } + init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize); + + new->se_client = clp; + gen_sessionid(new); + + INIT_LIST_HEAD(&new->se_conns); + + new->se_cb_seq_nr = 1; + new->se_flags = cses->flags; + new->se_cb_prog = cses->callback_prog; + kref_init(&new->se_ref); + idx = hash_sessionid(&new->se_sessionid); + spin_lock(&client_lock); + list_add(&new->se_hash, &sessionid_hashtbl[idx]); + spin_lock(&clp->cl_lock); + list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&clp->cl_lock); + spin_unlock(&client_lock); + + status = nfsd4_new_conn_from_crses(rqstp, new); + /* whoops: benny points out, status is ignored! (err, or bogus) */ + if (status) { + free_session(&new->se_ref); + return NULL; + } + if (cses->flags & SESSION4_BACK_CHAN) { + struct sockaddr *sa = svc_addr(rqstp); + /* + * This is a little silly; with sessions there's no real + * use for the callback address. 
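+	 * (the v4.1 backchannel runs over one of the session's own
+	 * connections instead.)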
Use the peer address + * as a reasonable default for now, but consider fixing + * the rpc client not to require an address in the + * future: + */ + rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); + clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); + } + nfsd4_probe_callback(clp); + return new; +} + +/* caller must hold client_lock */ +static struct nfsd4_session * +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_session *elem; + int idx; + + dump_sessionid(__func__, sessionid); + idx = hash_sessionid(sessionid); + /* Search in the appropriate list */ + list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { + if (!memcmp(elem->se_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + return elem; + } + } + + dprintk("%s: session not found\n", __func__); + return NULL; +} + +/* caller must hold client_lock */ +static void +unhash_session(struct nfsd4_session *ses) +{ + list_del(&ses->se_hash); + spin_lock(&ses->se_client->cl_lock); + list_del(&ses->se_perclnt); + spin_unlock(&ses->se_client->cl_lock); +} + +/* must be called under the client_lock */ +static inline void +renew_client_locked(struct nfs4_client *clp) +{ + if (is_client_expired(clp)) { + dprintk("%s: client (clientid %08x/%08x) already expired\n", + __func__, + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + return; + } + + /* + * Move client to the end of the LRU list. + */ + dprintk("renewing client (clientid %08x/%08x)\n", + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + list_move_tail(&clp->cl_lru, &client_lru); + clp->cl_time = get_seconds(); +} + +static inline void +renew_client(struct nfs4_client *clp) +{ + spin_lock(&client_lock); + renew_client_locked(clp); + spin_unlock(&client_lock); +} + +/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ +static int +STALE_CLIENTID(clientid_t *clid) +{ + if (clid->cl_boot == boot_time) + return 0; + dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", + clid->cl_boot, clid->cl_id, boot_time); + return 1; +} + +/* + * XXX Should we use a slab cache ? + * This type of memory management is somewhat inefficient, but we use it + * anyway since SETCLIENTID is not a common operation.
+ */ +static struct nfs4_client *alloc_client(struct xdr_netobj name) +{ + struct nfs4_client *clp; + + clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); + if (clp == NULL) + return NULL; + clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); + if (clp->cl_name.data == NULL) { + kfree(clp); + return NULL; + } + memcpy(clp->cl_name.data, name.data, name.len); + clp->cl_name.len = name.len; + return clp; +} + +static inline void +free_client(struct nfs4_client *clp) +{ + while (!list_empty(&clp->cl_sessions)) { + struct nfsd4_session *ses; + ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, + se_perclnt); + list_del(&ses->se_perclnt); + nfsd4_put_session(ses); + } + if (clp->cl_cred.cr_group_info) + put_group_info(clp->cl_cred.cr_group_info); + kfree(clp->cl_principal); + kfree(clp->cl_name.data); + kfree(clp); +} + +void +release_session_client(struct nfsd4_session *session) +{ + struct nfs4_client *clp = session->se_client; + + if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock)) + return; + if (is_client_expired(clp)) { + free_client(clp); + session->se_client = NULL; + } else + renew_client_locked(clp); + spin_unlock(&client_lock); +} + +/* must be called under the client_lock */ +static inline void +unhash_client_locked(struct nfs4_client *clp) +{ + struct nfsd4_session *ses; + + mark_client_expired(clp); + list_del(&clp->cl_lru); + spin_lock(&clp->cl_lock); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + list_del_init(&ses->se_hash); + spin_unlock(&clp->cl_lock); +} + +static void +expire_client(struct nfs4_client *clp) +{ + struct nfs4_stateowner *sop; + struct nfs4_delegation *dp; + struct list_head reaplist; + + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { + dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); + list_del_init(&dp->dl_perclnt); + list_move(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&recall_lock); + while (!list_empty(&reaplist)) { + dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } + while (!list_empty(&clp->cl_openowners)) { + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } + nfsd4_shutdown_callback(clp); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + list_del(&clp->cl_idhash); + list_del(&clp->cl_strhash); + spin_lock(&client_lock); + unhash_client_locked(clp); + if (atomic_read(&clp->cl_refcount) == 0) + free_client(clp); + spin_unlock(&client_lock); +} + +static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) +{ + memcpy(target->cl_verifier.data, source->data, + sizeof(target->cl_verifier.data)); +} + +static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) +{ + target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; + target->cl_clientid.cl_id = source->cl_clientid.cl_id; +} + +static void copy_cred(struct svc_cred *target, struct svc_cred *source) +{ + target->cr_uid = source->cr_uid; + target->cr_gid = source->cr_gid; + target->cr_group_info = source->cr_group_info; + get_group_info(target->cr_group_info); +} + +static int same_name(const char *n1, const char *n2) +{ + return 0 == memcmp(n1, n2, HEXDIR_LEN); +} + +static int +same_verf(nfs4_verifier *v1, nfs4_verifier *v2) +{ + return 0 == memcmp(v1->data, v2->data, sizeof(v1->data)); +} + +static int +same_clid(clientid_t *cl1, clientid_t *cl2) +{ + return (cl1->cl_boot == 
cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); +} + +/* XXX what about NGROUP */ +static int +same_creds(struct svc_cred *cr1, struct svc_cred *cr2) +{ + return cr1->cr_uid == cr2->cr_uid; +} + +static void gen_clid(struct nfs4_client *clp) +{ + static u32 current_clientid = 1; + + clp->cl_clientid.cl_boot = boot_time; + clp->cl_clientid.cl_id = current_clientid++; +} + +static void gen_confirm(struct nfs4_client *clp) +{ + static u32 i; + u32 *p; + + p = (u32 *)clp->cl_confirm.data; + *p++ = get_seconds(); + *p++ = i++; +} + +static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, + struct svc_rqst *rqstp, nfs4_verifier *verf) +{ + struct nfs4_client *clp; + struct sockaddr *sa = svc_addr(rqstp); + char *princ; + + clp = alloc_client(name); + if (clp == NULL) + return NULL; + + INIT_LIST_HEAD(&clp->cl_sessions); + + princ = svc_gss_principal(rqstp); + if (princ) { + clp->cl_principal = kstrdup(princ, GFP_KERNEL); + if (clp->cl_principal == NULL) { + free_client(clp); + return NULL; + } + } + + memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); + atomic_set(&clp->cl_refcount, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + spin_lock_init(&clp->cl_lock); + INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); + clp->cl_time = get_seconds(); + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); + copy_verf(clp, verf); + rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); + clp->cl_flavor = rqstp->rq_flavor; + copy_cred(&clp->cl_cred, &rqstp->rq_cred); + gen_confirm(clp); + clp->cl_cb_session = NULL; + return clp; +} + +static int check_name(struct xdr_netobj name) +{ + if (name.len == 0) + return 0; + if (name.len > NFS4_OPAQUE_LIMIT) { + dprintk("NFSD: check_name: name too long(%d)!\n", name.len); + return 0; + } + return 1; +} + +static void +add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) +{ + unsigned int idhashval; + + list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); + idhashval = clientid_hashval(clp->cl_clientid.cl_id); + list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); + renew_client(clp); +} + +static void +move_to_confirmed(struct nfs4_client *clp) +{ + unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); + unsigned int strhashval; + + dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + strhashval = clientstr_hashval(clp->cl_recdir); + list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); + renew_client(clp); +} + +static struct nfs4_client * +find_confirmed_client(clientid_t *clid) +{ + struct nfs4_client *clp; + unsigned int idhashval = clientid_hashval(clid->cl_id); + + list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { + if (same_clid(&clp->cl_clientid, clid)) + return clp; + } + return NULL; +} + +static struct nfs4_client * +find_unconfirmed_client(clientid_t *clid) +{ + struct nfs4_client *clp; + unsigned int idhashval = clientid_hashval(clid->cl_id); + + list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { + if (same_clid(&clp->cl_clientid, clid)) + return clp; + } + return NULL; +} + +static bool clp_used_exchangeid(struct nfs4_client *clp) +{ + return clp->cl_exchange_flags != 0; +} + +static struct nfs4_client * 
+find_confirmed_client_by_str(const char *dname, unsigned int hashval) +{ + struct nfs4_client *clp; + + list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { + if (same_name(clp->cl_recdir, dname)) + return clp; + } + return NULL; +} + +static struct nfs4_client * +find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) +{ + struct nfs4_client *clp; + + list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { + if (same_name(clp->cl_recdir, dname)) + return clp; + } + return NULL; +} + +static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) +{ + switch (family) { + case AF_INET: + ((struct sockaddr_in *)sa)->sin_family = AF_INET; + ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; + return; + case AF_INET6: + ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; + ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; + return; + } +} + +static void +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) +{ + struct nfs4_cb_conn *conn = &clp->cl_cb_conn; + struct sockaddr *sa = svc_addr(rqstp); + u32 scopeid = rpc_get_scope_id(sa); + unsigned short expected_family; + + /* Currently, we only support tcp and tcp6 for the callback channel */ + if (se->se_callback_netid_len == 3 && + !memcmp(se->se_callback_netid_val, "tcp", 3)) + expected_family = AF_INET; + else if (se->se_callback_netid_len == 4 && + !memcmp(se->se_callback_netid_val, "tcp6", 4)) + expected_family = AF_INET6; + else + goto out_err; + + conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, + se->se_callback_addr_len, + (struct sockaddr *)&conn->cb_addr, + sizeof(conn->cb_addr)); + + if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family) + goto out_err; + + if (conn->cb_addr.ss_family == AF_INET6) + ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid; + + conn->cb_prog = se->se_callback_prog; + conn->cb_ident = se->se_callback_ident; + rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); + return; +out_err: + conn->cb_addr.ss_family = AF_UNSPEC; + conn->cb_addrlen = 0; + dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " + "will not receive delegations\n", + clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); + + return; +} + +/* + * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size. + */ +void +nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) +{ + struct nfsd4_slot *slot = resp->cstate.slot; + unsigned int base; + + dprintk("--> %s slot %p\n", __func__, slot); + + slot->sl_opcnt = resp->opcnt; + slot->sl_status = resp->cstate.status; + + if (nfsd4_not_cached(resp)) { + slot->sl_datalen = 0; + return; + } + slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap; + base = (char *)resp->cstate.datap - + (char *)resp->xbuf->head[0].iov_base; + if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data, + slot->sl_datalen)) + WARN("%s: sessions DRC could not cache compound\n", __func__); + return; +} + +/* + * Encode the replay sequence operation from the slot values. + * If cachethis is FALSE encode the uncached rep error on the next + * operation which sets resp->p and increments resp->opcnt for + * nfs4svc_encode_compoundres. 
+ * + */ +static __be32 +nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + struct nfsd4_slot *slot = resp->cstate.slot; + + dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__, + resp->opcnt, resp->cstate.slot->sl_cachethis); + + /* Encode the replayed sequence operation */ + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); + + /* Return nfserr_retry_uncached_rep in next operation. */ + if (args->opcnt > 1 && slot->sl_cachethis == 0) { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + } + return op->status; +} + +/* + * The sequence operation is not cached because we can use the slot and + * session values. + */ +__be32 +nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq) +{ + struct nfsd4_slot *slot = resp->cstate.slot; + __be32 status; + + dprintk("--> %s slot %p\n", __func__, slot); + + /* Either returns 0 or nfserr_retry_uncached */ + status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); + if (status == nfserr_retry_uncached_rep) + return status; + + /* The sequence operation has been encoded, cstate->datap set. */ + memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen); + + resp->opcnt = slot->sl_opcnt; + resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen); + status = slot->sl_status; + + return status; +} + +/* + * Set the exchange_id flags returned by the server. + */ +static void +nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) +{ + /* pNFS is not supported */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; + + /* Referrals are supported, Migration is not. */ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; + + /* set the wire flags to return to client. */ + clid->flags = new->cl_exchange_flags; +} + +__be32 +nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_exchange_id *exid) +{ + struct nfs4_client *unconf, *conf, *new; + int status; + unsigned int strhashval; + char dname[HEXDIR_LEN]; + char addr_str[INET6_ADDRSTRLEN]; + nfs4_verifier verf = exid->verifier; + struct sockaddr *sa = svc_addr(rqstp); + + rpc_ntop(sa, addr_str, sizeof(addr_str)); + dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " + "ip_addr=%s flags %x, spa_how %d\n", + __func__, rqstp, exid, exid->clname.len, exid->clname.data, + addr_str, exid->flags, exid->spa_how); + + if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + return nfserr_inval; + + /* Currently only support SP4_NONE */ + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_SSV: + return nfserr_serverfault; + default: + BUG(); /* checked by xdr code */ + case SP4_MACH_CRED: + return nfserr_serverfault; /* no excuse :-/ */ + } + + status = nfs4_make_rec_clidname(dname, &exid->clname); + + if (status) + goto error; + + strhashval = clientstr_hashval(dname); + + nfs4_lock_state(); + status = nfs_ok; + + conf = find_confirmed_client_by_str(dname, strhashval); + if (conf) { + if (!clp_used_exchangeid(conf)) { + status = nfserr_clid_inuse; /* XXX: ? 
*/ + goto out; + } + if (!same_verf(&verf, &conf->cl_verifier)) { + /* 18.35.4 case 8 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_not_same; + goto out; + } + /* Client reboot: destroy old state */ + expire_client(conf); + goto out_new; + } + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + /* 18.35.4 case 9 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_perm; + goto out; + } + expire_client(conf); + goto out_new; + } + /* + * Set bit when the owner id and verifier map to an already + * confirmed client id (18.35.3). + */ + exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + + /* + * Falling into 18.35.4 case 2, possible router replay. + * Leave confirmed record intact and return same result. + */ + copy_verf(conf, &verf); + new = conf; + goto out_copy; + } + + /* 18.35.4 case 7 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_noent; + goto out; + } + + unconf = find_unconfirmed_client_by_str(dname, strhashval); + if (unconf) { + /* + * Possible retry or client restart. Per 18.35.4 case 4, + * a new unconfirmed record should be generated regardless + * of whether any properties have changed. + */ + expire_client(unconf); + } + +out_new: + /* Normal case */ + new = create_client(exid->clname, dname, rqstp, &verf); + if (new == NULL) { + status = nfserr_jukebox; + goto out; + } + + gen_clid(new); + add_to_unconfirmed(new, strhashval); +out_copy: + exid->clientid.cl_boot = new->cl_clientid.cl_boot; + exid->clientid.cl_id = new->cl_clientid.cl_id; + + exid->seqid = 1; + nfsd4_set_ex_flags(new, exid); + + dprintk("nfsd4_exchange_id seqid %d flags %x\n", + new->cl_cs_slot.sl_seqid, new->cl_exchange_flags); + status = nfs_ok; + +out: + nfs4_unlock_state(); +error: + dprintk("nfsd4_exchange_id returns %d\n", ntohl(status)); + return status; +} + +static int +check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) +{ + dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, + slot_seqid); + + /* The slot is in use, and no response has been sent. */ + if (slot_inuse) { + if (seqid == slot_seqid) + return nfserr_jukebox; + else + return nfserr_seq_misordered; + } + /* Normal */ + if (likely(seqid == slot_seqid + 1)) + return nfs_ok; + /* Replay */ + if (seqid == slot_seqid) + return nfserr_replay_cache; + /* Wraparound */ + if (seqid == 1 && (slot_seqid + 1) == 0) + return nfs_ok; + /* Misordered replay or misordered new request */ + return nfserr_seq_misordered; +} + +/* + * Cache the create session result into the create session single DRC + * slot cache by saving the xdr structure. sl_seqid has been set. + * Do this for solo or embedded create session operations. 
+ */ +static void +nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot, int nfserr) +{ + slot->sl_status = nfserr; + memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); +} + +static __be32 +nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot) +{ + memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses)); + return slot->sl_status; +} + +__be32 +nfsd4_create_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_create_session *cr_ses) +{ + struct sockaddr *sa = svc_addr(rqstp); + struct nfs4_client *conf, *unconf; + struct nfsd4_session *new; + struct nfsd4_clid_slot *cs_slot = NULL; + bool confirm_me = false; + int status = 0; + + if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) + return nfserr_inval; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); + + if (conf) { + cs_slot = &conf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status == nfserr_replay_cache) { + dprintk("Got a create_session replay! seqid= %d\n", + cs_slot->sl_seqid); + /* Return the cached reply status */ + status = nfsd4_replay_create_session(cr_ses, cs_slot); + goto out; + } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { + status = nfserr_seq_misordered; + dprintk("Sequence misordered!\n"); + dprintk("Expected seqid= %d but got seqid= %d\n", + cs_slot->sl_seqid, cr_ses->seqid); + goto out; + } + } else if (unconf) { + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || + !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { + status = nfserr_clid_inuse; + goto out; + } + + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status) { + /* an unconfirmed replay returns misordered */ + status = nfserr_seq_misordered; + goto out; + } + + confirm_me = true; + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + + /* + * XXX: we should probably set this at creation time, and check + * for consistent minorversion use throughout: + */ + conf->cl_minorversion = 1; + /* + * We do not support RDMA or persistent sessions + */ + cr_ses->flags &= ~SESSION4_PERSIST; + cr_ses->flags &= ~SESSION4_RDMA; + + status = nfserr_jukebox; + new = alloc_init_session(rqstp, conf, cr_ses); + if (!new) + goto out; + status = nfs_ok; + memcpy(cr_ses->sessionid.data, new->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + memcpy(&cr_ses->fore_channel, &new->se_fchannel, + sizeof(struct nfsd4_channel_attrs)); + cs_slot->sl_seqid++; + cr_ses->seqid = cs_slot->sl_seqid; + + /* cache solo and embedded create sessions under the state lock */ + nfsd4_cache_create_session(cr_ses, cs_slot, status); + if (confirm_me) + move_to_confirmed(conf); +out: + nfs4_unlock_state(); + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; +} + +static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + return argp->opcnt == resp->opcnt; +} + +static __be32 nfsd4_map_bcts_dir(u32 *dir) +{ + switch (*dir) { + case NFS4_CDFC4_FORE: + case NFS4_CDFC4_BACK: + return nfs_ok; + case NFS4_CDFC4_FORE_OR_BOTH: + case NFS4_CDFC4_BACK_OR_BOTH: + *dir = NFS4_CDFC4_BOTH; + return nfs_ok; + }; + return nfserr_inval; +} + +__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_bind_conn_to_session *bcts) 
+{ + __be32 status; + + if (!nfsd4_last_compound_op(rqstp)) + return nfserr_not_only_op; + spin_lock(&client_lock); + cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid); + /* Sorta weird: we only need the refcnt'ing because new_conn acquires + * client_lock itself: */ + if (cstate->session) { + nfsd4_get_session(cstate->session); + atomic_inc(&cstate->session->se_client->cl_refcount); + } + spin_unlock(&client_lock); + if (!cstate->session) + return nfserr_badsession; + + status = nfsd4_map_bcts_dir(&bcts->dir); + if (!status) + nfsd4_new_conn(rqstp, cstate->session, bcts->dir); + return status; +} + +static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) +{ + if (!session) + return 0; + return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); +} + +__be32 +nfsd4_destroy_session(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, + struct nfsd4_destroy_session *sessionid) +{ + struct nfsd4_session *ses; + u32 status = nfserr_badsession; + + /* Notes: + * - The confirmed nfs4_client->cl_sessionid holds destroyed sessionid + * - Should we return nfserr_back_chan_busy if waiting for + * callbacks on to-be-destroyed session? + * - Do we need to clear any callback info from previous session? + */ + + if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { + if (!nfsd4_last_compound_op(r)) + return nfserr_not_only_op; + } + dump_sessionid(__func__, &sessionid->sessionid); + spin_lock(&client_lock); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid); + if (!ses) { + spin_unlock(&client_lock); + goto out; + } + + unhash_session(ses); + spin_unlock(&client_lock); + + nfs4_lock_state(); + nfsd4_probe_callback_sync(ses->se_client); + nfs4_unlock_state(); + + nfsd4_del_conns(ses); + + nfsd4_put_session(ses); + status = nfs_ok; +out: + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; +} + +static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s) +{ + struct nfsd4_conn *c; + + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_xprt == xpt) { + return c; + } + } + return NULL; +} + +static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_conn *c; + int ret; + + spin_lock(&clp->cl_lock); + c = __nfsd4_find_conn(new->cn_xprt, ses); + if (c) { + spin_unlock(&clp->cl_lock); + free_conn(new); + return; + } + __nfsd4_hash_conn(new, ses); + spin_unlock(&clp->cl_lock); + ret = nfsd4_register_conn(new); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&new->cn_xpt_user); + return; +} + +static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) +{ + struct nfsd4_compoundargs *args = rqstp->rq_argp; + + return args->opcnt > session->se_fchannel.maxops; +} + +__be32 +nfsd4_sequence(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_sequence *seq) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_session *session; + struct nfsd4_slot *slot; + struct nfsd4_conn *conn; + int status; + + if (resp->opcnt != 1) + return nfserr_sequence_pos; + + /* + * Will be either used or freed by nfsd4_sequence_check_conn + * below.
+ */ + conn = alloc_conn(rqstp, NFS4_CDFC4_FORE); + if (!conn) + return nfserr_jukebox; + + spin_lock(&client_lock); + status = nfserr_badsession; + session = find_in_sessionid_hashtbl(&seq->sessionid); + if (!session) + goto out; + + status = nfserr_too_many_ops; + if (nfsd4_session_too_many_ops(rqstp, session)) + goto out; + + status = nfserr_badslot; + if (seq->slotid >= session->se_fchannel.maxreqs) + goto out; + + slot = session->se_slots[seq->slotid]; + dprintk("%s: slotid %d\n", __func__, seq->slotid); + + /* We do not negotiate the number of slots yet, so set the + * maxslots to the session maxreqs which is used to encode + * sr_highest_slotid and the sr_target_slot id to maxslots */ + seq->maxslots = session->se_fchannel.maxreqs; + + status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse); + if (status == nfserr_replay_cache) { + cstate->slot = slot; + cstate->session = session; + /* Return the cached reply status and set cstate->status + * for nfsd4_proc_compound processing */ + status = nfsd4_replay_cache_entry(resp, seq); + cstate->status = nfserr_replay_cache; + goto out; + } + if (status) + goto out; + + nfsd4_sequence_check_conn(conn, session); + conn = NULL; + + /* Success! bump slot seqid */ + slot->sl_inuse = true; + slot->sl_seqid = seq->seqid; + slot->sl_cachethis = seq->cachethis; + + cstate->slot = slot; + cstate->session = session; + +out: + /* Hold a session reference until done processing the compound. */ + if (cstate->session) { + struct nfs4_client *clp = session->se_client; + + nfsd4_get_session(cstate->session); + atomic_inc(&clp->cl_refcount); + if (clp->cl_cb_state == NFSD4_CB_DOWN) + seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; + } + kfree(conn); + spin_unlock(&client_lock); + dprintk("%s: return %d\n", __func__, ntohl(status)); + return status; +} + +__be32 +nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) +{ + int status = 0; + + if (rc->rca_one_fs) { + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + /* + * We don't take advantage of the rca_one_fs case. + * That's OK, it's optional, we can safely ignore it. + */ + return nfs_ok; + } + + nfs4_lock_state(); + status = nfserr_complete_already; + if (cstate->session->se_client->cl_firststate) + goto out; + + status = nfserr_stale_clientid; + if (is_client_expired(cstate->session->se_client)) + /* + * The following error isn't really legal. + * But we only get here if the client just explicitly + * destroyed the client. Surely it no longer cares what + * error it gets back on an operation for the dead + * client. + */ + goto out; + + status = nfs_ok; + nfsd4_create_clid_dir(cstate->session->se_client); +out: + nfs4_unlock_state(); + return status; +} + +__be32 +nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid *setclid) +{ + struct xdr_netobj clname = { + .len = setclid->se_namelen, + .data = setclid->se_name, + }; + nfs4_verifier clverifier = setclid->se_verf; + unsigned int strhashval; + struct nfs4_client *conf, *unconf, *new; + __be32 status; + char dname[HEXDIR_LEN]; + + if (!check_name(clname)) + return nfserr_inval; + + status = nfs4_make_rec_clidname(dname, &clname); + if (status) + return status; + + /* + * XXX The Duplicate Request Cache (DRC) has been checked (??) + * We get here on a DRC miss. 
+ */ + + strhashval = clientstr_hashval(dname); + + nfs4_lock_state(); + conf = find_confirmed_client_by_str(dname, strhashval); + if (conf) { + /* RFC 3530 14.2.33 CASE 0: */ + status = nfserr_clid_inuse; + if (clp_used_exchangeid(conf)) + goto out; + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + char addr_str[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, + sizeof(addr_str)); + dprintk("NFSD: setclientid: string in use by client " + "at %s\n", addr_str); + goto out; + } + } + /* + * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION") + * has a description of SETCLIENTID request processing consisting + * of 5 bullet points, labeled as CASE0 - CASE4 below. + */ + unconf = find_unconfirmed_client_by_str(dname, strhashval); + status = nfserr_jukebox; + if (!conf) { + /* + * RFC 3530 14.2.33 CASE 4: + * placed first, because it is the normal case + */ + if (unconf) + expire_client(unconf); + new = create_client(clname, dname, rqstp, &clverifier); + if (new == NULL) + goto out; + gen_clid(new); + } else if (same_verf(&conf->cl_verifier, &clverifier)) { + /* + * RFC 3530 14.2.33 CASE 1: + * probable callback update + */ + if (unconf) { + /* Note this is removing unconfirmed {*x***}, + * which is stronger than RFC recommended {vxc**}. + * This has the advantage that there is at most + * one {*x***} in either list at any time. + */ + expire_client(unconf); + } + new = create_client(clname, dname, rqstp, &clverifier); + if (new == NULL) + goto out; + copy_clid(new, conf); + } else if (!unconf) { + /* + * RFC 3530 14.2.33 CASE 2: + * probable client reboot; state will be removed if + * confirmed. + */ + new = create_client(clname, dname, rqstp, &clverifier); + if (new == NULL) + goto out; + gen_clid(new); + } else { + /* + * RFC 3530 14.2.33 CASE 3: + * probable client reboot; state will be removed if + * confirmed. + */ + expire_client(unconf); + new = create_client(clname, dname, rqstp, &clverifier); + if (new == NULL) + goto out; + gen_clid(new); + } + /* + * XXX: we should probably set this at creation time, and check + * for consistent minorversion use throughout: + */ + new->cl_minorversion = 0; + gen_callback(new, setclid, rqstp); + add_to_unconfirmed(new, strhashval); + setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; + setclid->se_clientid.cl_id = new->cl_clientid.cl_id; + memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); + status = nfs_ok; +out: + nfs4_unlock_state(); + return status; +} + + +/* + * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has + * a description of SETCLIENTID_CONFIRM request processing consisting of 4 + * bullets, labeled as CASE1 - CASE4 below. + */ +__be32 +nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_setclientid_confirm *setclientid_confirm) +{ + struct sockaddr *sa = svc_addr(rqstp); + struct nfs4_client *conf, *unconf; + nfs4_verifier confirm = setclientid_confirm->sc_confirm; + clientid_t * clid = &setclientid_confirm->sc_clientid; + __be32 status; + + if (STALE_CLIENTID(clid)) + return nfserr_stale_clientid; + /* + * XXX The Duplicate Request Cache (DRC) has been checked (??) + * We get here on a DRC miss. 
+ */ + + nfs4_lock_state(); + + conf = find_confirmed_client(clid); + unconf = find_unconfirmed_client(clid); + + status = nfserr_clid_inuse; + if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa)) + goto out; + if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa)) + goto out; + + /* + * section 14.2.34 of RFC 3530 has a description of + * SETCLIENTID_CONFIRM request processing consisting + * of 4 bullet points, labeled as CASE1 - CASE4 below. + */ + if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) { + /* + * RFC 3530 14.2.34 CASE 1: + * callback update + */ + if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) + status = nfserr_clid_inuse; + else { + nfsd4_change_callback(conf, &unconf->cl_cb_conn); + nfsd4_probe_callback(conf); + expire_client(unconf); + status = nfs_ok; + + } + } else if (conf && !unconf) { + /* + * RFC 3530 14.2.34 CASE 2: + * probable retransmitted request; play it safe and + * do nothing. + */ + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) + status = nfserr_clid_inuse; + else + status = nfs_ok; + } else if (!conf && unconf + && same_verf(&unconf->cl_confirm, &confirm)) { + /* + * RFC 3530 14.2.34 CASE 3: + * Normal case; new or rebooted client: + */ + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { + status = nfserr_clid_inuse; + } else { + unsigned int hash = + clientstr_hashval(unconf->cl_recdir); + conf = find_confirmed_client_by_str(unconf->cl_recdir, + hash); + if (conf) { + nfsd4_remove_clid_dir(conf); + expire_client(conf); + } + move_to_confirmed(unconf); + conf = unconf; + nfsd4_probe_callback(conf); + status = nfs_ok; + } + } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) + && (!unconf || (unconf && !same_verf(&unconf->cl_confirm, + &confirm)))) { + /* + * RFC 3530 14.2.34 CASE 4: + * Client probably hasn't noticed that we rebooted yet. 
+ */ + status = nfserr_stale_clientid; + } else { + /* check that we have hit one of the cases...*/ + status = nfserr_clid_inuse; + } +out: + nfs4_unlock_state(); + return status; +} + +/* OPEN Share state helper functions */ +static inline struct nfs4_file * +alloc_init_file(struct inode *ino) +{ + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); + + fp = kmem_cache_alloc(file_slab, GFP_KERNEL); + if (fp) { + atomic_set(&fp->fi_ref, 1); + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; + fp->fi_had_conflict = false; + fp->fi_lease = NULL; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); + return fp; + } + return NULL; +} + +static void +nfsd4_free_slab(struct kmem_cache **slab) +{ + if (*slab == NULL) + return; + kmem_cache_destroy(*slab); + *slab = NULL; +} + +void +nfsd4_free_slabs(void) +{ + nfsd4_free_slab(&stateowner_slab); + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); +} + +static int +nfsd4_init_slabs(void) +{ + stateowner_slab = kmem_cache_create("nfsd4_stateowners", + sizeof(struct nfs4_stateowner), 0, 0, NULL); + if (stateowner_slab == NULL) + goto out_nomem; + file_slab = kmem_cache_create("nfsd4_files", + sizeof(struct nfs4_file), 0, 0, NULL); + if (file_slab == NULL) + goto out_nomem; + stateid_slab = kmem_cache_create("nfsd4_stateids", + sizeof(struct nfs4_stateid), 0, 0, NULL); + if (stateid_slab == NULL) + goto out_nomem; + deleg_slab = kmem_cache_create("nfsd4_delegations", + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; + return 0; +out_nomem: + nfsd4_free_slabs(); + dprintk("nfsd4: out of memory while initializing nfsv4\n"); + return -ENOMEM; +} + +void +nfs4_free_stateowner(struct kref *kref) +{ + struct nfs4_stateowner *sop = + container_of(kref, struct nfs4_stateowner, so_ref); + kfree(sop->so_owner.data); + kmem_cache_free(stateowner_slab, sop); +} + +static inline struct nfs4_stateowner * +alloc_stateowner(struct xdr_netobj *owner) +{ + struct nfs4_stateowner *sop; + + if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { + if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { + memcpy(sop->so_owner.data, owner->data, owner->len); + sop->so_owner.len = owner->len; + kref_init(&sop->so_ref); + return sop; + } + kmem_cache_free(stateowner_slab, sop); + } + return NULL; +} + +static struct nfs4_stateowner * +alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { + struct nfs4_stateowner *sop; + struct nfs4_replay *rp; + unsigned int idhashval; + + if (!(sop = alloc_stateowner(&open->op_owner))) + return NULL; + idhashval = ownerid_hashval(current_ownerid); + INIT_LIST_HEAD(&sop->so_idhash); + INIT_LIST_HEAD(&sop->so_strhash); + INIT_LIST_HEAD(&sop->so_perclient); + INIT_LIST_HEAD(&sop->so_stateids); + INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ + INIT_LIST_HEAD(&sop->so_close_lru); + sop->so_time = 0; + list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]); + list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]); + list_add(&sop->so_perclient, &clp->cl_openowners); + sop->so_is_open_owner = 1; + sop->so_id = current_ownerid++; + sop->so_client = clp; + sop->so_seqid = open->op_seqid; + sop->so_confirmed = 0; + rp = 
&sop->so_replay; + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; + return sop; +} + +static inline void +init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { + struct nfs4_stateowner *sop = open->op_stateowner; + unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + + INIT_LIST_HEAD(&stp->st_hash); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); + stp->st_stateowner = sop; + get_nfs4_file(fp); + stp->st_file = fp; + stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; + stp->st_access_bmap = 0; + stp->st_deny_bmap = 0; + __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, + &stp->st_access_bmap); + __set_bit(open->op_share_deny, &stp->st_deny_bmap); + stp->st_openstp = NULL; +} + +static void +move_to_close_lru(struct nfs4_stateowner *sop) +{ + dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); + + list_move_tail(&sop->so_close_lru, &close_lru); + sop->so_time = get_seconds(); +} + +static int +same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, + clientid_t *clid) +{ + return (sop->so_owner.len == owner->len) && + 0 == memcmp(sop->so_owner.data, owner->data, owner->len) && + (sop->so_client->cl_clientid.cl_id == clid->cl_id); +} + +static struct nfs4_stateowner * +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) +{ + struct nfs4_stateowner *so = NULL; + + list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; + } + return NULL; +} + +/* search file_hashtbl[] for file */ +static struct nfs4_file * +find_file(struct inode *ino) +{ + unsigned int hashval = file_hashval(ino); + struct nfs4_file *fp; + + spin_lock(&recall_lock); + list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { + if (fp->fi_inode == ino) { + get_nfs4_file(fp); + spin_unlock(&recall_lock); + return fp; + } + } + spin_unlock(&recall_lock); + return NULL; +} + +static inline int access_valid(u32 x, u32 minorversion) +{ + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) + return 0; + if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) + return 0; + x &= ~NFS4_SHARE_ACCESS_MASK; + if (minorversion && x) { + if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) + return 0; + if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) + return 0; + x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); + } + if (x) + return 0; + return 1; +} + +static inline int deny_valid(u32 x) +{ + /* Note: unlike access bits, deny bits may be zero. 
*/ + return x <= NFS4_SHARE_DENY_BOTH; +} + +/* + * Called to check deny when READ with all zero stateid or + * WRITE with all zero or all one stateid + */ +static __be32 +nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) +{ + struct inode *ino = current_fh->fh_dentry->d_inode; + struct nfs4_file *fp; + struct nfs4_stateid *stp; + __be32 ret; + + dprintk("NFSD: nfs4_share_conflict\n"); + + fp = find_file(ino); + if (!fp) + return nfs_ok; + ret = nfserr_locked; + /* Search for conflicting share reservations */ + list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { + if (test_bit(deny_type, &stp->st_deny_bmap) || + test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap)) + goto out; + } + ret = nfs_ok; +out: + put_nfs4_file(fp); + return ret; +} + +static void nfsd_break_one_deleg(struct nfs4_delegation *dp) +{ + /* We're assuming the state code never drops its reference + * without first removing the lease. Since we're in this lease + * callback (and since the lease code is serialized by the kernel + * lock) we know the server hasn't removed the lease yet, we know + * it's safe to take a reference: */ + atomic_inc(&dp->dl_count); + + list_add_tail(&dp->dl_recall_lru, &del_recall_lru); + + /* only place dl_time is set. protected by lock_flocks*/ + dp->dl_time = get_seconds(); + + nfsd4_cb_recall(dp); +} + +/* Called from break_lease() with lock_flocks() held. */ +static void nfsd_break_deleg_cb(struct file_lock *fl) +{ + struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; + struct nfs4_delegation *dp; + + BUG_ON(!fp); + /* We assume break_lease is only called once per lease: */ + BUG_ON(fp->fi_had_conflict); + /* + * We don't want the locks code to timeout the lease for us; + * we'll remove it ourself if a delegation isn't returned + * in time: + */ + fl->fl_break_time = 0; + + spin_lock(&recall_lock); + fp->fi_had_conflict = true; + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + nfsd_break_one_deleg(dp); + spin_unlock(&recall_lock); +} + +static +int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) +{ + if (arg & F_UNLCK) + return lease_modify(onlist, arg); + else + return -EAGAIN; +} + +static const struct lock_manager_operations nfsd_lease_mng_ops = { + .fl_break = nfsd_break_deleg_cb, + .fl_change = nfsd_change_deleg_cb, +}; + + +__be32 +nfsd4_process_open1(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) +{ + clientid_t *clientid = &open->op_clientid; + struct nfs4_client *clp = NULL; + unsigned int strhashval; + struct nfs4_stateowner *sop = NULL; + + if (!check_name(open->op_owner)) + return nfserr_inval; + + if (STALE_CLIENTID(&open->op_clientid)) + return nfserr_stale_clientid; + + strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner); + sop = find_openstateowner_str(strhashval, open); + open->op_stateowner = sop; + if (!sop) { + /* Make sure the client's lease hasn't expired. */ + clp = find_confirmed_client(clientid); + if (clp == NULL) + return nfserr_expired; + goto renew; + } + /* When sessions are used, skip open sequenceid processing */ + if (nfsd4_has_session(cstate)) + goto renew; + if (!sop->so_confirmed) { + /* Replace unconfirmed owners without checking for replay. */ + clp = sop->so_client; + release_openowner(sop); + open->op_stateowner = NULL; + goto renew; + } + if (open->op_seqid == sop->so_seqid - 1) { + if (sop->so_replay.rp_buflen) + return nfserr_replay_me; + /* The original OPEN failed so spectacularly + * that we don't even have replay data saved! 
+ * Therefore, we have no choice but to continue + * processing this OPEN; presumably, we'll + * fail again for the same reason. + */ + dprintk("nfsd4_process_open1: replay with no replay cache\n"); + goto renew; + } + if (open->op_seqid != sop->so_seqid) + return nfserr_bad_seqid; +renew: + if (open->op_stateowner == NULL) { + sop = alloc_init_open_stateowner(strhashval, clp, open); + if (sop == NULL) + return nfserr_jukebox; + open->op_stateowner = sop; + } + list_del_init(&sop->so_close_lru); + renew_client(sop->so_client); + return nfs_ok; +} + +static inline __be32 +nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) +{ + if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) + return nfserr_openmode; + else + return nfs_ok; +} + +static struct nfs4_delegation * +find_delegation_file(struct nfs4_file *fp, stateid_t *stid) +{ + struct nfs4_delegation *dp; + + spin_lock(&recall_lock); + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { + spin_unlock(&recall_lock); + return dp; + } + spin_unlock(&recall_lock); + return NULL; +} + +static int share_access_to_flags(u32 share_access) +{ + share_access &= ~NFS4_SHARE_WANT_MASK; + + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; +} + +static __be32 +nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, + struct nfs4_delegation **dp) +{ + int flags; + __be32 status = nfserr_bad_stateid; + + *dp = find_delegation_file(fp, &open->op_delegate_stateid); + if (*dp == NULL) + goto out; + flags = share_access_to_flags(open->op_share_access); + status = nfs4_check_delegmode(*dp, flags); + if (status) + *dp = NULL; +out: + if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) + return nfs_ok; + if (status) + return status; + open->op_stateowner->so_confirmed = 1; + return nfs_ok; +} + +static __be32 +nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) +{ + struct nfs4_stateid *local; + __be32 status = nfserr_share_denied; + struct nfs4_stateowner *sop = open->op_stateowner; + + list_for_each_entry(local, &fp->fi_stateids, st_perfile) { + /* ignore lock owners */ + if (local->st_stateowner->so_is_open_owner == 0) + continue; + /* remember if we have seen this open owner */ + if (local->st_stateowner == sop) + *stpp = local; + /* check for conflicting share reservations */ + if (!test_share(local, open)) + goto out; + } + status = 0; +out: + return status; +} + +static inline struct nfs4_stateid * +nfs4_alloc_stateid(void) +{ + return kmem_cache_alloc(stateid_slab, GFP_KERNEL); +} + +static inline int nfs4_access_to_access(u32 nfs4_access) +{ + int flags = 0; + + if (nfs4_access & NFS4_SHARE_ACCESS_READ) + flags |= NFSD_MAY_READ; + if (nfs4_access & NFS4_SHARE_ACCESS_WRITE) + flags |= NFSD_MAY_WRITE; + return flags; +} + +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfsd4_open *open) +{ + __be32 status; + int oflag = nfs4_access_to_omode(open->op_share_access); + int access = nfs4_access_to_access(open->op_share_access); + + /* CLAIM_DELEGATE_CUR is used in response to a broken lease; + * allowing it to break the lease and return EAGAIN leaves the + * client unable to make progress in returning the delegation */ + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + access |= NFSD_MAY_NOT_BREAK_LEASE; + + if (!fp->fi_fds[oflag]) { + status = nfsd_open(rqstp, cur_fh, S_IFREG, access, + &fp->fi_fds[oflag]); + if (status) + 
return status; + } + nfs4_file_get_access(fp, oflag); + + return nfs_ok; +} + +static __be32 +nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, + struct nfs4_file *fp, struct svc_fh *cur_fh, + struct nfsd4_open *open) +{ + struct nfs4_stateid *stp; + __be32 status; + + stp = nfs4_alloc_stateid(); + if (stp == NULL) + return nfserr_jukebox; + + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); + if (status) { + kmem_cache_free(stateid_slab, stp); + return status; + } + *stpp = stp; + return 0; +} + +static inline __be32 +nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, + struct nfsd4_open *open) +{ + struct iattr iattr = { + .ia_valid = ATTR_SIZE, + .ia_size = 0, + }; + if (!open->op_truncate) + return 0; + if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) + return nfserr_inval; + return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); +} + +static __be32 +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) +{ + u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + bool new_access; + __be32 status; + + new_access = !test_bit(op_share_access, &stp->st_access_bmap); + if (new_access) { + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); + if (status) + return status; + } + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status) { + if (new_access) { + int oflag = nfs4_access_to_omode(op_share_access); + nfs4_file_put_access(fp, oflag); + } + return status; + } + /* remember the open */ + __set_bit(op_share_access, &stp->st_access_bmap); + __set_bit(open->op_share_deny, &stp->st_deny_bmap); + + return nfs_ok; +} + + +static void +nfs4_set_claim_prev(struct nfsd4_open *open) +{ + open->op_stateowner->so_confirmed = 1; + open->op_stateowner->so_client->cl_firststate = 1; +} + +/* Should we give out recallable state?: */ +static bool nfsd4_cb_channel_good(struct nfs4_client *clp) +{ + if (clp->cl_cb_state == NFSD4_CB_UP) + return true; + /* + * In the sessions case, since we don't have to establish a + * separate connection for callbacks, we assume it's OK + * until we hear otherwise: + */ + return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; +} + +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) +{ + struct file_lock *fl; + + fl = locks_alloc_lock(); + if (!fl) + return NULL; + locks_init_lock(fl); + fl->fl_lmops = &nfsd_lease_mng_ops; + fl->fl_flags = FL_LEASE; + fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? 
F_RDLCK: F_WRLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = (fl_owner_t)(dp->dl_file); + fl->fl_pid = current->tgid; + return fl; +} + +static int nfs4_setlease(struct nfs4_delegation *dp, int flag) +{ + struct nfs4_file *fp = dp->dl_file; + struct file_lock *fl; + int status; + + fl = nfs4_alloc_init_lease(dp, flag); + if (!fl) + return -ENOMEM; + fl->fl_file = find_readable_file(fp); + list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); + if (status) { + list_del_init(&dp->dl_perclnt); + locks_free_lock(fl); + return -ENOMEM; + } + fp->fi_lease = fl; + fp->fi_deleg_file = fl->fl_file; + get_file(fp->fi_deleg_file); + atomic_set(&fp->fi_delegees, 1); + list_add(&dp->dl_perfile, &fp->fi_delegations); + return 0; +} + +static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) +{ + struct nfs4_file *fp = dp->dl_file; + + if (!fp->fi_lease) + return nfs4_setlease(dp, flag); + spin_lock(&recall_lock); + if (fp->fi_had_conflict) { + spin_unlock(&recall_lock); + return -EAGAIN; + } + atomic_inc(&fp->fi_delegees); + list_add(&dp->dl_perfile, &fp->fi_delegations); + spin_unlock(&recall_lock); + list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + return 0; +} + +/* + * Attempt to hand out a delegation. + */ +static void +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) +{ + struct nfs4_delegation *dp; + struct nfs4_stateowner *sop = stp->st_stateowner; + int cb_up; + int status, flag = 0; + + cb_up = nfsd4_cb_channel_good(sop->so_client); + flag = NFS4_OPEN_DELEGATE_NONE; + open->op_recall = 0; + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_PREVIOUS: + if (!cb_up) + open->op_recall = 1; + flag = open->op_delegate_type; + if (flag == NFS4_OPEN_DELEGATE_NONE) + goto out; + break; + case NFS4_OPEN_CLAIM_NULL: + /* Let's not give out any delegations till everyone's + * had the chance to reclaim theirs.... */ + if (locks_in_grace()) + goto out; + if (!cb_up || !sop->so_confirmed) + goto out; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + flag = NFS4_OPEN_DELEGATE_WRITE; + else + flag = NFS4_OPEN_DELEGATE_READ; + break; + default: + goto out; + } + + dp = alloc_init_deleg(sop->so_client, stp, fh, flag); + if (dp == NULL) + goto out_no_deleg; + status = nfs4_set_delegation(dp, flag); + if (status) + goto out_free; + + memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); + + dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", + STATEID_VAL(&dp->dl_stateid)); +out: + if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS + && flag == NFS4_OPEN_DELEGATE_NONE + && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) + dprintk("NFSD: WARNING: refusing delegation reclaim\n"); + open->op_delegate_type = flag; + return; +out_free: + nfs4_put_delegation(dp); +out_no_deleg: + flag = NFS4_OPEN_DELEGATE_NONE; + goto out; +} + +/* + * called with nfs4_lock_state() held. 
+ */ +__be32 +nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_file *fp = NULL; + struct inode *ino = current_fh->fh_dentry->d_inode; + struct nfs4_stateid *stp = NULL; + struct nfs4_delegation *dp = NULL; + __be32 status; + + status = nfserr_inval; + if (!access_valid(open->op_share_access, resp->cstate.minorversion) + || !deny_valid(open->op_share_deny)) + goto out; + /* + * Lookup file; if found, lookup stateid and check open request, + * and check for delegations in the process of being recalled. + * If not found, create the nfs4_file struct + */ + fp = find_file(ino); + if (fp) { + if ((status = nfs4_check_open(fp, open, &stp))) + goto out; + status = nfs4_check_deleg(fp, open, &dp); + if (status) + goto out; + } else { + status = nfserr_bad_stateid; + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_jukebox; + fp = alloc_init_file(ino); + if (fp == NULL) + goto out; + } + + /* + * OPEN the file, or upgrade an existing OPEN. + * If truncate fails, the OPEN fails. + */ + if (stp) { + /* Stateid was found, this is an OPEN upgrade */ + status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); + if (status) + goto out; + update_stateid(&stp->st_stateid); + } else { + status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); + if (status) + goto out; + init_stateid(stp, fp, open); + status = nfsd4_truncate(rqstp, current_fh, open); + if (status) { + release_open_stateid(stp); + goto out; + } + if (nfsd4_has_session(&resp->cstate)) + update_stateid(&stp->st_stateid); + } + memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + + if (nfsd4_has_session(&resp->cstate)) + open->op_stateowner->so_confirmed = 1; + + /* + * Attempt to hand out a delegation. No error return, because the + * OPEN succeeds even if we fail. + */ + nfs4_open_delegation(current_fh, open, stp); + + status = nfs_ok; + + dprintk("%s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(&stp->st_stateid)); +out: + if (fp) + put_nfs4_file(fp); + if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + nfs4_set_claim_prev(open); + /* + * To finish the open response, we just need to set the rflags. + */ + open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; + if (!open->op_stateowner->so_confirmed && + !nfsd4_has_session(&resp->cstate)) + open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + + return status; +} + +__be32 +nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + clientid_t *clid) +{ + struct nfs4_client *clp; + __be32 status; + + nfs4_lock_state(); + dprintk("process_renew(%08x/%08x): starting\n", + clid->cl_boot, clid->cl_id); + status = nfserr_stale_clientid; + if (STALE_CLIENTID(clid)) + goto out; + clp = find_confirmed_client(clid); + status = nfserr_expired; + if (clp == NULL) { + /* We assume the client took too long to RENEW. 
*/ + dprintk("nfsd4_renew: clientid not found!\n"); + goto out; + } + renew_client(clp); + status = nfserr_cb_path_down; + if (!list_empty(&clp->cl_delegations) + && clp->cl_cb_state != NFSD4_CB_UP) + goto out; + status = nfs_ok; +out: + nfs4_unlock_state(); + return status; +} + +static struct lock_manager nfsd4_manager = { +}; + +static void +nfsd4_end_grace(void) +{ + dprintk("NFSD: end of grace period\n"); + nfsd4_recdir_purge_old(); + locks_end_grace(&nfsd4_manager); + /* + * Now that every NFSv4 client has had the chance to recover and + * to see the (possibly new, possibly shorter) lease time, we + * can safely set the next grace time to the current lease time: + */ + nfsd4_grace = nfsd4_lease; +} + +static time_t +nfs4_laundromat(void) +{ + struct nfs4_client *clp; + struct nfs4_stateowner *sop; + struct nfs4_delegation *dp; + struct list_head *pos, *next, reaplist; + time_t cutoff = get_seconds() - nfsd4_lease; + time_t t, clientid_val = nfsd4_lease; + time_t u, test_val = nfsd4_lease; + + nfs4_lock_state(); + + dprintk("NFSD: laundromat service - starting\n"); + if (locks_in_grace()) + nfsd4_end_grace(); + INIT_LIST_HEAD(&reaplist); + spin_lock(&client_lock); + list_for_each_safe(pos, next, &client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { + t = clp->cl_time - cutoff; + if (clientid_val > t) + clientid_val = t; + break; + } + if (atomic_read(&clp->cl_refcount)) { + dprintk("NFSD: client in use (clientid %08x)\n", + clp->cl_clientid.cl_id); + continue; + } + unhash_client_locked(clp); + list_add(&clp->cl_lru, &reaplist); + } + spin_unlock(&client_lock); + list_for_each_safe(pos, next, &reaplist) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + dprintk("NFSD: purging unused client (clientid %08x)\n", + clp->cl_clientid.cl_id); + nfsd4_remove_clid_dir(clp); + expire_client(clp); + } + spin_lock(&recall_lock); + list_for_each_safe(pos, next, &del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { + u = dp->dl_time - cutoff; + if (test_val > u) + test_val = u; + break; + } + list_move(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&recall_lock); + list_for_each_safe(pos, next, &reaplist) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } + test_val = nfsd4_lease; + list_for_each_safe(pos, next, &close_lru) { + sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); + if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { + u = sop->so_time - cutoff; + if (test_val > u) + test_val = u; + break; + } + dprintk("NFSD: purging unused open stateowner (so_id %d)\n", + sop->so_id); + release_openowner(sop); + } + if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) + clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; + nfs4_unlock_state(); + return clientid_val; +} + +static struct workqueue_struct *laundry_wq; +static void laundromat_main(struct work_struct *); +static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main); + +static void +laundromat_main(struct work_struct *not_used) +{ + time_t t; + + t = nfs4_laundromat(); + dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t); + queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); +} + +static struct nfs4_stateowner * +search_close_lru(u32 st_id, int flags) +{ + struct nfs4_stateowner *local = NULL; + + if (flags & CLOSE_STATE) { + 
list_for_each_entry(local, &close_lru, so_close_lru) { + if (local->so_id == st_id) + return local; + } + } + return NULL; +} + +static inline int +nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) +{ + return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; +} + +static int +STALE_STATEID(stateid_t *stateid) +{ + if (stateid->si_boot == boot_time) + return 0; + dprintk("NFSD: stale stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); + return 1; +} + +static inline int +access_permit_read(unsigned long access_bmap) +{ + return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) || + test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap) || + test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap); +} + +static inline int +access_permit_write(unsigned long access_bmap) +{ + return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) || + test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); +} + +static +__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) +{ + __be32 status = nfserr_openmode; + + /* For lock stateid's, we test the parent open, not the lock: */ + if (stp->st_openstp) + stp = stp->st_openstp; + if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) + goto out; + if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap))) + goto out; + status = nfs_ok; +out: + return status; +} + +static inline __be32 +check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) +{ + if (ONE_STATEID(stateid) && (flags & RD_STATE)) + return nfs_ok; + else if (locks_in_grace()) { + /* Answer in remaining cases depends on existence of + * conflicting state; so we must wait out the grace period. */ + return nfserr_grace; + } else if (flags & WR_STATE) + return nfs4_share_conflict(current_fh, + NFS4_SHARE_DENY_WRITE); + else /* (flags & RD_STATE) && ZERO_STATEID(stateid) */ + return nfs4_share_conflict(current_fh, + NFS4_SHARE_DENY_READ); +} + +/* + * Allow READ/WRITE during grace period on recovered state only for files + * that are not able to provide mandatory locking. + */ +static inline int +grace_disallows_io(struct inode *inode) +{ + return locks_in_grace() && mandatory_lock(inode); +} + +static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) +{ + /* + * When sessions are used the stateid generation number is ignored + * when it is zero. + */ + if ((flags & HAS_SESSION) && in->si_generation == 0) + goto out; + + /* If the client sends us a stateid from the future, it's buggy: */ + if (in->si_generation > ref->si_generation) + return nfserr_bad_stateid; + /* + * The following, however, can happen. For example, if the + * client sends an open and some IO at the same time, the open + * may bump si_generation while the IO is still in flight. + * Thanks to hard links and renames, the client never knows what + * file an open will affect. So it could avoid that situation + * only by serializing all opens and IO from the same open + * owner. 
To recover from the old_stateid error, the client + * will just have to retry the IO: + */ + if (in->si_generation < ref->si_generation) + return nfserr_old_stateid; +out: + return nfs_ok; +} + +static int is_delegation_stateid(stateid_t *stateid) +{ + return stateid->si_fileid == 0; +} + +/* +* Checks for stateid operations +*/ +__be32 +nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filpp) +{ + struct nfs4_stateid *stp = NULL; + struct nfs4_delegation *dp = NULL; + struct svc_fh *current_fh = &cstate->current_fh; + struct inode *ino = current_fh->fh_dentry->d_inode; + __be32 status; + + if (filpp) + *filpp = NULL; + + if (grace_disallows_io(ino)) + return nfserr_grace; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return check_special_stateids(current_fh, stateid, flags); + + status = nfserr_stale_stateid; + if (STALE_STATEID(stateid)) + goto out; + + /* + * We assume that any stateid that has the current boot time, + * but that we can't find, is expired: + */ + status = nfserr_expired; + if (is_delegation_stateid(stateid)) { + dp = find_delegation_stateid(ino, stateid); + if (!dp) + goto out; + status = check_stateid_generation(stateid, &dp->dl_stateid, + flags); + if (status) + goto out; + status = nfs4_check_delegmode(dp, flags); + if (status) + goto out; + renew_client(dp->dl_client); + if (filpp) { + *filpp = dp->dl_file->fi_deleg_file; + BUG_ON(!*filpp); + } + } else { /* open or lock stateid */ + stp = find_stateid(stateid, flags); + if (!stp) + goto out; + status = nfserr_bad_stateid; + if (nfs4_check_fh(current_fh, stp)) + goto out; + if (!stp->st_stateowner->so_confirmed) + goto out; + status = check_stateid_generation(stateid, &stp->st_stateid, + flags); + if (status) + goto out; + status = nfs4_check_openmode(stp, flags); + if (status) + goto out; + renew_client(stp->st_stateowner->so_client); + if (filpp) { + if (flags & RD_STATE) + *filpp = find_readable_file(stp->st_file); + else + *filpp = find_writeable_file(stp->st_file); + } + } + status = nfs_ok; +out: + return status; +} + +static inline int +setlkflg (int type) +{ + return (type == NFS4_READW_LT || type == NFS4_READ_LT) ? + RD_STATE : WR_STATE; +} + +/* + * Checks for sequence id mutating operations. + */ +static __be32 +nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, int flags, + struct nfs4_stateowner **sopp, + struct nfs4_stateid **stpp, struct nfsd4_lock *lock) +{ + struct nfs4_stateid *stp; + struct nfs4_stateowner *sop; + struct svc_fh *current_fh = &cstate->current_fh; + __be32 status; + + dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, + seqid, STATEID_VAL(stateid)); + + *stpp = NULL; + *sopp = NULL; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { + dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); + return nfserr_bad_stateid; + } + + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + + /* + * We return BAD_STATEID if filehandle doesn't match stateid, + * the confirmed flag is incorrecly set, or the generation + * number is incorrect. 
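+ * If no stateid can be found at all we also check the close LRU, + * since the request may simply be a replayed CLOSE (see + * search_close_lru() above).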
+ */ + stp = find_stateid(stateid, flags); + if (stp == NULL) { + /* + * Also, we should make sure this isn't just the result of + * a replayed close: + */ + sop = search_close_lru(stateid->si_stateownerid, flags); + /* It's not stale; let's assume it's expired: */ + if (sop == NULL) + return nfserr_expired; + *sopp = sop; + goto check_replay; + } + + *stpp = stp; + *sopp = sop = stp->st_stateowner; + + if (lock) { + clientid_t *lockclid = &lock->v.new.clientid; + struct nfs4_client *clp = sop->so_client; + int lkflg = 0; + __be32 status; + + lkflg = setlkflg(lock->lk_type); + + if (lock->lk_is_new) { + if (!sop->so_is_open_owner) + return nfserr_bad_stateid; + if (!(flags & HAS_SESSION) && + !same_clid(&clp->cl_clientid, lockclid)) + return nfserr_bad_stateid; + /* stp is the open stateid */ + status = nfs4_check_openmode(stp, lkflg); + if (status) + return status; + } else { + /* stp is the lock stateid */ + status = nfs4_check_openmode(stp->st_openstp, lkflg); + if (status) + return status; + } + } + + if (nfs4_check_fh(current_fh, stp)) { + dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); + return nfserr_bad_stateid; + } + + /* + * We now validate the seqid and stateid generation numbers. + * For the moment, we ignore the possibility of + * generation number wraparound. + */ + if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) + goto check_replay; + + if (sop->so_confirmed && flags & CONFIRM) { + dprintk("NFSD: preprocess_seqid_op: expected" + " unconfirmed stateowner!\n"); + return nfserr_bad_stateid; + } + if (!sop->so_confirmed && !(flags & CONFIRM)) { + dprintk("NFSD: preprocess_seqid_op: stateowner not" + " confirmed yet!\n"); + return nfserr_bad_stateid; + } + status = check_stateid_generation(stateid, &stp->st_stateid, flags); + if (status) + return status; + renew_client(sop->so_client); + return nfs_ok; + +check_replay: + if (seqid == sop->so_seqid - 1) { + dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); + /* indicate replay to calling function */ + return nfserr_replay_me; + } + dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", + sop->so_seqid, seqid); + *sopp = NULL; + return nfserr_bad_seqid; +} + +__be32 +nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_open_confirm *oc) +{ + __be32 status; + struct nfs4_stateowner *sop; + struct nfs4_stateid *stp; + + dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", + (int)cstate->current_fh.fh_dentry->d_name.len, + cstate->current_fh.fh_dentry->d_name.name); + + status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); + if (status) + return status; + + nfs4_lock_state(); + + if ((status = nfs4_preprocess_seqid_op(cstate, + oc->oc_seqid, &oc->oc_req_stateid, + CONFIRM | OPEN_STATE, + &oc->oc_stateowner, &stp, NULL))) + goto out; + + sop = oc->oc_stateowner; + sop->so_confirmed = 1; + update_stateid(&stp->st_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); + dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); + + nfsd4_create_clid_dir(sop->so_client); +out: + if (oc->oc_stateowner) { + nfs4_get_stateowner(oc->oc_stateowner); + cstate->replay_owner = oc->oc_stateowner; + } + nfs4_unlock_state(); + return status; +} + +static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access) +{ + int i; + + for (i = 1; i < 4; i++) { + if (test_bit(i, &stp->st_access_bmap) + && ((i & to_access) != i)) { + 
nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); + __clear_bit(i, &stp->st_access_bmap); + } + } +} + +static void +reset_union_bmap_deny(unsigned long deny, unsigned long *bmap) +{ + int i; + for (i = 0; i < 4; i++) { + if ((i & deny) != i) + __clear_bit(i, bmap); + } +} + +__be32 +nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_open_downgrade *od) +{ + __be32 status; + struct nfs4_stateid *stp; + + dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", + (int)cstate->current_fh.fh_dentry->d_name.len, + cstate->current_fh.fh_dentry->d_name.name); + + if (!access_valid(od->od_share_access, cstate->minorversion) + || !deny_valid(od->od_share_deny)) + return nfserr_inval; + /* We don't yet support WANT bits: */ + od->od_share_access &= NFS4_SHARE_ACCESS_MASK; + + nfs4_lock_state(); + if ((status = nfs4_preprocess_seqid_op(cstate, + od->od_seqid, + &od->od_stateid, + OPEN_STATE, + &od->od_stateowner, &stp, NULL))) + goto out; + + status = nfserr_inval; + if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { + dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", + stp->st_access_bmap, od->od_share_access); + goto out; + } + if (!test_bit(od->od_share_deny, &stp->st_deny_bmap)) { + dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n", + stp->st_deny_bmap, od->od_share_deny); + goto out; + } + nfs4_file_downgrade(stp, od->od_share_access); + + reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); + + update_stateid(&stp->st_stateid); + memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); + status = nfs_ok; +out: + if (od->od_stateowner) { + nfs4_get_stateowner(od->od_stateowner); + cstate->replay_owner = od->od_stateowner; + } + nfs4_unlock_state(); + return status; +} + +/* + * nfs4_unlock_state() called after encode + */ +__be32 +nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_close *close) +{ + __be32 status; + struct nfs4_stateid *stp; + + dprintk("NFSD: nfsd4_close on file %.*s\n", + (int)cstate->current_fh.fh_dentry->d_name.len, + cstate->current_fh.fh_dentry->d_name.name); + + nfs4_lock_state(); + /* check close_lru for replay */ + if ((status = nfs4_preprocess_seqid_op(cstate, + close->cl_seqid, + &close->cl_stateid, + OPEN_STATE | CLOSE_STATE, + &close->cl_stateowner, &stp, NULL))) + goto out; + status = nfs_ok; + update_stateid(&stp->st_stateid); + memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); + + /* release_stateid() calls nfsd_close() if needed */ + release_open_stateid(stp); + + /* place unused nfs4_stateowners on so_close_lru list to be + * released by the laundromat service after the lease period + * to enable us to handle CLOSE replay + */ + if (list_empty(&close->cl_stateowner->so_stateids)) + move_to_close_lru(close->cl_stateowner); +out: + if (close->cl_stateowner) { + nfs4_get_stateowner(close->cl_stateowner); + cstate->replay_owner = close->cl_stateowner; + } + nfs4_unlock_state(); + return status; +} + +__be32 +nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_delegreturn *dr) +{ + struct nfs4_delegation *dp; + stateid_t *stateid = &dr->dr_stateid; + struct inode *inode; + __be32 status; + int flags = 0; + + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) + return status; + inode = cstate->current_fh.fh_dentry->d_inode; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); + status = 
nfserr_bad_stateid; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + goto out; + status = nfserr_stale_stateid; + if (STALE_STATEID(stateid)) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) + goto out; + status = nfserr_expired; + dp = find_delegation_stateid(inode, stateid); + if (!dp) + goto out; + status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + if (status) + goto out; + renew_client(dp->dl_client); + + unhash_delegation(dp); +out: + nfs4_unlock_state(); + + return status; +} + + +/* + * Lock owner state (byte-range locks) + */ +#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) +#define LOCK_HASH_BITS 8 +#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) +#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end: NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1: NFS4_MAX_UINT64; +} + +#define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +static inline unsigned int +lock_ownerstr_hashval(struct inode *inode, u32 cl_id, + struct xdr_netobj *ownername) +{ + return (file_hashval(inode) + cl_id + + opaque_hashval(ownername->data, ownername->len)) + & LOCK_HASH_MASK; +} + +static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; +static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; +static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +static struct nfs4_stateid * +find_stateid(stateid_t *stid, int flags) +{ + struct nfs4_stateid *local; + u32 st_id = stid->si_stateownerid; + u32 f_id = stid->si_fileid; + unsigned int hashval; + + dprintk("NFSD: find_stateid flags 0x%x\n",flags); + if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { + hashval = stateid_hashval(st_id, f_id); + list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { + if ((local->st_stateid.si_stateownerid == st_id) && + (local->st_stateid.si_fileid == f_id)) + return local; + } + } + + if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { + hashval = stateid_hashval(st_id, f_id); + list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { + if ((local->st_stateid.si_stateownerid == st_id) && + (local->st_stateid.si_fileid == f_id)) + return local; + } + } + return NULL; +} + +static struct nfs4_delegation * +find_delegation_stateid(struct inode *ino, stateid_t *stid) +{ + struct nfs4_file *fp; + struct nfs4_delegation *dl; + + dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(stid)); + + fp = find_file(ino); + if (!fp) + return NULL; + dl = find_delegation_file(fp, stid); + put_nfs4_file(fp); + return dl; +} + +/* + * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that + * we can't properly handle lock requests that go beyond the (2^63 - 1)-th + * byte, because of sign extension problems. Since NFSv4 calls for 64-bit + * locking, this prevents us from being completely protocol-compliant. The + * real solution to this problem is to start using unsigned file offsets in + * the VFS, but this is a very deep change! 
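+ * E.g. a lock whose offset or computed end is at or beyond 2^63 ends + * up negative in the signed fl_start/fl_end fields; + * nfs4_transform_lock_offset() below simply clamps such values to + * OFFSET_MAX.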
+ */ +static inline void +nfs4_transform_lock_offset(struct file_lock *lock) +{ + if (lock->fl_start < 0) + lock->fl_start = OFFSET_MAX; + if (lock->fl_end < 0) + lock->fl_end = OFFSET_MAX; +} + +/* Hack!: For now, we're defining this just so we can use a pointer to it + * as a unique cookie to identify our (NFSv4's) posix locks. */ +static const struct lock_manager_operations nfsd_posix_mng_ops = { +}; + +static inline void +nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) +{ + struct nfs4_stateowner *sop; + + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + sop = (struct nfs4_stateowner *) fl->fl_owner; + kref_get(&sop->so_ref); + deny->ld_sop = sop; + deny->ld_clientid = sop->so_client->cl_clientid; + } else { + deny->ld_sop = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; + } + deny->ld_start = fl->fl_start; + deny->ld_length = NFS4_MAX_UINT64; + if (fl->fl_end != NFS4_MAX_UINT64) + deny->ld_length = fl->fl_end - fl->fl_start + 1; + deny->ld_type = NFS4_READ_LT; + if (fl->fl_type != F_RDLCK) + deny->ld_type = NFS4_WRITE_LT; +} + +static struct nfs4_stateowner * +find_lockstateowner_str(struct inode *inode, clientid_t *clid, + struct xdr_netobj *owner) +{ + unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); + struct nfs4_stateowner *op; + + list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(op, owner, clid)) + return op; + } + return NULL; +} + +/* + * Alloc a lock owner structure. + * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has + * occurred. + * + * strhashval = lock_ownerstr_hashval + */ + +static struct nfs4_stateowner * +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { + struct nfs4_stateowner *sop; + struct nfs4_replay *rp; + unsigned int idhashval; + + if (!(sop = alloc_stateowner(&lock->lk_new_owner))) + return NULL; + idhashval = lockownerid_hashval(current_ownerid); + INIT_LIST_HEAD(&sop->so_idhash); + INIT_LIST_HEAD(&sop->so_strhash); + INIT_LIST_HEAD(&sop->so_perclient); + INIT_LIST_HEAD(&sop->so_stateids); + INIT_LIST_HEAD(&sop->so_perstateid); + INIT_LIST_HEAD(&sop->so_close_lru); /* not used */ + sop->so_time = 0; + list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); + list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&sop->so_perstateid, &open_stp->st_lockowners); + sop->so_is_open_owner = 0; + sop->so_id = current_ownerid++; + sop->so_client = clp; + /* It is the openowner seqid that will be incremented in encode in the + * case of new lockowners; so increment the lock seqid manually: */ + sop->so_seqid = lock->lk_new_lock_seqid + 1; + sop->so_confirmed = 1; + rp = &sop->so_replay; + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; + return sop; +} + +static struct nfs4_stateid * +alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) +{ + struct nfs4_stateid *stp; + unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + + stp = nfs4_alloc_stateid(); + if (stp == NULL) + goto out; + INIT_LIST_HEAD(&stp->st_hash); + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); + stp->st_stateowner = 
sop; + get_nfs4_file(fp); + stp->st_file = fp; + stp->st_stateid.si_boot = boot_time; + stp->st_stateid.si_stateownerid = sop->so_id; + stp->st_stateid.si_fileid = fp->fi_id; + stp->st_stateid.si_generation = 0; + stp->st_access_bmap = 0; + stp->st_deny_bmap = open_stp->st_deny_bmap; + stp->st_openstp = open_stp; + +out: + return stp; +} + +static int +check_lock_length(u64 offset, u64 length) +{ + return ((length == 0) || ((length != NFS4_MAX_UINT64) && + LOFF_OVERFLOW(offset, length))); +} + +static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) +{ + struct nfs4_file *fp = lock_stp->st_file; + int oflag = nfs4_access_to_omode(access); + + if (test_bit(access, &lock_stp->st_access_bmap)) + return; + nfs4_file_get_access(fp, oflag); + __set_bit(access, &lock_stp->st_access_bmap); +} + +/* + * LOCK operation + */ +__be32 +nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lock *lock) +{ + struct nfs4_stateowner *open_sop = NULL; + struct nfs4_stateowner *lock_sop = NULL; + struct nfs4_stateid *lock_stp; + struct nfs4_file *fp; + struct file *filp = NULL; + struct file_lock file_lock; + struct file_lock conflock; + __be32 status = 0; + unsigned int strhashval; + int err; + + dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", + (long long) lock->lk_offset, + (long long) lock->lk_length); + + if (check_lock_length(lock->lk_offset, lock->lk_length)) + return nfserr_inval; + + if ((status = fh_verify(rqstp, &cstate->current_fh, + S_IFREG, NFSD_MAY_LOCK))) { + dprintk("NFSD: nfsd4_lock: permission denied!\n"); + return status; + } + + nfs4_lock_state(); + + if (lock->lk_is_new) { + /* + * Client indicates that this is a new lockowner. + * Use open owner and open stateid to create lock owner and + * lock stateid. + */ + struct nfs4_stateid *open_stp = NULL; + + status = nfserr_stale_clientid; + if (!nfsd4_has_session(cstate) && + STALE_CLIENTID(&lock->lk_new_clientid)) + goto out; + + /* validate and update open stateid and open seqid */ + status = nfs4_preprocess_seqid_op(cstate, + lock->lk_new_open_seqid, + &lock->lk_new_open_stateid, + OPEN_STATE, + &lock->lk_replay_owner, &open_stp, + lock); + if (status) + goto out; + open_sop = lock->lk_replay_owner; + /* create lockowner and lock stateid */ + fp = open_stp->st_file; + strhashval = lock_ownerstr_hashval(fp->fi_inode, + open_sop->so_client->cl_clientid.cl_id, + &lock->v.new.owner); + /* XXX: Do we need to check for duplicate stateowners on + * the same file, or should they just be allowed (and + * create new stateids)? 
*/ + status = nfserr_jukebox; + lock_sop = alloc_init_lock_stateowner(strhashval, + open_sop->so_client, open_stp, lock); + if (lock_sop == NULL) + goto out; + lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); + if (lock_stp == NULL) + goto out; + } else { + /* lock (lock owner + lock stateid) already exists */ + status = nfs4_preprocess_seqid_op(cstate, + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + LOCK_STATE, + &lock->lk_replay_owner, &lock_stp, lock); + if (status) + goto out; + lock_sop = lock->lk_replay_owner; + fp = lock_stp->st_file; + } + /* lock->lk_replay_owner and lock_stp have been created or found */ + + status = nfserr_grace; + if (locks_in_grace() && !lock->lk_reclaim) + goto out; + status = nfserr_no_grace; + if (!locks_in_grace() && lock->lk_reclaim) + goto out; + + locks_init_lock(&file_lock); + switch (lock->lk_type) { + case NFS4_READ_LT: + case NFS4_READW_LT: + filp = find_readable_file(lock_stp->st_file); + if (filp) + get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); + file_lock.fl_type = F_RDLCK; + break; + case NFS4_WRITE_LT: + case NFS4_WRITEW_LT: + filp = find_writeable_file(lock_stp->st_file); + if (filp) + get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); + file_lock.fl_type = F_WRLCK; + break; + default: + status = nfserr_inval; + goto out; + } + if (!filp) { + status = nfserr_openmode; + goto out; + } + file_lock.fl_owner = (fl_owner_t)lock_sop; + file_lock.fl_pid = current->tgid; + file_lock.fl_file = filp; + file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; + + file_lock.fl_start = lock->lk_offset; + file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); + nfs4_transform_lock_offset(&file_lock); + + /* + * Try to lock the file in the VFS. + * Note: locks.c uses the BKL to protect the inode's lock list. + */ + + err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); + switch (-err) { + case 0: /* success! */ + update_stateid(&lock_stp->st_stateid); + memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, + sizeof(stateid_t)); + status = 0; + break; + case (EAGAIN): /* conflock holds conflicting lock */ + status = nfserr_denied; + dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); + nfs4_set_lock_denied(&conflock, &lock->lk_denied); + break; + case (EDEADLK): + status = nfserr_deadlock; + break; + default: + dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); + status = nfserrno(err); + break; + } +out: + if (status && lock->lk_is_new && lock_sop) + release_lockowner(lock_sop); + if (lock->lk_replay_owner) { + nfs4_get_stateowner(lock->lk_replay_owner); + cstate->replay_owner = lock->lk_replay_owner; + } + nfs4_unlock_state(); + return status; +} + +/* + * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, + * so we do a temporary open here just to get an open file to pass to + * vfs_test_lock. (Arguably perhaps test_lock should be done with an + * inode operation.) 
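+ * nfsd_test_lock() below is that helper: it opens the file with + * NFSD_MAY_READ, runs vfs_test_lock() on the resulting struct file, + * and closes it again straight away.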
+ */ +static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) +{ + struct file *file; + __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (!err) { + err = nfserrno(vfs_test_lock(file, lock)); + nfsd_close(file); + } + return err; +} + +/* + * LOCKT operation + */ +__be32 +nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_lockt *lockt) +{ + struct inode *inode; + struct file_lock file_lock; + __be32 status; + + if (locks_in_grace()) + return nfserr_grace; + + if (check_lock_length(lockt->lt_offset, lockt->lt_length)) + return nfserr_inval; + + lockt->lt_stateowner = NULL; + nfs4_lock_state(); + + status = nfserr_stale_clientid; + if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) + goto out; + + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { + dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n"); + if (status == nfserr_symlink) + status = nfserr_inval; + goto out; + } + + inode = cstate->current_fh.fh_dentry->d_inode; + locks_init_lock(&file_lock); + switch (lockt->lt_type) { + case NFS4_READ_LT: + case NFS4_READW_LT: + file_lock.fl_type = F_RDLCK; + break; + case NFS4_WRITE_LT: + case NFS4_WRITEW_LT: + file_lock.fl_type = F_WRLCK; + break; + default: + dprintk("NFSD: nfs4_lockt: bad lock type!\n"); + status = nfserr_inval; + goto out; + } + + lockt->lt_stateowner = find_lockstateowner_str(inode, + &lockt->lt_clientid, &lockt->lt_owner); + if (lockt->lt_stateowner) + file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; + file_lock.fl_pid = current->tgid; + file_lock.fl_flags = FL_POSIX; + + file_lock.fl_start = lockt->lt_offset; + file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); + + nfs4_transform_lock_offset(&file_lock); + + status = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); + if (status) + goto out; + + if (file_lock.fl_type != F_UNLCK) { + status = nfserr_denied; + nfs4_set_lock_denied(&file_lock, &lockt->lt_denied); + } +out: + nfs4_unlock_state(); + return status; +} + +__be32 +nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_locku *locku) +{ + struct nfs4_stateid *stp; + struct file *filp = NULL; + struct file_lock file_lock; + __be32 status; + int err; + + dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", + (long long) locku->lu_offset, + (long long) locku->lu_length); + + if (check_lock_length(locku->lu_offset, locku->lu_length)) + return nfserr_inval; + + nfs4_lock_state(); + + if ((status = nfs4_preprocess_seqid_op(cstate, + locku->lu_seqid, + &locku->lu_stateid, + LOCK_STATE, + &locku->lu_stateowner, &stp, NULL))) + goto out; + + filp = find_any_file(stp->st_file); + if (!filp) { + status = nfserr_lock_range; + goto out; + } + BUG_ON(!filp); + locks_init_lock(&file_lock); + file_lock.fl_type = F_UNLCK; + file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; + file_lock.fl_pid = current->tgid; + file_lock.fl_file = filp; + file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; + file_lock.fl_start = locku->lu_offset; + + file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); + nfs4_transform_lock_offset(&file_lock); + + /* + * Try to unlock the file in the VFS. + */ + err = vfs_lock_file(filp, F_SETLK, &file_lock, NULL); + if (err) { + dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); + goto out_nfserr; + } + /* + * OK, unlock succeeded; the only thing left to do is update the stateid. 
+ */ + update_stateid(&stp->st_stateid); + memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); + +out: + if (locku->lu_stateowner) { + nfs4_get_stateowner(locku->lu_stateowner); + cstate->replay_owner = locku->lu_stateowner; + } + nfs4_unlock_state(); + return status; + +out_nfserr: + status = nfserrno(err); + goto out; +} + +/* + * returns + * 1: locks held by lockowner + * 0: no locks held by lockowner + */ +static int +check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) +{ + struct file_lock **flpp; + struct inode *inode = filp->fi_inode; + int status = 0; + + lock_flocks(); + for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { + if ((*flpp)->fl_owner == (fl_owner_t)lowner) { + status = 1; + goto out; + } + } +out: + unlock_flocks(); + return status; +} + +__be32 +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_release_lockowner *rlockowner) +{ + clientid_t *clid = &rlockowner->rl_clientid; + struct nfs4_stateowner *sop; + struct nfs4_stateid *stp; + struct xdr_netobj *owner = &rlockowner->rl_owner; + struct list_head matches; + int i; + __be32 status; + + dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", + clid->cl_boot, clid->cl_id); + + /* XXX check for lease expiration */ + + status = nfserr_stale_clientid; + if (STALE_CLIENTID(clid)) + return status; + + nfs4_lock_state(); + + status = nfserr_locks_held; + /* XXX: we're doing a linear search through all the lockowners. + * Yipes! For now we'll just hope clients aren't really using + * release_lockowner much, but eventually we have to fix these + * data structures. */ + INIT_LIST_HEAD(&matches); + for (i = 0; i < LOCK_HASH_SIZE; i++) { + list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { + if (!same_owner_str(sop, owner, clid)) + continue; + list_for_each_entry(stp, &sop->so_stateids, + st_perstateowner) { + if (check_for_locks(stp->st_file, sop)) + goto out; + /* Note: so_perclient unused for lockowners, + * so it's OK to fool with here. */ + list_add(&sop->so_perclient, &matches); + } + } + } + /* Clients probably won't expect us to return with some (but not all) + * of the lockowner state released; so don't release any until all + * have been checked. */ + status = nfs_ok; + while (!list_empty(&matches)) { + sop = list_entry(matches.next, struct nfs4_stateowner, + so_perclient); + /* unhash_stateowner deletes so_perclient only + * for openowners. */ + list_del(&sop->so_perclient); + release_lockowner(sop); + } +out: + nfs4_unlock_state(); + return status; +} + +static inline struct nfs4_client_reclaim * +alloc_reclaim(void) +{ + return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); +} + +int +nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) +{ + unsigned int strhashval = clientstr_hashval(name); + struct nfs4_client *clp; + + clp = find_confirmed_client_by_str(name, strhashval); + return clp ? 1 : 0; +} + +/* + * failure => all reset bets are off, nfserr_no_grace... 
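+ * (If the allocation below fails we return 0 and the client simply + * ends up with no reclaim record, so nfs4_check_open_reclaim() will + * later refuse its reclaim.)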
+ */ +int +nfs4_client_to_reclaim(const char *name) +{ + unsigned int strhashval; + struct nfs4_client_reclaim *crp = NULL; + + dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name); + crp = alloc_reclaim(); + if (!crp) + return 0; + strhashval = clientstr_hashval(name); + INIT_LIST_HEAD(&crp->cr_strhash); + list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); + memcpy(crp->cr_recdir, name, HEXDIR_LEN); + reclaim_str_hashtbl_size++; + return 1; +} + +static void +nfs4_release_reclaim(void) +{ + struct nfs4_client_reclaim *crp = NULL; + int i; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&reclaim_str_hashtbl[i])) { + crp = list_entry(reclaim_str_hashtbl[i].next, + struct nfs4_client_reclaim, cr_strhash); + list_del(&crp->cr_strhash); + kfree(crp); + reclaim_str_hashtbl_size--; + } + } + BUG_ON(reclaim_str_hashtbl_size); +} + +/* + * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ +static struct nfs4_client_reclaim * +nfs4_find_reclaim_client(clientid_t *clid) +{ + unsigned int strhashval; + struct nfs4_client *clp; + struct nfs4_client_reclaim *crp = NULL; + + + /* find clientid in conf_id_hashtbl */ + clp = find_confirmed_client(clid); + if (clp == NULL) + return NULL; + + dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n", + clp->cl_name.len, clp->cl_name.data, + clp->cl_recdir); + + /* find clp->cl_name in reclaim_str_hashtbl */ + strhashval = clientstr_hashval(clp->cl_recdir); + list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) { + if (same_name(crp->cr_recdir, clp->cl_recdir)) { + return crp; + } + } + return NULL; +} + +/* +* Called from OPEN. Look for clientid in reclaim list. +*/ +__be32 +nfs4_check_open_reclaim(clientid_t *clid) +{ + return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad; +} + +/* initialization to perform at module load time: */ + +int +nfs4_state_init(void) +{ + int i, status; + + status = nfsd4_init_slabs(); + if (status) + return status; + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + INIT_LIST_HEAD(&conf_id_hashtbl[i]); + INIT_LIST_HEAD(&conf_str_hashtbl[i]); + INIT_LIST_HEAD(&unconf_str_hashtbl[i]); + INIT_LIST_HEAD(&unconf_id_hashtbl[i]); + INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); + } + for (i = 0; i < SESSION_HASH_SIZE; i++) + INIT_LIST_HEAD(&sessionid_hashtbl[i]); + for (i = 0; i < FILE_HASH_SIZE; i++) { + INIT_LIST_HEAD(&file_hashtbl[i]); + } + for (i = 0; i < OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&ownerstr_hashtbl[i]); + INIT_LIST_HEAD(&ownerid_hashtbl[i]); + } + for (i = 0; i < STATEID_HASH_SIZE; i++) { + INIT_LIST_HEAD(&stateid_hashtbl[i]); + INIT_LIST_HEAD(&lockstateid_hashtbl[i]); + } + for (i = 0; i < LOCK_HASH_SIZE; i++) { + INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); + INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); + } + memset(&onestateid, ~0, sizeof(stateid_t)); + INIT_LIST_HEAD(&close_lru); + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; + return 0; +} + +static void +nfsd4_load_reboot_recovery_data(void) +{ + int status; + + nfs4_lock_state(); + nfsd4_init_recdir(user_recovery_dirname); + status = nfsd4_recdir_load(); + nfs4_unlock_state(); + if (status) + printk("NFSD: Failure reading reboot recovery data\n"); +} + +/* + * Since the lifetime of a delegation isn't limited to that of an open, a + * client may quite reasonably hang on to a delegation as long as it has + * the inode cached. 
This becomes an obvious problem the first time a + * client's inode cache approaches the size of the server's total memory. + * + * For now we avoid this problem by imposing a hard limit on the number + * of delegations, which varies according to the server's memory size. + */ +static void +set_max_delegations(void) +{ + /* + * Allow at most 4 delegations per megabyte of RAM. Quick + * estimates suggest that in the worst case (where every delegation + * is for a different inode), a delegation could take about 1.5K, + * giving a worst case usage of about 6% of memory. + */ + max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); +} + +/* initialization to perform when the nfsd service is started: */ + +static int +__nfs4_state_start(void) +{ + int ret; + + boot_time = get_seconds(); + locks_start_grace(&nfsd4_manager); + printk(KERN_INFO "NFSD: starting %ld-second grace period\n", + nfsd4_grace); + ret = set_callback_cred(); + if (ret) + return -ENOMEM; + laundry_wq = create_singlethread_workqueue("nfsd4"); + if (laundry_wq == NULL) + return -ENOMEM; + ret = nfsd4_create_callback_queue(); + if (ret) + goto out_free_laundry; + queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ); + set_max_delegations(); + return 0; +out_free_laundry: + destroy_workqueue(laundry_wq); + return ret; +} + +int +nfs4_state_start(void) +{ + nfsd4_load_reboot_recovery_data(); + return __nfs4_state_start(); +} + +static void +__nfs4_state_shutdown(void) +{ + int i; + struct nfs4_client *clp = NULL; + struct nfs4_delegation *dp = NULL; + struct list_head *pos, *next, reaplist; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&conf_id_hashtbl[i])) { + clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + expire_client(clp); + } + while (!list_empty(&unconf_str_hashtbl[i])) { + clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash); + expire_client(clp); + } + } + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + list_for_each_safe(pos, next, &del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + list_move(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&recall_lock); + list_for_each_safe(pos, next, &reaplist) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + unhash_delegation(dp); + } + + nfsd4_shutdown_recdir(); +} + +void +nfs4_state_shutdown(void) +{ + cancel_delayed_work_sync(&laundromat_work); + destroy_workqueue(laundry_wq); + locks_end_grace(&nfsd4_manager); + nfs4_lock_state(); + nfs4_release_reclaim(); + __nfs4_state_shutdown(); + nfs4_unlock_state(); + nfsd4_destroy_callback_queue(); +} + +/* + * user_recovery_dirname is protected by the nfsd_mutex since it's only + * accessed when nfsd is starting. + */ +static void +nfs4_set_recdir(char *recdir) +{ + strcpy(user_recovery_dirname, recdir); +} + +/* + * Change the NFSv4 recovery directory to recdir. 
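+ * The new path is looked up with kern_path() and must refer to an + * existing directory; otherwise the old setting is kept and an error + * is returned.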
+ */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct path path; + + status = kern_path(recdir, LOOKUP_FOLLOW, &path); + if (status) + return status; + status = -ENOTDIR; + if (S_ISDIR(path.dentry->d_inode->i_mode)) { + nfs4_set_recdir(recdir); + status = 0; + } + path_put(&path); + return status; +} + +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c new file mode 100644 index 00000000..6c740974 --- /dev/null +++ b/fs/nfsd/nfs4xdr.c @@ -0,0 +1,3408 @@ +/* + * Server-side XDR for NFSv4 + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * TODO: Neil Brown made the following observation: We currently + * initially reserve NFSD_BUFSIZE space on the transmit queue and + * never release any of that until the request is complete. + * It would be good to calculate a new maximum response size while + * decoding the COMPOUND, and call svc_reserve with this number + * at the end of nfs4svc_decode_compoundargs. 
+ */ + +#include +#include +#include +#include +#include + +#include "idmap.h" +#include "acl.h" +#include "xdr4.h" +#include "vfs.h" + + +#define NFSDDBG_FACILITY NFSDDBG_XDR + +/* + * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing + * directory in order to indicate to the client that a filesystem boundary is present + * We use a fixed fsid for a referral + */ +#define NFS4_REFERRAL_FSID_MAJOR 0x8000000ULL +#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL + +static __be32 +check_filename(char *str, int len, __be32 err) +{ + int i; + + if (len == 0) + return nfserr_inval; + if (isdotent(str, len)) + return err; + for (i = 0; i < len; i++) + if (str[i] == '/') + return err; + return 0; +} + +#define DECODE_HEAD \ + __be32 *p; \ + __be32 status +#define DECODE_TAIL \ + status = 0; \ +out: \ + return status; \ +xdr_error: \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + status = nfserr_bad_xdr; \ + goto out + +#define READ32(x) (x) = ntohl(*p++) +#define READ64(x) do { \ + (x) = (u64)ntohl(*p++) << 32; \ + (x) |= ntohl(*p++); \ +} while (0) +#define READTIME(x) do { \ + p++; \ + (x) = ntohl(*p++); \ + p++; \ +} while (0) +#define READMEM(x,nbytes) do { \ + x = (char *)p; \ + p += XDR_QUADLEN(nbytes); \ +} while (0) +#define SAVEMEM(x,nbytes) do { \ + if (!(x = (p==argp->tmp || p == argp->tmpp) ? \ + savemem(argp, p, nbytes) : \ + (char *)p)) { \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + goto xdr_error; \ + } \ + p += XDR_QUADLEN(nbytes); \ +} while (0) +#define COPYMEM(x,nbytes) do { \ + memcpy((x), p, nbytes); \ + p += XDR_QUADLEN(nbytes); \ +} while (0) + +/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */ +#define READ_BUF(nbytes) do { \ + if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \ + p = argp->p; \ + argp->p += XDR_QUADLEN(nbytes); \ + } else if (!(p = read_buf(argp, nbytes))) { \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + goto xdr_error; \ + } \ +} while (0) + +static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) +{ + /* We want more bytes than seem to be available. + * Maybe we need a new page, maybe we have just run out + */ + unsigned int avail = (char *)argp->end - (char *)argp->p; + __be32 *p; + if (avail + argp->pagelen < nbytes) + return NULL; + if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ + return NULL; + /* ok, we can do it with the current plus the next page */ + if (nbytes <= sizeof(argp->tmp)) + p = argp->tmp; + else { + kfree(argp->tmpp); + p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL); + if (!p) + return NULL; + + } + /* + * The following memcpy is safe because read_buf is always + * called with nbytes > avail, and the two cases above both + * guarantee p points to at least nbytes bytes. 
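+ * E.g. with avail == 4 bytes left in the current page and + * nbytes == 12, the first memcpy copies those 4 bytes and the second + * copies the remaining 8 from the start of the next page.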
+ */ + memcpy(p, argp->p, avail); + /* step to next page */ + argp->p = page_address(argp->pagelist[0]); + argp->pagelist++; + if (argp->pagelen < PAGE_SIZE) { + argp->end = argp->p + (argp->pagelen>>2); + argp->pagelen = 0; + } else { + argp->end = argp->p + (PAGE_SIZE>>2); + argp->pagelen -= PAGE_SIZE; + } + memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); + argp->p += XDR_QUADLEN(nbytes - avail); + return p; +} + +static int zero_clientid(clientid_t *clid) +{ + return (clid->cl_boot == 0) && (clid->cl_id == 0); +} + +static int +defer_free(struct nfsd4_compoundargs *argp, + void (*release)(const void *), void *p) +{ + struct tmpbuf *tb; + + tb = kmalloc(sizeof(*tb), GFP_KERNEL); + if (!tb) + return -ENOMEM; + tb->buf = p; + tb->release = release; + tb->next = argp->to_free; + argp->to_free = tb; + return 0; +} + +static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) +{ + if (p == argp->tmp) { + p = kmalloc(nbytes, GFP_KERNEL); + if (!p) + return NULL; + memcpy(p, argp->tmp, nbytes); + } else { + BUG_ON(p != argp->tmpp); + argp->tmpp = NULL; + } + if (defer_free(argp, kfree, p)) { + kfree(p); + return NULL; + } else + return (char *)p; +} + +static __be32 +nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) +{ + u32 bmlen; + DECODE_HEAD; + + bmval[0] = 0; + bmval[1] = 0; + bmval[2] = 0; + + READ_BUF(4); + READ32(bmlen); + if (bmlen > 1000) + goto xdr_error; + + READ_BUF(bmlen << 2); + if (bmlen > 0) + READ32(bmval[0]); + if (bmlen > 1) + READ32(bmval[1]); + if (bmlen > 2) + READ32(bmval[2]); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, + struct iattr *iattr, struct nfs4_acl **acl) +{ + int expected_len, len = 0; + u32 dummy32; + char *buf; + int host_err; + + DECODE_HEAD; + iattr->ia_valid = 0; + if ((status = nfsd4_decode_bitmap(argp, bmval))) + return status; + + READ_BUF(4); + READ32(expected_len); + + if (bmval[0] & FATTR4_WORD0_SIZE) { + READ_BUF(8); + len += 8; + READ64(iattr->ia_size); + iattr->ia_valid |= ATTR_SIZE; + } + if (bmval[0] & FATTR4_WORD0_ACL) { + int nace; + struct nfs4_ace *ace; + + READ_BUF(4); len += 4; + READ32(nace); + + if (nace > NFS4_ACL_MAX) + return nfserr_resource; + + *acl = nfs4_acl_new(nace); + if (*acl == NULL) { + host_err = -ENOMEM; + goto out_nfserr; + } + defer_free(argp, kfree, *acl); + + (*acl)->naces = nace; + for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { + READ_BUF(16); len += 16; + READ32(ace->type); + READ32(ace->flag); + READ32(ace->access_mask); + READ32(dummy32); + READ_BUF(dummy32); + len += XDR_QUADLEN(dummy32) << 2; + READMEM(buf, dummy32); + ace->whotype = nfs4_acl_get_whotype(buf, dummy32); + status = nfs_ok; + if (ace->whotype != NFS4_ACL_WHO_NAMED) + ace->who = 0; + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + status = nfsd_map_name_to_gid(argp->rqstp, + buf, dummy32, &ace->who); + else + status = nfsd_map_name_to_uid(argp->rqstp, + buf, dummy32, &ace->who); + if (status) + return status; + } + } else + *acl = NULL; + if (bmval[1] & FATTR4_WORD1_MODE) { + READ_BUF(4); + len += 4; + READ32(iattr->ia_mode); + iattr->ia_mode &= (S_IFMT | S_IALLUGO); + iattr->ia_valid |= ATTR_MODE; + } + if (bmval[1] & FATTR4_WORD1_OWNER) { + READ_BUF(4); + len += 4; + READ32(dummy32); + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + return status; + iattr->ia_valid |= ATTR_UID; + } + if (bmval[1] & 
FATTR4_WORD1_OWNER_GROUP) { + READ_BUF(4); + len += 4; + READ32(dummy32); + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + return status; + iattr->ia_valid |= ATTR_GID; + } + if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + READ_BUF(4); + len += 4; + READ32(dummy32); + switch (dummy32) { + case NFS4_SET_TO_CLIENT_TIME: + /* We require the high 32 bits of 'seconds' to be 0, and we ignore + all 32 bits of 'nseconds'. */ + READ_BUF(12); + len += 12; + READ32(dummy32); + if (dummy32) + return nfserr_inval; + READ32(iattr->ia_atime.tv_sec); + READ32(iattr->ia_atime.tv_nsec); + if (iattr->ia_atime.tv_nsec >= (u32)1000000000) + return nfserr_inval; + iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); + break; + case NFS4_SET_TO_SERVER_TIME: + iattr->ia_valid |= ATTR_ATIME; + break; + default: + goto xdr_error; + } + } + if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + READ_BUF(4); + len += 4; + READ32(dummy32); + switch (dummy32) { + case NFS4_SET_TO_CLIENT_TIME: + /* We require the high 32 bits of 'seconds' to be 0, and we ignore + all 32 bits of 'nseconds'. */ + READ_BUF(12); + len += 12; + READ32(dummy32); + if (dummy32) + return nfserr_inval; + READ32(iattr->ia_mtime.tv_sec); + READ32(iattr->ia_mtime.tv_nsec); + if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) + return nfserr_inval; + iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); + break; + case NFS4_SET_TO_SERVER_TIME: + iattr->ia_valid |= ATTR_MTIME; + break; + default: + goto xdr_error; + } + } + if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 + || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 + || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) + READ_BUF(expected_len - len); + else if (len != expected_len) + goto xdr_error; + + DECODE_TAIL; + +out_nfserr: + status = nfserrno(host_err); + goto out; +} + +static __be32 +nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + READ32(sid->si_generation); + COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(access->ac_req_access); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); + COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(bcts->dir); + /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker + * could help us figure out we should be using it. 
*/ + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) +{ + DECODE_HEAD; + + close->cl_stateowner = NULL; + READ_BUF(4); + READ32(close->cl_seqid); + return nfsd4_decode_stateid(argp, &close->cl_stateid); + + DECODE_TAIL; +} + + +static __be32 +nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) +{ + DECODE_HEAD; + + READ_BUF(12); + READ64(commit->co_offset); + READ32(commit->co_count); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(create->cr_type); + switch (create->cr_type) { + case NF4LNK: + READ_BUF(4); + READ32(create->cr_linklen); + READ_BUF(create->cr_linklen); + SAVEMEM(create->cr_linkname, create->cr_linklen); + break; + case NF4BLK: + case NF4CHR: + READ_BUF(8); + READ32(create->cr_specdata1); + READ32(create->cr_specdata2); + break; + case NF4SOCK: + case NF4FIFO: + case NF4DIR: + default: + break; + } + + READ_BUF(4); + READ32(create->cr_namelen); + READ_BUF(create->cr_namelen); + SAVEMEM(create->cr_name, create->cr_namelen); + if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) + return status; + + status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, + &create->cr_acl); + if (status) + goto out; + + DECODE_TAIL; +} + +static inline __be32 +nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) +{ + return nfsd4_decode_stateid(argp, &dr->dr_stateid); +} + +static inline __be32 +nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) +{ + return nfsd4_decode_bitmap(argp, getattr->ga_bmval); +} + +static __be32 +nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(link->li_namelen); + READ_BUF(link->li_namelen); + SAVEMEM(link->li_name, link->li_namelen); + if ((status = check_filename(link->li_name, link->li_namelen, nfserr_inval))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +{ + DECODE_HEAD; + + lock->lk_replay_owner = NULL; + /* + * type, reclaim(boolean), offset, length, new_lock_owner(boolean) + */ + READ_BUF(28); + READ32(lock->lk_type); + if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) + goto xdr_error; + READ32(lock->lk_reclaim); + READ64(lock->lk_offset); + READ64(lock->lk_length); + READ32(lock->lk_is_new); + + if (lock->lk_is_new) { + READ_BUF(4); + READ32(lock->lk_new_open_seqid); + status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); + if (status) + return status; + READ_BUF(8 + sizeof(clientid_t)); + READ32(lock->lk_new_lock_seqid); + COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); + READ32(lock->lk_new_owner.len); + READ_BUF(lock->lk_new_owner.len); + READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); + } else { + status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + READ_BUF(4); + READ32(lock->lk_old_lock_seqid); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) +{ + DECODE_HEAD; + + READ_BUF(32); + READ32(lockt->lt_type); + if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) + goto xdr_error; + READ64(lockt->lt_offset); + READ64(lockt->lt_length); + COPYMEM(&lockt->lt_clientid, 8); + READ32(lockt->lt_owner.len); + 
READ_BUF(lockt->lt_owner.len); + READMEM(lockt->lt_owner.data, lockt->lt_owner.len); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) +{ + DECODE_HEAD; + + locku->lu_stateowner = NULL; + READ_BUF(8); + READ32(locku->lu_type); + if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) + goto xdr_error; + READ32(locku->lu_seqid); + status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + if (status) + return status; + READ_BUF(16); + READ64(locku->lu_offset); + READ64(locku->lu_length); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(lookup->lo_len); + READ_BUF(lookup->lo_len); + SAVEMEM(lookup->lo_name, lookup->lo_len); + if ((status = check_filename(lookup->lo_name, lookup->lo_len, nfserr_noent))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + DECODE_HEAD; + + memset(open->op_bmval, 0, sizeof(open->op_bmval)); + open->op_iattr.ia_valid = 0; + open->op_stateowner = NULL; + + /* seqid, share_access, share_deny, clientid, ownerlen */ + READ_BUF(16 + sizeof(clientid_t)); + READ32(open->op_seqid); + READ32(open->op_share_access); + READ32(open->op_share_deny); + COPYMEM(&open->op_clientid, sizeof(clientid_t)); + READ32(open->op_owner.len); + + /* owner, open_flag */ + READ_BUF(open->op_owner.len + 4); + SAVEMEM(open->op_owner.data, open->op_owner.len); + READ32(open->op_create); + switch (open->op_create) { + case NFS4_OPEN_NOCREATE: + break; + case NFS4_OPEN_CREATE: + READ_BUF(4); + READ32(open->op_createmode); + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + case NFS4_CREATE_GUARDED: + status = nfsd4_decode_fattr(argp, open->op_bmval, + &open->op_iattr, &open->op_acl); + if (status) + goto out; + break; + case NFS4_CREATE_EXCLUSIVE: + READ_BUF(8); + COPYMEM(open->op_verf.data, 8); + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + goto xdr_error; + READ_BUF(8); + COPYMEM(open->op_verf.data, 8); + status = nfsd4_decode_fattr(argp, open->op_bmval, + &open->op_iattr, &open->op_acl); + if (status) + goto out; + break; + default: + goto xdr_error; + } + break; + default: + goto xdr_error; + } + + /* open_claim */ + READ_BUF(4); + READ32(open->op_claim_type); + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + READ_BUF(4); + READ32(open->op_fname.len); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); + if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) + return status; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + READ_BUF(4); + READ32(open->op_delegate_type); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + READ_BUF(4); + READ32(open->op_fname.len); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); + if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) + return status; + break; + default: + goto xdr_error; + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) +{ + DECODE_HEAD; + + open_conf->oc_stateowner = NULL; + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (status) + return status; + 
READ_BUF(4); + READ32(open_conf->oc_seqid); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) +{ + DECODE_HEAD; + + open_down->od_stateowner = NULL; + status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + if (status) + return status; + READ_BUF(12); + READ32(open_down->od_seqid); + READ32(open_down->od_share_access); + READ32(open_down->od_share_deny); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(putfh->pf_fhlen); + if (putfh->pf_fhlen > NFS4_FHSIZE) + goto xdr_error; + READ_BUF(putfh->pf_fhlen); + SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &read->rd_stateid); + if (status) + return status; + READ_BUF(12); + READ64(read->rd_offset); + READ32(read->rd_length); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) +{ + DECODE_HEAD; + + READ_BUF(24); + READ64(readdir->rd_cookie); + COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); + READ32(readdir->rd_dircount); /* just in case you needed a useless field... */ + READ32(readdir->rd_maxcount); + if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) + goto out; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(remove->rm_namelen); + READ_BUF(remove->rm_namelen); + SAVEMEM(remove->rm_name, remove->rm_namelen); + if ((status = check_filename(remove->rm_name, remove->rm_namelen, nfserr_noent))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(rename->rn_snamelen); + READ_BUF(rename->rn_snamelen + 4); + SAVEMEM(rename->rn_sname, rename->rn_snamelen); + READ32(rename->rn_tnamelen); + READ_BUF(rename->rn_tnamelen); + SAVEMEM(rename->rn_tname, rename->rn_tnamelen); + if ((status = check_filename(rename->rn_sname, rename->rn_snamelen, nfserr_noent))) + return status; + if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen, nfserr_inval))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(clientid_t)); + COPYMEM(clientid, sizeof(clientid_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo *secinfo) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(secinfo->si_namelen); + READ_BUF(secinfo->si_namelen); + SAVEMEM(secinfo->si_name, secinfo->si_namelen); + status = check_filename(secinfo->si_name, secinfo->si_namelen, + nfserr_noent); + if (status) + return status; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(sin->sin_style); + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) +{ + __be32 status; + + status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + if (status) + return status; + return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, + 
&setattr->sa_acl); +} + +static __be32 +nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) +{ + DECODE_HEAD; + + READ_BUF(12); + COPYMEM(setclientid->se_verf.data, 8); + READ32(setclientid->se_namelen); + + READ_BUF(setclientid->se_namelen + 8); + SAVEMEM(setclientid->se_name, setclientid->se_namelen); + READ32(setclientid->se_callback_prog); + READ32(setclientid->se_callback_netid_len); + + READ_BUF(setclientid->se_callback_netid_len + 4); + SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); + READ32(setclientid->se_callback_addr_len); + + READ_BUF(setclientid->se_callback_addr_len + 4); + SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); + READ32(setclientid->se_callback_ident); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) +{ + DECODE_HEAD; + + READ_BUF(8 + sizeof(nfs4_verifier)); + COPYMEM(&scd_c->sc_clientid, 8); + COPYMEM(&scd_c->sc_confirm, sizeof(nfs4_verifier)); + + DECODE_TAIL; +} + +/* Also used for NVERIFY */ +static __be32 +nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) +{ +#if 0 + struct nfsd4_compoundargs save = { + .p = argp->p, + .end = argp->end, + .rqstp = argp->rqstp, + }; + u32 ve_bmval[2]; + struct iattr ve_iattr; /* request */ + struct nfs4_acl *ve_acl; /* request */ +#endif + DECODE_HEAD; + + if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) + goto out; + + /* For convenience's sake, we compare raw xdr'd attributes in + * nfsd4_proc_verify; however we still decode here just to return + * correct error in case of bad xdr. */ +#if 0 + status = nfsd4_decode_fattr(ve_bmval, &ve_iattr, &ve_acl); + if (status == nfserr_inval) { + status = nfserrno(status); + goto out; + } +#endif + READ_BUF(4); + READ32(verify->ve_attrlen); + READ_BUF(verify->ve_attrlen); + SAVEMEM(verify->ve_attrval, verify->ve_attrlen); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) +{ + int avail; + int v; + int len; + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &write->wr_stateid); + if (status) + return status; + READ_BUF(16); + READ64(write->wr_offset); + READ32(write->wr_stable_how); + if (write->wr_stable_how > 2) + goto xdr_error; + READ32(write->wr_buflen); + + /* Sorry .. no magic macros for this.. 
* + * READ_BUF(write->wr_buflen); + * SAVEMEM(write->wr_buf, write->wr_buflen); + */ + avail = (char*)argp->end - (char*)argp->p; + if (avail + argp->pagelen < write->wr_buflen) { + dprintk("NFSD: xdr error (%s:%d)\n", + __FILE__, __LINE__); + goto xdr_error; + } + argp->rqstp->rq_vec[0].iov_base = p; + argp->rqstp->rq_vec[0].iov_len = avail; + v = 0; + len = write->wr_buflen; + while (len > argp->rqstp->rq_vec[v].iov_len) { + len -= argp->rqstp->rq_vec[v].iov_len; + v++; + argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]); + argp->pagelist++; + if (argp->pagelen >= PAGE_SIZE) { + argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE; + argp->pagelen -= PAGE_SIZE; + } else { + argp->rqstp->rq_vec[v].iov_len = argp->pagelen; + argp->pagelen -= len; + } + } + argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len); + argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2)); + argp->rqstp->rq_vec[v].iov_len = len; + write->wr_vlen = v+1; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) +{ + DECODE_HEAD; + + READ_BUF(12); + COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); + READ32(rlockowner->rl_owner.len); + READ_BUF(rlockowner->rl_owner.len); + READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); + + if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid)) + return nfserr_inval; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + int dummy, tmp; + DECODE_HEAD; + + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); + + READ_BUF(4); + READ32(exid->clname.len); + + READ_BUF(exid->clname.len); + SAVEMEM(exid->clname.data, exid->clname.len); + + READ_BUF(4); + READ32(exid->flags); + + /* Ignore state_protect4_a */ + READ_BUF(4); + READ32(exid->spa_how); + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* spo_must_allow */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + break; + case SP4_SSV: + /* ssp_ops */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* ssp_hash_algs<> */ + READ_BUF(4); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } + + /* ssp_encr_algs<> */ + READ_BUF(4); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } + + /* ssp_window and ssp_num_gss_handles */ + READ_BUF(8); + READ32(dummy); + READ32(dummy); + break; + default: + goto xdr_error; + } + + /* Ignore Implementation ID */ + READ_BUF(4); /* nfs_impl_id4 array length */ + READ32(dummy); + + if (dummy > 1) + goto xdr_error; + + if (dummy == 1) { + /* nii_domain */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_name */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_date */ + READ_BUF(12); + p += 3; + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, + struct nfsd4_create_session *sess) +{ + DECODE_HEAD; + + u32 dummy; + char *machine_name; + int i; + int nr_secflavs; + + READ_BUF(16); + COPYMEM(&sess->clientid, 8); + READ32(sess->seqid); + 
READ32(sess->flags); + + /* Fore channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->fore_channel.maxreq_sz); + READ32(sess->fore_channel.maxresp_sz); + READ32(sess->fore_channel.maxresp_cached); + READ32(sess->fore_channel.maxops); + READ32(sess->fore_channel.maxreqs); + READ32(sess->fore_channel.nr_rdma_attrs); + if (sess->fore_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->fore_channel.rdma_attrs); + } else if (sess->fore_channel.nr_rdma_attrs > 1) { + dprintk("Too many fore channel attr bitmaps!\n"); + goto xdr_error; + } + + /* Back channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->back_channel.maxreq_sz); + READ32(sess->back_channel.maxresp_sz); + READ32(sess->back_channel.maxresp_cached); + READ32(sess->back_channel.maxops); + READ32(sess->back_channel.maxreqs); + READ32(sess->back_channel.nr_rdma_attrs); + if (sess->back_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->back_channel.rdma_attrs); + } else if (sess->back_channel.nr_rdma_attrs > 1) { + dprintk("Too many back channel attr bitmaps!\n"); + goto xdr_error; + } + + READ_BUF(8); + READ32(sess->callback_prog); + + /* callback_sec_params4 */ + READ32(nr_secflavs); + for (i = 0; i < nr_secflavs; ++i) { + READ_BUF(4); + READ32(dummy); + switch (dummy) { + case RPC_AUTH_NULL: + /* Nothing to read */ + break; + case RPC_AUTH_UNIX: + READ_BUF(8); + /* stamp */ + READ32(dummy); + + /* machine name */ + READ32(dummy); + READ_BUF(dummy); + SAVEMEM(machine_name, dummy); + + /* uid, gid */ + READ_BUF(8); + READ32(sess->uid); + READ32(sess->gid); + + /* more gids */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + break; + case RPC_AUTH_GSS: + dprintk("RPC_AUTH_GSS callback secflavor " + "not supported!\n"); + READ_BUF(8); + /* gcbp_service */ + READ32(dummy); + /* gcbp_handle_from_server */ + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + /* gcbp_handle_from_client */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + break; + default: + dprintk("Illegal callback secflavor\n"); + return nfserr_inval; + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_session *destroy_session) +{ + DECODE_HEAD; + READ_BUF(NFS4_MAX_SESSIONID_LEN); + COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(seq->seqid); + READ32(seq->slotid); + READ32(seq->maxslots); + READ32(seq->cachethis); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(rc->rca_one_fs); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +{ + return nfs_ok; +} + +static __be32 +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +{ + return nfserr_notsupp; +} + +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); + +static nfsd4_dec nfsd4_dec_ops[] = { + [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] = 
(nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] = (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, + [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, +}; + +static nfsd4_dec nfsd41_dec_ops[] = { + [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] = (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SETCLIENTID_CONFIRM]= (nfsd4_dec)nfsd4_decode_notsupp, + [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp, + + /* new operations for NFSv4.1 */ + [OP_BACKCHANNEL_CTL] = 
(nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, + [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, +}; + +struct nfsd4_minorversion_ops { + nfsd4_dec *decoders; + int nops; +}; + +static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { + [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, + [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, +}; + +static __be32 +nfsd4_decode_compound(struct nfsd4_compoundargs *argp) +{ + DECODE_HEAD; + struct nfsd4_op *op; + struct nfsd4_minorversion_ops *ops; + int i; + + /* + * XXX: According to spec, we should check the tag + * for UTF-8 compliance. I'm postponing this for + * now because it seems that some clients do use + * binary tags. + */ + READ_BUF(4); + READ32(argp->taglen); + READ_BUF(argp->taglen + 8); + SAVEMEM(argp->tag, argp->taglen); + READ32(argp->minorversion); + READ32(argp->opcnt); + + if (argp->taglen > NFSD4_MAX_TAGLEN) + goto xdr_error; + if (argp->opcnt > 100) + goto xdr_error; + + if (argp->opcnt > ARRAY_SIZE(argp->iops)) { + argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); + if (!argp->ops) { + argp->ops = argp->iops; + dprintk("nfsd: couldn't allocate room for COMPOUND\n"); + goto xdr_error; + } + } + + if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion)) + argp->opcnt = 0; + + ops = &nfsd4_minorversion[argp->minorversion]; + for (i = 0; i < argp->opcnt; i++) { + op = &argp->ops[i]; + op->replay = NULL; + + /* + * We can't use READ_BUF() here because we need to handle + * a missing opcode as an OP_WRITE + 1. So we need to check + * to see if we're truly at the end of our buffer or if there + * is another page we need to flip to. + */ + + if (argp->p == argp->end) { + if (argp->pagelen < 4) { + /* There isn't an opcode still on the wire */ + op->opnum = OP_WRITE + 1; + op->status = nfserr_bad_xdr; + argp->opcnt = i+1; + break; + } + + /* + * False alarm. We just hit a page boundary, but there + * is still data available. Move pointer across page + * boundary. 
*snip from READ_BUF* + */ + argp->p = page_address(argp->pagelist[0]); + argp->pagelist++; + if (argp->pagelen < PAGE_SIZE) { + argp->end = argp->p + (argp->pagelen>>2); + argp->pagelen = 0; + } else { + argp->end = argp->p + (PAGE_SIZE>>2); + argp->pagelen -= PAGE_SIZE; + } + } + op->opnum = ntohl(*argp->p++); + + if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP) + op->status = ops->decoders[op->opnum](argp, &op->u); + else { + op->opnum = OP_ILLEGAL; + op->status = nfserr_op_illegal; + } + + if (op->status) { + argp->opcnt = i+1; + break; + } + } + + DECODE_TAIL; +} + +#define WRITE32(n) *p++ = htonl(n) +#define WRITE64(n) do { \ + *p++ = htonl((u32)((n) >> 32)); \ + *p++ = htonl((u32)(n)); \ +} while (0) +#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \ + *(p + XDR_QUADLEN(nbytes) -1) = 0; \ + memcpy(p, ptr, nbytes); \ + p += XDR_QUADLEN(nbytes); \ +}} while (0) + +static void write32(__be32 **p, u32 n) +{ + *(*p)++ = n; +} + +static void write64(__be32 **p, u64 n) +{ + write32(p, (u32)(n >> 32)); + write32(p, (u32)n); +} + +static void write_change(__be32 **p, struct kstat *stat, struct inode *inode) +{ + if (IS_I_VERSION(inode)) { + write64(p, inode->i_version); + } else { + write32(p, stat->ctime.tv_sec); + write32(p, stat->ctime.tv_nsec); + } +} + +static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) +{ + write32(p, c->atomic); + if (c->change_supported) { + write64(p, c->before_change); + write64(p, c->after_change); + } else { + write32(p, c->before_ctime_sec); + write32(p, c->before_ctime_nsec); + write32(p, c->after_ctime_sec); + write32(p, c->after_ctime_nsec); + } +} + +#define RESERVE_SPACE(nbytes) do { \ + p = resp->p; \ + BUG_ON(p + XDR_QUADLEN(nbytes) > resp->end); \ +} while (0) +#define ADJUST_ARGS() resp->p = p + +/* + * Header routine to setup seqid operation replay cache + */ +#define ENCODE_SEQID_OP_HEAD \ + __be32 *save; \ + \ + save = resp->p; + +static bool seqid_mutating_err(__be32 err) +{ + /* rfc 3530 section 8.1.5: */ + return err != nfserr_stale_clientid && + err != nfserr_stale_stateid && + err != nfserr_bad_stateid && + err != nfserr_bad_seqid && + err != nfserr_bad_xdr && + err != nfserr_resource && + err != nfserr_nofilehandle; +} + +/* + * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This + * is where sequence id's are incremented, and the replay cache is filled. + * Note that we increment sequence id's here, at the last moment, so we're sure + * we know whether the error to be returned is a sequence id mutating error. + */ + +#define ENCODE_SEQID_OP_TAIL(stateowner) do { \ + if (seqid_mutating_err(nfserr) && stateowner) { \ + stateowner->so_seqid++; \ + stateowner->so_replay.rp_status = nfserr; \ + stateowner->so_replay.rp_buflen = \ + (((char *)(resp)->p - (char *)save)); \ + memcpy(stateowner->so_replay.rp_buf, save, \ + stateowner->so_replay.rp_buflen); \ + } } while (0); + +/* Encode as an array of strings the string given with components + * separated @sep. 
+ */ +static __be32 nfsd4_encode_components(char sep, char *components, + __be32 **pp, int *buflen) +{ + __be32 *p = *pp; + __be32 *countp = p; + int strlen, count=0; + char *str, *end; + + dprintk("nfsd4_encode_components(%s)\n", components); + if ((*buflen -= 4) < 0) + return nfserr_resource; + WRITE32(0); /* We will fill this in with @count later */ + end = str = components; + while (*end) { + for (; *end && (*end != sep); end++) + ; /* Point to end of component */ + strlen = end - str; + if (strlen) { + if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0) + return nfserr_resource; + WRITE32(strlen); + WRITEMEM(str, strlen); + count++; + } + else + end++; + str = end; + } + *pp = p; + p = countp; + WRITE32(count); + return 0; +} + +/* + * encode a location element of a fs_locations structure + */ +static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, + __be32 **pp, int *buflen) +{ + __be32 status; + __be32 *p = *pp; + + status = nfsd4_encode_components(':', location->hosts, &p, buflen); + if (status) + return status; + status = nfsd4_encode_components('/', location->path, &p, buflen); + if (status) + return status; + *pp = p; + return 0; +} + +/* + * Return the path to an export point in the pseudo filesystem namespace + * Returned string is safe to use as long as the caller holds a reference + * to @exp. + */ +static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) +{ + struct svc_fh tmp_fh; + char *path = NULL, *rootpath; + size_t rootlen; + + fh_init(&tmp_fh, NFS4_FHSIZE); + *stat = exp_pseudoroot(rqstp, &tmp_fh); + if (*stat) + return NULL; + rootpath = tmp_fh.fh_export->ex_pathname; + + path = exp->ex_pathname; + + rootlen = strlen(rootpath); + if (strncmp(path, rootpath, rootlen)) { + dprintk("nfsd: fs_locations failed;" + "%s is not contained in %s\n", path, rootpath); + *stat = nfserr_notsupp; + path = NULL; + goto out; + } + path += rootlen; +out: + fh_put(&tmp_fh); + return path; +} + +/* + * encode a fs_locations structure + */ +static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, + struct svc_export *exp, + __be32 **pp, int *buflen) +{ + __be32 status; + int i; + __be32 *p = *pp; + struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; + char *root = nfsd4_path(rqstp, exp, &status); + + if (status) + return status; + status = nfsd4_encode_components('/', root, &p, buflen); + if (status) + return status; + if ((*buflen -= 4) < 0) + return nfserr_resource; + WRITE32(fslocs->locations_count); + for (i=0; i<fslocs->locations_count; i++) { + status = nfsd4_encode_fs_location4(&fslocs->locations[i], + &p, buflen); + if (status) + return status; + } + *pp = p; + return 0; +} + +static u32 nfs4_ftypes[16] = { + NF4BAD, NF4FIFO, NF4CHR, NF4BAD, + NF4DIR, NF4BAD, NF4BLK, NF4BAD, + NF4REG, NF4BAD, NF4LNK, NF4BAD, + NF4SOCK, NF4BAD, NF4LNK, NF4BAD, +}; + +static __be32 +nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, + __be32 **p, int *buflen) +{ + int status; + + if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4) + return nfserr_resource; + if (whotype != NFS4_ACL_WHO_NAMED) + status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); + else if (group) + status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1)); + else + status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1)); + if (status < 0) + return nfserrno(status); + *p = xdr_encode_opaque(*p, NULL, status); + *buflen -= (XDR_QUADLEN(status) << 2) + 4; + BUG_ON(*buflen < 0); + return 0; +} + +static inline __be32 +nfsd4_encode_user(struct svc_rqst
*rqstp, uid_t uid, __be32 **p, int *buflen) +{ + return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen); +} + +static inline __be32 +nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen) +{ + return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen); +} + +static inline __be32 +nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group, + __be32 **p, int *buflen) +{ + return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen); +} + +#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ + FATTR4_WORD0_RDATTR_ERROR) +#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID + +static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) +{ + /* As per referral draft: */ + if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS || + *bmval1 & ~WORD1_ABSENT_FS_ATTRS) { + if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR || + *bmval0 & FATTR4_WORD0_FS_LOCATIONS) + *rdattr_err = NFSERR_MOVED; + else + return nfserr_moved; + } + *bmval0 &= WORD0_ABSENT_FS_ATTRS; + *bmval1 &= WORD1_ABSENT_FS_ATTRS; + return 0; +} + +/* + * Note: @fhp can be NULL; in this case, we might have to compose the filehandle + * ourselves. + * + * @countp is the buffer size in _words_; upon successful return this becomes + * replaced with the number of words written. + */ +__be32 +nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval, + struct svc_rqst *rqstp, int ignore_crossmnt) +{ + u32 bmval0 = bmval[0]; + u32 bmval1 = bmval[1]; + u32 bmval2 = bmval[2]; + struct kstat stat; + struct svc_fh tempfh; + struct kstatfs statfs; + int buflen = *countp << 2; + __be32 *attrlenp; + u32 dummy; + u64 dummy64; + u32 rdattr_err = 0; + __be32 *p = buffer; + __be32 status; + int err; + int aclsupport = 0; + struct nfs4_acl *acl = NULL; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; + struct path path = { + .mnt = exp->ex_path.mnt, + .dentry = dentry, + }; + + BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); + BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); + BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); + BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); + + if (exp->ex_fslocs.migrated) { + BUG_ON(bmval[2]); + status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); + if (status) + goto out; + } + + err = vfs_getattr(exp->ex_path.mnt, dentry, &stat); + if (err) + goto out_nfserr; + if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | + FATTR4_WORD0_MAXNAME)) || + (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + FATTR4_WORD1_SPACE_TOTAL))) { + err = vfs_statfs(&path, &statfs); + if (err) + goto out_nfserr; + } + if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { + fh_init(&tempfh, NFS4_FHSIZE); + status = fh_compose(&tempfh, exp, dentry, NULL); + if (status) + goto out; + fhp = &tempfh; + } + if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT + | FATTR4_WORD0_SUPPORTED_ATTRS)) { + err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); + aclsupport = (err == 0); + if (bmval0 & FATTR4_WORD0_ACL) { + if (err == -EOPNOTSUPP) + bmval0 &= ~FATTR4_WORD0_ACL; + else if (err == -EINVAL) { + status = nfserr_attrnotsupp; + goto out; + } else if (err != 0) + goto out_nfserr; + } + } + + if (bmval2) { + if ((buflen -= 16) < 0) + goto out_resource; + WRITE32(3); + WRITE32(bmval0); + WRITE32(bmval1); + WRITE32(bmval2); + } else if (bmval1) { + if ((buflen -= 12) < 0) + goto 
out_resource; + WRITE32(2); + WRITE32(bmval0); + WRITE32(bmval1); + } else { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE32(1); + WRITE32(bmval0); + } + attrlenp = p++; /* to be backfilled later */ + + if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + u32 word0 = nfsd_suppattrs0(minorversion); + u32 word1 = nfsd_suppattrs1(minorversion); + u32 word2 = nfsd_suppattrs2(minorversion); + + if (!aclsupport) + word0 &= ~FATTR4_WORD0_ACL; + if (!word2) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(2); + WRITE32(word0); + WRITE32(word1); + } else { + if ((buflen -= 16) < 0) + goto out_resource; + WRITE32(3); + WRITE32(word0); + WRITE32(word1); + WRITE32(word2); + } + } + if (bmval0 & FATTR4_WORD0_TYPE) { + if ((buflen -= 4) < 0) + goto out_resource; + dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; + if (dummy == NF4BAD) + goto out_serverfault; + WRITE32(dummy); + } + if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { + if ((buflen -= 4) < 0) + goto out_resource; + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) + WRITE32(NFS4_FH_PERSISTENT); + else + WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME); + } + if (bmval0 & FATTR4_WORD0_CHANGE) { + if ((buflen -= 8) < 0) + goto out_resource; + write_change(&p, &stat, dentry->d_inode); + } + if (bmval0 & FATTR4_WORD0_SIZE) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64(stat.size); + } + if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(0); + } + if (bmval0 & FATTR4_WORD0_FSID) { + if ((buflen -= 16) < 0) + goto out_resource; + if (exp->ex_fslocs.migrated) { + WRITE64(NFS4_REFERRAL_FSID_MAJOR); + WRITE64(NFS4_REFERRAL_FSID_MINOR); + } else switch(fsid_source(fhp)) { + case FSIDSOURCE_FSID: + WRITE64((u64)exp->ex_fsid); + WRITE64((u64)0); + break; + case FSIDSOURCE_DEV: + WRITE32(0); + WRITE32(MAJOR(stat.dev)); + WRITE32(0); + WRITE32(MINOR(stat.dev)); + break; + case FSIDSOURCE_UUID: + WRITEMEM(exp->ex_uuid, 16); + break; + } + } + if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(0); + } + if (bmval0 & FATTR4_WORD0_LEASE_TIME) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(nfsd4_lease); + } + if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(rdattr_err); + } + if (bmval0 & FATTR4_WORD0_ACL) { + struct nfs4_ace *ace; + + if (acl == NULL) { + if ((buflen -= 4) < 0) + goto out_resource; + + WRITE32(0); + goto out_acl; + } + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(acl->naces); + + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + if ((buflen -= 4*3) < 0) + goto out_resource; + WRITE32(ace->type); + WRITE32(ace->flag); + WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); + status = nfsd4_encode_aclname(rqstp, ace->whotype, + ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP, + &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + } +out_acl: + if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(aclsupport ? 
+ ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); + } + if (bmval0 & FATTR4_WORD0_CANSETTIME) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_FILEHANDLE) { + buflen -= (XDR_QUADLEN(fhp->fh_handle.fh_size) << 2) + 4; + if (buflen < 0) + goto out_resource; + WRITE32(fhp->fh_handle.fh_size); + WRITEMEM(&fhp->fh_handle.fh_base, fhp->fh_handle.fh_size); + } + if (bmval0 & FATTR4_WORD0_FILEID) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64(stat.ino); + } + if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) statfs.f_ffree); + } + if (bmval0 & FATTR4_WORD0_FILES_FREE) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) statfs.f_ffree); + } + if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) statfs.f_files); + } + if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { + status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64(~(u64)0); + } + if (bmval0 & FATTR4_WORD0_MAXLINK) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(255); + } + if (bmval0 & FATTR4_WORD0_MAXNAME) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(statfs.f_namelen); + } + if (bmval0 & FATTR4_WORD0_MAXREAD) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) svc_max_payload(rqstp)); + } + if (bmval0 & FATTR4_WORD0_MAXWRITE) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE64((u64) svc_max_payload(rqstp)); + } + if (bmval1 & FATTR4_WORD1_MODE) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(stat.mode & S_IALLUGO); + } + if (bmval1 & FATTR4_WORD1_NO_TRUNC) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(1); + } + if (bmval1 & FATTR4_WORD1_NUMLINKS) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(stat.nlink); + } + if (bmval1 & FATTR4_WORD1_OWNER) { + status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { + status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_RAWDEV) { + if ((buflen -= 8) < 0) + goto out_resource; + WRITE32((u32) MAJOR(stat.rdev)); + WRITE32((u32) MINOR(stat.rdev)); + } + if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { + if ((buflen -= 8) < 0) + goto out_resource; + dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; + WRITE64(dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_FREE) { + if ((buflen -= 8) < 0) + goto out_resource; + dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; + WRITE64(dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { + if ((buflen -= 8) < 0) + goto out_resource; + dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; + WRITE64(dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_USED) { + if ((buflen -= 8) < 
0) + goto out_resource; + dummy64 = (u64)stat.blocks << 9; + WRITE64(dummy64); + } + if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(0); + WRITE32(stat.atime.tv_sec); + WRITE32(stat.atime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_TIME_DELTA) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(0); + WRITE32(1); + WRITE32(0); + } + if (bmval1 & FATTR4_WORD1_TIME_METADATA) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(0); + WRITE32(stat.ctime.tv_sec); + WRITE32(stat.ctime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(0); + WRITE32(stat.mtime.tv_sec); + WRITE32(stat.mtime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { + if ((buflen -= 8) < 0) + goto out_resource; + /* + * Get parent's attributes if not ignoring crossmount + * and this is the root of a cross-mounted filesystem. + */ + if (ignore_crossmnt == 0 && + dentry == exp->ex_path.mnt->mnt_root) { + struct path path = exp->ex_path; + path_get(&path); + while (follow_up(&path)) { + if (path.dentry != path.mnt->mnt_root) + break; + } + err = vfs_getattr(path.mnt, path.dentry, &stat); + path_put(&path); + if (err) + goto out_nfserr; + } + WRITE64(stat.ino); + } + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + } + + *attrlenp = htonl((char *)p - (char *)attrlenp - 4); + *countp = p - buffer; + status = nfs_ok; + +out: + kfree(acl); + if (fhp == &tempfh) + fh_put(&tempfh); + return status; +out_nfserr: + status = nfserrno(err); + goto out; +out_resource: + *countp = 0; + status = nfserr_resource; + goto out; +out_serverfault: + status = nfserr_serverfault; + goto out; +} + +static inline int attributes_need_mount(u32 *bmval) +{ + if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) + return 1; + if (bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) + return 1; + return 0; +} + +static __be32 +nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, + const char *name, int namlen, __be32 *p, int *buflen) +{ + struct svc_export *exp = cd->rd_fhp->fh_export; + struct dentry *dentry; + __be32 nfserr; + int ignore_crossmnt = 0; + + dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); + if (IS_ERR(dentry)) + return nfserrno(PTR_ERR(dentry)); + if (!dentry->d_inode) { + /* + * nfsd_buffered_readdir drops the i_mutex between + * readdir and calling this callback, leaving a window + * where this directory entry could have gone away. + */ + dput(dentry); + return nfserr_noent; + } + + exp_get(exp); + /* + * In the case of a mountpoint, the client may be asking for + * attributes that are only properties of the underlying filesystem + * as opposed to the cross-mounted file system. In such a case, + * we will not follow the cross mount and will fill the attribtutes + * directly from the mountpoint dentry. + */ + if (nfsd_mountpoint(dentry, exp)) { + int err; + + if (!(exp->ex_flags & NFSEXP_V4ROOT) + && !attributes_need_mount(cd->rd_bmval)) { + ignore_crossmnt = 1; + goto out_encode; + } + /* + * Why the heck aren't we just using nfsd_lookup?? + * Different "."/".." handling? Something else? + * At least, add a comment here to explain.... 
+ */ + err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp); + if (err) { + nfserr = nfserrno(err); + goto out_put; + } + nfserr = check_nfsd_access(exp, cd->rd_rqstp); + if (nfserr) + goto out_put; + + } +out_encode: + nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, + cd->rd_rqstp, ignore_crossmnt); +out_put: + dput(dentry); + exp_put(exp); + return nfserr; +} + +static __be32 * +nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) +{ + __be32 *attrlenp; + + if (buflen < 6) + return NULL; + *p++ = htonl(2); + *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ + *p++ = htonl(0); /* bmval1 */ + + attrlenp = p++; + *p++ = nfserr; /* no htonl */ + *attrlenp = htonl((char *)p - (char *)attrlenp - 4); + return p; +} + +static int +nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = ccdv; + struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); + int buflen; + __be32 *p = cd->buffer; + __be32 *cookiep; + __be32 nfserr = nfserr_toosmall; + + /* In nfsv4, "." and ".." never make it onto the wire.. */ + if (name && isdotent(name, namlen)) { + cd->common.err = nfs_ok; + return 0; + } + + if (cd->offset) + xdr_encode_hyper(cd->offset, (u64) offset); + + buflen = cd->buflen - 4 - XDR_QUADLEN(namlen); + if (buflen < 0) + goto fail; + + *p++ = xdr_one; /* mark entry present */ + cookiep = p; + p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ + p = xdr_encode_array(p, name, namlen); /* name length & name */ + + nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen); + switch (nfserr) { + case nfs_ok: + p += buflen; + break; + case nfserr_resource: + nfserr = nfserr_toosmall; + goto fail; + case nfserr_noent: + goto skip_entry; + default: + /* + * If the client requested the RDATTR_ERROR attribute, + * we stuff the error code into this attribute + * and continue. If this attribute was not requested, + * then in accordance with the spec, we fail the + * entire READDIR operation(!) + */ + if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) + goto fail; + p = nfsd4_encode_rdattr_error(p, buflen, nfserr); + if (p == NULL) { + nfserr = nfserr_toosmall; + goto fail; + } + } + cd->buflen -= (p - cd->buffer); + cd->buffer = p; + cd->offset = cookiep; +skip_entry: + cd->common.err = nfs_ok; + return 0; +fail: + cd->common.err = nfserr; + return -EINVAL; +} + +static void +nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +{ + __be32 *p; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + ADJUST_ARGS(); +} + +static __be32 +nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(8); + WRITE32(access->ac_supported); + WRITE32(access->ac_resp_access); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); + WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(bcts->dir); + /* XXX: ? 
*/ + WRITE32(0); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &close->cl_stateid); + + ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + return nfserr; +} + + +static __be32 +nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(32); + write_cinfo(&p, &create->cr_cinfo); + WRITE32(2); + WRITE32(create->cr_bmval[0]); + WRITE32(create->cr_bmval[1]); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) +{ + struct svc_fh *fhp = getattr->ga_fhp; + int buflen; + + if (nfserr) + return nfserr; + + buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); + nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, + resp->p, &buflen, getattr->ga_bmval, + resp->rqstp, 0); + if (!nfserr) + resp->p += buflen; + return nfserr; +} + +static __be32 +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) +{ + struct svc_fh *fhp = *fhpp; + unsigned int len; + __be32 *p; + + if (!nfserr) { + len = fhp->fh_handle.fh_size; + RESERVE_SPACE(len + 4); + WRITE32(len); + WRITEMEM(&fhp->fh_handle.fh_base, len); + ADJUST_ARGS(); + } + return nfserr; +} + +/* +* Including all fields other than the name, a LOCK4denied structure requires +* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. +*/ +static void +nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) +{ + __be32 *p; + + RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? 
ld->ld_sop->so_owner.len : 0)); + WRITE64(ld->ld_start); + WRITE64(ld->ld_length); + WRITE32(ld->ld_type); + if (ld->ld_sop) { + WRITEMEM(&ld->ld_clientid, 8); + WRITE32(ld->ld_sop->so_owner.len); + WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); + kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); + } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ + WRITE64((u64)0); /* clientid */ + WRITE32(0); /* length of owner name */ + } + ADJUST_ARGS(); +} + +static __be32 +nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + else if (nfserr == nfserr_denied) + nfsd4_encode_lock_denied(resp, &lock->lk_denied); + + ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + return nfserr; +} + +static __be32 +nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) +{ + if (nfserr == nfserr_denied) + nfsd4_encode_lock_denied(resp, &lockt->lt_denied); + return nfserr; +} + +static __be32 +nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &locku->lu_stateid); + + ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + return nfserr; +} + + +static __be32 +nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(20); + write_cinfo(&p, &link->li_cinfo); + ADJUST_ARGS(); + } + return nfserr; +} + + +static __be32 +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) +{ + __be32 *p; + ENCODE_SEQID_OP_HEAD; + + if (nfserr) + goto out; + + nfsd4_encode_stateid(resp, &open->op_stateid); + RESERVE_SPACE(40); + write_cinfo(&p, &open->op_cinfo); + WRITE32(open->op_rflags); + WRITE32(2); + WRITE32(open->op_bmval[0]); + WRITE32(open->op_bmval[1]); + WRITE32(open->op_delegate_type); + ADJUST_ARGS(); + + switch (open->op_delegate_type) { + case NFS4_OPEN_DELEGATE_NONE: + break; + case NFS4_OPEN_DELEGATE_READ: + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(20); + WRITE32(open->op_recall); + + /* + * TODO: ACE's in delegations + */ + WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + WRITE32(0); + WRITE32(0); + WRITE32(0); /* XXX: is NULL principal ok? */ + ADJUST_ARGS(); + break; + case NFS4_OPEN_DELEGATE_WRITE: + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(32); + WRITE32(0); + + /* + * TODO: space_limit's in delegations + */ + WRITE32(NFS4_LIMIT_SIZE); + WRITE32(~(u32)0); + WRITE32(~(u32)0); + + /* + * TODO: ACE's in delegations + */ + WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + WRITE32(0); + WRITE32(0); + WRITE32(0); /* XXX: is NULL principal ok? 
*/ + ADJUST_ARGS(); + break; + default: + BUG(); + } + /* XXX save filehandle here */ +out: + ENCODE_SEQID_OP_TAIL(open->op_stateowner); + return nfserr; +} + +static __be32 +nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); + + ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + return nfserr; +} + +static __be32 +nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) +{ + ENCODE_SEQID_OP_HEAD; + + if (!nfserr) + nfsd4_encode_stateid(resp, &od->od_stateid); + + ENCODE_SEQID_OP_TAIL(od->od_stateowner); + return nfserr; +} + +static __be32 +nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_read *read) +{ + u32 eof; + int v, pn; + unsigned long maxcount; + long len; + __be32 *p; + + if (nfserr) + return nfserr; + if (resp->xbuf->page_len) + return nfserr_resource; + + RESERVE_SPACE(8); /* eof flag and byte count */ + + maxcount = svc_max_payload(resp->rqstp); + if (maxcount > read->rd_length) + maxcount = read->rd_length; + + len = maxcount; + v = 0; + while (len > 0) { + pn = resp->rqstp->rq_resused++; + resp->rqstp->rq_vec[v].iov_base = + page_address(resp->rqstp->rq_respages[pn]); + resp->rqstp->rq_vec[v].iov_len = + len < PAGE_SIZE ? len : PAGE_SIZE; + v++; + len -= PAGE_SIZE; + } + read->rd_vlen = v; + + nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); + + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; + if (nfserr) + return nfserr; + eof = (read->rd_offset + maxcount >= + read->rd_fhp->fh_dentry->d_inode->i_size); + + WRITE32(eof); + WRITE32(maxcount); + ADJUST_ARGS(); + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; + + /* Use rest of head for padding and remaining ops: */ + resp->xbuf->tail[0].iov_base = p; + resp->xbuf->tail[0].iov_len = 0; + if (maxcount&3) { + RESERVE_SPACE(4); + WRITE32(0); + resp->xbuf->tail[0].iov_base += maxcount&3; + resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); + } + return 0; +} + +static __be32 +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +{ + int maxcount; + char *page; + __be32 *p; + + if (nfserr) + return nfserr; + if (resp->xbuf->page_len) + return nfserr_resource; + + page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); + + maxcount = PAGE_SIZE; + RESERVE_SPACE(4); + + /* + * XXX: By default, the ->readlink() VFS op will truncate symlinks + * if they would overflow the buffer. Is this kosher in NFSv4? If + * not, one easy fix is: if ->readlink() precisely fills the buffer, + * assume that truncation occurred, and return NFS4ERR_RESOURCE. 
+ */ + nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, page, &maxcount); + if (nfserr == nfserr_isdir) + return nfserr_inval; + if (nfserr) + return nfserr; + + WRITE32(maxcount); + ADJUST_ARGS(); + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; + + /* Use rest of head for padding and remaining ops: */ + resp->xbuf->tail[0].iov_base = p; + resp->xbuf->tail[0].iov_len = 0; + if (maxcount&3) { + RESERVE_SPACE(4); + WRITE32(0); + resp->xbuf->tail[0].iov_base += maxcount&3; + resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); + } + return 0; +} + +static __be32 +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) +{ + int maxcount; + loff_t offset; + __be32 *page, *savep, *tailbase; + __be32 *p; + + if (nfserr) + return nfserr; + if (resp->xbuf->page_len) + return nfserr_resource; + + RESERVE_SPACE(8); /* verifier */ + savep = p; + + /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ + WRITE32(0); + WRITE32(0); + ADJUST_ARGS(); + resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + tailbase = p; + + maxcount = PAGE_SIZE; + if (maxcount > readdir->rd_maxcount) + maxcount = readdir->rd_maxcount; + + /* + * Convert from bytes to words, account for the two words already + * written, make sure to leave two words at the end for the next + * pointer and eof field. + */ + maxcount = (maxcount >> 2) - 4; + if (maxcount < 0) { + nfserr = nfserr_toosmall; + goto err_no_verf; + } + + page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); + readdir->common.err = 0; + readdir->buflen = maxcount; + readdir->buffer = page; + readdir->offset = NULL; + + offset = readdir->rd_cookie; + nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, + &offset, + &readdir->common, nfsd4_encode_dirent); + if (nfserr == nfs_ok && + readdir->common.err == nfserr_toosmall && + readdir->buffer == page) + nfserr = nfserr_toosmall; + if (nfserr == nfserr_symlink) + nfserr = nfserr_notdir; + if (nfserr) + goto err_no_verf; + + if (readdir->offset) + xdr_encode_hyper(readdir->offset, offset); + + p = readdir->buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl(readdir->common.err == nfserr_eof); + resp->xbuf->page_len = ((char*)p) - (char*)page_address( + resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); + + /* Use rest of head for padding and remaining ops: */ + resp->xbuf->tail[0].iov_base = tailbase; + resp->xbuf->tail[0].iov_len = 0; + resp->p = resp->xbuf->tail[0].iov_base; + resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; + + return 0; +err_no_verf: + p = savep; + ADJUST_ARGS(); + return nfserr; +} + +static __be32 +nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(20); + write_cinfo(&p, &remove->rm_cinfo); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(40); + write_cinfo(&p, &rename->rn_sinfo); + write_cinfo(&p, &rename->rn_tinfo); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, + __be32 nfserr,struct svc_export *exp) +{ + int i = 0; + u32 nflavs; + struct exp_flavor_info *flavs; + struct exp_flavor_info def_flavs[2]; + __be32 *p; + + if (nfserr) + goto out; + if 
(exp->ex_nflavors) { + flavs = exp->ex_flavors; + nflavs = exp->ex_nflavors; + } else { /* Handling of some defaults in absence of real secinfo: */ + flavs = def_flavs; + if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) { + nflavs = 2; + flavs[0].pseudoflavor = RPC_AUTH_UNIX; + flavs[1].pseudoflavor = RPC_AUTH_NULL; + } else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) { + nflavs = 1; + flavs[0].pseudoflavor + = svcauth_gss_flavor(exp->ex_client); + } else { + nflavs = 1; + flavs[0].pseudoflavor + = exp->ex_client->flavour->flavour; + } + } + + RESERVE_SPACE(4); + WRITE32(nflavs); + ADJUST_ARGS(); + for (i = 0; i < nflavs; i++) { + u32 flav = flavs[i].pseudoflavor; + struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav); + + if (gm) { + RESERVE_SPACE(4); + WRITE32(RPC_AUTH_GSS); + ADJUST_ARGS(); + RESERVE_SPACE(4 + gm->gm_oid.len); + WRITE32(gm->gm_oid.len); + WRITEMEM(gm->gm_oid.data, gm->gm_oid.len); + ADJUST_ARGS(); + RESERVE_SPACE(4); + WRITE32(0); /* qop */ + ADJUST_ARGS(); + RESERVE_SPACE(4); + WRITE32(gss_pseudoflavor_to_service(gm, flav)); + ADJUST_ARGS(); + gss_mech_put(gm); + } else { + RESERVE_SPACE(4); + WRITE32(flav); + ADJUST_ARGS(); + } + } +out: + if (exp) + exp_put(exp); + return nfserr; +} + +static __be32 +nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp); +} + +static __be32 +nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo_no_name *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp); +} + +/* + * The SETATTR encode routine is special -- it always encodes a bitmap, + * regardless of the error status. + */ +static __be32 +nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) +{ + __be32 *p; + + RESERVE_SPACE(12); + if (nfserr) { + WRITE32(2); + WRITE32(0); + WRITE32(0); + } + else { + WRITE32(2); + WRITE32(setattr->sa_bmval[0]); + WRITE32(setattr->sa_bmval[1]); + } + ADJUST_ARGS(); + return nfserr; +} + +static __be32 +nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(8 + sizeof(nfs4_verifier)); + WRITEMEM(&scd->se_clientid, 8); + WRITEMEM(&scd->se_confirm, sizeof(nfs4_verifier)); + ADJUST_ARGS(); + } + else if (nfserr == nfserr_clid_inuse) { + RESERVE_SPACE(8); + WRITE32(0); + WRITE32(0); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(16); + WRITE32(write->wr_bytes_written); + WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); + ADJUST_ARGS(); + } + return nfserr; +} + +static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_exchange_id *exid) +{ + __be32 *p; + char *major_id; + char *server_scope; + int major_id_sz; + int server_scope_sz; + uint64_t minor_id = 0; + + if (nfserr) + return nfserr; + + major_id = utsname()->nodename; + major_id_sz = strlen(major_id); + server_scope = utsname()->nodename; + server_scope_sz = strlen(server_scope); + + RESERVE_SPACE( + 8 /* eir_clientid */ + + 4 /* eir_sequenceid */ + + 4 /* eir_flags */ + + 4 /* spr_how (SP4_NONE) */ + + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + 
(XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + + WRITEMEM(&exid->clientid, 8); + WRITE32(exid->seqid); + WRITE32(exid->flags); + + /* state_protect4_r. Currently only support SP4_NONE */ + BUG_ON(exid->spa_how != SP4_NONE); + WRITE32(exid->spa_how); + + /* The server_owner struct */ + WRITE64(minor_id); /* Minor id */ + /* major id */ + WRITE32(major_id_sz); + WRITEMEM(major_id, major_id_sz); + + /* Server scope */ + WRITE32(server_scope_sz); + WRITEMEM(server_scope, server_scope_sz); + + /* Implementation id */ + WRITE32(0); /* zero length nfs_impl_id4 array */ + ADJUST_ARGS(); + return 0; +} + +static __be32 +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_create_session *sess) +{ + __be32 *p; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(24); + WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(sess->seqid); + WRITE32(sess->flags); + ADJUST_ARGS(); + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->fore_channel.maxreq_sz); + WRITE32(sess->fore_channel.maxresp_sz); + WRITE32(sess->fore_channel.maxresp_cached); + WRITE32(sess->fore_channel.maxops); + WRITE32(sess->fore_channel.maxreqs); + WRITE32(sess->fore_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->fore_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->fore_channel.rdma_attrs); + ADJUST_ARGS(); + } + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->back_channel.maxreq_sz); + WRITE32(sess->back_channel.maxresp_sz); + WRITE32(sess->back_channel.maxresp_cached); + WRITE32(sess->back_channel.maxops); + WRITE32(sess->back_channel.maxreqs); + WRITE32(sess->back_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->back_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->back_channel.rdma_attrs); + ADJUST_ARGS(); + } + return 0; +} + +static __be32 +nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_destroy_session *destroy_session) +{ + return nfserr; +} + +static __be32 +nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_sequence *seq) +{ + __be32 *p; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20); + WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(seq->seqid); + WRITE32(seq->slotid); + WRITE32(seq->maxslots); + /* For now: target_maxslots = maxslots */ + WRITE32(seq->maxslots); + WRITE32(seq->status_flags); + + ADJUST_ARGS(); + resp->cstate.datap = p; /* DRC cache data pointer */ + return 0; +} + +static __be32 +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +{ + return nfserr; +} + +typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); + +/* + * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 + * since we don't need to filter out obsolete ops as this is + * done in the decoding phase. 
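+ *
+ * (Illustrative note, not part of the original patch: each entry is
+ * invoked by opcode from nfsd4_encode_operation() below, e.g.
+ * nfsd4_enc_ops[OP_READDIR](resp, op->status, &op->u); ops that encode
+ * nothing beyond their status simply map to nfsd4_encode_noop.)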
+ */ +static nfsd4_enc nfsd4_enc_ops[] = { + [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, + [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, + [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, + [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, + [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, + [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, + [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, + [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, + [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, + [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, + [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, + [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ] = (nfsd4_enc)nfsd4_encode_read, + [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, + [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, + [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, + [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, + [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, + [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, + [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, + [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.1 operations */ + [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, + [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, +}; + +/* + * Calculate the total amount of memory that the compound response has taken + * after encoding the current operation. + * + * pad: add on 8 bytes for the next operation's op_code and status so that + * there is room to cache a failure on the next operation. + * + * Compare this length to the session se_fmaxresp_cached. + * + * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so + * will be at least a page and will therefore hold the xdr_buf head. 
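+ *
+ * (Worked example, illustrative only: for a mid-compound reply with no
+ * page data, 200 bytes already encoded in the head and pad = 8, the
+ * length computed below is 200 + 8 = 208 bytes; once that exceeds
+ * se_fchannel.maxresp_cached the check returns
+ * nfserr_rep_too_big_to_cache.)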
+ */ +static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +{ + int status = 0; + struct xdr_buf *xb = &resp->rqstp->rq_res; + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + struct nfsd4_session *session = NULL; + struct nfsd4_slot *slot = resp->cstate.slot; + u32 length, tlen = 0, pad = 8; + + if (!nfsd4_has_session(&resp->cstate)) + return status; + + session = resp->cstate.session; + if (session == NULL || slot->sl_cachethis == 0) + return status; + + if (resp->opcnt >= args->opcnt) + pad = 0; /* this is the last operation */ + + if (xb->page_len == 0) { + length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; + } else { + if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0) + tlen = (char *)resp->p - (char *)xb->tail[0].iov_base; + + length = xb->head[0].iov_len + xb->page_len + tlen + pad; + } + dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, + length, xb->page_len, tlen, pad); + + if (length <= session->se_fchannel.maxresp_cached) + return status; + else + return nfserr_rep_too_big_to_cache; +} + +void +nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) +{ + __be32 *statp; + __be32 *p; + + RESERVE_SPACE(8); + WRITE32(op->opnum); + statp = p++; /* to be backfilled at the end */ + ADJUST_ARGS(); + + if (op->opnum == OP_ILLEGAL) + goto status; + BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || + !nfsd4_enc_ops[op->opnum]); + op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); + /* nfsd4_check_drc_limit guarantees enough room for error status */ + if (!op->status && nfsd4_check_drc_limit(resp)) + op->status = nfserr_rep_too_big_to_cache; +status: + /* + * Note: We write the status directly, instead of using WRITE32(), + * since it is already in network byte order. + */ + *statp = op->status; +} + +/* + * Encode the reply stored in the stateowner reply cache + * + * XDR note: do not encode rp->rp_buflen: the buffer contains the + * previously sent already encoded operation. + * + * called with nfs4_lock_state() held + */ +void +nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op) +{ + __be32 *p; + struct nfs4_replay *rp = op->replay; + + BUG_ON(!rp); + + RESERVE_SPACE(8); + WRITE32(op->opnum); + *p++ = rp->rp_status; /* already xdr'ed */ + ADJUST_ARGS(); + + RESERVE_SPACE(rp->rp_buflen); + WRITEMEM(rp->rp_buf, rp->rp_buflen); + ADJUST_ARGS(); +} + +int +nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args) +{ + if (args->ops != args->iops) { + kfree(args->ops); + args->ops = args->iops; + } + kfree(args->tmpp); + args->tmpp = NULL; + while (args->to_free) { + struct tmpbuf *tb = args->to_free; + args->to_free = tb->next; + tb->release(tb->buf); + kfree(tb); + } +} + +int +nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) +{ + __be32 status; + + args->p = p; + args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; + args->pagelist = rqstp->rq_arg.pages; + args->pagelen = rqstp->rq_arg.page_len; + args->tmpp = NULL; + args->to_free = NULL; + args->ops = args->iops; + args->rqstp = rqstp; + + status = nfsd4_decode_compound(args); + if (status) { + nfsd4_release_compoundargs(args); + } + return !status; +} + +int +nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundres *resp) +{ + /* + * All that remains is to write the tag and operation count... 
+ */ + struct nfsd4_compound_state *cs = &resp->cstate; + struct kvec *iov; + p = resp->tagp; + *p++ = htonl(resp->taglen); + memcpy(p, resp->tag, resp->taglen); + p += XDR_QUADLEN(resp->taglen); + *p++ = htonl(resp->opcnt); + + if (rqstp->rq_res.page_len) + iov = &rqstp->rq_res.tail[0]; + else + iov = &rqstp->rq_res.head[0]; + iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; + BUG_ON(iov->iov_len > PAGE_SIZE); + if (nfsd4_has_session(cs)) { + if (cs->status != nfserr_replay_cache) { + nfsd4_store_cache_entry(resp); + dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); + cs->slot->sl_inuse = false; + } + /* Renew the clientid on success and on replay */ + release_session_client(cs->session); + nfsd4_put_session(cs->session); + } + return 1; +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c new file mode 100644 index 00000000..4666a209 --- /dev/null +++ b/fs/nfsd/nfscache.c @@ -0,0 +1,322 @@ +/* + * Request reply cache. This is currently a global cache, but this may + * change in the future and be a per-client cache. + * + * This code is heavily inspired by the 44BSD implementation, although + * it does things a bit differently. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include + +#include "nfsd.h" +#include "cache.h" + +/* Size of reply cache. Common values are: + * 4.3BSD: 128 + * 4.4BSD: 256 + * Solaris2: 1024 + * DEC Unix: 512-4096 + */ +#define CACHESIZE 1024 +#define HASHSIZE 64 + +static struct hlist_head * cache_hash; +static struct list_head lru_head; +static int cache_disabled = 1; + +/* + * Calculate the hash index from an XID. + */ +static inline u32 request_hash(u32 xid) +{ + u32 h = xid; + h ^= (xid >> 24); + return h & (HASHSIZE-1); +} + +static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); + +/* + * locking for the reply cache: + * A cache entry is "single use" if c_state == RC_INPROG + * Otherwise, it when accessing _prev or _next, the lock must be held. + */ +static DEFINE_SPINLOCK(cache_lock); + +int nfsd_reply_cache_init(void) +{ + struct svc_cacherep *rp; + int i; + + INIT_LIST_HEAD(&lru_head); + i = CACHESIZE; + while (i) { + rp = kmalloc(sizeof(*rp), GFP_KERNEL); + if (!rp) + goto out_nomem; + list_add(&rp->c_lru, &lru_head); + rp->c_state = RC_UNUSED; + rp->c_type = RC_NOCACHE; + INIT_HLIST_NODE(&rp->c_hash); + i--; + } + + cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); + if (!cache_hash) + goto out_nomem; + + cache_disabled = 0; + return 0; +out_nomem: + printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); + nfsd_reply_cache_shutdown(); + return -ENOMEM; +} + +void nfsd_reply_cache_shutdown(void) +{ + struct svc_cacherep *rp; + + while (!list_empty(&lru_head)) { + rp = list_entry(lru_head.next, struct svc_cacherep, c_lru); + if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF) + kfree(rp->c_replvec.iov_base); + list_del(&rp->c_lru); + kfree(rp); + } + + cache_disabled = 1; + + kfree (cache_hash); + cache_hash = NULL; +} + +/* + * Move cache entry to end of LRU list + */ +static void +lru_put_end(struct svc_cacherep *rp) +{ + list_move_tail(&rp->c_lru, &lru_head); +} + +/* + * Move a cache entry from one hash list to another + */ +static void +hash_refile(struct svc_cacherep *rp) +{ + hlist_del_init(&rp->c_hash); + hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid)); +} + +/* + * Try to find an entry matching the current call in the cache. 
When none + * is found, we grab the oldest unlocked entry off the LRU list. + * Note that no operation within the loop may sleep. + */ +int +nfsd_cache_lookup(struct svc_rqst *rqstp, int type) +{ + struct hlist_node *hn; + struct hlist_head *rh; + struct svc_cacherep *rp; + __be32 xid = rqstp->rq_xid; + u32 proto = rqstp->rq_prot, + vers = rqstp->rq_vers, + proc = rqstp->rq_proc; + unsigned long age; + int rtn; + + rqstp->rq_cacherep = NULL; + if (cache_disabled || type == RC_NOCACHE) { + nfsdstats.rcnocache++; + return RC_DOIT; + } + + spin_lock(&cache_lock); + rtn = RC_DOIT; + + rh = &cache_hash[request_hash(xid)]; + hlist_for_each_entry(rp, hn, rh, c_hash) { + if (rp->c_state != RC_UNUSED && + xid == rp->c_xid && proc == rp->c_proc && + proto == rp->c_prot && vers == rp->c_vers && + time_before(jiffies, rp->c_timestamp + 120*HZ) && + memcmp((char*)&rqstp->rq_addr, (char*)&rp->c_addr, sizeof(rp->c_addr))==0) { + nfsdstats.rchits++; + goto found_entry; + } + } + nfsdstats.rcmisses++; + + /* This loop shouldn't take more than a few iterations normally */ + { + int safe = 0; + list_for_each_entry(rp, &lru_head, c_lru) { + if (rp->c_state != RC_INPROG) + break; + if (safe++ > CACHESIZE) { + printk("nfsd: loop in repcache LRU list\n"); + cache_disabled = 1; + goto out; + } + } + } + + /* All entries on the LRU are in-progress. This should not happen */ + if (&rp->c_lru == &lru_head) { + static int complaints; + + printk(KERN_WARNING "nfsd: all repcache entries locked!\n"); + if (++complaints > 5) { + printk(KERN_WARNING "nfsd: disabling repcache.\n"); + cache_disabled = 1; + } + goto out; + } + + rqstp->rq_cacherep = rp; + rp->c_state = RC_INPROG; + rp->c_xid = xid; + rp->c_proc = proc; + memcpy(&rp->c_addr, svc_addr_in(rqstp), sizeof(rp->c_addr)); + rp->c_prot = proto; + rp->c_vers = vers; + rp->c_timestamp = jiffies; + + hash_refile(rp); + + /* release any buffer */ + if (rp->c_type == RC_REPLBUFF) { + kfree(rp->c_replvec.iov_base); + rp->c_replvec.iov_base = NULL; + } + rp->c_type = RC_NOCACHE; + out: + spin_unlock(&cache_lock); + return rtn; + +found_entry: + /* We found a matching entry which is either in progress or done. */ + age = jiffies - rp->c_timestamp; + rp->c_timestamp = jiffies; + lru_put_end(rp); + + rtn = RC_DROPIT; + /* Request being processed or excessive rexmits */ + if (rp->c_state == RC_INPROG || age < RC_DELAY) + goto out; + + /* From the hall of fame of impractical attacks: + * Is this a user who tries to snoop on the cache? */ + rtn = RC_DOIT; + if (!rqstp->rq_secure && rp->c_secure) + goto out; + + /* Compose RPC reply header */ + switch (rp->c_type) { + case RC_NOCACHE: + break; + case RC_REPLSTAT: + svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat); + rtn = RC_REPLY; + break; + case RC_REPLBUFF: + if (!nfsd_cache_append(rqstp, &rp->c_replvec)) + goto out; /* should not happen */ + rtn = RC_REPLY; + break; + default: + printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type); + rp->c_state = RC_UNUSED; + } + + goto out; +} + +/* + * Update a cache entry. This is called from nfsd_dispatch when + * the procedure has been executed and the complete reply is in + * rqstp->rq_res. + * + * We're copying around data here rather than swapping buffers because + * the toplevel loop requires max-sized buffers, which would be a waste + * of memory for a cache with a max reply size of 100 bytes (diropokres). + * + * If we should start to use different types of cache entries tailored + * specifically for attrstat and fh's, we may save even more space. 
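+ *
+ * (Illustrative note, not from the original patch: RC_REPLSTAT caches
+ * only the single status word, while RC_REPLBUFF copies the reply body
+ * into c_replvec; the "len > (256 >> 2)" check below caps cached
+ * replies at 256 bytes.)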
+ * + * Also note that a cachetype of RC_NOCACHE can legally be passed when + * nfsd failed to encode a reply that otherwise would have been cached. + * In this case, nfsd_cache_update is called with statp == NULL. + */ +void +nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) +{ + struct svc_cacherep *rp; + struct kvec *resv = &rqstp->rq_res.head[0], *cachv; + int len; + + if (!(rp = rqstp->rq_cacherep) || cache_disabled) + return; + + len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); + len >>= 2; + + /* Don't cache excessive amounts of data and XDR failures */ + if (!statp || len > (256 >> 2)) { + rp->c_state = RC_UNUSED; + return; + } + + switch (cachetype) { + case RC_REPLSTAT: + if (len != 1) + printk("nfsd: RC_REPLSTAT/reply len %d!\n",len); + rp->c_replstat = *statp; + break; + case RC_REPLBUFF: + cachv = &rp->c_replvec; + cachv->iov_base = kmalloc(len << 2, GFP_KERNEL); + if (!cachv->iov_base) { + spin_lock(&cache_lock); + rp->c_state = RC_UNUSED; + spin_unlock(&cache_lock); + return; + } + cachv->iov_len = len << 2; + memcpy(cachv->iov_base, statp, len << 2); + break; + } + spin_lock(&cache_lock); + lru_put_end(rp); + rp->c_secure = rqstp->rq_secure; + rp->c_type = cachetype; + rp->c_state = RC_DONE; + rp->c_timestamp = jiffies; + spin_unlock(&cache_lock); + return; +} + +/* + * Copy cached reply to current reply buffer. Should always fit. + * FIXME as reply is in a page, we should just attach the page, and + * keep a refcount.... + */ +static int +nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) +{ + struct kvec *vec = &rqstp->rq_res.head[0]; + + if (vec->iov_len + data->iov_len > PAGE_SIZE) { + printk(KERN_WARNING "nfsd: cached reply too large (%Zd).\n", + data->iov_len); + return 0; + } + memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); + vec->iov_len += data->iov_len; + return 1; +} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c new file mode 100644 index 00000000..2b1449dd --- /dev/null +++ b/fs/nfsd/nfsctl.c @@ -0,0 +1,1527 @@ +/* + * Syscall interface to knfsd. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "idmap.h" +#include "nfsd.h" +#include "cache.h" + +/* + * We have a single directory with several nodes in it. + */ +enum { + NFSD_Root = 1, +#ifdef CONFIG_NFSD_DEPRECATED + NFSD_Svc, + NFSD_Add, + NFSD_Del, + NFSD_Export, + NFSD_Unexport, + NFSD_Getfd, + NFSD_Getfs, +#endif + NFSD_List, + NFSD_Export_features, + NFSD_Fh, + NFSD_FO_UnlockIP, + NFSD_FO_UnlockFS, + NFSD_Threads, + NFSD_Pool_Threads, + NFSD_Pool_Stats, + NFSD_Versions, + NFSD_Ports, + NFSD_MaxBlkSize, + NFSD_SupportedEnctypes, + /* + * The below MUST come last. Otherwise we leave a hole in nfsd_files[] + * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops + */ +#ifdef CONFIG_NFSD_V4 + NFSD_Leasetime, + NFSD_Gracetime, + NFSD_RecoveryDir, +#endif +}; + +/* + * write() for these nodes. 
+ */ +#ifdef CONFIG_NFSD_DEPRECATED +static ssize_t write_svc(struct file *file, char *buf, size_t size); +static ssize_t write_add(struct file *file, char *buf, size_t size); +static ssize_t write_del(struct file *file, char *buf, size_t size); +static ssize_t write_export(struct file *file, char *buf, size_t size); +static ssize_t write_unexport(struct file *file, char *buf, size_t size); +static ssize_t write_getfd(struct file *file, char *buf, size_t size); +static ssize_t write_getfs(struct file *file, char *buf, size_t size); +#endif +static ssize_t write_filehandle(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); +static ssize_t write_threads(struct file *file, char *buf, size_t size); +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); +static ssize_t write_versions(struct file *file, char *buf, size_t size); +static ssize_t write_ports(struct file *file, char *buf, size_t size); +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); +#ifdef CONFIG_NFSD_V4 +static ssize_t write_leasetime(struct file *file, char *buf, size_t size); +static ssize_t write_gracetime(struct file *file, char *buf, size_t size); +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); +#endif + +static ssize_t (*write_op[])(struct file *, char *, size_t) = { +#ifdef CONFIG_NFSD_DEPRECATED + [NFSD_Svc] = write_svc, + [NFSD_Add] = write_add, + [NFSD_Del] = write_del, + [NFSD_Export] = write_export, + [NFSD_Unexport] = write_unexport, + [NFSD_Getfd] = write_getfd, + [NFSD_Getfs] = write_getfs, +#endif + [NFSD_Fh] = write_filehandle, + [NFSD_FO_UnlockIP] = write_unlock_ip, + [NFSD_FO_UnlockFS] = write_unlock_fs, + [NFSD_Threads] = write_threads, + [NFSD_Pool_Threads] = write_pool_threads, + [NFSD_Versions] = write_versions, + [NFSD_Ports] = write_ports, + [NFSD_MaxBlkSize] = write_maxblksize, +#ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = write_leasetime, + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, +#endif +}; + +static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +{ + ino_t ino = file->f_path.dentry->d_inode->i_ino; + char *data; + ssize_t rv; + + if (ino >= ARRAY_SIZE(write_op) || !write_op[ino]) + return -EINVAL; + + data = simple_transaction_get(file, buf, size); + if (IS_ERR(data)) + return PTR_ERR(data); + + rv = write_op[ino](file, data, size); + if (rv >= 0) { + simple_transaction_set(file, rv); + rv = size; + } + return rv; +} + +static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) +{ +#ifdef CONFIG_NFSD_DEPRECATED + static int warned; + if (file->f_dentry->d_name.name[0] == '.' && !warned) { + printk(KERN_INFO + "Warning: \"%s\" uses deprecated NFSD interface: %s." + " This will be removed in 2.6.40\n", + current->comm, file->f_dentry->d_name.name); + warned = 1; + } +#endif + if (! 
file->private_data) { + /* An attempt to read a transaction file without writing + * causes a 0-byte write so that the file can return + * state information + */ + ssize_t rv = nfsctl_transaction_write(file, buf, 0, pos); + if (rv < 0) + return rv; + } + return simple_transaction_read(file, buf, size, pos); +} + +static const struct file_operations transaction_ops = { + .write = nfsctl_transaction_write, + .read = nfsctl_transaction_read, + .release = simple_transaction_release, + .llseek = default_llseek, +}; + +static int exports_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nfs_exports_op); +} + +static const struct file_operations exports_operations = { + .open = exports_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int export_features_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS); + return 0; +} + +static int export_features_open(struct inode *inode, struct file *file) +{ + return single_open(file, export_features_show, NULL); +} + +static struct file_operations export_features_operations = { + .open = export_features_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) +static int supported_enctypes_show(struct seq_file *m, void *v) +{ + seq_printf(m, KRB5_SUPPORTED_ENCTYPES); + return 0; +} + +static int supported_enctypes_open(struct inode *inode, struct file *file) +{ + return single_open(file, supported_enctypes_show, NULL); +} + +static struct file_operations supported_enctypes_ops = { + .open = supported_enctypes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ + +extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); +extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); + +static const struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = nfsd_pool_stats_release, + .owner = THIS_MODULE, +}; + +/*----------------------------------------------------------------------------*/ +/* + * payload - write methods + */ + +#ifdef CONFIG_NFSD_DEPRECATED +/** + * write_svc - Start kernel's NFSD server + * + * Deprecated. /proc/fs/nfsd/threads is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_svc + * svc_port: port number of this + * server's listener + * svc_nthreads: number of threads to start + * size: size in bytes of passed in nfsctl_svc + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ +static ssize_t write_svc(struct file *file, char *buf, size_t size) +{ + struct nfsctl_svc *data; + int err; + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_svc*) buf; + err = nfsd_svc(data->svc_port, data->svc_nthreads); + if (err < 0) + return err; + return 0; +} + +/** + * write_add - Add or modify client entry in auth unix cache + * + * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_client + * cl_ident: '\0'-terminated C string + * containing domain name + * of client + * cl_naddr: no. 
of items in cl_addrlist + * cl_addrlist: array of client addresses + * cl_fhkeytype: ignored + * cl_fhkeylen: ignored + * cl_fhkey: ignored + * size: size in bytes of passed in nfsctl_client + * Output: + * On success: returns zero + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since + * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. + */ +static ssize_t write_add(struct file *file, char *buf, size_t size) +{ + struct nfsctl_client *data; + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_client *)buf; + return exp_addclient(data); +} + +/** + * write_del - Remove client from auth unix cache + * + * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_client + * cl_ident: '\0'-terminated C string + * containing domain name + * of client + * cl_naddr: ignored + * cl_addrlist: ignored + * cl_fhkeytype: ignored + * cl_fhkeylen: ignored + * cl_fhkey: ignored + * size: size in bytes of passed in nfsctl_client + * Output: + * On success: returns zero + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since + * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. + */ +static ssize_t write_del(struct file *file, char *buf, size_t size) +{ + struct nfsctl_client *data; + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_client *)buf; + return exp_delclient(data); +} + +/** + * write_export - Export part or all of a local file system + * + * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_export + * ex_client: '\0'-terminated C string + * containing domain name + * of client allowed to access + * this export + * ex_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * ex_dev: fsid to use for this export + * ex_ino: ignored + * ex_flags: export flags for this export + * ex_anon_uid: UID to use for anonymous + * requests + * ex_anon_gid: GID to use for anonymous + * requests + * size: size in bytes of passed in nfsctl_export + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ +static ssize_t write_export(struct file *file, char *buf, size_t size) +{ + struct nfsctl_export *data; + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_export*)buf; + return exp_export(data); +} + +/** + * write_unexport - Unexport a previously exported file system + * + * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. + * Function remains to support old versions of nfs-utils. 
+ * + * Input: + * buf: struct nfsctl_export + * ex_client: '\0'-terminated C string + * containing domain name + * of client no longer allowed + * to access this export + * ex_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * ex_dev: ignored + * ex_ino: ignored + * ex_flags: ignored + * ex_anon_uid: ignored + * ex_anon_gid: ignored + * size: size in bytes of passed in nfsctl_export + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ +static ssize_t write_unexport(struct file *file, char *buf, size_t size) +{ + struct nfsctl_export *data; + + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_export*)buf; + return exp_unexport(data); +} + +/** + * write_getfs - Get a variable-length NFS file handle by path + * + * Deprecated. /proc/fs/nfsd/filehandle is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_fsparm + * gd_addr: socket address of client + * gd_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * gd_maxlen: maximum size of returned file + * handle + * size: size in bytes of passed in nfsctl_fsparm + * Output: + * On success: passed-in buffer filled with a knfsd_fh structure + * (a variable-length raw NFS file handle); + * return code is the size in bytes of the file handle + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since gd_addr + * is the same size as a struct sockaddr_in. + */ +static ssize_t write_getfs(struct file *file, char *buf, size_t size) +{ + struct nfsctl_fsparm *data; + struct sockaddr_in *sin; + struct auth_domain *clp; + int err = 0; + struct knfsd_fh *res; + struct in6_addr in6; + + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_fsparm*)buf; + err = -EPROTONOSUPPORT; + if (data->gd_addr.sa_family != AF_INET) + goto out; + sin = (struct sockaddr_in *)&data->gd_addr; + if (data->gd_maxlen > NFS3_FHSIZE) + data->gd_maxlen = NFS3_FHSIZE; + + res = (struct knfsd_fh*)buf; + + exp_readlock(); + + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); + + clp = auth_unix_lookup(&init_net, &in6); + if (!clp) + err = -EPERM; + else { + err = exp_rootfh(clp, data->gd_path, res, data->gd_maxlen); + auth_domain_put(clp); + } + exp_readunlock(); + if (err == 0) + err = res->fh_size + offsetof(struct knfsd_fh, fh_base); + out: + return err; +} + +/** + * write_getfd - Get a fixed-length NFS file handle by path (used by mountd) + * + * Deprecated. /proc/fs/nfsd/filehandle is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_fdparm + * gd_addr: socket address of client + * gd_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * gd_version: fdparm structure version + * size: size in bytes of passed in nfsctl_fdparm + * Output: + * On success: passed-in buffer filled with nfsctl_res + * (a fixed-length raw NFS file handle); + * return code is the size in bytes of the file handle + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since gd_addr + * is the same size as a struct sockaddr_in. 
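+ *
+ * (Illustrative note: unlike write_getfs() above, which returns a
+ * variable-length handle of up to NFS3_FHSIZE bytes, this interface
+ * always copies the result into a fixed NFS_FHSIZE-byte buffer and
+ * returns NFS_FHSIZE.)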
+ */ +static ssize_t write_getfd(struct file *file, char *buf, size_t size) +{ + struct nfsctl_fdparm *data; + struct sockaddr_in *sin; + struct auth_domain *clp; + int err = 0; + struct knfsd_fh fh; + char *res; + struct in6_addr in6; + + if (size < sizeof(*data)) + return -EINVAL; + data = (struct nfsctl_fdparm*)buf; + err = -EPROTONOSUPPORT; + if (data->gd_addr.sa_family != AF_INET) + goto out; + err = -EINVAL; + if (data->gd_version < 2 || data->gd_version > NFSSVC_MAXVERS) + goto out; + + res = buf; + sin = (struct sockaddr_in *)&data->gd_addr; + exp_readlock(); + + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); + + clp = auth_unix_lookup(&init_net, &in6); + if (!clp) + err = -EPERM; + else { + err = exp_rootfh(clp, data->gd_path, &fh, NFS_FHSIZE); + auth_domain_put(clp); + } + exp_readunlock(); + + if (err == 0) { + memset(res,0, NFS_FHSIZE); + memcpy(res, &fh.fh_base, fh.fh_size); + err = NFS_FHSIZE; + } + out: + return err; +} +#endif /* CONFIG_NFSD_DEPRECATED */ + +/** + * write_unlock_ip - Release all locks used by a client + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing a + * presentation format IP address + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) +{ + struct sockaddr_storage address; + struct sockaddr *sap = (struct sockaddr *)&address; + size_t salen = sizeof(address); + char *fo_path; + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + if (rpc_pton(fo_path, size, sap, salen) == 0) + return -EINVAL; + + return nlmsvc_unlock_all_by_ip(sap); +} + +/** + * write_unlock_fs - Release all locks on a local file system + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing the + * absolute pathname of a local file system + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) +{ + struct path path; + char *fo_path; + int error; + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + error = kern_path(fo_path, 0, &path); + if (error) + return error; + + /* + * XXX: Needs better sanity checking. Otherwise we could end up + * releasing locks on the wrong file system. + * + * For example: + * 1. Does the path refer to a directory? + * 2. Is that directory a mount point, or + * 3. Is that directory the root of an exported file system? + */ + error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); + + path_put(&path); + return error; +} + +/** + * write_filehandle - Get a variable-length NFS file handle by path + * + * On input, the buffer contains a '\n'-terminated C string comprised of + * three alphanumeric words separated by whitespace. The string may + * contain escape sequences. 
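+ *
+ * (Hypothetical example, not from the original patch: writing
+ * "example.com /export 64\n" requests a file handle of at most 64
+ * bytes for /export as exported to the client domain "example.com";
+ * the reply is the handle rendered as '\n'-terminated hex text.)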
+ * + * Input: + * buf: + * domain: client domain name + * path: export pathname + * maxsize: numeric maximum size of + * @buf + * size: length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing a ASCII hex text version + * of the NFS file handle; + * return code is the size in bytes of the string + * On error: return code is negative errno value + */ +static ssize_t write_filehandle(struct file *file, char *buf, size_t size) +{ + char *dname, *path; + int uninitialized_var(maxsize); + char *mesg = buf; + int len; + struct auth_domain *dom; + struct knfsd_fh fh; + + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + dname = mesg; + len = qword_get(&mesg, dname, size); + if (len <= 0) + return -EINVAL; + + path = dname+len+1; + len = qword_get(&mesg, path, size); + if (len <= 0) + return -EINVAL; + + len = get_int(&mesg, &maxsize); + if (len) + return len; + + if (maxsize < NFS_FHSIZE) + return -EINVAL; + if (maxsize > NFS3_FHSIZE) + maxsize = NFS3_FHSIZE; + + if (qword_get(&mesg, mesg, size)>0) + return -EINVAL; + + /* we have all the words, they are in buf.. */ + dom = unix_domain_find(dname); + if (!dom) + return -ENOMEM; + + len = exp_rootfh(dom, path, &fh, maxsize); + auth_domain_put(dom); + if (len) + return len; + + mesg = buf; + len = SIMPLE_TRANSACTION_LIMIT; + qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); + mesg[-1] = '\n'; + return mesg - buf; +} + +/** + * write_threads - Start NFSD, or report the current number of running threads + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the + * number of NFSD threads to start + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_threads(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + int rv; + if (size > 0) { + int newthreads; + rv = get_int(&mesg, &newthreads); + if (rv) + return rv; + if (newthreads < 0) + return -EINVAL; + rv = nfsd_svc(NFS_PORT, newthreads); + if (rv < 0) + return rv; + } else + rv = nfsd_nrthreads(); + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); +} + +/** + * write_pool_threads - Set or report the current number of threads per pool + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated unsigned integer values + * representing the number of NFSD + * threads to start in each pool + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing integer values representing the + * number of NFSD threads in each pool; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) +{ + /* if size > 0, look for an array of number of threads per 
node + * and apply them then write out number of threads per node as reply + */ + char *mesg = buf; + int i; + int rv; + int len; + int npools; + int *nthreads; + + mutex_lock(&nfsd_mutex); + npools = nfsd_nrpools(); + if (npools == 0) { + /* + * NFS is shut down. The admin can start it by + * writing to the threads file but NOT the pool_threads + * file, sorry. Report zero threads. + */ + mutex_unlock(&nfsd_mutex); + strcpy(buf, "0\n"); + return strlen(buf); + } + + nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); + rv = -ENOMEM; + if (nthreads == NULL) + goto out_free; + + if (size > 0) { + for (i = 0; i < npools; i++) { + rv = get_int(&mesg, &nthreads[i]); + if (rv == -ENOENT) + break; /* fewer numbers than pools */ + if (rv) + goto out_free; /* syntax error */ + rv = -EINVAL; + if (nthreads[i] < 0) + goto out_free; + } + rv = nfsd_set_nrthreads(i, nthreads); + if (rv) + goto out_free; + } + + rv = nfsd_get_nrthreads(npools, nthreads); + if (rv) + goto out_free; + + mesg = buf; + size = SIMPLE_TRANSACTION_LIMIT; + for (i = 0; i < npools && size > 0; i++) { + snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' ')); + len = strlen(mesg); + size -= len; + mesg += len; + } + rv = mesg - buf; +out_free: + kfree(nthreads); + mutex_unlock(&nfsd_mutex); + return rv; +} + +static ssize_t __write_versions(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + char *vers, *minorp, sign; + int len, num, remaining; + unsigned minor; + ssize_t tlen = 0; + char *sep; + + if (size>0) { + if (nfsd_serv) + /* Cannot change versions without updating + * nfsd_serv->sv_xdrsize, and reallocing + * rq_argp and rq_resp + */ + return -EBUSY; + if (buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + vers = mesg; + len = qword_get(&mesg, vers, size); + if (len <= 0) return -EINVAL; + do { + sign = *vers; + if (sign == '+' || sign == '-') + num = simple_strtol((vers+1), &minorp, 0); + else + num = simple_strtol(vers, &minorp, 0); + if (*minorp == '.') { + if (num < 4) + return -EINVAL; + minor = simple_strtoul(minorp+1, NULL, 0); + if (minor == 0) + return -EINVAL; + if (nfsd_minorversion(minor, sign == '-' ? + NFSD_CLEAR : NFSD_SET) < 0) + return -EINVAL; + goto next; + } + switch(num) { + case 2: + case 3: + case 4: + nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET); + break; + default: + return -EINVAL; + } + next: + vers += len + 1; + } while ((len = qword_get(&mesg, vers, size)) > 0); + /* If all get turned off, turn them back on, as + * having no versions is BAD + */ + nfsd_reset_versions(); + } + + /* Now write current state into reply buffer */ + len = 0; + sep = ""; + remaining = SIMPLE_TRANSACTION_LIMIT; + for (num=2 ; num <= 4 ; num++) + if (nfsd_vers(num, NFSD_AVAIL)) { + len = snprintf(buf, remaining, "%s%c%d", sep, + nfsd_vers(num, NFSD_TEST)?'+':'-', + num); + sep = " "; + + if (len > remaining) + break; + remaining -= len; + buf += len; + tlen += len; + } + if (nfsd_vers(4, NFSD_AVAIL)) + for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; + minor++) { + len = snprintf(buf, remaining, " %c4.%u", + (nfsd_vers(4, NFSD_TEST) && + nfsd_minorversion(minor, NFSD_TEST)) ? 
+ '+' : '-', + minor); + + if (len > remaining) + break; + remaining -= len; + buf += len; + tlen += len; + } + + len = snprintf(buf, remaining, "\n"); + if (len > remaining) + return -EINVAL; + return tlen + len; +} + +/** + * write_versions - Set or report the available NFS protocol versions + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing positive or negative integer + * values representing the current status of each + * protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") + * size: non-zero length of C string in @buf + * Output: + * On success: status of zero or more protocol versions has + * been updated; passed-in buffer filled with + * '\n'-terminated C string containing positive + * or negative integer values representing the + * current status of each protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_versions(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_versions(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/* + * Zero-length write. Return a list of NFSD's current listener + * transports. + */ +static ssize_t __write_ports_names(char *buf) +{ + if (nfsd_serv == NULL) + return 0; + return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); +} + +/* + * A single 'fd' number was written, in which case it must be for + * a socket of a supported family/protocol, and we use it as an + * nfsd listener. + */ +static ssize_t __write_ports_addfd(char *buf) +{ + char *mesg = buf; + int fd, err; + + err = get_int(&mesg, &fd); + if (err != 0 || fd < 0) + return -EINVAL; + + err = nfsd_create_serv(); + if (err != 0) + return err; + + err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); + if (err < 0) { + svc_destroy(nfsd_serv); + return err; + } + + /* Decrease the count, but don't shut down the service */ + nfsd_serv->sv_nrthreads--; + return err; +} + +/* + * A '-' followed by the 'name' of a socket means we close the socket. + */ +static ssize_t __write_ports_delfd(char *buf) +{ + char *toclose; + int len = 0; + + toclose = kstrdup(buf + 1, GFP_KERNEL); + if (toclose == NULL) + return -ENOMEM; + + if (nfsd_serv != NULL) + len = svc_sock_names(nfsd_serv, buf, + SIMPLE_TRANSACTION_LIMIT, toclose); + kfree(toclose); + return len; +} + +/* + * A transport listener is added by writing it's transport name and + * a port number. 
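+ *
+ * (Hypothetical example, not from the original patch: writing
+ * "tcp 2049\n" to the portlist file creates IPv4 and IPv6 "tcp"
+ * listeners on port 2049 via svc_create_xprt().)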
+ */ +static ssize_t __write_ports_addxprt(char *buf) +{ + char transport[16]; + struct svc_xprt *xprt; + int port, err; + + if (sscanf(buf, "%15s %4u", transport, &port) != 2) + return -EINVAL; + + if (port < 1 || port > USHRT_MAX) + return -EINVAL; + + err = nfsd_create_serv(); + if (err != 0) + return err; + + err = svc_create_xprt(nfsd_serv, transport, &init_net, + PF_INET, port, SVC_SOCK_ANONYMOUS); + if (err < 0) + goto out_err; + + err = svc_create_xprt(nfsd_serv, transport, &init_net, + PF_INET6, port, SVC_SOCK_ANONYMOUS); + if (err < 0 && err != -EAFNOSUPPORT) + goto out_close; + + /* Decrease the count, but don't shut down the service */ + nfsd_serv->sv_nrthreads--; + return 0; +out_close: + xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); + if (xprt != NULL) { + svc_close_xprt(xprt); + svc_xprt_put(xprt); + } +out_err: + svc_destroy(nfsd_serv); + return err; +} + +/* + * A transport listener is removed by writing a "-", it's transport + * name, and it's port number. + */ +static ssize_t __write_ports_delxprt(char *buf) +{ + struct svc_xprt *xprt; + char transport[16]; + int port; + + if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) + return -EINVAL; + + if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL) + return -EINVAL; + + xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); + if (xprt == NULL) + return -ENOTCONN; + + svc_close_xprt(xprt); + svc_xprt_put(xprt); + return 0; +} + +static ssize_t __write_ports(struct file *file, char *buf, size_t size) +{ + if (size == 0) + return __write_ports_names(buf); + + if (isdigit(buf[0])) + return __write_ports_addfd(buf); + + if (buf[0] == '-' && isdigit(buf[1])) + return __write_ports_delfd(buf); + + if (isalpha(buf[0])) + return __write_ports_addxprt(buf); + + if (buf[0] == '-' && isalpha(buf[1])) + return __write_ports_delxprt(buf); + + return -EINVAL; +} + +/** + * write_ports - Pass a socket file descriptor or transport name to listen on + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with a '\n'-terminated C + * string containing a whitespace-separated list of + * named NFSD listeners; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing a bound + * but unconnected socket that is to be + * used as an NFSD listener; listen(3) + * must be called for a SOCK_STREAM + * socket, otherwise it is ignored + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique alphanumeric name of + * the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a "-" followed + * by an integer value representing a + * previously passed in socket file + * descriptor + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service no longer listens on that socket; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique name of the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a transport + * name and an unsigned integer value + * representing the port to listen on, + * separated by whitespace + * size: non-zero length of C 
string in @buf + * Output: + * On success: returns zero; NFS service is started + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a "-" followed + * by a transport name and an unsigned + * integer value representing the port + * to listen on, separated by whitespace + * size: non-zero length of C string in @buf + * Output: + * On success: returns zero; NFS service no longer listens + * on that transport + * On error: return code is a negative errno value + */ +static ssize_t write_ports(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_ports(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + + +int nfsd_max_blksize; + +/** + * write_maxblksize - Set or report the current NFS blksize + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of the current NFS blksize + * setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + if (size > 0) { + int bsize; + int rv = get_int(&mesg, &bsize); + if (rv) + return rv; + /* force bsize into allowed range and + * required alignment. + */ + if (bsize < 1024) + bsize = 1024; + if (bsize > NFSSVC_MAXBLKSIZE) + bsize = NFSSVC_MAXBLKSIZE; + bsize &= ~(1024-1); + mutex_lock(&nfsd_mutex); + if (nfsd_serv) { + mutex_unlock(&nfsd_mutex); + return -EBUSY; + } + nfsd_max_blksize = bsize; + mutex_unlock(&nfsd_mutex); + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", + nfsd_max_blksize); +} + +#ifdef CONFIG_NFSD_V4 +static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) +{ + char *mesg = buf; + int rv, i; + + if (size > 0) { + if (nfsd_serv) + return -EBUSY; + rv = get_int(&mesg, &i); + if (rv) + return rv; + /* + * Some sanity checking. We don't have a reason for + * these particular numbers, but problems with the + * extremes are: + * - Too short: the briefest network outage may + * cause clients to lose all their locks. Also, + * the frequent polling may be wasteful. + * - Too long: do you really want reboot recovery + * to take more than an hour? Or to make other + * clients wait an hour before being able to + * revoke a dead client's locks? 
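+ *
+ * (Illustrative note: the check below therefore accepts values between
+ * 10 and 3600 seconds, e.g. writing "90\n" to the nfsv4leasetime file
+ * while nfsd is not running sets a 90-second lease.)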
+ */ + if (i < 10 || i > 3600) + return -EINVAL; + *time = i; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); +} + +static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __nfsd4_write_time(file, buf, size, time); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/** + * write_leasetime - Set or report the current NFSv4 lease time + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFSv4 lease expiry time + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing unsigned integer value of the + * current lease expiry time; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +{ + return nfsd4_write_time(file, buf, size, &nfsd4_lease); +} + +/** + * write_gracetime - Set or report current NFSv4 grace period time + * + * As above, but sets the time of the NFSv4 grace period. + * + * Note this should never be set to less than the *previous* + * lease-period time, but we don't try to enforce this. (In the common + * case (a new boot), we don't know what the previous lease time was + * anyway.) + */ +static ssize_t write_gracetime(struct file *file, char *buf, size_t size) +{ + return nfsd4_write_time(file, buf, size, &nfsd4_grace); +} + +extern char *nfs4_recoverydir(void); + +static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + char *recdir; + int len, status; + + if (size > 0) { + if (nfsd_serv) + return -EBUSY; + if (size > PATH_MAX || buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + recdir = mesg; + len = qword_get(&mesg, recdir, size); + if (len <= 0) + return -EINVAL; + + status = nfs4_reset_recoverydir(recdir); + if (status) + return status; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", + nfs4_recoverydir()); +} + +/** + * write_recoverydir - Set or report the pathname of the recovery directory + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing the pathname + * of the directory on a local file + * system containing permanent NFSv4 + * recovery data + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing the current recovery pathname setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_recoverydir(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +#endif + +/*----------------------------------------------------------------------------*/ +/* + * populating the filesystem. 
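+ *
+ * (Illustrative note, not part of the original patch: the magic number
+ * 0x6e667364 passed to simple_fill_super() below is just the ASCII
+ * bytes "nfsd".)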
+ */ + +static int nfsd_fill_super(struct super_block * sb, void * data, int silent) +{ + static struct tree_descr nfsd_files[] = { +#ifdef CONFIG_NFSD_DEPRECATED + [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, + [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, + [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, + [NFSD_Export] = {".export", &transaction_ops, S_IWUSR}, + [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, + [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, +#endif + [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, + [NFSD_Export_features] = {"export_features", + &export_features_operations, S_IRUGO}, + [NFSD_FO_UnlockIP] = {"unlock_ip", + &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_FO_UnlockFS] = {"unlock_filesystem", + &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, + [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) + [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ +#ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, +#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +} + +static struct dentry *nfsd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, nfsd_fill_super); +} + +static struct file_system_type nfsd_fs_type = { + .owner = THIS_MODULE, + .name = "nfsd", + .mount = nfsd_mount, + .kill_sb = kill_litter_super, +}; + +#ifdef CONFIG_PROC_FS +static int create_proc_exports_entry(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/nfs", NULL); + if (!entry) + return -ENOMEM; + entry = proc_create("exports", 0, entry, &exports_operations); + if (!entry) + return -ENOMEM; + return 0; +} +#else /* CONFIG_PROC_FS */ +static int create_proc_exports_entry(void) +{ + return 0; +} +#endif + +static int __init init_nfsd(void) +{ + int retval; + printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); + + retval = nfs4_state_init(); /* nfs4 locking state */ + if (retval) + return retval; + nfsd_stat_init(); /* Statistics */ + retval = nfsd_reply_cache_init(); + if (retval) + goto out_free_stat; + retval = nfsd_export_init(); + if (retval) + goto out_free_cache; + nfsd_lockd_init(); /* lockd->nfsd callbacks */ + retval = nfsd_idmap_init(); + if (retval) + goto out_free_lockd; + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; + return 0; +out_free_all: + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); +out_free_idmap: + nfsd_idmap_shutdown(); +out_free_lockd: + nfsd_lockd_shutdown(); + 
nfsd_export_shutdown(); +out_free_cache: + nfsd_reply_cache_shutdown(); +out_free_stat: + nfsd_stat_shutdown(); + nfsd4_free_slabs(); + return retval; +} + +static void __exit exit_nfsd(void) +{ + nfsd_export_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); + nfsd_stat_shutdown(); + nfsd_lockd_shutdown(); + nfsd_idmap_shutdown(); + nfsd4_free_slabs(); + unregister_filesystem(&nfsd_fs_type); +} + +MODULE_AUTHOR("Olaf Kirch "); +MODULE_LICENSE("GPL"); +module_init(init_nfsd) +module_exit(exit_nfsd) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h new file mode 100644 index 00000000..7ecfa242 --- /dev/null +++ b/fs/nfsd/nfsd.h @@ -0,0 +1,340 @@ +/* + * Hodge-podge collection of knfsd-related stuff. + * I will sort this out later. + * + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#ifndef LINUX_NFSD_NFSD_H +#define LINUX_NFSD_NFSD_H + +#include +#include + +#include +#include +#include +/* + * nfsd version + */ +#define NFSD_SUPPORTED_MINOR_VERSION 1 + +struct readdir_cd { + __be32 err; /* 0, nfserr, or nfserr_eof */ +}; + + +extern struct svc_program nfsd_program; +extern struct svc_version nfsd_version2, nfsd_version3, + nfsd_version4; +extern u32 nfsd_supported_minorversion; +extern struct mutex nfsd_mutex; +extern struct svc_serv *nfsd_serv; +extern spinlock_t nfsd_drc_lock; +extern unsigned int nfsd_drc_max_mem; +extern unsigned int nfsd_drc_mem_used; + +extern const struct seq_operations nfs_exports_op; + +/* + * Function prototypes. + */ +int nfsd_svc(unsigned short port, int nrservs); +int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); + +int nfsd_nrthreads(void); +int nfsd_nrpools(void); +int nfsd_get_nrthreads(int n, int *); +int nfsd_set_nrthreads(int n, int *); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +#ifdef CONFIG_NFSD_V2_ACL +extern struct svc_version nfsd_acl_version2; +#else +#define nfsd_acl_version2 NULL +#endif +#ifdef CONFIG_NFSD_V3_ACL +extern struct svc_version nfsd_acl_version3; +#else +#define nfsd_acl_version3 NULL +#endif +#endif + +enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; +int nfsd_vers(int vers, enum vers_op change); +int nfsd_minorversion(u32 minorversion, enum vers_op change); +void nfsd_reset_versions(void); +int nfsd_create_serv(void); + +extern int nfsd_max_blksize; + +static inline int nfsd_v4client(struct svc_rqst *rq) +{ + return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; +} + +/* + * NFSv4 State + */ +#ifdef CONFIG_NFSD_V4 +extern unsigned int max_delegations; +int nfs4_state_init(void); +void nfsd4_free_slabs(void); +int nfs4_state_start(void); +void nfs4_state_shutdown(void); +void nfs4_reset_lease(time_t leasetime); +int nfs4_reset_recoverydir(char *recdir); +#else +static inline int nfs4_state_init(void) { return 0; } +static inline void nfsd4_free_slabs(void) { } +static inline int nfs4_state_start(void) { return 0; } +static inline void nfs4_state_shutdown(void) { } +static inline void nfs4_reset_lease(time_t leasetime) { } +static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } +#endif + +/* + * lockd binding + */ +void nfsd_lockd_init(void); +void nfsd_lockd_shutdown(void); + + +/* + * These macros provide pre-xdr'ed values for faster operation. 
+ */ +#define nfs_ok cpu_to_be32(NFS_OK) +#define nfserr_perm cpu_to_be32(NFSERR_PERM) +#define nfserr_noent cpu_to_be32(NFSERR_NOENT) +#define nfserr_io cpu_to_be32(NFSERR_IO) +#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) +#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) +#define nfserr_acces cpu_to_be32(NFSERR_ACCES) +#define nfserr_exist cpu_to_be32(NFSERR_EXIST) +#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) +#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) +#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) +#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) +#define nfserr_inval cpu_to_be32(NFSERR_INVAL) +#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) +#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) +#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) +#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) +#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) +#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) +#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) +#define nfserr_stale cpu_to_be32(NFSERR_STALE) +#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) +#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) +#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) +#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) +#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) +#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) +#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) +#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) +#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) +#define nfserr_denied cpu_to_be32(NFSERR_DENIED) +#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) +#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) +#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_same cpu_to_be32(NFSERR_SAME) +#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) +#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE) +#define nfserr_moved cpu_to_be32(NFSERR_MOVED) +#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) +#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) +#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) +#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) +#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) +#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) +#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_lock_range cpu_to_be32(NFSERR_LOCK_RANGE) +#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) +#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) +#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) +#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER) +#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) +#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) +#define nfserr_grace cpu_to_be32(NFSERR_GRACE) +#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) +#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) +#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) +#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) +#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) +#define nfserr_badlayout 
cpu_to_be32(NFS4ERR_BADLAYOUT) +#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) +#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION) +#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT) +#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY) +#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION) +#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED) +#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY) +#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER) +#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE) +#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT) +#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT) +#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE) +#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED) +#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS) +#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG) +#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG) +#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE) +#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP) +#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND) +#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS) +#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION) +#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP) +#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY) +#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE) +#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY) +#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT) +#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION) +#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP) +#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT) +#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP) +#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED) +#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE) +#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL) +#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) +#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) +#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) + +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it gets dropped. + * Client should resend eventually + */ +#define nfserr_dropit cpu_to_be32(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof cpu_to_be32(30001) +/* replay detected */ +#define nfserr_replay_me cpu_to_be32(11001) +/* nfs41 replay detected */ +#define nfserr_replay_cache cpu_to_be32(11002) + +/* Check for dir entries '.' and '..' */ +#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) + +/* + * Time of server startup + */ +extern struct timeval nfssvc_boot; + +#ifdef CONFIG_NFSD_V4 + +extern time_t nfsd4_lease; +extern time_t nfsd4_grace; + +/* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. otherwise, + * we might process an operation with side effects, and be unable to + * tell the client that the operation succeeded. + * + * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an "ordinary" _successful_ operation. 
(GETATTR, + * READ, READDIR, and READLINK have their own buffer checks.) if we + * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. + * + * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an operation which has failed with NFS4ERR_RESOURCE. + * care is taken to ensure that we never fall below this level for any + * reason. + */ +#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ +#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ + +/* + * The following attributes are currently not supported by the NFSv4 server: + * ARCHIVE (deprecated anyway) + * HIDDEN (unlikely to be supported any time soon) + * MIMETYPE (unlikely to be supported any time soon) + * QUOTA_* (will be supported in a forthcoming patch) + * SYSTEM (unlikely to be supported any time soon) + * TIME_BACKUP (unlikely to be supported any time soon) + * TIME_CREATE (unlikely to be supported any time soon) + */ +#define NFSD4_SUPPORTED_ATTRS_WORD0 \ +(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ + | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ + | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ + | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ + | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ + | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ + | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) + +#define NFSD4_SUPPORTED_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ + | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ + | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ + | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ + | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) + +#define NFSD4_SUPPORTED_ATTRS_WORD2 0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) + +static inline u32 nfsd_suppattrs0(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 + : NFSD4_SUPPORTED_ATTRS_WORD0; +} + +static inline u32 nfsd_suppattrs1(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 + : NFSD4_SUPPORTED_ATTRS_WORD1; +} + +static inline u32 nfsd_suppattrs2(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 + : NFSD4_SUPPORTED_ATTRS_WORD2; +} + +/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ +#define NFSD_WRITEONLY_ATTRS_WORD1 \ +(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These are the only attrs allowed in CREATE/OPEN/SETATTR. 
*/ +#define NFSD_WRITEABLE_ATTRS_WORD0 \ +(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEABLE_ATTRS_WORD2 0 + +#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ + NFSD_WRITEABLE_ATTRS_WORD0 +/* + * we currently store the exclusive create verifier in the v_{a,m}time + * attributes so the client can't set these at create time using EXCLUSIVE4_1 + */ +#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \ + (NFSD_WRITEABLE_ATTRS_WORD1 & \ + ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)) +#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ + NFSD_WRITEABLE_ATTRS_WORD2 + +#endif /* CONFIG_NFSD_V4 */ + +#endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c new file mode 100644 index 00000000..90c6aa6d --- /dev/null +++ b/fs/nfsd/nfsfh.c @@ -0,0 +1,693 @@ +/* + * NFS server file handle treatment. + * + * Copyright (C) 1995, 1996 Olaf Kirch + * Portions Copyright (C) 1999 G. Allen Morris III + * Extensive rewrite by Neil Brown Southern-Spring 1999 + * ... and again Southern-Winter 2001 to support export_operations + */ + +#include + +#include +#include "nfsd.h" +#include "vfs.h" +#include "auth.h" + +#define NFSDDBG_FACILITY NFSDDBG_FH + + +/* + * our acceptability function. + * if NOSUBTREECHECK, accept anything + * if not, require that we can walk up to exp->ex_dentry + * doing some checks on the 'x' bits + */ +static int nfsd_acceptable(void *expv, struct dentry *dentry) +{ + struct svc_export *exp = expv; + int rv; + struct dentry *tdentry; + struct dentry *parent; + + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) + return 1; + + tdentry = dget(dentry); + while (tdentry != exp->ex_path.dentry && !IS_ROOT(tdentry)) { + /* make sure parents give x permission to user */ + int err; + parent = dget_parent(tdentry); + err = inode_permission(parent->d_inode, MAY_EXEC); + if (err < 0) { + dput(parent); + break; + } + dput(tdentry); + tdentry = parent; + } + if (tdentry != exp->ex_path.dentry) + dprintk("nfsd_acceptable failed at %p %s\n", tdentry, tdentry->d_name.name); + rv = (tdentry == exp->ex_path.dentry); + dput(tdentry); + return rv; +} + +/* Type check. The correct error return for type mismatches does not seem to be + * generally agreed upon. SunOS seems to use EISDIR if file isn't S_IFREG; a + * comment in the NFSv3 spec says this is incorrect (implementation notes for + * the write call). + */ +static inline __be32 +nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) +{ + /* Type can be negative when creating hardlinks - not to a dir */ + if (type > 0 && (mode & S_IFMT) != type) { + if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) + return nfserr_symlink; + else if (type == S_IFDIR) + return nfserr_notdir; + else if ((mode & S_IFMT) == S_IFDIR) + return nfserr_isdir; + else + return nfserr_inval; + } + if (type < 0 && (mode & S_IFMT) == -type) { + if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) + return nfserr_symlink; + else if (type == -S_IFDIR) + return nfserr_isdir; + else + return nfserr_notdir; + } + return 0; +} + +static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, + struct svc_export *exp) +{ + int flags = nfsexp_flags(rqstp, exp); + + /* Check if the request originated from a secure port. 
 */ + if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + dprintk(KERN_WARNING + "nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return nfserr_perm; + } + + /* Set user creds for this exportpoint */ + return nfserrno(nfsd_setuser(rqstp, exp)); +} + +static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, + struct dentry *dentry, struct svc_export *exp) +{ + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return nfs_ok; + /* + * v2/v3 clients have no need for the V4ROOT export--they use + * the mount protocol instead; also, further V4ROOT checks may be + * in v4-specific code, in which case v2/v3 clients could bypass + * them. + */ + if (!nfsd_v4client(rqstp)) + return nfserr_stale; + /* + * We're exposing only the directories and symlinks that have to be + * traversed on the way to real exports: + */ + if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) && + !S_ISLNK(dentry->d_inode->i_mode))) + return nfserr_stale; + /* + * A pseudoroot export gives permission to access only one + * single directory; the kernel has to make another upcall + * before granting access to anything else under it: + */ + if (unlikely(dentry != exp->ex_path.dentry)) + return nfserr_stale; + return nfs_ok; +} + +/* + * Use the given filehandle to look up the corresponding export and + * dentry. On success, the results are used to set fh_export and + * fh_dentry. + */ +static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) +{ + struct knfsd_fh *fh = &fhp->fh_handle; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; + int fileid_type; + int data_left = fh->fh_size/4; + __be32 error; + + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + if (rqstp->rq_vers == 4 && fh->fh_size == 0) + return nfserr_nofilehandle; + + if (fh->fh_version == 1) { + int len; + + if (--data_left < 0) + return error; + if (fh->fh_auth_type != 0) + return error; + len = key_len(fh->fh_fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { + /* deprecated, convert to type 3 */ + len = key_len(FSID_ENCODE_DEV)/4; + fh->fh_fsid_type = FSID_ENCODE_DEV; + fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1]))); + fh->fh_fsid[1] = fh->fh_fsid[2]; + } + data_left -= len; + if (data_left < 0) + return error; + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; + dev_t xdev; + ino_t xino; + + if (fh->fh_size != NFS_FHSIZE) + return error; + /* assume old filehandle format */ + xdev = old_decode_dev(fh->ofh_xdev); + xino = u32_to_ino_t(fh->ofh_xino); + mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); + exp = rqst_exp_find(rqstp, FSID_DEV, tfh); + } + + error = nfserr_stale; + if (PTR_ERR(exp) == -ENOENT) + return error; + + if (IS_ERR(exp)) + return nfserrno(PTR_ERR(exp)); + + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + /* Elevate privileges so that the lack of 'r' or 'x' + * permission on some parent directory will + * not stop exportfs_decode_fh from being able + * to reconnect a directory into the dentry cache. + * The same problem can affect "SUBTREECHECK" exports, + * but as nfsd_acceptable depends on correct + * access control settings being in effect, we cannot + * fix that case easily. 
+ */ + struct cred *new = prepare_creds(); + if (!new) + return nfserrno(-ENOMEM); + new->cap_effective = + cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + put_cred(override_creds(new)); + put_cred(new); + } else { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + } + + /* + * Look up the dentry using the NFS file handle. + */ + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + + if (fh->fh_version != 1) { + sfid.i32.ino = fh->ofh_ino; + sfid.i32.gen = fh->ofh_generation; + sfid.i32.parent_ino = fh->ofh_dirino; + fid = &sfid; + data_left = 3; + if (fh->ofh_dirino == 0) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + } else + fileid_type = fh->fh_fileid_type; + + if (fileid_type == FILEID_ROOT) + dentry = dget(exp->ex_path.dentry); + else { + dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, + data_left, fileid_type, + nfsd_acceptable, exp); + } + if (dentry == NULL) + goto out; + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) != -EINVAL) + error = nfserrno(PTR_ERR(dentry)); + goto out; + } + + if (S_ISDIR(dentry->d_inode->i_mode) && + (dentry->d_flags & DCACHE_DISCONNECTED)) { + printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + } + + fhp->fh_dentry = dentry; + fhp->fh_export = exp; + return 0; +out: + exp_put(exp); + return error; +} + +/** + * fh_verify - filehandle lookup and access checking + * @rqstp: pointer to current rpc request + * @fhp: filehandle to be verified + * @type: expected type of object pointed to by filehandle + * @access: type of access needed to object + * + * Look up a dentry from the on-the-wire filehandle, check the client's + * access to the export, and set the current task's credentials. + * + * Regardless of success or failure of fh_verify(), fh_put() should be + * called on @fhp when the caller is finished with the filehandle. + * + * fh_verify() may be called multiple times on a given filehandle, for + * example, when processing an NFSv4 compound. The first call will look + * up a dentry using the on-the-wire filehandle. Subsequent calls will + * skip the lookup and just perform the other checks and possibly change + * the current task's credentials. + * + * @type specifies the type of object expected using one of the S_IF* + * constants defined in include/linux/stat.h. The caller may use zero + * to indicate that it doesn't care, or a negative integer to indicate + * that it expects something not of the given type. + * + * @access is formed from the NFSD_MAY_* constants defined in + * include/linux/nfsd/nfsd.h. + */ +__be32 +fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) +{ + struct svc_export *exp; + struct dentry *dentry; + __be32 error; + + dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp)); + + if (!fhp->fh_dentry) { + error = nfsd_set_fh_dentry(rqstp, fhp); + if (error) + goto out; + } + dentry = fhp->fh_dentry; + exp = fhp->fh_export; + /* + * We still have to do all these permission checks, even when + * fh_dentry is already set: + * - fh_verify may be called multiple times with different + * "access" arguments (e.g. nfsd_proc_create calls + * fh_verify(...,NFSD_MAY_EXEC) first, then later (in + * nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE). 
+ * - in the NFSv4 case, the filehandle may have been filled + * in by fh_compose, and given a dentry, but further + * compound operations performed with that filehandle + * still need permissions checks. In the worst case, a + * mountpoint crossing may have changed the export + * options, and we may now need to use a different uid + * (for example, if different id-squashing options are in + * effect on the new filesystem). + */ + error = check_pseudo_root(rqstp, dentry, exp); + if (error) + goto out; + + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + + error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); + if (error) + goto out; + + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS) + goto skip_pseudoflavor_check; + /* + * Clients may expect to be able to use auth_sys during mount, + * even if they use gss for everything else; see section 2.3.2 + * of rfc 2623. + */ + if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT + && exp->ex_path.dentry == dentry) + goto skip_pseudoflavor_check; + + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; + +skip_pseudoflavor_check: + /* Finally, check access permissions. */ + error = nfsd_permission(rqstp, exp, dentry, access); + + if (error) { + dprintk("fh_verify: %s/%s permission failure, " + "acc=%x, error=%d\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, + access, ntohl(error)); + } +out: + if (error == nfserr_stale) + nfsdstats.fh_stale++; + return error; +} + + +/* + * Compose a file handle for an NFS reply. + * + * Note that when first composed, the dentry may not yet have + * an inode. In this case a call to fh_update should be made + * before the fh goes out on the wire ... 
+ */ +static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry) +{ + if (dentry != exp->ex_path.dentry) { + struct fid *fid = (struct fid *) + (fhp->fh_handle.fh_auth + fhp->fh_handle.fh_size/4 - 1); + int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; + int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK); + + fhp->fh_handle.fh_fileid_type = + exportfs_encode_fh(dentry, fid, &maxsize, subtreecheck); + fhp->fh_handle.fh_size += maxsize * 4; + } else { + fhp->fh_handle.fh_fileid_type = FILEID_ROOT; + } +} + +/* + * for composing old style file handles + */ +static inline void _fh_update_old(struct dentry *dentry, + struct svc_export *exp, + struct knfsd_fh *fh) +{ + fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino); + fh->ofh_generation = dentry->d_inode->i_generation; + if (S_ISDIR(dentry->d_inode->i_mode) || + (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) + fh->ofh_dirino = 0; +} + +static bool is_root_export(struct svc_export *exp) +{ + return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root; +} + +static struct super_block *exp_sb(struct svc_export *exp) +{ + return exp->ex_path.dentry->d_inode->i_sb; +} + +static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) +{ + switch (fsid_type) { + case FSID_DEV: + if (!old_valid_dev(exp_sb(exp)->s_dev)) + return 0; + /* FALL THROUGH */ + case FSID_MAJOR_MINOR: + case FSID_ENCODE_DEV: + return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV; + case FSID_NUM: + return exp->ex_flags & NFSEXP_FSID; + case FSID_UUID8: + case FSID_UUID16: + if (!is_root_export(exp)) + return 0; + /* fall through */ + case FSID_UUID4_INUM: + case FSID_UUID16_INUM: + return exp->ex_uuid != NULL; + } + return 1; +} + + +static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh) +{ + u8 version; + u8 fsid_type; +retry: + version = 1; + if (ref_fh && ref_fh->fh_export == exp) { + version = ref_fh->fh_handle.fh_version; + fsid_type = ref_fh->fh_handle.fh_fsid_type; + + ref_fh = NULL; + + switch (version) { + case 0xca: + fsid_type = FSID_DEV; + break; + case 1: + break; + default: + goto retry; + } + + /* + * As the fsid -> filesystem mapping was guided by + * user-space, there is no guarantee that the filesystem + * actually supports that fsid type. If it doesn't we + * loop around again without ref_fh set. + */ + if (!fsid_type_ok_for_exp(fsid_type, exp)) + goto retry; + } else if (exp->ex_flags & NFSEXP_FSID) { + fsid_type = FSID_NUM; + } else if (exp->ex_uuid) { + if (fhp->fh_maxsize >= 64) { + if (is_root_export(exp)) + fsid_type = FSID_UUID16; + else + fsid_type = FSID_UUID16_INUM; + } else { + if (is_root_export(exp)) + fsid_type = FSID_UUID8; + else + fsid_type = FSID_UUID4_INUM; + } + } else if (!old_valid_dev(exp_sb(exp)->s_dev)) + /* for newer device numbers, we must use a newer fsid format */ + fsid_type = FSID_ENCODE_DEV; + else + fsid_type = FSID_DEV; + fhp->fh_handle.fh_version = version; + if (version) + fhp->fh_handle.fh_fsid_type = fsid_type; +} + +__be32 +fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, + struct svc_fh *ref_fh) +{ + /* ref_fh is a reference file handle. + * if it is non-null and for the same filesystem, then we should compose + * a filehandle which is of the same version, where possible. 
+ * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca + * Then create a 32byte filehandle using nfs_fhbase_old + * + */ + + struct inode * inode = dentry->d_inode; + struct dentry *parent = dentry->d_parent; + __u32 *datap; + dev_t ex_dev = exp_sb(exp)->s_dev; + + dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n", + MAJOR(ex_dev), MINOR(ex_dev), + (long) exp->ex_path.dentry->d_inode->i_ino, + parent->d_name.name, dentry->d_name.name, + (inode ? inode->i_ino : 0)); + + /* Choose filehandle version and fsid type based on + * the reference filehandle (if it is in the same export) + * or the export options. + */ + set_version_and_fsid_type(fhp, exp, ref_fh); + + if (ref_fh == fhp) + fh_put(ref_fh); + + if (fhp->fh_locked || fhp->fh_dentry) { + printk(KERN_ERR "fh_compose: fh %s/%s not initialized!\n", + parent->d_name.name, dentry->d_name.name); + } + if (fhp->fh_maxsize < NFS_FHSIZE) + printk(KERN_ERR "fh_compose: called with maxsize %d! %s/%s\n", + fhp->fh_maxsize, + parent->d_name.name, dentry->d_name.name); + + fhp->fh_dentry = dget(dentry); /* our internal copy */ + fhp->fh_export = exp; + cache_get(&exp->h); + + if (fhp->fh_handle.fh_version == 0xca) { + /* old style filehandle please */ + memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); + fhp->fh_handle.fh_size = NFS_FHSIZE; + fhp->fh_handle.ofh_dcookie = 0xfeebbaca; + fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev); + fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev; + fhp->fh_handle.ofh_xino = + ino_t_to_u32(exp->ex_path.dentry->d_inode->i_ino); + fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry)); + if (inode) + _fh_update_old(dentry, exp, &fhp->fh_handle); + } else { + int len; + fhp->fh_handle.fh_auth_type = 0; + datap = fhp->fh_handle.fh_auth+0; + mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev, + exp->ex_path.dentry->d_inode->i_ino, + exp->ex_fsid, exp->ex_uuid); + + len = key_len(fhp->fh_handle.fh_fsid_type); + datap += len/4; + fhp->fh_handle.fh_size = 4 + len; + + if (inode) + _fh_update(fhp, exp, dentry); + if (fhp->fh_handle.fh_fileid_type == 255) { + fh_put(fhp); + return nfserr_opnotsupp; + } + } + + return 0; +} + +/* + * Update file handle information after changing a dentry. + * This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create + */ +__be32 +fh_update(struct svc_fh *fhp) +{ + struct dentry *dentry; + + if (!fhp->fh_dentry) + goto out_bad; + + dentry = fhp->fh_dentry; + if (!dentry->d_inode) + goto out_negative; + if (fhp->fh_handle.fh_version != 1) { + _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); + } else { + if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT) + goto out; + + _fh_update(fhp, fhp->fh_export, dentry); + if (fhp->fh_handle.fh_fileid_type == 255) + return nfserr_opnotsupp; + } +out: + return 0; + +out_bad: + printk(KERN_ERR "fh_update: fh not verified!\n"); + goto out; +out_negative: + printk(KERN_ERR "fh_update: %s/%s still negative!\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + goto out; +} + +/* + * Release a file handle. 
 + */ +void +fh_put(struct svc_fh *fhp) +{ + struct dentry * dentry = fhp->fh_dentry; + struct svc_export * exp = fhp->fh_export; + if (dentry) { + fh_unlock(fhp); + fhp->fh_dentry = NULL; + dput(dentry); +#ifdef CONFIG_NFSD_V3 + fhp->fh_pre_saved = 0; + fhp->fh_post_saved = 0; +#endif + } + if (exp) { + cache_put(&exp->h, &svc_export_cache); + fhp->fh_export = NULL; + } + return; +} + +/* + * Shorthand for dprintk()'s + */ +char * SVCFH_fmt(struct svc_fh *fhp) +{ + struct knfsd_fh *fh = &fhp->fh_handle; + + static char buf[80]; + sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x", + fh->fh_size, + fh->fh_base.fh_pad[0], + fh->fh_base.fh_pad[1], + fh->fh_base.fh_pad[2], + fh->fh_base.fh_pad[3], + fh->fh_base.fh_pad[4], + fh->fh_base.fh_pad[5]); + return buf; +} + +enum fsid_source fsid_source(struct svc_fh *fhp) +{ + if (fhp->fh_handle.fh_version != 1) + return FSIDSOURCE_DEV; + switch(fhp->fh_handle.fh_fsid_type) { + case FSID_DEV: + case FSID_ENCODE_DEV: + case FSID_MAJOR_MINOR: + if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV) + return FSIDSOURCE_DEV; + break; + case FSID_NUM: + if (fhp->fh_export->ex_flags & NFSEXP_FSID) + return FSIDSOURCE_FSID; + break; + default: + break; + } + /* either a UUID type filehandle, or the filehandle doesn't + * match the export. + */ + if (fhp->fh_export->ex_flags & NFSEXP_FSID) + return FSIDSOURCE_FSID; + if (fhp->fh_export->ex_uuid) + return FSIDSOURCE_UUID; + return FSIDSOURCE_DEV; +} diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h new file mode 100644 index 00000000..c16f8d83 --- /dev/null +++ b/fs/nfsd/nfsfh.h @@ -0,0 +1,206 @@ +/* Copyright (C) 1995, 1996, 1997 Olaf Kirch */ + +#ifndef _LINUX_NFSD_FH_INT_H +#define _LINUX_NFSD_FH_INT_H + +#include + +enum nfsd_fsid { + FSID_DEV = 0, + FSID_NUM, + FSID_MAJOR_MINOR, + FSID_ENCODE_DEV, + FSID_UUID4_INUM, + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, +}; + +enum fsid_source { + FSIDSOURCE_DEV, + FSIDSOURCE_FSID, + FSIDSOURCE_UUID, +}; +extern enum fsid_source fsid_source(struct svc_fh *fhp); + + +/* This might look a little large to "inline" but in all calls except + * one, 'vers' is constant so most of the function disappears. 
+ */ +static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, + u32 fsid, unsigned char *uuid) +{ + u32 *up; + switch(vers) { + case FSID_DEV: + fsidv[0] = htonl((MAJOR(dev)<<16) | + MINOR(dev)); + fsidv[1] = ino_t_to_u32(ino); + break; + case FSID_NUM: + fsidv[0] = fsid; + break; + case FSID_MAJOR_MINOR: + fsidv[0] = htonl(MAJOR(dev)); + fsidv[1] = htonl(MINOR(dev)); + fsidv[2] = ino_t_to_u32(ino); + break; + + case FSID_ENCODE_DEV: + fsidv[0] = new_encode_dev(dev); + fsidv[1] = ino_t_to_u32(ino); + break; + + case FSID_UUID4_INUM: + /* 4 byte fsid and inode number */ + up = (u32*)uuid; + fsidv[0] = ino_t_to_u32(ino); + fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3]; + break; + + case FSID_UUID8: + /* 8 byte fsid */ + up = (u32*)uuid; + fsidv[0] = up[0] ^ up[2]; + fsidv[1] = up[1] ^ up[3]; + break; + + case FSID_UUID16: + /* 16 byte fsid - NFSv3+ only */ + memcpy(fsidv, uuid, 16); + break; + + case FSID_UUID16_INUM: + /* 8 byte inode and 16 byte fsid */ + *(u64*)fsidv = (u64)ino; + memcpy(fsidv+2, uuid, 16); + break; + default: BUG(); + } +} + +static inline int key_len(int type) +{ + switch(type) { + case FSID_DEV: return 8; + case FSID_NUM: return 4; + case FSID_MAJOR_MINOR: return 12; + case FSID_ENCODE_DEV: return 8; + case FSID_UUID4_INUM: return 8; + case FSID_UUID8: return 8; + case FSID_UUID16: return 16; + case FSID_UUID16_INUM: return 24; + default: return 0; + } +} + +/* + * Shorthand for dprintk()'s + */ +extern char * SVCFH_fmt(struct svc_fh *fhp); + +/* + * Function prototypes + */ +__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int); +__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); +__be32 fh_update(struct svc_fh *); +void fh_put(struct svc_fh *); + +static __inline__ struct svc_fh * +fh_copy(struct svc_fh *dst, struct svc_fh *src) +{ + WARN_ON(src->fh_dentry || src->fh_locked); + + *dst = *src; + return dst; +} + +static inline void +fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +{ + dst->fh_size = src->fh_size; + memcpy(&dst->fh_base, &src->fh_base, src->fh_size); +} + +static __inline__ struct svc_fh * +fh_init(struct svc_fh *fhp, int maxsize) +{ + memset(fhp, 0, sizeof(*fhp)); + fhp->fh_maxsize = maxsize; + return fhp; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Fill in the pre_op attr for the wcc data + */ +static inline void +fill_pre_wcc(struct svc_fh *fhp) +{ + struct inode *inode; + + inode = fhp->fh_dentry->d_inode; + if (!fhp->fh_pre_saved) { + fhp->fh_pre_mtime = inode->i_mtime; + fhp->fh_pre_ctime = inode->i_ctime; + fhp->fh_pre_size = inode->i_size; + fhp->fh_pre_change = inode->i_version; + fhp->fh_pre_saved = 1; + } +} + +extern void fill_post_wcc(struct svc_fh *); +#else +#define fill_pre_wcc(ignored) +#define fill_post_wcc(notused) +#endif /* CONFIG_NFSD_V3 */ + + +/* + * Lock a file handle/inode + * NOTE: both fh_lock and fh_unlock are done "by hand" in + * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once + * so, any changes here should be reflected there. 
+ */ + +static inline void +fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) +{ + struct dentry *dentry = fhp->fh_dentry; + struct inode *inode; + + BUG_ON(!dentry); + + if (fhp->fh_locked) { + printk(KERN_WARNING "fh_lock: %s/%s already locked!\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + return; + } + + inode = dentry->d_inode; + mutex_lock_nested(&inode->i_mutex, subclass); + fill_pre_wcc(fhp); + fhp->fh_locked = 1; +} + +static inline void +fh_lock(struct svc_fh *fhp) +{ + fh_lock_nested(fhp, I_MUTEX_NORMAL); +} + +/* + * Unlock a file handle/inode + */ +static inline void +fh_unlock(struct svc_fh *fhp) +{ + if (fhp->fh_locked) { + fill_post_wcc(fhp); + mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); + fhp->fh_locked = 0; + } +} + +#endif /* _LINUX_NFSD_FH_INT_H */ diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c new file mode 100644 index 00000000..e15dc45f --- /dev/null +++ b/fs/nfsd/nfsproc.c @@ -0,0 +1,755 @@ +/* + * Process version 2 NFS requests. + * + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#include + +#include "cache.h" +#include "xdr.h" +#include "vfs.h" + +typedef struct svc_rqst svc_rqst; +typedef struct svc_buf svc_buf; + +#define NFSDDBG_FACILITY NFSDDBG_PROC + + +static __be32 +nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +static __be32 +nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp) +{ + if (err) return err; + return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, + resp->fh.fh_dentry, + &resp->stat)); +} +static __be32 +nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp) +{ + if (err) return err; + return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, + resp->fh.fh_dentry, + &resp->stat)); +} +/* + * Get a file's attributes + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, + struct nfsd_attrstat *resp) +{ + __be32 nfserr; + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + return nfsd_return_attrs(nfserr, resp); +} + +/* + * Set a file's attributes + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp, + struct nfsd_attrstat *resp) +{ + __be32 nfserr; + dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n", + SVCFH_fmt(&argp->fh), + argp->attrs.ia_valid, (long) argp->attrs.ia_size); + + fh_copy(&resp->fh, &argp->fh); + nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0); + return nfsd_return_attrs(nfserr, resp); +} + +/* + * Look up a path name component + * Note: the dentry in the resp->fh may be negative if the file + * doesn't exist yet. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp, + struct nfsd_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LOOKUP %s %.*s\n", + SVCFH_fmt(&argp->fh), argp->len, argp->name); + + fh_init(&resp->fh, NFS_FHSIZE); + nfserr = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, + &resp->fh); + + fh_put(&argp->fh); + return nfsd_return_dirop(nfserr, resp); +} + +/* + * Read a symlink. + */ +static __be32 +nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp, + struct nfsd_readlinkres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); + + /* Read the symlink. 
*/ + resp->len = NFS_MAXPATHLEN; + nfserr = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); + + fh_put(&argp->fh); + return nfserr; +} + +/* + * Read a portion of a file. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, + struct nfsd_readres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: READ %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, argp->offset); + + /* Obtain buffer pointer for payload. 19 is 1 word for + * status, 17 words for fattr, and 1 word for the byte count. + */ + + if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_NOTICE + "oversized read request from %s (%d bytes)\n", + svc_print_addr(rqstp, buf, sizeof(buf)), + argp->count); + argp->count = NFSSVC_MAXBLKSIZE_V2; + } + svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); + + resp->count = argp->count; + nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, + rqstp->rq_vec, argp->vlen, + &resp->count); + + if (nfserr) return nfserr; + return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, + resp->fh.fh_dentry, + &resp->stat)); +} + +/* + * Write data to a file + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, + struct nfsd_attrstat *resp) +{ + __be32 nfserr; + int stable = 1; + unsigned long cnt = argp->len; + + dprintk("nfsd: WRITE %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->len, argp->offset); + + nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, + argp->offset, + rqstp->rq_vec, argp->vlen, + &cnt, + &stable); + return nfsd_return_attrs(nfserr, resp); +} + +/* + * CREATE processing is complicated. The keyword here is `overloaded.' + * The parent directory is kept locked between the check for existence + * and the actual create() call in compliance with VFS protocols. + * N.B. After this call _both_ argp->fh and resp->fh need an fh_put + */ +static __be32 +nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, + struct nfsd_diropres *resp) +{ + svc_fh *dirfhp = &argp->fh; + svc_fh *newfhp = &resp->fh; + struct iattr *attr = &argp->attrs; + struct inode *inode; + struct dentry *dchild; + int type, mode; + __be32 nfserr; + dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); + + dprintk("nfsd: CREATE %s %.*s\n", + SVCFH_fmt(dirfhp), argp->len, argp->name); + + /* First verify the parent file handle */ + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); + if (nfserr) + goto done; /* must fh_put dirfhp even on error */ + + /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ + + nfserr = nfserr_acces; + if (!argp->len) + goto done; + nfserr = nfserr_exist; + if (isdotent(argp->name, argp->len)) + goto done; + fh_lock_nested(dirfhp, I_MUTEX_PARENT); + dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); + if (IS_ERR(dchild)) { + nfserr = nfserrno(PTR_ERR(dchild)); + goto out_unlock; + } + fh_init(newfhp, NFS_FHSIZE); + nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); + if (!nfserr && !dchild->d_inode) + nfserr = nfserr_noent; + dput(dchild); + if (nfserr) { + if (nfserr != nfserr_noent) + goto out_unlock; + /* + * If the new file handle wasn't verified, we can't tell + * whether the file exists or not. Time to bail ... 
+ */ + nfserr = nfserr_acces; + if (!newfhp->fh_dentry) { + printk(KERN_WARNING + "nfsd_proc_create: file handle not verified\n"); + goto out_unlock; + } + } + + inode = newfhp->fh_dentry->d_inode; + + /* Unfudge the mode bits */ + if (attr->ia_valid & ATTR_MODE) { + type = attr->ia_mode & S_IFMT; + mode = attr->ia_mode & ~S_IFMT; + if (!type) { + /* no type, so if target exists, assume same as that, + * else assume a file */ + if (inode) { + type = inode->i_mode & S_IFMT; + switch(type) { + case S_IFCHR: + case S_IFBLK: + /* reserve rdev for later checking */ + rdev = inode->i_rdev; + attr->ia_valid |= ATTR_SIZE; + + /* FALLTHROUGH */ + case S_IFIFO: + /* this is probably a permission check.. + * at least IRIX implements perm checking on + * echo thing > device-special-file-or-pipe + * by doing a CREATE with type==0 + */ + nfserr = nfsd_permission(rqstp, + newfhp->fh_export, + newfhp->fh_dentry, + NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); + if (nfserr && nfserr != nfserr_rofs) + goto out_unlock; + } + } else + type = S_IFREG; + } + } else if (inode) { + type = inode->i_mode & S_IFMT; + mode = inode->i_mode & ~S_IFMT; + } else { + type = S_IFREG; + mode = 0; /* ??? */ + } + + attr->ia_valid |= ATTR_MODE; + attr->ia_mode = mode; + + /* Special treatment for non-regular files according to the + * gospel of sun micro + */ + if (type != S_IFREG) { + if (type != S_IFBLK && type != S_IFCHR) { + rdev = 0; + } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) { + /* If you think you've seen the worst, grok this. */ + type = S_IFIFO; + } else { + /* Okay, char or block special */ + if (!rdev) + rdev = wanted; + } + + /* we've used the SIZE information, so discard it */ + attr->ia_valid &= ~ATTR_SIZE; + + /* Make sure the type and device matches */ + nfserr = nfserr_exist; + if (inode && type != (inode->i_mode & S_IFMT)) + goto out_unlock; + } + + nfserr = 0; + if (!inode) { + /* File doesn't exist. Create it and set attrs */ + nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len, + attr, type, rdev, newfhp); + } else if (type == S_IFREG) { + dprintk("nfsd: existing %s, valid=%x, size=%ld\n", + argp->name, attr->ia_valid, (long) attr->ia_size); + /* File already exists. We ignore all attributes except + * size, so that creat() behaves exactly like + * open(..., O_CREAT|O_TRUNC|O_WRONLY). + */ + attr->ia_valid &= ATTR_SIZE; + if (attr->ia_valid) + nfserr = nfsd_setattr(rqstp, newfhp, attr, 0, (time_t)0); + } + +out_unlock: + /* We don't really need to unlock, as fh_put does it. */ + fh_unlock(dirfhp); + +done: + fh_put(dirfhp); + return nfsd_return_dirop(nfserr, resp); +} + +static __be32 +nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh), + argp->len, argp->name); + + /* Unlink. 
-SIFDIR means file must not be a directory */ + nfserr = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len); + fh_put(&argp->fh); + return nfserr; +} + +static __be32 +nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RENAME %s %.*s -> \n", + SVCFH_fmt(&argp->ffh), argp->flen, argp->fname); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname); + + nfserr = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, + &argp->tfh, argp->tname, argp->tlen); + fh_put(&argp->ffh); + fh_put(&argp->tfh); + return nfserr; +} + +static __be32 +nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: LINK %s ->\n", + SVCFH_fmt(&argp->ffh)); + dprintk("nfsd: %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + nfserr = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, + &argp->ffh); + fh_put(&argp->ffh); + fh_put(&argp->tfh); + return nfserr; +} + +static __be32 +nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp, + void *resp) +{ + struct svc_fh newfh; + __be32 nfserr; + + dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", + SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, + argp->tlen, argp->tname); + + fh_init(&newfh, NFS_FHSIZE); + /* + * Create the link, look up new file and set attrs. + */ + nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, + argp->tname, argp->tlen, + &newfh, &argp->attrs); + + + fh_put(&argp->ffh); + fh_put(&newfh); + return nfserr; +} + +/* + * Make directory. This operation is not idempotent. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp, + struct nfsd_diropres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); + + if (resp->fh.fh_dentry) { + printk(KERN_WARNING + "nfsd_proc_mkdir: response already verified??\n"); + } + + argp->attrs.ia_valid &= ~ATTR_SIZE; + fh_init(&resp->fh, NFS_FHSIZE); + nfserr = nfsd_create(rqstp, &argp->fh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); + fh_put(&argp->fh); + return nfsd_return_dirop(nfserr, resp); +} + +/* + * Remove a directory + */ +static __be32 +nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp, + void *resp) +{ + __be32 nfserr; + + dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); + + nfserr = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len); + fh_put(&argp->fh); + return nfserr; +} + +/* + * Read a portion of a directory. 
+ */ +static __be32 +nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp, + struct nfsd_readdirres *resp) +{ + int count; + __be32 nfserr; + loff_t offset; + + dprintk("nfsd: READDIR %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, argp->cookie); + + /* Shrink to the client read size */ + count = (argp->count >> 2) - 2; + + /* Make sure we've room for the NULL ptr & eof flag */ + count -= 2; + if (count < 0) + count = 0; + + resp->buffer = argp->buffer; + resp->offset = NULL; + resp->buflen = count; + resp->common.err = nfs_ok; + /* Read directory and encode entries on the fly */ + offset = argp->cookie; + nfserr = nfsd_readdir(rqstp, &argp->fh, &offset, + &resp->common, nfssvc_encode_entry); + + resp->count = resp->buffer - argp->buffer; + if (resp->offset) + *resp->offset = htonl(offset); + + fh_put(&argp->fh); + return nfserr; +} + +/* + * Get file system info + */ +static __be32 +nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, + struct nfsd_statfsres *resp) +{ + __be32 nfserr; + + dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); + + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); + fh_put(&argp->fh); + return nfserr; +} + +/* + * NFSv2 Server procedures. + * Only the results of non-idempotent operations are cached. + */ +struct nfsd_void { int dummy; }; + +#define ST 1 /* status */ +#define FH 8 /* filehandle */ +#define AT 18 /* attributes */ + +static struct svc_procedure nfsd_procedures2[18] = { + [NFSPROC_NULL] = { + .pc_func = (svc_procfunc) nfsd_proc_null, + .pc_decode = (kxdrproc_t) nfssvc_decode_void, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFSPROC_GETATTR] = { + .pc_func = (svc_procfunc) nfsd_proc_getattr, + .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle, + .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_SETATTR] = { + .pc_func = (svc_procfunc) nfsd_proc_setattr, + .pc_decode = (kxdrproc_t) nfssvc_decode_sattrargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_sattrargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_ROOT] = { + .pc_decode = (kxdrproc_t) nfssvc_decode_void, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFSPROC_LOOKUP] = { + .pc_func = (svc_procfunc) nfsd_proc_lookup, + .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_diropres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_READLINK] = { + .pc_func = (svc_procfunc) nfsd_proc_readlink, + .pc_decode = (kxdrproc_t) nfssvc_decode_readlinkargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_readlinkres, + .pc_argsize = sizeof(struct nfsd_readlinkargs), + .pc_ressize = sizeof(struct nfsd_readlinkres), + .pc_cachetype = RC_NOCACHE, + 
.pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, + }, + [NFSPROC_READ] = { + .pc_func = (svc_procfunc) nfsd_proc_read, + .pc_decode = (kxdrproc_t) nfssvc_decode_readargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_readres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_readargs), + .pc_ressize = sizeof(struct nfsd_readres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + }, + [NFSPROC_WRITECACHE] = { + .pc_decode = (kxdrproc_t) nfssvc_decode_void, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFSPROC_WRITE] = { + .pc_func = (svc_procfunc) nfsd_proc_write, + .pc_decode = (kxdrproc_t) nfssvc_decode_writeargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_attrstat, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_writeargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_CREATE] = { + .pc_func = (svc_procfunc) nfsd_proc_create, + .pc_decode = (kxdrproc_t) nfssvc_decode_createargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_diropres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_createargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_REMOVE] = { + .pc_func = (svc_procfunc) nfsd_proc_remove, + .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_RENAME] = { + .pc_func = (svc_procfunc) nfsd_proc_rename, + .pc_decode = (kxdrproc_t) nfssvc_decode_renameargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_renameargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_LINK] = { + .pc_func = (svc_procfunc) nfsd_proc_link, + .pc_decode = (kxdrproc_t) nfssvc_decode_linkargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_linkargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_SYMLINK] = { + .pc_func = (svc_procfunc) nfsd_proc_symlink, + .pc_decode = (kxdrproc_t) nfssvc_decode_symlinkargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_symlinkargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_MKDIR] = { + .pc_func = (svc_procfunc) nfsd_proc_mkdir, + .pc_decode = (kxdrproc_t) nfssvc_decode_createargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_diropres, + .pc_release = (kxdrproc_t) nfssvc_release_fhandle, + .pc_argsize = sizeof(struct nfsd_createargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_RMDIR] = { + .pc_func = (svc_procfunc) nfsd_proc_rmdir, + .pc_decode = (kxdrproc_t) nfssvc_decode_diropargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_READDIR] = { + .pc_func = (svc_procfunc) nfsd_proc_readdir, + .pc_decode = (kxdrproc_t) 
nfssvc_decode_readdirargs, + .pc_encode = (kxdrproc_t) nfssvc_encode_readdirres, + .pc_argsize = sizeof(struct nfsd_readdirargs), + .pc_ressize = sizeof(struct nfsd_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFSPROC_STATFS] = { + .pc_func = (svc_procfunc) nfsd_proc_statfs, + .pc_decode = (kxdrproc_t) nfssvc_decode_fhandle, + .pc_encode = (kxdrproc_t) nfssvc_encode_statfsres, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_statfsres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+5, + }, +}; + + +struct svc_version nfsd_version2 = { + .vs_vers = 2, + .vs_nproc = 18, + .vs_proc = nfsd_procedures2, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS2_SVC_XDRSIZE, +}; + +/* + * Map errnos to NFS errnos. + */ +__be32 +nfserrno (int errno) +{ + static struct { + __be32 nfserr; + int syserr; + } nfs_errtbl[] = { + { nfs_ok, 0 }, + { nfserr_perm, -EPERM }, + { nfserr_noent, -ENOENT }, + { nfserr_io, -EIO }, + { nfserr_nxio, -ENXIO }, + { nfserr_acces, -EACCES }, + { nfserr_exist, -EEXIST }, + { nfserr_xdev, -EXDEV }, + { nfserr_mlink, -EMLINK }, + { nfserr_nodev, -ENODEV }, + { nfserr_notdir, -ENOTDIR }, + { nfserr_isdir, -EISDIR }, + { nfserr_inval, -EINVAL }, + { nfserr_fbig, -EFBIG }, + { nfserr_nospc, -ENOSPC }, + { nfserr_rofs, -EROFS }, + { nfserr_mlink, -EMLINK }, + { nfserr_nametoolong, -ENAMETOOLONG }, + { nfserr_notempty, -ENOTEMPTY }, +#ifdef EDQUOT + { nfserr_dquot, -EDQUOT }, +#endif + { nfserr_stale, -ESTALE }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, + { nfserr_io, -ETXTBSY }, + { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { + if (nfs_errtbl[i].syserr == errno) + return nfs_errtbl[i].nfserr; + } + printk (KERN_INFO "nfsd: non-standard errno: %d\n", errno); + return nfserr_io; +} + diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c new file mode 100644 index 00000000..18743c4d --- /dev/null +++ b/fs/nfsd/nfssvc.c @@ -0,0 +1,663 @@ +/* + * Central processing for nfsd. + * + * Authors: Olaf Kirch (okir@monad.swb.de) + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "nfsd.h" +#include "cache.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_SVC + +extern struct svc_program nfsd_program; +static int nfsd(void *vrqstp); +struct timeval nfssvc_boot; + +/* + * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members + * of the svc_serv struct. In particular, ->sv_nrthreads but also to some + * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * + * If (outside the lock) nfsd_serv is non-NULL, then it must point to a + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number + * of nfsd threads must exist and each must be listed in ->sp_all_threads in each + * entry of ->sv_pools[]. + * + * Transitions of the thread count between zero and non-zero are of particular + * interest since the svc_serv needs to be created and initialized at that + * point, or freed. + * + * Finally, the nfsd_mutex also protects some of the global variables that are + * accessed when nfsd starts and that are settable via the write_* routines in + * nfsctl.c. 
In particular: + * + * user_recovery_dirname + * user_lease_time + * nfsd_versions + */ +DEFINE_MUTEX(nfsd_mutex); +struct svc_serv *nfsd_serv; + +/* + * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. + * nfsd_drc_max_pages limits the total amount of memory available for + * version 4.1 DRC caches. + * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. + */ +spinlock_t nfsd_drc_lock; +unsigned int nfsd_drc_max_mem; +unsigned int nfsd_drc_mem_used; + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static struct svc_stat nfsd_acl_svcstats; +static struct svc_version * nfsd_acl_version[] = { + [2] = &nfsd_acl_version2, + [3] = &nfsd_acl_version3, +}; + +#define NFSD_ACL_MINVERS 2 +#define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) +static struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS]; + +static struct svc_program nfsd_acl_program = { + .pg_prog = NFS_ACL_PROGRAM, + .pg_nvers = NFSD_ACL_NRVERS, + .pg_vers = nfsd_acl_versions, + .pg_name = "nfsacl", + .pg_class = "nfsd", + .pg_stats = &nfsd_acl_svcstats, + .pg_authenticate = &svc_set_client, +}; + +static struct svc_stat nfsd_acl_svcstats = { + .program = &nfsd_acl_program, +}; +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ + +static struct svc_version * nfsd_version[] = { + [2] = &nfsd_version2, +#if defined(CONFIG_NFSD_V3) + [3] = &nfsd_version3, +#endif +#if defined(CONFIG_NFSD_V4) + [4] = &nfsd_version4, +#endif +}; + +#define NFSD_MINVERS 2 +#define NFSD_NRVERS ARRAY_SIZE(nfsd_version) +static struct svc_version *nfsd_versions[NFSD_NRVERS]; + +struct svc_program nfsd_program = { +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + .pg_next = &nfsd_acl_program, +#endif + .pg_prog = NFS_PROGRAM, /* program number */ + .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ + .pg_vers = nfsd_versions, /* version table */ + .pg_name = "nfsd", /* program name */ + .pg_class = "nfsd", /* authentication class */ + .pg_stats = &nfsd_svcstats, /* version table */ + .pg_authenticate = &svc_set_client, /* export authentication */ + +}; + +u32 nfsd_supported_minorversion; + +int nfsd_vers(int vers, enum vers_op change) +{ + if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) + return 0; + switch(change) { + case NFSD_SET: + nfsd_versions[vers] = nfsd_version[vers]; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_versions[vers] = nfsd_acl_version[vers]; +#endif + break; + case NFSD_CLEAR: + nfsd_versions[vers] = NULL; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_versions[vers] = NULL; +#endif + break; + case NFSD_TEST: + return nfsd_versions[vers] != NULL; + case NFSD_AVAIL: + return nfsd_version[vers] != NULL; + } + return 0; +} + +int nfsd_minorversion(u32 minorversion, enum vers_op change) +{ + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) + return -1; + switch(change) { + case NFSD_SET: + nfsd_supported_minorversion = minorversion; + break; + case NFSD_CLEAR: + if (minorversion == 0) + return -1; + nfsd_supported_minorversion = minorversion - 1; + break; + case NFSD_TEST: + return minorversion <= nfsd_supported_minorversion; + case NFSD_AVAIL: + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; + } + return 0; +} + +/* + * Maximum number of nfsd processes + */ +#define NFSD_MAXSERVS 8192 + +int nfsd_nrthreads(void) +{ + int rv = 0; + mutex_lock(&nfsd_mutex); + if (nfsd_serv) + rv = nfsd_serv->sv_nrthreads; + 
mutex_unlock(&nfsd_mutex); + return rv; +} + +static int nfsd_init_socks(int port) +{ + int error; + if (!list_empty(&nfsd_serv->sv_permsocks)) + return 0; + + error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + return 0; +} + +static bool nfsd_up = false; + +static int nfsd_startup(unsigned short port, int nrservs) +{ + int ret; + + if (nfsd_up) + return 0; + /* + * Readahead param cache - will no-op if it already exists. + * (Note therefore results will be suboptimal if number of + * threads is modified after nfsd start.) + */ + ret = nfsd_racache_init(2*nrservs); + if (ret) + return ret; + ret = nfsd_init_socks(port); + if (ret) + goto out_racache; + ret = lockd_up(); + if (ret) + goto out_racache; + ret = nfs4_state_start(); + if (ret) + goto out_lockd; + nfsd_up = true; + return 0; +out_lockd: + lockd_down(); +out_racache: + nfsd_racache_shutdown(); + return ret; +} + +static void nfsd_shutdown(void) +{ + /* + * write_ports can create the server without actually starting + * any threads--if we get shut down before any threads are + * started, then nfsd_last_thread will be run before any of this + * other initialization has been done. + */ + if (!nfsd_up) + return; + nfs4_state_shutdown(); + lockd_down(); + nfsd_racache_shutdown(); + nfsd_up = false; +} + +static void nfsd_last_thread(struct svc_serv *serv) +{ + /* When last nfsd thread exits we need to do some clean-up */ + nfsd_serv = NULL; + nfsd_shutdown(); + + printk(KERN_WARNING "nfsd: last server has exited, flushing export " + "cache\n"); + nfsd_export_flush(); +} + +void nfsd_reset_versions(void) +{ + int found_one = 0; + int i; + + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) { + if (nfsd_program.pg_vers[i]) + found_one = 1; + } + + if (!found_one) { + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) + nfsd_program.pg_vers[i] = nfsd_version[i]; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; +#endif + } +} + +/* + * Each session guarantees a negotiated per slot memory cache for replies + * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated + * NFSv4.1 server might want to use more memory for a DRC than a machine + * with mutiple services. + * + * Impose a hard limit on the number of pages for the DRC which varies + * according to the machines free pages. This is of course only a default. + * + * For now this is a #defined shift which could be under admin control + * in the future. + */ +static void set_max_drc(void) +{ + #define NFSD_DRC_SIZE_SHIFT 10 + nfsd_drc_max_mem = (nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; + nfsd_drc_mem_used = 0; + spin_lock_init(&nfsd_drc_lock); + dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); +} + +int nfsd_create_serv(void) +{ + int err = 0; + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + if (nfsd_serv) { + svc_get(nfsd_serv); + return 0; + } + if (nfsd_max_blksize == 0) { + /* choose a suitable default */ + struct sysinfo i; + si_meminfo(&i); + /* Aim for 1/4096 of memory per thread + * This gives 1MB on 4Gig machines + * But only uses 32K on 128M machines. + * Bottom out at 8K on 32M and smaller. + * Of course, this is only a default. 
+ */ + nfsd_max_blksize = NFSSVC_MAXBLKSIZE; + i.totalram <<= PAGE_SHIFT - 12; + while (nfsd_max_blksize > i.totalram && + nfsd_max_blksize >= 8*1024*2) + nfsd_max_blksize /= 2; + } + nfsd_reset_versions(); + + nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + nfsd_last_thread, nfsd, THIS_MODULE); + if (nfsd_serv == NULL) + return -ENOMEM; + + set_max_drc(); + do_gettimeofday(&nfssvc_boot); /* record boot time */ + return err; +} + +int nfsd_nrpools(void) +{ + if (nfsd_serv == NULL) + return 0; + else + return nfsd_serv->sv_nrpools; +} + +int nfsd_get_nrthreads(int n, int *nthreads) +{ + int i = 0; + + if (nfsd_serv != NULL) { + for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++) + nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads; + } + + return 0; +} + +int nfsd_set_nrthreads(int n, int *nthreads) +{ + int i = 0; + int tot = 0; + int err = 0; + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + + if (nfsd_serv == NULL || n <= 0) + return 0; + + if (n > nfsd_serv->sv_nrpools) + n = nfsd_serv->sv_nrpools; + + /* enforce a global maximum number of threads */ + tot = 0; + for (i = 0; i < n; i++) { + if (nthreads[i] > NFSD_MAXSERVS) + nthreads[i] = NFSD_MAXSERVS; + tot += nthreads[i]; + } + if (tot > NFSD_MAXSERVS) { + /* total too large: scale down requested numbers */ + for (i = 0; i < n && tot > 0; i++) { + int new = nthreads[i] * NFSD_MAXSERVS / tot; + tot -= (nthreads[i] - new); + nthreads[i] = new; + } + for (i = 0; i < n && tot > 0; i++) { + nthreads[i]--; + tot--; + } + } + + /* + * There must always be a thread in pool 0; the admin + * can't shut down NFS completely using pool_threads. + */ + if (nthreads[0] == 0) + nthreads[0] = 1; + + /* apply the new numbers */ + svc_get(nfsd_serv); + for (i = 0; i < n; i++) { + err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], + nthreads[i]); + if (err) + break; + } + svc_destroy(nfsd_serv); + + return err; +} + +/* + * Adjust the number of threads and return the new number of threads. + * This is also the function that starts the server if necessary, if + * this is the first time nrservs is nonzero. + */ +int +nfsd_svc(unsigned short port, int nrservs) +{ + int error; + bool nfsd_up_before; + + mutex_lock(&nfsd_mutex); + dprintk("nfsd: creating service\n"); + if (nrservs <= 0) + nrservs = 0; + if (nrservs > NFSD_MAXSERVS) + nrservs = NFSD_MAXSERVS; + error = 0; + if (nrservs == 0 && nfsd_serv == NULL) + goto out; + + error = nfsd_create_serv(); + if (error) + goto out; + + nfsd_up_before = nfsd_up; + + error = nfsd_startup(port, nrservs); + if (error) + goto out_destroy; + error = svc_set_num_threads(nfsd_serv, NULL, nrservs); + if (error) + goto out_shutdown; + /* We are holding a reference to nfsd_serv which + * we don't want to count in the return value, + * so subtract 1 + */ + error = nfsd_serv->sv_nrthreads - 1; +out_shutdown: + if (error < 0 && !nfsd_up_before) + nfsd_shutdown(); +out_destroy: + svc_destroy(nfsd_serv); /* Release server */ +out: + mutex_unlock(&nfsd_mutex); + return error; +} + + +/* + * This is the NFS server kernel thread + */ +static int +nfsd(void *vrqstp) +{ + struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; + int err, preverr = 0; + + /* Lock module and set up kernel thread */ + mutex_lock(&nfsd_mutex); + + /* At this point, the thread shares current->fs + * with the init process. We need to create files with a + * umask of 0 instead of init's umask. 
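[Aside, not part of the patch: to make the two sizing heuristics above concrete, with illustrative figures and 4 KiB pages. The shift i.totalram <<= PAGE_SHIFT - 12 re-expresses RAM in 4 KiB units, so halving nfsd_max_blksize (in bytes) until it no longer exceeds that number caps it at roughly 1/4096 of RAM, and the >= 8*1024*2 test stops the halving at 8 KiB.]

	128 MiB RAM -> totalram = 32768 (4 KiB units); 1 MiB halves down to 32 KiB
	4 GiB RAM   -> totalram = 1048576; 1 MiB already fits, so it stays at 1 MiB

	set_max_drc(), with NFSD_DRC_SIZE_SHIFT = 10:
	2 GiB of free buffer pages = 524288 pages
	(524288 >> 10) * 4096 = 512 * 4096 = 2 MiB available to the v4.1 DRC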
*/ + if (unshare_fs_struct() < 0) { + printk("Unable to start nfsd thread: out of memory\n"); + goto out; + } + + current->fs->umask = 0; + + /* + * thread is spawned with all signals set to SIG_IGN, re-enable + * the ones that will bring down the thread + */ + allow_signal(SIGKILL); + allow_signal(SIGHUP); + allow_signal(SIGINT); + allow_signal(SIGQUIT); + + nfsdstats.th_cnt++; + mutex_unlock(&nfsd_mutex); + + /* + * We want less throttling in balance_dirty_pages() so that nfs to + * localhost doesn't cause nfsd to lock up due to all the client's + * dirty pages. + */ + current->flags |= PF_LESS_THROTTLE; + set_freezable(); + + /* + * The main request loop + */ + for (;;) { + /* + * Find a socket with data available and call its + * recvfrom routine. + */ + while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) + ; + if (err == -EINTR) + break; + else if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, -err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; + } + + + /* Lock the export hash tables for reading. */ + exp_readlock(); + + validate_process_creds(); + svc_process(rqstp); + validate_process_creds(); + + /* Unlock export hash tables */ + exp_readunlock(); + } + + /* Clear signals before calling svc_exit_thread() */ + flush_signals(current); + + mutex_lock(&nfsd_mutex); + nfsdstats.th_cnt --; + +out: + /* Release the thread */ + svc_exit_thread(rqstp); + + /* Release module */ + mutex_unlock(&nfsd_mutex); + module_put_and_exit(0); + return 0; +} + +static __be32 map_new_errors(u32 vers, __be32 nfserr) +{ + if (nfserr == nfserr_jukebox && vers == 2) + return nfserr_dropit; + if (nfserr == nfserr_wrongsec && vers < 4) + return nfserr_acces; + return nfserr; +} + +int +nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) +{ + struct svc_procedure *proc; + kxdrproc_t xdr; + __be32 nfserr; + __be32 *nfserrp; + + dprintk("nfsd_dispatch: vers %d proc %d\n", + rqstp->rq_vers, rqstp->rq_proc); + proc = rqstp->rq_procinfo; + + /* Check whether we have this call in the cache. */ + switch (nfsd_cache_lookup(rqstp, proc->pc_cachetype)) { + case RC_INTR: + case RC_DROPIT: + return 0; + case RC_REPLY: + return 1; + case RC_DOIT:; + /* do it */ + } + + /* Decode arguments */ + xdr = proc->pc_decode; + if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base, + rqstp->rq_argp)) { + dprintk("nfsd: failed to decode arguments!\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + *statp = rpc_garbage_args; + return 1; + } + + /* need to grab the location to store the status, as + * nfsv4 does some encoding while processing + */ + nfserrp = rqstp->rq_res.head[0].iov_base + + rqstp->rq_res.head[0].iov_len; + rqstp->rq_res.head[0].iov_len += sizeof(__be32); + + /* Now call the procedure handler, and encode NFS status. */ + nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); + nfserr = map_new_errors(rqstp->rq_vers, nfserr); + if (nfserr == nfserr_dropit || rqstp->rq_dropme) { + dprintk("nfsd: Dropping request; may be revisited later\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + return 0; + } + + if (rqstp->rq_proc != 0) + *nfserrp++ = nfserr; + + /* Encode result. + * For NFSv2, additional info is never returned in case of an error. + */ + if (!(nfserr && rqstp->rq_vers == 2)) { + xdr = proc->pc_encode; + if (xdr && !xdr(rqstp, nfserrp, + rqstp->rq_resp)) { + /* Failed to encode result. 
Release cache entry */ + dprintk("nfsd: failed to encode result!\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + *statp = rpc_system_err; + return 1; + } + } + + /* Store reply in cache. */ + nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); + return 1; +} + +int nfsd_pool_stats_open(struct inode *inode, struct file *file) +{ + int ret; + mutex_lock(&nfsd_mutex); + if (nfsd_serv == NULL) { + mutex_unlock(&nfsd_mutex); + return -ENODEV; + } + /* bump up the psudo refcount while traversing */ + svc_get(nfsd_serv); + ret = svc_pool_stats_open(nfsd_serv, file); + mutex_unlock(&nfsd_mutex); + return ret; +} + +int nfsd_pool_stats_release(struct inode *inode, struct file *file) +{ + int ret = seq_release(inode, file); + mutex_lock(&nfsd_mutex); + /* this function really, really should have been called svc_put() */ + svc_destroy(nfsd_serv); + mutex_unlock(&nfsd_mutex); + return ret; +} diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c new file mode 100644 index 00000000..65ec595e --- /dev/null +++ b/fs/nfsd/nfsxdr.c @@ -0,0 +1,545 @@ +/* + * XDR support for nfsd + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include "xdr.h" +#include "auth.h" + +#define NFSDDBG_FACILITY NFSDDBG_XDR + +/* + * Mapping of S_IF* types to NFS file types + */ +static u32 nfs_ftypes[] = { + NFNON, NFCHR, NFCHR, NFBAD, + NFDIR, NFBAD, NFBLK, NFBAD, + NFREG, NFBAD, NFLNK, NFBAD, + NFSOCK, NFBAD, NFLNK, NFBAD, +}; + + +/* + * XDR functions for basic NFS types + */ +static __be32 * +decode_fh(__be32 *p, struct svc_fh *fhp) +{ + fh_init(fhp, NFS_FHSIZE); + memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE); + fhp->fh_handle.fh_size = NFS_FHSIZE; + + /* FIXME: Look up export pointer here and verify + * Sun Secure RPC if requested */ + return p + (NFS_FHSIZE >> 2); +} + +/* Helper function for NFSv2 ACL code */ +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + +static __be32 * +encode_fh(__be32 *p, struct svc_fh *fhp) +{ + memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE); + return p + (NFS_FHSIZE>> 2); +} + +/* + * Decode a file name and make sure that the path contains + * no slashes or null bytes. + */ +static __be32 * +decode_filename(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0' || *name == '/') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_pathname(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_sattr(__be32 *p, struct iattr *iap) +{ + u32 tmp, tmp1; + + iap->ia_valid = 0; + + /* Sun client bug compatibility check: some sun clients seem to + * put 0xffff in the mode field when they mean 0xffffffff. + * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah. 
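[Aside, not part of the patch: the nfs_ftypes[] table above is indexed by the top four mode bits, (mode & S_IFMT) >> 12, which is how encode_fattr() later turns an inode mode into an NFSv2 file type; note that FIFOs land on the NFCHR slot because NFSv2 has no FIFO type. A small stand-alone illustration; the strings are just labels, not kernel symbols.]

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	static const char *ftype[16] = {
		"NFNON",  "NFCHR", "NFCHR", "NFBAD",
		"NFDIR",  "NFBAD", "NFBLK", "NFBAD",
		"NFREG",  "NFBAD", "NFLNK", "NFBAD",
		"NFSOCK", "NFBAD", "NFLNK", "NFBAD",
	};
	mode_t modes[] = { S_IFREG, S_IFDIR, S_IFLNK, S_IFIFO };
	size_t i;

	for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++)
		printf("%#o -> %s\n", (unsigned int)modes[i],
		       ftype[(modes[i] & S_IFMT) >> 12]);
	/* 0100000 -> NFREG, 040000 -> NFDIR, 0120000 -> NFLNK, 010000 -> NFCHR */
	return 0;
}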
+ */ + if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_UID; + iap->ia_uid = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_GID; + iap->ia_gid = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_SIZE; + iap->ia_size = tmp; + } + tmp = ntohl(*p++); tmp1 = ntohl(*p++); + if (tmp != (u32)-1 && tmp1 != (u32)-1) { + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + iap->ia_atime.tv_sec = tmp; + iap->ia_atime.tv_nsec = tmp1 * 1000; + } + tmp = ntohl(*p++); tmp1 = ntohl(*p++); + if (tmp != (u32)-1 && tmp1 != (u32)-1) { + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + iap->ia_mtime.tv_sec = tmp; + iap->ia_mtime.tv_nsec = tmp1 * 1000; + /* + * Passing the invalid value useconds=1000000 for mtime + * is a Sun convention for "set both mtime and atime to + * current server time". It's needed to make permissions + * checks for the "touch" program across v2 mounts to + * Solaris and Irix boxes work correctly. See description of + * sattr in section 6.1 of "NFS Illustrated" by + * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 + */ + if (tmp1 == 1000000) + iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET); + } + return p; +} + +static __be32 * +encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, + struct kstat *stat) +{ + struct dentry *dentry = fhp->fh_dentry; + int type; + struct timespec time; + u32 f; + + type = (stat->mode & S_IFMT); + + *p++ = htonl(nfs_ftypes[type >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); + *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { + *p++ = htonl(NFS_MAXPATHLEN); + } else { + *p++ = htonl((u32) stat->size); + } + *p++ = htonl((u32) stat->blksize); + if (S_ISCHR(type) || S_ISBLK(type)) + *p++ = htonl(new_encode_dev(stat->rdev)); + else + *p++ = htonl(0xffffffff); + *p++ = htonl((u32) stat->blocks); + switch (fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: + *p++ = htonl(new_encode_dev(stat->dev)); + break; + case FSIDSOURCE_FSID: + *p++ = htonl((u32) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u32*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[1]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[2]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[3]; + *p++ = htonl(f); + break; + } + *p++ = htonl((u32) stat->ino); + *p++ = htonl((u32) stat->atime.tv_sec); + *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); + lease_get_mtime(dentry->d_inode, &time); + *p++ = htonl((u32) time.tv_sec); + *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); + *p++ = htonl((u32) stat->ctime.tv_sec); + *p++ = htonl(stat->ctime.tv_nsec ? 
stat->ctime.tv_nsec / 1000 : 0); + + return p; +} + +/* Helper function for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct kstat stat; + vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &stat); + return encode_fattr(rqstp, p, fhp, &stat); +} + +/* + * XDR decode functions + */ +int +nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_sattrargs *args) +{ + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = decode_sattr(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_diropargs *args) +{ + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readargs *args) +{ + unsigned int len; + int v,pn; + if (!(p = decode_fh(p, &args->fh))) + return 0; + + args->offset = ntohl(*p++); + len = args->count = ntohl(*p++); + p++; /* totalcount - unused */ + + if (len > NFSSVC_MAXBLKSIZE_V2) + len = NFSSVC_MAXBLKSIZE_V2; + + /* set up somewhere to store response. + * We take pages, put them on reslist and include in iovec + */ + v=0; + while (len > 0) { + pn = rqstp->rq_resused++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; + len -= rqstp->rq_vec[v].iov_len; + v++; + } + args->vlen = v; + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_writeargs *args) +{ + unsigned int len, hdr, dlen; + int v; + + if (!(p = decode_fh(p, &args->fh))) + return 0; + + p++; /* beginoffset */ + args->offset = ntohl(*p++); /* offset */ + p++; /* totalcount */ + len = args->len = ntohl(*p++); + /* + * The protocol specifies a maximum of 8192 bytes. + */ + if (len > NFSSVC_MAXBLKSIZE_V2) + return 0; + + /* + * Check to make sure that we got the right number of + * bytes. + */ + hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; + dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + - hdr; + + /* + * Round the length of the data which was specified up to + * the next multiple of XDR units and then compare that + * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. 
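[Aside, not part of the patch: a quick worked example for the READ argument decoding above, assuming 4 KiB pages. The requested count is clamped to NFSSVC_MAXBLKSIZE_V2 and then spread across whole response pages, one rq_vec entry per page, so a 6000-byte read ends up as:]

	count = 6000 (already <= NFSSVC_MAXBLKSIZE_V2, so not clamped)
	rq_vec[0].iov_len = 4096	/* first response page */
	rq_vec[1].iov_len = 1904	/* remainder on a second page */
	args->vlen = 2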
+ */ + if (dlen < XDR_QUADLEN(len)*4) + return 0; + + rqstp->rq_vec[0].iov_base = (void*)p; + rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + v = 0; + while (len > rqstp->rq_vec[v].iov_len) { + len -= rqstp->rq_vec[v].iov_len; + v++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); + rqstp->rq_vec[v].iov_len = PAGE_SIZE; + } + rqstp->rq_vec[v].iov_len = len; + args->vlen = v + 1; + return 1; +} + +int +nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_createargs *args) +{ + if ( !(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + p = decode_sattr(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_renameargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_linkargs *args) +{ + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_symlinkargs *args) +{ + if ( !(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_pathname(p, &args->tname, &args->tlen))) + return 0; + p = decode_sattr(p, &args->attrs); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readdirargs *args) +{ + if (!(p = decode_fh(p, &args->fh))) + return 0; + args->cookie = ntohl(*p++); + args->count = ntohl(*p++); + if (args->count > PAGE_SIZE) + args->count = PAGE_SIZE; + + args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ +int +nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) +{ + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_diropres *resp) +{ + p = encode_fh(p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readlinkres *resp) +{ + *p++ = htonl(resp->len); + xdr_ressize_check(rqstp, p); + rqstp->rq_res.page_len = resp->len; + if (resp->len & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); + } + return 1; +} + +int +nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readres *resp) +{ + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + *p++ = htonl(resp->count); + xdr_ressize_check(rqstp, p); + + /* now update 
rqstp->rq_res to reflect data as well */ + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); + } + return 1; +} + +int +nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_readdirres *resp) +{ + xdr_ressize_check(rqstp, p); + p = resp->buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl((resp->common.err == nfserr_eof)); + rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1; + + return 1; +} + +int +nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_statfsres *resp) +{ + struct kstatfs *stat = &resp->stats; + + *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */ + *p++ = htonl(stat->f_bsize); + *p++ = htonl(stat->f_blocks); + *p++ = htonl(stat->f_bfree); + *p++ = htonl(stat->f_bavail); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_entry(void *ccdv, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = ccdv; + struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common); + __be32 *p = cd->buffer; + int buflen, slen; + + /* + dprintk("nfsd: entry(%.*s off %ld ino %ld)\n", + namlen, name, offset, ino); + */ + + if (offset > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } + if (cd->offset) + *cd->offset = htonl(offset); + if (namlen > NFS2_MAXNAMLEN) + namlen = NFS2_MAXNAMLEN;/* truncate filename */ + + slen = XDR_QUADLEN(namlen); + if ((buflen = cd->buflen - slen - 4) < 0) { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + if (ino > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } + *p++ = xdr_one; /* mark entry present */ + *p++ = htonl((u32) ino); /* file id */ + p = xdr_encode_array(p, name, namlen);/* name length & name */ + cd->offset = p; /* remember pointer */ + *p++ = htonl(~0U); /* offset of next entry */ + + cd->buflen = buflen; + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; +} + +/* + * XDR release functions + */ +int +nfssvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_fhandle *resp) +{ + fh_put(&resp->fh); + return 1; +} diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h new file mode 100644 index 00000000..858c7bae --- /dev/null +++ b/fs/nfsd/state.h @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _NFSD4_STATE_H +#define _NFSD4_STATE_H + +#include +#include +#include "nfsfh.h" + +typedef struct { + u32 cl_boot; + u32 cl_id; +} clientid_t; + +typedef struct { + u32 so_boot; + u32 so_stateownerid; + u32 so_fileid; +} stateid_opaque_t; + +typedef struct { + u32 si_generation; + stateid_opaque_t si_opaque; +} stateid_t; +#define si_boot si_opaque.so_boot +#define si_stateownerid si_opaque.so_stateownerid +#define si_fileid si_opaque.so_fileid + +#define STATEID_FMT "(%08x/%08x/%08x/%08x)" +#define STATEID_VAL(s) \ + (s)->si_boot, \ + (s)->si_stateownerid, \ + (s)->si_fileid, \ + (s)->si_generation + +struct nfsd4_callback { + void *cb_op; + struct nfs4_client *cb_clp; + struct list_head cb_per_client; + u32 cb_minorversion; + struct rpc_message cb_msg; + const struct rpc_call_ops *cb_ops; + struct work_struct cb_work; + bool cb_done; +}; + +struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; + struct list_head dl_recall_lru; /* delegation recalled */ + atomic_t dl_count; /* ref count */ + struct nfs4_client *dl_client; + struct nfs4_file *dl_file; + u32 dl_type; + time_t dl_time; +/* For recall: */ + stateid_t dl_stateid; + struct knfsd_fh dl_fh; + int dl_retries; + struct nfsd4_callback dl_recall; +}; + +/* client delegation callback info */ +struct nfs4_cb_conn { + /* SETCLIENTID info */ + struct sockaddr_storage cb_addr; + struct sockaddr_storage cb_saddr; + size_t cb_addrlen; + u32 cb_prog; /* used only in 4.0 case; + per-session otherwise */ + u32 cb_ident; /* minorversion 0 only */ + struct svc_xprt *cb_xprt; /* minorversion 1 only */ +}; + +/* Maximum number of slots per session. 
160 is useful for long haul TCP */ +#define NFSD_MAX_SLOTS_PER_SESSION 160 +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 16 +/* Maximum session per slot cache size */ +#define NFSD_SLOT_CACHE_SIZE 1024 +/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ +#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 +#define NFSD_MAX_MEM_PER_SESSION \ + (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) + +struct nfsd4_slot { + bool sl_inuse; + bool sl_cachethis; + u16 sl_opcnt; + u32 sl_seqid; + __be32 sl_status; + u32 sl_datalen; + char sl_data[]; +}; + +struct nfsd4_channel_attrs { + u32 headerpadsz; + u32 maxreq_sz; + u32 maxresp_sz; + u32 maxresp_cached; + u32 maxops; + u32 maxreqs; + u32 nr_rdma_attrs; + u32 rdma_attrs; +}; + +struct nfsd4_create_session { + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + u32 uid; + u32 gid; +}; + +struct nfsd4_bind_conn_to_session { + struct nfs4_sessionid sessionid; + u32 dir; +}; + +/* The single slot clientid cache structure */ +struct nfsd4_clid_slot { + u32 sl_seqid; + __be32 sl_status; + struct nfsd4_create_session sl_cr_ses; +}; + +struct nfsd4_conn { + struct list_head cn_persession; + struct svc_xprt *cn_xprt; + struct svc_xpt_user cn_xpt_user; + struct nfsd4_session *cn_session; +/* CDFC4_FORE, CDFC4_BACK: */ + unsigned char cn_flags; +}; + +struct nfsd4_session { + struct kref se_ref; + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; + u32 se_flags; + struct nfs4_client *se_client; + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; + struct list_head se_conns; + u32 se_cb_prog; + u32 se_cb_seq_nr; + struct nfsd4_slot *se_slots[]; /* forward channel slots */ +}; + +static inline void +nfsd4_put_session(struct nfsd4_session *ses) +{ + extern void free_session(struct kref *kref); + kref_put(&ses->se_ref, free_session); +} + +static inline void +nfsd4_get_session(struct nfsd4_session *ses) +{ + kref_get(&ses->se_ref); +} + +/* formatted contents of nfs4_sessionid */ +struct nfsd4_sessionid { + clientid_t clientid; + u32 sequence; + u32 reserved; +}; + +#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ + +/* + * struct nfs4_client - one per client. Clientids live here. + * o Each nfs4_client is hashed by clientid. + * + * o Each nfs4_clients is also hashed by name + * (the opaque quantity initially sent by the client to identify itself). 
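[Aside, not part of the patch, just arithmetic on the constants above: each session slot may cache at most NFSD_SLOT_CACHE_SIZE bytes, and at most NFSD_CACHE_SIZE_SLOTS_PER_SESSION slots per session count against the reply-cache budget, so

	NFSD_MAX_MEM_PER_SESSION = 32 * 1024 = 32 KiB per session

Measured against the nfsd_drc_max_mem limit set up in nfssvc.c (about 1/1024 of free buffer memory, roughly 2 MiB in the earlier example), that budget admits on the order of 64 such worst-case sessions.]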
+ * + * o cl_perclient list is used to ensure no dangling stateowner references + * when we expire the nfs4_client + */ +struct nfs4_client { + struct list_head cl_idhash; /* hash by cl_clientid.id */ + struct list_head cl_strhash; /* hash by cl_name */ + struct list_head cl_openowners; + struct list_head cl_delegations; + struct list_head cl_lru; /* tail queue */ + struct xdr_netobj cl_name; /* id generated by client */ + char cl_recdir[HEXDIR_LEN]; /* recovery dir */ + nfs4_verifier cl_verifier; /* generated by client */ + time_t cl_time; /* time of last lease renewal */ + struct sockaddr_storage cl_addr; /* client ipaddress */ + u32 cl_flavor; /* setclientid pseudoflavor */ + char *cl_principal; /* setclientid principal name */ + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ + u32 cl_firststate; /* recovery dir creation */ + u32 cl_minorversion; + + /* for v4.0 and v4.1 callbacks: */ + struct nfs4_cb_conn cl_cb_conn; +#define NFSD4_CLIENT_CB_UPDATE 1 +#define NFSD4_CLIENT_KILL 2 + unsigned long cl_cb_flags; + struct rpc_clnt *cl_cb_client; + u32 cl_cb_ident; +#define NFSD4_CB_UP 0 +#define NFSD4_CB_UNKNOWN 1 +#define NFSD4_CB_DOWN 2 + int cl_cb_state; + struct nfsd4_callback cl_cb_null; + struct nfsd4_session *cl_cb_session; + struct list_head cl_callbacks; /* list of in-progress callbacks */ + + /* for all client information that callback code might need: */ + spinlock_t cl_lock; + + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + /* number of rpc's in progress over an associated session: */ + atomic_t cl_refcount; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ +}; + +static inline void +mark_client_expired(struct nfs4_client *clp) +{ + clp->cl_time = 0; +} + +static inline bool +is_client_expired(struct nfs4_client *clp) +{ + return clp->cl_time == 0; +} + +/* struct nfs4_client_reset + * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state + * from non-volitile storage) upon reboot. + */ +struct nfs4_client_reclaim { + struct list_head cr_strhash; /* hash by cr_name */ + char cr_recdir[HEXDIR_LEN]; /* recover dir */ +}; + +static inline void +update_stateid(stateid_t *stateid) +{ + stateid->si_generation++; +} + +/* A reasonable value for REPLAY_ISIZE was estimated as follows: + * The OPEN response, typically the largest, requires + * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + + * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) + + * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes + */ + +#define NFSD4_REPLAY_ISIZE 112 + +/* + * Replay buffer, where the result of the last seqid-mutating operation + * is cached. 
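[Aside, not part of the patch: the 112-byte estimate a few lines up checks out; summing the OPEN reply pieces listed in the comment gives

	4 + 8 + 20 + 4 + 8 + 4 + 8 + 4 + 20 + ~32 = 112 bytes

which is why rp_ibuf in the replay structure below is sized NFSD4_REPLAY_ISIZE.]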
+ */ +struct nfs4_replay { + __be32 rp_status; + unsigned int rp_buflen; + char *rp_buf; + unsigned intrp_allocated; + struct knfsd_fh rp_openfh; + char rp_ibuf[NFSD4_REPLAY_ISIZE]; +}; + +/* +* nfs4_stateowner can either be an open_owner, or a lock_owner +* +* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] +* for lock_owner +* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] +* for lock_owner +* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client +* struct is reaped. +* so_perfilestate: heads the list of nfs4_stateid (either open or lock) +* and is used to ensure no dangling nfs4_stateid references when we +* release a stateowner. +* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when +* close is called to reap associated byte-range locks +* so_close_lru: (open) stateowner is placed on this list instead of being +* reaped (when so_perfilestate is empty) to hold the last close replay. +* reaped by laundramat thread after lease period. +*/ +struct nfs4_stateowner { + struct kref so_ref; + struct list_head so_idhash; /* hash by so_id */ + struct list_head so_strhash; /* hash by op_name */ + struct list_head so_perclient; + struct list_head so_stateids; + struct list_head so_perstateid; /* for lockowners only */ + struct list_head so_close_lru; /* tail queue */ + time_t so_time; /* time of placement on so_close_lru */ + int so_is_open_owner; /* 1=openowner,0=lockowner */ + u32 so_id; + struct nfs4_client * so_client; + /* after increment in ENCODE_SEQID_OP_TAIL, represents the next + * sequence id expected from the client: */ + u32 so_seqid; + struct xdr_netobj so_owner; /* open owner name */ + int so_confirmed; /* successful OPEN_CONFIRM? */ + struct nfs4_replay so_replay; +}; + +/* +* nfs4_file: a file opened by some number of (open) nfs4_stateowners. +* o fi_perfile list is used to search for conflicting +* share_acces, share_deny on the file. +*/ +struct nfs4_file { + atomic_t fi_ref; + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; + /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ + struct file * fi_fds[3]; + /* + * Each open or lock stateid contributes 1 to either + * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending + * on open or lock mode: + */ + atomic_t fi_access[2]; + struct file *fi_deleg_file; + struct file_lock *fi_lease; + atomic_t fi_delegees; + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; +}; + +/* XXX: for first cut may fall back on returning file that doesn't work + * at all? */ +static inline struct file *find_writeable_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + return f->fi_fds[O_RDWR]; +} + +static inline struct file *find_readable_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDONLY]) + return f->fi_fds[O_RDONLY]; + return f->fi_fds[O_RDWR]; +} + +static inline struct file *find_any_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + else if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + else + return f->fi_fds[O_RDONLY]; +} + +/* +* nfs4_stateid can either be an open stateid or (eventually) a lock stateid +* +* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file +* +* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry +* st_perfile: file_hashtbl[] entry. 
+* st_perfile_state: nfs4_stateowner->so_perfilestate +* st_perlockowner: (open stateid) list of lock nfs4_stateowners +* st_access_bmap: used only for open stateid +* st_deny_bmap: used only for open stateid +* st_openstp: open stateid lock stateid was derived from +* +* XXX: open stateids and lock stateids have diverged sufficiently that +* we should consider defining separate structs for the two cases. +*/ + +struct nfs4_stateid { + struct list_head st_hash; + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; + unsigned long st_access_bmap; + unsigned long st_deny_bmap; + struct nfs4_stateid * st_openstp; +}; + +/* flags for preprocess_seqid_op() */ +#define HAS_SESSION 0x00000001 +#define CONFIRM 0x00000002 +#define OPEN_STATE 0x00000004 +#define LOCK_STATE 0x00000008 +#define RD_STATE 0x00000010 +#define WR_STATE 0x00000020 +#define CLOSE_STATE 0x00000040 + +struct nfsd4_compound_state; + +extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filp); +extern void nfs4_lock_state(void); +extern void nfs4_unlock_state(void); +extern int nfs4_in_grace(void); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid); +extern void nfs4_free_stateowner(struct kref *kref); +extern int set_callback_cred(void); +extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); +extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd4_do_callback_rpc(struct work_struct *); +extern void nfsd4_cb_recall(struct nfs4_delegation *dp); +extern int nfsd4_create_callback_queue(void); +extern void nfsd4_destroy_callback_queue(void); +extern void nfsd4_shutdown_callback(struct nfs4_client *); +extern void nfs4_put_delegation(struct nfs4_delegation *dp); +extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); +extern void nfsd4_init_recdir(char *recdir_name); +extern int nfsd4_recdir_load(void); +extern void nfsd4_shutdown_recdir(void); +extern int nfs4_client_to_reclaim(const char *name); +extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); +extern void nfsd4_recdir_purge_old(void); +extern int nfsd4_create_clid_dir(struct nfs4_client *clp); +extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); +extern void release_session_client(struct nfsd4_session *); + +static inline void +nfs4_put_stateowner(struct nfs4_stateowner *so) +{ + kref_put(&so->so_ref, nfs4_free_stateowner); +} + +static inline void +nfs4_get_stateowner(struct nfs4_stateowner *so) +{ + kref_get(&so->so_ref); +} + +#endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c new file mode 100644 index 00000000..a2e2402b --- /dev/null +++ b/fs/nfsd/stats.c @@ -0,0 +1,104 @@ +/* + * procfs-based user access to knfsd statistics + * + * /proc/net/rpc/nfsd + * + * Format: + * rc + * Statistsics for the reply cache + * fh + * statistics for filehandle lookup + * io + * statistics for IO throughput + * th <10%-20%> <20%-30%> ... <90%-100%> <100%> + * time (seconds) when nfsd thread usage above thresholds + * and number of times that all threads were in use + * ra cache-size <10% <20% <30% ... <100% not-found + * number of times that read-ahead entry was found that deep in + * the cache. 
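[Aside, not part of the patch: the stateowner helpers declared above are kref-based through so_ref, so a hypothetical user that has looked one up pins it for the duration of the operation and drops it afterwards; nfs4_free_stateowner() only runs when the last reference goes away. Sketch only, lookup_stateowner() is a made-up name:]

	struct nfs4_stateowner *sop = lookup_stateowner(clid);	/* hypothetical lookup */

	nfs4_get_stateowner(sop);	/* hold it while we touch so_seqid / so_replay */
	/* ... process the seqid-mutating operation ... */
	nfs4_put_stateowner(sop);	/* last put ends in nfs4_free_stateowner() */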
+ * plus generic RPC stats (see net/sunrpc/stats.c) + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + */ + +#include +#include +#include +#include + +#include "nfsd.h" + +struct nfsd_stats nfsdstats; +struct svc_stat nfsd_svcstats = { + .program = &nfsd_program, +}; + +static int nfsd_proc_show(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n", + nfsdstats.rchits, + nfsdstats.rcmisses, + nfsdstats.rcnocache, + nfsdstats.fh_stale, + nfsdstats.fh_lookup, + nfsdstats.fh_anon, + nfsdstats.fh_nocache_dir, + nfsdstats.fh_nocache_nondir, + nfsdstats.io_read, + nfsdstats.io_write); + /* thread usage: */ + seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt); + for (i=0; i<10; i++) { + unsigned int jifs = nfsdstats.th_usage[i]; + unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ; + seq_printf(seq, " %u.%03u", sec, msec); + } + + /* newline and ra-cache */ + seq_printf(seq, "\nra %u", nfsdstats.ra_size); + for (i=0; i<11; i++) + seq_printf(seq, " %u", nfsdstats.ra_depth[i]); + seq_putc(seq, '\n'); + + /* show my rpc info */ + svc_seq_show(seq, &nfsd_svcstats); + +#ifdef CONFIG_NFSD_V4 + /* Show count for individual nfsv4 operations */ + /* Writing operation numbers 0 1 2 also for maintaining uniformity */ + seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); + for (i = 0; i <= LAST_NFS4_OP; i++) + seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + + seq_putc(seq, '\n'); +#endif + + return 0; +} + +static int nfsd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_proc_show, NULL); +} + +static const struct file_operations nfsd_proc_fops = { + .owner = THIS_MODULE, + .open = nfsd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void +nfsd_stat_init(void) +{ + svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops); +} + +void +nfsd_stat_shutdown(void) +{ + svc_proc_unregister("nfsd"); +} diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c new file mode 100644 index 00000000..acf88aea --- /dev/null +++ b/fs/nfsd/vfs.c @@ -0,0 +1,2273 @@ +/* + * File operations used by nfsd. Some of these have been ripped from + * other parts of the kernel because they weren't exported, others + * are partial duplicates with added or changed functionality. + * + * Note that several functions dget() the dentry upon which they want + * to act, most notably those that create directory entries. Response + * dentry's are dput()'d if necessary in the release callback. + * So if you notice code paths that apparently fail to dput() the + * dentry, don't worry--they have been taken care of. + * + * Copyright (C) 1995-1999 Olaf Kirch + * Zerocpy NFS support (C) 2002 Hirokazu Takahashi + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_NFSD_V3 +#include "xdr3.h" +#endif /* CONFIG_NFSD_V3 */ + +#ifdef CONFIG_NFSD_V4 +#include "acl.h" +#include "idmap.h" +#endif /* CONFIG_NFSD_V4 */ + +#include "nfsd.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_FILEOP + + +/* + * This is a cache of readahead params that help us choose the proper + * readahead strategy. Initially, we set all readahead parameters to 0 + * and let the VFS handle things. + * If you increase the number of cached files very much, you'll need to + * add a hash table here. 
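[Aside, not part of the patch: the th histogram is printed by nfsd_proc_show() above as seconds with millisecond precision, one value per th_usage[] bucket of accumulated jiffies. A stand-alone illustration of that integer conversion; hz = 250 is only an example, the real HZ is a kernel config value:]

#include <stdio.h>

int main(void)
{
	unsigned int hz = 250;		/* illustrative; kernel HZ is config-dependent */
	unsigned int jifs = 1234;	/* jiffies accumulated in one histogram bucket */
	unsigned int sec = jifs / hz;
	unsigned int msec = (jifs % hz) * 1000 / hz;

	printf("%u.%03u\n", sec, msec);	/* prints 4.936 */
	return 0;
}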
+ */ +struct raparms { + struct raparms *p_next; + unsigned int p_count; + ino_t p_ino; + dev_t p_dev; + int p_set; + struct file_ra_state p_ra; + unsigned int p_hindex; +}; + +struct raparm_hbucket { + struct raparms *pb_head; + spinlock_t pb_lock; +} ____cacheline_aligned_in_smp; + +#define RAPARM_HASH_BITS 4 +#define RAPARM_HASH_SIZE (1<ex_path.mnt), + .dentry = dget(dentry)}; + int err = 0; + + err = follow_down(&path); + if (err < 0) + goto out; + + exp2 = rqst_exp_get_by_name(rqstp, &path); + if (IS_ERR(exp2)) { + err = PTR_ERR(exp2); + /* + * We normally allow NFS clients to continue + * "underneath" a mountpoint that is not exported. + * The exception is V4ROOT, where no traversal is ever + * allowed without an explicit export of the new + * directory. + */ + if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) + err = 0; + path_put(&path); + goto out; + } + if (nfsd_v4client(rqstp) || + (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { + /* successfully crossed mount point */ + /* + * This is subtle: path.dentry is *not* on path.mnt + * at this point. The only reason we are safe is that + * original mnt is pinned down by exp, so we should + * put path *before* putting exp + */ + *dpp = path.dentry; + path.dentry = dentry; + *expp = exp2; + exp2 = exp; + } + path_put(&path); + exp_put(exp2); +out: + return err; +} + +static void follow_to_parent(struct path *path) +{ + struct dentry *dp; + + while (path->dentry == path->mnt->mnt_root && follow_up(path)) + ; + dp = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = dp; +} + +static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp) +{ + struct svc_export *exp2; + struct path path = {.mnt = mntget((*exp)->ex_path.mnt), + .dentry = dget(dparent)}; + + follow_to_parent(&path); + + exp2 = rqst_exp_parent(rqstp, &path); + if (PTR_ERR(exp2) == -ENOENT) { + *dentryp = dget(dparent); + } else if (IS_ERR(exp2)) { + path_put(&path); + return PTR_ERR(exp2); + } else { + *dentryp = dget(path.dentry); + exp_put(*exp); + *exp = exp2; + } + path_put(&path); + return 0; +} + +/* + * For nfsd purposes, we treat V4ROOT exports as though there was an + * export at *every* directory. + */ +int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) +{ + if (d_mountpoint(dentry)) + return 1; + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return 0; + return dentry->d_inode != NULL; +} + +__be32 +nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, + const char *name, unsigned int len, + struct svc_export **exp_ret, struct dentry **dentry_ret) +{ + struct svc_export *exp; + struct dentry *dparent; + struct dentry *dentry; + int host_err; + + dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); + + dparent = fhp->fh_dentry; + exp = fhp->fh_export; + exp_get(exp); + + /* Lookup the name, but don't follow links */ + if (isdotent(name, len)) { + if (len==1) + dentry = dget(dparent); + else if (dparent != exp->ex_path.dentry) + dentry = dget_parent(dparent); + else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp)) + dentry = dget(dparent); /* .. == . 
just like at / */ + else { + /* checking mountpoint crossing is very different when stepping up */ + host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry); + if (host_err) + goto out_nfserr; + } + } else { + fh_lock(fhp); + dentry = lookup_one_len(name, dparent, len); + host_err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_nfserr; + /* + * check if we have crossed a mount point ... + */ + if (nfsd_mountpoint(dentry, exp)) { + if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { + dput(dentry); + goto out_nfserr; + } + } + } + *dentry_ret = dentry; + *exp_ret = exp; + return 0; + +out_nfserr: + exp_put(exp); + return nfserrno(host_err); +} + +/* + * Look up one component of a pathname. + * N.B. After this call _both_ fhp and resfh need an fh_put + * + * If the lookup would cross a mountpoint, and the mounted filesystem + * is exported to the client with NFSEXP_NOHIDE, then the lookup is + * accepted as it stands and the mounted directory is + * returned. Otherwise the covered directory is returned. + * NOTE: this mountpoint crossing is not supported properly by all + * clients and is explicitly disallowed for NFSv3 + * NeilBrown + */ +__be32 +nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, + unsigned int len, struct svc_fh *resfh) +{ + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; + err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); + if (err) + return err; + err = check_nfsd_access(exp, rqstp); + if (err) + goto out; + /* + * Note: we compose the file handle now, but as the + * dentry may be negative, it may need to be updated. + */ + err = fh_compose(resfh, exp, dentry, fhp); + if (!err && !dentry->d_inode) + err = nfserr_noent; +out: + dput(dentry); + exp_put(exp); + return err; +} + +static int nfsd_break_lease(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + return break_lease(inode, O_WRONLY | O_NONBLOCK); +} + +/* + * Commit metadata changes to stable storage. + */ +static int +commit_metadata(struct svc_fh *fhp) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + const struct export_operations *export_ops = inode->i_sb->s_export_op; + + if (!EX_ISSYNC(fhp->fh_export)) + return 0; + + if (export_ops->commit_metadata) + return export_ops->commit_metadata(inode); + return sync_inode_metadata(inode, 1); +} + +/* + * Set various file attributes. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, + int check_guard, time_t guardtime) +{ + struct dentry *dentry; + struct inode *inode; + int accmode = NFSD_MAY_SATTR; + int ftype = 0; + __be32 err; + int host_err; + int size_change = 0; + + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; + if (iap->ia_valid & ATTR_SIZE) + ftype = S_IFREG; + + /* Get inode */ + err = fh_verify(rqstp, fhp, ftype, accmode); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + /* Ignore any mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + + if (!iap->ia_valid) + goto out; + + /* + * NFSv2 does not differentiate between "set-[ac]time-to-now" + * which only requires access, and "set-[ac]time-to-X" which + * requires ownership. 
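[Aside, not part of the patch: nfsd_break_lease() above ties back into the error table in nfsproc.c. The lease is broken with O_NONBLOCK, so while a conflicting lease or delegation is still being recalled the call typically fails with -EWOULDBLOCK, which nfserrno() maps to nfserr_jukebox (and map_new_errors() turns into a dropped request for v2 clients), so the client simply retries later. The usual caller pattern is a sketch like:]

	host_err = nfsd_break_lease(inode);
	if (host_err)				/* e.g. -EWOULDBLOCK during lease recall */
		return nfserrno(host_err);	/* -> nfserr_jukebox; client retries */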
+ * So if it looks like it might be "set both to the same time which + * is close to now", and if inode_change_ok fails, then we + * convert to "set to now" instead of "set to explicit time" + * + * We only call inode_change_ok as the last test as technically + * it is not an interface that we should be using. It is only + * valid if the filesystem does not define it's own i_op->setattr. + */ +#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) +#define MAX_TOUCH_TIME_ERROR (30*60) + if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && + iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { + /* + * Looks probable. + * + * Now just make sure time is in the right ballpark. + * Solaris, at least, doesn't seem to care what the time + * request is. We require it be within 30 minutes of now. + */ + time_t delta = iap->ia_atime.tv_sec - get_seconds(); + if (delta < 0) + delta = -delta; + if (delta < MAX_TOUCH_TIME_ERROR && + inode_change_ok(inode, iap) != 0) { + /* + * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. + * This will cause notify_change to set these times + * to "now" + */ + iap->ia_valid &= ~BOTH_TIME_SET; + } + } + + /* + * The size case is special. + * It changes the file as well as the attributes. + */ + if (iap->ia_valid & ATTR_SIZE) { + if (iap->ia_size < inode->i_size) { + err = nfsd_permission(rqstp, fhp->fh_export, dentry, + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; + } + + host_err = get_write_access(inode); + if (host_err) + goto out_nfserr; + + size_change = 1; + host_err = locks_verify_truncate(inode, NULL, iap->ia_size); + if (host_err) { + put_write_access(inode); + goto out_nfserr; + } + } + + /* sanitize the mode change */ + if (iap->ia_valid & ATTR_MODE) { + iap->ia_mode &= S_IALLUGO; + iap->ia_mode |= (inode->i_mode & ~S_IALLUGO); + } + + /* Revoke setuid/setgid on chown */ + if (!S_ISDIR(inode->i_mode) && + (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || + ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { + iap->ia_valid |= ATTR_KILL_PRIV; + if (iap->ia_valid & ATTR_MODE) { + /* we're setting mode too, just clear the s*id bits */ + iap->ia_mode &= ~S_ISUID; + if (iap->ia_mode & S_IXGRP) + iap->ia_mode &= ~S_ISGID; + } else { + /* set ATTR_KILL_* bits and let VFS handle it */ + iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); + } + } + + /* Change the attributes. 
*/ + + iap->ia_valid |= ATTR_CTIME; + + err = nfserr_notsync; + if (!check_guard || guardtime == inode->i_ctime.tv_sec) { + host_err = nfsd_break_lease(inode); + if (host_err) + goto out_nfserr; + fh_lock(fhp); + + host_err = notify_change(dentry, iap); + err = nfserrno(host_err); + fh_unlock(fhp); + } + if (size_change) + put_write_access(inode); + if (!err) + commit_metadata(fhp); +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +#if defined(CONFIG_NFSD_V2_ACL) || \ + defined(CONFIG_NFSD_V3_ACL) || \ + defined(CONFIG_NFSD_V4) +static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) +{ + ssize_t buflen; + ssize_t ret; + + buflen = vfs_getxattr(dentry, key, NULL, 0); + if (buflen <= 0) + return buflen; + + *buf = kmalloc(buflen, GFP_KERNEL); + if (!*buf) + return -ENOMEM; + + ret = vfs_getxattr(dentry, key, *buf, buflen); + if (ret < 0) + kfree(*buf); + return ret; +} +#endif + +#if defined(CONFIG_NFSD_V4) +static int +set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) +{ + int len; + size_t buflen; + char *buf = NULL; + int error = 0; + + buflen = posix_acl_xattr_size(pacl->a_count); + buf = kmalloc(buflen, GFP_KERNEL); + error = -ENOMEM; + if (buf == NULL) + goto out; + + len = posix_acl_to_xattr(pacl, buf, buflen); + if (len < 0) { + error = len; + goto out; + } + + error = vfs_setxattr(dentry, key, buf, len, 0); +out: + kfree(buf); + return error; +} + +__be32 +nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl) +{ + __be32 error; + int host_error; + struct dentry *dentry; + struct inode *inode; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + /* Get inode */ + error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + if (S_ISDIR(inode->i_mode)) + flags = NFS4_ACL_DIR; + + host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); + if (host_error == -EINVAL) { + return nfserr_attrnotsupp; + } else if (host_error < 0) + goto out_nfserr; + + host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); + if (host_error < 0) + goto out_release; + + if (S_ISDIR(inode->i_mode)) + host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); + +out_release: + posix_acl_release(pacl); + posix_acl_release(dpacl); +out_nfserr: + if (host_error == -EOPNOTSUPP) + return nfserr_attrnotsupp; + else + return nfserrno(host_error); +} + +static struct posix_acl * +_get_posix_acl(struct dentry *dentry, char *key) +{ + void *buf = NULL; + struct posix_acl *pacl = NULL; + int buflen; + + buflen = nfsd_getxattr(dentry, key, &buf); + if (!buflen) + buflen = -ENODATA; + if (buflen <= 0) + return ERR_PTR(buflen); + + pacl = posix_acl_from_xattr(buf, buflen); + kfree(buf); + return pacl; +} + +int +nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); + if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) + pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(pacl)) { + error = PTR_ERR(pacl); + pacl = NULL; + goto out; + } + + if (S_ISDIR(inode->i_mode)) { + dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); + if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) + dpacl = NULL; + else if (IS_ERR(dpacl)) { + error = 
PTR_ERR(dpacl); + dpacl = NULL; + goto out; + } + flags = NFS4_ACL_DIR; + } + + *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); + if (IS_ERR(*acl)) { + error = PTR_ERR(*acl); + *acl = NULL; + } + out: + posix_acl_release(pacl); + posix_acl_release(dpacl); + return error; +} + +#endif /* defined(CONFIG_NFSD_V4) */ + +#ifdef CONFIG_NFSD_V3 +/* + * Check server access rights to a file system object + */ +struct accessmap { + u32 access; + int how; +}; +static struct accessmap nfs3_regaccess[] = { + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, + + { 0, 0 } +}; + +static struct accessmap nfs3_diraccess[] = { + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC}, + { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, + { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, + + { 0, 0 } +}; + +static struct accessmap nfs3_anyaccess[] = { + /* Some clients - Solaris 2.6 at least, make an access call + * to the server to check for access for things like /dev/null + * (which really, the server doesn't care about). So + * We provide simple access checking for them, looking + * mainly at mode bits, and we make sure to ignore read-only + * filesystem checks + */ + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + + { 0, 0 } +}; + +__be32 +nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported) +{ + struct accessmap *map; + struct svc_export *export; + struct dentry *dentry; + u32 query, result = 0, sresult = 0; + __be32 error; + + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + if (error) + goto out; + + export = fhp->fh_export; + dentry = fhp->fh_dentry; + + if (S_ISREG(dentry->d_inode->i_mode)) + map = nfs3_regaccess; + else if (S_ISDIR(dentry->d_inode->i_mode)) + map = nfs3_diraccess; + else + map = nfs3_anyaccess; + + + query = *access; + for (; map->access; map++) { + if (map->access & query) { + __be32 err2; + + sresult |= map->access; + + err2 = nfsd_permission(rqstp, export, dentry, map->how); + switch (err2) { + case nfs_ok: + result |= map->access; + break; + + /* the following error codes just mean the access was not allowed, + * rather than an error occurred */ + case nfserr_rofs: + case nfserr_acces: + case nfserr_perm: + /* simply don't "or" in the access bit. */ + break; + default: + error = err2; + goto out; + } + } + } + *access = result; + if (supported) + *supported = sresult; + + out: + return error; +} +#endif /* CONFIG_NFSD_V3 */ + +static int nfsd_open_break_lease(struct inode *inode, int access) +{ + unsigned int mode; + + if (access & NFSD_MAY_NOT_BREAK_LEASE) + return 0; + mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; + return break_lease(inode, mode | O_NONBLOCK); +} + +/* + * Open an existing file or directory. + * The access argument indicates the type of open (read/write/lock) + * N.B. 
After this call fhp needs an fh_put + */ +__be32 +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, + int access, struct file **filp) +{ + struct dentry *dentry; + struct inode *inode; + int flags = O_RDONLY|O_LARGEFILE; + __be32 err; + int host_err = 0; + + validate_process_creds(); + + /* + * If we get here, then the client has already done an "open", + * and (hopefully) checked permission - so allow OWNER_OVERRIDE + * in case a chmod has now revoked permission. + */ + err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + /* Disallow write access to files with the append-only bit set + * or any access when mandatory locking enabled + */ + err = nfserr_perm; + if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE)) + goto out; + /* + * We must ignore files (but only files) which might have mandatory + * locks on them because there is no way to know if the accesser has + * the lock. + */ + if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) + goto out; + + if (!inode->i_fop) + goto out; + + host_err = nfsd_open_break_lease(inode, access); + if (host_err) /* NOMEM or WOULDBLOCK */ + goto out_nfserr; + + if (access & NFSD_MAY_WRITE) { + if (access & NFSD_MAY_READ) + flags = O_RDWR|O_LARGEFILE; + else + flags = O_WRONLY|O_LARGEFILE; + } + *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), + flags, current_cred()); + if (IS_ERR(*filp)) + host_err = PTR_ERR(*filp); + else + host_err = ima_file_check(*filp, access); +out_nfserr: + err = nfserrno(host_err); +out: + validate_process_creds(); + return err; +} + +/* + * Close a file. + */ +void +nfsd_close(struct file *filp) +{ + fput(filp); +} + +/* + * Obtain the readahead parameters for the file + * specified by (dev, ino). + */ + +static inline struct raparms * +nfsd_get_raparms(dev_t dev, ino_t ino) +{ + struct raparms *ra, **rap, **frap = NULL; + int depth = 0; + unsigned int hash; + struct raparm_hbucket *rab; + + hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; + rab = &raparm_hash[hash]; + + spin_lock(&rab->pb_lock); + for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { + if (ra->p_ino == ino && ra->p_dev == dev) + goto found; + depth++; + if (ra->p_count == 0) + frap = rap; + } + depth = nfsdstats.ra_size; + if (!frap) { + spin_unlock(&rab->pb_lock); + return NULL; + } + rap = frap; + ra = *frap; + ra->p_dev = dev; + ra->p_ino = ino; + ra->p_set = 0; + ra->p_hindex = hash; +found: + if (rap != &rab->pb_head) { + *rap = ra->p_next; + ra->p_next = rab->pb_head; + rab->pb_head = ra; + } + ra->p_count++; + nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; + spin_unlock(&rab->pb_lock); + return ra; +} + +/* + * Grab and keep cached pages associated with a file in the svc_rqst + * so that they can be passed to the network sendmsg/sendpage routines + * directly. They will be released after the sending has completed. 
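+ *
+ * Each pipe buffer handed to nfsd_splice_actor() below carries one page
+ * cache page; the page is pinned with get_page() and parked in
+ * rqstp->rq_respages, so the reply can point straight at the page cache
+ * instead of copying the file data into the reply buffer.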
+ */ +static int +nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + struct svc_rqst *rqstp = sd->u.data; + struct page **pp = rqstp->rq_respages + rqstp->rq_resused; + struct page *page = buf->page; + size_t size; + + size = sd->len; + + if (rqstp->rq_res.page_len == 0) { + get_page(page); + put_page(*pp); + *pp = page; + rqstp->rq_resused++; + rqstp->rq_res.page_base = buf->offset; + rqstp->rq_res.page_len = size; + } else if (page != pp[-1]) { + get_page(page); + if (*pp) + put_page(*pp); + *pp = page; + rqstp->rq_resused++; + rqstp->rq_res.page_len += size; + } else + rqstp->rq_res.page_len += size; + + return size; +} + +static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + return __splice_from_pipe(pipe, sd, nfsd_splice_actor); +} + +static __be32 +nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + mm_segment_t oldfs; + __be32 err; + int host_err; + + err = nfserr_perm; + + if (file->f_op->splice_read && rqstp->rq_splice_ok) { + struct splice_desc sd = { + .len = 0, + .total_len = *count, + .pos = offset, + .u.data = rqstp, + }; + + rqstp->rq_resused = 1; + host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + } else { + oldfs = get_fs(); + set_fs(KERNEL_DS); + host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + } + + if (host_err >= 0) { + nfsdstats.io_read += host_err; + *count = host_err; + err = 0; + fsnotify_access(file); + } else + err = nfserrno(host_err); + return err; +} + +static void kill_suid(struct dentry *dentry) +{ + struct iattr ia; + ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + + mutex_lock(&dentry->d_inode->i_mutex); + notify_change(dentry, &ia); + mutex_unlock(&dentry->d_inode->i_mutex); +} + +/* + * Gathered writes: If another process is currently writing to the file, + * there's a high chance this is another nfsd (triggered by a bulk write + * from a client's biod). Rather than syncing the file with each write + * request, we sleep for 10 msec. + * + * I don't know if this roughly approximates C. Juszak's idea of + * gathered writes, but it's a nice and simple solution (IMHO), and it + * seems to work:-) + * + * Note: we do this only in the NFSv2 case, since v3 and higher have a + * better tool (separate unstable writes and commits) for solving this + * problem. 
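+ *
+ * Concretely: if another writer has the file open, or this write hits
+ * the same inode as the previous one, the thread sleeps for 10 msec so
+ * neighbouring writes can accumulate, and the inode is then flushed
+ * with vfs_fsync() only if it is still dirty.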
+ */ +static int wait_for_concurrent_writes(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + static ino_t last_ino; + static dev_t last_dev; + int err = 0; + + if (atomic_read(&inode->i_writecount) > 1 + || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { + dprintk("nfsd: write defer %d\n", task_pid_nr(current)); + msleep(10); + dprintk("nfsd: write resume %d\n", task_pid_nr(current)); + } + + if (inode->i_state & I_DIRTY) { + dprintk("nfsd: write sync %d\n", task_pid_nr(current)); + err = vfs_fsync(file, 0); + } + last_ino = inode->i_ino; + last_dev = inode->i_sb->s_dev; + return err; +} + +static __be32 +nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, + unsigned long *cnt, int *stablep) +{ + struct svc_export *exp; + struct dentry *dentry; + struct inode *inode; + mm_segment_t oldfs; + __be32 err = 0; + int host_err; + int stable = *stablep; + int use_wgather; + + dentry = file->f_path.dentry; + inode = dentry->d_inode; + exp = fhp->fh_export; + + /* + * Request sync writes if + * - the sync export option has been set, or + * - the client requested O_SYNC behavior (NFSv3 feature). + * - The file system doesn't support fsync(). + * When NFSv2 gathered writes have been configured for this volume, + * flushing the data to disk is handled separately below. + */ + use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); + + if (!file->f_op->fsync) {/* COMMIT3 cannot work */ + stable = 2; + *stablep = 2; /* FILE_SYNC */ + } + + if (!EX_ISSYNC(exp)) + stable = 0; + if (stable && !use_wgather) { + spin_lock(&file->f_lock); + file->f_flags |= O_SYNC; + spin_unlock(&file->f_lock); + } + + /* Write the data. */ + oldfs = get_fs(); set_fs(KERNEL_DS); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + if (host_err < 0) + goto out_nfserr; + *cnt = host_err; + nfsdstats.io_write += host_err; + fsnotify_modify(file); + + /* clear setuid/setgid flag after write */ + if (inode->i_mode & (S_ISUID | S_ISGID)) + kill_suid(dentry); + + if (stable && use_wgather) + host_err = wait_for_concurrent_writes(file); + +out_nfserr: + dprintk("nfsd: write complete host_err=%d\n", host_err); + if (host_err >= 0) + err = 0; + else + err = nfserrno(host_err); + return err; +} + +/* + * Read data from a file. count must contain the requested read count + * on entry. On return, *count contains the number of bytes actually read. + * N.B. After this call fhp needs an fh_put + */ +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + struct file *file; + struct inode *inode; + struct raparms *ra; + __be32 err; + + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (err) + return err; + + inode = file->f_path.dentry->d_inode; + + /* Get readahead parameters */ + ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); + + if (ra && ra->p_set) + file->f_ra = ra->p_ra; + + err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); + + /* Write back readahead params */ + if (ra) { + struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + spin_lock(&rab->pb_lock); + ra->p_ra = file->f_ra; + ra->p_set = 1; + ra->p_count--; + spin_unlock(&rab->pb_lock); + } + + nfsd_close(file); + return err; +} + +/* As above, but use the provided file descriptor. 
*/ +__be32 +nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, + unsigned long *count) +{ + __be32 err; + + if (file) { + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; + err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); + } else /* Note file may still be NULL in NFSv4 special stateid case: */ + err = nfsd_read(rqstp, fhp, offset, vec, vlen, count); +out: + return err; +} + +/* + * Write data to a file. + * The stable flag requests synchronous writes. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, + int *stablep) +{ + __be32 err = 0; + + if (file) { + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; + err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, + stablep); + } else { + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + if (err) + goto out; + + if (cnt) + err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, + cnt, stablep); + nfsd_close(file); + } +out: + return err; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Commit all pending writes to stable storage. + * + * Note: we only guarantee that data that lies within the range specified + * by the 'offset' and 'count' parameters will be synced. + * + * Unfortunately we cannot lock the file to make sure we return full WCC + * data to the client, as locking happens lower down in the filesystem. + */ +__be32 +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, unsigned long count) +{ + struct file *file; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; + + if (offset < 0) + goto out; + if (count != 0) { + end = offset + (loff_t)count - 1; + if (end < offset) + goto out; + } + + err = nfsd_open(rqstp, fhp, S_IFREG, + NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); + if (err) + goto out; + if (EX_ISSYNC(fhp->fh_export)) { + int err2 = vfs_fsync_range(file, offset, end, 0); + + if (err2 != -EINVAL) + err = nfserrno(err2); + else + err = nfserr_notsupp; + } + + nfsd_close(file); +out: + return err; +} +#endif /* CONFIG_NFSD_V3 */ + +static __be32 +nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, + struct iattr *iap) +{ + /* + * Mode has already been set earlier in create: + */ + iap->ia_valid &= ~ATTR_MODE; + /* + * Setting uid/gid works only for root. Irix appears to + * send along the gid on create when it tries to implement + * setgid directories via NFS: + */ + if (current_fsuid() != 0) + iap->ia_valid &= ~(ATTR_UID|ATTR_GID); + if (iap->ia_valid) + return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); + return 0; +} + +/* HPUX client sometimes creates a file in mode 000, and sets size to 0. + * setting size to 0 may fail for some specific file systems by the permission + * checking which requires WRITE permission but the mode is 000. + * we ignore the resizing(to 0) on the just new created file, since the size is + * 0 after file created. + * + * call this only after vfs_create() is called. + * */ +static void +nfsd_check_ignore_resizing(struct iattr *iap) +{ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; +} + +/* + * Create a file (regular, directory, device, fifo); UNIX sockets + * not yet implemented. 
+ * If the response fh has been verified, the parent directory should + * already be locked. Note that the parent directory is left locked. + * + * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp + */ +__be32 +nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + int type, dev_t rdev, struct svc_fh *resfhp) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + __be32 err2; + int host_err; + + err = nfserr_perm; + if (!flen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + + err = nfserr_notdir; + if (!dirp->i_op->lookup) + goto out; + /* + * Check whether the response file handle has been verified yet. + * If it has, the parent directory should already be locked. + */ + if (!resfhp->fh_dentry) { + /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ + fh_lock_nested(fhp, I_MUTEX_PARENT); + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; + } else { + /* called from nfsd_proc_create */ + dchild = dget(resfhp->fh_dentry); + if (!fhp->fh_locked) { + /* not actually possible */ + printk(KERN_ERR + "nfsd_create: parent %s/%s not locked!\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + err = nfserr_io; + goto out; + } + } + /* + * Make sure the child dentry is still negative ... + */ + err = nfserr_exist; + if (dchild->d_inode) { + dprintk("nfsd_create: dentry %s/%s not negative!\n", + dentry->d_name.name, dchild->d_name.name); + goto out; + } + + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + + err = nfserr_inval; + if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) { + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + goto out; + } + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + + /* + * Get the dir op function pointer. + */ + err = 0; + switch (type) { + case S_IFREG: + host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + if (!host_err) + nfsd_check_ignore_resizing(iap); + break; + case S_IFDIR: + host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); + break; + } + if (host_err < 0) { + mnt_drop_write(fhp->fh_export->ex_path.mnt); + goto out_nfserr; + } + + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_setattr already committed the child. Transactional filesystems + * had a chance to commit changes for both parent and child + * simultaneously making the following commit_metadata a noop. + */ + err2 = nfserrno(commit_metadata(fhp)); + if (err2) + err = err2; + mnt_drop_write(fhp->fh_export->ex_path.mnt); + /* + * Update the file handle to get the new inode info. 
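+ * (The filehandle was composed against a dentry that may still have
+ * been negative at the time, so refresh it now that the object exists.)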
+ */ + if (!err) + err = fh_update(resfhp); +out: + if (dchild && !IS_ERR(dchild)) + dput(dchild); + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +#ifdef CONFIG_NFSD_V3 + +static inline int nfsd_create_is_exclusive(int createmode) +{ + return createmode == NFS3_CREATE_EXCLUSIVE + || createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + +/* + * NFSv3 and NFSv4 version of nfsd_create + */ +__be32 +do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + struct svc_fh *resfhp, int createmode, u32 *verifier, + int *truncp, int *created) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + int host_err; + __u32 v_mtime=0, v_atime=0; + + err = nfserr_perm; + if (!flen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + goto out; + + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + + /* Get all the sanity checks out of the way before + * we lock the parent. */ + err = nfserr_notdir; + if (!dirp->i_op->lookup) + goto out; + fh_lock_nested(fhp, I_MUTEX_PARENT); + + /* + * Compose the response file handle. + */ + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; + + /* If file doesn't exist, check for permissions to create one */ + if (!dchild->d_inode) { + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + } + + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; + + if (nfsd_create_is_exclusive(createmode)) { + /* solaris7 gets confused (bugid 4218508) if these have + * the high bit set, so just clear the high bits. If this is + * ever changed to use different attrs for storing the + * verifier, then do_open_lookup() will also need to be fixed + * accordingly. + */ + v_mtime = verifier[0]&0x7fffffff; + v_atime = verifier[1]&0x7fffffff; + } + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + if (dchild->d_inode) { + err = 0; + + switch (createmode) { + case NFS3_CREATE_UNCHECKED: + if (! S_ISREG(dchild->d_inode->i_mode)) + err = nfserr_exist; + else if (truncp) { + /* in nfsv4, we need to treat this case a little + * differently. we don't want to truncate the + * file now; this would be wrong if the OPEN + * fails for some other reason. furthermore, + * if the size is nonzero, we should ignore it + * according to spec! 
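+ * (So only a request to truncate to zero is remembered in *truncp here;
+ * the actual truncation is left until the OPEN as a whole succeeds.)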
+ */ + *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; + } + else { + iap->ia_valid &= ATTR_SIZE; + goto set_attr; + } + break; + case NFS3_CREATE_EXCLUSIVE: + if ( dchild->d_inode->i_mtime.tv_sec == v_mtime + && dchild->d_inode->i_atime.tv_sec == v_atime + && dchild->d_inode->i_size == 0 ) + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if ( dchild->d_inode->i_mtime.tv_sec == v_mtime + && dchild->d_inode->i_atime.tv_sec == v_atime + && dchild->d_inode->i_size == 0 ) + goto set_attr; + /* fallthru */ + case NFS3_CREATE_GUARDED: + err = nfserr_exist; + } + mnt_drop_write(fhp->fh_export->ex_path.mnt); + goto out; + } + + host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + if (host_err < 0) { + mnt_drop_write(fhp->fh_export->ex_path.mnt); + goto out_nfserr; + } + if (created) + *created = 1; + + nfsd_check_ignore_resizing(iap); + + if (nfsd_create_is_exclusive(createmode)) { + /* Cram the verifier into atime/mtime */ + iap->ia_valid = ATTR_MTIME|ATTR_ATIME + | ATTR_MTIME_SET|ATTR_ATIME_SET; + /* XXX someone who knows this better please fix it for nsec */ + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + + set_attr: + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_setattr already committed the child (and possibly also the parent). + */ + if (!err) + err = nfserrno(commit_metadata(fhp)); + + mnt_drop_write(fhp->fh_export->ex_path.mnt); + /* + * Update the filehandle to get the new inode info. + */ + if (!err) + err = fh_update(resfhp); + + out: + fh_unlock(fhp); + if (dchild && !IS_ERR(dchild)) + dput(dchild); + return err; + + out_nfserr: + err = nfserrno(host_err); + goto out; +} +#endif /* CONFIG_NFSD_V3 */ + +/* + * Read a symlink. On entry, *lenp must contain the maximum path length that + * fits into the buffer. On return, it contains the true length. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) +{ + struct dentry *dentry; + struct inode *inode; + mm_segment_t oldfs; + __be32 err; + int host_err; + + err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + err = nfserr_inval; + if (!inode->i_op->readlink) + goto out; + + touch_atime(fhp->fh_export->ex_path.mnt, dentry); + /* N.B. Why does this call need a get_fs()?? + * Remove the set_fs and watch the fireworks:-) --okir + */ + + oldfs = get_fs(); set_fs(KERNEL_DS); + host_err = inode->i_op->readlink(dentry, buf, *lenp); + set_fs(oldfs); + + if (host_err < 0) + goto out_nfserr; + *lenp = host_err; + err = 0; +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +/* + * Create a symlink and look up its inode + * N.B. 
After this call _both_ fhp and resfhp need an fh_put + */ +__be32 +nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, + char *path, int plen, + struct svc_fh *resfhp, + struct iattr *iap) +{ + struct dentry *dentry, *dnew; + __be32 err, cerr; + int host_err; + + err = nfserr_noent; + if (!flen || !plen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + fh_lock(fhp); + dentry = fhp->fh_dentry; + dnew = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + + if (unlikely(path[plen] != 0)) { + char *path_alloced = kmalloc(plen+1, GFP_KERNEL); + if (path_alloced == NULL) + host_err = -ENOMEM; + else { + strncpy(path_alloced, path, plen); + path_alloced[plen] = 0; + host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced); + kfree(path_alloced); + } + } else + host_err = vfs_symlink(dentry->d_inode, dnew, path); + err = nfserrno(host_err); + if (!err) + err = nfserrno(commit_metadata(fhp)); + fh_unlock(fhp); + + mnt_drop_write(fhp->fh_export->ex_path.mnt); + + cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); + dput(dnew); + if (err==0) err = cerr; +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +/* + * Create a hardlink + * N.B. After this call _both_ ffhp and tfhp need an fh_put + */ +__be32 +nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, + char *name, int len, struct svc_fh *tfhp) +{ + struct dentry *ddir, *dnew, *dold; + struct inode *dirp; + __be32 err; + int host_err; + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); + if (err) + goto out; + + err = nfserr_perm; + if (!len) + goto out; + err = nfserr_exist; + if (isdotent(name, len)) + goto out; + + fh_lock_nested(ffhp, I_MUTEX_PARENT); + ddir = ffhp->fh_dentry; + dirp = ddir->d_inode; + + dnew = lookup_one_len(name, ddir, len); + host_err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + dold = tfhp->fh_dentry; + + host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt); + if (host_err) { + err = nfserrno(host_err); + goto out_dput; + } + err = nfserr_noent; + if (!dold->d_inode) + goto out_drop_write; + host_err = nfsd_break_lease(dold->d_inode); + if (host_err) { + err = nfserrno(host_err); + goto out_drop_write; + } + host_err = vfs_link(dold, dirp, dnew); + if (!host_err) { + err = nfserrno(commit_metadata(ffhp)); + if (!err) + err = nfserrno(commit_metadata(tfhp)); + } else { + if (host_err == -EXDEV && rqstp->rq_vers == 2) + err = nfserr_acces; + else + err = nfserrno(host_err); + } +out_drop_write: + mnt_drop_write(tfhp->fh_export->ex_path.mnt); +out_dput: + dput(dnew); +out_unlock: + fh_unlock(ffhp); +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out_unlock; +} + +/* + * Rename a file + * N.B. 
After this call _both_ ffhp and tfhp need an fh_put + */ +__be32 +nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, + struct svc_fh *tfhp, char *tname, int tlen) +{ + struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; + struct inode *fdir, *tdir; + __be32 err; + int host_err; + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + + fdentry = ffhp->fh_dentry; + fdir = fdentry->d_inode; + + tdentry = tfhp->fh_dentry; + tdir = tdentry->d_inode; + + err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; + if (ffhp->fh_export != tfhp->fh_export) + goto out; + + err = nfserr_perm; + if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) + goto out; + + /* cannot use fh_lock as we need deadlock protective ordering + * so do it by hand */ + trap = lock_rename(tdentry, fdentry); + ffhp->fh_locked = tfhp->fh_locked = 1; + fill_pre_wcc(ffhp); + fill_pre_wcc(tfhp); + + odentry = lookup_one_len(fname, fdentry, flen); + host_err = PTR_ERR(odentry); + if (IS_ERR(odentry)) + goto out_nfserr; + + host_err = -ENOENT; + if (!odentry->d_inode) + goto out_dput_old; + host_err = -EINVAL; + if (odentry == trap) + goto out_dput_old; + + ndentry = lookup_one_len(tname, tdentry, tlen); + host_err = PTR_ERR(ndentry); + if (IS_ERR(ndentry)) + goto out_dput_old; + host_err = -ENOTEMPTY; + if (ndentry == trap) + goto out_dput_new; + + host_err = -EXDEV; + if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) + goto out_dput_new; + host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt); + if (host_err) + goto out_dput_new; + + host_err = nfsd_break_lease(odentry->d_inode); + if (host_err) + goto out_drop_write; + if (ndentry->d_inode) { + host_err = nfsd_break_lease(ndentry->d_inode); + if (host_err) + goto out_drop_write; + } + host_err = vfs_rename(fdir, odentry, tdir, ndentry); + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) + host_err = commit_metadata(ffhp); + } +out_drop_write: + mnt_drop_write(ffhp->fh_export->ex_path.mnt); + out_dput_new: + dput(ndentry); + out_dput_old: + dput(odentry); + out_nfserr: + err = nfserrno(host_err); + + /* we cannot rely on fh_unlock on the two filehandles, + * as that would do the wrong thing if the two directories + * were the same, so again we do it by hand + */ + fill_post_wcc(ffhp); + fill_post_wcc(tfhp); + unlock_rename(tdentry, fdentry); + ffhp->fh_locked = tfhp->fh_locked = 0; + +out: + return err; +} + +/* + * Unlink a file or directory + * N.B.
After this call fhp needs an fh_put + */ +__be32 +nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, + char *fname, int flen) +{ + struct dentry *dentry, *rdentry; + struct inode *dirp; + __be32 err; + int host_err; + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) + goto out; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) + goto out; + + fh_lock_nested(fhp, I_MUTEX_PARENT); + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + + rdentry = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(rdentry); + if (IS_ERR(rdentry)) + goto out_nfserr; + + if (!rdentry->d_inode) { + dput(rdentry); + err = nfserr_noent; + goto out; + } + + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_put; + + host_err = nfsd_break_lease(rdentry->d_inode); + if (host_err) + goto out_drop_write; + if (type != S_IFDIR) + host_err = vfs_unlink(dirp, rdentry); + else + host_err = vfs_rmdir(dirp, rdentry); + if (!host_err) + host_err = commit_metadata(fhp); +out_drop_write: + mnt_drop_write(fhp->fh_export->ex_path.mnt); +out_put: + dput(rdentry); + +out_nfserr: + err = nfserrno(host_err); +out: + return err; +} + +/* + * We do this buffering because we must not call back into the file + * system's ->lookup() method from the filldir callback. That may well + * deadlock a number of file systems. + * + * This is based heavily on the implementation of same in XFS. + */ +struct buffered_dirent { + u64 ino; + loff_t offset; + int namlen; + unsigned int d_type; + char name[]; +}; + +struct readdir_data { + char *dirent; + size_t used; + int full; +}; + +static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_data *buf = __buf; + struct buffered_dirent *de = (void *)(buf->dirent + buf->used); + unsigned int reclen; + + reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); + if (buf->used + reclen > PAGE_SIZE) { + buf->full = 1; + return -EINVAL; + } + + de->namlen = namlen; + de->offset = offset; + de->ino = ino; + de->d_type = d_type; + memcpy(de->name, name, namlen); + buf->used += reclen; + + return 0; +} + +static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, + struct readdir_cd *cdp, loff_t *offsetp) +{ + struct readdir_data buf; + struct buffered_dirent *de; + int host_err; + int size; + loff_t offset; + + buf.dirent = (void *)__get_free_page(GFP_KERNEL); + if (!buf.dirent) + return nfserrno(-ENOMEM); + + offset = *offsetp; + + while (1) { + struct inode *dir_inode = file->f_path.dentry->d_inode; + unsigned int reclen; + + cdp->err = nfserr_eof; /* will be cleared on successful read */ + buf.used = 0; + buf.full = 0; + + host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf); + if (buf.full) + host_err = 0; + + if (host_err < 0) + break; + + size = buf.used; + + if (!size) + break; + + /* + * Various filldir functions may end up calling back into + * lookup_one_len() and the file system's ->lookup() method. + * These expect i_mutex to be held, as it would within readdir. 
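+ * The directory itself has already been read into our private buffer
+ * by vfs_readdir() above; i_mutex is taken here only so that the nfsd
+ * callback (e.g. READDIRPLUS encoding) may safely call lookup_one_len()
+ * while we replay the buffered entries to it.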
+ */ + host_err = mutex_lock_killable(&dir_inode->i_mutex); + if (host_err) + break; + + de = (struct buffered_dirent *)buf.dirent; + while (size > 0) { + offset = de->offset; + + if (func(cdp, de->name, de->namlen, de->offset, + de->ino, de->d_type)) + break; + + if (cdp->err != nfs_ok) + break; + + reclen = ALIGN(sizeof(*de) + de->namlen, + sizeof(u64)); + size -= reclen; + de = (struct buffered_dirent *)((char *)de + reclen); + } + mutex_unlock(&dir_inode->i_mutex); + if (size > 0) /* We bailed out early */ + break; + + offset = vfs_llseek(file, 0, SEEK_CUR); + } + + free_page((unsigned long)(buf.dirent)); + + if (host_err) + return nfserrno(host_err); + + *offsetp = offset; + return cdp->err; +} + +/* + * Read entries from a directory. + * The NFSv3/4 verifier we ignore for now. + */ +__be32 +nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, + struct readdir_cd *cdp, filldir_t func) +{ + __be32 err; + struct file *file; + loff_t offset = *offsetp; + + err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file); + if (err) + goto out; + + offset = vfs_llseek(file, offset, 0); + if (offset < 0) { + err = nfserrno((int)offset); + goto out_close; + } + + err = nfsd_buffered_readdir(file, func, cdp, offsetp); + + if (err == nfserr_eof || err == nfserr_toosmall) + err = nfs_ok; /* can still be found in ->err */ +out_close: + nfsd_close(file); +out: + return err; +} + +/* + * Get file system stats + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) +{ + __be32 err; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); + if (!err) { + struct path path = { + .mnt = fhp->fh_export->ex_path.mnt, + .dentry = fhp->fh_dentry, + }; + if (vfs_statfs(&path, stat)) + err = nfserr_io; + } + return err; +} + +static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp) +{ + return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY; +} + +/* + * Check for a user's access permissions to this inode. + */ +__be32 +nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, + struct dentry *dentry, int acc) +{ + struct inode *inode = dentry->d_inode; + int err; + + if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) + return 0; +#if 0 + dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", + acc, + (acc & NFSD_MAY_READ)? " read" : "", + (acc & NFSD_MAY_WRITE)? " write" : "", + (acc & NFSD_MAY_EXEC)? " exec" : "", + (acc & NFSD_MAY_SATTR)? " sattr" : "", + (acc & NFSD_MAY_TRUNC)? " trunc" : "", + (acc & NFSD_MAY_LOCK)? " lock" : "", + (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", + inode->i_mode, + IS_IMMUTABLE(inode)? " immut" : "", + IS_APPEND(inode)? " append" : "", + __mnt_is_readonly(exp->ex_path.mnt)? " ro" : ""); + dprintk(" owner %d/%d user %d/%d\n", + inode->i_uid, inode->i_gid, current_fsuid(), current_fsgid()); +#endif + + /* Normally we reject any write/sattr etc access on a read-only file + * system. But if it is IRIX doing check on write-access for a + * device special file, we ignore rofs. 
+ */ + if (!(acc & NFSD_MAY_LOCAL_ACCESS)) + if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) { + if (exp_rdonly(rqstp, exp) || + __mnt_is_readonly(exp->ex_path.mnt)) + return nfserr_rofs; + if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode)) + return nfserr_perm; + } + if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) + return nfserr_perm; + + if (acc & NFSD_MAY_LOCK) { + /* If we cannot rely on authentication in NLM requests, + * just allow locks, otherwise require read permission, or + * ownership + */ + if (exp->ex_flags & NFSEXP_NOAUTHNLM) + return 0; + else + acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; + } + /* + * The file owner always gets access permission for accesses that + * would normally be checked at open time. This is to make + * file access work even when the client has done a fchmod(fd, 0). + * + * However, `cp foo bar' should fail nevertheless when bar is + * readonly. A sensible way to do this might be to reject all + * attempts to truncate a read-only file, because a creat() call + * always implies file truncation. + * ... but this isn't really fair. A process may reasonably call + * ftruncate on an open file descriptor on a file with perm 000. + * We must trust the client to do permission checking - using "ACCESS" + * with NFSv3. + */ + if ((acc & NFSD_MAY_OWNER_OVERRIDE) && + inode->i_uid == current_fsuid()) + return 0; + + /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ + err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); + + /* Allow read access to binaries even when mode 111 */ + if (err == -EACCES && S_ISREG(inode->i_mode) && + (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || + acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) + err = inode_permission(inode, MAY_EXEC); + + return err? 
nfserrno(err) : 0; +} + +void +nfsd_racache_shutdown(void) +{ + struct raparms *raparm, *last_raparm; + unsigned int i; + + dprintk("nfsd: freeing readahead buffers.\n"); + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + raparm = raparm_hash[i].pb_head; + while(raparm) { + last_raparm = raparm; + raparm = raparm->p_next; + kfree(last_raparm); + } + raparm_hash[i].pb_head = NULL; + } +} +/* + * Initialize readahead param cache + */ +int +nfsd_racache_init(int cache_size) +{ + int i; + int j = 0; + int nperbucket; + struct raparms **raparm = NULL; + + + if (raparm_hash[0].pb_head) + return 0; + nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); + if (nperbucket < 2) + nperbucket = 2; + cache_size = nperbucket * RAPARM_HASH_SIZE; + + dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + spin_lock_init(&raparm_hash[i].pb_lock); + + raparm = &raparm_hash[i].pb_head; + for (j = 0; j < nperbucket; j++) { + *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); + if (!*raparm) + goto out_nomem; + raparm = &(*raparm)->p_next; + } + *raparm = NULL; + } + + nfsdstats.ra_size = cache_size; + return 0; + +out_nomem: + dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); + nfsd_racache_shutdown(); + return -ENOMEM; +} + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +struct posix_acl * +nfsd_get_posix_acl(struct svc_fh *fhp, int type) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + char *name; + void *value = NULL; + ssize_t size; + struct posix_acl *acl; + + if (!IS_POSIXACL(inode)) + return ERR_PTR(-EOPNOTSUPP); + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EOPNOTSUPP); + } + + size = nfsd_getxattr(fhp->fh_dentry, name, &value); + if (size < 0) + return ERR_PTR(size); + + acl = posix_acl_from_xattr(value, size); + kfree(value); + return acl; +} + +int +nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + char *name; + void *value = NULL; + size_t size; + int error; + + if (!IS_POSIXACL(inode) || + !inode->i_op->setxattr || !inode->i_op->removexattr) + return -EOPNOTSUPP; + switch(type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EOPNOTSUPP; + } + + if (acl && acl->a_count) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + error = posix_acl_to_xattr(acl, value, size); + if (error < 0) + goto getout; + size = error; + } else + size = 0; + + error = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (error) + goto getout; + if (size) + error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); + else { + if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) + error = 0; + else { + error = vfs_removexattr(fhp->fh_dentry, name); + if (error == -ENODATA) + error = 0; + } + } + mnt_drop_write(fhp->fh_export->ex_path.mnt); + +getout: + kfree(value); + return error; +} +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h new file mode 100644 index 00000000..a22e40e2 --- /dev/null +++ b/fs/nfsd/vfs.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#ifndef LINUX_NFSD_VFS_H +#define LINUX_NFSD_VFS_H + +#include "nfsfh.h" + +/* + * Flags for 
nfsd_permission + */ +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ +#define NFSD_MAY_READ 4 /* == MAY_READ */ +#define NFSD_MAY_SATTR 8 +#define NFSD_MAY_TRUNC 16 +#define NFSD_MAY_LOCK 32 +#define NFSD_MAY_MASK 63 + +/* extra hints to permission and open routines: */ +#define NFSD_MAY_OWNER_OVERRIDE 64 +#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 +#define NFSD_MAY_NOT_BREAK_LEASE 512 +#define NFSD_MAY_BYPASS_GSS 1024 +#define NFSD_MAY_READ_IF_EXEC 2048 + +#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) +#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) + +/* + * Callback function for readdir + */ +typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); + +/* nfsd/vfs.c */ +int fh_lock_parent(struct svc_fh *, struct dentry *); +int nfsd_racache_init(int); +void nfsd_racache_shutdown(void); +int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, + struct svc_export **expp); +__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, struct svc_fh *); +__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, + struct svc_export **, struct dentry **); +__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time_t); +int nfsd_mountpoint(struct dentry *, struct svc_export *); +#ifdef CONFIG_NFSD_V4 +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, + struct nfs4_acl *); +int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); +#endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +#ifdef CONFIG_NFSD_V3 +__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); +__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + struct svc_fh *res, int createmode, + u32 *verifier, int *truncp, int *created); +__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, + loff_t, unsigned long); +#endif /* CONFIG_NFSD_V3 */ +__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, + int, struct file **); +void nfsd_close(struct file *); +__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, + loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); +__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, + char *name, int len, char *path, int plen, + struct svc_fh *res, struct iattr *); +__be32 nfsd_link(struct svc_rqst *, struct svc_fh *, + char *, int, struct svc_fh *); +__be32 nfsd_rename(struct svc_rqst *, + struct svc_fh *, char *, int, + struct svc_fh *, char *, int); +__be32 nfsd_remove(struct svc_rqst *, + struct svc_fh *, char *, int); +__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, + char *name, int len); +int nfsd_truncate(struct svc_rqst *, struct svc_fh *, + unsigned long size); +__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, + loff_t *, struct readdir_cd *, filldir_t); +__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, + struct kstatfs *, int access); + +int 
nfsd_notify_change(struct inode *, struct iattr *); +__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, + struct dentry *, int); +int nfsd_sync_dir(struct dentry *dp); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); +int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); +#endif + +#endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h new file mode 100644 index 00000000..53b1863d --- /dev/null +++ b/fs/nfsd/xdr.h @@ -0,0 +1,173 @@ +/* XDR types for nfsd. This is mainly a typing exercise. */ + +#ifndef LINUX_NFSD_H +#define LINUX_NFSD_H + +#include +#include "nfsd.h" +#include "nfsfh.h" + +struct nfsd_fhandle { + struct svc_fh fh; +}; + +struct nfsd_sattrargs { + struct svc_fh fh; + struct iattr attrs; +}; + +struct nfsd_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd_readargs { + struct svc_fh fh; + __u32 offset; + __u32 count; + int vlen; +}; + +struct nfsd_writeargs { + svc_fh fh; + __u32 offset; + int len; + int vlen; +}; + +struct nfsd_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + struct iattr attrs; +}; + +struct nfsd_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd_readdirargs { + struct svc_fh fh; + __u32 cookie; + __u32 count; + __be32 * buffer; +}; + +struct nfsd_attrstat { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_diropres { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_readlinkres { + int len; +}; + +struct nfsd_readres { + struct svc_fh fh; + unsigned long count; + struct kstat stat; +}; + +struct nfsd_readdirres { + int count; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd_statfsres { + struct kstatfs stats; +}; + +/* + * Storage requirements for XDR arguments and results. 
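+ *
+ * The union below exists mainly so that sizeof() yields the worst-case
+ * decoded-argument size; NFS2_SVC_XDRSIZE hands that value to the RPC
+ * layer, which sizes the per-request argument buffer from it.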
+ */ +union nfsd_xdrstore { + struct nfsd_sattrargs sattr; + struct nfsd_diropargs dirop; + struct nfsd_readargs read; + struct nfsd_writeargs write; + struct nfsd_createargs create; + struct nfsd_renameargs rename; + struct nfsd_linkargs link; + struct nfsd_symlinkargs symlink; + struct nfsd_readdirargs readdir; +}; + +#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) + + +int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd_sattrargs *); +int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd_diropargs *); +int nfssvc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd_readargs *); +int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd_writeargs *); +int nfssvc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd_createargs *); +int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd_renameargs *); +int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_readlinkargs *); +int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd_linkargs *); +int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_symlinkargs *); +int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd_readdirargs *); +int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *); +int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *); +int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *); +int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *); +int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *); +int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *); + +int nfssvc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int); + +int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); + +/* Helper functions for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); + +#endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h new file mode 100644 index 00000000..7df980eb --- /dev/null +++ b/fs/nfsd/xdr3.h @@ -0,0 +1,344 @@ +/* + * XDR types for NFSv3 in nfsd. 
+ * + * Copyright (C) 1996-1998, Olaf Kirch + */ + +#ifndef _LINUX_NFSD_XDR3_H +#define _LINUX_NFSD_XDR3_H + +#include "xdr.h" + +struct nfsd3_sattrargs { + struct svc_fh fh; + struct iattr attrs; + int check_guard; + time_t guardtime; +}; + +struct nfsd3_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd3_accessargs { + struct svc_fh fh; + unsigned int access; +}; + +struct nfsd3_readargs { + struct svc_fh fh; + __u64 offset; + __u32 count; + int vlen; +}; + +struct nfsd3_writeargs { + svc_fh fh; + __u64 offset; + __u32 count; + int stable; + __u32 len; + int vlen; +}; + +struct nfsd3_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + int createmode; + struct iattr attrs; + __be32 * verf; +}; + +struct nfsd3_mknodargs { + struct svc_fh fh; + char * name; + unsigned int len; + __u32 ftype; + __u32 major, minor; + struct iattr attrs; +}; + +struct nfsd3_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd3_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd3_readdirargs { + struct svc_fh fh; + __u64 cookie; + __u32 dircount; + __u32 count; + __be32 * verf; + __be32 * buffer; +}; + +struct nfsd3_commitargs { + struct svc_fh fh; + __u64 offset; + __u32 count; +}; + +struct nfsd3_getaclargs { + struct svc_fh fh; + int mask; +}; + +struct posix_acl; +struct nfsd3_setaclargs { + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +struct nfsd3_attrstat { + __be32 status; + struct svc_fh fh; + struct kstat stat; +}; + +/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ +struct nfsd3_diropres { + __be32 status; + struct svc_fh dirfh; + struct svc_fh fh; +}; + +struct nfsd3_accessres { + __be32 status; + struct svc_fh fh; + __u32 access; +}; + +struct nfsd3_readlinkres { + __be32 status; + struct svc_fh fh; + __u32 len; +}; + +struct nfsd3_readres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int eof; +}; + +struct nfsd3_writeres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int committed; +}; + +struct nfsd3_renameres { + __be32 status; + struct svc_fh ffh; + struct svc_fh tfh; +}; + +struct nfsd3_linkres { + __be32 status; + struct svc_fh tfh; + struct svc_fh fh; +}; + +struct nfsd3_readdirres { + __be32 status; + struct svc_fh fh; + int count; + __be32 verf[2]; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; + __be32 * offset1; + struct svc_rqst * rqstp; + +}; + +struct nfsd3_fsstatres { + __be32 status; + struct kstatfs stats; + __u32 invarsec; +}; + +struct nfsd3_fsinfores { + __be32 status; + __u32 f_rtmax; + __u32 f_rtpref; + __u32 f_rtmult; + __u32 f_wtmax; + __u32 f_wtpref; + __u32 f_wtmult; + __u32 f_dtpref; + __u64 f_maxfilesize; + __u32 f_properties; +}; + +struct nfsd3_pathconfres { + __be32 status; + __u32 p_link_max; + __u32 p_name_max; + __u32 p_no_trunc; + __u32 p_chown_restricted; + __u32 p_case_insensitive; + __u32 p_case_preserving; +}; + +struct nfsd3_commitres { + __be32 status; + struct svc_fh fh; +}; + +struct nfsd3_getaclres { + __be32 status; + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; 
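+
+/*
+ * Illustrative sketch, not part of the nfsd code: the argument structures
+ * above are direct C mirrors of the XDR wire format in RFC 1813, and the
+ * nfs3svc_decode_*() routines declared further down simply walk the
+ * big-endian request buffer and fill them in.  The stand-alone fragment
+ * under #if 0 shows roughly what that walk looks like for READ3args ->
+ * struct nfsd3_readargs; the helper names and the toy struct are
+ * assumptions made purely for illustration.  (The vlen field of the real
+ * struct is not on the wire at all -- the decoder computes it to describe
+ * how the read data will be scattered into the reply iovec.)
+ */
+#if 0
+#include <stdint.h>
+#include <string.h>
+
+struct toy_readargs {
+	unsigned char	fh[64];		/* NFS3_FHSIZE-sized opaque filehandle */
+	uint32_t	fh_len;
+	uint64_t	offset;
+	uint32_t	count;
+};
+
+static uint32_t xdr_u32(const unsigned char **p)
+{
+	uint32_t v = ((uint32_t)(*p)[0] << 24) | ((uint32_t)(*p)[1] << 16) |
+		     ((uint32_t)(*p)[2] << 8)  |  (uint32_t)(*p)[3];
+	*p += 4;
+	return v;
+}
+
+static uint64_t xdr_u64(const unsigned char **p)
+{
+	uint64_t hi = xdr_u32(p);
+	return (hi << 32) | xdr_u32(p);
+}
+
+/* Decode an RFC 1813 READ3args body: nfs_fh3 file, offset3 offset, count3 count. */
+static int decode_read3args(const unsigned char *p, size_t len,
+			    struct toy_readargs *args)
+{
+	const unsigned char *end = p + len;
+	uint32_t padded;
+
+	if (end - p < 4)
+		return -1;
+	args->fh_len = xdr_u32(&p);		/* variable-length opaque */
+	padded = (args->fh_len + 3) & ~3u;	/* XDR pads opaques to 4 bytes */
+	if (args->fh_len > sizeof(args->fh) || (size_t)(end - p) < padded + 12)
+		return -1;
+	memcpy(args->fh, p, args->fh_len);
+	p += padded;
+	args->offset = xdr_u64(&p);		/* 64-bit big-endian offset */
+	args->count  = xdr_u32(&p);
+	return 0;
+}
+#endif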
+ +/* dummy type for release */ +struct nfsd3_fhandle_pair { + __u32 dummy; + struct svc_fh fh1; + struct svc_fh fh2; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd3_xdrstore { + struct nfsd3_sattrargs sattrargs; + struct nfsd3_diropargs diropargs; + struct nfsd3_readargs readargs; + struct nfsd3_writeargs writeargs; + struct nfsd3_createargs createargs; + struct nfsd3_renameargs renameargs; + struct nfsd3_linkargs linkargs; + struct nfsd3_symlinkargs symlinkargs; + struct nfsd3_readdirargs readdirargs; + struct nfsd3_diropres diropres; + struct nfsd3_accessres accessres; + struct nfsd3_readlinkres readlinkres; + struct nfsd3_readres readres; + struct nfsd3_writeres writeres; + struct nfsd3_renameres renameres; + struct nfsd3_linkres linkres; + struct nfsd3_readdirres readdirres; + struct nfsd3_fsstatres fsstatres; + struct nfsd3_fsinfores fsinfores; + struct nfsd3_pathconfres pathconfres; + struct nfsd3_commitres commitres; + struct nfsd3_getaclres getaclres; +}; + +#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) + +int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd3_sattrargs *); +int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd3_diropargs *); +int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *, + struct nfsd3_accessargs *); +int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd3_readargs *); +int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd3_writeargs *); +int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *, + struct nfsd3_mknodargs *); +int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd3_renameargs *); +int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkargs *); +int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd3_linkargs *); +int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_symlinkargs *); +int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *, + struct nfsd3_commitargs *); +int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *, + struct nfsd3_accessres *); +int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkres *); +int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *); +int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *); +int nfs3svc_encode_createres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *, + struct nfsd3_renameres *); +int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *, + struct nfsd3_linkres *); +int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *, + struct nfsd3_readdirres *); +int 
nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *, + struct nfsd3_fsstatres *); +int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *, + struct nfsd3_fsinfores *); +int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *, + struct nfsd3_pathconfres *); +int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *, + struct nfsd3_commitres *); + +int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *, + struct nfsd3_fhandle_pair *); +int nfs3svc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +int nfs3svc_encode_entry_plus(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +/* Helper functions for NFSv3 ACL code */ +__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, + struct svc_fh *fhp); +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); + + +#endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h new file mode 100644 index 00000000..366401e1 --- /dev/null +++ b/fs/nfsd/xdr4.h @@ -0,0 +1,573 @@ +/* + * Server-side types for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef _LINUX_NFSD_XDR4_H +#define _LINUX_NFSD_XDR4_H + +#include "state.h" +#include "nfsd.h" + +#define NFSD4_MAX_TAGLEN 128 +#define XDR_LEN(n) (((n) + 3) & ~3) + +struct nfsd4_compound_state { + struct svc_fh current_fh; + struct svc_fh save_fh; + struct nfs4_stateowner *replay_owner; + /* For sessions DRC */ + struct nfsd4_session *session; + struct nfsd4_slot *slot; + __be32 *datap; + size_t iovlen; + u32 minorversion; + u32 status; +}; + +static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) +{ + return cs->slot != NULL; +} + +struct nfsd4_change_info { + u32 atomic; + bool change_supported; + u32 before_ctime_sec; + u32 before_ctime_nsec; + u64 before_change; + u32 after_ctime_sec; + u32 after_ctime_nsec; + u64 after_change; +}; + +struct nfsd4_access { + u32 ac_req_access; /* request */ + u32 ac_supported; /* response */ + u32 ac_resp_access; /* response */ +}; + +struct nfsd4_close { + u32 cl_seqid; /* request */ + stateid_t cl_stateid; /* request+response */ + struct nfs4_stateowner * cl_stateowner; /* response */ +}; + +struct nfsd4_commit { + u64 co_offset; /* request */ + u32 co_count; /* request */ + nfs4_verifier co_verf; /* response */ +}; + +struct nfsd4_create { + u32 cr_namelen; /* request */ + char * cr_name; /* request */ + u32 cr_type; /* request */ + union { /* request */ + struct { + u32 namelen; + char *name; + } link; /* NF4LNK */ + struct { + u32 specdata1; + u32 specdata2; + } dev; /* NF4BLK, NF4CHR */ + } u; + u32 cr_bmval[3]; /* request */ + struct iattr cr_iattr; /* request */ + struct nfsd4_change_info cr_cinfo; /* response */ + struct nfs4_acl *cr_acl; +}; +#define cr_linklen u.link.namelen +#define cr_linkname u.link.name +#define cr_specdata1 u.dev.specdata1 +#define cr_specdata2 u.dev.specdata2 + +struct nfsd4_delegreturn { + stateid_t dr_stateid; +}; + +struct nfsd4_getattr { + u32 ga_bmval[3]; /* request */ + struct svc_fh *ga_fhp; /* response */ +}; + +struct nfsd4_link { + u32 li_namelen; /* request */ + char * li_name; /* request */ + struct nfsd4_change_info li_cinfo; /* response */ +}; + +struct nfsd4_lock_denied { + clientid_t ld_clientid; + struct nfs4_stateowner *ld_sop; + u64 ld_start; + u64 ld_length; + u32 ld_type; +}; + +struct nfsd4_lock { + /* request */ + u32 lk_type; + u32 lk_reclaim; /* boolean */ + u64 lk_offset; + u64 lk_length; + u32 lk_is_new; + union { + struct { + u32 open_seqid; + stateid_t open_stateid; + u32 lock_seqid; + clientid_t clientid; + struct xdr_netobj owner; + } new; + struct { + stateid_t lock_stateid; + u32 lock_seqid; + } old; + } v; + + /* response */ + union { + struct { + stateid_t stateid; + } ok; + struct nfsd4_lock_denied denied; + } u; + /* The lk_replay_owner is the open owner in the open_to_lock_owner + * case and the lock owner otherwise: */ + struct nfs4_stateowner *lk_replay_owner; +}; +#define lk_new_open_seqid v.new.open_seqid +#define lk_new_open_stateid v.new.open_stateid +#define lk_new_lock_seqid v.new.lock_seqid +#define lk_new_clientid v.new.clientid +#define lk_new_owner v.new.owner +#define lk_old_lock_stateid v.old.lock_stateid +#define lk_old_lock_seqid v.old.lock_seqid + +#define lk_rflags u.ok.rflags +#define lk_resp_stateid u.ok.stateid +#define lk_denied u.denied + + +struct nfsd4_lockt { + u32 lt_type; + clientid_t lt_clientid; + struct xdr_netobj lt_owner; + u64 lt_offset; + u64 lt_length; + struct nfs4_stateowner * lt_stateowner; + struct nfsd4_lock_denied lt_denied; +}; + + +struct nfsd4_locku { + u32 lu_type; + u32 lu_seqid; + stateid_t lu_stateid; + 
u64 lu_offset; + u64 lu_length; + struct nfs4_stateowner *lu_stateowner; +}; + + +struct nfsd4_lookup { + u32 lo_len; /* request */ + char * lo_name; /* request */ +}; + +struct nfsd4_putfh { + u32 pf_fhlen; /* request */ + char *pf_fhval; /* request */ +}; + +struct nfsd4_open { + u32 op_claim_type; /* request */ + struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_delegate_type; /* request - CLAIM_PREV only */ + stateid_t op_delegate_stateid; /* request - response */ + u32 op_create; /* request */ + u32 op_createmode; /* request */ + u32 op_bmval[3]; /* request */ + struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + nfs4_verifier verf; /* EXCLUSIVE4 */ + clientid_t op_clientid; /* request */ + struct xdr_netobj op_owner; /* request */ + u32 op_seqid; /* request */ + u32 op_share_access; /* request */ + u32 op_share_deny; /* request */ + stateid_t op_stateid; /* response */ + u32 op_recall; /* recall */ + struct nfsd4_change_info op_cinfo; /* response */ + u32 op_rflags; /* response */ + int op_truncate; /* used during processing */ + struct nfs4_stateowner *op_stateowner; /* used during processing */ + struct nfs4_acl *op_acl; +}; +#define op_iattr iattr +#define op_verf verf + +struct nfsd4_open_confirm { + stateid_t oc_req_stateid /* request */; + u32 oc_seqid /* request */; + stateid_t oc_resp_stateid /* response */; + struct nfs4_stateowner * oc_stateowner; /* response */ +}; + +struct nfsd4_open_downgrade { + stateid_t od_stateid; + u32 od_seqid; + u32 od_share_access; + u32 od_share_deny; + struct nfs4_stateowner *od_stateowner; +}; + + +struct nfsd4_read { + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct file *rd_filp; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ +}; + +struct nfsd4_readdir { + u64 rd_cookie; /* request */ + nfs4_verifier rd_verf; /* request */ + u32 rd_dircount; /* request */ + u32 rd_maxcount; /* request */ + u32 rd_bmval[3]; /* request */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd4_release_lockowner { + clientid_t rl_clientid; + struct xdr_netobj rl_owner; +}; +struct nfsd4_readlink { + struct svc_rqst *rl_rqstp; /* request */ + struct svc_fh * rl_fhp; /* request */ +}; + +struct nfsd4_remove { + u32 rm_namelen; /* request */ + char * rm_name; /* request */ + struct nfsd4_change_info rm_cinfo; /* response */ +}; + +struct nfsd4_rename { + u32 rn_snamelen; /* request */ + char * rn_sname; /* request */ + u32 rn_tnamelen; /* request */ + char * rn_tname; /* request */ + struct nfsd4_change_info rn_sinfo; /* response */ + struct nfsd4_change_info rn_tinfo; /* response */ +}; + +struct nfsd4_secinfo { + u32 si_namelen; /* request */ + char *si_name; /* request */ + struct svc_export *si_exp; /* response */ +}; + +struct nfsd4_secinfo_no_name { + u32 sin_style; /* request */ + struct svc_export *sin_exp; /* response */ +}; + +struct nfsd4_setattr { + stateid_t sa_stateid; /* request */ + u32 sa_bmval[3]; /* request */ + struct iattr sa_iattr; /* request */ + struct nfs4_acl *sa_acl; +}; + +struct nfsd4_setclientid { + nfs4_verifier se_verf; /* request */ + u32 se_namelen; /* request */ + char * se_name; /* request */ + u32 se_callback_prog; /* request */ + u32 se_callback_netid_len; /* request */ + char * se_callback_netid_val; /* request */ + u32 se_callback_addr_len; /* 
request */ + char * se_callback_addr_val; /* request */ + u32 se_callback_ident; /* request */ + clientid_t se_clientid; /* response */ + nfs4_verifier se_confirm; /* response */ +}; + +struct nfsd4_setclientid_confirm { + clientid_t sc_clientid; + nfs4_verifier sc_confirm; +}; + +/* also used for NVERIFY */ +struct nfsd4_verify { + u32 ve_bmval[3]; /* request */ + u32 ve_attrlen; /* request */ + char * ve_attrval; /* request */ +}; + +struct nfsd4_write { + stateid_t wr_stateid; /* request */ + u64 wr_offset; /* request */ + u32 wr_stable_how; /* request */ + u32 wr_buflen; /* request */ + int wr_vlen; + + u32 wr_bytes_written; /* response */ + u32 wr_how_written; /* response */ + nfs4_verifier wr_verifier; /* response */ +}; + +struct nfsd4_exchange_id { + nfs4_verifier verifier; + struct xdr_netobj clname; + u32 flags; + clientid_t clientid; + u32 seqid; + int spa_how; +}; + +struct nfsd4_sequence { + struct nfs4_sessionid sessionid; /* request/response */ + u32 seqid; /* request/response */ + u32 slotid; /* request/response */ + u32 maxslots; /* request/response */ + u32 cachethis; /* request */ +#if 0 + u32 target_maxslots; /* response */ +#endif /* not yet */ + u32 status_flags; /* response */ +}; + +struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; +}; + +struct nfsd4_reclaim_complete { + u32 rca_one_fs; +}; + +struct nfsd4_op { + int opnum; + __be32 status; + union { + struct nfsd4_access access; + struct nfsd4_close close; + struct nfsd4_commit commit; + struct nfsd4_create create; + struct nfsd4_delegreturn delegreturn; + struct nfsd4_getattr getattr; + struct svc_fh * getfh; + struct nfsd4_link link; + struct nfsd4_lock lock; + struct nfsd4_lockt lockt; + struct nfsd4_locku locku; + struct nfsd4_lookup lookup; + struct nfsd4_verify nverify; + struct nfsd4_open open; + struct nfsd4_open_confirm open_confirm; + struct nfsd4_open_downgrade open_downgrade; + struct nfsd4_putfh putfh; + struct nfsd4_read read; + struct nfsd4_readdir readdir; + struct nfsd4_readlink readlink; + struct nfsd4_remove remove; + struct nfsd4_rename rename; + clientid_t renew; + struct nfsd4_secinfo secinfo; + struct nfsd4_setattr setattr; + struct nfsd4_setclientid setclientid; + struct nfsd4_setclientid_confirm setclientid_confirm; + struct nfsd4_verify verify; + struct nfsd4_write write; + struct nfsd4_release_lockowner release_lockowner; + + /* NFSv4.1 */ + struct nfsd4_exchange_id exchange_id; + struct nfsd4_bind_conn_to_session bind_conn_to_session; + struct nfsd4_create_session create_session; + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; + } u; + struct nfs4_replay * replay; +}; + +struct nfsd4_compoundargs { + /* scratch variables for XDR decode */ + __be32 * p; + __be32 * end; + struct page ** pagelist; + int pagelen; + __be32 tmp[8]; + __be32 * tmpp; + struct tmpbuf { + struct tmpbuf *next; + void (*release)(const void *); + void *buf; + } *to_free; + + struct svc_rqst *rqstp; + + u32 taglen; + char * tag; + u32 minorversion; + u32 opcnt; + struct nfsd4_op *ops; + struct nfsd4_op iops[8]; +}; + +struct nfsd4_compoundres { + /* scratch variables for XDR encode */ + __be32 * p; + __be32 * end; + struct xdr_buf * xbuf; + struct svc_rqst * rqstp; + + u32 taglen; + char * tag; + u32 opcnt; + __be32 * tagp; /* tag, opcount encode location */ + struct nfsd4_compound_state cstate; +}; + +static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compoundargs *args = 
resp->rqstp->rq_argp; + return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; +} + +static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +{ + return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp); +} + +#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) + +static inline void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_pre_saved); + cinfo->atomic = fhp->fh_post_saved; + cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); + + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + +} + +int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, + struct nfsd4_compoundargs *); +int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, + struct nfsd4_compoundres *); +void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); +void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, __be32 *buffer, int *countp, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid *setclid); +extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid_confirm *setclientid_confirm); +extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); +extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq); +extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); +extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); +extern __be32 nfsd4_sequence(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_sequence *); +extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); +__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); +extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open); +extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, + struct svc_fh *current_fh, struct nfsd4_open *open); +extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); +extern __be32 nfsd4_close(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_close *close); +extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_open_downgrade *od); +extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + struct nfsd4_lock *lock); +extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_lockt *lockt); +extern __be32 nfsd4_locku(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_locku *locku); +extern __be32 
+nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_release_lockowner *rlockowner); +extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); +extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr); +extern __be32 nfsd4_renew(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, clientid_t *clid); +#endif + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig new file mode 100644 index 00000000..251da07b --- /dev/null +++ b/fs/nilfs2/Kconfig @@ -0,0 +1,25 @@ +config NILFS2_FS + tristate "NILFS2 file system support (EXPERIMENTAL)" + depends on EXPERIMENTAL + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile new file mode 100644 index 00000000..85c98737 --- /dev/null +++ b/fs/nilfs2/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_NILFS2_FS) += nilfs2.o +nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ + btnode.o bmap.o btree.o direct.o dat.o recovery.o \ + the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ + ifile.o alloc.o gcinode.o ioctl.o diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c new file mode 100644 index 00000000..eed4d7b2 --- /dev/null +++ b/fs/nilfs2/alloc.c @@ -0,0 +1,721 @@ +/* + * alloc.c - NILFS dat/inode allocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato . + * Two allocators were unified by Ryusuke Konishi , + * Amagai Yoshiji . 
+ */ + +#include +#include +#include +#include +#include +#include "mdt.h" +#include "alloc.h" + + +/** + * nilfs_palloc_groups_per_desc_block - get the number of groups that a group + * descriptor block can maintain + * @inode: inode of metadata file using this allocator + */ +static inline unsigned long +nilfs_palloc_groups_per_desc_block(const struct inode *inode) +{ + return (1UL << inode->i_blkbits) / + sizeof(struct nilfs_palloc_group_desc); +} + +/** + * nilfs_palloc_groups_count - get maximum number of groups + * @inode: inode of metadata file using this allocator + */ +static inline unsigned long +nilfs_palloc_groups_count(const struct inode *inode) +{ + return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); +} + +/** + * nilfs_palloc_init_blockgroup - initialize private variables for allocator + * @inode: inode of metadata file using this allocator + * @entry_size: size of the persistent object + */ +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS); + if (!mi->mi_bgl) + return -ENOMEM; + + bgl_lock_init(mi->mi_bgl); + + nilfs_mdt_set_entry_size(inode, entry_size, 0); + + mi->mi_blocks_per_group = + DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), + mi->mi_entries_per_block) + 1; + /* Number of blocks in a group including entry blocks and + a bitmap block */ + mi->mi_blocks_per_desc_block = + nilfs_palloc_groups_per_desc_block(inode) * + mi->mi_blocks_per_group + 1; + /* Number of blocks per descriptor including the + descriptor block */ + return 0; +} + +/** + * nilfs_palloc_group - get group number and offset from an entry number + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @offset: pointer to store offset number in the group + */ +static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, + unsigned long *offset) +{ + __u64 group = nr; + + *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); + return group; +} + +/** + * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_desc_blkoff() returns block offset of the descriptor + * block which contains a descriptor of the specified group. + */ +static unsigned long +nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_block = + group / nilfs_palloc_groups_per_desc_block(inode); + return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; +} + +/** + * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap + * block used to allocate/deallocate entries in the specified group. 
+ */ +static unsigned long +nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_offset = + group % nilfs_palloc_groups_per_desc_block(inode); + return nilfs_palloc_desc_blkoff(inode, group) + 1 + + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; +} + +/** + * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + */ +static unsigned long +nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, + const struct nilfs_palloc_group_desc *desc) +{ + unsigned long nfree; + + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + nfree = le32_to_cpu(desc->pg_nfrees); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + return nfree; +} + +/** + * nilfs_palloc_group_desc_add_entries - adjust count of free entries + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + * @n: delta to be added + */ +static void +nilfs_palloc_group_desc_add_entries(struct inode *inode, + unsigned long group, + struct nilfs_palloc_group_desc *desc, + u32 n) +{ + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + le32_add_cpu(&desc->pg_nfrees, n); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); +} + +/** + * nilfs_palloc_entry_blkoff - get block offset of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + */ +static unsigned long +nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) +{ + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, nr, &group_offset); + + return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + + group_offset / NILFS_MDT(inode)->mi_entries_per_block; +} + +/** + * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block + * @inode: inode of metadata file + * @bh: buffer head of the buffer to be initialized + * @kaddr: kernel address mapped for the page including the buffer + */ +static void nilfs_palloc_desc_block_init(struct inode *inode, + struct buffer_head *bh, void *kaddr) +{ + struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh); + unsigned long n = nilfs_palloc_groups_per_desc_block(inode); + __le32 nfrees; + + nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode)); + while (n-- > 0) { + desc->pg_nfrees = nfrees; + desc++; + } +} + +static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, + int create, + void (*init_block)(struct inode *, + struct buffer_head *, + void *), + struct buffer_head **bhp, + struct nilfs_bh_assoc *prev, + spinlock_t *lock) +{ + int ret; + + spin_lock(lock); + if (prev->bh && blkoff == prev->blkoff) { + get_bh(prev->bh); + *bhp = prev->bh; + spin_unlock(lock); + return 0; + } + spin_unlock(lock); + + ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp); + if (!ret) { + spin_lock(lock); + /* + * The following code must be safe for change of the + * cache contents during the get block call. 
+ */ + brelse(prev->bh); + get_bh(*bhp); + prev->bh = *bhp; + prev->blkoff = blkoff; + spin_unlock(lock); + } + return ret; +} + +/** + * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ +static int nilfs_palloc_get_desc_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_desc_blkoff(inode, group), + create, nilfs_palloc_desc_block_init, + bhp, &cache->prev_desc, &cache->lock); +} + +/** + * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ +static int nilfs_palloc_get_bitmap_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_bitmap_blkoff(inode, group), + create, NULL, bhp, + &cache->prev_bitmap, &cache->lock); +} + +/** + * nilfs_palloc_get_entry_block - get buffer head of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ +int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, + int create, struct buffer_head **bhp) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_entry_blkoff(inode, nr), + create, NULL, bhp, + &cache->prev_entry, &cache->lock); +} + +/** + * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor + * @inode: inode of metadata file using this allocator + * @group: group number + * @bh: buffer head of the buffer storing the group descriptor block + * @kaddr: kernel address mapped for the page including the buffer + */ +static struct nilfs_palloc_group_desc * +nilfs_palloc_block_get_group_desc(const struct inode *inode, + unsigned long group, + const struct buffer_head *bh, void *kaddr) +{ + return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + + group % nilfs_palloc_groups_per_desc_block(inode); +} + +/** + * nilfs_palloc_block_get_entry - get kernel address of an entry + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. 
inode number) + * @bh: buffer head of the buffer storing the entry block + * @kaddr: kernel address mapped for the page including the buffer + */ +void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, + const struct buffer_head *bh, void *kaddr) +{ + unsigned long entry_offset, group_offset; + + nilfs_palloc_group(inode, nr, &group_offset); + entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; + + return kaddr + bh_offset(bh) + + entry_offset * NILFS_MDT(inode)->mi_entry_size; +} + +/** + * nilfs_palloc_find_available_slot - find available slot in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @target: offset number of an entry in the group (start point) + * @bitmap: bitmap of the group + * @bsize: size in bits + */ +static int nilfs_palloc_find_available_slot(struct inode *inode, + unsigned long group, + unsigned long target, + unsigned char *bitmap, + int bsize) +{ + int curr, pos, end, i; + + if (target > 0) { + end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, target); + if (pos < end && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) + return pos; + } else + end = 0; + + for (i = 0, curr = end; + i < bsize; + i += BITS_PER_LONG, curr += BITS_PER_LONG) { + /* wrap around */ + if (curr >= bsize) + curr = 0; + while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) + != ~0UL) { + end = curr + BITS_PER_LONG; + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, curr); + if ((pos < end) && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, + bitmap)) + return pos; + } + } + return -ENOSPC; +} + +/** + * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups + * in a group descriptor block + * @inode: inode of metadata file using this allocator + * @curr: current group number + * @max: maximum number of groups + */ +static unsigned long +nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, + unsigned long curr, unsigned long max) +{ + return min_t(unsigned long, + nilfs_palloc_groups_per_desc_block(inode) - + curr % nilfs_palloc_groups_per_desc_block(inode), + max - curr + 1); +} + +/** + * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, maxgroup, ngroups; + unsigned long group_offset, maxgroup_offset; + unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long i, j; + int pos, ret; + + ngroups = nilfs_palloc_groups_count(inode); + maxgroup = ngroups - 1; + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + entries_per_group = nilfs_palloc_entries_per_group(inode); + groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); + + for (i = 0; i < ngroups; i += n) { + if (group >= ngroups) { + /* wrap around */ + group = 0; + maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, + &maxgroup_offset) - 1; + } + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + desc_kaddr = kmap(desc_bh->b_page); + desc = 
nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + n = nilfs_palloc_rest_groups_in_desc_block(inode, group, + maxgroup); + for (j = 0; j < n; j++, desc++, group++) { + if (nilfs_palloc_group_desc_nfrees(inode, group, desc) + > 0) { + ret = nilfs_palloc_get_bitmap_block( + inode, group, 1, &bitmap_bh); + if (ret < 0) + goto out_desc; + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + pos = nilfs_palloc_find_available_slot( + inode, group, group_offset, bitmap, + entries_per_group); + if (pos >= 0) { + /* found a free entry */ + nilfs_palloc_group_desc_add_entries( + inode, group, desc, -1); + req->pr_entry_nr = + entries_per_group * group + pos; + kunmap(desc_bh->b_page); + kunmap(bitmap_bh->b_page); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; + } + kunmap(bitmap_bh->b_page); + brelse(bitmap_bh); + } + + group_offset = 0; + } + + kunmap(desc_bh->b_page); + brelse(desc_bh); + } + + /* no entries left */ + return -ENOSPC; + + out_desc: + kunmap(desc_bh->b_page); + brelse(desc_bh); + return ret; +} + +/** + * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ +void nilfs_palloc_commit_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + mark_buffer_dirty(req->pr_bitmap_bh); + mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +/** + * nilfs_palloc_commit_free_entry - finish deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ +void nilfs_palloc_commit_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + unsigned long group, group_offset; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + else + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + mark_buffer_dirty(req->pr_desc_bh); + mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +/** + * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ +void nilfs_palloc_abort_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + void *desc_kaddr, *bitmap_kaddr; + unsigned char *bitmap; + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); 
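+	/* Roll back the reservation: clear the bitmap bit set for this entry
+	 * and, if it was still set, return the slot to the group's free count. */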
+ bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + else + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +/** + * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ +int nilfs_palloc_prepare_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + unsigned long group, group_offset; + int ret; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; +} + +/** + * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ +void nilfs_palloc_abort_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +/** + * nilfs_palloc_group_is_in - judge if an entry is in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @nr: serial number of the entry (e.g. 
inode number) + */ +static int +nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) +{ + __u64 first, last; + + first = group * nilfs_palloc_entries_per_group(inode); + last = first + nilfs_palloc_entries_per_group(inode) - 1; + return (nr >= first) && (nr <= last); +} + +/** + * nilfs_palloc_freev - deallocate a set of persistent objects + * @inode: inode of metadata file using this allocator + * @entry_nrs: array of entry numbers to be deallocated + * @nitems: number of entries stored in @entry_nrs + */ +int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, group_offset; + int i, j, n, ret; + + for (i = 0; i < nitems; i = j) { + group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 0, + &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + for (j = i, n = 0; + (j < nitems) && nilfs_palloc_group_is_in(inode, group, + entry_nrs[j]); + j++) { + nilfs_palloc_group(inode, entry_nrs[j], &group_offset); + if (!nilfs_clear_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) { + printk(KERN_WARNING + "%s: entry number %llu already freed\n", + __func__, + (unsigned long long)entry_nrs[j]); + } else { + n++; + } + } + nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + + kunmap(bitmap_bh->b_page); + kunmap(desc_bh->b_page); + + mark_buffer_dirty(desc_bh); + mark_buffer_dirty(bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(bitmap_bh); + brelse(desc_bh); + } + return 0; +} + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache) +{ + NILFS_MDT(inode)->mi_palloc_cache = cache; + spin_lock_init(&cache->lock); +} + +void nilfs_palloc_clear_cache(struct inode *inode) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + spin_lock(&cache->lock); + brelse(cache->prev_desc.bh); + brelse(cache->prev_bitmap.bh); + brelse(cache->prev_entry.bh); + cache->prev_desc.bh = NULL; + cache->prev_bitmap.bh = NULL; + cache->prev_entry.bh = NULL; + spin_unlock(&cache->lock); +} + +void nilfs_palloc_destroy_cache(struct inode *inode) +{ + nilfs_palloc_clear_cache(inode); + NILFS_MDT(inode)->mi_palloc_cache = NULL; +} diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h new file mode 100644 index 00000000..f5fde36b --- /dev/null +++ b/fs/nilfs2/alloc.h @@ -0,0 +1,100 @@ +/* + * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato . + * Two allocators were unified by Ryusuke Konishi , + * Amagai Yoshiji . + */ + +#ifndef _NILFS_ALLOC_H +#define _NILFS_ALLOC_H + +#include +#include +#include + +/** + * nilfs_palloc_entries_per_group - get the number of entries per group + * @inode: inode of metadata file using this allocator + * + * The number of entries per group is defined by the number of bits + * that a bitmap block can maintain. + */ +static inline unsigned long +nilfs_palloc_entries_per_group(const struct inode *inode) +{ + return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); +} + +int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_get_entry_block(struct inode *, __u64, int, + struct buffer_head **); +void *nilfs_palloc_block_get_entry(const struct inode *, __u64, + const struct buffer_head *, void *); + +/** + * nilfs_palloc_req - persistent allocator request and reply + * @pr_entry_nr: entry number (vblocknr or inode number) + * @pr_desc_bh: buffer head of the buffer containing block group descriptors + * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap + * @pr_entry_bh: buffer head of the buffer containing translation entries + */ +struct nilfs_palloc_req { + __u64 pr_entry_nr; + struct buffer_head *pr_desc_bh; + struct buffer_head *pr_bitmap_bh; + struct buffer_head *pr_entry_bh; +}; + +int nilfs_palloc_prepare_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_commit_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_freev(struct inode *, __u64 *, size_t); + +#define nilfs_set_bit_atomic ext2_set_bit_atomic +#define nilfs_clear_bit_atomic ext2_clear_bit_atomic +#define nilfs_find_next_zero_bit find_next_zero_bit_le + +/* + * persistent object allocator cache + */ + +struct nilfs_bh_assoc { + unsigned long blkoff; + struct buffer_head *bh; +}; + +struct nilfs_palloc_cache { + spinlock_t lock; + struct nilfs_bh_assoc prev_desc; + struct nilfs_bh_assoc prev_bitmap; + struct nilfs_bh_assoc prev_entry; +}; + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache); +void nilfs_palloc_clear_cache(struct inode *inode); +void nilfs_palloc_destroy_cache(struct inode *inode); + +#endif /* _NILFS_ALLOC_H */ diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c new file mode 100644 index 00000000..aadbd0b5 --- /dev/null +++ b/fs/nilfs2/bmap.c @@ -0,0 +1,567 @@ +/* + * bmap.c - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include +#include +#include "nilfs.h" +#include "bmap.h" +#include "btree.h" +#include "direct.h" +#include "btnode.h" +#include "mdt.h" +#include "dat.h" +#include "alloc.h" + +struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) +{ + struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info; + + return nilfs->ns_dat; +} + +static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, + const char *fname, int err) +{ + struct inode *inode = bmap->b_inode; + + if (err == -EINVAL) { + nilfs_error(inode->i_sb, fname, + "broken bmap (inode number=%lu)\n", inode->i_ino); + err = -EIO; + } + return err; +} + +/** + * nilfs_bmap_lookup_at_level - find a data block or node block + * @bmap: bmap + * @key: key + * @level: level + * @ptrp: place to store the value associated to @key + * + * Description: nilfs_bmap_lookup_at_level() finds a record whose key + * matches @key in the block at @level of the bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @ptrp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, + __u64 *ptrp) +{ + sector_t blocknr; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); + if (ret < 0) { + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + goto out; + } + if (NILFS_BMAP_USE_VBN(bmap)) { + ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, + &blocknr); + if (!ret) + *ptrp = blocknr; + } + + out: + up_read(&bmap->b_sem); + return ret; +} + +int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, + unsigned maxblocks) +{ + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); + up_read(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; + __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_insert != NULL) { + ret = bmap->b_ops->bop_check_insert(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1); + if (n < 0) + return n; + ret = nilfs_btree_convert_and_insert( + bmap, key, ptr, keys, ptrs, n); + if (ret == 0) + bmap->b_u.u_flags |= NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_insert(bmap, key, ptr); +} + +/** + * nilfs_bmap_insert - insert a new key-record pair into a bmap + * @bmap: bmap + * @key: key + * @rec: record + * + * Description: nilfs_bmap_insert() inserts the new key-record pair specified + * by @key and @rec into @bmap. + * + * Return Value: On success, 0 is returned. 
On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EEXIST - A record associated with @key already exist. + */ +int nilfs_bmap_insert(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long rec) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_insert(bmap, key, rec); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) +{ + __u64 keys[NILFS_BMAP_LARGE_LOW + 1]; + __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_delete != NULL) { + ret = bmap->b_ops->bop_check_delete(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1); + if (n < 0) + return n; + ret = nilfs_direct_delete_and_convert( + bmap, key, keys, ptrs, n); + if (ret == 0) + bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_delete(bmap, key); +} + +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) +{ + __u64 lastkey; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + up_read(&bmap->b_sem); + + if (ret < 0) + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + else + *key = lastkey; + return ret; +} + +/** + * nilfs_bmap_delete - delete a key-record pair from a bmap + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_delete() deletes the key-record pair specified by + * @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_delete(bmap, key); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + __u64 lastkey; + int ret; + + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + + while (key <= lastkey) { + ret = nilfs_bmap_do_delete(bmap, lastkey); + if (ret < 0) + return ret; + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + } + return 0; +} + +/** + * nilfs_bmap_truncate - truncate a bmap to a specified key + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are + * greater than or equal to @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_truncate(bmap, key); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_clear - free resources a bmap holds + * @bmap: bmap + * + * Description: nilfs_bmap_clear() frees resources associated with @bmap. 
+ */ +void nilfs_bmap_clear(struct nilfs_bmap *bmap) +{ + down_write(&bmap->b_sem); + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + up_write(&bmap->b_sem); +} + +/** + * nilfs_bmap_propagate - propagate dirty state + * @bmap: bmap + * @bh: buffer head + * + * Description: nilfs_bmap_propagate() marks the buffers that directly or + * indirectly refer to the block specified by @bh dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_propagate(bmap, bh); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_lookup_dirty_buffers - + * @bmap: bmap + * @listp: pointer to buffer head list + */ +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + if (bmap->b_ops->bop_lookup_dirty_buffers != NULL) + bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp); +} + +/** + * nilfs_bmap_assign - assign a new block number to a block + * @bmap: bmap + * @bhp: pointer to buffer head + * @blocknr: block number + * @binfo: block information + * + * Description: nilfs_bmap_assign() assigns the block number @blocknr to the + * buffer specified by @bh. + * + * Return Value: On success, 0 is returned and the buffer head of a newly + * create buffer and the block information associated with the buffer are + * stored in the place pointed by @bh and @binfo, respectively. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + unsigned long blocknr, + union nilfs_binfo *binfo) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_mark - mark block dirty + * @bmap: bmap + * @key: key + * @level: level + * + * Description: nilfs_bmap_mark() marks the block specified by @key and @level + * as dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + int ret; + + if (bmap->b_ops->bop_mark == NULL) + return 0; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_mark(bmap, key, level); + up_write(&bmap->b_sem); + + return nilfs_bmap_convert_error(bmap, __func__, ret); +} + +/** + * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state + * @bmap: bmap + * + * Description: nilfs_test_and_clear() is the atomic operation to test and + * clear the dirty state of @bmap. + * + * Return Value: 1 is returned if @bmap is dirty, or 0 if clear. 
+ */ +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_dirty(bmap); + nilfs_bmap_clear_dirty(bmap); + up_write(&bmap->b_sem); + return ret; +} + + +/* + * Internal use only + */ +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, + const struct buffer_head *bh) +{ + struct buffer_head *pbh; + __u64 key; + + key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - + bmap->b_inode->i_blkbits); + for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page) + key++; + + return key; +} + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) +{ + __s64 diff; + + diff = key - bmap->b_last_allocated_key; + if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) && + (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) && + (bmap->b_last_allocated_ptr + diff > 0)) + return bmap->b_last_allocated_ptr + diff; + else + return NILFS_BMAP_INVALID_PTR; +} + +#define NILFS_BMAP_GROUP_DIV 8 +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) +{ + struct inode *dat = nilfs_bmap_get_dat(bmap); + unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat); + unsigned long group = bmap->b_inode->i_ino / entries_per_group; + + return group * entries_per_group + + (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) * + (entries_per_group / NILFS_BMAP_GROUP_DIV); +} + +static struct lock_class_key nilfs_bmap_dat_lock_key; +static struct lock_class_key nilfs_bmap_mdt_lock_key; + +/** + * nilfs_bmap_read - read a bmap from an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_read() initializes the bmap @bmap. + * + * Return Value: On success, 0 is returned. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + if (raw_inode == NULL) + memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE); + else + memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE); + + init_rwsem(&bmap->b_sem); + bmap->b_state = 0; + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + bmap->b_ptr_type = NILFS_BMAP_PTR_P; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); + break; + case NILFS_CPFILE_INO: + case NILFS_SUFILE_INO: + bmap->b_ptr_type = NILFS_BMAP_PTR_VS; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); + break; + case NILFS_IFILE_INO: + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); + /* Fall through */ + default: + bmap->b_ptr_type = NILFS_BMAP_PTR_VM; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + } + + return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ? + nilfs_btree_init(bmap) : nilfs_direct_init(bmap); +} + +/** + * nilfs_bmap_write - write back a bmap to an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_write() stores @bmap in @raw_inode. 
+ */ +void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + down_write(&bmap->b_sem); + memcpy(raw_inode->i_bmap, bmap->b_u.u_data, + NILFS_INODE_BMAP_SIZE * sizeof(__le64)); + if (bmap->b_inode->i_ino == NILFS_DAT_INO) + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + + up_write(&bmap->b_sem); +} + +void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) +{ + memset(&bmap->b_u, 0, NILFS_BMAP_SIZE); + init_rwsem(&bmap->b_sem); + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + bmap->b_ptr_type = NILFS_BMAP_PTR_U; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + bmap->b_state = 0; + nilfs_btree_init_gc(bmap); +} + +void nilfs_bmap_save(const struct nilfs_bmap *bmap, + struct nilfs_bmap_store *store) +{ + memcpy(store->data, bmap->b_u.u_data, sizeof(store->data)); + store->last_allocated_key = bmap->b_last_allocated_key; + store->last_allocated_ptr = bmap->b_last_allocated_ptr; + store->state = bmap->b_state; +} + +void nilfs_bmap_restore(struct nilfs_bmap *bmap, + const struct nilfs_bmap_store *store) +{ + memcpy(bmap->b_u.u_data, store->data, sizeof(store->data)); + bmap->b_last_allocated_key = store->last_allocated_key; + bmap->b_last_allocated_ptr = store->last_allocated_ptr; + bmap->b_state = store->state; +} diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h new file mode 100644 index 00000000..40d9f453 --- /dev/null +++ b/fs/nilfs2/bmap.h @@ -0,0 +1,270 @@ +/* + * bmap.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_BMAP_H +#define _NILFS_BMAP_H + +#include +#include +#include +#include +#include "alloc.h" +#include "dat.h" + +#define NILFS_BMAP_INVALID_PTR 0 + +#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? 
-(diff) : (diff)) + + +struct nilfs_bmap; + +/** + * union nilfs_bmap_ptr_req - request for bmap ptr + * @bpr_ptr: bmap pointer + * @bpr_req: request for persistent allocator + */ +union nilfs_bmap_ptr_req { + __u64 bpr_ptr; + struct nilfs_palloc_req bpr_req; +}; + +/** + * struct nilfs_bmap_stats - bmap statistics + * @bs_nblocks: number of blocks created or deleted + */ +struct nilfs_bmap_stats { + unsigned int bs_nblocks; +}; + +/** + * struct nilfs_bmap_operations - bmap operation table + */ +struct nilfs_bmap_operations { + int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); + int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *, + unsigned); + int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); + int (*bop_delete)(struct nilfs_bmap *, __u64); + void (*bop_clear)(struct nilfs_bmap *); + + int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *); + void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, + struct list_head *); + + int (*bop_assign)(struct nilfs_bmap *, + struct buffer_head **, + sector_t, + union nilfs_binfo *); + int (*bop_mark)(struct nilfs_bmap *, __u64, int); + + /* The following functions are internal use only. */ + int (*bop_last_key)(const struct nilfs_bmap *, __u64 *); + int (*bop_check_insert)(const struct nilfs_bmap *, __u64); + int (*bop_check_delete)(struct nilfs_bmap *, __u64); + int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int); +}; + + +#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64)) +#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */) +#define NILFS_BMAP_NEW_PTR_INIT \ + (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1)) + +static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) +{ + return !!(ptr & NILFS_BMAP_NEW_PTR_INIT); +} + + +/** + * struct nilfs_bmap - bmap structure + * @b_u: raw data + * @b_sem: semaphore + * @b_inode: owner of bmap + * @b_ops: bmap operation table + * @b_last_allocated_key: last allocated key for data block + * @b_last_allocated_ptr: last allocated ptr for data block + * @b_ptr_type: pointer type + * @b_state: state + * @b_nchildren_per_block: maximum number of child nodes for non-root nodes + */ +struct nilfs_bmap { + union { + __u8 u_flags; + __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)]; + } b_u; + struct rw_semaphore b_sem; + struct inode *b_inode; + const struct nilfs_bmap_operations *b_ops; + __u64 b_last_allocated_key; + __u64 b_last_allocated_ptr; + int b_ptr_type; + int b_state; + __u16 b_nchildren_per_block; +}; + +/* pointer type */ +#define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. 
LBN) */ +#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single + version) */ +#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple + versions) */ +#define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */ + +#define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0) + +/* state */ +#define NILFS_BMAP_DIRTY 0x00000001 + +struct nilfs_bmap_store { + __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)]; + __u64 last_allocated_key; + __u64 last_allocated_ptr; + int state; +}; + +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); +int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); +void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); +int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); +int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); +int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); +int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); +int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); +void nilfs_bmap_clear(struct nilfs_bmap *); +int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); +int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **, + unsigned long, union nilfs_binfo *); +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *); +int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); + +void nilfs_bmap_init_gc(struct nilfs_bmap *); + +void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *); +void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *); + +static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, + __u64 *ptr) +{ + return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr); +} + +/* + * Internal use only + */ +struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); + +static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + return nilfs_dat_prepare_alloc(dat, &req->bpr_req); + /* ignore target ptr */ + req->bpr_ptr = bmap->b_last_allocated_ptr++; + return 0; +} + +static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_commit_alloc(dat, &req->bpr_req); +} + +static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_abort_alloc(dat, &req->bpr_req); + else + bmap->b_last_allocated_ptr--; +} + +static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + return dat ? 
nilfs_dat_prepare_end(dat, &req->bpr_req) : 0; +} + +static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_commit_end(dat, &req->bpr_req, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); +} + +static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + struct inode *dat) +{ + if (dat) + nilfs_dat_abort_end(dat, &req->bpr_req); +} + +static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key, + __u64 ptr) +{ + bmap->b_last_allocated_key = key; + bmap->b_last_allocated_ptr = ptr; +} + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, + const struct buffer_head *); + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); + + +/* Assume that bmap semaphore is locked. */ +static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) +{ + return !!(bmap->b_state & NILFS_BMAP_DIRTY); +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state |= NILFS_BMAP_DIRTY; +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state &= ~NILFS_BMAP_DIRTY; +} + + +#define NILFS_BMAP_LARGE 0x1 + +#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN +#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX +#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX +#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX + +#endif /* _NILFS_BMAP_H */ diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c new file mode 100644 index 00000000..a35ae35e --- /dev/null +++ b/fs/nilfs2/btnode.c @@ -0,0 +1,297 @@ +/* + * btnode.c - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * This file was originally written by Seiji Kihara + * and fully revised by Ryusuke Konishi for + * stabilization and simplification. 
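+ *
+ * The btnode cache is a per-inode address space that holds the B-tree node
+ * blocks of the inode's bmap, indexed by block number.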
+ * + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "mdt.h" +#include "dat.h" +#include "page.h" +#include "btnode.h" + +void nilfs_btnode_cache_clear(struct address_space *btnc) +{ + invalidate_mapping_pages(btnc, 0, -1); + truncate_inode_pages(btnc, 0); +} + +struct buffer_head * +nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) +{ + struct inode *inode = NILFS_BTNC_I(btnc); + struct buffer_head *bh; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return NULL; + + if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || + buffer_dirty(bh))) { + brelse(bh); + BUG(); + } + memset(bh->b_data, 0, 1 << inode->i_blkbits); + bh->b_bdev = inode->i_sb->s_bdev; + bh->b_blocknr = blocknr; + set_buffer_mapped(bh); + set_buffer_uptodate(bh); + + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return bh; +} + +int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, + sector_t pblocknr, int mode, + struct buffer_head **pbh, sector_t *submit_ptr) +{ + struct buffer_head *bh; + struct inode *inode = NILFS_BTNC_I(btnc); + struct page *page; + int err; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return -ENOMEM; + + err = -EEXIST; /* internal code */ + page = bh->b_page; + + if (buffer_uptodate(bh) || buffer_dirty(bh)) + goto found; + + if (pblocknr == 0) { + pblocknr = blocknr; + if (inode->i_ino != NILFS_DAT_INO) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + /* blocknr is a virtual block number */ + err = nilfs_dat_translate(nilfs->ns_dat, blocknr, + &pblocknr); + if (unlikely(err)) { + brelse(bh); + goto out_locked; + } + } + } + + if (mode == READA) { + if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { + err = -EBUSY; /* internal code */ + brelse(bh); + goto out_locked; + } + } else { /* mode == READ */ + lock_buffer(bh); + } + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + err = -EEXIST; /* internal code */ + goto found; + } + set_buffer_mapped(bh); + bh->b_bdev = inode->i_sb->s_bdev; + bh->b_blocknr = pblocknr; /* set block address for read */ + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + bh->b_blocknr = blocknr; /* set back to the given block address */ + *submit_ptr = pblocknr; + err = 0; +found: + *pbh = bh; + +out_locked: + unlock_page(page); + page_cache_release(page); + return err; +} + +/** + * nilfs_btnode_delete - delete B-tree node buffer + * @bh: buffer to be deleted + * + * nilfs_btnode_delete() invalidates the specified buffer and delete the page + * including the buffer if the page gets unbusy. + */ +void nilfs_btnode_delete(struct buffer_head *bh) +{ + struct address_space *mapping; + struct page *page = bh->b_page; + pgoff_t index = page_index(page); + int still_dirty; + + page_cache_get(page); + lock_page(page); + wait_on_page_writeback(page); + + nilfs_forget_buffer(bh); + still_dirty = PageDirty(page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); + + if (!still_dirty && mapping) + invalidate_inode_pages2_range(mapping, index, index); +} + +/** + * nilfs_btnode_prepare_change_key + * prepare to move contents of the block for old key to one of new key. + * the old buffer will not be removed, but might be reused for new buffer. + * it might return -ENOMEM because of memory allocation errors, + * and might return -EIO because of disk read errors. 
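+ *
+ * If the block size equals the page size, the old buffer's page is inserted
+ * into the page cache under the new key (the entry for the old key is kept
+ * until the commit step); otherwise a new buffer is created and the data is
+ * copied into it when the change is committed.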
+ */ +int nilfs_btnode_prepare_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh, *nbh; + struct inode *inode = NILFS_BTNC_I(btnc); + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + int err; + + if (oldkey == newkey) + return 0; + + obh = ctxt->bh; + ctxt->newbh = NULL; + + if (inode->i_blkbits == PAGE_CACHE_SHIFT) { + lock_page(obh->b_page); + /* + * We cannot call radix_tree_preload for the kernels older + * than 2.6.23, because it is not exported for modules. + */ +retry: + err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (err) + goto failed_unlock; + /* BUG_ON(oldkey != obh->b_page->index); */ + if (unlikely(oldkey != obh->b_page->index)) + NILFS_PAGE_BUG(obh->b_page, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + + spin_lock_irq(&btnc->tree_lock); + err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); + spin_unlock_irq(&btnc->tree_lock); + /* + * Note: page->index will not change to newkey until + * nilfs_btnode_commit_change_key() will be called. + * To protect the page in intermediate state, the page lock + * is held. + */ + radix_tree_preload_end(); + if (!err) + return 0; + else if (err != -EEXIST) + goto failed_unlock; + + err = invalidate_inode_pages2_range(btnc, newkey, newkey); + if (!err) + goto retry; + /* fallback to copy mode */ + unlock_page(obh->b_page); + } + + nbh = nilfs_btnode_create_block(btnc, newkey); + if (!nbh) + return -ENOMEM; + + BUG_ON(nbh == obh); + ctxt->newbh = nbh; + return 0; + + failed_unlock: + unlock_page(obh->b_page); + return err; +} + +/** + * nilfs_btnode_commit_change_key + * commit the change_key operation prepared by prepare_change_key(). + */ +void nilfs_btnode_commit_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + struct page *opage; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + opage = obh->b_page; + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + mark_buffer_dirty(obh); + + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, oldkey); + radix_tree_tag_set(&btnc->page_tree, newkey, + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&btnc->tree_lock); + + opage->index = obh->b_blocknr = newkey; + unlock_page(opage); + } else { + nilfs_copy_buffer(nbh, obh); + mark_buffer_dirty(nbh); + + nbh->b_blocknr = newkey; + ctxt->bh = nbh; + nilfs_btnode_delete(obh); /* will decrement bh->b_count */ + } +} + +/** + * nilfs_btnode_abort_change_key + * abort the change_key operation prepared by prepare_change_key(). 
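+ * Depending on how the preparation proceeded, this either deletes the page
+ * cache entry inserted under the new key or releases the newly created
+ * buffer.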
+ */ +void nilfs_btnode_abort_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, newkey); + spin_unlock_irq(&btnc->tree_lock); + unlock_page(ctxt->bh->b_page); + } else + brelse(nbh); +} diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h new file mode 100644 index 00000000..3a4dd2d8 --- /dev/null +++ b/fs/nilfs2/btnode.h @@ -0,0 +1,53 @@ +/* + * btnode.h - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara + * Revised by Ryusuke Konishi + */ + +#ifndef _NILFS_BTNODE_H +#define _NILFS_BTNODE_H + +#include +#include +#include +#include + + +struct nilfs_btnode_chkey_ctxt { + __u64 oldkey; + __u64 newkey; + struct buffer_head *bh; + struct buffer_head *newbh; +}; + +void nilfs_btnode_cache_clear(struct address_space *); +struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, + __u64 blocknr); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int, + struct buffer_head **, sector_t *); +void nilfs_btnode_delete(struct buffer_head *); +int nilfs_btnode_prepare_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_commit_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_abort_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); + +#endif /* _NILFS_BTNODE_H */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c new file mode 100644 index 00000000..b2e3ff34 --- /dev/null +++ b/fs/nilfs2/btree.c @@ -0,0 +1,2310 @@ +/* + * btree.c - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#include +#include +#include +#include +#include "nilfs.h" +#include "page.h" +#include "btnode.h" +#include "btree.h" +#include "alloc.h" +#include "dat.h" + +static struct nilfs_btree_path *nilfs_btree_alloc_path(void) +{ + struct nilfs_btree_path *path; + int level = NILFS_BTREE_LEVEL_DATA; + + path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); + if (path == NULL) + goto out; + + for (; level < NILFS_BTREE_LEVEL_MAX; level++) { + path[level].bp_bh = NULL; + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } + +out: + return path; +} + +static void nilfs_btree_free_path(struct nilfs_btree_path *path) +{ + int level = NILFS_BTREE_LEVEL_DATA; + + for (; level < NILFS_BTREE_LEVEL_MAX; level++) + brelse(path[level].bp_bh); + + kmem_cache_free(nilfs_btree_path_cache, path); +} + +/* + * B-tree node operations + */ +static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, + __u64 ptr, struct buffer_head **bhp) +{ + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct buffer_head *bh; + + bh = nilfs_btnode_create_block(btnc, ptr); + if (!bh) + return -ENOMEM; + + set_buffer_nilfs_volatile(bh); + *bhp = bh; + return 0; +} + +static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) +{ + return node->bn_flags; +} + +static void +nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) +{ + node->bn_flags = flags; +} + +static int nilfs_btree_node_root(const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; +} + +static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node) +{ + return node->bn_level; +} + +static void +nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) +{ + node->bn_level = level; +} + +static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) +{ + return le16_to_cpu(node->bn_nchildren); +} + +static void +nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) +{ + node->bn_nchildren = cpu_to_le16(nchildren); +} + +static int nilfs_btree_node_size(const struct nilfs_bmap *btree) +{ + return 1 << btree->b_inode->i_blkbits; +} + +static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree) +{ + return btree->b_nchildren_per_block; +} + +static __le64 * +nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) +{ + return (__le64 *)((char *)(node + 1) + + (nilfs_btree_node_root(node) ? 
+ 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); +} + +static __le64 * +nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax) +{ + return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax); +} + +static __u64 +nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) +{ + return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index)); +} + +static void +nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) +{ + *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key); +} + +static __u64 +nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index, + int ncmax) +{ + return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index)); +} + +static void +nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr, + int ncmax) +{ + *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr); +} + +static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags, + int level, int nchildren, int ncmax, + const __u64 *keys, const __u64 *ptrs) +{ + __le64 *dkeys; + __le64 *dptrs; + int i; + + nilfs_btree_node_set_flags(node, flags); + nilfs_btree_node_set_level(node, level); + nilfs_btree_node_set_nchildren(node, nchildren); + + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + for (i = 0; i < nchildren; i++) { + dkeys[i] = cpu_to_le64(keys[i]); + dptrs[i] = cpu_to_le64(ptrs[i]); + } +} + +/* Assume the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_left(struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n, int lncmax, int rncmax) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); + lnchildren = nilfs_btree_node_get_nchildren(left); + + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); + rnchildren = nilfs_btree_node_get_nchildren(right); + + memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); + memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); + memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys)); + memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs)); + + lnchildren += n; + rnchildren -= n; + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); +} + +/* Assume that the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_right(struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n, int lncmax, int rncmax) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); + lnchildren = nilfs_btree_node_get_nchildren(left); + + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); + rnchildren = nilfs_btree_node_get_nchildren(right); + + memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); + memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); + memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys)); + memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs)); + + lnchildren -= n; + rnchildren += n; + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); +} + +/* Assume that the buffer head corresponding to node is locked. 
*/ +static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index, + __u64 key, __u64 ptr, int ncmax) +{ + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + nchildren = nilfs_btree_node_get_nchildren(node); + if (index < nchildren) { + memmove(dkeys + index + 1, dkeys + index, + (nchildren - index) * sizeof(*dkeys)); + memmove(dptrs + index + 1, dptrs + index, + (nchildren - index) * sizeof(*dptrs)); + } + dkeys[index] = cpu_to_le64(key); + dptrs[index] = cpu_to_le64(ptr); + nchildren++; + nilfs_btree_node_set_nchildren(node, nchildren); +} + +/* Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index, + __u64 *keyp, __u64 *ptrp, int ncmax) +{ + __u64 key; + __u64 ptr; + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + key = le64_to_cpu(dkeys[index]); + ptr = le64_to_cpu(dptrs[index]); + nchildren = nilfs_btree_node_get_nchildren(node); + if (keyp != NULL) + *keyp = key; + if (ptrp != NULL) + *ptrp = ptr; + + if (index < nchildren - 1) { + memmove(dkeys + index, dkeys + index + 1, + (nchildren - index - 1) * sizeof(*dkeys)); + memmove(dptrs + index, dptrs + index + 1, + (nchildren - index - 1) * sizeof(*dptrs)); + } + nchildren--; + nilfs_btree_node_set_nchildren(node, nchildren); +} + +static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, + __u64 key, int *indexp) +{ + __u64 nkey; + int index, low, high, s; + + /* binary search */ + low = 0; + high = nilfs_btree_node_get_nchildren(node) - 1; + index = 0; + s = 0; + while (low <= high) { + index = (low + high) / 2; + nkey = nilfs_btree_node_get_key(node, index); + if (nkey == key) { + s = 0; + goto out; + } else if (nkey < key) { + low = index + 1; + s = -1; + } else { + high = index - 1; + s = 1; + } + } + + /* adjust index */ + if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) { + if (s > 0 && index > 0) + index--; + } else if (s < 0) + index++; + + out: + *indexp = index; + + return s == 0; +} + +/** + * nilfs_btree_node_broken - verify consistency of btree node + * @node: btree node block to be examined + * @size: node size (in bytes) + * @blocknr: block number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. 
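+ *
+ * The level, flags, and child count of the node are range checked; callers
+ * normally use nilfs_btree_broken_node_block(), which records a successful
+ * check in the buffer's "checked" flag so that each block is verified only
+ * once after it is read.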
+ */ +static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, + size_t size, sector_t blocknr) +{ + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX || + (flags & NILFS_BTREE_NODE_ROOT) || + nchildren < 0 || + nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { + printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): " + "level = %d, flags = 0x%x, nchildren = %d\n", + (unsigned long long)blocknr, level, flags, nchildren); + ret = 1; + } + return ret; +} + +int nilfs_btree_broken_node_block(struct buffer_head *bh) +{ + int ret; + + if (buffer_nilfs_checked(bh)) + return 0; + + ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, + bh->b_size, bh->b_blocknr); + if (likely(!ret)) + set_buffer_nilfs_checked(bh); + return ret; +} + +static struct nilfs_btree_node * +nilfs_btree_get_root(const struct nilfs_bmap *btree) +{ + return (struct nilfs_btree_node *)btree->b_u.u_data; +} + +static struct nilfs_btree_node * +nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) +{ + return (struct nilfs_btree_node *)path[level].bp_bh->b_data; +} + +static struct nilfs_btree_node * +nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) +{ + return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; +} + +static int nilfs_btree_height(const struct nilfs_bmap *btree) +{ + return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; +} + +static struct nilfs_btree_node * +nilfs_btree_get_node(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path, + int level, int *ncmaxp) +{ + struct nilfs_btree_node *node; + + if (level == nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_root(btree); + *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX; + } else { + node = nilfs_btree_get_nonroot_node(path, level); + *ncmaxp = nilfs_btree_nchildren_per_block(btree); + } + return node; +} + +static int +nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) +{ + if (unlikely(nilfs_btree_node_get_level(node) != level)) { + dump_stack(); + printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n", + nilfs_btree_node_get_level(node), level); + return 1; + } + return 0; +} + +struct nilfs_btree_readahead_info { + struct nilfs_btree_node *node; /* parent node */ + int max_ra_blocks; /* max nof blocks to read ahead */ + int index; /* current index on the parent node */ + int ncmax; /* nof children in the parent node */ +}; + +static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp, + const struct nilfs_btree_readahead_info *ra) +{ + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct buffer_head *bh, *ra_bh; + sector_t submit_ptr = 0; + int ret; + + ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr); + if (ret) { + if (ret != -EEXIST) + return ret; + goto out_check; + } + + if (ra) { + int i, n; + __u64 ptr2; + + /* read ahead sibling nodes */ + for (n = ra->max_ra_blocks, i = ra->index + 1; + n > 0 && i < ra->ncmax; n--, i++) { + ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); + + ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA, + &ra_bh, &submit_ptr); + if (likely(!ret || ret == -EEXIST)) + brelse(ra_bh); + else if (ret != -EBUSY) + break; + if (!buffer_locked(bh)) + goto out_no_wait; + } + } + 
+ wait_on_buffer(bh); + + out_no_wait: + if (!buffer_uptodate(bh)) { + brelse(bh); + return -EIO; + } + + out_check: + if (nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + brelse(bh); + return -EINVAL; + } + + *bhp = bh; + return 0; +} + +static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp) +{ + return __nilfs_btree_get_block(btree, ptr, bhp, NULL); +} + +static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + __u64 key, __u64 *ptrp, int minlevel, + int readahead) +{ + struct nilfs_btree_node *node; + struct nilfs_btree_readahead_info p, *ra; + __u64 ptr; + int level, index, found, ncmax, ret; + + node = nilfs_btree_get_root(btree); + level = nilfs_btree_node_get_level(node); + if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0) + return -ENOENT; + + found = nilfs_btree_node_lookup(node, key, &index); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + ncmax = nilfs_btree_nchildren_per_block(btree); + + while (--level >= minlevel) { + ra = NULL; + if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) { + p.node = nilfs_btree_get_node(btree, path, level + 1, + &p.ncmax); + p.index = index; + p.max_ra_blocks = 7; + ra = &p; + } + ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh, + ra); + if (ret < 0) + return ret; + + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; + if (!found) + found = nilfs_btree_node_lookup(node, key, &index); + else + index = 0; + if (index < ncmax) { + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); + } else { + WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); + /* insert */ + ptr = NILFS_BMAP_INVALID_PTR; + } + path[level].bp_index = index; + } + if (!found) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int index, level, ncmax, ret; + + node = nilfs_btree_get_root(btree); + index = nilfs_btree_node_get_nchildren(node) - 1; + if (index < 0) + return -ENOENT; + level = nilfs_btree_node_get_level(node); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + path[level].bp_bh = NULL; + path[level].bp_index = index; + ncmax = nilfs_btree_nchildren_per_block(btree); + + for (level--; level > 0; level--) { + ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; + index = nilfs_btree_node_get_nchildren(node) - 1; + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); + path[level].bp_index = index; + } + + if (keyp != NULL) + *keyp = nilfs_btree_node_get_key(node, index); + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_lookup(const struct nilfs_bmap *btree, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_btree_path *path; + int ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0); + + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, + __u64 key, __u64 *ptrp, unsigned maxblocks) +{ + struct 
nilfs_btree_path *path; + struct nilfs_btree_node *node; + struct inode *dat = NULL; + __u64 ptr, ptr2; + sector_t blocknr; + int level = NILFS_BTREE_LEVEL_NODE_MIN; + int ret, cnt, index, maxlevel, ncmax; + struct nilfs_btree_readahead_info p; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1); + if (ret < 0) + goto out; + + if (NILFS_BMAP_USE_VBN(btree)) { + dat = nilfs_bmap_get_dat(btree); + ret = nilfs_dat_translate(dat, ptr, &blocknr); + if (ret < 0) + goto out; + ptr = blocknr; + } + cnt = 1; + if (cnt == maxblocks) + goto end; + + maxlevel = nilfs_btree_height(btree) - 1; + node = nilfs_btree_get_node(btree, path, level, &ncmax); + index = path[level].bp_index + 1; + for (;;) { + while (index < nilfs_btree_node_get_nchildren(node)) { + if (nilfs_btree_node_get_key(node, index) != + key + cnt) + goto end; + ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax); + if (dat) { + ret = nilfs_dat_translate(dat, ptr2, &blocknr); + if (ret < 0) + goto out; + ptr2 = blocknr; + } + if (ptr2 != ptr + cnt || ++cnt == maxblocks) + goto end; + index++; + continue; + } + if (level == maxlevel) + break; + + /* look-up right sibling node */ + p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax); + p.index = path[level + 1].bp_index + 1; + p.max_ra_blocks = 7; + if (p.index >= nilfs_btree_node_get_nchildren(p.node) || + nilfs_btree_node_get_key(p.node, p.index) != key + cnt) + break; + ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax); + path[level + 1].bp_index = p.index; + + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + + ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh, + &p); + if (ret < 0) + goto out; + node = nilfs_btree_get_nonroot_node(path, level); + ncmax = nilfs_btree_nchildren_per_block(btree); + index = 0; + path[level].bp_index = index; + } + end: + *ptrp = ptr; + ret = cnt; + out: + nilfs_btree_free_path(path); + return ret; +} + +static void nilfs_btree_promote_key(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 key) +{ + if (level < nilfs_btree_height(btree) - 1) { + do { + nilfs_btree_node_set_key( + nilfs_btree_get_nonroot_node(path, level), + path[level].bp_index, key); + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + } while ((path[level].bp_index == 0) && + (++level < nilfs_btree_height(btree) - 1)); + } + + /* root */ + if (level == nilfs_btree_height(btree) - 1) { + nilfs_btree_node_set_key(nilfs_btree_get_root(btree), + path[level].bp_index, key); + } +} + +static void nilfs_btree_do_insert(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + int ncblk; + + if (level < nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, ncblk); + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, + 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + } +} + +static void nilfs_btree_carry_left(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node 
*node, *left; + int nchildren, lnchildren, n, move, ncblk; + + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); + move = 0; + + n = (nchildren + lnchildren + 1) / 2 - lnchildren; + if (n > path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, 0)); + + if (move) { + brelse(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += lnchildren; + path[level + 1].bp_index--; + } else { + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index -= n; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_carry_right(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, move, ncblk; + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); + move = 0; + + n = (nchildren + rnchildren + 1) / 2 - rnchildren; + if (n > nchildren - path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(right, 0)); + path[level + 1].bp_index--; + + if (move) { + brelse(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); + path[level + 1].bp_index++; + } else { + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_split(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + __u64 newkey; + __u64 newptr; + int nchildren, n, move, ncblk; + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + ncblk = nilfs_btree_nchildren_per_block(btree); + move = 0; + + n = (nchildren + 1) / 2; + if (n > nchildren - path[level].bp_index) { + n--; + move = 1; + } + + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + newkey = nilfs_btree_node_get_key(right, 0); + newptr = path[level].bp_newreq.bpr_ptr; + + if (move) { + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); + nilfs_btree_node_insert(right, path[level].bp_index, + 
*keyp, *ptrp, ncblk); + + *keyp = nilfs_btree_node_get_key(right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + brelse(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + } else { + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + path[level + 1].bp_index++; +} + +static void nilfs_btree_grow(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n, ncblk; + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = nilfs_btree_node_get_nchildren(root); + + nilfs_btree_node_move_right(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); + nilfs_btree_node_set_level(root, level + 1); + + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(child, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; +} + +static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path) +{ + struct nilfs_btree_node *node; + int level, ncmax; + + if (path == NULL) + return NILFS_BMAP_INVALID_PTR; + + /* left sibling */ + level = NILFS_BTREE_LEVEL_NODE_MIN; + if (path[level].bp_index > 0) { + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, + path[level].bp_index - 1, + ncmax); + } + + /* parent */ + level = NILFS_BTREE_LEVEL_NODE_MIN + 1; + if (level <= nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, path[level].bp_index, + ncmax); + } + + return NILFS_BMAP_INVALID_PTR; +} + +static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path, + __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(btree, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else { + ptr = nilfs_btree_find_near(btree, path); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* near */ + return ptr; + } + /* block group */ + return nilfs_bmap_find_target_in_group(btree); +} + +static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int *levelp, __u64 key, __u64 ptr, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ncmax, ncblk, ret; + struct inode *dat = NULL; + + stats->bs_nblocks = 0; + level = NILFS_BTREE_LEVEL_DATA; + + /* allocate a new ptr for data block */ + if (NILFS_BMAP_USE_VBN(btree)) { + path[level].bp_newreq.bpr_ptr = + nilfs_btree_find_target_v(btree, path, key); + dat = nilfs_bmap_get_dat(btree); + } + + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); + if (ret < 0) + goto err_out_data; + + ncblk = nilfs_btree_nchildren_per_block(btree); + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_node_get_nchildren(node) < ncblk) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + 
} + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + pindex = path[level + 1].bp_index; + + /* left sibling */ + if (pindex > 0) { + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_left; + stats->bs_nblocks++; + goto out; + } else { + brelse(bh); + } + } + + /* right sibling */ + if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) { + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_right; + stats->bs_nblocks++; + goto out; + } else { + brelse(bh); + } + } + + /* split */ + path[level].bp_newreq.bpr_ptr = + path[level - 1].bp_newreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_alloc_ptr(btree, + &path[level].bp_newreq, dat); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_btree_get_new_block(btree, + path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + stats->bs_nblocks++; + + sib = (struct nilfs_btree_node *)bh->b_data; + nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_split; + } + + /* root */ + node = nilfs_btree_get_root(btree); + if (nilfs_btree_node_get_nchildren(node) < + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + /* grow */ + path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data, + 0, level, 0, ncblk, NULL, NULL); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_grow; + + level++; + path[level].bp_op = nilfs_btree_do_insert; + + /* a newly-created node block and a data block are added */ + stats->bs_nblocks += 2; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); + err_out_child_node: + for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { + nilfs_btnode_delete(path[level].bp_sib_bh); + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); + + } + + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); + err_out_data: + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_insert(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int maxlevel, __u64 key, __u64 ptr) +{ + struct inode *dat = NULL; + int level; + + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; + if (NILFS_BMAP_USE_VBN(btree)) { + nilfs_bmap_set_target_v(btree, key, ptr); + dat = nilfs_bmap_get_dat(btree); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + nilfs_bmap_commit_alloc_ptr(btree, + &path[level - 1].bp_newreq, dat); + 
path[level].bp_op(btree, path, level, &key, &ptr); + } + + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); +} + +static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr) +{ + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN, 0); + if (ret != -ENOENT) { + if (ret == 0) + ret = -EEXIST; + goto out; + } + + ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_insert(btree, path, level, key, ptr); + nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks); + + out: + nilfs_btree_free_path(path); + return ret; +} + +static void nilfs_btree_do_delete(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + int ncblk; + + if (level < nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, ncblk); + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + } +} + +static void nilfs_btree_borrow_left(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = (nchildren + lnchildren) / 2 - nchildren; + + nilfs_btree_node_move_right(left, node, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(node, 0)); + + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index += n; +} + +static void nilfs_btree_borrow_right(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = (nchildren + rnchildren) / 2 - nchildren; + + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(right, 0)); + path[level + 
1].bp_index--; + + brelse(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; +} + +static void nilfs_btree_concat_left(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = nilfs_btree_node_get_nchildren(node); + + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_sib_bh)) + mark_buffer_dirty(path[level].bp_sib_bh); + + nilfs_btnode_delete(path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += nilfs_btree_node_get_nchildren(left); +} + +static void nilfs_btree_concat_right(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + n = nilfs_btree_node_get_nchildren(right); + + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); + + if (!buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + + nilfs_btnode_delete(path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level + 1].bp_index++; +} + +static void nilfs_btree_shrink(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n, ncblk; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); + + nilfs_btree_node_delete(root, 0, NULL, NULL, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + nilfs_btree_node_set_level(root, level); + n = nilfs_btree_node_get_nchildren(child); + nilfs_btree_node_move_left(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); + + nilfs_btnode_delete(path[level].bp_bh); + path[level].bp_bh = NULL; +} + +static void nilfs_btree_nop(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ +} + +static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int *levelp, + struct nilfs_bmap_stats *stats, + struct inode *dat) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, dindex, level, ncmin, ncmax, ncblk, ret; + + ret = 0; + stats->bs_nblocks = 0; + ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); + ncblk = nilfs_btree_nchildren_per_block(btree); + + for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(path, level); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(node, dindex, ncblk); + ret = nilfs_bmap_prepare_end_ptr(btree, + &path[level].bp_oldreq, dat); + if (ret < 0) + goto err_out_child_node; + + if (nilfs_btree_node_get_nchildren(node) > ncmin) { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + pindex = path[level + 
1].bp_index; + dindex = pindex; + + if (pindex > 0) { + /* left sibling */ + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_left; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_left; + stats->bs_nblocks++; + /* continue; */ + } + } else if (pindex < + nilfs_btree_node_get_nchildren(parent) - 1) { + /* right sibling */ + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); + ret = nilfs_btree_get_block(btree, sibptr, &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_right; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_right; + stats->bs_nblocks++; + /* + * When merging right sibling node + * into the current node, pointer to + * the right sibling node must be + * terminated instead. The adjustment + * below is required for that. + */ + dindex = pindex + 1; + /* continue; */ + } + } else { + /* no siblings */ + /* the only child of the root node */ + WARN_ON(level != nilfs_btree_height(btree) - 2); + if (nilfs_btree_node_get_nchildren(node) - 1 <= + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_shrink; + stats->bs_nblocks += 2; + level++; + path[level].bp_op = nilfs_btree_nop; + goto shrink_root_child; + } else { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + } + } + + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + +shrink_root_child: + node = nilfs_btree_get_root(btree); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(node, dindex, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + + ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); + if (ret < 0) + goto err_out_child_node; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); + err_out_child_node: + for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { + brelse(path[level].bp_sib_bh); + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); + } + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_delete(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int maxlevel, struct inode *dat) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat); + path[level].bp_op(btree, path, level, NULL, NULL); + } + + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); +} + +static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key) + +{ + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + struct inode *dat; + int level, ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN, 0); + if (ret < 0) + goto out; + + + dat = NILFS_BMAP_USE_VBN(btree) ? 
nilfs_bmap_get_dat(btree) : NULL; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); + if (ret < 0) + goto out; + nilfs_btree_commit_delete(btree, path, level, dat); + nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks); + +out: + nilfs_btree_free_path(path); + return ret; +} + +static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp) +{ + struct nilfs_btree_path *path; + int ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); + + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) +{ + struct buffer_head *bh; + struct nilfs_btree_node *root, *node; + __u64 maxkey, nextmaxkey; + __u64 ptr; + int nchildren, ret; + + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(root); + if (nchildren > 1) + return 0; + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + ret = nilfs_btree_get_block(btree, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + return 0; + } + + nchildren = nilfs_btree_node_get_nchildren(node); + maxkey = nilfs_btree_node_get_key(node, nchildren - 1); + nextmaxkey = (nchildren > 1) ? + nilfs_btree_node_get_key(node, nchildren - 2) : 0; + if (bh != NULL) + brelse(bh); + + return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); +} + +static int nilfs_btree_gather_data(struct nilfs_bmap *btree, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *root; + __le64 *dkeys; + __le64 *dptrs; + __u64 ptr; + int nchildren, ncmax, i, ret; + + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(root); + WARN_ON(nchildren > 1); + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + ret = nilfs_btree_get_block(btree, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + ncmax = nilfs_btree_nchildren_per_block(btree); + break; + default: + node = NULL; + return -EINVAL; + } + + nchildren = nilfs_btree_node_get_nchildren(node); + if (nchildren < nitems) + nitems = nchildren; + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + for (i = 0; i < nitems; i++) { + keys[i] = le64_to_cpu(dkeys[i]); + ptrs[i] = le64_to_cpu(dptrs[i]); + } + + if (bh != NULL) + brelse(bh); + + return nitems; +} + +static int +nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head **bhp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct inode *dat = NULL; + int ret; + + stats->bs_nblocks = 0; + + /* for data */ + /* cannot find near ptr */ + if (NILFS_BMAP_USE_VBN(btree)) { + dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); + dat = nilfs_bmap_get_dat(btree); + } + + ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat); + if (ret < 0) + return ret; + + *bhp = NULL; + stats->bs_nblocks++; + if (nreq != NULL) { + nreq->bpr_ptr = dreq->bpr_ptr + 1; + ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat); + if (ret < 0) + goto 
err_out_dreq; + + ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh); + if (ret < 0) + goto err_out_nreq; + + *bhp = bh; + stats->bs_nblocks++; + } + + /* success */ + return 0; + + /* error */ + err_out_nreq: + nilfs_bmap_abort_alloc_ptr(btree, nreq, dat); + err_out_dreq: + nilfs_bmap_abort_alloc_ptr(btree, dreq, dat); + stats->bs_nblocks = 0; + return ret; + +} + +static void +nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head *bh) +{ + struct nilfs_btree_node *node; + struct inode *dat; + __u64 tmpptr; + int ncblk; + + /* free resources */ + if (btree->b_ops->bop_clear != NULL) + btree->b_ops->bop_clear(btree); + + /* ptr must be a pointer to a buffer head. */ + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + + /* convert and insert */ + dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; + nilfs_btree_init(btree); + if (nreq != NULL) { + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); + nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); + + /* create child node at level 1 */ + node = (struct nilfs_btree_node *)bh->b_data; + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk); + if (!buffer_dirty(bh)) + mark_buffer_dirty(bh); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); + + brelse(bh); + + /* create root node at level 2 */ + node = nilfs_btree_get_root(btree); + tmpptr = nreq->bpr_ptr; + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + &keys[0], &tmpptr); + } else { + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); + + /* create root node at level 1 */ + node = nilfs_btree_get_root(btree); + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); + } + + if (NILFS_BMAP_USE_VBN(btree)) + nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr); +} + +/** + * nilfs_btree_convert_and_insert - + * @bmap: + * @key: + * @ptr: + * @keys: + * @ptrs: + * @n: + */ +int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, int n) +{ + struct buffer_head *bh; + union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; + struct nilfs_bmap_stats stats; + int ret; + + if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { + di = &dreq; + ni = NULL; + } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( + 1 << btree->b_inode->i_blkbits)) { + di = &dreq; + ni = &nreq; + } else { + di = NULL; + ni = NULL; + BUG(); + } + + ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh, + &stats); + if (ret < 0) + return ret; + nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, + di, ni, bh); + nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks); + return 0; +} + +static int nilfs_btree_propagate_p(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) + mark_buffer_dirty(path[level].bp_bh); + + return 0; +} + +static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + 
int level, struct inode *dat) +{ + struct nilfs_btree_node *parent; + int ncmax, ret; + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); + path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; + ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(path[level].bp_bh)) { + path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr; + path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; + path[level].bp_ctxt.bh = path[level].bp_bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) { + nilfs_dat_abort_update(dat, + &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); + return ret; + } + } + + return 0; +} + +static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct inode *dat) +{ + struct nilfs_btree_node *parent; + int ncmax; + + nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req, + btree->b_ptr_type == NILFS_BMAP_PTR_VS); + + if (buffer_nilfs_node(path[level].bp_bh)) { + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + path[level].bp_bh = path[level].bp_ctxt.bh; + } + set_buffer_nilfs_volatile(path[level].bp_bh); + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, + path[level].bp_newreq.bpr_ptr, ncmax); +} + +static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct inode *dat) +{ + nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); + if (buffer_nilfs_node(path[level].bp_bh)) + nilfs_btnode_abort_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); +} + +static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int minlevel, int *maxlevelp, + struct inode *dat) +{ + int level, ret; + + level = minlevel; + if (!buffer_nilfs_volatile(path[level].bp_bh)) { + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); + if (ret < 0) + return ret; + } + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) { + + WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); + if (ret < 0) + goto out; + } + + /* success */ + *maxlevelp = level - 1; + return 0; + + /* error */ + out: + while (--level > minlevel) + nilfs_btree_abort_update_v(btree, path, level, dat); + if (!buffer_nilfs_volatile(path[level].bp_bh)) + nilfs_btree_abort_update_v(btree, path, level, dat); + return ret; +} + +static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int minlevel, int maxlevel, + struct buffer_head *bh, + struct inode *dat) +{ + int level; + + if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) + nilfs_btree_commit_update_v(btree, path, minlevel, dat); + + for (level = minlevel + 1; level <= maxlevel; level++) + nilfs_btree_commit_update_v(btree, path, level, dat); +} + +static int nilfs_btree_propagate_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, struct buffer_head *bh) +{ + int maxlevel = 0, ret; + struct nilfs_btree_node 
*parent; + struct inode *dat = nilfs_bmap_get_dat(btree); + __u64 ptr; + int ncmax; + + get_bh(bh); + path[level].bp_bh = bh; + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel, + dat); + if (ret < 0) + goto out; + + if (buffer_nilfs_volatile(path[level].bp_bh)) { + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, + path[level + 1].bp_index, + ncmax); + ret = nilfs_dat_mark_dirty(dat, ptr); + if (ret < 0) + goto out; + } + + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat); + + out: + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + return ret; +} + +static int nilfs_btree_propagate(struct nilfs_bmap *btree, + struct buffer_head *bh) +{ + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + WARN_ON(!buffer_dirty(bh)); + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + if (buffer_nilfs_node(bh)) { + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); + } else { + key = nilfs_bmap_data_get_key(btree, bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); + if (ret < 0) { + if (unlikely(ret == -ENOENT)) + printk(KERN_CRIT "%s: key = %llu, level == %d\n", + __func__, (unsigned long long)key, level); + goto out; + } + + ret = NILFS_BMAP_USE_VBN(btree) ? + nilfs_btree_propagate_v(btree, path, level, bh) : + nilfs_btree_propagate_p(btree, path, level, bh); + + out: + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree, + struct buffer_head *bh) +{ + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr); +} + +static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, + struct list_head *lists, + struct buffer_head *bh) +{ + struct list_head *head; + struct buffer_head *cbh; + struct nilfs_btree_node *node, *cnode; + __u64 key, ckey; + int level; + + get_bh(bh); + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); + if (level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX) { + dump_stack(); + printk(KERN_WARNING + "%s: invalid btree level: %d (key=%llu, ino=%lu, " + "blocknr=%llu)\n", + __func__, level, (unsigned long long)key, + NILFS_BMAP_I(btree)->vfs_inode.i_ino, + (unsigned long long)bh->b_blocknr); + return; + } + + list_for_each(head, &lists[level]) { + cbh = list_entry(head, struct buffer_head, b_assoc_buffers); + cnode = (struct nilfs_btree_node *)cbh->b_data; + ckey = nilfs_btree_node_get_key(cnode, 0); + if (key < ckey) + break; + } + list_add_tail(&bh->b_assoc_buffers, head); +} + +static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, + struct list_head *listp) +{ + struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct list_head lists[NILFS_BTREE_LEVEL_MAX]; + struct pagevec pvec; + struct buffer_head *bh, *head; + pgoff_t index = 0; + int level, i; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + INIT_LIST_HEAD(&lists[level]); + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) + nilfs_btree_add_dirty_buffer(btree, + lists, 
bh); + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + cond_resched(); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + list_splice_tail(&lists[level], listp); +} + +static int nilfs_btree_assign_p(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + int ncmax, ret; + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); + if (buffer_nilfs_node(*bh)) { + path[level].bp_ctxt.oldkey = ptr; + path[level].bp_ctxt.newkey = blocknr; + path[level].bp_ctxt.bh = *bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) + return ret; + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(btree)->i_btnode_cache, + &path[level].bp_ctxt); + *bh = path[level].bp_ctxt.bh; + } + + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr, + ncmax); + + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); + binfo->bi_dat.bi_level = level; + + return 0; +} + +static int nilfs_btree_assign_v(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(btree); + __u64 key; + __u64 ptr; + union nilfs_bmap_ptr_req req; + int ncmax, ret; + + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); + req.bpr_ptr = ptr; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (ret < 0) + return ret; + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); + + return 0; +} + +static int nilfs_btree_assign(struct nilfs_bmap *btree, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); + } else { + key = nilfs_bmap_data_get_key(btree, *bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + ret = NILFS_BMAP_USE_VBN(btree) ? 
+ nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : + nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); + + out: + nilfs_btree_free_path(path); + + return ret; +} + +static int nilfs_btree_assign_gc(struct nilfs_bmap *btree, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *node; + __u64 key; + int ret; + + ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr, + blocknr); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(node, 0); + } else + key = nilfs_bmap_data_get_key(btree, *bh); + + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); + + return 0; +} + +static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level) +{ + struct buffer_head *bh; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + path = nilfs_btree_alloc_path(); + if (path == NULL) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + ret = nilfs_btree_get_block(btree, ptr, &bh); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + if (!buffer_dirty(bh)) + mark_buffer_dirty(bh); + brelse(bh); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); + + out: + nilfs_btree_free_path(path); + return ret; +} + +static const struct nilfs_bmap_operations nilfs_btree_ops = { + .bop_lookup = nilfs_btree_lookup, + .bop_lookup_contig = nilfs_btree_lookup_contig, + .bop_insert = nilfs_btree_insert, + .bop_delete = nilfs_btree_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign, + .bop_mark = nilfs_btree_mark, + + .bop_last_key = nilfs_btree_last_key, + .bop_check_insert = NULL, + .bop_check_delete = nilfs_btree_check_delete, + .bop_gather_data = nilfs_btree_gather_data, +}; + +static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { + .bop_lookup = NULL, + .bop_lookup_contig = NULL, + .bop_insert = NULL, + .bop_delete = NULL, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate_gc, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign_gc, + .bop_mark = NULL, + + .bop_last_key = NULL, + .bop_check_insert = NULL, + .bop_check_delete = NULL, + .bop_gather_data = NULL, +}; + +int nilfs_btree_init(struct nilfs_bmap *bmap) +{ + bmap->b_ops = &nilfs_btree_ops; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); + return 0; +} + +void nilfs_btree_init_gc(struct nilfs_bmap *bmap) +{ + bmap->b_ops = &nilfs_btree_ops_gc; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); +} diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h new file mode 100644 index 00000000..22c02e35 --- /dev/null +++ b/fs/nilfs2/btree.h @@ -0,0 +1,77 @@ +/* + * btree.h - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_BTREE_H +#define _NILFS_BTREE_H + +#include +#include +#include +#include +#include "btnode.h" +#include "bmap.h" + +/** + * struct nilfs_btree_path - A path on which B-tree operations are executed + * @bp_bh: buffer head of node block + * @bp_sib_bh: buffer head of sibling node block + * @bp_index: index of child node + * @bp_oldreq: ptr end request for old ptr + * @bp_newreq: ptr alloc request for new ptr + * @bp_op: rebalance operation + */ +struct nilfs_btree_path { + struct buffer_head *bp_bh; + struct buffer_head *bp_sib_bh; + int bp_index; + union nilfs_bmap_ptr_req bp_oldreq; + union nilfs_bmap_ptr_req bp_newreq; + struct nilfs_btnode_chkey_ctxt bp_ctxt; + void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *, + int, __u64 *, __u64 *); +}; + +#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE +#define NILFS_BTREE_ROOT_NCHILDREN_MAX \ + ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0 +#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64)) +#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \ + (((nodesize) - sizeof(struct nilfs_btree_node) - \ + NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \ + ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1) +#define NILFS_BTREE_KEY_MIN ((__u64)0) +#define NILFS_BTREE_KEY_MAX (~(__u64)0) + +extern struct kmem_cache *nilfs_btree_path_cache; + +int nilfs_btree_init(struct nilfs_bmap *); +int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, + const __u64 *, const __u64 *, int); +void nilfs_btree_init_gc(struct nilfs_bmap *); + +int nilfs_btree_broken_node_block(struct buffer_head *bh); + +#endif /* _NILFS_BTREE_H */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c new file mode 100644 index 00000000..c9b342c8 --- /dev/null +++ b/fs/nilfs2/cpfile.c @@ -0,0 +1,965 @@ +/* + * cpfile.c - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#include +#include +#include +#include +#include +#include +#include "mdt.h" +#include "cpfile.h" + + +static inline unsigned long +nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile) +{ + return NILFS_MDT(cpfile)->mi_entries_per_block; +} + +/* block number from the beginning of the file */ +static unsigned long +nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); + return (unsigned long)tcno; +} + +/* offset in block */ +static unsigned long +nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); +} + +static unsigned long +nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, + __u64 curr, + __u64 max) +{ + return min_t(__u64, + nilfs_cpfile_checkpoints_per_block(cpfile) - + nilfs_cpfile_get_offset(cpfile, curr), + max - curr); +} + +static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, + __u64 cno) +{ + return nilfs_cpfile_get_blkoff(cpfile, cno) == 0; +} + +static unsigned int +nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + count = le32_to_cpu(cp->cp_checkpoints_count) + n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static unsigned int +nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); + count = le32_to_cpu(cp->cp_checkpoints_count) - n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static inline struct nilfs_cpfile_header * +nilfs_cpfile_block_get_header(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_checkpoint * +nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * + NILFS_MDT(cpfile)->mi_entry_size; +} + +static void nilfs_cpfile_block_init(struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + int n = nilfs_cpfile_checkpoints_per_block(cpfile); + + while (n-- > 0) { + nilfs_checkpoint_set_invalid(cp); + cp = (void *)cp + cpsz; + } +} + +static inline int nilfs_cpfile_get_header_block(struct inode *cpfile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp); +} + +static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, + __u64 cno, + int create, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno), + create, nilfs_cpfile_block_init, bhp); +} + +static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, + __u64 cno) +{ + return nilfs_mdt_delete_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno)); +} + +/** + * nilfs_cpfile_get_checkpoint - get a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @create: create flag + * @cpp: pointer to a checkpoint + * 
@bhp: pointer to a buffer head + * + * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint + * specified by @cno. A new checkpoint will be created if @cno is the current + * checkpoint number and @create is nonzero. + * + * Return Value: On success, 0 is returned, and the checkpoint and the + * buffer head of the buffer on which the checkpoint is located are stored in + * the place pointed by @cpp and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + * + * %-EINVAL - invalid checkpoint. + */ +int nilfs_cpfile_get_checkpoint(struct inode *cpfile, + __u64 cno, + int create, + struct nilfs_checkpoint **cpp, + struct buffer_head **bhp) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || + (cno < nilfs_mdt_cno(cpfile) && create))) + return -EINVAL; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); + if (ret < 0) + goto out_header; + kaddr = kmap(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + if (!create) { + kunmap(cp_bh->b_page); + brelse(cp_bh); + ret = -ENOENT; + goto out_header; + } + /* a newly-created checkpoint */ + nilfs_checkpoint_clear_invalid(cp); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) + nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, + kaddr, 1); + mark_buffer_dirty(cp_bh); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, 1); + kunmap_atomic(kaddr, KM_USER0); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + } + + if (cpp != NULL) + *cpp = cp; + *bhp = cp_bh; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_put_checkpoint - put a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @bh: buffer head + * + * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint + * specified by @cno. @bh must be the buffer head which has been returned by + * a previous call to nilfs_cpfile_get_checkpoint() with @cno. + */ +void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_cpfile_delete_checkpoints - delete checkpoints + * @cpfile: inode of checkpoint file + * @start: start checkpoint number + * @end: end checkpoint numer + * + * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in + * the period from @start to @end, excluding @end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. 
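+ *
+ * A minimal usage sketch (hypothetical caller; @cpfile is assumed to be the
+ * checkpoint file inode of a mounted filesystem):
+ *
+ *	err = nilfs_cpfile_delete_checkpoints(cpfile, 10, 13);
+ *
+ * On success this removes checkpoints 10, 11 and 12; checkpoint 13 itself is
+ * kept because @end is exclusive.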
+ */ +int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, + __u64 start, + __u64 end) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cno; + void *kaddr; + unsigned long tnicps; + int ret, ncps, nicps, count, i; + + if (unlikely(start == 0 || start > end)) { + printk(KERN_ERR "%s: invalid range of checkpoint numbers: " + "[%llu, %llu)\n", __func__, + (unsigned long long)start, (unsigned long long)end); + return -EINVAL; + } + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + tnicps = 0; + + for (cno = start; cno < end; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) { + if (ret != -ENOENT) + break; + /* skip hole */ + ret = 0; + continue; + } + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, cno, cp_bh, kaddr); + nicps = 0; + for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { + WARN_ON(nilfs_checkpoint_snapshot(cp)); + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_checkpoint_set_invalid(cp); + nicps++; + } + } + if (nicps > 0) { + tnicps += nicps; + mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) { + count = + nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps); + if (count == 0) { + /* make hole */ + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + ret = + nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR + "%s: cannot delete block\n", + __func__); + break; + } + } + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + } + + if (tnicps > 0) { + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + kunmap_atomic(kaddr, KM_USER0); + } + + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, + struct nilfs_checkpoint *cp, + struct nilfs_cpinfo *ci) +{ + ci->ci_flags = le32_to_cpu(cp->cp_flags); + ci->ci_cno = le64_to_cpu(cp->cp_cno); + ci->ci_create = le64_to_cpu(cp->cp_create); + ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc); + ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count); + ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count); + ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); +} + +static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, + void *buf, unsigned cisz, size_t nci) +{ + struct nilfs_checkpoint *cp; + struct nilfs_cpinfo *ci = buf; + struct buffer_head *bh; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; + void *kaddr; + int n, ret; + int ncps, i; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + for (n = 0; cno < cur_cno && n < nci; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + continue; /* skip hole */ + } + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = 
nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, + ci); + ci = (void *)ci + cisz; + n++; + } + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + } + + ret = n; + if (n > 0) { + ci = (void *)ci - cisz; + *cnop = ci->ci_cno + 1; + } + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, + void *buf, unsigned cisz, size_t nci) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_cpinfo *ci = buf; + __u64 curr = *cnop, next; + unsigned long curr_blkoff, next_blkoff; + void *kaddr; + int n = 0, ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + if (curr == 0) { + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + if (curr == 0) { + ret = 0; + goto out; + } + } else if (unlikely(curr == ~(__u64)0)) { + ret = 0; + goto out; + } + + curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = 0; /* No snapshots (started from a hole block) */ + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + while (n < nci) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); + curr = ~(__u64)0; /* Terminator */ + if (unlikely(nilfs_checkpoint_invalid(cp) || + !nilfs_checkpoint_snapshot(cp))) + break; + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci); + ci = (void *)ci + cisz; + n++; + next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); + if (next == 0) + break; /* reach end of the snapshot list */ + + next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); + if (curr_blkoff != next_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, + 0, &bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -ENOENT); + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + } + curr = next; + curr_blkoff = next_blkoff; + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + *cnop = curr; + ret = n; + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_get_cpinfo - get information on checkpoints + * @cpfile: inode of checkpoint file + * @cnop: place to pass in the next checkpoint number and to receive the one to continue from + * @mode: %NILFS_CHECKPOINT or %NILFS_SNAPSHOT + * @buf: buffer in which checkpoint information is stored + * @cisz: byte size of one checkpoint info item + * @nci: maximum number of checkpoint info items to get + */ +ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, + void *buf, unsigned cisz, size_t nci) +{ + switch (mode) { + case NILFS_CHECKPOINT: + return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci); + case NILFS_SNAPSHOT: + return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_delete_checkpoint - delete a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number to delete + */ +int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) +{ + struct nilfs_cpinfo ci; + __u64 tcno = cno; + ssize_t nci; + + nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1); + if (nci < 0) + return nci; + else if (nci == 0 || ci.ci_cno != cno) + return -ENOENT; + else if (nilfs_cpinfo_snapshot(&ci)) + return -EBUSY; + + return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); +} + +static struct nilfs_snapshot_list * +nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, + __u64
cno, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + + if (cno != 0) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + list = &cp->cp_snapshot_list; + } else { + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + list = &header->ch_snapshot_list; + } + return list; +} + +static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 curr, prev; + unsigned long curr_blkoff, prev_blkoff; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + list = &header->ch_snapshot_list; + curr_bh = header_bh; + get_bh(curr_bh); + curr = 0; + curr_blkoff = 0; + prev = le64_to_cpu(list->ssl_prev); + while (prev > cno) { + prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); + curr = prev; + if (curr_blkoff != prev_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(curr_bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, + 0, &curr_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + } + curr_blkoff = prev_blkoff; + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, curr, curr_bh, kaddr); + list = &cp->cp_snapshot_list; + prev = le64_to_cpu(list->ssl_prev); + } + kunmap_atomic(kaddr, KM_USER0); + + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_curr; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, curr, curr_bh, kaddr); + list->ssl_prev = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); + nilfs_checkpoint_set_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, 1); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(curr_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_curr: + brelse(curr_bh); + + out_header: + brelse(header_bh); + + 
out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 next, prev; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (!nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + + list = &cp->cp_snapshot_list; + next = le64_to_cpu(list->ssl_next); + prev = le64_to_cpu(list->ssl_prev); + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + if (next != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, + &next_bh); + if (ret < 0) + goto out_header; + } else { + next_bh = header_bh; + get_bh(next_bh); + } + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_next; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(next_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, next, next_bh, kaddr); + list->ssl_prev = cpu_to_le64(prev); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(next); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); + nilfs_checkpoint_clear_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, -1); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(next_bh); + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_next: + brelse(next_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_is_snapshot - + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * + * Description: + * + * Return Value: On success, 1 is returned if the checkpoint specified by + * @cno is a snapshot, or 0 if not. On error, one of the following negative + * error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. 
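+ *
+ * A minimal calling sketch (hypothetical caller; note the three-way return):
+ *
+ *	ret = nilfs_cpfile_is_snapshot(cpfile, cno);
+ *	if (ret < 0)
+ *		return ret;
+ *	is_snapshot = (ret > 0);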
+ */ +int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + /* CP number is invalid if it's zero or larger than the + largest existing one. */ + if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) + return -ENOENT; + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) + ret = -ENOENT; + else + ret = nilfs_checkpoint_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_change_cpmode - change checkpoint mode + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @mode: new mode of checkpoint + * + * Description: nilfs_cpfile_change_cpmode() changes the mode of the checkpoint + * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) +{ + int ret; + + switch (mode) { + case NILFS_CHECKPOINT: + if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno)) + /* + * Current implementation does not have to protect + * plain read-only mounts since they are exclusive + * with a read/write mount and are protected from the + * cleaner. + */ + ret = -EBUSY; + else + ret = nilfs_cpfile_clear_snapshot(cpfile, cno); + return ret; + case NILFS_SNAPSHOT: + return nilfs_cpfile_set_snapshot(cpfile, cno); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_get_stat - get checkpoint statistics + * @cpfile: inode of checkpoint file + * @cpstat: pointer to a structure of checkpoint statistics + * + * Description: nilfs_cpfile_get_stat() returns information about checkpoints. + * + * Return Value: On success, 0 is returned, and checkpoint information is + * stored in the place pointed to by @cpstat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available.
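+ *
+ * A minimal usage sketch (hypothetical caller):
+ *
+ *	struct nilfs_cpstat cpstat;
+ *
+ *	err = nilfs_cpfile_get_stat(cpfile, &cpstat);
+ *	if (!err)
+ *		ncheckpoints = cpstat.cs_ncps;
+ *
+ * Here cs_cno is the current checkpoint number, cs_ncps the number of
+ * checkpoints, and cs_nsss the number of snapshots.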
+ */ +int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + cpstat->cs_cno = nilfs_mdt_cno(cpfile); + cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); + cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_read - read or get cpfile inode + * @sb: super block instance + * @cpsize: size of a checkpoint entry + * @raw_inode: on-disk cpfile inode + * @inodep: buffer to store the inode + */ +int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, + struct nilfs_inode *raw_inode, struct inode **inodep) +{ + struct inode *cpfile; + int err; + + cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); + if (unlikely(!cpfile)) + return -ENOMEM; + if (!(cpfile->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0); + if (err) + goto failed; + + nilfs_mdt_set_entry_size(cpfile, cpsize, + sizeof(struct nilfs_cpfile_header)); + + err = nilfs_read_inode_common(cpfile, raw_inode); + if (err) + goto failed; + + unlock_new_inode(cpfile); + out: + *inodep = cpfile; + return 0; + failed: + iget_failed(cpfile); + return err; +} diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h new file mode 100644 index 00000000..a242b9a3 --- /dev/null +++ b/fs/nilfs2/cpfile.h @@ -0,0 +1,46 @@ +/* + * cpfile.h - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#ifndef _NILFS_CPFILE_H +#define _NILFS_CPFILE_H + +#include +#include +#include + + +int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, + struct nilfs_checkpoint **, + struct buffer_head **); +void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); +int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); +int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); +int nilfs_cpfile_is_snapshot(struct inode *, __u64); +int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, + size_t); + +int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, + struct nilfs_inode *raw_inode, struct inode **inodep); + +#endif /* _NILFS_CPFILE_H */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c new file mode 100644 index 00000000..fcc2f869 --- /dev/null +++ b/fs/nilfs2/dat.c @@ -0,0 +1,511 @@ +/* + * dat.c - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#include +#include +#include +#include +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "dat.h" + + +#define NILFS_CNO_MIN ((__u64)1) +#define NILFS_CNO_MAX (~(__u64)0) + +struct nilfs_dat_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; + struct nilfs_shadow_map shadow; +}; + +static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) +{ + return (struct nilfs_dat_info *)NILFS_MDT(dat); +} + +static int nilfs_dat_prepare_entry(struct inode *dat, + struct nilfs_palloc_req *req, int create) +{ + return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, + create, &req->pr_entry_bh); +} + +static void nilfs_dat_commit_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + mark_buffer_dirty(req->pr_entry_bh); + nilfs_mdt_mark_dirty(dat); + brelse(req->pr_entry_bh); +} + +static void nilfs_dat_abort_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_entry_bh); +} + +int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_alloc_entry(dat, req); + if (ret < 0) + return ret; + + ret = nilfs_dat_prepare_entry(dat, req, 1); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(dat, req); + + return ret; +} + +void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MAX); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_palloc_commit_alloc_entry(dat, req); + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_alloc_entry(dat, req); +} + +static void nilfs_dat_commit_free(struct inode *dat, + struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MIN); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); + nilfs_palloc_commit_free_entry(dat, req); +} + +int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + WARN_ON(ret == -ENOENT); + return ret; +} + +void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, + sector_t blocknr) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); +} + +int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = 
nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) { + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) { + nilfs_dat_abort_entry(dat, req); + return ret; + } + } + + return 0; +} + +void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, + int dead) +{ + struct nilfs_dat_entry *entry; + __u64 start, end; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + end = start = le64_to_cpu(entry->de_start); + if (!dead) { + end = nilfs_mdt_cno(dat); + WARN_ON(start > end); + } + entry->de_end = cpu_to_le64(end); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) + nilfs_dat_commit_free(dat, req); + else + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (start == nilfs_mdt_cno(dat) && blocknr == 0) + nilfs_palloc_abort_free_entry(dat, req); + nilfs_dat_abort_entry(dat, req); +} + +int nilfs_dat_prepare_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + int ret; + + ret = nilfs_dat_prepare_end(dat, oldreq); + if (!ret) { + ret = nilfs_dat_prepare_alloc(dat, newreq); + if (ret < 0) + nilfs_dat_abort_end(dat, oldreq); + } + return ret; +} + +void nilfs_dat_commit_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq, int dead) +{ + nilfs_dat_commit_end(dat, oldreq, dead); + nilfs_dat_commit_alloc(dat, newreq); +} + +void nilfs_dat_abort_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + nilfs_dat_abort_end(dat, oldreq); + nilfs_dat_abort_alloc(dat, newreq); +} + +/** + * nilfs_dat_mark_dirty - + * @dat: DAT file inode + * @vblocknr: virtual block number + * + * Description: + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = vblocknr; + ret = nilfs_dat_prepare_entry(dat, &req, 0); + if (ret == 0) + nilfs_dat_commit_entry(dat, &req); + return ret; +} + +/** + * nilfs_dat_freev - free virtual block numbers + * @dat: DAT file inode + * @vblocknrs: array of virtual block numbers + * @nitems: number of virtual block numbers + * + * Description: nilfs_dat_freev() frees the virtual block numbers specified by + * @vblocknrs and @nitems. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. 
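[Illustrative sketch, not part of this patch: nilfs_dat_prepare_update() and nilfs_dat_commit_update() pair "end the old entry" with "allocate a replacement", which is how a rewritten block receives a fresh virtual block number in one step; nilfs_direct_propagate() in direct.c later in this series follows the same pattern. Variable names here are hypothetical.]

	struct nilfs_palloc_req oldreq, newreq;
	int ret;

	oldreq.pr_entry_nr = vblocknr;
	newreq.pr_entry_nr = vblocknr;	/* initial hint; the prepare step selects the new nr */
	ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
	if (ret < 0)
		return ret;
	nilfs_dat_commit_update(dat, &oldreq, &newreq, dead);
	/* newreq.pr_entry_nr now holds the replacement virtual block number */
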
+ */ +int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) +{ + return nilfs_palloc_freev(dat, vblocknrs, nitems); +} + +/** + * nilfs_dat_move - change a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknr: block number + * + * Description: nilfs_dat_move() changes the block number associated with + * @vblocknr to @blocknr. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + /* + * The given disk block number (blocknr) is not yet written to + * the device at this point. + * + * To prevent nilfs_dat_translate() from returning the + * uncommitted block number, this makes a copy of the entry + * buffer and redirects nilfs_dat_translate() to the copy. + */ + if (!buffer_nilfs_redirected(entry_bh)) { + ret = nilfs_mdt_freeze_buffer(dat, entry_bh); + if (ret) { + brelse(entry_bh); + return ret; + } + } + + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { + printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__, + (unsigned long long)vblocknr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end)); + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return -EINVAL; + } + WARN_ON(blocknr == 0); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(entry_bh); + nilfs_mdt_mark_dirty(dat); + + brelse(entry_bh); + + return 0; +} + +/** + * nilfs_dat_translate - translate a virtual block number to a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknrp: pointer to a block number + * + * Description: nilfs_dat_translate() maps the virtual block number @vblocknr + * to the corresponding block number. + * + * Return Value: On success, 0 is returned and the block number associated + * with @vblocknr is stored in the place pointed by @blocknrp. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A block number associated with @vblocknr does not exist. 
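[Illustrative usage, not part of this patch: nilfs_dat_translate() defined just below is the lookup used by direct.c and gcinode.c elsewhere in this series; a minimal caller looks like the following, with hypothetical variable names.]

	sector_t blocknr;
	int err;

	err = nilfs_dat_translate(dat, vblocknr, &blocknr);
	if (err)
		return err;	/* -ENOENT: no disk block currently assigned */
	/* blocknr is the on-disk block backing vblocknr */
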
+ */ +int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) +{ + struct buffer_head *entry_bh, *bh; + struct nilfs_dat_entry *entry; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) { + bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh); + if (bh) { + WARN_ON(!buffer_uptodate(bh)); + brelse(entry_bh); + entry_bh = bh; + } + } + + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + blocknr = le64_to_cpu(entry->de_blocknr); + if (blocknr == 0) { + ret = -ENOENT; + goto out; + } + *blocknrp = blocknr; + + out: + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return ret; +} + +ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz, + size_t nvi) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + struct nilfs_vinfo *vinfo = buf; + __u64 first, last; + void *kaddr; + unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; + int i, j, n, ret; + + for (i = 0; i < nvi; i += n) { + ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr, + 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + /* last virtual block number in this block */ + first = vinfo->vi_vblocknr; + do_div(first, entries_per_block); + first *= entries_per_block; + last = first + entries_per_block - 1; + for (j = i, n = 0; + j < nvi && vinfo->vi_vblocknr >= first && + vinfo->vi_vblocknr <= last; + j++, n++, vinfo = (void *)vinfo + visz) { + entry = nilfs_palloc_block_get_entry( + dat, vinfo->vi_vblocknr, entry_bh, kaddr); + vinfo->vi_start = le64_to_cpu(entry->de_start); + vinfo->vi_end = le64_to_cpu(entry->de_end); + vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + } + + return nvi; +} + +/** + * nilfs_dat_read - read or get dat inode + * @sb: super block instance + * @entry_size: size of a dat entry + * @raw_inode: on-disk dat inode + * @inodep: buffer to store the inode + */ +int nilfs_dat_read(struct super_block *sb, size_t entry_size, + struct nilfs_inode *raw_inode, struct inode **inodep) +{ + static struct lock_class_key dat_lock_key; + struct inode *dat; + struct nilfs_dat_info *di; + int err; + + dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); + if (unlikely(!dat)) + return -ENOMEM; + if (!(dat->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di)); + if (err) + goto failed; + + err = nilfs_palloc_init_blockgroup(dat, entry_size); + if (err) + goto failed; + + di = NILFS_DAT_I(dat); + lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); + nilfs_palloc_setup_cache(dat, &di->palloc_cache); + nilfs_mdt_setup_shadow_map(dat, &di->shadow); + + err = nilfs_read_inode_common(dat, raw_inode); + if (err) + goto failed; + + unlock_new_inode(dat); + out: + *inodep = dat; + return 0; + failed: + iget_failed(dat); + return err; +} diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h new file mode 100644 index 00000000..cbd8e973 --- /dev/null +++ b/fs/nilfs2/dat.h @@ -0,0 +1,59 @@ +/* + * dat.h - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_DAT_H +#define _NILFS_DAT_H + +#include +#include +#include + + +struct nilfs_palloc_req; + +int nilfs_dat_translate(struct inode *, __u64, sector_t *); + +int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, + sector_t); +int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); +void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); +void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *, int); +void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); + +int nilfs_dat_mark_dirty(struct inode *, __u64); +int nilfs_dat_freev(struct inode *, __u64 *, size_t); +int nilfs_dat_move(struct inode *, __u64, sector_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); + +int nilfs_dat_read(struct super_block *sb, size_t entry_size, + struct nilfs_inode *raw_inode, struct inode **inodep); + +#endif /* _NILFS_DAT_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c new file mode 100644 index 00000000..3a192394 --- /dev/null +++ b/fs/nilfs2/dir.c @@ -0,0 +1,688 @@ +/* + * dir.c - NILFS directory entry operations + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji + */ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include +#include "nilfs.h" +#include "page.h" + +/* + * nilfs uses block-sized chunks. Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned nilfs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void nilfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return __block_write_begin(page, pos, to - from, nilfs_get_block); +} + +static void nilfs_commit_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + struct inode *dir = mapping->host; + loff_t pos = page_offset(page) + from; + unsigned len = to - from; + unsigned nr_dirty, copied; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, from, to); + copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos + copied > dir->i_size) + i_size_write(dir, pos + copied); + if (IS_DIRSYNC(dir)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + err = nilfs_set_file_dirty(dir, nr_dirty); + WARN_ON(err); /* do not happen */ + unlock_page(page); +} + +static void nilfs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = nilfs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct nilfs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct nilfs_dir_entry *)(kaddr + offs); + rec_len = nilfs_rec_len_from_disk(p->rec_len); + + if (rec_len < NILFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < NILFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + 
nilfs_error(sb, "nilfs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; +bad_entry: + nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<inode), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct nilfs_dir_entry *)(kaddr + offs); + nilfs_error(sb, "nilfs_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<inode)); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *nilfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + nilfs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + nilfs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure. + * + * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. + */ +static int +nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) +{ + return (struct nilfs_dir_entry *)((char *)p + + nilfs_rec_len_from_disk(p->rec_len)); +} + +static unsigned char +nilfs_filetype_table[NILFS_FT_MAX] = { + [NILFS_FT_UNKNOWN] = DT_UNKNOWN, + [NILFS_FT_REG_FILE] = DT_REG, + [NILFS_FT_DIR] = DT_DIR, + [NILFS_FT_CHRDEV] = DT_CHR, + [NILFS_FT_BLKDEV] = DT_BLK, + [NILFS_FT_FIFO] = DT_FIFO, + [NILFS_FT_SOCK] = DT_SOCK, + [NILFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char +nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK, +}; + +static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + + de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); +/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ + unsigned char *types = NULL; + int ret; + + if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) + goto success; + + types = nilfs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct nilfs_dir_entry *de; + struct page *page = nilfs_get_page(inode, n); + + if (IS_ERR(page)) { + nilfs_error(sb, __func__, "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + 
ret = -EIO; + goto done; + } + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)(kaddr + offset); + limit = kaddr + nilfs_last_byte(inode, n) - + NILFS_DIR_REC_LEN(1); + for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { + if (de->rec_len == 0) { + nilfs_error(sb, __func__, + "zero-length directory entry"); + ret = -EIO; + nilfs_put_page(page); + goto done; + } + if (de->inode) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < NILFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<inode), d_type); + if (over) { + nilfs_put_page(page); + goto success; + } + } + filp->f_pos += nilfs_rec_len_from_disk(de->rec_len); + } + nilfs_put_page(page); + } + +success: + ret = 0; +done: + return ret; +} + +/* + * nilfs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct nilfs_dir_entry * +nilfs_find_entry(struct inode *dir, const struct qstr *qstr, + struct page **res_page) +{ + const unsigned char *name = qstr->name; + int namelen = qstr->len; + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct nilfs_inode_info *ei = NILFS_I(dir); + struct nilfs_dir_entry *de; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = nilfs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + nilfs_put_page(page); + goto out; + } + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + nilfs_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block count %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = nilfs_get_page(dir, 0); + struct nilfs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = nilfs_next_entry( + (struct nilfs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) +{ + ino_t res = 0; + struct nilfs_dir_entry *de; + struct page *page; + + de = nilfs_find_entry(dir, qstr, &page); + if (de) { + res = le64_to_cpu(de->inode); + kunmap(page); + page_cache_release(page); + } + return res; +} + +/* Releases the page */ +void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, + struct page *page, struct inode *inode) +{ + unsigned from = (char *) de - (char *) page_address(page); + unsigned to = from + nilfs_rec_len_from_disk(de->rec_len); + struct address_space *mapping = page->mapping; + int err; + + lock_page(page); + err = nilfs_prepare_chunk(page, 
from, to); + BUG_ON(err); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + nilfs_commit_chunk(page, mapping, from, to); + nilfs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +} + +/* + * Parent is locked. + */ +int nilfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = nilfs_chunk_size(dir); + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct nilfs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + unsigned from, to; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = nilfs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + nilfs_last_byte(dir, n); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = nilfs_rec_len_to_disk(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (nilfs_match(namelen, name, de)) + goto out_unlock; + name_len = NILFS_DIR_REC_LEN(de->name_len); + rec_len = nilfs_rec_len_from_disk(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct nilfs_dir_entry *)((char *)de + rec_len); + } + unlock_page(page); + nilfs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + from = (char *)de - (char *)page_address(page); + to = from + rec_len; + err = nilfs_prepare_chunk(page, from, to); + if (err) + goto out_unlock; + if (de->inode) { + struct nilfs_dir_entry *de1; + + de1 = (struct nilfs_dir_entry *)((char *)de + name_len); + de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len); + de->rec_len = nilfs_rec_len_to_disk(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + nilfs_commit_chunk(page, page->mapping, from, to); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + nilfs_mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + nilfs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * nilfs_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. Releases the page. 
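[Editor's illustration, not part of this patch: nilfs_delete_entry() below removes an entry ext2-style, letting the preceding entry in the same chunk absorb the victim's record length and zeroing the victim's inode field; when the victim is the first entry of its chunk, only the inode field is cleared.]

	/*
	 * before:  | pde (rec_len = a) | victim (rec_len = b) | next ...
	 * after :  | pde (rec_len = a + b) ................... | next ...
	 *          victim->inode == 0, so lookups skip the dead space
	 */
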
+ */ +int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); + unsigned to = ((char *)dir - kaddr) + + nilfs_rec_len_from_disk(dir->rec_len); + struct nilfs_dir_entry *pde = NULL; + struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); + int err; + + while ((char *)de < (char *)dir) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = nilfs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + lock_page(page); + err = nilfs_prepare_chunk(page, from, to); + BUG_ON(err); + if (pde) + pde->rec_len = nilfs_rec_len_to_disk(to - from); + dir->inode = 0; + nilfs_commit_chunk(page, mapping, from, to); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; +out: + nilfs_put_page(page); + return err; +} + +/* + * Set the first fragment of directory. + */ +int nilfs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = nilfs_chunk_size(inode); + struct nilfs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = nilfs_prepare_chunk(page, 0, chunk_size); + if (unlikely(err)) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, chunk_size); + de = (struct nilfs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1)); + memcpy(de->name, ".\0\0", 4); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + + de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1)); + de->inode = cpu_to_le64(parent->i_ino); + memcpy(de->name, "..\0", 4); + nilfs_set_de_type(de, inode); + kunmap_atomic(kaddr, KM_USER0); + nilfs_commit_chunk(page, mapping, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int nilfs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct nilfs_dir_entry *de; + + page = nilfs_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry " + "(kaddr=%p, de=%p)\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. 
*/ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le64(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + return 1; + +not_empty: + nilfs_put_page(page); + return 0; +} + +const struct file_operations nilfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = nilfs_readdir, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_compat_ioctl, +#endif /* CONFIG_COMPAT */ + .fsync = nilfs_sync_file, + +}; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c new file mode 100644 index 00000000..82f4865e --- /dev/null +++ b/fs/nilfs2/direct.c @@ -0,0 +1,369 @@ +/* + * direct.c - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#include +#include "nilfs.h" +#include "page.h" +#include "direct.h" +#include "alloc.h" +#include "dat.h" + +static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct) +{ + return (__le64 *) + ((struct nilfs_direct_node *)direct->b_u.u_data + 1); +} + +static inline __u64 +nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key) +{ + return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key)); +} + +static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct, + __u64 key, __u64 ptr) +{ + *(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr); +} + +static int nilfs_direct_lookup(const struct nilfs_bmap *direct, + __u64 key, int level, __u64 *ptrp) +{ + __u64 ptr; + + if (key > NILFS_DIRECT_KEY_MAX || level != 1) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + *ptrp = ptr; + return 0; +} + +static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, + __u64 key, __u64 *ptrp, + unsigned maxblocks) +{ + struct inode *dat = NULL; + __u64 ptr, ptr2; + sector_t blocknr; + int ret, cnt; + + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + if (NILFS_BMAP_USE_VBN(direct)) { + dat = nilfs_bmap_get_dat(direct); + ret = nilfs_dat_translate(dat, ptr, &blocknr); + if (ret < 0) + return ret; + ptr = blocknr; + } + + maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1); + for (cnt = 1; cnt < maxblocks && + (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) != + NILFS_BMAP_INVALID_PTR; + cnt++) { + if (dat) { + ret = nilfs_dat_translate(dat, ptr2, &blocknr); + if (ret < 0) + return ret; + ptr2 = blocknr; + } + if (ptr2 != ptr + cnt) + break; + } + *ptrp = ptr; + return cnt; +} + +static __u64 
+nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else + /* block group */ + return nilfs_bmap_find_target_in_group(direct); +} + +static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + union nilfs_bmap_ptr_req req; + struct inode *dat = NULL; + struct buffer_head *bh; + int ret; + + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR) + return -EEXIST; + + if (NILFS_BMAP_USE_VBN(bmap)) { + req.bpr_ptr = nilfs_direct_find_target_v(bmap, key); + dat = nilfs_bmap_get_dat(bmap); + } + ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); + if (!ret) { + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); + + nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(bmap, key, req.bpr_ptr); + + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); + + nilfs_inode_add_blocks(bmap->b_inode, 1); + } + return ret; +} + +static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) +{ + union nilfs_bmap_ptr_req req; + struct inode *dat; + int ret; + + if (key > NILFS_DIRECT_KEY_MAX || + nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; + req.bpr_ptr = nilfs_direct_get_ptr(bmap, key); + + ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); + if (!ret) { + nilfs_bmap_commit_end_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); + nilfs_inode_sub_blocks(bmap->b_inode, 1); + } + return ret; +} + +static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp) +{ + __u64 key, lastkey; + + lastkey = NILFS_DIRECT_KEY_MAX + 1; + for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) + if (nilfs_direct_get_ptr(direct, key) != + NILFS_BMAP_INVALID_PTR) + lastkey = key; + + if (lastkey == NILFS_DIRECT_KEY_MAX + 1) + return -ENOENT; + + *keyp = lastkey; + + return 0; +} + +static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) +{ + return key > NILFS_DIRECT_KEY_MAX; +} + +static int nilfs_direct_gather_data(struct nilfs_bmap *direct, + __u64 *keys, __u64 *ptrs, int nitems) +{ + __u64 key; + __u64 ptr; + int n; + + if (nitems > NILFS_DIRECT_NBLOCKS) + nitems = NILFS_DIRECT_NBLOCKS; + n = 0; + for (key = 0; key < nitems; key++) { + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) { + keys[n] = key; + ptrs[n] = ptr; + n++; + } + } + return n; +} + +int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, + __u64 key, __u64 *keys, __u64 *ptrs, int n) +{ + __le64 *dptrs; + int ret, i, j; + + /* no need to allocate any resource for conversion */ + + /* delete */ + ret = bmap->b_ops->bop_delete(bmap, key); + if (ret < 0) + return ret; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* convert */ + dptrs = nilfs_direct_dptrs(bmap); + for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { + if ((j < n) && (i == keys[j])) { + dptrs[i] = (i != key) ? 
+ cpu_to_le64(ptrs[j]) : + NILFS_BMAP_INVALID_PTR; + j++; + } else + dptrs[i] = NILFS_BMAP_INVALID_PTR; + } + + nilfs_direct_init(bmap); + return 0; +} + +static int nilfs_direct_propagate(struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_palloc_req oldreq, newreq; + struct inode *dat; + __u64 key; + __u64 ptr; + int ret; + + if (!NILFS_BMAP_USE_VBN(bmap)) + return 0; + + dat = nilfs_bmap_get_dat(bmap); + key = nilfs_bmap_data_get_key(bmap, bh); + ptr = nilfs_direct_get_ptr(bmap, key); + if (!buffer_nilfs_volatile(bh)) { + oldreq.pr_entry_nr = ptr; + newreq.pr_entry_nr = ptr; + ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq); + if (ret < 0) + return ret; + nilfs_dat_commit_update(dat, &oldreq, &newreq, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); + set_buffer_nilfs_volatile(bh); + nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr); + } else + ret = nilfs_dat_mark_dirty(dat, ptr); + + return ret; +} + +static int nilfs_direct_assign_v(struct nilfs_bmap *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct inode *dat = nilfs_bmap_get_dat(direct); + union nilfs_bmap_ptr_req req; + int ret; + + req.bpr_ptr = ptr; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (!ret) { + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); + } + return ret; +} + +static int nilfs_direct_assign_p(struct nilfs_bmap *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + nilfs_direct_set_ptr(direct, key, blocknr); + + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); + binfo->bi_dat.bi_level = 0; + + return 0; +} + +static int nilfs_direct_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + __u64 key; + __u64 ptr; + + key = nilfs_bmap_data_get_key(bmap, *bh); + if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { + printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, + (unsigned long long)key); + return -EINVAL; + } + ptr = nilfs_direct_get_ptr(bmap, key); + if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { + printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, + (unsigned long long)ptr); + return -EINVAL; + } + + return NILFS_BMAP_USE_VBN(bmap) ? + nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) : + nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo); +} + +static const struct nilfs_bmap_operations nilfs_direct_ops = { + .bop_lookup = nilfs_direct_lookup, + .bop_lookup_contig = nilfs_direct_lookup_contig, + .bop_insert = nilfs_direct_insert, + .bop_delete = nilfs_direct_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_direct_propagate, + + .bop_lookup_dirty_buffers = NULL, + + .bop_assign = nilfs_direct_assign, + .bop_mark = NULL, + + .bop_last_key = nilfs_direct_last_key, + .bop_check_insert = nilfs_direct_check_insert, + .bop_check_delete = NULL, + .bop_gather_data = nilfs_direct_gather_data, +}; + + +int nilfs_direct_init(struct nilfs_bmap *bmap) +{ + bmap->b_ops = &nilfs_direct_ops; + return 0; +} diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h new file mode 100644 index 00000000..dc643de2 --- /dev/null +++ b/fs/nilfs2/direct.h @@ -0,0 +1,51 @@ +/* + * direct.h - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_DIRECT_H +#define _NILFS_DIRECT_H + +#include +#include +#include "bmap.h" + + +/** + * struct nilfs_direct_node - direct node + * @dn_flags: flags + * @dn_pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) +#define NILFS_DIRECT_KEY_MIN 0 +#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) + + +int nilfs_direct_init(struct nilfs_bmap *); +int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *, + __u64 *, int); + + +#endif /* _NILFS_DIRECT_H */ diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h new file mode 100644 index 00000000..a71cc412 --- /dev/null +++ b/fs/nilfs2/export.h @@ -0,0 +1,17 @@ +#ifndef NILFS_EXPORT_H +#define NILFS_EXPORT_H + +#include + +extern const struct export_operations nilfs_export_ops; + +struct nilfs_fid { + u64 cno; + u64 ino; + u32 gen; + + u32 parent_gen; + u64 parent_ino; +} __attribute__ ((packed)); + +#endif diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c new file mode 100644 index 00000000..d7eeca62 --- /dev/null +++ b/fs/nilfs2/file.c @@ -0,0 +1,159 @@ +/* + * file.c - NILFS regular file handling primitives including fsync(). + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji , + * Ryusuke Konishi + */ + +#include +#include +#include +#include "nilfs.h" +#include "segment.h" + +int nilfs_sync_file(struct file *file, int datasync) +{ + /* + * Called from fsync() system call + * This is the only entry point that can catch write and synch + * timing for both data blocks and intermediate blocks. + * + * This function should be implemented when the writeback function + * will be implemented. 
+ */ + struct inode *inode = file->f_mapping->host; + int err; + + if (!nilfs_inode_dirty(inode)) + return 0; + + if (datasync) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0, + LLONG_MAX); + else + err = nilfs_construct_segment(inode->i_sb); + + return err; +} + +static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct nilfs_transaction_info ti; + int ret; + + if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) + return VM_FAULT_SIGBUS; /* -ENOSPC */ + + lock_page(page); + if (page->mapping != inode->i_mapping || + page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { + unlock_page(page); + return VM_FAULT_NOPAGE; /* make the VM retry the fault */ + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) + goto mapped; + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int fully_mapped = 1; + + bh = head = page_buffers(page); + do { + if (!buffer_mapped(bh)) { + fully_mapped = 0; + break; + } + } while (bh = bh->b_this_page, bh != head); + + if (fully_mapped) { + SetPageMappedToDisk(page); + goto mapped; + } + } + unlock_page(page); + + /* + * fill hole blocks + */ + ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); + /* never returns -ENOMEM, but may return -ENOSPC */ + if (unlikely(ret)) + return VM_FAULT_SIGBUS; + + ret = block_page_mkwrite(vma, vmf, nilfs_get_block); + if (ret != VM_FAULT_LOCKED) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); + nilfs_transaction_commit(inode->i_sb); + + mapped: + wait_on_page_writeback(page); + return VM_FAULT_LOCKED; +} + +static const struct vm_operations_struct nilfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nilfs_page_mkwrite, +}; + +static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &nilfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the nilfs filesystem. + */ +const struct file_operations nilfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_compat_ioctl, +#endif /* CONFIG_COMPAT */ + .mmap = nilfs_file_mmap, + .open = generic_file_open, + /* .release = nilfs_release_file, */ + .fsync = nilfs_sync_file, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations nilfs_file_inode_operations = { + .truncate = nilfs_truncate, + .setattr = nilfs_setattr, + .permission = nilfs_permission, + .fiemap = nilfs_fiemap, +}; + +/* end of file */ diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c new file mode 100644 index 00000000..08a07a21 --- /dev/null +++ b/fs/nilfs2/gcinode.c @@ -0,0 +1,196 @@ +/* + * gcinode.c - dummy inodes to buffer blocks for garbage collection + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara , Amagai Yoshiji , + * and Ryusuke Konishi . + * Revised by Ryusuke Konishi . + * + */ +/* + * This file adds the cache of on-disk blocks to be moved in garbage + * collection. The disk blocks are held with dummy inodes (called + * gcinodes), and this file provides lookup function of the dummy + * inodes and their buffer read function. + * + * Buffers and pages held by the dummy inodes will be released each + * time after they are copied to a new log. Dirty blocks made on the + * current generation and the blocks to be moved by GC never overlap + * because the dirty blocks make a new generation; they rather must be + * written individually. + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btree.h" +#include "btnode.h" +#include "page.h" +#include "mdt.h" +#include "dat.h" +#include "ifile.h" + +/* + * nilfs_gccache_submit_read_data() - add data buffer and submit read request + * @inode - gc inode + * @blkoff - dummy offset treated as the key for the page cache + * @pbn - physical block number of the block + * @vbn - virtual block number of the block, 0 for non-virtual block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_data() registers the data buffer + * specified by @pbn to the GC pagecache with the key @blkoff. + * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The block specified with @pbn does not exist. 
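[Illustrative pairing, not part of this patch: the submit helpers below queue the read, and nilfs_gccache_wait_and_mark_dirty(), further down in this file, is what a caller uses to wait for completion and mark the buffer dirty for the next log. A hedged sketch of that pairing, with hypothetical variable names:]

	struct buffer_head *bh;
	int err;

	err = nilfs_gccache_submit_read_data(inode, blkoff, pbn, vbn, &bh);
	if (err)
		return err;
	err = nilfs_gccache_wait_and_mark_dirty(bh);
	/* -EEXIST from the wait helper means the buffer was already dirty */
	brelse(bh);
	return err;
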
+ */ +int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, + sector_t pbn, __u64 vbn, + struct buffer_head **out_bh) +{ + struct buffer_head *bh; + int err; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + return -ENOMEM; + + if (buffer_uptodate(bh)) + goto out; + + if (pbn == 0) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); + if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ + brelse(bh); + goto failed; + } + } + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + if (!buffer_mapped(bh)) { + bh->b_bdev = inode->i_sb->s_bdev; + set_buffer_mapped(bh); + } + bh->b_blocknr = pbn; + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + if (vbn) + bh->b_blocknr = vbn; + out: + err = 0; + *out_bh = bh; + + failed: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} + +/* + * nilfs_gccache_submit_read_node() - add node buffer and submit read request + * @inode - gc inode + * @pbn - physical block number for the block + * @vbn - virtual block number for the block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_node() registers the node buffer + * specified by @vbn to the GC pagecache. @pbn can be supplied by the + * caller to avoid translation of the disk block address. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, + __u64 vbn, struct buffer_head **out_bh) +{ + int ret; + + ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + vbn ? : pbn, pbn, READ, out_bh, &pbn); + if (ret == -EEXIST) /* internal code (cache hit) */ + ret = 0; + return ret; +} + +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) +{ + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + if (buffer_dirty(bh)) + return -EEXIST; + + if (buffer_nilfs_node(bh) && nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + return -EIO; + } + mark_buffer_dirty(bh); + return 0; +} + +int nilfs_init_gcinode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + inode->i_mode = S_IFREG; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_mapping->a_ops = &empty_aops; + inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; + + ii->i_flags = 0; + nilfs_bmap_init_gc(ii->i_bmap); + + return 0; +} + +/** + * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes + */ +void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) +{ + struct list_head *head = &nilfs->ns_gc_inodes; + struct nilfs_inode_info *ii; + + while (!list_empty(head)) { + ii = list_first_entry(head, struct nilfs_inode_info, i_dirty); + list_del_init(&ii->i_dirty); + iput(&ii->vfs_inode); + } +} diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c new file mode 100644 index 00000000..684d7630 --- /dev/null +++ b/fs/nilfs2/ifile.c @@ -0,0 +1,201 @@ +/* + * ifile.c - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji . + * Revised by Ryusuke Konishi . + * + */ + +#include +#include +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "ifile.h" + + +struct nilfs_ifile_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; +}; + +static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile) +{ + return (struct nilfs_ifile_info *)NILFS_MDT(ifile); +} + +/** + * nilfs_ifile_create_inode - create a new disk inode + * @ifile: ifile inode + * @out_ino: pointer to a variable to store inode number + * @out_bh: buffer_head contains newly allocated disk inode + * + * Return Value: On success, 0 is returned and the newly allocated inode + * number is stored in the place pointed by @ino, and buffer_head pointer + * that contains newly allocated disk inode structure is stored in the + * place pointed by @out_bh + * On error, one of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No inode left. + */ +int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, + struct buffer_head **out_bh) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = 0; /* 0 says find free inode from beginning of + a group. dull code!! */ + req.pr_entry_bh = NULL; + + ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + nilfs_palloc_commit_alloc_entry(ifile, &req); + mark_buffer_dirty(req.pr_entry_bh); + nilfs_mdt_mark_dirty(ifile); + *out_ino = (ino_t)req.pr_entry_nr; + *out_bh = req.pr_entry_bh; + return 0; +} + +/** + * nilfs_ifile_delete_inode - delete a disk inode + * @ifile: ifile inode + * @ino: inode number + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The inode number @ino have not been allocated. 
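[Illustrative sketch, not part of this patch: a fresh on-disk inode is claimed with nilfs_ifile_create_inode() below and then edited through the map helpers declared in ifile.h; variable names are hypothetical.]

	ino_t ino;
	struct buffer_head *ibh;
	struct nilfs_inode *raw_inode;
	int err;

	err = nilfs_ifile_create_inode(ifile, &ino, &ibh);
	if (err)
		return err;
	raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
	/* ... initialize the new on-disk inode through raw_inode ... */
	nilfs_ifile_unmap_inode(ifile, ino, ibh);
	brelse(ibh);
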
+ */ +int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) +{ + struct nilfs_palloc_req req = { + .pr_entry_nr = ino, .pr_entry_bh = NULL + }; + struct nilfs_inode *raw_inode; + void *kaddr; + int ret; + + ret = nilfs_palloc_prepare_free_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_free_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + + kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0); + raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, + req.pr_entry_bh, kaddr); + raw_inode->i_flags = 0; + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(req.pr_entry_bh); + brelse(req.pr_entry_bh); + + nilfs_palloc_commit_free_entry(ifile, &req); + + return 0; +} + +int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, + struct buffer_head **out_bh) +{ + struct super_block *sb = ifile->i_sb; + int err; + + if (unlikely(!NILFS_VALID_INODE(sb, ino))) { + nilfs_error(sb, __func__, "bad inode number: %lu", + (unsigned long) ino); + return -EINVAL; + } + + err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); + if (unlikely(err)) + nilfs_warning(sb, __func__, "unable to read inode: %lu", + (unsigned long) ino); + return err; +} + +/** + * nilfs_ifile_read - read or get ifile inode + * @sb: super block instance + * @root: root object + * @inode_size: size of an inode + * @raw_inode: on-disk ifile inode + * @inodep: buffer to store the inode + */ +int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, + size_t inode_size, struct nilfs_inode *raw_inode, + struct inode **inodep) +{ + struct inode *ifile; + int err; + + ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO); + if (unlikely(!ifile)) + return -ENOMEM; + if (!(ifile->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(ifile, NILFS_MDT_GFP, + sizeof(struct nilfs_ifile_info)); + if (err) + goto failed; + + err = nilfs_palloc_init_blockgroup(ifile, inode_size); + if (err) + goto failed; + + nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache); + + err = nilfs_read_inode_common(ifile, raw_inode); + if (err) + goto failed; + + unlock_new_inode(ifile); + out: + *inodep = ifile; + return 0; + failed: + iget_failed(ifile); + return err; +} diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h new file mode 100644 index 00000000..59b6f2b5 --- /dev/null +++ b/fs/nilfs2/ifile.h @@ -0,0 +1,56 @@ +/* + * ifile.h - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji + * Revised by Ryusuke Konishi + * + */ + +#ifndef _NILFS_IFILE_H +#define _NILFS_IFILE_H + +#include +#include +#include +#include "mdt.h" +#include "alloc.h" + + +static inline struct nilfs_inode * +nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) +{ + void *kaddr = kmap(ibh->b_page); + return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); +} + +static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, + struct buffer_head *ibh) +{ + kunmap(ibh->b_page); +} + +int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); +int nilfs_ifile_delete_inode(struct inode *, ino_t); +int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); + +int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, + size_t inode_size, struct nilfs_inode *raw_inode, + struct inode **inodep); + +#endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c new file mode 100644 index 00000000..b9b45fc2 --- /dev/null +++ b/fs/nilfs2/inode.c @@ -0,0 +1,1064 @@ +/* + * inode.c - NILFS inode operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btnode.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" +#include "cpfile.h" +#include "ifile.h" + +struct nilfs_iget_args { + u64 ino; + __u64 cno; + struct nilfs_root *root; + int for_gc; +}; + +void nilfs_inode_add_blocks(struct inode *inode, int n) +{ + struct nilfs_root *root = NILFS_I(inode)->i_root; + + inode_add_bytes(inode, (1 << inode->i_blkbits) * n); + if (root) + atomic_add(n, &root->blocks_count); +} + +void nilfs_inode_sub_blocks(struct inode *inode, int n) +{ + struct nilfs_root *root = NILFS_I(inode)->i_root; + + inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); + if (root) + atomic_sub(n, &root->blocks_count); +} + +/** + * nilfs_get_block() - get a file block on the filesystem (callback function) + * @inode - inode struct of the target file + * @blkoff - file block number + * @bh_result - buffer head to be mapped on + * @create - indicate whether allocating the block or not when it has not + * been allocated yet. + * + * This function does not issue actual read request of the specified data + * block. It is done by VFS. 
+ */ +int nilfs_get_block(struct inode *inode, sector_t blkoff, + struct buffer_head *bh_result, int create) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 blknum = 0; + int err = 0, ret; + unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + if (ret >= 0) { /* found */ + map_bh(bh_result, inode->i_sb, blknum); + if (ret > 0) + bh_result->b_size = (ret << inode->i_blkbits); + goto out; + } + /* data block was not found */ + if (ret == -ENOENT && create) { + struct nilfs_transaction_info ti; + + bh_result->b_blocknr = 0; + err = nilfs_transaction_begin(inode->i_sb, &ti, 1); + if (unlikely(err)) + goto out; + err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, + (unsigned long)bh_result); + if (unlikely(err != 0)) { + if (err == -EEXIST) { + /* + * The get_block() function could be called + * from multiple callers for an inode. + * However, the page having this block must + * be locked in this case. + */ + printk(KERN_WARNING + "nilfs_get_block: a race condition " + "while inserting a data block. " + "(inode number=%lu, file block " + "offset=%llu)\n", + inode->i_ino, + (unsigned long long)blkoff); + err = 0; + } + nilfs_transaction_abort(inode->i_sb); + goto out; + } + nilfs_mark_inode_dirty(inode); + nilfs_transaction_commit(inode->i_sb); /* never fails */ + /* Error handling should be detailed */ + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed + to proper value */ + } else if (ret == -ENOENT) { + /* not found is not error (e.g. hole); must return without + the mapped state flag. */ + ; + } else { + err = ret; + } + + out: + return err; +} + +/** + * nilfs_readpage() - implement readpage() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @page - the page to be read + */ +static int nilfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, nilfs_get_block); +} + +/** + * nilfs_readpages() - implement readpages() method of nilfs_aops {} + * address_space_operations. 
+ * @file - file struct of the file to be read + * @mapping - address_space struct used for reading multiple pages + * @pages - the pages to be read + * @nr_pages - number of pages to be read + */ +static int nilfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); +} + +static int nilfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int err = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, + wbc->range_start, + wbc->range_end); + return err; +} + +static int nilfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int err; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (wbc->sync_mode == WB_SYNC_ALL) { + err = nilfs_construct_segment(inode->i_sb); + if (unlikely(err)) + return err; + } else if (wbc->for_reclaim) + nilfs_flush_segment(inode->i_sb, inode->i_ino); + + return 0; +} + +static int nilfs_set_page_dirty(struct page *page) +{ + int ret = __set_page_dirty_buffers(page); + + if (ret) { + struct inode *inode = page->mapping->host; + unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + + nilfs_set_file_dirty(inode, nr_dirty); + } + return ret; +} + +static int nilfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + +{ + struct inode *inode = mapping->host; + int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); + + if (unlikely(err)) + return err; + + err = block_write_begin(mapping, pos, len, flags, pagep, + nilfs_get_block); + if (unlikely(err)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + + nilfs_transaction_abort(inode->i_sb); + } + return err; +} + +static int nilfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned nr_dirty; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, start, + start + copied); + copied = generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + nilfs_set_file_dirty(inode, nr_dirty); + err = nilfs_transaction_commit(inode->i_sb); + return err ? : copied; +} + +static ssize_t +nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t size; + + if (rw == WRITE) + return 0; + + /* Needs synchronization with the cleaner */ + size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, nilfs_get_block, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. 
+ */ + if (unlikely((rw & WRITE) && size < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return size; +} + +const struct address_space_operations nilfs_aops = { + .writepage = nilfs_writepage, + .readpage = nilfs_readpage, + .writepages = nilfs_writepages, + .set_page_dirty = nilfs_set_page_dirty, + .readpages = nilfs_readpages, + .write_begin = nilfs_write_begin, + .write_end = nilfs_write_end, + /* .releasepage = nilfs_releasepage, */ + .invalidatepage = block_invalidatepage, + .direct_IO = nilfs_direct_IO, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +struct inode *nilfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct the_nilfs *nilfs = sb->s_fs_info; + struct inode *inode; + struct nilfs_inode_info *ii; + struct nilfs_root *root; + int err = -ENOMEM; + ino_t ino; + + inode = new_inode(sb); + if (unlikely(!inode)) + goto failed; + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + + root = NILFS_I(dir)->i_root; + ii = NILFS_I(inode); + ii->i_state = 1 << NILFS_I_NEW; + ii->i_root = root; + + err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + + atomic_inc(&root->inodes_count); + inode_init_owner(inode, dir, mode); + inode->i_ino = ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { + err = nilfs_bmap_read(ii->i_bmap, NULL); + if (err < 0) + goto failed_bmap; + + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + + ii->i_flags = nilfs_mask_flags( + mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); + + /* ii->i_file_acl = 0; */ + /* ii->i_dir_acl = 0; */ + ii->i_dir_start_lookup = 0; + nilfs_set_inode_flags(inode); + spin_lock(&nilfs->ns_next_gen_lock); + inode->i_generation = nilfs->ns_next_generation++; + spin_unlock(&nilfs->ns_next_gen_lock); + insert_inode_hash(inode); + + err = nilfs_init_acl(inode, dir); + if (unlikely(err)) + goto failed_acl; /* never occur. 
When supporting + nilfs_init_acl(), proper cancellation of + above jobs should be considered */ + + return inode; + + failed_acl: + failed_bmap: + inode->i_nlink = 0; + iput(inode); /* raw_inode will be deleted through + generic_delete_inode() */ + goto failed; + + failed_ifile_create_inode: + make_bad_inode(inode); + iput(inode); /* if i_nlink == 1, generic_forget_inode() will be + called */ + failed: + return ERR_PTR(err); +} + +void nilfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = NILFS_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC); + if (flags & FS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & FS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); +} + +int nilfs_read_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); + inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le64_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (inode->i_nlink == 0 && inode->i_mode == 0) + return -EINVAL; /* this inode is deleted */ + + inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); + ii->i_flags = le32_to_cpu(raw_inode->i_flags); +#if 0 + ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ii->i_dir_acl = S_ISREG(inode->i_mode) ? + 0 : le32_to_cpu(raw_inode->i_dir_acl); +#endif + ii->i_dir_start_lookup = 0; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + err = nilfs_bmap_read(ii->i_bmap, raw_inode); + if (err < 0) + return err; + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. 
*/ + } + return 0; +} + +static int __nilfs_read_inode(struct super_block *sb, + struct nilfs_root *root, unsigned long ino, + struct inode *inode) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct buffer_head *bh; + struct nilfs_inode *raw_inode; + int err; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); + if (unlikely(err)) + goto bad_inode; + + raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); + + err = nilfs_read_inode_common(inode, raw_inode); + if (err) + goto failed_unmap; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else { + inode->i_op = &nilfs_special_inode_operations; + init_special_inode( + inode, inode->i_mode, + huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); + } + nilfs_ifile_unmap_inode(root->ifile, ino, bh); + brelse(bh); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + nilfs_set_inode_flags(inode); + return 0; + + failed_unmap: + nilfs_ifile_unmap_inode(root->ifile, ino, bh); + brelse(bh); + + bad_inode: + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + return err; +} + +static int nilfs_iget_test(struct inode *inode, void *opaque) +{ + struct nilfs_iget_args *args = opaque; + struct nilfs_inode_info *ii; + + if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) + return 0; + + ii = NILFS_I(inode); + if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) + return !args->for_gc; + + return args->for_gc && args->cno == ii->i_cno; +} + +static int nilfs_iget_set(struct inode *inode, void *opaque) +{ + struct nilfs_iget_args *args = opaque; + + inode->i_ino = args->ino; + if (args->for_gc) { + NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; + NILFS_I(inode)->i_cno = args->cno; + NILFS_I(inode)->i_root = NULL; + } else { + if (args->root && args->ino == NILFS_ROOT_INO) + nilfs_get_root(args->root); + NILFS_I(inode)->i_root = args->root; + } + return 0; +} + +struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, + unsigned long ino) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = root, .cno = 0, .for_gc = 0 + }; + + return ilookup5(sb, ino, nilfs_iget_test, &args); +} + +struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, + unsigned long ino) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = root, .cno = 0, .for_gc = 0 + }; + + return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); +} + +struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, + unsigned long ino) +{ + struct inode *inode; + int err; + + inode = nilfs_iget_locked(sb, root, ino); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = __nilfs_read_inode(sb, root, ino, inode); + if (unlikely(err)) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, + __u64 cno) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 + }; + struct inode *inode; + int err; + + inode = iget5_locked(sb, ino, nilfs_iget_test, 
nilfs_iget_set, &args); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = nilfs_init_gcinode(inode); + if (unlikely(err)) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +void nilfs_write_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode, int has_bmap) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + raw_inode->i_uid = cpu_to_le32(inode->i_uid); + raw_inode->i_gid = cpu_to_le32(inode->i_gid); + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le64(inode->i_size); + raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); + + raw_inode->i_flags = cpu_to_le32(ii->i_flags); + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + + if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + /* zero-fill unused portion in the case of super root block */ + raw_inode->i_xattr = 0; + raw_inode->i_pad = 0; + memset((void *)raw_inode + sizeof(*raw_inode), 0, + nilfs->ns_inode_size - sizeof(*raw_inode)); + } + + if (has_bmap) + nilfs_bmap_write(ii->i_bmap, raw_inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_device_code = + cpu_to_le64(huge_encode_dev(inode->i_rdev)); + /* When extending inode, nilfs->ns_inode_size should be checked + for substitutions of appended fields */ +} + +void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) +{ + ino_t ino = inode->i_ino; + struct nilfs_inode_info *ii = NILFS_I(inode); + struct inode *ifile = ii->i_root->ifile; + struct nilfs_inode *raw_inode; + + raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); + + if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) + memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); + set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + + nilfs_write_inode_common(inode, raw_inode, 0); + /* XXX: call with has_bmap = 0 is a workaround to avoid + deadlock of bmap. 
This delays update of i_bmap to just + before writing */ + nilfs_ifile_unmap_inode(ifile, ino, ibh); +} + +#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ + +static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, + unsigned long from) +{ + unsigned long b; + int ret; + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; +repeat: + ret = nilfs_bmap_last_key(ii->i_bmap, &b); + if (ret == -ENOENT) + return; + else if (ret < 0) + goto failed; + + if (b < from) + return; + + b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); + ret = nilfs_bmap_truncate(ii->i_bmap, b); + nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); + if (!ret || (ret == -ENOMEM && + nilfs_bmap_truncate(ii->i_bmap, b) == 0)) + goto repeat; + +failed: + nilfs_warning(ii->vfs_inode.i_sb, __func__, + "failed to truncate bmap (ino=%lu, err=%d)", + ii->vfs_inode.i_ino, ret); +} + +void nilfs_truncate(struct inode *inode) +{ + unsigned long blkoff; + unsigned int blocksize; + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + blocksize = sb->s_blocksize; + blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); + + nilfs_truncate_bmap(ii, blkoff); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_mark_inode_dirty(inode); + nilfs_set_file_dirty(inode, 0); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But truncate has no return value. */ +} + +static void nilfs_clear_inode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + /* + * Free resources allocated in nilfs_read_inode(), here. + */ + BUG_ON(!list_empty(&ii->i_dirty)); + brelse(ii->i_bh); + ii->i_bh = NULL; + + if (mdi && mdi->mi_palloc_cache) + nilfs_palloc_destroy_cache(inode); + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) + nilfs_bmap_clear(ii->i_bmap); + + nilfs_btnode_cache_clear(&ii->i_btnode_cache); + + if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) + nilfs_put_root(ii->i_root); +} + +void nilfs_evict_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + int ret; + + if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + nilfs_clear_inode(inode); + return; + } + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + /* TODO: some of the following operations may fail. */ + nilfs_truncate_bmap(ii, 0); + nilfs_mark_inode_dirty(inode); + end_writeback(inode); + + ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); + if (!ret) + atomic_dec(&ii->i_root->inodes_count); + + nilfs_clear_inode(inode); + + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But delete_inode has no return value. 
*/ +} + +int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct nilfs_transaction_info ti; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + int err; + + err = inode_change_ok(inode, iattr); + if (err) + return err; + + err = nilfs_transaction_begin(sb, &ti, 0); + if (unlikely(err)) + return err; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + err = vmtruncate(inode, iattr->ia_size); + if (unlikely(err)) + goto out_err; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + if (iattr->ia_valid & ATTR_MODE) { + err = nilfs_acl_chmod(inode); + if (unlikely(err)) + goto out_err; + } + + return nilfs_transaction_commit(sb); + +out_err: + nilfs_transaction_abort(sb); + return err; +} + +int nilfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct nilfs_root *root = NILFS_I(inode)->i_root; + if ((mask & MAY_WRITE) && root && + root->cno != NILFS_CPTREE_CURRENT_CNO) + return -EROFS; /* snapshot is not writable */ + + return generic_permission(inode, mask, flags, NULL); +} + +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + spin_lock(&nilfs->ns_inode_lock); + if (ii->i_bh == NULL) { + spin_unlock(&nilfs->ns_inode_lock); + err = nilfs_ifile_get_inode_block(ii->i_root->ifile, + inode->i_ino, pbh); + if (unlikely(err)) + return err; + spin_lock(&nilfs->ns_inode_lock); + if (ii->i_bh == NULL) + ii->i_bh = *pbh; + else { + brelse(*pbh); + *pbh = ii->i_bh; + } + } else + *pbh = ii->i_bh; + + get_bh(*pbh); + spin_unlock(&nilfs->ns_inode_lock); + return 0; +} + +int nilfs_inode_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + int ret = 0; + + if (!list_empty(&ii->i_dirty)) { + spin_lock(&nilfs->ns_inode_lock); + ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || + test_bit(NILFS_I_BUSY, &ii->i_state); + spin_unlock(&nilfs->ns_inode_lock); + } + return ret; +} + +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); + + if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) + return 0; + + spin_lock(&nilfs->ns_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + /* Because this routine may race with nilfs_dispose_list(), + we have to check NILFS_I_QUEUED here, too. */ + if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { + /* This will happen when somebody is freeing + this inode. 
*/ + nilfs_warning(inode->i_sb, __func__, + "cannot get inode (ino=%lu)\n", + inode->i_ino); + spin_unlock(&nilfs->ns_inode_lock); + return -EINVAL; /* NILFS_I_DIRTY may remain for + freeing inode */ + } + list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); + set_bit(NILFS_I_QUEUED, &ii->i_state); + } + spin_unlock(&nilfs->ns_inode_lock); + return 0; +} + +int nilfs_mark_inode_dirty(struct inode *inode) +{ + struct buffer_head *ibh; + int err; + + err = nilfs_load_inode_block(inode, &ibh); + if (unlikely(err)) { + nilfs_warning(inode->i_sb, __func__, + "failed to reget inode block.\n"); + return err; + } + nilfs_update_inode(inode, ibh); + mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); + brelse(ibh); + return 0; +} + +/** + * nilfs_dirty_inode - reflect changes on given inode to an inode block. + * @inode: inode of the file to be registered. + * + * nilfs_dirty_inode() loads a inode block containing the specified + * @inode and copies data from a nilfs_inode to a corresponding inode + * entry in the inode block. This operation is excluded from the segment + * construction. This function can be called both as a single operation + * and as a part of indivisible file operations. + */ +void nilfs_dirty_inode(struct inode *inode, int flags) +{ + struct nilfs_transaction_info ti; + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + if (is_bad_inode(inode)) { + nilfs_warning(inode->i_sb, __func__, + "tried to mark bad_inode dirty. ignored.\n"); + dump_stack(); + return; + } + if (mdi) { + nilfs_mdt_mark_dirty(inode); + return; + } + nilfs_transaction_begin(inode->i_sb, &ti, 0); + nilfs_mark_inode_dirty(inode); + nilfs_transaction_commit(inode->i_sb); /* never fails */ +} + +int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 logical = 0, phys = 0, size = 0; + __u32 flags = 0; + loff_t isize; + sector_t blkoff, end_blkoff; + sector_t delalloc_blkoff; + unsigned long delalloc_blklen; + unsigned int blkbits = inode->i_blkbits; + int ret, n; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + + blkoff = start >> blkbits; + end_blkoff = (start + len - 1) >> blkbits; + + delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, + &delalloc_blkoff); + + do { + __u64 blkphy; + unsigned int maxblocks; + + if (delalloc_blklen && blkoff == delalloc_blkoff) { + if (size) { + /* End of the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + } + if (blkoff > end_blkoff) + break; + + flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; + logical = blkoff << blkbits; + phys = 0; + size = delalloc_blklen << blkbits; + + blkoff = delalloc_blkoff + delalloc_blklen; + delalloc_blklen = nilfs_find_uncommitted_extent( + inode, blkoff, &delalloc_blkoff); + continue; + } + + /* + * Limit the number of blocks that we look up so as + * not to get into the next delayed allocation extent. 
+ */ + maxblocks = INT_MAX; + if (delalloc_blklen) + maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, + maxblocks); + blkphy = 0; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + n = nilfs_bmap_lookup_contig( + NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + + if (n < 0) { + int past_eof; + + if (unlikely(n != -ENOENT)) + break; /* error */ + + /* HOLE */ + blkoff++; + past_eof = ((blkoff << blkbits) >= isize); + + if (size) { + /* End of the current extent */ + + if (past_eof) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + size = 0; + } + if (blkoff > end_blkoff || past_eof) + break; + } else { + if (size) { + if (phys && blkphy << blkbits == phys + size) { + /* The current extent goes on */ + size += n << blkbits; + } else { + /* Terminate the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, + flags); + if (ret || blkoff > end_blkoff) + break; + + /* Start another extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + } else { + /* Start a new extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + blkoff += n; + } + cond_resched(); + } while (true); + + /* If ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c new file mode 100644 index 00000000..3e654273 --- /dev/null +++ b/fs/nilfs2/ioctl.c @@ -0,0 +1,863 @@ +/* + * ioctl.c - NILFS ioctl operations. + * + * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . 
+ */ + +#include +#include +#include +#include /* capable() */ +#include /* copy_from_user(), copy_to_user() */ +#include +#include /* compat_ptr() */ +#include /* mnt_want_write(), mnt_drop_write() */ +#include +#include +#include "nilfs.h" +#include "segment.h" +#include "bmap.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" + + +static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, + struct nilfs_argv *argv, int dir, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) +{ + void *buf; + void __user *base = (void __user *)(unsigned long)argv->v_base; + size_t maxmembs, total, n; + ssize_t nr; + int ret, i; + __u64 pos, ppos; + + if (argv->v_nmembs == 0) + return 0; + + if (argv->v_size > PAGE_SIZE) + return -EINVAL; + + buf = (void *)__get_free_pages(GFP_NOFS, 0); + if (unlikely(!buf)) + return -ENOMEM; + maxmembs = PAGE_SIZE / argv->v_size; + + ret = 0; + total = 0; + pos = argv->v_index; + for (i = 0; i < argv->v_nmembs; i += n) { + n = (argv->v_nmembs - i < maxmembs) ? + argv->v_nmembs - i : maxmembs; + if ((dir & _IOC_WRITE) && + copy_from_user(buf, base + argv->v_size * i, + argv->v_size * n)) { + ret = -EFAULT; + break; + } + ppos = pos; + nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size, + n); + if (nr < 0) { + ret = nr; + break; + } + if ((dir & _IOC_READ) && + copy_to_user(base + argv->v_size * i, buf, + argv->v_size * nr)) { + ret = -EFAULT; + break; + } + total += nr; + if ((size_t)nr < n) + break; + if (pos == ppos) + pos += n; + } + argv->v_nmembs = total; + + free_pages((unsigned long)buf, 0); + return ret; +} + +static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) +{ + unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; + + return put_user(flags, (int __user *)argp); +} + +static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, + void __user *argp) +{ + struct nilfs_transaction_info ti; + unsigned int flags, oldflags; + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)argp)) + return -EFAULT; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + flags = nilfs_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); + + oldflags = NILFS_I(inode)->i_flags; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by the + * relevant capability. 
+ */ + ret = -EPERM; + if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + + ret = nilfs_transaction_begin(inode->i_sb, &ti, 0); + if (ret) + goto out; + + NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) | + (flags & FS_FL_USER_MODIFIABLE); + + nilfs_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_mark_inode_dirty(inode); + ret = nilfs_transaction_commit(inode->i_sb); +out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(filp->f_path.mnt); + return ret; +} + +static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) +{ + return put_user(inode->i_generation, (int __user *)argp); +} + +static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + struct nilfs_cpmode cpmode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&cpmode, argp, sizeof(cpmode))) + goto out; + + down_read(&inode->i_sb->s_umount); + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_change_cpmode( + nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ + + up_read(&inode->i_sb->s_umount); +out: + mnt_drop_write(filp->f_path.mnt); + return ret; +} + +static int +nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + __u64 cno; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&cno, argp, sizeof(cno))) + goto out; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_delete_checkpoint(nilfs->ns_cpfile, cno); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ +out: + mnt_drop_write(filp->f_path.mnt); + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf, + size, nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_cpstat cpstat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &cpstat, sizeof(cpstat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size, + nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, + unsigned 
int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_sustat sustat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &sustat, sizeof(sustat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + down_read(&nilfs->ns_segctor_sem); + for (i = 0; i < nmembs; i++) { + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) { + up_read(&nilfs->ns_segctor_sem); + return ret; + } + bdescs[i].bd_blocknr = 0; + } + } + up_read(&nilfs->ns_segctor_sem); + return nmembs; +} + +static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + if (argv.v_size != sizeof(struct nilfs_bdesc)) + return -EINVAL; + + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_bdescs); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_move_inode_block(struct inode *inode, + struct nilfs_vdesc *vdesc, + struct list_head *buffers) +{ + struct buffer_head *bh; + int ret; + + if (vdesc->vd_flags == 0) + ret = nilfs_gccache_submit_read_data( + inode, vdesc->vd_offset, vdesc->vd_blocknr, + vdesc->vd_vblocknr, &bh); + else + ret = nilfs_gccache_submit_read_node( + inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh); + + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + printk(KERN_CRIT + "%s: invalid virtual block address (%s): " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + return ret; + } + if (unlikely(!list_empty(&bh->b_assoc_buffers))) { + printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, " + "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? 
"node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + brelse(bh); + return -EEXIST; + } + list_add_tail(&bh->b_assoc_buffers, buffers); + return 0; +} + +static int nilfs_ioctl_move_blocks(struct super_block *sb, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct the_nilfs *nilfs = sb->s_fs_info; + struct inode *inode; + struct nilfs_vdesc *vdesc; + struct buffer_head *bh, *n; + LIST_HEAD(buffers); + ino_t ino; + __u64 cno; + int i, ret; + + for (i = 0, vdesc = buf; i < nmembs; ) { + ino = vdesc->vd_ino; + cno = vdesc->vd_cno; + inode = nilfs_iget_for_gc(sb, ino, cno); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto failed; + } + if (list_empty(&NILFS_I(inode)->i_dirty)) { + /* + * Add the inode to GC inode list. Garbage Collection + * is serialized and no two processes manipulate the + * list simultaneously. + */ + igrab(inode); + list_add(&NILFS_I(inode)->i_dirty, + &nilfs->ns_gc_inodes); + } + + do { + ret = nilfs_ioctl_move_inode_block(inode, vdesc, + &buffers); + if (unlikely(ret < 0)) { + iput(inode); + goto failed; + } + vdesc++; + } while (++i < nmembs && + vdesc->vd_ino == ino && vdesc->vd_cno == cno); + + iput(inode); /* The inode still remains in GC inode list */ + } + + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + ret = nilfs_gccache_wait_and_mark_dirty(bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -EEXIST); + goto failed; + } + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return nmembs; + + failed: + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return ret; +} + +static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct inode *cpfile = nilfs->ns_cpfile; + struct nilfs_period *periods = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_cpfile_delete_checkpoints( + cpfile, periods[i].p_start, periods[i].p_end); + if (ret < 0) + return ret; + } + return nmembs; +} + +static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + int ret; + + ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs); + + return (ret < 0) ? 
ret : nmembs; +} + +static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + /* XXX: use macro or inline func to check liveness */ + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr) + /* skip dead block */ + continue; + if (bdescs[i].bd_level == 0) { + ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat, + bdescs[i].bd_offset); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } else { + ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset, + bdescs[i].bd_level); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } + } + return nmembs; +} + +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void **kbufs) +{ + const char *msg; + int ret; + + ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); + if (ret < 0) { + /* + * can safely abort because checkpoints can be removed + * independently. + */ + msg = "cannot delete checkpoints"; + goto failed; + } + ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], kbufs[2]); + if (ret < 0) { + /* + * can safely abort because DAT file is updated atomically + * using a copy-on-write technique. + */ + msg = "cannot delete virtual blocks from DAT file"; + goto failed; + } + ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], kbufs[3]); + if (ret < 0) { + /* + * can safely abort because the operation is nondestructive. + */ + msg = "cannot mark copying blocks dirty"; + goto failed; + } + return 0; + + failed: + printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", + msg, ret); + return ret; +} + +static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct nilfs_argv argv[5]; + static const size_t argsz[5] = { + sizeof(struct nilfs_vdesc), + sizeof(struct nilfs_period), + sizeof(__u64), + sizeof(struct nilfs_bdesc), + sizeof(__u64), + }; + void __user *base; + void *kbufs[5]; + struct the_nilfs *nilfs; + size_t len, nsegs; + int n, ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(argv, argp, sizeof(argv))) + goto out; + + ret = -EINVAL; + nsegs = argv[4].v_nmembs; + if (argv[4].v_size != argsz[4]) + goto out; + + /* + * argv[4] points to segment numbers this ioctl cleans. We + * use kmalloc() for its buffer because memory used for the + * segment numbers is enough small. 
+ */ + kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, + nsegs * sizeof(__u64)); + if (IS_ERR(kbufs[4])) { + ret = PTR_ERR(kbufs[4]); + goto out; + } + nilfs = inode->i_sb->s_fs_info; + + for (n = 0; n < 4; n++) { + ret = -EINVAL; + if (argv[n].v_size != argsz[n]) + goto out_free; + + if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment) + goto out_free; + + len = argv[n].v_size * argv[n].v_nmembs; + base = (void __user *)(unsigned long)argv[n].v_base; + if (len == 0) { + kbufs[n] = NULL; + continue; + } + + kbufs[n] = vmalloc(len); + if (!kbufs[n]) { + ret = -ENOMEM; + goto out_free; + } + if (copy_from_user(kbufs[n], base, len)) { + ret = -EFAULT; + vfree(kbufs[n]); + goto out_free; + } + } + + /* + * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(), + * which will operates an inode list without blocking. + * To protect the list from concurrent operations, + * nilfs_ioctl_move_blocks should be atomic operation. + */ + if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) { + ret = -EBUSY; + goto out_free; + } + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); + if (ret < 0) + printk(KERN_ERR "NILFS: GC failed during preparation: " + "cannot read source blocks: err=%d\n", ret); + else + ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + + nilfs_remove_all_gcinodes(nilfs); + clear_nilfs_gc_running(nilfs); + +out_free: + while (--n >= 0) + vfree(kbufs[n]); + kfree(kbufs[4]); +out: + mnt_drop_write(filp->f_path.mnt); + return ret; +} + +static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + __u64 cno; + int ret; + struct the_nilfs *nilfs; + + ret = nilfs_construct_segment(inode->i_sb); + if (ret < 0) + return ret; + + if (argp != NULL) { + nilfs = inode->i_sb->s_fs_info; + down_read(&nilfs->ns_segctor_sem); + cno = nilfs->ns_cno - 1; + up_read(&nilfs->ns_segctor_sem); + if (copy_to_user(argp, &cno, sizeof(cno))) + return -EFAULT; + } + return 0; +} + +static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, + void __user *argp) +{ + __u64 newsize; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + goto out; + + ret = -EFAULT; + if (copy_from_user(&newsize, argp, sizeof(newsize))) + goto out_drop_write; + + ret = nilfs_resize_fs(inode->i_sb, newsize); + +out_drop_write: + mnt_drop_write(filp->f_path.mnt); +out: + return ret; +} + +static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 range[2]; + __u64 minseg, maxseg; + unsigned long segbytes; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EFAULT; + if (copy_from_user(range, argp, sizeof(__u64[2]))) + goto out; + + ret = -ERANGE; + if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode)) + goto out; + + segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize; + + minseg = range[0] + segbytes - 1; + do_div(minseg, segbytes); + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + do_div(maxseg, segbytes); + maxseg--; + + ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg); +out: + return ret; +} + +static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp, + size_t membsz, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) + +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 
+ struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + if (argv.v_size < membsz) + return -EINVAL; + + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return nilfs_ioctl_getflags(inode, argp); + case FS_IOC_SETFLAGS: + return nilfs_ioctl_setflags(inode, filp, argp); + case FS_IOC_GETVERSION: + return nilfs_ioctl_getversion(inode, argp); + case NILFS_IOCTL_CHANGE_CPMODE: + return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); + case NILFS_IOCTL_DELETE_CHECKPOINT: + return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_cpinfo), + nilfs_ioctl_do_get_cpinfo); + case NILFS_IOCTL_GET_CPSTAT: + return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_suinfo), + nilfs_ioctl_do_get_suinfo); + case NILFS_IOCTL_GET_SUSTAT: + return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_VINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_vinfo), + nilfs_ioctl_do_get_vinfo); + case NILFS_IOCTL_GET_BDESCS: + return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp); + case NILFS_IOCTL_CLEAN_SEGMENTS: + return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); + case NILFS_IOCTL_SYNC: + return nilfs_ioctl_sync(inode, filp, cmd, argp); + case NILFS_IOCTL_RESIZE: + return nilfs_ioctl_resize(inode, filp, argp); + case NILFS_IOCTL_SET_ALLOC_RANGE: + return nilfs_ioctl_set_alloc_range(inode, argp); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + case NILFS_IOCTL_CHANGE_CPMODE: + case NILFS_IOCTL_DELETE_CHECKPOINT: + case NILFS_IOCTL_GET_CPINFO: + case NILFS_IOCTL_GET_CPSTAT: + case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_GET_SUSTAT: + case NILFS_IOCTL_GET_VINFO: + case NILFS_IOCTL_GET_BDESCS: + case NILFS_IOCTL_CLEAN_SEGMENTS: + case NILFS_IOCTL_SYNC: + case NILFS_IOCTL_RESIZE: + case NILFS_IOCTL_SET_ALLOC_RANGE: + break; + default: + return -ENOIOCTLCMD; + } + return nilfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c new file mode 100644 index 00000000..800e8d78 --- /dev/null +++ b/fs/nilfs2/mdt.c @@ -0,0 +1,589 @@ +/* + * mdt.c - meta data file for NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btnode.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) + + +static int +nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, + struct buffer_head *bh, + void (*init_block)(struct inode *, + struct buffer_head *, void *)) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + void *kaddr; + int ret; + + /* Caller exclude read accesses using page lock */ + + /* set_buffer_new(bh); */ + bh->b_blocknr = 0; + + ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); + if (unlikely(ret)) + return ret; + + set_buffer_mapped(bh); + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + if (init_block) + init_block(inode, bh, kaddr); + flush_dcache_page(bh->b_page); + kunmap_atomic(kaddr, KM_USER0); + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + return 0; +} + +static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh, + void (*init_block)(struct inode *, + struct buffer_head *, + void *)) +{ + struct super_block *sb = inode->i_sb; + struct nilfs_transaction_info ti; + struct buffer_head *bh; + int err; + + nilfs_transaction_begin(sb, &ti, 0); + + err = -ENOMEM; + bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); + if (unlikely(!bh)) + goto failed_unlock; + + err = -EEXIST; + if (buffer_uptodate(bh)) + goto failed_bh; + + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + goto failed_bh; + + bh->b_bdev = sb->s_bdev; + err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); + if (likely(!err)) { + get_bh(bh); + *out_bh = bh; + } + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + + failed_unlock: + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + + return err; +} + +static int +nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, + int mode, struct buffer_head **out_bh) +{ + struct buffer_head *bh; + __u64 blknum = 0; + int ret = -ENOMEM; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + goto failed; + + ret = -EEXIST; /* internal code */ + if (buffer_uptodate(bh)) + goto out; + + if (mode == READA) { + if (!trylock_buffer(bh)) { + ret = -EBUSY; + goto failed_bh; + } + } else /* mode == READ */ + lock_buffer(bh); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; + } + map_bh(bh, inode->i_sb, (sector_t)blknum); + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + ret = 0; + out: + get_bh(bh); + *out_bh = bh; + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + failed: + return ret; +} + +static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, + int readahead, struct buffer_head **out_bh) +{ + struct buffer_head *first_bh, *bh; + unsigned long blkoff; + int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; 
+	int err;
+
+	err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
+	if (err == -EEXIST) /* internal code */
+		goto out;
+
+	if (unlikely(err))
+		goto failed;
+
+	if (readahead) {
+		blkoff = block + 1;
+		for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+			err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+			if (likely(!err || err == -EEXIST))
+				brelse(bh);
+			else if (err != -EBUSY)
+				break;
+				/* abort readahead if bmap lookup failed */
+			if (!buffer_locked(first_bh))
+				goto out_no_wait;
+		}
+	}
+
+	wait_on_buffer(first_bh);
+
+ out_no_wait:
+	err = -EIO;
+	if (!buffer_uptodate(first_bh))
+		goto failed_bh;
+ out:
+	*out_bh = first_bh;
+	return 0;
+
+ failed_bh:
+	brelse(first_bh);
+ failed:
+	return err;
+}
+
+/**
+ * nilfs_mdt_get_block - read or create a buffer on meta data file.
+ * @inode: inode of the meta data file
+ * @blkoff: block offset
+ * @create: create flag
+ * @init_block: initializer used for newly allocated block
+ * @out_bh: output of a pointer to the buffer_head
+ *
+ * nilfs_mdt_get_block() looks up the specified buffer and tries to create
+ * a new buffer if @create is not zero. On success, the returned buffer is
+ * assured to be either existing or formatted using a buffer lock.
+ * @out_bh is substituted only when zero is returned.
+ *
+ * Return Value: On success, it returns 0. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - the specified block does not exist (hole block)
+ *
+ * %-EROFS - Read only filesystem (for create mode)
+ */
+int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
+			void (*init_block)(struct inode *,
+					   struct buffer_head *, void *),
+			struct buffer_head **out_bh)
+{
+	int ret;
+
+	/* Should be rewritten with merging nilfs_mdt_read_block() */
+ retry:
+	ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
+	if (!create || ret != -ENOENT)
+		return ret;
+
+	ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
+	if (unlikely(ret == -EEXIST)) {
+		/* create = 0; */ /* limit read-create loop retries */
+		goto retry;
+	}
+	return ret;
+}
+
+/**
+ * nilfs_mdt_delete_block - make a hole on the meta data file.
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * Return Value: On success, zero is returned.
+ * On error, one of the following negative error codes is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ */
+int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	int err;
+
+	err = nilfs_bmap_delete(ii->i_bmap, block);
+	if (!err || err == -ENOENT) {
+		nilfs_mdt_mark_dirty(inode);
+		nilfs_mdt_forget_block(inode, block);
+	}
+	return err;
+}
+
+/**
+ * nilfs_mdt_forget_block - discard dirty state and try to remove the page
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and
+ * tries to release the page including the buffer from a page cache.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EBUSY - page has an active buffer.
+ *
+ * %-ENOENT - page cache has no page addressed by the offset.
+ */ +int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) +{ + pgoff_t index = (pgoff_t)block >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + struct page *page; + unsigned long first_block; + int ret = 0; + int still_dirty; + + page = find_lock_page(inode->i_mapping, index); + if (!page) + return -ENOENT; + + wait_on_page_writeback(page); + + first_block = (unsigned long)index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (page_has_buffers(page)) { + struct buffer_head *bh; + + bh = nilfs_page_get_nth_block(page, block - first_block); + nilfs_forget_buffer(bh); + } + still_dirty = PageDirty(page); + unlock_page(page); + page_cache_release(page); + + if (still_dirty || + invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) + ret = -EBUSY; + return ret; +} + +/** + * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + */ +int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) +{ + struct buffer_head *bh; + int err; + + err = nilfs_mdt_read_block(inode, block, 0, &bh); + if (unlikely(err)) + return err; + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + brelse(bh); + return 0; +} + +int nilfs_mdt_fetch_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { + set_bit(NILFS_I_DIRTY, &ii->i_state); + return 1; + } + return test_bit(NILFS_I_DIRTY, &ii->i_state); +} + +static int +nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode; + struct super_block *sb; + int err = 0; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + inode = page->mapping->host; + if (!inode) + return 0; + + sb = inode->i_sb; + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_segment(sb); + else if (wbc->for_reclaim) + nilfs_flush_segment(sb, inode->i_ino); + + return err; +} + + +static const struct address_space_operations def_mdt_aops = { + .writepage = nilfs_mdt_write_page, +}; + +static const struct inode_operations def_mdt_iops; +static const struct file_operations def_mdt_fops; + + +int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) +{ + struct nilfs_mdt_info *mi; + + mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS); + if (!mi) + return -ENOMEM; + + init_rwsem(&mi->mi_sem); + inode->i_private = mi; + + inode->i_mode = S_IFREG; + mapping_set_gfp_mask(inode->i_mapping, gfp_mask); + inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; + + inode->i_op = &def_mdt_iops; + inode->i_fop = &def_mdt_fops; + inode->i_mapping->a_ops = &def_mdt_aops; + + return 0; +} + +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, + unsigned header_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_entry_size = entry_size; + mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); +} + +/** + * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file + * @inode: inode of the metadata file + * @shadow: shadow mapping + */ +int nilfs_mdt_setup_shadow_map(struct inode *inode, + struct nilfs_shadow_map *shadow) +{ + struct nilfs_mdt_info *mi = 
NILFS_MDT(inode); + struct backing_dev_info *bdi = inode->i_sb->s_bdi; + + INIT_LIST_HEAD(&shadow->frozen_buffers); + address_space_init_once(&shadow->frozen_data); + nilfs_mapping_init(&shadow->frozen_data, inode, bdi); + address_space_init_once(&shadow->frozen_btnodes); + nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); + mi->mi_shadow = shadow; + return 0; +} + +/** + * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map + * @inode: inode of the metadata file + */ +int nilfs_mdt_save_to_shadow_map(struct inode *inode) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_shadow_map *shadow = mi->mi_shadow; + int ret; + + ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping); + if (ret) + goto out; + + ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes, + &ii->i_btnode_cache); + if (ret) + goto out; + + nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store); + out: + return ret; +} + +int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) +{ + struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; + struct buffer_head *bh_frozen; + struct page *page; + int blkbits = inode->i_blkbits; + + page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); + if (!page) + return -ENOMEM; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, 0); + + bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); + + if (!buffer_uptodate(bh_frozen)) + nilfs_copy_buffer(bh_frozen, bh); + if (list_empty(&bh_frozen->b_assoc_buffers)) { + list_add_tail(&bh_frozen->b_assoc_buffers, + &shadow->frozen_buffers); + set_buffer_nilfs_redirected(bh); + } else { + brelse(bh_frozen); /* already frozen */ + } + + unlock_page(page); + page_cache_release(page); + return 0; +} + +struct buffer_head * +nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) +{ + struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; + struct buffer_head *bh_frozen = NULL; + struct page *page; + int n; + + page = find_lock_page(&shadow->frozen_data, bh->b_page->index); + if (page) { + if (page_has_buffers(page)) { + n = bh_offset(bh) >> inode->i_blkbits; + bh_frozen = nilfs_page_get_nth_block(page, n); + } + unlock_page(page); + page_cache_release(page); + } + return bh_frozen; +} + +static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow) +{ + struct list_head *head = &shadow->frozen_buffers; + struct buffer_head *bh; + + while (!list_empty(head)) { + bh = list_first_entry(head, struct buffer_head, + b_assoc_buffers); + list_del_init(&bh->b_assoc_buffers); + brelse(bh); /* drop ref-count to make it releasable */ + } +} + +/** + * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state + * @inode: inode of the metadata file + */ +void nilfs_mdt_restore_from_shadow_map(struct inode *inode) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_shadow_map *shadow = mi->mi_shadow; + + down_write(&mi->mi_sem); + + if (mi->mi_palloc_cache) + nilfs_palloc_clear_cache(inode); + + nilfs_clear_dirty_pages(inode->i_mapping); + nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data); + + nilfs_clear_dirty_pages(&ii->i_btnode_cache); + nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes); + + nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); + + up_write(&mi->mi_sem); +} + +/** + * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches + * @inode: 
inode of the metadata file + */ +void nilfs_mdt_clear_shadow_map(struct inode *inode) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct nilfs_shadow_map *shadow = mi->mi_shadow; + + down_write(&mi->mi_sem); + nilfs_release_frozen_buffers(shadow); + truncate_inode_pages(&shadow->frozen_data, 0); + truncate_inode_pages(&shadow->frozen_btnodes, 0); + up_write(&mi->mi_sem); +} diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h new file mode 100644 index 00000000..ab20a4ba --- /dev/null +++ b/fs/nilfs2/mdt.h @@ -0,0 +1,110 @@ +/* + * mdt.h - NILFS meta data file prototype and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ + +#ifndef _NILFS_MDT_H +#define _NILFS_MDT_H + +#include +#include +#include "nilfs.h" +#include "page.h" + +struct nilfs_shadow_map { + struct nilfs_bmap_store bmap_store; + struct address_space frozen_data; + struct address_space frozen_btnodes; + struct list_head frozen_buffers; +}; + +/** + * struct nilfs_mdt_info - on-memory private data of meta data files + * @mi_sem: reader/writer semaphore for meta data operations + * @mi_bgl: per-blockgroup locking + * @mi_entry_size: size of an entry + * @mi_first_entry_offset: offset to the first entry + * @mi_entries_per_block: number of entries in a block + * @mi_palloc_cache: persistent object allocator cache + * @mi_shadow: shadow of bmap and page caches + * @mi_blocks_per_group: number of blocks in a group + * @mi_blocks_per_desc_block: number of blocks per descriptor block + */ +struct nilfs_mdt_info { + struct rw_semaphore mi_sem; + struct blockgroup_lock *mi_bgl; + unsigned mi_entry_size; + unsigned mi_first_entry_offset; + unsigned long mi_entries_per_block; + struct nilfs_palloc_cache *mi_palloc_cache; + struct nilfs_shadow_map *mi_shadow; + unsigned long mi_blocks_per_group; + unsigned long mi_blocks_per_desc_block; +}; + +static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) +{ + return inode->i_private; +} + +/* Default GFP flags using highmem */ +#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) + +int nilfs_mdt_get_block(struct inode *, unsigned long, int, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **); +int nilfs_mdt_delete_block(struct inode *, unsigned long); +int nilfs_mdt_forget_block(struct inode *, unsigned long); +int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); +int nilfs_mdt_fetch_dirty(struct inode *); + +int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz); +void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); + +int nilfs_mdt_setup_shadow_map(struct inode *inode, + struct nilfs_shadow_map *shadow); +int nilfs_mdt_save_to_shadow_map(struct inode *inode); +void 
nilfs_mdt_restore_from_shadow_map(struct inode *inode); +void nilfs_mdt_clear_shadow_map(struct inode *inode); +int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh); +struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode, + struct buffer_head *bh); + +static inline void nilfs_mdt_mark_dirty(struct inode *inode) +{ + if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) + set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline void nilfs_mdt_clear_dirty(struct inode *inode) +{ + clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline __u64 nilfs_mdt_cno(struct inode *inode) +{ + return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno; +} + +#define nilfs_mdt_bgl_lock(inode, bg) \ + (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock) + +#endif /* _NILFS_MDT_H */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c new file mode 100644 index 00000000..546849b3 --- /dev/null +++ b/fs/nilfs2/namei.c @@ -0,0 +1,594 @@ +/* + * namei.c - NILFS pathname lookup operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji , + * Ryusuke Konishi + */ +/* + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include "nilfs.h" +#include "export.h" + +#define NILFS_FID_SIZE_NON_CONNECTABLE \ + (offsetof(struct nilfs_fid, parent_gen) / 4) +#define NILFS_FID_SIZE_CONNECTABLE (sizeof(struct nilfs_fid) / 4) + +static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. + */ + +static struct dentry * +nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > NILFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = nilfs_inode_by_name(dir, &dentry->d_name); + inode = NULL; + if (ino) { + inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
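+ *
+ * Like the other directory operations below, the update is wrapped in
+ * nilfs_transaction_begin()/nilfs_transaction_commit() so that it is
+ * committed as a whole or aborted with nilfs_transaction_abort() on
+ * error.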
+ */ +static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + nilfs_mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int +nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + nilfs_mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = dir->i_sb; + unsigned l = strlen(symname)+1; + struct inode *inode; + int err; + + if (l > sb->s_blocksize) + return -ENAMETOOLONG; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + /* slow symlink */ + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + /* mark_inode_dirty(inode); */ + /* page_symlink() do this */ + + err = nilfs_add_nondir(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + iput(inode); + goto out; +} + +static int nilfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + if (inode->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + ihold(inode); + + err = nilfs_add_nondir(dentry, inode); + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (dir->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inc_nlink(dir); + + inode = nilfs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + + inc_nlink(inode); + + err = nilfs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = nilfs_add_link(dentry, inode); + if 
(err) + goto out_fail; + + nilfs_mark_inode_dirty(inode); + d_instantiate(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + drop_nlink(inode); + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + iput(inode); +out_dir: + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); + goto out; +} + +static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct nilfs_dir_entry *de; + struct page *page; + int err; + + err = -ENOENT; + de = nilfs_find_entry(dir, &dentry->d_name, &page); + if (!de) + goto out; + + inode = dentry->d_inode; + err = -EIO; + if (le64_to_cpu(de->inode) != inode->i_ino) + goto out; + + if (!inode->i_nlink) { + nilfs_warning(inode->i_sb, __func__, + "deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + err = nilfs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + drop_nlink(inode); + err = 0; +out: + return err; +} + +static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = nilfs_do_unlink(dir, dentry); + + if (!err) { + nilfs_mark_inode_dirty(dir); + nilfs_mark_inode_dirty(dentry->d_inode); + err = nilfs_transaction_commit(dir->i_sb); + } else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOTEMPTY; + if (nilfs_empty_dir(inode)) { + err = nilfs_do_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); + } + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct nilfs_dir_entry *dir_de = NULL; + struct page *old_page; + struct nilfs_dir_entry *old_de; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); + if (unlikely(err)) + return err; + + err = -ENOENT; + old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = nilfs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct nilfs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !nilfs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_de) + goto out_dir; + nilfs_set_link(new_dir, new_de, new_page, old_inode); + nilfs_mark_inode_dirty(new_dir); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + drop_nlink(new_inode); + nilfs_mark_inode_dirty(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= NILFS_LINK_MAX) + goto out_dir; + } + err = nilfs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if 
(dir_de) { + inc_nlink(new_dir); + nilfs_mark_inode_dirty(new_dir); + } + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME; + + nilfs_delete_entry(old_de, old_page); + + if (dir_de) { + nilfs_set_link(old_inode, dir_de, dir_page, new_dir); + drop_nlink(old_dir); + } + nilfs_mark_inode_dirty(old_dir); + nilfs_mark_inode_dirty(old_inode); + + err = nilfs_transaction_commit(old_dir->i_sb); + return err; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + nilfs_transaction_abort(old_dir->i_sb); + return err; +} + +/* + * Export operations + */ +static struct dentry *nilfs_get_parent(struct dentry *child) +{ + unsigned long ino; + struct inode *inode; + struct qstr dotdot = {.name = "..", .len = 2}; + struct nilfs_root *root; + + ino = nilfs_inode_by_name(child->d_inode, &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + + root = NILFS_I(child->d_inode)->i_root; + + inode = nilfs_iget(child->d_inode->i_sb, root, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(inode); +} + +static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno, + u64 ino, u32 gen) +{ + struct nilfs_root *root; + struct inode *inode; + + if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO) + return ERR_PTR(-ESTALE); + + root = nilfs_lookup_root(sb->s_fs_info, cno); + if (!root) + return ERR_PTR(-ESTALE); + + inode = nilfs_iget(sb, root, ino); + nilfs_put_root(root); + + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (gen && inode->i_generation != gen) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct nilfs_fid *fid = (struct nilfs_fid *)fh; + + if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE && + fh_len != NILFS_FID_SIZE_CONNECTABLE) || + (fh_type != FILEID_NILFS_WITH_PARENT && + fh_type != FILEID_NILFS_WITHOUT_PARENT)) + return NULL; + + return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen); +} + +static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct nilfs_fid *fid = (struct nilfs_fid *)fh; + + if (fh_len != NILFS_FID_SIZE_CONNECTABLE || + fh_type != FILEID_NILFS_WITH_PARENT) + return NULL; + + return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen); +} + +static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp, + int connectable) +{ + struct nilfs_fid *fid = (struct nilfs_fid *)fh; + struct inode *inode = dentry->d_inode; + struct nilfs_root *root = NILFS_I(inode)->i_root; + int type; + + if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE || + (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE)) + return 255; + + fid->cno = root->cno; + fid->ino = inode->i_ino; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + fid->parent_ino = parent->i_ino; + fid->parent_gen = parent->i_generation; + spin_unlock(&dentry->d_lock); + + type = FILEID_NILFS_WITH_PARENT; + *lenp = NILFS_FID_SIZE_CONNECTABLE; + } else { + type = FILEID_NILFS_WITHOUT_PARENT; + *lenp = NILFS_FID_SIZE_NON_CONNECTABLE; + } + + return type; +} + +const struct inode_operations nilfs_dir_inode_operations = { + .create = nilfs_create, + .lookup = nilfs_lookup, + .link = 
nilfs_link, + .unlink = nilfs_unlink, + .symlink = nilfs_symlink, + .mkdir = nilfs_mkdir, + .rmdir = nilfs_rmdir, + .mknod = nilfs_mknod, + .rename = nilfs_rename, + .setattr = nilfs_setattr, + .permission = nilfs_permission, + .fiemap = nilfs_fiemap, +}; + +const struct inode_operations nilfs_special_inode_operations = { + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +const struct inode_operations nilfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .permission = nilfs_permission, +}; + +const struct export_operations nilfs_export_ops = { + .encode_fh = nilfs_encode_fh, + .fh_to_dentry = nilfs_fh_to_dentry, + .fh_to_parent = nilfs_fh_to_parent, + .get_parent = nilfs_get_parent, +}; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h new file mode 100644 index 00000000..f02b9ad4 --- /dev/null +++ b/fs/nilfs2/nilfs.h @@ -0,0 +1,326 @@ +/* + * nilfs.h - NILFS local header file. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato + * Ryusuke Konishi + */ + +#ifndef _NILFS_H +#define _NILFS_H + +#include +#include +#include +#include +#include +#include "the_nilfs.h" +#include "bmap.h" + +/* + * nilfs inode data in memory + */ +struct nilfs_inode_info { + __u32 i_flags; + unsigned long i_state; /* Dynamic state flags */ + struct nilfs_bmap *i_bmap; + struct nilfs_bmap i_bmap_data; + __u64 i_xattr; /* sector_t ??? */ + __u32 i_dir_start_lookup; + __u64 i_cno; /* check point number for GC inode */ + struct address_space i_btnode_cache; + struct list_head i_dirty; /* List for connecting dirty files */ + +#ifdef CONFIG_NILFS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_sem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. 
+ */ + struct rw_semaphore xattr_sem; +#endif + struct buffer_head *i_bh; /* i_bh contains a new or dirty + disk inode */ + struct nilfs_root *i_root; + struct inode vfs_inode; +}; + +static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode) +{ + return container_of(inode, struct nilfs_inode_info, vfs_inode); +} + +static inline struct nilfs_inode_info * +NILFS_BMAP_I(const struct nilfs_bmap *bmap) +{ + return container_of(bmap, struct nilfs_inode_info, i_bmap_data); +} + +static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) +{ + struct nilfs_inode_info *ii = + container_of(btnc, struct nilfs_inode_info, i_btnode_cache); + return &ii->vfs_inode; +} + +/* + * Dynamic state flags of NILFS on-memory inode (i_state) + */ +enum { + NILFS_I_NEW = 0, /* Inode is newly created */ + NILFS_I_DIRTY, /* The file is dirty */ + NILFS_I_QUEUED, /* inode is in dirty_files list */ + NILFS_I_BUSY, /* inode is grabbed by a segment + constructor */ + NILFS_I_COLLECTED, /* All dirty blocks are collected */ + NILFS_I_UPDATED, /* The file has been written back */ + NILFS_I_INODE_DIRTY, /* write_inode is requested */ + NILFS_I_BMAP, /* has bmap and btnode_cache */ + NILFS_I_GCINODE, /* inode for GC, on memory only */ +}; + +/* + * commit flags for nilfs_commit_super and nilfs_sync_super + */ +enum { + NILFS_SB_COMMIT = 0, /* Commit a super block alternately */ + NILFS_SB_COMMIT_ALL /* Commit both super blocks */ +}; + +/* + * Macros to check inode numbers + */ +#define NILFS_MDT_INO_BITS \ + ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ + 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ + 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) + +#define NILFS_SYS_INO_BITS \ + ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) + +#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino) + +#define NILFS_MDT_INODE(sb, ino) \ + ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) +#define NILFS_VALID_INODE(sb, ino) \ + ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) + +/** + * struct nilfs_transaction_info: context information for synchronization + * @ti_magic: Magic number + * @ti_save: Backup of journal_info field of task_struct + * @ti_flags: Flags + * @ti_count: Nest level + * @ti_garbage: List of inode to be put when releasing semaphore + */ +struct nilfs_transaction_info { + u32 ti_magic; + void *ti_save; + /* This should never used. If this happens, + one of other filesystems has a bug. */ + unsigned short ti_flags; + unsigned short ti_count; + struct list_head ti_garbage; +}; + +/* ti_magic */ +#define NILFS_TI_MAGIC 0xd9e392fb + +/* ti_flags */ +#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ +#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the + end of transaction. 
*/ +#define NILFS_TI_GC 0x0004 /* GC context */ +#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ +#define NILFS_TI_WRITER 0x0010 /* Constructor context */ + + +int nilfs_transaction_begin(struct super_block *, + struct nilfs_transaction_info *, int); +int nilfs_transaction_commit(struct super_block *); +void nilfs_transaction_abort(struct super_block *); + +static inline void nilfs_set_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= flag; +} + +static inline int nilfs_test_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC) + return 0; + return !!(ti->ti_flags & flag); +} + +static inline int nilfs_doing_gc(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_GC); +} + +static inline int nilfs_doing_construction(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_WRITER); +} + +/* + * function prototype + */ +#ifdef CONFIG_NILFS_POSIX_ACL +#error "NILFS: not yet supported POSIX ACL" +extern int nilfs_acl_chmod(struct inode *); +extern int nilfs_init_acl(struct inode *, struct inode *); +#else +static inline int nilfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) +{ + inode->i_mode &= ~current_umask(); + return 0; +} +#endif + +#define NILFS_ATIME_DISABLE + +/* Flags that should be inherited by new inodes from their parent. */ +#define NILFS_FL_INHERITED \ + (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL | \ + FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\ + FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL) + +/* Mask out flags that are inappropriate for the given type of inode. 
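+ * Directories keep every inherited flag, regular files drop
+ * FS_DIRSYNC_FL and FS_TOPDIR_FL, and other inode types keep only
+ * FS_NODUMP_FL and FS_NOATIME_FL.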
*/ +static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL); + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* dir.c */ +extern int nilfs_add_link(struct dentry *, struct inode *); +extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); +extern int nilfs_make_empty(struct inode *, struct inode *); +extern struct nilfs_dir_entry * +nilfs_find_entry(struct inode *, const struct qstr *, struct page **); +extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); +extern int nilfs_empty_dir(struct inode *); +extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); +extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, + struct page *, struct inode *); + +/* file.c */ +extern int nilfs_sync_file(struct file *, int); + +/* ioctl.c */ +long nilfs_ioctl(struct file *, unsigned int, unsigned long); +long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, + void **); + +/* inode.c */ +void nilfs_inode_add_blocks(struct inode *inode, int n); +void nilfs_inode_sub_blocks(struct inode *inode, int n); +extern struct inode *nilfs_new_inode(struct inode *, int); +extern void nilfs_free_inode(struct inode *); +extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern void nilfs_set_inode_flags(struct inode *); +extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); +extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); +struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, + unsigned long ino); +struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, + unsigned long ino); +struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, + unsigned long ino); +extern struct inode *nilfs_iget_for_gc(struct super_block *sb, + unsigned long ino, __u64 cno); +extern void nilfs_update_inode(struct inode *, struct buffer_head *); +extern void nilfs_truncate(struct inode *); +extern void nilfs_evict_inode(struct inode *); +extern int nilfs_setattr(struct dentry *, struct iattr *); +int nilfs_permission(struct inode *inode, int mask, unsigned int flags); +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); +extern int nilfs_inode_dirty(struct inode *); +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); +extern int nilfs_mark_inode_dirty(struct inode *); +extern void nilfs_dirty_inode(struct inode *, int flags); +int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + +/* super.c */ +extern struct inode *nilfs_alloc_inode(struct super_block *); +extern void nilfs_destroy_inode(struct inode *); +extern void nilfs_error(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void nilfs_warning(struct super_block *, const char *, const char *, ...) 
+ __attribute__ ((format (printf, 3, 4))); +extern struct nilfs_super_block * +nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); +extern int nilfs_store_magic_and_option(struct super_block *, + struct nilfs_super_block *, char *); +extern int nilfs_check_feature_compatibility(struct super_block *, + struct nilfs_super_block *); +extern void nilfs_set_log_cursor(struct nilfs_super_block *, + struct the_nilfs *); +struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, + int flip); +int nilfs_commit_super(struct super_block *sb, int flag); +int nilfs_cleanup_super(struct super_block *sb); +int nilfs_resize_fs(struct super_block *sb, __u64 newsize); +int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, + struct nilfs_root **root); +int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); + +/* gcinode.c */ +int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); +int nilfs_init_gcinode(struct inode *inode); +void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs); + +/* + * Inodes and files operations + */ +extern const struct file_operations nilfs_dir_operations; +extern const struct inode_operations nilfs_file_inode_operations; +extern const struct file_operations nilfs_file_operations; +extern const struct address_space_operations nilfs_aops; +extern const struct inode_operations nilfs_dir_inode_operations; +extern const struct inode_operations nilfs_special_inode_operations; +extern const struct inode_operations nilfs_symlink_inode_operations; + +/* + * filesystem type + */ +extern struct file_system_type nilfs_fs_type; + + +#endif /* _NILFS_H */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c new file mode 100644 index 00000000..65221a04 --- /dev/null +++ b/fs/nilfs2/page.c @@ -0,0 +1,551 @@ +/* + * page.c - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi , + * Seiji Kihara . 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_BUFFER_INHERENT_BITS \ + ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked)) + +static struct buffer_head * +__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, + int blkbits, unsigned long b_state) + +{ + unsigned long first_block; + struct buffer_head *bh; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, b_state); + + first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits); + bh = nilfs_page_get_nth_block(page, block - first_block); + + touch_buffer(bh); + wait_on_buffer(bh); + return bh; +} + +struct buffer_head *nilfs_grab_buffer(struct inode *inode, + struct address_space *mapping, + unsigned long blkoff, + unsigned long b_state) +{ + int blkbits = inode->i_blkbits; + pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); + struct page *page; + struct buffer_head *bh; + + page = grab_cache_page(mapping, index); + if (unlikely(!page)) + return NULL; + + bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + if (unlikely(!bh)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + return bh; +} + +/** + * nilfs_forget_buffer - discard dirty state + * @inode: owner inode of the buffer + * @bh: buffer head of the buffer to be discarded + */ +void nilfs_forget_buffer(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + + lock_buffer(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); + clear_buffer_nilfs_redirected(bh); + clear_buffer_dirty(bh); + if (nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + bh->b_blocknr = -1; + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + unlock_buffer(bh); + brelse(bh); +} + +/** + * nilfs_copy_buffer -- copy buffer data and flags + * @dbh: destination buffer + * @sbh: source buffer + */ +void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) +{ + void *kaddr0, *kaddr1; + unsigned long bits; + struct page *spage = sbh->b_page, *dpage = dbh->b_page; + struct buffer_head *bh; + + kaddr0 = kmap_atomic(spage, KM_USER0); + kaddr1 = kmap_atomic(dpage, KM_USER1); + memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); + kunmap_atomic(kaddr1, KM_USER1); + kunmap_atomic(kaddr0, KM_USER0); + + dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + + bh = dbh; + bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped)); + while ((bh = bh->b_this_page) != dbh) { + lock_buffer(bh); + bits &= bh->b_state; + unlock_buffer(bh); + } + if (bits & (1UL << BH_Uptodate)) + SetPageUptodate(dpage); + else + ClearPageUptodate(dpage); + if (bits & (1UL << BH_Mapped)) + SetPageMappedToDisk(dpage); + else + ClearPageMappedToDisk(dpage); +} + +/** + * nilfs_page_buffers_clean - check if a page has dirty buffers or not. + * @page: page to be checked + * + * nilfs_page_buffers_clean() returns zero if the page has dirty buffers. + * Otherwise, it returns non-zero value. 
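+ *
+ * Callers such as nilfs_forget_buffer() use this check to decide
+ * whether the page-level dirty flag can be cleared as well.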
+ */ +int nilfs_page_buffers_clean(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh)) + return 0; + bh = bh->b_this_page; + } while (bh != head); + return 1; +} + +void nilfs_page_bug(struct page *page) +{ + struct address_space *m; + unsigned long ino; + + if (unlikely(!page)) { + printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); + return; + } + + m = page->mapping; + ino = m ? m->host->i_ino : 0; + + printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " + "mapping=%p ino=%lu\n", + page, atomic_read(&page->_count), + (unsigned long long)page->index, page->flags, m, ino); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int i = 0; + + bh = head = page_buffers(page); + do { + printk(KERN_CRIT + " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n", + i++, bh, atomic_read(&bh->b_count), + (unsigned long long)bh->b_blocknr, bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + } +} + +/** + * nilfs_copy_page -- copy the page with buffers + * @dst: destination page + * @src: source page + * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. + * + * This function is for both data pages and btnode pages. The dirty flag + * should be treated by caller. The page must not be under i/o. + * Both src and dst page must be locked + */ +static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) +{ + struct buffer_head *dbh, *dbufs, *sbh, *sbufs; + unsigned long mask = NILFS_BUFFER_INHERENT_BITS; + + BUG_ON(PageWriteback(dst)); + + sbh = sbufs = page_buffers(src); + if (!page_has_buffers(dst)) + create_empty_buffers(dst, sbh->b_size, 0); + + if (copy_dirty) + mask |= (1UL << BH_Dirty); + + dbh = dbufs = page_buffers(dst); + do { + lock_buffer(sbh); + lock_buffer(dbh); + dbh->b_state = sbh->b_state & mask; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); + + copy_highpage(dst, src); + + if (PageUptodate(src) && !PageUptodate(dst)) + SetPageUptodate(dst); + else if (!PageUptodate(src) && PageUptodate(dst)) + ClearPageUptodate(dst); + if (PageMappedToDisk(src) && !PageMappedToDisk(dst)) + SetPageMappedToDisk(dst); + else if (!PageMappedToDisk(src) && PageMappedToDisk(dst)) + ClearPageMappedToDisk(dst); + + do { + unlock_buffer(sbh); + unlock_buffer(dbh); + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); +} + +int nilfs_copy_dirty_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + int err = 0; + + pagevec_init(&pvec, 0); +repeat: + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) + return 0; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + + lock_page(page); + if (unlikely(!PageDirty(page))) + NILFS_PAGE_BUG(page, "inconsistent dirty state"); + + dpage = grab_cache_page(dmap, page->index); + if (unlikely(!dpage)) { + /* No empty page is added to the page cache */ + err = -ENOMEM; + unlock_page(page); + break; + } + if (unlikely(!page_has_buffers(page))) + NILFS_PAGE_BUG(page, + "found empty page in dat page cache"); + + nilfs_copy_page(dpage, page, 1); + __set_page_dirty_nobuffers(dpage); + + unlock_page(dpage); + page_cache_release(dpage); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + if (likely(!err)) + goto repeat; + return err; +} + +/** + * 
nilfs_copy_back_pages -- copy back pages to original cache from shadow cache + * @dmap: destination page cache + * @smap: source page cache + * + * No pages must no be added to the cache during this process. + * This must be ensured by the caller. + */ +void nilfs_copy_back_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i, n; + pgoff_t index = 0; + int err; + + pagevec_init(&pvec, 0); +repeat: + n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + if (!n) + return; + index = pvec.pages[n - 1]->index + 1; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + pgoff_t offset = page->index; + + lock_page(page); + dpage = find_lock_page(dmap, offset); + if (dpage) { + /* override existing page on the destination cache */ + WARN_ON(PageDirty(dpage)); + nilfs_copy_page(dpage, page, 0); + unlock_page(dpage); + page_cache_release(dpage); + } else { + struct page *page2; + + /* move the page to the destination cache */ + spin_lock_irq(&smap->tree_lock); + page2 = radix_tree_delete(&smap->page_tree, offset); + WARN_ON(page2 != page); + + smap->nrpages--; + spin_unlock_irq(&smap->tree_lock); + + spin_lock_irq(&dmap->tree_lock); + err = radix_tree_insert(&dmap->page_tree, offset, page); + if (unlikely(err < 0)) { + WARN_ON(err == -EEXIST); + page->mapping = NULL; + page_cache_release(page); /* for cache */ + } else { + page->mapping = dmap; + dmap->nrpages++; + if (PageDirty(page)) + radix_tree_tag_set(&dmap->page_tree, + offset, + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&dmap->tree_lock); + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + goto repeat; +} + +void nilfs_clear_dirty_pages(struct address_space *mapping) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + bh = head = page_buffers(page); + do { + lock_buffer(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); + clear_buffer_nilfs_redirected(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + unlock_buffer(bh); + bh = bh->b_this_page; + } while (bh != head); + + __nilfs_clear_page_dirty(page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +unsigned nilfs_page_count_clean_buffers(struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + struct buffer_head *bh, *head; + unsigned nc = 0; + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + bh->b_size; + if (block_end > from && block_start < to && !buffer_dirty(bh)) + nc++; + } + return nc; +} + +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, + struct backing_dev_info *bdi) +{ + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = bdi; + mapping->a_ops = &empty_aops; +} + +/* + * NILFS2 needs clear_page_dirty() in the following two cases: + * + * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears + * page dirty flags when it copies back pages from the shadow 
cache + * (gcdat->{i_mapping,i_btnode_cache}) to its original cache + * (dat->{i_mapping,i_btnode_cache}). + * + * 2) Some B-tree operations like insertion or deletion may dispose buffers + * in dirty state, and this needs to cancel the dirty state of their pages. + */ +int __nilfs_clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock_irq(&mapping->tree_lock); + if (test_bit(PG_dirty, &page->flags)) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&mapping->tree_lock); + return clear_page_dirty_for_io(page); + } + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + return TestClearPageDirty(page); +} + +/** + * nilfs_find_uncommitted_extent - find extent of uncommitted data + * @inode: inode + * @start_blk: start block offset (in) + * @blkoff: start offset of the found extent (out) + * + * This function searches an extent of buffers marked "delayed" which + * starts from a block offset equal to or larger than @start_blk. If + * such an extent was found, this will store the start offset in + * @blkoff and return its length in blocks. Otherwise, zero is + * returned. + */ +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff) +{ + unsigned int i; + pgoff_t index; + unsigned int nblocks_in_page; + unsigned long length = 0; + sector_t b; + struct pagevec pvec; + struct page *page; + + if (inode->i_mapping->nrpages == 0) + return 0; + + index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + +repeat: + pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, + pvec.pages); + if (pvec.nr == 0) + return length; + + if (length > 0 && pvec.pages[0]->index > index) + goto out; + + b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + i = 0; + do { + page = pvec.pages[i]; + + lock_page(page); + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (b < start_blk) + continue; + if (buffer_delay(bh)) { + if (length == 0) + *blkoff = b; + length++; + } else if (length > 0) { + goto out_locked; + } + } while (++b, bh = bh->b_this_page, bh != head); + } else { + if (length > 0) + goto out_locked; + + b += nblocks_in_page; + } + unlock_page(page); + + } while (++i < pagevec_count(&pvec)); + + index = page->index + 1; + pagevec_release(&pvec); + cond_resched(); + goto repeat; + +out_locked: + unlock_page(page); +out: + pagevec_release(&pvec); + return length; +} diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h new file mode 100644 index 00000000..fb7de716 --- /dev/null +++ b/fs/nilfs2/page.h @@ -0,0 +1,80 @@ +/* + * page.h - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi , + * Seiji Kihara . + */ + +#ifndef _NILFS_PAGE_H +#define _NILFS_PAGE_H + +#include +#include "nilfs.h" + +/* + * Extended buffer state bits + */ +enum { + BH_NILFS_Allocated = BH_PrivateStart, + BH_NILFS_Node, + BH_NILFS_Volatile, + BH_NILFS_Checked, + BH_NILFS_Redirected, +}; + +BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ +BUFFER_FNS(NILFS_Volatile, nilfs_volatile) +BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ +BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */ + + +int __nilfs_clear_page_dirty(struct page *); + +struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, + unsigned long, unsigned long); +void nilfs_forget_buffer(struct buffer_head *); +void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); +int nilfs_page_buffers_clean(struct page *); +void nilfs_page_bug(struct page *); + +int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); +void nilfs_copy_back_pages(struct address_space *, struct address_space *); +void nilfs_clear_dirty_pages(struct address_space *); +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, + struct backing_dev_info *bdi); +unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff); + +#define NILFS_PAGE_BUG(page, m, a...) \ + do { nilfs_page_bug(page); BUG(); } while (0) + +static inline struct buffer_head * +nilfs_page_get_nth_block(struct page *page, unsigned int count) +{ + struct buffer_head *bh = page_buffers(page); + + while (count-- > 0) + bh = bh->b_this_page; + get_bh(bh); + return bh; +} + +#endif /* _NILFS_PAGE_H */ diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c new file mode 100644 index 00000000..a604ac03 --- /dev/null +++ b/fs/nilfs2/recovery.c @@ -0,0 +1,963 @@ +/* + * recovery.c - NILFS recovery logic + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ + +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "segment.h" +#include "sufile.h" +#include "page.h" +#include "segbuf.h" + +/* + * Segment check result + */ +enum { + NILFS_SEG_VALID, + NILFS_SEG_NO_SUPER_ROOT, + NILFS_SEG_FAIL_IO, + NILFS_SEG_FAIL_MAGIC, + NILFS_SEG_FAIL_SEQ, + NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, + NILFS_SEG_FAIL_CHECKSUM_FULL, + NILFS_SEG_FAIL_CONSISTENCY, +}; + +/* work structure for recovery */ +struct nilfs_recovery_block { + ino_t ino; /* Inode number of the file that this block + belongs to */ + sector_t blocknr; /* block number */ + __u64 vblocknr; /* virtual block number */ + unsigned long blkoff; /* File offset of the data block (per block) */ + struct list_head list; +}; + + +static int nilfs_warn_segment_error(int err) +{ + switch (err) { + case NILFS_SEG_FAIL_IO: + printk(KERN_WARNING + "NILFS warning: I/O error on loading last segment\n"); + return -EIO; + case NILFS_SEG_FAIL_MAGIC: + printk(KERN_WARNING + "NILFS warning: Segment magic number invalid\n"); + break; + case NILFS_SEG_FAIL_SEQ: + printk(KERN_WARNING + "NILFS warning: Sequence number mismatch\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: Checksum error in super root\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_FULL: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment payload\n"); + break; + case NILFS_SEG_FAIL_CONSISTENCY: + printk(KERN_WARNING + "NILFS warning: Inconsistent segment\n"); + break; + case NILFS_SEG_NO_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: No super root in the last segment\n"); + break; + } + return -EINVAL; +} + +/** + * nilfs_compute_checksum - compute checksum of blocks continuously + * @nilfs: nilfs object + * @bhs: buffer head of start block + * @sum: place to store result + * @offset: offset bytes in the first block + * @check_bytes: number of bytes to be checked + * @start: DBN of start block + * @nblock: number of blocks to be checked + */ +static int nilfs_compute_checksum(struct the_nilfs *nilfs, + struct buffer_head *bhs, u32 *sum, + unsigned long offset, u64 check_bytes, + sector_t start, unsigned long nblock) +{ + unsigned int blocksize = nilfs->ns_blocksize; + unsigned long size; + u32 crc; + + BUG_ON(offset >= blocksize); + check_bytes -= offset; + size = min_t(u64, check_bytes, blocksize - offset); + crc = crc32_le(nilfs->ns_crc_seed, + (unsigned char *)bhs->b_data + offset, size); + if (--nblock > 0) { + do { + struct buffer_head *bh; + + bh = __bread(nilfs->ns_bdev, ++start, blocksize); + if (!bh) + return -EIO; + check_bytes -= size; + size = min_t(u64, check_bytes, blocksize); + crc = crc32_le(crc, bh->b_data, size); + brelse(bh); + } while (--nblock > 0); + } + *sum = crc; + return 0; +} + +/** + * nilfs_read_super_root_block - read super root block + * @nilfs: nilfs object + * @sr_block: disk block number of the super root block + * @pbh: address of a buffer_head pointer to return super root buffer + * @check: CRC check flag + */ +int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, + struct buffer_head **pbh, int check) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *sr; + u32 crc; + int ret; + + *pbh = NULL; + bh_sr = __bread(nilfs->ns_bdev, sr_block, 
nilfs->ns_blocksize); + if (unlikely(!bh_sr)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + + sr = (struct nilfs_super_root *)bh_sr->b_data; + if (check) { + unsigned bytes = le16_to_cpu(sr->sr_bytes); + + if (bytes == 0 || bytes > nilfs->ns_blocksize) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + if (nilfs_compute_checksum( + nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, + sr_block, 1)) { + ret = NILFS_SEG_FAIL_IO; + goto failed_bh; + } + if (crc != le32_to_cpu(sr->sr_sum)) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + } + *pbh = bh_sr; + return 0; + + failed_bh: + brelse(bh_sr); + + failed: + return nilfs_warn_segment_error(ret); +} + +/** + * nilfs_read_log_header - read summary header of the specified log + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: pointer to return segment summary structure + */ +static struct buffer_head * +nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary **sum) +{ + struct buffer_head *bh_sum; + + bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); + if (bh_sum) + *sum = (struct nilfs_segment_summary *)bh_sum->b_data; + return bh_sum; +} + +/** + * nilfs_validate_log - verify consistency of log + * @nilfs: nilfs object + * @seg_seq: sequence number of segment + * @bh_sum: buffer head of summary block + * @sum: segment summary struct + */ +static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, + struct buffer_head *bh_sum, + struct nilfs_segment_summary *sum) +{ + unsigned long nblock; + u32 crc; + int ret; + + ret = NILFS_SEG_FAIL_MAGIC; + if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) + goto out; + + ret = NILFS_SEG_FAIL_SEQ; + if (le64_to_cpu(sum->ss_seq) != seg_seq) + goto out; + + nblock = le32_to_cpu(sum->ss_nblocks); + ret = NILFS_SEG_FAIL_CONSISTENCY; + if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) + /* This limits the number of blocks read in the CRC check */ + goto out; + + ret = NILFS_SEG_FAIL_IO; + if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), + ((u64)nblock << nilfs->ns_blocksize_bits), + bh_sum->b_blocknr, nblock)) + goto out; + + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; + if (crc != le32_to_cpu(sum->ss_datasum)) + goto out; + ret = 0; +out: + return ret; +} + +/** + * nilfs_read_summary_info - read an item on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be read + */ +static void *nilfs_read_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes) +{ + void *ptr; + sector_t blocknr; + + BUG_ON((*pbh)->b_size < *offset); + if (bytes > (*pbh)->b_size - *offset) { + blocknr = (*pbh)->b_blocknr; + brelse(*pbh); + *pbh = __bread(nilfs->ns_bdev, blocknr + 1, + nilfs->ns_blocksize); + if (unlikely(!*pbh)) + return NULL; + *offset = 0; + } + ptr = (*pbh)->b_data + *offset; + *offset += bytes; + return ptr; +} + +/** + * nilfs_skip_summary_info - skip items on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be skipped + * @count: number of items to be skipped + */ +static void nilfs_skip_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + 
unsigned int *offset, unsigned int bytes, + unsigned long count) +{ + unsigned int rest_item_in_current_block + = ((*pbh)->b_size - *offset) / bytes; + + if (count <= rest_item_in_current_block) { + *offset += bytes * count; + } else { + sector_t blocknr = (*pbh)->b_blocknr; + unsigned int nitem_per_block = (*pbh)->b_size / bytes; + unsigned int bcnt; + + count -= rest_item_in_current_block; + bcnt = DIV_ROUND_UP(count, nitem_per_block); + *offset = bytes * (count - (bcnt - 1) * nitem_per_block); + + brelse(*pbh); + *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, + nilfs->ns_blocksize); + } +} + +/** + * nilfs_scan_dsync_log - get block information of a log written for data sync + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: log summary information + * @head: list head to add nilfs_recovery_block struct + */ +static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary *sum, + struct list_head *head) +{ + struct buffer_head *bh; + unsigned int offset; + u32 nfinfo, sumbytes; + sector_t blocknr; + ino_t ino; + int err = -EIO; + + nfinfo = le32_to_cpu(sum->ss_nfinfo); + if (!nfinfo) + return 0; + + sumbytes = le32_to_cpu(sum->ss_sumbytes); + blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); + bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); + if (unlikely(!bh)) + goto out; + + offset = le16_to_cpu(sum->ss_bytes); + for (;;) { + unsigned long nblocks, ndatablk, nnodeblk; + struct nilfs_finfo *finfo; + + finfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*finfo)); + if (unlikely(!finfo)) + goto out; + + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + nnodeblk = nblocks - ndatablk; + + while (ndatablk-- > 0) { + struct nilfs_recovery_block *rb; + struct nilfs_binfo_v *binfo; + + binfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*binfo)); + if (unlikely(!binfo)) + goto out; + + rb = kmalloc(sizeof(*rb), GFP_NOFS); + if (unlikely(!rb)) { + err = -ENOMEM; + goto out; + } + rb->ino = ino; + rb->blocknr = blocknr++; + rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); + rb->blkoff = le64_to_cpu(binfo->bi_blkoff); + /* INIT_LIST_HEAD(&rb->list); */ + list_add_tail(&rb->list, head); + } + if (--nfinfo == 0) + break; + blocknr += nnodeblk; /* always 0 for data sync logs */ + nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64), + nnodeblk); + if (unlikely(!bh)) + goto out; + } + err = 0; + out: + brelse(bh); /* brelse(NULL) is just ignored */ + return err; +} + +static void dispose_recovery_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_recovery_block *rb; + + rb = list_first_entry(head, struct nilfs_recovery_block, list); + list_del(&rb->list); + kfree(rb); + } +} + +struct nilfs_segment_entry { + struct list_head list; + __u64 segnum; +}; + +static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) +{ + struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); + + if (unlikely(!ent)) + return -ENOMEM; + + ent->segnum = segnum; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, head); + return 0; +} + +void nilfs_dispose_segment_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_segment_entry *ent; + + ent = list_first_entry(head, struct nilfs_segment_entry, list); + list_del(&ent->list); + kfree(ent); + } +} + +static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, + struct 
super_block *sb, + struct nilfs_recovery_info *ri) +{ + struct list_head *head = &ri->ri_used_segments; + struct nilfs_segment_entry *ent, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 segnum[4]; + int err; + int i; + + segnum[0] = nilfs->ns_segnum; + segnum[1] = nilfs->ns_nextnum; + segnum[2] = ri->ri_segnum; + segnum[3] = ri->ri_nextnum; + + /* + * Releasing the next segment of the latest super root. + * The next segment is invalidated by this recovery. + */ + err = nilfs_sufile_free(sufile, segnum[1]); + if (unlikely(err)) + goto failed; + + for (i = 1; i < 4; i++) { + err = nilfs_segment_list_add(head, segnum[i]); + if (unlikely(err)) + goto failed; + } + + /* + * Collecting segments written after the latest super root. + * These are marked dirty to avoid being reallocated in the next write. + */ + list_for_each_entry_safe(ent, n, head, list) { + if (ent->segnum != segnum[0]) { + err = nilfs_sufile_scrap(sufile, ent->segnum); + if (unlikely(err)) + goto failed; + } + list_del(&ent->list); + kfree(ent); + } + + /* Allocate new segments for recovery */ + err = nilfs_sufile_alloc(sufile, &segnum[0]); + if (unlikely(err)) + goto failed; + + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq = ri->ri_seq + 2; + nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; + + failed: + /* No need to recover sufile because it will be destroyed on error */ + return err; +} + +static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, + struct nilfs_recovery_block *rb, + struct page *page) +{ + struct buffer_head *bh_org; + void *kaddr; + + bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); + if (unlikely(!bh_org)) + return -EIO; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh_org); + return 0; +} + +static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_root *root, + struct list_head *head, + unsigned long *nr_salvaged_blocks) +{ + struct inode *inode; + struct nilfs_recovery_block *rb, *n; + unsigned blocksize = nilfs->ns_blocksize; + struct page *page; + loff_t pos; + int err = 0, err2 = 0; + + list_for_each_entry_safe(rb, n, head, list) { + inode = nilfs_iget(sb, root, rb->ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto failed_inode; + } + + pos = rb->blkoff << inode->i_blkbits; + err = block_write_begin(inode->i_mapping, pos, blocksize, + 0, &page, nilfs_get_block); + if (unlikely(err)) { + loff_t isize = inode->i_size; + if (pos + blocksize > isize) + vmtruncate(inode, isize); + goto failed_inode; + } + + err = nilfs_recovery_copy_block(nilfs, rb, page); + if (unlikely(err)) + goto failed_page; + + err = nilfs_set_file_dirty(inode, 1); + if (unlikely(err)) + goto failed_page; + + block_write_end(NULL, inode->i_mapping, pos, blocksize, + blocksize, page, NULL); + + unlock_page(page); + page_cache_release(page); + + (*nr_salvaged_blocks)++; + goto next; + + failed_page: + unlock_page(page); + page_cache_release(page); + + failed_inode: + printk(KERN_WARNING + "NILFS warning: error recovering data block " + "(err=%d, ino=%lu, block-offset=%llu)\n", + err, (unsigned long)rb->ino, + (unsigned long long)rb->blkoff); + if (!err2) + err2 = err; + next: + iput(inode); /* iput(NULL) is just ignored */ + list_del_init(&rb->list); + kfree(rb); + } + return err2; +} + +/** + * nilfs_do_roll_forward - salvage logical segments newer than the latest + * checkpoint + * @nilfs: nilfs object + * @sb: super block 
instance + * @ri: pointer to a nilfs_recovery_info + */ +static int nilfs_do_roll_forward(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_root *root, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; + sector_t pseg_start; + sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ + unsigned long nsalvaged_blocks = 0; + unsigned int flags; + u64 seg_seq; + __u64 segnum, nextnum = 0; + int empty_seg = 0; + int err = 0, ret; + LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ + enum { + RF_INIT_ST, + RF_DSYNC_ST, /* scanning data-sync segments */ + }; + int state = RF_INIT_ST; + + pseg_start = ri->ri_lsegs_start; + seg_seq = ri->ri_lsegs_start_seq; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { + brelse(bh_sum); + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) { + err = -EIO; + goto failed; + } + + ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) { + err = -EIO; + goto failed; + } + goto strayed; + } + + flags = le16_to_cpu(sum->ss_flags); + if (flags & NILFS_SS_SR) + goto confused; + + /* Found a valid partial segment; do recovery actions */ + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); + empty_seg = 0; + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); + if (!(flags & NILFS_SS_GC)) + nilfs->ns_nongc_ctime = nilfs->ns_ctime; + + switch (state) { + case RF_INIT_ST: + if (!(flags & NILFS_SS_LOGBGN) || + !(flags & NILFS_SS_SYNDT)) + goto try_next_pseg; + state = RF_DSYNC_ST; + /* Fall through */ + case RF_DSYNC_ST: + if (!(flags & NILFS_SS_SYNDT)) + goto confused; + + err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, + &dsync_blocks); + if (unlikely(err)) + goto failed; + if (flags & NILFS_SS_LOGEND) { + err = nilfs_recover_dsync_blocks( + nilfs, sb, root, &dsync_blocks, + &nsalvaged_blocks); + if (unlikely(err)) + goto failed; + state = RF_INIT_ST; + } + break; /* Fall through to try_next_pseg */ + } + + try_next_pseg: + if (pseg_start == ri->ri_lsegs_end) + break; + pseg_start += le32_to_cpu(sum->ss_nblocks); + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + if (pseg_start == ri->ri_lsegs_end) + break; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + break; + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + if (nsalvaged_blocks) { + printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", + sb->s_id, nsalvaged_blocks); + ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; + } + out: + brelse(bh_sum); + dispose_recovery_list(&dsync_blocks); + return err; + + confused: + err = -EINVAL; + failed: + printk(KERN_ERR + "NILFS (device %s): Error roll-forwarding " + "(err=%d, pseg block=%llu). 
", + sb->s_id, err, (unsigned long long)pseg_start); + goto out; +} + +static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh; + int err; + + if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != + nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) + return; + + bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); + BUG_ON(!bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_dirty(bh); + err = sync_dirty_buffer(bh); + if (unlikely(err)) + printk(KERN_WARNING + "NILFS warning: buffer sync write failed during " + "post-cleaning of recovery.\n"); + brelse(bh); +} + +/** + * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint + * @nilfs: nilfs object + * @sb: super block instance + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - Inconsistent filesystem state. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, + struct super_block *sb, + struct nilfs_recovery_info *ri) +{ + struct nilfs_root *root; + int err; + + if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) + return 0; + + err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: error loading the latest checkpoint.\n"); + return err; + } + + err = nilfs_do_roll_forward(nilfs, sb, root, ri); + if (unlikely(err)) + goto failed; + + if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { + err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Error preparing segments for " + "recovery.\n"); + goto failed; + } + + err = nilfs_attach_log_writer(sb, root); + if (unlikely(err)) + goto failed; + + set_nilfs_discontinued(nilfs); + err = nilfs_construct_segment(sb); + nilfs_detach_log_writer(sb); + + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Oops! recovery failed. " + "(err=%d)\n", err); + goto failed; + } + + nilfs_finish_roll_forward(nilfs, ri); + } + + failed: + nilfs_put_root(root); + return err; +} + +/** + * nilfs_search_super_root - search the latest valid super root + * @nilfs: the_nilfs + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * nilfs_search_super_root() looks for the latest super-root from a partial + * segment pointed by the superblock. It sets up struct the_nilfs through + * this search. It fills nilfs_recovery_info (ri) required for recovery. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - No valid segment found + * + * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. 
+ */ +int nilfs_search_super_root(struct the_nilfs *nilfs, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; + sector_t pseg_start, pseg_end, sr_pseg_start = 0; + sector_t seg_start, seg_end; /* range of full segment (block number) */ + sector_t b, end; + unsigned long nblocks; + unsigned int flags; + u64 seg_seq; + __u64 segnum, nextnum = 0; + __u64 cno; + LIST_HEAD(segments); + int empty_seg = 0, scan_newer = 0; + int ret; + + pseg_start = nilfs->ns_last_pseg; + seg_seq = nilfs->ns_last_seq; + cno = nilfs->ns_last_cno; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + + /* Calculate range of segment */ + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + /* Read ahead segment */ + b = seg_start; + while (b <= seg_end) + __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); + + for (;;) { + brelse(bh_sum); + ret = NILFS_SEG_FAIL_IO; + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) + goto failed; + + ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) + goto failed; + goto strayed; + } + + nblocks = le32_to_cpu(sum->ss_nblocks); + pseg_end = pseg_start + nblocks - 1; + if (unlikely(pseg_end > seg_end)) { + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto strayed; + } + + /* A valid partial segment */ + ri->ri_pseg_start = pseg_start; + ri->ri_seq = seg_seq; + ri->ri_segnum = segnum; + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); + ri->ri_nextnum = nextnum; + empty_seg = 0; + + flags = le16_to_cpu(sum->ss_flags); + if (!(flags & NILFS_SS_SR) && !scan_newer) { + /* This will never happen because a superblock + (last_segment) always points to a pseg + having a super root. */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + + if (pseg_start == seg_start) { + nilfs_get_segment_range(nilfs, nextnum, &b, &end); + while (b <= end) + __breadahead(nilfs->ns_bdev, b++, + nilfs->ns_blocksize); + } + if (!(flags & NILFS_SS_SR)) { + if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { + ri->ri_lsegs_start = pseg_start; + ri->ri_lsegs_start_seq = seg_seq; + } + if (flags & NILFS_SS_LOGEND) + ri->ri_lsegs_end = pseg_start; + goto try_next_pseg; + } + + /* A valid super root was found. */ + ri->ri_cno = cno++; + ri->ri_super_root = pseg_end; + ri->ri_lsegs_start = ri->ri_lsegs_end = 0; + + nilfs_dispose_segment_list(&segments); + sr_pseg_start = pseg_start; + nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; + nilfs->ns_seg_seq = seg_seq; + nilfs->ns_segnum = segnum; + nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); + nilfs->ns_nextnum = nextnum; + + if (scan_newer) + ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; + else { + if (nilfs->ns_mount_state & NILFS_VALID_FS) + goto super_root_found; + scan_newer = 1; + } + + try_next_pseg: + /* Standing on a course, or met an inconsistent state */ + pseg_start += nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + /* Off the trail */ + if (!scan_newer) + /* + * This can happen if a checkpoint was written without + * barriers, or as a result of an I/O failure. 
+ */ + goto failed; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + goto super_root_found; /* found a valid super root */ + + ret = nilfs_segment_list_add(&segments, segnum); + if (unlikely(ret)) + goto failed; + + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + super_root_found: + /* Updating pointers relating to the latest checkpoint */ + brelse(bh_sum); + list_splice_tail(&segments, &ri->ri_used_segments); + nilfs->ns_last_pseg = sr_pseg_start; + nilfs->ns_last_seq = nilfs->ns_seg_seq; + nilfs->ns_last_cno = ri->ri_cno; + return 0; + + failed: + brelse(bh_sum); + nilfs_dispose_segment_list(&segments); + return (ret < 0) ? ret : nilfs_warn_segment_error(ret); +} diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c new file mode 100644 index 00000000..850a7c02 --- /dev/null +++ b/fs/nilfs2/segbuf.c @@ -0,0 +1,536 @@ +/* + * segbuf.c - NILFS segment buffer + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include "page.h" +#include "segbuf.h" + + +struct nilfs_write_info { + struct the_nilfs *nilfs; + struct bio *bio; + int start, end; /* The region to be submitted */ + int rest_blocks; + int max_pages; + int nr_vecs; + sector_t blocknr; +}; + +static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs); +static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) +{ + struct nilfs_segment_buffer *segbuf; + + segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS); + if (unlikely(!segbuf)) + return NULL; + + segbuf->sb_super = sb; + INIT_LIST_HEAD(&segbuf->sb_list); + INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); + INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; + + init_completion(&segbuf->sb_bio_event); + atomic_set(&segbuf->sb_err, 0); + segbuf->sb_nbio = 0; + + return segbuf; +} + +void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf) +{ + kmem_cache_free(nilfs_segbuf_cachep, segbuf); +} + +void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, + unsigned long offset, struct the_nilfs *nilfs) +{ + segbuf->sb_segnum = segnum; + nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start, + &segbuf->sb_fseg_end); + + segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +/** + * nilfs_segbuf_map_cont - map a new log behind a given log + * @segbuf: new segment buffer + * @prev: segment buffer containing a log to be continued + */ +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, 
+ struct nilfs_segment_buffer *prev) +{ + segbuf->sb_segnum = prev->sb_segnum; + segbuf->sb_fseg_start = prev->sb_fseg_start; + segbuf->sb_fseg_end = prev->sb_fseg_end; + segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, + __u64 nextnum, struct the_nilfs *nilfs) +{ + segbuf->sb_nextnum = nextnum; + segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum); +} + +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_segsum_buffer(segbuf, bh); + return 0; +} + +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nblocks); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_payload_buffer(segbuf, bh); + *bhp = bh; + return 0; +} + +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, + time_t ctime, __u64 cno) +{ + int err; + + segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0; + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + return err; + + segbuf->sb_sum.flags = flags; + segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); + segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; + segbuf->sb_sum.ctime = ctime; + segbuf->sb_sum.cno = cno; + return 0; +} + +/* + * Setup segment summary + */ +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct nilfs_segment_summary *raw_sum; + struct buffer_head *bh_sum; + + bh_sum = list_entry(segbuf->sb_segsum_buffers.next, + struct buffer_head, b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC); + raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum)); + raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags); + raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq); + raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime); + raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next); + raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks); + raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); + raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); + raw_sum->ss_pad = 0; + raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno); +} + +/* + * CRC calculation routines + */ +static void +nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + unsigned long size, bytes = segbuf->sb_sum.sumbytes; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(seed, + (unsigned char *)raw_sum + + sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum), + size - (sizeof(raw_sum->ss_datasum) + + sizeof(raw_sum->ss_sumsum))); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + bytes -= size; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(crc, bh->b_data, size); + } + raw_sum->ss_sumsum = cpu_to_le32(crc); +} + +static void nilfs_segbuf_fill_in_data_crc(struct 
nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + void *kaddr; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + crc = crc32_le(seed, + (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum), + bh->b_size - sizeof(raw_sum->ss_datasum)); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + crc = crc32_le(crc, bh->b_data, bh->b_size); + } + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + kaddr = kmap_atomic(bh->b_page, KM_USER0); + crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); + kunmap_atomic(kaddr, KM_USER0); + } + raw_sum->ss_datasum = cpu_to_le32(crc); +} + +static void +nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct nilfs_super_root *raw_sr; + struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; + unsigned srsize; + u32 crc; + + raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; + srsize = NILFS_SR_BYTES(nilfs->ns_inode_size); + crc = crc32_le(seed, + (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), + srsize - sizeof(raw_sr->sr_sum)); + raw_sr->sr_sum = cpu_to_le32(crc); +} + +static void nilfs_release_buffers(struct list_head *list) +{ + struct buffer_head *bh, *n; + + list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } +} + +static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +{ + nilfs_release_buffers(&segbuf->sb_segsum_buffers); + nilfs_release_buffers(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; +} + +/* + * Iterators for segment buffers + */ +void nilfs_clear_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) + nilfs_segbuf_clear(segbuf); +} + +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last) +{ + struct nilfs_segment_buffer *n, *segbuf; + + segbuf = list_prepare_entry(last, logs, sb_list); + list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) { + list_del_init(&segbuf->sb_list); + nilfs_segbuf_clear(segbuf); + nilfs_segbuf_free(segbuf); + } +} + +int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf; + int ret = 0; + + list_for_each_entry(segbuf, logs, sb_list) { + ret = nilfs_segbuf_write(segbuf, nilfs); + if (ret) + break; + } + return ret; +} + +int nilfs_wait_on_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + int err, ret = 0; + + list_for_each_entry(segbuf, logs, sb_list) { + err = nilfs_segbuf_wait(segbuf); + if (err && !ret) + ret = err; + } + return ret; +} + +/** + * nilfs_add_checksums_on_logs - add checksums on the logs + * @logs: list of segment buffers storing target logs + * @seed: checksum seed value + */ +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) { + if (segbuf->sb_super_root) + nilfs_segbuf_fill_in_super_root_crc(segbuf, seed); + nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); + nilfs_segbuf_fill_in_data_crc(segbuf, seed); + } +} + +/* + * BIO operations + */ +static void nilfs_end_bio_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nilfs_segment_buffer *segbuf = bio->bi_private; + + if (err == 
-EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + bio_put(bio); + /* to be detected by submit_seg_bio() */ + } + + if (!uptodate) + atomic_inc(&segbuf->sb_err); + + bio_put(bio); + complete(&segbuf->sb_bio_event); +} + +static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, int mode) +{ + struct bio *bio = wi->bio; + int err; + + if (segbuf->sb_nbio > 0 && + bdi_write_congested(segbuf->sb_super->s_bdi)) { + wait_for_completion(&segbuf->sb_bio_event); + segbuf->sb_nbio--; + if (unlikely(atomic_read(&segbuf->sb_err))) { + bio_put(bio); + err = -EIO; + goto failed; + } + } + + bio->bi_end_io = nilfs_end_bio_write; + bio->bi_private = segbuf; + bio_get(bio); + submit_bio(mode, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + bio_put(bio); + err = -EOPNOTSUPP; + goto failed; + } + segbuf->sb_nbio++; + bio_put(bio); + + wi->bio = NULL; + wi->rest_blocks -= wi->end - wi->start; + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end; + return 0; + + failed: + wi->bio = NULL; + return err; +} + +/** + * nilfs_alloc_seg_bio - allocate a new bio for writing log + * @nilfs: nilfs object + * @start: start block number of the bio + * @nr_vecs: request size of page vector. + * + * Return Value: On success, pointer to the struct bio is returned. + * On error, NULL is returned. + */ +static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, + int nr_vecs) +{ + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, nr_vecs); + if (bio == NULL) { + while (!bio && (nr_vecs >>= 1)) + bio = bio_alloc(GFP_NOIO, nr_vecs); + } + if (likely(bio)) { + bio->bi_bdev = nilfs->ns_bdev; + bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); + } + return bio; +} + +static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + wi->bio = NULL; + wi->rest_blocks = segbuf->sb_sum.nblocks; + wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev); + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end = 0; + wi->blocknr = segbuf->sb_pseg_start; +} + +static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, + struct buffer_head *bh, int mode) +{ + int len, err; + + BUG_ON(wi->nr_vecs <= 0); + repeat: + if (!wi->bio) { + wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end, + wi->nr_vecs); + if (unlikely(!wi->bio)) + return -ENOMEM; + } + + len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (len == bh->b_size) { + wi->end++; + return 0; + } + /* bio is FULL */ + err = nilfs_segbuf_submit_bio(segbuf, wi, mode); + /* never submit current bh */ + if (likely(!err)) + goto repeat; + return err; +} + +/** + * nilfs_segbuf_write - submit write requests of a log + * @segbuf: buffer storing a log to be written + * @nilfs: nilfs object + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. 
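+ *
+ * Note: the segment summary blocks are submitted before the payload
+ * blocks, and the last bio of the log is submitted with REQ_SYNC
+ * (see below).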
+ */ +static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs) +{ + struct nilfs_write_info wi; + struct buffer_head *bh; + int res = 0, rw = WRITE; + + wi.nilfs = nilfs; + nilfs_segbuf_prepare_write(segbuf, &wi); + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + if (wi.bio) { + /* + * Last BIO is always sent through the following + * submission. + */ + rw |= REQ_SYNC; + res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); + } + + failed_bio: + return res; +} + +/** + * nilfs_segbuf_wait - wait for completion of requested BIOs + * @segbuf: segment buffer + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + */ +static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) +{ + int err = 0; + + if (!segbuf->sb_nbio) + return 0; + + do { + wait_for_completion(&segbuf->sb_bio_event); + } while (--segbuf->sb_nbio > 0); + + if (unlikely(atomic_read(&segbuf->sb_err) > 0)) { + printk(KERN_ERR "NILFS: IO error writing segment\n"); + err = -EIO; + } + return err; +} diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h new file mode 100644 index 00000000..b04f08cc --- /dev/null +++ b/fs/nilfs2/segbuf.h @@ -0,0 +1,184 @@ +/* + * segbuf.h - NILFS Segment buffer prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ +#ifndef _NILFS_SEGBUF_H +#define _NILFS_SEGBUF_H + +#include +#include +#include +#include + +/** + * struct nilfs_segsum_info - On-memory segment summary + * @flags: Flags + * @nfinfo: Number of file information structures + * @nblocks: Number of blocks included in the partial segment + * @nsumblk: Number of summary blocks + * @sumbytes: Byte count of segment summary + * @nfileblk: Total number of file blocks + * @seg_seq: Segment sequence number + * @cno: Checkpoint number + * @ctime: Creation time + * @next: Block number of the next full segment + */ +struct nilfs_segsum_info { + unsigned int flags; + unsigned long nfinfo; + unsigned long nblocks; + unsigned long nsumblk; + unsigned long sumbytes; + unsigned long nfileblk; + u64 seg_seq; + __u64 cno; + time_t ctime; + sector_t next; +}; + +/** + * struct nilfs_segment_buffer - Segment buffer + * @sb_super: back pointer to a superblock struct + * @sb_list: List head to chain this structure + * @sb_sum: On-memory segment summary + * @sb_segnum: Index number of the full segment + * @sb_nextnum: Index number of the next full segment + * @sb_fseg_start: Start block number of the full segment + * @sb_fseg_end: End block number of the full segment + * @sb_pseg_start: Disk block number of partial segment + * @sb_rest_blocks: Number of residual blocks in the current segment + * @sb_segsum_buffers: List of buffers for segment summaries + * @sb_payload_buffers: List of buffers for segment payload + * @sb_super_root: Pointer to buffer storing a super root block (if exists) + * @sb_nbio: Number of flying bio requests + * @sb_err: I/O error status + * @sb_bio_event: Completion event of log writing + */ +struct nilfs_segment_buffer { + struct super_block *sb_super; + struct list_head sb_list; + + /* Segment information */ + struct nilfs_segsum_info sb_sum; + __u64 sb_segnum; + __u64 sb_nextnum; + sector_t sb_fseg_start, sb_fseg_end; + sector_t sb_pseg_start; + unsigned sb_rest_blocks; + + /* Buffers */ + struct list_head sb_segsum_buffers; + struct list_head sb_payload_buffers; /* including super root */ + struct buffer_head *sb_super_root; + + /* io status */ + int sb_nbio; + atomic_t sb_err; + struct completion sb_bio_event; +}; + +#define NILFS_LIST_SEGBUF(head) \ + list_entry((head), struct nilfs_segment_buffer, sb_list) +#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next) +#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev) +#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev) +#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next) +#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head)) + +#define nilfs_for_each_segbuf_before(s, t, h) \ + for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \ + (s) = NILFS_NEXT_SEGBUF(s)) + +#define NILFS_SEGBUF_FIRST_BH(head) \ + (list_entry((head)->next, struct buffer_head, b_assoc_buffers)) +#define NILFS_SEGBUF_NEXT_BH(bh) \ + (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \ + b_assoc_buffers)) +#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) + +extern struct kmem_cache *nilfs_segbuf_cachep; + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); +void nilfs_segbuf_free(struct nilfs_segment_buffer *); +void 
nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, + struct the_nilfs *); +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, + struct nilfs_segment_buffer *prev); +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, + struct the_nilfs *); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64); +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, + struct buffer_head **); +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); + +static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf) +{ + unsigned int flags = segbuf->sb_sum.flags; + + return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == + (NILFS_SS_LOGBGN | NILFS_SS_LOGEND); +} + +static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf) +{ + return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk; +} + +static inline void +nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers); + segbuf->sb_sum.nblocks++; + segbuf->sb_sum.nsumblk++; +} + +static inline void +nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers); + segbuf->sb_sum.nblocks++; +} + +static inline void +nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + get_bh(bh); + nilfs_segbuf_add_payload_buffer(segbuf, bh); + segbuf->sb_sum.nfileblk++; +} + +void nilfs_clear_logs(struct list_head *logs); +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last); +int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); +int nilfs_wait_on_logs(struct list_head *logs); +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed); + +static inline void nilfs_destroy_logs(struct list_head *logs) +{ + nilfs_truncate_logs(logs, NULL); +} + +#endif /* _NILFS_SEGBUF_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c new file mode 100644 index 00000000..bb24ab6c --- /dev/null +++ b/fs/nilfs2/segment.c @@ -0,0 +1,2707 @@ +/* + * segment.c - NILFS segment constructor. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "btnode.h" +#include "page.h" +#include "segment.h" +#include "sufile.h" +#include "cpfile.h" +#include "ifile.h" +#include "segbuf.h" + + +/* + * Segment constructor + */ +#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ + +#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments + appended in collection retry loop */ + +/* Construction mode */ +enum { + SC_LSEG_SR = 1, /* Make a logical segment having a super root */ + SC_LSEG_DSYNC, /* Flush data blocks of a given file and make + a logical segment without a super root */ + SC_FLUSH_FILE, /* Flush data files, leads to segment writes without + creating a checkpoint */ + SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without + a checkpoint */ +}; + +/* Stage numbers of dirty block collection */ +enum { + NILFS_ST_INIT = 0, + NILFS_ST_GC, /* Collecting dirty blocks for GC */ + NILFS_ST_FILE, + NILFS_ST_IFILE, + NILFS_ST_CPFILE, + NILFS_ST_SUFILE, + NILFS_ST_DAT, + NILFS_ST_SR, /* Super root */ + NILFS_ST_DSYNC, /* Data sync blocks */ + NILFS_ST_DONE, +}; + +/* State flags of collection */ +#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ +#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ +#define NILFS_CF_SUFREED 0x0004 /* segment usages has been freed */ +#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED) + +/* Operations depending on the construction mode and file type */ +struct nilfs_sc_operations { + int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + void (*write_data_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); + void (*write_node_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); +}; + +/* + * Other definitions + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *); +static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); +static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int); + +#define nilfs_cnt32_gt(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(b) - (__s32)(a) < 0)) +#define nilfs_cnt32_ge(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(a) - (__s32)(b) >= 0)) +#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) +#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) + +static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + void *save = NULL; + + if (cur_ti) { + if (cur_ti->ti_magic == NILFS_TI_MAGIC) + return ++cur_ti->ti_count; + else { + /* + * If journal_info field is occupied by other FS, + * it is saved and will be restored on + * nilfs_transaction_commit(). 
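+ * The saved pointer is kept in ti->ti_save and written back to
+ * current->journal_info when the outermost transaction ends.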
+ */ + printk(KERN_WARNING + "NILFS warning: journal info from a different " + "FS\n"); + save = current->journal_info; + } + } + if (!ti) { + ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); + if (!ti) + return -ENOMEM; + ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC; + } else { + ti->ti_flags = 0; + } + ti->ti_count = 0; + ti->ti_save = save; + ti->ti_magic = NILFS_TI_MAGIC; + current->journal_info = ti; + return 0; +} + +/** + * nilfs_transaction_begin - start indivisible file operations. + * @sb: super block + * @ti: nilfs_transaction_info + * @vacancy_check: flags for vacancy rate checks + * + * nilfs_transaction_begin() acquires a reader/writer semaphore, called + * the segment semaphore, to make a segment construction and write tasks + * exclusive. The function is used with nilfs_transaction_commit() in pairs. + * The region enclosed by these two functions can be nested. To avoid a + * deadlock, the semaphore is only acquired or released in the outermost call. + * + * This function allocates a nilfs_transaction_info struct to keep context + * information on it. It is initialized and hooked onto the current task in + * the outermost call. If a pre-allocated struct is given to @ti, it is used + * instead; otherwise a new struct is assigned from a slab. + * + * When @vacancy_check flag is set, this function will check the amount of + * free space, and will wait for the GC to reclaim disk space if low capacity. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-ENOSPC - No space left on device + */ +int nilfs_transaction_begin(struct super_block *sb, + struct nilfs_transaction_info *ti, + int vacancy_check) +{ + struct the_nilfs *nilfs; + int ret = nilfs_prepare_segment_lock(ti); + + if (unlikely(ret < 0)) + return ret; + if (ret > 0) + return 0; + + vfs_check_frozen(sb, SB_FREEZE_WRITE); + + nilfs = sb->s_fs_info; + down_read(&nilfs->ns_segctor_sem); + if (vacancy_check && nilfs_near_disk_full(nilfs)) { + up_read(&nilfs->ns_segctor_sem); + ret = -ENOSPC; + goto failed; + } + return 0; + + failed: + ti = current->journal_info; + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return ret; +} + +/** + * nilfs_transaction_commit - commit indivisible file operations. + * @sb: super block + * + * nilfs_transaction_commit() releases the read semaphore which is + * acquired by nilfs_transaction_begin(). This is only performed + * in outermost call of this function. If a commit flag is set, + * nilfs_transaction_commit() sets a timer to start the segment + * constructor. If a sync flag is set, it starts construction + * directly. 
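+ *
+ * A minimal sketch of the begin/commit pairing described above, for
+ * illustration only (error handling abbreviated; not copied from an
+ * actual caller):
+ *
+ *	struct nilfs_transaction_info ti;
+ *	int err;
+ *
+ *	err = nilfs_transaction_begin(sb, &ti, 1);
+ *	if (unlikely(err))
+ *		return err;
+ *	... update inodes, directory entries, etc. ...
+ *	err = nilfs_transaction_commit(sb);
+ *	(use nilfs_transaction_abort(sb) instead if the update failed)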
+ */ +int nilfs_transaction_commit(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + int err = 0; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + ti->ti_flags |= NILFS_TI_COMMIT; + if (ti->ti_count > 0) { + ti->ti_count--; + return 0; + } + if (nilfs->ns_writer) { + struct nilfs_sc_info *sci = nilfs->ns_writer; + + if (ti->ti_flags & NILFS_TI_COMMIT) + nilfs_segctor_start_timer(sci); + if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark) + nilfs_segctor_do_flush(sci, 0); + } + up_read(&nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + + if (ti->ti_flags & NILFS_TI_SYNC) + err = nilfs_construct_segment(sb); + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return err; +} + +void nilfs_transaction_abort(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + if (ti->ti_count > 0) { + ti->ti_count--; + return; + } + up_read(&nilfs->ns_segctor_sem); + + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); +} + +void nilfs_relax_pressure_in_lock(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + + if (!sci || !sci->sc_flush_request) + return; + + set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); + up_read(&nilfs->ns_segctor_sem); + + down_write(&nilfs->ns_segctor_sem); + if (sci->sc_flush_request && + test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) { + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= NILFS_TI_WRITER; + nilfs_segctor_do_immediate_flush(sci); + ti->ti_flags &= ~NILFS_TI_WRITER; + } + downgrade_write(&nilfs->ns_segctor_sem); +} + +static void nilfs_transaction_lock(struct super_block *sb, + struct nilfs_transaction_info *ti, + int gcflag) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + + WARN_ON(cur_ti); + ti->ti_flags = NILFS_TI_WRITER; + ti->ti_count = 0; + ti->ti_save = cur_ti; + ti->ti_magic = NILFS_TI_MAGIC; + INIT_LIST_HEAD(&ti->ti_garbage); + current->journal_info = ti; + + for (;;) { + down_write(&nilfs->ns_segctor_sem); + if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) + break; + + nilfs_segctor_do_immediate_flush(sci); + + up_write(&nilfs->ns_segctor_sem); + yield(); + } + if (gcflag) + ti->ti_flags |= NILFS_TI_GC; +} + +static void nilfs_transaction_unlock(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct the_nilfs *nilfs = sb->s_fs_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + BUG_ON(ti->ti_count > 0); + + up_write(&nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + if (!list_empty(&ti->ti_garbage)) + nilfs_dispose_list(nilfs, &ti->ti_garbage, 0); +} + +static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + unsigned bytes) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + unsigned blocksize = sci->sc_super->s_blocksize; + void *p; + + if (unlikely(ssp->offset + bytes > blocksize)) { + ssp->offset = 0; + BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh, + &segbuf->sb_segsum_buffers)); + ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh); + } + p = ssp->bh->b_data + 
ssp->offset; + ssp->offset += bytes; + return p; +} + +/** + * nilfs_segctor_reset_segment_buffer - reset the current segment buffer + * @sci: nilfs_sc_info + */ +static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + struct buffer_head *sumbh; + unsigned sumbytes; + unsigned flags = 0; + int err; + + if (nilfs_doing_gc()) + flags = NILFS_SS_GC; + err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno); + if (unlikely(err)) + return err; + + sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + sumbytes = segbuf->sb_sum.sumbytes; + sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes; + sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; + return 0; +} + +static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) +{ + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) + return -E2BIG; /* The current segment is filled up + (internal code) */ + sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); + return nilfs_segctor_reset_segment_buffer(sci); +} + +static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + int err; + + if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) { + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + segbuf = sci->sc_curseg; + } + err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root); + if (likely(!err)) + segbuf->sb_sum.flags |= NILFS_SS_SR; + return err; +} + +/* + * Functions for making segment summary and payloads + */ +static int nilfs_segctor_segsum_block_required( + struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, + unsigned binfo_size) +{ + unsigned blocksize = sci->sc_super->s_blocksize; + /* Size of finfo and binfo is enough small against blocksize */ + + return ssp->offset + binfo_size + + (!sci->sc_blk_cnt ? 
sizeof(struct nilfs_finfo) : 0) > + blocksize; +} + +static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + sci->sc_curseg->sb_sum.nfinfo++; + sci->sc_binfo_ptr = sci->sc_finfo_ptr; + nilfs_segctor_map_segsum_entry( + sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); + + if (NILFS_I(inode)->i_root && + !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + /* skip finfo */ +} + +static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + struct nilfs_finfo *finfo; + struct nilfs_inode_info *ii; + struct nilfs_segment_buffer *segbuf; + __u64 cno; + + if (sci->sc_blk_cnt == 0) + return; + + ii = NILFS_I(inode); + + if (test_bit(NILFS_I_GCINODE, &ii->i_state)) + cno = ii->i_cno; + else if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) + cno = 0; + else + cno = sci->sc_cno; + + finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, + sizeof(*finfo)); + finfo->fi_ino = cpu_to_le64(inode->i_ino); + finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); + finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); + finfo->fi_cno = cpu_to_le64(cno); + + segbuf = sci->sc_curseg; + segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + + sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1); + sci->sc_finfo_ptr = sci->sc_binfo_ptr; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; +} + +static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode, + unsigned binfo_size) +{ + struct nilfs_segment_buffer *segbuf; + int required, err = 0; + + retry: + segbuf = sci->sc_curseg; + required = nilfs_segctor_segsum_block_required( + sci, &sci->sc_binfo_ptr, binfo_size); + if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) { + nilfs_segctor_end_finfo(sci, inode); + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + goto retry; + } + if (unlikely(required)) { + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + goto failed; + } + if (sci->sc_blk_cnt == 0) + nilfs_segctor_begin_finfo(sci, inode); + + nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size); + /* Substitution to vblocknr is delayed until update_blocknr() */ + nilfs_segbuf_add_file_buffer(segbuf, bh); + sci->sc_blk_cnt++; + failed: + return err; +} + +/* + * Callback functions that enumerate, mark, and collect dirty blocks + */ +static int nilfs_collect_file_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (err < 0) + return err; + + err = nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_v)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_file_node(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); +} + +static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); +} + +static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*binfo_v)); + *binfo_v = binfo->bi_v; +} + +static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + 
union nilfs_binfo *binfo) +{ + __le64 *vblocknr = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*vblocknr)); + *vblocknr = binfo->bi_v.bi_vblocknr; +} + +static struct nilfs_sc_operations nilfs_sc_file_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_file_bmap, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = nilfs_write_file_node_binfo, +}; + +static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (err < 0) + return err; + + err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_dat)); +} + +static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp, + sizeof(*blkoff)); + *blkoff = binfo->bi_dat.bi_blkoff; +} + +static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_dat *binfo_dat = + nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat)); + *binfo_dat = binfo->bi_dat; +} + +static struct nilfs_sc_operations nilfs_sc_dat_ops = { + .collect_data = nilfs_collect_dat_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_dat_bmap, + .write_data_binfo = nilfs_write_dat_data_binfo, + .write_node_binfo = nilfs_write_dat_node_binfo, +}; + +static struct nilfs_sc_operations nilfs_sc_dsync_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = NULL, + .collect_bmap = NULL, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = NULL, +}; + +static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, + struct list_head *listp, + size_t nlimit, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0, last = ULONG_MAX; + size_t ndirties = 0; + int i; + + if (unlikely(start != 0 || end != LLONG_MAX)) { + /* + * A valid range is given for sync-ing data pages. The + * range is rounded to per-page; extra dirty buffers + * may be included if blocksize < pagesize. 
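+ * Collection also stops early once 'nlimit' dirty buffers have been gathered.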
+ */ + index = start >> PAGE_SHIFT; + last = end >> PAGE_SHIFT; + } + pagevec_init(&pvec, 0); + repeat: + if (unlikely(index > last) || + !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + min_t(pgoff_t, last - index, + PAGEVEC_SIZE - 1) + 1)) + return ndirties; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct buffer_head *bh, *head; + struct page *page = pvec.pages[i]; + + if (unlikely(page->index > last)) + break; + + lock_page(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + unlock_page(page); + + bh = head = page_buffers(page); + do { + if (!buffer_dirty(bh)) + continue; + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, listp); + ndirties++; + if (unlikely(ndirties >= nlimit)) { + pagevec_release(&pvec); + cond_resched(); + return ndirties; + } + } while (bh = bh->b_this_page, bh != head); + } + pagevec_release(&pvec); + cond_resched(); + goto repeat; +} + +static void nilfs_lookup_dirty_node_buffers(struct inode *inode, + struct list_head *listp) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct address_space *mapping = &ii->i_btnode_cache; + struct pagevec pvec; + struct buffer_head *bh, *head; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) { + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, + listp); + } + bh = bh->b_this_page; + } while (bh != head); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +static void nilfs_dispose_list(struct the_nilfs *nilfs, + struct list_head *head, int force) +{ + struct nilfs_inode_info *ii, *n; + struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; + unsigned nv = 0; + + while (!list_empty(head)) { + spin_lock(&nilfs->ns_inode_lock); + list_for_each_entry_safe(ii, n, head, i_dirty) { + list_del_init(&ii->i_dirty); + if (force) { + if (unlikely(ii->i_bh)) { + brelse(ii->i_bh); + ii->i_bh = NULL; + } + } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { + set_bit(NILFS_I_QUEUED, &ii->i_state); + list_add_tail(&ii->i_dirty, + &nilfs->ns_dirty_files); + continue; + } + ivec[nv++] = ii; + if (nv == SC_N_INODEVEC) + break; + } + spin_unlock(&nilfs->ns_inode_lock); + + for (pii = ivec; nv > 0; pii++, nv--) + iput(&(*pii)->vfs_inode); + } +} + +static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, + struct nilfs_root *root) +{ + int ret = 0; + + if (nilfs_mdt_fetch_dirty(root->ifile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) + ret++; + if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat)) + ret++; + return ret; +} + +static int nilfs_segctor_clean(struct nilfs_sc_info *sci) +{ + return list_empty(&sci->sc_dirty_files) && + !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && + sci->sc_nfreesegs == 0 && + (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); +} + +static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int ret = 0; + + if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + spin_lock(&nilfs->ns_inode_lock); + if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci)) + ret++; + + spin_unlock(&nilfs->ns_inode_lock); + return ret; +} + +static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) +{ + 
struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + + nilfs_mdt_clear_dirty(sci->sc_root->ifile); + nilfs_mdt_clear_dirty(nilfs->ns_cpfile); + nilfs_mdt_clear_dirty(nilfs->ns_sufile); + nilfs_mdt_clear_dirty(nilfs->ns_dat); +} + +static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + /* XXX: this interface will be changed */ + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, + &raw_cp, &bh_cp); + if (likely(!err)) { + /* The following code is duplicated with cpfile. But, it is + needed to collect the checkpoint even if it was not newly + created */ + mark_buffer_dirty(bh_cp); + nilfs_mdt_mark_dirty(nilfs->ns_cpfile); + nilfs_cpfile_put_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + } else + WARN_ON(err == -EINVAL || err == -ENOENT); + + return err; +} + +static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, + &raw_cp, &bh_cp); + if (unlikely(err)) { + WARN_ON(err == -EINVAL || err == -ENOENT); + goto failed_ibh; + } + raw_cp->cp_snapshot_list.ssl_next = 0; + raw_cp->cp_snapshot_list.ssl_prev = 0; + raw_cp->cp_inodes_count = + cpu_to_le64(atomic_read(&sci->sc_root->inodes_count)); + raw_cp->cp_blocks_count = + cpu_to_le64(atomic_read(&sci->sc_root->blocks_count)); + raw_cp->cp_nblk_inc = + cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); + raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); + raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); + + if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + nilfs_checkpoint_clear_minor(raw_cp); + else + nilfs_checkpoint_set_minor(raw_cp); + + nilfs_write_inode_common(sci->sc_root->ifile, + &raw_cp->cp_ifile_inode, 1); + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + return 0; + + failed_ibh: + return err; +} + +static void nilfs_fill_in_file_bmap(struct inode *ifile, + struct nilfs_inode_info *ii) + +{ + struct buffer_head *ibh; + struct nilfs_inode *raw_inode; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) { + ibh = ii->i_bh; + BUG_ON(!ibh); + raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, + ibh); + nilfs_bmap_write(ii->i_bmap, raw_inode); + nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + } +} + +static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { + nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii); + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + unsigned isz, srsz; + + bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + isz = nilfs->ns_inode_size; + srsz = NILFS_SR_BYTES(isz); + + raw_sr->sr_bytes = cpu_to_le16(srsz); + raw_sr->sr_nongc_ctime + = cpu_to_le64(nilfs_doing_gc() ? 
+ nilfs->ns_nongc_ctime : sci->sc_seg_ctime); + raw_sr->sr_flags = 0; + + nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr + + NILFS_SR_DAT_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + + NILFS_SR_CPFILE_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + + NILFS_SR_SUFILE_OFFSET(isz), 1); + memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); +} + +static void nilfs_redirty_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (test_bit(NILFS_I_COLLECTED, &ii->i_state)) + clear_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +static void nilfs_drop_collected_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) + continue; + + clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + set_bit(NILFS_I_UPDATED, &ii->i_state); + } +} + +static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, + struct inode *inode, + struct list_head *listp, + int (*collect)(struct nilfs_sc_info *, + struct buffer_head *, + struct inode *)) +{ + struct buffer_head *bh, *n; + int err = 0; + + if (collect) { + list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + err = collect(sci, bh, inode); + brelse(bh); + if (unlikely(err)) + goto dispose_buffers; + } + return 0; + } + + dispose_buffers: + while (!list_empty(listp)) { + bh = list_first_entry(listp, struct buffer_head, + b_assoc_buffers); + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return err; +} + +static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) +{ + /* Remaining number of blocks within segment buffer */ + return sci->sc_segbuf_nblocks - + (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks); +} + +static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, + struct inode *inode, + struct nilfs_sc_operations *sc_ops) +{ + LIST_HEAD(data_buffers); + LIST_HEAD(node_buffers); + int err; + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + size_t n, rest = nilfs_segctor_buffer_rest(sci); + + n = nilfs_lookup_dirty_data_buffers( + inode, &data_buffers, rest + 1, 0, LLONG_MAX); + if (n > rest) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, + sc_ops->collect_data); + BUG_ON(!err); /* always receive -E2BIG or true error */ + goto break_or_fail; + } + } + nilfs_lookup_dirty_node_buffers(inode, &node_buffers); + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, sc_ops->collect_data); + if (unlikely(err)) { + /* dispose node list */ + nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, NULL); + goto break_or_fail; + } + sci->sc_stage.flags |= NILFS_CF_NODE; + } + /* Collect node */ + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_node); + if (unlikely(err)) + goto break_or_fail; + + nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers); + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_bmap); + if (unlikely(err)) + goto break_or_fail; + + nilfs_segctor_end_finfo(sci, inode); + sci->sc_stage.flags &= ~NILFS_CF_NODE; + + break_or_fail: + return err; +} + +static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, + struct inode *inode) +{ + LIST_HEAD(data_buffers); + size_t n, rest = nilfs_segctor_buffer_rest(sci); + int err; + + n = 
nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1, + sci->sc_dsync_start, + sci->sc_dsync_end); + + err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers, + nilfs_collect_file_data); + if (!err) { + nilfs_segctor_end_finfo(sci, inode); + BUG_ON(n > rest); + /* always receive -E2BIG or true error if n > rest */ + } + return err; +} + +static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct list_head *head; + struct nilfs_inode_info *ii; + size_t ndone; + int err = 0; + + switch (sci->sc_stage.scnt) { + case NILFS_ST_INIT: + /* Pre-processes */ + sci->sc_stage.flags = 0; + + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) { + sci->sc_nblk_inc = 0; + sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; + if (mode == SC_LSEG_DSYNC) { + sci->sc_stage.scnt = NILFS_ST_DSYNC; + goto dsync_mode; + } + } + + sci->sc_stage.dirty_file_ptr = NULL; + sci->sc_stage.gc_inode_ptr = NULL; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DAT; + goto dat_stage; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_GC: + if (nilfs_doing_gc()) { + head = &sci->sc_gc_inodes; + ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr, + head, i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + err = nilfs_segctor_scan_file( + sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.gc_inode_ptr = list_entry( + ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } + sci->sc_stage.gc_inode_ptr = NULL; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_FILE: + head = &sci->sc_dirty_files; + ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, + i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + clear_bit(NILFS_I_DIRTY, &ii->i_state); + + err = nilfs_segctor_scan_file(sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.dirty_file_ptr = + list_entry(ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */ + /* XXX: required ? 
*/ + } + sci->sc_stage.dirty_file_ptr = NULL; + if (mode == SC_FLUSH_FILE) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; + sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; + /* Fall through */ + case NILFS_ST_IFILE: + err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; + /* Creating a checkpoint */ + err = nilfs_segctor_create_checkpoint(sci); + if (unlikely(err)) + break; + /* Fall through */ + case NILFS_ST_CPFILE: + err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SUFILE: + err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs, + sci->sc_nfreesegs, &ndone); + if (unlikely(err)) { + nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, ndone, + NULL); + break; + } + sci->sc_stage.flags |= NILFS_CF_SUFREED; + + err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_DAT: + dat_stage: + err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, + &nilfs_sc_dat_ops); + if (unlikely(err)) + break; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SR: + if (mode == SC_LSEG_SR) { + /* Appending a super root */ + err = nilfs_segctor_add_super_root(sci); + if (unlikely(err)) + break; + } + /* End of a logical segment */ + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DSYNC: + dsync_mode: + sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT; + ii = sci->sc_dsync_inode; + if (!test_bit(NILFS_I_BUSY, &ii->i_state)) + break; + + err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode); + if (unlikely(err)) + break; + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DONE: + return 0; + default: + BUG(); + } + + break_or_fail: + return err; +} + +/** + * nilfs_segctor_begin_construction - setup segment buffer to make a new log + * @sci: nilfs_sc_info + * @nilfs: nilfs object + */ +static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf, *prev; + __u64 nextnum; + int err, alloc = 0; + + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + return -ENOMEM; + + if (list_empty(&sci->sc_write_logs)) { + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, + nilfs->ns_pseg_offset, nilfs); + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_shift_to_next_segment(nilfs); + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + } + + segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + nextnum = nilfs->ns_nextnum; + + if (nilfs->ns_segnum == nilfs->ns_nextnum) + /* Start from the head of a new full segment */ + alloc++; + } else { + /* Continue logs */ + prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs); + nilfs_segbuf_map_cont(segbuf, prev); + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq; + nextnum = prev->sb_nextnum; + + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + segbuf->sb_sum.seg_seq++; + alloc++; + } + } + + err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum); + if (err) + goto failed; + + if (alloc) { + err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); + if (err) + goto 
failed; + } + nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); + + BUG_ON(!list_empty(&sci->sc_segbufs)); + list_add_tail(&segbuf->sb_list, &sci->sc_segbufs); + sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; + return 0; + + failed: + nilfs_segbuf_free(segbuf); + return err; +} + +static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int nadd) +{ + struct nilfs_segment_buffer *segbuf, *prev; + struct inode *sufile = nilfs->ns_sufile; + __u64 nextnextnum; + LIST_HEAD(list); + int err, ret, i; + + prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + /* + * Since the segment specified with nextnum might be allocated during + * the previous construction, the buffer including its segusage may + * not be dirty. The following call ensures that the buffer is dirty + * and will pin the buffer on memory until the sufile is written. + */ + err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum); + if (unlikely(err)) + return err; + + for (i = 0; i < nadd; i++) { + /* extend segment info */ + err = -ENOMEM; + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + goto failed; + + /* map this buffer to region of segment on-disk */ + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks; + + /* allocate the next next full segment */ + err = nilfs_sufile_alloc(sufile, &nextnextnum); + if (unlikely(err)) + goto failed_segbuf; + + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1; + nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs); + + list_add_tail(&segbuf->sb_list, &list); + prev = segbuf; + } + list_splice_tail(&list, &sci->sc_segbufs); + return 0; + + failed_segbuf: + nilfs_segbuf_free(segbuf); + failed: + list_for_each_entry(segbuf, &list, sb_list) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + nilfs_destroy_logs(&list); + return err; +} + +static void nilfs_free_incomplete_logs(struct list_head *logs, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf, *prev; + struct inode *sufile = nilfs->ns_sufile; + int ret; + + segbuf = NILFS_FIRST_SEGBUF(logs); + if (nilfs->ns_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + if (atomic_read(&segbuf->sb_err)) { + /* Case 1: The first segment failed */ + if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) + /* Case 1a: Partial segment appended into an existing + segment */ + nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, + segbuf->sb_fseg_end); + else /* Case 1b: New full segment */ + set_nilfs_discontinued(nilfs); + } + + prev = segbuf; + list_for_each_entry_continue(segbuf, logs, sb_list) { + if (prev->sb_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + if (atomic_read(&segbuf->sb_err) && + segbuf->sb_segnum != nilfs->ns_nextnum) + /* Case 2: extended segment (!= next) failed */ + nilfs_sufile_set_error(sufile, segbuf->sb_segnum); + prev = segbuf; + } +} + +static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + unsigned long live_blocks; + int ret; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + live_blocks = segbuf->sb_sum.nblocks + + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + live_blocks, + sci->sc_seg_ctime); + WARN_ON(ret); /* always succeed because 
the segusage is dirty */ + } +} + +static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + int ret; + + segbuf = NILFS_FIRST_SEGBUF(logs); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + segbuf->sb_pseg_start - + segbuf->sb_fseg_start, 0); + WARN_ON(ret); /* always succeed because the segusage is dirty */ + + list_for_each_entry_continue(segbuf, logs, sb_list) { + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + 0, 0); + WARN_ON(ret); /* always succeed */ + } +} + +static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *last, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf = last; + int ret; + + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { + sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); + } + nilfs_truncate_logs(&sci->sc_segbufs, last); +} + + +static int nilfs_segctor_collect(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int mode) +{ + struct nilfs_cstage prev_stage = sci->sc_stage; + int err, nadd = 1; + + /* Collection retry loop */ + for (;;) { + sci->sc_nblk_this_inc = 0; + sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + + err = nilfs_segctor_reset_segment_buffer(sci); + if (unlikely(err)) + goto failed; + + err = nilfs_segctor_collect_blocks(sci, mode); + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (!err) + break; + + if (unlikely(err != -E2BIG)) + goto failed; + + /* The current segment is filled up */ + if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) + break; + + nilfs_clear_logs(&sci->sc_segbufs); + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(err); /* do not happen */ + } + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); + sci->sc_stage = prev_stage; + } + nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); + return 0; + + failed: + return err; +} + +static void nilfs_list_replace_buffer(struct buffer_head *old_bh, + struct buffer_head *new_bh) +{ + BUG_ON(!list_empty(&new_bh->b_assoc_buffers)); + + list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers); + /* The caller must release old_bh */ +} + +static int +nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *segbuf, + int mode) +{ + struct inode *inode = NULL; + sector_t blocknr; + unsigned long nfinfo = segbuf->sb_sum.nfinfo; + unsigned long nblocks = 0, ndatablk = 0; + struct nilfs_sc_operations *sc_op = NULL; + struct nilfs_segsum_pointer ssp; + struct nilfs_finfo *finfo = NULL; + union nilfs_binfo binfo; + struct buffer_head *bh, *bh_org; + ino_t ino = 0; + int err = 0; + + if (!nfinfo) + goto out; + + blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk; + ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + ssp.offset = sizeof(struct nilfs_segment_summary); + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + if (bh == segbuf->sb_super_root) + break; + if (!finfo) { + finfo = nilfs_segctor_map_segsum_entry( + sci, &ssp, sizeof(*finfo)); + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + + inode = 
bh->b_page->mapping->host; + + if (mode == SC_LSEG_DSYNC) + sc_op = &nilfs_sc_dsync_ops; + else if (ino == NILFS_DAT_INO) + sc_op = &nilfs_sc_dat_ops; + else /* file blocks */ + sc_op = &nilfs_sc_file_ops; + } + bh_org = bh; + get_bh(bh_org); + err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr, + &binfo); + if (bh != bh_org) + nilfs_list_replace_buffer(bh_org, bh); + brelse(bh_org); + if (unlikely(err)) + goto failed_bmap; + + if (ndatablk > 0) + sc_op->write_data_binfo(sci, &ssp, &binfo); + else + sc_op->write_node_binfo(sci, &ssp, &binfo); + + blocknr++; + if (--nblocks == 0) { + finfo = NULL; + if (--nfinfo == 0) + break; + } else if (ndatablk > 0) + ndatablk--; + } + out: + return 0; + + failed_bmap: + return err; +} + +static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_segment_buffer *segbuf; + int err; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode); + if (unlikely(err)) + return err; + nilfs_segbuf_fill_in_segsum(segbuf); + } + return 0; +} + +static void nilfs_begin_page_io(struct page *page) +{ + if (!page || PageWriteback(page)) + /* For split b-tree node pages, this function may be called + twice. We ignore the 2nd or later calls by this check. */ + return; + + lock_page(page); + clear_page_dirty_for_io(page); + set_page_writeback(page); + unlock_page(page); +} + +static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == segbuf->sb_super_root) { + if (bh->b_page != bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_begin_page_io(fs_page); + fs_page = bh->b_page; + } + } + } + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + nilfs_begin_page_io(fs_page); +} + +static int nilfs_segctor_write(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + int ret; + + ret = nilfs_write_logs(&sci->sc_segbufs, nilfs); + list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); + return ret; +} + +static void nilfs_end_page_io(struct page *page, int err) +{ + if (!page) + return; + + if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) { + /* + * For b-tree node pages, this function may be called twice + * or more because they might be split in a segment. + */ + if (PageDirty(page)) { + /* + * For pages holding split b-tree node buffers, dirty + * flag on the buffers may be cleared discretely. + * In that case, the page is once redirtied for + * remaining buffers, and it must be cancelled if + * all the buffers get cleaned later. 
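+ * That cancellation is what the nilfs_page_buffers_clean() check below does.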
+ */ + lock_page(page); + if (nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + unlock_page(page); + } + return; + } + + if (!err) { + if (!nilfs_page_buffers_clean(page)) + __set_page_dirty_nobuffers(page); + ClearPageError(page); + } else { + __set_page_dirty_nobuffers(page); + SetPageError(page); + } + + end_page_writeback(page); +} + +static void nilfs_abort_logs(struct list_head *logs, int err) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct buffer_head *bh; + + if (list_empty(logs)) + return; + + list_for_each_entry(segbuf, logs, sb_list) { + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == segbuf->sb_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, err); + fs_page = bh->b_page; + } + } + } + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, err); +} + +static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int err) +{ + LIST_HEAD(logs); + int ret; + + list_splice_tail_init(&sci->sc_write_logs, &logs); + ret = nilfs_wait_on_logs(&logs); + nilfs_abort_logs(&logs, ret ? : err); + + list_splice_tail_init(&sci->sc_segbufs, &logs); + nilfs_cancel_segusage(&logs, nilfs->ns_sufile); + nilfs_free_incomplete_logs(&logs, nilfs); + + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(ret); /* do not happen */ + } + + nilfs_destroy_logs(&logs); +} + +static void nilfs_set_next_segment(struct the_nilfs *nilfs, + struct nilfs_segment_buffer *segbuf) +{ + nilfs->ns_segnum = segbuf->sb_segnum; + nilfs->ns_nextnum = segbuf->sb_nextnum; + nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start + + segbuf->sb_sum.nblocks; + nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq; + nilfs->ns_ctime = segbuf->sb_sum.ctime; +} + +static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int update_sr = false; + + list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + /* + * We assume that the buffers which belong to the same page + * continue over the buffer list. + * Under this assumption, the last BHs of pages is + * identifiable by the discontinuity of bh->b_page + * (page != fs_page). + * + * For B-tree node blocks, however, this assumption is not + * guaranteed. The cleanup code of B-tree node pages needs + * special care. 
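+ * (nilfs_end_page_io() is written to tolerate being called more than once per page.)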
+ */ + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + clear_buffer_delay(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_redirected(bh); + if (bh == segbuf->sb_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + update_sr = true; + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, 0); + fs_page = bh->b_page; + } + } + + if (!nilfs_segbuf_simplex(segbuf)) { + if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) { + set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + sci->sc_lseg_stime = jiffies; + } + if (segbuf->sb_sum.flags & NILFS_SS_LOGEND) + clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + } + } + /* + * Since pages may continue over multiple segment buffers, + * end of the last page must be checked outside of the loop. + */ + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, 0); + + nilfs_drop_collected_inodes(&sci->sc_dirty_files); + + if (nilfs_doing_gc()) + nilfs_drop_collected_inodes(&sci->sc_gc_inodes); + else + nilfs->ns_nongc_ctime = sci->sc_seg_ctime; + + sci->sc_nblk_inc += sci->sc_nblk_this_inc; + + segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs); + nilfs_set_next_segment(nilfs, segbuf); + + if (update_sr) { + nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, + segbuf->sb_sum.seg_seq, nilfs->ns_cno++); + + clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); + set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); + nilfs_segctor_clear_metadata_dirty(sci); + } else + clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); +} + +static int nilfs_segctor_wait(struct nilfs_sc_info *sci) +{ + int ret; + + ret = nilfs_wait_on_logs(&sci->sc_write_logs); + if (!ret) { + nilfs_segctor_complete_write(sci); + nilfs_destroy_logs(&sci->sc_write_logs); + } + return ret; +} + +static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_inode_info *ii, *n; + struct inode *ifile = sci->sc_root->ifile; + + spin_lock(&nilfs->ns_inode_lock); + retry: + list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) { + if (!ii->i_bh) { + struct buffer_head *ibh; + int err; + + spin_unlock(&nilfs->ns_inode_lock); + err = nilfs_ifile_get_inode_block( + ifile, ii->vfs_inode.i_ino, &ibh); + if (unlikely(err)) { + nilfs_warning(sci->sc_super, __func__, + "failed to get inode block.\n"); + return err; + } + mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(ifile); + spin_lock(&nilfs->ns_inode_lock); + if (likely(!ii->i_bh)) + ii->i_bh = ibh; + else + brelse(ibh); + goto retry; + } + + clear_bit(NILFS_I_QUEUED, &ii->i_state); + set_bit(NILFS_I_BUSY, &ii->i_state); + list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); + } + spin_unlock(&nilfs->ns_inode_lock); + + return 0; +} + +static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct nilfs_inode_info *ii, *n; + + spin_lock(&nilfs->ns_inode_lock); + list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { + if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || + test_bit(NILFS_I_DIRTY, &ii->i_state)) + continue; + + clear_bit(NILFS_I_BUSY, &ii->i_state); + brelse(ii->i_bh); + ii->i_bh = NULL; + list_move_tail(&ii->i_dirty, &ti->ti_garbage); + } + spin_unlock(&nilfs->ns_inode_lock); +} + +/* + * Main procedure of segment constructor + */ +static int 
nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int err; + + sci->sc_stage.scnt = NILFS_ST_INIT; + sci->sc_cno = nilfs->ns_cno; + + err = nilfs_segctor_collect_dirty_files(sci, nilfs); + if (unlikely(err)) + goto out; + + if (nilfs_test_metadata_dirty(nilfs, sci->sc_root)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + if (nilfs_segctor_clean(sci)) + goto out; + + do { + sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK; + + err = nilfs_segctor_begin_construction(sci, nilfs); + if (unlikely(err)) + goto out; + + /* Update time stamp */ + sci->sc_seg_ctime = get_seconds(); + + err = nilfs_segctor_collect(sci, nilfs, mode); + if (unlikely(err)) + goto failed; + + /* Avoid empty segment */ + if (sci->sc_stage.scnt == NILFS_ST_DONE && + nilfs_segbuf_empty(sci->sc_curseg)) { + nilfs_segctor_abort_construction(sci, nilfs, 1); + goto out; + } + + err = nilfs_segctor_assign(sci, mode); + if (unlikely(err)) + goto failed; + + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_segctor_fill_in_file_bmap(sci); + + if (mode == SC_LSEG_SR && + sci->sc_stage.scnt >= NILFS_ST_CPFILE) { + err = nilfs_segctor_fill_in_checkpoint(sci); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_fill_in_super_root(sci, nilfs); + } + nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); + + /* Write partial segments */ + nilfs_segctor_prepare_write(sci); + + nilfs_add_checksums_on_logs(&sci->sc_segbufs, + nilfs->ns_crc_seed); + + err = nilfs_segctor_write(sci, nilfs); + if (unlikely(err)) + goto failed_to_write; + + if (sci->sc_stage.scnt == NILFS_ST_DONE || + nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) { + /* + * At this point, we avoid double buffering + * for blocksize < pagesize because page dirty + * flag is turned off during write and dirty + * buffers are not properly collected for + * pages crossing over segments. + */ + err = nilfs_segctor_wait(sci); + if (err) + goto failed_to_write; + } + } while (sci->sc_stage.scnt != NILFS_ST_DONE); + + out: + nilfs_segctor_drop_written_files(sci, nilfs); + return err; + + failed_to_write: + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_redirty_inodes(&sci->sc_dirty_files); + + failed: + if (nilfs_doing_gc()) + nilfs_redirty_inodes(&sci->sc_gc_inodes); + nilfs_segctor_abort_construction(sci, nilfs, err); + goto out; +} + +/** + * nilfs_segctor_start_timer - set timer of background write + * @sci: nilfs_sc_info + * + * If the timer has already been set, it ignores the new request. + * This function MUST be called within a section locking the segment + * semaphore. + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { + sci->sc_timer.expires = jiffies + sci->sc_interval; + add_timer(&sci->sc_timer); + sci->sc_state |= NILFS_SEGCTOR_COMMIT; + } + spin_unlock(&sci->sc_state_lock); +} + +static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_flush_request & (1 << bn))) { + unsigned long prev_req = sci->sc_flush_request; + + sci->sc_flush_request |= (1 << bn); + if (!prev_req) + wake_up(&sci->sc_wait_daemon); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_flush_segment - trigger a segment construction for resource control + * @sb: super block + * @ino: inode number of the file to be flushed out. 
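+ * (data files share flush request bit 0; metadata files use the bit of their inode number)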
+ */ +void nilfs_flush_segment(struct super_block *sb, ino_t ino) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + + if (!sci || nilfs_doing_construction()) + return; + nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); + /* assign bit 0 to data files */ +} + +struct nilfs_segctor_wait_request { + wait_queue_t wq; + __u32 seq; + int err; + atomic_t done; +}; + +static int nilfs_segctor_sync(struct nilfs_sc_info *sci) +{ + struct nilfs_segctor_wait_request wait_req; + int err = 0; + + spin_lock(&sci->sc_state_lock); + init_wait(&wait_req.wq); + wait_req.err = 0; + atomic_set(&wait_req.done, 0); + wait_req.seq = ++sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + init_waitqueue_entry(&wait_req.wq, current); + add_wait_queue(&sci->sc_wait_request, &wait_req.wq); + set_current_state(TASK_INTERRUPTIBLE); + wake_up(&sci->sc_wait_daemon); + + for (;;) { + if (atomic_read(&wait_req.done)) { + err = wait_req.err; + break; + } + if (!signal_pending(current)) { + schedule(); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(&sci->sc_wait_request, &wait_req.wq); + return err; +} + +static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) +{ + struct nilfs_segctor_wait_request *wrq, *n; + unsigned long flags; + + spin_lock_irqsave(&sci->sc_wait_request.lock, flags); + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, + wq.task_list) { + if (!atomic_read(&wrq->done) && + nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { + wrq->err = err; + atomic_set(&wrq->done, 1); + } + if (atomic_read(&wrq->done)) { + wrq->wq.func(&wrq->wq, + TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, NULL); + } + } + spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags); +} + +/** + * nilfs_construct_segment - construct a logical segment + * @sb: super block + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_segment(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + struct nilfs_transaction_info *ti; + int err; + + if (!sci) + return -EROFS; + + /* A call inside transactions causes a deadlock. */ + BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); + + err = nilfs_segctor_sync(sci); + return err; +} + +/** + * nilfs_construct_dsync_segment - construct a data-only logical segment + * @sb: super block + * @inode: inode whose data blocks should be written out + * @start: start byte offset + * @end: end byte offset (inclusive) + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. 
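+ * + * A data-only log cannot always be used (e.g. when the inode metadata itself is dirty); in such cases a regular segment construction is performed instead.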
+ */ +int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, + loff_t start, loff_t end) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + struct nilfs_inode_info *ii; + struct nilfs_transaction_info ti; + int err = 0; + + if (!sci) + return -EROFS; + + nilfs_transaction_lock(sb, &ti, 0); + + ii = NILFS_I(inode); + if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || + nilfs_test_opt(nilfs, STRICT_ORDER) || + test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + nilfs_discontinued(nilfs)) { + nilfs_transaction_unlock(sb); + err = nilfs_segctor_sync(sci); + return err; + } + + spin_lock(&nilfs->ns_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + spin_unlock(&nilfs->ns_inode_lock); + nilfs_transaction_unlock(sb); + return 0; + } + spin_unlock(&nilfs->ns_inode_lock); + sci->sc_dsync_inode = ii; + sci->sc_dsync_start = start; + sci->sc_dsync_end = end; + + err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); + + nilfs_transaction_unlock(sb); + return err; +} + +#define FLUSH_FILE_BIT (0x1) /* data file only */ +#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ + +/** + * nilfs_segctor_accept - record accepted sequence count of log-write requests + * @sci: segment constructor object + */ +static void nilfs_segctor_accept(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + sci->sc_seq_accepted = sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + del_timer_sync(&sci->sc_timer); +} + +/** + * nilfs_segctor_notify - notify the result of request to caller threads + * @sci: segment constructor object + * @mode: mode of log forming + * @err: error code to be notified + */ +static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) +{ + /* Clear requests (even when the construction failed) */ + spin_lock(&sci->sc_state_lock); + + if (mode == SC_LSEG_SR) { + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; + sci->sc_seq_done = sci->sc_seq_accepted; + nilfs_segctor_wakeup(sci, err); + sci->sc_flush_request = 0; + } else { + if (mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + + /* re-enable timer if checkpoint creation was not done */ + if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_before(jiffies, sci->sc_timer.expires)) + add_timer(&sci->sc_timer); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_segctor_construct - form logs and write them to disk + * @sci: segment constructor object + * @mode: mode of log forming + */ +static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + struct nilfs_super_block **sbp; + int err = 0; + + nilfs_segctor_accept(sci); + + if (nilfs_discontinued(nilfs)) + mode = SC_LSEG_SR; + if (!nilfs_segctor_confirm(sci)) + err = nilfs_segctor_do_construct(sci, mode); + + if (likely(!err)) { + if (mode != SC_FLUSH_DAT) + atomic_set(&nilfs->ns_ndirtyblks, 0); + if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && + nilfs_discontinued(nilfs)) { + down_write(&nilfs->ns_sem); + err = -EIO; + sbp = nilfs_prepare_super(sci->sc_super, + nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + err = nilfs_commit_super(sci->sc_super, + NILFS_SB_COMMIT); + } + up_write(&nilfs->ns_sem); + } + } + + nilfs_segctor_notify(sci, mode, err); + return err; +} + +static void nilfs_construction_timeout(unsigned long data) +{ 
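+ /* Timer callback: 'data' holds the segctord task to wake up */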
+ struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + +static void +nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) +{ + struct nilfs_inode_info *ii, *n; + + list_for_each_entry_safe(ii, n, head, i_dirty) { + if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) + continue; + list_del_init(&ii->i_dirty); + iput(&ii->vfs_inode); + } +} + +int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, + void **kbufs) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci = nilfs->ns_writer; + struct nilfs_transaction_info ti; + int err; + + if (unlikely(!sci)) + return -EROFS; + + nilfs_transaction_lock(sb, &ti, 1); + + err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat); + if (unlikely(err)) + goto out_unlock; + + err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); + if (unlikely(err)) { + nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat); + goto out_unlock; + } + + sci->sc_freesegs = kbufs[4]; + sci->sc_nfreesegs = argv[4].v_nmembs; + list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); + + for (;;) { + err = nilfs_segctor_construct(sci, SC_LSEG_SR); + nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); + + if (likely(!err)) + break; + + nilfs_warning(sb, __func__, + "segment construction failed. (err=%d)", err); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(sci->sc_interval); + } + if (nilfs_test_opt(nilfs, DISCARD)) { + int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, + sci->sc_nfreesegs); + if (ret) { + printk(KERN_WARNING + "NILFS warning: error %d on discard request, " + "turning discards off for the device\n", ret); + nilfs_clear_opt(nilfs, DISCARD); + } + } + + out_unlock: + sci->sc_freesegs = NULL; + sci->sc_nfreesegs = 0; + nilfs_mdt_clear_shadow_map(nilfs->ns_dat); + nilfs_transaction_unlock(sb); + return err; +} + +static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_transaction_info ti; + + nilfs_transaction_lock(sci->sc_super, &ti, 0); + nilfs_segctor_construct(sci, mode); + + /* + * Unclosed segment should be retried. We do this using sc_timer. + * Timeout of sc_timer will invoke complete construction which leads + * to close the current logical segment. + */ + if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) + nilfs_segctor_start_timer(sci); + + nilfs_transaction_unlock(sci->sc_super); +} + +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) +{ + int mode = 0; + int err; + + spin_lock(&sci->sc_state_lock); + mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? + SC_FLUSH_DAT : SC_FLUSH_FILE; + spin_unlock(&sci->sc_state_lock); + + if (mode) { + err = nilfs_segctor_do_construct(sci, mode); + + spin_lock(&sci->sc_state_lock); + sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? + ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT; + spin_unlock(&sci->sc_state_lock); + } + clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); +} + +static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) +{ + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) { + if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT)) + return SC_FLUSH_FILE; + else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT)) + return SC_FLUSH_DAT; + } + return SC_LSEG_SR; +} + +/** + * nilfs_segctor_thread - main loop of the segment constructor thread. + * @arg: pointer to a struct nilfs_sc_info. 
+ * + * nilfs_segctor_thread() initializes a timer and serves as a daemon + * to execute segment constructions. + */ +static int nilfs_segctor_thread(void *arg) +{ + struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int timeout = 0; + + sci->sc_timer.data = (unsigned long)current; + sci->sc_timer.function = nilfs_construction_timeout; + + /* start sync. */ + sci->sc_task = current; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */ + printk(KERN_INFO + "segctord starting. Construction interval = %lu seconds, " + "CP frequency < %lu seconds\n", + sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + + spin_lock(&sci->sc_state_lock); + loop: + for (;;) { + int mode; + + if (sci->sc_state & NILFS_SEGCTOR_QUIT) + goto end_thread; + + if (timeout || sci->sc_seq_request != sci->sc_seq_done) + mode = SC_LSEG_SR; + else if (!sci->sc_flush_request) + break; + else + mode = nilfs_segctor_flush_mode(sci); + + spin_unlock(&sci->sc_state_lock); + nilfs_segctor_thread_construct(sci, mode); + spin_lock(&sci->sc_state_lock); + timeout = 0; + } + + + if (freezing(current)) { + spin_unlock(&sci->sc_state_lock); + refrigerator(); + spin_lock(&sci->sc_state_lock); + } else { + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&sci->sc_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + + if (sci->sc_seq_request != sci->sc_seq_done) + should_sleep = 0; + else if (sci->sc_flush_request) + should_sleep = 0; + else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) + should_sleep = time_before(jiffies, + sci->sc_timer.expires); + + if (should_sleep) { + spin_unlock(&sci->sc_state_lock); + schedule(); + spin_lock(&sci->sc_state_lock); + } + finish_wait(&sci->sc_wait_daemon, &wait); + timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_after_eq(jiffies, sci->sc_timer.expires)); + + if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); + } + goto loop; + + end_thread: + spin_unlock(&sci->sc_state_lock); + + /* end sync. 
*/ + sci->sc_task = NULL; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */ + return 0; +} + +static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci) +{ + struct task_struct *t; + + t = kthread_run(nilfs_segctor_thread, sci, "segctord"); + if (IS_ERR(t)) { + int err = PTR_ERR(t); + + printk(KERN_ERR "NILFS: error %d creating segctord thread\n", + err); + return err; + } + wait_event(sci->sc_wait_task, sci->sc_task != NULL); + return 0; +} + +static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) + __acquires(&sci->sc_state_lock) + __releases(&sci->sc_state_lock) +{ + sci->sc_state |= NILFS_SEGCTOR_QUIT; + + while (sci->sc_task) { + wake_up(&sci->sc_wait_daemon); + spin_unlock(&sci->sc_state_lock); + wait_event(sci->sc_wait_task, sci->sc_task == NULL); + spin_lock(&sci->sc_state_lock); + } +} + +/* + * Setup & clean-up functions + */ +static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, + struct nilfs_root *root) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_sc_info *sci; + + sci = kzalloc(sizeof(*sci), GFP_KERNEL); + if (!sci) + return NULL; + + sci->sc_super = sb; + + nilfs_get_root(root); + sci->sc_root = root; + + init_waitqueue_head(&sci->sc_wait_request); + init_waitqueue_head(&sci->sc_wait_daemon); + init_waitqueue_head(&sci->sc_wait_task); + spin_lock_init(&sci->sc_state_lock); + INIT_LIST_HEAD(&sci->sc_dirty_files); + INIT_LIST_HEAD(&sci->sc_segbufs); + INIT_LIST_HEAD(&sci->sc_write_logs); + INIT_LIST_HEAD(&sci->sc_gc_inodes); + init_timer(&sci->sc_timer); + + sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; + sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; + sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; + + if (nilfs->ns_interval) + sci->sc_interval = HZ * nilfs->ns_interval; + if (nilfs->ns_watermark) + sci->sc_watermark = nilfs->ns_watermark; + return sci; +} + +static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) +{ + int ret, retrycount = NILFS_SC_CLEANUP_RETRY; + + /* The segctord thread was stopped and its timer was removed. + But some tasks remain. */ + do { + struct nilfs_transaction_info ti; + + nilfs_transaction_lock(sci->sc_super, &ti, 0); + ret = nilfs_segctor_construct(sci, SC_LSEG_SR); + nilfs_transaction_unlock(sci->sc_super); + + } while (ret && retrycount-- > 0); +} + +/** + * nilfs_segctor_destroy - destroy the segment constructor. + * @sci: nilfs_sc_info + * + * nilfs_segctor_destroy() kills the segctord thread and frees + * the nilfs_sc_info struct. + * Caller must hold the segment semaphore. 
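+ * (the semaphore is dropped temporarily while remaining logs are written out)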
+ */ +static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int flag; + + up_write(&nilfs->ns_segctor_sem); + + spin_lock(&sci->sc_state_lock); + nilfs_segctor_kill_thread(sci); + flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request + || sci->sc_seq_request != sci->sc_seq_done); + spin_unlock(&sci->sc_state_lock); + + if (flag || !nilfs_segctor_confirm(sci)) + nilfs_segctor_write_out(sci); + + if (!list_empty(&sci->sc_dirty_files)) { + nilfs_warning(sci->sc_super, __func__, + "dirty file(s) after the final construction\n"); + nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); + } + + WARN_ON(!list_empty(&sci->sc_segbufs)); + WARN_ON(!list_empty(&sci->sc_write_logs)); + + nilfs_put_root(sci->sc_root); + + down_write(&nilfs->ns_segctor_sem); + + del_timer_sync(&sci->sc_timer); + kfree(sci); +} + +/** + * nilfs_attach_log_writer - attach log writer + * @sb: super block instance + * @root: root object of the current filesystem tree + * + * This allocates a log writer object, initializes it, and starts the + * log writer. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + int err; + + if (nilfs->ns_writer) { + /* + * This happens if the filesystem was remounted + * read/write after nilfs_error degenerated it into a + * read-only mount. + */ + nilfs_detach_log_writer(sb); + } + + nilfs->ns_writer = nilfs_segctor_new(sb, root); + if (!nilfs->ns_writer) + return -ENOMEM; + + err = nilfs_segctor_start_thread(nilfs->ns_writer); + if (err) { + kfree(nilfs->ns_writer); + nilfs->ns_writer = NULL; + } + return err; +} + +/** + * nilfs_detach_log_writer - destroy log writer + * @sb: super block instance + * + * This kills log writer daemon, frees the log writer object, and + * destroys list of dirty files. + */ +void nilfs_detach_log_writer(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + LIST_HEAD(garbage_list); + + down_write(&nilfs->ns_segctor_sem); + if (nilfs->ns_writer) { + nilfs_segctor_destroy(nilfs->ns_writer); + nilfs->ns_writer = NULL; + } + + /* Force to free the list of dirty files */ + spin_lock(&nilfs->ns_inode_lock); + if (!list_empty(&nilfs->ns_dirty_files)) { + list_splice_init(&nilfs->ns_dirty_files, &garbage_list); + nilfs_warning(sb, __func__, + "Hit dirty file after stopped log writer\n"); + } + spin_unlock(&nilfs->ns_inode_lock); + up_write(&nilfs->ns_segctor_sem); + + nilfs_dispose_list(nilfs, &garbage_list, 1); +} diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h new file mode 100644 index 00000000..38a1d001 --- /dev/null +++ b/fs/nilfs2/segment.h @@ -0,0 +1,246 @@ +/* + * segment.h - NILFS Segment constructor prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ +#ifndef _NILFS_SEGMENT_H +#define _NILFS_SEGMENT_H + +#include +#include +#include +#include +#include "nilfs.h" + +struct nilfs_root; + +/** + * struct nilfs_recovery_info - Recovery information + * @ri_need_recovery: Recovery status + * @ri_super_root: Block number of the last super root + * @ri_cno: Number of the last checkpoint + * @ri_lsegs_start: Region for roll-forwarding (start block number) + * @ri_lsegs_end: Region for roll-forwarding (end block number) + * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start + * @ri_used_segments: List of segments to be marked active + * @ri_pseg_start: Block number of the last partial segment + * @ri_seq: Sequence number on the last partial segment + * @ri_segnum: Segment number on the last partial segment + * @ri_nextnum: Next segment number on the last partial segment + */ +struct nilfs_recovery_info { + int ri_need_recovery; + sector_t ri_super_root; + __u64 ri_cno; + + sector_t ri_lsegs_start; + sector_t ri_lsegs_end; + u64 ri_lsegs_start_seq; + struct list_head ri_used_segments; + sector_t ri_pseg_start; + u64 ri_seq; + __u64 ri_segnum; + __u64 ri_nextnum; +}; + +/* ri_need_recovery */ +#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */ +#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */ + +/** + * struct nilfs_cstage - Context of collection stage + * @scnt: Stage count + * @flags: State flags + * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file + * @gc_inode_ptr: Pointer on the list of gc-inodes + */ +struct nilfs_cstage { + int scnt; + unsigned flags; + struct nilfs_inode_info *dirty_file_ptr; + struct nilfs_inode_info *gc_inode_ptr; +}; + +struct nilfs_segment_buffer; + +struct nilfs_segsum_pointer { + struct buffer_head *bh; + unsigned offset; /* offset in bytes */ +}; + +/** + * struct nilfs_sc_info - Segment constructor information + * @sc_super: Back pointer to super_block struct + * @sc_root: root object of the current filesystem tree + * @sc_nblk_inc: Block count of current generation + * @sc_dirty_files: List of files to be written + * @sc_gc_inodes: List of GC inodes having blocks to be written + * @sc_freesegs: array of segment numbers to be freed + * @sc_nfreesegs: number of segments on @sc_freesegs + * @sc_dsync_inode: inode whose data pages are written for a sync operation + * @sc_dsync_start: start byte offset of data pages + * @sc_dsync_end: end byte offset of data pages (inclusive) + * @sc_segbufs: List of segment buffers + * @sc_write_logs: List of segment buffers to hold logs under writing + * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
+ * @sc_curseg: Current segment buffer + * @sc_stage: Collection stage + * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary + * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary + * @sc_blk_cnt: Block count of a file + * @sc_datablk_cnt: Data block count of a file + * @sc_nblk_this_inc: Number of blocks included in the current logical segment + * @sc_seg_ctime: Creation time + * @sc_cno: checkpoint number of current log + * @sc_flags: Internal flags + * @sc_state_lock: spinlock for sc_state and so on + * @sc_state: Segctord state flags + * @sc_flush_request: inode bitmap of metadata files to be flushed + * @sc_wait_request: Client request queue + * @sc_wait_daemon: Daemon wait queue + * @sc_wait_task: Start/end wait queue to control segctord task + * @sc_seq_request: Request counter + * @sc_seq_accept: Accepted request count + * @sc_seq_done: Completion counter + * @sc_sync: Request of explicit sync operation + * @sc_interval: Timeout value of background construction + * @sc_mjcp_freq: Frequency of creating checkpoints + * @sc_lseg_stime: Start time of the latest logical segment + * @sc_watermark: Watermark for the number of dirty buffers + * @sc_timer: Timer for segctord + * @sc_task: current thread of segctord + */ +struct nilfs_sc_info { + struct super_block *sc_super; + struct nilfs_root *sc_root; + + unsigned long sc_nblk_inc; + + struct list_head sc_dirty_files; + struct list_head sc_gc_inodes; + + __u64 *sc_freesegs; + size_t sc_nfreesegs; + + struct nilfs_inode_info *sc_dsync_inode; + loff_t sc_dsync_start; + loff_t sc_dsync_end; + + /* Segment buffers */ + struct list_head sc_segbufs; + struct list_head sc_write_logs; + unsigned long sc_segbuf_nblocks; + struct nilfs_segment_buffer *sc_curseg; + + struct nilfs_cstage sc_stage; + + struct nilfs_segsum_pointer sc_finfo_ptr; + struct nilfs_segsum_pointer sc_binfo_ptr; + unsigned long sc_blk_cnt; + unsigned long sc_datablk_cnt; + unsigned long sc_nblk_this_inc; + time_t sc_seg_ctime; + __u64 sc_cno; + unsigned long sc_flags; + + spinlock_t sc_state_lock; + unsigned long sc_state; + unsigned long sc_flush_request; + + wait_queue_head_t sc_wait_request; + wait_queue_head_t sc_wait_daemon; + wait_queue_head_t sc_wait_task; + + __u32 sc_seq_request; + __u32 sc_seq_accepted; + __u32 sc_seq_done; + + int sc_sync; + unsigned long sc_interval; + unsigned long sc_mjcp_freq; + unsigned long sc_lseg_stime; /* in 1/HZ seconds */ + unsigned long sc_watermark; + + struct timer_list sc_timer; + struct task_struct *sc_task; +}; + +/* sc_flags */ +enum { + NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ + NILFS_SC_UNCLOSED, /* Logical segment is not closed */ + NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ + NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a + checkpoint */ + NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files + other than DAT, cpfile, sufile, or files + moved by GC */ +}; + +/* sc_state */ +#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */ +#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */ + +/* + * Constant parameters + */ +#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when + destroying segctord */ + +/* + * Default values of timeout, in seconds. + */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. 
+ It triggers construction of a + logical segment with a super root */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root + creation */ + +/* + * The default threshold amount of data, in block counts. + */ +#define NILFS_SC_DEFAULT_WATERMARK 3600 + +/* super.c */ +extern struct kmem_cache *nilfs_transaction_cachep; + +/* segment.c */ +extern void nilfs_relax_pressure_in_lock(struct super_block *); + +extern int nilfs_construct_segment(struct super_block *); +extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, + loff_t, loff_t); +extern void nilfs_flush_segment(struct super_block *, ino_t); +extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, + void **); + +int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root); +void nilfs_detach_log_writer(struct super_block *sb); + +/* recovery.c */ +extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t, + struct buffer_head **, int); +extern int nilfs_search_super_root(struct the_nilfs *, + struct nilfs_recovery_info *); +int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, + struct nilfs_recovery_info *ri); +extern void nilfs_dispose_segment_list(struct list_head *); + +#endif /* _NILFS_SEGMENT_H */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c new file mode 100644 index 00000000..0a0aba61 --- /dev/null +++ b/fs/nilfs2/sufile.c @@ -0,0 +1,921 @@ +/* + * sufile.c - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + * Revised by Ryusuke Konishi . 
+ */ + +#include +#include +#include +#include +#include +#include +#include "mdt.h" +#include "sufile.h" + + +struct nilfs_sufile_info { + struct nilfs_mdt_info mi; + unsigned long ncleansegs;/* number of clean segments */ + __u64 allocmin; /* lower limit of allocatable segment range */ + __u64 allocmax; /* upper limit of allocatable segment range */ +}; + +static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) +{ + return (struct nilfs_sufile_info *)NILFS_MDT(sufile); +} + +static inline unsigned long +nilfs_sufile_segment_usages_per_block(const struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_entries_per_block; +} + +static unsigned long +nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); + return (unsigned long)t; +} + +static unsigned long +nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); +} + +static unsigned long +nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, + __u64 max) +{ + return min_t(unsigned long, + nilfs_sufile_segment_usages_per_block(sufile) - + nilfs_sufile_get_offset(sufile, curr), + max - curr + 1); +} + +static struct nilfs_segment_usage * +nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, + struct buffer_head *bh, void *kaddr) +{ + return kaddr + bh_offset(bh) + + nilfs_sufile_get_offset(sufile, segnum) * + NILFS_MDT(sufile)->mi_entry_size; +} + +static inline int nilfs_sufile_get_header_block(struct inode *sufile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); +} + +static inline int +nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum), + create, NULL, bhp); +} + +static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile, + __u64 segnum) +{ + return nilfs_mdt_delete_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum)); +} + +static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, + u64 ncleanadd, u64 ndirtyadd) +{ + struct nilfs_sufile_header *header; + void *kaddr; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + le64_add_cpu(&header->sh_ncleansegs, ncleanadd); + le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(header_bh); +} + +/** + * nilfs_sufile_get_ncleansegs - return the number of clean segments + * @sufile: inode of segment usage file + */ +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) +{ + return NILFS_SUI(sufile)->ncleansegs; +} + +/** + * nilfs_sufile_updatev - modify multiple segment usages at a time + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @create: creation flag + * @ndone: place to store number of modified segments on @segnumv + * @dofunc: primitive operation for the update + * + * Description: nilfs_sufile_updatev() repeatedly calls @dofunc + * against the given array of segments. The @dofunc is called with + * buffers of a header block and the sufile block in which the target + * segment usage entry is contained. 
If @ndone is given, the number + * of successfully modified segments from the head is stored in the + * place @ndone points to. + * + * Return Value: On success, zero is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - Given segment usage is in hole block (may be returned if + * @create is zero) + * + * %-EINVAL - Invalid segment usage number + */ +int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, + int create, size_t *ndone, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + unsigned long blkoff, prev_blkoff; + __u64 *seg; + size_t nerr = 0, n = 0; + int ret = 0; + + if (unlikely(nsegs == 0)) + goto out; + + down_write(&NILFS_MDT(sufile)->mi_sem); + for (seg = segnumv; seg < segnumv + nsegs; seg++) { + if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING + "%s: invalid segment number: %llu\n", __func__, + (unsigned long long)*seg); + nerr++; + } + } + if (nerr > 0) { + ret = -EINVAL; + goto out_sem; + } + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + seg = segnumv; + blkoff = nilfs_sufile_get_blkoff(sufile, *seg); + ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); + if (ret < 0) + goto out_header; + + for (;;) { + dofunc(sufile, *seg, header_bh, bh); + + if (++seg >= segnumv + nsegs) + break; + prev_blkoff = blkoff; + blkoff = nilfs_sufile_get_blkoff(sufile, *seg); + if (blkoff == prev_blkoff) + continue; + + /* get different block */ + brelse(bh); + ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); + if (unlikely(ret < 0)) + goto out_header; + } + brelse(bh); + + out_header: + n = seg - segnumv; + brelse(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + out: + if (ndone) + *ndone = n; + return ret; +} + +int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + int ret; + + if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING "%s: invalid segment number: %llu\n", + __func__, (unsigned long long)segnum); + return -EINVAL; + } + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh); + if (!ret) { + dofunc(sufile, segnum, header_bh, bh); + brelse(bh); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_set_alloc_range - limit range of segment to be allocated + * @sufile: inode of segment usage file + * @start: minimum segment number of allocatable region (inclusive) + * @end: maximum segment number of allocatable region (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. 
+ * + * %-ERANGE - invalid segment region + */ +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) +{ + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + __u64 nsegs; + int ret = -ERANGE; + + down_write(&NILFS_MDT(sufile)->mi_sem); + nsegs = nilfs_sufile_get_nsegments(sufile); + + if (start <= end && end < nsegs) { + sui->allocmin = start; + sui->allocmax = end; + ret = 0; + } + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_alloc - allocate a segment + * @sufile: inode of segment usage file + * @segnump: pointer to segment number + * + * Description: nilfs_sufile_alloc() allocates a clean segment. + * + * Return Value: On success, 0 is returned and the segment number of the + * allocated segment is stored in the place pointed by @segnump. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No clean segment left. + */ +int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) +{ + struct buffer_head *header_bh, *su_bh; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + __u64 segnum, maxsegnum, last_alloc; + void *kaddr; + unsigned long nsegments, ncleansegs, nsus, cnt; + int ret, j; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + ncleansegs = le64_to_cpu(header->sh_ncleansegs); + last_alloc = le64_to_cpu(header->sh_last_alloc); + kunmap_atomic(kaddr, KM_USER0); + + nsegments = nilfs_sufile_get_nsegments(sufile); + maxsegnum = sui->allocmax; + segnum = last_alloc + 1; + if (segnum < sui->allocmin || segnum > sui->allocmax) + segnum = sui->allocmin; + + for (cnt = 0; cnt < nsegments; cnt += nsus) { + if (segnum > maxsegnum) { + if (cnt < sui->allocmax - sui->allocmin + 1) { + /* + * wrap around in the limited region. + * if allocation started from + * sui->allocmin, this never happens. 
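+ *
+ * For example, with allocmin = 100, allocmax = 199 and
+ * last_alloc = 150, segments 151..199 are scanned first;
+ * this branch then restarts the scan at segment 100 and
+ * lets it run up to 150 (the old last_alloc).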
+ */ + segnum = sui->allocmin; + maxsegnum = last_alloc; + } else if (segnum > sui->allocmin && + sui->allocmax + 1 < nsegments) { + segnum = sui->allocmax + 1; + maxsegnum = nsegments - 1; + } else if (sui->allocmin > 0) { + segnum = 0; + maxsegnum = sui->allocmin - 1; + } else { + break; /* never happens */ + } + } + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, + &su_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + + nsus = nilfs_sufile_segment_usages_in_block( + sufile, segnum, maxsegnum); + for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { + if (!nilfs_segment_usage_clean(su)) + continue; + /* found a clean segment */ + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + le64_add_cpu(&header->sh_ncleansegs, -1); + le64_add_cpu(&header->sh_ndirtysegs, 1); + header->sh_last_alloc = cpu_to_le64(segnum); + kunmap_atomic(kaddr, KM_USER0); + + sui->ncleansegs--; + mark_buffer_dirty(header_bh); + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + brelse(su_bh); + *segnump = segnum; + goto out_header; + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + + /* no segments left */ + ret = -ENOSPC; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (unlikely(!nilfs_segment_usage_clean(su))) { + printk(KERN_WARNING "%s: segment %llu must be clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + return; + } + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_sufile_mod_counter(header_bh, -1, 1); + NILFS_SUI(sufile)->ncleansegs--; + + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int clean, dirty; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) && + su->su_nblocks == cpu_to_le32(0)) { + kunmap_atomic(kaddr, KM_USER0); + return; + } + clean = nilfs_segment_usage_clean(su); + dirty = nilfs_segment_usage_dirty(su); + + /* make the segment garbage */ + su->su_lastmod = cpu_to_le64(0); + su->su_nblocks = cpu_to_le32(0); + su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 
0 : 1); + NILFS_SUI(sufile)->ncleansegs -= clean; + + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int sudirty; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_clean(su)) { + printk(KERN_WARNING "%s: segment %llu is already clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + return; + } + WARN_ON(nilfs_segment_usage_error(su)); + WARN_ON(!nilfs_segment_usage_dirty(su)); + + sudirty = nilfs_segment_usage_dirty(su); + nilfs_segment_usage_set_clean(su); + kunmap_atomic(kaddr, KM_USER0); + mark_buffer_dirty(su_bh); + + nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); + NILFS_SUI(sufile)->ncleansegs++; + + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty + * @sufile: inode of segment usage file + * @segnum: segment number + */ +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *bh; + int ret; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (!ret) { + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); + brelse(bh); + } + return ret; +} + +/** + * nilfs_sufile_set_segment_usage - set usage of a segment + * @sufile: inode of segment usage file + * @segnum: segment number + * @nblocks: number of live blocks in the segment + * @modtime: modification time (option) + */ +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime) +{ + struct buffer_head *bh; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + WARN_ON(nilfs_segment_usage_error(su)); + if (modtime) + su->su_lastmod = cpu_to_le64(modtime); + su->su_nblocks = cpu_to_le32(nblocks); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); + brelse(bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_stat - get segment usage statistics + * @sufile: inode of segment usage file + * @stat: pointer to a structure of segment usage statistics + * + * Description: nilfs_sufile_get_stat() returns information about segment + * usage. + * + * Return Value: On success, 0 is returned, and segment usage information is + * stored in the place pointed by @stat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
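+ *
+ * For illustration only (the caller and variable names here are
+ * hypothetical, not taken from this file), a typical use looks like:
+ *
+ *     struct nilfs_sustat sustat;
+ *     int err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
+ *
+ *     if (!err)
+ *             printk(KERN_INFO "NILFS: %llu clean / %llu dirty segments\n",
+ *                    (unsigned long long)sustat.ss_ncleansegs,
+ *                    (unsigned long long)sustat.ss_ndirtysegs);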
+ */ +int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) +{ + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); + sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); + sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); + sustat->ss_ctime = nilfs->ns_ctime; + sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; + spin_lock(&nilfs->ns_last_segment_lock); + sustat->ss_prot_seq = nilfs->ns_prot_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + + out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int suclean; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap_atomic(kaddr, KM_USER0); + return; + } + suclean = nilfs_segment_usage_clean(su); + nilfs_segment_usage_set_error(su); + kunmap_atomic(kaddr, KM_USER0); + + if (suclean) { + nilfs_sufile_mod_counter(header_bh, -1, 0); + NILFS_SUI(sufile)->ncleansegs--; + } + mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
+ * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ +static int nilfs_sufile_truncate_range(struct inode *sufile, + __u64 start, __u64 end) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct buffer_head *su_bh; + struct nilfs_segment_usage *su, *su2; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + unsigned long segusages_per_block; + unsigned long nsegs, ncleaned; + __u64 segnum; + void *kaddr; + ssize_t n, nc; + int ret; + int j; + + nsegs = nilfs_sufile_get_nsegments(sufile); + + ret = -EINVAL; + if (start > end || start >= nsegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + ncleaned = 0; + + for (segnum = start; segnum <= end; segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + end - segnum + 1); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_header; + /* hole */ + continue; + } + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + su2 = su; + for (j = 0; j < n; j++, su = (void *)su + susz) { + if ((le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) || + nilfs_segment_is_active(nilfs, segnum + j)) { + ret = -EBUSY; + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + goto out_header; + } + } + nc = 0; + for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) { + if (nilfs_segment_usage_error(su)) { + nilfs_segment_usage_set_clean(su); + nc++; + } + } + kunmap_atomic(kaddr, KM_USER0); + if (nc > 0) { + mark_buffer_dirty(su_bh); + ncleaned += nc; + } + brelse(su_bh); + + if (n == segusages_per_block) { + /* make hole */ + nilfs_sufile_delete_segment_usage_block(sufile, segnum); + } + } + ret = 0; + +out_header: + if (ncleaned > 0) { + NILFS_SUI(sufile)->ncleansegs += ncleaned; + nilfs_sufile_mod_counter(header_bh, ncleaned, 0); + nilfs_mdt_mark_dirty(sufile); + } + brelse(header_bh); +out: + return ret; +} + +/** + * nilfs_sufile_resize - resize segment array + * @sufile: inode of segment usage file + * @newnsegs: new number of segments + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
+ * + * %-ENOSPC - Enough free space is not left for shrinking + * + * %-EBUSY - Dirty or active segments exist in the region to be truncated + */ +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + void *kaddr; + unsigned long nsegs, nrsvsegs; + int ret = 0; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nsegs = nilfs_sufile_get_nsegments(sufile); + if (nsegs == newnsegs) + goto out; + + ret = -ENOSPC; + nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs); + if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + if (newnsegs > nsegs) { + sui->ncleansegs += newnsegs - nsegs; + } else /* newnsegs < nsegs */ { + ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1); + if (ret < 0) + goto out_header; + + sui->ncleansegs -= nsegs - newnsegs; + } + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(sufile); + nilfs_set_nsegments(nilfs, newnsegs); + +out_header: + brelse(header_bh); +out: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_suinfo - + * @sufile: inode of segment usage file + * @segnum: segment number to start looking + * @buf: array of suinfo + * @sisz: byte size of suinfo + * @nsi: size of suinfo array + * + * Description: + * + * Return Value: On success, 0 is returned and .... On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
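+ *
+ * Note: as the function body below shows, the value returned on success
+ * is the number of segment usage entries stored in @buf (at most @nsi),
+ * not zero.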
+ */ +ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, + unsigned sisz, size_t nsi) +{ + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + struct nilfs_suinfo *si = buf; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + void *kaddr; + unsigned long nsegs, segusages_per_block; + ssize_t n; + int ret, i, j; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + nsegs = min_t(unsigned long, + nilfs_sufile_get_nsegments(sufile) - segnum, + nsi); + for (i = 0; i < nsegs; i += n, segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + nsegs - i); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + /* hole */ + memset(si, 0, sisz * n); + si = (void *)si + sisz * n; + continue; + } + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + for (j = 0; j < n; + j++, su = (void *)su + susz, si = (void *)si + sisz) { + si->sui_lastmod = le64_to_cpu(su->su_lastmod); + si->sui_nblocks = le32_to_cpu(su->su_nblocks); + si->sui_flags = le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + if (nilfs_segment_is_active(nilfs, segnum + j)) + si->sui_flags |= + (1UL << NILFS_SEGMENT_USAGE_ACTIVE); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + ret = nsegs; + + out: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_read - read or get sufile inode + * @sb: super block instance + * @susize: size of a segment usage entry + * @raw_inode: on-disk sufile inode + * @inodep: buffer to store the inode + */ +int nilfs_sufile_read(struct super_block *sb, size_t susize, + struct nilfs_inode *raw_inode, struct inode **inodep) +{ + struct inode *sufile; + struct nilfs_sufile_info *sui; + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + void *kaddr; + int err; + + sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); + if (unlikely(!sufile)) + return -ENOMEM; + if (!(sufile->i_state & I_NEW)) + goto out; + + err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui)); + if (err) + goto failed; + + nilfs_mdt_set_entry_size(sufile, susize, + sizeof(struct nilfs_sufile_header)); + + err = nilfs_read_inode_common(sufile, raw_inode); + if (err) + goto failed; + + err = nilfs_sufile_get_header_block(sufile, &header_bh); + if (err) + goto failed; + + sui = NILFS_SUI(sufile); + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + + sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; + sui->allocmin = 0; + + unlock_new_inode(sufile); + out: + *inodep = sufile; + return 0; + failed: + iget_failed(sufile); + return err; +} diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h new file mode 100644 index 00000000..e84bc5b5 --- /dev/null +++ b/fs/nilfs2/sufile.h @@ -0,0 +1,144 @@ +/* + * sufile.h - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato . + */ + +#ifndef _NILFS_SUFILE_H +#define _NILFS_SUFILE_H + +#include +#include +#include +#include "mdt.h" + + +static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) +{ + return ((struct the_nilfs *)sufile->i_sb->s_fs_info)->ns_nsegments; +} + +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); + +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); +int nilfs_sufile_alloc(struct inode *, __u64 *); +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime); +int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, + size_t); + +int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); +int nilfs_sufile_update(struct inode *, __u64, int, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); +void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); + +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); +int nilfs_sufile_read(struct super_block *sb, size_t susize, + struct nilfs_inode *raw_inode, struct inode **inodep); + +/** + * nilfs_sufile_scrap - make a segment garbage + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap); +} + +/** + * nilfs_sufile_free - free segment + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_freev - free segments + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @ndone: place to store the number of freed segments + */ +static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv, + size_t nsegs, size_t *ndone) +{ + return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone, + nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_cancel_freev - reallocate freeing segments + * @sufile: inode of segment usage 
file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @ndone: place to store the number of cancelled segments + * + * Return Value: On success, 0 is returned. On error, a negative error codes + * is returned. + */ +static inline int nilfs_sufile_cancel_freev(struct inode *sufile, + __u64 *segnumv, size_t nsegs, + size_t *ndone) +{ + return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone, + nilfs_sufile_do_cancel_free); +} + +/** + * nilfs_sufile_set_error - mark a segment as erroneous + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_set_error() marks the segment specified by + * @segnum as erroneous. The error segment will never be used again. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, + nilfs_sufile_do_set_error); +} + +#endif /* _NILFS_SUFILE_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c new file mode 100644 index 00000000..8351c44a --- /dev/null +++ b/fs/nilfs2/super.c @@ -0,0 +1,1462 @@ +/* + * super.c - NILFS module and super block management. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + */ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "export.h" +#include "mdt.h" +#include "alloc.h" +#include "btree.h" +#include "btnode.h" +#include "page.h" +#include "cpfile.h" +#include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */ +#include "ifile.h" +#include "dat.h" +#include "segment.h" +#include "segbuf.h" + +MODULE_AUTHOR("NTT Corp."); +MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " + "(NILFS)"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *nilfs_inode_cachep; +struct kmem_cache *nilfs_transaction_cachep; +struct kmem_cache *nilfs_segbuf_cachep; +struct kmem_cache *nilfs_btree_path_cache; + +static int nilfs_setup_super(struct super_block *sb, int is_mount); +static int nilfs_remount(struct super_block *sb, int *flags, char *data); + +static void nilfs_set_error(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { + nilfs->ns_mount_state |= NILFS_ERROR_FS; + sbp = nilfs_prepare_super(sb, 0); + if (likely(sbp)) { + sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + if (sbp[1]) + sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); + } + } + up_write(&nilfs->ns_sem); +} + +/** + * nilfs_error() - report failure condition on a filesystem + * + * nilfs_error() sets an ERROR_FS flag on the superblock as well as + * reporting an error message. It should be called when NILFS detects + * incoherences or defects of meta data on disk. As for sustainable + * errors such as a single-shot I/O error, nilfs_warning() or the printk() + * function should be used instead. + * + * The segment constructor must not call this function because it can + * kill itself. + */ +void nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); + + if (!(sb->s_flags & MS_RDONLY)) { + nilfs_set_error(sb); + + if (nilfs_test_opt(nilfs, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + } + + if (nilfs_test_opt(nilfs, ERRORS_PANIC)) + panic("NILFS (device %s): panic forced after error\n", + sb->s_id); +} + +void nilfs_warning(struct super_block *sb, const char *function, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); +} + + +struct inode *nilfs_alloc_inode(struct super_block *sb) +{ + struct nilfs_inode_info *ii; + + ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); + if (!ii) + return NULL; + ii->i_bh = NULL; + ii->i_state = 0; + ii->i_cno = 0; + ii->vfs_inode.i_version = 1; + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); + return &ii->vfs_inode; +} + +static void nilfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + + if (mdi) { + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ + kfree(mdi); + } + kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); +} + +void nilfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nilfs_i_callback); +} + +static int nilfs_sync_super(struct super_block *sb, int flag) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + int err; + + retry: + set_buffer_dirty(nilfs->ns_sbh[0]); + if (nilfs_test_opt(nilfs, BARRIER)) { + err = __sync_dirty_buffer(nilfs->ns_sbh[0], + WRITE_SYNC | WRITE_FLUSH_FUA); + } else { + err = sync_dirty_buffer(nilfs->ns_sbh[0]); + } + + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: unable to write superblock (err=%d)\n", err); + if (err == -EIO && nilfs->ns_sbh[1]) { + /* + * sbp[0] points to newer log than sbp[1], + * so copy sbp[0] to sbp[1] to take over sbp[0]. + */ + memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0], + nilfs->ns_sbsize); + nilfs_fall_back_super_block(nilfs); + goto retry; + } + } else { + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbwcount++; + + /* + * The latest segment becomes trailable from the position + * written in superblock. + */ + clear_nilfs_discontinued(nilfs); + + /* update GC protection for recent segments */ + if (nilfs->ns_sbh[1]) { + if (flag == NILFS_SB_COMMIT_ALL) { + set_buffer_dirty(nilfs->ns_sbh[1]); + if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0) + goto out; + } + if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) < + le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno)) + sbp = nilfs->ns_sbp[1]; + } + + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } + out: + return err; +} + +void nilfs_set_log_cursor(struct nilfs_super_block *sbp, + struct the_nilfs *nilfs) +{ + sector_t nfreeblocks; + + /* nilfs->ns_sem must be locked by the caller. */ + nilfs_count_free_blocks(nilfs, &nfreeblocks); + sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); + + spin_lock(&nilfs->ns_last_segment_lock); + sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); +} + +struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, + int flip) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + /* nilfs->ns_sem must be locked by the caller. 
*/ + if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + if (sbp[1] && + sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + } else { + printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", + sb->s_id); + return NULL; + } + } else if (sbp[1] && + sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + } + + if (flip && sbp[1]) + nilfs_swap_super_block(nilfs); + + return sbp; +} + +int nilfs_commit_super(struct super_block *sb, int flag) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + time_t t; + + /* nilfs->ns_sem must be locked by the caller. */ + t = get_seconds(); + nilfs->ns_sbwtime = t; + sbp[0]->s_wtime = cpu_to_le64(t); + sbp[0]->s_sum = 0; + sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[0], + nilfs->ns_sbsize)); + if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) { + sbp[1]->s_wtime = sbp[0]->s_wtime; + sbp[1]->s_sum = 0; + sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[1], + nilfs->ns_sbsize)); + } + clear_nilfs_sb_dirty(nilfs); + return nilfs_sync_super(sb, flag); +} + +/** + * nilfs_cleanup_super() - write filesystem state for cleanup + * @sb: super block instance to be unmounted or degraded to read-only + * + * This function restores state flags in the on-disk super block. + * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the + * filesystem was not clean previously. + */ +int nilfs_cleanup_super(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + int flag = NILFS_SB_COMMIT; + int ret = -EIO; + + sbp = nilfs_prepare_super(sb, 0); + if (sbp) { + sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + nilfs_set_log_cursor(sbp[0], nilfs); + if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { + /* + * make the "clean" flag also to the opposite + * super block if both super blocks point to + * the same checkpoint. + */ + sbp[1]->s_state = sbp[0]->s_state; + flag = NILFS_SB_COMMIT_ALL; + } + ret = nilfs_commit_super(sb, flag); + } + return ret; +} + +/** + * nilfs_move_2nd_super - relocate secondary super block + * @sb: super block instance + * @sb2off: new offset of the secondary super block (in bytes) + */ +static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct buffer_head *nsbh; + struct nilfs_super_block *nsbp; + sector_t blocknr, newblocknr; + unsigned long offset; + int sb2i = -1; /* array index of the secondary superblock */ + int ret = 0; + + /* nilfs->ns_sem must be locked by the caller. 
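+ * (nilfs_resize_fs() below takes ns_sem for writing around this call.)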
*/ + if (nilfs->ns_sbh[1] && + nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 1; + blocknr = nilfs->ns_sbh[1]->b_blocknr; + } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 0; + blocknr = nilfs->ns_sbh[0]->b_blocknr; + } + if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) + goto out; /* super block location is unchanged */ + + /* Get new super block buffer */ + newblocknr = sb2off >> nilfs->ns_blocksize_bits; + offset = sb2off & (nilfs->ns_blocksize - 1); + nsbh = sb_getblk(sb, newblocknr); + if (!nsbh) { + printk(KERN_WARNING + "NILFS warning: unable to move secondary superblock " + "to block %llu\n", (unsigned long long)newblocknr); + ret = -EIO; + goto out; + } + nsbp = (void *)nsbh->b_data + offset; + memset(nsbp, 0, nilfs->ns_blocksize); + + if (sb2i >= 0) { + memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + brelse(nilfs->ns_sbh[sb2i]); + nilfs->ns_sbh[sb2i] = nsbh; + nilfs->ns_sbp[sb2i] = nsbp; + } else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) { + /* secondary super block will be restored to index 1 */ + nilfs->ns_sbh[1] = nsbh; + nilfs->ns_sbp[1] = nsbp; + } else { + brelse(nsbh); + } +out: + return ret; +} + +/** + * nilfs_resize_fs - resize the filesystem + * @sb: super block instance + * @newsize: new size of the filesystem (in bytes) + */ +int nilfs_resize_fs(struct super_block *sb, __u64 newsize) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + __u64 devsize, newnsegs; + loff_t sb2off; + int ret; + + ret = -ERANGE; + devsize = i_size_read(sb->s_bdev->bd_inode); + if (newsize > devsize) + goto out; + + /* + * Write lock is required to protect some functions depending + * on the number of segments, the number of reserved segments, + * and so forth. + */ + down_write(&nilfs->ns_segctor_sem); + + sb2off = NILFS_SB2_OFFSET_BYTES(newsize); + newnsegs = sb2off >> nilfs->ns_blocksize_bits; + do_div(newnsegs, nilfs->ns_blocks_per_segment); + + ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs); + up_write(&nilfs->ns_segctor_sem); + if (ret < 0) + goto out; + + ret = nilfs_construct_segment(sb); + if (ret < 0) + goto out; + + down_write(&nilfs->ns_sem); + nilfs_move_2nd_super(sb, sb2off); + ret = -EIO; + sbp = nilfs_prepare_super(sb, 0); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + /* + * Drop NILFS_RESIZE_FS flag for compatibility with + * mount-time resize which may be implemented in a + * future release. + */ + sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & + ~NILFS_RESIZE_FS); + sbp[0]->s_dev_size = cpu_to_le64(newsize); + sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments); + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); + } + up_write(&nilfs->ns_sem); + + /* + * Reset the range of allocatable segments last. This order + * is important in the case of expansion because the secondary + * superblock must be protected from log write until migration + * completes. 
+ */ + if (!ret) + nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1); +out: + return ret; +} + +static void nilfs_put_super(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + nilfs_detach_log_writer(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + } + + iput(nilfs->ns_sufile); + iput(nilfs->ns_cpfile); + iput(nilfs->ns_dat); + + destroy_nilfs(nilfs); + sb->s_fs_info = NULL; +} + +static int nilfs_sync_fs(struct super_block *sb, int wait) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + int err = 0; + + /* This function is called when super block should be written back */ + if (wait) + err = nilfs_construct_segment(sb); + + down_write(&nilfs->ns_sem); + if (nilfs_sb_dirty(nilfs)) { + sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + nilfs_commit_super(sb, NILFS_SB_COMMIT); + } + } + up_write(&nilfs->ns_sem); + + return err; +} + +int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, + struct nilfs_root **rootp) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_root *root; + struct nilfs_checkpoint *raw_cp; + struct buffer_head *bh_cp; + int err = -ENOMEM; + + root = nilfs_find_or_create_root( + nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno); + if (!root) + return err; + + if (root->ifile) + goto reuse; /* already attached checkpoint */ + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, + &bh_cp); + up_read(&nilfs->ns_segctor_sem); + if (unlikely(err)) { + if (err == -ENOENT || err == -EINVAL) { + printk(KERN_ERR + "NILFS: Invalid checkpoint " + "(checkpoint number=%llu)\n", + (unsigned long long)cno); + err = -EINVAL; + } + goto failed; + } + + err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size, + &raw_cp->cp_ifile_inode, &root->ifile); + if (err) + goto failed_bh; + + atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); + atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); + + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + + reuse: + *rootp = root; + return 0; + + failed_bh: + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + failed: + nilfs_put_root(root); + + return err; +} + +static int nilfs_freeze(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + int err; + + if (sb->s_flags & MS_RDONLY) + return 0; + + /* Mark super block clean */ + down_write(&nilfs->ns_sem); + err = nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + return err; +} + +static int nilfs_unfreeze(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + if (sb->s_flags & MS_RDONLY) + return 0; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sb, false); + up_write(&nilfs->ns_sem); + return 0; +} + +static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; + struct the_nilfs *nilfs = root->nilfs; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + unsigned long long blocks; + unsigned long overhead; + unsigned long nrsvblocks; + sector_t nfreeblocks; + int err; + + /* + * Compute all of the segment blocks + * + * The blocks before first segment and after last segment + * are excluded. 
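+ * That is, f_blocks below works out to
+ * ns_blocks_per_segment * ns_nsegments - ns_first_data_block,
+ * with no extra metadata overhead subtracted (overhead is 0 here).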
+ */ + blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments + - nilfs->ns_first_data_block; + nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; + + /* + * Compute the overhead + * + * When distributing meta data blocks outside segment structure, + * We must count them as the overhead. + */ + overhead = 0; + + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) + return err; + + buf->f_type = NILFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = blocks - overhead; + buf->f_bfree = nfreeblocks; + buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? + (buf->f_bfree - nrsvblocks) : 0; + buf->f_files = atomic_read(&root->inodes_count); + buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ + buf->f_namelen = NILFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root; + + if (!nilfs_test_opt(nilfs, BARRIER)) + seq_puts(seq, ",nobarrier"); + if (root->cno != NILFS_CPTREE_CURRENT_CNO) + seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno); + if (nilfs_test_opt(nilfs, ERRORS_PANIC)) + seq_puts(seq, ",errors=panic"); + if (nilfs_test_opt(nilfs, ERRORS_CONT)) + seq_puts(seq, ",errors=continue"); + if (nilfs_test_opt(nilfs, STRICT_ORDER)) + seq_puts(seq, ",order=strict"); + if (nilfs_test_opt(nilfs, NORECOVERY)) + seq_puts(seq, ",norecovery"); + if (nilfs_test_opt(nilfs, DISCARD)) + seq_puts(seq, ",discard"); + + return 0; +} + +static const struct super_operations nilfs_sops = { + .alloc_inode = nilfs_alloc_inode, + .destroy_inode = nilfs_destroy_inode, + .dirty_inode = nilfs_dirty_inode, + /* .write_inode = nilfs_write_inode, */ + /* .put_inode = nilfs_put_inode, */ + /* .drop_inode = nilfs_drop_inode, */ + .evict_inode = nilfs_evict_inode, + .put_super = nilfs_put_super, + /* .write_super = nilfs_write_super, */ + .sync_fs = nilfs_sync_fs, + .freeze_fs = nilfs_freeze, + .unfreeze_fs = nilfs_unfreeze, + /* .write_super_lockfs */ + /* .unlockfs */ + .statfs = nilfs_statfs, + .remount_fs = nilfs_remount, + /* .umount_begin */ + .show_options = nilfs_show_options +}; + +enum { + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, + Opt_discard, Opt_nodiscard, Opt_err, +}; + +static match_table_t tokens = { + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_snapshot, "cp=%u"}, + {Opt_order, "order=%s"}, + {Opt_norecovery, "norecovery"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb, int is_remount) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + nilfs_set_opt(nilfs, BARRIER); + break; + case Opt_nobarrier: + nilfs_clear_opt(nilfs, BARRIER); + break; + case Opt_order: + if (strcmp(args[0].from, "relaxed") == 0) + /* Ordered data semantics */ + nilfs_clear_opt(nilfs, STRICT_ORDER); + else if (strcmp(args[0].from, "strict") == 0) + /* Strict in-order 
semantics */ + nilfs_set_opt(nilfs, STRICT_ORDER); + else + return 0; + break; + case Opt_err_panic: + nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC); + break; + case Opt_err_ro: + nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO); + break; + case Opt_err_cont: + nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT); + break; + case Opt_snapshot: + if (is_remount) { + printk(KERN_ERR + "NILFS: \"%s\" option is invalid " + "for remount.\n", p); + return 0; + } + break; + case Opt_norecovery: + nilfs_set_opt(nilfs, NORECOVERY); + break; + case Opt_discard: + nilfs_set_opt(nilfs, DISCARD); + break; + case Opt_nodiscard: + nilfs_clear_opt(nilfs, DISCARD); + break; + default: + printk(KERN_ERR + "NILFS: Unrecognized mount option \"%s\"\n", p); + return 0; + } + } + return 1; +} + +static inline void +nilfs_set_default_options(struct super_block *sb, + struct nilfs_super_block *sbp) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + nilfs->ns_mount_opt = + NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; +} + +static int nilfs_setup_super(struct super_block *sb, int is_mount) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + int max_mnt_count; + int mnt_count; + + /* nilfs->ns_sem must be locked by the caller. */ + sbp = nilfs_prepare_super(sb, 0); + if (!sbp) + return -EIO; + + if (!is_mount) + goto skip_mount_setup; + + max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); + mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); + + if (nilfs->ns_mount_state & NILFS_ERROR_FS) { + printk(KERN_WARNING + "NILFS warning: mounting fs with errors\n"); +#if 0 + } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { + printk(KERN_WARNING + "NILFS warning: maximal mount count reached\n"); +#endif + } + if (!max_mnt_count) + sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); + + sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); + sbp[0]->s_mtime = cpu_to_le64(get_seconds()); + +skip_mount_setup: + sbp[0]->s_state = + cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); + /* synchronize sbp[1] with sbp[0] */ + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); +} + +struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, + u64 pos, int blocksize, + struct buffer_head **pbh) +{ + unsigned long long sb_index = pos; + unsigned long offset; + + offset = do_div(sb_index, blocksize); + *pbh = sb_bread(sb, sb_index); + if (!*pbh) + return NULL; + return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); +} + +int nilfs_store_magic_and_option(struct super_block *sb, + struct nilfs_super_block *sbp, + char *data) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + + sb->s_magic = le16_to_cpu(sbp->s_magic); + + /* FS independent flags */ +#ifdef NILFS_ATIME_DISABLE + sb->s_flags |= MS_NOATIME; +#endif + + nilfs_set_default_options(sb, sbp); + + nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid); + nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid); + nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval); + nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max); + + return !parse_options(data, sb, 0) ? 
-EINVAL : 0 ; +} + +int nilfs_check_feature_compatibility(struct super_block *sb, + struct nilfs_super_block *sbp) +{ + __u64 features; + + features = le64_to_cpu(sbp->s_feature_incompat) & + ~NILFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + features = le64_to_cpu(sbp->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "NILFS: couldn't mount RDWR because of " + "unsupported optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + return 0; +} + +static int nilfs_get_root_dentry(struct super_block *sb, + struct nilfs_root *root, + struct dentry **root_dentry) +{ + struct inode *inode; + struct dentry *dentry; + int ret = 0; + + inode = nilfs_iget(sb, root, NILFS_ROOT_INO); + if (IS_ERR(inode)) { + printk(KERN_ERR "NILFS: get root inode failed\n"); + ret = PTR_ERR(inode); + goto out; + } + if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) { + iput(inode); + printk(KERN_ERR "NILFS: corrupt root inode.\n"); + ret = -EINVAL; + goto out; + } + + if (root->cno == NILFS_CPTREE_CURRENT_CNO) { + dentry = d_find_alias(inode); + if (!dentry) { + dentry = d_alloc_root(inode); + if (!dentry) { + iput(inode); + ret = -ENOMEM; + goto failed_dentry; + } + } else { + iput(inode); + } + } else { + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto failed_dentry; + } + } + *root_dentry = dentry; + out: + return ret; + + failed_dentry: + printk(KERN_ERR "NILFS: get root dentry failed\n"); + goto out; +} + +static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, + struct dentry **root_dentry) +{ + struct the_nilfs *nilfs = s->s_fs_info; + struct nilfs_root *root; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) { + ret = (ret == -ENOENT) ? -EINVAL : ret; + goto out; + } else if (!ret) { + printk(KERN_ERR "NILFS: The specified checkpoint is " + "not a snapshot (checkpoint number=%llu).\n", + (unsigned long long)cno); + ret = -EINVAL; + goto out; + } + + ret = nilfs_attach_checkpoint(s, cno, false, &root); + if (ret) { + printk(KERN_ERR "NILFS: error loading snapshot " + "(checkpoint number=%llu).\n", + (unsigned long long)cno); + goto out; + } + ret = nilfs_get_root_dentry(s, root, root_dentry); + nilfs_put_root(root); + out: + return ret; +} + +static int nilfs_tree_was_touched(struct dentry *root_dentry) +{ + return root_dentry->d_count > 1; +} + +/** + * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint + * @root_dentry: root dentry of the tree to be shrunk + * + * This function returns true if the tree was in-use. 
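+ * + * If the tree has submounts it is left untouched and reported as busy; + * otherwise unused dentries are pruned with shrink_dcache_parent() and + * the root dentry's reference count is checked again.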
+ */ +static int nilfs_try_to_shrink_tree(struct dentry *root_dentry) +{ + if (have_submounts(root_dentry)) + return true; + shrink_dcache_parent(root_dentry); + return nilfs_tree_was_touched(root_dentry); +} + +int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_root *root; + struct inode *inode; + struct dentry *dentry; + int ret; + + if (cno < 0 || cno > nilfs->ns_cno) + return false; + + if (cno >= nilfs_last_cno(nilfs)) + return true; /* protect recent checkpoints */ + + ret = false; + root = nilfs_lookup_root(nilfs, cno); + if (root) { + inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO); + if (inode) { + dentry = d_find_alias(inode); + if (dentry) { + if (nilfs_tree_was_touched(dentry)) + ret = nilfs_try_to_shrink_tree(dentry); + dput(dentry); + } + iput(inode); + } + nilfs_put_root(root); + } + return ret; +} + +/** + * nilfs_fill_super() - initialize a super block instance + * @sb: super_block + * @data: mount options + * @silent: silent mode flag + * + * This function is called exclusively by nilfs->ns_mount_mutex. + * So, the recovery process is protected from other simultaneous mounts. + */ +static int +nilfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct the_nilfs *nilfs; + struct nilfs_root *fsroot; + struct backing_dev_info *bdi; + __u64 cno; + int err; + + nilfs = alloc_nilfs(sb->s_bdev); + if (!nilfs) + return -ENOMEM; + + sb->s_fs_info = nilfs; + + err = init_nilfs(nilfs, sb, (char *)data); + if (err) + goto failed_nilfs; + + sb->s_op = &nilfs_sops; + sb->s_export_op = &nilfs_export_ops; + sb->s_root = NULL; + sb->s_time_gran = 1; + + bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + sb->s_bdi = bdi ? : &default_backing_dev_info; + + err = load_nilfs(nilfs, sb); + if (err) + goto failed_nilfs; + + cno = nilfs_last_cno(nilfs); + err = nilfs_attach_checkpoint(sb, cno, true, &fsroot); + if (err) { + printk(KERN_ERR "NILFS: error loading last checkpoint " + "(checkpoint number=%llu).\n", (unsigned long long)cno); + goto failed_unload; + } + + if (!(sb->s_flags & MS_RDONLY)) { + err = nilfs_attach_log_writer(sb, fsroot); + if (err) + goto failed_checkpoint; + } + + err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root); + if (err) + goto failed_segctor; + + nilfs_put_root(fsroot); + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_setup_super(sb, true); + up_write(&nilfs->ns_sem); + } + + return 0; + + failed_segctor: + nilfs_detach_log_writer(sb); + + failed_checkpoint: + nilfs_put_root(fsroot); + + failed_unload: + iput(nilfs->ns_sufile); + iput(nilfs->ns_cpfile); + iput(nilfs->ns_dat); + + failed_nilfs: + destroy_nilfs(nilfs); + return err; +} + +static int nilfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + unsigned long old_sb_flags; + unsigned long old_mount_opt; + int err; + + old_sb_flags = sb->s_flags; + old_mount_opt = nilfs->ns_mount_opt; + + if (!parse_options(data, sb, 1)) { + err = -EINVAL; + goto restore_opts; + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + + err = -EINVAL; + + if (!nilfs_valid_fs(nilfs)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because the filesystem is in an " + "incomplete recovery state.\n", sb->s_id); + goto restore_opts; + } + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out; + if (*flags & MS_RDONLY) { + /* Shutting down log writer */ + nilfs_detach_log_writer(sb); + sb->s_flags |= MS_RDONLY; + + /* + * 
Remounting a valid RW partition RDONLY, so set + * the RDONLY flag and then mark the partition as valid again. + */ + down_write(&nilfs->ns_sem); + nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + } else { + __u64 features; + struct nilfs_root *root; + + /* + * Mounting a RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by fsck since we originally mounted the partition.) + */ + down_read(&nilfs->ns_sem); + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + up_read(&nilfs->ns_sem); + if (features) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount RDWR because of unsupported optional " + "features (%llx)\n", + sb->s_id, (unsigned long long)features); + err = -EROFS; + goto restore_opts; + } + + sb->s_flags &= ~MS_RDONLY; + + root = NILFS_I(sb->s_root->d_inode)->i_root; + err = nilfs_attach_log_writer(sb, root); + if (err) + goto restore_opts; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sb, true); + up_write(&nilfs->ns_sem); + } + out: + return 0; + + restore_opts: + sb->s_flags = old_sb_flags; + nilfs->ns_mount_opt = old_mount_opt; + return err; +} + +struct nilfs_super_data { + struct block_device *bdev; + __u64 cno; + int flags; +}; + +/** + * nilfs_identify - pre-read mount options needed to identify mount instance + * @data: mount options + * @sd: nilfs_super_data + */ +static int nilfs_identify(char *data, struct nilfs_super_data *sd) +{ + char *p, *options = data; + substring_t args[MAX_OPT_ARGS]; + int token; + int ret = 0; + + do { + p = strsep(&options, ","); + if (p != NULL && *p) { + token = match_token(p, tokens, args); + if (token == Opt_snapshot) { + if (!(sd->flags & MS_RDONLY)) { + ret++; + } else { + sd->cno = simple_strtoull(args[0].from, + NULL, 0); + /* + * No need to see the end pointer; + * match_token() has done syntax + * checking. 
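+ * A checkpoint number of zero is rejected below; cno 0 is reserved + * for the current tree (NILFS_CPTREE_CURRENT_CNO).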
+ */ + if (sd->cno == 0) + ret++; + } + } + if (ret) + printk(KERN_ERR + "NILFS: invalid mount option: %s\n", p); + } + if (!options) + break; + BUG_ON(options == data); + *(options - 1) = ','; + } while (!ret); + return ret; +} + +static int nilfs_set_bdev_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int nilfs_test_bdev_super(struct super_block *s, void *data) +{ + return (void *)s->s_bdev == data; +} + +static struct dentry * +nilfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct nilfs_super_data sd; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + struct dentry *root_dentry; + int err, s_new = false; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(sd.bdev)) + return ERR_CAST(sd.bdev); + + sd.cno = 0; + sd.flags = flags; + if (nilfs_identify((char *)data, &sd)) { + err = -EINVAL; + goto failed; + } + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&sd.bdev->bd_fsfreeze_mutex); + if (sd.bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); + err = -EBUSY; + goto failed; + } + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev); + mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); + if (IS_ERR(s)) { + err = PTR_ERR(s); + goto failed; + } + + if (!s->s_root) { + char b[BDEVNAME_SIZE]; + + s_new = true; + + /* New superblock instance created */ + s->s_flags = flags; + s->s_mode = mode; + strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(sd.bdev)); + + err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (err) + goto failed_super; + + s->s_flags |= MS_ACTIVE; + } else if (!sd.cno) { + int busy = false; + + if (nilfs_tree_was_touched(s->s_root)) { + busy = nilfs_try_to_shrink_tree(s->s_root); + if (busy && (flags ^ s->s_flags) & MS_RDONLY) { + printk(KERN_ERR "NILFS: the device already " + "has a %s mount.\n", + (s->s_flags & MS_RDONLY) ? + "read-only" : "read/write"); + err = -EBUSY; + goto failed_super; + } + } + if (!busy) { + /* + * Try remount to setup mount states if the current + * tree is not mounted and only snapshots use this sb. 
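+ * (nilfs_remount() switches MS_RDONLY and starts or stops the log + * writer as needed.)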
+ */ + err = nilfs_remount(s, &flags, data); + if (err) + goto failed_super; + } + } + + if (sd.cno) { + err = nilfs_attach_snapshot(s, sd.cno, &root_dentry); + if (err) + goto failed_super; + } else { + root_dentry = dget(s->s_root); + } + + if (!s_new) + blkdev_put(sd.bdev, mode); + + return root_dentry; + + failed_super: + deactivate_locked_super(s); + + failed: + if (!s_new) + blkdev_put(sd.bdev, mode); + return ERR_PTR(err); +} + +struct file_system_type nilfs_fs_type = { + .owner = THIS_MODULE, + .name = "nilfs2", + .mount = nilfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void nilfs_inode_init_once(void *obj) +{ + struct nilfs_inode_info *ii = obj; + + INIT_LIST_HEAD(&ii->i_dirty); +#ifdef CONFIG_NILFS_XATTR + init_rwsem(&ii->xattr_sem); +#endif + address_space_init_once(&ii->i_btnode_cache); + ii->i_bmap = &ii->i_bmap_data; + inode_init_once(&ii->vfs_inode); +} + +static void nilfs_segbuf_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_segment_buffer)); +} + +static void nilfs_destroy_cachep(void) +{ + if (nilfs_inode_cachep) + kmem_cache_destroy(nilfs_inode_cachep); + if (nilfs_transaction_cachep) + kmem_cache_destroy(nilfs_transaction_cachep); + if (nilfs_segbuf_cachep) + kmem_cache_destroy(nilfs_segbuf_cachep); + if (nilfs_btree_path_cache) + kmem_cache_destroy(nilfs_btree_path_cache); +} + +static int __init nilfs_init_cachep(void) +{ + nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", + sizeof(struct nilfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + if (!nilfs_inode_cachep) + goto fail; + + nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache", + sizeof(struct nilfs_transaction_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!nilfs_transaction_cachep) + goto fail; + + nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache", + sizeof(struct nilfs_segment_buffer), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once); + if (!nilfs_segbuf_cachep) + goto fail; + + nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache", + sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX, + 0, 0, NULL); + if (!nilfs_btree_path_cache) + goto fail; + + return 0; + +fail: + nilfs_destroy_cachep(); + return -ENOMEM; +} + +static int __init init_nilfs_fs(void) +{ + int err; + + err = nilfs_init_cachep(); + if (err) + goto fail; + + err = register_filesystem(&nilfs_fs_type); + if (err) + goto free_cachep; + + printk(KERN_INFO "NILFS version 2 loaded\n"); + return 0; + +free_cachep: + nilfs_destroy_cachep(); +fail: + return err; +} + +static void __exit exit_nilfs_fs(void) +{ + nilfs_destroy_cachep(); + unregister_filesystem(&nilfs_fs_type); +} + +module_init(init_nilfs_fs) +module_exit(exit_nilfs_fs) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c new file mode 100644 index 00000000..35a89708 --- /dev/null +++ b/fs/nilfs2/the_nilfs.c @@ -0,0 +1,784 @@ +/* + * the_nilfs.c - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#include +#include +#include +#include +#include +#include +#include "nilfs.h" +#include "segment.h" +#include "alloc.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" +#include "segbuf.h" + + +static int nilfs_valid_sb(struct nilfs_super_block *sbp); + +void nilfs_set_last_segment(struct the_nilfs *nilfs, + sector_t start_blocknr, u64 seq, __u64 cno) +{ + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_last_pseg = start_blocknr; + nilfs->ns_last_seq = seq; + nilfs->ns_last_cno = cno; + + if (!nilfs_sb_dirty(nilfs)) { + if (nilfs->ns_prev_seq == nilfs->ns_last_seq) + goto stay_cursor; + + set_nilfs_sb_dirty(nilfs); + } + nilfs->ns_prev_seq = nilfs->ns_last_seq; + + stay_cursor: + spin_unlock(&nilfs->ns_last_segment_lock); +} + +/** + * alloc_nilfs - allocate a nilfs object + * @bdev: block device to which the_nilfs is related + * + * Return Value: On success, pointer to the_nilfs is returned. + * On error, NULL is returned. + */ +struct the_nilfs *alloc_nilfs(struct block_device *bdev) +{ + struct the_nilfs *nilfs; + + nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL); + if (!nilfs) + return NULL; + + nilfs->ns_bdev = bdev; + atomic_set(&nilfs->ns_ndirtyblks, 0); + init_rwsem(&nilfs->ns_sem); + INIT_LIST_HEAD(&nilfs->ns_dirty_files); + INIT_LIST_HEAD(&nilfs->ns_gc_inodes); + spin_lock_init(&nilfs->ns_inode_lock); + spin_lock_init(&nilfs->ns_next_gen_lock); + spin_lock_init(&nilfs->ns_last_segment_lock); + nilfs->ns_cptree = RB_ROOT; + spin_lock_init(&nilfs->ns_cptree_lock); + init_rwsem(&nilfs->ns_segctor_sem); + + return nilfs; +} + +/** + * destroy_nilfs - destroy nilfs object + * @nilfs: nilfs object to be released + */ +void destroy_nilfs(struct the_nilfs *nilfs) +{ + might_sleep(); + if (nilfs_init(nilfs)) { + brelse(nilfs->ns_sbh[0]); + brelse(nilfs->ns_sbh[1]); + } + kfree(nilfs); +} + +static int nilfs_load_super_root(struct the_nilfs *nilfs, + struct super_block *sb, sector_t sr_block) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_inode *rawi; + unsigned dat_entry_size, segment_usage_size, checkpoint_size; + unsigned inode_size; + int err; + + err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); + if (unlikely(err)) + return err; + + down_read(&nilfs->ns_sem); + dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); + checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); + segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); + up_read(&nilfs->ns_sem); + + inode_size = nilfs->ns_inode_size; + + rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size); + err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat); + if (err) + goto failed; + + rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size); + err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile); + if (err) + goto failed_dat; + + rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size); + err = nilfs_sufile_read(sb, segment_usage_size, rawi, + &nilfs->ns_sufile); + if (err) + goto failed_cpfile; + + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); + + failed: + brelse(bh_sr); + 
return err; + + failed_cpfile: + iput(nilfs->ns_cpfile); + + failed_dat: + iput(nilfs->ns_dat); + goto failed; +} + +static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri) +{ + memset(ri, 0, sizeof(*ri)); + INIT_LIST_HEAD(&ri->ri_used_segments); +} + +static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) +{ + nilfs_dispose_segment_list(&ri->ri_used_segments); +} + +/** + * nilfs_store_log_cursor - load log cursor from a super block + * @nilfs: nilfs object + * @sbp: buffer storing super block to be read + * + * nilfs_store_log_cursor() reads the last position of the log + * containing a super root from a given super block, and initializes + * relevant information on the nilfs object preparatory for log + * scanning and recovery. + */ +static int nilfs_store_log_cursor(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + int ret = 0; + + nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); + nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); + nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); + + nilfs->ns_prev_seq = nilfs->ns_last_seq; + nilfs->ns_seg_seq = nilfs->ns_last_seq; + nilfs->ns_segnum = + nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); + nilfs->ns_cno = nilfs->ns_last_cno + 1; + if (nilfs->ns_segnum >= nilfs->ns_nsegments) { + printk(KERN_ERR "NILFS invalid last segment number.\n"); + ret = -EINVAL; + } + return ret; +} + +/** + * load_nilfs - load and recover the nilfs + * @nilfs: the_nilfs structure to be loaded + * @sb: super block instance used to recover past segment + * + * load_nilfs() searches and loads the latest super root, + * attaches the last segment, and does recovery if needed. + * The caller must call this exclusively for simultaneous mounts. + */ +int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) +{ + struct nilfs_recovery_info ri; + unsigned int s_flags = sb->s_flags; + int really_read_only = bdev_read_only(nilfs->ns_bdev); + int valid_fs = nilfs_valid_fs(nilfs); + int err; + + if (!valid_fs) { + printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "NILFS: INFO: recovery " + "required for readonly filesystem.\n"); + printk(KERN_INFO "NILFS: write access will " + "be enabled during recovery.\n"); + } + } + + nilfs_init_recovery_info(&ri); + + err = nilfs_search_super_root(nilfs, &ri); + if (unlikely(err)) { + struct nilfs_super_block **sbp = nilfs->ns_sbp; + int blocksize; + + if (err != -EINVAL) + goto scan_error; + + if (!nilfs_valid_sb(sbp[1])) { + printk(KERN_WARNING + "NILFS warning: unable to fall back to spare " + "super block\n"); + goto scan_error; + } + printk(KERN_INFO + "NILFS: try rollback from an earlier position\n"); + + /* + * restore super block with its spare and reconfigure + * relevant states of the nilfs object.
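+ * The spare super block sits near the end of the device (see + * NILFS_SB2_OFFSET_BYTES) and may point to an older, but consistent, + * log position to roll back to.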
+ */ + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed); + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); + + /* verify consistency between two super blocks */ + blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + if (blocksize != nilfs->ns_blocksize) { + printk(KERN_WARNING + "NILFS warning: blocksize differs between " + "two super blocks (%d != %d)\n", + blocksize, nilfs->ns_blocksize); + goto scan_error; + } + + err = nilfs_store_log_cursor(nilfs, sbp[0]); + if (err) + goto scan_error; + + /* drop clean flag to allow roll-forward and recovery */ + nilfs->ns_mount_state &= ~NILFS_VALID_FS; + valid_fs = 0; + + err = nilfs_search_super_root(nilfs, &ri); + if (err) + goto scan_error; + } + + err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error loading super root.\n"); + goto failed; + } + + if (valid_fs) + goto skip_recovery; + + if (s_flags & MS_RDONLY) { + __u64 features; + + if (nilfs_test_opt(nilfs, NORECOVERY)) { + printk(KERN_INFO "NILFS: norecovery option specified. " + "skipping roll-forward recovery\n"); + goto skip_recovery; + } + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't proceed with " + "recovery because of unsupported optional " + "features (%llx)\n", + (unsigned long long)features); + err = -EROFS; + goto failed_unload; + } + if (really_read_only) { + printk(KERN_ERR "NILFS: write access " + "unavailable, cannot proceed.\n"); + err = -EROFS; + goto failed_unload; + } + sb->s_flags &= ~MS_RDONLY; + } else if (nilfs_test_opt(nilfs, NORECOVERY)) { + printk(KERN_ERR "NILFS: recovery cancelled because norecovery " + "option was specified for a read/write mount\n"); + err = -EINVAL; + goto failed_unload; + } + + err = nilfs_salvage_orphan_logs(nilfs, sb, &ri); + if (err) + goto failed_unload; + + down_write(&nilfs->ns_sem); + nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ + err = nilfs_cleanup_super(sb); + up_write(&nilfs->ns_sem); + + if (err) { + printk(KERN_ERR "NILFS: failed to update super block. 
" + "recovery unfinished.\n"); + goto failed_unload; + } + printk(KERN_INFO "NILFS: recovery complete.\n"); + + skip_recovery: + nilfs_clear_recovery_info(&ri); + sb->s_flags = s_flags; + return 0; + + scan_error: + printk(KERN_ERR "NILFS: error searching super root.\n"); + goto failed; + + failed_unload: + iput(nilfs->ns_cpfile); + iput(nilfs->ns_sufile); + iput(nilfs->ns_dat); + + failed: + nilfs_clear_recovery_info(&ri); + sb->s_flags = s_flags; + return err; +} + +static unsigned long long nilfs_max_size(unsigned int blkbits) +{ + unsigned int max_bits; + unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */ + + max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */ + if (max_bits < 64) + res = min_t(unsigned long long, res, (1ULL << max_bits) - 1); + return res; +} + +/** + * nilfs_nrsvsegs - calculate the number of reserved segments + * @nilfs: nilfs object + * @nsegs: total number of segments + */ +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) +{ + return max_t(unsigned long, NILFS_MIN_NRSVSEGS, + DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage, + 100)); +} + +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) +{ + nilfs->ns_nsegments = nsegs; + nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs); +} + +static int nilfs_store_disk_layout(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) { + printk(KERN_ERR "NILFS: unsupported revision " + "(superblock rev.=%d.%d, current rev.=%d.%d). " + "Please check the version of mkfs.nilfs.\n", + le32_to_cpu(sbp->s_rev_level), + le16_to_cpu(sbp->s_minor_rev_level), + NILFS_CURRENT_REV, NILFS_MINOR_REV); + return -EINVAL; + } + nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); + if (nilfs->ns_sbsize > BLOCK_SIZE) + return -EINVAL; + + nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + + nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { + printk(KERN_ERR "NILFS: too short segment.\n"); + return -EINVAL; + } + + nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); + nilfs->ns_r_segments_percentage = + le32_to_cpu(sbp->s_r_segments_percentage); + nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments)); + nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); + return 0; +} + +static int nilfs_valid_sb(struct nilfs_super_block *sbp) +{ + static unsigned char sum[4]; + const int sumoff = offsetof(struct nilfs_super_block, s_sum); + size_t bytes; + u32 crc; + + if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) + return 0; + bytes = le16_to_cpu(sbp->s_bytes); + if (bytes > BLOCK_SIZE) + return 0; + crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, + sumoff); + crc = crc32_le(crc, sum, 4); + crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, + bytes - sumoff - 4); + return crc == le32_to_cpu(sbp->s_sum); +} + +static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +{ + return offset < ((le64_to_cpu(sbp->s_nsegments) * + le32_to_cpu(sbp->s_blocks_per_segment)) << + (le32_to_cpu(sbp->s_log_block_size) + 10)); +} + +static void nilfs_release_super_block(struct the_nilfs *nilfs) +{ + int i; + + for (i = 0; i < 2; i++) { + if (nilfs->ns_sbp[i]) { + brelse(nilfs->ns_sbh[i]); + nilfs->ns_sbh[i] = NULL; + nilfs->ns_sbp[i] = NULL; + } + } +} + +void nilfs_fall_back_super_block(struct the_nilfs *nilfs) +{ + 
brelse(nilfs->ns_sbh[0]); + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = NULL; + nilfs->ns_sbp[1] = NULL; +} + +void nilfs_swap_super_block(struct the_nilfs *nilfs) +{ + struct buffer_head *tsbh = nilfs->ns_sbh[0]; + struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = tsbh; + nilfs->ns_sbp[1] = tsbp; +} + +static int nilfs_load_super_block(struct the_nilfs *nilfs, + struct super_block *sb, int blocksize, + struct nilfs_super_block **sbpp) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct buffer_head **sbh = nilfs->ns_sbh; + u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + int valid[2], swp = 0; + + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, + &sbh[0]); + sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); + + if (!sbp[0]) { + if (!sbp[1]) { + printk(KERN_ERR "NILFS: unable to read superblock\n"); + return -EIO; + } + printk(KERN_WARNING + "NILFS warning: unable to read primary superblock " + "(blocksize = %d)\n", blocksize); + } else if (!sbp[1]) { + printk(KERN_WARNING + "NILFS warning: unable to read secondary superblock " + "(blocksize = %d)\n", blocksize); + } + + /* + * Compare two super blocks and set 1 in swp if the secondary + * super block is valid and newer. Otherwise, set 0 in swp. + */ + valid[0] = nilfs_valid_sb(sbp[0]); + valid[1] = nilfs_valid_sb(sbp[1]); + swp = valid[1] && (!valid[0] || + le64_to_cpu(sbp[1]->s_last_cno) > + le64_to_cpu(sbp[0]->s_last_cno)); + + if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { + brelse(sbh[1]); + sbh[1] = NULL; + sbp[1] = NULL; + valid[1] = 0; + swp = 0; + } + if (!valid[swp]) { + nilfs_release_super_block(nilfs); + printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", + sb->s_id); + return -EINVAL; + } + + if (!valid[!swp]) + printk(KERN_WARNING "NILFS warning: broken superblock. " + "using spare superblock (blocksize = %d).\n", blocksize); + if (swp) + nilfs_swap_super_block(nilfs); + + nilfs->ns_sbwcount = 0; + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); + nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + *sbpp = sbp[0]; + return 0; +} + +/** + * init_nilfs - initialize a NILFS instance. + * @nilfs: the_nilfs structure + * @sb: super block + * @data: mount options + * + * init_nilfs() performs common initialization per block device (e.g. + * reading the super block, getting disk layout information, initializing + * shared fields in the_nilfs). + * + * Return Value: On success, 0 is returned. On error, a negative error + * code is returned. 
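+ * + * Note: the super blocks may be read twice if the block size recorded + * on disk differs from the block size used for the initial read.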
+ */ +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) +{ + struct nilfs_super_block *sbp; + int blocksize; + int err; + + down_write(&nilfs->ns_sem); + + blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "NILFS: unable to set blocksize\n"); + err = -EINVAL; + goto out; + } + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto failed_sbh; + + err = nilfs_check_feature_compatibility(sb, sbp); + if (err) + goto failed_sbh; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (blocksize < NILFS_MIN_BLOCK_SIZE || + blocksize > NILFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "filesystem blocksize %d\n", blocksize); + err = -EINVAL; + goto failed_sbh; + } + if (sb->s_blocksize != blocksize) { + int hw_blocksize = bdev_logical_block_size(sb->s_bdev); + + if (blocksize < hw_blocksize) { + printk(KERN_ERR + "NILFS: blocksize %d too small for device " + "(sector-size = %d).\n", + blocksize, hw_blocksize); + err = -EINVAL; + goto failed_sbh; + } + nilfs_release_super_block(nilfs); + sb_set_blocksize(sb, blocksize); + + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + /* not failed_sbh; sbh is released automatically + when reloading fails. */ + } + nilfs->ns_blocksize_bits = sb->s_blocksize_bits; + nilfs->ns_blocksize = blocksize; + + get_random_bytes(&nilfs->ns_next_generation, + sizeof(nilfs->ns_next_generation)); + + err = nilfs_store_disk_layout(nilfs, sbp); + if (err) + goto failed_sbh; + + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + + nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); + + err = nilfs_store_log_cursor(nilfs, sbp); + if (err) + goto failed_sbh; + + set_nilfs_init(nilfs); + err = 0; + out: + up_write(&nilfs->ns_sem); + return err; + + failed_sbh: + nilfs_release_super_block(nilfs); + goto out; +} + +int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, + size_t nsegs) +{ + sector_t seg_start, seg_end; + sector_t start = 0, nblocks = 0; + unsigned int sects_per_block; + __u64 *sn; + int ret = 0; + + sects_per_block = (1 << nilfs->ns_blocksize_bits) / + bdev_logical_block_size(nilfs->ns_bdev); + for (sn = segnump; sn < segnump + nsegs; sn++) { + nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end); + + if (!nblocks) { + start = seg_start; + nblocks = seg_end - seg_start + 1; + } else if (start + nblocks == seg_start) { + nblocks += seg_end - seg_start + 1; + } else { + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (ret < 0) + return ret; + nblocks = 0; + } + } + if (nblocks) + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + return ret; +} + +int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) +{ + unsigned long ncleansegs; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; + return 0; +} + +int nilfs_near_disk_full(struct the_nilfs *nilfs) +{ + unsigned long ncleansegs, nincsegs; + + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / + nilfs->ns_blocks_per_segment + 1; + + return ncleansegs <= 
nilfs->ns_nrsvsegs + nincsegs; +} + +struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) +{ + struct rb_node *n; + struct nilfs_root *root; + + spin_lock(&nilfs->ns_cptree_lock); + n = nilfs->ns_cptree.rb_node; + while (n) { + root = rb_entry(n, struct nilfs_root, rb_node); + + if (cno < root->cno) { + n = n->rb_left; + } else if (cno > root->cno) { + n = n->rb_right; + } else { + atomic_inc(&root->count); + spin_unlock(&nilfs->ns_cptree_lock); + return root; + } + } + spin_unlock(&nilfs->ns_cptree_lock); + + return NULL; +} + +struct nilfs_root * +nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) +{ + struct rb_node **p, *parent; + struct nilfs_root *root, *new; + + root = nilfs_lookup_root(nilfs, cno); + if (root) + return root; + + new = kmalloc(sizeof(*root), GFP_KERNEL); + if (!new) + return NULL; + + spin_lock(&nilfs->ns_cptree_lock); + + p = &nilfs->ns_cptree.rb_node; + parent = NULL; + + while (*p) { + parent = *p; + root = rb_entry(parent, struct nilfs_root, rb_node); + + if (cno < root->cno) { + p = &(*p)->rb_left; + } else if (cno > root->cno) { + p = &(*p)->rb_right; + } else { + atomic_inc(&root->count); + spin_unlock(&nilfs->ns_cptree_lock); + kfree(new); + return root; + } + } + + new->cno = cno; + new->ifile = NULL; + new->nilfs = nilfs; + atomic_set(&new->count, 1); + atomic_set(&new->inodes_count, 0); + atomic_set(&new->blocks_count, 0); + + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, &nilfs->ns_cptree); + + spin_unlock(&nilfs->ns_cptree_lock); + + return new; +} + +void nilfs_put_root(struct nilfs_root *root) +{ + if (atomic_dec_and_test(&root->count)) { + struct the_nilfs *nilfs = root->nilfs; + + spin_lock(&nilfs->ns_cptree_lock); + rb_erase(&root->rb_node, &nilfs->ns_cptree); + spin_unlock(&nilfs->ns_cptree_lock); + if (root->ifile) + iput(root->ifile); + + kfree(root); + } +} diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h new file mode 100644 index 00000000..9992b113 --- /dev/null +++ b/fs/nilfs2/the_nilfs.h @@ -0,0 +1,356 @@ +/* + * the_nilfs.h - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi + * + */ + +#ifndef _THE_NILFS_H +#define _THE_NILFS_H + +#include +#include +#include +#include +#include +#include +#include + +struct nilfs_sc_info; + +/* the_nilfs struct */ +enum { + THE_NILFS_INIT = 0, /* Information from super_block is set */ + THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ + THE_NILFS_GC_RUNNING, /* gc process is running */ + THE_NILFS_SB_DIRTY, /* super block is dirty */ +}; + +/** + * struct the_nilfs - struct to supervise multiple nilfs mount points + * @ns_flags: flags + * @ns_bdev: block device + * @ns_sem: semaphore for shared states + * @ns_sbh: buffer heads of on-disk super blocks + * @ns_sbp: pointers to super block data + * @ns_sbwtime: previous write time of super block + * @ns_sbwcount: write count of super block + * @ns_sbsize: size of valid data in super block + * @ns_seg_seq: segment sequence counter + * @ns_segnum: index number of the latest full segment. + * @ns_nextnum: index number of the full segment index to be used next + * @ns_pseg_offset: offset of next partial segment in the current full segment + * @ns_cno: next checkpoint number + * @ns_ctime: write time of the last segment + * @ns_nongc_ctime: write time of the last segment not for cleaner operation + * @ns_ndirtyblks: Number of dirty data blocks + * @ns_last_segment_lock: lock protecting fields for the latest segment + * @ns_last_pseg: start block number of the latest segment + * @ns_last_seq: sequence value of the latest segment + * @ns_last_cno: checkpoint number of the latest segment + * @ns_prot_seq: least sequence number of segments which must not be reclaimed + * @ns_prev_seq: base sequence number used to decide if advance log cursor + * @ns_writer: log writer + * @ns_segctor_sem: semaphore protecting log write + * @ns_dat: DAT file inode + * @ns_cpfile: checkpoint file inode + * @ns_sufile: segusage file inode + * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root) + * @ns_cptree_lock: lock protecting @ns_cptree + * @ns_dirty_files: list of dirty files + * @ns_inode_lock: lock protecting @ns_dirty_files + * @ns_gc_inodes: dummy inodes to keep live blocks + * @ns_next_generation: next generation number for inodes + * @ns_next_gen_lock: lock protecting @ns_next_generation + * @ns_mount_opt: mount options + * @ns_resuid: uid for reserved blocks + * @ns_resgid: gid for reserved blocks + * @ns_interval: checkpoint creation interval + * @ns_watermark: watermark for the number of dirty buffers + * @ns_blocksize_bits: bit length of block size + * @ns_blocksize: block size + * @ns_nsegments: number of segments in filesystem + * @ns_blocks_per_segment: number of blocks per segment + * @ns_r_segments_percentage: reserved segments percentage + * @ns_nrsvsegs: number of reserved segments + * @ns_first_data_block: block number of first data block + * @ns_inode_size: size of on-disk inode + * @ns_first_ino: first not-special inode number + * @ns_crc_seed: seed value of CRC32 calculation + */ +struct the_nilfs { + unsigned long ns_flags; + + struct block_device *ns_bdev; + struct rw_semaphore ns_sem; + + /* + * used for + * - loading the latest checkpoint exclusively. + * - allocating a new full segment. + * - protecting s_dirt in the super_block struct + * (see nilfs_write_super) and the following fields. 
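+ * + * ns_sbh[1]/ns_sbp[1] hold the spare super block; they may be NULL + * when the spare copy is unreadable or invalid.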
+ */ + struct buffer_head *ns_sbh[2]; + struct nilfs_super_block *ns_sbp[2]; + time_t ns_sbwtime; + unsigned ns_sbwcount; + unsigned ns_sbsize; + unsigned ns_mount_state; + + /* + * Following fields are dedicated to a writable FS-instance. + * Except for the period seeking checkpoint, code outside the segment + * constructor must lock a segment semaphore while accessing these + * fields. + * The writable FS-instance is sole during a lifetime of the_nilfs. + */ + u64 ns_seg_seq; + __u64 ns_segnum; + __u64 ns_nextnum; + unsigned long ns_pseg_offset; + __u64 ns_cno; + time_t ns_ctime; + time_t ns_nongc_ctime; + atomic_t ns_ndirtyblks; + + /* + * The following fields hold information on the latest partial segment + * written to disk with a super root. These fields are protected by + * ns_last_segment_lock. + */ + spinlock_t ns_last_segment_lock; + sector_t ns_last_pseg; + u64 ns_last_seq; + __u64 ns_last_cno; + u64 ns_prot_seq; + u64 ns_prev_seq; + + struct nilfs_sc_info *ns_writer; + struct rw_semaphore ns_segctor_sem; + + /* + * Following fields are lock free except for the period before + * the_nilfs is initialized. + */ + struct inode *ns_dat; + struct inode *ns_cpfile; + struct inode *ns_sufile; + + /* Checkpoint tree */ + struct rb_root ns_cptree; + spinlock_t ns_cptree_lock; + + /* Dirty inode list */ + struct list_head ns_dirty_files; + spinlock_t ns_inode_lock; + + /* GC inode list */ + struct list_head ns_gc_inodes; + + /* Inode allocator */ + u32 ns_next_generation; + spinlock_t ns_next_gen_lock; + + /* Mount options */ + unsigned long ns_mount_opt; + + uid_t ns_resuid; + gid_t ns_resgid; + unsigned long ns_interval; + unsigned long ns_watermark; + + /* Disk layout information (static) */ + unsigned int ns_blocksize_bits; + unsigned int ns_blocksize; + unsigned long ns_nsegments; + unsigned long ns_blocks_per_segment; + unsigned long ns_r_segments_percentage; + unsigned long ns_nrsvsegs; + unsigned long ns_first_data_block; + int ns_inode_size; + int ns_first_ino; + u32 ns_crc_seed; +}; + +#define THE_NILFS_FNS(bit, name) \ +static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline int nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} + +THE_NILFS_FNS(INIT, init) +THE_NILFS_FNS(DISCONTINUED, discontinued) +THE_NILFS_FNS(GC_RUNNING, gc_running) +THE_NILFS_FNS(SB_DIRTY, sb_dirty) + +/* + * Mount option operations + */ +#define nilfs_clear_opt(nilfs, opt) \ + do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0) +#define nilfs_set_opt(nilfs, opt) \ + do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0) +#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt) +#define nilfs_write_opt(nilfs, mask, opt) \ + do { (nilfs)->ns_mount_opt = \ + (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \ + NILFS_MOUNT_##opt); \ + } while (0) + +/** + * struct nilfs_root - nilfs root object + * @cno: checkpoint number + * @rb_node: red-black tree node + * @count: refcount of this structure + * @nilfs: nilfs object + * @ifile: inode file + * @root: root inode + * @inodes_count: number of inodes + * @blocks_count: number of blocks (Reserved) + */ +struct nilfs_root { + __u64 cno; + struct rb_node rb_node; + + atomic_t count; + struct the_nilfs *nilfs; + struct inode *ifile; + + atomic_t inodes_count; + 
atomic_t blocks_count; +}; + +/* Special checkpoint number */ +#define NILFS_CPTREE_CURRENT_CNO 0 + +/* Minimum interval of periodical update of superblocks (in seconds) */ +#define NILFS_SB_FREQ 10 + +static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ; +} + +static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) +{ + int flip_bits = nilfs->ns_sbwcount & 0x0FL; + return (flip_bits != 0x08 && flip_bits != 0x0F); +} + +void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); +struct the_nilfs *alloc_nilfs(struct block_device *bdev); +void destroy_nilfs(struct the_nilfs *nilfs); +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); +int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs); +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs); +int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); +int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); +struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); +struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs, + __u64 cno); +void nilfs_put_root(struct nilfs_root *root); +int nilfs_near_disk_full(struct the_nilfs *); +void nilfs_fall_back_super_block(struct the_nilfs *); +void nilfs_swap_super_block(struct the_nilfs *); + + +static inline void nilfs_get_root(struct nilfs_root *root) +{ + atomic_inc(&root->count); +} + +static inline int nilfs_valid_fs(struct the_nilfs *nilfs) +{ + unsigned valid_fs; + + down_read(&nilfs->ns_sem); + valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); + up_read(&nilfs->ns_sem); + return valid_fs; +} + +static inline void +nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, + sector_t *seg_start, sector_t *seg_end) +{ + *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum; + *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1; + if (segnum == 0) + *seg_start = nilfs->ns_first_data_block; +} + +static inline sector_t +nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum) +{ + return (segnum == 0) ? 
nilfs->ns_first_data_block : + (sector_t)nilfs->ns_blocks_per_segment * segnum; +} + +static inline __u64 +nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr) +{ + sector_t segnum = blocknr; + + sector_div(segnum, nilfs->ns_blocks_per_segment); + return segnum; +} + +static inline void +nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start, + sector_t seg_end) +{ + /* terminate the current full segment (used in case of I/O-error) */ + nilfs->ns_pseg_offset = seg_end - seg_start + 1; +} + +static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs) +{ + /* move forward with a full segment */ + nilfs->ns_segnum = nilfs->ns_nextnum; + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq++; +} + +static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs) +{ + __u64 cno; + + spin_lock(&nilfs->ns_last_segment_lock); + cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + return cno; +} + +static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n) +{ + return n == nilfs->ns_segnum || n == nilfs->ns_nextnum; +} + +#endif /* _THE_NILFS_H */ diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig new file mode 100644 index 00000000..a39edc41 --- /dev/null +++ b/fs/nls/Kconfig @@ -0,0 +1,464 @@ +# +# Native language support configuration +# + +menuconfig NLS + tristate "Native language support" + ---help--- + The base Native Language Support. A number of filesystems + depend on it (e.g. FAT, JOLIET, NT, BEOS filesystems), as well + as the ability of some filesystems to use native languages + (NCP, SMB). + + If unsure, say Y. + + To compile this code as a module, choose M here: the module + will be called nls_base. + +if NLS + +config NLS_DEFAULT + string "Default NLS Option" + default "iso8859-1" + ---help--- + The default NLS used when mounting file system. Note, that this is + the NLS used by your console, not the NLS used by a specific file + system (if different) to store data (filenames) on a disk. + Currently, the valid values are: + big5, cp437, cp737, cp775, cp850, cp852, cp855, cp857, cp860, cp861, + cp862, cp863, cp864, cp865, cp866, cp869, cp874, cp932, cp936, + cp949, cp950, cp1251, cp1255, euc-jp, euc-kr, gb2312, iso8859-1, + iso8859-2, iso8859-3, iso8859-4, iso8859-5, iso8859-6, iso8859-7, + iso8859-8, iso8859-9, iso8859-13, iso8859-14, iso8859-15, + koi8-r, koi8-ru, koi8-u, sjis, tis-620, utf8. + If you specify a wrong value, it will use the built-in NLS; + compatible with iso8859-1. + + If unsure, specify it as "iso8859-1". + +config NLS_CODEPAGE_437 + tristate "Codepage 437 (United States, Canada)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored + in so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used in + the United States and parts of Canada. This is recommended. + +config NLS_CODEPAGE_737 + tristate "Codepage 737 (Greek)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored + in so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. 
This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used for + Greek. If unsure, say N. + +config NLS_CODEPAGE_775 + tristate "Codepage 775 (Baltic Rim)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored + in so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used + for the Baltic Rim Languages (Latvian and Lithuanian). If unsure, + say N. + +config NLS_CODEPAGE_850 + tristate "Codepage 850 (Europe)" + ---help--- + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage that is used for + much of Europe -- United Kingdom, Germany, Spain, Italy, and [add + more countries here]. It has some characters useful to many European + languages that are not part of the US codepage 437. + + If unsure, say Y. + +config NLS_CODEPAGE_852 + tristate "Codepage 852 (Central/Eastern Europe)" + ---help--- + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the Latin 2 codepage used by DOS + for much of Central and Eastern Europe. It has all the required + characters for these languages: Albanian, Croatian, Czech, English, + Finnish, Hungarian, Irish, German, Polish, Romanian, Serbian (Latin + transcription), Slovak, Slovenian, and Sorbian. + +config NLS_CODEPAGE_855 + tristate "Codepage 855 (Cyrillic)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Cyrillic. + +config NLS_CODEPAGE_857 + tristate "Codepage 857 (Turkish)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Turkish. 
+ +config NLS_CODEPAGE_860 + tristate "Codepage 860 (Portuguese)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Portuguese. + +config NLS_CODEPAGE_861 + tristate "Codepage 861 (Icelandic)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Icelandic. + +config NLS_CODEPAGE_862 + tristate "Codepage 862 (Hebrew)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Hebrew. + +config NLS_CODEPAGE_863 + tristate "Codepage 863 (Canadian French)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Canadian + French. + +config NLS_CODEPAGE_864 + tristate "Codepage 864 (Arabic)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Arabic. + +config NLS_CODEPAGE_865 + tristate "Codepage 865 (Norwegian, Danish)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for the Nordic + European countries. + +config NLS_CODEPAGE_866 + tristate "Codepage 866 (Cyrillic/Russian)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. 
You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for + Cyrillic/Russian. + +config NLS_CODEPAGE_869 + tristate "Codepage 869 (Greek)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Greek. + +config NLS_CODEPAGE_936 + tristate "Simplified Chinese charset (CP936, GB2312)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Simplified + Chinese(GBK). + +config NLS_CODEPAGE_950 + tristate "Traditional Chinese charset (Big5)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Traditional + Chinese(Big5). + +config NLS_CODEPAGE_932 + tristate "Japanese charsets (Shift-JIS, EUC-JP)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Shift-JIS + or EUC-JP. To use EUC-JP, you can use 'euc-jp' as mount option or + NLS Default value during kernel configuration, instead of 'cp932'. + +config NLS_CODEPAGE_949 + tristate "Korean charset (CP949, EUC-KR)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for UHC. + +config NLS_CODEPAGE_874 + tristate "Thai charset (CP874, TIS-620)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. 
You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Thai. + +config NLS_ISO8859_8 + tristate "Hebrew charsets (ISO-8859-8, CP1255)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-8, the Hebrew + character set. + +config NLS_CODEPAGE_1250 + tristate "Windows CP1250 (Slavic/Central European Languages)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CDROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Windows CP-1250 + character set, which works for most Latin-written Slavic and Central + European languages: Czech, German, Hungarian, Polish, Rumanian, Croatian, + Slovak, Slovene. + +config NLS_CODEPAGE_1251 + tristate "Windows CP1251 (Bulgarian, Belarusian)" + help + The Microsoft FAT file system family can deal with filenames in + native language character sets. These character sets are stored in + so-called DOS codepages. You need to include the appropriate + codepage if you want to be able to read/write these filenames on + DOS/Windows partitions correctly. This does apply to the filenames + only, not to the file contents. You can include several codepages; + say Y here if you want to include the DOS codepage for Russian and + Bulgarian and Belarusian. + +config NLS_ASCII + tristate "ASCII (United States)" + help + An ASCII NLS module is needed if you want to override the + DEFAULT NLS with this very basic charset and don't want any + non-ASCII characters to be translated. + +config NLS_ISO8859_1 + tristate "NLS ISO 8859-1 (Latin 1; Western European Languages)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 1 character + set, which covers most West European languages such as Albanian, + Catalan, Danish, Dutch, English, Faeroese, Finnish, French, German, + Galician, Irish, Icelandic, Italian, Norwegian, Portuguese, Spanish, + and Swedish. It is also the default for the US. If unsure, say Y. + +config NLS_ISO8859_2 + tristate "NLS ISO 8859-2 (Latin 2; Slavic/Central European Languages)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 2 character + set, which works for most Latin-written Slavic and Central European + languages: Czech, German, Hungarian, Polish, Rumanian, Croatian, + Slovak, Slovene. + +config NLS_ISO8859_3 + tristate "NLS ISO 8859-3 (Latin 3; Esperanto, Galician, Maltese, Turkish)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. 
Say Y here for the Latin 3 character + set, which is popular with authors of Esperanto, Galician, Maltese, + and Turkish. + +config NLS_ISO8859_4 + tristate "NLS ISO 8859-4 (Latin 4; old Baltic charset)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 4 character + set which introduces letters for Estonian, Latvian, and + Lithuanian. It is an incomplete predecessor of Latin 7. + +config NLS_ISO8859_5 + tristate "NLS ISO 8859-5 (Cyrillic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-5, a Cyrillic + character set with which you can type Bulgarian, Belarusian, + Macedonian, Russian, Serbian, and Ukrainian. Note that the charset + KOI8-R is preferred in Russia. + +config NLS_ISO8859_6 + tristate "NLS ISO 8859-6 (Arabic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-6, the Arabic + character set. + +config NLS_ISO8859_7 + tristate "NLS ISO 8859-7 (Modern Greek)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for ISO8859-7, the Modern + Greek character set. + +config NLS_ISO8859_9 + tristate "NLS ISO 8859-9 (Latin 5; Turkish)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 5 character + set, and it replaces the rarely needed Icelandic letters in Latin 1 + with the Turkish ones. Useful in Turkey. + +config NLS_ISO8859_13 + tristate "NLS ISO 8859-13 (Latin 7; Baltic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 7 character + set, which supports modern Baltic languages including Latvian + and Lithuanian. + +config NLS_ISO8859_14 + tristate "NLS ISO 8859-14 (Latin 8; Celtic)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the Latin 8 character + set, which adds the last accented vowels for Welsh (aka Cymraeg) + (and Manx Gaelic) that were missing in Latin 1. + has further information. + +config NLS_ISO8859_15 + tristate "NLS ISO 8859-15 (Latin 9; Western European Languages with Euro)" + ---help--- + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. 
Say Y here for the Latin 9 character + set, which covers most West European languages such as Albanian, + Catalan, Danish, Dutch, English, Estonian, Faeroese, Finnish, + French, German, Galician, Irish, Icelandic, Italian, Norwegian, + Portuguese, Spanish, and Swedish. Latin 9 is an update to + Latin 1 (ISO 8859-1) that removes a handful of rarely used + characters and instead adds support for Estonian, corrects the + support for French and Finnish, and adds the new Euro character. + If unsure, say Y. + +config NLS_KOI8_R + tristate "NLS KOI8-R (Russian)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the preferred Russian + character set. + +config NLS_KOI8_U + tristate "NLS KOI8-U/RU (Ukrainian, Belarusian)" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the preferred Ukrainian + (koi8-u) and Belarusian (koi8-ru) character sets. + +config NLS_UTF8 + tristate "NLS UTF-8" + help + If you want to display filenames with native language characters + from the Microsoft FAT file system family or from JOLIET CD-ROMs + correctly on the screen, you need to include the appropriate + input/output character sets. Say Y here for the UTF-8 encoding of + the Unicode/ISO9646 universal character set. + +endif # NLS diff --git a/fs/nls/Makefile b/fs/nls/Makefile new file mode 100644 index 00000000..f499dd7c --- /dev/null +++ b/fs/nls/Makefile @@ -0,0 +1,44 @@ +# +# Makefile for native language support +# + +obj-$(CONFIG_NLS) += nls_base.o + +obj-$(CONFIG_NLS_CODEPAGE_437) += nls_cp437.o +obj-$(CONFIG_NLS_CODEPAGE_737) += nls_cp737.o +obj-$(CONFIG_NLS_CODEPAGE_775) += nls_cp775.o +obj-$(CONFIG_NLS_CODEPAGE_850) += nls_cp850.o +obj-$(CONFIG_NLS_CODEPAGE_852) += nls_cp852.o +obj-$(CONFIG_NLS_CODEPAGE_855) += nls_cp855.o +obj-$(CONFIG_NLS_CODEPAGE_857) += nls_cp857.o +obj-$(CONFIG_NLS_CODEPAGE_860) += nls_cp860.o +obj-$(CONFIG_NLS_CODEPAGE_861) += nls_cp861.o +obj-$(CONFIG_NLS_CODEPAGE_862) += nls_cp862.o +obj-$(CONFIG_NLS_CODEPAGE_863) += nls_cp863.o +obj-$(CONFIG_NLS_CODEPAGE_864) += nls_cp864.o +obj-$(CONFIG_NLS_CODEPAGE_865) += nls_cp865.o +obj-$(CONFIG_NLS_CODEPAGE_866) += nls_cp866.o +obj-$(CONFIG_NLS_CODEPAGE_869) += nls_cp869.o +obj-$(CONFIG_NLS_CODEPAGE_874) += nls_cp874.o +obj-$(CONFIG_NLS_CODEPAGE_932) += nls_cp932.o nls_euc-jp.o +obj-$(CONFIG_NLS_CODEPAGE_936) += nls_cp936.o +obj-$(CONFIG_NLS_CODEPAGE_949) += nls_cp949.o +obj-$(CONFIG_NLS_CODEPAGE_950) += nls_cp950.o +obj-$(CONFIG_NLS_CODEPAGE_1250) += nls_cp1250.o +obj-$(CONFIG_NLS_CODEPAGE_1251) += nls_cp1251.o +obj-$(CONFIG_NLS_ASCII) += nls_ascii.o +obj-$(CONFIG_NLS_ISO8859_1) += nls_iso8859-1.o +obj-$(CONFIG_NLS_ISO8859_2) += nls_iso8859-2.o +obj-$(CONFIG_NLS_ISO8859_3) += nls_iso8859-3.o +obj-$(CONFIG_NLS_ISO8859_4) += nls_iso8859-4.o +obj-$(CONFIG_NLS_ISO8859_5) += nls_iso8859-5.o +obj-$(CONFIG_NLS_ISO8859_6) += nls_iso8859-6.o +obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o +obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o +obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o +obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o +obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o +obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o +obj-$(CONFIG_NLS_KOI8_R) += 
nls_koi8-r.o +obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o +obj-$(CONFIG_NLS_UTF8) += nls_utf8.o diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c new file mode 100644 index 00000000..7020e940 --- /dev/null +++ b/fs/nls/nls_ascii.c @@ -0,0 +1,167 @@ +/* + * linux/fs/nls/nls_ascii.c + * + * Charset ascii translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 
0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "ascii", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_ascii(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_ascii(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_ascii) +module_exit(exit_nls_ascii) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c new file mode 100644 index 00000000..44a88a9f --- /dev/null +++ b/fs/nls/nls_base.c @@ -0,0 +1,524 @@ +/* + * linux/fs/nls/nls_base.c + * + * Native language support--charsets and unicode translations. + * By Gordon Chaffee 1996, 1997 + * + * Unicode based case conversion 1999 by Wolfram Pienkoss + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct nls_table default_table; +static struct nls_table *tables = &default_table; +static DEFINE_SPINLOCK(nls_lock); + +/* + * Sample implementation from Unicode home page. 
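+ * The utf8_table[] that follows drives both conversion directions: for
+ * each sequence length, cmask/cval match the leading byte, shift is six
+ * bits per continuation byte, and lmask/lval give the largest and the
+ * smallest code point valid at that length (so overlong encodings are
+ * rejected).  Original description: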
+ * http://www.stonehand.com/unicode/standard/fss-utf.html + */ +struct utf8_table { + int cmask; + int cval; + int shift; + long lmask; + long lval; +}; + +static const struct utf8_table utf8_table[] = +{ + {0x80, 0x00, 0*6, 0x7F, 0, /* 1 byte sequence */}, + {0xE0, 0xC0, 1*6, 0x7FF, 0x80, /* 2 byte sequence */}, + {0xF0, 0xE0, 2*6, 0xFFFF, 0x800, /* 3 byte sequence */}, + {0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */}, + {0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */}, + {0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */}, + {0, /* end of table */} +}; + +#define UNICODE_MAX 0x0010ffff +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) +{ + unsigned long l; + int c0, c, nc; + const struct utf8_table *t; + + nc = 0; + c0 = *s; + l = c0; + for (t = utf8_table; t->cmask; t++) { + nc++; + if ((c0 & t->cmask) == t->cval) { + l &= t->lmask; + if (l < t->lval || l > UNICODE_MAX || + (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + *pu = (unicode_t) l; + return nc; + } + if (len <= nc) + return -1; + s++; + c = (*s ^ 0x80) & 0xFF; + if (c & 0xC0) + return -1; + l = (l << 6) | c; + } + return -1; +} +EXPORT_SYMBOL(utf8_to_utf32); + +int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) +{ + unsigned long l; + int c, nc; + const struct utf8_table *t; + + if (!s) + return 0; + + l = u; + if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + + nc = 0; + for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { + nc++; + if (l <= t->lmask) { + c = t->shift; + *s = (u8) (t->cval | (l >> c)); + while (c > 0) { + c -= 6; + s++; + *s = (u8) (0x80 | ((l >> c) & 0x3F)); + } + return nc; + } + } + return -1; +} +EXPORT_SYMBOL(utf32_to_utf8); + +int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) +{ + u16 *op; + int size; + unicode_t u; + + op = pwcs; + while (*s && len > 0) { + if (*s & 0x80) { + size = utf8_to_utf32(s, len, &u); + if (size < 0) + return -EINVAL; + + if (u >= PLANE_SIZE) { + u -= PLANE_SIZE; + *op++ = (wchar_t) (SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS)); + *op++ = (wchar_t) (SURROGATE_PAIR | + SURROGATE_LOW | + (u & SURROGATE_BITS)); + } else { + *op++ = (wchar_t) u; + } + s += size; + len -= size; + } else { + *op++ = *s++; + len--; + } + } + return op - pwcs; +} +EXPORT_SYMBOL(utf8s_to_utf16s); + +static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + return c; + case UTF16_LITTLE_ENDIAN: + return __le16_to_cpu(c); + case UTF16_BIG_ENDIAN: + return __be16_to_cpu(c); + } +} + +int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, + u8 *s, int maxlen) +{ + u8 *op; + int size; + unsigned long u, v; + + op = s; + while (len > 0 && maxlen > 0) { + u = get_utf16(*pwcs, endian); + if (!u) + break; + pwcs++; + len--; + if (u > 0x7f) { + if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { + if (u & SURROGATE_LOW) { + /* Ignore character and move on */ + continue; + } + if (len <= 0) + break; + v = get_utf16(*pwcs, endian); + if ((v & SURROGATE_MASK) != SURROGATE_PAIR || + !(v & SURROGATE_LOW)) { + /* Ignore character and move on */ + continue; + } + u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + + (v & SURROGATE_BITS); + pwcs++; + len--; + } + size = utf32_to_utf8(u, op, maxlen); + if (size == -1) { + /* Ignore character and move on */ + } else { + op += size; + maxlen -= 
size; + } + } else { + *op++ = (u8) u; + maxlen--; + } + } + return op - s; +} +EXPORT_SYMBOL(utf16s_to_utf8s); + +int register_nls(struct nls_table * nls) +{ + struct nls_table ** tmp = &tables; + + if (nls->next) + return -EBUSY; + + spin_lock(&nls_lock); + while (*tmp) { + if (nls == *tmp) { + spin_unlock(&nls_lock); + return -EBUSY; + } + tmp = &(*tmp)->next; + } + nls->next = tables; + tables = nls; + spin_unlock(&nls_lock); + return 0; +} + +int unregister_nls(struct nls_table * nls) +{ + struct nls_table ** tmp = &tables; + + spin_lock(&nls_lock); + while (*tmp) { + if (nls == *tmp) { + *tmp = nls->next; + spin_unlock(&nls_lock); + return 0; + } + tmp = &(*tmp)->next; + } + spin_unlock(&nls_lock); + return -EINVAL; +} + +static struct nls_table *find_nls(char *charset) +{ + struct nls_table *nls; + spin_lock(&nls_lock); + for (nls = tables; nls; nls = nls->next) { + if (!strcmp(nls->charset, charset)) + break; + if (nls->alias && !strcmp(nls->alias, charset)) + break; + } + if (nls && !try_module_get(nls->owner)) + nls = NULL; + spin_unlock(&nls_lock); + return nls; +} + +struct nls_table *load_nls(char *charset) +{ + return try_then_request_module(find_nls(charset), "nls_%s", charset); +} + +void unload_nls(struct nls_table *nls) +{ + if (nls) + module_put(nls->owner); +} + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x00d0, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x00de, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 
0x00ee, 0x00ef, + /* 0xf0*/ + 0x00f0, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x00fe, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00 +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 
0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + 
return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table default_table = { + .charset = "default", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, +}; + +/* Returns a simple default translation table */ +struct nls_table *load_nls_default(void) +{ + struct nls_table *default_nls; + + default_nls = load_nls(CONFIG_NLS_DEFAULT); + if (default_nls != NULL) + return default_nls; + else + return &default_table; +} + +EXPORT_SYMBOL(register_nls); +EXPORT_SYMBOL(unregister_nls); +EXPORT_SYMBOL(unload_nls); +EXPORT_SYMBOL(load_nls); +EXPORT_SYMBOL(load_nls_default); + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c new file mode 100644 index 00000000..c8471fe7 --- /dev/null +++ b/fs/nls/nls_cp1250.c @@ -0,0 +1,347 @@ +/* + * linux/fs/nls/nls_cp1250.c + * + * Charset cp1250 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x20ac, 0x0000, 0x201a, 0x0000, + 0x201e, 0x2026, 0x2020, 0x2021, + 0x0000, 0x2030, 0x0160, 0x2039, + 0x015a, 0x0164, 0x017d, 0x0179, + /* 0x90*/ + 0x0000, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x0000, 0x2122, 0x0161, 0x203a, + 0x015b, 0x0165, 0x017e, 0x017a, + /* 0xa0*/ + 0x00a0, 0x02c7, 0x02d8, 0x0141, + 0x00a4, 0x0104, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x015e, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x017b, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x02db, 0x0142, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x0105, 0x015f, 0x00bb, + 0x013d, 0x02dd, 0x013e, 0x017c, + /* 0xc0*/ + 0x0154, 0x00c1, 0x00c2, 0x0102, + 0x00c4, 0x0139, 0x0106, 0x00c7, + 0x010c, 0x00c9, 0x0118, 0x00cb, + 0x011a, 0x00cd, 0x00ce, 0x010e, + /* 0xd0*/ + 0x0110, 0x0143, 0x0147, 0x00d3, + 0x00d4, 0x0150, 0x00d6, 0x00d7, + 0x0158, 0x016e, 0x00da, 0x0170, + 0x00dc, 0x00dd, 0x0162, 0x00df, + /* 0xe0*/ + 0x0155, 0x00e1, 0x00e2, 0x0103, + 0x00e4, 0x013a, 0x0107, 0x00e7, + 0x010d, 0x00e9, 0x0119, 0x00eb, + 0x011b, 
0x00ed, 0x00ee, 0x010f, + /* 0xf0*/ + 0x0111, 0x0144, 0x0148, 0x00f3, + 0x00f4, 0x0151, 0x00f6, 0x00f7, + 0x0159, 0x016f, 0x00fa, 0x0171, + 0x00fc, 0x00fd, 0x0163, 0x02d9, + }; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0x00, 0x00, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */ + }; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xc3, 0xe3, 0xa5, 0xb9, 0xc6, 0xe6, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xc5, 0xe5, 0x00, 0x00, 0xbc, 0xbe, 0x00, /* 0x38-0x3f */ + 0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */ + 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */ + 0xd8, 0xf8, 0x8c, 0x9c, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */ + 0x8a, 0x9a, 0xde, 0xfe, 0x8d, 0x9d, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */ + 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8f, 0x9f, 0xaf, 0xbf, 0x8e, 0x9e, 0x00, /* 0x78-0x7f */ + + }; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */ + }; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ + 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + }; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + }; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, + }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x00, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xb9, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbe, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ + }; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 
0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x00, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa5, 0xaa, 0xbb, 0xbc, 0xbd, 0xbc, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ + }; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp1250", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp1250(void) +{ + return register_nls(&table); +} +static void __exit exit_nls_cp1250(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp1250) +module_exit(exit_nls_cp1250) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c new file mode 100644 index 00000000..1939b46e --- /dev/null +++ b/fs/nls/nls_cp1251.c @@ -0,0 +1,302 @@ +/* + * linux/fs/nls/nls_cp1251.c + * + * Charset cp1251 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
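+ *
+ * As in the other single-byte tables here, charset2uni[256] maps each
+ * cp1251 byte to a Unicode code point and page_uni2charset[] maps back
+ * via sparse 256-entry pages indexed by the high byte of the code point;
+ * for example uni2char(0x0416, ...) reads page04[0x16] and stores 0xc6
+ * (CYRILLIC CAPITAL LETTER ZHE).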
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0402, 0x0403, 0x201a, 0x0453, + 0x201e, 0x2026, 0x2020, 0x2021, + 0x20ac, 0x2030, 0x0409, 0x2039, + 0x040a, 0x040c, 0x040b, 0x040f, + /* 0x90*/ + 0x0452, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x0000, 0x2122, 0x0459, 0x203a, + 0x045a, 0x045c, 0x045b, 0x045f, + /* 0xa0*/ + 0x00a0, 0x040e, 0x045e, 0x0408, + 0x00a4, 0x0490, 0x00a6, 0x00a7, + 0x0401, 0x00a9, 0x0404, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x0407, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x0406, 0x0456, + 0x0491, 0x00b5, 0x00b6, 0x00b7, + 0x0451, 0x2116, 0x0454, 0x00bb, + 0x0458, 0x0405, 0x0455, 0x0457, + /* 0xc0*/ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0xd0*/ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xe0*/ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xf0*/ + 0x0440, 0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x044f, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xa8, 0x80, 0x81, 0xaa, 0xbd, 0xb2, 0xaf, /* 0x00-0x07 */ + 0xa3, 0x8a, 0x8c, 0x8e, 0x8d, 0x00, 0xa1, 0x8f, /* 0x08-0x0f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x10-0x17 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x18-0x1f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x20-0x27 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x28-0x2f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0x48-0x4f */ + 0x00, 0xb8, 0x90, 0x83, 0xba, 0xbe, 0xb3, 0xbf, /* 0x50-0x57 */ + 0xbc, 0x9a, 0x9c, 0x9e, 0x9d, 0x00, 0xa2, 0x9f, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xa5, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ + 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x90, 0x83, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa2, 0xa2, 0xbc, 0xa4, 0xb4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xb8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb3, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + 0x80, 0x81, 0x82, 0x81, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x80, 0x91, 0x92, 
0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa1, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb2, 0xa5, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xa8, 0xb9, 0xaa, 0xbb, 0xa3, 0xbd, 0xbd, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp1251", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp1251(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp1251(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp1251) +module_exit(exit_nls_cp1251) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c new file mode 100644 index 00000000..8120ae2e --- /dev/null +++ b/fs/nls/nls_cp1255.c @@ -0,0 +1,385 @@ +/* + * linux/fs/nls/nls_cp1255.c + * + * Charset cp1255 translation tables. + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x20ac, 0x0000, 0x201a, 0x0192, + 0x201e, 0x2026, 0x2020, 0x2021, + 0x02c6, 0x2030, 0x0000, 0x2039, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x90*/ + 0x0000, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x02dc, 0x2122, 0x0000, 0x203a, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x20aa, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00d7, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x203e, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00f7, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x05b0, 0x05b1, 0x05b2, 0x05b3, + 0x05b4, 0x05b5, 0x05b6, 0x05b7, + 0x05b8, 0x05b9, 0x0000, 0x05bb, + 0x05bc, 0x05bd, 0x05be, 0x05bf, + /* 0xd0*/ + 0x05c0, 0x05c1, 0x05c2, 0x05c3, + 0x05f0, 0x05f1, 0x05f2, 0x05f3, + 0x05f4, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x2017, + /* 0xe0*/ + 0x05d0, 0x05d1, 0x05d2, 0x05d3, + 0x05d4, 0x05d5, 0x05d6, 0x05d7, + 0x05d8, 0x05d9, 0x05da, 0x05db, + 0x05dc, 0x05dd, 0x05de, 0x05df, + /* 0xf0*/ + 0x05e0, 0x05e1, 0x05e2, 0x05e3, + 0x05e4, 0x05e5, 0x05e6, 0x05e7, + 0x05e8, 0x05e9, 0x05ea, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0x00, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xba, /* 0xf0-0xf7 */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char page05[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xb0-0xb7 */ + 0xc8, 0xc9, 0x00, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xb8-0xbf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xd0-0xd7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xd8-0xdf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xe0-0xe7 */ + 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0xfe, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ + 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa4, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, page05, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp1255", + .alias = "iso8859-8", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp1255(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp1255(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp1255) +module_exit(exit_nls_cp1255) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(iso8859-8); diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c new file mode 100644 index 00000000..ff37a462 --- /dev/null +++ b/fs/nls/nls_cp437.c @@ -0,0 +1,388 @@ +/* + * linux/fs/nls/nls_cp437.c + * + * Charset cp437 translation tables. 
+ * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00a2, + 0x00a3, 0x00a5, 0x20a7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 
0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf 
*/ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp437", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp437(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp437(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp437) +module_exit(exit_nls_cp437) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c new file mode 100644 index 00000000..f5576b8b --- 
/dev/null +++ b/fs/nls/nls_cp737.c @@ -0,0 +1,351 @@ +/* + * linux/fs/nls/nls_cp737.c + * + * Charset cp737 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0391, 0x0392, 0x0393, 0x0394, + 0x0395, 0x0396, 0x0397, 0x0398, + 0x0399, 0x039a, 0x039b, 0x039c, + 0x039d, 0x039e, 0x039f, 0x03a0, + /* 0x90*/ + 0x03a1, 0x03a3, 0x03a4, 0x03a5, + 0x03a6, 0x03a7, 0x03a8, 0x03a9, + 0x03b1, 0x03b2, 0x03b3, 0x03b4, + 0x03b5, 0x03b6, 0x03b7, 0x03b8, + /* 0xa0*/ + 0x03b9, 0x03ba, 0x03bb, 0x03bc, + 0x03bd, 0x03be, 0x03bf, 0x03c0, + 0x03c1, 0x03c3, 0x03c2, 0x03c4, + 0x03c5, 0x03c6, 0x03c7, 0x03c8, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03c9, 0x03ac, 0x03ad, 0x03ae, + 0x03ca, 0x03af, 0x03cc, 0x03cd, + 0x03cb, 0x03ce, 0x0386, 0x0388, + 0x0389, 0x038a, 0x038c, 0x038e, + /* 0xf0*/ + 0x038f, 0x00b1, 0x2265, 0x2264, + 0x03aa, 0x03ab, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 
0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0x00, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xea, 0x00, /* 0x80-0x87 */ + 0xeb, 0xec, 0xed, 0x00, 0xee, 0x00, 0xef, 0xf0, /* 0x88-0x8f */ + 0x00, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, /* 0x90-0x97 */ + 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, /* 0x98-0x9f */ + 0x8f, 0x90, 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, /* 0xa0-0xa7 */ + 0x96, 0x97, 0xf4, 0xf5, 0xe1, 0xe2, 0xe3, 0xe5, /* 0xa8-0xaf */ + 0x00, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, /* 0xb0-0xb7 */ + 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, /* 0xb8-0xbf */ + 0xa7, 0xa8, 0xaa, 0xa9, 0xab, 0xac, 0xad, 0xae, /* 0xc0-0xc7 */ + 0xaf, 0xe0, 0xe4, 0xe8, 0xe6, 0xe7, 0xe9, 0x00, /* 0xc8-0xcf */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 
0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x80-0x87 */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x88-0x8f */ + 0xa8, 0xa9, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xe0, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xe1, 0xe2, 0xe3, 0xe5, 0xe6, 0xe7, /* 0xe8-0xef */ + 0xe9, 0xf1, 0xf2, 0xf3, 0xe4, 0xe8, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x98-0x9f */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xa0-0xa7 */ + 0x90, 0x91, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 
0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x97, 0xea, 0xeb, 0xec, 0xf4, 0xed, 0xee, 0xef, /* 0xe0-0xe7 */ + 0xf5, 0xf0, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp737", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp737(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp737(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp737) +module_exit(exit_nls_cp737) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c new file mode 100644 index 00000000..4905635d --- /dev/null +++ b/fs/nls/nls_cp775.c @@ -0,0 +1,320 @@ +/* + * linux/fs/nls/nls_cp775.c + * + * Charset cp775 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0106, 0x00fc, 0x00e9, 0x0101, + 0x00e4, 0x0123, 0x00e5, 0x0107, + 0x0142, 0x0113, 0x0156, 0x0157, + 0x012b, 0x0179, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x014d, + 0x00f6, 0x0122, 0x00a2, 0x015a, + 0x015b, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x00d7, 0x00a4, + /* 0xa0*/ + 0x0100, 0x012a, 0x00f3, 0x017b, + 0x017c, 0x017a, 0x201d, 0x00a6, + 0x00a9, 0x00ae, 0x00ac, 0x00bd, + 0x00bc, 0x0141, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x0104, 0x010c, 0x0118, + 0x0116, 0x2563, 0x2551, 0x2557, + 0x255d, 0x012e, 0x0160, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x0172, 0x016a, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x017d, + /* 0xd0*/ + 0x0105, 0x010d, 0x0119, 0x0117, + 0x012f, 0x0161, 0x0173, 0x016b, + 0x017e, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x014c, 0x0143, + 0x00f5, 0x00d5, 0x00b5, 0x0144, + 0x0136, 0x0137, 0x013b, 0x013c, + 0x0146, 0x0112, 0x0145, 0x2019, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x201c, 0x00be, + 0x00b6, 0x00a7, 0x00f7, 0x201e, + 0x00b0, 0x2219, 0x00b7, 0x00b9, + 0x00b3, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x96, 0x9c, 0x9f, 0x00, 0xa7, 0xf5, /* 0xa0-0xa7 */ + 0x00, 0xa8, 0x00, 0xae, 0xaa, 0xf0, 0xa9, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xfc, 0x00, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0xfb, 0x00, 0xaf, 0xac, 0xab, 0xf3, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xe0, 0x00, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */ + 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x84, 0x86, 0x91, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xa2, 0x00, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0xa0, 0x83, 0x00, 0x00, 0xb5, 0xd0, 0x80, 0x87, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd1, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0xed, 0x89, 0x00, 0x00, 0xb8, 0xd3, /* 0x10-0x17 */ + 0xb7, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x95, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0xa1, 0x8c, 0x00, 0x00, 0xbd, 0xd4, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0xe9, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0xea, 0xeb, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0xad, 0x88, 0xe3, 0xe7, 0xee, 0xec, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xe2, 0x93, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x8b, /* 0x50-0x57 */ + 0x00, 0x00, 0x97, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xbe, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xc7, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8d, 0xa5, 0xa3, 0xa4, 0xcf, 0xd8, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xef, 0x00, 0x00, 0xf2, 0xa6, 0xf7, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 
0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xa5, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x85, 0x96, 0x98, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0x83, 0x8c, 0xa2, 0xa4, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x88, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xd0, 0xd1, 0xd2, /* 0xb0-0xb7 */ + 0xd3, 0xb9, 0xba, 0xbb, 0xbc, 0xd4, 0xd5, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xd6, 0xd7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xd8, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0xe7, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe9, 0xe9, 0xeb, 0xeb, 0xec, 0x89, 0xec, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 
0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xa0, 0x8e, 0x95, 0x8f, 0x80, /* 0x80-0x87 */ + 0xad, 0xed, 0x8a, 0x8a, 0xa1, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0xe2, 0x99, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xe0, 0xa3, 0xa3, 0x8d, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, 0xc6, 0xc7, /* 0xd0-0xd7 */ + 0xcf, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe3, /* 0xe0-0xe7 */ + 0xe8, 0xe8, 0xea, 0xea, 0xee, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp775", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp775(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp775(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp775) +module_exit(exit_nls_cp775) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c new file mode 100644 index 00000000..fe5bdad5 --- /dev/null +++ b/fs/nls/nls_cp850.c @@ -0,0 +1,316 @@ +/* + * linux/fs/nls/nls_cp850.c + * + * Charset cp850 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x00d7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x00ae, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x00c1, 0x00c2, 0x00c0, + 0x00a9, 0x2563, 0x2551, 0x2557, + 0x255d, 0x00a2, 0x00a5, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x00e3, 0x00c3, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x00f0, 0x00d0, 0x00ca, 0x00cb, + 0x00c8, 0x0131, 0x00cd, 0x00ce, + 0x00cf, 0x2518, 0x250c, 0x2588, + 0x2584, 0x00a6, 0x00cc, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x00d4, 0x00d2, + 0x00f5, 0x00d5, 0x00b5, 0x00fe, + 0x00de, 0x00da, 0x00db, 0x00d9, + 0x00fd, 0x00dd, 0x00af, 0x00b4, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x2017, 0x00be, + 0x00b6, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x00b7, 0x00b9, + 0x00b3, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0xb8, 0xa6, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ + 0xf7, 0xfb, 0xa7, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */ + 0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */ + 0xd1, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */ + 0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0xed, 0xe8, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0xd0, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x97, 0xa3, 0x96, 0x81, 0xec, 0xe7, 0x98, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, /* 0x10-0x17 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 
0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd0, 0x88, 0x89, 0x8a, 0xd5, 0xa1, 0x8c, /* 0xd0-0xd7 */ + 0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0x8d, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe7, 0xa3, 0x96, 0x97, 0xec, 0xec, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 
0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */ + 0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0xde, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd1, 0xd1, 0xd2, 0xd3, 0xd4, 0x49, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe8, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xed, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp850", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp850(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp850(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp850) +module_exit(exit_nls_cp850) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c new file mode 100644 index 00000000..ceb1c016 --- /dev/null +++ b/fs/nls/nls_cp852.c @@ -0,0 +1,338 @@ +/* + * linux/fs/nls/nls_cp852.c + * + * Charset cp852 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x016f, 0x0107, 0x00e7, + 0x0142, 0x00eb, 0x0150, 0x0151, + 0x00ee, 0x0179, 0x00c4, 0x0106, + /* 0x90*/ + 0x00c9, 0x0139, 0x013a, 0x00f4, + 0x00f6, 0x013d, 0x013e, 0x015a, + 0x015b, 0x00d6, 0x00dc, 0x0164, + 0x0165, 0x0141, 0x00d7, 0x010d, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x0104, 0x0105, 0x017d, 0x017e, + 0x0118, 0x0119, 0x00ac, 0x017a, + 0x010c, 0x015f, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x00c1, 0x00c2, 0x011a, + 0x015e, 0x2563, 0x2551, 0x2557, + 0x255d, 0x017b, 0x017c, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x0102, 0x0103, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x0111, 0x0110, 0x010e, 0x00cb, + 0x010f, 0x0147, 0x00cd, 0x00ce, + 0x011b, 0x2518, 0x250c, 0x2588, + 0x2584, 0x0162, 0x016e, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x00d4, 0x0143, + 0x0144, 0x0148, 0x0160, 0x0161, + 0x0154, 0x00da, 0x0155, 0x0170, + 0x00fd, 0x00dd, 0x0163, 0x00b4, + /* 0xf0*/ + 0x00ad, 0x02dd, 0x02db, 0x02c7, + 0x02d8, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x02d9, 0x0171, + 0x0158, 0x0159, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0x00, 0x00, 0xae, 0xaa, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0xf7, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xb5, 0xb6, 0x00, 0x8e, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0xd3, 0x00, 0xd6, 0xd7, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xe0, 0xe2, 0x00, 0x99, 0x9e, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xe9, 0x00, 0x9a, 0xed, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x00, 0xa0, 0x83, 0x00, 0x84, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ + 0x00, 0x82, 0x00, 0x89, 0x00, 0xa1, 0x8c, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xa3, 0x00, 0x81, 0xec, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xc6, 0xc7, 0xa4, 0xa5, 0x8f, 0x86, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0x9f, 0xd2, 0xd4, /* 0x08-0x0f */ + 0xd1, 0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xa8, 0xa9, 0xb7, 0xd8, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x00, /* 0x38-0x3f */ + 0x00, 0x9d, 0x88, 0xe3, 0xe4, 0x00, 0x00, 0xd5, /* 0x40-0x47 */ + 0xe5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x8a, 0x8b, 0x00, 0x00, 0xe8, 0xea, 0x00, 0x00, /* 0x50-0x57 */ + 0xfc, 0xfd, 0x97, 0x98, 0x00, 0x00, 0xb8, 0xad, /* 0x58-0x5f */ + 0xe6, 0xe7, 0xdd, 0xee, 0x9b, 0x9c, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0x85, /* 0x68-0x6f */ + 0xeb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x8d, 0xab, 0xbd, 0xbe, 0xa6, 0xa7, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf3, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xf4, 0xfa, 0x00, 0xf2, 0x00, 0xf1, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xab, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x92, 
0x92, 0x93, 0x94, 0x96, 0x96, 0x98, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9c, 0x9c, 0x88, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */ + 0xa9, 0xa9, 0xaa, 0xab, 0x9f, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0xd8, /* 0xb0-0xb7 */ + 0xad, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd0, 0xd4, 0x89, 0xd4, 0xe5, 0xa1, 0x8c, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xee, 0x85, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0xe4, 0xe4, 0xe5, 0xe7, 0xe7, /* 0xe0-0xe7 */ + 0xea, 0xa3, 0xea, 0xfb, 0xec, 0xec, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfd, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xde, 0x8f, 0x80, /* 0x80-0x87 */ + 0x9d, 0xd3, 0x8a, 0x8a, 0xd7, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x91, 0xe2, 0x99, 0x95, 0x95, 0x97, /* 0x90-0x97 */ + 0x97, 0x99, 0x9a, 0x9b, 0x9b, 0x9d, 0x9e, 0xac, /* 0x98-0x9f */ + 0xb5, 0xd6, 0xe0, 0xe9, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */ + 0xa8, 0xa8, 0xaa, 0x8d, 0xac, 0xb8, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd1, 0xd1, 0xd2, 0xd3, 0xd2, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xb7, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe3, 0xd5, 0xe6, 0xe6, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xe8, 0xeb, 0xed, 0xed, 0xdd, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xeb, 0xfc, 0xfc, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + 
*uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp852", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp852(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp852(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp852) +module_exit(exit_nls_cp852) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c new file mode 100644 index 00000000..cc7f5fb2 --- /dev/null +++ b/fs/nls/nls_cp855.c @@ -0,0 +1,300 @@ +/* + * linux/fs/nls/nls_cp855.c + * + * Charset cp855 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0452, 0x0402, 0x0453, 0x0403, + 0x0451, 0x0401, 0x0454, 0x0404, + 0x0455, 0x0405, 0x0456, 0x0406, + 0x0457, 0x0407, 0x0458, 0x0408, + /* 0x90*/ + 0x0459, 0x0409, 0x045a, 0x040a, + 0x045b, 0x040b, 0x045c, 0x040c, + 0x045e, 0x040e, 0x045f, 0x040f, + 0x044e, 0x042e, 0x044a, 0x042a, + /* 0xa0*/ + 0x0430, 0x0410, 0x0431, 0x0411, + 0x0446, 0x0426, 0x0434, 0x0414, + 0x0435, 0x0415, 0x0444, 0x0424, + 0x0433, 0x0413, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x0445, 0x0425, 0x0438, + 0x0418, 0x2563, 0x2551, 0x2557, + 0x255d, 0x0439, 0x0419, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x043a, 0x041a, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x043b, 0x041b, 0x043c, 0x041c, + 0x043d, 0x041d, 0x043e, 0x041e, + 0x043f, 0x2518, 0x250c, 0x2588, + 0x2584, 0x041f, 0x044f, 0x2580, + /* 0xe0*/ + 0x042f, 0x0440, 0x0420, 0x0441, + 0x0421, 0x0442, 0x0422, 0x0443, + 0x0423, 0x0436, 0x0416, 0x0432, + 0x0412, 0x044c, 0x042c, 0x2116, + /* 0xf0*/ + 0x00ad, 0x044b, 0x042b, 0x0437, + 0x0417, 0x0448, 0x0428, 0x044d, + 0x042d, 0x0449, 0x0429, 0x0447, + 0x0427, 0x00a7, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 
0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0xae, 0x00, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page04[256] = { + 0x00, 0x85, 0x81, 0x83, 0x87, 0x89, 0x8b, 0x8d, /* 0x00-0x07 */ + 0x8f, 0x91, 0x93, 0x95, 0x97, 0x00, 0x99, 0x9b, /* 0x08-0x0f */ + 0xa1, 0xa3, 0xec, 0xad, 0xa7, 0xa9, 0xea, 0xf4, /* 0x10-0x17 */ + 0xb8, 0xbe, 0xc7, 0xd1, 0xd3, 0xd5, 0xd7, 0xdd, /* 0x18-0x1f */ + 0xe2, 0xe4, 0xe6, 0xe8, 0xab, 0xb6, 0xa5, 0xfc, /* 0x20-0x27 */ + 0xf6, 0xfa, 0x9f, 0xf2, 0xee, 0xf8, 0x9d, 0xe0, /* 0x28-0x2f */ + 0xa0, 0xa2, 0xeb, 0xac, 0xa6, 0xa8, 0xe9, 0xf3, /* 0x30-0x37 */ + 0xb7, 0xbd, 0xc6, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, /* 0x38-0x3f */ + 0xe1, 0xe3, 0xe5, 0xe7, 0xaa, 0xb5, 0xa4, 0xfb, /* 0x40-0x47 */ + 0xf5, 0xf9, 0x9e, 0xf1, 0xed, 0xf7, 0x9c, 0xde, /* 0x48-0x4f */ + 0x00, 0x84, 0x80, 0x82, 0x86, 0x88, 0x8a, 0x8c, /* 0x50-0x57 */ + 0x8e, 0x90, 0x92, 0x94, 0x96, 0x00, 0x98, 0x9a, /* 0x58-0x5f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xef, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 
0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, page21, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x80, 0x82, 0x82, 0x84, 0x84, 0x86, 0x86, /* 0x80-0x87 */ + 0x88, 0x88, 0x8a, 0x8a, 0x8c, 0x8c, 0x8e, 0x8e, /* 0x88-0x8f */ + 0x90, 0x90, 0x92, 0x92, 0x94, 0x94, 0x96, 0x96, /* 0x90-0x97 */ + 0x98, 0x98, 0x9a, 0x9a, 0x9c, 0x9c, 0x9e, 0x9e, /* 0x98-0x9f */ + 0xa0, 0xa0, 0xa2, 0xa2, 0xa4, 0xa4, 0xa6, 0xa6, /* 0xa0-0xa7 */ + 0xa8, 0xa8, 0xaa, 0xaa, 0xac, 0xac, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb5, 0xb7, /* 0xb0-0xb7 */ + 0xb7, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbd, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd0, 0xd2, 0xd2, 0xd4, 0xd4, 0xd6, 0xd6, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xd8, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xde, 0xe1, 0xe1, 0xe3, 0xe3, 0xe5, 0xe5, 0xe7, /* 0xe0-0xe7 */ + 0xe7, 0xe9, 0xe9, 0xeb, 0xeb, 0xed, 0xed, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf1, 0xf3, 0xf3, 0xf5, 0xf5, 0xf7, /* 0xf0-0xf7 */ + 0xf7, 0xf9, 0xf9, 0xfb, 0xfb, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f 
*/ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x81, 0x81, 0x83, 0x83, 0x85, 0x85, 0x87, 0x87, /* 0x80-0x87 */ + 0x89, 0x89, 0x8b, 0x8b, 0x8d, 0x8d, 0x8f, 0x8f, /* 0x88-0x8f */ + 0x91, 0x91, 0x93, 0x93, 0x95, 0x95, 0x97, 0x97, /* 0x90-0x97 */ + 0x99, 0x99, 0x9b, 0x9b, 0x9d, 0x9d, 0x9f, 0x9f, /* 0x98-0x9f */ + 0xa1, 0xa1, 0xa3, 0xa3, 0xa5, 0xa5, 0xa7, 0xa7, /* 0xa0-0xa7 */ + 0xa9, 0xa9, 0xab, 0xab, 0xad, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb6, 0xb8, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd1, 0xd1, 0xd3, 0xd3, 0xd5, 0xd5, 0xd7, 0xd7, /* 0xd0-0xd7 */ + 0xdd, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xe0, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe2, 0xe2, 0xe4, 0xe4, 0xe6, 0xe6, 0xe8, /* 0xe0-0xe7 */ + 0xe8, 0xea, 0xea, 0xec, 0xec, 0xee, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf2, 0xf2, 0xf4, 0xf4, 0xf6, 0xf6, 0xf8, /* 0xf0-0xf7 */ + 0xf8, 0xfa, 0xfa, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp855", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp855(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp855(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp855) +module_exit(exit_nls_cp855) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c new file mode 100644 index 00000000..e418e198 --- /dev/null +++ b/fs/nls/nls_cp857.c @@ -0,0 +1,302 @@ +/* + * linux/fs/nls/nls_cp857.c + * + * Charset cp857 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x0131, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x0130, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x015e, 0x015f, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x011e, 0x011f, + 0x00bf, 0x00ae, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x00c1, 0x00c2, 0x00c0, + 0x00a9, 0x2563, 0x2551, 0x2557, + 0x255d, 0x00a2, 0x00a5, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x00e3, 0x00c3, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x00a4, + /* 0xd0*/ + 0x00ba, 0x00aa, 0x00ca, 0x00cb, + 0x00c8, 0x0000, 0x00cd, 0x00ce, + 0x00cf, 0x2518, 0x250c, 0x2588, + 0x2584, 0x00a6, 0x00cc, 0x2580, + /* 0xe0*/ + 0x00d3, 0x00df, 0x00d4, 0x00d2, + 0x00f5, 0x00d5, 0x00b5, 0x0000, + 0x00d7, 0x00da, 0x00db, 0x00d9, + 0x00ec, 0x00ff, 0x00af, 0x00b4, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x0000, 0x00be, + 0x00b6, 0x00a7, 0x00f7, 0x00b8, + 0x00b0, 0x00a8, 0x00b7, 0x00b9, + 0x00b3, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0xb8, 0xd1, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ + 0xf7, 0xfb, 0xd0, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */ + 0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */ + 0x00, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0xe8, /* 0xd0-0xd7 */ + 0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0xec, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0xed, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0xa7, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x98, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, 0x9f, /* 0x58-0x5f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x69, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9f, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa7, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0x88, 0x89, 0x8a, 0x00, 0xa1, 0x8c, /* 0xd0-0xd7 */ + 0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xec, 0xdf, /* 0xd8-0xdf */ + 0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xa3, 0x96, 0x97, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */ + 0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0x49, 0x8e, 0x8f, /* 0x88-0x8f */ + 
0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9e, /* 0x98-0x9f */ + 0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa6, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xde, 0x00, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp857", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp857(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp857(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp857) +module_exit(exit_nls_cp857) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c new file mode 100644 index 00000000..a86c97d1 --- /dev/null +++ b/fs/nls/nls_cp860.c @@ -0,0 +1,365 @@ +/* + * linux/fs/nls/nls_cp860.c + * + * Charset cp860 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+ /* 0x00*/
+ 0x0000, 0x0001, 0x0002, 0x0003,
+ 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000a, 0x000b,
+ 0x000c, 0x000d, 0x000e, 0x000f,
+ /* 0x10*/
+ 0x0010, 0x0011, 0x0012, 0x0013,
+ 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001a, 0x001b,
+ 0x001c, 0x001d, 0x001e, 0x001f,
+ /* 0x20*/
+ 0x0020, 0x0021, 0x0022, 0x0023,
+ 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002a, 0x002b,
+ 0x002c, 0x002d, 0x002e, 0x002f,
+ /* 0x30*/
+ 0x0030, 0x0031, 0x0032, 0x0033,
+ 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003a, 0x003b,
+ 0x003c, 0x003d, 0x003e, 0x003f,
+ /* 0x40*/
+ 0x0040, 0x0041, 0x0042, 0x0043,
+ 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b,
+ 0x004c, 0x004d, 0x004e, 0x004f,
+ /* 0x50*/
+ 0x0050, 0x0051, 0x0052, 0x0053,
+ 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x005b,
+ 0x005c, 0x005d, 0x005e, 0x005f,
+ /* 0x60*/
+ 0x0060, 0x0061, 0x0062, 0x0063,
+ 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006a, 0x006b,
+ 0x006c, 0x006d, 0x006e, 0x006f,
+ /* 0x70*/
+ 0x0070, 0x0071, 0x0072, 0x0073,
+ 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007a, 0x007b,
+ 0x007c, 0x007d, 0x007e, 0x007f,
+ /* 0x80*/
+ 0x00c7, 0x00fc, 0x00e9, 0x00e2,
+ 0x00e3, 0x00e0, 0x00c1, 0x00e7,
+ 0x00ea, 0x00ca, 0x00e8, 0x00cd,
+ 0x00d4, 0x00ec, 0x00c3, 0x00c2,
+ /* 0x90*/
+ 0x00c9, 0x00c0, 0x00c8, 0x00f4,
+ 0x00f5, 0x00f2, 0x00da, 0x00f9,
+ 0x00cc, 0x00d5, 0x00dc, 0x00a2,
+ 0x00a3, 0x00d9, 0x20a7, 0x00d3,
+ /* 0xa0*/
+ 0x00e1, 0x00ed, 0x00f3, 0x00fa,
+ 0x00f1, 0x00d1, 0x00aa, 0x00ba,
+ 0x00bf, 0x00d2, 0x00ac, 0x00bd,
+ 0x00bc, 0x00a1, 0x00ab, 0x00bb,
+ /* 0xb0*/
+ 0x2591, 0x2592, 0x2593, 0x2502,
+ 0x2524, 0x2561, 0x2562, 0x2556,
+ 0x2555, 0x2563, 0x2551, 0x2557,
+ 0x255d, 0x255c, 0x255b, 0x2510,
+ /* 0xc0*/
+ 0x2514, 0x2534, 0x252c, 0x251c,
+ 0x2500, 0x253c, 0x255e, 0x255f,
+ 0x255a, 0x2554, 0x2569, 0x2566,
+ 0x2560, 0x2550, 0x256c, 0x2567,
+ /* 0xd0*/
+ 0x2568, 0x2564, 0x2565, 0x2559,
+ 0x2558, 0x2552, 0x2553, 0x256b,
+ 0x256a, 0x2518, 0x250c, 0x2588,
+ 0x2584, 0x258c, 0x2590, 0x2580,
+ /* 0xe0*/
+ 0x03b1, 0x00df, 0x0393, 0x03c0,
+ 0x03a3, 0x03c3, 0x00b5, 0x03c4,
+ 0x03a6, 0x0398, 0x03a9, 0x03b4,
+ 0x221e, 0x03c6, 0x03b5, 0x2229,
+ /* 0xf0*/
+ 0x2261, 0x00b1, 0x2265, 0x2264,
+ 0x2320, 0x2321, 0x00f7, 0x2248,
+ 0x00b0, 0x2219, 0x00b7, 0x221a,
+ 0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e,
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x91, 0x86, 0x8f, 0x8e, 0x00, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ + 0x92, 0x90, 0x89, 0x00, 0x98, 0x8b, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0xa9, 0x9f, 0x8c, 0x99, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x9d, 0x96, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x84, 0x00, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x00, 0x8d, 0xa1, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x94, 0x00, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x97, 0xa3, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0xa0, 0x87, /* 0x80-0x87 */ + 0x88, 0x88, 0x8a, 0xa1, 0x93, 0x8d, 0x84, 0x83, /* 0x88-0x8f */ + 0x82, 0x85, 0x8a, 0x93, 0x94, 0x95, 0xa3, 0x97, /* 0x90-0x97 */ + 0x8d, 0x94, 0x81, 0x9b, 0x9c, 0x97, 0x9e, 0xa2, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x95, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x8f, 0x8e, 0x91, 0x86, 0x80, /* 0x80-0x87 */ + 0x89, 0x89, 0x92, 0x8b, 0x8c, 
0x98, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x8c, 0x99, 0xa9, 0x96, 0x9d, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0x86, 0x8b, 0x9f, 0x96, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp860", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp860(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp860(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp860) +module_exit(exit_nls_cp860) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c new file mode 100644 index 00000000..bd920227 --- /dev/null +++ b/fs/nls/nls_cp861.c @@ -0,0 +1,388 @@ +/* + * linux/fs/nls/nls_cp861.c + * + * Charset cp861 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+ /* 0x00*/
+ 0x0000, 0x0001, 0x0002, 0x0003,
+ 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000a, 0x000b,
+ 0x000c, 0x000d, 0x000e, 0x000f,
+ /* 0x10*/
+ 0x0010, 0x0011, 0x0012, 0x0013,
+ 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001a, 0x001b,
+ 0x001c, 0x001d, 0x001e, 0x001f,
+ /* 0x20*/
+ 0x0020, 0x0021, 0x0022, 0x0023,
+ 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002a, 0x002b,
+ 0x002c, 0x002d, 0x002e, 0x002f,
+ /* 0x30*/
+ 0x0030, 0x0031, 0x0032, 0x0033,
+ 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003a, 0x003b,
+ 0x003c, 0x003d, 0x003e, 0x003f,
+ /* 0x40*/
+ 0x0040, 0x0041, 0x0042, 0x0043,
+ 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b,
+ 0x004c, 0x004d, 0x004e, 0x004f,
+ /* 0x50*/
+ 0x0050, 0x0051, 0x0052, 0x0053,
+ 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x005b,
+ 0x005c, 0x005d, 0x005e, 0x005f,
+ /* 0x60*/
+ 0x0060, 0x0061, 0x0062, 0x0063,
+ 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006a, 0x006b,
+ 0x006c, 0x006d, 0x006e, 0x006f,
+ /* 0x70*/
+ 0x0070, 0x0071, 0x0072, 0x0073,
+ 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007a, 0x007b,
+ 0x007c, 0x007d, 0x007e, 0x007f,
+ /* 0x80*/
+ 0x00c7, 0x00fc, 0x00e9, 0x00e2,
+ 0x00e4, 0x00e0, 0x00e5, 0x00e7,
+ 0x00ea, 0x00eb, 0x00e8, 0x00d0,
+ 0x00f0, 0x00de, 0x00c4, 0x00c5,
+ /* 0x90*/
+ 0x00c9, 0x00e6, 0x00c6, 0x00f4,
+ 0x00f6, 0x00fe, 0x00fb, 0x00dd,
+ 0x00fd, 0x00d6, 0x00dc, 0x00f8,
+ 0x00a3, 0x00d8, 0x20a7, 0x0192,
+ /* 0xa0*/
+ 0x00e1, 0x00ed, 0x00f3, 0x00fa,
+ 0x00c1, 0x00cd, 0x00d3, 0x00da,
+ 0x00bf, 0x2310, 0x00ac, 0x00bd,
+ 0x00bc, 0x00a1, 0x00ab, 0x00bb,
+ /* 0xb0*/
+ 0x2591, 0x2592, 0x2593, 0x2502,
+ 0x2524, 0x2561, 0x2562, 0x2556,
+ 0x2555, 0x2563, 0x2551, 0x2557,
+ 0x255d, 0x255c, 0x255b, 0x2510,
+ /* 0xc0*/
+ 0x2514, 0x2534, 0x252c, 0x251c,
+ 0x2500, 0x253c, 0x255e, 0x255f,
+ 0x255a, 0x2554, 0x2569, 0x2566,
+ 0x2560, 0x2550, 0x256c, 0x2567,
+ /* 0xd0*/
+ 0x2568, 0x2564, 0x2565, 0x2559,
+ 0x2558, 0x2552, 0x2553, 0x256b,
+ 0x256a, 0x2518, 0x250c, 0x2588,
+ 0x2584, 0x258c, 0x2590, 0x2580,
+ /* 0xe0*/
+ 0x03b1, 0x00df, 0x0393, 0x03c0,
+ 0x03a3, 0x03c3, 0x00b5, 0x03c4,
+ 0x03a6, 0x0398, 0x03a9, 0x03b4,
+ 0x221e, 0x03c6, 0x03b5, 0x2229,
+ /* 0xf0*/
+ 0x2261, 0x00b1, 0x2265, 0x2264,
+ 0x2320, 0x2321, 0x00f7, 0x2248,
+ 0x00b0, 0x2219, 0x00b7, 0x221a,
+ 0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e,
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0xa4, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, /* 0xc8-0xcf */ + 0x8b, 0x00, 0x00, 0xa6, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */ + 0x9d, 0x00, 0xa7, 0x00, 0x9a, 0x97, 0x8d, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x00, 0xa1, 0x00, 0x00, /* 0xe8-0xef */ + 0x8c, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x00, 0xa3, 0x96, 0x81, 0x98, 0x95, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 
0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8c, 0x8c, 0x95, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x98, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa0, 0xa1, 0xa2, 0xa3, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 
0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x8b, 0x8b, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0x00, 0x99, 0x8d, 0x00, 0x97, /* 0x90-0x97 */ + 0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa4, 0xa5, 0xa6, 0xa7, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp861", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp861(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp861(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp861) +module_exit(exit_nls_cp861) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c new file mode 100644 index 00000000..e9b68eb3 --- /dev/null +++ b/fs/nls/nls_cp862.c @@ -0,0 +1,422 @@ +/* + * linux/fs/nls/nls_cp862.c + * + * Charset cp862 translation tables. 
+ * Generated automatically from the Unicode and charset
+ * tables from the Unicode Organization (www.unicode.org).
+ * The Unicode to charset table has only exact mappings.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static const wchar_t charset2uni[256] = {
+ /* 0x00*/
+ 0x0000, 0x0001, 0x0002, 0x0003,
+ 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000a, 0x000b,
+ 0x000c, 0x000d, 0x000e, 0x000f,
+ /* 0x10*/
+ 0x0010, 0x0011, 0x0012, 0x0013,
+ 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001a, 0x001b,
+ 0x001c, 0x001d, 0x001e, 0x001f,
+ /* 0x20*/
+ 0x0020, 0x0021, 0x0022, 0x0023,
+ 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002a, 0x002b,
+ 0x002c, 0x002d, 0x002e, 0x002f,
+ /* 0x30*/
+ 0x0030, 0x0031, 0x0032, 0x0033,
+ 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003a, 0x003b,
+ 0x003c, 0x003d, 0x003e, 0x003f,
+ /* 0x40*/
+ 0x0040, 0x0041, 0x0042, 0x0043,
+ 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b,
+ 0x004c, 0x004d, 0x004e, 0x004f,
+ /* 0x50*/
+ 0x0050, 0x0051, 0x0052, 0x0053,
+ 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x005b,
+ 0x005c, 0x005d, 0x005e, 0x005f,
+ /* 0x60*/
+ 0x0060, 0x0061, 0x0062, 0x0063,
+ 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006a, 0x006b,
+ 0x006c, 0x006d, 0x006e, 0x006f,
+ /* 0x70*/
+ 0x0070, 0x0071, 0x0072, 0x0073,
+ 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007a, 0x007b,
+ 0x007c, 0x007d, 0x007e, 0x007f,
+ /* 0x80*/
+ 0x05d0, 0x05d1, 0x05d2, 0x05d3,
+ 0x05d4, 0x05d5, 0x05d6, 0x05d7,
+ 0x05d8, 0x05d9, 0x05da, 0x05db,
+ 0x05dc, 0x05dd, 0x05de, 0x05df,
+ /* 0x90*/
+ 0x05e0, 0x05e1, 0x05e2, 0x05e3,
+ 0x05e4, 0x05e5, 0x05e6, 0x05e7,
+ 0x05e8, 0x05e9, 0x05ea, 0x00a2,
+ 0x00a3, 0x00a5, 0x20a7, 0x0192,
+ /* 0xa0*/
+ 0x00e1, 0x00ed, 0x00f3, 0x00fa,
+ 0x00f1, 0x00d1, 0x00aa, 0x00ba,
+ 0x00bf, 0x2310, 0x00ac, 0x00bd,
+ 0x00bc, 0x00a1, 0x00ab, 0x00bb,
+ /* 0xb0*/
+ 0x2591, 0x2592, 0x2593, 0x2502,
+ 0x2524, 0x2561, 0x2562, 0x2556,
+ 0x2555, 0x2563, 0x2551, 0x2557,
+ 0x255d, 0x255c, 0x255b, 0x2510,
+ /* 0xc0*/
+ 0x2514, 0x2534, 0x252c, 0x251c,
+ 0x2500, 0x253c, 0x255e, 0x255f,
+ 0x255a, 0x2554, 0x2569, 0x2566,
+ 0x2560, 0x2550, 0x256c, 0x2567,
+ /* 0xd0*/
+ 0x2568, 0x2564, 0x2565, 0x2559,
+ 0x2558, 0x2552, 0x2553, 0x256b,
+ 0x256a, 0x2518, 0x250c, 0x2588,
+ 0x2584, 0x258c, 0x2590, 0x2580,
+ /* 0xe0*/
+ 0x03b1, 0x00df, 0x0393, 0x03c0,
+ 0x03a3, 0x03c3, 0x00b5, 0x03c4,
+ 0x03a6, 0x0398, 0x03a9, 0x03b4,
+ 0x221e, 0x03c6, 0x03b5, 0x2229,
+ /* 0xf0*/
+ 0x2261, 0x00b1, 0x2265, 0x2264,
+ 0x2320, 0x2321, 0x00f7, 0x2248,
+ 0x00b0, 0x2219, 0x00b7, 0x221a,
+ 0x207f, 0x00b2, 0x25a0, 0x00a0,
+};
+
+static const unsigned char page00[256] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0xa4, 0x00, 0xa2, 0x00, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page05[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0xd0-0xd7 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xd8-0xdf */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0xe0-0xe7 */ + 0x98, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, page05, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp862", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp862(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp862(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp862) +module_exit(exit_nls_cp862) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c new file mode 100644 index 00000000..f8a9b07a --- /dev/null +++ b/fs/nls/nls_cp863.c @@ -0,0 +1,382 @@ +/* + * linux/fs/nls/nls_cp863.c + * + * Charset cp863 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00c2, 0x00e0, 0x00b6, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x2017, 0x00c0, 0x00a7, + /* 0x90*/ + 0x00c9, 0x00c8, 0x00ca, 0x00f4, + 0x00cb, 0x00cf, 0x00fb, 0x00f9, + 0x00a4, 0x00d4, 0x00dc, 0x00a2, + 0x00a3, 0x00d9, 0x00db, 0x0192, + /* 0xa0*/ + 0x00a6, 0x00b4, 0x00f3, 0x00fa, + 0x00a8, 0x00b8, 0x00b3, 0x00af, + 0x00ce, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00be, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /* 0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e,
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x9b, 0x9c, 0x98, 0x00, 0xa0, 0x8f, /* 0xa0-0xa7 */ + 0xa4, 0x00, 0x00, 0xae, 0xaa, 0x00, 0x00, 0xa7, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0xa6, 0xa1, 0xe6, 0x86, 0xfa, /* 0xb0-0xb7 */ + 0xa5, 0x00, 0x00, 0xaf, 0xac, 0xab, 0xad, 0x00, /* 0xb8-0xbf */ + 0x8e, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xc0-0xc7 */ + 0x91, 0x90, 0x92, 0x94, 0x00, 0x00, 0xa8, 0x95, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x9d, 0x00, 0x9e, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x00, 0x00, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ + 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8d, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x83, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x85, 0x8f, /* 0x88-0x8f */ + 0x82, 0x8a, 0x88, 0x93, 0x89, 0x8b, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x93, 0x81, 0x9b, 0x9c, 0x97, 0x96, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0x8c, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 
0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x84, 0x84, 0x8e, 0x86, 0x80, /* 0x80-0x87 */ + 0x92, 0x94, 0x91, 0x95, 0xa8, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x99, 0x94, 0x95, 0x9e, 0x9d, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0x00, 0x00, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp863", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp863(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp863(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp863) +module_exit(exit_nls_cp863) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c new file mode 100644 index 00000000..8d31f435 --- /dev/null +++ b/fs/nls/nls_cp864.c @@ -0,0 +1,408 @@ +/* + * linux/fs/nls/nls_cp864.c + * + * Charset cp864 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x066a, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00b0, 0x00b7, 0x2219, 0x221a, + 0x2592, 0x2500, 0x2502, 0x253c, + 0x2524, 0x252c, 0x251c, 0x2534, + 0x2510, 0x250c, 0x2514, 0x2518, + /* 0x90*/ + 0x03b2, 0x221e, 0x03c6, 0x00b1, + 0x00bd, 0x00bc, 0x2248, 0x00ab, + 0x00bb, 0xfef7, 0xfef8, 0x0000, + 0x0000, 0xfefb, 0xfefc, 0x0000, + /* 0xa0*/ + 0x00a0, 0x00ad, 0xfe82, 0x00a3, + 0x00a4, 0xfe84, 0x0000, 0x0000, + 0xfe8e, 0xfe8f, 0xfe95, 0xfe99, + 0x060c, 0xfe9d, 0xfea1, 0xfea5, + /* 0xb0*/ + 0x0660, 0x0661, 0x0662, 0x0663, + 0x0664, 0x0665, 0x0666, 0x0667, + 0x0668, 0x0669, 0xfed1, 0x061b, + 0xfeb1, 0xfeb5, 0xfeb9, 0x061f, + /* 0xc0*/ + 0x00a2, 0xfe80, 0xfe81, 0xfe83, + 0xfe85, 0xfeca, 0xfe8b, 0xfe8d, + 0xfe91, 0xfe93, 0xfe97, 0xfe9b, + 0xfe9f, 0xfea3, 0xfea7, 0xfea9, + /* 0xd0*/ + 0xfeab, 0xfead, 0xfeaf, 0xfeb3, + 0xfeb7, 0xfebb, 0xfebf, 0xfec1, + 0xfec5, 0xfecb, 0xfecf, 0x00a6, + 0x00ac, 0x00f7, 0x00d7, 0xfec9, + /* 0xe0*/ + 0x0640, 0xfed3, 0xfed7, 0xfedb, + 0xfedf, 0xfee3, 0xfee7, 0xfeeb, + 0xfeed, 0xfeef, 0xfef3, 0xfebd, + 0xfecc, 0xfece, 0xfecd, 0xfee1, + /* 0xf0*/ + 0xfe7d, 0x0651, 0xfee5, 0xfee9, + 0xfeec, 0xfef0, 0xfef2, 0xfed0, + 0xfed5, 0xfef5, 0xfef6, 0xfedd, + 0xfed9, 0xfef1, 0x25a0, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x00, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e,
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0xc0, 0xa3, 0xa4, 0x00, 0xdb, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x97, 0xdc, 0xa1, 0x00, 0x00, /* 0xa8-0xaf */ + 0x80, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x98, 0x95, 0x94, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdd, /* 0xf0-0xf7 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page06[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f 
*/ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x60-0x67 */ + 0xb8, 0xb9, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x82, 0x83, 0x00, 0x00, 0x00, 0x91, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ +}; + +static const unsigned char page25[256] = { + 0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x8c, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x8f, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char pagefe[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, /* 0x78-0x7f */ + + 0xc1, 0xc2, 0xa2, 0xc3, 0xa5, 0xc4, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0xc6, 0x00, 0xc7, 0xa8, 0xa9, /* 0x88-0x8f */ + 0x00, 0xc8, 0x00, 0xc9, 0x00, 
0xaa, 0x00, 0xca, /* 0x90-0x97 */ + 0x00, 0xab, 0x00, 0xcb, 0x00, 0xad, 0x00, 0xcc, /* 0x98-0x9f */ + 0x00, 0xae, 0x00, 0xcd, 0x00, 0xaf, 0x00, 0xce, /* 0xa0-0xa7 */ + 0x00, 0xcf, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0xd2, /* 0xa8-0xaf */ + 0x00, 0xbc, 0x00, 0xd3, 0x00, 0xbd, 0x00, 0xd4, /* 0xb0-0xb7 */ + 0x00, 0xbe, 0x00, 0xd5, 0x00, 0xeb, 0x00, 0xd6, /* 0xb8-0xbf */ + 0x00, 0xd7, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0xdf, 0xc5, 0xd9, 0xec, 0xee, 0xed, 0xda, /* 0xc8-0xcf */ + 0xf7, 0xba, 0x00, 0xe1, 0x00, 0xf8, 0x00, 0xe2, /* 0xd0-0xd7 */ + 0x00, 0xfc, 0x00, 0xe3, 0x00, 0xfb, 0x00, 0xe4, /* 0xd8-0xdf */ + 0x00, 0xef, 0x00, 0xe5, 0x00, 0xf2, 0x00, 0xe6, /* 0xe0-0xe7 */ + 0x00, 0xf3, 0x00, 0xe7, 0xf4, 0xe8, 0x00, 0xe9, /* 0xe8-0xef */ + 0xf5, 0xfd, 0xf6, 0xea, 0x00, 0xf9, 0xfa, 0x99, /* 0xf0-0xf7 */ + 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, page06, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, page22, NULL, NULL, page25, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, pagefe, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x00, 0x91, 0x00, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + 
if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp864", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp864(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp864(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp864) +module_exit(exit_nls_cp864) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c new file mode 100644 index 00000000..4bd902fe --- /dev/null +++ b/fs/nls/nls_cp865.c @@ -0,0 +1,388 @@ +/* + * linux/fs/nls/nls_cp865.c + * + * Charset cp865 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x00c7, 0x00fc, 0x00e9, 0x00e2, + 0x00e4, 0x00e0, 0x00e5, 0x00e7, + 0x00ea, 0x00eb, 0x00e8, 0x00ef, + 0x00ee, 0x00ec, 0x00c4, 0x00c5, + /* 0x90*/ + 0x00c9, 0x00e6, 0x00c6, 0x00f4, + 0x00f6, 0x00f2, 0x00fb, 0x00f9, + 0x00ff, 0x00d6, 0x00dc, 0x00f8, + 0x00a3, 0x00d8, 0x20a7, 0x0192, + /* 0xa0*/ + 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, + 0x00bf, 0x2310, 0x00ac, 0x00bd, + 0x00bc, 0x00a1, 0x00ab, 0x00a4, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x03b1, 0x00df, 0x0393, 0x03c0, + 0x03a3, 0x03c3, 0x00b5, 0x03c4, + 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, + /*
0xf0*/ + 0x2261, 0x00b1, 0x2265, 0x2264, + 0x2320, 0x2321, 0x00f7, 0x2248, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x207f, 0x00b2, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0xad, 0x00, 0x9c, 0xaf, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */ + 0x00, 0x00, 0xa7, 0x00, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */ + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */ + 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */ + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */ + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */ + 0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 
0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */ + 0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x92, 0x92, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, 
int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp865", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp865(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp865(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp865) +module_exit(exit_nls_cp865) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c new file mode 100644 index 00000000..bdc7cb39 --- /dev/null +++ b/fs/nls/nls_cp866.c @@ -0,0 +1,306 @@ +/* + * linux/fs/nls/nls_cp866.c + * + * Charset cp866 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0x90*/ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xa0*/ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, + 0x255d, 0x255c, 0x255b, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x2567, + /* 0xd0*/ + 0x2568, 0x2564, 0x2565, 0x2559, + 0x2558, 0x2552, 0x2553, 0x256b, + 0x256a, 0x2518, 0x250c, 0x2588, + 0x2584, 0x258c, 0x2590, 0x2580, + /* 0xe0*/ + 0x0440, 
0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x044f, + /* 0xf0*/ + 0x0401, 0x0451, 0x0404, 0x0454, + 0x0407, 0x0457, 0x040e, 0x045e, + 0x00b0, 0x2219, 0x00b7, 0x221a, + 0x2116, 0x00a4, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, /* 0xb0-0xb7 */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xf0, 0x00, 0x00, 0xf2, 0x00, 0x00, 0xf4, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0x00, /* 0x08-0x0f */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x10-0x17 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x18-0x1f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x20-0x27 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x28-0x2f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x30-0x37 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0x00, 0xf1, 0x00, 0x00, 0xf3, 0x00, 0x00, 0xf5, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf7, 0x00, /* 0x58-0x5f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 
0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, page21, page22, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x80-0x87 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x88-0x8f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x90-0x97 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf1, 0xf1, 0xf3, 0xf3, 0xf5, 0xf5, 0xf7, 0xf7, 
/* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0xa0-0xa7 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0xe0-0xe7 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0xe8-0xef */ + 0xf0, 0xf0, 0xf2, 0xf2, 0xf4, 0xf4, 0xf6, 0xf6, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp866", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp866(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp866(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp866) +module_exit(exit_nls_cp866) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c new file mode 100644 index 00000000..9f283a2b --- /dev/null +++ b/fs/nls/nls_cp869.c @@ -0,0 +1,316 @@ +/* + * linux/fs/nls/nls_cp869.c + * + * Charset cp869 translation tables. 
+ * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0386, 0x0000, + 0x00b7, 0x00ac, 0x00a6, 0x2018, + 0x2019, 0x0388, 0x2015, 0x0389, + /* 0x90*/ + 0x038a, 0x03aa, 0x038c, 0x0000, + 0x0000, 0x038e, 0x03ab, 0x00a9, + 0x038f, 0x00b2, 0x00b3, 0x03ac, + 0x00a3, 0x03ad, 0x03ae, 0x03af, + /* 0xa0*/ + 0x03ca, 0x0390, 0x03cc, 0x03cd, + 0x0391, 0x0392, 0x0393, 0x0394, + 0x0395, 0x0396, 0x0397, 0x00bd, + 0x0398, 0x0399, 0x00ab, 0x00bb, + /* 0xb0*/ + 0x2591, 0x2592, 0x2593, 0x2502, + 0x2524, 0x039a, 0x039b, 0x039c, + 0x039d, 0x2563, 0x2551, 0x2557, + 0x255d, 0x039e, 0x039f, 0x2510, + /* 0xc0*/ + 0x2514, 0x2534, 0x252c, 0x251c, + 0x2500, 0x253c, 0x03a0, 0x03a1, + 0x255a, 0x2554, 0x2569, 0x2566, + 0x2560, 0x2550, 0x256c, 0x03a3, + /* 0xd0*/ + 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03b1, 0x03b2, + 0x03b3, 0x2518, 0x250c, 0x2588, + 0x2584, 0x03b4, 0x03b5, 0x2580, + /* 0xe0*/ + 0x03b6, 0x03b7, 0x03b8, 0x03b9, + 0x03ba, 0x03bb, 0x03bc, 0x03bd, + 0x03be, 0x03bf, 0x03c0, 0x03c1, + 0x03c3, 0x03c2, 0x03c4, 0x0384, + /* 0xf0*/ + 0x00ad, 0x00b1, 0x03c5, 0x03c6, + 0x03c7, 0x00a7, 0x03c8, 0x0385, + 0x00b0, 0x00a8, 0x03c9, 0x03cb, + 0x03b0, 0x03ce, 0x25a0, 0x00a0, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xff, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x8a, 0xf5, /* 0xa0-0xa7 */ + 0xf9, 0x97, 0x00, 0xae, 0x89, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ + 0xf8, 0xf1, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x88, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xaf, 0x00, 0xab, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0xef, 0xf7, 0x86, 0x00, /* 0x80-0x87 */ + 0x8d, 0x8f, 0x90, 0x00, 0x92, 0x00, 0x95, 0x98, /* 0x88-0x8f */ + 0xa1, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, /* 0x90-0x97 */ + 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, /* 0x98-0x9f */ + 0xc6, 0xc7, 0x00, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, /* 0xa0-0xa7 */ + 0xd4, 0xd5, 0x91, 0x96, 0x9b, 0x9d, 0x9e, 0x9f, /* 0xa8-0xaf */ + 0xfc, 0xd6, 0xd7, 0xd8, 0xdd, 0xde, 0xe0, 0xe1, /* 0xb0-0xb7 */ + 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, /* 0xb8-0xbf */ + 0xea, 0xeb, 0xed, 0xec, 0xee, 0xf2, 0xf3, 0xf4, /* 0xc0-0xc7 */ + 0xf6, 0xfa, 0xa0, 0xfb, 0xa2, 0xa3, 0xfd, 0x00, /* 0xc8-0xcf */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, /* 0x10-0x17 */ + 0x8b, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char page25[256] = { + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xcd, 0xba, 0x00, 0x00, 0xc9, 
0x00, 0x00, 0xbb, /* 0x50-0x57 */ + 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ + 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ + 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9b, 0x00, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x9d, 0x8e, 0x9e, /* 0x88-0x8f */ + 0x9f, 0xa0, 0xa2, 0x00, 0x00, 0xa3, 0xfb, 0x97, /* 0x90-0x97 */ + 0xfd, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xd6, 0xd7, 0xd8, 0xdd, /* 0xa0-0xa7 */ + 0xde, 0xe0, 0xe1, 0xab, 0xe2, 0xe3, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xe4, 0xe5, 0xe6, /* 0xb0-0xb7 */ + 0xe7, 0xb9, 0xba, 0xbb, 0xbc, 0xe8, 0xe9, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xea, 0xeb, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xec, /* 0xc8-0xcf */ + 0xee, 0xf2, 0xf3, 0xf4, 0xf6, 0xfa, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 
0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x86, 0x9c, 0x8d, 0x8f, 0x90, /* 0x98-0x9f */ + 0x91, 0xa1, 0x92, 0x95, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xa4, 0xa5, /* 0xd0-0xd7 */ + 0xa6, 0xd9, 0xda, 0xdb, 0xdc, 0xa7, 0xa8, 0xdf, /* 0xd8-0xdf */ + 0xa9, 0xaa, 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, /* 0xe0-0xe7 */ + 0xbd, 0xbe, 0xc6, 0xc7, 0xcf, 0xcf, 0xd0, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xd1, 0xd2, 0xd3, 0xf5, 0xd4, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xd5, 0x96, 0xfc, 0x98, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp869", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp869(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp869(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp869) +module_exit(exit_nls_cp869) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c new file mode 100644 index 00000000..0b3c4886 --- /dev/null +++ b/fs/nls/nls_cp874.c @@ -0,0 +1,276 @@ +/* + * linux/fs/nls/nls_cp874.c + * + * Charset cp874 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2026, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x90*/ + 0x0000, 0x2018, 0x2019, 0x201c, + 0x201d, 0x2022, 0x2013, 0x2014, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0xa0*/ + 0x00a0, 0x0e01, 0x0e02, 0x0e03, + 0x0e04, 0x0e05, 0x0e06, 0x0e07, + 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, + 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, + /* 0xb0*/ + 0x0e10, 0x0e11, 0x0e12, 0x0e13, + 0x0e14, 0x0e15, 0x0e16, 0x0e17, + 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, + 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, + /* 0xc0*/ + 0x0e20, 0x0e21, 0x0e22, 0x0e23, + 0x0e24, 0x0e25, 0x0e26, 0x0e27, + 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, + 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, + /* 0xd0*/ + 0x0e30, 0x0e31, 0x0e32, 0x0e33, + 0x0e34, 0x0e35, 0x0e36, 0x0e37, + 0x0e38, 0x0e39, 0x0e3a, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0e3f, + /* 0xe0*/ + 0x0e40, 0x0e41, 0x0e42, 0x0e43, + 0x0e44, 0x0e45, 0x0e46, 0x0e47, + 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, + 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, + /* 0xf0*/ + 0x0e50, 0x0e51, 0x0e52, 0x0e53, + 0x0e54, 0x0e55, 0x0e56, 0x0e57, + 0x0e58, 0x0e59, 0x0e5a, 0x0e5b, + 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char page0e[256] = { + 0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x08-0x0f */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x91, 0x92, 0x00, 0x00, 0x93, 0x94, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, page0e, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 
0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "cp874", + .alias = "tis-620", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp874(void) +{ + return register_nls(&table); +} + 
+static void __exit exit_nls_cp874(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp874) +module_exit(exit_nls_cp874) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(tis-620); diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c new file mode 100644 index 00000000..0ffed6f1 --- /dev/null +++ b/fs/nls/nls_cp932.c @@ -0,0 +1,7934 @@ +/* + * linux/fs/nls/nls_cp932.c + * + * Charset cp932 translation tables. + * This translation table was generated automatically, the + * original table can be download from the Microsoft website. + * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include +#include +#include +#include +#include + +static const wchar_t c2u_81[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x3000,0x3001,0x3002,0xFF0C,0xFF0E,0x30FB,0xFF1A,0xFF1B,/* 0x40-0x47 */ + 0xFF1F,0xFF01,0x309B,0x309C,0x00B4,0xFF40,0x00A8,0xFF3E,/* 0x48-0x4F */ + 0xFFE3,0xFF3F,0x30FD,0x30FE,0x309D,0x309E,0x3003,0x4EDD,/* 0x50-0x57 */ + 0x3005,0x3006,0x3007,0x30FC,0x2015,0x2010,0xFF0F,0xFF3C,/* 0x58-0x5F */ + 0xFF5E,0x2225,0xFF5C,0x2026,0x2025,0x2018,0x2019,0x201C,/* 0x60-0x67 */ + 0x201D,0xFF08,0xFF09,0x3014,0x3015,0xFF3B,0xFF3D,0xFF5B,/* 0x68-0x6F */ + 0xFF5D,0x3008,0x3009,0x300A,0x300B,0x300C,0x300D,0x300E,/* 0x70-0x77 */ + 0x300F,0x3010,0x3011,0xFF0B,0xFF0D,0x00B1,0x00D7,0x0000,/* 0x78-0x7F */ + + 0x00F7,0xFF1D,0x2260,0xFF1C,0xFF1E,0x2266,0x2267,0x221E,/* 0x80-0x87 */ + 0x2234,0x2642,0x2640,0x00B0,0x2032,0x2033,0x2103,0xFFE5,/* 0x88-0x8F */ + 0xFF04,0xFFE0,0xFFE1,0xFF05,0xFF03,0xFF06,0xFF0A,0xFF20,/* 0x90-0x97 */ + 0x00A7,0x2606,0x2605,0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,/* 0x98-0x9F */ + 0x25A1,0x25A0,0x25B3,0x25B2,0x25BD,0x25BC,0x203B,0x3012,/* 0xA0-0xA7 */ + 0x2192,0x2190,0x2191,0x2193,0x3013,0x0000,0x0000,0x0000,/* 0xA8-0xAF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB0-0xB7 */ + 0x2208,0x220B,0x2286,0x2287,0x2282,0x2283,0x222A,0x2229,/* 0xB8-0xBF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x2227,0x2228,0xFFE2,0x21D2,0x21D4,0x2200,0x2203,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD0-0xD7 */ + 0x0000,0x0000,0x2220,0x22A5,0x2312,0x2202,0x2207,0x2261,/* 0xD8-0xDF */ + 0x2252,0x226A,0x226B,0x221A,0x223D,0x221D,0x2235,0x222B,/* 0xE0-0xE7 */ + 0x222C,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE8-0xEF */ + 0x212B,0x2030,0x266F,0x266D,0x266A,0x2020,0x2021,0x00B6,/* 0xF0-0xF7 */ + 0x0000,0x0000,0x0000,0x0000,0x25EF,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_82[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xFF10,/* 0x48-0x4F */ + 0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,0xFF18,/* 0x50-0x57 */ + 0xFF19,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,0xFF28,/* 0x60-0x67 */ + 0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,0xFF30,/* 0x68-0x6F */ + 0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,0xFF38,/* 0x70-0x77 */ + 0xFF39,0xFF3A,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0x80-0x87 */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0x88-0x8F */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0x90-0x97 */ + 0xFF58,0xFF59,0xFF5A,0x0000,0x0000,0x0000,0x0000,0x3041,/* 0x98-0x9F */ + 0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,0x3048,0x3049,/* 0xA0-0xA7 */ + 0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,0x3050,0x3051,/* 0xA8-0xAF */ + 0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,0x3058,0x3059,/* 0xB0-0xB7 */ + 0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,0x3060,0x3061,/* 0xB8-0xBF */ + 0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,0x3068,0x3069,/* 0xC0-0xC7 */ + 0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,0x3070,0x3071,/* 0xC8-0xCF */ + 0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,0x3078,0x3079,/* 0xD0-0xD7 */ + 0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,0x3080,0x3081,/* 0xD8-0xDF */ + 0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,0x3088,0x3089,/* 0xE0-0xE7 */ + 0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,0x3090,0x3091,/* 0xE8-0xEF */ + 0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_83[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,0x30A8,/* 0x40-0x47 */ + 0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,0x30B0,/* 0x48-0x4F */ + 0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,0x30B8,/* 0x50-0x57 */ + 0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,0x30C0,/* 0x58-0x5F */ + 0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,0x30C8,/* 0x60-0x67 */ + 0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,0x30D0,/* 0x68-0x6F */ + 0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,0x30D8,/* 0x70-0x77 */ + 0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,0x0000,/* 0x78-0x7F */ + + 0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0x80-0x87 */ + 0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0x88-0x8F */ + 0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0391,/* 0x98-0x9F */ + 0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,0x0398,0x0399,/* 0xA0-0xA7 */ + 
0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,0x03A0,0x03A1,/* 0xA8-0xAF */ + 0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,0x03A9,0x0000,/* 0xB0-0xB7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x03B1,/* 0xB8-0xBF */ + 0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,0x03B8,0x03B9,/* 0xC0-0xC7 */ + 0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,0x03C0,0x03C1,/* 0xC8-0xCF */ + 0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,0x03C9,0x0000,/* 0xD0-0xD7 */ +}; + +static const wchar_t c2u_84[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,0x0416,/* 0x40-0x47 */ + 0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E,/* 0x48-0x4F */ + 0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,/* 0x50-0x57 */ + 0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,0x042E,/* 0x58-0x5F */ + 0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,0x0436,/* 0x70-0x77 */ + 0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x0000,/* 0x78-0x7F */ + + 0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0x80-0x87 */ + 0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0x88-0x8F */ + 0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x2500,/* 0x98-0x9F */ + 0x2502,0x250C,0x2510,0x2518,0x2514,0x251C,0x252C,0x2524,/* 0xA0-0xA7 */ + 0x2534,0x253C,0x2501,0x2503,0x250F,0x2513,0x251B,0x2517,/* 0xA8-0xAF */ + 0x2523,0x2533,0x252B,0x253B,0x254B,0x2520,0x252F,0x2528,/* 0xB0-0xB7 */ + 0x2537,0x253F,0x251D,0x2530,0x2525,0x2538,0x2542,0x0000,/* 0xB8-0xBF */ +}; + +static const wchar_t c2u_87[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,0x2467,/* 0x40-0x47 */ + 0x2468,0x2469,0x246A,0x246B,0x246C,0x246D,0x246E,0x246F,/* 0x48-0x4F */ + 0x2470,0x2471,0x2472,0x2473,0x2160,0x2161,0x2162,0x2163,/* 0x50-0x57 */ + 0x2164,0x2165,0x2166,0x2167,0x2168,0x2169,0x0000,0x3349,/* 0x58-0x5F */ + 0x3314,0x3322,0x334D,0x3318,0x3327,0x3303,0x3336,0x3351,/* 0x60-0x67 */ + 0x3357,0x330D,0x3326,0x3323,0x332B,0x334A,0x333B,0x339C,/* 0x68-0x6F */ + 0x339D,0x339E,0x338E,0x338F,0x33C4,0x33A1,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x337B,0x0000,/* 0x78-0x7F */ + + 0x301D,0x301F,0x2116,0x33CD,0x2121,0x32A4,0x32A5,0x32A6,/* 
0x80-0x87 */ + 0x32A7,0x32A8,0x3231,0x3232,0x3239,0x337E,0x337D,0x337C,/* 0x88-0x8F */ + 0x2252,0x2261,0x222B,0x222E,0x2211,0x221A,0x22A5,0x2220,/* 0x90-0x97 */ + 0x221F,0x22BF,0x2235,0x2229,0x222A,0x0000,0x0000,0x0000,/* 0x98-0x9F */ +}; + +static const wchar_t c2u_88[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x4E9C,/* 0x98-0x9F */ + 0x5516,0x5A03,0x963F,0x54C0,0x611B,0x6328,0x59F6,0x9022,/* 0xA0-0xA7 */ + 0x8475,0x831C,0x7A50,0x60AA,0x63E1,0x6E25,0x65ED,0x8466,/* 0xA8-0xAF */ + 0x82A6,0x9BF5,0x6893,0x5727,0x65A1,0x6271,0x5B9B,0x59D0,/* 0xB0-0xB7 */ + 0x867B,0x98F4,0x7D62,0x7DBE,0x9B8E,0x6216,0x7C9F,0x88B7,/* 0xB8-0xBF */ + 0x5B89,0x5EB5,0x6309,0x6697,0x6848,0x95C7,0x978D,0x674F,/* 0xC0-0xC7 */ + 0x4EE5,0x4F0A,0x4F4D,0x4F9D,0x5049,0x56F2,0x5937,0x59D4,/* 0xC8-0xCF */ + 0x5A01,0x5C09,0x60DF,0x610F,0x6170,0x6613,0x6905,0x70BA,/* 0xD0-0xD7 */ + 0x754F,0x7570,0x79FB,0x7DAD,0x7DEF,0x80C3,0x840E,0x8863,/* 0xD8-0xDF */ + 0x8B02,0x9055,0x907A,0x533B,0x4E95,0x4EA5,0x57DF,0x80B2,/* 0xE0-0xE7 */ + 0x90C1,0x78EF,0x4E00,0x58F1,0x6EA2,0x9038,0x7A32,0x8328,/* 0xE8-0xEF */ + 0x828B,0x9C2F,0x5141,0x5370,0x54BD,0x54E1,0x56E0,0x59FB,/* 0xF0-0xF7 */ + 0x5F15,0x98F2,0x6DEB,0x80E4,0x852D,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_89[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9662,0x9670,0x96A0,0x97FB,0x540B,0x53F3,0x5B87,0x70CF,/* 0x40-0x47 */ + 0x7FBD,0x8FC2,0x96E8,0x536F,0x9D5C,0x7ABA,0x4E11,0x7893,/* 0x48-0x4F */ + 0x81FC,0x6E26,0x5618,0x5504,0x6B1D,0x851A,0x9C3B,0x59E5,/* 0x50-0x57 */ + 
0x53A9,0x6D66,0x74DC,0x958F,0x5642,0x4E91,0x904B,0x96F2,/* 0x58-0x5F */ + 0x834F,0x990C,0x53E1,0x55B6,0x5B30,0x5F71,0x6620,0x66F3,/* 0x60-0x67 */ + 0x6804,0x6C38,0x6CF3,0x6D29,0x745B,0x76C8,0x7A4E,0x9834,/* 0x68-0x6F */ + 0x82F1,0x885B,0x8A60,0x92ED,0x6DB2,0x75AB,0x76CA,0x99C5,/* 0x70-0x77 */ + 0x60A6,0x8B01,0x8D8A,0x95B2,0x698E,0x53AD,0x5186,0x0000,/* 0x78-0x7F */ + + 0x5712,0x5830,0x5944,0x5BB4,0x5EF6,0x6028,0x63A9,0x63F4,/* 0x80-0x87 */ + 0x6CBF,0x6F14,0x708E,0x7114,0x7159,0x71D5,0x733F,0x7E01,/* 0x88-0x8F */ + 0x8276,0x82D1,0x8597,0x9060,0x925B,0x9D1B,0x5869,0x65BC,/* 0x90-0x97 */ + 0x6C5A,0x7525,0x51F9,0x592E,0x5965,0x5F80,0x5FDC,0x62BC,/* 0x98-0x9F */ + 0x65FA,0x6A2A,0x6B27,0x6BB4,0x738B,0x7FC1,0x8956,0x9D2C,/* 0xA0-0xA7 */ + 0x9D0E,0x9EC4,0x5CA1,0x6C96,0x837B,0x5104,0x5C4B,0x61B6,/* 0xA8-0xAF */ + 0x81C6,0x6876,0x7261,0x4E59,0x4FFA,0x5378,0x6069,0x6E29,/* 0xB0-0xB7 */ + 0x7A4F,0x97F3,0x4E0B,0x5316,0x4EEE,0x4F55,0x4F3D,0x4FA1,/* 0xB8-0xBF */ + 0x4F73,0x52A0,0x53EF,0x5609,0x590F,0x5AC1,0x5BB6,0x5BE1,/* 0xC0-0xC7 */ + 0x79D1,0x6687,0x679C,0x67B6,0x6B4C,0x6CB3,0x706B,0x73C2,/* 0xC8-0xCF */ + 0x798D,0x79BE,0x7A3C,0x7B87,0x82B1,0x82DB,0x8304,0x8377,/* 0xD0-0xD7 */ + 0x83EF,0x83D3,0x8766,0x8AB2,0x5629,0x8CA8,0x8FE6,0x904E,/* 0xD8-0xDF */ + 0x971E,0x868A,0x4FC4,0x5CE8,0x6211,0x7259,0x753B,0x81E5,/* 0xE0-0xE7 */ + 0x82BD,0x86FE,0x8CC0,0x96C5,0x9913,0x99D5,0x4ECB,0x4F1A,/* 0xE8-0xEF */ + 0x89E3,0x56DE,0x584A,0x58CA,0x5EFB,0x5FEB,0x602A,0x6094,/* 0xF0-0xF7 */ + 0x6062,0x61D0,0x6212,0x62D0,0x6539,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9B41,0x6666,0x68B0,0x6D77,0x7070,0x754C,0x7686,0x7D75,/* 0x40-0x47 */ + 0x82A5,0x87F9,0x958B,0x968E,0x8C9D,0x51F1,0x52BE,0x5916,/* 0x48-0x4F */ + 0x54B3,0x5BB3,0x5D16,0x6168,0x6982,0x6DAF,0x788D,0x84CB,/* 0x50-0x57 */ + 0x8857,0x8A72,0x93A7,0x9AB8,0x6D6C,0x99A8,0x86D9,0x57A3,/* 0x58-0x5F */ + 0x67FF,0x86CE,0x920E,0x5283,0x5687,0x5404,0x5ED3,0x62E1,/* 0x60-0x67 */ + 0x64B9,0x683C,0x6838,0x6BBB,0x7372,0x78BA,0x7A6B,0x899A,/* 0x68-0x6F */ + 0x89D2,0x8D6B,0x8F03,0x90ED,0x95A3,0x9694,0x9769,0x5B66,/* 0x70-0x77 */ + 0x5CB3,0x697D,0x984D,0x984E,0x639B,0x7B20,0x6A2B,0x0000,/* 0x78-0x7F */ + + 0x6A7F,0x68B6,0x9C0D,0x6F5F,0x5272,0x559D,0x6070,0x62EC,/* 0x80-0x87 */ + 0x6D3B,0x6E07,0x6ED1,0x845B,0x8910,0x8F44,0x4E14,0x9C39,/* 0x88-0x8F */ + 0x53F6,0x691B,0x6A3A,0x9784,0x682A,0x515C,0x7AC3,0x84B2,/* 0x90-0x97 */ + 0x91DC,0x938C,0x565B,0x9D28,0x6822,0x8305,0x8431,0x7CA5,/* 0x98-0x9F */ + 0x5208,0x82C5,0x74E6,0x4E7E,0x4F83,0x51A0,0x5BD2,0x520A,/* 0xA0-0xA7 */ + 0x52D8,0x52E7,0x5DFB,0x559A,0x582A,0x59E6,0x5B8C,0x5B98,/* 0xA8-0xAF */ + 0x5BDB,0x5E72,0x5E79,0x60A3,0x611F,0x6163,0x61BE,0x63DB,/* 0xB0-0xB7 */ + 0x6562,0x67D1,0x6853,0x68FA,0x6B3E,0x6B53,0x6C57,0x6F22,/* 0xB8-0xBF */ + 0x6F97,0x6F45,0x74B0,0x7518,0x76E3,0x770B,0x7AFF,0x7BA1,/* 0xC0-0xC7 */ + 0x7C21,0x7DE9,0x7F36,0x7FF0,0x809D,0x8266,0x839E,0x89B3,/* 0xC8-0xCF */ + 
0x8ACC,0x8CAB,0x9084,0x9451,0x9593,0x9591,0x95A2,0x9665,/* 0xD0-0xD7 */ + 0x97D3,0x9928,0x8218,0x4E38,0x542B,0x5CB8,0x5DCC,0x73A9,/* 0xD8-0xDF */ + 0x764C,0x773C,0x5CA9,0x7FEB,0x8D0B,0x96C1,0x9811,0x9854,/* 0xE0-0xE7 */ + 0x9858,0x4F01,0x4F0E,0x5371,0x559C,0x5668,0x57FA,0x5947,/* 0xE8-0xEF */ + 0x5B09,0x5BC4,0x5C90,0x5E0C,0x5E7E,0x5FCC,0x63EE,0x673A,/* 0xF0-0xF7 */ + 0x65D7,0x65E2,0x671F,0x68CB,0x68C4,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A5F,0x5E30,0x6BC5,0x6C17,0x6C7D,0x757F,0x7948,0x5B63,/* 0x40-0x47 */ + 0x7A00,0x7D00,0x5FBD,0x898F,0x8A18,0x8CB4,0x8D77,0x8ECC,/* 0x48-0x4F */ + 0x8F1D,0x98E2,0x9A0E,0x9B3C,0x4E80,0x507D,0x5100,0x5993,/* 0x50-0x57 */ + 0x5B9C,0x622F,0x6280,0x64EC,0x6B3A,0x72A0,0x7591,0x7947,/* 0x58-0x5F */ + 0x7FA9,0x87FB,0x8ABC,0x8B70,0x63AC,0x83CA,0x97A0,0x5409,/* 0x60-0x67 */ + 0x5403,0x55AB,0x6854,0x6A58,0x8A70,0x7827,0x6775,0x9ECD,/* 0x68-0x6F */ + 0x5374,0x5BA2,0x811A,0x8650,0x9006,0x4E18,0x4E45,0x4EC7,/* 0x70-0x77 */ + 0x4F11,0x53CA,0x5438,0x5BAE,0x5F13,0x6025,0x6551,0x0000,/* 0x78-0x7F */ + + 0x673D,0x6C42,0x6C72,0x6CE3,0x7078,0x7403,0x7A76,0x7AAE,/* 0x80-0x87 */ + 0x7B08,0x7D1A,0x7CFE,0x7D66,0x65E7,0x725B,0x53BB,0x5C45,/* 0x88-0x8F */ + 0x5DE8,0x62D2,0x62E0,0x6319,0x6E20,0x865A,0x8A31,0x8DDD,/* 0x90-0x97 */ + 0x92F8,0x6F01,0x79A6,0x9B5A,0x4EA8,0x4EAB,0x4EAC,0x4F9B,/* 0x98-0x9F */ + 0x4FA0,0x50D1,0x5147,0x7AF6,0x5171,0x51F6,0x5354,0x5321,/* 0xA0-0xA7 */ + 0x537F,0x53EB,0x55AC,0x5883,0x5CE1,0x5F37,0x5F4A,0x602F,/* 0xA8-0xAF */ + 0x6050,0x606D,0x631F,0x6559,0x6A4B,0x6CC1,0x72C2,0x72ED,/* 0xB0-0xB7 */ + 0x77EF,0x80F8,0x8105,0x8208,0x854E,0x90F7,0x93E1,0x97FF,/* 0xB8-0xBF */ + 0x9957,0x9A5A,0x4EF0,0x51DD,0x5C2D,0x6681,0x696D,0x5C40,/* 0xC0-0xC7 */ + 0x66F2,0x6975,0x7389,0x6850,0x7C81,0x50C5,0x52E4,0x5747,/* 0xC8-0xCF */ + 0x5DFE,0x9326,0x65A4,0x6B23,0x6B3D,0x7434,0x7981,0x79BD,/* 0xD0-0xD7 */ + 0x7B4B,0x7DCA,0x82B9,0x83CC,0x887F,0x895F,0x8B39,0x8FD1,/* 0xD8-0xDF */ + 0x91D1,0x541F,0x9280,0x4E5D,0x5036,0x53E5,0x533A,0x72D7,/* 0xE0-0xE7 */ + 0x7396,0x77E9,0x82E6,0x8EAF,0x99C6,0x99C8,0x99D2,0x5177,/* 0xE8-0xEF */ + 0x611A,0x865E,0x55B0,0x7A7A,0x5076,0x5BD3,0x9047,0x9685,/* 0xF0-0xF7 */ + 0x4E32,0x6ADB,0x91E7,0x5C51,0x5C48,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6398,0x7A9F,0x6C93,0x9774,0x8F61,0x7AAA,0x718A,0x9688,/* 
0x40-0x47 */ + 0x7C82,0x6817,0x7E70,0x6851,0x936C,0x52F2,0x541B,0x85AB,/* 0x48-0x4F */ + 0x8A13,0x7FA4,0x8ECD,0x90E1,0x5366,0x8888,0x7941,0x4FC2,/* 0x50-0x57 */ + 0x50BE,0x5211,0x5144,0x5553,0x572D,0x73EA,0x578B,0x5951,/* 0x58-0x5F */ + 0x5F62,0x5F84,0x6075,0x6176,0x6167,0x61A9,0x63B2,0x643A,/* 0x60-0x67 */ + 0x656C,0x666F,0x6842,0x6E13,0x7566,0x7A3D,0x7CFB,0x7D4C,/* 0x68-0x6F */ + 0x7D99,0x7E4B,0x7F6B,0x830E,0x834A,0x86CD,0x8A08,0x8A63,/* 0x70-0x77 */ + 0x8B66,0x8EFD,0x981A,0x9D8F,0x82B8,0x8FCE,0x9BE8,0x0000,/* 0x78-0x7F */ + + 0x5287,0x621F,0x6483,0x6FC0,0x9699,0x6841,0x5091,0x6B20,/* 0x80-0x87 */ + 0x6C7A,0x6F54,0x7A74,0x7D50,0x8840,0x8A23,0x6708,0x4EF6,/* 0x88-0x8F */ + 0x5039,0x5026,0x5065,0x517C,0x5238,0x5263,0x55A7,0x570F,/* 0x90-0x97 */ + 0x5805,0x5ACC,0x5EFA,0x61B2,0x61F8,0x62F3,0x6372,0x691C,/* 0x98-0x9F */ + 0x6A29,0x727D,0x72AC,0x732E,0x7814,0x786F,0x7D79,0x770C,/* 0xA0-0xA7 */ + 0x80A9,0x898B,0x8B19,0x8CE2,0x8ED2,0x9063,0x9375,0x967A,/* 0xA8-0xAF */ + 0x9855,0x9A13,0x9E78,0x5143,0x539F,0x53B3,0x5E7B,0x5F26,/* 0xB0-0xB7 */ + 0x6E1B,0x6E90,0x7384,0x73FE,0x7D43,0x8237,0x8A00,0x8AFA,/* 0xB8-0xBF */ + 0x9650,0x4E4E,0x500B,0x53E4,0x547C,0x56FA,0x59D1,0x5B64,/* 0xC0-0xC7 */ + 0x5DF1,0x5EAB,0x5F27,0x6238,0x6545,0x67AF,0x6E56,0x72D0,/* 0xC8-0xCF */ + 0x7CCA,0x88B4,0x80A1,0x80E1,0x83F0,0x864E,0x8A87,0x8DE8,/* 0xD0-0xD7 */ + 0x9237,0x96C7,0x9867,0x9F13,0x4E94,0x4E92,0x4F0D,0x5348,/* 0xD8-0xDF */ + 0x5449,0x543E,0x5A2F,0x5F8C,0x5FA1,0x609F,0x68A7,0x6A8E,/* 0xE0-0xE7 */ + 0x745A,0x7881,0x8A9E,0x8AA4,0x8B77,0x9190,0x4E5E,0x9BC9,/* 0xE8-0xEF */ + 0x4EA4,0x4F7C,0x4FAF,0x5019,0x5016,0x5149,0x516C,0x529F,/* 0xF0-0xF7 */ + 0x52B9,0x52FE,0x539A,0x53E3,0x5411,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x540E,0x5589,0x5751,0x57A2,0x597D,0x5B54,0x5B5D,0x5B8F,/* 0x40-0x47 */ + 0x5DE5,0x5DE7,0x5DF7,0x5E78,0x5E83,0x5E9A,0x5EB7,0x5F18,/* 0x48-0x4F */ + 0x6052,0x614C,0x6297,0x62D8,0x63A7,0x653B,0x6602,0x6643,/* 0x50-0x57 */ + 0x66F4,0x676D,0x6821,0x6897,0x69CB,0x6C5F,0x6D2A,0x6D69,/* 0x58-0x5F */ + 0x6E2F,0x6E9D,0x7532,0x7687,0x786C,0x7A3F,0x7CE0,0x7D05,/* 0x60-0x67 */ + 0x7D18,0x7D5E,0x7DB1,0x8015,0x8003,0x80AF,0x80B1,0x8154,/* 0x68-0x6F */ + 0x818F,0x822A,0x8352,0x884C,0x8861,0x8B1B,0x8CA2,0x8CFC,/* 0x70-0x77 */ + 0x90CA,0x9175,0x9271,0x783F,0x92FC,0x95A4,0x964D,0x0000,/* 0x78-0x7F */ + + 0x9805,0x9999,0x9AD8,0x9D3B,0x525B,0x52AB,0x53F7,0x5408,/* 0x80-0x87 */ + 0x58D5,0x62F7,0x6FE0,0x8C6A,0x8F5F,0x9EB9,0x514B,0x523B,/* 0x88-0x8F */ + 0x544A,0x56FD,0x7A40,0x9177,0x9D60,0x9ED2,0x7344,0x6F09,/* 0x90-0x97 */ + 0x8170,0x7511,0x5FFD,0x60DA,0x9AA8,0x72DB,0x8FBC,0x6B64,/* 0x98-0x9F */ + 0x9803,0x4ECA,0x56F0,0x5764,0x58BE,0x5A5A,0x6068,0x61C7,/* 0xA0-0xA7 */ + 0x660F,0x6606,0x6839,0x68B1,0x6DF7,0x75D5,0x7D3A,0x826E,/* 0xA8-0xAF */ + 0x9B42,0x4E9B,0x4F50,0x53C9,0x5506,0x5D6F,0x5DE6,0x5DEE,/* 0xB0-0xB7 */ + 0x67FB,0x6C99,0x7473,0x7802,0x8A50,0x9396,0x88DF,0x5750,/* 0xB8-0xBF */ + 
0x5EA7,0x632B,0x50B5,0x50AC,0x518D,0x6700,0x54C9,0x585E,/* 0xC0-0xC7 */ + 0x59BB,0x5BB0,0x5F69,0x624D,0x63A1,0x683D,0x6B73,0x6E08,/* 0xC8-0xCF */ + 0x707D,0x91C7,0x7280,0x7815,0x7826,0x796D,0x658E,0x7D30,/* 0xD0-0xD7 */ + 0x83DC,0x88C1,0x8F09,0x969B,0x5264,0x5728,0x6750,0x7F6A,/* 0xD8-0xDF */ + 0x8CA1,0x51B4,0x5742,0x962A,0x583A,0x698A,0x80B4,0x54B2,/* 0xE0-0xE7 */ + 0x5D0E,0x57FC,0x7895,0x9DFA,0x4F5C,0x524A,0x548B,0x643E,/* 0xE8-0xEF */ + 0x6628,0x6714,0x67F5,0x7A84,0x7B56,0x7D22,0x932F,0x685C,/* 0xF0-0xF7 */ + 0x9BAD,0x7B39,0x5319,0x518A,0x5237,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5BDF,0x62F6,0x64AE,0x64E6,0x672D,0x6BBA,0x85A9,0x96D1,/* 0x40-0x47 */ + 0x7690,0x9BD6,0x634C,0x9306,0x9BAB,0x76BF,0x6652,0x4E09,/* 0x48-0x4F */ + 0x5098,0x53C2,0x5C71,0x60E8,0x6492,0x6563,0x685F,0x71E6,/* 0x50-0x57 */ + 0x73CA,0x7523,0x7B97,0x7E82,0x8695,0x8B83,0x8CDB,0x9178,/* 0x58-0x5F */ + 0x9910,0x65AC,0x66AB,0x6B8B,0x4ED5,0x4ED4,0x4F3A,0x4F7F,/* 0x60-0x67 */ + 0x523A,0x53F8,0x53F2,0x55E3,0x56DB,0x58EB,0x59CB,0x59C9,/* 0x68-0x6F */ + 0x59FF,0x5B50,0x5C4D,0x5E02,0x5E2B,0x5FD7,0x601D,0x6307,/* 0x70-0x77 */ + 0x652F,0x5B5C,0x65AF,0x65BD,0x65E8,0x679D,0x6B62,0x0000,/* 0x78-0x7F */ + + 0x6B7B,0x6C0F,0x7345,0x7949,0x79C1,0x7CF8,0x7D19,0x7D2B,/* 0x80-0x87 */ + 0x80A2,0x8102,0x81F3,0x8996,0x8A5E,0x8A69,0x8A66,0x8A8C,/* 0x88-0x8F */ + 0x8AEE,0x8CC7,0x8CDC,0x96CC,0x98FC,0x6B6F,0x4E8B,0x4F3C,/* 0x90-0x97 */ + 0x4F8D,0x5150,0x5B57,0x5BFA,0x6148,0x6301,0x6642,0x6B21,/* 0x98-0x9F */ + 0x6ECB,0x6CBB,0x723E,0x74BD,0x75D4,0x78C1,0x793A,0x800C,/* 0xA0-0xA7 */ + 0x8033,0x81EA,0x8494,0x8F9E,0x6C50,0x9E7F,0x5F0F,0x8B58,/* 0xA8-0xAF */ + 0x9D2B,0x7AFA,0x8EF8,0x5B8D,0x96EB,0x4E03,0x53F1,0x57F7,/* 0xB0-0xB7 */ + 0x5931,0x5AC9,0x5BA4,0x6089,0x6E7F,0x6F06,0x75BE,0x8CEA,/* 0xB8-0xBF */ + 0x5B9F,0x8500,0x7BE0,0x5072,0x67F4,0x829D,0x5C61,0x854A,/* 0xC0-0xC7 */ + 0x7E1E,0x820E,0x5199,0x5C04,0x6368,0x8D66,0x659C,0x716E,/* 0xC8-0xCF */ + 0x793E,0x7D17,0x8005,0x8B1D,0x8ECA,0x906E,0x86C7,0x90AA,/* 0xD0-0xD7 */ + 0x501F,0x52FA,0x5C3A,0x6753,0x707C,0x7235,0x914C,0x91C8,/* 0xD8-0xDF */ + 0x932B,0x82E5,0x5BC2,0x5F31,0x60F9,0x4E3B,0x53D6,0x5B88,/* 0xE0-0xE7 */ + 0x624B,0x6731,0x6B8A,0x72E9,0x73E0,0x7A2E,0x816B,0x8DA3,/* 0xE8-0xEF */ + 0x9152,0x9996,0x5112,0x53D7,0x546A,0x5BFF,0x6388,0x6A39,/* 0xF0-0xF7 */ + 0x7DAC,0x9700,0x56DA,0x53CE,0x5468,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5B97,0x5C31,0x5DDE,0x4FEE,0x6101,0x62FE,0x6D32,0x79C0,/* 0x40-0x47 */ + 0x79CB,0x7D42,0x7E4D,0x7FD2,0x81ED,0x821F,0x8490,0x8846,/* 0x48-0x4F */ + 0x8972,0x8B90,0x8E74,0x8F2F,0x9031,0x914B,0x916C,0x96C6,/* 0x50-0x57 */ + 0x919C,0x4EC0,0x4F4F,0x5145,0x5341,0x5F93,0x620E,0x67D4,/* 0x58-0x5F */ + 0x6C41,0x6E0B,0x7363,0x7E26,0x91CD,0x9283,0x53D4,0x5919,/* 0x60-0x67 */ + 0x5BBF,0x6DD1,0x795D,0x7E2E,0x7C9B,0x587E,0x719F,0x51FA,/* 0x68-0x6F */ + 0x8853,0x8FF0,0x4FCA,0x5CFB,0x6625,0x77AC,0x7AE3,0x821C,/* 0x70-0x77 */ + 0x99FF,0x51C6,0x5FAA,0x65EC,0x696F,0x6B89,0x6DF3,0x0000,/* 0x78-0x7F */ + + 0x6E96,0x6F64,0x76FE,0x7D14,0x5DE1,0x9075,0x9187,0x9806,/* 0x80-0x87 */ + 0x51E6,0x521D,0x6240,0x6691,0x66D9,0x6E1A,0x5EB6,0x7DD2,/* 0x88-0x8F */ + 0x7F72,0x66F8,0x85AF,0x85F7,0x8AF8,0x52A9,0x53D9,0x5973,/* 0x90-0x97 */ + 0x5E8F,0x5F90,0x6055,0x92E4,0x9664,0x50B7,0x511F,0x52DD,/* 0x98-0x9F */ + 0x5320,0x5347,0x53EC,0x54E8,0x5546,0x5531,0x5617,0x5968,/* 0xA0-0xA7 */ + 0x59BE,0x5A3C,0x5BB5,0x5C06,0x5C0F,0x5C11,0x5C1A,0x5E84,/* 0xA8-0xAF */ + 0x5E8A,0x5EE0,0x5F70,0x627F,0x6284,0x62DB,0x638C,0x6377,/* 0xB0-0xB7 */ + 0x6607,0x660C,0x662D,0x6676,0x677E,0x68A2,0x6A1F,0x6A35,/* 0xB8-0xBF */ + 0x6CBC,0x6D88,0x6E09,0x6E58,0x713C,0x7126,0x7167,0x75C7,/* 0xC0-0xC7 */ + 0x7701,0x785D,0x7901,0x7965,0x79F0,0x7AE0,0x7B11,0x7CA7,/* 0xC8-0xCF */ + 0x7D39,0x8096,0x83D6,0x848B,0x8549,0x885D,0x88F3,0x8A1F,/* 0xD0-0xD7 */ + 0x8A3C,0x8A54,0x8A73,0x8C61,0x8CDE,0x91A4,0x9266,0x937E,/* 0xD8-0xDF */ + 0x9418,0x969C,0x9798,0x4E0A,0x4E08,0x4E1E,0x4E57,0x5197,/* 0xE0-0xE7 */ + 0x5270,0x57CE,0x5834,0x58CC,0x5B22,0x5E38,0x60C5,0x64FE,/* 0xE8-0xEF */ + 0x6761,0x6756,0x6D44,0x72B6,0x7573,0x7A63,0x84B8,0x8B72,/* 0xF0-0xF7 */ + 0x91B8,0x9320,0x5631,0x57F4,0x98FE,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_90[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x62ED,0x690D,0x6B96,0x71ED,0x7E54,0x8077,0x8272,0x89E6,/* 0x40-0x47 */ + 0x98DF,0x8755,0x8FB1,0x5C3B,0x4F38,0x4FE1,0x4FB5,0x5507,/* 0x48-0x4F */ + 0x5A20,0x5BDD,0x5BE9,0x5FC3,0x614E,0x632F,0x65B0,0x664B,/* 0x50-0x57 */ + 0x68EE,0x699B,0x6D78,0x6DF1,0x7533,0x75B9,0x771F,0x795E,/* 0x58-0x5F */ + 0x79E6,0x7D33,0x81E3,0x82AF,0x85AA,0x89AA,0x8A3A,0x8EAB,/* 0x60-0x67 */ + 0x8F9B,0x9032,0x91DD,0x9707,0x4EBA,0x4EC1,0x5203,0x5875,/* 0x68-0x6F */ + 0x58EC,0x5C0B,0x751A,0x5C3D,0x814E,0x8A0A,0x8FC5,0x9663,/* 0x70-0x77 */ + 0x976D,0x7B25,0x8ACF,0x9808,0x9162,0x56F3,0x53A8,0x0000,/* 0x78-0x7F */ + + 0x9017,0x5439,0x5782,0x5E25,0x63A8,0x6C34,0x708A,0x7761,/* 0x80-0x87 */ + 0x7C8B,0x7FE0,0x8870,0x9042,0x9154,0x9310,0x9318,0x968F,/* 0x88-0x8F */ + 0x745E,0x9AC4,0x5D07,0x5D69,0x6570,0x67A2,0x8DA8,0x96DB,/* 0x90-0x97 */ + 0x636E,0x6749,0x6919,0x83C5,0x9817,0x96C0,0x88FE,0x6F84,/* 0x98-0x9F */ + 0x647A,0x5BF8,0x4E16,0x702C,0x755D,0x662F,0x51C4,0x5236,/* 0xA0-0xA7 */ + 0x52E2,0x59D3,0x5F81,0x6027,0x6210,0x653F,0x6574,0x661F,/* 0xA8-0xAF */ + 
0x6674,0x68F2,0x6816,0x6B63,0x6E05,0x7272,0x751F,0x76DB,/* 0xB0-0xB7 */ + 0x7CBE,0x8056,0x58F0,0x88FD,0x897F,0x8AA0,0x8A93,0x8ACB,/* 0xB8-0xBF */ + 0x901D,0x9192,0x9752,0x9759,0x6589,0x7A0E,0x8106,0x96BB,/* 0xC0-0xC7 */ + 0x5E2D,0x60DC,0x621A,0x65A5,0x6614,0x6790,0x77F3,0x7A4D,/* 0xC8-0xCF */ + 0x7C4D,0x7E3E,0x810A,0x8CAC,0x8D64,0x8DE1,0x8E5F,0x78A9,/* 0xD0-0xD7 */ + 0x5207,0x62D9,0x63A5,0x6442,0x6298,0x8A2D,0x7A83,0x7BC0,/* 0xD8-0xDF */ + 0x8AAC,0x96EA,0x7D76,0x820C,0x8749,0x4ED9,0x5148,0x5343,/* 0xE0-0xE7 */ + 0x5360,0x5BA3,0x5C02,0x5C16,0x5DDD,0x6226,0x6247,0x64B0,/* 0xE8-0xEF */ + 0x6813,0x6834,0x6CC9,0x6D45,0x6D17,0x67D3,0x6F5C,0x714E,/* 0xF0-0xF7 */ + 0x717D,0x65CB,0x7A7F,0x7BAD,0x7DDA,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_91[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E4A,0x7FA8,0x817A,0x821B,0x8239,0x85A6,0x8A6E,0x8CCE,/* 0x40-0x47 */ + 0x8DF5,0x9078,0x9077,0x92AD,0x9291,0x9583,0x9BAE,0x524D,/* 0x48-0x4F */ + 0x5584,0x6F38,0x7136,0x5168,0x7985,0x7E55,0x81B3,0x7CCE,/* 0x50-0x57 */ + 0x564C,0x5851,0x5CA8,0x63AA,0x66FE,0x66FD,0x695A,0x72D9,/* 0x58-0x5F */ + 0x758F,0x758E,0x790E,0x7956,0x79DF,0x7C97,0x7D20,0x7D44,/* 0x60-0x67 */ + 0x8607,0x8A34,0x963B,0x9061,0x9F20,0x50E7,0x5275,0x53CC,/* 0x68-0x6F */ + 0x53E2,0x5009,0x55AA,0x58EE,0x594F,0x723D,0x5B8B,0x5C64,/* 0x70-0x77 */ + 0x531D,0x60E3,0x60F3,0x635C,0x6383,0x633F,0x63BB,0x0000,/* 0x78-0x7F */ + + 0x64CD,0x65E9,0x66F9,0x5DE3,0x69CD,0x69FD,0x6F15,0x71E5,/* 0x80-0x87 */ + 0x4E89,0x75E9,0x76F8,0x7A93,0x7CDF,0x7DCF,0x7D9C,0x8061,/* 0x88-0x8F */ + 0x8349,0x8358,0x846C,0x84BC,0x85FB,0x88C5,0x8D70,0x9001,/* 0x90-0x97 */ + 0x906D,0x9397,0x971C,0x9A12,0x50CF,0x5897,0x618E,0x81D3,/* 0x98-0x9F */ + 0x8535,0x8D08,0x9020,0x4FC3,0x5074,0x5247,0x5373,0x606F,/* 0xA0-0xA7 */ + 0x6349,0x675F,0x6E2C,0x8DB3,0x901F,0x4FD7,0x5C5E,0x8CCA,/* 0xA8-0xAF */ + 0x65CF,0x7D9A,0x5352,0x8896,0x5176,0x63C3,0x5B58,0x5B6B,/* 0xB0-0xB7 */ + 0x5C0A,0x640D,0x6751,0x905C,0x4ED6,0x591A,0x592A,0x6C70,/* 0xB8-0xBF */ + 0x8A51,0x553E,0x5815,0x59A5,0x60F0,0x6253,0x67C1,0x8235,/* 0xC0-0xC7 */ + 0x6955,0x9640,0x99C4,0x9A28,0x4F53,0x5806,0x5BFE,0x8010,/* 0xC8-0xCF */ + 0x5CB1,0x5E2F,0x5F85,0x6020,0x614B,0x6234,0x66FF,0x6CF0,/* 0xD0-0xD7 */ + 0x6EDE,0x80CE,0x817F,0x82D4,0x888B,0x8CB8,0x9000,0x902E,/* 0xD8-0xDF */ + 0x968A,0x9EDB,0x9BDB,0x4EE3,0x53F0,0x5927,0x7B2C,0x918D,/* 0xE0-0xE7 */ + 0x984C,0x9DF9,0x6EDD,0x7027,0x5353,0x5544,0x5B85,0x6258,/* 0xE8-0xEF */ + 0x629E,0x62D3,0x6CA2,0x6FEF,0x7422,0x8A17,0x9438,0x6FC1,/* 0xF0-0xF7 */ + 0x8AFE,0x8338,0x51E7,0x86F8,0x53EA,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_92[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x53E9,0x4F46,0x9054,0x8FB0,0x596A,0x8131,0x5DFD,0x7AEA,/* 0x40-0x47 */ + 0x8FBF,0x68DA,0x8C37,0x72F8,0x9C48,0x6A3D,0x8AB0,0x4E39,/* 0x48-0x4F */ + 0x5358,0x5606,0x5766,0x62C5,0x63A2,0x65E6,0x6B4E,0x6DE1,/* 0x50-0x57 */ + 0x6E5B,0x70AD,0x77ED,0x7AEF,0x7BAA,0x7DBB,0x803D,0x80C6,/* 0x58-0x5F */ + 0x86CB,0x8A95,0x935B,0x56E3,0x58C7,0x5F3E,0x65AD,0x6696,/* 0x60-0x67 */ + 0x6A80,0x6BB5,0x7537,0x8AC7,0x5024,0x77E5,0x5730,0x5F1B,/* 0x68-0x6F */ + 0x6065,0x667A,0x6C60,0x75F4,0x7A1A,0x7F6E,0x81F4,0x8718,/* 0x70-0x77 */ + 0x9045,0x99B3,0x7BC9,0x755C,0x7AF9,0x7B51,0x84C4,0x0000,/* 0x78-0x7F */ + + 0x9010,0x79E9,0x7A92,0x8336,0x5AE1,0x7740,0x4E2D,0x4EF2,/* 0x80-0x87 */ + 0x5B99,0x5FE0,0x62BD,0x663C,0x67F1,0x6CE8,0x866B,0x8877,/* 0x88-0x8F */ + 0x8A3B,0x914E,0x92F3,0x99D0,0x6A17,0x7026,0x732A,0x82E7,/* 0x90-0x97 */ + 0x8457,0x8CAF,0x4E01,0x5146,0x51CB,0x558B,0x5BF5,0x5E16,/* 0x98-0x9F */ + 0x5E33,0x5E81,0x5F14,0x5F35,0x5F6B,0x5FB4,0x61F2,0x6311,/* 0xA0-0xA7 */ + 0x66A2,0x671D,0x6F6E,0x7252,0x753A,0x773A,0x8074,0x8139,/* 0xA8-0xAF */ + 0x8178,0x8776,0x8ABF,0x8ADC,0x8D85,0x8DF3,0x929A,0x9577,/* 0xB0-0xB7 */ + 0x9802,0x9CE5,0x52C5,0x6357,0x76F4,0x6715,0x6C88,0x73CD,/* 0xB8-0xBF */ + 0x8CC3,0x93AE,0x9673,0x6D25,0x589C,0x690E,0x69CC,0x8FFD,/* 0xC0-0xC7 */ + 0x939A,0x75DB,0x901A,0x585A,0x6802,0x63B4,0x69FB,0x4F43,/* 0xC8-0xCF */ + 0x6F2C,0x67D8,0x8FBB,0x8526,0x7DB4,0x9354,0x693F,0x6F70,/* 0xD0-0xD7 */ + 0x576A,0x58F7,0x5B2C,0x7D2C,0x722A,0x540A,0x91E3,0x9DB4,/* 0xD8-0xDF */ + 0x4EAD,0x4F4E,0x505C,0x5075,0x5243,0x8C9E,0x5448,0x5824,/* 0xE0-0xE7 */ + 0x5B9A,0x5E1D,0x5E95,0x5EAD,0x5EF7,0x5F1F,0x608C,0x62B5,/* 0xE8-0xEF */ + 0x633A,0x63D0,0x68AF,0x6C40,0x7887,0x798E,0x7A0B,0x7DE0,/* 0xF0-0xF7 */ + 0x8247,0x8A02,0x8AE6,0x8E44,0x9013,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_93[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x90B8,0x912D,0x91D8,0x9F0E,0x6CE5,0x6458,0x64E2,0x6575,/* 0x40-0x47 */ + 0x6EF4,0x7684,0x7B1B,0x9069,0x93D1,0x6EBA,0x54F2,0x5FB9,/* 0x48-0x4F */ + 0x64A4,0x8F4D,0x8FED,0x9244,0x5178,0x586B,0x5929,0x5C55,/* 0x50-0x57 */ + 0x5E97,0x6DFB,0x7E8F,0x751C,0x8CBC,0x8EE2,0x985B,0x70B9,/* 0x58-0x5F */ + 0x4F1D,0x6BBF,0x6FB1,0x7530,0x96FB,0x514E,0x5410,0x5835,/* 0x60-0x67 */ + 0x5857,0x59AC,0x5C60,0x5F92,0x6597,0x675C,0x6E21,0x767B,/* 0x68-0x6F */ + 0x83DF,0x8CED,0x9014,0x90FD,0x934D,0x7825,0x783A,0x52AA,/* 0x70-0x77 */ + 0x5EA6,0x571F,0x5974,0x6012,0x5012,0x515A,0x51AC,0x0000,/* 0x78-0x7F */ + + 0x51CD,0x5200,0x5510,0x5854,0x5858,0x5957,0x5B95,0x5CF6,/* 0x80-0x87 */ + 0x5D8B,0x60BC,0x6295,0x642D,0x6771,0x6843,0x68BC,0x68DF,/* 0x88-0x8F */ + 0x76D7,0x6DD8,0x6E6F,0x6D9B,0x706F,0x71C8,0x5F53,0x75D8,/* 0x90-0x97 */ + 0x7977,0x7B49,0x7B54,0x7B52,0x7CD6,0x7D71,0x5230,0x8463,/* 0x98-0x9F */ + 
0x8569,0x85E4,0x8A0E,0x8B04,0x8C46,0x8E0F,0x9003,0x900F,/* 0xA0-0xA7 */ + 0x9419,0x9676,0x982D,0x9A30,0x95D8,0x50CD,0x52D5,0x540C,/* 0xA8-0xAF */ + 0x5802,0x5C0E,0x61A7,0x649E,0x6D1E,0x77B3,0x7AE5,0x80F4,/* 0xB0-0xB7 */ + 0x8404,0x9053,0x9285,0x5CE0,0x9D07,0x533F,0x5F97,0x5FB3,/* 0xB8-0xBF */ + 0x6D9C,0x7279,0x7763,0x79BF,0x7BE4,0x6BD2,0x72EC,0x8AAD,/* 0xC0-0xC7 */ + 0x6803,0x6A61,0x51F8,0x7A81,0x6934,0x5C4A,0x9CF6,0x82EB,/* 0xC8-0xCF */ + 0x5BC5,0x9149,0x701E,0x5678,0x5C6F,0x60C7,0x6566,0x6C8C,/* 0xD0-0xD7 */ + 0x8C5A,0x9041,0x9813,0x5451,0x66C7,0x920D,0x5948,0x90A3,/* 0xD8-0xDF */ + 0x5185,0x4E4D,0x51EA,0x8599,0x8B0E,0x7058,0x637A,0x934B,/* 0xE0-0xE7 */ + 0x6962,0x99B4,0x7E04,0x7577,0x5357,0x6960,0x8EDF,0x96E3,/* 0xE8-0xEF */ + 0x6C5D,0x4E8C,0x5C3C,0x5F10,0x8FE9,0x5302,0x8CD1,0x8089,/* 0xF0-0xF7 */ + 0x8679,0x5EFF,0x65E5,0x4E73,0x5165,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_94[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5982,0x5C3F,0x97EE,0x4EFB,0x598A,0x5FCD,0x8A8D,0x6FE1,/* 0x40-0x47 */ + 0x79B0,0x7962,0x5BE7,0x8471,0x732B,0x71B1,0x5E74,0x5FF5,/* 0x48-0x4F */ + 0x637B,0x649A,0x71C3,0x7C98,0x4E43,0x5EFC,0x4E4B,0x57DC,/* 0x50-0x57 */ + 0x56A2,0x60A9,0x6FC3,0x7D0D,0x80FD,0x8133,0x81BF,0x8FB2,/* 0x58-0x5F */ + 0x8997,0x86A4,0x5DF4,0x628A,0x64AD,0x8987,0x6777,0x6CE2,/* 0x60-0x67 */ + 0x6D3E,0x7436,0x7834,0x5A46,0x7F75,0x82AD,0x99AC,0x4FF3,/* 0x68-0x6F */ + 0x5EC3,0x62DD,0x6392,0x6557,0x676F,0x76C3,0x724C,0x80CC,/* 0x70-0x77 */ + 0x80BA,0x8F29,0x914D,0x500D,0x57F9,0x5A92,0x6885,0x0000,/* 0x78-0x7F */ + + 0x6973,0x7164,0x72FD,0x8CB7,0x58F2,0x8CE0,0x966A,0x9019,/* 0x80-0x87 */ + 0x877F,0x79E4,0x77E7,0x8429,0x4F2F,0x5265,0x535A,0x62CD,/* 0x88-0x8F */ + 0x67CF,0x6CCA,0x767D,0x7B94,0x7C95,0x8236,0x8584,0x8FEB,/* 0x90-0x97 */ + 0x66DD,0x6F20,0x7206,0x7E1B,0x83AB,0x99C1,0x9EA6,0x51FD,/* 0x98-0x9F */ + 0x7BB1,0x7872,0x7BB8,0x8087,0x7B48,0x6AE8,0x5E61,0x808C,/* 0xA0-0xA7 */ + 0x7551,0x7560,0x516B,0x9262,0x6E8C,0x767A,0x9197,0x9AEA,/* 0xA8-0xAF */ + 0x4F10,0x7F70,0x629C,0x7B4F,0x95A5,0x9CE9,0x567A,0x5859,/* 0xB0-0xB7 */ + 0x86E4,0x96BC,0x4F34,0x5224,0x534A,0x53CD,0x53DB,0x5E06,/* 0xB8-0xBF */ + 0x642C,0x6591,0x677F,0x6C3E,0x6C4E,0x7248,0x72AF,0x73ED,/* 0xC0-0xC7 */ + 0x7554,0x7E41,0x822C,0x85E9,0x8CA9,0x7BC4,0x91C6,0x7169,/* 0xC8-0xCF */ + 0x9812,0x98EF,0x633D,0x6669,0x756A,0x76E4,0x78D0,0x8543,/* 0xD0-0xD7 */ + 0x86EE,0x532A,0x5351,0x5426,0x5983,0x5E87,0x5F7C,0x60B2,/* 0xD8-0xDF */ + 0x6249,0x6279,0x62AB,0x6590,0x6BD4,0x6CCC,0x75B2,0x76AE,/* 0xE0-0xE7 */ + 0x7891,0x79D8,0x7DCB,0x7F77,0x80A5,0x88AB,0x8AB9,0x8CBB,/* 0xE8-0xEF */ + 0x907F,0x975E,0x98DB,0x6A0B,0x7C38,0x5099,0x5C3E,0x5FAE,/* 0xF0-0xF7 */ + 0x6787,0x6BD8,0x7435,0x7709,0x7F8E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_95[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9F3B,0x67CA,0x7A17,0x5339,0x758B,0x9AED,0x5F66,0x819D,/* 0x40-0x47 */ + 0x83F1,0x8098,0x5F3C,0x5FC5,0x7562,0x7B46,0x903C,0x6867,/* 0x48-0x4F */ + 0x59EB,0x5A9B,0x7D10,0x767E,0x8B2C,0x4FF5,0x5F6A,0x6A19,/* 0x50-0x57 */ + 0x6C37,0x6F02,0x74E2,0x7968,0x8868,0x8A55,0x8C79,0x5EDF,/* 0x58-0x5F */ + 0x63CF,0x75C5,0x79D2,0x82D7,0x9328,0x92F2,0x849C,0x86ED,/* 0x60-0x67 */ + 0x9C2D,0x54C1,0x5F6C,0x658C,0x6D5C,0x7015,0x8CA7,0x8CD3,/* 0x68-0x6F */ + 0x983B,0x654F,0x74F6,0x4E0D,0x4ED8,0x57E0,0x592B,0x5A66,/* 0x70-0x77 */ + 0x5BCC,0x51A8,0x5E03,0x5E9C,0x6016,0x6276,0x6577,0x0000,/* 0x78-0x7F */ + + 0x65A7,0x666E,0x6D6E,0x7236,0x7B26,0x8150,0x819A,0x8299,/* 0x80-0x87 */ + 0x8B5C,0x8CA0,0x8CE6,0x8D74,0x961C,0x9644,0x4FAE,0x64AB,/* 0x88-0x8F */ + 0x6B66,0x821E,0x8461,0x856A,0x90E8,0x5C01,0x6953,0x98A8,/* 0x90-0x97 */ + 0x847A,0x8557,0x4F0F,0x526F,0x5FA9,0x5E45,0x670D,0x798F,/* 0x98-0x9F */ + 0x8179,0x8907,0x8986,0x6DF5,0x5F17,0x6255,0x6CB8,0x4ECF,/* 0xA0-0xA7 */ + 0x7269,0x9B92,0x5206,0x543B,0x5674,0x58B3,0x61A4,0x626E,/* 0xA8-0xAF */ + 0x711A,0x596E,0x7C89,0x7CDE,0x7D1B,0x96F0,0x6587,0x805E,/* 0xB0-0xB7 */ + 0x4E19,0x4F75,0x5175,0x5840,0x5E63,0x5E73,0x5F0A,0x67C4,/* 0xB8-0xBF */ + 0x4E26,0x853D,0x9589,0x965B,0x7C73,0x9801,0x50FB,0x58C1,/* 0xC0-0xC7 */ + 0x7656,0x78A7,0x5225,0x77A5,0x8511,0x7B86,0x504F,0x5909,/* 0xC8-0xCF */ + 0x7247,0x7BC7,0x7DE8,0x8FBA,0x8FD4,0x904D,0x4FBF,0x52C9,/* 0xD0-0xD7 */ + 0x5A29,0x5F01,0x97AD,0x4FDD,0x8217,0x92EA,0x5703,0x6355,/* 0xD8-0xDF */ + 0x6B69,0x752B,0x88DC,0x8F14,0x7A42,0x52DF,0x5893,0x6155,/* 0xE0-0xE7 */ + 0x620A,0x66AE,0x6BCD,0x7C3F,0x83E9,0x5023,0x4FF8,0x5305,/* 0xE8-0xEF */ + 0x5446,0x5831,0x5949,0x5B9D,0x5CF0,0x5CEF,0x5D29,0x5E96,/* 0xF0-0xF7 */ + 0x62B1,0x6367,0x653E,0x65B9,0x670B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_96[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6CD5,0x6CE1,0x70F9,0x7832,0x7E2B,0x80DE,0x82B3,0x840C,/* 0x40-0x47 */ + 0x84EC,0x8702,0x8912,0x8A2A,0x8C4A,0x90A6,0x92D2,0x98FD,/* 0x48-0x4F */ + 0x9CF3,0x9D6C,0x4E4F,0x4EA1,0x508D,0x5256,0x574A,0x59A8,/* 0x50-0x57 */ + 0x5E3D,0x5FD8,0x5FD9,0x623F,0x66B4,0x671B,0x67D0,0x68D2,/* 0x58-0x5F */ + 0x5192,0x7D21,0x80AA,0x81A8,0x8B00,0x8C8C,0x8CBF,0x927E,/* 0x60-0x67 */ + 0x9632,0x5420,0x982C,0x5317,0x50D5,0x535C,0x58A8,0x64B2,/* 0x68-0x6F */ + 0x6734,0x7267,0x7766,0x7A46,0x91E6,0x52C3,0x6CA1,0x6B86,/* 0x70-0x77 */ + 0x5800,0x5E4C,0x5954,0x672C,0x7FFB,0x51E1,0x76C6,0x0000,/* 0x78-0x7F */ + + 0x6469,0x78E8,0x9B54,0x9EBB,0x57CB,0x59B9,0x6627,0x679A,/* 0x80-0x87 */ + 0x6BCE,0x54E9,0x69D9,0x5E55,0x819C,0x6795,0x9BAA,0x67FE,/* 0x88-0x8F */ + 
0x9C52,0x685D,0x4EA6,0x4FE3,0x53C8,0x62B9,0x672B,0x6CAB,/* 0x90-0x97 */ + 0x8FC4,0x4FAD,0x7E6D,0x9EBF,0x4E07,0x6162,0x6E80,0x6F2B,/* 0x98-0x9F */ + 0x8513,0x5473,0x672A,0x9B45,0x5DF3,0x7B95,0x5CAC,0x5BC6,/* 0xA0-0xA7 */ + 0x871C,0x6E4A,0x84D1,0x7A14,0x8108,0x5999,0x7C8D,0x6C11,/* 0xA8-0xAF */ + 0x7720,0x52D9,0x5922,0x7121,0x725F,0x77DB,0x9727,0x9D61,/* 0xB0-0xB7 */ + 0x690B,0x5A7F,0x5A18,0x51A5,0x540D,0x547D,0x660E,0x76DF,/* 0xB8-0xBF */ + 0x8FF7,0x9298,0x9CF4,0x59EA,0x725D,0x6EC5,0x514D,0x68C9,/* 0xC0-0xC7 */ + 0x7DBF,0x7DEC,0x9762,0x9EBA,0x6478,0x6A21,0x8302,0x5984,/* 0xC8-0xCF */ + 0x5B5F,0x6BDB,0x731B,0x76F2,0x7DB2,0x8017,0x8499,0x5132,/* 0xD0-0xD7 */ + 0x6728,0x9ED9,0x76EE,0x6762,0x52FF,0x9905,0x5C24,0x623B,/* 0xD8-0xDF */ + 0x7C7E,0x8CB0,0x554F,0x60B6,0x7D0B,0x9580,0x5301,0x4E5F,/* 0xE0-0xE7 */ + 0x51B6,0x591C,0x723A,0x8036,0x91CE,0x5F25,0x77E2,0x5384,/* 0xE8-0xEF */ + 0x5F79,0x7D04,0x85AC,0x8A33,0x8E8D,0x9756,0x67F3,0x85AE,/* 0xF0-0xF7 */ + 0x9453,0x6109,0x6108,0x6CB9,0x7652,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_97[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8AED,0x8F38,0x552F,0x4F51,0x512A,0x52C7,0x53CB,0x5BA5,/* 0x40-0x47 */ + 0x5E7D,0x60A0,0x6182,0x63D6,0x6709,0x67DA,0x6E67,0x6D8C,/* 0x48-0x4F */ + 0x7336,0x7337,0x7531,0x7950,0x88D5,0x8A98,0x904A,0x9091,/* 0x50-0x57 */ + 0x90F5,0x96C4,0x878D,0x5915,0x4E88,0x4F59,0x4E0E,0x8A89,/* 0x58-0x5F */ + 0x8F3F,0x9810,0x50AD,0x5E7C,0x5996,0x5BB9,0x5EB8,0x63DA,/* 0x60-0x67 */ + 0x63FA,0x64C1,0x66DC,0x694A,0x69D8,0x6D0B,0x6EB6,0x7194,/* 0x68-0x6F */ + 0x7528,0x7AAF,0x7F8A,0x8000,0x8449,0x84C9,0x8981,0x8B21,/* 0x70-0x77 */ + 0x8E0A,0x9065,0x967D,0x990A,0x617E,0x6291,0x6B32,0x0000,/* 0x78-0x7F */ + + 0x6C83,0x6D74,0x7FCC,0x7FFC,0x6DC0,0x7F85,0x87BA,0x88F8,/* 0x80-0x87 */ + 0x6765,0x83B1,0x983C,0x96F7,0x6D1B,0x7D61,0x843D,0x916A,/* 0x88-0x8F */ + 0x4E71,0x5375,0x5D50,0x6B04,0x6FEB,0x85CD,0x862D,0x89A7,/* 0x90-0x97 */ + 0x5229,0x540F,0x5C65,0x674E,0x68A8,0x7406,0x7483,0x75E2,/* 0x98-0x9F */ + 0x88CF,0x88E1,0x91CC,0x96E2,0x9678,0x5F8B,0x7387,0x7ACB,/* 0xA0-0xA7 */ + 0x844E,0x63A0,0x7565,0x5289,0x6D41,0x6E9C,0x7409,0x7559,/* 0xA8-0xAF */ + 0x786B,0x7C92,0x9686,0x7ADC,0x9F8D,0x4FB6,0x616E,0x65C5,/* 0xB0-0xB7 */ + 0x865C,0x4E86,0x4EAE,0x50DA,0x4E21,0x51CC,0x5BEE,0x6599,/* 0xB8-0xBF */ + 0x6881,0x6DBC,0x731F,0x7642,0x77AD,0x7A1C,0x7CE7,0x826F,/* 0xC0-0xC7 */ + 0x8AD2,0x907C,0x91CF,0x9675,0x9818,0x529B,0x7DD1,0x502B,/* 0xC8-0xCF */ + 0x5398,0x6797,0x6DCB,0x71D0,0x7433,0x81E8,0x8F2A,0x96A3,/* 0xD0-0xD7 */ + 0x9C57,0x9E9F,0x7460,0x5841,0x6D99,0x7D2F,0x985E,0x4EE4,/* 0xD8-0xDF */ + 0x4F36,0x4F8B,0x51B7,0x52B1,0x5DBA,0x601C,0x73B2,0x793C,/* 0xE0-0xE7 */ + 0x82D3,0x9234,0x96B7,0x96F6,0x970A,0x9E97,0x9F62,0x66A6,/* 0xE8-0xEF */ + 0x6B74,0x5217,0x52A3,0x70C8,0x88C2,0x5EC9,0x604B,0x6190,/* 0xF0-0xF7 */ + 0x6F23,0x7149,0x7C3E,0x7DF4,0x806F,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_98[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x84EE,0x9023,0x932C,0x5442,0x9B6F,0x6AD3,0x7089,0x8CC2,/* 0x40-0x47 */ + 0x8DEF,0x9732,0x52B4,0x5A41,0x5ECA,0x5F04,0x6717,0x697C,/* 0x48-0x4F */ + 0x6994,0x6D6A,0x6F0F,0x7262,0x72FC,0x7BED,0x8001,0x807E,/* 0x50-0x57 */ + 0x874B,0x90CE,0x516D,0x9E93,0x7984,0x808B,0x9332,0x8AD6,/* 0x58-0x5F */ + 0x502D,0x548C,0x8A71,0x6B6A,0x8CC4,0x8107,0x60D1,0x67A0,/* 0x60-0x67 */ + 0x9DF2,0x4E99,0x4E98,0x9C10,0x8A6B,0x85C1,0x8568,0x6900,/* 0x68-0x6F */ + 0x6E7E,0x7897,0x8155,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x5F0C,/* 0x98-0x9F */ + 0x4E10,0x4E15,0x4E2A,0x4E31,0x4E36,0x4E3C,0x4E3F,0x4E42,/* 0xA0-0xA7 */ + 0x4E56,0x4E58,0x4E82,0x4E85,0x8C6B,0x4E8A,0x8212,0x5F0D,/* 0xA8-0xAF */ + 0x4E8E,0x4E9E,0x4E9F,0x4EA0,0x4EA2,0x4EB0,0x4EB3,0x4EB6,/* 0xB0-0xB7 */ + 0x4ECE,0x4ECD,0x4EC4,0x4EC6,0x4EC2,0x4ED7,0x4EDE,0x4EED,/* 0xB8-0xBF */ + 0x4EDF,0x4EF7,0x4F09,0x4F5A,0x4F30,0x4F5B,0x4F5D,0x4F57,/* 0xC0-0xC7 */ + 0x4F47,0x4F76,0x4F88,0x4F8F,0x4F98,0x4F7B,0x4F69,0x4F70,/* 0xC8-0xCF */ + 0x4F91,0x4F6F,0x4F86,0x4F96,0x5118,0x4FD4,0x4FDF,0x4FCE,/* 0xD0-0xD7 */ + 0x4FD8,0x4FDB,0x4FD1,0x4FDA,0x4FD0,0x4FE4,0x4FE5,0x501A,/* 0xD8-0xDF */ + 0x5028,0x5014,0x502A,0x5025,0x5005,0x4F1C,0x4FF6,0x5021,/* 0xE0-0xE7 */ + 0x5029,0x502C,0x4FFE,0x4FEF,0x5011,0x5006,0x5043,0x5047,/* 0xE8-0xEF */ + 0x6703,0x5055,0x5050,0x5048,0x505A,0x5056,0x506C,0x5078,/* 0xF0-0xF7 */ + 0x5080,0x509A,0x5085,0x50B4,0x50B2,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_99[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x50C9,0x50CA,0x50B3,0x50C2,0x50D6,0x50DE,0x50E5,0x50ED,/* 0x40-0x47 */ + 0x50E3,0x50EE,0x50F9,0x50F5,0x5109,0x5101,0x5102,0x5116,/* 0x48-0x4F */ + 0x5115,0x5114,0x511A,0x5121,0x513A,0x5137,0x513C,0x513B,/* 0x50-0x57 */ + 0x513F,0x5140,0x5152,0x514C,0x5154,0x5162,0x7AF8,0x5169,/* 0x58-0x5F */ + 0x516A,0x516E,0x5180,0x5182,0x56D8,0x518C,0x5189,0x518F,/* 0x60-0x67 */ + 0x5191,0x5193,0x5195,0x5196,0x51A4,0x51A6,0x51A2,0x51A9,/* 0x68-0x6F */ + 0x51AA,0x51AB,0x51B3,0x51B1,0x51B2,0x51B0,0x51B5,0x51BD,/* 0x70-0x77 */ + 0x51C5,0x51C9,0x51DB,0x51E0,0x8655,0x51E9,0x51ED,0x0000,/* 0x78-0x7F */ + + 
0x51F0,0x51F5,0x51FE,0x5204,0x520B,0x5214,0x520E,0x5227,/* 0x80-0x87 */ + 0x522A,0x522E,0x5233,0x5239,0x524F,0x5244,0x524B,0x524C,/* 0x88-0x8F */ + 0x525E,0x5254,0x526A,0x5274,0x5269,0x5273,0x527F,0x527D,/* 0x90-0x97 */ + 0x528D,0x5294,0x5292,0x5271,0x5288,0x5291,0x8FA8,0x8FA7,/* 0x98-0x9F */ + 0x52AC,0x52AD,0x52BC,0x52B5,0x52C1,0x52CD,0x52D7,0x52DE,/* 0xA0-0xA7 */ + 0x52E3,0x52E6,0x98ED,0x52E0,0x52F3,0x52F5,0x52F8,0x52F9,/* 0xA8-0xAF */ + 0x5306,0x5308,0x7538,0x530D,0x5310,0x530F,0x5315,0x531A,/* 0xB0-0xB7 */ + 0x5323,0x532F,0x5331,0x5333,0x5338,0x5340,0x5346,0x5345,/* 0xB8-0xBF */ + 0x4E17,0x5349,0x534D,0x51D6,0x535E,0x5369,0x536E,0x5918,/* 0xC0-0xC7 */ + 0x537B,0x5377,0x5382,0x5396,0x53A0,0x53A6,0x53A5,0x53AE,/* 0xC8-0xCF */ + 0x53B0,0x53B6,0x53C3,0x7C12,0x96D9,0x53DF,0x66FC,0x71EE,/* 0xD0-0xD7 */ + 0x53EE,0x53E8,0x53ED,0x53FA,0x5401,0x543D,0x5440,0x542C,/* 0xD8-0xDF */ + 0x542D,0x543C,0x542E,0x5436,0x5429,0x541D,0x544E,0x548F,/* 0xE0-0xE7 */ + 0x5475,0x548E,0x545F,0x5471,0x5477,0x5470,0x5492,0x547B,/* 0xE8-0xEF */ + 0x5480,0x5476,0x5484,0x5490,0x5486,0x54C7,0x54A2,0x54B8,/* 0xF0-0xF7 */ + 0x54A5,0x54AC,0x54C4,0x54C8,0x54A8,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54AB,0x54C2,0x54A4,0x54BE,0x54BC,0x54D8,0x54E5,0x54E6,/* 0x40-0x47 */ + 0x550F,0x5514,0x54FD,0x54EE,0x54ED,0x54FA,0x54E2,0x5539,/* 0x48-0x4F */ + 0x5540,0x5563,0x554C,0x552E,0x555C,0x5545,0x5556,0x5557,/* 0x50-0x57 */ + 0x5538,0x5533,0x555D,0x5599,0x5580,0x54AF,0x558A,0x559F,/* 0x58-0x5F */ + 0x557B,0x557E,0x5598,0x559E,0x55AE,0x557C,0x5583,0x55A9,/* 0x60-0x67 */ + 0x5587,0x55A8,0x55DA,0x55C5,0x55DF,0x55C4,0x55DC,0x55E4,/* 0x68-0x6F */ + 0x55D4,0x5614,0x55F7,0x5616,0x55FE,0x55FD,0x561B,0x55F9,/* 0x70-0x77 */ + 0x564E,0x5650,0x71DF,0x5634,0x5636,0x5632,0x5638,0x0000,/* 0x78-0x7F */ + + 0x566B,0x5664,0x562F,0x566C,0x566A,0x5686,0x5680,0x568A,/* 0x80-0x87 */ + 0x56A0,0x5694,0x568F,0x56A5,0x56AE,0x56B6,0x56B4,0x56C2,/* 0x88-0x8F */ + 0x56BC,0x56C1,0x56C3,0x56C0,0x56C8,0x56CE,0x56D1,0x56D3,/* 0x90-0x97 */ + 0x56D7,0x56EE,0x56F9,0x5700,0x56FF,0x5704,0x5709,0x5708,/* 0x98-0x9F */ + 0x570B,0x570D,0x5713,0x5718,0x5716,0x55C7,0x571C,0x5726,/* 0xA0-0xA7 */ + 0x5737,0x5738,0x574E,0x573B,0x5740,0x574F,0x5769,0x57C0,/* 0xA8-0xAF */ + 0x5788,0x5761,0x577F,0x5789,0x5793,0x57A0,0x57B3,0x57A4,/* 0xB0-0xB7 */ + 0x57AA,0x57B0,0x57C3,0x57C6,0x57D4,0x57D2,0x57D3,0x580A,/* 0xB8-0xBF */ + 0x57D6,0x57E3,0x580B,0x5819,0x581D,0x5872,0x5821,0x5862,/* 0xC0-0xC7 */ + 0x584B,0x5870,0x6BC0,0x5852,0x583D,0x5879,0x5885,0x58B9,/* 0xC8-0xCF */ + 0x589F,0x58AB,0x58BA,0x58DE,0x58BB,0x58B8,0x58AE,0x58C5,/* 0xD0-0xD7 */ + 0x58D3,0x58D1,0x58D7,0x58D9,0x58D8,0x58E5,0x58DC,0x58E4,/* 0xD8-0xDF */ + 0x58DF,0x58EF,0x58FA,0x58F9,0x58FB,0x58FC,0x58FD,0x5902,/* 0xE0-0xE7 */ + 0x590A,0x5910,0x591B,0x68A6,0x5925,0x592C,0x592D,0x5932,/* 0xE8-0xEF */ + 0x5938,0x593E,0x7AD2,0x5955,0x5950,0x594E,0x595A,0x5958,/* 0xF0-0xF7 */ + 
0x5962,0x5960,0x5967,0x596C,0x5969,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5978,0x5981,0x599D,0x4F5E,0x4FAB,0x59A3,0x59B2,0x59C6,/* 0x40-0x47 */ + 0x59E8,0x59DC,0x598D,0x59D9,0x59DA,0x5A25,0x5A1F,0x5A11,/* 0x48-0x4F */ + 0x5A1C,0x5A09,0x5A1A,0x5A40,0x5A6C,0x5A49,0x5A35,0x5A36,/* 0x50-0x57 */ + 0x5A62,0x5A6A,0x5A9A,0x5ABC,0x5ABE,0x5ACB,0x5AC2,0x5ABD,/* 0x58-0x5F */ + 0x5AE3,0x5AD7,0x5AE6,0x5AE9,0x5AD6,0x5AFA,0x5AFB,0x5B0C,/* 0x60-0x67 */ + 0x5B0B,0x5B16,0x5B32,0x5AD0,0x5B2A,0x5B36,0x5B3E,0x5B43,/* 0x68-0x6F */ + 0x5B45,0x5B40,0x5B51,0x5B55,0x5B5A,0x5B5B,0x5B65,0x5B69,/* 0x70-0x77 */ + 0x5B70,0x5B73,0x5B75,0x5B78,0x6588,0x5B7A,0x5B80,0x0000,/* 0x78-0x7F */ + + 0x5B83,0x5BA6,0x5BB8,0x5BC3,0x5BC7,0x5BC9,0x5BD4,0x5BD0,/* 0x80-0x87 */ + 0x5BE4,0x5BE6,0x5BE2,0x5BDE,0x5BE5,0x5BEB,0x5BF0,0x5BF6,/* 0x88-0x8F */ + 0x5BF3,0x5C05,0x5C07,0x5C08,0x5C0D,0x5C13,0x5C20,0x5C22,/* 0x90-0x97 */ + 0x5C28,0x5C38,0x5C39,0x5C41,0x5C46,0x5C4E,0x5C53,0x5C50,/* 0x98-0x9F */ + 0x5C4F,0x5B71,0x5C6C,0x5C6E,0x4E62,0x5C76,0x5C79,0x5C8C,/* 0xA0-0xA7 */ + 0x5C91,0x5C94,0x599B,0x5CAB,0x5CBB,0x5CB6,0x5CBC,0x5CB7,/* 0xA8-0xAF */ + 0x5CC5,0x5CBE,0x5CC7,0x5CD9,0x5CE9,0x5CFD,0x5CFA,0x5CED,/* 0xB0-0xB7 */ + 0x5D8C,0x5CEA,0x5D0B,0x5D15,0x5D17,0x5D5C,0x5D1F,0x5D1B,/* 0xB8-0xBF */ + 0x5D11,0x5D14,0x5D22,0x5D1A,0x5D19,0x5D18,0x5D4C,0x5D52,/* 0xC0-0xC7 */ + 0x5D4E,0x5D4B,0x5D6C,0x5D73,0x5D76,0x5D87,0x5D84,0x5D82,/* 0xC8-0xCF */ + 0x5DA2,0x5D9D,0x5DAC,0x5DAE,0x5DBD,0x5D90,0x5DB7,0x5DBC,/* 0xD0-0xD7 */ + 0x5DC9,0x5DCD,0x5DD3,0x5DD2,0x5DD6,0x5DDB,0x5DEB,0x5DF2,/* 0xD8-0xDF */ + 0x5DF5,0x5E0B,0x5E1A,0x5E19,0x5E11,0x5E1B,0x5E36,0x5E37,/* 0xE0-0xE7 */ + 0x5E44,0x5E43,0x5E40,0x5E4E,0x5E57,0x5E54,0x5E5F,0x5E62,/* 0xE8-0xEF */ + 0x5E64,0x5E47,0x5E75,0x5E76,0x5E7A,0x9EBC,0x5E7F,0x5EA0,/* 0xF0-0xF7 */ + 0x5EC1,0x5EC2,0x5EC8,0x5ED0,0x5ECF,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5ED6,0x5EE3,0x5EDD,0x5EDA,0x5EDB,0x5EE2,0x5EE1,0x5EE8,/* 0x40-0x47 */ + 0x5EE9,0x5EEC,0x5EF1,0x5EF3,0x5EF0,0x5EF4,0x5EF8,0x5EFE,/* 0x48-0x4F */ + 0x5F03,0x5F09,0x5F5D,0x5F5C,0x5F0B,0x5F11,0x5F16,0x5F29,/* 0x50-0x57 */ + 0x5F2D,0x5F38,0x5F41,0x5F48,0x5F4C,0x5F4E,0x5F2F,0x5F51,/* 0x58-0x5F */ + 0x5F56,0x5F57,0x5F59,0x5F61,0x5F6D,0x5F73,0x5F77,0x5F83,/* 0x60-0x67 */ + 0x5F82,0x5F7F,0x5F8A,0x5F88,0x5F91,0x5F87,0x5F9E,0x5F99,/* 
0x68-0x6F */ + 0x5F98,0x5FA0,0x5FA8,0x5FAD,0x5FBC,0x5FD6,0x5FFB,0x5FE4,/* 0x70-0x77 */ + 0x5FF8,0x5FF1,0x5FDD,0x60B3,0x5FFF,0x6021,0x6060,0x0000,/* 0x78-0x7F */ + + 0x6019,0x6010,0x6029,0x600E,0x6031,0x601B,0x6015,0x602B,/* 0x80-0x87 */ + 0x6026,0x600F,0x603A,0x605A,0x6041,0x606A,0x6077,0x605F,/* 0x88-0x8F */ + 0x604A,0x6046,0x604D,0x6063,0x6043,0x6064,0x6042,0x606C,/* 0x90-0x97 */ + 0x606B,0x6059,0x6081,0x608D,0x60E7,0x6083,0x609A,0x6084,/* 0x98-0x9F */ + 0x609B,0x6096,0x6097,0x6092,0x60A7,0x608B,0x60E1,0x60B8,/* 0xA0-0xA7 */ + 0x60E0,0x60D3,0x60B4,0x5FF0,0x60BD,0x60C6,0x60B5,0x60D8,/* 0xA8-0xAF */ + 0x614D,0x6115,0x6106,0x60F6,0x60F7,0x6100,0x60F4,0x60FA,/* 0xB0-0xB7 */ + 0x6103,0x6121,0x60FB,0x60F1,0x610D,0x610E,0x6147,0x613E,/* 0xB8-0xBF */ + 0x6128,0x6127,0x614A,0x613F,0x613C,0x612C,0x6134,0x613D,/* 0xC0-0xC7 */ + 0x6142,0x6144,0x6173,0x6177,0x6158,0x6159,0x615A,0x616B,/* 0xC8-0xCF */ + 0x6174,0x616F,0x6165,0x6171,0x615F,0x615D,0x6153,0x6175,/* 0xD0-0xD7 */ + 0x6199,0x6196,0x6187,0x61AC,0x6194,0x619A,0x618A,0x6191,/* 0xD8-0xDF */ + 0x61AB,0x61AE,0x61CC,0x61CA,0x61C9,0x61F7,0x61C8,0x61C3,/* 0xE0-0xE7 */ + 0x61C6,0x61BA,0x61CB,0x7F79,0x61CD,0x61E6,0x61E3,0x61F6,/* 0xE8-0xEF */ + 0x61FA,0x61F4,0x61FF,0x61FD,0x61FC,0x61FE,0x6200,0x6208,/* 0xF0-0xF7 */ + 0x6209,0x620D,0x620C,0x6214,0x621B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x621E,0x6221,0x622A,0x622E,0x6230,0x6232,0x6233,0x6241,/* 0x40-0x47 */ + 0x624E,0x625E,0x6263,0x625B,0x6260,0x6268,0x627C,0x6282,/* 0x48-0x4F */ + 0x6289,0x627E,0x6292,0x6293,0x6296,0x62D4,0x6283,0x6294,/* 0x50-0x57 */ + 0x62D7,0x62D1,0x62BB,0x62CF,0x62FF,0x62C6,0x64D4,0x62C8,/* 0x58-0x5F */ + 0x62DC,0x62CC,0x62CA,0x62C2,0x62C7,0x629B,0x62C9,0x630C,/* 0x60-0x67 */ + 0x62EE,0x62F1,0x6327,0x6302,0x6308,0x62EF,0x62F5,0x6350,/* 0x68-0x6F */ + 0x633E,0x634D,0x641C,0x634F,0x6396,0x638E,0x6380,0x63AB,/* 0x70-0x77 */ + 0x6376,0x63A3,0x638F,0x6389,0x639F,0x63B5,0x636B,0x0000,/* 0x78-0x7F */ + + 0x6369,0x63BE,0x63E9,0x63C0,0x63C6,0x63E3,0x63C9,0x63D2,/* 0x80-0x87 */ + 0x63F6,0x63C4,0x6416,0x6434,0x6406,0x6413,0x6426,0x6436,/* 0x88-0x8F */ + 0x651D,0x6417,0x6428,0x640F,0x6467,0x646F,0x6476,0x644E,/* 0x90-0x97 */ + 0x652A,0x6495,0x6493,0x64A5,0x64A9,0x6488,0x64BC,0x64DA,/* 0x98-0x9F */ + 0x64D2,0x64C5,0x64C7,0x64BB,0x64D8,0x64C2,0x64F1,0x64E7,/* 0xA0-0xA7 */ + 0x8209,0x64E0,0x64E1,0x62AC,0x64E3,0x64EF,0x652C,0x64F6,/* 0xA8-0xAF */ + 0x64F4,0x64F2,0x64FA,0x6500,0x64FD,0x6518,0x651C,0x6505,/* 0xB0-0xB7 */ + 0x6524,0x6523,0x652B,0x6534,0x6535,0x6537,0x6536,0x6538,/* 0xB8-0xBF */ + 0x754B,0x6548,0x6556,0x6555,0x654D,0x6558,0x655E,0x655D,/* 0xC0-0xC7 */ + 0x6572,0x6578,0x6582,0x6583,0x8B8A,0x659B,0x659F,0x65AB,/* 0xC8-0xCF */ + 0x65B7,0x65C3,0x65C6,0x65C1,0x65C4,0x65CC,0x65D2,0x65DB,/* 0xD0-0xD7 */ + 0x65D9,0x65E0,0x65E1,0x65F1,0x6772,0x660A,0x6603,0x65FB,/* 0xD8-0xDF */ + 0x6773,0x6635,0x6636,0x6634,0x661C,0x664F,0x6644,0x6649,/* 0xE0-0xE7 */ + 
0x6641,0x665E,0x665D,0x6664,0x6667,0x6668,0x665F,0x6662,/* 0xE8-0xEF */ + 0x6670,0x6683,0x6688,0x668E,0x6689,0x6684,0x6698,0x669D,/* 0xF0-0xF7 */ + 0x66C1,0x66B9,0x66C9,0x66BE,0x66BC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x66C4,0x66B8,0x66D6,0x66DA,0x66E0,0x663F,0x66E6,0x66E9,/* 0x40-0x47 */ + 0x66F0,0x66F5,0x66F7,0x670F,0x6716,0x671E,0x6726,0x6727,/* 0x48-0x4F */ + 0x9738,0x672E,0x673F,0x6736,0x6741,0x6738,0x6737,0x6746,/* 0x50-0x57 */ + 0x675E,0x6760,0x6759,0x6763,0x6764,0x6789,0x6770,0x67A9,/* 0x58-0x5F */ + 0x677C,0x676A,0x678C,0x678B,0x67A6,0x67A1,0x6785,0x67B7,/* 0x60-0x67 */ + 0x67EF,0x67B4,0x67EC,0x67B3,0x67E9,0x67B8,0x67E4,0x67DE,/* 0x68-0x6F */ + 0x67DD,0x67E2,0x67EE,0x67B9,0x67CE,0x67C6,0x67E7,0x6A9C,/* 0x70-0x77 */ + 0x681E,0x6846,0x6829,0x6840,0x684D,0x6832,0x684E,0x0000,/* 0x78-0x7F */ + + 0x68B3,0x682B,0x6859,0x6863,0x6877,0x687F,0x689F,0x688F,/* 0x80-0x87 */ + 0x68AD,0x6894,0x689D,0x689B,0x6883,0x6AAE,0x68B9,0x6874,/* 0x88-0x8F */ + 0x68B5,0x68A0,0x68BA,0x690F,0x688D,0x687E,0x6901,0x68CA,/* 0x90-0x97 */ + 0x6908,0x68D8,0x6922,0x6926,0x68E1,0x690C,0x68CD,0x68D4,/* 0x98-0x9F */ + 0x68E7,0x68D5,0x6936,0x6912,0x6904,0x68D7,0x68E3,0x6925,/* 0xA0-0xA7 */ + 0x68F9,0x68E0,0x68EF,0x6928,0x692A,0x691A,0x6923,0x6921,/* 0xA8-0xAF */ + 0x68C6,0x6979,0x6977,0x695C,0x6978,0x696B,0x6954,0x697E,/* 0xB0-0xB7 */ + 0x696E,0x6939,0x6974,0x693D,0x6959,0x6930,0x6961,0x695E,/* 0xB8-0xBF */ + 0x695D,0x6981,0x696A,0x69B2,0x69AE,0x69D0,0x69BF,0x69C1,/* 0xC0-0xC7 */ + 0x69D3,0x69BE,0x69CE,0x5BE8,0x69CA,0x69DD,0x69BB,0x69C3,/* 0xC8-0xCF */ + 0x69A7,0x6A2E,0x6991,0x69A0,0x699C,0x6995,0x69B4,0x69DE,/* 0xD0-0xD7 */ + 0x69E8,0x6A02,0x6A1B,0x69FF,0x6B0A,0x69F9,0x69F2,0x69E7,/* 0xD8-0xDF */ + 0x6A05,0x69B1,0x6A1E,0x69ED,0x6A14,0x69EB,0x6A0A,0x6A12,/* 0xE0-0xE7 */ + 0x6AC1,0x6A23,0x6A13,0x6A44,0x6A0C,0x6A72,0x6A36,0x6A78,/* 0xE8-0xEF */ + 0x6A47,0x6A62,0x6A59,0x6A66,0x6A48,0x6A38,0x6A22,0x6A90,/* 0xF0-0xF7 */ + 0x6A8D,0x6AA0,0x6A84,0x6AA2,0x6AA3,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A97,0x8617,0x6ABB,0x6AC3,0x6AC2,0x6AB8,0x6AB3,0x6AAC,/* 0x40-0x47 */ + 0x6ADE,0x6AD1,0x6ADF,0x6AAA,0x6ADA,0x6AEA,0x6AFB,0x6B05,/* 0x48-0x4F */ + 0x8616,0x6AFA,0x6B12,0x6B16,0x9B31,0x6B1F,0x6B38,0x6B37,/* 0x50-0x57 */ + 0x76DC,0x6B39,0x98EE,0x6B47,0x6B43,0x6B49,0x6B50,0x6B59,/* 
0x58-0x5F */ + 0x6B54,0x6B5B,0x6B5F,0x6B61,0x6B78,0x6B79,0x6B7F,0x6B80,/* 0x60-0x67 */ + 0x6B84,0x6B83,0x6B8D,0x6B98,0x6B95,0x6B9E,0x6BA4,0x6BAA,/* 0x68-0x6F */ + 0x6BAB,0x6BAF,0x6BB2,0x6BB1,0x6BB3,0x6BB7,0x6BBC,0x6BC6,/* 0x70-0x77 */ + 0x6BCB,0x6BD3,0x6BDF,0x6BEC,0x6BEB,0x6BF3,0x6BEF,0x0000,/* 0x78-0x7F */ + + 0x9EBE,0x6C08,0x6C13,0x6C14,0x6C1B,0x6C24,0x6C23,0x6C5E,/* 0x80-0x87 */ + 0x6C55,0x6C62,0x6C6A,0x6C82,0x6C8D,0x6C9A,0x6C81,0x6C9B,/* 0x88-0x8F */ + 0x6C7E,0x6C68,0x6C73,0x6C92,0x6C90,0x6CC4,0x6CF1,0x6CD3,/* 0x90-0x97 */ + 0x6CBD,0x6CD7,0x6CC5,0x6CDD,0x6CAE,0x6CB1,0x6CBE,0x6CBA,/* 0x98-0x9F */ + 0x6CDB,0x6CEF,0x6CD9,0x6CEA,0x6D1F,0x884D,0x6D36,0x6D2B,/* 0xA0-0xA7 */ + 0x6D3D,0x6D38,0x6D19,0x6D35,0x6D33,0x6D12,0x6D0C,0x6D63,/* 0xA8-0xAF */ + 0x6D93,0x6D64,0x6D5A,0x6D79,0x6D59,0x6D8E,0x6D95,0x6FE4,/* 0xB0-0xB7 */ + 0x6D85,0x6DF9,0x6E15,0x6E0A,0x6DB5,0x6DC7,0x6DE6,0x6DB8,/* 0xB8-0xBF */ + 0x6DC6,0x6DEC,0x6DDE,0x6DCC,0x6DE8,0x6DD2,0x6DC5,0x6DFA,/* 0xC0-0xC7 */ + 0x6DD9,0x6DE4,0x6DD5,0x6DEA,0x6DEE,0x6E2D,0x6E6E,0x6E2E,/* 0xC8-0xCF */ + 0x6E19,0x6E72,0x6E5F,0x6E3E,0x6E23,0x6E6B,0x6E2B,0x6E76,/* 0xD0-0xD7 */ + 0x6E4D,0x6E1F,0x6E43,0x6E3A,0x6E4E,0x6E24,0x6EFF,0x6E1D,/* 0xD8-0xDF */ + 0x6E38,0x6E82,0x6EAA,0x6E98,0x6EC9,0x6EB7,0x6ED3,0x6EBD,/* 0xE0-0xE7 */ + 0x6EAF,0x6EC4,0x6EB2,0x6ED4,0x6ED5,0x6E8F,0x6EA5,0x6EC2,/* 0xE8-0xEF */ + 0x6E9F,0x6F41,0x6F11,0x704C,0x6EEC,0x6EF8,0x6EFE,0x6F3F,/* 0xF0-0xF7 */ + 0x6EF2,0x6F31,0x6EEF,0x6F32,0x6ECC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6F3E,0x6F13,0x6EF7,0x6F86,0x6F7A,0x6F78,0x6F81,0x6F80,/* 0x40-0x47 */ + 0x6F6F,0x6F5B,0x6FF3,0x6F6D,0x6F82,0x6F7C,0x6F58,0x6F8E,/* 0x48-0x4F */ + 0x6F91,0x6FC2,0x6F66,0x6FB3,0x6FA3,0x6FA1,0x6FA4,0x6FB9,/* 0x50-0x57 */ + 0x6FC6,0x6FAA,0x6FDF,0x6FD5,0x6FEC,0x6FD4,0x6FD8,0x6FF1,/* 0x58-0x5F */ + 0x6FEE,0x6FDB,0x7009,0x700B,0x6FFA,0x7011,0x7001,0x700F,/* 0x60-0x67 */ + 0x6FFE,0x701B,0x701A,0x6F74,0x701D,0x7018,0x701F,0x7030,/* 0x68-0x6F */ + 0x703E,0x7032,0x7051,0x7063,0x7099,0x7092,0x70AF,0x70F1,/* 0x70-0x77 */ + 0x70AC,0x70B8,0x70B3,0x70AE,0x70DF,0x70CB,0x70DD,0x0000,/* 0x78-0x7F */ + + 0x70D9,0x7109,0x70FD,0x711C,0x7119,0x7165,0x7155,0x7188,/* 0x80-0x87 */ + 0x7166,0x7162,0x714C,0x7156,0x716C,0x718F,0x71FB,0x7184,/* 0x88-0x8F */ + 0x7195,0x71A8,0x71AC,0x71D7,0x71B9,0x71BE,0x71D2,0x71C9,/* 0x90-0x97 */ + 0x71D4,0x71CE,0x71E0,0x71EC,0x71E7,0x71F5,0x71FC,0x71F9,/* 0x98-0x9F */ + 0x71FF,0x720D,0x7210,0x721B,0x7228,0x722D,0x722C,0x7230,/* 0xA0-0xA7 */ + 0x7232,0x723B,0x723C,0x723F,0x7240,0x7246,0x724B,0x7258,/* 0xA8-0xAF */ + 0x7274,0x727E,0x7282,0x7281,0x7287,0x7292,0x7296,0x72A2,/* 0xB0-0xB7 */ + 0x72A7,0x72B9,0x72B2,0x72C3,0x72C6,0x72C4,0x72CE,0x72D2,/* 0xB8-0xBF */ + 0x72E2,0x72E0,0x72E1,0x72F9,0x72F7,0x500F,0x7317,0x730A,/* 0xC0-0xC7 */ + 0x731C,0x7316,0x731D,0x7334,0x732F,0x7329,0x7325,0x733E,/* 0xC8-0xCF */ + 0x734E,0x734F,0x9ED8,0x7357,0x736A,0x7368,0x7370,0x7378,/* 0xD0-0xD7 */ + 
0x7375,0x737B,0x737A,0x73C8,0x73B3,0x73CE,0x73BB,0x73C0,/* 0xD8-0xDF */ + 0x73E5,0x73EE,0x73DE,0x74A2,0x7405,0x746F,0x7425,0x73F8,/* 0xE0-0xE7 */ + 0x7432,0x743A,0x7455,0x743F,0x745F,0x7459,0x7441,0x745C,/* 0xE8-0xEF */ + 0x7469,0x7470,0x7463,0x746A,0x7476,0x747E,0x748B,0x749E,/* 0xF0-0xF7 */ + 0x74A7,0x74CA,0x74CF,0x74D4,0x73F1,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x74E0,0x74E3,0x74E7,0x74E9,0x74EE,0x74F2,0x74F0,0x74F1,/* 0x40-0x47 */ + 0x74F8,0x74F7,0x7504,0x7503,0x7505,0x750C,0x750E,0x750D,/* 0x48-0x4F */ + 0x7515,0x7513,0x751E,0x7526,0x752C,0x753C,0x7544,0x754D,/* 0x50-0x57 */ + 0x754A,0x7549,0x755B,0x7546,0x755A,0x7569,0x7564,0x7567,/* 0x58-0x5F */ + 0x756B,0x756D,0x7578,0x7576,0x7586,0x7587,0x7574,0x758A,/* 0x60-0x67 */ + 0x7589,0x7582,0x7594,0x759A,0x759D,0x75A5,0x75A3,0x75C2,/* 0x68-0x6F */ + 0x75B3,0x75C3,0x75B5,0x75BD,0x75B8,0x75BC,0x75B1,0x75CD,/* 0x70-0x77 */ + 0x75CA,0x75D2,0x75D9,0x75E3,0x75DE,0x75FE,0x75FF,0x0000,/* 0x78-0x7F */ + + 0x75FC,0x7601,0x75F0,0x75FA,0x75F2,0x75F3,0x760B,0x760D,/* 0x80-0x87 */ + 0x7609,0x761F,0x7627,0x7620,0x7621,0x7622,0x7624,0x7634,/* 0x88-0x8F */ + 0x7630,0x763B,0x7647,0x7648,0x7646,0x765C,0x7658,0x7661,/* 0x90-0x97 */ + 0x7662,0x7668,0x7669,0x766A,0x7667,0x766C,0x7670,0x7672,/* 0x98-0x9F */ + 0x7676,0x7678,0x767C,0x7680,0x7683,0x7688,0x768B,0x768E,/* 0xA0-0xA7 */ + 0x7696,0x7693,0x7699,0x769A,0x76B0,0x76B4,0x76B8,0x76B9,/* 0xA8-0xAF */ + 0x76BA,0x76C2,0x76CD,0x76D6,0x76D2,0x76DE,0x76E1,0x76E5,/* 0xB0-0xB7 */ + 0x76E7,0x76EA,0x862F,0x76FB,0x7708,0x7707,0x7704,0x7729,/* 0xB8-0xBF */ + 0x7724,0x771E,0x7725,0x7726,0x771B,0x7737,0x7738,0x7747,/* 0xC0-0xC7 */ + 0x775A,0x7768,0x776B,0x775B,0x7765,0x777F,0x777E,0x7779,/* 0xC8-0xCF */ + 0x778E,0x778B,0x7791,0x77A0,0x779E,0x77B0,0x77B6,0x77B9,/* 0xD0-0xD7 */ + 0x77BF,0x77BC,0x77BD,0x77BB,0x77C7,0x77CD,0x77D7,0x77DA,/* 0xD8-0xDF */ + 0x77DC,0x77E3,0x77EE,0x77FC,0x780C,0x7812,0x7926,0x7820,/* 0xE0-0xE7 */ + 0x792A,0x7845,0x788E,0x7874,0x7886,0x787C,0x789A,0x788C,/* 0xE8-0xEF */ + 0x78A3,0x78B5,0x78AA,0x78AF,0x78D1,0x78C6,0x78CB,0x78D4,/* 0xF0-0xF7 */ + 0x78BE,0x78BC,0x78C5,0x78CA,0x78EC,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x78E7,0x78DA,0x78FD,0x78F4,0x7907,0x7912,0x7911,0x7919,/* 0x40-0x47 */ + 0x792C,0x792B,0x7940,0x7960,0x7957,0x795F,0x795A,0x7955,/* 
0x48-0x4F */ + 0x7953,0x797A,0x797F,0x798A,0x799D,0x79A7,0x9F4B,0x79AA,/* 0x50-0x57 */ + 0x79AE,0x79B3,0x79B9,0x79BA,0x79C9,0x79D5,0x79E7,0x79EC,/* 0x58-0x5F */ + 0x79E1,0x79E3,0x7A08,0x7A0D,0x7A18,0x7A19,0x7A20,0x7A1F,/* 0x60-0x67 */ + 0x7980,0x7A31,0x7A3B,0x7A3E,0x7A37,0x7A43,0x7A57,0x7A49,/* 0x68-0x6F */ + 0x7A61,0x7A62,0x7A69,0x9F9D,0x7A70,0x7A79,0x7A7D,0x7A88,/* 0x70-0x77 */ + 0x7A97,0x7A95,0x7A98,0x7A96,0x7AA9,0x7AC8,0x7AB0,0x0000,/* 0x78-0x7F */ + + 0x7AB6,0x7AC5,0x7AC4,0x7ABF,0x9083,0x7AC7,0x7ACA,0x7ACD,/* 0x80-0x87 */ + 0x7ACF,0x7AD5,0x7AD3,0x7AD9,0x7ADA,0x7ADD,0x7AE1,0x7AE2,/* 0x88-0x8F */ + 0x7AE6,0x7AED,0x7AF0,0x7B02,0x7B0F,0x7B0A,0x7B06,0x7B33,/* 0x90-0x97 */ + 0x7B18,0x7B19,0x7B1E,0x7B35,0x7B28,0x7B36,0x7B50,0x7B7A,/* 0x98-0x9F */ + 0x7B04,0x7B4D,0x7B0B,0x7B4C,0x7B45,0x7B75,0x7B65,0x7B74,/* 0xA0-0xA7 */ + 0x7B67,0x7B70,0x7B71,0x7B6C,0x7B6E,0x7B9D,0x7B98,0x7B9F,/* 0xA8-0xAF */ + 0x7B8D,0x7B9C,0x7B9A,0x7B8B,0x7B92,0x7B8F,0x7B5D,0x7B99,/* 0xB0-0xB7 */ + 0x7BCB,0x7BC1,0x7BCC,0x7BCF,0x7BB4,0x7BC6,0x7BDD,0x7BE9,/* 0xB8-0xBF */ + 0x7C11,0x7C14,0x7BE6,0x7BE5,0x7C60,0x7C00,0x7C07,0x7C13,/* 0xC0-0xC7 */ + 0x7BF3,0x7BF7,0x7C17,0x7C0D,0x7BF6,0x7C23,0x7C27,0x7C2A,/* 0xC8-0xCF */ + 0x7C1F,0x7C37,0x7C2B,0x7C3D,0x7C4C,0x7C43,0x7C54,0x7C4F,/* 0xD0-0xD7 */ + 0x7C40,0x7C50,0x7C58,0x7C5F,0x7C64,0x7C56,0x7C65,0x7C6C,/* 0xD8-0xDF */ + 0x7C75,0x7C83,0x7C90,0x7CA4,0x7CAD,0x7CA2,0x7CAB,0x7CA1,/* 0xE0-0xE7 */ + 0x7CA8,0x7CB3,0x7CB2,0x7CB1,0x7CAE,0x7CB9,0x7CBD,0x7CC0,/* 0xE8-0xEF */ + 0x7CC5,0x7CC2,0x7CD8,0x7CD2,0x7CDC,0x7CE2,0x9B3B,0x7CEF,/* 0xF0-0xF7 */ + 0x7CF2,0x7CF4,0x7CF6,0x7CFA,0x7D06,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7D02,0x7D1C,0x7D15,0x7D0A,0x7D45,0x7D4B,0x7D2E,0x7D32,/* 0x40-0x47 */ + 0x7D3F,0x7D35,0x7D46,0x7D73,0x7D56,0x7D4E,0x7D72,0x7D68,/* 0x48-0x4F */ + 0x7D6E,0x7D4F,0x7D63,0x7D93,0x7D89,0x7D5B,0x7D8F,0x7D7D,/* 0x50-0x57 */ + 0x7D9B,0x7DBA,0x7DAE,0x7DA3,0x7DB5,0x7DC7,0x7DBD,0x7DAB,/* 0x58-0x5F */ + 0x7E3D,0x7DA2,0x7DAF,0x7DDC,0x7DB8,0x7D9F,0x7DB0,0x7DD8,/* 0x60-0x67 */ + 0x7DDD,0x7DE4,0x7DDE,0x7DFB,0x7DF2,0x7DE1,0x7E05,0x7E0A,/* 0x68-0x6F */ + 0x7E23,0x7E21,0x7E12,0x7E31,0x7E1F,0x7E09,0x7E0B,0x7E22,/* 0x70-0x77 */ + 0x7E46,0x7E66,0x7E3B,0x7E35,0x7E39,0x7E43,0x7E37,0x0000,/* 0x78-0x7F */ + + 0x7E32,0x7E3A,0x7E67,0x7E5D,0x7E56,0x7E5E,0x7E59,0x7E5A,/* 0x80-0x87 */ + 0x7E79,0x7E6A,0x7E69,0x7E7C,0x7E7B,0x7E83,0x7DD5,0x7E7D,/* 0x88-0x8F */ + 0x8FAE,0x7E7F,0x7E88,0x7E89,0x7E8C,0x7E92,0x7E90,0x7E93,/* 0x90-0x97 */ + 0x7E94,0x7E96,0x7E8E,0x7E9B,0x7E9C,0x7F38,0x7F3A,0x7F45,/* 0x98-0x9F */ + 0x7F4C,0x7F4D,0x7F4E,0x7F50,0x7F51,0x7F55,0x7F54,0x7F58,/* 0xA0-0xA7 */ + 0x7F5F,0x7F60,0x7F68,0x7F69,0x7F67,0x7F78,0x7F82,0x7F86,/* 0xA8-0xAF */ + 0x7F83,0x7F88,0x7F87,0x7F8C,0x7F94,0x7F9E,0x7F9D,0x7F9A,/* 0xB0-0xB7 */ + 0x7FA3,0x7FAF,0x7FB2,0x7FB9,0x7FAE,0x7FB6,0x7FB8,0x8B71,/* 0xB8-0xBF */ + 0x7FC5,0x7FC6,0x7FCA,0x7FD5,0x7FD4,0x7FE1,0x7FE6,0x7FE9,/* 0xC0-0xC7 */ + 
0x7FF3,0x7FF9,0x98DC,0x8006,0x8004,0x800B,0x8012,0x8018,/* 0xC8-0xCF */ + 0x8019,0x801C,0x8021,0x8028,0x803F,0x803B,0x804A,0x8046,/* 0xD0-0xD7 */ + 0x8052,0x8058,0x805A,0x805F,0x8062,0x8068,0x8073,0x8072,/* 0xD8-0xDF */ + 0x8070,0x8076,0x8079,0x807D,0x807F,0x8084,0x8086,0x8085,/* 0xE0-0xE7 */ + 0x809B,0x8093,0x809A,0x80AD,0x5190,0x80AC,0x80DB,0x80E5,/* 0xE8-0xEF */ + 0x80D9,0x80DD,0x80C4,0x80DA,0x80D6,0x8109,0x80EF,0x80F1,/* 0xF0-0xF7 */ + 0x811B,0x8129,0x8123,0x812F,0x814B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x968B,0x8146,0x813E,0x8153,0x8151,0x80FC,0x8171,0x816E,/* 0x40-0x47 */ + 0x8165,0x8166,0x8174,0x8183,0x8188,0x818A,0x8180,0x8182,/* 0x48-0x4F */ + 0x81A0,0x8195,0x81A4,0x81A3,0x815F,0x8193,0x81A9,0x81B0,/* 0x50-0x57 */ + 0x81B5,0x81BE,0x81B8,0x81BD,0x81C0,0x81C2,0x81BA,0x81C9,/* 0x58-0x5F */ + 0x81CD,0x81D1,0x81D9,0x81D8,0x81C8,0x81DA,0x81DF,0x81E0,/* 0x60-0x67 */ + 0x81E7,0x81FA,0x81FB,0x81FE,0x8201,0x8202,0x8205,0x8207,/* 0x68-0x6F */ + 0x820A,0x820D,0x8210,0x8216,0x8229,0x822B,0x8238,0x8233,/* 0x70-0x77 */ + 0x8240,0x8259,0x8258,0x825D,0x825A,0x825F,0x8264,0x0000,/* 0x78-0x7F */ + + 0x8262,0x8268,0x826A,0x826B,0x822E,0x8271,0x8277,0x8278,/* 0x80-0x87 */ + 0x827E,0x828D,0x8292,0x82AB,0x829F,0x82BB,0x82AC,0x82E1,/* 0x88-0x8F */ + 0x82E3,0x82DF,0x82D2,0x82F4,0x82F3,0x82FA,0x8393,0x8303,/* 0x90-0x97 */ + 0x82FB,0x82F9,0x82DE,0x8306,0x82DC,0x8309,0x82D9,0x8335,/* 0x98-0x9F */ + 0x8334,0x8316,0x8332,0x8331,0x8340,0x8339,0x8350,0x8345,/* 0xA0-0xA7 */ + 0x832F,0x832B,0x8317,0x8318,0x8385,0x839A,0x83AA,0x839F,/* 0xA8-0xAF */ + 0x83A2,0x8396,0x8323,0x838E,0x8387,0x838A,0x837C,0x83B5,/* 0xB0-0xB7 */ + 0x8373,0x8375,0x83A0,0x8389,0x83A8,0x83F4,0x8413,0x83EB,/* 0xB8-0xBF */ + 0x83CE,0x83FD,0x8403,0x83D8,0x840B,0x83C1,0x83F7,0x8407,/* 0xC0-0xC7 */ + 0x83E0,0x83F2,0x840D,0x8422,0x8420,0x83BD,0x8438,0x8506,/* 0xC8-0xCF */ + 0x83FB,0x846D,0x842A,0x843C,0x855A,0x8484,0x8477,0x846B,/* 0xD0-0xD7 */ + 0x84AD,0x846E,0x8482,0x8469,0x8446,0x842C,0x846F,0x8479,/* 0xD8-0xDF */ + 0x8435,0x84CA,0x8462,0x84B9,0x84BF,0x849F,0x84D9,0x84CD,/* 0xE0-0xE7 */ + 0x84BB,0x84DA,0x84D0,0x84C1,0x84C6,0x84D6,0x84A1,0x8521,/* 0xE8-0xEF */ + 0x84FF,0x84F4,0x8517,0x8518,0x852C,0x851F,0x8515,0x8514,/* 0xF0-0xF7 */ + 0x84FC,0x8540,0x8563,0x8558,0x8548,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x8541,0x8602,0x854B,0x8555,0x8580,0x85A4,0x8588,0x8591,/* 0x40-0x47 */ + 0x858A,0x85A8,0x856D,0x8594,0x859B,0x85EA,0x8587,0x859C,/* 0x48-0x4F */ + 0x8577,0x857E,0x8590,0x85C9,0x85BA,0x85CF,0x85B9,0x85D0,/* 0x50-0x57 */ + 0x85D5,0x85DD,0x85E5,0x85DC,0x85F9,0x860A,0x8613,0x860B,/* 0x58-0x5F */ + 0x85FE,0x85FA,0x8606,0x8622,0x861A,0x8630,0x863F,0x864D,/* 0x60-0x67 */ + 0x4E55,0x8654,0x865F,0x8667,0x8671,0x8693,0x86A3,0x86A9,/* 0x68-0x6F */ + 0x86AA,0x868B,0x868C,0x86B6,0x86AF,0x86C4,0x86C6,0x86B0,/* 0x70-0x77 */ + 0x86C9,0x8823,0x86AB,0x86D4,0x86DE,0x86E9,0x86EC,0x0000,/* 0x78-0x7F */ + + 0x86DF,0x86DB,0x86EF,0x8712,0x8706,0x8708,0x8700,0x8703,/* 0x80-0x87 */ + 0x86FB,0x8711,0x8709,0x870D,0x86F9,0x870A,0x8734,0x873F,/* 0x88-0x8F */ + 0x8737,0x873B,0x8725,0x8729,0x871A,0x8760,0x875F,0x8778,/* 0x90-0x97 */ + 0x874C,0x874E,0x8774,0x8757,0x8768,0x876E,0x8759,0x8753,/* 0x98-0x9F */ + 0x8763,0x876A,0x8805,0x87A2,0x879F,0x8782,0x87AF,0x87CB,/* 0xA0-0xA7 */ + 0x87BD,0x87C0,0x87D0,0x96D6,0x87AB,0x87C4,0x87B3,0x87C7,/* 0xA8-0xAF */ + 0x87C6,0x87BB,0x87EF,0x87F2,0x87E0,0x880F,0x880D,0x87FE,/* 0xB0-0xB7 */ + 0x87F6,0x87F7,0x880E,0x87D2,0x8811,0x8816,0x8815,0x8822,/* 0xB8-0xBF */ + 0x8821,0x8831,0x8836,0x8839,0x8827,0x883B,0x8844,0x8842,/* 0xC0-0xC7 */ + 0x8852,0x8859,0x885E,0x8862,0x886B,0x8881,0x887E,0x889E,/* 0xC8-0xCF */ + 0x8875,0x887D,0x88B5,0x8872,0x8882,0x8897,0x8892,0x88AE,/* 0xD0-0xD7 */ + 0x8899,0x88A2,0x888D,0x88A4,0x88B0,0x88BF,0x88B1,0x88C3,/* 0xD8-0xDF */ + 0x88C4,0x88D4,0x88D8,0x88D9,0x88DD,0x88F9,0x8902,0x88FC,/* 0xE0-0xE7 */ + 0x88F4,0x88E8,0x88F2,0x8904,0x890C,0x890A,0x8913,0x8943,/* 0xE8-0xEF */ + 0x891E,0x8925,0x892A,0x892B,0x8941,0x8944,0x893B,0x8936,/* 0xF0-0xF7 */ + 0x8938,0x894C,0x891D,0x8960,0x895E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8966,0x8964,0x896D,0x896A,0x896F,0x8974,0x8977,0x897E,/* 0x40-0x47 */ + 0x8983,0x8988,0x898A,0x8993,0x8998,0x89A1,0x89A9,0x89A6,/* 0x48-0x4F */ + 0x89AC,0x89AF,0x89B2,0x89BA,0x89BD,0x89BF,0x89C0,0x89DA,/* 0x50-0x57 */ + 0x89DC,0x89DD,0x89E7,0x89F4,0x89F8,0x8A03,0x8A16,0x8A10,/* 0x58-0x5F */ + 0x8A0C,0x8A1B,0x8A1D,0x8A25,0x8A36,0x8A41,0x8A5B,0x8A52,/* 0x60-0x67 */ + 0x8A46,0x8A48,0x8A7C,0x8A6D,0x8A6C,0x8A62,0x8A85,0x8A82,/* 0x68-0x6F */ + 0x8A84,0x8AA8,0x8AA1,0x8A91,0x8AA5,0x8AA6,0x8A9A,0x8AA3,/* 0x70-0x77 */ + 0x8AC4,0x8ACD,0x8AC2,0x8ADA,0x8AEB,0x8AF3,0x8AE7,0x0000,/* 0x78-0x7F */ + + 0x8AE4,0x8AF1,0x8B14,0x8AE0,0x8AE2,0x8AF7,0x8ADE,0x8ADB,/* 0x80-0x87 */ + 0x8B0C,0x8B07,0x8B1A,0x8AE1,0x8B16,0x8B10,0x8B17,0x8B20,/* 0x88-0x8F */ + 0x8B33,0x97AB,0x8B26,0x8B2B,0x8B3E,0x8B28,0x8B41,0x8B4C,/* 0x90-0x97 */ + 0x8B4F,0x8B4E,0x8B49,0x8B56,0x8B5B,0x8B5A,0x8B6B,0x8B5F,/* 0x98-0x9F */ + 0x8B6C,0x8B6F,0x8B74,0x8B7D,0x8B80,0x8B8C,0x8B8E,0x8B92,/* 0xA0-0xA7 */ + 0x8B93,0x8B96,0x8B99,0x8B9A,0x8C3A,0x8C41,0x8C3F,0x8C48,/* 0xA8-0xAF */ + 0x8C4C,0x8C4E,0x8C50,0x8C55,0x8C62,0x8C6C,0x8C78,0x8C7A,/* 0xB0-0xB7 */ + 
0x8C82,0x8C89,0x8C85,0x8C8A,0x8C8D,0x8C8E,0x8C94,0x8C7C,/* 0xB8-0xBF */ + 0x8C98,0x621D,0x8CAD,0x8CAA,0x8CBD,0x8CB2,0x8CB3,0x8CAE,/* 0xC0-0xC7 */ + 0x8CB6,0x8CC8,0x8CC1,0x8CE4,0x8CE3,0x8CDA,0x8CFD,0x8CFA,/* 0xC8-0xCF */ + 0x8CFB,0x8D04,0x8D05,0x8D0A,0x8D07,0x8D0F,0x8D0D,0x8D10,/* 0xD0-0xD7 */ + 0x9F4E,0x8D13,0x8CCD,0x8D14,0x8D16,0x8D67,0x8D6D,0x8D71,/* 0xD8-0xDF */ + 0x8D73,0x8D81,0x8D99,0x8DC2,0x8DBE,0x8DBA,0x8DCF,0x8DDA,/* 0xE0-0xE7 */ + 0x8DD6,0x8DCC,0x8DDB,0x8DCB,0x8DEA,0x8DEB,0x8DDF,0x8DE3,/* 0xE8-0xEF */ + 0x8DFC,0x8E08,0x8E09,0x8DFF,0x8E1D,0x8E1E,0x8E10,0x8E1F,/* 0xF0-0xF7 */ + 0x8E42,0x8E35,0x8E30,0x8E34,0x8E4A,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E47,0x8E49,0x8E4C,0x8E50,0x8E48,0x8E59,0x8E64,0x8E60,/* 0x40-0x47 */ + 0x8E2A,0x8E63,0x8E55,0x8E76,0x8E72,0x8E7C,0x8E81,0x8E87,/* 0x48-0x4F */ + 0x8E85,0x8E84,0x8E8B,0x8E8A,0x8E93,0x8E91,0x8E94,0x8E99,/* 0x50-0x57 */ + 0x8EAA,0x8EA1,0x8EAC,0x8EB0,0x8EC6,0x8EB1,0x8EBE,0x8EC5,/* 0x58-0x5F */ + 0x8EC8,0x8ECB,0x8EDB,0x8EE3,0x8EFC,0x8EFB,0x8EEB,0x8EFE,/* 0x60-0x67 */ + 0x8F0A,0x8F05,0x8F15,0x8F12,0x8F19,0x8F13,0x8F1C,0x8F1F,/* 0x68-0x6F */ + 0x8F1B,0x8F0C,0x8F26,0x8F33,0x8F3B,0x8F39,0x8F45,0x8F42,/* 0x70-0x77 */ + 0x8F3E,0x8F4C,0x8F49,0x8F46,0x8F4E,0x8F57,0x8F5C,0x0000,/* 0x78-0x7F */ + + 0x8F62,0x8F63,0x8F64,0x8F9C,0x8F9F,0x8FA3,0x8FAD,0x8FAF,/* 0x80-0x87 */ + 0x8FB7,0x8FDA,0x8FE5,0x8FE2,0x8FEA,0x8FEF,0x9087,0x8FF4,/* 0x88-0x8F */ + 0x9005,0x8FF9,0x8FFA,0x9011,0x9015,0x9021,0x900D,0x901E,/* 0x90-0x97 */ + 0x9016,0x900B,0x9027,0x9036,0x9035,0x9039,0x8FF8,0x904F,/* 0x98-0x9F */ + 0x9050,0x9051,0x9052,0x900E,0x9049,0x903E,0x9056,0x9058,/* 0xA0-0xA7 */ + 0x905E,0x9068,0x906F,0x9076,0x96A8,0x9072,0x9082,0x907D,/* 0xA8-0xAF */ + 0x9081,0x9080,0x908A,0x9089,0x908F,0x90A8,0x90AF,0x90B1,/* 0xB0-0xB7 */ + 0x90B5,0x90E2,0x90E4,0x6248,0x90DB,0x9102,0x9112,0x9119,/* 0xB8-0xBF */ + 0x9132,0x9130,0x914A,0x9156,0x9158,0x9163,0x9165,0x9169,/* 0xC0-0xC7 */ + 0x9173,0x9172,0x918B,0x9189,0x9182,0x91A2,0x91AB,0x91AF,/* 0xC8-0xCF */ + 0x91AA,0x91B5,0x91B4,0x91BA,0x91C0,0x91C1,0x91C9,0x91CB,/* 0xD0-0xD7 */ + 0x91D0,0x91D6,0x91DF,0x91E1,0x91DB,0x91FC,0x91F5,0x91F6,/* 0xD8-0xDF */ + 0x921E,0x91FF,0x9214,0x922C,0x9215,0x9211,0x925E,0x9257,/* 0xE0-0xE7 */ + 0x9245,0x9249,0x9264,0x9248,0x9295,0x923F,0x924B,0x9250,/* 0xE8-0xEF */ + 0x929C,0x9296,0x9293,0x929B,0x925A,0x92CF,0x92B9,0x92B7,/* 0xF0-0xF7 */ + 0x92E9,0x930F,0x92FA,0x9344,0x932E,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9319,0x9322,0x931A,0x9323,0x933A,0x9335,0x933B,0x935C,/* 0x40-0x47 */ + 0x9360,0x937C,0x936E,0x9356,0x93B0,0x93AC,0x93AD,0x9394,/* 0x48-0x4F */ + 0x93B9,0x93D6,0x93D7,0x93E8,0x93E5,0x93D8,0x93C3,0x93DD,/* 0x50-0x57 */ + 0x93D0,0x93C8,0x93E4,0x941A,0x9414,0x9413,0x9403,0x9407,/* 0x58-0x5F */ + 0x9410,0x9436,0x942B,0x9435,0x9421,0x943A,0x9441,0x9452,/* 0x60-0x67 */ + 0x9444,0x945B,0x9460,0x9462,0x945E,0x946A,0x9229,0x9470,/* 0x68-0x6F */ + 0x9475,0x9477,0x947D,0x945A,0x947C,0x947E,0x9481,0x947F,/* 0x70-0x77 */ + 0x9582,0x9587,0x958A,0x9594,0x9596,0x9598,0x9599,0x0000,/* 0x78-0x7F */ + + 0x95A0,0x95A8,0x95A7,0x95AD,0x95BC,0x95BB,0x95B9,0x95BE,/* 0x80-0x87 */ + 0x95CA,0x6FF6,0x95C3,0x95CD,0x95CC,0x95D5,0x95D4,0x95D6,/* 0x88-0x8F */ + 0x95DC,0x95E1,0x95E5,0x95E2,0x9621,0x9628,0x962E,0x962F,/* 0x90-0x97 */ + 0x9642,0x964C,0x964F,0x964B,0x9677,0x965C,0x965E,0x965D,/* 0x98-0x9F */ + 0x965F,0x9666,0x9672,0x966C,0x968D,0x9698,0x9695,0x9697,/* 0xA0-0xA7 */ + 0x96AA,0x96A7,0x96B1,0x96B2,0x96B0,0x96B4,0x96B6,0x96B8,/* 0xA8-0xAF */ + 0x96B9,0x96CE,0x96CB,0x96C9,0x96CD,0x894D,0x96DC,0x970D,/* 0xB0-0xB7 */ + 0x96D5,0x96F9,0x9704,0x9706,0x9708,0x9713,0x970E,0x9711,/* 0xB8-0xBF */ + 0x970F,0x9716,0x9719,0x9724,0x972A,0x9730,0x9739,0x973D,/* 0xC0-0xC7 */ + 0x973E,0x9744,0x9746,0x9748,0x9742,0x9749,0x975C,0x9760,/* 0xC8-0xCF */ + 0x9764,0x9766,0x9768,0x52D2,0x976B,0x9771,0x9779,0x9785,/* 0xD0-0xD7 */ + 0x977C,0x9781,0x977A,0x9786,0x978B,0x978F,0x9790,0x979C,/* 0xD8-0xDF */ + 0x97A8,0x97A6,0x97A3,0x97B3,0x97B4,0x97C3,0x97C6,0x97C8,/* 0xE0-0xE7 */ + 0x97CB,0x97DC,0x97ED,0x9F4F,0x97F2,0x7ADF,0x97F6,0x97F5,/* 0xE8-0xEF */ + 0x980F,0x980C,0x9838,0x9824,0x9821,0x9837,0x983D,0x9846,/* 0xF0-0xF7 */ + 0x984F,0x984B,0x986B,0x986F,0x9870,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9871,0x9874,0x9873,0x98AA,0x98AF,0x98B1,0x98B6,0x98C4,/* 0x40-0x47 */ + 0x98C3,0x98C6,0x98E9,0x98EB,0x9903,0x9909,0x9912,0x9914,/* 0x48-0x4F */ + 0x9918,0x9921,0x991D,0x991E,0x9924,0x9920,0x992C,0x992E,/* 0x50-0x57 */ + 0x993D,0x993E,0x9942,0x9949,0x9945,0x9950,0x994B,0x9951,/* 0x58-0x5F */ + 0x9952,0x994C,0x9955,0x9997,0x9998,0x99A5,0x99AD,0x99AE,/* 0x60-0x67 */ + 0x99BC,0x99DF,0x99DB,0x99DD,0x99D8,0x99D1,0x99ED,0x99EE,/* 0x68-0x6F */ + 0x99F1,0x99F2,0x99FB,0x99F8,0x9A01,0x9A0F,0x9A05,0x99E2,/* 0x70-0x77 */ + 0x9A19,0x9A2B,0x9A37,0x9A45,0x9A42,0x9A40,0x9A43,0x0000,/* 0x78-0x7F */ + + 0x9A3E,0x9A55,0x9A4D,0x9A5B,0x9A57,0x9A5F,0x9A62,0x9A65,/* 0x80-0x87 */ + 0x9A64,0x9A69,0x9A6B,0x9A6A,0x9AAD,0x9AB0,0x9ABC,0x9AC0,/* 0x88-0x8F */ + 0x9ACF,0x9AD1,0x9AD3,0x9AD4,0x9ADE,0x9ADF,0x9AE2,0x9AE3,/* 0x90-0x97 */ + 0x9AE6,0x9AEF,0x9AEB,0x9AEE,0x9AF4,0x9AF1,0x9AF7,0x9AFB,/* 0x98-0x9F */ + 0x9B06,0x9B18,0x9B1A,0x9B1F,0x9B22,0x9B23,0x9B25,0x9B27,/* 0xA0-0xA7 */ + 
0x9B28,0x9B29,0x9B2A,0x9B2E,0x9B2F,0x9B32,0x9B44,0x9B43,/* 0xA8-0xAF */ + 0x9B4F,0x9B4D,0x9B4E,0x9B51,0x9B58,0x9B74,0x9B93,0x9B83,/* 0xB0-0xB7 */ + 0x9B91,0x9B96,0x9B97,0x9B9F,0x9BA0,0x9BA8,0x9BB4,0x9BC0,/* 0xB8-0xBF */ + 0x9BCA,0x9BB9,0x9BC6,0x9BCF,0x9BD1,0x9BD2,0x9BE3,0x9BE2,/* 0xC0-0xC7 */ + 0x9BE4,0x9BD4,0x9BE1,0x9C3A,0x9BF2,0x9BF1,0x9BF0,0x9C15,/* 0xC8-0xCF */ + 0x9C14,0x9C09,0x9C13,0x9C0C,0x9C06,0x9C08,0x9C12,0x9C0A,/* 0xD0-0xD7 */ + 0x9C04,0x9C2E,0x9C1B,0x9C25,0x9C24,0x9C21,0x9C30,0x9C47,/* 0xD8-0xDF */ + 0x9C32,0x9C46,0x9C3E,0x9C5A,0x9C60,0x9C67,0x9C76,0x9C78,/* 0xE0-0xE7 */ + 0x9CE7,0x9CEC,0x9CF0,0x9D09,0x9D08,0x9CEB,0x9D03,0x9D06,/* 0xE8-0xEF */ + 0x9D2A,0x9D26,0x9DAF,0x9D23,0x9D1F,0x9D44,0x9D15,0x9D12,/* 0xF0-0xF7 */ + 0x9D41,0x9D3F,0x9D3E,0x9D46,0x9D48,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9D5D,0x9D5E,0x9D64,0x9D51,0x9D50,0x9D59,0x9D72,0x9D89,/* 0x40-0x47 */ + 0x9D87,0x9DAB,0x9D6F,0x9D7A,0x9D9A,0x9DA4,0x9DA9,0x9DB2,/* 0x48-0x4F */ + 0x9DC4,0x9DC1,0x9DBB,0x9DB8,0x9DBA,0x9DC6,0x9DCF,0x9DC2,/* 0x50-0x57 */ + 0x9DD9,0x9DD3,0x9DF8,0x9DE6,0x9DED,0x9DEF,0x9DFD,0x9E1A,/* 0x58-0x5F */ + 0x9E1B,0x9E1E,0x9E75,0x9E79,0x9E7D,0x9E81,0x9E88,0x9E8B,/* 0x60-0x67 */ + 0x9E8C,0x9E92,0x9E95,0x9E91,0x9E9D,0x9EA5,0x9EA9,0x9EB8,/* 0x68-0x6F */ + 0x9EAA,0x9EAD,0x9761,0x9ECC,0x9ECE,0x9ECF,0x9ED0,0x9ED4,/* 0x70-0x77 */ + 0x9EDC,0x9EDE,0x9EDD,0x9EE0,0x9EE5,0x9EE8,0x9EEF,0x0000,/* 0x78-0x7F */ + + 0x9EF4,0x9EF6,0x9EF7,0x9EF9,0x9EFB,0x9EFC,0x9EFD,0x9F07,/* 0x80-0x87 */ + 0x9F08,0x76B7,0x9F15,0x9F21,0x9F2C,0x9F3E,0x9F4A,0x9F52,/* 0x88-0x8F */ + 0x9F54,0x9F63,0x9F5F,0x9F60,0x9F61,0x9F66,0x9F67,0x9F6C,/* 0x90-0x97 */ + 0x9F6A,0x9F77,0x9F72,0x9F76,0x9F95,0x9F9C,0x9FA0,0x582F,/* 0x98-0x9F */ + 0x69C7,0x9059,0x7464,0x51DC,0x7199,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E8A,0x891C,0x9348,0x9288,0x84DC,0x4FC9,0x70BB,0x6631,/* 0x40-0x47 */ + 0x68C8,0x92F9,0x66FB,0x5F45,0x4E28,0x4EE1,0x4EFC,0x4F00,/* 0x48-0x4F */ + 0x4F03,0x4F39,0x4F56,0x4F92,0x4F8A,0x4F9A,0x4F94,0x4FCD,/* 0x50-0x57 */ + 0x5040,0x5022,0x4FFF,0x501E,0x5046,0x5070,0x5042,0x5094,/* 0x58-0x5F */ + 0x50F4,0x50D8,0x514A,0x5164,0x519D,0x51BE,0x51EC,0x5215,/* 0x60-0x67 */ + 0x529C,0x52A6,0x52C0,0x52DB,0x5300,0x5307,0x5324,0x5372,/* 0x68-0x6F */ + 0x5393,0x53B2,0x53DD,0xFA0E,0x549C,0x548A,0x54A9,0x54FF,/* 
0x70-0x77 */ + 0x5586,0x5759,0x5765,0x57AC,0x57C8,0x57C7,0xFA0F,0x0000,/* 0x78-0x7F */ + + 0xFA10,0x589E,0x58B2,0x590B,0x5953,0x595B,0x595D,0x5963,/* 0x80-0x87 */ + 0x59A4,0x59BA,0x5B56,0x5BC0,0x752F,0x5BD8,0x5BEC,0x5C1E,/* 0x88-0x8F */ + 0x5CA6,0x5CBA,0x5CF5,0x5D27,0x5D53,0xFA11,0x5D42,0x5D6D,/* 0x90-0x97 */ + 0x5DB8,0x5DB9,0x5DD0,0x5F21,0x5F34,0x5F67,0x5FB7,0x5FDE,/* 0x98-0x9F */ + 0x605D,0x6085,0x608A,0x60DE,0x60D5,0x6120,0x60F2,0x6111,/* 0xA0-0xA7 */ + 0x6137,0x6130,0x6198,0x6213,0x62A6,0x63F5,0x6460,0x649D,/* 0xA8-0xAF */ + 0x64CE,0x654E,0x6600,0x6615,0x663B,0x6609,0x662E,0x661E,/* 0xB0-0xB7 */ + 0x6624,0x6665,0x6657,0x6659,0xFA12,0x6673,0x6699,0x66A0,/* 0xB8-0xBF */ + 0x66B2,0x66BF,0x66FA,0x670E,0xF929,0x6766,0x67BB,0x6852,/* 0xC0-0xC7 */ + 0x67C0,0x6801,0x6844,0x68CF,0xFA13,0x6968,0xFA14,0x6998,/* 0xC8-0xCF */ + 0x69E2,0x6A30,0x6A6B,0x6A46,0x6A73,0x6A7E,0x6AE2,0x6AE4,/* 0xD0-0xD7 */ + 0x6BD6,0x6C3F,0x6C5C,0x6C86,0x6C6F,0x6CDA,0x6D04,0x6D87,/* 0xD8-0xDF */ + 0x6D6F,0x6D96,0x6DAC,0x6DCF,0x6DF8,0x6DF2,0x6DFC,0x6E39,/* 0xE0-0xE7 */ + 0x6E5C,0x6E27,0x6E3C,0x6EBF,0x6F88,0x6FB5,0x6FF5,0x7005,/* 0xE8-0xEF */ + 0x7007,0x7028,0x7085,0x70AB,0x710F,0x7104,0x715C,0x7146,/* 0xF0-0xF7 */ + 0x7147,0xFA15,0x71C1,0x71FE,0x72B1,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x72BE,0x7324,0xFA16,0x7377,0x73BD,0x73C9,0x73D6,0x73E3,/* 0x40-0x47 */ + 0x73D2,0x7407,0x73F5,0x7426,0x742A,0x7429,0x742E,0x7462,/* 0x48-0x4F */ + 0x7489,0x749F,0x7501,0x756F,0x7682,0x769C,0x769E,0x769B,/* 0x50-0x57 */ + 0x76A6,0xFA17,0x7746,0x52AF,0x7821,0x784E,0x7864,0x787A,/* 0x58-0x5F */ + 0x7930,0xFA18,0xFA19,0xFA1A,0x7994,0xFA1B,0x799B,0x7AD1,/* 0x60-0x67 */ + 0x7AE7,0xFA1C,0x7AEB,0x7B9E,0xFA1D,0x7D48,0x7D5C,0x7DB7,/* 0x68-0x6F */ + 0x7DA0,0x7DD6,0x7E52,0x7F47,0x7FA1,0xFA1E,0x8301,0x8362,/* 0x70-0x77 */ + 0x837F,0x83C7,0x83F6,0x8448,0x84B4,0x8553,0x8559,0x0000,/* 0x78-0x7F */ + + 0x856B,0xFA1F,0x85B0,0xFA20,0xFA21,0x8807,0x88F5,0x8A12,/* 0x80-0x87 */ + 0x8A37,0x8A79,0x8AA7,0x8ABE,0x8ADF,0xFA22,0x8AF6,0x8B53,/* 0x88-0x8F */ + 0x8B7F,0x8CF0,0x8CF4,0x8D12,0x8D76,0xFA23,0x8ECF,0xFA24,/* 0x90-0x97 */ + 0xFA25,0x9067,0x90DE,0xFA26,0x9115,0x9127,0x91DA,0x91D7,/* 0x98-0x9F */ + 0x91DE,0x91ED,0x91EE,0x91E4,0x91E5,0x9206,0x9210,0x920A,/* 0xA0-0xA7 */ + 0x923A,0x9240,0x923C,0x924E,0x9259,0x9251,0x9239,0x9267,/* 0xA8-0xAF */ + 0x92A7,0x9277,0x9278,0x92E7,0x92D7,0x92D9,0x92D0,0xFA27,/* 0xB0-0xB7 */ + 0x92D5,0x92E0,0x92D3,0x9325,0x9321,0x92FB,0xFA28,0x931E,/* 0xB8-0xBF */ + 0x92FF,0x931D,0x9302,0x9370,0x9357,0x93A4,0x93C6,0x93DE,/* 0xC0-0xC7 */ + 0x93F8,0x9431,0x9445,0x9448,0x9592,0xF9DC,0xFA29,0x969D,/* 0xC8-0xCF */ + 0x96AF,0x9733,0x973B,0x9743,0x974D,0x974F,0x9751,0x9755,/* 0xD0-0xD7 */ + 0x9857,0x9865,0xFA2A,0xFA2B,0x9927,0xFA2C,0x999E,0x9A4E,/* 0xD8-0xDF */ + 0x9AD9,0x9ADC,0x9B75,0x9B72,0x9B8F,0x9BB1,0x9BBB,0x9C00,/* 0xE0-0xE7 */ + 0x9D70,0x9D6B,0xFA2D,0x9E19,0x9ED1,0x0000,0x0000,0x2170,/* 0xE8-0xEF */ + 
0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,0x2177,0x2178,/* 0xF0-0xF7 */ + 0x2179,0xFFE2,0xFFE4,0xFF07,0xFF02,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,0x2177,/* 0x40-0x47 */ + 0x2178,0x2179,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,/* 0x48-0x4F */ + 0x2166,0x2167,0x2168,0x2169,0xFFE2,0xFFE4,0xFF07,0xFF02,/* 0x50-0x57 */ + 0x3231,0x2116,0x2121,0x2235,0x7E8A,0x891C,0x9348,0x9288,/* 0x58-0x5F */ + 0x84DC,0x4FC9,0x70BB,0x6631,0x68C8,0x92F9,0x66FB,0x5F45,/* 0x60-0x67 */ + 0x4E28,0x4EE1,0x4EFC,0x4F00,0x4F03,0x4F39,0x4F56,0x4F92,/* 0x68-0x6F */ + 0x4F8A,0x4F9A,0x4F94,0x4FCD,0x5040,0x5022,0x4FFF,0x501E,/* 0x70-0x77 */ + 0x5046,0x5070,0x5042,0x5094,0x50F4,0x50D8,0x514A,0x0000,/* 0x78-0x7F */ + + 0x5164,0x519D,0x51BE,0x51EC,0x5215,0x529C,0x52A6,0x52C0,/* 0x80-0x87 */ + 0x52DB,0x5300,0x5307,0x5324,0x5372,0x5393,0x53B2,0x53DD,/* 0x88-0x8F */ + 0xFA0E,0x549C,0x548A,0x54A9,0x54FF,0x5586,0x5759,0x5765,/* 0x90-0x97 */ + 0x57AC,0x57C8,0x57C7,0xFA0F,0xFA10,0x589E,0x58B2,0x590B,/* 0x98-0x9F */ + 0x5953,0x595B,0x595D,0x5963,0x59A4,0x59BA,0x5B56,0x5BC0,/* 0xA0-0xA7 */ + 0x752F,0x5BD8,0x5BEC,0x5C1E,0x5CA6,0x5CBA,0x5CF5,0x5D27,/* 0xA8-0xAF */ + 0x5D53,0xFA11,0x5D42,0x5D6D,0x5DB8,0x5DB9,0x5DD0,0x5F21,/* 0xB0-0xB7 */ + 0x5F34,0x5F67,0x5FB7,0x5FDE,0x605D,0x6085,0x608A,0x60DE,/* 0xB8-0xBF */ + 0x60D5,0x6120,0x60F2,0x6111,0x6137,0x6130,0x6198,0x6213,/* 0xC0-0xC7 */ + 0x62A6,0x63F5,0x6460,0x649D,0x64CE,0x654E,0x6600,0x6615,/* 0xC8-0xCF */ + 0x663B,0x6609,0x662E,0x661E,0x6624,0x6665,0x6657,0x6659,/* 0xD0-0xD7 */ + 0xFA12,0x6673,0x6699,0x66A0,0x66B2,0x66BF,0x66FA,0x670E,/* 0xD8-0xDF */ + 0xF929,0x6766,0x67BB,0x6852,0x67C0,0x6801,0x6844,0x68CF,/* 0xE0-0xE7 */ + 0xFA13,0x6968,0xFA14,0x6998,0x69E2,0x6A30,0x6A6B,0x6A46,/* 0xE8-0xEF */ + 0x6A73,0x6A7E,0x6AE2,0x6AE4,0x6BD6,0x6C3F,0x6C5C,0x6C86,/* 0xF0-0xF7 */ + 0x6C6F,0x6CDA,0x6D04,0x6D87,0x6D6F,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6D96,0x6DAC,0x6DCF,0x6DF8,0x6DF2,0x6DFC,0x6E39,0x6E5C,/* 0x40-0x47 */ + 0x6E27,0x6E3C,0x6EBF,0x6F88,0x6FB5,0x6FF5,0x7005,0x7007,/* 0x48-0x4F */ + 0x7028,0x7085,0x70AB,0x710F,0x7104,0x715C,0x7146,0x7147,/* 0x50-0x57 */ + 0xFA15,0x71C1,0x71FE,0x72B1,0x72BE,0x7324,0xFA16,0x7377,/* 0x58-0x5F */ + 0x73BD,0x73C9,0x73D6,0x73E3,0x73D2,0x7407,0x73F5,0x7426,/* 
0x60-0x67 */ + 0x742A,0x7429,0x742E,0x7462,0x7489,0x749F,0x7501,0x756F,/* 0x68-0x6F */ + 0x7682,0x769C,0x769E,0x769B,0x76A6,0xFA17,0x7746,0x52AF,/* 0x70-0x77 */ + 0x7821,0x784E,0x7864,0x787A,0x7930,0xFA18,0xFA19,0x0000,/* 0x78-0x7F */ + + 0xFA1A,0x7994,0xFA1B,0x799B,0x7AD1,0x7AE7,0xFA1C,0x7AEB,/* 0x80-0x87 */ + 0x7B9E,0xFA1D,0x7D48,0x7D5C,0x7DB7,0x7DA0,0x7DD6,0x7E52,/* 0x88-0x8F */ + 0x7F47,0x7FA1,0xFA1E,0x8301,0x8362,0x837F,0x83C7,0x83F6,/* 0x90-0x97 */ + 0x8448,0x84B4,0x8553,0x8559,0x856B,0xFA1F,0x85B0,0xFA20,/* 0x98-0x9F */ + 0xFA21,0x8807,0x88F5,0x8A12,0x8A37,0x8A79,0x8AA7,0x8ABE,/* 0xA0-0xA7 */ + 0x8ADF,0xFA22,0x8AF6,0x8B53,0x8B7F,0x8CF0,0x8CF4,0x8D12,/* 0xA8-0xAF */ + 0x8D76,0xFA23,0x8ECF,0xFA24,0xFA25,0x9067,0x90DE,0xFA26,/* 0xB0-0xB7 */ + 0x9115,0x9127,0x91DA,0x91D7,0x91DE,0x91ED,0x91EE,0x91E4,/* 0xB8-0xBF */ + 0x91E5,0x9206,0x9210,0x920A,0x923A,0x9240,0x923C,0x924E,/* 0xC0-0xC7 */ + 0x9259,0x9251,0x9239,0x9267,0x92A7,0x9277,0x9278,0x92E7,/* 0xC8-0xCF */ + 0x92D7,0x92D9,0x92D0,0xFA27,0x92D5,0x92E0,0x92D3,0x9325,/* 0xD0-0xD7 */ + 0x9321,0x92FB,0xFA28,0x931E,0x92FF,0x931D,0x9302,0x9370,/* 0xD8-0xDF */ + 0x9357,0x93A4,0x93C6,0x93DE,0x93F8,0x9431,0x9445,0x9448,/* 0xE0-0xE7 */ + 0x9592,0xF9DC,0xFA29,0x969D,0x96AF,0x9733,0x973B,0x9743,/* 0xE8-0xEF */ + 0x974D,0x974F,0x9751,0x9755,0x9857,0x9865,0xFA2A,0xFA2B,/* 0xF0-0xF7 */ + 0x9927,0xFA2C,0x999E,0x9A4E,0x9AD9,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9ADC,0x9B75,0x9B72,0x9B8F,0x9BB1,0x9BBB,0x9C00,0x9D70,/* 0x40-0x47 */ + 0x9D6B,0xFA2D,0x9E19,0x9ED1,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_81, c2u_82, c2u_83, c2u_84, NULL, NULL, c2u_87, + c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, + c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, + c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, NULL, NULL, c2u_ED, c2u_EE, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, c2u_FA, c2u_FB, c2u_FC, NULL, NULL, NULL, +}; + +static const unsigned char u2c_00hi[256 - 0xA0][2] = { + {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x91}, {0x81, 0x92},/* 0xA0-0xA3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x98},/* 0xA4-0xA7 */ + {0x81, 0x4E}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xA8-0xAB */ + {0x81, 0xCA}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xAC-0xAF */ + {0x81, 0x8B}, {0x81, 0x7D}, {0x00, 0x00}, {0x00, 0x00},/* 0xB0-0xB3 */ + {0x81, 0x4C}, {0x00, 0x00}, {0x81, 0xF7}, {0x00, 0x00},/* 0xB4-0xB7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xB8-0xBB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xBC-0xBF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC0-0xC3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC4-0xC7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xC8-0xCB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xCC-0xCF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xD0-0xD3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x7E},/* 0xD4-0xD7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xD8-0xDB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xDC-0xDF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE0-0xE3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE4-0xE7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xE8-0xEB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xEC-0xEF */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xF0-0xF3 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x81, 0x80},/* 0xF4-0xF7 */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xF8-0xFB */ + {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00},/* 0xFC-0xFF */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x83, 0x9F, 0x83, 0xA0, 0x83, 0xA1, /* 0x90-0x93 */ + 0x83, 0xA2, 0x83, 0xA3, 0x83, 0xA4, 0x83, 0xA5, /* 0x94-0x97 */ + 0x83, 0xA6, 0x83, 0xA7, 0x83, 0xA8, 0x83, 0xA9, /* 0x98-0x9B */ + 0x83, 0xAA, 0x83, 0xAB, 0x83, 0xAC, 0x83, 0xAD, /* 0x9C-0x9F */ + 0x83, 0xAE, 0x83, 0xAF, 0x00, 0x00, 0x83, 0xB0, /* 0xA0-0xA3 */ + 0x83, 0xB1, 0x83, 0xB2, 0x83, 0xB3, 0x83, 0xB4, /* 0xA4-0xA7 */ + 0x83, 0xB5, 0x83, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x83, 0xBF, 0x83, 0xC0, 0x83, 0xC1, /* 0xB0-0xB3 */ + 0x83, 0xC2, 0x83, 0xC3, 0x83, 0xC4, 0x83, 0xC5, /* 0xB4-0xB7 */ + 0x83, 0xC6, 0x83, 0xC7, 0x83, 0xC8, 0x83, 0xC9, /* 0xB8-0xBB */ + 0x83, 0xCA, 0x83, 0xCB, 0x83, 0xCC, 0x83, 0xCD, /* 0xBC-0xBF */ + 0x83, 0xCE, 0x83, 0xCF, 0x00, 0x00, 0x83, 0xD0, /* 0xC0-0xC3 */ + 0x83, 0xD1, 0x83, 0xD2, 0x83, 0xD3, 0x83, 0xD4, /* 0xC4-0xC7 */ + 0x83, 0xD5, 0x83, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_04[512] = { + 0x00, 0x00, 0x84, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x84, 0x40, 0x84, 0x41, 0x84, 0x42, 0x84, 0x43, /* 0x10-0x13 */ + 0x84, 0x44, 0x84, 0x45, 0x84, 0x47, 0x84, 0x48, /* 0x14-0x17 */ + 0x84, 0x49, 0x84, 0x4A, 0x84, 0x4B, 0x84, 0x4C, /* 0x18-0x1B */ + 0x84, 0x4D, 0x84, 0x4E, 0x84, 0x4F, 0x84, 0x50, /* 0x1C-0x1F */ + 0x84, 0x51, 0x84, 0x52, 0x84, 0x53, 0x84, 0x54, /* 0x20-0x23 */ + 0x84, 0x55, 0x84, 0x56, 0x84, 0x57, 0x84, 0x58, /* 0x24-0x27 */ + 0x84, 0x59, 0x84, 0x5A, 0x84, 0x5B, 0x84, 0x5C, /* 0x28-0x2B */ + 0x84, 0x5D, 0x84, 0x5E, 0x84, 0x5F, 0x84, 0x60, /* 0x2C-0x2F */ + 0x84, 0x70, 0x84, 0x71, 0x84, 0x72, 0x84, 0x73, /* 0x30-0x33 */ + 0x84, 0x74, 0x84, 0x75, 0x84, 0x77, 0x84, 0x78, /* 0x34-0x37 */ + 0x84, 0x79, 0x84, 0x7A, 0x84, 0x7B, 0x84, 0x7C, /* 0x38-0x3B */ + 0x84, 0x7D, 0x84, 0x7E, 0x84, 0x80, 0x84, 0x81, /* 0x3C-0x3F */ + 0x84, 0x82, 0x84, 0x83, 0x84, 0x84, 0x84, 0x85, /* 0x40-0x43 */ + 0x84, 0x86, 0x84, 0x87, 0x84, 0x88, 0x84, 0x89, /* 0x44-0x47 */ + 0x84, 0x8A, 0x84, 0x8B, 0x84, 0x8C, 0x84, 0x8D, /* 0x48-0x4B */ + 0x84, 0x8E, 0x84, 0x8F, 0x84, 0x90, 0x84, 0x91, /* 0x4C-0x4F */ + 0x00, 0x00, 0x84, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ +}; + +static const unsigned char 
u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x81, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x81, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x81, 0x65, 0x81, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x81, 0x67, 0x81, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x81, 0xF5, 0x81, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x81, 0x64, 0x81, 0x63, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x81, 0xF1, 0x00, 0x00, 0x81, 0x8C, 0x81, 0x8D, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xA6, /* 0x38-0x3B */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x8E, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0x59, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xFA, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xF0, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xFA, 0x4A, 0xFA, 0x4B, 0xFA, 0x4C, 0xFA, 0x4D, /* 0x60-0x63 */ + 0xFA, 0x4E, 0xFA, 0x4F, 0xFA, 0x50, 0xFA, 0x51, /* 0x64-0x67 */ + 0xFA, 0x52, 0xFA, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEE, 0xEF, 0xEE, 0xF0, 0xEE, 0xF1, 0xEE, 0xF2, /* 0x70-0x73 */ + 0xEE, 0xF3, 0xEE, 0xF4, 0xEE, 0xF5, 0xEE, 0xF6, /* 0x74-0x77 */ + 0xEE, 0xF7, 0xEE, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x81, 0xA9, 0x81, 0xAA, 0x81, 0xA8, 0x81, 0xAB, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xCB, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x81, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_22[512] = { + 0x81, 0xCD, 0x00, 0x00, 0x81, 0xDD, 0x81, 0xCE, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xDE, /* 0x04-0x07 */ + 0x81, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x81, 0xB9, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x87, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x95, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x81, 0xE5, 0x81, 0x87, 0x87, 0x98, /* 0x1C-0x1F */ + 0x87, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x81, 0x61, 0x00, 0x00, 0x81, 0xC8, /* 0x24-0x27 */ + 0x81, 0xC9, 0x87, 0x9B, 0x87, 0x9C, 0x87, 0x92, /* 0x28-0x2B */ + 0x81, 0xE8, 0x00, 0x00, 0x87, 0x93, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x81, 0x88, 0xFA, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x81, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x90, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x81, 0x82, 0x87, 0x91, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0x85, 0x81, 0x86, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xE1, 0x81, 0xE2, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x81, 0xBC, 0x81, 0xBD, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xBA, 0x81, 0xBB, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x87, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x99, /* 0xBC-0xBF */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xDC, 0x00, 0x00, /* 0x10-0x13 */ +}; + +static const unsigned char u2c_24[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x87, 0x40, 0x87, 0x41, 0x87, 0x42, 0x87, 0x43, /* 0x60-0x63 */ + 0x87, 0x44, 0x87, 0x45, 0x87, 0x46, 0x87, 0x47, /* 0x64-0x67 */ + 0x87, 0x48, 0x87, 0x49, 0x87, 0x4A, 0x87, 0x4B, /* 0x68-0x6B */ + 0x87, 0x4C, 0x87, 0x4D, 0x87, 0x4E, 0x87, 0x4F, /* 0x6C-0x6F */ + 0x87, 0x50, 0x87, 0x51, 0x87, 0x52, 0x87, 0x53, /* 0x70-0x73 */ +}; + +static const unsigned char u2c_25[512] = { + 0x84, 0x9F, 0x84, 0xAA, 0x84, 0xA0, 0x84, 0xAB, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x84, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAC, /* 0x0C-0x0F */ + 0x84, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAD, /* 0x10-0x13 */ + 0x84, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x84, 0xAF, /* 0x14-0x17 */ + 0x84, 0xA3, 0x00, 0x00, 
0x00, 0x00, 0x84, 0xAE, /* 0x18-0x1B */ + 0x84, 0xA5, 0x84, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x84, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB0, /* 0x20-0x23 */ + 0x84, 0xA7, 0x84, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x84, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB2, /* 0x28-0x2B */ + 0x84, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB6, /* 0x2C-0x2F */ + 0x84, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB1, /* 0x30-0x33 */ + 0x84, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB8, /* 0x34-0x37 */ + 0x84, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB3, /* 0x38-0x3B */ + 0x84, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB9, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x84, 0xBE, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0xB4, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x81, 0xA1, 0x81, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xA3, 0x81, 0xA2, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x81, 0xA5, 0x81, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0x9F, 0x81, 0x9E, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9B, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0x9D, 0x81, 0x9C, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0xFC, /* 0xEC-0xEF */ +}; + +static 
const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x81, 0x9A, 0x81, 0x99, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x81, 0x8A, 0x00, 0x00, 0x81, 0x89, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x81, 0xF4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x81, 0xF3, 0x00, 0x00, 0x81, 0xF2, /* 0x6C-0x6F */ +}; + +static const unsigned char u2c_30[512] = { + 0x81, 0x40, 0x81, 0x41, 0x81, 0x42, 0x81, 0x56, /* 0x00-0x03 */ + 0x00, 0x00, 0x81, 0x58, 0x81, 0x59, 0x81, 0x5A, /* 0x04-0x07 */ + 0x81, 0x71, 0x81, 0x72, 0x81, 0x73, 0x81, 0x74, /* 0x08-0x0B */ + 0x81, 0x75, 0x81, 0x76, 0x81, 0x77, 0x81, 0x78, /* 0x0C-0x0F */ + 0x81, 0x79, 0x81, 0x7A, 0x81, 0xA7, 0x81, 0xAC, /* 0x10-0x13 */ + 0x81, 0x6B, 0x81, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x87, 0x80, 0x00, 0x00, 0x87, 0x81, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x82, 0x9F, 0x82, 0xA0, 0x82, 0xA1, /* 0x40-0x43 */ + 0x82, 0xA2, 0x82, 0xA3, 0x82, 0xA4, 0x82, 0xA5, /* 0x44-0x47 */ + 0x82, 0xA6, 0x82, 0xA7, 0x82, 0xA8, 0x82, 0xA9, /* 0x48-0x4B */ + 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, 0x82, 0xAD, /* 0x4C-0x4F */ + 0x82, 0xAE, 0x82, 0xAF, 0x82, 0xB0, 0x82, 0xB1, /* 0x50-0x53 */ + 0x82, 0xB2, 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, /* 0x54-0x57 */ + 0x82, 0xB6, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0x58-0x5B */ + 0x82, 0xBA, 0x82, 0xBB, 0x82, 0xBC, 0x82, 0xBD, /* 0x5C-0x5F */ + 0x82, 0xBE, 0x82, 0xBF, 0x82, 
0xC0, 0x82, 0xC1, /* 0x60-0x63 */ + 0x82, 0xC2, 0x82, 0xC3, 0x82, 0xC4, 0x82, 0xC5, /* 0x64-0x67 */ + 0x82, 0xC6, 0x82, 0xC7, 0x82, 0xC8, 0x82, 0xC9, /* 0x68-0x6B */ + 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, 0x82, 0xCD, /* 0x6C-0x6F */ + 0x82, 0xCE, 0x82, 0xCF, 0x82, 0xD0, 0x82, 0xD1, /* 0x70-0x73 */ + 0x82, 0xD2, 0x82, 0xD3, 0x82, 0xD4, 0x82, 0xD5, /* 0x74-0x77 */ + 0x82, 0xD6, 0x82, 0xD7, 0x82, 0xD8, 0x82, 0xD9, /* 0x78-0x7B */ + 0x82, 0xDA, 0x82, 0xDB, 0x82, 0xDC, 0x82, 0xDD, /* 0x7C-0x7F */ + + 0x82, 0xDE, 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, /* 0x80-0x83 */ + 0x82, 0xE2, 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, /* 0x84-0x87 */ + 0x82, 0xE6, 0x82, 0xE7, 0x82, 0xE8, 0x82, 0xE9, /* 0x88-0x8B */ + 0x82, 0xEA, 0x82, 0xEB, 0x82, 0xEC, 0x82, 0xED, /* 0x8C-0x8F */ + 0x82, 0xEE, 0x82, 0xEF, 0x82, 0xF0, 0x82, 0xF1, /* 0x90-0x93 */ + 0x83, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x4A, /* 0x98-0x9B */ + 0x81, 0x4B, 0x81, 0x54, 0x81, 0x55, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x83, 0x40, 0x83, 0x41, 0x83, 0x42, /* 0xA0-0xA3 */ + 0x83, 0x43, 0x83, 0x44, 0x83, 0x45, 0x83, 0x46, /* 0xA4-0xA7 */ + 0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0xA8-0xAB */ + 0x83, 0x4B, 0x83, 0x4C, 0x83, 0x4D, 0x83, 0x4E, /* 0xAC-0xAF */ + 0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0x83, 0x52, /* 0xB0-0xB3 */ + 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, 0x83, 0x56, /* 0xB4-0xB7 */ + 0x83, 0x57, 0x83, 0x58, 0x83, 0x59, 0x83, 0x5A, /* 0xB8-0xBB */ + 0x83, 0x5B, 0x83, 0x5C, 0x83, 0x5D, 0x83, 0x5E, /* 0xBC-0xBF */ + 0x83, 0x5F, 0x83, 0x60, 0x83, 0x61, 0x83, 0x62, /* 0xC0-0xC3 */ + 0x83, 0x63, 0x83, 0x64, 0x83, 0x65, 0x83, 0x66, /* 0xC4-0xC7 */ + 0x83, 0x67, 0x83, 0x68, 0x83, 0x69, 0x83, 0x6A, /* 0xC8-0xCB */ + 0x83, 0x6B, 0x83, 0x6C, 0x83, 0x6D, 0x83, 0x6E, /* 0xCC-0xCF */ + 0x83, 0x6F, 0x83, 0x70, 0x83, 0x71, 0x83, 0x72, /* 0xD0-0xD3 */ + 0x83, 0x73, 0x83, 0x74, 0x83, 0x75, 0x83, 0x76, /* 0xD4-0xD7 */ + 0x83, 0x77, 0x83, 0x78, 0x83, 0x79, 0x83, 0x7A, /* 0xD8-0xDB */ + 0x83, 0x7B, 0x83, 0x7C, 0x83, 0x7D, 0x83, 0x7E, /* 0xDC-0xDF */ + 0x83, 0x80, 0x83, 0x81, 0x83, 0x82, 0x83, 0x83, /* 0xE0-0xE3 */ + 0x83, 0x84, 0x83, 0x85, 0x83, 0x86, 0x83, 0x87, /* 0xE4-0xE7 */ + 0x83, 0x88, 0x83, 0x89, 0x83, 0x8A, 0x83, 0x8B, /* 0xE8-0xEB */ + 0x83, 0x8C, 0x83, 0x8D, 0x83, 0x8E, 0x83, 0x8F, /* 0xEC-0xEF */ + 0x83, 0x90, 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, /* 0xF0-0xF3 */ + 0x83, 0x94, 0x83, 0x95, 0x83, 0x96, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, 0x45, /* 0xF8-0xFB */ + 0x81, 0x5B, 0x81, 0x52, 0x81, 0x53, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_32[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xFA, 0x58, 0x87, 0x8B, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x87, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x87, 0x85, 0x87, 0x86, 0x87, 0x87, 0x87, 0x88, /* 0xA4-0xA7 */ + 0x87, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x65, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x87, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x87, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x87, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x61, 0x87, 0x6B, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x6A, 0x87, 0x64, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x6C, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x66, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x6E, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x87, 0x5F, 0x87, 0x6D, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x87, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x87, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x68, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x7E, /* 0x78-0x7B */ + 0x87, 0x8F, 0x87, 0x8E, 0x87, 0x8D, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x72, 0x87, 0x73, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x87, 0x6F, 0x87, 0x70, 0x87, 0x71, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x87, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x87, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x87, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ +}; + +static const unsigned char u2c_4E[512] = { + 0x88, 0xEA, 0x92, 0x9A, 0x00, 0x00, 0x8E, 0xB5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x9C, /* 0x04-0x07 */ + 0x8F, 0xE4, 0x8E, 0x4F, 0x8F, 0xE3, 0x89, 0xBA, /* 0x08-0x0B */ + 0x00, 0x00, 0x95, 0x73, 0x97, 0x5E, 0x00, 0x00, /* 0x0C-0x0F */ + 0x98, 0xA0, 0x89, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x8A, 0x8E, 0x98, 0xA1, 0x90, 0xA2, 0x99, 0xC0, /* 0x14-0x17 */ + 0x8B, 0x75, 0x95, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xE5, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x97, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xC0, 0x00, 0x00, /* 0x24-0x27 */ + 0xED, 0x4C, 0x00, 0x00, 0x98, 0xA2, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x92, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x98, 0xA3, 0x8B, 0xF8, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xA4, 0x00, 0x00, /* 0x34-0x37 */ + 0x8A, 0xDB, 0x92, 0x4F, 0x00, 0x00, 0x8E, 0xE5, /* 0x38-0x3B */ + 0x98, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xA6, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xA7, 0x94, 0x54, /* 0x40-0x43 */ + 0x00, 0x00, 0x8B, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x56, /* 0x48-0x4B */ + 0x00, 0x00, 0x93, 0xE1, 0x8C, 0xC1, 0x96, 0x52, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE5, 0x68, 0x98, 0xA8, 0x8F, 0xE6, /* 0x54-0x57 */ + 0x98, 0xA9, 0x89, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x8B, 0xE3, 0x8C, 0xEE, 0x96, 0xE7, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 
0xA4, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x97, 0x90, 0x00, 0x00, 0x93, 0xFB, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x8B, 0x54, 0x00, 0x00, 0x98, 0xAA, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x98, 0xAB, 0x97, 0xB9, 0x00, 0x00, /* 0x84-0x87 */ + 0x97, 0x5C, 0x91, 0x88, 0x98, 0xAD, 0x8E, 0x96, /* 0x88-0x8B */ + 0x93, 0xF1, 0x00, 0x00, 0x98, 0xB0, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x89, 0x5D, 0x8C, 0xDD, 0x00, 0x00, /* 0x90-0x93 */ + 0x8C, 0xDC, 0x88, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x98, 0x6A, 0x98, 0x69, 0x00, 0x00, 0x8D, 0xB1, /* 0x98-0x9B */ + 0x88, 0x9F, 0x00, 0x00, 0x98, 0xB1, 0x98, 0xB2, /* 0x9C-0x9F */ + 0x98, 0xB3, 0x96, 0x53, 0x98, 0xB4, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x8C, 0xF0, 0x88, 0xE5, 0x96, 0x92, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x8B, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x9D, /* 0xA8-0xAB */ + 0x8B, 0x9E, 0x92, 0xE0, 0x97, 0xBA, 0x00, 0x00, /* 0xAC-0xAF */ + 0x98, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xB6, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xB7, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x6C, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x8F, 0x59, 0x90, 0x6D, 0x98, 0xBC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x98, 0xBA, 0x00, 0x00, 0x98, 0xBB, 0x8B, 0x77, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA1, 0x89, 0xEE, /* 0xC8-0xCB */ + 0x00, 0x00, 0x98, 0xB9, 0x98, 0xB8, 0x95, 0xA7, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x8E, 0x65, 0x8E, 0x64, 0x91, 0xBC, 0x98, 0xBD, /* 0xD4-0xD7 */ + 0x95, 0x74, 0x90, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x81, 0x57, 0x98, 0xBE, 0x98, 0xC0, /* 0xDC-0xDF */ + 0x00, 0x00, 0xED, 0x4D, 0x00, 0x00, 0x91, 0xE3, /* 0xE0-0xE3 */ + 0x97, 0xDF, 0x88, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x98, 0xBF, 0x89, 0xBC, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8B, 0xC2, 0x00, 0x00, 0x92, 0x87, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x8F, 0x98, 0xC1, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x43, /* 0xF8-0xFB */ + 0xED, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_4F[512] = { + 0xED, 0x4F, 0x8A, 0xE9, 0x00, 0x00, 0xED, 0x50, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x98, 0xC2, 0x88, 0xC9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x8C, 0xDE, 0x8A, 0xEA, 0x95, 0x9A, /* 0x0C-0x0F */ + 0x94, 0xB0, 0x8B, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xEF, 0x00, 0x00, /* 0x18-0x1B */ + 0x98, 0xE5, 0x93, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x8C, /* 0x2C-0x2F */ + 0x98, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x94, 0xBA, 0x00, 0x00, 0x97, 
0xE0, 0x00, 0x00, /* 0x34-0x37 */ + 0x90, 0x4C, 0xED, 0x51, 0x8E, 0x66, 0x00, 0x00, /* 0x38-0x3B */ + 0x8E, 0x97, 0x89, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xCF, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x41, 0x98, 0xC8, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x88, 0xCA, 0x92, 0xE1, 0x8F, 0x5A, /* 0x4C-0x4F */ + 0x8D, 0xB2, 0x97, 0x43, 0x00, 0x00, 0x91, 0xCC, /* 0x50-0x53 */ + 0x00, 0x00, 0x89, 0xBD, 0xED, 0x52, 0x98, 0xC7, /* 0x54-0x57 */ + 0x00, 0x00, 0x97, 0x5D, 0x98, 0xC3, 0x98, 0xC5, /* 0x58-0x5B */ + 0x8D, 0xEC, 0x98, 0xC6, 0x9B, 0x43, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x98, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xD1, /* 0x6C-0x6F */ + 0x98, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC0, /* 0x70-0x73 */ + 0x00, 0x00, 0x95, 0xB9, 0x98, 0xC9, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0xCD, /* 0x78-0x7B */ + 0x8C, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x67, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA4, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xD2, 0x00, 0x00, /* 0x84-0x87 */ + 0x98, 0xCA, 0x00, 0x00, 0xED, 0x54, 0x97, 0xE1, /* 0x88-0x8B */ + 0x00, 0x00, 0x8E, 0x98, 0x00, 0x00, 0x98, 0xCB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x98, 0xD0, 0xED, 0x53, 0x00, 0x00, /* 0x90-0x93 */ + 0xED, 0x56, 0x00, 0x00, 0x98, 0xD3, 0x00, 0x00, /* 0x94-0x97 */ + 0x98, 0xCC, 0x00, 0x00, 0xED, 0x55, 0x8B, 0x9F, /* 0x98-0x9B */ + 0x00, 0x00, 0x88, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8B, 0xA0, 0x89, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x44, /* 0xA8-0xAB */ + 0x00, 0x00, 0x96, 0x99, 0x95, 0x8E, 0x8C, 0xF2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x90, 0x4E, 0x97, 0xB5, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xD6, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x57, 0x91, 0xA3, /* 0xC0-0xC3 */ + 0x89, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xED, 0x45, 0x8F, 0x72, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xED, 0x57, 0x98, 0xD7, 0x00, 0x00, /* 0xCC-0xCF */ + 0x98, 0xDC, 0x98, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x98, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x91, 0xAD, /* 0xD4-0xD7 */ + 0x98, 0xD8, 0x00, 0x00, 0x98, 0xDB, 0x98, 0xD9, /* 0xD8-0xDB */ + 0x00, 0x00, 0x95, 0xDB, 0x00, 0x00, 0x98, 0xD6, /* 0xDC-0xDF */ + 0x00, 0x00, 0x90, 0x4D, 0x00, 0x00, 0x96, 0x93, /* 0xE0-0xE3 */ + 0x98, 0xDD, 0x98, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x43, 0x98, 0xEB, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x6F, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x95, 0x55, 0x98, 0xE6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x95, 0xEE, 0x00, 0x00, 0x89, 0xB4, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xEA, 0xED, 0x5A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_50[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x98, 0xE4, 0x98, 0xED, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x91, 0x71, 0x00, 
0x00, 0x8C, 0xC2, /* 0x08-0x0B */ + 0x00, 0x00, 0x94, 0x7B, 0x00, 0x00, 0xE0, 0xC5, /* 0x0C-0x0F */ + 0x00, 0x00, 0x98, 0xEC, 0x93, 0x7C, 0x00, 0x00, /* 0x10-0x13 */ + 0x98, 0xE1, 0x00, 0x00, 0x8C, 0xF4, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x8C, 0xF3, 0x98, 0xDF, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x5B, 0x8E, 0xD8, /* 0x1C-0x1F */ + 0x00, 0x00, 0x98, 0xE7, 0xED, 0x59, 0x95, 0xED, /* 0x20-0x23 */ + 0x92, 0x6C, 0x98, 0xE3, 0x8C, 0x91, 0x00, 0x00, /* 0x24-0x27 */ + 0x98, 0xE0, 0x98, 0xE8, 0x98, 0xE2, 0x97, 0xCF, /* 0x28-0x2B */ + 0x98, 0xE9, 0x98, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE4, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8C, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xED, 0x58, 0x00, 0x00, 0xED, 0x5E, 0x98, 0xEE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x5C, 0x98, 0xEF, /* 0x44-0x47 */ + 0x98, 0xF3, 0x88, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xCE, /* 0x4C-0x4F */ + 0x98, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x98, 0xF1, 0x98, 0xF5, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xF4, 0x00, 0x00, /* 0x58-0x5B */ + 0x92, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x8C, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x98, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xED, 0x5D, 0x00, 0x00, 0x8E, 0xC3, 0x00, 0x00, /* 0x70-0x73 */ + 0x91, 0xA4, 0x92, 0xE3, 0x8B, 0xF4, 0x00, 0x00, /* 0x74-0x77 */ + 0x98, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8B, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x98, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x98, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x96, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8C, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xED, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x8E, 0x50, 0x94, 0xF5, 0x98, 0xF9, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x8D, 0xC3, 0x97, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0xFC, 0x99, 0x42, /* 0xB0-0xB3 */ + 0x98, 0xFB, 0x8D, 0xC2, 0x00, 0x00, 0x8F, 0x9D, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x58, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x43, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8B, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x99, 0x40, 0x99, 0x41, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x93, 0xAD, 0x00, 0x00, 0x91, 0x9C, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8B, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x96, 0x6C, 0x99, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xED, 0x61, 0x00, 0x00, 0x97, 0xBB, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x45, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x99, 0x48, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x99, 0x46, 0x00, 0x00, 0x91, 0x6D, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x99, 0x47, 0x99, 0x49, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xED, 0x60, 0x99, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x99, 0x4A, 0x00, 0x00, 0x95, 0xC6, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_51[512] = { + 0x8B, 0x56, 0x99, 0x4D, 0x99, 0x4E, 0x00, 0x00, /* 0x00-0x03 */ + 0x89, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x99, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xF2, 0x00, 0x00, /* 0x10-0x13 */ + 0x99, 0x51, 0x99, 0x50, 0x99, 0x4F, 0x00, 0x00, /* 0x14-0x17 */ + 0x98, 0xD4, 0x00, 0x00, 0x99, 0x52, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x9E, /* 0x1C-0x1F */ + 0x00, 0x00, 0x99, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x44, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xD7, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x55, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x54, 0x99, 0x57, /* 0x38-0x3B */ + 0x99, 0x56, 0x00, 0x00, 0x00, 0x00, 0x99, 0x58, /* 0x3C-0x3F */ + 0x99, 0x59, 0x88, 0xF2, 0x00, 0x00, 0x8C, 0xB3, /* 0x40-0x43 */ + 0x8C, 0x5A, 0x8F, 0x5B, 0x92, 0x9B, 0x8B, 0xA2, /* 0x44-0x47 */ + 0x90, 0xE6, 0x8C, 0xF5, 0xED, 0x62, 0x8D, 0x8E, /* 0x48-0x4B */ + 0x99, 0x5B, 0x96, 0xC6, 0x93, 0x65, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8E, 0x99, 0x00, 0x00, 0x99, 0x5A, 0x00, 0x00, /* 0x50-0x53 */ + 0x99, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x7D, 0x00, 0x00, /* 0x58-0x5B */ + 0x8A, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x5D, 0x00, 0x00, /* 0x60-0x63 */ + 0xED, 0x63, 0x93, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x91, 0x53, 0x99, 0x5F, 0x99, 0x60, 0x94, 0xAA, /* 0x68-0x6B */ + 0x8C, 0xF6, 0x98, 0x5A, 0x99, 0x61, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x8B, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x95, 0xBA, 0x91, 0xB4, 0x8B, 0xEF, /* 0x74-0x77 */ + 0x93, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x8C, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x99, 0x62, 0x00, 0x00, 0x99, 0x63, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x93, 0xE0, 0x89, 0x7E, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x99, 0x66, 0x8D, 0xFB, 0x00, 0x00, /* 0x88-0x8B */ + 0x99, 0x65, 0x8D, 0xC4, 0x00, 0x00, 0x99, 0x67, /* 0x8C-0x8F */ + 0xE3, 0xEC, 0x99, 0x68, 0x96, 0x60, 0x99, 0x69, /* 0x90-0x93 */ + 0x00, 0x00, 0x99, 0x6A, 0x99, 0x6B, 0x8F, 0xE7, /* 0x94-0x97 */ + 0x00, 0x00, 0x8E, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xED, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8A, 0xA5, 0x00, 0x00, 0x99, 0x6E, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x99, 0x6C, 0x96, 0xBB, 0x99, 0x6D, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x95, 0x79, 0x99, 0x6F, 0x99, 0x70, 0x99, 0x71, /* 0xA8-0xAB */ + 0x93, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x99, 0x75, 0x99, 0x73, 0x99, 0x74, 0x99, 0x72, /* 0xB0-0xB3 */ + 0x8D, 0xE1, 0x99, 0x76, 0x96, 0xE8, 0x97, 0xE2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x99, 0x77, 0xED, 0x65, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x90, 0xA6, 0x99, 0x78, 0x8F, 0x79, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x99, 0x79, 0x00, 0x00, 0x92, 0x9C, /* 0xC8-0xCB */ + 0x97, 0xBD, 0x93, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xC3, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x7A, /* 0xD8-0xDB */ + 0xEA, 0xA3, 0x8B, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x99, 0x7B, 0x96, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x88, 0x91, 0xFA, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x99, 0x7D, 0x93, 0xE2, 0x00, 0x00, /* 0xE8-0xEB */ + 0xED, 0x66, 0x99, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x99, 0x80, 0x8A, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x99, 0x81, 0x8B, 0xA5, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x93, 0xCA, 0x89, 0x9A, 0x8F, 0x6F, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x94, 0x9F, 0x99, 0x82, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0x93, 0x81, 0x00, 0x00, 0x00, 0x00, 0x90, 0x6E, /* 0x00-0x03 */ + 0x99, 0x83, 0x00, 0x00, 0x95, 0xAA, 0x90, 0xD8, /* 0x04-0x07 */ + 0x8A, 0xA0, 0x00, 0x00, 0x8A, 0xA7, 0x99, 0x84, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x86, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8C, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x99, 0x85, 0xED, 0x67, 0x00, 0x00, 0x97, 0xF1, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x8F, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x94, 0xBB, 0x95, 0xCA, 0x00, 0x00, 0x99, 0x87, /* 0x24-0x27 */ + 0x00, 0x00, 0x97, 0x98, 0x99, 0x88, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x89, 0x00, 0x00, /* 0x2C-0x2F */ + 0x93, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x99, 0x8A, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xA7, 0x8D, 0xFC, /* 0x34-0x37 */ + 0x8C, 0x94, 0x99, 0x8B, 0x8E, 0x68, 0x8D, 0x8F, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xE4, /* 0x40-0x43 */ + 0x99, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x91, 0xA5, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xED, 0x99, 0x8E, /* 0x48-0x4B */ + 0x99, 0x8F, 0x91, 0x4F, 0x00, 0x00, 0x99, 0x8C, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x99, 0x91, 0x00, 0x00, 0x96, 0x55, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x84, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0x90, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x95, /* 0x60-0x63 */ + 0x8D, 0xDC, 0x94, 0x8D, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x99, 0x94, 0x99, 0x92, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x9B, /* 0x6C-0x6F */ + 0x8F, 0xE8, 0x99, 0x9B, 0x8A, 0x84, 0x99, 0x95, /* 0x70-0x73 */ + 0x99, 0x93, 0x91, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x99, 0x97, 0x00, 0x00, 0x99, 0x96, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x63, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x80, /* 0x84-0x87 */ + 0x99, 0x9C, 0x97, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x99, 
0x98, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x99, 0x9D, 0x99, 0x9A, 0x00, 0x00, /* 0x90-0x93 */ + 0x99, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xCD, /* 0x98-0x9B */ + 0xED, 0x68, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xF7, /* 0x9C-0x9F */ + 0x89, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x97, 0xF2, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x69, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8F, 0x95, 0x93, 0x77, 0x8D, 0x85, /* 0xA8-0xAB */ + 0x99, 0xA0, 0x99, 0xA1, 0x00, 0x00, 0xEE, 0x5B, /* 0xAC-0xAF */ + 0x00, 0x00, 0x97, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x98, 0x4A, 0x99, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x8C, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x99, 0xA2, 0x00, 0x00, 0x8A, 0x4E, 0x00, 0x00, /* 0xBC-0xBF */ + 0xED, 0x6A, 0x99, 0xA4, 0x00, 0x00, 0x96, 0x75, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x92, 0xBA, 0x00, 0x00, 0x97, 0x45, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x95, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x99, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD3, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x93, 0xAE, 0x00, 0x00, 0x99, 0xA6, /* 0xD4-0xD7 */ + 0x8A, 0xA8, 0x96, 0xB1, 0x00, 0x00, 0xED, 0x6B, /* 0xD8-0xDB */ + 0x00, 0x00, 0x8F, 0x9F, 0x99, 0xA7, 0x95, 0xE5, /* 0xDC-0xDF */ + 0x99, 0xAB, 0x00, 0x00, 0x90, 0xA8, 0x99, 0xA8, /* 0xE0-0xE3 */ + 0x8B, 0xCE, 0x00, 0x00, 0x99, 0xA9, 0x8A, 0xA9, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x4D, 0x99, 0xAC, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x99, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x99, 0xAE, 0x99, 0xAF, 0x8E, 0xD9, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xF9, 0x96, 0xDC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0xED, 0x6C, 0x96, 0xE6, 0x93, 0xF5, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x95, 0xEF, 0x99, 0xB0, 0xED, 0x6D, /* 0x04-0x07 */ + 0x99, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x99, 0xB3, 0x00, 0x00, 0x99, 0xB5, /* 0x0C-0x0F */ + 0x99, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x99, 0xB6, 0x89, 0xBB, 0x96, 0x6B, /* 0x14-0x17 */ + 0x00, 0x00, 0x8D, 0xFA, 0x99, 0xB7, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x91, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x8F, 0xA0, 0x8B, 0xA7, 0x00, 0x00, 0x99, 0xB8, /* 0x20-0x23 */ + 0xED, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xD9, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xB9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x99, 0xBA, 0x00, 0x00, 0x99, 0xBB, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x99, 0xBC, 0x95, 0x43, 0x8B, 0xE6, 0x88, 0xE3, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBD, /* 0x3C-0x3F */ + 0x99, 0xBD, 0x8F, 0x5C, 0x00, 0x00, 0x90, 0xE7, /* 0x40-0x43 */ + 0x00, 0x00, 0x99, 0xBF, 0x99, 0xBE, 0x8F, 0xA1, /* 0x44-0x47 */ + 0x8C, 0xDF, 0x99, 0xC1, 0x94, 0xBC, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x99, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x94, 0xDA, 0x91, 0xB2, 0x91, 0xEC, /* 0x50-0x53 */ + 0x8B, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x93, 0xEC, /* 0x54-0x57 */ + 0x92, 0x50, 0x00, 0x00, 0x94, 0x8E, 0x00, 0x00, /* 0x58-0x5B */ + 0x96, 0x6D, 0x00, 0x00, 0x99, 0xC4, 0x00, 0x00, /* 0x5C-0x5F */ + 0x90, 0xE8, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x54, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x99, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xC6, 0x89, 0x4B, /* 0x6C-0x6F */ + 0x88, 0xF3, 0x8A, 0xEB, 0xED, 0x6F, 0x91, 0xA6, /* 0x70-0x73 */ + 0x8B, 0x70, 0x97, 0x91, 0x00, 0x00, 0x99, 0xC9, /* 0x74-0x77 */ + 0x89, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x99, 0xC8, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xA8, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x99, 0xCA, 0x00, 0x00, /* 0x80-0x83 */ + 0x96, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0x70, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xCB, 0x00, 0x00, /* 0x94-0x97 */ + 0x97, 0xD0, 0x00, 0x00, 0x8C, 0xFA, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xB4, /* 0x9C-0x9F */ + 0x99, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x99, 0xCE, 0x99, 0xCD, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x90, 0x7E, 0x89, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x89, 0x7D, 0x99, 0xCF, 0x00, 0x00, /* 0xAC-0xAF */ + 0x99, 0xD0, 0x00, 0x00, 0xED, 0x71, 0x8C, 0xB5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xD1, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8E, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x51, 0x99, 0xD2, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x96, 0x94, 0x8D, 0xB3, 0x8B, 0x79, 0x97, 0x46, /* 0xC8-0xCB */ + 0x91, 0x6F, 0x94, 0xBD, 0x8E, 0xFB, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x8F, 0x66, 0x00, 0x00, 0x8E, 0xE6, 0x8E, 0xF3, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x8F, 0x96, 0x00, 0x00, 0x94, 0xBE, /* 0xD8-0xDB */ + 0x00, 0x00, 0xED, 0x72, 0x00, 0x00, 0x99, 0xD5, /* 0xDC-0xDF */ + 0x00, 0x00, 0x89, 0x62, 0x91, 0x70, 0x8C, 0xFB, /* 0xE0-0xE3 */ + 0x8C, 0xC3, 0x8B, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x99, 0xD9, 0x92, 0x40, 0x91, 0xFC, 0x8B, 0xA9, /* 0xE8-0xEB */ + 0x8F, 0xA2, 0x99, 0xDA, 0x99, 0xD8, 0x89, 0xC2, /* 0xEC-0xEF */ + 0x91, 0xE4, 0x8E, 0xB6, 0x8E, 0x6A, 0x89, 0x45, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x90, 0x8D, 0x86, /* 0xF4-0xF7 */ + 0x8E, 0x69, 0x00, 0x00, 0x99, 0xDB, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_54[512] = { + 0x00, 0x00, 0x99, 0xDC, 0x00, 0x00, 0x8B, 0x68, /* 0x00-0x03 */ + 0x8A, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x8D, 0x87, 0x8B, 0x67, 0x92, 0xDD, 0x89, 0x44, /* 0x08-0x0B */ + 0x93, 0xAF, 0x96, 0xBC, 0x8D, 0x40, 0x97, 0x99, /* 0x0C-0x0F */ + 0x93, 0x66, 0x8C, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x4E, /* 0x18-0x1B */ + 0x00, 0x00, 0x99, 0xE5, 0x00, 0x00, 0x8B, 0xE1, /* 0x1C-0x1F */ + 0x96, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xDB, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x99, 0xE4, 0x00, 0x00, 0x8A, 0xDC, /* 0x28-0x2B */ + 0x99, 0xDF, 0x99, 0xE0, 0x99, 0xE2, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xE3, 0x00, 0x00, /* 0x34-0x37 */ + 0x8B, 0x7A, 0x90, 0x81, 
0x00, 0x00, 0x95, 0xAB, /* 0x38-0x3B */ + 0x99, 0xE1, 0x99, 0xDD, 0x8C, 0xE1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x99, 0xDE, 0x00, 0x00, 0x98, 0x43, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xF0, 0x00, 0x00, /* 0x44-0x47 */ + 0x92, 0xE6, 0x8C, 0xE0, 0x8D, 0x90, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xE6, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x93, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xEA, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x8E, 0xFC, 0x00, 0x00, 0x8E, 0xF4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x99, 0xED, 0x99, 0xEB, 0x00, 0x00, 0x96, 0xA1, /* 0x70-0x73 */ + 0x00, 0x00, 0x99, 0xE8, 0x99, 0xF1, 0x99, 0xEC, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0xEF, /* 0x78-0x7B */ + 0x8C, 0xC4, 0x96, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x99, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x99, 0xF2, 0x00, 0x00, 0x99, 0xF4, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x75, 0x8D, 0xEE, /* 0x88-0x8B */ + 0x98, 0x61, 0x00, 0x00, 0x99, 0xE9, 0x99, 0xE7, /* 0x8C-0x8F */ + 0x99, 0xF3, 0x00, 0x00, 0x99, 0xEE, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xED, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x99, 0xF6, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x9A, 0x42, 0x99, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x99, 0xFC, 0xED, 0x76, 0x00, 0x00, 0x9A, 0x40, /* 0xA8-0xAB */ + 0x99, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x5D, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE7, 0x8A, 0x50, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x99, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9A, 0x44, 0x88, 0xF4, 0x9A, 0x43, 0x00, 0x00, /* 0xBC-0xBF */ + 0x88, 0xA3, 0x95, 0x69, 0x9A, 0x41, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x99, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x99, 0xF5, /* 0xC4-0xC7 */ + 0x99, 0xFB, 0x8D, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x9A, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x88, 0xF5, 0x9A, 0x4E, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x9A, 0x46, 0x9A, 0x47, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x8F, 0xA3, 0x96, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x9A, 0x4C, 0x9A, 0x4B, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x4E, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x4D, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9A, 0x4A, 0x00, 0x00, 0xED, 0x77, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_55[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x89, 0x53, 0x00, 0x00, 0x8D, 0xB4, 0x90, 0x4F, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x9A, 0x48, /* 0x0C-0x0F */ + 0x93, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x9A, 0x49, 0x00, 0x00, 0x88, 0xA0, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x53, 0x97, 0x42, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0xA5, 0x00, 0x00, 0x9A, 0x59, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9A, 0x58, 0x9A, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xC1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9A, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x91, 0xED, 0x9A, 0x55, 0x8F, 0xA4, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x9A, 0x52, 0x00, 0x00, 0x00, 0x00, 0x96, 0xE2, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x5B, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x56, 0x9A, 0x57, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x9A, 0x54, 0x9A, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x51, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x60, /* 0x78-0x7B */ + 0x9A, 0x65, 0x00, 0x00, 0x9A, 0x61, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9A, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x66, /* 0x80-0x83 */ + 0x91, 0x50, 0x00, 0x00, 0xED, 0x78, 0x9A, 0x68, /* 0x84-0x87 */ + 0x00, 0x00, 0x8D, 0x41, 0x9A, 0x5E, 0x92, 0x9D, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x9A, 0x62, 0x9A, 0x5B, 0x8A, 0xAB, 0x00, 0x00, /* 0x98-0x9B */ + 0x8A, 0xEC, 0x8A, 0x85, 0x9A, 0x63, 0x9A, 0x5F, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x96, /* 0xA4-0xA7 */ + 0x9A, 0x69, 0x9A, 0x67, 0x91, 0x72, 0x8B, 0x69, /* 0xA8-0xAB */ + 0x8B, 0xAA, 0x00, 0x00, 0x9A, 0x64, 0x00, 0x00, /* 0xAC-0xAF */ + 0x8B, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x63, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9A, 0x6D, 0x9A, 0x6B, 0x00, 0x00, 0x9A, 0xA5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x9A, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x6A, 0x00, 0x00, /* 0xD8-0xDB */ + 0x9A, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x6C, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x6B, /* 0xE0-0xE3 */ + 0x9A, 0x6F, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x72, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9A, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9A, 0x75, 0x9A, 0x74, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x51, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x89, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x9A, 0x71, 0x00, 0x00, 0x9A, 0x73, 0x8F, 0xA6, /* 0x14-0x17 */ + 0x89, 0x52, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x76, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x89, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x82, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0xFA, 0x9A, 0x7D, 0x00, 0x00, /* 0x30-0x33 */ + 0x9A, 0x7B, 0x00, 0x00, 0x9A, 0x7C, 0x00, 0x00, /* 0x34-0x37 */ + 0x9A, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x5C, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x91, 0x58, 0x00, 0x00, 0x9A, 0x78, 0x00, 0x00, /* 0x4C-0x4F */ + 0x9A, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x9A, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x9A, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x8A, 0xED, 0x00, 0x00, 0x9A, 0x84, 0x9A, 0x80, /* 0x68-0x6B */ + 0x9A, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x95, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x93, 0xD3, 0x00, 0x00, 0x94, 0xB6, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9A, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x85, 0x8A, 0x64, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x87, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x8A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x9A, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9A, 0x88, 0x00, 0x00, 0x94, 0x58, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x9A, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x8C, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x9A, 0x8E, 0x00, 0x00, 0x9A, 0x8D, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9A, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x9A, 0x93, 0x9A, 0x91, 0x9A, 0x8F, 0x9A, 0x92, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x9A, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x95, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x9A, 0x96, 0x00, 0x00, 0x9A, 0x97, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x98, /* 0xD4-0xD7 */ + 0x99, 0x64, 0x00, 0x00, 0x8E, 0xFA, 0x8E, 0x6C, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xF1, 0x00, 0x00, /* 0xDC-0xDF */ + 0x88, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x92, 0x63, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0x99, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8D, 0xA2, 0x00, 0x00, 0x88, 0xCD, 0x90, 0x7D, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9A, 0x9A, 0x8C, 0xC5, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x8D, 0x91, 0x00, 0x00, 0x9A, 0x9C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_57[512] = { + 0x9A, 0x9B, 0x00, 0x00, 0x00, 0x00, 0x95, 0xDE, /* 0x00-0x03 */ + 0x9A, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9A, 0x9F, 0x9A, 0x9E, 0x00, 0x00, 0x9A, 0xA0, /* 0x08-0x0B */ + 0x00, 0x00, 0x9A, 0xA1, 0x00, 0x00, 0x8C, 0x97, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x80, 0x9A, 0xA2, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA4, 0x00, 0x00, /* 0x14-0x17 */ + 0x9A, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9A, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x93, 0x79, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA7, 0x88, 0xB3, /* 0x24-0x27 */ + 0x8D, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x8C, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x92, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xA8, /* 0x34-0x37 */ + 0x9A, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xAB, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9A, 0xAC, 0x00, 0x00, 0x8D, 0xE2, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xCF, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x56, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xAA, 0x9A, 0xAD, /* 0x4C-0x4F */ + 0x8D, 0xBF, 0x8D, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xED, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x9A, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x8D, 0xA3, 0xED, 0x7A, 0x92, 0x52, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x9A, 0xAE, 0x92, 0xD8, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x90, 0x82, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x9A, 0xB0, 0x9A, 0xB3, 0x00, 0x00, 0x8C, 0x5E, /* 0x88-0x8B */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB4, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9A, 0xB5, 0x00, 0x00, 0x8D, 0x43, 0x8A, 0x5F, /* 0xA0-0xA3 */ + 0x9A, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */ + 0xED, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x9A, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xB6, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x9A, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBA, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBB, 0xED, 0x7D, /* 0xC4-0xC7 */ + 0xED, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x96, 0x84, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xE9, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBD, 0x9A, 0xBE, /* 0xD0-0xD3 */ + 0x9A, 0xBC, 0x00, 0x00, 0x9A, 0xC0, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x94, 0x57, 0x00, 0x00, 0x00, 0x00, 0x88, 0xE6, /* 0xDC-0xDF */ + 0x95, 0x75, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xC1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x8F, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xB7, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x94, 0x7C, 0x8A, 0xEE, 0x00, 0x00, /* 0xF8-0xFB */ + 0x8D, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0x96, 0x78, 0x00, 0x00, 0x93, 0xB0, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x8C, 0x98, 0x91, 0xCD, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xBF, 0x9A, 0xC2, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x91, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9A, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x9A, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x9A, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x92, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xAC, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9F, /* 0x2C-0x2F */ + 0x89, 0x81, 0x95, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x8F, 0xEA, 0x93, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE4, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x9A, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x95, 0xBB, 0x97, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xF2, 0x9A, 0xC8, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x91, 0x59, 0x9A, 0xCB, 0x00, 0x00, /* 0x50-0x53 */ + 0x93, 0x83, 0x00, 0x00, 0x00, 0x00, 0x93, 0x68, /* 0x54-0x57 */ + 0x93, 0x84, 0x94, 0xB7, 0x92, 0xCB, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xC7, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 
0x00, 0x9A, 0xC7, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x89, 0x96, 0x00, 0x00, 0x93, 0x55, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x9A, 0xC9, 0x00, 0x00, 0x9A, 0xC5, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x90, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x9A, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6D, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xAB, /* 0x80-0x83 */ + 0x00, 0x00, 0x9A, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xE6, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x9D, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x92, 0xC4, 0x00, 0x00, 0xED, 0x81, 0x9A, 0xD0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x96, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9A, 0xD1, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xD6, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x82, 0x95, 0xAD, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9A, 0xD5, 0x9A, 0xCF, 0x9A, 0xD2, 0x9A, 0xD4, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA4, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x95, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9A, 0xD7, 0x00, 0x00, 0x92, 0x64, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xF3, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8F, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x9A, 0xD9, 0x00, 0x00, 0x9A, 0xD8, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x8D, 0x88, 0x00, 0x00, 0x9A, 0xDA, /* 0xD4-0xD7 */ + 0x9A, 0xDC, 0x9A, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x9A, 0xDE, 0x00, 0x00, 0x9A, 0xD3, 0x9A, 0xE0, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9A, 0xDF, 0x9A, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x6D, /* 0xE8-0xEB */ + 0x90, 0x70, 0x00, 0x00, 0x91, 0x73, 0x9A, 0xE1, /* 0xEC-0xEF */ + 0x90, 0xBA, 0x88, 0xEB, 0x94, 0x84, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xD9, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9A, 0xE3, 0x9A, 0xE2, 0x9A, 0xE4, /* 0xF8-0xFB */ + 0x9A, 0xE5, 0x9A, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xE7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x95, 0xCF, 0x9A, 0xE8, 0xED, 0x83, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC4, /* 0x0C-0x0F */ + 0x9A, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x97, 0x5B, 0x8A, 0x4F, 0x00, 0x00, /* 0x14-0x17 */ + 0x99, 0xC7, 0x8F, 0x67, 0x91, 0xBD, 0x9A, 0xEA, /* 0x18-0x1B */ + 0x96, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xB2, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x9A, 0xEC, 0x00, 0x00, 0x91, 0xE5, /* 0x24-0x27 */ + 0x00, 0x00, 0x93, 0x56, 0x91, 0xBE, 0x95, 0x76, /* 0x28-0x2B */ + 0x9A, 0xED, 0x9A, 0xEE, 0x89, 0x9B, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8E, 0xB8, 0x9A, 0xEF, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x88, 0xCE, /* 0x34-0x37 */ + 0x9A, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xF1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x89, 0x82, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xEF, /* 0x44-0x47 */ + 0x93, 0xDE, 0x95, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xF5, 0x91, 0x74, /* 0x4C-0x4F */ + 0x9A, 0xF4, 0x8C, 0x5F, 0x00, 0x00, 0xED, 0x84, /* 0x50-0x53 */ + 0x96, 0x7A, 0x9A, 0xF3, 0x00, 0x00, 0x93, 0x85, /* 0x54-0x57 */ + 0x9A, 0xF7, 0x00, 0x00, 0x9A, 0xF6, 0xED, 0x85, /* 0x58-0x5B */ + 0x00, 0x00, 0xED, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x9A, 0xF9, 0x00, 0x00, 0x9A, 0xF8, 0xED, 0x87, /* 0x60-0x63 */ + 0x00, 0x00, 0x89, 0x9C, 0x00, 0x00, 0x9A, 0xFA, /* 0x64-0x67 */ + 0x8F, 0xA7, 0x9A, 0xFC, 0x92, 0x44, 0x00, 0x00, /* 0x68-0x6B */ + 0x9A, 0xFB, 0x00, 0x00, 0x95, 0xB1, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x97, /* 0x70-0x73 */ + 0x93, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9B, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8D, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9B, 0x41, 0x94, 0x40, 0x94, 0xDC, /* 0x80-0x83 */ + 0x96, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x44, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x9B, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x57, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x64, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x96, 0xAD, 0x00, 0x00, 0x9B, 0xAA, /* 0x98-0x9B */ + 0x00, 0x00, 0x9B, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x45, /* 0xA0-0xA3 */ + 0xED, 0x88, 0x91, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x96, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x93, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x46, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x96, 0x85, 0xED, 0x89, 0x8D, 0xC8, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xA8, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x47, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x8E, 0x6F, 0x00, 0x00, 0x8E, 0x6E, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x88, 0xB7, 0x8C, 0xC6, 0x00, 0x00, 0x90, 0xA9, /* 0xD0-0xD3 */ + 0x88, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9B, 0x4B, 0x9B, 0x4C, 0x00, 0x00, /* 0xD8-0xDB */ + 0x9B, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x89, 0x57, 0x8A, 0xAD, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x9B, 0x48, 0x00, 0x00, 0x96, 0xC3, 0x95, 0x50, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xA6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xF7, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x70, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0x00, 0x00, 0x88, 0xD0, 0x00, 0x00, 0x88, 0xA1, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x9B, 
0x51, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9B, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x96, 0xBA, 0x00, 0x00, 0x9B, 0x52, 0x00, 0x00, /* 0x18-0x1B */ + 0x9B, 0x50, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x4E, /* 0x1C-0x1F */ + 0x90, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x9B, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x95, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xE2, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x9B, 0x56, 0x9B, 0x57, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x8F, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9B, 0x53, 0x98, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x6B, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x9B, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA5, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x58, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x77, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x59, 0x00, 0x00, /* 0x68-0x6B */ + 0x9B, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB9, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x7D, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x5A, 0x95, 0x51, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9B, 0x5B, 0x9B, 0x5F, 0x9B, 0x5C, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x89, 0xC5, 0x9B, 0x5E, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x8E, 0xB9, 0x00, 0x00, 0x9B, 0x5D, /* 0xC8-0xCB */ + 0x8C, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x9B, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x64, 0x9B, 0x61, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 
0x00, 0x92, 0x84, 0x00, 0x00, 0x9B, 0x60, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x62, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9B, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x65, 0x9B, 0x66, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_5B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x8A, 0xF0, 0x00, 0x00, 0x9B, 0x68, /* 0x08-0x0B */ + 0x9B, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x69, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xEC, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6C, 0x00, 0x00, /* 0x28-0x2B */ + 0x92, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x89, 0x64, 0x00, 0x00, 0x9B, 0x6A, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6E, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9B, 0x71, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x6F, /* 0x40-0x43 */ + 0x00, 0x00, 0x9B, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8E, 0x71, 0x9B, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x8D, 0x45, 0x9B, 0x73, 0xED, 0x8A, 0x8E, 0x9A, /* 0x54-0x57 */ + 0x91, 0xB6, 0x00, 0x00, 0x9B, 0x74, 0x9B, 0x75, /* 0x58-0x5B */ + 0x8E, 0x79, 0x8D, 0x46, 0x00, 0x00, 0x96, 0xD0, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x47, /* 0x60-0x63 */ + 0x8C, 0xC7, 0x9B, 0x76, 0x8A, 0x77, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x9B, 0x77, 0x00, 0x00, 0x91, 0xB7, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x9B, 0x78, 0x9B, 0xA1, 0x00, 0x00, 0x9B, 0x79, /* 0x70-0x73 */ + 0x00, 0x00, 0x9B, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9B, 0x7B, 0x00, 0x00, 0x9B, 0x7D, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9B, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x80, /* 0x80-0x83 */ + 0x00, 0x00, 0x91, 0xEE, 0x00, 0x00, 0x89, 0x46, /* 0x84-0x87 */ + 0x8E, 0xE7, 0x88, 0xC0, 0x00, 0x00, 0x91, 0x76, /* 0x88-0x8B */ + 0x8A, 0xAE, 0x8E, 0xB3, 0x00, 0x00, 0x8D, 0x47, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x93, 0x86, 0x00, 0x00, 0x8F, 0x40, /* 0x94-0x97 */ + 0x8A, 0xAF, 0x92, 0x88, 0x92, 0xE8, 0x88, 0xB6, /* 0x98-0x9B */ + 0x8B, 0x58, 0x95, 0xF3, 0x00, 0x00, 0x8E, 0xC0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x71, 0x90, 0xE9, /* 0xA0-0xA3 */ + 0x8E, 0xBA, 0x97, 0x47, 0x9B, 0x81, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x7B, 0x00, 0x00, /* 0xAC-0xAF */ + 0x8D, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x51, /* 0xB0-0xB3 */ + 0x89, 0x83, 0x8F, 0xAA, 0x89, 0xC6, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9B, 
0x82, 0x97, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x68, /* 0xBC-0xBF */ + 0xED, 0x8B, 0x00, 0x00, 0x8E, 0xE2, 0x9B, 0x83, /* 0xC0-0xC3 */ + 0x8A, 0xF1, 0x93, 0xD0, 0x96, 0xA7, 0x9B, 0x84, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9B, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x95, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x9B, 0x87, 0x00, 0x00, 0x8A, 0xA6, 0x8B, 0xF5, /* 0xD0-0xD3 */ + 0x9B, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xED, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB0, /* 0xD8-0xDB */ + 0x00, 0x00, 0x90, 0x51, 0x9B, 0x8B, 0x8E, 0x40, /* 0xDC-0xDF */ + 0x00, 0x00, 0x89, 0xC7, 0x9B, 0x8A, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9B, 0x88, 0x9B, 0x8C, 0x9B, 0x89, 0x94, 0x4A, /* 0xE4-0xE7 */ + 0x9E, 0xCB, 0x90, 0x52, 0x00, 0x00, 0x9B, 0x8D, /* 0xE8-0xEB */ + 0xED, 0x8E, 0x00, 0x00, 0x97, 0xBE, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9B, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x9B, 0x90, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x92, 0x9E, 0x9B, 0x8F, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x90, 0xA1, 0x00, 0x00, 0x8E, 0x9B, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xCE, 0x8E, 0xF5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5C[512] = { + 0x00, 0x00, 0x95, 0x95, 0x90, 0xEA, 0x00, 0x00, /* 0x00-0x03 */ + 0x8E, 0xCB, 0x9B, 0x91, 0x8F, 0xAB, 0x9B, 0x92, /* 0x04-0x07 */ + 0x9B, 0x93, 0x88, 0xD1, 0x91, 0xB8, 0x90, 0x71, /* 0x08-0x0B */ + 0x00, 0x00, 0x9B, 0x94, 0x93, 0xB1, 0x8F, 0xAC, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8F, 0xAD, 0x00, 0x00, 0x9B, 0x95, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xEB, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xAE, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x8F, 0x00, 0x00, /* 0x1C-0x1F */ + 0x9B, 0x96, 0x00, 0x00, 0x9B, 0x97, 0x00, 0x00, /* 0x20-0x23 */ + 0x96, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x9B, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x8B, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9B, 0x99, 0x9B, 0x9A, 0x8E, 0xDA, 0x90, 0x4B, /* 0x38-0x3B */ + 0x93, 0xF2, 0x90, 0x73, 0x94, 0xF6, 0x94, 0x41, /* 0x3C-0x3F */ + 0x8B, 0xC7, 0x9B, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x8B, 0x8F, 0x9B, 0x9C, 0x00, 0x00, /* 0x44-0x47 */ + 0x8B, 0xFC, 0x00, 0x00, 0x93, 0xCD, 0x89, 0xAE, /* 0x48-0x4B */ + 0x00, 0x00, 0x8E, 0x72, 0x9B, 0x9D, 0x9B, 0xA0, /* 0x4C-0x4F */ + 0x9B, 0x9F, 0x8B, 0xFB, 0x00, 0x00, 0x9B, 0x9E, /* 0x50-0x53 */ + 0x00, 0x00, 0x93, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xAE, 0x00, 0x00, /* 0x5C-0x5F */ + 0x93, 0x6A, 0x8E, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x91, 0x77, 0x97, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x9B, 0xA2, 0x00, 0x00, 0x9B, 0xA3, 0x93, 0xD4, /* 0x6C-0x6F */ + 0x00, 0x00, 0x8E, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xA5, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x9B, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x9B, 
0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8A, 0xF2, 0x9B, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x9B, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x89, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x90, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x91, 0x5A, 0x8A, 0xE2, 0x00, 0x00, 0x9B, 0xAB, /* 0xA8-0xAB */ + 0x96, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x91, 0xD0, 0x00, 0x00, 0x8A, 0x78, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xAD, 0x9B, 0xAF, /* 0xB4-0xB7 */ + 0x8A, 0xDD, 0x00, 0x00, 0xED, 0x91, 0x9B, 0xAC, /* 0xB8-0xBB */ + 0x9B, 0xAE, 0x00, 0x00, 0x9B, 0xB1, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9B, 0xB0, 0x00, 0x00, 0x9B, 0xB2, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9B, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x93, 0xBB, 0x8B, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x89, 0xE3, 0x9B, 0xB4, 0x9B, 0xB9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x9B, 0xB7, 0x00, 0x00, 0x95, 0xF5, /* 0xEC-0xEF */ + 0x95, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xED, 0x92, 0x93, 0x87, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xB6, 0x8F, 0x73, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9B, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x92, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xBA, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9B, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x9B, 0xC1, 0x9B, 0xBB, 0x8A, 0x52, 0x9B, 0xBC, /* 0x14-0x17 */ + 0x9B, 0xC5, 0x9B, 0xC4, 0x9B, 0xC3, 0x9B, 0xBF, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xBE, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xC2, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0x93, /* 0x24-0x27 */ + 0x00, 0x00, 0x95, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x96, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xC9, /* 0x48-0x4B */ + 0x9B, 0xC6, 0x00, 0x00, 0x9B, 0xC8, 0x00, 0x00, /* 0x4C-0x4F */ + 0x97, 0x92, 0x00, 0x00, 0x9B, 0xC7, 0xED, 0x94, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x9B, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x90, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x9B, 0xCA, 0xED, 0x97, 0x00, 0x00, 0x8D, 0xB5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCB, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCC, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCF, 0x00, 0x00, /* 0x80-0x83 */ + 0x9B, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xCD, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x88, /* 0x88-0x8B */ + 0x9B, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x9B, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x9B, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xD0, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x9B, 0xD2, 0x00, 0x00, 0x9B, 0xD3, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xD6, /* 0xB4-0xB7 */ + 0xED, 0x98, 0xED, 0x99, 0x97, 0xE4, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9B, 0xD7, 0x9B, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9B, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8A, 0xDE, 0x9B, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xED, 0x9A, 0x00, 0x00, 0x9B, 0xDB, 0x9B, 0xDA, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDD, /* 0xD8-0xDB */ + 0x00, 0x00, 0x90, 0xEC, 0x8F, 0x42, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8F, 0x84, 0x00, 0x00, 0x91, 0x83, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x8D, 0x48, 0x8D, 0xB6, 0x8D, 0x49, /* 0xE4-0xE7 */ + 0x8B, 0x90, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xDE, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xB7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x8C, 0xC8, 0x9B, 0xDF, 0x96, 0xA4, /* 0xF0-0xF3 */ + 0x94, 0x62, 0x9B, 0xE0, 0x00, 0x00, 0x8D, 0x4A, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xAA, /* 0xF8-0xFB */ + 0x00, 0x00, 0x92, 0x46, 0x8B, 0xD0, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x73, 0x95, 0x7A, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xBF, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xE1, /* 0x08-0x0B */ + 0x8A, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9B, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x9F, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9B, 0xE3, 0x9B, 0xE2, 0x9B, 0xE5, /* 0x18-0x1B */ + 0x00, 0x00, 0x92, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x90, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x74, /* 0x28-0x2B */ + 0x00, 0x00, 0x90, 0xC8, 0x00, 0x00, 0x91, 0xD1, /* 0x2C-0x2F */ + 0x8B, 0x41, 0x00, 0x00, 0x00, 0x00, 0x92, 0xA0, /* 0x30-0x33 */ + 0x00, 0x00, 
0x00, 0x00, 0x9B, 0xE6, 0x9B, 0xE7, /* 0x34-0x37 */ + 0x8F, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x96, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9B, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xE9, /* 0x40-0x43 */ + 0x9B, 0xE8, 0x95, 0x9D, 0x00, 0x00, 0x9B, 0xF1, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x96, 0x79, 0x00, 0x00, 0x9B, 0xEB, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x9B, 0xED, 0x96, 0x8B, 0x00, 0x00, 0x9B, 0xEC, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xEE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x94, 0xA6, 0x9B, 0xEF, 0x95, 0xBC, /* 0x60-0x63 */ + 0x9B, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB1, 0x95, 0xBD, /* 0x70-0x73 */ + 0x94, 0x4E, 0x9B, 0xF2, 0x9B, 0xF3, 0x00, 0x00, /* 0x74-0x77 */ + 0x8D, 0x4B, 0x8A, 0xB2, 0x9B, 0xF4, 0x8C, 0xB6, /* 0x78-0x7B */ + 0x97, 0x63, 0x97, 0x48, 0x8A, 0xF4, 0x9B, 0xF6, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x92, 0xA1, 0x00, 0x00, 0x8D, 0x4C, /* 0x80-0x83 */ + 0x8F, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x94, 0xDD, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xB0, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x98, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x92, 0xEA, 0x95, 0xF7, 0x93, 0x58, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0x4D, 0x00, 0x00, /* 0x98-0x9B */ + 0x95, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9B, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x78, 0x8D, 0xC0, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xC9, /* 0xA8-0xAB */ + 0x00, 0x00, 0x92, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x88, 0xC1, 0x8F, 0x8E, 0x8D, 0x4E, /* 0xB4-0xB7 */ + 0x97, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9B, 0xF8, 0x9B, 0xF9, 0x94, 0x70, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x9B, 0xFA, 0x97, 0xF5, 0x98, 0x4C, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9B, 0xFC, /* 0xCC-0xCF */ + 0x9B, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x66, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x40, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x43, 0x9C, 0x44, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9C, 0x42, 0x00, 0x00, 0x95, 0x5F, /* 0xDC-0xDF */ + 0x8F, 0xB1, 0x9C, 0x46, 0x9C, 0x45, 0x9C, 0x41, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x9C, 0x47, 0x9C, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x9C, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9C, 0x4C, 0x9C, 0x4A, 0x00, 0x00, 0x9C, 0x4B, /* 0xF0-0xF3 */ + 0x9C, 0x4D, 0x00, 0x00, 0x89, 0x84, 0x92, 0xEC, /* 0xF4-0xF7 */ + 0x9C, 0x4E, 0x00, 0x00, 0x8C, 0x9A, 0x89, 0xF4, /* 0xF8-0xFB */ + 0x94, 0x55, 0x00, 0x00, 0x9C, 0x4F, 0x93, 0xF9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5F[512] = { + 0x00, 0x00, 0x95, 0xD9, 0x00, 0x00, 0x9C, 0x50, /* 0x00-0x03 */ + 0x98, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 
0x9C, 0x51, 0x95, 0xBE, 0x9C, 0x54, /* 0x08-0x0B */ + 0x98, 0x9F, 0x98, 0xAF, 0x00, 0x00, 0x8E, 0xAE, /* 0x0C-0x0F */ + 0x93, 0xF3, 0x9C, 0x55, 0x00, 0x00, 0x8B, 0x7C, /* 0x10-0x13 */ + 0x92, 0xA2, 0x88, 0xF8, 0x9C, 0x56, 0x95, 0xA4, /* 0x14-0x17 */ + 0x8D, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6F, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xED, /* 0x1C-0x1F */ + 0x00, 0x00, 0xED, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x96, 0xED, 0x8C, 0xB7, 0x8C, 0xCA, /* 0x24-0x27 */ + 0x00, 0x00, 0x9C, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x9C, 0x58, 0x00, 0x00, 0x9C, 0x5E, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8E, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xED, 0x9C, 0x92, 0xA3, 0x00, 0x00, 0x8B, 0xAD, /* 0x34-0x37 */ + 0x9C, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x95, 0x4A, 0x00, 0x00, 0x92, 0x65, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9C, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xED, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x9C, 0x5B, 0x00, 0x00, 0x8B, 0xAE, 0x00, 0x00, /* 0x48-0x4B */ + 0x9C, 0x5C, 0x00, 0x00, 0x9C, 0x5D, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x9C, 0x5F, 0x00, 0x00, 0x93, 0x96, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x60, 0x9C, 0x61, /* 0x54-0x57 */ + 0x00, 0x00, 0x9C, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x9C, 0x53, 0x9C, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x9C, 0x63, 0x8C, 0x60, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x46, 0xED, 0x9D, /* 0x64-0x67 */ + 0x00, 0x00, 0x8D, 0xCA, 0x95, 0x56, 0x92, 0xA4, /* 0x68-0x6B */ + 0x95, 0x6A, 0x9C, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8F, 0xB2, 0x89, 0x65, 0x00, 0x00, 0x9C, 0x65, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x66, /* 0x74-0x77 */ + 0x00, 0x00, 0x96, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x94, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x69, /* 0x7C-0x7F */ + + 0x89, 0x9D, 0x90, 0xAA, 0x9C, 0x68, 0x9C, 0x67, /* 0x80-0x83 */ + 0x8C, 0x61, 0x91, 0xD2, 0x00, 0x00, 0x9C, 0x6D, /* 0x84-0x87 */ + 0x9C, 0x6B, 0x00, 0x00, 0x9C, 0x6A, 0x97, 0xA5, /* 0x88-0x8B */ + 0x8C, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8F, 0x99, 0x9C, 0x6C, 0x93, 0x6B, 0x8F, 0x5D, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBE, /* 0x94-0x97 */ + 0x9C, 0x70, 0x9C, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x6E, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9C, 0x71, 0x8C, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x9C, 0x72, 0x95, 0x9C, 0x8F, 0x7A, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x9C, 0x73, 0x94, 0xF7, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xBF, /* 0xB0-0xB3 */ + 0x92, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xED, 0x9E, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x93, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9C, 0x74, 0x8B, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x53, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x95, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8A, 0xF5, 0x94, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x75, 0x8E, 0x75, /* 0xD4-0xD7 */ + 0x96, 0x59, 0x96, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x89, 0x9E, 0x9C, 0x7A, 0xED, 0x9F, 0x00, 0x00, /* 0xDC-0xDF */ + 
0x92, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9C, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xF5, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9C, 0xAB, 0x9C, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x94, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x9C, 0x78, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x76, /* 0xF8-0xFB */ + 0x00, 0x00, 0x8D, 0x9A, 0x00, 0x00, 0x9C, 0x7C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x83, 0x9C, 0x89, /* 0x0C-0x0F */ + 0x9C, 0x81, 0x00, 0x00, 0x93, 0x7B, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x9C, 0x86, 0x95, 0x7C, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9C, 0x80, 0x00, 0x00, 0x9C, 0x85, /* 0x18-0x1B */ + 0x97, 0xE5, 0x8E, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x91, 0xD3, 0x9C, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x8B, 0x7D, 0x9C, 0x88, 0x90, 0xAB, /* 0x24-0x27 */ + 0x89, 0x85, 0x9C, 0x82, 0x89, 0xF6, 0x9C, 0x87, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xAF, /* 0x2C-0x2F */ + 0x00, 0x00, 0x9C, 0x84, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x8A, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9C, 0x8C, 0x9C, 0x96, 0x9C, 0x94, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x91, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x90, 0x97, 0xF6, /* 0x48-0x4B */ + 0x00, 0x00, 0x9C, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8B, 0xB0, 0x00, 0x00, 0x8D, 0x50, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x8F, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9C, 0x99, 0x9C, 0x8B, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xED, 0xA0, 0x00, 0x00, 0x9C, 0x8F, /* 0x5C-0x5F */ + 0x9C, 0x7E, 0x00, 0x00, 0x89, 0xF8, 0x9C, 0x93, /* 0x60-0x63 */ + 0x9C, 0x95, 0x92, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x8D, 0xA6, 0x89, 0xB6, 0x9C, 0x8D, 0x9C, 0x98, /* 0x68-0x6B */ + 0x9C, 0x97, 0x8B, 0xB1, 0x00, 0x00, 0x91, 0xA7, /* 0x6C-0x6F */ + 0x8A, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x8C, 0x62, 0x00, 0x00, 0x9C, 0x8E, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9C, 0x9A, 0x00, 0x00, 0x9C, 0x9D, /* 0x80-0x83 */ + 0x9C, 0x9F, 0xED, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x8E, 0xBB, 0xED, 0xA2, 0x9C, 0xA5, /* 0x88-0x8B */ + 0x92, 0xEE, 0x9C, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xA3, 0x00, 0x00, /* 0x90-0x93 */ + 0x89, 0xF7, 0x00, 0x00, 0x9C, 0xA1, 0x9C, 0xA2, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0x9E, 0x9C, 0xA0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xE5, /* 0x9C-0x9F */ + 0x97, 0x49, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB3, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x78, 0x9C, 0xA4, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x94, 0x59, 0x88, 0xAB, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xDF, 0x9C, 0x7B, /* 0xB0-0xB3 */ + 
0x9C, 0xAA, 0x9C, 0xAE, 0x96, 0xE3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9C, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x93, 0x89, 0x9C, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8F, 0xEE, 0x9C, 0xAD, 0x93, 0xD5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x98, 0x66, 0x00, 0x00, 0x9C, 0xA9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xED, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x9C, 0xAF, 0x00, 0x00, 0x8D, 0x9B, 0x00, 0x00, /* 0xD8-0xDB */ + 0x90, 0xC9, 0x00, 0x00, 0xED, 0xA3, 0x88, 0xD2, /* 0xDC-0xDF */ + 0x9C, 0xA8, 0x9C, 0xA6, 0x00, 0x00, 0x91, 0x79, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x9C, /* 0xE4-0xE7 */ + 0x8E, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x91, 0xC4, 0x9C, 0xBB, 0xED, 0xA6, 0x91, 0x7A, /* 0xF0-0xF3 */ + 0x9C, 0xB6, 0x00, 0x00, 0x9C, 0xB3, 0x9C, 0xB4, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x8E, 0xE4, 0x9C, 0xB7, 0x9C, 0xBA, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_61[512] = { + 0x9C, 0xB5, 0x8F, 0x44, 0x00, 0x00, 0x9C, 0xB8, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xB2, 0x00, 0x00, /* 0x04-0x07 */ + 0x96, 0xFA, 0x96, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x9C, 0xBC, 0x9C, 0xBD, 0x88, 0xD3, /* 0x0C-0x0F */ + 0x00, 0x00, 0xED, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x9C, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xF0, 0x88, 0xA4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB4, /* 0x1C-0x1F */ + 0xED, 0xA5, 0x9C, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xC1, /* 0x24-0x27 */ + 0x9C, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x9C, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xED, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9C, 0xC6, 0x00, 0x00, 0x00, 0x00, 0xED, 0xA8, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x9C, 0xC4, 0x9C, 0xC7, 0x9C, 0xBF, 0x9C, 0xC3, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xC8, 0x00, 0x00, /* 0x40-0x43 */ + 0x9C, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xBE, /* 0x44-0x47 */ + 0x8E, 0x9C, 0x00, 0x00, 0x9C, 0xC2, 0x91, 0xD4, /* 0x48-0x4B */ + 0x8D, 0x51, 0x9C, 0xB0, 0x90, 0x54, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xD6, /* 0x50-0x53 */ + 0x00, 0x00, 0x95, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x9C, 0xD5, 0x00, 0x00, 0x9C, 0xD4, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x9D, 0x8A, 0xB5, /* 0x60-0x63 */ + 0x00, 0x00, 0x9C, 0xD2, 0x00, 0x00, 0x8C, 0x64, /* 0x64-0x67 */ + 0x8A, 0x53, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xCF, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xB6, 0x9C, 0xD1, /* 0x6C-0x6F */ + 0x88, 0xD4, 0x9C, 0xD3, 0x00, 0x00, 0x9C, 0xCA, /* 0x70-0x73 */ + 0x9C, 0xD0, 0x9C, 0xD7, 0x8C, 0x63, 0x9C, 0xCB, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x7C, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x97, 0x4A, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xDA, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xDE, 0x00, 0x00, /* 0x88-0x8B */ + 
0x00, 0x00, 0x00, 0x00, 0x91, 0x9E, 0x00, 0x00, /* 0x8C-0x8F */ + 0x97, 0xF7, 0x9C, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x9C, 0xDC, 0x00, 0x00, 0x9C, 0xD9, 0x00, 0x00, /* 0x94-0x97 */ + 0xED, 0xAA, 0x9C, 0xD8, 0x9C, 0xDD, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x95, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x93, 0xB2, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8C, 0x65, 0x00, 0x00, 0x9C, 0xE0, /* 0xA8-0xAB */ + 0x9C, 0xDB, 0x00, 0x00, 0x9C, 0xE1, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x9B, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xAF, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE7, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xE8, 0x8D, 0xA7, /* 0xC4-0xC7 */ + 0x9C, 0xE6, 0x9C, 0xE4, 0x9C, 0xE3, 0x9C, 0xEA, /* 0xC8-0xCB */ + 0x9C, 0xE2, 0x9C, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x89, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0xEE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x9C, 0xED, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x9C, 0xF1, 0x00, 0x00, 0x9C, 0xEF, 0x9C, 0xE5, /* 0xF4-0xF7 */ + 0x8C, 0x9C, 0x00, 0x00, 0x9C, 0xF0, 0x00, 0x00, /* 0xF8-0xFB */ + 0x9C, 0xF4, 0x9C, 0xF3, 0x9C, 0xF5, 0x9C, 0xF2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0x9C, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9C, 0xF7, 0x9C, 0xF8, 0x95, 0xE8, 0x00, 0x00, /* 0x08-0x0B */ + 0x9C, 0xFA, 0x9C, 0xF9, 0x8F, 0x5E, 0x00, 0x00, /* 0x0C-0x0F */ + 0x90, 0xAC, 0x89, 0xE4, 0x89, 0xFA, 0xED, 0xAB, /* 0x10-0x13 */ + 0x9C, 0xFB, 0x00, 0x00, 0x88, 0xBD, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xCA, 0x9C, 0xFC, /* 0x18-0x1B */ + 0x00, 0x00, 0xE6, 0xC1, 0x9D, 0x40, 0x8C, 0x81, /* 0x1C-0x1F */ + 0x00, 0x00, 0x9D, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xED, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x42, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x43, 0x8B, 0x59, /* 0x2C-0x2F */ + 0x9D, 0x44, 0x00, 0x00, 0x9D, 0x45, 0x9D, 0x46, /* 0x30-0x33 */ + 0x91, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x8C, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x96, 0xDF, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x5B, /* 0x3C-0x3F */ + 0x8F, 0x8A, 0x9D, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xEE, /* 0x44-0x47 */ + 0xE7, 0xBB, 0x94, 0xE0, 0x00, 0x00, 0x8E, 0xE8, /* 0x48-0x4B */ + 0x00, 0x00, 0x8D, 0xCB, 0x9D, 0x48, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xC5, /* 0x50-0x53 */ + 0x00, 0x00, 0x95, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x91, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x4B, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x49, 0x00, 0x00, /* 0x5C-0x5F */ + 0x9D, 
0x4C, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x4A, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x9D, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xAF, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x88, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x7D, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x94, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x9D, 0x4E, 0x00, 0x00, 0x9D, 0x51, 0x8F, 0xB3, /* 0x7C-0x7F */ + + 0x8B, 0x5A, 0x00, 0x00, 0x9D, 0x4F, 0x9D, 0x56, /* 0x80-0x83 */ + 0x8F, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x9D, 0x50, 0x94, 0x63, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x97, 0x7D, 0x9D, 0x52, 0x9D, 0x53, /* 0x90-0x93 */ + 0x9D, 0x57, 0x93, 0x8A, 0x9D, 0x54, 0x8D, 0x52, /* 0x94-0x97 */ + 0x90, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x65, /* 0x98-0x9B */ + 0x94, 0xB2, 0x00, 0x00, 0x91, 0xF0, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xAC, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xE2, /* 0xA8-0xAB */ + 0x9D, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x95, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x92, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x96, 0x95, 0x00, 0x00, 0x9D, 0x5A, /* 0xB8-0xBB */ + 0x89, 0x9F, 0x92, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x63, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x92, 0x53, 0x9D, 0x5D, 0x9D, 0x64, /* 0xC4-0xC7 */ + 0x9D, 0x5F, 0x9D, 0x66, 0x9D, 0x62, 0x00, 0x00, /* 0xC8-0xCB */ + 0x9D, 0x61, 0x94, 0x8F, 0x00, 0x00, 0x9D, 0x5B, /* 0xCC-0xCF */ + 0x89, 0xFB, 0x9D, 0x59, 0x8B, 0x91, 0x91, 0xF1, /* 0xD0-0xD3 */ + 0x9D, 0x55, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x58, /* 0xD4-0xD7 */ + 0x8D, 0x53, 0x90, 0xD9, 0x00, 0x00, 0x8F, 0xB5, /* 0xD8-0xDB */ + 0x9D, 0x60, 0x94, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x8B, 0x92, 0x8A, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x8A, 0x87, 0x90, 0x40, 0x9D, 0x68, 0x9D, 0x6D, /* 0xEC-0xEF */ + 0x00, 0x00, 0x9D, 0x69, 0x00, 0x00, 0x8C, 0x9D, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x9D, 0x6E, 0x8E, 0x41, 0x8D, 0x89, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x45, 0x9D, 0x5C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0x00, 0x00, 0x8E, 0x9D, 0x9D, 0x6B, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x77, /* 0x04-0x07 */ + 0x9D, 0x6C, 0x88, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x9D, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x92, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x8B, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB2, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x6A, /* 0x24-0x27 */ + 0x88, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xC1, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x55, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xF0, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x94, 0xD2, 0x9D, 0x70, 0x91, 0x7D, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x91, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x8E, 0x4A, 0x9D, 0x71, 0x00, 0x00, 0x9D, 0x73, /* 0x4C-0x4F */ + 0x9D, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x95, 0xDF, 0x00, 0x00, 0x92, 0xBB, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x91, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xF9, /* 0x64-0x67 */ + 0x8E, 0xCC, 0x9D, 0x80, 0x00, 0x00, 0x9D, 0x7E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x98, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x9E, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x78, 0x8F, 0xB7, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xE6, 0x94, 0x50, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x9D, 0x76, 0x00, 0x00, 0x00, 0x00, 0x91, 0x7C, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x8E, 0xF6, 0x9D, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x8F, 0xB6, 0x00, 0x00, 0x9D, 0x75, 0x9D, 0x7A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x72, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x74, 0x00, 0x00, /* 0x94-0x97 */ + 0x8C, 0x40, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x7C, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x7C, /* 0x9C-0x9F */ + 0x97, 0xA9, 0x8D, 0xCC, 0x92, 0x54, 0x9D, 0x79, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x90, 0xDA, 0x00, 0x00, 0x8D, 0x54, /* 0xA4-0xA7 */ + 0x90, 0x84, 0x89, 0x86, 0x91, 0x5B, 0x9D, 0x77, /* 0xA8-0xAB */ + 0x8B, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x66, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x92, 0xCD, 0x9D, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x7E, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x81, 0x00, 0x00, /* 0xBC-0xBF */ + 0x9D, 0x83, 0x00, 0x00, 0x00, 0x00, 0x91, 0xB5, /* 0xC0-0xC3 */ + 0x9D, 0x89, 0x00, 0x00, 0x9D, 0x84, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9D, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x60, /* 0xCC-0xCF */ + 0x92, 0xF1, 0x00, 0x00, 0x9D, 0x87, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x4B, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x67, 0x8A, 0xB7, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x88, 0xAC, 0x00, 0x00, 0x9D, 0x85, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9D, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xF6, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x89, 0x87, 0xED, 0xAD, 0x9D, 0x88, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x68, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_64[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8C, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 
0x00, 0x91, 0xB9, 0x00, 0x00, 0x9D, 0x93, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8D, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8A, 0x9D, 0x91, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9D, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x8E, 0x00, 0x00, /* 0x24-0x27 */ + 0x9D, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x94, 0xC0, 0x93, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9D, 0x8B, 0x00, 0x00, 0x9D, 0x8F, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x67, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xEF, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xDB, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x97, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x93, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xED, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x94, /* 0x64-0x67 */ + 0x00, 0x00, 0x96, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x95, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x96, 0x00, 0x00, /* 0x74-0x77 */ + 0x96, 0xCC, 0x00, 0x00, 0x90, 0xA0, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x82, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x9D, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x54, 0x9D, 0x9A, /* 0x90-0x93 */ + 0x00, 0x00, 0x9D, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x51, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xED, 0xAF, 0x93, 0xB3, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x93, 0x50, 0x9D, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x9D, 0x9C, 0x00, 0x00, 0x95, 0x8F, /* 0xA8-0xAB */ + 0x00, 0x00, 0x94, 0x64, 0x8E, 0x42, 0x00, 0x00, /* 0xAC-0xAF */ + 0x90, 0xEF, 0x00, 0x00, 0x96, 0x6F, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x8A, 0x68, 0x00, 0x00, 0x9D, 0xA3, /* 0xB8-0xBB */ + 0x9D, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x97, 0x69, 0x9D, 0xA5, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9D, 0xA1, 0x00, 0x00, 0x9D, 0xA2, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x91, 0x80, 0xED, 0xB0, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xA0, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x9D, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x9D, 0xA4, 0x00, 0x00, 0x9D, 0x9F, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x9D, 0xA9, 0x9D, 0xAA, 0x93, 0x46, 0x9D, 0xAC, /* 0xE0-0xE3 
*/ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x43, 0x9D, 0xA7, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x8B, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xAD, /* 0xEC-0xEF */ + 0x00, 0x00, 0x9D, 0xA6, 0x9D, 0xB1, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x9D, 0xB0, 0x00, 0x00, 0x9D, 0xAF, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x9D, 0xB4, 0x8F, 0xEF, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0x9D, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x9D, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x9D, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9D, 0xB6, 0x9D, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xB9, /* 0x20-0x23 */ + 0x9D, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0x98, 0x9D, 0xBA, /* 0x28-0x2B */ + 0x9D, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x78, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9D, 0xBB, 0x9D, 0xBC, 0x9D, 0xBE, 0x9D, 0xBD, /* 0x34-0x37 */ + 0x9D, 0xBF, 0x89, 0xFC, 0x00, 0x00, 0x8D, 0x55, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xFA, 0x90, 0xAD, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x8C, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x9D, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x9D, 0xC4, 0xED, 0xB1, 0x95, 0x71, /* 0x4C-0x4F */ + 0x00, 0x00, 0x8B, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x9D, 0xC3, 0x9D, 0xC2, 0x94, 0x73, /* 0x54-0x57 */ + 0x9D, 0xC5, 0x8B, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x9D, 0xC7, 0x9D, 0xC6, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xB8, 0x8E, 0x55, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xD6, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x8C, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x90, 0x94, 0x00, 0x00, 0x9D, 0xC8, 0x00, 0x00, /* 0x70-0x73 */ + 0x90, 0xAE, 0x93, 0x47, 0x00, 0x00, 0x95, 0x7E, /* 0x74-0x77 */ + 0x9D, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCA, 0x9D, 0xCB, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xB6, /* 0x84-0x87 */ + 0x9B, 0x7C, 0x90, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x95, 0x6B, 0x00, 0x00, 0x8D, 0xD6, 0x00, 0x00, /* 0x8C-0x8F */ + 0x94, 0xE3, 0x94, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x6C, /* 0x94-0x97 */ + 0x00, 0x00, 0x97, 0xBF, 0x00, 0x00, 0x9D, 0xCD, /* 0x98-0x9B */ + 0x8E, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCE, /* 0x9C-0x9F */ + 0x00, 0x00, 0x88, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x8B, 0xD2, 0x90, 0xCB, 0x00, 0x00, 0x95, 0x80, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCF, /* 0xA8-0xAB */ + 0x8E, 0x61, 0x92, 0x66, 0x00, 0x00, 0x8E, 0x7A, /* 0xAC-0xAF */ + 0x90, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xD0, /* 0xB4-0xB7 */ + 
0x00, 0x00, 0x95, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x89, 0x97, 0x8E, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9D, 0xD3, 0x00, 0x00, 0x9D, 0xD1, /* 0xC0-0xC3 */ + 0x9D, 0xD4, 0x97, 0xB7, 0x9D, 0xD2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF9, /* 0xC8-0xCB */ + 0x9D, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x91, 0xB0, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xD6, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xF8, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9D, 0xD8, 0x00, 0x00, 0x9D, 0xD7, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x9D, 0xD9, 0x9D, 0xDA, 0x8A, 0xF9, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x93, 0xFA, 0x92, 0x55, 0x8B, 0x8C, /* 0xE4-0xE7 */ + 0x8E, 0x7C, 0x91, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x8F, 0x7B, 0x88, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x9D, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xA0, 0x9D, 0xDF, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_66[512] = { + 0xED, 0xB2, 0x00, 0x00, 0x8D, 0x56, 0x9D, 0xDE, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xA9, 0x8F, 0xB8, /* 0x04-0x07 */ + 0x00, 0x00, 0xED, 0xB5, 0x9D, 0xDD, 0x00, 0x00, /* 0x08-0x0B */ + 0x8F, 0xB9, 0x00, 0x00, 0x96, 0xBE, 0x8D, 0xA8, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xD5, /* 0x10-0x13 */ + 0x90, 0xCC, 0xED, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x9D, 0xE4, 0x00, 0x00, 0xED, 0xB7, 0x90, 0xAF, /* 0x1C-0x1F */ + 0x89, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xED, 0xB8, 0x8F, 0x74, 0x00, 0x00, 0x96, 0x86, /* 0x24-0x27 */ + 0x8D, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x8F, 0xBA, 0xED, 0xB6, 0x90, 0xA5, /* 0x2C-0x2F */ + 0x00, 0x00, 0xED, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x9D, 0xE3, 0x9D, 0xE1, 0x9D, 0xE2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB4, /* 0x38-0x3B */ + 0x92, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x45, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9D, 0xE8, 0x8E, 0x9E, 0x8D, 0x57, /* 0x40-0x43 */ + 0x9D, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x9D, 0xE7, 0x00, 0x00, 0x90, 0x57, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0xE5, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4E, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBA, /* 0x54-0x57 */ + 0x00, 0x00, 0xED, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x9D, 0xEA, 0x9D, 0xE9, 0x9D, 0xEE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xEF, 0x00, 0x00, /* 0x60-0x63 */ + 0x9D, 0xEB, 0xED, 0xB9, 0x8A, 0x41, 0x9D, 0xEC, /* 0x64-0x67 */ + 0x9D, 0xED, 0x94, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x81, 0x8C, 0x69, /* 0x6C-0x6F */ + 0x9D, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBD, /* 0x70-0x73 */ + 0x90, 0xB0, 0x00, 0x00, 0x8F, 0xBB, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x71, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x8B, 0xC5, 0x00, 0x00, 0x9D, 0xF1, /* 0x80-0x83 */ + 0x9D, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x89, 0xC9, /* 0x84-0x87 */ + 0x9D, 0xF2, 0x9D, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xF3, 0x00, 0x00, /* 0x8C-0x8F */ + 
0x00, 0x00, 0x8F, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x67, 0x88, 0xC3, /* 0x94-0x97 */ + 0x9D, 0xF6, 0xED, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x9D, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xED, 0xBF, 0x00, 0x00, 0x92, 0xA8, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xEF, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x62, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xE9, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xC0, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x96, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9E, 0x41, 0x9D, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x9D, 0xFC, 0x00, 0x00, 0x9D, 0xFB, 0xED, 0xC1, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9D, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9E, 0x40, 0x00, 0x00, 0x00, 0x00, 0x93, 0xDC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9D, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x42, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x8F, 0x8C, 0x9E, 0x43, 0x00, 0x00, /* 0xD8-0xDB */ + 0x97, 0x6A, 0x94, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x9E, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x46, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9E, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x9E, 0x48, 0x00, 0x00, 0x8B, 0xC8, 0x89, 0x67, /* 0xF0-0xF3 */ + 0x8D, 0x58, 0x9E, 0x49, 0x00, 0x00, 0x9E, 0x4A, /* 0xF4-0xF7 */ + 0x8F, 0x91, 0x91, 0x82, 0xED, 0xC2, 0xED, 0x4A, /* 0xF8-0xFB */ + 0x99, 0xD6, 0x91, 0x5D, 0x91, 0x5C, 0x91, 0xD6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0x8D, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x98, 0xF0, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x8C, 0x8E, 0x97, 0x4C, 0x00, 0x00, 0x95, 0xFC, /* 0x08-0x0B */ + 0x00, 0x00, 0x95, 0x9E, 0xED, 0xC3, 0x9E, 0x4B, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x8D, 0xF1, 0x92, 0xBD, 0x9E, 0x4C, 0x98, 0x4E, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x5D, /* 0x18-0x1B */ + 0x00, 0x00, 0x92, 0xA9, 0x9E, 0x4D, 0x8A, 0xFA, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x4E, 0x9E, 0x4F, /* 0x24-0x27 */ + 0x96, 0xD8, 0x00, 0x00, 0x96, 0xA2, 0x96, 0x96, /* 0x28-0x2B */ + 0x96, 0x7B, 0x8E, 0x44, 0x9E, 0x51, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8E, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x96, 0x70, 0x00, 0x00, 0x9E, 0x53, 0x9E, 0x56, /* 0x34-0x37 */ + 0x9E, 0x55, 0x00, 0x00, 0x8A, 0xF7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x8B, 0x80, 0x00, 0x00, 0x9E, 0x52, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9E, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x57, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x90, 0x99, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x9B, 0x88, 0xC7, /* 0x4C-0x4F */ + 0x8D, 0xDE, 0x91, 0xBA, 0x00, 0x00, 0x8E, 0xDB, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF1, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9E, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x93, 0x6D, 0x00, 0x00, 0x9E, 0x58, 0x91, 0xA9, /* 0x5C-0x5F */ + 0x9E, 0x59, 0x8F, 0xF0, 0x96, 0xDB, 0x9E, 0x5B, /* 0x60-0x63 */ + 0x9E, 
0x5C, 0x97, 0x88, 0xED, 0xC5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x61, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x8D, 0x59, 0x00, 0x00, 0x94, 0x74, /* 0x6C-0x6F */ + 0x9E, 0x5E, 0x93, 0x8C, 0x9D, 0xDC, 0x9D, 0xE0, /* 0x70-0x73 */ + 0x00, 0x00, 0x8B, 0x6E, 0x00, 0x00, 0x94, 0x66, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x9E, 0x60, 0x00, 0x00, 0x8F, 0xBC, 0x94, 0xC2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x9E, 0x66, 0x00, 0x00, 0x94, 0xF8, /* 0x84-0x87 */ + 0x00, 0x00, 0x9E, 0x5D, 0x00, 0x00, 0x9E, 0x63, /* 0x88-0x8B */ + 0x9E, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x90, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x96, 0x8D, 0x00, 0x00, 0x97, 0xD1, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x87, 0x00, 0x00, /* 0x98-0x9B */ + 0x89, 0xCA, 0x8E, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x98, 0x67, 0x9E, 0x65, 0x90, 0x95, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x64, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x9E, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xCD, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x6B, /* 0xB0-0xB3 */ + 0x9E, 0x69, 0x00, 0x00, 0x89, 0xCB, 0x9E, 0x67, /* 0xB4-0xB7 */ + 0x9E, 0x6D, 0x9E, 0x73, 0x00, 0x00, 0xED, 0xC6, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xED, 0xC8, 0x91, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x95, 0xBF, 0x00, 0x00, 0x9E, 0x75, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x41, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x74, 0x94, 0x90, /* 0xCC-0xCF */ + 0x96, 0x5E, 0x8A, 0xB9, 0x00, 0x00, 0x90, 0xF5, /* 0xD0-0xD3 */ + 0x8F, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x92, 0xD1, 0x00, 0x00, 0x97, 0x4D, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9E, 0x70, 0x9E, 0x6F, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x71, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9E, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x76, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x9E, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x9E, 0x6A, 0x00, 0x00, 0x9E, 0x72, 0x9E, 0x68, /* 0xEC-0xEF */ + 0x00, 0x00, 0x92, 0x8C, 0x00, 0x00, 0x96, 0xF6, /* 0xF0-0xF3 */ + 0x8E, 0xC4, 0x8D, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xB8, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x8F, 0x8A, 0x60, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0x00, 0x00, 0xED, 0xC9, 0x92, 0xCC, 0x93, 0xC8, /* 0x00-0x03 */ + 0x89, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF0, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB2, 0x8C, 0x49, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x78, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x8D, 0x5A, 0x8A, 0x9C, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x9E, 0x7A, 0x8A, 0x94, 0x9E, 0x81, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x7D, 0x00, 0x00, /* 0x30-0x33 */ + 0x90, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x8A, 
0x6A, 0x8D, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x8A, 0x69, 0x8D, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x9E, 0x7B, 0x8C, 0x85, 0x8C, 0x6A, 0x93, 0x8D, /* 0x40-0x43 */ + 0xED, 0xCA, 0x00, 0x00, 0x9E, 0x79, 0x00, 0x00, /* 0x44-0x47 */ + 0x88, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x9E, 0x7C, 0x9E, 0x7E, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8B, 0xCB, 0x8C, 0x4B, 0xED, 0xC7, 0x8A, 0xBA, /* 0x50-0x53 */ + 0x8B, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9E, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x8D, 0xF7, 0x96, 0x91, 0x00, 0x00, 0x8E, 0x56, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x83, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x4F, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x9E, 0x8F, 0x00, 0x00, 0x89, 0xB1, 0x9E, 0x84, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0x95, 0x9E, 0x85, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x97, 0xC0, 0x00, 0x00, 0x9E, 0x8C, /* 0x80-0x83 */ + 0x00, 0x00, 0x94, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x9E, 0x94, 0x00, 0x00, 0x9E, 0x87, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xB2, /* 0x90-0x93 */ + 0x9E, 0x89, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x5B, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0x8B, /* 0x98-0x9B */ + 0x00, 0x00, 0x9E, 0x8A, 0x00, 0x00, 0x9E, 0x86, /* 0x9C-0x9F */ + 0x9E, 0x91, 0x00, 0x00, 0x8F, 0xBD, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x9A, 0xEB, 0x8C, 0xE6, /* 0xA4-0xA7 */ + 0x97, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x9E, 0x88, 0x00, 0x00, 0x92, 0xF2, /* 0xAC-0xAF */ + 0x8A, 0x42, 0x8D, 0xAB, 0x00, 0x00, 0x9E, 0x80, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x9E, 0x90, 0x8A, 0x81, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x9E, 0x8E, 0x9E, 0x92, 0x00, 0x00, /* 0xB8-0xBB */ + 0x93, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x8A, 0xFC, 0x00, 0x00, 0x9E, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xED, 0x48, 0x96, 0xC7, 0x9E, 0x97, 0x8A, 0xFB, /* 0xC8-0xCB */ + 0x00, 0x00, 0x9E, 0x9E, 0x00, 0x00, 0xED, 0xCB, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x5F, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x9E, 0x9F, 0x9E, 0xA1, 0x00, 0x00, 0x9E, 0xA5, /* 0xD4-0xD7 */ + 0x9E, 0x99, 0x00, 0x00, 0x92, 0x49, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x8F, /* 0xDC-0xDF */ + 0x9E, 0xA9, 0x9E, 0x9C, 0x00, 0x00, 0x9E, 0xA6, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xA0, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x58, 0x9E, 0xAA, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB1, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9E, 0xA8, 0x8A, 0xBB, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_69[512] = { + 0x98, 0x6F, 0x9E, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x9E, 0xA4, 0x88, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9E, 0x98, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB8, /* 0x08-0x0B */ + 0x9E, 0x9D, 0x90, 0x41, 0x92, 0xC5, 0x9E, 0x93, /* 0x0C-0x0F */ + 0x00, 
0x00, 0x00, 0x00, 0x9E, 0xA3, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x90, 0x9A, 0x9E, 0xAD, 0x8A, 0x91, /* 0x18-0x1B */ + 0x8C, 0x9F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x9E, 0xAF, 0x9E, 0x9A, 0x9E, 0xAE, /* 0x20-0x23 */ + 0x00, 0x00, 0x9E, 0xA7, 0x9E, 0x9B, 0x00, 0x00, /* 0x24-0x27 */ + 0x9E, 0xAB, 0x00, 0x00, 0x9E, 0xAC, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x9E, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x93, 0xCC, 0x00, 0x00, 0x9E, 0xA2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x9E, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x9E, 0xBB, 0x00, 0x00, 0x92, 0xD6, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x6B, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x96, /* 0x50-0x53 */ + 0x9E, 0xB6, 0x91, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9E, 0xBC, 0x91, 0x5E, 0x00, 0x00, /* 0x58-0x5B */ + 0x9E, 0xB3, 0x9E, 0xC0, 0x9E, 0xBF, 0x00, 0x00, /* 0x5C-0x5F */ + 0x93, 0xED, 0x9E, 0xBE, 0x93, 0xE8, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xED, 0xCD, 0x00, 0x00, 0x9E, 0xC2, 0x9E, 0xB5, /* 0x68-0x6B */ + 0x00, 0x00, 0x8B, 0xC6, 0x9E, 0xB8, 0x8F, 0x7C, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x80, /* 0x70-0x73 */ + 0x9E, 0xBA, 0x8B, 0xC9, 0x00, 0x00, 0x9E, 0xB2, /* 0x74-0x77 */ + 0x9E, 0xB4, 0x9E, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x98, 0x4F, 0x8A, 0x79, 0x9E, 0xB7, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9E, 0xC1, 0x8A, 0x54, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xE5, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x7C, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x9E, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x98, 0x50, 0x9E, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xED, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x90, 0x59, /* 0x98-0x9B */ + 0x9E, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9E, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD0, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x9E, 0xE1, 0x9E, 0xC3, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x9E, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xCE, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC9, 0x9E, 0xC6, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9E, 0xC7, 0x00, 0x00, 0x9E, 0xCF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xA0, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xCC, 0x8D, 0x5C, /* 0xC8-0xCB */ + 0x92, 0xC6, 0x91, 0x84, 0x9E, 0xCA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x9E, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xC8, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x97, 0x6C, 0x96, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9E, 0xCD, 0x9E, 0xD7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD0, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDF, /* 0xE4-0xE7 
*/ + 0x9E, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE5, /* 0xE8-0xEB */ + 0x00, 0x00, 0x9E, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDE, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x9E, 0xDD, 0x00, 0x00, 0x92, 0xCE, /* 0xF8-0xFB */ + 0x00, 0x00, 0x91, 0x85, 0x00, 0x00, 0x9E, 0xDB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD9, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x9E, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE6, 0x94, 0xF3, /* 0x08-0x0B */ + 0x9E, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE7, 0x9E, 0xEA, /* 0x10-0x13 */ + 0x9E, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x92, 0x94, /* 0x14-0x17 */ + 0x00, 0x00, 0x95, 0x57, 0x00, 0x00, 0x9E, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xE2, 0x8F, 0xBE, /* 0x1C-0x1F */ + 0x00, 0x00, 0x96, 0xCD, 0x9E, 0xF6, 0x9E, 0xE9, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x8C, 0xA0, 0x89, 0xA1, 0x8A, 0x7E, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xD1, 0x00, 0x00, /* 0x2C-0x2F */ + 0xED, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x8F, 0xBF, 0x9E, 0xEE, 0x00, 0x00, /* 0x34-0x37 */ + 0x9E, 0xF5, 0x8E, 0xF7, 0x8A, 0x92, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x92, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x9E, 0xEB, 0x00, 0x00, 0xED, 0xD3, 0x9E, 0xF0, /* 0x44-0x47 */ + 0x9E, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB4, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x8B, 0x6B, 0x9E, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x40, /* 0x5C-0x5F */ + 0x00, 0x00, 0x93, 0xC9, 0x9E, 0xF1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xF3, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD2, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xED, 0xED, 0xD4, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9E, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD5, 0x8A, 0x80, /* 0x7C-0x7F */ + + 0x92, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x9E, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x9E, 0xF8, 0x8C, 0xE7, 0x00, 0x00, /* 0x8C-0x8F */ + 0x9E, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x40, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x9E, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x9E, 0xF9, 0x00, 0x00, 0x9E, 0xFB, 0x9E, 0xFC, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x4B, 0x00, 0x00, /* 0xA8-0xAB */ + 0x9F, 0x47, 0x00, 0x00, 0x9E, 0x8D, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x46, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9F, 0x45, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x42, /* 0xB8-0xBB */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x9E, 0xE8, 0x9F, 0x44, 0x9F, 0x43, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x9F, 0x49, 0x00, 0x00, 0x98, 0x45, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x4C, 0x8B, 0xF9, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x48, 0x9F, 0x4A, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD6, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x94, 0xA5, 0x00, 0x00, 0x9F, 0x4D, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x51, 0x9F, 0x4E, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x97, 0x93, 0x9F, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x9E, 0xDC, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x52, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x53, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x89, 0x54, 0x00, 0x00, 0x9F, 0x55, /* 0x1C-0x1F */ + 0x8C, 0x87, 0x8E, 0x9F, 0x00, 0x00, 0x8B, 0xD3, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xA2, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x7E, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x57, /* 0x34-0x37 */ + 0x9F, 0x56, 0x9F, 0x59, 0x8B, 0x5C, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x8B, 0xD4, 0x8A, 0xBC, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x5C, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x5B, /* 0x44-0x47 */ + 0x00, 0x00, 0x9F, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x89, 0xCC, 0x00, 0x00, 0x92, 0x56, 0x00, 0x00, /* 0x4C-0x4F */ + 0x9F, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xBD, /* 0x50-0x53 */ + 0x9F, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9F, 0x5F, 0x00, 0x00, 0x9F, 0x61, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x62, /* 0x5C-0x5F */ + 0x00, 0x00, 0x9F, 0x63, 0x8E, 0x7E, 0x90, 0xB3, /* 0x60-0x63 */ + 0x8D, 0x9F, 0x00, 0x00, 0x95, 0x90, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x95, 0xE0, 0x98, 0x63, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x95, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xCE, /* 0x70-0x73 */ + 0x97, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x9F, 0x64, 0x9F, 0x65, 0x00, 0x00, 0x8E, 0x80, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x66, /* 0x7C-0x7F */ + + 0x9F, 0x67, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x69, /* 0x80-0x83 */ + 0x9F, 0x68, 0x00, 0x00, 0x96, 0x77, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x8F, 0x7D, 0x8E, 0xEA, 0x8E, 0x63, /* 0x88-0x8B */ + 0x00, 0x00, 0x9F, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 
0x00, 0x00, 0x9F, 0x6C, 0x90, 0x42, 0x00, 0x00, /* 0x94-0x97 */ + 0x9F, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x6D, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x9F, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x6F, 0x9F, 0x70, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x71, /* 0xAC-0xAF */ + 0x00, 0x00, 0x9F, 0x73, 0x9F, 0x72, 0x9F, 0x74, /* 0xB0-0xB3 */ + 0x89, 0xA3, 0x92, 0x69, 0x00, 0x00, 0x9F, 0x75, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x45, 0x8A, 0x6B, /* 0xB8-0xBB */ + 0x9F, 0x76, 0x00, 0x00, 0x00, 0x00, 0x93, 0x61, /* 0xBC-0xBF */ + 0x9A, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8B, 0x42, 0x9F, 0x77, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x78, /* 0xC8-0xCB */ + 0x00, 0x00, 0x95, 0xEA, 0x96, 0x88, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xC5, 0x9F, 0x79, /* 0xD0-0xD3 */ + 0x94, 0xE4, 0x00, 0x00, 0xED, 0xD8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x94, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x96, 0xD1, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7A, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7C, /* 0xE8-0xEB */ + 0x9F, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7E, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x7D, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_6C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x9F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x81, /* 0x0C-0x0F */ + 0x00, 0x00, 0x96, 0xAF, 0x00, 0x00, 0x9F, 0x82, /* 0x10-0x13 */ + 0x9F, 0x83, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x43, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x84, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x86, /* 0x20-0x23 */ + 0x9F, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x90, 0x85, 0x00, 0x00, 0x00, 0x00, 0x95, 0x58, /* 0x34-0x37 */ + 0x89, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xC3, 0xED, 0xD9, /* 0x3C-0x3F */ + 0x92, 0xF3, 0x8F, 0x60, 0x8B, 0x81, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xC4, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8E, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x9F, 0x88, 0x00, 0x00, 0x8A, 0xBE, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x98, 0x00, 0x00, /* 0x58-0x5B */ + 0xED, 0xDA, 0x93, 0xF0, 0x9F, 0x87, 0x8D, 0x5D, /* 0x5C-0x5F */ + 0x92, 0x72, 0x00, 0x00, 0x9F, 0x89, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x9F, 0x91, 0x00, 0x00, 0x9F, 0x8A, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xDC, /* 0x6C-0x6F */ + 0x91, 0xBF, 0x00, 0x00, 0x8B, 0x82, 0x9F, 0x92, /* 0x70-0x73 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x88, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8B, 0x44, 0x9F, 0x90, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x9F, 0x8E, 0x9F, 0x8B, 0x97, 0x80, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xDB, 0x00, 0x00, /* 0x84-0x87 */ + 0x92, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x93, 0xD7, 0x9F, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x9F, 0x94, 0x00, 0x00, 0x9F, 0x93, 0x8C, 0x42, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xAB, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x8D, 0xB9, 0x9F, 0x8D, 0x9F, 0x8F, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x96, 0x76, 0x91, 0xF2, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x97, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x9C, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x9F, 0x9D, 0x00, 0x00, 0x89, 0xCD, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x95, 0xA6, 0x96, 0xFB, 0x9F, 0x9F, 0x8E, 0xA1, /* 0xB8-0xBB */ + 0x8F, 0xC0, 0x9F, 0x98, 0x9F, 0x9E, 0x89, 0x88, /* 0xBC-0xBF */ + 0x00, 0x00, 0x8B, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9F, 0x95, 0x9F, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x90, 0xF2, 0x94, 0x91, 0x00, 0x00, /* 0xC8-0xCB */ + 0x94, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0x97, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x96, 0x40, 0x00, 0x00, 0x9F, 0x99, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x9F, 0xA2, 0xED, 0xDD, 0x9F, 0xA0, /* 0xD8-0xDB */ + 0x00, 0x00, 0x9F, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x96, 0x41, 0x94, 0x67, 0x8B, 0x83, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x93, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x92, 0x8D, 0x00, 0x00, 0x9F, 0xA3, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xA1, /* 0xEC-0xEF */ + 0x91, 0xD7, 0x9F, 0x96, 0x00, 0x00, 0x89, 0x6A, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_6D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xED, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x6D, /* 0x08-0x0B */ + 0x9F, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xAD, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xF4, /* 0x14-0x17 */ + 0x00, 0x00, 0x9F, 0xAA, 0x00, 0x00, 0x97, 0x8C, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xB4, 0x9F, 0xA4, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x92, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x89, 0x6B, 0x8D, 0x5E, 0x9F, 0xA7, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x46, 0x9F, 0xAC, /* 0x30-0x33 */ + 0x00, 0x00, 0x9F, 0xAB, 0x9F, 0xA6, 0x00, 0x00, /* 0x34-0x37 */ + 0x9F, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x88, /* 0x38-0x3B */ + 0x00, 0x00, 0x9F, 0xA8, 0x94, 0x68, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x97, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x8F, 0xF2, 0x90, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x9F, 0xB4, 0x9F, 0xB2, 0x00, 0x00, /* 0x58-0x5B */ + 0x95, 0x6C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xAF, /* 0x60-0x63 */ + 0x9F, 0xB1, 0x00, 0x00, 0x89, 0x59, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x8D, 0x5F, 0x98, 0x51, 0x00, 0x00, /* 0x68-0x6B */ + 0x8A, 0x5C, 0x00, 0x00, 0x95, 0x82, 0xED, 0xE0, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x97, 0x81, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x43, /* 0x74-0x77 */ + 0x90, 0x5A, 0x9F, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x9F, 0xB8, 0x00, 0x00, 0xED, 0xDF, /* 0x84-0x87 */ + 0x8F, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x97, 0x4F, 0x00, 0x00, 0x9F, 0xB5, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xB0, /* 0x90-0x93 */ + 0x00, 0x00, 0x9F, 0xB6, 0xED, 0xE1, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x97, 0xDC, 0x00, 0x00, 0x93, 0x93, /* 0x98-0x9B */ + 0x93, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xED, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x55, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x74, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x9F, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x9F, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x97, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x97, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x9F, 0xC6, 0x9F, 0xC0, 0x9F, 0xBD, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD2, /* 0xC8-0xCB */ + 0x9F, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE3, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8F, 0x69, 0x9F, 0xC5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x9F, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x93, 0x91, 0x9F, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xC2, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x92, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9F, 0xC9, 0x00, 0x00, 0x9F, 0xBE, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x9F, 0xC4, 0x00, 0x00, 0x9F, 0xCB, 0x88, 0xFA, /* 0xE8-0xEB */ + 0x9F, 0xC1, 0x00, 0x00, 0x9F, 0xCC, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x90, 0x5B, 0xED, 0xE5, 0x8F, 0x7E, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x95, 0xA3, 0x00, 0x00, 0x8D, 0xAC, /* 0xF4-0xF7 */ + 0xED, 0xE4, 0x9F, 0xB9, 0x9F, 0xC7, 0x93, 0x59, /* 0xF8-0xFB */ + 0xED, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x90, 0xB4, 0x00, 0x00, 0x8A, 0x89, /* 0x04-0x07 */ + 0x8D, 0xCF, 0x8F, 0xC2, 0x9F, 0xBB, 0x8F, 0x61, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x6B, /* 0x10-0x13 */ + 0x00, 0x00, 0x9F, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x9F, 0xD0, 0x8F, 0x8D, 0x8C, 0xB8, /* 0x18-0x1B */ + 0x00, 0x00, 0x9F, 0xDF, 0x00, 0x00, 0x9F, 0xD9, /* 0x1C-0x1F */ + 0x8B, 0x94, 0x93, 0x6E, 0x00, 0x00, 0x9F, 0xD4, /* 0x20-0x23 */ + 0x9F, 0xDD, 0x88, 0xAD, 0x89, 0x51, 0xED, 0xE9, /* 0x24-0x27 */ + 0x00, 
0x00, 0x89, 0xB7, 0x00, 0x00, 0x9F, 0xD6, /* 0x28-0x2B */ + 0x91, 0xAA, 0x9F, 0xCD, 0x9F, 0xCF, 0x8D, 0x60, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9F, 0xE0, 0xED, 0xE7, 0x9F, 0xDB, 0x00, 0x00, /* 0x38-0x3B */ + 0xED, 0xEA, 0x00, 0x00, 0x9F, 0xD3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xDA, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xA9, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x9F, 0xD8, 0x9F, 0xDC, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xCE, 0x00, 0x00, /* 0x54-0x57 */ + 0x8F, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x92, 0x58, /* 0x58-0x5B */ + 0xED, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD2, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x4E, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD5, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xCE, 0x93, 0x92, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD1, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xD7, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x70, 0x8E, 0xBC, /* 0x7C-0x7F */ + + 0x96, 0x9E, 0x00, 0x00, 0x9F, 0xE1, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x94, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xED, /* 0x8C-0x8F */ + 0x8C, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x80, 0x00, 0x00, /* 0x94-0x97 */ + 0x9F, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x97, 0xAD, 0x8D, 0x61, 0x00, 0x00, 0x9F, 0xF0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xEC, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x9F, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xE2, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xE8, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xEA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x6E, 0x9F, 0xE5, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x4D, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x9F, 0xE7, 0x00, 0x00, 0xED, 0xEB, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xEF, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x9F, 0xE9, 0x96, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x9F, 0xE4, 0x00, 0x00, 0x8E, 0xA0, /* 0xC8-0xCB */ + 0x9F, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8A, 0x8A, 0x00, 0x00, 0x9F, 0xE6, /* 0xD0-0xD3 */ + 0x9F, 0xEB, 0x9F, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x91, 0xEA, 0x91, 0xD8, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x9F, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x9F, 0xFA, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x93, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x42, /* 0xF4-0xF7 */ + 0x9F, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0xF6, 0x9F, 0xDE, /* 0xFC-0xFF 
*/ +}; + +static const unsigned char u2c_6F[512] = { + 0x00, 0x00, 0x8B, 0x99, 0x95, 0x59, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xBD, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x8D, 0x97, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x52, /* 0x0C-0x0F */ + 0x00, 0x00, 0x9F, 0xF2, 0x00, 0x00, 0xE0, 0x41, /* 0x10-0x13 */ + 0x89, 0x89, 0x91, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x94, 0x99, 0x00, 0x00, 0x8A, 0xBF, 0x97, 0xF8, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x9F, /* 0x28-0x2B */ + 0x92, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x9F, 0xF9, 0x9F, 0xFB, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x91, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x40, 0x9F, 0xF7, /* 0x3C-0x3F */ + 0x00, 0x00, 0x9F, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x8A, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x8C, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE0, 0x4E, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x49, /* 0x58-0x5B */ + 0x90, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x83, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x8F, 0x81, 0x00, 0x00, 0xE0, 0x52, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE0, 0x4B, 0x92, 0xAA, 0xE0, 0x48, /* 0x6C-0x6F */ + 0x92, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE0, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xE0, 0x45, 0x00, 0x00, 0xE0, 0x44, 0x00, 0x00, /* 0x78-0x7B */ + 0xE0, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE0, 0x47, 0xE0, 0x46, 0xE0, 0x4C, 0x00, 0x00, /* 0x80-0x83 */ + 0x90, 0x9F, 0x00, 0x00, 0xE0, 0x43, 0x00, 0x00, /* 0x84-0x87 */ + 0xED, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x4F, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE0, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xC0, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE0, 0x55, 0x00, 0x00, 0xE0, 0x54, /* 0xA0-0xA3 */ + 0xE0, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x59, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x93, 0x62, 0x00, 0x00, 0xE0, 0x53, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xED, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x8C, 0x83, 0x91, 0xF7, 0xE0, 0x51, 0x94, 0x5A, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x58, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 
0xE0, 0x5D, 0xE0, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE0, 0x5E, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x61, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x5A, /* 0xDC-0xDF */ + 0x8D, 0x8A, 0x94, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x9F, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x94, /* 0xE8-0xEB */ + 0xE0, 0x5C, 0x00, 0x00, 0xE0, 0x60, 0x91, 0xF3, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE0, 0x5F, 0x00, 0x00, 0xE0, 0x4A, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xED, 0xEE, 0xE8, 0x89, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x64, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x68, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0x00, 0x00, 0xE0, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xED, 0xEF, 0x00, 0x00, 0xED, 0xF0, /* 0x04-0x07 */ + 0x00, 0x00, 0xE0, 0x62, 0x00, 0x00, 0xE0, 0x63, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x67, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE0, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x95, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE0, 0x6D, 0x00, 0x00, 0xE0, 0x6A, 0xE0, 0x69, /* 0x18-0x1B */ + 0x00, 0x00, 0xE0, 0x6C, 0x93, 0xD2, 0xE0, 0x6E, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x95, 0x91, 0xEB, /* 0x24-0x27 */ + 0xED, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x90, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE0, 0x6F, 0x00, 0x00, 0xE0, 0x71, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x70, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x9F, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE0, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x93, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x73, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xCE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x94, /* 0x6C-0x6F */ + 0x8A, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x8B, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x8E, 0xDC, 0x8D, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xED, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x98, 0x46, 0x90, 0x86, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x8A, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x75, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE0, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF3, /* 0xA8-0xAB */ + 0xE0, 0x78, 0x92, 0x59, 0xE0, 0x7B, 0xE0, 0x76, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x7A, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE0, 0x79, 0x93, 0x5F, 0x88, 0xD7, 0xED, 0x46, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x97, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x7D, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x47, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE0, 0x80, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE0, 0x7E, 0x00, 0x00, 0xE0, 0x7C, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE0, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x96, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE0, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xED, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE0, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF4, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x89, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE0, 0x84, 0x95, 0xB0, 0x00, 0x00, /* 0x18-0x1B */ + 0xE0, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x96, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xC5, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x52, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x8F, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xF7, 0xED, 0xF8, /* 0x44-0x47 */ + 0x00, 0x00, 0x97, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE0, 0x8A, 0x00, 0x00, 0x90, 0xF7, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE0, 0x86, 0xE0, 0x8B, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x89, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xED, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x89, 0x00, 0x00, /* 0x60-0x63 */ + 0x94, 0x81, 0xE0, 0x85, 0xE0, 0x88, 0x8F, 0xC6, /* 0x64-0x67 */ + 0x00, 0x00, 0x94, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0x8C, 0x00, 0x00, 0x8E, 0xCF, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 
0x00, 0x90, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE0, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE0, 0x87, 0x00, 0x00, 0x8C, 0x46, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x8D, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x97, 0x6F, 0xE0, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xEA, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6E, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE0, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE0, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x94, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x95, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xED, 0xFA, 0x00, 0x00, 0x94, 0x52, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x93, 0x95, 0xE0, 0x97, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x99, 0x00, 0x00, /* 0xCC-0xCF */ + 0x97, 0xD3, 0x00, 0x00, 0xE0, 0x96, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE0, 0x98, 0x89, 0x8D, 0x00, 0x00, 0xE0, 0x93, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9A, 0x7A, /* 0xDC-0xDF */ + 0xE0, 0x9A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x91, 0x87, 0x8E, 0x57, 0xE0, 0x9C, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE0, 0x9B, 0x90, 0x43, 0x99, 0xD7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE0, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE0, 0x9F, 0x00, 0x00, 0xE0, 0x8E, /* 0xF8-0xFB */ + 0xE0, 0x9E, 0x00, 0x00, 0xED, 0xFB, 0xE0, 0xA0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x9A, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE0, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA3, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xE0, 0xA4, 0x00, 0x00, 0x92, 0xDC, 0x00, 0x00, /* 0x28-0x2B */ + 0xE0, 0xA6, 0xE0, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE0, 0xA7, 0x00, 0x00, 0xE0, 0xA8, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x8E, 0xDD, 0x95, 0x83, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xEA, 0xE0, 0xA9, /* 0x38-0x3B */ + 0xE0, 0xAA, 0x91, 0x75, 0x8E, 0xA2, 0xE0, 0xAB, /* 0x3C-0x3F */ + 0xE0, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAD, 0x95, 0xD0, /* 0x44-0x47 */ + 0x94, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAE, /* 0x48-0x4B */ + 0x94, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x92, 0xAB, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE0, 0xAF, 0x89, 0xE5, 0x00, 0x00, 0x8B, 0x8D, /* 0x58-0x5B */ + 0x00, 0x00, 0x96, 0xC4, 0x00, 0x00, 0x96, 0xB4, /* 0x5C-0x5F */ + 0x00, 0x00, 0x89, 0xB2, 0x98, 0x53, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x71, /* 0x64-0x67 */ + 0x00, 0x00, 0x95, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB5, 0x00, 0x00, /* 0x70-0x73 */ + 0xE0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x93, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x8C, 0xA1, 0xE0, 0xB1, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x8D, 0xD2, 0xE0, 0xB3, 0xE0, 0xB2, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB4, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB6, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8B, 0x5D, 0x00, 0x00, 0xE0, 0xB7, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x8C, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x94, 0xC6, /* 0xAC-0xAF */ + 0x00, 0x00, 0xED, 0xFC, 0xE0, 0xBA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x40, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xB6, 0xE0, 0xBB, /* 0xC0-0xC3 */ + 0xE0, 0xBD, 0x00, 0x00, 0xE0, 0xBC, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xBE, 0x00, 0x00, /* 0xCC-0xCF */ + 0x8C, 0xCF, 0x00, 0x00, 0xE0, 0xBF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE7, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x91, 0x5F, 0x00, 0x00, 0x8D, 0x9D, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE0, 0xC1, 0xE0, 0xC2, 0xE0, 0xC0, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x8E, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x93, 0xC6, 0x8B, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC4, /* 0xF4-0xF7 */ + 0x92, 0x4B, 0xE0, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x98, 0x54, 0x94, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC7, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC9, 0xE0, 0xC6, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0xD2, /* 0x18-0x1B */ + 0xE0, 0xC8, 0xE0, 0xCA, 0x00, 0x00, 0x97, 0xC2, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xEE, 
0x41, 0xE0, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE0, 0xCD, 0x92, 0x96, 0x94, 0x4C, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xA3, 0xE0, 0xCC, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE0, 0xCB, 0x00, 0x00, 0x97, 0x50, 0x97, 0x51, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xCF, 0x89, 0x8E, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x8D, 0x96, 0x8E, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, 0xE0, 0xD1, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD3, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x62, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xE0, 0xD5, 0x00, 0x00, 0xE0, 0xD4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE0, 0xD6, 0x00, 0x00, 0x8A, 0x6C, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE0, 0xD8, 0x00, 0x00, 0xEE, 0x43, /* 0x74-0x77 */ + 0xE0, 0xD7, 0x00, 0x00, 0xE0, 0xDA, 0xE0, 0xD9, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x8C, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x97, 0xA6, /* 0x84-0x87 */ + 0x00, 0x00, 0x8B, 0xCA, 0x00, 0x00, 0x89, 0xA4, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xE8, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8A, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xE6, 0xE0, 0xDC, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xDE, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEE, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE0, 0xDF, 0x00, 0x00, 0x89, 0xCF, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE0, 0xDB, 0xEE, 0x45, 0x8E, 0x58, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x92, 0xBF, 0xE0, 0xDD, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x48, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x46, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, /* 0xDC-0xDF */ + 0x8E, 0xEC, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x47, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE0, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x5D, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x94, 0xC7, 0xE0, 0xE1, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE0, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xEE, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE0, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB 
*/ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xBB, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x85, /* 0x00-0x03 */ + 0x00, 0x00, 0xE0, 0xE4, 0x97, 0x9D, 0xEE, 0x49, /* 0x04-0x07 */ + 0x00, 0x00, 0x97, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xF4, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE0, 0xE6, 0xEE, 0x4B, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xEE, 0x4D, 0xEE, 0x4C, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x4E, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE8, 0x97, 0xD4, /* 0x30-0x33 */ + 0x8B, 0xD5, 0x94, 0xFA, 0x94, 0x69, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEB, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE0, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE0, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE0, 0xED, 0x8C, 0xE8, 0x89, 0x6C, /* 0x58-0x5B */ + 0xE0, 0xEF, 0x00, 0x00, 0x90, 0x90, 0xE0, 0xEC, /* 0x5C-0x5F */ + 0x97, 0xDA, 0x00, 0x00, 0xEE, 0x4F, 0xE0, 0xF2, /* 0x60-0x63 */ + 0xEA, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE0, 0xF0, 0xE0, 0xF3, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE5, /* 0x6C-0x6F */ + 0xE0, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x8D, 0xBA, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF4, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF5, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x9E, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xEE, 0x50, 0x00, 0x00, 0xE0, 0xF6, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF7, 0xEE, 0x51, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE3, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x8A, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x8E, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF9, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFA, /* 0xCC-0xCF */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE0, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x89, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE1, 0x40, 0x00, 0x00, 0x95, 0x5A, 0xE1, 0x41, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xA2, 0xE1, 0x42, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE1, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x44, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE1, 0x46, 0xE1, 0x47, 0xE1, 0x45, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x72, 0xE1, 0x49, /* 0xF4-0xF7 */ + 0xE1, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_75[512] = { + 0x00, 0x00, 0xEE, 0x52, 0x00, 0x00, 0xE1, 0x4B, /* 0x00-0x03 */ + 0xE1, 0x4A, 0xE1, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0x4D, 0xE1, 0x4F, 0xE1, 0x4E, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8D, 0x99, 0x00, 0x00, 0xE1, 0x51, /* 0x10-0x13 */ + 0x00, 0x00, 0xE1, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x8A, 0xC3, 0x00, 0x00, 0x90, 0x72, 0x00, 0x00, /* 0x18-0x1B */ + 0x93, 0x5B, 0x00, 0x00, 0xE1, 0x52, 0x90, 0xB6, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x59, /* 0x20-0x23 */ + 0x00, 0x00, 0x89, 0x99, 0xE1, 0x53, 0x00, 0x00, /* 0x24-0x27 */ + 0x97, 0x70, 0x00, 0x00, 0x00, 0x00, 0x95, 0xE1, /* 0x28-0x2B */ + 0xE1, 0x54, 0x00, 0x00, 0x00, 0x00, 0xED, 0x8C, /* 0x2C-0x2F */ + 0x93, 0x63, 0x97, 0x52, 0x8D, 0x62, 0x90, 0x5C, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6A, /* 0x34-0x37 */ + 0x99, 0xB2, 0x00, 0x00, 0x92, 0xAC, 0x89, 0xE6, /* 0x38-0x3B */ + 0xE1, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE1, 0x56, 0x00, 0x00, 0xE1, 0x5B, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE1, 0x59, 0xE1, 0x58, 0x9D, 0xC0, /* 0x48-0x4B */ + 0x8A, 0x45, 0xE1, 0x57, 0x00, 0x00, 0x88, 0xD8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x94, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x94, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x97, 0xAF, 0xE1, 0x5C, 0xE1, 0x5A, /* 0x58-0x5B */ + 0x92, 0x7B, 0x90, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x94, 0xA9, 0x00, 0x00, 0x95, 0x4C, 0x00, 0x00, /* 0x60-0x63 */ + 0xE1, 0x5E, 0x97, 0xAA, 0x8C, 0x6C, 0xE1, 0x5F, /* 0x64-0x67 */ + 0x00, 0x00, 0xE1, 0x5D, 0x94, 0xD4, 0xE1, 0x60, /* 0x68-0x6B */ + 0x00, 0x00, 0xE1, 0x61, 0x00, 0x00, 0xEE, 0x53, /* 0x6C-0x6F */ + 0x88, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x8F, 0xF4, /* 0x70-0x73 */ + 0xE1, 0x66, 0x00, 0x00, 0xE1, 0x63, 0x93, 0xEB, /* 0x74-0x77 */ + 0xE1, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x45, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x69, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x64, 0xE1, 0x65, /* 0x84-0x87 */ + 0x00, 0x00, 0xE1, 0x68, 0xE1, 0x67, 0x95, 0x44, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x61, 0x91, 0x60, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8B, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE1, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6B, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xE1, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6E, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE1, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x75, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE1, 0x76, 0x94, 0xE6, 0xE1, 0x70, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE1, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE1, 0x74, 0x90, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE1, 0x75, 0xE1, 0x73, 0x8E, 0xBE, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x6F, 0xE1, 0x71, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x95, 0x61, 0x00, 0x00, 0x8F, 0xC7, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x78, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE1, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x79, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x8E, 0xA4, 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x93, 0x97, 0xE1, 0x7A, 0x00, 0x00, 0x92, 0xC9, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x7C, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x9F, 0xE1, 0x7B, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x91, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE1, 0x82, 0x00, 0x00, 0xE1, 0x84, 0xE1, 0x85, /* 0xF0-0xF3 */ + 0x92, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x83, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE1, 0x80, 0x00, 0x00, 0xE1, 0x7D, 0xE1, 0x7E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0x00, 0x00, 0xE1, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE1, 0x88, 0x00, 0x00, 0xE1, 0x86, /* 0x08-0x0B */ + 0x00, 0x00, 0xE1, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x89, /* 0x1C-0x1F */ + 0xE1, 0x8B, 0xE1, 0x8C, 0xE1, 0x8D, 0x00, 0x00, /* 0x20-0x23 */ + 0xE1, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x8A, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE1, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE1, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x91, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xC3, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x94, 0xE1, 0x92, /* 0x44-0x47 */ + 0xE1, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x8A, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xFC, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xC8, 0x00, 0x00, /* 0x54-0x57 */ + 0xE1, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE1, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE1, 0x97, 0xE1, 0x98, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x9C, /* 0x64-0x67 */ + 0xE1, 0x99, 0xE1, 0x9A, 0xE1, 0x9B, 0x00, 0x00, /* 0x68-0x6B */ + 0xE1, 0x9D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE1, 0x9E, 0x00, 0x00, 0xE1, 0x9F, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA0, 0x00, 0x00, /* 0x74-0x77 */ + 0xE1, 0xA1, 0x00, 0x00, 0x94, 0xAD, 0x93, 0x6F, /* 0x78-0x7B */ + 0xE1, 
0xA2, 0x94, 0x92, 0x95, 0x53, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE1, 0xA3, 0x00, 0x00, 0xEE, 0x54, 0xE1, 0xA4, /* 0x80-0x83 */ + 0x93, 0x49, 0x00, 0x00, 0x8A, 0x46, 0x8D, 0x63, /* 0x84-0x87 */ + 0xE1, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA6, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA7, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8E, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA8, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE1, 0xAA, 0xE1, 0xAB, 0xEE, 0x57, /* 0x98-0x9B */ + 0xEE, 0x55, 0x00, 0x00, 0xEE, 0x56, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x58, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xE7, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE1, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x89, /* 0xB4-0xB7 */ + 0xE1, 0xAE, 0xE1, 0xAF, 0xE1, 0xB0, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4D, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB1, 0x94, 0x75, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x7E, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x89, 0x6D, 0x00, 0x00, 0x89, 0x76, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE1, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB4, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB3, 0x93, 0x90, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xB7, /* 0xD8-0xDB */ + 0x9F, 0x58, 0x00, 0x00, 0xE1, 0xB5, 0x96, 0xBF, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE1, 0xB6, 0x00, 0x00, 0x8A, 0xC4, /* 0xE0-0xE3 */ + 0x94, 0xD5, 0xE1, 0xB7, 0x00, 0x00, 0xE1, 0xB8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xDA, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xD3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x92, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x91, 0x8A, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBB, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x82, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0x00, 0x00, 0x8F, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE1, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBD, /* 0x04-0x07 */ + 0xE1, 0xBC, 0x94, 0xFB, 0x00, 0x00, 0x8A, 0xC5, /* 0x08-0x0B */ + 0x8C, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC1, 0x90, 0x5E, /* 0x1C-0x1F */ + 0x96, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE1, 0xC0, 0xE1, 0xC2, 0xE1, 0xC3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE1, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC5, /* 0x34-0x37 */ + 0xE1, 0xC6, 0x00, 0x00, 0x92, 0xAD, 0x00, 0x00, /* 0x38-0x3B */ + 0x8A, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x92, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x5A, 0xE1, 0xC7, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC8, 0xE1, 0xCB, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x90, 0x87, 0x00, 0x00, 0x93, 0xC2, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xCC, 0x96, 0x72, 0x00, 0x00, /* 0x64-0x67 */ + 0xE1, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xCA, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE1, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xCE, 0xE1, 0xCD, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD1, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD0, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE1, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD4, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE1, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x95, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x8F, 0x75, 0x97, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x93, 0xB5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD6, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE1, 0xD7, 0x00, 0x00, 0xE1, 0xDB, /* 0xB8-0xBB */ + 0xE1, 0xD9, 0xE1, 0xDA, 0x00, 0x00, 0xE1, 0xD8, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE1, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDE, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDF, 0x96, 0xB5, /* 0xD8-0xDB */ + 0xE1, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xEE, 0xE1, 0xE1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x92, 0x6D, 0x00, 0x00, 0x94, 0x8A, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x8B, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x92, 0x5A, 0xE1, 0xE2, 0x8B, 0xB8, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xCE, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE1, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_78[512] = { + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xBB, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE5, 0x00, 0x00, /* 0x10-0x13 */ + 0x8C, 0xA4, 0x8D, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE1, 0xE7, 0xEE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 
0x00, 0x93, 0x75, 0x8D, 0xD4, 0x8B, 0x6D, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x43, 0x00, 0x00, /* 0x30-0x33 */ + 0x94, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x76, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x7B, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE1, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x5D, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x8F, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEE, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xB0, /* 0x68-0x6B */ + 0x8D, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xA5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xA1, 0x00, 0x00, /* 0x70-0x73 */ + 0xE1, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x5F, 0x00, 0x00, /* 0x78-0x7B */ + 0xE1, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x8C, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xEC, 0x92, 0xF4, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xE1, 0xEF, 0x8A, 0x56, 0xE1, 0xEA, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x94, 0xE8, 0x00, 0x00, 0x89, 0x4F, /* 0x90-0x93 */ + 0x00, 0x00, 0x8D, 0xEA, 0x00, 0x00, 0x98, 0x71, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xEE, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF0, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC9, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x90, 0xD7, 0xE1, 0xF2, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE1, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x6D, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE1, 0xF9, 0x00, 0x00, 0xE1, 0xF8, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x8E, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE1, 0xFA, 0xE1, 0xF5, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xFB, 0xE1, 0xF6, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x94, 0xD6, 0xE1, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE1, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x41, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x40, /* 0xE4-0xE7 */ + 0x96, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE1, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x88, 0xE9, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE2, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB 
*/ + 0x00, 0x00, 0xE2, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_79[512] = { + 0x00, 0x00, 0x8F, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x44, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x62, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE2, 0x46, 0xE2, 0x45, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE2, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE8, 0xE2, 0x49, /* 0x28-0x2B */ + 0xE2, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEE, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xA6, 0x00, 0x00, /* 0x38-0x3B */ + 0x97, 0xE7, 0x00, 0x00, 0x8E, 0xD0, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE2, 0x4A, 0x8C, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x5F, /* 0x44-0x47 */ + 0x8B, 0x46, 0x8E, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x97, 0x53, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x50, /* 0x50-0x53 */ + 0x00, 0x00, 0xE2, 0x4F, 0x91, 0x63, 0xE2, 0x4C, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x4E, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x8F, 0x6A, 0x90, 0x5F, 0xE2, 0x4D, /* 0x5C-0x5F */ + 0xE2, 0x4B, 0x00, 0x00, 0x94, 0x49, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x8F, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x95, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x8D, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0x98, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x51, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x52, /* 0x7C-0x7F */ + + 0xE2, 0x68, 0x8B, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x98, 0x5C, 0x91, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x53, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x89, 0xD0, 0x92, 0xF5, 0x95, 0x9F, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xEE, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x66, /* 0x98-0x9B */ + 0x00, 0x00, 0xE2, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x9A, 0xE2, 0x55, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x57, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x58, 0x00, 0x00, /* 0xAC-0xAF */ + 0x94, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x59, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0x5A, 0xE2, 0x5B, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x8B, 0xD7, 0x89, 0xD1, 0x93, 0xC3, /* 0xBC-0xBF */ + 0x8F, 0x47, 0x8E, 0x84, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE2, 0x5C, 0x00, 0x00, 0x8F, 0x48, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 
0x00, 0x00, 0x89, 0xC8, 0x95, 0x62, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE2, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x94, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x64, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE2, 0x60, 0x00, 0x00, 0xE2, 0x61, /* 0xE0-0xE3 */ + 0x94, 0x89, 0x00, 0x00, 0x90, 0x60, 0xE2, 0x5E, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x92, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE2, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8F, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDA, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7A[512] = { + 0x8B, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE2, 0x62, 0x00, 0x00, 0x00, 0x00, 0x92, 0xF6, /* 0x08-0x0B */ + 0x00, 0x00, 0xE2, 0x63, 0x90, 0xC5, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x96, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x95, 0x42, /* 0x14-0x17 */ + 0xE2, 0x64, 0xE2, 0x65, 0x92, 0x74, 0x00, 0x00, /* 0x18-0x1B */ + 0x97, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x67, /* 0x1C-0x1F */ + 0xE2, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xED, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE2, 0x69, 0x88, 0xEE, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6C, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6A, /* 0x38-0x3B */ + 0x89, 0xD2, 0x8C, 0x6D, 0xE2, 0x6B, 0x8D, 0x65, /* 0x3C-0x3F */ + 0x8D, 0x92, 0x00, 0x00, 0x95, 0xE4, 0xE2, 0x6D, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x73, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE2, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x90, 0xCF, 0x89, 0x6E, 0x89, 0xB8, /* 0x4C-0x4F */ + 0x88, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x6E, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE2, 0x70, 0xE2, 0x71, 0x8F, 0xF5, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE2, 0x72, 0x00, 0x00, 0x8A, 0x6E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE2, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x8C, 0x8A, 0x00, 0x00, 0x8B, 0x86, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE2, 0x75, 0x8B, 0xF3, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE2, 0x76, 0x00, 0x00, 0x90, 0xFA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x93, 0xCB, 0x00, 0x00, 0x90, 0xDE, /* 0x80-0x83 */ + 0x8D, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE2, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x82, 0x91, 0x8B, /* 0x90-0x93 */ + 0x00, 0x00, 0xE2, 0x79, 0xE2, 0x7B, 0xE2, 0x78, /* 0x94-0x97 */ + 0xE2, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x41, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 
0x00, 0x00, 0xE2, 0x7C, 0x8C, 0x45, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x87, 0x97, 0x71, /* 0xAC-0xAF */ + 0xE2, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x80, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x4D, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x83, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x96, /* 0xC0-0xC3 */ + 0xE2, 0x82, 0xE2, 0x81, 0x00, 0x00, 0xE2, 0x85, /* 0xC4-0xC7 */ + 0xE2, 0x7D, 0x00, 0x00, 0xE2, 0x86, 0x97, 0xA7, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE2, 0x87, 0x00, 0x00, 0xE2, 0x88, /* 0xCC-0xCF */ + 0x00, 0x00, 0xEE, 0x67, 0x9A, 0xF2, 0xE2, 0x8A, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE2, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE2, 0x8B, 0xE2, 0x8C, 0x00, 0x00, /* 0xD8-0xDB */ + 0x97, 0xB3, 0xE2, 0x8D, 0x00, 0x00, 0xE8, 0xED, /* 0xDC-0xDF */ + 0x8F, 0xCD, 0xE2, 0x8E, 0xE2, 0x8F, 0x8F, 0x76, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x93, 0xB6, 0xE2, 0x90, 0xEE, 0x68, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x47, 0xEE, 0x6A, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE2, 0x91, 0x00, 0x00, 0x92, 0x5B, /* 0xEC-0xEF */ + 0xE2, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xA3, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x99, 0x5E, 0x92, 0x7C, 0x8E, 0xB1, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xC6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x93, 0x00, 0x00, /* 0x00-0x03 */ + 0xE2, 0xA0, 0x00, 0x00, 0xE2, 0x96, 0x00, 0x00, /* 0x04-0x07 */ + 0x8B, 0x88, 0x00, 0x00, 0xE2, 0x95, 0xE2, 0xA2, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x94, /* 0x0C-0x0F */ + 0x00, 0x00, 0x8F, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE2, 0x98, 0xE2, 0x99, 0x00, 0x00, 0x93, 0x4A, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x9A, 0x00, 0x00, /* 0x1C-0x1F */ + 0x8A, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x90, 0x79, 0x95, 0x84, 0x00, 0x00, /* 0x24-0x27 */ + 0xE2, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x91, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x97, /* 0x30-0x33 */ + 0x00, 0x00, 0xE2, 0x9B, 0xE2, 0x9D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8D, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE2, 0xA4, 0x95, 0x4D, 0x00, 0x00, /* 0x44-0x47 */ + 0x94, 0xA4, 0x93, 0x99, 0x00, 0x00, 0x8B, 0xD8, /* 0x48-0x4B */ + 0xE2, 0xA3, 0xE2, 0xA1, 0x00, 0x00, 0x94, 0xB3, /* 0x4C-0x4F */ + 0xE2, 0x9E, 0x92, 0x7D, 0x93, 0x9B, 0x00, 0x00, /* 0x50-0x53 */ + 0x93, 0x9A, 0x00, 0x00, 0x8D, 0xF4, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE2, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE2, 0xA6, 0x00, 0x00, 0xE2, 0xA8, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE2, 0xAB, 0x00, 0x00, 0xE2, 0xAC, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE2, 0xA9, 0xE2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE2, 0xA7, 0xE2, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x9F, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xCD, 0x89, 0xD3, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB3, /* 0x88-0x8B */ + 0x00, 0x00, 0xE2, 0xB0, 0x00, 0x00, 0xE2, 0xB5, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB4, 0x00, 0x00, /* 0x90-0x93 */ + 0x94, 0x93, 0x96, 0xA5, 0x00, 0x00, 0x8E, 0x5A, /* 0x94-0x97 */ + 0xE2, 0xAE, 0xE2, 0xB7, 0xE2, 0xB2, 0x00, 0x00, /* 0x98-0x9B */ + 0xE2, 0xB1, 0xE2, 0xAD, 0xEE, 0x6B, 0xE2, 0xAF, /* 0x9C-0x9F */ + 0x00, 0x00, 0x8A, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x5C, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x90, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x94, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE2, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x94, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x90, 0xDF, 0xE2, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x94, 0xCD, 0x00, 0x00, 0xE2, 0xBD, 0x95, 0xD1, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x92, 0x7A, 0x00, 0x00, 0xE2, 0xB8, /* 0xC8-0xCB */ + 0xE2, 0xBA, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBB, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE2, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x8E, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x93, 0xC4, 0xE2, 0xC3, 0xE2, 0xC2, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE2, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x98, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC8, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCC, 0xE2, 0xC9, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_7C[512] = { + 0xE2, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC6, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE2, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE2, 0xC0, 0x99, 0xD3, 0xE2, 0xC7, /* 0x10-0x13 */ + 0xE2, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCA, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD0, /* 0x1C-0x1F */ + 0x00, 0x00, 0x8A, 0xC8, 0x00, 0x00, 0xE2, 0xCD, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCE, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCF, 0xE2, 0xD2, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD1, /* 0x34-0x37 */ + 0x94, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xE2, 0xD3, 0x97, 0xFA, 0x95, 0xEB, /* 0x3C-0x3F */ + 0xE2, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD5, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE2, 0xD4, 0x90, 0xD0, 0x00, 0x00, 0xE2, 0xD7, /* 0x4C-0x4F */ + 0xE2, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE2, 0xD6, 0x00, 0x00, 0xE2, 0xDD, 0x00, 0x00, /* 0x54-0x57 */ + 0xE2, 
0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xDB, /* 0x5C-0x5F */ + 0xE2, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE2, 0xDC, 0xE2, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE2, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC4, /* 0x70-0x73 */ + 0x00, 0x00, 0xE2, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xE0, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x8B, 0xCC, 0x8C, 0x48, 0xE2, 0xE1, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x95, 0xB2, 0x00, 0x00, 0x90, 0x88, /* 0x88-0x8B */ + 0x00, 0x00, 0x96, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0xE2, 0x00, 0x00, 0x97, 0xB1, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x94, 0x94, 0x00, 0x00, 0x91, 0x65, /* 0x94-0x97 */ + 0x94, 0x53, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6C, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xBE, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE2, 0xE7, 0xE2, 0xE5, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE2, 0xE3, 0x8A, 0x9F, 0x00, 0x00, 0x8F, 0xCF, /* 0xA4-0xA7 */ + 0xE2, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE6, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE2, 0xE4, 0xE2, 0xEC, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE2, 0xEB, 0xE2, 0xEA, 0xE2, 0xE9, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE2, 0xEE, 0x90, 0xB8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE2, 0xEF, 0x00, 0x00, 0xE2, 0xF1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE2, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xD0, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x57, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF3, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x9C, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE2, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE2, 0xF4, 0x00, 0x00, 0x95, 0xB3, 0x91, 0x8C, /* 0xDC-0xDF */ + 0x8D, 0x66, 0x00, 0x00, 0xE2, 0xF5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xC6, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF7, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE2, 0xF9, 0x00, 0x00, 0xE2, 0xFA, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8E, 0x85, 0x00, 0x00, 0xE2, 0xFB, 0x8C, 0x6E, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8A, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0x8B, 0x49, 0x00, 0x00, 0xE3, 0x40, 0x00, 0x00, /* 0x00-0x03 */ + 0x96, 0xF1, 0x8D, 0x67, 0xE2, 0xFC, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x43, 0x96, 0xE4, /* 0x08-0x0B */ + 0x00, 0x00, 0x94, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x95, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x8F, 0x83, 0xE3, 0x42, 0x00, 0x00, 0x8E, 0xD1, /* 0x14-0x17 */ + 0x8D, 0x68, 0x8E, 0x86, 0x8B, 0x89, 0x95, 0xB4, /* 0x18-0x1B */ + 0xE3, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x91, 0x66, 0x96, 0x61, 0x8D, 0xF5, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x87, /* 0x28-0x2B */ + 0x92, 
0xDB, 0x00, 0x00, 0xE3, 0x46, 0x97, 0xDD, /* 0x2C-0x2F */ + 0x8D, 0xD7, 0x00, 0x00, 0xE3, 0x47, 0x90, 0x61, /* 0x30-0x33 */ + 0x00, 0x00, 0xE3, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8F, 0xD0, 0x8D, 0xAE, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x48, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x49, 0x8C, 0xBC, /* 0x40-0x43 */ + 0x91, 0x67, 0xE3, 0x44, 0xE3, 0x4A, 0x00, 0x00, /* 0x44-0x47 */ + 0xEE, 0x6D, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x45, /* 0x48-0x4B */ + 0x8C, 0x6F, 0x00, 0x00, 0xE3, 0x4D, 0xE3, 0x51, /* 0x4C-0x4F */ + 0x8C, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x4C, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x55, /* 0x58-0x5B */ + 0xEE, 0x6E, 0x00, 0x00, 0x8D, 0x69, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x97, 0x8D, 0x88, 0xBA, 0xE3, 0x52, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x8B, 0x00, 0x00, /* 0x64-0x67 */ + 0xE3, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x50, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x93, 0x9D, 0xE3, 0x4E, 0xE3, 0x4B, /* 0x70-0x73 */ + 0x00, 0x00, 0x8A, 0x47, 0x90, 0xE2, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x8C, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE3, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE3, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x56, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x53, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x8C, 0x70, 0x91, 0xB1, 0xE3, 0x58, /* 0x98-0x9B */ + 0x91, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x65, /* 0x9C-0x9F */ + 0xEE, 0x70, 0x00, 0x00, 0xE3, 0x61, 0xE3, 0x5B, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5F, /* 0xA8-0xAB */ + 0x8E, 0xF8, 0x88, 0xDB, 0xE3, 0x5A, 0xE3, 0x62, /* 0xAC-0xAF */ + 0xE3, 0x66, 0x8D, 0x6A, 0x96, 0xD4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x92, 0xD4, 0xE3, 0x5C, 0x00, 0x00, 0xEE, 0x6F, /* 0xB4-0xB7 */ + 0xE3, 0x64, 0x00, 0x00, 0xE3, 0x59, 0x92, 0x5D, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE3, 0x5E, 0x88, 0xBB, 0x96, 0xC8, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5D, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xD9, 0x94, 0xEA, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x8D, /* 0xCC-0xCF */ + 0x00, 0x00, 0x97, 0xCE, 0x8F, 0x8F, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE3, 0x8E, 0xEE, 0x71, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE3, 0x67, 0x00, 0x00, 0x90, 0xFC, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE3, 0x63, 0xE3, 0x68, 0xE3, 0x6A, 0x00, 0x00, /* 0xDC-0xDF */ + 0x92, 0xF7, 0xE3, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE3, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x95, 0xD2, 0x8A, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x96, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDC, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x6C, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x97, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x6B, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7E[512] = { + 0x00, 0x00, 0x89, 0x8F, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x93, 
0xEA, 0xE3, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE3, 0x75, 0xE3, 0x6F, 0xE3, 0x76, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x72, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x9B, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xC8, 0xE3, 0x74, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE3, 0x71, 0xE3, 0x77, 0xE3, 0x70, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x63, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x96, 0x44, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x6B, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE3, 0x73, 0xE3, 0x80, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE3, 0x7B, 0x00, 0x00, 0xE3, 0x7E, /* 0x34-0x37 */ + 0x00, 0x00, 0xE3, 0x7C, 0xE3, 0x81, 0xE3, 0x7A, /* 0x38-0x3B */ + 0x00, 0x00, 0xE3, 0x60, 0x90, 0xD1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x94, 0xC9, 0x00, 0x00, 0xE3, 0x7D, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x78, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x40, 0x8C, 0x71, /* 0x48-0x4B */ + 0x00, 0x00, 0x8F, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x72, 0x00, 0x00, /* 0x50-0x53 */ + 0x90, 0x44, 0x91, 0x55, 0xE3, 0x84, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE3, 0x86, 0xE3, 0x87, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE3, 0x83, 0xE3, 0x85, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x79, 0xE3, 0x82, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0x8A, 0xE3, 0x89, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x96, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8C, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE3, 0x88, 0x00, 0x00, 0xE3, 0x8C, /* 0x78-0x7B */ + 0xE3, 0x8B, 0xE3, 0x8F, 0x00, 0x00, 0xE3, 0x91, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x5B, 0xE3, 0x8D, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE3, 0x92, 0xE3, 0x93, 0xED, 0x40, 0x00, 0x00, /* 0x88-0x8B */ + 0xE3, 0x94, 0x00, 0x00, 0xE3, 0x9A, 0x93, 0x5A, /* 0x8C-0x8F */ + 0xE3, 0x96, 0x00, 0x00, 0xE3, 0x95, 0xE3, 0x97, /* 0x90-0x93 */ + 0xE3, 0x98, 0x00, 0x00, 0xE3, 0x99, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x9B, /* 0x98-0x9B */ + 0xE3, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_7F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xCA, 0x00, 0x00, /* 0x34-0x37 */ + 0xE3, 
0x9D, 0x00, 0x00, 0xE3, 0x9E, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE3, 0x9F, 0x00, 0x00, 0xEE, 0x73, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE3, 0xA0, 0xE3, 0xA1, 0xE3, 0xA2, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE3, 0xA3, 0xE3, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE3, 0xA6, 0xE3, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA8, /* 0x5C-0x5F */ + 0xE3, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAC, /* 0x64-0x67 */ + 0xE3, 0xAA, 0xE3, 0xAB, 0x8D, 0xDF, 0x8C, 0x72, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0x75, 0x00, 0x00, /* 0x6C-0x6F */ + 0x94, 0xB1, 0x00, 0x00, 0x8F, 0x90, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x94, 0x6C, 0x00, 0x00, 0x94, 0xEB, /* 0x74-0x77 */ + 0xE3, 0xAD, 0x9C, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAE, 0xE3, 0xB0, /* 0x80-0x83 */ + 0x00, 0x00, 0x97, 0x85, 0xE3, 0xAF, 0xE3, 0xB2, /* 0x84-0x87 */ + 0xE3, 0xB1, 0x00, 0x00, 0x97, 0x72, 0x00, 0x00, /* 0x88-0x8B */ + 0xE3, 0xB3, 0x00, 0x00, 0x94, 0xFC, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE3, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xB7, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xE3, 0xB6, 0xE3, 0xB5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEE, 0x74, 0x00, 0x00, 0xE3, 0xB8, /* 0xA0-0xA3 */ + 0x8C, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x91, 0x41, 0x8B, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBC, 0xE3, 0xB9, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBD, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE3, 0xBE, 0xE3, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x89, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x89, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE3, 0xC0, 0xE3, 0xC1, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC2, 0x00, 0x00, /* 0xC8-0xCB */ + 0x97, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x4B, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE3, 0xC4, 0xE3, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x90, 0x89, 0xE3, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC6, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE3, 0xC7, 0x00, 0x00, 0x8A, 0xE3, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x8A, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC8, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE3, 0xC9, 0x00, 0x00, 0x96, 0x7C, /* 0xF8-0xFB */ + 0x97, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0x97, 0x73, 0x98, 0x56, 0x00, 0x00, 0x8D, 0x6C, /* 0x00-0x03 */ + 0xE3, 0xCC, 0x8E, 0xD2, 0xE3, 0xCB, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCD, /* 0x08-0x0B */ + 0x8E, 
0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x91, 0xCF, 0x00, 0x00, 0xE3, 0xCE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x8D, 0x6B, 0x00, 0x00, 0x96, 0xD5, /* 0x14-0x17 */ + 0xE3, 0xCF, 0xE3, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xE3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE3, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xE3, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xA8, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0xEB, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD5, /* 0x38-0x3B */ + 0x00, 0x00, 0x92, 0x5E, 0x00, 0x00, 0xE3, 0xD4, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD7, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD6, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD8, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0xB9, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xD9, 0x00, 0x00, 0xE3, 0xDA, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xB7, 0xE3, 0xDB, /* 0x5C-0x5F */ + 0x00, 0x00, 0x91, 0x8F, 0xE3, 0xDC, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xE3, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xFC, /* 0x6C-0x6F */ + 0xE3, 0xE0, 0x00, 0x00, 0xE3, 0xDF, 0xE3, 0xDE, /* 0x70-0x73 */ + 0x92, 0xAE, 0x00, 0x00, 0xE3, 0xE1, 0x90, 0x45, /* 0x74-0x77 */ + 0x00, 0x00, 0xE3, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE3, 0xE3, 0x98, 0x57, 0xE3, 0xE4, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE3, 0xE5, 0xE3, 0xE7, 0xE3, 0xE6, 0x94, 0xA3, /* 0x84-0x87 */ + 0x00, 0x00, 0x93, 0xF7, 0x00, 0x00, 0x98, 0x5D, /* 0x88-0x8B */ + 0x94, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE9, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD1, 0x00, 0x00, /* 0x94-0x97 */ + 0x95, 0x49, 0x00, 0x00, 0xE3, 0xEA, 0xE3, 0xE8, /* 0x98-0x9B */ + 0x00, 0x00, 0x8A, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x8C, 0xD2, 0x8E, 0x88, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x94, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x8C, 0xA8, 0x96, 0x62, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE3, 0xED, 0xE3, 0xEB, 0x00, 0x00, 0x8D, 0x6D, /* 0xAC-0xAF */ + 0x00, 0x00, 0x8D, 0x6E, 0x88, 0xE7, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x8D, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x78, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xDD, /* 0xC0-0xC3 */ + 0xE3, 0xF2, 0x00, 0x00, 0x92, 0x5F, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x94, 0x77, 0x00, 0x00, 0x91, 0xD9, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF4, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE3, 0xF0, 0xE3, 0xF3, 0xE3, 0xEE, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE3, 0xF1, 0x96, 0x45, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8C, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 
*/ + 0x88, 0xFB, 0xE3, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF6, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x93, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8B, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE4, 0x45, 0x94, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x89, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x8B, 0xBA, 0x90, 0xC6, 0x98, 0x65, /* 0x04-0x07 */ + 0x96, 0xAC, 0xE3, 0xF5, 0x90, 0xD2, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x72, 0xE3, 0xF8, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFA, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE3, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFB, /* 0x2C-0x2F */ + 0x00, 0x00, 0x92, 0x45, 0x00, 0x00, 0x94, 0x5D, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x92, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x42, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x41, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFC, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x74, 0x00, 0x00, /* 0x4C-0x4F */ + 0x95, 0x85, 0xE4, 0x44, 0x00, 0x00, 0xE4, 0x43, /* 0x50-0x53 */ + 0x8D, 0x6F, 0x98, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x54, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE4, 0x48, 0xE4, 0x49, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xEE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x47, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8D, 0x98, 0xE4, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE4, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x92, 0xB0, 0x95, 0xA0, 0x91, 0x42, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xDA, /* 0x7C-0x7F */ + + 0xE4, 0x4E, 0x00, 0x00, 0xE4, 0x4F, 0xE4, 0x4B, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE4, 0x4C, 0x00, 0x00, 0xE4, 0x4D, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x70, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x55, /* 0x90-0x93 */ + 0x00, 0x00, 0xE4, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x86, 0x00, 0x00, /* 0x98-0x9B */ + 0x96, 0x8C, 0x95, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE4, 0x50, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x53, /* 0xA0-0xA3 */ + 0xE4, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x96, 0x63, 0xE4, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE4, 0x57, 0x00, 0x00, 0x00, 0x00, 0x91, 0x56, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE4, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 
0xE4, 0x5A, 0x00, 0x00, 0xE4, 0x5E, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE4, 0x5B, 0xE4, 0x59, 0x94, 0x5E, /* 0xBC-0xBF */ + 0xE4, 0x5C, 0x00, 0x00, 0xE4, 0x5D, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE4, 0x64, 0xE4, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE4, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE4, 0x61, 0x00, 0x00, 0x91, 0x9F, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE4, 0x63, 0xE4, 0x62, 0xE4, 0x65, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x66, /* 0xDC-0xDF */ + 0xE4, 0x67, 0x00, 0x00, 0x00, 0x00, 0x90, 0x62, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x89, 0xE7, 0x00, 0x00, 0xE4, 0x68, /* 0xE4-0xE7 */ + 0x97, 0xD5, 0x00, 0x00, 0x8E, 0xA9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x8F, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8A, /* 0xF0-0xF3 */ + 0x92, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x69, 0xE4, 0x6A, /* 0xF8-0xFB */ + 0x89, 0x50, 0x00, 0x00, 0xE4, 0x6B, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0x00, 0x00, 0xE4, 0x6C, 0xE4, 0x6D, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE4, 0x6E, 0x00, 0x00, 0xE4, 0x6F, /* 0x04-0x07 */ + 0x8B, 0xBB, 0x9D, 0xA8, 0xE4, 0x70, 0x00, 0x00, /* 0x08-0x0B */ + 0x90, 0xE3, 0xE4, 0x71, 0x8E, 0xC9, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE4, 0x72, 0x00, 0x00, 0x98, 0xAE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x73, 0x95, 0xDC, /* 0x14-0x17 */ + 0x8A, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x91, 0x43, /* 0x18-0x1B */ + 0x8F, 0x77, 0x00, 0x00, 0x95, 0x91, 0x8F, 0x4D, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE4, 0x74, 0x8D, 0x71, 0xE4, 0x75, /* 0x28-0x2B */ + 0x94, 0xCA, 0x00, 0x00, 0xE4, 0x84, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x77, /* 0x30-0x33 */ + 0x00, 0x00, 0x91, 0xC7, 0x94, 0x95, 0x8C, 0xBD, /* 0x34-0x37 */ + 0xE4, 0x76, 0x91, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE4, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xF8, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE4, 0x7A, 0xE4, 0x79, 0xE4, 0x7C, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE4, 0x7B, 0x00, 0x00, 0xE4, 0x7D, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x80, 0x00, 0x00, /* 0x60-0x63 */ + 0xE4, 0x7E, 0x00, 0x00, 0x8A, 0xCD, 0x00, 0x00, /* 0x64-0x67 */ + 0xE4, 0x81, 0x00, 0x00, 0xE4, 0x82, 0xE4, 0x83, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0xAF, 0x97, 0xC7, /* 0x6C-0x6F */ + 0x00, 0x00, 0xE4, 0x85, 0x90, 0x46, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x90, 0xE4, 0x86, /* 0x74-0x77 */ + 0xE4, 0x87, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x88, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xF0, /* 0x88-0x8B */ + 
0x00, 0x00, 0xE4, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x8A, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x95, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x8E, 0xC5, 0x00, 0x00, 0xE4, 0x8C, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x8A, 0x48, 0x88, 0xB0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x8B, /* 0xA8-0xAB */ + 0xE4, 0x8E, 0x94, 0x6D, 0x00, 0x00, 0x90, 0x63, /* 0xAC-0xAF */ + 0x00, 0x00, 0x89, 0xD4, 0x00, 0x00, 0x96, 0x46, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8C, 0x7C, 0x8B, 0xDA, 0x00, 0x00, 0xE4, 0x8D, /* 0xB8-0xBB */ + 0x00, 0x00, 0x89, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x8A, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x89, 0x91, 0xE4, 0x92, 0x97, 0xE8, /* 0xD0-0xD3 */ + 0x91, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x95, 0x63, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE4, 0x9E, 0x00, 0x00, 0x89, 0xD5, /* 0xD8-0xDB */ + 0xE4, 0x9C, 0x00, 0x00, 0xE4, 0x9A, 0xE4, 0x91, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE4, 0x8F, 0x00, 0x00, 0xE4, 0x90, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x8E, 0xE1, 0x8B, 0xEA, 0x92, 0x97, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xCF, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x89, 0x70, 0x00, 0x00, 0xE4, 0x94, /* 0xF0-0xF3 */ + 0xE4, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE4, 0x99, 0xE4, 0x95, 0xE4, 0x98, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_83[512] = { + 0x00, 0x00, 0xEE, 0x76, 0x96, 0xCE, 0xE4, 0x97, /* 0x00-0x03 */ + 0x89, 0xD6, 0x8A, 0x9D, 0xE4, 0x9B, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x73, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA1, 0xE4, 0xAA, /* 0x14-0x17 */ + 0xE4, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x88, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB2, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x88, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA9, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA8, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE4, 0xA3, 0xE4, 0xA2, 0x00, 0x00, /* 0x30-0x33 */ + 0xE4, 0xA0, 0xE4, 0x9F, 0x92, 0x83, 0x00, 0x00, /* 0x34-0x37 */ + 0x91, 0xF9, 0xE4, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE4, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE4, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x91, 0x90, 0x8C, 0x74, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x60, /* 0x4C-0x4F */ + 0xE4, 0xA6, 0x00, 0x00, 0x8D, 0x72, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x91, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x77, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB8, /* 0x70-0x73 */ + 0x00, 0x00, 0xE4, 0xB9, 0x00, 0x00, 0x89, 0xD7, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xAC, /* 0x78-0x7B */ + 0xE4, 0xB6, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x78, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE4, 0xAC, 0x00, 0x00, 0xE4, 0xB4, /* 0x84-0x87 */ + 0x00, 0x00, 0xE4, 0xBB, 0xE4, 0xB5, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB3, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x96, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB1, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAD, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0xCE, 0xE4, 0xAF, /* 0x9C-0x9F */ + 0xE4, 0xBA, 0x00, 0x00, 0xE4, 0xB0, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE4, 0xBC, 0x00, 0x00, 0xE4, 0xAE, 0x94, 0x9C, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x97, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE4, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE4, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE4, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x90, 0x9B, 0x00, 0x00, 0xEE, 0x79, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x65, 0x00, 0x00, /* 0xC8-0xCB */ + 0x8B, 0xDB, 0x00, 0x00, 0xE4, 0xC0, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xD9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD2, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE4, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x8D, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x93, 0x70, /* 0xDC-0xDF */ + 0xE4, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x95, 0xEC, 0x00, 0x00, 0xE4, 0xBF, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xD8, /* 0xEC-0xEF */ + 0x8C, 0xD4, 0x95, 0x48, 0xE4, 0xC9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE4, 0xBD, 0x00, 0x00, 0xEE, 0x7A, 0xE4, 0xC6, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD0, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE4, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC2, /* 0x00-0x03 */ + 0x93, 0xB8, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC7, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC4, /* 0x08-0x0B */ + 0x96, 0x47, 0xE4, 0xCA, 0x88, 0xDE, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xBE, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE4, 0xCC, 0x00, 0x00, 0xE4, 0xCB, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x94, 0x8B, 0xE4, 0xD2, 0x00, 0x00, /* 0x28-0x2B */ + 0xE4, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8A, 0x9E, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xE4, 
0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xE4, 0xD3, 0x97, 0x8E, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xDC, 0x00, 0x00, /* 0x44-0x47 */ + 0xEE, 0x7B, 0x97, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xA8, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x98, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x8B, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x95, 0x92, 0xE4, 0xE2, 0x93, 0x9F, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xAF, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE4, 0xDB, 0x00, 0x00, 0xE4, 0xD7, /* 0x68-0x6B */ + 0x91, 0x92, 0xE4, 0xD1, 0xE4, 0xD9, 0xE4, 0xDE, /* 0x6C-0x6F */ + 0x00, 0x00, 0x94, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x88, 0xA8, 0x00, 0x00, 0xE4, 0xD6, /* 0x74-0x77 */ + 0x00, 0x00, 0xE4, 0xDF, 0x95, 0x98, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xDA, 0x00, 0x00, /* 0x80-0x83 */ + 0xE4, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0xD3, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8F, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x8E, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x96, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x95, 0x66, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE5, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE4, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x97, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xEE, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8F, 0xF6, 0xE4, 0xE3, 0x00, 0x00, 0xE4, 0xE8, /* 0xB8-0xBB */ + 0x91, 0x93, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE4, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE4, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x92, 0x7E, 0x00, 0x00, 0xE4, 0xEC, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x97, 0x75, 0xE4, 0xE1, 0x8A, 0x57, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE4, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xE4, 0xEA, 0x96, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE4, 0xE6, 0xE4, 0xE9, 0x00, 0x00, /* 0xD8-0xDB */ + 0xED, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x96, 0x48, 0x00, 0x00, 0x98, 0x40, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE4, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE4, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0x8E, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCF, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x95, 0xCC, 0x00, 0x00, 0x96, 0xA0, /* 0x10-0x13 */ + 0xE4, 0xF7, 0xE4, 0xF6, 0x00, 0x00, 0xE4, 0xF2, /* 0x14-0x17 */ + 0xE4, 0xF3, 0x00, 0x00, 0x89, 0x55, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE4, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xD3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE4, 0xF4, 0x88, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x91, 0xA0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x95, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE4, 0xF9, 0xE5, 0x40, 0x00, 0x00, 0x94, 0xD7, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE4, 0xFC, 0x8F, 0xD4, 0x8E, 0xC7, 0xE5, 0x42, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xBC, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x7D, /* 0x50-0x53 */ + 0x00, 0x00, 0xE5, 0x43, 0x00, 0x00, 0x95, 0x99, /* 0x54-0x57 */ + 0xE4, 0xFB, 0xEE, 0x7E, 0xE4, 0xD4, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x98, 0x6E, 0x93, 0xA0, 0x95, 0x93, 0xEE, 0x80, /* 0x68-0x6B */ + 0x00, 0x00, 0xE5, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x50, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x51, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE5, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x94, 0x96, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x4E, /* 0x84-0x87 */ + 0xE5, 0x46, 0x00, 0x00, 0xE5, 0x48, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE5, 0x52, 0xE5, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE5, 0x4B, 0x00, 0x00, 0x00, 0x00, 0x89, 0x92, /* 0x94-0x97 */ + 0x00, 0x00, 0x93, 0xE3, 0x00, 0x00, 0xE5, 0x4C, /* 0x98-0x9B */ + 0xE5, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE5, 0x45, 0x00, 0x00, 0x91, 0x45, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE5, 0x49, 0x8E, 0x46, 0x90, 0x64, 0x8C, 0x4F, /* 0xA8-0xAB */ + 0x96, 0xF2, 0x00, 0x00, 0x96, 0xF7, 0x8F, 0x92, /* 0xAC-0xAF */ + 0xEE, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE5, 0x56, 0xE5, 0x54, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x98, 0x6D, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE5, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x97, 0x95, 0x00, 0x00, 0xE5, 0x55, /* 0xCC-0xCF */ + 0xE5, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE5, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE5, 0x5B, 0xE5, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 
*/ + 0x93, 0xA1, 0xE5, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x94, 0xCB, 0xE5, 0x4D, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x93, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE5, 0x5C, 0xE5, 0x61, 0x91, 0x94, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x60, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x41, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x62, 0x91, 0x68, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x5D, 0xE5, 0x5F, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x5E, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x9F, 0x50, 0x9F, 0x41, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x64, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x63, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x97, 0x96, 0x00, 0x00, 0xE1, 0xBA, /* 0x2C-0x2F */ + 0xE5, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x66, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xE5, 0x67, 0x8C, 0xD5, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8B, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE5, 0x69, 0x99, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x95, 0x00, 0x00, /* 0x58-0x5B */ + 0x97, 0xB8, 0x00, 0x00, 0x8B, 0xF1, 0xE5, 0x6A, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6B, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x8E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xE5, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x93, 0xF8, 0x00, 0x00, 0x88, 0xB8, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xE1, 0xE5, 0x71, /* 0x88-0x8B */ + 0xE5, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6D, /* 0x90-0x93 */ + 0x00, 0x00, 0x8E, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x6E, /* 0xA0-0xA3 */ + 0x94, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE5, 0x6F, 0xE5, 0x70, 0xE5, 0x7A, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x74, /* 0xAC-0xAF */ + 0xE5, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x73, 0x00, 0x00, /* 0xB4-0xB7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE5, 0x75, 0x00, 0x00, 0xE5, 0x76, 0x8E, 0xD6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE5, 0x78, 0x00, 0x00, 0x92, 0x60, /* 0xC8-0xCB */ + 0x00, 0x00, 0x8C, 0x75, 0x8A, 0x61, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE5, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x8A, 0x5E, 0x00, 0x00, 0xE5, 0x81, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x7C, 0xE5, 0x80, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x94, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE5, 0x7D, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE5, 0x7E, 0x95, 0x67, 0x94, 0xD8, 0xE5, 0x82, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x91, 0xFB, 0xE5, 0x8C, 0x00, 0x00, 0xE5, 0x88, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xE9, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xE5, 0x86, 0x00, 0x00, 0x96, 0x49, 0xE5, 0x87, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x84, 0x00, 0x00, /* 0x04-0x07 */ + 0xE5, 0x85, 0xE5, 0x8A, 0xE5, 0x8D, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE5, 0x89, 0xE5, 0x83, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x92, 0x77, 0x00, 0x00, 0xE5, 0x94, 0x00, 0x00, /* 0x18-0x1B */ + 0x96, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE5, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE5, 0x93, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE5, 0x8E, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x90, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x91, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x8F, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x90, 0xE4, 0x00, 0x00, 0x98, 0x58, /* 0x48-0x4B */ + 0xE5, 0x98, 0x00, 0x00, 0xE5, 0x99, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x9F, /* 0x50-0x53 */ + 0x00, 0x00, 0x90, 0x49, 0x00, 0x00, 0xE5, 0x9B, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0x9E, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x96, /* 0x5C-0x5F */ + 0xE5, 0x95, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA0, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xDA, 0x00, 0x00, /* 0x64-0x67 */ + 0xE5, 0x9C, 0x00, 0x00, 0xE5, 0xA1, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x9D, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE5, 0x9A, 0x00, 0x00, 0x92, 0xB1, 0x00, 0x00, /* 0x74-0x77 */ + 0xE5, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0x88, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA5, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 
0x00, 0x00, 0x97, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA4, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA3, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAC, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA6, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0x86, 0xE5, 0xB1, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE5, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE5, 0xAD, 0x00, 0x00, 0xE5, 0xB0, 0xE5, 0xAF, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA7, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xE5, 0xAA, 0x00, 0x00, 0xE5, 0xBB, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE5, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB2, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB8, 0xE5, 0xB9, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x8A, 0x49, 0x00, 0x00, 0x8B, 0x61, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB7, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE5, 0xA2, 0x00, 0x00, 0xEE, 0x85, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0xB6, 0xE5, 0xBA, 0xE5, 0xB5, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE5, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE5, 0xBE, 0xE5, 0xBD, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE5, 0xC0, 0xE5, 0xBF, 0xE5, 0x79, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC4, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE5, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xE5, 0xC3, 0x00, 0x00, 0xE5, 0xC5, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x8C, 0x8C, 0x00, 0x00, 0xE5, 0xC7, 0x00, 0x00, /* 0x40-0x43 */ + 0xE5, 0xC6, 0x00, 0x00, 0x8F, 0x4F, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x8D, 0x73, 0x9F, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC8, 0x8F, 0x70, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x58, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0xC9, 0x00, 0x00, 0x89, 0x71, /* 0x58-0x5B */ + 0x00, 0x00, 0x8F, 0xD5, 0xE5, 0xCA, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 
0x00, 0x8D, 0x74, 0xE5, 0xCB, 0x88, 0xDF, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x95, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCC, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x90, 0x8A, 0x00, 0x00, 0xE5, 0xD3, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE5, 0xD0, 0x00, 0x00, 0x92, 0x8F, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE5, 0xD1, 0xE5, 0xCE, 0x8B, 0xDC, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE5, 0xCD, 0xE5, 0xD4, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x8C, 0x55, 0x00, 0x00, 0x00, 0x00, 0x91, 0xDC, /* 0x88-0x8B */ + 0x00, 0x00, 0xE5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD6, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xB3, 0xE5, 0xD5, /* 0x94-0x97 */ + 0x00, 0x00, 0xE5, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCF, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD9, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE5, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xED, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD7, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE5, 0xDC, 0xE5, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x8C, 0xD1, 0xE5, 0xD2, 0x00, 0x00, 0x88, 0xBF, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDD, /* 0xBC-0xBF */ + 0x00, 0x00, 0x8D, 0xD9, 0x97, 0xF4, 0xE5, 0xDF, /* 0xC0-0xC3 */ + 0xE5, 0xE0, 0x91, 0x95, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xA0, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE5, 0xE1, 0x97, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE5, 0xE2, 0xE5, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x95, 0xE2, 0xE5, 0xE4, 0x00, 0x00, 0x8D, 0xBE, /* 0xDC-0xDF */ + 0x00, 0x00, 0x97, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE5, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xEA, 0x8F, 0xD6, /* 0xF0-0xF3 */ + 0xE5, 0xE8, 0xEE, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x97, 0x87, 0xE5, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE5, 0xE7, 0x90, 0xBB, 0x90, 0x9E, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE6, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x95, 0xA1, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xED, 0x00, 0x00, /* 0x08-0x0B */ + 0xE5, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x8A, 0x8C, 0x00, 0x00, 0x96, 0x4A, 0xE5, 0xEE, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xED, 0x41, 0xE5, 0xFA, 0xE5, 0xF0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE5, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF2, 0xE5, 0xF3, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 
0x00, 0x00, 0x00, 0xE5, 0xF7, 0x00, 0x00, /* 0x34-0x37 */ + 0xE5, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE5, 0xF4, 0x00, 0x00, 0xE5, 0xEF, /* 0x40-0x43 */ + 0xE5, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE5, 0xF9, 0xE8, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xA6, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xFC, 0x8B, 0xDD, /* 0x5C-0x5F */ + 0xE5, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE6, 0x41, 0x00, 0x00, 0xE6, 0x40, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x43, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE6, 0x42, 0x00, 0x00, 0xE6, 0x44, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x8F, 0x50, 0x00, 0x00, /* 0x70-0x73 */ + 0xE6, 0x45, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x46, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x47, 0x90, 0xBC, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x97, 0x76, 0x00, 0x00, 0xE6, 0x48, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xA2, 0x94, 0x65, /* 0x84-0x87 */ + 0xE6, 0x49, 0x00, 0x00, 0xE6, 0x4A, 0x8C, 0xA9, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0x4B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x4B, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8B, 0x94, 0x60, /* 0x94-0x97 */ + 0xE6, 0x4C, 0x00, 0x00, 0x8A, 0x6F, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE6, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x4F, 0x97, 0x97, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE6, 0x4E, 0x90, 0x65, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE6, 0x50, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x51, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x52, 0x8A, 0xCF, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x53, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE6, 0x54, 0x00, 0x00, 0xE6, 0x55, /* 0xBC-0xBF */ + 0xE6, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x8A, 0x70, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x57, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE6, 0x58, 0xE6, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0xF0, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x47, 0xE6, 0x5A, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE6, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE6, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_8A[512] = { + 0x8C, 0xBE, 0x00, 0x00, 0x92, 0xF9, 0xE6, 0x5D, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x8C, 0x76, 0x00, 0x00, 0x90, 0x75, 0x00, 0x00, /* 0x08-0x0B */ + 0xE6, 
0x60, 0x00, 0x00, 0x93, 0xA2, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE6, 0x5F, 0x00, 0x00, 0xEE, 0x87, 0x8C, 0x50, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x5E, 0x91, 0xF5, /* 0x14-0x17 */ + 0x8B, 0x4C, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x61, /* 0x18-0x1B */ + 0x00, 0x00, 0xE6, 0x62, 0x00, 0x00, 0x8F, 0xD7, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x8D, /* 0x20-0x23 */ + 0x00, 0x00, 0xE6, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x4B, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x90, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8B, 0x96, 0x00, 0x00, 0x96, 0xF3, /* 0x30-0x33 */ + 0x91, 0x69, 0x00, 0x00, 0xE6, 0x64, 0xEE, 0x88, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x66, 0x92, 0x90, /* 0x38-0x3B */ + 0x8F, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE6, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x68, 0x00, 0x00, /* 0x44-0x47 */ + 0xE6, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x8D, 0xBC, 0x91, 0xC0, 0xE6, 0x67, 0x00, 0x00, /* 0x50-0x53 */ + 0x8F, 0xD9, 0x95, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x66, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8C, 0x00, 0x00, /* 0x5C-0x5F */ + 0x89, 0x72, 0x00, 0x00, 0xE6, 0x6D, 0x8C, 0x77, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x8E, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x8E, 0x8D, 0x00, 0x00, 0x98, 0x6C, /* 0x68-0x6B */ + 0xE6, 0x6C, 0xE6, 0x6B, 0x91, 0x46, 0x00, 0x00, /* 0x6C-0x6F */ + 0x8B, 0x6C, 0x98, 0x62, 0x8A, 0x59, 0x8F, 0xDA, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xEE, 0x89, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE6, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x6F, 0x00, 0x00, /* 0x80-0x83 */ + 0xE6, 0x70, 0xE6, 0x6E, 0x00, 0x00, 0x8C, 0xD6, /* 0x84-0x87 */ + 0x00, 0x00, 0x97, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x8E, 0x8F, 0x94, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE6, 0x73, 0x00, 0x00, 0x90, 0xBE, /* 0x90-0x93 */ + 0x00, 0x00, 0x92, 0x61, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x97, 0x55, 0x00, 0x00, 0xE6, 0x76, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xEA, 0x00, 0x00, /* 0x9C-0x9F */ + 0x90, 0xBD, 0xE6, 0x72, 0x00, 0x00, 0xE6, 0x77, /* 0xA0-0xA3 */ + 0x8C, 0xEB, 0xE6, 0x74, 0xE6, 0x75, 0xEE, 0x8A, /* 0xA4-0xA7 */ + 0xE6, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x90, 0xE0, 0x93, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x92, 0x4E, 0x00, 0x00, 0x89, 0xDB, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x94, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x8B, 0x62, 0x00, 0x00, 0xEE, 0x8B, 0x92, 0xB2, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7A, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE6, 0x78, 0x00, 0x00, 0x00, 0x00, 0x92, 0x6B, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xBF, /* 0xC8-0xCB */ + 0x8A, 0xD0, 0xE6, 0x79, 0x00, 0x00, 0x90, 0x7A, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xC8, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x5F, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7B, 0xE6, 0x87, /* 0xD8-0xDB */ + 0x92, 0xB3, 0x00, 0x00, 0xE6, 0x86, 0xEE, 0x8C, /* 0xDC-0xDF */ + 0xE6, 0x83, 0xE6, 0x8B, 0xE6, 0x84, 0x00, 0x00, /* 0xE0-0xE3 
*/ + 0xE6, 0x80, 0x00, 0x00, 0x92, 0xFA, 0xE6, 0x7E, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x7C, /* 0xE8-0xEB */ + 0x00, 0x00, 0x97, 0x40, 0x8E, 0x90, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE6, 0x81, 0x00, 0x00, 0xE6, 0x7D, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x8E, 0xE6, 0x85, /* 0xF4-0xF7 */ + 0x8F, 0x94, 0x00, 0x00, 0x8C, 0xBF, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xF8, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0x96, 0x64, 0x89, 0x79, 0x88, 0xE0, 0x00, 0x00, /* 0x00-0x03 */ + 0x93, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x89, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE6, 0x88, 0x00, 0x00, 0x93, 0xE4, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE6, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE6, 0x82, 0x00, 0x00, 0xE6, 0x8C, 0xE6, 0x8E, /* 0x14-0x17 */ + 0x00, 0x00, 0x8C, 0xAA, 0xE6, 0x8A, 0x8D, 0x75, /* 0x18-0x1B */ + 0x00, 0x00, 0x8E, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE6, 0x8F, 0x97, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x92, 0x00, 0x00, /* 0x24-0x27 */ + 0xE6, 0x95, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x93, /* 0x28-0x2B */ + 0x95, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x90, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8B, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x94, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE6, 0x96, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE6, 0x9A, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE6, 0x97, 0x00, 0x00, 0xE6, 0x99, 0xE6, 0x98, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x8F, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9B, 0x00, 0x00, /* 0x54-0x57 */ + 0x8E, 0xAF, 0x00, 0x00, 0xE6, 0x9D, 0xE6, 0x9C, /* 0x58-0x5B */ + 0x95, 0x88, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9F, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x78, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x9E, /* 0x68-0x6B */ + 0xE6, 0xA0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA1, /* 0x6C-0x6F */ + 0x8B, 0x63, 0xE3, 0xBF, 0x8F, 0xF7, 0x00, 0x00, /* 0x70-0x73 */ + 0xE6, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xEC, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE6, 0xA3, 0x00, 0x00, 0xEE, 0x90, /* 0x7C-0x7F */ + + 0xE6, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x5D, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x9D, 0xCC, 0x00, 0x00, /* 0x88-0x8B */ + 0xE6, 0xA5, 0x00, 0x00, 0xE6, 0xA6, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8F, 0x51, 0x00, 0x00, 0xE6, 0xA7, 0xE6, 0xA8, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA9, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE6, 0xAA, 0xE6, 0xAB, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_8C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x4A, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAC, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAE, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE6, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xA4, 0x00, 0x00, /* 0x44-0x47 */ + 0xE6, 0xAF, 0x00, 0x00, 0x96, 0x4C, 0x00, 0x00, /* 0x48-0x4B */ + 0xE6, 0xB0, 0x00, 0x00, 0xE6, 0xB1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE6, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE6, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0xD8, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x8F, 0xDB, 0xE6, 0xB4, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0x8B, 0x98, 0xAC, /* 0x68-0x6B */ + 0xE6, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xE6, 0xB6, 0x95, 0x5E, 0xE6, 0xB7, 0x00, 0x00, /* 0x78-0x7B */ + 0xE6, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB8, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE6, 0xB9, 0xE6, 0xBB, 0x00, 0x00, /* 0x88-0x8B */ + 0x96, 0x65, 0xE6, 0xBC, 0xE6, 0xBD, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE6, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xE6, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x8A, 0x4C, 0x92, 0xE5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x95, 0x89, 0x8D, 0xE0, 0x8D, 0x76, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x6E, /* 0xA4-0xA7 */ + 0x89, 0xDD, 0x94, 0xCC, 0xE6, 0xC3, 0x8A, 0xD1, /* 0xA8-0xAB */ + 0x90, 0xD3, 0xE6, 0xC2, 0xE6, 0xC7, 0x92, 0x99, /* 0xAC-0xAF */ + 0x96, 0xE1, 0x00, 0x00, 0xE6, 0xC5, 0xE6, 0xC6, /* 0xB0-0xB3 */ + 0x8B, 0x4D, 0x00, 0x00, 0xE6, 0xC8, 0x94, 0x83, /* 0xB4-0xB7 */ + 0x91, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x94, 0xEF, /* 0xB8-0xBB */ + 0x93, 0x5C, 0xE6, 0xC4, 0x00, 0x00, 0x96, 0x66, /* 0xBC-0xBF */ + 0x89, 0xEA, 0xE6, 0xCA, 0x98, 0x47, 0x92, 0xC0, /* 0xC0-0xC3 */ + 0x98, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8E, 0x91, /* 0xC4-0xC7 */ + 0xE6, 0xC9, 0x00, 0x00, 0x91, 0xAF, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE6, 0xDA, 0x91, 0x47, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x93, 0xF6, 0x00, 0x00, 0x95, 0x6F, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xCD, 0x8E, 0x5E, /* 0xD8-0xDB */ + 0x8E, 0x92, 0x00, 0x00, 0x8F, 0xDC, 0x00, 0x00, /* 0xDC-0xDF */ + 0x94, 0x85, 0x00, 0x00, 0x8C, 0xAB, 0xE6, 0xCC, /* 0xE0-0xE3 */ + 0xE6, 0xCB, 0x00, 0x00, 0x95, 0x8A, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x93, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 
0xEE, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEE, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xCF, 0xE6, 0xD0, /* 0xF8-0xFB */ + 0x8D, 0x77, 0xE6, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE6, 0xD1, 0xE6, 0xD2, 0x00, 0x00, 0xE6, 0xD4, /* 0x04-0x07 */ + 0x91, 0xA1, 0x00, 0x00, 0xE6, 0xD3, 0x8A, 0xE4, /* 0x08-0x0B */ + 0x00, 0x00, 0xE6, 0xD6, 0x00, 0x00, 0xE6, 0xD5, /* 0x0C-0x0F */ + 0xE6, 0xD7, 0x00, 0x00, 0xEE, 0x93, 0xE6, 0xD9, /* 0x10-0x13 */ + 0xE6, 0xDB, 0x00, 0x00, 0xE6, 0xDC, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x90, 0xD4, 0x00, 0x00, 0x8E, 0xCD, 0xE6, 0xDD, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x71, /* 0x68-0x6B */ + 0x00, 0x00, 0xE6, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x91, 0x96, 0xE6, 0xDF, 0x00, 0x00, 0xE6, 0xE0, /* 0x70-0x73 */ + 0x95, 0x8B, 0x00, 0x00, 0xEE, 0x94, 0x8B, 0x4E, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE6, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x92, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x7A, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xE6, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xEF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x90, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xAB, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE5, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE4, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE3, 0x00, 0x00, /* 0xC0-0xC3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xEB, /* 0xC8-0xCB */ + 0xE6, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE6, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, 0xE6, 0xEA, /* 0xD8-0xDB */ + 0x00, 0x00, 0x8B, 0x97, 0x00, 0x00, 0xE6, 0xEE, /* 0xDC-0xDF */ + 0x00, 0x00, 0x90, 0xD5, 0x00, 0x00, 0xE6, 0xEF, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x8C, 0xD7, 0x00, 0x00, 0xE6, 0xEC, 0xE6, 0xED, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x48, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xB5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x91, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE6, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE6, 0xF1, 0xE6, 0xF2, 0x97, 0x78, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xA5, /* 0x0C-0x0F */ + 0xE6, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE6, 0xF4, 0xE6, 0xF5, 0xE6, 0xF7, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x48, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE6, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE6, 0xFB, 0xE6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xF8, 0x00, 0x00, /* 0x40-0x43 */ + 0x92, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x40, /* 0x44-0x47 */ + 0xE7, 0x44, 0xE7, 0x41, 0xE6, 0xFC, 0x00, 0x00, /* 0x48-0x4B */ + 0xE7, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE7, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE7, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE7, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0xD6, /* 0x5C-0x5F */ + 0xE7, 0x47, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x49, /* 0x60-0x63 */ + 0xE7, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4C, 0x00, 0x00, /* 0x70-0x73 */ + 0x8F, 0x52, 0x00, 0x00, 0xE7, 0x4B, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE7, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE7, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE7, 0x51, 0xE7, 0x50, 0x00, 0x00, 0xE7, 0x4F, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x53, 0xE7, 0x52, /* 0x88-0x8B */ + 0x00, 0x00, 0x96, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE7, 0x55, 0x00, 0x00, 0xE7, 0x54, /* 0x90-0x93 */ + 0xE7, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 
0x00, 0x00, 0xE7, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE7, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x58, 0x90, 0x67, /* 0xA8-0xAB */ + 0xE7, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xEB, /* 0xAC-0xAF */ + 0xE7, 0x5B, 0xE7, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x5E, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE7, 0x5F, 0xE7, 0x5C, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE7, 0x60, 0x00, 0x00, 0x8E, 0xD4, 0xE7, 0x61, /* 0xC8-0xCB */ + 0x8B, 0x4F, 0x8C, 0x52, 0x00, 0x00, 0xEE, 0x96, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0xAC, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x62, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xEE, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x93, 0x5D, 0xE7, 0x63, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x66, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8E, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x65, /* 0xF8-0xFB */ + 0xE7, 0x64, 0x8C, 0x79, 0xE7, 0x67, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x72, /* 0x00-0x03 */ + 0x00, 0x00, 0xE7, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x8D, 0xDA, 0xE7, 0x68, 0x00, 0x00, /* 0x08-0x0B */ + 0xE7, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x6B, 0xE7, 0x6D, /* 0x10-0x13 */ + 0x95, 0xE3, 0xE7, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE7, 0x6C, 0x00, 0x00, 0xE7, 0x70, /* 0x18-0x1B */ + 0xE7, 0x6E, 0x8B, 0x50, 0x00, 0x00, 0xE7, 0x6F, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x72, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x94, 0x79, 0x97, 0xD6, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x53, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x73, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x97, 0x41, 0xE7, 0x75, 0x00, 0x00, 0xE7, 0x74, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x78, 0x97, 0x60, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x77, 0x00, 0x00, /* 0x40-0x43 */ + 0x8A, 0x8D, 0xE7, 0x76, 0xE7, 0x7B, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE7, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE7, 0x79, 0x93, 0x51, 0xE7, 0x7C, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x7D, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE7, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x8C, /* 0x5C-0x5F */ + 0x00, 0x00, 0x8C, 0x44, 0xE7, 0x80, 0xE7, 0x81, /* 0x60-0x63 */ + 0xE7, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x68, /* 0x98-0x9B */ + 0xE7, 0x83, 0x00, 0x00, 0x8E, 0xAB, 0xE7, 0x84, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x85, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x9F, /* 0xA4-0xA7 */ + 0x99, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE7, 0x86, 0xE3, 0x90, 0xE7, 0x87, /* 0xAC-0xAF */ + 0x92, 0x43, 0x90, 0x4A, 0x94, 0x5F, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x88, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0xD3, 0x92, 0xD2, /* 0xB8-0xBB */ + 0x8D, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x92, 0x48, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x49, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x96, 0x98, 0x90, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x8C, 0x7D, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8B, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x95, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x89, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x8B, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE7, 0x8A, 0x89, 0xDE, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x93, 0xF4, 0xE7, 0x8C, 0x94, 0x97, /* 0xE8-0xEB */ + 0x00, 0x00, 0x93, 0x52, 0x00, 0x00, 0xE7, 0x8D, /* 0xEC-0xEF */ + 0x8F, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE7, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x96, 0xC0, /* 0xF4-0xF7 */ + 0xE7, 0x9E, 0xE7, 0x91, 0xE7, 0x92, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x92, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0x91, 0xDE, 0x91, 0x97, 0x00, 0x00, 0x93, 0xA6, /* 0x00-0x03 */ + 0x00, 0x00, 0xE7, 0x90, 0x8B, 0x74, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x99, /* 0x08-0x0B */ + 0x00, 0x00, 0xE7, 0x96, 0xE7, 0xA3, 0x93, 0xA7, /* 0x0C-0x0F */ + 0x92, 0x80, 0xE7, 0x93, 0x00, 0x00, 0x92, 0xFC, /* 0x10-0x13 */ + 0x93, 0x72, 0xE7, 0x94, 0xE7, 0x98, 0x90, 0x80, /* 0x14-0x17 */ + 0x00, 0x00, 0x94, 0x87, 0x92, 0xCA, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x90, 0xC0, 0xE7, 0x97, 0x91, 0xAC, /* 0x1C-0x1F */ + 0x91, 0xA2, 0xE7, 0x95, 0x88, 0xA7, 0x98, 0x41, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x9A, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0xDF, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x8F, 0x54, 0x90, 0x69, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE7, 0x9C, 0xE7, 0x9B, 0x00, 0x00, /* 0x34-0x37 */ + 0x88, 0xED, 0xE7, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x95, 0x4E, 0x00, 0x00, 0xE7, 0xA5, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 
0x00, 0x93, 0xD9, 0x90, 0x8B, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x92, 0x78, 0x00, 0x00, 0x8B, 0xF6, /* 0x44-0x47 */ + 0x00, 0x00, 0xE7, 0xA4, 0x97, 0x56, 0x89, 0x5E, /* 0x48-0x4B */ + 0x00, 0x00, 0x95, 0xD5, 0x89, 0xDF, 0xE7, 0x9F, /* 0x4C-0x4F */ + 0xE7, 0xA0, 0xE7, 0xA1, 0xE7, 0xA2, 0x93, 0xB9, /* 0x50-0x53 */ + 0x92, 0x42, 0x88, 0xE1, 0xE7, 0xA6, 0x00, 0x00, /* 0x54-0x57 */ + 0xE7, 0xA7, 0xEA, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x91, 0xBB, 0x00, 0x00, 0xE7, 0xA8, 0x00, 0x00, /* 0x5C-0x5F */ + 0x89, 0x93, 0x91, 0x6B, 0x00, 0x00, 0x8C, 0xAD, /* 0x60-0x63 */ + 0x00, 0x00, 0x97, 0x79, 0x00, 0x00, 0xEE, 0x99, /* 0x64-0x67 */ + 0xE7, 0xA9, 0x93, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x91, 0x98, 0x8E, 0xD5, 0xE7, 0xAA, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xAD, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x8F, 0x85, 0xE7, 0xAB, 0x91, 0x4A, /* 0x74-0x77 */ + 0x91, 0x49, 0x00, 0x00, 0x88, 0xE2, 0x00, 0x00, /* 0x78-0x7B */ + 0x97, 0xC9, 0xE7, 0xAF, 0x00, 0x00, 0x94, 0xF0, /* 0x7C-0x7F */ + + 0xE7, 0xB1, 0xE7, 0xB0, 0xE7, 0xAE, 0xE2, 0x84, /* 0x80-0x83 */ + 0x8A, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x8E, /* 0x84-0x87 */ + 0x00, 0x00, 0xE7, 0xB3, 0xE7, 0xB2, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB4, /* 0x8C-0x8F */ + 0x00, 0x00, 0x97, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, 0xDF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x4D, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE7, 0xB5, 0x00, 0x00, 0x8E, 0xD7, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB6, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE7, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x93, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x88, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x8D, 0x78, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x59, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBC, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x9A, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8C, 0x53, 0xE7, 0xB9, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE7, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x95, 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x8A, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x97, 0x58, 0x00, 0x00, 0x8B, 0xBD, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x93, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBD, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 
0x00, 0xEE, 0x9C, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE7, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x9D, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x93, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE7, 0xC1, 0x00, 0x00, 0xE7, 0xC0, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x93, 0xD1, 0xE7, 0xC2, 0x8F, 0x55, /* 0x48-0x4B */ + 0x8E, 0xDE, 0x94, 0x7A, 0x92, 0x91, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xF0, 0x00, 0x00, /* 0x50-0x53 */ + 0x90, 0x8C, 0x00, 0x00, 0xE7, 0xC3, 0x00, 0x00, /* 0x54-0x57 */ + 0xE7, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x90, 0x7C, 0xE7, 0xC5, /* 0x60-0x63 */ + 0x00, 0x00, 0xE7, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE7, 0xC7, 0x97, 0x8F, 0x00, 0x00, /* 0x68-0x6B */ + 0x8F, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xC9, 0xE7, 0xC8, /* 0x70-0x73 */ + 0x00, 0x00, 0x8D, 0x79, 0x00, 0x00, 0x8D, 0x93, /* 0x74-0x77 */ + 0x8E, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCC, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x86, /* 0x84-0x87 */ + 0x00, 0x00, 0xE7, 0xCB, 0x00, 0x00, 0xE7, 0xCA, /* 0x88-0x8B */ + 0x00, 0x00, 0x91, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x8C, 0xED, 0x00, 0x00, 0x90, 0xC1, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xAE, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x8F, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCD, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x8F, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD0, 0xE7, 0xCE, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCF, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE7, 0xD2, 0xE7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8F, 0xF8, 0x00, 0x00, 0xE7, 0xD3, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE7, 0xD4, 0xE7, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xCE, 0x8D, 0xD1, /* 0xC4-0xC7 */ + 0x8E, 0xDF, 0xE7, 0xD6, 0x00, 0x00, 0xE7, 0xD7, /* 0xC8-0xCB */ + 0x97, 0xA2, 0x8F, 0x64, 0x96, 0xEC, 0x97, 0xCA, /* 0xCC-0xCF */ + 0xE7, 0xD8, 0x8B, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD9, 0xEE, 0x9F, /* 0xD4-0xD7 */ + 0x93, 0x42, 0x00, 0x00, 0xEE, 0x9E, 0xE7, 0xDC, /* 0xD8-0xDB */ + 0x8A, 0x98, 0x90, 0x6A, 0xEE, 0xA0, 0xE7, 0xDA, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE7, 0xDB, 0x00, 0x00, 0x92, 0xDE, /* 0xE0-0xE3 */ + 0xEE, 0xA3, 0xEE, 0xA4, 0x96, 0x74, 0x8B, 0xFA, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB 
*/ + 0x00, 0x00, 0xEE, 0xA1, 0xEE, 0xA2, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE7, 0xDE, 0xE7, 0xDF, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE7, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_92[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA5, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA7, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x93, 0xDD, 0x8A, 0x62, 0x00, 0x00, /* 0x0C-0x0F */ + 0xEE, 0xA6, 0xE7, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE7, 0xE2, 0xE7, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE8, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE7, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x97, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xD8, /* 0x34-0x37 */ + 0x00, 0x00, 0xEE, 0xAE, 0xEE, 0xA8, 0x00, 0x00, /* 0x38-0x3B */ + 0xEE, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xED, /* 0x3C-0x3F */ + 0xEE, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x93, 0x53, 0xE7, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE7, 0xEB, 0xE7, 0xE9, 0x00, 0x00, 0xE7, 0xEE, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xAB, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE7, 0xEF, 0xEE, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE7, /* 0x54-0x57 */ + 0x00, 0x00, 0xEE, 0xAC, 0xE7, 0xF4, 0x89, 0x94, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE6, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xAB, 0x00, 0x00, /* 0x60-0x63 */ + 0xE7, 0xEA, 0x00, 0x00, 0x8F, 0xDE, 0xEE, 0xAF, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x8D, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB1, /* 0x74-0x77 */ + 0xEE, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x67, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x8B, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x65, /* 0x80-0x83 */ + 0x00, 0x00, 0x93, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xED, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x91, 0x4C, 0x00, 0x00, 0xE7, 0xF2, /* 0x90-0x93 */ + 0x00, 0x00, 0xE7, 0xEC, 0xE7, 0xF1, 0x00, 0x00, /* 0x94-0x97 */ + 0x96, 0xC1, 0x00, 0x00, 0x92, 0xB6, 0xE7, 0xF3, /* 0x98-0x9B */ + 0xE7, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB0, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x91, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF7, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE7, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF5, /* 0xCC-0xCF */ + 0xEE, 0xB6, 0x00, 0x00, 0x96, 0x4E, 0xEE, 0xBA, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xEE, 0xB8, 0x00, 0x00, 0xEE, 0xB4, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEE, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xEE, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x8F, 0x9B, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB3, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE7, 0xF8, 0x95, 0xDD, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x89, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x95, 0x65, 0x92, 0x92, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x8B, 0x98, 0xED, 0x49, 0xE7, 0xFA, 0xEE, 0xBD, /* 0xF8-0xFB */ + 0x8D, 0x7C, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC2, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0x4B, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF9, /* 0x0C-0x0F */ + 0x90, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x90, 0x8E, 0xE8, 0x40, 0xE8, 0x42, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xEE, 0xC1, 0xEE, 0xBF, 0x00, 0x00, /* 0x1C-0x1F */ + 0x8F, 0xF9, 0xEE, 0xBC, 0xE8, 0x41, 0xE8, 0x43, /* 0x20-0x23 */ + 0x00, 0x00, 0xEE, 0xBB, 0x8B, 0xD1, 0x00, 0x00, /* 0x24-0x27 */ + 0x95, 0x64, 0x00, 0x00, 0x00, 0x00, 0x8E, 0xE0, /* 0x28-0x2B */ + 0x98, 0x42, 0x00, 0x00, 0xE7, 0xFC, 0x8D, 0xF6, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x5E, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE8, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x44, 0xE8, 0x46, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE7, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xED, 0x42, 0x00, 0x00, 0x00, 0x00, 0x93, 0xE7, /* 0x48-0x4B */ + 0x00, 0x00, 0x93, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x92, 0xD5, 0x00, 0x00, 0xE8, 0x4B, 0xEE, 0xC4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x62, /* 0x58-0x5B */ + 0xE8, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x8C, 0x4C, 0x00, 0x00, 0xE8, 0x4A, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEE, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x8C, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE8, 0x49, 0x00, 0x00, 0x8F, 0xDF, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x8A, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 
0xE8, 0x4F, 0x00, 0x00, 0x8D, 0xBD, 0x91, 0x99, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x92, 0xC8, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEE, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x5A, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE8, 0x4D, 0xE8, 0x4E, 0x92, 0xC1, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE8, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE8, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x56, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE8, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xE8, 0x58, 0x93, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x51, 0xE8, 0x52, /* 0xD4-0xD7 */ + 0xE8, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE8, 0x57, 0xEE, 0xC7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x8B, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE8, 0x5A, 0xE8, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE8, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xEE, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_94[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5E, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5F, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE8, 0x60, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x5D, /* 0x10-0x13 */ + 0xE8, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x8F, 0xE0, 0x93, 0xA8, 0xE8, 0x5B, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE8, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x62, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xEE, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE8, 0x63, 0xE8, 0x61, 0x00, 0x00, /* 0x34-0x37 */ + 0x91, 0xF6, 0x00, 0x00, 0xE8, 0x65, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE8, 0x68, 0xEE, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xEE, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x8A, 0xD3, 0xE8, 0x67, 0x96, 0xF8, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x73, 0xE8, 0x69, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x6C, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0x6A, 0x00, 0x00, 0xE8, 0x6B, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x6D, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE8, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE8, 0x70, 0x00, 0x00, 0xE8, 0x71, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE8, 0x74, 0xE8, 0x72, 0xE8, 0x75, 0xE8, 0x77, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE8, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ +}; + +static const unsigned char u2c_95[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0xB7, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x96, 0xE5, 0x00, 0x00, 0xE8, 0x78, 0x91, 0x4D, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x79, /* 0x84-0x87 */ + 0x00, 0x00, 0x95, 0xC2, 0xE8, 0x7A, 0x8A, 0x4A, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x5B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x8A, 0xD5, 0xEE, 0xCC, 0x8A, 0xD4, /* 0x90-0x93 */ + 0xE8, 0x7B, 0x00, 0x00, 0xE8, 0x7C, 0x00, 0x00, /* 0x94-0x97 */ + 0xE8, 0x7D, 0xE8, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE8, 0x80, 0x00, 0x00, 0x8A, 0xD6, 0x8A, 0x74, /* 0xA0-0xA3 */ + 0x8D, 0x7D, 0x94, 0xB4, 0x00, 0x00, 0xE8, 0x82, /* 0xA4-0xA7 */ + 0xE8, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE8, 0x83, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x7B, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE8, 0x86, 0x00, 0x00, 0xE8, 0x85, /* 0xB8-0xBB */ + 0xE8, 
0x84, 0x00, 0x00, 0xE8, 0x87, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x8A, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xC5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x88, 0x00, 0x00, /* 0xC8-0xCB */ + 0xE8, 0x8C, 0xE8, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE8, 0x8E, 0xE8, 0x8D, 0xE8, 0x8F, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x93, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE8, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE8, 0x91, 0xE8, 0x93, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE8, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char u2c_96[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x95, 0x8C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE8, 0x94, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xE8, 0x95, 0x00, 0x00, 0x8D, 0xE3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0x96, 0xE8, 0x97, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x68, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0x6A, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0xA2, /* 0x3C-0x3F */ + 0x91, 0xC9, 0x00, 0x00, 0xE8, 0x98, 0x00, 0x00, /* 0x40-0x43 */ + 0x95, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x9B, /* 0x48-0x4B */ + 0xE8, 0x99, 0x8D, 0x7E, 0x00, 0x00, 0xE8, 0x9A, /* 0x4C-0x4F */ + 0x8C, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0xC3, /* 0x58-0x5B */ + 0xE8, 0x9D, 0xE8, 0x9F, 0xE8, 0x9E, 0xE8, 0xA0, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x40, 0x90, 0x77, /* 0x60-0x63 */ + 0x8F, 0x9C, 0x8A, 0xD7, 0xE8, 0xA1, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0x86, 0x00, 0x00, /* 0x68-0x6B */ + 0xE8, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x89, 0x41, 0x00, 0x00, 0xE8, 0xA2, 0x92, 0xC2, /* 0x70-0x73 */ + 0x00, 0x00, 0x97, 0xCB, 0x93, 0xA9, 0xE8, 0x9C, /* 0x74-0x77 */ + 0x97, 0xA4, 0x00, 0x00, 0x8C, 0xAF, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x97, 0x7A, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x8B, 0xF7, 0x97, 0xB2, 0x00, 0x00, /* 0x84-0x87 */ + 0x8C, 0x47, 0x00, 0x00, 0x91, 0xE0, 0xE4, 0x40, /* 0x88-0x8B */ + 0x00, 0x00, 0xE8, 0xA4, 0x8A, 0x4B, 0x90, 0x8F, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x8A, 0x75, 0xE8, 0xA6, 0x00, 0x00, 0xE8, 0xA7, /* 0x94-0x97 */ + 0xE8, 0xA5, 0x8C, 0x84, 0x00, 0x00, 0x8D, 0xDB, /* 0x98-0x9B */ + 0x8F, 0xE1, 0xEE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x89, 0x42, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD7, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA9, /* 0xA4-0xA7 */ + 0xE7, 
0xAC, 0x00, 0x00, 0xE8, 0xA8, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xD0, /* 0xAC-0xAF */ + 0xE8, 0xAC, 0xE8, 0xAA, 0xE8, 0xAB, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE8, 0xAD, 0x00, 0x00, 0xE8, 0xAE, 0x97, 0xEA, /* 0xB4-0xB7 */ + 0xE8, 0xAF, 0xE8, 0xB0, 0x00, 0x00, 0x90, 0xC7, /* 0xB8-0xBB */ + 0x94, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x90, 0x9D, 0x8A, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x97, 0x59, 0x89, 0xEB, 0x8F, 0x57, 0x8C, 0xD9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE8, 0xB3, 0x00, 0x00, 0xE8, 0xB2, /* 0xC8-0xCB */ + 0x8E, 0x93, 0xE8, 0xB4, 0xE8, 0xB1, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x8E, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE8, 0xB8, 0xE5, 0xAB, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x99, 0xD4, 0x00, 0x00, 0x90, 0x97, /* 0xD8-0xDB */ + 0xE8, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xA3, 0x93, 0xEF, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x89, 0x4A, 0x00, 0x00, 0x90, 0xE1, 0x8E, 0xB4, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x95, 0xB5, 0x00, 0x00, 0x89, 0x5F, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xEB, 0x97, 0x8B, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE8, 0xB9, 0x00, 0x00, 0x93, 0x64, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_97[512] = { + 0x8E, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE8, 0xBA, 0x00, 0x00, 0xE8, 0xBB, 0x90, 0x6B, /* 0x04-0x07 */ + 0xE8, 0xBC, 0x00, 0x00, 0x97, 0xEC, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE8, 0xB7, 0xE8, 0xBE, 0xE8, 0xC0, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE8, 0xBF, 0x00, 0x00, 0xE8, 0xBD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC1, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE8, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x91, 0x9A, 0x00, 0x00, 0x89, 0xE0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE8, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x96, 0xB6, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC4, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE8, 0xC5, 0x00, 0x00, 0x98, 0x49, 0xEE, 0xD1, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x9E, 0x50, 0xE8, 0xC6, 0x00, 0x00, 0xEE, 0xD2, /* 0x38-0x3B */ + 0x00, 0x00, 0xE8, 0xC7, 0xE8, 0xC8, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCC, 0xEE, 0xD3, /* 0x40-0x43 */ + 0xE8, 0xC9, 0x00, 0x00, 0xE8, 0xCA, 0x00, 0x00, /* 0x44-0x47 */ + 0xE8, 0xCB, 0xE8, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEE, 0xD4, 0x00, 0x00, 0xEE, 0xD5, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEE, 0xD6, 0x90, 0xC2, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xEE, 0xD7, 0x96, 0xF5, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x90, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE8, 0xCE, 0x00, 0x00, 0x94, 0xF1, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0xCF, 0xEA, 0x72, 0x96, 0xCA, 0x00, 0x00, /* 0x60-0x63 */ + 0xE8, 0xD0, 0x00, 0x00, 0xE8, 0xD1, 0x00, 0x00, /* 0x64-0x67 */ + 0xE8, 0xD2, 0x8A, 0x76, 0x00, 0x00, 0xE8, 0xD4, /* 0x68-0x6B */ + 0x00, 0x00, 0x90, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xE8, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x8C, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE8, 0xD6, 0xE8, 0xDA, 0x00, 0x00, /* 0x78-0x7B */ + 0xE8, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 
0x00, 0xE8, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x8A, 0x93, 0xE8, 0xD7, 0xE8, 0xDB, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xDC, /* 0x88-0x8B */ + 0x00, 0x00, 0x88, 0xC6, 0x00, 0x00, 0xE8, 0xDD, /* 0x8C-0x8F */ + 0xE8, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x8F, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xE8, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x8B, 0x66, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE2, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE1, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE8, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x91, /* 0xA8-0xAB */ + 0x00, 0x00, 0x95, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE3, /* 0xB0-0xB3 */ + 0xE8, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE5, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE8, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE8, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0xD8, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE8, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE8, 0xEA, 0x94, 0x42, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xEC, 0x89, 0xB9, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE8, 0xEF, 0xE8, 0xEE, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x43, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8B, 0xBF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0x00, 0x00, 0x95, 0xC5, 0x92, 0xB8, 0x8D, 0xA0, /* 0x00-0x03 */ + 0x00, 0x00, 0x8D, 0x80, 0x8F, 0x87, 0x00, 0x00, /* 0x04-0x07 */ + 0x90, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE8, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF0, /* 0x0C-0x0F */ + 0x97, 0x61, 0x8A, 0xE6, 0x94, 0xD0, 0x93, 0xDA, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x9C, /* 0x14-0x17 */ + 0x97, 0xCC, 0x00, 0x00, 0x8C, 0x7A, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE8, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE8, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x96, 0x6A, 0x93, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x89, 0x6F, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF5, /* 0x34-0x37 */ + 0xE8, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x95, 0x70, /* 0x38-0x3B */ + 0x97, 0x8A, 0xE8, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF7, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF9, /* 0x48-0x4B */ + 0x91, 0xE8, 0x8A, 0x7A, 0x8A, 0x7B, 0xE8, 0xF8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x8A, 0xE7, 
0x8C, 0xB0, 0x00, 0x00, 0xEE, 0xD8, /* 0x54-0x57 */ + 0x8A, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x93, 0x5E, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x97, 0xDE, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xEE, 0xD9, 0x00, 0x00, 0x8C, 0xDA, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xFA, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xFB, /* 0x6C-0x6F */ + 0xE8, 0xFC, 0xE9, 0x40, 0x00, 0x00, 0xE9, 0x42, /* 0x70-0x73 */ + 0xE9, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x95, 0x97, 0x00, 0x00, 0xE9, 0x43, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x44, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE9, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x46, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x48, /* 0xC0-0xC3 */ + 0xE9, 0x47, 0x00, 0x00, 0xE9, 0x49, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x94, 0xF2, /* 0xD8-0xDB */ + 0xE3, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x90, 0x48, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x51, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE9, 0x4A, 0x00, 0x00, 0xE9, 0x4B, /* 0xE8-0xEB */ + 0x00, 0x00, 0x99, 0xAA, 0x9F, 0x5A, 0x94, 0xD1, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xF9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x88, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x8E, 0x94, 0x96, 0x4F, 0x8F, 0xFC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x4C, /* 0x00-0x03 */ + 0x00, 0x00, 0x96, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE9, 0x4D, 0x97, 0x7B, 0x00, 0x00, /* 0x08-0x0B */ + 0x89, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x8E, 0x60, 0x00, 0x00, 0xE9, 0x4E, 0x89, 0xEC, /* 0x10-0x13 */ + 0xE9, 0x4F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE9, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE9, 0x52, 0xE9, 0x53, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE9, 0x55, 0xE9, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE9, 0x54, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDC, /* 0x24-0x27 */ + 0x8A, 0xD9, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE9, 0x56, 0x00, 0x00, 0xE9, 0x57, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xE9, 0x58, 0xE9, 0x59, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x5A, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE9, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE9, 0x5B, 0x00, 0x00, 0xE9, 0x5E, /* 0x48-0x4B */ + 0xE9, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE9, 0x5D, 0xE9, 0x5F, 0xE9, 0x60, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE9, 0x62, 0x00, 0x00, 0x8B, 0xC0, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x8E, 0xF1, 0xE9, 0x63, /* 0x94-0x97 */ + 0xE9, 0x64, 0x8D, 0x81, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDE, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE9, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x8A, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x94, 0x6E, 0xE9, 0x66, 0xE9, 0x67, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x79, /* 0xB0-0xB3 */ + 0x93, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE9, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x94, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x91, 0xCA, 0x89, 0x77, 0x8B, 0xEC, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x8B, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x92, 0x93, 0xE9, 0x6D, 0x8B, 0xEE, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x89, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE9, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x6A, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE9, 0x6B, 0x00, 0x00, 0xE9, 0x69, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x77, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE9, 0x6E, 0xE9, 0x6F, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE9, 0x70, 0xE9, 0x71, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE9, 0x73, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x72, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x78, /* 0xFC-0xFF */ 
+}; + +static const unsigned char u2c_9A[512] = { + 0x00, 0x00, 0xE9, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE9, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0x52, 0xE9, 0x75, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x91, 0x9B, 0x8C, 0xB1, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE9, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x91, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x79, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x93, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x7A, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x80, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE9, 0x7D, 0x00, 0x00, 0xE9, 0x7C, 0xE9, 0x7E, /* 0x40-0x43 */ + 0x00, 0x00, 0xE9, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xE9, 0x82, 0xEE, 0xDF, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE9, 0x81, 0x00, 0x00, 0xE9, 0x84, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x8B, 0xC1, 0xE9, 0x83, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x85, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x86, 0x00, 0x00, /* 0x60-0x63 */ + 0xE9, 0x88, 0xE9, 0x87, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE9, 0x89, 0xE9, 0x8B, 0xE9, 0x8A, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x8D, 0x9C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE9, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE9, 0x8D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x8A, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE9, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE9, 0x8F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x90, 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x90, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE9, 0x91, 0x00, 0x00, 0xE9, 0x92, /* 0xD0-0xD3 */ + 
0xE9, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x8D, 0x82, 0xEE, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xEE, 0xE1, 0x00, 0x00, 0xE9, 0x94, 0xE9, 0x95, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x96, 0xE9, 0x97, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x98, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x94, 0xAF, 0xE9, 0x9A, /* 0xE8-0xEB */ + 0x00, 0x00, 0x95, 0x45, 0xE9, 0x9B, 0xE9, 0x99, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE9, 0x9D, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE9, 0x9C, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x9E, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x9F, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_9B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA0, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE9, 0xA1, 0x00, 0x00, 0xE9, 0xA2, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA3, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA4, 0xE9, 0xA5, /* 0x20-0x23 */ + 0x00, 0x00, 0xE9, 0xA6, 0x00, 0x00, 0xE9, 0xA7, /* 0x24-0x27 */ + 0xE9, 0xA8, 0xE9, 0xA9, 0xE9, 0xAA, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xAB, 0xE9, 0xAC, /* 0x2C-0x2F */ + 0x00, 0x00, 0x9F, 0x54, 0xE9, 0xAD, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF6, /* 0x38-0x3B */ + 0x8B, 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x8A, 0x40, 0x8D, 0xB0, 0xE9, 0xAF, /* 0x40-0x43 */ + 0xE9, 0xAE, 0x96, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xE9, 0xB1, 0xE9, 0xB2, 0xE9, 0xB0, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE9, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x96, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE9, 0xB4, 0x00, 0x00, 0x8B, 0x9B, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x44, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE3, 0x00, 0x00, /* 0x70-0x73 */ + 0xE9, 0xB5, 0xEE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB7, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0xBC, 0xEE, 0xE4, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE9, 0xB8, 0x95, 0xA9, 0xE9, 0xB6, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB9, 0xE9, 0xBA, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xBB, /* 0x9C-0x9F */ + 0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE9, 0xBD, 0x00, 0x00, 0x96, 0x8E, 0x8E, 0x4C, /* 0xA8-0xAB */ + 
0x00, 0x00, 0x8D, 0xF8, 0x91, 0x4E, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEE, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE9, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE9, 0xC1, 0x00, 0x00, 0xEE, 0xE6, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE9, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x8C, 0xEF, 0xE9, 0xC0, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC3, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE9, 0xC4, 0xE9, 0xC5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE9, 0xC9, 0x00, 0x00, 0x8E, 0x49, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x91, 0xE2, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE9, 0xCA, 0xE9, 0xC7, 0xE9, 0xC6, /* 0xE0-0xE3 */ + 0xE9, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x8C, 0x7E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE9, 0xCE, 0xE9, 0xCD, 0xE9, 0xCC, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x88, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9C[512] = { + 0xEE, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE9, 0xD8, 0x00, 0x00, 0xE9, 0xD4, 0x00, 0x00, /* 0x04-0x07 */ + 0xE9, 0xD5, 0xE9, 0xD1, 0xE9, 0xD7, 0x00, 0x00, /* 0x08-0x0B */ + 0xE9, 0xD3, 0x8A, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x98, 0x6B, 0x00, 0x00, 0xE9, 0xD6, 0xE9, 0xD2, /* 0x10-0x13 */ + 0xE9, 0xD0, 0xE9, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xE9, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE9, 0xDC, 0xE9, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x95, 0x68, 0xE9, 0xD9, 0x88, 0xF1, /* 0x2C-0x2F */ + 0xE9, 0xDE, 0x00, 0x00, 0xE9, 0xE0, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x8A, 0x8F, 0xE9, 0xCB, 0x89, 0x56, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE2, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE1, 0xE9, 0xDF, /* 0x44-0x47 */ + 0x92, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x96, 0x90, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0xD8, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE3, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE9, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE5, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE6, 0x00, 0x00, /* 0x74-0x77 */ + 0xE9, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x92, 0xB9, 0x00, 0x00, 0xE9, 0xE8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x94, 0xB5, 0x00, 0x00, 0xE9, 0xED, /* 0xE8-0xEB */ + 0xE9, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE9, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x96, 0x50, /* 0xF0-0xF3 */ + 0x96, 0xC2, 0x00, 0x00, 0x93, 0xCE, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xEE, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xEF, 0x93, 0xBC, /* 0x04-0x07 */ + 0xE9, 0xEC, 0xE9, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0xA8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF7, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE9, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x89, 0x95, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF4, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF1, 0x00, 0x00, /* 0x24-0x27 */ + 0x8A, 0x9B, 0x00, 0x00, 0xE9, 0xF0, 0x8E, 0xB0, /* 0x28-0x2B */ + 0x89, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8D, 0x83, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xFA, 0xE9, 0xF9, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE9, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE9, 0xF5, 0x00, 0x00, 0xE9, 0xFB, 0x00, 0x00, /* 0x44-0x47 */ + 0xE9, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xEA, 0x44, 0xEA, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xEA, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x89, 0x4C, 0xEA, 0x40, 0xEA, 0x41, 0x00, 0x00, /* 0x5C-0x5F */ + 0x8D, 0x94, 0x96, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEA, 
0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE9, /* 0x68-0x6B */ + 0x96, 0x51, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4A, /* 0x6C-0x6F */ + 0xEE, 0xE8, 0x00, 0x00, 0xEA, 0x46, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4B, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x48, /* 0x84-0x87 */ + 0x00, 0x00, 0xEA, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0x7B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4C, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEA, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xEA, 0x4E, 0x00, 0x00, 0xEA, 0x49, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x4F, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x92, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEA, 0x53, 0x00, 0x00, 0xEA, 0x54, 0xEA, 0x52, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xEA, 0x51, 0xEA, 0x57, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0x50, 0x00, 0x00, 0xEA, 0x55, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x56, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x59, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEA, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x5B, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEA, 0x5C, 0x00, 0x00, 0xEA, 0x5D, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x68, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xEA, 0x5A, 0x91, 0xE9, 0x8D, 0xEB, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xEA, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xEE, 0xEB, 0xEA, 0x5F, 0xEA, 0x60, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x61, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xEA, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x8C, 0xB2, 0xEA, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xEA, 0x64, 0x00, 0x00, 0x8E, 0xAD, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xEA, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xEA, 0x66, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x67, /* 0x88-0x8B */ + 0xEA, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEA, 0x6B, 0xEA, 0x69, 0x98, 0x5B, /* 0x90-0x93 */ + 0x00, 0x00, 0xEA, 0x6A, 0x00, 0x00, 0x97, 0xED, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xEA, 0x6C, 0x00, 0x00, 0x97, 0xD9, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xEA, 0x6D, 0x94, 0x9E, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xEA, 0x6E, 0xEA, 0x70, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEA, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEA, 0x6F, 0x8D, 0x8D, 0x96, 0xCB, 0x96, 0x83, /* 0xB8-0xBB */ + 0x9B, 0xF5, 0x00, 0x00, 0x9F, 0x80, 0x96, 0x9B, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x89, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xEA, 0x73, 0x8B, 0x6F, 0xEA, 0x74, 0xEA, 0x75, /* 0xCC-0xCF */ + 0xEA, 0x76, 0xEE, 0xEC, 0x8D, 0x95, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xE0, 0xD2, 0x96, 0xD9, 0x00, 0x00, 0x91, 0xE1, /* 0xD8-0xDB */ + 0xEA, 0x78, 0xEA, 0x7A, 0xEA, 0x79, 0x00, 0x00, /* 0xDC-0xDF */ + 0xEA, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xEA, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEA, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x7E, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEA, 0x80, 0x00, 0x00, 0xEA, 0x81, 0xEA, 0x82, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEA, 0x83, 0x00, 0x00, 0xEA, 0x84, /* 0xF8-0xFB */ + 0xEA, 0x85, 0xEA, 0x86, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x87, /* 0x04-0x07 */ + 0xEA, 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 
0x00, 0x00, 0x00, 0x93, 0x43, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8C, 0xDB, /* 0x10-0x13 */ + 0x00, 0x00, 0xEA, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x91, 0x6C, 0xEA, 0x8B, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xEA, 0x8C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x40, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x8D, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x8E, 0xE2, 0x56, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD8, 0xE8, 0xEB, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x8F, 0x00, 0x00, /* 0x50-0x53 */ + 0xEA, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x92, /* 0x5C-0x5F */ + 0xEA, 0x93, 0xEA, 0x94, 0x97, 0xEE, 0xEA, 0x91, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x95, 0xEA, 0x96, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x98, 0x00, 0x00, /* 0x68-0x6B */ + 0xEA, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9A, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0x9B, 0xEA, 0x99, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x97, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xEA, 0x9C, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xEA, 0x9D, 0xE2, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEA, 0x9E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xED, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xEE, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_FA[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x73, 0xED, 0x7E, /* 0x0C-0x0F */ + 0xED, 0x80, 0xED, 0x95, 0xED, 0xBC, 0xED, 0xCC, /* 0x10-0x13 */ + 0xED, 0xCE, 0xED, 0xF9, 0xEE, 0x42, 0xEE, 0x59, /* 0x14-0x17 */ + 0xEE, 0x61, 0xEE, 0x62, 0xEE, 0x63, 0xEE, 0x65, /* 0x18-0x1B */ + 0xEE, 0x69, 0xEE, 0x6C, 0xEE, 0x75, 0xEE, 0x81, /* 0x1C-0x1F */ + 0xEE, 0x83, 0xEE, 0x84, 0xEE, 0x8D, 0xEE, 0x95, /* 0x20-0x23 */ + 0xEE, 0x97, 0xEE, 0x98, 0xEE, 0x9B, 0xEE, 0xB7, /* 0x24-0x27 */ + 0xEE, 0xBE, 0xEE, 0xCE, 
0xEE, 0xDA, 0xEE, 0xDB, /* 0x28-0x2B */ + 0xEE, 0xDD, 0xEE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0x81, 0x49, 0xEE, 0xFC, 0x81, 0x94, /* 0x00-0x03 */ + 0x81, 0x90, 0x81, 0x93, 0x81, 0x95, 0xEE, 0xFB, /* 0x04-0x07 */ + 0x81, 0x69, 0x81, 0x6A, 0x81, 0x96, 0x81, 0x7B, /* 0x08-0x0B */ + 0x81, 0x43, 0x81, 0x7C, 0x81, 0x44, 0x81, 0x5E, /* 0x0C-0x0F */ + 0x82, 0x4F, 0x82, 0x50, 0x82, 0x51, 0x82, 0x52, /* 0x10-0x13 */ + 0x82, 0x53, 0x82, 0x54, 0x82, 0x55, 0x82, 0x56, /* 0x14-0x17 */ + 0x82, 0x57, 0x82, 0x58, 0x81, 0x46, 0x81, 0x47, /* 0x18-0x1B */ + 0x81, 0x83, 0x81, 0x81, 0x81, 0x84, 0x81, 0x48, /* 0x1C-0x1F */ + 0x81, 0x97, 0x82, 0x60, 0x82, 0x61, 0x82, 0x62, /* 0x20-0x23 */ + 0x82, 0x63, 0x82, 0x64, 0x82, 0x65, 0x82, 0x66, /* 0x24-0x27 */ + 0x82, 0x67, 0x82, 0x68, 0x82, 0x69, 0x82, 0x6A, /* 0x28-0x2B */ + 0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E, /* 0x2C-0x2F */ + 0x82, 0x6F, 0x82, 0x70, 0x82, 0x71, 0x82, 0x72, /* 0x30-0x33 */ + 0x82, 0x73, 0x82, 0x74, 0x82, 0x75, 0x82, 0x76, /* 0x34-0x37 */ + 0x82, 0x77, 0x82, 0x78, 0x82, 0x79, 0x81, 0x6D, /* 0x38-0x3B */ + 0x81, 0x5F, 0x81, 0x6E, 0x81, 0x4F, 0x81, 0x51, /* 0x3C-0x3F */ + 0x81, 0x4D, 0x82, 0x81, 0x82, 0x82, 0x82, 0x83, /* 0x40-0x43 */ + 0x82, 0x84, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x44-0x47 */ + 0x82, 0x88, 0x82, 0x89, 0x82, 0x8A, 0x82, 0x8B, /* 0x48-0x4B */ + 0x82, 0x8C, 0x82, 0x8D, 0x82, 0x8E, 0x82, 0x8F, /* 0x4C-0x4F */ + 0x82, 0x90, 0x82, 0x91, 0x82, 0x92, 0x82, 0x93, /* 0x50-0x53 */ + 0x82, 0x94, 0x82, 0x95, 0x82, 0x96, 0x82, 0x97, /* 0x54-0x57 */ + 0x82, 0x98, 0x82, 0x99, 0x82, 0x9A, 0x81, 0x6F, /* 0x58-0x5B */ + 0x81, 0x62, 0x81, 0x70, 0x81, 0x60, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0xA1, 0x00, 0xA2, 0x00, 0xA3, /* 0x60-0x63 */ + 0x00, 0xA4, 0x00, 0xA5, 0x00, 0xA6, 0x00, 0xA7, /* 0x64-0x67 */ + 0x00, 0xA8, 0x00, 0xA9, 0x00, 0xAA, 0x00, 0xAB, /* 0x68-0x6B */ + 0x00, 0xAC, 0x00, 0xAD, 0x00, 0xAE, 0x00, 0xAF, /* 0x6C-0x6F */ + 0x00, 0xB0, 0x00, 0xB1, 0x00, 0xB2, 0x00, 0xB3, /* 0x70-0x73 */ + 0x00, 0xB4, 0x00, 0xB5, 0x00, 0xB6, 0x00, 0xB7, /* 0x74-0x77 */ + 0x00, 0xB8, 0x00, 0xB9, 0x00, 0xBA, 0x00, 0xBB, /* 0x78-0x7B */ + 0x00, 0xBC, 0x00, 0xBD, 0x00, 0xBE, 0x00, 0xBF, /* 0x7C-0x7F */ + + 0x00, 0xC0, 0x00, 0xC1, 0x00, 0xC2, 0x00, 0xC3, /* 0x80-0x83 */ + 0x00, 0xC4, 0x00, 0xC5, 0x00, 0xC6, 0x00, 0xC7, /* 0x84-0x87 */ + 0x00, 0xC8, 0x00, 0xC9, 0x00, 0xCA, 0x00, 0xCB, /* 0x88-0x8B */ + 0x00, 0xCC, 0x00, 0xCD, 0x00, 0xCE, 0x00, 0xCF, /* 0x8C-0x8F */ + 0x00, 0xD0, 0x00, 0xD1, 0x00, 0xD2, 0x00, 0xD3, /* 0x90-0x93 */ + 0x00, 0xD4, 0x00, 0xD5, 0x00, 0xD6, 0x00, 0xD7, /* 0x94-0x97 */ + 0x00, 0xD8, 0x00, 0xD9, 0x00, 0xDA, 0x00, 0xDB, /* 0x98-0x9B */ + 0x00, 0xDC, 0x00, 0xDD, 0x00, 0xDE, 0x00, 0xDF, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x81, 0x91, 0x81, 0x92, 0x81, 0xCA, 0x81, 0x50, /* 0xE0-0xE3 */ + 0xEE, 0xFA, 0x81, 0x8F, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + NULL, NULL, NULL, u2c_03, u2c_04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, NULL, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, NULL, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 
0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (ch == 0xFF && 0x61 <= cl && cl <= 0x9F) { + out[0] = cl + 0x40; + return 1; + } + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen < 2) + return -ENAMETOOLONG; + + out[0] = uni2charset[cl*2]; + out[1] 
= uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + return 2; + } else if (ch == 0) { + if (cl <= 0x7F) { + out[0] = cl; + return 1; + } else if (0xA0 <= cl) { + out[0] = u2c_00hi[cl - 0xA0][0]; + out[1] = u2c_00hi[cl - 0xA0][1]; + if (out[0] && out[1]) + return 2; + } + return -EINVAL; + } + else + return -EINVAL; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (rawstring[0] <= 0x7F) { + *uni = rawstring[0]; + return 1; + } + if (0xA1 <= rawstring[0] && rawstring[0] <= 0xDF) { + *uni = 0xFF00 | (rawstring[0] - 0x40); + return 1; + } + + if (boundlen < 2) + return -ENAMETOOLONG; + ch = rawstring[0]; + cl = rawstring[1]; + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + return 2; + } + else + return -EINVAL; +} + +static struct nls_table table = { + .charset = "cp932", + .alias = "sjis", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp932(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp932(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp932) +module_exit(exit_nls_cp932) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(sjis); diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c new file mode 100644 index 00000000..82770301 --- /dev/null +++ b/fs/nls/nls_cp936.c @@ -0,0 +1,11112 @@ +/* + * linux/fs/nls/nls_cp936.c + * + * Charset cp936 translation tables. + * This translation table was generated automatically, the + * original table can be download from the Microsoft website. 
+ * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t c2u_81[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E02,0x4E04,0x4E05,0x4E06,0x4E0F,0x4E12,0x4E17,0x4E1F,/* 0x40-0x47 */ + 0x4E20,0x4E21,0x4E23,0x4E26,0x4E29,0x4E2E,0x4E2F,0x4E31,/* 0x48-0x4F */ + 0x4E33,0x4E35,0x4E37,0x4E3C,0x4E40,0x4E41,0x4E42,0x4E44,/* 0x50-0x57 */ + 0x4E46,0x4E4A,0x4E51,0x4E55,0x4E57,0x4E5A,0x4E5B,0x4E62,/* 0x58-0x5F */ + 0x4E63,0x4E64,0x4E65,0x4E67,0x4E68,0x4E6A,0x4E6B,0x4E6C,/* 0x60-0x67 */ + 0x4E6D,0x4E6E,0x4E6F,0x4E72,0x4E74,0x4E75,0x4E76,0x4E77,/* 0x68-0x6F */ + 0x4E78,0x4E79,0x4E7A,0x4E7B,0x4E7C,0x4E7D,0x4E7F,0x4E80,/* 0x70-0x77 */ + 0x4E81,0x4E82,0x4E83,0x4E84,0x4E85,0x4E87,0x4E8A,0x0000,/* 0x78-0x7F */ + + 0x4E90,0x4E96,0x4E97,0x4E99,0x4E9C,0x4E9D,0x4E9E,0x4EA3,/* 0x80-0x87 */ + 0x4EAA,0x4EAF,0x4EB0,0x4EB1,0x4EB4,0x4EB6,0x4EB7,0x4EB8,/* 0x88-0x8F */ + 0x4EB9,0x4EBC,0x4EBD,0x4EBE,0x4EC8,0x4ECC,0x4ECF,0x4ED0,/* 0x90-0x97 */ + 0x4ED2,0x4EDA,0x4EDB,0x4EDC,0x4EE0,0x4EE2,0x4EE6,0x4EE7,/* 0x98-0x9F */ + 0x4EE9,0x4EED,0x4EEE,0x4EEF,0x4EF1,0x4EF4,0x4EF8,0x4EF9,/* 0xA0-0xA7 */ + 0x4EFA,0x4EFC,0x4EFE,0x4F00,0x4F02,0x4F03,0x4F04,0x4F05,/* 0xA8-0xAF */ + 0x4F06,0x4F07,0x4F08,0x4F0B,0x4F0C,0x4F12,0x4F13,0x4F14,/* 0xB0-0xB7 */ + 0x4F15,0x4F16,0x4F1C,0x4F1D,0x4F21,0x4F23,0x4F28,0x4F29,/* 0xB8-0xBF */ + 0x4F2C,0x4F2D,0x4F2E,0x4F31,0x4F33,0x4F35,0x4F37,0x4F39,/* 0xC0-0xC7 */ + 0x4F3B,0x4F3E,0x4F3F,0x4F40,0x4F41,0x4F42,0x4F44,0x4F45,/* 0xC8-0xCF */ + 0x4F47,0x4F48,0x4F49,0x4F4A,0x4F4B,0x4F4C,0x4F52,0x4F54,/* 0xD0-0xD7 */ + 0x4F56,0x4F61,0x4F62,0x4F66,0x4F68,0x4F6A,0x4F6B,0x4F6D,/* 0xD8-0xDF */ + 0x4F6E,0x4F71,0x4F72,0x4F75,0x4F77,0x4F78,0x4F79,0x4F7A,/* 0xE0-0xE7 */ + 0x4F7D,0x4F80,0x4F81,0x4F82,0x4F85,0x4F86,0x4F87,0x4F8A,/* 0xE8-0xEF */ + 0x4F8C,0x4F8E,0x4F90,0x4F92,0x4F93,0x4F95,0x4F96,0x4F98,/* 0xF0-0xF7 */ + 0x4F99,0x4F9A,0x4F9C,0x4F9E,0x4F9F,0x4FA1,0x4FA2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_82[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4FA4,0x4FAB,0x4FAD,0x4FB0,0x4FB1,0x4FB2,0x4FB3,0x4FB4,/* 0x40-0x47 */ + 0x4FB6,0x4FB7,0x4FB8,0x4FB9,0x4FBA,0x4FBB,0x4FBC,0x4FBD,/* 0x48-0x4F */ + 0x4FBE,0x4FC0,0x4FC1,0x4FC2,0x4FC6,0x4FC7,0x4FC8,0x4FC9,/* 0x50-0x57 */ + 0x4FCB,0x4FCC,0x4FCD,0x4FD2,0x4FD3,0x4FD4,0x4FD5,0x4FD6,/* 0x58-0x5F */ + 0x4FD9,0x4FDB,0x4FE0,0x4FE2,0x4FE4,0x4FE5,0x4FE7,0x4FEB,/* 0x60-0x67 */ + 
0x4FEC,0x4FF0,0x4FF2,0x4FF4,0x4FF5,0x4FF6,0x4FF7,0x4FF9,/* 0x68-0x6F */ + 0x4FFB,0x4FFC,0x4FFD,0x4FFF,0x5000,0x5001,0x5002,0x5003,/* 0x70-0x77 */ + 0x5004,0x5005,0x5006,0x5007,0x5008,0x5009,0x500A,0x0000,/* 0x78-0x7F */ + + 0x500B,0x500E,0x5010,0x5011,0x5013,0x5015,0x5016,0x5017,/* 0x80-0x87 */ + 0x501B,0x501D,0x501E,0x5020,0x5022,0x5023,0x5024,0x5027,/* 0x88-0x8F */ + 0x502B,0x502F,0x5030,0x5031,0x5032,0x5033,0x5034,0x5035,/* 0x90-0x97 */ + 0x5036,0x5037,0x5038,0x5039,0x503B,0x503D,0x503F,0x5040,/* 0x98-0x9F */ + 0x5041,0x5042,0x5044,0x5045,0x5046,0x5049,0x504A,0x504B,/* 0xA0-0xA7 */ + 0x504D,0x5050,0x5051,0x5052,0x5053,0x5054,0x5056,0x5057,/* 0xA8-0xAF */ + 0x5058,0x5059,0x505B,0x505D,0x505E,0x505F,0x5060,0x5061,/* 0xB0-0xB7 */ + 0x5062,0x5063,0x5064,0x5066,0x5067,0x5068,0x5069,0x506A,/* 0xB8-0xBF */ + 0x506B,0x506D,0x506E,0x506F,0x5070,0x5071,0x5072,0x5073,/* 0xC0-0xC7 */ + 0x5074,0x5075,0x5078,0x5079,0x507A,0x507C,0x507D,0x5081,/* 0xC8-0xCF */ + 0x5082,0x5083,0x5084,0x5086,0x5087,0x5089,0x508A,0x508B,/* 0xD0-0xD7 */ + 0x508C,0x508E,0x508F,0x5090,0x5091,0x5092,0x5093,0x5094,/* 0xD8-0xDF */ + 0x5095,0x5096,0x5097,0x5098,0x5099,0x509A,0x509B,0x509C,/* 0xE0-0xE7 */ + 0x509D,0x509E,0x509F,0x50A0,0x50A1,0x50A2,0x50A4,0x50A6,/* 0xE8-0xEF */ + 0x50AA,0x50AB,0x50AD,0x50AE,0x50AF,0x50B0,0x50B1,0x50B3,/* 0xF0-0xF7 */ + 0x50B4,0x50B5,0x50B6,0x50B7,0x50B8,0x50B9,0x50BC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_83[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x50BD,0x50BE,0x50BF,0x50C0,0x50C1,0x50C2,0x50C3,0x50C4,/* 0x40-0x47 */ + 0x50C5,0x50C6,0x50C7,0x50C8,0x50C9,0x50CA,0x50CB,0x50CC,/* 0x48-0x4F */ + 0x50CD,0x50CE,0x50D0,0x50D1,0x50D2,0x50D3,0x50D4,0x50D5,/* 0x50-0x57 */ + 0x50D7,0x50D8,0x50D9,0x50DB,0x50DC,0x50DD,0x50DE,0x50DF,/* 0x58-0x5F */ + 0x50E0,0x50E1,0x50E2,0x50E3,0x50E4,0x50E5,0x50E8,0x50E9,/* 0x60-0x67 */ + 0x50EA,0x50EB,0x50EF,0x50F0,0x50F1,0x50F2,0x50F4,0x50F6,/* 0x68-0x6F */ + 0x50F7,0x50F8,0x50F9,0x50FA,0x50FC,0x50FD,0x50FE,0x50FF,/* 0x70-0x77 */ + 0x5100,0x5101,0x5102,0x5103,0x5104,0x5105,0x5108,0x0000,/* 0x78-0x7F */ + + 0x5109,0x510A,0x510C,0x510D,0x510E,0x510F,0x5110,0x5111,/* 0x80-0x87 */ + 0x5113,0x5114,0x5115,0x5116,0x5117,0x5118,0x5119,0x511A,/* 0x88-0x8F */ + 0x511B,0x511C,0x511D,0x511E,0x511F,0x5120,0x5122,0x5123,/* 0x90-0x97 */ + 0x5124,0x5125,0x5126,0x5127,0x5128,0x5129,0x512A,0x512B,/* 0x98-0x9F */ + 0x512C,0x512D,0x512E,0x512F,0x5130,0x5131,0x5132,0x5133,/* 0xA0-0xA7 */ + 0x5134,0x5135,0x5136,0x5137,0x5138,0x5139,0x513A,0x513B,/* 0xA8-0xAF */ + 0x513C,0x513D,0x513E,0x5142,0x5147,0x514A,0x514C,0x514E,/* 0xB0-0xB7 */ + 0x514F,0x5150,0x5152,0x5153,0x5157,0x5158,0x5159,0x515B,/* 0xB8-0xBF */ + 0x515D,0x515E,0x515F,0x5160,0x5161,0x5163,0x5164,0x5166,/* 0xC0-0xC7 */ + 0x5167,0x5169,0x516A,0x516F,0x5172,0x517A,0x517E,0x517F,/* 0xC8-0xCF */ + 0x5183,0x5184,0x5186,0x5187,0x518A,0x518B,0x518E,0x518F,/* 0xD0-0xD7 */ + 0x5190,0x5191,0x5193,0x5194,0x5198,0x519A,0x519D,0x519E,/* 0xD8-0xDF */ + 
0x519F,0x51A1,0x51A3,0x51A6,0x51A7,0x51A8,0x51A9,0x51AA,/* 0xE0-0xE7 */ + 0x51AD,0x51AE,0x51B4,0x51B8,0x51B9,0x51BA,0x51BE,0x51BF,/* 0xE8-0xEF */ + 0x51C1,0x51C2,0x51C3,0x51C5,0x51C8,0x51CA,0x51CD,0x51CE,/* 0xF0-0xF7 */ + 0x51D0,0x51D2,0x51D3,0x51D4,0x51D5,0x51D6,0x51D7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_84[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x51D8,0x51D9,0x51DA,0x51DC,0x51DE,0x51DF,0x51E2,0x51E3,/* 0x40-0x47 */ + 0x51E5,0x51E6,0x51E7,0x51E8,0x51E9,0x51EA,0x51EC,0x51EE,/* 0x48-0x4F */ + 0x51F1,0x51F2,0x51F4,0x51F7,0x51FE,0x5204,0x5205,0x5209,/* 0x50-0x57 */ + 0x520B,0x520C,0x520F,0x5210,0x5213,0x5214,0x5215,0x521C,/* 0x58-0x5F */ + 0x521E,0x521F,0x5221,0x5222,0x5223,0x5225,0x5226,0x5227,/* 0x60-0x67 */ + 0x522A,0x522C,0x522F,0x5231,0x5232,0x5234,0x5235,0x523C,/* 0x68-0x6F */ + 0x523E,0x5244,0x5245,0x5246,0x5247,0x5248,0x5249,0x524B,/* 0x70-0x77 */ + 0x524E,0x524F,0x5252,0x5253,0x5255,0x5257,0x5258,0x0000,/* 0x78-0x7F */ + + 0x5259,0x525A,0x525B,0x525D,0x525F,0x5260,0x5262,0x5263,/* 0x80-0x87 */ + 0x5264,0x5266,0x5268,0x526B,0x526C,0x526D,0x526E,0x5270,/* 0x88-0x8F */ + 0x5271,0x5273,0x5274,0x5275,0x5276,0x5277,0x5278,0x5279,/* 0x90-0x97 */ + 0x527A,0x527B,0x527C,0x527E,0x5280,0x5283,0x5284,0x5285,/* 0x98-0x9F */ + 0x5286,0x5287,0x5289,0x528A,0x528B,0x528C,0x528D,0x528E,/* 0xA0-0xA7 */ + 0x528F,0x5291,0x5292,0x5294,0x5295,0x5296,0x5297,0x5298,/* 0xA8-0xAF */ + 0x5299,0x529A,0x529C,0x52A4,0x52A5,0x52A6,0x52A7,0x52AE,/* 0xB0-0xB7 */ + 0x52AF,0x52B0,0x52B4,0x52B5,0x52B6,0x52B7,0x52B8,0x52B9,/* 0xB8-0xBF */ + 0x52BA,0x52BB,0x52BC,0x52BD,0x52C0,0x52C1,0x52C2,0x52C4,/* 0xC0-0xC7 */ + 0x52C5,0x52C6,0x52C8,0x52CA,0x52CC,0x52CD,0x52CE,0x52CF,/* 0xC8-0xCF */ + 0x52D1,0x52D3,0x52D4,0x52D5,0x52D7,0x52D9,0x52DA,0x52DB,/* 0xD0-0xD7 */ + 0x52DC,0x52DD,0x52DE,0x52E0,0x52E1,0x52E2,0x52E3,0x52E5,/* 0xD8-0xDF */ + 0x52E6,0x52E7,0x52E8,0x52E9,0x52EA,0x52EB,0x52EC,0x52ED,/* 0xE0-0xE7 */ + 0x52EE,0x52EF,0x52F1,0x52F2,0x52F3,0x52F4,0x52F5,0x52F6,/* 0xE8-0xEF */ + 0x52F7,0x52F8,0x52FB,0x52FC,0x52FD,0x5301,0x5302,0x5303,/* 0xF0-0xF7 */ + 0x5304,0x5307,0x5309,0x530A,0x530B,0x530C,0x530E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_85[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5311,0x5312,0x5313,0x5314,0x5318,0x531B,0x531C,0x531E,/* 0x40-0x47 */ + 0x531F,0x5322,0x5324,0x5325,0x5327,0x5328,0x5329,0x532B,/* 0x48-0x4F */ + 0x532C,0x532D,0x532F,0x5330,0x5331,0x5332,0x5333,0x5334,/* 
0x50-0x57 */ + 0x5335,0x5336,0x5337,0x5338,0x533C,0x533D,0x5340,0x5342,/* 0x58-0x5F */ + 0x5344,0x5346,0x534B,0x534C,0x534D,0x5350,0x5354,0x5358,/* 0x60-0x67 */ + 0x5359,0x535B,0x535D,0x5365,0x5368,0x536A,0x536C,0x536D,/* 0x68-0x6F */ + 0x5372,0x5376,0x5379,0x537B,0x537C,0x537D,0x537E,0x5380,/* 0x70-0x77 */ + 0x5381,0x5383,0x5387,0x5388,0x538A,0x538E,0x538F,0x0000,/* 0x78-0x7F */ + + 0x5390,0x5391,0x5392,0x5393,0x5394,0x5396,0x5397,0x5399,/* 0x80-0x87 */ + 0x539B,0x539C,0x539E,0x53A0,0x53A1,0x53A4,0x53A7,0x53AA,/* 0x88-0x8F */ + 0x53AB,0x53AC,0x53AD,0x53AF,0x53B0,0x53B1,0x53B2,0x53B3,/* 0x90-0x97 */ + 0x53B4,0x53B5,0x53B7,0x53B8,0x53B9,0x53BA,0x53BC,0x53BD,/* 0x98-0x9F */ + 0x53BE,0x53C0,0x53C3,0x53C4,0x53C5,0x53C6,0x53C7,0x53CE,/* 0xA0-0xA7 */ + 0x53CF,0x53D0,0x53D2,0x53D3,0x53D5,0x53DA,0x53DC,0x53DD,/* 0xA8-0xAF */ + 0x53DE,0x53E1,0x53E2,0x53E7,0x53F4,0x53FA,0x53FE,0x53FF,/* 0xB0-0xB7 */ + 0x5400,0x5402,0x5405,0x5407,0x540B,0x5414,0x5418,0x5419,/* 0xB8-0xBF */ + 0x541A,0x541C,0x5422,0x5424,0x5425,0x542A,0x5430,0x5433,/* 0xC0-0xC7 */ + 0x5436,0x5437,0x543A,0x543D,0x543F,0x5441,0x5442,0x5444,/* 0xC8-0xCF */ + 0x5445,0x5447,0x5449,0x544C,0x544D,0x544E,0x544F,0x5451,/* 0xD0-0xD7 */ + 0x545A,0x545D,0x545E,0x545F,0x5460,0x5461,0x5463,0x5465,/* 0xD8-0xDF */ + 0x5467,0x5469,0x546A,0x546B,0x546C,0x546D,0x546E,0x546F,/* 0xE0-0xE7 */ + 0x5470,0x5474,0x5479,0x547A,0x547E,0x547F,0x5481,0x5483,/* 0xE8-0xEF */ + 0x5485,0x5487,0x5488,0x5489,0x548A,0x548D,0x5491,0x5493,/* 0xF0-0xF7 */ + 0x5497,0x5498,0x549C,0x549E,0x549F,0x54A0,0x54A1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_86[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54A2,0x54A5,0x54AE,0x54B0,0x54B2,0x54B5,0x54B6,0x54B7,/* 0x40-0x47 */ + 0x54B9,0x54BA,0x54BC,0x54BE,0x54C3,0x54C5,0x54CA,0x54CB,/* 0x48-0x4F */ + 0x54D6,0x54D8,0x54DB,0x54E0,0x54E1,0x54E2,0x54E3,0x54E4,/* 0x50-0x57 */ + 0x54EB,0x54EC,0x54EF,0x54F0,0x54F1,0x54F4,0x54F5,0x54F6,/* 0x58-0x5F */ + 0x54F7,0x54F8,0x54F9,0x54FB,0x54FE,0x5500,0x5502,0x5503,/* 0x60-0x67 */ + 0x5504,0x5505,0x5508,0x550A,0x550B,0x550C,0x550D,0x550E,/* 0x68-0x6F */ + 0x5512,0x5513,0x5515,0x5516,0x5517,0x5518,0x5519,0x551A,/* 0x70-0x77 */ + 0x551C,0x551D,0x551E,0x551F,0x5521,0x5525,0x5526,0x0000,/* 0x78-0x7F */ + + 0x5528,0x5529,0x552B,0x552D,0x5532,0x5534,0x5535,0x5536,/* 0x80-0x87 */ + 0x5538,0x5539,0x553A,0x553B,0x553D,0x5540,0x5542,0x5545,/* 0x88-0x8F */ + 0x5547,0x5548,0x554B,0x554C,0x554D,0x554E,0x554F,0x5551,/* 0x90-0x97 */ + 0x5552,0x5553,0x5554,0x5557,0x5558,0x5559,0x555A,0x555B,/* 0x98-0x9F */ + 0x555D,0x555E,0x555F,0x5560,0x5562,0x5563,0x5568,0x5569,/* 0xA0-0xA7 */ + 0x556B,0x556F,0x5570,0x5571,0x5572,0x5573,0x5574,0x5579,/* 0xA8-0xAF */ + 0x557A,0x557D,0x557F,0x5585,0x5586,0x558C,0x558D,0x558E,/* 0xB0-0xB7 */ + 0x5590,0x5592,0x5593,0x5595,0x5596,0x5597,0x559A,0x559B,/* 0xB8-0xBF */ + 0x559E,0x55A0,0x55A1,0x55A2,0x55A3,0x55A4,0x55A5,0x55A6,/* 0xC0-0xC7 */ + 0x55A8,0x55A9,0x55AA,0x55AB,0x55AC,0x55AD,0x55AE,0x55AF,/* 0xC8-0xCF */ + 
0x55B0,0x55B2,0x55B4,0x55B6,0x55B8,0x55BA,0x55BC,0x55BF,/* 0xD0-0xD7 */ + 0x55C0,0x55C1,0x55C2,0x55C3,0x55C6,0x55C7,0x55C8,0x55CA,/* 0xD8-0xDF */ + 0x55CB,0x55CE,0x55CF,0x55D0,0x55D5,0x55D7,0x55D8,0x55D9,/* 0xE0-0xE7 */ + 0x55DA,0x55DB,0x55DE,0x55E0,0x55E2,0x55E7,0x55E9,0x55ED,/* 0xE8-0xEF */ + 0x55EE,0x55F0,0x55F1,0x55F4,0x55F6,0x55F8,0x55F9,0x55FA,/* 0xF0-0xF7 */ + 0x55FB,0x55FC,0x55FF,0x5602,0x5603,0x5604,0x5605,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_87[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5606,0x5607,0x560A,0x560B,0x560D,0x5610,0x5611,0x5612,/* 0x40-0x47 */ + 0x5613,0x5614,0x5615,0x5616,0x5617,0x5619,0x561A,0x561C,/* 0x48-0x4F */ + 0x561D,0x5620,0x5621,0x5622,0x5625,0x5626,0x5628,0x5629,/* 0x50-0x57 */ + 0x562A,0x562B,0x562E,0x562F,0x5630,0x5633,0x5635,0x5637,/* 0x58-0x5F */ + 0x5638,0x563A,0x563C,0x563D,0x563E,0x5640,0x5641,0x5642,/* 0x60-0x67 */ + 0x5643,0x5644,0x5645,0x5646,0x5647,0x5648,0x5649,0x564A,/* 0x68-0x6F */ + 0x564B,0x564F,0x5650,0x5651,0x5652,0x5653,0x5655,0x5656,/* 0x70-0x77 */ + 0x565A,0x565B,0x565D,0x565E,0x565F,0x5660,0x5661,0x0000,/* 0x78-0x7F */ + + 0x5663,0x5665,0x5666,0x5667,0x566D,0x566E,0x566F,0x5670,/* 0x80-0x87 */ + 0x5672,0x5673,0x5674,0x5675,0x5677,0x5678,0x5679,0x567A,/* 0x88-0x8F */ + 0x567D,0x567E,0x567F,0x5680,0x5681,0x5682,0x5683,0x5684,/* 0x90-0x97 */ + 0x5687,0x5688,0x5689,0x568A,0x568B,0x568C,0x568D,0x5690,/* 0x98-0x9F */ + 0x5691,0x5692,0x5694,0x5695,0x5696,0x5697,0x5698,0x5699,/* 0xA0-0xA7 */ + 0x569A,0x569B,0x569C,0x569D,0x569E,0x569F,0x56A0,0x56A1,/* 0xA8-0xAF */ + 0x56A2,0x56A4,0x56A5,0x56A6,0x56A7,0x56A8,0x56A9,0x56AA,/* 0xB0-0xB7 */ + 0x56AB,0x56AC,0x56AD,0x56AE,0x56B0,0x56B1,0x56B2,0x56B3,/* 0xB8-0xBF */ + 0x56B4,0x56B5,0x56B6,0x56B8,0x56B9,0x56BA,0x56BB,0x56BD,/* 0xC0-0xC7 */ + 0x56BE,0x56BF,0x56C0,0x56C1,0x56C2,0x56C3,0x56C4,0x56C5,/* 0xC8-0xCF */ + 0x56C6,0x56C7,0x56C8,0x56C9,0x56CB,0x56CC,0x56CD,0x56CE,/* 0xD0-0xD7 */ + 0x56CF,0x56D0,0x56D1,0x56D2,0x56D3,0x56D5,0x56D6,0x56D8,/* 0xD8-0xDF */ + 0x56D9,0x56DC,0x56E3,0x56E5,0x56E6,0x56E7,0x56E8,0x56E9,/* 0xE0-0xE7 */ + 0x56EA,0x56EC,0x56EE,0x56EF,0x56F2,0x56F3,0x56F6,0x56F7,/* 0xE8-0xEF */ + 0x56F8,0x56FB,0x56FC,0x5700,0x5701,0x5702,0x5705,0x5707,/* 0xF0-0xF7 */ + 0x570B,0x570C,0x570D,0x570E,0x570F,0x5710,0x5711,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_88[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5712,0x5713,0x5714,0x5715,0x5716,0x5717,0x5718,0x5719,/* 
0x40-0x47 */ + 0x571A,0x571B,0x571D,0x571E,0x5720,0x5721,0x5722,0x5724,/* 0x48-0x4F */ + 0x5725,0x5726,0x5727,0x572B,0x5731,0x5732,0x5734,0x5735,/* 0x50-0x57 */ + 0x5736,0x5737,0x5738,0x573C,0x573D,0x573F,0x5741,0x5743,/* 0x58-0x5F */ + 0x5744,0x5745,0x5746,0x5748,0x5749,0x574B,0x5752,0x5753,/* 0x60-0x67 */ + 0x5754,0x5755,0x5756,0x5758,0x5759,0x5762,0x5763,0x5765,/* 0x68-0x6F */ + 0x5767,0x576C,0x576E,0x5770,0x5771,0x5772,0x5774,0x5775,/* 0x70-0x77 */ + 0x5778,0x5779,0x577A,0x577D,0x577E,0x577F,0x5780,0x0000,/* 0x78-0x7F */ + + 0x5781,0x5787,0x5788,0x5789,0x578A,0x578D,0x578E,0x578F,/* 0x80-0x87 */ + 0x5790,0x5791,0x5794,0x5795,0x5796,0x5797,0x5798,0x5799,/* 0x88-0x8F */ + 0x579A,0x579C,0x579D,0x579E,0x579F,0x57A5,0x57A8,0x57AA,/* 0x90-0x97 */ + 0x57AC,0x57AF,0x57B0,0x57B1,0x57B3,0x57B5,0x57B6,0x57B7,/* 0x98-0x9F */ + 0x57B9,0x57BA,0x57BB,0x57BC,0x57BD,0x57BE,0x57BF,0x57C0,/* 0xA0-0xA7 */ + 0x57C1,0x57C4,0x57C5,0x57C6,0x57C7,0x57C8,0x57C9,0x57CA,/* 0xA8-0xAF */ + 0x57CC,0x57CD,0x57D0,0x57D1,0x57D3,0x57D6,0x57D7,0x57DB,/* 0xB0-0xB7 */ + 0x57DC,0x57DE,0x57E1,0x57E2,0x57E3,0x57E5,0x57E6,0x57E7,/* 0xB8-0xBF */ + 0x57E8,0x57E9,0x57EA,0x57EB,0x57EC,0x57EE,0x57F0,0x57F1,/* 0xC0-0xC7 */ + 0x57F2,0x57F3,0x57F5,0x57F6,0x57F7,0x57FB,0x57FC,0x57FE,/* 0xC8-0xCF */ + 0x57FF,0x5801,0x5803,0x5804,0x5805,0x5808,0x5809,0x580A,/* 0xD0-0xD7 */ + 0x580C,0x580E,0x580F,0x5810,0x5812,0x5813,0x5814,0x5816,/* 0xD8-0xDF */ + 0x5817,0x5818,0x581A,0x581B,0x581C,0x581D,0x581F,0x5822,/* 0xE0-0xE7 */ + 0x5823,0x5825,0x5826,0x5827,0x5828,0x5829,0x582B,0x582C,/* 0xE8-0xEF */ + 0x582D,0x582E,0x582F,0x5831,0x5832,0x5833,0x5834,0x5836,/* 0xF0-0xF7 */ + 0x5837,0x5838,0x5839,0x583A,0x583B,0x583C,0x583D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_89[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x583E,0x583F,0x5840,0x5841,0x5842,0x5843,0x5845,0x5846,/* 0x40-0x47 */ + 0x5847,0x5848,0x5849,0x584A,0x584B,0x584E,0x584F,0x5850,/* 0x48-0x4F */ + 0x5852,0x5853,0x5855,0x5856,0x5857,0x5859,0x585A,0x585B,/* 0x50-0x57 */ + 0x585C,0x585D,0x585F,0x5860,0x5861,0x5862,0x5863,0x5864,/* 0x58-0x5F */ + 0x5866,0x5867,0x5868,0x5869,0x586A,0x586D,0x586E,0x586F,/* 0x60-0x67 */ + 0x5870,0x5871,0x5872,0x5873,0x5874,0x5875,0x5876,0x5877,/* 0x68-0x6F */ + 0x5878,0x5879,0x587A,0x587B,0x587C,0x587D,0x587F,0x5882,/* 0x70-0x77 */ + 0x5884,0x5886,0x5887,0x5888,0x588A,0x588B,0x588C,0x0000,/* 0x78-0x7F */ + + 0x588D,0x588E,0x588F,0x5890,0x5891,0x5894,0x5895,0x5896,/* 0x80-0x87 */ + 0x5897,0x5898,0x589B,0x589C,0x589D,0x58A0,0x58A1,0x58A2,/* 0x88-0x8F */ + 0x58A3,0x58A4,0x58A5,0x58A6,0x58A7,0x58AA,0x58AB,0x58AC,/* 0x90-0x97 */ + 0x58AD,0x58AE,0x58AF,0x58B0,0x58B1,0x58B2,0x58B3,0x58B4,/* 0x98-0x9F */ + 0x58B5,0x58B6,0x58B7,0x58B8,0x58B9,0x58BA,0x58BB,0x58BD,/* 0xA0-0xA7 */ + 0x58BE,0x58BF,0x58C0,0x58C2,0x58C3,0x58C4,0x58C6,0x58C7,/* 0xA8-0xAF */ + 0x58C8,0x58C9,0x58CA,0x58CB,0x58CC,0x58CD,0x58CE,0x58CF,/* 0xB0-0xB7 */ + 0x58D0,0x58D2,0x58D3,0x58D4,0x58D6,0x58D7,0x58D8,0x58D9,/* 0xB8-0xBF */ + 
0x58DA,0x58DB,0x58DC,0x58DD,0x58DE,0x58DF,0x58E0,0x58E1,/* 0xC0-0xC7 */ + 0x58E2,0x58E3,0x58E5,0x58E6,0x58E7,0x58E8,0x58E9,0x58EA,/* 0xC8-0xCF */ + 0x58ED,0x58EF,0x58F1,0x58F2,0x58F4,0x58F5,0x58F7,0x58F8,/* 0xD0-0xD7 */ + 0x58FA,0x58FB,0x58FC,0x58FD,0x58FE,0x58FF,0x5900,0x5901,/* 0xD8-0xDF */ + 0x5903,0x5905,0x5906,0x5908,0x5909,0x590A,0x590B,0x590C,/* 0xE0-0xE7 */ + 0x590E,0x5910,0x5911,0x5912,0x5913,0x5917,0x5918,0x591B,/* 0xE8-0xEF */ + 0x591D,0x591E,0x5920,0x5921,0x5922,0x5923,0x5926,0x5928,/* 0xF0-0xF7 */ + 0x592C,0x5930,0x5932,0x5933,0x5935,0x5936,0x593B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x593D,0x593E,0x593F,0x5940,0x5943,0x5945,0x5946,0x594A,/* 0x40-0x47 */ + 0x594C,0x594D,0x5950,0x5952,0x5953,0x5959,0x595B,0x595C,/* 0x48-0x4F */ + 0x595D,0x595E,0x595F,0x5961,0x5963,0x5964,0x5966,0x5967,/* 0x50-0x57 */ + 0x5968,0x5969,0x596A,0x596B,0x596C,0x596D,0x596E,0x596F,/* 0x58-0x5F */ + 0x5970,0x5971,0x5972,0x5975,0x5977,0x597A,0x597B,0x597C,/* 0x60-0x67 */ + 0x597E,0x597F,0x5980,0x5985,0x5989,0x598B,0x598C,0x598E,/* 0x68-0x6F */ + 0x598F,0x5990,0x5991,0x5994,0x5995,0x5998,0x599A,0x599B,/* 0x70-0x77 */ + 0x599C,0x599D,0x599F,0x59A0,0x59A1,0x59A2,0x59A6,0x0000,/* 0x78-0x7F */ + + 0x59A7,0x59AC,0x59AD,0x59B0,0x59B1,0x59B3,0x59B4,0x59B5,/* 0x80-0x87 */ + 0x59B6,0x59B7,0x59B8,0x59BA,0x59BC,0x59BD,0x59BF,0x59C0,/* 0x88-0x8F */ + 0x59C1,0x59C2,0x59C3,0x59C4,0x59C5,0x59C7,0x59C8,0x59C9,/* 0x90-0x97 */ + 0x59CC,0x59CD,0x59CE,0x59CF,0x59D5,0x59D6,0x59D9,0x59DB,/* 0x98-0x9F */ + 0x59DE,0x59DF,0x59E0,0x59E1,0x59E2,0x59E4,0x59E6,0x59E7,/* 0xA0-0xA7 */ + 0x59E9,0x59EA,0x59EB,0x59ED,0x59EE,0x59EF,0x59F0,0x59F1,/* 0xA8-0xAF */ + 0x59F2,0x59F3,0x59F4,0x59F5,0x59F6,0x59F7,0x59F8,0x59FA,/* 0xB0-0xB7 */ + 0x59FC,0x59FD,0x59FE,0x5A00,0x5A02,0x5A0A,0x5A0B,0x5A0D,/* 0xB8-0xBF */ + 0x5A0E,0x5A0F,0x5A10,0x5A12,0x5A14,0x5A15,0x5A16,0x5A17,/* 0xC0-0xC7 */ + 0x5A19,0x5A1A,0x5A1B,0x5A1D,0x5A1E,0x5A21,0x5A22,0x5A24,/* 0xC8-0xCF */ + 0x5A26,0x5A27,0x5A28,0x5A2A,0x5A2B,0x5A2C,0x5A2D,0x5A2E,/* 0xD0-0xD7 */ + 0x5A2F,0x5A30,0x5A33,0x5A35,0x5A37,0x5A38,0x5A39,0x5A3A,/* 0xD8-0xDF */ + 0x5A3B,0x5A3D,0x5A3E,0x5A3F,0x5A41,0x5A42,0x5A43,0x5A44,/* 0xE0-0xE7 */ + 0x5A45,0x5A47,0x5A48,0x5A4B,0x5A4C,0x5A4D,0x5A4E,0x5A4F,/* 0xE8-0xEF */ + 0x5A50,0x5A51,0x5A52,0x5A53,0x5A54,0x5A56,0x5A57,0x5A58,/* 0xF0-0xF7 */ + 0x5A59,0x5A5B,0x5A5C,0x5A5D,0x5A5E,0x5A5F,0x5A60,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5A61,0x5A63,0x5A64,0x5A65,0x5A66,0x5A68,0x5A69,0x5A6B,/* 0x40-0x47 */ + 0x5A6C,0x5A6D,0x5A6E,0x5A6F,0x5A70,0x5A71,0x5A72,0x5A73,/* 0x48-0x4F */ + 0x5A78,0x5A79,0x5A7B,0x5A7C,0x5A7D,0x5A7E,0x5A80,0x5A81,/* 0x50-0x57 */ + 0x5A82,0x5A83,0x5A84,0x5A85,0x5A86,0x5A87,0x5A88,0x5A89,/* 0x58-0x5F */ + 0x5A8A,0x5A8B,0x5A8C,0x5A8D,0x5A8E,0x5A8F,0x5A90,0x5A91,/* 0x60-0x67 */ + 0x5A93,0x5A94,0x5A95,0x5A96,0x5A97,0x5A98,0x5A99,0x5A9C,/* 0x68-0x6F */ + 0x5A9D,0x5A9E,0x5A9F,0x5AA0,0x5AA1,0x5AA2,0x5AA3,0x5AA4,/* 0x70-0x77 */ + 0x5AA5,0x5AA6,0x5AA7,0x5AA8,0x5AA9,0x5AAB,0x5AAC,0x0000,/* 0x78-0x7F */ + + 0x5AAD,0x5AAE,0x5AAF,0x5AB0,0x5AB1,0x5AB4,0x5AB6,0x5AB7,/* 0x80-0x87 */ + 0x5AB9,0x5ABA,0x5ABB,0x5ABC,0x5ABD,0x5ABF,0x5AC0,0x5AC3,/* 0x88-0x8F */ + 0x5AC4,0x5AC5,0x5AC6,0x5AC7,0x5AC8,0x5ACA,0x5ACB,0x5ACD,/* 0x90-0x97 */ + 0x5ACE,0x5ACF,0x5AD0,0x5AD1,0x5AD3,0x5AD5,0x5AD7,0x5AD9,/* 0x98-0x9F */ + 0x5ADA,0x5ADB,0x5ADD,0x5ADE,0x5ADF,0x5AE2,0x5AE4,0x5AE5,/* 0xA0-0xA7 */ + 0x5AE7,0x5AE8,0x5AEA,0x5AEC,0x5AED,0x5AEE,0x5AEF,0x5AF0,/* 0xA8-0xAF */ + 0x5AF2,0x5AF3,0x5AF4,0x5AF5,0x5AF6,0x5AF7,0x5AF8,0x5AF9,/* 0xB0-0xB7 */ + 0x5AFA,0x5AFB,0x5AFC,0x5AFD,0x5AFE,0x5AFF,0x5B00,0x5B01,/* 0xB8-0xBF */ + 0x5B02,0x5B03,0x5B04,0x5B05,0x5B06,0x5B07,0x5B08,0x5B0A,/* 0xC0-0xC7 */ + 0x5B0B,0x5B0C,0x5B0D,0x5B0E,0x5B0F,0x5B10,0x5B11,0x5B12,/* 0xC8-0xCF */ + 0x5B13,0x5B14,0x5B15,0x5B18,0x5B19,0x5B1A,0x5B1B,0x5B1C,/* 0xD0-0xD7 */ + 0x5B1D,0x5B1E,0x5B1F,0x5B20,0x5B21,0x5B22,0x5B23,0x5B24,/* 0xD8-0xDF */ + 0x5B25,0x5B26,0x5B27,0x5B28,0x5B29,0x5B2A,0x5B2B,0x5B2C,/* 0xE0-0xE7 */ + 0x5B2D,0x5B2E,0x5B2F,0x5B30,0x5B31,0x5B33,0x5B35,0x5B36,/* 0xE8-0xEF */ + 0x5B38,0x5B39,0x5B3A,0x5B3B,0x5B3C,0x5B3D,0x5B3E,0x5B3F,/* 0xF0-0xF7 */ + 0x5B41,0x5B42,0x5B43,0x5B44,0x5B45,0x5B46,0x5B47,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5B48,0x5B49,0x5B4A,0x5B4B,0x5B4C,0x5B4D,0x5B4E,0x5B4F,/* 0x40-0x47 */ + 0x5B52,0x5B56,0x5B5E,0x5B60,0x5B61,0x5B67,0x5B68,0x5B6B,/* 0x48-0x4F */ + 0x5B6D,0x5B6E,0x5B6F,0x5B72,0x5B74,0x5B76,0x5B77,0x5B78,/* 0x50-0x57 */ + 0x5B79,0x5B7B,0x5B7C,0x5B7E,0x5B7F,0x5B82,0x5B86,0x5B8A,/* 0x58-0x5F */ + 0x5B8D,0x5B8E,0x5B90,0x5B91,0x5B92,0x5B94,0x5B96,0x5B9F,/* 0x60-0x67 */ + 0x5BA7,0x5BA8,0x5BA9,0x5BAC,0x5BAD,0x5BAE,0x5BAF,0x5BB1,/* 0x68-0x6F */ + 0x5BB2,0x5BB7,0x5BBA,0x5BBB,0x5BBC,0x5BC0,0x5BC1,0x5BC3,/* 0x70-0x77 */ + 0x5BC8,0x5BC9,0x5BCA,0x5BCB,0x5BCD,0x5BCE,0x5BCF,0x0000,/* 0x78-0x7F */ + + 0x5BD1,0x5BD4,0x5BD5,0x5BD6,0x5BD7,0x5BD8,0x5BD9,0x5BDA,/* 0x80-0x87 */ + 0x5BDB,0x5BDC,0x5BE0,0x5BE2,0x5BE3,0x5BE6,0x5BE7,0x5BE9,/* 0x88-0x8F */ + 0x5BEA,0x5BEB,0x5BEC,0x5BED,0x5BEF,0x5BF1,0x5BF2,0x5BF3,/* 0x90-0x97 */ + 0x5BF4,0x5BF5,0x5BF6,0x5BF7,0x5BFD,0x5BFE,0x5C00,0x5C02,/* 0x98-0x9F */ + 0x5C03,0x5C05,0x5C07,0x5C08,0x5C0B,0x5C0C,0x5C0D,0x5C0E,/* 0xA0-0xA7 */ + 0x5C10,0x5C12,0x5C13,0x5C17,0x5C19,0x5C1B,0x5C1E,0x5C1F,/* 0xA8-0xAF */ + 
0x5C20,0x5C21,0x5C23,0x5C26,0x5C28,0x5C29,0x5C2A,0x5C2B,/* 0xB0-0xB7 */ + 0x5C2D,0x5C2E,0x5C2F,0x5C30,0x5C32,0x5C33,0x5C35,0x5C36,/* 0xB8-0xBF */ + 0x5C37,0x5C43,0x5C44,0x5C46,0x5C47,0x5C4C,0x5C4D,0x5C52,/* 0xC0-0xC7 */ + 0x5C53,0x5C54,0x5C56,0x5C57,0x5C58,0x5C5A,0x5C5B,0x5C5C,/* 0xC8-0xCF */ + 0x5C5D,0x5C5F,0x5C62,0x5C64,0x5C67,0x5C68,0x5C69,0x5C6A,/* 0xD0-0xD7 */ + 0x5C6B,0x5C6C,0x5C6D,0x5C70,0x5C72,0x5C73,0x5C74,0x5C75,/* 0xD8-0xDF */ + 0x5C76,0x5C77,0x5C78,0x5C7B,0x5C7C,0x5C7D,0x5C7E,0x5C80,/* 0xE0-0xE7 */ + 0x5C83,0x5C84,0x5C85,0x5C86,0x5C87,0x5C89,0x5C8A,0x5C8B,/* 0xE8-0xEF */ + 0x5C8E,0x5C8F,0x5C92,0x5C93,0x5C95,0x5C9D,0x5C9E,0x5C9F,/* 0xF0-0xF7 */ + 0x5CA0,0x5CA1,0x5CA4,0x5CA5,0x5CA6,0x5CA7,0x5CA8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5CAA,0x5CAE,0x5CAF,0x5CB0,0x5CB2,0x5CB4,0x5CB6,0x5CB9,/* 0x40-0x47 */ + 0x5CBA,0x5CBB,0x5CBC,0x5CBE,0x5CC0,0x5CC2,0x5CC3,0x5CC5,/* 0x48-0x4F */ + 0x5CC6,0x5CC7,0x5CC8,0x5CC9,0x5CCA,0x5CCC,0x5CCD,0x5CCE,/* 0x50-0x57 */ + 0x5CCF,0x5CD0,0x5CD1,0x5CD3,0x5CD4,0x5CD5,0x5CD6,0x5CD7,/* 0x58-0x5F */ + 0x5CD8,0x5CDA,0x5CDB,0x5CDC,0x5CDD,0x5CDE,0x5CDF,0x5CE0,/* 0x60-0x67 */ + 0x5CE2,0x5CE3,0x5CE7,0x5CE9,0x5CEB,0x5CEC,0x5CEE,0x5CEF,/* 0x68-0x6F */ + 0x5CF1,0x5CF2,0x5CF3,0x5CF4,0x5CF5,0x5CF6,0x5CF7,0x5CF8,/* 0x70-0x77 */ + 0x5CF9,0x5CFA,0x5CFC,0x5CFD,0x5CFE,0x5CFF,0x5D00,0x0000,/* 0x78-0x7F */ + + 0x5D01,0x5D04,0x5D05,0x5D08,0x5D09,0x5D0A,0x5D0B,0x5D0C,/* 0x80-0x87 */ + 0x5D0D,0x5D0F,0x5D10,0x5D11,0x5D12,0x5D13,0x5D15,0x5D17,/* 0x88-0x8F */ + 0x5D18,0x5D19,0x5D1A,0x5D1C,0x5D1D,0x5D1F,0x5D20,0x5D21,/* 0x90-0x97 */ + 0x5D22,0x5D23,0x5D25,0x5D28,0x5D2A,0x5D2B,0x5D2C,0x5D2F,/* 0x98-0x9F */ + 0x5D30,0x5D31,0x5D32,0x5D33,0x5D35,0x5D36,0x5D37,0x5D38,/* 0xA0-0xA7 */ + 0x5D39,0x5D3A,0x5D3B,0x5D3C,0x5D3F,0x5D40,0x5D41,0x5D42,/* 0xA8-0xAF */ + 0x5D43,0x5D44,0x5D45,0x5D46,0x5D48,0x5D49,0x5D4D,0x5D4E,/* 0xB0-0xB7 */ + 0x5D4F,0x5D50,0x5D51,0x5D52,0x5D53,0x5D54,0x5D55,0x5D56,/* 0xB8-0xBF */ + 0x5D57,0x5D59,0x5D5A,0x5D5C,0x5D5E,0x5D5F,0x5D60,0x5D61,/* 0xC0-0xC7 */ + 0x5D62,0x5D63,0x5D64,0x5D65,0x5D66,0x5D67,0x5D68,0x5D6A,/* 0xC8-0xCF */ + 0x5D6D,0x5D6E,0x5D70,0x5D71,0x5D72,0x5D73,0x5D75,0x5D76,/* 0xD0-0xD7 */ + 0x5D77,0x5D78,0x5D79,0x5D7A,0x5D7B,0x5D7C,0x5D7D,0x5D7E,/* 0xD8-0xDF */ + 0x5D7F,0x5D80,0x5D81,0x5D83,0x5D84,0x5D85,0x5D86,0x5D87,/* 0xE0-0xE7 */ + 0x5D88,0x5D89,0x5D8A,0x5D8B,0x5D8C,0x5D8D,0x5D8E,0x5D8F,/* 0xE8-0xEF */ + 0x5D90,0x5D91,0x5D92,0x5D93,0x5D94,0x5D95,0x5D96,0x5D97,/* 0xF0-0xF7 */ + 0x5D98,0x5D9A,0x5D9B,0x5D9C,0x5D9E,0x5D9F,0x5DA0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5DA1,0x5DA2,0x5DA3,0x5DA4,0x5DA5,0x5DA6,0x5DA7,0x5DA8,/* 0x40-0x47 */ + 0x5DA9,0x5DAA,0x5DAB,0x5DAC,0x5DAD,0x5DAE,0x5DAF,0x5DB0,/* 0x48-0x4F */ + 0x5DB1,0x5DB2,0x5DB3,0x5DB4,0x5DB5,0x5DB6,0x5DB8,0x5DB9,/* 0x50-0x57 */ + 0x5DBA,0x5DBB,0x5DBC,0x5DBD,0x5DBE,0x5DBF,0x5DC0,0x5DC1,/* 0x58-0x5F */ + 0x5DC2,0x5DC3,0x5DC4,0x5DC6,0x5DC7,0x5DC8,0x5DC9,0x5DCA,/* 0x60-0x67 */ + 0x5DCB,0x5DCC,0x5DCE,0x5DCF,0x5DD0,0x5DD1,0x5DD2,0x5DD3,/* 0x68-0x6F */ + 0x5DD4,0x5DD5,0x5DD6,0x5DD7,0x5DD8,0x5DD9,0x5DDA,0x5DDC,/* 0x70-0x77 */ + 0x5DDF,0x5DE0,0x5DE3,0x5DE4,0x5DEA,0x5DEC,0x5DED,0x0000,/* 0x78-0x7F */ + + 0x5DF0,0x5DF5,0x5DF6,0x5DF8,0x5DF9,0x5DFA,0x5DFB,0x5DFC,/* 0x80-0x87 */ + 0x5DFF,0x5E00,0x5E04,0x5E07,0x5E09,0x5E0A,0x5E0B,0x5E0D,/* 0x88-0x8F */ + 0x5E0E,0x5E12,0x5E13,0x5E17,0x5E1E,0x5E1F,0x5E20,0x5E21,/* 0x90-0x97 */ + 0x5E22,0x5E23,0x5E24,0x5E25,0x5E28,0x5E29,0x5E2A,0x5E2B,/* 0x98-0x9F */ + 0x5E2C,0x5E2F,0x5E30,0x5E32,0x5E33,0x5E34,0x5E35,0x5E36,/* 0xA0-0xA7 */ + 0x5E39,0x5E3A,0x5E3E,0x5E3F,0x5E40,0x5E41,0x5E43,0x5E46,/* 0xA8-0xAF */ + 0x5E47,0x5E48,0x5E49,0x5E4A,0x5E4B,0x5E4D,0x5E4E,0x5E4F,/* 0xB0-0xB7 */ + 0x5E50,0x5E51,0x5E52,0x5E53,0x5E56,0x5E57,0x5E58,0x5E59,/* 0xB8-0xBF */ + 0x5E5A,0x5E5C,0x5E5D,0x5E5F,0x5E60,0x5E63,0x5E64,0x5E65,/* 0xC0-0xC7 */ + 0x5E66,0x5E67,0x5E68,0x5E69,0x5E6A,0x5E6B,0x5E6C,0x5E6D,/* 0xC8-0xCF */ + 0x5E6E,0x5E6F,0x5E70,0x5E71,0x5E75,0x5E77,0x5E79,0x5E7E,/* 0xD0-0xD7 */ + 0x5E81,0x5E82,0x5E83,0x5E85,0x5E88,0x5E89,0x5E8C,0x5E8D,/* 0xD8-0xDF */ + 0x5E8E,0x5E92,0x5E98,0x5E9B,0x5E9D,0x5EA1,0x5EA2,0x5EA3,/* 0xE0-0xE7 */ + 0x5EA4,0x5EA8,0x5EA9,0x5EAA,0x5EAB,0x5EAC,0x5EAE,0x5EAF,/* 0xE8-0xEF */ + 0x5EB0,0x5EB1,0x5EB2,0x5EB4,0x5EBA,0x5EBB,0x5EBC,0x5EBD,/* 0xF0-0xF7 */ + 0x5EBF,0x5EC0,0x5EC1,0x5EC2,0x5EC3,0x5EC4,0x5EC5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5EC6,0x5EC7,0x5EC8,0x5ECB,0x5ECC,0x5ECD,0x5ECE,0x5ECF,/* 0x40-0x47 */ + 0x5ED0,0x5ED4,0x5ED5,0x5ED7,0x5ED8,0x5ED9,0x5EDA,0x5EDC,/* 0x48-0x4F */ + 0x5EDD,0x5EDE,0x5EDF,0x5EE0,0x5EE1,0x5EE2,0x5EE3,0x5EE4,/* 0x50-0x57 */ + 0x5EE5,0x5EE6,0x5EE7,0x5EE9,0x5EEB,0x5EEC,0x5EED,0x5EEE,/* 0x58-0x5F */ + 0x5EEF,0x5EF0,0x5EF1,0x5EF2,0x5EF3,0x5EF5,0x5EF8,0x5EF9,/* 0x60-0x67 */ + 0x5EFB,0x5EFC,0x5EFD,0x5F05,0x5F06,0x5F07,0x5F09,0x5F0C,/* 0x68-0x6F */ + 0x5F0D,0x5F0E,0x5F10,0x5F12,0x5F14,0x5F16,0x5F19,0x5F1A,/* 0x70-0x77 */ + 0x5F1C,0x5F1D,0x5F1E,0x5F21,0x5F22,0x5F23,0x5F24,0x0000,/* 0x78-0x7F */ + + 0x5F28,0x5F2B,0x5F2C,0x5F2E,0x5F30,0x5F32,0x5F33,0x5F34,/* 0x80-0x87 */ + 0x5F35,0x5F36,0x5F37,0x5F38,0x5F3B,0x5F3D,0x5F3E,0x5F3F,/* 0x88-0x8F */ + 0x5F41,0x5F42,0x5F43,0x5F44,0x5F45,0x5F46,0x5F47,0x5F48,/* 0x90-0x97 */ + 0x5F49,0x5F4A,0x5F4B,0x5F4C,0x5F4D,0x5F4E,0x5F4F,0x5F51,/* 0x98-0x9F */ + 
0x5F54,0x5F59,0x5F5A,0x5F5B,0x5F5C,0x5F5E,0x5F5F,0x5F60,/* 0xA0-0xA7 */ + 0x5F63,0x5F65,0x5F67,0x5F68,0x5F6B,0x5F6E,0x5F6F,0x5F72,/* 0xA8-0xAF */ + 0x5F74,0x5F75,0x5F76,0x5F78,0x5F7A,0x5F7D,0x5F7E,0x5F7F,/* 0xB0-0xB7 */ + 0x5F83,0x5F86,0x5F8D,0x5F8E,0x5F8F,0x5F91,0x5F93,0x5F94,/* 0xB8-0xBF */ + 0x5F96,0x5F9A,0x5F9B,0x5F9D,0x5F9E,0x5F9F,0x5FA0,0x5FA2,/* 0xC0-0xC7 */ + 0x5FA3,0x5FA4,0x5FA5,0x5FA6,0x5FA7,0x5FA9,0x5FAB,0x5FAC,/* 0xC8-0xCF */ + 0x5FAF,0x5FB0,0x5FB1,0x5FB2,0x5FB3,0x5FB4,0x5FB6,0x5FB8,/* 0xD0-0xD7 */ + 0x5FB9,0x5FBA,0x5FBB,0x5FBE,0x5FBF,0x5FC0,0x5FC1,0x5FC2,/* 0xD8-0xDF */ + 0x5FC7,0x5FC8,0x5FCA,0x5FCB,0x5FCE,0x5FD3,0x5FD4,0x5FD5,/* 0xE0-0xE7 */ + 0x5FDA,0x5FDB,0x5FDC,0x5FDE,0x5FDF,0x5FE2,0x5FE3,0x5FE5,/* 0xE8-0xEF */ + 0x5FE6,0x5FE8,0x5FE9,0x5FEC,0x5FEF,0x5FF0,0x5FF2,0x5FF3,/* 0xF0-0xF7 */ + 0x5FF4,0x5FF6,0x5FF7,0x5FF9,0x5FFA,0x5FFC,0x6007,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_90[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6008,0x6009,0x600B,0x600C,0x6010,0x6011,0x6013,0x6017,/* 0x40-0x47 */ + 0x6018,0x601A,0x601E,0x601F,0x6022,0x6023,0x6024,0x602C,/* 0x48-0x4F */ + 0x602D,0x602E,0x6030,0x6031,0x6032,0x6033,0x6034,0x6036,/* 0x50-0x57 */ + 0x6037,0x6038,0x6039,0x603A,0x603D,0x603E,0x6040,0x6044,/* 0x58-0x5F */ + 0x6045,0x6046,0x6047,0x6048,0x6049,0x604A,0x604C,0x604E,/* 0x60-0x67 */ + 0x604F,0x6051,0x6053,0x6054,0x6056,0x6057,0x6058,0x605B,/* 0x68-0x6F */ + 0x605C,0x605E,0x605F,0x6060,0x6061,0x6065,0x6066,0x606E,/* 0x70-0x77 */ + 0x6071,0x6072,0x6074,0x6075,0x6077,0x607E,0x6080,0x0000,/* 0x78-0x7F */ + + 0x6081,0x6082,0x6085,0x6086,0x6087,0x6088,0x608A,0x608B,/* 0x80-0x87 */ + 0x608E,0x608F,0x6090,0x6091,0x6093,0x6095,0x6097,0x6098,/* 0x88-0x8F */ + 0x6099,0x609C,0x609E,0x60A1,0x60A2,0x60A4,0x60A5,0x60A7,/* 0x90-0x97 */ + 0x60A9,0x60AA,0x60AE,0x60B0,0x60B3,0x60B5,0x60B6,0x60B7,/* 0x98-0x9F */ + 0x60B9,0x60BA,0x60BD,0x60BE,0x60BF,0x60C0,0x60C1,0x60C2,/* 0xA0-0xA7 */ + 0x60C3,0x60C4,0x60C7,0x60C8,0x60C9,0x60CC,0x60CD,0x60CE,/* 0xA8-0xAF */ + 0x60CF,0x60D0,0x60D2,0x60D3,0x60D4,0x60D6,0x60D7,0x60D9,/* 0xB0-0xB7 */ + 0x60DB,0x60DE,0x60E1,0x60E2,0x60E3,0x60E4,0x60E5,0x60EA,/* 0xB8-0xBF */ + 0x60F1,0x60F2,0x60F5,0x60F7,0x60F8,0x60FB,0x60FC,0x60FD,/* 0xC0-0xC7 */ + 0x60FE,0x60FF,0x6102,0x6103,0x6104,0x6105,0x6107,0x610A,/* 0xC8-0xCF */ + 0x610B,0x610C,0x6110,0x6111,0x6112,0x6113,0x6114,0x6116,/* 0xD0-0xD7 */ + 0x6117,0x6118,0x6119,0x611B,0x611C,0x611D,0x611E,0x6121,/* 0xD8-0xDF */ + 0x6122,0x6125,0x6128,0x6129,0x612A,0x612C,0x612D,0x612E,/* 0xE0-0xE7 */ + 0x612F,0x6130,0x6131,0x6132,0x6133,0x6134,0x6135,0x6136,/* 0xE8-0xEF */ + 0x6137,0x6138,0x6139,0x613A,0x613B,0x613C,0x613D,0x613E,/* 0xF0-0xF7 */ + 0x6140,0x6141,0x6142,0x6143,0x6144,0x6145,0x6146,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_91[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6147,0x6149,0x614B,0x614D,0x614F,0x6150,0x6152,0x6153,/* 0x40-0x47 */ + 0x6154,0x6156,0x6157,0x6158,0x6159,0x615A,0x615B,0x615C,/* 0x48-0x4F */ + 0x615E,0x615F,0x6160,0x6161,0x6163,0x6164,0x6165,0x6166,/* 0x50-0x57 */ + 0x6169,0x616A,0x616B,0x616C,0x616D,0x616E,0x616F,0x6171,/* 0x58-0x5F */ + 0x6172,0x6173,0x6174,0x6176,0x6178,0x6179,0x617A,0x617B,/* 0x60-0x67 */ + 0x617C,0x617D,0x617E,0x617F,0x6180,0x6181,0x6182,0x6183,/* 0x68-0x6F */ + 0x6184,0x6185,0x6186,0x6187,0x6188,0x6189,0x618A,0x618C,/* 0x70-0x77 */ + 0x618D,0x618F,0x6190,0x6191,0x6192,0x6193,0x6195,0x0000,/* 0x78-0x7F */ + + 0x6196,0x6197,0x6198,0x6199,0x619A,0x619B,0x619C,0x619E,/* 0x80-0x87 */ + 0x619F,0x61A0,0x61A1,0x61A2,0x61A3,0x61A4,0x61A5,0x61A6,/* 0x88-0x8F */ + 0x61AA,0x61AB,0x61AD,0x61AE,0x61AF,0x61B0,0x61B1,0x61B2,/* 0x90-0x97 */ + 0x61B3,0x61B4,0x61B5,0x61B6,0x61B8,0x61B9,0x61BA,0x61BB,/* 0x98-0x9F */ + 0x61BC,0x61BD,0x61BF,0x61C0,0x61C1,0x61C3,0x61C4,0x61C5,/* 0xA0-0xA7 */ + 0x61C6,0x61C7,0x61C9,0x61CC,0x61CD,0x61CE,0x61CF,0x61D0,/* 0xA8-0xAF */ + 0x61D3,0x61D5,0x61D6,0x61D7,0x61D8,0x61D9,0x61DA,0x61DB,/* 0xB0-0xB7 */ + 0x61DC,0x61DD,0x61DE,0x61DF,0x61E0,0x61E1,0x61E2,0x61E3,/* 0xB8-0xBF */ + 0x61E4,0x61E5,0x61E7,0x61E8,0x61E9,0x61EA,0x61EB,0x61EC,/* 0xC0-0xC7 */ + 0x61ED,0x61EE,0x61EF,0x61F0,0x61F1,0x61F2,0x61F3,0x61F4,/* 0xC8-0xCF */ + 0x61F6,0x61F7,0x61F8,0x61F9,0x61FA,0x61FB,0x61FC,0x61FD,/* 0xD0-0xD7 */ + 0x61FE,0x6200,0x6201,0x6202,0x6203,0x6204,0x6205,0x6207,/* 0xD8-0xDF */ + 0x6209,0x6213,0x6214,0x6219,0x621C,0x621D,0x621E,0x6220,/* 0xE0-0xE7 */ + 0x6223,0x6226,0x6227,0x6228,0x6229,0x622B,0x622D,0x622F,/* 0xE8-0xEF */ + 0x6230,0x6231,0x6232,0x6235,0x6236,0x6238,0x6239,0x623A,/* 0xF0-0xF7 */ + 0x623B,0x623C,0x6242,0x6244,0x6245,0x6246,0x624A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_92[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x624F,0x6250,0x6255,0x6256,0x6257,0x6259,0x625A,0x625C,/* 0x40-0x47 */ + 0x625D,0x625E,0x625F,0x6260,0x6261,0x6262,0x6264,0x6265,/* 0x48-0x4F */ + 0x6268,0x6271,0x6272,0x6274,0x6275,0x6277,0x6278,0x627A,/* 0x50-0x57 */ + 0x627B,0x627D,0x6281,0x6282,0x6283,0x6285,0x6286,0x6287,/* 0x58-0x5F */ + 0x6288,0x628B,0x628C,0x628D,0x628E,0x628F,0x6290,0x6294,/* 0x60-0x67 */ + 0x6299,0x629C,0x629D,0x629E,0x62A3,0x62A6,0x62A7,0x62A9,/* 0x68-0x6F */ + 0x62AA,0x62AD,0x62AE,0x62AF,0x62B0,0x62B2,0x62B3,0x62B4,/* 0x70-0x77 */ + 0x62B6,0x62B7,0x62B8,0x62BA,0x62BE,0x62C0,0x62C1,0x0000,/* 0x78-0x7F */ + + 0x62C3,0x62CB,0x62CF,0x62D1,0x62D5,0x62DD,0x62DE,0x62E0,/* 0x80-0x87 */ + 0x62E1,0x62E4,0x62EA,0x62EB,0x62F0,0x62F2,0x62F5,0x62F8,/* 0x88-0x8F */ + 
0x62F9,0x62FA,0x62FB,0x6300,0x6303,0x6304,0x6305,0x6306,/* 0x90-0x97 */ + 0x630A,0x630B,0x630C,0x630D,0x630F,0x6310,0x6312,0x6313,/* 0x98-0x9F */ + 0x6314,0x6315,0x6317,0x6318,0x6319,0x631C,0x6326,0x6327,/* 0xA0-0xA7 */ + 0x6329,0x632C,0x632D,0x632E,0x6330,0x6331,0x6333,0x6334,/* 0xA8-0xAF */ + 0x6335,0x6336,0x6337,0x6338,0x633B,0x633C,0x633E,0x633F,/* 0xB0-0xB7 */ + 0x6340,0x6341,0x6344,0x6347,0x6348,0x634A,0x6351,0x6352,/* 0xB8-0xBF */ + 0x6353,0x6354,0x6356,0x6357,0x6358,0x6359,0x635A,0x635B,/* 0xC0-0xC7 */ + 0x635C,0x635D,0x6360,0x6364,0x6365,0x6366,0x6368,0x636A,/* 0xC8-0xCF */ + 0x636B,0x636C,0x636F,0x6370,0x6372,0x6373,0x6374,0x6375,/* 0xD0-0xD7 */ + 0x6378,0x6379,0x637C,0x637D,0x637E,0x637F,0x6381,0x6383,/* 0xD8-0xDF */ + 0x6384,0x6385,0x6386,0x638B,0x638D,0x6391,0x6393,0x6394,/* 0xE0-0xE7 */ + 0x6395,0x6397,0x6399,0x639A,0x639B,0x639C,0x639D,0x639E,/* 0xE8-0xEF */ + 0x639F,0x63A1,0x63A4,0x63A6,0x63AB,0x63AF,0x63B1,0x63B2,/* 0xF0-0xF7 */ + 0x63B5,0x63B6,0x63B9,0x63BB,0x63BD,0x63BF,0x63C0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_93[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x63C1,0x63C2,0x63C3,0x63C5,0x63C7,0x63C8,0x63CA,0x63CB,/* 0x40-0x47 */ + 0x63CC,0x63D1,0x63D3,0x63D4,0x63D5,0x63D7,0x63D8,0x63D9,/* 0x48-0x4F */ + 0x63DA,0x63DB,0x63DC,0x63DD,0x63DF,0x63E2,0x63E4,0x63E5,/* 0x50-0x57 */ + 0x63E6,0x63E7,0x63E8,0x63EB,0x63EC,0x63EE,0x63EF,0x63F0,/* 0x58-0x5F */ + 0x63F1,0x63F3,0x63F5,0x63F7,0x63F9,0x63FA,0x63FB,0x63FC,/* 0x60-0x67 */ + 0x63FE,0x6403,0x6404,0x6406,0x6407,0x6408,0x6409,0x640A,/* 0x68-0x6F */ + 0x640D,0x640E,0x6411,0x6412,0x6415,0x6416,0x6417,0x6418,/* 0x70-0x77 */ + 0x6419,0x641A,0x641D,0x641F,0x6422,0x6423,0x6424,0x0000,/* 0x78-0x7F */ + + 0x6425,0x6427,0x6428,0x6429,0x642B,0x642E,0x642F,0x6430,/* 0x80-0x87 */ + 0x6431,0x6432,0x6433,0x6435,0x6436,0x6437,0x6438,0x6439,/* 0x88-0x8F */ + 0x643B,0x643C,0x643E,0x6440,0x6442,0x6443,0x6449,0x644B,/* 0x90-0x97 */ + 0x644C,0x644D,0x644E,0x644F,0x6450,0x6451,0x6453,0x6455,/* 0x98-0x9F */ + 0x6456,0x6457,0x6459,0x645A,0x645B,0x645C,0x645D,0x645F,/* 0xA0-0xA7 */ + 0x6460,0x6461,0x6462,0x6463,0x6464,0x6465,0x6466,0x6468,/* 0xA8-0xAF */ + 0x646A,0x646B,0x646C,0x646E,0x646F,0x6470,0x6471,0x6472,/* 0xB0-0xB7 */ + 0x6473,0x6474,0x6475,0x6476,0x6477,0x647B,0x647C,0x647D,/* 0xB8-0xBF */ + 0x647E,0x647F,0x6480,0x6481,0x6483,0x6486,0x6488,0x6489,/* 0xC0-0xC7 */ + 0x648A,0x648B,0x648C,0x648D,0x648E,0x648F,0x6490,0x6493,/* 0xC8-0xCF */ + 0x6494,0x6497,0x6498,0x649A,0x649B,0x649C,0x649D,0x649F,/* 0xD0-0xD7 */ + 0x64A0,0x64A1,0x64A2,0x64A3,0x64A5,0x64A6,0x64A7,0x64A8,/* 0xD8-0xDF */ + 0x64AA,0x64AB,0x64AF,0x64B1,0x64B2,0x64B3,0x64B4,0x64B6,/* 0xE0-0xE7 */ + 0x64B9,0x64BB,0x64BD,0x64BE,0x64BF,0x64C1,0x64C3,0x64C4,/* 0xE8-0xEF */ + 0x64C6,0x64C7,0x64C8,0x64C9,0x64CA,0x64CB,0x64CC,0x64CF,/* 0xF0-0xF7 */ + 0x64D1,0x64D3,0x64D4,0x64D5,0x64D6,0x64D9,0x64DA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_94[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x64DB,0x64DC,0x64DD,0x64DF,0x64E0,0x64E1,0x64E3,0x64E5,/* 0x40-0x47 */ + 0x64E7,0x64E8,0x64E9,0x64EA,0x64EB,0x64EC,0x64ED,0x64EE,/* 0x48-0x4F */ + 0x64EF,0x64F0,0x64F1,0x64F2,0x64F3,0x64F4,0x64F5,0x64F6,/* 0x50-0x57 */ + 0x64F7,0x64F8,0x64F9,0x64FA,0x64FB,0x64FC,0x64FD,0x64FE,/* 0x58-0x5F */ + 0x64FF,0x6501,0x6502,0x6503,0x6504,0x6505,0x6506,0x6507,/* 0x60-0x67 */ + 0x6508,0x650A,0x650B,0x650C,0x650D,0x650E,0x650F,0x6510,/* 0x68-0x6F */ + 0x6511,0x6513,0x6514,0x6515,0x6516,0x6517,0x6519,0x651A,/* 0x70-0x77 */ + 0x651B,0x651C,0x651D,0x651E,0x651F,0x6520,0x6521,0x0000,/* 0x78-0x7F */ + + 0x6522,0x6523,0x6524,0x6526,0x6527,0x6528,0x6529,0x652A,/* 0x80-0x87 */ + 0x652C,0x652D,0x6530,0x6531,0x6532,0x6533,0x6537,0x653A,/* 0x88-0x8F */ + 0x653C,0x653D,0x6540,0x6541,0x6542,0x6543,0x6544,0x6546,/* 0x90-0x97 */ + 0x6547,0x654A,0x654B,0x654D,0x654E,0x6550,0x6552,0x6553,/* 0x98-0x9F */ + 0x6554,0x6557,0x6558,0x655A,0x655C,0x655F,0x6560,0x6561,/* 0xA0-0xA7 */ + 0x6564,0x6565,0x6567,0x6568,0x6569,0x656A,0x656D,0x656E,/* 0xA8-0xAF */ + 0x656F,0x6571,0x6573,0x6575,0x6576,0x6578,0x6579,0x657A,/* 0xB0-0xB7 */ + 0x657B,0x657C,0x657D,0x657E,0x657F,0x6580,0x6581,0x6582,/* 0xB8-0xBF */ + 0x6583,0x6584,0x6585,0x6586,0x6588,0x6589,0x658A,0x658D,/* 0xC0-0xC7 */ + 0x658E,0x658F,0x6592,0x6594,0x6595,0x6596,0x6598,0x659A,/* 0xC8-0xCF */ + 0x659D,0x659E,0x65A0,0x65A2,0x65A3,0x65A6,0x65A8,0x65AA,/* 0xD0-0xD7 */ + 0x65AC,0x65AE,0x65B1,0x65B2,0x65B3,0x65B4,0x65B5,0x65B6,/* 0xD8-0xDF */ + 0x65B7,0x65B8,0x65BA,0x65BB,0x65BE,0x65BF,0x65C0,0x65C2,/* 0xE0-0xE7 */ + 0x65C7,0x65C8,0x65C9,0x65CA,0x65CD,0x65D0,0x65D1,0x65D3,/* 0xE8-0xEF */ + 0x65D4,0x65D5,0x65D8,0x65D9,0x65DA,0x65DB,0x65DC,0x65DD,/* 0xF0-0xF7 */ + 0x65DE,0x65DF,0x65E1,0x65E3,0x65E4,0x65EA,0x65EB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_95[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x65F2,0x65F3,0x65F4,0x65F5,0x65F8,0x65F9,0x65FB,0x65FC,/* 0x40-0x47 */ + 0x65FD,0x65FE,0x65FF,0x6601,0x6604,0x6605,0x6607,0x6608,/* 0x48-0x4F */ + 0x6609,0x660B,0x660D,0x6610,0x6611,0x6612,0x6616,0x6617,/* 0x50-0x57 */ + 0x6618,0x661A,0x661B,0x661C,0x661E,0x6621,0x6622,0x6623,/* 0x58-0x5F */ + 0x6624,0x6626,0x6629,0x662A,0x662B,0x662C,0x662E,0x6630,/* 0x60-0x67 */ + 0x6632,0x6633,0x6637,0x6638,0x6639,0x663A,0x663B,0x663D,/* 0x68-0x6F */ + 0x663F,0x6640,0x6642,0x6644,0x6645,0x6646,0x6647,0x6648,/* 0x70-0x77 */ + 0x6649,0x664A,0x664D,0x664E,0x6650,0x6651,0x6658,0x0000,/* 0x78-0x7F */ + + 
0x6659,0x665B,0x665C,0x665D,0x665E,0x6660,0x6662,0x6663,/* 0x80-0x87 */ + 0x6665,0x6667,0x6669,0x666A,0x666B,0x666C,0x666D,0x6671,/* 0x88-0x8F */ + 0x6672,0x6673,0x6675,0x6678,0x6679,0x667B,0x667C,0x667D,/* 0x90-0x97 */ + 0x667F,0x6680,0x6681,0x6683,0x6685,0x6686,0x6688,0x6689,/* 0x98-0x9F */ + 0x668A,0x668B,0x668D,0x668E,0x668F,0x6690,0x6692,0x6693,/* 0xA0-0xA7 */ + 0x6694,0x6695,0x6698,0x6699,0x669A,0x669B,0x669C,0x669E,/* 0xA8-0xAF */ + 0x669F,0x66A0,0x66A1,0x66A2,0x66A3,0x66A4,0x66A5,0x66A6,/* 0xB0-0xB7 */ + 0x66A9,0x66AA,0x66AB,0x66AC,0x66AD,0x66AF,0x66B0,0x66B1,/* 0xB8-0xBF */ + 0x66B2,0x66B3,0x66B5,0x66B6,0x66B7,0x66B8,0x66BA,0x66BB,/* 0xC0-0xC7 */ + 0x66BC,0x66BD,0x66BF,0x66C0,0x66C1,0x66C2,0x66C3,0x66C4,/* 0xC8-0xCF */ + 0x66C5,0x66C6,0x66C7,0x66C8,0x66C9,0x66CA,0x66CB,0x66CC,/* 0xD0-0xD7 */ + 0x66CD,0x66CE,0x66CF,0x66D0,0x66D1,0x66D2,0x66D3,0x66D4,/* 0xD8-0xDF */ + 0x66D5,0x66D6,0x66D7,0x66D8,0x66DA,0x66DE,0x66DF,0x66E0,/* 0xE0-0xE7 */ + 0x66E1,0x66E2,0x66E3,0x66E4,0x66E5,0x66E7,0x66E8,0x66EA,/* 0xE8-0xEF */ + 0x66EB,0x66EC,0x66ED,0x66EE,0x66EF,0x66F1,0x66F5,0x66F6,/* 0xF0-0xF7 */ + 0x66F8,0x66FA,0x66FB,0x66FD,0x6701,0x6702,0x6703,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_96[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6704,0x6705,0x6706,0x6707,0x670C,0x670E,0x670F,0x6711,/* 0x40-0x47 */ + 0x6712,0x6713,0x6716,0x6718,0x6719,0x671A,0x671C,0x671E,/* 0x48-0x4F */ + 0x6720,0x6721,0x6722,0x6723,0x6724,0x6725,0x6727,0x6729,/* 0x50-0x57 */ + 0x672E,0x6730,0x6732,0x6733,0x6736,0x6737,0x6738,0x6739,/* 0x58-0x5F */ + 0x673B,0x673C,0x673E,0x673F,0x6741,0x6744,0x6745,0x6747,/* 0x60-0x67 */ + 0x674A,0x674B,0x674D,0x6752,0x6754,0x6755,0x6757,0x6758,/* 0x68-0x6F */ + 0x6759,0x675A,0x675B,0x675D,0x6762,0x6763,0x6764,0x6766,/* 0x70-0x77 */ + 0x6767,0x676B,0x676C,0x676E,0x6771,0x6774,0x6776,0x0000,/* 0x78-0x7F */ + + 0x6778,0x6779,0x677A,0x677B,0x677D,0x6780,0x6782,0x6783,/* 0x80-0x87 */ + 0x6785,0x6786,0x6788,0x678A,0x678C,0x678D,0x678E,0x678F,/* 0x88-0x8F */ + 0x6791,0x6792,0x6793,0x6794,0x6796,0x6799,0x679B,0x679F,/* 0x90-0x97 */ + 0x67A0,0x67A1,0x67A4,0x67A6,0x67A9,0x67AC,0x67AE,0x67B1,/* 0x98-0x9F */ + 0x67B2,0x67B4,0x67B9,0x67BA,0x67BB,0x67BC,0x67BD,0x67BE,/* 0xA0-0xA7 */ + 0x67BF,0x67C0,0x67C2,0x67C5,0x67C6,0x67C7,0x67C8,0x67C9,/* 0xA8-0xAF */ + 0x67CA,0x67CB,0x67CC,0x67CD,0x67CE,0x67D5,0x67D6,0x67D7,/* 0xB0-0xB7 */ + 0x67DB,0x67DF,0x67E1,0x67E3,0x67E4,0x67E6,0x67E7,0x67E8,/* 0xB8-0xBF */ + 0x67EA,0x67EB,0x67ED,0x67EE,0x67F2,0x67F5,0x67F6,0x67F7,/* 0xC0-0xC7 */ + 0x67F8,0x67F9,0x67FA,0x67FB,0x67FC,0x67FE,0x6801,0x6802,/* 0xC8-0xCF */ + 0x6803,0x6804,0x6806,0x680D,0x6810,0x6812,0x6814,0x6815,/* 0xD0-0xD7 */ + 0x6818,0x6819,0x681A,0x681B,0x681C,0x681E,0x681F,0x6820,/* 0xD8-0xDF */ + 0x6822,0x6823,0x6824,0x6825,0x6826,0x6827,0x6828,0x682B,/* 0xE0-0xE7 */ + 0x682C,0x682D,0x682E,0x682F,0x6830,0x6831,0x6834,0x6835,/* 0xE8-0xEF */ + 0x6836,0x683A,0x683B,0x683F,0x6847,0x684B,0x684D,0x684F,/* 0xF0-0xF7 */ + 
0x6852,0x6856,0x6857,0x6858,0x6859,0x685A,0x685B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_97[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x685C,0x685D,0x685E,0x685F,0x686A,0x686C,0x686D,0x686E,/* 0x40-0x47 */ + 0x686F,0x6870,0x6871,0x6872,0x6873,0x6875,0x6878,0x6879,/* 0x48-0x4F */ + 0x687A,0x687B,0x687C,0x687D,0x687E,0x687F,0x6880,0x6882,/* 0x50-0x57 */ + 0x6884,0x6887,0x6888,0x6889,0x688A,0x688B,0x688C,0x688D,/* 0x58-0x5F */ + 0x688E,0x6890,0x6891,0x6892,0x6894,0x6895,0x6896,0x6898,/* 0x60-0x67 */ + 0x6899,0x689A,0x689B,0x689C,0x689D,0x689E,0x689F,0x68A0,/* 0x68-0x6F */ + 0x68A1,0x68A3,0x68A4,0x68A5,0x68A9,0x68AA,0x68AB,0x68AC,/* 0x70-0x77 */ + 0x68AE,0x68B1,0x68B2,0x68B4,0x68B6,0x68B7,0x68B8,0x0000,/* 0x78-0x7F */ + + 0x68B9,0x68BA,0x68BB,0x68BC,0x68BD,0x68BE,0x68BF,0x68C1,/* 0x80-0x87 */ + 0x68C3,0x68C4,0x68C5,0x68C6,0x68C7,0x68C8,0x68CA,0x68CC,/* 0x88-0x8F */ + 0x68CE,0x68CF,0x68D0,0x68D1,0x68D3,0x68D4,0x68D6,0x68D7,/* 0x90-0x97 */ + 0x68D9,0x68DB,0x68DC,0x68DD,0x68DE,0x68DF,0x68E1,0x68E2,/* 0x98-0x9F */ + 0x68E4,0x68E5,0x68E6,0x68E7,0x68E8,0x68E9,0x68EA,0x68EB,/* 0xA0-0xA7 */ + 0x68EC,0x68ED,0x68EF,0x68F2,0x68F3,0x68F4,0x68F6,0x68F7,/* 0xA8-0xAF */ + 0x68F8,0x68FB,0x68FD,0x68FE,0x68FF,0x6900,0x6902,0x6903,/* 0xB0-0xB7 */ + 0x6904,0x6906,0x6907,0x6908,0x6909,0x690A,0x690C,0x690F,/* 0xB8-0xBF */ + 0x6911,0x6913,0x6914,0x6915,0x6916,0x6917,0x6918,0x6919,/* 0xC0-0xC7 */ + 0x691A,0x691B,0x691C,0x691D,0x691E,0x6921,0x6922,0x6923,/* 0xC8-0xCF */ + 0x6925,0x6926,0x6927,0x6928,0x6929,0x692A,0x692B,0x692C,/* 0xD0-0xD7 */ + 0x692E,0x692F,0x6931,0x6932,0x6933,0x6935,0x6936,0x6937,/* 0xD8-0xDF */ + 0x6938,0x693A,0x693B,0x693C,0x693E,0x6940,0x6941,0x6943,/* 0xE0-0xE7 */ + 0x6944,0x6945,0x6946,0x6947,0x6948,0x6949,0x694A,0x694B,/* 0xE8-0xEF */ + 0x694C,0x694D,0x694E,0x694F,0x6950,0x6951,0x6952,0x6953,/* 0xF0-0xF7 */ + 0x6955,0x6956,0x6958,0x6959,0x695B,0x695C,0x695F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_98[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6961,0x6962,0x6964,0x6965,0x6967,0x6968,0x6969,0x696A,/* 0x40-0x47 */ + 0x696C,0x696D,0x696F,0x6970,0x6972,0x6973,0x6974,0x6975,/* 0x48-0x4F */ + 0x6976,0x697A,0x697B,0x697D,0x697E,0x697F,0x6981,0x6983,/* 0x50-0x57 */ + 0x6985,0x698A,0x698B,0x698C,0x698E,0x698F,0x6990,0x6991,/* 0x58-0x5F */ + 0x6992,0x6993,0x6996,0x6997,0x6999,0x699A,0x699D,0x699E,/* 0x60-0x67 */ + 0x699F,0x69A0,0x69A1,0x69A2,0x69A3,0x69A4,0x69A5,0x69A6,/* 
0x68-0x6F */ + 0x69A9,0x69AA,0x69AC,0x69AE,0x69AF,0x69B0,0x69B2,0x69B3,/* 0x70-0x77 */ + 0x69B5,0x69B6,0x69B8,0x69B9,0x69BA,0x69BC,0x69BD,0x0000,/* 0x78-0x7F */ + + 0x69BE,0x69BF,0x69C0,0x69C2,0x69C3,0x69C4,0x69C5,0x69C6,/* 0x80-0x87 */ + 0x69C7,0x69C8,0x69C9,0x69CB,0x69CD,0x69CF,0x69D1,0x69D2,/* 0x88-0x8F */ + 0x69D3,0x69D5,0x69D6,0x69D7,0x69D8,0x69D9,0x69DA,0x69DC,/* 0x90-0x97 */ + 0x69DD,0x69DE,0x69E1,0x69E2,0x69E3,0x69E4,0x69E5,0x69E6,/* 0x98-0x9F */ + 0x69E7,0x69E8,0x69E9,0x69EA,0x69EB,0x69EC,0x69EE,0x69EF,/* 0xA0-0xA7 */ + 0x69F0,0x69F1,0x69F3,0x69F4,0x69F5,0x69F6,0x69F7,0x69F8,/* 0xA8-0xAF */ + 0x69F9,0x69FA,0x69FB,0x69FC,0x69FE,0x6A00,0x6A01,0x6A02,/* 0xB0-0xB7 */ + 0x6A03,0x6A04,0x6A05,0x6A06,0x6A07,0x6A08,0x6A09,0x6A0B,/* 0xB8-0xBF */ + 0x6A0C,0x6A0D,0x6A0E,0x6A0F,0x6A10,0x6A11,0x6A12,0x6A13,/* 0xC0-0xC7 */ + 0x6A14,0x6A15,0x6A16,0x6A19,0x6A1A,0x6A1B,0x6A1C,0x6A1D,/* 0xC8-0xCF */ + 0x6A1E,0x6A20,0x6A22,0x6A23,0x6A24,0x6A25,0x6A26,0x6A27,/* 0xD0-0xD7 */ + 0x6A29,0x6A2B,0x6A2C,0x6A2D,0x6A2E,0x6A30,0x6A32,0x6A33,/* 0xD8-0xDF */ + 0x6A34,0x6A36,0x6A37,0x6A38,0x6A39,0x6A3A,0x6A3B,0x6A3C,/* 0xE0-0xE7 */ + 0x6A3F,0x6A40,0x6A41,0x6A42,0x6A43,0x6A45,0x6A46,0x6A48,/* 0xE8-0xEF */ + 0x6A49,0x6A4A,0x6A4B,0x6A4C,0x6A4D,0x6A4E,0x6A4F,0x6A51,/* 0xF0-0xF7 */ + 0x6A52,0x6A53,0x6A54,0x6A55,0x6A56,0x6A57,0x6A5A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_99[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A5C,0x6A5D,0x6A5E,0x6A5F,0x6A60,0x6A62,0x6A63,0x6A64,/* 0x40-0x47 */ + 0x6A66,0x6A67,0x6A68,0x6A69,0x6A6A,0x6A6B,0x6A6C,0x6A6D,/* 0x48-0x4F */ + 0x6A6E,0x6A6F,0x6A70,0x6A72,0x6A73,0x6A74,0x6A75,0x6A76,/* 0x50-0x57 */ + 0x6A77,0x6A78,0x6A7A,0x6A7B,0x6A7D,0x6A7E,0x6A7F,0x6A81,/* 0x58-0x5F */ + 0x6A82,0x6A83,0x6A85,0x6A86,0x6A87,0x6A88,0x6A89,0x6A8A,/* 0x60-0x67 */ + 0x6A8B,0x6A8C,0x6A8D,0x6A8F,0x6A92,0x6A93,0x6A94,0x6A95,/* 0x68-0x6F */ + 0x6A96,0x6A98,0x6A99,0x6A9A,0x6A9B,0x6A9C,0x6A9D,0x6A9E,/* 0x70-0x77 */ + 0x6A9F,0x6AA1,0x6AA2,0x6AA3,0x6AA4,0x6AA5,0x6AA6,0x0000,/* 0x78-0x7F */ + + 0x6AA7,0x6AA8,0x6AAA,0x6AAD,0x6AAE,0x6AAF,0x6AB0,0x6AB1,/* 0x80-0x87 */ + 0x6AB2,0x6AB3,0x6AB4,0x6AB5,0x6AB6,0x6AB7,0x6AB8,0x6AB9,/* 0x88-0x8F */ + 0x6ABA,0x6ABB,0x6ABC,0x6ABD,0x6ABE,0x6ABF,0x6AC0,0x6AC1,/* 0x90-0x97 */ + 0x6AC2,0x6AC3,0x6AC4,0x6AC5,0x6AC6,0x6AC7,0x6AC8,0x6AC9,/* 0x98-0x9F */ + 0x6ACA,0x6ACB,0x6ACC,0x6ACD,0x6ACE,0x6ACF,0x6AD0,0x6AD1,/* 0xA0-0xA7 */ + 0x6AD2,0x6AD3,0x6AD4,0x6AD5,0x6AD6,0x6AD7,0x6AD8,0x6AD9,/* 0xA8-0xAF */ + 0x6ADA,0x6ADB,0x6ADC,0x6ADD,0x6ADE,0x6ADF,0x6AE0,0x6AE1,/* 0xB0-0xB7 */ + 0x6AE2,0x6AE3,0x6AE4,0x6AE5,0x6AE6,0x6AE7,0x6AE8,0x6AE9,/* 0xB8-0xBF */ + 0x6AEA,0x6AEB,0x6AEC,0x6AED,0x6AEE,0x6AEF,0x6AF0,0x6AF1,/* 0xC0-0xC7 */ + 0x6AF2,0x6AF3,0x6AF4,0x6AF5,0x6AF6,0x6AF7,0x6AF8,0x6AF9,/* 0xC8-0xCF */ + 0x6AFA,0x6AFB,0x6AFC,0x6AFD,0x6AFE,0x6AFF,0x6B00,0x6B01,/* 0xD0-0xD7 */ + 0x6B02,0x6B03,0x6B04,0x6B05,0x6B06,0x6B07,0x6B08,0x6B09,/* 0xD8-0xDF */ + 0x6B0A,0x6B0B,0x6B0C,0x6B0D,0x6B0E,0x6B0F,0x6B10,0x6B11,/* 0xE0-0xE7 */ + 
0x6B12,0x6B13,0x6B14,0x6B15,0x6B16,0x6B17,0x6B18,0x6B19,/* 0xE8-0xEF */ + 0x6B1A,0x6B1B,0x6B1C,0x6B1D,0x6B1E,0x6B1F,0x6B25,0x6B26,/* 0xF0-0xF7 */ + 0x6B28,0x6B29,0x6B2A,0x6B2B,0x6B2C,0x6B2D,0x6B2E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6B2F,0x6B30,0x6B31,0x6B33,0x6B34,0x6B35,0x6B36,0x6B38,/* 0x40-0x47 */ + 0x6B3B,0x6B3C,0x6B3D,0x6B3F,0x6B40,0x6B41,0x6B42,0x6B44,/* 0x48-0x4F */ + 0x6B45,0x6B48,0x6B4A,0x6B4B,0x6B4D,0x6B4E,0x6B4F,0x6B50,/* 0x50-0x57 */ + 0x6B51,0x6B52,0x6B53,0x6B54,0x6B55,0x6B56,0x6B57,0x6B58,/* 0x58-0x5F */ + 0x6B5A,0x6B5B,0x6B5C,0x6B5D,0x6B5E,0x6B5F,0x6B60,0x6B61,/* 0x60-0x67 */ + 0x6B68,0x6B69,0x6B6B,0x6B6C,0x6B6D,0x6B6E,0x6B6F,0x6B70,/* 0x68-0x6F */ + 0x6B71,0x6B72,0x6B73,0x6B74,0x6B75,0x6B76,0x6B77,0x6B78,/* 0x70-0x77 */ + 0x6B7A,0x6B7D,0x6B7E,0x6B7F,0x6B80,0x6B85,0x6B88,0x0000,/* 0x78-0x7F */ + + 0x6B8C,0x6B8E,0x6B8F,0x6B90,0x6B91,0x6B94,0x6B95,0x6B97,/* 0x80-0x87 */ + 0x6B98,0x6B99,0x6B9C,0x6B9D,0x6B9E,0x6B9F,0x6BA0,0x6BA2,/* 0x88-0x8F */ + 0x6BA3,0x6BA4,0x6BA5,0x6BA6,0x6BA7,0x6BA8,0x6BA9,0x6BAB,/* 0x90-0x97 */ + 0x6BAC,0x6BAD,0x6BAE,0x6BAF,0x6BB0,0x6BB1,0x6BB2,0x6BB6,/* 0x98-0x9F */ + 0x6BB8,0x6BB9,0x6BBA,0x6BBB,0x6BBC,0x6BBD,0x6BBE,0x6BC0,/* 0xA0-0xA7 */ + 0x6BC3,0x6BC4,0x6BC6,0x6BC7,0x6BC8,0x6BC9,0x6BCA,0x6BCC,/* 0xA8-0xAF */ + 0x6BCE,0x6BD0,0x6BD1,0x6BD8,0x6BDA,0x6BDC,0x6BDD,0x6BDE,/* 0xB0-0xB7 */ + 0x6BDF,0x6BE0,0x6BE2,0x6BE3,0x6BE4,0x6BE5,0x6BE6,0x6BE7,/* 0xB8-0xBF */ + 0x6BE8,0x6BE9,0x6BEC,0x6BED,0x6BEE,0x6BF0,0x6BF1,0x6BF2,/* 0xC0-0xC7 */ + 0x6BF4,0x6BF6,0x6BF7,0x6BF8,0x6BFA,0x6BFB,0x6BFC,0x6BFE,/* 0xC8-0xCF */ + 0x6BFF,0x6C00,0x6C01,0x6C02,0x6C03,0x6C04,0x6C08,0x6C09,/* 0xD0-0xD7 */ + 0x6C0A,0x6C0B,0x6C0C,0x6C0E,0x6C12,0x6C17,0x6C1C,0x6C1D,/* 0xD8-0xDF */ + 0x6C1E,0x6C20,0x6C23,0x6C25,0x6C2B,0x6C2C,0x6C2D,0x6C31,/* 0xE0-0xE7 */ + 0x6C33,0x6C36,0x6C37,0x6C39,0x6C3A,0x6C3B,0x6C3C,0x6C3E,/* 0xE8-0xEF */ + 0x6C3F,0x6C43,0x6C44,0x6C45,0x6C48,0x6C4B,0x6C4C,0x6C4D,/* 0xF0-0xF7 */ + 0x6C4E,0x6C4F,0x6C51,0x6C52,0x6C53,0x6C56,0x6C58,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6C59,0x6C5A,0x6C62,0x6C63,0x6C65,0x6C66,0x6C67,0x6C6B,/* 0x40-0x47 */ + 0x6C6C,0x6C6D,0x6C6E,0x6C6F,0x6C71,0x6C73,0x6C75,0x6C77,/* 0x48-0x4F */ + 0x6C78,0x6C7A,0x6C7B,0x6C7C,0x6C7F,0x6C80,0x6C84,0x6C87,/* 0x50-0x57 */ + 0x6C8A,0x6C8B,0x6C8D,0x6C8E,0x6C91,0x6C92,0x6C95,0x6C96,/* 
0x58-0x5F */ + 0x6C97,0x6C98,0x6C9A,0x6C9C,0x6C9D,0x6C9E,0x6CA0,0x6CA2,/* 0x60-0x67 */ + 0x6CA8,0x6CAC,0x6CAF,0x6CB0,0x6CB4,0x6CB5,0x6CB6,0x6CB7,/* 0x68-0x6F */ + 0x6CBA,0x6CC0,0x6CC1,0x6CC2,0x6CC3,0x6CC6,0x6CC7,0x6CC8,/* 0x70-0x77 */ + 0x6CCB,0x6CCD,0x6CCE,0x6CCF,0x6CD1,0x6CD2,0x6CD8,0x0000,/* 0x78-0x7F */ + + 0x6CD9,0x6CDA,0x6CDC,0x6CDD,0x6CDF,0x6CE4,0x6CE6,0x6CE7,/* 0x80-0x87 */ + 0x6CE9,0x6CEC,0x6CED,0x6CF2,0x6CF4,0x6CF9,0x6CFF,0x6D00,/* 0x88-0x8F */ + 0x6D02,0x6D03,0x6D05,0x6D06,0x6D08,0x6D09,0x6D0A,0x6D0D,/* 0x90-0x97 */ + 0x6D0F,0x6D10,0x6D11,0x6D13,0x6D14,0x6D15,0x6D16,0x6D18,/* 0x98-0x9F */ + 0x6D1C,0x6D1D,0x6D1F,0x6D20,0x6D21,0x6D22,0x6D23,0x6D24,/* 0xA0-0xA7 */ + 0x6D26,0x6D28,0x6D29,0x6D2C,0x6D2D,0x6D2F,0x6D30,0x6D34,/* 0xA8-0xAF */ + 0x6D36,0x6D37,0x6D38,0x6D3A,0x6D3F,0x6D40,0x6D42,0x6D44,/* 0xB0-0xB7 */ + 0x6D49,0x6D4C,0x6D50,0x6D55,0x6D56,0x6D57,0x6D58,0x6D5B,/* 0xB8-0xBF */ + 0x6D5D,0x6D5F,0x6D61,0x6D62,0x6D64,0x6D65,0x6D67,0x6D68,/* 0xC0-0xC7 */ + 0x6D6B,0x6D6C,0x6D6D,0x6D70,0x6D71,0x6D72,0x6D73,0x6D75,/* 0xC8-0xCF */ + 0x6D76,0x6D79,0x6D7A,0x6D7B,0x6D7D,0x6D7E,0x6D7F,0x6D80,/* 0xD0-0xD7 */ + 0x6D81,0x6D83,0x6D84,0x6D86,0x6D87,0x6D8A,0x6D8B,0x6D8D,/* 0xD8-0xDF */ + 0x6D8F,0x6D90,0x6D92,0x6D96,0x6D97,0x6D98,0x6D99,0x6D9A,/* 0xE0-0xE7 */ + 0x6D9C,0x6DA2,0x6DA5,0x6DAC,0x6DAD,0x6DB0,0x6DB1,0x6DB3,/* 0xE8-0xEF */ + 0x6DB4,0x6DB6,0x6DB7,0x6DB9,0x6DBA,0x6DBB,0x6DBC,0x6DBD,/* 0xF0-0xF7 */ + 0x6DBE,0x6DC1,0x6DC2,0x6DC3,0x6DC8,0x6DC9,0x6DCA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6DCD,0x6DCE,0x6DCF,0x6DD0,0x6DD2,0x6DD3,0x6DD4,0x6DD5,/* 0x40-0x47 */ + 0x6DD7,0x6DDA,0x6DDB,0x6DDC,0x6DDF,0x6DE2,0x6DE3,0x6DE5,/* 0x48-0x4F */ + 0x6DE7,0x6DE8,0x6DE9,0x6DEA,0x6DED,0x6DEF,0x6DF0,0x6DF2,/* 0x50-0x57 */ + 0x6DF4,0x6DF5,0x6DF6,0x6DF8,0x6DFA,0x6DFD,0x6DFE,0x6DFF,/* 0x58-0x5F */ + 0x6E00,0x6E01,0x6E02,0x6E03,0x6E04,0x6E06,0x6E07,0x6E08,/* 0x60-0x67 */ + 0x6E09,0x6E0B,0x6E0F,0x6E12,0x6E13,0x6E15,0x6E18,0x6E19,/* 0x68-0x6F */ + 0x6E1B,0x6E1C,0x6E1E,0x6E1F,0x6E22,0x6E26,0x6E27,0x6E28,/* 0x70-0x77 */ + 0x6E2A,0x6E2C,0x6E2E,0x6E30,0x6E31,0x6E33,0x6E35,0x0000,/* 0x78-0x7F */ + + 0x6E36,0x6E37,0x6E39,0x6E3B,0x6E3C,0x6E3D,0x6E3E,0x6E3F,/* 0x80-0x87 */ + 0x6E40,0x6E41,0x6E42,0x6E45,0x6E46,0x6E47,0x6E48,0x6E49,/* 0x88-0x8F */ + 0x6E4A,0x6E4B,0x6E4C,0x6E4F,0x6E50,0x6E51,0x6E52,0x6E55,/* 0x90-0x97 */ + 0x6E57,0x6E59,0x6E5A,0x6E5C,0x6E5D,0x6E5E,0x6E60,0x6E61,/* 0x98-0x9F */ + 0x6E62,0x6E63,0x6E64,0x6E65,0x6E66,0x6E67,0x6E68,0x6E69,/* 0xA0-0xA7 */ + 0x6E6A,0x6E6C,0x6E6D,0x6E6F,0x6E70,0x6E71,0x6E72,0x6E73,/* 0xA8-0xAF */ + 0x6E74,0x6E75,0x6E76,0x6E77,0x6E78,0x6E79,0x6E7A,0x6E7B,/* 0xB0-0xB7 */ + 0x6E7C,0x6E7D,0x6E80,0x6E81,0x6E82,0x6E84,0x6E87,0x6E88,/* 0xB8-0xBF */ + 0x6E8A,0x6E8B,0x6E8C,0x6E8D,0x6E8E,0x6E91,0x6E92,0x6E93,/* 0xC0-0xC7 */ + 0x6E94,0x6E95,0x6E96,0x6E97,0x6E99,0x6E9A,0x6E9B,0x6E9D,/* 0xC8-0xCF */ + 0x6E9E,0x6EA0,0x6EA1,0x6EA3,0x6EA4,0x6EA6,0x6EA8,0x6EA9,/* 0xD0-0xD7 */ + 
0x6EAB,0x6EAC,0x6EAD,0x6EAE,0x6EB0,0x6EB3,0x6EB5,0x6EB8,/* 0xD8-0xDF */ + 0x6EB9,0x6EBC,0x6EBE,0x6EBF,0x6EC0,0x6EC3,0x6EC4,0x6EC5,/* 0xE0-0xE7 */ + 0x6EC6,0x6EC8,0x6EC9,0x6ECA,0x6ECC,0x6ECD,0x6ECE,0x6ED0,/* 0xE8-0xEF */ + 0x6ED2,0x6ED6,0x6ED8,0x6ED9,0x6EDB,0x6EDC,0x6EDD,0x6EE3,/* 0xF0-0xF7 */ + 0x6EE7,0x6EEA,0x6EEB,0x6EEC,0x6EED,0x6EEE,0x6EEF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6EF0,0x6EF1,0x6EF2,0x6EF3,0x6EF5,0x6EF6,0x6EF7,0x6EF8,/* 0x40-0x47 */ + 0x6EFA,0x6EFB,0x6EFC,0x6EFD,0x6EFE,0x6EFF,0x6F00,0x6F01,/* 0x48-0x4F */ + 0x6F03,0x6F04,0x6F05,0x6F07,0x6F08,0x6F0A,0x6F0B,0x6F0C,/* 0x50-0x57 */ + 0x6F0D,0x6F0E,0x6F10,0x6F11,0x6F12,0x6F16,0x6F17,0x6F18,/* 0x58-0x5F */ + 0x6F19,0x6F1A,0x6F1B,0x6F1C,0x6F1D,0x6F1E,0x6F1F,0x6F21,/* 0x60-0x67 */ + 0x6F22,0x6F23,0x6F25,0x6F26,0x6F27,0x6F28,0x6F2C,0x6F2E,/* 0x68-0x6F */ + 0x6F30,0x6F32,0x6F34,0x6F35,0x6F37,0x6F38,0x6F39,0x6F3A,/* 0x70-0x77 */ + 0x6F3B,0x6F3C,0x6F3D,0x6F3F,0x6F40,0x6F41,0x6F42,0x0000,/* 0x78-0x7F */ + + 0x6F43,0x6F44,0x6F45,0x6F48,0x6F49,0x6F4A,0x6F4C,0x6F4E,/* 0x80-0x87 */ + 0x6F4F,0x6F50,0x6F51,0x6F52,0x6F53,0x6F54,0x6F55,0x6F56,/* 0x88-0x8F */ + 0x6F57,0x6F59,0x6F5A,0x6F5B,0x6F5D,0x6F5F,0x6F60,0x6F61,/* 0x90-0x97 */ + 0x6F63,0x6F64,0x6F65,0x6F67,0x6F68,0x6F69,0x6F6A,0x6F6B,/* 0x98-0x9F */ + 0x6F6C,0x6F6F,0x6F70,0x6F71,0x6F73,0x6F75,0x6F76,0x6F77,/* 0xA0-0xA7 */ + 0x6F79,0x6F7B,0x6F7D,0x6F7E,0x6F7F,0x6F80,0x6F81,0x6F82,/* 0xA8-0xAF */ + 0x6F83,0x6F85,0x6F86,0x6F87,0x6F8A,0x6F8B,0x6F8F,0x6F90,/* 0xB0-0xB7 */ + 0x6F91,0x6F92,0x6F93,0x6F94,0x6F95,0x6F96,0x6F97,0x6F98,/* 0xB8-0xBF */ + 0x6F99,0x6F9A,0x6F9B,0x6F9D,0x6F9E,0x6F9F,0x6FA0,0x6FA2,/* 0xC0-0xC7 */ + 0x6FA3,0x6FA4,0x6FA5,0x6FA6,0x6FA8,0x6FA9,0x6FAA,0x6FAB,/* 0xC8-0xCF */ + 0x6FAC,0x6FAD,0x6FAE,0x6FAF,0x6FB0,0x6FB1,0x6FB2,0x6FB4,/* 0xD0-0xD7 */ + 0x6FB5,0x6FB7,0x6FB8,0x6FBA,0x6FBB,0x6FBC,0x6FBD,0x6FBE,/* 0xD8-0xDF */ + 0x6FBF,0x6FC1,0x6FC3,0x6FC4,0x6FC5,0x6FC6,0x6FC7,0x6FC8,/* 0xE0-0xE7 */ + 0x6FCA,0x6FCB,0x6FCC,0x6FCD,0x6FCE,0x6FCF,0x6FD0,0x6FD3,/* 0xE8-0xEF */ + 0x6FD4,0x6FD5,0x6FD6,0x6FD7,0x6FD8,0x6FD9,0x6FDA,0x6FDB,/* 0xF0-0xF7 */ + 0x6FDC,0x6FDD,0x6FDF,0x6FE2,0x6FE3,0x6FE4,0x6FE5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6FE6,0x6FE7,0x6FE8,0x6FE9,0x6FEA,0x6FEB,0x6FEC,0x6FED,/* 0x40-0x47 */ + 0x6FF0,0x6FF1,0x6FF2,0x6FF3,0x6FF4,0x6FF5,0x6FF6,0x6FF7,/* 
0x48-0x4F */ + 0x6FF8,0x6FF9,0x6FFA,0x6FFB,0x6FFC,0x6FFD,0x6FFE,0x6FFF,/* 0x50-0x57 */ + 0x7000,0x7001,0x7002,0x7003,0x7004,0x7005,0x7006,0x7007,/* 0x58-0x5F */ + 0x7008,0x7009,0x700A,0x700B,0x700C,0x700D,0x700E,0x700F,/* 0x60-0x67 */ + 0x7010,0x7012,0x7013,0x7014,0x7015,0x7016,0x7017,0x7018,/* 0x68-0x6F */ + 0x7019,0x701C,0x701D,0x701E,0x701F,0x7020,0x7021,0x7022,/* 0x70-0x77 */ + 0x7024,0x7025,0x7026,0x7027,0x7028,0x7029,0x702A,0x0000,/* 0x78-0x7F */ + + 0x702B,0x702C,0x702D,0x702E,0x702F,0x7030,0x7031,0x7032,/* 0x80-0x87 */ + 0x7033,0x7034,0x7036,0x7037,0x7038,0x703A,0x703B,0x703C,/* 0x88-0x8F */ + 0x703D,0x703E,0x703F,0x7040,0x7041,0x7042,0x7043,0x7044,/* 0x90-0x97 */ + 0x7045,0x7046,0x7047,0x7048,0x7049,0x704A,0x704B,0x704D,/* 0x98-0x9F */ + 0x704E,0x7050,0x7051,0x7052,0x7053,0x7054,0x7055,0x7056,/* 0xA0-0xA7 */ + 0x7057,0x7058,0x7059,0x705A,0x705B,0x705C,0x705D,0x705F,/* 0xA8-0xAF */ + 0x7060,0x7061,0x7062,0x7063,0x7064,0x7065,0x7066,0x7067,/* 0xB0-0xB7 */ + 0x7068,0x7069,0x706A,0x706E,0x7071,0x7072,0x7073,0x7074,/* 0xB8-0xBF */ + 0x7077,0x7079,0x707A,0x707B,0x707D,0x7081,0x7082,0x7083,/* 0xC0-0xC7 */ + 0x7084,0x7086,0x7087,0x7088,0x708B,0x708C,0x708D,0x708F,/* 0xC8-0xCF */ + 0x7090,0x7091,0x7093,0x7097,0x7098,0x709A,0x709B,0x709E,/* 0xD0-0xD7 */ + 0x709F,0x70A0,0x70A1,0x70A2,0x70A3,0x70A4,0x70A5,0x70A6,/* 0xD8-0xDF */ + 0x70A7,0x70A8,0x70A9,0x70AA,0x70B0,0x70B2,0x70B4,0x70B5,/* 0xE0-0xE7 */ + 0x70B6,0x70BA,0x70BE,0x70BF,0x70C4,0x70C5,0x70C6,0x70C7,/* 0xE8-0xEF */ + 0x70C9,0x70CB,0x70CC,0x70CD,0x70CE,0x70CF,0x70D0,0x70D1,/* 0xF0-0xF7 */ + 0x70D2,0x70D3,0x70D4,0x70D5,0x70D6,0x70D7,0x70DA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x70DC,0x70DD,0x70DE,0x70E0,0x70E1,0x70E2,0x70E3,0x70E5,/* 0x40-0x47 */ + 0x70EA,0x70EE,0x70F0,0x70F1,0x70F2,0x70F3,0x70F4,0x70F5,/* 0x48-0x4F */ + 0x70F6,0x70F8,0x70FA,0x70FB,0x70FC,0x70FE,0x70FF,0x7100,/* 0x50-0x57 */ + 0x7101,0x7102,0x7103,0x7104,0x7105,0x7106,0x7107,0x7108,/* 0x58-0x5F */ + 0x710B,0x710C,0x710D,0x710E,0x710F,0x7111,0x7112,0x7114,/* 0x60-0x67 */ + 0x7117,0x711B,0x711C,0x711D,0x711E,0x711F,0x7120,0x7121,/* 0x68-0x6F */ + 0x7122,0x7123,0x7124,0x7125,0x7127,0x7128,0x7129,0x712A,/* 0x70-0x77 */ + 0x712B,0x712C,0x712D,0x712E,0x7132,0x7133,0x7134,0x0000,/* 0x78-0x7F */ + + 0x7135,0x7137,0x7138,0x7139,0x713A,0x713B,0x713C,0x713D,/* 0x80-0x87 */ + 0x713E,0x713F,0x7140,0x7141,0x7142,0x7143,0x7144,0x7146,/* 0x88-0x8F */ + 0x7147,0x7148,0x7149,0x714B,0x714D,0x714F,0x7150,0x7151,/* 0x90-0x97 */ + 0x7152,0x7153,0x7154,0x7155,0x7156,0x7157,0x7158,0x7159,/* 0x98-0x9F */ + 0x715A,0x715B,0x715D,0x715F,0x7160,0x7161,0x7162,0x7163,/* 0xA0-0xA7 */ + 0x7165,0x7169,0x716A,0x716B,0x716C,0x716D,0x716F,0x7170,/* 0xA8-0xAF */ + 0x7171,0x7174,0x7175,0x7176,0x7177,0x7179,0x717B,0x717C,/* 0xB0-0xB7 */ + 0x717E,0x717F,0x7180,0x7181,0x7182,0x7183,0x7185,0x7186,/* 0xB8-0xBF */ + 0x7187,0x7188,0x7189,0x718B,0x718C,0x718D,0x718E,0x7190,/* 0xC0-0xC7 */ + 
0x7191,0x7192,0x7193,0x7195,0x7196,0x7197,0x719A,0x719B,/* 0xC8-0xCF */ + 0x719C,0x719D,0x719E,0x71A1,0x71A2,0x71A3,0x71A4,0x71A5,/* 0xD0-0xD7 */ + 0x71A6,0x71A7,0x71A9,0x71AA,0x71AB,0x71AD,0x71AE,0x71AF,/* 0xD8-0xDF */ + 0x71B0,0x71B1,0x71B2,0x71B4,0x71B6,0x71B7,0x71B8,0x71BA,/* 0xE0-0xE7 */ + 0x71BB,0x71BC,0x71BD,0x71BE,0x71BF,0x71C0,0x71C1,0x71C2,/* 0xE8-0xEF */ + 0x71C4,0x71C5,0x71C6,0x71C7,0x71C8,0x71C9,0x71CA,0x71CB,/* 0xF0-0xF7 */ + 0x71CC,0x71CD,0x71CF,0x71D0,0x71D1,0x71D2,0x71D3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x71D6,0x71D7,0x71D8,0x71D9,0x71DA,0x71DB,0x71DC,0x71DD,/* 0x40-0x47 */ + 0x71DE,0x71DF,0x71E1,0x71E2,0x71E3,0x71E4,0x71E6,0x71E8,/* 0x48-0x4F */ + 0x71E9,0x71EA,0x71EB,0x71EC,0x71ED,0x71EF,0x71F0,0x71F1,/* 0x50-0x57 */ + 0x71F2,0x71F3,0x71F4,0x71F5,0x71F6,0x71F7,0x71F8,0x71FA,/* 0x58-0x5F */ + 0x71FB,0x71FC,0x71FD,0x71FE,0x71FF,0x7200,0x7201,0x7202,/* 0x60-0x67 */ + 0x7203,0x7204,0x7205,0x7207,0x7208,0x7209,0x720A,0x720B,/* 0x68-0x6F */ + 0x720C,0x720D,0x720E,0x720F,0x7210,0x7211,0x7212,0x7213,/* 0x70-0x77 */ + 0x7214,0x7215,0x7216,0x7217,0x7218,0x7219,0x721A,0x0000,/* 0x78-0x7F */ + + 0x721B,0x721C,0x721E,0x721F,0x7220,0x7221,0x7222,0x7223,/* 0x80-0x87 */ + 0x7224,0x7225,0x7226,0x7227,0x7229,0x722B,0x722D,0x722E,/* 0x88-0x8F */ + 0x722F,0x7232,0x7233,0x7234,0x723A,0x723C,0x723E,0x7240,/* 0x90-0x97 */ + 0x7241,0x7242,0x7243,0x7244,0x7245,0x7246,0x7249,0x724A,/* 0x98-0x9F */ + 0x724B,0x724E,0x724F,0x7250,0x7251,0x7253,0x7254,0x7255,/* 0xA0-0xA7 */ + 0x7257,0x7258,0x725A,0x725C,0x725E,0x7260,0x7263,0x7264,/* 0xA8-0xAF */ + 0x7265,0x7268,0x726A,0x726B,0x726C,0x726D,0x7270,0x7271,/* 0xB0-0xB7 */ + 0x7273,0x7274,0x7276,0x7277,0x7278,0x727B,0x727C,0x727D,/* 0xB8-0xBF */ + 0x7282,0x7283,0x7285,0x7286,0x7287,0x7288,0x7289,0x728C,/* 0xC0-0xC7 */ + 0x728E,0x7290,0x7291,0x7293,0x7294,0x7295,0x7296,0x7297,/* 0xC8-0xCF */ + 0x7298,0x7299,0x729A,0x729B,0x729C,0x729D,0x729E,0x72A0,/* 0xD0-0xD7 */ + 0x72A1,0x72A2,0x72A3,0x72A4,0x72A5,0x72A6,0x72A7,0x72A8,/* 0xD8-0xDF */ + 0x72A9,0x72AA,0x72AB,0x72AE,0x72B1,0x72B2,0x72B3,0x72B5,/* 0xE0-0xE7 */ + 0x72BA,0x72BB,0x72BC,0x72BD,0x72BE,0x72BF,0x72C0,0x72C5,/* 0xE8-0xEF */ + 0x72C6,0x72C7,0x72C9,0x72CA,0x72CB,0x72CC,0x72CF,0x72D1,/* 0xF0-0xF7 */ + 0x72D3,0x72D4,0x72D5,0x72D6,0x72D8,0x72DA,0x72DB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x3000,0x3001,0x3002,0x00B7,0x02C9,0x02C7,0x00A8,/* 0xA0-0xA7 */ + 0x3003,0x3005,0x2014,0xFF5E,0x2016,0x2026,0x2018,0x2019,/* 0xA8-0xAF */ + 0x201C,0x201D,0x3014,0x3015,0x3008,0x3009,0x300A,0x300B,/* 0xB0-0xB7 */ + 0x300C,0x300D,0x300E,0x300F,0x3016,0x3017,0x3010,0x3011,/* 0xB8-0xBF */ + 0x00B1,0x00D7,0x00F7,0x2236,0x2227,0x2228,0x2211,0x220F,/* 0xC0-0xC7 */ + 0x222A,0x2229,0x2208,0x2237,0x221A,0x22A5,0x2225,0x2220,/* 0xC8-0xCF */ + 0x2312,0x2299,0x222B,0x222E,0x2261,0x224C,0x2248,0x223D,/* 0xD0-0xD7 */ + 0x221D,0x2260,0x226E,0x226F,0x2264,0x2265,0x221E,0x2235,/* 0xD8-0xDF */ + 0x2234,0x2642,0x2640,0x00B0,0x2032,0x2033,0x2103,0xFF04,/* 0xE0-0xE7 */ + 0x00A4,0xFFE0,0xFFE1,0x2030,0x00A7,0x2116,0x2606,0x2605,/* 0xE8-0xEF */ + 0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,0x25A1,0x25A0,0x25B3,/* 0xF0-0xF7 */ + 0x25B2,0x203B,0x2192,0x2190,0x2191,0x2193,0x3013,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,/* 0xA0-0xA7 */ + 0x2177,0x2178,0x2179,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA8-0xAF */ + 0x0000,0x2488,0x2489,0x248A,0x248B,0x248C,0x248D,0x248E,/* 0xB0-0xB7 */ + 
0x248F,0x2490,0x2491,0x2492,0x2493,0x2494,0x2495,0x2496,/* 0xB8-0xBF */ + 0x2497,0x2498,0x2499,0x249A,0x249B,0x2474,0x2475,0x2476,/* 0xC0-0xC7 */ + 0x2477,0x2478,0x2479,0x247A,0x247B,0x247C,0x247D,0x247E,/* 0xC8-0xCF */ + 0x247F,0x2480,0x2481,0x2482,0x2483,0x2484,0x2485,0x2486,/* 0xD0-0xD7 */ + 0x2487,0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,/* 0xD8-0xDF */ + 0x2467,0x2468,0x2469,0x0000,0x0000,0x3220,0x3221,0x3222,/* 0xE0-0xE7 */ + 0x3223,0x3224,0x3225,0x3226,0x3227,0x3228,0x3229,0x0000,/* 0xE8-0xEF */ + 0x0000,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,/* 0xF0-0xF7 */ + 0x2167,0x2168,0x2169,0x216A,0x216B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xFF01,0xFF02,0xFF03,0xFFE5,0xFF05,0xFF06,0xFF07,/* 0xA0-0xA7 */ + 0xFF08,0xFF09,0xFF0A,0xFF0B,0xFF0C,0xFF0D,0xFF0E,0xFF0F,/* 0xA8-0xAF */ + 0xFF10,0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,/* 0xB0-0xB7 */ + 0xFF18,0xFF19,0xFF1A,0xFF1B,0xFF1C,0xFF1D,0xFF1E,0xFF1F,/* 0xB8-0xBF */ + 0xFF20,0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,/* 0xC0-0xC7 */ + 0xFF28,0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,/* 0xC8-0xCF */ + 0xFF30,0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,/* 0xD0-0xD7 */ + 0xFF38,0xFF39,0xFF3A,0xFF3B,0xFF3C,0xFF3D,0xFF3E,0xFF3F,/* 0xD8-0xDF */ + 0xFF40,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE0-0xE7 */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xE8-0xEF */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0xF0-0xF7 */ + 0xFF58,0xFF59,0xFF5A,0xFF5B,0xFF5C,0xFF5D,0xFFE3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,/* 0xA0-0xA7 */ + 0x3048,0x3049,0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,/* 0xA8-0xAF */ + 0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,/* 0xB0-0xB7 */ + 0x3058,0x3059,0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,/* 0xB8-0xBF */ + 0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,/* 0xC0-0xC7 */ + 0x3068,0x3069,0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,/* 0xC8-0xCF */ + 0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,/* 0xD0-0xD7 */ + 0x3078,0x3079,0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,/* 0xD8-0xDF */ + 0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,/* 0xE0-0xE7 */ + 0x3088,0x3089,0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,/* 0xE8-0xEF */ + 0x3090,0x3091,0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,/* 0xA0-0xA7 */ + 0x30A8,0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,/* 0xA8-0xAF */ + 
0x30B0,0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,/* 0xB0-0xB7 */ + 0x30B8,0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,/* 0xB8-0xBF */ + 0x30C0,0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,/* 0xC0-0xC7 */ + 0x30C8,0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,/* 0xC8-0xCF */ + 0x30D0,0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,/* 0xD0-0xD7 */ + 0x30D8,0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,/* 0xD8-0xDF */ + 0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0xE0-0xE7 */ + 0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0xE8-0xEF */ + 0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,/* 0xA0-0xA7 */ + 0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,/* 0xA8-0xAF */ + 0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,/* 0xB0-0xB7 */ + 0x03A9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB8-0xBF */ + 0x0000,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,/* 0xC0-0xC7 */ + 0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,/* 0xC8-0xCF */ + 0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,/* 0xD0-0xD7 */ + 0x03C9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */ + 0xFE35,0xFE36,0xFE39,0xFE3A,0xFE3F,0xFE40,0xFE3D,0xFE3E,/* 0xE0-0xE7 */ + 0xFE41,0xFE42,0xFE43,0xFE44,0x0000,0x0000,0xFE3B,0xFE3C,/* 0xE8-0xEF */ + 0xFE37,0xFE38,0xFE31,0x0000,0xFE33,0xFE34,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,/* 0xA0-0xA7 */ + 0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,/* 0xA8-0xAF */ + 0x041E,0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,/* 0xB0-0xB7 */ + 0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,/* 0xB8-0xBF */ + 0x042E,0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,/* 0xD0-0xD7 */ + 0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,/* 0xD8-0xDF */ + 0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0xE0-0xE7 */ + 0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0xE8-0xEF */ + 0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_A8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x02CA,0x02CB,0x02D9,0x2013,0x2015,0x2025,0x2035,0x2105,/* 0x40-0x47 */ + 0x2109,0x2196,0x2197,0x2198,0x2199,0x2215,0x221F,0x2223,/* 0x48-0x4F */ + 0x2252,0x2266,0x2267,0x22BF,0x2550,0x2551,0x2552,0x2553,/* 0x50-0x57 */ + 0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255A,0x255B,/* 0x58-0x5F */ + 0x255C,0x255D,0x255E,0x255F,0x2560,0x2561,0x2562,0x2563,/* 0x60-0x67 */ + 0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256A,0x256B,/* 0x68-0x6F */ + 0x256C,0x256D,0x256E,0x256F,0x2570,0x2571,0x2572,0x2573,/* 0x70-0x77 */ + 0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,0x2587,0x0000,/* 0x78-0x7F */ + + 0x2588,0x2589,0x258A,0x258B,0x258C,0x258D,0x258E,0x258F,/* 0x80-0x87 */ + 0x2593,0x2594,0x2595,0x25BC,0x25BD,0x25E2,0x25E3,0x25E4,/* 0x88-0x8F */ + 0x25E5,0x2609,0x2295,0x3012,0x301D,0x301E,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0101,0x00E1,0x01CE,0x00E0,0x0113,0x00E9,0x011B,/* 0xA0-0xA7 */ + 0x00E8,0x012B,0x00ED,0x01D0,0x00EC,0x014D,0x00F3,0x01D2,/* 0xA8-0xAF */ + 0x00F2,0x016B,0x00FA,0x01D4,0x00F9,0x01D6,0x01D8,0x01DA,/* 0xB0-0xB7 */ + 
0x01DC,0x00FC,0x00EA,0x0251,0x0000,0x0144,0x0148,0x0000,/* 0xB8-0xBF */ + 0x0261,0x0000,0x0000,0x0000,0x0000,0x3105,0x3106,0x3107,/* 0xC0-0xC7 */ + 0x3108,0x3109,0x310A,0x310B,0x310C,0x310D,0x310E,0x310F,/* 0xC8-0xCF */ + 0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,0x3117,/* 0xD0-0xD7 */ + 0x3118,0x3119,0x311A,0x311B,0x311C,0x311D,0x311E,0x311F,/* 0xD8-0xDF */ + 0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,0x3127,/* 0xE0-0xE7 */ + 0x3128,0x3129,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE8-0xEF */ +}; + +static const wchar_t c2u_A9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x3021,0x3022,0x3023,0x3024,0x3025,0x3026,0x3027,0x3028,/* 0x40-0x47 */ + 0x3029,0x32A3,0x338E,0x338F,0x339C,0x339D,0x339E,0x33A1,/* 0x48-0x4F */ + 0x33C4,0x33CE,0x33D1,0x33D2,0x33D5,0xFE30,0xFFE2,0xFFE4,/* 0x50-0x57 */ + 0x0000,0x2121,0x3231,0x0000,0x2010,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x30FC,0x309B,0x309C,0x30FD,0x30FE,0x3006,0x309D,0x309E,/* 0x60-0x67 */ + 0xFE49,0xFE4A,0xFE4B,0xFE4C,0xFE4D,0xFE4E,0xFE4F,0xFE50,/* 0x68-0x6F */ + 0xFE51,0xFE52,0xFE54,0xFE55,0xFE56,0xFE57,0xFE59,0xFE5A,/* 0x70-0x77 */ + 0xFE5B,0xFE5C,0xFE5D,0xFE5E,0xFE5F,0xFE60,0xFE61,0x0000,/* 0x78-0x7F */ + + 0xFE62,0xFE63,0xFE64,0xFE65,0xFE66,0xFE68,0xFE69,0xFE6A,/* 0x80-0x87 */ + 0xFE6B,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x3007,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x0000,0x0000,0x0000,0x2500,0x2501,0x2502,0x2503,/* 0xA0-0xA7 */ + 0x2504,0x2505,0x2506,0x2507,0x2508,0x2509,0x250A,0x250B,/* 0xA8-0xAF */ + 0x250C,0x250D,0x250E,0x250F,0x2510,0x2511,0x2512,0x2513,/* 0xB0-0xB7 */ + 0x2514,0x2515,0x2516,0x2517,0x2518,0x2519,0x251A,0x251B,/* 0xB8-0xBF */ + 0x251C,0x251D,0x251E,0x251F,0x2520,0x2521,0x2522,0x2523,/* 0xC0-0xC7 */ + 0x2524,0x2525,0x2526,0x2527,0x2528,0x2529,0x252A,0x252B,/* 0xC8-0xCF */ + 0x252C,0x252D,0x252E,0x252F,0x2530,0x2531,0x2532,0x2533,/* 0xD0-0xD7 */ + 0x2534,0x2535,0x2536,0x2537,0x2538,0x2539,0x253A,0x253B,/* 0xD8-0xDF */ + 0x253C,0x253D,0x253E,0x253F,0x2540,0x2541,0x2542,0x2543,/* 0xE0-0xE7 */ + 0x2544,0x2545,0x2546,0x2547,0x2548,0x2549,0x254A,0x254B,/* 0xE8-0xEF */ +}; + +static const wchar_t c2u_AA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x72DC,0x72DD,0x72DF,0x72E2,0x72E3,0x72E4,0x72E5,0x72E6,/* 0x40-0x47 */ + 0x72E7,0x72EA,0x72EB,0x72F5,0x72F6,0x72F9,0x72FD,0x72FE,/* 
0x48-0x4F */ + 0x72FF,0x7300,0x7302,0x7304,0x7305,0x7306,0x7307,0x7308,/* 0x50-0x57 */ + 0x7309,0x730B,0x730C,0x730D,0x730F,0x7310,0x7311,0x7312,/* 0x58-0x5F */ + 0x7314,0x7318,0x7319,0x731A,0x731F,0x7320,0x7323,0x7324,/* 0x60-0x67 */ + 0x7326,0x7327,0x7328,0x732D,0x732F,0x7330,0x7332,0x7333,/* 0x68-0x6F */ + 0x7335,0x7336,0x733A,0x733B,0x733C,0x733D,0x7340,0x7341,/* 0x70-0x77 */ + 0x7342,0x7343,0x7344,0x7345,0x7346,0x7347,0x7348,0x0000,/* 0x78-0x7F */ + + 0x7349,0x734A,0x734B,0x734C,0x734E,0x734F,0x7351,0x7353,/* 0x80-0x87 */ + 0x7354,0x7355,0x7356,0x7358,0x7359,0x735A,0x735B,0x735C,/* 0x88-0x8F */ + 0x735D,0x735E,0x735F,0x7361,0x7362,0x7363,0x7364,0x7365,/* 0x90-0x97 */ + 0x7366,0x7367,0x7368,0x7369,0x736A,0x736B,0x736E,0x7370,/* 0x98-0x9F */ + 0x7371,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7372,0x7373,0x7374,0x7375,0x7376,0x7377,0x7378,0x7379,/* 0x40-0x47 */ + 0x737A,0x737B,0x737C,0x737D,0x737F,0x7380,0x7381,0x7382,/* 0x48-0x4F */ + 0x7383,0x7385,0x7386,0x7388,0x738A,0x738C,0x738D,0x738F,/* 0x50-0x57 */ + 0x7390,0x7392,0x7393,0x7394,0x7395,0x7397,0x7398,0x7399,/* 0x58-0x5F */ + 0x739A,0x739C,0x739D,0x739E,0x73A0,0x73A1,0x73A3,0x73A4,/* 0x60-0x67 */ + 0x73A5,0x73A6,0x73A7,0x73A8,0x73AA,0x73AC,0x73AD,0x73B1,/* 0x68-0x6F */ + 0x73B4,0x73B5,0x73B6,0x73B8,0x73B9,0x73BC,0x73BD,0x73BE,/* 0x70-0x77 */ + 0x73BF,0x73C1,0x73C3,0x73C4,0x73C5,0x73C6,0x73C7,0x0000,/* 0x78-0x7F */ + + 0x73CB,0x73CC,0x73CE,0x73D2,0x73D3,0x73D4,0x73D5,0x73D6,/* 0x80-0x87 */ + 0x73D7,0x73D8,0x73DA,0x73DB,0x73DC,0x73DD,0x73DF,0x73E1,/* 0x88-0x8F */ + 0x73E2,0x73E3,0x73E4,0x73E6,0x73E8,0x73EA,0x73EB,0x73EC,/* 0x90-0x97 */ + 0x73EE,0x73EF,0x73F0,0x73F1,0x73F3,0x73F4,0x73F5,0x73F6,/* 0x98-0x9F */ + 0x73F7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x73F8,0x73F9,0x73FA,0x73FB,0x73FC,0x73FD,0x73FE,0x73FF,/* 0x40-0x47 */ + 0x7400,0x7401,0x7402,0x7404,0x7407,0x7408,0x740B,0x740C,/* 0x48-0x4F */ + 0x740D,0x740E,0x7411,0x7412,0x7413,0x7414,0x7415,0x7416,/* 0x50-0x57 */ + 0x7417,0x7418,0x7419,0x741C,0x741D,0x741E,0x741F,0x7420,/* 0x58-0x5F */ + 0x7421,0x7423,0x7424,0x7427,0x7429,0x742B,0x742D,0x742F,/* 0x60-0x67 */ + 0x7431,0x7432,0x7437,0x7438,0x7439,0x743A,0x743B,0x743D,/* 0x68-0x6F */ + 
0x743E,0x743F,0x7440,0x7442,0x7443,0x7444,0x7445,0x7446,/* 0x70-0x77 */ + 0x7447,0x7448,0x7449,0x744A,0x744B,0x744C,0x744D,0x0000,/* 0x78-0x7F */ + + 0x744E,0x744F,0x7450,0x7451,0x7452,0x7453,0x7454,0x7456,/* 0x80-0x87 */ + 0x7458,0x745D,0x7460,0x7461,0x7462,0x7463,0x7464,0x7465,/* 0x88-0x8F */ + 0x7466,0x7467,0x7468,0x7469,0x746A,0x746B,0x746C,0x746E,/* 0x90-0x97 */ + 0x746F,0x7471,0x7472,0x7473,0x7474,0x7475,0x7478,0x7479,/* 0x98-0x9F */ + 0x747A,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x747B,0x747C,0x747D,0x747F,0x7482,0x7484,0x7485,0x7486,/* 0x40-0x47 */ + 0x7488,0x7489,0x748A,0x748C,0x748D,0x748F,0x7491,0x7492,/* 0x48-0x4F */ + 0x7493,0x7494,0x7495,0x7496,0x7497,0x7498,0x7499,0x749A,/* 0x50-0x57 */ + 0x749B,0x749D,0x749F,0x74A0,0x74A1,0x74A2,0x74A3,0x74A4,/* 0x58-0x5F */ + 0x74A5,0x74A6,0x74AA,0x74AB,0x74AC,0x74AD,0x74AE,0x74AF,/* 0x60-0x67 */ + 0x74B0,0x74B1,0x74B2,0x74B3,0x74B4,0x74B5,0x74B6,0x74B7,/* 0x68-0x6F */ + 0x74B8,0x74B9,0x74BB,0x74BC,0x74BD,0x74BE,0x74BF,0x74C0,/* 0x70-0x77 */ + 0x74C1,0x74C2,0x74C3,0x74C4,0x74C5,0x74C6,0x74C7,0x0000,/* 0x78-0x7F */ + + 0x74C8,0x74C9,0x74CA,0x74CB,0x74CC,0x74CD,0x74CE,0x74CF,/* 0x80-0x87 */ + 0x74D0,0x74D1,0x74D3,0x74D4,0x74D5,0x74D6,0x74D7,0x74D8,/* 0x88-0x8F */ + 0x74D9,0x74DA,0x74DB,0x74DD,0x74DF,0x74E1,0x74E5,0x74E7,/* 0x90-0x97 */ + 0x74E8,0x74E9,0x74EA,0x74EB,0x74EC,0x74ED,0x74F0,0x74F1,/* 0x98-0x9F */ + 0x74F2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x74F3,0x74F5,0x74F8,0x74F9,0x74FA,0x74FB,0x74FC,0x74FD,/* 0x40-0x47 */ + 0x74FE,0x7500,0x7501,0x7502,0x7503,0x7505,0x7506,0x7507,/* 0x48-0x4F */ + 0x7508,0x7509,0x750A,0x750B,0x750C,0x750E,0x7510,0x7512,/* 0x50-0x57 */ + 0x7514,0x7515,0x7516,0x7517,0x751B,0x751D,0x751E,0x7520,/* 0x58-0x5F */ + 0x7521,0x7522,0x7523,0x7524,0x7526,0x7527,0x752A,0x752E,/* 0x60-0x67 */ + 0x7534,0x7536,0x7539,0x753C,0x753D,0x753F,0x7541,0x7542,/* 0x68-0x6F */ + 0x7543,0x7544,0x7546,0x7547,0x7549,0x754A,0x754D,0x7550,/* 0x70-0x77 */ + 0x7551,0x7552,0x7553,0x7555,0x7556,0x7557,0x7558,0x0000,/* 0x78-0x7F */ + + 0x755D,0x755E,0x755F,0x7560,0x7561,0x7562,0x7563,0x7564,/* 0x80-0x87 */ + 0x7567,0x7568,0x7569,0x756B,0x756C,0x756D,0x756E,0x756F,/* 0x88-0x8F */ + 
0x7570,0x7571,0x7573,0x7575,0x7576,0x7577,0x757A,0x757B,/* 0x90-0x97 */ + 0x757C,0x757D,0x757E,0x7580,0x7581,0x7582,0x7584,0x7585,/* 0x98-0x9F */ + 0x7587,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7588,0x7589,0x758A,0x758C,0x758D,0x758E,0x7590,0x7593,/* 0x40-0x47 */ + 0x7595,0x7598,0x759B,0x759C,0x759E,0x75A2,0x75A6,0x75A7,/* 0x48-0x4F */ + 0x75A8,0x75A9,0x75AA,0x75AD,0x75B6,0x75B7,0x75BA,0x75BB,/* 0x50-0x57 */ + 0x75BF,0x75C0,0x75C1,0x75C6,0x75CB,0x75CC,0x75CE,0x75CF,/* 0x58-0x5F */ + 0x75D0,0x75D1,0x75D3,0x75D7,0x75D9,0x75DA,0x75DC,0x75DD,/* 0x60-0x67 */ + 0x75DF,0x75E0,0x75E1,0x75E5,0x75E9,0x75EC,0x75ED,0x75EE,/* 0x68-0x6F */ + 0x75EF,0x75F2,0x75F3,0x75F5,0x75F6,0x75F7,0x75F8,0x75FA,/* 0x70-0x77 */ + 0x75FB,0x75FD,0x75FE,0x7602,0x7604,0x7606,0x7607,0x0000,/* 0x78-0x7F */ + + 0x7608,0x7609,0x760B,0x760D,0x760E,0x760F,0x7611,0x7612,/* 0x80-0x87 */ + 0x7613,0x7614,0x7616,0x761A,0x761C,0x761D,0x761E,0x7621,/* 0x88-0x8F */ + 0x7623,0x7627,0x7628,0x762C,0x762E,0x762F,0x7631,0x7632,/* 0x90-0x97 */ + 0x7636,0x7637,0x7639,0x763A,0x763B,0x763D,0x7641,0x7642,/* 0x98-0x9F */ + 0x7644,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_B0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7645,0x7646,0x7647,0x7648,0x7649,0x764A,0x764B,0x764E,/* 0x40-0x47 */ + 0x764F,0x7650,0x7651,0x7652,0x7653,0x7655,0x7657,0x7658,/* 0x48-0x4F */ + 0x7659,0x765A,0x765B,0x765D,0x765F,0x7660,0x7661,0x7662,/* 0x50-0x57 */ + 0x7664,0x7665,0x7666,0x7667,0x7668,0x7669,0x766A,0x766C,/* 0x58-0x5F */ + 0x766D,0x766E,0x7670,0x7671,0x7672,0x7673,0x7674,0x7675,/* 0x60-0x67 */ + 0x7676,0x7677,0x7679,0x767A,0x767C,0x767F,0x7680,0x7681,/* 0x68-0x6F */ + 0x7683,0x7685,0x7689,0x768A,0x768C,0x768D,0x768F,0x7690,/* 0x70-0x77 */ + 0x7692,0x7694,0x7695,0x7697,0x7698,0x769A,0x769B,0x0000,/* 0x78-0x7F */ + + 0x769C,0x769D,0x769E,0x769F,0x76A0,0x76A1,0x76A2,0x76A3,/* 0x80-0x87 */ + 0x76A5,0x76A6,0x76A7,0x76A8,0x76A9,0x76AA,0x76AB,0x76AC,/* 0x88-0x8F */ + 0x76AD,0x76AF,0x76B0,0x76B3,0x76B5,0x76B6,0x76B7,0x76B8,/* 0x90-0x97 */ + 0x76B9,0x76BA,0x76BB,0x76BC,0x76BD,0x76BE,0x76C0,0x76C1,/* 0x98-0x9F */ + 0x76C3,0x554A,0x963F,0x57C3,0x6328,0x54CE,0x5509,0x54C0,/* 0xA0-0xA7 */ + 0x7691,0x764C,0x853C,0x77EE,0x827E,0x788D,0x7231,0x9698,/* 0xA8-0xAF */ + 0x978D,0x6C28,0x5B89,0x4FFA,0x6309,0x6697,0x5CB8,0x80FA,/* 
0xB0-0xB7 */ + 0x6848,0x80AE,0x6602,0x76CE,0x51F9,0x6556,0x71AC,0x7FF1,/* 0xB8-0xBF */ + 0x8884,0x50B2,0x5965,0x61CA,0x6FB3,0x82AD,0x634C,0x6252,/* 0xC0-0xC7 */ + 0x53ED,0x5427,0x7B06,0x516B,0x75A4,0x5DF4,0x62D4,0x8DCB,/* 0xC8-0xCF */ + 0x9776,0x628A,0x8019,0x575D,0x9738,0x7F62,0x7238,0x767D,/* 0xD0-0xD7 */ + 0x67CF,0x767E,0x6446,0x4F70,0x8D25,0x62DC,0x7A17,0x6591,/* 0xD8-0xDF */ + 0x73ED,0x642C,0x6273,0x822C,0x9881,0x677F,0x7248,0x626E,/* 0xE0-0xE7 */ + 0x62CC,0x4F34,0x74E3,0x534A,0x529E,0x7ECA,0x90A6,0x5E2E,/* 0xE8-0xEF */ + 0x6886,0x699C,0x8180,0x7ED1,0x68D2,0x78C5,0x868C,0x9551,/* 0xF0-0xF7 */ + 0x508D,0x8C24,0x82DE,0x80DE,0x5305,0x8912,0x5265,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x76C4,0x76C7,0x76C9,0x76CB,0x76CC,0x76D3,0x76D5,0x76D9,/* 0x40-0x47 */ + 0x76DA,0x76DC,0x76DD,0x76DE,0x76E0,0x76E1,0x76E2,0x76E3,/* 0x48-0x4F */ + 0x76E4,0x76E6,0x76E7,0x76E8,0x76E9,0x76EA,0x76EB,0x76EC,/* 0x50-0x57 */ + 0x76ED,0x76F0,0x76F3,0x76F5,0x76F6,0x76F7,0x76FA,0x76FB,/* 0x58-0x5F */ + 0x76FD,0x76FF,0x7700,0x7702,0x7703,0x7705,0x7706,0x770A,/* 0x60-0x67 */ + 0x770C,0x770E,0x770F,0x7710,0x7711,0x7712,0x7713,0x7714,/* 0x68-0x6F */ + 0x7715,0x7716,0x7717,0x7718,0x771B,0x771C,0x771D,0x771E,/* 0x70-0x77 */ + 0x7721,0x7723,0x7724,0x7725,0x7727,0x772A,0x772B,0x0000,/* 0x78-0x7F */ + + 0x772C,0x772E,0x7730,0x7731,0x7732,0x7733,0x7734,0x7739,/* 0x80-0x87 */ + 0x773B,0x773D,0x773E,0x773F,0x7742,0x7744,0x7745,0x7746,/* 0x88-0x8F */ + 0x7748,0x7749,0x774A,0x774B,0x774C,0x774D,0x774E,0x774F,/* 0x90-0x97 */ + 0x7752,0x7753,0x7754,0x7755,0x7756,0x7757,0x7758,0x7759,/* 0x98-0x9F */ + 0x775C,0x8584,0x96F9,0x4FDD,0x5821,0x9971,0x5B9D,0x62B1,/* 0xA0-0xA7 */ + 0x62A5,0x66B4,0x8C79,0x9C8D,0x7206,0x676F,0x7891,0x60B2,/* 0xA8-0xAF */ + 0x5351,0x5317,0x8F88,0x80CC,0x8D1D,0x94A1,0x500D,0x72C8,/* 0xB0-0xB7 */ + 0x5907,0x60EB,0x7119,0x88AB,0x5954,0x82EF,0x672C,0x7B28,/* 0xB8-0xBF */ + 0x5D29,0x7EF7,0x752D,0x6CF5,0x8E66,0x8FF8,0x903C,0x9F3B,/* 0xC0-0xC7 */ + 0x6BD4,0x9119,0x7B14,0x5F7C,0x78A7,0x84D6,0x853D,0x6BD5,/* 0xC8-0xCF */ + 0x6BD9,0x6BD6,0x5E01,0x5E87,0x75F9,0x95ED,0x655D,0x5F0A,/* 0xD0-0xD7 */ + 0x5FC5,0x8F9F,0x58C1,0x81C2,0x907F,0x965B,0x97AD,0x8FB9,/* 0xD8-0xDF */ + 0x7F16,0x8D2C,0x6241,0x4FBF,0x53D8,0x535E,0x8FA8,0x8FA9,/* 0xE0-0xE7 */ + 0x8FAB,0x904D,0x6807,0x5F6A,0x8198,0x8868,0x9CD6,0x618B,/* 0xE8-0xEF */ + 0x522B,0x762A,0x5F6C,0x658C,0x6FD2,0x6EE8,0x5BBE,0x6448,/* 0xF0-0xF7 */ + 0x5175,0x51B0,0x67C4,0x4E19,0x79C9,0x997C,0x70B3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x775D,0x775E,0x775F,0x7760,0x7764,0x7767,0x7769,0x776A,/* 0x40-0x47 */ + 0x776D,0x776E,0x776F,0x7770,0x7771,0x7772,0x7773,0x7774,/* 0x48-0x4F */ + 0x7775,0x7776,0x7777,0x7778,0x777A,0x777B,0x777C,0x7781,/* 0x50-0x57 */ + 0x7782,0x7783,0x7786,0x7787,0x7788,0x7789,0x778A,0x778B,/* 0x58-0x5F */ + 0x778F,0x7790,0x7793,0x7794,0x7795,0x7796,0x7797,0x7798,/* 0x60-0x67 */ + 0x7799,0x779A,0x779B,0x779C,0x779D,0x779E,0x77A1,0x77A3,/* 0x68-0x6F */ + 0x77A4,0x77A6,0x77A8,0x77AB,0x77AD,0x77AE,0x77AF,0x77B1,/* 0x70-0x77 */ + 0x77B2,0x77B4,0x77B6,0x77B7,0x77B8,0x77B9,0x77BA,0x0000,/* 0x78-0x7F */ + + 0x77BC,0x77BE,0x77C0,0x77C1,0x77C2,0x77C3,0x77C4,0x77C5,/* 0x80-0x87 */ + 0x77C6,0x77C7,0x77C8,0x77C9,0x77CA,0x77CB,0x77CC,0x77CE,/* 0x88-0x8F */ + 0x77CF,0x77D0,0x77D1,0x77D2,0x77D3,0x77D4,0x77D5,0x77D6,/* 0x90-0x97 */ + 0x77D8,0x77D9,0x77DA,0x77DD,0x77DE,0x77DF,0x77E0,0x77E1,/* 0x98-0x9F */ + 0x77E4,0x75C5,0x5E76,0x73BB,0x83E0,0x64AD,0x62E8,0x94B5,/* 0xA0-0xA7 */ + 0x6CE2,0x535A,0x52C3,0x640F,0x94C2,0x7B94,0x4F2F,0x5E1B,/* 0xA8-0xAF */ + 0x8236,0x8116,0x818A,0x6E24,0x6CCA,0x9A73,0x6355,0x535C,/* 0xB0-0xB7 */ + 0x54FA,0x8865,0x57E0,0x4E0D,0x5E03,0x6B65,0x7C3F,0x90E8,/* 0xB8-0xBF */ + 0x6016,0x64E6,0x731C,0x88C1,0x6750,0x624D,0x8D22,0x776C,/* 0xC0-0xC7 */ + 0x8E29,0x91C7,0x5F69,0x83DC,0x8521,0x9910,0x53C2,0x8695,/* 0xC8-0xCF */ + 0x6B8B,0x60ED,0x60E8,0x707F,0x82CD,0x8231,0x4ED3,0x6CA7,/* 0xD0-0xD7 */ + 0x85CF,0x64CD,0x7CD9,0x69FD,0x66F9,0x8349,0x5395,0x7B56,/* 0xD8-0xDF */ + 0x4FA7,0x518C,0x6D4B,0x5C42,0x8E6D,0x63D2,0x53C9,0x832C,/* 0xE0-0xE7 */ + 0x8336,0x67E5,0x78B4,0x643D,0x5BDF,0x5C94,0x5DEE,0x8BE7,/* 0xE8-0xEF */ + 0x62C6,0x67F4,0x8C7A,0x6400,0x63BA,0x8749,0x998B,0x8C17,/* 0xF0-0xF7 */ + 0x7F20,0x94F2,0x4EA7,0x9610,0x98A4,0x660C,0x7316,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x77E6,0x77E8,0x77EA,0x77EF,0x77F0,0x77F1,0x77F2,0x77F4,/* 0x40-0x47 */ + 0x77F5,0x77F7,0x77F9,0x77FA,0x77FB,0x77FC,0x7803,0x7804,/* 0x48-0x4F */ + 0x7805,0x7806,0x7807,0x7808,0x780A,0x780B,0x780E,0x780F,/* 0x50-0x57 */ + 0x7810,0x7813,0x7815,0x7819,0x781B,0x781E,0x7820,0x7821,/* 0x58-0x5F */ + 0x7822,0x7824,0x7828,0x782A,0x782B,0x782E,0x782F,0x7831,/* 0x60-0x67 */ + 0x7832,0x7833,0x7835,0x7836,0x783D,0x783F,0x7841,0x7842,/* 0x68-0x6F */ + 0x7843,0x7844,0x7846,0x7848,0x7849,0x784A,0x784B,0x784D,/* 0x70-0x77 */ + 0x784F,0x7851,0x7853,0x7854,0x7858,0x7859,0x785A,0x0000,/* 0x78-0x7F */ + + 0x785B,0x785C,0x785E,0x785F,0x7860,0x7861,0x7862,0x7863,/* 0x80-0x87 */ + 0x7864,0x7865,0x7866,0x7867,0x7868,0x7869,0x786F,0x7870,/* 0x88-0x8F */ + 0x7871,0x7872,0x7873,0x7874,0x7875,0x7876,0x7878,0x7879,/* 0x90-0x97 */ + 0x787A,0x787B,0x787D,0x787E,0x787F,0x7880,0x7881,0x7882,/* 0x98-0x9F */ + 
0x7883,0x573A,0x5C1D,0x5E38,0x957F,0x507F,0x80A0,0x5382,/* 0xA0-0xA7 */ + 0x655E,0x7545,0x5531,0x5021,0x8D85,0x6284,0x949E,0x671D,/* 0xA8-0xAF */ + 0x5632,0x6F6E,0x5DE2,0x5435,0x7092,0x8F66,0x626F,0x64A4,/* 0xB0-0xB7 */ + 0x63A3,0x5F7B,0x6F88,0x90F4,0x81E3,0x8FB0,0x5C18,0x6668,/* 0xB8-0xBF */ + 0x5FF1,0x6C89,0x9648,0x8D81,0x886C,0x6491,0x79F0,0x57CE,/* 0xC0-0xC7 */ + 0x6A59,0x6210,0x5448,0x4E58,0x7A0B,0x60E9,0x6F84,0x8BDA,/* 0xC8-0xCF */ + 0x627F,0x901E,0x9A8B,0x79E4,0x5403,0x75F4,0x6301,0x5319,/* 0xD0-0xD7 */ + 0x6C60,0x8FDF,0x5F1B,0x9A70,0x803B,0x9F7F,0x4F88,0x5C3A,/* 0xD8-0xDF */ + 0x8D64,0x7FC5,0x65A5,0x70BD,0x5145,0x51B2,0x866B,0x5D07,/* 0xE0-0xE7 */ + 0x5BA0,0x62BD,0x916C,0x7574,0x8E0C,0x7A20,0x6101,0x7B79,/* 0xE8-0xEF */ + 0x4EC7,0x7EF8,0x7785,0x4E11,0x81ED,0x521D,0x51FA,0x6A71,/* 0xF0-0xF7 */ + 0x53A8,0x8E87,0x9504,0x96CF,0x6EC1,0x9664,0x695A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7884,0x7885,0x7886,0x7888,0x788A,0x788B,0x788F,0x7890,/* 0x40-0x47 */ + 0x7892,0x7894,0x7895,0x7896,0x7899,0x789D,0x789E,0x78A0,/* 0x48-0x4F */ + 0x78A2,0x78A4,0x78A6,0x78A8,0x78A9,0x78AA,0x78AB,0x78AC,/* 0x50-0x57 */ + 0x78AD,0x78AE,0x78AF,0x78B5,0x78B6,0x78B7,0x78B8,0x78BA,/* 0x58-0x5F */ + 0x78BB,0x78BC,0x78BD,0x78BF,0x78C0,0x78C2,0x78C3,0x78C4,/* 0x60-0x67 */ + 0x78C6,0x78C7,0x78C8,0x78CC,0x78CD,0x78CE,0x78CF,0x78D1,/* 0x68-0x6F */ + 0x78D2,0x78D3,0x78D6,0x78D7,0x78D8,0x78DA,0x78DB,0x78DC,/* 0x70-0x77 */ + 0x78DD,0x78DE,0x78DF,0x78E0,0x78E1,0x78E2,0x78E3,0x0000,/* 0x78-0x7F */ + + 0x78E4,0x78E5,0x78E6,0x78E7,0x78E9,0x78EA,0x78EB,0x78ED,/* 0x80-0x87 */ + 0x78EE,0x78EF,0x78F0,0x78F1,0x78F3,0x78F5,0x78F6,0x78F8,/* 0x88-0x8F */ + 0x78F9,0x78FB,0x78FC,0x78FD,0x78FE,0x78FF,0x7900,0x7902,/* 0x90-0x97 */ + 0x7903,0x7904,0x7906,0x7907,0x7908,0x7909,0x790A,0x790B,/* 0x98-0x9F */ + 0x790C,0x7840,0x50A8,0x77D7,0x6410,0x89E6,0x5904,0x63E3,/* 0xA0-0xA7 */ + 0x5DDD,0x7A7F,0x693D,0x4F20,0x8239,0x5598,0x4E32,0x75AE,/* 0xA8-0xAF */ + 0x7A97,0x5E62,0x5E8A,0x95EF,0x521B,0x5439,0x708A,0x6376,/* 0xB0-0xB7 */ + 0x9524,0x5782,0x6625,0x693F,0x9187,0x5507,0x6DF3,0x7EAF,/* 0xB8-0xBF */ + 0x8822,0x6233,0x7EF0,0x75B5,0x8328,0x78C1,0x96CC,0x8F9E,/* 0xC0-0xC7 */ + 0x6148,0x74F7,0x8BCD,0x6B64,0x523A,0x8D50,0x6B21,0x806A,/* 0xC8-0xCF */ + 0x8471,0x56F1,0x5306,0x4ECE,0x4E1B,0x51D1,0x7C97,0x918B,/* 0xD0-0xD7 */ + 0x7C07,0x4FC3,0x8E7F,0x7BE1,0x7A9C,0x6467,0x5D14,0x50AC,/* 0xD8-0xDF */ + 0x8106,0x7601,0x7CB9,0x6DEC,0x7FE0,0x6751,0x5B58,0x5BF8,/* 0xE0-0xE7 */ + 0x78CB,0x64AE,0x6413,0x63AA,0x632B,0x9519,0x642D,0x8FBE,/* 0xE8-0xEF */ + 0x7B54,0x7629,0x6253,0x5927,0x5446,0x6B79,0x50A3,0x6234,/* 0xF0-0xF7 */ + 0x5E26,0x6B86,0x4EE3,0x8D37,0x888B,0x5F85,0x902E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x790D,0x790E,0x790F,0x7910,0x7911,0x7912,0x7914,0x7915,/* 0x40-0x47 */ + 0x7916,0x7917,0x7918,0x7919,0x791A,0x791B,0x791C,0x791D,/* 0x48-0x4F */ + 0x791F,0x7920,0x7921,0x7922,0x7923,0x7925,0x7926,0x7927,/* 0x50-0x57 */ + 0x7928,0x7929,0x792A,0x792B,0x792C,0x792D,0x792E,0x792F,/* 0x58-0x5F */ + 0x7930,0x7931,0x7932,0x7933,0x7935,0x7936,0x7937,0x7938,/* 0x60-0x67 */ + 0x7939,0x793D,0x793F,0x7942,0x7943,0x7944,0x7945,0x7947,/* 0x68-0x6F */ + 0x794A,0x794B,0x794C,0x794D,0x794E,0x794F,0x7950,0x7951,/* 0x70-0x77 */ + 0x7952,0x7954,0x7955,0x7958,0x7959,0x7961,0x7963,0x0000,/* 0x78-0x7F */ + + 0x7964,0x7966,0x7969,0x796A,0x796B,0x796C,0x796E,0x7970,/* 0x80-0x87 */ + 0x7971,0x7972,0x7973,0x7974,0x7975,0x7976,0x7979,0x797B,/* 0x88-0x8F */ + 0x797C,0x797D,0x797E,0x797F,0x7982,0x7983,0x7986,0x7987,/* 0x90-0x97 */ + 0x7988,0x7989,0x798B,0x798C,0x798D,0x798E,0x7990,0x7991,/* 0x98-0x9F */ + 0x7992,0x6020,0x803D,0x62C5,0x4E39,0x5355,0x90F8,0x63B8,/* 0xA0-0xA7 */ + 0x80C6,0x65E6,0x6C2E,0x4F46,0x60EE,0x6DE1,0x8BDE,0x5F39,/* 0xA8-0xAF */ + 0x86CB,0x5F53,0x6321,0x515A,0x8361,0x6863,0x5200,0x6363,/* 0xB0-0xB7 */ + 0x8E48,0x5012,0x5C9B,0x7977,0x5BFC,0x5230,0x7A3B,0x60BC,/* 0xB8-0xBF */ + 0x9053,0x76D7,0x5FB7,0x5F97,0x7684,0x8E6C,0x706F,0x767B,/* 0xC0-0xC7 */ + 0x7B49,0x77AA,0x51F3,0x9093,0x5824,0x4F4E,0x6EF4,0x8FEA,/* 0xC8-0xCF */ + 0x654C,0x7B1B,0x72C4,0x6DA4,0x7FDF,0x5AE1,0x62B5,0x5E95,/* 0xD0-0xD7 */ + 0x5730,0x8482,0x7B2C,0x5E1D,0x5F1F,0x9012,0x7F14,0x98A0,/* 0xD8-0xDF */ + 0x6382,0x6EC7,0x7898,0x70B9,0x5178,0x975B,0x57AB,0x7535,/* 0xE0-0xE7 */ + 0x4F43,0x7538,0x5E97,0x60E6,0x5960,0x6DC0,0x6BBF,0x7889,/* 0xE8-0xEF */ + 0x53FC,0x96D5,0x51CB,0x5201,0x6389,0x540A,0x9493,0x8C03,/* 0xF0-0xF7 */ + 0x8DCC,0x7239,0x789F,0x8776,0x8FED,0x8C0D,0x53E0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7993,0x7994,0x7995,0x7996,0x7997,0x7998,0x7999,0x799B,/* 0x40-0x47 */ + 0x799C,0x799D,0x799E,0x799F,0x79A0,0x79A1,0x79A2,0x79A3,/* 0x48-0x4F */ + 0x79A4,0x79A5,0x79A6,0x79A8,0x79A9,0x79AA,0x79AB,0x79AC,/* 0x50-0x57 */ + 0x79AD,0x79AE,0x79AF,0x79B0,0x79B1,0x79B2,0x79B4,0x79B5,/* 0x58-0x5F */ + 0x79B6,0x79B7,0x79B8,0x79BC,0x79BF,0x79C2,0x79C4,0x79C5,/* 0x60-0x67 */ + 0x79C7,0x79C8,0x79CA,0x79CC,0x79CE,0x79CF,0x79D0,0x79D3,/* 0x68-0x6F */ + 0x79D4,0x79D6,0x79D7,0x79D9,0x79DA,0x79DB,0x79DC,0x79DD,/* 0x70-0x77 */ + 0x79DE,0x79E0,0x79E1,0x79E2,0x79E5,0x79E8,0x79EA,0x0000,/* 0x78-0x7F */ + + 0x79EC,0x79EE,0x79F1,0x79F2,0x79F3,0x79F4,0x79F5,0x79F6,/* 0x80-0x87 */ + 0x79F7,0x79F9,0x79FA,0x79FC,0x79FE,0x79FF,0x7A01,0x7A04,/* 0x88-0x8F */ + 
0x7A05,0x7A07,0x7A08,0x7A09,0x7A0A,0x7A0C,0x7A0F,0x7A10,/* 0x90-0x97 */ + 0x7A11,0x7A12,0x7A13,0x7A15,0x7A16,0x7A18,0x7A19,0x7A1B,/* 0x98-0x9F */ + 0x7A1C,0x4E01,0x76EF,0x53EE,0x9489,0x9876,0x9F0E,0x952D,/* 0xA0-0xA7 */ + 0x5B9A,0x8BA2,0x4E22,0x4E1C,0x51AC,0x8463,0x61C2,0x52A8,/* 0xA8-0xAF */ + 0x680B,0x4F97,0x606B,0x51BB,0x6D1E,0x515C,0x6296,0x6597,/* 0xB0-0xB7 */ + 0x9661,0x8C46,0x9017,0x75D8,0x90FD,0x7763,0x6BD2,0x728A,/* 0xB8-0xBF */ + 0x72EC,0x8BFB,0x5835,0x7779,0x8D4C,0x675C,0x9540,0x809A,/* 0xC0-0xC7 */ + 0x5EA6,0x6E21,0x5992,0x7AEF,0x77ED,0x953B,0x6BB5,0x65AD,/* 0xC8-0xCF */ + 0x7F0E,0x5806,0x5151,0x961F,0x5BF9,0x58A9,0x5428,0x8E72,/* 0xD0-0xD7 */ + 0x6566,0x987F,0x56E4,0x949D,0x76FE,0x9041,0x6387,0x54C6,/* 0xD8-0xDF */ + 0x591A,0x593A,0x579B,0x8EB2,0x6735,0x8DFA,0x8235,0x5241,/* 0xE0-0xE7 */ + 0x60F0,0x5815,0x86FE,0x5CE8,0x9E45,0x4FC4,0x989D,0x8BB9,/* 0xE8-0xEF */ + 0x5A25,0x6076,0x5384,0x627C,0x904F,0x9102,0x997F,0x6069,/* 0xF0-0xF7 */ + 0x800C,0x513F,0x8033,0x5C14,0x9975,0x6D31,0x4E8C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7A1D,0x7A1F,0x7A21,0x7A22,0x7A24,0x7A25,0x7A26,0x7A27,/* 0x40-0x47 */ + 0x7A28,0x7A29,0x7A2A,0x7A2B,0x7A2C,0x7A2D,0x7A2E,0x7A2F,/* 0x48-0x4F */ + 0x7A30,0x7A31,0x7A32,0x7A34,0x7A35,0x7A36,0x7A38,0x7A3A,/* 0x50-0x57 */ + 0x7A3E,0x7A40,0x7A41,0x7A42,0x7A43,0x7A44,0x7A45,0x7A47,/* 0x58-0x5F */ + 0x7A48,0x7A49,0x7A4A,0x7A4B,0x7A4C,0x7A4D,0x7A4E,0x7A4F,/* 0x60-0x67 */ + 0x7A50,0x7A52,0x7A53,0x7A54,0x7A55,0x7A56,0x7A58,0x7A59,/* 0x68-0x6F */ + 0x7A5A,0x7A5B,0x7A5C,0x7A5D,0x7A5E,0x7A5F,0x7A60,0x7A61,/* 0x70-0x77 */ + 0x7A62,0x7A63,0x7A64,0x7A65,0x7A66,0x7A67,0x7A68,0x0000,/* 0x78-0x7F */ + + 0x7A69,0x7A6A,0x7A6B,0x7A6C,0x7A6D,0x7A6E,0x7A6F,0x7A71,/* 0x80-0x87 */ + 0x7A72,0x7A73,0x7A75,0x7A7B,0x7A7C,0x7A7D,0x7A7E,0x7A82,/* 0x88-0x8F */ + 0x7A85,0x7A87,0x7A89,0x7A8A,0x7A8B,0x7A8C,0x7A8E,0x7A8F,/* 0x90-0x97 */ + 0x7A90,0x7A93,0x7A94,0x7A99,0x7A9A,0x7A9B,0x7A9E,0x7AA1,/* 0x98-0x9F */ + 0x7AA2,0x8D30,0x53D1,0x7F5A,0x7B4F,0x4F10,0x4E4F,0x9600,/* 0xA0-0xA7 */ + 0x6CD5,0x73D0,0x85E9,0x5E06,0x756A,0x7FFB,0x6A0A,0x77FE,/* 0xA8-0xAF */ + 0x9492,0x7E41,0x51E1,0x70E6,0x53CD,0x8FD4,0x8303,0x8D29,/* 0xB0-0xB7 */ + 0x72AF,0x996D,0x6CDB,0x574A,0x82B3,0x65B9,0x80AA,0x623F,/* 0xB8-0xBF */ + 0x9632,0x59A8,0x4EFF,0x8BBF,0x7EBA,0x653E,0x83F2,0x975E,/* 0xC0-0xC7 */ + 0x5561,0x98DE,0x80A5,0x532A,0x8BFD,0x5420,0x80BA,0x5E9F,/* 0xC8-0xCF */ + 0x6CB8,0x8D39,0x82AC,0x915A,0x5429,0x6C1B,0x5206,0x7EB7,/* 0xD0-0xD7 */ + 0x575F,0x711A,0x6C7E,0x7C89,0x594B,0x4EFD,0x5FFF,0x6124,/* 0xD8-0xDF */ + 0x7CAA,0x4E30,0x5C01,0x67AB,0x8702,0x5CF0,0x950B,0x98CE,/* 0xE0-0xE7 */ + 0x75AF,0x70FD,0x9022,0x51AF,0x7F1D,0x8BBD,0x5949,0x51E4,/* 0xE8-0xEF */ + 0x4F5B,0x5426,0x592B,0x6577,0x80A4,0x5B75,0x6276,0x62C2,/* 0xF0-0xF7 */ + 0x8F90,0x5E45,0x6C1F,0x7B26,0x4F0F,0x4FD8,0x670D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7AA3,0x7AA4,0x7AA7,0x7AA9,0x7AAA,0x7AAB,0x7AAE,0x7AAF,/* 0x40-0x47 */ + 0x7AB0,0x7AB1,0x7AB2,0x7AB4,0x7AB5,0x7AB6,0x7AB7,0x7AB8,/* 0x48-0x4F */ + 0x7AB9,0x7ABA,0x7ABB,0x7ABC,0x7ABD,0x7ABE,0x7AC0,0x7AC1,/* 0x50-0x57 */ + 0x7AC2,0x7AC3,0x7AC4,0x7AC5,0x7AC6,0x7AC7,0x7AC8,0x7AC9,/* 0x58-0x5F */ + 0x7ACA,0x7ACC,0x7ACD,0x7ACE,0x7ACF,0x7AD0,0x7AD1,0x7AD2,/* 0x60-0x67 */ + 0x7AD3,0x7AD4,0x7AD5,0x7AD7,0x7AD8,0x7ADA,0x7ADB,0x7ADC,/* 0x68-0x6F */ + 0x7ADD,0x7AE1,0x7AE2,0x7AE4,0x7AE7,0x7AE8,0x7AE9,0x7AEA,/* 0x70-0x77 */ + 0x7AEB,0x7AEC,0x7AEE,0x7AF0,0x7AF1,0x7AF2,0x7AF3,0x0000,/* 0x78-0x7F */ + + 0x7AF4,0x7AF5,0x7AF6,0x7AF7,0x7AF8,0x7AFB,0x7AFC,0x7AFE,/* 0x80-0x87 */ + 0x7B00,0x7B01,0x7B02,0x7B05,0x7B07,0x7B09,0x7B0C,0x7B0D,/* 0x88-0x8F */ + 0x7B0E,0x7B10,0x7B12,0x7B13,0x7B16,0x7B17,0x7B18,0x7B1A,/* 0x90-0x97 */ + 0x7B1C,0x7B1D,0x7B1F,0x7B21,0x7B22,0x7B23,0x7B27,0x7B29,/* 0x98-0x9F */ + 0x7B2D,0x6D6E,0x6DAA,0x798F,0x88B1,0x5F17,0x752B,0x629A,/* 0xA0-0xA7 */ + 0x8F85,0x4FEF,0x91DC,0x65A7,0x812F,0x8151,0x5E9C,0x8150,/* 0xA8-0xAF */ + 0x8D74,0x526F,0x8986,0x8D4B,0x590D,0x5085,0x4ED8,0x961C,/* 0xB0-0xB7 */ + 0x7236,0x8179,0x8D1F,0x5BCC,0x8BA3,0x9644,0x5987,0x7F1A,/* 0xB8-0xBF */ + 0x5490,0x5676,0x560E,0x8BE5,0x6539,0x6982,0x9499,0x76D6,/* 0xC0-0xC7 */ + 0x6E89,0x5E72,0x7518,0x6746,0x67D1,0x7AFF,0x809D,0x8D76,/* 0xC8-0xCF */ + 0x611F,0x79C6,0x6562,0x8D63,0x5188,0x521A,0x94A2,0x7F38,/* 0xD0-0xD7 */ + 0x809B,0x7EB2,0x5C97,0x6E2F,0x6760,0x7BD9,0x768B,0x9AD8,/* 0xD8-0xDF */ + 0x818F,0x7F94,0x7CD5,0x641E,0x9550,0x7A3F,0x544A,0x54E5,/* 0xE0-0xE7 */ + 0x6B4C,0x6401,0x6208,0x9E3D,0x80F3,0x7599,0x5272,0x9769,/* 0xE8-0xEF */ + 0x845B,0x683C,0x86E4,0x9601,0x9694,0x94EC,0x4E2A,0x5404,/* 0xF0-0xF7 */ + 0x7ED9,0x6839,0x8DDF,0x8015,0x66F4,0x5E9A,0x7FB9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7B2F,0x7B30,0x7B32,0x7B34,0x7B35,0x7B36,0x7B37,0x7B39,/* 0x40-0x47 */ + 0x7B3B,0x7B3D,0x7B3F,0x7B40,0x7B41,0x7B42,0x7B43,0x7B44,/* 0x48-0x4F */ + 0x7B46,0x7B48,0x7B4A,0x7B4D,0x7B4E,0x7B53,0x7B55,0x7B57,/* 0x50-0x57 */ + 0x7B59,0x7B5C,0x7B5E,0x7B5F,0x7B61,0x7B63,0x7B64,0x7B65,/* 0x58-0x5F */ + 0x7B66,0x7B67,0x7B68,0x7B69,0x7B6A,0x7B6B,0x7B6C,0x7B6D,/* 0x60-0x67 */ + 0x7B6F,0x7B70,0x7B73,0x7B74,0x7B76,0x7B78,0x7B7A,0x7B7C,/* 0x68-0x6F */ + 0x7B7D,0x7B7F,0x7B81,0x7B82,0x7B83,0x7B84,0x7B86,0x7B87,/* 0x70-0x77 */ + 0x7B88,0x7B89,0x7B8A,0x7B8B,0x7B8C,0x7B8E,0x7B8F,0x0000,/* 0x78-0x7F */ + + 
0x7B91,0x7B92,0x7B93,0x7B96,0x7B98,0x7B99,0x7B9A,0x7B9B,/* 0x80-0x87 */ + 0x7B9E,0x7B9F,0x7BA0,0x7BA3,0x7BA4,0x7BA5,0x7BAE,0x7BAF,/* 0x88-0x8F */ + 0x7BB0,0x7BB2,0x7BB3,0x7BB5,0x7BB6,0x7BB7,0x7BB9,0x7BBA,/* 0x90-0x97 */ + 0x7BBB,0x7BBC,0x7BBD,0x7BBE,0x7BBF,0x7BC0,0x7BC2,0x7BC3,/* 0x98-0x9F */ + 0x7BC4,0x57C2,0x803F,0x6897,0x5DE5,0x653B,0x529F,0x606D,/* 0xA0-0xA7 */ + 0x9F9A,0x4F9B,0x8EAC,0x516C,0x5BAB,0x5F13,0x5DE9,0x6C5E,/* 0xA8-0xAF */ + 0x62F1,0x8D21,0x5171,0x94A9,0x52FE,0x6C9F,0x82DF,0x72D7,/* 0xB0-0xB7 */ + 0x57A2,0x6784,0x8D2D,0x591F,0x8F9C,0x83C7,0x5495,0x7B8D,/* 0xB8-0xBF */ + 0x4F30,0x6CBD,0x5B64,0x59D1,0x9F13,0x53E4,0x86CA,0x9AA8,/* 0xC0-0xC7 */ + 0x8C37,0x80A1,0x6545,0x987E,0x56FA,0x96C7,0x522E,0x74DC,/* 0xC8-0xCF */ + 0x5250,0x5BE1,0x6302,0x8902,0x4E56,0x62D0,0x602A,0x68FA,/* 0xD0-0xD7 */ + 0x5173,0x5B98,0x51A0,0x89C2,0x7BA1,0x9986,0x7F50,0x60EF,/* 0xD8-0xDF */ + 0x704C,0x8D2F,0x5149,0x5E7F,0x901B,0x7470,0x89C4,0x572D,/* 0xE0-0xE7 */ + 0x7845,0x5F52,0x9F9F,0x95FA,0x8F68,0x9B3C,0x8BE1,0x7678,/* 0xE8-0xEF */ + 0x6842,0x67DC,0x8DEA,0x8D35,0x523D,0x8F8A,0x6EDA,0x68CD,/* 0xF0-0xF7 */ + 0x9505,0x90ED,0x56FD,0x679C,0x88F9,0x8FC7,0x54C8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7BC5,0x7BC8,0x7BC9,0x7BCA,0x7BCB,0x7BCD,0x7BCE,0x7BCF,/* 0x40-0x47 */ + 0x7BD0,0x7BD2,0x7BD4,0x7BD5,0x7BD6,0x7BD7,0x7BD8,0x7BDB,/* 0x48-0x4F */ + 0x7BDC,0x7BDE,0x7BDF,0x7BE0,0x7BE2,0x7BE3,0x7BE4,0x7BE7,/* 0x50-0x57 */ + 0x7BE8,0x7BE9,0x7BEB,0x7BEC,0x7BED,0x7BEF,0x7BF0,0x7BF2,/* 0x58-0x5F */ + 0x7BF3,0x7BF4,0x7BF5,0x7BF6,0x7BF8,0x7BF9,0x7BFA,0x7BFB,/* 0x60-0x67 */ + 0x7BFD,0x7BFF,0x7C00,0x7C01,0x7C02,0x7C03,0x7C04,0x7C05,/* 0x68-0x6F */ + 0x7C06,0x7C08,0x7C09,0x7C0A,0x7C0D,0x7C0E,0x7C10,0x7C11,/* 0x70-0x77 */ + 0x7C12,0x7C13,0x7C14,0x7C15,0x7C17,0x7C18,0x7C19,0x0000,/* 0x78-0x7F */ + + 0x7C1A,0x7C1B,0x7C1C,0x7C1D,0x7C1E,0x7C20,0x7C21,0x7C22,/* 0x80-0x87 */ + 0x7C23,0x7C24,0x7C25,0x7C28,0x7C29,0x7C2B,0x7C2C,0x7C2D,/* 0x88-0x8F */ + 0x7C2E,0x7C2F,0x7C30,0x7C31,0x7C32,0x7C33,0x7C34,0x7C35,/* 0x90-0x97 */ + 0x7C36,0x7C37,0x7C39,0x7C3A,0x7C3B,0x7C3C,0x7C3D,0x7C3E,/* 0x98-0x9F */ + 0x7C42,0x9AB8,0x5B69,0x6D77,0x6C26,0x4EA5,0x5BB3,0x9A87,/* 0xA0-0xA7 */ + 0x9163,0x61A8,0x90AF,0x97E9,0x542B,0x6DB5,0x5BD2,0x51FD,/* 0xA8-0xAF */ + 0x558A,0x7F55,0x7FF0,0x64BC,0x634D,0x65F1,0x61BE,0x608D,/* 0xB0-0xB7 */ + 0x710A,0x6C57,0x6C49,0x592F,0x676D,0x822A,0x58D5,0x568E,/* 0xB8-0xBF */ + 0x8C6A,0x6BEB,0x90DD,0x597D,0x8017,0x53F7,0x6D69,0x5475,/* 0xC0-0xC7 */ + 0x559D,0x8377,0x83CF,0x6838,0x79BE,0x548C,0x4F55,0x5408,/* 0xC8-0xCF */ + 0x76D2,0x8C89,0x9602,0x6CB3,0x6DB8,0x8D6B,0x8910,0x9E64,/* 0xD0-0xD7 */ + 0x8D3A,0x563F,0x9ED1,0x75D5,0x5F88,0x72E0,0x6068,0x54FC,/* 0xD8-0xDF */ + 0x4EA8,0x6A2A,0x8861,0x6052,0x8F70,0x54C4,0x70D8,0x8679,/* 0xE0-0xE7 */ + 0x9E3F,0x6D2A,0x5B8F,0x5F18,0x7EA2,0x5589,0x4FAF,0x7334,/* 0xE8-0xEF */ + 0x543C,0x539A,0x5019,0x540E,0x547C,0x4E4E,0x5FFD,0x745A,/* 0xF0-0xF7 */ + 
0x58F6,0x846B,0x80E1,0x8774,0x72D0,0x7CCA,0x6E56,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7C43,0x7C44,0x7C45,0x7C46,0x7C47,0x7C48,0x7C49,0x7C4A,/* 0x40-0x47 */ + 0x7C4B,0x7C4C,0x7C4E,0x7C4F,0x7C50,0x7C51,0x7C52,0x7C53,/* 0x48-0x4F */ + 0x7C54,0x7C55,0x7C56,0x7C57,0x7C58,0x7C59,0x7C5A,0x7C5B,/* 0x50-0x57 */ + 0x7C5C,0x7C5D,0x7C5E,0x7C5F,0x7C60,0x7C61,0x7C62,0x7C63,/* 0x58-0x5F */ + 0x7C64,0x7C65,0x7C66,0x7C67,0x7C68,0x7C69,0x7C6A,0x7C6B,/* 0x60-0x67 */ + 0x7C6C,0x7C6D,0x7C6E,0x7C6F,0x7C70,0x7C71,0x7C72,0x7C75,/* 0x68-0x6F */ + 0x7C76,0x7C77,0x7C78,0x7C79,0x7C7A,0x7C7E,0x7C7F,0x7C80,/* 0x70-0x77 */ + 0x7C81,0x7C82,0x7C83,0x7C84,0x7C85,0x7C86,0x7C87,0x0000,/* 0x78-0x7F */ + + 0x7C88,0x7C8A,0x7C8B,0x7C8C,0x7C8D,0x7C8E,0x7C8F,0x7C90,/* 0x80-0x87 */ + 0x7C93,0x7C94,0x7C96,0x7C99,0x7C9A,0x7C9B,0x7CA0,0x7CA1,/* 0x88-0x8F */ + 0x7CA3,0x7CA6,0x7CA7,0x7CA8,0x7CA9,0x7CAB,0x7CAC,0x7CAD,/* 0x90-0x97 */ + 0x7CAF,0x7CB0,0x7CB4,0x7CB5,0x7CB6,0x7CB7,0x7CB8,0x7CBA,/* 0x98-0x9F */ + 0x7CBB,0x5F27,0x864E,0x552C,0x62A4,0x4E92,0x6CAA,0x6237,/* 0xA0-0xA7 */ + 0x82B1,0x54D7,0x534E,0x733E,0x6ED1,0x753B,0x5212,0x5316,/* 0xA8-0xAF */ + 0x8BDD,0x69D0,0x5F8A,0x6000,0x6DEE,0x574F,0x6B22,0x73AF,/* 0xB0-0xB7 */ + 0x6853,0x8FD8,0x7F13,0x6362,0x60A3,0x5524,0x75EA,0x8C62,/* 0xB8-0xBF */ + 0x7115,0x6DA3,0x5BA6,0x5E7B,0x8352,0x614C,0x9EC4,0x78FA,/* 0xC0-0xC7 */ + 0x8757,0x7C27,0x7687,0x51F0,0x60F6,0x714C,0x6643,0x5E4C,/* 0xC8-0xCF */ + 0x604D,0x8C0E,0x7070,0x6325,0x8F89,0x5FBD,0x6062,0x86D4,/* 0xD0-0xD7 */ + 0x56DE,0x6BC1,0x6094,0x6167,0x5349,0x60E0,0x6666,0x8D3F,/* 0xD8-0xDF */ + 0x79FD,0x4F1A,0x70E9,0x6C47,0x8BB3,0x8BF2,0x7ED8,0x8364,/* 0xE0-0xE7 */ + 0x660F,0x5A5A,0x9B42,0x6D51,0x6DF7,0x8C41,0x6D3B,0x4F19,/* 0xE8-0xEF */ + 0x706B,0x83B7,0x6216,0x60D1,0x970D,0x8D27,0x7978,0x51FB,/* 0xF0-0xF7 */ + 0x573E,0x57FA,0x673A,0x7578,0x7A3D,0x79EF,0x7B95,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7CBF,0x7CC0,0x7CC2,0x7CC3,0x7CC4,0x7CC6,0x7CC9,0x7CCB,/* 0x40-0x47 */ + 0x7CCE,0x7CCF,0x7CD0,0x7CD1,0x7CD2,0x7CD3,0x7CD4,0x7CD8,/* 0x48-0x4F */ + 0x7CDA,0x7CDB,0x7CDD,0x7CDE,0x7CE1,0x7CE2,0x7CE3,0x7CE4,/* 0x50-0x57 */ + 0x7CE5,0x7CE6,0x7CE7,0x7CE9,0x7CEA,0x7CEB,0x7CEC,0x7CED,/* 0x58-0x5F */ + 0x7CEE,0x7CF0,0x7CF1,0x7CF2,0x7CF3,0x7CF4,0x7CF5,0x7CF6,/* 0x60-0x67 */ + 0x7CF7,0x7CF9,0x7CFA,0x7CFC,0x7CFD,0x7CFE,0x7CFF,0x7D00,/* 
0x68-0x6F */ + 0x7D01,0x7D02,0x7D03,0x7D04,0x7D05,0x7D06,0x7D07,0x7D08,/* 0x70-0x77 */ + 0x7D09,0x7D0B,0x7D0C,0x7D0D,0x7D0E,0x7D0F,0x7D10,0x0000,/* 0x78-0x7F */ + + 0x7D11,0x7D12,0x7D13,0x7D14,0x7D15,0x7D16,0x7D17,0x7D18,/* 0x80-0x87 */ + 0x7D19,0x7D1A,0x7D1B,0x7D1C,0x7D1D,0x7D1E,0x7D1F,0x7D21,/* 0x88-0x8F */ + 0x7D23,0x7D24,0x7D25,0x7D26,0x7D28,0x7D29,0x7D2A,0x7D2C,/* 0x90-0x97 */ + 0x7D2D,0x7D2E,0x7D30,0x7D31,0x7D32,0x7D33,0x7D34,0x7D35,/* 0x98-0x9F */ + 0x7D36,0x808C,0x9965,0x8FF9,0x6FC0,0x8BA5,0x9E21,0x59EC,/* 0xA0-0xA7 */ + 0x7EE9,0x7F09,0x5409,0x6781,0x68D8,0x8F91,0x7C4D,0x96C6,/* 0xA8-0xAF */ + 0x53CA,0x6025,0x75BE,0x6C72,0x5373,0x5AC9,0x7EA7,0x6324,/* 0xB0-0xB7 */ + 0x51E0,0x810A,0x5DF1,0x84DF,0x6280,0x5180,0x5B63,0x4F0E,/* 0xB8-0xBF */ + 0x796D,0x5242,0x60B8,0x6D4E,0x5BC4,0x5BC2,0x8BA1,0x8BB0,/* 0xC0-0xC7 */ + 0x65E2,0x5FCC,0x9645,0x5993,0x7EE7,0x7EAA,0x5609,0x67B7,/* 0xC8-0xCF */ + 0x5939,0x4F73,0x5BB6,0x52A0,0x835A,0x988A,0x8D3E,0x7532,/* 0xD0-0xD7 */ + 0x94BE,0x5047,0x7A3C,0x4EF7,0x67B6,0x9A7E,0x5AC1,0x6B7C,/* 0xD8-0xDF */ + 0x76D1,0x575A,0x5C16,0x7B3A,0x95F4,0x714E,0x517C,0x80A9,/* 0xE0-0xE7 */ + 0x8270,0x5978,0x7F04,0x8327,0x68C0,0x67EC,0x78B1,0x7877,/* 0xE8-0xEF */ + 0x62E3,0x6361,0x7B80,0x4FED,0x526A,0x51CF,0x8350,0x69DB,/* 0xF0-0xF7 */ + 0x9274,0x8DF5,0x8D31,0x89C1,0x952E,0x7BAD,0x4EF6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7D37,0x7D38,0x7D39,0x7D3A,0x7D3B,0x7D3C,0x7D3D,0x7D3E,/* 0x40-0x47 */ + 0x7D3F,0x7D40,0x7D41,0x7D42,0x7D43,0x7D44,0x7D45,0x7D46,/* 0x48-0x4F */ + 0x7D47,0x7D48,0x7D49,0x7D4A,0x7D4B,0x7D4C,0x7D4D,0x7D4E,/* 0x50-0x57 */ + 0x7D4F,0x7D50,0x7D51,0x7D52,0x7D53,0x7D54,0x7D55,0x7D56,/* 0x58-0x5F */ + 0x7D57,0x7D58,0x7D59,0x7D5A,0x7D5B,0x7D5C,0x7D5D,0x7D5E,/* 0x60-0x67 */ + 0x7D5F,0x7D60,0x7D61,0x7D62,0x7D63,0x7D64,0x7D65,0x7D66,/* 0x68-0x6F */ + 0x7D67,0x7D68,0x7D69,0x7D6A,0x7D6B,0x7D6C,0x7D6D,0x7D6F,/* 0x70-0x77 */ + 0x7D70,0x7D71,0x7D72,0x7D73,0x7D74,0x7D75,0x7D76,0x0000,/* 0x78-0x7F */ + + 0x7D78,0x7D79,0x7D7A,0x7D7B,0x7D7C,0x7D7D,0x7D7E,0x7D7F,/* 0x80-0x87 */ + 0x7D80,0x7D81,0x7D82,0x7D83,0x7D84,0x7D85,0x7D86,0x7D87,/* 0x88-0x8F */ + 0x7D88,0x7D89,0x7D8A,0x7D8B,0x7D8C,0x7D8D,0x7D8E,0x7D8F,/* 0x90-0x97 */ + 0x7D90,0x7D91,0x7D92,0x7D93,0x7D94,0x7D95,0x7D96,0x7D97,/* 0x98-0x9F */ + 0x7D98,0x5065,0x8230,0x5251,0x996F,0x6E10,0x6E85,0x6DA7,/* 0xA0-0xA7 */ + 0x5EFA,0x50F5,0x59DC,0x5C06,0x6D46,0x6C5F,0x7586,0x848B,/* 0xA8-0xAF */ + 0x6868,0x5956,0x8BB2,0x5320,0x9171,0x964D,0x8549,0x6912,/* 0xB0-0xB7 */ + 0x7901,0x7126,0x80F6,0x4EA4,0x90CA,0x6D47,0x9A84,0x5A07,/* 0xB8-0xBF */ + 0x56BC,0x6405,0x94F0,0x77EB,0x4FA5,0x811A,0x72E1,0x89D2,/* 0xC0-0xC7 */ + 0x997A,0x7F34,0x7EDE,0x527F,0x6559,0x9175,0x8F7F,0x8F83,/* 0xC8-0xCF */ + 0x53EB,0x7A96,0x63ED,0x63A5,0x7686,0x79F8,0x8857,0x9636,/* 0xD0-0xD7 */ + 0x622A,0x52AB,0x8282,0x6854,0x6770,0x6377,0x776B,0x7AED,/* 0xD8-0xDF */ + 0x6D01,0x7ED3,0x89E3,0x59D0,0x6212,0x85C9,0x82A5,0x754C,/* 0xE0-0xE7 */ + 
0x501F,0x4ECB,0x75A5,0x8BEB,0x5C4A,0x5DFE,0x7B4B,0x65A4,/* 0xE8-0xEF */ + 0x91D1,0x4ECA,0x6D25,0x895F,0x7D27,0x9526,0x4EC5,0x8C28,/* 0xF0-0xF7 */ + 0x8FDB,0x9773,0x664B,0x7981,0x8FD1,0x70EC,0x6D78,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7D99,0x7D9A,0x7D9B,0x7D9C,0x7D9D,0x7D9E,0x7D9F,0x7DA0,/* 0x40-0x47 */ + 0x7DA1,0x7DA2,0x7DA3,0x7DA4,0x7DA5,0x7DA7,0x7DA8,0x7DA9,/* 0x48-0x4F */ + 0x7DAA,0x7DAB,0x7DAC,0x7DAD,0x7DAF,0x7DB0,0x7DB1,0x7DB2,/* 0x50-0x57 */ + 0x7DB3,0x7DB4,0x7DB5,0x7DB6,0x7DB7,0x7DB8,0x7DB9,0x7DBA,/* 0x58-0x5F */ + 0x7DBB,0x7DBC,0x7DBD,0x7DBE,0x7DBF,0x7DC0,0x7DC1,0x7DC2,/* 0x60-0x67 */ + 0x7DC3,0x7DC4,0x7DC5,0x7DC6,0x7DC7,0x7DC8,0x7DC9,0x7DCA,/* 0x68-0x6F */ + 0x7DCB,0x7DCC,0x7DCD,0x7DCE,0x7DCF,0x7DD0,0x7DD1,0x7DD2,/* 0x70-0x77 */ + 0x7DD3,0x7DD4,0x7DD5,0x7DD6,0x7DD7,0x7DD8,0x7DD9,0x0000,/* 0x78-0x7F */ + + 0x7DDA,0x7DDB,0x7DDC,0x7DDD,0x7DDE,0x7DDF,0x7DE0,0x7DE1,/* 0x80-0x87 */ + 0x7DE2,0x7DE3,0x7DE4,0x7DE5,0x7DE6,0x7DE7,0x7DE8,0x7DE9,/* 0x88-0x8F */ + 0x7DEA,0x7DEB,0x7DEC,0x7DED,0x7DEE,0x7DEF,0x7DF0,0x7DF1,/* 0x90-0x97 */ + 0x7DF2,0x7DF3,0x7DF4,0x7DF5,0x7DF6,0x7DF7,0x7DF8,0x7DF9,/* 0x98-0x9F */ + 0x7DFA,0x5C3D,0x52B2,0x8346,0x5162,0x830E,0x775B,0x6676,/* 0xA0-0xA7 */ + 0x9CB8,0x4EAC,0x60CA,0x7CBE,0x7CB3,0x7ECF,0x4E95,0x8B66,/* 0xA8-0xAF */ + 0x666F,0x9888,0x9759,0x5883,0x656C,0x955C,0x5F84,0x75C9,/* 0xB0-0xB7 */ + 0x9756,0x7ADF,0x7ADE,0x51C0,0x70AF,0x7A98,0x63EA,0x7A76,/* 0xB8-0xBF */ + 0x7EA0,0x7396,0x97ED,0x4E45,0x7078,0x4E5D,0x9152,0x53A9,/* 0xC0-0xC7 */ + 0x6551,0x65E7,0x81FC,0x8205,0x548E,0x5C31,0x759A,0x97A0,/* 0xC8-0xCF */ + 0x62D8,0x72D9,0x75BD,0x5C45,0x9A79,0x83CA,0x5C40,0x5480,/* 0xD0-0xD7 */ + 0x77E9,0x4E3E,0x6CAE,0x805A,0x62D2,0x636E,0x5DE8,0x5177,/* 0xD8-0xDF */ + 0x8DDD,0x8E1E,0x952F,0x4FF1,0x53E5,0x60E7,0x70AC,0x5267,/* 0xE0-0xE7 */ + 0x6350,0x9E43,0x5A1F,0x5026,0x7737,0x5377,0x7EE2,0x6485,/* 0xE8-0xEF */ + 0x652B,0x6289,0x6398,0x5014,0x7235,0x89C9,0x51B3,0x8BC0,/* 0xF0-0xF7 */ + 0x7EDD,0x5747,0x83CC,0x94A7,0x519B,0x541B,0x5CFB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7DFB,0x7DFC,0x7DFD,0x7DFE,0x7DFF,0x7E00,0x7E01,0x7E02,/* 0x40-0x47 */ + 0x7E03,0x7E04,0x7E05,0x7E06,0x7E07,0x7E08,0x7E09,0x7E0A,/* 0x48-0x4F */ + 0x7E0B,0x7E0C,0x7E0D,0x7E0E,0x7E0F,0x7E10,0x7E11,0x7E12,/* 0x50-0x57 */ + 0x7E13,0x7E14,0x7E15,0x7E16,0x7E17,0x7E18,0x7E19,0x7E1A,/* 
0x58-0x5F */ + 0x7E1B,0x7E1C,0x7E1D,0x7E1E,0x7E1F,0x7E20,0x7E21,0x7E22,/* 0x60-0x67 */ + 0x7E23,0x7E24,0x7E25,0x7E26,0x7E27,0x7E28,0x7E29,0x7E2A,/* 0x68-0x6F */ + 0x7E2B,0x7E2C,0x7E2D,0x7E2E,0x7E2F,0x7E30,0x7E31,0x7E32,/* 0x70-0x77 */ + 0x7E33,0x7E34,0x7E35,0x7E36,0x7E37,0x7E38,0x7E39,0x0000,/* 0x78-0x7F */ + + 0x7E3A,0x7E3C,0x7E3D,0x7E3E,0x7E3F,0x7E40,0x7E42,0x7E43,/* 0x80-0x87 */ + 0x7E44,0x7E45,0x7E46,0x7E48,0x7E49,0x7E4A,0x7E4B,0x7E4C,/* 0x88-0x8F */ + 0x7E4D,0x7E4E,0x7E4F,0x7E50,0x7E51,0x7E52,0x7E53,0x7E54,/* 0x90-0x97 */ + 0x7E55,0x7E56,0x7E57,0x7E58,0x7E59,0x7E5A,0x7E5B,0x7E5C,/* 0x98-0x9F */ + 0x7E5D,0x4FCA,0x7AE3,0x6D5A,0x90E1,0x9A8F,0x5580,0x5496,/* 0xA0-0xA7 */ + 0x5361,0x54AF,0x5F00,0x63E9,0x6977,0x51EF,0x6168,0x520A,/* 0xA8-0xAF */ + 0x582A,0x52D8,0x574E,0x780D,0x770B,0x5EB7,0x6177,0x7CE0,/* 0xB0-0xB7 */ + 0x625B,0x6297,0x4EA2,0x7095,0x8003,0x62F7,0x70E4,0x9760,/* 0xB8-0xBF */ + 0x5777,0x82DB,0x67EF,0x68F5,0x78D5,0x9897,0x79D1,0x58F3,/* 0xC0-0xC7 */ + 0x54B3,0x53EF,0x6E34,0x514B,0x523B,0x5BA2,0x8BFE,0x80AF,/* 0xC8-0xCF */ + 0x5543,0x57A6,0x6073,0x5751,0x542D,0x7A7A,0x6050,0x5B54,/* 0xD0-0xD7 */ + 0x63A7,0x62A0,0x53E3,0x6263,0x5BC7,0x67AF,0x54ED,0x7A9F,/* 0xD8-0xDF */ + 0x82E6,0x9177,0x5E93,0x88E4,0x5938,0x57AE,0x630E,0x8DE8,/* 0xE0-0xE7 */ + 0x80EF,0x5757,0x7B77,0x4FA9,0x5FEB,0x5BBD,0x6B3E,0x5321,/* 0xE8-0xEF */ + 0x7B50,0x72C2,0x6846,0x77FF,0x7736,0x65F7,0x51B5,0x4E8F,/* 0xF0-0xF7 */ + 0x76D4,0x5CBF,0x7AA5,0x8475,0x594E,0x9B41,0x5080,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E5E,0x7E5F,0x7E60,0x7E61,0x7E62,0x7E63,0x7E64,0x7E65,/* 0x40-0x47 */ + 0x7E66,0x7E67,0x7E68,0x7E69,0x7E6A,0x7E6B,0x7E6C,0x7E6D,/* 0x48-0x4F */ + 0x7E6E,0x7E6F,0x7E70,0x7E71,0x7E72,0x7E73,0x7E74,0x7E75,/* 0x50-0x57 */ + 0x7E76,0x7E77,0x7E78,0x7E79,0x7E7A,0x7E7B,0x7E7C,0x7E7D,/* 0x58-0x5F */ + 0x7E7E,0x7E7F,0x7E80,0x7E81,0x7E83,0x7E84,0x7E85,0x7E86,/* 0x60-0x67 */ + 0x7E87,0x7E88,0x7E89,0x7E8A,0x7E8B,0x7E8C,0x7E8D,0x7E8E,/* 0x68-0x6F */ + 0x7E8F,0x7E90,0x7E91,0x7E92,0x7E93,0x7E94,0x7E95,0x7E96,/* 0x70-0x77 */ + 0x7E97,0x7E98,0x7E99,0x7E9A,0x7E9C,0x7E9D,0x7E9E,0x0000,/* 0x78-0x7F */ + + 0x7EAE,0x7EB4,0x7EBB,0x7EBC,0x7ED6,0x7EE4,0x7EEC,0x7EF9,/* 0x80-0x87 */ + 0x7F0A,0x7F10,0x7F1E,0x7F37,0x7F39,0x7F3B,0x7F3C,0x7F3D,/* 0x88-0x8F */ + 0x7F3E,0x7F3F,0x7F40,0x7F41,0x7F43,0x7F46,0x7F47,0x7F48,/* 0x90-0x97 */ + 0x7F49,0x7F4A,0x7F4B,0x7F4C,0x7F4D,0x7F4E,0x7F4F,0x7F52,/* 0x98-0x9F */ + 0x7F53,0x9988,0x6127,0x6E83,0x5764,0x6606,0x6346,0x56F0,/* 0xA0-0xA7 */ + 0x62EC,0x6269,0x5ED3,0x9614,0x5783,0x62C9,0x5587,0x8721,/* 0xA8-0xAF */ + 0x814A,0x8FA3,0x5566,0x83B1,0x6765,0x8D56,0x84DD,0x5A6A,/* 0xB0-0xB7 */ + 0x680F,0x62E6,0x7BEE,0x9611,0x5170,0x6F9C,0x8C30,0x63FD,/* 0xB8-0xBF */ + 0x89C8,0x61D2,0x7F06,0x70C2,0x6EE5,0x7405,0x6994,0x72FC,/* 0xC0-0xC7 */ + 0x5ECA,0x90CE,0x6717,0x6D6A,0x635E,0x52B3,0x7262,0x8001,/* 0xC8-0xCF */ + 0x4F6C,0x59E5,0x916A,0x70D9,0x6D9D,0x52D2,0x4E50,0x96F7,/* 0xD0-0xD7 */ + 
0x956D,0x857E,0x78CA,0x7D2F,0x5121,0x5792,0x64C2,0x808B,/* 0xD8-0xDF */ + 0x7C7B,0x6CEA,0x68F1,0x695E,0x51B7,0x5398,0x68A8,0x7281,/* 0xE0-0xE7 */ + 0x9ECE,0x7BF1,0x72F8,0x79BB,0x6F13,0x7406,0x674E,0x91CC,/* 0xE8-0xEF */ + 0x9CA4,0x793C,0x8389,0x8354,0x540F,0x6817,0x4E3D,0x5389,/* 0xF0-0xF7 */ + 0x52B1,0x783E,0x5386,0x5229,0x5088,0x4F8B,0x4FD0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7F56,0x7F59,0x7F5B,0x7F5C,0x7F5D,0x7F5E,0x7F60,0x7F63,/* 0x40-0x47 */ + 0x7F64,0x7F65,0x7F66,0x7F67,0x7F6B,0x7F6C,0x7F6D,0x7F6F,/* 0x48-0x4F */ + 0x7F70,0x7F73,0x7F75,0x7F76,0x7F77,0x7F78,0x7F7A,0x7F7B,/* 0x50-0x57 */ + 0x7F7C,0x7F7D,0x7F7F,0x7F80,0x7F82,0x7F83,0x7F84,0x7F85,/* 0x58-0x5F */ + 0x7F86,0x7F87,0x7F88,0x7F89,0x7F8B,0x7F8D,0x7F8F,0x7F90,/* 0x60-0x67 */ + 0x7F91,0x7F92,0x7F93,0x7F95,0x7F96,0x7F97,0x7F98,0x7F99,/* 0x68-0x6F */ + 0x7F9B,0x7F9C,0x7FA0,0x7FA2,0x7FA3,0x7FA5,0x7FA6,0x7FA8,/* 0x70-0x77 */ + 0x7FA9,0x7FAA,0x7FAB,0x7FAC,0x7FAD,0x7FAE,0x7FB1,0x0000,/* 0x78-0x7F */ + + 0x7FB3,0x7FB4,0x7FB5,0x7FB6,0x7FB7,0x7FBA,0x7FBB,0x7FBE,/* 0x80-0x87 */ + 0x7FC0,0x7FC2,0x7FC3,0x7FC4,0x7FC6,0x7FC7,0x7FC8,0x7FC9,/* 0x88-0x8F */ + 0x7FCB,0x7FCD,0x7FCF,0x7FD0,0x7FD1,0x7FD2,0x7FD3,0x7FD6,/* 0x90-0x97 */ + 0x7FD7,0x7FD9,0x7FDA,0x7FDB,0x7FDC,0x7FDD,0x7FDE,0x7FE2,/* 0x98-0x9F */ + 0x7FE3,0x75E2,0x7ACB,0x7C92,0x6CA5,0x96B6,0x529B,0x7483,/* 0xA0-0xA7 */ + 0x54E9,0x4FE9,0x8054,0x83B2,0x8FDE,0x9570,0x5EC9,0x601C,/* 0xA8-0xAF */ + 0x6D9F,0x5E18,0x655B,0x8138,0x94FE,0x604B,0x70BC,0x7EC3,/* 0xB0-0xB7 */ + 0x7CAE,0x51C9,0x6881,0x7CB1,0x826F,0x4E24,0x8F86,0x91CF,/* 0xB8-0xBF */ + 0x667E,0x4EAE,0x8C05,0x64A9,0x804A,0x50DA,0x7597,0x71CE,/* 0xC0-0xC7 */ + 0x5BE5,0x8FBD,0x6F66,0x4E86,0x6482,0x9563,0x5ED6,0x6599,/* 0xC8-0xCF */ + 0x5217,0x88C2,0x70C8,0x52A3,0x730E,0x7433,0x6797,0x78F7,/* 0xD0-0xD7 */ + 0x9716,0x4E34,0x90BB,0x9CDE,0x6DCB,0x51DB,0x8D41,0x541D,/* 0xD8-0xDF */ + 0x62CE,0x73B2,0x83F1,0x96F6,0x9F84,0x94C3,0x4F36,0x7F9A,/* 0xE0-0xE7 */ + 0x51CC,0x7075,0x9675,0x5CAD,0x9886,0x53E6,0x4EE4,0x6E9C,/* 0xE8-0xEF */ + 0x7409,0x69B4,0x786B,0x998F,0x7559,0x5218,0x7624,0x6D41,/* 0xF0-0xF7 */ + 0x67F3,0x516D,0x9F99,0x804B,0x5499,0x7B3C,0x7ABF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7FE4,0x7FE7,0x7FE8,0x7FEA,0x7FEB,0x7FEC,0x7FED,0x7FEF,/* 0x40-0x47 */ + 0x7FF2,0x7FF4,0x7FF5,0x7FF6,0x7FF7,0x7FF8,0x7FF9,0x7FFA,/* 
0x48-0x4F */ + 0x7FFD,0x7FFE,0x7FFF,0x8002,0x8007,0x8008,0x8009,0x800A,/* 0x50-0x57 */ + 0x800E,0x800F,0x8011,0x8013,0x801A,0x801B,0x801D,0x801E,/* 0x58-0x5F */ + 0x801F,0x8021,0x8023,0x8024,0x802B,0x802C,0x802D,0x802E,/* 0x60-0x67 */ + 0x802F,0x8030,0x8032,0x8034,0x8039,0x803A,0x803C,0x803E,/* 0x68-0x6F */ + 0x8040,0x8041,0x8044,0x8045,0x8047,0x8048,0x8049,0x804E,/* 0x70-0x77 */ + 0x804F,0x8050,0x8051,0x8053,0x8055,0x8056,0x8057,0x0000,/* 0x78-0x7F */ + + 0x8059,0x805B,0x805C,0x805D,0x805E,0x805F,0x8060,0x8061,/* 0x80-0x87 */ + 0x8062,0x8063,0x8064,0x8065,0x8066,0x8067,0x8068,0x806B,/* 0x88-0x8F */ + 0x806C,0x806D,0x806E,0x806F,0x8070,0x8072,0x8073,0x8074,/* 0x90-0x97 */ + 0x8075,0x8076,0x8077,0x8078,0x8079,0x807A,0x807B,0x807C,/* 0x98-0x9F */ + 0x807D,0x9686,0x5784,0x62E2,0x9647,0x697C,0x5A04,0x6402,/* 0xA0-0xA7 */ + 0x7BD3,0x6F0F,0x964B,0x82A6,0x5362,0x9885,0x5E90,0x7089,/* 0xA8-0xAF */ + 0x63B3,0x5364,0x864F,0x9C81,0x9E93,0x788C,0x9732,0x8DEF,/* 0xB0-0xB7 */ + 0x8D42,0x9E7F,0x6F5E,0x7984,0x5F55,0x9646,0x622E,0x9A74,/* 0xB8-0xBF */ + 0x5415,0x94DD,0x4FA3,0x65C5,0x5C65,0x5C61,0x7F15,0x8651,/* 0xC0-0xC7 */ + 0x6C2F,0x5F8B,0x7387,0x6EE4,0x7EFF,0x5CE6,0x631B,0x5B6A,/* 0xC8-0xCF */ + 0x6EE6,0x5375,0x4E71,0x63A0,0x7565,0x62A1,0x8F6E,0x4F26,/* 0xD0-0xD7 */ + 0x4ED1,0x6CA6,0x7EB6,0x8BBA,0x841D,0x87BA,0x7F57,0x903B,/* 0xD8-0xDF */ + 0x9523,0x7BA9,0x9AA1,0x88F8,0x843D,0x6D1B,0x9A86,0x7EDC,/* 0xE0-0xE7 */ + 0x5988,0x9EBB,0x739B,0x7801,0x8682,0x9A6C,0x9A82,0x561B,/* 0xE8-0xEF */ + 0x5417,0x57CB,0x4E70,0x9EA6,0x5356,0x8FC8,0x8109,0x7792,/* 0xF0-0xF7 */ + 0x9992,0x86EE,0x6EE1,0x8513,0x66FC,0x6162,0x6F2B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x807E,0x8081,0x8082,0x8085,0x8088,0x808A,0x808D,0x808E,/* 0x40-0x47 */ + 0x808F,0x8090,0x8091,0x8092,0x8094,0x8095,0x8097,0x8099,/* 0x48-0x4F */ + 0x809E,0x80A3,0x80A6,0x80A7,0x80A8,0x80AC,0x80B0,0x80B3,/* 0x50-0x57 */ + 0x80B5,0x80B6,0x80B8,0x80B9,0x80BB,0x80C5,0x80C7,0x80C8,/* 0x58-0x5F */ + 0x80C9,0x80CA,0x80CB,0x80CF,0x80D0,0x80D1,0x80D2,0x80D3,/* 0x60-0x67 */ + 0x80D4,0x80D5,0x80D8,0x80DF,0x80E0,0x80E2,0x80E3,0x80E6,/* 0x68-0x6F */ + 0x80EE,0x80F5,0x80F7,0x80F9,0x80FB,0x80FE,0x80FF,0x8100,/* 0x70-0x77 */ + 0x8101,0x8103,0x8104,0x8105,0x8107,0x8108,0x810B,0x0000,/* 0x78-0x7F */ + + 0x810C,0x8115,0x8117,0x8119,0x811B,0x811C,0x811D,0x811F,/* 0x80-0x87 */ + 0x8120,0x8121,0x8122,0x8123,0x8124,0x8125,0x8126,0x8127,/* 0x88-0x8F */ + 0x8128,0x8129,0x812A,0x812B,0x812D,0x812E,0x8130,0x8133,/* 0x90-0x97 */ + 0x8134,0x8135,0x8137,0x8139,0x813A,0x813B,0x813C,0x813D,/* 0x98-0x9F */ + 0x813F,0x8C29,0x8292,0x832B,0x76F2,0x6C13,0x5FD9,0x83BD,/* 0xA0-0xA7 */ + 0x732B,0x8305,0x951A,0x6BDB,0x77DB,0x94C6,0x536F,0x8302,/* 0xA8-0xAF */ + 0x5192,0x5E3D,0x8C8C,0x8D38,0x4E48,0x73AB,0x679A,0x6885,/* 0xB0-0xB7 */ + 0x9176,0x9709,0x7164,0x6CA1,0x7709,0x5A92,0x9541,0x6BCF,/* 0xB8-0xBF */ + 0x7F8E,0x6627,0x5BD0,0x59B9,0x5A9A,0x95E8,0x95F7,0x4EEC,/* 0xC0-0xC7 */ + 
0x840C,0x8499,0x6AAC,0x76DF,0x9530,0x731B,0x68A6,0x5B5F,/* 0xC8-0xCF */ + 0x772F,0x919A,0x9761,0x7CDC,0x8FF7,0x8C1C,0x5F25,0x7C73,/* 0xD0-0xD7 */ + 0x79D8,0x89C5,0x6CCC,0x871C,0x5BC6,0x5E42,0x68C9,0x7720,/* 0xD8-0xDF */ + 0x7EF5,0x5195,0x514D,0x52C9,0x5A29,0x7F05,0x9762,0x82D7,/* 0xE0-0xE7 */ + 0x63CF,0x7784,0x85D0,0x79D2,0x6E3A,0x5E99,0x5999,0x8511,/* 0xE8-0xEF */ + 0x706D,0x6C11,0x62BF,0x76BF,0x654F,0x60AF,0x95FD,0x660E,/* 0xF0-0xF7 */ + 0x879F,0x9E23,0x94ED,0x540D,0x547D,0x8C2C,0x6478,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8140,0x8141,0x8142,0x8143,0x8144,0x8145,0x8147,0x8149,/* 0x40-0x47 */ + 0x814D,0x814E,0x814F,0x8152,0x8156,0x8157,0x8158,0x815B,/* 0x48-0x4F */ + 0x815C,0x815D,0x815E,0x815F,0x8161,0x8162,0x8163,0x8164,/* 0x50-0x57 */ + 0x8166,0x8168,0x816A,0x816B,0x816C,0x816F,0x8172,0x8173,/* 0x58-0x5F */ + 0x8175,0x8176,0x8177,0x8178,0x8181,0x8183,0x8184,0x8185,/* 0x60-0x67 */ + 0x8186,0x8187,0x8189,0x818B,0x818C,0x818D,0x818E,0x8190,/* 0x68-0x6F */ + 0x8192,0x8193,0x8194,0x8195,0x8196,0x8197,0x8199,0x819A,/* 0x70-0x77 */ + 0x819E,0x819F,0x81A0,0x81A1,0x81A2,0x81A4,0x81A5,0x0000,/* 0x78-0x7F */ + + 0x81A7,0x81A9,0x81AB,0x81AC,0x81AD,0x81AE,0x81AF,0x81B0,/* 0x80-0x87 */ + 0x81B1,0x81B2,0x81B4,0x81B5,0x81B6,0x81B7,0x81B8,0x81B9,/* 0x88-0x8F */ + 0x81BC,0x81BD,0x81BE,0x81BF,0x81C4,0x81C5,0x81C7,0x81C8,/* 0x90-0x97 */ + 0x81C9,0x81CB,0x81CD,0x81CE,0x81CF,0x81D0,0x81D1,0x81D2,/* 0x98-0x9F */ + 0x81D3,0x6479,0x8611,0x6A21,0x819C,0x78E8,0x6469,0x9B54,/* 0xA0-0xA7 */ + 0x62B9,0x672B,0x83AB,0x58A8,0x9ED8,0x6CAB,0x6F20,0x5BDE,/* 0xA8-0xAF */ + 0x964C,0x8C0B,0x725F,0x67D0,0x62C7,0x7261,0x4EA9,0x59C6,/* 0xB0-0xB7 */ + 0x6BCD,0x5893,0x66AE,0x5E55,0x52DF,0x6155,0x6728,0x76EE,/* 0xB8-0xBF */ + 0x7766,0x7267,0x7A46,0x62FF,0x54EA,0x5450,0x94A0,0x90A3,/* 0xC0-0xC7 */ + 0x5A1C,0x7EB3,0x6C16,0x4E43,0x5976,0x8010,0x5948,0x5357,/* 0xC8-0xCF */ + 0x7537,0x96BE,0x56CA,0x6320,0x8111,0x607C,0x95F9,0x6DD6,/* 0xD0-0xD7 */ + 0x5462,0x9981,0x5185,0x5AE9,0x80FD,0x59AE,0x9713,0x502A,/* 0xD8-0xDF */ + 0x6CE5,0x5C3C,0x62DF,0x4F60,0x533F,0x817B,0x9006,0x6EBA,/* 0xE0-0xE7 */ + 0x852B,0x62C8,0x5E74,0x78BE,0x64B5,0x637B,0x5FF5,0x5A18,/* 0xE8-0xEF */ + 0x917F,0x9E1F,0x5C3F,0x634F,0x8042,0x5B7D,0x556E,0x954A,/* 0xF0-0xF7 */ + 0x954D,0x6D85,0x60A8,0x67E0,0x72DE,0x51DD,0x5B81,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x81D4,0x81D5,0x81D6,0x81D7,0x81D8,0x81D9,0x81DA,0x81DB,/* 0x40-0x47 */ + 0x81DC,0x81DD,0x81DE,0x81DF,0x81E0,0x81E1,0x81E2,0x81E4,/* 0x48-0x4F */ + 0x81E5,0x81E6,0x81E8,0x81E9,0x81EB,0x81EE,0x81EF,0x81F0,/* 0x50-0x57 */ + 0x81F1,0x81F2,0x81F5,0x81F6,0x81F7,0x81F8,0x81F9,0x81FA,/* 0x58-0x5F */ + 0x81FD,0x81FF,0x8203,0x8207,0x8208,0x8209,0x820A,0x820B,/* 0x60-0x67 */ + 0x820E,0x820F,0x8211,0x8213,0x8215,0x8216,0x8217,0x8218,/* 0x68-0x6F */ + 0x8219,0x821A,0x821D,0x8220,0x8224,0x8225,0x8226,0x8227,/* 0x70-0x77 */ + 0x8229,0x822E,0x8232,0x823A,0x823C,0x823D,0x823F,0x0000,/* 0x78-0x7F */ + + 0x8240,0x8241,0x8242,0x8243,0x8245,0x8246,0x8248,0x824A,/* 0x80-0x87 */ + 0x824C,0x824D,0x824E,0x8250,0x8251,0x8252,0x8253,0x8254,/* 0x88-0x8F */ + 0x8255,0x8256,0x8257,0x8259,0x825B,0x825C,0x825D,0x825E,/* 0x90-0x97 */ + 0x8260,0x8261,0x8262,0x8263,0x8264,0x8265,0x8266,0x8267,/* 0x98-0x9F */ + 0x8269,0x62E7,0x6CDE,0x725B,0x626D,0x94AE,0x7EBD,0x8113,/* 0xA0-0xA7 */ + 0x6D53,0x519C,0x5F04,0x5974,0x52AA,0x6012,0x5973,0x6696,/* 0xA8-0xAF */ + 0x8650,0x759F,0x632A,0x61E6,0x7CEF,0x8BFA,0x54E6,0x6B27,/* 0xB0-0xB7 */ + 0x9E25,0x6BB4,0x85D5,0x5455,0x5076,0x6CA4,0x556A,0x8DB4,/* 0xB8-0xBF */ + 0x722C,0x5E15,0x6015,0x7436,0x62CD,0x6392,0x724C,0x5F98,/* 0xC0-0xC7 */ + 0x6E43,0x6D3E,0x6500,0x6F58,0x76D8,0x78D0,0x76FC,0x7554,/* 0xC8-0xCF */ + 0x5224,0x53DB,0x4E53,0x5E9E,0x65C1,0x802A,0x80D6,0x629B,/* 0xD0-0xD7 */ + 0x5486,0x5228,0x70AE,0x888D,0x8DD1,0x6CE1,0x5478,0x80DA,/* 0xD8-0xDF */ + 0x57F9,0x88F4,0x8D54,0x966A,0x914D,0x4F69,0x6C9B,0x55B7,/* 0xE0-0xE7 */ + 0x76C6,0x7830,0x62A8,0x70F9,0x6F8E,0x5F6D,0x84EC,0x68DA,/* 0xE8-0xEF */ + 0x787C,0x7BF7,0x81A8,0x670B,0x9E4F,0x6367,0x78B0,0x576F,/* 0xF0-0xF7 */ + 0x7812,0x9739,0x6279,0x62AB,0x5288,0x7435,0x6BD7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x826A,0x826B,0x826C,0x826D,0x8271,0x8275,0x8276,0x8277,/* 0x40-0x47 */ + 0x8278,0x827B,0x827C,0x8280,0x8281,0x8283,0x8285,0x8286,/* 0x48-0x4F */ + 0x8287,0x8289,0x828C,0x8290,0x8293,0x8294,0x8295,0x8296,/* 0x50-0x57 */ + 0x829A,0x829B,0x829E,0x82A0,0x82A2,0x82A3,0x82A7,0x82B2,/* 0x58-0x5F */ + 0x82B5,0x82B6,0x82BA,0x82BB,0x82BC,0x82BF,0x82C0,0x82C2,/* 0x60-0x67 */ + 0x82C3,0x82C5,0x82C6,0x82C9,0x82D0,0x82D6,0x82D9,0x82DA,/* 0x68-0x6F */ + 0x82DD,0x82E2,0x82E7,0x82E8,0x82E9,0x82EA,0x82EC,0x82ED,/* 0x70-0x77 */ + 0x82EE,0x82F0,0x82F2,0x82F3,0x82F5,0x82F6,0x82F8,0x0000,/* 0x78-0x7F */ + + 0x82FA,0x82FC,0x82FD,0x82FE,0x82FF,0x8300,0x830A,0x830B,/* 0x80-0x87 */ + 0x830D,0x8310,0x8312,0x8313,0x8316,0x8318,0x8319,0x831D,/* 0x88-0x8F */ + 0x831E,0x831F,0x8320,0x8321,0x8322,0x8323,0x8324,0x8325,/* 0x90-0x97 */ + 0x8326,0x8329,0x832A,0x832E,0x8330,0x8332,0x8337,0x833B,/* 0x98-0x9F */ + 0x833D,0x5564,0x813E,0x75B2,0x76AE,0x5339,0x75DE,0x50FB,/* 0xA0-0xA7 */ + 0x5C41,0x8B6C,0x7BC7,0x504F,0x7247,0x9A97,0x98D8,0x6F02,/* 0xA8-0xAF */ + 0x74E2,0x7968,0x6487,0x77A5,0x62FC,0x9891,0x8D2B,0x54C1,/* 0xB0-0xB7 */ + 
0x8058,0x4E52,0x576A,0x82F9,0x840D,0x5E73,0x51ED,0x74F6,/* 0xB8-0xBF */ + 0x8BC4,0x5C4F,0x5761,0x6CFC,0x9887,0x5A46,0x7834,0x9B44,/* 0xC0-0xC7 */ + 0x8FEB,0x7C95,0x5256,0x6251,0x94FA,0x4EC6,0x8386,0x8461,/* 0xC8-0xCF */ + 0x83E9,0x84B2,0x57D4,0x6734,0x5703,0x666E,0x6D66,0x8C31,/* 0xD0-0xD7 */ + 0x66DD,0x7011,0x671F,0x6B3A,0x6816,0x621A,0x59BB,0x4E03,/* 0xD8-0xDF */ + 0x51C4,0x6F06,0x67D2,0x6C8F,0x5176,0x68CB,0x5947,0x6B67,/* 0xE0-0xE7 */ + 0x7566,0x5D0E,0x8110,0x9F50,0x65D7,0x7948,0x7941,0x9A91,/* 0xE8-0xEF */ + 0x8D77,0x5C82,0x4E5E,0x4F01,0x542F,0x5951,0x780C,0x5668,/* 0xF0-0xF7 */ + 0x6C14,0x8FC4,0x5F03,0x6C7D,0x6CE3,0x8BAB,0x6390,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x833E,0x833F,0x8341,0x8342,0x8344,0x8345,0x8348,0x834A,/* 0x40-0x47 */ + 0x834B,0x834C,0x834D,0x834E,0x8353,0x8355,0x8356,0x8357,/* 0x48-0x4F */ + 0x8358,0x8359,0x835D,0x8362,0x8370,0x8371,0x8372,0x8373,/* 0x50-0x57 */ + 0x8374,0x8375,0x8376,0x8379,0x837A,0x837E,0x837F,0x8380,/* 0x58-0x5F */ + 0x8381,0x8382,0x8383,0x8384,0x8387,0x8388,0x838A,0x838B,/* 0x60-0x67 */ + 0x838C,0x838D,0x838F,0x8390,0x8391,0x8394,0x8395,0x8396,/* 0x68-0x6F */ + 0x8397,0x8399,0x839A,0x839D,0x839F,0x83A1,0x83A2,0x83A3,/* 0x70-0x77 */ + 0x83A4,0x83A5,0x83A6,0x83A7,0x83AC,0x83AD,0x83AE,0x0000,/* 0x78-0x7F */ + + 0x83AF,0x83B5,0x83BB,0x83BE,0x83BF,0x83C2,0x83C3,0x83C4,/* 0x80-0x87 */ + 0x83C6,0x83C8,0x83C9,0x83CB,0x83CD,0x83CE,0x83D0,0x83D1,/* 0x88-0x8F */ + 0x83D2,0x83D3,0x83D5,0x83D7,0x83D9,0x83DA,0x83DB,0x83DE,/* 0x90-0x97 */ + 0x83E2,0x83E3,0x83E4,0x83E6,0x83E7,0x83E8,0x83EB,0x83EC,/* 0x98-0x9F */ + 0x83ED,0x6070,0x6D3D,0x7275,0x6266,0x948E,0x94C5,0x5343,/* 0xA0-0xA7 */ + 0x8FC1,0x7B7E,0x4EDF,0x8C26,0x4E7E,0x9ED4,0x94B1,0x94B3,/* 0xA8-0xAF */ + 0x524D,0x6F5C,0x9063,0x6D45,0x8C34,0x5811,0x5D4C,0x6B20,/* 0xB0-0xB7 */ + 0x6B49,0x67AA,0x545B,0x8154,0x7F8C,0x5899,0x8537,0x5F3A,/* 0xB8-0xBF */ + 0x62A2,0x6A47,0x9539,0x6572,0x6084,0x6865,0x77A7,0x4E54,/* 0xC0-0xC7 */ + 0x4FA8,0x5DE7,0x9798,0x64AC,0x7FD8,0x5CED,0x4FCF,0x7A8D,/* 0xC8-0xCF */ + 0x5207,0x8304,0x4E14,0x602F,0x7A83,0x94A6,0x4FB5,0x4EB2,/* 0xD0-0xD7 */ + 0x79E6,0x7434,0x52E4,0x82B9,0x64D2,0x79BD,0x5BDD,0x6C81,/* 0xD8-0xDF */ + 0x9752,0x8F7B,0x6C22,0x503E,0x537F,0x6E05,0x64CE,0x6674,/* 0xE0-0xE7 */ + 0x6C30,0x60C5,0x9877,0x8BF7,0x5E86,0x743C,0x7A77,0x79CB,/* 0xE8-0xEF */ + 0x4E18,0x90B1,0x7403,0x6C42,0x56DA,0x914B,0x6CC5,0x8D8B,/* 0xF0-0xF7 */ + 0x533A,0x86C6,0x66F2,0x8EAF,0x5C48,0x9A71,0x6E20,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x83EE,0x83EF,0x83F3,0x83F4,0x83F5,0x83F6,0x83F7,0x83FA,/* 0x40-0x47 */ + 0x83FB,0x83FC,0x83FE,0x83FF,0x8400,0x8402,0x8405,0x8407,/* 0x48-0x4F */ + 0x8408,0x8409,0x840A,0x8410,0x8412,0x8413,0x8414,0x8415,/* 0x50-0x57 */ + 0x8416,0x8417,0x8419,0x841A,0x841B,0x841E,0x841F,0x8420,/* 0x58-0x5F */ + 0x8421,0x8422,0x8423,0x8429,0x842A,0x842B,0x842C,0x842D,/* 0x60-0x67 */ + 0x842E,0x842F,0x8430,0x8432,0x8433,0x8434,0x8435,0x8436,/* 0x68-0x6F */ + 0x8437,0x8439,0x843A,0x843B,0x843E,0x843F,0x8440,0x8441,/* 0x70-0x77 */ + 0x8442,0x8443,0x8444,0x8445,0x8447,0x8448,0x8449,0x0000,/* 0x78-0x7F */ + + 0x844A,0x844B,0x844C,0x844D,0x844E,0x844F,0x8450,0x8452,/* 0x80-0x87 */ + 0x8453,0x8454,0x8455,0x8456,0x8458,0x845D,0x845E,0x845F,/* 0x88-0x8F */ + 0x8460,0x8462,0x8464,0x8465,0x8466,0x8467,0x8468,0x846A,/* 0x90-0x97 */ + 0x846E,0x846F,0x8470,0x8472,0x8474,0x8477,0x8479,0x847B,/* 0x98-0x9F */ + 0x847C,0x53D6,0x5A36,0x9F8B,0x8DA3,0x53BB,0x5708,0x98A7,/* 0xA0-0xA7 */ + 0x6743,0x919B,0x6CC9,0x5168,0x75CA,0x62F3,0x72AC,0x5238,/* 0xA8-0xAF */ + 0x529D,0x7F3A,0x7094,0x7638,0x5374,0x9E4A,0x69B7,0x786E,/* 0xB0-0xB7 */ + 0x96C0,0x88D9,0x7FA4,0x7136,0x71C3,0x5189,0x67D3,0x74E4,/* 0xB8-0xBF */ + 0x58E4,0x6518,0x56B7,0x8BA9,0x9976,0x6270,0x7ED5,0x60F9,/* 0xC0-0xC7 */ + 0x70ED,0x58EC,0x4EC1,0x4EBA,0x5FCD,0x97E7,0x4EFB,0x8BA4,/* 0xC8-0xCF */ + 0x5203,0x598A,0x7EAB,0x6254,0x4ECD,0x65E5,0x620E,0x8338,/* 0xD0-0xD7 */ + 0x84C9,0x8363,0x878D,0x7194,0x6EB6,0x5BB9,0x7ED2,0x5197,/* 0xD8-0xDF */ + 0x63C9,0x67D4,0x8089,0x8339,0x8815,0x5112,0x5B7A,0x5982,/* 0xE0-0xE7 */ + 0x8FB1,0x4E73,0x6C5D,0x5165,0x8925,0x8F6F,0x962E,0x854A,/* 0xE8-0xEF */ + 0x745E,0x9510,0x95F0,0x6DA6,0x82E5,0x5F31,0x6492,0x6D12,/* 0xF0-0xF7 */ + 0x8428,0x816E,0x9CC3,0x585E,0x8D5B,0x4E09,0x53C1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x847D,0x847E,0x847F,0x8480,0x8481,0x8483,0x8484,0x8485,/* 0x40-0x47 */ + 0x8486,0x848A,0x848D,0x848F,0x8490,0x8491,0x8492,0x8493,/* 0x48-0x4F */ + 0x8494,0x8495,0x8496,0x8498,0x849A,0x849B,0x849D,0x849E,/* 0x50-0x57 */ + 0x849F,0x84A0,0x84A2,0x84A3,0x84A4,0x84A5,0x84A6,0x84A7,/* 0x58-0x5F */ + 0x84A8,0x84A9,0x84AA,0x84AB,0x84AC,0x84AD,0x84AE,0x84B0,/* 0x60-0x67 */ + 0x84B1,0x84B3,0x84B5,0x84B6,0x84B7,0x84BB,0x84BC,0x84BE,/* 0x68-0x6F */ + 0x84C0,0x84C2,0x84C3,0x84C5,0x84C6,0x84C7,0x84C8,0x84CB,/* 0x70-0x77 */ + 0x84CC,0x84CE,0x84CF,0x84D2,0x84D4,0x84D5,0x84D7,0x0000,/* 0x78-0x7F */ + + 0x84D8,0x84D9,0x84DA,0x84DB,0x84DC,0x84DE,0x84E1,0x84E2,/* 0x80-0x87 */ + 0x84E4,0x84E7,0x84E8,0x84E9,0x84EA,0x84EB,0x84ED,0x84EE,/* 0x88-0x8F */ + 0x84EF,0x84F1,0x84F2,0x84F3,0x84F4,0x84F5,0x84F6,0x84F7,/* 0x90-0x97 */ + 0x84F8,0x84F9,0x84FA,0x84FB,0x84FD,0x84FE,0x8500,0x8501,/* 0x98-0x9F */ + 0x8502,0x4F1E,0x6563,0x6851,0x55D3,0x4E27,0x6414,0x9A9A,/* 0xA0-0xA7 */ + 
0x626B,0x5AC2,0x745F,0x8272,0x6DA9,0x68EE,0x50E7,0x838E,/* 0xA8-0xAF */ + 0x7802,0x6740,0x5239,0x6C99,0x7EB1,0x50BB,0x5565,0x715E,/* 0xB0-0xB7 */ + 0x7B5B,0x6652,0x73CA,0x82EB,0x6749,0x5C71,0x5220,0x717D,/* 0xB8-0xBF */ + 0x886B,0x95EA,0x9655,0x64C5,0x8D61,0x81B3,0x5584,0x6C55,/* 0xC0-0xC7 */ + 0x6247,0x7F2E,0x5892,0x4F24,0x5546,0x8D4F,0x664C,0x4E0A,/* 0xC8-0xCF */ + 0x5C1A,0x88F3,0x68A2,0x634E,0x7A0D,0x70E7,0x828D,0x52FA,/* 0xD0-0xD7 */ + 0x97F6,0x5C11,0x54E8,0x90B5,0x7ECD,0x5962,0x8D4A,0x86C7,/* 0xD8-0xDF */ + 0x820C,0x820D,0x8D66,0x6444,0x5C04,0x6151,0x6D89,0x793E,/* 0xE0-0xE7 */ + 0x8BBE,0x7837,0x7533,0x547B,0x4F38,0x8EAB,0x6DF1,0x5A20,/* 0xE8-0xEF */ + 0x7EC5,0x795E,0x6C88,0x5BA1,0x5A76,0x751A,0x80BE,0x614E,/* 0xF0-0xF7 */ + 0x6E17,0x58F0,0x751F,0x7525,0x7272,0x5347,0x7EF3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8503,0x8504,0x8505,0x8506,0x8507,0x8508,0x8509,0x850A,/* 0x40-0x47 */ + 0x850B,0x850D,0x850E,0x850F,0x8510,0x8512,0x8514,0x8515,/* 0x48-0x4F */ + 0x8516,0x8518,0x8519,0x851B,0x851C,0x851D,0x851E,0x8520,/* 0x50-0x57 */ + 0x8522,0x8523,0x8524,0x8525,0x8526,0x8527,0x8528,0x8529,/* 0x58-0x5F */ + 0x852A,0x852D,0x852E,0x852F,0x8530,0x8531,0x8532,0x8533,/* 0x60-0x67 */ + 0x8534,0x8535,0x8536,0x853E,0x853F,0x8540,0x8541,0x8542,/* 0x68-0x6F */ + 0x8544,0x8545,0x8546,0x8547,0x854B,0x854C,0x854D,0x854E,/* 0x70-0x77 */ + 0x854F,0x8550,0x8551,0x8552,0x8553,0x8554,0x8555,0x0000,/* 0x78-0x7F */ + + 0x8557,0x8558,0x855A,0x855B,0x855C,0x855D,0x855F,0x8560,/* 0x80-0x87 */ + 0x8561,0x8562,0x8563,0x8565,0x8566,0x8567,0x8569,0x856A,/* 0x88-0x8F */ + 0x856B,0x856C,0x856D,0x856E,0x856F,0x8570,0x8571,0x8573,/* 0x90-0x97 */ + 0x8575,0x8576,0x8577,0x8578,0x857C,0x857D,0x857F,0x8580,/* 0x98-0x9F */ + 0x8581,0x7701,0x76DB,0x5269,0x80DC,0x5723,0x5E08,0x5931,/* 0xA0-0xA7 */ + 0x72EE,0x65BD,0x6E7F,0x8BD7,0x5C38,0x8671,0x5341,0x77F3,/* 0xA8-0xAF */ + 0x62FE,0x65F6,0x4EC0,0x98DF,0x8680,0x5B9E,0x8BC6,0x53F2,/* 0xB0-0xB7 */ + 0x77E2,0x4F7F,0x5C4E,0x9A76,0x59CB,0x5F0F,0x793A,0x58EB,/* 0xB8-0xBF */ + 0x4E16,0x67FF,0x4E8B,0x62ED,0x8A93,0x901D,0x52BF,0x662F,/* 0xC0-0xC7 */ + 0x55DC,0x566C,0x9002,0x4ED5,0x4F8D,0x91CA,0x9970,0x6C0F,/* 0xC8-0xCF */ + 0x5E02,0x6043,0x5BA4,0x89C6,0x8BD5,0x6536,0x624B,0x9996,/* 0xD0-0xD7 */ + 0x5B88,0x5BFF,0x6388,0x552E,0x53D7,0x7626,0x517D,0x852C,/* 0xD8-0xDF */ + 0x67A2,0x68B3,0x6B8A,0x6292,0x8F93,0x53D4,0x8212,0x6DD1,/* 0xE0-0xE7 */ + 0x758F,0x4E66,0x8D4E,0x5B70,0x719F,0x85AF,0x6691,0x66D9,/* 0xE8-0xEF */ + 0x7F72,0x8700,0x9ECD,0x9F20,0x5C5E,0x672F,0x8FF0,0x6811,/* 0xF0-0xF7 */ + 0x675F,0x620D,0x7AD6,0x5885,0x5EB6,0x6570,0x6F31,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8582,0x8583,0x8586,0x8588,0x8589,0x858A,0x858B,0x858C,/* 0x40-0x47 */ + 0x858D,0x858E,0x8590,0x8591,0x8592,0x8593,0x8594,0x8595,/* 0x48-0x4F */ + 0x8596,0x8597,0x8598,0x8599,0x859A,0x859D,0x859E,0x859F,/* 0x50-0x57 */ + 0x85A0,0x85A1,0x85A2,0x85A3,0x85A5,0x85A6,0x85A7,0x85A9,/* 0x58-0x5F */ + 0x85AB,0x85AC,0x85AD,0x85B1,0x85B2,0x85B3,0x85B4,0x85B5,/* 0x60-0x67 */ + 0x85B6,0x85B8,0x85BA,0x85BB,0x85BC,0x85BD,0x85BE,0x85BF,/* 0x68-0x6F */ + 0x85C0,0x85C2,0x85C3,0x85C4,0x85C5,0x85C6,0x85C7,0x85C8,/* 0x70-0x77 */ + 0x85CA,0x85CB,0x85CC,0x85CD,0x85CE,0x85D1,0x85D2,0x0000,/* 0x78-0x7F */ + + 0x85D4,0x85D6,0x85D7,0x85D8,0x85D9,0x85DA,0x85DB,0x85DD,/* 0x80-0x87 */ + 0x85DE,0x85DF,0x85E0,0x85E1,0x85E2,0x85E3,0x85E5,0x85E6,/* 0x88-0x8F */ + 0x85E7,0x85E8,0x85EA,0x85EB,0x85EC,0x85ED,0x85EE,0x85EF,/* 0x90-0x97 */ + 0x85F0,0x85F1,0x85F2,0x85F3,0x85F4,0x85F5,0x85F6,0x85F7,/* 0x98-0x9F */ + 0x85F8,0x6055,0x5237,0x800D,0x6454,0x8870,0x7529,0x5E05,/* 0xA0-0xA7 */ + 0x6813,0x62F4,0x971C,0x53CC,0x723D,0x8C01,0x6C34,0x7761,/* 0xA8-0xAF */ + 0x7A0E,0x542E,0x77AC,0x987A,0x821C,0x8BF4,0x7855,0x6714,/* 0xB0-0xB7 */ + 0x70C1,0x65AF,0x6495,0x5636,0x601D,0x79C1,0x53F8,0x4E1D,/* 0xB8-0xBF */ + 0x6B7B,0x8086,0x5BFA,0x55E3,0x56DB,0x4F3A,0x4F3C,0x9972,/* 0xC0-0xC7 */ + 0x5DF3,0x677E,0x8038,0x6002,0x9882,0x9001,0x5B8B,0x8BBC,/* 0xC8-0xCF */ + 0x8BF5,0x641C,0x8258,0x64DE,0x55FD,0x82CF,0x9165,0x4FD7,/* 0xD0-0xD7 */ + 0x7D20,0x901F,0x7C9F,0x50F3,0x5851,0x6EAF,0x5BBF,0x8BC9,/* 0xD8-0xDF */ + 0x8083,0x9178,0x849C,0x7B97,0x867D,0x968B,0x968F,0x7EE5,/* 0xE0-0xE7 */ + 0x9AD3,0x788E,0x5C81,0x7A57,0x9042,0x96A7,0x795F,0x5B59,/* 0xE8-0xEF */ + 0x635F,0x7B0B,0x84D1,0x68AD,0x5506,0x7F29,0x7410,0x7D22,/* 0xF0-0xF7 */ + 0x9501,0x6240,0x584C,0x4ED6,0x5B83,0x5979,0x5854,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x85F9,0x85FA,0x85FC,0x85FD,0x85FE,0x8600,0x8601,0x8602,/* 0x40-0x47 */ + 0x8603,0x8604,0x8606,0x8607,0x8608,0x8609,0x860A,0x860B,/* 0x48-0x4F */ + 0x860C,0x860D,0x860E,0x860F,0x8610,0x8612,0x8613,0x8614,/* 0x50-0x57 */ + 0x8615,0x8617,0x8618,0x8619,0x861A,0x861B,0x861C,0x861D,/* 0x58-0x5F */ + 0x861E,0x861F,0x8620,0x8621,0x8622,0x8623,0x8624,0x8625,/* 0x60-0x67 */ + 0x8626,0x8628,0x862A,0x862B,0x862C,0x862D,0x862E,0x862F,/* 0x68-0x6F */ + 0x8630,0x8631,0x8632,0x8633,0x8634,0x8635,0x8636,0x8637,/* 0x70-0x77 */ + 0x8639,0x863A,0x863B,0x863D,0x863E,0x863F,0x8640,0x0000,/* 0x78-0x7F */ + + 0x8641,0x8642,0x8643,0x8644,0x8645,0x8646,0x8647,0x8648,/* 0x80-0x87 */ + 0x8649,0x864A,0x864B,0x864C,0x8652,0x8653,0x8655,0x8656,/* 0x88-0x8F */ + 0x8657,0x8658,0x8659,0x865B,0x865C,0x865D,0x865F,0x8660,/* 0x90-0x97 */ + 
0x8661,0x8663,0x8664,0x8665,0x8666,0x8667,0x8668,0x8669,/* 0x98-0x9F */ + 0x866A,0x736D,0x631E,0x8E4B,0x8E0F,0x80CE,0x82D4,0x62AC,/* 0xA0-0xA7 */ + 0x53F0,0x6CF0,0x915E,0x592A,0x6001,0x6C70,0x574D,0x644A,/* 0xA8-0xAF */ + 0x8D2A,0x762B,0x6EE9,0x575B,0x6A80,0x75F0,0x6F6D,0x8C2D,/* 0xB0-0xB7 */ + 0x8C08,0x5766,0x6BEF,0x8892,0x78B3,0x63A2,0x53F9,0x70AD,/* 0xB8-0xBF */ + 0x6C64,0x5858,0x642A,0x5802,0x68E0,0x819B,0x5510,0x7CD6,/* 0xC0-0xC7 */ + 0x5018,0x8EBA,0x6DCC,0x8D9F,0x70EB,0x638F,0x6D9B,0x6ED4,/* 0xC8-0xCF */ + 0x7EE6,0x8404,0x6843,0x9003,0x6DD8,0x9676,0x8BA8,0x5957,/* 0xD0-0xD7 */ + 0x7279,0x85E4,0x817E,0x75BC,0x8A8A,0x68AF,0x5254,0x8E22,/* 0xD8-0xDF */ + 0x9511,0x63D0,0x9898,0x8E44,0x557C,0x4F53,0x66FF,0x568F,/* 0xE0-0xE7 */ + 0x60D5,0x6D95,0x5243,0x5C49,0x5929,0x6DFB,0x586B,0x7530,/* 0xE8-0xEF */ + 0x751C,0x606C,0x8214,0x8146,0x6311,0x6761,0x8FE2,0x773A,/* 0xF0-0xF7 */ + 0x8DF3,0x8D34,0x94C1,0x5E16,0x5385,0x542C,0x70C3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x866D,0x866F,0x8670,0x8672,0x8673,0x8674,0x8675,0x8676,/* 0x40-0x47 */ + 0x8677,0x8678,0x8683,0x8684,0x8685,0x8686,0x8687,0x8688,/* 0x48-0x4F */ + 0x8689,0x868E,0x868F,0x8690,0x8691,0x8692,0x8694,0x8696,/* 0x50-0x57 */ + 0x8697,0x8698,0x8699,0x869A,0x869B,0x869E,0x869F,0x86A0,/* 0x58-0x5F */ + 0x86A1,0x86A2,0x86A5,0x86A6,0x86AB,0x86AD,0x86AE,0x86B2,/* 0x60-0x67 */ + 0x86B3,0x86B7,0x86B8,0x86B9,0x86BB,0x86BC,0x86BD,0x86BE,/* 0x68-0x6F */ + 0x86BF,0x86C1,0x86C2,0x86C3,0x86C5,0x86C8,0x86CC,0x86CD,/* 0x70-0x77 */ + 0x86D2,0x86D3,0x86D5,0x86D6,0x86D7,0x86DA,0x86DC,0x0000,/* 0x78-0x7F */ + + 0x86DD,0x86E0,0x86E1,0x86E2,0x86E3,0x86E5,0x86E6,0x86E7,/* 0x80-0x87 */ + 0x86E8,0x86EA,0x86EB,0x86EC,0x86EF,0x86F5,0x86F6,0x86F7,/* 0x88-0x8F */ + 0x86FA,0x86FB,0x86FC,0x86FD,0x86FF,0x8701,0x8704,0x8705,/* 0x90-0x97 */ + 0x8706,0x870B,0x870C,0x870E,0x870F,0x8710,0x8711,0x8714,/* 0x98-0x9F */ + 0x8716,0x6C40,0x5EF7,0x505C,0x4EAD,0x5EAD,0x633A,0x8247,/* 0xA0-0xA7 */ + 0x901A,0x6850,0x916E,0x77B3,0x540C,0x94DC,0x5F64,0x7AE5,/* 0xA8-0xAF */ + 0x6876,0x6345,0x7B52,0x7EDF,0x75DB,0x5077,0x6295,0x5934,/* 0xB0-0xB7 */ + 0x900F,0x51F8,0x79C3,0x7A81,0x56FE,0x5F92,0x9014,0x6D82,/* 0xB8-0xBF */ + 0x5C60,0x571F,0x5410,0x5154,0x6E4D,0x56E2,0x63A8,0x9893,/* 0xC0-0xC7 */ + 0x817F,0x8715,0x892A,0x9000,0x541E,0x5C6F,0x81C0,0x62D6,/* 0xC8-0xCF */ + 0x6258,0x8131,0x9E35,0x9640,0x9A6E,0x9A7C,0x692D,0x59A5,/* 0xD0-0xD7 */ + 0x62D3,0x553E,0x6316,0x54C7,0x86D9,0x6D3C,0x5A03,0x74E6,/* 0xD8-0xDF */ + 0x889C,0x6B6A,0x5916,0x8C4C,0x5F2F,0x6E7E,0x73A9,0x987D,/* 0xE0-0xE7 */ + 0x4E38,0x70F7,0x5B8C,0x7897,0x633D,0x665A,0x7696,0x60CB,/* 0xE8-0xEF */ + 0x5B9B,0x5A49,0x4E07,0x8155,0x6C6A,0x738B,0x4EA1,0x6789,/* 0xF0-0xF7 */ + 0x7F51,0x5F80,0x65FA,0x671B,0x5FD8,0x5984,0x5A01,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8719,0x871B,0x871D,0x871F,0x8720,0x8724,0x8726,0x8727,/* 0x40-0x47 */ + 0x8728,0x872A,0x872B,0x872C,0x872D,0x872F,0x8730,0x8732,/* 0x48-0x4F */ + 0x8733,0x8735,0x8736,0x8738,0x8739,0x873A,0x873C,0x873D,/* 0x50-0x57 */ + 0x8740,0x8741,0x8742,0x8743,0x8744,0x8745,0x8746,0x874A,/* 0x58-0x5F */ + 0x874B,0x874D,0x874F,0x8750,0x8751,0x8752,0x8754,0x8755,/* 0x60-0x67 */ + 0x8756,0x8758,0x875A,0x875B,0x875C,0x875D,0x875E,0x875F,/* 0x68-0x6F */ + 0x8761,0x8762,0x8766,0x8767,0x8768,0x8769,0x876A,0x876B,/* 0x70-0x77 */ + 0x876C,0x876D,0x876F,0x8771,0x8772,0x8773,0x8775,0x0000,/* 0x78-0x7F */ + + 0x8777,0x8778,0x8779,0x877A,0x877F,0x8780,0x8781,0x8784,/* 0x80-0x87 */ + 0x8786,0x8787,0x8789,0x878A,0x878C,0x878E,0x878F,0x8790,/* 0x88-0x8F */ + 0x8791,0x8792,0x8794,0x8795,0x8796,0x8798,0x8799,0x879A,/* 0x90-0x97 */ + 0x879B,0x879C,0x879D,0x879E,0x87A0,0x87A1,0x87A2,0x87A3,/* 0x98-0x9F */ + 0x87A4,0x5DCD,0x5FAE,0x5371,0x97E6,0x8FDD,0x6845,0x56F4,/* 0xA0-0xA7 */ + 0x552F,0x60DF,0x4E3A,0x6F4D,0x7EF4,0x82C7,0x840E,0x59D4,/* 0xA8-0xAF */ + 0x4F1F,0x4F2A,0x5C3E,0x7EAC,0x672A,0x851A,0x5473,0x754F,/* 0xB0-0xB7 */ + 0x80C3,0x5582,0x9B4F,0x4F4D,0x6E2D,0x8C13,0x5C09,0x6170,/* 0xB8-0xBF */ + 0x536B,0x761F,0x6E29,0x868A,0x6587,0x95FB,0x7EB9,0x543B,/* 0xC0-0xC7 */ + 0x7A33,0x7D0A,0x95EE,0x55E1,0x7FC1,0x74EE,0x631D,0x8717,/* 0xC8-0xCF */ + 0x6DA1,0x7A9D,0x6211,0x65A1,0x5367,0x63E1,0x6C83,0x5DEB,/* 0xD0-0xD7 */ + 0x545C,0x94A8,0x4E4C,0x6C61,0x8BEC,0x5C4B,0x65E0,0x829C,/* 0xD8-0xDF */ + 0x68A7,0x543E,0x5434,0x6BCB,0x6B66,0x4E94,0x6342,0x5348,/* 0xE0-0xE7 */ + 0x821E,0x4F0D,0x4FAE,0x575E,0x620A,0x96FE,0x6664,0x7269,/* 0xE8-0xEF */ + 0x52FF,0x52A1,0x609F,0x8BEF,0x6614,0x7199,0x6790,0x897F,/* 0xF0-0xF7 */ + 0x7852,0x77FD,0x6670,0x563B,0x5438,0x9521,0x727A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x87A5,0x87A6,0x87A7,0x87A9,0x87AA,0x87AE,0x87B0,0x87B1,/* 0x40-0x47 */ + 0x87B2,0x87B4,0x87B6,0x87B7,0x87B8,0x87B9,0x87BB,0x87BC,/* 0x48-0x4F */ + 0x87BE,0x87BF,0x87C1,0x87C2,0x87C3,0x87C4,0x87C5,0x87C7,/* 0x50-0x57 */ + 0x87C8,0x87C9,0x87CC,0x87CD,0x87CE,0x87CF,0x87D0,0x87D4,/* 0x58-0x5F */ + 0x87D5,0x87D6,0x87D7,0x87D8,0x87D9,0x87DA,0x87DC,0x87DD,/* 0x60-0x67 */ + 0x87DE,0x87DF,0x87E1,0x87E2,0x87E3,0x87E4,0x87E6,0x87E7,/* 0x68-0x6F */ + 0x87E8,0x87E9,0x87EB,0x87EC,0x87ED,0x87EF,0x87F0,0x87F1,/* 0x70-0x77 */ + 0x87F2,0x87F3,0x87F4,0x87F5,0x87F6,0x87F7,0x87F8,0x0000,/* 0x78-0x7F */ + + 0x87FA,0x87FB,0x87FC,0x87FD,0x87FF,0x8800,0x8801,0x8802,/* 0x80-0x87 */ + 
0x8804,0x8805,0x8806,0x8807,0x8808,0x8809,0x880B,0x880C,/* 0x88-0x8F */ + 0x880D,0x880E,0x880F,0x8810,0x8811,0x8812,0x8814,0x8817,/* 0x90-0x97 */ + 0x8818,0x8819,0x881A,0x881C,0x881D,0x881E,0x881F,0x8820,/* 0x98-0x9F */ + 0x8823,0x7A00,0x606F,0x5E0C,0x6089,0x819D,0x5915,0x60DC,/* 0xA0-0xA7 */ + 0x7184,0x70EF,0x6EAA,0x6C50,0x7280,0x6A84,0x88AD,0x5E2D,/* 0xA8-0xAF */ + 0x4E60,0x5AB3,0x559C,0x94E3,0x6D17,0x7CFB,0x9699,0x620F,/* 0xB0-0xB7 */ + 0x7EC6,0x778E,0x867E,0x5323,0x971E,0x8F96,0x6687,0x5CE1,/* 0xB8-0xBF */ + 0x4FA0,0x72ED,0x4E0B,0x53A6,0x590F,0x5413,0x6380,0x9528,/* 0xC0-0xC7 */ + 0x5148,0x4ED9,0x9C9C,0x7EA4,0x54B8,0x8D24,0x8854,0x8237,/* 0xC8-0xCF */ + 0x95F2,0x6D8E,0x5F26,0x5ACC,0x663E,0x9669,0x73B0,0x732E,/* 0xD0-0xD7 */ + 0x53BF,0x817A,0x9985,0x7FA1,0x5BAA,0x9677,0x9650,0x7EBF,/* 0xD8-0xDF */ + 0x76F8,0x53A2,0x9576,0x9999,0x7BB1,0x8944,0x6E58,0x4E61,/* 0xE0-0xE7 */ + 0x7FD4,0x7965,0x8BE6,0x60F3,0x54CD,0x4EAB,0x9879,0x5DF7,/* 0xE8-0xEF */ + 0x6A61,0x50CF,0x5411,0x8C61,0x8427,0x785D,0x9704,0x524A,/* 0xF0-0xF7 */ + 0x54EE,0x56A3,0x9500,0x6D88,0x5BB5,0x6DC6,0x6653,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8824,0x8825,0x8826,0x8827,0x8828,0x8829,0x882A,0x882B,/* 0x40-0x47 */ + 0x882C,0x882D,0x882E,0x882F,0x8830,0x8831,0x8833,0x8834,/* 0x48-0x4F */ + 0x8835,0x8836,0x8837,0x8838,0x883A,0x883B,0x883D,0x883E,/* 0x50-0x57 */ + 0x883F,0x8841,0x8842,0x8843,0x8846,0x8847,0x8848,0x8849,/* 0x58-0x5F */ + 0x884A,0x884B,0x884E,0x884F,0x8850,0x8851,0x8852,0x8853,/* 0x60-0x67 */ + 0x8855,0x8856,0x8858,0x885A,0x885B,0x885C,0x885D,0x885E,/* 0x68-0x6F */ + 0x885F,0x8860,0x8866,0x8867,0x886A,0x886D,0x886F,0x8871,/* 0x70-0x77 */ + 0x8873,0x8874,0x8875,0x8876,0x8878,0x8879,0x887A,0x0000,/* 0x78-0x7F */ + + 0x887B,0x887C,0x8880,0x8883,0x8886,0x8887,0x8889,0x888A,/* 0x80-0x87 */ + 0x888C,0x888E,0x888F,0x8890,0x8891,0x8893,0x8894,0x8895,/* 0x88-0x8F */ + 0x8897,0x8898,0x8899,0x889A,0x889B,0x889D,0x889E,0x889F,/* 0x90-0x97 */ + 0x88A0,0x88A1,0x88A3,0x88A5,0x88A6,0x88A7,0x88A8,0x88A9,/* 0x98-0x9F */ + 0x88AA,0x5C0F,0x5B5D,0x6821,0x8096,0x5578,0x7B11,0x6548,/* 0xA0-0xA7 */ + 0x6954,0x4E9B,0x6B47,0x874E,0x978B,0x534F,0x631F,0x643A,/* 0xA8-0xAF */ + 0x90AA,0x659C,0x80C1,0x8C10,0x5199,0x68B0,0x5378,0x87F9,/* 0xB0-0xB7 */ + 0x61C8,0x6CC4,0x6CFB,0x8C22,0x5C51,0x85AA,0x82AF,0x950C,/* 0xB8-0xBF */ + 0x6B23,0x8F9B,0x65B0,0x5FFB,0x5FC3,0x4FE1,0x8845,0x661F,/* 0xC0-0xC7 */ + 0x8165,0x7329,0x60FA,0x5174,0x5211,0x578B,0x5F62,0x90A2,/* 0xC8-0xCF */ + 0x884C,0x9192,0x5E78,0x674F,0x6027,0x59D3,0x5144,0x51F6,/* 0xD0-0xD7 */ + 0x80F8,0x5308,0x6C79,0x96C4,0x718A,0x4F11,0x4FEE,0x7F9E,/* 0xD8-0xDF */ + 0x673D,0x55C5,0x9508,0x79C0,0x8896,0x7EE3,0x589F,0x620C,/* 0xE0-0xE7 */ + 0x9700,0x865A,0x5618,0x987B,0x5F90,0x8BB8,0x84C4,0x9157,/* 0xE8-0xEF */ + 0x53D9,0x65ED,0x5E8F,0x755C,0x6064,0x7D6E,0x5A7F,0x7EEA,/* 0xF0-0xF7 */ + 0x7EED,0x8F69,0x55A7,0x5BA3,0x60AC,0x65CB,0x7384,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t 
c2u_D1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x88AC,0x88AE,0x88AF,0x88B0,0x88B2,0x88B3,0x88B4,0x88B5,/* 0x40-0x47 */ + 0x88B6,0x88B8,0x88B9,0x88BA,0x88BB,0x88BD,0x88BE,0x88BF,/* 0x48-0x4F */ + 0x88C0,0x88C3,0x88C4,0x88C7,0x88C8,0x88CA,0x88CB,0x88CC,/* 0x50-0x57 */ + 0x88CD,0x88CF,0x88D0,0x88D1,0x88D3,0x88D6,0x88D7,0x88DA,/* 0x58-0x5F */ + 0x88DB,0x88DC,0x88DD,0x88DE,0x88E0,0x88E1,0x88E6,0x88E7,/* 0x60-0x67 */ + 0x88E9,0x88EA,0x88EB,0x88EC,0x88ED,0x88EE,0x88EF,0x88F2,/* 0x68-0x6F */ + 0x88F5,0x88F6,0x88F7,0x88FA,0x88FB,0x88FD,0x88FF,0x8900,/* 0x70-0x77 */ + 0x8901,0x8903,0x8904,0x8905,0x8906,0x8907,0x8908,0x0000,/* 0x78-0x7F */ + + 0x8909,0x890B,0x890C,0x890D,0x890E,0x890F,0x8911,0x8914,/* 0x80-0x87 */ + 0x8915,0x8916,0x8917,0x8918,0x891C,0x891D,0x891E,0x891F,/* 0x88-0x8F */ + 0x8920,0x8922,0x8923,0x8924,0x8926,0x8927,0x8928,0x8929,/* 0x90-0x97 */ + 0x892C,0x892D,0x892E,0x892F,0x8931,0x8932,0x8933,0x8935,/* 0x98-0x9F */ + 0x8937,0x9009,0x7663,0x7729,0x7EDA,0x9774,0x859B,0x5B66,/* 0xA0-0xA7 */ + 0x7A74,0x96EA,0x8840,0x52CB,0x718F,0x5FAA,0x65EC,0x8BE2,/* 0xA8-0xAF */ + 0x5BFB,0x9A6F,0x5DE1,0x6B89,0x6C5B,0x8BAD,0x8BAF,0x900A,/* 0xB0-0xB7 */ + 0x8FC5,0x538B,0x62BC,0x9E26,0x9E2D,0x5440,0x4E2B,0x82BD,/* 0xB8-0xBF */ + 0x7259,0x869C,0x5D16,0x8859,0x6DAF,0x96C5,0x54D1,0x4E9A,/* 0xC0-0xC7 */ + 0x8BB6,0x7109,0x54BD,0x9609,0x70DF,0x6DF9,0x76D0,0x4E25,/* 0xC8-0xCF */ + 0x7814,0x8712,0x5CA9,0x5EF6,0x8A00,0x989C,0x960E,0x708E,/* 0xD0-0xD7 */ + 0x6CBF,0x5944,0x63A9,0x773C,0x884D,0x6F14,0x8273,0x5830,/* 0xD8-0xDF */ + 0x71D5,0x538C,0x781A,0x96C1,0x5501,0x5F66,0x7130,0x5BB4,/* 0xE0-0xE7 */ + 0x8C1A,0x9A8C,0x6B83,0x592E,0x9E2F,0x79E7,0x6768,0x626C,/* 0xE8-0xEF */ + 0x4F6F,0x75A1,0x7F8A,0x6D0B,0x9633,0x6C27,0x4EF0,0x75D2,/* 0xF0-0xF7 */ + 0x517B,0x6837,0x6F3E,0x9080,0x8170,0x5996,0x7476,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8938,0x8939,0x893A,0x893B,0x893C,0x893D,0x893E,0x893F,/* 0x40-0x47 */ + 0x8940,0x8942,0x8943,0x8945,0x8946,0x8947,0x8948,0x8949,/* 0x48-0x4F */ + 0x894A,0x894B,0x894C,0x894D,0x894E,0x894F,0x8950,0x8951,/* 0x50-0x57 */ + 0x8952,0x8953,0x8954,0x8955,0x8956,0x8957,0x8958,0x8959,/* 0x58-0x5F */ + 0x895A,0x895B,0x895C,0x895D,0x8960,0x8961,0x8962,0x8963,/* 0x60-0x67 */ + 0x8964,0x8965,0x8967,0x8968,0x8969,0x896A,0x896B,0x896C,/* 0x68-0x6F */ + 0x896D,0x896E,0x896F,0x8970,0x8971,0x8972,0x8973,0x8974,/* 0x70-0x77 */ + 
0x8975,0x8976,0x8977,0x8978,0x8979,0x897A,0x897C,0x0000,/* 0x78-0x7F */ + + 0x897D,0x897E,0x8980,0x8982,0x8984,0x8985,0x8987,0x8988,/* 0x80-0x87 */ + 0x8989,0x898A,0x898B,0x898C,0x898D,0x898E,0x898F,0x8990,/* 0x88-0x8F */ + 0x8991,0x8992,0x8993,0x8994,0x8995,0x8996,0x8997,0x8998,/* 0x90-0x97 */ + 0x8999,0x899A,0x899B,0x899C,0x899D,0x899E,0x899F,0x89A0,/* 0x98-0x9F */ + 0x89A1,0x6447,0x5C27,0x9065,0x7A91,0x8C23,0x59DA,0x54AC,/* 0xA0-0xA7 */ + 0x8200,0x836F,0x8981,0x8000,0x6930,0x564E,0x8036,0x7237,/* 0xA8-0xAF */ + 0x91CE,0x51B6,0x4E5F,0x9875,0x6396,0x4E1A,0x53F6,0x66F3,/* 0xB0-0xB7 */ + 0x814B,0x591C,0x6DB2,0x4E00,0x58F9,0x533B,0x63D6,0x94F1,/* 0xB8-0xBF */ + 0x4F9D,0x4F0A,0x8863,0x9890,0x5937,0x9057,0x79FB,0x4EEA,/* 0xC0-0xC7 */ + 0x80F0,0x7591,0x6C82,0x5B9C,0x59E8,0x5F5D,0x6905,0x8681,/* 0xC8-0xCF */ + 0x501A,0x5DF2,0x4E59,0x77E3,0x4EE5,0x827A,0x6291,0x6613,/* 0xD0-0xD7 */ + 0x9091,0x5C79,0x4EBF,0x5F79,0x81C6,0x9038,0x8084,0x75AB,/* 0xD8-0xDF */ + 0x4EA6,0x88D4,0x610F,0x6BC5,0x5FC6,0x4E49,0x76CA,0x6EA2,/* 0xE0-0xE7 */ + 0x8BE3,0x8BAE,0x8C0A,0x8BD1,0x5F02,0x7FFC,0x7FCC,0x7ECE,/* 0xE8-0xEF */ + 0x8335,0x836B,0x56E0,0x6BB7,0x97F3,0x9634,0x59FB,0x541F,/* 0xF0-0xF7 */ + 0x94F6,0x6DEB,0x5BC5,0x996E,0x5C39,0x5F15,0x9690,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x89A2,0x89A3,0x89A4,0x89A5,0x89A6,0x89A7,0x89A8,0x89A9,/* 0x40-0x47 */ + 0x89AA,0x89AB,0x89AC,0x89AD,0x89AE,0x89AF,0x89B0,0x89B1,/* 0x48-0x4F */ + 0x89B2,0x89B3,0x89B4,0x89B5,0x89B6,0x89B7,0x89B8,0x89B9,/* 0x50-0x57 */ + 0x89BA,0x89BB,0x89BC,0x89BD,0x89BE,0x89BF,0x89C0,0x89C3,/* 0x58-0x5F */ + 0x89CD,0x89D3,0x89D4,0x89D5,0x89D7,0x89D8,0x89D9,0x89DB,/* 0x60-0x67 */ + 0x89DD,0x89DF,0x89E0,0x89E1,0x89E2,0x89E4,0x89E7,0x89E8,/* 0x68-0x6F */ + 0x89E9,0x89EA,0x89EC,0x89ED,0x89EE,0x89F0,0x89F1,0x89F2,/* 0x70-0x77 */ + 0x89F4,0x89F5,0x89F6,0x89F7,0x89F8,0x89F9,0x89FA,0x0000,/* 0x78-0x7F */ + + 0x89FB,0x89FC,0x89FD,0x89FE,0x89FF,0x8A01,0x8A02,0x8A03,/* 0x80-0x87 */ + 0x8A04,0x8A05,0x8A06,0x8A08,0x8A09,0x8A0A,0x8A0B,0x8A0C,/* 0x88-0x8F */ + 0x8A0D,0x8A0E,0x8A0F,0x8A10,0x8A11,0x8A12,0x8A13,0x8A14,/* 0x90-0x97 */ + 0x8A15,0x8A16,0x8A17,0x8A18,0x8A19,0x8A1A,0x8A1B,0x8A1C,/* 0x98-0x9F */ + 0x8A1D,0x5370,0x82F1,0x6A31,0x5A74,0x9E70,0x5E94,0x7F28,/* 0xA0-0xA7 */ + 0x83B9,0x8424,0x8425,0x8367,0x8747,0x8FCE,0x8D62,0x76C8,/* 0xA8-0xAF */ + 0x5F71,0x9896,0x786C,0x6620,0x54DF,0x62E5,0x4F63,0x81C3,/* 0xB0-0xB7 */ + 0x75C8,0x5EB8,0x96CD,0x8E0A,0x86F9,0x548F,0x6CF3,0x6D8C,/* 0xB8-0xBF */ + 0x6C38,0x607F,0x52C7,0x7528,0x5E7D,0x4F18,0x60A0,0x5FE7,/* 0xC0-0xC7 */ + 0x5C24,0x7531,0x90AE,0x94C0,0x72B9,0x6CB9,0x6E38,0x9149,/* 0xC8-0xCF */ + 0x6709,0x53CB,0x53F3,0x4F51,0x91C9,0x8BF1,0x53C8,0x5E7C,/* 0xD0-0xD7 */ + 0x8FC2,0x6DE4,0x4E8E,0x76C2,0x6986,0x865E,0x611A,0x8206,/* 0xD8-0xDF */ + 0x4F59,0x4FDE,0x903E,0x9C7C,0x6109,0x6E1D,0x6E14,0x9685,/* 0xE0-0xE7 */ + 0x4E88,0x5A31,0x96E8,0x4E0E,0x5C7F,0x79B9,0x5B87,0x8BED,/* 0xE8-0xEF */ + 
0x7FBD,0x7389,0x57DF,0x828B,0x90C1,0x5401,0x9047,0x55BB,/* 0xF0-0xF7 */ + 0x5CEA,0x5FA1,0x6108,0x6B32,0x72F1,0x80B2,0x8A89,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8A1E,0x8A1F,0x8A20,0x8A21,0x8A22,0x8A23,0x8A24,0x8A25,/* 0x40-0x47 */ + 0x8A26,0x8A27,0x8A28,0x8A29,0x8A2A,0x8A2B,0x8A2C,0x8A2D,/* 0x48-0x4F */ + 0x8A2E,0x8A2F,0x8A30,0x8A31,0x8A32,0x8A33,0x8A34,0x8A35,/* 0x50-0x57 */ + 0x8A36,0x8A37,0x8A38,0x8A39,0x8A3A,0x8A3B,0x8A3C,0x8A3D,/* 0x58-0x5F */ + 0x8A3F,0x8A40,0x8A41,0x8A42,0x8A43,0x8A44,0x8A45,0x8A46,/* 0x60-0x67 */ + 0x8A47,0x8A49,0x8A4A,0x8A4B,0x8A4C,0x8A4D,0x8A4E,0x8A4F,/* 0x68-0x6F */ + 0x8A50,0x8A51,0x8A52,0x8A53,0x8A54,0x8A55,0x8A56,0x8A57,/* 0x70-0x77 */ + 0x8A58,0x8A59,0x8A5A,0x8A5B,0x8A5C,0x8A5D,0x8A5E,0x0000,/* 0x78-0x7F */ + + 0x8A5F,0x8A60,0x8A61,0x8A62,0x8A63,0x8A64,0x8A65,0x8A66,/* 0x80-0x87 */ + 0x8A67,0x8A68,0x8A69,0x8A6A,0x8A6B,0x8A6C,0x8A6D,0x8A6E,/* 0x88-0x8F */ + 0x8A6F,0x8A70,0x8A71,0x8A72,0x8A73,0x8A74,0x8A75,0x8A76,/* 0x90-0x97 */ + 0x8A77,0x8A78,0x8A7A,0x8A7B,0x8A7C,0x8A7D,0x8A7E,0x8A7F,/* 0x98-0x9F */ + 0x8A80,0x6D74,0x5BD3,0x88D5,0x9884,0x8C6B,0x9A6D,0x9E33,/* 0xA0-0xA7 */ + 0x6E0A,0x51A4,0x5143,0x57A3,0x8881,0x539F,0x63F4,0x8F95,/* 0xA8-0xAF */ + 0x56ED,0x5458,0x5706,0x733F,0x6E90,0x7F18,0x8FDC,0x82D1,/* 0xB0-0xB7 */ + 0x613F,0x6028,0x9662,0x66F0,0x7EA6,0x8D8A,0x8DC3,0x94A5,/* 0xB8-0xBF */ + 0x5CB3,0x7CA4,0x6708,0x60A6,0x9605,0x8018,0x4E91,0x90E7,/* 0xC0-0xC7 */ + 0x5300,0x9668,0x5141,0x8FD0,0x8574,0x915D,0x6655,0x97F5,/* 0xC8-0xCF */ + 0x5B55,0x531D,0x7838,0x6742,0x683D,0x54C9,0x707E,0x5BB0,/* 0xD0-0xD7 */ + 0x8F7D,0x518D,0x5728,0x54B1,0x6512,0x6682,0x8D5E,0x8D43,/* 0xD8-0xDF */ + 0x810F,0x846C,0x906D,0x7CDF,0x51FF,0x85FB,0x67A3,0x65E9,/* 0xE0-0xE7 */ + 0x6FA1,0x86A4,0x8E81,0x566A,0x9020,0x7682,0x7076,0x71E5,/* 0xE8-0xEF */ + 0x8D23,0x62E9,0x5219,0x6CFD,0x8D3C,0x600E,0x589E,0x618E,/* 0xF0-0xF7 */ + 0x66FE,0x8D60,0x624E,0x55B3,0x6E23,0x672D,0x8F67,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8A81,0x8A82,0x8A83,0x8A84,0x8A85,0x8A86,0x8A87,0x8A88,/* 0x40-0x47 */ + 0x8A8B,0x8A8C,0x8A8D,0x8A8E,0x8A8F,0x8A90,0x8A91,0x8A92,/* 0x48-0x4F */ + 0x8A94,0x8A95,0x8A96,0x8A97,0x8A98,0x8A99,0x8A9A,0x8A9B,/* 0x50-0x57 */ + 0x8A9C,0x8A9D,0x8A9E,0x8A9F,0x8AA0,0x8AA1,0x8AA2,0x8AA3,/* 0x58-0x5F */ + 0x8AA4,0x8AA5,0x8AA6,0x8AA7,0x8AA8,0x8AA9,0x8AAA,0x8AAB,/* 
0x60-0x67 */ + 0x8AAC,0x8AAD,0x8AAE,0x8AAF,0x8AB0,0x8AB1,0x8AB2,0x8AB3,/* 0x68-0x6F */ + 0x8AB4,0x8AB5,0x8AB6,0x8AB7,0x8AB8,0x8AB9,0x8ABA,0x8ABB,/* 0x70-0x77 */ + 0x8ABC,0x8ABD,0x8ABE,0x8ABF,0x8AC0,0x8AC1,0x8AC2,0x0000,/* 0x78-0x7F */ + + 0x8AC3,0x8AC4,0x8AC5,0x8AC6,0x8AC7,0x8AC8,0x8AC9,0x8ACA,/* 0x80-0x87 */ + 0x8ACB,0x8ACC,0x8ACD,0x8ACE,0x8ACF,0x8AD0,0x8AD1,0x8AD2,/* 0x88-0x8F */ + 0x8AD3,0x8AD4,0x8AD5,0x8AD6,0x8AD7,0x8AD8,0x8AD9,0x8ADA,/* 0x90-0x97 */ + 0x8ADB,0x8ADC,0x8ADD,0x8ADE,0x8ADF,0x8AE0,0x8AE1,0x8AE2,/* 0x98-0x9F */ + 0x8AE3,0x94E1,0x95F8,0x7728,0x6805,0x69A8,0x548B,0x4E4D,/* 0xA0-0xA7 */ + 0x70B8,0x8BC8,0x6458,0x658B,0x5B85,0x7A84,0x503A,0x5BE8,/* 0xA8-0xAF */ + 0x77BB,0x6BE1,0x8A79,0x7C98,0x6CBE,0x76CF,0x65A9,0x8F97,/* 0xB0-0xB7 */ + 0x5D2D,0x5C55,0x8638,0x6808,0x5360,0x6218,0x7AD9,0x6E5B,/* 0xB8-0xBF */ + 0x7EFD,0x6A1F,0x7AE0,0x5F70,0x6F33,0x5F20,0x638C,0x6DA8,/* 0xC0-0xC7 */ + 0x6756,0x4E08,0x5E10,0x8D26,0x4ED7,0x80C0,0x7634,0x969C,/* 0xC8-0xCF */ + 0x62DB,0x662D,0x627E,0x6CBC,0x8D75,0x7167,0x7F69,0x5146,/* 0xD0-0xD7 */ + 0x8087,0x53EC,0x906E,0x6298,0x54F2,0x86F0,0x8F99,0x8005,/* 0xD8-0xDF */ + 0x9517,0x8517,0x8FD9,0x6D59,0x73CD,0x659F,0x771F,0x7504,/* 0xE0-0xE7 */ + 0x7827,0x81FB,0x8D1E,0x9488,0x4FA6,0x6795,0x75B9,0x8BCA,/* 0xE8-0xEF */ + 0x9707,0x632F,0x9547,0x9635,0x84B8,0x6323,0x7741,0x5F81,/* 0xF0-0xF7 */ + 0x72F0,0x4E89,0x6014,0x6574,0x62EF,0x6B63,0x653F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8AE4,0x8AE5,0x8AE6,0x8AE7,0x8AE8,0x8AE9,0x8AEA,0x8AEB,/* 0x40-0x47 */ + 0x8AEC,0x8AED,0x8AEE,0x8AEF,0x8AF0,0x8AF1,0x8AF2,0x8AF3,/* 0x48-0x4F */ + 0x8AF4,0x8AF5,0x8AF6,0x8AF7,0x8AF8,0x8AF9,0x8AFA,0x8AFB,/* 0x50-0x57 */ + 0x8AFC,0x8AFD,0x8AFE,0x8AFF,0x8B00,0x8B01,0x8B02,0x8B03,/* 0x58-0x5F */ + 0x8B04,0x8B05,0x8B06,0x8B08,0x8B09,0x8B0A,0x8B0B,0x8B0C,/* 0x60-0x67 */ + 0x8B0D,0x8B0E,0x8B0F,0x8B10,0x8B11,0x8B12,0x8B13,0x8B14,/* 0x68-0x6F */ + 0x8B15,0x8B16,0x8B17,0x8B18,0x8B19,0x8B1A,0x8B1B,0x8B1C,/* 0x70-0x77 */ + 0x8B1D,0x8B1E,0x8B1F,0x8B20,0x8B21,0x8B22,0x8B23,0x0000,/* 0x78-0x7F */ + + 0x8B24,0x8B25,0x8B27,0x8B28,0x8B29,0x8B2A,0x8B2B,0x8B2C,/* 0x80-0x87 */ + 0x8B2D,0x8B2E,0x8B2F,0x8B30,0x8B31,0x8B32,0x8B33,0x8B34,/* 0x88-0x8F */ + 0x8B35,0x8B36,0x8B37,0x8B38,0x8B39,0x8B3A,0x8B3B,0x8B3C,/* 0x90-0x97 */ + 0x8B3D,0x8B3E,0x8B3F,0x8B40,0x8B41,0x8B42,0x8B43,0x8B44,/* 0x98-0x9F */ + 0x8B45,0x5E27,0x75C7,0x90D1,0x8BC1,0x829D,0x679D,0x652F,/* 0xA0-0xA7 */ + 0x5431,0x8718,0x77E5,0x80A2,0x8102,0x6C41,0x4E4B,0x7EC7,/* 0xA8-0xAF */ + 0x804C,0x76F4,0x690D,0x6B96,0x6267,0x503C,0x4F84,0x5740,/* 0xB0-0xB7 */ + 0x6307,0x6B62,0x8DBE,0x53EA,0x65E8,0x7EB8,0x5FD7,0x631A,/* 0xB8-0xBF */ + 0x63B7,0x81F3,0x81F4,0x7F6E,0x5E1C,0x5CD9,0x5236,0x667A,/* 0xC0-0xC7 */ + 0x79E9,0x7A1A,0x8D28,0x7099,0x75D4,0x6EDE,0x6CBB,0x7A92,/* 0xC8-0xCF */ + 0x4E2D,0x76C5,0x5FE0,0x949F,0x8877,0x7EC8,0x79CD,0x80BF,/* 0xD0-0xD7 */ + 0x91CD,0x4EF2,0x4F17,0x821F,0x5468,0x5DDE,0x6D32,0x8BCC,/* 0xD8-0xDF */ + 
0x7CA5,0x8F74,0x8098,0x5E1A,0x5492,0x76B1,0x5B99,0x663C,/* 0xE0-0xE7 */ + 0x9AA4,0x73E0,0x682A,0x86DB,0x6731,0x732A,0x8BF8,0x8BDB,/* 0xE8-0xEF */ + 0x9010,0x7AF9,0x70DB,0x716E,0x62C4,0x77A9,0x5631,0x4E3B,/* 0xF0-0xF7 */ + 0x8457,0x67F1,0x52A9,0x86C0,0x8D2E,0x94F8,0x7B51,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B46,0x8B47,0x8B48,0x8B49,0x8B4A,0x8B4B,0x8B4C,0x8B4D,/* 0x40-0x47 */ + 0x8B4E,0x8B4F,0x8B50,0x8B51,0x8B52,0x8B53,0x8B54,0x8B55,/* 0x48-0x4F */ + 0x8B56,0x8B57,0x8B58,0x8B59,0x8B5A,0x8B5B,0x8B5C,0x8B5D,/* 0x50-0x57 */ + 0x8B5E,0x8B5F,0x8B60,0x8B61,0x8B62,0x8B63,0x8B64,0x8B65,/* 0x58-0x5F */ + 0x8B67,0x8B68,0x8B69,0x8B6A,0x8B6B,0x8B6D,0x8B6E,0x8B6F,/* 0x60-0x67 */ + 0x8B70,0x8B71,0x8B72,0x8B73,0x8B74,0x8B75,0x8B76,0x8B77,/* 0x68-0x6F */ + 0x8B78,0x8B79,0x8B7A,0x8B7B,0x8B7C,0x8B7D,0x8B7E,0x8B7F,/* 0x70-0x77 */ + 0x8B80,0x8B81,0x8B82,0x8B83,0x8B84,0x8B85,0x8B86,0x0000,/* 0x78-0x7F */ + + 0x8B87,0x8B88,0x8B89,0x8B8A,0x8B8B,0x8B8C,0x8B8D,0x8B8E,/* 0x80-0x87 */ + 0x8B8F,0x8B90,0x8B91,0x8B92,0x8B93,0x8B94,0x8B95,0x8B96,/* 0x88-0x8F */ + 0x8B97,0x8B98,0x8B99,0x8B9A,0x8B9B,0x8B9C,0x8B9D,0x8B9E,/* 0x90-0x97 */ + 0x8B9F,0x8BAC,0x8BB1,0x8BBB,0x8BC7,0x8BD0,0x8BEA,0x8C09,/* 0x98-0x9F */ + 0x8C1E,0x4F4F,0x6CE8,0x795D,0x9A7B,0x6293,0x722A,0x62FD,/* 0xA0-0xA7 */ + 0x4E13,0x7816,0x8F6C,0x64B0,0x8D5A,0x7BC6,0x6869,0x5E84,/* 0xA8-0xAF */ + 0x88C5,0x5986,0x649E,0x58EE,0x72B6,0x690E,0x9525,0x8FFD,/* 0xB0-0xB7 */ + 0x8D58,0x5760,0x7F00,0x8C06,0x51C6,0x6349,0x62D9,0x5353,/* 0xB8-0xBF */ + 0x684C,0x7422,0x8301,0x914C,0x5544,0x7740,0x707C,0x6D4A,/* 0xC0-0xC7 */ + 0x5179,0x54A8,0x8D44,0x59FF,0x6ECB,0x6DC4,0x5B5C,0x7D2B,/* 0xC8-0xCF */ + 0x4ED4,0x7C7D,0x6ED3,0x5B50,0x81EA,0x6E0D,0x5B57,0x9B03,/* 0xD0-0xD7 */ + 0x68D5,0x8E2A,0x5B97,0x7EFC,0x603B,0x7EB5,0x90B9,0x8D70,/* 0xD8-0xDF */ + 0x594F,0x63CD,0x79DF,0x8DB3,0x5352,0x65CF,0x7956,0x8BC5,/* 0xE0-0xE7 */ + 0x963B,0x7EC4,0x94BB,0x7E82,0x5634,0x9189,0x6700,0x7F6A,/* 0xE8-0xEF */ + 0x5C0A,0x9075,0x6628,0x5DE6,0x4F50,0x67DE,0x505A,0x4F5C,/* 0xF0-0xF7 */ + 0x5750,0x5EA7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8C38,0x8C39,0x8C3A,0x8C3B,0x8C3C,0x8C3D,0x8C3E,0x8C3F,/* 0x40-0x47 */ + 0x8C40,0x8C42,0x8C43,0x8C44,0x8C45,0x8C48,0x8C4A,0x8C4B,/* 0x48-0x4F */ + 0x8C4D,0x8C4E,0x8C4F,0x8C50,0x8C51,0x8C52,0x8C53,0x8C54,/* 
0x50-0x57 */ + 0x8C56,0x8C57,0x8C58,0x8C59,0x8C5B,0x8C5C,0x8C5D,0x8C5E,/* 0x58-0x5F */ + 0x8C5F,0x8C60,0x8C63,0x8C64,0x8C65,0x8C66,0x8C67,0x8C68,/* 0x60-0x67 */ + 0x8C69,0x8C6C,0x8C6D,0x8C6E,0x8C6F,0x8C70,0x8C71,0x8C72,/* 0x68-0x6F */ + 0x8C74,0x8C75,0x8C76,0x8C77,0x8C7B,0x8C7C,0x8C7D,0x8C7E,/* 0x70-0x77 */ + 0x8C7F,0x8C80,0x8C81,0x8C83,0x8C84,0x8C86,0x8C87,0x0000,/* 0x78-0x7F */ + + 0x8C88,0x8C8B,0x8C8D,0x8C8E,0x8C8F,0x8C90,0x8C91,0x8C92,/* 0x80-0x87 */ + 0x8C93,0x8C95,0x8C96,0x8C97,0x8C99,0x8C9A,0x8C9B,0x8C9C,/* 0x88-0x8F */ + 0x8C9D,0x8C9E,0x8C9F,0x8CA0,0x8CA1,0x8CA2,0x8CA3,0x8CA4,/* 0x90-0x97 */ + 0x8CA5,0x8CA6,0x8CA7,0x8CA8,0x8CA9,0x8CAA,0x8CAB,0x8CAC,/* 0x98-0x9F */ + 0x8CAD,0x4E8D,0x4E0C,0x5140,0x4E10,0x5EFF,0x5345,0x4E15,/* 0xA0-0xA7 */ + 0x4E98,0x4E1E,0x9B32,0x5B6C,0x5669,0x4E28,0x79BA,0x4E3F,/* 0xA8-0xAF */ + 0x5315,0x4E47,0x592D,0x723B,0x536E,0x6C10,0x56DF,0x80E4,/* 0xB0-0xB7 */ + 0x9997,0x6BD3,0x777E,0x9F17,0x4E36,0x4E9F,0x9F10,0x4E5C,/* 0xB8-0xBF */ + 0x4E69,0x4E93,0x8288,0x5B5B,0x556C,0x560F,0x4EC4,0x538D,/* 0xC0-0xC7 */ + 0x539D,0x53A3,0x53A5,0x53AE,0x9765,0x8D5D,0x531A,0x53F5,/* 0xC8-0xCF */ + 0x5326,0x532E,0x533E,0x8D5C,0x5366,0x5363,0x5202,0x5208,/* 0xD0-0xD7 */ + 0x520E,0x522D,0x5233,0x523F,0x5240,0x524C,0x525E,0x5261,/* 0xD8-0xDF */ + 0x525C,0x84AF,0x527D,0x5282,0x5281,0x5290,0x5293,0x5182,/* 0xE0-0xE7 */ + 0x7F54,0x4EBB,0x4EC3,0x4EC9,0x4EC2,0x4EE8,0x4EE1,0x4EEB,/* 0xE8-0xEF */ + 0x4EDE,0x4F1B,0x4EF3,0x4F22,0x4F64,0x4EF5,0x4F25,0x4F27,/* 0xF0-0xF7 */ + 0x4F09,0x4F2B,0x4F5E,0x4F67,0x6538,0x4F5A,0x4F5D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8CAE,0x8CAF,0x8CB0,0x8CB1,0x8CB2,0x8CB3,0x8CB4,0x8CB5,/* 0x40-0x47 */ + 0x8CB6,0x8CB7,0x8CB8,0x8CB9,0x8CBA,0x8CBB,0x8CBC,0x8CBD,/* 0x48-0x4F */ + 0x8CBE,0x8CBF,0x8CC0,0x8CC1,0x8CC2,0x8CC3,0x8CC4,0x8CC5,/* 0x50-0x57 */ + 0x8CC6,0x8CC7,0x8CC8,0x8CC9,0x8CCA,0x8CCB,0x8CCC,0x8CCD,/* 0x58-0x5F */ + 0x8CCE,0x8CCF,0x8CD0,0x8CD1,0x8CD2,0x8CD3,0x8CD4,0x8CD5,/* 0x60-0x67 */ + 0x8CD6,0x8CD7,0x8CD8,0x8CD9,0x8CDA,0x8CDB,0x8CDC,0x8CDD,/* 0x68-0x6F */ + 0x8CDE,0x8CDF,0x8CE0,0x8CE1,0x8CE2,0x8CE3,0x8CE4,0x8CE5,/* 0x70-0x77 */ + 0x8CE6,0x8CE7,0x8CE8,0x8CE9,0x8CEA,0x8CEB,0x8CEC,0x0000,/* 0x78-0x7F */ + + 0x8CED,0x8CEE,0x8CEF,0x8CF0,0x8CF1,0x8CF2,0x8CF3,0x8CF4,/* 0x80-0x87 */ + 0x8CF5,0x8CF6,0x8CF7,0x8CF8,0x8CF9,0x8CFA,0x8CFB,0x8CFC,/* 0x88-0x8F */ + 0x8CFD,0x8CFE,0x8CFF,0x8D00,0x8D01,0x8D02,0x8D03,0x8D04,/* 0x90-0x97 */ + 0x8D05,0x8D06,0x8D07,0x8D08,0x8D09,0x8D0A,0x8D0B,0x8D0C,/* 0x98-0x9F */ + 0x8D0D,0x4F5F,0x4F57,0x4F32,0x4F3D,0x4F76,0x4F74,0x4F91,/* 0xA0-0xA7 */ + 0x4F89,0x4F83,0x4F8F,0x4F7E,0x4F7B,0x4FAA,0x4F7C,0x4FAC,/* 0xA8-0xAF */ + 0x4F94,0x4FE6,0x4FE8,0x4FEA,0x4FC5,0x4FDA,0x4FE3,0x4FDC,/* 0xB0-0xB7 */ + 0x4FD1,0x4FDF,0x4FF8,0x5029,0x504C,0x4FF3,0x502C,0x500F,/* 0xB8-0xBF */ + 0x502E,0x502D,0x4FFE,0x501C,0x500C,0x5025,0x5028,0x507E,/* 0xC0-0xC7 */ + 0x5043,0x5055,0x5048,0x504E,0x506C,0x507B,0x50A5,0x50A7,/* 0xC8-0xCF */ + 
0x50A9,0x50BA,0x50D6,0x5106,0x50ED,0x50EC,0x50E6,0x50EE,/* 0xD0-0xD7 */ + 0x5107,0x510B,0x4EDD,0x6C3D,0x4F58,0x4F65,0x4FCE,0x9FA0,/* 0xD8-0xDF */ + 0x6C46,0x7C74,0x516E,0x5DFD,0x9EC9,0x9998,0x5181,0x5914,/* 0xE0-0xE7 */ + 0x52F9,0x530D,0x8A07,0x5310,0x51EB,0x5919,0x5155,0x4EA0,/* 0xE8-0xEF */ + 0x5156,0x4EB3,0x886E,0x88A4,0x4EB5,0x8114,0x88D2,0x7980,/* 0xF0-0xF7 */ + 0x5B34,0x8803,0x7FB8,0x51AB,0x51B1,0x51BD,0x51BC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8D0E,0x8D0F,0x8D10,0x8D11,0x8D12,0x8D13,0x8D14,0x8D15,/* 0x40-0x47 */ + 0x8D16,0x8D17,0x8D18,0x8D19,0x8D1A,0x8D1B,0x8D1C,0x8D20,/* 0x48-0x4F */ + 0x8D51,0x8D52,0x8D57,0x8D5F,0x8D65,0x8D68,0x8D69,0x8D6A,/* 0x50-0x57 */ + 0x8D6C,0x8D6E,0x8D6F,0x8D71,0x8D72,0x8D78,0x8D79,0x8D7A,/* 0x58-0x5F */ + 0x8D7B,0x8D7C,0x8D7D,0x8D7E,0x8D7F,0x8D80,0x8D82,0x8D83,/* 0x60-0x67 */ + 0x8D86,0x8D87,0x8D88,0x8D89,0x8D8C,0x8D8D,0x8D8E,0x8D8F,/* 0x68-0x6F */ + 0x8D90,0x8D92,0x8D93,0x8D95,0x8D96,0x8D97,0x8D98,0x8D99,/* 0x70-0x77 */ + 0x8D9A,0x8D9B,0x8D9C,0x8D9D,0x8D9E,0x8DA0,0x8DA1,0x0000,/* 0x78-0x7F */ + + 0x8DA2,0x8DA4,0x8DA5,0x8DA6,0x8DA7,0x8DA8,0x8DA9,0x8DAA,/* 0x80-0x87 */ + 0x8DAB,0x8DAC,0x8DAD,0x8DAE,0x8DAF,0x8DB0,0x8DB2,0x8DB6,/* 0x88-0x8F */ + 0x8DB7,0x8DB9,0x8DBB,0x8DBD,0x8DC0,0x8DC1,0x8DC2,0x8DC5,/* 0x90-0x97 */ + 0x8DC7,0x8DC8,0x8DC9,0x8DCA,0x8DCD,0x8DD0,0x8DD2,0x8DD3,/* 0x98-0x9F */ + 0x8DD4,0x51C7,0x5196,0x51A2,0x51A5,0x8BA0,0x8BA6,0x8BA7,/* 0xA0-0xA7 */ + 0x8BAA,0x8BB4,0x8BB5,0x8BB7,0x8BC2,0x8BC3,0x8BCB,0x8BCF,/* 0xA8-0xAF */ + 0x8BCE,0x8BD2,0x8BD3,0x8BD4,0x8BD6,0x8BD8,0x8BD9,0x8BDC,/* 0xB0-0xB7 */ + 0x8BDF,0x8BE0,0x8BE4,0x8BE8,0x8BE9,0x8BEE,0x8BF0,0x8BF3,/* 0xB8-0xBF */ + 0x8BF6,0x8BF9,0x8BFC,0x8BFF,0x8C00,0x8C02,0x8C04,0x8C07,/* 0xC0-0xC7 */ + 0x8C0C,0x8C0F,0x8C11,0x8C12,0x8C14,0x8C15,0x8C16,0x8C19,/* 0xC8-0xCF */ + 0x8C1B,0x8C18,0x8C1D,0x8C1F,0x8C20,0x8C21,0x8C25,0x8C27,/* 0xD0-0xD7 */ + 0x8C2A,0x8C2B,0x8C2E,0x8C2F,0x8C32,0x8C33,0x8C35,0x8C36,/* 0xD8-0xDF */ + 0x5369,0x537A,0x961D,0x9622,0x9621,0x9631,0x962A,0x963D,/* 0xE0-0xE7 */ + 0x963C,0x9642,0x9649,0x9654,0x965F,0x9667,0x966C,0x9672,/* 0xE8-0xEF */ + 0x9674,0x9688,0x968D,0x9697,0x96B0,0x9097,0x909B,0x909D,/* 0xF0-0xF7 */ + 0x9099,0x90AC,0x90A1,0x90B4,0x90B3,0x90B6,0x90BA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8DD5,0x8DD8,0x8DD9,0x8DDC,0x8DE0,0x8DE1,0x8DE2,0x8DE5,/* 
0x40-0x47 */ + 0x8DE6,0x8DE7,0x8DE9,0x8DED,0x8DEE,0x8DF0,0x8DF1,0x8DF2,/* 0x48-0x4F */ + 0x8DF4,0x8DF6,0x8DFC,0x8DFE,0x8DFF,0x8E00,0x8E01,0x8E02,/* 0x50-0x57 */ + 0x8E03,0x8E04,0x8E06,0x8E07,0x8E08,0x8E0B,0x8E0D,0x8E0E,/* 0x58-0x5F */ + 0x8E10,0x8E11,0x8E12,0x8E13,0x8E15,0x8E16,0x8E17,0x8E18,/* 0x60-0x67 */ + 0x8E19,0x8E1A,0x8E1B,0x8E1C,0x8E20,0x8E21,0x8E24,0x8E25,/* 0x68-0x6F */ + 0x8E26,0x8E27,0x8E28,0x8E2B,0x8E2D,0x8E30,0x8E32,0x8E33,/* 0x70-0x77 */ + 0x8E34,0x8E36,0x8E37,0x8E38,0x8E3B,0x8E3C,0x8E3E,0x0000,/* 0x78-0x7F */ + + 0x8E3F,0x8E43,0x8E45,0x8E46,0x8E4C,0x8E4D,0x8E4E,0x8E4F,/* 0x80-0x87 */ + 0x8E50,0x8E53,0x8E54,0x8E55,0x8E56,0x8E57,0x8E58,0x8E5A,/* 0x88-0x8F */ + 0x8E5B,0x8E5C,0x8E5D,0x8E5E,0x8E5F,0x8E60,0x8E61,0x8E62,/* 0x90-0x97 */ + 0x8E63,0x8E64,0x8E65,0x8E67,0x8E68,0x8E6A,0x8E6B,0x8E6E,/* 0x98-0x9F */ + 0x8E71,0x90B8,0x90B0,0x90CF,0x90C5,0x90BE,0x90D0,0x90C4,/* 0xA0-0xA7 */ + 0x90C7,0x90D3,0x90E6,0x90E2,0x90DC,0x90D7,0x90DB,0x90EB,/* 0xA8-0xAF */ + 0x90EF,0x90FE,0x9104,0x9122,0x911E,0x9123,0x9131,0x912F,/* 0xB0-0xB7 */ + 0x9139,0x9143,0x9146,0x520D,0x5942,0x52A2,0x52AC,0x52AD,/* 0xB8-0xBF */ + 0x52BE,0x54FF,0x52D0,0x52D6,0x52F0,0x53DF,0x71EE,0x77CD,/* 0xC0-0xC7 */ + 0x5EF4,0x51F5,0x51FC,0x9B2F,0x53B6,0x5F01,0x755A,0x5DEF,/* 0xC8-0xCF */ + 0x574C,0x57A9,0x57A1,0x587E,0x58BC,0x58C5,0x58D1,0x5729,/* 0xD0-0xD7 */ + 0x572C,0x572A,0x5733,0x5739,0x572E,0x572F,0x575C,0x573B,/* 0xD8-0xDF */ + 0x5742,0x5769,0x5785,0x576B,0x5786,0x577C,0x577B,0x5768,/* 0xE0-0xE7 */ + 0x576D,0x5776,0x5773,0x57AD,0x57A4,0x578C,0x57B2,0x57CF,/* 0xE8-0xEF */ + 0x57A7,0x57B4,0x5793,0x57A0,0x57D5,0x57D8,0x57DA,0x57D9,/* 0xF0-0xF7 */ + 0x57D2,0x57B8,0x57F4,0x57EF,0x57F8,0x57E4,0x57DD,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E73,0x8E75,0x8E77,0x8E78,0x8E79,0x8E7A,0x8E7B,0x8E7D,/* 0x40-0x47 */ + 0x8E7E,0x8E80,0x8E82,0x8E83,0x8E84,0x8E86,0x8E88,0x8E89,/* 0x48-0x4F */ + 0x8E8A,0x8E8B,0x8E8C,0x8E8D,0x8E8E,0x8E91,0x8E92,0x8E93,/* 0x50-0x57 */ + 0x8E95,0x8E96,0x8E97,0x8E98,0x8E99,0x8E9A,0x8E9B,0x8E9D,/* 0x58-0x5F */ + 0x8E9F,0x8EA0,0x8EA1,0x8EA2,0x8EA3,0x8EA4,0x8EA5,0x8EA6,/* 0x60-0x67 */ + 0x8EA7,0x8EA8,0x8EA9,0x8EAA,0x8EAD,0x8EAE,0x8EB0,0x8EB1,/* 0x68-0x6F */ + 0x8EB3,0x8EB4,0x8EB5,0x8EB6,0x8EB7,0x8EB8,0x8EB9,0x8EBB,/* 0x70-0x77 */ + 0x8EBC,0x8EBD,0x8EBE,0x8EBF,0x8EC0,0x8EC1,0x8EC2,0x0000,/* 0x78-0x7F */ + + 0x8EC3,0x8EC4,0x8EC5,0x8EC6,0x8EC7,0x8EC8,0x8EC9,0x8ECA,/* 0x80-0x87 */ + 0x8ECB,0x8ECC,0x8ECD,0x8ECF,0x8ED0,0x8ED1,0x8ED2,0x8ED3,/* 0x88-0x8F */ + 0x8ED4,0x8ED5,0x8ED6,0x8ED7,0x8ED8,0x8ED9,0x8EDA,0x8EDB,/* 0x90-0x97 */ + 0x8EDC,0x8EDD,0x8EDE,0x8EDF,0x8EE0,0x8EE1,0x8EE2,0x8EE3,/* 0x98-0x9F */ + 0x8EE4,0x580B,0x580D,0x57FD,0x57ED,0x5800,0x581E,0x5819,/* 0xA0-0xA7 */ + 0x5844,0x5820,0x5865,0x586C,0x5881,0x5889,0x589A,0x5880,/* 0xA8-0xAF */ + 0x99A8,0x9F19,0x61FF,0x8279,0x827D,0x827F,0x828F,0x828A,/* 0xB0-0xB7 */ + 0x82A8,0x8284,0x828E,0x8291,0x8297,0x8299,0x82AB,0x82B8,/* 0xB8-0xBF */ + 
0x82BE,0x82B0,0x82C8,0x82CA,0x82E3,0x8298,0x82B7,0x82AE,/* 0xC0-0xC7 */ + 0x82CB,0x82CC,0x82C1,0x82A9,0x82B4,0x82A1,0x82AA,0x829F,/* 0xC8-0xCF */ + 0x82C4,0x82CE,0x82A4,0x82E1,0x8309,0x82F7,0x82E4,0x830F,/* 0xD0-0xD7 */ + 0x8307,0x82DC,0x82F4,0x82D2,0x82D8,0x830C,0x82FB,0x82D3,/* 0xD8-0xDF */ + 0x8311,0x831A,0x8306,0x8314,0x8315,0x82E0,0x82D5,0x831C,/* 0xE0-0xE7 */ + 0x8351,0x835B,0x835C,0x8308,0x8392,0x833C,0x8334,0x8331,/* 0xE8-0xEF */ + 0x839B,0x835E,0x832F,0x834F,0x8347,0x8343,0x835F,0x8340,/* 0xF0-0xF7 */ + 0x8317,0x8360,0x832D,0x833A,0x8333,0x8366,0x8365,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8EE5,0x8EE6,0x8EE7,0x8EE8,0x8EE9,0x8EEA,0x8EEB,0x8EEC,/* 0x40-0x47 */ + 0x8EED,0x8EEE,0x8EEF,0x8EF0,0x8EF1,0x8EF2,0x8EF3,0x8EF4,/* 0x48-0x4F */ + 0x8EF5,0x8EF6,0x8EF7,0x8EF8,0x8EF9,0x8EFA,0x8EFB,0x8EFC,/* 0x50-0x57 */ + 0x8EFD,0x8EFE,0x8EFF,0x8F00,0x8F01,0x8F02,0x8F03,0x8F04,/* 0x58-0x5F */ + 0x8F05,0x8F06,0x8F07,0x8F08,0x8F09,0x8F0A,0x8F0B,0x8F0C,/* 0x60-0x67 */ + 0x8F0D,0x8F0E,0x8F0F,0x8F10,0x8F11,0x8F12,0x8F13,0x8F14,/* 0x68-0x6F */ + 0x8F15,0x8F16,0x8F17,0x8F18,0x8F19,0x8F1A,0x8F1B,0x8F1C,/* 0x70-0x77 */ + 0x8F1D,0x8F1E,0x8F1F,0x8F20,0x8F21,0x8F22,0x8F23,0x0000,/* 0x78-0x7F */ + + 0x8F24,0x8F25,0x8F26,0x8F27,0x8F28,0x8F29,0x8F2A,0x8F2B,/* 0x80-0x87 */ + 0x8F2C,0x8F2D,0x8F2E,0x8F2F,0x8F30,0x8F31,0x8F32,0x8F33,/* 0x88-0x8F */ + 0x8F34,0x8F35,0x8F36,0x8F37,0x8F38,0x8F39,0x8F3A,0x8F3B,/* 0x90-0x97 */ + 0x8F3C,0x8F3D,0x8F3E,0x8F3F,0x8F40,0x8F41,0x8F42,0x8F43,/* 0x98-0x9F */ + 0x8F44,0x8368,0x831B,0x8369,0x836C,0x836A,0x836D,0x836E,/* 0xA0-0xA7 */ + 0x83B0,0x8378,0x83B3,0x83B4,0x83A0,0x83AA,0x8393,0x839C,/* 0xA8-0xAF */ + 0x8385,0x837C,0x83B6,0x83A9,0x837D,0x83B8,0x837B,0x8398,/* 0xB0-0xB7 */ + 0x839E,0x83A8,0x83BA,0x83BC,0x83C1,0x8401,0x83E5,0x83D8,/* 0xB8-0xBF */ + 0x5807,0x8418,0x840B,0x83DD,0x83FD,0x83D6,0x841C,0x8438,/* 0xC0-0xC7 */ + 0x8411,0x8406,0x83D4,0x83DF,0x840F,0x8403,0x83F8,0x83F9,/* 0xC8-0xCF */ + 0x83EA,0x83C5,0x83C0,0x8426,0x83F0,0x83E1,0x845C,0x8451,/* 0xD0-0xD7 */ + 0x845A,0x8459,0x8473,0x8487,0x8488,0x847A,0x8489,0x8478,/* 0xD8-0xDF */ + 0x843C,0x8446,0x8469,0x8476,0x848C,0x848E,0x8431,0x846D,/* 0xE0-0xE7 */ + 0x84C1,0x84CD,0x84D0,0x84E6,0x84BD,0x84D3,0x84CA,0x84BF,/* 0xE8-0xEF */ + 0x84BA,0x84E0,0x84A1,0x84B9,0x84B4,0x8497,0x84E5,0x84E3,/* 0xF0-0xF7 */ + 0x850C,0x750D,0x8538,0x84F0,0x8539,0x851F,0x853A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8F45,0x8F46,0x8F47,0x8F48,0x8F49,0x8F4A,0x8F4B,0x8F4C,/* 0x40-0x47 */ + 0x8F4D,0x8F4E,0x8F4F,0x8F50,0x8F51,0x8F52,0x8F53,0x8F54,/* 0x48-0x4F */ + 0x8F55,0x8F56,0x8F57,0x8F58,0x8F59,0x8F5A,0x8F5B,0x8F5C,/* 0x50-0x57 */ + 0x8F5D,0x8F5E,0x8F5F,0x8F60,0x8F61,0x8F62,0x8F63,0x8F64,/* 0x58-0x5F */ + 0x8F65,0x8F6A,0x8F80,0x8F8C,0x8F92,0x8F9D,0x8FA0,0x8FA1,/* 0x60-0x67 */ + 0x8FA2,0x8FA4,0x8FA5,0x8FA6,0x8FA7,0x8FAA,0x8FAC,0x8FAD,/* 0x68-0x6F */ + 0x8FAE,0x8FAF,0x8FB2,0x8FB3,0x8FB4,0x8FB5,0x8FB7,0x8FB8,/* 0x70-0x77 */ + 0x8FBA,0x8FBB,0x8FBC,0x8FBF,0x8FC0,0x8FC3,0x8FC6,0x0000,/* 0x78-0x7F */ + + 0x8FC9,0x8FCA,0x8FCB,0x8FCC,0x8FCD,0x8FCF,0x8FD2,0x8FD6,/* 0x80-0x87 */ + 0x8FD7,0x8FDA,0x8FE0,0x8FE1,0x8FE3,0x8FE7,0x8FEC,0x8FEF,/* 0x88-0x8F */ + 0x8FF1,0x8FF2,0x8FF4,0x8FF5,0x8FF6,0x8FFA,0x8FFB,0x8FFC,/* 0x90-0x97 */ + 0x8FFE,0x8FFF,0x9007,0x9008,0x900C,0x900E,0x9013,0x9015,/* 0x98-0x9F */ + 0x9018,0x8556,0x853B,0x84FF,0x84FC,0x8559,0x8548,0x8568,/* 0xA0-0xA7 */ + 0x8564,0x855E,0x857A,0x77A2,0x8543,0x8572,0x857B,0x85A4,/* 0xA8-0xAF */ + 0x85A8,0x8587,0x858F,0x8579,0x85AE,0x859C,0x8585,0x85B9,/* 0xB0-0xB7 */ + 0x85B7,0x85B0,0x85D3,0x85C1,0x85DC,0x85FF,0x8627,0x8605,/* 0xB8-0xBF */ + 0x8629,0x8616,0x863C,0x5EFE,0x5F08,0x593C,0x5941,0x8037,/* 0xC0-0xC7 */ + 0x5955,0x595A,0x5958,0x530F,0x5C22,0x5C25,0x5C2C,0x5C34,/* 0xC8-0xCF */ + 0x624C,0x626A,0x629F,0x62BB,0x62CA,0x62DA,0x62D7,0x62EE,/* 0xD0-0xD7 */ + 0x6322,0x62F6,0x6339,0x634B,0x6343,0x63AD,0x63F6,0x6371,/* 0xD8-0xDF */ + 0x637A,0x638E,0x63B4,0x636D,0x63AC,0x638A,0x6369,0x63AE,/* 0xE0-0xE7 */ + 0x63BC,0x63F2,0x63F8,0x63E0,0x63FF,0x63C4,0x63DE,0x63CE,/* 0xE8-0xEF */ + 0x6452,0x63C6,0x63BE,0x6445,0x6441,0x640B,0x641B,0x6420,/* 0xF0-0xF7 */ + 0x640C,0x6426,0x6421,0x645E,0x6484,0x646D,0x6496,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9019,0x901C,0x9023,0x9024,0x9025,0x9027,0x9028,0x9029,/* 0x40-0x47 */ + 0x902A,0x902B,0x902C,0x9030,0x9031,0x9032,0x9033,0x9034,/* 0x48-0x4F */ + 0x9037,0x9039,0x903A,0x903D,0x903F,0x9040,0x9043,0x9045,/* 0x50-0x57 */ + 0x9046,0x9048,0x9049,0x904A,0x904B,0x904C,0x904E,0x9054,/* 0x58-0x5F */ + 0x9055,0x9056,0x9059,0x905A,0x905C,0x905D,0x905E,0x905F,/* 0x60-0x67 */ + 0x9060,0x9061,0x9064,0x9066,0x9067,0x9069,0x906A,0x906B,/* 0x68-0x6F */ + 0x906C,0x906F,0x9070,0x9071,0x9072,0x9073,0x9076,0x9077,/* 0x70-0x77 */ + 0x9078,0x9079,0x907A,0x907B,0x907C,0x907E,0x9081,0x0000,/* 0x78-0x7F */ + + 0x9084,0x9085,0x9086,0x9087,0x9089,0x908A,0x908C,0x908D,/* 0x80-0x87 */ + 0x908E,0x908F,0x9090,0x9092,0x9094,0x9096,0x9098,0x909A,/* 0x88-0x8F */ + 0x909C,0x909E,0x909F,0x90A0,0x90A4,0x90A5,0x90A7,0x90A8,/* 0x90-0x97 */ + 0x90A9,0x90AB,0x90AD,0x90B2,0x90B7,0x90BC,0x90BD,0x90BF,/* 0x98-0x9F */ + 0x90C0,0x647A,0x64B7,0x64B8,0x6499,0x64BA,0x64C0,0x64D0,/* 0xA0-0xA7 */ + 0x64D7,0x64E4,0x64E2,0x6509,0x6525,0x652E,0x5F0B,0x5FD2,/* 0xA8-0xAF */ + 
0x7519,0x5F11,0x535F,0x53F1,0x53FD,0x53E9,0x53E8,0x53FB,/* 0xB0-0xB7 */ + 0x5412,0x5416,0x5406,0x544B,0x5452,0x5453,0x5454,0x5456,/* 0xB8-0xBF */ + 0x5443,0x5421,0x5457,0x5459,0x5423,0x5432,0x5482,0x5494,/* 0xC0-0xC7 */ + 0x5477,0x5471,0x5464,0x549A,0x549B,0x5484,0x5476,0x5466,/* 0xC8-0xCF */ + 0x549D,0x54D0,0x54AD,0x54C2,0x54B4,0x54D2,0x54A7,0x54A6,/* 0xD0-0xD7 */ + 0x54D3,0x54D4,0x5472,0x54A3,0x54D5,0x54BB,0x54BF,0x54CC,/* 0xD8-0xDF */ + 0x54D9,0x54DA,0x54DC,0x54A9,0x54AA,0x54A4,0x54DD,0x54CF,/* 0xE0-0xE7 */ + 0x54DE,0x551B,0x54E7,0x5520,0x54FD,0x5514,0x54F3,0x5522,/* 0xE8-0xEF */ + 0x5523,0x550F,0x5511,0x5527,0x552A,0x5567,0x558F,0x55B5,/* 0xF0-0xF7 */ + 0x5549,0x556D,0x5541,0x5555,0x553F,0x5550,0x553C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x90C2,0x90C3,0x90C6,0x90C8,0x90C9,0x90CB,0x90CC,0x90CD,/* 0x40-0x47 */ + 0x90D2,0x90D4,0x90D5,0x90D6,0x90D8,0x90D9,0x90DA,0x90DE,/* 0x48-0x4F */ + 0x90DF,0x90E0,0x90E3,0x90E4,0x90E5,0x90E9,0x90EA,0x90EC,/* 0x50-0x57 */ + 0x90EE,0x90F0,0x90F1,0x90F2,0x90F3,0x90F5,0x90F6,0x90F7,/* 0x58-0x5F */ + 0x90F9,0x90FA,0x90FB,0x90FC,0x90FF,0x9100,0x9101,0x9103,/* 0x60-0x67 */ + 0x9105,0x9106,0x9107,0x9108,0x9109,0x910A,0x910B,0x910C,/* 0x68-0x6F */ + 0x910D,0x910E,0x910F,0x9110,0x9111,0x9112,0x9113,0x9114,/* 0x70-0x77 */ + 0x9115,0x9116,0x9117,0x9118,0x911A,0x911B,0x911C,0x0000,/* 0x78-0x7F */ + + 0x911D,0x911F,0x9120,0x9121,0x9124,0x9125,0x9126,0x9127,/* 0x80-0x87 */ + 0x9128,0x9129,0x912A,0x912B,0x912C,0x912D,0x912E,0x9130,/* 0x88-0x8F */ + 0x9132,0x9133,0x9134,0x9135,0x9136,0x9137,0x9138,0x913A,/* 0x90-0x97 */ + 0x913B,0x913C,0x913D,0x913E,0x913F,0x9140,0x9141,0x9142,/* 0x98-0x9F */ + 0x9144,0x5537,0x5556,0x5575,0x5576,0x5577,0x5533,0x5530,/* 0xA0-0xA7 */ + 0x555C,0x558B,0x55D2,0x5583,0x55B1,0x55B9,0x5588,0x5581,/* 0xA8-0xAF */ + 0x559F,0x557E,0x55D6,0x5591,0x557B,0x55DF,0x55BD,0x55BE,/* 0xB0-0xB7 */ + 0x5594,0x5599,0x55EA,0x55F7,0x55C9,0x561F,0x55D1,0x55EB,/* 0xB8-0xBF */ + 0x55EC,0x55D4,0x55E6,0x55DD,0x55C4,0x55EF,0x55E5,0x55F2,/* 0xC0-0xC7 */ + 0x55F3,0x55CC,0x55CD,0x55E8,0x55F5,0x55E4,0x8F94,0x561E,/* 0xC8-0xCF */ + 0x5608,0x560C,0x5601,0x5624,0x5623,0x55FE,0x5600,0x5627,/* 0xD0-0xD7 */ + 0x562D,0x5658,0x5639,0x5657,0x562C,0x564D,0x5662,0x5659,/* 0xD8-0xDF */ + 0x565C,0x564C,0x5654,0x5686,0x5664,0x5671,0x566B,0x567B,/* 0xE0-0xE7 */ + 0x567C,0x5685,0x5693,0x56AF,0x56D4,0x56D7,0x56DD,0x56E1,/* 0xE8-0xEF */ + 0x56F5,0x56EB,0x56F9,0x56FF,0x5704,0x570A,0x5709,0x571C,/* 0xF0-0xF7 */ + 0x5E0F,0x5E19,0x5E14,0x5E11,0x5E31,0x5E3B,0x5E3C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9145,0x9147,0x9148,0x9151,0x9153,0x9154,0x9155,0x9156,/* 0x40-0x47 */ + 0x9158,0x9159,0x915B,0x915C,0x915F,0x9160,0x9166,0x9167,/* 0x48-0x4F */ + 0x9168,0x916B,0x916D,0x9173,0x917A,0x917B,0x917C,0x9180,/* 0x50-0x57 */ + 0x9181,0x9182,0x9183,0x9184,0x9186,0x9188,0x918A,0x918E,/* 0x58-0x5F */ + 0x918F,0x9193,0x9194,0x9195,0x9196,0x9197,0x9198,0x9199,/* 0x60-0x67 */ + 0x919C,0x919D,0x919E,0x919F,0x91A0,0x91A1,0x91A4,0x91A5,/* 0x68-0x6F */ + 0x91A6,0x91A7,0x91A8,0x91A9,0x91AB,0x91AC,0x91B0,0x91B1,/* 0x70-0x77 */ + 0x91B2,0x91B3,0x91B6,0x91B7,0x91B8,0x91B9,0x91BB,0x0000,/* 0x78-0x7F */ + + 0x91BC,0x91BD,0x91BE,0x91BF,0x91C0,0x91C1,0x91C2,0x91C3,/* 0x80-0x87 */ + 0x91C4,0x91C5,0x91C6,0x91C8,0x91CB,0x91D0,0x91D2,0x91D3,/* 0x88-0x8F */ + 0x91D4,0x91D5,0x91D6,0x91D7,0x91D8,0x91D9,0x91DA,0x91DB,/* 0x90-0x97 */ + 0x91DD,0x91DE,0x91DF,0x91E0,0x91E1,0x91E2,0x91E3,0x91E4,/* 0x98-0x9F */ + 0x91E5,0x5E37,0x5E44,0x5E54,0x5E5B,0x5E5E,0x5E61,0x5C8C,/* 0xA0-0xA7 */ + 0x5C7A,0x5C8D,0x5C90,0x5C96,0x5C88,0x5C98,0x5C99,0x5C91,/* 0xA8-0xAF */ + 0x5C9A,0x5C9C,0x5CB5,0x5CA2,0x5CBD,0x5CAC,0x5CAB,0x5CB1,/* 0xB0-0xB7 */ + 0x5CA3,0x5CC1,0x5CB7,0x5CC4,0x5CD2,0x5CE4,0x5CCB,0x5CE5,/* 0xB8-0xBF */ + 0x5D02,0x5D03,0x5D27,0x5D26,0x5D2E,0x5D24,0x5D1E,0x5D06,/* 0xC0-0xC7 */ + 0x5D1B,0x5D58,0x5D3E,0x5D34,0x5D3D,0x5D6C,0x5D5B,0x5D6F,/* 0xC8-0xCF */ + 0x5D5D,0x5D6B,0x5D4B,0x5D4A,0x5D69,0x5D74,0x5D82,0x5D99,/* 0xD0-0xD7 */ + 0x5D9D,0x8C73,0x5DB7,0x5DC5,0x5F73,0x5F77,0x5F82,0x5F87,/* 0xD8-0xDF */ + 0x5F89,0x5F8C,0x5F95,0x5F99,0x5F9C,0x5FA8,0x5FAD,0x5FB5,/* 0xE0-0xE7 */ + 0x5FBC,0x8862,0x5F61,0x72AD,0x72B0,0x72B4,0x72B7,0x72B8,/* 0xE8-0xEF */ + 0x72C3,0x72C1,0x72CE,0x72CD,0x72D2,0x72E8,0x72EF,0x72E9,/* 0xF0-0xF7 */ + 0x72F2,0x72F4,0x72F7,0x7301,0x72F3,0x7303,0x72FA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x91E6,0x91E7,0x91E8,0x91E9,0x91EA,0x91EB,0x91EC,0x91ED,/* 0x40-0x47 */ + 0x91EE,0x91EF,0x91F0,0x91F1,0x91F2,0x91F3,0x91F4,0x91F5,/* 0x48-0x4F */ + 0x91F6,0x91F7,0x91F8,0x91F9,0x91FA,0x91FB,0x91FC,0x91FD,/* 0x50-0x57 */ + 0x91FE,0x91FF,0x9200,0x9201,0x9202,0x9203,0x9204,0x9205,/* 0x58-0x5F */ + 0x9206,0x9207,0x9208,0x9209,0x920A,0x920B,0x920C,0x920D,/* 0x60-0x67 */ + 0x920E,0x920F,0x9210,0x9211,0x9212,0x9213,0x9214,0x9215,/* 0x68-0x6F */ + 0x9216,0x9217,0x9218,0x9219,0x921A,0x921B,0x921C,0x921D,/* 0x70-0x77 */ + 0x921E,0x921F,0x9220,0x9221,0x9222,0x9223,0x9224,0x0000,/* 0x78-0x7F */ + + 0x9225,0x9226,0x9227,0x9228,0x9229,0x922A,0x922B,0x922C,/* 0x80-0x87 */ + 0x922D,0x922E,0x922F,0x9230,0x9231,0x9232,0x9233,0x9234,/* 0x88-0x8F */ + 0x9235,0x9236,0x9237,0x9238,0x9239,0x923A,0x923B,0x923C,/* 0x90-0x97 */ + 0x923D,0x923E,0x923F,0x9240,0x9241,0x9242,0x9243,0x9244,/* 0x98-0x9F */ + 
0x9245,0x72FB,0x7317,0x7313,0x7321,0x730A,0x731E,0x731D,/* 0xA0-0xA7 */ + 0x7315,0x7322,0x7339,0x7325,0x732C,0x7338,0x7331,0x7350,/* 0xA8-0xAF */ + 0x734D,0x7357,0x7360,0x736C,0x736F,0x737E,0x821B,0x5925,/* 0xB0-0xB7 */ + 0x98E7,0x5924,0x5902,0x9963,0x9967,0x9968,0x9969,0x996A,/* 0xB8-0xBF */ + 0x996B,0x996C,0x9974,0x9977,0x997D,0x9980,0x9984,0x9987,/* 0xC0-0xC7 */ + 0x998A,0x998D,0x9990,0x9991,0x9993,0x9994,0x9995,0x5E80,/* 0xC8-0xCF */ + 0x5E91,0x5E8B,0x5E96,0x5EA5,0x5EA0,0x5EB9,0x5EB5,0x5EBE,/* 0xD0-0xD7 */ + 0x5EB3,0x8D53,0x5ED2,0x5ED1,0x5EDB,0x5EE8,0x5EEA,0x81BA,/* 0xD8-0xDF */ + 0x5FC4,0x5FC9,0x5FD6,0x5FCF,0x6003,0x5FEE,0x6004,0x5FE1,/* 0xE0-0xE7 */ + 0x5FE4,0x5FFE,0x6005,0x6006,0x5FEA,0x5FED,0x5FF8,0x6019,/* 0xE8-0xEF */ + 0x6035,0x6026,0x601B,0x600F,0x600D,0x6029,0x602B,0x600A,/* 0xF0-0xF7 */ + 0x603F,0x6021,0x6078,0x6079,0x607B,0x607A,0x6042,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9246,0x9247,0x9248,0x9249,0x924A,0x924B,0x924C,0x924D,/* 0x40-0x47 */ + 0x924E,0x924F,0x9250,0x9251,0x9252,0x9253,0x9254,0x9255,/* 0x48-0x4F */ + 0x9256,0x9257,0x9258,0x9259,0x925A,0x925B,0x925C,0x925D,/* 0x50-0x57 */ + 0x925E,0x925F,0x9260,0x9261,0x9262,0x9263,0x9264,0x9265,/* 0x58-0x5F */ + 0x9266,0x9267,0x9268,0x9269,0x926A,0x926B,0x926C,0x926D,/* 0x60-0x67 */ + 0x926E,0x926F,0x9270,0x9271,0x9272,0x9273,0x9275,0x9276,/* 0x68-0x6F */ + 0x9277,0x9278,0x9279,0x927A,0x927B,0x927C,0x927D,0x927E,/* 0x70-0x77 */ + 0x927F,0x9280,0x9281,0x9282,0x9283,0x9284,0x9285,0x0000,/* 0x78-0x7F */ + + 0x9286,0x9287,0x9288,0x9289,0x928A,0x928B,0x928C,0x928D,/* 0x80-0x87 */ + 0x928F,0x9290,0x9291,0x9292,0x9293,0x9294,0x9295,0x9296,/* 0x88-0x8F */ + 0x9297,0x9298,0x9299,0x929A,0x929B,0x929C,0x929D,0x929E,/* 0x90-0x97 */ + 0x929F,0x92A0,0x92A1,0x92A2,0x92A3,0x92A4,0x92A5,0x92A6,/* 0x98-0x9F */ + 0x92A7,0x606A,0x607D,0x6096,0x609A,0x60AD,0x609D,0x6083,/* 0xA0-0xA7 */ + 0x6092,0x608C,0x609B,0x60EC,0x60BB,0x60B1,0x60DD,0x60D8,/* 0xA8-0xAF */ + 0x60C6,0x60DA,0x60B4,0x6120,0x6126,0x6115,0x6123,0x60F4,/* 0xB0-0xB7 */ + 0x6100,0x610E,0x612B,0x614A,0x6175,0x61AC,0x6194,0x61A7,/* 0xB8-0xBF */ + 0x61B7,0x61D4,0x61F5,0x5FDD,0x96B3,0x95E9,0x95EB,0x95F1,/* 0xC0-0xC7 */ + 0x95F3,0x95F5,0x95F6,0x95FC,0x95FE,0x9603,0x9604,0x9606,/* 0xC8-0xCF */ + 0x9608,0x960A,0x960B,0x960C,0x960D,0x960F,0x9612,0x9615,/* 0xD0-0xD7 */ + 0x9616,0x9617,0x9619,0x961A,0x4E2C,0x723F,0x6215,0x6C35,/* 0xD8-0xDF */ + 0x6C54,0x6C5C,0x6C4A,0x6CA3,0x6C85,0x6C90,0x6C94,0x6C8C,/* 0xE0-0xE7 */ + 0x6C68,0x6C69,0x6C74,0x6C76,0x6C86,0x6CA9,0x6CD0,0x6CD4,/* 0xE8-0xEF */ + 0x6CAD,0x6CF7,0x6CF8,0x6CF1,0x6CD7,0x6CB2,0x6CE0,0x6CD6,/* 0xF0-0xF7 */ + 0x6CFA,0x6CEB,0x6CEE,0x6CB1,0x6CD3,0x6CEF,0x6CFE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x92A8,0x92A9,0x92AA,0x92AB,0x92AC,0x92AD,0x92AF,0x92B0,/* 0x40-0x47 */ + 0x92B1,0x92B2,0x92B3,0x92B4,0x92B5,0x92B6,0x92B7,0x92B8,/* 0x48-0x4F */ + 0x92B9,0x92BA,0x92BB,0x92BC,0x92BD,0x92BE,0x92BF,0x92C0,/* 0x50-0x57 */ + 0x92C1,0x92C2,0x92C3,0x92C4,0x92C5,0x92C6,0x92C7,0x92C9,/* 0x58-0x5F */ + 0x92CA,0x92CB,0x92CC,0x92CD,0x92CE,0x92CF,0x92D0,0x92D1,/* 0x60-0x67 */ + 0x92D2,0x92D3,0x92D4,0x92D5,0x92D6,0x92D7,0x92D8,0x92D9,/* 0x68-0x6F */ + 0x92DA,0x92DB,0x92DC,0x92DD,0x92DE,0x92DF,0x92E0,0x92E1,/* 0x70-0x77 */ + 0x92E2,0x92E3,0x92E4,0x92E5,0x92E6,0x92E7,0x92E8,0x0000,/* 0x78-0x7F */ + + 0x92E9,0x92EA,0x92EB,0x92EC,0x92ED,0x92EE,0x92EF,0x92F0,/* 0x80-0x87 */ + 0x92F1,0x92F2,0x92F3,0x92F4,0x92F5,0x92F6,0x92F7,0x92F8,/* 0x88-0x8F */ + 0x92F9,0x92FA,0x92FB,0x92FC,0x92FD,0x92FE,0x92FF,0x9300,/* 0x90-0x97 */ + 0x9301,0x9302,0x9303,0x9304,0x9305,0x9306,0x9307,0x9308,/* 0x98-0x9F */ + 0x9309,0x6D39,0x6D27,0x6D0C,0x6D43,0x6D48,0x6D07,0x6D04,/* 0xA0-0xA7 */ + 0x6D19,0x6D0E,0x6D2B,0x6D4D,0x6D2E,0x6D35,0x6D1A,0x6D4F,/* 0xA8-0xAF */ + 0x6D52,0x6D54,0x6D33,0x6D91,0x6D6F,0x6D9E,0x6DA0,0x6D5E,/* 0xB0-0xB7 */ + 0x6D93,0x6D94,0x6D5C,0x6D60,0x6D7C,0x6D63,0x6E1A,0x6DC7,/* 0xB8-0xBF */ + 0x6DC5,0x6DDE,0x6E0E,0x6DBF,0x6DE0,0x6E11,0x6DE6,0x6DDD,/* 0xC0-0xC7 */ + 0x6DD9,0x6E16,0x6DAB,0x6E0C,0x6DAE,0x6E2B,0x6E6E,0x6E4E,/* 0xC8-0xCF */ + 0x6E6B,0x6EB2,0x6E5F,0x6E86,0x6E53,0x6E54,0x6E32,0x6E25,/* 0xD0-0xD7 */ + 0x6E44,0x6EDF,0x6EB1,0x6E98,0x6EE0,0x6F2D,0x6EE2,0x6EA5,/* 0xD8-0xDF */ + 0x6EA7,0x6EBD,0x6EBB,0x6EB7,0x6ED7,0x6EB4,0x6ECF,0x6E8F,/* 0xE0-0xE7 */ + 0x6EC2,0x6E9F,0x6F62,0x6F46,0x6F47,0x6F24,0x6F15,0x6EF9,/* 0xE8-0xEF */ + 0x6F2F,0x6F36,0x6F4B,0x6F74,0x6F2A,0x6F09,0x6F29,0x6F89,/* 0xF0-0xF7 */ + 0x6F8D,0x6F8C,0x6F78,0x6F72,0x6F7C,0x6F7A,0x6FD1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x930A,0x930B,0x930C,0x930D,0x930E,0x930F,0x9310,0x9311,/* 0x40-0x47 */ + 0x9312,0x9313,0x9314,0x9315,0x9316,0x9317,0x9318,0x9319,/* 0x48-0x4F */ + 0x931A,0x931B,0x931C,0x931D,0x931E,0x931F,0x9320,0x9321,/* 0x50-0x57 */ + 0x9322,0x9323,0x9324,0x9325,0x9326,0x9327,0x9328,0x9329,/* 0x58-0x5F */ + 0x932A,0x932B,0x932C,0x932D,0x932E,0x932F,0x9330,0x9331,/* 0x60-0x67 */ + 0x9332,0x9333,0x9334,0x9335,0x9336,0x9337,0x9338,0x9339,/* 0x68-0x6F */ + 0x933A,0x933B,0x933C,0x933D,0x933F,0x9340,0x9341,0x9342,/* 0x70-0x77 */ + 0x9343,0x9344,0x9345,0x9346,0x9347,0x9348,0x9349,0x0000,/* 0x78-0x7F */ + + 0x934A,0x934B,0x934C,0x934D,0x934E,0x934F,0x9350,0x9351,/* 0x80-0x87 */ + 0x9352,0x9353,0x9354,0x9355,0x9356,0x9357,0x9358,0x9359,/* 0x88-0x8F */ + 
0x935A,0x935B,0x935C,0x935D,0x935E,0x935F,0x9360,0x9361,/* 0x90-0x97 */ + 0x9362,0x9363,0x9364,0x9365,0x9366,0x9367,0x9368,0x9369,/* 0x98-0x9F */ + 0x936B,0x6FC9,0x6FA7,0x6FB9,0x6FB6,0x6FC2,0x6FE1,0x6FEE,/* 0xA0-0xA7 */ + 0x6FDE,0x6FE0,0x6FEF,0x701A,0x7023,0x701B,0x7039,0x7035,/* 0xA8-0xAF */ + 0x704F,0x705E,0x5B80,0x5B84,0x5B95,0x5B93,0x5BA5,0x5BB8,/* 0xB0-0xB7 */ + 0x752F,0x9A9E,0x6434,0x5BE4,0x5BEE,0x8930,0x5BF0,0x8E47,/* 0xB8-0xBF */ + 0x8B07,0x8FB6,0x8FD3,0x8FD5,0x8FE5,0x8FEE,0x8FE4,0x8FE9,/* 0xC0-0xC7 */ + 0x8FE6,0x8FF3,0x8FE8,0x9005,0x9004,0x900B,0x9026,0x9011,/* 0xC8-0xCF */ + 0x900D,0x9016,0x9021,0x9035,0x9036,0x902D,0x902F,0x9044,/* 0xD0-0xD7 */ + 0x9051,0x9052,0x9050,0x9068,0x9058,0x9062,0x905B,0x66B9,/* 0xD8-0xDF */ + 0x9074,0x907D,0x9082,0x9088,0x9083,0x908B,0x5F50,0x5F57,/* 0xE0-0xE7 */ + 0x5F56,0x5F58,0x5C3B,0x54AB,0x5C50,0x5C59,0x5B71,0x5C63,/* 0xE8-0xEF */ + 0x5C66,0x7FBC,0x5F2A,0x5F29,0x5F2D,0x8274,0x5F3C,0x9B3B,/* 0xF0-0xF7 */ + 0x5C6E,0x5981,0x5983,0x598D,0x59A9,0x59AA,0x59A3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x936C,0x936D,0x936E,0x936F,0x9370,0x9371,0x9372,0x9373,/* 0x40-0x47 */ + 0x9374,0x9375,0x9376,0x9377,0x9378,0x9379,0x937A,0x937B,/* 0x48-0x4F */ + 0x937C,0x937D,0x937E,0x937F,0x9380,0x9381,0x9382,0x9383,/* 0x50-0x57 */ + 0x9384,0x9385,0x9386,0x9387,0x9388,0x9389,0x938A,0x938B,/* 0x58-0x5F */ + 0x938C,0x938D,0x938E,0x9390,0x9391,0x9392,0x9393,0x9394,/* 0x60-0x67 */ + 0x9395,0x9396,0x9397,0x9398,0x9399,0x939A,0x939B,0x939C,/* 0x68-0x6F */ + 0x939D,0x939E,0x939F,0x93A0,0x93A1,0x93A2,0x93A3,0x93A4,/* 0x70-0x77 */ + 0x93A5,0x93A6,0x93A7,0x93A8,0x93A9,0x93AA,0x93AB,0x0000,/* 0x78-0x7F */ + + 0x93AC,0x93AD,0x93AE,0x93AF,0x93B0,0x93B1,0x93B2,0x93B3,/* 0x80-0x87 */ + 0x93B4,0x93B5,0x93B6,0x93B7,0x93B8,0x93B9,0x93BA,0x93BB,/* 0x88-0x8F */ + 0x93BC,0x93BD,0x93BE,0x93BF,0x93C0,0x93C1,0x93C2,0x93C3,/* 0x90-0x97 */ + 0x93C4,0x93C5,0x93C6,0x93C7,0x93C8,0x93C9,0x93CB,0x93CC,/* 0x98-0x9F */ + 0x93CD,0x5997,0x59CA,0x59AB,0x599E,0x59A4,0x59D2,0x59B2,/* 0xA0-0xA7 */ + 0x59AF,0x59D7,0x59BE,0x5A05,0x5A06,0x59DD,0x5A08,0x59E3,/* 0xA8-0xAF */ + 0x59D8,0x59F9,0x5A0C,0x5A09,0x5A32,0x5A34,0x5A11,0x5A23,/* 0xB0-0xB7 */ + 0x5A13,0x5A40,0x5A67,0x5A4A,0x5A55,0x5A3C,0x5A62,0x5A75,/* 0xB8-0xBF */ + 0x80EC,0x5AAA,0x5A9B,0x5A77,0x5A7A,0x5ABE,0x5AEB,0x5AB2,/* 0xC0-0xC7 */ + 0x5AD2,0x5AD4,0x5AB8,0x5AE0,0x5AE3,0x5AF1,0x5AD6,0x5AE6,/* 0xC8-0xCF */ + 0x5AD8,0x5ADC,0x5B09,0x5B17,0x5B16,0x5B32,0x5B37,0x5B40,/* 0xD0-0xD7 */ + 0x5C15,0x5C1C,0x5B5A,0x5B65,0x5B73,0x5B51,0x5B53,0x5B62,/* 0xD8-0xDF */ + 0x9A75,0x9A77,0x9A78,0x9A7A,0x9A7F,0x9A7D,0x9A80,0x9A81,/* 0xE0-0xE7 */ + 0x9A85,0x9A88,0x9A8A,0x9A90,0x9A92,0x9A93,0x9A96,0x9A98,/* 0xE8-0xEF */ + 0x9A9B,0x9A9C,0x9A9D,0x9A9F,0x9AA0,0x9AA2,0x9AA3,0x9AA5,/* 0xF0-0xF7 */ + 0x9AA7,0x7E9F,0x7EA1,0x7EA3,0x7EA5,0x7EA8,0x7EA9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x93CE,0x93CF,0x93D0,0x93D1,0x93D2,0x93D3,0x93D4,0x93D5,/* 0x40-0x47 */ + 0x93D7,0x93D8,0x93D9,0x93DA,0x93DB,0x93DC,0x93DD,0x93DE,/* 0x48-0x4F */ + 0x93DF,0x93E0,0x93E1,0x93E2,0x93E3,0x93E4,0x93E5,0x93E6,/* 0x50-0x57 */ + 0x93E7,0x93E8,0x93E9,0x93EA,0x93EB,0x93EC,0x93ED,0x93EE,/* 0x58-0x5F */ + 0x93EF,0x93F0,0x93F1,0x93F2,0x93F3,0x93F4,0x93F5,0x93F6,/* 0x60-0x67 */ + 0x93F7,0x93F8,0x93F9,0x93FA,0x93FB,0x93FC,0x93FD,0x93FE,/* 0x68-0x6F */ + 0x93FF,0x9400,0x9401,0x9402,0x9403,0x9404,0x9405,0x9406,/* 0x70-0x77 */ + 0x9407,0x9408,0x9409,0x940A,0x940B,0x940C,0x940D,0x0000,/* 0x78-0x7F */ + + 0x940E,0x940F,0x9410,0x9411,0x9412,0x9413,0x9414,0x9415,/* 0x80-0x87 */ + 0x9416,0x9417,0x9418,0x9419,0x941A,0x941B,0x941C,0x941D,/* 0x88-0x8F */ + 0x941E,0x941F,0x9420,0x9421,0x9422,0x9423,0x9424,0x9425,/* 0x90-0x97 */ + 0x9426,0x9427,0x9428,0x9429,0x942A,0x942B,0x942C,0x942D,/* 0x98-0x9F */ + 0x942E,0x7EAD,0x7EB0,0x7EBE,0x7EC0,0x7EC1,0x7EC2,0x7EC9,/* 0xA0-0xA7 */ + 0x7ECB,0x7ECC,0x7ED0,0x7ED4,0x7ED7,0x7EDB,0x7EE0,0x7EE1,/* 0xA8-0xAF */ + 0x7EE8,0x7EEB,0x7EEE,0x7EEF,0x7EF1,0x7EF2,0x7F0D,0x7EF6,/* 0xB0-0xB7 */ + 0x7EFA,0x7EFB,0x7EFE,0x7F01,0x7F02,0x7F03,0x7F07,0x7F08,/* 0xB8-0xBF */ + 0x7F0B,0x7F0C,0x7F0F,0x7F11,0x7F12,0x7F17,0x7F19,0x7F1C,/* 0xC0-0xC7 */ + 0x7F1B,0x7F1F,0x7F21,0x7F22,0x7F23,0x7F24,0x7F25,0x7F26,/* 0xC8-0xCF */ + 0x7F27,0x7F2A,0x7F2B,0x7F2C,0x7F2D,0x7F2F,0x7F30,0x7F31,/* 0xD0-0xD7 */ + 0x7F32,0x7F33,0x7F35,0x5E7A,0x757F,0x5DDB,0x753E,0x9095,/* 0xD8-0xDF */ + 0x738E,0x7391,0x73AE,0x73A2,0x739F,0x73CF,0x73C2,0x73D1,/* 0xE0-0xE7 */ + 0x73B7,0x73B3,0x73C0,0x73C9,0x73C8,0x73E5,0x73D9,0x987C,/* 0xE8-0xEF */ + 0x740A,0x73E9,0x73E7,0x73DE,0x73BA,0x73F2,0x740F,0x742A,/* 0xF0-0xF7 */ + 0x745B,0x7426,0x7425,0x7428,0x7430,0x742E,0x742C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x942F,0x9430,0x9431,0x9432,0x9433,0x9434,0x9435,0x9436,/* 0x40-0x47 */ + 0x9437,0x9438,0x9439,0x943A,0x943B,0x943C,0x943D,0x943F,/* 0x48-0x4F */ + 0x9440,0x9441,0x9442,0x9443,0x9444,0x9445,0x9446,0x9447,/* 0x50-0x57 */ + 0x9448,0x9449,0x944A,0x944B,0x944C,0x944D,0x944E,0x944F,/* 0x58-0x5F */ + 0x9450,0x9451,0x9452,0x9453,0x9454,0x9455,0x9456,0x9457,/* 0x60-0x67 */ + 0x9458,0x9459,0x945A,0x945B,0x945C,0x945D,0x945E,0x945F,/* 0x68-0x6F */ + 0x9460,0x9461,0x9462,0x9463,0x9464,0x9465,0x9466,0x9467,/* 0x70-0x77 */ + 0x9468,0x9469,0x946A,0x946C,0x946D,0x946E,0x946F,0x0000,/* 0x78-0x7F */ + + 
0x9470,0x9471,0x9472,0x9473,0x9474,0x9475,0x9476,0x9477,/* 0x80-0x87 */ + 0x9478,0x9479,0x947A,0x947B,0x947C,0x947D,0x947E,0x947F,/* 0x88-0x8F */ + 0x9480,0x9481,0x9482,0x9483,0x9484,0x9491,0x9496,0x9498,/* 0x90-0x97 */ + 0x94C7,0x94CF,0x94D3,0x94D4,0x94DA,0x94E6,0x94FB,0x951C,/* 0x98-0x9F */ + 0x9520,0x741B,0x741A,0x7441,0x745C,0x7457,0x7455,0x7459,/* 0xA0-0xA7 */ + 0x7477,0x746D,0x747E,0x749C,0x748E,0x7480,0x7481,0x7487,/* 0xA8-0xAF */ + 0x748B,0x749E,0x74A8,0x74A9,0x7490,0x74A7,0x74D2,0x74BA,/* 0xB0-0xB7 */ + 0x97EA,0x97EB,0x97EC,0x674C,0x6753,0x675E,0x6748,0x6769,/* 0xB8-0xBF */ + 0x67A5,0x6787,0x676A,0x6773,0x6798,0x67A7,0x6775,0x67A8,/* 0xC0-0xC7 */ + 0x679E,0x67AD,0x678B,0x6777,0x677C,0x67F0,0x6809,0x67D8,/* 0xC8-0xCF */ + 0x680A,0x67E9,0x67B0,0x680C,0x67D9,0x67B5,0x67DA,0x67B3,/* 0xD0-0xD7 */ + 0x67DD,0x6800,0x67C3,0x67B8,0x67E2,0x680E,0x67C1,0x67FD,/* 0xD8-0xDF */ + 0x6832,0x6833,0x6860,0x6861,0x684E,0x6862,0x6844,0x6864,/* 0xE0-0xE7 */ + 0x6883,0x681D,0x6855,0x6866,0x6841,0x6867,0x6840,0x683E,/* 0xE8-0xEF */ + 0x684A,0x6849,0x6829,0x68B5,0x688F,0x6874,0x6877,0x6893,/* 0xF0-0xF7 */ + 0x686B,0x68C2,0x696E,0x68FC,0x691F,0x6920,0x68F9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9527,0x9533,0x953D,0x9543,0x9548,0x954B,0x9555,0x955A,/* 0x40-0x47 */ + 0x9560,0x956E,0x9574,0x9575,0x9577,0x9578,0x9579,0x957A,/* 0x48-0x4F */ + 0x957B,0x957C,0x957D,0x957E,0x9580,0x9581,0x9582,0x9583,/* 0x50-0x57 */ + 0x9584,0x9585,0x9586,0x9587,0x9588,0x9589,0x958A,0x958B,/* 0x58-0x5F */ + 0x958C,0x958D,0x958E,0x958F,0x9590,0x9591,0x9592,0x9593,/* 0x60-0x67 */ + 0x9594,0x9595,0x9596,0x9597,0x9598,0x9599,0x959A,0x959B,/* 0x68-0x6F */ + 0x959C,0x959D,0x959E,0x959F,0x95A0,0x95A1,0x95A2,0x95A3,/* 0x70-0x77 */ + 0x95A4,0x95A5,0x95A6,0x95A7,0x95A8,0x95A9,0x95AA,0x0000,/* 0x78-0x7F */ + + 0x95AB,0x95AC,0x95AD,0x95AE,0x95AF,0x95B0,0x95B1,0x95B2,/* 0x80-0x87 */ + 0x95B3,0x95B4,0x95B5,0x95B6,0x95B7,0x95B8,0x95B9,0x95BA,/* 0x88-0x8F */ + 0x95BB,0x95BC,0x95BD,0x95BE,0x95BF,0x95C0,0x95C1,0x95C2,/* 0x90-0x97 */ + 0x95C3,0x95C4,0x95C5,0x95C6,0x95C7,0x95C8,0x95C9,0x95CA,/* 0x98-0x9F */ + 0x95CB,0x6924,0x68F0,0x690B,0x6901,0x6957,0x68E3,0x6910,/* 0xA0-0xA7 */ + 0x6971,0x6939,0x6960,0x6942,0x695D,0x6984,0x696B,0x6980,/* 0xA8-0xAF */ + 0x6998,0x6978,0x6934,0x69CC,0x6987,0x6988,0x69CE,0x6989,/* 0xB0-0xB7 */ + 0x6966,0x6963,0x6979,0x699B,0x69A7,0x69BB,0x69AB,0x69AD,/* 0xB8-0xBF */ + 0x69D4,0x69B1,0x69C1,0x69CA,0x69DF,0x6995,0x69E0,0x698D,/* 0xC0-0xC7 */ + 0x69FF,0x6A2F,0x69ED,0x6A17,0x6A18,0x6A65,0x69F2,0x6A44,/* 0xC8-0xCF */ + 0x6A3E,0x6AA0,0x6A50,0x6A5B,0x6A35,0x6A8E,0x6A79,0x6A3D,/* 0xD0-0xD7 */ + 0x6A28,0x6A58,0x6A7C,0x6A91,0x6A90,0x6AA9,0x6A97,0x6AAB,/* 0xD8-0xDF */ + 0x7337,0x7352,0x6B81,0x6B82,0x6B87,0x6B84,0x6B92,0x6B93,/* 0xE0-0xE7 */ + 0x6B8D,0x6B9A,0x6B9B,0x6BA1,0x6BAA,0x8F6B,0x8F6D,0x8F71,/* 0xE8-0xEF */ + 0x8F72,0x8F73,0x8F75,0x8F76,0x8F78,0x8F77,0x8F79,0x8F7A,/* 0xF0-0xF7 */ + 
0x8F7C,0x8F7E,0x8F81,0x8F82,0x8F84,0x8F87,0x8F8B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x95CC,0x95CD,0x95CE,0x95CF,0x95D0,0x95D1,0x95D2,0x95D3,/* 0x40-0x47 */ + 0x95D4,0x95D5,0x95D6,0x95D7,0x95D8,0x95D9,0x95DA,0x95DB,/* 0x48-0x4F */ + 0x95DC,0x95DD,0x95DE,0x95DF,0x95E0,0x95E1,0x95E2,0x95E3,/* 0x50-0x57 */ + 0x95E4,0x95E5,0x95E6,0x95E7,0x95EC,0x95FF,0x9607,0x9613,/* 0x58-0x5F */ + 0x9618,0x961B,0x961E,0x9620,0x9623,0x9624,0x9625,0x9626,/* 0x60-0x67 */ + 0x9627,0x9628,0x9629,0x962B,0x962C,0x962D,0x962F,0x9630,/* 0x68-0x6F */ + 0x9637,0x9638,0x9639,0x963A,0x963E,0x9641,0x9643,0x964A,/* 0x70-0x77 */ + 0x964E,0x964F,0x9651,0x9652,0x9653,0x9656,0x9657,0x0000,/* 0x78-0x7F */ + + 0x9658,0x9659,0x965A,0x965C,0x965D,0x965E,0x9660,0x9663,/* 0x80-0x87 */ + 0x9665,0x9666,0x966B,0x966D,0x966E,0x966F,0x9670,0x9671,/* 0x88-0x8F */ + 0x9673,0x9678,0x9679,0x967A,0x967B,0x967C,0x967D,0x967E,/* 0x90-0x97 */ + 0x967F,0x9680,0x9681,0x9682,0x9683,0x9684,0x9687,0x9689,/* 0x98-0x9F */ + 0x968A,0x8F8D,0x8F8E,0x8F8F,0x8F98,0x8F9A,0x8ECE,0x620B,/* 0xA0-0xA7 */ + 0x6217,0x621B,0x621F,0x6222,0x6221,0x6225,0x6224,0x622C,/* 0xA8-0xAF */ + 0x81E7,0x74EF,0x74F4,0x74FF,0x750F,0x7511,0x7513,0x6534,/* 0xB0-0xB7 */ + 0x65EE,0x65EF,0x65F0,0x660A,0x6619,0x6772,0x6603,0x6615,/* 0xB8-0xBF */ + 0x6600,0x7085,0x66F7,0x661D,0x6634,0x6631,0x6636,0x6635,/* 0xC0-0xC7 */ + 0x8006,0x665F,0x6654,0x6641,0x664F,0x6656,0x6661,0x6657,/* 0xC8-0xCF */ + 0x6677,0x6684,0x668C,0x66A7,0x669D,0x66BE,0x66DB,0x66DC,/* 0xD0-0xD7 */ + 0x66E6,0x66E9,0x8D32,0x8D33,0x8D36,0x8D3B,0x8D3D,0x8D40,/* 0xD8-0xDF */ + 0x8D45,0x8D46,0x8D48,0x8D49,0x8D47,0x8D4D,0x8D55,0x8D59,/* 0xE0-0xE7 */ + 0x89C7,0x89CA,0x89CB,0x89CC,0x89CE,0x89CF,0x89D0,0x89D1,/* 0xE8-0xEF */ + 0x726E,0x729F,0x725D,0x7266,0x726F,0x727E,0x727F,0x7284,/* 0xF0-0xF7 */ + 0x728B,0x728D,0x728F,0x7292,0x6308,0x6332,0x63B0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x968C,0x968E,0x9691,0x9692,0x9693,0x9695,0x9696,0x969A,/* 0x40-0x47 */ + 0x969B,0x969D,0x969E,0x969F,0x96A0,0x96A1,0x96A2,0x96A3,/* 0x48-0x4F */ + 0x96A4,0x96A5,0x96A6,0x96A8,0x96A9,0x96AA,0x96AB,0x96AC,/* 0x50-0x57 */ + 0x96AD,0x96AE,0x96AF,0x96B1,0x96B2,0x96B4,0x96B5,0x96B7,/* 0x58-0x5F */ + 0x96B8,0x96BA,0x96BB,0x96BF,0x96C2,0x96C3,0x96C8,0x96CA,/* 0x60-0x67 */ + 0x96CB,0x96D0,0x96D1,0x96D3,0x96D4,0x96D6,0x96D7,0x96D8,/* 
0x68-0x6F */ + 0x96D9,0x96DA,0x96DB,0x96DC,0x96DD,0x96DE,0x96DF,0x96E1,/* 0x70-0x77 */ + 0x96E2,0x96E3,0x96E4,0x96E5,0x96E6,0x96E7,0x96EB,0x0000,/* 0x78-0x7F */ + + 0x96EC,0x96ED,0x96EE,0x96F0,0x96F1,0x96F2,0x96F4,0x96F5,/* 0x80-0x87 */ + 0x96F8,0x96FA,0x96FB,0x96FC,0x96FD,0x96FF,0x9702,0x9703,/* 0x88-0x8F */ + 0x9705,0x970A,0x970B,0x970C,0x9710,0x9711,0x9712,0x9714,/* 0x90-0x97 */ + 0x9715,0x9717,0x9718,0x9719,0x971A,0x971B,0x971D,0x971F,/* 0x98-0x9F */ + 0x9720,0x643F,0x64D8,0x8004,0x6BEA,0x6BF3,0x6BFD,0x6BF5,/* 0xA0-0xA7 */ + 0x6BF9,0x6C05,0x6C07,0x6C06,0x6C0D,0x6C15,0x6C18,0x6C19,/* 0xA8-0xAF */ + 0x6C1A,0x6C21,0x6C29,0x6C24,0x6C2A,0x6C32,0x6535,0x6555,/* 0xB0-0xB7 */ + 0x656B,0x724D,0x7252,0x7256,0x7230,0x8662,0x5216,0x809F,/* 0xB8-0xBF */ + 0x809C,0x8093,0x80BC,0x670A,0x80BD,0x80B1,0x80AB,0x80AD,/* 0xC0-0xC7 */ + 0x80B4,0x80B7,0x80E7,0x80E8,0x80E9,0x80EA,0x80DB,0x80C2,/* 0xC8-0xCF */ + 0x80C4,0x80D9,0x80CD,0x80D7,0x6710,0x80DD,0x80EB,0x80F1,/* 0xD0-0xD7 */ + 0x80F4,0x80ED,0x810D,0x810E,0x80F2,0x80FC,0x6715,0x8112,/* 0xD8-0xDF */ + 0x8C5A,0x8136,0x811E,0x812C,0x8118,0x8132,0x8148,0x814C,/* 0xE0-0xE7 */ + 0x8153,0x8174,0x8159,0x815A,0x8171,0x8160,0x8169,0x817C,/* 0xE8-0xEF */ + 0x817D,0x816D,0x8167,0x584D,0x5AB5,0x8188,0x8182,0x8191,/* 0xF0-0xF7 */ + 0x6ED5,0x81A3,0x81AA,0x81CC,0x6726,0x81CA,0x81BB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9721,0x9722,0x9723,0x9724,0x9725,0x9726,0x9727,0x9728,/* 0x40-0x47 */ + 0x9729,0x972B,0x972C,0x972E,0x972F,0x9731,0x9733,0x9734,/* 0x48-0x4F */ + 0x9735,0x9736,0x9737,0x973A,0x973B,0x973C,0x973D,0x973F,/* 0x50-0x57 */ + 0x9740,0x9741,0x9742,0x9743,0x9744,0x9745,0x9746,0x9747,/* 0x58-0x5F */ + 0x9748,0x9749,0x974A,0x974B,0x974C,0x974D,0x974E,0x974F,/* 0x60-0x67 */ + 0x9750,0x9751,0x9754,0x9755,0x9757,0x9758,0x975A,0x975C,/* 0x68-0x6F */ + 0x975D,0x975F,0x9763,0x9764,0x9766,0x9767,0x9768,0x976A,/* 0x70-0x77 */ + 0x976B,0x976C,0x976D,0x976E,0x976F,0x9770,0x9771,0x0000,/* 0x78-0x7F */ + + 0x9772,0x9775,0x9777,0x9778,0x9779,0x977A,0x977B,0x977D,/* 0x80-0x87 */ + 0x977E,0x977F,0x9780,0x9781,0x9782,0x9783,0x9784,0x9786,/* 0x88-0x8F */ + 0x9787,0x9788,0x9789,0x978A,0x978C,0x978E,0x978F,0x9790,/* 0x90-0x97 */ + 0x9793,0x9795,0x9796,0x9797,0x9799,0x979A,0x979B,0x979C,/* 0x98-0x9F */ + 0x979D,0x81C1,0x81A6,0x6B24,0x6B37,0x6B39,0x6B43,0x6B46,/* 0xA0-0xA7 */ + 0x6B59,0x98D1,0x98D2,0x98D3,0x98D5,0x98D9,0x98DA,0x6BB3,/* 0xA8-0xAF */ + 0x5F40,0x6BC2,0x89F3,0x6590,0x9F51,0x6593,0x65BC,0x65C6,/* 0xB0-0xB7 */ + 0x65C4,0x65C3,0x65CC,0x65CE,0x65D2,0x65D6,0x7080,0x709C,/* 0xB8-0xBF */ + 0x7096,0x709D,0x70BB,0x70C0,0x70B7,0x70AB,0x70B1,0x70E8,/* 0xC0-0xC7 */ + 0x70CA,0x7110,0x7113,0x7116,0x712F,0x7131,0x7173,0x715C,/* 0xC8-0xCF */ + 0x7168,0x7145,0x7172,0x714A,0x7178,0x717A,0x7198,0x71B3,/* 0xD0-0xD7 */ + 0x71B5,0x71A8,0x71A0,0x71E0,0x71D4,0x71E7,0x71F9,0x721D,/* 0xD8-0xDF */ + 0x7228,0x706C,0x7118,0x7166,0x71B9,0x623E,0x623D,0x6243,/* 0xE0-0xE7 */ + 
0x6248,0x6249,0x793B,0x7940,0x7946,0x7949,0x795B,0x795C,/* 0xE8-0xEF */ + 0x7953,0x795A,0x7962,0x7957,0x7960,0x796F,0x7967,0x797A,/* 0xF0-0xF7 */ + 0x7985,0x798A,0x799A,0x79A7,0x79B3,0x5FD1,0x5FD0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x979E,0x979F,0x97A1,0x97A2,0x97A4,0x97A5,0x97A6,0x97A7,/* 0x40-0x47 */ + 0x97A8,0x97A9,0x97AA,0x97AC,0x97AE,0x97B0,0x97B1,0x97B3,/* 0x48-0x4F */ + 0x97B5,0x97B6,0x97B7,0x97B8,0x97B9,0x97BA,0x97BB,0x97BC,/* 0x50-0x57 */ + 0x97BD,0x97BE,0x97BF,0x97C0,0x97C1,0x97C2,0x97C3,0x97C4,/* 0x58-0x5F */ + 0x97C5,0x97C6,0x97C7,0x97C8,0x97C9,0x97CA,0x97CB,0x97CC,/* 0x60-0x67 */ + 0x97CD,0x97CE,0x97CF,0x97D0,0x97D1,0x97D2,0x97D3,0x97D4,/* 0x68-0x6F */ + 0x97D5,0x97D6,0x97D7,0x97D8,0x97D9,0x97DA,0x97DB,0x97DC,/* 0x70-0x77 */ + 0x97DD,0x97DE,0x97DF,0x97E0,0x97E1,0x97E2,0x97E3,0x0000,/* 0x78-0x7F */ + + 0x97E4,0x97E5,0x97E8,0x97EE,0x97EF,0x97F0,0x97F1,0x97F2,/* 0x80-0x87 */ + 0x97F4,0x97F7,0x97F8,0x97F9,0x97FA,0x97FB,0x97FC,0x97FD,/* 0x88-0x8F */ + 0x97FE,0x97FF,0x9800,0x9801,0x9802,0x9803,0x9804,0x9805,/* 0x90-0x97 */ + 0x9806,0x9807,0x9808,0x9809,0x980A,0x980B,0x980C,0x980D,/* 0x98-0x9F */ + 0x980E,0x603C,0x605D,0x605A,0x6067,0x6041,0x6059,0x6063,/* 0xA0-0xA7 */ + 0x60AB,0x6106,0x610D,0x615D,0x61A9,0x619D,0x61CB,0x61D1,/* 0xA8-0xAF */ + 0x6206,0x8080,0x807F,0x6C93,0x6CF6,0x6DFC,0x77F6,0x77F8,/* 0xB0-0xB7 */ + 0x7800,0x7809,0x7817,0x7818,0x7811,0x65AB,0x782D,0x781C,/* 0xB8-0xBF */ + 0x781D,0x7839,0x783A,0x783B,0x781F,0x783C,0x7825,0x782C,/* 0xC0-0xC7 */ + 0x7823,0x7829,0x784E,0x786D,0x7856,0x7857,0x7826,0x7850,/* 0xC8-0xCF */ + 0x7847,0x784C,0x786A,0x789B,0x7893,0x789A,0x7887,0x789C,/* 0xD0-0xD7 */ + 0x78A1,0x78A3,0x78B2,0x78B9,0x78A5,0x78D4,0x78D9,0x78C9,/* 0xD8-0xDF */ + 0x78EC,0x78F2,0x7905,0x78F4,0x7913,0x7924,0x791E,0x7934,/* 0xE0-0xE7 */ + 0x9F9B,0x9EF9,0x9EFB,0x9EFC,0x76F1,0x7704,0x770D,0x76F9,/* 0xE8-0xEF */ + 0x7707,0x7708,0x771A,0x7722,0x7719,0x772D,0x7726,0x7735,/* 0xF0-0xF7 */ + 0x7738,0x7750,0x7751,0x7747,0x7743,0x775A,0x7768,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x980F,0x9810,0x9811,0x9812,0x9813,0x9814,0x9815,0x9816,/* 0x40-0x47 */ + 0x9817,0x9818,0x9819,0x981A,0x981B,0x981C,0x981D,0x981E,/* 0x48-0x4F */ + 0x981F,0x9820,0x9821,0x9822,0x9823,0x9824,0x9825,0x9826,/* 0x50-0x57 */ + 0x9827,0x9828,0x9829,0x982A,0x982B,0x982C,0x982D,0x982E,/* 
0x58-0x5F */ + 0x982F,0x9830,0x9831,0x9832,0x9833,0x9834,0x9835,0x9836,/* 0x60-0x67 */ + 0x9837,0x9838,0x9839,0x983A,0x983B,0x983C,0x983D,0x983E,/* 0x68-0x6F */ + 0x983F,0x9840,0x9841,0x9842,0x9843,0x9844,0x9845,0x9846,/* 0x70-0x77 */ + 0x9847,0x9848,0x9849,0x984A,0x984B,0x984C,0x984D,0x0000,/* 0x78-0x7F */ + + 0x984E,0x984F,0x9850,0x9851,0x9852,0x9853,0x9854,0x9855,/* 0x80-0x87 */ + 0x9856,0x9857,0x9858,0x9859,0x985A,0x985B,0x985C,0x985D,/* 0x88-0x8F */ + 0x985E,0x985F,0x9860,0x9861,0x9862,0x9863,0x9864,0x9865,/* 0x90-0x97 */ + 0x9866,0x9867,0x9868,0x9869,0x986A,0x986B,0x986C,0x986D,/* 0x98-0x9F */ + 0x986E,0x7762,0x7765,0x777F,0x778D,0x777D,0x7780,0x778C,/* 0xA0-0xA7 */ + 0x7791,0x779F,0x77A0,0x77B0,0x77B5,0x77BD,0x753A,0x7540,/* 0xA8-0xAF */ + 0x754E,0x754B,0x7548,0x755B,0x7572,0x7579,0x7583,0x7F58,/* 0xB0-0xB7 */ + 0x7F61,0x7F5F,0x8A48,0x7F68,0x7F74,0x7F71,0x7F79,0x7F81,/* 0xB8-0xBF */ + 0x7F7E,0x76CD,0x76E5,0x8832,0x9485,0x9486,0x9487,0x948B,/* 0xC0-0xC7 */ + 0x948A,0x948C,0x948D,0x948F,0x9490,0x9494,0x9497,0x9495,/* 0xC8-0xCF */ + 0x949A,0x949B,0x949C,0x94A3,0x94A4,0x94AB,0x94AA,0x94AD,/* 0xD0-0xD7 */ + 0x94AC,0x94AF,0x94B0,0x94B2,0x94B4,0x94B6,0x94B7,0x94B8,/* 0xD8-0xDF */ + 0x94B9,0x94BA,0x94BC,0x94BD,0x94BF,0x94C4,0x94C8,0x94C9,/* 0xE0-0xE7 */ + 0x94CA,0x94CB,0x94CC,0x94CD,0x94CE,0x94D0,0x94D1,0x94D2,/* 0xE8-0xEF */ + 0x94D5,0x94D6,0x94D7,0x94D9,0x94D8,0x94DB,0x94DE,0x94DF,/* 0xF0-0xF7 */ + 0x94E0,0x94E2,0x94E4,0x94E5,0x94E7,0x94E8,0x94EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x986F,0x9870,0x9871,0x9872,0x9873,0x9874,0x988B,0x988E,/* 0x40-0x47 */ + 0x9892,0x9895,0x9899,0x98A3,0x98A8,0x98A9,0x98AA,0x98AB,/* 0x48-0x4F */ + 0x98AC,0x98AD,0x98AE,0x98AF,0x98B0,0x98B1,0x98B2,0x98B3,/* 0x50-0x57 */ + 0x98B4,0x98B5,0x98B6,0x98B7,0x98B8,0x98B9,0x98BA,0x98BB,/* 0x58-0x5F */ + 0x98BC,0x98BD,0x98BE,0x98BF,0x98C0,0x98C1,0x98C2,0x98C3,/* 0x60-0x67 */ + 0x98C4,0x98C5,0x98C6,0x98C7,0x98C8,0x98C9,0x98CA,0x98CB,/* 0x68-0x6F */ + 0x98CC,0x98CD,0x98CF,0x98D0,0x98D4,0x98D6,0x98D7,0x98DB,/* 0x70-0x77 */ + 0x98DC,0x98DD,0x98E0,0x98E1,0x98E2,0x98E3,0x98E4,0x0000,/* 0x78-0x7F */ + + 0x98E5,0x98E6,0x98E9,0x98EA,0x98EB,0x98EC,0x98ED,0x98EE,/* 0x80-0x87 */ + 0x98EF,0x98F0,0x98F1,0x98F2,0x98F3,0x98F4,0x98F5,0x98F6,/* 0x88-0x8F */ + 0x98F7,0x98F8,0x98F9,0x98FA,0x98FB,0x98FC,0x98FD,0x98FE,/* 0x90-0x97 */ + 0x98FF,0x9900,0x9901,0x9902,0x9903,0x9904,0x9905,0x9906,/* 0x98-0x9F */ + 0x9907,0x94E9,0x94EB,0x94EE,0x94EF,0x94F3,0x94F4,0x94F5,/* 0xA0-0xA7 */ + 0x94F7,0x94F9,0x94FC,0x94FD,0x94FF,0x9503,0x9502,0x9506,/* 0xA8-0xAF */ + 0x9507,0x9509,0x950A,0x950D,0x950E,0x950F,0x9512,0x9513,/* 0xB0-0xB7 */ + 0x9514,0x9515,0x9516,0x9518,0x951B,0x951D,0x951E,0x951F,/* 0xB8-0xBF */ + 0x9522,0x952A,0x952B,0x9529,0x952C,0x9531,0x9532,0x9534,/* 0xC0-0xC7 */ + 0x9536,0x9537,0x9538,0x953C,0x953E,0x953F,0x9542,0x9535,/* 0xC8-0xCF */ + 0x9544,0x9545,0x9546,0x9549,0x954C,0x954E,0x954F,0x9552,/* 0xD0-0xD7 */ + 
0x9553,0x9554,0x9556,0x9557,0x9558,0x9559,0x955B,0x955E,/* 0xD8-0xDF */ + 0x955F,0x955D,0x9561,0x9562,0x9564,0x9565,0x9566,0x9567,/* 0xE0-0xE7 */ + 0x9568,0x9569,0x956A,0x956B,0x956C,0x956F,0x9571,0x9572,/* 0xE8-0xEF */ + 0x9573,0x953A,0x77E7,0x77EC,0x96C9,0x79D5,0x79ED,0x79E3,/* 0xF0-0xF7 */ + 0x79EB,0x7A06,0x5D47,0x7A03,0x7A02,0x7A1E,0x7A14,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9908,0x9909,0x990A,0x990B,0x990C,0x990E,0x990F,0x9911,/* 0x40-0x47 */ + 0x9912,0x9913,0x9914,0x9915,0x9916,0x9917,0x9918,0x9919,/* 0x48-0x4F */ + 0x991A,0x991B,0x991C,0x991D,0x991E,0x991F,0x9920,0x9921,/* 0x50-0x57 */ + 0x9922,0x9923,0x9924,0x9925,0x9926,0x9927,0x9928,0x9929,/* 0x58-0x5F */ + 0x992A,0x992B,0x992C,0x992D,0x992F,0x9930,0x9931,0x9932,/* 0x60-0x67 */ + 0x9933,0x9934,0x9935,0x9936,0x9937,0x9938,0x9939,0x993A,/* 0x68-0x6F */ + 0x993B,0x993C,0x993D,0x993E,0x993F,0x9940,0x9941,0x9942,/* 0x70-0x77 */ + 0x9943,0x9944,0x9945,0x9946,0x9947,0x9948,0x9949,0x0000,/* 0x78-0x7F */ + + 0x994A,0x994B,0x994C,0x994D,0x994E,0x994F,0x9950,0x9951,/* 0x80-0x87 */ + 0x9952,0x9953,0x9956,0x9957,0x9958,0x9959,0x995A,0x995B,/* 0x88-0x8F */ + 0x995C,0x995D,0x995E,0x995F,0x9960,0x9961,0x9962,0x9964,/* 0x90-0x97 */ + 0x9966,0x9973,0x9978,0x9979,0x997B,0x997E,0x9982,0x9983,/* 0x98-0x9F */ + 0x9989,0x7A39,0x7A37,0x7A51,0x9ECF,0x99A5,0x7A70,0x7688,/* 0xA0-0xA7 */ + 0x768E,0x7693,0x7699,0x76A4,0x74DE,0x74E0,0x752C,0x9E20,/* 0xA8-0xAF */ + 0x9E22,0x9E28,0x9E29,0x9E2A,0x9E2B,0x9E2C,0x9E32,0x9E31,/* 0xB0-0xB7 */ + 0x9E36,0x9E38,0x9E37,0x9E39,0x9E3A,0x9E3E,0x9E41,0x9E42,/* 0xB8-0xBF */ + 0x9E44,0x9E46,0x9E47,0x9E48,0x9E49,0x9E4B,0x9E4C,0x9E4E,/* 0xC0-0xC7 */ + 0x9E51,0x9E55,0x9E57,0x9E5A,0x9E5B,0x9E5C,0x9E5E,0x9E63,/* 0xC8-0xCF */ + 0x9E66,0x9E67,0x9E68,0x9E69,0x9E6A,0x9E6B,0x9E6C,0x9E71,/* 0xD0-0xD7 */ + 0x9E6D,0x9E73,0x7592,0x7594,0x7596,0x75A0,0x759D,0x75AC,/* 0xD8-0xDF */ + 0x75A3,0x75B3,0x75B4,0x75B8,0x75C4,0x75B1,0x75B0,0x75C3,/* 0xE0-0xE7 */ + 0x75C2,0x75D6,0x75CD,0x75E3,0x75E8,0x75E6,0x75E4,0x75EB,/* 0xE8-0xEF */ + 0x75E7,0x7603,0x75F1,0x75FC,0x75FF,0x7610,0x7600,0x7605,/* 0xF0-0xF7 */ + 0x760C,0x7617,0x760A,0x7625,0x7618,0x7615,0x7619,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x998C,0x998E,0x999A,0x999B,0x999C,0x999D,0x999E,0x999F,/* 0x40-0x47 */ + 0x99A0,0x99A1,0x99A2,0x99A3,0x99A4,0x99A6,0x99A7,0x99A9,/* 
0x48-0x4F */ + 0x99AA,0x99AB,0x99AC,0x99AD,0x99AE,0x99AF,0x99B0,0x99B1,/* 0x50-0x57 */ + 0x99B2,0x99B3,0x99B4,0x99B5,0x99B6,0x99B7,0x99B8,0x99B9,/* 0x58-0x5F */ + 0x99BA,0x99BB,0x99BC,0x99BD,0x99BE,0x99BF,0x99C0,0x99C1,/* 0x60-0x67 */ + 0x99C2,0x99C3,0x99C4,0x99C5,0x99C6,0x99C7,0x99C8,0x99C9,/* 0x68-0x6F */ + 0x99CA,0x99CB,0x99CC,0x99CD,0x99CE,0x99CF,0x99D0,0x99D1,/* 0x70-0x77 */ + 0x99D2,0x99D3,0x99D4,0x99D5,0x99D6,0x99D7,0x99D8,0x0000,/* 0x78-0x7F */ + + 0x99D9,0x99DA,0x99DB,0x99DC,0x99DD,0x99DE,0x99DF,0x99E0,/* 0x80-0x87 */ + 0x99E1,0x99E2,0x99E3,0x99E4,0x99E5,0x99E6,0x99E7,0x99E8,/* 0x88-0x8F */ + 0x99E9,0x99EA,0x99EB,0x99EC,0x99ED,0x99EE,0x99EF,0x99F0,/* 0x90-0x97 */ + 0x99F1,0x99F2,0x99F3,0x99F4,0x99F5,0x99F6,0x99F7,0x99F8,/* 0x98-0x9F */ + 0x99F9,0x761B,0x763C,0x7622,0x7620,0x7640,0x762D,0x7630,/* 0xA0-0xA7 */ + 0x763F,0x7635,0x7643,0x763E,0x7633,0x764D,0x765E,0x7654,/* 0xA8-0xAF */ + 0x765C,0x7656,0x766B,0x766F,0x7FCA,0x7AE6,0x7A78,0x7A79,/* 0xB0-0xB7 */ + 0x7A80,0x7A86,0x7A88,0x7A95,0x7AA6,0x7AA0,0x7AAC,0x7AA8,/* 0xB8-0xBF */ + 0x7AAD,0x7AB3,0x8864,0x8869,0x8872,0x887D,0x887F,0x8882,/* 0xC0-0xC7 */ + 0x88A2,0x88C6,0x88B7,0x88BC,0x88C9,0x88E2,0x88CE,0x88E3,/* 0xC8-0xCF */ + 0x88E5,0x88F1,0x891A,0x88FC,0x88E8,0x88FE,0x88F0,0x8921,/* 0xD0-0xD7 */ + 0x8919,0x8913,0x891B,0x890A,0x8934,0x892B,0x8936,0x8941,/* 0xD8-0xDF */ + 0x8966,0x897B,0x758B,0x80E5,0x76B2,0x76B4,0x77DC,0x8012,/* 0xE0-0xE7 */ + 0x8014,0x8016,0x801C,0x8020,0x8022,0x8025,0x8026,0x8027,/* 0xE8-0xEF */ + 0x8029,0x8028,0x8031,0x800B,0x8035,0x8043,0x8046,0x804D,/* 0xF0-0xF7 */ + 0x8052,0x8069,0x8071,0x8983,0x9878,0x9880,0x9883,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x99FA,0x99FB,0x99FC,0x99FD,0x99FE,0x99FF,0x9A00,0x9A01,/* 0x40-0x47 */ + 0x9A02,0x9A03,0x9A04,0x9A05,0x9A06,0x9A07,0x9A08,0x9A09,/* 0x48-0x4F */ + 0x9A0A,0x9A0B,0x9A0C,0x9A0D,0x9A0E,0x9A0F,0x9A10,0x9A11,/* 0x50-0x57 */ + 0x9A12,0x9A13,0x9A14,0x9A15,0x9A16,0x9A17,0x9A18,0x9A19,/* 0x58-0x5F */ + 0x9A1A,0x9A1B,0x9A1C,0x9A1D,0x9A1E,0x9A1F,0x9A20,0x9A21,/* 0x60-0x67 */ + 0x9A22,0x9A23,0x9A24,0x9A25,0x9A26,0x9A27,0x9A28,0x9A29,/* 0x68-0x6F */ + 0x9A2A,0x9A2B,0x9A2C,0x9A2D,0x9A2E,0x9A2F,0x9A30,0x9A31,/* 0x70-0x77 */ + 0x9A32,0x9A33,0x9A34,0x9A35,0x9A36,0x9A37,0x9A38,0x0000,/* 0x78-0x7F */ + + 0x9A39,0x9A3A,0x9A3B,0x9A3C,0x9A3D,0x9A3E,0x9A3F,0x9A40,/* 0x80-0x87 */ + 0x9A41,0x9A42,0x9A43,0x9A44,0x9A45,0x9A46,0x9A47,0x9A48,/* 0x88-0x8F */ + 0x9A49,0x9A4A,0x9A4B,0x9A4C,0x9A4D,0x9A4E,0x9A4F,0x9A50,/* 0x90-0x97 */ + 0x9A51,0x9A52,0x9A53,0x9A54,0x9A55,0x9A56,0x9A57,0x9A58,/* 0x98-0x9F */ + 0x9A59,0x9889,0x988C,0x988D,0x988F,0x9894,0x989A,0x989B,/* 0xA0-0xA7 */ + 0x989E,0x989F,0x98A1,0x98A2,0x98A5,0x98A6,0x864D,0x8654,/* 0xA8-0xAF */ + 0x866C,0x866E,0x867F,0x867A,0x867C,0x867B,0x86A8,0x868D,/* 0xB0-0xB7 */ + 0x868B,0x86AC,0x869D,0x86A7,0x86A3,0x86AA,0x8693,0x86A9,/* 0xB8-0xBF */ + 0x86B6,0x86C4,0x86B5,0x86CE,0x86B0,0x86BA,0x86B1,0x86AF,/* 0xC0-0xC7 */ + 
0x86C9,0x86CF,0x86B4,0x86E9,0x86F1,0x86F2,0x86ED,0x86F3,/* 0xC8-0xCF */ + 0x86D0,0x8713,0x86DE,0x86F4,0x86DF,0x86D8,0x86D1,0x8703,/* 0xD0-0xD7 */ + 0x8707,0x86F8,0x8708,0x870A,0x870D,0x8709,0x8723,0x873B,/* 0xD8-0xDF */ + 0x871E,0x8725,0x872E,0x871A,0x873E,0x8748,0x8734,0x8731,/* 0xE0-0xE7 */ + 0x8729,0x8737,0x873F,0x8782,0x8722,0x877D,0x877E,0x877B,/* 0xE8-0xEF */ + 0x8760,0x8770,0x874C,0x876E,0x878B,0x8753,0x8763,0x877C,/* 0xF0-0xF7 */ + 0x8764,0x8759,0x8765,0x8793,0x87AF,0x87A8,0x87D2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9A5A,0x9A5B,0x9A5C,0x9A5D,0x9A5E,0x9A5F,0x9A60,0x9A61,/* 0x40-0x47 */ + 0x9A62,0x9A63,0x9A64,0x9A65,0x9A66,0x9A67,0x9A68,0x9A69,/* 0x48-0x4F */ + 0x9A6A,0x9A6B,0x9A72,0x9A83,0x9A89,0x9A8D,0x9A8E,0x9A94,/* 0x50-0x57 */ + 0x9A95,0x9A99,0x9AA6,0x9AA9,0x9AAA,0x9AAB,0x9AAC,0x9AAD,/* 0x58-0x5F */ + 0x9AAE,0x9AAF,0x9AB2,0x9AB3,0x9AB4,0x9AB5,0x9AB9,0x9ABB,/* 0x60-0x67 */ + 0x9ABD,0x9ABE,0x9ABF,0x9AC3,0x9AC4,0x9AC6,0x9AC7,0x9AC8,/* 0x68-0x6F */ + 0x9AC9,0x9ACA,0x9ACD,0x9ACE,0x9ACF,0x9AD0,0x9AD2,0x9AD4,/* 0x70-0x77 */ + 0x9AD5,0x9AD6,0x9AD7,0x9AD9,0x9ADA,0x9ADB,0x9ADC,0x0000,/* 0x78-0x7F */ + + 0x9ADD,0x9ADE,0x9AE0,0x9AE2,0x9AE3,0x9AE4,0x9AE5,0x9AE7,/* 0x80-0x87 */ + 0x9AE8,0x9AE9,0x9AEA,0x9AEC,0x9AEE,0x9AF0,0x9AF1,0x9AF2,/* 0x88-0x8F */ + 0x9AF3,0x9AF4,0x9AF5,0x9AF6,0x9AF7,0x9AF8,0x9AFA,0x9AFC,/* 0x90-0x97 */ + 0x9AFD,0x9AFE,0x9AFF,0x9B00,0x9B01,0x9B02,0x9B04,0x9B05,/* 0x98-0x9F */ + 0x9B06,0x87C6,0x8788,0x8785,0x87AD,0x8797,0x8783,0x87AB,/* 0xA0-0xA7 */ + 0x87E5,0x87AC,0x87B5,0x87B3,0x87CB,0x87D3,0x87BD,0x87D1,/* 0xA8-0xAF */ + 0x87C0,0x87CA,0x87DB,0x87EA,0x87E0,0x87EE,0x8816,0x8813,/* 0xB0-0xB7 */ + 0x87FE,0x880A,0x881B,0x8821,0x8839,0x883C,0x7F36,0x7F42,/* 0xB8-0xBF */ + 0x7F44,0x7F45,0x8210,0x7AFA,0x7AFD,0x7B08,0x7B03,0x7B04,/* 0xC0-0xC7 */ + 0x7B15,0x7B0A,0x7B2B,0x7B0F,0x7B47,0x7B38,0x7B2A,0x7B19,/* 0xC8-0xCF */ + 0x7B2E,0x7B31,0x7B20,0x7B25,0x7B24,0x7B33,0x7B3E,0x7B1E,/* 0xD0-0xD7 */ + 0x7B58,0x7B5A,0x7B45,0x7B75,0x7B4C,0x7B5D,0x7B60,0x7B6E,/* 0xD8-0xDF */ + 0x7B7B,0x7B62,0x7B72,0x7B71,0x7B90,0x7BA6,0x7BA7,0x7BB8,/* 0xE0-0xE7 */ + 0x7BAC,0x7B9D,0x7BA8,0x7B85,0x7BAA,0x7B9C,0x7BA2,0x7BAB,/* 0xE8-0xEF */ + 0x7BB4,0x7BD1,0x7BC1,0x7BCC,0x7BDD,0x7BDA,0x7BE5,0x7BE6,/* 0xF0-0xF7 */ + 0x7BEA,0x7C0C,0x7BFE,0x7BFC,0x7C0F,0x7C16,0x7C0B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x9B07,0x9B09,0x9B0A,0x9B0B,0x9B0C,0x9B0D,0x9B0E,0x9B10,/* 0x40-0x47 */ + 0x9B11,0x9B12,0x9B14,0x9B15,0x9B16,0x9B17,0x9B18,0x9B19,/* 0x48-0x4F */ + 0x9B1A,0x9B1B,0x9B1C,0x9B1D,0x9B1E,0x9B20,0x9B21,0x9B22,/* 0x50-0x57 */ + 0x9B24,0x9B25,0x9B26,0x9B27,0x9B28,0x9B29,0x9B2A,0x9B2B,/* 0x58-0x5F */ + 0x9B2C,0x9B2D,0x9B2E,0x9B30,0x9B31,0x9B33,0x9B34,0x9B35,/* 0x60-0x67 */ + 0x9B36,0x9B37,0x9B38,0x9B39,0x9B3A,0x9B3D,0x9B3E,0x9B3F,/* 0x68-0x6F */ + 0x9B40,0x9B46,0x9B4A,0x9B4B,0x9B4C,0x9B4E,0x9B50,0x9B52,/* 0x70-0x77 */ + 0x9B53,0x9B55,0x9B56,0x9B57,0x9B58,0x9B59,0x9B5A,0x0000,/* 0x78-0x7F */ + + 0x9B5B,0x9B5C,0x9B5D,0x9B5E,0x9B5F,0x9B60,0x9B61,0x9B62,/* 0x80-0x87 */ + 0x9B63,0x9B64,0x9B65,0x9B66,0x9B67,0x9B68,0x9B69,0x9B6A,/* 0x88-0x8F */ + 0x9B6B,0x9B6C,0x9B6D,0x9B6E,0x9B6F,0x9B70,0x9B71,0x9B72,/* 0x90-0x97 */ + 0x9B73,0x9B74,0x9B75,0x9B76,0x9B77,0x9B78,0x9B79,0x9B7A,/* 0x98-0x9F */ + 0x9B7B,0x7C1F,0x7C2A,0x7C26,0x7C38,0x7C41,0x7C40,0x81FE,/* 0xA0-0xA7 */ + 0x8201,0x8202,0x8204,0x81EC,0x8844,0x8221,0x8222,0x8223,/* 0xA8-0xAF */ + 0x822D,0x822F,0x8228,0x822B,0x8238,0x823B,0x8233,0x8234,/* 0xB0-0xB7 */ + 0x823E,0x8244,0x8249,0x824B,0x824F,0x825A,0x825F,0x8268,/* 0xB8-0xBF */ + 0x887E,0x8885,0x8888,0x88D8,0x88DF,0x895E,0x7F9D,0x7F9F,/* 0xC0-0xC7 */ + 0x7FA7,0x7FAF,0x7FB0,0x7FB2,0x7C7C,0x6549,0x7C91,0x7C9D,/* 0xC8-0xCF */ + 0x7C9C,0x7C9E,0x7CA2,0x7CB2,0x7CBC,0x7CBD,0x7CC1,0x7CC7,/* 0xD0-0xD7 */ + 0x7CCC,0x7CCD,0x7CC8,0x7CC5,0x7CD7,0x7CE8,0x826E,0x66A8,/* 0xD8-0xDF */ + 0x7FBF,0x7FCE,0x7FD5,0x7FE5,0x7FE1,0x7FE6,0x7FE9,0x7FEE,/* 0xE0-0xE7 */ + 0x7FF3,0x7CF8,0x7D77,0x7DA6,0x7DAE,0x7E47,0x7E9B,0x9EB8,/* 0xE8-0xEF */ + 0x9EB4,0x8D73,0x8D84,0x8D94,0x8D91,0x8DB1,0x8D67,0x8D6D,/* 0xF0-0xF7 */ + 0x8C47,0x8C49,0x914A,0x9150,0x914E,0x914F,0x9164,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9B7C,0x9B7D,0x9B7E,0x9B7F,0x9B80,0x9B81,0x9B82,0x9B83,/* 0x40-0x47 */ + 0x9B84,0x9B85,0x9B86,0x9B87,0x9B88,0x9B89,0x9B8A,0x9B8B,/* 0x48-0x4F */ + 0x9B8C,0x9B8D,0x9B8E,0x9B8F,0x9B90,0x9B91,0x9B92,0x9B93,/* 0x50-0x57 */ + 0x9B94,0x9B95,0x9B96,0x9B97,0x9B98,0x9B99,0x9B9A,0x9B9B,/* 0x58-0x5F */ + 0x9B9C,0x9B9D,0x9B9E,0x9B9F,0x9BA0,0x9BA1,0x9BA2,0x9BA3,/* 0x60-0x67 */ + 0x9BA4,0x9BA5,0x9BA6,0x9BA7,0x9BA8,0x9BA9,0x9BAA,0x9BAB,/* 0x68-0x6F */ + 0x9BAC,0x9BAD,0x9BAE,0x9BAF,0x9BB0,0x9BB1,0x9BB2,0x9BB3,/* 0x70-0x77 */ + 0x9BB4,0x9BB5,0x9BB6,0x9BB7,0x9BB8,0x9BB9,0x9BBA,0x0000,/* 0x78-0x7F */ + + 0x9BBB,0x9BBC,0x9BBD,0x9BBE,0x9BBF,0x9BC0,0x9BC1,0x9BC2,/* 0x80-0x87 */ + 0x9BC3,0x9BC4,0x9BC5,0x9BC6,0x9BC7,0x9BC8,0x9BC9,0x9BCA,/* 0x88-0x8F */ + 0x9BCB,0x9BCC,0x9BCD,0x9BCE,0x9BCF,0x9BD0,0x9BD1,0x9BD2,/* 0x90-0x97 */ + 0x9BD3,0x9BD4,0x9BD5,0x9BD6,0x9BD7,0x9BD8,0x9BD9,0x9BDA,/* 0x98-0x9F */ + 0x9BDB,0x9162,0x9161,0x9170,0x9169,0x916F,0x917D,0x917E,/* 0xA0-0xA7 */ + 0x9172,0x9174,0x9179,0x918C,0x9185,0x9190,0x918D,0x9191,/* 0xA8-0xAF */ + 0x91A2,0x91A3,0x91AA,0x91AD,0x91AE,0x91AF,0x91B5,0x91B4,/* 0xB0-0xB7 */ + 
0x91BA,0x8C55,0x9E7E,0x8DB8,0x8DEB,0x8E05,0x8E59,0x8E69,/* 0xB8-0xBF */ + 0x8DB5,0x8DBF,0x8DBC,0x8DBA,0x8DC4,0x8DD6,0x8DD7,0x8DDA,/* 0xC0-0xC7 */ + 0x8DDE,0x8DCE,0x8DCF,0x8DDB,0x8DC6,0x8DEC,0x8DF7,0x8DF8,/* 0xC8-0xCF */ + 0x8DE3,0x8DF9,0x8DFB,0x8DE4,0x8E09,0x8DFD,0x8E14,0x8E1D,/* 0xD0-0xD7 */ + 0x8E1F,0x8E2C,0x8E2E,0x8E23,0x8E2F,0x8E3A,0x8E40,0x8E39,/* 0xD8-0xDF */ + 0x8E35,0x8E3D,0x8E31,0x8E49,0x8E41,0x8E42,0x8E51,0x8E52,/* 0xE0-0xE7 */ + 0x8E4A,0x8E70,0x8E76,0x8E7C,0x8E6F,0x8E74,0x8E85,0x8E8F,/* 0xE8-0xEF */ + 0x8E94,0x8E90,0x8E9C,0x8E9E,0x8C78,0x8C82,0x8C8A,0x8C85,/* 0xF0-0xF7 */ + 0x8C98,0x8C94,0x659B,0x89D6,0x89DE,0x89DA,0x89DC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9BDC,0x9BDD,0x9BDE,0x9BDF,0x9BE0,0x9BE1,0x9BE2,0x9BE3,/* 0x40-0x47 */ + 0x9BE4,0x9BE5,0x9BE6,0x9BE7,0x9BE8,0x9BE9,0x9BEA,0x9BEB,/* 0x48-0x4F */ + 0x9BEC,0x9BED,0x9BEE,0x9BEF,0x9BF0,0x9BF1,0x9BF2,0x9BF3,/* 0x50-0x57 */ + 0x9BF4,0x9BF5,0x9BF6,0x9BF7,0x9BF8,0x9BF9,0x9BFA,0x9BFB,/* 0x58-0x5F */ + 0x9BFC,0x9BFD,0x9BFE,0x9BFF,0x9C00,0x9C01,0x9C02,0x9C03,/* 0x60-0x67 */ + 0x9C04,0x9C05,0x9C06,0x9C07,0x9C08,0x9C09,0x9C0A,0x9C0B,/* 0x68-0x6F */ + 0x9C0C,0x9C0D,0x9C0E,0x9C0F,0x9C10,0x9C11,0x9C12,0x9C13,/* 0x70-0x77 */ + 0x9C14,0x9C15,0x9C16,0x9C17,0x9C18,0x9C19,0x9C1A,0x0000,/* 0x78-0x7F */ + + 0x9C1B,0x9C1C,0x9C1D,0x9C1E,0x9C1F,0x9C20,0x9C21,0x9C22,/* 0x80-0x87 */ + 0x9C23,0x9C24,0x9C25,0x9C26,0x9C27,0x9C28,0x9C29,0x9C2A,/* 0x88-0x8F */ + 0x9C2B,0x9C2C,0x9C2D,0x9C2E,0x9C2F,0x9C30,0x9C31,0x9C32,/* 0x90-0x97 */ + 0x9C33,0x9C34,0x9C35,0x9C36,0x9C37,0x9C38,0x9C39,0x9C3A,/* 0x98-0x9F */ + 0x9C3B,0x89E5,0x89EB,0x89EF,0x8A3E,0x8B26,0x9753,0x96E9,/* 0xA0-0xA7 */ + 0x96F3,0x96EF,0x9706,0x9701,0x9708,0x970F,0x970E,0x972A,/* 0xA8-0xAF */ + 0x972D,0x9730,0x973E,0x9F80,0x9F83,0x9F85,0x9F86,0x9F87,/* 0xB0-0xB7 */ + 0x9F88,0x9F89,0x9F8A,0x9F8C,0x9EFE,0x9F0B,0x9F0D,0x96B9,/* 0xB8-0xBF */ + 0x96BC,0x96BD,0x96CE,0x96D2,0x77BF,0x96E0,0x928E,0x92AE,/* 0xC0-0xC7 */ + 0x92C8,0x933E,0x936A,0x93CA,0x938F,0x943E,0x946B,0x9C7F,/* 0xC8-0xCF */ + 0x9C82,0x9C85,0x9C86,0x9C87,0x9C88,0x7A23,0x9C8B,0x9C8E,/* 0xD0-0xD7 */ + 0x9C90,0x9C91,0x9C92,0x9C94,0x9C95,0x9C9A,0x9C9B,0x9C9E,/* 0xD8-0xDF */ + 0x9C9F,0x9CA0,0x9CA1,0x9CA2,0x9CA3,0x9CA5,0x9CA6,0x9CA7,/* 0xE0-0xE7 */ + 0x9CA8,0x9CA9,0x9CAB,0x9CAD,0x9CAE,0x9CB0,0x9CB1,0x9CB2,/* 0xE8-0xEF */ + 0x9CB3,0x9CB4,0x9CB5,0x9CB6,0x9CB7,0x9CBA,0x9CBB,0x9CBC,/* 0xF0-0xF7 */ + 0x9CBD,0x9CC4,0x9CC5,0x9CC6,0x9CC7,0x9CCA,0x9CCB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9C3C,0x9C3D,0x9C3E,0x9C3F,0x9C40,0x9C41,0x9C42,0x9C43,/* 0x40-0x47 */ + 0x9C44,0x9C45,0x9C46,0x9C47,0x9C48,0x9C49,0x9C4A,0x9C4B,/* 0x48-0x4F */ + 0x9C4C,0x9C4D,0x9C4E,0x9C4F,0x9C50,0x9C51,0x9C52,0x9C53,/* 0x50-0x57 */ + 0x9C54,0x9C55,0x9C56,0x9C57,0x9C58,0x9C59,0x9C5A,0x9C5B,/* 0x58-0x5F */ + 0x9C5C,0x9C5D,0x9C5E,0x9C5F,0x9C60,0x9C61,0x9C62,0x9C63,/* 0x60-0x67 */ + 0x9C64,0x9C65,0x9C66,0x9C67,0x9C68,0x9C69,0x9C6A,0x9C6B,/* 0x68-0x6F */ + 0x9C6C,0x9C6D,0x9C6E,0x9C6F,0x9C70,0x9C71,0x9C72,0x9C73,/* 0x70-0x77 */ + 0x9C74,0x9C75,0x9C76,0x9C77,0x9C78,0x9C79,0x9C7A,0x0000,/* 0x78-0x7F */ + + 0x9C7B,0x9C7D,0x9C7E,0x9C80,0x9C83,0x9C84,0x9C89,0x9C8A,/* 0x80-0x87 */ + 0x9C8C,0x9C8F,0x9C93,0x9C96,0x9C97,0x9C98,0x9C99,0x9C9D,/* 0x88-0x8F */ + 0x9CAA,0x9CAC,0x9CAF,0x9CB9,0x9CBE,0x9CBF,0x9CC0,0x9CC1,/* 0x90-0x97 */ + 0x9CC2,0x9CC8,0x9CC9,0x9CD1,0x9CD2,0x9CDA,0x9CDB,0x9CE0,/* 0x98-0x9F */ + 0x9CE1,0x9CCC,0x9CCD,0x9CCE,0x9CCF,0x9CD0,0x9CD3,0x9CD4,/* 0xA0-0xA7 */ + 0x9CD5,0x9CD7,0x9CD8,0x9CD9,0x9CDC,0x9CDD,0x9CDF,0x9CE2,/* 0xA8-0xAF */ + 0x977C,0x9785,0x9791,0x9792,0x9794,0x97AF,0x97AB,0x97A3,/* 0xB0-0xB7 */ + 0x97B2,0x97B4,0x9AB1,0x9AB0,0x9AB7,0x9E58,0x9AB6,0x9ABA,/* 0xB8-0xBF */ + 0x9ABC,0x9AC1,0x9AC0,0x9AC5,0x9AC2,0x9ACB,0x9ACC,0x9AD1,/* 0xC0-0xC7 */ + 0x9B45,0x9B43,0x9B47,0x9B49,0x9B48,0x9B4D,0x9B51,0x98E8,/* 0xC8-0xCF */ + 0x990D,0x992E,0x9955,0x9954,0x9ADF,0x9AE1,0x9AE6,0x9AEF,/* 0xD0-0xD7 */ + 0x9AEB,0x9AFB,0x9AED,0x9AF9,0x9B08,0x9B0F,0x9B13,0x9B1F,/* 0xD8-0xDF */ + 0x9B23,0x9EBD,0x9EBE,0x7E3B,0x9E82,0x9E87,0x9E88,0x9E8B,/* 0xE0-0xE7 */ + 0x9E92,0x93D6,0x9E9D,0x9E9F,0x9EDB,0x9EDC,0x9EDD,0x9EE0,/* 0xE8-0xEF */ + 0x9EDF,0x9EE2,0x9EE9,0x9EE7,0x9EE5,0x9EEA,0x9EEF,0x9F22,/* 0xF0-0xF7 */ + 0x9F2C,0x9F2F,0x9F39,0x9F37,0x9F3D,0x9F3E,0x9F44,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9CE3,0x9CE4,0x9CE5,0x9CE6,0x9CE7,0x9CE8,0x9CE9,0x9CEA,/* 0x40-0x47 */ + 0x9CEB,0x9CEC,0x9CED,0x9CEE,0x9CEF,0x9CF0,0x9CF1,0x9CF2,/* 0x48-0x4F */ + 0x9CF3,0x9CF4,0x9CF5,0x9CF6,0x9CF7,0x9CF8,0x9CF9,0x9CFA,/* 0x50-0x57 */ + 0x9CFB,0x9CFC,0x9CFD,0x9CFE,0x9CFF,0x9D00,0x9D01,0x9D02,/* 0x58-0x5F */ + 0x9D03,0x9D04,0x9D05,0x9D06,0x9D07,0x9D08,0x9D09,0x9D0A,/* 0x60-0x67 */ + 0x9D0B,0x9D0C,0x9D0D,0x9D0E,0x9D0F,0x9D10,0x9D11,0x9D12,/* 0x68-0x6F */ + 0x9D13,0x9D14,0x9D15,0x9D16,0x9D17,0x9D18,0x9D19,0x9D1A,/* 0x70-0x77 */ + 0x9D1B,0x9D1C,0x9D1D,0x9D1E,0x9D1F,0x9D20,0x9D21,0x0000,/* 0x78-0x7F */ + + 0x9D22,0x9D23,0x9D24,0x9D25,0x9D26,0x9D27,0x9D28,0x9D29,/* 0x80-0x87 */ + 0x9D2A,0x9D2B,0x9D2C,0x9D2D,0x9D2E,0x9D2F,0x9D30,0x9D31,/* 0x88-0x8F */ + 0x9D32,0x9D33,0x9D34,0x9D35,0x9D36,0x9D37,0x9D38,0x9D39,/* 0x90-0x97 */ + 0x9D3A,0x9D3B,0x9D3C,0x9D3D,0x9D3E,0x9D3F,0x9D40,0x9D41,/* 0x98-0x9F */ + 0x9D42,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static 
const wchar_t c2u_F9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9D43,0x9D44,0x9D45,0x9D46,0x9D47,0x9D48,0x9D49,0x9D4A,/* 0x40-0x47 */ + 0x9D4B,0x9D4C,0x9D4D,0x9D4E,0x9D4F,0x9D50,0x9D51,0x9D52,/* 0x48-0x4F */ + 0x9D53,0x9D54,0x9D55,0x9D56,0x9D57,0x9D58,0x9D59,0x9D5A,/* 0x50-0x57 */ + 0x9D5B,0x9D5C,0x9D5D,0x9D5E,0x9D5F,0x9D60,0x9D61,0x9D62,/* 0x58-0x5F */ + 0x9D63,0x9D64,0x9D65,0x9D66,0x9D67,0x9D68,0x9D69,0x9D6A,/* 0x60-0x67 */ + 0x9D6B,0x9D6C,0x9D6D,0x9D6E,0x9D6F,0x9D70,0x9D71,0x9D72,/* 0x68-0x6F */ + 0x9D73,0x9D74,0x9D75,0x9D76,0x9D77,0x9D78,0x9D79,0x9D7A,/* 0x70-0x77 */ + 0x9D7B,0x9D7C,0x9D7D,0x9D7E,0x9D7F,0x9D80,0x9D81,0x0000,/* 0x78-0x7F */ + + 0x9D82,0x9D83,0x9D84,0x9D85,0x9D86,0x9D87,0x9D88,0x9D89,/* 0x80-0x87 */ + 0x9D8A,0x9D8B,0x9D8C,0x9D8D,0x9D8E,0x9D8F,0x9D90,0x9D91,/* 0x88-0x8F */ + 0x9D92,0x9D93,0x9D94,0x9D95,0x9D96,0x9D97,0x9D98,0x9D99,/* 0x90-0x97 */ + 0x9D9A,0x9D9B,0x9D9C,0x9D9D,0x9D9E,0x9D9F,0x9DA0,0x9DA1,/* 0x98-0x9F */ + 0x9DA2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9DA3,0x9DA4,0x9DA5,0x9DA6,0x9DA7,0x9DA8,0x9DA9,0x9DAA,/* 0x40-0x47 */ + 0x9DAB,0x9DAC,0x9DAD,0x9DAE,0x9DAF,0x9DB0,0x9DB1,0x9DB2,/* 0x48-0x4F */ + 0x9DB3,0x9DB4,0x9DB5,0x9DB6,0x9DB7,0x9DB8,0x9DB9,0x9DBA,/* 0x50-0x57 */ + 0x9DBB,0x9DBC,0x9DBD,0x9DBE,0x9DBF,0x9DC0,0x9DC1,0x9DC2,/* 0x58-0x5F */ + 0x9DC3,0x9DC4,0x9DC5,0x9DC6,0x9DC7,0x9DC8,0x9DC9,0x9DCA,/* 0x60-0x67 */ + 0x9DCB,0x9DCC,0x9DCD,0x9DCE,0x9DCF,0x9DD0,0x9DD1,0x9DD2,/* 0x68-0x6F */ + 0x9DD3,0x9DD4,0x9DD5,0x9DD6,0x9DD7,0x9DD8,0x9DD9,0x9DDA,/* 0x70-0x77 */ + 0x9DDB,0x9DDC,0x9DDD,0x9DDE,0x9DDF,0x9DE0,0x9DE1,0x0000,/* 0x78-0x7F */ + + 0x9DE2,0x9DE3,0x9DE4,0x9DE5,0x9DE6,0x9DE7,0x9DE8,0x9DE9,/* 0x80-0x87 */ + 0x9DEA,0x9DEB,0x9DEC,0x9DED,0x9DEE,0x9DEF,0x9DF0,0x9DF1,/* 0x88-0x8F */ + 0x9DF2,0x9DF3,0x9DF4,0x9DF5,0x9DF6,0x9DF7,0x9DF8,0x9DF9,/* 0x90-0x97 */ + 0x9DFA,0x9DFB,0x9DFC,0x9DFD,0x9DFE,0x9DFF,0x9E00,0x9E01,/* 0x98-0x9F */ + 0x9E02,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9E03,0x9E04,0x9E05,0x9E06,0x9E07,0x9E08,0x9E09,0x9E0A,/* 0x40-0x47 */ + 0x9E0B,0x9E0C,0x9E0D,0x9E0E,0x9E0F,0x9E10,0x9E11,0x9E12,/* 0x48-0x4F */ + 0x9E13,0x9E14,0x9E15,0x9E16,0x9E17,0x9E18,0x9E19,0x9E1A,/* 0x50-0x57 */ + 0x9E1B,0x9E1C,0x9E1D,0x9E1E,0x9E24,0x9E27,0x9E2E,0x9E30,/* 0x58-0x5F */ + 0x9E34,0x9E3B,0x9E3C,0x9E40,0x9E4D,0x9E50,0x9E52,0x9E53,/* 0x60-0x67 */ + 0x9E54,0x9E56,0x9E59,0x9E5D,0x9E5F,0x9E60,0x9E61,0x9E62,/* 0x68-0x6F */ + 0x9E65,0x9E6E,0x9E6F,0x9E72,0x9E74,0x9E75,0x9E76,0x9E77,/* 0x70-0x77 */ + 0x9E78,0x9E79,0x9E7A,0x9E7B,0x9E7C,0x9E7D,0x9E80,0x0000,/* 0x78-0x7F */ + + 0x9E81,0x9E83,0x9E84,0x9E85,0x9E86,0x9E89,0x9E8A,0x9E8C,/* 0x80-0x87 */ + 0x9E8D,0x9E8E,0x9E8F,0x9E90,0x9E91,0x9E94,0x9E95,0x9E96,/* 0x88-0x8F */ + 0x9E97,0x9E98,0x9E99,0x9E9A,0x9E9B,0x9E9C,0x9E9E,0x9EA0,/* 0x90-0x97 */ + 0x9EA1,0x9EA2,0x9EA3,0x9EA4,0x9EA5,0x9EA7,0x9EA8,0x9EA9,/* 0x98-0x9F */ + 0x9EAA,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9EAB,0x9EAC,0x9EAD,0x9EAE,0x9EAF,0x9EB0,0x9EB1,0x9EB2,/* 0x40-0x47 */ + 0x9EB3,0x9EB5,0x9EB6,0x9EB7,0x9EB9,0x9EBA,0x9EBC,0x9EBF,/* 0x48-0x4F */ + 0x9EC0,0x9EC1,0x9EC2,0x9EC3,0x9EC5,0x9EC6,0x9EC7,0x9EC8,/* 0x50-0x57 */ + 0x9ECA,0x9ECB,0x9ECC,0x9ED0,0x9ED2,0x9ED3,0x9ED5,0x9ED6,/* 0x58-0x5F */ + 0x9ED7,0x9ED9,0x9EDA,0x9EDE,0x9EE1,0x9EE3,0x9EE4,0x9EE6,/* 0x60-0x67 */ + 0x9EE8,0x9EEB,0x9EEC,0x9EED,0x9EEE,0x9EF0,0x9EF1,0x9EF2,/* 0x68-0x6F */ + 0x9EF3,0x9EF4,0x9EF5,0x9EF6,0x9EF7,0x9EF8,0x9EFA,0x9EFD,/* 0x70-0x77 */ + 0x9EFF,0x9F00,0x9F01,0x9F02,0x9F03,0x9F04,0x9F05,0x0000,/* 0x78-0x7F */ + + 0x9F06,0x9F07,0x9F08,0x9F09,0x9F0A,0x9F0C,0x9F0F,0x9F11,/* 0x80-0x87 */ + 0x9F12,0x9F14,0x9F15,0x9F16,0x9F18,0x9F1A,0x9F1B,0x9F1C,/* 0x88-0x8F */ + 0x9F1D,0x9F1E,0x9F1F,0x9F21,0x9F23,0x9F24,0x9F25,0x9F26,/* 0x90-0x97 */ + 0x9F27,0x9F28,0x9F29,0x9F2A,0x9F2B,0x9F2D,0x9F2E,0x9F30,/* 0x98-0x9F */ + 0x9F31,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9F32,0x9F33,0x9F34,0x9F35,0x9F36,0x9F38,0x9F3A,0x9F3C,/* 
0x40-0x47 */ + 0x9F3F,0x9F40,0x9F41,0x9F42,0x9F43,0x9F45,0x9F46,0x9F47,/* 0x48-0x4F */ + 0x9F48,0x9F49,0x9F4A,0x9F4B,0x9F4C,0x9F4D,0x9F4E,0x9F4F,/* 0x50-0x57 */ + 0x9F52,0x9F53,0x9F54,0x9F55,0x9F56,0x9F57,0x9F58,0x9F59,/* 0x58-0x5F */ + 0x9F5A,0x9F5B,0x9F5C,0x9F5D,0x9F5E,0x9F5F,0x9F60,0x9F61,/* 0x60-0x67 */ + 0x9F62,0x9F63,0x9F64,0x9F65,0x9F66,0x9F67,0x9F68,0x9F69,/* 0x68-0x6F */ + 0x9F6A,0x9F6B,0x9F6C,0x9F6D,0x9F6E,0x9F6F,0x9F70,0x9F71,/* 0x70-0x77 */ + 0x9F72,0x9F73,0x9F74,0x9F75,0x9F76,0x9F77,0x9F78,0x0000,/* 0x78-0x7F */ + + 0x9F79,0x9F7A,0x9F7B,0x9F7C,0x9F7D,0x9F7E,0x9F81,0x9F82,/* 0x80-0x87 */ + 0x9F8D,0x9F8E,0x9F8F,0x9F90,0x9F91,0x9F92,0x9F93,0x9F94,/* 0x88-0x8F */ + 0x9F95,0x9F96,0x9F97,0x9F98,0x9F9C,0x9F9D,0x9F9E,0x9FA1,/* 0x90-0x97 */ + 0x9FA2,0x9FA3,0x9FA4,0x9FA5,0xF92C,0xF979,0xF995,0xF9E7,/* 0x98-0x9F */ + 0xF9F1,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_FE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0xFA0C,0xFA0D,0xFA0E,0xFA0F,0xFA11,0xFA13,0xFA14,0xFA18,/* 0x40-0x47 */ + 0xFA1F,0xFA20,0xFA21,0xFA23,0xFA24,0xFA27,0xFA28,0xFA29,/* 0x48-0x4F */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_81, c2u_82, c2u_83, c2u_84, c2u_85, c2u_86, c2u_87, + c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, + c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, + c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, + c2u_A0, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, + c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, + c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, + c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, + c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, c2u_C7, + c2u_C8, c2u_C9, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, + c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, + c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, + c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, + c2u_F8, c2u_F9, 
c2u_FA, c2u_FB, c2u_FC, c2u_FD, c2u_FE, NULL, +}; + +static const unsigned char u2c_00[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xA1, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, /* 0xA4-0xA7 */ + 0xA1, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xA1, 0xE3, 0xA1, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA4, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC1, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA8, 0xA4, 0xA8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xA8, 0xA8, 0xA8, 0xA6, 0xA8, 0xBA, 0x00, 0x00, /* 0xE8-0xEB */ + 0xA8, 0xAC, 0xA8, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB0, 0xA8, 0xAE, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC2, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xA8, 0xB4, 0xA8, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */ + 0xA8, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_01[512] = { + 0xA8, 0xA1, 0xA8, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA5, 0xA8, 0xA5, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA7, 0xA8, 0xA7, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA9, 0xA8, 0xA9, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xA8, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xA8, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xA8, 0xAD, 0xA8, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB1, 0xA8, 0xB1, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA8, 0xA3, 0xA8, 0xA3, 0xA8, 0xAB, /* 0xCC-0xCF */ + 0xA8, 0xAB, 0xA8, 0xAF, 0xA8, 0xAF, 0xA8, 0xB3, /* 0xD0-0xD3 */ + 0xA8, 0xB3, 0xA8, 0xB5, 0xA8, 0xB5, 0xA8, 0xB6, /* 0xD4-0xD7 */ + 0xA8, 0xB6, 0xA8, 0xB7, 0xA8, 0xB7, 0xA8, 0xB8, /* 0xD8-0xDB */ + 0xA8, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_02[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xA8, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA8, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xA1, 0xA5, 0xA8, 0x40, 0xA8, 0x41, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xA8, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA6, 0xA1, 
0xA6, 0xA2, 0xA6, 0xA3, /* 0x90-0x93 */ + 0xA6, 0xA4, 0xA6, 0xA5, 0xA6, 0xA6, 0xA6, 0xA7, /* 0x94-0x97 */ + 0xA6, 0xA8, 0xA6, 0xA9, 0xA6, 0xAA, 0xA6, 0xAB, /* 0x98-0x9B */ + 0xA6, 0xAC, 0xA6, 0xAD, 0xA6, 0xAE, 0xA6, 0xAF, /* 0x9C-0x9F */ + 0xA6, 0xB0, 0xA6, 0xB1, 0x00, 0x00, 0xA6, 0xB2, /* 0xA0-0xA3 */ + 0xA6, 0xB3, 0xA6, 0xB4, 0xA6, 0xB5, 0xA6, 0xB6, /* 0xA4-0xA7 */ + 0xA6, 0xB7, 0xA6, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xA6, 0xC1, 0xA6, 0xC2, 0xA6, 0xC3, /* 0xB0-0xB3 */ + 0xA6, 0xC4, 0xA6, 0xC5, 0xA6, 0xC6, 0xA6, 0xC7, /* 0xB4-0xB7 */ + 0xA6, 0xC8, 0xA6, 0xC9, 0xA6, 0xCA, 0xA6, 0xCB, /* 0xB8-0xBB */ + 0xA6, 0xCC, 0xA6, 0xCD, 0xA6, 0xCE, 0xA6, 0xCF, /* 0xBC-0xBF */ + 0xA6, 0xD0, 0xA6, 0xD1, 0x00, 0x00, 0xA6, 0xD2, /* 0xC0-0xC3 */ + 0xA6, 0xD3, 0xA6, 0xD4, 0xA6, 0xD5, 0xA6, 0xD6, /* 0xC4-0xC7 */ + 0xA6, 0xD7, 0xA6, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_04[512] = { + 0x00, 0x00, 0xA7, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA7, 0xA1, 0xA7, 0xA2, 0xA7, 0xA3, 0xA7, 0xA4, /* 0x10-0x13 */ + 0xA7, 0xA5, 0xA7, 0xA6, 0xA7, 0xA8, 0xA7, 0xA9, /* 0x14-0x17 */ + 0xA7, 0xAA, 0xA7, 0xAB, 0xA7, 0xAC, 0xA7, 0xAD, /* 0x18-0x1B */ + 0xA7, 0xAE, 0xA7, 0xAF, 0xA7, 0xB0, 0xA7, 0xB1, /* 0x1C-0x1F */ + 0xA7, 0xB2, 0xA7, 0xB3, 0xA7, 0xB4, 0xA7, 0xB5, /* 0x20-0x23 */ + 0xA7, 0xB6, 0xA7, 0xB7, 0xA7, 0xB8, 0xA7, 0xB9, /* 0x24-0x27 */ + 0xA7, 0xBA, 0xA7, 0xBB, 0xA7, 0xBC, 0xA7, 0xBD, /* 0x28-0x2B */ + 0xA7, 0xBE, 0xA7, 0xBF, 0xA7, 0xC0, 0xA7, 0xC1, /* 0x2C-0x2F */ + 0xA7, 0xD1, 0xA7, 0xD2, 0xA7, 0xD3, 0xA7, 0xD4, /* 0x30-0x33 */ + 0xA7, 0xD5, 0xA7, 0xD6, 0xA7, 0xD8, 0xA7, 0xD9, /* 0x34-0x37 */ + 0xA7, 0xDA, 0xA7, 0xDB, 0xA7, 0xDC, 0xA7, 0xDD, /* 0x38-0x3B */ + 0xA7, 0xDE, 0xA7, 0xDF, 0xA7, 0xE0, 0xA7, 0xE1, /* 0x3C-0x3F */ + 0xA7, 0xE2, 0xA7, 0xE3, 0xA7, 0xE4, 0xA7, 0xE5, /* 0x40-0x43 */ + 0xA7, 0xE6, 0xA7, 0xE7, 0xA7, 0xE8, 0xA7, 0xE9, /* 0x44-0x47 */ + 0xA7, 0xEA, 0xA7, 0xEB, 0xA7, 0xEC, 0xA7, 0xED, /* 0x48-0x4B */ + 0xA7, 0xEE, 0xA7, 0xEF, 0xA7, 0xF0, 0xA7, 0xF1, /* 0x4C-0x4F */ + 0x00, 0x00, 0xA7, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA9, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x43, /* 0x10-0x13 */ + 0xA1, 0xAA, 0xA8, 0x44, 0xA1, 0xAC, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xAE, 0xA1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xB0, 0xA1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xA8, 0x45, 0xA1, 0xAD, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA1, 0xEB, 0x00, 0x00, 0xA1, 0xE4, 0xA1, 0xE5, /* 0x30-0x33 */ + 0x00, 0x00, 0xA8, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF9, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA3, 0xFE, 0x00, 0x00, /* 0x3C-0x3F */ +}; + +static const unsigned char u2c_21[512] = 
{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE6, /* 0x00-0x03 */ + 0x00, 0x00, 0xA8, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA8, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xED, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA9, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xF1, 0xA2, 0xF2, 0xA2, 0xF3, 0xA2, 0xF4, /* 0x60-0x63 */ + 0xA2, 0xF5, 0xA2, 0xF6, 0xA2, 0xF7, 0xA2, 0xF8, /* 0x64-0x67 */ + 0xA2, 0xF9, 0xA2, 0xFA, 0xA2, 0xFB, 0xA2, 0xFC, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xA2, 0xA1, 0xA2, 0xA2, 0xA2, 0xA3, 0xA2, 0xA4, /* 0x70-0x73 */ + 0xA2, 0xA5, 0xA2, 0xA6, 0xA2, 0xA7, 0xA2, 0xA8, /* 0x74-0x77 */ + 0xA2, 0xA9, 0xA2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA1, 0xFB, 0xA1, 0xFC, 0xA1, 0xFA, 0xA1, 0xFD, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0x49, 0xA8, 0x4A, /* 0x94-0x97 */ + 0xA8, 0x4B, 0xA8, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_22[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA1, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC7, /* 0x0C-0x0F */ + 0x00, 0x00, 0xA1, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA8, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xE3, 0x00, 0x00, 0xA1, 0xCC, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xA1, 0xD8, 0xA1, 0xDE, 0xA8, 0x4E, /* 0x1C-0x1F */ + 0xA1, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x4F, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0xA1, 0xC4, /* 0x24-0x27 */ + 0xA1, 0xC5, 0xA1, 0xC9, 0xA1, 0xC8, 0xA1, 0xD2, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD3, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA1, 0xE0, 0xA1, 0xDF, 0xA1, 0xC3, 0xA1, 0xCB, /* 0x34-0x37 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA1, 0xAB, 0xA1, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xA1, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xA1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0x50, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA1, 0xD9, 0xA1, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA1, 0xDC, 0xA1, 0xDD, 0xA8, 0x51, 0xA8, 0x52, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDA, 0xA1, 0xDB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA8, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xA1, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x53, /* 0xBC-0xBF */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD0, 0x00, 0x00, /* 0x10-0x13 */ +}; + +static const unsigned char u2c_24[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xD9, 0xA2, 0xDA, 0xA2, 0xDB, 0xA2, 0xDC, /* 0x60-0x63 */ + 0xA2, 0xDD, 0xA2, 0xDE, 0xA2, 0xDF, 0xA2, 0xE0, /* 0x64-0x67 */ + 0xA2, 0xE1, 0xA2, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA2, 0xC5, 0xA2, 0xC6, 0xA2, 0xC7, 0xA2, 0xC8, /* 0x74-0x77 */ + 0xA2, 0xC9, 0xA2, 0xCA, 0xA2, 0xCB, 0xA2, 0xCC, /* 0x78-0x7B */ + 0xA2, 0xCD, 0xA2, 0xCE, 0xA2, 0xCF, 0xA2, 0xD0, /* 0x7C-0x7F */ + + 0xA2, 0xD1, 0xA2, 0xD2, 0xA2, 0xD3, 0xA2, 0xD4, /* 0x80-0x83 */ + 0xA2, 0xD5, 0xA2, 0xD6, 0xA2, 0xD7, 0xA2, 0xD8, /* 0x84-0x87 */ + 0xA2, 0xB1, 0xA2, 0xB2, 0xA2, 0xB3, 0xA2, 0xB4, /* 0x88-0x8B */ + 0xA2, 0xB5, 0xA2, 0xB6, 0xA2, 0xB7, 0xA2, 0xB8, /* 0x8C-0x8F */ + 0xA2, 0xB9, 0xA2, 0xBA, 0xA2, 0xBB, 0xA2, 0xBC, /* 0x90-0x93 */ + 0xA2, 0xBD, 0xA2, 0xBE, 0xA2, 0xBF, 0xA2, 0xC0, /* 0x94-0x97 */ + 0xA2, 0xC1, 0xA2, 0xC2, 0xA2, 0xC3, 0xA2, 0xC4, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_25[512] = { + 0xA9, 0xA4, 0xA9, 0xA5, 0xA9, 0xA6, 0xA9, 0xA7, /* 0x00-0x03 */ + 0xA9, 0xA8, 0xA9, 0xA9, 0xA9, 0xAA, 0xA9, 0xAB, /* 0x04-0x07 */ + 0xA9, 0xAC, 0xA9, 0xAD, 0xA9, 0xAE, 0xA9, 0xAF, /* 0x08-0x0B */ + 0xA9, 0xB0, 0xA9, 0xB1, 0xA9, 0xB2, 0xA9, 0xB3, /* 0x0C-0x0F */ + 0xA9, 0xB4, 0xA9, 0xB5, 0xA9, 0xB6, 0xA9, 0xB7, /* 0x10-0x13 */ + 0xA9, 0xB8, 0xA9, 0xB9, 0xA9, 0xBA, 0xA9, 0xBB, /* 0x14-0x17 */ + 0xA9, 0xBC, 0xA9, 0xBD, 0xA9, 0xBE, 0xA9, 0xBF, /* 0x18-0x1B */ + 0xA9, 0xC0, 0xA9, 0xC1, 0xA9, 0xC2, 0xA9, 0xC3, /* 0x1C-0x1F */ + 0xA9, 0xC4, 0xA9, 0xC5, 0xA9, 0xC6, 0xA9, 0xC7, /* 0x20-0x23 */ + 0xA9, 0xC8, 0xA9, 0xC9, 0xA9, 0xCA, 0xA9, 0xCB, /* 0x24-0x27 */ + 0xA9, 0xCC, 0xA9, 0xCD, 0xA9, 0xCE, 0xA9, 0xCF, /* 0x28-0x2B */ + 0xA9, 0xD0, 0xA9, 0xD1, 0xA9, 0xD2, 0xA9, 0xD3, /* 0x2C-0x2F */ + 0xA9, 0xD4, 0xA9, 0xD5, 0xA9, 0xD6, 0xA9, 0xD7, /* 0x30-0x33 */ + 0xA9, 0xD8, 0xA9, 0xD9, 0xA9, 0xDA, 0xA9, 0xDB, /* 0x34-0x37 */ + 0xA9, 0xDC, 0xA9, 0xDD, 0xA9, 0xDE, 0xA9, 0xDF, /* 0x38-0x3B */ + 0xA9, 0xE0, 0xA9, 0xE1, 0xA9, 0xE2, 0xA9, 0xE3, /* 0x3C-0x3F */ + 0xA9, 0xE4, 0xA9, 0xE5, 0xA9, 0xE6, 0xA9, 0xE7, /* 0x40-0x43 */ + 0xA9, 0xE8, 0xA9, 0xE9, 0xA9, 0xEA, 0xA9, 0xEB, /* 0x44-0x47 */ + 0xA9, 0xEC, 0xA9, 0xED, 0xA9, 0xEE, 0xA9, 0xEF, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xA8, 0x54, 0xA8, 0x55, 0xA8, 0x56, 0xA8, 0x57, /* 0x50-0x53 */ + 0xA8, 0x58, 0xA8, 0x59, 0xA8, 0x5A, 0xA8, 0x5B, /* 0x54-0x57 */ + 0xA8, 0x5C, 0xA8, 0x5D, 0xA8, 0x5E, 0xA8, 0x5F, /* 0x58-0x5B */ + 0xA8, 0x60, 0xA8, 0x61, 0xA8, 0x62, 0xA8, 0x63, /* 0x5C-0x5F */ + 0xA8, 0x64, 0xA8, 0x65, 0xA8, 0x66, 0xA8, 0x67, /* 0x60-0x63 */ + 0xA8, 0x68, 0xA8, 0x69, 0xA8, 0x6A, 0xA8, 0x6B, /* 0x64-0x67 */ + 0xA8, 0x6C, 0xA8, 0x6D, 0xA8, 0x6E, 0xA8, 0x6F, /* 0x68-0x6B */ + 0xA8, 0x70, 0xA8, 
0x71, 0xA8, 0x72, 0xA8, 0x73, /* 0x6C-0x6F */ + 0xA8, 0x74, 0xA8, 0x75, 0xA8, 0x76, 0xA8, 0x77, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xA8, 0x78, 0xA8, 0x79, 0xA8, 0x7A, /* 0x80-0x83 */ + 0xA8, 0x7B, 0xA8, 0x7C, 0xA8, 0x7D, 0xA8, 0x7E, /* 0x84-0x87 */ + 0xA8, 0x80, 0xA8, 0x81, 0xA8, 0x82, 0xA8, 0x83, /* 0x88-0x8B */ + 0xA8, 0x84, 0xA8, 0x85, 0xA8, 0x86, 0xA8, 0x87, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x88, /* 0x90-0x93 */ + 0xA8, 0x89, 0xA8, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA1, 0xF6, 0xA1, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF7, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xA8, 0x8B, 0xA8, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF4, 0xA1, 0xF3, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF0, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF2, 0xA1, 0xF1, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0x8D, 0xA8, 0x8E, /* 0xE0-0xE3 */ + 0xA8, 0x8F, 0xA8, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xEF, 0xA1, 0xEE, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA8, 0x91, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA1, 0xE2, 0x00, 
0x00, 0xA1, 0xE1, 0x00, 0x00, /* 0x40-0x43 */ +}; + +static const unsigned char u2c_30[512] = { + 0xA1, 0xA1, 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA8, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xA9, 0xA9, 0x65, 0xA9, 0x96, /* 0x04-0x07 */ + 0xA1, 0xB4, 0xA1, 0xB5, 0xA1, 0xB6, 0xA1, 0xB7, /* 0x08-0x0B */ + 0xA1, 0xB8, 0xA1, 0xB9, 0xA1, 0xBA, 0xA1, 0xBB, /* 0x0C-0x0F */ + 0xA1, 0xBE, 0xA1, 0xBF, 0xA8, 0x93, 0xA1, 0xFE, /* 0x10-0x13 */ + 0xA1, 0xB2, 0xA1, 0xB3, 0xA1, 0xBC, 0xA1, 0xBD, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xA8, 0x94, 0xA8, 0x95, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA9, 0x40, 0xA9, 0x41, 0xA9, 0x42, /* 0x20-0x23 */ + 0xA9, 0x43, 0xA9, 0x44, 0xA9, 0x45, 0xA9, 0x46, /* 0x24-0x27 */ + 0xA9, 0x47, 0xA9, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0x40-0x43 */ + 0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0x44-0x47 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0x48-0x4B */ + 0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0x4C-0x4F */ + 0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0x50-0x53 */ + 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x54-0x57 */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x58-0x5B */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0xA4, 0xBF, /* 0x5C-0x5F */ + 0xA4, 0xC0, 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, /* 0x60-0x63 */ + 0xA4, 0xC4, 0xA4, 0xC5, 0xA4, 0xC6, 0xA4, 0xC7, /* 0x64-0x67 */ + 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, 0xA4, 0xCB, /* 0x68-0x6B */ + 0xA4, 0xCC, 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, /* 0x6C-0x6F */ + 0xA4, 0xD0, 0xA4, 0xD1, 0xA4, 0xD2, 0xA4, 0xD3, /* 0x70-0x73 */ + 0xA4, 0xD4, 0xA4, 0xD5, 0xA4, 0xD6, 0xA4, 0xD7, /* 0x74-0x77 */ + 0xA4, 0xD8, 0xA4, 0xD9, 0xA4, 0xDA, 0xA4, 0xDB, /* 0x78-0x7B */ + 0xA4, 0xDC, 0xA4, 0xDD, 0xA4, 0xDE, 0xA4, 0xDF, /* 0x7C-0x7F */ + + 0xA4, 0xE0, 0xA4, 0xE1, 0xA4, 0xE2, 0xA4, 0xE3, /* 0x80-0x83 */ + 0xA4, 0xE4, 0xA4, 0xE5, 0xA4, 0xE6, 0xA4, 0xE7, /* 0x84-0x87 */ + 0xA4, 0xE8, 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, /* 0x88-0x8B */ + 0xA4, 0xEC, 0xA4, 0xED, 0xA4, 0xEE, 0xA4, 0xEF, /* 0x8C-0x8F */ + 0xA4, 0xF0, 0xA4, 0xF1, 0xA4, 0xF2, 0xA4, 0xF3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA9, 0x61, /* 0x98-0x9B */ + 0xA9, 0x62, 0xA9, 0x66, 0xA9, 0x67, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA5, 0xA1, 0xA5, 0xA2, 0xA5, 0xA3, /* 0xA0-0xA3 */ + 0xA5, 0xA4, 0xA5, 0xA5, 0xA5, 0xA6, 0xA5, 0xA7, /* 0xA4-0xA7 */ + 0xA5, 0xA8, 0xA5, 0xA9, 0xA5, 0xAA, 0xA5, 0xAB, /* 0xA8-0xAB */ + 0xA5, 0xAC, 0xA5, 0xAD, 0xA5, 0xAE, 0xA5, 0xAF, /* 0xAC-0xAF */ + 0xA5, 0xB0, 0xA5, 0xB1, 0xA5, 0xB2, 0xA5, 0xB3, /* 0xB0-0xB3 */ + 0xA5, 0xB4, 0xA5, 0xB5, 0xA5, 0xB6, 0xA5, 0xB7, /* 0xB4-0xB7 */ + 0xA5, 0xB8, 0xA5, 0xB9, 0xA5, 0xBA, 0xA5, 0xBB, /* 0xB8-0xBB */ + 0xA5, 0xBC, 0xA5, 0xBD, 0xA5, 0xBE, 0xA5, 0xBF, /* 0xBC-0xBF */ + 0xA5, 0xC0, 0xA5, 0xC1, 0xA5, 0xC2, 0xA5, 0xC3, /* 0xC0-0xC3 */ + 0xA5, 0xC4, 0xA5, 0xC5, 0xA5, 0xC6, 0xA5, 0xC7, /* 0xC4-0xC7 */ + 0xA5, 0xC8, 0xA5, 0xC9, 0xA5, 0xCA, 0xA5, 0xCB, /* 0xC8-0xCB */ + 0xA5, 0xCC, 0xA5, 0xCD, 0xA5, 0xCE, 0xA5, 0xCF, /* 0xCC-0xCF */ + 0xA5, 0xD0, 0xA5, 
0xD1, 0xA5, 0xD2, 0xA5, 0xD3, /* 0xD0-0xD3 */ + 0xA5, 0xD4, 0xA5, 0xD5, 0xA5, 0xD6, 0xA5, 0xD7, /* 0xD4-0xD7 */ + 0xA5, 0xD8, 0xA5, 0xD9, 0xA5, 0xDA, 0xA5, 0xDB, /* 0xD8-0xDB */ + 0xA5, 0xDC, 0xA5, 0xDD, 0xA5, 0xDE, 0xA5, 0xDF, /* 0xDC-0xDF */ + 0xA5, 0xE0, 0xA5, 0xE1, 0xA5, 0xE2, 0xA5, 0xE3, /* 0xE0-0xE3 */ + 0xA5, 0xE4, 0xA5, 0xE5, 0xA5, 0xE6, 0xA5, 0xE7, /* 0xE4-0xE7 */ + 0xA5, 0xE8, 0xA5, 0xE9, 0xA5, 0xEA, 0xA5, 0xEB, /* 0xE8-0xEB */ + 0xA5, 0xEC, 0xA5, 0xED, 0xA5, 0xEE, 0xA5, 0xEF, /* 0xEC-0xEF */ + 0xA5, 0xF0, 0xA5, 0xF1, 0xA5, 0xF2, 0xA5, 0xF3, /* 0xF0-0xF3 */ + 0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xA9, 0x60, 0xA9, 0x63, 0xA9, 0x64, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_31[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA8, 0xC5, 0xA8, 0xC6, 0xA8, 0xC7, /* 0x04-0x07 */ + 0xA8, 0xC8, 0xA8, 0xC9, 0xA8, 0xCA, 0xA8, 0xCB, /* 0x08-0x0B */ + 0xA8, 0xCC, 0xA8, 0xCD, 0xA8, 0xCE, 0xA8, 0xCF, /* 0x0C-0x0F */ + 0xA8, 0xD0, 0xA8, 0xD1, 0xA8, 0xD2, 0xA8, 0xD3, /* 0x10-0x13 */ + 0xA8, 0xD4, 0xA8, 0xD5, 0xA8, 0xD6, 0xA8, 0xD7, /* 0x14-0x17 */ + 0xA8, 0xD8, 0xA8, 0xD9, 0xA8, 0xDA, 0xA8, 0xDB, /* 0x18-0x1B */ + 0xA8, 0xDC, 0xA8, 0xDD, 0xA8, 0xDE, 0xA8, 0xDF, /* 0x1C-0x1F */ + 0xA8, 0xE0, 0xA8, 0xE1, 0xA8, 0xE2, 0xA8, 0xE3, /* 0x20-0x23 */ + 0xA8, 0xE4, 0xA8, 0xE5, 0xA8, 0xE6, 0xA8, 0xE7, /* 0x24-0x27 */ + 0xA8, 0xE8, 0xA8, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xBB, 0xB6, 0xFE, /* 0x90-0x93 */ + 0xC8, 0xFD, 0xCB, 0xC4, 0xC9, 0xCF, 0xD6, 0xD0, /* 0x94-0x97 */ + 0xCF, 0xC2, 0xBC, 0xD7, 0xD2, 0xD2, 0xB1, 0xFB, /* 0x98-0x9B */ + 0xB6, 0xA1, 0xCC, 0xEC, 0xB5, 0xD8, 0xC8, 0xCB, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_32[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA2, 0xE5, 0xA2, 0xE6, 0xA2, 0xE7, 0xA2, 0xE8, /* 0x20-0x23 */ + 0xA2, 0xE9, 0xA2, 0xEA, 0xA2, 0xEB, 0xA2, 0xEC, /* 0x24-0x27 */ + 0xA2, 0xED, 0xA2, 0xEE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x28-0x2B */ + 0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x2C-0x2F */ + 0xC8, 0xD5, 0xA9, 0x5A, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x30-0x33 */ + 0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x34-0x37 */ + 0xC0, 0xCD, 0xB4, 0xFA, 0xBA, 0xF4, 0xD1, 0xA7, /* 0x38-0x3B */ + 0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0x3C-0x3F */ + 0xBC, 0xC0, 0xD0, 0xDD, 0xD7, 0xD4, 0xD6, 0xC1, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD2, 0xBB, 0xB6, 0xFE, 0xC8, 0xFD, 0xCB, 0xC4, /* 0x80-0x83 */ + 0xCE, 0xE5, 0xC1, 0xF9, 0xC6, 0xDF, 0xB0, 0xCB, /* 0x84-0x87 */ + 0xBE, 0xC5, 0xCA, 0xAE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x88-0x8B */ + 0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x8C-0x8F */ + 0xC8, 0xD5, 0xD6, 0xEA, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x90-0x93 */ + 0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x94-0x97 */ + 0xC0, 0xCD, 0xC3, 0xD8, 0xC4, 0xD0, 0xC5, 0xAE, /* 0x98-0x9B */ + 0xCA, 0xCA, 0xD3, 0xC5, 0x00, 0x00, 0xD7, 0xA2, /* 0x9C-0x9F */ + 0xCF, 0xEE, 0xD0, 0xDD, 0xD0, 0xB4, 0xA9, 0x49, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD2, 0xBD, 0xD7, 0xDA, 0xD1, 0xA7, /* 0xA8-0xAB */ + 0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0xAC-0xAF */ + 0xD2, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x4A, 0xA9, 0x4B, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xA9, 0x4C, 0xA9, 0x4D, 0xA9, 0x4E, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA9, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA9, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x51, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xA9, 0x52, 0xA9, 0x53, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xA9, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_4E[512] = { + 0xD2, 0xBB, 0xB6, 0xA1, 0x81, 0x40, 0xC6, 0xDF, /* 0x00-0x03 */ + 0x81, 0x41, 0x81, 0x42, 0x81, 0x43, 0xCD, 0xF2, /* 0x04-0x07 */ + 0xD5, 0xC9, 0xC8, 0xFD, 0xC9, 0xCF, 0xCF, 0xC2, /* 0x08-0x0B */ + 0xD8, 0xA2, 0xB2, 0xBB, 0xD3, 0xEB, 0x81, 0x44, /* 0x0C-0x0F */ + 0xD8, 0xA4, 0xB3, 0xF3, 0x81, 0x45, 0xD7, 0xA8, /* 0x10-0x13 */ + 0xC7, 0xD2, 0xD8, 0xA7, 0xCA, 0xC0, 0x81, 0x46, /* 0x14-0x17 */ + 0xC7, 0xF0, 0xB1, 0xFB, 0xD2, 0xB5, 0xB4, 0xD4, /* 0x18-0x1B */ + 0xB6, 0xAB, 0xCB, 0xBF, 0xD8, 0xA9, 
0x81, 0x47, /* 0x1C-0x1F */ + 0x81, 0x48, 0x81, 0x49, 0xB6, 0xAA, 0x81, 0x4A, /* 0x20-0x23 */ + 0xC1, 0xBD, 0xD1, 0xCF, 0x81, 0x4B, 0xC9, 0xA5, /* 0x24-0x27 */ + 0xD8, 0xAD, 0x81, 0x4C, 0xB8, 0xF6, 0xD1, 0xBE, /* 0x28-0x2B */ + 0xE3, 0xDC, 0xD6, 0xD0, 0x81, 0x4D, 0x81, 0x4E, /* 0x2C-0x2F */ + 0xB7, 0xE1, 0x81, 0x4F, 0xB4, 0xAE, 0x81, 0x50, /* 0x30-0x33 */ + 0xC1, 0xD9, 0x81, 0x51, 0xD8, 0xBC, 0x81, 0x52, /* 0x34-0x37 */ + 0xCD, 0xE8, 0xB5, 0xA4, 0xCE, 0xAA, 0xD6, 0xF7, /* 0x38-0x3B */ + 0x81, 0x53, 0xC0, 0xF6, 0xBE, 0xD9, 0xD8, 0xAF, /* 0x3C-0x3F */ + 0x81, 0x54, 0x81, 0x55, 0x81, 0x56, 0xC4, 0xCB, /* 0x40-0x43 */ + 0x81, 0x57, 0xBE, 0xC3, 0x81, 0x58, 0xD8, 0xB1, /* 0x44-0x47 */ + 0xC3, 0xB4, 0xD2, 0xE5, 0x81, 0x59, 0xD6, 0xAE, /* 0x48-0x4B */ + 0xCE, 0xDA, 0xD5, 0xA7, 0xBA, 0xF5, 0xB7, 0xA6, /* 0x4C-0x4F */ + 0xC0, 0xD6, 0x81, 0x5A, 0xC6, 0xB9, 0xC5, 0xD2, /* 0x50-0x53 */ + 0xC7, 0xC7, 0x81, 0x5B, 0xB9, 0xD4, 0x81, 0x5C, /* 0x54-0x57 */ + 0xB3, 0xCB, 0xD2, 0xD2, 0x81, 0x5D, 0x81, 0x5E, /* 0x58-0x5B */ + 0xD8, 0xBF, 0xBE, 0xC5, 0xC6, 0xF2, 0xD2, 0xB2, /* 0x5C-0x5F */ + 0xCF, 0xB0, 0xCF, 0xE7, 0x81, 0x5F, 0x81, 0x60, /* 0x60-0x63 */ + 0x81, 0x61, 0x81, 0x62, 0xCA, 0xE9, 0x81, 0x63, /* 0x64-0x67 */ + 0x81, 0x64, 0xD8, 0xC0, 0x81, 0x65, 0x81, 0x66, /* 0x68-0x6B */ + 0x81, 0x67, 0x81, 0x68, 0x81, 0x69, 0x81, 0x6A, /* 0x6C-0x6F */ + 0xC2, 0xF2, 0xC2, 0xD2, 0x81, 0x6B, 0xC8, 0xE9, /* 0x70-0x73 */ + 0x81, 0x6C, 0x81, 0x6D, 0x81, 0x6E, 0x81, 0x6F, /* 0x74-0x77 */ + 0x81, 0x70, 0x81, 0x71, 0x81, 0x72, 0x81, 0x73, /* 0x78-0x7B */ + 0x81, 0x74, 0x81, 0x75, 0xC7, 0xAC, 0x81, 0x76, /* 0x7C-0x7F */ + + 0x81, 0x77, 0x81, 0x78, 0x81, 0x79, 0x81, 0x7A, /* 0x80-0x83 */ + 0x81, 0x7B, 0x81, 0x7C, 0xC1, 0xCB, 0x81, 0x7D, /* 0x84-0x87 */ + 0xD3, 0xE8, 0xD5, 0xF9, 0x81, 0x7E, 0xCA, 0xC2, /* 0x88-0x8B */ + 0xB6, 0xFE, 0xD8, 0xA1, 0xD3, 0xDA, 0xBF, 0xF7, /* 0x8C-0x8F */ + 0x81, 0x80, 0xD4, 0xC6, 0xBB, 0xA5, 0xD8, 0xC1, /* 0x90-0x93 */ + 0xCE, 0xE5, 0xBE, 0xAE, 0x81, 0x81, 0x81, 0x82, /* 0x94-0x97 */ + 0xD8, 0xA8, 0x81, 0x83, 0xD1, 0xC7, 0xD0, 0xA9, /* 0x98-0x9B */ + 0x81, 0x84, 0x81, 0x85, 0x81, 0x86, 0xD8, 0xBD, /* 0x9C-0x9F */ + 0xD9, 0xEF, 0xCD, 0xF6, 0xBF, 0xBA, 0x81, 0x87, /* 0xA0-0xA3 */ + 0xBD, 0xBB, 0xBA, 0xA5, 0xD2, 0xE0, 0xB2, 0xFA, /* 0xA4-0xA7 */ + 0xBA, 0xE0, 0xC4, 0xB6, 0x81, 0x88, 0xCF, 0xED, /* 0xA8-0xAB */ + 0xBE, 0xA9, 0xCD, 0xA4, 0xC1, 0xC1, 0x81, 0x89, /* 0xAC-0xAF */ + 0x81, 0x8A, 0x81, 0x8B, 0xC7, 0xD7, 0xD9, 0xF1, /* 0xB0-0xB3 */ + 0x81, 0x8C, 0xD9, 0xF4, 0x81, 0x8D, 0x81, 0x8E, /* 0xB4-0xB7 */ + 0x81, 0x8F, 0x81, 0x90, 0xC8, 0xCB, 0xD8, 0xE9, /* 0xB8-0xBB */ + 0x81, 0x91, 0x81, 0x92, 0x81, 0x93, 0xD2, 0xDA, /* 0xBC-0xBF */ + 0xCA, 0xB2, 0xC8, 0xCA, 0xD8, 0xEC, 0xD8, 0xEA, /* 0xC0-0xC3 */ + 0xD8, 0xC6, 0xBD, 0xF6, 0xC6, 0xCD, 0xB3, 0xF0, /* 0xC4-0xC7 */ + 0x81, 0x94, 0xD8, 0xEB, 0xBD, 0xF1, 0xBD, 0xE9, /* 0xC8-0xCB */ + 0x81, 0x95, 0xC8, 0xD4, 0xB4, 0xD3, 0x81, 0x96, /* 0xCC-0xCF */ + 0x81, 0x97, 0xC2, 0xD8, 0x81, 0x98, 0xB2, 0xD6, /* 0xD0-0xD3 */ + 0xD7, 0xD0, 0xCA, 0xCB, 0xCB, 0xFB, 0xD5, 0xCC, /* 0xD4-0xD7 */ + 0xB8, 0xB6, 0xCF, 0xC9, 0x81, 0x99, 0x81, 0x9A, /* 0xD8-0xDB */ + 0x81, 0x9B, 0xD9, 0xDA, 0xD8, 0xF0, 0xC7, 0xAA, /* 0xDC-0xDF */ + 0x81, 0x9C, 0xD8, 0xEE, 0x81, 0x9D, 0xB4, 0xFA, /* 0xE0-0xE3 */ + 0xC1, 0xEE, 0xD2, 0xD4, 0x81, 0x9E, 0x81, 0x9F, /* 0xE4-0xE7 */ + 0xD8, 0xED, 0x81, 0xA0, 0xD2, 0xC7, 0xD8, 0xEF, /* 0xE8-0xEB */ + 0xC3, 0xC7, 0x81, 0xA1, 0x81, 0xA2, 0x81, 0xA3, /* 0xEC-0xEF */ + 0xD1, 0xF6, 0x81, 0xA4, 0xD6, 0xD9, 0xD8, 0xF2, /* 0xF0-0xF3 */ + 0x81, 0xA5, 0xD8, 0xF5, 
0xBC, 0xFE, 0xBC, 0xDB, /* 0xF4-0xF7 */ + 0x81, 0xA6, 0x81, 0xA7, 0x81, 0xA8, 0xC8, 0xCE, /* 0xF8-0xFB */ + 0x81, 0xA9, 0xB7, 0xDD, 0x81, 0xAA, 0xB7, 0xC2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_4F[512] = { + 0x81, 0xAB, 0xC6, 0xF3, 0x81, 0xAC, 0x81, 0xAD, /* 0x00-0x03 */ + 0x81, 0xAE, 0x81, 0xAF, 0x81, 0xB0, 0x81, 0xB1, /* 0x04-0x07 */ + 0x81, 0xB2, 0xD8, 0xF8, 0xD2, 0xC1, 0x81, 0xB3, /* 0x08-0x0B */ + 0x81, 0xB4, 0xCE, 0xE9, 0xBC, 0xBF, 0xB7, 0xFC, /* 0x0C-0x0F */ + 0xB7, 0xA5, 0xD0, 0xDD, 0x81, 0xB5, 0x81, 0xB6, /* 0x10-0x13 */ + 0x81, 0xB7, 0x81, 0xB8, 0x81, 0xB9, 0xD6, 0xDA, /* 0x14-0x17 */ + 0xD3, 0xC5, 0xBB, 0xEF, 0xBB, 0xE1, 0xD8, 0xF1, /* 0x18-0x1B */ + 0x81, 0xBA, 0x81, 0xBB, 0xC9, 0xA1, 0xCE, 0xB0, /* 0x1C-0x1F */ + 0xB4, 0xAB, 0x81, 0xBC, 0xD8, 0xF3, 0x81, 0xBD, /* 0x20-0x23 */ + 0xC9, 0xCB, 0xD8, 0xF6, 0xC2, 0xD7, 0xD8, 0xF7, /* 0x24-0x27 */ + 0x81, 0xBE, 0x81, 0xBF, 0xCE, 0xB1, 0xD8, 0xF9, /* 0x28-0x2B */ + 0x81, 0xC0, 0x81, 0xC1, 0x81, 0xC2, 0xB2, 0xAE, /* 0x2C-0x2F */ + 0xB9, 0xC0, 0x81, 0xC3, 0xD9, 0xA3, 0x81, 0xC4, /* 0x30-0x33 */ + 0xB0, 0xE9, 0x81, 0xC5, 0xC1, 0xE6, 0x81, 0xC6, /* 0x34-0x37 */ + 0xC9, 0xEC, 0x81, 0xC7, 0xCB, 0xC5, 0x81, 0xC8, /* 0x38-0x3B */ + 0xCB, 0xC6, 0xD9, 0xA4, 0x81, 0xC9, 0x81, 0xCA, /* 0x3C-0x3F */ + 0x81, 0xCB, 0x81, 0xCC, 0x81, 0xCD, 0xB5, 0xE8, /* 0x40-0x43 */ + 0x81, 0xCE, 0x81, 0xCF, 0xB5, 0xAB, 0x81, 0xD0, /* 0x44-0x47 */ + 0x81, 0xD1, 0x81, 0xD2, 0x81, 0xD3, 0x81, 0xD4, /* 0x48-0x4B */ + 0x81, 0xD5, 0xCE, 0xBB, 0xB5, 0xCD, 0xD7, 0xA1, /* 0x4C-0x4F */ + 0xD7, 0xF4, 0xD3, 0xD3, 0x81, 0xD6, 0xCC, 0xE5, /* 0x50-0x53 */ + 0x81, 0xD7, 0xBA, 0xCE, 0x81, 0xD8, 0xD9, 0xA2, /* 0x54-0x57 */ + 0xD9, 0xDC, 0xD3, 0xE0, 0xD8, 0xFD, 0xB7, 0xF0, /* 0x58-0x5B */ + 0xD7, 0xF7, 0xD8, 0xFE, 0xD8, 0xFA, 0xD9, 0xA1, /* 0x5C-0x5F */ + 0xC4, 0xE3, 0x81, 0xD9, 0x81, 0xDA, 0xD3, 0xB6, /* 0x60-0x63 */ + 0xD8, 0xF4, 0xD9, 0xDD, 0x81, 0xDB, 0xD8, 0xFB, /* 0x64-0x67 */ + 0x81, 0xDC, 0xC5, 0xE5, 0x81, 0xDD, 0x81, 0xDE, /* 0x68-0x6B */ + 0xC0, 0xD0, 0x81, 0xDF, 0x81, 0xE0, 0xD1, 0xF0, /* 0x6C-0x6F */ + 0xB0, 0xDB, 0x81, 0xE1, 0x81, 0xE2, 0xBC, 0xD1, /* 0x70-0x73 */ + 0xD9, 0xA6, 0x81, 0xE3, 0xD9, 0xA5, 0x81, 0xE4, /* 0x74-0x77 */ + 0x81, 0xE5, 0x81, 0xE6, 0x81, 0xE7, 0xD9, 0xAC, /* 0x78-0x7B */ + 0xD9, 0xAE, 0x81, 0xE8, 0xD9, 0xAB, 0xCA, 0xB9, /* 0x7C-0x7F */ + + 0x81, 0xE9, 0x81, 0xEA, 0x81, 0xEB, 0xD9, 0xA9, /* 0x80-0x83 */ + 0xD6, 0xB6, 0x81, 0xEC, 0x81, 0xED, 0x81, 0xEE, /* 0x84-0x87 */ + 0xB3, 0xDE, 0xD9, 0xA8, 0x81, 0xEF, 0xC0, 0xFD, /* 0x88-0x8B */ + 0x81, 0xF0, 0xCA, 0xCC, 0x81, 0xF1, 0xD9, 0xAA, /* 0x8C-0x8F */ + 0x81, 0xF2, 0xD9, 0xA7, 0x81, 0xF3, 0x81, 0xF4, /* 0x90-0x93 */ + 0xD9, 0xB0, 0x81, 0xF5, 0x81, 0xF6, 0xB6, 0xB1, /* 0x94-0x97 */ + 0x81, 0xF7, 0x81, 0xF8, 0x81, 0xF9, 0xB9, 0xA9, /* 0x98-0x9B */ + 0x81, 0xFA, 0xD2, 0xC0, 0x81, 0xFB, 0x81, 0xFC, /* 0x9C-0x9F */ + 0xCF, 0xC0, 0x81, 0xFD, 0x81, 0xFE, 0xC2, 0xC2, /* 0xA0-0xA3 */ + 0x82, 0x40, 0xBD, 0xC4, 0xD5, 0xEC, 0xB2, 0xE0, /* 0xA4-0xA7 */ + 0xC7, 0xC8, 0xBF, 0xEB, 0xD9, 0xAD, 0x82, 0x41, /* 0xA8-0xAB */ + 0xD9, 0xAF, 0x82, 0x42, 0xCE, 0xEA, 0xBA, 0xEE, /* 0xAC-0xAF */ + 0x82, 0x43, 0x82, 0x44, 0x82, 0x45, 0x82, 0x46, /* 0xB0-0xB3 */ + 0x82, 0x47, 0xC7, 0xD6, 0x82, 0x48, 0x82, 0x49, /* 0xB4-0xB7 */ + 0x82, 0x4A, 0x82, 0x4B, 0x82, 0x4C, 0x82, 0x4D, /* 0xB8-0xBB */ + 0x82, 0x4E, 0x82, 0x4F, 0x82, 0x50, 0xB1, 0xE3, /* 0xBC-0xBF */ + 0x82, 0x51, 0x82, 0x52, 0x82, 0x53, 0xB4, 0xD9, /* 0xC0-0xC3 */ + 0xB6, 0xED, 0xD9, 0xB4, 0x82, 0x54, 0x82, 0x55, /* 0xC4-0xC7 */ + 0x82, 0x56, 0x82, 0x57, 
0xBF, 0xA1, 0x82, 0x58, /* 0xC8-0xCB */ + 0x82, 0x59, 0x82, 0x5A, 0xD9, 0xDE, 0xC7, 0xCE, /* 0xCC-0xCF */ + 0xC0, 0xFE, 0xD9, 0xB8, 0x82, 0x5B, 0x82, 0x5C, /* 0xD0-0xD3 */ + 0x82, 0x5D, 0x82, 0x5E, 0x82, 0x5F, 0xCB, 0xD7, /* 0xD4-0xD7 */ + 0xB7, 0xFD, 0x82, 0x60, 0xD9, 0xB5, 0x82, 0x61, /* 0xD8-0xDB */ + 0xD9, 0xB7, 0xB1, 0xA3, 0xD3, 0xE1, 0xD9, 0xB9, /* 0xDC-0xDF */ + 0x82, 0x62, 0xD0, 0xC5, 0x82, 0x63, 0xD9, 0xB6, /* 0xE0-0xE3 */ + 0x82, 0x64, 0x82, 0x65, 0xD9, 0xB1, 0x82, 0x66, /* 0xE4-0xE7 */ + 0xD9, 0xB2, 0xC1, 0xA9, 0xD9, 0xB3, 0x82, 0x67, /* 0xE8-0xEB */ + 0x82, 0x68, 0xBC, 0xF3, 0xD0, 0xDE, 0xB8, 0xA9, /* 0xEC-0xEF */ + 0x82, 0x69, 0xBE, 0xE3, 0x82, 0x6A, 0xD9, 0xBD, /* 0xF0-0xF3 */ + 0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E, /* 0xF4-0xF7 */ + 0xD9, 0xBA, 0x82, 0x6F, 0xB0, 0xB3, 0x82, 0x70, /* 0xF8-0xFB */ + 0x82, 0x71, 0x82, 0x72, 0xD9, 0xC2, 0x82, 0x73, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_50[512] = { + 0x82, 0x74, 0x82, 0x75, 0x82, 0x76, 0x82, 0x77, /* 0x00-0x03 */ + 0x82, 0x78, 0x82, 0x79, 0x82, 0x7A, 0x82, 0x7B, /* 0x04-0x07 */ + 0x82, 0x7C, 0x82, 0x7D, 0x82, 0x7E, 0x82, 0x80, /* 0x08-0x0B */ + 0xD9, 0xC4, 0xB1, 0xB6, 0x82, 0x81, 0xD9, 0xBF, /* 0x0C-0x0F */ + 0x82, 0x82, 0x82, 0x83, 0xB5, 0xB9, 0x82, 0x84, /* 0x10-0x13 */ + 0xBE, 0xF3, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x14-0x17 */ + 0xCC, 0xC8, 0xBA, 0xF2, 0xD2, 0xD0, 0x82, 0x88, /* 0x18-0x1B */ + 0xD9, 0xC3, 0x82, 0x89, 0x82, 0x8A, 0xBD, 0xE8, /* 0x1C-0x1F */ + 0x82, 0x8B, 0xB3, 0xAB, 0x82, 0x8C, 0x82, 0x8D, /* 0x20-0x23 */ + 0x82, 0x8E, 0xD9, 0xC5, 0xBE, 0xEB, 0x82, 0x8F, /* 0x24-0x27 */ + 0xD9, 0xC6, 0xD9, 0xBB, 0xC4, 0xDF, 0x82, 0x90, /* 0x28-0x2B */ + 0xD9, 0xBE, 0xD9, 0xC1, 0xD9, 0xC0, 0x82, 0x91, /* 0x2C-0x2F */ + 0x82, 0x92, 0x82, 0x93, 0x82, 0x94, 0x82, 0x95, /* 0x30-0x33 */ + 0x82, 0x96, 0x82, 0x97, 0x82, 0x98, 0x82, 0x99, /* 0x34-0x37 */ + 0x82, 0x9A, 0x82, 0x9B, 0xD5, 0xAE, 0x82, 0x9C, /* 0x38-0x3B */ + 0xD6, 0xB5, 0x82, 0x9D, 0xC7, 0xE3, 0x82, 0x9E, /* 0x3C-0x3F */ + 0x82, 0x9F, 0x82, 0xA0, 0x82, 0xA1, 0xD9, 0xC8, /* 0x40-0x43 */ + 0x82, 0xA2, 0x82, 0xA3, 0x82, 0xA4, 0xBC, 0xD9, /* 0x44-0x47 */ + 0xD9, 0xCA, 0x82, 0xA5, 0x82, 0xA6, 0x82, 0xA7, /* 0x48-0x4B */ + 0xD9, 0xBC, 0x82, 0xA8, 0xD9, 0xCB, 0xC6, 0xAB, /* 0x4C-0x4F */ + 0x82, 0xA9, 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, /* 0x50-0x53 */ + 0x82, 0xAD, 0xD9, 0xC9, 0x82, 0xAE, 0x82, 0xAF, /* 0x54-0x57 */ + 0x82, 0xB0, 0x82, 0xB1, 0xD7, 0xF6, 0x82, 0xB2, /* 0x58-0x5B */ + 0xCD, 0xA3, 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, /* 0x5C-0x5F */ + 0x82, 0xB6, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0x60-0x63 */ + 0x82, 0xBA, 0xBD, 0xA1, 0x82, 0xBB, 0x82, 0xBC, /* 0x64-0x67 */ + 0x82, 0xBD, 0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, /* 0x68-0x6B */ + 0xD9, 0xCC, 0x82, 0xC1, 0x82, 0xC2, 0x82, 0xC3, /* 0x6C-0x6F */ + 0x82, 0xC4, 0x82, 0xC5, 0x82, 0xC6, 0x82, 0xC7, /* 0x70-0x73 */ + 0x82, 0xC8, 0x82, 0xC9, 0xC5, 0xBC, 0xCD, 0xB5, /* 0x74-0x77 */ + 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, 0xD9, 0xCD, /* 0x78-0x7B */ + 0x82, 0xCD, 0x82, 0xCE, 0xD9, 0xC7, 0xB3, 0xA5, /* 0x7C-0x7F */ + + 0xBF, 0xFE, 0x82, 0xCF, 0x82, 0xD0, 0x82, 0xD1, /* 0x80-0x83 */ + 0x82, 0xD2, 0xB8, 0xB5, 0x82, 0xD3, 0x82, 0xD4, /* 0x84-0x87 */ + 0xC0, 0xFC, 0x82, 0xD5, 0x82, 0xD6, 0x82, 0xD7, /* 0x88-0x8B */ + 0x82, 0xD8, 0xB0, 0xF8, 0x82, 0xD9, 0x82, 0xDA, /* 0x8C-0x8F */ + 0x82, 0xDB, 0x82, 0xDC, 0x82, 0xDD, 0x82, 0xDE, /* 0x90-0x93 */ + 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, 0x82, 0xE2, /* 0x94-0x97 */ + 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, 0x82, 0xE6, /* 0x98-0x9B */ + 0x82, 0xE7, 0x82, 0xE8, 
0x82, 0xE9, 0x82, 0xEA, /* 0x9C-0x9F */ + 0x82, 0xEB, 0x82, 0xEC, 0x82, 0xED, 0xB4, 0xF6, /* 0xA0-0xA3 */ + 0x82, 0xEE, 0xD9, 0xCE, 0x82, 0xEF, 0xD9, 0xCF, /* 0xA4-0xA7 */ + 0xB4, 0xA2, 0xD9, 0xD0, 0x82, 0xF0, 0x82, 0xF1, /* 0xA8-0xAB */ + 0xB4, 0xDF, 0x82, 0xF2, 0x82, 0xF3, 0x82, 0xF4, /* 0xAC-0xAF */ + 0x82, 0xF5, 0x82, 0xF6, 0xB0, 0xC1, 0x82, 0xF7, /* 0xB0-0xB3 */ + 0x82, 0xF8, 0x82, 0xF9, 0x82, 0xFA, 0x82, 0xFB, /* 0xB4-0xB7 */ + 0x82, 0xFC, 0x82, 0xFD, 0xD9, 0xD1, 0xC9, 0xB5, /* 0xB8-0xBB */ + 0x82, 0xFE, 0x83, 0x40, 0x83, 0x41, 0x83, 0x42, /* 0xBC-0xBF */ + 0x83, 0x43, 0x83, 0x44, 0x83, 0x45, 0x83, 0x46, /* 0xC0-0xC3 */ + 0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0xC4-0xC7 */ + 0x83, 0x4B, 0x83, 0x4C, 0x83, 0x4D, 0x83, 0x4E, /* 0xC8-0xCB */ + 0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0xCF, 0xF1, /* 0xCC-0xCF */ + 0x83, 0x52, 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, /* 0xD0-0xD3 */ + 0x83, 0x56, 0x83, 0x57, 0xD9, 0xD2, 0x83, 0x58, /* 0xD4-0xD7 */ + 0x83, 0x59, 0x83, 0x5A, 0xC1, 0xC5, 0x83, 0x5B, /* 0xD8-0xDB */ + 0x83, 0x5C, 0x83, 0x5D, 0x83, 0x5E, 0x83, 0x5F, /* 0xDC-0xDF */ + 0x83, 0x60, 0x83, 0x61, 0x83, 0x62, 0x83, 0x63, /* 0xE0-0xE3 */ + 0x83, 0x64, 0x83, 0x65, 0xD9, 0xD6, 0xC9, 0xAE, /* 0xE4-0xE7 */ + 0x83, 0x66, 0x83, 0x67, 0x83, 0x68, 0x83, 0x69, /* 0xE8-0xEB */ + 0xD9, 0xD5, 0xD9, 0xD4, 0xD9, 0xD7, 0x83, 0x6A, /* 0xEC-0xEF */ + 0x83, 0x6B, 0x83, 0x6C, 0x83, 0x6D, 0xCB, 0xDB, /* 0xF0-0xF3 */ + 0x83, 0x6E, 0xBD, 0xA9, 0x83, 0x6F, 0x83, 0x70, /* 0xF4-0xF7 */ + 0x83, 0x71, 0x83, 0x72, 0x83, 0x73, 0xC6, 0xA7, /* 0xF8-0xFB */ + 0x83, 0x74, 0x83, 0x75, 0x83, 0x76, 0x83, 0x77, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_51[512] = { + 0x83, 0x78, 0x83, 0x79, 0x83, 0x7A, 0x83, 0x7B, /* 0x00-0x03 */ + 0x83, 0x7C, 0x83, 0x7D, 0xD9, 0xD3, 0xD9, 0xD8, /* 0x04-0x07 */ + 0x83, 0x7E, 0x83, 0x80, 0x83, 0x81, 0xD9, 0xD9, /* 0x08-0x0B */ + 0x83, 0x82, 0x83, 0x83, 0x83, 0x84, 0x83, 0x85, /* 0x0C-0x0F */ + 0x83, 0x86, 0x83, 0x87, 0xC8, 0xE5, 0x83, 0x88, /* 0x10-0x13 */ + 0x83, 0x89, 0x83, 0x8A, 0x83, 0x8B, 0x83, 0x8C, /* 0x14-0x17 */ + 0x83, 0x8D, 0x83, 0x8E, 0x83, 0x8F, 0x83, 0x90, /* 0x18-0x1B */ + 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, 0x83, 0x94, /* 0x1C-0x1F */ + 0x83, 0x95, 0xC0, 0xDC, 0x83, 0x96, 0x83, 0x97, /* 0x20-0x23 */ + 0x83, 0x98, 0x83, 0x99, 0x83, 0x9A, 0x83, 0x9B, /* 0x24-0x27 */ + 0x83, 0x9C, 0x83, 0x9D, 0x83, 0x9E, 0x83, 0x9F, /* 0x28-0x2B */ + 0x83, 0xA0, 0x83, 0xA1, 0x83, 0xA2, 0x83, 0xA3, /* 0x2C-0x2F */ + 0x83, 0xA4, 0x83, 0xA5, 0x83, 0xA6, 0x83, 0xA7, /* 0x30-0x33 */ + 0x83, 0xA8, 0x83, 0xA9, 0x83, 0xAA, 0x83, 0xAB, /* 0x34-0x37 */ + 0x83, 0xAC, 0x83, 0xAD, 0x83, 0xAE, 0x83, 0xAF, /* 0x38-0x3B */ + 0x83, 0xB0, 0x83, 0xB1, 0x83, 0xB2, 0xB6, 0xF9, /* 0x3C-0x3F */ + 0xD8, 0xA3, 0xD4, 0xCA, 0x83, 0xB3, 0xD4, 0xAA, /* 0x40-0x43 */ + 0xD0, 0xD6, 0xB3, 0xE4, 0xD5, 0xD7, 0x83, 0xB4, /* 0x44-0x47 */ + 0xCF, 0xC8, 0xB9, 0xE2, 0x83, 0xB5, 0xBF, 0xCB, /* 0x48-0x4B */ + 0x83, 0xB6, 0xC3, 0xE2, 0x83, 0xB7, 0x83, 0xB8, /* 0x4C-0x4F */ + 0x83, 0xB9, 0xB6, 0xD2, 0x83, 0xBA, 0x83, 0xBB, /* 0x50-0x53 */ + 0xCD, 0xC3, 0xD9, 0xEE, 0xD9, 0xF0, 0x83, 0xBC, /* 0x54-0x57 */ + 0x83, 0xBD, 0x83, 0xBE, 0xB5, 0xB3, 0x83, 0xBF, /* 0x58-0x5B */ + 0xB6, 0xB5, 0x83, 0xC0, 0x83, 0xC1, 0x83, 0xC2, /* 0x5C-0x5F */ + 0x83, 0xC3, 0x83, 0xC4, 0xBE, 0xA4, 0x83, 0xC5, /* 0x60-0x63 */ + 0x83, 0xC6, 0xC8, 0xEB, 0x83, 0xC7, 0x83, 0xC8, /* 0x64-0x67 */ + 0xC8, 0xAB, 0x83, 0xC9, 0x83, 0xCA, 0xB0, 0xCB, /* 0x68-0x6B */ + 0xB9, 0xAB, 0xC1, 0xF9, 0xD9, 0xE2, 0x83, 0xCB, /* 0x6C-0x6F */ + 0xC0, 0xBC, 0xB9, 0xB2, 0x83, 
0xCC, 0xB9, 0xD8, /* 0x70-0x73 */ + 0xD0, 0xCB, 0xB1, 0xF8, 0xC6, 0xE4, 0xBE, 0xDF, /* 0x74-0x77 */ + 0xB5, 0xE4, 0xD7, 0xC8, 0x83, 0xCD, 0xD1, 0xF8, /* 0x78-0x7B */ + 0xBC, 0xE6, 0xCA, 0xDE, 0x83, 0xCE, 0x83, 0xCF, /* 0x7C-0x7F */ + + 0xBC, 0xBD, 0xD9, 0xE6, 0xD8, 0xE7, 0x83, 0xD0, /* 0x80-0x83 */ + 0x83, 0xD1, 0xC4, 0xDA, 0x83, 0xD2, 0x83, 0xD3, /* 0x84-0x87 */ + 0xB8, 0xD4, 0xC8, 0xBD, 0x83, 0xD4, 0x83, 0xD5, /* 0x88-0x8B */ + 0xB2, 0xE1, 0xD4, 0xD9, 0x83, 0xD6, 0x83, 0xD7, /* 0x8C-0x8F */ + 0x83, 0xD8, 0x83, 0xD9, 0xC3, 0xB0, 0x83, 0xDA, /* 0x90-0x93 */ + 0x83, 0xDB, 0xC3, 0xE1, 0xDA, 0xA2, 0xC8, 0xDF, /* 0x94-0x97 */ + 0x83, 0xDC, 0xD0, 0xB4, 0x83, 0xDD, 0xBE, 0xFC, /* 0x98-0x9B */ + 0xC5, 0xA9, 0x83, 0xDE, 0x83, 0xDF, 0x83, 0xE0, /* 0x9C-0x9F */ + 0xB9, 0xDA, 0x83, 0xE1, 0xDA, 0xA3, 0x83, 0xE2, /* 0xA0-0xA3 */ + 0xD4, 0xA9, 0xDA, 0xA4, 0x83, 0xE3, 0x83, 0xE4, /* 0xA4-0xA7 */ + 0x83, 0xE5, 0x83, 0xE6, 0x83, 0xE7, 0xD9, 0xFB, /* 0xA8-0xAB */ + 0xB6, 0xAC, 0x83, 0xE8, 0x83, 0xE9, 0xB7, 0xEB, /* 0xAC-0xAF */ + 0xB1, 0xF9, 0xD9, 0xFC, 0xB3, 0xE5, 0xBE, 0xF6, /* 0xB0-0xB3 */ + 0x83, 0xEA, 0xBF, 0xF6, 0xD2, 0xB1, 0xC0, 0xE4, /* 0xB4-0xB7 */ + 0x83, 0xEB, 0x83, 0xEC, 0x83, 0xED, 0xB6, 0xB3, /* 0xB8-0xBB */ + 0xD9, 0xFE, 0xD9, 0xFD, 0x83, 0xEE, 0x83, 0xEF, /* 0xBC-0xBF */ + 0xBE, 0xBB, 0x83, 0xF0, 0x83, 0xF1, 0x83, 0xF2, /* 0xC0-0xC3 */ + 0xC6, 0xE0, 0x83, 0xF3, 0xD7, 0xBC, 0xDA, 0xA1, /* 0xC4-0xC7 */ + 0x83, 0xF4, 0xC1, 0xB9, 0x83, 0xF5, 0xB5, 0xF2, /* 0xC8-0xCB */ + 0xC1, 0xE8, 0x83, 0xF6, 0x83, 0xF7, 0xBC, 0xF5, /* 0xCC-0xCF */ + 0x83, 0xF8, 0xB4, 0xD5, 0x83, 0xF9, 0x83, 0xFA, /* 0xD0-0xD3 */ + 0x83, 0xFB, 0x83, 0xFC, 0x83, 0xFD, 0x83, 0xFE, /* 0xD4-0xD7 */ + 0x84, 0x40, 0x84, 0x41, 0x84, 0x42, 0xC1, 0xDD, /* 0xD8-0xDB */ + 0x84, 0x43, 0xC4, 0xFD, 0x84, 0x44, 0x84, 0x45, /* 0xDC-0xDF */ + 0xBC, 0xB8, 0xB7, 0xB2, 0x84, 0x46, 0x84, 0x47, /* 0xE0-0xE3 */ + 0xB7, 0xEF, 0x84, 0x48, 0x84, 0x49, 0x84, 0x4A, /* 0xE4-0xE7 */ + 0x84, 0x4B, 0x84, 0x4C, 0x84, 0x4D, 0xD9, 0xEC, /* 0xE8-0xEB */ + 0x84, 0x4E, 0xC6, 0xBE, 0x84, 0x4F, 0xBF, 0xAD, /* 0xEC-0xEF */ + 0xBB, 0xCB, 0x84, 0x50, 0x84, 0x51, 0xB5, 0xCA, /* 0xF0-0xF3 */ + 0x84, 0x52, 0xDB, 0xC9, 0xD0, 0xD7, 0x84, 0x53, /* 0xF4-0xF7 */ + 0xCD, 0xB9, 0xB0, 0xBC, 0xB3, 0xF6, 0xBB, 0xF7, /* 0xF8-0xFB */ + 0xDB, 0xCA, 0xBA, 0xAF, 0x84, 0x54, 0xD4, 0xE4, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0xB5, 0xB6, 0xB5, 0xF3, 0xD8, 0xD6, 0xC8, 0xD0, /* 0x00-0x03 */ + 0x84, 0x55, 0x84, 0x56, 0xB7, 0xD6, 0xC7, 0xD0, /* 0x04-0x07 */ + 0xD8, 0xD7, 0x84, 0x57, 0xBF, 0xAF, 0x84, 0x58, /* 0x08-0x0B */ + 0x84, 0x59, 0xDB, 0xBB, 0xD8, 0xD8, 0x84, 0x5A, /* 0x0C-0x0F */ + 0x84, 0x5B, 0xD0, 0xCC, 0xBB, 0xAE, 0x84, 0x5C, /* 0x10-0x13 */ + 0x84, 0x5D, 0x84, 0x5E, 0xEB, 0xBE, 0xC1, 0xD0, /* 0x14-0x17 */ + 0xC1, 0xF5, 0xD4, 0xF2, 0xB8, 0xD5, 0xB4, 0xB4, /* 0x18-0x1B */ + 0x84, 0x5F, 0xB3, 0xF5, 0x84, 0x60, 0x84, 0x61, /* 0x1C-0x1F */ + 0xC9, 0xBE, 0x84, 0x62, 0x84, 0x63, 0x84, 0x64, /* 0x20-0x23 */ + 0xC5, 0xD0, 0x84, 0x65, 0x84, 0x66, 0x84, 0x67, /* 0x24-0x27 */ + 0xC5, 0xD9, 0xC0, 0xFB, 0x84, 0x68, 0xB1, 0xF0, /* 0x28-0x2B */ + 0x84, 0x69, 0xD8, 0xD9, 0xB9, 0xCE, 0x84, 0x6A, /* 0x2C-0x2F */ + 0xB5, 0xBD, 0x84, 0x6B, 0x84, 0x6C, 0xD8, 0xDA, /* 0x30-0x33 */ + 0x84, 0x6D, 0x84, 0x6E, 0xD6, 0xC6, 0xCB, 0xA2, /* 0x34-0x37 */ + 0xC8, 0xAF, 0xC9, 0xB2, 0xB4, 0xCC, 0xBF, 0xCC, /* 0x38-0x3B */ + 0x84, 0x6F, 0xB9, 0xF4, 0x84, 0x70, 0xD8, 0xDB, /* 0x3C-0x3F */ + 0xD8, 0xDC, 0xB6, 0xE7, 0xBC, 0xC1, 0xCC, 0xEA, /* 0x40-0x43 */ + 0x84, 0x71, 0x84, 0x72, 0x84, 
0x73, 0x84, 0x74, /* 0x44-0x47 */ + 0x84, 0x75, 0x84, 0x76, 0xCF, 0xF7, 0x84, 0x77, /* 0x48-0x4B */ + 0xD8, 0xDD, 0xC7, 0xB0, 0x84, 0x78, 0x84, 0x79, /* 0x4C-0x4F */ + 0xB9, 0xD0, 0xBD, 0xA3, 0x84, 0x7A, 0x84, 0x7B, /* 0x50-0x53 */ + 0xCC, 0xDE, 0x84, 0x7C, 0xC6, 0xCA, 0x84, 0x7D, /* 0x54-0x57 */ + 0x84, 0x7E, 0x84, 0x80, 0x84, 0x81, 0x84, 0x82, /* 0x58-0x5B */ + 0xD8, 0xE0, 0x84, 0x83, 0xD8, 0xDE, 0x84, 0x84, /* 0x5C-0x5F */ + 0x84, 0x85, 0xD8, 0xDF, 0x84, 0x86, 0x84, 0x87, /* 0x60-0x63 */ + 0x84, 0x88, 0xB0, 0xFE, 0x84, 0x89, 0xBE, 0xE7, /* 0x64-0x67 */ + 0x84, 0x8A, 0xCA, 0xA3, 0xBC, 0xF4, 0x84, 0x8B, /* 0x68-0x6B */ + 0x84, 0x8C, 0x84, 0x8D, 0x84, 0x8E, 0xB8, 0xB1, /* 0x6C-0x6F */ + 0x84, 0x8F, 0x84, 0x90, 0xB8, 0xEE, 0x84, 0x91, /* 0x70-0x73 */ + 0x84, 0x92, 0x84, 0x93, 0x84, 0x94, 0x84, 0x95, /* 0x74-0x77 */ + 0x84, 0x96, 0x84, 0x97, 0x84, 0x98, 0x84, 0x99, /* 0x78-0x7B */ + 0x84, 0x9A, 0xD8, 0xE2, 0x84, 0x9B, 0xBD, 0xCB, /* 0x7C-0x7F */ + + 0x84, 0x9C, 0xD8, 0xE4, 0xD8, 0xE3, 0x84, 0x9D, /* 0x80-0x83 */ + 0x84, 0x9E, 0x84, 0x9F, 0x84, 0xA0, 0x84, 0xA1, /* 0x84-0x87 */ + 0xC5, 0xFC, 0x84, 0xA2, 0x84, 0xA3, 0x84, 0xA4, /* 0x88-0x8B */ + 0x84, 0xA5, 0x84, 0xA6, 0x84, 0xA7, 0x84, 0xA8, /* 0x8C-0x8F */ + 0xD8, 0xE5, 0x84, 0xA9, 0x84, 0xAA, 0xD8, 0xE6, /* 0x90-0x93 */ + 0x84, 0xAB, 0x84, 0xAC, 0x84, 0xAD, 0x84, 0xAE, /* 0x94-0x97 */ + 0x84, 0xAF, 0x84, 0xB0, 0x84, 0xB1, 0xC1, 0xA6, /* 0x98-0x9B */ + 0x84, 0xB2, 0xC8, 0xB0, 0xB0, 0xEC, 0xB9, 0xA6, /* 0x9C-0x9F */ + 0xBC, 0xD3, 0xCE, 0xF1, 0xDB, 0xBD, 0xC1, 0xD3, /* 0xA0-0xA3 */ + 0x84, 0xB3, 0x84, 0xB4, 0x84, 0xB5, 0x84, 0xB6, /* 0xA4-0xA7 */ + 0xB6, 0xAF, 0xD6, 0xFA, 0xC5, 0xAC, 0xBD, 0xD9, /* 0xA8-0xAB */ + 0xDB, 0xBE, 0xDB, 0xBF, 0x84, 0xB7, 0x84, 0xB8, /* 0xAC-0xAF */ + 0x84, 0xB9, 0xC0, 0xF8, 0xBE, 0xA2, 0xC0, 0xCD, /* 0xB0-0xB3 */ + 0x84, 0xBA, 0x84, 0xBB, 0x84, 0xBC, 0x84, 0xBD, /* 0xB4-0xB7 */ + 0x84, 0xBE, 0x84, 0xBF, 0x84, 0xC0, 0x84, 0xC1, /* 0xB8-0xBB */ + 0x84, 0xC2, 0x84, 0xC3, 0xDB, 0xC0, 0xCA, 0xC6, /* 0xBC-0xBF */ + 0x84, 0xC4, 0x84, 0xC5, 0x84, 0xC6, 0xB2, 0xAA, /* 0xC0-0xC3 */ + 0x84, 0xC7, 0x84, 0xC8, 0x84, 0xC9, 0xD3, 0xC2, /* 0xC4-0xC7 */ + 0x84, 0xCA, 0xC3, 0xE3, 0x84, 0xCB, 0xD1, 0xAB, /* 0xC8-0xCB */ + 0x84, 0xCC, 0x84, 0xCD, 0x84, 0xCE, 0x84, 0xCF, /* 0xCC-0xCF */ + 0xDB, 0xC2, 0x84, 0xD0, 0xC0, 0xD5, 0x84, 0xD1, /* 0xD0-0xD3 */ + 0x84, 0xD2, 0x84, 0xD3, 0xDB, 0xC3, 0x84, 0xD4, /* 0xD4-0xD7 */ + 0xBF, 0xB1, 0x84, 0xD5, 0x84, 0xD6, 0x84, 0xD7, /* 0xD8-0xDB */ + 0x84, 0xD8, 0x84, 0xD9, 0x84, 0xDA, 0xC4, 0xBC, /* 0xDC-0xDF */ + 0x84, 0xDB, 0x84, 0xDC, 0x84, 0xDD, 0x84, 0xDE, /* 0xE0-0xE3 */ + 0xC7, 0xDA, 0x84, 0xDF, 0x84, 0xE0, 0x84, 0xE1, /* 0xE4-0xE7 */ + 0x84, 0xE2, 0x84, 0xE3, 0x84, 0xE4, 0x84, 0xE5, /* 0xE8-0xEB */ + 0x84, 0xE6, 0x84, 0xE7, 0x84, 0xE8, 0x84, 0xE9, /* 0xEC-0xEF */ + 0xDB, 0xC4, 0x84, 0xEA, 0x84, 0xEB, 0x84, 0xEC, /* 0xF0-0xF3 */ + 0x84, 0xED, 0x84, 0xEE, 0x84, 0xEF, 0x84, 0xF0, /* 0xF4-0xF7 */ + 0x84, 0xF1, 0xD9, 0xE8, 0xC9, 0xD7, 0x84, 0xF2, /* 0xF8-0xFB */ + 0x84, 0xF3, 0x84, 0xF4, 0xB9, 0xB4, 0xCE, 0xF0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0xD4, 0xC8, 0x84, 0xF5, 0x84, 0xF6, 0x84, 0xF7, /* 0x00-0x03 */ + 0x84, 0xF8, 0xB0, 0xFC, 0xB4, 0xD2, 0x84, 0xF9, /* 0x04-0x07 */ + 0xD0, 0xD9, 0x84, 0xFA, 0x84, 0xFB, 0x84, 0xFC, /* 0x08-0x0B */ + 0x84, 0xFD, 0xD9, 0xE9, 0x84, 0xFE, 0xDE, 0xCB, /* 0x0C-0x0F */ + 0xD9, 0xEB, 0x85, 0x40, 0x85, 0x41, 0x85, 0x42, /* 0x10-0x13 */ + 0x85, 0x43, 0xD8, 0xB0, 0xBB, 0xAF, 0xB1, 0xB1, /* 0x14-0x17 */ + 0x85, 0x44, 0xB3, 0xD7, 0xD8, 
0xCE, 0x85, 0x45, /* 0x18-0x1B */ + 0x85, 0x46, 0xD4, 0xD1, 0x85, 0x47, 0x85, 0x48, /* 0x1C-0x1F */ + 0xBD, 0xB3, 0xBF, 0xEF, 0x85, 0x49, 0xCF, 0xBB, /* 0x20-0x23 */ + 0x85, 0x4A, 0x85, 0x4B, 0xD8, 0xD0, 0x85, 0x4C, /* 0x24-0x27 */ + 0x85, 0x4D, 0x85, 0x4E, 0xB7, 0xCB, 0x85, 0x4F, /* 0x28-0x2B */ + 0x85, 0x50, 0x85, 0x51, 0xD8, 0xD1, 0x85, 0x52, /* 0x2C-0x2F */ + 0x85, 0x53, 0x85, 0x54, 0x85, 0x55, 0x85, 0x56, /* 0x30-0x33 */ + 0x85, 0x57, 0x85, 0x58, 0x85, 0x59, 0x85, 0x5A, /* 0x34-0x37 */ + 0x85, 0x5B, 0xC6, 0xA5, 0xC7, 0xF8, 0xD2, 0xBD, /* 0x38-0x3B */ + 0x85, 0x5C, 0x85, 0x5D, 0xD8, 0xD2, 0xC4, 0xE4, /* 0x3C-0x3F */ + 0x85, 0x5E, 0xCA, 0xAE, 0x85, 0x5F, 0xC7, 0xA7, /* 0x40-0x43 */ + 0x85, 0x60, 0xD8, 0xA6, 0x85, 0x61, 0xC9, 0xFD, /* 0x44-0x47 */ + 0xCE, 0xE7, 0xBB, 0xDC, 0xB0, 0xEB, 0x85, 0x62, /* 0x48-0x4B */ + 0x85, 0x63, 0x85, 0x64, 0xBB, 0xAA, 0xD0, 0xAD, /* 0x4C-0x4F */ + 0x85, 0x65, 0xB1, 0xB0, 0xD7, 0xE4, 0xD7, 0xBF, /* 0x50-0x53 */ + 0x85, 0x66, 0xB5, 0xA5, 0xC2, 0xF4, 0xC4, 0xCF, /* 0x54-0x57 */ + 0x85, 0x67, 0x85, 0x68, 0xB2, 0xA9, 0x85, 0x69, /* 0x58-0x5B */ + 0xB2, 0xB7, 0x85, 0x6A, 0xB1, 0xE5, 0xDF, 0xB2, /* 0x5C-0x5F */ + 0xD5, 0xBC, 0xBF, 0xA8, 0xC2, 0xAC, 0xD8, 0xD5, /* 0x60-0x63 */ + 0xC2, 0xB1, 0x85, 0x6B, 0xD8, 0xD4, 0xCE, 0xD4, /* 0x64-0x67 */ + 0x85, 0x6C, 0xDA, 0xE0, 0x85, 0x6D, 0xCE, 0xC0, /* 0x68-0x6B */ + 0x85, 0x6E, 0x85, 0x6F, 0xD8, 0xB4, 0xC3, 0xAE, /* 0x6C-0x6F */ + 0xD3, 0xA1, 0xCE, 0xA3, 0x85, 0x70, 0xBC, 0xB4, /* 0x70-0x73 */ + 0xC8, 0xB4, 0xC2, 0xD1, 0x85, 0x71, 0xBE, 0xED, /* 0x74-0x77 */ + 0xD0, 0xB6, 0x85, 0x72, 0xDA, 0xE1, 0x85, 0x73, /* 0x78-0x7B */ + 0x85, 0x74, 0x85, 0x75, 0x85, 0x76, 0xC7, 0xE4, /* 0x7C-0x7F */ + + 0x85, 0x77, 0x85, 0x78, 0xB3, 0xA7, 0x85, 0x79, /* 0x80-0x83 */ + 0xB6, 0xF2, 0xCC, 0xFC, 0xC0, 0xFA, 0x85, 0x7A, /* 0x84-0x87 */ + 0x85, 0x7B, 0xC0, 0xF7, 0x85, 0x7C, 0xD1, 0xB9, /* 0x88-0x8B */ + 0xD1, 0xE1, 0xD8, 0xC7, 0x85, 0x7D, 0x85, 0x7E, /* 0x8C-0x8F */ + 0x85, 0x80, 0x85, 0x81, 0x85, 0x82, 0x85, 0x83, /* 0x90-0x93 */ + 0x85, 0x84, 0xB2, 0xDE, 0x85, 0x85, 0x85, 0x86, /* 0x94-0x97 */ + 0xC0, 0xE5, 0x85, 0x87, 0xBA, 0xF1, 0x85, 0x88, /* 0x98-0x9B */ + 0x85, 0x89, 0xD8, 0xC8, 0x85, 0x8A, 0xD4, 0xAD, /* 0x9C-0x9F */ + 0x85, 0x8B, 0x85, 0x8C, 0xCF, 0xE1, 0xD8, 0xC9, /* 0xA0-0xA3 */ + 0x85, 0x8D, 0xD8, 0xCA, 0xCF, 0xC3, 0x85, 0x8E, /* 0xA4-0xA7 */ + 0xB3, 0xF8, 0xBE, 0xC7, 0x85, 0x8F, 0x85, 0x90, /* 0xA8-0xAB */ + 0x85, 0x91, 0x85, 0x92, 0xD8, 0xCB, 0x85, 0x93, /* 0xAC-0xAF */ + 0x85, 0x94, 0x85, 0x95, 0x85, 0x96, 0x85, 0x97, /* 0xB0-0xB3 */ + 0x85, 0x98, 0x85, 0x99, 0xDB, 0xCC, 0x85, 0x9A, /* 0xB4-0xB7 */ + 0x85, 0x9B, 0x85, 0x9C, 0x85, 0x9D, 0xC8, 0xA5, /* 0xB8-0xBB */ + 0x85, 0x9E, 0x85, 0x9F, 0x85, 0xA0, 0xCF, 0xD8, /* 0xBC-0xBF */ + 0x85, 0xA1, 0xC8, 0xFE, 0xB2, 0xCE, 0x85, 0xA2, /* 0xC0-0xC3 */ + 0x85, 0xA3, 0x85, 0xA4, 0x85, 0xA5, 0x85, 0xA6, /* 0xC4-0xC7 */ + 0xD3, 0xD6, 0xB2, 0xE6, 0xBC, 0xB0, 0xD3, 0xD1, /* 0xC8-0xCB */ + 0xCB, 0xAB, 0xB7, 0xB4, 0x85, 0xA7, 0x85, 0xA8, /* 0xCC-0xCF */ + 0x85, 0xA9, 0xB7, 0xA2, 0x85, 0xAA, 0x85, 0xAB, /* 0xD0-0xD3 */ + 0xCA, 0xE5, 0x85, 0xAC, 0xC8, 0xA1, 0xCA, 0xDC, /* 0xD4-0xD7 */ + 0xB1, 0xE4, 0xD0, 0xF0, 0x85, 0xAD, 0xC5, 0xD1, /* 0xD8-0xDB */ + 0x85, 0xAE, 0x85, 0xAF, 0x85, 0xB0, 0xDB, 0xC5, /* 0xDC-0xDF */ + 0xB5, 0xFE, 0x85, 0xB1, 0x85, 0xB2, 0xBF, 0xDA, /* 0xE0-0xE3 */ + 0xB9, 0xC5, 0xBE, 0xE4, 0xC1, 0xED, 0x85, 0xB3, /* 0xE4-0xE7 */ + 0xDF, 0xB6, 0xDF, 0xB5, 0xD6, 0xBB, 0xBD, 0xD0, /* 0xE8-0xEB */ + 0xD5, 0xD9, 0xB0, 0xC8, 0xB6, 0xA3, 0xBF, 0xC9, /* 0xEC-0xEF */ + 0xCC, 0xA8, 0xDF, 
0xB3, 0xCA, 0xB7, 0xD3, 0xD2, /* 0xF0-0xF3 */ + 0x85, 0xB4, 0xD8, 0xCF, 0xD2, 0xB6, 0xBA, 0xC5, /* 0xF4-0xF7 */ + 0xCB, 0xBE, 0xCC, 0xBE, 0x85, 0xB5, 0xDF, 0xB7, /* 0xF8-0xFB */ + 0xB5, 0xF0, 0xDF, 0xB4, 0x85, 0xB6, 0x85, 0xB7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_54[512] = { + 0x85, 0xB8, 0xD3, 0xF5, 0x85, 0xB9, 0xB3, 0xD4, /* 0x00-0x03 */ + 0xB8, 0xF7, 0x85, 0xBA, 0xDF, 0xBA, 0x85, 0xBB, /* 0x04-0x07 */ + 0xBA, 0xCF, 0xBC, 0xAA, 0xB5, 0xF5, 0x85, 0xBC, /* 0x08-0x0B */ + 0xCD, 0xAC, 0xC3, 0xFB, 0xBA, 0xF3, 0xC0, 0xF4, /* 0x0C-0x0F */ + 0xCD, 0xC2, 0xCF, 0xF2, 0xDF, 0xB8, 0xCF, 0xC5, /* 0x10-0x13 */ + 0x85, 0xBD, 0xC2, 0xC0, 0xDF, 0xB9, 0xC2, 0xF0, /* 0x14-0x17 */ + 0x85, 0xBE, 0x85, 0xBF, 0x85, 0xC0, 0xBE, 0xFD, /* 0x18-0x1B */ + 0x85, 0xC1, 0xC1, 0xDF, 0xCD, 0xCC, 0xD2, 0xF7, /* 0x1C-0x1F */ + 0xB7, 0xCD, 0xDF, 0xC1, 0x85, 0xC2, 0xDF, 0xC4, /* 0x20-0x23 */ + 0x85, 0xC3, 0x85, 0xC4, 0xB7, 0xF1, 0xB0, 0xC9, /* 0x24-0x27 */ + 0xB6, 0xD6, 0xB7, 0xD4, 0x85, 0xC5, 0xBA, 0xAC, /* 0x28-0x2B */ + 0xCC, 0xFD, 0xBF, 0xD4, 0xCB, 0xB1, 0xC6, 0xF4, /* 0x2C-0x2F */ + 0x85, 0xC6, 0xD6, 0xA8, 0xDF, 0xC5, 0x85, 0xC7, /* 0x30-0x33 */ + 0xCE, 0xE2, 0xB3, 0xB3, 0x85, 0xC8, 0x85, 0xC9, /* 0x34-0x37 */ + 0xCE, 0xFC, 0xB4, 0xB5, 0x85, 0xCA, 0xCE, 0xC7, /* 0x38-0x3B */ + 0xBA, 0xF0, 0x85, 0xCB, 0xCE, 0xE1, 0x85, 0xCC, /* 0x3C-0x3F */ + 0xD1, 0xBD, 0x85, 0xCD, 0x85, 0xCE, 0xDF, 0xC0, /* 0x40-0x43 */ + 0x85, 0xCF, 0x85, 0xD0, 0xB4, 0xF4, 0x85, 0xD1, /* 0x44-0x47 */ + 0xB3, 0xCA, 0x85, 0xD2, 0xB8, 0xE6, 0xDF, 0xBB, /* 0x48-0x4B */ + 0x85, 0xD3, 0x85, 0xD4, 0x85, 0xD5, 0x85, 0xD6, /* 0x4C-0x4F */ + 0xC4, 0xC5, 0x85, 0xD7, 0xDF, 0xBC, 0xDF, 0xBD, /* 0x50-0x53 */ + 0xDF, 0xBE, 0xC5, 0xBB, 0xDF, 0xBF, 0xDF, 0xC2, /* 0x54-0x57 */ + 0xD4, 0xB1, 0xDF, 0xC3, 0x85, 0xD8, 0xC7, 0xBA, /* 0x58-0x5B */ + 0xCE, 0xD8, 0x85, 0xD9, 0x85, 0xDA, 0x85, 0xDB, /* 0x5C-0x5F */ + 0x85, 0xDC, 0x85, 0xDD, 0xC4, 0xD8, 0x85, 0xDE, /* 0x60-0x63 */ + 0xDF, 0xCA, 0x85, 0xDF, 0xDF, 0xCF, 0x85, 0xE0, /* 0x64-0x67 */ + 0xD6, 0xDC, 0x85, 0xE1, 0x85, 0xE2, 0x85, 0xE3, /* 0x68-0x6B */ + 0x85, 0xE4, 0x85, 0xE5, 0x85, 0xE6, 0x85, 0xE7, /* 0x6C-0x6F */ + 0x85, 0xE8, 0xDF, 0xC9, 0xDF, 0xDA, 0xCE, 0xB6, /* 0x70-0x73 */ + 0x85, 0xE9, 0xBA, 0xC7, 0xDF, 0xCE, 0xDF, 0xC8, /* 0x74-0x77 */ + 0xC5, 0xDE, 0x85, 0xEA, 0x85, 0xEB, 0xC9, 0xEB, /* 0x78-0x7B */ + 0xBA, 0xF4, 0xC3, 0xFC, 0x85, 0xEC, 0x85, 0xED, /* 0x7C-0x7F */ + + 0xBE, 0xD7, 0x85, 0xEE, 0xDF, 0xC6, 0x85, 0xEF, /* 0x80-0x83 */ + 0xDF, 0xCD, 0x85, 0xF0, 0xC5, 0xD8, 0x85, 0xF1, /* 0x84-0x87 */ + 0x85, 0xF2, 0x85, 0xF3, 0x85, 0xF4, 0xD5, 0xA6, /* 0x88-0x8B */ + 0xBA, 0xCD, 0x85, 0xF5, 0xBE, 0xCC, 0xD3, 0xBD, /* 0x8C-0x8F */ + 0xB8, 0xC0, 0x85, 0xF6, 0xD6, 0xE4, 0x85, 0xF7, /* 0x90-0x93 */ + 0xDF, 0xC7, 0xB9, 0xBE, 0xBF, 0xA7, 0x85, 0xF8, /* 0x94-0x97 */ + 0x85, 0xF9, 0xC1, 0xFC, 0xDF, 0xCB, 0xDF, 0xCC, /* 0x98-0x9B */ + 0x85, 0xFA, 0xDF, 0xD0, 0x85, 0xFB, 0x85, 0xFC, /* 0x9C-0x9F */ + 0x85, 0xFD, 0x85, 0xFE, 0x86, 0x40, 0xDF, 0xDB, /* 0xA0-0xA3 */ + 0xDF, 0xE5, 0x86, 0x41, 0xDF, 0xD7, 0xDF, 0xD6, /* 0xA4-0xA7 */ + 0xD7, 0xC9, 0xDF, 0xE3, 0xDF, 0xE4, 0xE5, 0xEB, /* 0xA8-0xAB */ + 0xD2, 0xA7, 0xDF, 0xD2, 0x86, 0x42, 0xBF, 0xA9, /* 0xAC-0xAF */ + 0x86, 0x43, 0xD4, 0xDB, 0x86, 0x44, 0xBF, 0xC8, /* 0xB0-0xB3 */ + 0xDF, 0xD4, 0x86, 0x45, 0x86, 0x46, 0x86, 0x47, /* 0xB4-0xB7 */ + 0xCF, 0xCC, 0x86, 0x48, 0x86, 0x49, 0xDF, 0xDD, /* 0xB8-0xBB */ + 0x86, 0x4A, 0xD1, 0xCA, 0x86, 0x4B, 0xDF, 0xDE, /* 0xBC-0xBF */ + 0xB0, 0xA7, 0xC6, 0xB7, 0xDF, 0xD3, 0x86, 0x4C, /* 0xC0-0xC3 */ + 0xBA, 0xE5, 0x86, 
0x4D, 0xB6, 0xDF, 0xCD, 0xDB, /* 0xC4-0xC7 */ + 0xB9, 0xFE, 0xD4, 0xD5, 0x86, 0x4E, 0x86, 0x4F, /* 0xC8-0xCB */ + 0xDF, 0xDF, 0xCF, 0xEC, 0xB0, 0xA5, 0xDF, 0xE7, /* 0xCC-0xCF */ + 0xDF, 0xD1, 0xD1, 0xC6, 0xDF, 0xD5, 0xDF, 0xD8, /* 0xD0-0xD3 */ + 0xDF, 0xD9, 0xDF, 0xDC, 0x86, 0x50, 0xBB, 0xA9, /* 0xD4-0xD7 */ + 0x86, 0x51, 0xDF, 0xE0, 0xDF, 0xE1, 0x86, 0x52, /* 0xD8-0xDB */ + 0xDF, 0xE2, 0xDF, 0xE6, 0xDF, 0xE8, 0xD3, 0xB4, /* 0xDC-0xDF */ + 0x86, 0x53, 0x86, 0x54, 0x86, 0x55, 0x86, 0x56, /* 0xE0-0xE3 */ + 0x86, 0x57, 0xB8, 0xE7, 0xC5, 0xB6, 0xDF, 0xEA, /* 0xE4-0xE7 */ + 0xC9, 0xDA, 0xC1, 0xA8, 0xC4, 0xC4, 0x86, 0x58, /* 0xE8-0xEB */ + 0x86, 0x59, 0xBF, 0xDE, 0xCF, 0xF8, 0x86, 0x5A, /* 0xEC-0xEF */ + 0x86, 0x5B, 0x86, 0x5C, 0xD5, 0xDC, 0xDF, 0xEE, /* 0xF0-0xF3 */ + 0x86, 0x5D, 0x86, 0x5E, 0x86, 0x5F, 0x86, 0x60, /* 0xF4-0xF7 */ + 0x86, 0x61, 0x86, 0x62, 0xB2, 0xB8, 0x86, 0x63, /* 0xF8-0xFB */ + 0xBA, 0xDF, 0xDF, 0xEC, 0x86, 0x64, 0xDB, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_55[512] = { + 0x86, 0x65, 0xD1, 0xE4, 0x86, 0x66, 0x86, 0x67, /* 0x00-0x03 */ + 0x86, 0x68, 0x86, 0x69, 0xCB, 0xF4, 0xB4, 0xBD, /* 0x04-0x07 */ + 0x86, 0x6A, 0xB0, 0xA6, 0x86, 0x6B, 0x86, 0x6C, /* 0x08-0x0B */ + 0x86, 0x6D, 0x86, 0x6E, 0x86, 0x6F, 0xDF, 0xF1, /* 0x0C-0x0F */ + 0xCC, 0xC6, 0xDF, 0xF2, 0x86, 0x70, 0x86, 0x71, /* 0x10-0x13 */ + 0xDF, 0xED, 0x86, 0x72, 0x86, 0x73, 0x86, 0x74, /* 0x14-0x17 */ + 0x86, 0x75, 0x86, 0x76, 0x86, 0x77, 0xDF, 0xE9, /* 0x18-0x1B */ + 0x86, 0x78, 0x86, 0x79, 0x86, 0x7A, 0x86, 0x7B, /* 0x1C-0x1F */ + 0xDF, 0xEB, 0x86, 0x7C, 0xDF, 0xEF, 0xDF, 0xF0, /* 0x20-0x23 */ + 0xBB, 0xBD, 0x86, 0x7D, 0x86, 0x7E, 0xDF, 0xF3, /* 0x24-0x27 */ + 0x86, 0x80, 0x86, 0x81, 0xDF, 0xF4, 0x86, 0x82, /* 0x28-0x2B */ + 0xBB, 0xA3, 0x86, 0x83, 0xCA, 0xDB, 0xCE, 0xA8, /* 0x2C-0x2F */ + 0xE0, 0xA7, 0xB3, 0xAA, 0x86, 0x84, 0xE0, 0xA6, /* 0x30-0x33 */ + 0x86, 0x85, 0x86, 0x86, 0x86, 0x87, 0xE0, 0xA1, /* 0x34-0x37 */ + 0x86, 0x88, 0x86, 0x89, 0x86, 0x8A, 0x86, 0x8B, /* 0x38-0x3B */ + 0xDF, 0xFE, 0x86, 0x8C, 0xCD, 0xD9, 0xDF, 0xFC, /* 0x3C-0x3F */ + 0x86, 0x8D, 0xDF, 0xFA, 0x86, 0x8E, 0xBF, 0xD0, /* 0x40-0x43 */ + 0xD7, 0xC4, 0x86, 0x8F, 0xC9, 0xCC, 0x86, 0x90, /* 0x44-0x47 */ + 0x86, 0x91, 0xDF, 0xF8, 0xB0, 0xA1, 0x86, 0x92, /* 0x48-0x4B */ + 0x86, 0x93, 0x86, 0x94, 0x86, 0x95, 0x86, 0x96, /* 0x4C-0x4F */ + 0xDF, 0xFD, 0x86, 0x97, 0x86, 0x98, 0x86, 0x99, /* 0x50-0x53 */ + 0x86, 0x9A, 0xDF, 0xFB, 0xE0, 0xA2, 0x86, 0x9B, /* 0x54-0x57 */ + 0x86, 0x9C, 0x86, 0x9D, 0x86, 0x9E, 0x86, 0x9F, /* 0x58-0x5B */ + 0xE0, 0xA8, 0x86, 0xA0, 0x86, 0xA1, 0x86, 0xA2, /* 0x5C-0x5F */ + 0x86, 0xA3, 0xB7, 0xC8, 0x86, 0xA4, 0x86, 0xA5, /* 0x60-0x63 */ + 0xC6, 0xA1, 0xC9, 0xB6, 0xC0, 0xB2, 0xDF, 0xF5, /* 0x64-0x67 */ + 0x86, 0xA6, 0x86, 0xA7, 0xC5, 0xBE, 0x86, 0xA8, /* 0x68-0x6B */ + 0xD8, 0xC4, 0xDF, 0xF9, 0xC4, 0xF6, 0x86, 0xA9, /* 0x6C-0x6F */ + 0x86, 0xAA, 0x86, 0xAB, 0x86, 0xAC, 0x86, 0xAD, /* 0x70-0x73 */ + 0x86, 0xAE, 0xE0, 0xA3, 0xE0, 0xA4, 0xE0, 0xA5, /* 0x74-0x77 */ + 0xD0, 0xA5, 0x86, 0xAF, 0x86, 0xB0, 0xE0, 0xB4, /* 0x78-0x7B */ + 0xCC, 0xE4, 0x86, 0xB1, 0xE0, 0xB1, 0x86, 0xB2, /* 0x7C-0x7F */ + + 0xBF, 0xA6, 0xE0, 0xAF, 0xCE, 0xB9, 0xE0, 0xAB, /* 0x80-0x83 */ + 0xC9, 0xC6, 0x86, 0xB3, 0x86, 0xB4, 0xC0, 0xAE, /* 0x84-0x87 */ + 0xE0, 0xAE, 0xBA, 0xED, 0xBA, 0xB0, 0xE0, 0xA9, /* 0x88-0x8B */ + 0x86, 0xB5, 0x86, 0xB6, 0x86, 0xB7, 0xDF, 0xF6, /* 0x8C-0x8F */ + 0x86, 0xB8, 0xE0, 0xB3, 0x86, 0xB9, 0x86, 0xBA, /* 0x90-0x93 */ + 0xE0, 0xB8, 0x86, 0xBB, 0x86, 0xBC, 0x86, 0xBD, /* 0x94-0x97 */ + 0xB4, 0xAD, 0xE0, 
0xB9, 0x86, 0xBE, 0x86, 0xBF, /* 0x98-0x9B */ + 0xCF, 0xB2, 0xBA, 0xC8, 0x86, 0xC0, 0xE0, 0xB0, /* 0x9C-0x9F */ + 0x86, 0xC1, 0x86, 0xC2, 0x86, 0xC3, 0x86, 0xC4, /* 0xA0-0xA3 */ + 0x86, 0xC5, 0x86, 0xC6, 0x86, 0xC7, 0xD0, 0xFA, /* 0xA4-0xA7 */ + 0x86, 0xC8, 0x86, 0xC9, 0x86, 0xCA, 0x86, 0xCB, /* 0xA8-0xAB */ + 0x86, 0xCC, 0x86, 0xCD, 0x86, 0xCE, 0x86, 0xCF, /* 0xAC-0xAF */ + 0x86, 0xD0, 0xE0, 0xAC, 0x86, 0xD1, 0xD4, 0xFB, /* 0xB0-0xB3 */ + 0x86, 0xD2, 0xDF, 0xF7, 0x86, 0xD3, 0xC5, 0xE7, /* 0xB4-0xB7 */ + 0x86, 0xD4, 0xE0, 0xAD, 0x86, 0xD5, 0xD3, 0xF7, /* 0xB8-0xBB */ + 0x86, 0xD6, 0xE0, 0xB6, 0xE0, 0xB7, 0x86, 0xD7, /* 0xBC-0xBF */ + 0x86, 0xD8, 0x86, 0xD9, 0x86, 0xDA, 0x86, 0xDB, /* 0xC0-0xC3 */ + 0xE0, 0xC4, 0xD0, 0xE1, 0x86, 0xDC, 0x86, 0xDD, /* 0xC4-0xC7 */ + 0x86, 0xDE, 0xE0, 0xBC, 0x86, 0xDF, 0x86, 0xE0, /* 0xC8-0xCB */ + 0xE0, 0xC9, 0xE0, 0xCA, 0x86, 0xE1, 0x86, 0xE2, /* 0xCC-0xCF */ + 0x86, 0xE3, 0xE0, 0xBE, 0xE0, 0xAA, 0xC9, 0xA4, /* 0xD0-0xD3 */ + 0xE0, 0xC1, 0x86, 0xE4, 0xE0, 0xB2, 0x86, 0xE5, /* 0xD4-0xD7 */ + 0x86, 0xE6, 0x86, 0xE7, 0x86, 0xE8, 0x86, 0xE9, /* 0xD8-0xDB */ + 0xCA, 0xC8, 0xE0, 0xC3, 0x86, 0xEA, 0xE0, 0xB5, /* 0xDC-0xDF */ + 0x86, 0xEB, 0xCE, 0xCB, 0x86, 0xEC, 0xCB, 0xC3, /* 0xE0-0xE3 */ + 0xE0, 0xCD, 0xE0, 0xC6, 0xE0, 0xC2, 0x86, 0xED, /* 0xE4-0xE7 */ + 0xE0, 0xCB, 0x86, 0xEE, 0xE0, 0xBA, 0xE0, 0xBF, /* 0xE8-0xEB */ + 0xE0, 0xC0, 0x86, 0xEF, 0x86, 0xF0, 0xE0, 0xC5, /* 0xEC-0xEF */ + 0x86, 0xF1, 0x86, 0xF2, 0xE0, 0xC7, 0xE0, 0xC8, /* 0xF0-0xF3 */ + 0x86, 0xF3, 0xE0, 0xCC, 0x86, 0xF4, 0xE0, 0xBB, /* 0xF4-0xF7 */ + 0x86, 0xF5, 0x86, 0xF6, 0x86, 0xF7, 0x86, 0xF8, /* 0xF8-0xFB */ + 0x86, 0xF9, 0xCB, 0xD4, 0xE0, 0xD5, 0x86, 0xFA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0xE0, 0xD6, 0xE0, 0xD2, 0x86, 0xFB, 0x86, 0xFC, /* 0x00-0x03 */ + 0x86, 0xFD, 0x86, 0xFE, 0x87, 0x40, 0x87, 0x41, /* 0x04-0x07 */ + 0xE0, 0xD0, 0xBC, 0xCE, 0x87, 0x42, 0x87, 0x43, /* 0x08-0x0B */ + 0xE0, 0xD1, 0x87, 0x44, 0xB8, 0xC2, 0xD8, 0xC5, /* 0x0C-0x0F */ + 0x87, 0x45, 0x87, 0x46, 0x87, 0x47, 0x87, 0x48, /* 0x10-0x13 */ + 0x87, 0x49, 0x87, 0x4A, 0x87, 0x4B, 0x87, 0x4C, /* 0x14-0x17 */ + 0xD0, 0xEA, 0x87, 0x4D, 0x87, 0x4E, 0xC2, 0xEF, /* 0x18-0x1B */ + 0x87, 0x4F, 0x87, 0x50, 0xE0, 0xCF, 0xE0, 0xBD, /* 0x1C-0x1F */ + 0x87, 0x51, 0x87, 0x52, 0x87, 0x53, 0xE0, 0xD4, /* 0x20-0x23 */ + 0xE0, 0xD3, 0x87, 0x54, 0x87, 0x55, 0xE0, 0xD7, /* 0x24-0x27 */ + 0x87, 0x56, 0x87, 0x57, 0x87, 0x58, 0x87, 0x59, /* 0x28-0x2B */ + 0xE0, 0xDC, 0xE0, 0xD8, 0x87, 0x5A, 0x87, 0x5B, /* 0x2C-0x2F */ + 0x87, 0x5C, 0xD6, 0xF6, 0xB3, 0xB0, 0x87, 0x5D, /* 0x30-0x33 */ + 0xD7, 0xEC, 0x87, 0x5E, 0xCB, 0xBB, 0x87, 0x5F, /* 0x34-0x37 */ + 0x87, 0x60, 0xE0, 0xDA, 0x87, 0x61, 0xCE, 0xFB, /* 0x38-0x3B */ + 0x87, 0x62, 0x87, 0x63, 0x87, 0x64, 0xBA, 0xD9, /* 0x3C-0x3F */ + 0x87, 0x65, 0x87, 0x66, 0x87, 0x67, 0x87, 0x68, /* 0x40-0x43 */ + 0x87, 0x69, 0x87, 0x6A, 0x87, 0x6B, 0x87, 0x6C, /* 0x44-0x47 */ + 0x87, 0x6D, 0x87, 0x6E, 0x87, 0x6F, 0x87, 0x70, /* 0x48-0x4B */ + 0xE0, 0xE1, 0xE0, 0xDD, 0xD2, 0xAD, 0x87, 0x71, /* 0x4C-0x4F */ + 0x87, 0x72, 0x87, 0x73, 0x87, 0x74, 0x87, 0x75, /* 0x50-0x53 */ + 0xE0, 0xE2, 0x87, 0x76, 0x87, 0x77, 0xE0, 0xDB, /* 0x54-0x57 */ + 0xE0, 0xD9, 0xE0, 0xDF, 0x87, 0x78, 0x87, 0x79, /* 0x58-0x5B */ + 0xE0, 0xE0, 0x87, 0x7A, 0x87, 0x7B, 0x87, 0x7C, /* 0x5C-0x5F */ + 0x87, 0x7D, 0x87, 0x7E, 0xE0, 0xDE, 0x87, 0x80, /* 0x60-0x63 */ + 0xE0, 0xE4, 0x87, 0x81, 0x87, 0x82, 0x87, 0x83, /* 0x64-0x67 */ + 0xC6, 0xF7, 0xD8, 0xAC, 0xD4, 0xEB, 0xE0, 0xE6, /* 0x68-0x6B */ + 0xCA, 0xC9, 0x87, 0x84, 
0x87, 0x85, 0x87, 0x86, /* 0x6C-0x6F */ + 0x87, 0x87, 0xE0, 0xE5, 0x87, 0x88, 0x87, 0x89, /* 0x70-0x73 */ + 0x87, 0x8A, 0x87, 0x8B, 0xB8, 0xC1, 0x87, 0x8C, /* 0x74-0x77 */ + 0x87, 0x8D, 0x87, 0x8E, 0x87, 0x8F, 0xE0, 0xE7, /* 0x78-0x7B */ + 0xE0, 0xE8, 0x87, 0x90, 0x87, 0x91, 0x87, 0x92, /* 0x7C-0x7F */ + + 0x87, 0x93, 0x87, 0x94, 0x87, 0x95, 0x87, 0x96, /* 0x80-0x83 */ + 0x87, 0x97, 0xE0, 0xE9, 0xE0, 0xE3, 0x87, 0x98, /* 0x84-0x87 */ + 0x87, 0x99, 0x87, 0x9A, 0x87, 0x9B, 0x87, 0x9C, /* 0x88-0x8B */ + 0x87, 0x9D, 0x87, 0x9E, 0xBA, 0xBF, 0xCC, 0xE7, /* 0x8C-0x8F */ + 0x87, 0x9F, 0x87, 0xA0, 0x87, 0xA1, 0xE0, 0xEA, /* 0x90-0x93 */ + 0x87, 0xA2, 0x87, 0xA3, 0x87, 0xA4, 0x87, 0xA5, /* 0x94-0x97 */ + 0x87, 0xA6, 0x87, 0xA7, 0x87, 0xA8, 0x87, 0xA9, /* 0x98-0x9B */ + 0x87, 0xAA, 0x87, 0xAB, 0x87, 0xAC, 0x87, 0xAD, /* 0x9C-0x9F */ + 0x87, 0xAE, 0x87, 0xAF, 0x87, 0xB0, 0xCF, 0xF9, /* 0xA0-0xA3 */ + 0x87, 0xB1, 0x87, 0xB2, 0x87, 0xB3, 0x87, 0xB4, /* 0xA4-0xA7 */ + 0x87, 0xB5, 0x87, 0xB6, 0x87, 0xB7, 0x87, 0xB8, /* 0xA8-0xAB */ + 0x87, 0xB9, 0x87, 0xBA, 0x87, 0xBB, 0xE0, 0xEB, /* 0xAC-0xAF */ + 0x87, 0xBC, 0x87, 0xBD, 0x87, 0xBE, 0x87, 0xBF, /* 0xB0-0xB3 */ + 0x87, 0xC0, 0x87, 0xC1, 0x87, 0xC2, 0xC8, 0xC2, /* 0xB4-0xB7 */ + 0x87, 0xC3, 0x87, 0xC4, 0x87, 0xC5, 0x87, 0xC6, /* 0xB8-0xBB */ + 0xBD, 0xC0, 0x87, 0xC7, 0x87, 0xC8, 0x87, 0xC9, /* 0xBC-0xBF */ + 0x87, 0xCA, 0x87, 0xCB, 0x87, 0xCC, 0x87, 0xCD, /* 0xC0-0xC3 */ + 0x87, 0xCE, 0x87, 0xCF, 0x87, 0xD0, 0x87, 0xD1, /* 0xC4-0xC7 */ + 0x87, 0xD2, 0x87, 0xD3, 0xC4, 0xD2, 0x87, 0xD4, /* 0xC8-0xCB */ + 0x87, 0xD5, 0x87, 0xD6, 0x87, 0xD7, 0x87, 0xD8, /* 0xCC-0xCF */ + 0x87, 0xD9, 0x87, 0xDA, 0x87, 0xDB, 0x87, 0xDC, /* 0xD0-0xD3 */ + 0xE0, 0xEC, 0x87, 0xDD, 0x87, 0xDE, 0xE0, 0xED, /* 0xD4-0xD7 */ + 0x87, 0xDF, 0x87, 0xE0, 0xC7, 0xF4, 0xCB, 0xC4, /* 0xD8-0xDB */ + 0x87, 0xE1, 0xE0, 0xEE, 0xBB, 0xD8, 0xD8, 0xB6, /* 0xDC-0xDF */ + 0xD2, 0xF2, 0xE0, 0xEF, 0xCD, 0xC5, 0x87, 0xE2, /* 0xE0-0xE3 */ + 0xB6, 0xDA, 0x87, 0xE3, 0x87, 0xE4, 0x87, 0xE5, /* 0xE4-0xE7 */ + 0x87, 0xE6, 0x87, 0xE7, 0x87, 0xE8, 0xE0, 0xF1, /* 0xE8-0xEB */ + 0x87, 0xE9, 0xD4, 0xB0, 0x87, 0xEA, 0x87, 0xEB, /* 0xEC-0xEF */ + 0xC0, 0xA7, 0xB4, 0xD1, 0x87, 0xEC, 0x87, 0xED, /* 0xF0-0xF3 */ + 0xCE, 0xA7, 0xE0, 0xF0, 0x87, 0xEE, 0x87, 0xEF, /* 0xF4-0xF7 */ + 0x87, 0xF0, 0xE0, 0xF2, 0xB9, 0xCC, 0x87, 0xF1, /* 0xF8-0xFB */ + 0x87, 0xF2, 0xB9, 0xFA, 0xCD, 0xBC, 0xE0, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_57[512] = { + 0x87, 0xF3, 0x87, 0xF4, 0x87, 0xF5, 0xC6, 0xD4, /* 0x00-0x03 */ + 0xE0, 0xF4, 0x87, 0xF6, 0xD4, 0xB2, 0x87, 0xF7, /* 0x04-0x07 */ + 0xC8, 0xA6, 0xE0, 0xF6, 0xE0, 0xF5, 0x87, 0xF8, /* 0x08-0x0B */ + 0x87, 0xF9, 0x87, 0xFA, 0x87, 0xFB, 0x87, 0xFC, /* 0x0C-0x0F */ + 0x87, 0xFD, 0x87, 0xFE, 0x88, 0x40, 0x88, 0x41, /* 0x10-0x13 */ + 0x88, 0x42, 0x88, 0x43, 0x88, 0x44, 0x88, 0x45, /* 0x14-0x17 */ + 0x88, 0x46, 0x88, 0x47, 0x88, 0x48, 0x88, 0x49, /* 0x18-0x1B */ + 0xE0, 0xF7, 0x88, 0x4A, 0x88, 0x4B, 0xCD, 0xC1, /* 0x1C-0x1F */ + 0x88, 0x4C, 0x88, 0x4D, 0x88, 0x4E, 0xCA, 0xA5, /* 0x20-0x23 */ + 0x88, 0x4F, 0x88, 0x50, 0x88, 0x51, 0x88, 0x52, /* 0x24-0x27 */ + 0xD4, 0xDA, 0xDB, 0xD7, 0xDB, 0xD9, 0x88, 0x53, /* 0x28-0x2B */ + 0xDB, 0xD8, 0xB9, 0xE7, 0xDB, 0xDC, 0xDB, 0xDD, /* 0x2C-0x2F */ + 0xB5, 0xD8, 0x88, 0x54, 0x88, 0x55, 0xDB, 0xDA, /* 0x30-0x33 */ + 0x88, 0x56, 0x88, 0x57, 0x88, 0x58, 0x88, 0x59, /* 0x34-0x37 */ + 0x88, 0x5A, 0xDB, 0xDB, 0xB3, 0xA1, 0xDB, 0xDF, /* 0x38-0x3B */ + 0x88, 0x5B, 0x88, 0x5C, 0xBB, 0xF8, 0x88, 0x5D, /* 0x3C-0x3F */ + 0xD6, 0xB7, 0x88, 0x5E, 
0xDB, 0xE0, 0x88, 0x5F, /* 0x40-0x43 */ + 0x88, 0x60, 0x88, 0x61, 0x88, 0x62, 0xBE, 0xF9, /* 0x44-0x47 */ + 0x88, 0x63, 0x88, 0x64, 0xB7, 0xBB, 0x88, 0x65, /* 0x48-0x4B */ + 0xDB, 0xD0, 0xCC, 0xAE, 0xBF, 0xB2, 0xBB, 0xB5, /* 0x4C-0x4F */ + 0xD7, 0xF8, 0xBF, 0xD3, 0x88, 0x66, 0x88, 0x67, /* 0x50-0x53 */ + 0x88, 0x68, 0x88, 0x69, 0x88, 0x6A, 0xBF, 0xE9, /* 0x54-0x57 */ + 0x88, 0x6B, 0x88, 0x6C, 0xBC, 0xE1, 0xCC, 0xB3, /* 0x58-0x5B */ + 0xDB, 0xDE, 0xB0, 0xD3, 0xCE, 0xEB, 0xB7, 0xD8, /* 0x5C-0x5F */ + 0xD7, 0xB9, 0xC6, 0xC2, 0x88, 0x6D, 0x88, 0x6E, /* 0x60-0x63 */ + 0xC0, 0xA4, 0x88, 0x6F, 0xCC, 0xB9, 0x88, 0x70, /* 0x64-0x67 */ + 0xDB, 0xE7, 0xDB, 0xE1, 0xC6, 0xBA, 0xDB, 0xE3, /* 0x68-0x6B */ + 0x88, 0x71, 0xDB, 0xE8, 0x88, 0x72, 0xC5, 0xF7, /* 0x6C-0x6F */ + 0x88, 0x73, 0x88, 0x74, 0x88, 0x75, 0xDB, 0xEA, /* 0x70-0x73 */ + 0x88, 0x76, 0x88, 0x77, 0xDB, 0xE9, 0xBF, 0xC0, /* 0x74-0x77 */ + 0x88, 0x78, 0x88, 0x79, 0x88, 0x7A, 0xDB, 0xE6, /* 0x78-0x7B */ + 0xDB, 0xE5, 0x88, 0x7B, 0x88, 0x7C, 0x88, 0x7D, /* 0x7C-0x7F */ + + 0x88, 0x7E, 0x88, 0x80, 0xB4, 0xB9, 0xC0, 0xAC, /* 0x80-0x83 */ + 0xC2, 0xA2, 0xDB, 0xE2, 0xDB, 0xE4, 0x88, 0x81, /* 0x84-0x87 */ + 0x88, 0x82, 0x88, 0x83, 0x88, 0x84, 0xD0, 0xCD, /* 0x88-0x8B */ + 0xDB, 0xED, 0x88, 0x85, 0x88, 0x86, 0x88, 0x87, /* 0x8C-0x8F */ + 0x88, 0x88, 0x88, 0x89, 0xC0, 0xDD, 0xDB, 0xF2, /* 0x90-0x93 */ + 0x88, 0x8A, 0x88, 0x8B, 0x88, 0x8C, 0x88, 0x8D, /* 0x94-0x97 */ + 0x88, 0x8E, 0x88, 0x8F, 0x88, 0x90, 0xB6, 0xE2, /* 0x98-0x9B */ + 0x88, 0x91, 0x88, 0x92, 0x88, 0x93, 0x88, 0x94, /* 0x9C-0x9F */ + 0xDB, 0xF3, 0xDB, 0xD2, 0xB9, 0xB8, 0xD4, 0xAB, /* 0xA0-0xA3 */ + 0xDB, 0xEC, 0x88, 0x95, 0xBF, 0xD1, 0xDB, 0xF0, /* 0xA4-0xA7 */ + 0x88, 0x96, 0xDB, 0xD1, 0x88, 0x97, 0xB5, 0xE6, /* 0xA8-0xAB */ + 0x88, 0x98, 0xDB, 0xEB, 0xBF, 0xE5, 0x88, 0x99, /* 0xAC-0xAF */ + 0x88, 0x9A, 0x88, 0x9B, 0xDB, 0xEE, 0x88, 0x9C, /* 0xB0-0xB3 */ + 0xDB, 0xF1, 0x88, 0x9D, 0x88, 0x9E, 0x88, 0x9F, /* 0xB4-0xB7 */ + 0xDB, 0xF9, 0x88, 0xA0, 0x88, 0xA1, 0x88, 0xA2, /* 0xB8-0xBB */ + 0x88, 0xA3, 0x88, 0xA4, 0x88, 0xA5, 0x88, 0xA6, /* 0xBC-0xBF */ + 0x88, 0xA7, 0x88, 0xA8, 0xB9, 0xA1, 0xB0, 0xA3, /* 0xC0-0xC3 */ + 0x88, 0xA9, 0x88, 0xAA, 0x88, 0xAB, 0x88, 0xAC, /* 0xC4-0xC7 */ + 0x88, 0xAD, 0x88, 0xAE, 0x88, 0xAF, 0xC2, 0xF1, /* 0xC8-0xCB */ + 0x88, 0xB0, 0x88, 0xB1, 0xB3, 0xC7, 0xDB, 0xEF, /* 0xCC-0xCF */ + 0x88, 0xB2, 0x88, 0xB3, 0xDB, 0xF8, 0x88, 0xB4, /* 0xD0-0xD3 */ + 0xC6, 0xD2, 0xDB, 0xF4, 0x88, 0xB5, 0x88, 0xB6, /* 0xD4-0xD7 */ + 0xDB, 0xF5, 0xDB, 0xF7, 0xDB, 0xF6, 0x88, 0xB7, /* 0xD8-0xDB */ + 0x88, 0xB8, 0xDB, 0xFE, 0x88, 0xB9, 0xD3, 0xF2, /* 0xDC-0xDF */ + 0xB2, 0xBA, 0x88, 0xBA, 0x88, 0xBB, 0x88, 0xBC, /* 0xE0-0xE3 */ + 0xDB, 0xFD, 0x88, 0xBD, 0x88, 0xBE, 0x88, 0xBF, /* 0xE4-0xE7 */ + 0x88, 0xC0, 0x88, 0xC1, 0x88, 0xC2, 0x88, 0xC3, /* 0xE8-0xEB */ + 0x88, 0xC4, 0xDC, 0xA4, 0x88, 0xC5, 0xDB, 0xFB, /* 0xEC-0xEF */ + 0x88, 0xC6, 0x88, 0xC7, 0x88, 0xC8, 0x88, 0xC9, /* 0xF0-0xF3 */ + 0xDB, 0xFA, 0x88, 0xCA, 0x88, 0xCB, 0x88, 0xCC, /* 0xF4-0xF7 */ + 0xDB, 0xFC, 0xC5, 0xE0, 0xBB, 0xF9, 0x88, 0xCD, /* 0xF8-0xFB */ + 0x88, 0xCE, 0xDC, 0xA3, 0x88, 0xCF, 0x88, 0xD0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0xDC, 0xA5, 0x88, 0xD1, 0xCC, 0xC3, 0x88, 0xD2, /* 0x00-0x03 */ + 0x88, 0xD3, 0x88, 0xD4, 0xB6, 0xD1, 0xDD, 0xC0, /* 0x04-0x07 */ + 0x88, 0xD5, 0x88, 0xD6, 0x88, 0xD7, 0xDC, 0xA1, /* 0x08-0x0B */ + 0x88, 0xD8, 0xDC, 0xA2, 0x88, 0xD9, 0x88, 0xDA, /* 0x0C-0x0F */ + 0x88, 0xDB, 0xC7, 0xB5, 0x88, 0xDC, 0x88, 0xDD, /* 0x10-0x13 */ + 0x88, 0xDE, 0xB6, 0xE9, 
0x88, 0xDF, 0x88, 0xE0, /* 0x14-0x17 */ + 0x88, 0xE1, 0xDC, 0xA7, 0x88, 0xE2, 0x88, 0xE3, /* 0x18-0x1B */ + 0x88, 0xE4, 0x88, 0xE5, 0xDC, 0xA6, 0x88, 0xE6, /* 0x1C-0x1F */ + 0xDC, 0xA9, 0xB1, 0xA4, 0x88, 0xE7, 0x88, 0xE8, /* 0x20-0x23 */ + 0xB5, 0xCC, 0x88, 0xE9, 0x88, 0xEA, 0x88, 0xEB, /* 0x24-0x27 */ + 0x88, 0xEC, 0x88, 0xED, 0xBF, 0xB0, 0x88, 0xEE, /* 0x28-0x2B */ + 0x88, 0xEF, 0x88, 0xF0, 0x88, 0xF1, 0x88, 0xF2, /* 0x2C-0x2F */ + 0xD1, 0xDF, 0x88, 0xF3, 0x88, 0xF4, 0x88, 0xF5, /* 0x30-0x33 */ + 0x88, 0xF6, 0xB6, 0xC2, 0x88, 0xF7, 0x88, 0xF8, /* 0x34-0x37 */ + 0x88, 0xF9, 0x88, 0xFA, 0x88, 0xFB, 0x88, 0xFC, /* 0x38-0x3B */ + 0x88, 0xFD, 0x88, 0xFE, 0x89, 0x40, 0x89, 0x41, /* 0x3C-0x3F */ + 0x89, 0x42, 0x89, 0x43, 0x89, 0x44, 0x89, 0x45, /* 0x40-0x43 */ + 0xDC, 0xA8, 0x89, 0x46, 0x89, 0x47, 0x89, 0x48, /* 0x44-0x47 */ + 0x89, 0x49, 0x89, 0x4A, 0x89, 0x4B, 0x89, 0x4C, /* 0x48-0x4B */ + 0xCB, 0xFA, 0xEB, 0xF3, 0x89, 0x4D, 0x89, 0x4E, /* 0x4C-0x4F */ + 0x89, 0x4F, 0xCB, 0xDC, 0x89, 0x50, 0x89, 0x51, /* 0x50-0x53 */ + 0xCB, 0xFE, 0x89, 0x52, 0x89, 0x53, 0x89, 0x54, /* 0x54-0x57 */ + 0xCC, 0xC1, 0x89, 0x55, 0x89, 0x56, 0x89, 0x57, /* 0x58-0x5B */ + 0x89, 0x58, 0x89, 0x59, 0xC8, 0xFB, 0x89, 0x5A, /* 0x5C-0x5F */ + 0x89, 0x5B, 0x89, 0x5C, 0x89, 0x5D, 0x89, 0x5E, /* 0x60-0x63 */ + 0x89, 0x5F, 0xDC, 0xAA, 0x89, 0x60, 0x89, 0x61, /* 0x64-0x67 */ + 0x89, 0x62, 0x89, 0x63, 0x89, 0x64, 0xCC, 0xEE, /* 0x68-0x6B */ + 0xDC, 0xAB, 0x89, 0x65, 0x89, 0x66, 0x89, 0x67, /* 0x6C-0x6F */ + 0x89, 0x68, 0x89, 0x69, 0x89, 0x6A, 0x89, 0x6B, /* 0x70-0x73 */ + 0x89, 0x6C, 0x89, 0x6D, 0x89, 0x6E, 0x89, 0x6F, /* 0x74-0x77 */ + 0x89, 0x70, 0x89, 0x71, 0x89, 0x72, 0x89, 0x73, /* 0x78-0x7B */ + 0x89, 0x74, 0x89, 0x75, 0xDB, 0xD3, 0x89, 0x76, /* 0x7C-0x7F */ + + 0xDC, 0xAF, 0xDC, 0xAC, 0x89, 0x77, 0xBE, 0xB3, /* 0x80-0x83 */ + 0x89, 0x78, 0xCA, 0xFB, 0x89, 0x79, 0x89, 0x7A, /* 0x84-0x87 */ + 0x89, 0x7B, 0xDC, 0xAD, 0x89, 0x7C, 0x89, 0x7D, /* 0x88-0x8B */ + 0x89, 0x7E, 0x89, 0x80, 0x89, 0x81, 0x89, 0x82, /* 0x8C-0x8F */ + 0x89, 0x83, 0x89, 0x84, 0xC9, 0xCA, 0xC4, 0xB9, /* 0x90-0x93 */ + 0x89, 0x85, 0x89, 0x86, 0x89, 0x87, 0x89, 0x88, /* 0x94-0x97 */ + 0x89, 0x89, 0xC7, 0xBD, 0xDC, 0xAE, 0x89, 0x8A, /* 0x98-0x9B */ + 0x89, 0x8B, 0x89, 0x8C, 0xD4, 0xF6, 0xD0, 0xE6, /* 0x9C-0x9F */ + 0x89, 0x8D, 0x89, 0x8E, 0x89, 0x8F, 0x89, 0x90, /* 0xA0-0xA3 */ + 0x89, 0x91, 0x89, 0x92, 0x89, 0x93, 0x89, 0x94, /* 0xA4-0xA7 */ + 0xC4, 0xAB, 0xB6, 0xD5, 0x89, 0x95, 0x89, 0x96, /* 0xA8-0xAB */ + 0x89, 0x97, 0x89, 0x98, 0x89, 0x99, 0x89, 0x9A, /* 0xAC-0xAF */ + 0x89, 0x9B, 0x89, 0x9C, 0x89, 0x9D, 0x89, 0x9E, /* 0xB0-0xB3 */ + 0x89, 0x9F, 0x89, 0xA0, 0x89, 0xA1, 0x89, 0xA2, /* 0xB4-0xB7 */ + 0x89, 0xA3, 0x89, 0xA4, 0x89, 0xA5, 0x89, 0xA6, /* 0xB8-0xBB */ + 0xDB, 0xD4, 0x89, 0xA7, 0x89, 0xA8, 0x89, 0xA9, /* 0xBC-0xBF */ + 0x89, 0xAA, 0xB1, 0xDA, 0x89, 0xAB, 0x89, 0xAC, /* 0xC0-0xC3 */ + 0x89, 0xAD, 0xDB, 0xD5, 0x89, 0xAE, 0x89, 0xAF, /* 0xC4-0xC7 */ + 0x89, 0xB0, 0x89, 0xB1, 0x89, 0xB2, 0x89, 0xB3, /* 0xC8-0xCB */ + 0x89, 0xB4, 0x89, 0xB5, 0x89, 0xB6, 0x89, 0xB7, /* 0xCC-0xCF */ + 0x89, 0xB8, 0xDB, 0xD6, 0x89, 0xB9, 0x89, 0xBA, /* 0xD0-0xD3 */ + 0x89, 0xBB, 0xBA, 0xBE, 0x89, 0xBC, 0x89, 0xBD, /* 0xD4-0xD7 */ + 0x89, 0xBE, 0x89, 0xBF, 0x89, 0xC0, 0x89, 0xC1, /* 0xD8-0xDB */ + 0x89, 0xC2, 0x89, 0xC3, 0x89, 0xC4, 0x89, 0xC5, /* 0xDC-0xDF */ + 0x89, 0xC6, 0x89, 0xC7, 0x89, 0xC8, 0x89, 0xC9, /* 0xE0-0xE3 */ + 0xC8, 0xC0, 0x89, 0xCA, 0x89, 0xCB, 0x89, 0xCC, /* 0xE4-0xE7 */ + 0x89, 0xCD, 0x89, 0xCE, 0x89, 0xCF, 0xCA, 0xBF, /* 0xE8-0xEB */ + 0xC8, 0xC9, 
0x89, 0xD0, 0xD7, 0xB3, 0x89, 0xD1, /* 0xEC-0xEF */ + 0xC9, 0xF9, 0x89, 0xD2, 0x89, 0xD3, 0xBF, 0xC7, /* 0xF0-0xF3 */ + 0x89, 0xD4, 0x89, 0xD5, 0xBA, 0xF8, 0x89, 0xD6, /* 0xF4-0xF7 */ + 0x89, 0xD7, 0xD2, 0xBC, 0x89, 0xD8, 0x89, 0xD9, /* 0xF8-0xFB */ + 0x89, 0xDA, 0x89, 0xDB, 0x89, 0xDC, 0x89, 0xDD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x89, 0xDE, 0x89, 0xDF, 0xE2, 0xBA, 0x89, 0xE0, /* 0x00-0x03 */ + 0xB4, 0xA6, 0x89, 0xE1, 0x89, 0xE2, 0xB1, 0xB8, /* 0x04-0x07 */ + 0x89, 0xE3, 0x89, 0xE4, 0x89, 0xE5, 0x89, 0xE6, /* 0x08-0x0B */ + 0x89, 0xE7, 0xB8, 0xB4, 0x89, 0xE8, 0xCF, 0xC4, /* 0x0C-0x0F */ + 0x89, 0xE9, 0x89, 0xEA, 0x89, 0xEB, 0x89, 0xEC, /* 0x10-0x13 */ + 0xD9, 0xE7, 0xCF, 0xA6, 0xCD, 0xE2, 0x89, 0xED, /* 0x14-0x17 */ + 0x89, 0xEE, 0xD9, 0xED, 0xB6, 0xE0, 0x89, 0xEF, /* 0x18-0x1B */ + 0xD2, 0xB9, 0x89, 0xF0, 0x89, 0xF1, 0xB9, 0xBB, /* 0x1C-0x1F */ + 0x89, 0xF2, 0x89, 0xF3, 0x89, 0xF4, 0x89, 0xF5, /* 0x20-0x23 */ + 0xE2, 0xB9, 0xE2, 0xB7, 0x89, 0xF6, 0xB4, 0xF3, /* 0x24-0x27 */ + 0x89, 0xF7, 0xCC, 0xEC, 0xCC, 0xAB, 0xB7, 0xF2, /* 0x28-0x2B */ + 0x89, 0xF8, 0xD8, 0xB2, 0xD1, 0xEB, 0xBA, 0xBB, /* 0x2C-0x2F */ + 0x89, 0xF9, 0xCA, 0xA7, 0x89, 0xFA, 0x89, 0xFB, /* 0x30-0x33 */ + 0xCD, 0xB7, 0x89, 0xFC, 0x89, 0xFD, 0xD2, 0xC4, /* 0x34-0x37 */ + 0xBF, 0xE4, 0xBC, 0xD0, 0xB6, 0xE1, 0x89, 0xFE, /* 0x38-0x3B */ + 0xDE, 0xC5, 0x8A, 0x40, 0x8A, 0x41, 0x8A, 0x42, /* 0x3C-0x3F */ + 0x8A, 0x43, 0xDE, 0xC6, 0xDB, 0xBC, 0x8A, 0x44, /* 0x40-0x43 */ + 0xD1, 0xD9, 0x8A, 0x45, 0x8A, 0x46, 0xC6, 0xE6, /* 0x44-0x47 */ + 0xC4, 0xCE, 0xB7, 0xEE, 0x8A, 0x47, 0xB7, 0xDC, /* 0x48-0x4B */ + 0x8A, 0x48, 0x8A, 0x49, 0xBF, 0xFC, 0xD7, 0xE0, /* 0x4C-0x4F */ + 0x8A, 0x4A, 0xC6, 0xF5, 0x8A, 0x4B, 0x8A, 0x4C, /* 0x50-0x53 */ + 0xB1, 0xBC, 0xDE, 0xC8, 0xBD, 0xB1, 0xCC, 0xD7, /* 0x54-0x57 */ + 0xDE, 0xCA, 0x8A, 0x4D, 0xDE, 0xC9, 0x8A, 0x4E, /* 0x58-0x5B */ + 0x8A, 0x4F, 0x8A, 0x50, 0x8A, 0x51, 0x8A, 0x52, /* 0x5C-0x5F */ + 0xB5, 0xEC, 0x8A, 0x53, 0xC9, 0xDD, 0x8A, 0x54, /* 0x60-0x63 */ + 0x8A, 0x55, 0xB0, 0xC2, 0x8A, 0x56, 0x8A, 0x57, /* 0x64-0x67 */ + 0x8A, 0x58, 0x8A, 0x59, 0x8A, 0x5A, 0x8A, 0x5B, /* 0x68-0x6B */ + 0x8A, 0x5C, 0x8A, 0x5D, 0x8A, 0x5E, 0x8A, 0x5F, /* 0x6C-0x6F */ + 0x8A, 0x60, 0x8A, 0x61, 0x8A, 0x62, 0xC5, 0xAE, /* 0x70-0x73 */ + 0xC5, 0xAB, 0x8A, 0x63, 0xC4, 0xCC, 0x8A, 0x64, /* 0x74-0x77 */ + 0xBC, 0xE9, 0xCB, 0xFD, 0x8A, 0x65, 0x8A, 0x66, /* 0x78-0x7B */ + 0x8A, 0x67, 0xBA, 0xC3, 0x8A, 0x68, 0x8A, 0x69, /* 0x7C-0x7F */ + + 0x8A, 0x6A, 0xE5, 0xF9, 0xC8, 0xE7, 0xE5, 0xFA, /* 0x80-0x83 */ + 0xCD, 0xFD, 0x8A, 0x6B, 0xD7, 0xB1, 0xB8, 0xBE, /* 0x84-0x87 */ + 0xC2, 0xE8, 0x8A, 0x6C, 0xC8, 0xD1, 0x8A, 0x6D, /* 0x88-0x8B */ + 0x8A, 0x6E, 0xE5, 0xFB, 0x8A, 0x6F, 0x8A, 0x70, /* 0x8C-0x8F */ + 0x8A, 0x71, 0x8A, 0x72, 0xB6, 0xCA, 0xBC, 0xCB, /* 0x90-0x93 */ + 0x8A, 0x73, 0x8A, 0x74, 0xD1, 0xFD, 0xE6, 0xA1, /* 0x94-0x97 */ + 0x8A, 0x75, 0xC3, 0xEE, 0x8A, 0x76, 0x8A, 0x77, /* 0x98-0x9B */ + 0x8A, 0x78, 0x8A, 0x79, 0xE6, 0xA4, 0x8A, 0x7A, /* 0x9C-0x9F */ + 0x8A, 0x7B, 0x8A, 0x7C, 0x8A, 0x7D, 0xE5, 0xFE, /* 0xA0-0xA3 */ + 0xE6, 0xA5, 0xCD, 0xD7, 0x8A, 0x7E, 0x8A, 0x80, /* 0xA4-0xA7 */ + 0xB7, 0xC1, 0xE5, 0xFC, 0xE5, 0xFD, 0xE6, 0xA3, /* 0xA8-0xAB */ + 0x8A, 0x81, 0x8A, 0x82, 0xC4, 0xDD, 0xE6, 0xA8, /* 0xAC-0xAF */ + 0x8A, 0x83, 0x8A, 0x84, 0xE6, 0xA7, 0x8A, 0x85, /* 0xB0-0xB3 */ + 0x8A, 0x86, 0x8A, 0x87, 0x8A, 0x88, 0x8A, 0x89, /* 0xB4-0xB7 */ + 0x8A, 0x8A, 0xC3, 0xC3, 0x8A, 0x8B, 0xC6, 0xDE, /* 0xB8-0xBB */ + 0x8A, 0x8C, 0x8A, 0x8D, 0xE6, 0xAA, 0x8A, 0x8E, /* 0xBC-0xBF */ + 0x8A, 0x8F, 
0x8A, 0x90, 0x8A, 0x91, 0x8A, 0x92, /* 0xC0-0xC3 */ + 0x8A, 0x93, 0x8A, 0x94, 0xC4, 0xB7, 0x8A, 0x95, /* 0xC4-0xC7 */ + 0x8A, 0x96, 0x8A, 0x97, 0xE6, 0xA2, 0xCA, 0xBC, /* 0xC8-0xCB */ + 0x8A, 0x98, 0x8A, 0x99, 0x8A, 0x9A, 0x8A, 0x9B, /* 0xCC-0xCF */ + 0xBD, 0xE3, 0xB9, 0xC3, 0xE6, 0xA6, 0xD0, 0xD5, /* 0xD0-0xD3 */ + 0xCE, 0xAF, 0x8A, 0x9C, 0x8A, 0x9D, 0xE6, 0xA9, /* 0xD4-0xD7 */ + 0xE6, 0xB0, 0x8A, 0x9E, 0xD2, 0xA6, 0x8A, 0x9F, /* 0xD8-0xDB */ + 0xBD, 0xAA, 0xE6, 0xAD, 0x8A, 0xA0, 0x8A, 0xA1, /* 0xDC-0xDF */ + 0x8A, 0xA2, 0x8A, 0xA3, 0x8A, 0xA4, 0xE6, 0xAF, /* 0xE0-0xE3 */ + 0x8A, 0xA5, 0xC0, 0xD1, 0x8A, 0xA6, 0x8A, 0xA7, /* 0xE4-0xE7 */ + 0xD2, 0xCC, 0x8A, 0xA8, 0x8A, 0xA9, 0x8A, 0xAA, /* 0xE8-0xEB */ + 0xBC, 0xA7, 0x8A, 0xAB, 0x8A, 0xAC, 0x8A, 0xAD, /* 0xEC-0xEF */ + 0x8A, 0xAE, 0x8A, 0xAF, 0x8A, 0xB0, 0x8A, 0xB1, /* 0xF0-0xF3 */ + 0x8A, 0xB2, 0x8A, 0xB3, 0x8A, 0xB4, 0x8A, 0xB5, /* 0xF4-0xF7 */ + 0x8A, 0xB6, 0xE6, 0xB1, 0x8A, 0xB7, 0xD2, 0xF6, /* 0xF8-0xFB */ + 0x8A, 0xB8, 0x8A, 0xB9, 0x8A, 0xBA, 0xD7, 0xCB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0x8A, 0xBB, 0xCD, 0xFE, 0x8A, 0xBC, 0xCD, 0xDE, /* 0x00-0x03 */ + 0xC2, 0xA6, 0xE6, 0xAB, 0xE6, 0xAC, 0xBD, 0xBF, /* 0x04-0x07 */ + 0xE6, 0xAE, 0xE6, 0xB3, 0x8A, 0xBD, 0x8A, 0xBE, /* 0x08-0x0B */ + 0xE6, 0xB2, 0x8A, 0xBF, 0x8A, 0xC0, 0x8A, 0xC1, /* 0x0C-0x0F */ + 0x8A, 0xC2, 0xE6, 0xB6, 0x8A, 0xC3, 0xE6, 0xB8, /* 0x10-0x13 */ + 0x8A, 0xC4, 0x8A, 0xC5, 0x8A, 0xC6, 0x8A, 0xC7, /* 0x14-0x17 */ + 0xC4, 0xEF, 0x8A, 0xC8, 0x8A, 0xC9, 0x8A, 0xCA, /* 0x18-0x1B */ + 0xC4, 0xC8, 0x8A, 0xCB, 0x8A, 0xCC, 0xBE, 0xEA, /* 0x1C-0x1F */ + 0xC9, 0xEF, 0x8A, 0xCD, 0x8A, 0xCE, 0xE6, 0xB7, /* 0x20-0x23 */ + 0x8A, 0xCF, 0xB6, 0xF0, 0x8A, 0xD0, 0x8A, 0xD1, /* 0x24-0x27 */ + 0x8A, 0xD2, 0xC3, 0xE4, 0x8A, 0xD3, 0x8A, 0xD4, /* 0x28-0x2B */ + 0x8A, 0xD5, 0x8A, 0xD6, 0x8A, 0xD7, 0x8A, 0xD8, /* 0x2C-0x2F */ + 0x8A, 0xD9, 0xD3, 0xE9, 0xE6, 0xB4, 0x8A, 0xDA, /* 0x30-0x33 */ + 0xE6, 0xB5, 0x8A, 0xDB, 0xC8, 0xA2, 0x8A, 0xDC, /* 0x34-0x37 */ + 0x8A, 0xDD, 0x8A, 0xDE, 0x8A, 0xDF, 0x8A, 0xE0, /* 0x38-0x3B */ + 0xE6, 0xBD, 0x8A, 0xE1, 0x8A, 0xE2, 0x8A, 0xE3, /* 0x3C-0x3F */ + 0xE6, 0xB9, 0x8A, 0xE4, 0x8A, 0xE5, 0x8A, 0xE6, /* 0x40-0x43 */ + 0x8A, 0xE7, 0x8A, 0xE8, 0xC6, 0xC5, 0x8A, 0xE9, /* 0x44-0x47 */ + 0x8A, 0xEA, 0xCD, 0xF1, 0xE6, 0xBB, 0x8A, 0xEB, /* 0x48-0x4B */ + 0x8A, 0xEC, 0x8A, 0xED, 0x8A, 0xEE, 0x8A, 0xEF, /* 0x4C-0x4F */ + 0x8A, 0xF0, 0x8A, 0xF1, 0x8A, 0xF2, 0x8A, 0xF3, /* 0x50-0x53 */ + 0x8A, 0xF4, 0xE6, 0xBC, 0x8A, 0xF5, 0x8A, 0xF6, /* 0x54-0x57 */ + 0x8A, 0xF7, 0x8A, 0xF8, 0xBB, 0xE9, 0x8A, 0xF9, /* 0x58-0x5B */ + 0x8A, 0xFA, 0x8A, 0xFB, 0x8A, 0xFC, 0x8A, 0xFD, /* 0x5C-0x5F */ + 0x8A, 0xFE, 0x8B, 0x40, 0xE6, 0xBE, 0x8B, 0x41, /* 0x60-0x63 */ + 0x8B, 0x42, 0x8B, 0x43, 0x8B, 0x44, 0xE6, 0xBA, /* 0x64-0x67 */ + 0x8B, 0x45, 0x8B, 0x46, 0xC0, 0xB7, 0x8B, 0x47, /* 0x68-0x6B */ + 0x8B, 0x48, 0x8B, 0x49, 0x8B, 0x4A, 0x8B, 0x4B, /* 0x6C-0x6F */ + 0x8B, 0x4C, 0x8B, 0x4D, 0x8B, 0x4E, 0x8B, 0x4F, /* 0x70-0x73 */ + 0xD3, 0xA4, 0xE6, 0xBF, 0xC9, 0xF4, 0xE6, 0xC3, /* 0x74-0x77 */ + 0x8B, 0x50, 0x8B, 0x51, 0xE6, 0xC4, 0x8B, 0x52, /* 0x78-0x7B */ + 0x8B, 0x53, 0x8B, 0x54, 0x8B, 0x55, 0xD0, 0xF6, /* 0x7C-0x7F */ + + 0x8B, 0x56, 0x8B, 0x57, 0x8B, 0x58, 0x8B, 0x59, /* 0x80-0x83 */ + 0x8B, 0x5A, 0x8B, 0x5B, 0x8B, 0x5C, 0x8B, 0x5D, /* 0x84-0x87 */ + 0x8B, 0x5E, 0x8B, 0x5F, 0x8B, 0x60, 0x8B, 0x61, /* 0x88-0x8B */ + 0x8B, 0x62, 0x8B, 0x63, 0x8B, 0x64, 0x8B, 0x65, /* 0x8C-0x8F */ + 0x8B, 0x66, 0x8B, 0x67, 0xC3, 0xBD, 0x8B, 0x68, /* 0x90-0x93 */ + 0x8B, 0x69, 
0x8B, 0x6A, 0x8B, 0x6B, 0x8B, 0x6C, /* 0x94-0x97 */ + 0x8B, 0x6D, 0x8B, 0x6E, 0xC3, 0xC4, 0xE6, 0xC2, /* 0x98-0x9B */ + 0x8B, 0x6F, 0x8B, 0x70, 0x8B, 0x71, 0x8B, 0x72, /* 0x9C-0x9F */ + 0x8B, 0x73, 0x8B, 0x74, 0x8B, 0x75, 0x8B, 0x76, /* 0xA0-0xA3 */ + 0x8B, 0x77, 0x8B, 0x78, 0x8B, 0x79, 0x8B, 0x7A, /* 0xA4-0xA7 */ + 0x8B, 0x7B, 0x8B, 0x7C, 0xE6, 0xC1, 0x8B, 0x7D, /* 0xA8-0xAB */ + 0x8B, 0x7E, 0x8B, 0x80, 0x8B, 0x81, 0x8B, 0x82, /* 0xAC-0xAF */ + 0x8B, 0x83, 0x8B, 0x84, 0xE6, 0xC7, 0xCF, 0xB1, /* 0xB0-0xB3 */ + 0x8B, 0x85, 0xEB, 0xF4, 0x8B, 0x86, 0x8B, 0x87, /* 0xB4-0xB7 */ + 0xE6, 0xCA, 0x8B, 0x88, 0x8B, 0x89, 0x8B, 0x8A, /* 0xB8-0xBB */ + 0x8B, 0x8B, 0x8B, 0x8C, 0xE6, 0xC5, 0x8B, 0x8D, /* 0xBC-0xBF */ + 0x8B, 0x8E, 0xBC, 0xDE, 0xC9, 0xA9, 0x8B, 0x8F, /* 0xC0-0xC3 */ + 0x8B, 0x90, 0x8B, 0x91, 0x8B, 0x92, 0x8B, 0x93, /* 0xC4-0xC7 */ + 0x8B, 0x94, 0xBC, 0xB5, 0x8B, 0x95, 0x8B, 0x96, /* 0xC8-0xCB */ + 0xCF, 0xD3, 0x8B, 0x97, 0x8B, 0x98, 0x8B, 0x99, /* 0xCC-0xCF */ + 0x8B, 0x9A, 0x8B, 0x9B, 0xE6, 0xC8, 0x8B, 0x9C, /* 0xD0-0xD3 */ + 0xE6, 0xC9, 0x8B, 0x9D, 0xE6, 0xCE, 0x8B, 0x9E, /* 0xD4-0xD7 */ + 0xE6, 0xD0, 0x8B, 0x9F, 0x8B, 0xA0, 0x8B, 0xA1, /* 0xD8-0xDB */ + 0xE6, 0xD1, 0x8B, 0xA2, 0x8B, 0xA3, 0x8B, 0xA4, /* 0xDC-0xDF */ + 0xE6, 0xCB, 0xB5, 0xD5, 0x8B, 0xA5, 0xE6, 0xCC, /* 0xE0-0xE3 */ + 0x8B, 0xA6, 0x8B, 0xA7, 0xE6, 0xCF, 0x8B, 0xA8, /* 0xE4-0xE7 */ + 0x8B, 0xA9, 0xC4, 0xDB, 0x8B, 0xAA, 0xE6, 0xC6, /* 0xE8-0xEB */ + 0x8B, 0xAB, 0x8B, 0xAC, 0x8B, 0xAD, 0x8B, 0xAE, /* 0xEC-0xEF */ + 0x8B, 0xAF, 0xE6, 0xCD, 0x8B, 0xB0, 0x8B, 0xB1, /* 0xF0-0xF3 */ + 0x8B, 0xB2, 0x8B, 0xB3, 0x8B, 0xB4, 0x8B, 0xB5, /* 0xF4-0xF7 */ + 0x8B, 0xB6, 0x8B, 0xB7, 0x8B, 0xB8, 0x8B, 0xB9, /* 0xF8-0xFB */ + 0x8B, 0xBA, 0x8B, 0xBB, 0x8B, 0xBC, 0x8B, 0xBD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5B[512] = { + 0x8B, 0xBE, 0x8B, 0xBF, 0x8B, 0xC0, 0x8B, 0xC1, /* 0x00-0x03 */ + 0x8B, 0xC2, 0x8B, 0xC3, 0x8B, 0xC4, 0x8B, 0xC5, /* 0x04-0x07 */ + 0x8B, 0xC6, 0xE6, 0xD2, 0x8B, 0xC7, 0x8B, 0xC8, /* 0x08-0x0B */ + 0x8B, 0xC9, 0x8B, 0xCA, 0x8B, 0xCB, 0x8B, 0xCC, /* 0x0C-0x0F */ + 0x8B, 0xCD, 0x8B, 0xCE, 0x8B, 0xCF, 0x8B, 0xD0, /* 0x10-0x13 */ + 0x8B, 0xD1, 0x8B, 0xD2, 0xE6, 0xD4, 0xE6, 0xD3, /* 0x14-0x17 */ + 0x8B, 0xD3, 0x8B, 0xD4, 0x8B, 0xD5, 0x8B, 0xD6, /* 0x18-0x1B */ + 0x8B, 0xD7, 0x8B, 0xD8, 0x8B, 0xD9, 0x8B, 0xDA, /* 0x1C-0x1F */ + 0x8B, 0xDB, 0x8B, 0xDC, 0x8B, 0xDD, 0x8B, 0xDE, /* 0x20-0x23 */ + 0x8B, 0xDF, 0x8B, 0xE0, 0x8B, 0xE1, 0x8B, 0xE2, /* 0x24-0x27 */ + 0x8B, 0xE3, 0x8B, 0xE4, 0x8B, 0xE5, 0x8B, 0xE6, /* 0x28-0x2B */ + 0x8B, 0xE7, 0x8B, 0xE8, 0x8B, 0xE9, 0x8B, 0xEA, /* 0x2C-0x2F */ + 0x8B, 0xEB, 0x8B, 0xEC, 0xE6, 0xD5, 0x8B, 0xED, /* 0x30-0x33 */ + 0xD9, 0xF8, 0x8B, 0xEE, 0x8B, 0xEF, 0xE6, 0xD6, /* 0x34-0x37 */ + 0x8B, 0xF0, 0x8B, 0xF1, 0x8B, 0xF2, 0x8B, 0xF3, /* 0x38-0x3B */ + 0x8B, 0xF4, 0x8B, 0xF5, 0x8B, 0xF6, 0x8B, 0xF7, /* 0x3C-0x3F */ + 0xE6, 0xD7, 0x8B, 0xF8, 0x8B, 0xF9, 0x8B, 0xFA, /* 0x40-0x43 */ + 0x8B, 0xFB, 0x8B, 0xFC, 0x8B, 0xFD, 0x8B, 0xFE, /* 0x44-0x47 */ + 0x8C, 0x40, 0x8C, 0x41, 0x8C, 0x42, 0x8C, 0x43, /* 0x48-0x4B */ + 0x8C, 0x44, 0x8C, 0x45, 0x8C, 0x46, 0x8C, 0x47, /* 0x4C-0x4F */ + 0xD7, 0xD3, 0xE6, 0xDD, 0x8C, 0x48, 0xE6, 0xDE, /* 0x50-0x53 */ + 0xBF, 0xD7, 0xD4, 0xD0, 0x8C, 0x49, 0xD7, 0xD6, /* 0x54-0x57 */ + 0xB4, 0xE6, 0xCB, 0xEF, 0xE6, 0xDA, 0xD8, 0xC3, /* 0x58-0x5B */ + 0xD7, 0xCE, 0xD0, 0xA2, 0x8C, 0x4A, 0xC3, 0xCF, /* 0x5C-0x5F */ + 0x8C, 0x4B, 0x8C, 0x4C, 0xE6, 0xDF, 0xBC, 0xBE, /* 0x60-0x63 */ + 0xB9, 0xC2, 0xE6, 0xDB, 0xD1, 0xA7, 0x8C, 0x4D, /* 0x64-0x67 */ + 0x8C, 0x4E, 0xBA, 
0xA2, 0xC2, 0xCF, 0x8C, 0x4F, /* 0x68-0x6B */ + 0xD8, 0xAB, 0x8C, 0x50, 0x8C, 0x51, 0x8C, 0x52, /* 0x6C-0x6F */ + 0xCA, 0xEB, 0xE5, 0xEE, 0x8C, 0x53, 0xE6, 0xDC, /* 0x70-0x73 */ + 0x8C, 0x54, 0xB7, 0xF5, 0x8C, 0x55, 0x8C, 0x56, /* 0x74-0x77 */ + 0x8C, 0x57, 0x8C, 0x58, 0xC8, 0xE6, 0x8C, 0x59, /* 0x78-0x7B */ + 0x8C, 0x5A, 0xC4, 0xF5, 0x8C, 0x5B, 0x8C, 0x5C, /* 0x7C-0x7F */ + + 0xE5, 0xB2, 0xC4, 0xFE, 0x8C, 0x5D, 0xCB, 0xFC, /* 0x80-0x83 */ + 0xE5, 0xB3, 0xD5, 0xAC, 0x8C, 0x5E, 0xD3, 0xEE, /* 0x84-0x87 */ + 0xCA, 0xD8, 0xB0, 0xB2, 0x8C, 0x5F, 0xCB, 0xCE, /* 0x88-0x8B */ + 0xCD, 0xEA, 0x8C, 0x60, 0x8C, 0x61, 0xBA, 0xEA, /* 0x8C-0x8F */ + 0x8C, 0x62, 0x8C, 0x63, 0x8C, 0x64, 0xE5, 0xB5, /* 0x90-0x93 */ + 0x8C, 0x65, 0xE5, 0xB4, 0x8C, 0x66, 0xD7, 0xDA, /* 0x94-0x97 */ + 0xB9, 0xD9, 0xD6, 0xE6, 0xB6, 0xA8, 0xCD, 0xF0, /* 0x98-0x9B */ + 0xD2, 0xCB, 0xB1, 0xA6, 0xCA, 0xB5, 0x8C, 0x67, /* 0x9C-0x9F */ + 0xB3, 0xE8, 0xC9, 0xF3, 0xBF, 0xCD, 0xD0, 0xFB, /* 0xA0-0xA3 */ + 0xCA, 0xD2, 0xE5, 0xB6, 0xBB, 0xC2, 0x8C, 0x68, /* 0xA4-0xA7 */ + 0x8C, 0x69, 0x8C, 0x6A, 0xCF, 0xDC, 0xB9, 0xAC, /* 0xA8-0xAB */ + 0x8C, 0x6B, 0x8C, 0x6C, 0x8C, 0x6D, 0x8C, 0x6E, /* 0xAC-0xAF */ + 0xD4, 0xD7, 0x8C, 0x6F, 0x8C, 0x70, 0xBA, 0xA6, /* 0xB0-0xB3 */ + 0xD1, 0xE7, 0xCF, 0xFC, 0xBC, 0xD2, 0x8C, 0x71, /* 0xB4-0xB7 */ + 0xE5, 0xB7, 0xC8, 0xDD, 0x8C, 0x72, 0x8C, 0x73, /* 0xB8-0xBB */ + 0x8C, 0x74, 0xBF, 0xED, 0xB1, 0xF6, 0xCB, 0xDE, /* 0xBC-0xBF */ + 0x8C, 0x75, 0x8C, 0x76, 0xBC, 0xC5, 0x8C, 0x77, /* 0xC0-0xC3 */ + 0xBC, 0xC4, 0xD2, 0xFA, 0xC3, 0xDC, 0xBF, 0xDC, /* 0xC4-0xC7 */ + 0x8C, 0x78, 0x8C, 0x79, 0x8C, 0x7A, 0x8C, 0x7B, /* 0xC8-0xCB */ + 0xB8, 0xBB, 0x8C, 0x7C, 0x8C, 0x7D, 0x8C, 0x7E, /* 0xCC-0xCF */ + 0xC3, 0xC2, 0x8C, 0x80, 0xBA, 0xAE, 0xD4, 0xA2, /* 0xD0-0xD3 */ + 0x8C, 0x81, 0x8C, 0x82, 0x8C, 0x83, 0x8C, 0x84, /* 0xD4-0xD7 */ + 0x8C, 0x85, 0x8C, 0x86, 0x8C, 0x87, 0x8C, 0x88, /* 0xD8-0xDB */ + 0x8C, 0x89, 0xC7, 0xDE, 0xC4, 0xAF, 0xB2, 0xEC, /* 0xDC-0xDF */ + 0x8C, 0x8A, 0xB9, 0xD1, 0x8C, 0x8B, 0x8C, 0x8C, /* 0xE0-0xE3 */ + 0xE5, 0xBB, 0xC1, 0xC8, 0x8C, 0x8D, 0x8C, 0x8E, /* 0xE4-0xE7 */ + 0xD5, 0xAF, 0x8C, 0x8F, 0x8C, 0x90, 0x8C, 0x91, /* 0xE8-0xEB */ + 0x8C, 0x92, 0x8C, 0x93, 0xE5, 0xBC, 0x8C, 0x94, /* 0xEC-0xEF */ + 0xE5, 0xBE, 0x8C, 0x95, 0x8C, 0x96, 0x8C, 0x97, /* 0xF0-0xF3 */ + 0x8C, 0x98, 0x8C, 0x99, 0x8C, 0x9A, 0x8C, 0x9B, /* 0xF4-0xF7 */ + 0xB4, 0xE7, 0xB6, 0xD4, 0xCB, 0xC2, 0xD1, 0xB0, /* 0xF8-0xFB */ + 0xB5, 0xBC, 0x8C, 0x9C, 0x8C, 0x9D, 0xCA, 0xD9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5C[512] = { + 0x8C, 0x9E, 0xB7, 0xE2, 0x8C, 0x9F, 0x8C, 0xA0, /* 0x00-0x03 */ + 0xC9, 0xE4, 0x8C, 0xA1, 0xBD, 0xAB, 0x8C, 0xA2, /* 0x04-0x07 */ + 0x8C, 0xA3, 0xCE, 0xBE, 0xD7, 0xF0, 0x8C, 0xA4, /* 0x08-0x0B */ + 0x8C, 0xA5, 0x8C, 0xA6, 0x8C, 0xA7, 0xD0, 0xA1, /* 0x0C-0x0F */ + 0x8C, 0xA8, 0xC9, 0xD9, 0x8C, 0xA9, 0x8C, 0xAA, /* 0x10-0x13 */ + 0xB6, 0xFB, 0xE6, 0xD8, 0xBC, 0xE2, 0x8C, 0xAB, /* 0x14-0x17 */ + 0xB3, 0xBE, 0x8C, 0xAC, 0xC9, 0xD0, 0x8C, 0xAD, /* 0x18-0x1B */ + 0xE6, 0xD9, 0xB3, 0xA2, 0x8C, 0xAE, 0x8C, 0xAF, /* 0x1C-0x1F */ + 0x8C, 0xB0, 0x8C, 0xB1, 0xDE, 0xCC, 0x8C, 0xB2, /* 0x20-0x23 */ + 0xD3, 0xC8, 0xDE, 0xCD, 0x8C, 0xB3, 0xD2, 0xA2, /* 0x24-0x27 */ + 0x8C, 0xB4, 0x8C, 0xB5, 0x8C, 0xB6, 0x8C, 0xB7, /* 0x28-0x2B */ + 0xDE, 0xCE, 0x8C, 0xB8, 0x8C, 0xB9, 0x8C, 0xBA, /* 0x2C-0x2F */ + 0x8C, 0xBB, 0xBE, 0xCD, 0x8C, 0xBC, 0x8C, 0xBD, /* 0x30-0x33 */ + 0xDE, 0xCF, 0x8C, 0xBE, 0x8C, 0xBF, 0x8C, 0xC0, /* 0x34-0x37 */ + 0xCA, 0xAC, 0xD2, 0xFC, 0xB3, 0xDF, 0xE5, 0xEA, /* 0x38-0x3B */ + 0xC4, 0xE1, 0xBE, 
0xA1, 0xCE, 0xB2, 0xC4, 0xF2, /* 0x3C-0x3F */ + 0xBE, 0xD6, 0xC6, 0xA8, 0xB2, 0xE3, 0x8C, 0xC1, /* 0x40-0x43 */ + 0x8C, 0xC2, 0xBE, 0xD3, 0x8C, 0xC3, 0x8C, 0xC4, /* 0x44-0x47 */ + 0xC7, 0xFC, 0xCC, 0xEB, 0xBD, 0xEC, 0xCE, 0xDD, /* 0x48-0x4B */ + 0x8C, 0xC5, 0x8C, 0xC6, 0xCA, 0xBA, 0xC6, 0xC1, /* 0x4C-0x4F */ + 0xE5, 0xEC, 0xD0, 0xBC, 0x8C, 0xC7, 0x8C, 0xC8, /* 0x50-0x53 */ + 0x8C, 0xC9, 0xD5, 0xB9, 0x8C, 0xCA, 0x8C, 0xCB, /* 0x54-0x57 */ + 0x8C, 0xCC, 0xE5, 0xED, 0x8C, 0xCD, 0x8C, 0xCE, /* 0x58-0x5B */ + 0x8C, 0xCF, 0x8C, 0xD0, 0xCA, 0xF4, 0x8C, 0xD1, /* 0x5C-0x5F */ + 0xCD, 0xC0, 0xC2, 0xC5, 0x8C, 0xD2, 0xE5, 0xEF, /* 0x60-0x63 */ + 0x8C, 0xD3, 0xC2, 0xC4, 0xE5, 0xF0, 0x8C, 0xD4, /* 0x64-0x67 */ + 0x8C, 0xD5, 0x8C, 0xD6, 0x8C, 0xD7, 0x8C, 0xD8, /* 0x68-0x6B */ + 0x8C, 0xD9, 0x8C, 0xDA, 0xE5, 0xF8, 0xCD, 0xCD, /* 0x6C-0x6F */ + 0x8C, 0xDB, 0xC9, 0xBD, 0x8C, 0xDC, 0x8C, 0xDD, /* 0x70-0x73 */ + 0x8C, 0xDE, 0x8C, 0xDF, 0x8C, 0xE0, 0x8C, 0xE1, /* 0x74-0x77 */ + 0x8C, 0xE2, 0xD2, 0xD9, 0xE1, 0xA8, 0x8C, 0xE3, /* 0x78-0x7B */ + 0x8C, 0xE4, 0x8C, 0xE5, 0x8C, 0xE6, 0xD3, 0xEC, /* 0x7C-0x7F */ + + 0x8C, 0xE7, 0xCB, 0xEA, 0xC6, 0xF1, 0x8C, 0xE8, /* 0x80-0x83 */ + 0x8C, 0xE9, 0x8C, 0xEA, 0x8C, 0xEB, 0x8C, 0xEC, /* 0x84-0x87 */ + 0xE1, 0xAC, 0x8C, 0xED, 0x8C, 0xEE, 0x8C, 0xEF, /* 0x88-0x8B */ + 0xE1, 0xA7, 0xE1, 0xA9, 0x8C, 0xF0, 0x8C, 0xF1, /* 0x8C-0x8F */ + 0xE1, 0xAA, 0xE1, 0xAF, 0x8C, 0xF2, 0x8C, 0xF3, /* 0x90-0x93 */ + 0xB2, 0xED, 0x8C, 0xF4, 0xE1, 0xAB, 0xB8, 0xDA, /* 0x94-0x97 */ + 0xE1, 0xAD, 0xE1, 0xAE, 0xE1, 0xB0, 0xB5, 0xBA, /* 0x98-0x9B */ + 0xE1, 0xB1, 0x8C, 0xF5, 0x8C, 0xF6, 0x8C, 0xF7, /* 0x9C-0x9F */ + 0x8C, 0xF8, 0x8C, 0xF9, 0xE1, 0xB3, 0xE1, 0xB8, /* 0xA0-0xA3 */ + 0x8C, 0xFA, 0x8C, 0xFB, 0x8C, 0xFC, 0x8C, 0xFD, /* 0xA4-0xA7 */ + 0x8C, 0xFE, 0xD1, 0xD2, 0x8D, 0x40, 0xE1, 0xB6, /* 0xA8-0xAB */ + 0xE1, 0xB5, 0xC1, 0xEB, 0x8D, 0x41, 0x8D, 0x42, /* 0xAC-0xAF */ + 0x8D, 0x43, 0xE1, 0xB7, 0x8D, 0x44, 0xD4, 0xC0, /* 0xB0-0xB3 */ + 0x8D, 0x45, 0xE1, 0xB2, 0x8D, 0x46, 0xE1, 0xBA, /* 0xB4-0xB7 */ + 0xB0, 0xB6, 0x8D, 0x47, 0x8D, 0x48, 0x8D, 0x49, /* 0xB8-0xBB */ + 0x8D, 0x4A, 0xE1, 0xB4, 0x8D, 0x4B, 0xBF, 0xF9, /* 0xBC-0xBF */ + 0x8D, 0x4C, 0xE1, 0xB9, 0x8D, 0x4D, 0x8D, 0x4E, /* 0xC0-0xC3 */ + 0xE1, 0xBB, 0x8D, 0x4F, 0x8D, 0x50, 0x8D, 0x51, /* 0xC4-0xC7 */ + 0x8D, 0x52, 0x8D, 0x53, 0x8D, 0x54, 0xE1, 0xBE, /* 0xC8-0xCB */ + 0x8D, 0x55, 0x8D, 0x56, 0x8D, 0x57, 0x8D, 0x58, /* 0xCC-0xCF */ + 0x8D, 0x59, 0x8D, 0x5A, 0xE1, 0xBC, 0x8D, 0x5B, /* 0xD0-0xD3 */ + 0x8D, 0x5C, 0x8D, 0x5D, 0x8D, 0x5E, 0x8D, 0x5F, /* 0xD4-0xD7 */ + 0x8D, 0x60, 0xD6, 0xC5, 0x8D, 0x61, 0x8D, 0x62, /* 0xD8-0xDB */ + 0x8D, 0x63, 0x8D, 0x64, 0x8D, 0x65, 0x8D, 0x66, /* 0xDC-0xDF */ + 0x8D, 0x67, 0xCF, 0xBF, 0x8D, 0x68, 0x8D, 0x69, /* 0xE0-0xE3 */ + 0xE1, 0xBD, 0xE1, 0xBF, 0xC2, 0xCD, 0x8D, 0x6A, /* 0xE4-0xE7 */ + 0xB6, 0xEB, 0x8D, 0x6B, 0xD3, 0xF8, 0x8D, 0x6C, /* 0xE8-0xEB */ + 0x8D, 0x6D, 0xC7, 0xCD, 0x8D, 0x6E, 0x8D, 0x6F, /* 0xEC-0xEF */ + 0xB7, 0xE5, 0x8D, 0x70, 0x8D, 0x71, 0x8D, 0x72, /* 0xF0-0xF3 */ + 0x8D, 0x73, 0x8D, 0x74, 0x8D, 0x75, 0x8D, 0x76, /* 0xF4-0xF7 */ + 0x8D, 0x77, 0x8D, 0x78, 0x8D, 0x79, 0xBE, 0xFE, /* 0xF8-0xFB */ + 0x8D, 0x7A, 0x8D, 0x7B, 0x8D, 0x7C, 0x8D, 0x7D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0x8D, 0x7E, 0x8D, 0x80, 0xE1, 0xC0, 0xE1, 0xC1, /* 0x00-0x03 */ + 0x8D, 0x81, 0x8D, 0x82, 0xE1, 0xC7, 0xB3, 0xE7, /* 0x04-0x07 */ + 0x8D, 0x83, 0x8D, 0x84, 0x8D, 0x85, 0x8D, 0x86, /* 0x08-0x0B */ + 0x8D, 0x87, 0x8D, 0x88, 0xC6, 0xE9, 0x8D, 0x89, /* 0x0C-0x0F */ + 0x8D, 0x8A, 0x8D, 
0x8B, 0x8D, 0x8C, 0x8D, 0x8D, /* 0x10-0x13 */ + 0xB4, 0xDE, 0x8D, 0x8E, 0xD1, 0xC2, 0x8D, 0x8F, /* 0x14-0x17 */ + 0x8D, 0x90, 0x8D, 0x91, 0x8D, 0x92, 0xE1, 0xC8, /* 0x18-0x1B */ + 0x8D, 0x93, 0x8D, 0x94, 0xE1, 0xC6, 0x8D, 0x95, /* 0x1C-0x1F */ + 0x8D, 0x96, 0x8D, 0x97, 0x8D, 0x98, 0x8D, 0x99, /* 0x20-0x23 */ + 0xE1, 0xC5, 0x8D, 0x9A, 0xE1, 0xC3, 0xE1, 0xC2, /* 0x24-0x27 */ + 0x8D, 0x9B, 0xB1, 0xC0, 0x8D, 0x9C, 0x8D, 0x9D, /* 0x28-0x2B */ + 0x8D, 0x9E, 0xD5, 0xB8, 0xE1, 0xC4, 0x8D, 0x9F, /* 0x2C-0x2F */ + 0x8D, 0xA0, 0x8D, 0xA1, 0x8D, 0xA2, 0x8D, 0xA3, /* 0x30-0x33 */ + 0xE1, 0xCB, 0x8D, 0xA4, 0x8D, 0xA5, 0x8D, 0xA6, /* 0x34-0x37 */ + 0x8D, 0xA7, 0x8D, 0xA8, 0x8D, 0xA9, 0x8D, 0xAA, /* 0x38-0x3B */ + 0x8D, 0xAB, 0xE1, 0xCC, 0xE1, 0xCA, 0x8D, 0xAC, /* 0x3C-0x3F */ + 0x8D, 0xAD, 0x8D, 0xAE, 0x8D, 0xAF, 0x8D, 0xB0, /* 0x40-0x43 */ + 0x8D, 0xB1, 0x8D, 0xB2, 0x8D, 0xB3, 0xEF, 0xFA, /* 0x44-0x47 */ + 0x8D, 0xB4, 0x8D, 0xB5, 0xE1, 0xD3, 0xE1, 0xD2, /* 0x48-0x4B */ + 0xC7, 0xB6, 0x8D, 0xB6, 0x8D, 0xB7, 0x8D, 0xB8, /* 0x4C-0x4F */ + 0x8D, 0xB9, 0x8D, 0xBA, 0x8D, 0xBB, 0x8D, 0xBC, /* 0x50-0x53 */ + 0x8D, 0xBD, 0x8D, 0xBE, 0x8D, 0xBF, 0x8D, 0xC0, /* 0x54-0x57 */ + 0xE1, 0xC9, 0x8D, 0xC1, 0x8D, 0xC2, 0xE1, 0xCE, /* 0x58-0x5B */ + 0x8D, 0xC3, 0xE1, 0xD0, 0x8D, 0xC4, 0x8D, 0xC5, /* 0x5C-0x5F */ + 0x8D, 0xC6, 0x8D, 0xC7, 0x8D, 0xC8, 0x8D, 0xC9, /* 0x60-0x63 */ + 0x8D, 0xCA, 0x8D, 0xCB, 0x8D, 0xCC, 0x8D, 0xCD, /* 0x64-0x67 */ + 0x8D, 0xCE, 0xE1, 0xD4, 0x8D, 0xCF, 0xE1, 0xD1, /* 0x68-0x6B */ + 0xE1, 0xCD, 0x8D, 0xD0, 0x8D, 0xD1, 0xE1, 0xCF, /* 0x6C-0x6F */ + 0x8D, 0xD2, 0x8D, 0xD3, 0x8D, 0xD4, 0x8D, 0xD5, /* 0x70-0x73 */ + 0xE1, 0xD5, 0x8D, 0xD6, 0x8D, 0xD7, 0x8D, 0xD8, /* 0x74-0x77 */ + 0x8D, 0xD9, 0x8D, 0xDA, 0x8D, 0xDB, 0x8D, 0xDC, /* 0x78-0x7B */ + 0x8D, 0xDD, 0x8D, 0xDE, 0x8D, 0xDF, 0x8D, 0xE0, /* 0x7C-0x7F */ + + 0x8D, 0xE1, 0x8D, 0xE2, 0xE1, 0xD6, 0x8D, 0xE3, /* 0x80-0x83 */ + 0x8D, 0xE4, 0x8D, 0xE5, 0x8D, 0xE6, 0x8D, 0xE7, /* 0x84-0x87 */ + 0x8D, 0xE8, 0x8D, 0xE9, 0x8D, 0xEA, 0x8D, 0xEB, /* 0x88-0x8B */ + 0x8D, 0xEC, 0x8D, 0xED, 0x8D, 0xEE, 0x8D, 0xEF, /* 0x8C-0x8F */ + 0x8D, 0xF0, 0x8D, 0xF1, 0x8D, 0xF2, 0x8D, 0xF3, /* 0x90-0x93 */ + 0x8D, 0xF4, 0x8D, 0xF5, 0x8D, 0xF6, 0x8D, 0xF7, /* 0x94-0x97 */ + 0x8D, 0xF8, 0xE1, 0xD7, 0x8D, 0xF9, 0x8D, 0xFA, /* 0x98-0x9B */ + 0x8D, 0xFB, 0xE1, 0xD8, 0x8D, 0xFC, 0x8D, 0xFD, /* 0x9C-0x9F */ + 0x8D, 0xFE, 0x8E, 0x40, 0x8E, 0x41, 0x8E, 0x42, /* 0xA0-0xA3 */ + 0x8E, 0x43, 0x8E, 0x44, 0x8E, 0x45, 0x8E, 0x46, /* 0xA4-0xA7 */ + 0x8E, 0x47, 0x8E, 0x48, 0x8E, 0x49, 0x8E, 0x4A, /* 0xA8-0xAB */ + 0x8E, 0x4B, 0x8E, 0x4C, 0x8E, 0x4D, 0x8E, 0x4E, /* 0xAC-0xAF */ + 0x8E, 0x4F, 0x8E, 0x50, 0x8E, 0x51, 0x8E, 0x52, /* 0xB0-0xB3 */ + 0x8E, 0x53, 0x8E, 0x54, 0x8E, 0x55, 0xE1, 0xDA, /* 0xB4-0xB7 */ + 0x8E, 0x56, 0x8E, 0x57, 0x8E, 0x58, 0x8E, 0x59, /* 0xB8-0xBB */ + 0x8E, 0x5A, 0x8E, 0x5B, 0x8E, 0x5C, 0x8E, 0x5D, /* 0xBC-0xBF */ + 0x8E, 0x5E, 0x8E, 0x5F, 0x8E, 0x60, 0x8E, 0x61, /* 0xC0-0xC3 */ + 0x8E, 0x62, 0xE1, 0xDB, 0x8E, 0x63, 0x8E, 0x64, /* 0xC4-0xC7 */ + 0x8E, 0x65, 0x8E, 0x66, 0x8E, 0x67, 0x8E, 0x68, /* 0xC8-0xCB */ + 0x8E, 0x69, 0xCE, 0xA1, 0x8E, 0x6A, 0x8E, 0x6B, /* 0xCC-0xCF */ + 0x8E, 0x6C, 0x8E, 0x6D, 0x8E, 0x6E, 0x8E, 0x6F, /* 0xD0-0xD3 */ + 0x8E, 0x70, 0x8E, 0x71, 0x8E, 0x72, 0x8E, 0x73, /* 0xD4-0xD7 */ + 0x8E, 0x74, 0x8E, 0x75, 0x8E, 0x76, 0xE7, 0xDD, /* 0xD8-0xDB */ + 0x8E, 0x77, 0xB4, 0xA8, 0xD6, 0xDD, 0x8E, 0x78, /* 0xDC-0xDF */ + 0x8E, 0x79, 0xD1, 0xB2, 0xB3, 0xB2, 0x8E, 0x7A, /* 0xE0-0xE3 */ + 0x8E, 0x7B, 0xB9, 0xA4, 0xD7, 0xF3, 0xC7, 0xC9, /* 0xE4-0xE7 */ + 0xBE, 
0xDE, 0xB9, 0xAE, 0x8E, 0x7C, 0xCE, 0xD7, /* 0xE8-0xEB */ + 0x8E, 0x7D, 0x8E, 0x7E, 0xB2, 0xEE, 0xDB, 0xCF, /* 0xEC-0xEF */ + 0x8E, 0x80, 0xBC, 0xBA, 0xD2, 0xD1, 0xCB, 0xC8, /* 0xF0-0xF3 */ + 0xB0, 0xCD, 0x8E, 0x81, 0x8E, 0x82, 0xCF, 0xEF, /* 0xF4-0xF7 */ + 0x8E, 0x83, 0x8E, 0x84, 0x8E, 0x85, 0x8E, 0x86, /* 0xF8-0xFB */ + 0x8E, 0x87, 0xD9, 0xE3, 0xBD, 0xED, 0x8E, 0x88, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x8E, 0x89, 0xB1, 0xD2, 0xCA, 0xD0, 0xB2, 0xBC, /* 0x00-0x03 */ + 0x8E, 0x8A, 0xCB, 0xA7, 0xB7, 0xAB, 0x8E, 0x8B, /* 0x04-0x07 */ + 0xCA, 0xA6, 0x8E, 0x8C, 0x8E, 0x8D, 0x8E, 0x8E, /* 0x08-0x0B */ + 0xCF, 0xA3, 0x8E, 0x8F, 0x8E, 0x90, 0xE0, 0xF8, /* 0x0C-0x0F */ + 0xD5, 0xCA, 0xE0, 0xFB, 0x8E, 0x91, 0x8E, 0x92, /* 0x10-0x13 */ + 0xE0, 0xFA, 0xC5, 0xC1, 0xCC, 0xFB, 0x8E, 0x93, /* 0x14-0x17 */ + 0xC1, 0xB1, 0xE0, 0xF9, 0xD6, 0xE3, 0xB2, 0xAF, /* 0x18-0x1B */ + 0xD6, 0xC4, 0xB5, 0xDB, 0x8E, 0x94, 0x8E, 0x95, /* 0x1C-0x1F */ + 0x8E, 0x96, 0x8E, 0x97, 0x8E, 0x98, 0x8E, 0x99, /* 0x20-0x23 */ + 0x8E, 0x9A, 0x8E, 0x9B, 0xB4, 0xF8, 0xD6, 0xA1, /* 0x24-0x27 */ + 0x8E, 0x9C, 0x8E, 0x9D, 0x8E, 0x9E, 0x8E, 0x9F, /* 0x28-0x2B */ + 0x8E, 0xA0, 0xCF, 0xAF, 0xB0, 0xEF, 0x8E, 0xA1, /* 0x2C-0x2F */ + 0x8E, 0xA2, 0xE0, 0xFC, 0x8E, 0xA3, 0x8E, 0xA4, /* 0x30-0x33 */ + 0x8E, 0xA5, 0x8E, 0xA6, 0x8E, 0xA7, 0xE1, 0xA1, /* 0x34-0x37 */ + 0xB3, 0xA3, 0x8E, 0xA8, 0x8E, 0xA9, 0xE0, 0xFD, /* 0x38-0x3B */ + 0xE0, 0xFE, 0xC3, 0xB1, 0x8E, 0xAA, 0x8E, 0xAB, /* 0x3C-0x3F */ + 0x8E, 0xAC, 0x8E, 0xAD, 0xC3, 0xDD, 0x8E, 0xAE, /* 0x40-0x43 */ + 0xE1, 0xA2, 0xB7, 0xF9, 0x8E, 0xAF, 0x8E, 0xB0, /* 0x44-0x47 */ + 0x8E, 0xB1, 0x8E, 0xB2, 0x8E, 0xB3, 0x8E, 0xB4, /* 0x48-0x4B */ + 0xBB, 0xCF, 0x8E, 0xB5, 0x8E, 0xB6, 0x8E, 0xB7, /* 0x4C-0x4F */ + 0x8E, 0xB8, 0x8E, 0xB9, 0x8E, 0xBA, 0x8E, 0xBB, /* 0x50-0x53 */ + 0xE1, 0xA3, 0xC4, 0xBB, 0x8E, 0xBC, 0x8E, 0xBD, /* 0x54-0x57 */ + 0x8E, 0xBE, 0x8E, 0xBF, 0x8E, 0xC0, 0xE1, 0xA4, /* 0x58-0x5B */ + 0x8E, 0xC1, 0x8E, 0xC2, 0xE1, 0xA5, 0x8E, 0xC3, /* 0x5C-0x5F */ + 0x8E, 0xC4, 0xE1, 0xA6, 0xB4, 0xB1, 0x8E, 0xC5, /* 0x60-0x63 */ + 0x8E, 0xC6, 0x8E, 0xC7, 0x8E, 0xC8, 0x8E, 0xC9, /* 0x64-0x67 */ + 0x8E, 0xCA, 0x8E, 0xCB, 0x8E, 0xCC, 0x8E, 0xCD, /* 0x68-0x6B */ + 0x8E, 0xCE, 0x8E, 0xCF, 0x8E, 0xD0, 0x8E, 0xD1, /* 0x6C-0x6F */ + 0x8E, 0xD2, 0x8E, 0xD3, 0xB8, 0xC9, 0xC6, 0xBD, /* 0x70-0x73 */ + 0xC4, 0xEA, 0x8E, 0xD4, 0xB2, 0xA2, 0x8E, 0xD5, /* 0x74-0x77 */ + 0xD0, 0xD2, 0x8E, 0xD6, 0xE7, 0xDB, 0xBB, 0xC3, /* 0x78-0x7B */ + 0xD3, 0xD7, 0xD3, 0xC4, 0x8E, 0xD7, 0xB9, 0xE3, /* 0x7C-0x7F */ + + 0xE2, 0xCF, 0x8E, 0xD8, 0x8E, 0xD9, 0x8E, 0xDA, /* 0x80-0x83 */ + 0xD7, 0xAF, 0x8E, 0xDB, 0xC7, 0xEC, 0xB1, 0xD3, /* 0x84-0x87 */ + 0x8E, 0xDC, 0x8E, 0xDD, 0xB4, 0xB2, 0xE2, 0xD1, /* 0x88-0x8B */ + 0x8E, 0xDE, 0x8E, 0xDF, 0x8E, 0xE0, 0xD0, 0xF2, /* 0x8C-0x8F */ + 0xC2, 0xAE, 0xE2, 0xD0, 0x8E, 0xE1, 0xBF, 0xE2, /* 0x90-0x93 */ + 0xD3, 0xA6, 0xB5, 0xD7, 0xE2, 0xD2, 0xB5, 0xEA, /* 0x94-0x97 */ + 0x8E, 0xE2, 0xC3, 0xED, 0xB8, 0xFD, 0x8E, 0xE3, /* 0x98-0x9B */ + 0xB8, 0xAE, 0x8E, 0xE4, 0xC5, 0xD3, 0xB7, 0xCF, /* 0x9C-0x9F */ + 0xE2, 0xD4, 0x8E, 0xE5, 0x8E, 0xE6, 0x8E, 0xE7, /* 0xA0-0xA3 */ + 0x8E, 0xE8, 0xE2, 0xD3, 0xB6, 0xC8, 0xD7, 0xF9, /* 0xA4-0xA7 */ + 0x8E, 0xE9, 0x8E, 0xEA, 0x8E, 0xEB, 0x8E, 0xEC, /* 0xA8-0xAB */ + 0x8E, 0xED, 0xCD, 0xA5, 0x8E, 0xEE, 0x8E, 0xEF, /* 0xAC-0xAF */ + 0x8E, 0xF0, 0x8E, 0xF1, 0x8E, 0xF2, 0xE2, 0xD8, /* 0xB0-0xB3 */ + 0x8E, 0xF3, 0xE2, 0xD6, 0xCA, 0xFC, 0xBF, 0xB5, /* 0xB4-0xB7 */ + 0xD3, 0xB9, 0xE2, 0xD5, 0x8E, 0xF4, 0x8E, 0xF5, /* 0xB8-0xBB */ + 0x8E, 
0xF6, 0x8E, 0xF7, 0xE2, 0xD7, 0x8E, 0xF8, /* 0xBC-0xBF */ + 0x8E, 0xF9, 0x8E, 0xFA, 0x8E, 0xFB, 0x8E, 0xFC, /* 0xC0-0xC3 */ + 0x8E, 0xFD, 0x8E, 0xFE, 0x8F, 0x40, 0x8F, 0x41, /* 0xC4-0xC7 */ + 0x8F, 0x42, 0xC1, 0xAE, 0xC0, 0xC8, 0x8F, 0x43, /* 0xC8-0xCB */ + 0x8F, 0x44, 0x8F, 0x45, 0x8F, 0x46, 0x8F, 0x47, /* 0xCC-0xCF */ + 0x8F, 0x48, 0xE2, 0xDB, 0xE2, 0xDA, 0xC0, 0xAA, /* 0xD0-0xD3 */ + 0x8F, 0x49, 0x8F, 0x4A, 0xC1, 0xCE, 0x8F, 0x4B, /* 0xD4-0xD7 */ + 0x8F, 0x4C, 0x8F, 0x4D, 0x8F, 0x4E, 0xE2, 0xDC, /* 0xD8-0xDB */ + 0x8F, 0x4F, 0x8F, 0x50, 0x8F, 0x51, 0x8F, 0x52, /* 0xDC-0xDF */ + 0x8F, 0x53, 0x8F, 0x54, 0x8F, 0x55, 0x8F, 0x56, /* 0xE0-0xE3 */ + 0x8F, 0x57, 0x8F, 0x58, 0x8F, 0x59, 0x8F, 0x5A, /* 0xE4-0xE7 */ + 0xE2, 0xDD, 0x8F, 0x5B, 0xE2, 0xDE, 0x8F, 0x5C, /* 0xE8-0xEB */ + 0x8F, 0x5D, 0x8F, 0x5E, 0x8F, 0x5F, 0x8F, 0x60, /* 0xEC-0xEF */ + 0x8F, 0x61, 0x8F, 0x62, 0x8F, 0x63, 0x8F, 0x64, /* 0xF0-0xF3 */ + 0xDB, 0xC8, 0x8F, 0x65, 0xD1, 0xD3, 0xCD, 0xA2, /* 0xF4-0xF7 */ + 0x8F, 0x66, 0x8F, 0x67, 0xBD, 0xA8, 0x8F, 0x68, /* 0xF8-0xFB */ + 0x8F, 0x69, 0x8F, 0x6A, 0xDE, 0xC3, 0xD8, 0xA5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5F[512] = { + 0xBF, 0xAA, 0xDB, 0xCD, 0xD2, 0xEC, 0xC6, 0xFA, /* 0x00-0x03 */ + 0xC5, 0xAA, 0x8F, 0x6B, 0x8F, 0x6C, 0x8F, 0x6D, /* 0x04-0x07 */ + 0xDE, 0xC4, 0x8F, 0x6E, 0xB1, 0xD7, 0xDF, 0xAE, /* 0x08-0x0B */ + 0x8F, 0x6F, 0x8F, 0x70, 0x8F, 0x71, 0xCA, 0xBD, /* 0x0C-0x0F */ + 0x8F, 0x72, 0xDF, 0xB1, 0x8F, 0x73, 0xB9, 0xAD, /* 0x10-0x13 */ + 0x8F, 0x74, 0xD2, 0xFD, 0x8F, 0x75, 0xB8, 0xA5, /* 0x14-0x17 */ + 0xBA, 0xEB, 0x8F, 0x76, 0x8F, 0x77, 0xB3, 0xDA, /* 0x18-0x1B */ + 0x8F, 0x78, 0x8F, 0x79, 0x8F, 0x7A, 0xB5, 0xDC, /* 0x1C-0x1F */ + 0xD5, 0xC5, 0x8F, 0x7B, 0x8F, 0x7C, 0x8F, 0x7D, /* 0x20-0x23 */ + 0x8F, 0x7E, 0xC3, 0xD6, 0xCF, 0xD2, 0xBB, 0xA1, /* 0x24-0x27 */ + 0x8F, 0x80, 0xE5, 0xF3, 0xE5, 0xF2, 0x8F, 0x81, /* 0x28-0x2B */ + 0x8F, 0x82, 0xE5, 0xF4, 0x8F, 0x83, 0xCD, 0xE4, /* 0x2C-0x2F */ + 0x8F, 0x84, 0xC8, 0xF5, 0x8F, 0x85, 0x8F, 0x86, /* 0x30-0x33 */ + 0x8F, 0x87, 0x8F, 0x88, 0x8F, 0x89, 0x8F, 0x8A, /* 0x34-0x37 */ + 0x8F, 0x8B, 0xB5, 0xAF, 0xC7, 0xBF, 0x8F, 0x8C, /* 0x38-0x3B */ + 0xE5, 0xF6, 0x8F, 0x8D, 0x8F, 0x8E, 0x8F, 0x8F, /* 0x3C-0x3F */ + 0xEC, 0xB0, 0x8F, 0x90, 0x8F, 0x91, 0x8F, 0x92, /* 0x40-0x43 */ + 0x8F, 0x93, 0x8F, 0x94, 0x8F, 0x95, 0x8F, 0x96, /* 0x44-0x47 */ + 0x8F, 0x97, 0x8F, 0x98, 0x8F, 0x99, 0x8F, 0x9A, /* 0x48-0x4B */ + 0x8F, 0x9B, 0x8F, 0x9C, 0x8F, 0x9D, 0x8F, 0x9E, /* 0x4C-0x4F */ + 0xE5, 0xE6, 0x8F, 0x9F, 0xB9, 0xE9, 0xB5, 0xB1, /* 0x50-0x53 */ + 0x8F, 0xA0, 0xC2, 0xBC, 0xE5, 0xE8, 0xE5, 0xE7, /* 0x54-0x57 */ + 0xE5, 0xE9, 0x8F, 0xA1, 0x8F, 0xA2, 0x8F, 0xA3, /* 0x58-0x5B */ + 0x8F, 0xA4, 0xD2, 0xCD, 0x8F, 0xA5, 0x8F, 0xA6, /* 0x5C-0x5F */ + 0x8F, 0xA7, 0xE1, 0xEA, 0xD0, 0xCE, 0x8F, 0xA8, /* 0x60-0x63 */ + 0xCD, 0xAE, 0x8F, 0xA9, 0xD1, 0xE5, 0x8F, 0xAA, /* 0x64-0x67 */ + 0x8F, 0xAB, 0xB2, 0xCA, 0xB1, 0xEB, 0x8F, 0xAC, /* 0x68-0x6B */ + 0xB1, 0xF2, 0xC5, 0xED, 0x8F, 0xAD, 0x8F, 0xAE, /* 0x6C-0x6F */ + 0xD5, 0xC3, 0xD3, 0xB0, 0x8F, 0xAF, 0xE1, 0xDC, /* 0x70-0x73 */ + 0x8F, 0xB0, 0x8F, 0xB1, 0x8F, 0xB2, 0xE1, 0xDD, /* 0x74-0x77 */ + 0x8F, 0xB3, 0xD2, 0xDB, 0x8F, 0xB4, 0xB3, 0xB9, /* 0x78-0x7B */ + 0xB1, 0xCB, 0x8F, 0xB5, 0x8F, 0xB6, 0x8F, 0xB7, /* 0x7C-0x7F */ + + 0xCD, 0xF9, 0xD5, 0xF7, 0xE1, 0xDE, 0x8F, 0xB8, /* 0x80-0x83 */ + 0xBE, 0xB6, 0xB4, 0xFD, 0x8F, 0xB9, 0xE1, 0xDF, /* 0x84-0x87 */ + 0xBA, 0xDC, 0xE1, 0xE0, 0xBB, 0xB2, 0xC2, 0xC9, /* 0x88-0x8B */ + 0xE1, 0xE1, 0x8F, 0xBA, 0x8F, 0xBB, 0x8F, 0xBC, /* 0x8C-0x8F */ + 0xD0, 
0xEC, 0x8F, 0xBD, 0xCD, 0xBD, 0x8F, 0xBE, /* 0x90-0x93 */ + 0x8F, 0xBF, 0xE1, 0xE2, 0x8F, 0xC0, 0xB5, 0xC3, /* 0x94-0x97 */ + 0xC5, 0xC7, 0xE1, 0xE3, 0x8F, 0xC1, 0x8F, 0xC2, /* 0x98-0x9B */ + 0xE1, 0xE4, 0x8F, 0xC3, 0x8F, 0xC4, 0x8F, 0xC5, /* 0x9C-0x9F */ + 0x8F, 0xC6, 0xD3, 0xF9, 0x8F, 0xC7, 0x8F, 0xC8, /* 0xA0-0xA3 */ + 0x8F, 0xC9, 0x8F, 0xCA, 0x8F, 0xCB, 0x8F, 0xCC, /* 0xA4-0xA7 */ + 0xE1, 0xE5, 0x8F, 0xCD, 0xD1, 0xAD, 0x8F, 0xCE, /* 0xA8-0xAB */ + 0x8F, 0xCF, 0xE1, 0xE6, 0xCE, 0xA2, 0x8F, 0xD0, /* 0xAC-0xAF */ + 0x8F, 0xD1, 0x8F, 0xD2, 0x8F, 0xD3, 0x8F, 0xD4, /* 0xB0-0xB3 */ + 0x8F, 0xD5, 0xE1, 0xE7, 0x8F, 0xD6, 0xB5, 0xC2, /* 0xB4-0xB7 */ + 0x8F, 0xD7, 0x8F, 0xD8, 0x8F, 0xD9, 0x8F, 0xDA, /* 0xB8-0xBB */ + 0xE1, 0xE8, 0xBB, 0xD5, 0x8F, 0xDB, 0x8F, 0xDC, /* 0xBC-0xBF */ + 0x8F, 0xDD, 0x8F, 0xDE, 0x8F, 0xDF, 0xD0, 0xC4, /* 0xC0-0xC3 */ + 0xE2, 0xE0, 0xB1, 0xD8, 0xD2, 0xE4, 0x8F, 0xE0, /* 0xC4-0xC7 */ + 0x8F, 0xE1, 0xE2, 0xE1, 0x8F, 0xE2, 0x8F, 0xE3, /* 0xC8-0xCB */ + 0xBC, 0xC9, 0xC8, 0xCC, 0x8F, 0xE4, 0xE2, 0xE3, /* 0xCC-0xCF */ + 0xEC, 0xFE, 0xEC, 0xFD, 0xDF, 0xAF, 0x8F, 0xE5, /* 0xD0-0xD3 */ + 0x8F, 0xE6, 0x8F, 0xE7, 0xE2, 0xE2, 0xD6, 0xBE, /* 0xD4-0xD7 */ + 0xCD, 0xFC, 0xC3, 0xA6, 0x8F, 0xE8, 0x8F, 0xE9, /* 0xD8-0xDB */ + 0x8F, 0xEA, 0xE3, 0xC3, 0x8F, 0xEB, 0x8F, 0xEC, /* 0xDC-0xDF */ + 0xD6, 0xD2, 0xE2, 0xE7, 0x8F, 0xED, 0x8F, 0xEE, /* 0xE0-0xE3 */ + 0xE2, 0xE8, 0x8F, 0xEF, 0x8F, 0xF0, 0xD3, 0xC7, /* 0xE4-0xE7 */ + 0x8F, 0xF1, 0x8F, 0xF2, 0xE2, 0xEC, 0xBF, 0xEC, /* 0xE8-0xEB */ + 0x8F, 0xF3, 0xE2, 0xED, 0xE2, 0xE5, 0x8F, 0xF4, /* 0xEC-0xEF */ + 0x8F, 0xF5, 0xB3, 0xC0, 0x8F, 0xF6, 0x8F, 0xF7, /* 0xF0-0xF3 */ + 0x8F, 0xF8, 0xC4, 0xEE, 0x8F, 0xF9, 0x8F, 0xFA, /* 0xF4-0xF7 */ + 0xE2, 0xEE, 0x8F, 0xFB, 0x8F, 0xFC, 0xD0, 0xC3, /* 0xF8-0xFB */ + 0x8F, 0xFD, 0xBA, 0xF6, 0xE2, 0xE9, 0xB7, 0xDE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0xBB, 0xB3, 0xCC, 0xAC, 0xCB, 0xCB, 0xE2, 0xE4, /* 0x00-0x03 */ + 0xE2, 0xE6, 0xE2, 0xEA, 0xE2, 0xEB, 0x8F, 0xFE, /* 0x04-0x07 */ + 0x90, 0x40, 0x90, 0x41, 0xE2, 0xF7, 0x90, 0x42, /* 0x08-0x0B */ + 0x90, 0x43, 0xE2, 0xF4, 0xD4, 0xF5, 0xE2, 0xF3, /* 0x0C-0x0F */ + 0x90, 0x44, 0x90, 0x45, 0xC5, 0xAD, 0x90, 0x46, /* 0x10-0x13 */ + 0xD5, 0xFA, 0xC5, 0xC2, 0xB2, 0xC0, 0x90, 0x47, /* 0x14-0x17 */ + 0x90, 0x48, 0xE2, 0xEF, 0x90, 0x49, 0xE2, 0xF2, /* 0x18-0x1B */ + 0xC1, 0xAF, 0xCB, 0xBC, 0x90, 0x4A, 0x90, 0x4B, /* 0x1C-0x1F */ + 0xB5, 0xA1, 0xE2, 0xF9, 0x90, 0x4C, 0x90, 0x4D, /* 0x20-0x23 */ + 0x90, 0x4E, 0xBC, 0xB1, 0xE2, 0xF1, 0xD0, 0xD4, /* 0x24-0x27 */ + 0xD4, 0xB9, 0xE2, 0xF5, 0xB9, 0xD6, 0xE2, 0xF6, /* 0x28-0x2B */ + 0x90, 0x4F, 0x90, 0x50, 0x90, 0x51, 0xC7, 0xD3, /* 0x2C-0x2F */ + 0x90, 0x52, 0x90, 0x53, 0x90, 0x54, 0x90, 0x55, /* 0x30-0x33 */ + 0x90, 0x56, 0xE2, 0xF0, 0x90, 0x57, 0x90, 0x58, /* 0x34-0x37 */ + 0x90, 0x59, 0x90, 0x5A, 0x90, 0x5B, 0xD7, 0xDC, /* 0x38-0x3B */ + 0xED, 0xA1, 0x90, 0x5C, 0x90, 0x5D, 0xE2, 0xF8, /* 0x3C-0x3F */ + 0x90, 0x5E, 0xED, 0xA5, 0xE2, 0xFE, 0xCA, 0xD1, /* 0x40-0x43 */ + 0x90, 0x5F, 0x90, 0x60, 0x90, 0x61, 0x90, 0x62, /* 0x44-0x47 */ + 0x90, 0x63, 0x90, 0x64, 0x90, 0x65, 0xC1, 0xB5, /* 0x48-0x4B */ + 0x90, 0x66, 0xBB, 0xD0, 0x90, 0x67, 0x90, 0x68, /* 0x4C-0x4F */ + 0xBF, 0xD6, 0x90, 0x69, 0xBA, 0xE3, 0x90, 0x6A, /* 0x50-0x53 */ + 0x90, 0x6B, 0xCB, 0xA1, 0x90, 0x6C, 0x90, 0x6D, /* 0x54-0x57 */ + 0x90, 0x6E, 0xED, 0xA6, 0xED, 0xA3, 0x90, 0x6F, /* 0x58-0x5B */ + 0x90, 0x70, 0xED, 0xA2, 0x90, 0x71, 0x90, 0x72, /* 0x5C-0x5F */ + 0x90, 0x73, 0x90, 0x74, 0xBB, 0xD6, 0xED, 0xA7, /* 0x60-0x63 */ + 0xD0, 0xF4, 
0x90, 0x75, 0x90, 0x76, 0xED, 0xA4, /* 0x64-0x67 */ + 0xBA, 0xDE, 0xB6, 0xF7, 0xE3, 0xA1, 0xB6, 0xB2, /* 0x68-0x6B */ + 0xCC, 0xF1, 0xB9, 0xA7, 0x90, 0x77, 0xCF, 0xA2, /* 0x6C-0x6F */ + 0xC7, 0xA1, 0x90, 0x78, 0x90, 0x79, 0xBF, 0xD2, /* 0x70-0x73 */ + 0x90, 0x7A, 0x90, 0x7B, 0xB6, 0xF1, 0x90, 0x7C, /* 0x74-0x77 */ + 0xE2, 0xFA, 0xE2, 0xFB, 0xE2, 0xFD, 0xE2, 0xFC, /* 0x78-0x7B */ + 0xC4, 0xD5, 0xE3, 0xA2, 0x90, 0x7D, 0xD3, 0xC1, /* 0x7C-0x7F */ + + 0x90, 0x7E, 0x90, 0x80, 0x90, 0x81, 0xE3, 0xA7, /* 0x80-0x83 */ + 0xC7, 0xC4, 0x90, 0x82, 0x90, 0x83, 0x90, 0x84, /* 0x84-0x87 */ + 0x90, 0x85, 0xCF, 0xA4, 0x90, 0x86, 0x90, 0x87, /* 0x88-0x8B */ + 0xE3, 0xA9, 0xBA, 0xB7, 0x90, 0x88, 0x90, 0x89, /* 0x8C-0x8F */ + 0x90, 0x8A, 0x90, 0x8B, 0xE3, 0xA8, 0x90, 0x8C, /* 0x90-0x93 */ + 0xBB, 0xDA, 0x90, 0x8D, 0xE3, 0xA3, 0x90, 0x8E, /* 0x94-0x97 */ + 0x90, 0x8F, 0x90, 0x90, 0xE3, 0xA4, 0xE3, 0xAA, /* 0x98-0x9B */ + 0x90, 0x91, 0xE3, 0xA6, 0x90, 0x92, 0xCE, 0xF2, /* 0x9C-0x9F */ + 0xD3, 0xC6, 0x90, 0x93, 0x90, 0x94, 0xBB, 0xBC, /* 0xA0-0xA3 */ + 0x90, 0x95, 0x90, 0x96, 0xD4, 0xC3, 0x90, 0x97, /* 0xA4-0xA7 */ + 0xC4, 0xFA, 0x90, 0x98, 0x90, 0x99, 0xED, 0xA8, /* 0xA8-0xAB */ + 0xD0, 0xFC, 0xE3, 0xA5, 0x90, 0x9A, 0xC3, 0xF5, /* 0xAC-0xAF */ + 0x90, 0x9B, 0xE3, 0xAD, 0xB1, 0xAF, 0x90, 0x9C, /* 0xB0-0xB3 */ + 0xE3, 0xB2, 0x90, 0x9D, 0x90, 0x9E, 0x90, 0x9F, /* 0xB4-0xB7 */ + 0xBC, 0xC2, 0x90, 0xA0, 0x90, 0xA1, 0xE3, 0xAC, /* 0xB8-0xBB */ + 0xB5, 0xBF, 0x90, 0xA2, 0x90, 0xA3, 0x90, 0xA4, /* 0xBC-0xBF */ + 0x90, 0xA5, 0x90, 0xA6, 0x90, 0xA7, 0x90, 0xA8, /* 0xC0-0xC3 */ + 0x90, 0xA9, 0xC7, 0xE9, 0xE3, 0xB0, 0x90, 0xAA, /* 0xC4-0xC7 */ + 0x90, 0xAB, 0x90, 0xAC, 0xBE, 0xAA, 0xCD, 0xEF, /* 0xC8-0xCB */ + 0x90, 0xAD, 0x90, 0xAE, 0x90, 0xAF, 0x90, 0xB0, /* 0xCC-0xCF */ + 0x90, 0xB1, 0xBB, 0xF3, 0x90, 0xB2, 0x90, 0xB3, /* 0xD0-0xD3 */ + 0x90, 0xB4, 0xCC, 0xE8, 0x90, 0xB5, 0x90, 0xB6, /* 0xD4-0xD7 */ + 0xE3, 0xAF, 0x90, 0xB7, 0xE3, 0xB1, 0x90, 0xB8, /* 0xD8-0xDB */ + 0xCF, 0xA7, 0xE3, 0xAE, 0x90, 0xB9, 0xCE, 0xA9, /* 0xDC-0xDF */ + 0xBB, 0xDD, 0x90, 0xBA, 0x90, 0xBB, 0x90, 0xBC, /* 0xE0-0xE3 */ + 0x90, 0xBD, 0x90, 0xBE, 0xB5, 0xEB, 0xBE, 0xE5, /* 0xE4-0xE7 */ + 0xB2, 0xD2, 0xB3, 0xCD, 0x90, 0xBF, 0xB1, 0xB9, /* 0xE8-0xEB */ + 0xE3, 0xAB, 0xB2, 0xD1, 0xB5, 0xAC, 0xB9, 0xDF, /* 0xEC-0xEF */ + 0xB6, 0xE8, 0x90, 0xC0, 0x90, 0xC1, 0xCF, 0xEB, /* 0xF0-0xF3 */ + 0xE3, 0xB7, 0x90, 0xC2, 0xBB, 0xCC, 0x90, 0xC3, /* 0xF4-0xF7 */ + 0x90, 0xC4, 0xC8, 0xC7, 0xD0, 0xCA, 0x90, 0xC5, /* 0xF8-0xFB */ + 0x90, 0xC6, 0x90, 0xC7, 0x90, 0xC8, 0x90, 0xC9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_61[512] = { + 0xE3, 0xB8, 0xB3, 0xEE, 0x90, 0xCA, 0x90, 0xCB, /* 0x00-0x03 */ + 0x90, 0xCC, 0x90, 0xCD, 0xED, 0xA9, 0x90, 0xCE, /* 0x04-0x07 */ + 0xD3, 0xFA, 0xD3, 0xE4, 0x90, 0xCF, 0x90, 0xD0, /* 0x08-0x0B */ + 0x90, 0xD1, 0xED, 0xAA, 0xE3, 0xB9, 0xD2, 0xE2, /* 0x0C-0x0F */ + 0x90, 0xD2, 0x90, 0xD3, 0x90, 0xD4, 0x90, 0xD5, /* 0x10-0x13 */ + 0x90, 0xD6, 0xE3, 0xB5, 0x90, 0xD7, 0x90, 0xD8, /* 0x14-0x17 */ + 0x90, 0xD9, 0x90, 0xDA, 0xD3, 0xDE, 0x90, 0xDB, /* 0x18-0x1B */ + 0x90, 0xDC, 0x90, 0xDD, 0x90, 0xDE, 0xB8, 0xD0, /* 0x1C-0x1F */ + 0xE3, 0xB3, 0x90, 0xDF, 0x90, 0xE0, 0xE3, 0xB6, /* 0x20-0x23 */ + 0xB7, 0xDF, 0x90, 0xE1, 0xE3, 0xB4, 0xC0, 0xA2, /* 0x24-0x27 */ + 0x90, 0xE2, 0x90, 0xE3, 0x90, 0xE4, 0xE3, 0xBA, /* 0x28-0x2B */ + 0x90, 0xE5, 0x90, 0xE6, 0x90, 0xE7, 0x90, 0xE8, /* 0x2C-0x2F */ + 0x90, 0xE9, 0x90, 0xEA, 0x90, 0xEB, 0x90, 0xEC, /* 0x30-0x33 */ + 0x90, 0xED, 0x90, 0xEE, 0x90, 0xEF, 0x90, 0xF0, /* 0x34-0x37 */ + 0x90, 0xF1, 
0x90, 0xF2, 0x90, 0xF3, 0x90, 0xF4, /* 0x38-0x3B */ + 0x90, 0xF5, 0x90, 0xF6, 0x90, 0xF7, 0xD4, 0xB8, /* 0x3C-0x3F */ + 0x90, 0xF8, 0x90, 0xF9, 0x90, 0xFA, 0x90, 0xFB, /* 0x40-0x43 */ + 0x90, 0xFC, 0x90, 0xFD, 0x90, 0xFE, 0x91, 0x40, /* 0x44-0x47 */ + 0xB4, 0xC8, 0x91, 0x41, 0xE3, 0xBB, 0x91, 0x42, /* 0x48-0x4B */ + 0xBB, 0xC5, 0x91, 0x43, 0xC9, 0xF7, 0x91, 0x44, /* 0x4C-0x4F */ + 0x91, 0x45, 0xC9, 0xE5, 0x91, 0x46, 0x91, 0x47, /* 0x50-0x53 */ + 0x91, 0x48, 0xC4, 0xBD, 0x91, 0x49, 0x91, 0x4A, /* 0x54-0x57 */ + 0x91, 0x4B, 0x91, 0x4C, 0x91, 0x4D, 0x91, 0x4E, /* 0x58-0x5B */ + 0x91, 0x4F, 0xED, 0xAB, 0x91, 0x50, 0x91, 0x51, /* 0x5C-0x5F */ + 0x91, 0x52, 0x91, 0x53, 0xC2, 0xFD, 0x91, 0x54, /* 0x60-0x63 */ + 0x91, 0x55, 0x91, 0x56, 0x91, 0x57, 0xBB, 0xDB, /* 0x64-0x67 */ + 0xBF, 0xAE, 0x91, 0x58, 0x91, 0x59, 0x91, 0x5A, /* 0x68-0x6B */ + 0x91, 0x5B, 0x91, 0x5C, 0x91, 0x5D, 0x91, 0x5E, /* 0x6C-0x6F */ + 0xCE, 0xBF, 0x91, 0x5F, 0x91, 0x60, 0x91, 0x61, /* 0x70-0x73 */ + 0x91, 0x62, 0xE3, 0xBC, 0x91, 0x63, 0xBF, 0xB6, /* 0x74-0x77 */ + 0x91, 0x64, 0x91, 0x65, 0x91, 0x66, 0x91, 0x67, /* 0x78-0x7B */ + 0x91, 0x68, 0x91, 0x69, 0x91, 0x6A, 0x91, 0x6B, /* 0x7C-0x7F */ + + 0x91, 0x6C, 0x91, 0x6D, 0x91, 0x6E, 0x91, 0x6F, /* 0x80-0x83 */ + 0x91, 0x70, 0x91, 0x71, 0x91, 0x72, 0x91, 0x73, /* 0x84-0x87 */ + 0x91, 0x74, 0x91, 0x75, 0x91, 0x76, 0xB1, 0xEF, /* 0x88-0x8B */ + 0x91, 0x77, 0x91, 0x78, 0xD4, 0xF7, 0x91, 0x79, /* 0x8C-0x8F */ + 0x91, 0x7A, 0x91, 0x7B, 0x91, 0x7C, 0x91, 0x7D, /* 0x90-0x93 */ + 0xE3, 0xBE, 0x91, 0x7E, 0x91, 0x80, 0x91, 0x81, /* 0x94-0x97 */ + 0x91, 0x82, 0x91, 0x83, 0x91, 0x84, 0x91, 0x85, /* 0x98-0x9B */ + 0x91, 0x86, 0xED, 0xAD, 0x91, 0x87, 0x91, 0x88, /* 0x9C-0x9F */ + 0x91, 0x89, 0x91, 0x8A, 0x91, 0x8B, 0x91, 0x8C, /* 0xA0-0xA3 */ + 0x91, 0x8D, 0x91, 0x8E, 0x91, 0x8F, 0xE3, 0xBF, /* 0xA4-0xA7 */ + 0xBA, 0xA9, 0xED, 0xAC, 0x91, 0x90, 0x91, 0x91, /* 0xA8-0xAB */ + 0xE3, 0xBD, 0x91, 0x92, 0x91, 0x93, 0x91, 0x94, /* 0xAC-0xAF */ + 0x91, 0x95, 0x91, 0x96, 0x91, 0x97, 0x91, 0x98, /* 0xB0-0xB3 */ + 0x91, 0x99, 0x91, 0x9A, 0x91, 0x9B, 0xE3, 0xC0, /* 0xB4-0xB7 */ + 0x91, 0x9C, 0x91, 0x9D, 0x91, 0x9E, 0x91, 0x9F, /* 0xB8-0xBB */ + 0x91, 0xA0, 0x91, 0xA1, 0xBA, 0xB6, 0x91, 0xA2, /* 0xBC-0xBF */ + 0x91, 0xA3, 0x91, 0xA4, 0xB6, 0xAE, 0x91, 0xA5, /* 0xC0-0xC3 */ + 0x91, 0xA6, 0x91, 0xA7, 0x91, 0xA8, 0x91, 0xA9, /* 0xC4-0xC7 */ + 0xD0, 0xB8, 0x91, 0xAA, 0xB0, 0xC3, 0xED, 0xAE, /* 0xC8-0xCB */ + 0x91, 0xAB, 0x91, 0xAC, 0x91, 0xAD, 0x91, 0xAE, /* 0xCC-0xCF */ + 0x91, 0xAF, 0xED, 0xAF, 0xC0, 0xC1, 0x91, 0xB0, /* 0xD0-0xD3 */ + 0xE3, 0xC1, 0x91, 0xB1, 0x91, 0xB2, 0x91, 0xB3, /* 0xD4-0xD7 */ + 0x91, 0xB4, 0x91, 0xB5, 0x91, 0xB6, 0x91, 0xB7, /* 0xD8-0xDB */ + 0x91, 0xB8, 0x91, 0xB9, 0x91, 0xBA, 0x91, 0xBB, /* 0xDC-0xDF */ + 0x91, 0xBC, 0x91, 0xBD, 0x91, 0xBE, 0x91, 0xBF, /* 0xE0-0xE3 */ + 0x91, 0xC0, 0x91, 0xC1, 0xC5, 0xB3, 0x91, 0xC2, /* 0xE4-0xE7 */ + 0x91, 0xC3, 0x91, 0xC4, 0x91, 0xC5, 0x91, 0xC6, /* 0xE8-0xEB */ + 0x91, 0xC7, 0x91, 0xC8, 0x91, 0xC9, 0x91, 0xCA, /* 0xEC-0xEF */ + 0x91, 0xCB, 0x91, 0xCC, 0x91, 0xCD, 0x91, 0xCE, /* 0xF0-0xF3 */ + 0x91, 0xCF, 0xE3, 0xC2, 0x91, 0xD0, 0x91, 0xD1, /* 0xF4-0xF7 */ + 0x91, 0xD2, 0x91, 0xD3, 0x91, 0xD4, 0x91, 0xD5, /* 0xF8-0xFB */ + 0x91, 0xD6, 0x91, 0xD7, 0x91, 0xD8, 0xDC, 0xB2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0x91, 0xD9, 0x91, 0xDA, 0x91, 0xDB, 0x91, 0xDC, /* 0x00-0x03 */ + 0x91, 0xDD, 0x91, 0xDE, 0xED, 0xB0, 0x91, 0xDF, /* 0x04-0x07 */ + 0xB8, 0xEA, 0x91, 0xE0, 0xCE, 0xEC, 0xEA, 0xA7, /* 0x08-0x0B */ + 0xD0, 0xE7, 
0xCA, 0xF9, 0xC8, 0xD6, 0xCF, 0xB7, /* 0x0C-0x0F */ + 0xB3, 0xC9, 0xCE, 0xD2, 0xBD, 0xE4, 0x91, 0xE1, /* 0x10-0x13 */ + 0x91, 0xE2, 0xE3, 0xDE, 0xBB, 0xF2, 0xEA, 0xA8, /* 0x14-0x17 */ + 0xD5, 0xBD, 0x91, 0xE3, 0xC6, 0xDD, 0xEA, 0xA9, /* 0x18-0x1B */ + 0x91, 0xE4, 0x91, 0xE5, 0x91, 0xE6, 0xEA, 0xAA, /* 0x1C-0x1F */ + 0x91, 0xE7, 0xEA, 0xAC, 0xEA, 0xAB, 0x91, 0xE8, /* 0x20-0x23 */ + 0xEA, 0xAE, 0xEA, 0xAD, 0x91, 0xE9, 0x91, 0xEA, /* 0x24-0x27 */ + 0x91, 0xEB, 0x91, 0xEC, 0xBD, 0xD8, 0x91, 0xED, /* 0x28-0x2B */ + 0xEA, 0xAF, 0x91, 0xEE, 0xC2, 0xBE, 0x91, 0xEF, /* 0x2C-0x2F */ + 0x91, 0xF0, 0x91, 0xF1, 0x91, 0xF2, 0xB4, 0xC1, /* 0x30-0x33 */ + 0xB4, 0xF7, 0x91, 0xF3, 0x91, 0xF4, 0xBB, 0xA7, /* 0x34-0x37 */ + 0x91, 0xF5, 0x91, 0xF6, 0x91, 0xF7, 0x91, 0xF8, /* 0x38-0x3B */ + 0x91, 0xF9, 0xEC, 0xE6, 0xEC, 0xE5, 0xB7, 0xBF, /* 0x3C-0x3F */ + 0xCB, 0xF9, 0xB1, 0xE2, 0x91, 0xFA, 0xEC, 0xE7, /* 0x40-0x43 */ + 0x91, 0xFB, 0x91, 0xFC, 0x91, 0xFD, 0xC9, 0xC8, /* 0x44-0x47 */ + 0xEC, 0xE8, 0xEC, 0xE9, 0x91, 0xFE, 0xCA, 0xD6, /* 0x48-0x4B */ + 0xDE, 0xD0, 0xB2, 0xC5, 0xD4, 0xFA, 0x92, 0x40, /* 0x4C-0x4F */ + 0x92, 0x41, 0xC6, 0xCB, 0xB0, 0xC7, 0xB4, 0xF2, /* 0x50-0x53 */ + 0xC8, 0xD3, 0x92, 0x42, 0x92, 0x43, 0x92, 0x44, /* 0x54-0x57 */ + 0xCD, 0xD0, 0x92, 0x45, 0x92, 0x46, 0xBF, 0xB8, /* 0x58-0x5B */ + 0x92, 0x47, 0x92, 0x48, 0x92, 0x49, 0x92, 0x4A, /* 0x5C-0x5F */ + 0x92, 0x4B, 0x92, 0x4C, 0x92, 0x4D, 0xBF, 0xDB, /* 0x60-0x63 */ + 0x92, 0x4E, 0x92, 0x4F, 0xC7, 0xA4, 0xD6, 0xB4, /* 0x64-0x67 */ + 0x92, 0x50, 0xC0, 0xA9, 0xDE, 0xD1, 0xC9, 0xA8, /* 0x68-0x6B */ + 0xD1, 0xEF, 0xC5, 0xA4, 0xB0, 0xE7, 0xB3, 0xB6, /* 0x6C-0x6F */ + 0xC8, 0xC5, 0x92, 0x51, 0x92, 0x52, 0xB0, 0xE2, /* 0x70-0x73 */ + 0x92, 0x53, 0x92, 0x54, 0xB7, 0xF6, 0x92, 0x55, /* 0x74-0x77 */ + 0x92, 0x56, 0xC5, 0xFA, 0x92, 0x57, 0x92, 0x58, /* 0x78-0x7B */ + 0xB6, 0xF3, 0x92, 0x59, 0xD5, 0xD2, 0xB3, 0xD0, /* 0x7C-0x7F */ + + 0xBC, 0xBC, 0x92, 0x5A, 0x92, 0x5B, 0x92, 0x5C, /* 0x80-0x83 */ + 0xB3, 0xAD, 0x92, 0x5D, 0x92, 0x5E, 0x92, 0x5F, /* 0x84-0x87 */ + 0x92, 0x60, 0xBE, 0xF1, 0xB0, 0xD1, 0x92, 0x61, /* 0x88-0x8B */ + 0x92, 0x62, 0x92, 0x63, 0x92, 0x64, 0x92, 0x65, /* 0x8C-0x8F */ + 0x92, 0x66, 0xD2, 0xD6, 0xCA, 0xE3, 0xD7, 0xA5, /* 0x90-0x93 */ + 0x92, 0x67, 0xCD, 0xB6, 0xB6, 0xB6, 0xBF, 0xB9, /* 0x94-0x97 */ + 0xD5, 0xDB, 0x92, 0x68, 0xB8, 0xA7, 0xC5, 0xD7, /* 0x98-0x9B */ + 0x92, 0x69, 0x92, 0x6A, 0x92, 0x6B, 0xDE, 0xD2, /* 0x9C-0x9F */ + 0xBF, 0xD9, 0xC2, 0xD5, 0xC7, 0xC0, 0x92, 0x6C, /* 0xA0-0xA3 */ + 0xBB, 0xA4, 0xB1, 0xA8, 0x92, 0x6D, 0x92, 0x6E, /* 0xA4-0xA7 */ + 0xC5, 0xEA, 0x92, 0x6F, 0x92, 0x70, 0xC5, 0xFB, /* 0xA8-0xAB */ + 0xCC, 0xA7, 0x92, 0x71, 0x92, 0x72, 0x92, 0x73, /* 0xAC-0xAF */ + 0x92, 0x74, 0xB1, 0xA7, 0x92, 0x75, 0x92, 0x76, /* 0xB0-0xB3 */ + 0x92, 0x77, 0xB5, 0xD6, 0x92, 0x78, 0x92, 0x79, /* 0xB4-0xB7 */ + 0x92, 0x7A, 0xC4, 0xA8, 0x92, 0x7B, 0xDE, 0xD3, /* 0xB8-0xBB */ + 0xD1, 0xBA, 0xB3, 0xE9, 0x92, 0x7C, 0xC3, 0xF2, /* 0xBC-0xBF */ + 0x92, 0x7D, 0x92, 0x7E, 0xB7, 0xF7, 0x92, 0x80, /* 0xC0-0xC3 */ + 0xD6, 0xF4, 0xB5, 0xA3, 0xB2, 0xF0, 0xC4, 0xB4, /* 0xC4-0xC7 */ + 0xC4, 0xE9, 0xC0, 0xAD, 0xDE, 0xD4, 0x92, 0x81, /* 0xC8-0xCB */ + 0xB0, 0xE8, 0xC5, 0xC4, 0xC1, 0xE0, 0x92, 0x82, /* 0xCC-0xCF */ + 0xB9, 0xD5, 0x92, 0x83, 0xBE, 0xDC, 0xCD, 0xD8, /* 0xD0-0xD3 */ + 0xB0, 0xCE, 0x92, 0x84, 0xCD, 0xCF, 0xDE, 0xD6, /* 0xD4-0xD7 */ + 0xBE, 0xD0, 0xD7, 0xBE, 0xDE, 0xD5, 0xD5, 0xD0, /* 0xD8-0xDB */ + 0xB0, 0xDD, 0x92, 0x85, 0x92, 0x86, 0xC4, 0xE2, /* 0xDC-0xDF */ + 0x92, 0x87, 0x92, 0x88, 0xC2, 0xA3, 0xBC, 0xF0, /* 0xE0-0xE3 */ + 
0x92, 0x89, 0xD3, 0xB5, 0xC0, 0xB9, 0xC5, 0xA1, /* 0xE4-0xE7 */ + 0xB2, 0xA6, 0xD4, 0xF1, 0x92, 0x8A, 0x92, 0x8B, /* 0xE8-0xEB */ + 0xC0, 0xA8, 0xCA, 0xC3, 0xDE, 0xD7, 0xD5, 0xFC, /* 0xEC-0xEF */ + 0x92, 0x8C, 0xB9, 0xB0, 0x92, 0x8D, 0xC8, 0xAD, /* 0xF0-0xF3 */ + 0xCB, 0xA9, 0x92, 0x8E, 0xDE, 0xD9, 0xBF, 0xBD, /* 0xF4-0xF7 */ + 0x92, 0x8F, 0x92, 0x90, 0x92, 0x91, 0x92, 0x92, /* 0xF8-0xFB */ + 0xC6, 0xB4, 0xD7, 0xA7, 0xCA, 0xB0, 0xC4, 0xC3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0x92, 0x93, 0xB3, 0xD6, 0xB9, 0xD2, 0x92, 0x94, /* 0x00-0x03 */ + 0x92, 0x95, 0x92, 0x96, 0x92, 0x97, 0xD6, 0xB8, /* 0x04-0x07 */ + 0xEA, 0xFC, 0xB0, 0xB4, 0x92, 0x98, 0x92, 0x99, /* 0x08-0x0B */ + 0x92, 0x9A, 0x92, 0x9B, 0xBF, 0xE6, 0x92, 0x9C, /* 0x0C-0x0F */ + 0x92, 0x9D, 0xCC, 0xF4, 0x92, 0x9E, 0x92, 0x9F, /* 0x10-0x13 */ + 0x92, 0xA0, 0x92, 0xA1, 0xCD, 0xDA, 0x92, 0xA2, /* 0x14-0x17 */ + 0x92, 0xA3, 0x92, 0xA4, 0xD6, 0xBF, 0xC2, 0xCE, /* 0x18-0x1B */ + 0x92, 0xA5, 0xCE, 0xCE, 0xCC, 0xA2, 0xD0, 0xAE, /* 0x1C-0x1F */ + 0xC4, 0xD3, 0xB5, 0xB2, 0xDE, 0xD8, 0xD5, 0xF5, /* 0x20-0x23 */ + 0xBC, 0xB7, 0xBB, 0xD3, 0x92, 0xA6, 0x92, 0xA7, /* 0x24-0x27 */ + 0xB0, 0xA4, 0x92, 0xA8, 0xC5, 0xB2, 0xB4, 0xEC, /* 0x28-0x2B */ + 0x92, 0xA9, 0x92, 0xAA, 0x92, 0xAB, 0xD5, 0xF1, /* 0x2C-0x2F */ + 0x92, 0xAC, 0x92, 0xAD, 0xEA, 0xFD, 0x92, 0xAE, /* 0x30-0x33 */ + 0x92, 0xAF, 0x92, 0xB0, 0x92, 0xB1, 0x92, 0xB2, /* 0x34-0x37 */ + 0x92, 0xB3, 0xDE, 0xDA, 0xCD, 0xA6, 0x92, 0xB4, /* 0x38-0x3B */ + 0x92, 0xB5, 0xCD, 0xEC, 0x92, 0xB6, 0x92, 0xB7, /* 0x3C-0x3F */ + 0x92, 0xB8, 0x92, 0xB9, 0xCE, 0xE6, 0xDE, 0xDC, /* 0x40-0x43 */ + 0x92, 0xBA, 0xCD, 0xB1, 0xC0, 0xA6, 0x92, 0xBB, /* 0x44-0x47 */ + 0x92, 0xBC, 0xD7, 0xBD, 0x92, 0xBD, 0xDE, 0xDB, /* 0x48-0x4B */ + 0xB0, 0xC6, 0xBA, 0xB4, 0xC9, 0xD3, 0xC4, 0xF3, /* 0x4C-0x4F */ + 0xBE, 0xE8, 0x92, 0xBE, 0x92, 0xBF, 0x92, 0xC0, /* 0x50-0x53 */ + 0x92, 0xC1, 0xB2, 0xB6, 0x92, 0xC2, 0x92, 0xC3, /* 0x54-0x57 */ + 0x92, 0xC4, 0x92, 0xC5, 0x92, 0xC6, 0x92, 0xC7, /* 0x58-0x5B */ + 0x92, 0xC8, 0x92, 0xC9, 0xC0, 0xCC, 0xCB, 0xF0, /* 0x5C-0x5F */ + 0x92, 0xCA, 0xBC, 0xF1, 0xBB, 0xBB, 0xB5, 0xB7, /* 0x60-0x63 */ + 0x92, 0xCB, 0x92, 0xCC, 0x92, 0xCD, 0xC5, 0xF5, /* 0x64-0x67 */ + 0x92, 0xCE, 0xDE, 0xE6, 0x92, 0xCF, 0x92, 0xD0, /* 0x68-0x6B */ + 0x92, 0xD1, 0xDE, 0xE3, 0xBE, 0xDD, 0x92, 0xD2, /* 0x6C-0x6F */ + 0x92, 0xD3, 0xDE, 0xDF, 0x92, 0xD4, 0x92, 0xD5, /* 0x70-0x73 */ + 0x92, 0xD6, 0x92, 0xD7, 0xB4, 0xB7, 0xBD, 0xDD, /* 0x74-0x77 */ + 0x92, 0xD8, 0x92, 0xD9, 0xDE, 0xE0, 0xC4, 0xED, /* 0x78-0x7B */ + 0x92, 0xDA, 0x92, 0xDB, 0x92, 0xDC, 0x92, 0xDD, /* 0x7C-0x7F */ + + 0xCF, 0xC6, 0x92, 0xDE, 0xB5, 0xE0, 0x92, 0xDF, /* 0x80-0x83 */ + 0x92, 0xE0, 0x92, 0xE1, 0x92, 0xE2, 0xB6, 0xDE, /* 0x84-0x87 */ + 0xCA, 0xDA, 0xB5, 0xF4, 0xDE, 0xE5, 0x92, 0xE3, /* 0x88-0x8B */ + 0xD5, 0xC6, 0x92, 0xE4, 0xDE, 0xE1, 0xCC, 0xCD, /* 0x8C-0x8F */ + 0xC6, 0xFE, 0x92, 0xE5, 0xC5, 0xC5, 0x92, 0xE6, /* 0x90-0x93 */ + 0x92, 0xE7, 0x92, 0xE8, 0xD2, 0xB4, 0x92, 0xE9, /* 0x94-0x97 */ + 0xBE, 0xF2, 0x92, 0xEA, 0x92, 0xEB, 0x92, 0xEC, /* 0x98-0x9B */ + 0x92, 0xED, 0x92, 0xEE, 0x92, 0xEF, 0x92, 0xF0, /* 0x9C-0x9F */ + 0xC2, 0xD3, 0x92, 0xF1, 0xCC, 0xBD, 0xB3, 0xB8, /* 0xA0-0xA3 */ + 0x92, 0xF2, 0xBD, 0xD3, 0x92, 0xF3, 0xBF, 0xD8, /* 0xA4-0xA7 */ + 0xCD, 0xC6, 0xD1, 0xDA, 0xB4, 0xEB, 0x92, 0xF4, /* 0xA8-0xAB */ + 0xDE, 0xE4, 0xDE, 0xDD, 0xDE, 0xE7, 0x92, 0xF5, /* 0xAC-0xAF */ + 0xEA, 0xFE, 0x92, 0xF6, 0x92, 0xF7, 0xC2, 0xB0, /* 0xB0-0xB3 */ + 0xDE, 0xE2, 0x92, 0xF8, 0x92, 0xF9, 0xD6, 0xC0, /* 0xB4-0xB7 */ + 
0xB5, 0xA7, 0x92, 0xFA, 0xB2, 0xF4, 0x92, 0xFB, /* 0xB8-0xBB */ + 0xDE, 0xE8, 0x92, 0xFC, 0xDE, 0xF2, 0x92, 0xFD, /* 0xBC-0xBF */ + 0x92, 0xFE, 0x93, 0x40, 0x93, 0x41, 0x93, 0x42, /* 0xC0-0xC3 */ + 0xDE, 0xED, 0x93, 0x43, 0xDE, 0xF1, 0x93, 0x44, /* 0xC4-0xC7 */ + 0x93, 0x45, 0xC8, 0xE0, 0x93, 0x46, 0x93, 0x47, /* 0xC8-0xCB */ + 0x93, 0x48, 0xD7, 0xE1, 0xDE, 0xEF, 0xC3, 0xE8, /* 0xCC-0xCF */ + 0xCC, 0xE1, 0x93, 0x49, 0xB2, 0xE5, 0x93, 0x4A, /* 0xD0-0xD3 */ + 0x93, 0x4B, 0x93, 0x4C, 0xD2, 0xBE, 0x93, 0x4D, /* 0xD4-0xD7 */ + 0x93, 0x4E, 0x93, 0x4F, 0x93, 0x50, 0x93, 0x51, /* 0xD8-0xDB */ + 0x93, 0x52, 0x93, 0x53, 0xDE, 0xEE, 0x93, 0x54, /* 0xDC-0xDF */ + 0xDE, 0xEB, 0xCE, 0xD5, 0x93, 0x55, 0xB4, 0xA7, /* 0xE0-0xE3 */ + 0x93, 0x56, 0x93, 0x57, 0x93, 0x58, 0x93, 0x59, /* 0xE4-0xE7 */ + 0x93, 0x5A, 0xBF, 0xAB, 0xBE, 0xBE, 0x93, 0x5B, /* 0xE8-0xEB */ + 0x93, 0x5C, 0xBD, 0xD2, 0x93, 0x5D, 0x93, 0x5E, /* 0xEC-0xEF */ + 0x93, 0x5F, 0x93, 0x60, 0xDE, 0xE9, 0x93, 0x61, /* 0xF0-0xF3 */ + 0xD4, 0xAE, 0x93, 0x62, 0xDE, 0xDE, 0x93, 0x63, /* 0xF4-0xF7 */ + 0xDE, 0xEA, 0x93, 0x64, 0x93, 0x65, 0x93, 0x66, /* 0xF8-0xFB */ + 0x93, 0x67, 0xC0, 0xBF, 0x93, 0x68, 0xDE, 0xEC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_64[512] = { + 0xB2, 0xF3, 0xB8, 0xE9, 0xC2, 0xA7, 0x93, 0x69, /* 0x00-0x03 */ + 0x93, 0x6A, 0xBD, 0xC1, 0x93, 0x6B, 0x93, 0x6C, /* 0x04-0x07 */ + 0x93, 0x6D, 0x93, 0x6E, 0x93, 0x6F, 0xDE, 0xF5, /* 0x08-0x0B */ + 0xDE, 0xF8, 0x93, 0x70, 0x93, 0x71, 0xB2, 0xAB, /* 0x0C-0x0F */ + 0xB4, 0xA4, 0x93, 0x72, 0x93, 0x73, 0xB4, 0xEA, /* 0x10-0x13 */ + 0xC9, 0xA6, 0x93, 0x74, 0x93, 0x75, 0x93, 0x76, /* 0x14-0x17 */ + 0x93, 0x77, 0x93, 0x78, 0x93, 0x79, 0xDE, 0xF6, /* 0x18-0x1B */ + 0xCB, 0xD1, 0x93, 0x7A, 0xB8, 0xE3, 0x93, 0x7B, /* 0x1C-0x1F */ + 0xDE, 0xF7, 0xDE, 0xFA, 0x93, 0x7C, 0x93, 0x7D, /* 0x20-0x23 */ + 0x93, 0x7E, 0x93, 0x80, 0xDE, 0xF9, 0x93, 0x81, /* 0x24-0x27 */ + 0x93, 0x82, 0x93, 0x83, 0xCC, 0xC2, 0x93, 0x84, /* 0x28-0x2B */ + 0xB0, 0xE1, 0xB4, 0xEE, 0x93, 0x85, 0x93, 0x86, /* 0x2C-0x2F */ + 0x93, 0x87, 0x93, 0x88, 0x93, 0x89, 0x93, 0x8A, /* 0x30-0x33 */ + 0xE5, 0xBA, 0x93, 0x8B, 0x93, 0x8C, 0x93, 0x8D, /* 0x34-0x37 */ + 0x93, 0x8E, 0x93, 0x8F, 0xD0, 0xAF, 0x93, 0x90, /* 0x38-0x3B */ + 0x93, 0x91, 0xB2, 0xEB, 0x93, 0x92, 0xEB, 0xA1, /* 0x3C-0x3F */ + 0x93, 0x93, 0xDE, 0xF4, 0x93, 0x94, 0x93, 0x95, /* 0x40-0x43 */ + 0xC9, 0xE3, 0xDE, 0xF3, 0xB0, 0xDA, 0xD2, 0xA1, /* 0x44-0x47 */ + 0xB1, 0xF7, 0x93, 0x96, 0xCC, 0xAF, 0x93, 0x97, /* 0x48-0x4B */ + 0x93, 0x98, 0x93, 0x99, 0x93, 0x9A, 0x93, 0x9B, /* 0x4C-0x4F */ + 0x93, 0x9C, 0x93, 0x9D, 0xDE, 0xF0, 0x93, 0x9E, /* 0x50-0x53 */ + 0xCB, 0xA4, 0x93, 0x9F, 0x93, 0xA0, 0x93, 0xA1, /* 0x54-0x57 */ + 0xD5, 0xAA, 0x93, 0xA2, 0x93, 0xA3, 0x93, 0xA4, /* 0x58-0x5B */ + 0x93, 0xA5, 0x93, 0xA6, 0xDE, 0xFB, 0x93, 0xA7, /* 0x5C-0x5F */ + 0x93, 0xA8, 0x93, 0xA9, 0x93, 0xAA, 0x93, 0xAB, /* 0x60-0x63 */ + 0x93, 0xAC, 0x93, 0xAD, 0x93, 0xAE, 0xB4, 0xDD, /* 0x64-0x67 */ + 0x93, 0xAF, 0xC4, 0xA6, 0x93, 0xB0, 0x93, 0xB1, /* 0x68-0x6B */ + 0x93, 0xB2, 0xDE, 0xFD, 0x93, 0xB3, 0x93, 0xB4, /* 0x6C-0x6F */ + 0x93, 0xB5, 0x93, 0xB6, 0x93, 0xB7, 0x93, 0xB8, /* 0x70-0x73 */ + 0x93, 0xB9, 0x93, 0xBA, 0x93, 0xBB, 0x93, 0xBC, /* 0x74-0x77 */ + 0xC3, 0xFE, 0xC4, 0xA1, 0xDF, 0xA1, 0x93, 0xBD, /* 0x78-0x7B */ + 0x93, 0xBE, 0x93, 0xBF, 0x93, 0xC0, 0x93, 0xC1, /* 0x7C-0x7F */ + + 0x93, 0xC2, 0x93, 0xC3, 0xC1, 0xCC, 0x93, 0xC4, /* 0x80-0x83 */ + 0xDE, 0xFC, 0xBE, 0xEF, 0x93, 0xC5, 0xC6, 0xB2, /* 0x84-0x87 */ + 0x93, 0xC6, 0x93, 0xC7, 0x93, 0xC8, 0x93, 0xC9, /* 0x88-0x8B */ + 
0x93, 0xCA, 0x93, 0xCB, 0x93, 0xCC, 0x93, 0xCD, /* 0x8C-0x8F */ + 0x93, 0xCE, 0xB3, 0xC5, 0xC8, 0xF6, 0x93, 0xCF, /* 0x90-0x93 */ + 0x93, 0xD0, 0xCB, 0xBA, 0xDE, 0xFE, 0x93, 0xD1, /* 0x94-0x97 */ + 0x93, 0xD2, 0xDF, 0xA4, 0x93, 0xD3, 0x93, 0xD4, /* 0x98-0x9B */ + 0x93, 0xD5, 0x93, 0xD6, 0xD7, 0xB2, 0x93, 0xD7, /* 0x9C-0x9F */ + 0x93, 0xD8, 0x93, 0xD9, 0x93, 0xDA, 0x93, 0xDB, /* 0xA0-0xA3 */ + 0xB3, 0xB7, 0x93, 0xDC, 0x93, 0xDD, 0x93, 0xDE, /* 0xA4-0xA7 */ + 0x93, 0xDF, 0xC1, 0xC3, 0x93, 0xE0, 0x93, 0xE1, /* 0xA8-0xAB */ + 0xC7, 0xCB, 0xB2, 0xA5, 0xB4, 0xE9, 0x93, 0xE2, /* 0xAC-0xAF */ + 0xD7, 0xAB, 0x93, 0xE3, 0x93, 0xE4, 0x93, 0xE5, /* 0xB0-0xB3 */ + 0x93, 0xE6, 0xC4, 0xEC, 0x93, 0xE7, 0xDF, 0xA2, /* 0xB4-0xB7 */ + 0xDF, 0xA3, 0x93, 0xE8, 0xDF, 0xA5, 0x93, 0xE9, /* 0xB8-0xBB */ + 0xBA, 0xB3, 0x93, 0xEA, 0x93, 0xEB, 0x93, 0xEC, /* 0xBC-0xBF */ + 0xDF, 0xA6, 0x93, 0xED, 0xC0, 0xDE, 0x93, 0xEE, /* 0xC0-0xC3 */ + 0x93, 0xEF, 0xC9, 0xC3, 0x93, 0xF0, 0x93, 0xF1, /* 0xC4-0xC7 */ + 0x93, 0xF2, 0x93, 0xF3, 0x93, 0xF4, 0x93, 0xF5, /* 0xC8-0xCB */ + 0x93, 0xF6, 0xB2, 0xD9, 0xC7, 0xE6, 0x93, 0xF7, /* 0xCC-0xCF */ + 0xDF, 0xA7, 0x93, 0xF8, 0xC7, 0xDC, 0x93, 0xF9, /* 0xD0-0xD3 */ + 0x93, 0xFA, 0x93, 0xFB, 0x93, 0xFC, 0xDF, 0xA8, /* 0xD4-0xD7 */ + 0xEB, 0xA2, 0x93, 0xFD, 0x93, 0xFE, 0x94, 0x40, /* 0xD8-0xDB */ + 0x94, 0x41, 0x94, 0x42, 0xCB, 0xD3, 0x94, 0x43, /* 0xDC-0xDF */ + 0x94, 0x44, 0x94, 0x45, 0xDF, 0xAA, 0x94, 0x46, /* 0xE0-0xE3 */ + 0xDF, 0xA9, 0x94, 0x47, 0xB2, 0xC1, 0x94, 0x48, /* 0xE4-0xE7 */ + 0x94, 0x49, 0x94, 0x4A, 0x94, 0x4B, 0x94, 0x4C, /* 0xE8-0xEB */ + 0x94, 0x4D, 0x94, 0x4E, 0x94, 0x4F, 0x94, 0x50, /* 0xEC-0xEF */ + 0x94, 0x51, 0x94, 0x52, 0x94, 0x53, 0x94, 0x54, /* 0xF0-0xF3 */ + 0x94, 0x55, 0x94, 0x56, 0x94, 0x57, 0x94, 0x58, /* 0xF4-0xF7 */ + 0x94, 0x59, 0x94, 0x5A, 0x94, 0x5B, 0x94, 0x5C, /* 0xF8-0xFB */ + 0x94, 0x5D, 0x94, 0x5E, 0x94, 0x5F, 0x94, 0x60, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0xC5, 0xCA, 0x94, 0x61, 0x94, 0x62, 0x94, 0x63, /* 0x00-0x03 */ + 0x94, 0x64, 0x94, 0x65, 0x94, 0x66, 0x94, 0x67, /* 0x04-0x07 */ + 0x94, 0x68, 0xDF, 0xAB, 0x94, 0x69, 0x94, 0x6A, /* 0x08-0x0B */ + 0x94, 0x6B, 0x94, 0x6C, 0x94, 0x6D, 0x94, 0x6E, /* 0x0C-0x0F */ + 0x94, 0x6F, 0x94, 0x70, 0xD4, 0xDC, 0x94, 0x71, /* 0x10-0x13 */ + 0x94, 0x72, 0x94, 0x73, 0x94, 0x74, 0x94, 0x75, /* 0x14-0x17 */ + 0xC8, 0xC1, 0x94, 0x76, 0x94, 0x77, 0x94, 0x78, /* 0x18-0x1B */ + 0x94, 0x79, 0x94, 0x7A, 0x94, 0x7B, 0x94, 0x7C, /* 0x1C-0x1F */ + 0x94, 0x7D, 0x94, 0x7E, 0x94, 0x80, 0x94, 0x81, /* 0x20-0x23 */ + 0x94, 0x82, 0xDF, 0xAC, 0x94, 0x83, 0x94, 0x84, /* 0x24-0x27 */ + 0x94, 0x85, 0x94, 0x86, 0x94, 0x87, 0xBE, 0xF0, /* 0x28-0x2B */ + 0x94, 0x88, 0x94, 0x89, 0xDF, 0xAD, 0xD6, 0xA7, /* 0x2C-0x2F */ + 0x94, 0x8A, 0x94, 0x8B, 0x94, 0x8C, 0x94, 0x8D, /* 0x30-0x33 */ + 0xEA, 0xB7, 0xEB, 0xB6, 0xCA, 0xD5, 0x94, 0x8E, /* 0x34-0x37 */ + 0xD8, 0xFC, 0xB8, 0xC4, 0x94, 0x8F, 0xB9, 0xA5, /* 0x38-0x3B */ + 0x94, 0x90, 0x94, 0x91, 0xB7, 0xC5, 0xD5, 0xFE, /* 0x3C-0x3F */ + 0x94, 0x92, 0x94, 0x93, 0x94, 0x94, 0x94, 0x95, /* 0x40-0x43 */ + 0x94, 0x96, 0xB9, 0xCA, 0x94, 0x97, 0x94, 0x98, /* 0x44-0x47 */ + 0xD0, 0xA7, 0xF4, 0xCD, 0x94, 0x99, 0x94, 0x9A, /* 0x48-0x4B */ + 0xB5, 0xD0, 0x94, 0x9B, 0x94, 0x9C, 0xC3, 0xF4, /* 0x4C-0x4F */ + 0x94, 0x9D, 0xBE, 0xC8, 0x94, 0x9E, 0x94, 0x9F, /* 0x50-0x53 */ + 0x94, 0xA0, 0xEB, 0xB7, 0xB0, 0xBD, 0x94, 0xA1, /* 0x54-0x57 */ + 0x94, 0xA2, 0xBD, 0xCC, 0x94, 0xA3, 0xC1, 0xB2, /* 0x58-0x5B */ + 0x94, 0xA4, 0xB1, 0xD6, 0xB3, 0xA8, 0x94, 0xA5, /* 0x5C-0x5F */ + 0x94, 
0xA6, 0x94, 0xA7, 0xB8, 0xD2, 0xC9, 0xA2, /* 0x60-0x63 */ + 0x94, 0xA8, 0x94, 0xA9, 0xB6, 0xD8, 0x94, 0xAA, /* 0x64-0x67 */ + 0x94, 0xAB, 0x94, 0xAC, 0x94, 0xAD, 0xEB, 0xB8, /* 0x68-0x6B */ + 0xBE, 0xB4, 0x94, 0xAE, 0x94, 0xAF, 0x94, 0xB0, /* 0x6C-0x6F */ + 0xCA, 0xFD, 0x94, 0xB1, 0xC7, 0xC3, 0x94, 0xB2, /* 0x70-0x73 */ + 0xD5, 0xFB, 0x94, 0xB3, 0x94, 0xB4, 0xB7, 0xF3, /* 0x74-0x77 */ + 0x94, 0xB5, 0x94, 0xB6, 0x94, 0xB7, 0x94, 0xB8, /* 0x78-0x7B */ + 0x94, 0xB9, 0x94, 0xBA, 0x94, 0xBB, 0x94, 0xBC, /* 0x7C-0x7F */ + + 0x94, 0xBD, 0x94, 0xBE, 0x94, 0xBF, 0x94, 0xC0, /* 0x80-0x83 */ + 0x94, 0xC1, 0x94, 0xC2, 0x94, 0xC3, 0xCE, 0xC4, /* 0x84-0x87 */ + 0x94, 0xC4, 0x94, 0xC5, 0x94, 0xC6, 0xD5, 0xAB, /* 0x88-0x8B */ + 0xB1, 0xF3, 0x94, 0xC7, 0x94, 0xC8, 0x94, 0xC9, /* 0x8C-0x8F */ + 0xEC, 0xB3, 0xB0, 0xDF, 0x94, 0xCA, 0xEC, 0xB5, /* 0x90-0x93 */ + 0x94, 0xCB, 0x94, 0xCC, 0x94, 0xCD, 0xB6, 0xB7, /* 0x94-0x97 */ + 0x94, 0xCE, 0xC1, 0xCF, 0x94, 0xCF, 0xF5, 0xFA, /* 0x98-0x9B */ + 0xD0, 0xB1, 0x94, 0xD0, 0x94, 0xD1, 0xD5, 0xE5, /* 0x9C-0x9F */ + 0x94, 0xD2, 0xCE, 0xD3, 0x94, 0xD3, 0x94, 0xD4, /* 0xA0-0xA3 */ + 0xBD, 0xEF, 0xB3, 0xE2, 0x94, 0xD5, 0xB8, 0xAB, /* 0xA4-0xA7 */ + 0x94, 0xD6, 0xD5, 0xB6, 0x94, 0xD7, 0xED, 0xBD, /* 0xA8-0xAB */ + 0x94, 0xD8, 0xB6, 0xCF, 0x94, 0xD9, 0xCB, 0xB9, /* 0xAC-0xAF */ + 0xD0, 0xC2, 0x94, 0xDA, 0x94, 0xDB, 0x94, 0xDC, /* 0xB0-0xB3 */ + 0x94, 0xDD, 0x94, 0xDE, 0x94, 0xDF, 0x94, 0xE0, /* 0xB4-0xB7 */ + 0x94, 0xE1, 0xB7, 0xBD, 0x94, 0xE2, 0x94, 0xE3, /* 0xB8-0xBB */ + 0xEC, 0xB6, 0xCA, 0xA9, 0x94, 0xE4, 0x94, 0xE5, /* 0xBC-0xBF */ + 0x94, 0xE6, 0xC5, 0xD4, 0x94, 0xE7, 0xEC, 0xB9, /* 0xC0-0xC3 */ + 0xEC, 0xB8, 0xC2, 0xC3, 0xEC, 0xB7, 0x94, 0xE8, /* 0xC4-0xC7 */ + 0x94, 0xE9, 0x94, 0xEA, 0x94, 0xEB, 0xD0, 0xFD, /* 0xC8-0xCB */ + 0xEC, 0xBA, 0x94, 0xEC, 0xEC, 0xBB, 0xD7, 0xE5, /* 0xCC-0xCF */ + 0x94, 0xED, 0x94, 0xEE, 0xEC, 0xBC, 0x94, 0xEF, /* 0xD0-0xD3 */ + 0x94, 0xF0, 0x94, 0xF1, 0xEC, 0xBD, 0xC6, 0xEC, /* 0xD4-0xD7 */ + 0x94, 0xF2, 0x94, 0xF3, 0x94, 0xF4, 0x94, 0xF5, /* 0xD8-0xDB */ + 0x94, 0xF6, 0x94, 0xF7, 0x94, 0xF8, 0x94, 0xF9, /* 0xDC-0xDF */ + 0xCE, 0xDE, 0x94, 0xFA, 0xBC, 0xC8, 0x94, 0xFB, /* 0xE0-0xE3 */ + 0x94, 0xFC, 0xC8, 0xD5, 0xB5, 0xA9, 0xBE, 0xC9, /* 0xE4-0xE7 */ + 0xD6, 0xBC, 0xD4, 0xE7, 0x94, 0xFD, 0x94, 0xFE, /* 0xE8-0xEB */ + 0xD1, 0xAE, 0xD0, 0xF1, 0xEA, 0xB8, 0xEA, 0xB9, /* 0xEC-0xEF */ + 0xEA, 0xBA, 0xBA, 0xB5, 0x95, 0x40, 0x95, 0x41, /* 0xF0-0xF3 */ + 0x95, 0x42, 0x95, 0x43, 0xCA, 0xB1, 0xBF, 0xF5, /* 0xF4-0xF7 */ + 0x95, 0x44, 0x95, 0x45, 0xCD, 0xFA, 0x95, 0x46, /* 0xF8-0xFB */ + 0x95, 0x47, 0x95, 0x48, 0x95, 0x49, 0x95, 0x4A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_66[512] = { + 0xEA, 0xC0, 0x95, 0x4B, 0xB0, 0xBA, 0xEA, 0xBE, /* 0x00-0x03 */ + 0x95, 0x4C, 0x95, 0x4D, 0xC0, 0xA5, 0x95, 0x4E, /* 0x04-0x07 */ + 0x95, 0x4F, 0x95, 0x50, 0xEA, 0xBB, 0x95, 0x51, /* 0x08-0x0B */ + 0xB2, 0xFD, 0x95, 0x52, 0xC3, 0xF7, 0xBB, 0xE8, /* 0x0C-0x0F */ + 0x95, 0x53, 0x95, 0x54, 0x95, 0x55, 0xD2, 0xD7, /* 0x10-0x13 */ + 0xCE, 0xF4, 0xEA, 0xBF, 0x95, 0x56, 0x95, 0x57, /* 0x14-0x17 */ + 0x95, 0x58, 0xEA, 0xBC, 0x95, 0x59, 0x95, 0x5A, /* 0x18-0x1B */ + 0x95, 0x5B, 0xEA, 0xC3, 0x95, 0x5C, 0xD0, 0xC7, /* 0x1C-0x1F */ + 0xD3, 0xB3, 0x95, 0x5D, 0x95, 0x5E, 0x95, 0x5F, /* 0x20-0x23 */ + 0x95, 0x60, 0xB4, 0xBA, 0x95, 0x61, 0xC3, 0xC1, /* 0x24-0x27 */ + 0xD7, 0xF2, 0x95, 0x62, 0x95, 0x63, 0x95, 0x64, /* 0x28-0x2B */ + 0x95, 0x65, 0xD5, 0xD1, 0x95, 0x66, 0xCA, 0xC7, /* 0x2C-0x2F */ + 0x95, 0x67, 0xEA, 0xC5, 0x95, 0x68, 0x95, 0x69, /* 0x30-0x33 */ + 0xEA, 
0xC4, 0xEA, 0xC7, 0xEA, 0xC6, 0x95, 0x6A, /* 0x34-0x37 */ + 0x95, 0x6B, 0x95, 0x6C, 0x95, 0x6D, 0x95, 0x6E, /* 0x38-0x3B */ + 0xD6, 0xE7, 0x95, 0x6F, 0xCF, 0xD4, 0x95, 0x70, /* 0x3C-0x3F */ + 0x95, 0x71, 0xEA, 0xCB, 0x95, 0x72, 0xBB, 0xCE, /* 0x40-0x43 */ + 0x95, 0x73, 0x95, 0x74, 0x95, 0x75, 0x95, 0x76, /* 0x44-0x47 */ + 0x95, 0x77, 0x95, 0x78, 0x95, 0x79, 0xBD, 0xFA, /* 0x48-0x4B */ + 0xC9, 0xCE, 0x95, 0x7A, 0x95, 0x7B, 0xEA, 0xCC, /* 0x4C-0x4F */ + 0x95, 0x7C, 0x95, 0x7D, 0xC9, 0xB9, 0xCF, 0xFE, /* 0x50-0x53 */ + 0xEA, 0xCA, 0xD4, 0xCE, 0xEA, 0xCD, 0xEA, 0xCF, /* 0x54-0x57 */ + 0x95, 0x7E, 0x95, 0x80, 0xCD, 0xED, 0x95, 0x81, /* 0x58-0x5B */ + 0x95, 0x82, 0x95, 0x83, 0x95, 0x84, 0xEA, 0xC9, /* 0x5C-0x5F */ + 0x95, 0x85, 0xEA, 0xCE, 0x95, 0x86, 0x95, 0x87, /* 0x60-0x63 */ + 0xCE, 0xEE, 0x95, 0x88, 0xBB, 0xDE, 0x95, 0x89, /* 0x64-0x67 */ + 0xB3, 0xBF, 0x95, 0x8A, 0x95, 0x8B, 0x95, 0x8C, /* 0x68-0x6B */ + 0x95, 0x8D, 0x95, 0x8E, 0xC6, 0xD5, 0xBE, 0xB0, /* 0x6C-0x6F */ + 0xCE, 0xFA, 0x95, 0x8F, 0x95, 0x90, 0x95, 0x91, /* 0x70-0x73 */ + 0xC7, 0xE7, 0x95, 0x92, 0xBE, 0xA7, 0xEA, 0xD0, /* 0x74-0x77 */ + 0x95, 0x93, 0x95, 0x94, 0xD6, 0xC7, 0x95, 0x95, /* 0x78-0x7B */ + 0x95, 0x96, 0x95, 0x97, 0xC1, 0xC0, 0x95, 0x98, /* 0x7C-0x7F */ + + 0x95, 0x99, 0x95, 0x9A, 0xD4, 0xDD, 0x95, 0x9B, /* 0x80-0x83 */ + 0xEA, 0xD1, 0x95, 0x9C, 0x95, 0x9D, 0xCF, 0xBE, /* 0x84-0x87 */ + 0x95, 0x9E, 0x95, 0x9F, 0x95, 0xA0, 0x95, 0xA1, /* 0x88-0x8B */ + 0xEA, 0xD2, 0x95, 0xA2, 0x95, 0xA3, 0x95, 0xA4, /* 0x8C-0x8F */ + 0x95, 0xA5, 0xCA, 0xEE, 0x95, 0xA6, 0x95, 0xA7, /* 0x90-0x93 */ + 0x95, 0xA8, 0x95, 0xA9, 0xC5, 0xAF, 0xB0, 0xB5, /* 0x94-0x97 */ + 0x95, 0xAA, 0x95, 0xAB, 0x95, 0xAC, 0x95, 0xAD, /* 0x98-0x9B */ + 0x95, 0xAE, 0xEA, 0xD4, 0x95, 0xAF, 0x95, 0xB0, /* 0x9C-0x9F */ + 0x95, 0xB1, 0x95, 0xB2, 0x95, 0xB3, 0x95, 0xB4, /* 0xA0-0xA3 */ + 0x95, 0xB5, 0x95, 0xB6, 0x95, 0xB7, 0xEA, 0xD3, /* 0xA4-0xA7 */ + 0xF4, 0xDF, 0x95, 0xB8, 0x95, 0xB9, 0x95, 0xBA, /* 0xA8-0xAB */ + 0x95, 0xBB, 0x95, 0xBC, 0xC4, 0xBA, 0x95, 0xBD, /* 0xAC-0xAF */ + 0x95, 0xBE, 0x95, 0xBF, 0x95, 0xC0, 0x95, 0xC1, /* 0xB0-0xB3 */ + 0xB1, 0xA9, 0x95, 0xC2, 0x95, 0xC3, 0x95, 0xC4, /* 0xB4-0xB7 */ + 0x95, 0xC5, 0xE5, 0xDF, 0x95, 0xC6, 0x95, 0xC7, /* 0xB8-0xBB */ + 0x95, 0xC8, 0x95, 0xC9, 0xEA, 0xD5, 0x95, 0xCA, /* 0xBC-0xBF */ + 0x95, 0xCB, 0x95, 0xCC, 0x95, 0xCD, 0x95, 0xCE, /* 0xC0-0xC3 */ + 0x95, 0xCF, 0x95, 0xD0, 0x95, 0xD1, 0x95, 0xD2, /* 0xC4-0xC7 */ + 0x95, 0xD3, 0x95, 0xD4, 0x95, 0xD5, 0x95, 0xD6, /* 0xC8-0xCB */ + 0x95, 0xD7, 0x95, 0xD8, 0x95, 0xD9, 0x95, 0xDA, /* 0xCC-0xCF */ + 0x95, 0xDB, 0x95, 0xDC, 0x95, 0xDD, 0x95, 0xDE, /* 0xD0-0xD3 */ + 0x95, 0xDF, 0x95, 0xE0, 0x95, 0xE1, 0x95, 0xE2, /* 0xD4-0xD7 */ + 0x95, 0xE3, 0xCA, 0xEF, 0x95, 0xE4, 0xEA, 0xD6, /* 0xD8-0xDB */ + 0xEA, 0xD7, 0xC6, 0xD8, 0x95, 0xE5, 0x95, 0xE6, /* 0xDC-0xDF */ + 0x95, 0xE7, 0x95, 0xE8, 0x95, 0xE9, 0x95, 0xEA, /* 0xE0-0xE3 */ + 0x95, 0xEB, 0x95, 0xEC, 0xEA, 0xD8, 0x95, 0xED, /* 0xE4-0xE7 */ + 0x95, 0xEE, 0xEA, 0xD9, 0x95, 0xEF, 0x95, 0xF0, /* 0xE8-0xEB */ + 0x95, 0xF1, 0x95, 0xF2, 0x95, 0xF3, 0x95, 0xF4, /* 0xEC-0xEF */ + 0xD4, 0xBB, 0x95, 0xF5, 0xC7, 0xFA, 0xD2, 0xB7, /* 0xF0-0xF3 */ + 0xB8, 0xFC, 0x95, 0xF6, 0x95, 0xF7, 0xEA, 0xC2, /* 0xF4-0xF7 */ + 0x95, 0xF8, 0xB2, 0xDC, 0x95, 0xF9, 0x95, 0xFA, /* 0xF8-0xFB */ + 0xC2, 0xFC, 0x95, 0xFB, 0xD4, 0xF8, 0xCC, 0xE6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0xD7, 0xEE, 0x95, 0xFC, 0x95, 0xFD, 0x95, 0xFE, /* 0x00-0x03 */ + 0x96, 0x40, 0x96, 0x41, 0x96, 0x42, 0x96, 0x43, /* 0x04-0x07 */ + 0xD4, 
0xC2, 0xD3, 0xD0, 0xEB, 0xC3, 0xC5, 0xF3, /* 0x08-0x0B */ + 0x96, 0x44, 0xB7, 0xFE, 0x96, 0x45, 0x96, 0x46, /* 0x0C-0x0F */ + 0xEB, 0xD4, 0x96, 0x47, 0x96, 0x48, 0x96, 0x49, /* 0x10-0x13 */ + 0xCB, 0xB7, 0xEB, 0xDE, 0x96, 0x4A, 0xC0, 0xCA, /* 0x14-0x17 */ + 0x96, 0x4B, 0x96, 0x4C, 0x96, 0x4D, 0xCD, 0xFB, /* 0x18-0x1B */ + 0x96, 0x4E, 0xB3, 0xAF, 0x96, 0x4F, 0xC6, 0xDA, /* 0x1C-0x1F */ + 0x96, 0x50, 0x96, 0x51, 0x96, 0x52, 0x96, 0x53, /* 0x20-0x23 */ + 0x96, 0x54, 0x96, 0x55, 0xEB, 0xFC, 0x96, 0x56, /* 0x24-0x27 */ + 0xC4, 0xBE, 0x96, 0x57, 0xCE, 0xB4, 0xC4, 0xA9, /* 0x28-0x2B */ + 0xB1, 0xBE, 0xD4, 0xFD, 0x96, 0x58, 0xCA, 0xF5, /* 0x2C-0x2F */ + 0x96, 0x59, 0xD6, 0xEC, 0x96, 0x5A, 0x96, 0x5B, /* 0x30-0x33 */ + 0xC6, 0xD3, 0xB6, 0xE4, 0x96, 0x5C, 0x96, 0x5D, /* 0x34-0x37 */ + 0x96, 0x5E, 0x96, 0x5F, 0xBB, 0xFA, 0x96, 0x60, /* 0x38-0x3B */ + 0x96, 0x61, 0xD0, 0xE0, 0x96, 0x62, 0x96, 0x63, /* 0x3C-0x3F */ + 0xC9, 0xB1, 0x96, 0x64, 0xD4, 0xD3, 0xC8, 0xA8, /* 0x40-0x43 */ + 0x96, 0x65, 0x96, 0x66, 0xB8, 0xCB, 0x96, 0x67, /* 0x44-0x47 */ + 0xE8, 0xBE, 0xC9, 0xBC, 0x96, 0x68, 0x96, 0x69, /* 0x48-0x4B */ + 0xE8, 0xBB, 0x96, 0x6A, 0xC0, 0xEE, 0xD0, 0xD3, /* 0x4C-0x4F */ + 0xB2, 0xC4, 0xB4, 0xE5, 0x96, 0x6B, 0xE8, 0xBC, /* 0x50-0x53 */ + 0x96, 0x6C, 0x96, 0x6D, 0xD5, 0xC8, 0x96, 0x6E, /* 0x54-0x57 */ + 0x96, 0x6F, 0x96, 0x70, 0x96, 0x71, 0x96, 0x72, /* 0x58-0x5B */ + 0xB6, 0xC5, 0x96, 0x73, 0xE8, 0xBD, 0xCA, 0xF8, /* 0x5C-0x5F */ + 0xB8, 0xDC, 0xCC, 0xF5, 0x96, 0x74, 0x96, 0x75, /* 0x60-0x63 */ + 0x96, 0x76, 0xC0, 0xB4, 0x96, 0x77, 0x96, 0x78, /* 0x64-0x67 */ + 0xD1, 0xEE, 0xE8, 0xBF, 0xE8, 0xC2, 0x96, 0x79, /* 0x68-0x6B */ + 0x96, 0x7A, 0xBA, 0xBC, 0x96, 0x7B, 0xB1, 0xAD, /* 0x6C-0x6F */ + 0xBD, 0xDC, 0x96, 0x7C, 0xEA, 0xBD, 0xE8, 0xC3, /* 0x70-0x73 */ + 0x96, 0x7D, 0xE8, 0xC6, 0x96, 0x7E, 0xE8, 0xCB, /* 0x74-0x77 */ + 0x96, 0x80, 0x96, 0x81, 0x96, 0x82, 0x96, 0x83, /* 0x78-0x7B */ + 0xE8, 0xCC, 0x96, 0x84, 0xCB, 0xC9, 0xB0, 0xE5, /* 0x7C-0x7F */ + + 0x96, 0x85, 0xBC, 0xAB, 0x96, 0x86, 0x96, 0x87, /* 0x80-0x83 */ + 0xB9, 0xB9, 0x96, 0x88, 0x96, 0x89, 0xE8, 0xC1, /* 0x84-0x87 */ + 0x96, 0x8A, 0xCD, 0xF7, 0x96, 0x8B, 0xE8, 0xCA, /* 0x88-0x8B */ + 0x96, 0x8C, 0x96, 0x8D, 0x96, 0x8E, 0x96, 0x8F, /* 0x8C-0x8F */ + 0xCE, 0xF6, 0x96, 0x90, 0x96, 0x91, 0x96, 0x92, /* 0x90-0x93 */ + 0x96, 0x93, 0xD5, 0xED, 0x96, 0x94, 0xC1, 0xD6, /* 0x94-0x97 */ + 0xE8, 0xC4, 0x96, 0x95, 0xC3, 0xB6, 0x96, 0x96, /* 0x98-0x9B */ + 0xB9, 0xFB, 0xD6, 0xA6, 0xE8, 0xC8, 0x96, 0x97, /* 0x9C-0x9F */ + 0x96, 0x98, 0x96, 0x99, 0xCA, 0xE0, 0xD4, 0xE6, /* 0xA0-0xA3 */ + 0x96, 0x9A, 0xE8, 0xC0, 0x96, 0x9B, 0xE8, 0xC5, /* 0xA4-0xA7 */ + 0xE8, 0xC7, 0x96, 0x9C, 0xC7, 0xB9, 0xB7, 0xE3, /* 0xA8-0xAB */ + 0x96, 0x9D, 0xE8, 0xC9, 0x96, 0x9E, 0xBF, 0xDD, /* 0xAC-0xAF */ + 0xE8, 0xD2, 0x96, 0x9F, 0x96, 0xA0, 0xE8, 0xD7, /* 0xB0-0xB3 */ + 0x96, 0xA1, 0xE8, 0xD5, 0xBC, 0xDC, 0xBC, 0xCF, /* 0xB4-0xB7 */ + 0xE8, 0xDB, 0x96, 0xA2, 0x96, 0xA3, 0x96, 0xA4, /* 0xB8-0xBB */ + 0x96, 0xA5, 0x96, 0xA6, 0x96, 0xA7, 0x96, 0xA8, /* 0xBC-0xBF */ + 0x96, 0xA9, 0xE8, 0xDE, 0x96, 0xAA, 0xE8, 0xDA, /* 0xC0-0xC3 */ + 0xB1, 0xFA, 0x96, 0xAB, 0x96, 0xAC, 0x96, 0xAD, /* 0xC4-0xC7 */ + 0x96, 0xAE, 0x96, 0xAF, 0x96, 0xB0, 0x96, 0xB1, /* 0xC8-0xCB */ + 0x96, 0xB2, 0x96, 0xB3, 0x96, 0xB4, 0xB0, 0xD8, /* 0xCC-0xCF */ + 0xC4, 0xB3, 0xB8, 0xCC, 0xC6, 0xE2, 0xC8, 0xBE, /* 0xD0-0xD3 */ + 0xC8, 0xE1, 0x96, 0xB5, 0x96, 0xB6, 0x96, 0xB7, /* 0xD4-0xD7 */ + 0xE8, 0xCF, 0xE8, 0xD4, 0xE8, 0xD6, 0x96, 0xB8, /* 0xD8-0xDB */ + 0xB9, 0xF1, 0xE8, 0xD8, 0xD7, 0xF5, 0x96, 0xB9, /* 0xDC-0xDF 
*/ + 0xC4, 0xFB, 0x96, 0xBA, 0xE8, 0xDC, 0x96, 0xBB, /* 0xE0-0xE3 */ + 0x96, 0xBC, 0xB2, 0xE9, 0x96, 0xBD, 0x96, 0xBE, /* 0xE4-0xE7 */ + 0x96, 0xBF, 0xE8, 0xD1, 0x96, 0xC0, 0x96, 0xC1, /* 0xE8-0xEB */ + 0xBC, 0xED, 0x96, 0xC2, 0x96, 0xC3, 0xBF, 0xC2, /* 0xEC-0xEF */ + 0xE8, 0xCD, 0xD6, 0xF9, 0x96, 0xC4, 0xC1, 0xF8, /* 0xF0-0xF3 */ + 0xB2, 0xF1, 0x96, 0xC5, 0x96, 0xC6, 0x96, 0xC7, /* 0xF4-0xF7 */ + 0x96, 0xC8, 0x96, 0xC9, 0x96, 0xCA, 0x96, 0xCB, /* 0xF8-0xFB */ + 0x96, 0xCC, 0xE8, 0xDF, 0x96, 0xCD, 0xCA, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0xE8, 0xD9, 0x96, 0xCE, 0x96, 0xCF, 0x96, 0xD0, /* 0x00-0x03 */ + 0x96, 0xD1, 0xD5, 0xA4, 0x96, 0xD2, 0xB1, 0xEA, /* 0x04-0x07 */ + 0xD5, 0xBB, 0xE8, 0xCE, 0xE8, 0xD0, 0xB6, 0xB0, /* 0x08-0x0B */ + 0xE8, 0xD3, 0x96, 0xD3, 0xE8, 0xDD, 0xC0, 0xB8, /* 0x0C-0x0F */ + 0x96, 0xD4, 0xCA, 0xF7, 0x96, 0xD5, 0xCB, 0xA8, /* 0x10-0x13 */ + 0x96, 0xD6, 0x96, 0xD7, 0xC6, 0xDC, 0xC0, 0xF5, /* 0x14-0x17 */ + 0x96, 0xD8, 0x96, 0xD9, 0x96, 0xDA, 0x96, 0xDB, /* 0x18-0x1B */ + 0x96, 0xDC, 0xE8, 0xE9, 0x96, 0xDD, 0x96, 0xDE, /* 0x1C-0x1F */ + 0x96, 0xDF, 0xD0, 0xA3, 0x96, 0xE0, 0x96, 0xE1, /* 0x20-0x23 */ + 0x96, 0xE2, 0x96, 0xE3, 0x96, 0xE4, 0x96, 0xE5, /* 0x24-0x27 */ + 0x96, 0xE6, 0xE8, 0xF2, 0xD6, 0xEA, 0x96, 0xE7, /* 0x28-0x2B */ + 0x96, 0xE8, 0x96, 0xE9, 0x96, 0xEA, 0x96, 0xEB, /* 0x2C-0x2F */ + 0x96, 0xEC, 0x96, 0xED, 0xE8, 0xE0, 0xE8, 0xE1, /* 0x30-0x33 */ + 0x96, 0xEE, 0x96, 0xEF, 0x96, 0xF0, 0xD1, 0xF9, /* 0x34-0x37 */ + 0xBA, 0xCB, 0xB8, 0xF9, 0x96, 0xF1, 0x96, 0xF2, /* 0x38-0x3B */ + 0xB8, 0xF1, 0xD4, 0xD4, 0xE8, 0xEF, 0x96, 0xF3, /* 0x3C-0x3F */ + 0xE8, 0xEE, 0xE8, 0xEC, 0xB9, 0xF0, 0xCC, 0xD2, /* 0x40-0x43 */ + 0xE8, 0xE6, 0xCE, 0xA6, 0xBF, 0xF2, 0x96, 0xF4, /* 0x44-0x47 */ + 0xB0, 0xB8, 0xE8, 0xF1, 0xE8, 0xF0, 0x96, 0xF5, /* 0x48-0x4B */ + 0xD7, 0xC0, 0x96, 0xF6, 0xE8, 0xE4, 0x96, 0xF7, /* 0x4C-0x4F */ + 0xCD, 0xA9, 0xC9, 0xA3, 0x96, 0xF8, 0xBB, 0xB8, /* 0x50-0x53 */ + 0xBD, 0xDB, 0xE8, 0xEA, 0x96, 0xF9, 0x96, 0xFA, /* 0x54-0x57 */ + 0x96, 0xFB, 0x96, 0xFC, 0x96, 0xFD, 0x96, 0xFE, /* 0x58-0x5B */ + 0x97, 0x40, 0x97, 0x41, 0x97, 0x42, 0x97, 0x43, /* 0x5C-0x5F */ + 0xE8, 0xE2, 0xE8, 0xE3, 0xE8, 0xE5, 0xB5, 0xB5, /* 0x60-0x63 */ + 0xE8, 0xE7, 0xC7, 0xC5, 0xE8, 0xEB, 0xE8, 0xED, /* 0x64-0x67 */ + 0xBD, 0xB0, 0xD7, 0xAE, 0x97, 0x44, 0xE8, 0xF8, /* 0x68-0x6B */ + 0x97, 0x45, 0x97, 0x46, 0x97, 0x47, 0x97, 0x48, /* 0x6C-0x6F */ + 0x97, 0x49, 0x97, 0x4A, 0x97, 0x4B, 0x97, 0x4C, /* 0x70-0x73 */ + 0xE8, 0xF5, 0x97, 0x4D, 0xCD, 0xB0, 0xE8, 0xF6, /* 0x74-0x77 */ + 0x97, 0x4E, 0x97, 0x4F, 0x97, 0x50, 0x97, 0x51, /* 0x78-0x7B */ + 0x97, 0x52, 0x97, 0x53, 0x97, 0x54, 0x97, 0x55, /* 0x7C-0x7F */ + + 0x97, 0x56, 0xC1, 0xBA, 0x97, 0x57, 0xE8, 0xE8, /* 0x80-0x83 */ + 0x97, 0x58, 0xC3, 0xB7, 0xB0, 0xF0, 0x97, 0x59, /* 0x84-0x87 */ + 0x97, 0x5A, 0x97, 0x5B, 0x97, 0x5C, 0x97, 0x5D, /* 0x88-0x8B */ + 0x97, 0x5E, 0x97, 0x5F, 0x97, 0x60, 0xE8, 0xF4, /* 0x8C-0x8F */ + 0x97, 0x61, 0x97, 0x62, 0x97, 0x63, 0xE8, 0xF7, /* 0x90-0x93 */ + 0x97, 0x64, 0x97, 0x65, 0x97, 0x66, 0xB9, 0xA3, /* 0x94-0x97 */ + 0x97, 0x67, 0x97, 0x68, 0x97, 0x69, 0x97, 0x6A, /* 0x98-0x9B */ + 0x97, 0x6B, 0x97, 0x6C, 0x97, 0x6D, 0x97, 0x6E, /* 0x9C-0x9F */ + 0x97, 0x6F, 0x97, 0x70, 0xC9, 0xD2, 0x97, 0x71, /* 0xA0-0xA3 */ + 0x97, 0x72, 0x97, 0x73, 0xC3, 0xCE, 0xCE, 0xE0, /* 0xA4-0xA7 */ + 0xC0, 0xE6, 0x97, 0x74, 0x97, 0x75, 0x97, 0x76, /* 0xA8-0xAB */ + 0x97, 0x77, 0xCB, 0xF3, 0x97, 0x78, 0xCC, 0xDD, /* 0xAC-0xAF */ + 0xD0, 0xB5, 0x97, 0x79, 0x97, 0x7A, 0xCA, 0xE1, /* 0xB0-0xB3 */ + 
0x97, 0x7B, 0xE8, 0xF3, 0x97, 0x7C, 0x97, 0x7D, /* 0xB4-0xB7 */ + 0x97, 0x7E, 0x97, 0x80, 0x97, 0x81, 0x97, 0x82, /* 0xB8-0xBB */ + 0x97, 0x83, 0x97, 0x84, 0x97, 0x85, 0x97, 0x86, /* 0xBC-0xBF */ + 0xBC, 0xEC, 0x97, 0x87, 0xE8, 0xF9, 0x97, 0x88, /* 0xC0-0xC3 */ + 0x97, 0x89, 0x97, 0x8A, 0x97, 0x8B, 0x97, 0x8C, /* 0xC4-0xC7 */ + 0x97, 0x8D, 0xC3, 0xDE, 0x97, 0x8E, 0xC6, 0xE5, /* 0xC8-0xCB */ + 0x97, 0x8F, 0xB9, 0xF7, 0x97, 0x90, 0x97, 0x91, /* 0xCC-0xCF */ + 0x97, 0x92, 0x97, 0x93, 0xB0, 0xF4, 0x97, 0x94, /* 0xD0-0xD3 */ + 0x97, 0x95, 0xD7, 0xD8, 0x97, 0x96, 0x97, 0x97, /* 0xD4-0xD7 */ + 0xBC, 0xAC, 0x97, 0x98, 0xC5, 0xEF, 0x97, 0x99, /* 0xD8-0xDB */ + 0x97, 0x9A, 0x97, 0x9B, 0x97, 0x9C, 0x97, 0x9D, /* 0xDC-0xDF */ + 0xCC, 0xC4, 0x97, 0x9E, 0x97, 0x9F, 0xE9, 0xA6, /* 0xE0-0xE3 */ + 0x97, 0xA0, 0x97, 0xA1, 0x97, 0xA2, 0x97, 0xA3, /* 0xE4-0xE7 */ + 0x97, 0xA4, 0x97, 0xA5, 0x97, 0xA6, 0x97, 0xA7, /* 0xE8-0xEB */ + 0x97, 0xA8, 0x97, 0xA9, 0xC9, 0xAD, 0x97, 0xAA, /* 0xEC-0xEF */ + 0xE9, 0xA2, 0xC0, 0xE2, 0x97, 0xAB, 0x97, 0xAC, /* 0xF0-0xF3 */ + 0x97, 0xAD, 0xBF, 0xC3, 0x97, 0xAE, 0x97, 0xAF, /* 0xF4-0xF7 */ + 0x97, 0xB0, 0xE8, 0xFE, 0xB9, 0xD7, 0x97, 0xB1, /* 0xF8-0xFB */ + 0xE8, 0xFB, 0x97, 0xB2, 0x97, 0xB3, 0x97, 0xB4, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_69[512] = { + 0x97, 0xB5, 0xE9, 0xA4, 0x97, 0xB6, 0x97, 0xB7, /* 0x00-0x03 */ + 0x97, 0xB8, 0xD2, 0xCE, 0x97, 0xB9, 0x97, 0xBA, /* 0x04-0x07 */ + 0x97, 0xBB, 0x97, 0xBC, 0x97, 0xBD, 0xE9, 0xA3, /* 0x08-0x0B */ + 0x97, 0xBE, 0xD6, 0xB2, 0xD7, 0xB5, 0x97, 0xBF, /* 0x0C-0x0F */ + 0xE9, 0xA7, 0x97, 0xC0, 0xBD, 0xB7, 0x97, 0xC1, /* 0x10-0x13 */ + 0x97, 0xC2, 0x97, 0xC3, 0x97, 0xC4, 0x97, 0xC5, /* 0x14-0x17 */ + 0x97, 0xC6, 0x97, 0xC7, 0x97, 0xC8, 0x97, 0xC9, /* 0x18-0x1B */ + 0x97, 0xCA, 0x97, 0xCB, 0x97, 0xCC, 0xE8, 0xFC, /* 0x1C-0x1F */ + 0xE8, 0xFD, 0x97, 0xCD, 0x97, 0xCE, 0x97, 0xCF, /* 0x20-0x23 */ + 0xE9, 0xA1, 0x97, 0xD0, 0x97, 0xD1, 0x97, 0xD2, /* 0x24-0x27 */ + 0x97, 0xD3, 0x97, 0xD4, 0x97, 0xD5, 0x97, 0xD6, /* 0x28-0x2B */ + 0x97, 0xD7, 0xCD, 0xD6, 0x97, 0xD8, 0x97, 0xD9, /* 0x2C-0x2F */ + 0xD2, 0xAC, 0x97, 0xDA, 0x97, 0xDB, 0x97, 0xDC, /* 0x30-0x33 */ + 0xE9, 0xB2, 0x97, 0xDD, 0x97, 0xDE, 0x97, 0xDF, /* 0x34-0x37 */ + 0x97, 0xE0, 0xE9, 0xA9, 0x97, 0xE1, 0x97, 0xE2, /* 0x38-0x3B */ + 0x97, 0xE3, 0xB4, 0xAA, 0x97, 0xE4, 0xB4, 0xBB, /* 0x3C-0x3F */ + 0x97, 0xE5, 0x97, 0xE6, 0xE9, 0xAB, 0x97, 0xE7, /* 0x40-0x43 */ + 0x97, 0xE8, 0x97, 0xE9, 0x97, 0xEA, 0x97, 0xEB, /* 0x44-0x47 */ + 0x97, 0xEC, 0x97, 0xED, 0x97, 0xEE, 0x97, 0xEF, /* 0x48-0x4B */ + 0x97, 0xF0, 0x97, 0xF1, 0x97, 0xF2, 0x97, 0xF3, /* 0x4C-0x4F */ + 0x97, 0xF4, 0x97, 0xF5, 0x97, 0xF6, 0x97, 0xF7, /* 0x50-0x53 */ + 0xD0, 0xA8, 0x97, 0xF8, 0x97, 0xF9, 0xE9, 0xA5, /* 0x54-0x57 */ + 0x97, 0xFA, 0x97, 0xFB, 0xB3, 0xFE, 0x97, 0xFC, /* 0x58-0x5B */ + 0x97, 0xFD, 0xE9, 0xAC, 0xC0, 0xE3, 0x97, 0xFE, /* 0x5C-0x5F */ + 0xE9, 0xAA, 0x98, 0x40, 0x98, 0x41, 0xE9, 0xB9, /* 0x60-0x63 */ + 0x98, 0x42, 0x98, 0x43, 0xE9, 0xB8, 0x98, 0x44, /* 0x64-0x67 */ + 0x98, 0x45, 0x98, 0x46, 0x98, 0x47, 0xE9, 0xAE, /* 0x68-0x6B */ + 0x98, 0x48, 0x98, 0x49, 0xE8, 0xFA, 0x98, 0x4A, /* 0x6C-0x6F */ + 0x98, 0x4B, 0xE9, 0xA8, 0x98, 0x4C, 0x98, 0x4D, /* 0x70-0x73 */ + 0x98, 0x4E, 0x98, 0x4F, 0x98, 0x50, 0xBF, 0xAC, /* 0x74-0x77 */ + 0xE9, 0xB1, 0xE9, 0xBA, 0x98, 0x51, 0x98, 0x52, /* 0x78-0x7B */ + 0xC2, 0xA5, 0x98, 0x53, 0x98, 0x54, 0x98, 0x55, /* 0x7C-0x7F */ + + 0xE9, 0xAF, 0x98, 0x56, 0xB8, 0xC5, 0x98, 0x57, /* 0x80-0x83 */ + 0xE9, 0xAD, 0x98, 0x58, 0xD3, 0xDC, 0xE9, 0xB4, /* 0x84-0x87 */ + 
0xE9, 0xB5, 0xE9, 0xB7, 0x98, 0x59, 0x98, 0x5A, /* 0x88-0x8B */ + 0x98, 0x5B, 0xE9, 0xC7, 0x98, 0x5C, 0x98, 0x5D, /* 0x8C-0x8F */ + 0x98, 0x5E, 0x98, 0x5F, 0x98, 0x60, 0x98, 0x61, /* 0x90-0x93 */ + 0xC0, 0xC6, 0xE9, 0xC5, 0x98, 0x62, 0x98, 0x63, /* 0x94-0x97 */ + 0xE9, 0xB0, 0x98, 0x64, 0x98, 0x65, 0xE9, 0xBB, /* 0x98-0x9B */ + 0xB0, 0xF1, 0x98, 0x66, 0x98, 0x67, 0x98, 0x68, /* 0x9C-0x9F */ + 0x98, 0x69, 0x98, 0x6A, 0x98, 0x6B, 0x98, 0x6C, /* 0xA0-0xA3 */ + 0x98, 0x6D, 0x98, 0x6E, 0x98, 0x6F, 0xE9, 0xBC, /* 0xA4-0xA7 */ + 0xD5, 0xA5, 0x98, 0x70, 0x98, 0x71, 0xE9, 0xBE, /* 0xA8-0xAB */ + 0x98, 0x72, 0xE9, 0xBF, 0x98, 0x73, 0x98, 0x74, /* 0xAC-0xAF */ + 0x98, 0x75, 0xE9, 0xC1, 0x98, 0x76, 0x98, 0x77, /* 0xB0-0xB3 */ + 0xC1, 0xF1, 0x98, 0x78, 0x98, 0x79, 0xC8, 0xB6, /* 0xB4-0xB7 */ + 0x98, 0x7A, 0x98, 0x7B, 0x98, 0x7C, 0xE9, 0xBD, /* 0xB8-0xBB */ + 0x98, 0x7D, 0x98, 0x7E, 0x98, 0x80, 0x98, 0x81, /* 0xBC-0xBF */ + 0x98, 0x82, 0xE9, 0xC2, 0x98, 0x83, 0x98, 0x84, /* 0xC0-0xC3 */ + 0x98, 0x85, 0x98, 0x86, 0x98, 0x87, 0x98, 0x88, /* 0xC4-0xC7 */ + 0x98, 0x89, 0x98, 0x8A, 0xE9, 0xC3, 0x98, 0x8B, /* 0xC8-0xCB */ + 0xE9, 0xB3, 0x98, 0x8C, 0xE9, 0xB6, 0x98, 0x8D, /* 0xCC-0xCF */ + 0xBB, 0xB1, 0x98, 0x8E, 0x98, 0x8F, 0x98, 0x90, /* 0xD0-0xD3 */ + 0xE9, 0xC0, 0x98, 0x91, 0x98, 0x92, 0x98, 0x93, /* 0xD4-0xD7 */ + 0x98, 0x94, 0x98, 0x95, 0x98, 0x96, 0xBC, 0xF7, /* 0xD8-0xDB */ + 0x98, 0x97, 0x98, 0x98, 0x98, 0x99, 0xE9, 0xC4, /* 0xDC-0xDF */ + 0xE9, 0xC6, 0x98, 0x9A, 0x98, 0x9B, 0x98, 0x9C, /* 0xE0-0xE3 */ + 0x98, 0x9D, 0x98, 0x9E, 0x98, 0x9F, 0x98, 0xA0, /* 0xE4-0xE7 */ + 0x98, 0xA1, 0x98, 0xA2, 0x98, 0xA3, 0x98, 0xA4, /* 0xE8-0xEB */ + 0x98, 0xA5, 0xE9, 0xCA, 0x98, 0xA6, 0x98, 0xA7, /* 0xEC-0xEF */ + 0x98, 0xA8, 0x98, 0xA9, 0xE9, 0xCE, 0x98, 0xAA, /* 0xF0-0xF3 */ + 0x98, 0xAB, 0x98, 0xAC, 0x98, 0xAD, 0x98, 0xAE, /* 0xF4-0xF7 */ + 0x98, 0xAF, 0x98, 0xB0, 0x98, 0xB1, 0x98, 0xB2, /* 0xF8-0xFB */ + 0x98, 0xB3, 0xB2, 0xDB, 0x98, 0xB4, 0xE9, 0xC8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0x98, 0xB5, 0x98, 0xB6, 0x98, 0xB7, 0x98, 0xB8, /* 0x00-0x03 */ + 0x98, 0xB9, 0x98, 0xBA, 0x98, 0xBB, 0x98, 0xBC, /* 0x04-0x07 */ + 0x98, 0xBD, 0x98, 0xBE, 0xB7, 0xAE, 0x98, 0xBF, /* 0x08-0x0B */ + 0x98, 0xC0, 0x98, 0xC1, 0x98, 0xC2, 0x98, 0xC3, /* 0x0C-0x0F */ + 0x98, 0xC4, 0x98, 0xC5, 0x98, 0xC6, 0x98, 0xC7, /* 0x10-0x13 */ + 0x98, 0xC8, 0x98, 0xC9, 0x98, 0xCA, 0xE9, 0xCB, /* 0x14-0x17 */ + 0xE9, 0xCC, 0x98, 0xCB, 0x98, 0xCC, 0x98, 0xCD, /* 0x18-0x1B */ + 0x98, 0xCE, 0x98, 0xCF, 0x98, 0xD0, 0xD5, 0xC1, /* 0x1C-0x1F */ + 0x98, 0xD1, 0xC4, 0xA3, 0x98, 0xD2, 0x98, 0xD3, /* 0x20-0x23 */ + 0x98, 0xD4, 0x98, 0xD5, 0x98, 0xD6, 0x98, 0xD7, /* 0x24-0x27 */ + 0xE9, 0xD8, 0x98, 0xD8, 0xBA, 0xE1, 0x98, 0xD9, /* 0x28-0x2B */ + 0x98, 0xDA, 0x98, 0xDB, 0x98, 0xDC, 0xE9, 0xC9, /* 0x2C-0x2F */ + 0x98, 0xDD, 0xD3, 0xA3, 0x98, 0xDE, 0x98, 0xDF, /* 0x30-0x33 */ + 0x98, 0xE0, 0xE9, 0xD4, 0x98, 0xE1, 0x98, 0xE2, /* 0x34-0x37 */ + 0x98, 0xE3, 0x98, 0xE4, 0x98, 0xE5, 0x98, 0xE6, /* 0x38-0x3B */ + 0x98, 0xE7, 0xE9, 0xD7, 0xE9, 0xD0, 0x98, 0xE8, /* 0x3C-0x3F */ + 0x98, 0xE9, 0x98, 0xEA, 0x98, 0xEB, 0x98, 0xEC, /* 0x40-0x43 */ + 0xE9, 0xCF, 0x98, 0xED, 0x98, 0xEE, 0xC7, 0xC1, /* 0x44-0x47 */ + 0x98, 0xEF, 0x98, 0xF0, 0x98, 0xF1, 0x98, 0xF2, /* 0x48-0x4B */ + 0x98, 0xF3, 0x98, 0xF4, 0x98, 0xF5, 0x98, 0xF6, /* 0x4C-0x4F */ + 0xE9, 0xD2, 0x98, 0xF7, 0x98, 0xF8, 0x98, 0xF9, /* 0x50-0x53 */ + 0x98, 0xFA, 0x98, 0xFB, 0x98, 0xFC, 0x98, 0xFD, /* 0x54-0x57 */ + 0xE9, 0xD9, 0xB3, 0xC8, 0x98, 0xFE, 0xE9, 0xD3, /* 0x58-0x5B */ + 0x99, 
0x40, 0x99, 0x41, 0x99, 0x42, 0x99, 0x43, /* 0x5C-0x5F */ + 0x99, 0x44, 0xCF, 0xF0, 0x99, 0x45, 0x99, 0x46, /* 0x60-0x63 */ + 0x99, 0x47, 0xE9, 0xCD, 0x99, 0x48, 0x99, 0x49, /* 0x64-0x67 */ + 0x99, 0x4A, 0x99, 0x4B, 0x99, 0x4C, 0x99, 0x4D, /* 0x68-0x6B */ + 0x99, 0x4E, 0x99, 0x4F, 0x99, 0x50, 0x99, 0x51, /* 0x6C-0x6F */ + 0x99, 0x52, 0xB3, 0xF7, 0x99, 0x53, 0x99, 0x54, /* 0x70-0x73 */ + 0x99, 0x55, 0x99, 0x56, 0x99, 0x57, 0x99, 0x58, /* 0x74-0x77 */ + 0x99, 0x59, 0xE9, 0xD6, 0x99, 0x5A, 0x99, 0x5B, /* 0x78-0x7B */ + 0xE9, 0xDA, 0x99, 0x5C, 0x99, 0x5D, 0x99, 0x5E, /* 0x7C-0x7F */ + + 0xCC, 0xB4, 0x99, 0x5F, 0x99, 0x60, 0x99, 0x61, /* 0x80-0x83 */ + 0xCF, 0xAD, 0x99, 0x62, 0x99, 0x63, 0x99, 0x64, /* 0x84-0x87 */ + 0x99, 0x65, 0x99, 0x66, 0x99, 0x67, 0x99, 0x68, /* 0x88-0x8B */ + 0x99, 0x69, 0x99, 0x6A, 0xE9, 0xD5, 0x99, 0x6B, /* 0x8C-0x8F */ + 0xE9, 0xDC, 0xE9, 0xDB, 0x99, 0x6C, 0x99, 0x6D, /* 0x90-0x93 */ + 0x99, 0x6E, 0x99, 0x6F, 0x99, 0x70, 0xE9, 0xDE, /* 0x94-0x97 */ + 0x99, 0x71, 0x99, 0x72, 0x99, 0x73, 0x99, 0x74, /* 0x98-0x9B */ + 0x99, 0x75, 0x99, 0x76, 0x99, 0x77, 0x99, 0x78, /* 0x9C-0x9F */ + 0xE9, 0xD1, 0x99, 0x79, 0x99, 0x7A, 0x99, 0x7B, /* 0xA0-0xA3 */ + 0x99, 0x7C, 0x99, 0x7D, 0x99, 0x7E, 0x99, 0x80, /* 0xA4-0xA7 */ + 0x99, 0x81, 0xE9, 0xDD, 0x99, 0x82, 0xE9, 0xDF, /* 0xA8-0xAB */ + 0xC3, 0xCA, 0x99, 0x83, 0x99, 0x84, 0x99, 0x85, /* 0xAC-0xAF */ + 0x99, 0x86, 0x99, 0x87, 0x99, 0x88, 0x99, 0x89, /* 0xB0-0xB3 */ + 0x99, 0x8A, 0x99, 0x8B, 0x99, 0x8C, 0x99, 0x8D, /* 0xB4-0xB7 */ + 0x99, 0x8E, 0x99, 0x8F, 0x99, 0x90, 0x99, 0x91, /* 0xB8-0xBB */ + 0x99, 0x92, 0x99, 0x93, 0x99, 0x94, 0x99, 0x95, /* 0xBC-0xBF */ + 0x99, 0x96, 0x99, 0x97, 0x99, 0x98, 0x99, 0x99, /* 0xC0-0xC3 */ + 0x99, 0x9A, 0x99, 0x9B, 0x99, 0x9C, 0x99, 0x9D, /* 0xC4-0xC7 */ + 0x99, 0x9E, 0x99, 0x9F, 0x99, 0xA0, 0x99, 0xA1, /* 0xC8-0xCB */ + 0x99, 0xA2, 0x99, 0xA3, 0x99, 0xA4, 0x99, 0xA5, /* 0xCC-0xCF */ + 0x99, 0xA6, 0x99, 0xA7, 0x99, 0xA8, 0x99, 0xA9, /* 0xD0-0xD3 */ + 0x99, 0xAA, 0x99, 0xAB, 0x99, 0xAC, 0x99, 0xAD, /* 0xD4-0xD7 */ + 0x99, 0xAE, 0x99, 0xAF, 0x99, 0xB0, 0x99, 0xB1, /* 0xD8-0xDB */ + 0x99, 0xB2, 0x99, 0xB3, 0x99, 0xB4, 0x99, 0xB5, /* 0xDC-0xDF */ + 0x99, 0xB6, 0x99, 0xB7, 0x99, 0xB8, 0x99, 0xB9, /* 0xE0-0xE3 */ + 0x99, 0xBA, 0x99, 0xBB, 0x99, 0xBC, 0x99, 0xBD, /* 0xE4-0xE7 */ + 0x99, 0xBE, 0x99, 0xBF, 0x99, 0xC0, 0x99, 0xC1, /* 0xE8-0xEB */ + 0x99, 0xC2, 0x99, 0xC3, 0x99, 0xC4, 0x99, 0xC5, /* 0xEC-0xEF */ + 0x99, 0xC6, 0x99, 0xC7, 0x99, 0xC8, 0x99, 0xC9, /* 0xF0-0xF3 */ + 0x99, 0xCA, 0x99, 0xCB, 0x99, 0xCC, 0x99, 0xCD, /* 0xF4-0xF7 */ + 0x99, 0xCE, 0x99, 0xCF, 0x99, 0xD0, 0x99, 0xD1, /* 0xF8-0xFB */ + 0x99, 0xD2, 0x99, 0xD3, 0x99, 0xD4, 0x99, 0xD5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6B[512] = { + 0x99, 0xD6, 0x99, 0xD7, 0x99, 0xD8, 0x99, 0xD9, /* 0x00-0x03 */ + 0x99, 0xDA, 0x99, 0xDB, 0x99, 0xDC, 0x99, 0xDD, /* 0x04-0x07 */ + 0x99, 0xDE, 0x99, 0xDF, 0x99, 0xE0, 0x99, 0xE1, /* 0x08-0x0B */ + 0x99, 0xE2, 0x99, 0xE3, 0x99, 0xE4, 0x99, 0xE5, /* 0x0C-0x0F */ + 0x99, 0xE6, 0x99, 0xE7, 0x99, 0xE8, 0x99, 0xE9, /* 0x10-0x13 */ + 0x99, 0xEA, 0x99, 0xEB, 0x99, 0xEC, 0x99, 0xED, /* 0x14-0x17 */ + 0x99, 0xEE, 0x99, 0xEF, 0x99, 0xF0, 0x99, 0xF1, /* 0x18-0x1B */ + 0x99, 0xF2, 0x99, 0xF3, 0x99, 0xF4, 0x99, 0xF5, /* 0x1C-0x1F */ + 0xC7, 0xB7, 0xB4, 0xCE, 0xBB, 0xB6, 0xD0, 0xC0, /* 0x20-0x23 */ + 0xEC, 0xA3, 0x99, 0xF6, 0x99, 0xF7, 0xC5, 0xB7, /* 0x24-0x27 */ + 0x99, 0xF8, 0x99, 0xF9, 0x99, 0xFA, 0x99, 0xFB, /* 0x28-0x2B */ + 0x99, 0xFC, 0x99, 0xFD, 0x99, 0xFE, 0x9A, 0x40, /* 0x2C-0x2F */ + 0x9A, 
0x41, 0x9A, 0x42, 0xD3, 0xFB, 0x9A, 0x43, /* 0x30-0x33 */ + 0x9A, 0x44, 0x9A, 0x45, 0x9A, 0x46, 0xEC, 0xA4, /* 0x34-0x37 */ + 0x9A, 0x47, 0xEC, 0xA5, 0xC6, 0xDB, 0x9A, 0x48, /* 0x38-0x3B */ + 0x9A, 0x49, 0x9A, 0x4A, 0xBF, 0xEE, 0x9A, 0x4B, /* 0x3C-0x3F */ + 0x9A, 0x4C, 0x9A, 0x4D, 0x9A, 0x4E, 0xEC, 0xA6, /* 0x40-0x43 */ + 0x9A, 0x4F, 0x9A, 0x50, 0xEC, 0xA7, 0xD0, 0xAA, /* 0x44-0x47 */ + 0x9A, 0x51, 0xC7, 0xB8, 0x9A, 0x52, 0x9A, 0x53, /* 0x48-0x4B */ + 0xB8, 0xE8, 0x9A, 0x54, 0x9A, 0x55, 0x9A, 0x56, /* 0x4C-0x4F */ + 0x9A, 0x57, 0x9A, 0x58, 0x9A, 0x59, 0x9A, 0x5A, /* 0x50-0x53 */ + 0x9A, 0x5B, 0x9A, 0x5C, 0x9A, 0x5D, 0x9A, 0x5E, /* 0x54-0x57 */ + 0x9A, 0x5F, 0xEC, 0xA8, 0x9A, 0x60, 0x9A, 0x61, /* 0x58-0x5B */ + 0x9A, 0x62, 0x9A, 0x63, 0x9A, 0x64, 0x9A, 0x65, /* 0x5C-0x5F */ + 0x9A, 0x66, 0x9A, 0x67, 0xD6, 0xB9, 0xD5, 0xFD, /* 0x60-0x63 */ + 0xB4, 0xCB, 0xB2, 0xBD, 0xCE, 0xE4, 0xC6, 0xE7, /* 0x64-0x67 */ + 0x9A, 0x68, 0x9A, 0x69, 0xCD, 0xE1, 0x9A, 0x6A, /* 0x68-0x6B */ + 0x9A, 0x6B, 0x9A, 0x6C, 0x9A, 0x6D, 0x9A, 0x6E, /* 0x6C-0x6F */ + 0x9A, 0x6F, 0x9A, 0x70, 0x9A, 0x71, 0x9A, 0x72, /* 0x70-0x73 */ + 0x9A, 0x73, 0x9A, 0x74, 0x9A, 0x75, 0x9A, 0x76, /* 0x74-0x77 */ + 0x9A, 0x77, 0xB4, 0xF5, 0x9A, 0x78, 0xCB, 0xC0, /* 0x78-0x7B */ + 0xBC, 0xDF, 0x9A, 0x79, 0x9A, 0x7A, 0x9A, 0x7B, /* 0x7C-0x7F */ + + 0x9A, 0x7C, 0xE9, 0xE2, 0xE9, 0xE3, 0xD1, 0xEA, /* 0x80-0x83 */ + 0xE9, 0xE5, 0x9A, 0x7D, 0xB4, 0xF9, 0xE9, 0xE4, /* 0x84-0x87 */ + 0x9A, 0x7E, 0xD1, 0xB3, 0xCA, 0xE2, 0xB2, 0xD0, /* 0x88-0x8B */ + 0x9A, 0x80, 0xE9, 0xE8, 0x9A, 0x81, 0x9A, 0x82, /* 0x8C-0x8F */ + 0x9A, 0x83, 0x9A, 0x84, 0xE9, 0xE6, 0xE9, 0xE7, /* 0x90-0x93 */ + 0x9A, 0x85, 0x9A, 0x86, 0xD6, 0xB3, 0x9A, 0x87, /* 0x94-0x97 */ + 0x9A, 0x88, 0x9A, 0x89, 0xE9, 0xE9, 0xE9, 0xEA, /* 0x98-0x9B */ + 0x9A, 0x8A, 0x9A, 0x8B, 0x9A, 0x8C, 0x9A, 0x8D, /* 0x9C-0x9F */ + 0x9A, 0x8E, 0xE9, 0xEB, 0x9A, 0x8F, 0x9A, 0x90, /* 0xA0-0xA3 */ + 0x9A, 0x91, 0x9A, 0x92, 0x9A, 0x93, 0x9A, 0x94, /* 0xA4-0xA7 */ + 0x9A, 0x95, 0x9A, 0x96, 0xE9, 0xEC, 0x9A, 0x97, /* 0xA8-0xAB */ + 0x9A, 0x98, 0x9A, 0x99, 0x9A, 0x9A, 0x9A, 0x9B, /* 0xAC-0xAF */ + 0x9A, 0x9C, 0x9A, 0x9D, 0x9A, 0x9E, 0xEC, 0xAF, /* 0xB0-0xB3 */ + 0xC5, 0xB9, 0xB6, 0xCE, 0x9A, 0x9F, 0xD2, 0xF3, /* 0xB4-0xB7 */ + 0x9A, 0xA0, 0x9A, 0xA1, 0x9A, 0xA2, 0x9A, 0xA3, /* 0xB8-0xBB */ + 0x9A, 0xA4, 0x9A, 0xA5, 0x9A, 0xA6, 0xB5, 0xEE, /* 0xBC-0xBF */ + 0x9A, 0xA7, 0xBB, 0xD9, 0xEC, 0xB1, 0x9A, 0xA8, /* 0xC0-0xC3 */ + 0x9A, 0xA9, 0xD2, 0xE3, 0x9A, 0xAA, 0x9A, 0xAB, /* 0xC4-0xC7 */ + 0x9A, 0xAC, 0x9A, 0xAD, 0x9A, 0xAE, 0xCE, 0xE3, /* 0xC8-0xCB */ + 0x9A, 0xAF, 0xC4, 0xB8, 0x9A, 0xB0, 0xC3, 0xBF, /* 0xCC-0xCF */ + 0x9A, 0xB1, 0x9A, 0xB2, 0xB6, 0xBE, 0xD8, 0xB9, /* 0xD0-0xD3 */ + 0xB1, 0xC8, 0xB1, 0xCF, 0xB1, 0xD1, 0xC5, 0xFE, /* 0xD4-0xD7 */ + 0x9A, 0xB3, 0xB1, 0xD0, 0x9A, 0xB4, 0xC3, 0xAB, /* 0xD8-0xDB */ + 0x9A, 0xB5, 0x9A, 0xB6, 0x9A, 0xB7, 0x9A, 0xB8, /* 0xDC-0xDF */ + 0x9A, 0xB9, 0xD5, 0xB1, 0x9A, 0xBA, 0x9A, 0xBB, /* 0xE0-0xE3 */ + 0x9A, 0xBC, 0x9A, 0xBD, 0x9A, 0xBE, 0x9A, 0xBF, /* 0xE4-0xE7 */ + 0x9A, 0xC0, 0x9A, 0xC1, 0xEB, 0xA4, 0xBA, 0xC1, /* 0xE8-0xEB */ + 0x9A, 0xC2, 0x9A, 0xC3, 0x9A, 0xC4, 0xCC, 0xBA, /* 0xEC-0xEF */ + 0x9A, 0xC5, 0x9A, 0xC6, 0x9A, 0xC7, 0xEB, 0xA5, /* 0xF0-0xF3 */ + 0x9A, 0xC8, 0xEB, 0xA7, 0x9A, 0xC9, 0x9A, 0xCA, /* 0xF4-0xF7 */ + 0x9A, 0xCB, 0xEB, 0xA8, 0x9A, 0xCC, 0x9A, 0xCD, /* 0xF8-0xFB */ + 0x9A, 0xCE, 0xEB, 0xA6, 0x9A, 0xCF, 0x9A, 0xD0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6C[512] = { + 0x9A, 0xD1, 0x9A, 0xD2, 0x9A, 0xD3, 0x9A, 0xD4, /* 0x00-0x03 */ + 0x9A, 
0xD5, 0xEB, 0xA9, 0xEB, 0xAB, 0xEB, 0xAA, /* 0x04-0x07 */ + 0x9A, 0xD6, 0x9A, 0xD7, 0x9A, 0xD8, 0x9A, 0xD9, /* 0x08-0x0B */ + 0x9A, 0xDA, 0xEB, 0xAC, 0x9A, 0xDB, 0xCA, 0xCF, /* 0x0C-0x0F */ + 0xD8, 0xB5, 0xC3, 0xF1, 0x9A, 0xDC, 0xC3, 0xA5, /* 0x10-0x13 */ + 0xC6, 0xF8, 0xEB, 0xAD, 0xC4, 0xCA, 0x9A, 0xDD, /* 0x14-0x17 */ + 0xEB, 0xAE, 0xEB, 0xAF, 0xEB, 0xB0, 0xB7, 0xD5, /* 0x18-0x1B */ + 0x9A, 0xDE, 0x9A, 0xDF, 0x9A, 0xE0, 0xB7, 0xFA, /* 0x1C-0x1F */ + 0x9A, 0xE1, 0xEB, 0xB1, 0xC7, 0xE2, 0x9A, 0xE2, /* 0x20-0x23 */ + 0xEB, 0xB3, 0x9A, 0xE3, 0xBA, 0xA4, 0xD1, 0xF5, /* 0x24-0x27 */ + 0xB0, 0xB1, 0xEB, 0xB2, 0xEB, 0xB4, 0x9A, 0xE4, /* 0x28-0x2B */ + 0x9A, 0xE5, 0x9A, 0xE6, 0xB5, 0xAA, 0xC2, 0xC8, /* 0x2C-0x2F */ + 0xC7, 0xE8, 0x9A, 0xE7, 0xEB, 0xB5, 0x9A, 0xE8, /* 0x30-0x33 */ + 0xCB, 0xAE, 0xE3, 0xDF, 0x9A, 0xE9, 0x9A, 0xEA, /* 0x34-0x37 */ + 0xD3, 0xC0, 0x9A, 0xEB, 0x9A, 0xEC, 0x9A, 0xED, /* 0x38-0x3B */ + 0x9A, 0xEE, 0xD9, 0xDB, 0x9A, 0xEF, 0x9A, 0xF0, /* 0x3C-0x3F */ + 0xCD, 0xA1, 0xD6, 0xAD, 0xC7, 0xF3, 0x9A, 0xF1, /* 0x40-0x43 */ + 0x9A, 0xF2, 0x9A, 0xF3, 0xD9, 0xE0, 0xBB, 0xE3, /* 0x44-0x47 */ + 0x9A, 0xF4, 0xBA, 0xBA, 0xE3, 0xE2, 0x9A, 0xF5, /* 0x48-0x4B */ + 0x9A, 0xF6, 0x9A, 0xF7, 0x9A, 0xF8, 0x9A, 0xF9, /* 0x4C-0x4F */ + 0xCF, 0xAB, 0x9A, 0xFA, 0x9A, 0xFB, 0x9A, 0xFC, /* 0x50-0x53 */ + 0xE3, 0xE0, 0xC9, 0xC7, 0x9A, 0xFD, 0xBA, 0xB9, /* 0x54-0x57 */ + 0x9A, 0xFE, 0x9B, 0x40, 0x9B, 0x41, 0xD1, 0xB4, /* 0x58-0x5B */ + 0xE3, 0xE1, 0xC8, 0xEA, 0xB9, 0xAF, 0xBD, 0xAD, /* 0x5C-0x5F */ + 0xB3, 0xD8, 0xCE, 0xDB, 0x9B, 0x42, 0x9B, 0x43, /* 0x60-0x63 */ + 0xCC, 0xC0, 0x9B, 0x44, 0x9B, 0x45, 0x9B, 0x46, /* 0x64-0x67 */ + 0xE3, 0xE8, 0xE3, 0xE9, 0xCD, 0xF4, 0x9B, 0x47, /* 0x68-0x6B */ + 0x9B, 0x48, 0x9B, 0x49, 0x9B, 0x4A, 0x9B, 0x4B, /* 0x6C-0x6F */ + 0xCC, 0xAD, 0x9B, 0x4C, 0xBC, 0xB3, 0x9B, 0x4D, /* 0x70-0x73 */ + 0xE3, 0xEA, 0x9B, 0x4E, 0xE3, 0xEB, 0x9B, 0x4F, /* 0x74-0x77 */ + 0x9B, 0x50, 0xD0, 0xDA, 0x9B, 0x51, 0x9B, 0x52, /* 0x78-0x7B */ + 0x9B, 0x53, 0xC6, 0xFB, 0xB7, 0xDA, 0x9B, 0x54, /* 0x7C-0x7F */ + + 0x9B, 0x55, 0xC7, 0xDF, 0xD2, 0xCA, 0xCE, 0xD6, /* 0x80-0x83 */ + 0x9B, 0x56, 0xE3, 0xE4, 0xE3, 0xEC, 0x9B, 0x57, /* 0x84-0x87 */ + 0xC9, 0xF2, 0xB3, 0xC1, 0x9B, 0x58, 0x9B, 0x59, /* 0x88-0x8B */ + 0xE3, 0xE7, 0x9B, 0x5A, 0x9B, 0x5B, 0xC6, 0xE3, /* 0x8C-0x8F */ + 0xE3, 0xE5, 0x9B, 0x5C, 0x9B, 0x5D, 0xED, 0xB3, /* 0x90-0x93 */ + 0xE3, 0xE6, 0x9B, 0x5E, 0x9B, 0x5F, 0x9B, 0x60, /* 0x94-0x97 */ + 0x9B, 0x61, 0xC9, 0xB3, 0x9B, 0x62, 0xC5, 0xE6, /* 0x98-0x9B */ + 0x9B, 0x63, 0x9B, 0x64, 0x9B, 0x65, 0xB9, 0xB5, /* 0x9C-0x9F */ + 0x9B, 0x66, 0xC3, 0xBB, 0x9B, 0x67, 0xE3, 0xE3, /* 0xA0-0xA3 */ + 0xC5, 0xBD, 0xC1, 0xA4, 0xC2, 0xD9, 0xB2, 0xD7, /* 0xA4-0xA7 */ + 0x9B, 0x68, 0xE3, 0xED, 0xBB, 0xA6, 0xC4, 0xAD, /* 0xA8-0xAB */ + 0x9B, 0x69, 0xE3, 0xF0, 0xBE, 0xDA, 0x9B, 0x6A, /* 0xAC-0xAF */ + 0x9B, 0x6B, 0xE3, 0xFB, 0xE3, 0xF5, 0xBA, 0xD3, /* 0xB0-0xB3 */ + 0x9B, 0x6C, 0x9B, 0x6D, 0x9B, 0x6E, 0x9B, 0x6F, /* 0xB4-0xB7 */ + 0xB7, 0xD0, 0xD3, 0xCD, 0x9B, 0x70, 0xD6, 0xCE, /* 0xB8-0xBB */ + 0xD5, 0xD3, 0xB9, 0xC1, 0xD5, 0xB4, 0xD1, 0xD8, /* 0xBC-0xBF */ + 0x9B, 0x71, 0x9B, 0x72, 0x9B, 0x73, 0x9B, 0x74, /* 0xC0-0xC3 */ + 0xD0, 0xB9, 0xC7, 0xF6, 0x9B, 0x75, 0x9B, 0x76, /* 0xC4-0xC7 */ + 0x9B, 0x77, 0xC8, 0xAA, 0xB2, 0xB4, 0x9B, 0x78, /* 0xC8-0xCB */ + 0xC3, 0xDA, 0x9B, 0x79, 0x9B, 0x7A, 0x9B, 0x7B, /* 0xCC-0xCF */ + 0xE3, 0xEE, 0x9B, 0x7C, 0x9B, 0x7D, 0xE3, 0xFC, /* 0xD0-0xD3 */ + 0xE3, 0xEF, 0xB7, 0xA8, 0xE3, 0xF7, 0xE3, 0xF4, /* 0xD4-0xD7 */ + 0x9B, 0x7E, 0x9B, 0x80, 0x9B, 0x81, 0xB7, 0xBA, /* 0xD8-0xDB 
*/ + 0x9B, 0x82, 0x9B, 0x83, 0xC5, 0xA2, 0x9B, 0x84, /* 0xDC-0xDF */ + 0xE3, 0xF6, 0xC5, 0xDD, 0xB2, 0xA8, 0xC6, 0xFC, /* 0xE0-0xE3 */ + 0x9B, 0x85, 0xC4, 0xE0, 0x9B, 0x86, 0x9B, 0x87, /* 0xE4-0xE7 */ + 0xD7, 0xA2, 0x9B, 0x88, 0xC0, 0xE1, 0xE3, 0xF9, /* 0xE8-0xEB */ + 0x9B, 0x89, 0x9B, 0x8A, 0xE3, 0xFA, 0xE3, 0xFD, /* 0xEC-0xEF */ + 0xCC, 0xA9, 0xE3, 0xF3, 0x9B, 0x8B, 0xD3, 0xBE, /* 0xF0-0xF3 */ + 0x9B, 0x8C, 0xB1, 0xC3, 0xED, 0xB4, 0xE3, 0xF1, /* 0xF4-0xF7 */ + 0xE3, 0xF2, 0x9B, 0x8D, 0xE3, 0xF8, 0xD0, 0xBA, /* 0xF8-0xFB */ + 0xC6, 0xC3, 0xD4, 0xF3, 0xE3, 0xFE, 0x9B, 0x8E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6D[512] = { + 0x9B, 0x8F, 0xBD, 0xE0, 0x9B, 0x90, 0x9B, 0x91, /* 0x00-0x03 */ + 0xE4, 0xA7, 0x9B, 0x92, 0x9B, 0x93, 0xE4, 0xA6, /* 0x04-0x07 */ + 0x9B, 0x94, 0x9B, 0x95, 0x9B, 0x96, 0xD1, 0xF3, /* 0x08-0x0B */ + 0xE4, 0xA3, 0x9B, 0x97, 0xE4, 0xA9, 0x9B, 0x98, /* 0x0C-0x0F */ + 0x9B, 0x99, 0x9B, 0x9A, 0xC8, 0xF7, 0x9B, 0x9B, /* 0x10-0x13 */ + 0x9B, 0x9C, 0x9B, 0x9D, 0x9B, 0x9E, 0xCF, 0xB4, /* 0x14-0x17 */ + 0x9B, 0x9F, 0xE4, 0xA8, 0xE4, 0xAE, 0xC2, 0xE5, /* 0x18-0x1B */ + 0x9B, 0xA0, 0x9B, 0xA1, 0xB6, 0xB4, 0x9B, 0xA2, /* 0x1C-0x1F */ + 0x9B, 0xA3, 0x9B, 0xA4, 0x9B, 0xA5, 0x9B, 0xA6, /* 0x20-0x23 */ + 0x9B, 0xA7, 0xBD, 0xF2, 0x9B, 0xA8, 0xE4, 0xA2, /* 0x24-0x27 */ + 0x9B, 0xA9, 0x9B, 0xAA, 0xBA, 0xE9, 0xE4, 0xAA, /* 0x28-0x2B */ + 0x9B, 0xAB, 0x9B, 0xAC, 0xE4, 0xAC, 0x9B, 0xAD, /* 0x2C-0x2F */ + 0x9B, 0xAE, 0xB6, 0xFD, 0xD6, 0xDE, 0xE4, 0xB2, /* 0x30-0x33 */ + 0x9B, 0xAF, 0xE4, 0xAD, 0x9B, 0xB0, 0x9B, 0xB1, /* 0x34-0x37 */ + 0x9B, 0xB2, 0xE4, 0xA1, 0x9B, 0xB3, 0xBB, 0xEE, /* 0x38-0x3B */ + 0xCD, 0xDD, 0xC7, 0xA2, 0xC5, 0xC9, 0x9B, 0xB4, /* 0x3C-0x3F */ + 0x9B, 0xB5, 0xC1, 0xF7, 0x9B, 0xB6, 0xE4, 0xA4, /* 0x40-0x43 */ + 0x9B, 0xB7, 0xC7, 0xB3, 0xBD, 0xAC, 0xBD, 0xBD, /* 0x44-0x47 */ + 0xE4, 0xA5, 0x9B, 0xB8, 0xD7, 0xC7, 0xB2, 0xE2, /* 0x48-0x4B */ + 0x9B, 0xB9, 0xE4, 0xAB, 0xBC, 0xC3, 0xE4, 0xAF, /* 0x4C-0x4F */ + 0x9B, 0xBA, 0xBB, 0xEB, 0xE4, 0xB0, 0xC5, 0xA8, /* 0x50-0x53 */ + 0xE4, 0xB1, 0x9B, 0xBB, 0x9B, 0xBC, 0x9B, 0xBD, /* 0x54-0x57 */ + 0x9B, 0xBE, 0xD5, 0xE3, 0xBF, 0xA3, 0x9B, 0xBF, /* 0x58-0x5B */ + 0xE4, 0xBA, 0x9B, 0xC0, 0xE4, 0xB7, 0x9B, 0xC1, /* 0x5C-0x5F */ + 0xE4, 0xBB, 0x9B, 0xC2, 0x9B, 0xC3, 0xE4, 0xBD, /* 0x60-0x63 */ + 0x9B, 0xC4, 0x9B, 0xC5, 0xC6, 0xD6, 0x9B, 0xC6, /* 0x64-0x67 */ + 0x9B, 0xC7, 0xBA, 0xC6, 0xC0, 0xCB, 0x9B, 0xC8, /* 0x68-0x6B */ + 0x9B, 0xC9, 0x9B, 0xCA, 0xB8, 0xA1, 0xE4, 0xB4, /* 0x6C-0x6F */ + 0x9B, 0xCB, 0x9B, 0xCC, 0x9B, 0xCD, 0x9B, 0xCE, /* 0x70-0x73 */ + 0xD4, 0xA1, 0x9B, 0xCF, 0x9B, 0xD0, 0xBA, 0xA3, /* 0x74-0x77 */ + 0xBD, 0xFE, 0x9B, 0xD1, 0x9B, 0xD2, 0x9B, 0xD3, /* 0x78-0x7B */ + 0xE4, 0xBC, 0x9B, 0xD4, 0x9B, 0xD5, 0x9B, 0xD6, /* 0x7C-0x7F */ + + 0x9B, 0xD7, 0x9B, 0xD8, 0xCD, 0xBF, 0x9B, 0xD9, /* 0x80-0x83 */ + 0x9B, 0xDA, 0xC4, 0xF9, 0x9B, 0xDB, 0x9B, 0xDC, /* 0x84-0x87 */ + 0xCF, 0xFB, 0xC9, 0xE6, 0x9B, 0xDD, 0x9B, 0xDE, /* 0x88-0x8B */ + 0xD3, 0xBF, 0x9B, 0xDF, 0xCF, 0xD1, 0x9B, 0xE0, /* 0x8C-0x8F */ + 0x9B, 0xE1, 0xE4, 0xB3, 0x9B, 0xE2, 0xE4, 0xB8, /* 0x90-0x93 */ + 0xE4, 0xB9, 0xCC, 0xE9, 0x9B, 0xE3, 0x9B, 0xE4, /* 0x94-0x97 */ + 0x9B, 0xE5, 0x9B, 0xE6, 0x9B, 0xE7, 0xCC, 0xCE, /* 0x98-0x9B */ + 0x9B, 0xE8, 0xC0, 0xD4, 0xE4, 0xB5, 0xC1, 0xB0, /* 0x9C-0x9F */ + 0xE4, 0xB6, 0xCE, 0xD0, 0x9B, 0xE9, 0xBB, 0xC1, /* 0xA0-0xA3 */ + 0xB5, 0xD3, 0x9B, 0xEA, 0xC8, 0xF3, 0xBD, 0xA7, /* 0xA4-0xA7 */ + 0xD5, 0xC7, 0xC9, 0xAC, 0xB8, 0xA2, 0xE4, 0xCA, /* 0xA8-0xAB */ + 0x9B, 0xEB, 0x9B, 0xEC, 0xE4, 0xCC, 0xD1, 0xC4, /* 0xAC-0xAF */ + 
0x9B, 0xED, 0x9B, 0xEE, 0xD2, 0xBA, 0x9B, 0xEF, /* 0xB0-0xB3 */ + 0x9B, 0xF0, 0xBA, 0xAD, 0x9B, 0xF1, 0x9B, 0xF2, /* 0xB4-0xB7 */ + 0xBA, 0xD4, 0x9B, 0xF3, 0x9B, 0xF4, 0x9B, 0xF5, /* 0xB8-0xBB */ + 0x9B, 0xF6, 0x9B, 0xF7, 0x9B, 0xF8, 0xE4, 0xC3, /* 0xBC-0xBF */ + 0xB5, 0xED, 0x9B, 0xF9, 0x9B, 0xFA, 0x9B, 0xFB, /* 0xC0-0xC3 */ + 0xD7, 0xCD, 0xE4, 0xC0, 0xCF, 0xFD, 0xE4, 0xBF, /* 0xC4-0xC7 */ + 0x9B, 0xFC, 0x9B, 0xFD, 0x9B, 0xFE, 0xC1, 0xDC, /* 0xC8-0xCB */ + 0xCC, 0xCA, 0x9C, 0x40, 0x9C, 0x41, 0x9C, 0x42, /* 0xCC-0xCF */ + 0x9C, 0x43, 0xCA, 0xE7, 0x9C, 0x44, 0x9C, 0x45, /* 0xD0-0xD3 */ + 0x9C, 0x46, 0x9C, 0x47, 0xC4, 0xD7, 0x9C, 0x48, /* 0xD4-0xD7 */ + 0xCC, 0xD4, 0xE4, 0xC8, 0x9C, 0x49, 0x9C, 0x4A, /* 0xD8-0xDB */ + 0x9C, 0x4B, 0xE4, 0xC7, 0xE4, 0xC1, 0x9C, 0x4C, /* 0xDC-0xDF */ + 0xE4, 0xC4, 0xB5, 0xAD, 0x9C, 0x4D, 0x9C, 0x4E, /* 0xE0-0xE3 */ + 0xD3, 0xD9, 0x9C, 0x4F, 0xE4, 0xC6, 0x9C, 0x50, /* 0xE4-0xE7 */ + 0x9C, 0x51, 0x9C, 0x52, 0x9C, 0x53, 0xD2, 0xF9, /* 0xE8-0xEB */ + 0xB4, 0xE3, 0x9C, 0x54, 0xBB, 0xB4, 0x9C, 0x55, /* 0xEC-0xEF */ + 0x9C, 0x56, 0xC9, 0xEE, 0x9C, 0x57, 0xB4, 0xBE, /* 0xF0-0xF3 */ + 0x9C, 0x58, 0x9C, 0x59, 0x9C, 0x5A, 0xBB, 0xEC, /* 0xF4-0xF7 */ + 0x9C, 0x5B, 0xD1, 0xCD, 0x9C, 0x5C, 0xCC, 0xED, /* 0xF8-0xFB */ + 0xED, 0xB5, 0x9C, 0x5D, 0x9C, 0x5E, 0x9C, 0x5F, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6E[512] = { + 0x9C, 0x60, 0x9C, 0x61, 0x9C, 0x62, 0x9C, 0x63, /* 0x00-0x03 */ + 0x9C, 0x64, 0xC7, 0xE5, 0x9C, 0x65, 0x9C, 0x66, /* 0x04-0x07 */ + 0x9C, 0x67, 0x9C, 0x68, 0xD4, 0xA8, 0x9C, 0x69, /* 0x08-0x0B */ + 0xE4, 0xCB, 0xD7, 0xD5, 0xE4, 0xC2, 0x9C, 0x6A, /* 0x0C-0x0F */ + 0xBD, 0xA5, 0xE4, 0xC5, 0x9C, 0x6B, 0x9C, 0x6C, /* 0x10-0x13 */ + 0xD3, 0xE6, 0x9C, 0x6D, 0xE4, 0xC9, 0xC9, 0xF8, /* 0x14-0x17 */ + 0x9C, 0x6E, 0x9C, 0x6F, 0xE4, 0xBE, 0x9C, 0x70, /* 0x18-0x1B */ + 0x9C, 0x71, 0xD3, 0xE5, 0x9C, 0x72, 0x9C, 0x73, /* 0x1C-0x1F */ + 0xC7, 0xFE, 0xB6, 0xC9, 0x9C, 0x74, 0xD4, 0xFC, /* 0x20-0x23 */ + 0xB2, 0xB3, 0xE4, 0xD7, 0x9C, 0x75, 0x9C, 0x76, /* 0x24-0x27 */ + 0x9C, 0x77, 0xCE, 0xC2, 0x9C, 0x78, 0xE4, 0xCD, /* 0x28-0x2B */ + 0x9C, 0x79, 0xCE, 0xBC, 0x9C, 0x7A, 0xB8, 0xDB, /* 0x2C-0x2F */ + 0x9C, 0x7B, 0x9C, 0x7C, 0xE4, 0xD6, 0x9C, 0x7D, /* 0x30-0x33 */ + 0xBF, 0xCA, 0x9C, 0x7E, 0x9C, 0x80, 0x9C, 0x81, /* 0x34-0x37 */ + 0xD3, 0xCE, 0x9C, 0x82, 0xC3, 0xEC, 0x9C, 0x83, /* 0x38-0x3B */ + 0x9C, 0x84, 0x9C, 0x85, 0x9C, 0x86, 0x9C, 0x87, /* 0x3C-0x3F */ + 0x9C, 0x88, 0x9C, 0x89, 0x9C, 0x8A, 0xC5, 0xC8, /* 0x40-0x43 */ + 0xE4, 0xD8, 0x9C, 0x8B, 0x9C, 0x8C, 0x9C, 0x8D, /* 0x44-0x47 */ + 0x9C, 0x8E, 0x9C, 0x8F, 0x9C, 0x90, 0x9C, 0x91, /* 0x48-0x4B */ + 0x9C, 0x92, 0xCD, 0xC4, 0xE4, 0xCF, 0x9C, 0x93, /* 0x4C-0x4F */ + 0x9C, 0x94, 0x9C, 0x95, 0x9C, 0x96, 0xE4, 0xD4, /* 0x50-0x53 */ + 0xE4, 0xD5, 0x9C, 0x97, 0xBA, 0xFE, 0x9C, 0x98, /* 0x54-0x57 */ + 0xCF, 0xE6, 0x9C, 0x99, 0x9C, 0x9A, 0xD5, 0xBF, /* 0x58-0x5B */ + 0x9C, 0x9B, 0x9C, 0x9C, 0x9C, 0x9D, 0xE4, 0xD2, /* 0x5C-0x5F */ + 0x9C, 0x9E, 0x9C, 0x9F, 0x9C, 0xA0, 0x9C, 0xA1, /* 0x60-0x63 */ + 0x9C, 0xA2, 0x9C, 0xA3, 0x9C, 0xA4, 0x9C, 0xA5, /* 0x64-0x67 */ + 0x9C, 0xA6, 0x9C, 0xA7, 0x9C, 0xA8, 0xE4, 0xD0, /* 0x68-0x6B */ + 0x9C, 0xA9, 0x9C, 0xAA, 0xE4, 0xCE, 0x9C, 0xAB, /* 0x6C-0x6F */ + 0x9C, 0xAC, 0x9C, 0xAD, 0x9C, 0xAE, 0x9C, 0xAF, /* 0x70-0x73 */ + 0x9C, 0xB0, 0x9C, 0xB1, 0x9C, 0xB2, 0x9C, 0xB3, /* 0x74-0x77 */ + 0x9C, 0xB4, 0x9C, 0xB5, 0x9C, 0xB6, 0x9C, 0xB7, /* 0x78-0x7B */ + 0x9C, 0xB8, 0x9C, 0xB9, 0xCD, 0xE5, 0xCA, 0xAA, /* 0x7C-0x7F */ + + 0x9C, 0xBA, 0x9C, 0xBB, 0x9C, 0xBC, 0xC0, 0xA3, /* 0x80-0x83 */ + 
0x9C, 0xBD, 0xBD, 0xA6, 0xE4, 0xD3, 0x9C, 0xBE, /* 0x84-0x87 */ + 0x9C, 0xBF, 0xB8, 0xC8, 0x9C, 0xC0, 0x9C, 0xC1, /* 0x88-0x8B */ + 0x9C, 0xC2, 0x9C, 0xC3, 0x9C, 0xC4, 0xE4, 0xE7, /* 0x8C-0x8F */ + 0xD4, 0xB4, 0x9C, 0xC5, 0x9C, 0xC6, 0x9C, 0xC7, /* 0x90-0x93 */ + 0x9C, 0xC8, 0x9C, 0xC9, 0x9C, 0xCA, 0x9C, 0xCB, /* 0x94-0x97 */ + 0xE4, 0xDB, 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, /* 0x98-0x9B */ + 0xC1, 0xEF, 0x9C, 0xCF, 0x9C, 0xD0, 0xE4, 0xE9, /* 0x9C-0x9F */ + 0x9C, 0xD1, 0x9C, 0xD2, 0xD2, 0xE7, 0x9C, 0xD3, /* 0xA0-0xA3 */ + 0x9C, 0xD4, 0xE4, 0xDF, 0x9C, 0xD5, 0xE4, 0xE0, /* 0xA4-0xA7 */ + 0x9C, 0xD6, 0x9C, 0xD7, 0xCF, 0xAA, 0x9C, 0xD8, /* 0xA8-0xAB */ + 0x9C, 0xD9, 0x9C, 0xDA, 0x9C, 0xDB, 0xCB, 0xDD, /* 0xAC-0xAF */ + 0x9C, 0xDC, 0xE4, 0xDA, 0xE4, 0xD1, 0x9C, 0xDD, /* 0xB0-0xB3 */ + 0xE4, 0xE5, 0x9C, 0xDE, 0xC8, 0xDC, 0xE4, 0xE3, /* 0xB4-0xB7 */ + 0x9C, 0xDF, 0x9C, 0xE0, 0xC4, 0xE7, 0xE4, 0xE2, /* 0xB8-0xBB */ + 0x9C, 0xE1, 0xE4, 0xE1, 0x9C, 0xE2, 0x9C, 0xE3, /* 0xBC-0xBF */ + 0x9C, 0xE4, 0xB3, 0xFC, 0xE4, 0xE8, 0x9C, 0xE5, /* 0xC0-0xC3 */ + 0x9C, 0xE6, 0x9C, 0xE7, 0x9C, 0xE8, 0xB5, 0xE1, /* 0xC4-0xC7 */ + 0x9C, 0xE9, 0x9C, 0xEA, 0x9C, 0xEB, 0xD7, 0xCC, /* 0xC8-0xCB */ + 0x9C, 0xEC, 0x9C, 0xED, 0x9C, 0xEE, 0xE4, 0xE6, /* 0xCC-0xCF */ + 0x9C, 0xEF, 0xBB, 0xAC, 0x9C, 0xF0, 0xD7, 0xD2, /* 0xD0-0xD3 */ + 0xCC, 0xCF, 0xEB, 0xF8, 0x9C, 0xF1, 0xE4, 0xE4, /* 0xD4-0xD7 */ + 0x9C, 0xF2, 0x9C, 0xF3, 0xB9, 0xF6, 0x9C, 0xF4, /* 0xD8-0xDB */ + 0x9C, 0xF5, 0x9C, 0xF6, 0xD6, 0xCD, 0xE4, 0xD9, /* 0xDC-0xDF */ + 0xE4, 0xDC, 0xC2, 0xFA, 0xE4, 0xDE, 0x9C, 0xF7, /* 0xE0-0xE3 */ + 0xC2, 0xCB, 0xC0, 0xC4, 0xC2, 0xD0, 0x9C, 0xF8, /* 0xE4-0xE7 */ + 0xB1, 0xF5, 0xCC, 0xB2, 0x9C, 0xF9, 0x9C, 0xFA, /* 0xE8-0xEB */ + 0x9C, 0xFB, 0x9C, 0xFC, 0x9C, 0xFD, 0x9C, 0xFE, /* 0xEC-0xEF */ + 0x9D, 0x40, 0x9D, 0x41, 0x9D, 0x42, 0x9D, 0x43, /* 0xF0-0xF3 */ + 0xB5, 0xCE, 0x9D, 0x44, 0x9D, 0x45, 0x9D, 0x46, /* 0xF4-0xF7 */ + 0x9D, 0x47, 0xE4, 0xEF, 0x9D, 0x48, 0x9D, 0x49, /* 0xF8-0xFB */ + 0x9D, 0x4A, 0x9D, 0x4B, 0x9D, 0x4C, 0x9D, 0x4D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0x9D, 0x4E, 0x9D, 0x4F, 0xC6, 0xAF, 0x9D, 0x50, /* 0x00-0x03 */ + 0x9D, 0x51, 0x9D, 0x52, 0xC6, 0xE1, 0x9D, 0x53, /* 0x04-0x07 */ + 0x9D, 0x54, 0xE4, 0xF5, 0x9D, 0x55, 0x9D, 0x56, /* 0x08-0x0B */ + 0x9D, 0x57, 0x9D, 0x58, 0x9D, 0x59, 0xC2, 0xA9, /* 0x0C-0x0F */ + 0x9D, 0x5A, 0x9D, 0x5B, 0x9D, 0x5C, 0xC0, 0xEC, /* 0x10-0x13 */ + 0xD1, 0xDD, 0xE4, 0xEE, 0x9D, 0x5D, 0x9D, 0x5E, /* 0x14-0x17 */ + 0x9D, 0x5F, 0x9D, 0x60, 0x9D, 0x61, 0x9D, 0x62, /* 0x18-0x1B */ + 0x9D, 0x63, 0x9D, 0x64, 0x9D, 0x65, 0x9D, 0x66, /* 0x1C-0x1F */ + 0xC4, 0xAE, 0x9D, 0x67, 0x9D, 0x68, 0x9D, 0x69, /* 0x20-0x23 */ + 0xE4, 0xED, 0x9D, 0x6A, 0x9D, 0x6B, 0x9D, 0x6C, /* 0x24-0x27 */ + 0x9D, 0x6D, 0xE4, 0xF6, 0xE4, 0xF4, 0xC2, 0xFE, /* 0x28-0x2B */ + 0x9D, 0x6E, 0xE4, 0xDD, 0x9D, 0x6F, 0xE4, 0xF0, /* 0x2C-0x2F */ + 0x9D, 0x70, 0xCA, 0xFE, 0x9D, 0x71, 0xD5, 0xC4, /* 0x30-0x33 */ + 0x9D, 0x72, 0x9D, 0x73, 0xE4, 0xF1, 0x9D, 0x74, /* 0x34-0x37 */ + 0x9D, 0x75, 0x9D, 0x76, 0x9D, 0x77, 0x9D, 0x78, /* 0x38-0x3B */ + 0x9D, 0x79, 0x9D, 0x7A, 0xD1, 0xFA, 0x9D, 0x7B, /* 0x3C-0x3F */ + 0x9D, 0x7C, 0x9D, 0x7D, 0x9D, 0x7E, 0x9D, 0x80, /* 0x40-0x43 */ + 0x9D, 0x81, 0x9D, 0x82, 0xE4, 0xEB, 0xE4, 0xEC, /* 0x44-0x47 */ + 0x9D, 0x83, 0x9D, 0x84, 0x9D, 0x85, 0xE4, 0xF2, /* 0x48-0x4B */ + 0x9D, 0x86, 0xCE, 0xAB, 0x9D, 0x87, 0x9D, 0x88, /* 0x4C-0x4F */ + 0x9D, 0x89, 0x9D, 0x8A, 0x9D, 0x8B, 0x9D, 0x8C, /* 0x50-0x53 */ + 0x9D, 0x8D, 0x9D, 0x8E, 0x9D, 0x8F, 0x9D, 0x90, /* 0x54-0x57 */ + 0xC5, 
0xCB, 0x9D, 0x91, 0x9D, 0x92, 0x9D, 0x93, /* 0x58-0x5B */ + 0xC7, 0xB1, 0x9D, 0x94, 0xC2, 0xBA, 0x9D, 0x95, /* 0x5C-0x5F */ + 0x9D, 0x96, 0x9D, 0x97, 0xE4, 0xEA, 0x9D, 0x98, /* 0x60-0x63 */ + 0x9D, 0x99, 0x9D, 0x9A, 0xC1, 0xCA, 0x9D, 0x9B, /* 0x64-0x67 */ + 0x9D, 0x9C, 0x9D, 0x9D, 0x9D, 0x9E, 0x9D, 0x9F, /* 0x68-0x6B */ + 0x9D, 0xA0, 0xCC, 0xB6, 0xB3, 0xB1, 0x9D, 0xA1, /* 0x6C-0x6F */ + 0x9D, 0xA2, 0x9D, 0xA3, 0xE4, 0xFB, 0x9D, 0xA4, /* 0x70-0x73 */ + 0xE4, 0xF3, 0x9D, 0xA5, 0x9D, 0xA6, 0x9D, 0xA7, /* 0x74-0x77 */ + 0xE4, 0xFA, 0x9D, 0xA8, 0xE4, 0xFD, 0x9D, 0xA9, /* 0x78-0x7B */ + 0xE4, 0xFC, 0x9D, 0xAA, 0x9D, 0xAB, 0x9D, 0xAC, /* 0x7C-0x7F */ + + 0x9D, 0xAD, 0x9D, 0xAE, 0x9D, 0xAF, 0x9D, 0xB0, /* 0x80-0x83 */ + 0xB3, 0xCE, 0x9D, 0xB1, 0x9D, 0xB2, 0x9D, 0xB3, /* 0x84-0x87 */ + 0xB3, 0xBA, 0xE4, 0xF7, 0x9D, 0xB4, 0x9D, 0xB5, /* 0x88-0x8B */ + 0xE4, 0xF9, 0xE4, 0xF8, 0xC5, 0xEC, 0x9D, 0xB6, /* 0x8C-0x8F */ + 0x9D, 0xB7, 0x9D, 0xB8, 0x9D, 0xB9, 0x9D, 0xBA, /* 0x90-0x93 */ + 0x9D, 0xBB, 0x9D, 0xBC, 0x9D, 0xBD, 0x9D, 0xBE, /* 0x94-0x97 */ + 0x9D, 0xBF, 0x9D, 0xC0, 0x9D, 0xC1, 0x9D, 0xC2, /* 0x98-0x9B */ + 0xC0, 0xBD, 0x9D, 0xC3, 0x9D, 0xC4, 0x9D, 0xC5, /* 0x9C-0x9F */ + 0x9D, 0xC6, 0xD4, 0xE8, 0x9D, 0xC7, 0x9D, 0xC8, /* 0xA0-0xA3 */ + 0x9D, 0xC9, 0x9D, 0xCA, 0x9D, 0xCB, 0xE5, 0xA2, /* 0xA4-0xA7 */ + 0x9D, 0xCC, 0x9D, 0xCD, 0x9D, 0xCE, 0x9D, 0xCF, /* 0xA8-0xAB */ + 0x9D, 0xD0, 0x9D, 0xD1, 0x9D, 0xD2, 0x9D, 0xD3, /* 0xAC-0xAF */ + 0x9D, 0xD4, 0x9D, 0xD5, 0x9D, 0xD6, 0xB0, 0xC4, /* 0xB0-0xB3 */ + 0x9D, 0xD7, 0x9D, 0xD8, 0xE5, 0xA4, 0x9D, 0xD9, /* 0xB4-0xB7 */ + 0x9D, 0xDA, 0xE5, 0xA3, 0x9D, 0xDB, 0x9D, 0xDC, /* 0xB8-0xBB */ + 0x9D, 0xDD, 0x9D, 0xDE, 0x9D, 0xDF, 0x9D, 0xE0, /* 0xBC-0xBF */ + 0xBC, 0xA4, 0x9D, 0xE1, 0xE5, 0xA5, 0x9D, 0xE2, /* 0xC0-0xC3 */ + 0x9D, 0xE3, 0x9D, 0xE4, 0x9D, 0xE5, 0x9D, 0xE6, /* 0xC4-0xC7 */ + 0x9D, 0xE7, 0xE5, 0xA1, 0x9D, 0xE8, 0x9D, 0xE9, /* 0xC8-0xCB */ + 0x9D, 0xEA, 0x9D, 0xEB, 0x9D, 0xEC, 0x9D, 0xED, /* 0xCC-0xCF */ + 0x9D, 0xEE, 0xE4, 0xFE, 0xB1, 0xF4, 0x9D, 0xEF, /* 0xD0-0xD3 */ + 0x9D, 0xF0, 0x9D, 0xF1, 0x9D, 0xF2, 0x9D, 0xF3, /* 0xD4-0xD7 */ + 0x9D, 0xF4, 0x9D, 0xF5, 0x9D, 0xF6, 0x9D, 0xF7, /* 0xD8-0xDB */ + 0x9D, 0xF8, 0x9D, 0xF9, 0xE5, 0xA8, 0x9D, 0xFA, /* 0xDC-0xDF */ + 0xE5, 0xA9, 0xE5, 0xA6, 0x9D, 0xFB, 0x9D, 0xFC, /* 0xE0-0xE3 */ + 0x9D, 0xFD, 0x9D, 0xFE, 0x9E, 0x40, 0x9E, 0x41, /* 0xE4-0xE7 */ + 0x9E, 0x42, 0x9E, 0x43, 0x9E, 0x44, 0x9E, 0x45, /* 0xE8-0xEB */ + 0x9E, 0x46, 0x9E, 0x47, 0xE5, 0xA7, 0xE5, 0xAA, /* 0xEC-0xEF */ + 0x9E, 0x48, 0x9E, 0x49, 0x9E, 0x4A, 0x9E, 0x4B, /* 0xF0-0xF3 */ + 0x9E, 0x4C, 0x9E, 0x4D, 0x9E, 0x4E, 0x9E, 0x4F, /* 0xF4-0xF7 */ + 0x9E, 0x50, 0x9E, 0x51, 0x9E, 0x52, 0x9E, 0x53, /* 0xF8-0xFB */ + 0x9E, 0x54, 0x9E, 0x55, 0x9E, 0x56, 0x9E, 0x57, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0x9E, 0x58, 0x9E, 0x59, 0x9E, 0x5A, 0x9E, 0x5B, /* 0x00-0x03 */ + 0x9E, 0x5C, 0x9E, 0x5D, 0x9E, 0x5E, 0x9E, 0x5F, /* 0x04-0x07 */ + 0x9E, 0x60, 0x9E, 0x61, 0x9E, 0x62, 0x9E, 0x63, /* 0x08-0x0B */ + 0x9E, 0x64, 0x9E, 0x65, 0x9E, 0x66, 0x9E, 0x67, /* 0x0C-0x0F */ + 0x9E, 0x68, 0xC6, 0xD9, 0x9E, 0x69, 0x9E, 0x6A, /* 0x10-0x13 */ + 0x9E, 0x6B, 0x9E, 0x6C, 0x9E, 0x6D, 0x9E, 0x6E, /* 0x14-0x17 */ + 0x9E, 0x6F, 0x9E, 0x70, 0xE5, 0xAB, 0xE5, 0xAD, /* 0x18-0x1B */ + 0x9E, 0x71, 0x9E, 0x72, 0x9E, 0x73, 0x9E, 0x74, /* 0x1C-0x1F */ + 0x9E, 0x75, 0x9E, 0x76, 0x9E, 0x77, 0xE5, 0xAC, /* 0x20-0x23 */ + 0x9E, 0x78, 0x9E, 0x79, 0x9E, 0x7A, 0x9E, 0x7B, /* 0x24-0x27 */ + 0x9E, 0x7C, 0x9E, 0x7D, 0x9E, 0x7E, 0x9E, 0x80, /* 0x28-0x2B */ + 0x9E, 
0x81, 0x9E, 0x82, 0x9E, 0x83, 0x9E, 0x84, /* 0x2C-0x2F */ + 0x9E, 0x85, 0x9E, 0x86, 0x9E, 0x87, 0x9E, 0x88, /* 0x30-0x33 */ + 0x9E, 0x89, 0xE5, 0xAF, 0x9E, 0x8A, 0x9E, 0x8B, /* 0x34-0x37 */ + 0x9E, 0x8C, 0xE5, 0xAE, 0x9E, 0x8D, 0x9E, 0x8E, /* 0x38-0x3B */ + 0x9E, 0x8F, 0x9E, 0x90, 0x9E, 0x91, 0x9E, 0x92, /* 0x3C-0x3F */ + 0x9E, 0x93, 0x9E, 0x94, 0x9E, 0x95, 0x9E, 0x96, /* 0x40-0x43 */ + 0x9E, 0x97, 0x9E, 0x98, 0x9E, 0x99, 0x9E, 0x9A, /* 0x44-0x47 */ + 0x9E, 0x9B, 0x9E, 0x9C, 0x9E, 0x9D, 0x9E, 0x9E, /* 0x48-0x4B */ + 0xB9, 0xE0, 0x9E, 0x9F, 0x9E, 0xA0, 0xE5, 0xB0, /* 0x4C-0x4F */ + 0x9E, 0xA1, 0x9E, 0xA2, 0x9E, 0xA3, 0x9E, 0xA4, /* 0x50-0x53 */ + 0x9E, 0xA5, 0x9E, 0xA6, 0x9E, 0xA7, 0x9E, 0xA8, /* 0x54-0x57 */ + 0x9E, 0xA9, 0x9E, 0xAA, 0x9E, 0xAB, 0x9E, 0xAC, /* 0x58-0x5B */ + 0x9E, 0xAD, 0x9E, 0xAE, 0xE5, 0xB1, 0x9E, 0xAF, /* 0x5C-0x5F */ + 0x9E, 0xB0, 0x9E, 0xB1, 0x9E, 0xB2, 0x9E, 0xB3, /* 0x60-0x63 */ + 0x9E, 0xB4, 0x9E, 0xB5, 0x9E, 0xB6, 0x9E, 0xB7, /* 0x64-0x67 */ + 0x9E, 0xB8, 0x9E, 0xB9, 0x9E, 0xBA, 0xBB, 0xF0, /* 0x68-0x6B */ + 0xEC, 0xE1, 0xC3, 0xF0, 0x9E, 0xBB, 0xB5, 0xC6, /* 0x6C-0x6F */ + 0xBB, 0xD2, 0x9E, 0xBC, 0x9E, 0xBD, 0x9E, 0xBE, /* 0x70-0x73 */ + 0x9E, 0xBF, 0xC1, 0xE9, 0xD4, 0xEE, 0x9E, 0xC0, /* 0x74-0x77 */ + 0xBE, 0xC4, 0x9E, 0xC1, 0x9E, 0xC2, 0x9E, 0xC3, /* 0x78-0x7B */ + 0xD7, 0xC6, 0x9E, 0xC4, 0xD4, 0xD6, 0xB2, 0xD3, /* 0x7C-0x7F */ + + 0xEC, 0xBE, 0x9E, 0xC5, 0x9E, 0xC6, 0x9E, 0xC7, /* 0x80-0x83 */ + 0x9E, 0xC8, 0xEA, 0xC1, 0x9E, 0xC9, 0x9E, 0xCA, /* 0x84-0x87 */ + 0x9E, 0xCB, 0xC2, 0xAF, 0xB4, 0xB6, 0x9E, 0xCC, /* 0x88-0x8B */ + 0x9E, 0xCD, 0x9E, 0xCE, 0xD1, 0xD7, 0x9E, 0xCF, /* 0x8C-0x8F */ + 0x9E, 0xD0, 0x9E, 0xD1, 0xB3, 0xB4, 0x9E, 0xD2, /* 0x90-0x93 */ + 0xC8, 0xB2, 0xBF, 0xBB, 0xEC, 0xC0, 0x9E, 0xD3, /* 0x94-0x97 */ + 0x9E, 0xD4, 0xD6, 0xCB, 0x9E, 0xD5, 0x9E, 0xD6, /* 0x98-0x9B */ + 0xEC, 0xBF, 0xEC, 0xC1, 0x9E, 0xD7, 0x9E, 0xD8, /* 0x9C-0x9F */ + 0x9E, 0xD9, 0x9E, 0xDA, 0x9E, 0xDB, 0x9E, 0xDC, /* 0xA0-0xA3 */ + 0x9E, 0xDD, 0x9E, 0xDE, 0x9E, 0xDF, 0x9E, 0xE0, /* 0xA4-0xA7 */ + 0x9E, 0xE1, 0x9E, 0xE2, 0x9E, 0xE3, 0xEC, 0xC5, /* 0xA8-0xAB */ + 0xBE, 0xE6, 0xCC, 0xBF, 0xC5, 0xDA, 0xBE, 0xBC, /* 0xAC-0xAF */ + 0x9E, 0xE4, 0xEC, 0xC6, 0x9E, 0xE5, 0xB1, 0xFE, /* 0xB0-0xB3 */ + 0x9E, 0xE6, 0x9E, 0xE7, 0x9E, 0xE8, 0xEC, 0xC4, /* 0xB4-0xB7 */ + 0xD5, 0xA8, 0xB5, 0xE3, 0x9E, 0xE9, 0xEC, 0xC2, /* 0xB8-0xBB */ + 0xC1, 0xB6, 0xB3, 0xE3, 0x9E, 0xEA, 0x9E, 0xEB, /* 0xBC-0xBF */ + 0xEC, 0xC3, 0xCB, 0xB8, 0xC0, 0xC3, 0xCC, 0xFE, /* 0xC0-0xC3 */ + 0x9E, 0xEC, 0x9E, 0xED, 0x9E, 0xEE, 0x9E, 0xEF, /* 0xC4-0xC7 */ + 0xC1, 0xD2, 0x9E, 0xF0, 0xEC, 0xC8, 0x9E, 0xF1, /* 0xC8-0xCB */ + 0x9E, 0xF2, 0x9E, 0xF3, 0x9E, 0xF4, 0x9E, 0xF5, /* 0xCC-0xCF */ + 0x9E, 0xF6, 0x9E, 0xF7, 0x9E, 0xF8, 0x9E, 0xF9, /* 0xD0-0xD3 */ + 0x9E, 0xFA, 0x9E, 0xFB, 0x9E, 0xFC, 0x9E, 0xFD, /* 0xD4-0xD7 */ + 0xBA, 0xE6, 0xC0, 0xD3, 0x9E, 0xFE, 0xD6, 0xF2, /* 0xD8-0xDB */ + 0x9F, 0x40, 0x9F, 0x41, 0x9F, 0x42, 0xD1, 0xCC, /* 0xDC-0xDF */ + 0x9F, 0x43, 0x9F, 0x44, 0x9F, 0x45, 0x9F, 0x46, /* 0xE0-0xE3 */ + 0xBF, 0xBE, 0x9F, 0x47, 0xB7, 0xB3, 0xC9, 0xD5, /* 0xE4-0xE7 */ + 0xEC, 0xC7, 0xBB, 0xE2, 0x9F, 0x48, 0xCC, 0xCC, /* 0xE8-0xEB */ + 0xBD, 0xFD, 0xC8, 0xC8, 0x9F, 0x49, 0xCF, 0xA9, /* 0xEC-0xEF */ + 0x9F, 0x4A, 0x9F, 0x4B, 0x9F, 0x4C, 0x9F, 0x4D, /* 0xF0-0xF3 */ + 0x9F, 0x4E, 0x9F, 0x4F, 0x9F, 0x50, 0xCD, 0xE9, /* 0xF4-0xF7 */ + 0x9F, 0x51, 0xC5, 0xEB, 0x9F, 0x52, 0x9F, 0x53, /* 0xF8-0xFB */ + 0x9F, 0x54, 0xB7, 0xE9, 0x9F, 0x55, 0x9F, 0x56, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0x9F, 
0x57, 0x9F, 0x58, 0x9F, 0x59, 0x9F, 0x5A, /* 0x00-0x03 */ + 0x9F, 0x5B, 0x9F, 0x5C, 0x9F, 0x5D, 0x9F, 0x5E, /* 0x04-0x07 */ + 0x9F, 0x5F, 0xD1, 0xC9, 0xBA, 0xB8, 0x9F, 0x60, /* 0x08-0x0B */ + 0x9F, 0x61, 0x9F, 0x62, 0x9F, 0x63, 0x9F, 0x64, /* 0x0C-0x0F */ + 0xEC, 0xC9, 0x9F, 0x65, 0x9F, 0x66, 0xEC, 0xCA, /* 0x10-0x13 */ + 0x9F, 0x67, 0xBB, 0xC0, 0xEC, 0xCB, 0x9F, 0x68, /* 0x14-0x17 */ + 0xEC, 0xE2, 0xB1, 0xBA, 0xB7, 0xD9, 0x9F, 0x69, /* 0x18-0x1B */ + 0x9F, 0x6A, 0x9F, 0x6B, 0x9F, 0x6C, 0x9F, 0x6D, /* 0x1C-0x1F */ + 0x9F, 0x6E, 0x9F, 0x6F, 0x9F, 0x70, 0x9F, 0x71, /* 0x20-0x23 */ + 0x9F, 0x72, 0x9F, 0x73, 0xBD, 0xB9, 0x9F, 0x74, /* 0x24-0x27 */ + 0x9F, 0x75, 0x9F, 0x76, 0x9F, 0x77, 0x9F, 0x78, /* 0x28-0x2B */ + 0x9F, 0x79, 0x9F, 0x7A, 0x9F, 0x7B, 0xEC, 0xCC, /* 0x2C-0x2F */ + 0xD1, 0xE6, 0xEC, 0xCD, 0x9F, 0x7C, 0x9F, 0x7D, /* 0x30-0x33 */ + 0x9F, 0x7E, 0x9F, 0x80, 0xC8, 0xBB, 0x9F, 0x81, /* 0x34-0x37 */ + 0x9F, 0x82, 0x9F, 0x83, 0x9F, 0x84, 0x9F, 0x85, /* 0x38-0x3B */ + 0x9F, 0x86, 0x9F, 0x87, 0x9F, 0x88, 0x9F, 0x89, /* 0x3C-0x3F */ + 0x9F, 0x8A, 0x9F, 0x8B, 0x9F, 0x8C, 0x9F, 0x8D, /* 0x40-0x43 */ + 0x9F, 0x8E, 0xEC, 0xD1, 0x9F, 0x8F, 0x9F, 0x90, /* 0x44-0x47 */ + 0x9F, 0x91, 0x9F, 0x92, 0xEC, 0xD3, 0x9F, 0x93, /* 0x48-0x4B */ + 0xBB, 0xCD, 0x9F, 0x94, 0xBC, 0xE5, 0x9F, 0x95, /* 0x4C-0x4F */ + 0x9F, 0x96, 0x9F, 0x97, 0x9F, 0x98, 0x9F, 0x99, /* 0x50-0x53 */ + 0x9F, 0x9A, 0x9F, 0x9B, 0x9F, 0x9C, 0x9F, 0x9D, /* 0x54-0x57 */ + 0x9F, 0x9E, 0x9F, 0x9F, 0x9F, 0xA0, 0x9F, 0xA1, /* 0x58-0x5B */ + 0xEC, 0xCF, 0x9F, 0xA2, 0xC9, 0xB7, 0x9F, 0xA3, /* 0x5C-0x5F */ + 0x9F, 0xA4, 0x9F, 0xA5, 0x9F, 0xA6, 0x9F, 0xA7, /* 0x60-0x63 */ + 0xC3, 0xBA, 0x9F, 0xA8, 0xEC, 0xE3, 0xD5, 0xD5, /* 0x64-0x67 */ + 0xEC, 0xD0, 0x9F, 0xA9, 0x9F, 0xAA, 0x9F, 0xAB, /* 0x68-0x6B */ + 0x9F, 0xAC, 0x9F, 0xAD, 0xD6, 0xF3, 0x9F, 0xAE, /* 0x6C-0x6F */ + 0x9F, 0xAF, 0x9F, 0xB0, 0xEC, 0xD2, 0xEC, 0xCE, /* 0x70-0x73 */ + 0x9F, 0xB1, 0x9F, 0xB2, 0x9F, 0xB3, 0x9F, 0xB4, /* 0x74-0x77 */ + 0xEC, 0xD4, 0x9F, 0xB5, 0xEC, 0xD5, 0x9F, 0xB6, /* 0x78-0x7B */ + 0x9F, 0xB7, 0xC9, 0xBF, 0x9F, 0xB8, 0x9F, 0xB9, /* 0x7C-0x7F */ + + 0x9F, 0xBA, 0x9F, 0xBB, 0x9F, 0xBC, 0x9F, 0xBD, /* 0x80-0x83 */ + 0xCF, 0xA8, 0x9F, 0xBE, 0x9F, 0xBF, 0x9F, 0xC0, /* 0x84-0x87 */ + 0x9F, 0xC1, 0x9F, 0xC2, 0xD0, 0xDC, 0x9F, 0xC3, /* 0x88-0x8B */ + 0x9F, 0xC4, 0x9F, 0xC5, 0x9F, 0xC6, 0xD1, 0xAC, /* 0x8C-0x8F */ + 0x9F, 0xC7, 0x9F, 0xC8, 0x9F, 0xC9, 0x9F, 0xCA, /* 0x90-0x93 */ + 0xC8, 0xDB, 0x9F, 0xCB, 0x9F, 0xCC, 0x9F, 0xCD, /* 0x94-0x97 */ + 0xEC, 0xD6, 0xCE, 0xF5, 0x9F, 0xCE, 0x9F, 0xCF, /* 0x98-0x9B */ + 0x9F, 0xD0, 0x9F, 0xD1, 0x9F, 0xD2, 0xCA, 0xEC, /* 0x9C-0x9F */ + 0xEC, 0xDA, 0x9F, 0xD3, 0x9F, 0xD4, 0x9F, 0xD5, /* 0xA0-0xA3 */ + 0x9F, 0xD6, 0x9F, 0xD7, 0x9F, 0xD8, 0x9F, 0xD9, /* 0xA4-0xA7 */ + 0xEC, 0xD9, 0x9F, 0xDA, 0x9F, 0xDB, 0x9F, 0xDC, /* 0xA8-0xAB */ + 0xB0, 0xBE, 0x9F, 0xDD, 0x9F, 0xDE, 0x9F, 0xDF, /* 0xAC-0xAF */ + 0x9F, 0xE0, 0x9F, 0xE1, 0x9F, 0xE2, 0xEC, 0xD7, /* 0xB0-0xB3 */ + 0x9F, 0xE3, 0xEC, 0xD8, 0x9F, 0xE4, 0x9F, 0xE5, /* 0xB4-0xB7 */ + 0x9F, 0xE6, 0xEC, 0xE4, 0x9F, 0xE7, 0x9F, 0xE8, /* 0xB8-0xBB */ + 0x9F, 0xE9, 0x9F, 0xEA, 0x9F, 0xEB, 0x9F, 0xEC, /* 0xBC-0xBF */ + 0x9F, 0xED, 0x9F, 0xEE, 0x9F, 0xEF, 0xC8, 0xBC, /* 0xC0-0xC3 */ + 0x9F, 0xF0, 0x9F, 0xF1, 0x9F, 0xF2, 0x9F, 0xF3, /* 0xC4-0xC7 */ + 0x9F, 0xF4, 0x9F, 0xF5, 0x9F, 0xF6, 0x9F, 0xF7, /* 0xC8-0xCB */ + 0x9F, 0xF8, 0x9F, 0xF9, 0xC1, 0xC7, 0x9F, 0xFA, /* 0xCC-0xCF */ + 0x9F, 0xFB, 0x9F, 0xFC, 0x9F, 0xFD, 0x9F, 0xFE, /* 0xD0-0xD3 */ + 0xEC, 0xDC, 0xD1, 0xE0, 0xA0, 0x40, 0xA0, 0x41, /* 0xD4-0xD7 
*/ + 0xA0, 0x42, 0xA0, 0x43, 0xA0, 0x44, 0xA0, 0x45, /* 0xD8-0xDB */ + 0xA0, 0x46, 0xA0, 0x47, 0xA0, 0x48, 0xA0, 0x49, /* 0xDC-0xDF */ + 0xEC, 0xDB, 0xA0, 0x4A, 0xA0, 0x4B, 0xA0, 0x4C, /* 0xE0-0xE3 */ + 0xA0, 0x4D, 0xD4, 0xEF, 0xA0, 0x4E, 0xEC, 0xDD, /* 0xE4-0xE7 */ + 0xA0, 0x4F, 0xA0, 0x50, 0xA0, 0x51, 0xA0, 0x52, /* 0xE8-0xEB */ + 0xA0, 0x53, 0xA0, 0x54, 0xDB, 0xC6, 0xA0, 0x55, /* 0xEC-0xEF */ + 0xA0, 0x56, 0xA0, 0x57, 0xA0, 0x58, 0xA0, 0x59, /* 0xF0-0xF3 */ + 0xA0, 0x5A, 0xA0, 0x5B, 0xA0, 0x5C, 0xA0, 0x5D, /* 0xF4-0xF7 */ + 0xA0, 0x5E, 0xEC, 0xDE, 0xA0, 0x5F, 0xA0, 0x60, /* 0xF8-0xFB */ + 0xA0, 0x61, 0xA0, 0x62, 0xA0, 0x63, 0xA0, 0x64, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0xA0, 0x65, 0xA0, 0x66, 0xA0, 0x67, 0xA0, 0x68, /* 0x00-0x03 */ + 0xA0, 0x69, 0xA0, 0x6A, 0xB1, 0xAC, 0xA0, 0x6B, /* 0x04-0x07 */ + 0xA0, 0x6C, 0xA0, 0x6D, 0xA0, 0x6E, 0xA0, 0x6F, /* 0x08-0x0B */ + 0xA0, 0x70, 0xA0, 0x71, 0xA0, 0x72, 0xA0, 0x73, /* 0x0C-0x0F */ + 0xA0, 0x74, 0xA0, 0x75, 0xA0, 0x76, 0xA0, 0x77, /* 0x10-0x13 */ + 0xA0, 0x78, 0xA0, 0x79, 0xA0, 0x7A, 0xA0, 0x7B, /* 0x14-0x17 */ + 0xA0, 0x7C, 0xA0, 0x7D, 0xA0, 0x7E, 0xA0, 0x80, /* 0x18-0x1B */ + 0xA0, 0x81, 0xEC, 0xDF, 0xA0, 0x82, 0xA0, 0x83, /* 0x1C-0x1F */ + 0xA0, 0x84, 0xA0, 0x85, 0xA0, 0x86, 0xA0, 0x87, /* 0x20-0x23 */ + 0xA0, 0x88, 0xA0, 0x89, 0xA0, 0x8A, 0xA0, 0x8B, /* 0x24-0x27 */ + 0xEC, 0xE0, 0xA0, 0x8C, 0xD7, 0xA6, 0xA0, 0x8D, /* 0x28-0x2B */ + 0xC5, 0xC0, 0xA0, 0x8E, 0xA0, 0x8F, 0xA0, 0x90, /* 0x2C-0x2F */ + 0xEB, 0xBC, 0xB0, 0xAE, 0xA0, 0x91, 0xA0, 0x92, /* 0x30-0x33 */ + 0xA0, 0x93, 0xBE, 0xF4, 0xB8, 0xB8, 0xD2, 0xAF, /* 0x34-0x37 */ + 0xB0, 0xD6, 0xB5, 0xF9, 0xA0, 0x94, 0xD8, 0xB3, /* 0x38-0x3B */ + 0xA0, 0x95, 0xCB, 0xAC, 0xA0, 0x96, 0xE3, 0xDD, /* 0x3C-0x3F */ + 0xA0, 0x97, 0xA0, 0x98, 0xA0, 0x99, 0xA0, 0x9A, /* 0x40-0x43 */ + 0xA0, 0x9B, 0xA0, 0x9C, 0xA0, 0x9D, 0xC6, 0xAC, /* 0x44-0x47 */ + 0xB0, 0xE6, 0xA0, 0x9E, 0xA0, 0x9F, 0xA0, 0xA0, /* 0x48-0x4B */ + 0xC5, 0xC6, 0xEB, 0xB9, 0xA0, 0xA1, 0xA0, 0xA2, /* 0x4C-0x4F */ + 0xA0, 0xA3, 0xA0, 0xA4, 0xEB, 0xBA, 0xA0, 0xA5, /* 0x50-0x53 */ + 0xA0, 0xA6, 0xA0, 0xA7, 0xEB, 0xBB, 0xA0, 0xA8, /* 0x54-0x57 */ + 0xA0, 0xA9, 0xD1, 0xC0, 0xA0, 0xAA, 0xC5, 0xA3, /* 0x58-0x5B */ + 0xA0, 0xAB, 0xEA, 0xF2, 0xA0, 0xAC, 0xC4, 0xB2, /* 0x5C-0x5F */ + 0xA0, 0xAD, 0xC4, 0xB5, 0xC0, 0xCE, 0xA0, 0xAE, /* 0x60-0x63 */ + 0xA0, 0xAF, 0xA0, 0xB0, 0xEA, 0xF3, 0xC4, 0xC1, /* 0x64-0x67 */ + 0xA0, 0xB1, 0xCE, 0xEF, 0xA0, 0xB2, 0xA0, 0xB3, /* 0x68-0x6B */ + 0xA0, 0xB4, 0xA0, 0xB5, 0xEA, 0xF0, 0xEA, 0xF4, /* 0x6C-0x6F */ + 0xA0, 0xB6, 0xA0, 0xB7, 0xC9, 0xFC, 0xA0, 0xB8, /* 0x70-0x73 */ + 0xA0, 0xB9, 0xC7, 0xA3, 0xA0, 0xBA, 0xA0, 0xBB, /* 0x74-0x77 */ + 0xA0, 0xBC, 0xCC, 0xD8, 0xCE, 0xFE, 0xA0, 0xBD, /* 0x78-0x7B */ + 0xA0, 0xBE, 0xA0, 0xBF, 0xEA, 0xF5, 0xEA, 0xF6, /* 0x7C-0x7F */ + + 0xCF, 0xAC, 0xC0, 0xE7, 0xA0, 0xC0, 0xA0, 0xC1, /* 0x80-0x83 */ + 0xEA, 0xF7, 0xA0, 0xC2, 0xA0, 0xC3, 0xA0, 0xC4, /* 0x84-0x87 */ + 0xA0, 0xC5, 0xA0, 0xC6, 0xB6, 0xBF, 0xEA, 0xF8, /* 0x88-0x8B */ + 0xA0, 0xC7, 0xEA, 0xF9, 0xA0, 0xC8, 0xEA, 0xFA, /* 0x8C-0x8F */ + 0xA0, 0xC9, 0xA0, 0xCA, 0xEA, 0xFB, 0xA0, 0xCB, /* 0x90-0x93 */ + 0xA0, 0xCC, 0xA0, 0xCD, 0xA0, 0xCE, 0xA0, 0xCF, /* 0x94-0x97 */ + 0xA0, 0xD0, 0xA0, 0xD1, 0xA0, 0xD2, 0xA0, 0xD3, /* 0x98-0x9B */ + 0xA0, 0xD4, 0xA0, 0xD5, 0xA0, 0xD6, 0xEA, 0xF1, /* 0x9C-0x9F */ + 0xA0, 0xD7, 0xA0, 0xD8, 0xA0, 0xD9, 0xA0, 0xDA, /* 0xA0-0xA3 */ + 0xA0, 0xDB, 0xA0, 0xDC, 0xA0, 0xDD, 0xA0, 0xDE, /* 0xA4-0xA7 */ + 0xA0, 0xDF, 0xA0, 0xE0, 0xA0, 0xE1, 0xA0, 0xE2, /* 0xA8-0xAB */ + 
0xC8, 0xAE, 0xE1, 0xEB, 0xA0, 0xE3, 0xB7, 0xB8, /* 0xAC-0xAF */ + 0xE1, 0xEC, 0xA0, 0xE4, 0xA0, 0xE5, 0xA0, 0xE6, /* 0xB0-0xB3 */ + 0xE1, 0xED, 0xA0, 0xE7, 0xD7, 0xB4, 0xE1, 0xEE, /* 0xB4-0xB7 */ + 0xE1, 0xEF, 0xD3, 0xCC, 0xA0, 0xE8, 0xA0, 0xE9, /* 0xB8-0xBB */ + 0xA0, 0xEA, 0xA0, 0xEB, 0xA0, 0xEC, 0xA0, 0xED, /* 0xBC-0xBF */ + 0xA0, 0xEE, 0xE1, 0xF1, 0xBF, 0xF1, 0xE1, 0xF0, /* 0xC0-0xC3 */ + 0xB5, 0xD2, 0xA0, 0xEF, 0xA0, 0xF0, 0xA0, 0xF1, /* 0xC4-0xC7 */ + 0xB1, 0xB7, 0xA0, 0xF2, 0xA0, 0xF3, 0xA0, 0xF4, /* 0xC8-0xCB */ + 0xA0, 0xF5, 0xE1, 0xF3, 0xE1, 0xF2, 0xA0, 0xF6, /* 0xCC-0xCF */ + 0xBA, 0xFC, 0xA0, 0xF7, 0xE1, 0xF4, 0xA0, 0xF8, /* 0xD0-0xD3 */ + 0xA0, 0xF9, 0xA0, 0xFA, 0xA0, 0xFB, 0xB9, 0xB7, /* 0xD4-0xD7 */ + 0xA0, 0xFC, 0xBE, 0xD1, 0xA0, 0xFD, 0xA0, 0xFE, /* 0xD8-0xDB */ + 0xAA, 0x40, 0xAA, 0x41, 0xC4, 0xFC, 0xAA, 0x42, /* 0xDC-0xDF */ + 0xBA, 0xDD, 0xBD, 0xC6, 0xAA, 0x43, 0xAA, 0x44, /* 0xE0-0xE3 */ + 0xAA, 0x45, 0xAA, 0x46, 0xAA, 0x47, 0xAA, 0x48, /* 0xE4-0xE7 */ + 0xE1, 0xF5, 0xE1, 0xF7, 0xAA, 0x49, 0xAA, 0x4A, /* 0xE8-0xEB */ + 0xB6, 0xC0, 0xCF, 0xC1, 0xCA, 0xA8, 0xE1, 0xF6, /* 0xEC-0xEF */ + 0xD5, 0xF8, 0xD3, 0xFC, 0xE1, 0xF8, 0xE1, 0xFC, /* 0xF0-0xF3 */ + 0xE1, 0xF9, 0xAA, 0x4B, 0xAA, 0x4C, 0xE1, 0xFA, /* 0xF4-0xF7 */ + 0xC0, 0xEA, 0xAA, 0x4D, 0xE1, 0xFE, 0xE2, 0xA1, /* 0xF8-0xFB */ + 0xC0, 0xC7, 0xAA, 0x4E, 0xAA, 0x4F, 0xAA, 0x50, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0xAA, 0x51, 0xE1, 0xFB, 0xAA, 0x52, 0xE1, 0xFD, /* 0x00-0x03 */ + 0xAA, 0x53, 0xAA, 0x54, 0xAA, 0x55, 0xAA, 0x56, /* 0x04-0x07 */ + 0xAA, 0x57, 0xAA, 0x58, 0xE2, 0xA5, 0xAA, 0x59, /* 0x08-0x0B */ + 0xAA, 0x5A, 0xAA, 0x5B, 0xC1, 0xD4, 0xAA, 0x5C, /* 0x0C-0x0F */ + 0xAA, 0x5D, 0xAA, 0x5E, 0xAA, 0x5F, 0xE2, 0xA3, /* 0x10-0x13 */ + 0xAA, 0x60, 0xE2, 0xA8, 0xB2, 0xFE, 0xE2, 0xA2, /* 0x14-0x17 */ + 0xAA, 0x61, 0xAA, 0x62, 0xAA, 0x63, 0xC3, 0xCD, /* 0x18-0x1B */ + 0xB2, 0xC2, 0xE2, 0xA7, 0xE2, 0xA6, 0xAA, 0x64, /* 0x1C-0x1F */ + 0xAA, 0x65, 0xE2, 0xA4, 0xE2, 0xA9, 0xAA, 0x66, /* 0x20-0x23 */ + 0xAA, 0x67, 0xE2, 0xAB, 0xAA, 0x68, 0xAA, 0x69, /* 0x24-0x27 */ + 0xAA, 0x6A, 0xD0, 0xC9, 0xD6, 0xED, 0xC3, 0xA8, /* 0x28-0x2B */ + 0xE2, 0xAC, 0xAA, 0x6B, 0xCF, 0xD7, 0xAA, 0x6C, /* 0x2C-0x2F */ + 0xAA, 0x6D, 0xE2, 0xAE, 0xAA, 0x6E, 0xAA, 0x6F, /* 0x30-0x33 */ + 0xBA, 0xEF, 0xAA, 0x70, 0xAA, 0x71, 0xE9, 0xE0, /* 0x34-0x37 */ + 0xE2, 0xAD, 0xE2, 0xAA, 0xAA, 0x72, 0xAA, 0x73, /* 0x38-0x3B */ + 0xAA, 0x74, 0xAA, 0x75, 0xBB, 0xAB, 0xD4, 0xB3, /* 0x3C-0x3F */ + 0xAA, 0x76, 0xAA, 0x77, 0xAA, 0x78, 0xAA, 0x79, /* 0x40-0x43 */ + 0xAA, 0x7A, 0xAA, 0x7B, 0xAA, 0x7C, 0xAA, 0x7D, /* 0x44-0x47 */ + 0xAA, 0x7E, 0xAA, 0x80, 0xAA, 0x81, 0xAA, 0x82, /* 0x48-0x4B */ + 0xAA, 0x83, 0xE2, 0xB0, 0xAA, 0x84, 0xAA, 0x85, /* 0x4C-0x4F */ + 0xE2, 0xAF, 0xAA, 0x86, 0xE9, 0xE1, 0xAA, 0x87, /* 0x50-0x53 */ + 0xAA, 0x88, 0xAA, 0x89, 0xAA, 0x8A, 0xE2, 0xB1, /* 0x54-0x57 */ + 0xAA, 0x8B, 0xAA, 0x8C, 0xAA, 0x8D, 0xAA, 0x8E, /* 0x58-0x5B */ + 0xAA, 0x8F, 0xAA, 0x90, 0xAA, 0x91, 0xAA, 0x92, /* 0x5C-0x5F */ + 0xE2, 0xB2, 0xAA, 0x93, 0xAA, 0x94, 0xAA, 0x95, /* 0x60-0x63 */ + 0xAA, 0x96, 0xAA, 0x97, 0xAA, 0x98, 0xAA, 0x99, /* 0x64-0x67 */ + 0xAA, 0x9A, 0xAA, 0x9B, 0xAA, 0x9C, 0xAA, 0x9D, /* 0x68-0x6B */ + 0xE2, 0xB3, 0xCC, 0xA1, 0xAA, 0x9E, 0xE2, 0xB4, /* 0x6C-0x6F */ + 0xAA, 0x9F, 0xAA, 0xA0, 0xAB, 0x40, 0xAB, 0x41, /* 0x70-0x73 */ + 0xAB, 0x42, 0xAB, 0x43, 0xAB, 0x44, 0xAB, 0x45, /* 0x74-0x77 */ + 0xAB, 0x46, 0xAB, 0x47, 0xAB, 0x48, 0xAB, 0x49, /* 0x78-0x7B */ + 0xAB, 0x4A, 0xAB, 0x4B, 0xE2, 0xB5, 0xAB, 0x4C, /* 0x7C-0x7F */ + + 
0xAB, 0x4D, 0xAB, 0x4E, 0xAB, 0x4F, 0xAB, 0x50, /* 0x80-0x83 */ + 0xD0, 0xFE, 0xAB, 0x51, 0xAB, 0x52, 0xC2, 0xCA, /* 0x84-0x87 */ + 0xAB, 0x53, 0xD3, 0xF1, 0xAB, 0x54, 0xCD, 0xF5, /* 0x88-0x8B */ + 0xAB, 0x55, 0xAB, 0x56, 0xE7, 0xE0, 0xAB, 0x57, /* 0x8C-0x8F */ + 0xAB, 0x58, 0xE7, 0xE1, 0xAB, 0x59, 0xAB, 0x5A, /* 0x90-0x93 */ + 0xAB, 0x5B, 0xAB, 0x5C, 0xBE, 0xC1, 0xAB, 0x5D, /* 0x94-0x97 */ + 0xAB, 0x5E, 0xAB, 0x5F, 0xAB, 0x60, 0xC2, 0xEA, /* 0x98-0x9B */ + 0xAB, 0x61, 0xAB, 0x62, 0xAB, 0x63, 0xE7, 0xE4, /* 0x9C-0x9F */ + 0xAB, 0x64, 0xAB, 0x65, 0xE7, 0xE3, 0xAB, 0x66, /* 0xA0-0xA3 */ + 0xAB, 0x67, 0xAB, 0x68, 0xAB, 0x69, 0xAB, 0x6A, /* 0xA4-0xA7 */ + 0xAB, 0x6B, 0xCD, 0xE6, 0xAB, 0x6C, 0xC3, 0xB5, /* 0xA8-0xAB */ + 0xAB, 0x6D, 0xAB, 0x6E, 0xE7, 0xE2, 0xBB, 0xB7, /* 0xAC-0xAF */ + 0xCF, 0xD6, 0xAB, 0x6F, 0xC1, 0xE1, 0xE7, 0xE9, /* 0xB0-0xB3 */ + 0xAB, 0x70, 0xAB, 0x71, 0xAB, 0x72, 0xE7, 0xE8, /* 0xB4-0xB7 */ + 0xAB, 0x73, 0xAB, 0x74, 0xE7, 0xF4, 0xB2, 0xA3, /* 0xB8-0xBB */ + 0xAB, 0x75, 0xAB, 0x76, 0xAB, 0x77, 0xAB, 0x78, /* 0xBC-0xBF */ + 0xE7, 0xEA, 0xAB, 0x79, 0xE7, 0xE6, 0xAB, 0x7A, /* 0xC0-0xC3 */ + 0xAB, 0x7B, 0xAB, 0x7C, 0xAB, 0x7D, 0xAB, 0x7E, /* 0xC4-0xC7 */ + 0xE7, 0xEC, 0xE7, 0xEB, 0xC9, 0xBA, 0xAB, 0x80, /* 0xC8-0xCB */ + 0xAB, 0x81, 0xD5, 0xE4, 0xAB, 0x82, 0xE7, 0xE5, /* 0xCC-0xCF */ + 0xB7, 0xA9, 0xE7, 0xE7, 0xAB, 0x83, 0xAB, 0x84, /* 0xD0-0xD3 */ + 0xAB, 0x85, 0xAB, 0x86, 0xAB, 0x87, 0xAB, 0x88, /* 0xD4-0xD7 */ + 0xAB, 0x89, 0xE7, 0xEE, 0xAB, 0x8A, 0xAB, 0x8B, /* 0xD8-0xDB */ + 0xAB, 0x8C, 0xAB, 0x8D, 0xE7, 0xF3, 0xAB, 0x8E, /* 0xDC-0xDF */ + 0xD6, 0xE9, 0xAB, 0x8F, 0xAB, 0x90, 0xAB, 0x91, /* 0xE0-0xE3 */ + 0xAB, 0x92, 0xE7, 0xED, 0xAB, 0x93, 0xE7, 0xF2, /* 0xE4-0xE7 */ + 0xAB, 0x94, 0xE7, 0xF1, 0xAB, 0x95, 0xAB, 0x96, /* 0xE8-0xEB */ + 0xAB, 0x97, 0xB0, 0xE0, 0xAB, 0x98, 0xAB, 0x99, /* 0xEC-0xEF */ + 0xAB, 0x9A, 0xAB, 0x9B, 0xE7, 0xF5, 0xAB, 0x9C, /* 0xF0-0xF3 */ + 0xAB, 0x9D, 0xAB, 0x9E, 0xAB, 0x9F, 0xAB, 0xA0, /* 0xF4-0xF7 */ + 0xAC, 0x40, 0xAC, 0x41, 0xAC, 0x42, 0xAC, 0x43, /* 0xF8-0xFB */ + 0xAC, 0x44, 0xAC, 0x45, 0xAC, 0x46, 0xAC, 0x47, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0xAC, 0x48, 0xAC, 0x49, 0xAC, 0x4A, 0xC7, 0xF2, /* 0x00-0x03 */ + 0xAC, 0x4B, 0xC0, 0xC5, 0xC0, 0xED, 0xAC, 0x4C, /* 0x04-0x07 */ + 0xAC, 0x4D, 0xC1, 0xF0, 0xE7, 0xF0, 0xAC, 0x4E, /* 0x08-0x0B */ + 0xAC, 0x4F, 0xAC, 0x50, 0xAC, 0x51, 0xE7, 0xF6, /* 0x0C-0x0F */ + 0xCB, 0xF6, 0xAC, 0x52, 0xAC, 0x53, 0xAC, 0x54, /* 0x10-0x13 */ + 0xAC, 0x55, 0xAC, 0x56, 0xAC, 0x57, 0xAC, 0x58, /* 0x14-0x17 */ + 0xAC, 0x59, 0xAC, 0x5A, 0xE8, 0xA2, 0xE8, 0xA1, /* 0x18-0x1B */ + 0xAC, 0x5B, 0xAC, 0x5C, 0xAC, 0x5D, 0xAC, 0x5E, /* 0x1C-0x1F */ + 0xAC, 0x5F, 0xAC, 0x60, 0xD7, 0xC1, 0xAC, 0x61, /* 0x20-0x23 */ + 0xAC, 0x62, 0xE7, 0xFA, 0xE7, 0xF9, 0xAC, 0x63, /* 0x24-0x27 */ + 0xE7, 0xFB, 0xAC, 0x64, 0xE7, 0xF7, 0xAC, 0x65, /* 0x28-0x2B */ + 0xE7, 0xFE, 0xAC, 0x66, 0xE7, 0xFD, 0xAC, 0x67, /* 0x2C-0x2F */ + 0xE7, 0xFC, 0xAC, 0x68, 0xAC, 0x69, 0xC1, 0xD5, /* 0x30-0x33 */ + 0xC7, 0xD9, 0xC5, 0xFD, 0xC5, 0xC3, 0xAC, 0x6A, /* 0x34-0x37 */ + 0xAC, 0x6B, 0xAC, 0x6C, 0xAC, 0x6D, 0xAC, 0x6E, /* 0x38-0x3B */ + 0xC7, 0xED, 0xAC, 0x6F, 0xAC, 0x70, 0xAC, 0x71, /* 0x3C-0x3F */ + 0xAC, 0x72, 0xE8, 0xA3, 0xAC, 0x73, 0xAC, 0x74, /* 0x40-0x43 */ + 0xAC, 0x75, 0xAC, 0x76, 0xAC, 0x77, 0xAC, 0x78, /* 0x44-0x47 */ + 0xAC, 0x79, 0xAC, 0x7A, 0xAC, 0x7B, 0xAC, 0x7C, /* 0x48-0x4B */ + 0xAC, 0x7D, 0xAC, 0x7E, 0xAC, 0x80, 0xAC, 0x81, /* 0x4C-0x4F */ + 0xAC, 0x82, 0xAC, 0x83, 0xAC, 0x84, 0xAC, 0x85, /* 0x50-0x53 */ + 0xAC, 
0x86, 0xE8, 0xA6, 0xAC, 0x87, 0xE8, 0xA5, /* 0x54-0x57 */ + 0xAC, 0x88, 0xE8, 0xA7, 0xBA, 0xF7, 0xE7, 0xF8, /* 0x58-0x5B */ + 0xE8, 0xA4, 0xAC, 0x89, 0xC8, 0xF0, 0xC9, 0xAA, /* 0x5C-0x5F */ + 0xAC, 0x8A, 0xAC, 0x8B, 0xAC, 0x8C, 0xAC, 0x8D, /* 0x60-0x63 */ + 0xAC, 0x8E, 0xAC, 0x8F, 0xAC, 0x90, 0xAC, 0x91, /* 0x64-0x67 */ + 0xAC, 0x92, 0xAC, 0x93, 0xAC, 0x94, 0xAC, 0x95, /* 0x68-0x6B */ + 0xAC, 0x96, 0xE8, 0xA9, 0xAC, 0x97, 0xAC, 0x98, /* 0x6C-0x6F */ + 0xB9, 0xE5, 0xAC, 0x99, 0xAC, 0x9A, 0xAC, 0x9B, /* 0x70-0x73 */ + 0xAC, 0x9C, 0xAC, 0x9D, 0xD1, 0xFE, 0xE8, 0xA8, /* 0x74-0x77 */ + 0xAC, 0x9E, 0xAC, 0x9F, 0xAC, 0xA0, 0xAD, 0x40, /* 0x78-0x7B */ + 0xAD, 0x41, 0xAD, 0x42, 0xE8, 0xAA, 0xAD, 0x43, /* 0x7C-0x7F */ + + 0xE8, 0xAD, 0xE8, 0xAE, 0xAD, 0x44, 0xC1, 0xA7, /* 0x80-0x83 */ + 0xAD, 0x45, 0xAD, 0x46, 0xAD, 0x47, 0xE8, 0xAF, /* 0x84-0x87 */ + 0xAD, 0x48, 0xAD, 0x49, 0xAD, 0x4A, 0xE8, 0xB0, /* 0x88-0x8B */ + 0xAD, 0x4B, 0xAD, 0x4C, 0xE8, 0xAC, 0xAD, 0x4D, /* 0x8C-0x8F */ + 0xE8, 0xB4, 0xAD, 0x4E, 0xAD, 0x4F, 0xAD, 0x50, /* 0x90-0x93 */ + 0xAD, 0x51, 0xAD, 0x52, 0xAD, 0x53, 0xAD, 0x54, /* 0x94-0x97 */ + 0xAD, 0x55, 0xAD, 0x56, 0xAD, 0x57, 0xAD, 0x58, /* 0x98-0x9B */ + 0xE8, 0xAB, 0xAD, 0x59, 0xE8, 0xB1, 0xAD, 0x5A, /* 0x9C-0x9F */ + 0xAD, 0x5B, 0xAD, 0x5C, 0xAD, 0x5D, 0xAD, 0x5E, /* 0xA0-0xA3 */ + 0xAD, 0x5F, 0xAD, 0x60, 0xAD, 0x61, 0xE8, 0xB5, /* 0xA4-0xA7 */ + 0xE8, 0xB2, 0xE8, 0xB3, 0xAD, 0x62, 0xAD, 0x63, /* 0xA8-0xAB */ + 0xAD, 0x64, 0xAD, 0x65, 0xAD, 0x66, 0xAD, 0x67, /* 0xAC-0xAF */ + 0xAD, 0x68, 0xAD, 0x69, 0xAD, 0x6A, 0xAD, 0x6B, /* 0xB0-0xB3 */ + 0xAD, 0x6C, 0xAD, 0x6D, 0xAD, 0x6E, 0xAD, 0x6F, /* 0xB4-0xB7 */ + 0xAD, 0x70, 0xAD, 0x71, 0xE8, 0xB7, 0xAD, 0x72, /* 0xB8-0xBB */ + 0xAD, 0x73, 0xAD, 0x74, 0xAD, 0x75, 0xAD, 0x76, /* 0xBC-0xBF */ + 0xAD, 0x77, 0xAD, 0x78, 0xAD, 0x79, 0xAD, 0x7A, /* 0xC0-0xC3 */ + 0xAD, 0x7B, 0xAD, 0x7C, 0xAD, 0x7D, 0xAD, 0x7E, /* 0xC4-0xC7 */ + 0xAD, 0x80, 0xAD, 0x81, 0xAD, 0x82, 0xAD, 0x83, /* 0xC8-0xCB */ + 0xAD, 0x84, 0xAD, 0x85, 0xAD, 0x86, 0xAD, 0x87, /* 0xCC-0xCF */ + 0xAD, 0x88, 0xAD, 0x89, 0xE8, 0xB6, 0xAD, 0x8A, /* 0xD0-0xD3 */ + 0xAD, 0x8B, 0xAD, 0x8C, 0xAD, 0x8D, 0xAD, 0x8E, /* 0xD4-0xD7 */ + 0xAD, 0x8F, 0xAD, 0x90, 0xAD, 0x91, 0xAD, 0x92, /* 0xD8-0xDB */ + 0xB9, 0xCF, 0xAD, 0x93, 0xF0, 0xAC, 0xAD, 0x94, /* 0xDC-0xDF */ + 0xF0, 0xAD, 0xAD, 0x95, 0xC6, 0xB0, 0xB0, 0xEA, /* 0xE0-0xE3 */ + 0xC8, 0xBF, 0xAD, 0x96, 0xCD, 0xDF, 0xAD, 0x97, /* 0xE4-0xE7 */ + 0xAD, 0x98, 0xAD, 0x99, 0xAD, 0x9A, 0xAD, 0x9B, /* 0xE8-0xEB */ + 0xAD, 0x9C, 0xAD, 0x9D, 0xCE, 0xCD, 0xEA, 0xB1, /* 0xEC-0xEF */ + 0xAD, 0x9E, 0xAD, 0x9F, 0xAD, 0xA0, 0xAE, 0x40, /* 0xF0-0xF3 */ + 0xEA, 0xB2, 0xAE, 0x41, 0xC6, 0xBF, 0xB4, 0xC9, /* 0xF4-0xF7 */ + 0xAE, 0x42, 0xAE, 0x43, 0xAE, 0x44, 0xAE, 0x45, /* 0xF8-0xFB */ + 0xAE, 0x46, 0xAE, 0x47, 0xAE, 0x48, 0xEA, 0xB3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_75[512] = { + 0xAE, 0x49, 0xAE, 0x4A, 0xAE, 0x4B, 0xAE, 0x4C, /* 0x00-0x03 */ + 0xD5, 0xE7, 0xAE, 0x4D, 0xAE, 0x4E, 0xAE, 0x4F, /* 0x04-0x07 */ + 0xAE, 0x50, 0xAE, 0x51, 0xAE, 0x52, 0xAE, 0x53, /* 0x08-0x0B */ + 0xAE, 0x54, 0xDD, 0xF9, 0xAE, 0x55, 0xEA, 0xB4, /* 0x0C-0x0F */ + 0xAE, 0x56, 0xEA, 0xB5, 0xAE, 0x57, 0xEA, 0xB6, /* 0x10-0x13 */ + 0xAE, 0x58, 0xAE, 0x59, 0xAE, 0x5A, 0xAE, 0x5B, /* 0x14-0x17 */ + 0xB8, 0xCA, 0xDF, 0xB0, 0xC9, 0xF5, 0xAE, 0x5C, /* 0x18-0x1B */ + 0xCC, 0xF0, 0xAE, 0x5D, 0xAE, 0x5E, 0xC9, 0xFA, /* 0x1C-0x1F */ + 0xAE, 0x5F, 0xAE, 0x60, 0xAE, 0x61, 0xAE, 0x62, /* 0x20-0x23 */ + 0xAE, 0x63, 0xC9, 0xFB, 0xAE, 0x64, 0xAE, 0x65, /* 0x24-0x27 */ + 0xD3, 
0xC3, 0xCB, 0xA6, 0xAE, 0x66, 0xB8, 0xA6, /* 0x28-0x2B */ + 0xF0, 0xAE, 0xB1, 0xC2, 0xAE, 0x67, 0xE5, 0xB8, /* 0x2C-0x2F */ + 0xCC, 0xEF, 0xD3, 0xC9, 0xBC, 0xD7, 0xC9, 0xEA, /* 0x30-0x33 */ + 0xAE, 0x68, 0xB5, 0xE7, 0xAE, 0x69, 0xC4, 0xD0, /* 0x34-0x37 */ + 0xB5, 0xE9, 0xAE, 0x6A, 0xEE, 0xAE, 0xBB, 0xAD, /* 0x38-0x3B */ + 0xAE, 0x6B, 0xAE, 0x6C, 0xE7, 0xDE, 0xAE, 0x6D, /* 0x3C-0x3F */ + 0xEE, 0xAF, 0xAE, 0x6E, 0xAE, 0x6F, 0xAE, 0x70, /* 0x40-0x43 */ + 0xAE, 0x71, 0xB3, 0xA9, 0xAE, 0x72, 0xAE, 0x73, /* 0x44-0x47 */ + 0xEE, 0xB2, 0xAE, 0x74, 0xAE, 0x75, 0xEE, 0xB1, /* 0x48-0x4B */ + 0xBD, 0xE7, 0xAE, 0x76, 0xEE, 0xB0, 0xCE, 0xB7, /* 0x4C-0x4F */ + 0xAE, 0x77, 0xAE, 0x78, 0xAE, 0x79, 0xAE, 0x7A, /* 0x50-0x53 */ + 0xC5, 0xCF, 0xAE, 0x7B, 0xAE, 0x7C, 0xAE, 0x7D, /* 0x54-0x57 */ + 0xAE, 0x7E, 0xC1, 0xF4, 0xDB, 0xCE, 0xEE, 0xB3, /* 0x58-0x5B */ + 0xD0, 0xF3, 0xAE, 0x80, 0xAE, 0x81, 0xAE, 0x82, /* 0x5C-0x5F */ + 0xAE, 0x83, 0xAE, 0x84, 0xAE, 0x85, 0xAE, 0x86, /* 0x60-0x63 */ + 0xAE, 0x87, 0xC2, 0xD4, 0xC6, 0xE8, 0xAE, 0x88, /* 0x64-0x67 */ + 0xAE, 0x89, 0xAE, 0x8A, 0xB7, 0xAC, 0xAE, 0x8B, /* 0x68-0x6B */ + 0xAE, 0x8C, 0xAE, 0x8D, 0xAE, 0x8E, 0xAE, 0x8F, /* 0x6C-0x6F */ + 0xAE, 0x90, 0xAE, 0x91, 0xEE, 0xB4, 0xAE, 0x92, /* 0x70-0x73 */ + 0xB3, 0xEB, 0xAE, 0x93, 0xAE, 0x94, 0xAE, 0x95, /* 0x74-0x77 */ + 0xBB, 0xFB, 0xEE, 0xB5, 0xAE, 0x96, 0xAE, 0x97, /* 0x78-0x7B */ + 0xAE, 0x98, 0xAE, 0x99, 0xAE, 0x9A, 0xE7, 0xDC, /* 0x7C-0x7F */ + + 0xAE, 0x9B, 0xAE, 0x9C, 0xAE, 0x9D, 0xEE, 0xB6, /* 0x80-0x83 */ + 0xAE, 0x9E, 0xAE, 0x9F, 0xBD, 0xAE, 0xAE, 0xA0, /* 0x84-0x87 */ + 0xAF, 0x40, 0xAF, 0x41, 0xAF, 0x42, 0xF1, 0xE2, /* 0x88-0x8B */ + 0xAF, 0x43, 0xAF, 0x44, 0xAF, 0x45, 0xCA, 0xE8, /* 0x8C-0x8F */ + 0xAF, 0x46, 0xD2, 0xC9, 0xF0, 0xDA, 0xAF, 0x47, /* 0x90-0x93 */ + 0xF0, 0xDB, 0xAF, 0x48, 0xF0, 0xDC, 0xC1, 0xC6, /* 0x94-0x97 */ + 0xAF, 0x49, 0xB8, 0xED, 0xBE, 0xCE, 0xAF, 0x4A, /* 0x98-0x9B */ + 0xAF, 0x4B, 0xF0, 0xDE, 0xAF, 0x4C, 0xC5, 0xB1, /* 0x9C-0x9F */ + 0xF0, 0xDD, 0xD1, 0xF1, 0xAF, 0x4D, 0xF0, 0xE0, /* 0xA0-0xA3 */ + 0xB0, 0xCC, 0xBD, 0xEA, 0xAF, 0x4E, 0xAF, 0x4F, /* 0xA4-0xA7 */ + 0xAF, 0x50, 0xAF, 0x51, 0xAF, 0x52, 0xD2, 0xDF, /* 0xA8-0xAB */ + 0xF0, 0xDF, 0xAF, 0x53, 0xB4, 0xAF, 0xB7, 0xE8, /* 0xAC-0xAF */ + 0xF0, 0xE6, 0xF0, 0xE5, 0xC6, 0xA3, 0xF0, 0xE1, /* 0xB0-0xB3 */ + 0xF0, 0xE2, 0xB4, 0xC3, 0xAF, 0x54, 0xAF, 0x55, /* 0xB4-0xB7 */ + 0xF0, 0xE3, 0xD5, 0xEE, 0xAF, 0x56, 0xAF, 0x57, /* 0xB8-0xBB */ + 0xCC, 0xDB, 0xBE, 0xD2, 0xBC, 0xB2, 0xAF, 0x58, /* 0xBC-0xBF */ + 0xAF, 0x59, 0xAF, 0x5A, 0xF0, 0xE8, 0xF0, 0xE7, /* 0xC0-0xC3 */ + 0xF0, 0xE4, 0xB2, 0xA1, 0xAF, 0x5B, 0xD6, 0xA2, /* 0xC4-0xC7 */ + 0xD3, 0xB8, 0xBE, 0xB7, 0xC8, 0xAC, 0xAF, 0x5C, /* 0xC8-0xCB */ + 0xAF, 0x5D, 0xF0, 0xEA, 0xAF, 0x5E, 0xAF, 0x5F, /* 0xCC-0xCF */ + 0xAF, 0x60, 0xAF, 0x61, 0xD1, 0xF7, 0xAF, 0x62, /* 0xD0-0xD3 */ + 0xD6, 0xCC, 0xBA, 0xDB, 0xF0, 0xE9, 0xAF, 0x63, /* 0xD4-0xD7 */ + 0xB6, 0xBB, 0xAF, 0x64, 0xAF, 0x65, 0xCD, 0xB4, /* 0xD8-0xDB */ + 0xAF, 0x66, 0xAF, 0x67, 0xC6, 0xA6, 0xAF, 0x68, /* 0xDC-0xDF */ + 0xAF, 0x69, 0xAF, 0x6A, 0xC1, 0xA1, 0xF0, 0xEB, /* 0xE0-0xE3 */ + 0xF0, 0xEE, 0xAF, 0x6B, 0xF0, 0xED, 0xF0, 0xF0, /* 0xE4-0xE7 */ + 0xF0, 0xEC, 0xAF, 0x6C, 0xBB, 0xBE, 0xF0, 0xEF, /* 0xE8-0xEB */ + 0xAF, 0x6D, 0xAF, 0x6E, 0xAF, 0x6F, 0xAF, 0x70, /* 0xEC-0xEF */ + 0xCC, 0xB5, 0xF0, 0xF2, 0xAF, 0x71, 0xAF, 0x72, /* 0xF0-0xF3 */ + 0xB3, 0xD5, 0xAF, 0x73, 0xAF, 0x74, 0xAF, 0x75, /* 0xF4-0xF7 */ + 0xAF, 0x76, 0xB1, 0xD4, 0xAF, 0x77, 0xAF, 0x78, /* 0xF8-0xFB */ + 0xF0, 0xF3, 0xAF, 0x79, 0xAF, 0x7A, 0xF0, 0xF4, /* 0xFC-0xFF 
*/ +}; + +static const unsigned char u2c_76[512] = { + 0xF0, 0xF6, 0xB4, 0xE1, 0xAF, 0x7B, 0xF0, 0xF1, /* 0x00-0x03 */ + 0xAF, 0x7C, 0xF0, 0xF7, 0xAF, 0x7D, 0xAF, 0x7E, /* 0x04-0x07 */ + 0xAF, 0x80, 0xAF, 0x81, 0xF0, 0xFA, 0xAF, 0x82, /* 0x08-0x0B */ + 0xF0, 0xF8, 0xAF, 0x83, 0xAF, 0x84, 0xAF, 0x85, /* 0x0C-0x0F */ + 0xF0, 0xF5, 0xAF, 0x86, 0xAF, 0x87, 0xAF, 0x88, /* 0x10-0x13 */ + 0xAF, 0x89, 0xF0, 0xFD, 0xAF, 0x8A, 0xF0, 0xF9, /* 0x14-0x17 */ + 0xF0, 0xFC, 0xF0, 0xFE, 0xAF, 0x8B, 0xF1, 0xA1, /* 0x18-0x1B */ + 0xAF, 0x8C, 0xAF, 0x8D, 0xAF, 0x8E, 0xCE, 0xC1, /* 0x1C-0x1F */ + 0xF1, 0xA4, 0xAF, 0x8F, 0xF1, 0xA3, 0xAF, 0x90, /* 0x20-0x23 */ + 0xC1, 0xF6, 0xF0, 0xFB, 0xCA, 0xDD, 0xAF, 0x91, /* 0x24-0x27 */ + 0xAF, 0x92, 0xB4, 0xF1, 0xB1, 0xF1, 0xCC, 0xB1, /* 0x28-0x2B */ + 0xAF, 0x93, 0xF1, 0xA6, 0xAF, 0x94, 0xAF, 0x95, /* 0x2C-0x2F */ + 0xF1, 0xA7, 0xAF, 0x96, 0xAF, 0x97, 0xF1, 0xAC, /* 0x30-0x33 */ + 0xD5, 0xCE, 0xF1, 0xA9, 0xAF, 0x98, 0xAF, 0x99, /* 0x34-0x37 */ + 0xC8, 0xB3, 0xAF, 0x9A, 0xAF, 0x9B, 0xAF, 0x9C, /* 0x38-0x3B */ + 0xF1, 0xA2, 0xAF, 0x9D, 0xF1, 0xAB, 0xF1, 0xA8, /* 0x3C-0x3F */ + 0xF1, 0xA5, 0xAF, 0x9E, 0xAF, 0x9F, 0xF1, 0xAA, /* 0x40-0x43 */ + 0xAF, 0xA0, 0xB0, 0x40, 0xB0, 0x41, 0xB0, 0x42, /* 0x44-0x47 */ + 0xB0, 0x43, 0xB0, 0x44, 0xB0, 0x45, 0xB0, 0x46, /* 0x48-0x4B */ + 0xB0, 0xA9, 0xF1, 0xAD, 0xB0, 0x47, 0xB0, 0x48, /* 0x4C-0x4F */ + 0xB0, 0x49, 0xB0, 0x4A, 0xB0, 0x4B, 0xB0, 0x4C, /* 0x50-0x53 */ + 0xF1, 0xAF, 0xB0, 0x4D, 0xF1, 0xB1, 0xB0, 0x4E, /* 0x54-0x57 */ + 0xB0, 0x4F, 0xB0, 0x50, 0xB0, 0x51, 0xB0, 0x52, /* 0x58-0x5B */ + 0xF1, 0xB0, 0xB0, 0x53, 0xF1, 0xAE, 0xB0, 0x54, /* 0x5C-0x5F */ + 0xB0, 0x55, 0xB0, 0x56, 0xB0, 0x57, 0xD1, 0xA2, /* 0x60-0x63 */ + 0xB0, 0x58, 0xB0, 0x59, 0xB0, 0x5A, 0xB0, 0x5B, /* 0x64-0x67 */ + 0xB0, 0x5C, 0xB0, 0x5D, 0xB0, 0x5E, 0xF1, 0xB2, /* 0x68-0x6B */ + 0xB0, 0x5F, 0xB0, 0x60, 0xB0, 0x61, 0xF1, 0xB3, /* 0x6C-0x6F */ + 0xB0, 0x62, 0xB0, 0x63, 0xB0, 0x64, 0xB0, 0x65, /* 0x70-0x73 */ + 0xB0, 0x66, 0xB0, 0x67, 0xB0, 0x68, 0xB0, 0x69, /* 0x74-0x77 */ + 0xB9, 0xEF, 0xB0, 0x6A, 0xB0, 0x6B, 0xB5, 0xC7, /* 0x78-0x7B */ + 0xB0, 0x6C, 0xB0, 0xD7, 0xB0, 0xD9, 0xB0, 0x6D, /* 0x7C-0x7F */ + + 0xB0, 0x6E, 0xB0, 0x6F, 0xD4, 0xED, 0xB0, 0x70, /* 0x80-0x83 */ + 0xB5, 0xC4, 0xB0, 0x71, 0xBD, 0xD4, 0xBB, 0xCA, /* 0x84-0x87 */ + 0xF0, 0xA7, 0xB0, 0x72, 0xB0, 0x73, 0xB8, 0xDE, /* 0x88-0x8B */ + 0xB0, 0x74, 0xB0, 0x75, 0xF0, 0xA8, 0xB0, 0x76, /* 0x8C-0x8F */ + 0xB0, 0x77, 0xB0, 0xA8, 0xB0, 0x78, 0xF0, 0xA9, /* 0x90-0x93 */ + 0xB0, 0x79, 0xB0, 0x7A, 0xCD, 0xEE, 0xB0, 0x7B, /* 0x94-0x97 */ + 0xB0, 0x7C, 0xF0, 0xAA, 0xB0, 0x7D, 0xB0, 0x7E, /* 0x98-0x9B */ + 0xB0, 0x80, 0xB0, 0x81, 0xB0, 0x82, 0xB0, 0x83, /* 0x9C-0x9F */ + 0xB0, 0x84, 0xB0, 0x85, 0xB0, 0x86, 0xB0, 0x87, /* 0xA0-0xA3 */ + 0xF0, 0xAB, 0xB0, 0x88, 0xB0, 0x89, 0xB0, 0x8A, /* 0xA4-0xA7 */ + 0xB0, 0x8B, 0xB0, 0x8C, 0xB0, 0x8D, 0xB0, 0x8E, /* 0xA8-0xAB */ + 0xB0, 0x8F, 0xB0, 0x90, 0xC6, 0xA4, 0xB0, 0x91, /* 0xAC-0xAF */ + 0xB0, 0x92, 0xD6, 0xE5, 0xF1, 0xE4, 0xB0, 0x93, /* 0xB0-0xB3 */ + 0xF1, 0xE5, 0xB0, 0x94, 0xB0, 0x95, 0xB0, 0x96, /* 0xB4-0xB7 */ + 0xB0, 0x97, 0xB0, 0x98, 0xB0, 0x99, 0xB0, 0x9A, /* 0xB8-0xBB */ + 0xB0, 0x9B, 0xB0, 0x9C, 0xB0, 0x9D, 0xC3, 0xF3, /* 0xBC-0xBF */ + 0xB0, 0x9E, 0xB0, 0x9F, 0xD3, 0xDB, 0xB0, 0xA0, /* 0xC0-0xC3 */ + 0xB1, 0x40, 0xD6, 0xD1, 0xC5, 0xE8, 0xB1, 0x41, /* 0xC4-0xC7 */ + 0xD3, 0xAF, 0xB1, 0x42, 0xD2, 0xE6, 0xB1, 0x43, /* 0xC8-0xCB */ + 0xB1, 0x44, 0xEE, 0xC1, 0xB0, 0xBB, 0xD5, 0xB5, /* 0xCC-0xCF */ + 0xD1, 0xCE, 0xBC, 0xE0, 0xBA, 0xD0, 0xB1, 0x45, /* 0xD0-0xD3 */ + 
0xBF, 0xF8, 0xB1, 0x46, 0xB8, 0xC7, 0xB5, 0xC1, /* 0xD4-0xD7 */ + 0xC5, 0xCC, 0xB1, 0x47, 0xB1, 0x48, 0xCA, 0xA2, /* 0xD8-0xDB */ + 0xB1, 0x49, 0xB1, 0x4A, 0xB1, 0x4B, 0xC3, 0xCB, /* 0xDC-0xDF */ + 0xB1, 0x4C, 0xB1, 0x4D, 0xB1, 0x4E, 0xB1, 0x4F, /* 0xE0-0xE3 */ + 0xB1, 0x50, 0xEE, 0xC2, 0xB1, 0x51, 0xB1, 0x52, /* 0xE4-0xE7 */ + 0xB1, 0x53, 0xB1, 0x54, 0xB1, 0x55, 0xB1, 0x56, /* 0xE8-0xEB */ + 0xB1, 0x57, 0xB1, 0x58, 0xC4, 0xBF, 0xB6, 0xA2, /* 0xEC-0xEF */ + 0xB1, 0x59, 0xED, 0xEC, 0xC3, 0xA4, 0xB1, 0x5A, /* 0xF0-0xF3 */ + 0xD6, 0xB1, 0xB1, 0x5B, 0xB1, 0x5C, 0xB1, 0x5D, /* 0xF4-0xF7 */ + 0xCF, 0xE0, 0xED, 0xEF, 0xB1, 0x5E, 0xB1, 0x5F, /* 0xF8-0xFB */ + 0xC5, 0xCE, 0xB1, 0x60, 0xB6, 0xDC, 0xB1, 0x61, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0xB1, 0x62, 0xCA, 0xA1, 0xB1, 0x63, 0xB1, 0x64, /* 0x00-0x03 */ + 0xED, 0xED, 0xB1, 0x65, 0xB1, 0x66, 0xED, 0xF0, /* 0x04-0x07 */ + 0xED, 0xF1, 0xC3, 0xBC, 0xB1, 0x67, 0xBF, 0xB4, /* 0x08-0x0B */ + 0xB1, 0x68, 0xED, 0xEE, 0xB1, 0x69, 0xB1, 0x6A, /* 0x0C-0x0F */ + 0xB1, 0x6B, 0xB1, 0x6C, 0xB1, 0x6D, 0xB1, 0x6E, /* 0x10-0x13 */ + 0xB1, 0x6F, 0xB1, 0x70, 0xB1, 0x71, 0xB1, 0x72, /* 0x14-0x17 */ + 0xB1, 0x73, 0xED, 0xF4, 0xED, 0xF2, 0xB1, 0x74, /* 0x18-0x1B */ + 0xB1, 0x75, 0xB1, 0x76, 0xB1, 0x77, 0xD5, 0xE6, /* 0x1C-0x1F */ + 0xC3, 0xDF, 0xB1, 0x78, 0xED, 0xF3, 0xB1, 0x79, /* 0x20-0x23 */ + 0xB1, 0x7A, 0xB1, 0x7B, 0xED, 0xF6, 0xB1, 0x7C, /* 0x24-0x27 */ + 0xD5, 0xA3, 0xD1, 0xA3, 0xB1, 0x7D, 0xB1, 0x7E, /* 0x28-0x2B */ + 0xB1, 0x80, 0xED, 0xF5, 0xB1, 0x81, 0xC3, 0xD0, /* 0x2C-0x2F */ + 0xB1, 0x82, 0xB1, 0x83, 0xB1, 0x84, 0xB1, 0x85, /* 0x30-0x33 */ + 0xB1, 0x86, 0xED, 0xF7, 0xBF, 0xF4, 0xBE, 0xEC, /* 0x34-0x37 */ + 0xED, 0xF8, 0xB1, 0x87, 0xCC, 0xF7, 0xB1, 0x88, /* 0x38-0x3B */ + 0xD1, 0xDB, 0xB1, 0x89, 0xB1, 0x8A, 0xB1, 0x8B, /* 0x3C-0x3F */ + 0xD7, 0xC5, 0xD5, 0xF6, 0xB1, 0x8C, 0xED, 0xFC, /* 0x40-0x43 */ + 0xB1, 0x8D, 0xB1, 0x8E, 0xB1, 0x8F, 0xED, 0xFB, /* 0x44-0x47 */ + 0xB1, 0x90, 0xB1, 0x91, 0xB1, 0x92, 0xB1, 0x93, /* 0x48-0x4B */ + 0xB1, 0x94, 0xB1, 0x95, 0xB1, 0x96, 0xB1, 0x97, /* 0x4C-0x4F */ + 0xED, 0xF9, 0xED, 0xFA, 0xB1, 0x98, 0xB1, 0x99, /* 0x50-0x53 */ + 0xB1, 0x9A, 0xB1, 0x9B, 0xB1, 0x9C, 0xB1, 0x9D, /* 0x54-0x57 */ + 0xB1, 0x9E, 0xB1, 0x9F, 0xED, 0xFD, 0xBE, 0xA6, /* 0x58-0x5B */ + 0xB1, 0xA0, 0xB2, 0x40, 0xB2, 0x41, 0xB2, 0x42, /* 0x5C-0x5F */ + 0xB2, 0x43, 0xCB, 0xAF, 0xEE, 0xA1, 0xB6, 0xBD, /* 0x60-0x63 */ + 0xB2, 0x44, 0xEE, 0xA2, 0xC4, 0xC0, 0xB2, 0x45, /* 0x64-0x67 */ + 0xED, 0xFE, 0xB2, 0x46, 0xB2, 0x47, 0xBD, 0xDE, /* 0x68-0x6B */ + 0xB2, 0xC7, 0xB2, 0x48, 0xB2, 0x49, 0xB2, 0x4A, /* 0x6C-0x6F */ + 0xB2, 0x4B, 0xB2, 0x4C, 0xB2, 0x4D, 0xB2, 0x4E, /* 0x70-0x73 */ + 0xB2, 0x4F, 0xB2, 0x50, 0xB2, 0x51, 0xB2, 0x52, /* 0x74-0x77 */ + 0xB2, 0x53, 0xB6, 0xC3, 0xB2, 0x54, 0xB2, 0x55, /* 0x78-0x7B */ + 0xB2, 0x56, 0xEE, 0xA5, 0xD8, 0xBA, 0xEE, 0xA3, /* 0x7C-0x7F */ + + 0xEE, 0xA6, 0xB2, 0x57, 0xB2, 0x58, 0xB2, 0x59, /* 0x80-0x83 */ + 0xC3, 0xE9, 0xB3, 0xF2, 0xB2, 0x5A, 0xB2, 0x5B, /* 0x84-0x87 */ + 0xB2, 0x5C, 0xB2, 0x5D, 0xB2, 0x5E, 0xB2, 0x5F, /* 0x88-0x8B */ + 0xEE, 0xA7, 0xEE, 0xA4, 0xCF, 0xB9, 0xB2, 0x60, /* 0x8C-0x8F */ + 0xB2, 0x61, 0xEE, 0xA8, 0xC2, 0xF7, 0xB2, 0x62, /* 0x90-0x93 */ + 0xB2, 0x63, 0xB2, 0x64, 0xB2, 0x65, 0xB2, 0x66, /* 0x94-0x97 */ + 0xB2, 0x67, 0xB2, 0x68, 0xB2, 0x69, 0xB2, 0x6A, /* 0x98-0x9B */ + 0xB2, 0x6B, 0xB2, 0x6C, 0xB2, 0x6D, 0xEE, 0xA9, /* 0x9C-0x9F */ + 0xEE, 0xAA, 0xB2, 0x6E, 0xDE, 0xAB, 0xB2, 0x6F, /* 0xA0-0xA3 */ + 0xB2, 0x70, 0xC6, 0xB3, 0xB2, 0x71, 0xC7, 0xC6, /* 0xA4-0xA7 */ + 
0xB2, 0x72, 0xD6, 0xF5, 0xB5, 0xC9, 0xB2, 0x73, /* 0xA8-0xAB */ + 0xCB, 0xB2, 0xB2, 0x74, 0xB2, 0x75, 0xB2, 0x76, /* 0xAC-0xAF */ + 0xEE, 0xAB, 0xB2, 0x77, 0xB2, 0x78, 0xCD, 0xAB, /* 0xB0-0xB3 */ + 0xB2, 0x79, 0xEE, 0xAC, 0xB2, 0x7A, 0xB2, 0x7B, /* 0xB4-0xB7 */ + 0xB2, 0x7C, 0xB2, 0x7D, 0xB2, 0x7E, 0xD5, 0xB0, /* 0xB8-0xBB */ + 0xB2, 0x80, 0xEE, 0xAD, 0xB2, 0x81, 0xF6, 0xC4, /* 0xBC-0xBF */ + 0xB2, 0x82, 0xB2, 0x83, 0xB2, 0x84, 0xB2, 0x85, /* 0xC0-0xC3 */ + 0xB2, 0x86, 0xB2, 0x87, 0xB2, 0x88, 0xB2, 0x89, /* 0xC4-0xC7 */ + 0xB2, 0x8A, 0xB2, 0x8B, 0xB2, 0x8C, 0xB2, 0x8D, /* 0xC8-0xCB */ + 0xB2, 0x8E, 0xDB, 0xC7, 0xB2, 0x8F, 0xB2, 0x90, /* 0xCC-0xCF */ + 0xB2, 0x91, 0xB2, 0x92, 0xB2, 0x93, 0xB2, 0x94, /* 0xD0-0xD3 */ + 0xB2, 0x95, 0xB2, 0x96, 0xB2, 0x97, 0xB4, 0xA3, /* 0xD4-0xD7 */ + 0xB2, 0x98, 0xB2, 0x99, 0xB2, 0x9A, 0xC3, 0xAC, /* 0xD8-0xDB */ + 0xF1, 0xE6, 0xB2, 0x9B, 0xB2, 0x9C, 0xB2, 0x9D, /* 0xDC-0xDF */ + 0xB2, 0x9E, 0xB2, 0x9F, 0xCA, 0xB8, 0xD2, 0xD3, /* 0xE0-0xE3 */ + 0xB2, 0xA0, 0xD6, 0xAA, 0xB3, 0x40, 0xEF, 0xF2, /* 0xE4-0xE7 */ + 0xB3, 0x41, 0xBE, 0xD8, 0xB3, 0x42, 0xBD, 0xC3, /* 0xE8-0xEB */ + 0xEF, 0xF3, 0xB6, 0xCC, 0xB0, 0xAB, 0xB3, 0x43, /* 0xEC-0xEF */ + 0xB3, 0x44, 0xB3, 0x45, 0xB3, 0x46, 0xCA, 0xAF, /* 0xF0-0xF3 */ + 0xB3, 0x47, 0xB3, 0x48, 0xED, 0xB6, 0xB3, 0x49, /* 0xF4-0xF7 */ + 0xED, 0xB7, 0xB3, 0x4A, 0xB3, 0x4B, 0xB3, 0x4C, /* 0xF8-0xFB */ + 0xB3, 0x4D, 0xCE, 0xF9, 0xB7, 0xAF, 0xBF, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_78[512] = { + 0xED, 0xB8, 0xC2, 0xEB, 0xC9, 0xB0, 0xB3, 0x4E, /* 0x00-0x03 */ + 0xB3, 0x4F, 0xB3, 0x50, 0xB3, 0x51, 0xB3, 0x52, /* 0x04-0x07 */ + 0xB3, 0x53, 0xED, 0xB9, 0xB3, 0x54, 0xB3, 0x55, /* 0x08-0x0B */ + 0xC6, 0xF6, 0xBF, 0xB3, 0xB3, 0x56, 0xB3, 0x57, /* 0x0C-0x0F */ + 0xB3, 0x58, 0xED, 0xBC, 0xC5, 0xF8, 0xB3, 0x59, /* 0x10-0x13 */ + 0xD1, 0xD0, 0xB3, 0x5A, 0xD7, 0xA9, 0xED, 0xBA, /* 0x14-0x17 */ + 0xED, 0xBB, 0xB3, 0x5B, 0xD1, 0xE2, 0xB3, 0x5C, /* 0x18-0x1B */ + 0xED, 0xBF, 0xED, 0xC0, 0xB3, 0x5D, 0xED, 0xC4, /* 0x1C-0x1F */ + 0xB3, 0x5E, 0xB3, 0x5F, 0xB3, 0x60, 0xED, 0xC8, /* 0x20-0x23 */ + 0xB3, 0x61, 0xED, 0xC6, 0xED, 0xCE, 0xD5, 0xE8, /* 0x24-0x27 */ + 0xB3, 0x62, 0xED, 0xC9, 0xB3, 0x63, 0xB3, 0x64, /* 0x28-0x2B */ + 0xED, 0xC7, 0xED, 0xBE, 0xB3, 0x65, 0xB3, 0x66, /* 0x2C-0x2F */ + 0xC5, 0xE9, 0xB3, 0x67, 0xB3, 0x68, 0xB3, 0x69, /* 0x30-0x33 */ + 0xC6, 0xC6, 0xB3, 0x6A, 0xB3, 0x6B, 0xC9, 0xE9, /* 0x34-0x37 */ + 0xD4, 0xD2, 0xED, 0xC1, 0xED, 0xC2, 0xED, 0xC3, /* 0x38-0x3B */ + 0xED, 0xC5, 0xB3, 0x6C, 0xC0, 0xF9, 0xB3, 0x6D, /* 0x3C-0x3F */ + 0xB4, 0xA1, 0xB3, 0x6E, 0xB3, 0x6F, 0xB3, 0x70, /* 0x40-0x43 */ + 0xB3, 0x71, 0xB9, 0xE8, 0xB3, 0x72, 0xED, 0xD0, /* 0x44-0x47 */ + 0xB3, 0x73, 0xB3, 0x74, 0xB3, 0x75, 0xB3, 0x76, /* 0x48-0x4B */ + 0xED, 0xD1, 0xB3, 0x77, 0xED, 0xCA, 0xB3, 0x78, /* 0x4C-0x4F */ + 0xED, 0xCF, 0xB3, 0x79, 0xCE, 0xF8, 0xB3, 0x7A, /* 0x50-0x53 */ + 0xB3, 0x7B, 0xCB, 0xB6, 0xED, 0xCC, 0xED, 0xCD, /* 0x54-0x57 */ + 0xB3, 0x7C, 0xB3, 0x7D, 0xB3, 0x7E, 0xB3, 0x80, /* 0x58-0x5B */ + 0xB3, 0x81, 0xCF, 0xF5, 0xB3, 0x82, 0xB3, 0x83, /* 0x5C-0x5F */ + 0xB3, 0x84, 0xB3, 0x85, 0xB3, 0x86, 0xB3, 0x87, /* 0x60-0x63 */ + 0xB3, 0x88, 0xB3, 0x89, 0xB3, 0x8A, 0xB3, 0x8B, /* 0x64-0x67 */ + 0xB3, 0x8C, 0xB3, 0x8D, 0xED, 0xD2, 0xC1, 0xF2, /* 0x68-0x6B */ + 0xD3, 0xB2, 0xED, 0xCB, 0xC8, 0xB7, 0xB3, 0x8E, /* 0x6C-0x6F */ + 0xB3, 0x8F, 0xB3, 0x90, 0xB3, 0x91, 0xB3, 0x92, /* 0x70-0x73 */ + 0xB3, 0x93, 0xB3, 0x94, 0xB3, 0x95, 0xBC, 0xEF, /* 0x74-0x77 */ + 0xB3, 0x96, 0xB3, 0x97, 0xB3, 0x98, 0xB3, 0x99, /* 0x78-0x7B */ + 0xC5, 
0xF0, 0xB3, 0x9A, 0xB3, 0x9B, 0xB3, 0x9C, /* 0x7C-0x7F */ + + 0xB3, 0x9D, 0xB3, 0x9E, 0xB3, 0x9F, 0xB3, 0xA0, /* 0x80-0x83 */ + 0xB4, 0x40, 0xB4, 0x41, 0xB4, 0x42, 0xED, 0xD6, /* 0x84-0x87 */ + 0xB4, 0x43, 0xB5, 0xEF, 0xB4, 0x44, 0xB4, 0x45, /* 0x88-0x8B */ + 0xC2, 0xB5, 0xB0, 0xAD, 0xCB, 0xE9, 0xB4, 0x46, /* 0x8C-0x8F */ + 0xB4, 0x47, 0xB1, 0xAE, 0xB4, 0x48, 0xED, 0xD4, /* 0x90-0x93 */ + 0xB4, 0x49, 0xB4, 0x4A, 0xB4, 0x4B, 0xCD, 0xEB, /* 0x94-0x97 */ + 0xB5, 0xE2, 0xB4, 0x4C, 0xED, 0xD5, 0xED, 0xD3, /* 0x98-0x9B */ + 0xED, 0xD7, 0xB4, 0x4D, 0xB4, 0x4E, 0xB5, 0xFA, /* 0x9C-0x9F */ + 0xB4, 0x4F, 0xED, 0xD8, 0xB4, 0x50, 0xED, 0xD9, /* 0xA0-0xA3 */ + 0xB4, 0x51, 0xED, 0xDC, 0xB4, 0x52, 0xB1, 0xCC, /* 0xA4-0xA7 */ + 0xB4, 0x53, 0xB4, 0x54, 0xB4, 0x55, 0xB4, 0x56, /* 0xA8-0xAB */ + 0xB4, 0x57, 0xB4, 0x58, 0xB4, 0x59, 0xB4, 0x5A, /* 0xAC-0xAF */ + 0xC5, 0xF6, 0xBC, 0xEE, 0xED, 0xDA, 0xCC, 0xBC, /* 0xB0-0xB3 */ + 0xB2, 0xEA, 0xB4, 0x5B, 0xB4, 0x5C, 0xB4, 0x5D, /* 0xB4-0xB7 */ + 0xB4, 0x5E, 0xED, 0xDB, 0xB4, 0x5F, 0xB4, 0x60, /* 0xB8-0xBB */ + 0xB4, 0x61, 0xB4, 0x62, 0xC4, 0xEB, 0xB4, 0x63, /* 0xBC-0xBF */ + 0xB4, 0x64, 0xB4, 0xC5, 0xB4, 0x65, 0xB4, 0x66, /* 0xC0-0xC3 */ + 0xB4, 0x67, 0xB0, 0xF5, 0xB4, 0x68, 0xB4, 0x69, /* 0xC4-0xC7 */ + 0xB4, 0x6A, 0xED, 0xDF, 0xC0, 0xDA, 0xB4, 0xE8, /* 0xC8-0xCB */ + 0xB4, 0x6B, 0xB4, 0x6C, 0xB4, 0x6D, 0xB4, 0x6E, /* 0xCC-0xCF */ + 0xC5, 0xCD, 0xB4, 0x6F, 0xB4, 0x70, 0xB4, 0x71, /* 0xD0-0xD3 */ + 0xED, 0xDD, 0xBF, 0xC4, 0xB4, 0x72, 0xB4, 0x73, /* 0xD4-0xD7 */ + 0xB4, 0x74, 0xED, 0xDE, 0xB4, 0x75, 0xB4, 0x76, /* 0xD8-0xDB */ + 0xB4, 0x77, 0xB4, 0x78, 0xB4, 0x79, 0xB4, 0x7A, /* 0xDC-0xDF */ + 0xB4, 0x7B, 0xB4, 0x7C, 0xB4, 0x7D, 0xB4, 0x7E, /* 0xE0-0xE3 */ + 0xB4, 0x80, 0xB4, 0x81, 0xB4, 0x82, 0xB4, 0x83, /* 0xE4-0xE7 */ + 0xC4, 0xA5, 0xB4, 0x84, 0xB4, 0x85, 0xB4, 0x86, /* 0xE8-0xEB */ + 0xED, 0xE0, 0xB4, 0x87, 0xB4, 0x88, 0xB4, 0x89, /* 0xEC-0xEF */ + 0xB4, 0x8A, 0xB4, 0x8B, 0xED, 0xE1, 0xB4, 0x8C, /* 0xF0-0xF3 */ + 0xED, 0xE3, 0xB4, 0x8D, 0xB4, 0x8E, 0xC1, 0xD7, /* 0xF4-0xF7 */ + 0xB4, 0x8F, 0xB4, 0x90, 0xBB, 0xC7, 0xB4, 0x91, /* 0xF8-0xFB */ + 0xB4, 0x92, 0xB4, 0x93, 0xB4, 0x94, 0xB4, 0x95, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_79[512] = { + 0xB4, 0x96, 0xBD, 0xB8, 0xB4, 0x97, 0xB4, 0x98, /* 0x00-0x03 */ + 0xB4, 0x99, 0xED, 0xE2, 0xB4, 0x9A, 0xB4, 0x9B, /* 0x04-0x07 */ + 0xB4, 0x9C, 0xB4, 0x9D, 0xB4, 0x9E, 0xB4, 0x9F, /* 0x08-0x0B */ + 0xB4, 0xA0, 0xB5, 0x40, 0xB5, 0x41, 0xB5, 0x42, /* 0x0C-0x0F */ + 0xB5, 0x43, 0xB5, 0x44, 0xB5, 0x45, 0xED, 0xE4, /* 0x10-0x13 */ + 0xB5, 0x46, 0xB5, 0x47, 0xB5, 0x48, 0xB5, 0x49, /* 0x14-0x17 */ + 0xB5, 0x4A, 0xB5, 0x4B, 0xB5, 0x4C, 0xB5, 0x4D, /* 0x18-0x1B */ + 0xB5, 0x4E, 0xB5, 0x4F, 0xED, 0xE6, 0xB5, 0x50, /* 0x1C-0x1F */ + 0xB5, 0x51, 0xB5, 0x52, 0xB5, 0x53, 0xB5, 0x54, /* 0x20-0x23 */ + 0xED, 0xE5, 0xB5, 0x55, 0xB5, 0x56, 0xB5, 0x57, /* 0x24-0x27 */ + 0xB5, 0x58, 0xB5, 0x59, 0xB5, 0x5A, 0xB5, 0x5B, /* 0x28-0x2B */ + 0xB5, 0x5C, 0xB5, 0x5D, 0xB5, 0x5E, 0xB5, 0x5F, /* 0x2C-0x2F */ + 0xB5, 0x60, 0xB5, 0x61, 0xB5, 0x62, 0xB5, 0x63, /* 0x30-0x33 */ + 0xED, 0xE7, 0xB5, 0x64, 0xB5, 0x65, 0xB5, 0x66, /* 0x34-0x37 */ + 0xB5, 0x67, 0xB5, 0x68, 0xCA, 0xBE, 0xEC, 0xEA, /* 0x38-0x3B */ + 0xC0, 0xF1, 0xB5, 0x69, 0xC9, 0xE7, 0xB5, 0x6A, /* 0x3C-0x3F */ + 0xEC, 0xEB, 0xC6, 0xEE, 0xB5, 0x6B, 0xB5, 0x6C, /* 0x40-0x43 */ + 0xB5, 0x6D, 0xB5, 0x6E, 0xEC, 0xEC, 0xB5, 0x6F, /* 0x44-0x47 */ + 0xC6, 0xED, 0xEC, 0xED, 0xB5, 0x70, 0xB5, 0x71, /* 0x48-0x4B */ + 0xB5, 0x72, 0xB5, 0x73, 0xB5, 0x74, 0xB5, 0x75, /* 0x4C-0x4F */ + 0xB5, 
0x76, 0xB5, 0x77, 0xB5, 0x78, 0xEC, 0xF0, /* 0x50-0x53 */ + 0xB5, 0x79, 0xB5, 0x7A, 0xD7, 0xE6, 0xEC, 0xF3, /* 0x54-0x57 */ + 0xB5, 0x7B, 0xB5, 0x7C, 0xEC, 0xF1, 0xEC, 0xEE, /* 0x58-0x5B */ + 0xEC, 0xEF, 0xD7, 0xA3, 0xC9, 0xF1, 0xCB, 0xEE, /* 0x5C-0x5F */ + 0xEC, 0xF4, 0xB5, 0x7D, 0xEC, 0xF2, 0xB5, 0x7E, /* 0x60-0x63 */ + 0xB5, 0x80, 0xCF, 0xE9, 0xB5, 0x81, 0xEC, 0xF6, /* 0x64-0x67 */ + 0xC6, 0xB1, 0xB5, 0x82, 0xB5, 0x83, 0xB5, 0x84, /* 0x68-0x6B */ + 0xB5, 0x85, 0xBC, 0xC0, 0xB5, 0x86, 0xEC, 0xF5, /* 0x6C-0x6F */ + 0xB5, 0x87, 0xB5, 0x88, 0xB5, 0x89, 0xB5, 0x8A, /* 0x70-0x73 */ + 0xB5, 0x8B, 0xB5, 0x8C, 0xB5, 0x8D, 0xB5, 0xBB, /* 0x74-0x77 */ + 0xBB, 0xF6, 0xB5, 0x8E, 0xEC, 0xF7, 0xB5, 0x8F, /* 0x78-0x7B */ + 0xB5, 0x90, 0xB5, 0x91, 0xB5, 0x92, 0xB5, 0x93, /* 0x7C-0x7F */ + + 0xD9, 0xF7, 0xBD, 0xFB, 0xB5, 0x94, 0xB5, 0x95, /* 0x80-0x83 */ + 0xC2, 0xBB, 0xEC, 0xF8, 0xB5, 0x96, 0xB5, 0x97, /* 0x84-0x87 */ + 0xB5, 0x98, 0xB5, 0x99, 0xEC, 0xF9, 0xB5, 0x9A, /* 0x88-0x8B */ + 0xB5, 0x9B, 0xB5, 0x9C, 0xB5, 0x9D, 0xB8, 0xA3, /* 0x8C-0x8F */ + 0xB5, 0x9E, 0xB5, 0x9F, 0xB5, 0xA0, 0xB6, 0x40, /* 0x90-0x93 */ + 0xB6, 0x41, 0xB6, 0x42, 0xB6, 0x43, 0xB6, 0x44, /* 0x94-0x97 */ + 0xB6, 0x45, 0xB6, 0x46, 0xEC, 0xFA, 0xB6, 0x47, /* 0x98-0x9B */ + 0xB6, 0x48, 0xB6, 0x49, 0xB6, 0x4A, 0xB6, 0x4B, /* 0x9C-0x9F */ + 0xB6, 0x4C, 0xB6, 0x4D, 0xB6, 0x4E, 0xB6, 0x4F, /* 0xA0-0xA3 */ + 0xB6, 0x50, 0xB6, 0x51, 0xB6, 0x52, 0xEC, 0xFB, /* 0xA4-0xA7 */ + 0xB6, 0x53, 0xB6, 0x54, 0xB6, 0x55, 0xB6, 0x56, /* 0xA8-0xAB */ + 0xB6, 0x57, 0xB6, 0x58, 0xB6, 0x59, 0xB6, 0x5A, /* 0xAC-0xAF */ + 0xB6, 0x5B, 0xB6, 0x5C, 0xB6, 0x5D, 0xEC, 0xFC, /* 0xB0-0xB3 */ + 0xB6, 0x5E, 0xB6, 0x5F, 0xB6, 0x60, 0xB6, 0x61, /* 0xB4-0xB7 */ + 0xB6, 0x62, 0xD3, 0xED, 0xD8, 0xAE, 0xC0, 0xEB, /* 0xB8-0xBB */ + 0xB6, 0x63, 0xC7, 0xDD, 0xBA, 0xCC, 0xB6, 0x64, /* 0xBC-0xBF */ + 0xD0, 0xE3, 0xCB, 0xBD, 0xB6, 0x65, 0xCD, 0xBA, /* 0xC0-0xC3 */ + 0xB6, 0x66, 0xB6, 0x67, 0xB8, 0xD1, 0xB6, 0x68, /* 0xC4-0xC7 */ + 0xB6, 0x69, 0xB1, 0xFC, 0xB6, 0x6A, 0xC7, 0xEF, /* 0xC8-0xCB */ + 0xB6, 0x6B, 0xD6, 0xD6, 0xB6, 0x6C, 0xB6, 0x6D, /* 0xCC-0xCF */ + 0xB6, 0x6E, 0xBF, 0xC6, 0xC3, 0xEB, 0xB6, 0x6F, /* 0xD0-0xD3 */ + 0xB6, 0x70, 0xEF, 0xF5, 0xB6, 0x71, 0xB6, 0x72, /* 0xD4-0xD7 */ + 0xC3, 0xD8, 0xB6, 0x73, 0xB6, 0x74, 0xB6, 0x75, /* 0xD8-0xDB */ + 0xB6, 0x76, 0xB6, 0x77, 0xB6, 0x78, 0xD7, 0xE2, /* 0xDC-0xDF */ + 0xB6, 0x79, 0xB6, 0x7A, 0xB6, 0x7B, 0xEF, 0xF7, /* 0xE0-0xE3 */ + 0xB3, 0xD3, 0xB6, 0x7C, 0xC7, 0xD8, 0xD1, 0xED, /* 0xE4-0xE7 */ + 0xB6, 0x7D, 0xD6, 0xC8, 0xB6, 0x7E, 0xEF, 0xF8, /* 0xE8-0xEB */ + 0xB6, 0x80, 0xEF, 0xF6, 0xB6, 0x81, 0xBB, 0xFD, /* 0xEC-0xEF */ + 0xB3, 0xC6, 0xB6, 0x82, 0xB6, 0x83, 0xB6, 0x84, /* 0xF0-0xF3 */ + 0xB6, 0x85, 0xB6, 0x86, 0xB6, 0x87, 0xB6, 0x88, /* 0xF4-0xF7 */ + 0xBD, 0xD5, 0xB6, 0x89, 0xB6, 0x8A, 0xD2, 0xC6, /* 0xF8-0xFB */ + 0xB6, 0x8B, 0xBB, 0xE0, 0xB6, 0x8C, 0xB6, 0x8D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7A[512] = { + 0xCF, 0xA1, 0xB6, 0x8E, 0xEF, 0xFC, 0xEF, 0xFB, /* 0x00-0x03 */ + 0xB6, 0x8F, 0xB6, 0x90, 0xEF, 0xF9, 0xB6, 0x91, /* 0x04-0x07 */ + 0xB6, 0x92, 0xB6, 0x93, 0xB6, 0x94, 0xB3, 0xCC, /* 0x08-0x0B */ + 0xB6, 0x95, 0xC9, 0xD4, 0xCB, 0xB0, 0xB6, 0x96, /* 0x0C-0x0F */ + 0xB6, 0x97, 0xB6, 0x98, 0xB6, 0x99, 0xB6, 0x9A, /* 0x10-0x13 */ + 0xEF, 0xFE, 0xB6, 0x9B, 0xB6, 0x9C, 0xB0, 0xDE, /* 0x14-0x17 */ + 0xB6, 0x9D, 0xB6, 0x9E, 0xD6, 0xC9, 0xB6, 0x9F, /* 0x18-0x1B */ + 0xB6, 0xA0, 0xB7, 0x40, 0xEF, 0xFD, 0xB7, 0x41, /* 0x1C-0x1F */ + 0xB3, 0xED, 0xB7, 0x42, 0xB7, 0x43, 0xF6, 0xD5, /* 0x20-0x23 */ + 0xB7, 
0x44, 0xB7, 0x45, 0xB7, 0x46, 0xB7, 0x47, /* 0x24-0x27 */ + 0xB7, 0x48, 0xB7, 0x49, 0xB7, 0x4A, 0xB7, 0x4B, /* 0x28-0x2B */ + 0xB7, 0x4C, 0xB7, 0x4D, 0xB7, 0x4E, 0xB7, 0x4F, /* 0x2C-0x2F */ + 0xB7, 0x50, 0xB7, 0x51, 0xB7, 0x52, 0xCE, 0xC8, /* 0x30-0x33 */ + 0xB7, 0x53, 0xB7, 0x54, 0xB7, 0x55, 0xF0, 0xA2, /* 0x34-0x37 */ + 0xB7, 0x56, 0xF0, 0xA1, 0xB7, 0x57, 0xB5, 0xBE, /* 0x38-0x3B */ + 0xBC, 0xDA, 0xBB, 0xFC, 0xB7, 0x58, 0xB8, 0xE5, /* 0x3C-0x3F */ + 0xB7, 0x59, 0xB7, 0x5A, 0xB7, 0x5B, 0xB7, 0x5C, /* 0x40-0x43 */ + 0xB7, 0x5D, 0xB7, 0x5E, 0xC4, 0xC2, 0xB7, 0x5F, /* 0x44-0x47 */ + 0xB7, 0x60, 0xB7, 0x61, 0xB7, 0x62, 0xB7, 0x63, /* 0x48-0x4B */ + 0xB7, 0x64, 0xB7, 0x65, 0xB7, 0x66, 0xB7, 0x67, /* 0x4C-0x4F */ + 0xB7, 0x68, 0xF0, 0xA3, 0xB7, 0x69, 0xB7, 0x6A, /* 0x50-0x53 */ + 0xB7, 0x6B, 0xB7, 0x6C, 0xB7, 0x6D, 0xCB, 0xEB, /* 0x54-0x57 */ + 0xB7, 0x6E, 0xB7, 0x6F, 0xB7, 0x70, 0xB7, 0x71, /* 0x58-0x5B */ + 0xB7, 0x72, 0xB7, 0x73, 0xB7, 0x74, 0xB7, 0x75, /* 0x5C-0x5F */ + 0xB7, 0x76, 0xB7, 0x77, 0xB7, 0x78, 0xB7, 0x79, /* 0x60-0x63 */ + 0xB7, 0x7A, 0xB7, 0x7B, 0xB7, 0x7C, 0xB7, 0x7D, /* 0x64-0x67 */ + 0xB7, 0x7E, 0xB7, 0x80, 0xB7, 0x81, 0xB7, 0x82, /* 0x68-0x6B */ + 0xB7, 0x83, 0xB7, 0x84, 0xB7, 0x85, 0xB7, 0x86, /* 0x6C-0x6F */ + 0xF0, 0xA6, 0xB7, 0x87, 0xB7, 0x88, 0xB7, 0x89, /* 0x70-0x73 */ + 0xD1, 0xA8, 0xB7, 0x8A, 0xBE, 0xBF, 0xC7, 0xEE, /* 0x74-0x77 */ + 0xF1, 0xB6, 0xF1, 0xB7, 0xBF, 0xD5, 0xB7, 0x8B, /* 0x78-0x7B */ + 0xB7, 0x8C, 0xB7, 0x8D, 0xB7, 0x8E, 0xB4, 0xA9, /* 0x7C-0x7F */ + + 0xF1, 0xB8, 0xCD, 0xBB, 0xB7, 0x8F, 0xC7, 0xD4, /* 0x80-0x83 */ + 0xD5, 0xAD, 0xB7, 0x90, 0xF1, 0xB9, 0xB7, 0x91, /* 0x84-0x87 */ + 0xF1, 0xBA, 0xB7, 0x92, 0xB7, 0x93, 0xB7, 0x94, /* 0x88-0x8B */ + 0xB7, 0x95, 0xC7, 0xCF, 0xB7, 0x96, 0xB7, 0x97, /* 0x8C-0x8F */ + 0xB7, 0x98, 0xD2, 0xA4, 0xD6, 0xCF, 0xB7, 0x99, /* 0x90-0x93 */ + 0xB7, 0x9A, 0xF1, 0xBB, 0xBD, 0xD1, 0xB4, 0xB0, /* 0x94-0x97 */ + 0xBE, 0xBD, 0xB7, 0x9B, 0xB7, 0x9C, 0xB7, 0x9D, /* 0x98-0x9B */ + 0xB4, 0xDC, 0xCE, 0xD1, 0xB7, 0x9E, 0xBF, 0xDF, /* 0x9C-0x9F */ + 0xF1, 0xBD, 0xB7, 0x9F, 0xB7, 0xA0, 0xB8, 0x40, /* 0xA0-0xA3 */ + 0xB8, 0x41, 0xBF, 0xFA, 0xF1, 0xBC, 0xB8, 0x42, /* 0xA4-0xA7 */ + 0xF1, 0xBF, 0xB8, 0x43, 0xB8, 0x44, 0xB8, 0x45, /* 0xA8-0xAB */ + 0xF1, 0xBE, 0xF1, 0xC0, 0xB8, 0x46, 0xB8, 0x47, /* 0xAC-0xAF */ + 0xB8, 0x48, 0xB8, 0x49, 0xB8, 0x4A, 0xF1, 0xC1, /* 0xB0-0xB3 */ + 0xB8, 0x4B, 0xB8, 0x4C, 0xB8, 0x4D, 0xB8, 0x4E, /* 0xB4-0xB7 */ + 0xB8, 0x4F, 0xB8, 0x50, 0xB8, 0x51, 0xB8, 0x52, /* 0xB8-0xBB */ + 0xB8, 0x53, 0xB8, 0x54, 0xB8, 0x55, 0xC1, 0xFE, /* 0xBC-0xBF */ + 0xB8, 0x56, 0xB8, 0x57, 0xB8, 0x58, 0xB8, 0x59, /* 0xC0-0xC3 */ + 0xB8, 0x5A, 0xB8, 0x5B, 0xB8, 0x5C, 0xB8, 0x5D, /* 0xC4-0xC7 */ + 0xB8, 0x5E, 0xB8, 0x5F, 0xB8, 0x60, 0xC1, 0xA2, /* 0xC8-0xCB */ + 0xB8, 0x61, 0xB8, 0x62, 0xB8, 0x63, 0xB8, 0x64, /* 0xCC-0xCF */ + 0xB8, 0x65, 0xB8, 0x66, 0xB8, 0x67, 0xB8, 0x68, /* 0xD0-0xD3 */ + 0xB8, 0x69, 0xB8, 0x6A, 0xCA, 0xFA, 0xB8, 0x6B, /* 0xD4-0xD7 */ + 0xB8, 0x6C, 0xD5, 0xBE, 0xB8, 0x6D, 0xB8, 0x6E, /* 0xD8-0xDB */ + 0xB8, 0x6F, 0xB8, 0x70, 0xBE, 0xBA, 0xBE, 0xB9, /* 0xDC-0xDF */ + 0xD5, 0xC2, 0xB8, 0x71, 0xB8, 0x72, 0xBF, 0xA2, /* 0xE0-0xE3 */ + 0xB8, 0x73, 0xCD, 0xAF, 0xF1, 0xB5, 0xB8, 0x74, /* 0xE4-0xE7 */ + 0xB8, 0x75, 0xB8, 0x76, 0xB8, 0x77, 0xB8, 0x78, /* 0xE8-0xEB */ + 0xB8, 0x79, 0xBD, 0xDF, 0xB8, 0x7A, 0xB6, 0xCB, /* 0xEC-0xEF */ + 0xB8, 0x7B, 0xB8, 0x7C, 0xB8, 0x7D, 0xB8, 0x7E, /* 0xF0-0xF3 */ + 0xB8, 0x80, 0xB8, 0x81, 0xB8, 0x82, 0xB8, 0x83, /* 0xF4-0xF7 */ + 0xB8, 0x84, 0xD6, 0xF1, 0xF3, 0xC3, 0xB8, 0x85, /* 0xF8-0xFB 
*/ + 0xB8, 0x86, 0xF3, 0xC4, 0xB8, 0x87, 0xB8, 0xCD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0xB8, 0x88, 0xB8, 0x89, 0xB8, 0x8A, 0xF3, 0xC6, /* 0x00-0x03 */ + 0xF3, 0xC7, 0xB8, 0x8B, 0xB0, 0xCA, 0xB8, 0x8C, /* 0x04-0x07 */ + 0xF3, 0xC5, 0xB8, 0x8D, 0xF3, 0xC9, 0xCB, 0xF1, /* 0x08-0x0B */ + 0xB8, 0x8E, 0xB8, 0x8F, 0xB8, 0x90, 0xF3, 0xCB, /* 0x0C-0x0F */ + 0xB8, 0x91, 0xD0, 0xA6, 0xB8, 0x92, 0xB8, 0x93, /* 0x10-0x13 */ + 0xB1, 0xCA, 0xF3, 0xC8, 0xB8, 0x94, 0xB8, 0x95, /* 0x14-0x17 */ + 0xB8, 0x96, 0xF3, 0xCF, 0xB8, 0x97, 0xB5, 0xD1, /* 0x18-0x1B */ + 0xB8, 0x98, 0xB8, 0x99, 0xF3, 0xD7, 0xB8, 0x9A, /* 0x1C-0x1F */ + 0xF3, 0xD2, 0xB8, 0x9B, 0xB8, 0x9C, 0xB8, 0x9D, /* 0x20-0x23 */ + 0xF3, 0xD4, 0xF3, 0xD3, 0xB7, 0xFB, 0xB8, 0x9E, /* 0x24-0x27 */ + 0xB1, 0xBF, 0xB8, 0x9F, 0xF3, 0xCE, 0xF3, 0xCA, /* 0x28-0x2B */ + 0xB5, 0xDA, 0xB8, 0xA0, 0xF3, 0xD0, 0xB9, 0x40, /* 0x2C-0x2F */ + 0xB9, 0x41, 0xF3, 0xD1, 0xB9, 0x42, 0xF3, 0xD5, /* 0x30-0x33 */ + 0xB9, 0x43, 0xB9, 0x44, 0xB9, 0x45, 0xB9, 0x46, /* 0x34-0x37 */ + 0xF3, 0xCD, 0xB9, 0x47, 0xBC, 0xE3, 0xB9, 0x48, /* 0x38-0x3B */ + 0xC1, 0xFD, 0xB9, 0x49, 0xF3, 0xD6, 0xB9, 0x4A, /* 0x3C-0x3F */ + 0xB9, 0x4B, 0xB9, 0x4C, 0xB9, 0x4D, 0xB9, 0x4E, /* 0x40-0x43 */ + 0xB9, 0x4F, 0xF3, 0xDA, 0xB9, 0x50, 0xF3, 0xCC, /* 0x44-0x47 */ + 0xB9, 0x51, 0xB5, 0xC8, 0xB9, 0x52, 0xBD, 0xEE, /* 0x48-0x4B */ + 0xF3, 0xDC, 0xB9, 0x53, 0xB9, 0x54, 0xB7, 0xA4, /* 0x4C-0x4F */ + 0xBF, 0xF0, 0xD6, 0xFE, 0xCD, 0xB2, 0xB9, 0x55, /* 0x50-0x53 */ + 0xB4, 0xF0, 0xB9, 0x56, 0xB2, 0xDF, 0xB9, 0x57, /* 0x54-0x57 */ + 0xF3, 0xD8, 0xB9, 0x58, 0xF3, 0xD9, 0xC9, 0xB8, /* 0x58-0x5B */ + 0xB9, 0x59, 0xF3, 0xDD, 0xB9, 0x5A, 0xB9, 0x5B, /* 0x5C-0x5F */ + 0xF3, 0xDE, 0xB9, 0x5C, 0xF3, 0xE1, 0xB9, 0x5D, /* 0x60-0x63 */ + 0xB9, 0x5E, 0xB9, 0x5F, 0xB9, 0x60, 0xB9, 0x61, /* 0x64-0x67 */ + 0xB9, 0x62, 0xB9, 0x63, 0xB9, 0x64, 0xB9, 0x65, /* 0x68-0x6B */ + 0xB9, 0x66, 0xB9, 0x67, 0xF3, 0xDF, 0xB9, 0x68, /* 0x6C-0x6F */ + 0xB9, 0x69, 0xF3, 0xE3, 0xF3, 0xE2, 0xB9, 0x6A, /* 0x70-0x73 */ + 0xB9, 0x6B, 0xF3, 0xDB, 0xB9, 0x6C, 0xBF, 0xEA, /* 0x74-0x77 */ + 0xB9, 0x6D, 0xB3, 0xEF, 0xB9, 0x6E, 0xF3, 0xE0, /* 0x78-0x7B */ + 0xB9, 0x6F, 0xB9, 0x70, 0xC7, 0xA9, 0xB9, 0x71, /* 0x7C-0x7F */ + + 0xBC, 0xF2, 0xB9, 0x72, 0xB9, 0x73, 0xB9, 0x74, /* 0x80-0x83 */ + 0xB9, 0x75, 0xF3, 0xEB, 0xB9, 0x76, 0xB9, 0x77, /* 0x84-0x87 */ + 0xB9, 0x78, 0xB9, 0x79, 0xB9, 0x7A, 0xB9, 0x7B, /* 0x88-0x8B */ + 0xB9, 0x7C, 0xB9, 0xBF, 0xB9, 0x7D, 0xB9, 0x7E, /* 0x8C-0x8F */ + 0xF3, 0xE4, 0xB9, 0x80, 0xB9, 0x81, 0xB9, 0x82, /* 0x90-0x93 */ + 0xB2, 0xAD, 0xBB, 0xFE, 0xB9, 0x83, 0xCB, 0xE3, /* 0x94-0x97 */ + 0xB9, 0x84, 0xB9, 0x85, 0xB9, 0x86, 0xB9, 0x87, /* 0x98-0x9B */ + 0xF3, 0xED, 0xF3, 0xE9, 0xB9, 0x88, 0xB9, 0x89, /* 0x9C-0x9F */ + 0xB9, 0x8A, 0xB9, 0xDC, 0xF3, 0xEE, 0xB9, 0x8B, /* 0xA0-0xA3 */ + 0xB9, 0x8C, 0xB9, 0x8D, 0xF3, 0xE5, 0xF3, 0xE6, /* 0xA4-0xA7 */ + 0xF3, 0xEA, 0xC2, 0xE1, 0xF3, 0xEC, 0xF3, 0xEF, /* 0xA8-0xAB */ + 0xF3, 0xE8, 0xBC, 0xFD, 0xB9, 0x8E, 0xB9, 0x8F, /* 0xAC-0xAF */ + 0xB9, 0x90, 0xCF, 0xE4, 0xB9, 0x91, 0xB9, 0x92, /* 0xB0-0xB3 */ + 0xF3, 0xF0, 0xB9, 0x93, 0xB9, 0x94, 0xB9, 0x95, /* 0xB4-0xB7 */ + 0xF3, 0xE7, 0xB9, 0x96, 0xB9, 0x97, 0xB9, 0x98, /* 0xB8-0xBB */ + 0xB9, 0x99, 0xB9, 0x9A, 0xB9, 0x9B, 0xB9, 0x9C, /* 0xBC-0xBF */ + 0xB9, 0x9D, 0xF3, 0xF2, 0xB9, 0x9E, 0xB9, 0x9F, /* 0xC0-0xC3 */ + 0xB9, 0xA0, 0xBA, 0x40, 0xD7, 0xAD, 0xC6, 0xAA, /* 0xC4-0xC7 */ + 0xBA, 0x41, 0xBA, 0x42, 0xBA, 0x43, 0xBA, 0x44, /* 0xC8-0xCB */ + 0xF3, 0xF3, 0xBA, 0x45, 0xBA, 0x46, 0xBA, 0x47, /* 0xCC-0xCF */ + 
0xBA, 0x48, 0xF3, 0xF1, 0xBA, 0x49, 0xC2, 0xA8, /* 0xD0-0xD3 */ + 0xBA, 0x4A, 0xBA, 0x4B, 0xBA, 0x4C, 0xBA, 0x4D, /* 0xD4-0xD7 */ + 0xBA, 0x4E, 0xB8, 0xDD, 0xF3, 0xF5, 0xBA, 0x4F, /* 0xD8-0xDB */ + 0xBA, 0x50, 0xF3, 0xF4, 0xBA, 0x51, 0xBA, 0x52, /* 0xDC-0xDF */ + 0xBA, 0x53, 0xB4, 0xDB, 0xBA, 0x54, 0xBA, 0x55, /* 0xE0-0xE3 */ + 0xBA, 0x56, 0xF3, 0xF6, 0xF3, 0xF7, 0xBA, 0x57, /* 0xE4-0xE7 */ + 0xBA, 0x58, 0xBA, 0x59, 0xF3, 0xF8, 0xBA, 0x5A, /* 0xE8-0xEB */ + 0xBA, 0x5B, 0xBA, 0x5C, 0xC0, 0xBA, 0xBA, 0x5D, /* 0xEC-0xEF */ + 0xBA, 0x5E, 0xC0, 0xE9, 0xBA, 0x5F, 0xBA, 0x60, /* 0xF0-0xF3 */ + 0xBA, 0x61, 0xBA, 0x62, 0xBA, 0x63, 0xC5, 0xF1, /* 0xF4-0xF7 */ + 0xBA, 0x64, 0xBA, 0x65, 0xBA, 0x66, 0xBA, 0x67, /* 0xF8-0xFB */ + 0xF3, 0xFB, 0xBA, 0x68, 0xF3, 0xFA, 0xBA, 0x69, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7C[512] = { + 0xBA, 0x6A, 0xBA, 0x6B, 0xBA, 0x6C, 0xBA, 0x6D, /* 0x00-0x03 */ + 0xBA, 0x6E, 0xBA, 0x6F, 0xBA, 0x70, 0xB4, 0xD8, /* 0x04-0x07 */ + 0xBA, 0x71, 0xBA, 0x72, 0xBA, 0x73, 0xF3, 0xFE, /* 0x08-0x0B */ + 0xF3, 0xF9, 0xBA, 0x74, 0xBA, 0x75, 0xF3, 0xFC, /* 0x0C-0x0F */ + 0xBA, 0x76, 0xBA, 0x77, 0xBA, 0x78, 0xBA, 0x79, /* 0x10-0x13 */ + 0xBA, 0x7A, 0xBA, 0x7B, 0xF3, 0xFD, 0xBA, 0x7C, /* 0x14-0x17 */ + 0xBA, 0x7D, 0xBA, 0x7E, 0xBA, 0x80, 0xBA, 0x81, /* 0x18-0x1B */ + 0xBA, 0x82, 0xBA, 0x83, 0xBA, 0x84, 0xF4, 0xA1, /* 0x1C-0x1F */ + 0xBA, 0x85, 0xBA, 0x86, 0xBA, 0x87, 0xBA, 0x88, /* 0x20-0x23 */ + 0xBA, 0x89, 0xBA, 0x8A, 0xF4, 0xA3, 0xBB, 0xC9, /* 0x24-0x27 */ + 0xBA, 0x8B, 0xBA, 0x8C, 0xF4, 0xA2, 0xBA, 0x8D, /* 0x28-0x2B */ + 0xBA, 0x8E, 0xBA, 0x8F, 0xBA, 0x90, 0xBA, 0x91, /* 0x2C-0x2F */ + 0xBA, 0x92, 0xBA, 0x93, 0xBA, 0x94, 0xBA, 0x95, /* 0x30-0x33 */ + 0xBA, 0x96, 0xBA, 0x97, 0xBA, 0x98, 0xBA, 0x99, /* 0x34-0x37 */ + 0xF4, 0xA4, 0xBA, 0x9A, 0xBA, 0x9B, 0xBA, 0x9C, /* 0x38-0x3B */ + 0xBA, 0x9D, 0xBA, 0x9E, 0xBA, 0x9F, 0xB2, 0xBE, /* 0x3C-0x3F */ + 0xF4, 0xA6, 0xF4, 0xA5, 0xBA, 0xA0, 0xBB, 0x40, /* 0x40-0x43 */ + 0xBB, 0x41, 0xBB, 0x42, 0xBB, 0x43, 0xBB, 0x44, /* 0x44-0x47 */ + 0xBB, 0x45, 0xBB, 0x46, 0xBB, 0x47, 0xBB, 0x48, /* 0x48-0x4B */ + 0xBB, 0x49, 0xBC, 0xAE, 0xBB, 0x4A, 0xBB, 0x4B, /* 0x4C-0x4F */ + 0xBB, 0x4C, 0xBB, 0x4D, 0xBB, 0x4E, 0xBB, 0x4F, /* 0x50-0x53 */ + 0xBB, 0x50, 0xBB, 0x51, 0xBB, 0x52, 0xBB, 0x53, /* 0x54-0x57 */ + 0xBB, 0x54, 0xBB, 0x55, 0xBB, 0x56, 0xBB, 0x57, /* 0x58-0x5B */ + 0xBB, 0x58, 0xBB, 0x59, 0xBB, 0x5A, 0xBB, 0x5B, /* 0x5C-0x5F */ + 0xBB, 0x5C, 0xBB, 0x5D, 0xBB, 0x5E, 0xBB, 0x5F, /* 0x60-0x63 */ + 0xBB, 0x60, 0xBB, 0x61, 0xBB, 0x62, 0xBB, 0x63, /* 0x64-0x67 */ + 0xBB, 0x64, 0xBB, 0x65, 0xBB, 0x66, 0xBB, 0x67, /* 0x68-0x6B */ + 0xBB, 0x68, 0xBB, 0x69, 0xBB, 0x6A, 0xBB, 0x6B, /* 0x6C-0x6F */ + 0xBB, 0x6C, 0xBB, 0x6D, 0xBB, 0x6E, 0xC3, 0xD7, /* 0x70-0x73 */ + 0xD9, 0xE1, 0xBB, 0x6F, 0xBB, 0x70, 0xBB, 0x71, /* 0x74-0x77 */ + 0xBB, 0x72, 0xBB, 0x73, 0xBB, 0x74, 0xC0, 0xE0, /* 0x78-0x7B */ + 0xF4, 0xCC, 0xD7, 0xD1, 0xBB, 0x75, 0xBB, 0x76, /* 0x7C-0x7F */ + + 0xBB, 0x77, 0xBB, 0x78, 0xBB, 0x79, 0xBB, 0x7A, /* 0x80-0x83 */ + 0xBB, 0x7B, 0xBB, 0x7C, 0xBB, 0x7D, 0xBB, 0x7E, /* 0x84-0x87 */ + 0xBB, 0x80, 0xB7, 0xDB, 0xBB, 0x81, 0xBB, 0x82, /* 0x88-0x8B */ + 0xBB, 0x83, 0xBB, 0x84, 0xBB, 0x85, 0xBB, 0x86, /* 0x8C-0x8F */ + 0xBB, 0x87, 0xF4, 0xCE, 0xC1, 0xA3, 0xBB, 0x88, /* 0x90-0x93 */ + 0xBB, 0x89, 0xC6, 0xC9, 0xBB, 0x8A, 0xB4, 0xD6, /* 0x94-0x97 */ + 0xD5, 0xB3, 0xBB, 0x8B, 0xBB, 0x8C, 0xBB, 0x8D, /* 0x98-0x9B */ + 0xF4, 0xD0, 0xF4, 0xCF, 0xF4, 0xD1, 0xCB, 0xDA, /* 0x9C-0x9F */ + 0xBB, 0x8E, 0xBB, 0x8F, 0xF4, 0xD2, 0xBB, 0x90, /* 0xA0-0xA3 */ + 
0xD4, 0xC1, 0xD6, 0xE0, 0xBB, 0x91, 0xBB, 0x92, /* 0xA4-0xA7 */ + 0xBB, 0x93, 0xBB, 0x94, 0xB7, 0xE0, 0xBB, 0x95, /* 0xA8-0xAB */ + 0xBB, 0x96, 0xBB, 0x97, 0xC1, 0xB8, 0xBB, 0x98, /* 0xAC-0xAF */ + 0xBB, 0x99, 0xC1, 0xBB, 0xF4, 0xD3, 0xBE, 0xAC, /* 0xB0-0xB3 */ + 0xBB, 0x9A, 0xBB, 0x9B, 0xBB, 0x9C, 0xBB, 0x9D, /* 0xB4-0xB7 */ + 0xBB, 0x9E, 0xB4, 0xE2, 0xBB, 0x9F, 0xBB, 0xA0, /* 0xB8-0xBB */ + 0xF4, 0xD4, 0xF4, 0xD5, 0xBE, 0xAB, 0xBC, 0x40, /* 0xBC-0xBF */ + 0xBC, 0x41, 0xF4, 0xD6, 0xBC, 0x42, 0xBC, 0x43, /* 0xC0-0xC3 */ + 0xBC, 0x44, 0xF4, 0xDB, 0xBC, 0x45, 0xF4, 0xD7, /* 0xC4-0xC7 */ + 0xF4, 0xDA, 0xBC, 0x46, 0xBA, 0xFD, 0xBC, 0x47, /* 0xC8-0xCB */ + 0xF4, 0xD8, 0xF4, 0xD9, 0xBC, 0x48, 0xBC, 0x49, /* 0xCC-0xCF */ + 0xBC, 0x4A, 0xBC, 0x4B, 0xBC, 0x4C, 0xBC, 0x4D, /* 0xD0-0xD3 */ + 0xBC, 0x4E, 0xB8, 0xE2, 0xCC, 0xC7, 0xF4, 0xDC, /* 0xD4-0xD7 */ + 0xBC, 0x4F, 0xB2, 0xDA, 0xBC, 0x50, 0xBC, 0x51, /* 0xD8-0xDB */ + 0xC3, 0xD3, 0xBC, 0x52, 0xBC, 0x53, 0xD4, 0xE3, /* 0xDC-0xDF */ + 0xBF, 0xB7, 0xBC, 0x54, 0xBC, 0x55, 0xBC, 0x56, /* 0xE0-0xE3 */ + 0xBC, 0x57, 0xBC, 0x58, 0xBC, 0x59, 0xBC, 0x5A, /* 0xE4-0xE7 */ + 0xF4, 0xDD, 0xBC, 0x5B, 0xBC, 0x5C, 0xBC, 0x5D, /* 0xE8-0xEB */ + 0xBC, 0x5E, 0xBC, 0x5F, 0xBC, 0x60, 0xC5, 0xB4, /* 0xEC-0xEF */ + 0xBC, 0x61, 0xBC, 0x62, 0xBC, 0x63, 0xBC, 0x64, /* 0xF0-0xF3 */ + 0xBC, 0x65, 0xBC, 0x66, 0xBC, 0x67, 0xBC, 0x68, /* 0xF4-0xF7 */ + 0xF4, 0xE9, 0xBC, 0x69, 0xBC, 0x6A, 0xCF, 0xB5, /* 0xF8-0xFB */ + 0xBC, 0x6B, 0xBC, 0x6C, 0xBC, 0x6D, 0xBC, 0x6E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0xBC, 0x6F, 0xBC, 0x70, 0xBC, 0x71, 0xBC, 0x72, /* 0x00-0x03 */ + 0xBC, 0x73, 0xBC, 0x74, 0xBC, 0x75, 0xBC, 0x76, /* 0x04-0x07 */ + 0xBC, 0x77, 0xBC, 0x78, 0xCE, 0xC9, 0xBC, 0x79, /* 0x08-0x0B */ + 0xBC, 0x7A, 0xBC, 0x7B, 0xBC, 0x7C, 0xBC, 0x7D, /* 0x0C-0x0F */ + 0xBC, 0x7E, 0xBC, 0x80, 0xBC, 0x81, 0xBC, 0x82, /* 0x10-0x13 */ + 0xBC, 0x83, 0xBC, 0x84, 0xBC, 0x85, 0xBC, 0x86, /* 0x14-0x17 */ + 0xBC, 0x87, 0xBC, 0x88, 0xBC, 0x89, 0xBC, 0x8A, /* 0x18-0x1B */ + 0xBC, 0x8B, 0xBC, 0x8C, 0xBC, 0x8D, 0xBC, 0x8E, /* 0x1C-0x1F */ + 0xCB, 0xD8, 0xBC, 0x8F, 0xCB, 0xF7, 0xBC, 0x90, /* 0x20-0x23 */ + 0xBC, 0x91, 0xBC, 0x92, 0xBC, 0x93, 0xBD, 0xF4, /* 0x24-0x27 */ + 0xBC, 0x94, 0xBC, 0x95, 0xBC, 0x96, 0xD7, 0xCF, /* 0x28-0x2B */ + 0xBC, 0x97, 0xBC, 0x98, 0xBC, 0x99, 0xC0, 0xDB, /* 0x2C-0x2F */ + 0xBC, 0x9A, 0xBC, 0x9B, 0xBC, 0x9C, 0xBC, 0x9D, /* 0x30-0x33 */ + 0xBC, 0x9E, 0xBC, 0x9F, 0xBC, 0xA0, 0xBD, 0x40, /* 0x34-0x37 */ + 0xBD, 0x41, 0xBD, 0x42, 0xBD, 0x43, 0xBD, 0x44, /* 0x38-0x3B */ + 0xBD, 0x45, 0xBD, 0x46, 0xBD, 0x47, 0xBD, 0x48, /* 0x3C-0x3F */ + 0xBD, 0x49, 0xBD, 0x4A, 0xBD, 0x4B, 0xBD, 0x4C, /* 0x40-0x43 */ + 0xBD, 0x4D, 0xBD, 0x4E, 0xBD, 0x4F, 0xBD, 0x50, /* 0x44-0x47 */ + 0xBD, 0x51, 0xBD, 0x52, 0xBD, 0x53, 0xBD, 0x54, /* 0x48-0x4B */ + 0xBD, 0x55, 0xBD, 0x56, 0xBD, 0x57, 0xBD, 0x58, /* 0x4C-0x4F */ + 0xBD, 0x59, 0xBD, 0x5A, 0xBD, 0x5B, 0xBD, 0x5C, /* 0x50-0x53 */ + 0xBD, 0x5D, 0xBD, 0x5E, 0xBD, 0x5F, 0xBD, 0x60, /* 0x54-0x57 */ + 0xBD, 0x61, 0xBD, 0x62, 0xBD, 0x63, 0xBD, 0x64, /* 0x58-0x5B */ + 0xBD, 0x65, 0xBD, 0x66, 0xBD, 0x67, 0xBD, 0x68, /* 0x5C-0x5F */ + 0xBD, 0x69, 0xBD, 0x6A, 0xBD, 0x6B, 0xBD, 0x6C, /* 0x60-0x63 */ + 0xBD, 0x6D, 0xBD, 0x6E, 0xBD, 0x6F, 0xBD, 0x70, /* 0x64-0x67 */ + 0xBD, 0x71, 0xBD, 0x72, 0xBD, 0x73, 0xBD, 0x74, /* 0x68-0x6B */ + 0xBD, 0x75, 0xBD, 0x76, 0xD0, 0xF5, 0xBD, 0x77, /* 0x6C-0x6F */ + 0xBD, 0x78, 0xBD, 0x79, 0xBD, 0x7A, 0xBD, 0x7B, /* 0x70-0x73 */ + 0xBD, 0x7C, 0xBD, 0x7D, 0xBD, 0x7E, 0xF4, 0xEA, /* 0x74-0x77 */ + 0xBD, 
0x80, 0xBD, 0x81, 0xBD, 0x82, 0xBD, 0x83, /* 0x78-0x7B */ + 0xBD, 0x84, 0xBD, 0x85, 0xBD, 0x86, 0xBD, 0x87, /* 0x7C-0x7F */ + + 0xBD, 0x88, 0xBD, 0x89, 0xBD, 0x8A, 0xBD, 0x8B, /* 0x80-0x83 */ + 0xBD, 0x8C, 0xBD, 0x8D, 0xBD, 0x8E, 0xBD, 0x8F, /* 0x84-0x87 */ + 0xBD, 0x90, 0xBD, 0x91, 0xBD, 0x92, 0xBD, 0x93, /* 0x88-0x8B */ + 0xBD, 0x94, 0xBD, 0x95, 0xBD, 0x96, 0xBD, 0x97, /* 0x8C-0x8F */ + 0xBD, 0x98, 0xBD, 0x99, 0xBD, 0x9A, 0xBD, 0x9B, /* 0x90-0x93 */ + 0xBD, 0x9C, 0xBD, 0x9D, 0xBD, 0x9E, 0xBD, 0x9F, /* 0x94-0x97 */ + 0xBD, 0xA0, 0xBE, 0x40, 0xBE, 0x41, 0xBE, 0x42, /* 0x98-0x9B */ + 0xBE, 0x43, 0xBE, 0x44, 0xBE, 0x45, 0xBE, 0x46, /* 0x9C-0x9F */ + 0xBE, 0x47, 0xBE, 0x48, 0xBE, 0x49, 0xBE, 0x4A, /* 0xA0-0xA3 */ + 0xBE, 0x4B, 0xBE, 0x4C, 0xF4, 0xEB, 0xBE, 0x4D, /* 0xA4-0xA7 */ + 0xBE, 0x4E, 0xBE, 0x4F, 0xBE, 0x50, 0xBE, 0x51, /* 0xA8-0xAB */ + 0xBE, 0x52, 0xBE, 0x53, 0xF4, 0xEC, 0xBE, 0x54, /* 0xAC-0xAF */ + 0xBE, 0x55, 0xBE, 0x56, 0xBE, 0x57, 0xBE, 0x58, /* 0xB0-0xB3 */ + 0xBE, 0x59, 0xBE, 0x5A, 0xBE, 0x5B, 0xBE, 0x5C, /* 0xB4-0xB7 */ + 0xBE, 0x5D, 0xBE, 0x5E, 0xBE, 0x5F, 0xBE, 0x60, /* 0xB8-0xBB */ + 0xBE, 0x61, 0xBE, 0x62, 0xBE, 0x63, 0xBE, 0x64, /* 0xBC-0xBF */ + 0xBE, 0x65, 0xBE, 0x66, 0xBE, 0x67, 0xBE, 0x68, /* 0xC0-0xC3 */ + 0xBE, 0x69, 0xBE, 0x6A, 0xBE, 0x6B, 0xBE, 0x6C, /* 0xC4-0xC7 */ + 0xBE, 0x6D, 0xBE, 0x6E, 0xBE, 0x6F, 0xBE, 0x70, /* 0xC8-0xCB */ + 0xBE, 0x71, 0xBE, 0x72, 0xBE, 0x73, 0xBE, 0x74, /* 0xCC-0xCF */ + 0xBE, 0x75, 0xBE, 0x76, 0xBE, 0x77, 0xBE, 0x78, /* 0xD0-0xD3 */ + 0xBE, 0x79, 0xBE, 0x7A, 0xBE, 0x7B, 0xBE, 0x7C, /* 0xD4-0xD7 */ + 0xBE, 0x7D, 0xBE, 0x7E, 0xBE, 0x80, 0xBE, 0x81, /* 0xD8-0xDB */ + 0xBE, 0x82, 0xBE, 0x83, 0xBE, 0x84, 0xBE, 0x85, /* 0xDC-0xDF */ + 0xBE, 0x86, 0xBE, 0x87, 0xBE, 0x88, 0xBE, 0x89, /* 0xE0-0xE3 */ + 0xBE, 0x8A, 0xBE, 0x8B, 0xBE, 0x8C, 0xBE, 0x8D, /* 0xE4-0xE7 */ + 0xBE, 0x8E, 0xBE, 0x8F, 0xBE, 0x90, 0xBE, 0x91, /* 0xE8-0xEB */ + 0xBE, 0x92, 0xBE, 0x93, 0xBE, 0x94, 0xBE, 0x95, /* 0xEC-0xEF */ + 0xBE, 0x96, 0xBE, 0x97, 0xBE, 0x98, 0xBE, 0x99, /* 0xF0-0xF3 */ + 0xBE, 0x9A, 0xBE, 0x9B, 0xBE, 0x9C, 0xBE, 0x9D, /* 0xF4-0xF7 */ + 0xBE, 0x9E, 0xBE, 0x9F, 0xBE, 0xA0, 0xBF, 0x40, /* 0xF8-0xFB */ + 0xBF, 0x41, 0xBF, 0x42, 0xBF, 0x43, 0xBF, 0x44, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7E[512] = { + 0xBF, 0x45, 0xBF, 0x46, 0xBF, 0x47, 0xBF, 0x48, /* 0x00-0x03 */ + 0xBF, 0x49, 0xBF, 0x4A, 0xBF, 0x4B, 0xBF, 0x4C, /* 0x04-0x07 */ + 0xBF, 0x4D, 0xBF, 0x4E, 0xBF, 0x4F, 0xBF, 0x50, /* 0x08-0x0B */ + 0xBF, 0x51, 0xBF, 0x52, 0xBF, 0x53, 0xBF, 0x54, /* 0x0C-0x0F */ + 0xBF, 0x55, 0xBF, 0x56, 0xBF, 0x57, 0xBF, 0x58, /* 0x10-0x13 */ + 0xBF, 0x59, 0xBF, 0x5A, 0xBF, 0x5B, 0xBF, 0x5C, /* 0x14-0x17 */ + 0xBF, 0x5D, 0xBF, 0x5E, 0xBF, 0x5F, 0xBF, 0x60, /* 0x18-0x1B */ + 0xBF, 0x61, 0xBF, 0x62, 0xBF, 0x63, 0xBF, 0x64, /* 0x1C-0x1F */ + 0xBF, 0x65, 0xBF, 0x66, 0xBF, 0x67, 0xBF, 0x68, /* 0x20-0x23 */ + 0xBF, 0x69, 0xBF, 0x6A, 0xBF, 0x6B, 0xBF, 0x6C, /* 0x24-0x27 */ + 0xBF, 0x6D, 0xBF, 0x6E, 0xBF, 0x6F, 0xBF, 0x70, /* 0x28-0x2B */ + 0xBF, 0x71, 0xBF, 0x72, 0xBF, 0x73, 0xBF, 0x74, /* 0x2C-0x2F */ + 0xBF, 0x75, 0xBF, 0x76, 0xBF, 0x77, 0xBF, 0x78, /* 0x30-0x33 */ + 0xBF, 0x79, 0xBF, 0x7A, 0xBF, 0x7B, 0xBF, 0x7C, /* 0x34-0x37 */ + 0xBF, 0x7D, 0xBF, 0x7E, 0xBF, 0x80, 0xF7, 0xE3, /* 0x38-0x3B */ + 0xBF, 0x81, 0xBF, 0x82, 0xBF, 0x83, 0xBF, 0x84, /* 0x3C-0x3F */ + 0xBF, 0x85, 0xB7, 0xB1, 0xBF, 0x86, 0xBF, 0x87, /* 0x40-0x43 */ + 0xBF, 0x88, 0xBF, 0x89, 0xBF, 0x8A, 0xF4, 0xED, /* 0x44-0x47 */ + 0xBF, 0x8B, 0xBF, 0x8C, 0xBF, 0x8D, 0xBF, 0x8E, /* 0x48-0x4B */ + 0xBF, 
0x8F, 0xBF, 0x90, 0xBF, 0x91, 0xBF, 0x92, /* 0x4C-0x4F */ + 0xBF, 0x93, 0xBF, 0x94, 0xBF, 0x95, 0xBF, 0x96, /* 0x50-0x53 */ + 0xBF, 0x97, 0xBF, 0x98, 0xBF, 0x99, 0xBF, 0x9A, /* 0x54-0x57 */ + 0xBF, 0x9B, 0xBF, 0x9C, 0xBF, 0x9D, 0xBF, 0x9E, /* 0x58-0x5B */ + 0xBF, 0x9F, 0xBF, 0xA0, 0xC0, 0x40, 0xC0, 0x41, /* 0x5C-0x5F */ + 0xC0, 0x42, 0xC0, 0x43, 0xC0, 0x44, 0xC0, 0x45, /* 0x60-0x63 */ + 0xC0, 0x46, 0xC0, 0x47, 0xC0, 0x48, 0xC0, 0x49, /* 0x64-0x67 */ + 0xC0, 0x4A, 0xC0, 0x4B, 0xC0, 0x4C, 0xC0, 0x4D, /* 0x68-0x6B */ + 0xC0, 0x4E, 0xC0, 0x4F, 0xC0, 0x50, 0xC0, 0x51, /* 0x6C-0x6F */ + 0xC0, 0x52, 0xC0, 0x53, 0xC0, 0x54, 0xC0, 0x55, /* 0x70-0x73 */ + 0xC0, 0x56, 0xC0, 0x57, 0xC0, 0x58, 0xC0, 0x59, /* 0x74-0x77 */ + 0xC0, 0x5A, 0xC0, 0x5B, 0xC0, 0x5C, 0xC0, 0x5D, /* 0x78-0x7B */ + 0xC0, 0x5E, 0xC0, 0x5F, 0xC0, 0x60, 0xC0, 0x61, /* 0x7C-0x7F */ + + 0xC0, 0x62, 0xC0, 0x63, 0xD7, 0xEB, 0xC0, 0x64, /* 0x80-0x83 */ + 0xC0, 0x65, 0xC0, 0x66, 0xC0, 0x67, 0xC0, 0x68, /* 0x84-0x87 */ + 0xC0, 0x69, 0xC0, 0x6A, 0xC0, 0x6B, 0xC0, 0x6C, /* 0x88-0x8B */ + 0xC0, 0x6D, 0xC0, 0x6E, 0xC0, 0x6F, 0xC0, 0x70, /* 0x8C-0x8F */ + 0xC0, 0x71, 0xC0, 0x72, 0xC0, 0x73, 0xC0, 0x74, /* 0x90-0x93 */ + 0xC0, 0x75, 0xC0, 0x76, 0xC0, 0x77, 0xC0, 0x78, /* 0x94-0x97 */ + 0xC0, 0x79, 0xC0, 0x7A, 0xC0, 0x7B, 0xF4, 0xEE, /* 0x98-0x9B */ + 0xC0, 0x7C, 0xC0, 0x7D, 0xC0, 0x7E, 0xE6, 0xF9, /* 0x9C-0x9F */ + 0xBE, 0xC0, 0xE6, 0xFA, 0xBA, 0xEC, 0xE6, 0xFB, /* 0xA0-0xA3 */ + 0xCF, 0xCB, 0xE6, 0xFC, 0xD4, 0xBC, 0xBC, 0xB6, /* 0xA4-0xA7 */ + 0xE6, 0xFD, 0xE6, 0xFE, 0xBC, 0xCD, 0xC8, 0xD2, /* 0xA8-0xAB */ + 0xCE, 0xB3, 0xE7, 0xA1, 0xC0, 0x80, 0xB4, 0xBF, /* 0xAC-0xAF */ + 0xE7, 0xA2, 0xC9, 0xB4, 0xB8, 0xD9, 0xC4, 0xC9, /* 0xB0-0xB3 */ + 0xC0, 0x81, 0xD7, 0xDD, 0xC2, 0xDA, 0xB7, 0xD7, /* 0xB4-0xB7 */ + 0xD6, 0xBD, 0xCE, 0xC6, 0xB7, 0xC4, 0xC0, 0x82, /* 0xB8-0xBB */ + 0xC0, 0x83, 0xC5, 0xA6, 0xE7, 0xA3, 0xCF, 0xDF, /* 0xBC-0xBF */ + 0xE7, 0xA4, 0xE7, 0xA5, 0xE7, 0xA6, 0xC1, 0xB7, /* 0xC0-0xC3 */ + 0xD7, 0xE9, 0xC9, 0xF0, 0xCF, 0xB8, 0xD6, 0xAF, /* 0xC4-0xC7 */ + 0xD6, 0xD5, 0xE7, 0xA7, 0xB0, 0xED, 0xE7, 0xA8, /* 0xC8-0xCB */ + 0xE7, 0xA9, 0xC9, 0xDC, 0xD2, 0xEF, 0xBE, 0xAD, /* 0xCC-0xCF */ + 0xE7, 0xAA, 0xB0, 0xF3, 0xC8, 0xDE, 0xBD, 0xE1, /* 0xD0-0xD3 */ + 0xE7, 0xAB, 0xC8, 0xC6, 0xC0, 0x84, 0xE7, 0xAC, /* 0xD4-0xD7 */ + 0xBB, 0xE6, 0xB8, 0xF8, 0xD1, 0xA4, 0xE7, 0xAD, /* 0xD8-0xDB */ + 0xC2, 0xE7, 0xBE, 0xF8, 0xBD, 0xCA, 0xCD, 0xB3, /* 0xDC-0xDF */ + 0xE7, 0xAE, 0xE7, 0xAF, 0xBE, 0xEE, 0xD0, 0xE5, /* 0xE0-0xE3 */ + 0xC0, 0x85, 0xCB, 0xE7, 0xCC, 0xD0, 0xBC, 0xCC, /* 0xE4-0xE7 */ + 0xE7, 0xB0, 0xBC, 0xA8, 0xD0, 0xF7, 0xE7, 0xB1, /* 0xE8-0xEB */ + 0xC0, 0x86, 0xD0, 0xF8, 0xE7, 0xB2, 0xE7, 0xB3, /* 0xEC-0xEF */ + 0xB4, 0xC2, 0xE7, 0xB4, 0xE7, 0xB5, 0xC9, 0xFE, /* 0xF0-0xF3 */ + 0xCE, 0xAC, 0xC3, 0xE0, 0xE7, 0xB7, 0xB1, 0xC1, /* 0xF4-0xF7 */ + 0xB3, 0xF1, 0xC0, 0x87, 0xE7, 0xB8, 0xE7, 0xB9, /* 0xF8-0xFB */ + 0xD7, 0xDB, 0xD5, 0xC0, 0xE7, 0xBA, 0xC2, 0xCC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7F[512] = { + 0xD7, 0xBA, 0xE7, 0xBB, 0xE7, 0xBC, 0xE7, 0xBD, /* 0x00-0x03 */ + 0xBC, 0xEA, 0xC3, 0xE5, 0xC0, 0xC2, 0xE7, 0xBE, /* 0x04-0x07 */ + 0xE7, 0xBF, 0xBC, 0xA9, 0xC0, 0x88, 0xE7, 0xC0, /* 0x08-0x0B */ + 0xE7, 0xC1, 0xE7, 0xB6, 0xB6, 0xD0, 0xE7, 0xC2, /* 0x0C-0x0F */ + 0xC0, 0x89, 0xE7, 0xC3, 0xE7, 0xC4, 0xBB, 0xBA, /* 0x10-0x13 */ + 0xB5, 0xDE, 0xC2, 0xC6, 0xB1, 0xE0, 0xE7, 0xC5, /* 0x14-0x17 */ + 0xD4, 0xB5, 0xE7, 0xC6, 0xB8, 0xBF, 0xE7, 0xC8, /* 0x18-0x1B */ + 0xE7, 0xC7, 0xB7, 0xEC, 0xC0, 0x8A, 0xE7, 0xC9, /* 0x1C-0x1F */ + 0xB2, 
0xF8, 0xE7, 0xCA, 0xE7, 0xCB, 0xE7, 0xCC, /* 0x20-0x23 */ + 0xE7, 0xCD, 0xE7, 0xCE, 0xE7, 0xCF, 0xE7, 0xD0, /* 0x24-0x27 */ + 0xD3, 0xA7, 0xCB, 0xF5, 0xE7, 0xD1, 0xE7, 0xD2, /* 0x28-0x2B */ + 0xE7, 0xD3, 0xE7, 0xD4, 0xC9, 0xC9, 0xE7, 0xD5, /* 0x2C-0x2F */ + 0xE7, 0xD6, 0xE7, 0xD7, 0xE7, 0xD8, 0xE7, 0xD9, /* 0x30-0x33 */ + 0xBD, 0xC9, 0xE7, 0xDA, 0xF3, 0xBE, 0xC0, 0x8B, /* 0x34-0x37 */ + 0xB8, 0xD7, 0xC0, 0x8C, 0xC8, 0xB1, 0xC0, 0x8D, /* 0x38-0x3B */ + 0xC0, 0x8E, 0xC0, 0x8F, 0xC0, 0x90, 0xC0, 0x91, /* 0x3C-0x3F */ + 0xC0, 0x92, 0xC0, 0x93, 0xF3, 0xBF, 0xC0, 0x94, /* 0x40-0x43 */ + 0xF3, 0xC0, 0xF3, 0xC1, 0xC0, 0x95, 0xC0, 0x96, /* 0x44-0x47 */ + 0xC0, 0x97, 0xC0, 0x98, 0xC0, 0x99, 0xC0, 0x9A, /* 0x48-0x4B */ + 0xC0, 0x9B, 0xC0, 0x9C, 0xC0, 0x9D, 0xC0, 0x9E, /* 0x4C-0x4F */ + 0xB9, 0xDE, 0xCD, 0xF8, 0xC0, 0x9F, 0xC0, 0xA0, /* 0x50-0x53 */ + 0xD8, 0xE8, 0xBA, 0xB1, 0xC1, 0x40, 0xC2, 0xDE, /* 0x54-0x57 */ + 0xEE, 0xB7, 0xC1, 0x41, 0xB7, 0xA3, 0xC1, 0x42, /* 0x58-0x5B */ + 0xC1, 0x43, 0xC1, 0x44, 0xC1, 0x45, 0xEE, 0xB9, /* 0x5C-0x5F */ + 0xC1, 0x46, 0xEE, 0xB8, 0xB0, 0xD5, 0xC1, 0x47, /* 0x60-0x63 */ + 0xC1, 0x48, 0xC1, 0x49, 0xC1, 0x4A, 0xC1, 0x4B, /* 0x64-0x67 */ + 0xEE, 0xBB, 0xD5, 0xD6, 0xD7, 0xEF, 0xC1, 0x4C, /* 0x68-0x6B */ + 0xC1, 0x4D, 0xC1, 0x4E, 0xD6, 0xC3, 0xC1, 0x4F, /* 0x6C-0x6F */ + 0xC1, 0x50, 0xEE, 0xBD, 0xCA, 0xF0, 0xC1, 0x51, /* 0x70-0x73 */ + 0xEE, 0xBC, 0xC1, 0x52, 0xC1, 0x53, 0xC1, 0x54, /* 0x74-0x77 */ + 0xC1, 0x55, 0xEE, 0xBE, 0xC1, 0x56, 0xC1, 0x57, /* 0x78-0x7B */ + 0xC1, 0x58, 0xC1, 0x59, 0xEE, 0xC0, 0xC1, 0x5A, /* 0x7C-0x7F */ + + 0xC1, 0x5B, 0xEE, 0xBF, 0xC1, 0x5C, 0xC1, 0x5D, /* 0x80-0x83 */ + 0xC1, 0x5E, 0xC1, 0x5F, 0xC1, 0x60, 0xC1, 0x61, /* 0x84-0x87 */ + 0xC1, 0x62, 0xC1, 0x63, 0xD1, 0xF2, 0xC1, 0x64, /* 0x88-0x8B */ + 0xC7, 0xBC, 0xC1, 0x65, 0xC3, 0xC0, 0xC1, 0x66, /* 0x8C-0x8F */ + 0xC1, 0x67, 0xC1, 0x68, 0xC1, 0x69, 0xC1, 0x6A, /* 0x90-0x93 */ + 0xB8, 0xE1, 0xC1, 0x6B, 0xC1, 0x6C, 0xC1, 0x6D, /* 0x94-0x97 */ + 0xC1, 0x6E, 0xC1, 0x6F, 0xC1, 0xE7, 0xC1, 0x70, /* 0x98-0x9B */ + 0xC1, 0x71, 0xF4, 0xC6, 0xD0, 0xDF, 0xF4, 0xC7, /* 0x9C-0x9F */ + 0xC1, 0x72, 0xCF, 0xDB, 0xC1, 0x73, 0xC1, 0x74, /* 0xA0-0xA3 */ + 0xC8, 0xBA, 0xC1, 0x75, 0xC1, 0x76, 0xF4, 0xC8, /* 0xA4-0xA7 */ + 0xC1, 0x77, 0xC1, 0x78, 0xC1, 0x79, 0xC1, 0x7A, /* 0xA8-0xAB */ + 0xC1, 0x7B, 0xC1, 0x7C, 0xC1, 0x7D, 0xF4, 0xC9, /* 0xAC-0xAF */ + 0xF4, 0xCA, 0xC1, 0x7E, 0xF4, 0xCB, 0xC1, 0x80, /* 0xB0-0xB3 */ + 0xC1, 0x81, 0xC1, 0x82, 0xC1, 0x83, 0xC1, 0x84, /* 0xB4-0xB7 */ + 0xD9, 0xFA, 0xB8, 0xFE, 0xC1, 0x85, 0xC1, 0x86, /* 0xB8-0xBB */ + 0xE5, 0xF1, 0xD3, 0xF0, 0xC1, 0x87, 0xF4, 0xE0, /* 0xBC-0xBF */ + 0xC1, 0x88, 0xCE, 0xCC, 0xC1, 0x89, 0xC1, 0x8A, /* 0xC0-0xC3 */ + 0xC1, 0x8B, 0xB3, 0xE1, 0xC1, 0x8C, 0xC1, 0x8D, /* 0xC4-0xC7 */ + 0xC1, 0x8E, 0xC1, 0x8F, 0xF1, 0xB4, 0xC1, 0x90, /* 0xC8-0xCB */ + 0xD2, 0xEE, 0xC1, 0x91, 0xF4, 0xE1, 0xC1, 0x92, /* 0xCC-0xCF */ + 0xC1, 0x93, 0xC1, 0x94, 0xC1, 0x95, 0xC1, 0x96, /* 0xD0-0xD3 */ + 0xCF, 0xE8, 0xF4, 0xE2, 0xC1, 0x97, 0xC1, 0x98, /* 0xD4-0xD7 */ + 0xC7, 0xCC, 0xC1, 0x99, 0xC1, 0x9A, 0xC1, 0x9B, /* 0xD8-0xDB */ + 0xC1, 0x9C, 0xC1, 0x9D, 0xC1, 0x9E, 0xB5, 0xD4, /* 0xDC-0xDF */ + 0xB4, 0xE4, 0xF4, 0xE4, 0xC1, 0x9F, 0xC1, 0xA0, /* 0xE0-0xE3 */ + 0xC2, 0x40, 0xF4, 0xE3, 0xF4, 0xE5, 0xC2, 0x41, /* 0xE4-0xE7 */ + 0xC2, 0x42, 0xF4, 0xE6, 0xC2, 0x43, 0xC2, 0x44, /* 0xE8-0xEB */ + 0xC2, 0x45, 0xC2, 0x46, 0xF4, 0xE7, 0xC2, 0x47, /* 0xEC-0xEF */ + 0xBA, 0xB2, 0xB0, 0xBF, 0xC2, 0x48, 0xF4, 0xE8, /* 0xF0-0xF3 */ + 0xC2, 0x49, 0xC2, 0x4A, 0xC2, 0x4B, 0xC2, 0x4C, /* 0xF4-0xF7 
*/ + 0xC2, 0x4D, 0xC2, 0x4E, 0xC2, 0x4F, 0xB7, 0xAD, /* 0xF8-0xFB */ + 0xD2, 0xED, 0xC2, 0x50, 0xC2, 0x51, 0xC2, 0x52, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0xD2, 0xAB, 0xC0, 0xCF, 0xC2, 0x53, 0xBF, 0xBC, /* 0x00-0x03 */ + 0xEB, 0xA3, 0xD5, 0xDF, 0xEA, 0xC8, 0xC2, 0x54, /* 0x04-0x07 */ + 0xC2, 0x55, 0xC2, 0x56, 0xC2, 0x57, 0xF1, 0xF3, /* 0x08-0x0B */ + 0xB6, 0xF8, 0xCB, 0xA3, 0xC2, 0x58, 0xC2, 0x59, /* 0x0C-0x0F */ + 0xC4, 0xCD, 0xC2, 0x5A, 0xF1, 0xE7, 0xC2, 0x5B, /* 0x10-0x13 */ + 0xF1, 0xE8, 0xB8, 0xFB, 0xF1, 0xE9, 0xBA, 0xC4, /* 0x14-0x17 */ + 0xD4, 0xC5, 0xB0, 0xD2, 0xC2, 0x5C, 0xC2, 0x5D, /* 0x18-0x1B */ + 0xF1, 0xEA, 0xC2, 0x5E, 0xC2, 0x5F, 0xC2, 0x60, /* 0x1C-0x1F */ + 0xF1, 0xEB, 0xC2, 0x61, 0xF1, 0xEC, 0xC2, 0x62, /* 0x20-0x23 */ + 0xC2, 0x63, 0xF1, 0xED, 0xF1, 0xEE, 0xF1, 0xEF, /* 0x24-0x27 */ + 0xF1, 0xF1, 0xF1, 0xF0, 0xC5, 0xD5, 0xC2, 0x64, /* 0x28-0x2B */ + 0xC2, 0x65, 0xC2, 0x66, 0xC2, 0x67, 0xC2, 0x68, /* 0x2C-0x2F */ + 0xC2, 0x69, 0xF1, 0xF2, 0xC2, 0x6A, 0xB6, 0xFA, /* 0x30-0x33 */ + 0xC2, 0x6B, 0xF1, 0xF4, 0xD2, 0xAE, 0xDE, 0xC7, /* 0x34-0x37 */ + 0xCB, 0xCA, 0xC2, 0x6C, 0xC2, 0x6D, 0xB3, 0xDC, /* 0x38-0x3B */ + 0xC2, 0x6E, 0xB5, 0xA2, 0xC2, 0x6F, 0xB9, 0xA2, /* 0x3C-0x3F */ + 0xC2, 0x70, 0xC2, 0x71, 0xC4, 0xF4, 0xF1, 0xF5, /* 0x40-0x43 */ + 0xC2, 0x72, 0xC2, 0x73, 0xF1, 0xF6, 0xC2, 0x74, /* 0x44-0x47 */ + 0xC2, 0x75, 0xC2, 0x76, 0xC1, 0xC4, 0xC1, 0xFB, /* 0x48-0x4B */ + 0xD6, 0xB0, 0xF1, 0xF7, 0xC2, 0x77, 0xC2, 0x78, /* 0x4C-0x4F */ + 0xC2, 0x79, 0xC2, 0x7A, 0xF1, 0xF8, 0xC2, 0x7B, /* 0x50-0x53 */ + 0xC1, 0xAA, 0xC2, 0x7C, 0xC2, 0x7D, 0xC2, 0x7E, /* 0x54-0x57 */ + 0xC6, 0xB8, 0xC2, 0x80, 0xBE, 0xDB, 0xC2, 0x81, /* 0x58-0x5B */ + 0xC2, 0x82, 0xC2, 0x83, 0xC2, 0x84, 0xC2, 0x85, /* 0x5C-0x5F */ + 0xC2, 0x86, 0xC2, 0x87, 0xC2, 0x88, 0xC2, 0x89, /* 0x60-0x63 */ + 0xC2, 0x8A, 0xC2, 0x8B, 0xC2, 0x8C, 0xC2, 0x8D, /* 0x64-0x67 */ + 0xC2, 0x8E, 0xF1, 0xF9, 0xB4, 0xCF, 0xC2, 0x8F, /* 0x68-0x6B */ + 0xC2, 0x90, 0xC2, 0x91, 0xC2, 0x92, 0xC2, 0x93, /* 0x6C-0x6F */ + 0xC2, 0x94, 0xF1, 0xFA, 0xC2, 0x95, 0xC2, 0x96, /* 0x70-0x73 */ + 0xC2, 0x97, 0xC2, 0x98, 0xC2, 0x99, 0xC2, 0x9A, /* 0x74-0x77 */ + 0xC2, 0x9B, 0xC2, 0x9C, 0xC2, 0x9D, 0xC2, 0x9E, /* 0x78-0x7B */ + 0xC2, 0x9F, 0xC2, 0xA0, 0xC3, 0x40, 0xED, 0xB2, /* 0x7C-0x7F */ + + 0xED, 0xB1, 0xC3, 0x41, 0xC3, 0x42, 0xCB, 0xE0, /* 0x80-0x83 */ + 0xD2, 0xDE, 0xC3, 0x43, 0xCB, 0xC1, 0xD5, 0xD8, /* 0x84-0x87 */ + 0xC3, 0x44, 0xC8, 0xE2, 0xC3, 0x45, 0xC0, 0xDF, /* 0x88-0x8B */ + 0xBC, 0xA1, 0xC3, 0x46, 0xC3, 0x47, 0xC3, 0x48, /* 0x8C-0x8F */ + 0xC3, 0x49, 0xC3, 0x4A, 0xC3, 0x4B, 0xEB, 0xC1, /* 0x90-0x93 */ + 0xC3, 0x4C, 0xC3, 0x4D, 0xD0, 0xA4, 0xC3, 0x4E, /* 0x94-0x97 */ + 0xD6, 0xE2, 0xC3, 0x4F, 0xB6, 0xC7, 0xB8, 0xD8, /* 0x98-0x9B */ + 0xEB, 0xC0, 0xB8, 0xCE, 0xC3, 0x50, 0xEB, 0xBF, /* 0x9C-0x9F */ + 0xB3, 0xA6, 0xB9, 0xC9, 0xD6, 0xAB, 0xC3, 0x51, /* 0xA0-0xA3 */ + 0xB7, 0xF4, 0xB7, 0xCA, 0xC3, 0x52, 0xC3, 0x53, /* 0xA4-0xA7 */ + 0xC3, 0x54, 0xBC, 0xE7, 0xB7, 0xBE, 0xEB, 0xC6, /* 0xA8-0xAB */ + 0xC3, 0x55, 0xEB, 0xC7, 0xB0, 0xB9, 0xBF, 0xCF, /* 0xAC-0xAF */ + 0xC3, 0x56, 0xEB, 0xC5, 0xD3, 0xFD, 0xC3, 0x57, /* 0xB0-0xB3 */ + 0xEB, 0xC8, 0xC3, 0x58, 0xC3, 0x59, 0xEB, 0xC9, /* 0xB4-0xB7 */ + 0xC3, 0x5A, 0xC3, 0x5B, 0xB7, 0xCE, 0xC3, 0x5C, /* 0xB8-0xBB */ + 0xEB, 0xC2, 0xEB, 0xC4, 0xC9, 0xF6, 0xD6, 0xD7, /* 0xBC-0xBF */ + 0xD5, 0xCD, 0xD0, 0xB2, 0xEB, 0xCF, 0xCE, 0xB8, /* 0xC0-0xC3 */ + 0xEB, 0xD0, 0xC3, 0x5D, 0xB5, 0xA8, 0xC3, 0x5E, /* 0xC4-0xC7 */ + 0xC3, 0x5F, 0xC3, 0x60, 0xC3, 0x61, 0xC3, 0x62, /* 0xC8-0xCB */ + 
0xB1, 0xB3, 0xEB, 0xD2, 0xCC, 0xA5, 0xC3, 0x63, /* 0xCC-0xCF */ + 0xC3, 0x64, 0xC3, 0x65, 0xC3, 0x66, 0xC3, 0x67, /* 0xD0-0xD3 */ + 0xC3, 0x68, 0xC3, 0x69, 0xC5, 0xD6, 0xEB, 0xD3, /* 0xD4-0xD7 */ + 0xC3, 0x6A, 0xEB, 0xD1, 0xC5, 0xDF, 0xEB, 0xCE, /* 0xD8-0xDB */ + 0xCA, 0xA4, 0xEB, 0xD5, 0xB0, 0xFB, 0xC3, 0x6B, /* 0xDC-0xDF */ + 0xC3, 0x6C, 0xBA, 0xFA, 0xC3, 0x6D, 0xC3, 0x6E, /* 0xE0-0xE3 */ + 0xD8, 0xB7, 0xF1, 0xE3, 0xC3, 0x6F, 0xEB, 0xCA, /* 0xE4-0xE7 */ + 0xEB, 0xCB, 0xEB, 0xCC, 0xEB, 0xCD, 0xEB, 0xD6, /* 0xE8-0xEB */ + 0xE6, 0xC0, 0xEB, 0xD9, 0xC3, 0x70, 0xBF, 0xE8, /* 0xEC-0xEF */ + 0xD2, 0xC8, 0xEB, 0xD7, 0xEB, 0xDC, 0xB8, 0xEC, /* 0xF0-0xF3 */ + 0xEB, 0xD8, 0xC3, 0x71, 0xBD, 0xBA, 0xC3, 0x72, /* 0xF4-0xF7 */ + 0xD0, 0xD8, 0xC3, 0x73, 0xB0, 0xB7, 0xC3, 0x74, /* 0xF8-0xFB */ + 0xEB, 0xDD, 0xC4, 0xDC, 0xC3, 0x75, 0xC3, 0x76, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0xC3, 0x77, 0xC3, 0x78, 0xD6, 0xAC, 0xC3, 0x79, /* 0x00-0x03 */ + 0xC3, 0x7A, 0xC3, 0x7B, 0xB4, 0xE0, 0xC3, 0x7C, /* 0x04-0x07 */ + 0xC3, 0x7D, 0xC2, 0xF6, 0xBC, 0xB9, 0xC3, 0x7E, /* 0x08-0x0B */ + 0xC3, 0x80, 0xEB, 0xDA, 0xEB, 0xDB, 0xD4, 0xE0, /* 0x0C-0x0F */ + 0xC6, 0xEA, 0xC4, 0xD4, 0xEB, 0xDF, 0xC5, 0xA7, /* 0x10-0x13 */ + 0xD9, 0xF5, 0xC3, 0x81, 0xB2, 0xB1, 0xC3, 0x82, /* 0x14-0x17 */ + 0xEB, 0xE4, 0xC3, 0x83, 0xBD, 0xC5, 0xC3, 0x84, /* 0x18-0x1B */ + 0xC3, 0x85, 0xC3, 0x86, 0xEB, 0xE2, 0xC3, 0x87, /* 0x1C-0x1F */ + 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, 0xC3, 0x8B, /* 0x20-0x23 */ + 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, 0xC3, 0x8F, /* 0x24-0x27 */ + 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, 0xC3, 0x93, /* 0x28-0x2B */ + 0xEB, 0xE3, 0xC3, 0x94, 0xC3, 0x95, 0xB8, 0xAC, /* 0x2C-0x2F */ + 0xC3, 0x96, 0xCD, 0xD1, 0xEB, 0xE5, 0xC3, 0x97, /* 0x30-0x33 */ + 0xC3, 0x98, 0xC3, 0x99, 0xEB, 0xE1, 0xC3, 0x9A, /* 0x34-0x37 */ + 0xC1, 0xB3, 0xC3, 0x9B, 0xC3, 0x9C, 0xC3, 0x9D, /* 0x38-0x3B */ + 0xC3, 0x9E, 0xC3, 0x9F, 0xC6, 0xA2, 0xC3, 0xA0, /* 0x3C-0x3F */ + 0xC4, 0x40, 0xC4, 0x41, 0xC4, 0x42, 0xC4, 0x43, /* 0x40-0x43 */ + 0xC4, 0x44, 0xC4, 0x45, 0xCC, 0xF3, 0xC4, 0x46, /* 0x44-0x47 */ + 0xEB, 0xE6, 0xC4, 0x47, 0xC0, 0xB0, 0xD2, 0xB8, /* 0x48-0x4B */ + 0xEB, 0xE7, 0xC4, 0x48, 0xC4, 0x49, 0xC4, 0x4A, /* 0x4C-0x4F */ + 0xB8, 0xAF, 0xB8, 0xAD, 0xC4, 0x4B, 0xEB, 0xE8, /* 0x50-0x53 */ + 0xC7, 0xBB, 0xCD, 0xF3, 0xC4, 0x4C, 0xC4, 0x4D, /* 0x54-0x57 */ + 0xC4, 0x4E, 0xEB, 0xEA, 0xEB, 0xEB, 0xC4, 0x4F, /* 0x58-0x5B */ + 0xC4, 0x50, 0xC4, 0x51, 0xC4, 0x52, 0xC4, 0x53, /* 0x5C-0x5F */ + 0xEB, 0xED, 0xC4, 0x54, 0xC4, 0x55, 0xC4, 0x56, /* 0x60-0x63 */ + 0xC4, 0x57, 0xD0, 0xC8, 0xC4, 0x58, 0xEB, 0xF2, /* 0x64-0x67 */ + 0xC4, 0x59, 0xEB, 0xEE, 0xC4, 0x5A, 0xC4, 0x5B, /* 0x68-0x6B */ + 0xC4, 0x5C, 0xEB, 0xF1, 0xC8, 0xF9, 0xC4, 0x5D, /* 0x6C-0x6F */ + 0xD1, 0xFC, 0xEB, 0xEC, 0xC4, 0x5E, 0xC4, 0x5F, /* 0x70-0x73 */ + 0xEB, 0xE9, 0xC4, 0x60, 0xC4, 0x61, 0xC4, 0x62, /* 0x74-0x77 */ + 0xC4, 0x63, 0xB8, 0xB9, 0xCF, 0xD9, 0xC4, 0xE5, /* 0x78-0x7B */ + 0xEB, 0xEF, 0xEB, 0xF0, 0xCC, 0xDA, 0xCD, 0xC8, /* 0x7C-0x7F */ + + 0xB0, 0xF2, 0xC4, 0x64, 0xEB, 0xF6, 0xC4, 0x65, /* 0x80-0x83 */ + 0xC4, 0x66, 0xC4, 0x67, 0xC4, 0x68, 0xC4, 0x69, /* 0x84-0x87 */ + 0xEB, 0xF5, 0xC4, 0x6A, 0xB2, 0xB2, 0xC4, 0x6B, /* 0x88-0x8B */ + 0xC4, 0x6C, 0xC4, 0x6D, 0xC4, 0x6E, 0xB8, 0xE0, /* 0x8C-0x8F */ + 0xC4, 0x6F, 0xEB, 0xF7, 0xC4, 0x70, 0xC4, 0x71, /* 0x90-0x93 */ + 0xC4, 0x72, 0xC4, 0x73, 0xC4, 0x74, 0xC4, 0x75, /* 0x94-0x97 */ + 0xB1, 0xEC, 0xC4, 0x76, 0xC4, 0x77, 0xCC, 0xC5, /* 0x98-0x9B */ + 0xC4, 0xA4, 0xCF, 0xA5, 0xC4, 0x78, 0xC4, 0x79, /* 0x9C-0x9F */ + 
0xC4, 0x7A, 0xC4, 0x7B, 0xC4, 0x7C, 0xEB, 0xF9, /* 0xA0-0xA3 */ + 0xC4, 0x7D, 0xC4, 0x7E, 0xEC, 0xA2, 0xC4, 0x80, /* 0xA4-0xA7 */ + 0xC5, 0xF2, 0xC4, 0x81, 0xEB, 0xFA, 0xC4, 0x82, /* 0xA8-0xAB */ + 0xC4, 0x83, 0xC4, 0x84, 0xC4, 0x85, 0xC4, 0x86, /* 0xAC-0xAF */ + 0xC4, 0x87, 0xC4, 0x88, 0xC4, 0x89, 0xC9, 0xC5, /* 0xB0-0xB3 */ + 0xC4, 0x8A, 0xC4, 0x8B, 0xC4, 0x8C, 0xC4, 0x8D, /* 0xB4-0xB7 */ + 0xC4, 0x8E, 0xC4, 0x8F, 0xE2, 0xDF, 0xEB, 0xFE, /* 0xB8-0xBB */ + 0xC4, 0x90, 0xC4, 0x91, 0xC4, 0x92, 0xC4, 0x93, /* 0xBC-0xBF */ + 0xCD, 0xCE, 0xEC, 0xA1, 0xB1, 0xDB, 0xD3, 0xB7, /* 0xC0-0xC3 */ + 0xC4, 0x94, 0xC4, 0x95, 0xD2, 0xDC, 0xC4, 0x96, /* 0xC4-0xC7 */ + 0xC4, 0x97, 0xC4, 0x98, 0xEB, 0xFD, 0xC4, 0x99, /* 0xC8-0xCB */ + 0xEB, 0xFB, 0xC4, 0x9A, 0xC4, 0x9B, 0xC4, 0x9C, /* 0xCC-0xCF */ + 0xC4, 0x9D, 0xC4, 0x9E, 0xC4, 0x9F, 0xC4, 0xA0, /* 0xD0-0xD3 */ + 0xC5, 0x40, 0xC5, 0x41, 0xC5, 0x42, 0xC5, 0x43, /* 0xD4-0xD7 */ + 0xC5, 0x44, 0xC5, 0x45, 0xC5, 0x46, 0xC5, 0x47, /* 0xD8-0xDB */ + 0xC5, 0x48, 0xC5, 0x49, 0xC5, 0x4A, 0xC5, 0x4B, /* 0xDC-0xDF */ + 0xC5, 0x4C, 0xC5, 0x4D, 0xC5, 0x4E, 0xB3, 0xBC, /* 0xE0-0xE3 */ + 0xC5, 0x4F, 0xC5, 0x50, 0xC5, 0x51, 0xEA, 0xB0, /* 0xE4-0xE7 */ + 0xC5, 0x52, 0xC5, 0x53, 0xD7, 0xD4, 0xC5, 0x54, /* 0xE8-0xEB */ + 0xF4, 0xAB, 0xB3, 0xF4, 0xC5, 0x55, 0xC5, 0x56, /* 0xEC-0xEF */ + 0xC5, 0x57, 0xC5, 0x58, 0xC5, 0x59, 0xD6, 0xC1, /* 0xF0-0xF3 */ + 0xD6, 0xC2, 0xC5, 0x5A, 0xC5, 0x5B, 0xC5, 0x5C, /* 0xF4-0xF7 */ + 0xC5, 0x5D, 0xC5, 0x5E, 0xC5, 0x5F, 0xD5, 0xE9, /* 0xF8-0xFB */ + 0xBE, 0xCA, 0xC5, 0x60, 0xF4, 0xA7, 0xC5, 0x61, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0xD2, 0xA8, 0xF4, 0xA8, 0xF4, 0xA9, 0xC5, 0x62, /* 0x00-0x03 */ + 0xF4, 0xAA, 0xBE, 0xCB, 0xD3, 0xDF, 0xC5, 0x63, /* 0x04-0x07 */ + 0xC5, 0x64, 0xC5, 0x65, 0xC5, 0x66, 0xC5, 0x67, /* 0x08-0x0B */ + 0xC9, 0xE0, 0xC9, 0xE1, 0xC5, 0x68, 0xC5, 0x69, /* 0x0C-0x0F */ + 0xF3, 0xC2, 0xC5, 0x6A, 0xCA, 0xE6, 0xC5, 0x6B, /* 0x10-0x13 */ + 0xCC, 0xF2, 0xC5, 0x6C, 0xC5, 0x6D, 0xC5, 0x6E, /* 0x14-0x17 */ + 0xC5, 0x6F, 0xC5, 0x70, 0xC5, 0x71, 0xE2, 0xB6, /* 0x18-0x1B */ + 0xCB, 0xB4, 0xC5, 0x72, 0xCE, 0xE8, 0xD6, 0xDB, /* 0x1C-0x1F */ + 0xC5, 0x73, 0xF4, 0xAD, 0xF4, 0xAE, 0xF4, 0xAF, /* 0x20-0x23 */ + 0xC5, 0x74, 0xC5, 0x75, 0xC5, 0x76, 0xC5, 0x77, /* 0x24-0x27 */ + 0xF4, 0xB2, 0xC5, 0x78, 0xBA, 0xBD, 0xF4, 0xB3, /* 0x28-0x2B */ + 0xB0, 0xE3, 0xF4, 0xB0, 0xC5, 0x79, 0xF4, 0xB1, /* 0x2C-0x2F */ + 0xBD, 0xA2, 0xB2, 0xD5, 0xC5, 0x7A, 0xF4, 0xB6, /* 0x30-0x33 */ + 0xF4, 0xB7, 0xB6, 0xE6, 0xB2, 0xB0, 0xCF, 0xCF, /* 0x34-0x37 */ + 0xF4, 0xB4, 0xB4, 0xAC, 0xC5, 0x7B, 0xF4, 0xB5, /* 0x38-0x3B */ + 0xC5, 0x7C, 0xC5, 0x7D, 0xF4, 0xB8, 0xC5, 0x7E, /* 0x3C-0x3F */ + 0xC5, 0x80, 0xC5, 0x81, 0xC5, 0x82, 0xC5, 0x83, /* 0x40-0x43 */ + 0xF4, 0xB9, 0xC5, 0x84, 0xC5, 0x85, 0xCD, 0xA7, /* 0x44-0x47 */ + 0xC5, 0x86, 0xF4, 0xBA, 0xC5, 0x87, 0xF4, 0xBB, /* 0x48-0x4B */ + 0xC5, 0x88, 0xC5, 0x89, 0xC5, 0x8A, 0xF4, 0xBC, /* 0x4C-0x4F */ + 0xC5, 0x8B, 0xC5, 0x8C, 0xC5, 0x8D, 0xC5, 0x8E, /* 0x50-0x53 */ + 0xC5, 0x8F, 0xC5, 0x90, 0xC5, 0x91, 0xC5, 0x92, /* 0x54-0x57 */ + 0xCB, 0xD2, 0xC5, 0x93, 0xF4, 0xBD, 0xC5, 0x94, /* 0x58-0x5B */ + 0xC5, 0x95, 0xC5, 0x96, 0xC5, 0x97, 0xF4, 0xBE, /* 0x5C-0x5F */ + 0xC5, 0x98, 0xC5, 0x99, 0xC5, 0x9A, 0xC5, 0x9B, /* 0x60-0x63 */ + 0xC5, 0x9C, 0xC5, 0x9D, 0xC5, 0x9E, 0xC5, 0x9F, /* 0x64-0x67 */ + 0xF4, 0xBF, 0xC5, 0xA0, 0xC6, 0x40, 0xC6, 0x41, /* 0x68-0x6B */ + 0xC6, 0x42, 0xC6, 0x43, 0xF4, 0xDE, 0xC1, 0xBC, /* 0x6C-0x6F */ + 0xBC, 0xE8, 0xC6, 0x44, 0xC9, 0xAB, 0xD1, 0xDE, /* 0x70-0x73 */ + 0xE5, 
0xF5, 0xC6, 0x45, 0xC6, 0x46, 0xC6, 0x47, /* 0x74-0x77 */ + 0xC6, 0x48, 0xDC, 0xB3, 0xD2, 0xD5, 0xC6, 0x49, /* 0x78-0x7B */ + 0xC6, 0x4A, 0xDC, 0xB4, 0xB0, 0xAC, 0xDC, 0xB5, /* 0x7C-0x7F */ + + 0xC6, 0x4B, 0xC6, 0x4C, 0xBD, 0xDA, 0xC6, 0x4D, /* 0x80-0x83 */ + 0xDC, 0xB9, 0xC6, 0x4E, 0xC6, 0x4F, 0xC6, 0x50, /* 0x84-0x87 */ + 0xD8, 0xC2, 0xC6, 0x51, 0xDC, 0xB7, 0xD3, 0xF3, /* 0x88-0x8B */ + 0xC6, 0x52, 0xC9, 0xD6, 0xDC, 0xBA, 0xDC, 0xB6, /* 0x8C-0x8F */ + 0xC6, 0x53, 0xDC, 0xBB, 0xC3, 0xA2, 0xC6, 0x54, /* 0x90-0x93 */ + 0xC6, 0x55, 0xC6, 0x56, 0xC6, 0x57, 0xDC, 0xBC, /* 0x94-0x97 */ + 0xDC, 0xC5, 0xDC, 0xBD, 0xC6, 0x58, 0xC6, 0x59, /* 0x98-0x9B */ + 0xCE, 0xDF, 0xD6, 0xA5, 0xC6, 0x5A, 0xDC, 0xCF, /* 0x9C-0x9F */ + 0xC6, 0x5B, 0xDC, 0xCD, 0xC6, 0x5C, 0xC6, 0x5D, /* 0xA0-0xA3 */ + 0xDC, 0xD2, 0xBD, 0xE6, 0xC2, 0xAB, 0xC6, 0x5E, /* 0xA4-0xA7 */ + 0xDC, 0xB8, 0xDC, 0xCB, 0xDC, 0xCE, 0xDC, 0xBE, /* 0xA8-0xAB */ + 0xB7, 0xD2, 0xB0, 0xC5, 0xDC, 0xC7, 0xD0, 0xBE, /* 0xAC-0xAF */ + 0xDC, 0xC1, 0xBB, 0xA8, 0xC6, 0x5F, 0xB7, 0xBC, /* 0xB0-0xB3 */ + 0xDC, 0xCC, 0xC6, 0x60, 0xC6, 0x61, 0xDC, 0xC6, /* 0xB4-0xB7 */ + 0xDC, 0xBF, 0xC7, 0xDB, 0xC6, 0x62, 0xC6, 0x63, /* 0xB8-0xBB */ + 0xC6, 0x64, 0xD1, 0xBF, 0xDC, 0xC0, 0xC6, 0x65, /* 0xBC-0xBF */ + 0xC6, 0x66, 0xDC, 0xCA, 0xC6, 0x67, 0xC6, 0x68, /* 0xC0-0xC3 */ + 0xDC, 0xD0, 0xC6, 0x69, 0xC6, 0x6A, 0xCE, 0xAD, /* 0xC4-0xC7 */ + 0xDC, 0xC2, 0xC6, 0x6B, 0xDC, 0xC3, 0xDC, 0xC8, /* 0xC8-0xCB */ + 0xDC, 0xC9, 0xB2, 0xD4, 0xDC, 0xD1, 0xCB, 0xD5, /* 0xCC-0xCF */ + 0xC6, 0x6C, 0xD4, 0xB7, 0xDC, 0xDB, 0xDC, 0xDF, /* 0xD0-0xD3 */ + 0xCC, 0xA6, 0xDC, 0xE6, 0xC6, 0x6D, 0xC3, 0xE7, /* 0xD4-0xD7 */ + 0xDC, 0xDC, 0xC6, 0x6E, 0xC6, 0x6F, 0xBF, 0xC1, /* 0xD8-0xDB */ + 0xDC, 0xD9, 0xC6, 0x70, 0xB0, 0xFA, 0xB9, 0xB6, /* 0xDC-0xDF */ + 0xDC, 0xE5, 0xDC, 0xD3, 0xC6, 0x71, 0xDC, 0xC4, /* 0xE0-0xE3 */ + 0xDC, 0xD6, 0xC8, 0xF4, 0xBF, 0xE0, 0xC6, 0x72, /* 0xE4-0xE7 */ + 0xC6, 0x73, 0xC6, 0x74, 0xC6, 0x75, 0xC9, 0xBB, /* 0xE8-0xEB */ + 0xC6, 0x76, 0xC6, 0x77, 0xC6, 0x78, 0xB1, 0xBD, /* 0xEC-0xEF */ + 0xC6, 0x79, 0xD3, 0xA2, 0xC6, 0x7A, 0xC6, 0x7B, /* 0xF0-0xF3 */ + 0xDC, 0xDA, 0xC6, 0x7C, 0xC6, 0x7D, 0xDC, 0xD5, /* 0xF4-0xF7 */ + 0xC6, 0x7E, 0xC6, 0xBB, 0xC6, 0x80, 0xDC, 0xDE, /* 0xF8-0xFB */ + 0xC6, 0x81, 0xC6, 0x82, 0xC6, 0x83, 0xC6, 0x84, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_83[512] = { + 0xC6, 0x85, 0xD7, 0xC2, 0xC3, 0xAF, 0xB7, 0xB6, /* 0x00-0x03 */ + 0xC7, 0xD1, 0xC3, 0xA9, 0xDC, 0xE2, 0xDC, 0xD8, /* 0x04-0x07 */ + 0xDC, 0xEB, 0xDC, 0xD4, 0xC6, 0x86, 0xC6, 0x87, /* 0x08-0x0B */ + 0xDC, 0xDD, 0xC6, 0x88, 0xBE, 0xA5, 0xDC, 0xD7, /* 0x0C-0x0F */ + 0xC6, 0x89, 0xDC, 0xE0, 0xC6, 0x8A, 0xC6, 0x8B, /* 0x10-0x13 */ + 0xDC, 0xE3, 0xDC, 0xE4, 0xC6, 0x8C, 0xDC, 0xF8, /* 0x14-0x17 */ + 0xC6, 0x8D, 0xC6, 0x8E, 0xDC, 0xE1, 0xDD, 0xA2, /* 0x18-0x1B */ + 0xDC, 0xE7, 0xC6, 0x8F, 0xC6, 0x90, 0xC6, 0x91, /* 0x1C-0x1F */ + 0xC6, 0x92, 0xC6, 0x93, 0xC6, 0x94, 0xC6, 0x95, /* 0x20-0x23 */ + 0xC6, 0x96, 0xC6, 0x97, 0xC6, 0x98, 0xBC, 0xEB, /* 0x24-0x27 */ + 0xB4, 0xC4, 0xC6, 0x99, 0xC6, 0x9A, 0xC3, 0xA3, /* 0x28-0x2B */ + 0xB2, 0xE7, 0xDC, 0xFA, 0xC6, 0x9B, 0xDC, 0xF2, /* 0x2C-0x2F */ + 0xC6, 0x9C, 0xDC, 0xEF, 0xC6, 0x9D, 0xDC, 0xFC, /* 0x30-0x33 */ + 0xDC, 0xEE, 0xD2, 0xF0, 0xB2, 0xE8, 0xC6, 0x9E, /* 0x34-0x37 */ + 0xC8, 0xD7, 0xC8, 0xE3, 0xDC, 0xFB, 0xC6, 0x9F, /* 0x38-0x3B */ + 0xDC, 0xED, 0xC6, 0xA0, 0xC7, 0x40, 0xC7, 0x41, /* 0x3C-0x3F */ + 0xDC, 0xF7, 0xC7, 0x42, 0xC7, 0x43, 0xDC, 0xF5, /* 0x40-0x43 */ + 0xC7, 0x44, 0xC7, 0x45, 0xBE, 0xA3, 0xDC, 0xF4, /* 0x44-0x47 */ + 0xC7, 
0x46, 0xB2, 0xDD, 0xC7, 0x47, 0xC7, 0x48, /* 0x48-0x4B */ + 0xC7, 0x49, 0xC7, 0x4A, 0xC7, 0x4B, 0xDC, 0xF3, /* 0x4C-0x4F */ + 0xBC, 0xF6, 0xDC, 0xE8, 0xBB, 0xC4, 0xC7, 0x4C, /* 0x50-0x53 */ + 0xC0, 0xF3, 0xC7, 0x4D, 0xC7, 0x4E, 0xC7, 0x4F, /* 0x54-0x57 */ + 0xC7, 0x50, 0xC7, 0x51, 0xBC, 0xD4, 0xDC, 0xE9, /* 0x58-0x5B */ + 0xDC, 0xEA, 0xC7, 0x52, 0xDC, 0xF1, 0xDC, 0xF6, /* 0x5C-0x5F */ + 0xDC, 0xF9, 0xB5, 0xB4, 0xC7, 0x53, 0xC8, 0xD9, /* 0x60-0x63 */ + 0xBB, 0xE7, 0xDC, 0xFE, 0xDC, 0xFD, 0xD3, 0xAB, /* 0x64-0x67 */ + 0xDD, 0xA1, 0xDD, 0xA3, 0xDD, 0xA5, 0xD2, 0xF1, /* 0x68-0x6B */ + 0xDD, 0xA4, 0xDD, 0xA6, 0xDD, 0xA7, 0xD2, 0xA9, /* 0x6C-0x6F */ + 0xC7, 0x54, 0xC7, 0x55, 0xC7, 0x56, 0xC7, 0x57, /* 0x70-0x73 */ + 0xC7, 0x58, 0xC7, 0x59, 0xC7, 0x5A, 0xBA, 0xC9, /* 0x74-0x77 */ + 0xDD, 0xA9, 0xC7, 0x5B, 0xC7, 0x5C, 0xDD, 0xB6, /* 0x78-0x7B */ + 0xDD, 0xB1, 0xDD, 0xB4, 0xC7, 0x5D, 0xC7, 0x5E, /* 0x7C-0x7F */ + + 0xC7, 0x5F, 0xC7, 0x60, 0xC7, 0x61, 0xC7, 0x62, /* 0x80-0x83 */ + 0xC7, 0x63, 0xDD, 0xB0, 0xC6, 0xCE, 0xC7, 0x64, /* 0x84-0x87 */ + 0xC7, 0x65, 0xC0, 0xF2, 0xC7, 0x66, 0xC7, 0x67, /* 0x88-0x8B */ + 0xC7, 0x68, 0xC7, 0x69, 0xC9, 0xAF, 0xC7, 0x6A, /* 0x8C-0x8F */ + 0xC7, 0x6B, 0xC7, 0x6C, 0xDC, 0xEC, 0xDD, 0xAE, /* 0x90-0x93 */ + 0xC7, 0x6D, 0xC7, 0x6E, 0xC7, 0x6F, 0xC7, 0x70, /* 0x94-0x97 */ + 0xDD, 0xB7, 0xC7, 0x71, 0xC7, 0x72, 0xDC, 0xF0, /* 0x98-0x9B */ + 0xDD, 0xAF, 0xC7, 0x73, 0xDD, 0xB8, 0xC7, 0x74, /* 0x9C-0x9F */ + 0xDD, 0xAC, 0xC7, 0x75, 0xC7, 0x76, 0xC7, 0x77, /* 0xA0-0xA3 */ + 0xC7, 0x78, 0xC7, 0x79, 0xC7, 0x7A, 0xC7, 0x7B, /* 0xA4-0xA7 */ + 0xDD, 0xB9, 0xDD, 0xB3, 0xDD, 0xAD, 0xC4, 0xAA, /* 0xA8-0xAB */ + 0xC7, 0x7C, 0xC7, 0x7D, 0xC7, 0x7E, 0xC7, 0x80, /* 0xAC-0xAF */ + 0xDD, 0xA8, 0xC0, 0xB3, 0xC1, 0xAB, 0xDD, 0xAA, /* 0xB0-0xB3 */ + 0xDD, 0xAB, 0xC7, 0x81, 0xDD, 0xB2, 0xBB, 0xF1, /* 0xB4-0xB7 */ + 0xDD, 0xB5, 0xD3, 0xA8, 0xDD, 0xBA, 0xC7, 0x82, /* 0xB8-0xBB */ + 0xDD, 0xBB, 0xC3, 0xA7, 0xC7, 0x83, 0xC7, 0x84, /* 0xBC-0xBF */ + 0xDD, 0xD2, 0xDD, 0xBC, 0xC7, 0x85, 0xC7, 0x86, /* 0xC0-0xC3 */ + 0xC7, 0x87, 0xDD, 0xD1, 0xC7, 0x88, 0xB9, 0xBD, /* 0xC4-0xC7 */ + 0xC7, 0x89, 0xC7, 0x8A, 0xBE, 0xD5, 0xC7, 0x8B, /* 0xC8-0xCB */ + 0xBE, 0xFA, 0xC7, 0x8C, 0xC7, 0x8D, 0xBA, 0xCA, /* 0xCC-0xCF */ + 0xC7, 0x8E, 0xC7, 0x8F, 0xC7, 0x90, 0xC7, 0x91, /* 0xD0-0xD3 */ + 0xDD, 0xCA, 0xC7, 0x92, 0xDD, 0xC5, 0xC7, 0x93, /* 0xD4-0xD7 */ + 0xDD, 0xBF, 0xC7, 0x94, 0xC7, 0x95, 0xC7, 0x96, /* 0xD8-0xDB */ + 0xB2, 0xCB, 0xDD, 0xC3, 0xC7, 0x97, 0xDD, 0xCB, /* 0xDC-0xDF */ + 0xB2, 0xA4, 0xDD, 0xD5, 0xC7, 0x98, 0xC7, 0x99, /* 0xE0-0xE3 */ + 0xC7, 0x9A, 0xDD, 0xBE, 0xC7, 0x9B, 0xC7, 0x9C, /* 0xE4-0xE7 */ + 0xC7, 0x9D, 0xC6, 0xD0, 0xDD, 0xD0, 0xC7, 0x9E, /* 0xE8-0xEB */ + 0xC7, 0x9F, 0xC7, 0xA0, 0xC8, 0x40, 0xC8, 0x41, /* 0xEC-0xEF */ + 0xDD, 0xD4, 0xC1, 0xE2, 0xB7, 0xC6, 0xC8, 0x42, /* 0xF0-0xF3 */ + 0xC8, 0x43, 0xC8, 0x44, 0xC8, 0x45, 0xC8, 0x46, /* 0xF4-0xF7 */ + 0xDD, 0xCE, 0xDD, 0xCF, 0xC8, 0x47, 0xC8, 0x48, /* 0xF8-0xFB */ + 0xC8, 0x49, 0xDD, 0xC4, 0xC8, 0x4A, 0xC8, 0x4B, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0xC8, 0x4C, 0xDD, 0xBD, 0xC8, 0x4D, 0xDD, 0xCD, /* 0x00-0x03 */ + 0xCC, 0xD1, 0xC8, 0x4E, 0xDD, 0xC9, 0xC8, 0x4F, /* 0x04-0x07 */ + 0xC8, 0x50, 0xC8, 0x51, 0xC8, 0x52, 0xDD, 0xC2, /* 0x08-0x0B */ + 0xC3, 0xC8, 0xC6, 0xBC, 0xCE, 0xAE, 0xDD, 0xCC, /* 0x0C-0x0F */ + 0xC8, 0x53, 0xDD, 0xC8, 0xC8, 0x54, 0xC8, 0x55, /* 0x10-0x13 */ + 0xC8, 0x56, 0xC8, 0x57, 0xC8, 0x58, 0xC8, 0x59, /* 0x14-0x17 */ + 0xDD, 0xC1, 0xC8, 0x5A, 0xC8, 0x5B, 0xC8, 0x5C, /* 0x18-0x1B */ + 0xDD, 
0xC6, 0xC2, 0xDC, 0xC8, 0x5D, 0xC8, 0x5E, /* 0x1C-0x1F */ + 0xC8, 0x5F, 0xC8, 0x60, 0xC8, 0x61, 0xC8, 0x62, /* 0x20-0x23 */ + 0xD3, 0xA9, 0xD3, 0xAA, 0xDD, 0xD3, 0xCF, 0xF4, /* 0x24-0x27 */ + 0xC8, 0xF8, 0xC8, 0x63, 0xC8, 0x64, 0xC8, 0x65, /* 0x28-0x2B */ + 0xC8, 0x66, 0xC8, 0x67, 0xC8, 0x68, 0xC8, 0x69, /* 0x2C-0x2F */ + 0xC8, 0x6A, 0xDD, 0xE6, 0xC8, 0x6B, 0xC8, 0x6C, /* 0x30-0x33 */ + 0xC8, 0x6D, 0xC8, 0x6E, 0xC8, 0x6F, 0xC8, 0x70, /* 0x34-0x37 */ + 0xDD, 0xC7, 0xC8, 0x71, 0xC8, 0x72, 0xC8, 0x73, /* 0x38-0x3B */ + 0xDD, 0xE0, 0xC2, 0xE4, 0xC8, 0x74, 0xC8, 0x75, /* 0x3C-0x3F */ + 0xC8, 0x76, 0xC8, 0x77, 0xC8, 0x78, 0xC8, 0x79, /* 0x40-0x43 */ + 0xC8, 0x7A, 0xC8, 0x7B, 0xDD, 0xE1, 0xC8, 0x7C, /* 0x44-0x47 */ + 0xC8, 0x7D, 0xC8, 0x7E, 0xC8, 0x80, 0xC8, 0x81, /* 0x48-0x4B */ + 0xC8, 0x82, 0xC8, 0x83, 0xC8, 0x84, 0xC8, 0x85, /* 0x4C-0x4F */ + 0xC8, 0x86, 0xDD, 0xD7, 0xC8, 0x87, 0xC8, 0x88, /* 0x50-0x53 */ + 0xC8, 0x89, 0xC8, 0x8A, 0xC8, 0x8B, 0xD6, 0xF8, /* 0x54-0x57 */ + 0xC8, 0x8C, 0xDD, 0xD9, 0xDD, 0xD8, 0xB8, 0xF0, /* 0x58-0x5B */ + 0xDD, 0xD6, 0xC8, 0x8D, 0xC8, 0x8E, 0xC8, 0x8F, /* 0x5C-0x5F */ + 0xC8, 0x90, 0xC6, 0xCF, 0xC8, 0x91, 0xB6, 0xAD, /* 0x60-0x63 */ + 0xC8, 0x92, 0xC8, 0x93, 0xC8, 0x94, 0xC8, 0x95, /* 0x64-0x67 */ + 0xC8, 0x96, 0xDD, 0xE2, 0xC8, 0x97, 0xBA, 0xF9, /* 0x68-0x6B */ + 0xD4, 0xE1, 0xDD, 0xE7, 0xC8, 0x98, 0xC8, 0x99, /* 0x6C-0x6F */ + 0xC8, 0x9A, 0xB4, 0xD0, 0xC8, 0x9B, 0xDD, 0xDA, /* 0x70-0x73 */ + 0xC8, 0x9C, 0xBF, 0xFB, 0xDD, 0xE3, 0xC8, 0x9D, /* 0x74-0x77 */ + 0xDD, 0xDF, 0xC8, 0x9E, 0xDD, 0xDD, 0xC8, 0x9F, /* 0x78-0x7B */ + 0xC8, 0xA0, 0xC9, 0x40, 0xC9, 0x41, 0xC9, 0x42, /* 0x7C-0x7F */ + + 0xC9, 0x43, 0xC9, 0x44, 0xB5, 0xD9, 0xC9, 0x45, /* 0x80-0x83 */ + 0xC9, 0x46, 0xC9, 0x47, 0xC9, 0x48, 0xDD, 0xDB, /* 0x84-0x87 */ + 0xDD, 0xDC, 0xDD, 0xDE, 0xC9, 0x49, 0xBD, 0xAF, /* 0x88-0x8B */ + 0xDD, 0xE4, 0xC9, 0x4A, 0xDD, 0xE5, 0xC9, 0x4B, /* 0x8C-0x8F */ + 0xC9, 0x4C, 0xC9, 0x4D, 0xC9, 0x4E, 0xC9, 0x4F, /* 0x90-0x93 */ + 0xC9, 0x50, 0xC9, 0x51, 0xC9, 0x52, 0xDD, 0xF5, /* 0x94-0x97 */ + 0xC9, 0x53, 0xC3, 0xC9, 0xC9, 0x54, 0xC9, 0x55, /* 0x98-0x9B */ + 0xCB, 0xE2, 0xC9, 0x56, 0xC9, 0x57, 0xC9, 0x58, /* 0x9C-0x9F */ + 0xC9, 0x59, 0xDD, 0xF2, 0xC9, 0x5A, 0xC9, 0x5B, /* 0xA0-0xA3 */ + 0xC9, 0x5C, 0xC9, 0x5D, 0xC9, 0x5E, 0xC9, 0x5F, /* 0xA4-0xA7 */ + 0xC9, 0x60, 0xC9, 0x61, 0xC9, 0x62, 0xC9, 0x63, /* 0xA8-0xAB */ + 0xC9, 0x64, 0xC9, 0x65, 0xC9, 0x66, 0xD8, 0xE1, /* 0xAC-0xAF */ + 0xC9, 0x67, 0xC9, 0x68, 0xC6, 0xD1, 0xC9, 0x69, /* 0xB0-0xB3 */ + 0xDD, 0xF4, 0xC9, 0x6A, 0xC9, 0x6B, 0xC9, 0x6C, /* 0xB4-0xB7 */ + 0xD5, 0xF4, 0xDD, 0xF3, 0xDD, 0xF0, 0xC9, 0x6D, /* 0xB8-0xBB */ + 0xC9, 0x6E, 0xDD, 0xEC, 0xC9, 0x6F, 0xDD, 0xEF, /* 0xBC-0xBF */ + 0xC9, 0x70, 0xDD, 0xE8, 0xC9, 0x71, 0xC9, 0x72, /* 0xC0-0xC3 */ + 0xD0, 0xEE, 0xC9, 0x73, 0xC9, 0x74, 0xC9, 0x75, /* 0xC4-0xC7 */ + 0xC9, 0x76, 0xC8, 0xD8, 0xDD, 0xEE, 0xC9, 0x77, /* 0xC8-0xCB */ + 0xC9, 0x78, 0xDD, 0xE9, 0xC9, 0x79, 0xC9, 0x7A, /* 0xCC-0xCF */ + 0xDD, 0xEA, 0xCB, 0xF2, 0xC9, 0x7B, 0xDD, 0xED, /* 0xD0-0xD3 */ + 0xC9, 0x7C, 0xC9, 0x7D, 0xB1, 0xCD, 0xC9, 0x7E, /* 0xD4-0xD7 */ + 0xC9, 0x80, 0xC9, 0x81, 0xC9, 0x82, 0xC9, 0x83, /* 0xD8-0xDB */ + 0xC9, 0x84, 0xC0, 0xB6, 0xC9, 0x85, 0xBC, 0xBB, /* 0xDC-0xDF */ + 0xDD, 0xF1, 0xC9, 0x86, 0xC9, 0x87, 0xDD, 0xF7, /* 0xE0-0xE3 */ + 0xC9, 0x88, 0xDD, 0xF6, 0xDD, 0xEB, 0xC9, 0x89, /* 0xE4-0xE7 */ + 0xC9, 0x8A, 0xC9, 0x8B, 0xC9, 0x8C, 0xC9, 0x8D, /* 0xE8-0xEB */ + 0xC5, 0xEE, 0xC9, 0x8E, 0xC9, 0x8F, 0xC9, 0x90, /* 0xEC-0xEF */ + 0xDD, 0xFB, 0xC9, 0x91, 0xC9, 0x92, 0xC9, 0x93, /* 0xF0-0xF3 
*/ + 0xC9, 0x94, 0xC9, 0x95, 0xC9, 0x96, 0xC9, 0x97, /* 0xF4-0xF7 */ + 0xC9, 0x98, 0xC9, 0x99, 0xC9, 0x9A, 0xC9, 0x9B, /* 0xF8-0xFB */ + 0xDE, 0xA4, 0xC9, 0x9C, 0xC9, 0x9D, 0xDE, 0xA3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0xC9, 0x9E, 0xC9, 0x9F, 0xC9, 0xA0, 0xCA, 0x40, /* 0x00-0x03 */ + 0xCA, 0x41, 0xCA, 0x42, 0xCA, 0x43, 0xCA, 0x44, /* 0x04-0x07 */ + 0xCA, 0x45, 0xCA, 0x46, 0xCA, 0x47, 0xCA, 0x48, /* 0x08-0x0B */ + 0xDD, 0xF8, 0xCA, 0x49, 0xCA, 0x4A, 0xCA, 0x4B, /* 0x0C-0x0F */ + 0xCA, 0x4C, 0xC3, 0xEF, 0xCA, 0x4D, 0xC2, 0xFB, /* 0x10-0x13 */ + 0xCA, 0x4E, 0xCA, 0x4F, 0xCA, 0x50, 0xD5, 0xE1, /* 0x14-0x17 */ + 0xCA, 0x51, 0xCA, 0x52, 0xCE, 0xB5, 0xCA, 0x53, /* 0x18-0x1B */ + 0xCA, 0x54, 0xCA, 0x55, 0xCA, 0x56, 0xDD, 0xFD, /* 0x1C-0x1F */ + 0xCA, 0x57, 0xB2, 0xCC, 0xCA, 0x58, 0xCA, 0x59, /* 0x20-0x23 */ + 0xCA, 0x5A, 0xCA, 0x5B, 0xCA, 0x5C, 0xCA, 0x5D, /* 0x24-0x27 */ + 0xCA, 0x5E, 0xCA, 0x5F, 0xCA, 0x60, 0xC4, 0xE8, /* 0x28-0x2B */ + 0xCA, 0xDF, 0xCA, 0x61, 0xCA, 0x62, 0xCA, 0x63, /* 0x2C-0x2F */ + 0xCA, 0x64, 0xCA, 0x65, 0xCA, 0x66, 0xCA, 0x67, /* 0x30-0x33 */ + 0xCA, 0x68, 0xCA, 0x69, 0xCA, 0x6A, 0xC7, 0xBE, /* 0x34-0x37 */ + 0xDD, 0xFA, 0xDD, 0xFC, 0xDD, 0xFE, 0xDE, 0xA2, /* 0x38-0x3B */ + 0xB0, 0xAA, 0xB1, 0xCE, 0xCA, 0x6B, 0xCA, 0x6C, /* 0x3C-0x3F */ + 0xCA, 0x6D, 0xCA, 0x6E, 0xCA, 0x6F, 0xDE, 0xAC, /* 0x40-0x43 */ + 0xCA, 0x70, 0xCA, 0x71, 0xCA, 0x72, 0xCA, 0x73, /* 0x44-0x47 */ + 0xDE, 0xA6, 0xBD, 0xB6, 0xC8, 0xEF, 0xCA, 0x74, /* 0x48-0x4B */ + 0xCA, 0x75, 0xCA, 0x76, 0xCA, 0x77, 0xCA, 0x78, /* 0x4C-0x4F */ + 0xCA, 0x79, 0xCA, 0x7A, 0xCA, 0x7B, 0xCA, 0x7C, /* 0x50-0x53 */ + 0xCA, 0x7D, 0xCA, 0x7E, 0xDE, 0xA1, 0xCA, 0x80, /* 0x54-0x57 */ + 0xCA, 0x81, 0xDE, 0xA5, 0xCA, 0x82, 0xCA, 0x83, /* 0x58-0x5B */ + 0xCA, 0x84, 0xCA, 0x85, 0xDE, 0xA9, 0xCA, 0x86, /* 0x5C-0x5F */ + 0xCA, 0x87, 0xCA, 0x88, 0xCA, 0x89, 0xCA, 0x8A, /* 0x60-0x63 */ + 0xDE, 0xA8, 0xCA, 0x8B, 0xCA, 0x8C, 0xCA, 0x8D, /* 0x64-0x67 */ + 0xDE, 0xA7, 0xCA, 0x8E, 0xCA, 0x8F, 0xCA, 0x90, /* 0x68-0x6B */ + 0xCA, 0x91, 0xCA, 0x92, 0xCA, 0x93, 0xCA, 0x94, /* 0x6C-0x6F */ + 0xCA, 0x95, 0xCA, 0x96, 0xDE, 0xAD, 0xCA, 0x97, /* 0x70-0x73 */ + 0xD4, 0xCC, 0xCA, 0x98, 0xCA, 0x99, 0xCA, 0x9A, /* 0x74-0x77 */ + 0xCA, 0x9B, 0xDE, 0xB3, 0xDE, 0xAA, 0xDE, 0xAE, /* 0x78-0x7B */ + 0xCA, 0x9C, 0xCA, 0x9D, 0xC0, 0xD9, 0xCA, 0x9E, /* 0x7C-0x7F */ + + 0xCA, 0x9F, 0xCA, 0xA0, 0xCB, 0x40, 0xCB, 0x41, /* 0x80-0x83 */ + 0xB1, 0xA1, 0xDE, 0xB6, 0xCB, 0x42, 0xDE, 0xB1, /* 0x84-0x87 */ + 0xCB, 0x43, 0xCB, 0x44, 0xCB, 0x45, 0xCB, 0x46, /* 0x88-0x8B */ + 0xCB, 0x47, 0xCB, 0x48, 0xCB, 0x49, 0xDE, 0xB2, /* 0x8C-0x8F */ + 0xCB, 0x4A, 0xCB, 0x4B, 0xCB, 0x4C, 0xCB, 0x4D, /* 0x90-0x93 */ + 0xCB, 0x4E, 0xCB, 0x4F, 0xCB, 0x50, 0xCB, 0x51, /* 0x94-0x97 */ + 0xCB, 0x52, 0xCB, 0x53, 0xCB, 0x54, 0xD1, 0xA6, /* 0x98-0x9B */ + 0xDE, 0xB5, 0xCB, 0x55, 0xCB, 0x56, 0xCB, 0x57, /* 0x9C-0x9F */ + 0xCB, 0x58, 0xCB, 0x59, 0xCB, 0x5A, 0xCB, 0x5B, /* 0xA0-0xA3 */ + 0xDE, 0xAF, 0xCB, 0x5C, 0xCB, 0x5D, 0xCB, 0x5E, /* 0xA4-0xA7 */ + 0xDE, 0xB0, 0xCB, 0x5F, 0xD0, 0xBD, 0xCB, 0x60, /* 0xA8-0xAB */ + 0xCB, 0x61, 0xCB, 0x62, 0xDE, 0xB4, 0xCA, 0xED, /* 0xAC-0xAF */ + 0xDE, 0xB9, 0xCB, 0x63, 0xCB, 0x64, 0xCB, 0x65, /* 0xB0-0xB3 */ + 0xCB, 0x66, 0xCB, 0x67, 0xCB, 0x68, 0xDE, 0xB8, /* 0xB4-0xB7 */ + 0xCB, 0x69, 0xDE, 0xB7, 0xCB, 0x6A, 0xCB, 0x6B, /* 0xB8-0xBB */ + 0xCB, 0x6C, 0xCB, 0x6D, 0xCB, 0x6E, 0xCB, 0x6F, /* 0xBC-0xBF */ + 0xCB, 0x70, 0xDE, 0xBB, 0xCB, 0x71, 0xCB, 0x72, /* 0xC0-0xC3 */ + 0xCB, 0x73, 0xCB, 0x74, 0xCB, 0x75, 0xCB, 0x76, /* 0xC4-0xC7 */ + 
0xCB, 0x77, 0xBD, 0xE5, 0xCB, 0x78, 0xCB, 0x79, /* 0xC8-0xCB */ + 0xCB, 0x7A, 0xCB, 0x7B, 0xCB, 0x7C, 0xB2, 0xD8, /* 0xCC-0xCF */ + 0xC3, 0xEA, 0xCB, 0x7D, 0xCB, 0x7E, 0xDE, 0xBA, /* 0xD0-0xD3 */ + 0xCB, 0x80, 0xC5, 0xBA, 0xCB, 0x81, 0xCB, 0x82, /* 0xD4-0xD7 */ + 0xCB, 0x83, 0xCB, 0x84, 0xCB, 0x85, 0xCB, 0x86, /* 0xD8-0xDB */ + 0xDE, 0xBC, 0xCB, 0x87, 0xCB, 0x88, 0xCB, 0x89, /* 0xDC-0xDF */ + 0xCB, 0x8A, 0xCB, 0x8B, 0xCB, 0x8C, 0xCB, 0x8D, /* 0xE0-0xE3 */ + 0xCC, 0xD9, 0xCB, 0x8E, 0xCB, 0x8F, 0xCB, 0x90, /* 0xE4-0xE7 */ + 0xCB, 0x91, 0xB7, 0xAA, 0xCB, 0x92, 0xCB, 0x93, /* 0xE8-0xEB */ + 0xCB, 0x94, 0xCB, 0x95, 0xCB, 0x96, 0xCB, 0x97, /* 0xEC-0xEF */ + 0xCB, 0x98, 0xCB, 0x99, 0xCB, 0x9A, 0xCB, 0x9B, /* 0xF0-0xF3 */ + 0xCB, 0x9C, 0xCB, 0x9D, 0xCB, 0x9E, 0xCB, 0x9F, /* 0xF4-0xF7 */ + 0xCB, 0xA0, 0xCC, 0x40, 0xCC, 0x41, 0xD4, 0xE5, /* 0xF8-0xFB */ + 0xCC, 0x42, 0xCC, 0x43, 0xCC, 0x44, 0xDE, 0xBD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0xCC, 0x45, 0xCC, 0x46, 0xCC, 0x47, 0xCC, 0x48, /* 0x00-0x03 */ + 0xCC, 0x49, 0xDE, 0xBF, 0xCC, 0x4A, 0xCC, 0x4B, /* 0x04-0x07 */ + 0xCC, 0x4C, 0xCC, 0x4D, 0xCC, 0x4E, 0xCC, 0x4F, /* 0x08-0x0B */ + 0xCC, 0x50, 0xCC, 0x51, 0xCC, 0x52, 0xCC, 0x53, /* 0x0C-0x0F */ + 0xCC, 0x54, 0xC4, 0xA2, 0xCC, 0x55, 0xCC, 0x56, /* 0x10-0x13 */ + 0xCC, 0x57, 0xCC, 0x58, 0xDE, 0xC1, 0xCC, 0x59, /* 0x14-0x17 */ + 0xCC, 0x5A, 0xCC, 0x5B, 0xCC, 0x5C, 0xCC, 0x5D, /* 0x18-0x1B */ + 0xCC, 0x5E, 0xCC, 0x5F, 0xCC, 0x60, 0xCC, 0x61, /* 0x1C-0x1F */ + 0xCC, 0x62, 0xCC, 0x63, 0xCC, 0x64, 0xCC, 0x65, /* 0x20-0x23 */ + 0xCC, 0x66, 0xCC, 0x67, 0xCC, 0x68, 0xDE, 0xBE, /* 0x24-0x27 */ + 0xCC, 0x69, 0xDE, 0xC0, 0xCC, 0x6A, 0xCC, 0x6B, /* 0x28-0x2B */ + 0xCC, 0x6C, 0xCC, 0x6D, 0xCC, 0x6E, 0xCC, 0x6F, /* 0x2C-0x2F */ + 0xCC, 0x70, 0xCC, 0x71, 0xCC, 0x72, 0xCC, 0x73, /* 0x30-0x33 */ + 0xCC, 0x74, 0xCC, 0x75, 0xCC, 0x76, 0xCC, 0x77, /* 0x34-0x37 */ + 0xD5, 0xBA, 0xCC, 0x78, 0xCC, 0x79, 0xCC, 0x7A, /* 0x38-0x3B */ + 0xDE, 0xC2, 0xCC, 0x7B, 0xCC, 0x7C, 0xCC, 0x7D, /* 0x3C-0x3F */ + 0xCC, 0x7E, 0xCC, 0x80, 0xCC, 0x81, 0xCC, 0x82, /* 0x40-0x43 */ + 0xCC, 0x83, 0xCC, 0x84, 0xCC, 0x85, 0xCC, 0x86, /* 0x44-0x47 */ + 0xCC, 0x87, 0xCC, 0x88, 0xCC, 0x89, 0xCC, 0x8A, /* 0x48-0x4B */ + 0xCC, 0x8B, 0xF2, 0xAE, 0xBB, 0xA2, 0xC2, 0xB2, /* 0x4C-0x4F */ + 0xC5, 0xB0, 0xC2, 0xC7, 0xCC, 0x8C, 0xCC, 0x8D, /* 0x50-0x53 */ + 0xF2, 0xAF, 0xCC, 0x8E, 0xCC, 0x8F, 0xCC, 0x90, /* 0x54-0x57 */ + 0xCC, 0x91, 0xCC, 0x92, 0xD0, 0xE9, 0xCC, 0x93, /* 0x58-0x5B */ + 0xCC, 0x94, 0xCC, 0x95, 0xD3, 0xDD, 0xCC, 0x96, /* 0x5C-0x5F */ + 0xCC, 0x97, 0xCC, 0x98, 0xEB, 0xBD, 0xCC, 0x99, /* 0x60-0x63 */ + 0xCC, 0x9A, 0xCC, 0x9B, 0xCC, 0x9C, 0xCC, 0x9D, /* 0x64-0x67 */ + 0xCC, 0x9E, 0xCC, 0x9F, 0xCC, 0xA0, 0xB3, 0xE6, /* 0x68-0x6B */ + 0xF2, 0xB0, 0xCD, 0x40, 0xF2, 0xB1, 0xCD, 0x41, /* 0x6C-0x6F */ + 0xCD, 0x42, 0xCA, 0xAD, 0xCD, 0x43, 0xCD, 0x44, /* 0x70-0x73 */ + 0xCD, 0x45, 0xCD, 0x46, 0xCD, 0x47, 0xCD, 0x48, /* 0x74-0x77 */ + 0xCD, 0x49, 0xBA, 0xE7, 0xF2, 0xB3, 0xF2, 0xB5, /* 0x78-0x7B */ + 0xF2, 0xB4, 0xCB, 0xE4, 0xCF, 0xBA, 0xF2, 0xB2, /* 0x7C-0x7F */ + + 0xCA, 0xB4, 0xD2, 0xCF, 0xC2, 0xEC, 0xCD, 0x4A, /* 0x80-0x83 */ + 0xCD, 0x4B, 0xCD, 0x4C, 0xCD, 0x4D, 0xCD, 0x4E, /* 0x84-0x87 */ + 0xCD, 0x4F, 0xCD, 0x50, 0xCE, 0xC3, 0xF2, 0xB8, /* 0x88-0x8B */ + 0xB0, 0xF6, 0xF2, 0xB7, 0xCD, 0x51, 0xCD, 0x52, /* 0x8C-0x8F */ + 0xCD, 0x53, 0xCD, 0x54, 0xCD, 0x55, 0xF2, 0xBE, /* 0x90-0x93 */ + 0xCD, 0x56, 0xB2, 0xCF, 0xCD, 0x57, 0xCD, 0x58, /* 0x94-0x97 */ + 0xCD, 0x59, 0xCD, 0x5A, 0xCD, 0x5B, 0xCD, 0x5C, /* 0x98-0x9B */ + 
0xD1, 0xC1, 0xF2, 0xBA, 0xCD, 0x5D, 0xCD, 0x5E, /* 0x9C-0x9F */ + 0xCD, 0x5F, 0xCD, 0x60, 0xCD, 0x61, 0xF2, 0xBC, /* 0xA0-0xA3 */ + 0xD4, 0xE9, 0xCD, 0x62, 0xCD, 0x63, 0xF2, 0xBB, /* 0xA4-0xA7 */ + 0xF2, 0xB6, 0xF2, 0xBF, 0xF2, 0xBD, 0xCD, 0x64, /* 0xA8-0xAB */ + 0xF2, 0xB9, 0xCD, 0x65, 0xCD, 0x66, 0xF2, 0xC7, /* 0xAC-0xAF */ + 0xF2, 0xC4, 0xF2, 0xC6, 0xCD, 0x67, 0xCD, 0x68, /* 0xB0-0xB3 */ + 0xF2, 0xCA, 0xF2, 0xC2, 0xF2, 0xC0, 0xCD, 0x69, /* 0xB4-0xB7 */ + 0xCD, 0x6A, 0xCD, 0x6B, 0xF2, 0xC5, 0xCD, 0x6C, /* 0xB8-0xBB */ + 0xCD, 0x6D, 0xCD, 0x6E, 0xCD, 0x6F, 0xCD, 0x70, /* 0xBC-0xBF */ + 0xD6, 0xFB, 0xCD, 0x71, 0xCD, 0x72, 0xCD, 0x73, /* 0xC0-0xC3 */ + 0xF2, 0xC1, 0xCD, 0x74, 0xC7, 0xF9, 0xC9, 0xDF, /* 0xC4-0xC7 */ + 0xCD, 0x75, 0xF2, 0xC8, 0xB9, 0xC6, 0xB5, 0xB0, /* 0xC8-0xCB */ + 0xCD, 0x76, 0xCD, 0x77, 0xF2, 0xC3, 0xF2, 0xC9, /* 0xCC-0xCF */ + 0xF2, 0xD0, 0xF2, 0xD6, 0xCD, 0x78, 0xCD, 0x79, /* 0xD0-0xD3 */ + 0xBB, 0xD7, 0xCD, 0x7A, 0xCD, 0x7B, 0xCD, 0x7C, /* 0xD4-0xD7 */ + 0xF2, 0xD5, 0xCD, 0xDC, 0xCD, 0x7D, 0xD6, 0xEB, /* 0xD8-0xDB */ + 0xCD, 0x7E, 0xCD, 0x80, 0xF2, 0xD2, 0xF2, 0xD4, /* 0xDC-0xDF */ + 0xCD, 0x81, 0xCD, 0x82, 0xCD, 0x83, 0xCD, 0x84, /* 0xE0-0xE3 */ + 0xB8, 0xF2, 0xCD, 0x85, 0xCD, 0x86, 0xCD, 0x87, /* 0xE4-0xE7 */ + 0xCD, 0x88, 0xF2, 0xCB, 0xCD, 0x89, 0xCD, 0x8A, /* 0xE8-0xEB */ + 0xCD, 0x8B, 0xF2, 0xCE, 0xC2, 0xF9, 0xCD, 0x8C, /* 0xEC-0xEF */ + 0xD5, 0xDD, 0xF2, 0xCC, 0xF2, 0xCD, 0xF2, 0xCF, /* 0xF0-0xF3 */ + 0xF2, 0xD3, 0xCD, 0x8D, 0xCD, 0x8E, 0xCD, 0x8F, /* 0xF4-0xF7 */ + 0xF2, 0xD9, 0xD3, 0xBC, 0xCD, 0x90, 0xCD, 0x91, /* 0xF8-0xFB */ + 0xCD, 0x92, 0xCD, 0x93, 0xB6, 0xEA, 0xCD, 0x94, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xCA, 0xF1, 0xCD, 0x95, 0xB7, 0xE4, 0xF2, 0xD7, /* 0x00-0x03 */ + 0xCD, 0x96, 0xCD, 0x97, 0xCD, 0x98, 0xF2, 0xD8, /* 0x04-0x07 */ + 0xF2, 0xDA, 0xF2, 0xDD, 0xF2, 0xDB, 0xCD, 0x99, /* 0x08-0x0B */ + 0xCD, 0x9A, 0xF2, 0xDC, 0xCD, 0x9B, 0xCD, 0x9C, /* 0x0C-0x0F */ + 0xCD, 0x9D, 0xCD, 0x9E, 0xD1, 0xD1, 0xF2, 0xD1, /* 0x10-0x13 */ + 0xCD, 0x9F, 0xCD, 0xC9, 0xCD, 0xA0, 0xCE, 0xCF, /* 0x14-0x17 */ + 0xD6, 0xA9, 0xCE, 0x40, 0xF2, 0xE3, 0xCE, 0x41, /* 0x18-0x1B */ + 0xC3, 0xDB, 0xCE, 0x42, 0xF2, 0xE0, 0xCE, 0x43, /* 0x1C-0x1F */ + 0xCE, 0x44, 0xC0, 0xAF, 0xF2, 0xEC, 0xF2, 0xDE, /* 0x20-0x23 */ + 0xCE, 0x45, 0xF2, 0xE1, 0xCE, 0x46, 0xCE, 0x47, /* 0x24-0x27 */ + 0xCE, 0x48, 0xF2, 0xE8, 0xCE, 0x49, 0xCE, 0x4A, /* 0x28-0x2B */ + 0xCE, 0x4B, 0xCE, 0x4C, 0xF2, 0xE2, 0xCE, 0x4D, /* 0x2C-0x2F */ + 0xCE, 0x4E, 0xF2, 0xE7, 0xCE, 0x4F, 0xCE, 0x50, /* 0x30-0x33 */ + 0xF2, 0xE6, 0xCE, 0x51, 0xCE, 0x52, 0xF2, 0xE9, /* 0x34-0x37 */ + 0xCE, 0x53, 0xCE, 0x54, 0xCE, 0x55, 0xF2, 0xDF, /* 0x38-0x3B */ + 0xCE, 0x56, 0xCE, 0x57, 0xF2, 0xE4, 0xF2, 0xEA, /* 0x3C-0x3F */ + 0xCE, 0x58, 0xCE, 0x59, 0xCE, 0x5A, 0xCE, 0x5B, /* 0x40-0x43 */ + 0xCE, 0x5C, 0xCE, 0x5D, 0xCE, 0x5E, 0xD3, 0xAC, /* 0x44-0x47 */ + 0xF2, 0xE5, 0xB2, 0xF5, 0xCE, 0x5F, 0xCE, 0x60, /* 0x48-0x4B */ + 0xF2, 0xF2, 0xCE, 0x61, 0xD0, 0xAB, 0xCE, 0x62, /* 0x4C-0x4F */ + 0xCE, 0x63, 0xCE, 0x64, 0xCE, 0x65, 0xF2, 0xF5, /* 0x50-0x53 */ + 0xCE, 0x66, 0xCE, 0x67, 0xCE, 0x68, 0xBB, 0xC8, /* 0x54-0x57 */ + 0xCE, 0x69, 0xF2, 0xF9, 0xCE, 0x6A, 0xCE, 0x6B, /* 0x58-0x5B */ + 0xCE, 0x6C, 0xCE, 0x6D, 0xCE, 0x6E, 0xCE, 0x6F, /* 0x5C-0x5F */ + 0xF2, 0xF0, 0xCE, 0x70, 0xCE, 0x71, 0xF2, 0xF6, /* 0x60-0x63 */ + 0xF2, 0xF8, 0xF2, 0xFA, 0xCE, 0x72, 0xCE, 0x73, /* 0x64-0x67 */ + 0xCE, 0x74, 0xCE, 0x75, 0xCE, 0x76, 0xCE, 0x77, /* 0x68-0x6B */ + 0xCE, 0x78, 0xCE, 0x79, 0xF2, 0xF3, 0xCE, 0x7A, /* 0x6C-0x6F */ + 0xF2, 
0xF1, 0xCE, 0x7B, 0xCE, 0x7C, 0xCE, 0x7D, /* 0x70-0x73 */ + 0xBA, 0xFB, 0xCE, 0x7E, 0xB5, 0xFB, 0xCE, 0x80, /* 0x74-0x77 */ + 0xCE, 0x81, 0xCE, 0x82, 0xCE, 0x83, 0xF2, 0xEF, /* 0x78-0x7B */ + 0xF2, 0xF7, 0xF2, 0xED, 0xF2, 0xEE, 0xCE, 0x84, /* 0x7C-0x7F */ + + 0xCE, 0x85, 0xCE, 0x86, 0xF2, 0xEB, 0xF3, 0xA6, /* 0x80-0x83 */ + 0xCE, 0x87, 0xF3, 0xA3, 0xCE, 0x88, 0xCE, 0x89, /* 0x84-0x87 */ + 0xF3, 0xA2, 0xCE, 0x8A, 0xCE, 0x8B, 0xF2, 0xF4, /* 0x88-0x8B */ + 0xCE, 0x8C, 0xC8, 0xDA, 0xCE, 0x8D, 0xCE, 0x8E, /* 0x8C-0x8F */ + 0xCE, 0x8F, 0xCE, 0x90, 0xCE, 0x91, 0xF2, 0xFB, /* 0x90-0x93 */ + 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xF3, 0xA5, /* 0x94-0x97 */ + 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, /* 0x98-0x9B */ + 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xC3, 0xF8, /* 0x9C-0x9F */ + 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, /* 0xA0-0xA3 */ + 0xCE, 0xA0, 0xCF, 0x40, 0xCF, 0x41, 0xCF, 0x42, /* 0xA4-0xA7 */ + 0xF2, 0xFD, 0xCF, 0x43, 0xCF, 0x44, 0xF3, 0xA7, /* 0xA8-0xAB */ + 0xF3, 0xA9, 0xF3, 0xA4, 0xCF, 0x45, 0xF2, 0xFC, /* 0xAC-0xAF */ + 0xCF, 0x46, 0xCF, 0x47, 0xCF, 0x48, 0xF3, 0xAB, /* 0xB0-0xB3 */ + 0xCF, 0x49, 0xF3, 0xAA, 0xCF, 0x4A, 0xCF, 0x4B, /* 0xB4-0xB7 */ + 0xCF, 0x4C, 0xCF, 0x4D, 0xC2, 0xDD, 0xCF, 0x4E, /* 0xB8-0xBB */ + 0xCF, 0x4F, 0xF3, 0xAE, 0xCF, 0x50, 0xCF, 0x51, /* 0xBC-0xBF */ + 0xF3, 0xB0, 0xCF, 0x52, 0xCF, 0x53, 0xCF, 0x54, /* 0xC0-0xC3 */ + 0xCF, 0x55, 0xCF, 0x56, 0xF3, 0xA1, 0xCF, 0x57, /* 0xC4-0xC7 */ + 0xCF, 0x58, 0xCF, 0x59, 0xF3, 0xB1, 0xF3, 0xAC, /* 0xC8-0xCB */ + 0xCF, 0x5A, 0xCF, 0x5B, 0xCF, 0x5C, 0xCF, 0x5D, /* 0xCC-0xCF */ + 0xCF, 0x5E, 0xF3, 0xAF, 0xF2, 0xFE, 0xF3, 0xAD, /* 0xD0-0xD3 */ + 0xCF, 0x5F, 0xCF, 0x60, 0xCF, 0x61, 0xCF, 0x62, /* 0xD4-0xD7 */ + 0xCF, 0x63, 0xCF, 0x64, 0xCF, 0x65, 0xF3, 0xB2, /* 0xD8-0xDB */ + 0xCF, 0x66, 0xCF, 0x67, 0xCF, 0x68, 0xCF, 0x69, /* 0xDC-0xDF */ + 0xF3, 0xB4, 0xCF, 0x6A, 0xCF, 0x6B, 0xCF, 0x6C, /* 0xE0-0xE3 */ + 0xCF, 0x6D, 0xF3, 0xA8, 0xCF, 0x6E, 0xCF, 0x6F, /* 0xE4-0xE7 */ + 0xCF, 0x70, 0xCF, 0x71, 0xF3, 0xB3, 0xCF, 0x72, /* 0xE8-0xEB */ + 0xCF, 0x73, 0xCF, 0x74, 0xF3, 0xB5, 0xCF, 0x75, /* 0xEC-0xEF */ + 0xCF, 0x76, 0xCF, 0x77, 0xCF, 0x78, 0xCF, 0x79, /* 0xF0-0xF3 */ + 0xCF, 0x7A, 0xCF, 0x7B, 0xCF, 0x7C, 0xCF, 0x7D, /* 0xF4-0xF7 */ + 0xCF, 0x7E, 0xD0, 0xB7, 0xCF, 0x80, 0xCF, 0x81, /* 0xF8-0xFB */ + 0xCF, 0x82, 0xCF, 0x83, 0xF3, 0xB8, 0xCF, 0x84, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xD9, 0xF9, /* 0x00-0x03 */ + 0xCF, 0x88, 0xCF, 0x89, 0xCF, 0x8A, 0xCF, 0x8B, /* 0x04-0x07 */ + 0xCF, 0x8C, 0xCF, 0x8D, 0xF3, 0xB9, 0xCF, 0x8E, /* 0x08-0x0B */ + 0xCF, 0x8F, 0xCF, 0x90, 0xCF, 0x91, 0xCF, 0x92, /* 0x0C-0x0F */ + 0xCF, 0x93, 0xCF, 0x94, 0xCF, 0x95, 0xF3, 0xB7, /* 0x10-0x13 */ + 0xCF, 0x96, 0xC8, 0xE4, 0xF3, 0xB6, 0xCF, 0x97, /* 0x14-0x17 */ + 0xCF, 0x98, 0xCF, 0x99, 0xCF, 0x9A, 0xF3, 0xBA, /* 0x18-0x1B */ + 0xCF, 0x9B, 0xCF, 0x9C, 0xCF, 0x9D, 0xCF, 0x9E, /* 0x1C-0x1F */ + 0xCF, 0x9F, 0xF3, 0xBB, 0xB4, 0xC0, 0xCF, 0xA0, /* 0x20-0x23 */ + 0xD0, 0x40, 0xD0, 0x41, 0xD0, 0x42, 0xD0, 0x43, /* 0x24-0x27 */ + 0xD0, 0x44, 0xD0, 0x45, 0xD0, 0x46, 0xD0, 0x47, /* 0x28-0x2B */ + 0xD0, 0x48, 0xD0, 0x49, 0xD0, 0x4A, 0xD0, 0x4B, /* 0x2C-0x2F */ + 0xD0, 0x4C, 0xD0, 0x4D, 0xEE, 0xC3, 0xD0, 0x4E, /* 0x30-0x33 */ + 0xD0, 0x4F, 0xD0, 0x50, 0xD0, 0x51, 0xD0, 0x52, /* 0x34-0x37 */ + 0xD0, 0x53, 0xF3, 0xBC, 0xD0, 0x54, 0xD0, 0x55, /* 0x38-0x3B */ + 0xF3, 0xBD, 0xD0, 0x56, 0xD0, 0x57, 0xD0, 0x58, /* 0x3C-0x3F */ + 0xD1, 0xAA, 0xD0, 0x59, 0xD0, 0x5A, 0xD0, 0x5B, /* 0x40-0x43 */ + 0xF4, 
0xAC, 0xD0, 0xC6, 0xD0, 0x5C, 0xD0, 0x5D, /* 0x44-0x47 */ + 0xD0, 0x5E, 0xD0, 0x5F, 0xD0, 0x60, 0xD0, 0x61, /* 0x48-0x4B */ + 0xD0, 0xD0, 0xD1, 0xDC, 0xD0, 0x62, 0xD0, 0x63, /* 0x4C-0x4F */ + 0xD0, 0x64, 0xD0, 0x65, 0xD0, 0x66, 0xD0, 0x67, /* 0x50-0x53 */ + 0xCF, 0xCE, 0xD0, 0x68, 0xD0, 0x69, 0xBD, 0xD6, /* 0x54-0x57 */ + 0xD0, 0x6A, 0xD1, 0xC3, 0xD0, 0x6B, 0xD0, 0x6C, /* 0x58-0x5B */ + 0xD0, 0x6D, 0xD0, 0x6E, 0xD0, 0x6F, 0xD0, 0x70, /* 0x5C-0x5F */ + 0xD0, 0x71, 0xBA, 0xE2, 0xE1, 0xE9, 0xD2, 0xC2, /* 0x60-0x63 */ + 0xF1, 0xC2, 0xB2, 0xB9, 0xD0, 0x72, 0xD0, 0x73, /* 0x64-0x67 */ + 0xB1, 0xED, 0xF1, 0xC3, 0xD0, 0x74, 0xC9, 0xC0, /* 0x68-0x6B */ + 0xB3, 0xC4, 0xD0, 0x75, 0xD9, 0xF2, 0xD0, 0x76, /* 0x6C-0x6F */ + 0xCB, 0xA5, 0xD0, 0x77, 0xF1, 0xC4, 0xD0, 0x78, /* 0x70-0x73 */ + 0xD0, 0x79, 0xD0, 0x7A, 0xD0, 0x7B, 0xD6, 0xD4, /* 0x74-0x77 */ + 0xD0, 0x7C, 0xD0, 0x7D, 0xD0, 0x7E, 0xD0, 0x80, /* 0x78-0x7B */ + 0xD0, 0x81, 0xF1, 0xC5, 0xF4, 0xC0, 0xF1, 0xC6, /* 0x7C-0x7F */ + + 0xD0, 0x82, 0xD4, 0xAC, 0xF1, 0xC7, 0xD0, 0x83, /* 0x80-0x83 */ + 0xB0, 0xC0, 0xF4, 0xC1, 0xD0, 0x84, 0xD0, 0x85, /* 0x84-0x87 */ + 0xF4, 0xC2, 0xD0, 0x86, 0xD0, 0x87, 0xB4, 0xFC, /* 0x88-0x8B */ + 0xD0, 0x88, 0xC5, 0xDB, 0xD0, 0x89, 0xD0, 0x8A, /* 0x8C-0x8F */ + 0xD0, 0x8B, 0xD0, 0x8C, 0xCC, 0xBB, 0xD0, 0x8D, /* 0x90-0x93 */ + 0xD0, 0x8E, 0xD0, 0x8F, 0xD0, 0xE4, 0xD0, 0x90, /* 0x94-0x97 */ + 0xD0, 0x91, 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, /* 0x98-0x9B */ + 0xCD, 0xE0, 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97, /* 0x9C-0x9F */ + 0xD0, 0x98, 0xD0, 0x99, 0xF1, 0xC8, 0xD0, 0x9A, /* 0xA0-0xA3 */ + 0xD9, 0xF3, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D, /* 0xA4-0xA7 */ + 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xB1, 0xBB, /* 0xA8-0xAB */ + 0xD1, 0x40, 0xCF, 0xAE, 0xD1, 0x41, 0xD1, 0x42, /* 0xAC-0xAF */ + 0xD1, 0x43, 0xB8, 0xA4, 0xD1, 0x44, 0xD1, 0x45, /* 0xB0-0xB3 */ + 0xD1, 0x46, 0xD1, 0x47, 0xD1, 0x48, 0xF1, 0xCA, /* 0xB4-0xB7 */ + 0xD1, 0x49, 0xD1, 0x4A, 0xD1, 0x4B, 0xD1, 0x4C, /* 0xB8-0xBB */ + 0xF1, 0xCB, 0xD1, 0x4D, 0xD1, 0x4E, 0xD1, 0x4F, /* 0xBC-0xBF */ + 0xD1, 0x50, 0xB2, 0xC3, 0xC1, 0xD1, 0xD1, 0x51, /* 0xC0-0xC3 */ + 0xD1, 0x52, 0xD7, 0xB0, 0xF1, 0xC9, 0xD1, 0x53, /* 0xC4-0xC7 */ + 0xD1, 0x54, 0xF1, 0xCC, 0xD1, 0x55, 0xD1, 0x56, /* 0xC8-0xCB */ + 0xD1, 0x57, 0xD1, 0x58, 0xF1, 0xCE, 0xD1, 0x59, /* 0xCC-0xCF */ + 0xD1, 0x5A, 0xD1, 0x5B, 0xD9, 0xF6, 0xD1, 0x5C, /* 0xD0-0xD3 */ + 0xD2, 0xE1, 0xD4, 0xA3, 0xD1, 0x5D, 0xD1, 0x5E, /* 0xD4-0xD7 */ + 0xF4, 0xC3, 0xC8, 0xB9, 0xD1, 0x5F, 0xD1, 0x60, /* 0xD8-0xDB */ + 0xD1, 0x61, 0xD1, 0x62, 0xD1, 0x63, 0xF4, 0xC4, /* 0xDC-0xDF */ + 0xD1, 0x64, 0xD1, 0x65, 0xF1, 0xCD, 0xF1, 0xCF, /* 0xE0-0xE3 */ + 0xBF, 0xE3, 0xF1, 0xD0, 0xD1, 0x66, 0xD1, 0x67, /* 0xE4-0xE7 */ + 0xF1, 0xD4, 0xD1, 0x68, 0xD1, 0x69, 0xD1, 0x6A, /* 0xE8-0xEB */ + 0xD1, 0x6B, 0xD1, 0x6C, 0xD1, 0x6D, 0xD1, 0x6E, /* 0xEC-0xEF */ + 0xF1, 0xD6, 0xF1, 0xD1, 0xD1, 0x6F, 0xC9, 0xD1, /* 0xF0-0xF3 */ + 0xC5, 0xE1, 0xD1, 0x70, 0xD1, 0x71, 0xD1, 0x72, /* 0xF4-0xF7 */ + 0xC2, 0xE3, 0xB9, 0xFC, 0xD1, 0x73, 0xD1, 0x74, /* 0xF8-0xFB */ + 0xF1, 0xD3, 0xD1, 0x75, 0xF1, 0xD5, 0xD1, 0x76, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0xD1, 0x77, 0xD1, 0x78, 0xB9, 0xD3, 0xD1, 0x79, /* 0x00-0x03 */ + 0xD1, 0x7A, 0xD1, 0x7B, 0xD1, 0x7C, 0xD1, 0x7D, /* 0x04-0x07 */ + 0xD1, 0x7E, 0xD1, 0x80, 0xF1, 0xDB, 0xD1, 0x81, /* 0x08-0x0B */ + 0xD1, 0x82, 0xD1, 0x83, 0xD1, 0x84, 0xD1, 0x85, /* 0x0C-0x0F */ + 0xBA, 0xD6, 0xD1, 0x86, 0xB0, 0xFD, 0xF1, 0xD9, /* 0x10-0x13 */ + 0xD1, 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, /* 0x14-0x17 */ + 0xD1, 
0x8B, 0xF1, 0xD8, 0xF1, 0xD2, 0xF1, 0xDA, /* 0x18-0x1B */ + 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, 0x8F, /* 0x1C-0x1F */ + 0xD1, 0x90, 0xF1, 0xD7, 0xD1, 0x91, 0xD1, 0x92, /* 0x20-0x23 */ + 0xD1, 0x93, 0xC8, 0xEC, 0xD1, 0x94, 0xD1, 0x95, /* 0x24-0x27 */ + 0xD1, 0x96, 0xD1, 0x97, 0xCD, 0xCA, 0xF1, 0xDD, /* 0x28-0x2B */ + 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, 0x9B, /* 0x2C-0x2F */ + 0xE5, 0xBD, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, /* 0x30-0x33 */ + 0xF1, 0xDC, 0xD1, 0x9F, 0xF1, 0xDE, 0xD1, 0xA0, /* 0x34-0x37 */ + 0xD2, 0x40, 0xD2, 0x41, 0xD2, 0x42, 0xD2, 0x43, /* 0x38-0x3B */ + 0xD2, 0x44, 0xD2, 0x45, 0xD2, 0x46, 0xD2, 0x47, /* 0x3C-0x3F */ + 0xD2, 0x48, 0xF1, 0xDF, 0xD2, 0x49, 0xD2, 0x4A, /* 0x40-0x43 */ + 0xCF, 0xE5, 0xD2, 0x4B, 0xD2, 0x4C, 0xD2, 0x4D, /* 0x44-0x47 */ + 0xD2, 0x4E, 0xD2, 0x4F, 0xD2, 0x50, 0xD2, 0x51, /* 0x48-0x4B */ + 0xD2, 0x52, 0xD2, 0x53, 0xD2, 0x54, 0xD2, 0x55, /* 0x4C-0x4F */ + 0xD2, 0x56, 0xD2, 0x57, 0xD2, 0x58, 0xD2, 0x59, /* 0x50-0x53 */ + 0xD2, 0x5A, 0xD2, 0x5B, 0xD2, 0x5C, 0xD2, 0x5D, /* 0x54-0x57 */ + 0xD2, 0x5E, 0xD2, 0x5F, 0xD2, 0x60, 0xD2, 0x61, /* 0x58-0x5B */ + 0xD2, 0x62, 0xD2, 0x63, 0xF4, 0xC5, 0xBD, 0xF3, /* 0x5C-0x5F */ + 0xD2, 0x64, 0xD2, 0x65, 0xD2, 0x66, 0xD2, 0x67, /* 0x60-0x63 */ + 0xD2, 0x68, 0xD2, 0x69, 0xF1, 0xE0, 0xD2, 0x6A, /* 0x64-0x67 */ + 0xD2, 0x6B, 0xD2, 0x6C, 0xD2, 0x6D, 0xD2, 0x6E, /* 0x68-0x6B */ + 0xD2, 0x6F, 0xD2, 0x70, 0xD2, 0x71, 0xD2, 0x72, /* 0x6C-0x6F */ + 0xD2, 0x73, 0xD2, 0x74, 0xD2, 0x75, 0xD2, 0x76, /* 0x70-0x73 */ + 0xD2, 0x77, 0xD2, 0x78, 0xD2, 0x79, 0xD2, 0x7A, /* 0x74-0x77 */ + 0xD2, 0x7B, 0xD2, 0x7C, 0xD2, 0x7D, 0xF1, 0xE1, /* 0x78-0x7B */ + 0xD2, 0x7E, 0xD2, 0x80, 0xD2, 0x81, 0xCE, 0xF7, /* 0x7C-0x7F */ + + 0xD2, 0x82, 0xD2, 0xAA, 0xD2, 0x83, 0xF1, 0xFB, /* 0x80-0x83 */ + 0xD2, 0x84, 0xD2, 0x85, 0xB8, 0xB2, 0xD2, 0x86, /* 0x84-0x87 */ + 0xD2, 0x87, 0xD2, 0x88, 0xD2, 0x89, 0xD2, 0x8A, /* 0x88-0x8B */ + 0xD2, 0x8B, 0xD2, 0x8C, 0xD2, 0x8D, 0xD2, 0x8E, /* 0x8C-0x8F */ + 0xD2, 0x8F, 0xD2, 0x90, 0xD2, 0x91, 0xD2, 0x92, /* 0x90-0x93 */ + 0xD2, 0x93, 0xD2, 0x94, 0xD2, 0x95, 0xD2, 0x96, /* 0x94-0x97 */ + 0xD2, 0x97, 0xD2, 0x98, 0xD2, 0x99, 0xD2, 0x9A, /* 0x98-0x9B */ + 0xD2, 0x9B, 0xD2, 0x9C, 0xD2, 0x9D, 0xD2, 0x9E, /* 0x9C-0x9F */ + 0xD2, 0x9F, 0xD2, 0xA0, 0xD3, 0x40, 0xD3, 0x41, /* 0xA0-0xA3 */ + 0xD3, 0x42, 0xD3, 0x43, 0xD3, 0x44, 0xD3, 0x45, /* 0xA4-0xA7 */ + 0xD3, 0x46, 0xD3, 0x47, 0xD3, 0x48, 0xD3, 0x49, /* 0xA8-0xAB */ + 0xD3, 0x4A, 0xD3, 0x4B, 0xD3, 0x4C, 0xD3, 0x4D, /* 0xAC-0xAF */ + 0xD3, 0x4E, 0xD3, 0x4F, 0xD3, 0x50, 0xD3, 0x51, /* 0xB0-0xB3 */ + 0xD3, 0x52, 0xD3, 0x53, 0xD3, 0x54, 0xD3, 0x55, /* 0xB4-0xB7 */ + 0xD3, 0x56, 0xD3, 0x57, 0xD3, 0x58, 0xD3, 0x59, /* 0xB8-0xBB */ + 0xD3, 0x5A, 0xD3, 0x5B, 0xD3, 0x5C, 0xD3, 0x5D, /* 0xBC-0xBF */ + 0xD3, 0x5E, 0xBC, 0xFB, 0xB9, 0xDB, 0xD3, 0x5F, /* 0xC0-0xC3 */ + 0xB9, 0xE6, 0xC3, 0xD9, 0xCA, 0xD3, 0xEA, 0xE8, /* 0xC4-0xC7 */ + 0xC0, 0xC0, 0xBE, 0xF5, 0xEA, 0xE9, 0xEA, 0xEA, /* 0xC8-0xCB */ + 0xEA, 0xEB, 0xD3, 0x60, 0xEA, 0xEC, 0xEA, 0xED, /* 0xCC-0xCF */ + 0xEA, 0xEE, 0xEA, 0xEF, 0xBD, 0xC7, 0xD3, 0x61, /* 0xD0-0xD3 */ + 0xD3, 0x62, 0xD3, 0x63, 0xF5, 0xFB, 0xD3, 0x64, /* 0xD4-0xD7 */ + 0xD3, 0x65, 0xD3, 0x66, 0xF5, 0xFD, 0xD3, 0x67, /* 0xD8-0xDB */ + 0xF5, 0xFE, 0xD3, 0x68, 0xF5, 0xFC, 0xD3, 0x69, /* 0xDC-0xDF */ + 0xD3, 0x6A, 0xD3, 0x6B, 0xD3, 0x6C, 0xBD, 0xE2, /* 0xE0-0xE3 */ + 0xD3, 0x6D, 0xF6, 0xA1, 0xB4, 0xA5, 0xD3, 0x6E, /* 0xE4-0xE7 */ + 0xD3, 0x6F, 0xD3, 0x70, 0xD3, 0x71, 0xF6, 0xA2, /* 0xE8-0xEB */ + 0xD3, 0x72, 0xD3, 0x73, 0xD3, 0x74, 0xF6, 0xA3, /* 0xEC-0xEF 
*/ + 0xD3, 0x75, 0xD3, 0x76, 0xD3, 0x77, 0xEC, 0xB2, /* 0xF0-0xF3 */ + 0xD3, 0x78, 0xD3, 0x79, 0xD3, 0x7A, 0xD3, 0x7B, /* 0xF4-0xF7 */ + 0xD3, 0x7C, 0xD3, 0x7D, 0xD3, 0x7E, 0xD3, 0x80, /* 0xF8-0xFB */ + 0xD3, 0x81, 0xD3, 0x82, 0xD3, 0x83, 0xD3, 0x84, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8A[512] = { + 0xD1, 0xD4, 0xD3, 0x85, 0xD3, 0x86, 0xD3, 0x87, /* 0x00-0x03 */ + 0xD3, 0x88, 0xD3, 0x89, 0xD3, 0x8A, 0xD9, 0xEA, /* 0x04-0x07 */ + 0xD3, 0x8B, 0xD3, 0x8C, 0xD3, 0x8D, 0xD3, 0x8E, /* 0x08-0x0B */ + 0xD3, 0x8F, 0xD3, 0x90, 0xD3, 0x91, 0xD3, 0x92, /* 0x0C-0x0F */ + 0xD3, 0x93, 0xD3, 0x94, 0xD3, 0x95, 0xD3, 0x96, /* 0x10-0x13 */ + 0xD3, 0x97, 0xD3, 0x98, 0xD3, 0x99, 0xD3, 0x9A, /* 0x14-0x17 */ + 0xD3, 0x9B, 0xD3, 0x9C, 0xD3, 0x9D, 0xD3, 0x9E, /* 0x18-0x1B */ + 0xD3, 0x9F, 0xD3, 0xA0, 0xD4, 0x40, 0xD4, 0x41, /* 0x1C-0x1F */ + 0xD4, 0x42, 0xD4, 0x43, 0xD4, 0x44, 0xD4, 0x45, /* 0x20-0x23 */ + 0xD4, 0x46, 0xD4, 0x47, 0xD4, 0x48, 0xD4, 0x49, /* 0x24-0x27 */ + 0xD4, 0x4A, 0xD4, 0x4B, 0xD4, 0x4C, 0xD4, 0x4D, /* 0x28-0x2B */ + 0xD4, 0x4E, 0xD4, 0x4F, 0xD4, 0x50, 0xD4, 0x51, /* 0x2C-0x2F */ + 0xD4, 0x52, 0xD4, 0x53, 0xD4, 0x54, 0xD4, 0x55, /* 0x30-0x33 */ + 0xD4, 0x56, 0xD4, 0x57, 0xD4, 0x58, 0xD4, 0x59, /* 0x34-0x37 */ + 0xD4, 0x5A, 0xD4, 0x5B, 0xD4, 0x5C, 0xD4, 0x5D, /* 0x38-0x3B */ + 0xD4, 0x5E, 0xD4, 0x5F, 0xF6, 0xA4, 0xD4, 0x60, /* 0x3C-0x3F */ + 0xD4, 0x61, 0xD4, 0x62, 0xD4, 0x63, 0xD4, 0x64, /* 0x40-0x43 */ + 0xD4, 0x65, 0xD4, 0x66, 0xD4, 0x67, 0xD4, 0x68, /* 0x44-0x47 */ + 0xEE, 0xBA, 0xD4, 0x69, 0xD4, 0x6A, 0xD4, 0x6B, /* 0x48-0x4B */ + 0xD4, 0x6C, 0xD4, 0x6D, 0xD4, 0x6E, 0xD4, 0x6F, /* 0x4C-0x4F */ + 0xD4, 0x70, 0xD4, 0x71, 0xD4, 0x72, 0xD4, 0x73, /* 0x50-0x53 */ + 0xD4, 0x74, 0xD4, 0x75, 0xD4, 0x76, 0xD4, 0x77, /* 0x54-0x57 */ + 0xD4, 0x78, 0xD4, 0x79, 0xD4, 0x7A, 0xD4, 0x7B, /* 0x58-0x5B */ + 0xD4, 0x7C, 0xD4, 0x7D, 0xD4, 0x7E, 0xD4, 0x80, /* 0x5C-0x5F */ + 0xD4, 0x81, 0xD4, 0x82, 0xD4, 0x83, 0xD4, 0x84, /* 0x60-0x63 */ + 0xD4, 0x85, 0xD4, 0x86, 0xD4, 0x87, 0xD4, 0x88, /* 0x64-0x67 */ + 0xD4, 0x89, 0xD4, 0x8A, 0xD4, 0x8B, 0xD4, 0x8C, /* 0x68-0x6B */ + 0xD4, 0x8D, 0xD4, 0x8E, 0xD4, 0x8F, 0xD4, 0x90, /* 0x6C-0x6F */ + 0xD4, 0x91, 0xD4, 0x92, 0xD4, 0x93, 0xD4, 0x94, /* 0x70-0x73 */ + 0xD4, 0x95, 0xD4, 0x96, 0xD4, 0x97, 0xD4, 0x98, /* 0x74-0x77 */ + 0xD4, 0x99, 0xD5, 0xB2, 0xD4, 0x9A, 0xD4, 0x9B, /* 0x78-0x7B */ + 0xD4, 0x9C, 0xD4, 0x9D, 0xD4, 0x9E, 0xD4, 0x9F, /* 0x7C-0x7F */ + + 0xD4, 0xA0, 0xD5, 0x40, 0xD5, 0x41, 0xD5, 0x42, /* 0x80-0x83 */ + 0xD5, 0x43, 0xD5, 0x44, 0xD5, 0x45, 0xD5, 0x46, /* 0x84-0x87 */ + 0xD5, 0x47, 0xD3, 0xFE, 0xCC, 0xDC, 0xD5, 0x48, /* 0x88-0x8B */ + 0xD5, 0x49, 0xD5, 0x4A, 0xD5, 0x4B, 0xD5, 0x4C, /* 0x8C-0x8F */ + 0xD5, 0x4D, 0xD5, 0x4E, 0xD5, 0x4F, 0xCA, 0xC4, /* 0x90-0x93 */ + 0xD5, 0x50, 0xD5, 0x51, 0xD5, 0x52, 0xD5, 0x53, /* 0x94-0x97 */ + 0xD5, 0x54, 0xD5, 0x55, 0xD5, 0x56, 0xD5, 0x57, /* 0x98-0x9B */ + 0xD5, 0x58, 0xD5, 0x59, 0xD5, 0x5A, 0xD5, 0x5B, /* 0x9C-0x9F */ + 0xD5, 0x5C, 0xD5, 0x5D, 0xD5, 0x5E, 0xD5, 0x5F, /* 0xA0-0xA3 */ + 0xD5, 0x60, 0xD5, 0x61, 0xD5, 0x62, 0xD5, 0x63, /* 0xA4-0xA7 */ + 0xD5, 0x64, 0xD5, 0x65, 0xD5, 0x66, 0xD5, 0x67, /* 0xA8-0xAB */ + 0xD5, 0x68, 0xD5, 0x69, 0xD5, 0x6A, 0xD5, 0x6B, /* 0xAC-0xAF */ + 0xD5, 0x6C, 0xD5, 0x6D, 0xD5, 0x6E, 0xD5, 0x6F, /* 0xB0-0xB3 */ + 0xD5, 0x70, 0xD5, 0x71, 0xD5, 0x72, 0xD5, 0x73, /* 0xB4-0xB7 */ + 0xD5, 0x74, 0xD5, 0x75, 0xD5, 0x76, 0xD5, 0x77, /* 0xB8-0xBB */ + 0xD5, 0x78, 0xD5, 0x79, 0xD5, 0x7A, 0xD5, 0x7B, /* 0xBC-0xBF */ + 0xD5, 0x7C, 0xD5, 0x7D, 0xD5, 0x7E, 0xD5, 0x80, /* 0xC0-0xC3 */ + 
0xD5, 0x81, 0xD5, 0x82, 0xD5, 0x83, 0xD5, 0x84, /* 0xC4-0xC7 */ + 0xD5, 0x85, 0xD5, 0x86, 0xD5, 0x87, 0xD5, 0x88, /* 0xC8-0xCB */ + 0xD5, 0x89, 0xD5, 0x8A, 0xD5, 0x8B, 0xD5, 0x8C, /* 0xCC-0xCF */ + 0xD5, 0x8D, 0xD5, 0x8E, 0xD5, 0x8F, 0xD5, 0x90, /* 0xD0-0xD3 */ + 0xD5, 0x91, 0xD5, 0x92, 0xD5, 0x93, 0xD5, 0x94, /* 0xD4-0xD7 */ + 0xD5, 0x95, 0xD5, 0x96, 0xD5, 0x97, 0xD5, 0x98, /* 0xD8-0xDB */ + 0xD5, 0x99, 0xD5, 0x9A, 0xD5, 0x9B, 0xD5, 0x9C, /* 0xDC-0xDF */ + 0xD5, 0x9D, 0xD5, 0x9E, 0xD5, 0x9F, 0xD5, 0xA0, /* 0xE0-0xE3 */ + 0xD6, 0x40, 0xD6, 0x41, 0xD6, 0x42, 0xD6, 0x43, /* 0xE4-0xE7 */ + 0xD6, 0x44, 0xD6, 0x45, 0xD6, 0x46, 0xD6, 0x47, /* 0xE8-0xEB */ + 0xD6, 0x48, 0xD6, 0x49, 0xD6, 0x4A, 0xD6, 0x4B, /* 0xEC-0xEF */ + 0xD6, 0x4C, 0xD6, 0x4D, 0xD6, 0x4E, 0xD6, 0x4F, /* 0xF0-0xF3 */ + 0xD6, 0x50, 0xD6, 0x51, 0xD6, 0x52, 0xD6, 0x53, /* 0xF4-0xF7 */ + 0xD6, 0x54, 0xD6, 0x55, 0xD6, 0x56, 0xD6, 0x57, /* 0xF8-0xFB */ + 0xD6, 0x58, 0xD6, 0x59, 0xD6, 0x5A, 0xD6, 0x5B, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0xD6, 0x5C, 0xD6, 0x5D, 0xD6, 0x5E, 0xD6, 0x5F, /* 0x00-0x03 */ + 0xD6, 0x60, 0xD6, 0x61, 0xD6, 0x62, 0xE5, 0xC0, /* 0x04-0x07 */ + 0xD6, 0x63, 0xD6, 0x64, 0xD6, 0x65, 0xD6, 0x66, /* 0x08-0x0B */ + 0xD6, 0x67, 0xD6, 0x68, 0xD6, 0x69, 0xD6, 0x6A, /* 0x0C-0x0F */ + 0xD6, 0x6B, 0xD6, 0x6C, 0xD6, 0x6D, 0xD6, 0x6E, /* 0x10-0x13 */ + 0xD6, 0x6F, 0xD6, 0x70, 0xD6, 0x71, 0xD6, 0x72, /* 0x14-0x17 */ + 0xD6, 0x73, 0xD6, 0x74, 0xD6, 0x75, 0xD6, 0x76, /* 0x18-0x1B */ + 0xD6, 0x77, 0xD6, 0x78, 0xD6, 0x79, 0xD6, 0x7A, /* 0x1C-0x1F */ + 0xD6, 0x7B, 0xD6, 0x7C, 0xD6, 0x7D, 0xD6, 0x7E, /* 0x20-0x23 */ + 0xD6, 0x80, 0xD6, 0x81, 0xF6, 0xA5, 0xD6, 0x82, /* 0x24-0x27 */ + 0xD6, 0x83, 0xD6, 0x84, 0xD6, 0x85, 0xD6, 0x86, /* 0x28-0x2B */ + 0xD6, 0x87, 0xD6, 0x88, 0xD6, 0x89, 0xD6, 0x8A, /* 0x2C-0x2F */ + 0xD6, 0x8B, 0xD6, 0x8C, 0xD6, 0x8D, 0xD6, 0x8E, /* 0x30-0x33 */ + 0xD6, 0x8F, 0xD6, 0x90, 0xD6, 0x91, 0xD6, 0x92, /* 0x34-0x37 */ + 0xD6, 0x93, 0xD6, 0x94, 0xD6, 0x95, 0xD6, 0x96, /* 0x38-0x3B */ + 0xD6, 0x97, 0xD6, 0x98, 0xD6, 0x99, 0xD6, 0x9A, /* 0x3C-0x3F */ + 0xD6, 0x9B, 0xD6, 0x9C, 0xD6, 0x9D, 0xD6, 0x9E, /* 0x40-0x43 */ + 0xD6, 0x9F, 0xD6, 0xA0, 0xD7, 0x40, 0xD7, 0x41, /* 0x44-0x47 */ + 0xD7, 0x42, 0xD7, 0x43, 0xD7, 0x44, 0xD7, 0x45, /* 0x48-0x4B */ + 0xD7, 0x46, 0xD7, 0x47, 0xD7, 0x48, 0xD7, 0x49, /* 0x4C-0x4F */ + 0xD7, 0x4A, 0xD7, 0x4B, 0xD7, 0x4C, 0xD7, 0x4D, /* 0x50-0x53 */ + 0xD7, 0x4E, 0xD7, 0x4F, 0xD7, 0x50, 0xD7, 0x51, /* 0x54-0x57 */ + 0xD7, 0x52, 0xD7, 0x53, 0xD7, 0x54, 0xD7, 0x55, /* 0x58-0x5B */ + 0xD7, 0x56, 0xD7, 0x57, 0xD7, 0x58, 0xD7, 0x59, /* 0x5C-0x5F */ + 0xD7, 0x5A, 0xD7, 0x5B, 0xD7, 0x5C, 0xD7, 0x5D, /* 0x60-0x63 */ + 0xD7, 0x5E, 0xD7, 0x5F, 0xBE, 0xAF, 0xD7, 0x60, /* 0x64-0x67 */ + 0xD7, 0x61, 0xD7, 0x62, 0xD7, 0x63, 0xD7, 0x64, /* 0x68-0x6B */ + 0xC6, 0xA9, 0xD7, 0x65, 0xD7, 0x66, 0xD7, 0x67, /* 0x6C-0x6F */ + 0xD7, 0x68, 0xD7, 0x69, 0xD7, 0x6A, 0xD7, 0x6B, /* 0x70-0x73 */ + 0xD7, 0x6C, 0xD7, 0x6D, 0xD7, 0x6E, 0xD7, 0x6F, /* 0x74-0x77 */ + 0xD7, 0x70, 0xD7, 0x71, 0xD7, 0x72, 0xD7, 0x73, /* 0x78-0x7B */ + 0xD7, 0x74, 0xD7, 0x75, 0xD7, 0x76, 0xD7, 0x77, /* 0x7C-0x7F */ + + 0xD7, 0x78, 0xD7, 0x79, 0xD7, 0x7A, 0xD7, 0x7B, /* 0x80-0x83 */ + 0xD7, 0x7C, 0xD7, 0x7D, 0xD7, 0x7E, 0xD7, 0x80, /* 0x84-0x87 */ + 0xD7, 0x81, 0xD7, 0x82, 0xD7, 0x83, 0xD7, 0x84, /* 0x88-0x8B */ + 0xD7, 0x85, 0xD7, 0x86, 0xD7, 0x87, 0xD7, 0x88, /* 0x8C-0x8F */ + 0xD7, 0x89, 0xD7, 0x8A, 0xD7, 0x8B, 0xD7, 0x8C, /* 0x90-0x93 */ + 0xD7, 0x8D, 0xD7, 0x8E, 0xD7, 0x8F, 0xD7, 0x90, /* 0x94-0x97 */ + 
0xD7, 0x91, 0xD7, 0x92, 0xD7, 0x93, 0xD7, 0x94, /* 0x98-0x9B */ + 0xD7, 0x95, 0xD7, 0x96, 0xD7, 0x97, 0xD7, 0x98, /* 0x9C-0x9F */ + 0xDA, 0xA5, 0xBC, 0xC6, 0xB6, 0xA9, 0xB8, 0xBC, /* 0xA0-0xA3 */ + 0xC8, 0xCF, 0xBC, 0xA5, 0xDA, 0xA6, 0xDA, 0xA7, /* 0xA4-0xA7 */ + 0xCC, 0xD6, 0xC8, 0xC3, 0xDA, 0xA8, 0xC6, 0xFD, /* 0xA8-0xAB */ + 0xD7, 0x99, 0xD1, 0xB5, 0xD2, 0xE9, 0xD1, 0xB6, /* 0xAC-0xAF */ + 0xBC, 0xC7, 0xD7, 0x9A, 0xBD, 0xB2, 0xBB, 0xE4, /* 0xB0-0xB3 */ + 0xDA, 0xA9, 0xDA, 0xAA, 0xD1, 0xC8, 0xDA, 0xAB, /* 0xB4-0xB7 */ + 0xD0, 0xED, 0xB6, 0xEF, 0xC2, 0xDB, 0xD7, 0x9B, /* 0xB8-0xBB */ + 0xCB, 0xCF, 0xB7, 0xED, 0xC9, 0xE8, 0xB7, 0xC3, /* 0xBC-0xBF */ + 0xBE, 0xF7, 0xD6, 0xA4, 0xDA, 0xAC, 0xDA, 0xAD, /* 0xC0-0xC3 */ + 0xC6, 0xC0, 0xD7, 0xE7, 0xCA, 0xB6, 0xD7, 0x9C, /* 0xC4-0xC7 */ + 0xD5, 0xA9, 0xCB, 0xDF, 0xD5, 0xEF, 0xDA, 0xAE, /* 0xC8-0xCB */ + 0xD6, 0xDF, 0xB4, 0xCA, 0xDA, 0xB0, 0xDA, 0xAF, /* 0xCC-0xCF */ + 0xD7, 0x9D, 0xD2, 0xEB, 0xDA, 0xB1, 0xDA, 0xB2, /* 0xD0-0xD3 */ + 0xDA, 0xB3, 0xCA, 0xD4, 0xDA, 0xB4, 0xCA, 0xAB, /* 0xD4-0xD7 */ + 0xDA, 0xB5, 0xDA, 0xB6, 0xB3, 0xCF, 0xD6, 0xEF, /* 0xD8-0xDB */ + 0xDA, 0xB7, 0xBB, 0xB0, 0xB5, 0xAE, 0xDA, 0xB8, /* 0xDC-0xDF */ + 0xDA, 0xB9, 0xB9, 0xEE, 0xD1, 0xAF, 0xD2, 0xE8, /* 0xE0-0xE3 */ + 0xDA, 0xBA, 0xB8, 0xC3, 0xCF, 0xEA, 0xB2, 0xEF, /* 0xE4-0xE7 */ + 0xDA, 0xBB, 0xDA, 0xBC, 0xD7, 0x9E, 0xBD, 0xEB, /* 0xE8-0xEB */ + 0xCE, 0xDC, 0xD3, 0xEF, 0xDA, 0xBD, 0xCE, 0xF3, /* 0xEC-0xEF */ + 0xDA, 0xBE, 0xD3, 0xD5, 0xBB, 0xE5, 0xDA, 0xBF, /* 0xF0-0xF3 */ + 0xCB, 0xB5, 0xCB, 0xD0, 0xDA, 0xC0, 0xC7, 0xEB, /* 0xF4-0xF7 */ + 0xD6, 0xEE, 0xDA, 0xC1, 0xC5, 0xB5, 0xB6, 0xC1, /* 0xF8-0xFB */ + 0xDA, 0xC2, 0xB7, 0xCC, 0xBF, 0xCE, 0xDA, 0xC3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8C[512] = { + 0xDA, 0xC4, 0xCB, 0xAD, 0xDA, 0xC5, 0xB5, 0xF7, /* 0x00-0x03 */ + 0xDA, 0xC6, 0xC1, 0xC2, 0xD7, 0xBB, 0xDA, 0xC7, /* 0x04-0x07 */ + 0xCC, 0xB8, 0xD7, 0x9F, 0xD2, 0xEA, 0xC4, 0xB1, /* 0x08-0x0B */ + 0xDA, 0xC8, 0xB5, 0xFD, 0xBB, 0xD1, 0xDA, 0xC9, /* 0x0C-0x0F */ + 0xD0, 0xB3, 0xDA, 0xCA, 0xDA, 0xCB, 0xCE, 0xBD, /* 0x10-0x13 */ + 0xDA, 0xCC, 0xDA, 0xCD, 0xDA, 0xCE, 0xB2, 0xF7, /* 0x14-0x17 */ + 0xDA, 0xD1, 0xDA, 0xCF, 0xD1, 0xE8, 0xDA, 0xD0, /* 0x18-0x1B */ + 0xC3, 0xD5, 0xDA, 0xD2, 0xD7, 0xA0, 0xDA, 0xD3, /* 0x1C-0x1F */ + 0xDA, 0xD4, 0xDA, 0xD5, 0xD0, 0xBB, 0xD2, 0xA5, /* 0x20-0x23 */ + 0xB0, 0xF9, 0xDA, 0xD6, 0xC7, 0xAB, 0xDA, 0xD7, /* 0x24-0x27 */ + 0xBD, 0xF7, 0xC3, 0xA1, 0xDA, 0xD8, 0xDA, 0xD9, /* 0x28-0x2B */ + 0xC3, 0xFD, 0xCC, 0xB7, 0xDA, 0xDA, 0xDA, 0xDB, /* 0x2C-0x2F */ + 0xC0, 0xBE, 0xC6, 0xD7, 0xDA, 0xDC, 0xDA, 0xDD, /* 0x30-0x33 */ + 0xC7, 0xB4, 0xDA, 0xDE, 0xDA, 0xDF, 0xB9, 0xC8, /* 0x34-0x37 */ + 0xD8, 0x40, 0xD8, 0x41, 0xD8, 0x42, 0xD8, 0x43, /* 0x38-0x3B */ + 0xD8, 0x44, 0xD8, 0x45, 0xD8, 0x46, 0xD8, 0x47, /* 0x3C-0x3F */ + 0xD8, 0x48, 0xBB, 0xED, 0xD8, 0x49, 0xD8, 0x4A, /* 0x40-0x43 */ + 0xD8, 0x4B, 0xD8, 0x4C, 0xB6, 0xB9, 0xF4, 0xF8, /* 0x44-0x47 */ + 0xD8, 0x4D, 0xF4, 0xF9, 0xD8, 0x4E, 0xD8, 0x4F, /* 0x48-0x4B */ + 0xCD, 0xE3, 0xD8, 0x50, 0xD8, 0x51, 0xD8, 0x52, /* 0x4C-0x4F */ + 0xD8, 0x53, 0xD8, 0x54, 0xD8, 0x55, 0xD8, 0x56, /* 0x50-0x53 */ + 0xD8, 0x57, 0xF5, 0xB9, 0xD8, 0x58, 0xD8, 0x59, /* 0x54-0x57 */ + 0xD8, 0x5A, 0xD8, 0x5B, 0xEB, 0xE0, 0xD8, 0x5C, /* 0x58-0x5B */ + 0xD8, 0x5D, 0xD8, 0x5E, 0xD8, 0x5F, 0xD8, 0x60, /* 0x5C-0x5F */ + 0xD8, 0x61, 0xCF, 0xF3, 0xBB, 0xBF, 0xD8, 0x62, /* 0x60-0x63 */ + 0xD8, 0x63, 0xD8, 0x64, 0xD8, 0x65, 0xD8, 0x66, /* 0x64-0x67 */ + 0xD8, 0x67, 0xD8, 0x68, 0xBA, 0xC0, 0xD4, 0xA5, /* 0x68-0x6B */ + 0xD8, 
0x69, 0xD8, 0x6A, 0xD8, 0x6B, 0xD8, 0x6C, /* 0x6C-0x6F */ + 0xD8, 0x6D, 0xD8, 0x6E, 0xD8, 0x6F, 0xE1, 0xD9, /* 0x70-0x73 */ + 0xD8, 0x70, 0xD8, 0x71, 0xD8, 0x72, 0xD8, 0x73, /* 0x74-0x77 */ + 0xF5, 0xF4, 0xB1, 0xAA, 0xB2, 0xF2, 0xD8, 0x74, /* 0x78-0x7B */ + 0xD8, 0x75, 0xD8, 0x76, 0xD8, 0x77, 0xD8, 0x78, /* 0x7C-0x7F */ + + 0xD8, 0x79, 0xD8, 0x7A, 0xF5, 0xF5, 0xD8, 0x7B, /* 0x80-0x83 */ + 0xD8, 0x7C, 0xF5, 0xF7, 0xD8, 0x7D, 0xD8, 0x7E, /* 0x84-0x87 */ + 0xD8, 0x80, 0xBA, 0xD1, 0xF5, 0xF6, 0xD8, 0x81, /* 0x88-0x8B */ + 0xC3, 0xB2, 0xD8, 0x82, 0xD8, 0x83, 0xD8, 0x84, /* 0x8C-0x8F */ + 0xD8, 0x85, 0xD8, 0x86, 0xD8, 0x87, 0xD8, 0x88, /* 0x90-0x93 */ + 0xF5, 0xF9, 0xD8, 0x89, 0xD8, 0x8A, 0xD8, 0x8B, /* 0x94-0x97 */ + 0xF5, 0xF8, 0xD8, 0x8C, 0xD8, 0x8D, 0xD8, 0x8E, /* 0x98-0x9B */ + 0xD8, 0x8F, 0xD8, 0x90, 0xD8, 0x91, 0xD8, 0x92, /* 0x9C-0x9F */ + 0xD8, 0x93, 0xD8, 0x94, 0xD8, 0x95, 0xD8, 0x96, /* 0xA0-0xA3 */ + 0xD8, 0x97, 0xD8, 0x98, 0xD8, 0x99, 0xD8, 0x9A, /* 0xA4-0xA7 */ + 0xD8, 0x9B, 0xD8, 0x9C, 0xD8, 0x9D, 0xD8, 0x9E, /* 0xA8-0xAB */ + 0xD8, 0x9F, 0xD8, 0xA0, 0xD9, 0x40, 0xD9, 0x41, /* 0xAC-0xAF */ + 0xD9, 0x42, 0xD9, 0x43, 0xD9, 0x44, 0xD9, 0x45, /* 0xB0-0xB3 */ + 0xD9, 0x46, 0xD9, 0x47, 0xD9, 0x48, 0xD9, 0x49, /* 0xB4-0xB7 */ + 0xD9, 0x4A, 0xD9, 0x4B, 0xD9, 0x4C, 0xD9, 0x4D, /* 0xB8-0xBB */ + 0xD9, 0x4E, 0xD9, 0x4F, 0xD9, 0x50, 0xD9, 0x51, /* 0xBC-0xBF */ + 0xD9, 0x52, 0xD9, 0x53, 0xD9, 0x54, 0xD9, 0x55, /* 0xC0-0xC3 */ + 0xD9, 0x56, 0xD9, 0x57, 0xD9, 0x58, 0xD9, 0x59, /* 0xC4-0xC7 */ + 0xD9, 0x5A, 0xD9, 0x5B, 0xD9, 0x5C, 0xD9, 0x5D, /* 0xC8-0xCB */ + 0xD9, 0x5E, 0xD9, 0x5F, 0xD9, 0x60, 0xD9, 0x61, /* 0xCC-0xCF */ + 0xD9, 0x62, 0xD9, 0x63, 0xD9, 0x64, 0xD9, 0x65, /* 0xD0-0xD3 */ + 0xD9, 0x66, 0xD9, 0x67, 0xD9, 0x68, 0xD9, 0x69, /* 0xD4-0xD7 */ + 0xD9, 0x6A, 0xD9, 0x6B, 0xD9, 0x6C, 0xD9, 0x6D, /* 0xD8-0xDB */ + 0xD9, 0x6E, 0xD9, 0x6F, 0xD9, 0x70, 0xD9, 0x71, /* 0xDC-0xDF */ + 0xD9, 0x72, 0xD9, 0x73, 0xD9, 0x74, 0xD9, 0x75, /* 0xE0-0xE3 */ + 0xD9, 0x76, 0xD9, 0x77, 0xD9, 0x78, 0xD9, 0x79, /* 0xE4-0xE7 */ + 0xD9, 0x7A, 0xD9, 0x7B, 0xD9, 0x7C, 0xD9, 0x7D, /* 0xE8-0xEB */ + 0xD9, 0x7E, 0xD9, 0x80, 0xD9, 0x81, 0xD9, 0x82, /* 0xEC-0xEF */ + 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x86, /* 0xF0-0xF3 */ + 0xD9, 0x87, 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x8A, /* 0xF4-0xF7 */ + 0xD9, 0x8B, 0xD9, 0x8C, 0xD9, 0x8D, 0xD9, 0x8E, /* 0xF8-0xFB */ + 0xD9, 0x8F, 0xD9, 0x90, 0xD9, 0x91, 0xD9, 0x92, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0xD9, 0x93, 0xD9, 0x94, 0xD9, 0x95, 0xD9, 0x96, /* 0x00-0x03 */ + 0xD9, 0x97, 0xD9, 0x98, 0xD9, 0x99, 0xD9, 0x9A, /* 0x04-0x07 */ + 0xD9, 0x9B, 0xD9, 0x9C, 0xD9, 0x9D, 0xD9, 0x9E, /* 0x08-0x0B */ + 0xD9, 0x9F, 0xD9, 0xA0, 0xDA, 0x40, 0xDA, 0x41, /* 0x0C-0x0F */ + 0xDA, 0x42, 0xDA, 0x43, 0xDA, 0x44, 0xDA, 0x45, /* 0x10-0x13 */ + 0xDA, 0x46, 0xDA, 0x47, 0xDA, 0x48, 0xDA, 0x49, /* 0x14-0x17 */ + 0xDA, 0x4A, 0xDA, 0x4B, 0xDA, 0x4C, 0xDA, 0x4D, /* 0x18-0x1B */ + 0xDA, 0x4E, 0xB1, 0xB4, 0xD5, 0xEA, 0xB8, 0xBA, /* 0x1C-0x1F */ + 0xDA, 0x4F, 0xB9, 0xB1, 0xB2, 0xC6, 0xD4, 0xF0, /* 0x20-0x23 */ + 0xCF, 0xCD, 0xB0, 0xDC, 0xD5, 0xCB, 0xBB, 0xF5, /* 0x24-0x27 */ + 0xD6, 0xCA, 0xB7, 0xB7, 0xCC, 0xB0, 0xC6, 0xB6, /* 0x28-0x2B */ + 0xB1, 0xE1, 0xB9, 0xBA, 0xD6, 0xFC, 0xB9, 0xE1, /* 0x2C-0x2F */ + 0xB7, 0xA1, 0xBC, 0xFA, 0xEA, 0xDA, 0xEA, 0xDB, /* 0x30-0x33 */ + 0xCC, 0xF9, 0xB9, 0xF3, 0xEA, 0xDC, 0xB4, 0xFB, /* 0x34-0x37 */ + 0xC3, 0xB3, 0xB7, 0xD1, 0xBA, 0xD8, 0xEA, 0xDD, /* 0x38-0x3B */ + 0xD4, 0xF4, 0xEA, 0xDE, 0xBC, 0xD6, 0xBB, 0xDF, /* 0x3C-0x3F */ + 0xEA, 
0xDF, 0xC1, 0xDE, 0xC2, 0xB8, 0xD4, 0xDF, /* 0x40-0x43 */ + 0xD7, 0xCA, 0xEA, 0xE0, 0xEA, 0xE1, 0xEA, 0xE4, /* 0x44-0x47 */ + 0xEA, 0xE2, 0xEA, 0xE3, 0xC9, 0xDE, 0xB8, 0xB3, /* 0x48-0x4B */ + 0xB6, 0xC4, 0xEA, 0xE5, 0xCA, 0xEA, 0xC9, 0xCD, /* 0x4C-0x4F */ + 0xB4, 0xCD, 0xDA, 0x50, 0xDA, 0x51, 0xE2, 0xD9, /* 0x50-0x53 */ + 0xC5, 0xE2, 0xEA, 0xE6, 0xC0, 0xB5, 0xDA, 0x52, /* 0x54-0x57 */ + 0xD7, 0xB8, 0xEA, 0xE7, 0xD7, 0xAC, 0xC8, 0xFC, /* 0x58-0x5B */ + 0xD8, 0xD3, 0xD8, 0xCD, 0xD4, 0xDE, 0xDA, 0x53, /* 0x5C-0x5F */ + 0xD4, 0xF9, 0xC9, 0xC4, 0xD3, 0xAE, 0xB8, 0xD3, /* 0x60-0x63 */ + 0xB3, 0xE0, 0xDA, 0x54, 0xC9, 0xE2, 0xF4, 0xF6, /* 0x64-0x67 */ + 0xDA, 0x55, 0xDA, 0x56, 0xDA, 0x57, 0xBA, 0xD5, /* 0x68-0x6B */ + 0xDA, 0x58, 0xF4, 0xF7, 0xDA, 0x59, 0xDA, 0x5A, /* 0x6C-0x6F */ + 0xD7, 0xDF, 0xDA, 0x5B, 0xDA, 0x5C, 0xF4, 0xF1, /* 0x70-0x73 */ + 0xB8, 0xB0, 0xD5, 0xD4, 0xB8, 0xCF, 0xC6, 0xF0, /* 0x74-0x77 */ + 0xDA, 0x5D, 0xDA, 0x5E, 0xDA, 0x5F, 0xDA, 0x60, /* 0x78-0x7B */ + 0xDA, 0x61, 0xDA, 0x62, 0xDA, 0x63, 0xDA, 0x64, /* 0x7C-0x7F */ + + 0xDA, 0x65, 0xB3, 0xC3, 0xDA, 0x66, 0xDA, 0x67, /* 0x80-0x83 */ + 0xF4, 0xF2, 0xB3, 0xAC, 0xDA, 0x68, 0xDA, 0x69, /* 0x84-0x87 */ + 0xDA, 0x6A, 0xDA, 0x6B, 0xD4, 0xBD, 0xC7, 0xF7, /* 0x88-0x8B */ + 0xDA, 0x6C, 0xDA, 0x6D, 0xDA, 0x6E, 0xDA, 0x6F, /* 0x8C-0x8F */ + 0xDA, 0x70, 0xF4, 0xF4, 0xDA, 0x71, 0xDA, 0x72, /* 0x90-0x93 */ + 0xF4, 0xF3, 0xDA, 0x73, 0xDA, 0x74, 0xDA, 0x75, /* 0x94-0x97 */ + 0xDA, 0x76, 0xDA, 0x77, 0xDA, 0x78, 0xDA, 0x79, /* 0x98-0x9B */ + 0xDA, 0x7A, 0xDA, 0x7B, 0xDA, 0x7C, 0xCC, 0xCB, /* 0x9C-0x9F */ + 0xDA, 0x7D, 0xDA, 0x7E, 0xDA, 0x80, 0xC8, 0xA4, /* 0xA0-0xA3 */ + 0xDA, 0x81, 0xDA, 0x82, 0xDA, 0x83, 0xDA, 0x84, /* 0xA4-0xA7 */ + 0xDA, 0x85, 0xDA, 0x86, 0xDA, 0x87, 0xDA, 0x88, /* 0xA8-0xAB */ + 0xDA, 0x89, 0xDA, 0x8A, 0xDA, 0x8B, 0xDA, 0x8C, /* 0xAC-0xAF */ + 0xDA, 0x8D, 0xF4, 0xF5, 0xDA, 0x8E, 0xD7, 0xE3, /* 0xB0-0xB3 */ + 0xC5, 0xBF, 0xF5, 0xC0, 0xDA, 0x8F, 0xDA, 0x90, /* 0xB4-0xB7 */ + 0xF5, 0xBB, 0xDA, 0x91, 0xF5, 0xC3, 0xDA, 0x92, /* 0xB8-0xBB */ + 0xF5, 0xC2, 0xDA, 0x93, 0xD6, 0xBA, 0xF5, 0xC1, /* 0xBC-0xBF */ + 0xDA, 0x94, 0xDA, 0x95, 0xDA, 0x96, 0xD4, 0xBE, /* 0xC0-0xC3 */ + 0xF5, 0xC4, 0xDA, 0x97, 0xF5, 0xCC, 0xDA, 0x98, /* 0xC4-0xC7 */ + 0xDA, 0x99, 0xDA, 0x9A, 0xDA, 0x9B, 0xB0, 0xCF, /* 0xC8-0xCB */ + 0xB5, 0xF8, 0xDA, 0x9C, 0xF5, 0xC9, 0xF5, 0xCA, /* 0xCC-0xCF */ + 0xDA, 0x9D, 0xC5, 0xDC, 0xDA, 0x9E, 0xDA, 0x9F, /* 0xD0-0xD3 */ + 0xDA, 0xA0, 0xDB, 0x40, 0xF5, 0xC5, 0xF5, 0xC6, /* 0xD4-0xD7 */ + 0xDB, 0x41, 0xDB, 0x42, 0xF5, 0xC7, 0xF5, 0xCB, /* 0xD8-0xDB */ + 0xDB, 0x43, 0xBE, 0xE0, 0xF5, 0xC8, 0xB8, 0xFA, /* 0xDC-0xDF */ + 0xDB, 0x44, 0xDB, 0x45, 0xDB, 0x46, 0xF5, 0xD0, /* 0xE0-0xE3 */ + 0xF5, 0xD3, 0xDB, 0x47, 0xDB, 0x48, 0xDB, 0x49, /* 0xE4-0xE7 */ + 0xBF, 0xE7, 0xDB, 0x4A, 0xB9, 0xF2, 0xF5, 0xBC, /* 0xE8-0xEB */ + 0xF5, 0xCD, 0xDB, 0x4B, 0xDB, 0x4C, 0xC2, 0xB7, /* 0xEC-0xEF */ + 0xDB, 0x4D, 0xDB, 0x4E, 0xDB, 0x4F, 0xCC, 0xF8, /* 0xF0-0xF3 */ + 0xDB, 0x50, 0xBC, 0xF9, 0xDB, 0x51, 0xF5, 0xCE, /* 0xF4-0xF7 */ + 0xF5, 0xCF, 0xF5, 0xD1, 0xB6, 0xE5, 0xF5, 0xD2, /* 0xF8-0xFB */ + 0xDB, 0x52, 0xF5, 0xD5, 0xDB, 0x53, 0xDB, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8E[512] = { + 0xDB, 0x55, 0xDB, 0x56, 0xDB, 0x57, 0xDB, 0x58, /* 0x00-0x03 */ + 0xDB, 0x59, 0xF5, 0xBD, 0xDB, 0x5A, 0xDB, 0x5B, /* 0x04-0x07 */ + 0xDB, 0x5C, 0xF5, 0xD4, 0xD3, 0xBB, 0xDB, 0x5D, /* 0x08-0x0B */ + 0xB3, 0xEC, 0xDB, 0x5E, 0xDB, 0x5F, 0xCC, 0xA4, /* 0x0C-0x0F */ + 0xDB, 0x60, 0xDB, 0x61, 0xDB, 0x62, 0xDB, 0x63, /* 0x10-0x13 */ + 0xF5, 
0xD6, 0xDB, 0x64, 0xDB, 0x65, 0xDB, 0x66, /* 0x14-0x17 */ + 0xDB, 0x67, 0xDB, 0x68, 0xDB, 0x69, 0xDB, 0x6A, /* 0x18-0x1B */ + 0xDB, 0x6B, 0xF5, 0xD7, 0xBE, 0xE1, 0xF5, 0xD8, /* 0x1C-0x1F */ + 0xDB, 0x6C, 0xDB, 0x6D, 0xCC, 0xDF, 0xF5, 0xDB, /* 0x20-0x23 */ + 0xDB, 0x6E, 0xDB, 0x6F, 0xDB, 0x70, 0xDB, 0x71, /* 0x24-0x27 */ + 0xDB, 0x72, 0xB2, 0xC8, 0xD7, 0xD9, 0xDB, 0x73, /* 0x28-0x2B */ + 0xF5, 0xD9, 0xDB, 0x74, 0xF5, 0xDA, 0xF5, 0xDC, /* 0x2C-0x2F */ + 0xDB, 0x75, 0xF5, 0xE2, 0xDB, 0x76, 0xDB, 0x77, /* 0x30-0x33 */ + 0xDB, 0x78, 0xF5, 0xE0, 0xDB, 0x79, 0xDB, 0x7A, /* 0x34-0x37 */ + 0xDB, 0x7B, 0xF5, 0xDF, 0xF5, 0xDD, 0xDB, 0x7C, /* 0x38-0x3B */ + 0xDB, 0x7D, 0xF5, 0xE1, 0xDB, 0x7E, 0xDB, 0x80, /* 0x3C-0x3F */ + 0xF5, 0xDE, 0xF5, 0xE4, 0xF5, 0xE5, 0xDB, 0x81, /* 0x40-0x43 */ + 0xCC, 0xE3, 0xDB, 0x82, 0xDB, 0x83, 0xE5, 0xBF, /* 0x44-0x47 */ + 0xB5, 0xB8, 0xF5, 0xE3, 0xF5, 0xE8, 0xCC, 0xA3, /* 0x48-0x4B */ + 0xDB, 0x84, 0xDB, 0x85, 0xDB, 0x86, 0xDB, 0x87, /* 0x4C-0x4F */ + 0xDB, 0x88, 0xF5, 0xE6, 0xF5, 0xE7, 0xDB, 0x89, /* 0x50-0x53 */ + 0xDB, 0x8A, 0xDB, 0x8B, 0xDB, 0x8C, 0xDB, 0x8D, /* 0x54-0x57 */ + 0xDB, 0x8E, 0xF5, 0xBE, 0xDB, 0x8F, 0xDB, 0x90, /* 0x58-0x5B */ + 0xDB, 0x91, 0xDB, 0x92, 0xDB, 0x93, 0xDB, 0x94, /* 0x5C-0x5F */ + 0xDB, 0x95, 0xDB, 0x96, 0xDB, 0x97, 0xDB, 0x98, /* 0x60-0x63 */ + 0xDB, 0x99, 0xDB, 0x9A, 0xB1, 0xC4, 0xDB, 0x9B, /* 0x64-0x67 */ + 0xDB, 0x9C, 0xF5, 0xBF, 0xDB, 0x9D, 0xDB, 0x9E, /* 0x68-0x6B */ + 0xB5, 0xC5, 0xB2, 0xE4, 0xDB, 0x9F, 0xF5, 0xEC, /* 0x6C-0x6F */ + 0xF5, 0xE9, 0xDB, 0xA0, 0xB6, 0xD7, 0xDC, 0x40, /* 0x70-0x73 */ + 0xF5, 0xED, 0xDC, 0x41, 0xF5, 0xEA, 0xDC, 0x42, /* 0x74-0x77 */ + 0xDC, 0x43, 0xDC, 0x44, 0xDC, 0x45, 0xDC, 0x46, /* 0x78-0x7B */ + 0xF5, 0xEB, 0xDC, 0x47, 0xDC, 0x48, 0xB4, 0xDA, /* 0x7C-0x7F */ + + 0xDC, 0x49, 0xD4, 0xEA, 0xDC, 0x4A, 0xDC, 0x4B, /* 0x80-0x83 */ + 0xDC, 0x4C, 0xF5, 0xEE, 0xDC, 0x4D, 0xB3, 0xF9, /* 0x84-0x87 */ + 0xDC, 0x4E, 0xDC, 0x4F, 0xDC, 0x50, 0xDC, 0x51, /* 0x88-0x8B */ + 0xDC, 0x52, 0xDC, 0x53, 0xDC, 0x54, 0xF5, 0xEF, /* 0x8C-0x8F */ + 0xF5, 0xF1, 0xDC, 0x55, 0xDC, 0x56, 0xDC, 0x57, /* 0x90-0x93 */ + 0xF5, 0xF0, 0xDC, 0x58, 0xDC, 0x59, 0xDC, 0x5A, /* 0x94-0x97 */ + 0xDC, 0x5B, 0xDC, 0x5C, 0xDC, 0x5D, 0xDC, 0x5E, /* 0x98-0x9B */ + 0xF5, 0xF2, 0xDC, 0x5F, 0xF5, 0xF3, 0xDC, 0x60, /* 0x9C-0x9F */ + 0xDC, 0x61, 0xDC, 0x62, 0xDC, 0x63, 0xDC, 0x64, /* 0xA0-0xA3 */ + 0xDC, 0x65, 0xDC, 0x66, 0xDC, 0x67, 0xDC, 0x68, /* 0xA4-0xA7 */ + 0xDC, 0x69, 0xDC, 0x6A, 0xDC, 0x6B, 0xC9, 0xED, /* 0xA8-0xAB */ + 0xB9, 0xAA, 0xDC, 0x6C, 0xDC, 0x6D, 0xC7, 0xFB, /* 0xAC-0xAF */ + 0xDC, 0x6E, 0xDC, 0x6F, 0xB6, 0xE3, 0xDC, 0x70, /* 0xB0-0xB3 */ + 0xDC, 0x71, 0xDC, 0x72, 0xDC, 0x73, 0xDC, 0x74, /* 0xB4-0xB7 */ + 0xDC, 0x75, 0xDC, 0x76, 0xCC, 0xC9, 0xDC, 0x77, /* 0xB8-0xBB */ + 0xDC, 0x78, 0xDC, 0x79, 0xDC, 0x7A, 0xDC, 0x7B, /* 0xBC-0xBF */ + 0xDC, 0x7C, 0xDC, 0x7D, 0xDC, 0x7E, 0xDC, 0x80, /* 0xC0-0xC3 */ + 0xDC, 0x81, 0xDC, 0x82, 0xDC, 0x83, 0xDC, 0x84, /* 0xC4-0xC7 */ + 0xDC, 0x85, 0xDC, 0x86, 0xDC, 0x87, 0xDC, 0x88, /* 0xC8-0xCB */ + 0xDC, 0x89, 0xDC, 0x8A, 0xEA, 0xA6, 0xDC, 0x8B, /* 0xCC-0xCF */ + 0xDC, 0x8C, 0xDC, 0x8D, 0xDC, 0x8E, 0xDC, 0x8F, /* 0xD0-0xD3 */ + 0xDC, 0x90, 0xDC, 0x91, 0xDC, 0x92, 0xDC, 0x93, /* 0xD4-0xD7 */ + 0xDC, 0x94, 0xDC, 0x95, 0xDC, 0x96, 0xDC, 0x97, /* 0xD8-0xDB */ + 0xDC, 0x98, 0xDC, 0x99, 0xDC, 0x9A, 0xDC, 0x9B, /* 0xDC-0xDF */ + 0xDC, 0x9C, 0xDC, 0x9D, 0xDC, 0x9E, 0xDC, 0x9F, /* 0xE0-0xE3 */ + 0xDC, 0xA0, 0xDD, 0x40, 0xDD, 0x41, 0xDD, 0x42, /* 0xE4-0xE7 */ + 0xDD, 0x43, 0xDD, 0x44, 0xDD, 0x45, 0xDD, 0x46, /* 0xE8-0xEB 
*/ + 0xDD, 0x47, 0xDD, 0x48, 0xDD, 0x49, 0xDD, 0x4A, /* 0xEC-0xEF */ + 0xDD, 0x4B, 0xDD, 0x4C, 0xDD, 0x4D, 0xDD, 0x4E, /* 0xF0-0xF3 */ + 0xDD, 0x4F, 0xDD, 0x50, 0xDD, 0x51, 0xDD, 0x52, /* 0xF4-0xF7 */ + 0xDD, 0x53, 0xDD, 0x54, 0xDD, 0x55, 0xDD, 0x56, /* 0xF8-0xFB */ + 0xDD, 0x57, 0xDD, 0x58, 0xDD, 0x59, 0xDD, 0x5A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0xDD, 0x5B, 0xDD, 0x5C, 0xDD, 0x5D, 0xDD, 0x5E, /* 0x00-0x03 */ + 0xDD, 0x5F, 0xDD, 0x60, 0xDD, 0x61, 0xDD, 0x62, /* 0x04-0x07 */ + 0xDD, 0x63, 0xDD, 0x64, 0xDD, 0x65, 0xDD, 0x66, /* 0x08-0x0B */ + 0xDD, 0x67, 0xDD, 0x68, 0xDD, 0x69, 0xDD, 0x6A, /* 0x0C-0x0F */ + 0xDD, 0x6B, 0xDD, 0x6C, 0xDD, 0x6D, 0xDD, 0x6E, /* 0x10-0x13 */ + 0xDD, 0x6F, 0xDD, 0x70, 0xDD, 0x71, 0xDD, 0x72, /* 0x14-0x17 */ + 0xDD, 0x73, 0xDD, 0x74, 0xDD, 0x75, 0xDD, 0x76, /* 0x18-0x1B */ + 0xDD, 0x77, 0xDD, 0x78, 0xDD, 0x79, 0xDD, 0x7A, /* 0x1C-0x1F */ + 0xDD, 0x7B, 0xDD, 0x7C, 0xDD, 0x7D, 0xDD, 0x7E, /* 0x20-0x23 */ + 0xDD, 0x80, 0xDD, 0x81, 0xDD, 0x82, 0xDD, 0x83, /* 0x24-0x27 */ + 0xDD, 0x84, 0xDD, 0x85, 0xDD, 0x86, 0xDD, 0x87, /* 0x28-0x2B */ + 0xDD, 0x88, 0xDD, 0x89, 0xDD, 0x8A, 0xDD, 0x8B, /* 0x2C-0x2F */ + 0xDD, 0x8C, 0xDD, 0x8D, 0xDD, 0x8E, 0xDD, 0x8F, /* 0x30-0x33 */ + 0xDD, 0x90, 0xDD, 0x91, 0xDD, 0x92, 0xDD, 0x93, /* 0x34-0x37 */ + 0xDD, 0x94, 0xDD, 0x95, 0xDD, 0x96, 0xDD, 0x97, /* 0x38-0x3B */ + 0xDD, 0x98, 0xDD, 0x99, 0xDD, 0x9A, 0xDD, 0x9B, /* 0x3C-0x3F */ + 0xDD, 0x9C, 0xDD, 0x9D, 0xDD, 0x9E, 0xDD, 0x9F, /* 0x40-0x43 */ + 0xDD, 0xA0, 0xDE, 0x40, 0xDE, 0x41, 0xDE, 0x42, /* 0x44-0x47 */ + 0xDE, 0x43, 0xDE, 0x44, 0xDE, 0x45, 0xDE, 0x46, /* 0x48-0x4B */ + 0xDE, 0x47, 0xDE, 0x48, 0xDE, 0x49, 0xDE, 0x4A, /* 0x4C-0x4F */ + 0xDE, 0x4B, 0xDE, 0x4C, 0xDE, 0x4D, 0xDE, 0x4E, /* 0x50-0x53 */ + 0xDE, 0x4F, 0xDE, 0x50, 0xDE, 0x51, 0xDE, 0x52, /* 0x54-0x57 */ + 0xDE, 0x53, 0xDE, 0x54, 0xDE, 0x55, 0xDE, 0x56, /* 0x58-0x5B */ + 0xDE, 0x57, 0xDE, 0x58, 0xDE, 0x59, 0xDE, 0x5A, /* 0x5C-0x5F */ + 0xDE, 0x5B, 0xDE, 0x5C, 0xDE, 0x5D, 0xDE, 0x5E, /* 0x60-0x63 */ + 0xDE, 0x5F, 0xDE, 0x60, 0xB3, 0xB5, 0xD4, 0xFE, /* 0x64-0x67 */ + 0xB9, 0xEC, 0xD0, 0xF9, 0xDE, 0x61, 0xE9, 0xED, /* 0x68-0x6B */ + 0xD7, 0xAA, 0xE9, 0xEE, 0xC2, 0xD6, 0xC8, 0xED, /* 0x6C-0x6F */ + 0xBA, 0xE4, 0xE9, 0xEF, 0xE9, 0xF0, 0xE9, 0xF1, /* 0x70-0x73 */ + 0xD6, 0xE1, 0xE9, 0xF2, 0xE9, 0xF3, 0xE9, 0xF5, /* 0x74-0x77 */ + 0xE9, 0xF4, 0xE9, 0xF6, 0xE9, 0xF7, 0xC7, 0xE1, /* 0x78-0x7B */ + 0xE9, 0xF8, 0xD4, 0xD8, 0xE9, 0xF9, 0xBD, 0xCE, /* 0x7C-0x7F */ + + 0xDE, 0x62, 0xE9, 0xFA, 0xE9, 0xFB, 0xBD, 0xCF, /* 0x80-0x83 */ + 0xE9, 0xFC, 0xB8, 0xA8, 0xC1, 0xBE, 0xE9, 0xFD, /* 0x84-0x87 */ + 0xB1, 0xB2, 0xBB, 0xD4, 0xB9, 0xF5, 0xE9, 0xFE, /* 0x88-0x8B */ + 0xDE, 0x63, 0xEA, 0xA1, 0xEA, 0xA2, 0xEA, 0xA3, /* 0x8C-0x8F */ + 0xB7, 0xF8, 0xBC, 0xAD, 0xDE, 0x64, 0xCA, 0xE4, /* 0x90-0x93 */ + 0xE0, 0xCE, 0xD4, 0xAF, 0xCF, 0xBD, 0xD5, 0xB7, /* 0x94-0x97 */ + 0xEA, 0xA4, 0xD5, 0xDE, 0xEA, 0xA5, 0xD0, 0xC1, /* 0x98-0x9B */ + 0xB9, 0xBC, 0xDE, 0x65, 0xB4, 0xC7, 0xB1, 0xD9, /* 0x9C-0x9F */ + 0xDE, 0x66, 0xDE, 0x67, 0xDE, 0x68, 0xC0, 0xB1, /* 0xA0-0xA3 */ + 0xDE, 0x69, 0xDE, 0x6A, 0xDE, 0x6B, 0xDE, 0x6C, /* 0xA4-0xA7 */ + 0xB1, 0xE6, 0xB1, 0xE7, 0xDE, 0x6D, 0xB1, 0xE8, /* 0xA8-0xAB */ + 0xDE, 0x6E, 0xDE, 0x6F, 0xDE, 0x70, 0xDE, 0x71, /* 0xAC-0xAF */ + 0xB3, 0xBD, 0xC8, 0xE8, 0xDE, 0x72, 0xDE, 0x73, /* 0xB0-0xB3 */ + 0xDE, 0x74, 0xDE, 0x75, 0xE5, 0xC1, 0xDE, 0x76, /* 0xB4-0xB7 */ + 0xDE, 0x77, 0xB1, 0xDF, 0xDE, 0x78, 0xDE, 0x79, /* 0xB8-0xBB */ + 0xDE, 0x7A, 0xC1, 0xC9, 0xB4, 0xEF, 0xDE, 0x7B, /* 0xBC-0xBF */ + 
0xDE, 0x7C, 0xC7, 0xA8, 0xD3, 0xD8, 0xDE, 0x7D, /* 0xC0-0xC3 */ + 0xC6, 0xF9, 0xD1, 0xB8, 0xDE, 0x7E, 0xB9, 0xFD, /* 0xC4-0xC7 */ + 0xC2, 0xF5, 0xDE, 0x80, 0xDE, 0x81, 0xDE, 0x82, /* 0xC8-0xCB */ + 0xDE, 0x83, 0xDE, 0x84, 0xD3, 0xAD, 0xDE, 0x85, /* 0xCC-0xCF */ + 0xD4, 0xCB, 0xBD, 0xFC, 0xDE, 0x86, 0xE5, 0xC2, /* 0xD0-0xD3 */ + 0xB7, 0xB5, 0xE5, 0xC3, 0xDE, 0x87, 0xDE, 0x88, /* 0xD4-0xD7 */ + 0xBB, 0xB9, 0xD5, 0xE2, 0xDE, 0x89, 0xBD, 0xF8, /* 0xD8-0xDB */ + 0xD4, 0xB6, 0xCE, 0xA5, 0xC1, 0xAC, 0xB3, 0xD9, /* 0xDC-0xDF */ + 0xDE, 0x8A, 0xDE, 0x8B, 0xCC, 0xF6, 0xDE, 0x8C, /* 0xE0-0xE3 */ + 0xE5, 0xC6, 0xE5, 0xC4, 0xE5, 0xC8, 0xDE, 0x8D, /* 0xE4-0xE7 */ + 0xE5, 0xCA, 0xE5, 0xC7, 0xB5, 0xCF, 0xC6, 0xC8, /* 0xE8-0xEB */ + 0xDE, 0x8E, 0xB5, 0xFC, 0xE5, 0xC5, 0xDE, 0x8F, /* 0xEC-0xEF */ + 0xCA, 0xF6, 0xDE, 0x90, 0xDE, 0x91, 0xE5, 0xC9, /* 0xF0-0xF3 */ + 0xDE, 0x92, 0xDE, 0x93, 0xDE, 0x94, 0xC3, 0xD4, /* 0xF4-0xF7 */ + 0xB1, 0xC5, 0xBC, 0xA3, 0xDE, 0x95, 0xDE, 0x96, /* 0xF8-0xFB */ + 0xDE, 0x97, 0xD7, 0xB7, 0xDE, 0x98, 0xDE, 0x99, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0xCD, 0xCB, 0xCB, 0xCD, 0xCA, 0xCA, 0xCC, 0xD3, /* 0x00-0x03 */ + 0xE5, 0xCC, 0xE5, 0xCB, 0xC4, 0xE6, 0xDE, 0x9A, /* 0x04-0x07 */ + 0xDE, 0x9B, 0xD1, 0xA1, 0xD1, 0xB7, 0xE5, 0xCD, /* 0x08-0x0B */ + 0xDE, 0x9C, 0xE5, 0xD0, 0xDE, 0x9D, 0xCD, 0xB8, /* 0x0C-0x0F */ + 0xD6, 0xF0, 0xE5, 0xCF, 0xB5, 0xDD, 0xDE, 0x9E, /* 0x10-0x13 */ + 0xCD, 0xBE, 0xDE, 0x9F, 0xE5, 0xD1, 0xB6, 0xBA, /* 0x14-0x17 */ + 0xDE, 0xA0, 0xDF, 0x40, 0xCD, 0xA8, 0xB9, 0xE4, /* 0x18-0x1B */ + 0xDF, 0x41, 0xCA, 0xC5, 0xB3, 0xD1, 0xCB, 0xD9, /* 0x1C-0x1F */ + 0xD4, 0xEC, 0xE5, 0xD2, 0xB7, 0xEA, 0xDF, 0x42, /* 0x20-0x23 */ + 0xDF, 0x43, 0xDF, 0x44, 0xE5, 0xCE, 0xDF, 0x45, /* 0x24-0x27 */ + 0xDF, 0x46, 0xDF, 0x47, 0xDF, 0x48, 0xDF, 0x49, /* 0x28-0x2B */ + 0xDF, 0x4A, 0xE5, 0xD5, 0xB4, 0xFE, 0xE5, 0xD6, /* 0x2C-0x2F */ + 0xDF, 0x4B, 0xDF, 0x4C, 0xDF, 0x4D, 0xDF, 0x4E, /* 0x30-0x33 */ + 0xDF, 0x4F, 0xE5, 0xD3, 0xE5, 0xD4, 0xDF, 0x50, /* 0x34-0x37 */ + 0xD2, 0xDD, 0xDF, 0x51, 0xDF, 0x52, 0xC2, 0xDF, /* 0x38-0x3B */ + 0xB1, 0xC6, 0xDF, 0x53, 0xD3, 0xE2, 0xDF, 0x54, /* 0x3C-0x3F */ + 0xDF, 0x55, 0xB6, 0xDD, 0xCB, 0xEC, 0xDF, 0x56, /* 0x40-0x43 */ + 0xE5, 0xD7, 0xDF, 0x57, 0xDF, 0x58, 0xD3, 0xF6, /* 0x44-0x47 */ + 0xDF, 0x59, 0xDF, 0x5A, 0xDF, 0x5B, 0xDF, 0x5C, /* 0x48-0x4B */ + 0xDF, 0x5D, 0xB1, 0xE9, 0xDF, 0x5E, 0xB6, 0xF4, /* 0x4C-0x4F */ + 0xE5, 0xDA, 0xE5, 0xD8, 0xE5, 0xD9, 0xB5, 0xC0, /* 0x50-0x53 */ + 0xDF, 0x5F, 0xDF, 0x60, 0xDF, 0x61, 0xD2, 0xC5, /* 0x54-0x57 */ + 0xE5, 0xDC, 0xDF, 0x62, 0xDF, 0x63, 0xE5, 0xDE, /* 0x58-0x5B */ + 0xDF, 0x64, 0xDF, 0x65, 0xDF, 0x66, 0xDF, 0x67, /* 0x5C-0x5F */ + 0xDF, 0x68, 0xDF, 0x69, 0xE5, 0xDD, 0xC7, 0xB2, /* 0x60-0x63 */ + 0xDF, 0x6A, 0xD2, 0xA3, 0xDF, 0x6B, 0xDF, 0x6C, /* 0x64-0x67 */ + 0xE5, 0xDB, 0xDF, 0x6D, 0xDF, 0x6E, 0xDF, 0x6F, /* 0x68-0x6B */ + 0xDF, 0x70, 0xD4, 0xE2, 0xD5, 0xDA, 0xDF, 0x71, /* 0x6C-0x6F */ + 0xDF, 0x72, 0xDF, 0x73, 0xDF, 0x74, 0xDF, 0x75, /* 0x70-0x73 */ + 0xE5, 0xE0, 0xD7, 0xF1, 0xDF, 0x76, 0xDF, 0x77, /* 0x74-0x77 */ + 0xDF, 0x78, 0xDF, 0x79, 0xDF, 0x7A, 0xDF, 0x7B, /* 0x78-0x7B */ + 0xDF, 0x7C, 0xE5, 0xE1, 0xDF, 0x7D, 0xB1, 0xDC, /* 0x7C-0x7F */ + + 0xD1, 0xFB, 0xDF, 0x7E, 0xE5, 0xE2, 0xE5, 0xE4, /* 0x80-0x83 */ + 0xDF, 0x80, 0xDF, 0x81, 0xDF, 0x82, 0xDF, 0x83, /* 0x84-0x87 */ + 0xE5, 0xE3, 0xDF, 0x84, 0xDF, 0x85, 0xE5, 0xE5, /* 0x88-0x8B */ + 0xDF, 0x86, 0xDF, 0x87, 0xDF, 0x88, 0xDF, 0x89, /* 0x8C-0x8F */ + 0xDF, 0x8A, 0xD2, 0xD8, 0xDF, 0x8B, 0xB5, 0xCB, /* 0x90-0x93 */ + 
0xDF, 0x8C, 0xE7, 0xDF, 0xDF, 0x8D, 0xDA, 0xF5, /* 0x94-0x97 */ + 0xDF, 0x8E, 0xDA, 0xF8, 0xDF, 0x8F, 0xDA, 0xF6, /* 0x98-0x9B */ + 0xDF, 0x90, 0xDA, 0xF7, 0xDF, 0x91, 0xDF, 0x92, /* 0x9C-0x9F */ + 0xDF, 0x93, 0xDA, 0xFA, 0xD0, 0xCF, 0xC4, 0xC7, /* 0xA0-0xA3 */ + 0xDF, 0x94, 0xDF, 0x95, 0xB0, 0xEE, 0xDF, 0x96, /* 0xA4-0xA7 */ + 0xDF, 0x97, 0xDF, 0x98, 0xD0, 0xB0, 0xDF, 0x99, /* 0xA8-0xAB */ + 0xDA, 0xF9, 0xDF, 0x9A, 0xD3, 0xCA, 0xBA, 0xAA, /* 0xAC-0xAF */ + 0xDB, 0xA2, 0xC7, 0xF1, 0xDF, 0x9B, 0xDA, 0xFC, /* 0xB0-0xB3 */ + 0xDA, 0xFB, 0xC9, 0xDB, 0xDA, 0xFD, 0xDF, 0x9C, /* 0xB4-0xB7 */ + 0xDB, 0xA1, 0xD7, 0xDE, 0xDA, 0xFE, 0xC1, 0xDA, /* 0xB8-0xBB */ + 0xDF, 0x9D, 0xDF, 0x9E, 0xDB, 0xA5, 0xDF, 0x9F, /* 0xBC-0xBF */ + 0xDF, 0xA0, 0xD3, 0xF4, 0xE0, 0x40, 0xE0, 0x41, /* 0xC0-0xC3 */ + 0xDB, 0xA7, 0xDB, 0xA4, 0xE0, 0x42, 0xDB, 0xA8, /* 0xC4-0xC7 */ + 0xE0, 0x43, 0xE0, 0x44, 0xBD, 0xBC, 0xE0, 0x45, /* 0xC8-0xCB */ + 0xE0, 0x46, 0xE0, 0x47, 0xC0, 0xC9, 0xDB, 0xA3, /* 0xCC-0xCF */ + 0xDB, 0xA6, 0xD6, 0xA3, 0xE0, 0x48, 0xDB, 0xA9, /* 0xD0-0xD3 */ + 0xE0, 0x49, 0xE0, 0x4A, 0xE0, 0x4B, 0xDB, 0xAD, /* 0xD4-0xD7 */ + 0xE0, 0x4C, 0xE0, 0x4D, 0xE0, 0x4E, 0xDB, 0xAE, /* 0xD8-0xDB */ + 0xDB, 0xAC, 0xBA, 0xC2, 0xE0, 0x4F, 0xE0, 0x50, /* 0xDC-0xDF */ + 0xE0, 0x51, 0xBF, 0xA4, 0xDB, 0xAB, 0xE0, 0x52, /* 0xE0-0xE3 */ + 0xE0, 0x53, 0xE0, 0x54, 0xDB, 0xAA, 0xD4, 0xC7, /* 0xE4-0xE7 */ + 0xB2, 0xBF, 0xE0, 0x55, 0xE0, 0x56, 0xDB, 0xAF, /* 0xE8-0xEB */ + 0xE0, 0x57, 0xB9, 0xF9, 0xE0, 0x58, 0xDB, 0xB0, /* 0xEC-0xEF */ + 0xE0, 0x59, 0xE0, 0x5A, 0xE0, 0x5B, 0xE0, 0x5C, /* 0xF0-0xF3 */ + 0xB3, 0xBB, 0xE0, 0x5D, 0xE0, 0x5E, 0xE0, 0x5F, /* 0xF4-0xF7 */ + 0xB5, 0xA6, 0xE0, 0x60, 0xE0, 0x61, 0xE0, 0x62, /* 0xF8-0xFB */ + 0xE0, 0x63, 0xB6, 0xBC, 0xDB, 0xB1, 0xE0, 0x64, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0xE0, 0x65, 0xE0, 0x66, 0xB6, 0xF5, 0xE0, 0x67, /* 0x00-0x03 */ + 0xDB, 0xB2, 0xE0, 0x68, 0xE0, 0x69, 0xE0, 0x6A, /* 0x04-0x07 */ + 0xE0, 0x6B, 0xE0, 0x6C, 0xE0, 0x6D, 0xE0, 0x6E, /* 0x08-0x0B */ + 0xE0, 0x6F, 0xE0, 0x70, 0xE0, 0x71, 0xE0, 0x72, /* 0x0C-0x0F */ + 0xE0, 0x73, 0xE0, 0x74, 0xE0, 0x75, 0xE0, 0x76, /* 0x10-0x13 */ + 0xE0, 0x77, 0xE0, 0x78, 0xE0, 0x79, 0xE0, 0x7A, /* 0x14-0x17 */ + 0xE0, 0x7B, 0xB1, 0xC9, 0xE0, 0x7C, 0xE0, 0x7D, /* 0x18-0x1B */ + 0xE0, 0x7E, 0xE0, 0x80, 0xDB, 0xB4, 0xE0, 0x81, /* 0x1C-0x1F */ + 0xE0, 0x82, 0xE0, 0x83, 0xDB, 0xB3, 0xDB, 0xB5, /* 0x20-0x23 */ + 0xE0, 0x84, 0xE0, 0x85, 0xE0, 0x86, 0xE0, 0x87, /* 0x24-0x27 */ + 0xE0, 0x88, 0xE0, 0x89, 0xE0, 0x8A, 0xE0, 0x8B, /* 0x28-0x2B */ + 0xE0, 0x8C, 0xE0, 0x8D, 0xE0, 0x8E, 0xDB, 0xB7, /* 0x2C-0x2F */ + 0xE0, 0x8F, 0xDB, 0xB6, 0xE0, 0x90, 0xE0, 0x91, /* 0x30-0x33 */ + 0xE0, 0x92, 0xE0, 0x93, 0xE0, 0x94, 0xE0, 0x95, /* 0x34-0x37 */ + 0xE0, 0x96, 0xDB, 0xB8, 0xE0, 0x97, 0xE0, 0x98, /* 0x38-0x3B */ + 0xE0, 0x99, 0xE0, 0x9A, 0xE0, 0x9B, 0xE0, 0x9C, /* 0x3C-0x3F */ + 0xE0, 0x9D, 0xE0, 0x9E, 0xE0, 0x9F, 0xDB, 0xB9, /* 0x40-0x43 */ + 0xE0, 0xA0, 0xE1, 0x40, 0xDB, 0xBA, 0xE1, 0x41, /* 0x44-0x47 */ + 0xE1, 0x42, 0xD3, 0xCF, 0xF4, 0xFA, 0xC7, 0xF5, /* 0x48-0x4B */ + 0xD7, 0xC3, 0xC5, 0xE4, 0xF4, 0xFC, 0xF4, 0xFD, /* 0x4C-0x4F */ + 0xF4, 0xFB, 0xE1, 0x43, 0xBE, 0xC6, 0xE1, 0x44, /* 0x50-0x53 */ + 0xE1, 0x45, 0xE1, 0x46, 0xE1, 0x47, 0xD0, 0xEF, /* 0x54-0x57 */ + 0xE1, 0x48, 0xE1, 0x49, 0xB7, 0xD3, 0xE1, 0x4A, /* 0x58-0x5B */ + 0xE1, 0x4B, 0xD4, 0xCD, 0xCC, 0xAA, 0xE1, 0x4C, /* 0x5C-0x5F */ + 0xE1, 0x4D, 0xF5, 0xA2, 0xF5, 0xA1, 0xBA, 0xA8, /* 0x60-0x63 */ + 0xF4, 0xFE, 0xCB, 0xD6, 0xE1, 0x4E, 0xE1, 0x4F, /* 0x64-0x67 */ + 0xE1, 
0x50, 0xF5, 0xA4, 0xC0, 0xD2, 0xE1, 0x51, /* 0x68-0x6B */ + 0xB3, 0xEA, 0xE1, 0x52, 0xCD, 0xAA, 0xF5, 0xA5, /* 0x6C-0x6F */ + 0xF5, 0xA3, 0xBD, 0xB4, 0xF5, 0xA8, 0xE1, 0x53, /* 0x70-0x73 */ + 0xF5, 0xA9, 0xBD, 0xCD, 0xC3, 0xB8, 0xBF, 0xE1, /* 0x74-0x77 */ + 0xCB, 0xE1, 0xF5, 0xAA, 0xE1, 0x54, 0xE1, 0x55, /* 0x78-0x7B */ + 0xE1, 0x56, 0xF5, 0xA6, 0xF5, 0xA7, 0xC4, 0xF0, /* 0x7C-0x7F */ + + 0xE1, 0x57, 0xE1, 0x58, 0xE1, 0x59, 0xE1, 0x5A, /* 0x80-0x83 */ + 0xE1, 0x5B, 0xF5, 0xAC, 0xE1, 0x5C, 0xB4, 0xBC, /* 0x84-0x87 */ + 0xE1, 0x5D, 0xD7, 0xED, 0xE1, 0x5E, 0xB4, 0xD7, /* 0x88-0x8B */ + 0xF5, 0xAB, 0xF5, 0xAE, 0xE1, 0x5F, 0xE1, 0x60, /* 0x8C-0x8F */ + 0xF5, 0xAD, 0xF5, 0xAF, 0xD0, 0xD1, 0xE1, 0x61, /* 0x90-0x93 */ + 0xE1, 0x62, 0xE1, 0x63, 0xE1, 0x64, 0xE1, 0x65, /* 0x94-0x97 */ + 0xE1, 0x66, 0xE1, 0x67, 0xC3, 0xD1, 0xC8, 0xA9, /* 0x98-0x9B */ + 0xE1, 0x68, 0xE1, 0x69, 0xE1, 0x6A, 0xE1, 0x6B, /* 0x9C-0x9F */ + 0xE1, 0x6C, 0xE1, 0x6D, 0xF5, 0xB0, 0xF5, 0xB1, /* 0xA0-0xA3 */ + 0xE1, 0x6E, 0xE1, 0x6F, 0xE1, 0x70, 0xE1, 0x71, /* 0xA4-0xA7 */ + 0xE1, 0x72, 0xE1, 0x73, 0xF5, 0xB2, 0xE1, 0x74, /* 0xA8-0xAB */ + 0xE1, 0x75, 0xF5, 0xB3, 0xF5, 0xB4, 0xF5, 0xB5, /* 0xAC-0xAF */ + 0xE1, 0x76, 0xE1, 0x77, 0xE1, 0x78, 0xE1, 0x79, /* 0xB0-0xB3 */ + 0xF5, 0xB7, 0xF5, 0xB6, 0xE1, 0x7A, 0xE1, 0x7B, /* 0xB4-0xB7 */ + 0xE1, 0x7C, 0xE1, 0x7D, 0xF5, 0xB8, 0xE1, 0x7E, /* 0xB8-0xBB */ + 0xE1, 0x80, 0xE1, 0x81, 0xE1, 0x82, 0xE1, 0x83, /* 0xBC-0xBF */ + 0xE1, 0x84, 0xE1, 0x85, 0xE1, 0x86, 0xE1, 0x87, /* 0xC0-0xC3 */ + 0xE1, 0x88, 0xE1, 0x89, 0xE1, 0x8A, 0xB2, 0xC9, /* 0xC4-0xC7 */ + 0xE1, 0x8B, 0xD3, 0xD4, 0xCA, 0xCD, 0xE1, 0x8C, /* 0xC8-0xCB */ + 0xC0, 0xEF, 0xD6, 0xD8, 0xD2, 0xB0, 0xC1, 0xBF, /* 0xCC-0xCF */ + 0xE1, 0x8D, 0xBD, 0xF0, 0xE1, 0x8E, 0xE1, 0x8F, /* 0xD0-0xD3 */ + 0xE1, 0x90, 0xE1, 0x91, 0xE1, 0x92, 0xE1, 0x93, /* 0xD4-0xD7 */ + 0xE1, 0x94, 0xE1, 0x95, 0xE1, 0x96, 0xE1, 0x97, /* 0xD8-0xDB */ + 0xB8, 0xAA, 0xE1, 0x98, 0xE1, 0x99, 0xE1, 0x9A, /* 0xDC-0xDF */ + 0xE1, 0x9B, 0xE1, 0x9C, 0xE1, 0x9D, 0xE1, 0x9E, /* 0xE0-0xE3 */ + 0xE1, 0x9F, 0xE1, 0xA0, 0xE2, 0x40, 0xE2, 0x41, /* 0xE4-0xE7 */ + 0xE2, 0x42, 0xE2, 0x43, 0xE2, 0x44, 0xE2, 0x45, /* 0xE8-0xEB */ + 0xE2, 0x46, 0xE2, 0x47, 0xE2, 0x48, 0xE2, 0x49, /* 0xEC-0xEF */ + 0xE2, 0x4A, 0xE2, 0x4B, 0xE2, 0x4C, 0xE2, 0x4D, /* 0xF0-0xF3 */ + 0xE2, 0x4E, 0xE2, 0x4F, 0xE2, 0x50, 0xE2, 0x51, /* 0xF4-0xF7 */ + 0xE2, 0x52, 0xE2, 0x53, 0xE2, 0x54, 0xE2, 0x55, /* 0xF8-0xFB */ + 0xE2, 0x56, 0xE2, 0x57, 0xE2, 0x58, 0xE2, 0x59, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_92[512] = { + 0xE2, 0x5A, 0xE2, 0x5B, 0xE2, 0x5C, 0xE2, 0x5D, /* 0x00-0x03 */ + 0xE2, 0x5E, 0xE2, 0x5F, 0xE2, 0x60, 0xE2, 0x61, /* 0x04-0x07 */ + 0xE2, 0x62, 0xE2, 0x63, 0xE2, 0x64, 0xE2, 0x65, /* 0x08-0x0B */ + 0xE2, 0x66, 0xE2, 0x67, 0xE2, 0x68, 0xE2, 0x69, /* 0x0C-0x0F */ + 0xE2, 0x6A, 0xE2, 0x6B, 0xE2, 0x6C, 0xE2, 0x6D, /* 0x10-0x13 */ + 0xE2, 0x6E, 0xE2, 0x6F, 0xE2, 0x70, 0xE2, 0x71, /* 0x14-0x17 */ + 0xE2, 0x72, 0xE2, 0x73, 0xE2, 0x74, 0xE2, 0x75, /* 0x18-0x1B */ + 0xE2, 0x76, 0xE2, 0x77, 0xE2, 0x78, 0xE2, 0x79, /* 0x1C-0x1F */ + 0xE2, 0x7A, 0xE2, 0x7B, 0xE2, 0x7C, 0xE2, 0x7D, /* 0x20-0x23 */ + 0xE2, 0x7E, 0xE2, 0x80, 0xE2, 0x81, 0xE2, 0x82, /* 0x24-0x27 */ + 0xE2, 0x83, 0xE2, 0x84, 0xE2, 0x85, 0xE2, 0x86, /* 0x28-0x2B */ + 0xE2, 0x87, 0xE2, 0x88, 0xE2, 0x89, 0xE2, 0x8A, /* 0x2C-0x2F */ + 0xE2, 0x8B, 0xE2, 0x8C, 0xE2, 0x8D, 0xE2, 0x8E, /* 0x30-0x33 */ + 0xE2, 0x8F, 0xE2, 0x90, 0xE2, 0x91, 0xE2, 0x92, /* 0x34-0x37 */ + 0xE2, 0x93, 0xE2, 0x94, 0xE2, 0x95, 0xE2, 0x96, /* 0x38-0x3B */ + 0xE2, 
0x97, 0xE2, 0x98, 0xE2, 0x99, 0xE2, 0x9A, /* 0x3C-0x3F */ + 0xE2, 0x9B, 0xE2, 0x9C, 0xE2, 0x9D, 0xE2, 0x9E, /* 0x40-0x43 */ + 0xE2, 0x9F, 0xE2, 0xA0, 0xE3, 0x40, 0xE3, 0x41, /* 0x44-0x47 */ + 0xE3, 0x42, 0xE3, 0x43, 0xE3, 0x44, 0xE3, 0x45, /* 0x48-0x4B */ + 0xE3, 0x46, 0xE3, 0x47, 0xE3, 0x48, 0xE3, 0x49, /* 0x4C-0x4F */ + 0xE3, 0x4A, 0xE3, 0x4B, 0xE3, 0x4C, 0xE3, 0x4D, /* 0x50-0x53 */ + 0xE3, 0x4E, 0xE3, 0x4F, 0xE3, 0x50, 0xE3, 0x51, /* 0x54-0x57 */ + 0xE3, 0x52, 0xE3, 0x53, 0xE3, 0x54, 0xE3, 0x55, /* 0x58-0x5B */ + 0xE3, 0x56, 0xE3, 0x57, 0xE3, 0x58, 0xE3, 0x59, /* 0x5C-0x5F */ + 0xE3, 0x5A, 0xE3, 0x5B, 0xE3, 0x5C, 0xE3, 0x5D, /* 0x60-0x63 */ + 0xE3, 0x5E, 0xE3, 0x5F, 0xE3, 0x60, 0xE3, 0x61, /* 0x64-0x67 */ + 0xE3, 0x62, 0xE3, 0x63, 0xE3, 0x64, 0xE3, 0x65, /* 0x68-0x6B */ + 0xE3, 0x66, 0xE3, 0x67, 0xE3, 0x68, 0xE3, 0x69, /* 0x6C-0x6F */ + 0xE3, 0x6A, 0xE3, 0x6B, 0xE3, 0x6C, 0xE3, 0x6D, /* 0x70-0x73 */ + 0xBC, 0xF8, 0xE3, 0x6E, 0xE3, 0x6F, 0xE3, 0x70, /* 0x74-0x77 */ + 0xE3, 0x71, 0xE3, 0x72, 0xE3, 0x73, 0xE3, 0x74, /* 0x78-0x7B */ + 0xE3, 0x75, 0xE3, 0x76, 0xE3, 0x77, 0xE3, 0x78, /* 0x7C-0x7F */ + + 0xE3, 0x79, 0xE3, 0x7A, 0xE3, 0x7B, 0xE3, 0x7C, /* 0x80-0x83 */ + 0xE3, 0x7D, 0xE3, 0x7E, 0xE3, 0x80, 0xE3, 0x81, /* 0x84-0x87 */ + 0xE3, 0x82, 0xE3, 0x83, 0xE3, 0x84, 0xE3, 0x85, /* 0x88-0x8B */ + 0xE3, 0x86, 0xE3, 0x87, 0xF6, 0xC6, 0xE3, 0x88, /* 0x8C-0x8F */ + 0xE3, 0x89, 0xE3, 0x8A, 0xE3, 0x8B, 0xE3, 0x8C, /* 0x90-0x93 */ + 0xE3, 0x8D, 0xE3, 0x8E, 0xE3, 0x8F, 0xE3, 0x90, /* 0x94-0x97 */ + 0xE3, 0x91, 0xE3, 0x92, 0xE3, 0x93, 0xE3, 0x94, /* 0x98-0x9B */ + 0xE3, 0x95, 0xE3, 0x96, 0xE3, 0x97, 0xE3, 0x98, /* 0x9C-0x9F */ + 0xE3, 0x99, 0xE3, 0x9A, 0xE3, 0x9B, 0xE3, 0x9C, /* 0xA0-0xA3 */ + 0xE3, 0x9D, 0xE3, 0x9E, 0xE3, 0x9F, 0xE3, 0xA0, /* 0xA4-0xA7 */ + 0xE4, 0x40, 0xE4, 0x41, 0xE4, 0x42, 0xE4, 0x43, /* 0xA8-0xAB */ + 0xE4, 0x44, 0xE4, 0x45, 0xF6, 0xC7, 0xE4, 0x46, /* 0xAC-0xAF */ + 0xE4, 0x47, 0xE4, 0x48, 0xE4, 0x49, 0xE4, 0x4A, /* 0xB0-0xB3 */ + 0xE4, 0x4B, 0xE4, 0x4C, 0xE4, 0x4D, 0xE4, 0x4E, /* 0xB4-0xB7 */ + 0xE4, 0x4F, 0xE4, 0x50, 0xE4, 0x51, 0xE4, 0x52, /* 0xB8-0xBB */ + 0xE4, 0x53, 0xE4, 0x54, 0xE4, 0x55, 0xE4, 0x56, /* 0xBC-0xBF */ + 0xE4, 0x57, 0xE4, 0x58, 0xE4, 0x59, 0xE4, 0x5A, /* 0xC0-0xC3 */ + 0xE4, 0x5B, 0xE4, 0x5C, 0xE4, 0x5D, 0xE4, 0x5E, /* 0xC4-0xC7 */ + 0xF6, 0xC8, 0xE4, 0x5F, 0xE4, 0x60, 0xE4, 0x61, /* 0xC8-0xCB */ + 0xE4, 0x62, 0xE4, 0x63, 0xE4, 0x64, 0xE4, 0x65, /* 0xCC-0xCF */ + 0xE4, 0x66, 0xE4, 0x67, 0xE4, 0x68, 0xE4, 0x69, /* 0xD0-0xD3 */ + 0xE4, 0x6A, 0xE4, 0x6B, 0xE4, 0x6C, 0xE4, 0x6D, /* 0xD4-0xD7 */ + 0xE4, 0x6E, 0xE4, 0x6F, 0xE4, 0x70, 0xE4, 0x71, /* 0xD8-0xDB */ + 0xE4, 0x72, 0xE4, 0x73, 0xE4, 0x74, 0xE4, 0x75, /* 0xDC-0xDF */ + 0xE4, 0x76, 0xE4, 0x77, 0xE4, 0x78, 0xE4, 0x79, /* 0xE0-0xE3 */ + 0xE4, 0x7A, 0xE4, 0x7B, 0xE4, 0x7C, 0xE4, 0x7D, /* 0xE4-0xE7 */ + 0xE4, 0x7E, 0xE4, 0x80, 0xE4, 0x81, 0xE4, 0x82, /* 0xE8-0xEB */ + 0xE4, 0x83, 0xE4, 0x84, 0xE4, 0x85, 0xE4, 0x86, /* 0xEC-0xEF */ + 0xE4, 0x87, 0xE4, 0x88, 0xE4, 0x89, 0xE4, 0x8A, /* 0xF0-0xF3 */ + 0xE4, 0x8B, 0xE4, 0x8C, 0xE4, 0x8D, 0xE4, 0x8E, /* 0xF4-0xF7 */ + 0xE4, 0x8F, 0xE4, 0x90, 0xE4, 0x91, 0xE4, 0x92, /* 0xF8-0xFB */ + 0xE4, 0x93, 0xE4, 0x94, 0xE4, 0x95, 0xE4, 0x96, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0xE4, 0x97, 0xE4, 0x98, 0xE4, 0x99, 0xE4, 0x9A, /* 0x00-0x03 */ + 0xE4, 0x9B, 0xE4, 0x9C, 0xE4, 0x9D, 0xE4, 0x9E, /* 0x04-0x07 */ + 0xE4, 0x9F, 0xE4, 0xA0, 0xE5, 0x40, 0xE5, 0x41, /* 0x08-0x0B */ + 0xE5, 0x42, 0xE5, 0x43, 0xE5, 0x44, 0xE5, 0x45, /* 0x0C-0x0F */ + 0xE5, 
0x46, 0xE5, 0x47, 0xE5, 0x48, 0xE5, 0x49, /* 0x10-0x13 */ + 0xE5, 0x4A, 0xE5, 0x4B, 0xE5, 0x4C, 0xE5, 0x4D, /* 0x14-0x17 */ + 0xE5, 0x4E, 0xE5, 0x4F, 0xE5, 0x50, 0xE5, 0x51, /* 0x18-0x1B */ + 0xE5, 0x52, 0xE5, 0x53, 0xE5, 0x54, 0xE5, 0x55, /* 0x1C-0x1F */ + 0xE5, 0x56, 0xE5, 0x57, 0xE5, 0x58, 0xE5, 0x59, /* 0x20-0x23 */ + 0xE5, 0x5A, 0xE5, 0x5B, 0xE5, 0x5C, 0xE5, 0x5D, /* 0x24-0x27 */ + 0xE5, 0x5E, 0xE5, 0x5F, 0xE5, 0x60, 0xE5, 0x61, /* 0x28-0x2B */ + 0xE5, 0x62, 0xE5, 0x63, 0xE5, 0x64, 0xE5, 0x65, /* 0x2C-0x2F */ + 0xE5, 0x66, 0xE5, 0x67, 0xE5, 0x68, 0xE5, 0x69, /* 0x30-0x33 */ + 0xE5, 0x6A, 0xE5, 0x6B, 0xE5, 0x6C, 0xE5, 0x6D, /* 0x34-0x37 */ + 0xE5, 0x6E, 0xE5, 0x6F, 0xE5, 0x70, 0xE5, 0x71, /* 0x38-0x3B */ + 0xE5, 0x72, 0xE5, 0x73, 0xF6, 0xC9, 0xE5, 0x74, /* 0x3C-0x3F */ + 0xE5, 0x75, 0xE5, 0x76, 0xE5, 0x77, 0xE5, 0x78, /* 0x40-0x43 */ + 0xE5, 0x79, 0xE5, 0x7A, 0xE5, 0x7B, 0xE5, 0x7C, /* 0x44-0x47 */ + 0xE5, 0x7D, 0xE5, 0x7E, 0xE5, 0x80, 0xE5, 0x81, /* 0x48-0x4B */ + 0xE5, 0x82, 0xE5, 0x83, 0xE5, 0x84, 0xE5, 0x85, /* 0x4C-0x4F */ + 0xE5, 0x86, 0xE5, 0x87, 0xE5, 0x88, 0xE5, 0x89, /* 0x50-0x53 */ + 0xE5, 0x8A, 0xE5, 0x8B, 0xE5, 0x8C, 0xE5, 0x8D, /* 0x54-0x57 */ + 0xE5, 0x8E, 0xE5, 0x8F, 0xE5, 0x90, 0xE5, 0x91, /* 0x58-0x5B */ + 0xE5, 0x92, 0xE5, 0x93, 0xE5, 0x94, 0xE5, 0x95, /* 0x5C-0x5F */ + 0xE5, 0x96, 0xE5, 0x97, 0xE5, 0x98, 0xE5, 0x99, /* 0x60-0x63 */ + 0xE5, 0x9A, 0xE5, 0x9B, 0xE5, 0x9C, 0xE5, 0x9D, /* 0x64-0x67 */ + 0xE5, 0x9E, 0xE5, 0x9F, 0xF6, 0xCA, 0xE5, 0xA0, /* 0x68-0x6B */ + 0xE6, 0x40, 0xE6, 0x41, 0xE6, 0x42, 0xE6, 0x43, /* 0x6C-0x6F */ + 0xE6, 0x44, 0xE6, 0x45, 0xE6, 0x46, 0xE6, 0x47, /* 0x70-0x73 */ + 0xE6, 0x48, 0xE6, 0x49, 0xE6, 0x4A, 0xE6, 0x4B, /* 0x74-0x77 */ + 0xE6, 0x4C, 0xE6, 0x4D, 0xE6, 0x4E, 0xE6, 0x4F, /* 0x78-0x7B */ + 0xE6, 0x50, 0xE6, 0x51, 0xE6, 0x52, 0xE6, 0x53, /* 0x7C-0x7F */ + + 0xE6, 0x54, 0xE6, 0x55, 0xE6, 0x56, 0xE6, 0x57, /* 0x80-0x83 */ + 0xE6, 0x58, 0xE6, 0x59, 0xE6, 0x5A, 0xE6, 0x5B, /* 0x84-0x87 */ + 0xE6, 0x5C, 0xE6, 0x5D, 0xE6, 0x5E, 0xE6, 0x5F, /* 0x88-0x8B */ + 0xE6, 0x60, 0xE6, 0x61, 0xE6, 0x62, 0xF6, 0xCC, /* 0x8C-0x8F */ + 0xE6, 0x63, 0xE6, 0x64, 0xE6, 0x65, 0xE6, 0x66, /* 0x90-0x93 */ + 0xE6, 0x67, 0xE6, 0x68, 0xE6, 0x69, 0xE6, 0x6A, /* 0x94-0x97 */ + 0xE6, 0x6B, 0xE6, 0x6C, 0xE6, 0x6D, 0xE6, 0x6E, /* 0x98-0x9B */ + 0xE6, 0x6F, 0xE6, 0x70, 0xE6, 0x71, 0xE6, 0x72, /* 0x9C-0x9F */ + 0xE6, 0x73, 0xE6, 0x74, 0xE6, 0x75, 0xE6, 0x76, /* 0xA0-0xA3 */ + 0xE6, 0x77, 0xE6, 0x78, 0xE6, 0x79, 0xE6, 0x7A, /* 0xA4-0xA7 */ + 0xE6, 0x7B, 0xE6, 0x7C, 0xE6, 0x7D, 0xE6, 0x7E, /* 0xA8-0xAB */ + 0xE6, 0x80, 0xE6, 0x81, 0xE6, 0x82, 0xE6, 0x83, /* 0xAC-0xAF */ + 0xE6, 0x84, 0xE6, 0x85, 0xE6, 0x86, 0xE6, 0x87, /* 0xB0-0xB3 */ + 0xE6, 0x88, 0xE6, 0x89, 0xE6, 0x8A, 0xE6, 0x8B, /* 0xB4-0xB7 */ + 0xE6, 0x8C, 0xE6, 0x8D, 0xE6, 0x8E, 0xE6, 0x8F, /* 0xB8-0xBB */ + 0xE6, 0x90, 0xE6, 0x91, 0xE6, 0x92, 0xE6, 0x93, /* 0xBC-0xBF */ + 0xE6, 0x94, 0xE6, 0x95, 0xE6, 0x96, 0xE6, 0x97, /* 0xC0-0xC3 */ + 0xE6, 0x98, 0xE6, 0x99, 0xE6, 0x9A, 0xE6, 0x9B, /* 0xC4-0xC7 */ + 0xE6, 0x9C, 0xE6, 0x9D, 0xF6, 0xCB, 0xE6, 0x9E, /* 0xC8-0xCB */ + 0xE6, 0x9F, 0xE6, 0xA0, 0xE7, 0x40, 0xE7, 0x41, /* 0xCC-0xCF */ + 0xE7, 0x42, 0xE7, 0x43, 0xE7, 0x44, 0xE7, 0x45, /* 0xD0-0xD3 */ + 0xE7, 0x46, 0xE7, 0x47, 0xF7, 0xE9, 0xE7, 0x48, /* 0xD4-0xD7 */ + 0xE7, 0x49, 0xE7, 0x4A, 0xE7, 0x4B, 0xE7, 0x4C, /* 0xD8-0xDB */ + 0xE7, 0x4D, 0xE7, 0x4E, 0xE7, 0x4F, 0xE7, 0x50, /* 0xDC-0xDF */ + 0xE7, 0x51, 0xE7, 0x52, 0xE7, 0x53, 0xE7, 0x54, /* 0xE0-0xE3 */ + 0xE7, 0x55, 0xE7, 0x56, 0xE7, 0x57, 0xE7, 0x58, /* 0xE4-0xE7 
*/ + 0xE7, 0x59, 0xE7, 0x5A, 0xE7, 0x5B, 0xE7, 0x5C, /* 0xE8-0xEB */ + 0xE7, 0x5D, 0xE7, 0x5E, 0xE7, 0x5F, 0xE7, 0x60, /* 0xEC-0xEF */ + 0xE7, 0x61, 0xE7, 0x62, 0xE7, 0x63, 0xE7, 0x64, /* 0xF0-0xF3 */ + 0xE7, 0x65, 0xE7, 0x66, 0xE7, 0x67, 0xE7, 0x68, /* 0xF4-0xF7 */ + 0xE7, 0x69, 0xE7, 0x6A, 0xE7, 0x6B, 0xE7, 0x6C, /* 0xF8-0xFB */ + 0xE7, 0x6D, 0xE7, 0x6E, 0xE7, 0x6F, 0xE7, 0x70, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_94[512] = { + 0xE7, 0x71, 0xE7, 0x72, 0xE7, 0x73, 0xE7, 0x74, /* 0x00-0x03 */ + 0xE7, 0x75, 0xE7, 0x76, 0xE7, 0x77, 0xE7, 0x78, /* 0x04-0x07 */ + 0xE7, 0x79, 0xE7, 0x7A, 0xE7, 0x7B, 0xE7, 0x7C, /* 0x08-0x0B */ + 0xE7, 0x7D, 0xE7, 0x7E, 0xE7, 0x80, 0xE7, 0x81, /* 0x0C-0x0F */ + 0xE7, 0x82, 0xE7, 0x83, 0xE7, 0x84, 0xE7, 0x85, /* 0x10-0x13 */ + 0xE7, 0x86, 0xE7, 0x87, 0xE7, 0x88, 0xE7, 0x89, /* 0x14-0x17 */ + 0xE7, 0x8A, 0xE7, 0x8B, 0xE7, 0x8C, 0xE7, 0x8D, /* 0x18-0x1B */ + 0xE7, 0x8E, 0xE7, 0x8F, 0xE7, 0x90, 0xE7, 0x91, /* 0x1C-0x1F */ + 0xE7, 0x92, 0xE7, 0x93, 0xE7, 0x94, 0xE7, 0x95, /* 0x20-0x23 */ + 0xE7, 0x96, 0xE7, 0x97, 0xE7, 0x98, 0xE7, 0x99, /* 0x24-0x27 */ + 0xE7, 0x9A, 0xE7, 0x9B, 0xE7, 0x9C, 0xE7, 0x9D, /* 0x28-0x2B */ + 0xE7, 0x9E, 0xE7, 0x9F, 0xE7, 0xA0, 0xE8, 0x40, /* 0x2C-0x2F */ + 0xE8, 0x41, 0xE8, 0x42, 0xE8, 0x43, 0xE8, 0x44, /* 0x30-0x33 */ + 0xE8, 0x45, 0xE8, 0x46, 0xE8, 0x47, 0xE8, 0x48, /* 0x34-0x37 */ + 0xE8, 0x49, 0xE8, 0x4A, 0xE8, 0x4B, 0xE8, 0x4C, /* 0x38-0x3B */ + 0xE8, 0x4D, 0xE8, 0x4E, 0xF6, 0xCD, 0xE8, 0x4F, /* 0x3C-0x3F */ + 0xE8, 0x50, 0xE8, 0x51, 0xE8, 0x52, 0xE8, 0x53, /* 0x40-0x43 */ + 0xE8, 0x54, 0xE8, 0x55, 0xE8, 0x56, 0xE8, 0x57, /* 0x44-0x47 */ + 0xE8, 0x58, 0xE8, 0x59, 0xE8, 0x5A, 0xE8, 0x5B, /* 0x48-0x4B */ + 0xE8, 0x5C, 0xE8, 0x5D, 0xE8, 0x5E, 0xE8, 0x5F, /* 0x4C-0x4F */ + 0xE8, 0x60, 0xE8, 0x61, 0xE8, 0x62, 0xE8, 0x63, /* 0x50-0x53 */ + 0xE8, 0x64, 0xE8, 0x65, 0xE8, 0x66, 0xE8, 0x67, /* 0x54-0x57 */ + 0xE8, 0x68, 0xE8, 0x69, 0xE8, 0x6A, 0xE8, 0x6B, /* 0x58-0x5B */ + 0xE8, 0x6C, 0xE8, 0x6D, 0xE8, 0x6E, 0xE8, 0x6F, /* 0x5C-0x5F */ + 0xE8, 0x70, 0xE8, 0x71, 0xE8, 0x72, 0xE8, 0x73, /* 0x60-0x63 */ + 0xE8, 0x74, 0xE8, 0x75, 0xE8, 0x76, 0xE8, 0x77, /* 0x64-0x67 */ + 0xE8, 0x78, 0xE8, 0x79, 0xE8, 0x7A, 0xF6, 0xCE, /* 0x68-0x6B */ + 0xE8, 0x7B, 0xE8, 0x7C, 0xE8, 0x7D, 0xE8, 0x7E, /* 0x6C-0x6F */ + 0xE8, 0x80, 0xE8, 0x81, 0xE8, 0x82, 0xE8, 0x83, /* 0x70-0x73 */ + 0xE8, 0x84, 0xE8, 0x85, 0xE8, 0x86, 0xE8, 0x87, /* 0x74-0x77 */ + 0xE8, 0x88, 0xE8, 0x89, 0xE8, 0x8A, 0xE8, 0x8B, /* 0x78-0x7B */ + 0xE8, 0x8C, 0xE8, 0x8D, 0xE8, 0x8E, 0xE8, 0x8F, /* 0x7C-0x7F */ + + 0xE8, 0x90, 0xE8, 0x91, 0xE8, 0x92, 0xE8, 0x93, /* 0x80-0x83 */ + 0xE8, 0x94, 0xEE, 0xC4, 0xEE, 0xC5, 0xEE, 0xC6, /* 0x84-0x87 */ + 0xD5, 0xEB, 0xB6, 0xA4, 0xEE, 0xC8, 0xEE, 0xC7, /* 0x88-0x8B */ + 0xEE, 0xC9, 0xEE, 0xCA, 0xC7, 0xA5, 0xEE, 0xCB, /* 0x8C-0x8F */ + 0xEE, 0xCC, 0xE8, 0x95, 0xB7, 0xB0, 0xB5, 0xF6, /* 0x90-0x93 */ + 0xEE, 0xCD, 0xEE, 0xCF, 0xE8, 0x96, 0xEE, 0xCE, /* 0x94-0x97 */ + 0xE8, 0x97, 0xB8, 0xC6, 0xEE, 0xD0, 0xEE, 0xD1, /* 0x98-0x9B */ + 0xEE, 0xD2, 0xB6, 0xDB, 0xB3, 0xAE, 0xD6, 0xD3, /* 0x9C-0x9F */ + 0xC4, 0xC6, 0xB1, 0xB5, 0xB8, 0xD6, 0xEE, 0xD3, /* 0xA0-0xA3 */ + 0xEE, 0xD4, 0xD4, 0xBF, 0xC7, 0xD5, 0xBE, 0xFB, /* 0xA4-0xA7 */ + 0xCE, 0xD9, 0xB9, 0xB3, 0xEE, 0xD6, 0xEE, 0xD5, /* 0xA8-0xAB */ + 0xEE, 0xD8, 0xEE, 0xD7, 0xC5, 0xA5, 0xEE, 0xD9, /* 0xAC-0xAF */ + 0xEE, 0xDA, 0xC7, 0xAE, 0xEE, 0xDB, 0xC7, 0xAF, /* 0xB0-0xB3 */ + 0xEE, 0xDC, 0xB2, 0xA7, 0xEE, 0xDD, 0xEE, 0xDE, /* 0xB4-0xB7 */ + 0xEE, 0xDF, 0xEE, 0xE0, 0xEE, 0xE1, 0xD7, 0xEA, /* 0xB8-0xBB */ + 
0xEE, 0xE2, 0xEE, 0xE3, 0xBC, 0xD8, 0xEE, 0xE4, /* 0xBC-0xBF */ + 0xD3, 0xCB, 0xCC, 0xFA, 0xB2, 0xAC, 0xC1, 0xE5, /* 0xC0-0xC3 */ + 0xEE, 0xE5, 0xC7, 0xA6, 0xC3, 0xAD, 0xE8, 0x98, /* 0xC4-0xC7 */ + 0xEE, 0xE6, 0xEE, 0xE7, 0xEE, 0xE8, 0xEE, 0xE9, /* 0xC8-0xCB */ + 0xEE, 0xEA, 0xEE, 0xEB, 0xEE, 0xEC, 0xE8, 0x99, /* 0xCC-0xCF */ + 0xEE, 0xED, 0xEE, 0xEE, 0xEE, 0xEF, 0xE8, 0x9A, /* 0xD0-0xD3 */ + 0xE8, 0x9B, 0xEE, 0xF0, 0xEE, 0xF1, 0xEE, 0xF2, /* 0xD4-0xD7 */ + 0xEE, 0xF4, 0xEE, 0xF3, 0xE8, 0x9C, 0xEE, 0xF5, /* 0xD8-0xDB */ + 0xCD, 0xAD, 0xC2, 0xC1, 0xEE, 0xF6, 0xEE, 0xF7, /* 0xDC-0xDF */ + 0xEE, 0xF8, 0xD5, 0xA1, 0xEE, 0xF9, 0xCF, 0xB3, /* 0xE0-0xE3 */ + 0xEE, 0xFA, 0xEE, 0xFB, 0xE8, 0x9D, 0xEE, 0xFC, /* 0xE4-0xE7 */ + 0xEE, 0xFD, 0xEF, 0xA1, 0xEE, 0xFE, 0xEF, 0xA2, /* 0xE8-0xEB */ + 0xB8, 0xF5, 0xC3, 0xFA, 0xEF, 0xA3, 0xEF, 0xA4, /* 0xEC-0xEF */ + 0xBD, 0xC2, 0xD2, 0xBF, 0xB2, 0xF9, 0xEF, 0xA5, /* 0xF0-0xF3 */ + 0xEF, 0xA6, 0xEF, 0xA7, 0xD2, 0xF8, 0xEF, 0xA8, /* 0xF4-0xF7 */ + 0xD6, 0xFD, 0xEF, 0xA9, 0xC6, 0xCC, 0xE8, 0x9E, /* 0xF8-0xFB */ + 0xEF, 0xAA, 0xEF, 0xAB, 0xC1, 0xB4, 0xEF, 0xAC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_95[512] = { + 0xCF, 0xFA, 0xCB, 0xF8, 0xEF, 0xAE, 0xEF, 0xAD, /* 0x00-0x03 */ + 0xB3, 0xFA, 0xB9, 0xF8, 0xEF, 0xAF, 0xEF, 0xB0, /* 0x04-0x07 */ + 0xD0, 0xE2, 0xEF, 0xB1, 0xEF, 0xB2, 0xB7, 0xE6, /* 0x08-0x0B */ + 0xD0, 0xBF, 0xEF, 0xB3, 0xEF, 0xB4, 0xEF, 0xB5, /* 0x0C-0x0F */ + 0xC8, 0xF1, 0xCC, 0xE0, 0xEF, 0xB6, 0xEF, 0xB7, /* 0x10-0x13 */ + 0xEF, 0xB8, 0xEF, 0xB9, 0xEF, 0xBA, 0xD5, 0xE0, /* 0x14-0x17 */ + 0xEF, 0xBB, 0xB4, 0xED, 0xC3, 0xAA, 0xEF, 0xBC, /* 0x18-0x1B */ + 0xE8, 0x9F, 0xEF, 0xBD, 0xEF, 0xBE, 0xEF, 0xBF, /* 0x1C-0x1F */ + 0xE8, 0xA0, 0xCE, 0xFD, 0xEF, 0xC0, 0xC2, 0xE0, /* 0x20-0x23 */ + 0xB4, 0xB8, 0xD7, 0xB6, 0xBD, 0xF5, 0xE9, 0x40, /* 0x24-0x27 */ + 0xCF, 0xC7, 0xEF, 0xC3, 0xEF, 0xC1, 0xEF, 0xC2, /* 0x28-0x2B */ + 0xEF, 0xC4, 0xB6, 0xA7, 0xBC, 0xFC, 0xBE, 0xE2, /* 0x2C-0x2F */ + 0xC3, 0xCC, 0xEF, 0xC5, 0xEF, 0xC6, 0xE9, 0x41, /* 0x30-0x33 */ + 0xEF, 0xC7, 0xEF, 0xCF, 0xEF, 0xC8, 0xEF, 0xC9, /* 0x34-0x37 */ + 0xEF, 0xCA, 0xC7, 0xC2, 0xEF, 0xF1, 0xB6, 0xCD, /* 0x38-0x3B */ + 0xEF, 0xCB, 0xE9, 0x42, 0xEF, 0xCC, 0xEF, 0xCD, /* 0x3C-0x3F */ + 0xB6, 0xC6, 0xC3, 0xBE, 0xEF, 0xCE, 0xE9, 0x43, /* 0x40-0x43 */ + 0xEF, 0xD0, 0xEF, 0xD1, 0xEF, 0xD2, 0xD5, 0xF2, /* 0x44-0x47 */ + 0xE9, 0x44, 0xEF, 0xD3, 0xC4, 0xF7, 0xE9, 0x45, /* 0x48-0x4B */ + 0xEF, 0xD4, 0xC4, 0xF8, 0xEF, 0xD5, 0xEF, 0xD6, /* 0x4C-0x4F */ + 0xB8, 0xE4, 0xB0, 0xF7, 0xEF, 0xD7, 0xEF, 0xD8, /* 0x50-0x53 */ + 0xEF, 0xD9, 0xE9, 0x46, 0xEF, 0xDA, 0xEF, 0xDB, /* 0x54-0x57 */ + 0xEF, 0xDC, 0xEF, 0xDD, 0xE9, 0x47, 0xEF, 0xDE, /* 0x58-0x5B */ + 0xBE, 0xB5, 0xEF, 0xE1, 0xEF, 0xDF, 0xEF, 0xE0, /* 0x5C-0x5F */ + 0xE9, 0x48, 0xEF, 0xE2, 0xEF, 0xE3, 0xC1, 0xCD, /* 0x60-0x63 */ + 0xEF, 0xE4, 0xEF, 0xE5, 0xEF, 0xE6, 0xEF, 0xE7, /* 0x64-0x67 */ + 0xEF, 0xE8, 0xEF, 0xE9, 0xEF, 0xEA, 0xEF, 0xEB, /* 0x68-0x6B */ + 0xEF, 0xEC, 0xC0, 0xD8, 0xE9, 0x49, 0xEF, 0xED, /* 0x6C-0x6F */ + 0xC1, 0xAD, 0xEF, 0xEE, 0xEF, 0xEF, 0xEF, 0xF0, /* 0x70-0x73 */ + 0xE9, 0x4A, 0xE9, 0x4B, 0xCF, 0xE2, 0xE9, 0x4C, /* 0x74-0x77 */ + 0xE9, 0x4D, 0xE9, 0x4E, 0xE9, 0x4F, 0xE9, 0x50, /* 0x78-0x7B */ + 0xE9, 0x51, 0xE9, 0x52, 0xE9, 0x53, 0xB3, 0xA4, /* 0x7C-0x7F */ + + 0xE9, 0x54, 0xE9, 0x55, 0xE9, 0x56, 0xE9, 0x57, /* 0x80-0x83 */ + 0xE9, 0x58, 0xE9, 0x59, 0xE9, 0x5A, 0xE9, 0x5B, /* 0x84-0x87 */ + 0xE9, 0x5C, 0xE9, 0x5D, 0xE9, 0x5E, 0xE9, 0x5F, /* 0x88-0x8B */ + 0xE9, 0x60, 0xE9, 0x61, 0xE9, 0x62, 0xE9, 0x63, /* 0x8C-0x8F */ + 
0xE9, 0x64, 0xE9, 0x65, 0xE9, 0x66, 0xE9, 0x67, /* 0x90-0x93 */ + 0xE9, 0x68, 0xE9, 0x69, 0xE9, 0x6A, 0xE9, 0x6B, /* 0x94-0x97 */ + 0xE9, 0x6C, 0xE9, 0x6D, 0xE9, 0x6E, 0xE9, 0x6F, /* 0x98-0x9B */ + 0xE9, 0x70, 0xE9, 0x71, 0xE9, 0x72, 0xE9, 0x73, /* 0x9C-0x9F */ + 0xE9, 0x74, 0xE9, 0x75, 0xE9, 0x76, 0xE9, 0x77, /* 0xA0-0xA3 */ + 0xE9, 0x78, 0xE9, 0x79, 0xE9, 0x7A, 0xE9, 0x7B, /* 0xA4-0xA7 */ + 0xE9, 0x7C, 0xE9, 0x7D, 0xE9, 0x7E, 0xE9, 0x80, /* 0xA8-0xAB */ + 0xE9, 0x81, 0xE9, 0x82, 0xE9, 0x83, 0xE9, 0x84, /* 0xAC-0xAF */ + 0xE9, 0x85, 0xE9, 0x86, 0xE9, 0x87, 0xE9, 0x88, /* 0xB0-0xB3 */ + 0xE9, 0x89, 0xE9, 0x8A, 0xE9, 0x8B, 0xE9, 0x8C, /* 0xB4-0xB7 */ + 0xE9, 0x8D, 0xE9, 0x8E, 0xE9, 0x8F, 0xE9, 0x90, /* 0xB8-0xBB */ + 0xE9, 0x91, 0xE9, 0x92, 0xE9, 0x93, 0xE9, 0x94, /* 0xBC-0xBF */ + 0xE9, 0x95, 0xE9, 0x96, 0xE9, 0x97, 0xE9, 0x98, /* 0xC0-0xC3 */ + 0xE9, 0x99, 0xE9, 0x9A, 0xE9, 0x9B, 0xE9, 0x9C, /* 0xC4-0xC7 */ + 0xE9, 0x9D, 0xE9, 0x9E, 0xE9, 0x9F, 0xE9, 0xA0, /* 0xC8-0xCB */ + 0xEA, 0x40, 0xEA, 0x41, 0xEA, 0x42, 0xEA, 0x43, /* 0xCC-0xCF */ + 0xEA, 0x44, 0xEA, 0x45, 0xEA, 0x46, 0xEA, 0x47, /* 0xD0-0xD3 */ + 0xEA, 0x48, 0xEA, 0x49, 0xEA, 0x4A, 0xEA, 0x4B, /* 0xD4-0xD7 */ + 0xEA, 0x4C, 0xEA, 0x4D, 0xEA, 0x4E, 0xEA, 0x4F, /* 0xD8-0xDB */ + 0xEA, 0x50, 0xEA, 0x51, 0xEA, 0x52, 0xEA, 0x53, /* 0xDC-0xDF */ + 0xEA, 0x54, 0xEA, 0x55, 0xEA, 0x56, 0xEA, 0x57, /* 0xE0-0xE3 */ + 0xEA, 0x58, 0xEA, 0x59, 0xEA, 0x5A, 0xEA, 0x5B, /* 0xE4-0xE7 */ + 0xC3, 0xC5, 0xE3, 0xC5, 0xC9, 0xC1, 0xE3, 0xC6, /* 0xE8-0xEB */ + 0xEA, 0x5C, 0xB1, 0xD5, 0xCE, 0xCA, 0xB4, 0xB3, /* 0xEC-0xEF */ + 0xC8, 0xF2, 0xE3, 0xC7, 0xCF, 0xD0, 0xE3, 0xC8, /* 0xF0-0xF3 */ + 0xBC, 0xE4, 0xE3, 0xC9, 0xE3, 0xCA, 0xC3, 0xC6, /* 0xF4-0xF7 */ + 0xD5, 0xA2, 0xC4, 0xD6, 0xB9, 0xEB, 0xCE, 0xC5, /* 0xF8-0xFB */ + 0xE3, 0xCB, 0xC3, 0xF6, 0xE3, 0xCC, 0xEA, 0x5D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_96[512] = { + 0xB7, 0xA7, 0xB8, 0xF3, 0xBA, 0xD2, 0xE3, 0xCD, /* 0x00-0x03 */ + 0xE3, 0xCE, 0xD4, 0xC4, 0xE3, 0xCF, 0xEA, 0x5E, /* 0x04-0x07 */ + 0xE3, 0xD0, 0xD1, 0xCB, 0xE3, 0xD1, 0xE3, 0xD2, /* 0x08-0x0B */ + 0xE3, 0xD3, 0xE3, 0xD4, 0xD1, 0xD6, 0xE3, 0xD5, /* 0x0C-0x0F */ + 0xB2, 0xFB, 0xC0, 0xBB, 0xE3, 0xD6, 0xEA, 0x5F, /* 0x10-0x13 */ + 0xC0, 0xAB, 0xE3, 0xD7, 0xE3, 0xD8, 0xE3, 0xD9, /* 0x14-0x17 */ + 0xEA, 0x60, 0xE3, 0xDA, 0xE3, 0xDB, 0xEA, 0x61, /* 0x18-0x1B */ + 0xB8, 0xB7, 0xDA, 0xE2, 0xEA, 0x62, 0xB6, 0xD3, /* 0x1C-0x1F */ + 0xEA, 0x63, 0xDA, 0xE4, 0xDA, 0xE3, 0xEA, 0x64, /* 0x20-0x23 */ + 0xEA, 0x65, 0xEA, 0x66, 0xEA, 0x67, 0xEA, 0x68, /* 0x24-0x27 */ + 0xEA, 0x69, 0xEA, 0x6A, 0xDA, 0xE6, 0xEA, 0x6B, /* 0x28-0x2B */ + 0xEA, 0x6C, 0xEA, 0x6D, 0xC8, 0xEE, 0xEA, 0x6E, /* 0x2C-0x2F */ + 0xEA, 0x6F, 0xDA, 0xE5, 0xB7, 0xC0, 0xD1, 0xF4, /* 0x30-0x33 */ + 0xD2, 0xF5, 0xD5, 0xF3, 0xBD, 0xD7, 0xEA, 0x70, /* 0x34-0x37 */ + 0xEA, 0x71, 0xEA, 0x72, 0xEA, 0x73, 0xD7, 0xE8, /* 0x38-0x3B */ + 0xDA, 0xE8, 0xDA, 0xE7, 0xEA, 0x74, 0xB0, 0xA2, /* 0x3C-0x3F */ + 0xCD, 0xD3, 0xEA, 0x75, 0xDA, 0xE9, 0xEA, 0x76, /* 0x40-0x43 */ + 0xB8, 0xBD, 0xBC, 0xCA, 0xC2, 0xBD, 0xC2, 0xA4, /* 0x44-0x47 */ + 0xB3, 0xC2, 0xDA, 0xEA, 0xEA, 0x77, 0xC2, 0xAA, /* 0x48-0x4B */ + 0xC4, 0xB0, 0xBD, 0xB5, 0xEA, 0x78, 0xEA, 0x79, /* 0x4C-0x4F */ + 0xCF, 0xDE, 0xEA, 0x7A, 0xEA, 0x7B, 0xEA, 0x7C, /* 0x50-0x53 */ + 0xDA, 0xEB, 0xC9, 0xC2, 0xEA, 0x7D, 0xEA, 0x7E, /* 0x54-0x57 */ + 0xEA, 0x80, 0xEA, 0x81, 0xEA, 0x82, 0xB1, 0xDD, /* 0x58-0x5B */ + 0xEA, 0x83, 0xEA, 0x84, 0xEA, 0x85, 0xDA, 0xEC, /* 0x5C-0x5F */ + 0xEA, 0x86, 0xB6, 0xB8, 0xD4, 0xBA, 0xEA, 0x87, /* 0x60-0x63 */ + 0xB3, 
0xFD, 0xEA, 0x88, 0xEA, 0x89, 0xDA, 0xED, /* 0x64-0x67 */ + 0xD4, 0xC9, 0xCF, 0xD5, 0xC5, 0xE3, 0xEA, 0x8A, /* 0x68-0x6B */ + 0xDA, 0xEE, 0xEA, 0x8B, 0xEA, 0x8C, 0xEA, 0x8D, /* 0x6C-0x6F */ + 0xEA, 0x8E, 0xEA, 0x8F, 0xDA, 0xEF, 0xEA, 0x90, /* 0x70-0x73 */ + 0xDA, 0xF0, 0xC1, 0xEA, 0xCC, 0xD5, 0xCF, 0xDD, /* 0x74-0x77 */ + 0xEA, 0x91, 0xEA, 0x92, 0xEA, 0x93, 0xEA, 0x94, /* 0x78-0x7B */ + 0xEA, 0x95, 0xEA, 0x96, 0xEA, 0x97, 0xEA, 0x98, /* 0x7C-0x7F */ + + 0xEA, 0x99, 0xEA, 0x9A, 0xEA, 0x9B, 0xEA, 0x9C, /* 0x80-0x83 */ + 0xEA, 0x9D, 0xD3, 0xE7, 0xC2, 0xA1, 0xEA, 0x9E, /* 0x84-0x87 */ + 0xDA, 0xF1, 0xEA, 0x9F, 0xEA, 0xA0, 0xCB, 0xE5, /* 0x88-0x8B */ + 0xEB, 0x40, 0xDA, 0xF2, 0xEB, 0x41, 0xCB, 0xE6, /* 0x8C-0x8F */ + 0xD2, 0xFE, 0xEB, 0x42, 0xEB, 0x43, 0xEB, 0x44, /* 0x90-0x93 */ + 0xB8, 0xF4, 0xEB, 0x45, 0xEB, 0x46, 0xDA, 0xF3, /* 0x94-0x97 */ + 0xB0, 0xAF, 0xCF, 0xB6, 0xEB, 0x47, 0xEB, 0x48, /* 0x98-0x9B */ + 0xD5, 0xCF, 0xEB, 0x49, 0xEB, 0x4A, 0xEB, 0x4B, /* 0x9C-0x9F */ + 0xEB, 0x4C, 0xEB, 0x4D, 0xEB, 0x4E, 0xEB, 0x4F, /* 0xA0-0xA3 */ + 0xEB, 0x50, 0xEB, 0x51, 0xEB, 0x52, 0xCB, 0xED, /* 0xA4-0xA7 */ + 0xEB, 0x53, 0xEB, 0x54, 0xEB, 0x55, 0xEB, 0x56, /* 0xA8-0xAB */ + 0xEB, 0x57, 0xEB, 0x58, 0xEB, 0x59, 0xEB, 0x5A, /* 0xAC-0xAF */ + 0xDA, 0xF4, 0xEB, 0x5B, 0xEB, 0x5C, 0xE3, 0xC4, /* 0xB0-0xB3 */ + 0xEB, 0x5D, 0xEB, 0x5E, 0xC1, 0xA5, 0xEB, 0x5F, /* 0xB4-0xB7 */ + 0xEB, 0x60, 0xF6, 0xBF, 0xEB, 0x61, 0xEB, 0x62, /* 0xB8-0xBB */ + 0xF6, 0xC0, 0xF6, 0xC1, 0xC4, 0xD1, 0xEB, 0x63, /* 0xBC-0xBF */ + 0xC8, 0xB8, 0xD1, 0xE3, 0xEB, 0x64, 0xEB, 0x65, /* 0xC0-0xC3 */ + 0xD0, 0xDB, 0xD1, 0xC5, 0xBC, 0xAF, 0xB9, 0xCD, /* 0xC4-0xC7 */ + 0xEB, 0x66, 0xEF, 0xF4, 0xEB, 0x67, 0xEB, 0x68, /* 0xC8-0xCB */ + 0xB4, 0xC6, 0xD3, 0xBA, 0xF6, 0xC2, 0xB3, 0xFB, /* 0xCC-0xCF */ + 0xEB, 0x69, 0xEB, 0x6A, 0xF6, 0xC3, 0xEB, 0x6B, /* 0xD0-0xD3 */ + 0xEB, 0x6C, 0xB5, 0xF1, 0xEB, 0x6D, 0xEB, 0x6E, /* 0xD4-0xD7 */ + 0xEB, 0x6F, 0xEB, 0x70, 0xEB, 0x71, 0xEB, 0x72, /* 0xD8-0xDB */ + 0xEB, 0x73, 0xEB, 0x74, 0xEB, 0x75, 0xEB, 0x76, /* 0xDC-0xDF */ + 0xF6, 0xC5, 0xEB, 0x77, 0xEB, 0x78, 0xEB, 0x79, /* 0xE0-0xE3 */ + 0xEB, 0x7A, 0xEB, 0x7B, 0xEB, 0x7C, 0xEB, 0x7D, /* 0xE4-0xE7 */ + 0xD3, 0xEA, 0xF6, 0xA7, 0xD1, 0xA9, 0xEB, 0x7E, /* 0xE8-0xEB */ + 0xEB, 0x80, 0xEB, 0x81, 0xEB, 0x82, 0xF6, 0xA9, /* 0xEC-0xEF */ + 0xEB, 0x83, 0xEB, 0x84, 0xEB, 0x85, 0xF6, 0xA8, /* 0xF0-0xF3 */ + 0xEB, 0x86, 0xEB, 0x87, 0xC1, 0xE3, 0xC0, 0xD7, /* 0xF4-0xF7 */ + 0xEB, 0x88, 0xB1, 0xA2, 0xEB, 0x89, 0xEB, 0x8A, /* 0xF8-0xFB */ + 0xEB, 0x8B, 0xEB, 0x8C, 0xCE, 0xED, 0xEB, 0x8D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_97[512] = { + 0xD0, 0xE8, 0xF6, 0xAB, 0xEB, 0x8E, 0xEB, 0x8F, /* 0x00-0x03 */ + 0xCF, 0xF6, 0xEB, 0x90, 0xF6, 0xAA, 0xD5, 0xF0, /* 0x04-0x07 */ + 0xF6, 0xAC, 0xC3, 0xB9, 0xEB, 0x91, 0xEB, 0x92, /* 0x08-0x0B */ + 0xEB, 0x93, 0xBB, 0xF4, 0xF6, 0xAE, 0xF6, 0xAD, /* 0x0C-0x0F */ + 0xEB, 0x94, 0xEB, 0x95, 0xEB, 0x96, 0xC4, 0xDE, /* 0x10-0x13 */ + 0xEB, 0x97, 0xEB, 0x98, 0xC1, 0xD8, 0xEB, 0x99, /* 0x14-0x17 */ + 0xEB, 0x9A, 0xEB, 0x9B, 0xEB, 0x9C, 0xEB, 0x9D, /* 0x18-0x1B */ + 0xCB, 0xAA, 0xEB, 0x9E, 0xCF, 0xBC, 0xEB, 0x9F, /* 0x1C-0x1F */ + 0xEB, 0xA0, 0xEC, 0x40, 0xEC, 0x41, 0xEC, 0x42, /* 0x20-0x23 */ + 0xEC, 0x43, 0xEC, 0x44, 0xEC, 0x45, 0xEC, 0x46, /* 0x24-0x27 */ + 0xEC, 0x47, 0xEC, 0x48, 0xF6, 0xAF, 0xEC, 0x49, /* 0x28-0x2B */ + 0xEC, 0x4A, 0xF6, 0xB0, 0xEC, 0x4B, 0xEC, 0x4C, /* 0x2C-0x2F */ + 0xF6, 0xB1, 0xEC, 0x4D, 0xC2, 0xB6, 0xEC, 0x4E, /* 0x30-0x33 */ + 0xEC, 0x4F, 0xEC, 0x50, 0xEC, 0x51, 0xEC, 0x52, /* 0x34-0x37 */ + 0xB0, 
0xD4, 0xC5, 0xF9, 0xEC, 0x53, 0xEC, 0x54, /* 0x38-0x3B */ + 0xEC, 0x55, 0xEC, 0x56, 0xF6, 0xB2, 0xEC, 0x57, /* 0x3C-0x3F */ + 0xEC, 0x58, 0xEC, 0x59, 0xEC, 0x5A, 0xEC, 0x5B, /* 0x40-0x43 */ + 0xEC, 0x5C, 0xEC, 0x5D, 0xEC, 0x5E, 0xEC, 0x5F, /* 0x44-0x47 */ + 0xEC, 0x60, 0xEC, 0x61, 0xEC, 0x62, 0xEC, 0x63, /* 0x48-0x4B */ + 0xEC, 0x64, 0xEC, 0x65, 0xEC, 0x66, 0xEC, 0x67, /* 0x4C-0x4F */ + 0xEC, 0x68, 0xEC, 0x69, 0xC7, 0xE0, 0xF6, 0xA6, /* 0x50-0x53 */ + 0xEC, 0x6A, 0xEC, 0x6B, 0xBE, 0xB8, 0xEC, 0x6C, /* 0x54-0x57 */ + 0xEC, 0x6D, 0xBE, 0xB2, 0xEC, 0x6E, 0xB5, 0xE5, /* 0x58-0x5B */ + 0xEC, 0x6F, 0xEC, 0x70, 0xB7, 0xC7, 0xEC, 0x71, /* 0x5C-0x5F */ + 0xBF, 0xBF, 0xC3, 0xD2, 0xC3, 0xE6, 0xEC, 0x72, /* 0x60-0x63 */ + 0xEC, 0x73, 0xD8, 0xCC, 0xEC, 0x74, 0xEC, 0x75, /* 0x64-0x67 */ + 0xEC, 0x76, 0xB8, 0xEF, 0xEC, 0x77, 0xEC, 0x78, /* 0x68-0x6B */ + 0xEC, 0x79, 0xEC, 0x7A, 0xEC, 0x7B, 0xEC, 0x7C, /* 0x6C-0x6F */ + 0xEC, 0x7D, 0xEC, 0x7E, 0xEC, 0x80, 0xBD, 0xF9, /* 0x70-0x73 */ + 0xD1, 0xA5, 0xEC, 0x81, 0xB0, 0xD0, 0xEC, 0x82, /* 0x74-0x77 */ + 0xEC, 0x83, 0xEC, 0x84, 0xEC, 0x85, 0xEC, 0x86, /* 0x78-0x7B */ + 0xF7, 0xB0, 0xEC, 0x87, 0xEC, 0x88, 0xEC, 0x89, /* 0x7C-0x7F */ + + 0xEC, 0x8A, 0xEC, 0x8B, 0xEC, 0x8C, 0xEC, 0x8D, /* 0x80-0x83 */ + 0xEC, 0x8E, 0xF7, 0xB1, 0xEC, 0x8F, 0xEC, 0x90, /* 0x84-0x87 */ + 0xEC, 0x91, 0xEC, 0x92, 0xEC, 0x93, 0xD0, 0xAC, /* 0x88-0x8B */ + 0xEC, 0x94, 0xB0, 0xB0, 0xEC, 0x95, 0xEC, 0x96, /* 0x8C-0x8F */ + 0xEC, 0x97, 0xF7, 0xB2, 0xF7, 0xB3, 0xEC, 0x98, /* 0x90-0x93 */ + 0xF7, 0xB4, 0xEC, 0x99, 0xEC, 0x9A, 0xEC, 0x9B, /* 0x94-0x97 */ + 0xC7, 0xCA, 0xEC, 0x9C, 0xEC, 0x9D, 0xEC, 0x9E, /* 0x98-0x9B */ + 0xEC, 0x9F, 0xEC, 0xA0, 0xED, 0x40, 0xED, 0x41, /* 0x9C-0x9F */ + 0xBE, 0xCF, 0xED, 0x42, 0xED, 0x43, 0xF7, 0xB7, /* 0xA0-0xA3 */ + 0xED, 0x44, 0xED, 0x45, 0xED, 0x46, 0xED, 0x47, /* 0xA4-0xA7 */ + 0xED, 0x48, 0xED, 0x49, 0xED, 0x4A, 0xF7, 0xB6, /* 0xA8-0xAB */ + 0xED, 0x4B, 0xB1, 0xDE, 0xED, 0x4C, 0xF7, 0xB5, /* 0xAC-0xAF */ + 0xED, 0x4D, 0xED, 0x4E, 0xF7, 0xB8, 0xED, 0x4F, /* 0xB0-0xB3 */ + 0xF7, 0xB9, 0xED, 0x50, 0xED, 0x51, 0xED, 0x52, /* 0xB4-0xB7 */ + 0xED, 0x53, 0xED, 0x54, 0xED, 0x55, 0xED, 0x56, /* 0xB8-0xBB */ + 0xED, 0x57, 0xED, 0x58, 0xED, 0x59, 0xED, 0x5A, /* 0xBC-0xBF */ + 0xED, 0x5B, 0xED, 0x5C, 0xED, 0x5D, 0xED, 0x5E, /* 0xC0-0xC3 */ + 0xED, 0x5F, 0xED, 0x60, 0xED, 0x61, 0xED, 0x62, /* 0xC4-0xC7 */ + 0xED, 0x63, 0xED, 0x64, 0xED, 0x65, 0xED, 0x66, /* 0xC8-0xCB */ + 0xED, 0x67, 0xED, 0x68, 0xED, 0x69, 0xED, 0x6A, /* 0xCC-0xCF */ + 0xED, 0x6B, 0xED, 0x6C, 0xED, 0x6D, 0xED, 0x6E, /* 0xD0-0xD3 */ + 0xED, 0x6F, 0xED, 0x70, 0xED, 0x71, 0xED, 0x72, /* 0xD4-0xD7 */ + 0xED, 0x73, 0xED, 0x74, 0xED, 0x75, 0xED, 0x76, /* 0xD8-0xDB */ + 0xED, 0x77, 0xED, 0x78, 0xED, 0x79, 0xED, 0x7A, /* 0xDC-0xDF */ + 0xED, 0x7B, 0xED, 0x7C, 0xED, 0x7D, 0xED, 0x7E, /* 0xE0-0xE3 */ + 0xED, 0x80, 0xED, 0x81, 0xCE, 0xA4, 0xC8, 0xCD, /* 0xE4-0xE7 */ + 0xED, 0x82, 0xBA, 0xAB, 0xE8, 0xB8, 0xE8, 0xB9, /* 0xE8-0xEB */ + 0xE8, 0xBA, 0xBE, 0xC2, 0xED, 0x83, 0xED, 0x84, /* 0xEC-0xEF */ + 0xED, 0x85, 0xED, 0x86, 0xED, 0x87, 0xD2, 0xF4, /* 0xF0-0xF3 */ + 0xED, 0x88, 0xD4, 0xCF, 0xC9, 0xD8, 0xED, 0x89, /* 0xF4-0xF7 */ + 0xED, 0x8A, 0xED, 0x8B, 0xED, 0x8C, 0xED, 0x8D, /* 0xF8-0xFB */ + 0xED, 0x8E, 0xED, 0x8F, 0xED, 0x90, 0xED, 0x91, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0xED, 0x92, 0xED, 0x93, 0xED, 0x94, 0xED, 0x95, /* 0x00-0x03 */ + 0xED, 0x96, 0xED, 0x97, 0xED, 0x98, 0xED, 0x99, /* 0x04-0x07 */ + 0xED, 0x9A, 0xED, 0x9B, 0xED, 0x9C, 0xED, 0x9D, /* 0x08-0x0B */ + 0xED, 
0x9E, 0xED, 0x9F, 0xED, 0xA0, 0xEE, 0x40, /* 0x0C-0x0F */ + 0xEE, 0x41, 0xEE, 0x42, 0xEE, 0x43, 0xEE, 0x44, /* 0x10-0x13 */ + 0xEE, 0x45, 0xEE, 0x46, 0xEE, 0x47, 0xEE, 0x48, /* 0x14-0x17 */ + 0xEE, 0x49, 0xEE, 0x4A, 0xEE, 0x4B, 0xEE, 0x4C, /* 0x18-0x1B */ + 0xEE, 0x4D, 0xEE, 0x4E, 0xEE, 0x4F, 0xEE, 0x50, /* 0x1C-0x1F */ + 0xEE, 0x51, 0xEE, 0x52, 0xEE, 0x53, 0xEE, 0x54, /* 0x20-0x23 */ + 0xEE, 0x55, 0xEE, 0x56, 0xEE, 0x57, 0xEE, 0x58, /* 0x24-0x27 */ + 0xEE, 0x59, 0xEE, 0x5A, 0xEE, 0x5B, 0xEE, 0x5C, /* 0x28-0x2B */ + 0xEE, 0x5D, 0xEE, 0x5E, 0xEE, 0x5F, 0xEE, 0x60, /* 0x2C-0x2F */ + 0xEE, 0x61, 0xEE, 0x62, 0xEE, 0x63, 0xEE, 0x64, /* 0x30-0x33 */ + 0xEE, 0x65, 0xEE, 0x66, 0xEE, 0x67, 0xEE, 0x68, /* 0x34-0x37 */ + 0xEE, 0x69, 0xEE, 0x6A, 0xEE, 0x6B, 0xEE, 0x6C, /* 0x38-0x3B */ + 0xEE, 0x6D, 0xEE, 0x6E, 0xEE, 0x6F, 0xEE, 0x70, /* 0x3C-0x3F */ + 0xEE, 0x71, 0xEE, 0x72, 0xEE, 0x73, 0xEE, 0x74, /* 0x40-0x43 */ + 0xEE, 0x75, 0xEE, 0x76, 0xEE, 0x77, 0xEE, 0x78, /* 0x44-0x47 */ + 0xEE, 0x79, 0xEE, 0x7A, 0xEE, 0x7B, 0xEE, 0x7C, /* 0x48-0x4B */ + 0xEE, 0x7D, 0xEE, 0x7E, 0xEE, 0x80, 0xEE, 0x81, /* 0x4C-0x4F */ + 0xEE, 0x82, 0xEE, 0x83, 0xEE, 0x84, 0xEE, 0x85, /* 0x50-0x53 */ + 0xEE, 0x86, 0xEE, 0x87, 0xEE, 0x88, 0xEE, 0x89, /* 0x54-0x57 */ + 0xEE, 0x8A, 0xEE, 0x8B, 0xEE, 0x8C, 0xEE, 0x8D, /* 0x58-0x5B */ + 0xEE, 0x8E, 0xEE, 0x8F, 0xEE, 0x90, 0xEE, 0x91, /* 0x5C-0x5F */ + 0xEE, 0x92, 0xEE, 0x93, 0xEE, 0x94, 0xEE, 0x95, /* 0x60-0x63 */ + 0xEE, 0x96, 0xEE, 0x97, 0xEE, 0x98, 0xEE, 0x99, /* 0x64-0x67 */ + 0xEE, 0x9A, 0xEE, 0x9B, 0xEE, 0x9C, 0xEE, 0x9D, /* 0x68-0x6B */ + 0xEE, 0x9E, 0xEE, 0x9F, 0xEE, 0xA0, 0xEF, 0x40, /* 0x6C-0x6F */ + 0xEF, 0x41, 0xEF, 0x42, 0xEF, 0x43, 0xEF, 0x44, /* 0x70-0x73 */ + 0xEF, 0x45, 0xD2, 0xB3, 0xB6, 0xA5, 0xC7, 0xEA, /* 0x74-0x77 */ + 0xF1, 0xFC, 0xCF, 0xEE, 0xCB, 0xB3, 0xD0, 0xEB, /* 0x78-0x7B */ + 0xE7, 0xEF, 0xCD, 0xE7, 0xB9, 0xCB, 0xB6, 0xD9, /* 0x7C-0x7F */ + + 0xF1, 0xFD, 0xB0, 0xE4, 0xCB, 0xCC, 0xF1, 0xFE, /* 0x80-0x83 */ + 0xD4, 0xA4, 0xC2, 0xAD, 0xC1, 0xEC, 0xC6, 0xC4, /* 0x84-0x87 */ + 0xBE, 0xB1, 0xF2, 0xA1, 0xBC, 0xD5, 0xEF, 0x46, /* 0x88-0x8B */ + 0xF2, 0xA2, 0xF2, 0xA3, 0xEF, 0x47, 0xF2, 0xA4, /* 0x8C-0x8F */ + 0xD2, 0xC3, 0xC6, 0xB5, 0xEF, 0x48, 0xCD, 0xC7, /* 0x90-0x93 */ + 0xF2, 0xA5, 0xEF, 0x49, 0xD3, 0xB1, 0xBF, 0xC5, /* 0x94-0x97 */ + 0xCC, 0xE2, 0xEF, 0x4A, 0xF2, 0xA6, 0xF2, 0xA7, /* 0x98-0x9B */ + 0xD1, 0xD5, 0xB6, 0xEE, 0xF2, 0xA8, 0xF2, 0xA9, /* 0x9C-0x9F */ + 0xB5, 0xDF, 0xF2, 0xAA, 0xF2, 0xAB, 0xEF, 0x4B, /* 0xA0-0xA3 */ + 0xB2, 0xFC, 0xF2, 0xAC, 0xF2, 0xAD, 0xC8, 0xA7, /* 0xA4-0xA7 */ + 0xEF, 0x4C, 0xEF, 0x4D, 0xEF, 0x4E, 0xEF, 0x4F, /* 0xA8-0xAB */ + 0xEF, 0x50, 0xEF, 0x51, 0xEF, 0x52, 0xEF, 0x53, /* 0xAC-0xAF */ + 0xEF, 0x54, 0xEF, 0x55, 0xEF, 0x56, 0xEF, 0x57, /* 0xB0-0xB3 */ + 0xEF, 0x58, 0xEF, 0x59, 0xEF, 0x5A, 0xEF, 0x5B, /* 0xB4-0xB7 */ + 0xEF, 0x5C, 0xEF, 0x5D, 0xEF, 0x5E, 0xEF, 0x5F, /* 0xB8-0xBB */ + 0xEF, 0x60, 0xEF, 0x61, 0xEF, 0x62, 0xEF, 0x63, /* 0xBC-0xBF */ + 0xEF, 0x64, 0xEF, 0x65, 0xEF, 0x66, 0xEF, 0x67, /* 0xC0-0xC3 */ + 0xEF, 0x68, 0xEF, 0x69, 0xEF, 0x6A, 0xEF, 0x6B, /* 0xC4-0xC7 */ + 0xEF, 0x6C, 0xEF, 0x6D, 0xEF, 0x6E, 0xEF, 0x6F, /* 0xC8-0xCB */ + 0xEF, 0x70, 0xEF, 0x71, 0xB7, 0xE7, 0xEF, 0x72, /* 0xCC-0xCF */ + 0xEF, 0x73, 0xEC, 0xA9, 0xEC, 0xAA, 0xEC, 0xAB, /* 0xD0-0xD3 */ + 0xEF, 0x74, 0xEC, 0xAC, 0xEF, 0x75, 0xEF, 0x76, /* 0xD4-0xD7 */ + 0xC6, 0xAE, 0xEC, 0xAD, 0xEC, 0xAE, 0xEF, 0x77, /* 0xD8-0xDB */ + 0xEF, 0x78, 0xEF, 0x79, 0xB7, 0xC9, 0xCA, 0xB3, /* 0xDC-0xDF */ + 0xEF, 0x7A, 0xEF, 0x7B, 0xEF, 0x7C, 0xEF, 0x7D, /* 0xE0-0xE3 
*/ + 0xEF, 0x7E, 0xEF, 0x80, 0xEF, 0x81, 0xE2, 0xB8, /* 0xE4-0xE7 */ + 0xF7, 0xCF, 0xEF, 0x82, 0xEF, 0x83, 0xEF, 0x84, /* 0xE8-0xEB */ + 0xEF, 0x85, 0xEF, 0x86, 0xEF, 0x87, 0xEF, 0x88, /* 0xEC-0xEF */ + 0xEF, 0x89, 0xEF, 0x8A, 0xEF, 0x8B, 0xEF, 0x8C, /* 0xF0-0xF3 */ + 0xEF, 0x8D, 0xEF, 0x8E, 0xEF, 0x8F, 0xEF, 0x90, /* 0xF4-0xF7 */ + 0xEF, 0x91, 0xEF, 0x92, 0xEF, 0x93, 0xEF, 0x94, /* 0xF8-0xFB */ + 0xEF, 0x95, 0xEF, 0x96, 0xEF, 0x97, 0xEF, 0x98, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0xEF, 0x99, 0xEF, 0x9A, 0xEF, 0x9B, 0xEF, 0x9C, /* 0x00-0x03 */ + 0xEF, 0x9D, 0xEF, 0x9E, 0xEF, 0x9F, 0xEF, 0xA0, /* 0x04-0x07 */ + 0xF0, 0x40, 0xF0, 0x41, 0xF0, 0x42, 0xF0, 0x43, /* 0x08-0x0B */ + 0xF0, 0x44, 0xF7, 0xD0, 0xF0, 0x45, 0xF0, 0x46, /* 0x0C-0x0F */ + 0xB2, 0xCD, 0xF0, 0x47, 0xF0, 0x48, 0xF0, 0x49, /* 0x10-0x13 */ + 0xF0, 0x4A, 0xF0, 0x4B, 0xF0, 0x4C, 0xF0, 0x4D, /* 0x14-0x17 */ + 0xF0, 0x4E, 0xF0, 0x4F, 0xF0, 0x50, 0xF0, 0x51, /* 0x18-0x1B */ + 0xF0, 0x52, 0xF0, 0x53, 0xF0, 0x54, 0xF0, 0x55, /* 0x1C-0x1F */ + 0xF0, 0x56, 0xF0, 0x57, 0xF0, 0x58, 0xF0, 0x59, /* 0x20-0x23 */ + 0xF0, 0x5A, 0xF0, 0x5B, 0xF0, 0x5C, 0xF0, 0x5D, /* 0x24-0x27 */ + 0xF0, 0x5E, 0xF0, 0x5F, 0xF0, 0x60, 0xF0, 0x61, /* 0x28-0x2B */ + 0xF0, 0x62, 0xF0, 0x63, 0xF7, 0xD1, 0xF0, 0x64, /* 0x2C-0x2F */ + 0xF0, 0x65, 0xF0, 0x66, 0xF0, 0x67, 0xF0, 0x68, /* 0x30-0x33 */ + 0xF0, 0x69, 0xF0, 0x6A, 0xF0, 0x6B, 0xF0, 0x6C, /* 0x34-0x37 */ + 0xF0, 0x6D, 0xF0, 0x6E, 0xF0, 0x6F, 0xF0, 0x70, /* 0x38-0x3B */ + 0xF0, 0x71, 0xF0, 0x72, 0xF0, 0x73, 0xF0, 0x74, /* 0x3C-0x3F */ + 0xF0, 0x75, 0xF0, 0x76, 0xF0, 0x77, 0xF0, 0x78, /* 0x40-0x43 */ + 0xF0, 0x79, 0xF0, 0x7A, 0xF0, 0x7B, 0xF0, 0x7C, /* 0x44-0x47 */ + 0xF0, 0x7D, 0xF0, 0x7E, 0xF0, 0x80, 0xF0, 0x81, /* 0x48-0x4B */ + 0xF0, 0x82, 0xF0, 0x83, 0xF0, 0x84, 0xF0, 0x85, /* 0x4C-0x4F */ + 0xF0, 0x86, 0xF0, 0x87, 0xF0, 0x88, 0xF0, 0x89, /* 0x50-0x53 */ + 0xF7, 0xD3, 0xF7, 0xD2, 0xF0, 0x8A, 0xF0, 0x8B, /* 0x54-0x57 */ + 0xF0, 0x8C, 0xF0, 0x8D, 0xF0, 0x8E, 0xF0, 0x8F, /* 0x58-0x5B */ + 0xF0, 0x90, 0xF0, 0x91, 0xF0, 0x92, 0xF0, 0x93, /* 0x5C-0x5F */ + 0xF0, 0x94, 0xF0, 0x95, 0xF0, 0x96, 0xE2, 0xBB, /* 0x60-0x63 */ + 0xF0, 0x97, 0xBC, 0xA2, 0xF0, 0x98, 0xE2, 0xBC, /* 0x64-0x67 */ + 0xE2, 0xBD, 0xE2, 0xBE, 0xE2, 0xBF, 0xE2, 0xC0, /* 0x68-0x6B */ + 0xE2, 0xC1, 0xB7, 0xB9, 0xD2, 0xFB, 0xBD, 0xA4, /* 0x6C-0x6F */ + 0xCA, 0xCE, 0xB1, 0xA5, 0xCB, 0xC7, 0xF0, 0x99, /* 0x70-0x73 */ + 0xE2, 0xC2, 0xB6, 0xFC, 0xC8, 0xC4, 0xE2, 0xC3, /* 0x74-0x77 */ + 0xF0, 0x9A, 0xF0, 0x9B, 0xBD, 0xC8, 0xF0, 0x9C, /* 0x78-0x7B */ + 0xB1, 0xFD, 0xE2, 0xC4, 0xF0, 0x9D, 0xB6, 0xF6, /* 0x7C-0x7F */ + + 0xE2, 0xC5, 0xC4, 0xD9, 0xF0, 0x9E, 0xF0, 0x9F, /* 0x80-0x83 */ + 0xE2, 0xC6, 0xCF, 0xDA, 0xB9, 0xDD, 0xE2, 0xC7, /* 0x84-0x87 */ + 0xC0, 0xA1, 0xF0, 0xA0, 0xE2, 0xC8, 0xB2, 0xF6, /* 0x88-0x8B */ + 0xF1, 0x40, 0xE2, 0xC9, 0xF1, 0x41, 0xC1, 0xF3, /* 0x8C-0x8F */ + 0xE2, 0xCA, 0xE2, 0xCB, 0xC2, 0xF8, 0xE2, 0xCC, /* 0x90-0x93 */ + 0xE2, 0xCD, 0xE2, 0xCE, 0xCA, 0xD7, 0xD8, 0xB8, /* 0x94-0x97 */ + 0xD9, 0xE5, 0xCF, 0xE3, 0xF1, 0x42, 0xF1, 0x43, /* 0x98-0x9B */ + 0xF1, 0x44, 0xF1, 0x45, 0xF1, 0x46, 0xF1, 0x47, /* 0x9C-0x9F */ + 0xF1, 0x48, 0xF1, 0x49, 0xF1, 0x4A, 0xF1, 0x4B, /* 0xA0-0xA3 */ + 0xF1, 0x4C, 0xF0, 0xA5, 0xF1, 0x4D, 0xF1, 0x4E, /* 0xA4-0xA7 */ + 0xDC, 0xB0, 0xF1, 0x4F, 0xF1, 0x50, 0xF1, 0x51, /* 0xA8-0xAB */ + 0xF1, 0x52, 0xF1, 0x53, 0xF1, 0x54, 0xF1, 0x55, /* 0xAC-0xAF */ + 0xF1, 0x56, 0xF1, 0x57, 0xF1, 0x58, 0xF1, 0x59, /* 0xB0-0xB3 */ + 0xF1, 0x5A, 0xF1, 0x5B, 0xF1, 0x5C, 0xF1, 0x5D, /* 0xB4-0xB7 */ + 
0xF1, 0x5E, 0xF1, 0x5F, 0xF1, 0x60, 0xF1, 0x61, /* 0xB8-0xBB */ + 0xF1, 0x62, 0xF1, 0x63, 0xF1, 0x64, 0xF1, 0x65, /* 0xBC-0xBF */ + 0xF1, 0x66, 0xF1, 0x67, 0xF1, 0x68, 0xF1, 0x69, /* 0xC0-0xC3 */ + 0xF1, 0x6A, 0xF1, 0x6B, 0xF1, 0x6C, 0xF1, 0x6D, /* 0xC4-0xC7 */ + 0xF1, 0x6E, 0xF1, 0x6F, 0xF1, 0x70, 0xF1, 0x71, /* 0xC8-0xCB */ + 0xF1, 0x72, 0xF1, 0x73, 0xF1, 0x74, 0xF1, 0x75, /* 0xCC-0xCF */ + 0xF1, 0x76, 0xF1, 0x77, 0xF1, 0x78, 0xF1, 0x79, /* 0xD0-0xD3 */ + 0xF1, 0x7A, 0xF1, 0x7B, 0xF1, 0x7C, 0xF1, 0x7D, /* 0xD4-0xD7 */ + 0xF1, 0x7E, 0xF1, 0x80, 0xF1, 0x81, 0xF1, 0x82, /* 0xD8-0xDB */ + 0xF1, 0x83, 0xF1, 0x84, 0xF1, 0x85, 0xF1, 0x86, /* 0xDC-0xDF */ + 0xF1, 0x87, 0xF1, 0x88, 0xF1, 0x89, 0xF1, 0x8A, /* 0xE0-0xE3 */ + 0xF1, 0x8B, 0xF1, 0x8C, 0xF1, 0x8D, 0xF1, 0x8E, /* 0xE4-0xE7 */ + 0xF1, 0x8F, 0xF1, 0x90, 0xF1, 0x91, 0xF1, 0x92, /* 0xE8-0xEB */ + 0xF1, 0x93, 0xF1, 0x94, 0xF1, 0x95, 0xF1, 0x96, /* 0xEC-0xEF */ + 0xF1, 0x97, 0xF1, 0x98, 0xF1, 0x99, 0xF1, 0x9A, /* 0xF0-0xF3 */ + 0xF1, 0x9B, 0xF1, 0x9C, 0xF1, 0x9D, 0xF1, 0x9E, /* 0xF4-0xF7 */ + 0xF1, 0x9F, 0xF1, 0xA0, 0xF2, 0x40, 0xF2, 0x41, /* 0xF8-0xFB */ + 0xF2, 0x42, 0xF2, 0x43, 0xF2, 0x44, 0xF2, 0x45, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0xF2, 0x46, 0xF2, 0x47, 0xF2, 0x48, 0xF2, 0x49, /* 0x00-0x03 */ + 0xF2, 0x4A, 0xF2, 0x4B, 0xF2, 0x4C, 0xF2, 0x4D, /* 0x04-0x07 */ + 0xF2, 0x4E, 0xF2, 0x4F, 0xF2, 0x50, 0xF2, 0x51, /* 0x08-0x0B */ + 0xF2, 0x52, 0xF2, 0x53, 0xF2, 0x54, 0xF2, 0x55, /* 0x0C-0x0F */ + 0xF2, 0x56, 0xF2, 0x57, 0xF2, 0x58, 0xF2, 0x59, /* 0x10-0x13 */ + 0xF2, 0x5A, 0xF2, 0x5B, 0xF2, 0x5C, 0xF2, 0x5D, /* 0x14-0x17 */ + 0xF2, 0x5E, 0xF2, 0x5F, 0xF2, 0x60, 0xF2, 0x61, /* 0x18-0x1B */ + 0xF2, 0x62, 0xF2, 0x63, 0xF2, 0x64, 0xF2, 0x65, /* 0x1C-0x1F */ + 0xF2, 0x66, 0xF2, 0x67, 0xF2, 0x68, 0xF2, 0x69, /* 0x20-0x23 */ + 0xF2, 0x6A, 0xF2, 0x6B, 0xF2, 0x6C, 0xF2, 0x6D, /* 0x24-0x27 */ + 0xF2, 0x6E, 0xF2, 0x6F, 0xF2, 0x70, 0xF2, 0x71, /* 0x28-0x2B */ + 0xF2, 0x72, 0xF2, 0x73, 0xF2, 0x74, 0xF2, 0x75, /* 0x2C-0x2F */ + 0xF2, 0x76, 0xF2, 0x77, 0xF2, 0x78, 0xF2, 0x79, /* 0x30-0x33 */ + 0xF2, 0x7A, 0xF2, 0x7B, 0xF2, 0x7C, 0xF2, 0x7D, /* 0x34-0x37 */ + 0xF2, 0x7E, 0xF2, 0x80, 0xF2, 0x81, 0xF2, 0x82, /* 0x38-0x3B */ + 0xF2, 0x83, 0xF2, 0x84, 0xF2, 0x85, 0xF2, 0x86, /* 0x3C-0x3F */ + 0xF2, 0x87, 0xF2, 0x88, 0xF2, 0x89, 0xF2, 0x8A, /* 0x40-0x43 */ + 0xF2, 0x8B, 0xF2, 0x8C, 0xF2, 0x8D, 0xF2, 0x8E, /* 0x44-0x47 */ + 0xF2, 0x8F, 0xF2, 0x90, 0xF2, 0x91, 0xF2, 0x92, /* 0x48-0x4B */ + 0xF2, 0x93, 0xF2, 0x94, 0xF2, 0x95, 0xF2, 0x96, /* 0x4C-0x4F */ + 0xF2, 0x97, 0xF2, 0x98, 0xF2, 0x99, 0xF2, 0x9A, /* 0x50-0x53 */ + 0xF2, 0x9B, 0xF2, 0x9C, 0xF2, 0x9D, 0xF2, 0x9E, /* 0x54-0x57 */ + 0xF2, 0x9F, 0xF2, 0xA0, 0xF3, 0x40, 0xF3, 0x41, /* 0x58-0x5B */ + 0xF3, 0x42, 0xF3, 0x43, 0xF3, 0x44, 0xF3, 0x45, /* 0x5C-0x5F */ + 0xF3, 0x46, 0xF3, 0x47, 0xF3, 0x48, 0xF3, 0x49, /* 0x60-0x63 */ + 0xF3, 0x4A, 0xF3, 0x4B, 0xF3, 0x4C, 0xF3, 0x4D, /* 0x64-0x67 */ + 0xF3, 0x4E, 0xF3, 0x4F, 0xF3, 0x50, 0xF3, 0x51, /* 0x68-0x6B */ + 0xC2, 0xED, 0xD4, 0xA6, 0xCD, 0xD4, 0xD1, 0xB1, /* 0x6C-0x6F */ + 0xB3, 0xDB, 0xC7, 0xFD, 0xF3, 0x52, 0xB2, 0xB5, /* 0x70-0x73 */ + 0xC2, 0xBF, 0xE6, 0xE0, 0xCA, 0xBB, 0xE6, 0xE1, /* 0x74-0x77 */ + 0xE6, 0xE2, 0xBE, 0xD4, 0xE6, 0xE3, 0xD7, 0xA4, /* 0x78-0x7B */ + 0xCD, 0xD5, 0xE6, 0xE5, 0xBC, 0xDD, 0xE6, 0xE4, /* 0x7C-0x7F */ + + 0xE6, 0xE6, 0xE6, 0xE7, 0xC2, 0xEE, 0xF3, 0x53, /* 0x80-0x83 */ + 0xBD, 0xBE, 0xE6, 0xE8, 0xC2, 0xE6, 0xBA, 0xA7, /* 0x84-0x87 */ + 0xE6, 0xE9, 0xF3, 0x54, 0xE6, 0xEA, 0xB3, 0xD2, /* 0x88-0x8B */ + 
0xD1, 0xE9, 0xF3, 0x55, 0xF3, 0x56, 0xBF, 0xA5, /* 0x8C-0x8F */ + 0xE6, 0xEB, 0xC6, 0xEF, 0xE6, 0xEC, 0xE6, 0xED, /* 0x90-0x93 */ + 0xF3, 0x57, 0xF3, 0x58, 0xE6, 0xEE, 0xC6, 0xAD, /* 0x94-0x97 */ + 0xE6, 0xEF, 0xF3, 0x59, 0xC9, 0xA7, 0xE6, 0xF0, /* 0x98-0x9B */ + 0xE6, 0xF1, 0xE6, 0xF2, 0xE5, 0xB9, 0xE6, 0xF3, /* 0x9C-0x9F */ + 0xE6, 0xF4, 0xC2, 0xE2, 0xE6, 0xF5, 0xE6, 0xF6, /* 0xA0-0xA3 */ + 0xD6, 0xE8, 0xE6, 0xF7, 0xF3, 0x5A, 0xE6, 0xF8, /* 0xA4-0xA7 */ + 0xB9, 0xC7, 0xF3, 0x5B, 0xF3, 0x5C, 0xF3, 0x5D, /* 0xA8-0xAB */ + 0xF3, 0x5E, 0xF3, 0x5F, 0xF3, 0x60, 0xF3, 0x61, /* 0xAC-0xAF */ + 0xF7, 0xBB, 0xF7, 0xBA, 0xF3, 0x62, 0xF3, 0x63, /* 0xB0-0xB3 */ + 0xF3, 0x64, 0xF3, 0x65, 0xF7, 0xBE, 0xF7, 0xBC, /* 0xB4-0xB7 */ + 0xBA, 0xA1, 0xF3, 0x66, 0xF7, 0xBF, 0xF3, 0x67, /* 0xB8-0xBB */ + 0xF7, 0xC0, 0xF3, 0x68, 0xF3, 0x69, 0xF3, 0x6A, /* 0xBC-0xBF */ + 0xF7, 0xC2, 0xF7, 0xC1, 0xF7, 0xC4, 0xF3, 0x6B, /* 0xC0-0xC3 */ + 0xF3, 0x6C, 0xF7, 0xC3, 0xF3, 0x6D, 0xF3, 0x6E, /* 0xC4-0xC7 */ + 0xF3, 0x6F, 0xF3, 0x70, 0xF3, 0x71, 0xF7, 0xC5, /* 0xC8-0xCB */ + 0xF7, 0xC6, 0xF3, 0x72, 0xF3, 0x73, 0xF3, 0x74, /* 0xCC-0xCF */ + 0xF3, 0x75, 0xF7, 0xC7, 0xF3, 0x76, 0xCB, 0xE8, /* 0xD0-0xD3 */ + 0xF3, 0x77, 0xF3, 0x78, 0xF3, 0x79, 0xF3, 0x7A, /* 0xD4-0xD7 */ + 0xB8, 0xDF, 0xF3, 0x7B, 0xF3, 0x7C, 0xF3, 0x7D, /* 0xD8-0xDB */ + 0xF3, 0x7E, 0xF3, 0x80, 0xF3, 0x81, 0xF7, 0xD4, /* 0xDC-0xDF */ + 0xF3, 0x82, 0xF7, 0xD5, 0xF3, 0x83, 0xF3, 0x84, /* 0xE0-0xE3 */ + 0xF3, 0x85, 0xF3, 0x86, 0xF7, 0xD6, 0xF3, 0x87, /* 0xE4-0xE7 */ + 0xF3, 0x88, 0xF3, 0x89, 0xF3, 0x8A, 0xF7, 0xD8, /* 0xE8-0xEB */ + 0xF3, 0x8B, 0xF7, 0xDA, 0xF3, 0x8C, 0xF7, 0xD7, /* 0xEC-0xEF */ + 0xF3, 0x8D, 0xF3, 0x8E, 0xF3, 0x8F, 0xF3, 0x90, /* 0xF0-0xF3 */ + 0xF3, 0x91, 0xF3, 0x92, 0xF3, 0x93, 0xF3, 0x94, /* 0xF4-0xF7 */ + 0xF3, 0x95, 0xF7, 0xDB, 0xF3, 0x96, 0xF7, 0xD9, /* 0xF8-0xFB */ + 0xF3, 0x97, 0xF3, 0x98, 0xF3, 0x99, 0xF3, 0x9A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9B[512] = { + 0xF3, 0x9B, 0xF3, 0x9C, 0xF3, 0x9D, 0xD7, 0xD7, /* 0x00-0x03 */ + 0xF3, 0x9E, 0xF3, 0x9F, 0xF3, 0xA0, 0xF4, 0x40, /* 0x04-0x07 */ + 0xF7, 0xDC, 0xF4, 0x41, 0xF4, 0x42, 0xF4, 0x43, /* 0x08-0x0B */ + 0xF4, 0x44, 0xF4, 0x45, 0xF4, 0x46, 0xF7, 0xDD, /* 0x0C-0x0F */ + 0xF4, 0x47, 0xF4, 0x48, 0xF4, 0x49, 0xF7, 0xDE, /* 0x10-0x13 */ + 0xF4, 0x4A, 0xF4, 0x4B, 0xF4, 0x4C, 0xF4, 0x4D, /* 0x14-0x17 */ + 0xF4, 0x4E, 0xF4, 0x4F, 0xF4, 0x50, 0xF4, 0x51, /* 0x18-0x1B */ + 0xF4, 0x52, 0xF4, 0x53, 0xF4, 0x54, 0xF7, 0xDF, /* 0x1C-0x1F */ + 0xF4, 0x55, 0xF4, 0x56, 0xF4, 0x57, 0xF7, 0xE0, /* 0x20-0x23 */ + 0xF4, 0x58, 0xF4, 0x59, 0xF4, 0x5A, 0xF4, 0x5B, /* 0x24-0x27 */ + 0xF4, 0x5C, 0xF4, 0x5D, 0xF4, 0x5E, 0xF4, 0x5F, /* 0x28-0x2B */ + 0xF4, 0x60, 0xF4, 0x61, 0xF4, 0x62, 0xDB, 0xCB, /* 0x2C-0x2F */ + 0xF4, 0x63, 0xF4, 0x64, 0xD8, 0xAA, 0xF4, 0x65, /* 0x30-0x33 */ + 0xF4, 0x66, 0xF4, 0x67, 0xF4, 0x68, 0xF4, 0x69, /* 0x34-0x37 */ + 0xF4, 0x6A, 0xF4, 0x6B, 0xF4, 0x6C, 0xE5, 0xF7, /* 0x38-0x3B */ + 0xB9, 0xED, 0xF4, 0x6D, 0xF4, 0x6E, 0xF4, 0x6F, /* 0x3C-0x3F */ + 0xF4, 0x70, 0xBF, 0xFD, 0xBB, 0xEA, 0xF7, 0xC9, /* 0x40-0x43 */ + 0xC6, 0xC7, 0xF7, 0xC8, 0xF4, 0x71, 0xF7, 0xCA, /* 0x44-0x47 */ + 0xF7, 0xCC, 0xF7, 0xCB, 0xF4, 0x72, 0xF4, 0x73, /* 0x48-0x4B */ + 0xF4, 0x74, 0xF7, 0xCD, 0xF4, 0x75, 0xCE, 0xBA, /* 0x4C-0x4F */ + 0xF4, 0x76, 0xF7, 0xCE, 0xF4, 0x77, 0xF4, 0x78, /* 0x50-0x53 */ + 0xC4, 0xA7, 0xF4, 0x79, 0xF4, 0x7A, 0xF4, 0x7B, /* 0x54-0x57 */ + 0xF4, 0x7C, 0xF4, 0x7D, 0xF4, 0x7E, 0xF4, 0x80, /* 0x58-0x5B */ + 0xF4, 0x81, 0xF4, 0x82, 0xF4, 0x83, 0xF4, 0x84, /* 0x5C-0x5F */ + 0xF4, 
0x85, 0xF4, 0x86, 0xF4, 0x87, 0xF4, 0x88, /* 0x60-0x63 */ + 0xF4, 0x89, 0xF4, 0x8A, 0xF4, 0x8B, 0xF4, 0x8C, /* 0x64-0x67 */ + 0xF4, 0x8D, 0xF4, 0x8E, 0xF4, 0x8F, 0xF4, 0x90, /* 0x68-0x6B */ + 0xF4, 0x91, 0xF4, 0x92, 0xF4, 0x93, 0xF4, 0x94, /* 0x6C-0x6F */ + 0xF4, 0x95, 0xF4, 0x96, 0xF4, 0x97, 0xF4, 0x98, /* 0x70-0x73 */ + 0xF4, 0x99, 0xF4, 0x9A, 0xF4, 0x9B, 0xF4, 0x9C, /* 0x74-0x77 */ + 0xF4, 0x9D, 0xF4, 0x9E, 0xF4, 0x9F, 0xF4, 0xA0, /* 0x78-0x7B */ + 0xF5, 0x40, 0xF5, 0x41, 0xF5, 0x42, 0xF5, 0x43, /* 0x7C-0x7F */ + + 0xF5, 0x44, 0xF5, 0x45, 0xF5, 0x46, 0xF5, 0x47, /* 0x80-0x83 */ + 0xF5, 0x48, 0xF5, 0x49, 0xF5, 0x4A, 0xF5, 0x4B, /* 0x84-0x87 */ + 0xF5, 0x4C, 0xF5, 0x4D, 0xF5, 0x4E, 0xF5, 0x4F, /* 0x88-0x8B */ + 0xF5, 0x50, 0xF5, 0x51, 0xF5, 0x52, 0xF5, 0x53, /* 0x8C-0x8F */ + 0xF5, 0x54, 0xF5, 0x55, 0xF5, 0x56, 0xF5, 0x57, /* 0x90-0x93 */ + 0xF5, 0x58, 0xF5, 0x59, 0xF5, 0x5A, 0xF5, 0x5B, /* 0x94-0x97 */ + 0xF5, 0x5C, 0xF5, 0x5D, 0xF5, 0x5E, 0xF5, 0x5F, /* 0x98-0x9B */ + 0xF5, 0x60, 0xF5, 0x61, 0xF5, 0x62, 0xF5, 0x63, /* 0x9C-0x9F */ + 0xF5, 0x64, 0xF5, 0x65, 0xF5, 0x66, 0xF5, 0x67, /* 0xA0-0xA3 */ + 0xF5, 0x68, 0xF5, 0x69, 0xF5, 0x6A, 0xF5, 0x6B, /* 0xA4-0xA7 */ + 0xF5, 0x6C, 0xF5, 0x6D, 0xF5, 0x6E, 0xF5, 0x6F, /* 0xA8-0xAB */ + 0xF5, 0x70, 0xF5, 0x71, 0xF5, 0x72, 0xF5, 0x73, /* 0xAC-0xAF */ + 0xF5, 0x74, 0xF5, 0x75, 0xF5, 0x76, 0xF5, 0x77, /* 0xB0-0xB3 */ + 0xF5, 0x78, 0xF5, 0x79, 0xF5, 0x7A, 0xF5, 0x7B, /* 0xB4-0xB7 */ + 0xF5, 0x7C, 0xF5, 0x7D, 0xF5, 0x7E, 0xF5, 0x80, /* 0xB8-0xBB */ + 0xF5, 0x81, 0xF5, 0x82, 0xF5, 0x83, 0xF5, 0x84, /* 0xBC-0xBF */ + 0xF5, 0x85, 0xF5, 0x86, 0xF5, 0x87, 0xF5, 0x88, /* 0xC0-0xC3 */ + 0xF5, 0x89, 0xF5, 0x8A, 0xF5, 0x8B, 0xF5, 0x8C, /* 0xC4-0xC7 */ + 0xF5, 0x8D, 0xF5, 0x8E, 0xF5, 0x8F, 0xF5, 0x90, /* 0xC8-0xCB */ + 0xF5, 0x91, 0xF5, 0x92, 0xF5, 0x93, 0xF5, 0x94, /* 0xCC-0xCF */ + 0xF5, 0x95, 0xF5, 0x96, 0xF5, 0x97, 0xF5, 0x98, /* 0xD0-0xD3 */ + 0xF5, 0x99, 0xF5, 0x9A, 0xF5, 0x9B, 0xF5, 0x9C, /* 0xD4-0xD7 */ + 0xF5, 0x9D, 0xF5, 0x9E, 0xF5, 0x9F, 0xF5, 0xA0, /* 0xD8-0xDB */ + 0xF6, 0x40, 0xF6, 0x41, 0xF6, 0x42, 0xF6, 0x43, /* 0xDC-0xDF */ + 0xF6, 0x44, 0xF6, 0x45, 0xF6, 0x46, 0xF6, 0x47, /* 0xE0-0xE3 */ + 0xF6, 0x48, 0xF6, 0x49, 0xF6, 0x4A, 0xF6, 0x4B, /* 0xE4-0xE7 */ + 0xF6, 0x4C, 0xF6, 0x4D, 0xF6, 0x4E, 0xF6, 0x4F, /* 0xE8-0xEB */ + 0xF6, 0x50, 0xF6, 0x51, 0xF6, 0x52, 0xF6, 0x53, /* 0xEC-0xEF */ + 0xF6, 0x54, 0xF6, 0x55, 0xF6, 0x56, 0xF6, 0x57, /* 0xF0-0xF3 */ + 0xF6, 0x58, 0xF6, 0x59, 0xF6, 0x5A, 0xF6, 0x5B, /* 0xF4-0xF7 */ + 0xF6, 0x5C, 0xF6, 0x5D, 0xF6, 0x5E, 0xF6, 0x5F, /* 0xF8-0xFB */ + 0xF6, 0x60, 0xF6, 0x61, 0xF6, 0x62, 0xF6, 0x63, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9C[512] = { + 0xF6, 0x64, 0xF6, 0x65, 0xF6, 0x66, 0xF6, 0x67, /* 0x00-0x03 */ + 0xF6, 0x68, 0xF6, 0x69, 0xF6, 0x6A, 0xF6, 0x6B, /* 0x04-0x07 */ + 0xF6, 0x6C, 0xF6, 0x6D, 0xF6, 0x6E, 0xF6, 0x6F, /* 0x08-0x0B */ + 0xF6, 0x70, 0xF6, 0x71, 0xF6, 0x72, 0xF6, 0x73, /* 0x0C-0x0F */ + 0xF6, 0x74, 0xF6, 0x75, 0xF6, 0x76, 0xF6, 0x77, /* 0x10-0x13 */ + 0xF6, 0x78, 0xF6, 0x79, 0xF6, 0x7A, 0xF6, 0x7B, /* 0x14-0x17 */ + 0xF6, 0x7C, 0xF6, 0x7D, 0xF6, 0x7E, 0xF6, 0x80, /* 0x18-0x1B */ + 0xF6, 0x81, 0xF6, 0x82, 0xF6, 0x83, 0xF6, 0x84, /* 0x1C-0x1F */ + 0xF6, 0x85, 0xF6, 0x86, 0xF6, 0x87, 0xF6, 0x88, /* 0x20-0x23 */ + 0xF6, 0x89, 0xF6, 0x8A, 0xF6, 0x8B, 0xF6, 0x8C, /* 0x24-0x27 */ + 0xF6, 0x8D, 0xF6, 0x8E, 0xF6, 0x8F, 0xF6, 0x90, /* 0x28-0x2B */ + 0xF6, 0x91, 0xF6, 0x92, 0xF6, 0x93, 0xF6, 0x94, /* 0x2C-0x2F */ + 0xF6, 0x95, 0xF6, 0x96, 0xF6, 0x97, 0xF6, 0x98, /* 0x30-0x33 */ + 0xF6, 
0x99, 0xF6, 0x9A, 0xF6, 0x9B, 0xF6, 0x9C, /* 0x34-0x37 */ + 0xF6, 0x9D, 0xF6, 0x9E, 0xF6, 0x9F, 0xF6, 0xA0, /* 0x38-0x3B */ + 0xF7, 0x40, 0xF7, 0x41, 0xF7, 0x42, 0xF7, 0x43, /* 0x3C-0x3F */ + 0xF7, 0x44, 0xF7, 0x45, 0xF7, 0x46, 0xF7, 0x47, /* 0x40-0x43 */ + 0xF7, 0x48, 0xF7, 0x49, 0xF7, 0x4A, 0xF7, 0x4B, /* 0x44-0x47 */ + 0xF7, 0x4C, 0xF7, 0x4D, 0xF7, 0x4E, 0xF7, 0x4F, /* 0x48-0x4B */ + 0xF7, 0x50, 0xF7, 0x51, 0xF7, 0x52, 0xF7, 0x53, /* 0x4C-0x4F */ + 0xF7, 0x54, 0xF7, 0x55, 0xF7, 0x56, 0xF7, 0x57, /* 0x50-0x53 */ + 0xF7, 0x58, 0xF7, 0x59, 0xF7, 0x5A, 0xF7, 0x5B, /* 0x54-0x57 */ + 0xF7, 0x5C, 0xF7, 0x5D, 0xF7, 0x5E, 0xF7, 0x5F, /* 0x58-0x5B */ + 0xF7, 0x60, 0xF7, 0x61, 0xF7, 0x62, 0xF7, 0x63, /* 0x5C-0x5F */ + 0xF7, 0x64, 0xF7, 0x65, 0xF7, 0x66, 0xF7, 0x67, /* 0x60-0x63 */ + 0xF7, 0x68, 0xF7, 0x69, 0xF7, 0x6A, 0xF7, 0x6B, /* 0x64-0x67 */ + 0xF7, 0x6C, 0xF7, 0x6D, 0xF7, 0x6E, 0xF7, 0x6F, /* 0x68-0x6B */ + 0xF7, 0x70, 0xF7, 0x71, 0xF7, 0x72, 0xF7, 0x73, /* 0x6C-0x6F */ + 0xF7, 0x74, 0xF7, 0x75, 0xF7, 0x76, 0xF7, 0x77, /* 0x70-0x73 */ + 0xF7, 0x78, 0xF7, 0x79, 0xF7, 0x7A, 0xF7, 0x7B, /* 0x74-0x77 */ + 0xF7, 0x7C, 0xF7, 0x7D, 0xF7, 0x7E, 0xF7, 0x80, /* 0x78-0x7B */ + 0xD3, 0xE3, 0xF7, 0x81, 0xF7, 0x82, 0xF6, 0xCF, /* 0x7C-0x7F */ + + 0xF7, 0x83, 0xC2, 0xB3, 0xF6, 0xD0, 0xF7, 0x84, /* 0x80-0x83 */ + 0xF7, 0x85, 0xF6, 0xD1, 0xF6, 0xD2, 0xF6, 0xD3, /* 0x84-0x87 */ + 0xF6, 0xD4, 0xF7, 0x86, 0xF7, 0x87, 0xF6, 0xD6, /* 0x88-0x8B */ + 0xF7, 0x88, 0xB1, 0xAB, 0xF6, 0xD7, 0xF7, 0x89, /* 0x8C-0x8F */ + 0xF6, 0xD8, 0xF6, 0xD9, 0xF6, 0xDA, 0xF7, 0x8A, /* 0x90-0x93 */ + 0xF6, 0xDB, 0xF6, 0xDC, 0xF7, 0x8B, 0xF7, 0x8C, /* 0x94-0x97 */ + 0xF7, 0x8D, 0xF7, 0x8E, 0xF6, 0xDD, 0xF6, 0xDE, /* 0x98-0x9B */ + 0xCF, 0xCA, 0xF7, 0x8F, 0xF6, 0xDF, 0xF6, 0xE0, /* 0x9C-0x9F */ + 0xF6, 0xE1, 0xF6, 0xE2, 0xF6, 0xE3, 0xF6, 0xE4, /* 0xA0-0xA3 */ + 0xC0, 0xF0, 0xF6, 0xE5, 0xF6, 0xE6, 0xF6, 0xE7, /* 0xA4-0xA7 */ + 0xF6, 0xE8, 0xF6, 0xE9, 0xF7, 0x90, 0xF6, 0xEA, /* 0xA8-0xAB */ + 0xF7, 0x91, 0xF6, 0xEB, 0xF6, 0xEC, 0xF7, 0x92, /* 0xAC-0xAF */ + 0xF6, 0xED, 0xF6, 0xEE, 0xF6, 0xEF, 0xF6, 0xF0, /* 0xB0-0xB3 */ + 0xF6, 0xF1, 0xF6, 0xF2, 0xF6, 0xF3, 0xF6, 0xF4, /* 0xB4-0xB7 */ + 0xBE, 0xA8, 0xF7, 0x93, 0xF6, 0xF5, 0xF6, 0xF6, /* 0xB8-0xBB */ + 0xF6, 0xF7, 0xF6, 0xF8, 0xF7, 0x94, 0xF7, 0x95, /* 0xBC-0xBF */ + 0xF7, 0x96, 0xF7, 0x97, 0xF7, 0x98, 0xC8, 0xFA, /* 0xC0-0xC3 */ + 0xF6, 0xF9, 0xF6, 0xFA, 0xF6, 0xFB, 0xF6, 0xFC, /* 0xC4-0xC7 */ + 0xF7, 0x99, 0xF7, 0x9A, 0xF6, 0xFD, 0xF6, 0xFE, /* 0xC8-0xCB */ + 0xF7, 0xA1, 0xF7, 0xA2, 0xF7, 0xA3, 0xF7, 0xA4, /* 0xCC-0xCF */ + 0xF7, 0xA5, 0xF7, 0x9B, 0xF7, 0x9C, 0xF7, 0xA6, /* 0xD0-0xD3 */ + 0xF7, 0xA7, 0xF7, 0xA8, 0xB1, 0xEE, 0xF7, 0xA9, /* 0xD4-0xD7 */ + 0xF7, 0xAA, 0xF7, 0xAB, 0xF7, 0x9D, 0xF7, 0x9E, /* 0xD8-0xDB */ + 0xF7, 0xAC, 0xF7, 0xAD, 0xC1, 0xDB, 0xF7, 0xAE, /* 0xDC-0xDF */ + 0xF7, 0x9F, 0xF7, 0xA0, 0xF7, 0xAF, 0xF8, 0x40, /* 0xE0-0xE3 */ + 0xF8, 0x41, 0xF8, 0x42, 0xF8, 0x43, 0xF8, 0x44, /* 0xE4-0xE7 */ + 0xF8, 0x45, 0xF8, 0x46, 0xF8, 0x47, 0xF8, 0x48, /* 0xE8-0xEB */ + 0xF8, 0x49, 0xF8, 0x4A, 0xF8, 0x4B, 0xF8, 0x4C, /* 0xEC-0xEF */ + 0xF8, 0x4D, 0xF8, 0x4E, 0xF8, 0x4F, 0xF8, 0x50, /* 0xF0-0xF3 */ + 0xF8, 0x51, 0xF8, 0x52, 0xF8, 0x53, 0xF8, 0x54, /* 0xF4-0xF7 */ + 0xF8, 0x55, 0xF8, 0x56, 0xF8, 0x57, 0xF8, 0x58, /* 0xF8-0xFB */ + 0xF8, 0x59, 0xF8, 0x5A, 0xF8, 0x5B, 0xF8, 0x5C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9D[512] = { + 0xF8, 0x5D, 0xF8, 0x5E, 0xF8, 0x5F, 0xF8, 0x60, /* 0x00-0x03 */ + 0xF8, 0x61, 0xF8, 0x62, 0xF8, 0x63, 0xF8, 0x64, /* 0x04-0x07 */ + 0xF8, 
0x65, 0xF8, 0x66, 0xF8, 0x67, 0xF8, 0x68, /* 0x08-0x0B */ + 0xF8, 0x69, 0xF8, 0x6A, 0xF8, 0x6B, 0xF8, 0x6C, /* 0x0C-0x0F */ + 0xF8, 0x6D, 0xF8, 0x6E, 0xF8, 0x6F, 0xF8, 0x70, /* 0x10-0x13 */ + 0xF8, 0x71, 0xF8, 0x72, 0xF8, 0x73, 0xF8, 0x74, /* 0x14-0x17 */ + 0xF8, 0x75, 0xF8, 0x76, 0xF8, 0x77, 0xF8, 0x78, /* 0x18-0x1B */ + 0xF8, 0x79, 0xF8, 0x7A, 0xF8, 0x7B, 0xF8, 0x7C, /* 0x1C-0x1F */ + 0xF8, 0x7D, 0xF8, 0x7E, 0xF8, 0x80, 0xF8, 0x81, /* 0x20-0x23 */ + 0xF8, 0x82, 0xF8, 0x83, 0xF8, 0x84, 0xF8, 0x85, /* 0x24-0x27 */ + 0xF8, 0x86, 0xF8, 0x87, 0xF8, 0x88, 0xF8, 0x89, /* 0x28-0x2B */ + 0xF8, 0x8A, 0xF8, 0x8B, 0xF8, 0x8C, 0xF8, 0x8D, /* 0x2C-0x2F */ + 0xF8, 0x8E, 0xF8, 0x8F, 0xF8, 0x90, 0xF8, 0x91, /* 0x30-0x33 */ + 0xF8, 0x92, 0xF8, 0x93, 0xF8, 0x94, 0xF8, 0x95, /* 0x34-0x37 */ + 0xF8, 0x96, 0xF8, 0x97, 0xF8, 0x98, 0xF8, 0x99, /* 0x38-0x3B */ + 0xF8, 0x9A, 0xF8, 0x9B, 0xF8, 0x9C, 0xF8, 0x9D, /* 0x3C-0x3F */ + 0xF8, 0x9E, 0xF8, 0x9F, 0xF8, 0xA0, 0xF9, 0x40, /* 0x40-0x43 */ + 0xF9, 0x41, 0xF9, 0x42, 0xF9, 0x43, 0xF9, 0x44, /* 0x44-0x47 */ + 0xF9, 0x45, 0xF9, 0x46, 0xF9, 0x47, 0xF9, 0x48, /* 0x48-0x4B */ + 0xF9, 0x49, 0xF9, 0x4A, 0xF9, 0x4B, 0xF9, 0x4C, /* 0x4C-0x4F */ + 0xF9, 0x4D, 0xF9, 0x4E, 0xF9, 0x4F, 0xF9, 0x50, /* 0x50-0x53 */ + 0xF9, 0x51, 0xF9, 0x52, 0xF9, 0x53, 0xF9, 0x54, /* 0x54-0x57 */ + 0xF9, 0x55, 0xF9, 0x56, 0xF9, 0x57, 0xF9, 0x58, /* 0x58-0x5B */ + 0xF9, 0x59, 0xF9, 0x5A, 0xF9, 0x5B, 0xF9, 0x5C, /* 0x5C-0x5F */ + 0xF9, 0x5D, 0xF9, 0x5E, 0xF9, 0x5F, 0xF9, 0x60, /* 0x60-0x63 */ + 0xF9, 0x61, 0xF9, 0x62, 0xF9, 0x63, 0xF9, 0x64, /* 0x64-0x67 */ + 0xF9, 0x65, 0xF9, 0x66, 0xF9, 0x67, 0xF9, 0x68, /* 0x68-0x6B */ + 0xF9, 0x69, 0xF9, 0x6A, 0xF9, 0x6B, 0xF9, 0x6C, /* 0x6C-0x6F */ + 0xF9, 0x6D, 0xF9, 0x6E, 0xF9, 0x6F, 0xF9, 0x70, /* 0x70-0x73 */ + 0xF9, 0x71, 0xF9, 0x72, 0xF9, 0x73, 0xF9, 0x74, /* 0x74-0x77 */ + 0xF9, 0x75, 0xF9, 0x76, 0xF9, 0x77, 0xF9, 0x78, /* 0x78-0x7B */ + 0xF9, 0x79, 0xF9, 0x7A, 0xF9, 0x7B, 0xF9, 0x7C, /* 0x7C-0x7F */ + + 0xF9, 0x7D, 0xF9, 0x7E, 0xF9, 0x80, 0xF9, 0x81, /* 0x80-0x83 */ + 0xF9, 0x82, 0xF9, 0x83, 0xF9, 0x84, 0xF9, 0x85, /* 0x84-0x87 */ + 0xF9, 0x86, 0xF9, 0x87, 0xF9, 0x88, 0xF9, 0x89, /* 0x88-0x8B */ + 0xF9, 0x8A, 0xF9, 0x8B, 0xF9, 0x8C, 0xF9, 0x8D, /* 0x8C-0x8F */ + 0xF9, 0x8E, 0xF9, 0x8F, 0xF9, 0x90, 0xF9, 0x91, /* 0x90-0x93 */ + 0xF9, 0x92, 0xF9, 0x93, 0xF9, 0x94, 0xF9, 0x95, /* 0x94-0x97 */ + 0xF9, 0x96, 0xF9, 0x97, 0xF9, 0x98, 0xF9, 0x99, /* 0x98-0x9B */ + 0xF9, 0x9A, 0xF9, 0x9B, 0xF9, 0x9C, 0xF9, 0x9D, /* 0x9C-0x9F */ + 0xF9, 0x9E, 0xF9, 0x9F, 0xF9, 0xA0, 0xFA, 0x40, /* 0xA0-0xA3 */ + 0xFA, 0x41, 0xFA, 0x42, 0xFA, 0x43, 0xFA, 0x44, /* 0xA4-0xA7 */ + 0xFA, 0x45, 0xFA, 0x46, 0xFA, 0x47, 0xFA, 0x48, /* 0xA8-0xAB */ + 0xFA, 0x49, 0xFA, 0x4A, 0xFA, 0x4B, 0xFA, 0x4C, /* 0xAC-0xAF */ + 0xFA, 0x4D, 0xFA, 0x4E, 0xFA, 0x4F, 0xFA, 0x50, /* 0xB0-0xB3 */ + 0xFA, 0x51, 0xFA, 0x52, 0xFA, 0x53, 0xFA, 0x54, /* 0xB4-0xB7 */ + 0xFA, 0x55, 0xFA, 0x56, 0xFA, 0x57, 0xFA, 0x58, /* 0xB8-0xBB */ + 0xFA, 0x59, 0xFA, 0x5A, 0xFA, 0x5B, 0xFA, 0x5C, /* 0xBC-0xBF */ + 0xFA, 0x5D, 0xFA, 0x5E, 0xFA, 0x5F, 0xFA, 0x60, /* 0xC0-0xC3 */ + 0xFA, 0x61, 0xFA, 0x62, 0xFA, 0x63, 0xFA, 0x64, /* 0xC4-0xC7 */ + 0xFA, 0x65, 0xFA, 0x66, 0xFA, 0x67, 0xFA, 0x68, /* 0xC8-0xCB */ + 0xFA, 0x69, 0xFA, 0x6A, 0xFA, 0x6B, 0xFA, 0x6C, /* 0xCC-0xCF */ + 0xFA, 0x6D, 0xFA, 0x6E, 0xFA, 0x6F, 0xFA, 0x70, /* 0xD0-0xD3 */ + 0xFA, 0x71, 0xFA, 0x72, 0xFA, 0x73, 0xFA, 0x74, /* 0xD4-0xD7 */ + 0xFA, 0x75, 0xFA, 0x76, 0xFA, 0x77, 0xFA, 0x78, /* 0xD8-0xDB */ + 0xFA, 0x79, 0xFA, 0x7A, 0xFA, 0x7B, 0xFA, 0x7C, /* 0xDC-0xDF 
*/ + 0xFA, 0x7D, 0xFA, 0x7E, 0xFA, 0x80, 0xFA, 0x81, /* 0xE0-0xE3 */ + 0xFA, 0x82, 0xFA, 0x83, 0xFA, 0x84, 0xFA, 0x85, /* 0xE4-0xE7 */ + 0xFA, 0x86, 0xFA, 0x87, 0xFA, 0x88, 0xFA, 0x89, /* 0xE8-0xEB */ + 0xFA, 0x8A, 0xFA, 0x8B, 0xFA, 0x8C, 0xFA, 0x8D, /* 0xEC-0xEF */ + 0xFA, 0x8E, 0xFA, 0x8F, 0xFA, 0x90, 0xFA, 0x91, /* 0xF0-0xF3 */ + 0xFA, 0x92, 0xFA, 0x93, 0xFA, 0x94, 0xFA, 0x95, /* 0xF4-0xF7 */ + 0xFA, 0x96, 0xFA, 0x97, 0xFA, 0x98, 0xFA, 0x99, /* 0xF8-0xFB */ + 0xFA, 0x9A, 0xFA, 0x9B, 0xFA, 0x9C, 0xFA, 0x9D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9E[512] = { + 0xFA, 0x9E, 0xFA, 0x9F, 0xFA, 0xA0, 0xFB, 0x40, /* 0x00-0x03 */ + 0xFB, 0x41, 0xFB, 0x42, 0xFB, 0x43, 0xFB, 0x44, /* 0x04-0x07 */ + 0xFB, 0x45, 0xFB, 0x46, 0xFB, 0x47, 0xFB, 0x48, /* 0x08-0x0B */ + 0xFB, 0x49, 0xFB, 0x4A, 0xFB, 0x4B, 0xFB, 0x4C, /* 0x0C-0x0F */ + 0xFB, 0x4D, 0xFB, 0x4E, 0xFB, 0x4F, 0xFB, 0x50, /* 0x10-0x13 */ + 0xFB, 0x51, 0xFB, 0x52, 0xFB, 0x53, 0xFB, 0x54, /* 0x14-0x17 */ + 0xFB, 0x55, 0xFB, 0x56, 0xFB, 0x57, 0xFB, 0x58, /* 0x18-0x1B */ + 0xFB, 0x59, 0xFB, 0x5A, 0xFB, 0x5B, 0xC4, 0xF1, /* 0x1C-0x1F */ + 0xF0, 0xAF, 0xBC, 0xA6, 0xF0, 0xB0, 0xC3, 0xF9, /* 0x20-0x23 */ + 0xFB, 0x5C, 0xC5, 0xB8, 0xD1, 0xBB, 0xFB, 0x5D, /* 0x24-0x27 */ + 0xF0, 0xB1, 0xF0, 0xB2, 0xF0, 0xB3, 0xF0, 0xB4, /* 0x28-0x2B */ + 0xF0, 0xB5, 0xD1, 0xBC, 0xFB, 0x5E, 0xD1, 0xEC, /* 0x2C-0x2F */ + 0xFB, 0x5F, 0xF0, 0xB7, 0xF0, 0xB6, 0xD4, 0xA7, /* 0x30-0x33 */ + 0xFB, 0x60, 0xCD, 0xD2, 0xF0, 0xB8, 0xF0, 0xBA, /* 0x34-0x37 */ + 0xF0, 0xB9, 0xF0, 0xBB, 0xF0, 0xBC, 0xFB, 0x61, /* 0x38-0x3B */ + 0xFB, 0x62, 0xB8, 0xEB, 0xF0, 0xBD, 0xBA, 0xE8, /* 0x3C-0x3F */ + 0xFB, 0x63, 0xF0, 0xBE, 0xF0, 0xBF, 0xBE, 0xE9, /* 0x40-0x43 */ + 0xF0, 0xC0, 0xB6, 0xEC, 0xF0, 0xC1, 0xF0, 0xC2, /* 0x44-0x47 */ + 0xF0, 0xC3, 0xF0, 0xC4, 0xC8, 0xB5, 0xF0, 0xC5, /* 0x48-0x4B */ + 0xF0, 0xC6, 0xFB, 0x64, 0xF0, 0xC7, 0xC5, 0xF4, /* 0x4C-0x4F */ + 0xFB, 0x65, 0xF0, 0xC8, 0xFB, 0x66, 0xFB, 0x67, /* 0x50-0x53 */ + 0xFB, 0x68, 0xF0, 0xC9, 0xFB, 0x69, 0xF0, 0xCA, /* 0x54-0x57 */ + 0xF7, 0xBD, 0xFB, 0x6A, 0xF0, 0xCB, 0xF0, 0xCC, /* 0x58-0x5B */ + 0xF0, 0xCD, 0xFB, 0x6B, 0xF0, 0xCE, 0xFB, 0x6C, /* 0x5C-0x5F */ + 0xFB, 0x6D, 0xFB, 0x6E, 0xFB, 0x6F, 0xF0, 0xCF, /* 0x60-0x63 */ + 0xBA, 0xD7, 0xFB, 0x70, 0xF0, 0xD0, 0xF0, 0xD1, /* 0x64-0x67 */ + 0xF0, 0xD2, 0xF0, 0xD3, 0xF0, 0xD4, 0xF0, 0xD5, /* 0x68-0x6B */ + 0xF0, 0xD6, 0xF0, 0xD8, 0xFB, 0x71, 0xFB, 0x72, /* 0x6C-0x6F */ + 0xD3, 0xA5, 0xF0, 0xD7, 0xFB, 0x73, 0xF0, 0xD9, /* 0x70-0x73 */ + 0xFB, 0x74, 0xFB, 0x75, 0xFB, 0x76, 0xFB, 0x77, /* 0x74-0x77 */ + 0xFB, 0x78, 0xFB, 0x79, 0xFB, 0x7A, 0xFB, 0x7B, /* 0x78-0x7B */ + 0xFB, 0x7C, 0xFB, 0x7D, 0xF5, 0xBA, 0xC2, 0xB9, /* 0x7C-0x7F */ + + 0xFB, 0x7E, 0xFB, 0x80, 0xF7, 0xE4, 0xFB, 0x81, /* 0x80-0x83 */ + 0xFB, 0x82, 0xFB, 0x83, 0xFB, 0x84, 0xF7, 0xE5, /* 0x84-0x87 */ + 0xF7, 0xE6, 0xFB, 0x85, 0xFB, 0x86, 0xF7, 0xE7, /* 0x88-0x8B */ + 0xFB, 0x87, 0xFB, 0x88, 0xFB, 0x89, 0xFB, 0x8A, /* 0x8C-0x8F */ + 0xFB, 0x8B, 0xFB, 0x8C, 0xF7, 0xE8, 0xC2, 0xB4, /* 0x90-0x93 */ + 0xFB, 0x8D, 0xFB, 0x8E, 0xFB, 0x8F, 0xFB, 0x90, /* 0x94-0x97 */ + 0xFB, 0x91, 0xFB, 0x92, 0xFB, 0x93, 0xFB, 0x94, /* 0x98-0x9B */ + 0xFB, 0x95, 0xF7, 0xEA, 0xFB, 0x96, 0xF7, 0xEB, /* 0x9C-0x9F */ + 0xFB, 0x97, 0xFB, 0x98, 0xFB, 0x99, 0xFB, 0x9A, /* 0xA0-0xA3 */ + 0xFB, 0x9B, 0xFB, 0x9C, 0xC2, 0xF3, 0xFB, 0x9D, /* 0xA4-0xA7 */ + 0xFB, 0x9E, 0xFB, 0x9F, 0xFB, 0xA0, 0xFC, 0x40, /* 0xA8-0xAB */ + 0xFC, 0x41, 0xFC, 0x42, 0xFC, 0x43, 0xFC, 0x44, /* 0xAC-0xAF */ + 0xFC, 0x45, 0xFC, 0x46, 0xFC, 0x47, 0xFC, 0x48, /* 0xB0-0xB3 */ + 
0xF4, 0xF0, 0xFC, 0x49, 0xFC, 0x4A, 0xFC, 0x4B, /* 0xB4-0xB7 */ + 0xF4, 0xEF, 0xFC, 0x4C, 0xFC, 0x4D, 0xC2, 0xE9, /* 0xB8-0xBB */ + 0xFC, 0x4E, 0xF7, 0xE1, 0xF7, 0xE2, 0xFC, 0x4F, /* 0xBC-0xBF */ + 0xFC, 0x50, 0xFC, 0x51, 0xFC, 0x52, 0xFC, 0x53, /* 0xC0-0xC3 */ + 0xBB, 0xC6, 0xFC, 0x54, 0xFC, 0x55, 0xFC, 0x56, /* 0xC4-0xC7 */ + 0xFC, 0x57, 0xD9, 0xE4, 0xFC, 0x58, 0xFC, 0x59, /* 0xC8-0xCB */ + 0xFC, 0x5A, 0xCA, 0xF2, 0xC0, 0xE8, 0xF0, 0xA4, /* 0xCC-0xCF */ + 0xFC, 0x5B, 0xBA, 0xDA, 0xFC, 0x5C, 0xFC, 0x5D, /* 0xD0-0xD3 */ + 0xC7, 0xAD, 0xFC, 0x5E, 0xFC, 0x5F, 0xFC, 0x60, /* 0xD4-0xD7 */ + 0xC4, 0xAC, 0xFC, 0x61, 0xFC, 0x62, 0xF7, 0xEC, /* 0xD8-0xDB */ + 0xF7, 0xED, 0xF7, 0xEE, 0xFC, 0x63, 0xF7, 0xF0, /* 0xDC-0xDF */ + 0xF7, 0xEF, 0xFC, 0x64, 0xF7, 0xF1, 0xFC, 0x65, /* 0xE0-0xE3 */ + 0xFC, 0x66, 0xF7, 0xF4, 0xFC, 0x67, 0xF7, 0xF3, /* 0xE4-0xE7 */ + 0xFC, 0x68, 0xF7, 0xF2, 0xF7, 0xF5, 0xFC, 0x69, /* 0xE8-0xEB */ + 0xFC, 0x6A, 0xFC, 0x6B, 0xFC, 0x6C, 0xF7, 0xF6, /* 0xEC-0xEF */ + 0xFC, 0x6D, 0xFC, 0x6E, 0xFC, 0x6F, 0xFC, 0x70, /* 0xF0-0xF3 */ + 0xFC, 0x71, 0xFC, 0x72, 0xFC, 0x73, 0xFC, 0x74, /* 0xF4-0xF7 */ + 0xFC, 0x75, 0xED, 0xE9, 0xFC, 0x76, 0xED, 0xEA, /* 0xF8-0xFB */ + 0xED, 0xEB, 0xFC, 0x77, 0xF6, 0xBC, 0xFC, 0x78, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9F[512] = { + 0xFC, 0x79, 0xFC, 0x7A, 0xFC, 0x7B, 0xFC, 0x7C, /* 0x00-0x03 */ + 0xFC, 0x7D, 0xFC, 0x7E, 0xFC, 0x80, 0xFC, 0x81, /* 0x04-0x07 */ + 0xFC, 0x82, 0xFC, 0x83, 0xFC, 0x84, 0xF6, 0xBD, /* 0x08-0x0B */ + 0xFC, 0x85, 0xF6, 0xBE, 0xB6, 0xA6, 0xFC, 0x86, /* 0x0C-0x0F */ + 0xD8, 0xBE, 0xFC, 0x87, 0xFC, 0x88, 0xB9, 0xC4, /* 0x10-0x13 */ + 0xFC, 0x89, 0xFC, 0x8A, 0xFC, 0x8B, 0xD8, 0xBB, /* 0x14-0x17 */ + 0xFC, 0x8C, 0xDC, 0xB1, 0xFC, 0x8D, 0xFC, 0x8E, /* 0x18-0x1B */ + 0xFC, 0x8F, 0xFC, 0x90, 0xFC, 0x91, 0xFC, 0x92, /* 0x1C-0x1F */ + 0xCA, 0xF3, 0xFC, 0x93, 0xF7, 0xF7, 0xFC, 0x94, /* 0x20-0x23 */ + 0xFC, 0x95, 0xFC, 0x96, 0xFC, 0x97, 0xFC, 0x98, /* 0x24-0x27 */ + 0xFC, 0x99, 0xFC, 0x9A, 0xFC, 0x9B, 0xFC, 0x9C, /* 0x28-0x2B */ + 0xF7, 0xF8, 0xFC, 0x9D, 0xFC, 0x9E, 0xF7, 0xF9, /* 0x2C-0x2F */ + 0xFC, 0x9F, 0xFC, 0xA0, 0xFD, 0x40, 0xFD, 0x41, /* 0x30-0x33 */ + 0xFD, 0x42, 0xFD, 0x43, 0xFD, 0x44, 0xF7, 0xFB, /* 0x34-0x37 */ + 0xFD, 0x45, 0xF7, 0xFA, 0xFD, 0x46, 0xB1, 0xC7, /* 0x38-0x3B */ + 0xFD, 0x47, 0xF7, 0xFC, 0xF7, 0xFD, 0xFD, 0x48, /* 0x3C-0x3F */ + 0xFD, 0x49, 0xFD, 0x4A, 0xFD, 0x4B, 0xFD, 0x4C, /* 0x40-0x43 */ + 0xF7, 0xFE, 0xFD, 0x4D, 0xFD, 0x4E, 0xFD, 0x4F, /* 0x44-0x47 */ + 0xFD, 0x50, 0xFD, 0x51, 0xFD, 0x52, 0xFD, 0x53, /* 0x48-0x4B */ + 0xFD, 0x54, 0xFD, 0x55, 0xFD, 0x56, 0xFD, 0x57, /* 0x4C-0x4F */ + 0xC6, 0xEB, 0xEC, 0xB4, 0xFD, 0x58, 0xFD, 0x59, /* 0x50-0x53 */ + 0xFD, 0x5A, 0xFD, 0x5B, 0xFD, 0x5C, 0xFD, 0x5D, /* 0x54-0x57 */ + 0xFD, 0x5E, 0xFD, 0x5F, 0xFD, 0x60, 0xFD, 0x61, /* 0x58-0x5B */ + 0xFD, 0x62, 0xFD, 0x63, 0xFD, 0x64, 0xFD, 0x65, /* 0x5C-0x5F */ + 0xFD, 0x66, 0xFD, 0x67, 0xFD, 0x68, 0xFD, 0x69, /* 0x60-0x63 */ + 0xFD, 0x6A, 0xFD, 0x6B, 0xFD, 0x6C, 0xFD, 0x6D, /* 0x64-0x67 */ + 0xFD, 0x6E, 0xFD, 0x6F, 0xFD, 0x70, 0xFD, 0x71, /* 0x68-0x6B */ + 0xFD, 0x72, 0xFD, 0x73, 0xFD, 0x74, 0xFD, 0x75, /* 0x6C-0x6F */ + 0xFD, 0x76, 0xFD, 0x77, 0xFD, 0x78, 0xFD, 0x79, /* 0x70-0x73 */ + 0xFD, 0x7A, 0xFD, 0x7B, 0xFD, 0x7C, 0xFD, 0x7D, /* 0x74-0x77 */ + 0xFD, 0x7E, 0xFD, 0x80, 0xFD, 0x81, 0xFD, 0x82, /* 0x78-0x7B */ + 0xFD, 0x83, 0xFD, 0x84, 0xFD, 0x85, 0xB3, 0xDD, /* 0x7C-0x7F */ + + 0xF6, 0xB3, 0xFD, 0x86, 0xFD, 0x87, 0xF6, 0xB4, /* 0x80-0x83 */ + 0xC1, 0xE4, 0xF6, 0xB5, 0xF6, 0xB6, 0xF6, 0xB7, /* 0x84-0x87 */ + 
0xF6, 0xB8, 0xF6, 0xB9, 0xF6, 0xBA, 0xC8, 0xA3, /* 0x88-0x8B */ + 0xF6, 0xBB, 0xFD, 0x88, 0xFD, 0x89, 0xFD, 0x8A, /* 0x8C-0x8F */ + 0xFD, 0x8B, 0xFD, 0x8C, 0xFD, 0x8D, 0xFD, 0x8E, /* 0x90-0x93 */ + 0xFD, 0x8F, 0xFD, 0x90, 0xFD, 0x91, 0xFD, 0x92, /* 0x94-0x97 */ + 0xFD, 0x93, 0xC1, 0xFA, 0xB9, 0xA8, 0xED, 0xE8, /* 0x98-0x9B */ + 0xFD, 0x94, 0xFD, 0x95, 0xFD, 0x96, 0xB9, 0xEA, /* 0x9C-0x9F */ + 0xD9, 0xDF, 0xFD, 0x97, 0xFD, 0x98, 0xFD, 0x99, /* 0xA0-0xA3 */ + 0xFD, 0x9A, 0xFD, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0xD8, 0x4D, 0xB8, 0xFC, 0xDC, 0x87, 0xD9, 0x5A, /* 0x00-0x03 */ + 0xBB, 0xAC, 0xB4, 0xAE, 0xBE, 0xE4, 0xFD, 0x94, /* 0x04-0x07 */ + 0xFD, 0x94, 0xC6, 0xF5, 0xBD, 0xF0, 0xC0, 0xAE, /* 0x08-0x0B */ + 0xC4, 0xCE, 0x91, 0xD0, 0xB0, 0x5D, 0xC1, 0x5F, /* 0x0C-0x0F */ + 0xCC, 0x7D, 0xC2, 0xDD, 0xC2, 0xE3, 0xDF, 0x89, /* 0x10-0x13 */ + 0x98, 0xB7, 0xC2, 0xE5, 0xC0, 0xD3, 0xE7, 0xF3, /* 0x14-0x17 */ + 0xC2, 0xE4, 0xC0, 0xD2, 0xF1, 0x98, 0x81, 0x79, /* 0x18-0x1B */ + 0xC2, 0xD1, 0x99, 0xDA, 0xA0, 0x80, 0xCC, 0x6D, /* 0x1C-0x1F */ + 0xFB, 0x5B, 0x8D, 0xB9, 0x9E, 0x45, 0xCB, 0x7B, /* 0x20-0x23 */ + 0xD2, 0x68, 0xC0, 0xAD, 0xC5, 0x44, 0xCF, 0x9E, /* 0x24-0x27 */ + 0xC0, 0xC8, 0xC0, 0xCA, 0xC0, 0xCB, 0xC0, 0xC7, /* 0x28-0x2B */ + 0xFD, 0x9C, 0x81, 0xED, 0xC0, 0xE4, 0x84, 0xDA, /* 0x2C-0x2F */ + 0x93, 0xEF, 0x99, 0xA9, 0xA0, 0x74, 0xB1, 0x52, /* 0x30-0x33 */ + 0xC0, 0xCF, 0xCC, 0x4A, 0xCC, 0x94, 0xC2, 0xB7, /* 0x34-0x37 */ + 0xC2, 0xB6, 0xF4, 0x94, 0xFA, 0x98, 0xC2, 0xB5, /* 0x38-0x3B */ + 0xB5, 0x93, 0xBE, 0x47, 0xC7, 0x8A, 0xE4, 0x9B, /* 0x3C-0x3F */ + 0xC2, 0xB9, 0xD5, 0x93, 0x89, 0xC5, 0xC5, 0xAA, /* 0x40-0x43 */ + 0xBB, 0x5C, 0xC3, 0x40, 0xC0, 0xCE, 0xC0, 0xDA, /* 0x44-0x47 */ + 0xD9, 0x54, 0xC0, 0xD7, 0x89, 0xBE, 0x8C, 0xD2, /* 0x48-0x4B */ + 0x98, 0xC7, 0x9C, 0x49, 0xC2, 0xA9, 0xC0, 0xDB, /* 0x4C-0x4F */ + 0xBF, 0x7C, 0xC2, 0xAA, 0xC0, 0xD5, 0xC0, 0xDF, /* 0x50-0x53 */ + 0x84, 0x43, 0xC1, 0xE8, 0xB6, 0xA0, 0xBE, 0x63, /* 0x54-0x57 */ + 0xC1, 0xE2, 0xC1, 0xEA, 0xD7, 0x78, 0x92, 0x82, /* 0x58-0x5B */ + 0x98, 0xB7, 0xD6, 0x5A, 0xB5, 0xA4, 0x8C, 0x8E, /* 0x5C-0x5F */ + 0xC5, 0xAD, 0xC2, 0xCA, 0xAE, 0x90, 0xB1, 0xB1, /* 0x60-0x63 */ + 0xB4, 0x91, 0xB1, 0xE3, 0x8F, 0xCD, 0xB2, 0xBB, /* 0x64-0x67 */ + 0xC3, 0xDA, 0x94, 0xB5, 0xCB, 0xF7, 0x85, 0xA2, /* 0x68-0x6B */ + 0xC8, 0xFB, 0xCA, 0xA1, 0xC8, 0x7E, 0xD5, 0x66, /* 0x6C-0x6F */ + 0x9A, 0xA2, 0xB3, 0xBD, 0xC9, 0xF2, 0xCA, 0xB0, /* 0x70-0x73 */ + 0xC8, 0xF4, 0xC2, 0xD3, 0xC2, 0xD4, 0xC1, 0xC1, /* 0x74-0x77 */ + 0x83, 0xC9, 0xFD, 0x9D, 0xC1, 0xBA, 0xBC, 0x5A, /* 0x78-0x7B */ + 0xC1, 0xBC, 0xD5, 0x8F, 0xC1, 0xBF, 0x84, 0xEE, /* 0x7C-0x7F */ + + 0x85, 0xCE, 0xC5, 0xAE, 0x8F, 0x5D, 0xC2, 0xC3, /* 0x80-0x83 */ + 0x9E, 0x56, 0xB5, 0x5A, 0xE9, 0x82, 0xF3, 0x50, /* 0x84-0x87 */ + 0xFB, 0x90, 0xC0, 0xE8, 0xC1, 0xA6, 0x95, 0xD1, /* 0x88-0x8B */ + 0x9A, 0x76, 0xDE, 0x5D, 0xC4, 0xEA, 0x91, 0x7A, /* 0x8C-0x8F */ + 0x91, 0xD9, 0x93, 0xD3, 0x9D, 0x69, 0x9F, 0x92, /* 0x90-0x93 */ + 0xAD, 0x49, 0xFD, 0x9E, 0xBE, 0x9A, 0xC2, 0x93, /* 0x94-0x97 */ + 0xDD, 0x82, 0xC9, 0x8F, 0xDF, 0x42, 0xE5, 0x80, /* 0x98-0x9B */ + 0xC1, 0xD0, 0xC1, 0xD3, 0xD1, 0xCA, 0xC1, 0xD2, /* 0x9C-0x9F */ + 0xC1, 0xD1, 0xD5, 0x66, 0xC1, 0xAE, 0xC4, 0xEE, /* 0xA0-0xA3 */ + 0xC4, 0xED, 0x9A, 0x9A, 0xBA, 0x9F, 0xAB, 0x43, /* 0xA4-0xA7 */ + 0xC1, 0xEE, 0xE0, 0xF2, 0x8C, 0x8E, 0x8E, 0x58, /* 0xA8-0xAB */ + 0xC1, 0xAF, 0xC1, 
0xE1, 0xAC, 0x93, 0xC1, 0xE7, /* 0xAC-0xAF */ + 0xF1, 0xF6, 0xE2, 0x8F, 0xC1, 0xE3, 0xEC, 0x60, /* 0xB0-0xB3 */ + 0xEE, 0x49, 0xC0, 0xFD, 0xB6, 0x59, 0xF5, 0xB7, /* 0xB4-0xB7 */ + 0xEB, 0x60, 0x90, 0xBA, 0xC1, 0xCB, 0xC1, 0xC5, /* 0xB8-0xBB */ + 0xE5, 0xBC, 0xC4, 0xF2, 0xC1, 0xCF, 0x98, 0xB7, /* 0xBC-0xBF */ + 0xC1, 0xC7, 0xAF, 0x9F, 0xDE, 0xA4, 0xDF, 0x7C, /* 0xC0-0xC3 */ + 0xFD, 0x88, 0x95, 0x9E, 0xC8, 0xEE, 0x84, 0xA2, /* 0xC4-0xC7 */ + 0x96, 0x83, 0xC1, 0xF8, 0xC1, 0xF7, 0xC1, 0xEF, /* 0xC8-0xCB */ + 0xC1, 0xF0, 0xC1, 0xF4, 0xC1, 0xF2, 0xBC, 0x7E, /* 0xCC-0xCF */ + 0xEE, 0x90, 0xC1, 0xF9, 0xC2, 0xBE, 0xEA, 0x91, /* 0xD0-0xD3 */ + 0x82, 0x90, 0x8D, 0x91, 0x9C, 0x53, 0xDD, 0x86, /* 0xD4-0xD7 */ + 0xC2, 0xC9, 0x90, 0xFC, 0xC0, 0xF5, 0xC2, 0xCA, /* 0xD8-0xDB */ + 0xC2, 0xA1, 0xC0, 0xFB, 0xC0, 0xF4, 0xC2, 0xC4, /* 0xDC-0xDF */ + 0xD2, 0xD7, 0xC0, 0xEE, 0xC0, 0xE6, 0xC4, 0xE0, /* 0xE0-0xE3 */ + 0xC0, 0xED, 0xC1, 0xA1, 0xEE, 0xBE, 0xFD, 0x9F, /* 0xE4-0xE7 */ + 0xD1, 0x65, 0xC0, 0xEF, 0xEB, 0x78, 0xC4, 0xE4, /* 0xE8-0xEB */ + 0xC4, 0xE7, 0xC1, 0xDF, 0x9F, 0xFB, 0xAD, 0x55, /* 0xEC-0xEF */ + 0xCC, 0x41, 0xFD, 0xA0, 0xF7, 0x5B, 0xF7, 0xEB, /* 0xF0-0xF3 */ + 0xC1, 0xD6, 0xC1, 0xDC, 0xC5, 0x52, 0xC1, 0xA2, /* 0xF4-0xF7 */ + 0xF3, 0xD2, 0xC1, 0xA3, 0xA0, 0xEE, 0xD6, 0xCB, /* 0xF8-0xFB */ + 0xD7, 0x52, 0xCA, 0xB2, 0xB2, 0xE8, 0xB4, 0xCC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_FA[512] = { + 0xC7, 0xD0, 0xB6, 0xC8, 0xCD, 0xD8, 0xCC, 0xC7, /* 0x00-0x03 */ + 0xD5, 0xAC, 0xB6, 0xB4, 0xB1, 0xA9, 0xDD, 0x97, /* 0x04-0x07 */ + 0xD0, 0xD0, 0xBD, 0xB5, 0xD2, 0x8A, 0xC0, 0xAA, /* 0x08-0x0B */ + 0xFE, 0x40, 0xFE, 0x41, 0xFE, 0x42, 0xFE, 0x43, /* 0x0C-0x0F */ + 0x89, 0x56, 0xFE, 0x44, 0xC7, 0xE7, 0xFE, 0x45, /* 0x10-0x13 */ + 0xFE, 0x46, 0x84, 0x44, 0xD8, 0x69, 0xD2, 0xE6, /* 0x14-0x17 */ + 0xFE, 0x47, 0xC9, 0xF1, 0xCF, 0xE9, 0xB8, 0xA3, /* 0x18-0x1B */ + 0xBE, 0xB8, 0xBE, 0xAB, 0xD3, 0xF0, 0xFE, 0x48, /* 0x1C-0x1F */ + 0xFE, 0x49, 0xFE, 0x4A, 0xD6, 0x54, 0xFE, 0x4B, /* 0x20-0x23 */ + 0xFE, 0x4C, 0xD2, 0xDD, 0xB6, 0xBC, 0xFE, 0x4D, /* 0x24-0x27 */ + 0xFE, 0x4E, 0xFE, 0x4F, 0xEF, 0x88, 0xEF, 0x95, /* 0x28-0x2B */ + 0xF0, 0x5E, 0xFA, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FE[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA9, 0x55, 0xA6, 0xF2, 0x00, 0x00, 0xA6, 0xF4, /* 0x30-0x33 */ + 0xA6, 0xF5, 0xA6, 0xE0, 0xA6, 0xE1, 0xA6, 0xF0, /* 0x34-0x37 */ + 0xA6, 0xF1, 0xA6, 0xE2, 0xA6, 0xE3, 0xA6, 0xEE, /* 0x38-0x3B */ + 0xA6, 0xEF, 0xA6, 0xE6, 0xA6, 0xE7, 0xA6, 0xE4, /* 0x3C-0x3F */ + 0xA6, 0xE5, 0xA6, 0xE8, 0xA6, 0xE9, 0xA6, 0xEA, /* 0x40-0x43 */ + 0xA6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xA9, 0x68, 0xA9, 0x69, 0xA9, 0x6A, /* 0x48-0x4B */ + 0xA9, 0x6B, 0xA9, 0x6C, 0xA9, 0x6D, 
0xA9, 0x6E, /* 0x4C-0x4F */ + 0xA9, 0x6F, 0xA9, 0x70, 0xA9, 0x71, 0x00, 0x00, /* 0x50-0x53 */ + 0xA9, 0x72, 0xA9, 0x73, 0xA9, 0x74, 0xA9, 0x75, /* 0x54-0x57 */ + 0x00, 0x00, 0xA9, 0x76, 0xA9, 0x77, 0xA9, 0x78, /* 0x58-0x5B */ + 0xA9, 0x79, 0xA9, 0x7A, 0xA9, 0x7B, 0xA9, 0x7C, /* 0x5C-0x5F */ + 0xA9, 0x7D, 0xA9, 0x7E, 0xA9, 0x80, 0xA9, 0x81, /* 0x60-0x63 */ + 0xA9, 0x82, 0xA9, 0x83, 0xA9, 0x84, 0x00, 0x00, /* 0x64-0x67 */ + 0xA9, 0x85, 0xA9, 0x86, 0xA9, 0x87, 0xA9, 0x88, /* 0x68-0x6B */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, /* 0x00-0x03 */ + 0xA1, 0xE7, 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, /* 0x04-0x07 */ + 0xA3, 0xA8, 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, /* 0x08-0x0B */ + 0xA3, 0xAC, 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, /* 0x0C-0x0F */ + 0xA3, 0xB0, 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, /* 0x10-0x13 */ + 0xA3, 0xB4, 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, /* 0x14-0x17 */ + 0xA3, 0xB8, 0xA3, 0xB9, 0xA3, 0xBA, 0xA3, 0xBB, /* 0x18-0x1B */ + 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBE, 0xA3, 0xBF, /* 0x1C-0x1F */ + 0xA3, 0xC0, 0xA3, 0xC1, 0xA3, 0xC2, 0xA3, 0xC3, /* 0x20-0x23 */ + 0xA3, 0xC4, 0xA3, 0xC5, 0xA3, 0xC6, 0xA3, 0xC7, /* 0x24-0x27 */ + 0xA3, 0xC8, 0xA3, 0xC9, 0xA3, 0xCA, 0xA3, 0xCB, /* 0x28-0x2B */ + 0xA3, 0xCC, 0xA3, 0xCD, 0xA3, 0xCE, 0xA3, 0xCF, /* 0x2C-0x2F */ + 0xA3, 0xD0, 0xA3, 0xD1, 0xA3, 0xD2, 0xA3, 0xD3, /* 0x30-0x33 */ + 0xA3, 0xD4, 0xA3, 0xD5, 0xA3, 0xD6, 0xA3, 0xD7, /* 0x34-0x37 */ + 0xA3, 0xD8, 0xA3, 0xD9, 0xA3, 0xDA, 0xA3, 0xDB, /* 0x38-0x3B */ + 0xA3, 0xDC, 0xA3, 0xDD, 0xA3, 0xDE, 0xA3, 0xDF, /* 0x3C-0x3F */ + 0xA3, 0xE0, 0xA3, 0xE1, 0xA3, 0xE2, 0xA3, 0xE3, /* 0x40-0x43 */ + 0xA3, 0xE4, 0xA3, 0xE5, 0xA3, 0xE6, 0xA3, 0xE7, /* 0x44-0x47 */ + 0xA3, 0xE8, 0xA3, 0xE9, 0xA3, 0xEA, 0xA3, 0xEB, /* 0x48-0x4B */ + 0xA3, 0xEC, 0xA3, 0xED, 0xA3, 0xEE, 0xA3, 0xEF, /* 0x4C-0x4F */ + 0xA3, 0xF0, 0xA3, 0xF1, 0xA3, 0xF2, 0xA3, 0xF3, /* 0x50-0x53 */ + 0xA3, 0xF4, 0xA3, 0xF5, 0xA3, 0xF6, 0xA3, 0xF7, /* 0x54-0x57 */ + 0xA3, 0xF8, 0xA3, 0xF9, 0xA3, 0xFA, 0xA3, 0xFB, /* 0x58-0x5B */ + 0xA3, 0xFC, 0xA3, 0xFD, 0xA1, 0xAB, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA1, 0xE9, 0xA1, 0xEA, 0xA9, 0x56, 0xA3, 0xFE, /* 0xE0-0xE3 */ + 0xA9, 0x57, 0xA3, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + u2c_00, u2c_01, u2c_02, u2c_03, u2c_04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, u2c_31, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, u2c_FE, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int 
boundlen)
+{
+	const unsigned char *uni2charset;
+	unsigned char cl = uni&0xFF;
+	unsigned char ch = (uni>>8)&0xFF;
+	unsigned char out0, out1;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (uni == 0x20ac) {	/* Euro symbol. The only exception with a non-ASCII Unicode value */
+		out[0] = 0x80;
+		return 1;
+	}
+
+	if (ch == 0) {	/* handle the U00 plane */
+		/* if (cl == 0) return -EINVAL; */	/* U+0000 is legal in cp936 */
+		out0 = u2c_00[cl*2];
+		out1 = u2c_00[cl*2+1];
+		if (out0 == 0x00 && out1 == 0x00) {
+			if (cl < 0x80) {
+				out[0] = cl;
+				return 1;
+			}
+			return -EINVAL;
+		} else {
+			if (boundlen <= 1)
+				return -ENAMETOOLONG;
+			out[0] = out0;
+			out[1] = out1;
+			return 2;
+		}
+	}
+
+	uni2charset = page_uni2charset[ch];
+	if (uni2charset) {
+		if (boundlen <= 1)
+			return -ENAMETOOLONG;
+		out[0] = uni2charset[cl*2];
+		out[1] = uni2charset[cl*2+1];
+		if (out[0] == 0x00 && out[1] == 0x00)
+			return -EINVAL;
+		return 2;
+	} else
+		return -EINVAL;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen,
+		    wchar_t *uni)
+{
+	unsigned char ch, cl;
+	const wchar_t *charset2uni;
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	if (boundlen == 1) {
+		if (rawstring[0] == 0x80) {	/* Euro symbol. The only exception with a non-ASCII Unicode value */
+			*uni = 0x20ac;
+		} else {
+			*uni = rawstring[0];
+		}
+		return 1;
+	}
+
+	ch = rawstring[0];
+	cl = rawstring[1];
+
+	charset2uni = page_charset2uni[ch];
+	if (charset2uni && cl) {
+		*uni = charset2uni[cl];
+		if (*uni == 0x0000)
+			return -EINVAL;
+		n = 2;
+	} else {
+		if (ch == 0x80) {	/* Euro symbol. The only exception with a non-ASCII Unicode value */
+			*uni = 0x20ac;
+		} else {
+			*uni = ch;
+		}
+		n = 1;
+	}
+	return n;
+}
+
+static struct nls_table table = {
+	.charset = "cp936",
+	.alias = "gb2312",
+	.uni2char = uni2char,
+	.char2uni = char2uni,
+	.charset2lower = charset2lower,
+	.charset2upper = charset2upper,
+	.owner = THIS_MODULE,
+};
+
+static int __init init_nls_cp936(void)
+{
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_cp936(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_cp936)
+module_exit(exit_nls_cp936)
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NLS(gb2312);
diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c
new file mode 100644
index 00000000..8a7a2fe8
--- /dev/null
+++ b/fs/nls/nls_cp949.c
@@ -0,0 +1,13947 @@
+/*
+ * linux/fs/nls/nls_cp949.c
+ *
+ * Charset cp949 translation tables.
+ * This translation table was generated automatically, the
+ * original table can be downloaded from the Microsoft website.
+ * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include +#include +#include +#include +#include + +static const wchar_t c2u_81[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAC02,0xAC03,0xAC05,0xAC06,0xAC0B,0xAC0C,0xAC0D,/* 0x40-0x47 */ + 0xAC0E,0xAC0F,0xAC18,0xAC1E,0xAC1F,0xAC21,0xAC22,0xAC23,/* 0x48-0x4F */ + 0xAC25,0xAC26,0xAC27,0xAC28,0xAC29,0xAC2A,0xAC2B,0xAC2E,/* 0x50-0x57 */ + 0xAC32,0xAC33,0xAC34,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAC35,0xAC36,0xAC37,0xAC3A,0xAC3B,0xAC3D,0xAC3E,/* 0x60-0x67 */ + 0xAC3F,0xAC41,0xAC42,0xAC43,0xAC44,0xAC45,0xAC46,0xAC47,/* 0x68-0x6F */ + 0xAC48,0xAC49,0xAC4A,0xAC4C,0xAC4E,0xAC4F,0xAC50,0xAC51,/* 0x70-0x77 */ + 0xAC52,0xAC53,0xAC55,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAC56,0xAC57,0xAC59,0xAC5A,0xAC5B,0xAC5D,0xAC5E,/* 0x80-0x87 */ + 0xAC5F,0xAC60,0xAC61,0xAC62,0xAC63,0xAC64,0xAC65,0xAC66,/* 0x88-0x8F */ + 0xAC67,0xAC68,0xAC69,0xAC6A,0xAC6B,0xAC6C,0xAC6D,0xAC6E,/* 0x90-0x97 */ + 0xAC6F,0xAC72,0xAC73,0xAC75,0xAC76,0xAC79,0xAC7B,0xAC7C,/* 0x98-0x9F */ + 0xAC7D,0xAC7E,0xAC7F,0xAC82,0xAC87,0xAC88,0xAC8D,0xAC8E,/* 0xA0-0xA7 */ + 0xAC8F,0xAC91,0xAC92,0xAC93,0xAC95,0xAC96,0xAC97,0xAC98,/* 0xA8-0xAF */ + 0xAC99,0xAC9A,0xAC9B,0xAC9E,0xACA2,0xACA3,0xACA4,0xACA5,/* 0xB0-0xB7 */ + 0xACA6,0xACA7,0xACAB,0xACAD,0xACAE,0xACB1,0xACB2,0xACB3,/* 0xB8-0xBF */ + 0xACB4,0xACB5,0xACB6,0xACB7,0xACBA,0xACBE,0xACBF,0xACC0,/* 0xC0-0xC7 */ + 0xACC2,0xACC3,0xACC5,0xACC6,0xACC7,0xACC9,0xACCA,0xACCB,/* 0xC8-0xCF */ + 0xACCD,0xACCE,0xACCF,0xACD0,0xACD1,0xACD2,0xACD3,0xACD4,/* 0xD0-0xD7 */ + 0xACD6,0xACD8,0xACD9,0xACDA,0xACDB,0xACDC,0xACDD,0xACDE,/* 0xD8-0xDF */ + 0xACDF,0xACE2,0xACE3,0xACE5,0xACE6,0xACE9,0xACEB,0xACED,/* 0xE0-0xE7 */ + 0xACEE,0xACF2,0xACF4,0xACF7,0xACF8,0xACF9,0xACFA,0xACFB,/* 0xE8-0xEF */ + 0xACFE,0xACFF,0xAD01,0xAD02,0xAD03,0xAD05,0xAD07,0xAD08,/* 0xF0-0xF7 */ + 0xAD09,0xAD0A,0xAD0B,0xAD0E,0xAD10,0xAD12,0xAD13,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_82[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAD14,0xAD15,0xAD16,0xAD17,0xAD19,0xAD1A,0xAD1B,/* 0x40-0x47 */ + 0xAD1D,0xAD1E,0xAD1F,0xAD21,0xAD22,0xAD23,0xAD24,0xAD25,/* 0x48-0x4F */ + 0xAD26,0xAD27,0xAD28,0xAD2A,0xAD2B,0xAD2E,0xAD2F,0xAD30,/* 0x50-0x57 */ + 0xAD31,0xAD32,0xAD33,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAD36,0xAD37,0xAD39,0xAD3A,0xAD3B,0xAD3D,0xAD3E,/* 0x60-0x67 */ + 
0xAD3F,0xAD40,0xAD41,0xAD42,0xAD43,0xAD46,0xAD48,0xAD4A,/* 0x68-0x6F */ + 0xAD4B,0xAD4C,0xAD4D,0xAD4E,0xAD4F,0xAD51,0xAD52,0xAD53,/* 0x70-0x77 */ + 0xAD55,0xAD56,0xAD57,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAD59,0xAD5A,0xAD5B,0xAD5C,0xAD5D,0xAD5E,0xAD5F,/* 0x80-0x87 */ + 0xAD60,0xAD62,0xAD64,0xAD65,0xAD66,0xAD67,0xAD68,0xAD69,/* 0x88-0x8F */ + 0xAD6A,0xAD6B,0xAD6E,0xAD6F,0xAD71,0xAD72,0xAD77,0xAD78,/* 0x90-0x97 */ + 0xAD79,0xAD7A,0xAD7E,0xAD80,0xAD83,0xAD84,0xAD85,0xAD86,/* 0x98-0x9F */ + 0xAD87,0xAD8A,0xAD8B,0xAD8D,0xAD8E,0xAD8F,0xAD91,0xAD92,/* 0xA0-0xA7 */ + 0xAD93,0xAD94,0xAD95,0xAD96,0xAD97,0xAD98,0xAD99,0xAD9A,/* 0xA8-0xAF */ + 0xAD9B,0xAD9E,0xAD9F,0xADA0,0xADA1,0xADA2,0xADA3,0xADA5,/* 0xB0-0xB7 */ + 0xADA6,0xADA7,0xADA8,0xADA9,0xADAA,0xADAB,0xADAC,0xADAD,/* 0xB8-0xBF */ + 0xADAE,0xADAF,0xADB0,0xADB1,0xADB2,0xADB3,0xADB4,0xADB5,/* 0xC0-0xC7 */ + 0xADB6,0xADB8,0xADB9,0xADBA,0xADBB,0xADBC,0xADBD,0xADBE,/* 0xC8-0xCF */ + 0xADBF,0xADC2,0xADC3,0xADC5,0xADC6,0xADC7,0xADC9,0xADCA,/* 0xD0-0xD7 */ + 0xADCB,0xADCC,0xADCD,0xADCE,0xADCF,0xADD2,0xADD4,0xADD5,/* 0xD8-0xDF */ + 0xADD6,0xADD7,0xADD8,0xADD9,0xADDA,0xADDB,0xADDD,0xADDE,/* 0xE0-0xE7 */ + 0xADDF,0xADE1,0xADE2,0xADE3,0xADE5,0xADE6,0xADE7,0xADE8,/* 0xE8-0xEF */ + 0xADE9,0xADEA,0xADEB,0xADEC,0xADED,0xADEE,0xADEF,0xADF0,/* 0xF0-0xF7 */ + 0xADF1,0xADF2,0xADF3,0xADF4,0xADF5,0xADF6,0xADF7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_83[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xADFA,0xADFB,0xADFD,0xADFE,0xAE02,0xAE03,0xAE04,/* 0x40-0x47 */ + 0xAE05,0xAE06,0xAE07,0xAE0A,0xAE0C,0xAE0E,0xAE0F,0xAE10,/* 0x48-0x4F */ + 0xAE11,0xAE12,0xAE13,0xAE15,0xAE16,0xAE17,0xAE18,0xAE19,/* 0x50-0x57 */ + 0xAE1A,0xAE1B,0xAE1C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAE1D,0xAE1E,0xAE1F,0xAE20,0xAE21,0xAE22,0xAE23,/* 0x60-0x67 */ + 0xAE24,0xAE25,0xAE26,0xAE27,0xAE28,0xAE29,0xAE2A,0xAE2B,/* 0x68-0x6F */ + 0xAE2C,0xAE2D,0xAE2E,0xAE2F,0xAE32,0xAE33,0xAE35,0xAE36,/* 0x70-0x77 */ + 0xAE39,0xAE3B,0xAE3C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAE3D,0xAE3E,0xAE3F,0xAE42,0xAE44,0xAE47,0xAE48,/* 0x80-0x87 */ + 0xAE49,0xAE4B,0xAE4F,0xAE51,0xAE52,0xAE53,0xAE55,0xAE57,/* 0x88-0x8F */ + 0xAE58,0xAE59,0xAE5A,0xAE5B,0xAE5E,0xAE62,0xAE63,0xAE64,/* 0x90-0x97 */ + 0xAE66,0xAE67,0xAE6A,0xAE6B,0xAE6D,0xAE6E,0xAE6F,0xAE71,/* 0x98-0x9F */ + 0xAE72,0xAE73,0xAE74,0xAE75,0xAE76,0xAE77,0xAE7A,0xAE7E,/* 0xA0-0xA7 */ + 0xAE7F,0xAE80,0xAE81,0xAE82,0xAE83,0xAE86,0xAE87,0xAE88,/* 0xA8-0xAF */ + 0xAE89,0xAE8A,0xAE8B,0xAE8D,0xAE8E,0xAE8F,0xAE90,0xAE91,/* 0xB0-0xB7 */ + 0xAE92,0xAE93,0xAE94,0xAE95,0xAE96,0xAE97,0xAE98,0xAE99,/* 0xB8-0xBF */ + 0xAE9A,0xAE9B,0xAE9C,0xAE9D,0xAE9E,0xAE9F,0xAEA0,0xAEA1,/* 0xC0-0xC7 */ + 0xAEA2,0xAEA3,0xAEA4,0xAEA5,0xAEA6,0xAEA7,0xAEA8,0xAEA9,/* 0xC8-0xCF */ + 0xAEAA,0xAEAB,0xAEAC,0xAEAD,0xAEAE,0xAEAF,0xAEB0,0xAEB1,/* 0xD0-0xD7 */ + 0xAEB2,0xAEB3,0xAEB4,0xAEB5,0xAEB6,0xAEB7,0xAEB8,0xAEB9,/* 0xD8-0xDF */ + 
0xAEBA,0xAEBB,0xAEBF,0xAEC1,0xAEC2,0xAEC3,0xAEC5,0xAEC6,/* 0xE0-0xE7 */ + 0xAEC7,0xAEC8,0xAEC9,0xAECA,0xAECB,0xAECE,0xAED2,0xAED3,/* 0xE8-0xEF */ + 0xAED4,0xAED5,0xAED6,0xAED7,0xAEDA,0xAEDB,0xAEDD,0xAEDE,/* 0xF0-0xF7 */ + 0xAEDF,0xAEE0,0xAEE1,0xAEE2,0xAEE3,0xAEE4,0xAEE5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_84[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAEE6,0xAEE7,0xAEE9,0xAEEA,0xAEEC,0xAEEE,0xAEEF,/* 0x40-0x47 */ + 0xAEF0,0xAEF1,0xAEF2,0xAEF3,0xAEF5,0xAEF6,0xAEF7,0xAEF9,/* 0x48-0x4F */ + 0xAEFA,0xAEFB,0xAEFD,0xAEFE,0xAEFF,0xAF00,0xAF01,0xAF02,/* 0x50-0x57 */ + 0xAF03,0xAF04,0xAF05,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAF06,0xAF09,0xAF0A,0xAF0B,0xAF0C,0xAF0E,0xAF0F,/* 0x60-0x67 */ + 0xAF11,0xAF12,0xAF13,0xAF14,0xAF15,0xAF16,0xAF17,0xAF18,/* 0x68-0x6F */ + 0xAF19,0xAF1A,0xAF1B,0xAF1C,0xAF1D,0xAF1E,0xAF1F,0xAF20,/* 0x70-0x77 */ + 0xAF21,0xAF22,0xAF23,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xAF24,0xAF25,0xAF26,0xAF27,0xAF28,0xAF29,0xAF2A,/* 0x80-0x87 */ + 0xAF2B,0xAF2E,0xAF2F,0xAF31,0xAF33,0xAF35,0xAF36,0xAF37,/* 0x88-0x8F */ + 0xAF38,0xAF39,0xAF3A,0xAF3B,0xAF3E,0xAF40,0xAF44,0xAF45,/* 0x90-0x97 */ + 0xAF46,0xAF47,0xAF4A,0xAF4B,0xAF4C,0xAF4D,0xAF4E,0xAF4F,/* 0x98-0x9F */ + 0xAF51,0xAF52,0xAF53,0xAF54,0xAF55,0xAF56,0xAF57,0xAF58,/* 0xA0-0xA7 */ + 0xAF59,0xAF5A,0xAF5B,0xAF5E,0xAF5F,0xAF60,0xAF61,0xAF62,/* 0xA8-0xAF */ + 0xAF63,0xAF66,0xAF67,0xAF68,0xAF69,0xAF6A,0xAF6B,0xAF6C,/* 0xB0-0xB7 */ + 0xAF6D,0xAF6E,0xAF6F,0xAF70,0xAF71,0xAF72,0xAF73,0xAF74,/* 0xB8-0xBF */ + 0xAF75,0xAF76,0xAF77,0xAF78,0xAF7A,0xAF7B,0xAF7C,0xAF7D,/* 0xC0-0xC7 */ + 0xAF7E,0xAF7F,0xAF81,0xAF82,0xAF83,0xAF85,0xAF86,0xAF87,/* 0xC8-0xCF */ + 0xAF89,0xAF8A,0xAF8B,0xAF8C,0xAF8D,0xAF8E,0xAF8F,0xAF92,/* 0xD0-0xD7 */ + 0xAF93,0xAF94,0xAF96,0xAF97,0xAF98,0xAF99,0xAF9A,0xAF9B,/* 0xD8-0xDF */ + 0xAF9D,0xAF9E,0xAF9F,0xAFA0,0xAFA1,0xAFA2,0xAFA3,0xAFA4,/* 0xE0-0xE7 */ + 0xAFA5,0xAFA6,0xAFA7,0xAFA8,0xAFA9,0xAFAA,0xAFAB,0xAFAC,/* 0xE8-0xEF */ + 0xAFAD,0xAFAE,0xAFAF,0xAFB0,0xAFB1,0xAFB2,0xAFB3,0xAFB4,/* 0xF0-0xF7 */ + 0xAFB5,0xAFB6,0xAFB7,0xAFBA,0xAFBB,0xAFBD,0xAFBE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_85[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xAFBF,0xAFC1,0xAFC2,0xAFC3,0xAFC4,0xAFC5,0xAFC6,/* 0x40-0x47 */ + 0xAFCA,0xAFCC,0xAFCF,0xAFD0,0xAFD1,0xAFD2,0xAFD3,0xAFD5,/* 0x48-0x4F */ + 0xAFD6,0xAFD7,0xAFD8,0xAFD9,0xAFDA,0xAFDB,0xAFDD,0xAFDE,/* 
0x50-0x57 */ + 0xAFDF,0xAFE0,0xAFE1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xAFE2,0xAFE3,0xAFE4,0xAFE5,0xAFE6,0xAFE7,0xAFEA,/* 0x60-0x67 */ + 0xAFEB,0xAFEC,0xAFED,0xAFEE,0xAFEF,0xAFF2,0xAFF3,0xAFF5,/* 0x68-0x6F */ + 0xAFF6,0xAFF7,0xAFF9,0xAFFA,0xAFFB,0xAFFC,0xAFFD,0xAFFE,/* 0x70-0x77 */ + 0xAFFF,0xB002,0xB003,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB005,0xB006,0xB007,0xB008,0xB009,0xB00A,0xB00B,/* 0x80-0x87 */ + 0xB00D,0xB00E,0xB00F,0xB011,0xB012,0xB013,0xB015,0xB016,/* 0x88-0x8F */ + 0xB017,0xB018,0xB019,0xB01A,0xB01B,0xB01E,0xB01F,0xB020,/* 0x90-0x97 */ + 0xB021,0xB022,0xB023,0xB024,0xB025,0xB026,0xB027,0xB029,/* 0x98-0x9F */ + 0xB02A,0xB02B,0xB02C,0xB02D,0xB02E,0xB02F,0xB030,0xB031,/* 0xA0-0xA7 */ + 0xB032,0xB033,0xB034,0xB035,0xB036,0xB037,0xB038,0xB039,/* 0xA8-0xAF */ + 0xB03A,0xB03B,0xB03C,0xB03D,0xB03E,0xB03F,0xB040,0xB041,/* 0xB0-0xB7 */ + 0xB042,0xB043,0xB046,0xB047,0xB049,0xB04B,0xB04D,0xB04F,/* 0xB8-0xBF */ + 0xB050,0xB051,0xB052,0xB056,0xB058,0xB05A,0xB05B,0xB05C,/* 0xC0-0xC7 */ + 0xB05E,0xB05F,0xB060,0xB061,0xB062,0xB063,0xB064,0xB065,/* 0xC8-0xCF */ + 0xB066,0xB067,0xB068,0xB069,0xB06A,0xB06B,0xB06C,0xB06D,/* 0xD0-0xD7 */ + 0xB06E,0xB06F,0xB070,0xB071,0xB072,0xB073,0xB074,0xB075,/* 0xD8-0xDF */ + 0xB076,0xB077,0xB078,0xB079,0xB07A,0xB07B,0xB07E,0xB07F,/* 0xE0-0xE7 */ + 0xB081,0xB082,0xB083,0xB085,0xB086,0xB087,0xB088,0xB089,/* 0xE8-0xEF */ + 0xB08A,0xB08B,0xB08E,0xB090,0xB092,0xB093,0xB094,0xB095,/* 0xF0-0xF7 */ + 0xB096,0xB097,0xB09B,0xB09D,0xB09E,0xB0A3,0xB0A4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_86[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB0A5,0xB0A6,0xB0A7,0xB0AA,0xB0B0,0xB0B2,0xB0B6,/* 0x40-0x47 */ + 0xB0B7,0xB0B9,0xB0BA,0xB0BB,0xB0BD,0xB0BE,0xB0BF,0xB0C0,/* 0x48-0x4F */ + 0xB0C1,0xB0C2,0xB0C3,0xB0C6,0xB0CA,0xB0CB,0xB0CC,0xB0CD,/* 0x50-0x57 */ + 0xB0CE,0xB0CF,0xB0D2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB0D3,0xB0D5,0xB0D6,0xB0D7,0xB0D9,0xB0DA,0xB0DB,/* 0x60-0x67 */ + 0xB0DC,0xB0DD,0xB0DE,0xB0DF,0xB0E1,0xB0E2,0xB0E3,0xB0E4,/* 0x68-0x6F */ + 0xB0E6,0xB0E7,0xB0E8,0xB0E9,0xB0EA,0xB0EB,0xB0EC,0xB0ED,/* 0x70-0x77 */ + 0xB0EE,0xB0EF,0xB0F0,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB0F1,0xB0F2,0xB0F3,0xB0F4,0xB0F5,0xB0F6,0xB0F7,/* 0x80-0x87 */ + 0xB0F8,0xB0F9,0xB0FA,0xB0FB,0xB0FC,0xB0FD,0xB0FE,0xB0FF,/* 0x88-0x8F */ + 0xB100,0xB101,0xB102,0xB103,0xB104,0xB105,0xB106,0xB107,/* 0x90-0x97 */ + 0xB10A,0xB10D,0xB10E,0xB10F,0xB111,0xB114,0xB115,0xB116,/* 0x98-0x9F */ + 0xB117,0xB11A,0xB11E,0xB11F,0xB120,0xB121,0xB122,0xB126,/* 0xA0-0xA7 */ + 0xB127,0xB129,0xB12A,0xB12B,0xB12D,0xB12E,0xB12F,0xB130,/* 0xA8-0xAF */ + 0xB131,0xB132,0xB133,0xB136,0xB13A,0xB13B,0xB13C,0xB13D,/* 0xB0-0xB7 */ + 0xB13E,0xB13F,0xB142,0xB143,0xB145,0xB146,0xB147,0xB149,/* 0xB8-0xBF */ + 0xB14A,0xB14B,0xB14C,0xB14D,0xB14E,0xB14F,0xB152,0xB153,/* 0xC0-0xC7 */ + 0xB156,0xB157,0xB159,0xB15A,0xB15B,0xB15D,0xB15E,0xB15F,/* 0xC8-0xCF */ + 
0xB161,0xB162,0xB163,0xB164,0xB165,0xB166,0xB167,0xB168,/* 0xD0-0xD7 */ + 0xB169,0xB16A,0xB16B,0xB16C,0xB16D,0xB16E,0xB16F,0xB170,/* 0xD8-0xDF */ + 0xB171,0xB172,0xB173,0xB174,0xB175,0xB176,0xB177,0xB17A,/* 0xE0-0xE7 */ + 0xB17B,0xB17D,0xB17E,0xB17F,0xB181,0xB183,0xB184,0xB185,/* 0xE8-0xEF */ + 0xB186,0xB187,0xB18A,0xB18C,0xB18E,0xB18F,0xB190,0xB191,/* 0xF0-0xF7 */ + 0xB195,0xB196,0xB197,0xB199,0xB19A,0xB19B,0xB19D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_87[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB19E,0xB19F,0xB1A0,0xB1A1,0xB1A2,0xB1A3,0xB1A4,/* 0x40-0x47 */ + 0xB1A5,0xB1A6,0xB1A7,0xB1A9,0xB1AA,0xB1AB,0xB1AC,0xB1AD,/* 0x48-0x4F */ + 0xB1AE,0xB1AF,0xB1B0,0xB1B1,0xB1B2,0xB1B3,0xB1B4,0xB1B5,/* 0x50-0x57 */ + 0xB1B6,0xB1B7,0xB1B8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB1B9,0xB1BA,0xB1BB,0xB1BC,0xB1BD,0xB1BE,0xB1BF,/* 0x60-0x67 */ + 0xB1C0,0xB1C1,0xB1C2,0xB1C3,0xB1C4,0xB1C5,0xB1C6,0xB1C7,/* 0x68-0x6F */ + 0xB1C8,0xB1C9,0xB1CA,0xB1CB,0xB1CD,0xB1CE,0xB1CF,0xB1D1,/* 0x70-0x77 */ + 0xB1D2,0xB1D3,0xB1D5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB1D6,0xB1D7,0xB1D8,0xB1D9,0xB1DA,0xB1DB,0xB1DE,/* 0x80-0x87 */ + 0xB1E0,0xB1E1,0xB1E2,0xB1E3,0xB1E4,0xB1E5,0xB1E6,0xB1E7,/* 0x88-0x8F */ + 0xB1EA,0xB1EB,0xB1ED,0xB1EE,0xB1EF,0xB1F1,0xB1F2,0xB1F3,/* 0x90-0x97 */ + 0xB1F4,0xB1F5,0xB1F6,0xB1F7,0xB1F8,0xB1FA,0xB1FC,0xB1FE,/* 0x98-0x9F */ + 0xB1FF,0xB200,0xB201,0xB202,0xB203,0xB206,0xB207,0xB209,/* 0xA0-0xA7 */ + 0xB20A,0xB20D,0xB20E,0xB20F,0xB210,0xB211,0xB212,0xB213,/* 0xA8-0xAF */ + 0xB216,0xB218,0xB21A,0xB21B,0xB21C,0xB21D,0xB21E,0xB21F,/* 0xB0-0xB7 */ + 0xB221,0xB222,0xB223,0xB224,0xB225,0xB226,0xB227,0xB228,/* 0xB8-0xBF */ + 0xB229,0xB22A,0xB22B,0xB22C,0xB22D,0xB22E,0xB22F,0xB230,/* 0xC0-0xC7 */ + 0xB231,0xB232,0xB233,0xB235,0xB236,0xB237,0xB238,0xB239,/* 0xC8-0xCF */ + 0xB23A,0xB23B,0xB23D,0xB23E,0xB23F,0xB240,0xB241,0xB242,/* 0xD0-0xD7 */ + 0xB243,0xB244,0xB245,0xB246,0xB247,0xB248,0xB249,0xB24A,/* 0xD8-0xDF */ + 0xB24B,0xB24C,0xB24D,0xB24E,0xB24F,0xB250,0xB251,0xB252,/* 0xE0-0xE7 */ + 0xB253,0xB254,0xB255,0xB256,0xB257,0xB259,0xB25A,0xB25B,/* 0xE8-0xEF */ + 0xB25D,0xB25E,0xB25F,0xB261,0xB262,0xB263,0xB264,0xB265,/* 0xF0-0xF7 */ + 0xB266,0xB267,0xB26A,0xB26B,0xB26C,0xB26D,0xB26E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_88[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB26F,0xB270,0xB271,0xB272,0xB273,0xB276,0xB277,/* 
0x40-0x47 */ + 0xB278,0xB279,0xB27A,0xB27B,0xB27D,0xB27E,0xB27F,0xB280,/* 0x48-0x4F */ + 0xB281,0xB282,0xB283,0xB286,0xB287,0xB288,0xB28A,0xB28B,/* 0x50-0x57 */ + 0xB28C,0xB28D,0xB28E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB28F,0xB292,0xB293,0xB295,0xB296,0xB297,0xB29B,/* 0x60-0x67 */ + 0xB29C,0xB29D,0xB29E,0xB29F,0xB2A2,0xB2A4,0xB2A7,0xB2A8,/* 0x68-0x6F */ + 0xB2A9,0xB2AB,0xB2AD,0xB2AE,0xB2AF,0xB2B1,0xB2B2,0xB2B3,/* 0x70-0x77 */ + 0xB2B5,0xB2B6,0xB2B7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB2B8,0xB2B9,0xB2BA,0xB2BB,0xB2BC,0xB2BD,0xB2BE,/* 0x80-0x87 */ + 0xB2BF,0xB2C0,0xB2C1,0xB2C2,0xB2C3,0xB2C4,0xB2C5,0xB2C6,/* 0x88-0x8F */ + 0xB2C7,0xB2CA,0xB2CB,0xB2CD,0xB2CE,0xB2CF,0xB2D1,0xB2D3,/* 0x90-0x97 */ + 0xB2D4,0xB2D5,0xB2D6,0xB2D7,0xB2DA,0xB2DC,0xB2DE,0xB2DF,/* 0x98-0x9F */ + 0xB2E0,0xB2E1,0xB2E3,0xB2E7,0xB2E9,0xB2EA,0xB2F0,0xB2F1,/* 0xA0-0xA7 */ + 0xB2F2,0xB2F6,0xB2FC,0xB2FD,0xB2FE,0xB302,0xB303,0xB305,/* 0xA8-0xAF */ + 0xB306,0xB307,0xB309,0xB30A,0xB30B,0xB30C,0xB30D,0xB30E,/* 0xB0-0xB7 */ + 0xB30F,0xB312,0xB316,0xB317,0xB318,0xB319,0xB31A,0xB31B,/* 0xB8-0xBF */ + 0xB31D,0xB31E,0xB31F,0xB320,0xB321,0xB322,0xB323,0xB324,/* 0xC0-0xC7 */ + 0xB325,0xB326,0xB327,0xB328,0xB329,0xB32A,0xB32B,0xB32C,/* 0xC8-0xCF */ + 0xB32D,0xB32E,0xB32F,0xB330,0xB331,0xB332,0xB333,0xB334,/* 0xD0-0xD7 */ + 0xB335,0xB336,0xB337,0xB338,0xB339,0xB33A,0xB33B,0xB33C,/* 0xD8-0xDF */ + 0xB33D,0xB33E,0xB33F,0xB340,0xB341,0xB342,0xB343,0xB344,/* 0xE0-0xE7 */ + 0xB345,0xB346,0xB347,0xB348,0xB349,0xB34A,0xB34B,0xB34C,/* 0xE8-0xEF */ + 0xB34D,0xB34E,0xB34F,0xB350,0xB351,0xB352,0xB353,0xB357,/* 0xF0-0xF7 */ + 0xB359,0xB35A,0xB35D,0xB360,0xB361,0xB362,0xB363,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_89[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB366,0xB368,0xB36A,0xB36C,0xB36D,0xB36F,0xB372,/* 0x40-0x47 */ + 0xB373,0xB375,0xB376,0xB377,0xB379,0xB37A,0xB37B,0xB37C,/* 0x48-0x4F */ + 0xB37D,0xB37E,0xB37F,0xB382,0xB386,0xB387,0xB388,0xB389,/* 0x50-0x57 */ + 0xB38A,0xB38B,0xB38D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB38E,0xB38F,0xB391,0xB392,0xB393,0xB395,0xB396,/* 0x60-0x67 */ + 0xB397,0xB398,0xB399,0xB39A,0xB39B,0xB39C,0xB39D,0xB39E,/* 0x68-0x6F */ + 0xB39F,0xB3A2,0xB3A3,0xB3A4,0xB3A5,0xB3A6,0xB3A7,0xB3A9,/* 0x70-0x77 */ + 0xB3AA,0xB3AB,0xB3AD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB3AE,0xB3AF,0xB3B0,0xB3B1,0xB3B2,0xB3B3,0xB3B4,/* 0x80-0x87 */ + 0xB3B5,0xB3B6,0xB3B7,0xB3B8,0xB3B9,0xB3BA,0xB3BB,0xB3BC,/* 0x88-0x8F */ + 0xB3BD,0xB3BE,0xB3BF,0xB3C0,0xB3C1,0xB3C2,0xB3C3,0xB3C6,/* 0x90-0x97 */ + 0xB3C7,0xB3C9,0xB3CA,0xB3CD,0xB3CF,0xB3D1,0xB3D2,0xB3D3,/* 0x98-0x9F */ + 0xB3D6,0xB3D8,0xB3DA,0xB3DC,0xB3DE,0xB3DF,0xB3E1,0xB3E2,/* 0xA0-0xA7 */ + 0xB3E3,0xB3E5,0xB3E6,0xB3E7,0xB3E9,0xB3EA,0xB3EB,0xB3EC,/* 0xA8-0xAF */ + 0xB3ED,0xB3EE,0xB3EF,0xB3F0,0xB3F1,0xB3F2,0xB3F3,0xB3F4,/* 0xB0-0xB7 */ + 0xB3F5,0xB3F6,0xB3F7,0xB3F8,0xB3F9,0xB3FA,0xB3FB,0xB3FD,/* 0xB8-0xBF */ + 
0xB3FE,0xB3FF,0xB400,0xB401,0xB402,0xB403,0xB404,0xB405,/* 0xC0-0xC7 */ + 0xB406,0xB407,0xB408,0xB409,0xB40A,0xB40B,0xB40C,0xB40D,/* 0xC8-0xCF */ + 0xB40E,0xB40F,0xB411,0xB412,0xB413,0xB414,0xB415,0xB416,/* 0xD0-0xD7 */ + 0xB417,0xB419,0xB41A,0xB41B,0xB41D,0xB41E,0xB41F,0xB421,/* 0xD8-0xDF */ + 0xB422,0xB423,0xB424,0xB425,0xB426,0xB427,0xB42A,0xB42C,/* 0xE0-0xE7 */ + 0xB42D,0xB42E,0xB42F,0xB430,0xB431,0xB432,0xB433,0xB435,/* 0xE8-0xEF */ + 0xB436,0xB437,0xB438,0xB439,0xB43A,0xB43B,0xB43C,0xB43D,/* 0xF0-0xF7 */ + 0xB43E,0xB43F,0xB440,0xB441,0xB442,0xB443,0xB444,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB445,0xB446,0xB447,0xB448,0xB449,0xB44A,0xB44B,/* 0x40-0x47 */ + 0xB44C,0xB44D,0xB44E,0xB44F,0xB452,0xB453,0xB455,0xB456,/* 0x48-0x4F */ + 0xB457,0xB459,0xB45A,0xB45B,0xB45C,0xB45D,0xB45E,0xB45F,/* 0x50-0x57 */ + 0xB462,0xB464,0xB466,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB467,0xB468,0xB469,0xB46A,0xB46B,0xB46D,0xB46E,/* 0x60-0x67 */ + 0xB46F,0xB470,0xB471,0xB472,0xB473,0xB474,0xB475,0xB476,/* 0x68-0x6F */ + 0xB477,0xB478,0xB479,0xB47A,0xB47B,0xB47C,0xB47D,0xB47E,/* 0x70-0x77 */ + 0xB47F,0xB481,0xB482,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB483,0xB484,0xB485,0xB486,0xB487,0xB489,0xB48A,/* 0x80-0x87 */ + 0xB48B,0xB48C,0xB48D,0xB48E,0xB48F,0xB490,0xB491,0xB492,/* 0x88-0x8F */ + 0xB493,0xB494,0xB495,0xB496,0xB497,0xB498,0xB499,0xB49A,/* 0x90-0x97 */ + 0xB49B,0xB49C,0xB49E,0xB49F,0xB4A0,0xB4A1,0xB4A2,0xB4A3,/* 0x98-0x9F */ + 0xB4A5,0xB4A6,0xB4A7,0xB4A9,0xB4AA,0xB4AB,0xB4AD,0xB4AE,/* 0xA0-0xA7 */ + 0xB4AF,0xB4B0,0xB4B1,0xB4B2,0xB4B3,0xB4B4,0xB4B6,0xB4B8,/* 0xA8-0xAF */ + 0xB4BA,0xB4BB,0xB4BC,0xB4BD,0xB4BE,0xB4BF,0xB4C1,0xB4C2,/* 0xB0-0xB7 */ + 0xB4C3,0xB4C5,0xB4C6,0xB4C7,0xB4C9,0xB4CA,0xB4CB,0xB4CC,/* 0xB8-0xBF */ + 0xB4CD,0xB4CE,0xB4CF,0xB4D1,0xB4D2,0xB4D3,0xB4D4,0xB4D6,/* 0xC0-0xC7 */ + 0xB4D7,0xB4D8,0xB4D9,0xB4DA,0xB4DB,0xB4DE,0xB4DF,0xB4E1,/* 0xC8-0xCF */ + 0xB4E2,0xB4E5,0xB4E7,0xB4E8,0xB4E9,0xB4EA,0xB4EB,0xB4EE,/* 0xD0-0xD7 */ + 0xB4F0,0xB4F2,0xB4F3,0xB4F4,0xB4F5,0xB4F6,0xB4F7,0xB4F9,/* 0xD8-0xDF */ + 0xB4FA,0xB4FB,0xB4FC,0xB4FD,0xB4FE,0xB4FF,0xB500,0xB501,/* 0xE0-0xE7 */ + 0xB502,0xB503,0xB504,0xB505,0xB506,0xB507,0xB508,0xB509,/* 0xE8-0xEF */ + 0xB50A,0xB50B,0xB50C,0xB50D,0xB50E,0xB50F,0xB510,0xB511,/* 0xF0-0xF7 */ + 0xB512,0xB513,0xB516,0xB517,0xB519,0xB51A,0xB51D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB51E,0xB51F,0xB520,0xB521,0xB522,0xB523,0xB526,/* 0x40-0x47 */ + 0xB52B,0xB52C,0xB52D,0xB52E,0xB52F,0xB532,0xB533,0xB535,/* 0x48-0x4F */ + 0xB536,0xB537,0xB539,0xB53A,0xB53B,0xB53C,0xB53D,0xB53E,/* 0x50-0x57 */ + 0xB53F,0xB542,0xB546,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB547,0xB548,0xB549,0xB54A,0xB54E,0xB54F,0xB551,/* 0x60-0x67 */ + 0xB552,0xB553,0xB555,0xB556,0xB557,0xB558,0xB559,0xB55A,/* 0x68-0x6F */ + 0xB55B,0xB55E,0xB562,0xB563,0xB564,0xB565,0xB566,0xB567,/* 0x70-0x77 */ + 0xB568,0xB569,0xB56A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB56B,0xB56C,0xB56D,0xB56E,0xB56F,0xB570,0xB571,/* 0x80-0x87 */ + 0xB572,0xB573,0xB574,0xB575,0xB576,0xB577,0xB578,0xB579,/* 0x88-0x8F */ + 0xB57A,0xB57B,0xB57C,0xB57D,0xB57E,0xB57F,0xB580,0xB581,/* 0x90-0x97 */ + 0xB582,0xB583,0xB584,0xB585,0xB586,0xB587,0xB588,0xB589,/* 0x98-0x9F */ + 0xB58A,0xB58B,0xB58C,0xB58D,0xB58E,0xB58F,0xB590,0xB591,/* 0xA0-0xA7 */ + 0xB592,0xB593,0xB594,0xB595,0xB596,0xB597,0xB598,0xB599,/* 0xA8-0xAF */ + 0xB59A,0xB59B,0xB59C,0xB59D,0xB59E,0xB59F,0xB5A2,0xB5A3,/* 0xB0-0xB7 */ + 0xB5A5,0xB5A6,0xB5A7,0xB5A9,0xB5AC,0xB5AD,0xB5AE,0xB5AF,/* 0xB8-0xBF */ + 0xB5B2,0xB5B6,0xB5B7,0xB5B8,0xB5B9,0xB5BA,0xB5BE,0xB5BF,/* 0xC0-0xC7 */ + 0xB5C1,0xB5C2,0xB5C3,0xB5C5,0xB5C6,0xB5C7,0xB5C8,0xB5C9,/* 0xC8-0xCF */ + 0xB5CA,0xB5CB,0xB5CE,0xB5D2,0xB5D3,0xB5D4,0xB5D5,0xB5D6,/* 0xD0-0xD7 */ + 0xB5D7,0xB5D9,0xB5DA,0xB5DB,0xB5DC,0xB5DD,0xB5DE,0xB5DF,/* 0xD8-0xDF */ + 0xB5E0,0xB5E1,0xB5E2,0xB5E3,0xB5E4,0xB5E5,0xB5E6,0xB5E7,/* 0xE0-0xE7 */ + 0xB5E8,0xB5E9,0xB5EA,0xB5EB,0xB5ED,0xB5EE,0xB5EF,0xB5F0,/* 0xE8-0xEF */ + 0xB5F1,0xB5F2,0xB5F3,0xB5F4,0xB5F5,0xB5F6,0xB5F7,0xB5F8,/* 0xF0-0xF7 */ + 0xB5F9,0xB5FA,0xB5FB,0xB5FC,0xB5FD,0xB5FE,0xB5FF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB600,0xB601,0xB602,0xB603,0xB604,0xB605,0xB606,/* 0x40-0x47 */ + 0xB607,0xB608,0xB609,0xB60A,0xB60B,0xB60C,0xB60D,0xB60E,/* 0x48-0x4F */ + 0xB60F,0xB612,0xB613,0xB615,0xB616,0xB617,0xB619,0xB61A,/* 0x50-0x57 */ + 0xB61B,0xB61C,0xB61D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB61E,0xB61F,0xB620,0xB621,0xB622,0xB623,0xB624,/* 0x60-0x67 */ + 0xB626,0xB627,0xB628,0xB629,0xB62A,0xB62B,0xB62D,0xB62E,/* 0x68-0x6F */ + 0xB62F,0xB630,0xB631,0xB632,0xB633,0xB635,0xB636,0xB637,/* 0x70-0x77 */ + 0xB638,0xB639,0xB63A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB63B,0xB63C,0xB63D,0xB63E,0xB63F,0xB640,0xB641,/* 0x80-0x87 */ + 0xB642,0xB643,0xB644,0xB645,0xB646,0xB647,0xB649,0xB64A,/* 0x88-0x8F */ + 0xB64B,0xB64C,0xB64D,0xB64E,0xB64F,0xB650,0xB651,0xB652,/* 0x90-0x97 */ + 0xB653,0xB654,0xB655,0xB656,0xB657,0xB658,0xB659,0xB65A,/* 0x98-0x9F */ + 0xB65B,0xB65C,0xB65D,0xB65E,0xB65F,0xB660,0xB661,0xB662,/* 0xA0-0xA7 */ + 0xB663,0xB665,0xB666,0xB667,0xB669,0xB66A,0xB66B,0xB66C,/* 0xA8-0xAF */ + 
0xB66D,0xB66E,0xB66F,0xB670,0xB671,0xB672,0xB673,0xB674,/* 0xB0-0xB7 */ + 0xB675,0xB676,0xB677,0xB678,0xB679,0xB67A,0xB67B,0xB67C,/* 0xB8-0xBF */ + 0xB67D,0xB67E,0xB67F,0xB680,0xB681,0xB682,0xB683,0xB684,/* 0xC0-0xC7 */ + 0xB685,0xB686,0xB687,0xB688,0xB689,0xB68A,0xB68B,0xB68C,/* 0xC8-0xCF */ + 0xB68D,0xB68E,0xB68F,0xB690,0xB691,0xB692,0xB693,0xB694,/* 0xD0-0xD7 */ + 0xB695,0xB696,0xB697,0xB698,0xB699,0xB69A,0xB69B,0xB69E,/* 0xD8-0xDF */ + 0xB69F,0xB6A1,0xB6A2,0xB6A3,0xB6A5,0xB6A6,0xB6A7,0xB6A8,/* 0xE0-0xE7 */ + 0xB6A9,0xB6AA,0xB6AD,0xB6AE,0xB6AF,0xB6B0,0xB6B2,0xB6B3,/* 0xE8-0xEF */ + 0xB6B4,0xB6B5,0xB6B6,0xB6B7,0xB6B8,0xB6B9,0xB6BA,0xB6BB,/* 0xF0-0xF7 */ + 0xB6BC,0xB6BD,0xB6BE,0xB6BF,0xB6C0,0xB6C1,0xB6C2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB6C3,0xB6C4,0xB6C5,0xB6C6,0xB6C7,0xB6C8,0xB6C9,/* 0x40-0x47 */ + 0xB6CA,0xB6CB,0xB6CC,0xB6CD,0xB6CE,0xB6CF,0xB6D0,0xB6D1,/* 0x48-0x4F */ + 0xB6D2,0xB6D3,0xB6D5,0xB6D6,0xB6D7,0xB6D8,0xB6D9,0xB6DA,/* 0x50-0x57 */ + 0xB6DB,0xB6DC,0xB6DD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB6DE,0xB6DF,0xB6E0,0xB6E1,0xB6E2,0xB6E3,0xB6E4,/* 0x60-0x67 */ + 0xB6E5,0xB6E6,0xB6E7,0xB6E8,0xB6E9,0xB6EA,0xB6EB,0xB6EC,/* 0x68-0x6F */ + 0xB6ED,0xB6EE,0xB6EF,0xB6F1,0xB6F2,0xB6F3,0xB6F5,0xB6F6,/* 0x70-0x77 */ + 0xB6F7,0xB6F9,0xB6FA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB6FB,0xB6FC,0xB6FD,0xB6FE,0xB6FF,0xB702,0xB703,/* 0x80-0x87 */ + 0xB704,0xB706,0xB707,0xB708,0xB709,0xB70A,0xB70B,0xB70C,/* 0x88-0x8F */ + 0xB70D,0xB70E,0xB70F,0xB710,0xB711,0xB712,0xB713,0xB714,/* 0x90-0x97 */ + 0xB715,0xB716,0xB717,0xB718,0xB719,0xB71A,0xB71B,0xB71C,/* 0x98-0x9F */ + 0xB71D,0xB71E,0xB71F,0xB720,0xB721,0xB722,0xB723,0xB724,/* 0xA0-0xA7 */ + 0xB725,0xB726,0xB727,0xB72A,0xB72B,0xB72D,0xB72E,0xB731,/* 0xA8-0xAF */ + 0xB732,0xB733,0xB734,0xB735,0xB736,0xB737,0xB73A,0xB73C,/* 0xB0-0xB7 */ + 0xB73D,0xB73E,0xB73F,0xB740,0xB741,0xB742,0xB743,0xB745,/* 0xB8-0xBF */ + 0xB746,0xB747,0xB749,0xB74A,0xB74B,0xB74D,0xB74E,0xB74F,/* 0xC0-0xC7 */ + 0xB750,0xB751,0xB752,0xB753,0xB756,0xB757,0xB758,0xB759,/* 0xC8-0xCF */ + 0xB75A,0xB75B,0xB75C,0xB75D,0xB75E,0xB75F,0xB761,0xB762,/* 0xD0-0xD7 */ + 0xB763,0xB765,0xB766,0xB767,0xB769,0xB76A,0xB76B,0xB76C,/* 0xD8-0xDF */ + 0xB76D,0xB76E,0xB76F,0xB772,0xB774,0xB776,0xB777,0xB778,/* 0xE0-0xE7 */ + 0xB779,0xB77A,0xB77B,0xB77E,0xB77F,0xB781,0xB782,0xB783,/* 0xE8-0xEF */ + 0xB785,0xB786,0xB787,0xB788,0xB789,0xB78A,0xB78B,0xB78E,/* 0xF0-0xF7 */ + 0xB793,0xB794,0xB795,0xB79A,0xB79B,0xB79D,0xB79E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB79F,0xB7A1,0xB7A2,0xB7A3,0xB7A4,0xB7A5,0xB7A6,/* 0x40-0x47 */ + 0xB7A7,0xB7AA,0xB7AE,0xB7AF,0xB7B0,0xB7B1,0xB7B2,0xB7B3,/* 0x48-0x4F */ + 0xB7B6,0xB7B7,0xB7B9,0xB7BA,0xB7BB,0xB7BC,0xB7BD,0xB7BE,/* 0x50-0x57 */ + 0xB7BF,0xB7C0,0xB7C1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB7C2,0xB7C3,0xB7C4,0xB7C5,0xB7C6,0xB7C8,0xB7CA,/* 0x60-0x67 */ + 0xB7CB,0xB7CC,0xB7CD,0xB7CE,0xB7CF,0xB7D0,0xB7D1,0xB7D2,/* 0x68-0x6F */ + 0xB7D3,0xB7D4,0xB7D5,0xB7D6,0xB7D7,0xB7D8,0xB7D9,0xB7DA,/* 0x70-0x77 */ + 0xB7DB,0xB7DC,0xB7DD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB7DE,0xB7DF,0xB7E0,0xB7E1,0xB7E2,0xB7E3,0xB7E4,/* 0x80-0x87 */ + 0xB7E5,0xB7E6,0xB7E7,0xB7E8,0xB7E9,0xB7EA,0xB7EB,0xB7EE,/* 0x88-0x8F */ + 0xB7EF,0xB7F1,0xB7F2,0xB7F3,0xB7F5,0xB7F6,0xB7F7,0xB7F8,/* 0x90-0x97 */ + 0xB7F9,0xB7FA,0xB7FB,0xB7FE,0xB802,0xB803,0xB804,0xB805,/* 0x98-0x9F */ + 0xB806,0xB80A,0xB80B,0xB80D,0xB80E,0xB80F,0xB811,0xB812,/* 0xA0-0xA7 */ + 0xB813,0xB814,0xB815,0xB816,0xB817,0xB81A,0xB81C,0xB81E,/* 0xA8-0xAF */ + 0xB81F,0xB820,0xB821,0xB822,0xB823,0xB826,0xB827,0xB829,/* 0xB0-0xB7 */ + 0xB82A,0xB82B,0xB82D,0xB82E,0xB82F,0xB830,0xB831,0xB832,/* 0xB8-0xBF */ + 0xB833,0xB836,0xB83A,0xB83B,0xB83C,0xB83D,0xB83E,0xB83F,/* 0xC0-0xC7 */ + 0xB841,0xB842,0xB843,0xB845,0xB846,0xB847,0xB848,0xB849,/* 0xC8-0xCF */ + 0xB84A,0xB84B,0xB84C,0xB84D,0xB84E,0xB84F,0xB850,0xB852,/* 0xD0-0xD7 */ + 0xB854,0xB855,0xB856,0xB857,0xB858,0xB859,0xB85A,0xB85B,/* 0xD8-0xDF */ + 0xB85E,0xB85F,0xB861,0xB862,0xB863,0xB865,0xB866,0xB867,/* 0xE0-0xE7 */ + 0xB868,0xB869,0xB86A,0xB86B,0xB86E,0xB870,0xB872,0xB873,/* 0xE8-0xEF */ + 0xB874,0xB875,0xB876,0xB877,0xB879,0xB87A,0xB87B,0xB87D,/* 0xF0-0xF7 */ + 0xB87E,0xB87F,0xB880,0xB881,0xB882,0xB883,0xB884,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_8F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB885,0xB886,0xB887,0xB888,0xB889,0xB88A,0xB88B,/* 0x40-0x47 */ + 0xB88C,0xB88E,0xB88F,0xB890,0xB891,0xB892,0xB893,0xB894,/* 0x48-0x4F */ + 0xB895,0xB896,0xB897,0xB898,0xB899,0xB89A,0xB89B,0xB89C,/* 0x50-0x57 */ + 0xB89D,0xB89E,0xB89F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB8A0,0xB8A1,0xB8A2,0xB8A3,0xB8A4,0xB8A5,0xB8A6,/* 0x60-0x67 */ + 0xB8A7,0xB8A9,0xB8AA,0xB8AB,0xB8AC,0xB8AD,0xB8AE,0xB8AF,/* 0x68-0x6F */ + 0xB8B1,0xB8B2,0xB8B3,0xB8B5,0xB8B6,0xB8B7,0xB8B9,0xB8BA,/* 0x70-0x77 */ + 0xB8BB,0xB8BC,0xB8BD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB8BE,0xB8BF,0xB8C2,0xB8C4,0xB8C6,0xB8C7,0xB8C8,/* 0x80-0x87 */ + 0xB8C9,0xB8CA,0xB8CB,0xB8CD,0xB8CE,0xB8CF,0xB8D1,0xB8D2,/* 0x88-0x8F */ + 0xB8D3,0xB8D5,0xB8D6,0xB8D7,0xB8D8,0xB8D9,0xB8DA,0xB8DB,/* 0x90-0x97 */ + 0xB8DC,0xB8DE,0xB8E0,0xB8E2,0xB8E3,0xB8E4,0xB8E5,0xB8E6,/* 0x98-0x9F */ + 
0xB8E7,0xB8EA,0xB8EB,0xB8ED,0xB8EE,0xB8EF,0xB8F1,0xB8F2,/* 0xA0-0xA7 */ + 0xB8F3,0xB8F4,0xB8F5,0xB8F6,0xB8F7,0xB8FA,0xB8FC,0xB8FE,/* 0xA8-0xAF */ + 0xB8FF,0xB900,0xB901,0xB902,0xB903,0xB905,0xB906,0xB907,/* 0xB0-0xB7 */ + 0xB908,0xB909,0xB90A,0xB90B,0xB90C,0xB90D,0xB90E,0xB90F,/* 0xB8-0xBF */ + 0xB910,0xB911,0xB912,0xB913,0xB914,0xB915,0xB916,0xB917,/* 0xC0-0xC7 */ + 0xB919,0xB91A,0xB91B,0xB91C,0xB91D,0xB91E,0xB91F,0xB921,/* 0xC8-0xCF */ + 0xB922,0xB923,0xB924,0xB925,0xB926,0xB927,0xB928,0xB929,/* 0xD0-0xD7 */ + 0xB92A,0xB92B,0xB92C,0xB92D,0xB92E,0xB92F,0xB930,0xB931,/* 0xD8-0xDF */ + 0xB932,0xB933,0xB934,0xB935,0xB936,0xB937,0xB938,0xB939,/* 0xE0-0xE7 */ + 0xB93A,0xB93B,0xB93E,0xB93F,0xB941,0xB942,0xB943,0xB945,/* 0xE8-0xEF */ + 0xB946,0xB947,0xB948,0xB949,0xB94A,0xB94B,0xB94D,0xB94E,/* 0xF0-0xF7 */ + 0xB950,0xB952,0xB953,0xB954,0xB955,0xB956,0xB957,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_90[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xB95A,0xB95B,0xB95D,0xB95E,0xB95F,0xB961,0xB962,/* 0x40-0x47 */ + 0xB963,0xB964,0xB965,0xB966,0xB967,0xB96A,0xB96C,0xB96E,/* 0x48-0x4F */ + 0xB96F,0xB970,0xB971,0xB972,0xB973,0xB976,0xB977,0xB979,/* 0x50-0x57 */ + 0xB97A,0xB97B,0xB97D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xB97E,0xB97F,0xB980,0xB981,0xB982,0xB983,0xB986,/* 0x60-0x67 */ + 0xB988,0xB98B,0xB98C,0xB98F,0xB990,0xB991,0xB992,0xB993,/* 0x68-0x6F */ + 0xB994,0xB995,0xB996,0xB997,0xB998,0xB999,0xB99A,0xB99B,/* 0x70-0x77 */ + 0xB99C,0xB99D,0xB99E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xB99F,0xB9A0,0xB9A1,0xB9A2,0xB9A3,0xB9A4,0xB9A5,/* 0x80-0x87 */ + 0xB9A6,0xB9A7,0xB9A8,0xB9A9,0xB9AA,0xB9AB,0xB9AE,0xB9AF,/* 0x88-0x8F */ + 0xB9B1,0xB9B2,0xB9B3,0xB9B5,0xB9B6,0xB9B7,0xB9B8,0xB9B9,/* 0x90-0x97 */ + 0xB9BA,0xB9BB,0xB9BE,0xB9C0,0xB9C2,0xB9C3,0xB9C4,0xB9C5,/* 0x98-0x9F */ + 0xB9C6,0xB9C7,0xB9CA,0xB9CB,0xB9CD,0xB9D3,0xB9D4,0xB9D5,/* 0xA0-0xA7 */ + 0xB9D6,0xB9D7,0xB9DA,0xB9DC,0xB9DF,0xB9E0,0xB9E2,0xB9E6,/* 0xA8-0xAF */ + 0xB9E7,0xB9E9,0xB9EA,0xB9EB,0xB9ED,0xB9EE,0xB9EF,0xB9F0,/* 0xB0-0xB7 */ + 0xB9F1,0xB9F2,0xB9F3,0xB9F6,0xB9FB,0xB9FC,0xB9FD,0xB9FE,/* 0xB8-0xBF */ + 0xB9FF,0xBA02,0xBA03,0xBA04,0xBA05,0xBA06,0xBA07,0xBA09,/* 0xC0-0xC7 */ + 0xBA0A,0xBA0B,0xBA0C,0xBA0D,0xBA0E,0xBA0F,0xBA10,0xBA11,/* 0xC8-0xCF */ + 0xBA12,0xBA13,0xBA14,0xBA16,0xBA17,0xBA18,0xBA19,0xBA1A,/* 0xD0-0xD7 */ + 0xBA1B,0xBA1C,0xBA1D,0xBA1E,0xBA1F,0xBA20,0xBA21,0xBA22,/* 0xD8-0xDF */ + 0xBA23,0xBA24,0xBA25,0xBA26,0xBA27,0xBA28,0xBA29,0xBA2A,/* 0xE0-0xE7 */ + 0xBA2B,0xBA2C,0xBA2D,0xBA2E,0xBA2F,0xBA30,0xBA31,0xBA32,/* 0xE8-0xEF */ + 0xBA33,0xBA34,0xBA35,0xBA36,0xBA37,0xBA3A,0xBA3B,0xBA3D,/* 0xF0-0xF7 */ + 0xBA3E,0xBA3F,0xBA41,0xBA43,0xBA44,0xBA45,0xBA46,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_91[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBA47,0xBA4A,0xBA4C,0xBA4F,0xBA50,0xBA51,0xBA52,/* 0x40-0x47 */ + 0xBA56,0xBA57,0xBA59,0xBA5A,0xBA5B,0xBA5D,0xBA5E,0xBA5F,/* 0x48-0x4F */ + 0xBA60,0xBA61,0xBA62,0xBA63,0xBA66,0xBA6A,0xBA6B,0xBA6C,/* 0x50-0x57 */ + 0xBA6D,0xBA6E,0xBA6F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBA72,0xBA73,0xBA75,0xBA76,0xBA77,0xBA79,0xBA7A,/* 0x60-0x67 */ + 0xBA7B,0xBA7C,0xBA7D,0xBA7E,0xBA7F,0xBA80,0xBA81,0xBA82,/* 0x68-0x6F */ + 0xBA86,0xBA88,0xBA89,0xBA8A,0xBA8B,0xBA8D,0xBA8E,0xBA8F,/* 0x70-0x77 */ + 0xBA90,0xBA91,0xBA92,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBA93,0xBA94,0xBA95,0xBA96,0xBA97,0xBA98,0xBA99,/* 0x80-0x87 */ + 0xBA9A,0xBA9B,0xBA9C,0xBA9D,0xBA9E,0xBA9F,0xBAA0,0xBAA1,/* 0x88-0x8F */ + 0xBAA2,0xBAA3,0xBAA4,0xBAA5,0xBAA6,0xBAA7,0xBAAA,0xBAAD,/* 0x90-0x97 */ + 0xBAAE,0xBAAF,0xBAB1,0xBAB3,0xBAB4,0xBAB5,0xBAB6,0xBAB7,/* 0x98-0x9F */ + 0xBABA,0xBABC,0xBABE,0xBABF,0xBAC0,0xBAC1,0xBAC2,0xBAC3,/* 0xA0-0xA7 */ + 0xBAC5,0xBAC6,0xBAC7,0xBAC9,0xBACA,0xBACB,0xBACC,0xBACD,/* 0xA8-0xAF */ + 0xBACE,0xBACF,0xBAD0,0xBAD1,0xBAD2,0xBAD3,0xBAD4,0xBAD5,/* 0xB0-0xB7 */ + 0xBAD6,0xBAD7,0xBADA,0xBADB,0xBADC,0xBADD,0xBADE,0xBADF,/* 0xB8-0xBF */ + 0xBAE0,0xBAE1,0xBAE2,0xBAE3,0xBAE4,0xBAE5,0xBAE6,0xBAE7,/* 0xC0-0xC7 */ + 0xBAE8,0xBAE9,0xBAEA,0xBAEB,0xBAEC,0xBAED,0xBAEE,0xBAEF,/* 0xC8-0xCF */ + 0xBAF0,0xBAF1,0xBAF2,0xBAF3,0xBAF4,0xBAF5,0xBAF6,0xBAF7,/* 0xD0-0xD7 */ + 0xBAF8,0xBAF9,0xBAFA,0xBAFB,0xBAFD,0xBAFE,0xBAFF,0xBB01,/* 0xD8-0xDF */ + 0xBB02,0xBB03,0xBB05,0xBB06,0xBB07,0xBB08,0xBB09,0xBB0A,/* 0xE0-0xE7 */ + 0xBB0B,0xBB0C,0xBB0E,0xBB10,0xBB12,0xBB13,0xBB14,0xBB15,/* 0xE8-0xEF */ + 0xBB16,0xBB17,0xBB19,0xBB1A,0xBB1B,0xBB1D,0xBB1E,0xBB1F,/* 0xF0-0xF7 */ + 0xBB21,0xBB22,0xBB23,0xBB24,0xBB25,0xBB26,0xBB27,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_92[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBB28,0xBB2A,0xBB2C,0xBB2D,0xBB2E,0xBB2F,0xBB30,/* 0x40-0x47 */ + 0xBB31,0xBB32,0xBB33,0xBB37,0xBB39,0xBB3A,0xBB3F,0xBB40,/* 0x48-0x4F */ + 0xBB41,0xBB42,0xBB43,0xBB46,0xBB48,0xBB4A,0xBB4B,0xBB4C,/* 0x50-0x57 */ + 0xBB4E,0xBB51,0xBB52,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBB53,0xBB55,0xBB56,0xBB57,0xBB59,0xBB5A,0xBB5B,/* 0x60-0x67 */ + 0xBB5C,0xBB5D,0xBB5E,0xBB5F,0xBB60,0xBB62,0xBB64,0xBB65,/* 0x68-0x6F */ + 0xBB66,0xBB67,0xBB68,0xBB69,0xBB6A,0xBB6B,0xBB6D,0xBB6E,/* 0x70-0x77 */ + 0xBB6F,0xBB70,0xBB71,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBB72,0xBB73,0xBB74,0xBB75,0xBB76,0xBB77,0xBB78,/* 0x80-0x87 */ + 0xBB79,0xBB7A,0xBB7B,0xBB7C,0xBB7D,0xBB7E,0xBB7F,0xBB80,/* 0x88-0x8F */ + 
0xBB81,0xBB82,0xBB83,0xBB84,0xBB85,0xBB86,0xBB87,0xBB89,/* 0x90-0x97 */ + 0xBB8A,0xBB8B,0xBB8D,0xBB8E,0xBB8F,0xBB91,0xBB92,0xBB93,/* 0x98-0x9F */ + 0xBB94,0xBB95,0xBB96,0xBB97,0xBB98,0xBB99,0xBB9A,0xBB9B,/* 0xA0-0xA7 */ + 0xBB9C,0xBB9D,0xBB9E,0xBB9F,0xBBA0,0xBBA1,0xBBA2,0xBBA3,/* 0xA8-0xAF */ + 0xBBA5,0xBBA6,0xBBA7,0xBBA9,0xBBAA,0xBBAB,0xBBAD,0xBBAE,/* 0xB0-0xB7 */ + 0xBBAF,0xBBB0,0xBBB1,0xBBB2,0xBBB3,0xBBB5,0xBBB6,0xBBB8,/* 0xB8-0xBF */ + 0xBBB9,0xBBBA,0xBBBB,0xBBBC,0xBBBD,0xBBBE,0xBBBF,0xBBC1,/* 0xC0-0xC7 */ + 0xBBC2,0xBBC3,0xBBC5,0xBBC6,0xBBC7,0xBBC9,0xBBCA,0xBBCB,/* 0xC8-0xCF */ + 0xBBCC,0xBBCD,0xBBCE,0xBBCF,0xBBD1,0xBBD2,0xBBD4,0xBBD5,/* 0xD0-0xD7 */ + 0xBBD6,0xBBD7,0xBBD8,0xBBD9,0xBBDA,0xBBDB,0xBBDC,0xBBDD,/* 0xD8-0xDF */ + 0xBBDE,0xBBDF,0xBBE0,0xBBE1,0xBBE2,0xBBE3,0xBBE4,0xBBE5,/* 0xE0-0xE7 */ + 0xBBE6,0xBBE7,0xBBE8,0xBBE9,0xBBEA,0xBBEB,0xBBEC,0xBBED,/* 0xE8-0xEF */ + 0xBBEE,0xBBEF,0xBBF0,0xBBF1,0xBBF2,0xBBF3,0xBBF4,0xBBF5,/* 0xF0-0xF7 */ + 0xBBF6,0xBBF7,0xBBFA,0xBBFB,0xBBFD,0xBBFE,0xBC01,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_93[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBC03,0xBC04,0xBC05,0xBC06,0xBC07,0xBC0A,0xBC0E,/* 0x40-0x47 */ + 0xBC10,0xBC12,0xBC13,0xBC19,0xBC1A,0xBC20,0xBC21,0xBC22,/* 0x48-0x4F */ + 0xBC23,0xBC26,0xBC28,0xBC2A,0xBC2B,0xBC2C,0xBC2E,0xBC2F,/* 0x50-0x57 */ + 0xBC32,0xBC33,0xBC35,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBC36,0xBC37,0xBC39,0xBC3A,0xBC3B,0xBC3C,0xBC3D,/* 0x60-0x67 */ + 0xBC3E,0xBC3F,0xBC42,0xBC46,0xBC47,0xBC48,0xBC4A,0xBC4B,/* 0x68-0x6F */ + 0xBC4E,0xBC4F,0xBC51,0xBC52,0xBC53,0xBC54,0xBC55,0xBC56,/* 0x70-0x77 */ + 0xBC57,0xBC58,0xBC59,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBC5A,0xBC5B,0xBC5C,0xBC5E,0xBC5F,0xBC60,0xBC61,/* 0x80-0x87 */ + 0xBC62,0xBC63,0xBC64,0xBC65,0xBC66,0xBC67,0xBC68,0xBC69,/* 0x88-0x8F */ + 0xBC6A,0xBC6B,0xBC6C,0xBC6D,0xBC6E,0xBC6F,0xBC70,0xBC71,/* 0x90-0x97 */ + 0xBC72,0xBC73,0xBC74,0xBC75,0xBC76,0xBC77,0xBC78,0xBC79,/* 0x98-0x9F */ + 0xBC7A,0xBC7B,0xBC7C,0xBC7D,0xBC7E,0xBC7F,0xBC80,0xBC81,/* 0xA0-0xA7 */ + 0xBC82,0xBC83,0xBC86,0xBC87,0xBC89,0xBC8A,0xBC8D,0xBC8F,/* 0xA8-0xAF */ + 0xBC90,0xBC91,0xBC92,0xBC93,0xBC96,0xBC98,0xBC9B,0xBC9C,/* 0xB0-0xB7 */ + 0xBC9D,0xBC9E,0xBC9F,0xBCA2,0xBCA3,0xBCA5,0xBCA6,0xBCA9,/* 0xB8-0xBF */ + 0xBCAA,0xBCAB,0xBCAC,0xBCAD,0xBCAE,0xBCAF,0xBCB2,0xBCB6,/* 0xC0-0xC7 */ + 0xBCB7,0xBCB8,0xBCB9,0xBCBA,0xBCBB,0xBCBE,0xBCBF,0xBCC1,/* 0xC8-0xCF */ + 0xBCC2,0xBCC3,0xBCC5,0xBCC6,0xBCC7,0xBCC8,0xBCC9,0xBCCA,/* 0xD0-0xD7 */ + 0xBCCB,0xBCCC,0xBCCE,0xBCD2,0xBCD3,0xBCD4,0xBCD6,0xBCD7,/* 0xD8-0xDF */ + 0xBCD9,0xBCDA,0xBCDB,0xBCDD,0xBCDE,0xBCDF,0xBCE0,0xBCE1,/* 0xE0-0xE7 */ + 0xBCE2,0xBCE3,0xBCE4,0xBCE5,0xBCE6,0xBCE7,0xBCE8,0xBCE9,/* 0xE8-0xEF */ + 0xBCEA,0xBCEB,0xBCEC,0xBCED,0xBCEE,0xBCEF,0xBCF0,0xBCF1,/* 0xF0-0xF7 */ + 0xBCF2,0xBCF3,0xBCF7,0xBCF9,0xBCFA,0xBCFB,0xBCFD,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_94[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBCFE,0xBCFF,0xBD00,0xBD01,0xBD02,0xBD03,0xBD06,/* 0x40-0x47 */ + 0xBD08,0xBD0A,0xBD0B,0xBD0C,0xBD0D,0xBD0E,0xBD0F,0xBD11,/* 0x48-0x4F */ + 0xBD12,0xBD13,0xBD15,0xBD16,0xBD17,0xBD18,0xBD19,0xBD1A,/* 0x50-0x57 */ + 0xBD1B,0xBD1C,0xBD1D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBD1E,0xBD1F,0xBD20,0xBD21,0xBD22,0xBD23,0xBD25,/* 0x60-0x67 */ + 0xBD26,0xBD27,0xBD28,0xBD29,0xBD2A,0xBD2B,0xBD2D,0xBD2E,/* 0x68-0x6F */ + 0xBD2F,0xBD30,0xBD31,0xBD32,0xBD33,0xBD34,0xBD35,0xBD36,/* 0x70-0x77 */ + 0xBD37,0xBD38,0xBD39,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBD3A,0xBD3B,0xBD3C,0xBD3D,0xBD3E,0xBD3F,0xBD41,/* 0x80-0x87 */ + 0xBD42,0xBD43,0xBD44,0xBD45,0xBD46,0xBD47,0xBD4A,0xBD4B,/* 0x88-0x8F */ + 0xBD4D,0xBD4E,0xBD4F,0xBD51,0xBD52,0xBD53,0xBD54,0xBD55,/* 0x90-0x97 */ + 0xBD56,0xBD57,0xBD5A,0xBD5B,0xBD5C,0xBD5D,0xBD5E,0xBD5F,/* 0x98-0x9F */ + 0xBD60,0xBD61,0xBD62,0xBD63,0xBD65,0xBD66,0xBD67,0xBD69,/* 0xA0-0xA7 */ + 0xBD6A,0xBD6B,0xBD6C,0xBD6D,0xBD6E,0xBD6F,0xBD70,0xBD71,/* 0xA8-0xAF */ + 0xBD72,0xBD73,0xBD74,0xBD75,0xBD76,0xBD77,0xBD78,0xBD79,/* 0xB0-0xB7 */ + 0xBD7A,0xBD7B,0xBD7C,0xBD7D,0xBD7E,0xBD7F,0xBD82,0xBD83,/* 0xB8-0xBF */ + 0xBD85,0xBD86,0xBD8B,0xBD8C,0xBD8D,0xBD8E,0xBD8F,0xBD92,/* 0xC0-0xC7 */ + 0xBD94,0xBD96,0xBD97,0xBD98,0xBD9B,0xBD9D,0xBD9E,0xBD9F,/* 0xC8-0xCF */ + 0xBDA0,0xBDA1,0xBDA2,0xBDA3,0xBDA5,0xBDA6,0xBDA7,0xBDA8,/* 0xD0-0xD7 */ + 0xBDA9,0xBDAA,0xBDAB,0xBDAC,0xBDAD,0xBDAE,0xBDAF,0xBDB1,/* 0xD8-0xDF */ + 0xBDB2,0xBDB3,0xBDB4,0xBDB5,0xBDB6,0xBDB7,0xBDB9,0xBDBA,/* 0xE0-0xE7 */ + 0xBDBB,0xBDBC,0xBDBD,0xBDBE,0xBDBF,0xBDC0,0xBDC1,0xBDC2,/* 0xE8-0xEF */ + 0xBDC3,0xBDC4,0xBDC5,0xBDC6,0xBDC7,0xBDC8,0xBDC9,0xBDCA,/* 0xF0-0xF7 */ + 0xBDCB,0xBDCC,0xBDCD,0xBDCE,0xBDCF,0xBDD0,0xBDD1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_95[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBDD2,0xBDD3,0xBDD6,0xBDD7,0xBDD9,0xBDDA,0xBDDB,/* 0x40-0x47 */ + 0xBDDD,0xBDDE,0xBDDF,0xBDE0,0xBDE1,0xBDE2,0xBDE3,0xBDE4,/* 0x48-0x4F */ + 0xBDE5,0xBDE6,0xBDE7,0xBDE8,0xBDEA,0xBDEB,0xBDEC,0xBDED,/* 0x50-0x57 */ + 0xBDEE,0xBDEF,0xBDF1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBDF2,0xBDF3,0xBDF5,0xBDF6,0xBDF7,0xBDF9,0xBDFA,/* 0x60-0x67 */ + 0xBDFB,0xBDFC,0xBDFD,0xBDFE,0xBDFF,0xBE01,0xBE02,0xBE04,/* 0x68-0x6F */ + 0xBE06,0xBE07,0xBE08,0xBE09,0xBE0A,0xBE0B,0xBE0E,0xBE0F,/* 0x70-0x77 */ + 0xBE11,0xBE12,0xBE13,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 
0x0000,0xBE15,0xBE16,0xBE17,0xBE18,0xBE19,0xBE1A,0xBE1B,/* 0x80-0x87 */ + 0xBE1E,0xBE20,0xBE21,0xBE22,0xBE23,0xBE24,0xBE25,0xBE26,/* 0x88-0x8F */ + 0xBE27,0xBE28,0xBE29,0xBE2A,0xBE2B,0xBE2C,0xBE2D,0xBE2E,/* 0x90-0x97 */ + 0xBE2F,0xBE30,0xBE31,0xBE32,0xBE33,0xBE34,0xBE35,0xBE36,/* 0x98-0x9F */ + 0xBE37,0xBE38,0xBE39,0xBE3A,0xBE3B,0xBE3C,0xBE3D,0xBE3E,/* 0xA0-0xA7 */ + 0xBE3F,0xBE40,0xBE41,0xBE42,0xBE43,0xBE46,0xBE47,0xBE49,/* 0xA8-0xAF */ + 0xBE4A,0xBE4B,0xBE4D,0xBE4F,0xBE50,0xBE51,0xBE52,0xBE53,/* 0xB0-0xB7 */ + 0xBE56,0xBE58,0xBE5C,0xBE5D,0xBE5E,0xBE5F,0xBE62,0xBE63,/* 0xB8-0xBF */ + 0xBE65,0xBE66,0xBE67,0xBE69,0xBE6B,0xBE6C,0xBE6D,0xBE6E,/* 0xC0-0xC7 */ + 0xBE6F,0xBE72,0xBE76,0xBE77,0xBE78,0xBE79,0xBE7A,0xBE7E,/* 0xC8-0xCF */ + 0xBE7F,0xBE81,0xBE82,0xBE83,0xBE85,0xBE86,0xBE87,0xBE88,/* 0xD0-0xD7 */ + 0xBE89,0xBE8A,0xBE8B,0xBE8E,0xBE92,0xBE93,0xBE94,0xBE95,/* 0xD8-0xDF */ + 0xBE96,0xBE97,0xBE9A,0xBE9B,0xBE9C,0xBE9D,0xBE9E,0xBE9F,/* 0xE0-0xE7 */ + 0xBEA0,0xBEA1,0xBEA2,0xBEA3,0xBEA4,0xBEA5,0xBEA6,0xBEA7,/* 0xE8-0xEF */ + 0xBEA9,0xBEAA,0xBEAB,0xBEAC,0xBEAD,0xBEAE,0xBEAF,0xBEB0,/* 0xF0-0xF7 */ + 0xBEB1,0xBEB2,0xBEB3,0xBEB4,0xBEB5,0xBEB6,0xBEB7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_96[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBEB8,0xBEB9,0xBEBA,0xBEBB,0xBEBC,0xBEBD,0xBEBE,/* 0x40-0x47 */ + 0xBEBF,0xBEC0,0xBEC1,0xBEC2,0xBEC3,0xBEC4,0xBEC5,0xBEC6,/* 0x48-0x4F */ + 0xBEC7,0xBEC8,0xBEC9,0xBECA,0xBECB,0xBECC,0xBECD,0xBECE,/* 0x50-0x57 */ + 0xBECF,0xBED2,0xBED3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBED5,0xBED6,0xBED9,0xBEDA,0xBEDB,0xBEDC,0xBEDD,/* 0x60-0x67 */ + 0xBEDE,0xBEDF,0xBEE1,0xBEE2,0xBEE6,0xBEE7,0xBEE8,0xBEE9,/* 0x68-0x6F */ + 0xBEEA,0xBEEB,0xBEED,0xBEEE,0xBEEF,0xBEF0,0xBEF1,0xBEF2,/* 0x70-0x77 */ + 0xBEF3,0xBEF4,0xBEF5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBEF6,0xBEF7,0xBEF8,0xBEF9,0xBEFA,0xBEFB,0xBEFC,/* 0x80-0x87 */ + 0xBEFD,0xBEFE,0xBEFF,0xBF00,0xBF02,0xBF03,0xBF04,0xBF05,/* 0x88-0x8F */ + 0xBF06,0xBF07,0xBF0A,0xBF0B,0xBF0C,0xBF0D,0xBF0E,0xBF0F,/* 0x90-0x97 */ + 0xBF10,0xBF11,0xBF12,0xBF13,0xBF14,0xBF15,0xBF16,0xBF17,/* 0x98-0x9F */ + 0xBF1A,0xBF1E,0xBF1F,0xBF20,0xBF21,0xBF22,0xBF23,0xBF24,/* 0xA0-0xA7 */ + 0xBF25,0xBF26,0xBF27,0xBF28,0xBF29,0xBF2A,0xBF2B,0xBF2C,/* 0xA8-0xAF */ + 0xBF2D,0xBF2E,0xBF2F,0xBF30,0xBF31,0xBF32,0xBF33,0xBF34,/* 0xB0-0xB7 */ + 0xBF35,0xBF36,0xBF37,0xBF38,0xBF39,0xBF3A,0xBF3B,0xBF3C,/* 0xB8-0xBF */ + 0xBF3D,0xBF3E,0xBF3F,0xBF42,0xBF43,0xBF45,0xBF46,0xBF47,/* 0xC0-0xC7 */ + 0xBF49,0xBF4A,0xBF4B,0xBF4C,0xBF4D,0xBF4E,0xBF4F,0xBF52,/* 0xC8-0xCF */ + 0xBF53,0xBF54,0xBF56,0xBF57,0xBF58,0xBF59,0xBF5A,0xBF5B,/* 0xD0-0xD7 */ + 0xBF5C,0xBF5D,0xBF5E,0xBF5F,0xBF60,0xBF61,0xBF62,0xBF63,/* 0xD8-0xDF */ + 0xBF64,0xBF65,0xBF66,0xBF67,0xBF68,0xBF69,0xBF6A,0xBF6B,/* 0xE0-0xE7 */ + 0xBF6C,0xBF6D,0xBF6E,0xBF6F,0xBF70,0xBF71,0xBF72,0xBF73,/* 0xE8-0xEF */ + 0xBF74,0xBF75,0xBF76,0xBF77,0xBF78,0xBF79,0xBF7A,0xBF7B,/* 0xF0-0xF7 */ + 
0xBF7C,0xBF7D,0xBF7E,0xBF7F,0xBF80,0xBF81,0xBF82,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_97[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xBF83,0xBF84,0xBF85,0xBF86,0xBF87,0xBF88,0xBF89,/* 0x40-0x47 */ + 0xBF8A,0xBF8B,0xBF8C,0xBF8D,0xBF8E,0xBF8F,0xBF90,0xBF91,/* 0x48-0x4F */ + 0xBF92,0xBF93,0xBF95,0xBF96,0xBF97,0xBF98,0xBF99,0xBF9A,/* 0x50-0x57 */ + 0xBF9B,0xBF9C,0xBF9D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xBF9E,0xBF9F,0xBFA0,0xBFA1,0xBFA2,0xBFA3,0xBFA4,/* 0x60-0x67 */ + 0xBFA5,0xBFA6,0xBFA7,0xBFA8,0xBFA9,0xBFAA,0xBFAB,0xBFAC,/* 0x68-0x6F */ + 0xBFAD,0xBFAE,0xBFAF,0xBFB1,0xBFB2,0xBFB3,0xBFB4,0xBFB5,/* 0x70-0x77 */ + 0xBFB6,0xBFB7,0xBFB8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xBFB9,0xBFBA,0xBFBB,0xBFBC,0xBFBD,0xBFBE,0xBFBF,/* 0x80-0x87 */ + 0xBFC0,0xBFC1,0xBFC2,0xBFC3,0xBFC4,0xBFC6,0xBFC7,0xBFC8,/* 0x88-0x8F */ + 0xBFC9,0xBFCA,0xBFCB,0xBFCE,0xBFCF,0xBFD1,0xBFD2,0xBFD3,/* 0x90-0x97 */ + 0xBFD5,0xBFD6,0xBFD7,0xBFD8,0xBFD9,0xBFDA,0xBFDB,0xBFDD,/* 0x98-0x9F */ + 0xBFDE,0xBFE0,0xBFE2,0xBFE3,0xBFE4,0xBFE5,0xBFE6,0xBFE7,/* 0xA0-0xA7 */ + 0xBFE8,0xBFE9,0xBFEA,0xBFEB,0xBFEC,0xBFED,0xBFEE,0xBFEF,/* 0xA8-0xAF */ + 0xBFF0,0xBFF1,0xBFF2,0xBFF3,0xBFF4,0xBFF5,0xBFF6,0xBFF7,/* 0xB0-0xB7 */ + 0xBFF8,0xBFF9,0xBFFA,0xBFFB,0xBFFC,0xBFFD,0xBFFE,0xBFFF,/* 0xB8-0xBF */ + 0xC000,0xC001,0xC002,0xC003,0xC004,0xC005,0xC006,0xC007,/* 0xC0-0xC7 */ + 0xC008,0xC009,0xC00A,0xC00B,0xC00C,0xC00D,0xC00E,0xC00F,/* 0xC8-0xCF */ + 0xC010,0xC011,0xC012,0xC013,0xC014,0xC015,0xC016,0xC017,/* 0xD0-0xD7 */ + 0xC018,0xC019,0xC01A,0xC01B,0xC01C,0xC01D,0xC01E,0xC01F,/* 0xD8-0xDF */ + 0xC020,0xC021,0xC022,0xC023,0xC024,0xC025,0xC026,0xC027,/* 0xE0-0xE7 */ + 0xC028,0xC029,0xC02A,0xC02B,0xC02C,0xC02D,0xC02E,0xC02F,/* 0xE8-0xEF */ + 0xC030,0xC031,0xC032,0xC033,0xC034,0xC035,0xC036,0xC037,/* 0xF0-0xF7 */ + 0xC038,0xC039,0xC03A,0xC03B,0xC03D,0xC03E,0xC03F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_98[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC040,0xC041,0xC042,0xC043,0xC044,0xC045,0xC046,/* 0x40-0x47 */ + 0xC047,0xC048,0xC049,0xC04A,0xC04B,0xC04C,0xC04D,0xC04E,/* 0x48-0x4F */ + 0xC04F,0xC050,0xC052,0xC053,0xC054,0xC055,0xC056,0xC057,/* 0x50-0x57 */ + 0xC059,0xC05A,0xC05B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC05D,0xC05E,0xC05F,0xC061,0xC062,0xC063,0xC064,/* 0x60-0x67 */ + 0xC065,0xC066,0xC067,0xC06A,0xC06B,0xC06C,0xC06D,0xC06E,/* 
0x68-0x6F */ + 0xC06F,0xC070,0xC071,0xC072,0xC073,0xC074,0xC075,0xC076,/* 0x70-0x77 */ + 0xC077,0xC078,0xC079,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC07A,0xC07B,0xC07C,0xC07D,0xC07E,0xC07F,0xC080,/* 0x80-0x87 */ + 0xC081,0xC082,0xC083,0xC084,0xC085,0xC086,0xC087,0xC088,/* 0x88-0x8F */ + 0xC089,0xC08A,0xC08B,0xC08C,0xC08D,0xC08E,0xC08F,0xC092,/* 0x90-0x97 */ + 0xC093,0xC095,0xC096,0xC097,0xC099,0xC09A,0xC09B,0xC09C,/* 0x98-0x9F */ + 0xC09D,0xC09E,0xC09F,0xC0A2,0xC0A4,0xC0A6,0xC0A7,0xC0A8,/* 0xA0-0xA7 */ + 0xC0A9,0xC0AA,0xC0AB,0xC0AE,0xC0B1,0xC0B2,0xC0B7,0xC0B8,/* 0xA8-0xAF */ + 0xC0B9,0xC0BA,0xC0BB,0xC0BE,0xC0C2,0xC0C3,0xC0C4,0xC0C6,/* 0xB0-0xB7 */ + 0xC0C7,0xC0CA,0xC0CB,0xC0CD,0xC0CE,0xC0CF,0xC0D1,0xC0D2,/* 0xB8-0xBF */ + 0xC0D3,0xC0D4,0xC0D5,0xC0D6,0xC0D7,0xC0DA,0xC0DE,0xC0DF,/* 0xC0-0xC7 */ + 0xC0E0,0xC0E1,0xC0E2,0xC0E3,0xC0E6,0xC0E7,0xC0E9,0xC0EA,/* 0xC8-0xCF */ + 0xC0EB,0xC0ED,0xC0EE,0xC0EF,0xC0F0,0xC0F1,0xC0F2,0xC0F3,/* 0xD0-0xD7 */ + 0xC0F6,0xC0F8,0xC0FA,0xC0FB,0xC0FC,0xC0FD,0xC0FE,0xC0FF,/* 0xD8-0xDF */ + 0xC101,0xC102,0xC103,0xC105,0xC106,0xC107,0xC109,0xC10A,/* 0xE0-0xE7 */ + 0xC10B,0xC10C,0xC10D,0xC10E,0xC10F,0xC111,0xC112,0xC113,/* 0xE8-0xEF */ + 0xC114,0xC116,0xC117,0xC118,0xC119,0xC11A,0xC11B,0xC121,/* 0xF0-0xF7 */ + 0xC122,0xC125,0xC128,0xC129,0xC12A,0xC12B,0xC12E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_99[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC132,0xC133,0xC134,0xC135,0xC137,0xC13A,0xC13B,/* 0x40-0x47 */ + 0xC13D,0xC13E,0xC13F,0xC141,0xC142,0xC143,0xC144,0xC145,/* 0x48-0x4F */ + 0xC146,0xC147,0xC14A,0xC14E,0xC14F,0xC150,0xC151,0xC152,/* 0x50-0x57 */ + 0xC153,0xC156,0xC157,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC159,0xC15A,0xC15B,0xC15D,0xC15E,0xC15F,0xC160,/* 0x60-0x67 */ + 0xC161,0xC162,0xC163,0xC166,0xC16A,0xC16B,0xC16C,0xC16D,/* 0x68-0x6F */ + 0xC16E,0xC16F,0xC171,0xC172,0xC173,0xC175,0xC176,0xC177,/* 0x70-0x77 */ + 0xC179,0xC17A,0xC17B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC17C,0xC17D,0xC17E,0xC17F,0xC180,0xC181,0xC182,/* 0x80-0x87 */ + 0xC183,0xC184,0xC186,0xC187,0xC188,0xC189,0xC18A,0xC18B,/* 0x88-0x8F */ + 0xC18F,0xC191,0xC192,0xC193,0xC195,0xC197,0xC198,0xC199,/* 0x90-0x97 */ + 0xC19A,0xC19B,0xC19E,0xC1A0,0xC1A2,0xC1A3,0xC1A4,0xC1A6,/* 0x98-0x9F */ + 0xC1A7,0xC1AA,0xC1AB,0xC1AD,0xC1AE,0xC1AF,0xC1B1,0xC1B2,/* 0xA0-0xA7 */ + 0xC1B3,0xC1B4,0xC1B5,0xC1B6,0xC1B7,0xC1B8,0xC1B9,0xC1BA,/* 0xA8-0xAF */ + 0xC1BB,0xC1BC,0xC1BE,0xC1BF,0xC1C0,0xC1C1,0xC1C2,0xC1C3,/* 0xB0-0xB7 */ + 0xC1C5,0xC1C6,0xC1C7,0xC1C9,0xC1CA,0xC1CB,0xC1CD,0xC1CE,/* 0xB8-0xBF */ + 0xC1CF,0xC1D0,0xC1D1,0xC1D2,0xC1D3,0xC1D5,0xC1D6,0xC1D9,/* 0xC0-0xC7 */ + 0xC1DA,0xC1DB,0xC1DC,0xC1DD,0xC1DE,0xC1DF,0xC1E1,0xC1E2,/* 0xC8-0xCF */ + 0xC1E3,0xC1E5,0xC1E6,0xC1E7,0xC1E9,0xC1EA,0xC1EB,0xC1EC,/* 0xD0-0xD7 */ + 0xC1ED,0xC1EE,0xC1EF,0xC1F2,0xC1F4,0xC1F5,0xC1F6,0xC1F7,/* 0xD8-0xDF */ + 0xC1F8,0xC1F9,0xC1FA,0xC1FB,0xC1FE,0xC1FF,0xC201,0xC202,/* 0xE0-0xE7 */ + 
0xC203,0xC205,0xC206,0xC207,0xC208,0xC209,0xC20A,0xC20B,/* 0xE8-0xEF */ + 0xC20E,0xC210,0xC212,0xC213,0xC214,0xC215,0xC216,0xC217,/* 0xF0-0xF7 */ + 0xC21A,0xC21B,0xC21D,0xC21E,0xC221,0xC222,0xC223,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9A[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC224,0xC225,0xC226,0xC227,0xC22A,0xC22C,0xC22E,/* 0x40-0x47 */ + 0xC230,0xC233,0xC235,0xC236,0xC237,0xC238,0xC239,0xC23A,/* 0x48-0x4F */ + 0xC23B,0xC23C,0xC23D,0xC23E,0xC23F,0xC240,0xC241,0xC242,/* 0x50-0x57 */ + 0xC243,0xC244,0xC245,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC246,0xC247,0xC249,0xC24A,0xC24B,0xC24C,0xC24D,/* 0x60-0x67 */ + 0xC24E,0xC24F,0xC252,0xC253,0xC255,0xC256,0xC257,0xC259,/* 0x68-0x6F */ + 0xC25A,0xC25B,0xC25C,0xC25D,0xC25E,0xC25F,0xC261,0xC262,/* 0x70-0x77 */ + 0xC263,0xC264,0xC266,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC267,0xC268,0xC269,0xC26A,0xC26B,0xC26E,0xC26F,/* 0x80-0x87 */ + 0xC271,0xC272,0xC273,0xC275,0xC276,0xC277,0xC278,0xC279,/* 0x88-0x8F */ + 0xC27A,0xC27B,0xC27E,0xC280,0xC282,0xC283,0xC284,0xC285,/* 0x90-0x97 */ + 0xC286,0xC287,0xC28A,0xC28B,0xC28C,0xC28D,0xC28E,0xC28F,/* 0x98-0x9F */ + 0xC291,0xC292,0xC293,0xC294,0xC295,0xC296,0xC297,0xC299,/* 0xA0-0xA7 */ + 0xC29A,0xC29C,0xC29E,0xC29F,0xC2A0,0xC2A1,0xC2A2,0xC2A3,/* 0xA8-0xAF */ + 0xC2A6,0xC2A7,0xC2A9,0xC2AA,0xC2AB,0xC2AE,0xC2AF,0xC2B0,/* 0xB0-0xB7 */ + 0xC2B1,0xC2B2,0xC2B3,0xC2B6,0xC2B8,0xC2BA,0xC2BB,0xC2BC,/* 0xB8-0xBF */ + 0xC2BD,0xC2BE,0xC2BF,0xC2C0,0xC2C1,0xC2C2,0xC2C3,0xC2C4,/* 0xC0-0xC7 */ + 0xC2C5,0xC2C6,0xC2C7,0xC2C8,0xC2C9,0xC2CA,0xC2CB,0xC2CC,/* 0xC8-0xCF */ + 0xC2CD,0xC2CE,0xC2CF,0xC2D0,0xC2D1,0xC2D2,0xC2D3,0xC2D4,/* 0xD0-0xD7 */ + 0xC2D5,0xC2D6,0xC2D7,0xC2D8,0xC2D9,0xC2DA,0xC2DB,0xC2DE,/* 0xD8-0xDF */ + 0xC2DF,0xC2E1,0xC2E2,0xC2E5,0xC2E6,0xC2E7,0xC2E8,0xC2E9,/* 0xE0-0xE7 */ + 0xC2EA,0xC2EE,0xC2F0,0xC2F2,0xC2F3,0xC2F4,0xC2F5,0xC2F7,/* 0xE8-0xEF */ + 0xC2FA,0xC2FD,0xC2FE,0xC2FF,0xC301,0xC302,0xC303,0xC304,/* 0xF0-0xF7 */ + 0xC305,0xC306,0xC307,0xC30A,0xC30B,0xC30E,0xC30F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9B[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC310,0xC311,0xC312,0xC316,0xC317,0xC319,0xC31A,/* 0x40-0x47 */ + 0xC31B,0xC31D,0xC31E,0xC31F,0xC320,0xC321,0xC322,0xC323,/* 0x48-0x4F */ + 0xC326,0xC327,0xC32A,0xC32B,0xC32C,0xC32D,0xC32E,0xC32F,/* 0x50-0x57 */ + 0xC330,0xC331,0xC332,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x58-0x5F */ + 0x0000,0xC333,0xC334,0xC335,0xC336,0xC337,0xC338,0xC339,/* 0x60-0x67 */ + 0xC33A,0xC33B,0xC33C,0xC33D,0xC33E,0xC33F,0xC340,0xC341,/* 0x68-0x6F */ + 0xC342,0xC343,0xC344,0xC346,0xC347,0xC348,0xC349,0xC34A,/* 0x70-0x77 */ + 0xC34B,0xC34C,0xC34D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC34E,0xC34F,0xC350,0xC351,0xC352,0xC353,0xC354,/* 0x80-0x87 */ + 0xC355,0xC356,0xC357,0xC358,0xC359,0xC35A,0xC35B,0xC35C,/* 0x88-0x8F */ + 0xC35D,0xC35E,0xC35F,0xC360,0xC361,0xC362,0xC363,0xC364,/* 0x90-0x97 */ + 0xC365,0xC366,0xC367,0xC36A,0xC36B,0xC36D,0xC36E,0xC36F,/* 0x98-0x9F */ + 0xC371,0xC373,0xC374,0xC375,0xC376,0xC377,0xC37A,0xC37B,/* 0xA0-0xA7 */ + 0xC37E,0xC37F,0xC380,0xC381,0xC382,0xC383,0xC385,0xC386,/* 0xA8-0xAF */ + 0xC387,0xC389,0xC38A,0xC38B,0xC38D,0xC38E,0xC38F,0xC390,/* 0xB0-0xB7 */ + 0xC391,0xC392,0xC393,0xC394,0xC395,0xC396,0xC397,0xC398,/* 0xB8-0xBF */ + 0xC399,0xC39A,0xC39B,0xC39C,0xC39D,0xC39E,0xC39F,0xC3A0,/* 0xC0-0xC7 */ + 0xC3A1,0xC3A2,0xC3A3,0xC3A4,0xC3A5,0xC3A6,0xC3A7,0xC3A8,/* 0xC8-0xCF */ + 0xC3A9,0xC3AA,0xC3AB,0xC3AC,0xC3AD,0xC3AE,0xC3AF,0xC3B0,/* 0xD0-0xD7 */ + 0xC3B1,0xC3B2,0xC3B3,0xC3B4,0xC3B5,0xC3B6,0xC3B7,0xC3B8,/* 0xD8-0xDF */ + 0xC3B9,0xC3BA,0xC3BB,0xC3BC,0xC3BD,0xC3BE,0xC3BF,0xC3C1,/* 0xE0-0xE7 */ + 0xC3C2,0xC3C3,0xC3C4,0xC3C5,0xC3C6,0xC3C7,0xC3C8,0xC3C9,/* 0xE8-0xEF */ + 0xC3CA,0xC3CB,0xC3CC,0xC3CD,0xC3CE,0xC3CF,0xC3D0,0xC3D1,/* 0xF0-0xF7 */ + 0xC3D2,0xC3D3,0xC3D4,0xC3D5,0xC3D6,0xC3D7,0xC3DA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9C[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC3DB,0xC3DD,0xC3DE,0xC3E1,0xC3E3,0xC3E4,0xC3E5,/* 0x40-0x47 */ + 0xC3E6,0xC3E7,0xC3EA,0xC3EB,0xC3EC,0xC3EE,0xC3EF,0xC3F0,/* 0x48-0x4F */ + 0xC3F1,0xC3F2,0xC3F3,0xC3F6,0xC3F7,0xC3F9,0xC3FA,0xC3FB,/* 0x50-0x57 */ + 0xC3FC,0xC3FD,0xC3FE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC3FF,0xC400,0xC401,0xC402,0xC403,0xC404,0xC405,/* 0x60-0x67 */ + 0xC406,0xC407,0xC409,0xC40A,0xC40B,0xC40C,0xC40D,0xC40E,/* 0x68-0x6F */ + 0xC40F,0xC411,0xC412,0xC413,0xC414,0xC415,0xC416,0xC417,/* 0x70-0x77 */ + 0xC418,0xC419,0xC41A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC41B,0xC41C,0xC41D,0xC41E,0xC41F,0xC420,0xC421,/* 0x80-0x87 */ + 0xC422,0xC423,0xC425,0xC426,0xC427,0xC428,0xC429,0xC42A,/* 0x88-0x8F */ + 0xC42B,0xC42D,0xC42E,0xC42F,0xC431,0xC432,0xC433,0xC435,/* 0x90-0x97 */ + 0xC436,0xC437,0xC438,0xC439,0xC43A,0xC43B,0xC43E,0xC43F,/* 0x98-0x9F */ + 0xC440,0xC441,0xC442,0xC443,0xC444,0xC445,0xC446,0xC447,/* 0xA0-0xA7 */ + 0xC449,0xC44A,0xC44B,0xC44C,0xC44D,0xC44E,0xC44F,0xC450,/* 0xA8-0xAF */ + 0xC451,0xC452,0xC453,0xC454,0xC455,0xC456,0xC457,0xC458,/* 0xB0-0xB7 */ + 0xC459,0xC45A,0xC45B,0xC45C,0xC45D,0xC45E,0xC45F,0xC460,/* 0xB8-0xBF */ + 0xC461,0xC462,0xC463,0xC466,0xC467,0xC469,0xC46A,0xC46B,/* 0xC0-0xC7 */ + 0xC46D,0xC46E,0xC46F,0xC470,0xC471,0xC472,0xC473,0xC476,/* 0xC8-0xCF */ + 0xC477,0xC478,0xC47A,0xC47B,0xC47C,0xC47D,0xC47E,0xC47F,/* 0xD0-0xD7 */ + 
0xC481,0xC482,0xC483,0xC484,0xC485,0xC486,0xC487,0xC488,/* 0xD8-0xDF */ + 0xC489,0xC48A,0xC48B,0xC48C,0xC48D,0xC48E,0xC48F,0xC490,/* 0xE0-0xE7 */ + 0xC491,0xC492,0xC493,0xC495,0xC496,0xC497,0xC498,0xC499,/* 0xE8-0xEF */ + 0xC49A,0xC49B,0xC49D,0xC49E,0xC49F,0xC4A0,0xC4A1,0xC4A2,/* 0xF0-0xF7 */ + 0xC4A3,0xC4A4,0xC4A5,0xC4A6,0xC4A7,0xC4A8,0xC4A9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9D[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC4AA,0xC4AB,0xC4AC,0xC4AD,0xC4AE,0xC4AF,0xC4B0,/* 0x40-0x47 */ + 0xC4B1,0xC4B2,0xC4B3,0xC4B4,0xC4B5,0xC4B6,0xC4B7,0xC4B9,/* 0x48-0x4F */ + 0xC4BA,0xC4BB,0xC4BD,0xC4BE,0xC4BF,0xC4C0,0xC4C1,0xC4C2,/* 0x50-0x57 */ + 0xC4C3,0xC4C4,0xC4C5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC4C6,0xC4C7,0xC4C8,0xC4C9,0xC4CA,0xC4CB,0xC4CC,/* 0x60-0x67 */ + 0xC4CD,0xC4CE,0xC4CF,0xC4D0,0xC4D1,0xC4D2,0xC4D3,0xC4D4,/* 0x68-0x6F */ + 0xC4D5,0xC4D6,0xC4D7,0xC4D8,0xC4D9,0xC4DA,0xC4DB,0xC4DC,/* 0x70-0x77 */ + 0xC4DD,0xC4DE,0xC4DF,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC4E0,0xC4E1,0xC4E2,0xC4E3,0xC4E4,0xC4E5,0xC4E6,/* 0x80-0x87 */ + 0xC4E7,0xC4E8,0xC4EA,0xC4EB,0xC4EC,0xC4ED,0xC4EE,0xC4EF,/* 0x88-0x8F */ + 0xC4F2,0xC4F3,0xC4F5,0xC4F6,0xC4F7,0xC4F9,0xC4FB,0xC4FC,/* 0x90-0x97 */ + 0xC4FD,0xC4FE,0xC502,0xC503,0xC504,0xC505,0xC506,0xC507,/* 0x98-0x9F */ + 0xC508,0xC509,0xC50A,0xC50B,0xC50D,0xC50E,0xC50F,0xC511,/* 0xA0-0xA7 */ + 0xC512,0xC513,0xC515,0xC516,0xC517,0xC518,0xC519,0xC51A,/* 0xA8-0xAF */ + 0xC51B,0xC51D,0xC51E,0xC51F,0xC520,0xC521,0xC522,0xC523,/* 0xB0-0xB7 */ + 0xC524,0xC525,0xC526,0xC527,0xC52A,0xC52B,0xC52D,0xC52E,/* 0xB8-0xBF */ + 0xC52F,0xC531,0xC532,0xC533,0xC534,0xC535,0xC536,0xC537,/* 0xC0-0xC7 */ + 0xC53A,0xC53C,0xC53E,0xC53F,0xC540,0xC541,0xC542,0xC543,/* 0xC8-0xCF */ + 0xC546,0xC547,0xC54B,0xC54F,0xC550,0xC551,0xC552,0xC556,/* 0xD0-0xD7 */ + 0xC55A,0xC55B,0xC55C,0xC55F,0xC562,0xC563,0xC565,0xC566,/* 0xD8-0xDF */ + 0xC567,0xC569,0xC56A,0xC56B,0xC56C,0xC56D,0xC56E,0xC56F,/* 0xE0-0xE7 */ + 0xC572,0xC576,0xC577,0xC578,0xC579,0xC57A,0xC57B,0xC57E,/* 0xE8-0xEF */ + 0xC57F,0xC581,0xC582,0xC583,0xC585,0xC586,0xC588,0xC589,/* 0xF0-0xF7 */ + 0xC58A,0xC58B,0xC58E,0xC590,0xC592,0xC593,0xC594,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9E[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC596,0xC599,0xC59A,0xC59B,0xC59D,0xC59E,0xC59F,/* 0x40-0x47 */ + 0xC5A1,0xC5A2,0xC5A3,0xC5A4,0xC5A5,0xC5A6,0xC5A7,0xC5A8,/* 
0x48-0x4F */ + 0xC5AA,0xC5AB,0xC5AC,0xC5AD,0xC5AE,0xC5AF,0xC5B0,0xC5B1,/* 0x50-0x57 */ + 0xC5B2,0xC5B3,0xC5B6,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC5B7,0xC5BA,0xC5BF,0xC5C0,0xC5C1,0xC5C2,0xC5C3,/* 0x60-0x67 */ + 0xC5CB,0xC5CD,0xC5CF,0xC5D2,0xC5D3,0xC5D5,0xC5D6,0xC5D7,/* 0x68-0x6F */ + 0xC5D9,0xC5DA,0xC5DB,0xC5DC,0xC5DD,0xC5DE,0xC5DF,0xC5E2,/* 0x70-0x77 */ + 0xC5E4,0xC5E6,0xC5E7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC5E8,0xC5E9,0xC5EA,0xC5EB,0xC5EF,0xC5F1,0xC5F2,/* 0x80-0x87 */ + 0xC5F3,0xC5F5,0xC5F8,0xC5F9,0xC5FA,0xC5FB,0xC602,0xC603,/* 0x88-0x8F */ + 0xC604,0xC609,0xC60A,0xC60B,0xC60D,0xC60E,0xC60F,0xC611,/* 0x90-0x97 */ + 0xC612,0xC613,0xC614,0xC615,0xC616,0xC617,0xC61A,0xC61D,/* 0x98-0x9F */ + 0xC61E,0xC61F,0xC620,0xC621,0xC622,0xC623,0xC626,0xC627,/* 0xA0-0xA7 */ + 0xC629,0xC62A,0xC62B,0xC62F,0xC631,0xC632,0xC636,0xC638,/* 0xA8-0xAF */ + 0xC63A,0xC63C,0xC63D,0xC63E,0xC63F,0xC642,0xC643,0xC645,/* 0xB0-0xB7 */ + 0xC646,0xC647,0xC649,0xC64A,0xC64B,0xC64C,0xC64D,0xC64E,/* 0xB8-0xBF */ + 0xC64F,0xC652,0xC656,0xC657,0xC658,0xC659,0xC65A,0xC65B,/* 0xC0-0xC7 */ + 0xC65E,0xC65F,0xC661,0xC662,0xC663,0xC664,0xC665,0xC666,/* 0xC8-0xCF */ + 0xC667,0xC668,0xC669,0xC66A,0xC66B,0xC66D,0xC66E,0xC670,/* 0xD0-0xD7 */ + 0xC672,0xC673,0xC674,0xC675,0xC676,0xC677,0xC67A,0xC67B,/* 0xD8-0xDF */ + 0xC67D,0xC67E,0xC67F,0xC681,0xC682,0xC683,0xC684,0xC685,/* 0xE0-0xE7 */ + 0xC686,0xC687,0xC68A,0xC68C,0xC68E,0xC68F,0xC690,0xC691,/* 0xE8-0xEF */ + 0xC692,0xC693,0xC696,0xC697,0xC699,0xC69A,0xC69B,0xC69D,/* 0xF0-0xF7 */ + 0xC69E,0xC69F,0xC6A0,0xC6A1,0xC6A2,0xC6A3,0xC6A6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_9F[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC6A8,0xC6AA,0xC6AB,0xC6AC,0xC6AD,0xC6AE,0xC6AF,/* 0x40-0x47 */ + 0xC6B2,0xC6B3,0xC6B5,0xC6B6,0xC6B7,0xC6BB,0xC6BC,0xC6BD,/* 0x48-0x4F */ + 0xC6BE,0xC6BF,0xC6C2,0xC6C4,0xC6C6,0xC6C7,0xC6C8,0xC6C9,/* 0x50-0x57 */ + 0xC6CA,0xC6CB,0xC6CE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC6CF,0xC6D1,0xC6D2,0xC6D3,0xC6D5,0xC6D6,0xC6D7,/* 0x60-0x67 */ + 0xC6D8,0xC6D9,0xC6DA,0xC6DB,0xC6DE,0xC6DF,0xC6E2,0xC6E3,/* 0x68-0x6F */ + 0xC6E4,0xC6E5,0xC6E6,0xC6E7,0xC6EA,0xC6EB,0xC6ED,0xC6EE,/* 0x70-0x77 */ + 0xC6EF,0xC6F1,0xC6F2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC6F3,0xC6F4,0xC6F5,0xC6F6,0xC6F7,0xC6FA,0xC6FB,/* 0x80-0x87 */ + 0xC6FC,0xC6FE,0xC6FF,0xC700,0xC701,0xC702,0xC703,0xC706,/* 0x88-0x8F */ + 0xC707,0xC709,0xC70A,0xC70B,0xC70D,0xC70E,0xC70F,0xC710,/* 0x90-0x97 */ + 0xC711,0xC712,0xC713,0xC716,0xC718,0xC71A,0xC71B,0xC71C,/* 0x98-0x9F */ + 0xC71D,0xC71E,0xC71F,0xC722,0xC723,0xC725,0xC726,0xC727,/* 0xA0-0xA7 */ + 0xC729,0xC72A,0xC72B,0xC72C,0xC72D,0xC72E,0xC72F,0xC732,/* 0xA8-0xAF */ + 0xC734,0xC736,0xC738,0xC739,0xC73A,0xC73B,0xC73E,0xC73F,/* 0xB0-0xB7 */ + 0xC741,0xC742,0xC743,0xC745,0xC746,0xC747,0xC748,0xC749,/* 0xB8-0xBF */ + 0xC74B,0xC74E,0xC750,0xC759,0xC75A,0xC75B,0xC75D,0xC75E,/* 0xC0-0xC7 */ + 
0xC75F,0xC761,0xC762,0xC763,0xC764,0xC765,0xC766,0xC767,/* 0xC8-0xCF */ + 0xC769,0xC76A,0xC76C,0xC76D,0xC76E,0xC76F,0xC770,0xC771,/* 0xD0-0xD7 */ + 0xC772,0xC773,0xC776,0xC777,0xC779,0xC77A,0xC77B,0xC77F,/* 0xD8-0xDF */ + 0xC780,0xC781,0xC782,0xC786,0xC78B,0xC78C,0xC78D,0xC78F,/* 0xE0-0xE7 */ + 0xC792,0xC793,0xC795,0xC799,0xC79B,0xC79C,0xC79D,0xC79E,/* 0xE8-0xEF */ + 0xC79F,0xC7A2,0xC7A7,0xC7A8,0xC7A9,0xC7AA,0xC7AB,0xC7AE,/* 0xF0-0xF7 */ + 0xC7AF,0xC7B1,0xC7B2,0xC7B3,0xC7B5,0xC7B6,0xC7B7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC7B8,0xC7B9,0xC7BA,0xC7BB,0xC7BE,0xC7C2,0xC7C3,/* 0x40-0x47 */ + 0xC7C4,0xC7C5,0xC7C6,0xC7C7,0xC7CA,0xC7CB,0xC7CD,0xC7CF,/* 0x48-0x4F */ + 0xC7D1,0xC7D2,0xC7D3,0xC7D4,0xC7D5,0xC7D6,0xC7D7,0xC7D9,/* 0x50-0x57 */ + 0xC7DA,0xC7DB,0xC7DC,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC7DE,0xC7DF,0xC7E0,0xC7E1,0xC7E2,0xC7E3,0xC7E5,/* 0x60-0x67 */ + 0xC7E6,0xC7E7,0xC7E9,0xC7EA,0xC7EB,0xC7ED,0xC7EE,0xC7EF,/* 0x68-0x6F */ + 0xC7F0,0xC7F1,0xC7F2,0xC7F3,0xC7F4,0xC7F5,0xC7F6,0xC7F7,/* 0x70-0x77 */ + 0xC7F8,0xC7F9,0xC7FA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC7FB,0xC7FC,0xC7FD,0xC7FE,0xC7FF,0xC802,0xC803,/* 0x80-0x87 */ + 0xC805,0xC806,0xC807,0xC809,0xC80B,0xC80C,0xC80D,0xC80E,/* 0x88-0x8F */ + 0xC80F,0xC812,0xC814,0xC817,0xC818,0xC819,0xC81A,0xC81B,/* 0x90-0x97 */ + 0xC81E,0xC81F,0xC821,0xC822,0xC823,0xC825,0xC826,0xC827,/* 0x98-0x9F */ + 0xC828,0xC829,0xC82A,0xC82B,0xC82E,0xC830,0xC832,0xC833,/* 0xA0-0xA7 */ + 0xC834,0xC835,0xC836,0xC837,0xC839,0xC83A,0xC83B,0xC83D,/* 0xA8-0xAF */ + 0xC83E,0xC83F,0xC841,0xC842,0xC843,0xC844,0xC845,0xC846,/* 0xB0-0xB7 */ + 0xC847,0xC84A,0xC84B,0xC84E,0xC84F,0xC850,0xC851,0xC852,/* 0xB8-0xBF */ + 0xC853,0xC855,0xC856,0xC857,0xC858,0xC859,0xC85A,0xC85B,/* 0xC0-0xC7 */ + 0xC85C,0xC85D,0xC85E,0xC85F,0xC860,0xC861,0xC862,0xC863,/* 0xC8-0xCF */ + 0xC864,0xC865,0xC866,0xC867,0xC868,0xC869,0xC86A,0xC86B,/* 0xD0-0xD7 */ + 0xC86C,0xC86D,0xC86E,0xC86F,0xC872,0xC873,0xC875,0xC876,/* 0xD8-0xDF */ + 0xC877,0xC879,0xC87B,0xC87C,0xC87D,0xC87E,0xC87F,0xC882,/* 0xE0-0xE7 */ + 0xC884,0xC888,0xC889,0xC88A,0xC88E,0xC88F,0xC890,0xC891,/* 0xE8-0xEF */ + 0xC892,0xC893,0xC895,0xC896,0xC897,0xC898,0xC899,0xC89A,/* 0xF0-0xF7 */ + 0xC89B,0xC89C,0xC89E,0xC8A0,0xC8A2,0xC8A3,0xC8A4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x0000,0xC8A5,0xC8A6,0xC8A7,0xC8A9,0xC8AA,0xC8AB,0xC8AC,/* 0x40-0x47 */ + 0xC8AD,0xC8AE,0xC8AF,0xC8B0,0xC8B1,0xC8B2,0xC8B3,0xC8B4,/* 0x48-0x4F */ + 0xC8B5,0xC8B6,0xC8B7,0xC8B8,0xC8B9,0xC8BA,0xC8BB,0xC8BE,/* 0x50-0x57 */ + 0xC8BF,0xC8C0,0xC8C1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC8C2,0xC8C3,0xC8C5,0xC8C6,0xC8C7,0xC8C9,0xC8CA,/* 0x60-0x67 */ + 0xC8CB,0xC8CD,0xC8CE,0xC8CF,0xC8D0,0xC8D1,0xC8D2,0xC8D3,/* 0x68-0x6F */ + 0xC8D6,0xC8D8,0xC8DA,0xC8DB,0xC8DC,0xC8DD,0xC8DE,0xC8DF,/* 0x70-0x77 */ + 0xC8E2,0xC8E3,0xC8E5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC8E6,0xC8E7,0xC8E8,0xC8E9,0xC8EA,0xC8EB,0xC8EC,/* 0x80-0x87 */ + 0xC8ED,0xC8EE,0xC8EF,0xC8F0,0xC8F1,0xC8F2,0xC8F3,0xC8F4,/* 0x88-0x8F */ + 0xC8F6,0xC8F7,0xC8F8,0xC8F9,0xC8FA,0xC8FB,0xC8FE,0xC8FF,/* 0x90-0x97 */ + 0xC901,0xC902,0xC903,0xC907,0xC908,0xC909,0xC90A,0xC90B,/* 0x98-0x9F */ + 0xC90E,0x3000,0x3001,0x3002,0x00B7,0x2025,0x2026,0x00A8,/* 0xA0-0xA7 */ + 0x3003,0x00AD,0x2015,0x2225,0xFF3C,0x223C,0x2018,0x2019,/* 0xA8-0xAF */ + 0x201C,0x201D,0x3014,0x3015,0x3008,0x3009,0x300A,0x300B,/* 0xB0-0xB7 */ + 0x300C,0x300D,0x300E,0x300F,0x3010,0x3011,0x00B1,0x00D7,/* 0xB8-0xBF */ + 0x00F7,0x2260,0x2264,0x2265,0x221E,0x2234,0x00B0,0x2032,/* 0xC0-0xC7 */ + 0x2033,0x2103,0x212B,0xFFE0,0xFFE1,0xFFE5,0x2642,0x2640,/* 0xC8-0xCF */ + 0x2220,0x22A5,0x2312,0x2202,0x2207,0x2261,0x2252,0x00A7,/* 0xD0-0xD7 */ + 0x203B,0x2606,0x2605,0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,/* 0xD8-0xDF */ + 0x25A1,0x25A0,0x25B3,0x25B2,0x25BD,0x25BC,0x2192,0x2190,/* 0xE0-0xE7 */ + 0x2191,0x2193,0x2194,0x3013,0x226A,0x226B,0x221A,0x223D,/* 0xE8-0xEF */ + 0x221D,0x2235,0x222B,0x222C,0x2208,0x220B,0x2286,0x2287,/* 0xF0-0xF7 */ + 0x2282,0x2283,0x222A,0x2229,0x2227,0x2228,0xFFE2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC910,0xC912,0xC913,0xC914,0xC915,0xC916,0xC917,/* 0x40-0x47 */ + 0xC919,0xC91A,0xC91B,0xC91C,0xC91D,0xC91E,0xC91F,0xC920,/* 0x48-0x4F */ + 0xC921,0xC922,0xC923,0xC924,0xC925,0xC926,0xC927,0xC928,/* 0x50-0x57 */ + 0xC929,0xC92A,0xC92B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC92D,0xC92E,0xC92F,0xC930,0xC931,0xC932,0xC933,/* 0x60-0x67 */ + 0xC935,0xC936,0xC937,0xC938,0xC939,0xC93A,0xC93B,0xC93C,/* 0x68-0x6F */ + 0xC93D,0xC93E,0xC93F,0xC940,0xC941,0xC942,0xC943,0xC944,/* 0x70-0x77 */ + 0xC945,0xC946,0xC947,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC948,0xC949,0xC94A,0xC94B,0xC94C,0xC94D,0xC94E,/* 0x80-0x87 */ + 0xC94F,0xC952,0xC953,0xC955,0xC956,0xC957,0xC959,0xC95A,/* 0x88-0x8F */ + 0xC95B,0xC95C,0xC95D,0xC95E,0xC95F,0xC962,0xC964,0xC965,/* 0x90-0x97 */ + 0xC966,0xC967,0xC968,0xC969,0xC96A,0xC96B,0xC96D,0xC96E,/* 0x98-0x9F */ + 0xC96F,0x21D2,0x21D4,0x2200,0x2203,0x00B4,0xFF5E,0x02C7,/* 0xA0-0xA7 */ + 0x02D8,0x02DD,0x02DA,0x02D9,0x00B8,0x02DB,0x00A1,0x00BF,/* 0xA8-0xAF */ + 0x02D0,0x222E,0x2211,0x220F,0x00A4,0x2109,0x2030,0x25C1,/* 0xB0-0xB7 */ + 
0x25C0,0x25B7,0x25B6,0x2664,0x2660,0x2661,0x2665,0x2667,/* 0xB8-0xBF */ + 0x2663,0x2299,0x25C8,0x25A3,0x25D0,0x25D1,0x2592,0x25A4,/* 0xC0-0xC7 */ + 0x25A5,0x25A8,0x25A7,0x25A6,0x25A9,0x2668,0x260F,0x260E,/* 0xC8-0xCF */ + 0x261C,0x261E,0x00B6,0x2020,0x2021,0x2195,0x2197,0x2199,/* 0xD0-0xD7 */ + 0x2196,0x2198,0x266D,0x2669,0x266A,0x266C,0x327F,0x321C,/* 0xD8-0xDF */ + 0x2116,0x33C7,0x2122,0x33C2,0x33D8,0x2121,0x20AC,0x00AE,/* 0xE0-0xE7 */ +}; + +static const wchar_t c2u_A3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC971,0xC972,0xC973,0xC975,0xC976,0xC977,0xC978,/* 0x40-0x47 */ + 0xC979,0xC97A,0xC97B,0xC97D,0xC97E,0xC97F,0xC980,0xC981,/* 0x48-0x4F */ + 0xC982,0xC983,0xC984,0xC985,0xC986,0xC987,0xC98A,0xC98B,/* 0x50-0x57 */ + 0xC98D,0xC98E,0xC98F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xC991,0xC992,0xC993,0xC994,0xC995,0xC996,0xC997,/* 0x60-0x67 */ + 0xC99A,0xC99C,0xC99E,0xC99F,0xC9A0,0xC9A1,0xC9A2,0xC9A3,/* 0x68-0x6F */ + 0xC9A4,0xC9A5,0xC9A6,0xC9A7,0xC9A8,0xC9A9,0xC9AA,0xC9AB,/* 0x70-0x77 */ + 0xC9AC,0xC9AD,0xC9AE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xC9AF,0xC9B0,0xC9B1,0xC9B2,0xC9B3,0xC9B4,0xC9B5,/* 0x80-0x87 */ + 0xC9B6,0xC9B7,0xC9B8,0xC9B9,0xC9BA,0xC9BB,0xC9BC,0xC9BD,/* 0x88-0x8F */ + 0xC9BE,0xC9BF,0xC9C2,0xC9C3,0xC9C5,0xC9C6,0xC9C9,0xC9CB,/* 0x90-0x97 */ + 0xC9CC,0xC9CD,0xC9CE,0xC9CF,0xC9D2,0xC9D4,0xC9D7,0xC9D8,/* 0x98-0x9F */ + 0xC9DB,0xFF01,0xFF02,0xFF03,0xFF04,0xFF05,0xFF06,0xFF07,/* 0xA0-0xA7 */ + 0xFF08,0xFF09,0xFF0A,0xFF0B,0xFF0C,0xFF0D,0xFF0E,0xFF0F,/* 0xA8-0xAF */ + 0xFF10,0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,/* 0xB0-0xB7 */ + 0xFF18,0xFF19,0xFF1A,0xFF1B,0xFF1C,0xFF1D,0xFF1E,0xFF1F,/* 0xB8-0xBF */ + 0xFF20,0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,/* 0xC0-0xC7 */ + 0xFF28,0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,/* 0xC8-0xCF */ + 0xFF30,0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,/* 0xD0-0xD7 */ + 0xFF38,0xFF39,0xFF3A,0xFF3B,0xFFE6,0xFF3D,0xFF3E,0xFF3F,/* 0xD8-0xDF */ + 0xFF40,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE0-0xE7 */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xE8-0xEF */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0xF0-0xF7 */ + 0xFF58,0xFF59,0xFF5A,0xFF5B,0xFF5C,0xFF5D,0xFFE3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xC9DE,0xC9DF,0xC9E1,0xC9E3,0xC9E5,0xC9E6,0xC9E8,/* 
0x40-0x47 */ + 0xC9E9,0xC9EA,0xC9EB,0xC9EE,0xC9F2,0xC9F3,0xC9F4,0xC9F5,/* 0x48-0x4F */ + 0xC9F6,0xC9F7,0xC9FA,0xC9FB,0xC9FD,0xC9FE,0xC9FF,0xCA01,/* 0x50-0x57 */ + 0xCA02,0xCA03,0xCA04,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCA05,0xCA06,0xCA07,0xCA0A,0xCA0E,0xCA0F,0xCA10,/* 0x60-0x67 */ + 0xCA11,0xCA12,0xCA13,0xCA15,0xCA16,0xCA17,0xCA19,0xCA1A,/* 0x68-0x6F */ + 0xCA1B,0xCA1C,0xCA1D,0xCA1E,0xCA1F,0xCA20,0xCA21,0xCA22,/* 0x70-0x77 */ + 0xCA23,0xCA24,0xCA25,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCA26,0xCA27,0xCA28,0xCA2A,0xCA2B,0xCA2C,0xCA2D,/* 0x80-0x87 */ + 0xCA2E,0xCA2F,0xCA30,0xCA31,0xCA32,0xCA33,0xCA34,0xCA35,/* 0x88-0x8F */ + 0xCA36,0xCA37,0xCA38,0xCA39,0xCA3A,0xCA3B,0xCA3C,0xCA3D,/* 0x90-0x97 */ + 0xCA3E,0xCA3F,0xCA40,0xCA41,0xCA42,0xCA43,0xCA44,0xCA45,/* 0x98-0x9F */ + 0xCA46,0x3131,0x3132,0x3133,0x3134,0x3135,0x3136,0x3137,/* 0xA0-0xA7 */ + 0x3138,0x3139,0x313A,0x313B,0x313C,0x313D,0x313E,0x313F,/* 0xA8-0xAF */ + 0x3140,0x3141,0x3142,0x3143,0x3144,0x3145,0x3146,0x3147,/* 0xB0-0xB7 */ + 0x3148,0x3149,0x314A,0x314B,0x314C,0x314D,0x314E,0x314F,/* 0xB8-0xBF */ + 0x3150,0x3151,0x3152,0x3153,0x3154,0x3155,0x3156,0x3157,/* 0xC0-0xC7 */ + 0x3158,0x3159,0x315A,0x315B,0x315C,0x315D,0x315E,0x315F,/* 0xC8-0xCF */ + 0x3160,0x3161,0x3162,0x3163,0x3164,0x3165,0x3166,0x3167,/* 0xD0-0xD7 */ + 0x3168,0x3169,0x316A,0x316B,0x316C,0x316D,0x316E,0x316F,/* 0xD8-0xDF */ + 0x3170,0x3171,0x3172,0x3173,0x3174,0x3175,0x3176,0x3177,/* 0xE0-0xE7 */ + 0x3178,0x3179,0x317A,0x317B,0x317C,0x317D,0x317E,0x317F,/* 0xE8-0xEF */ + 0x3180,0x3181,0x3182,0x3183,0x3184,0x3185,0x3186,0x3187,/* 0xF0-0xF7 */ + 0x3188,0x3189,0x318A,0x318B,0x318C,0x318D,0x318E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCA47,0xCA48,0xCA49,0xCA4A,0xCA4B,0xCA4E,0xCA4F,/* 0x40-0x47 */ + 0xCA51,0xCA52,0xCA53,0xCA55,0xCA56,0xCA57,0xCA58,0xCA59,/* 0x48-0x4F */ + 0xCA5A,0xCA5B,0xCA5E,0xCA62,0xCA63,0xCA64,0xCA65,0xCA66,/* 0x50-0x57 */ + 0xCA67,0xCA69,0xCA6A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCA6B,0xCA6C,0xCA6D,0xCA6E,0xCA6F,0xCA70,0xCA71,/* 0x60-0x67 */ + 0xCA72,0xCA73,0xCA74,0xCA75,0xCA76,0xCA77,0xCA78,0xCA79,/* 0x68-0x6F */ + 0xCA7A,0xCA7B,0xCA7C,0xCA7E,0xCA7F,0xCA80,0xCA81,0xCA82,/* 0x70-0x77 */ + 0xCA83,0xCA85,0xCA86,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCA87,0xCA88,0xCA89,0xCA8A,0xCA8B,0xCA8C,0xCA8D,/* 0x80-0x87 */ + 0xCA8E,0xCA8F,0xCA90,0xCA91,0xCA92,0xCA93,0xCA94,0xCA95,/* 0x88-0x8F */ + 0xCA96,0xCA97,0xCA99,0xCA9A,0xCA9B,0xCA9C,0xCA9D,0xCA9E,/* 0x90-0x97 */ + 0xCA9F,0xCAA0,0xCAA1,0xCAA2,0xCAA3,0xCAA4,0xCAA5,0xCAA6,/* 0x98-0x9F */ + 0xCAA7,0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,/* 0xA0-0xA7 */ + 0x2177,0x2178,0x2179,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA8-0xAF */ + 0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,0x2167,/* 0xB0-0xB7 */ + 0x2168,0x2169,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB8-0xBF */ + 
0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,/* 0xC0-0xC7 */ + 0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,/* 0xC8-0xCF */ + 0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,/* 0xD0-0xD7 */ + 0x03A9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */ + 0x0000,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,/* 0xE0-0xE7 */ + 0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,/* 0xE8-0xEF */ + 0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,/* 0xF0-0xF7 */ + 0x03C9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCAA8,0xCAA9,0xCAAA,0xCAAB,0xCAAC,0xCAAD,0xCAAE,/* 0x40-0x47 */ + 0xCAAF,0xCAB0,0xCAB1,0xCAB2,0xCAB3,0xCAB4,0xCAB5,0xCAB6,/* 0x48-0x4F */ + 0xCAB7,0xCAB8,0xCAB9,0xCABA,0xCABB,0xCABE,0xCABF,0xCAC1,/* 0x50-0x57 */ + 0xCAC2,0xCAC3,0xCAC5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCAC6,0xCAC7,0xCAC8,0xCAC9,0xCACA,0xCACB,0xCACE,/* 0x60-0x67 */ + 0xCAD0,0xCAD2,0xCAD4,0xCAD5,0xCAD6,0xCAD7,0xCADA,0xCADB,/* 0x68-0x6F */ + 0xCADC,0xCADD,0xCADE,0xCADF,0xCAE1,0xCAE2,0xCAE3,0xCAE4,/* 0x70-0x77 */ + 0xCAE5,0xCAE6,0xCAE7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCAE8,0xCAE9,0xCAEA,0xCAEB,0xCAED,0xCAEE,0xCAEF,/* 0x80-0x87 */ + 0xCAF0,0xCAF1,0xCAF2,0xCAF3,0xCAF5,0xCAF6,0xCAF7,0xCAF8,/* 0x88-0x8F */ + 0xCAF9,0xCAFA,0xCAFB,0xCAFC,0xCAFD,0xCAFE,0xCAFF,0xCB00,/* 0x90-0x97 */ + 0xCB01,0xCB02,0xCB03,0xCB04,0xCB05,0xCB06,0xCB07,0xCB09,/* 0x98-0x9F */ + 0xCB0A,0x2500,0x2502,0x250C,0x2510,0x2518,0x2514,0x251C,/* 0xA0-0xA7 */ + 0x252C,0x2524,0x2534,0x253C,0x2501,0x2503,0x250F,0x2513,/* 0xA8-0xAF */ + 0x251B,0x2517,0x2523,0x2533,0x252B,0x253B,0x254B,0x2520,/* 0xB0-0xB7 */ + 0x252F,0x2528,0x2537,0x253F,0x251D,0x2530,0x2525,0x2538,/* 0xB8-0xBF */ + 0x2542,0x2512,0x2511,0x251A,0x2519,0x2516,0x2515,0x250E,/* 0xC0-0xC7 */ + 0x250D,0x251E,0x251F,0x2521,0x2522,0x2526,0x2527,0x2529,/* 0xC8-0xCF */ + 0x252A,0x252D,0x252E,0x2531,0x2532,0x2535,0x2536,0x2539,/* 0xD0-0xD7 */ + 0x253A,0x253D,0x253E,0x2540,0x2541,0x2543,0x2544,0x2545,/* 0xD8-0xDF */ + 0x2546,0x2547,0x2548,0x2549,0x254A,0x0000,0x0000,0x0000,/* 0xE0-0xE7 */ +}; + +static const wchar_t c2u_A7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCB0B,0xCB0C,0xCB0D,0xCB0E,0xCB0F,0xCB11,0xCB12,/* 0x40-0x47 */ + 0xCB13,0xCB15,0xCB16,0xCB17,0xCB19,0xCB1A,0xCB1B,0xCB1C,/* 
0x48-0x4F */ + 0xCB1D,0xCB1E,0xCB1F,0xCB22,0xCB23,0xCB24,0xCB25,0xCB26,/* 0x50-0x57 */ + 0xCB27,0xCB28,0xCB29,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCB2A,0xCB2B,0xCB2C,0xCB2D,0xCB2E,0xCB2F,0xCB30,/* 0x60-0x67 */ + 0xCB31,0xCB32,0xCB33,0xCB34,0xCB35,0xCB36,0xCB37,0xCB38,/* 0x68-0x6F */ + 0xCB39,0xCB3A,0xCB3B,0xCB3C,0xCB3D,0xCB3E,0xCB3F,0xCB40,/* 0x70-0x77 */ + 0xCB42,0xCB43,0xCB44,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCB45,0xCB46,0xCB47,0xCB4A,0xCB4B,0xCB4D,0xCB4E,/* 0x80-0x87 */ + 0xCB4F,0xCB51,0xCB52,0xCB53,0xCB54,0xCB55,0xCB56,0xCB57,/* 0x88-0x8F */ + 0xCB5A,0xCB5B,0xCB5C,0xCB5E,0xCB5F,0xCB60,0xCB61,0xCB62,/* 0x90-0x97 */ + 0xCB63,0xCB65,0xCB66,0xCB67,0xCB68,0xCB69,0xCB6A,0xCB6B,/* 0x98-0x9F */ + 0xCB6C,0x3395,0x3396,0x3397,0x2113,0x3398,0x33C4,0x33A3,/* 0xA0-0xA7 */ + 0x33A4,0x33A5,0x33A6,0x3399,0x339A,0x339B,0x339C,0x339D,/* 0xA8-0xAF */ + 0x339E,0x339F,0x33A0,0x33A1,0x33A2,0x33CA,0x338D,0x338E,/* 0xB0-0xB7 */ + 0x338F,0x33CF,0x3388,0x3389,0x33C8,0x33A7,0x33A8,0x33B0,/* 0xB8-0xBF */ + 0x33B1,0x33B2,0x33B3,0x33B4,0x33B5,0x33B6,0x33B7,0x33B8,/* 0xC0-0xC7 */ + 0x33B9,0x3380,0x3381,0x3382,0x3383,0x3384,0x33BA,0x33BB,/* 0xC8-0xCF */ + 0x33BC,0x33BD,0x33BE,0x33BF,0x3390,0x3391,0x3392,0x3393,/* 0xD0-0xD7 */ + 0x3394,0x2126,0x33C0,0x33C1,0x338A,0x338B,0x338C,0x33D6,/* 0xD8-0xDF */ + 0x33C5,0x33AD,0x33AE,0x33AF,0x33DB,0x33A9,0x33AA,0x33AB,/* 0xE0-0xE7 */ + 0x33AC,0x33DD,0x33D0,0x33D3,0x33C3,0x33C9,0x33DC,0x33C6,/* 0xE8-0xEF */ +}; + +static const wchar_t c2u_A8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCB6D,0xCB6E,0xCB6F,0xCB70,0xCB71,0xCB72,0xCB73,/* 0x40-0x47 */ + 0xCB74,0xCB75,0xCB76,0xCB77,0xCB7A,0xCB7B,0xCB7C,0xCB7D,/* 0x48-0x4F */ + 0xCB7E,0xCB7F,0xCB80,0xCB81,0xCB82,0xCB83,0xCB84,0xCB85,/* 0x50-0x57 */ + 0xCB86,0xCB87,0xCB88,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCB89,0xCB8A,0xCB8B,0xCB8C,0xCB8D,0xCB8E,0xCB8F,/* 0x60-0x67 */ + 0xCB90,0xCB91,0xCB92,0xCB93,0xCB94,0xCB95,0xCB96,0xCB97,/* 0x68-0x6F */ + 0xCB98,0xCB99,0xCB9A,0xCB9B,0xCB9D,0xCB9E,0xCB9F,0xCBA0,/* 0x70-0x77 */ + 0xCBA1,0xCBA2,0xCBA3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCBA4,0xCBA5,0xCBA6,0xCBA7,0xCBA8,0xCBA9,0xCBAA,/* 0x80-0x87 */ + 0xCBAB,0xCBAC,0xCBAD,0xCBAE,0xCBAF,0xCBB0,0xCBB1,0xCBB2,/* 0x88-0x8F */ + 0xCBB3,0xCBB4,0xCBB5,0xCBB6,0xCBB7,0xCBB9,0xCBBA,0xCBBB,/* 0x90-0x97 */ + 0xCBBC,0xCBBD,0xCBBE,0xCBBF,0xCBC0,0xCBC1,0xCBC2,0xCBC3,/* 0x98-0x9F */ + 0xCBC4,0x00C6,0x00D0,0x00AA,0x0126,0x0000,0x0132,0x0000,/* 0xA0-0xA7 */ + 0x013F,0x0141,0x00D8,0x0152,0x00BA,0x00DE,0x0166,0x014A,/* 0xA8-0xAF */ + 0x0000,0x3260,0x3261,0x3262,0x3263,0x3264,0x3265,0x3266,/* 0xB0-0xB7 */ + 0x3267,0x3268,0x3269,0x326A,0x326B,0x326C,0x326D,0x326E,/* 0xB8-0xBF */ + 0x326F,0x3270,0x3271,0x3272,0x3273,0x3274,0x3275,0x3276,/* 0xC0-0xC7 */ + 0x3277,0x3278,0x3279,0x327A,0x327B,0x24D0,0x24D1,0x24D2,/* 0xC8-0xCF */ + 0x24D3,0x24D4,0x24D5,0x24D6,0x24D7,0x24D8,0x24D9,0x24DA,/* 0xD0-0xD7 */ + 
0x24DB,0x24DC,0x24DD,0x24DE,0x24DF,0x24E0,0x24E1,0x24E2,/* 0xD8-0xDF */ + 0x24E3,0x24E4,0x24E5,0x24E6,0x24E7,0x24E8,0x24E9,0x2460,/* 0xE0-0xE7 */ + 0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,0x2467,0x2468,/* 0xE8-0xEF */ + 0x2469,0x246A,0x246B,0x246C,0x246D,0x246E,0x00BD,0x2153,/* 0xF0-0xF7 */ + 0x2154,0x00BC,0x00BE,0x215B,0x215C,0x215D,0x215E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCBC5,0xCBC6,0xCBC7,0xCBC8,0xCBC9,0xCBCA,0xCBCB,/* 0x40-0x47 */ + 0xCBCC,0xCBCD,0xCBCE,0xCBCF,0xCBD0,0xCBD1,0xCBD2,0xCBD3,/* 0x48-0x4F */ + 0xCBD5,0xCBD6,0xCBD7,0xCBD8,0xCBD9,0xCBDA,0xCBDB,0xCBDC,/* 0x50-0x57 */ + 0xCBDD,0xCBDE,0xCBDF,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCBE0,0xCBE1,0xCBE2,0xCBE3,0xCBE5,0xCBE6,0xCBE8,/* 0x60-0x67 */ + 0xCBEA,0xCBEB,0xCBEC,0xCBED,0xCBEE,0xCBEF,0xCBF0,0xCBF1,/* 0x68-0x6F */ + 0xCBF2,0xCBF3,0xCBF4,0xCBF5,0xCBF6,0xCBF7,0xCBF8,0xCBF9,/* 0x70-0x77 */ + 0xCBFA,0xCBFB,0xCBFC,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCBFD,0xCBFE,0xCBFF,0xCC00,0xCC01,0xCC02,0xCC03,/* 0x80-0x87 */ + 0xCC04,0xCC05,0xCC06,0xCC07,0xCC08,0xCC09,0xCC0A,0xCC0B,/* 0x88-0x8F */ + 0xCC0E,0xCC0F,0xCC11,0xCC12,0xCC13,0xCC15,0xCC16,0xCC17,/* 0x90-0x97 */ + 0xCC18,0xCC19,0xCC1A,0xCC1B,0xCC1E,0xCC1F,0xCC20,0xCC23,/* 0x98-0x9F */ + 0xCC24,0x00E6,0x0111,0x00F0,0x0127,0x0131,0x0133,0x0138,/* 0xA0-0xA7 */ + 0x0140,0x0142,0x00F8,0x0153,0x00DF,0x00FE,0x0167,0x014B,/* 0xA8-0xAF */ + 0x0149,0x3200,0x3201,0x3202,0x3203,0x3204,0x3205,0x3206,/* 0xB0-0xB7 */ + 0x3207,0x3208,0x3209,0x320A,0x320B,0x320C,0x320D,0x320E,/* 0xB8-0xBF */ + 0x320F,0x3210,0x3211,0x3212,0x3213,0x3214,0x3215,0x3216,/* 0xC0-0xC7 */ + 0x3217,0x3218,0x3219,0x321A,0x321B,0x249C,0x249D,0x249E,/* 0xC8-0xCF */ + 0x249F,0x24A0,0x24A1,0x24A2,0x24A3,0x24A4,0x24A5,0x24A6,/* 0xD0-0xD7 */ + 0x24A7,0x24A8,0x24A9,0x24AA,0x24AB,0x24AC,0x24AD,0x24AE,/* 0xD8-0xDF */ + 0x24AF,0x24B0,0x24B1,0x24B2,0x24B3,0x24B4,0x24B5,0x2474,/* 0xE0-0xE7 */ + 0x2475,0x2476,0x2477,0x2478,0x2479,0x247A,0x247B,0x247C,/* 0xE8-0xEF */ + 0x247D,0x247E,0x247F,0x2480,0x2481,0x2482,0x00B9,0x00B2,/* 0xF0-0xF7 */ + 0x00B3,0x2074,0x207F,0x2081,0x2082,0x2083,0x2084,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCC25,0xCC26,0xCC2A,0xCC2B,0xCC2D,0xCC2F,0xCC31,/* 0x40-0x47 */ + 0xCC32,0xCC33,0xCC34,0xCC35,0xCC36,0xCC37,0xCC3A,0xCC3F,/* 
0x48-0x4F */ + 0xCC40,0xCC41,0xCC42,0xCC43,0xCC46,0xCC47,0xCC49,0xCC4A,/* 0x50-0x57 */ + 0xCC4B,0xCC4D,0xCC4E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCC4F,0xCC50,0xCC51,0xCC52,0xCC53,0xCC56,0xCC5A,/* 0x60-0x67 */ + 0xCC5B,0xCC5C,0xCC5D,0xCC5E,0xCC5F,0xCC61,0xCC62,0xCC63,/* 0x68-0x6F */ + 0xCC65,0xCC67,0xCC69,0xCC6A,0xCC6B,0xCC6C,0xCC6D,0xCC6E,/* 0x70-0x77 */ + 0xCC6F,0xCC71,0xCC72,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCC73,0xCC74,0xCC76,0xCC77,0xCC78,0xCC79,0xCC7A,/* 0x80-0x87 */ + 0xCC7B,0xCC7C,0xCC7D,0xCC7E,0xCC7F,0xCC80,0xCC81,0xCC82,/* 0x88-0x8F */ + 0xCC83,0xCC84,0xCC85,0xCC86,0xCC87,0xCC88,0xCC89,0xCC8A,/* 0x90-0x97 */ + 0xCC8B,0xCC8C,0xCC8D,0xCC8E,0xCC8F,0xCC90,0xCC91,0xCC92,/* 0x98-0x9F */ + 0xCC93,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,/* 0xA0-0xA7 */ + 0x3048,0x3049,0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,/* 0xA8-0xAF */ + 0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,/* 0xB0-0xB7 */ + 0x3058,0x3059,0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,/* 0xB8-0xBF */ + 0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,/* 0xC0-0xC7 */ + 0x3068,0x3069,0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,/* 0xC8-0xCF */ + 0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,/* 0xD0-0xD7 */ + 0x3078,0x3079,0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,/* 0xD8-0xDF */ + 0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,/* 0xE0-0xE7 */ + 0x3088,0x3089,0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,/* 0xE8-0xEF */ + 0x3090,0x3091,0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_AB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCC94,0xCC95,0xCC96,0xCC97,0xCC9A,0xCC9B,0xCC9D,/* 0x40-0x47 */ + 0xCC9E,0xCC9F,0xCCA1,0xCCA2,0xCCA3,0xCCA4,0xCCA5,0xCCA6,/* 0x48-0x4F */ + 0xCCA7,0xCCAA,0xCCAE,0xCCAF,0xCCB0,0xCCB1,0xCCB2,0xCCB3,/* 0x50-0x57 */ + 0xCCB6,0xCCB7,0xCCB9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCCBA,0xCCBB,0xCCBD,0xCCBE,0xCCBF,0xCCC0,0xCCC1,/* 0x60-0x67 */ + 0xCCC2,0xCCC3,0xCCC6,0xCCC8,0xCCCA,0xCCCB,0xCCCC,0xCCCD,/* 0x68-0x6F */ + 0xCCCE,0xCCCF,0xCCD1,0xCCD2,0xCCD3,0xCCD5,0xCCD6,0xCCD7,/* 0x70-0x77 */ + 0xCCD8,0xCCD9,0xCCDA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCCDB,0xCCDC,0xCCDD,0xCCDE,0xCCDF,0xCCE0,0xCCE1,/* 0x80-0x87 */ + 0xCCE2,0xCCE3,0xCCE5,0xCCE6,0xCCE7,0xCCE8,0xCCE9,0xCCEA,/* 0x88-0x8F */ + 0xCCEB,0xCCED,0xCCEE,0xCCEF,0xCCF1,0xCCF2,0xCCF3,0xCCF4,/* 0x90-0x97 */ + 0xCCF5,0xCCF6,0xCCF7,0xCCF8,0xCCF9,0xCCFA,0xCCFB,0xCCFC,/* 0x98-0x9F */ + 0xCCFD,0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,/* 0xA0-0xA7 */ + 0x30A8,0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,/* 0xA8-0xAF */ + 0x30B0,0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,/* 0xB0-0xB7 */ + 0x30B8,0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,/* 0xB8-0xBF */ + 0x30C0,0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,/* 0xC0-0xC7 */ + 0x30C8,0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,/* 0xC8-0xCF */ + 
0x30D0,0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,/* 0xD0-0xD7 */ + 0x30D8,0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,/* 0xD8-0xDF */ + 0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0xE0-0xE7 */ + 0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0xE8-0xEF */ + 0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_AC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCCFE,0xCCFF,0xCD00,0xCD02,0xCD03,0xCD04,0xCD05,/* 0x40-0x47 */ + 0xCD06,0xCD07,0xCD0A,0xCD0B,0xCD0D,0xCD0E,0xCD0F,0xCD11,/* 0x48-0x4F */ + 0xCD12,0xCD13,0xCD14,0xCD15,0xCD16,0xCD17,0xCD1A,0xCD1C,/* 0x50-0x57 */ + 0xCD1E,0xCD1F,0xCD20,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCD21,0xCD22,0xCD23,0xCD25,0xCD26,0xCD27,0xCD29,/* 0x60-0x67 */ + 0xCD2A,0xCD2B,0xCD2D,0xCD2E,0xCD2F,0xCD30,0xCD31,0xCD32,/* 0x68-0x6F */ + 0xCD33,0xCD34,0xCD35,0xCD36,0xCD37,0xCD38,0xCD3A,0xCD3B,/* 0x70-0x77 */ + 0xCD3C,0xCD3D,0xCD3E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCD3F,0xCD40,0xCD41,0xCD42,0xCD43,0xCD44,0xCD45,/* 0x80-0x87 */ + 0xCD46,0xCD47,0xCD48,0xCD49,0xCD4A,0xCD4B,0xCD4C,0xCD4D,/* 0x88-0x8F */ + 0xCD4E,0xCD4F,0xCD50,0xCD51,0xCD52,0xCD53,0xCD54,0xCD55,/* 0x90-0x97 */ + 0xCD56,0xCD57,0xCD58,0xCD59,0xCD5A,0xCD5B,0xCD5D,0xCD5E,/* 0x98-0x9F */ + 0xCD5F,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,/* 0xA0-0xA7 */ + 0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,/* 0xA8-0xAF */ + 0x041E,0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,/* 0xB0-0xB7 */ + 0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,/* 0xB8-0xBF */ + 0x042E,0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,/* 0xD0-0xD7 */ + 0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,/* 0xD8-0xDF */ + 0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0xE0-0xE7 */ + 0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0xE8-0xEF */ + 0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */ +}; + +static const wchar_t c2u_AD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCD61,0xCD62,0xCD63,0xCD65,0xCD66,0xCD67,0xCD68,/* 0x40-0x47 */ + 0xCD69,0xCD6A,0xCD6B,0xCD6E,0xCD70,0xCD72,0xCD73,0xCD74,/* 0x48-0x4F */ + 0xCD75,0xCD76,0xCD77,0xCD79,0xCD7A,0xCD7B,0xCD7C,0xCD7D,/* 
0x50-0x57 */ + 0xCD7E,0xCD7F,0xCD80,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCD81,0xCD82,0xCD83,0xCD84,0xCD85,0xCD86,0xCD87,/* 0x60-0x67 */ + 0xCD89,0xCD8A,0xCD8B,0xCD8C,0xCD8D,0xCD8E,0xCD8F,0xCD90,/* 0x68-0x6F */ + 0xCD91,0xCD92,0xCD93,0xCD96,0xCD97,0xCD99,0xCD9A,0xCD9B,/* 0x70-0x77 */ + 0xCD9D,0xCD9E,0xCD9F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCDA0,0xCDA1,0xCDA2,0xCDA3,0xCDA6,0xCDA8,0xCDAA,/* 0x80-0x87 */ + 0xCDAB,0xCDAC,0xCDAD,0xCDAE,0xCDAF,0xCDB1,0xCDB2,0xCDB3,/* 0x88-0x8F */ + 0xCDB4,0xCDB5,0xCDB6,0xCDB7,0xCDB8,0xCDB9,0xCDBA,0xCDBB,/* 0x90-0x97 */ + 0xCDBC,0xCDBD,0xCDBE,0xCDBF,0xCDC0,0xCDC1,0xCDC2,0xCDC3,/* 0x98-0x9F */ + 0xCDC5,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCDC6,0xCDC7,0xCDC8,0xCDC9,0xCDCA,0xCDCB,0xCDCD,/* 0x40-0x47 */ + 0xCDCE,0xCDCF,0xCDD1,0xCDD2,0xCDD3,0xCDD4,0xCDD5,0xCDD6,/* 0x48-0x4F */ + 0xCDD7,0xCDD8,0xCDD9,0xCDDA,0xCDDB,0xCDDC,0xCDDD,0xCDDE,/* 0x50-0x57 */ + 0xCDDF,0xCDE0,0xCDE1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCDE2,0xCDE3,0xCDE4,0xCDE5,0xCDE6,0xCDE7,0xCDE9,/* 0x60-0x67 */ + 0xCDEA,0xCDEB,0xCDED,0xCDEE,0xCDEF,0xCDF1,0xCDF2,0xCDF3,/* 0x68-0x6F */ + 0xCDF4,0xCDF5,0xCDF6,0xCDF7,0xCDFA,0xCDFC,0xCDFE,0xCDFF,/* 0x70-0x77 */ + 0xCE00,0xCE01,0xCE02,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCE03,0xCE05,0xCE06,0xCE07,0xCE09,0xCE0A,0xCE0B,/* 0x80-0x87 */ + 0xCE0D,0xCE0E,0xCE0F,0xCE10,0xCE11,0xCE12,0xCE13,0xCE15,/* 0x88-0x8F */ + 0xCE16,0xCE17,0xCE18,0xCE1A,0xCE1B,0xCE1C,0xCE1D,0xCE1E,/* 0x90-0x97 */ + 0xCE1F,0xCE22,0xCE23,0xCE25,0xCE26,0xCE27,0xCE29,0xCE2A,/* 0x98-0x9F */ + 0xCE2B,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_AF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCE2C,0xCE2D,0xCE2E,0xCE2F,0xCE32,0xCE34,0xCE36,/* 0x40-0x47 */ + 0xCE37,0xCE38,0xCE39,0xCE3A,0xCE3B,0xCE3C,0xCE3D,0xCE3E,/* 0x48-0x4F */ + 0xCE3F,0xCE40,0xCE41,0xCE42,0xCE43,0xCE44,0xCE45,0xCE46,/* 0x50-0x57 */ + 0xCE47,0xCE48,0xCE49,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCE4A,0xCE4B,0xCE4C,0xCE4D,0xCE4E,0xCE4F,0xCE50,/* 0x60-0x67 */ + 0xCE51,0xCE52,0xCE53,0xCE54,0xCE55,0xCE56,0xCE57,0xCE5A,/* 0x68-0x6F */ + 0xCE5B,0xCE5D,0xCE5E,0xCE62,0xCE63,0xCE64,0xCE65,0xCE66,/* 0x70-0x77 */ + 
0xCE67,0xCE6A,0xCE6C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCE6E,0xCE6F,0xCE70,0xCE71,0xCE72,0xCE73,0xCE76,/* 0x80-0x87 */ + 0xCE77,0xCE79,0xCE7A,0xCE7B,0xCE7D,0xCE7E,0xCE7F,0xCE80,/* 0x88-0x8F */ + 0xCE81,0xCE82,0xCE83,0xCE86,0xCE88,0xCE8A,0xCE8B,0xCE8C,/* 0x90-0x97 */ + 0xCE8D,0xCE8E,0xCE8F,0xCE92,0xCE93,0xCE95,0xCE96,0xCE97,/* 0x98-0x9F */ + 0xCE99,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */ +}; + +static const wchar_t c2u_B0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCE9A,0xCE9B,0xCE9C,0xCE9D,0xCE9E,0xCE9F,0xCEA2,/* 0x40-0x47 */ + 0xCEA6,0xCEA7,0xCEA8,0xCEA9,0xCEAA,0xCEAB,0xCEAE,0xCEAF,/* 0x48-0x4F */ + 0xCEB0,0xCEB1,0xCEB2,0xCEB3,0xCEB4,0xCEB5,0xCEB6,0xCEB7,/* 0x50-0x57 */ + 0xCEB8,0xCEB9,0xCEBA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCEBB,0xCEBC,0xCEBD,0xCEBE,0xCEBF,0xCEC0,0xCEC2,/* 0x60-0x67 */ + 0xCEC3,0xCEC4,0xCEC5,0xCEC6,0xCEC7,0xCEC8,0xCEC9,0xCECA,/* 0x68-0x6F */ + 0xCECB,0xCECC,0xCECD,0xCECE,0xCECF,0xCED0,0xCED1,0xCED2,/* 0x70-0x77 */ + 0xCED3,0xCED4,0xCED5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCED6,0xCED7,0xCED8,0xCED9,0xCEDA,0xCEDB,0xCEDC,/* 0x80-0x87 */ + 0xCEDD,0xCEDE,0xCEDF,0xCEE0,0xCEE1,0xCEE2,0xCEE3,0xCEE6,/* 0x88-0x8F */ + 0xCEE7,0xCEE9,0xCEEA,0xCEED,0xCEEE,0xCEEF,0xCEF0,0xCEF1,/* 0x90-0x97 */ + 0xCEF2,0xCEF3,0xCEF6,0xCEFA,0xCEFB,0xCEFC,0xCEFD,0xCEFE,/* 0x98-0x9F */ + 0xCEFF,0xAC00,0xAC01,0xAC04,0xAC07,0xAC08,0xAC09,0xAC0A,/* 0xA0-0xA7 */ + 0xAC10,0xAC11,0xAC12,0xAC13,0xAC14,0xAC15,0xAC16,0xAC17,/* 0xA8-0xAF */ + 0xAC19,0xAC1A,0xAC1B,0xAC1C,0xAC1D,0xAC20,0xAC24,0xAC2C,/* 0xB0-0xB7 */ + 0xAC2D,0xAC2F,0xAC30,0xAC31,0xAC38,0xAC39,0xAC3C,0xAC40,/* 0xB8-0xBF */ + 0xAC4B,0xAC4D,0xAC54,0xAC58,0xAC5C,0xAC70,0xAC71,0xAC74,/* 0xC0-0xC7 */ + 0xAC77,0xAC78,0xAC7A,0xAC80,0xAC81,0xAC83,0xAC84,0xAC85,/* 0xC8-0xCF */ + 0xAC86,0xAC89,0xAC8A,0xAC8B,0xAC8C,0xAC90,0xAC94,0xAC9C,/* 0xD0-0xD7 */ + 0xAC9D,0xAC9F,0xACA0,0xACA1,0xACA8,0xACA9,0xACAA,0xACAC,/* 0xD8-0xDF */ + 0xACAF,0xACB0,0xACB8,0xACB9,0xACBB,0xACBC,0xACBD,0xACC1,/* 0xE0-0xE7 */ + 0xACC4,0xACC8,0xACCC,0xACD5,0xACD7,0xACE0,0xACE1,0xACE4,/* 0xE8-0xEF */ + 0xACE7,0xACE8,0xACEA,0xACEC,0xACEF,0xACF0,0xACF1,0xACF3,/* 0xF0-0xF7 */ + 0xACF5,0xACF6,0xACFC,0xACFD,0xAD00,0xAD04,0xAD06,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCF02,0xCF03,0xCF05,0xCF06,0xCF07,0xCF09,0xCF0A,/* 
0x40-0x47 */ + 0xCF0B,0xCF0C,0xCF0D,0xCF0E,0xCF0F,0xCF12,0xCF14,0xCF16,/* 0x48-0x4F */ + 0xCF17,0xCF18,0xCF19,0xCF1A,0xCF1B,0xCF1D,0xCF1E,0xCF1F,/* 0x50-0x57 */ + 0xCF21,0xCF22,0xCF23,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCF25,0xCF26,0xCF27,0xCF28,0xCF29,0xCF2A,0xCF2B,/* 0x60-0x67 */ + 0xCF2E,0xCF32,0xCF33,0xCF34,0xCF35,0xCF36,0xCF37,0xCF39,/* 0x68-0x6F */ + 0xCF3A,0xCF3B,0xCF3C,0xCF3D,0xCF3E,0xCF3F,0xCF40,0xCF41,/* 0x70-0x77 */ + 0xCF42,0xCF43,0xCF44,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCF45,0xCF46,0xCF47,0xCF48,0xCF49,0xCF4A,0xCF4B,/* 0x80-0x87 */ + 0xCF4C,0xCF4D,0xCF4E,0xCF4F,0xCF50,0xCF51,0xCF52,0xCF53,/* 0x88-0x8F */ + 0xCF56,0xCF57,0xCF59,0xCF5A,0xCF5B,0xCF5D,0xCF5E,0xCF5F,/* 0x90-0x97 */ + 0xCF60,0xCF61,0xCF62,0xCF63,0xCF66,0xCF68,0xCF6A,0xCF6B,/* 0x98-0x9F */ + 0xCF6C,0xAD0C,0xAD0D,0xAD0F,0xAD11,0xAD18,0xAD1C,0xAD20,/* 0xA0-0xA7 */ + 0xAD29,0xAD2C,0xAD2D,0xAD34,0xAD35,0xAD38,0xAD3C,0xAD44,/* 0xA8-0xAF */ + 0xAD45,0xAD47,0xAD49,0xAD50,0xAD54,0xAD58,0xAD61,0xAD63,/* 0xB0-0xB7 */ + 0xAD6C,0xAD6D,0xAD70,0xAD73,0xAD74,0xAD75,0xAD76,0xAD7B,/* 0xB8-0xBF */ + 0xAD7C,0xAD7D,0xAD7F,0xAD81,0xAD82,0xAD88,0xAD89,0xAD8C,/* 0xC0-0xC7 */ + 0xAD90,0xAD9C,0xAD9D,0xADA4,0xADB7,0xADC0,0xADC1,0xADC4,/* 0xC8-0xCF */ + 0xADC8,0xADD0,0xADD1,0xADD3,0xADDC,0xADE0,0xADE4,0xADF8,/* 0xD0-0xD7 */ + 0xADF9,0xADFC,0xADFF,0xAE00,0xAE01,0xAE08,0xAE09,0xAE0B,/* 0xD8-0xDF */ + 0xAE0D,0xAE14,0xAE30,0xAE31,0xAE34,0xAE37,0xAE38,0xAE3A,/* 0xE0-0xE7 */ + 0xAE40,0xAE41,0xAE43,0xAE45,0xAE46,0xAE4A,0xAE4C,0xAE4D,/* 0xE8-0xEF */ + 0xAE4E,0xAE50,0xAE54,0xAE56,0xAE5C,0xAE5D,0xAE5F,0xAE60,/* 0xF0-0xF7 */ + 0xAE61,0xAE65,0xAE68,0xAE69,0xAE6C,0xAE70,0xAE78,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCF6D,0xCF6E,0xCF6F,0xCF72,0xCF73,0xCF75,0xCF76,/* 0x40-0x47 */ + 0xCF77,0xCF79,0xCF7A,0xCF7B,0xCF7C,0xCF7D,0xCF7E,0xCF7F,/* 0x48-0x4F */ + 0xCF81,0xCF82,0xCF83,0xCF84,0xCF86,0xCF87,0xCF88,0xCF89,/* 0x50-0x57 */ + 0xCF8A,0xCF8B,0xCF8D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCF8E,0xCF8F,0xCF90,0xCF91,0xCF92,0xCF93,0xCF94,/* 0x60-0x67 */ + 0xCF95,0xCF96,0xCF97,0xCF98,0xCF99,0xCF9A,0xCF9B,0xCF9C,/* 0x68-0x6F */ + 0xCF9D,0xCF9E,0xCF9F,0xCFA0,0xCFA2,0xCFA3,0xCFA4,0xCFA5,/* 0x70-0x77 */ + 0xCFA6,0xCFA7,0xCFA9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xCFAA,0xCFAB,0xCFAC,0xCFAD,0xCFAE,0xCFAF,0xCFB1,/* 0x80-0x87 */ + 0xCFB2,0xCFB3,0xCFB4,0xCFB5,0xCFB6,0xCFB7,0xCFB8,0xCFB9,/* 0x88-0x8F */ + 0xCFBA,0xCFBB,0xCFBC,0xCFBD,0xCFBE,0xCFBF,0xCFC0,0xCFC1,/* 0x90-0x97 */ + 0xCFC2,0xCFC3,0xCFC5,0xCFC6,0xCFC7,0xCFC8,0xCFC9,0xCFCA,/* 0x98-0x9F */ + 0xCFCB,0xAE79,0xAE7B,0xAE7C,0xAE7D,0xAE84,0xAE85,0xAE8C,/* 0xA0-0xA7 */ + 0xAEBC,0xAEBD,0xAEBE,0xAEC0,0xAEC4,0xAECC,0xAECD,0xAECF,/* 0xA8-0xAF */ + 0xAED0,0xAED1,0xAED8,0xAED9,0xAEDC,0xAEE8,0xAEEB,0xAEED,/* 0xB0-0xB7 */ + 0xAEF4,0xAEF8,0xAEFC,0xAF07,0xAF08,0xAF0D,0xAF10,0xAF2C,/* 0xB8-0xBF */ + 
0xAF2D,0xAF30,0xAF32,0xAF34,0xAF3C,0xAF3D,0xAF3F,0xAF41,/* 0xC0-0xC7 */ + 0xAF42,0xAF43,0xAF48,0xAF49,0xAF50,0xAF5C,0xAF5D,0xAF64,/* 0xC8-0xCF */ + 0xAF65,0xAF79,0xAF80,0xAF84,0xAF88,0xAF90,0xAF91,0xAF95,/* 0xD0-0xD7 */ + 0xAF9C,0xAFB8,0xAFB9,0xAFBC,0xAFC0,0xAFC7,0xAFC8,0xAFC9,/* 0xD8-0xDF */ + 0xAFCB,0xAFCD,0xAFCE,0xAFD4,0xAFDC,0xAFE8,0xAFE9,0xAFF0,/* 0xE0-0xE7 */ + 0xAFF1,0xAFF4,0xAFF8,0xB000,0xB001,0xB004,0xB00C,0xB010,/* 0xE8-0xEF */ + 0xB014,0xB01C,0xB01D,0xB028,0xB044,0xB045,0xB048,0xB04A,/* 0xF0-0xF7 */ + 0xB04C,0xB04E,0xB053,0xB054,0xB055,0xB057,0xB059,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xCFCC,0xCFCD,0xCFCE,0xCFCF,0xCFD0,0xCFD1,0xCFD2,/* 0x40-0x47 */ + 0xCFD3,0xCFD4,0xCFD5,0xCFD6,0xCFD7,0xCFD8,0xCFD9,0xCFDA,/* 0x48-0x4F */ + 0xCFDB,0xCFDC,0xCFDD,0xCFDE,0xCFDF,0xCFE2,0xCFE3,0xCFE5,/* 0x50-0x57 */ + 0xCFE6,0xCFE7,0xCFE9,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xCFEA,0xCFEB,0xCFEC,0xCFED,0xCFEE,0xCFEF,0xCFF2,/* 0x60-0x67 */ + 0xCFF4,0xCFF6,0xCFF7,0xCFF8,0xCFF9,0xCFFA,0xCFFB,0xCFFD,/* 0x68-0x6F */ + 0xCFFE,0xCFFF,0xD001,0xD002,0xD003,0xD005,0xD006,0xD007,/* 0x70-0x77 */ + 0xD008,0xD009,0xD00A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD00B,0xD00C,0xD00D,0xD00E,0xD00F,0xD010,0xD012,/* 0x80-0x87 */ + 0xD013,0xD014,0xD015,0xD016,0xD017,0xD019,0xD01A,0xD01B,/* 0x88-0x8F */ + 0xD01C,0xD01D,0xD01E,0xD01F,0xD020,0xD021,0xD022,0xD023,/* 0x90-0x97 */ + 0xD024,0xD025,0xD026,0xD027,0xD028,0xD029,0xD02A,0xD02B,/* 0x98-0x9F */ + 0xD02C,0xB05D,0xB07C,0xB07D,0xB080,0xB084,0xB08C,0xB08D,/* 0xA0-0xA7 */ + 0xB08F,0xB091,0xB098,0xB099,0xB09A,0xB09C,0xB09F,0xB0A0,/* 0xA8-0xAF */ + 0xB0A1,0xB0A2,0xB0A8,0xB0A9,0xB0AB,0xB0AC,0xB0AD,0xB0AE,/* 0xB0-0xB7 */ + 0xB0AF,0xB0B1,0xB0B3,0xB0B4,0xB0B5,0xB0B8,0xB0BC,0xB0C4,/* 0xB8-0xBF */ + 0xB0C5,0xB0C7,0xB0C8,0xB0C9,0xB0D0,0xB0D1,0xB0D4,0xB0D8,/* 0xC0-0xC7 */ + 0xB0E0,0xB0E5,0xB108,0xB109,0xB10B,0xB10C,0xB110,0xB112,/* 0xC8-0xCF */ + 0xB113,0xB118,0xB119,0xB11B,0xB11C,0xB11D,0xB123,0xB124,/* 0xD0-0xD7 */ + 0xB125,0xB128,0xB12C,0xB134,0xB135,0xB137,0xB138,0xB139,/* 0xD8-0xDF */ + 0xB140,0xB141,0xB144,0xB148,0xB150,0xB151,0xB154,0xB155,/* 0xE0-0xE7 */ + 0xB158,0xB15C,0xB160,0xB178,0xB179,0xB17C,0xB180,0xB182,/* 0xE8-0xEF */ + 0xB188,0xB189,0xB18B,0xB18D,0xB192,0xB193,0xB194,0xB198,/* 0xF0-0xF7 */ + 0xB19C,0xB1A8,0xB1CC,0xB1D0,0xB1D4,0xB1DC,0xB1DD,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD02E,0xD02F,0xD030,0xD031,0xD032,0xD033,0xD036,/* 0x40-0x47 */ + 0xD037,0xD039,0xD03A,0xD03B,0xD03D,0xD03E,0xD03F,0xD040,/* 0x48-0x4F */ + 0xD041,0xD042,0xD043,0xD046,0xD048,0xD04A,0xD04B,0xD04C,/* 0x50-0x57 */ + 0xD04D,0xD04E,0xD04F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD051,0xD052,0xD053,0xD055,0xD056,0xD057,0xD059,/* 0x60-0x67 */ + 0xD05A,0xD05B,0xD05C,0xD05D,0xD05E,0xD05F,0xD061,0xD062,/* 0x68-0x6F */ + 0xD063,0xD064,0xD065,0xD066,0xD067,0xD068,0xD069,0xD06A,/* 0x70-0x77 */ + 0xD06B,0xD06E,0xD06F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD071,0xD072,0xD073,0xD075,0xD076,0xD077,0xD078,/* 0x80-0x87 */ + 0xD079,0xD07A,0xD07B,0xD07E,0xD07F,0xD080,0xD082,0xD083,/* 0x88-0x8F */ + 0xD084,0xD085,0xD086,0xD087,0xD088,0xD089,0xD08A,0xD08B,/* 0x90-0x97 */ + 0xD08C,0xD08D,0xD08E,0xD08F,0xD090,0xD091,0xD092,0xD093,/* 0x98-0x9F */ + 0xD094,0xB1DF,0xB1E8,0xB1E9,0xB1EC,0xB1F0,0xB1F9,0xB1FB,/* 0xA0-0xA7 */ + 0xB1FD,0xB204,0xB205,0xB208,0xB20B,0xB20C,0xB214,0xB215,/* 0xA8-0xAF */ + 0xB217,0xB219,0xB220,0xB234,0xB23C,0xB258,0xB25C,0xB260,/* 0xB0-0xB7 */ + 0xB268,0xB269,0xB274,0xB275,0xB27C,0xB284,0xB285,0xB289,/* 0xB8-0xBF */ + 0xB290,0xB291,0xB294,0xB298,0xB299,0xB29A,0xB2A0,0xB2A1,/* 0xC0-0xC7 */ + 0xB2A3,0xB2A5,0xB2A6,0xB2AA,0xB2AC,0xB2B0,0xB2B4,0xB2C8,/* 0xC8-0xCF */ + 0xB2C9,0xB2CC,0xB2D0,0xB2D2,0xB2D8,0xB2D9,0xB2DB,0xB2DD,/* 0xD0-0xD7 */ + 0xB2E2,0xB2E4,0xB2E5,0xB2E6,0xB2E8,0xB2EB,0xB2EC,0xB2ED,/* 0xD8-0xDF */ + 0xB2EE,0xB2EF,0xB2F3,0xB2F4,0xB2F5,0xB2F7,0xB2F8,0xB2F9,/* 0xE0-0xE7 */ + 0xB2FA,0xB2FB,0xB2FF,0xB300,0xB301,0xB304,0xB308,0xB310,/* 0xE8-0xEF */ + 0xB311,0xB313,0xB314,0xB315,0xB31C,0xB354,0xB355,0xB356,/* 0xF0-0xF7 */ + 0xB358,0xB35B,0xB35C,0xB35E,0xB35F,0xB364,0xB365,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD095,0xD096,0xD097,0xD098,0xD099,0xD09A,0xD09B,/* 0x40-0x47 */ + 0xD09C,0xD09D,0xD09E,0xD09F,0xD0A0,0xD0A1,0xD0A2,0xD0A3,/* 0x48-0x4F */ + 0xD0A6,0xD0A7,0xD0A9,0xD0AA,0xD0AB,0xD0AD,0xD0AE,0xD0AF,/* 0x50-0x57 */ + 0xD0B0,0xD0B1,0xD0B2,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD0B3,0xD0B6,0xD0B8,0xD0BA,0xD0BB,0xD0BC,0xD0BD,/* 0x60-0x67 */ + 0xD0BE,0xD0BF,0xD0C2,0xD0C3,0xD0C5,0xD0C6,0xD0C7,0xD0CA,/* 0x68-0x6F */ + 0xD0CB,0xD0CC,0xD0CD,0xD0CE,0xD0CF,0xD0D2,0xD0D6,0xD0D7,/* 0x70-0x77 */ + 0xD0D8,0xD0D9,0xD0DA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD0DB,0xD0DE,0xD0DF,0xD0E1,0xD0E2,0xD0E3,0xD0E5,/* 0x80-0x87 */ + 0xD0E6,0xD0E7,0xD0E8,0xD0E9,0xD0EA,0xD0EB,0xD0EE,0xD0F2,/* 0x88-0x8F */ + 0xD0F3,0xD0F4,0xD0F5,0xD0F6,0xD0F7,0xD0F9,0xD0FA,0xD0FB,/* 0x90-0x97 */ + 0xD0FC,0xD0FD,0xD0FE,0xD0FF,0xD100,0xD101,0xD102,0xD103,/* 0x98-0x9F */ + 0xD104,0xB367,0xB369,0xB36B,0xB36E,0xB370,0xB371,0xB374,/* 0xA0-0xA7 */ + 0xB378,0xB380,0xB381,0xB383,0xB384,0xB385,0xB38C,0xB390,/* 0xA8-0xAF */ + 
0xB394,0xB3A0,0xB3A1,0xB3A8,0xB3AC,0xB3C4,0xB3C5,0xB3C8,/* 0xB0-0xB7 */ + 0xB3CB,0xB3CC,0xB3CE,0xB3D0,0xB3D4,0xB3D5,0xB3D7,0xB3D9,/* 0xB8-0xBF */ + 0xB3DB,0xB3DD,0xB3E0,0xB3E4,0xB3E8,0xB3FC,0xB410,0xB418,/* 0xC0-0xC7 */ + 0xB41C,0xB420,0xB428,0xB429,0xB42B,0xB434,0xB450,0xB451,/* 0xC8-0xCF */ + 0xB454,0xB458,0xB460,0xB461,0xB463,0xB465,0xB46C,0xB480,/* 0xD0-0xD7 */ + 0xB488,0xB49D,0xB4A4,0xB4A8,0xB4AC,0xB4B5,0xB4B7,0xB4B9,/* 0xD8-0xDF */ + 0xB4C0,0xB4C4,0xB4C8,0xB4D0,0xB4D5,0xB4DC,0xB4DD,0xB4E0,/* 0xE0-0xE7 */ + 0xB4E3,0xB4E4,0xB4E6,0xB4EC,0xB4ED,0xB4EF,0xB4F1,0xB4F8,/* 0xE8-0xEF */ + 0xB514,0xB515,0xB518,0xB51B,0xB51C,0xB524,0xB525,0xB527,/* 0xF0-0xF7 */ + 0xB528,0xB529,0xB52A,0xB530,0xB531,0xB534,0xB538,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD105,0xD106,0xD107,0xD108,0xD109,0xD10A,0xD10B,/* 0x40-0x47 */ + 0xD10C,0xD10E,0xD10F,0xD110,0xD111,0xD112,0xD113,0xD114,/* 0x48-0x4F */ + 0xD115,0xD116,0xD117,0xD118,0xD119,0xD11A,0xD11B,0xD11C,/* 0x50-0x57 */ + 0xD11D,0xD11E,0xD11F,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD120,0xD121,0xD122,0xD123,0xD124,0xD125,0xD126,/* 0x60-0x67 */ + 0xD127,0xD128,0xD129,0xD12A,0xD12B,0xD12C,0xD12D,0xD12E,/* 0x68-0x6F */ + 0xD12F,0xD132,0xD133,0xD135,0xD136,0xD137,0xD139,0xD13B,/* 0x70-0x77 */ + 0xD13C,0xD13D,0xD13E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD13F,0xD142,0xD146,0xD147,0xD148,0xD149,0xD14A,/* 0x80-0x87 */ + 0xD14B,0xD14E,0xD14F,0xD151,0xD152,0xD153,0xD155,0xD156,/* 0x88-0x8F */ + 0xD157,0xD158,0xD159,0xD15A,0xD15B,0xD15E,0xD160,0xD162,/* 0x90-0x97 */ + 0xD163,0xD164,0xD165,0xD166,0xD167,0xD169,0xD16A,0xD16B,/* 0x98-0x9F */ + 0xD16D,0xB540,0xB541,0xB543,0xB544,0xB545,0xB54B,0xB54C,/* 0xA0-0xA7 */ + 0xB54D,0xB550,0xB554,0xB55C,0xB55D,0xB55F,0xB560,0xB561,/* 0xA8-0xAF */ + 0xB5A0,0xB5A1,0xB5A4,0xB5A8,0xB5AA,0xB5AB,0xB5B0,0xB5B1,/* 0xB0-0xB7 */ + 0xB5B3,0xB5B4,0xB5B5,0xB5BB,0xB5BC,0xB5BD,0xB5C0,0xB5C4,/* 0xB8-0xBF */ + 0xB5CC,0xB5CD,0xB5CF,0xB5D0,0xB5D1,0xB5D8,0xB5EC,0xB610,/* 0xC0-0xC7 */ + 0xB611,0xB614,0xB618,0xB625,0xB62C,0xB634,0xB648,0xB664,/* 0xC8-0xCF */ + 0xB668,0xB69C,0xB69D,0xB6A0,0xB6A4,0xB6AB,0xB6AC,0xB6B1,/* 0xD0-0xD7 */ + 0xB6D4,0xB6F0,0xB6F4,0xB6F8,0xB700,0xB701,0xB705,0xB728,/* 0xD8-0xDF */ + 0xB729,0xB72C,0xB72F,0xB730,0xB738,0xB739,0xB73B,0xB744,/* 0xE0-0xE7 */ + 0xB748,0xB74C,0xB754,0xB755,0xB760,0xB764,0xB768,0xB770,/* 0xE8-0xEF */ + 0xB771,0xB773,0xB775,0xB77C,0xB77D,0xB780,0xB784,0xB78C,/* 0xF0-0xF7 */ + 0xB78D,0xB78F,0xB790,0xB791,0xB792,0xB796,0xB797,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD16E,0xD16F,0xD170,0xD171,0xD172,0xD173,0xD174,/* 0x40-0x47 */ + 0xD175,0xD176,0xD177,0xD178,0xD179,0xD17A,0xD17B,0xD17D,/* 0x48-0x4F */ + 0xD17E,0xD17F,0xD180,0xD181,0xD182,0xD183,0xD185,0xD186,/* 0x50-0x57 */ + 0xD187,0xD189,0xD18A,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD18B,0xD18C,0xD18D,0xD18E,0xD18F,0xD190,0xD191,/* 0x60-0x67 */ + 0xD192,0xD193,0xD194,0xD195,0xD196,0xD197,0xD198,0xD199,/* 0x68-0x6F */ + 0xD19A,0xD19B,0xD19C,0xD19D,0xD19E,0xD19F,0xD1A2,0xD1A3,/* 0x70-0x77 */ + 0xD1A5,0xD1A6,0xD1A7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD1A9,0xD1AA,0xD1AB,0xD1AC,0xD1AD,0xD1AE,0xD1AF,/* 0x80-0x87 */ + 0xD1B2,0xD1B4,0xD1B6,0xD1B7,0xD1B8,0xD1B9,0xD1BB,0xD1BD,/* 0x88-0x8F */ + 0xD1BE,0xD1BF,0xD1C1,0xD1C2,0xD1C3,0xD1C4,0xD1C5,0xD1C6,/* 0x90-0x97 */ + 0xD1C7,0xD1C8,0xD1C9,0xD1CA,0xD1CB,0xD1CC,0xD1CD,0xD1CE,/* 0x98-0x9F */ + 0xD1CF,0xB798,0xB799,0xB79C,0xB7A0,0xB7A8,0xB7A9,0xB7AB,/* 0xA0-0xA7 */ + 0xB7AC,0xB7AD,0xB7B4,0xB7B5,0xB7B8,0xB7C7,0xB7C9,0xB7EC,/* 0xA8-0xAF */ + 0xB7ED,0xB7F0,0xB7F4,0xB7FC,0xB7FD,0xB7FF,0xB800,0xB801,/* 0xB0-0xB7 */ + 0xB807,0xB808,0xB809,0xB80C,0xB810,0xB818,0xB819,0xB81B,/* 0xB8-0xBF */ + 0xB81D,0xB824,0xB825,0xB828,0xB82C,0xB834,0xB835,0xB837,/* 0xC0-0xC7 */ + 0xB838,0xB839,0xB840,0xB844,0xB851,0xB853,0xB85C,0xB85D,/* 0xC8-0xCF */ + 0xB860,0xB864,0xB86C,0xB86D,0xB86F,0xB871,0xB878,0xB87C,/* 0xD0-0xD7 */ + 0xB88D,0xB8A8,0xB8B0,0xB8B4,0xB8B8,0xB8C0,0xB8C1,0xB8C3,/* 0xD8-0xDF */ + 0xB8C5,0xB8CC,0xB8D0,0xB8D4,0xB8DD,0xB8DF,0xB8E1,0xB8E8,/* 0xE0-0xE7 */ + 0xB8E9,0xB8EC,0xB8F0,0xB8F8,0xB8F9,0xB8FB,0xB8FD,0xB904,/* 0xE8-0xEF */ + 0xB918,0xB920,0xB93C,0xB93D,0xB940,0xB944,0xB94C,0xB94F,/* 0xF0-0xF7 */ + 0xB951,0xB958,0xB959,0xB95C,0xB960,0xB968,0xB969,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD1D0,0xD1D1,0xD1D2,0xD1D3,0xD1D4,0xD1D5,0xD1D6,/* 0x40-0x47 */ + 0xD1D7,0xD1D9,0xD1DA,0xD1DB,0xD1DC,0xD1DD,0xD1DE,0xD1DF,/* 0x48-0x4F */ + 0xD1E0,0xD1E1,0xD1E2,0xD1E3,0xD1E4,0xD1E5,0xD1E6,0xD1E7,/* 0x50-0x57 */ + 0xD1E8,0xD1E9,0xD1EA,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD1EB,0xD1EC,0xD1ED,0xD1EE,0xD1EF,0xD1F0,0xD1F1,/* 0x60-0x67 */ + 0xD1F2,0xD1F3,0xD1F5,0xD1F6,0xD1F7,0xD1F9,0xD1FA,0xD1FB,/* 0x68-0x6F */ + 0xD1FC,0xD1FD,0xD1FE,0xD1FF,0xD200,0xD201,0xD202,0xD203,/* 0x70-0x77 */ + 0xD204,0xD205,0xD206,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD208,0xD20A,0xD20B,0xD20C,0xD20D,0xD20E,0xD20F,/* 0x80-0x87 */ + 0xD211,0xD212,0xD213,0xD214,0xD215,0xD216,0xD217,0xD218,/* 0x88-0x8F */ + 0xD219,0xD21A,0xD21B,0xD21C,0xD21D,0xD21E,0xD21F,0xD220,/* 0x90-0x97 */ + 0xD221,0xD222,0xD223,0xD224,0xD225,0xD226,0xD227,0xD228,/* 0x98-0x9F */ + 
0xD229,0xB96B,0xB96D,0xB974,0xB975,0xB978,0xB97C,0xB984,/* 0xA0-0xA7 */ + 0xB985,0xB987,0xB989,0xB98A,0xB98D,0xB98E,0xB9AC,0xB9AD,/* 0xA8-0xAF */ + 0xB9B0,0xB9B4,0xB9BC,0xB9BD,0xB9BF,0xB9C1,0xB9C8,0xB9C9,/* 0xB0-0xB7 */ + 0xB9CC,0xB9CE,0xB9CF,0xB9D0,0xB9D1,0xB9D2,0xB9D8,0xB9D9,/* 0xB8-0xBF */ + 0xB9DB,0xB9DD,0xB9DE,0xB9E1,0xB9E3,0xB9E4,0xB9E5,0xB9E8,/* 0xC0-0xC7 */ + 0xB9EC,0xB9F4,0xB9F5,0xB9F7,0xB9F8,0xB9F9,0xB9FA,0xBA00,/* 0xC8-0xCF */ + 0xBA01,0xBA08,0xBA15,0xBA38,0xBA39,0xBA3C,0xBA40,0xBA42,/* 0xD0-0xD7 */ + 0xBA48,0xBA49,0xBA4B,0xBA4D,0xBA4E,0xBA53,0xBA54,0xBA55,/* 0xD8-0xDF */ + 0xBA58,0xBA5C,0xBA64,0xBA65,0xBA67,0xBA68,0xBA69,0xBA70,/* 0xE0-0xE7 */ + 0xBA71,0xBA74,0xBA78,0xBA83,0xBA84,0xBA85,0xBA87,0xBA8C,/* 0xE8-0xEF */ + 0xBAA8,0xBAA9,0xBAAB,0xBAAC,0xBAB0,0xBAB2,0xBAB8,0xBAB9,/* 0xF0-0xF7 */ + 0xBABB,0xBABD,0xBAC4,0xBAC8,0xBAD8,0xBAD9,0xBAFC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD22A,0xD22B,0xD22E,0xD22F,0xD231,0xD232,0xD233,/* 0x40-0x47 */ + 0xD235,0xD236,0xD237,0xD238,0xD239,0xD23A,0xD23B,0xD23E,/* 0x48-0x4F */ + 0xD240,0xD242,0xD243,0xD244,0xD245,0xD246,0xD247,0xD249,/* 0x50-0x57 */ + 0xD24A,0xD24B,0xD24C,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD24D,0xD24E,0xD24F,0xD250,0xD251,0xD252,0xD253,/* 0x60-0x67 */ + 0xD254,0xD255,0xD256,0xD257,0xD258,0xD259,0xD25A,0xD25B,/* 0x68-0x6F */ + 0xD25D,0xD25E,0xD25F,0xD260,0xD261,0xD262,0xD263,0xD265,/* 0x70-0x77 */ + 0xD266,0xD267,0xD268,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD269,0xD26A,0xD26B,0xD26C,0xD26D,0xD26E,0xD26F,/* 0x80-0x87 */ + 0xD270,0xD271,0xD272,0xD273,0xD274,0xD275,0xD276,0xD277,/* 0x88-0x8F */ + 0xD278,0xD279,0xD27A,0xD27B,0xD27C,0xD27D,0xD27E,0xD27F,/* 0x90-0x97 */ + 0xD282,0xD283,0xD285,0xD286,0xD287,0xD289,0xD28A,0xD28B,/* 0x98-0x9F */ + 0xD28C,0xBB00,0xBB04,0xBB0D,0xBB0F,0xBB11,0xBB18,0xBB1C,/* 0xA0-0xA7 */ + 0xBB20,0xBB29,0xBB2B,0xBB34,0xBB35,0xBB36,0xBB38,0xBB3B,/* 0xA8-0xAF */ + 0xBB3C,0xBB3D,0xBB3E,0xBB44,0xBB45,0xBB47,0xBB49,0xBB4D,/* 0xB0-0xB7 */ + 0xBB4F,0xBB50,0xBB54,0xBB58,0xBB61,0xBB63,0xBB6C,0xBB88,/* 0xB8-0xBF */ + 0xBB8C,0xBB90,0xBBA4,0xBBA8,0xBBAC,0xBBB4,0xBBB7,0xBBC0,/* 0xC0-0xC7 */ + 0xBBC4,0xBBC8,0xBBD0,0xBBD3,0xBBF8,0xBBF9,0xBBFC,0xBBFF,/* 0xC8-0xCF */ + 0xBC00,0xBC02,0xBC08,0xBC09,0xBC0B,0xBC0C,0xBC0D,0xBC0F,/* 0xD0-0xD7 */ + 0xBC11,0xBC14,0xBC15,0xBC16,0xBC17,0xBC18,0xBC1B,0xBC1C,/* 0xD8-0xDF */ + 0xBC1D,0xBC1E,0xBC1F,0xBC24,0xBC25,0xBC27,0xBC29,0xBC2D,/* 0xE0-0xE7 */ + 0xBC30,0xBC31,0xBC34,0xBC38,0xBC40,0xBC41,0xBC43,0xBC44,/* 0xE8-0xEF */ + 0xBC45,0xBC49,0xBC4C,0xBC4D,0xBC50,0xBC5D,0xBC84,0xBC85,/* 0xF0-0xF7 */ + 0xBC88,0xBC8B,0xBC8C,0xBC8E,0xBC94,0xBC95,0xBC97,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD28D,0xD28E,0xD28F,0xD292,0xD293,0xD294,0xD296,/* 0x40-0x47 */ + 0xD297,0xD298,0xD299,0xD29A,0xD29B,0xD29D,0xD29E,0xD29F,/* 0x48-0x4F */ + 0xD2A1,0xD2A2,0xD2A3,0xD2A5,0xD2A6,0xD2A7,0xD2A8,0xD2A9,/* 0x50-0x57 */ + 0xD2AA,0xD2AB,0xD2AD,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD2AE,0xD2AF,0xD2B0,0xD2B2,0xD2B3,0xD2B4,0xD2B5,/* 0x60-0x67 */ + 0xD2B6,0xD2B7,0xD2BA,0xD2BB,0xD2BD,0xD2BE,0xD2C1,0xD2C3,/* 0x68-0x6F */ + 0xD2C4,0xD2C5,0xD2C6,0xD2C7,0xD2CA,0xD2CC,0xD2CD,0xD2CE,/* 0x70-0x77 */ + 0xD2CF,0xD2D0,0xD2D1,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD2D2,0xD2D3,0xD2D5,0xD2D6,0xD2D7,0xD2D9,0xD2DA,/* 0x80-0x87 */ + 0xD2DB,0xD2DD,0xD2DE,0xD2DF,0xD2E0,0xD2E1,0xD2E2,0xD2E3,/* 0x88-0x8F */ + 0xD2E6,0xD2E7,0xD2E8,0xD2E9,0xD2EA,0xD2EB,0xD2EC,0xD2ED,/* 0x90-0x97 */ + 0xD2EE,0xD2EF,0xD2F2,0xD2F3,0xD2F5,0xD2F6,0xD2F7,0xD2F9,/* 0x98-0x9F */ + 0xD2FA,0xBC99,0xBC9A,0xBCA0,0xBCA1,0xBCA4,0xBCA7,0xBCA8,/* 0xA0-0xA7 */ + 0xBCB0,0xBCB1,0xBCB3,0xBCB4,0xBCB5,0xBCBC,0xBCBD,0xBCC0,/* 0xA8-0xAF */ + 0xBCC4,0xBCCD,0xBCCF,0xBCD0,0xBCD1,0xBCD5,0xBCD8,0xBCDC,/* 0xB0-0xB7 */ + 0xBCF4,0xBCF5,0xBCF6,0xBCF8,0xBCFC,0xBD04,0xBD05,0xBD07,/* 0xB8-0xBF */ + 0xBD09,0xBD10,0xBD14,0xBD24,0xBD2C,0xBD40,0xBD48,0xBD49,/* 0xC0-0xC7 */ + 0xBD4C,0xBD50,0xBD58,0xBD59,0xBD64,0xBD68,0xBD80,0xBD81,/* 0xC8-0xCF */ + 0xBD84,0xBD87,0xBD88,0xBD89,0xBD8A,0xBD90,0xBD91,0xBD93,/* 0xD0-0xD7 */ + 0xBD95,0xBD99,0xBD9A,0xBD9C,0xBDA4,0xBDB0,0xBDB8,0xBDD4,/* 0xD8-0xDF */ + 0xBDD5,0xBDD8,0xBDDC,0xBDE9,0xBDF0,0xBDF4,0xBDF8,0xBE00,/* 0xE0-0xE7 */ + 0xBE03,0xBE05,0xBE0C,0xBE0D,0xBE10,0xBE14,0xBE1C,0xBE1D,/* 0xE8-0xEF */ + 0xBE1F,0xBE44,0xBE45,0xBE48,0xBE4C,0xBE4E,0xBE54,0xBE55,/* 0xF0-0xF7 */ + 0xBE57,0xBE59,0xBE5A,0xBE5B,0xBE60,0xBE61,0xBE64,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD2FB,0xD2FC,0xD2FD,0xD2FE,0xD2FF,0xD302,0xD304,/* 0x40-0x47 */ + 0xD306,0xD307,0xD308,0xD309,0xD30A,0xD30B,0xD30F,0xD311,/* 0x48-0x4F */ + 0xD312,0xD313,0xD315,0xD317,0xD318,0xD319,0xD31A,0xD31B,/* 0x50-0x57 */ + 0xD31E,0xD322,0xD323,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD324,0xD326,0xD327,0xD32A,0xD32B,0xD32D,0xD32E,/* 0x60-0x67 */ + 0xD32F,0xD331,0xD332,0xD333,0xD334,0xD335,0xD336,0xD337,/* 0x68-0x6F */ + 0xD33A,0xD33E,0xD33F,0xD340,0xD341,0xD342,0xD343,0xD346,/* 0x70-0x77 */ + 0xD347,0xD348,0xD349,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD34A,0xD34B,0xD34C,0xD34D,0xD34E,0xD34F,0xD350,/* 0x80-0x87 */ + 0xD351,0xD352,0xD353,0xD354,0xD355,0xD356,0xD357,0xD358,/* 0x88-0x8F */ + 
0xD359,0xD35A,0xD35B,0xD35C,0xD35D,0xD35E,0xD35F,0xD360,/* 0x90-0x97 */ + 0xD361,0xD362,0xD363,0xD364,0xD365,0xD366,0xD367,0xD368,/* 0x98-0x9F */ + 0xD369,0xBE68,0xBE6A,0xBE70,0xBE71,0xBE73,0xBE74,0xBE75,/* 0xA0-0xA7 */ + 0xBE7B,0xBE7C,0xBE7D,0xBE80,0xBE84,0xBE8C,0xBE8D,0xBE8F,/* 0xA8-0xAF */ + 0xBE90,0xBE91,0xBE98,0xBE99,0xBEA8,0xBED0,0xBED1,0xBED4,/* 0xB0-0xB7 */ + 0xBED7,0xBED8,0xBEE0,0xBEE3,0xBEE4,0xBEE5,0xBEEC,0xBF01,/* 0xB8-0xBF */ + 0xBF08,0xBF09,0xBF18,0xBF19,0xBF1B,0xBF1C,0xBF1D,0xBF40,/* 0xC0-0xC7 */ + 0xBF41,0xBF44,0xBF48,0xBF50,0xBF51,0xBF55,0xBF94,0xBFB0,/* 0xC8-0xCF */ + 0xBFC5,0xBFCC,0xBFCD,0xBFD0,0xBFD4,0xBFDC,0xBFDF,0xBFE1,/* 0xD0-0xD7 */ + 0xC03C,0xC051,0xC058,0xC05C,0xC060,0xC068,0xC069,0xC090,/* 0xD8-0xDF */ + 0xC091,0xC094,0xC098,0xC0A0,0xC0A1,0xC0A3,0xC0A5,0xC0AC,/* 0xE0-0xE7 */ + 0xC0AD,0xC0AF,0xC0B0,0xC0B3,0xC0B4,0xC0B5,0xC0B6,0xC0BC,/* 0xE8-0xEF */ + 0xC0BD,0xC0BF,0xC0C0,0xC0C1,0xC0C5,0xC0C8,0xC0C9,0xC0CC,/* 0xF0-0xF7 */ + 0xC0D0,0xC0D8,0xC0D9,0xC0DB,0xC0DC,0xC0DD,0xC0E4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD36A,0xD36B,0xD36C,0xD36D,0xD36E,0xD36F,0xD370,/* 0x40-0x47 */ + 0xD371,0xD372,0xD373,0xD374,0xD375,0xD376,0xD377,0xD378,/* 0x48-0x4F */ + 0xD379,0xD37A,0xD37B,0xD37E,0xD37F,0xD381,0xD382,0xD383,/* 0x50-0x57 */ + 0xD385,0xD386,0xD387,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD388,0xD389,0xD38A,0xD38B,0xD38E,0xD392,0xD393,/* 0x60-0x67 */ + 0xD394,0xD395,0xD396,0xD397,0xD39A,0xD39B,0xD39D,0xD39E,/* 0x68-0x6F */ + 0xD39F,0xD3A1,0xD3A2,0xD3A3,0xD3A4,0xD3A5,0xD3A6,0xD3A7,/* 0x70-0x77 */ + 0xD3AA,0xD3AC,0xD3AE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD3AF,0xD3B0,0xD3B1,0xD3B2,0xD3B3,0xD3B5,0xD3B6,/* 0x80-0x87 */ + 0xD3B7,0xD3B9,0xD3BA,0xD3BB,0xD3BD,0xD3BE,0xD3BF,0xD3C0,/* 0x88-0x8F */ + 0xD3C1,0xD3C2,0xD3C3,0xD3C6,0xD3C7,0xD3CA,0xD3CB,0xD3CC,/* 0x90-0x97 */ + 0xD3CD,0xD3CE,0xD3CF,0xD3D1,0xD3D2,0xD3D3,0xD3D4,0xD3D5,/* 0x98-0x9F */ + 0xD3D6,0xC0E5,0xC0E8,0xC0EC,0xC0F4,0xC0F5,0xC0F7,0xC0F9,/* 0xA0-0xA7 */ + 0xC100,0xC104,0xC108,0xC110,0xC115,0xC11C,0xC11D,0xC11E,/* 0xA8-0xAF */ + 0xC11F,0xC120,0xC123,0xC124,0xC126,0xC127,0xC12C,0xC12D,/* 0xB0-0xB7 */ + 0xC12F,0xC130,0xC131,0xC136,0xC138,0xC139,0xC13C,0xC140,/* 0xB8-0xBF */ + 0xC148,0xC149,0xC14B,0xC14C,0xC14D,0xC154,0xC155,0xC158,/* 0xC0-0xC7 */ + 0xC15C,0xC164,0xC165,0xC167,0xC168,0xC169,0xC170,0xC174,/* 0xC8-0xCF */ + 0xC178,0xC185,0xC18C,0xC18D,0xC18E,0xC190,0xC194,0xC196,/* 0xD0-0xD7 */ + 0xC19C,0xC19D,0xC19F,0xC1A1,0xC1A5,0xC1A8,0xC1A9,0xC1AC,/* 0xD8-0xDF */ + 0xC1B0,0xC1BD,0xC1C4,0xC1C8,0xC1CC,0xC1D4,0xC1D7,0xC1D8,/* 0xE0-0xE7 */ + 0xC1E0,0xC1E4,0xC1E8,0xC1F0,0xC1F1,0xC1F3,0xC1FC,0xC1FD,/* 0xE8-0xEF */ + 0xC200,0xC204,0xC20C,0xC20D,0xC20F,0xC211,0xC218,0xC219,/* 0xF0-0xF7 */ + 0xC21C,0xC21F,0xC220,0xC228,0xC229,0xC22B,0xC22D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD3D7,0xD3D9,0xD3DA,0xD3DB,0xD3DC,0xD3DD,0xD3DE,/* 0x40-0x47 */ + 0xD3DF,0xD3E0,0xD3E2,0xD3E4,0xD3E5,0xD3E6,0xD3E7,0xD3E8,/* 0x48-0x4F */ + 0xD3E9,0xD3EA,0xD3EB,0xD3EE,0xD3EF,0xD3F1,0xD3F2,0xD3F3,/* 0x50-0x57 */ + 0xD3F5,0xD3F6,0xD3F7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD3F8,0xD3F9,0xD3FA,0xD3FB,0xD3FE,0xD400,0xD402,/* 0x60-0x67 */ + 0xD403,0xD404,0xD405,0xD406,0xD407,0xD409,0xD40A,0xD40B,/* 0x68-0x6F */ + 0xD40C,0xD40D,0xD40E,0xD40F,0xD410,0xD411,0xD412,0xD413,/* 0x70-0x77 */ + 0xD414,0xD415,0xD416,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD417,0xD418,0xD419,0xD41A,0xD41B,0xD41C,0xD41E,/* 0x80-0x87 */ + 0xD41F,0xD420,0xD421,0xD422,0xD423,0xD424,0xD425,0xD426,/* 0x88-0x8F */ + 0xD427,0xD428,0xD429,0xD42A,0xD42B,0xD42C,0xD42D,0xD42E,/* 0x90-0x97 */ + 0xD42F,0xD430,0xD431,0xD432,0xD433,0xD434,0xD435,0xD436,/* 0x98-0x9F */ + 0xD437,0xC22F,0xC231,0xC232,0xC234,0xC248,0xC250,0xC251,/* 0xA0-0xA7 */ + 0xC254,0xC258,0xC260,0xC265,0xC26C,0xC26D,0xC270,0xC274,/* 0xA8-0xAF */ + 0xC27C,0xC27D,0xC27F,0xC281,0xC288,0xC289,0xC290,0xC298,/* 0xB0-0xB7 */ + 0xC29B,0xC29D,0xC2A4,0xC2A5,0xC2A8,0xC2AC,0xC2AD,0xC2B4,/* 0xB8-0xBF */ + 0xC2B5,0xC2B7,0xC2B9,0xC2DC,0xC2DD,0xC2E0,0xC2E3,0xC2E4,/* 0xC0-0xC7 */ + 0xC2EB,0xC2EC,0xC2ED,0xC2EF,0xC2F1,0xC2F6,0xC2F8,0xC2F9,/* 0xC8-0xCF */ + 0xC2FB,0xC2FC,0xC300,0xC308,0xC309,0xC30C,0xC30D,0xC313,/* 0xD0-0xD7 */ + 0xC314,0xC315,0xC318,0xC31C,0xC324,0xC325,0xC328,0xC329,/* 0xD8-0xDF */ + 0xC345,0xC368,0xC369,0xC36C,0xC370,0xC372,0xC378,0xC379,/* 0xE0-0xE7 */ + 0xC37C,0xC37D,0xC384,0xC388,0xC38C,0xC3C0,0xC3D8,0xC3D9,/* 0xE8-0xEF */ + 0xC3DC,0xC3DF,0xC3E0,0xC3E2,0xC3E8,0xC3E9,0xC3ED,0xC3F4,/* 0xF0-0xF7 */ + 0xC3F5,0xC3F8,0xC408,0xC410,0xC424,0xC42C,0xC430,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD438,0xD439,0xD43A,0xD43B,0xD43C,0xD43D,0xD43E,/* 0x40-0x47 */ + 0xD43F,0xD441,0xD442,0xD443,0xD445,0xD446,0xD447,0xD448,/* 0x48-0x4F */ + 0xD449,0xD44A,0xD44B,0xD44C,0xD44D,0xD44E,0xD44F,0xD450,/* 0x50-0x57 */ + 0xD451,0xD452,0xD453,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD454,0xD455,0xD456,0xD457,0xD458,0xD459,0xD45A,/* 0x60-0x67 */ + 0xD45B,0xD45D,0xD45E,0xD45F,0xD461,0xD462,0xD463,0xD465,/* 0x68-0x6F */ + 0xD466,0xD467,0xD468,0xD469,0xD46A,0xD46B,0xD46C,0xD46E,/* 0x70-0x77 */ + 0xD470,0xD471,0xD472,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 
0x0000,0xD473,0xD474,0xD475,0xD476,0xD477,0xD47A,0xD47B,/* 0x80-0x87 */ + 0xD47D,0xD47E,0xD481,0xD483,0xD484,0xD485,0xD486,0xD487,/* 0x88-0x8F */ + 0xD48A,0xD48C,0xD48E,0xD48F,0xD490,0xD491,0xD492,0xD493,/* 0x90-0x97 */ + 0xD495,0xD496,0xD497,0xD498,0xD499,0xD49A,0xD49B,0xD49C,/* 0x98-0x9F */ + 0xD49D,0xC434,0xC43C,0xC43D,0xC448,0xC464,0xC465,0xC468,/* 0xA0-0xA7 */ + 0xC46C,0xC474,0xC475,0xC479,0xC480,0xC494,0xC49C,0xC4B8,/* 0xA8-0xAF */ + 0xC4BC,0xC4E9,0xC4F0,0xC4F1,0xC4F4,0xC4F8,0xC4FA,0xC4FF,/* 0xB0-0xB7 */ + 0xC500,0xC501,0xC50C,0xC510,0xC514,0xC51C,0xC528,0xC529,/* 0xB8-0xBF */ + 0xC52C,0xC530,0xC538,0xC539,0xC53B,0xC53D,0xC544,0xC545,/* 0xC0-0xC7 */ + 0xC548,0xC549,0xC54A,0xC54C,0xC54D,0xC54E,0xC553,0xC554,/* 0xC8-0xCF */ + 0xC555,0xC557,0xC558,0xC559,0xC55D,0xC55E,0xC560,0xC561,/* 0xD0-0xD7 */ + 0xC564,0xC568,0xC570,0xC571,0xC573,0xC574,0xC575,0xC57C,/* 0xD8-0xDF */ + 0xC57D,0xC580,0xC584,0xC587,0xC58C,0xC58D,0xC58F,0xC591,/* 0xE0-0xE7 */ + 0xC595,0xC597,0xC598,0xC59C,0xC5A0,0xC5A9,0xC5B4,0xC5B5,/* 0xE8-0xEF */ + 0xC5B8,0xC5B9,0xC5BB,0xC5BC,0xC5BD,0xC5BE,0xC5C4,0xC5C5,/* 0xF0-0xF7 */ + 0xC5C6,0xC5C7,0xC5C8,0xC5C9,0xC5CA,0xC5CC,0xC5CE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD49E,0xD49F,0xD4A0,0xD4A1,0xD4A2,0xD4A3,0xD4A4,/* 0x40-0x47 */ + 0xD4A5,0xD4A6,0xD4A7,0xD4A8,0xD4AA,0xD4AB,0xD4AC,0xD4AD,/* 0x48-0x4F */ + 0xD4AE,0xD4AF,0xD4B0,0xD4B1,0xD4B2,0xD4B3,0xD4B4,0xD4B5,/* 0x50-0x57 */ + 0xD4B6,0xD4B7,0xD4B8,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD4B9,0xD4BA,0xD4BB,0xD4BC,0xD4BD,0xD4BE,0xD4BF,/* 0x60-0x67 */ + 0xD4C0,0xD4C1,0xD4C2,0xD4C3,0xD4C4,0xD4C5,0xD4C6,0xD4C7,/* 0x68-0x6F */ + 0xD4C8,0xD4C9,0xD4CA,0xD4CB,0xD4CD,0xD4CE,0xD4CF,0xD4D1,/* 0x70-0x77 */ + 0xD4D2,0xD4D3,0xD4D5,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD4D6,0xD4D7,0xD4D8,0xD4D9,0xD4DA,0xD4DB,0xD4DD,/* 0x80-0x87 */ + 0xD4DE,0xD4E0,0xD4E1,0xD4E2,0xD4E3,0xD4E4,0xD4E5,0xD4E6,/* 0x88-0x8F */ + 0xD4E7,0xD4E9,0xD4EA,0xD4EB,0xD4ED,0xD4EE,0xD4EF,0xD4F1,/* 0x90-0x97 */ + 0xD4F2,0xD4F3,0xD4F4,0xD4F5,0xD4F6,0xD4F7,0xD4F9,0xD4FA,/* 0x98-0x9F */ + 0xD4FC,0xC5D0,0xC5D1,0xC5D4,0xC5D8,0xC5E0,0xC5E1,0xC5E3,/* 0xA0-0xA7 */ + 0xC5E5,0xC5EC,0xC5ED,0xC5EE,0xC5F0,0xC5F4,0xC5F6,0xC5F7,/* 0xA8-0xAF */ + 0xC5FC,0xC5FD,0xC5FE,0xC5FF,0xC600,0xC601,0xC605,0xC606,/* 0xB0-0xB7 */ + 0xC607,0xC608,0xC60C,0xC610,0xC618,0xC619,0xC61B,0xC61C,/* 0xB8-0xBF */ + 0xC624,0xC625,0xC628,0xC62C,0xC62D,0xC62E,0xC630,0xC633,/* 0xC0-0xC7 */ + 0xC634,0xC635,0xC637,0xC639,0xC63B,0xC640,0xC641,0xC644,/* 0xC8-0xCF */ + 0xC648,0xC650,0xC651,0xC653,0xC654,0xC655,0xC65C,0xC65D,/* 0xD0-0xD7 */ + 0xC660,0xC66C,0xC66F,0xC671,0xC678,0xC679,0xC67C,0xC680,/* 0xD8-0xDF */ + 0xC688,0xC689,0xC68B,0xC68D,0xC694,0xC695,0xC698,0xC69C,/* 0xE0-0xE7 */ + 0xC6A4,0xC6A5,0xC6A7,0xC6A9,0xC6B0,0xC6B1,0xC6B4,0xC6B8,/* 0xE8-0xEF */ + 0xC6B9,0xC6BA,0xC6C0,0xC6C1,0xC6C3,0xC6C5,0xC6CC,0xC6CD,/* 0xF0-0xF7 */ + 
0xC6D0,0xC6D4,0xC6DC,0xC6DD,0xC6E0,0xC6E1,0xC6E8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD4FE,0xD4FF,0xD500,0xD501,0xD502,0xD503,0xD505,/* 0x40-0x47 */ + 0xD506,0xD507,0xD509,0xD50A,0xD50B,0xD50D,0xD50E,0xD50F,/* 0x48-0x4F */ + 0xD510,0xD511,0xD512,0xD513,0xD516,0xD518,0xD519,0xD51A,/* 0x50-0x57 */ + 0xD51B,0xD51C,0xD51D,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD51E,0xD51F,0xD520,0xD521,0xD522,0xD523,0xD524,/* 0x60-0x67 */ + 0xD525,0xD526,0xD527,0xD528,0xD529,0xD52A,0xD52B,0xD52C,/* 0x68-0x6F */ + 0xD52D,0xD52E,0xD52F,0xD530,0xD531,0xD532,0xD533,0xD534,/* 0x70-0x77 */ + 0xD535,0xD536,0xD537,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD538,0xD539,0xD53A,0xD53B,0xD53E,0xD53F,0xD541,/* 0x80-0x87 */ + 0xD542,0xD543,0xD545,0xD546,0xD547,0xD548,0xD549,0xD54A,/* 0x88-0x8F */ + 0xD54B,0xD54E,0xD550,0xD552,0xD553,0xD554,0xD555,0xD556,/* 0x90-0x97 */ + 0xD557,0xD55A,0xD55B,0xD55D,0xD55E,0xD55F,0xD561,0xD562,/* 0x98-0x9F */ + 0xD563,0xC6E9,0xC6EC,0xC6F0,0xC6F8,0xC6F9,0xC6FD,0xC704,/* 0xA0-0xA7 */ + 0xC705,0xC708,0xC70C,0xC714,0xC715,0xC717,0xC719,0xC720,/* 0xA8-0xAF */ + 0xC721,0xC724,0xC728,0xC730,0xC731,0xC733,0xC735,0xC737,/* 0xB0-0xB7 */ + 0xC73C,0xC73D,0xC740,0xC744,0xC74A,0xC74C,0xC74D,0xC74F,/* 0xB8-0xBF */ + 0xC751,0xC752,0xC753,0xC754,0xC755,0xC756,0xC757,0xC758,/* 0xC0-0xC7 */ + 0xC75C,0xC760,0xC768,0xC76B,0xC774,0xC775,0xC778,0xC77C,/* 0xC8-0xCF */ + 0xC77D,0xC77E,0xC783,0xC784,0xC785,0xC787,0xC788,0xC789,/* 0xD0-0xD7 */ + 0xC78A,0xC78E,0xC790,0xC791,0xC794,0xC796,0xC797,0xC798,/* 0xD8-0xDF */ + 0xC79A,0xC7A0,0xC7A1,0xC7A3,0xC7A4,0xC7A5,0xC7A6,0xC7AC,/* 0xE0-0xE7 */ + 0xC7AD,0xC7B0,0xC7B4,0xC7BC,0xC7BD,0xC7BF,0xC7C0,0xC7C1,/* 0xE8-0xEF */ + 0xC7C8,0xC7C9,0xC7CC,0xC7CE,0xC7D0,0xC7D8,0xC7DD,0xC7E4,/* 0xF0-0xF7 */ + 0xC7E8,0xC7EC,0xC800,0xC801,0xC804,0xC808,0xC80A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD564,0xD566,0xD567,0xD56A,0xD56C,0xD56E,0xD56F,/* 0x40-0x47 */ + 0xD570,0xD571,0xD572,0xD573,0xD576,0xD577,0xD579,0xD57A,/* 0x48-0x4F */ + 0xD57B,0xD57D,0xD57E,0xD57F,0xD580,0xD581,0xD582,0xD583,/* 0x50-0x57 */ + 0xD586,0xD58A,0xD58B,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD58C,0xD58D,0xD58E,0xD58F,0xD591,0xD592,0xD593,/* 0x60-0x67 */ + 0xD594,0xD595,0xD596,0xD597,0xD598,0xD599,0xD59A,0xD59B,/* 
0x68-0x6F */ + 0xD59C,0xD59D,0xD59E,0xD59F,0xD5A0,0xD5A1,0xD5A2,0xD5A3,/* 0x70-0x77 */ + 0xD5A4,0xD5A6,0xD5A7,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD5A8,0xD5A9,0xD5AA,0xD5AB,0xD5AC,0xD5AD,0xD5AE,/* 0x80-0x87 */ + 0xD5AF,0xD5B0,0xD5B1,0xD5B2,0xD5B3,0xD5B4,0xD5B5,0xD5B6,/* 0x88-0x8F */ + 0xD5B7,0xD5B8,0xD5B9,0xD5BA,0xD5BB,0xD5BC,0xD5BD,0xD5BE,/* 0x90-0x97 */ + 0xD5BF,0xD5C0,0xD5C1,0xD5C2,0xD5C3,0xD5C4,0xD5C5,0xD5C6,/* 0x98-0x9F */ + 0xD5C7,0xC810,0xC811,0xC813,0xC815,0xC816,0xC81C,0xC81D,/* 0xA0-0xA7 */ + 0xC820,0xC824,0xC82C,0xC82D,0xC82F,0xC831,0xC838,0xC83C,/* 0xA8-0xAF */ + 0xC840,0xC848,0xC849,0xC84C,0xC84D,0xC854,0xC870,0xC871,/* 0xB0-0xB7 */ + 0xC874,0xC878,0xC87A,0xC880,0xC881,0xC883,0xC885,0xC886,/* 0xB8-0xBF */ + 0xC887,0xC88B,0xC88C,0xC88D,0xC894,0xC89D,0xC89F,0xC8A1,/* 0xC0-0xC7 */ + 0xC8A8,0xC8BC,0xC8BD,0xC8C4,0xC8C8,0xC8CC,0xC8D4,0xC8D5,/* 0xC8-0xCF */ + 0xC8D7,0xC8D9,0xC8E0,0xC8E1,0xC8E4,0xC8F5,0xC8FC,0xC8FD,/* 0xD0-0xD7 */ + 0xC900,0xC904,0xC905,0xC906,0xC90C,0xC90D,0xC90F,0xC911,/* 0xD8-0xDF */ + 0xC918,0xC92C,0xC934,0xC950,0xC951,0xC954,0xC958,0xC960,/* 0xE0-0xE7 */ + 0xC961,0xC963,0xC96C,0xC970,0xC974,0xC97C,0xC988,0xC989,/* 0xE8-0xEF */ + 0xC98C,0xC990,0xC998,0xC999,0xC99B,0xC99D,0xC9C0,0xC9C1,/* 0xF0-0xF7 */ + 0xC9C4,0xC9C7,0xC9C8,0xC9CA,0xC9D0,0xC9D1,0xC9D3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD5CA,0xD5CB,0xD5CD,0xD5CE,0xD5CF,0xD5D1,0xD5D3,/* 0x40-0x47 */ + 0xD5D4,0xD5D5,0xD5D6,0xD5D7,0xD5DA,0xD5DC,0xD5DE,0xD5DF,/* 0x48-0x4F */ + 0xD5E0,0xD5E1,0xD5E2,0xD5E3,0xD5E6,0xD5E7,0xD5E9,0xD5EA,/* 0x50-0x57 */ + 0xD5EB,0xD5ED,0xD5EE,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD5EF,0xD5F0,0xD5F1,0xD5F2,0xD5F3,0xD5F6,0xD5F8,/* 0x60-0x67 */ + 0xD5FA,0xD5FB,0xD5FC,0xD5FD,0xD5FE,0xD5FF,0xD602,0xD603,/* 0x68-0x6F */ + 0xD605,0xD606,0xD607,0xD609,0xD60A,0xD60B,0xD60C,0xD60D,/* 0x70-0x77 */ + 0xD60E,0xD60F,0xD612,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD616,0xD617,0xD618,0xD619,0xD61A,0xD61B,0xD61D,/* 0x80-0x87 */ + 0xD61E,0xD61F,0xD621,0xD622,0xD623,0xD625,0xD626,0xD627,/* 0x88-0x8F */ + 0xD628,0xD629,0xD62A,0xD62B,0xD62C,0xD62E,0xD62F,0xD630,/* 0x90-0x97 */ + 0xD631,0xD632,0xD633,0xD634,0xD635,0xD636,0xD637,0xD63A,/* 0x98-0x9F */ + 0xD63B,0xC9D5,0xC9D6,0xC9D9,0xC9DA,0xC9DC,0xC9DD,0xC9E0,/* 0xA0-0xA7 */ + 0xC9E2,0xC9E4,0xC9E7,0xC9EC,0xC9ED,0xC9EF,0xC9F0,0xC9F1,/* 0xA8-0xAF */ + 0xC9F8,0xC9F9,0xC9FC,0xCA00,0xCA08,0xCA09,0xCA0B,0xCA0C,/* 0xB0-0xB7 */ + 0xCA0D,0xCA14,0xCA18,0xCA29,0xCA4C,0xCA4D,0xCA50,0xCA54,/* 0xB8-0xBF */ + 0xCA5C,0xCA5D,0xCA5F,0xCA60,0xCA61,0xCA68,0xCA7D,0xCA84,/* 0xC0-0xC7 */ + 0xCA98,0xCABC,0xCABD,0xCAC0,0xCAC4,0xCACC,0xCACD,0xCACF,/* 0xC8-0xCF */ + 0xCAD1,0xCAD3,0xCAD8,0xCAD9,0xCAE0,0xCAEC,0xCAF4,0xCB08,/* 0xD0-0xD7 */ + 0xCB10,0xCB14,0xCB18,0xCB20,0xCB21,0xCB41,0xCB48,0xCB49,/* 0xD8-0xDF */ + 0xCB4C,0xCB50,0xCB58,0xCB59,0xCB5D,0xCB64,0xCB78,0xCB79,/* 0xE0-0xE7 */ + 
0xCB9C,0xCBB8,0xCBD4,0xCBE4,0xCBE7,0xCBE9,0xCC0C,0xCC0D,/* 0xE8-0xEF */ + 0xCC10,0xCC14,0xCC1C,0xCC1D,0xCC21,0xCC22,0xCC27,0xCC28,/* 0xF0-0xF7 */ + 0xCC29,0xCC2C,0xCC2E,0xCC30,0xCC38,0xCC39,0xCC3B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD63D,0xD63E,0xD63F,0xD641,0xD642,0xD643,0xD644,/* 0x40-0x47 */ + 0xD646,0xD647,0xD64A,0xD64C,0xD64E,0xD64F,0xD650,0xD652,/* 0x48-0x4F */ + 0xD653,0xD656,0xD657,0xD659,0xD65A,0xD65B,0xD65D,0xD65E,/* 0x50-0x57 */ + 0xD65F,0xD660,0xD661,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD662,0xD663,0xD664,0xD665,0xD666,0xD668,0xD66A,/* 0x60-0x67 */ + 0xD66B,0xD66C,0xD66D,0xD66E,0xD66F,0xD672,0xD673,0xD675,/* 0x68-0x6F */ + 0xD676,0xD677,0xD678,0xD679,0xD67A,0xD67B,0xD67C,0xD67D,/* 0x70-0x77 */ + 0xD67E,0xD67F,0xD680,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD681,0xD682,0xD684,0xD686,0xD687,0xD688,0xD689,/* 0x80-0x87 */ + 0xD68A,0xD68B,0xD68E,0xD68F,0xD691,0xD692,0xD693,0xD695,/* 0x88-0x8F */ + 0xD696,0xD697,0xD698,0xD699,0xD69A,0xD69B,0xD69C,0xD69E,/* 0x90-0x97 */ + 0xD6A0,0xD6A2,0xD6A3,0xD6A4,0xD6A5,0xD6A6,0xD6A7,0xD6A9,/* 0x98-0x9F */ + 0xD6AA,0xCC3C,0xCC3D,0xCC3E,0xCC44,0xCC45,0xCC48,0xCC4C,/* 0xA0-0xA7 */ + 0xCC54,0xCC55,0xCC57,0xCC58,0xCC59,0xCC60,0xCC64,0xCC66,/* 0xA8-0xAF */ + 0xCC68,0xCC70,0xCC75,0xCC98,0xCC99,0xCC9C,0xCCA0,0xCCA8,/* 0xB0-0xB7 */ + 0xCCA9,0xCCAB,0xCCAC,0xCCAD,0xCCB4,0xCCB5,0xCCB8,0xCCBC,/* 0xB8-0xBF */ + 0xCCC4,0xCCC5,0xCCC7,0xCCC9,0xCCD0,0xCCD4,0xCCE4,0xCCEC,/* 0xC0-0xC7 */ + 0xCCF0,0xCD01,0xCD08,0xCD09,0xCD0C,0xCD10,0xCD18,0xCD19,/* 0xC8-0xCF */ + 0xCD1B,0xCD1D,0xCD24,0xCD28,0xCD2C,0xCD39,0xCD5C,0xCD60,/* 0xD0-0xD7 */ + 0xCD64,0xCD6C,0xCD6D,0xCD6F,0xCD71,0xCD78,0xCD88,0xCD94,/* 0xD8-0xDF */ + 0xCD95,0xCD98,0xCD9C,0xCDA4,0xCDA5,0xCDA7,0xCDA9,0xCDB0,/* 0xE0-0xE7 */ + 0xCDC4,0xCDCC,0xCDD0,0xCDE8,0xCDEC,0xCDF0,0xCDF8,0xCDF9,/* 0xE8-0xEF */ + 0xCDFB,0xCDFD,0xCE04,0xCE08,0xCE0C,0xCE14,0xCE19,0xCE20,/* 0xF0-0xF7 */ + 0xCE21,0xCE24,0xCE28,0xCE30,0xCE31,0xCE33,0xCE35,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD6AB,0xD6AD,0xD6AE,0xD6AF,0xD6B1,0xD6B2,0xD6B3,/* 0x40-0x47 */ + 0xD6B4,0xD6B5,0xD6B6,0xD6B7,0xD6B8,0xD6BA,0xD6BC,0xD6BD,/* 0x48-0x4F */ + 0xD6BE,0xD6BF,0xD6C0,0xD6C1,0xD6C2,0xD6C3,0xD6C6,0xD6C7,/* 0x50-0x57 */ + 0xD6C9,0xD6CA,0xD6CB,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x58-0x5F */ + 0x0000,0xD6CD,0xD6CE,0xD6CF,0xD6D0,0xD6D2,0xD6D3,0xD6D5,/* 0x60-0x67 */ + 0xD6D6,0xD6D8,0xD6DA,0xD6DB,0xD6DC,0xD6DD,0xD6DE,0xD6DF,/* 0x68-0x6F */ + 0xD6E1,0xD6E2,0xD6E3,0xD6E5,0xD6E6,0xD6E7,0xD6E9,0xD6EA,/* 0x70-0x77 */ + 0xD6EB,0xD6EC,0xD6ED,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD6EE,0xD6EF,0xD6F1,0xD6F2,0xD6F3,0xD6F4,0xD6F6,/* 0x80-0x87 */ + 0xD6F7,0xD6F8,0xD6F9,0xD6FA,0xD6FB,0xD6FE,0xD6FF,0xD701,/* 0x88-0x8F */ + 0xD702,0xD703,0xD705,0xD706,0xD707,0xD708,0xD709,0xD70A,/* 0x90-0x97 */ + 0xD70B,0xD70C,0xD70D,0xD70E,0xD70F,0xD710,0xD712,0xD713,/* 0x98-0x9F */ + 0xD714,0xCE58,0xCE59,0xCE5C,0xCE5F,0xCE60,0xCE61,0xCE68,/* 0xA0-0xA7 */ + 0xCE69,0xCE6B,0xCE6D,0xCE74,0xCE75,0xCE78,0xCE7C,0xCE84,/* 0xA8-0xAF */ + 0xCE85,0xCE87,0xCE89,0xCE90,0xCE91,0xCE94,0xCE98,0xCEA0,/* 0xB0-0xB7 */ + 0xCEA1,0xCEA3,0xCEA4,0xCEA5,0xCEAC,0xCEAD,0xCEC1,0xCEE4,/* 0xB8-0xBF */ + 0xCEE5,0xCEE8,0xCEEB,0xCEEC,0xCEF4,0xCEF5,0xCEF7,0xCEF8,/* 0xC0-0xC7 */ + 0xCEF9,0xCF00,0xCF01,0xCF04,0xCF08,0xCF10,0xCF11,0xCF13,/* 0xC8-0xCF */ + 0xCF15,0xCF1C,0xCF20,0xCF24,0xCF2C,0xCF2D,0xCF2F,0xCF30,/* 0xD0-0xD7 */ + 0xCF31,0xCF38,0xCF54,0xCF55,0xCF58,0xCF5C,0xCF64,0xCF65,/* 0xD8-0xDF */ + 0xCF67,0xCF69,0xCF70,0xCF71,0xCF74,0xCF78,0xCF80,0xCF85,/* 0xE0-0xE7 */ + 0xCF8C,0xCFA1,0xCFA8,0xCFB0,0xCFC4,0xCFE0,0xCFE1,0xCFE4,/* 0xE8-0xEF */ + 0xCFE8,0xCFF0,0xCFF1,0xCFF3,0xCFF5,0xCFFC,0xD000,0xD004,/* 0xF0-0xF7 */ + 0xD011,0xD018,0xD02D,0xD034,0xD035,0xD038,0xD03C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD715,0xD716,0xD717,0xD71A,0xD71B,0xD71D,0xD71E,/* 0x40-0x47 */ + 0xD71F,0xD721,0xD722,0xD723,0xD724,0xD725,0xD726,0xD727,/* 0x48-0x4F */ + 0xD72A,0xD72C,0xD72E,0xD72F,0xD730,0xD731,0xD732,0xD733,/* 0x50-0x57 */ + 0xD736,0xD737,0xD739,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0xD73A,0xD73B,0xD73D,0xD73E,0xD73F,0xD740,0xD741,/* 0x60-0x67 */ + 0xD742,0xD743,0xD745,0xD746,0xD748,0xD74A,0xD74B,0xD74C,/* 0x68-0x6F */ + 0xD74D,0xD74E,0xD74F,0xD752,0xD753,0xD755,0xD75A,0xD75B,/* 0x70-0x77 */ + 0xD75C,0xD75D,0xD75E,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0xD75F,0xD762,0xD764,0xD766,0xD767,0xD768,0xD76A,/* 0x80-0x87 */ + 0xD76B,0xD76D,0xD76E,0xD76F,0xD771,0xD772,0xD773,0xD775,/* 0x88-0x8F */ + 0xD776,0xD777,0xD778,0xD779,0xD77A,0xD77B,0xD77E,0xD77F,/* 0x90-0x97 */ + 0xD780,0xD782,0xD783,0xD784,0xD785,0xD786,0xD787,0xD78A,/* 0x98-0x9F */ + 0xD78B,0xD044,0xD045,0xD047,0xD049,0xD050,0xD054,0xD058,/* 0xA0-0xA7 */ + 0xD060,0xD06C,0xD06D,0xD070,0xD074,0xD07C,0xD07D,0xD081,/* 0xA8-0xAF */ + 0xD0A4,0xD0A5,0xD0A8,0xD0AC,0xD0B4,0xD0B5,0xD0B7,0xD0B9,/* 0xB0-0xB7 */ + 0xD0C0,0xD0C1,0xD0C4,0xD0C8,0xD0C9,0xD0D0,0xD0D1,0xD0D3,/* 0xB8-0xBF */ + 0xD0D4,0xD0D5,0xD0DC,0xD0DD,0xD0E0,0xD0E4,0xD0EC,0xD0ED,/* 0xC0-0xC7 */ + 0xD0EF,0xD0F0,0xD0F1,0xD0F8,0xD10D,0xD130,0xD131,0xD134,/* 0xC8-0xCF */ + 0xD138,0xD13A,0xD140,0xD141,0xD143,0xD144,0xD145,0xD14C,/* 0xD0-0xD7 */ + 
0xD14D,0xD150,0xD154,0xD15C,0xD15D,0xD15F,0xD161,0xD168,/* 0xD8-0xDF */ + 0xD16C,0xD17C,0xD184,0xD188,0xD1A0,0xD1A1,0xD1A4,0xD1A8,/* 0xE0-0xE7 */ + 0xD1B0,0xD1B1,0xD1B3,0xD1B5,0xD1BA,0xD1BC,0xD1C0,0xD1D8,/* 0xE8-0xEF */ + 0xD1F4,0xD1F8,0xD207,0xD209,0xD210,0xD22C,0xD22D,0xD230,/* 0xF0-0xF7 */ + 0xD234,0xD23C,0xD23D,0xD23F,0xD241,0xD248,0xD25C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0xD78D,0xD78E,0xD78F,0xD791,0xD792,0xD793,0xD794,/* 0x40-0x47 */ + 0xD795,0xD796,0xD797,0xD79A,0xD79C,0xD79E,0xD79F,0xD7A0,/* 0x48-0x4F */ + 0xD7A1,0xD7A2,0xD7A3,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xD264,0xD280,0xD281,0xD284,0xD288,0xD290,0xD291,/* 0xA0-0xA7 */ + 0xD295,0xD29C,0xD2A0,0xD2A4,0xD2AC,0xD2B1,0xD2B8,0xD2B9,/* 0xA8-0xAF */ + 0xD2BC,0xD2BF,0xD2C0,0xD2C2,0xD2C8,0xD2C9,0xD2CB,0xD2D4,/* 0xB0-0xB7 */ + 0xD2D8,0xD2DC,0xD2E4,0xD2E5,0xD2F0,0xD2F1,0xD2F4,0xD2F8,/* 0xB8-0xBF */ + 0xD300,0xD301,0xD303,0xD305,0xD30C,0xD30D,0xD30E,0xD310,/* 0xC0-0xC7 */ + 0xD314,0xD316,0xD31C,0xD31D,0xD31F,0xD320,0xD321,0xD325,/* 0xC8-0xCF */ + 0xD328,0xD329,0xD32C,0xD330,0xD338,0xD339,0xD33B,0xD33C,/* 0xD0-0xD7 */ + 0xD33D,0xD344,0xD345,0xD37C,0xD37D,0xD380,0xD384,0xD38C,/* 0xD8-0xDF */ + 0xD38D,0xD38F,0xD390,0xD391,0xD398,0xD399,0xD39C,0xD3A0,/* 0xE0-0xE7 */ + 0xD3A8,0xD3A9,0xD3AB,0xD3AD,0xD3B4,0xD3B8,0xD3BC,0xD3C4,/* 0xE8-0xEF */ + 0xD3C5,0xD3C8,0xD3C9,0xD3D0,0xD3D8,0xD3E1,0xD3E3,0xD3EC,/* 0xF0-0xF7 */ + 0xD3ED,0xD3F0,0xD3F4,0xD3FC,0xD3FD,0xD3FF,0xD401,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xD408,0xD41D,0xD440,0xD444,0xD45C,0xD460,0xD464,/* 0xA0-0xA7 */ + 0xD46D,0xD46F,0xD478,0xD479,0xD47C,0xD47F,0xD480,0xD482,/* 0xA8-0xAF */ + 0xD488,0xD489,0xD48B,0xD48D,0xD494,0xD4A9,0xD4CC,0xD4D0,/* 0xB0-0xB7 */ + 0xD4D4,0xD4DC,0xD4DF,0xD4E8,0xD4EC,0xD4F0,0xD4F8,0xD4FB,/* 0xB8-0xBF */ + 0xD4FD,0xD504,0xD508,0xD50C,0xD514,0xD515,0xD517,0xD53C,/* 0xC0-0xC7 */ + 0xD53D,0xD540,0xD544,0xD54C,0xD54D,0xD54F,0xD551,0xD558,/* 0xC8-0xCF */ + 0xD559,0xD55C,0xD560,0xD565,0xD568,0xD569,0xD56B,0xD56D,/* 0xD0-0xD7 */ + 0xD574,0xD575,0xD578,0xD57C,0xD584,0xD585,0xD587,0xD588,/* 0xD8-0xDF */ + 0xD589,0xD590,0xD5A5,0xD5C8,0xD5C9,0xD5CC,0xD5D0,0xD5D2,/* 0xE0-0xE7 */ + 0xD5D8,0xD5D9,0xD5DB,0xD5DD,0xD5E4,0xD5E5,0xD5E8,0xD5EC,/* 0xE8-0xEF */ + 0xD5F4,0xD5F5,0xD5F7,0xD5F9,0xD600,0xD601,0xD604,0xD608,/* 0xF0-0xF7 */ + 0xD610,0xD611,0xD613,0xD614,0xD615,0xD61C,0xD620,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xD624,0xD62D,0xD638,0xD639,0xD63C,0xD640,0xD645,/* 0xA0-0xA7 */ + 0xD648,0xD649,0xD64B,0xD64D,0xD651,0xD654,0xD655,0xD658,/* 0xA8-0xAF */ + 0xD65C,0xD667,0xD669,0xD670,0xD671,0xD674,0xD683,0xD685,/* 0xB0-0xB7 */ + 0xD68C,0xD68D,0xD690,0xD694,0xD69D,0xD69F,0xD6A1,0xD6A8,/* 0xB8-0xBF */ + 0xD6AC,0xD6B0,0xD6B9,0xD6BB,0xD6C4,0xD6C5,0xD6C8,0xD6CC,/* 0xC0-0xC7 */ + 
0xD6D1,0xD6D4,0xD6D7,0xD6D9,0xD6E0,0xD6E4,0xD6E8,0xD6F0,/* 0xC8-0xCF */ + 0xD6F5,0xD6FC,0xD6FD,0xD700,0xD704,0xD711,0xD718,0xD719,/* 0xD0-0xD7 */ + 0xD71C,0xD720,0xD728,0xD729,0xD72B,0xD72D,0xD734,0xD735,/* 0xD8-0xDF */ + 0xD738,0xD73C,0xD744,0xD747,0xD749,0xD750,0xD751,0xD754,/* 0xE0-0xE7 */ + 0xD756,0xD757,0xD758,0xD759,0xD760,0xD761,0xD763,0xD765,/* 0xE8-0xEF */ + 0xD769,0xD76C,0xD770,0xD774,0xD77C,0xD77D,0xD781,0xD788,/* 0xF0-0xF7 */ + 0xD789,0xD78C,0xD790,0xD798,0xD799,0xD79B,0xD79D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x4F3D,0x4F73,0x5047,0x50F9,0x52A0,0x53EF,0x5475,/* 0xA0-0xA7 */ + 0x54E5,0x5609,0x5AC1,0x5BB6,0x6687,0x67B6,0x67B7,0x67EF,/* 0xA8-0xAF */ + 0x6B4C,0x73C2,0x75C2,0x7A3C,0x82DB,0x8304,0x8857,0x8888,/* 0xB0-0xB7 */ + 0x8A36,0x8CC8,0x8DCF,0x8EFB,0x8FE6,0x99D5,0x523B,0x5374,/* 0xB8-0xBF */ + 0x5404,0x606A,0x6164,0x6BBC,0x73CF,0x811A,0x89BA,0x89D2,/* 0xC0-0xC7 */ + 0x95A3,0x4F83,0x520A,0x58BE,0x5978,0x59E6,0x5E72,0x5E79,/* 0xC8-0xCF */ + 0x61C7,0x63C0,0x6746,0x67EC,0x687F,0x6F97,0x764E,0x770B,/* 0xD0-0xD7 */ + 0x78F5,0x7A08,0x7AFF,0x7C21,0x809D,0x826E,0x8271,0x8AEB,/* 0xD8-0xDF */ + 0x9593,0x4E6B,0x559D,0x66F7,0x6E34,0x78A3,0x7AED,0x845B,/* 0xE0-0xE7 */ + 0x8910,0x874E,0x97A8,0x52D8,0x574E,0x582A,0x5D4C,0x611F,/* 0xE8-0xEF */ + 0x61BE,0x6221,0x6562,0x67D1,0x6A44,0x6E1B,0x7518,0x75B3,/* 0xF0-0xF7 */ + 0x76E3,0x77B0,0x7D3A,0x90AF,0x9451,0x9452,0x9F95,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5323,0x5CAC,0x7532,0x80DB,0x9240,0x9598,0x525B,/* 0xA0-0xA7 */ + 0x5808,0x59DC,0x5CA1,0x5D17,0x5EB7,0x5F3A,0x5F4A,0x6177,/* 0xA8-0xAF */ + 0x6C5F,0x757A,0x7586,0x7CE0,0x7D73,0x7DB1,0x7F8C,0x8154,/* 0xB0-0xB7 */ + 0x8221,0x8591,0x8941,0x8B1B,0x92FC,0x964D,0x9C47,0x4ECB,/* 0xB8-0xBF */ + 0x4EF7,0x500B,0x51F1,0x584F,0x6137,0x613E,0x6168,0x6539,/* 0xC0-0xC7 */ + 0x69EA,0x6F11,0x75A5,0x7686,0x76D6,0x7B87,0x82A5,0x84CB,/* 0xC8-0xCF */ + 0xF900,0x93A7,0x958B,0x5580,0x5BA2,0x5751,0xF901,0x7CB3,/* 0xD0-0xD7 */ + 0x7FB9,0x91B5,0x5028,0x53BB,0x5C45,0x5DE8,0x62D2,0x636E,/* 0xD8-0xDF */ + 0x64DA,0x64E7,0x6E20,0x70AC,0x795B,0x8DDD,0x8E1E,0xF902,/* 0xE0-0xE7 */ + 0x907D,0x9245,0x92F8,0x4E7E,0x4EF6,0x5065,0x5DFE,0x5EFA,/* 0xE8-0xEF */ + 0x6106,0x6957,0x8171,0x8654,0x8E47,0x9375,0x9A2B,0x4E5E,/* 0xF0-0xF7 */ + 0x5091,0x6770,0x6840,0x5109,0x528D,0x5292,0x6AA2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x77BC,0x9210,0x9ED4,0x52AB,0x602F,0x8FF2,0x5048,/* 0xA0-0xA7 */ + 0x61A9,0x63ED,0x64CA,0x683C,0x6A84,0x6FC0,0x8188,0x89A1,/* 0xA8-0xAF */ + 0x9694,0x5805,0x727D,0x72AC,0x7504,0x7D79,0x7E6D,0x80A9,/* 0xB0-0xB7 */ + 
0x898B,0x8B74,0x9063,0x9D51,0x6289,0x6C7A,0x6F54,0x7D50,/* 0xB8-0xBF */ + 0x7F3A,0x8A23,0x517C,0x614A,0x7B9D,0x8B19,0x9257,0x938C,/* 0xC0-0xC7 */ + 0x4EAC,0x4FD3,0x501E,0x50BE,0x5106,0x52C1,0x52CD,0x537F,/* 0xC8-0xCF */ + 0x5770,0x5883,0x5E9A,0x5F91,0x6176,0x61AC,0x64CE,0x656C,/* 0xD0-0xD7 */ + 0x666F,0x66BB,0x66F4,0x6897,0x6D87,0x7085,0x70F1,0x749F,/* 0xD8-0xDF */ + 0x74A5,0x74CA,0x75D9,0x786C,0x78EC,0x7ADF,0x7AF6,0x7D45,/* 0xE0-0xE7 */ + 0x7D93,0x8015,0x803F,0x811B,0x8396,0x8B66,0x8F15,0x9015,/* 0xE8-0xEF */ + 0x93E1,0x9803,0x9838,0x9A5A,0x9BE8,0x4FC2,0x5553,0x583A,/* 0xF0-0xF7 */ + 0x5951,0x5B63,0x5C46,0x60B8,0x6212,0x6842,0x68B0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x68E8,0x6EAA,0x754C,0x7678,0x78CE,0x7A3D,0x7CFB,/* 0xA0-0xA7 */ + 0x7E6B,0x7E7C,0x8A08,0x8AA1,0x8C3F,0x968E,0x9DC4,0x53E4,/* 0xA8-0xAF */ + 0x53E9,0x544A,0x5471,0x56FA,0x59D1,0x5B64,0x5C3B,0x5EAB,/* 0xB0-0xB7 */ + 0x62F7,0x6537,0x6545,0x6572,0x66A0,0x67AF,0x69C1,0x6CBD,/* 0xB8-0xBF */ + 0x75FC,0x7690,0x777E,0x7A3F,0x7F94,0x8003,0x80A1,0x818F,/* 0xC0-0xC7 */ + 0x82E6,0x82FD,0x83F0,0x85C1,0x8831,0x88B4,0x8AA5,0xF903,/* 0xC8-0xCF */ + 0x8F9C,0x932E,0x96C7,0x9867,0x9AD8,0x9F13,0x54ED,0x659B,/* 0xD0-0xD7 */ + 0x66F2,0x688F,0x7A40,0x8C37,0x9D60,0x56F0,0x5764,0x5D11,/* 0xD8-0xDF */ + 0x6606,0x68B1,0x68CD,0x6EFE,0x7428,0x889E,0x9BE4,0x6C68,/* 0xE0-0xE7 */ + 0xF904,0x9AA8,0x4F9B,0x516C,0x5171,0x529F,0x5B54,0x5DE5,/* 0xE8-0xEF */ + 0x6050,0x606D,0x62F1,0x63A7,0x653B,0x73D9,0x7A7A,0x86A3,/* 0xF0-0xF7 */ + 0x8CA2,0x978F,0x4E32,0x5BE1,0x6208,0x679C,0x74DC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x79D1,0x83D3,0x8A87,0x8AB2,0x8DE8,0x904E,0x934B,/* 0xA0-0xA7 */ + 0x9846,0x5ED3,0x69E8,0x85FF,0x90ED,0xF905,0x51A0,0x5B98,/* 0xA8-0xAF */ + 0x5BEC,0x6163,0x68FA,0x6B3E,0x704C,0x742F,0x74D8,0x7BA1,/* 0xB0-0xB7 */ + 0x7F50,0x83C5,0x89C0,0x8CAB,0x95DC,0x9928,0x522E,0x605D,/* 0xB8-0xBF */ + 0x62EC,0x9002,0x4F8A,0x5149,0x5321,0x58D9,0x5EE3,0x66E0,/* 0xC0-0xC7 */ + 0x6D38,0x709A,0x72C2,0x73D6,0x7B50,0x80F1,0x945B,0x5366,/* 0xC8-0xCF */ + 0x639B,0x7F6B,0x4E56,0x5080,0x584A,0x58DE,0x602A,0x6127,/* 0xD0-0xD7 */ + 0x62D0,0x69D0,0x9B41,0x5B8F,0x7D18,0x80B1,0x8F5F,0x4EA4,/* 0xD8-0xDF */ + 0x50D1,0x54AC,0x55AC,0x5B0C,0x5DA0,0x5DE7,0x652A,0x654E,/* 0xE0-0xE7 */ + 0x6821,0x6A4B,0x72E1,0x768E,0x77EF,0x7D5E,0x7FF9,0x81A0,/* 0xE8-0xEF */ + 0x854E,0x86DF,0x8F03,0x8F4E,0x90CA,0x9903,0x9A55,0x9BAB,/* 0xF0-0xF7 */ + 0x4E18,0x4E45,0x4E5D,0x4EC7,0x4FF1,0x5177,0x52FE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5340,0x53E3,0x53E5,0x548E,0x5614,0x5775,0x57A2,/* 0xA0-0xA7 */ + 
0x5BC7,0x5D87,0x5ED0,0x61FC,0x62D8,0x6551,0x67B8,0x67E9,/* 0xA8-0xAF */ + 0x69CB,0x6B50,0x6BC6,0x6BEC,0x6C42,0x6E9D,0x7078,0x72D7,/* 0xB0-0xB7 */ + 0x7396,0x7403,0x77BF,0x77E9,0x7A76,0x7D7F,0x8009,0x81FC,/* 0xB8-0xBF */ + 0x8205,0x820A,0x82DF,0x8862,0x8B33,0x8CFC,0x8EC0,0x9011,/* 0xC0-0xC7 */ + 0x90B1,0x9264,0x92B6,0x99D2,0x9A45,0x9CE9,0x9DD7,0x9F9C,/* 0xC8-0xCF */ + 0x570B,0x5C40,0x83CA,0x97A0,0x97AB,0x9EB4,0x541B,0x7A98,/* 0xD0-0xD7 */ + 0x7FA4,0x88D9,0x8ECD,0x90E1,0x5800,0x5C48,0x6398,0x7A9F,/* 0xD8-0xDF */ + 0x5BAE,0x5F13,0x7A79,0x7AAE,0x828E,0x8EAC,0x5026,0x5238,/* 0xE0-0xE7 */ + 0x52F8,0x5377,0x5708,0x62F3,0x6372,0x6B0A,0x6DC3,0x7737,/* 0xE8-0xEF */ + 0x53A5,0x7357,0x8568,0x8E76,0x95D5,0x673A,0x6AC3,0x6F70,/* 0xF0-0xF7 */ + 0x8A6D,0x8ECC,0x994B,0xF906,0x6677,0x6B78,0x8CB4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9B3C,0xF907,0x53EB,0x572D,0x594E,0x63C6,0x69FB,/* 0xA0-0xA7 */ + 0x73EA,0x7845,0x7ABA,0x7AC5,0x7CFE,0x8475,0x898F,0x8D73,/* 0xA8-0xAF */ + 0x9035,0x95A8,0x52FB,0x5747,0x7547,0x7B60,0x83CC,0x921E,/* 0xB0-0xB7 */ + 0xF908,0x6A58,0x514B,0x524B,0x5287,0x621F,0x68D8,0x6975,/* 0xB8-0xBF */ + 0x9699,0x50C5,0x52A4,0x52E4,0x61C3,0x65A4,0x6839,0x69FF,/* 0xC0-0xC7 */ + 0x747E,0x7B4B,0x82B9,0x83EB,0x89B2,0x8B39,0x8FD1,0x9949,/* 0xC8-0xCF */ + 0xF909,0x4ECA,0x5997,0x64D2,0x6611,0x6A8E,0x7434,0x7981,/* 0xD0-0xD7 */ + 0x79BD,0x82A9,0x887E,0x887F,0x895F,0xF90A,0x9326,0x4F0B,/* 0xD8-0xDF */ + 0x53CA,0x6025,0x6271,0x6C72,0x7D1A,0x7D66,0x4E98,0x5162,/* 0xE0-0xE7 */ + 0x77DC,0x80AF,0x4F01,0x4F0E,0x5176,0x5180,0x55DC,0x5668,/* 0xE8-0xEF */ + 0x573B,0x57FA,0x57FC,0x5914,0x5947,0x5993,0x5BC4,0x5C90,/* 0xF0-0xF7 */ + 0x5D0E,0x5DF1,0x5E7E,0x5FCC,0x6280,0x65D7,0x65E3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x671E,0x671F,0x675E,0x68CB,0x68C4,0x6A5F,0x6B3A,/* 0xA0-0xA7 */ + 0x6C23,0x6C7D,0x6C82,0x6DC7,0x7398,0x7426,0x742A,0x7482,/* 0xA8-0xAF */ + 0x74A3,0x7578,0x757F,0x7881,0x78EF,0x7941,0x7947,0x7948,/* 0xB0-0xB7 */ + 0x797A,0x7B95,0x7D00,0x7DBA,0x7F88,0x8006,0x802D,0x808C,/* 0xB8-0xBF */ + 0x8A18,0x8B4F,0x8C48,0x8D77,0x9321,0x9324,0x98E2,0x9951,/* 0xC0-0xC7 */ + 0x9A0E,0x9A0F,0x9A65,0x9E92,0x7DCA,0x4F76,0x5409,0x62EE,/* 0xC8-0xCF */ + 0x6854,0x91D1,0x55AB,0x513A,0xF90B,0xF90C,0x5A1C,0x61E6,/* 0xD0-0xD7 */ + 0xF90D,0x62CF,0x62FF,0xF90E,0xF90F,0xF910,0xF911,0xF912,/* 0xD8-0xDF */ + 0xF913,0x90A3,0xF914,0xF915,0xF916,0xF917,0xF918,0x8AFE,/* 0xE0-0xE7 */ + 0xF919,0xF91A,0xF91B,0xF91C,0x6696,0xF91D,0x7156,0xF91E,/* 0xE8-0xEF */ + 0xF91F,0x96E3,0xF920,0x634F,0x637A,0x5357,0xF921,0x678F,/* 0xF0-0xF7 */ + 0x6960,0x6E73,0xF922,0x7537,0xF923,0xF924,0xF925,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7D0D,0xF926,0xF927,0x8872,0x56CA,0x5A18,0xF928,/* 0xA0-0xA7 */ + 0xF929,0xF92A,0xF92B,0xF92C,0x4E43,0xF92D,0x5167,0x5948,/* 0xA8-0xAF */ + 0x67F0,0x8010,0xF92E,0x5973,0x5E74,0x649A,0x79CA,0x5FF5,/* 0xB0-0xB7 */ + 0x606C,0x62C8,0x637B,0x5BE7,0x5BD7,0x52AA,0xF92F,0x5974,/* 0xB8-0xBF */ + 0x5F29,0x6012,0xF930,0xF931,0xF932,0x7459,0xF933,0xF934,/* 0xC0-0xC7 */ + 0xF935,0xF936,0xF937,0xF938,0x99D1,0xF939,0xF93A,0xF93B,/* 0xC8-0xCF */ + 0xF93C,0xF93D,0xF93E,0xF93F,0xF940,0xF941,0xF942,0xF943,/* 0xD0-0xD7 */ + 0x6FC3,0xF944,0xF945,0x81BF,0x8FB2,0x60F1,0xF946,0xF947,/* 0xD8-0xDF */ + 0x8166,0xF948,0xF949,0x5C3F,0xF94A,0xF94B,0xF94C,0xF94D,/* 0xE0-0xE7 */ + 0xF94E,0xF94F,0xF950,0xF951,0x5AE9,0x8A25,0x677B,0x7D10,/* 0xE8-0xEF */ + 0xF952,0xF953,0xF954,0xF955,0xF956,0xF957,0x80FD,0xF958,/* 0xF0-0xF7 */ + 0xF959,0x5C3C,0x6CE5,0x533F,0x6EBA,0x591A,0x8336,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x4E39,0x4EB6,0x4F46,0x55AE,0x5718,0x58C7,0x5F56,/* 0xA0-0xA7 */ + 0x65B7,0x65E6,0x6A80,0x6BB5,0x6E4D,0x77ED,0x7AEF,0x7C1E,/* 0xA8-0xAF */ + 0x7DDE,0x86CB,0x8892,0x9132,0x935B,0x64BB,0x6FBE,0x737A,/* 0xB0-0xB7 */ + 0x75B8,0x9054,0x5556,0x574D,0x61BA,0x64D4,0x66C7,0x6DE1,/* 0xB8-0xBF */ + 0x6E5B,0x6F6D,0x6FB9,0x75F0,0x8043,0x81BD,0x8541,0x8983,/* 0xC0-0xC7 */ + 0x8AC7,0x8B5A,0x931F,0x6C93,0x7553,0x7B54,0x8E0F,0x905D,/* 0xC8-0xCF */ + 0x5510,0x5802,0x5858,0x5E62,0x6207,0x649E,0x68E0,0x7576,/* 0xD0-0xD7 */ + 0x7CD6,0x87B3,0x9EE8,0x4EE3,0x5788,0x576E,0x5927,0x5C0D,/* 0xD8-0xDF */ + 0x5CB1,0x5E36,0x5F85,0x6234,0x64E1,0x73B3,0x81FA,0x888B,/* 0xE0-0xE7 */ + 0x8CB8,0x968A,0x9EDB,0x5B85,0x5FB7,0x60B3,0x5012,0x5200,/* 0xE8-0xEF */ + 0x5230,0x5716,0x5835,0x5857,0x5C0E,0x5C60,0x5CF6,0x5D8B,/* 0xF0-0xF7 */ + 0x5EA6,0x5F92,0x60BC,0x6311,0x6389,0x6417,0x6843,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x68F9,0x6AC2,0x6DD8,0x6E21,0x6ED4,0x6FE4,0x71FE,/* 0xA0-0xA7 */ + 0x76DC,0x7779,0x79B1,0x7A3B,0x8404,0x89A9,0x8CED,0x8DF3,/* 0xA8-0xAF */ + 0x8E48,0x9003,0x9014,0x9053,0x90FD,0x934D,0x9676,0x97DC,/* 0xB0-0xB7 */ + 0x6BD2,0x7006,0x7258,0x72A2,0x7368,0x7763,0x79BF,0x7BE4,/* 0xB8-0xBF */ + 0x7E9B,0x8B80,0x58A9,0x60C7,0x6566,0x65FD,0x66BE,0x6C8C,/* 0xC0-0xC7 */ + 0x711E,0x71C9,0x8C5A,0x9813,0x4E6D,0x7A81,0x4EDD,0x51AC,/* 0xC8-0xCF */ + 0x51CD,0x52D5,0x540C,0x61A7,0x6771,0x6850,0x68DF,0x6D1E,/* 0xD0-0xD7 */ + 0x6F7C,0x75BC,0x77B3,0x7AE5,0x80F4,0x8463,0x9285,0x515C,/* 0xD8-0xDF */ + 0x6597,0x675C,0x6793,0x75D8,0x7AC7,0x8373,0xF95A,0x8C46,/* 0xE0-0xE7 */ + 0x9017,0x982D,0x5C6F,0x81C0,0x829A,0x9041,0x906F,0x920D,/* 0xE8-0xEF */ + 0x5F97,0x5D9D,0x6A59,0x71C8,0x767B,0x7B49,0x85E4,0x8B04,/* 0xF0-0xF7 */ + 0x9127,0x9A30,0x5587,0x61F6,0xF95B,0x7669,0x7F85,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x863F,0x87BA,0x88F8,0x908F,0xF95C,0x6D1B,0x70D9,/* 0xA0-0xA7 */ + 0x73DE,0x7D61,0x843D,0xF95D,0x916A,0x99F1,0xF95E,0x4E82,/* 0xA8-0xAF */ + 0x5375,0x6B04,0x6B12,0x703E,0x721B,0x862D,0x9E1E,0x524C,/* 0xB0-0xB7 */ + 0x8FA3,0x5D50,0x64E5,0x652C,0x6B16,0x6FEB,0x7C43,0x7E9C,/* 0xB8-0xBF */ + 0x85CD,0x8964,0x89BD,0x62C9,0x81D8,0x881F,0x5ECA,0x6717,/* 0xC0-0xC7 */ + 0x6D6A,0x72FC,0x7405,0x746F,0x8782,0x90DE,0x4F86,0x5D0D,/* 0xC8-0xCF */ + 0x5FA0,0x840A,0x51B7,0x63A0,0x7565,0x4EAE,0x5006,0x5169,/* 0xD0-0xD7 */ + 0x51C9,0x6881,0x6A11,0x7CAE,0x7CB1,0x7CE7,0x826F,0x8AD2,/* 0xD8-0xDF */ + 0x8F1B,0x91CF,0x4FB6,0x5137,0x52F5,0x5442,0x5EEC,0x616E,/* 0xE0-0xE7 */ + 0x623E,0x65C5,0x6ADA,0x6FFE,0x792A,0x85DC,0x8823,0x95AD,/* 0xE8-0xEF */ + 0x9A62,0x9A6A,0x9E97,0x9ECE,0x529B,0x66C6,0x6B77,0x701D,/* 0xF0-0xF7 */ + 0x792B,0x8F62,0x9742,0x6190,0x6200,0x6523,0x6F23,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7149,0x7489,0x7DF4,0x806F,0x84EE,0x8F26,0x9023,/* 0xA0-0xA7 */ + 0x934A,0x51BD,0x5217,0x52A3,0x6D0C,0x70C8,0x88C2,0x5EC9,/* 0xA8-0xAF */ + 0x6582,0x6BAE,0x6FC2,0x7C3E,0x7375,0x4EE4,0x4F36,0x56F9,/* 0xB0-0xB7 */ + 0xF95F,0x5CBA,0x5DBA,0x601C,0x73B2,0x7B2D,0x7F9A,0x7FCE,/* 0xB8-0xBF */ + 0x8046,0x901E,0x9234,0x96F6,0x9748,0x9818,0x9F61,0x4F8B,/* 0xC0-0xC7 */ + 0x6FA7,0x79AE,0x91B4,0x96B7,0x52DE,0xF960,0x6488,0x64C4,/* 0xC8-0xCF */ + 0x6AD3,0x6F5E,0x7018,0x7210,0x76E7,0x8001,0x8606,0x865C,/* 0xD0-0xD7 */ + 0x8DEF,0x8F05,0x9732,0x9B6F,0x9DFA,0x9E75,0x788C,0x797F,/* 0xD8-0xDF */ + 0x7DA0,0x83C9,0x9304,0x9E7F,0x9E93,0x8AD6,0x58DF,0x5F04,/* 0xE0-0xE7 */ + 0x6727,0x7027,0x74CF,0x7C60,0x807E,0x5121,0x7028,0x7262,/* 0xE8-0xEF */ + 0x78CA,0x8CC2,0x8CDA,0x8CF4,0x96F7,0x4E86,0x50DA,0x5BEE,/* 0xF0-0xF7 */ + 0x5ED6,0x6599,0x71CE,0x7642,0x77AD,0x804A,0x84FC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t 
c2u_D7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x907C,0x9B27,0x9F8D,0x58D8,0x5A41,0x5C62,0x6A13,/* 0xA0-0xA7 */ + 0x6DDA,0x6F0F,0x763B,0x7D2F,0x7E37,0x851E,0x8938,0x93E4,/* 0xA8-0xAF */ + 0x964B,0x5289,0x65D2,0x67F3,0x69B4,0x6D41,0x6E9C,0x700F,/* 0xB0-0xB7 */ + 0x7409,0x7460,0x7559,0x7624,0x786B,0x8B2C,0x985E,0x516D,/* 0xB8-0xBF */ + 0x622E,0x9678,0x4F96,0x502B,0x5D19,0x6DEA,0x7DB8,0x8F2A,/* 0xC0-0xC7 */ + 0x5F8B,0x6144,0x6817,0xF961,0x9686,0x52D2,0x808B,0x51DC,/* 0xC8-0xCF */ + 0x51CC,0x695E,0x7A1C,0x7DBE,0x83F1,0x9675,0x4FDA,0x5229,/* 0xD0-0xD7 */ + 0x5398,0x540F,0x550E,0x5C65,0x60A7,0x674E,0x68A8,0x6D6C,/* 0xD8-0xDF */ + 0x7281,0x72F8,0x7406,0x7483,0xF962,0x75E2,0x7C6C,0x7F79,/* 0xE0-0xE7 */ + 0x7FB8,0x8389,0x88CF,0x88E1,0x91CC,0x91D0,0x96E2,0x9BC9,/* 0xE8-0xEF */ + 0x541D,0x6F7E,0x71D0,0x7498,0x85FA,0x8EAA,0x96A3,0x9C57,/* 0xF0-0xF7 */ + 0x9E9F,0x6797,0x6DCB,0x7433,0x81E8,0x9716,0x782C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7ACB,0x7B20,0x7C92,0x6469,0x746A,0x75F2,0x78BC,/* 0xA0-0xA7 */ + 0x78E8,0x99AC,0x9B54,0x9EBB,0x5BDE,0x5E55,0x6F20,0x819C,/* 0xA8-0xAF */ + 0x83AB,0x9088,0x4E07,0x534D,0x5A29,0x5DD2,0x5F4E,0x6162,/* 0xB0-0xB7 */ + 0x633D,0x6669,0x66FC,0x6EFF,0x6F2B,0x7063,0x779E,0x842C,/* 0xB8-0xBF */ + 0x8513,0x883B,0x8F13,0x9945,0x9C3B,0x551C,0x62B9,0x672B,/* 0xC0-0xC7 */ + 0x6CAB,0x8309,0x896A,0x977A,0x4EA1,0x5984,0x5FD8,0x5FD9,/* 0xC8-0xCF */ + 0x671B,0x7DB2,0x7F54,0x8292,0x832B,0x83BD,0x8F1E,0x9099,/* 0xD0-0xD7 */ + 0x57CB,0x59B9,0x5A92,0x5BD0,0x6627,0x679A,0x6885,0x6BCF,/* 0xD8-0xDF */ + 0x7164,0x7F75,0x8CB7,0x8CE3,0x9081,0x9B45,0x8108,0x8C8A,/* 0xE0-0xE7 */ + 0x964C,0x9A40,0x9EA5,0x5B5F,0x6C13,0x731B,0x76F2,0x76DF,/* 0xE8-0xEF */ + 0x840C,0x51AA,0x8993,0x514D,0x5195,0x52C9,0x68C9,0x6C94,/* 0xF0-0xF7 */ + 0x7704,0x7720,0x7DBF,0x7DEC,0x9762,0x9EB5,0x6EC5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8511,0x51A5,0x540D,0x547D,0x660E,0x669D,0x6927,/* 0xA0-0xA7 */ + 0x6E9F,0x76BF,0x7791,0x8317,0x84C2,0x879F,0x9169,0x9298,/* 0xA8-0xAF */ + 0x9CF4,0x8882,0x4FAE,0x5192,0x52DF,0x59C6,0x5E3D,0x6155,/* 0xB0-0xB7 */ + 0x6478,0x6479,0x66AE,0x67D0,0x6A21,0x6BCD,0x6BDB,0x725F,/* 0xB8-0xBF */ + 0x7261,0x7441,0x7738,0x77DB,0x8017,0x82BC,0x8305,0x8B00,/* 0xC0-0xC7 */ + 0x8B28,0x8C8C,0x6728,0x6C90,0x7267,0x76EE,0x7766,0x7A46,/* 0xC8-0xCF */ + 0x9DA9,0x6B7F,0x6C92,0x5922,0x6726,0x8499,0x536F,0x5893,/* 0xD0-0xD7 */ + 0x5999,0x5EDF,0x63CF,0x6634,0x6773,0x6E3A,0x732B,0x7AD7,/* 0xD8-0xDF */ + 0x82D7,0x9328,0x52D9,0x5DEB,0x61AE,0x61CB,0x620A,0x62C7,/* 0xE0-0xE7 */ + 0x64AB,0x65E0,0x6959,0x6B66,0x6BCB,0x7121,0x73F7,0x755D,/* 0xE8-0xEF */ + 
0x7E46,0x821E,0x8302,0x856A,0x8AA3,0x8CBF,0x9727,0x9D61,/* 0xF0-0xF7 */ + 0x58A8,0x9ED8,0x5011,0x520E,0x543B,0x554F,0x6587,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6C76,0x7D0A,0x7D0B,0x805E,0x868A,0x9580,0x96EF,/* 0xA0-0xA7 */ + 0x52FF,0x6C95,0x7269,0x5473,0x5A9A,0x5C3E,0x5D4B,0x5F4C,/* 0xA8-0xAF */ + 0x5FAE,0x672A,0x68B6,0x6963,0x6E3C,0x6E44,0x7709,0x7C73,/* 0xB0-0xB7 */ + 0x7F8E,0x8587,0x8B0E,0x8FF7,0x9761,0x9EF4,0x5CB7,0x60B6,/* 0xB8-0xBF */ + 0x610D,0x61AB,0x654F,0x65FB,0x65FC,0x6C11,0x6CEF,0x739F,/* 0xC0-0xC7 */ + 0x73C9,0x7DE1,0x9594,0x5BC6,0x871C,0x8B10,0x525D,0x535A,/* 0xC8-0xCF */ + 0x62CD,0x640F,0x64B2,0x6734,0x6A38,0x6CCA,0x73C0,0x749E,/* 0xD0-0xD7 */ + 0x7B94,0x7C95,0x7E1B,0x818A,0x8236,0x8584,0x8FEB,0x96F9,/* 0xD8-0xDF */ + 0x99C1,0x4F34,0x534A,0x53CD,0x53DB,0x62CC,0x642C,0x6500,/* 0xE0-0xE7 */ + 0x6591,0x69C3,0x6CEE,0x6F58,0x73ED,0x7554,0x7622,0x76E4,/* 0xE8-0xEF */ + 0x76FC,0x78D0,0x78FB,0x792C,0x7D46,0x822C,0x87E0,0x8FD4,/* 0xF0-0xF7 */ + 0x9812,0x98EF,0x52C3,0x62D4,0x64A5,0x6E24,0x6F51,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x767C,0x8DCB,0x91B1,0x9262,0x9AEE,0x9B43,0x5023,/* 0xA0-0xA7 */ + 0x508D,0x574A,0x59A8,0x5C28,0x5E47,0x5F77,0x623F,0x653E,/* 0xA8-0xAF */ + 0x65B9,0x65C1,0x6609,0x678B,0x699C,0x6EC2,0x78C5,0x7D21,/* 0xB0-0xB7 */ + 0x80AA,0x8180,0x822B,0x82B3,0x84A1,0x868C,0x8A2A,0x8B17,/* 0xB8-0xBF */ + 0x90A6,0x9632,0x9F90,0x500D,0x4FF3,0xF963,0x57F9,0x5F98,/* 0xC0-0xC7 */ + 0x62DC,0x6392,0x676F,0x6E43,0x7119,0x76C3,0x80CC,0x80DA,/* 0xC8-0xCF */ + 0x88F4,0x88F5,0x8919,0x8CE0,0x8F29,0x914D,0x966A,0x4F2F,/* 0xD0-0xD7 */ + 0x4F70,0x5E1B,0x67CF,0x6822,0x767D,0x767E,0x9B44,0x5E61,/* 0xD8-0xDF */ + 0x6A0A,0x7169,0x71D4,0x756A,0xF964,0x7E41,0x8543,0x85E9,/* 0xE0-0xE7 */ + 0x98DC,0x4F10,0x7B4F,0x7F70,0x95A5,0x51E1,0x5E06,0x68B5,/* 0xE8-0xEF */ + 0x6C3E,0x6C4E,0x6CDB,0x72AF,0x7BC4,0x8303,0x6CD5,0x743A,/* 0xF0-0xF7 */ + 0x50FB,0x5288,0x58C1,0x64D8,0x6A97,0x74A7,0x7656,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x78A7,0x8617,0x95E2,0x9739,0xF965,0x535E,0x5F01,/* 0xA0-0xA7 */ + 0x8B8A,0x8FA8,0x8FAF,0x908A,0x5225,0x77A5,0x9C49,0x9F08,/* 0xA8-0xAF */ + 0x4E19,0x5002,0x5175,0x5C5B,0x5E77,0x661E,0x663A,0x67C4,/* 0xB0-0xB7 */ + 0x68C5,0x70B3,0x7501,0x75C5,0x79C9,0x7ADD,0x8F27,0x9920,/* 0xB8-0xBF */ + 0x9A08,0x4FDD,0x5821,0x5831,0x5BF6,0x666E,0x6B65,0x6D11,/* 0xC0-0xC7 */ + 0x6E7A,0x6F7D,0x73E4,0x752B,0x83E9,0x88DC,0x8913,0x8B5C,/* 0xC8-0xCF */ + 0x8F14,0x4F0F,0x50D5,0x5310,0x535C,0x5B93,0x5FA9,0x670D,/* 0xD0-0xD7 */ + 0x798F,0x8179,0x832F,0x8514,0x8907,0x8986,0x8F39,0x8F3B,/* 0xD8-0xDF */ + 
0x99A5,0x9C12,0x672C,0x4E76,0x4FF8,0x5949,0x5C01,0x5CEF,/* 0xE0-0xE7 */ + 0x5CF0,0x6367,0x68D2,0x70FD,0x71A2,0x742B,0x7E2B,0x84EC,/* 0xE8-0xEF */ + 0x8702,0x9022,0x92D2,0x9CF3,0x4E0D,0x4ED8,0x4FEF,0x5085,/* 0xF0-0xF7 */ + 0x5256,0x526F,0x5426,0x5490,0x57E0,0x592B,0x5A66,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5B5A,0x5B75,0x5BCC,0x5E9C,0xF966,0x6276,0x6577,/* 0xA0-0xA7 */ + 0x65A7,0x6D6E,0x6EA5,0x7236,0x7B26,0x7C3F,0x7F36,0x8150,/* 0xA8-0xAF */ + 0x8151,0x819A,0x8240,0x8299,0x83A9,0x8A03,0x8CA0,0x8CE6,/* 0xB0-0xB7 */ + 0x8CFB,0x8D74,0x8DBA,0x90E8,0x91DC,0x961C,0x9644,0x99D9,/* 0xB8-0xBF */ + 0x9CE7,0x5317,0x5206,0x5429,0x5674,0x58B3,0x5954,0x596E,/* 0xC0-0xC7 */ + 0x5FFF,0x61A4,0x626E,0x6610,0x6C7E,0x711A,0x76C6,0x7C89,/* 0xC8-0xCF */ + 0x7CDE,0x7D1B,0x82AC,0x8CC1,0x96F0,0xF967,0x4F5B,0x5F17,/* 0xD0-0xD7 */ + 0x5F7F,0x62C2,0x5D29,0x670B,0x68DA,0x787C,0x7E43,0x9D6C,/* 0xD8-0xDF */ + 0x4E15,0x5099,0x5315,0x532A,0x5351,0x5983,0x5A62,0x5E87,/* 0xE0-0xE7 */ + 0x60B2,0x618A,0x6249,0x6279,0x6590,0x6787,0x69A7,0x6BD4,/* 0xE8-0xEF */ + 0x6BD6,0x6BD7,0x6BD8,0x6CB8,0xF968,0x7435,0x75FA,0x7812,/* 0xF0-0xF7 */ + 0x7891,0x79D5,0x79D8,0x7C83,0x7DCB,0x7FE1,0x80A5,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x813E,0x81C2,0x83F2,0x871A,0x88E8,0x8AB9,0x8B6C,/* 0xA0-0xA7 */ + 0x8CBB,0x9119,0x975E,0x98DB,0x9F3B,0x56AC,0x5B2A,0x5F6C,/* 0xA8-0xAF */ + 0x658C,0x6AB3,0x6BAF,0x6D5C,0x6FF1,0x7015,0x725D,0x73AD,/* 0xB0-0xB7 */ + 0x8CA7,0x8CD3,0x983B,0x6191,0x6C37,0x8058,0x9A01,0x4E4D,/* 0xB8-0xBF */ + 0x4E8B,0x4E9B,0x4ED5,0x4F3A,0x4F3C,0x4F7F,0x4FDF,0x50FF,/* 0xC0-0xC7 */ + 0x53F2,0x53F8,0x5506,0x55E3,0x56DB,0x58EB,0x5962,0x5A11,/* 0xC8-0xCF */ + 0x5BEB,0x5BFA,0x5C04,0x5DF3,0x5E2B,0x5F99,0x601D,0x6368,/* 0xD0-0xD7 */ + 0x659C,0x65AF,0x67F6,0x67FB,0x68AD,0x6B7B,0x6C99,0x6CD7,/* 0xD8-0xDF */ + 0x6E23,0x7009,0x7345,0x7802,0x793E,0x7940,0x7960,0x79C1,/* 0xE0-0xE7 */ + 0x7BE9,0x7D17,0x7D72,0x8086,0x820D,0x838E,0x84D1,0x86C7,/* 0xE8-0xEF */ + 0x88DF,0x8A50,0x8A5E,0x8B1D,0x8CDC,0x8D66,0x8FAD,0x90AA,/* 0xF0-0xF7 */ + 0x98FC,0x99DF,0x9E9D,0x524A,0xF969,0x6714,0xF96A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5098,0x522A,0x5C71,0x6563,0x6C55,0x73CA,0x7523,/* 0xA0-0xA7 */ + 0x759D,0x7B97,0x849C,0x9178,0x9730,0x4E77,0x6492,0x6BBA,/* 0xA8-0xAF */ + 0x715E,0x85A9,0x4E09,0xF96B,0x6749,0x68EE,0x6E17,0x829F,/* 0xB0-0xB7 */ + 0x8518,0x886B,0x63F7,0x6F81,0x9212,0x98AF,0x4E0A,0x50B7,/* 0xB8-0xBF */ + 0x50CF,0x511F,0x5546,0x55AA,0x5617,0x5B40,0x5C19,0x5CE0,/* 0xC0-0xC7 */ + 0x5E38,0x5E8A,0x5EA0,0x5EC2,0x60F3,0x6851,0x6A61,0x6E58,/* 0xC8-0xCF */ + 
0x723D,0x7240,0x72C0,0x76F8,0x7965,0x7BB1,0x7FD4,0x88F3,/* 0xD0-0xD7 */ + 0x89F4,0x8A73,0x8C61,0x8CDE,0x971C,0x585E,0x74BD,0x8CFD,/* 0xD8-0xDF */ + 0x55C7,0xF96C,0x7A61,0x7D22,0x8272,0x7272,0x751F,0x7525,/* 0xE0-0xE7 */ + 0xF96D,0x7B19,0x5885,0x58FB,0x5DBC,0x5E8F,0x5EB6,0x5F90,/* 0xE8-0xEF */ + 0x6055,0x6292,0x637F,0x654D,0x6691,0x66D9,0x66F8,0x6816,/* 0xF0-0xF7 */ + 0x68F2,0x7280,0x745E,0x7B6E,0x7D6E,0x7DD6,0x7F72,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x80E5,0x8212,0x85AF,0x897F,0x8A93,0x901D,0x92E4,/* 0xA0-0xA7 */ + 0x9ECD,0x9F20,0x5915,0x596D,0x5E2D,0x60DC,0x6614,0x6673,/* 0xA8-0xAF */ + 0x6790,0x6C50,0x6DC5,0x6F5F,0x77F3,0x78A9,0x84C6,0x91CB,/* 0xB0-0xB7 */ + 0x932B,0x4ED9,0x50CA,0x5148,0x5584,0x5B0B,0x5BA3,0x6247,/* 0xB8-0xBF */ + 0x657E,0x65CB,0x6E32,0x717D,0x7401,0x7444,0x7487,0x74BF,/* 0xC0-0xC7 */ + 0x766C,0x79AA,0x7DDA,0x7E55,0x7FA8,0x817A,0x81B3,0x8239,/* 0xC8-0xCF */ + 0x861A,0x87EC,0x8A75,0x8DE3,0x9078,0x9291,0x9425,0x994D,/* 0xD0-0xD7 */ + 0x9BAE,0x5368,0x5C51,0x6954,0x6CC4,0x6D29,0x6E2B,0x820C,/* 0xD8-0xDF */ + 0x859B,0x893B,0x8A2D,0x8AAA,0x96EA,0x9F67,0x5261,0x66B9,/* 0xE0-0xE7 */ + 0x6BB2,0x7E96,0x87FE,0x8D0D,0x9583,0x965D,0x651D,0x6D89,/* 0xE8-0xEF */ + 0x71EE,0xF96E,0x57CE,0x59D3,0x5BAC,0x6027,0x60FA,0x6210,/* 0xF0-0xF7 */ + 0x661F,0x665F,0x7329,0x73F9,0x76DB,0x7701,0x7B6C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8056,0x8072,0x8165,0x8AA0,0x9192,0x4E16,0x52E2,/* 0xA0-0xA7 */ + 0x6B72,0x6D17,0x7A05,0x7B39,0x7D30,0xF96F,0x8CB0,0x53EC,/* 0xA8-0xAF */ + 0x562F,0x5851,0x5BB5,0x5C0F,0x5C11,0x5DE2,0x6240,0x6383,/* 0xB0-0xB7 */ + 0x6414,0x662D,0x68B3,0x6CBC,0x6D88,0x6EAF,0x701F,0x70A4,/* 0xB8-0xBF */ + 0x71D2,0x7526,0x758F,0x758E,0x7619,0x7B11,0x7BE0,0x7C2B,/* 0xC0-0xC7 */ + 0x7D20,0x7D39,0x852C,0x856D,0x8607,0x8A34,0x900D,0x9061,/* 0xC8-0xCF */ + 0x90B5,0x92B7,0x97F6,0x9A37,0x4FD7,0x5C6C,0x675F,0x6D91,/* 0xD0-0xD7 */ + 0x7C9F,0x7E8C,0x8B16,0x8D16,0x901F,0x5B6B,0x5DFD,0x640D,/* 0xD8-0xDF */ + 0x84C0,0x905C,0x98E1,0x7387,0x5B8B,0x609A,0x677E,0x6DDE,/* 0xE0-0xE7 */ + 0x8A1F,0x8AA6,0x9001,0x980C,0x5237,0xF970,0x7051,0x788E,/* 0xE8-0xEF */ + 0x9396,0x8870,0x91D7,0x4FEE,0x53D7,0x55FD,0x56DA,0x5782,/* 0xF0-0xF7 */ + 0x58FD,0x5AC2,0x5B88,0x5CAB,0x5CC0,0x5E25,0x6101,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x620D,0x624B,0x6388,0x641C,0x6536,0x6578,0x6A39,/* 0xA0-0xA7 */ + 0x6B8A,0x6C34,0x6D19,0x6F31,0x71E7,0x72E9,0x7378,0x7407,/* 0xA8-0xAF */ + 0x74B2,0x7626,0x7761,0x79C0,0x7A57,0x7AEA,0x7CB9,0x7D8F,/* 0xB0-0xB7 */ + 0x7DAC,0x7E61,0x7F9E,0x8129,0x8331,0x8490,0x84DA,0x85EA,/* 0xB8-0xBF */ + 
0x8896,0x8AB0,0x8B90,0x8F38,0x9042,0x9083,0x916C,0x9296,/* 0xC0-0xC7 */ + 0x92B9,0x968B,0x96A7,0x96A8,0x96D6,0x9700,0x9808,0x9996,/* 0xC8-0xCF */ + 0x9AD3,0x9B1A,0x53D4,0x587E,0x5919,0x5B70,0x5BBF,0x6DD1,/* 0xD0-0xD7 */ + 0x6F5A,0x719F,0x7421,0x74B9,0x8085,0x83FD,0x5DE1,0x5F87,/* 0xD8-0xDF */ + 0x5FAA,0x6042,0x65EC,0x6812,0x696F,0x6A53,0x6B89,0x6D35,/* 0xE0-0xE7 */ + 0x6DF3,0x73E3,0x76FE,0x77AC,0x7B4D,0x7D14,0x8123,0x821C,/* 0xE8-0xEF */ + 0x8340,0x84F4,0x8563,0x8A62,0x8AC4,0x9187,0x931E,0x9806,/* 0xF0-0xF7 */ + 0x99B4,0x620C,0x8853,0x8FF0,0x9265,0x5D07,0x5D27,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5D69,0x745F,0x819D,0x8768,0x6FD5,0x62FE,0x7FD2,/* 0xA0-0xA7 */ + 0x8936,0x8972,0x4E1E,0x4E58,0x50E7,0x52DD,0x5347,0x627F,/* 0xA8-0xAF */ + 0x6607,0x7E69,0x8805,0x965E,0x4F8D,0x5319,0x5636,0x59CB,/* 0xB0-0xB7 */ + 0x5AA4,0x5C38,0x5C4E,0x5C4D,0x5E02,0x5F11,0x6043,0x65BD,/* 0xB8-0xBF */ + 0x662F,0x6642,0x67BE,0x67F4,0x731C,0x77E2,0x793A,0x7FC5,/* 0xC0-0xC7 */ + 0x8494,0x84CD,0x8996,0x8A66,0x8A69,0x8AE1,0x8C55,0x8C7A,/* 0xC8-0xCF */ + 0x57F4,0x5BD4,0x5F0F,0x606F,0x62ED,0x690D,0x6B96,0x6E5C,/* 0xD0-0xD7 */ + 0x7184,0x7BD2,0x8755,0x8B58,0x8EFE,0x98DF,0x98FE,0x4F38,/* 0xD8-0xDF */ + 0x4F81,0x4FE1,0x547B,0x5A20,0x5BB8,0x613C,0x65B0,0x6668,/* 0xE0-0xE7 */ + 0x71FC,0x7533,0x795E,0x7D33,0x814E,0x81E3,0x8398,0x85AA,/* 0xE8-0xEF */ + 0x85CE,0x8703,0x8A0A,0x8EAB,0x8F9B,0xF971,0x8FC5,0x5931,/* 0xF0-0xF7 */ + 0x5BA4,0x5BE6,0x6089,0x5BE9,0x5C0B,0x5FC3,0x6C81,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF972,0x6DF1,0x700B,0x751A,0x82AF,0x8AF6,0x4EC0,/* 0xA0-0xA7 */ + 0x5341,0xF973,0x96D9,0x6C0F,0x4E9E,0x4FC4,0x5152,0x555E,/* 0xA8-0xAF */ + 0x5A25,0x5CE8,0x6211,0x7259,0x82BD,0x83AA,0x86FE,0x8859,/* 0xB0-0xB7 */ + 0x8A1D,0x963F,0x96C5,0x9913,0x9D09,0x9D5D,0x580A,0x5CB3,/* 0xB8-0xBF */ + 0x5DBD,0x5E44,0x60E1,0x6115,0x63E1,0x6A02,0x6E25,0x9102,/* 0xC0-0xC7 */ + 0x9354,0x984E,0x9C10,0x9F77,0x5B89,0x5CB8,0x6309,0x664F,/* 0xC8-0xCF */ + 0x6848,0x773C,0x96C1,0x978D,0x9854,0x9B9F,0x65A1,0x8B01,/* 0xD0-0xD7 */ + 0x8ECB,0x95BC,0x5535,0x5CA9,0x5DD6,0x5EB5,0x6697,0x764C,/* 0xD8-0xDF */ + 0x83F4,0x95C7,0x58D3,0x62BC,0x72CE,0x9D28,0x4EF0,0x592E,/* 0xE0-0xE7 */ + 0x600F,0x663B,0x6B83,0x79E7,0x9D26,0x5393,0x54C0,0x57C3,/* 0xE8-0xEF */ + 0x5D16,0x611B,0x66D6,0x6DAF,0x788D,0x827E,0x9698,0x9744,/* 0xF0-0xF7 */ + 0x5384,0x627C,0x6396,0x6DB2,0x7E0A,0x814B,0x984D,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6AFB,0x7F4C,0x9DAF,0x9E1A,0x4E5F,0x503B,0x51B6,/* 0xA0-0xA7 */ + 0x591C,0x60F9,0x63F6,0x6930,0x723A,0x8036,0xF974,0x91CE,/* 0xA8-0xAF */ + 
0x5F31,0xF975,0xF976,0x7D04,0x82E5,0x846F,0x84BB,0x85E5,/* 0xB0-0xB7 */ + 0x8E8D,0xF977,0x4F6F,0xF978,0xF979,0x58E4,0x5B43,0x6059,/* 0xB8-0xBF */ + 0x63DA,0x6518,0x656D,0x6698,0xF97A,0x694A,0x6A23,0x6D0B,/* 0xC0-0xC7 */ + 0x7001,0x716C,0x75D2,0x760D,0x79B3,0x7A70,0xF97B,0x7F8A,/* 0xC8-0xCF */ + 0xF97C,0x8944,0xF97D,0x8B93,0x91C0,0x967D,0xF97E,0x990A,/* 0xD0-0xD7 */ + 0x5704,0x5FA1,0x65BC,0x6F01,0x7600,0x79A6,0x8A9E,0x99AD,/* 0xD8-0xDF */ + 0x9B5A,0x9F6C,0x5104,0x61B6,0x6291,0x6A8D,0x81C6,0x5043,/* 0xE0-0xE7 */ + 0x5830,0x5F66,0x7109,0x8A00,0x8AFA,0x5B7C,0x8616,0x4FFA,/* 0xE8-0xEF */ + 0x513C,0x56B4,0x5944,0x63A9,0x6DF9,0x5DAA,0x696D,0x5186,/* 0xF0-0xF7 */ + 0x4E88,0x4F59,0xF97F,0xF980,0xF981,0x5982,0xF982,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF983,0x6B5F,0x6C5D,0xF984,0x74B5,0x7916,0xF985,/* 0xA0-0xA7 */ + 0x8207,0x8245,0x8339,0x8F3F,0x8F5D,0xF986,0x9918,0xF987,/* 0xA8-0xAF */ + 0xF988,0xF989,0x4EA6,0xF98A,0x57DF,0x5F79,0x6613,0xF98B,/* 0xB0-0xB7 */ + 0xF98C,0x75AB,0x7E79,0x8B6F,0xF98D,0x9006,0x9A5B,0x56A5,/* 0xB8-0xBF */ + 0x5827,0x59F8,0x5A1F,0x5BB4,0xF98E,0x5EF6,0xF98F,0xF990,/* 0xC0-0xC7 */ + 0x6350,0x633B,0xF991,0x693D,0x6C87,0x6CBF,0x6D8E,0x6D93,/* 0xC8-0xCF */ + 0x6DF5,0x6F14,0xF992,0x70DF,0x7136,0x7159,0xF993,0x71C3,/* 0xD0-0xD7 */ + 0x71D5,0xF994,0x784F,0x786F,0xF995,0x7B75,0x7DE3,0xF996,/* 0xD8-0xDF */ + 0x7E2F,0xF997,0x884D,0x8EDF,0xF998,0xF999,0xF99A,0x925B,/* 0xE0-0xE7 */ + 0xF99B,0x9CF6,0xF99C,0xF99D,0xF99E,0x6085,0x6D85,0xF99F,/* 0xE8-0xEF */ + 0x71B1,0xF9A0,0xF9A1,0x95B1,0x53AD,0xF9A2,0xF9A3,0xF9A4,/* 0xF0-0xF7 */ + 0x67D3,0xF9A5,0x708E,0x7130,0x7430,0x8276,0x82D2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF9A6,0x95BB,0x9AE5,0x9E7D,0x66C4,0xF9A7,0x71C1,/* 0xA0-0xA7 */ + 0x8449,0xF9A8,0xF9A9,0x584B,0xF9AA,0xF9AB,0x5DB8,0x5F71,/* 0xA8-0xAF */ + 0xF9AC,0x6620,0x668E,0x6979,0x69AE,0x6C38,0x6CF3,0x6E36,/* 0xB0-0xB7 */ + 0x6F41,0x6FDA,0x701B,0x702F,0x7150,0x71DF,0x7370,0xF9AD,/* 0xB8-0xBF */ + 0x745B,0xF9AE,0x74D4,0x76C8,0x7A4E,0x7E93,0xF9AF,0xF9B0,/* 0xC0-0xC7 */ + 0x82F1,0x8A60,0x8FCE,0xF9B1,0x9348,0xF9B2,0x9719,0xF9B3,/* 0xC8-0xCF */ + 0xF9B4,0x4E42,0x502A,0xF9B5,0x5208,0x53E1,0x66F3,0x6C6D,/* 0xD0-0xD7 */ + 0x6FCA,0x730A,0x777F,0x7A62,0x82AE,0x85DD,0x8602,0xF9B6,/* 0xD8-0xDF */ + 0x88D4,0x8A63,0x8B7D,0x8C6B,0xF9B7,0x92B3,0xF9B8,0x9713,/* 0xE0-0xE7 */ + 0x9810,0x4E94,0x4F0D,0x4FC9,0x50B2,0x5348,0x543E,0x5433,/* 0xE8-0xEF */ + 0x55DA,0x5862,0x58BA,0x5967,0x5A1B,0x5BE4,0x609F,0xF9B9,/* 0xF0-0xF7 */ + 0x61CA,0x6556,0x65FF,0x6664,0x68A7,0x6C5A,0x6FB3,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 
0x0000,0x70CF,0x71AC,0x7352,0x7B7D,0x8708,0x8AA4,0x9C32,/* 0xA0-0xA7 */ + 0x9F07,0x5C4B,0x6C83,0x7344,0x7389,0x923A,0x6EAB,0x7465,/* 0xA8-0xAF */ + 0x761F,0x7A69,0x7E15,0x860A,0x5140,0x58C5,0x64C1,0x74EE,/* 0xB0-0xB7 */ + 0x7515,0x7670,0x7FC1,0x9095,0x96CD,0x9954,0x6E26,0x74E6,/* 0xB8-0xBF */ + 0x7AA9,0x7AAA,0x81E5,0x86D9,0x8778,0x8A1B,0x5A49,0x5B8C,/* 0xC0-0xC7 */ + 0x5B9B,0x68A1,0x6900,0x6D63,0x73A9,0x7413,0x742C,0x7897,/* 0xC8-0xCF */ + 0x7DE9,0x7FEB,0x8118,0x8155,0x839E,0x8C4C,0x962E,0x9811,/* 0xD0-0xD7 */ + 0x66F0,0x5F80,0x65FA,0x6789,0x6C6A,0x738B,0x502D,0x5A03,/* 0xD8-0xDF */ + 0x6B6A,0x77EE,0x5916,0x5D6C,0x5DCD,0x7325,0x754F,0xF9BA,/* 0xE0-0xE7 */ + 0xF9BB,0x50E5,0x51F9,0x582F,0x592D,0x5996,0x59DA,0x5BE5,/* 0xE8-0xEF */ + 0xF9BC,0xF9BD,0x5DA2,0x62D7,0x6416,0x6493,0x64FE,0xF9BE,/* 0xF0-0xF7 */ + 0x66DC,0xF9BF,0x6A48,0xF9C0,0x71FF,0x7464,0xF9C1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7A88,0x7AAF,0x7E47,0x7E5E,0x8000,0x8170,0xF9C2,/* 0xA0-0xA7 */ + 0x87EF,0x8981,0x8B20,0x9059,0xF9C3,0x9080,0x9952,0x617E,/* 0xA8-0xAF */ + 0x6B32,0x6D74,0x7E1F,0x8925,0x8FB1,0x4FD1,0x50AD,0x5197,/* 0xB0-0xB7 */ + 0x52C7,0x57C7,0x5889,0x5BB9,0x5EB8,0x6142,0x6995,0x6D8C,/* 0xB8-0xBF */ + 0x6E67,0x6EB6,0x7194,0x7462,0x7528,0x752C,0x8073,0x8338,/* 0xC0-0xC7 */ + 0x84C9,0x8E0A,0x9394,0x93DE,0xF9C4,0x4E8E,0x4F51,0x5076,/* 0xC8-0xCF */ + 0x512A,0x53C8,0x53CB,0x53F3,0x5B87,0x5BD3,0x5C24,0x611A,/* 0xD0-0xD7 */ + 0x6182,0x65F4,0x725B,0x7397,0x7440,0x76C2,0x7950,0x7991,/* 0xD8-0xDF */ + 0x79B9,0x7D06,0x7FBD,0x828B,0x85D5,0x865E,0x8FC2,0x9047,/* 0xE0-0xE7 */ + 0x90F5,0x91EA,0x9685,0x96E8,0x96E9,0x52D6,0x5F67,0x65ED,/* 0xE8-0xEF */ + 0x6631,0x682F,0x715C,0x7A36,0x90C1,0x980A,0x4E91,0xF9C5,/* 0xF0-0xF7 */ + 0x6A52,0x6B9E,0x6F90,0x7189,0x8018,0x82B8,0x8553,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x904B,0x9695,0x96F2,0x97FB,0x851A,0x9B31,0x4E90,/* 0xA0-0xA7 */ + 0x718A,0x96C4,0x5143,0x539F,0x54E1,0x5713,0x5712,0x57A3,/* 0xA8-0xAF */ + 0x5A9B,0x5AC4,0x5BC3,0x6028,0x613F,0x63F4,0x6C85,0x6D39,/* 0xB0-0xB7 */ + 0x6E72,0x6E90,0x7230,0x733F,0x7457,0x82D1,0x8881,0x8F45,/* 0xB8-0xBF */ + 0x9060,0xF9C6,0x9662,0x9858,0x9D1B,0x6708,0x8D8A,0x925E,/* 0xC0-0xC7 */ + 0x4F4D,0x5049,0x50DE,0x5371,0x570D,0x59D4,0x5A01,0x5C09,/* 0xC8-0xCF */ + 0x6170,0x6690,0x6E2D,0x7232,0x744B,0x7DEF,0x80C3,0x840E,/* 0xD0-0xD7 */ + 0x8466,0x853F,0x875F,0x885B,0x8918,0x8B02,0x9055,0x97CB,/* 0xD8-0xDF */ + 0x9B4F,0x4E73,0x4F91,0x5112,0x516A,0xF9C7,0x552F,0x55A9,/* 0xE0-0xE7 */ + 0x5B7A,0x5BA5,0x5E7C,0x5E7D,0x5EBE,0x60A0,0x60DF,0x6108,/* 0xE8-0xEF */ + 0x6109,0x63C4,0x6538,0x6709,0xF9C8,0x67D4,0x67DA,0xF9C9,/* 0xF0-0xF7 */ + 0x6961,0x6962,0x6CB9,0x6D27,0xF9CA,0x6E38,0xF9CB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6FE1,0x7336,0x7337,0xF9CC,0x745C,0x7531,0xF9CD,/* 0xA0-0xA7 */ + 0x7652,0xF9CE,0xF9CF,0x7DAD,0x81FE,0x8438,0x88D5,0x8A98,/* 0xA8-0xAF */ + 0x8ADB,0x8AED,0x8E30,0x8E42,0x904A,0x903E,0x907A,0x9149,/* 0xB0-0xB7 */ + 0x91C9,0x936E,0xF9D0,0xF9D1,0x5809,0xF9D2,0x6BD3,0x8089,/* 0xB8-0xBF */ + 0x80B2,0xF9D3,0xF9D4,0x5141,0x596B,0x5C39,0xF9D5,0xF9D6,/* 0xC0-0xC7 */ + 0x6F64,0x73A7,0x80E4,0x8D07,0xF9D7,0x9217,0x958F,0xF9D8,/* 0xC8-0xCF */ + 0xF9D9,0xF9DA,0xF9DB,0x807F,0x620E,0x701C,0x7D68,0x878D,/* 0xD0-0xD7 */ + 0xF9DC,0x57A0,0x6069,0x6147,0x6BB7,0x8ABE,0x9280,0x96B1,/* 0xD8-0xDF */ + 0x4E59,0x541F,0x6DEB,0x852D,0x9670,0x97F3,0x98EE,0x63D6,/* 0xE0-0xE7 */ + 0x6CE3,0x9091,0x51DD,0x61C9,0x81BA,0x9DF9,0x4F9D,0x501A,/* 0xE8-0xEF */ + 0x5100,0x5B9C,0x610F,0x61FF,0x64EC,0x6905,0x6BC5,0x7591,/* 0xF0-0xF7 */ + 0x77E3,0x7FA9,0x8264,0x858F,0x87FB,0x8863,0x8ABC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8B70,0x91AB,0x4E8C,0x4EE5,0x4F0A,0xF9DD,0xF9DE,/* 0xA0-0xA7 */ + 0x5937,0x59E8,0xF9DF,0x5DF2,0x5F1B,0x5F5B,0x6021,0xF9E0,/* 0xA8-0xAF */ + 0xF9E1,0xF9E2,0xF9E3,0x723E,0x73E5,0xF9E4,0x7570,0x75CD,/* 0xB0-0xB7 */ + 0xF9E5,0x79FB,0xF9E6,0x800C,0x8033,0x8084,0x82E1,0x8351,/* 0xB8-0xBF */ + 0xF9E7,0xF9E8,0x8CBD,0x8CB3,0x9087,0xF9E9,0xF9EA,0x98F4,/* 0xC0-0xC7 */ + 0x990C,0xF9EB,0xF9EC,0x7037,0x76CA,0x7FCA,0x7FCC,0x7FFC,/* 0xC8-0xCF */ + 0x8B1A,0x4EBA,0x4EC1,0x5203,0x5370,0xF9ED,0x54BD,0x56E0,/* 0xD0-0xD7 */ + 0x59FB,0x5BC5,0x5F15,0x5FCD,0x6E6E,0xF9EE,0xF9EF,0x7D6A,/* 0xD8-0xDF */ + 0x8335,0xF9F0,0x8693,0x8A8D,0xF9F1,0x976D,0x9777,0xF9F2,/* 0xE0-0xE7 */ + 0xF9F3,0x4E00,0x4F5A,0x4F7E,0x58F9,0x65E5,0x6EA2,0x9038,/* 0xE8-0xEF */ + 0x93B0,0x99B9,0x4EFB,0x58EC,0x598A,0x59D9,0x6041,0xF9F4,/* 0xF0-0xF7 */ + 0xF9F5,0x7A14,0xF9F6,0x834F,0x8CC3,0x5165,0x5344,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xF9F7,0xF9F8,0xF9F9,0x4ECD,0x5269,0x5B55,0x82BF,/* 0xA0-0xA7 */ + 0x4ED4,0x523A,0x54A8,0x59C9,0x59FF,0x5B50,0x5B57,0x5B5C,/* 0xA8-0xAF */ + 0x6063,0x6148,0x6ECB,0x7099,0x716E,0x7386,0x74F7,0x75B5,/* 0xB0-0xB7 */ + 0x78C1,0x7D2B,0x8005,0x81EA,0x8328,0x8517,0x85C9,0x8AEE,/* 0xB8-0xBF */ + 0x8CC7,0x96CC,0x4F5C,0x52FA,0x56BC,0x65AB,0x6628,0x707C,/* 0xC0-0xC7 */ + 0x70B8,0x7235,0x7DBD,0x828D,0x914C,0x96C0,0x9D72,0x5B71,/* 0xC8-0xCF */ + 0x68E7,0x6B98,0x6F7A,0x76DE,0x5C91,0x66AB,0x6F5B,0x7BB4,/* 0xD0-0xD7 */ + 0x7C2A,0x8836,0x96DC,0x4E08,0x4ED7,0x5320,0x5834,0x58BB,/* 0xD8-0xDF */ + 0x58EF,0x596C,0x5C07,0x5E33,0x5E84,0x5F35,0x638C,0x66B2,/* 0xE0-0xE7 */ + 0x6756,0x6A1F,0x6AA3,0x6B0C,0x6F3F,0x7246,0xF9FA,0x7350,/* 0xE8-0xEF */ + 0x748B,0x7AE0,0x7CA7,0x8178,0x81DF,0x81E7,0x838A,0x846C,/* 0xF0-0xF7 */ + 0x8523,0x8594,0x85CF,0x88DD,0x8D13,0x91AC,0x9577,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x969C,0x518D,0x54C9,0x5728,0x5BB0,0x624D,0x6750,/* 0xA0-0xA7 */ + 0x683D,0x6893,0x6E3D,0x6ED3,0x707D,0x7E21,0x88C1,0x8CA1,/* 0xA8-0xAF */ + 0x8F09,0x9F4B,0x9F4E,0x722D,0x7B8F,0x8ACD,0x931A,0x4F47,/* 0xB0-0xB7 */ + 0x4F4E,0x5132,0x5480,0x59D0,0x5E95,0x62B5,0x6775,0x696E,/* 0xB8-0xBF */ + 0x6A17,0x6CAE,0x6E1A,0x72D9,0x732A,0x75BD,0x7BB8,0x7D35,/* 0xC0-0xC7 */ + 0x82E7,0x83F9,0x8457,0x85F7,0x8A5B,0x8CAF,0x8E87,0x9019,/* 0xC8-0xCF */ + 0x90B8,0x96CE,0x9F5F,0x52E3,0x540A,0x5AE1,0x5BC2,0x6458,/* 0xD0-0xD7 */ + 0x6575,0x6EF4,0x72C4,0xF9FB,0x7684,0x7A4D,0x7B1B,0x7C4D,/* 0xD8-0xDF */ + 0x7E3E,0x7FDF,0x837B,0x8B2B,0x8CCA,0x8D64,0x8DE1,0x8E5F,/* 0xE0-0xE7 */ + 0x8FEA,0x8FF9,0x9069,0x93D1,0x4F43,0x4F7A,0x50B3,0x5168,/* 0xE8-0xEF */ + 0x5178,0x524D,0x526A,0x5861,0x587C,0x5960,0x5C08,0x5C55,/* 0xF0-0xF7 */ + 0x5EDB,0x609B,0x6230,0x6813,0x6BBF,0x6C08,0x6FB1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x714E,0x7420,0x7530,0x7538,0x7551,0x7672,0x7B4C,/* 0xA0-0xA7 */ + 0x7B8B,0x7BAD,0x7BC6,0x7E8F,0x8A6E,0x8F3E,0x8F49,0x923F,/* 0xA8-0xAF */ + 0x9293,0x9322,0x942B,0x96FB,0x985A,0x986B,0x991E,0x5207,/* 0xB0-0xB7 */ + 0x622A,0x6298,0x6D59,0x7664,0x7ACA,0x7BC0,0x7D76,0x5360,/* 0xB8-0xBF */ + 0x5CBE,0x5E97,0x6F38,0x70B9,0x7C98,0x9711,0x9B8E,0x9EDE,/* 0xC0-0xC7 */ + 0x63A5,0x647A,0x8776,0x4E01,0x4E95,0x4EAD,0x505C,0x5075,/* 0xC8-0xCF */ + 0x5448,0x59C3,0x5B9A,0x5E40,0x5EAD,0x5EF7,0x5F81,0x60C5,/* 0xD0-0xD7 */ + 0x633A,0x653F,0x6574,0x65CC,0x6676,0x6678,0x67FE,0x6968,/* 0xD8-0xDF */ + 0x6A89,0x6B63,0x6C40,0x6DC0,0x6DE8,0x6E1F,0x6E5E,0x701E,/* 0xE0-0xE7 */ + 0x70A1,0x738E,0x73FD,0x753A,0x775B,0x7887,0x798E,0x7A0B,/* 0xE8-0xEF */ + 0x7A7D,0x7CBE,0x7D8E,0x8247,0x8A02,0x8AEA,0x8C9E,0x912D,/* 0xF0-0xF7 */ + 
0x914A,0x91D8,0x9266,0x92CC,0x9320,0x9706,0x9756,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x975C,0x9802,0x9F0E,0x5236,0x5291,0x557C,0x5824,/* 0xA0-0xA7 */ + 0x5E1D,0x5F1F,0x608C,0x63D0,0x68AF,0x6FDF,0x796D,0x7B2C,/* 0xA8-0xAF */ + 0x81CD,0x85BA,0x88FD,0x8AF8,0x8E44,0x918D,0x9664,0x969B,/* 0xB0-0xB7 */ + 0x973D,0x984C,0x9F4A,0x4FCE,0x5146,0x51CB,0x52A9,0x5632,/* 0xB8-0xBF */ + 0x5F14,0x5F6B,0x63AA,0x64CD,0x65E9,0x6641,0x66FA,0x66F9,/* 0xC0-0xC7 */ + 0x671D,0x689D,0x68D7,0x69FD,0x6F15,0x6F6E,0x7167,0x71E5,/* 0xC8-0xCF */ + 0x722A,0x74AA,0x773A,0x7956,0x795A,0x79DF,0x7A20,0x7A95,/* 0xD0-0xD7 */ + 0x7C97,0x7CDF,0x7D44,0x7E70,0x8087,0x85FB,0x86A4,0x8A54,/* 0xD8-0xDF */ + 0x8ABF,0x8D99,0x8E81,0x9020,0x906D,0x91E3,0x963B,0x96D5,/* 0xE0-0xE7 */ + 0x9CE5,0x65CF,0x7C07,0x8DB3,0x93C3,0x5B58,0x5C0A,0x5352,/* 0xE8-0xEF */ + 0x62D9,0x731D,0x5027,0x5B97,0x5F9E,0x60B0,0x616B,0x68D5,/* 0xF0-0xF7 */ + 0x6DD9,0x742E,0x7A2E,0x7D42,0x7D9C,0x7E31,0x816B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8E2A,0x8E35,0x937E,0x9418,0x4F50,0x5750,0x5DE6,/* 0xA0-0xA7 */ + 0x5EA7,0x632B,0x7F6A,0x4E3B,0x4F4F,0x4F8F,0x505A,0x59DD,/* 0xA8-0xAF */ + 0x80C4,0x546A,0x5468,0x55FE,0x594F,0x5B99,0x5DDE,0x5EDA,/* 0xB0-0xB7 */ + 0x665D,0x6731,0x67F1,0x682A,0x6CE8,0x6D32,0x6E4A,0x6F8D,/* 0xB8-0xBF */ + 0x70B7,0x73E0,0x7587,0x7C4C,0x7D02,0x7D2C,0x7DA2,0x821F,/* 0xC0-0xC7 */ + 0x86DB,0x8A3B,0x8A85,0x8D70,0x8E8A,0x8F33,0x9031,0x914E,/* 0xC8-0xCF */ + 0x9152,0x9444,0x99D0,0x7AF9,0x7CA5,0x4FCA,0x5101,0x51C6,/* 0xD0-0xD7 */ + 0x57C8,0x5BEF,0x5CFB,0x6659,0x6A3D,0x6D5A,0x6E96,0x6FEC,/* 0xD8-0xDF */ + 0x710C,0x756F,0x7AE3,0x8822,0x9021,0x9075,0x96CB,0x99FF,/* 0xE0-0xE7 */ + 0x8301,0x4E2D,0x4EF2,0x8846,0x91CD,0x537D,0x6ADB,0x696B,/* 0xE8-0xEF */ + 0x6C41,0x847A,0x589E,0x618E,0x66FE,0x62EF,0x70DD,0x7511,/* 0xF0-0xF7 */ + 0x75C7,0x7E52,0x84B8,0x8B49,0x8D08,0x4E4B,0x53EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x54AB,0x5730,0x5740,0x5FD7,0x6301,0x6307,0x646F,/* 0xA0-0xA7 */ + 0x652F,0x65E8,0x667A,0x679D,0x67B3,0x6B62,0x6C60,0x6C9A,/* 0xA8-0xAF */ + 0x6F2C,0x77E5,0x7825,0x7949,0x7957,0x7D19,0x80A2,0x8102,/* 0xB0-0xB7 */ + 0x81F3,0x829D,0x82B7,0x8718,0x8A8C,0xF9FC,0x8D04,0x8DBE,/* 0xB8-0xBF */ + 0x9072,0x76F4,0x7A19,0x7A37,0x7E54,0x8077,0x5507,0x55D4,/* 0xC0-0xC7 */ + 0x5875,0x632F,0x6422,0x6649,0x664B,0x686D,0x699B,0x6B84,/* 0xC8-0xCF */ + 0x6D25,0x6EB1,0x73CD,0x7468,0x74A1,0x755B,0x75B9,0x76E1,/* 0xD0-0xD7 */ + 0x771E,0x778B,0x79E6,0x7E09,0x7E1D,0x81FB,0x852F,0x8897,/* 0xD8-0xDF */ + 0x8A3A,0x8CD1,0x8EEB,0x8FB0,0x9032,0x93AD,0x9663,0x9673,/* 0xE0-0xE7 */ + 
0x9707,0x4F84,0x53F1,0x59EA,0x5AC9,0x5E19,0x684E,0x74C6,/* 0xE8-0xEF */ + 0x75BE,0x79E9,0x7A92,0x81A3,0x86ED,0x8CEA,0x8DCC,0x8FED,/* 0xF0-0xF7 */ + 0x659F,0x6715,0xF9FD,0x57F7,0x6F57,0x7DDD,0x8F2F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x93F6,0x96C6,0x5FB5,0x61F2,0x6F84,0x4E14,0x4F98,/* 0xA0-0xA7 */ + 0x501F,0x53C9,0x55DF,0x5D6F,0x5DEE,0x6B21,0x6B64,0x78CB,/* 0xA8-0xAF */ + 0x7B9A,0xF9FE,0x8E49,0x8ECA,0x906E,0x6349,0x643E,0x7740,/* 0xB0-0xB7 */ + 0x7A84,0x932F,0x947F,0x9F6A,0x64B0,0x6FAF,0x71E6,0x74A8,/* 0xB8-0xBF */ + 0x74DA,0x7AC4,0x7C12,0x7E82,0x7CB2,0x7E98,0x8B9A,0x8D0A,/* 0xC0-0xC7 */ + 0x947D,0x9910,0x994C,0x5239,0x5BDF,0x64E6,0x672D,0x7D2E,/* 0xC8-0xCF */ + 0x50ED,0x53C3,0x5879,0x6158,0x6159,0x61FA,0x65AC,0x7AD9,/* 0xD0-0xD7 */ + 0x8B92,0x8B96,0x5009,0x5021,0x5275,0x5531,0x5A3C,0x5EE0,/* 0xD8-0xDF */ + 0x5F70,0x6134,0x655E,0x660C,0x6636,0x66A2,0x69CD,0x6EC4,/* 0xE0-0xE7 */ + 0x6F32,0x7316,0x7621,0x7A93,0x8139,0x8259,0x83D6,0x84BC,/* 0xE8-0xEF */ + 0x50B5,0x57F0,0x5BC0,0x5BE8,0x5F69,0x63A1,0x7826,0x7DB5,/* 0xF0-0xF7 */ + 0x83DC,0x8521,0x91C7,0x91F5,0x518A,0x67F5,0x7B56,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8CAC,0x51C4,0x59BB,0x60BD,0x8655,0x501C,0xF9FF,/* 0xA0-0xA7 */ + 0x5254,0x5C3A,0x617D,0x621A,0x62D3,0x64F2,0x65A5,0x6ECC,/* 0xA8-0xAF */ + 0x7620,0x810A,0x8E60,0x965F,0x96BB,0x4EDF,0x5343,0x5598,/* 0xB0-0xB7 */ + 0x5929,0x5DDD,0x64C5,0x6CC9,0x6DFA,0x7394,0x7A7F,0x821B,/* 0xB8-0xBF */ + 0x85A6,0x8CE4,0x8E10,0x9077,0x91E7,0x95E1,0x9621,0x97C6,/* 0xC0-0xC7 */ + 0x51F8,0x54F2,0x5586,0x5FB9,0x64A4,0x6F88,0x7DB4,0x8F1F,/* 0xC8-0xCF */ + 0x8F4D,0x9435,0x50C9,0x5C16,0x6CBE,0x6DFB,0x751B,0x77BB,/* 0xD0-0xD7 */ + 0x7C3D,0x7C64,0x8A79,0x8AC2,0x581E,0x59BE,0x5E16,0x6377,/* 0xD8-0xDF */ + 0x7252,0x758A,0x776B,0x8ADC,0x8CBC,0x8F12,0x5EF3,0x6674,/* 0xE0-0xE7 */ + 0x6DF8,0x807D,0x83C1,0x8ACB,0x9751,0x9BD6,0xFA00,0x5243,/* 0xE8-0xEF */ + 0x66FF,0x6D95,0x6EEF,0x7DE0,0x8AE6,0x902E,0x905E,0x9AD4,/* 0xF0-0xF7 */ + 0x521D,0x527F,0x54E8,0x6194,0x6284,0x62DB,0x68A2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6912,0x695A,0x6A35,0x7092,0x7126,0x785D,0x7901,/* 0xA0-0xA7 */ + 0x790E,0x79D2,0x7A0D,0x8096,0x8278,0x82D5,0x8349,0x8549,/* 0xA8-0xAF */ + 0x8C82,0x8D85,0x9162,0x918B,0x91AE,0x4FC3,0x56D1,0x71ED,/* 0xB0-0xB7 */ + 0x77D7,0x8700,0x89F8,0x5BF8,0x5FD6,0x6751,0x90A8,0x53E2,/* 0xB8-0xBF */ + 0x585A,0x5BF5,0x60A4,0x6181,0x6460,0x7E3D,0x8070,0x8525,/* 0xC0-0xC7 */ + 0x9283,0x64AE,0x50AC,0x5D14,0x6700,0x589C,0x62BD,0x63A8,/* 0xC8-0xCF */ + 0x690E,0x6978,0x6A1E,0x6E6B,0x76BA,0x79CB,0x82BB,0x8429,/* 0xD0-0xD7 */ + 
0x8ACF,0x8DA8,0x8FFD,0x9112,0x914B,0x919C,0x9310,0x9318,/* 0xD8-0xDF */ + 0x939A,0x96DB,0x9A36,0x9C0D,0x4E11,0x755C,0x795D,0x7AFA,/* 0xE0-0xE7 */ + 0x7B51,0x7BC9,0x7E2E,0x84C4,0x8E59,0x8E74,0x8EF8,0x9010,/* 0xE8-0xEF */ + 0x6625,0x693F,0x7443,0x51FA,0x672E,0x9EDC,0x5145,0x5FE0,/* 0xF0-0xF7 */ + 0x6C96,0x87F2,0x885D,0x8877,0x60B4,0x81B5,0x8403,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8D05,0x53D6,0x5439,0x5634,0x5A36,0x5C31,0x708A,/* 0xA0-0xA7 */ + 0x7FE0,0x805A,0x8106,0x81ED,0x8DA3,0x9189,0x9A5F,0x9DF2,/* 0xA8-0xAF */ + 0x5074,0x4EC4,0x53A0,0x60FB,0x6E2C,0x5C64,0x4F88,0x5024,/* 0xB0-0xB7 */ + 0x55E4,0x5CD9,0x5E5F,0x6065,0x6894,0x6CBB,0x6DC4,0x71BE,/* 0xB8-0xBF */ + 0x75D4,0x75F4,0x7661,0x7A1A,0x7A49,0x7DC7,0x7DFB,0x7F6E,/* 0xC0-0xC7 */ + 0x81F4,0x86A9,0x8F1C,0x96C9,0x99B3,0x9F52,0x5247,0x52C5,/* 0xC8-0xCF */ + 0x98ED,0x89AA,0x4E03,0x67D2,0x6F06,0x4FB5,0x5BE2,0x6795,/* 0xD0-0xD7 */ + 0x6C88,0x6D78,0x741B,0x7827,0x91DD,0x937C,0x87C4,0x79E4,/* 0xD8-0xDF */ + 0x7A31,0x5FEB,0x4ED6,0x54A4,0x553E,0x58AE,0x59A5,0x60F0,/* 0xE0-0xE7 */ + 0x6253,0x62D6,0x6736,0x6955,0x8235,0x9640,0x99B1,0x99DD,/* 0xE8-0xEF */ + 0x502C,0x5353,0x5544,0x577C,0xFA01,0x6258,0xFA02,0x64E2,/* 0xF0-0xF7 */ + 0x666B,0x67DD,0x6FC1,0x6FEF,0x7422,0x7438,0x8A17,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9438,0x5451,0x5606,0x5766,0x5F48,0x619A,0x6B4E,/* 0xA0-0xA7 */ + 0x7058,0x70AD,0x7DBB,0x8A95,0x596A,0x812B,0x63A2,0x7708,/* 0xA8-0xAF */ + 0x803D,0x8CAA,0x5854,0x642D,0x69BB,0x5B95,0x5E11,0x6E6F,/* 0xB0-0xB7 */ + 0xFA03,0x8569,0x514C,0x53F0,0x592A,0x6020,0x614B,0x6B86,/* 0xB8-0xBF */ + 0x6C70,0x6CF0,0x7B1E,0x80CE,0x82D4,0x8DC6,0x90B0,0x98B1,/* 0xC0-0xC7 */ + 0xFA04,0x64C7,0x6FA4,0x6491,0x6504,0x514E,0x5410,0x571F,/* 0xC8-0xCF */ + 0x8A0E,0x615F,0x6876,0xFA05,0x75DB,0x7B52,0x7D71,0x901A,/* 0xD0-0xD7 */ + 0x5806,0x69CC,0x817F,0x892A,0x9000,0x9839,0x5078,0x5957,/* 0xD8-0xDF */ + 0x59AC,0x6295,0x900F,0x9B2A,0x615D,0x7279,0x95D6,0x5761,/* 0xE0-0xE7 */ + 0x5A46,0x5DF4,0x628A,0x64AD,0x64FA,0x6777,0x6CE2,0x6D3E,/* 0xE8-0xEF */ + 0x722C,0x7436,0x7834,0x7F77,0x82AD,0x8DDB,0x9817,0x5224,/* 0xF0-0xF7 */ + 0x5742,0x677F,0x7248,0x74E3,0x8CA9,0x8FA6,0x9211,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x962A,0x516B,0x53ED,0x634C,0x4F69,0x5504,0x6096,/* 0xA0-0xA7 */ + 0x6557,0x6C9B,0x6D7F,0x724C,0x72FD,0x7A17,0x8987,0x8C9D,/* 0xA8-0xAF */ + 0x5F6D,0x6F8E,0x70F9,0x81A8,0x610E,0x4FBF,0x504F,0x6241,/* 0xB0-0xB7 */ + 0x7247,0x7BC7,0x7DE8,0x7FE9,0x904D,0x97AD,0x9A19,0x8CB6,/* 0xB8-0xBF */ + 0x576A,0x5E73,0x67B0,0x840D,0x8A55,0x5420,0x5B16,0x5E63,/* 0xC0-0xC7 */ + 
0x5EE2,0x5F0A,0x6583,0x80BA,0x853D,0x9589,0x965B,0x4F48,/* 0xC8-0xCF */ + 0x5305,0x530D,0x530F,0x5486,0x54FA,0x5703,0x5E03,0x6016,/* 0xD0-0xD7 */ + 0x629B,0x62B1,0x6355,0xFA06,0x6CE1,0x6D66,0x75B1,0x7832,/* 0xD8-0xDF */ + 0x80DE,0x812F,0x82DE,0x8461,0x84B2,0x888D,0x8912,0x900B,/* 0xE0-0xE7 */ + 0x92EA,0x98FD,0x9B91,0x5E45,0x66B4,0x66DD,0x7011,0x7206,/* 0xE8-0xEF */ + 0xFA07,0x4FF5,0x527D,0x5F6A,0x6153,0x6753,0x6A19,0x6F02,/* 0xF0-0xF7 */ + 0x74E2,0x7968,0x8868,0x8C79,0x98C7,0x98C4,0x9A43,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x54C1,0x7A1F,0x6953,0x8AF7,0x8C4A,0x98A8,0x99AE,/* 0xA0-0xA7 */ + 0x5F7C,0x62AB,0x75B2,0x76AE,0x88AB,0x907F,0x9642,0x5339,/* 0xA8-0xAF */ + 0x5F3C,0x5FC5,0x6CCC,0x73CC,0x7562,0x758B,0x7B46,0x82FE,/* 0xB0-0xB7 */ + 0x999D,0x4E4F,0x903C,0x4E0B,0x4F55,0x53A6,0x590F,0x5EC8,/* 0xB8-0xBF */ + 0x6630,0x6CB3,0x7455,0x8377,0x8766,0x8CC0,0x9050,0x971E,/* 0xC0-0xC7 */ + 0x9C15,0x58D1,0x5B78,0x8650,0x8B14,0x9DB4,0x5BD2,0x6068,/* 0xC8-0xCF */ + 0x608D,0x65F1,0x6C57,0x6F22,0x6FA3,0x701A,0x7F55,0x7FF0,/* 0xD0-0xD7 */ + 0x9591,0x9592,0x9650,0x97D3,0x5272,0x8F44,0x51FD,0x542B,/* 0xD8-0xDF */ + 0x54B8,0x5563,0x558A,0x6ABB,0x6DB5,0x7DD8,0x8266,0x929C,/* 0xE0-0xE7 */ + 0x9677,0x9E79,0x5408,0x54C8,0x76D2,0x86E4,0x95A4,0x95D4,/* 0xE8-0xEF */ + 0x965C,0x4EA2,0x4F09,0x59EE,0x5AE6,0x5DF7,0x6052,0x6297,/* 0xF0-0xF7 */ + 0x676D,0x6841,0x6C86,0x6E2F,0x7F38,0x809B,0x822A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xFA08,0xFA09,0x9805,0x4EA5,0x5055,0x54B3,0x5793,/* 0xA0-0xA7 */ + 0x595A,0x5B69,0x5BB3,0x61C8,0x6977,0x6D77,0x7023,0x87F9,/* 0xA8-0xAF */ + 0x89E3,0x8A72,0x8AE7,0x9082,0x99ED,0x9AB8,0x52BE,0x6838,/* 0xB0-0xB7 */ + 0x5016,0x5E78,0x674F,0x8347,0x884C,0x4EAB,0x5411,0x56AE,/* 0xB8-0xBF */ + 0x73E6,0x9115,0x97FF,0x9909,0x9957,0x9999,0x5653,0x589F,/* 0xC0-0xC7 */ + 0x865B,0x8A31,0x61B2,0x6AF6,0x737B,0x8ED2,0x6B47,0x96AA,/* 0xC8-0xCF */ + 0x9A57,0x5955,0x7200,0x8D6B,0x9769,0x4FD4,0x5CF4,0x5F26,/* 0xD0-0xD7 */ + 0x61F8,0x665B,0x6CEB,0x70AB,0x7384,0x73B9,0x73FE,0x7729,/* 0xD8-0xDF */ + 0x774D,0x7D43,0x7D62,0x7E23,0x8237,0x8852,0xFA0A,0x8CE2,/* 0xE0-0xE7 */ + 0x9249,0x986F,0x5B51,0x7A74,0x8840,0x9801,0x5ACC,0x4FE0,/* 0xE8-0xEF */ + 0x5354,0x593E,0x5CFD,0x633E,0x6D79,0x72F9,0x8105,0x8107,/* 0xF0-0xF7 */ + 0x83A2,0x92CF,0x9830,0x4EA8,0x5144,0x5211,0x578B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5F62,0x6CC2,0x6ECE,0x7005,0x7050,0x70AF,0x7192,/* 0xA0-0xA7 */ + 0x73E9,0x7469,0x834A,0x87A2,0x8861,0x9008,0x90A2,0x93A3,/* 0xA8-0xAF */ + 0x99A8,0x516E,0x5F57,0x60E0,0x6167,0x66B3,0x8559,0x8E4A,/* 0xB0-0xB7 */ + 
0x91AF,0x978B,0x4E4E,0x4E92,0x547C,0x58D5,0x58FA,0x597D,/* 0xB8-0xBF */ + 0x5CB5,0x5F27,0x6236,0x6248,0x660A,0x6667,0x6BEB,0x6D69,/* 0xC0-0xC7 */ + 0x6DCF,0x6E56,0x6EF8,0x6F94,0x6FE0,0x6FE9,0x705D,0x72D0,/* 0xC8-0xCF */ + 0x7425,0x745A,0x74E0,0x7693,0x795C,0x7CCA,0x7E1E,0x80E1,/* 0xD0-0xD7 */ + 0x82A6,0x846B,0x84BF,0x864E,0x865F,0x8774,0x8B77,0x8C6A,/* 0xD8-0xDF */ + 0x93AC,0x9800,0x9865,0x60D1,0x6216,0x9177,0x5A5A,0x660F,/* 0xE0-0xE7 */ + 0x6DF7,0x6E3E,0x743F,0x9B42,0x5FFD,0x60DA,0x7B0F,0x54C4,/* 0xE8-0xEF */ + 0x5F18,0x6C5E,0x6CD3,0x6D2A,0x70D8,0x7D05,0x8679,0x8A0C,/* 0xF0-0xF7 */ + 0x9D3B,0x5316,0x548C,0x5B05,0x6A3A,0x706B,0x7575,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x798D,0x79BE,0x82B1,0x83EF,0x8A71,0x8B41,0x8CA8,/* 0xA0-0xA7 */ + 0x9774,0xFA0B,0x64F4,0x652B,0x78BA,0x78BB,0x7A6B,0x4E38,/* 0xA8-0xAF */ + 0x559A,0x5950,0x5BA6,0x5E7B,0x60A3,0x63DB,0x6B61,0x6665,/* 0xB0-0xB7 */ + 0x6853,0x6E19,0x7165,0x74B0,0x7D08,0x9084,0x9A69,0x9C25,/* 0xB8-0xBF */ + 0x6D3B,0x6ED1,0x733E,0x8C41,0x95CA,0x51F0,0x5E4C,0x5FA8,/* 0xC0-0xC7 */ + 0x604D,0x60F6,0x6130,0x614C,0x6643,0x6644,0x69A5,0x6CC1,/* 0xC8-0xCF */ + 0x6E5F,0x6EC9,0x6F62,0x714C,0x749C,0x7687,0x7BC1,0x7C27,/* 0xD0-0xD7 */ + 0x8352,0x8757,0x9051,0x968D,0x9EC3,0x532F,0x56DE,0x5EFB,/* 0xD8-0xDF */ + 0x5F8A,0x6062,0x6094,0x61F7,0x6666,0x6703,0x6A9C,0x6DEE,/* 0xE0-0xE7 */ + 0x6FAE,0x7070,0x736A,0x7E6A,0x81BE,0x8334,0x86D4,0x8AA8,/* 0xE8-0xEF */ + 0x8CC4,0x5283,0x7372,0x5B96,0x6A6B,0x9404,0x54EE,0x5686,/* 0xF0-0xF7 */ + 0x5B5D,0x6548,0x6585,0x66C9,0x689F,0x6D8D,0x6DC6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_FD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x723B,0x80B4,0x9175,0x9A4D,0x4FAF,0x5019,0x539A,/* 0xA0-0xA7 */ + 0x540E,0x543C,0x5589,0x55C5,0x5E3F,0x5F8C,0x673D,0x7166,/* 0xA8-0xAF */ + 0x73DD,0x9005,0x52DB,0x52F3,0x5864,0x58CE,0x7104,0x718F,/* 0xB0-0xB7 */ + 0x71FB,0x85B0,0x8A13,0x6688,0x85A8,0x55A7,0x6684,0x714A,/* 0xB8-0xBF */ + 0x8431,0x5349,0x5599,0x6BC1,0x5F59,0x5FBD,0x63EE,0x6689,/* 0xC0-0xC7 */ + 0x7147,0x8AF1,0x8F1D,0x9EBE,0x4F11,0x643A,0x70CB,0x7566,/* 0xC8-0xCF */ + 0x8667,0x6064,0x8B4E,0x9DF8,0x5147,0x51F6,0x5308,0x6D36,/* 0xD0-0xD7 */ + 0x80F8,0x9ED1,0x6615,0x6B23,0x7098,0x75D5,0x5403,0x5C79,/* 0xD8-0xDF */ + 0x7D07,0x8A16,0x6B20,0x6B3D,0x6B46,0x5438,0x6070,0x6D3D,/* 0xE0-0xE7 */ + 0x7FD5,0x8208,0x50D6,0x51DE,0x559C,0x566B,0x56CD,0x59EC,/* 0xE8-0xEF */ + 0x5B09,0x5E0C,0x6199,0x6198,0x6231,0x665E,0x66E6,0x7199,/* 0xF0-0xF7 */ + 0x71B9,0x71BA,0x72A7,0x79A7,0x7A00,0x7FB2,0x8A70,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_81, c2u_82, c2u_83, c2u_84, c2u_85, c2u_86, c2u_87, + c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, + c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, + c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, + c2u_A0, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, + c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, + c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, + c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, + c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, c2u_C7, + c2u_C8, NULL, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, + c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, + c2u_D8, c2u_D9, c2u_DA, c2u_DB, 
c2u_DC, c2u_DD, c2u_DE, c2u_DF, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, + c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, + c2u_F8, c2u_F9, c2u_FA, c2u_FB, c2u_FC, c2u_FD, NULL, NULL, +}; + +static const unsigned char u2c_01[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA9, 0xA2, 0xA9, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA4, 0xA9, 0xA4, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA9, 0xA5, 0xA8, 0xA6, 0xA9, 0xA6, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xA9, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA8, /* 0x3C-0x3F */ + 0xA9, 0xA8, 0xA8, 0xA9, 0xA9, 0xA9, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xA9, 0xB0, 0xA8, 0xAF, 0xA9, 0xAF, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAB, 0xA9, 0xAB, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAE, 0xA9, 0xAE, /* 0x64-0x67 */ +}; + +static const unsigned char u2c_02[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA7, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xA2, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xA2, 0xA8, 0xA2, 0xAB, 0xA2, 0xAA, 0xA2, 0xAD, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA2, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA5, 0xC1, 0xA5, 0xC2, 0xA5, 0xC3, /* 0x90-0x93 */ + 0xA5, 0xC4, 0xA5, 0xC5, 0xA5, 0xC6, 0xA5, 0xC7, /* 0x94-0x97 */ + 0xA5, 0xC8, 0xA5, 0xC9, 0xA5, 0xCA, 0xA5, 0xCB, /* 0x98-0x9B */ + 0xA5, 0xCC, 0xA5, 0xCD, 0xA5, 0xCE, 0xA5, 0xCF, /* 0x9C-0x9F */ + 0xA5, 0xD0, 0xA5, 0xD1, 0x00, 0x00, 0xA5, 0xD2, /* 0xA0-0xA3 */ + 0xA5, 0xD3, 0xA5, 0xD4, 0xA5, 0xD5, 0xA5, 0xD6, /* 0xA4-0xA7 */ + 0xA5, 0xD7, 0xA5, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xA5, 0xE1, 0xA5, 0xE2, 0xA5, 0xE3, /* 0xB0-0xB3 */ + 0xA5, 0xE4, 0xA5, 0xE5, 0xA5, 0xE6, 0xA5, 0xE7, /* 0xB4-0xB7 */ + 0xA5, 0xE8, 0xA5, 0xE9, 0xA5, 0xEA, 0xA5, 0xEB, /* 0xB8-0xBB */ + 0xA5, 0xEC, 0xA5, 0xED, 0xA5, 0xEE, 0xA5, 0xEF, /* 0xBC-0xBF */ + 0xA5, 0xF0, 0xA5, 0xF1, 0x00, 0x00, 0xA5, 0xF2, /* 0xC0-0xC3 */ + 0xA5, 0xF3, 0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6, /* 0xC4-0xC7 */ + 0xA5, 0xF7, 0xA5, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_04[512] = { + 0x00, 0x00, 0xAC, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xAC, 0xA1, 0xAC, 0xA2, 0xAC, 0xA3, 0xAC, 0xA4, /* 0x10-0x13 */ + 0xAC, 0xA5, 0xAC, 0xA6, 0xAC, 0xA8, 0xAC, 0xA9, /* 0x14-0x17 */ + 0xAC, 0xAA, 0xAC, 0xAB, 0xAC, 0xAC, 0xAC, 0xAD, /* 0x18-0x1B */ + 0xAC, 0xAE, 0xAC, 0xAF, 0xAC, 0xB0, 0xAC, 0xB1, /* 0x1C-0x1F */ + 0xAC, 0xB2, 0xAC, 0xB3, 0xAC, 0xB4, 0xAC, 0xB5, /* 0x20-0x23 */ + 0xAC, 0xB6, 0xAC, 0xB7, 0xAC, 0xB8, 0xAC, 0xB9, /* 0x24-0x27 */ + 0xAC, 0xBA, 0xAC, 0xBB, 0xAC, 0xBC, 0xAC, 0xBD, /* 0x28-0x2B */ + 0xAC, 0xBE, 0xAC, 0xBF, 0xAC, 0xC0, 0xAC, 0xC1, /* 0x2C-0x2F */ + 0xAC, 0xD1, 0xAC, 0xD2, 0xAC, 0xD3, 0xAC, 0xD4, /* 0x30-0x33 */ + 0xAC, 0xD5, 0xAC, 0xD6, 0xAC, 0xD8, 0xAC, 0xD9, /* 0x34-0x37 */ + 0xAC, 0xDA, 0xAC, 0xDB, 0xAC, 0xDC, 0xAC, 0xDD, /* 0x38-0x3B */ + 0xAC, 0xDE, 0xAC, 0xDF, 0xAC, 0xE0, 0xAC, 0xE1, /* 0x3C-0x3F */ + 0xAC, 0xE2, 0xAC, 0xE3, 0xAC, 0xE4, 0xAC, 0xE5, /* 0x40-0x43 */ + 0xAC, 0xE6, 0xAC, 0xE7, 0xAC, 0xE8, 0xAC, 0xE9, /* 0x44-0x47 */ + 0xAC, 0xEA, 0xAC, 0xEB, 0xAC, 0xEC, 0xAC, 0xED, /* 0x48-0x4B */ + 0xAC, 0xEE, 0xAC, 0xEF, 0xAC, 0xF0, 0xAC, 0xF1, /* 0x4C-0x4F */ + 0x00, 0x00, 0xAC, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 
0x50-0x53 */ +}; + +static const unsigned char u2c_11[512] = { + 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA4, 0xA4, 0xA7, /* 0x00-0x03 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xB1, 0xA4, 0xB2, /* 0x04-0x07 */ + 0xA4, 0xB3, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x08-0x0B */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x0C-0x0F */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0x10-0x13 */ + 0xA4, 0xD5, 0xA4, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDD, 0x00, 0x00, /* 0x18-0x1B */ + 0xA4, 0xDE, 0xA4, 0xE1, 0xA4, 0xE2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA4, 0xE3, 0xA4, 0xB4, 0xA4, 0xE4, 0xA4, 0xE5, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE6, /* 0x24-0x27 */ + 0x00, 0x00, 0xA4, 0xE7, 0x00, 0x00, 0xA4, 0xE8, /* 0x28-0x2B */ + 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, 0xA4, 0xEC, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xED, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xEE, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB5, 0xA4, 0xB6, /* 0x3C-0x3F */ + 0xA4, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xA4, 0xF2, 0xA4, 0xF3, 0xA4, 0xF0, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xA4, 0xB7, 0x00, 0x00, 0xA4, 0xB8, 0xA4, 0xB9, /* 0x4C-0x4F */ + 0xA4, 0xB8, 0xA4, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xA4, 0xBA, 0xA4, 0xBA, 0x00, 0x00, 0xA4, 0xF4, /* 0x54-0x57 */ + 0xA4, 0xF5, 0xA4, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD4, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA4, 0xBF, 0xA4, 0xC0, 0xA4, 0xC1, /* 0x60-0x63 */ + 0xA4, 0xC2, 0xA4, 0xC3, 0xA4, 0xC4, 0xA4, 0xC5, /* 0x64-0x67 */ + 0xA4, 0xC6, 0xA4, 0xC7, 0xA4, 0xC8, 0xA4, 0xC9, /* 0x68-0x6B */ + 0xA4, 0xCA, 0xA4, 0xCB, 0xA4, 0xCC, 0xA4, 0xCD, /* 0x6C-0x6F */ + 0xA4, 0xCE, 0xA4, 0xCF, 0xA4, 0xD0, 0xA4, 0xD1, /* 0x70-0x73 */ + 0xA4, 0xD2, 0xA4, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xA4, 0xF7, 0xA4, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xA4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA4, 0xFA, 0xA4, 0xFB, 0x00, 0x00, /* 0x90-0x93 */ + 0xA4, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xFD, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, 0xA4, 0xA4, /* 0xA8-0xAB */ + 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, 0xA4, 0xA9, /* 0xAC-0xAF */ + 0xA4, 0xAA, 0xA4, 0xAB, 0xA4, 0xAC, 0xA4, 0xAD, /* 0xB0-0xB3 */ + 0xA4, 0xAE, 0xA4, 0xAF, 0xA4, 0xB0, 0xA4, 0xB1, /* 0xB4-0xB7 */ + 0xA4, 0xB2, 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, /* 0xB8-0xBB */ + 0xA4, 0xB7, 0xA4, 0xB8, 0xA4, 0xBA, 0xA4, 0xBB, /* 0xBC-0xBF */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD6, 0xA4, 0xD7, /* 0xC4-0xC7 */ + 0xA4, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xA4, 0xD9, 0x00, 0x00, 0xA4, 0xDA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDB, /* 
0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xDC, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xA4, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xA4, 0xDE, 0xA4, 0xDF, 0x00, 0x00, 0xA4, 0xE0, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE8, 0xA4, 0xEA, /* 0xE4-0xE7 */ + 0xA4, 0xEC, 0x00, 0x00, 0xA4, 0xED, 0xA4, 0xEF, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA4, 0xB7, 0xA4, 0xF2, 0xA4, 0xF3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xA4, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xA4, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA1, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xAE, 0xA1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xB0, 0xA1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA2, 0xD3, 0xA2, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xA5, 0xA1, 0xA6, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA2, 0xB6, 0x00, 0x00, 0xA1, 0xC7, 0xA1, 0xC8, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD8, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA9, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA9, 0xFA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xA9, 0xFB, 0xA9, 0xFC, 0xA9, 0xFD, /* 0x80-0x83 */ + 0xA9, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 
0xA8-0xAB */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA2, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA4, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xE0, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA2, 0xE5, 0xA2, 0xE2, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xD9, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xCA, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF7, /* 0x50-0x53 */ + 0xA8, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xFB, /* 0x58-0x5B */ + 0xA8, 0xFC, 0xA8, 0xFD, 0xA8, 0xFE, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA5, 0xB0, 0xA5, 0xB1, 0xA5, 0xB2, 0xA5, 0xB3, /* 0x60-0x63 */ + 0xA5, 0xB4, 0xA5, 0xB5, 0xA5, 0xB6, 0xA5, 0xB7, /* 0x64-0x67 */ + 0xA5, 0xB8, 0xA5, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xA5, 0xA1, 0xA5, 0xA2, 0xA5, 0xA3, 0xA5, 0xA4, /* 0x70-0x73 */ + 0xA5, 0xA5, 0xA5, 0xA6, 0xA5, 0xA7, 0xA5, 0xA8, /* 0x74-0x77 */ + 0xA5, 0xA9, 0xA5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA1, 0xE7, 0xA1, 0xE8, 0xA1, 0xE6, 0xA1, 0xE9, /* 0x90-0x93 */ + 0xA1, 0xEA, 0xA2, 0xD5, 0xA2, 0xD8, 0xA2, 0xD6, /* 0x94-0x97 */ + 0xA2, 0xD9, 0xA2, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA1, 0x00, 0x00, /* 
0xD0-0xD3 */ + 0xA2, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_22[512] = { + 0xA2, 0xA3, 0x00, 0x00, 0xA1, 0xD3, 0xA2, 0xA4, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD4, /* 0x04-0x07 */ + 0xA1, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF5, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xB3, /* 0x0C-0x0F */ + 0x00, 0x00, 0xA2, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEE, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xA1, 0xF0, 0xA1, 0xC4, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA1, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xAB, 0x00, 0x00, 0xA1, 0xFC, /* 0x24-0x27 */ + 0xA1, 0xFD, 0xA1, 0xFB, 0xA1, 0xFA, 0xA1, 0xF2, /* 0x28-0x2B */ + 0xA1, 0xF3, 0x00, 0x00, 0xA2, 0xB1, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA1, 0xC5, 0xA1, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA1, 0xAD, 0xA1, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD6, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA1, 0xC1, 0xA1, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA1, 0xC2, 0xA1, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, 0xA1, 0xED, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF9, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF6, 0xA1, 0xF7, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA2, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xA1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD2, 0x00, 0x00, /* 0x10-0x13 */ +}; + +static const unsigned char u2c_24[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA8, 0xE7, 0xA8, 0xE8, 0xA8, 0xE9, 0xA8, 0xEA, /* 0x60-0x63 */ + 0xA8, 0xEB, 0xA8, 0xEC, 0xA8, 0xED, 0xA8, 0xEE, /* 0x64-0x67 */ + 0xA8, 0xEF, 0xA8, 0xF0, 0xA8, 0xF1, 0xA8, 0xF2, /* 0x68-0x6B */ + 0xA8, 0xF3, 0xA8, 0xF4, 0xA8, 0xF5, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA9, 0xE7, 0xA9, 0xE8, 0xA9, 0xE9, 0xA9, 0xEA, /* 0x74-0x77 */ + 0xA9, 0xEB, 0xA9, 0xEC, 0xA9, 0xED, 0xA9, 0xEE, /* 0x78-0x7B */ + 0xA9, 0xEF, 0xA9, 0xF0, 0xA9, 0xF1, 0xA9, 0xF2, /* 0x7C-0x7F */ + + 0xA9, 0xF3, 0xA9, 0xF4, 0xA9, 0xF5, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xA9, 0xCD, 0xA9, 0xCE, 0xA9, 0xCF, 0xA9, 0xD0, /* 0x9C-0x9F */ + 0xA9, 0xD1, 0xA9, 0xD2, 0xA9, 0xD3, 0xA9, 0xD4, /* 0xA0-0xA3 */ + 0xA9, 0xD5, 0xA9, 0xD6, 0xA9, 0xD7, 0xA9, 0xD8, /* 0xA4-0xA7 */ + 0xA9, 0xD9, 0xA9, 0xDA, 0xA9, 0xDB, 0xA9, 0xDC, /* 0xA8-0xAB */ + 0xA9, 0xDD, 0xA9, 0xDE, 0xA9, 0xDF, 0xA9, 0xE0, /* 0xAC-0xAF */ + 0xA9, 0xE1, 0xA9, 0xE2, 0xA9, 0xE3, 0xA9, 0xE4, /* 0xB0-0xB3 */ + 0xA9, 0xE5, 0xA9, 0xE6, 0xA8, 0xCD, 0xA8, 0xCE, /* 0xB4-0xB7 */ + 0xA8, 0xCF, 0xA8, 0xD0, 0xA8, 0xD1, 0xA8, 0xD2, /* 0xB8-0xBB */ + 0xA8, 0xD3, 0xA8, 0xD4, 0xA8, 0xD5, 0xA8, 0xD6, /* 0xBC-0xBF */ + 0xA8, 0xD7, 0xA8, 0xD8, 0xA8, 0xD9, 0xA8, 0xDA, /* 0xC0-0xC3 */ + 0xA8, 0xDB, 0xA8, 0xDC, 0xA8, 0xDD, 0xA8, 0xDE, /* 0xC4-0xC7 */ + 0xA8, 0xDF, 0xA8, 0xE0, 0xA8, 0xE1, 0xA8, 0xE2, /* 0xC8-0xCB */ + 0xA8, 0xE3, 0xA8, 0xE4, 0xA8, 0xE5, 0xA8, 0xE6, /* 0xCC-0xCF */ + 0xA8, 0xCD, 0xA8, 0xCE, 0xA8, 0xCF, 0xA8, 0xD0, /* 0xD0-0xD3 */ + 0xA8, 0xD1, 0xA8, 0xD2, 0xA8, 0xD3, 0xA8, 0xD4, /* 0xD4-0xD7 */ + 0xA8, 0xD5, 0xA8, 0xD6, 0xA8, 0xD7, 0xA8, 0xD8, /* 0xD8-0xDB */ + 0xA8, 0xD9, 0xA8, 0xDA, 0xA8, 0xDB, 0xA8, 0xDC, /* 0xDC-0xDF */ + 0xA8, 0xDD, 0xA8, 0xDE, 0xA8, 0xDF, 0xA8, 0xE0, /* 0xE0-0xE3 */ + 0xA8, 
0xE1, 0xA8, 0xE2, 0xA8, 0xE3, 0xA8, 0xE4, /* 0xE4-0xE7 */ + 0xA8, 0xE5, 0xA8, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_25[512] = { + 0xA6, 0xA1, 0xA6, 0xAC, 0xA6, 0xA2, 0xA6, 0xAD, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xA6, 0xA3, 0xA6, 0xC8, 0xA6, 0xC7, 0xA6, 0xAE, /* 0x0C-0x0F */ + 0xA6, 0xA4, 0xA6, 0xC2, 0xA6, 0xC1, 0xA6, 0xAF, /* 0x10-0x13 */ + 0xA6, 0xA6, 0xA6, 0xC6, 0xA6, 0xC5, 0xA6, 0xB1, /* 0x14-0x17 */ + 0xA6, 0xA5, 0xA6, 0xC4, 0xA6, 0xC3, 0xA6, 0xB0, /* 0x18-0x1B */ + 0xA6, 0xA7, 0xA6, 0xBC, 0xA6, 0xC9, 0xA6, 0xCA, /* 0x1C-0x1F */ + 0xA6, 0xB7, 0xA6, 0xCB, 0xA6, 0xCC, 0xA6, 0xB2, /* 0x20-0x23 */ + 0xA6, 0xA9, 0xA6, 0xBE, 0xA6, 0xCD, 0xA6, 0xCE, /* 0x24-0x27 */ + 0xA6, 0xB9, 0xA6, 0xCF, 0xA6, 0xD0, 0xA6, 0xB4, /* 0x28-0x2B */ + 0xA6, 0xA8, 0xA6, 0xD1, 0xA6, 0xD2, 0xA6, 0xB8, /* 0x2C-0x2F */ + 0xA6, 0xBD, 0xA6, 0xD3, 0xA6, 0xD4, 0xA6, 0xB3, /* 0x30-0x33 */ + 0xA6, 0xAA, 0xA6, 0xD5, 0xA6, 0xD6, 0xA6, 0xBA, /* 0x34-0x37 */ + 0xA6, 0xBF, 0xA6, 0xD7, 0xA6, 0xD8, 0xA6, 0xB5, /* 0x38-0x3B */ + 0xA6, 0xAB, 0xA6, 0xD9, 0xA6, 0xDA, 0xA6, 0xBB, /* 0x3C-0x3F */ + 0xA6, 0xDB, 0xA6, 0xDC, 0xA6, 0xC0, 0xA6, 0xDD, /* 0x40-0x43 */ + 0xA6, 0xDE, 0xA6, 0xDF, 0xA6, 0xE0, 0xA6, 0xE1, /* 0x44-0x47 */ + 0xA6, 0xE2, 0xA6, 0xE3, 0xA6, 0xE4, 0xA6, 0xB6, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xC6, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA1, 0xE1, 0xA1, 0xE0, 0x00, 0x00, 0xA2, 0xC3, /* 0xA0-0xA3 */ + 0xA2, 0xC7, 0xA2, 0xC8, 0xA2, 0xCB, 0xA2, 0xCA, /* 0xA4-0xA7 */ + 0xA2, 0xC9, 0xA2, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE3, 0xA1, 0xE2, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xBA, 0xA2, 0xB9, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xA1, 0xE5, 0xA1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xA2, 0xB8, 0xA2, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDF, 0xA1, 0xDE, /* 0xC4-0xC7 */ + 0xA2, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDB, /* 0xC8-0xCB */ + 0x00, 
0x00, 0x00, 0x00, 0xA1, 0xDD, 0xA1, 0xDC, /* 0xCC-0xCF */ + 0xA2, 0xC4, 0xA2, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xDA, 0xA1, 0xD9, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xCF, 0xA2, 0xCE, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA2, 0xD0, 0x00, 0x00, 0xA2, 0xD1, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA1, 0xCF, 0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xBC, 0xA2, 0xBD, 0x00, 0x00, 0xA2, 0xC0, /* 0x60-0x63 */ + 0xA2, 0xBB, 0xA2, 0xBE, 0x00, 0x00, 0xA2, 0xBF, /* 0x64-0x67 */ + 0xA2, 0xCD, 0xA2, 0xDB, 0xA2, 0xDC, 0x00, 0x00, /* 0x68-0x6B */ + 0xA2, 0xDD, 0xA2, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ +}; + +static const unsigned char u2c_30[512] = { + 0xA1, 0xA1, 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA8, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA1, 0xB4, 0xA1, 0xB5, 0xA1, 0xB6, 0xA1, 0xB7, /* 0x08-0x0B */ + 0xA1, 0xB8, 0xA1, 0xB9, 0xA1, 0xBA, 0xA1, 0xBB, /* 0x0C-0x0F */ + 0xA1, 0xBC, 0xA1, 0xBD, 0x00, 0x00, 0xA1, 0xEB, /* 0x10-0x13 */ + 0xA1, 0xB2, 0xA1, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xAA, 0xA1, 0xAA, 0xA2, 0xAA, 0xA3, /* 0x40-0x43 */ + 0xAA, 0xA4, 0xAA, 0xA5, 0xAA, 0xA6, 0xAA, 0xA7, /* 0x44-0x47 */ + 0xAA, 0xA8, 0xAA, 0xA9, 0xAA, 0xAA, 0xAA, 0xAB, /* 0x48-0x4B */ + 0xAA, 0xAC, 0xAA, 0xAD, 0xAA, 0xAE, 0xAA, 0xAF, /* 0x4C-0x4F */ + 0xAA, 0xB0, 0xAA, 0xB1, 0xAA, 0xB2, 0xAA, 0xB3, /* 0x50-0x53 */ + 0xAA, 0xB4, 0xAA, 0xB5, 0xAA, 0xB6, 0xAA, 0xB7, /* 0x54-0x57 */ + 0xAA, 0xB8, 0xAA, 0xB9, 
0xAA, 0xBA, 0xAA, 0xBB, /* 0x58-0x5B */ + 0xAA, 0xBC, 0xAA, 0xBD, 0xAA, 0xBE, 0xAA, 0xBF, /* 0x5C-0x5F */ + 0xAA, 0xC0, 0xAA, 0xC1, 0xAA, 0xC2, 0xAA, 0xC3, /* 0x60-0x63 */ + 0xAA, 0xC4, 0xAA, 0xC5, 0xAA, 0xC6, 0xAA, 0xC7, /* 0x64-0x67 */ + 0xAA, 0xC8, 0xAA, 0xC9, 0xAA, 0xCA, 0xAA, 0xCB, /* 0x68-0x6B */ + 0xAA, 0xCC, 0xAA, 0xCD, 0xAA, 0xCE, 0xAA, 0xCF, /* 0x6C-0x6F */ + 0xAA, 0xD0, 0xAA, 0xD1, 0xAA, 0xD2, 0xAA, 0xD3, /* 0x70-0x73 */ + 0xAA, 0xD4, 0xAA, 0xD5, 0xAA, 0xD6, 0xAA, 0xD7, /* 0x74-0x77 */ + 0xAA, 0xD8, 0xAA, 0xD9, 0xAA, 0xDA, 0xAA, 0xDB, /* 0x78-0x7B */ + 0xAA, 0xDC, 0xAA, 0xDD, 0xAA, 0xDE, 0xAA, 0xDF, /* 0x7C-0x7F */ + + 0xAA, 0xE0, 0xAA, 0xE1, 0xAA, 0xE2, 0xAA, 0xE3, /* 0x80-0x83 */ + 0xAA, 0xE4, 0xAA, 0xE5, 0xAA, 0xE6, 0xAA, 0xE7, /* 0x84-0x87 */ + 0xAA, 0xE8, 0xAA, 0xE9, 0xAA, 0xEA, 0xAA, 0xEB, /* 0x88-0x8B */ + 0xAA, 0xEC, 0xAA, 0xED, 0xAA, 0xEE, 0xAA, 0xEF, /* 0x8C-0x8F */ + 0xAA, 0xF0, 0xAA, 0xF1, 0xAA, 0xF2, 0xAA, 0xF3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xAB, 0xA1, 0xAB, 0xA2, 0xAB, 0xA3, /* 0xA0-0xA3 */ + 0xAB, 0xA4, 0xAB, 0xA5, 0xAB, 0xA6, 0xAB, 0xA7, /* 0xA4-0xA7 */ + 0xAB, 0xA8, 0xAB, 0xA9, 0xAB, 0xAA, 0xAB, 0xAB, /* 0xA8-0xAB */ + 0xAB, 0xAC, 0xAB, 0xAD, 0xAB, 0xAE, 0xAB, 0xAF, /* 0xAC-0xAF */ + 0xAB, 0xB0, 0xAB, 0xB1, 0xAB, 0xB2, 0xAB, 0xB3, /* 0xB0-0xB3 */ + 0xAB, 0xB4, 0xAB, 0xB5, 0xAB, 0xB6, 0xAB, 0xB7, /* 0xB4-0xB7 */ + 0xAB, 0xB8, 0xAB, 0xB9, 0xAB, 0xBA, 0xAB, 0xBB, /* 0xB8-0xBB */ + 0xAB, 0xBC, 0xAB, 0xBD, 0xAB, 0xBE, 0xAB, 0xBF, /* 0xBC-0xBF */ + 0xAB, 0xC0, 0xAB, 0xC1, 0xAB, 0xC2, 0xAB, 0xC3, /* 0xC0-0xC3 */ + 0xAB, 0xC4, 0xAB, 0xC5, 0xAB, 0xC6, 0xAB, 0xC7, /* 0xC4-0xC7 */ + 0xAB, 0xC8, 0xAB, 0xC9, 0xAB, 0xCA, 0xAB, 0xCB, /* 0xC8-0xCB */ + 0xAB, 0xCC, 0xAB, 0xCD, 0xAB, 0xCE, 0xAB, 0xCF, /* 0xCC-0xCF */ + 0xAB, 0xD0, 0xAB, 0xD1, 0xAB, 0xD2, 0xAB, 0xD3, /* 0xD0-0xD3 */ + 0xAB, 0xD4, 0xAB, 0xD5, 0xAB, 0xD6, 0xAB, 0xD7, /* 0xD4-0xD7 */ + 0xAB, 0xD8, 0xAB, 0xD9, 0xAB, 0xDA, 0xAB, 0xDB, /* 0xD8-0xDB */ + 0xAB, 0xDC, 0xAB, 0xDD, 0xAB, 0xDE, 0xAB, 0xDF, /* 0xDC-0xDF */ + 0xAB, 0xE0, 0xAB, 0xE1, 0xAB, 0xE2, 0xAB, 0xE3, /* 0xE0-0xE3 */ + 0xAB, 0xE4, 0xAB, 0xE5, 0xAB, 0xE6, 0xAB, 0xE7, /* 0xE4-0xE7 */ + 0xAB, 0xE8, 0xAB, 0xE9, 0xAB, 0xEA, 0xAB, 0xEB, /* 0xE8-0xEB */ + 0xAB, 0xEC, 0xAB, 0xED, 0xAB, 0xEE, 0xAB, 0xEF, /* 0xEC-0xEF */ + 0xAB, 0xF0, 0xAB, 0xF1, 0xAB, 0xF2, 0xAB, 0xF3, /* 0xF0-0xF3 */ + 0xAB, 0xF4, 0xAB, 0xF5, 0xAB, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_31[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0x30-0x33 */ + 0xA4, 0xA4, 0xA4, 0xA5, 
0xA4, 0xA6, 0xA4, 0xA7, /* 0x34-0x37 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0x38-0x3B */ + 0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0x3C-0x3F */ + 0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0x40-0x43 */ + 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x44-0x47 */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x48-0x4B */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0xA4, 0xBF, /* 0x4C-0x4F */ + 0xA4, 0xC0, 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, /* 0x50-0x53 */ + 0xA4, 0xC4, 0xA4, 0xC5, 0xA4, 0xC6, 0xA4, 0xC7, /* 0x54-0x57 */ + 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, 0xA4, 0xCB, /* 0x58-0x5B */ + 0xA4, 0xCC, 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, /* 0x5C-0x5F */ + 0xA4, 0xD0, 0xA4, 0xD1, 0xA4, 0xD2, 0xA4, 0xD3, /* 0x60-0x63 */ + 0xA4, 0xD4, 0xA4, 0xD5, 0xA4, 0xD6, 0xA4, 0xD7, /* 0x64-0x67 */ + 0xA4, 0xD8, 0xA4, 0xD9, 0xA4, 0xDA, 0xA4, 0xDB, /* 0x68-0x6B */ + 0xA4, 0xDC, 0xA4, 0xDD, 0xA4, 0xDE, 0xA4, 0xDF, /* 0x6C-0x6F */ + 0xA4, 0xE0, 0xA4, 0xE1, 0xA4, 0xE2, 0xA4, 0xE3, /* 0x70-0x73 */ + 0xA4, 0xE4, 0xA4, 0xE5, 0xA4, 0xE6, 0xA4, 0xE7, /* 0x74-0x77 */ + 0xA4, 0xE8, 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, /* 0x78-0x7B */ + 0xA4, 0xEC, 0xA4, 0xED, 0xA4, 0xEE, 0xA4, 0xEF, /* 0x7C-0x7F */ + + 0xA4, 0xF0, 0xA4, 0xF1, 0xA4, 0xF2, 0xA4, 0xF3, /* 0x80-0x83 */ + 0xA4, 0xF4, 0xA4, 0xF5, 0xA4, 0xF6, 0xA4, 0xF7, /* 0x84-0x87 */ + 0xA4, 0xF8, 0xA4, 0xF9, 0xA4, 0xFA, 0xA4, 0xFB, /* 0x88-0x8B */ + 0xA4, 0xFC, 0xA4, 0xFD, 0xA4, 0xFE, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE9, 0xEC, 0xA3, /* 0x90-0x93 */ + 0xDF, 0xB2, 0xDE, 0xCC, 0xDF, 0xBE, 0xF1, 0xE9, /* 0x94-0x97 */ + 0xF9, 0xBB, 0xCB, 0xA3, 0xEB, 0xE0, 0xDC, 0xB0, /* 0x98-0x9B */ + 0xEF, 0xCB, 0xF4, 0xB8, 0xF2, 0xA2, 0xEC, 0xD1, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_32[512] = { + 0xA9, 0xB1, 0xA9, 0xB2, 0xA9, 0xB3, 0xA9, 0xB4, /* 0x00-0x03 */ + 0xA9, 0xB5, 0xA9, 0xB6, 0xA9, 0xB7, 0xA9, 0xB8, /* 0x04-0x07 */ + 0xA9, 0xB9, 0xA9, 0xBA, 0xA9, 0xBB, 0xA9, 0xBC, /* 0x08-0x0B */ + 0xA9, 0xBD, 0xA9, 0xBE, 0xA9, 0xBF, 0xA9, 0xC0, /* 0x0C-0x0F */ + 0xA9, 0xC1, 0xA9, 0xC2, 0xA9, 0xC3, 0xA9, 0xC4, /* 0x10-0x13 */ + 0xA9, 0xC5, 0xA9, 0xC6, 0xA9, 0xC7, 0xA9, 0xC8, /* 0x14-0x17 */ + 0xA9, 0xC9, 0xA9, 0xCA, 0xA9, 0xCB, 0xA9, 0xCC, /* 0x18-0x1B */ + 0xA2, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xEC, 0xE9, 0xEC, 0xA3, 0xDF, 0xB2, 0xDE, 0xCC, /* 0x20-0x23 */ + 0xE7, 0xE9, 0xD7, 0xBF, 0xF6, 0xD2, 0xF8, 0xA2, /* 0x24-0x27 */ + 0xCE, 0xFA, 0xE4, 0xA8, 0xEA, 0xC5, 0xFB, 0xFD, /* 0x28-0x2B */ + 0xE2, 0xA9, 0xD9, 0xCA, 0xD1, 0xD1, 0xF7, 0xCF, /* 0x2C-0x2F */ + 0xEC, 0xED, 0xF1, 0xBB, 0xEA, 0xF3, 0xDE, 0xE4, /* 0x30-0x33 */ + 0xD9, 0xA3, 0xF7, 0xE5, 0xEE, 0xAF, 0xF5, 0xE6, /* 0x34-0x37 */ + 0xD6, 0xCC, 0xD3, 0xDB, 0xFB, 0xBC, 0xF9, 0xCA, /* 0x38-0x3B */ + 0xCA, 0xF8, 0xD0, 0xEA, 0xED, 0xC0, 0xFA, 0xF0, /* 0x3C-0x3F */ + 0xF0, 0xAE, 0xFD, 0xCC, 0xED, 0xBB, 0xF2, 0xB8, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA8, 0xB1, 0xA8, 0xB2, 0xA8, 0xB3, 0xA8, 0xB4, /* 0x60-0x63 */ + 0xA8, 0xB5, 0xA8, 0xB6, 0xA8, 0xB7, 0xA8, 0xB8, /* 0x64-0x67 */ + 0xA8, 0xB9, 0xA8, 0xBA, 
0xA8, 0xBB, 0xA8, 0xBC, /* 0x68-0x6B */ + 0xA8, 0xBD, 0xA8, 0xBE, 0xA8, 0xBF, 0xA8, 0xC0, /* 0x6C-0x6F */ + 0xA8, 0xC1, 0xA8, 0xC2, 0xA8, 0xC3, 0xA8, 0xC4, /* 0x70-0x73 */ + 0xA8, 0xC5, 0xA8, 0xC6, 0xA8, 0xC7, 0xA8, 0xC8, /* 0x74-0x77 */ + 0xA8, 0xC9, 0xA8, 0xCA, 0xA8, 0xCB, 0xA8, 0xCC, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0xDE, /* 0x7C-0x7F */ + + 0xEC, 0xE9, 0xEC, 0xA3, 0xDF, 0xB2, 0xDE, 0xCC, /* 0x80-0x83 */ + 0xE7, 0xE9, 0xD7, 0xBF, 0xF6, 0xD2, 0xF8, 0xA2, /* 0x84-0x87 */ + 0xCE, 0xFA, 0xE4, 0xA8, 0xEA, 0xC5, 0xFB, 0xFD, /* 0x88-0x8B */ + 0xE2, 0xA9, 0xD9, 0xCA, 0xD1, 0xD1, 0xF7, 0xCF, /* 0x8C-0x8F */ + 0xEC, 0xED, 0xF1, 0xBB, 0xEA, 0xF3, 0xDE, 0xE4, /* 0x90-0x93 */ + 0xD9, 0xA3, 0xF7, 0xE5, 0xEE, 0xAF, 0xF5, 0xE6, /* 0x94-0x97 */ + 0xD6, 0xCC, 0xDD, 0xFA, 0xD1, 0xFB, 0xD2, 0xB3, /* 0x98-0x9B */ + 0xEE, 0xEA, 0xE9, 0xD0, 0xEC, 0xD4, 0xF1, 0xBC, /* 0x9C-0x9F */ + 0xFA, 0xA3, 0xFD, 0xCC, 0xDE, 0xD0, 0xEF, 0xE1, /* 0xA0-0xA3 */ + 0xDF, 0xBE, 0xF1, 0xE9, 0xF9, 0xBB, 0xF1, 0xA7, /* 0xA4-0xA7 */ + 0xE9, 0xD3, 0xEC, 0xA2, 0xF0, 0xF3, 0xF9, 0xCA, /* 0xA8-0xAB */ + 0xCA, 0xF8, 0xD0, 0xEA, 0xED, 0xC0, 0xFA, 0xF0, /* 0xAC-0xAF */ + 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xA7, 0xC9, 0xA7, 0xCA, 0xA7, 0xCB, 0xA7, 0xCC, /* 0x80-0x83 */ + 0xA7, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xA7, 0xBA, 0xA7, 0xBB, 
0xA7, 0xDC, 0xA7, 0xDD, /* 0x88-0x8B */ + 0xA7, 0xDE, 0xA7, 0xB6, 0xA7, 0xB7, 0xA7, 0xB8, /* 0x8C-0x8F */ + 0xA7, 0xD4, 0xA7, 0xD5, 0xA7, 0xD6, 0xA7, 0xD7, /* 0x90-0x93 */ + 0xA7, 0xD8, 0xA7, 0xA1, 0xA7, 0xA2, 0xA7, 0xA3, /* 0x94-0x97 */ + 0xA7, 0xA5, 0xA7, 0xAB, 0xA7, 0xAC, 0xA7, 0xAD, /* 0x98-0x9B */ + 0xA7, 0xAE, 0xA7, 0xAF, 0xA7, 0xB0, 0xA7, 0xB1, /* 0x9C-0x9F */ + 0xA7, 0xB2, 0xA7, 0xB3, 0xA7, 0xB4, 0xA7, 0xA7, /* 0xA0-0xA3 */ + 0xA7, 0xA8, 0xA7, 0xA9, 0xA7, 0xAA, 0xA7, 0xBD, /* 0xA4-0xA7 */ + 0xA7, 0xBE, 0xA7, 0xE5, 0xA7, 0xE6, 0xA7, 0xE7, /* 0xA8-0xAB */ + 0xA7, 0xE8, 0xA7, 0xE1, 0xA7, 0xE2, 0xA7, 0xE3, /* 0xAC-0xAF */ + 0xA7, 0xBF, 0xA7, 0xC0, 0xA7, 0xC1, 0xA7, 0xC2, /* 0xB0-0xB3 */ + 0xA7, 0xC3, 0xA7, 0xC4, 0xA7, 0xC5, 0xA7, 0xC6, /* 0xB4-0xB7 */ + 0xA7, 0xC7, 0xA7, 0xC8, 0xA7, 0xCE, 0xA7, 0xCF, /* 0xB8-0xBB */ + 0xA7, 0xD0, 0xA7, 0xD1, 0xA7, 0xD2, 0xA7, 0xD3, /* 0xBC-0xBF */ + 0xA7, 0xDA, 0xA7, 0xDB, 0xA2, 0xE3, 0xA7, 0xEC, /* 0xC0-0xC3 */ + 0xA7, 0xA6, 0xA7, 0xE0, 0xA7, 0xEF, 0xA2, 0xE1, /* 0xC4-0xC7 */ + 0xA7, 0xBC, 0xA7, 0xED, 0xA7, 0xB5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xB9, /* 0xCC-0xCF */ + 0xA7, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xEB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xDF, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xA2, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xE4, /* 0xD8-0xDB */ + 0xA7, 0xEE, 0xA7, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ +}; + +static const unsigned char u2c_4E[512] = { + 0xEC, 0xE9, 0xEF, 0xCB, 0x00, 0x00, 0xF6, 0xD2, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB2, /* 0x04-0x07 */ + 0xED, 0xDB, 0xDF, 0xB2, 0xDF, 0xBE, 0xF9, 0xBB, /* 0x08-0x0B */ + 0x00, 0x00, 0xDC, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF3, 0xA6, 0xDD, 0xE0, 0xE1, 0xA6, 0x00, 0x00, /* 0x14-0x17 */ + 0xCE, 0xF8, 0xDC, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xF1, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xFA, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xFC, 0xAF, 0xD3, 0xA1, 0x00, 0x00, 0xF1, 0xAB, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD1, 0xD2, 0xAC, /* 0x40-0x43 */ + 0x00, 0x00, 0xCE, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xFD, /* 0x48-0x4B */ + 0x00, 0x00, 0xDE, 0xBF, 0xFB, 0xBA, 0xF9, 0xB9, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD2, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xAB, 0xEB, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xCE, 0xFA, 0xCB, 0xF7, 0xE5, 0xA5, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE1, /* 0x68-0x6B */ + 0x00, 0x00, 0xD4, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE1, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE3, 0xDF, 0xAD, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 
0xEB, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xAF, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF5, 0x00, 0x00, /* 0x84-0x87 */ + 0xE5, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC0, /* 0x88-0x8B */ + 0xEC, 0xA3, 0x00, 0x00, 0xE9, 0xCD, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEA, 0xA7, 0xE9, 0xF6, 0xFB, 0xBB, 0x00, 0x00, /* 0x90-0x93 */ + 0xE7, 0xE9, 0xEF, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xD0, 0xE6, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC1, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAC, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD8, 0xCC, 0xF9, 0xF1, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xCE, 0xDF, 0xFA, 0xA4, 0xE6, 0xB2, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFA, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xBD, /* 0xA8-0xAB */ + 0xCC, 0xC8, 0xEF, 0xCD, 0xD5, 0xD5, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA2, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD1, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE4, 0xA7, 0xEC, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF6, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xFB, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xD1, 0xCB, 0xBF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xED, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xED, 0xA8, 0xDE, 0xC2, 0xF6, 0xE2, 0xED, 0xDC, /* 0xD4-0xD7 */ + 0xDC, 0xF5, 0xE0, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xD4, 0xCE, 0x00, 0x00, 0xF4, 0xB5, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDB, /* 0xE0-0xE3 */ + 0xD6, 0xB5, 0xEC, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE4, 0xE6, 0x00, 0x00, 0xF1, 0xEA, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xEC, 0xCB, 0xC0, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF2, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_4F[512] = { + 0x00, 0x00, 0xD0, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF9, 0xF2, 0xEC, 0xA5, 0xD0, 0xDF, /* 0x08-0x0B */ + 0x00, 0x00, 0xE7, 0xEA, 0xD0, 0xEB, 0xDC, 0xD1, /* 0x0C-0x0F */ + 0xDB, 0xE9, 0xFD, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xD7, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xDA, 0xE1, 0x00, 0x00, 0xD6, 0xB6, 0x00, 0x00, /* 0x34-0x37 */ + 0xE3, 0xDF, 0x00, 0x00, 0xDE, 0xC3, 0x00, 0x00, /* 0x38-0x3B */ + 0xDE, 0xC4, 0xCA, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xEC, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA3, 0xEE, 0xB7, /* 0x44-0x47 */ + 0xF8, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEA, 0xC8, 0xEE, 0xB8, 0xF1, 0xAC, /* 0x4C-0x4F */ + 0xF1, 0xA5, 0xE9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF9, 0xBC, 0x00, 
0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0xF9, 0xEC, 0xEA, 0xDD, 0xD6, /* 0x58-0x5B */ + 0xED, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xF8, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xBA, /* 0x6C-0x6F */ + 0xDB, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA2, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xCD, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xED, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xEB, 0xDE, 0xC5, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE3, 0xE0, 0x00, 0x00, 0xCA, 0xC9, /* 0x80-0x83 */ + 0xF2, 0xE9, 0x00, 0x00, 0xD5, 0xCE, 0x00, 0x00, /* 0x84-0x87 */ + 0xF6, 0xB6, 0x00, 0x00, 0xCE, 0xC2, 0xD6, 0xC7, /* 0x88-0x8B */ + 0x00, 0x00, 0xE3, 0xB4, 0x00, 0x00, 0xF1, 0xAD, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEA, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xC2, 0x00, 0x00, /* 0x94-0x97 */ + 0xF3, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xEA, /* 0x98-0x9B */ + 0x00, 0x00, 0xEB, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xB2, 0xFD, 0xA5, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF6, 0xD5, 0xD5, 0xE2, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xB5, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF5, 0xF5, 0xB5, /* 0xC0-0xC3 */ + 0xE4, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE7, 0xEB, 0xF1, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xBB, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE9, 0xB5, 0x00, 0x00, 0xCC, 0xC9, /* 0xD0-0xD3 */ + 0xFA, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD4, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xD6, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xDC, 0xC1, 0x00, 0x00, 0xDE, 0xC6, /* 0xDC-0xDF */ + 0xFA, 0xEF, 0xE3, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF3, 0xDC, 0xF6, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCE, 0xFC, 0x00, 0x00, 0xDB, 0xC4, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF8, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xDC, 0xE4, 0x00, 0x00, 0xE5, 0xEF, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_50[512] = { + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB1, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xD6, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF3, 0xDA, 0x00, 0x00, 0xCB, 0xC1, /* 0x08-0x0B */ + 0x00, 0x00, 0xDB, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD9, 0xFA, 0xD3, 0xEE, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB8, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xFD, 0xA6, 0xEB, 0xEF, 0x00, 0x00, /* 0x18-0x1B */ + 0xF4, 0xA6, 0x00, 0x00, 0xCC, 0xCA, 0xF3, 0xA8, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF3, 0xDB, 0x00, 0x00, 0xDB, 0xA7, /* 0x20-0x23 */ + 0xF6, 0xB7, 0x00, 0x00, 0xCF, 0xE6, 0xF0, 0xF2, /* 0x24-0x27 */ + 0xCB, 0xDA, 0x00, 0x00, 0xE7, 0xD2, 0xD7, 0xC3, /* 0x28-0x2B */ + 0xF6, 0xF0, 0xE8, 0xDE, 0x00, 
0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE7, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA3, /* 0x44-0x47 */ + 0xCC, 0xA7, 0xEA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xB6, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xFA, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xAE, 0x00, 0x00, /* 0x58-0x5B */ + 0xEF, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xCB, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xF6, 0xB0, 0xEF, 0xCF, 0xE9, 0xCF, 0x00, 0x00, /* 0x74-0x77 */ + 0xF7, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCE, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xDC, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xDB, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xCB, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xDF, 0xA1, 0xDD, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xF5, 0xCA, 0xE9, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xEC, 0xEE, 0xEE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF3, 0xF0, 0x00, 0x00, 0xDF, 0xBF, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD0, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF4, 0xD2, 0xE0, 0xBA, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC0, /* 0xCC-0xCF */ + 0x00, 0x00, 0xCE, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDC, 0xD2, 0xFD, 0xEA, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF6, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xCA, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE8, 0xE9, 0x00, 0x00, 0xE3, 0xAC, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF3, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCA, 0xA4, 0x00, 0x00, 0xDB, 0xF8, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xC7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_51[512] = { + 0xEB, 0xF0, 0xF1, 0xD6, 0x00, 
0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xE2, 0x00, 0x00, 0xCC, 0xCC, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCB, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE3, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC1, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD6, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xD0, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB9, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xD3, 0x00, 0x00, /* 0x38-0x3B */ + 0xE5, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE8, 0xB4, 0xEB, 0xC3, 0x00, 0x00, 0xEA, 0xAA, /* 0x40-0x43 */ + 0xFA, 0xFC, 0xF5, 0xF6, 0xF0, 0xBC, 0xFD, 0xD4, /* 0x44-0x47 */ + 0xE0, 0xBB, 0xCE, 0xC3, 0x00, 0x00, 0xD0, 0xBA, /* 0x48-0x4B */ + 0xF7, 0xBA, 0xD8, 0xF3, 0xF7, 0xCD, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAE, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xD4, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE7, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xEC, 0xFD, 0x00, 0x00, 0xD2, 0xAE, /* 0x64-0x67 */ + 0xEE, 0xEF, 0xD5, 0xD7, 0xEA, 0xE4, 0xF8, 0xA2, /* 0x68-0x6B */ + 0xCD, 0xEB, 0xD7, 0xBF, 0xFB, 0xB1, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xCD, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xDC, 0xB2, 0xD0, 0xEC, 0xCE, 0xFD, /* 0x74-0x77 */ + 0xEE, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xCC, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD0, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF7, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFC, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xEE, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xB3, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xD8, 0xF4, 0x00, 0x00, 0xE9, 0xB7, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD9, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xF1, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD4, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA7, 0xD5, 0xD2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD6, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF4, 0xA2, 0x00, 0x00, 0xF1, 0xD7, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD5, 0xD8, 0x00, 0x00, 0xF0, 0xBD, /* 0xC8-0xCB */ + 0xD7, 0xD0, 0xD4, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD7, 0xCF, 0xEB, 0xEA, 0xFD, 0xEB, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xDB, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xFC, 0xC5, 0xCB, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD5, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF4, 0xC8, 0xE8, 0xEA, 0xF5, 0xF3, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF9, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0xD3, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD3, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xC2, 0xEF, 0xB7, /* 0x04-0x07 */ + 0xE7, 0xD4, 0x00, 0x00, 0xCA, 0xCA, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFB, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xFA, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xAA, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xF4, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xF7, 0xF7, 0xDC, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xD7, 0xD7, 0xDF, 0xA2, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xBE, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD3, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA4, 0xE1, 0xEC, /* 0x34-0x37 */ + 0xCF, 0xE7, 0xF3, 0xCB, 0xED, 0xA9, 0xCA, 0xBE, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xEF, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCE, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xFB, 0xD0, 0xBB, /* 0x48-0x4B */ + 0xD5, 0xB7, 0xEE, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xF4, 0xA8, 0x00, 0x00, 0xDC, 0xF8, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA7, /* 0x58-0x5B */ + 0x00, 0x00, 0xDA, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE0, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xED, 0xA5, 0xEE, 0xF2, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF9, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xDC, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF3, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF8, 0xF2, 0x00, 0x00, 0xF4, 0xF9, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF1, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xBC, /* 0x84-0x87 */ + 0xDB, 0xF9, 0xD7, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xCB, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF0, 0xA5, 0xCB, 0xFD, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF4, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xED, /* 0x9C-0x9F */ + 0xCA, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xAB, /* 0xA0-0xA3 */ + 0xD0, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xF0, 0xBE, 0xD2, 0xBD, 0xCC, 0xA4, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCC, 0xCD, 0x00, 0x00, 0xDA, 0xFA, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF6, 0xCF, 0x00, 0x00, 0xE9, 0xB8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD8, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCC, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xCD, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xD4, 0xD1, 0xE9, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCA, 0xEB, 0xD9, 0xE2, 0x00, 0x00, 0xFD, 0xB2, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE3, 0xAD, 0xD6, 0xCC, 0xD9, 0xB4, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA7, 0xEE, 0xD3, /* 0xE0-0xE3 */ + 0xD0, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB3, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xD5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xCF, 0xE8, 0x00, 0x00, 0xED, 0xC3, 0xD0, 0xB2, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xFE, 0xDA, 0xA8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xF8, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xFD, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xF8, 0xD1, 0x00, 0x00, 0xF8, 0xD2, /* 0x0C-0x0F */ + 0xDC, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xDD, 0xE2, 0xFB, 0xF9, 0xDD, 0xC1, /* 0x14-0x17 */ + 0x00, 0x00, 0xE3, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xED, 0xDD, 0xCE, 0xC4, 0x00, 0x00, 0xCB, 0xA1, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xDD, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xF9, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xFB, /* 0x3C-0x3F */ + 0xCF, 0xA1, 0xE4, 0xA8, 0x00, 0x00, 0xF4, 0xB6, /* 0x40-0x43 */ + 0xEC, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAE, /* 0x44-0x47 */ + 0xE7, 0xED, 0xFD, 0xC1, 0xDA, 0xE2, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xD8, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xDD, 0xE4, 0xF0, 0xEF, 0xF6, 0xF1, /* 0x50-0x53 */ + 0xFA, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF5, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xCF, 0x00, 0x00, /* 0x58-0x5B */ + 0xDC, 0xD4, 0x00, 0x00, 0xDC, 0xA6, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEF, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCF, 0x00, 0x00, /* 0x64-0x67 */ + 0xE0, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD6, /* 0x6C-0x6F */ + 0xEC, 0xD4, 0xEA, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xCA, 0xBF, 0xD5, 0xB0, 0x00, 0x00, 0xCF, 0xE9, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF1, 0xED, 0x00, 0x00, 0xCC, 0xCF, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE4, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xED, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xD7, 0xD8, 0x00, 0x00, 0xFD, 0xA7, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xAB, /* 0x9C-0x9F */ + 0xF6, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCF, 0xF0, 0xF9, 0xBD, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE6, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xDB, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xD1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE9, 0xD1, 0xF3, 0xA9, 0xD0, 0xE0, 0xE9, 0xD2, /* 0xC8-0xCB */ + 0x00, 0x00, 0xDA, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE2, 0xD2, 0x00, 0x00, 0xF6, 0xA2, 0xE1, 0xF4, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xE4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE7, 0xD5, 0xF5, 0xBF, 0xCF, 0xA2, /* 0xE0-0xE3 */ + 0xCD, 0xAF, 0xCF, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCD, 0xB0, 0xF1, 0xFE, 0xD0, 0xA3, /* 0xE8-0xEB */ + 0xE1, 0xAF, 0xF8, 0xA3, 0x00, 0x00, 0xCA, 0xA6, /* 0xEC-0xEF */ + 0xF7, 0xBB, 0xF2, 0xEA, 0xDE, 0xC8, 0xE9, 0xD3, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xDE, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_54[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xDE, /* 0x00-0x03 */ + 0xCA, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xF9, 0xEA, 0xD1, 0xCE, 0xEE, 0xD4, 0x00, 0x00, /* 0x08-0x0B */ + 0xD4, 0xD2, 0xD9, 0xA3, 0xFD, 0xA8, 0xD7, 0xD9, /* 0x0C-0x0F */ + 0xF7, 0xCE, 0xFA, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD6, /* 0x18-0x1B */ + 0x00, 0x00, 0xD7, 0xF0, 0x00, 0x00, 0xEB, 0xE1, /* 0x1C-0x1F */ + 0xF8, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xFA, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xDD, 0xC3, 0x00, 0x00, 0xF9, 0xDF, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xEF, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xFD, 0xE5, 0xF6, 0xA3, 0x00, 0x00, 0xD9, 0xFC, /* 0x38-0x3B */ + 0xFD, 0xA9, 0x00, 0x00, 0xE7, 0xEE, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE5, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xEF, 0xD0, 0x00, 0x00, 0xCD, 0xB1, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xF7, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF1, 0xB2, 0x00, 0x00, 0xF1, 0xB1, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xCD, 0xB2, 0x00, 0x00, 0xDA, 0xAB, /* 0x70-0x73 */ + 0x00, 0x00, 0xCA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE2, /* 0x78-0x7B */ + 0xFB, 0xBC, 0xD9, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEE, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD3, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xFB, 0xFA, 0x00, 0x00, 0xCF, 0xA4, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDC, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF6, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xED, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA1, /* 0xA8-0xAB */ + 0xCE, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA6, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF9, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEC, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE4, 0xEE, 0xF9, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xFB, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xF9, 0xEB, 0xEE, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEA, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xCA, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF4, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCD, 0xD6, 0xFC, 0xF6, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD4, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_55[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF8, 0xA6, 0x00, 0x00, 0xDE, 0xCA, 0xF2, 0xC6, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDA, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xD8, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE6, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF3, 0xDD, 
0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE4, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF6, 0xF2, 0x00, 0x00, 0xDF, 0xC2, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFD, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF6, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xBA, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAF, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xE1, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF0, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCB, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE0, 0xBC, 0x00, 0x00, 0xF4, 0xCA, 0xD4, 0xFA, /* 0x84-0x87 */ + 0x00, 0x00, 0xFD, 0xAA, 0xF9, 0xE2, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xF4, 0xB7, 0xFD, 0xC2, 0xFC, 0xB0, 0x00, 0x00, /* 0x98-0x9B */ + 0xFD, 0xEC, 0xCA, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xBD, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xEA, 0xE7, 0xDF, 0xC3, 0xD1, 0xD2, /* 0xA8-0xAB */ + 0xCE, 0xE2, 0x00, 0x00, 0xD3, 0xA4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xFD, 0xAB, 0x00, 0x00, 0xDF, 0xE0, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF2, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF0, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD0, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAA, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xCB, /* 0xE0-0xE3 */ + 0xF6, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE1, 0xF5, 0xF1, 0xB3, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 
0xF7, 0xA3, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCA, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xCF, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC4, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB0, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xBF, 0x00, 0x00, /* 0x30-0x33 */ + 0xF6, 0xA4, 0x00, 0x00, 0xE3, 0xB6, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC6, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xD0, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xED, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xDD, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF7, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE6, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xDE, 0xAD, 0x00, 0x00, 0xFA, 0xBF, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE5, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xED, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xA5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xFD, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF5, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF6, 0xDE, 0xCC, /* 0xD8-0xDB */ + 0x00, 0x00, 
0x00, 0x00, 0xFC, 0xDE, 0x00, 0x00, /* 0xDC-0xDF */ + 0xEC, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xCD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xD6, 0xB7, 0xCD, 0xB3, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_57[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD5, /* 0x00-0x03 */ + 0xE5, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xCF, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD0, /* 0x08-0x0B */ + 0x00, 0x00, 0xEA, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xAE, 0xEA, 0xAD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF1, 0x00, 0x00, /* 0x14-0x17 */ + 0xD3, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xCF, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xEE, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD0, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF2, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF0, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF2, 0xA3, 0x00, 0x00, 0xF7, 0xF8, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB3, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA9, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xD3, 0xBB, 0xCA, 0xEC, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF1, 0xA6, 0xCB, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF7, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xCD, 0xDE, 0x00, 0x00, 0xF7, 0xA4, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xC0, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDD, 0x00, 0x00, /* 0x6C-0x6F */ + 0xCC, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xCF, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF6, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF7, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD3, 0xDC, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xFE, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA7, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEB, 0xD9, 0x00, 0x00, 0xCF, 0xA7, 0xEA, 0xAF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB9, /* 0xC4-0xC7 */ + 0xF1, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD8, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xF2, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB4, /* 0xDC-0xDF */ + 0xDC, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF3, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFB, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDB, 0xC6, 0xD0, 0xF1, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD0, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0xCF, 0xDC, 0x00, 0x00, 0xD3, 0xD1, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xCC, 0xB1, 0xF7, 0xD8, 0x00, 0x00, /* 0x04-0x07 */ + 0xCB, 0xA8, 0xEB, 0xBC, 0xE4, 0xBE, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDC, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xDC, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xF0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC0, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xED, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xEB, /* 0x2C-0x2F */ + 0xE5, 0xE8, 0xDC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xED, 0xDE, 0xD3, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD4, 0xE7, 0xAB, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xC3, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xF7, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF3, /* 0x54-0x57 */ + 0xD3, 0xD2, 0x00, 0x00, 0xF5, 0xC0, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xDD, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xEE, 0xF3, 0xE7, 0xF1, 0x00, 0x00, /* 0x60-0x63 */ + 0xFD, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF2, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF3, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xEE, 0xF4, 0x00, 0x00, 0xE2, 0xD3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD1, /* 0x80-0x83 */ + 0x00, 0x00, 0xDF, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 
0xE9, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD7, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF5, 0xCD, 0x00, 0x00, 0xF1, 0xF2, 0xFA, 0xC7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xD9, 0xF8, 0xD4, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE5, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xC5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF2, 0xED, 0xDF, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDB, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE8, 0xB5, 0x00, 0x00, 0xD3, 0xA6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB5, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF9, 0xC9, 0x00, 0x00, 0xE4, 0xE2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xFB, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD7, 0xA4, 0xCE, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD5, 0xD6, 0xE6, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE5, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xCD, /* 0xE8-0xEB */ + 0xEC, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE0, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEC, 0xEC, 0xFB, 0xBE, 0xDF, 0xEB, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE1, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBE, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xD0, 0xF3, 0xE0, 0xAA, 0xE8, 0xE2, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE2, 0xD4, 0xD2, 0xFD, 0x00, 0x00, /* 0x18-0x1B */ + 0xE5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xDE, /* 0x24-0x27 */ + 0x00, 0x00, 0xF4, 0xB8, 0xF7, 0xBC, 0xDC, 0xFD, /* 0x28-0x2B */ + 0x00, 0x00, 0xE8, 0xEC, 0xE4, 0xE7, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xA8, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE5, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF4, /* 0x44-0x47 */ + 0xD2, 0xAF, 0xDC, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA5, 0xF1, 0xB4, /* 0x4C-0x4F */ + 0xFC, 0xB1, 0xCC, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xDD, 0xC6, 0xFA, 0xD1, 0x00, 0x00, 0xF7, 0xDF, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xA8, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEE, 0xF5, 0x00, 0x00, 0xDE, 0xCE, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF3, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xAC, 0xEB, 0xC4, /* 0x68-0x6B */ + 0xED, 0xE1, 0xE0, 0xAB, 0xDD, 0xC7, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB3, /* 0x70-0x73 */ + 0xD2, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xCA, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xFB, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xFD, 0xDD, 0xE5, /* 0x80-0x83 */ + 0xD8, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF4, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xED, 0xD0, 0xD2, /* 0x94-0x97 */ + 0x00, 0x00, 0xD9, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xF6, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xDB, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xF7, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD8, 0xD9, 0x00, 0x00, 0xF4, 0xA3, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDD, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xD1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xB5, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xED, 0xAB, 0x00, 0x00, 0xE3, 0xB7, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xEE, 0xBB, 0xCD, 0xB4, 0x00, 0x00, 0xE0, 0xF3, /* 0xD0-0xD3 */ + 0xEA, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEC, 0xF5, 0xE8, 0xEE, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCB, 0xA9, 0xF1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCD, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEC, 0xA9, 0x00, 0x00, 0xF2, 0xEB, 0x00, 0x00, /* 0xE8-0xEB */ + 0xFD, 0xEF, 0x00, 0x00, 0xF9, 0xF3, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE6, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xD8, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xAC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0x00, 0x00, 0xEA, 0xCE, 0x00, 0x00, 0xE8, 0xDF, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xDE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xD2, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF4, /* 0x18-0x1B */ + 0xD1, 0xD6, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC2, /* 0x1C-0x1F */ + 0xE3, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE4, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xD8, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xA5, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xF3, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD7, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE8, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE8, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xE6, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE6, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xFE, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xDA, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAC, 0xEA, 0xB0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE3, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCA, 0xAA, 0xE1, 0xF9, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF2, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xFA, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xF4, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xD2, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_5B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xFB, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xFD, 0xF0, 0x00, 0x00, 0xE0, 0xBD, /* 0x08-0x0B */ + 0xCE, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xC6, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAE, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDF, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xBE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xED, 0xAD, 0xFA, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCD, 0xEE, 0xED, 0xA6, 0x00, 0x00, 0xED, 0xAE, /* 0x54-0x57 */ + 0xF0, 0xED, 0x00, 0x00, 0xDD, 0xA1, 0x00, 0x00, /* 0x58-0x5B */ + 0xED, 0xAF, 0xFC, 0xF8, 0x00, 0x00, 0xD8, 0xEB, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF9, /* 0x60-0x63 */ + 0xCD, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFA, 0xA9, 0x00, 0x00, 0xE1, 0xDD, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE2, 0xD5, 0xED, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xDD, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xF9, 0xCA, 0x00, 0x00, 0xEA, 0xE8, 0x00, 0x00, /* 0x78-0x7B */ + 0xE5, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD3, 0xEB, 0x00, 0x00, 0xE9, 0xD4, /* 0x84-0x87 */ + 0xE1, 0xFA, 0xE4, 0xCC, 0x00, 0x00, 0xE1, 0xE4, /* 0x88-0x8B */ + 0xE8, 0xC7, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xDB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xD5, /* 0x90-0x93 */ + 0x00, 0x00, 0xF7, 0xB5, 0xFC, 0xF3, 0xF0, 0xF3, /* 0x94-0x97 */ + 0xCE, 0xAF, 0xF1, 0xB5, 0xEF, 0xD2, 0xE8, 0xC8, /* 0x98-0x9B */ + 0xEB, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD4, 0xE0, 0xBE, /* 0xA0-0xA3 */ + 0xE3, 0xF8, 0xEA, 0xE9, 0xFC, 0xB2, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE0, 0xF4, 0x00, 0x00, 0xCF, 0xE0, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAA, /* 0xB0-0xB3 */ + 0xE6, 0xC3, 0xE1, 0xB2, 0xCA, 0xAB, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE3, 0xE4, 0xE9, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD6, /* 0xBC-0xBF */ + 0xF3, 0xF2, 0x00, 0x00, 0xEE, 0xD6, 0xEA, 0xB2, /* 0xC0-0xC3 */ + 0xD0, 0xF6, 0xEC, 0xD9, 0xDA, 0xCB, 0xCF, 0xA8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xDD, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD8, 0xDB, 0x00, 0x00, 0xF9, 0xCE, 0xE9, 0xD5, /* 0xD0-0xD3 */ + 0xE3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xBC, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xAC, 0xF3, 0xCC, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCD, 0xFB, 0xF6, 0xD6, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE7, 0xF5, 0xE8, 0xEF, 0xE3, 0xF9, 0xD2, 0xBB, /* 0xE4-0xE7 */ + 0xF3, 0xF3, 0xE3, 0xFB, 0x00, 0x00, 0xDE, 0xD0, /* 0xE8-0xEB */ + 0xCE, 0xB0, 0x00, 0x00, 0xD6, 0xF7, 0xF1, 0xD9, /* 0xEC-0xEF */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF5, 0xC1, 0xDC, 0xC4, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xBB, 0x00, 0x00, 0xDE, 0xD1, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_5C[512] = { + 0x00, 0x00, 0xDC, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xDE, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE2, /* 0x04-0x07 */ + 0xEE, 0xF6, 0xEA, 0xCF, 0xF0, 0xEE, 0xE3, 0xFC, /* 0x08-0x0B */ + 0x00, 0x00, 0xD3, 0xDF, 0xD3, 0xF4, 0xE1, 0xB3, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE1, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD3, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDF, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE9, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xDB, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF6, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xE3, 0xB9, 0xEB, 0xC5, 0xF4, 0xA9, 0xCD, 0xB6, /* 0x38-0x3B */ + 0xD2, 0xF9, 0x00, 0x00, 0xDA, 0xAD, 0xD2, 0xE3, /* 0x3C-0x3F */ + 0xCF, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCB, 0xDC, 0xCC, 0xFA, 0x00, 0x00, /* 0x44-0x47 */ + 0xCF, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA9, /* 0x48-0x4B */ + 0x00, 0x00, 0xE3, 0xBB, 0xE3, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xE0, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xEE, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB3, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD3, 0xF5, 0x00, 0x00, 0xD7, 0xA6, 0x00, 0x00, /* 0x60-0x63 */ + 0xF6, 0xB5, 0xD7, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xEA, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDF, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xFD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD0, 0xF7, 0xED, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCB, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE4, 0xDB, 0x00, 0x00, 0xE1, 0xFB, /* 0xA8-0xAB */ + 0xCB, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xD3, 0xE0, 0x00, 0x00, 0xE4, 0xBF, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xFB, 0xC0, 0x00, 0x00, 0xDA, 0xBE, /* 0xB4-0xB7 */ + 0xE4, 0xCD, 0x00, 0x00, 0xD6, 0xB9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xC0, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE1, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF6, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDF, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE4, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE7, /* 0xEC-0xEF */ + 0xDC, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xFA, 0xD6, 0x00, 0x00, 0xD3, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xDA, /* 0xF8-0xFB */ + 0x00, 0x00, 0xFA, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xFD, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xD5, 0xCF, 0xD0, 0xF8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xCD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF5, 0xCB, 0x00, 0x00, 0xE4, 0xF0, 0xCB, 0xAB, /* 0x14-0x17 */ + 0x00, 0x00, 0xD7, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xFE, /* 0x24-0x27 */ + 0x00, 0x00, 0xDD, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAE, /* 0x48-0x4B */ + 0xCA, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD5, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE8, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xA9, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF7, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 
0x00, 0xD4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0xE4, 0x00, 0x00, 0xE8, 0xF2, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xF5, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE7, 0xAE, 0x00, 0x00, 0xD6, 0xBA, 0x00, 0x00, /* 0xB8-0xBB */ + 0xDF, 0xEC, 0xE4, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE8, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xDC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF4, 0xB9, 0xF1, 0xB6, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE2, 0xDE, 0xE1, 0xB5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xCD, 0xEF, 0xF1, 0xA7, 0xCE, 0xE5, /* 0xE4-0xE7 */ + 0xCB, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE3, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAC, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD0, 0xF9, 0xEC, 0xAB, 0xDE, 0xD3, /* 0xF0-0xF3 */ + 0xF7, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xF5, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE1, 0xDE, 0xCB, 0xEE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBC, 0xF8, 0xD6, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xEE, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xFD, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF7, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDE, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF2, 0xED, 0x00, 0x00, 0xDB, 0xD9, /* 0x18-0x1B */ + 0x00, 0x00, 0xF0, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE1, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xD4, /* 0x28-0x2B */ + 0x00, 0x00, 0xE0, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xE3, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE1, 0x00, 0x00, /* 0x34-0x37 */ + 0xDF, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xD9, 0xB6, 0x00, 0x00, 0xFD, 0xAC, /* 0x3C-0x3F */ + 0xEF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE4, 0xC1, 0xF8, 0xEB, 0x00, 0x00, 0xDB, 0xAC, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xFC, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xD8, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xBA, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDB, 0xDF, 0xD3, 0xD3, 0xF8, 0xC7, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 
0x00, 0x00, 0xCA, 0xCE, 0xF8, 0xC1, /* 0x70-0x73 */ + 0xD2, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB4, /* 0x74-0x77 */ + 0xFA, 0xB9, 0xCA, 0xCF, 0x00, 0x00, 0xFC, 0xB3, /* 0x78-0x7B */ + 0xEA, 0xEA, 0xEA, 0xEB, 0xD0, 0xFA, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xED, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE7, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xC9, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xED, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xEE, 0xBC, 0x00, 0x00, 0xEF, 0xC1, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD2, 0x00, 0x00, /* 0x98-0x9B */ + 0xDD, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xDF, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xF8, 0xF1, 0xA8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xB7, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEF, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE4, 0xDD, 0xDF, 0xEE, 0xCB, 0xAC, /* 0xB4-0xB7 */ + 0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xEC, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xCB, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xF9, 0xBF, 0xD6, 0xAF, 0xD5, 0xC6, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xCF, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xB7, 0xEE, 0xF8, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD9, /* 0xDC-0xDF */ + 0xF3, 0xDF, 0x00, 0x00, 0xF8, 0xC8, 0xCE, 0xC6, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD5, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE6, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xC5, 0xEF, 0xD5, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xEF, 0xFC, 0xDF, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_5F[512] = { + 0x00, 0x00, 0xDC, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD6, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xC9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD2, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE3, 0xBD, 0x00, 0x00, 0xCF, 0xE1, /* 0x10-0x13 */ + 0xF0, 0xC0, 0xEC, 0xDA, 0x00, 0x00, 0xDD, 0xD7, /* 0x14-0x17 */ + 0xFB, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xAC, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA9, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xD7, 0xFB, 0xC1, /* 0x24-0x27 */ + 0x00, 0x00, 0xD2, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE5, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xED, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xAD, 0x00, 0x00, /* 0x38-0x3B */ + 0xF9, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xF7, 0xA5, 
0x00, 0x00, 0xCB, 0xAE, 0x00, 0x00, /* 0x48-0x4B */ + 0xDA, 0xAF, 0x00, 0x00, 0xD8, 0xB6, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA7, 0xFB, 0xB2, /* 0x54-0x57 */ + 0x00, 0x00, 0xFD, 0xC4, 0x00, 0x00, 0xEC, 0xAD, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xA1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE9, 0xE9, 0xEE, /* 0x64-0x67 */ + 0x00, 0x00, 0xF3, 0xF4, 0xF8, 0xF3, 0xF0, 0xC1, /* 0x68-0x6B */ + 0xDE, 0xAF, 0xF8, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF3, 0xE0, 0xE7, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAD, /* 0x74-0x77 */ + 0x00, 0x00, 0xE6, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF9, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD8, /* 0x7C-0x7F */ + + 0xE8, 0xD9, 0xEF, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD3, 0xE2, 0x00, 0x00, 0xE2, 0xDF, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE0, 0xD7, 0xC8, /* 0x88-0x8B */ + 0xFD, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDF, 0xEF, 0xCC, 0xD3, 0xD3, 0xF9, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF0, /* 0x94-0x97 */ + 0xDB, 0xC7, 0xDE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xF4, 0x00, 0x00, /* 0x9C-0x9F */ + 0xD5, 0xD0, 0xE5, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFC, 0xC7, 0xDC, 0xD6, 0xE2, 0xE0, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB0, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF3, 0xA3, 0x00, 0x00, 0xD3, 0xEC, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF4, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xFD, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xFD, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF9, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xD0, 0xFB, 0xEC, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xBC, 0xF2, 0xA4, /* 0xD4-0xD7 */ + 0xD8, 0xCE, 0xD8, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF5, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE1, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xD2, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xFB, 0xEC, 0x00, 0x00, 0xDD, 0xC8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE8, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xC1, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD7, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xD6, 0xBB, 
0xDE, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF7, 0xBD, 0xEC, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD0, 0xE1, 0x00, 0x00, 0xE0, 0xF5, /* 0x24-0x27 */ + 0xEA, 0xB3, 0x00, 0x00, 0xCE, 0xD6, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xA5, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xEC, 0xF6, 0xE2, 0xE1, 0xE3, 0xBE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xFC, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCD, 0xF0, 0x00, 0x00, 0xF9, 0xF6, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xDF, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE5, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xCE, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE1, 0xED, 0xB0, /* 0x60-0x63 */ + 0xFD, 0xD1, 0xF6, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF9, 0xCF, 0xEB, 0xDA, 0xCA, 0xC1, 0x00, 0x00, /* 0x68-0x6B */ + 0xD2, 0xB8, 0xCD, 0xF1, 0x00, 0x00, 0xE3, 0xD3, /* 0x6C-0x6F */ + 0xFD, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE3, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xF0, 0xAA, 0xF9, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xFC, 0xE2, 0x00, 0x00, 0xF8, 0xA7, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE5, 0xEE, 0xF9, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF6, /* 0x9C-0x9F */ + 0xEA, 0xED, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xB4, /* 0xA0-0xA3 */ + 0xF5, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDC, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xF0, 0xF5, 0x00, 0x00, 0xDD, 0xE8, 0xD3, 0xED, /* 0xB0-0xB3 */ + 0xF5, 0xFC, 0x00, 0x00, 0xDA, 0xBF, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xCC, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD3, 0xFA, 0xF4, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEF, 0xD7, 0x00, 0x00, 0xD4, 0xC3, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xFB, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xED, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE0, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xEE, /* 0xDC-0xDF */ + 0xFB, 0xB3, 0xE4, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF6, 0xE7, 0xD2, 0xDD, 0x00, 0x00, 0xDF, 0xCC, /* 0xF0-0xF3 */ + 
0x00, 0x00, 0x00, 0x00, 0xFC, 0xC9, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE5, 0xA9, 0xE0, 0xF6, 0xF6, 0xB3, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_61[512] = { + 0x00, 0x00, 0xE1, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF0, 0x00, 0x00, /* 0x04-0x07 */ + 0xEA, 0xEF, 0xEA, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xDA, 0xC0, 0xF8, 0xB4, 0xEB, 0xF2, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE4, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xD7, 0xE4, 0xF1, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xEF, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD7, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xFC, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xF3, 0xE1, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xC4, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xE3, 0xE5, 0x00, 0x00, 0xCB, 0xC5, 0xEA, 0xB4, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xBD, 0x00, 0x00, /* 0x40-0x43 */ + 0xD7, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xDB, /* 0x44-0x47 */ + 0xED, 0xB1, 0x00, 0x00, 0xCC, 0xC3, 0xF7, 0xBE, /* 0x48-0x4B */ + 0xFC, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xF4, /* 0x50-0x53 */ + 0x00, 0x00, 0xD9, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xF3, 0xD3, 0xF3, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xF7, 0xE4, 0x00, 0x00, 0xF7, 0xD1, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB7, 0xCE, 0xB1, /* 0x60-0x63 */ + 0xCA, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xB4, /* 0x64-0x67 */ + 0xCB, 0xC6, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xF6, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE7, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEA, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xD4, 0xCB, 0xAF, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF4, 0xAA, 0xE9, 0xAF, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xF5, 0xC3, 0xE9, 0xD8, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xE9, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF3, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD5, 0xFB, 0xDE, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xF4, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xFD, 0xF3, 0xFD, 0xF2, 0xF7, 0xA6, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xDD, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD3, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xCC, 0xA8, 0x00, 0x00, 0xDA, 0xC1, /* 0xA8-0xAB */ + 0xCC, 0xD5, 0x00, 0x00, 0xD9, 0xE4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xCA, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xBC, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC4, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD0, /* 0xC4-0xC7 */ + 0xFA, 0xAB, 0xEB, 0xEB, 0xE7, 0xF8, 0xD9, 0xE5, /* 0xC8-0xCB */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xD7, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA4, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xFB, 0xFC, 0xE3, /* 0xF4-0xF7 */ + 0xFA, 0xD8, 0x00, 0x00, 0xF3, 0xD5, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCF, 0xAB, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0xD5, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD4, /* 0x04-0x07 */ + 0xCD, 0xFC, 0x00, 0x00, 0xD9, 0xE6, 0x00, 0x00, /* 0x08-0x0B */ + 0xE2, 0xF9, 0xE2, 0xA1, 0xEB, 0xD4, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE0, 0xF7, 0xE4, 0xB2, 0xCC, 0xFC, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xE4, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xAB, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xBD, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCA, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB8, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xC0, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEE, 0xFA, 0xFD, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xD3, 0xE3, 0x00, 0x00, 0xFB, 0xC2, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE8, 0xDB, 0xAE, /* 0x3C-0x3F */ + 0xE1, 0xB6, 0xF8, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xBF, /* 0x44-0x47 */ + 0xFB, 0xC3, 0xDD, 0xEA, 0x00, 0x00, 0xE2, 0xA2, /* 0x48-0x4B */ + 0x00, 0x00, 0xEE, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xE8, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xF6, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xCA, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xD0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xA6, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xDD, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAF, /* 0x7C-0x7F */ + + 0xD0, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xF4, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xCC, 0xBC, 0xF7, 0xEA, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE5, 0xE4, 0xDF, 0xF1, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xF7, 0xE1, 0x00, 0x00, 0xF9, 0xF7, /* 0x94-0x97 */ + 0xEF, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD8, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA9, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF8, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xEE, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD8, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE4, 0xE3, 0xF5, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD9, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE7, /* 0xC4-0xC7 */ + 0xD2, 0xB9, 0xD5, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xDA, 0xE5, 0xDA, 0xD0, 0x00, 0x00, 0xD1, 0xD9, /* 0xCC-0xCF */ + 0xCE, 0xD8, 0x00, 0x00, 0xCB, 0xDE, 0xF4, 0xAC, /* 0xD0-0xD3 */ + 0xDA, 0xFB, 0x00, 0x00, 0xF6, 0xE9, 0xE8, 0xF3, /* 0xD4-0xD7 */ + 0xCF, 0xAC, 0xF0, 0xF0, 0x00, 0x00, 0xF4, 0xFD, /* 0xD8-0xDB */ + 0xDB, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCE, 0xC0, 0xE3, 0xD4, 0xD1, 0xCF, 0xF1, 0xF5, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCD, 0xF2, 0x00, 0x00, 0xCF, 0xEB, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xB8, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA6, 0xD1, 0xDA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0x00, 0x00, 0xF2, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA6, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD3, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xA9, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xC9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xD8, 0xE6, 0xC9, /* 0x38-0x3B */ + 0x00, 0x00, 0xD8, 0xB8, 0xFA, 0xF3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF3, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF8, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF3, /* 0x4C-0x4F */ + 0xE6, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF8, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xE9, /* 0x64-0x67 */ + 0xDE, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xDF, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEC, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDF, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF4, 0xD2, 0xBA, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB7, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE2, 0xA3, 0xD3, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xED, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC9, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFA, 0x00, 0x00, /* 0x94-0x97 */ + 0xCF, 0xDE, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xD0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xD5, 0xD3, 0xF3, 0xF5, 0xF7, 0xAE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xEF, 0xC8, 0x00, 0x00, 0xCD, 0xF3, /* 0xA4-0xA7 */ + 0xF5, 0xCF, 0xE5, 0xF3, 0xF0, 0xC2, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCA, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0xF1, 0x00, 0x00, 0xD0, 0xA6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xDA, /* 0xCC-0xCF */ + 0xF0, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xE7, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC0, 0xFC, 0xB5, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE4, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCC, 0xA9, 0xFD, 0xC6, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEA, 0xB5, 0x00, 0x00, 0xE5, 0xAA, 0xDF, 0xBA, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_64[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE1, 0xDF, 0x00, 0x00, 0xDA, 0xD1, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE1, 0xB8, 0x00, 0x00, 0xE8, 0xF4, 0xD3, 0xFD, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xE2, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xCA, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xDA, 0xE6, 0xF7, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCD, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xB6, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xEE, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xF5, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xD8, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA7, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xD9, 0xB8, 0xD9, 0xB9, 0xEF, 0xC9, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD6, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF7, 0xCB, 0xDF, 0xAE, 0xE8, 0xF5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB5, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF4, 0xCC, 0xDA, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xE8, /* 0xA8-0xAB */ + 0x00, 0x00, 0xF7, 0xEB, 0xF5, 0xC9, 0x00, 0x00, /* 0xAC-0xAF */ + 0xF3, 0xBC, 0x00, 0x00, 0xDA, 0xD2, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB5, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE8, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD6, 0xCF, 0xF4, 0xBA, 0x00, 0x00, 0xF7, 0xC9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xAA, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xF0, 0xC3, 0xCC, 0xD6, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xD3, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xD3, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDB, 0xFB, 0x00, 0x00, 0xCB, 0xE0, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xD3, 0xE4, 0xF6, 0xF7, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD5, 0xBA, 0xF3, 0xCD, 0xCB, 0xE1, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xEB, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xAD, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xFC, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xEC, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xF6, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0xDA, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF7, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE5, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE0, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xFD, /* 0x20-0x23 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xE6, 0xFC, 0xAB, /* 0x28-0x2B */ + 0xD5, 0xBB, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA8, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xA5, 0xCD, 0xB9, /* 0x34-0x37 */ + 0xEA, 0xF2, 0xCB, 0xC7, 0x00, 0x00, 0xCD, 0xF4, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAF, 0xEF, 0xD9, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCD, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xFC, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xDF, 0xF3, 0xCE, 0xE7, 0xDA, 0xC2, /* 0x4C-0x4F */ + 0x00, 0x00, 0xCF, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF9, 0xF8, 0xA8, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xE2, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xF2, 0xDF, 0xA4, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC4, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xCC, 0xD7, 0xE5, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xBB, 0x00, 0x00, /* 0x70-0x73 */ + 0xEF, 0xDA, 0xEE, 0xD8, 0x00, 0x00, 0xDD, 0xA7, /* 0x74-0x77 */ + 0xE2, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC0, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xB0, 0xF8, 0xCA, /* 0x80-0x83 */ + 0x00, 0x00, 0xFC, 0xFA, 0x00, 0x00, 0xD9, 0xFE, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xDE, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDD, 0xEC, 0xDA, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE0, /* 0x94-0x97 */ + 0x00, 0x00, 0xD6, 0xF9, 0x00, 0x00, 0xCD, 0xD7, /* 0x98-0x9B */ + 0xDE, 0xD8, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF8, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xD0, 0xC5, 0xF4, 0xAE, 0x00, 0x00, 0xDD, 0xA8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xC5, /* 0xA8-0xAB */ + 0xF3, 0xD6, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xD9, /* 0xAC-0xAF */ + 0xE3, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA8, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xDB, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE5, 0xDA, 0xE3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDB, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD5, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC1, /* 0xC8-0xCB */ + 0xEF, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE9, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB2, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xFD, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD9, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xFE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xEC, 0xED, 0xD3, 0xA9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF2, 0xA9, 0xF0, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE2, 0xE2, 0xE9, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xF9, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE9, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xDA, 0xDA, 0xC3, /* 0xF8-0xFB 
*/ + 0xDA, 0xC4, 0xD4, 0xC5, 0x00, 0x00, 0xE7, 0xFA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_66[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE0, 0xE3, 0xB0, /* 0x04-0x07 */ + 0x00, 0x00, 0xDB, 0xB2, 0xFB, 0xC4, 0x00, 0x00, /* 0x08-0x0B */ + 0xF3, 0xE3, 0x00, 0x00, 0xD9, 0xA5, 0xFB, 0xE7, /* 0x0C-0x0F */ + 0xDD, 0xCB, 0xD0, 0xD4, 0x00, 0x00, 0xE6, 0xB6, /* 0x10-0x13 */ + 0xE0, 0xAE, 0xFD, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB5, 0xE0, 0xF8, /* 0x1C-0x1F */ + 0xE7, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xF5, 0xF0, 0x00, 0x00, 0xD8, 0xDC, /* 0x24-0x27 */ + 0xED, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xE1, 0xB9, 0x00, 0x00, 0xE3, 0xC0, /* 0x2C-0x2F */ + 0xF9, 0xC0, 0xE9, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xD9, 0xDB, 0x00, 0x00, 0xF3, 0xE4, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB6, 0xE4, 0xE9, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xF0, 0xC5, 0xE3, 0xC1, 0xFC, 0xCC, /* 0x40-0x43 */ + 0xFC, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF2, 0xCB, 0x00, 0x00, 0xF2, 0xCC, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCF, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xF1, 0xDB, 0x00, 0x00, 0xFA, 0xD9, /* 0x58-0x5B */ + 0x00, 0x00, 0xF1, 0xB8, 0xFD, 0xF5, 0xE0, 0xF9, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE7, 0xFB, 0xFC, 0xB7, 0xFC, 0xE4, 0xFB, 0xC5, /* 0x64-0x67 */ + 0xE3, 0xE7, 0xD8, 0xB9, 0x00, 0x00, 0xF6, 0xF8, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC5, 0xCC, 0xD8, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xAF, /* 0x70-0x73 */ + 0xF4, 0xE7, 0x00, 0x00, 0xEF, 0xDC, 0xCF, 0xFC, /* 0x74-0x77 */ + 0xEF, 0xDD, 0x00, 0x00, 0xF2, 0xAA, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xFD, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAC, /* 0x84-0x87 */ + 0xFD, 0xBB, 0xFD, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB2, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEA, 0xD1, 0xDF, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xEC, 0xE4, 0xDE, /* 0x94-0x97 */ + 0xE5, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xD9, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCD, 0xBC, 0x00, 0x00, 0xF3, 0xE5, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD5, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xBA, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xE7, 0xFB, 0xB5, /* 0xB0-0xB3 */ + 0xF8, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE0, 0xE7, 0x00, 0x00, 0xCC, 0xD9, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC6, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE7, 0xA5, 0x00, 0x00, 0xD5, 0xF5, 0xD3, 0xBE, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xFC, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF2, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xDF, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xE8, 0xF8, 0xF8, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xCE, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xF6, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE8, 0xD8, 0x00, 0x00, 0xCD, 0xD8, 0xE7, 0xD6, /* 0xF0-0xF3 */ + 0xCC, 0xDA, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE3, /* 0xF4-0xF7 */ + 0xDF, 0xF6, 0xF0, 0xC7, 0xF0, 0xC6, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD8, 0xBA, 0x00, 0x00, 0xF1, 0xF4, 0xF4, 0xF0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0xF5, 0xCC, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xEA, 0xC5, 0xEA, 0xF3, 0x00, 0x00, 0xDD, 0xDB, /* 0x08-0x0B */ + 0x00, 0x00, 0xDC, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xDE, 0xFD, 0xF2, 0xF9, 0x00, 0x00, 0xD5, 0xC7, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD0, /* 0x18-0x1B */ + 0x00, 0x00, 0xF0, 0xC8, 0xD1, 0xA1, 0xD1, 0xA2, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD4, 0xD6, 0xE8, /* 0x24-0x27 */ + 0xD9, 0xCA, 0x00, 0x00, 0xDA, 0xB1, 0xD8, 0xC7, /* 0x28-0x2B */ + 0xDC, 0xE2, 0xF3, 0xCE, 0xF5, 0xF4, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF1, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xDA, 0xD3, 0x00, 0x00, 0xF6, 0xEA, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xF5, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xFD, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD2, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xDF, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDD, 0xFA, 0xBA, /* 0x4C-0x4F */ + 0xEE, 0xA7, 0xF5, 0xBD, 0x00, 0x00, 0xF8, 0xF5, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xE8, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xD4, 0xE1, 0x00, 0x00, 0xD1, 0xA3, 0xE1, 0xD6, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF9, 0xF8, 0x00, 0x00, 0xDB, 0xCA, /* 0x6C-0x6F */ + 0xCB, 0xF9, 0xD4, 0xD4, 0x00, 0x00, 0xD9, 0xDC, /* 0x70-0x73 */ + 0x00, 0x00, 0xEE, 0xBE, 0x00, 0x00, 0xF7, 0xED, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xEE, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0xF7, 0xF9, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xED, /* 0x84-0x87 */ + 0x00, 0x00, 0xE8, 0xDB, 0x00, 0x00, 0xDB, 0xB3, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xF7, /* 0x8C-0x8F */ + 0xE0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE2, /* 0x90-0x93 */ + 0x00, 0x00, 0xF6, 0xD7, 0x00, 0x00, 0xD7, 0xF9, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xDD, 0x00, 0x00, /* 0x98-0x9B */ + 0xCD, 0xFD, 0xF2, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xBD, /* 0xAC-0xAF */ + 0xF8, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xAC, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAD, 0xCA, 0xAE, /* 0xB4-0xB7 */ + 0xCF, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC2, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDC, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xDA, /* 0xCC-0xCF */ + 0xD9, 0xBB, 0xCA, 0xF3, 0xF6, 0xD3, 0xE6, 0xF8, /* 0xD0-0xD3 */ + 0xEA, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xF6, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCF, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCA, 0xD3, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xAF, /* 0xEC-0xEF */ + 0xD2, 0xB0, 0xF1, 0xBA, 0x00, 0x00, 0xD7, 0xB3, /* 0xF0-0xF3 */ + 0xE3, 0xC3, 0xF3, 0xFD, 0xDE, 0xDA, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDB, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE3, 0xEE, 0xFB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF7, 0xD7, 0xCA, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCE, 0xE8, 0xDB, 0xDB, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xBB, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF1, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xFA, 0xB7, 0xD0, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xCC, 0xAB, 0xEE, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xCB, 0xFA, 0xF9, 0xF9, 0xCC, 0xFD, 0xD3, 0xFE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE4, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xEE, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD4, 0xD5, 0xDF, 0xCD, 0x00, 0x00, 0xFC, 0xB8, /* 0x50-0x53 */ + 0xD1, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF2, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xD2, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD4, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD5, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD8, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD9, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xA9, /* 0x90-0x93 */ + 0xF6, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xDB, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF0, 0xC9, 0x00, 0x00, 0xFC, 0xFC, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE8, 0xC9, 0xF4, 0xFE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xFC, /* 0xA4-0xA7 */ + 0xD7, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xDE, 0xDC, 0x00, 0x00, 0xF0, 0xAC, /* 0xAC-0xAF */ + 0xCC, 0xFE, 0xCD, 0xE1, 0x00, 0x00, 0xE1, 0xBA, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xDB, 0xEF, 0xDA, 0xB2, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD1, 0xA5, 0xDC, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD8, 0xF6, 0x00, 0x00, 0xD1, 0xA4, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCD, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEA, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF0, 0xF7, 0x00, 0x00, 0xF0, 0xCA, /* 0xD4-0xD7 */ + 0xD0, 0xBE, 0x00, 0x00, 0xDD, 0xDC, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD6, /* 0xDC-0xDF */ + 0xD3, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xD0, /* 0xE4-0xE7 */ + 0xCD, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xF8, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xD4, 0xA1, 0xCE, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_69[512] = { + 0xE8, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xEB, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE3, 0xD5, 0xF5, 0xD0, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA1, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xA7, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE5, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xE6, 0xCB, 0x00, 0x00, 0xF5, 0xF1, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC5, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA3, /* 0x50-0x53 */ + 0xE0, 0xDB, 0xF6, 0xEB, 0x00, 0x00, 0xCB, 0xF1, /* 0x54-0x57 */ + 0x00, 0x00, 0xD9, 0xEA, 0xF5, 0xA2, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xD1, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD1, 0xF8, 0xEA, 0xF8, 0xEA, 0xF9, 0xDA, 0xB3, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xEF, 0xDF, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xEF, /* 0x68-0x6B */ + 0x00, 0x00, 0xE5, 0xF6, 0xEE, 0xBF, 0xE2, 0xE4, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD0, 0xBF, 0x00, 0x00, 0xFA, 0xAC, /* 0x74-0x77 */ + 0xF5, 0xD1, 0xE7, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xE9, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xCE, /* 0x98-0x9B */ + 0xDB, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xFC, 0xCE, 0x00, 0x00, 0xDD, 0xEE, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB4, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xD7, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xB4, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCD, 0xBE, 0x00, 0x00, 0xDA, 0xE9, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB0, /* 0xC8-0xCB */ + 0xF7, 0xD9, 0xF3, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xCE, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCE, 0xAA, 0x00, 0x00, 0xCB, 0xC8, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA7, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF0, 0xCB, 0x00, 0x00, 0xD0, 0xC7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC5, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xE0, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD5, 0xDA, 0x00, 0x00, 0xD7, 0xA7, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC0, /* 0x14-0x17 */ + 0x00, 0x00, 0xF8, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD2, 0xED, 0xE9, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD9, 0xBC, 0x00, 0x00, 0xE5, 0xC6, /* 0x20-0x23 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF5, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xDA, 0xD4, 0xE2, 0xA7, 0xFB, 0xFC, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF1, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xCA, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xE9, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF8, 0xE2, 0xE5, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD0, 0xB9, 0xD4, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA6, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDF, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xF4, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD3, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xCC, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xEF, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE5, 0xE5, 0xD0, 0xD5, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xFC, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xFC, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xFE, 0xED, 0xEA, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB1, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xE3, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xA2, 0xCF, 0xF6, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD0, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xEA, 0xF1, 0xEE, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xCB, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA1, /* 0xF8-0xFB 
*/ +}; + +static const unsigned char u2c_6B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD5, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xED, 0x00, 0x00, /* 0x08-0x0B */ + 0xED, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB2, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xBC, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xFD, 0xE2, 0xF3, 0xAD, 0x00, 0x00, 0xFD, 0xDB, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB0, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xFD, 0xE3, 0xCE, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xE4, 0xFA, 0xCE, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCA, 0xB0, 0x00, 0x00, 0xF7, 0xA7, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCF, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xFC, 0xB6, 0xF2, 0xAD, 0xEF, 0xE1, /* 0x60-0x63 */ + 0xF3, 0xAE, 0xDC, 0xC6, 0xD9, 0xEB, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE0, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA8, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF6, /* 0x74-0x77 */ + 0xCF, 0xFD, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDD, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xD1, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEA, /* 0x80-0x83 */ + 0xF2, 0xCF, 0x00, 0x00, 0xF7, 0xBF, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE2, 0xE6, 0xE2, 0xA8, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD6, 0x00, 0x00, /* 0x94-0x97 */ + 0xED, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF9, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xB1, 0xDE, 0xB2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE8, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xD3, 0xAB, 0x00, 0x00, 0xEB, 0xDC, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xAF, 0x00, 0x00, /* 0xB8-0xBB */ + 0xCA, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xFC, /* 0xBC-0xBF */ + 0x00, 0x00, 0xFD, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEB, 0xF6, 0xCF, 0xB2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xEC, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD9, 0xBD, 0x00, 0x00, 0xD8, 0xDF, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xB8, 0xEB, 0xBE, /* 0xD0-0xD3 */ + 
0xDD, 0xEF, 0x00, 0x00, 0xDD, 0xF0, 0xDD, 0xF1, /* 0xD4-0xD7 */ + 0xDD, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xBE, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC6, /* 0xE8-0xEB */ + 0xCF, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ +}; + +static const unsigned char u2c_6C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xEE, 0xFD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xAB, /* 0x0C-0x0F */ + 0x00, 0x00, 0xDA, 0xC5, 0x00, 0x00, 0xD8, 0xEC, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xA8, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE2, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xBC, /* 0x34-0x37 */ + 0xE7, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF0, 0x00, 0x00, /* 0x3C-0x3F */ + 0xEF, 0xE2, 0xF1, 0xF0, 0xCF, 0xB4, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE0, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xDF, 0xA5, 0x00, 0x00, 0xF9, 0xD2, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xFD, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0xA3, 0xFB, 0xF1, 0xCB, 0xB0, /* 0x5C-0x5F */ + 0xF2, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xCD, 0xE7, 0x00, 0x00, 0xE8, 0xDC, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE7, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF7, 0xC0, 0x00, 0x00, 0xD0, 0xE3, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA1, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xBD, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xD1, 0xA9, 0xDD, 0xCC, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE3, 0xFE, 0xD1, 0xAA, 0xE8, 0xAA, /* 0x80-0x83 */ + 0x00, 0x00, 0xEA, 0xB6, 0xF9, 0xFA, 0xE6, 0xCC, /* 0x84-0x87 */ + 0xF6, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xD4, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD9, 0xCB, 0x00, 0x00, 0xD9, 0xD2, 0xD3, 0xCB, /* 0x90-0x93 */ + 0xD8, 0xF7, 0xDA, 0xA9, 0xF5, 0xF8, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xDE, 0xDE, 0xF2, 0xAF, 0xF8, 0xA9, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC8, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xC1, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC1, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 
0xDD, 0xF3, 0xEA, 0xFA, 0x00, 0x00, 0xF6, 0xBD, /* 0xB8-0xBB */ + 0xE1, 0xBB, 0xCD, 0xBF, 0xF4, 0xD4, 0xE6, 0xCD, /* 0xBC-0xBF */ + 0x00, 0x00, 0xFC, 0xCF, 0xFB, 0xA2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE0, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF4, 0xBB, 0xDA, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF9, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xF2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDB, 0xF6, 0x00, 0x00, 0xDE, 0xDF, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF2, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF8, 0xDC, 0xF7, 0xEE, 0xEB, 0xE8, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD2, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF1, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xDA, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xEA, 0xDA, 0xC6, /* 0xEC-0xEF */ + 0xF7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB6, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_6D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xC7, /* 0x08-0x0B */ + 0xD6, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xDC, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x14-0x17 */ + 0x00, 0x00, 0xE2, 0xAA, 0x00, 0x00, 0xD5, 0xA6, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xD7, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xF2, 0xD0, 0x00, 0x00, 0xEA, 0xFB, /* 0x24-0x27 */ + 0x00, 0x00, 0xE0, 0xDD, 0xFB, 0xF3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xBD, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xE2, 0xE7, 0xFD, 0xD7, 0x00, 0x00, /* 0x34-0x37 */ + 0xCE, 0xC8, 0xEA, 0xB7, 0x00, 0x00, 0xFC, 0xC0, /* 0x38-0x3B */ + 0x00, 0x00, 0xFD, 0xE7, 0xF7, 0xEF, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD7, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xEF, 0xBA, 0xF1, 0xDD, 0x00, 0x00, /* 0x58-0x5B */ + 0xDE, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCB, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xDD, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFB, 0xC7, 0xD5, 0xC8, 0x00, 0x00, /* 0x68-0x6B */ + 0xD7, 0xDF, 0x00, 0x00, 0xDD, 0xA9, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE9, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAD, /* 0x74-0x77 */ + 0xF6, 0xD9, 0xFA, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xAA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xEE, 0x00, 0x00, 0xCC, 0xDC, /* 0x84-0x87 */ + 0xE1, 0xBC, 0xE0, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xE9, 0xBF, 0xFC, 0xFD, 0xE6, 0xCE, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE1, 0xD7, 0x00, 0x00, 0xE6, 0xCF, /* 0x90-0x93 */ + 0x00, 0x00, 0xF4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xFB, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF9, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xEF, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEE, /* 0xC0-0xC3 */ + 0xF6, 0xBE, 0xE0, 0xB2, 0xFC, 0xFE, 0xD1, 0xAB, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFA, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC8, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE2, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD4, 0xA3, 0xF0, 0xF8, 0xD7, 0xA8, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xD3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEF, 0xE4, 0x00, 0x00, 0xD7, 0xC5, 0xEB, 0xE2, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE4, 0xA2, 0x00, 0x00, 0xE2, 0xE8, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE6, 0xD0, 0x00, 0x00, 0xFB, 0xE8, /* 0xF4-0xF7 */ + 0xF4, 0xE8, 0xE5, 0xF4, 0xF4, 0xBC, 0xF4, 0xD5, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB6, /* 0x14-0x17 */ + 0x00, 0x00, 0xFC, 0xB9, 0xEE, 0xC2, 0xCA, 0xF5, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xE5, /* 0x1C-0x1F */ + 0xCB, 0xE2, 0xD4, 0xA4, 0x00, 0x00, 0xDE, 0xE0, /* 0x20-0x23 */ + 0xDA, 0xFD, 0xE4, 0xC6, 0xE8, 0xBE, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xDE, /* 0x28-0x2B */ + 0xF6, 0xB4, 0xEA, 0xD2, 0x00, 0x00, 0xF9, 0xFB, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC2, 0x00, 0x00, /* 0x30-0x33 */ + 0xCA, 0xE4, 0x00, 0x00, 0xE7, 0xB7, 0x00, 0x00, /* 0x34-0x37 */ + 0xEA, 0xFD, 0x00, 0x00, 0xD9, 0xDD, 0x00, 0x00, /* 0x38-0x3B */ + 0xDA, 0xB4, 0xEE, 0xAA, 0xFB, 0xE9, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xCB, /* 0x40-0x43 */ + 0xDA, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xBE, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xD3, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xC9, 0x00, 0x00, /* 0x54-0x57 */ + 0xDF, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC0, /* 0x58-0x5B */ + 0xE3, 0xD7, 0x00, 0x00, 0xEF, 0xE6, 0xFC, 0xD0, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC0, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD3, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xDC, 0xF7, 0xB7, /* 0x6C-0x6F */ + 0x00, 
0x00, 0x00, 0x00, 0xEA, 0xB8, 0xD1, 0xF9, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC8, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEA, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xDE, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xD7, 0xB6, 0xCF, 0xB5, 0x00, 0x00, 0xD9, 0xA8, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xEE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDD, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA2, 0xE8, 0xAE, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBD, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF2, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC1, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xFC, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xB5, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF3, 0xE7, 0xD8, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xFC, 0xD1, 0x00, 0x00, 0xED, 0xB2, /* 0xC8-0xCB */ + 0xF4, 0xAF, 0x00, 0x00, 0xFB, 0xA3, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xFC, 0xC1, 0x00, 0x00, 0xEE, 0xAB, /* 0xD0-0xD3 */ + 0xD4, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF2, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEE, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xFB, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE3, 0xD8, 0xBB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0x00, 0x00, 0xE5, 0xDB, 0xF8, 0xF7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xD4, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xA9, /* 0x0C-0x0F */ + 0x00, 0x00, 0xCB, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE6, 0xD1, 0xF0, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD8, 0xAE, 0x00, 0x00, 0xF9, 0xD3, 0xD5, 0xFE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xBC, /* 0x28-0x2B */ + 0xF2, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE2, 0xAB, 0xF3, 0xE8, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xEF, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xEC, /* 0x3C-0x3F */ + 0x00, 0x00, 0xE7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xDA, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCC, 0xBE, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFC, /* 0x54-0x57 */ + 0xDA, 0xEB, 0x00, 0x00, 0xE2, 0xD8, 0xED, 0xD6, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD1, 0xE0, 0xB3, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xD2, 0x00, 0x00, /* 0x60-0x63 */ + 0xEB, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xD3, 0xC1, 0xF0, 0xCD, 0x00, 0x00, /* 0x6C-0x6F */ + 0xCF, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD2, 0x00, 0x00, /* 0x78-0x7B */ + 0xD4, 0xD8, 0xDC, 0xC9, 0xD7, 0xF1, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xDF, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xF3, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xF4, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xF1, 0xBF, 0xF8, 0xB1, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE9, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xFB, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xD5, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xD4, /* 0xA0-0xA3 */ + 0xF7, 0xCA, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC8, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xE8, 0xF3, 0xBD, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEE, 0xFE, 0x00, 0x00, 0xE7, 0xFE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD3, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB6, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCC, 0xAD, 0xF6, 0xFA, 0xD6, 0xB2, 0xD2, 0xD8, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD8, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE3, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xB9, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xAD, /* 0xDC-0xDF */ + 0xFB, 0xCC, 0xEB, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD4, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xFB, 0xCD, 0x00, 0x00, 0xD5, 0xBD, /* 0xE8-0xEB */ + 0xF1, 0xDF, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xFB, /* 0xEC-0xEF */ + 0x00, 0x00, 0xDE, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xEB, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0x00, 0x00, 0xE5, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xFB, 0xA4, 0xD4, 0xB9, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xDE, 0xE1, 0x00, 0x00, 0xE4, 0xA3, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB7, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF8, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xDE, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xD6, 
0xD2, 0x00, 0x00, 0xF9, 0xD5, 0xE7, 0xBA, /* 0x18-0x1B */ + 0xEB, 0xD5, 0xD5, 0xF7, 0xEF, 0xE7, 0xE1, 0xBE, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xAE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xE9, /* 0x24-0x27 */ + 0xD6, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBB, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xCB, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCE, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xFB, 0xA5, 0xE1, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xF7, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xFB, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xBD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xFD, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xFC, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xCF, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xED, 0xC7, 0xEE, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xCC, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xA7, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFA, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA4, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xFD, 0xDC, 0xED, 0xB3, 0xCE, 0xC9, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEF, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE1, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xDB, /* 0xA8-0xAB */ + 0xCB, 0xE3, 0xF7, 0xA9, 0x00, 0x00, 0xFB, 0xA6, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB9, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xC0, /* 0xB4-0xB7 */ + 0xED, 0xC8, 0xEF, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xD6, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCE, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA1, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xFB, 0xF4, 0xD5, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF1, 0xF6, 0x00, 0x00, 0xE6, 0xD3, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF 
*/ + 0x00, 0x00, 0xCC, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xF8, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xDC, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xFD, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE5, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xF1, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDB, 0xCC, 0xDD, 0xCD, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC8, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD9, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA5, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD4, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xC8, /* 0x44-0x47 */ + 0x00, 0x00, 0xD6, 0xA1, 0xFD, 0xBF, 0x00, 0x00, /* 0x48-0x4B */ + 0xFC, 0xD3, 0x00, 0x00, 0xEF, 0xA1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE7, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xEE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE6, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE9, 0xF2, 0x00, 0x00, 0xDF, 0xB0, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xD8, 0xE0, 0xFC, 0xBA, 0xFD, 0xAF, 0xF0, 0xCE, /* 0x64-0x67 */ + 0x00, 0x00, 0xDB, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE5, 0xC9, 0x00, 0x00, 0xED, 0xB4, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE0, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xE3, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE9, 0xFB, 0xEA, 0xA8, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xA7, 0x00, 0x00, /* 0x90-0x93 */ + 0xE9, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xFD, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD9, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEC, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE8, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE6, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xFD, 0xF8, 0xFD, 0xF9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xBF, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE7, 0xA7, 0x00, 0x00, 0xE6, 0xD7, /* 0xC0-0xC3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xD4, 0xF3, 0xD4, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xFA, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD7, 0xF2, 0x00, 0x00, 0xE1, 0xC0, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDB, 0xE2, 0xE6, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xBD, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF0, 0xCF, 0xF3, 0xBE, 0xE2, 0xAC, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF5, 0xB7, 0xE0, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xB8, /* 0xF8-0xFB */ + 0xE3, 0xE8, 0x00, 0x00, 0xD4, 0xA7, 0xE8, 0xFC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0xFA, 0xD2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xEF, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD6, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD0, 0x00, 0x00, /* 0x28-0x2B */ + 0xF7, 0xF0, 0xEE, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEA, 0xBA, 0x00, 0x00, 0xEA, 0xD3, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xED, 0xC9, 0xDD, 0xAB, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAC, 0xFD, 0xA1, /* 0x38-0x3B */ + 0x00, 0x00, 0xDF, 0xD0, 0xEC, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDF, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xED, 0xF8, 0xB8, /* 0x44-0x47 */ + 0xF7, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF8, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE0, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD4, 0xBA, 0xE4, 0xB3, 0x00, 0x00, 0xE9, 0xDA, /* 0x58-0x5B */ + 0x00, 0x00, 0xDE, 0xB6, 0x00, 0x00, 0xD9, 0xBF, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD9, 0xC0, 0xD6, 0xEF, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCC, /* 0x64-0x67 */ + 0x00, 0x00, 0xDA, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xE5, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF7, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xCC, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDF, 0xF9, 0xD7, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xBB, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xFA, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xCC, 0xB3, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xF3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xDF, 0xD2, 0x00, 0x00, 0xCE, 0xCA, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEE, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE4, 0x00, 0x00, /* 0xCC-0xCF */ + 0xFB, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB7, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xEE, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE2, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xD7, 0xE1, 0xFA, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD5, 0xC9, 0xF8, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xD9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xE9, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xED, /* 0x18-0x1B */ + 0xE3, 0xC4, 0xF0, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE0, 0xFA, 0xEE, 0xC4, 0xD9, 0xDE, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xA2, 0xEB, 0xA3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xC2, 0xEA, 0xBB, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE8, 0xAB, 0xDE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xED, 0xEF, 0x00, 0x00, 0xE8, 0xA3, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xF1, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xD4, 0xBC, 0x00, 0x00, 0xFC, 0xEA, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE7, 0xBE, 0x00, 0x00, 0xFC, 0xF2, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD6, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xE2, 0xAE, 0x00, 0x00, 0xD3, 0xB7, 0xFA, 0xCC, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xFA, 0xDC, 0x00, 0x00, 0xED, 0xB5, 0xE1, 0xE3, /* 0x84-0x87 */ + 0x00, 0x00, 0xE8, 0xAC, 0x00, 0x00, 0xE8, 0xDD, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xE9, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xF4, 0xBD, 0x00, 0x00, 0xCF, 0xB8, 0xE9, 0xDB, /* 0x94-0x97 */ + 0xD1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xC7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xC9, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE8, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xDE, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xBC, 0xD3, 0xE5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xFA, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xDA, 0xD6, 0x00, 0x00, 0xCA, 0xB1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xDA, 0xC8, 0xDF, 0xA6, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF9, 0xB3, 0xF2, 0xD2, 0x00, 0x00, 0xCA, 0xC4, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCB, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xCD, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xFD, 0xB0, 0xD5, 0xA8, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF1, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xE9, /* 0xE0-0xE3 */ + 0xDC, 0xCA, 0xEC, 0xB4, 0xFA, 0xC0, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xFB, 0xA8, 0xD0, 0xA8, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xDA, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xEE, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE0, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xEF, 0xEA, 0xFA, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0x00, 0x00, 0xE0, 0xC4, 0x00, 0x00, 0xCF, 0xB9, /* 0x00-0x03 */ + 0x00, 0x00, 0xD5, 0xCA, 0xD7, 0xE2, 0xE2, 0xAF, /* 0x04-0x07 */ + 0x00, 0x00, 0xD7, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xEF, 0xA2, 0xE2, 0xDA, 0xF6, 0xFC, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xFB, 0xD0, 0xD1, 0xAD, 0x00, 0x00, /* 0x24-0x27 */ + 0xCD, 0xE4, 0x00, 0x00, 0xD1, 0xAE, 0xDC, 0xED, /* 0x28-0x2B */ + 0xE8, 0xCE, 0x00, 0x00, 0xF0, 0xF9, 0xCE, 0xB5, /* 0x2C-0x2F */ + 0xE6, 0xFC, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFB, /* 0x30-0x33 */ + 0xD0, 0xD6, 0xDD, 0xF5, 0xF7, 0xF1, 0x00, 0x00, /* 0x34-0x37 */ + 0xF6, 0xFD, 0x00, 0x00, 0xDB, 0xF7, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xEA, /* 0x3C-0x3F */ + 0xE9, 
0xDC, 0xD9, 0xC1, 0x00, 0x00, 0xF5, 0xF2, /* 0x40-0x43 */ + 0xE0, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD4, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF9, 0xC2, 0x00, 0x00, 0xEA, 0xBC, /* 0x54-0x57 */ + 0x00, 0x00, 0xD2, 0xC5, 0xFB, 0xD1, 0xE7, 0xC0, /* 0x58-0x5B */ + 0xEB, 0xA5, 0x00, 0x00, 0xDF, 0xFA, 0xE3, 0xA2, /* 0x5C-0x5F */ + 0xD7, 0xB9, 0x00, 0x00, 0xE9, 0xC3, 0x00, 0x00, /* 0x60-0x63 */ + 0xE8, 0xFD, 0xE8, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF2, 0xD3, 0xFB, 0xA9, 0xD8, 0xA5, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC8, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xAF, 0xD7, 0xE3, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC6, /* 0x84-0x87 */ + 0x00, 0x00, 0xD6, 0xA2, 0x00, 0x00, 0xED, 0xF0, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xD7, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xFC, 0xD4, 0x00, 0x00, 0xDA, 0xD7, 0xCC, 0xDF, /* 0x9C-0x9F */ + 0x00, 0x00, 0xF2, 0xD4, 0x00, 0x00, 0xD1, 0xB0, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCC, 0xE0, 0x00, 0x00, 0xDB, 0xFD, /* 0xA4-0xA7 */ + 0xF3, 0xBF, 0x00, 0x00, 0xF0, 0xD1, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xFC, 0xBB, 0x00, 0x00, 0xE2, 0xB0, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE6, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xDF, 0xDE, 0x00, 0x00, 0xE0, 0xC7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xEF, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE1, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xEA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE7, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCE, 0xB6, 0x00, 0x00, 0xF3, 0xC0, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCD, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xFB, 0xD2, 0x00, 0x00, 0xF8, 0xF8, 0xF7, 0xFB, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xBF, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB7, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB6, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_75[512] = { + 0x00, 0x00, 0xDC, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xCC, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xF1, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE8, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xCA, 0xF6, 0x00, 0x00, 0xE4, 0xA4, 0xF4, 0xD6, /* 0x18-0x1B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xE6, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xA7, /* 0x20-0x23 */ + 0x00, 0x00, 0xDF, 0xE7, 0xE1, 0xC1, 0x00, 0x00, /* 0x24-0x27 */ + 0xE9, 0xC4, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xCB, /* 0x28-0x2B */ + 0xE9, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEF, 0xA3, 0xEB, 0xA6, 0xCB, 0xA3, 0xE3, 0xE9, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xFB, /* 0x34-0x37 */ + 0xEF, 0xA4, 0x00, 0x00, 0xEF, 0xEB, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB4, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCD, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xE6, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEF, 0xA5, 0x00, 0x00, 0xD3, 0xCC, /* 0x50-0x53 */ + 0xDA, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD7, 0xBA, 0x00, 0x00, 0xF2, 0xD5, /* 0x58-0x5B */ + 0xF5, 0xE5, 0xD9, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB4, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xD5, 0xD4, 0xFD, 0xCF, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xE3, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE1, /* 0x6C-0x6F */ + 0xEC, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xFB, 0xFE, 0xD3, 0xD7, 0x00, 0x00, /* 0x74-0x77 */ + 0xD1, 0xB1, 0x00, 0x00, 0xCB, 0xB1, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xB2, 0xF1, 0xC2, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE1, 0xF9, 0xB5, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC3, 0xE1, 0xC2, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEB, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xDF, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCB, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xB9, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF8, 0xDE, 0xF9, 0xAA, 0xCA, 0xF7, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xED, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD3, 0xB8, 0xF2, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD4, 0xD9, 0xEE, 0xC5, 0xF2, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xDC, 0xBB, 0x00, 0x00, 0xF1, 0xF8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xEC, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xCA, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF6, 0xC0, 0xFD, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD4, 0xE3, 0xCC, 0xE2, 0x00, 0x00, 0xF7, 0xD4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xE5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xD3, 0xC3, 0x00, 0x00, 0xD8, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 
*/ + 0xF6, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF6, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCD, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0xE5, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE1, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB0, /* 0x1C-0x1F */ + 0xF4, 0xB0, 0xF3, 0xEA, 0xDA, 0xEE, 0x00, 0x00, /* 0x20-0x23 */ + 0xD7, 0xBB, 0x00, 0x00, 0xE2, 0xB1, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAA, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xFB, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE4, 0xDF, 0x00, 0x00, 0xCA, 0xD6, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xA8, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xFE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF6, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEF, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xD4, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE8, 0xB9, 0x00, 0x00, 0xEF, 0xA6, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xCD, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF4, /* 0x78-0x7B */ + 0xDB, 0xA1, 0xDB, 0xDC, 0xDB, 0xDD, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xEE, 0xDC, 0x00, 0x00, 0xCB, 0xCB, 0xFC, 0xD5, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xEB, 0x00, 0x00, /* 0x8C-0x8F */ + 0xCD, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xAB, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xD4, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xA9, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xDD, 0xDB, 0xCD, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xCE, 0x00, 0x00, /* 0xC4-0xC7 */ + 
0xE7, 0xC3, 0x00, 0x00, 0xEC, 0xCC, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xEC, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFC, /* 0xD8-0xDB */ + 0xD4, 0xA8, 0x00, 0x00, 0xED, 0xD3, 0xD8, 0xEF, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF2, 0xD7, 0x00, 0x00, 0xCA, 0xF8, /* 0xE0-0xE3 */ + 0xDA, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD4, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCD, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xEE, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xF2, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xDF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xDA, 0xF0, 0x00, 0x00, 0xE2, 0xEA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0x00, 0x00, 0xE0, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD8, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xF7, 0xAF, 0xDA, 0xB6, 0x00, 0x00, 0xCA, 0xD7, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xD8, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD8, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xFA, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xEF, /* 0x34-0x37 */ + 0xD9, 0xC2, 0x00, 0x00, 0xF0, 0xD2, 0x00, 0x00, /* 0x38-0x3B */ + 0xE4, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF3, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xFA, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xEC, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE2, 0xB2, 0x00, 0x00, 0xD4, 0xBD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCE, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE2, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xD4, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xC2, 0xE7, 0xDA, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xD9, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD9, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 
0x00, 0x00, 0x00, 0x00, 0xD8, 0xBE, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDC, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE2, 0xEB, 0xD6, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xCA, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xDA, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD7, /* 0xB8-0xBB */ + 0xCC, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xBA, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB8, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xC3, /* 0xD8-0xDB */ + 0xD0, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC5, 0xEB, 0xF8, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF2, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCF, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xD3, 0xAD, 0xE8, 0xE1, 0xCE, 0xEC, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB4, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_78[512] = { + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE3, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF7, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xF2, 0xB2, 0xF3, 0xF6, 0xF6, 0xDB, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xD7, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xDF, 0x00, 0x00, /* 0x30-0x33 */ + 0xF7, 0xF2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xD0, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xDA, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xF5, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBC, /* 0x68-0x6B */ + 0xCC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xDB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xDD, 
0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD1, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xED, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xD6, 0xDE, 0xE4, 0xF4, 0xE1, 0xEF, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xDD, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xCF, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE5, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA1, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xAC, 0xFC, 0xAD, /* 0xB8-0xBB */ + 0xD8, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xED, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xDB, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF0, 0xF3, 0xAF, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA5, 0x00, 0x00, /* 0xCC-0xCF */ + 0xDA, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xD8, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCC, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB4, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xCA, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xF2, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_79[512] = { + 0x00, 0x00, 0xF5, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xA8, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xA6, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xEC, 0xD5, 0xF8, /* 0x28-0x2B */ + 0xDA, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xC6, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDE, 0xE5, 0xD1, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB6, /* 0x44-0x47 */ + 0xD1, 0xB7, 0xF2, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE9, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 
0x00, 0x00, 0x00, 0xF0, 0xD3, 0xF2, 0xB4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD4, 0xCB, 0xE4, /* 0x58-0x5B */ + 0xFB, 0xD4, 0xF5, 0xE6, 0xE3, 0xEA, 0x00, 0x00, /* 0x5C-0x5F */ + 0xDE, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xDF, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF8, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF0, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB8, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xDF, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xFC, 0xA1, 0xEF, 0xEE, 0xDC, 0xD8, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE9, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDD, 0xFD, 0xFB, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xC9, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC9, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xD4, 0xAA, 0x00, 0x00, 0xE5, 0xCC, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE9, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD0, 0xD8, 0xFC, 0xA2, 0xD4, 0xBE, /* 0xBC-0xBF */ + 0xE2, 0xB3, 0xDE, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xDC, 0xBC, 0xD2, 0xB6, 0xF5, 0xD5, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xCE, 0xA1, 0xF5, 0xA9, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDD, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDD, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD5, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF6, 0xDF, 0x00, 0x00, 0xF2, 0xDA, 0xE4, 0xEB, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xF2, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xB9, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7A[512] = { + 0xFD, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE1, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xCA, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xEF, /* 0x08-0x0B */ + 0x00, 0x00, 0xF5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xEC, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xAD, /* 0x14-0x17 */ + 0x00, 0x00, 0xF2, 0xC2, 0xF6, 0xC3, 0x00, 0x00, /* 0x18-0x1B */ + 0xD7, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xA2, /* 0x1C-0x1F */ + 0xF0, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 
0x00, 0x00, 0x00, 0xF0, 0xFA, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF6, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, 0xF2, 0xC3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAB, /* 0x38-0x3B */ + 0xCA, 0xB3, 0xCD, 0xA6, 0x00, 0x00, 0xCD, 0xC3, /* 0x3C-0x3F */ + 0xCD, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xCF, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF6, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEE, 0xDD, 0xE7, 0xC4, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDF, 0xE2, 0xE7, 0xDB, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE8, 0xB1, 0x00, 0x00, 0xFC, 0xAE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE5, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xFA, 0xEB, 0x00, 0x00, 0xCF, 0xBC, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xCF, 0xE2, 0xCD, 0xF6, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xEF, 0xF0, 0x00, 0x00, 0xF4, 0xBE, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD4, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xF3, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE9, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF2, 0xF3, 0xEB, /* 0x90-0x93 */ + 0x00, 0x00, 0xF0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCF, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDF, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE8, 0xC0, 0xE8, 0xC1, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xE3, 0xE9, 0xA2, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAA, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF3, 0xC1, 0xD0, 0xAB, 0x00, 0x00, 0xD4, 0xE4, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xBC, 0xD8, 0xA1, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xDF, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF3, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xDC, 0xBD, 0x00, 0x00, 0xCC, 0xE5, /* 0xDC-0xDF */ + 0xED, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE2, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD4, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xB5, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCA, 0xE6, 0x00, 0x00, 0xD3, 0xAE, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE6, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xF1, 0xD3, 0xF5, 0xE7, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xDA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xEE, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE1, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDF, 0xE9, 0x00, 0x00, 0xEE, 0xDE, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xC2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD8, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xAC, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xF0, 0xAF, 0xD6, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xE1, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB6, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xD4, 0xF5, 0x00, 0x00, 0xD0, 0xC9, /* 0x48-0x4B */ + 0xEF, 0xA7, 0xE2, 0xEC, 0x00, 0x00, 0xDB, 0xEA, /* 0x4C-0x4F */ + 0xCE, 0xCC, 0xF5, 0xE8, 0xF7, 0xD5, 0x00, 0x00, /* 0x50-0x53 */ + 0xD3, 0xCD, 0x00, 0x00, 0xF3, 0xFE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD0, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0xFE, 0x00, 0x00, 0xDF, 0xFB, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xE6, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE8, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCD, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xA8, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB4, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xDA, 0xD8, 0xD1, 0xB9, 0x00, 0x00, 0xDF, 0xA9, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xB0, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xCC, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCE, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEF, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xDF, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEE, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xEF, 0xBD, 0xFC, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDB, 0xF4, 0x00, 0x00, 0xEF, 0xAA, 0xF8, 0xB9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF5, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD9, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE1, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD4, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xDE, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_7C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEA, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xC2, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xAF, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCA, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xD7, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD8, 0xE1, 0xC7, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF4, 0xD8, 0xD6, 0xB3, 0xDD, 0xAD, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xBE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF1, 0xC3, 0xEE, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xD6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xF4, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xD7, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB7, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xFB, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xDD, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xA3, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xDA, 0xD9, 0x00, 0x00, 0xF0, 0xD8, /* 0x94-0x97 */ + 0xEF, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD8, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xF1, 0xD4, 0x00, 0x00, 0xED, 0xF2, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDB, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xD5, 0xDC, 0xF3, 0xC4, 0xCB, 0xD7, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF1, 0x00, 0x00, /* 0xBC-0xBF */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD5, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD8, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xD0, 0xF0, 0xD9, /* 0xDC-0xDF */ + 0xCB, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDD, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA7, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0xD1, 0xBA, 0x00, 0x00, 0xF1, 0xC4, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xB3, 0xFB, 0xF5, 0xE9, 0xE1, 0xFD, 0xE0, /* 0x04-0x07 */ + 0xFC, 0xBC, 0x00, 0x00, 0xDA, 0xA2, 0xDA, 0xA3, /* 0x08-0x0B */ + 0x00, 0x00, 0xD2, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD2, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xE9, /* 0x14-0x17 */ + 0xCE, 0xDC, 0xF2, 0xB5, 0xD0, 0xE4, 0xDD, 0xD1, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE1, 0xC8, 0xDB, 0xB7, 0xDF, 0xE3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xB9, /* 0x28-0x2B */ + 0xF1, 0xC5, 0x00, 0x00, 0xF3, 0xCF, 0xD7, 0xAB, /* 0x2C-0x2F */ + 0xE1, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xEB, /* 0x30-0x33 */ + 0x00, 0x00, 0xEE, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xE1, 0xC9, 0xCA, 0xFA, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFB, 0xFA, 0xE1, /* 0x40-0x43 */ + 0xF0, 0xDA, 0xCC, 0xE7, 0xDA, 0xF4, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCC, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xED, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD5, 0xA9, 0xFA, 0xE2, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE5, 0x00, 0x00, /* 0x64-0x67 */ + 0xEB, 0xD6, 0x00, 0x00, 0xEC, 0xDF, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFC, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xF7, 0xD6, 0xDE, 0xEA, 0xCB, 0xB4, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xBE, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xCC, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xBD, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF2, 0xE2, 0xB7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xE8, /* 0x90-0x93 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF0, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xD6, 0xE0, 0x00, 0x00, 0xF1, 0xC6, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE2, 0xB8, 0xEB, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xCB, 0xB5, 0xD8, 0xD1, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF4, 0xCE, 0xF3, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD7, 0xC6, 0x00, 0x00, 0xD1, 0xBB, 0xF7, 0xAA, /* 0xB8-0xBB */ + 0x00, 0x00, 0xED, 0xCA, 0xD7, 0xD3, 0xD8, 0xFA, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xCC, 0xDD, 0xFC, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFD, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xF9, 0xE5, 0x00, 0x00, 0xE0, 0xCA, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF2, 0xFD, 0xD3, 0xB0, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF4, 0xF3, 0xDA, 0xC9, 0x00, 0x00, 0xE6, 0xDE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF8, 0xBA, 0xE8, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD8, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD5, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD6, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC6, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF2, 0xDB, 0xE4, 0xFC, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE8, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xDA, /* 0x18-0x1B */ + 0x00, 0x00, 0xF2, 0xDC, 0xFB, 0xD6, 0xE9, 0xB2, /* 0x1C-0x1F */ + 0x00, 0x00, 0xEE, 0xAD, 0x00, 0x00, 0xFA, 0xE3, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xEE, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xEA, 0xE6, 0xE0, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF0, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAC, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF5, 0xC5, 0xEE, 0xE0, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xDB, 0xE5, 0x00, 0x00, 0xDD, 0xDE, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF0, 0xE9, 0xA3, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF9, 0x00, 0x00, /* 0x50-0x53 */ + 0xF2, 0xC4, 0xE0, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA4, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE2, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0xB1, 0xFC, 0xEB, 0xCD, 0xA8, /* 0x68-0x6B */ + 0x00, 
0x00, 0xCC, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF0, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xCD, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xC3, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xE1, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xC5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, 0x00, 0x00, /* 0x94-0x97 */ + 0xF3, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xC0, /* 0x98-0x9B */ + 0xD5, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_7F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xAE, 0x00, 0x00, /* 0x34-0x37 */ + 0xF9, 0xFC, 0x00, 0x00, 0xCC, 0xC0, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xE5, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xCE, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD8, 0xD2, 0xF9, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xAA, 0xCE, 0xD1, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xC7, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDB, 0xEB, 0x00, 0x00, 0xDF, 0xFE, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD8, 0xE1, 0x00, 0x00, 0xF7, 0xF3, /* 0x74-0x77 */ + 0x00, 0x00, 0xD7, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD1, 0xBC, 0x00, 0x00, 0xE5, 0xCF, 0x00, 0x00, /* 0x88-0x8B */ + 0xCB, 0xB6, 0x00, 0x00, 0xDA, 0xB8, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCD, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xBE, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBA, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xCF, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE0, 0xCC, 0xEB, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xFD, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD7, 0xE8, 0xCB, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE9, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE8, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE3, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xCD, 0x00, 0x00, /* 0xC8-0xCB */ + 0xEC, 0xCE, 0x00, 0x00, 0xD6, 0xBF, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA7, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDF, 0xD6, 0xFD, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE1, /* 0xDC-0xDF */ + 0xF6, 0xA8, 0xDD, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xF8, 0xBB, 0x00, 0x00, 0xE8, 0xD1, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF9, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCE, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xEC, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0xE9, 0xA5, 0xD6, 0xD5, 0x00, 0x00, 0xCD, 0xC5, /* 0x00-0x03 */ + 0x00, 0x00, 0xED, 0xBA, 0xD1, 0xBD, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCF, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xEC, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD2, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xCC, 0xE9, 0x00, 0x00, 0xD9, 0xC4, /* 0x14-0x17 */ + 0xE9, 0xFC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD1, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xBC, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xAD, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF7, 0xB0, 0x00, 0x00, 0xCC, 0xEA, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC4, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC0, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xFD, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA1, 0x00, 0x00, /* 0x54-0x57 */ + 0xDE, 0xBD, 0x00, 0x00, 0xF6, 0xA9, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA4, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xA4, /* 0x6C-0x6F */ + 0xF5, 0xC6, 0x00, 0x00, 0xE1, 0xA2, 0xE9, 0xC6, /* 0x70-0x73 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0xF2, 0xC5, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF4, 0xE9, 0xD6, 0xEC, 0xEB, 0xD3, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xEC, 0xBD, 0xE2, 0xDC, 0xDE, 0xEB, 0xF0, 0xDC, /* 0x84-0x87 */ + 0x00, 0x00, 0xEB, 0xBF, 0x00, 0x00, 0xD7, 0xCE, /* 0x88-0x8B */ + 0xD1, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xAB, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFD, /* 0x98-0x9B */ + 0x00, 0x00, 0xCA, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCD, 0xC6, 0xF2, 0xB6, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDD, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xCC, 0xB7, 0xDB, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE9, /* 0xAC-0xAF */ + 0x00, 0x00, 0xCE, 0xDD, 0xEB, 0xC0, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xFD, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xCB, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD6, /* 0xC0-0xC3 */ + 0xF1, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xDB, 0xCE, 0x00, 0x00, 0xF7, 0xC3, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xCF, 0xCB, 0xA4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE0, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xFB, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xEB, 0xCA, 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCE, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD4, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xFD, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xD2, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xB7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xFA, 0xF6, 0xF6, 0xAA, 0xFA, 0xF7, /* 0x04-0x07 */ + 0xD8, 0xE6, 0x00, 0x00, 0xF4, 0xB1, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE8, 0xD2, 0x00, 0x00, 0xCA, 0xC5, 0xCC, 0xEB, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xEE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xE2, 0xBB, 0x00, 0x00, 0xF7, 0xAD, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE1, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xF3, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA1, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0xE4, 0xFD, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xEC, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDD, 0xAF, 0xDD, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCB, 0xB7, 0xE8, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xA3, 0xD2, 0xE0, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xFE, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE9, 0xA6, 0xCB, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xED, 0xF3, 0xDC, 0xD9, 0xE0, 0xCD, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xDA, /* 0x7C-0x7F */ + + 0xDB, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xCC, 0xAE, 0x00, 0x00, 0xDA, 0xDB, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xC7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xB1, 0x00, 0x00, /* 0x98-0x9B */ + 0xD8, 0xAF, 0xE3, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF3, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF8, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xCE, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF5, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xEC, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD3, 0xC5, 0xFC, 0xEC, 0xD2, 0xDB, /* 0xBC-0xBF */ + 0xD4, 0xEB, 0x00, 0x00, 0xDE, 0xA2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xF0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xD5, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xF4, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xED, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE8, 0xC2, 0x00, 0x00, 0xED, 0xF5, /* 0xE4-0xE7 */ + 0xD7, 0xFC, 0x00, 0x00, 0xED, 0xBB, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF6, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xB8, /* 0xF0-0xF3 */ + 0xF6, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE6, 0xF2, 0xDD, /* 0xF8-0xFB */ + 0xCF, 0xBF, 0x00, 0x00, 0xEB, 0xAC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xCF, 0xC0, 0x00, 0x00, 0xE6, 0xA8, /* 0x04-0x07 */ + 0xFD, 0xE9, 0x00, 0x00, 0xCF, 0xC1, 0x00, 0x00, /* 0x08-0x0B */ + 0xE0, 0xDF, 0xDE, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA2, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xBF, /* 0x18-0x1B */ + 0xE2, 0xEF, 
0x00, 0x00, 0xD9, 0xF1, 0xF1, 0xC7, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCB, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFE, 0xDB, 0xBA, /* 0x28-0x2B */ + 0xDA, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF6, 0xEC, 0xDA, 0xDC, 0xFA, 0xE4, /* 0x34-0x37 */ + 0x00, 0x00, 0xE0, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDD, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xE6, 0xA9, 0x00, 0x00, 0xEF, 0xF3, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xF3, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEB, 0xFA, 0x00, 0x00, 0xF9, 0xE6, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xDD, 0xD5, 0xDE, /* 0x6C-0x6F */ + 0x00, 0x00, 0xCA, 0xDE, 0xDF, 0xE4, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFD, 0x00, 0x00, /* 0x74-0x77 */ + 0xF5, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE3, /* 0x88-0x8B */ + 0x00, 0x00, 0xED, 0xCB, 0xCF, 0xE4, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD3, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xDD, 0xB3, 0xD4, 0xEC, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF2, 0xB9, 0x00, 0x00, 0xDF, 0xB7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCB, 0xCE, 0xFB, 0xD8, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD0, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xDD, 0xD2, 0xF7, 0xF4, 0xE7, 0xDC, 0xE4, 0xA5, /* 0xAC-0xAF */ + 0x00, 0x00, 0xFC, 0xA3, 0x00, 0x00, 0xDB, 0xBB, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBA, /* 0xB4-0xB7 */ + 0xE9, 0xFD, 0xD0, 0xCA, 0x00, 0x00, 0xF5, 0xD6, /* 0xB8-0xBB */ + 0xD9, 0xC5, 0xE4, 0xB4, 0x00, 0x00, 0xED, 0xA7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xEA, 0xBD, 0xE6, 0xFE, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF7, 0xC4, 0xF5, 0xAD, 0x00, 0x00, 0xD9, 0xE0, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB4, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE2, 0xCF, 0xC2, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEC, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE5, 0xB4, 0xCD, 0xC8, 0xEE, 0xC8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE7, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xCD, 0xC9, 0xF9, 0xB7, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_83[512] = { + 0x00, 0x00, 0xF1, 0xE8, 0xD9, 0xF2, 0xDB, 0xF5, /* 0x00-0x03 */ + 0xCA, 0xB5, 0xD9, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xD8, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAB, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xED, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD4, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDA, /* 0x2C-0x2F */ + 0x00, 0x00, 0xE2, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xFC, 0xED, 0xEC, 0xE0, 0xD2, 0xFE, 0x00, 0x00, /* 0x34-0x37 */ + 0xE9, 0xC7, 0xE6, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xE2, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xBB, /* 0x44-0x47 */ + 0x00, 0x00, 0xF5, 0xAE, 0xFB, 0xAA, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xFB, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEC, 0xBF, 0xFC, 0xD8, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE5, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC3, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE2, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xD7, 0xE9, 0xED, 0xF6, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xED, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xEC, 0x00, 0x00, /* 0x94-0x97 */ + 0xE3, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD4, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF8, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xDD, 0xB4, 0xE4, 0xB5, 0xD8, 0xB0, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD8, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xF4, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xCE, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 
0x00, 0x00, 0xD6, 0xE1, 0xCF, 0xD2, 0x00, 0x00, /* 0xC8-0xCB */ + 0xD0, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xEE, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xF3, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xDC, 0xCC, 0x00, 0x00, 0xD0, 0xCB, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xA4, /* 0xEC-0xEF */ + 0xCD, 0xCA, 0xD7, 0xD4, 0xDE, 0xA3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEE, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE2, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xFE, /* 0x00-0x03 */ + 0xD4, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xD1, 0x00, 0x00, /* 0x08-0x0B */ + 0xD8, 0xF0, 0xF8, 0xC3, 0xEA, 0xD7, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xF5, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xD8, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xFD, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xEB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xD5, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xE7, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCA, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE7, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF8, 0xE3, 0x00, 0x00, 0xD4, 0xDD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD8, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xD9, /* 0x68-0x6B */ + 0xED, 0xF7, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD0, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF1, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE3, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xD9, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 
0xDF, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xDB, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF1, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB6, /* 0xB8-0xBB */ + 0xF3, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDA, /* 0xBC-0xBF */ + 0xE1, 0xE0, 0x00, 0x00, 0xD9, 0xAC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF5, 0xEB, 0x00, 0x00, 0xE0, 0xB6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE9, 0xC8, 0x00, 0x00, 0xCB, 0xCF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE3, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xDE, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xBE, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xDC, 0xEF, 0x00, 0x00, 0xD6, 0xA5, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xE2, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD6, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD9, 0xA1, 0x00, 0x00, 0xD8, 0xC0, /* 0x10-0x13 */ + 0xDC, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xED, 0xBD, /* 0x14-0x17 */ + 0xDF, 0xB8, 0x00, 0x00, 0xEA, 0xA5, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xAD, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF3, 0xF9, 0x00, 0x00, 0xED, 0xF8, /* 0x20-0x23 */ + 0x00, 0x00, 0xF5, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xE1, 0xCA, 0xEB, 0xE3, 0x00, 0x00, 0xF2, 0xDE, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF8, 0xCC, 0x00, 0x00, 0xEA, 0xD9, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD3, 0xC6, 0x00, 0x00, 0xDB, 0xE6, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF5, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF0, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xFE, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xFB, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF2, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xCF, 0xF2, 0xF7, 0xB9, 0xD9, 0xF3, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xE1, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xDA, 0xDD, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB9, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xFB, /* 0x8C-0x8F */ + 0x00, 0x00, 0xCB, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xED, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE0, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFD, 0xBC, 0xDF, 0xB1, 0xE3, 0xEF, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA3, /* 0xAC-0xAF */ + 0xFD, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xB1, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xCD, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xED, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD5, 0xC0, 0xE3, 0xF0, 0xED, 0xFA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE9, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD5, 0xED, 0xE7, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD4, 0xF6, 0xE5, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xDB, 0xE7, 0xE2, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCB, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF4, 0xF0, 0xDD, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xAB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xDE, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD6, 0xE1, 0xCC, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xB3, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xEE, 0xDC, 0xA2, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD5, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA1, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDB, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCB, 0xF3, 0xF4, 0xA5, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC8, /* 0x58-0x5B */ + 0xD6, 0xD7, 0x00, 0x00, 0xE9, 0xE5, 0xFB, 0xDC, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD0, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xFB, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA5, 0x00, 0x00, /* 0x88-0x8B */ + 0xDB, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE2, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xF7, /* 0xA0-0xA3 */ + 0xF0, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xF6, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xEF, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB1, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xFC, 0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE8, 0xC3, 0x00, 0x00, 0xF1, 0xC8, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF1, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF9, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF2, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB6, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xF5, 0xB9, 0x00, 0x00, 0xDC, 0xF0, 0xE3, 0xF1, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE8, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xF2, 
0xBB, 0x00, 0x00, 0xDE, 0xA4, 0x00, 0x00, /* 0x18-0x1B */ + 0xDA, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE9, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE3, 0xDA, 0x00, 0x00, 0xFC, 0xD9, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDA, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC4, 0x00, 0x00, /* 0x64-0x67 */ + 0xE3, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xFB, 0xDD, 0x00, 0x00, 0xEF, 0xCA, 0x00, 0x00, /* 0x74-0x77 */ + 0xE8, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCC, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xEB, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAD, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAB, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xD9, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA2, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF6, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDA, 0xF6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE0, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xA8, /* 0xEC-0xEF 
*/ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xF9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xFA, 0xAF, 0x00, 0x00, 0xEB, 0xFC, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE3, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC5, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE3, 0xD5, 0xEE, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xCD, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xD9, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC1, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xFA, 0xEC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xEB, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xFA, 0xBC, 0xE6, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xE5, 0xE2, 0xFA, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB6, /* 0x54-0x57 */ + 0x00, 0x00, 0xE4, 0xB7, 0x00, 0x00, 0xEA, 0xDB, /* 0x58-0x5B */ + 0x00, 0x00, 0xF5, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xFB, 0xAC, 0xCF, 0xC3, 0xEB, 0xFD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB9, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xE1, 0xF1, 0x00, 0x00, 0xD2, 0xA4, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xFB, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xDA, 0xD0, 0xDB, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xEA, 0xBE, 0xD9, 0xB1, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xCA, 0xB7, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE7, /* 0x88-0x8B */ + 0x00, 0x00, 0xF8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB2, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC0, 0xF2, 0xDF, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE5, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xAC, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xCD, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xEE, 0xAE, 0xD6, 0xAE, 0x00, 0x00, /* 0xC0-0xC3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xEA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE7, 0xE0, 0xEB, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xCF, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDC, 0xCD, 0xED, 0xFB, 0x00, 0x00, 0xDE, 0xF0, /* 0xDC-0xDF */ + 0x00, 0x00, 0xD7, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xDE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xD7, /* 0xF0-0xF3 */ + 0xDB, 0xD0, 0xDB, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xD5, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF0, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDC, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xCA, 0xE8, 0x00, 0x00, 0xF8, 0xE6, 0xDC, 0xCE, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xEA, 0xDC, 0xDB, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE9, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xDB, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA8, 0x00, 0x00, /* 0x34-0x37 */ + 0xD7, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE1, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCB, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xE5, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xDC, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xD5, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xCA, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA9, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA4, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE9, 0xA9, 0x00, 0x00, 0xD3, 0xC7, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDD, 0xF8, 0xAE, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xB8, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAE, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xF2, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCA, 0x00, 0x00, /* 0x94-0x97 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCC, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD4, 0xAD, 0xF6, 0xD1, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xCC, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC6, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD5, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCE, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC7, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xB0, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xDF, 0xD8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xBA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_8A[512] = { + 0xE5, 0xEB, 0x00, 0x00, 0xEF, 0xF4, 0xDD, 0xB5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xCD, 0xAA, 0x00, 0x00, 0xE3, 0xF2, 0x00, 0x00, /* 0x08-0x0B */ + 0xFB, 0xF7, 0x00, 0x00, 0xF7, 0xD0, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFD, 0xBA, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xE1, 0xF6, 0xFE, /* 0x14-0x17 */ + 0xD1, 0xC0, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xC5, /* 0x18-0x1B */ + 0x00, 0x00, 0xE4, 0xB8, 0x00, 0x00, 0xE1, 0xE8, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xC1, /* 0x20-0x23 */ + 0x00, 0x00, 0xD2, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xBE, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xFA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE1, 0xCD, 0x00, 0x00, 0xCA, 0xB8, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE0, 0xF1, 0xC9, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDE, 0xF1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xF0, 0xDF, 0xF8, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCC, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xF2, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE7, 0xC9, 0x00, 0x00, 0xE2, 0xF3, 0xE7, 0xE1, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xCB, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xE3, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xCF, 0xF8, 0xEF, 0xAC, 0x00, 0x00, /* 0x6C-0x6F */ + 0xFD, 
0xFE, 0xFC, 0xA5, 0xFA, 0xB1, 0xDF, 0xD9, /* 0x70-0x73 */ + 0x00, 0x00, 0xE0, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF4, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xF1, 0xCA, 0x00, 0x00, 0xCE, 0xA3, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xF2, 0xBC, 0xEC, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xA5, /* 0x90-0x93 */ + 0x00, 0x00, 0xF7, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xEB, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xDE, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE1, 0xA4, 0xCD, 0xAB, 0x00, 0x00, 0xD9, 0xF4, /* 0xA0-0xA3 */ + 0xE8, 0xA6, 0xCD, 0xCE, 0xE1, 0xE9, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFC, 0xEF, 0x00, 0x00, 0xE0, 0xE3, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE2, 0xC1, 0x00, 0x00, 0xCE, 0xA4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xDE, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xEB, 0xFE, 0x00, 0x00, 0xEB, 0xDD, 0xF0, 0xE0, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xDB, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE2, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xC8, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xEB, /* 0xC8-0xCB */ + 0x00, 0x00, 0xEE, 0xB5, 0x00, 0x00, 0xF5, 0xD8, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xDF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xE5, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB0, /* 0xD8-0xDB */ + 0xF4, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE3, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF4, 0xFA, 0xB2, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF5, 0xCA, 0xDF, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEB, 0xB1, 0xED, 0xBF, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xFD, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA6, 0xF9, 0xA4, /* 0xF4-0xF7 */ + 0xF0, 0xB3, 0x00, 0x00, 0xE5, 0xEC, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xE7, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0xD9, 0xC7, 0xE4, 0xD7, 0xEA, 0xDD, 0x00, 0x00, /* 0x00-0x03 */ + 0xD4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBA, 0x00, 0x00, /* 0x0C-0x0F */ + 0xDA, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF9, 0xCC, 0x00, 0x00, 0xE1, 0xDA, 0xDB, 0xBF, /* 0x14-0x17 */ + 0x00, 0x00, 0xCC, 0xC5, 0xEC, 0xD0, 0xCB, 0xBB, /* 0x18-0x1B */ + 0x00, 0x00, 0xDE, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE9, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xD9, 0xC8, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE3, /* 0x28-0x2B */ + 0xD7, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xC4, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xD0, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xFC, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF1, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xD2, 0xD1, 0xC1, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xE3, 0xDB, 0x00, 0x00, 0xD3, 0xC9, 0x00, 0x00, /* 0x58-0x5B */ + 0xDC, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xED, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xDE, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xBB, /* 0x6C-0x6F */ + 0xEC, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xCC, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDE, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE7, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD4, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA8, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0xC2, 0x00, 0x00, 0xF3, 0xD8, 0xE5, 0xD3, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xD9, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xC6, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_8C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xDB, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xAC, /* 0x3C-0x3F */ + 0x00, 0x00, 0xFC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xE7, 0x00, 0x00, /* 0x44-0x47 */ + 0xD1, 0xC2, 0x00, 0x00, 0xF9, 0xA5, 0x00, 0x00, /* 0x48-0x4B */ + 0xE8, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xE3, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0xCA, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDF, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xDF, 0xE7, 0xE3, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF8, 0xFB, 0xE3, 0xCF, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB0, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xE7, 0x00, 0x00, /* 0x88-0x8B */ + 0xD9, 0xC9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF8, 0xAF, 0xEF, 0xF6, 0x00, 0x00, /* 0x9C-0x9F */ + 0xDD, 0xB6, 0xEE, 0xAF, 0xCD, 0xF8, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB8, /* 0xA4-0xA7 */ + 0xFC, 0xA7, 0xF7, 0xFC, 0xF7, 0xB1, 0xCE, 0xBB, /* 0xA8-0xAB */ + 0xF4, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCD, /* 0xAC-0xAF */ + 0xE1, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC3, /* 0xB0-0xB3 */ + 0xCF, 0xFE, 0x00, 0x00, 0xF8, 0xBF, 0xD8, 0xE2, /* 0xB4-0xB7 */ + 0xD3, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA8, /* 0xB8-0xBB */ + 0xF4, 0xE4, 0xEC, 0xC2, 0x00, 0x00, 0xD9, 0xF5, /* 0xBC-0xBF */ + 0xF9, 0xC5, 0xDD, 0xD3, 0xD6, 0xF1, 0xEC, 0xFC, /* 0xC0-0xC3 */ + 0xFC, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xED, 0xC0, /* 0xC4-0xC7 */ + 0xCA, 0xB9, 0x00, 0x00, 0xEE, 0xE4, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF2, 0xE1, 0x00, 0x00, 0xDE, 0xB9, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xF2, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDE, 0xF4, 0x00, 0x00, 0xDF, 0xDB, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDB, 0xD3, 0x00, 0x00, 0xFA, 0xE7, 0xD8, 0xE3, /* 0xE0-0xE3 */ + 0xF4, 0xC1, 0x00, 0x00, 0xDD, 0xB7, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF5, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xD4, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD6, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xB8, /* 0xF8-0xFB */ + 0xCF, 0xC5, 0xDF, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF2, 0xBE, 0xF6, 0xA1, 0x00, 0x00, 0xEB, 0xCB, /* 0x04-0x07 */ + 0xF1, 0xFC, 0x00, 0x00, 0xF3, 0xC7, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE0, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xFC, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xDB, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xEE, 0xE5, 0x00, 0x00, 0xDE, 0xF5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xD3, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF1, 0xCB, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAF, /* 0x70-0x73 */ + 0xDD, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xD1, 0xC3, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xF5, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC6, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xF0, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAC, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF5, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEB, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0xBA, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBF, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xC5, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA2, /* 0xC8-0xCB */ + 0xF2, 0xF6, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xF5, /* 0xD8-0xDB */ + 0x00, 0x00, 0xCB, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xEE, 0xE6, 0x00, 0x00, 0xE0, 0xD3, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCE, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xD8, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAF, /* 0xF0-0xF3 */ +}; + +static const unsigned char u2c_8E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xCE, /* 0x0C-0x0F */ + 0xF4, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE6, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xA1, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEB, 
0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF1, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB3, 0x00, 0x00, /* 0x40-0x43 */ + 0xF0, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF4, /* 0x44-0x47 */ + 0xD4, 0xB0, 0xF3, 0xB2, 0xFB, 0xB7, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xF5, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE7, /* 0x5C-0x5F */ + 0xF4, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xF5, 0xED, 0x00, 0x00, 0xCF, 0xF3, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xF0, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xCE, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCC, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE5, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF5, 0xE3, 0xF3, /* 0xA8-0xAB */ + 0xCF, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCF, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xB3, 0xE4, 0xD8, /* 0xC8-0xCB */ + 0xCF, 0xF9, 0xCF, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xFA, 0xCD, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE3, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE2, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBB, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xDC, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF2, /* 0x00-0x03 */ + 0x00, 
0x00, 0xD6, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xEE, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xE5, 0xD8, 0xC2, /* 0x10-0x13 */ + 0xDC, 0xD0, 0xCC, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xE0, /* 0x18-0x1B */ + 0xF6, 0xCA, 0xFD, 0xCA, 0xD8, 0xD6, 0xF4, 0xCF, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xA6, 0xDC, 0xBE, /* 0x24-0x27 */ + 0x00, 0x00, 0xDB, 0xD4, 0xD7, 0xC7, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xFE, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCD, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xE2, 0xC3, 0xDC, 0xDE, 0x00, 0x00, 0xDC, 0xDF, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAD, 0xE6, 0xAB, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF9, 0xDD, 0xEA, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xEF, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xF4, 0xD0, 0xCE, 0xF3, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0xAC, 0x00, 0x00, 0xCE, 0xDE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF9, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF4, /* 0x98-0x9B */ + 0xCD, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB8, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xFD, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xDC, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xDE, 0xF6, 0x00, 0x00, 0xDC, 0xAA, /* 0xAC-0xAF */ + 0xF2, 0xE3, 0xE9, 0xB4, 0xD2, 0xDC, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE6, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE3, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xCA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD0, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDA, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBC, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xE8, 0xDA, 0xDE, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF2, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE2, 0xFB, 0x00, 0x00, 0xCC, 0xA6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBB, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEE, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0xF7, 0xDC, 0xE1, 0xEA, 0xCE, 0xC1, 0xD4, 0xB1, /* 0x00-0x03 */ + 0x00, 0x00, 0xFD, 0xB1, 0xE6, 0xBD, 0x00, 0x00, /* 0x04-0x07 */ + 0xFB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE7, /* 0x08-0x0B */ + 0x00, 0x00, 0xE1, 0xCE, 0x00, 0x00, 0xF7, 0xE2, /* 0x0C-0x0F */ + 0xF5, 0xEF, 0xCF, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xD4, 0xB2, 0xCC, 0xEF, 0x00, 0x00, 0xD4, 0xE8, /* 0x14-0x17 */ + 0x00, 0x00, 0xEE, 0xCF, 0xF7, 0xD7, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xE0, 0xA6, 0xD6, 0xC1, 0xE1, 0xDC, /* 0x1C-0x1F */ + 0xF0, 0xE3, 0xF1, 0xE4, 0xDC, 0xF1, 0xD6, 0xA7, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xF5, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF1, 0xCE, 0xF2, 0xE4, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xD0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xEC, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xF9, 0xBA, 0x00, 0x00, 0xEB, 0xB5, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xD4, 0xED, 0xE2, 0xC4, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE7, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB4, 0xEA, 0xA1, /* 0x48-0x4B */ + 0x00, 0x00, 0xF8, 0xBC, 0xCE, 0xA6, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xC6, 0xFC, 0xDA, 0x00, 0x00, 0xD4, 0xB3, /* 0x50-0x53 */ + 0xD3, 0xB9, 0xEA, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xE9, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xE1, 0xE1, 0xD3, 0xCF, 0xF4, 0xF6, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEA, 0xC0, 0xE1, 0xCF, 0x00, 0x00, 0xCC, 0xBA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xEE, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF0, 0xE4, 0xF3, 0xB4, 0xD4, 0xEE, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xC0, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF1, 0xE5, 0x00, 0x00, 0xF4, 0xC3, /* 0x74-0x77 */ + 0xE0, 0xD4, 0x00, 0x00, 0xEB, 0xB6, 0x00, 0x00, /* 0x78-0x7B */ + 0xD7, 0xA1, 0xCB, 0xE8, 0x00, 0x00, 0xF9, 0xAD, /* 0x7C-0x7F */ + + 0xE9, 0xAD, 0xD8, 0xE4, 0xFA, 0xB3, 0xE2, 0xC5, /* 0x80-0x83 */ + 0xFC, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC4, /* 0x84-0x87 */ + 0xD8, 0xB1, 0x00, 0x00, 0xDC, 0xAB, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xA4, /* 0x8C-0x8F */ + 0x00, 0x00, 0xEB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xE8, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xD8, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAE, 0xD1, 0xE1, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF5, 0xBE, 0x00, 0x00, 0xDE, 0xF7, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xFB, /* 0xAC-0xAF */ + 
0xF7, 0xC6, 0xCF, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xE1, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEE, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE9, 0xF4, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF4, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCD, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCF, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xDD, 0xBB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCE, 0xAC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE9, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xD4, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xC7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xDB, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xFA, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xDE, 0xA9, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF8, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEF, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB3, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xEB, 0xB7, 0xEF, 0xF8, 0xF5, 0xDC, /* 0x48-0x4B */ + 0xED, 0xCC, 0xDB, 0xD5, 0xF1, 0xCF, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD0, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xB2, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xD9, 0xAE, 0xD5, 0xAC, 0x00, 0x00, /* 0x68-0x6B */ + 0xE2, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xFD, 0xA3, 0x00, 0x00, 0xFB, 0xE5, /* 0x74-0x77 */ + 0xDF, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF5, /* 0x84-0x87 */ + 0x00, 0x00, 0xF6, 0xAD, 0x00, 0x00, 0xF5, 0xB3, /* 0x88-0x8B */ + 0x00, 0x00, 0xF0, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA5, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF5, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xA2, /* 0xA8-0xAB */ + 0xED, 0xFD, 0x00, 0x00, 0xF5, 0xB4, 0xFB, 0xB8, /* 0xAC-0xAF */ + 0x00, 0x00, 0xDB, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xD6, 0xCA, 0xCB, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE5, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFA, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xEB, 0xB8, 0x00, 0x00, 0xE0, 0xB7, /* 0xC8-0xCB */ + 0xD7, 0xEC, 0xF1, 0xEC, 0xE5, 0xAF, 0xD5, 0xE1, /* 0xCC-0xCF */ + 0xD7, 0xED, 0xD1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xF2, /* 0xD4-0xD7 */ + 0xEF, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDD, 0xBC, 0xF6, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE5, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC4, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE9, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF3, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_92[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xD4, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xCC, 0xA2, 0xF7, 0xFE, 0xDF, 0xBC, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCD, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB7, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xD6, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xAD, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xAF, /* 0x3C-0x3F */ + 0xCB, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xFA, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCC, 0xC6, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC7, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 
0x00, 0x00, 0x00, 0xDB, 0xA4, 0x00, 0x00, /* 0x60-0x63 */ + 0xCF, 0xC9, 0xE2, 0xFC, 0xEF, 0xFA, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEB, 0xDE, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xC8, /* 0x80-0x83 */ + 0x00, 0x00, 0xD4, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE0, 0xD5, 0x00, 0x00, 0xEF, 0xB0, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC7, 0x00, 0x00, /* 0x94-0x97 */ + 0xD9, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xF9, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xE5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xCA, 0xE1, 0xD1, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xEF, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xF9, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF2, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xE8, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xCB, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCB, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD6, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF5, 0xDE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xF5, 0xDF, 0x00, 0x00, 0xEE, 0xB6, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF6, 0xD3, 0xCA, /* 0x1C-0x1F */ + 0xEF, 0xFC, 0xD1, 0xC4, 0xEF, 0xB1, 0x00, 0x00, /* 0x20-0x23 */ + 0xD1, 0xC5, 0x00, 0x00, 0xD0, 0xDE, 0x00, 0x00, /* 0x24-0x27 */ + 0xD9, 0xE1, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB8, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD1, 0xF3, 0xB9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xE7, 0xCC, 0x00, 0x00, 0xD6, 0xA8, 0xCE, 0xA7, /* 0x48-0x4B */ + 0x00, 0x00, 0xD4, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE4, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xB4, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xB9, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xCB, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xF6, 0xDD, 0x00, 0x00, 0xF1, 0xA3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xCC, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE9, 0xCA, 0x00, 0x00, 0xE1, 0xF0, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE0, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xAF, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD1, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xFB, 0xE0, 0xF2, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEC, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xEC, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xEE, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xCB, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xCC, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD7, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA1, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_94[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xFC, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xF1, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xE0, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB2, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF4, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xF7, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF1, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xCA, 0xFC, 0xCA, 0xFD, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xCE, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xF3, 0xC8, 0x00, 0x00, 0xF3, 0xBA, /* 0x7C-0x7F */ +}; + +static const unsigned char u2c_95[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xED, 0xFE, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDA, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xEC, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xF8, 0xCD, 0x00, 0x00, 0xCB, 0xD2, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCE, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF9, 0xD8, 0xF9, 0xD9, 0xCA, 0xE0, /* 0x90-0x93 */ + 0xDA, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCB, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC8, /* 0xA0-0xA3 */ + 0xF9, 0xEE, 0xDB, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xD0, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xD5, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE6, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xA2, /* 0xB8-0xBB */ + 0xE4, 0xD9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE1, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xFC, 0xC4, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF9, 0xEF, 0xCF, 0xF4, 0xF7, 0xE6, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCE, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF4, 0xC5, 0xDC, 0xA3, 0x00, 0x00, /* 0xE0-0xE3 */ +}; + +static const unsigned char u2c_96[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xDD, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF4, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xA1, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD6, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC1, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xE6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xB9, /* 0x3C-0x3F */ + 0xF6, 0xED, 0x00, 0x00, 0xF9, 0xAE, 0x00, 0x00, /* 0x40-0x43 */ + 0xDD, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB0, /* 0x48-0x4B */ + 0xD8, 0xE8, 0xCB, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xCE, /* 0x58-0x5B */ + 0xF9, 0xF0, 0xE0, 0xED, 0xE3, 0xB3, 0xF4, 0xB3, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC2, 0xF2, 0xE6, /* 0x60-0x63 */ + 0xF0, 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xD6, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEB, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xE7, /* 0x70-0x73 */ + 0x00, 0x00, 0xD7, 0xD5, 0xD4, 0xB6, 0xF9, 0xE8, /* 0x74-0x77 */ + 0xD7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE5, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE9, 0xEA, 0xD7, 0xCC, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xE9, 0xE2, 0xC9, /* 0x88-0x8B */ + 0x00, 0x00, 0xFC, 0xDB, 0xCD, 0xAD, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCC, 0xB0, 0xEA, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xE4, 0xF6, 0xD0, 0xC0, 0x00, 0x00, 0xF0, 0xB7, /* 0x98-0x9B */ + 0xEE, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF6, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCA, /* 0xA4-0xA7 */ + 0xE2, 0xCB, 0x00, 0x00, 0xFA, 0xCF, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEB, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xCB, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xB4, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xED, 0xCD, 0xE4, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xEA, 0xA9, 0xE4, 0xBA, 0xF3, 0xA2, 0xCD, 0xD2, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF6, 0xCB, 0x00, 0x00, 0xF1, 0xE6, /* 0xC8-0xCB */ + 0xED, 0xC1, 0xE8, 0xBC, 0xEE, 0xD1, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF0, 0xE7, 0xE2, 0xCC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE4, 0xAA, 0x00, 0x00, 0xF5, 0xE1, /* 0xD8-0xDB */ + 0xED, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xEE, 0xD1, 0xF1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE9, 0xEB, 0xE9, 0xEC, 0xE0, 0xE4, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA7, /* 0xEC-0xEF */ + 0xDD, 0xD4, 0x00, 0x00, 0xEA, 0xA3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC3, 0xD6, 0xF4, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDA, 0xDF, 0x00, 0x00, 0xEF, 0xB3, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_97[512] = { + 0xE2, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xFD, 0xF2, 0xE8, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xEF, 0xC5, 0x00, 0x00, 0xE7, 0xE7, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xFD, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE7, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xDF, 0xDC, 0x00, 0x00, 0xF9, 0xC7, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF6, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xDF, 0xAC, 0x00, 0x00, 0xD6, 0xDA, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xDC, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF0, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xFA, 0x00, 0x00, /* 0x40-0x43 */ + 0xE4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xD6, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xF4, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xFE, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xF0, 0xA1, 0x00, 0x00, 0xDE, 0xAA, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xDA, 0xBC, 0xD8, 0xFC, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFA, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xEC, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xFC, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE6, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xCB, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xB9, /* 0x88-0x8B */ + 0x00, 0x00, 0xE4, 0xD3, 0x00, 0x00, 0xCD, 0xF9, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCF, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xCA, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xD4, /* 0xA8-0xAB */ + 0x00, 0x00, 0xF8, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xC7, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDF, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xDB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD4, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xE5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD2, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xA4, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0xFB, 0xE1, 
0xFA, 0xED, 0xF0, 0xA2, 0xCC, 0xF1, /* 0x00-0x03 */ + 0x00, 0x00, 0xFA, 0xA3, 0xE2, 0xF7, 0x00, 0x00, /* 0x04-0x07 */ + 0xE2, 0xCE, 0x00, 0x00, 0xE9, 0xF5, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE7, 0xE8, 0xE8, 0xD7, 0xDA, 0xF8, 0xD4, 0xCB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xF6, /* 0x14-0x17 */ + 0xD6, 0xC5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD4, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xFA, 0xFA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xCC, 0xF2, 0xF7, 0xDD, 0x00, 0x00, 0xDE, 0xBA, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA8, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xF0, 0xB9, 0xE4, 0xFE, 0xE4, 0xC9, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE4, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xEA, 0xC3, 0x00, 0x00, 0xEF, 0xB4, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBE, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xFB, 0xE2, 0x00, 0x00, 0xCD, 0xD3, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB5, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xE9, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF9, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xBD, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF7, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF8, 0xFD, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xFC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAB, /* 0xD8-0xDB */ + 0xDB, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xDD, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE1, 0xE2, 0xD1, 0xC6, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF6, 0xD0, 0xEB, 0xE6, 0xDA, 0xF9, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEC, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xDE, 0xF8, 0xF8, 0xE9, 0xE3, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF5, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xFA, 0xC3, 0xE5, 0xD7, 0x00, 0x00, /* 0x08-0x0B */ + 0xEC, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF3, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xBB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE6, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */ + 0xDC, 0xBF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCE, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xD8, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xD0, 0xCF, 0x00, 0x00, 0xCF, 0xFA, /* 0x48-0x4B */ + 0xF3, 0xCA, 0xE0, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD1, 0xC7, 0xE9, 0xAE, 0x00, 0x00, /* 0x50-0x53 */ + 0xE8, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xFA, 0xC4, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCF, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xFA, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF9, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xDC, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xFB, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 
0xD8, 0xA9, 0xE5, 0xDF, 0xF9, 0xA7, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF6, 0xEE, 0x00, 0x00, 0xF6, 0xCC, /* 0xB0-0xB3 */ + 0xE2, 0xF8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xEC, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDA, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xF1, 0xD2, 0xD2, 0xCC, 0xCF, 0xCB, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xCA, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xDD, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF6, 0xEF, 0x00, 0x00, 0xDE, 0xF9, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xFA, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD5, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xE7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0x00, 0x00, 0xDE, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xDC, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xC8, 0xD1, 0xC9, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF8, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF6, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD4, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE2, 0xE1, 0xD3, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD8, 0xE9, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xFE, /* 0x40-0x43 */ + 0x00, 0x00, 0xCF, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xFD, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xCE, 0xF6, 0x00, 0x00, 0xFA, 0xD0, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xF3, 0xE6, 0xBE, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAE, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF0, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xD1, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xFC, 0xBE, 0xD5, 0xF1, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xCD, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xFA, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD0, /* 0xD0-0xD3 */ + 0xF4, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCD, 0xD4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE7, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xA5, 0x00, 0x00, /* 0xEC-0xEF */ +}; + +static const unsigned char u2c_9B[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD1, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xA2, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE3, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xEA, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xD0, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCE, 0xDA, 0xFB, 0xEB, 0xDB, 0xA6, /* 0x40-0x43 */ + 0xDB, 0xDE, 0xD8, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE0, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD8, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE0, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xDB, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xC6, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF8, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD5, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF7, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD8, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD7, 0xEF, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xED, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xCD, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCC, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ +}; + +static const unsigned char u2c_9C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xF5, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE4, 0xCA, 0x00, 0x00, 0xDC, 0xE1, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xF9, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xFC, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA7, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC4, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xBE, /* 0x44-0x47 */ + 0x00, 0x00, 0xDC, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xF7, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF0, 0xE8, 0x00, 0x00, 0xDD, 0xC0, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xCF, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xF3, /* 0xF0-0xF3 */ + 0xD9, 0xB0, 0x00, 0x00, 0xE6, 0xE9, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9D[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xC4, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xEC, 0x00, 0x00, /* 0x24-0x27 */ + 0xE4, 
0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0xF8, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xCC, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xE4, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xCD, 0xDC, 0xD9, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xDD, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xCE, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD9, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA3, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF9, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xCD, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xCE, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xAF, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xFD, 0xD3, 0xEB, 0xED, 0xD6, 0xDC, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_9E[512] = { + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xA4, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD6, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF9, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE7, 0xA4, 0x00, 0x00, 0xD6, 0xE3, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xCB, 0xD6, 0xE4, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xF2, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xDE, 0xFA, 0x00, 0x00, 0xD7, 0xF8, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD8, 0xEA, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xCF, 0xD5, 0xD8, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xAB, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xFD, 0xCB, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFC, 0xDC, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xE0, 0xA8, 0xD5, 0xF3, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xFD, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xCC, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 
*/ + 0xD9, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xD3, 0xEA, /* 0xD8-0xDB */ + 0xF5, 0xF5, 0x00, 0x00, 0xEF, 0xC7, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xD3, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xDA, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ +}; + +static const unsigned char u2c_9F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xA8, /* 0x04-0x07 */ + 0xDC, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xA3, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD5, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE0, 0xA9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAC, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xBA, 0xEE, 0xB1, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xB2, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCD, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xD2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD6, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE5, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xBB, 0x00, 0x00, /* 0x68-0x6B */ + 0xE5, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCB, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xD7, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDB, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xCA, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xCF, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_AC[512] = { + 0xB0, 0xA1, 0xB0, 0xA2, 0x81, 0x41, 0x81, 0x42, /* 0x00-0x03 */ + 0xB0, 0xA3, 0x81, 0x43, 0x81, 0x44, 0xB0, 0xA4, /* 0x04-0x07 */ + 0xB0, 0xA5, 0xB0, 0xA6, 0xB0, 0xA7, 0x81, 0x45, /* 0x08-0x0B */ + 0x81, 0x46, 0x81, 0x47, 0x81, 0x48, 0x81, 0x49, /* 0x0C-0x0F */ + 0xB0, 0xA8, 
0xB0, 0xA9, 0xB0, 0xAA, 0xB0, 0xAB, /* 0x10-0x13 */ + 0xB0, 0xAC, 0xB0, 0xAD, 0xB0, 0xAE, 0xB0, 0xAF, /* 0x14-0x17 */ + 0x81, 0x4A, 0xB0, 0xB0, 0xB0, 0xB1, 0xB0, 0xB2, /* 0x18-0x1B */ + 0xB0, 0xB3, 0xB0, 0xB4, 0x81, 0x4B, 0x81, 0x4C, /* 0x1C-0x1F */ + 0xB0, 0xB5, 0x81, 0x4D, 0x81, 0x4E, 0x81, 0x4F, /* 0x20-0x23 */ + 0xB0, 0xB6, 0x81, 0x50, 0x81, 0x51, 0x81, 0x52, /* 0x24-0x27 */ + 0x81, 0x53, 0x81, 0x54, 0x81, 0x55, 0x81, 0x56, /* 0x28-0x2B */ + 0xB0, 0xB7, 0xB0, 0xB8, 0x81, 0x57, 0xB0, 0xB9, /* 0x2C-0x2F */ + 0xB0, 0xBA, 0xB0, 0xBB, 0x81, 0x58, 0x81, 0x59, /* 0x30-0x33 */ + 0x81, 0x5A, 0x81, 0x61, 0x81, 0x62, 0x81, 0x63, /* 0x34-0x37 */ + 0xB0, 0xBC, 0xB0, 0xBD, 0x81, 0x64, 0x81, 0x65, /* 0x38-0x3B */ + 0xB0, 0xBE, 0x81, 0x66, 0x81, 0x67, 0x81, 0x68, /* 0x3C-0x3F */ + 0xB0, 0xBF, 0x81, 0x69, 0x81, 0x6A, 0x81, 0x6B, /* 0x40-0x43 */ + 0x81, 0x6C, 0x81, 0x6D, 0x81, 0x6E, 0x81, 0x6F, /* 0x44-0x47 */ + 0x81, 0x70, 0x81, 0x71, 0x81, 0x72, 0xB0, 0xC0, /* 0x48-0x4B */ + 0x81, 0x73, 0xB0, 0xC1, 0x81, 0x74, 0x81, 0x75, /* 0x4C-0x4F */ + 0x81, 0x76, 0x81, 0x77, 0x81, 0x78, 0x81, 0x79, /* 0x50-0x53 */ + 0xB0, 0xC2, 0x81, 0x7A, 0x81, 0x81, 0x81, 0x82, /* 0x54-0x57 */ + 0xB0, 0xC3, 0x81, 0x83, 0x81, 0x84, 0x81, 0x85, /* 0x58-0x5B */ + 0xB0, 0xC4, 0x81, 0x86, 0x81, 0x87, 0x81, 0x88, /* 0x5C-0x5F */ + 0x81, 0x89, 0x81, 0x8A, 0x81, 0x8B, 0x81, 0x8C, /* 0x60-0x63 */ + 0x81, 0x8D, 0x81, 0x8E, 0x81, 0x8F, 0x81, 0x90, /* 0x64-0x67 */ + 0x81, 0x91, 0x81, 0x92, 0x81, 0x93, 0x81, 0x94, /* 0x68-0x6B */ + 0x81, 0x95, 0x81, 0x96, 0x81, 0x97, 0x81, 0x98, /* 0x6C-0x6F */ + 0xB0, 0xC5, 0xB0, 0xC6, 0x81, 0x99, 0x81, 0x9A, /* 0x70-0x73 */ + 0xB0, 0xC7, 0x81, 0x9B, 0x81, 0x9C, 0xB0, 0xC8, /* 0x74-0x77 */ + 0xB0, 0xC9, 0x81, 0x9D, 0xB0, 0xCA, 0x81, 0x9E, /* 0x78-0x7B */ + 0x81, 0x9F, 0x81, 0xA0, 0x81, 0xA1, 0x81, 0xA2, /* 0x7C-0x7F */ + + 0xB0, 0xCB, 0xB0, 0xCC, 0x81, 0xA3, 0xB0, 0xCD, /* 0x80-0x83 */ + 0xB0, 0xCE, 0xB0, 0xCF, 0xB0, 0xD0, 0x81, 0xA4, /* 0x84-0x87 */ + 0x81, 0xA5, 0xB0, 0xD1, 0xB0, 0xD2, 0xB0, 0xD3, /* 0x88-0x8B */ + 0xB0, 0xD4, 0x81, 0xA6, 0x81, 0xA7, 0x81, 0xA8, /* 0x8C-0x8F */ + 0xB0, 0xD5, 0x81, 0xA9, 0x81, 0xAA, 0x81, 0xAB, /* 0x90-0x93 */ + 0xB0, 0xD6, 0x81, 0xAC, 0x81, 0xAD, 0x81, 0xAE, /* 0x94-0x97 */ + 0x81, 0xAF, 0x81, 0xB0, 0x81, 0xB1, 0x81, 0xB2, /* 0x98-0x9B */ + 0xB0, 0xD7, 0xB0, 0xD8, 0x81, 0xB3, 0xB0, 0xD9, /* 0x9C-0x9F */ + 0xB0, 0xDA, 0xB0, 0xDB, 0x81, 0xB4, 0x81, 0xB5, /* 0xA0-0xA3 */ + 0x81, 0xB6, 0x81, 0xB7, 0x81, 0xB8, 0x81, 0xB9, /* 0xA4-0xA7 */ + 0xB0, 0xDC, 0xB0, 0xDD, 0xB0, 0xDE, 0x81, 0xBA, /* 0xA8-0xAB */ + 0xB0, 0xDF, 0x81, 0xBB, 0x81, 0xBC, 0xB0, 0xE0, /* 0xAC-0xAF */ + 0xB0, 0xE1, 0x81, 0xBD, 0x81, 0xBE, 0x81, 0xBF, /* 0xB0-0xB3 */ + 0x81, 0xC0, 0x81, 0xC1, 0x81, 0xC2, 0x81, 0xC3, /* 0xB4-0xB7 */ + 0xB0, 0xE2, 0xB0, 0xE3, 0x81, 0xC4, 0xB0, 0xE4, /* 0xB8-0xBB */ + 0xB0, 0xE5, 0xB0, 0xE6, 0x81, 0xC5, 0x81, 0xC6, /* 0xBC-0xBF */ + 0x81, 0xC7, 0xB0, 0xE7, 0x81, 0xC8, 0x81, 0xC9, /* 0xC0-0xC3 */ + 0xB0, 0xE8, 0x81, 0xCA, 0x81, 0xCB, 0x81, 0xCC, /* 0xC4-0xC7 */ + 0xB0, 0xE9, 0x81, 0xCD, 0x81, 0xCE, 0x81, 0xCF, /* 0xC8-0xCB */ + 0xB0, 0xEA, 0x81, 0xD0, 0x81, 0xD1, 0x81, 0xD2, /* 0xCC-0xCF */ + 0x81, 0xD3, 0x81, 0xD4, 0x81, 0xD5, 0x81, 0xD6, /* 0xD0-0xD3 */ + 0x81, 0xD7, 0xB0, 0xEB, 0x81, 0xD8, 0xB0, 0xEC, /* 0xD4-0xD7 */ + 0x81, 0xD9, 0x81, 0xDA, 0x81, 0xDB, 0x81, 0xDC, /* 0xD8-0xDB */ + 0x81, 0xDD, 0x81, 0xDE, 0x81, 0xDF, 0x81, 0xE0, /* 0xDC-0xDF */ + 0xB0, 0xED, 0xB0, 0xEE, 0x81, 0xE1, 0x81, 0xE2, /* 0xE0-0xE3 */ + 0xB0, 0xEF, 0x81, 0xE3, 0x81, 0xE4, 0xB0, 0xF0, /* 0xE4-0xE7 */ + 
0xB0, 0xF1, 0x81, 0xE5, 0xB0, 0xF2, 0x81, 0xE6, /* 0xE8-0xEB */ + 0xB0, 0xF3, 0x81, 0xE7, 0x81, 0xE8, 0xB0, 0xF4, /* 0xEC-0xEF */ + 0xB0, 0xF5, 0xB0, 0xF6, 0x81, 0xE9, 0xB0, 0xF7, /* 0xF0-0xF3 */ + 0x81, 0xEA, 0xB0, 0xF8, 0xB0, 0xF9, 0x81, 0xEB, /* 0xF4-0xF7 */ + 0x81, 0xEC, 0x81, 0xED, 0x81, 0xEE, 0x81, 0xEF, /* 0xF8-0xFB */ + 0xB0, 0xFA, 0xB0, 0xFB, 0x81, 0xF0, 0x81, 0xF1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_AD[512] = { + 0xB0, 0xFC, 0x81, 0xF2, 0x81, 0xF3, 0x81, 0xF4, /* 0x00-0x03 */ + 0xB0, 0xFD, 0x81, 0xF5, 0xB0, 0xFE, 0x81, 0xF6, /* 0x04-0x07 */ + 0x81, 0xF7, 0x81, 0xF8, 0x81, 0xF9, 0x81, 0xFA, /* 0x08-0x0B */ + 0xB1, 0xA1, 0xB1, 0xA2, 0x81, 0xFB, 0xB1, 0xA3, /* 0x0C-0x0F */ + 0x81, 0xFC, 0xB1, 0xA4, 0x81, 0xFD, 0x81, 0xFE, /* 0x10-0x13 */ + 0x82, 0x41, 0x82, 0x42, 0x82, 0x43, 0x82, 0x44, /* 0x14-0x17 */ + 0xB1, 0xA5, 0x82, 0x45, 0x82, 0x46, 0x82, 0x47, /* 0x18-0x1B */ + 0xB1, 0xA6, 0x82, 0x48, 0x82, 0x49, 0x82, 0x4A, /* 0x1C-0x1F */ + 0xB1, 0xA7, 0x82, 0x4B, 0x82, 0x4C, 0x82, 0x4D, /* 0x20-0x23 */ + 0x82, 0x4E, 0x82, 0x4F, 0x82, 0x50, 0x82, 0x51, /* 0x24-0x27 */ + 0x82, 0x52, 0xB1, 0xA8, 0x82, 0x53, 0x82, 0x54, /* 0x28-0x2B */ + 0xB1, 0xA9, 0xB1, 0xAA, 0x82, 0x55, 0x82, 0x56, /* 0x2C-0x2F */ + 0x82, 0x57, 0x82, 0x58, 0x82, 0x59, 0x82, 0x5A, /* 0x30-0x33 */ + 0xB1, 0xAB, 0xB1, 0xAC, 0x82, 0x61, 0x82, 0x62, /* 0x34-0x37 */ + 0xB1, 0xAD, 0x82, 0x63, 0x82, 0x64, 0x82, 0x65, /* 0x38-0x3B */ + 0xB1, 0xAE, 0x82, 0x66, 0x82, 0x67, 0x82, 0x68, /* 0x3C-0x3F */ + 0x82, 0x69, 0x82, 0x6A, 0x82, 0x6B, 0x82, 0x6C, /* 0x40-0x43 */ + 0xB1, 0xAF, 0xB1, 0xB0, 0x82, 0x6D, 0xB1, 0xB1, /* 0x44-0x47 */ + 0x82, 0x6E, 0xB1, 0xB2, 0x82, 0x6F, 0x82, 0x70, /* 0x48-0x4B */ + 0x82, 0x71, 0x82, 0x72, 0x82, 0x73, 0x82, 0x74, /* 0x4C-0x4F */ + 0xB1, 0xB3, 0x82, 0x75, 0x82, 0x76, 0x82, 0x77, /* 0x50-0x53 */ + 0xB1, 0xB4, 0x82, 0x78, 0x82, 0x79, 0x82, 0x7A, /* 0x54-0x57 */ + 0xB1, 0xB5, 0x82, 0x81, 0x82, 0x82, 0x82, 0x83, /* 0x58-0x5B */ + 0x82, 0x84, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x5C-0x5F */ + 0x82, 0x88, 0xB1, 0xB6, 0x82, 0x89, 0xB1, 0xB7, /* 0x60-0x63 */ + 0x82, 0x8A, 0x82, 0x8B, 0x82, 0x8C, 0x82, 0x8D, /* 0x64-0x67 */ + 0x82, 0x8E, 0x82, 0x8F, 0x82, 0x90, 0x82, 0x91, /* 0x68-0x6B */ + 0xB1, 0xB8, 0xB1, 0xB9, 0x82, 0x92, 0x82, 0x93, /* 0x6C-0x6F */ + 0xB1, 0xBA, 0x82, 0x94, 0x82, 0x95, 0xB1, 0xBB, /* 0x70-0x73 */ + 0xB1, 0xBC, 0xB1, 0xBD, 0xB1, 0xBE, 0x82, 0x96, /* 0x74-0x77 */ + 0x82, 0x97, 0x82, 0x98, 0x82, 0x99, 0xB1, 0xBF, /* 0x78-0x7B */ + 0xB1, 0xC0, 0xB1, 0xC1, 0x82, 0x9A, 0xB1, 0xC2, /* 0x7C-0x7F */ + + 0x82, 0x9B, 0xB1, 0xC3, 0xB1, 0xC4, 0x82, 0x9C, /* 0x80-0x83 */ + 0x82, 0x9D, 0x82, 0x9E, 0x82, 0x9F, 0x82, 0xA0, /* 0x84-0x87 */ + 0xB1, 0xC5, 0xB1, 0xC6, 0x82, 0xA1, 0x82, 0xA2, /* 0x88-0x8B */ + 0xB1, 0xC7, 0x82, 0xA3, 0x82, 0xA4, 0x82, 0xA5, /* 0x8C-0x8F */ + 0xB1, 0xC8, 0x82, 0xA6, 0x82, 0xA7, 0x82, 0xA8, /* 0x90-0x93 */ + 0x82, 0xA9, 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, /* 0x94-0x97 */ + 0x82, 0xAD, 0x82, 0xAE, 0x82, 0xAF, 0x82, 0xB0, /* 0x98-0x9B */ + 0xB1, 0xC9, 0xB1, 0xCA, 0x82, 0xB1, 0x82, 0xB2, /* 0x9C-0x9F */ + 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, 0x82, 0xB6, /* 0xA0-0xA3 */ + 0xB1, 0xCB, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0xA4-0xA7 */ + 0x82, 0xBA, 0x82, 0xBB, 0x82, 0xBC, 0x82, 0xBD, /* 0xA8-0xAB */ + 0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, 0x82, 0xC1, /* 0xAC-0xAF */ + 0x82, 0xC2, 0x82, 0xC3, 0x82, 0xC4, 0x82, 0xC5, /* 0xB0-0xB3 */ + 0x82, 0xC6, 0x82, 0xC7, 0x82, 0xC8, 0xB1, 0xCC, /* 0xB4-0xB7 */ + 0x82, 0xC9, 0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, /* 0xB8-0xBB */ + 
0x82, 0xCD, 0x82, 0xCE, 0x82, 0xCF, 0x82, 0xD0, /* 0xBC-0xBF */ + 0xB1, 0xCD, 0xB1, 0xCE, 0x82, 0xD1, 0x82, 0xD2, /* 0xC0-0xC3 */ + 0xB1, 0xCF, 0x82, 0xD3, 0x82, 0xD4, 0x82, 0xD5, /* 0xC4-0xC7 */ + 0xB1, 0xD0, 0x82, 0xD6, 0x82, 0xD7, 0x82, 0xD8, /* 0xC8-0xCB */ + 0x82, 0xD9, 0x82, 0xDA, 0x82, 0xDB, 0x82, 0xDC, /* 0xCC-0xCF */ + 0xB1, 0xD1, 0xB1, 0xD2, 0x82, 0xDD, 0xB1, 0xD3, /* 0xD0-0xD3 */ + 0x82, 0xDE, 0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, /* 0xD4-0xD7 */ + 0x82, 0xE2, 0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, /* 0xD8-0xDB */ + 0xB1, 0xD4, 0x82, 0xE6, 0x82, 0xE7, 0x82, 0xE8, /* 0xDC-0xDF */ + 0xB1, 0xD5, 0x82, 0xE9, 0x82, 0xEA, 0x82, 0xEB, /* 0xE0-0xE3 */ + 0xB1, 0xD6, 0x82, 0xEC, 0x82, 0xED, 0x82, 0xEE, /* 0xE4-0xE7 */ + 0x82, 0xEF, 0x82, 0xF0, 0x82, 0xF1, 0x82, 0xF2, /* 0xE8-0xEB */ + 0x82, 0xF3, 0x82, 0xF4, 0x82, 0xF5, 0x82, 0xF6, /* 0xEC-0xEF */ + 0x82, 0xF7, 0x82, 0xF8, 0x82, 0xF9, 0x82, 0xFA, /* 0xF0-0xF3 */ + 0x82, 0xFB, 0x82, 0xFC, 0x82, 0xFD, 0x82, 0xFE, /* 0xF4-0xF7 */ + 0xB1, 0xD7, 0xB1, 0xD8, 0x83, 0x41, 0x83, 0x42, /* 0xF8-0xFB */ + 0xB1, 0xD9, 0x83, 0x43, 0x83, 0x44, 0xB1, 0xDA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_AE[512] = { + 0xB1, 0xDB, 0xB1, 0xDC, 0x83, 0x45, 0x83, 0x46, /* 0x00-0x03 */ + 0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0x04-0x07 */ + 0xB1, 0xDD, 0xB1, 0xDE, 0x83, 0x4B, 0xB1, 0xDF, /* 0x08-0x0B */ + 0x83, 0x4C, 0xB1, 0xE0, 0x83, 0x4D, 0x83, 0x4E, /* 0x0C-0x0F */ + 0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0x83, 0x52, /* 0x10-0x13 */ + 0xB1, 0xE1, 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, /* 0x14-0x17 */ + 0x83, 0x56, 0x83, 0x57, 0x83, 0x58, 0x83, 0x59, /* 0x18-0x1B */ + 0x83, 0x5A, 0x83, 0x61, 0x83, 0x62, 0x83, 0x63, /* 0x1C-0x1F */ + 0x83, 0x64, 0x83, 0x65, 0x83, 0x66, 0x83, 0x67, /* 0x20-0x23 */ + 0x83, 0x68, 0x83, 0x69, 0x83, 0x6A, 0x83, 0x6B, /* 0x24-0x27 */ + 0x83, 0x6C, 0x83, 0x6D, 0x83, 0x6E, 0x83, 0x6F, /* 0x28-0x2B */ + 0x83, 0x70, 0x83, 0x71, 0x83, 0x72, 0x83, 0x73, /* 0x2C-0x2F */ + 0xB1, 0xE2, 0xB1, 0xE3, 0x83, 0x74, 0x83, 0x75, /* 0x30-0x33 */ + 0xB1, 0xE4, 0x83, 0x76, 0x83, 0x77, 0xB1, 0xE5, /* 0x34-0x37 */ + 0xB1, 0xE6, 0x83, 0x78, 0xB1, 0xE7, 0x83, 0x79, /* 0x38-0x3B */ + 0x83, 0x7A, 0x83, 0x81, 0x83, 0x82, 0x83, 0x83, /* 0x3C-0x3F */ + 0xB1, 0xE8, 0xB1, 0xE9, 0x83, 0x84, 0xB1, 0xEA, /* 0x40-0x43 */ + 0x83, 0x85, 0xB1, 0xEB, 0xB1, 0xEC, 0x83, 0x86, /* 0x44-0x47 */ + 0x83, 0x87, 0x83, 0x88, 0xB1, 0xED, 0x83, 0x89, /* 0x48-0x4B */ + 0xB1, 0xEE, 0xB1, 0xEF, 0xB1, 0xF0, 0x83, 0x8A, /* 0x4C-0x4F */ + 0xB1, 0xF1, 0x83, 0x8B, 0x83, 0x8C, 0x83, 0x8D, /* 0x50-0x53 */ + 0xB1, 0xF2, 0x83, 0x8E, 0xB1, 0xF3, 0x83, 0x8F, /* 0x54-0x57 */ + 0x83, 0x90, 0x83, 0x91, 0x83, 0x92, 0x83, 0x93, /* 0x58-0x5B */ + 0xB1, 0xF4, 0xB1, 0xF5, 0x83, 0x94, 0xB1, 0xF6, /* 0x5C-0x5F */ + 0xB1, 0xF7, 0xB1, 0xF8, 0x83, 0x95, 0x83, 0x96, /* 0x60-0x63 */ + 0x83, 0x97, 0xB1, 0xF9, 0x83, 0x98, 0x83, 0x99, /* 0x64-0x67 */ + 0xB1, 0xFA, 0xB1, 0xFB, 0x83, 0x9A, 0x83, 0x9B, /* 0x68-0x6B */ + 0xB1, 0xFC, 0x83, 0x9C, 0x83, 0x9D, 0x83, 0x9E, /* 0x6C-0x6F */ + 0xB1, 0xFD, 0x83, 0x9F, 0x83, 0xA0, 0x83, 0xA1, /* 0x70-0x73 */ + 0x83, 0xA2, 0x83, 0xA3, 0x83, 0xA4, 0x83, 0xA5, /* 0x74-0x77 */ + 0xB1, 0xFE, 0xB2, 0xA1, 0x83, 0xA6, 0xB2, 0xA2, /* 0x78-0x7B */ + 0xB2, 0xA3, 0xB2, 0xA4, 0x83, 0xA7, 0x83, 0xA8, /* 0x7C-0x7F */ + + 0x83, 0xA9, 0x83, 0xAA, 0x83, 0xAB, 0x83, 0xAC, /* 0x80-0x83 */ + 0xB2, 0xA5, 0xB2, 0xA6, 0x83, 0xAD, 0x83, 0xAE, /* 0x84-0x87 */ + 0x83, 0xAF, 0x83, 0xB0, 0x83, 0xB1, 0x83, 0xB2, /* 0x88-0x8B */ + 0xB2, 0xA7, 0x83, 0xB3, 0x83, 0xB4, 0x83, 0xB5, /* 0x8C-0x8F */ + 
0x83, 0xB6, 0x83, 0xB7, 0x83, 0xB8, 0x83, 0xB9, /* 0x90-0x93 */ + 0x83, 0xBA, 0x83, 0xBB, 0x83, 0xBC, 0x83, 0xBD, /* 0x94-0x97 */ + 0x83, 0xBE, 0x83, 0xBF, 0x83, 0xC0, 0x83, 0xC1, /* 0x98-0x9B */ + 0x83, 0xC2, 0x83, 0xC3, 0x83, 0xC4, 0x83, 0xC5, /* 0x9C-0x9F */ + 0x83, 0xC6, 0x83, 0xC7, 0x83, 0xC8, 0x83, 0xC9, /* 0xA0-0xA3 */ + 0x83, 0xCA, 0x83, 0xCB, 0x83, 0xCC, 0x83, 0xCD, /* 0xA4-0xA7 */ + 0x83, 0xCE, 0x83, 0xCF, 0x83, 0xD0, 0x83, 0xD1, /* 0xA8-0xAB */ + 0x83, 0xD2, 0x83, 0xD3, 0x83, 0xD4, 0x83, 0xD5, /* 0xAC-0xAF */ + 0x83, 0xD6, 0x83, 0xD7, 0x83, 0xD8, 0x83, 0xD9, /* 0xB0-0xB3 */ + 0x83, 0xDA, 0x83, 0xDB, 0x83, 0xDC, 0x83, 0xDD, /* 0xB4-0xB7 */ + 0x83, 0xDE, 0x83, 0xDF, 0x83, 0xE0, 0x83, 0xE1, /* 0xB8-0xBB */ + 0xB2, 0xA8, 0xB2, 0xA9, 0xB2, 0xAA, 0x83, 0xE2, /* 0xBC-0xBF */ + 0xB2, 0xAB, 0x83, 0xE3, 0x83, 0xE4, 0x83, 0xE5, /* 0xC0-0xC3 */ + 0xB2, 0xAC, 0x83, 0xE6, 0x83, 0xE7, 0x83, 0xE8, /* 0xC4-0xC7 */ + 0x83, 0xE9, 0x83, 0xEA, 0x83, 0xEB, 0x83, 0xEC, /* 0xC8-0xCB */ + 0xB2, 0xAD, 0xB2, 0xAE, 0x83, 0xED, 0xB2, 0xAF, /* 0xCC-0xCF */ + 0xB2, 0xB0, 0xB2, 0xB1, 0x83, 0xEE, 0x83, 0xEF, /* 0xD0-0xD3 */ + 0x83, 0xF0, 0x83, 0xF1, 0x83, 0xF2, 0x83, 0xF3, /* 0xD4-0xD7 */ + 0xB2, 0xB2, 0xB2, 0xB3, 0x83, 0xF4, 0x83, 0xF5, /* 0xD8-0xDB */ + 0xB2, 0xB4, 0x83, 0xF6, 0x83, 0xF7, 0x83, 0xF8, /* 0xDC-0xDF */ + 0x83, 0xF9, 0x83, 0xFA, 0x83, 0xFB, 0x83, 0xFC, /* 0xE0-0xE3 */ + 0x83, 0xFD, 0x83, 0xFE, 0x84, 0x41, 0x84, 0x42, /* 0xE4-0xE7 */ + 0xB2, 0xB5, 0x84, 0x43, 0x84, 0x44, 0xB2, 0xB6, /* 0xE8-0xEB */ + 0x84, 0x45, 0xB2, 0xB7, 0x84, 0x46, 0x84, 0x47, /* 0xEC-0xEF */ + 0x84, 0x48, 0x84, 0x49, 0x84, 0x4A, 0x84, 0x4B, /* 0xF0-0xF3 */ + 0xB2, 0xB8, 0x84, 0x4C, 0x84, 0x4D, 0x84, 0x4E, /* 0xF4-0xF7 */ + 0xB2, 0xB9, 0x84, 0x4F, 0x84, 0x50, 0x84, 0x51, /* 0xF8-0xFB */ + 0xB2, 0xBA, 0x84, 0x52, 0x84, 0x53, 0x84, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_AF[512] = { + 0x84, 0x55, 0x84, 0x56, 0x84, 0x57, 0x84, 0x58, /* 0x00-0x03 */ + 0x84, 0x59, 0x84, 0x5A, 0x84, 0x61, 0xB2, 0xBB, /* 0x04-0x07 */ + 0xB2, 0xBC, 0x84, 0x62, 0x84, 0x63, 0x84, 0x64, /* 0x08-0x0B */ + 0x84, 0x65, 0xB2, 0xBD, 0x84, 0x66, 0x84, 0x67, /* 0x0C-0x0F */ + 0xB2, 0xBE, 0x84, 0x68, 0x84, 0x69, 0x84, 0x6A, /* 0x10-0x13 */ + 0x84, 0x6B, 0x84, 0x6C, 0x84, 0x6D, 0x84, 0x6E, /* 0x14-0x17 */ + 0x84, 0x6F, 0x84, 0x70, 0x84, 0x71, 0x84, 0x72, /* 0x18-0x1B */ + 0x84, 0x73, 0x84, 0x74, 0x84, 0x75, 0x84, 0x76, /* 0x1C-0x1F */ + 0x84, 0x77, 0x84, 0x78, 0x84, 0x79, 0x84, 0x7A, /* 0x20-0x23 */ + 0x84, 0x81, 0x84, 0x82, 0x84, 0x83, 0x84, 0x84, /* 0x24-0x27 */ + 0x84, 0x85, 0x84, 0x86, 0x84, 0x87, 0x84, 0x88, /* 0x28-0x2B */ + 0xB2, 0xBF, 0xB2, 0xC0, 0x84, 0x89, 0x84, 0x8A, /* 0x2C-0x2F */ + 0xB2, 0xC1, 0x84, 0x8B, 0xB2, 0xC2, 0x84, 0x8C, /* 0x30-0x33 */ + 0xB2, 0xC3, 0x84, 0x8D, 0x84, 0x8E, 0x84, 0x8F, /* 0x34-0x37 */ + 0x84, 0x90, 0x84, 0x91, 0x84, 0x92, 0x84, 0x93, /* 0x38-0x3B */ + 0xB2, 0xC4, 0xB2, 0xC5, 0x84, 0x94, 0xB2, 0xC6, /* 0x3C-0x3F */ + 0x84, 0x95, 0xB2, 0xC7, 0xB2, 0xC8, 0xB2, 0xC9, /* 0x40-0x43 */ + 0x84, 0x96, 0x84, 0x97, 0x84, 0x98, 0x84, 0x99, /* 0x44-0x47 */ + 0xB2, 0xCA, 0xB2, 0xCB, 0x84, 0x9A, 0x84, 0x9B, /* 0x48-0x4B */ + 0x84, 0x9C, 0x84, 0x9D, 0x84, 0x9E, 0x84, 0x9F, /* 0x4C-0x4F */ + 0xB2, 0xCC, 0x84, 0xA0, 0x84, 0xA1, 0x84, 0xA2, /* 0x50-0x53 */ + 0x84, 0xA3, 0x84, 0xA4, 0x84, 0xA5, 0x84, 0xA6, /* 0x54-0x57 */ + 0x84, 0xA7, 0x84, 0xA8, 0x84, 0xA9, 0x84, 0xAA, /* 0x58-0x5B */ + 0xB2, 0xCD, 0xB2, 0xCE, 0x84, 0xAB, 0x84, 0xAC, /* 0x5C-0x5F */ + 0x84, 0xAD, 0x84, 0xAE, 0x84, 0xAF, 0x84, 0xB0, /* 0x60-0x63 */ + 0xB2, 
0xCF, 0xB2, 0xD0, 0x84, 0xB1, 0x84, 0xB2, /* 0x64-0x67 */ + 0x84, 0xB3, 0x84, 0xB4, 0x84, 0xB5, 0x84, 0xB6, /* 0x68-0x6B */ + 0x84, 0xB7, 0x84, 0xB8, 0x84, 0xB9, 0x84, 0xBA, /* 0x6C-0x6F */ + 0x84, 0xBB, 0x84, 0xBC, 0x84, 0xBD, 0x84, 0xBE, /* 0x70-0x73 */ + 0x84, 0xBF, 0x84, 0xC0, 0x84, 0xC1, 0x84, 0xC2, /* 0x74-0x77 */ + 0x84, 0xC3, 0xB2, 0xD1, 0x84, 0xC4, 0x84, 0xC5, /* 0x78-0x7B */ + 0x84, 0xC6, 0x84, 0xC7, 0x84, 0xC8, 0x84, 0xC9, /* 0x7C-0x7F */ + + 0xB2, 0xD2, 0x84, 0xCA, 0x84, 0xCB, 0x84, 0xCC, /* 0x80-0x83 */ + 0xB2, 0xD3, 0x84, 0xCD, 0x84, 0xCE, 0x84, 0xCF, /* 0x84-0x87 */ + 0xB2, 0xD4, 0x84, 0xD0, 0x84, 0xD1, 0x84, 0xD2, /* 0x88-0x8B */ + 0x84, 0xD3, 0x84, 0xD4, 0x84, 0xD5, 0x84, 0xD6, /* 0x8C-0x8F */ + 0xB2, 0xD5, 0xB2, 0xD6, 0x84, 0xD7, 0x84, 0xD8, /* 0x90-0x93 */ + 0x84, 0xD9, 0xB2, 0xD7, 0x84, 0xDA, 0x84, 0xDB, /* 0x94-0x97 */ + 0x84, 0xDC, 0x84, 0xDD, 0x84, 0xDE, 0x84, 0xDF, /* 0x98-0x9B */ + 0xB2, 0xD8, 0x84, 0xE0, 0x84, 0xE1, 0x84, 0xE2, /* 0x9C-0x9F */ + 0x84, 0xE3, 0x84, 0xE4, 0x84, 0xE5, 0x84, 0xE6, /* 0xA0-0xA3 */ + 0x84, 0xE7, 0x84, 0xE8, 0x84, 0xE9, 0x84, 0xEA, /* 0xA4-0xA7 */ + 0x84, 0xEB, 0x84, 0xEC, 0x84, 0xED, 0x84, 0xEE, /* 0xA8-0xAB */ + 0x84, 0xEF, 0x84, 0xF0, 0x84, 0xF1, 0x84, 0xF2, /* 0xAC-0xAF */ + 0x84, 0xF3, 0x84, 0xF4, 0x84, 0xF5, 0x84, 0xF6, /* 0xB0-0xB3 */ + 0x84, 0xF7, 0x84, 0xF8, 0x84, 0xF9, 0x84, 0xFA, /* 0xB4-0xB7 */ + 0xB2, 0xD9, 0xB2, 0xDA, 0x84, 0xFB, 0x84, 0xFC, /* 0xB8-0xBB */ + 0xB2, 0xDB, 0x84, 0xFD, 0x84, 0xFE, 0x85, 0x41, /* 0xBC-0xBF */ + 0xB2, 0xDC, 0x85, 0x42, 0x85, 0x43, 0x85, 0x44, /* 0xC0-0xC3 */ + 0x85, 0x45, 0x85, 0x46, 0x85, 0x47, 0xB2, 0xDD, /* 0xC4-0xC7 */ + 0xB2, 0xDE, 0xB2, 0xDF, 0x85, 0x48, 0xB2, 0xE0, /* 0xC8-0xCB */ + 0x85, 0x49, 0xB2, 0xE1, 0xB2, 0xE2, 0x85, 0x4A, /* 0xCC-0xCF */ + 0x85, 0x4B, 0x85, 0x4C, 0x85, 0x4D, 0x85, 0x4E, /* 0xD0-0xD3 */ + 0xB2, 0xE3, 0x85, 0x4F, 0x85, 0x50, 0x85, 0x51, /* 0xD4-0xD7 */ + 0x85, 0x52, 0x85, 0x53, 0x85, 0x54, 0x85, 0x55, /* 0xD8-0xDB */ + 0xB2, 0xE4, 0x85, 0x56, 0x85, 0x57, 0x85, 0x58, /* 0xDC-0xDF */ + 0x85, 0x59, 0x85, 0x5A, 0x85, 0x61, 0x85, 0x62, /* 0xE0-0xE3 */ + 0x85, 0x63, 0x85, 0x64, 0x85, 0x65, 0x85, 0x66, /* 0xE4-0xE7 */ + 0xB2, 0xE5, 0xB2, 0xE6, 0x85, 0x67, 0x85, 0x68, /* 0xE8-0xEB */ + 0x85, 0x69, 0x85, 0x6A, 0x85, 0x6B, 0x85, 0x6C, /* 0xEC-0xEF */ + 0xB2, 0xE7, 0xB2, 0xE8, 0x85, 0x6D, 0x85, 0x6E, /* 0xF0-0xF3 */ + 0xB2, 0xE9, 0x85, 0x6F, 0x85, 0x70, 0x85, 0x71, /* 0xF4-0xF7 */ + 0xB2, 0xEA, 0x85, 0x72, 0x85, 0x73, 0x85, 0x74, /* 0xF8-0xFB */ + 0x85, 0x75, 0x85, 0x76, 0x85, 0x77, 0x85, 0x78, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B0[512] = { + 0xB2, 0xEB, 0xB2, 0xEC, 0x85, 0x79, 0x85, 0x7A, /* 0x00-0x03 */ + 0xB2, 0xED, 0x85, 0x81, 0x85, 0x82, 0x85, 0x83, /* 0x04-0x07 */ + 0x85, 0x84, 0x85, 0x85, 0x85, 0x86, 0x85, 0x87, /* 0x08-0x0B */ + 0xB2, 0xEE, 0x85, 0x88, 0x85, 0x89, 0x85, 0x8A, /* 0x0C-0x0F */ + 0xB2, 0xEF, 0x85, 0x8B, 0x85, 0x8C, 0x85, 0x8D, /* 0x10-0x13 */ + 0xB2, 0xF0, 0x85, 0x8E, 0x85, 0x8F, 0x85, 0x90, /* 0x14-0x17 */ + 0x85, 0x91, 0x85, 0x92, 0x85, 0x93, 0x85, 0x94, /* 0x18-0x1B */ + 0xB2, 0xF1, 0xB2, 0xF2, 0x85, 0x95, 0x85, 0x96, /* 0x1C-0x1F */ + 0x85, 0x97, 0x85, 0x98, 0x85, 0x99, 0x85, 0x9A, /* 0x20-0x23 */ + 0x85, 0x9B, 0x85, 0x9C, 0x85, 0x9D, 0x85, 0x9E, /* 0x24-0x27 */ + 0xB2, 0xF3, 0x85, 0x9F, 0x85, 0xA0, 0x85, 0xA1, /* 0x28-0x2B */ + 0x85, 0xA2, 0x85, 0xA3, 0x85, 0xA4, 0x85, 0xA5, /* 0x2C-0x2F */ + 0x85, 0xA6, 0x85, 0xA7, 0x85, 0xA8, 0x85, 0xA9, /* 0x30-0x33 */ + 0x85, 0xAA, 0x85, 0xAB, 0x85, 0xAC, 0x85, 0xAD, /* 0x34-0x37 */ + 0x85, 
0xAE, 0x85, 0xAF, 0x85, 0xB0, 0x85, 0xB1, /* 0x38-0x3B */ + 0x85, 0xB2, 0x85, 0xB3, 0x85, 0xB4, 0x85, 0xB5, /* 0x3C-0x3F */ + 0x85, 0xB6, 0x85, 0xB7, 0x85, 0xB8, 0x85, 0xB9, /* 0x40-0x43 */ + 0xB2, 0xF4, 0xB2, 0xF5, 0x85, 0xBA, 0x85, 0xBB, /* 0x44-0x47 */ + 0xB2, 0xF6, 0x85, 0xBC, 0xB2, 0xF7, 0x85, 0xBD, /* 0x48-0x4B */ + 0xB2, 0xF8, 0x85, 0xBE, 0xB2, 0xF9, 0x85, 0xBF, /* 0x4C-0x4F */ + 0x85, 0xC0, 0x85, 0xC1, 0x85, 0xC2, 0xB2, 0xFA, /* 0x50-0x53 */ + 0xB2, 0xFB, 0xB2, 0xFC, 0x85, 0xC3, 0xB2, 0xFD, /* 0x54-0x57 */ + 0x85, 0xC4, 0xB2, 0xFE, 0x85, 0xC5, 0x85, 0xC6, /* 0x58-0x5B */ + 0x85, 0xC7, 0xB3, 0xA1, 0x85, 0xC8, 0x85, 0xC9, /* 0x5C-0x5F */ + 0x85, 0xCA, 0x85, 0xCB, 0x85, 0xCC, 0x85, 0xCD, /* 0x60-0x63 */ + 0x85, 0xCE, 0x85, 0xCF, 0x85, 0xD0, 0x85, 0xD1, /* 0x64-0x67 */ + 0x85, 0xD2, 0x85, 0xD3, 0x85, 0xD4, 0x85, 0xD5, /* 0x68-0x6B */ + 0x85, 0xD6, 0x85, 0xD7, 0x85, 0xD8, 0x85, 0xD9, /* 0x6C-0x6F */ + 0x85, 0xDA, 0x85, 0xDB, 0x85, 0xDC, 0x85, 0xDD, /* 0x70-0x73 */ + 0x85, 0xDE, 0x85, 0xDF, 0x85, 0xE0, 0x85, 0xE1, /* 0x74-0x77 */ + 0x85, 0xE2, 0x85, 0xE3, 0x85, 0xE4, 0x85, 0xE5, /* 0x78-0x7B */ + 0xB3, 0xA2, 0xB3, 0xA3, 0x85, 0xE6, 0x85, 0xE7, /* 0x7C-0x7F */ + + 0xB3, 0xA4, 0x85, 0xE8, 0x85, 0xE9, 0x85, 0xEA, /* 0x80-0x83 */ + 0xB3, 0xA5, 0x85, 0xEB, 0x85, 0xEC, 0x85, 0xED, /* 0x84-0x87 */ + 0x85, 0xEE, 0x85, 0xEF, 0x85, 0xF0, 0x85, 0xF1, /* 0x88-0x8B */ + 0xB3, 0xA6, 0xB3, 0xA7, 0x85, 0xF2, 0xB3, 0xA8, /* 0x8C-0x8F */ + 0x85, 0xF3, 0xB3, 0xA9, 0x85, 0xF4, 0x85, 0xF5, /* 0x90-0x93 */ + 0x85, 0xF6, 0x85, 0xF7, 0x85, 0xF8, 0x85, 0xF9, /* 0x94-0x97 */ + 0xB3, 0xAA, 0xB3, 0xAB, 0xB3, 0xAC, 0x85, 0xFA, /* 0x98-0x9B */ + 0xB3, 0xAD, 0x85, 0xFB, 0x85, 0xFC, 0xB3, 0xAE, /* 0x9C-0x9F */ + 0xB3, 0xAF, 0xB3, 0xB0, 0xB3, 0xB1, 0x85, 0xFD, /* 0xA0-0xA3 */ + 0x85, 0xFE, 0x86, 0x41, 0x86, 0x42, 0x86, 0x43, /* 0xA4-0xA7 */ + 0xB3, 0xB2, 0xB3, 0xB3, 0x86, 0x44, 0xB3, 0xB4, /* 0xA8-0xAB */ + 0xB3, 0xB5, 0xB3, 0xB6, 0xB3, 0xB7, 0xB3, 0xB8, /* 0xAC-0xAF */ + 0x86, 0x45, 0xB3, 0xB9, 0x86, 0x46, 0xB3, 0xBA, /* 0xB0-0xB3 */ + 0xB3, 0xBB, 0xB3, 0xBC, 0x86, 0x47, 0x86, 0x48, /* 0xB4-0xB7 */ + 0xB3, 0xBD, 0x86, 0x49, 0x86, 0x4A, 0x86, 0x4B, /* 0xB8-0xBB */ + 0xB3, 0xBE, 0x86, 0x4C, 0x86, 0x4D, 0x86, 0x4E, /* 0xBC-0xBF */ + 0x86, 0x4F, 0x86, 0x50, 0x86, 0x51, 0x86, 0x52, /* 0xC0-0xC3 */ + 0xB3, 0xBF, 0xB3, 0xC0, 0x86, 0x53, 0xB3, 0xC1, /* 0xC4-0xC7 */ + 0xB3, 0xC2, 0xB3, 0xC3, 0x86, 0x54, 0x86, 0x55, /* 0xC8-0xCB */ + 0x86, 0x56, 0x86, 0x57, 0x86, 0x58, 0x86, 0x59, /* 0xCC-0xCF */ + 0xB3, 0xC4, 0xB3, 0xC5, 0x86, 0x5A, 0x86, 0x61, /* 0xD0-0xD3 */ + 0xB3, 0xC6, 0x86, 0x62, 0x86, 0x63, 0x86, 0x64, /* 0xD4-0xD7 */ + 0xB3, 0xC7, 0x86, 0x65, 0x86, 0x66, 0x86, 0x67, /* 0xD8-0xDB */ + 0x86, 0x68, 0x86, 0x69, 0x86, 0x6A, 0x86, 0x6B, /* 0xDC-0xDF */ + 0xB3, 0xC8, 0x86, 0x6C, 0x86, 0x6D, 0x86, 0x6E, /* 0xE0-0xE3 */ + 0x86, 0x6F, 0xB3, 0xC9, 0x86, 0x70, 0x86, 0x71, /* 0xE4-0xE7 */ + 0x86, 0x72, 0x86, 0x73, 0x86, 0x74, 0x86, 0x75, /* 0xE8-0xEB */ + 0x86, 0x76, 0x86, 0x77, 0x86, 0x78, 0x86, 0x79, /* 0xEC-0xEF */ + 0x86, 0x7A, 0x86, 0x81, 0x86, 0x82, 0x86, 0x83, /* 0xF0-0xF3 */ + 0x86, 0x84, 0x86, 0x85, 0x86, 0x86, 0x86, 0x87, /* 0xF4-0xF7 */ + 0x86, 0x88, 0x86, 0x89, 0x86, 0x8A, 0x86, 0x8B, /* 0xF8-0xFB */ + 0x86, 0x8C, 0x86, 0x8D, 0x86, 0x8E, 0x86, 0x8F, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B1[512] = { + 0x86, 0x90, 0x86, 0x91, 0x86, 0x92, 0x86, 0x93, /* 0x00-0x03 */ + 0x86, 0x94, 0x86, 0x95, 0x86, 0x96, 0x86, 0x97, /* 0x04-0x07 */ + 0xB3, 0xCA, 0xB3, 0xCB, 0x86, 0x98, 0xB3, 0xCC, /* 0x08-0x0B */ + 0xB3, 
0xCD, 0x86, 0x99, 0x86, 0x9A, 0x86, 0x9B, /* 0x0C-0x0F */ + 0xB3, 0xCE, 0x86, 0x9C, 0xB3, 0xCF, 0xB3, 0xD0, /* 0x10-0x13 */ + 0x86, 0x9D, 0x86, 0x9E, 0x86, 0x9F, 0x86, 0xA0, /* 0x14-0x17 */ + 0xB3, 0xD1, 0xB3, 0xD2, 0x86, 0xA1, 0xB3, 0xD3, /* 0x18-0x1B */ + 0xB3, 0xD4, 0xB3, 0xD5, 0x86, 0xA2, 0x86, 0xA3, /* 0x1C-0x1F */ + 0x86, 0xA4, 0x86, 0xA5, 0x86, 0xA6, 0xB3, 0xD6, /* 0x20-0x23 */ + 0xB3, 0xD7, 0xB3, 0xD8, 0x86, 0xA7, 0x86, 0xA8, /* 0x24-0x27 */ + 0xB3, 0xD9, 0x86, 0xA9, 0x86, 0xAA, 0x86, 0xAB, /* 0x28-0x2B */ + 0xB3, 0xDA, 0x86, 0xAC, 0x86, 0xAD, 0x86, 0xAE, /* 0x2C-0x2F */ + 0x86, 0xAF, 0x86, 0xB0, 0x86, 0xB1, 0x86, 0xB2, /* 0x30-0x33 */ + 0xB3, 0xDB, 0xB3, 0xDC, 0x86, 0xB3, 0xB3, 0xDD, /* 0x34-0x37 */ + 0xB3, 0xDE, 0xB3, 0xDF, 0x86, 0xB4, 0x86, 0xB5, /* 0x38-0x3B */ + 0x86, 0xB6, 0x86, 0xB7, 0x86, 0xB8, 0x86, 0xB9, /* 0x3C-0x3F */ + 0xB3, 0xE0, 0xB3, 0xE1, 0x86, 0xBA, 0x86, 0xBB, /* 0x40-0x43 */ + 0xB3, 0xE2, 0x86, 0xBC, 0x86, 0xBD, 0x86, 0xBE, /* 0x44-0x47 */ + 0xB3, 0xE3, 0x86, 0xBF, 0x86, 0xC0, 0x86, 0xC1, /* 0x48-0x4B */ + 0x86, 0xC2, 0x86, 0xC3, 0x86, 0xC4, 0x86, 0xC5, /* 0x4C-0x4F */ + 0xB3, 0xE4, 0xB3, 0xE5, 0x86, 0xC6, 0x86, 0xC7, /* 0x50-0x53 */ + 0xB3, 0xE6, 0xB3, 0xE7, 0x86, 0xC8, 0x86, 0xC9, /* 0x54-0x57 */ + 0xB3, 0xE8, 0x86, 0xCA, 0x86, 0xCB, 0x86, 0xCC, /* 0x58-0x5B */ + 0xB3, 0xE9, 0x86, 0xCD, 0x86, 0xCE, 0x86, 0xCF, /* 0x5C-0x5F */ + 0xB3, 0xEA, 0x86, 0xD0, 0x86, 0xD1, 0x86, 0xD2, /* 0x60-0x63 */ + 0x86, 0xD3, 0x86, 0xD4, 0x86, 0xD5, 0x86, 0xD6, /* 0x64-0x67 */ + 0x86, 0xD7, 0x86, 0xD8, 0x86, 0xD9, 0x86, 0xDA, /* 0x68-0x6B */ + 0x86, 0xDB, 0x86, 0xDC, 0x86, 0xDD, 0x86, 0xDE, /* 0x6C-0x6F */ + 0x86, 0xDF, 0x86, 0xE0, 0x86, 0xE1, 0x86, 0xE2, /* 0x70-0x73 */ + 0x86, 0xE3, 0x86, 0xE4, 0x86, 0xE5, 0x86, 0xE6, /* 0x74-0x77 */ + 0xB3, 0xEB, 0xB3, 0xEC, 0x86, 0xE7, 0x86, 0xE8, /* 0x78-0x7B */ + 0xB3, 0xED, 0x86, 0xE9, 0x86, 0xEA, 0x86, 0xEB, /* 0x7C-0x7F */ + + 0xB3, 0xEE, 0x86, 0xEC, 0xB3, 0xEF, 0x86, 0xED, /* 0x80-0x83 */ + 0x86, 0xEE, 0x86, 0xEF, 0x86, 0xF0, 0x86, 0xF1, /* 0x84-0x87 */ + 0xB3, 0xF0, 0xB3, 0xF1, 0x86, 0xF2, 0xB3, 0xF2, /* 0x88-0x8B */ + 0x86, 0xF3, 0xB3, 0xF3, 0x86, 0xF4, 0x86, 0xF5, /* 0x8C-0x8F */ + 0x86, 0xF6, 0x86, 0xF7, 0xB3, 0xF4, 0xB3, 0xF5, /* 0x90-0x93 */ + 0xB3, 0xF6, 0x86, 0xF8, 0x86, 0xF9, 0x86, 0xFA, /* 0x94-0x97 */ + 0xB3, 0xF7, 0x86, 0xFB, 0x86, 0xFC, 0x86, 0xFD, /* 0x98-0x9B */ + 0xB3, 0xF8, 0x86, 0xFE, 0x87, 0x41, 0x87, 0x42, /* 0x9C-0x9F */ + 0x87, 0x43, 0x87, 0x44, 0x87, 0x45, 0x87, 0x46, /* 0xA0-0xA3 */ + 0x87, 0x47, 0x87, 0x48, 0x87, 0x49, 0x87, 0x4A, /* 0xA4-0xA7 */ + 0xB3, 0xF9, 0x87, 0x4B, 0x87, 0x4C, 0x87, 0x4D, /* 0xA8-0xAB */ + 0x87, 0x4E, 0x87, 0x4F, 0x87, 0x50, 0x87, 0x51, /* 0xAC-0xAF */ + 0x87, 0x52, 0x87, 0x53, 0x87, 0x54, 0x87, 0x55, /* 0xB0-0xB3 */ + 0x87, 0x56, 0x87, 0x57, 0x87, 0x58, 0x87, 0x59, /* 0xB4-0xB7 */ + 0x87, 0x5A, 0x87, 0x61, 0x87, 0x62, 0x87, 0x63, /* 0xB8-0xBB */ + 0x87, 0x64, 0x87, 0x65, 0x87, 0x66, 0x87, 0x67, /* 0xBC-0xBF */ + 0x87, 0x68, 0x87, 0x69, 0x87, 0x6A, 0x87, 0x6B, /* 0xC0-0xC3 */ + 0x87, 0x6C, 0x87, 0x6D, 0x87, 0x6E, 0x87, 0x6F, /* 0xC4-0xC7 */ + 0x87, 0x70, 0x87, 0x71, 0x87, 0x72, 0x87, 0x73, /* 0xC8-0xCB */ + 0xB3, 0xFA, 0x87, 0x74, 0x87, 0x75, 0x87, 0x76, /* 0xCC-0xCF */ + 0xB3, 0xFB, 0x87, 0x77, 0x87, 0x78, 0x87, 0x79, /* 0xD0-0xD3 */ + 0xB3, 0xFC, 0x87, 0x7A, 0x87, 0x81, 0x87, 0x82, /* 0xD4-0xD7 */ + 0x87, 0x83, 0x87, 0x84, 0x87, 0x85, 0x87, 0x86, /* 0xD8-0xDB */ + 0xB3, 0xFD, 0xB3, 0xFE, 0x87, 0x87, 0xB4, 0xA1, /* 0xDC-0xDF */ + 0x87, 0x88, 0x87, 0x89, 0x87, 0x8A, 0x87, 0x8B, /* 0xE0-0xE3 
*/ + 0x87, 0x8C, 0x87, 0x8D, 0x87, 0x8E, 0x87, 0x8F, /* 0xE4-0xE7 */ + 0xB4, 0xA2, 0xB4, 0xA3, 0x87, 0x90, 0x87, 0x91, /* 0xE8-0xEB */ + 0xB4, 0xA4, 0x87, 0x92, 0x87, 0x93, 0x87, 0x94, /* 0xEC-0xEF */ + 0xB4, 0xA5, 0x87, 0x95, 0x87, 0x96, 0x87, 0x97, /* 0xF0-0xF3 */ + 0x87, 0x98, 0x87, 0x99, 0x87, 0x9A, 0x87, 0x9B, /* 0xF4-0xF7 */ + 0x87, 0x9C, 0xB4, 0xA6, 0x87, 0x9D, 0xB4, 0xA7, /* 0xF8-0xFB */ + 0x87, 0x9E, 0xB4, 0xA8, 0x87, 0x9F, 0x87, 0xA0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B2[512] = { + 0x87, 0xA1, 0x87, 0xA2, 0x87, 0xA3, 0x87, 0xA4, /* 0x00-0x03 */ + 0xB4, 0xA9, 0xB4, 0xAA, 0x87, 0xA5, 0x87, 0xA6, /* 0x04-0x07 */ + 0xB4, 0xAB, 0x87, 0xA7, 0x87, 0xA8, 0xB4, 0xAC, /* 0x08-0x0B */ + 0xB4, 0xAD, 0x87, 0xA9, 0x87, 0xAA, 0x87, 0xAB, /* 0x0C-0x0F */ + 0x87, 0xAC, 0x87, 0xAD, 0x87, 0xAE, 0x87, 0xAF, /* 0x10-0x13 */ + 0xB4, 0xAE, 0xB4, 0xAF, 0x87, 0xB0, 0xB4, 0xB0, /* 0x14-0x17 */ + 0x87, 0xB1, 0xB4, 0xB1, 0x87, 0xB2, 0x87, 0xB3, /* 0x18-0x1B */ + 0x87, 0xB4, 0x87, 0xB5, 0x87, 0xB6, 0x87, 0xB7, /* 0x1C-0x1F */ + 0xB4, 0xB2, 0x87, 0xB8, 0x87, 0xB9, 0x87, 0xBA, /* 0x20-0x23 */ + 0x87, 0xBB, 0x87, 0xBC, 0x87, 0xBD, 0x87, 0xBE, /* 0x24-0x27 */ + 0x87, 0xBF, 0x87, 0xC0, 0x87, 0xC1, 0x87, 0xC2, /* 0x28-0x2B */ + 0x87, 0xC3, 0x87, 0xC4, 0x87, 0xC5, 0x87, 0xC6, /* 0x2C-0x2F */ + 0x87, 0xC7, 0x87, 0xC8, 0x87, 0xC9, 0x87, 0xCA, /* 0x30-0x33 */ + 0xB4, 0xB3, 0x87, 0xCB, 0x87, 0xCC, 0x87, 0xCD, /* 0x34-0x37 */ + 0x87, 0xCE, 0x87, 0xCF, 0x87, 0xD0, 0x87, 0xD1, /* 0x38-0x3B */ + 0xB4, 0xB4, 0x87, 0xD2, 0x87, 0xD3, 0x87, 0xD4, /* 0x3C-0x3F */ + 0x87, 0xD5, 0x87, 0xD6, 0x87, 0xD7, 0x87, 0xD8, /* 0x40-0x43 */ + 0x87, 0xD9, 0x87, 0xDA, 0x87, 0xDB, 0x87, 0xDC, /* 0x44-0x47 */ + 0x87, 0xDD, 0x87, 0xDE, 0x87, 0xDF, 0x87, 0xE0, /* 0x48-0x4B */ + 0x87, 0xE1, 0x87, 0xE2, 0x87, 0xE3, 0x87, 0xE4, /* 0x4C-0x4F */ + 0x87, 0xE5, 0x87, 0xE6, 0x87, 0xE7, 0x87, 0xE8, /* 0x50-0x53 */ + 0x87, 0xE9, 0x87, 0xEA, 0x87, 0xEB, 0x87, 0xEC, /* 0x54-0x57 */ + 0xB4, 0xB5, 0x87, 0xED, 0x87, 0xEE, 0x87, 0xEF, /* 0x58-0x5B */ + 0xB4, 0xB6, 0x87, 0xF0, 0x87, 0xF1, 0x87, 0xF2, /* 0x5C-0x5F */ + 0xB4, 0xB7, 0x87, 0xF3, 0x87, 0xF4, 0x87, 0xF5, /* 0x60-0x63 */ + 0x87, 0xF6, 0x87, 0xF7, 0x87, 0xF8, 0x87, 0xF9, /* 0x64-0x67 */ + 0xB4, 0xB8, 0xB4, 0xB9, 0x87, 0xFA, 0x87, 0xFB, /* 0x68-0x6B */ + 0x87, 0xFC, 0x87, 0xFD, 0x87, 0xFE, 0x88, 0x41, /* 0x6C-0x6F */ + 0x88, 0x42, 0x88, 0x43, 0x88, 0x44, 0x88, 0x45, /* 0x70-0x73 */ + 0xB4, 0xBA, 0xB4, 0xBB, 0x88, 0x46, 0x88, 0x47, /* 0x74-0x77 */ + 0x88, 0x48, 0x88, 0x49, 0x88, 0x4A, 0x88, 0x4B, /* 0x78-0x7B */ + 0xB4, 0xBC, 0x88, 0x4C, 0x88, 0x4D, 0x88, 0x4E, /* 0x7C-0x7F */ + + 0x88, 0x4F, 0x88, 0x50, 0x88, 0x51, 0x88, 0x52, /* 0x80-0x83 */ + 0xB4, 0xBD, 0xB4, 0xBE, 0x88, 0x53, 0x88, 0x54, /* 0x84-0x87 */ + 0x88, 0x55, 0xB4, 0xBF, 0x88, 0x56, 0x88, 0x57, /* 0x88-0x8B */ + 0x88, 0x58, 0x88, 0x59, 0x88, 0x5A, 0x88, 0x61, /* 0x8C-0x8F */ + 0xB4, 0xC0, 0xB4, 0xC1, 0x88, 0x62, 0x88, 0x63, /* 0x90-0x93 */ + 0xB4, 0xC2, 0x88, 0x64, 0x88, 0x65, 0x88, 0x66, /* 0x94-0x97 */ + 0xB4, 0xC3, 0xB4, 0xC4, 0xB4, 0xC5, 0x88, 0x67, /* 0x98-0x9B */ + 0x88, 0x68, 0x88, 0x69, 0x88, 0x6A, 0x88, 0x6B, /* 0x9C-0x9F */ + 0xB4, 0xC6, 0xB4, 0xC7, 0x88, 0x6C, 0xB4, 0xC8, /* 0xA0-0xA3 */ + 0x88, 0x6D, 0xB4, 0xC9, 0xB4, 0xCA, 0x88, 0x6E, /* 0xA4-0xA7 */ + 0x88, 0x6F, 0x88, 0x70, 0xB4, 0xCB, 0x88, 0x71, /* 0xA8-0xAB */ + 0xB4, 0xCC, 0x88, 0x72, 0x88, 0x73, 0x88, 0x74, /* 0xAC-0xAF */ + 0xB4, 0xCD, 0x88, 0x75, 0x88, 0x76, 0x88, 0x77, /* 0xB0-0xB3 */ + 0xB4, 0xCE, 0x88, 0x78, 0x88, 0x79, 0x88, 0x7A, /* 0xB4-0xB7 */ + 
0x88, 0x81, 0x88, 0x82, 0x88, 0x83, 0x88, 0x84, /* 0xB8-0xBB */ + 0x88, 0x85, 0x88, 0x86, 0x88, 0x87, 0x88, 0x88, /* 0xBC-0xBF */ + 0x88, 0x89, 0x88, 0x8A, 0x88, 0x8B, 0x88, 0x8C, /* 0xC0-0xC3 */ + 0x88, 0x8D, 0x88, 0x8E, 0x88, 0x8F, 0x88, 0x90, /* 0xC4-0xC7 */ + 0xB4, 0xCF, 0xB4, 0xD0, 0x88, 0x91, 0x88, 0x92, /* 0xC8-0xCB */ + 0xB4, 0xD1, 0x88, 0x93, 0x88, 0x94, 0x88, 0x95, /* 0xCC-0xCF */ + 0xB4, 0xD2, 0x88, 0x96, 0xB4, 0xD3, 0x88, 0x97, /* 0xD0-0xD3 */ + 0x88, 0x98, 0x88, 0x99, 0x88, 0x9A, 0x88, 0x9B, /* 0xD4-0xD7 */ + 0xB4, 0xD4, 0xB4, 0xD5, 0x88, 0x9C, 0xB4, 0xD6, /* 0xD8-0xDB */ + 0x88, 0x9D, 0xB4, 0xD7, 0x88, 0x9E, 0x88, 0x9F, /* 0xDC-0xDF */ + 0x88, 0xA0, 0x88, 0xA1, 0xB4, 0xD8, 0x88, 0xA2, /* 0xE0-0xE3 */ + 0xB4, 0xD9, 0xB4, 0xDA, 0xB4, 0xDB, 0x88, 0xA3, /* 0xE4-0xE7 */ + 0xB4, 0xDC, 0x88, 0xA4, 0x88, 0xA5, 0xB4, 0xDD, /* 0xE8-0xEB */ + 0xB4, 0xDE, 0xB4, 0xDF, 0xB4, 0xE0, 0xB4, 0xE1, /* 0xEC-0xEF */ + 0x88, 0xA6, 0x88, 0xA7, 0x88, 0xA8, 0xB4, 0xE2, /* 0xF0-0xF3 */ + 0xB4, 0xE3, 0xB4, 0xE4, 0x88, 0xA9, 0xB4, 0xE5, /* 0xF4-0xF7 */ + 0xB4, 0xE6, 0xB4, 0xE7, 0xB4, 0xE8, 0xB4, 0xE9, /* 0xF8-0xFB */ + 0x88, 0xAA, 0x88, 0xAB, 0x88, 0xAC, 0xB4, 0xEA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B3[512] = { + 0xB4, 0xEB, 0xB4, 0xEC, 0x88, 0xAD, 0x88, 0xAE, /* 0x00-0x03 */ + 0xB4, 0xED, 0x88, 0xAF, 0x88, 0xB0, 0x88, 0xB1, /* 0x04-0x07 */ + 0xB4, 0xEE, 0x88, 0xB2, 0x88, 0xB3, 0x88, 0xB4, /* 0x08-0x0B */ + 0x88, 0xB5, 0x88, 0xB6, 0x88, 0xB7, 0x88, 0xB8, /* 0x0C-0x0F */ + 0xB4, 0xEF, 0xB4, 0xF0, 0x88, 0xB9, 0xB4, 0xF1, /* 0x10-0x13 */ + 0xB4, 0xF2, 0xB4, 0xF3, 0x88, 0xBA, 0x88, 0xBB, /* 0x14-0x17 */ + 0x88, 0xBC, 0x88, 0xBD, 0x88, 0xBE, 0x88, 0xBF, /* 0x18-0x1B */ + 0xB4, 0xF4, 0x88, 0xC0, 0x88, 0xC1, 0x88, 0xC2, /* 0x1C-0x1F */ + 0x88, 0xC3, 0x88, 0xC4, 0x88, 0xC5, 0x88, 0xC6, /* 0x20-0x23 */ + 0x88, 0xC7, 0x88, 0xC8, 0x88, 0xC9, 0x88, 0xCA, /* 0x24-0x27 */ + 0x88, 0xCB, 0x88, 0xCC, 0x88, 0xCD, 0x88, 0xCE, /* 0x28-0x2B */ + 0x88, 0xCF, 0x88, 0xD0, 0x88, 0xD1, 0x88, 0xD2, /* 0x2C-0x2F */ + 0x88, 0xD3, 0x88, 0xD4, 0x88, 0xD5, 0x88, 0xD6, /* 0x30-0x33 */ + 0x88, 0xD7, 0x88, 0xD8, 0x88, 0xD9, 0x88, 0xDA, /* 0x34-0x37 */ + 0x88, 0xDB, 0x88, 0xDC, 0x88, 0xDD, 0x88, 0xDE, /* 0x38-0x3B */ + 0x88, 0xDF, 0x88, 0xE0, 0x88, 0xE1, 0x88, 0xE2, /* 0x3C-0x3F */ + 0x88, 0xE3, 0x88, 0xE4, 0x88, 0xE5, 0x88, 0xE6, /* 0x40-0x43 */ + 0x88, 0xE7, 0x88, 0xE8, 0x88, 0xE9, 0x88, 0xEA, /* 0x44-0x47 */ + 0x88, 0xEB, 0x88, 0xEC, 0x88, 0xED, 0x88, 0xEE, /* 0x48-0x4B */ + 0x88, 0xEF, 0x88, 0xF0, 0x88, 0xF1, 0x88, 0xF2, /* 0x4C-0x4F */ + 0x88, 0xF3, 0x88, 0xF4, 0x88, 0xF5, 0x88, 0xF6, /* 0x50-0x53 */ + 0xB4, 0xF5, 0xB4, 0xF6, 0xB4, 0xF7, 0x88, 0xF7, /* 0x54-0x57 */ + 0xB4, 0xF8, 0x88, 0xF8, 0x88, 0xF9, 0xB4, 0xF9, /* 0x58-0x5B */ + 0xB4, 0xFA, 0x88, 0xFA, 0xB4, 0xFB, 0xB4, 0xFC, /* 0x5C-0x5F */ + 0x88, 0xFB, 0x88, 0xFC, 0x88, 0xFD, 0x88, 0xFE, /* 0x60-0x63 */ + 0xB4, 0xFD, 0xB4, 0xFE, 0x89, 0x41, 0xB5, 0xA1, /* 0x64-0x67 */ + 0x89, 0x42, 0xB5, 0xA2, 0x89, 0x43, 0xB5, 0xA3, /* 0x68-0x6B */ + 0x89, 0x44, 0x89, 0x45, 0xB5, 0xA4, 0x89, 0x46, /* 0x6C-0x6F */ + 0xB5, 0xA5, 0xB5, 0xA6, 0x89, 0x47, 0x89, 0x48, /* 0x70-0x73 */ + 0xB5, 0xA7, 0x89, 0x49, 0x89, 0x4A, 0x89, 0x4B, /* 0x74-0x77 */ + 0xB5, 0xA8, 0x89, 0x4C, 0x89, 0x4D, 0x89, 0x4E, /* 0x78-0x7B */ + 0x89, 0x4F, 0x89, 0x50, 0x89, 0x51, 0x89, 0x52, /* 0x7C-0x7F */ + + 0xB5, 0xA9, 0xB5, 0xAA, 0x89, 0x53, 0xB5, 0xAB, /* 0x80-0x83 */ + 0xB5, 0xAC, 0xB5, 0xAD, 0x89, 0x54, 0x89, 0x55, /* 0x84-0x87 */ + 0x89, 0x56, 0x89, 0x57, 0x89, 0x58, 0x89, 0x59, /* 0x88-0x8B */ + 
0xB5, 0xAE, 0x89, 0x5A, 0x89, 0x61, 0x89, 0x62, /* 0x8C-0x8F */ + 0xB5, 0xAF, 0x89, 0x63, 0x89, 0x64, 0x89, 0x65, /* 0x90-0x93 */ + 0xB5, 0xB0, 0x89, 0x66, 0x89, 0x67, 0x89, 0x68, /* 0x94-0x97 */ + 0x89, 0x69, 0x89, 0x6A, 0x89, 0x6B, 0x89, 0x6C, /* 0x98-0x9B */ + 0x89, 0x6D, 0x89, 0x6E, 0x89, 0x6F, 0x89, 0x70, /* 0x9C-0x9F */ + 0xB5, 0xB1, 0xB5, 0xB2, 0x89, 0x71, 0x89, 0x72, /* 0xA0-0xA3 */ + 0x89, 0x73, 0x89, 0x74, 0x89, 0x75, 0x89, 0x76, /* 0xA4-0xA7 */ + 0xB5, 0xB3, 0x89, 0x77, 0x89, 0x78, 0x89, 0x79, /* 0xA8-0xAB */ + 0xB5, 0xB4, 0x89, 0x7A, 0x89, 0x81, 0x89, 0x82, /* 0xAC-0xAF */ + 0x89, 0x83, 0x89, 0x84, 0x89, 0x85, 0x89, 0x86, /* 0xB0-0xB3 */ + 0x89, 0x87, 0x89, 0x88, 0x89, 0x89, 0x89, 0x8A, /* 0xB4-0xB7 */ + 0x89, 0x8B, 0x89, 0x8C, 0x89, 0x8D, 0x89, 0x8E, /* 0xB8-0xBB */ + 0x89, 0x8F, 0x89, 0x90, 0x89, 0x91, 0x89, 0x92, /* 0xBC-0xBF */ + 0x89, 0x93, 0x89, 0x94, 0x89, 0x95, 0x89, 0x96, /* 0xC0-0xC3 */ + 0xB5, 0xB5, 0xB5, 0xB6, 0x89, 0x97, 0x89, 0x98, /* 0xC4-0xC7 */ + 0xB5, 0xB7, 0x89, 0x99, 0x89, 0x9A, 0xB5, 0xB8, /* 0xC8-0xCB */ + 0xB5, 0xB9, 0x89, 0x9B, 0xB5, 0xBA, 0x89, 0x9C, /* 0xCC-0xCF */ + 0xB5, 0xBB, 0x89, 0x9D, 0x89, 0x9E, 0x89, 0x9F, /* 0xD0-0xD3 */ + 0xB5, 0xBC, 0xB5, 0xBD, 0x89, 0xA0, 0xB5, 0xBE, /* 0xD4-0xD7 */ + 0x89, 0xA1, 0xB5, 0xBF, 0x89, 0xA2, 0xB5, 0xC0, /* 0xD8-0xDB */ + 0x89, 0xA3, 0xB5, 0xC1, 0x89, 0xA4, 0x89, 0xA5, /* 0xDC-0xDF */ + 0xB5, 0xC2, 0x89, 0xA6, 0x89, 0xA7, 0x89, 0xA8, /* 0xE0-0xE3 */ + 0xB5, 0xC3, 0x89, 0xA9, 0x89, 0xAA, 0x89, 0xAB, /* 0xE4-0xE7 */ + 0xB5, 0xC4, 0x89, 0xAC, 0x89, 0xAD, 0x89, 0xAE, /* 0xE8-0xEB */ + 0x89, 0xAF, 0x89, 0xB0, 0x89, 0xB1, 0x89, 0xB2, /* 0xEC-0xEF */ + 0x89, 0xB3, 0x89, 0xB4, 0x89, 0xB5, 0x89, 0xB6, /* 0xF0-0xF3 */ + 0x89, 0xB7, 0x89, 0xB8, 0x89, 0xB9, 0x89, 0xBA, /* 0xF4-0xF7 */ + 0x89, 0xBB, 0x89, 0xBC, 0x89, 0xBD, 0x89, 0xBE, /* 0xF8-0xFB */ + 0xB5, 0xC5, 0x89, 0xBF, 0x89, 0xC0, 0x89, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B4[512] = { + 0x89, 0xC2, 0x89, 0xC3, 0x89, 0xC4, 0x89, 0xC5, /* 0x00-0x03 */ + 0x89, 0xC6, 0x89, 0xC7, 0x89, 0xC8, 0x89, 0xC9, /* 0x04-0x07 */ + 0x89, 0xCA, 0x89, 0xCB, 0x89, 0xCC, 0x89, 0xCD, /* 0x08-0x0B */ + 0x89, 0xCE, 0x89, 0xCF, 0x89, 0xD0, 0x89, 0xD1, /* 0x0C-0x0F */ + 0xB5, 0xC6, 0x89, 0xD2, 0x89, 0xD3, 0x89, 0xD4, /* 0x10-0x13 */ + 0x89, 0xD5, 0x89, 0xD6, 0x89, 0xD7, 0x89, 0xD8, /* 0x14-0x17 */ + 0xB5, 0xC7, 0x89, 0xD9, 0x89, 0xDA, 0x89, 0xDB, /* 0x18-0x1B */ + 0xB5, 0xC8, 0x89, 0xDC, 0x89, 0xDD, 0x89, 0xDE, /* 0x1C-0x1F */ + 0xB5, 0xC9, 0x89, 0xDF, 0x89, 0xE0, 0x89, 0xE1, /* 0x20-0x23 */ + 0x89, 0xE2, 0x89, 0xE3, 0x89, 0xE4, 0x89, 0xE5, /* 0x24-0x27 */ + 0xB5, 0xCA, 0xB5, 0xCB, 0x89, 0xE6, 0xB5, 0xCC, /* 0x28-0x2B */ + 0x89, 0xE7, 0x89, 0xE8, 0x89, 0xE9, 0x89, 0xEA, /* 0x2C-0x2F */ + 0x89, 0xEB, 0x89, 0xEC, 0x89, 0xED, 0x89, 0xEE, /* 0x30-0x33 */ + 0xB5, 0xCD, 0x89, 0xEF, 0x89, 0xF0, 0x89, 0xF1, /* 0x34-0x37 */ + 0x89, 0xF2, 0x89, 0xF3, 0x89, 0xF4, 0x89, 0xF5, /* 0x38-0x3B */ + 0x89, 0xF6, 0x89, 0xF7, 0x89, 0xF8, 0x89, 0xF9, /* 0x3C-0x3F */ + 0x89, 0xFA, 0x89, 0xFB, 0x89, 0xFC, 0x89, 0xFD, /* 0x40-0x43 */ + 0x89, 0xFE, 0x8A, 0x41, 0x8A, 0x42, 0x8A, 0x43, /* 0x44-0x47 */ + 0x8A, 0x44, 0x8A, 0x45, 0x8A, 0x46, 0x8A, 0x47, /* 0x48-0x4B */ + 0x8A, 0x48, 0x8A, 0x49, 0x8A, 0x4A, 0x8A, 0x4B, /* 0x4C-0x4F */ + 0xB5, 0xCE, 0xB5, 0xCF, 0x8A, 0x4C, 0x8A, 0x4D, /* 0x50-0x53 */ + 0xB5, 0xD0, 0x8A, 0x4E, 0x8A, 0x4F, 0x8A, 0x50, /* 0x54-0x57 */ + 0xB5, 0xD1, 0x8A, 0x51, 0x8A, 0x52, 0x8A, 0x53, /* 0x58-0x5B */ + 0x8A, 0x54, 0x8A, 0x55, 0x8A, 0x56, 0x8A, 0x57, /* 0x5C-0x5F */ + 0xB5, 
0xD2, 0xB5, 0xD3, 0x8A, 0x58, 0xB5, 0xD4, /* 0x60-0x63 */ + 0x8A, 0x59, 0xB5, 0xD5, 0x8A, 0x5A, 0x8A, 0x61, /* 0x64-0x67 */ + 0x8A, 0x62, 0x8A, 0x63, 0x8A, 0x64, 0x8A, 0x65, /* 0x68-0x6B */ + 0xB5, 0xD6, 0x8A, 0x66, 0x8A, 0x67, 0x8A, 0x68, /* 0x6C-0x6F */ + 0x8A, 0x69, 0x8A, 0x6A, 0x8A, 0x6B, 0x8A, 0x6C, /* 0x70-0x73 */ + 0x8A, 0x6D, 0x8A, 0x6E, 0x8A, 0x6F, 0x8A, 0x70, /* 0x74-0x77 */ + 0x8A, 0x71, 0x8A, 0x72, 0x8A, 0x73, 0x8A, 0x74, /* 0x78-0x7B */ + 0x8A, 0x75, 0x8A, 0x76, 0x8A, 0x77, 0x8A, 0x78, /* 0x7C-0x7F */ + + 0xB5, 0xD7, 0x8A, 0x79, 0x8A, 0x7A, 0x8A, 0x81, /* 0x80-0x83 */ + 0x8A, 0x82, 0x8A, 0x83, 0x8A, 0x84, 0x8A, 0x85, /* 0x84-0x87 */ + 0xB5, 0xD8, 0x8A, 0x86, 0x8A, 0x87, 0x8A, 0x88, /* 0x88-0x8B */ + 0x8A, 0x89, 0x8A, 0x8A, 0x8A, 0x8B, 0x8A, 0x8C, /* 0x8C-0x8F */ + 0x8A, 0x8D, 0x8A, 0x8E, 0x8A, 0x8F, 0x8A, 0x90, /* 0x90-0x93 */ + 0x8A, 0x91, 0x8A, 0x92, 0x8A, 0x93, 0x8A, 0x94, /* 0x94-0x97 */ + 0x8A, 0x95, 0x8A, 0x96, 0x8A, 0x97, 0x8A, 0x98, /* 0x98-0x9B */ + 0x8A, 0x99, 0xB5, 0xD9, 0x8A, 0x9A, 0x8A, 0x9B, /* 0x9C-0x9F */ + 0x8A, 0x9C, 0x8A, 0x9D, 0x8A, 0x9E, 0x8A, 0x9F, /* 0xA0-0xA3 */ + 0xB5, 0xDA, 0x8A, 0xA0, 0x8A, 0xA1, 0x8A, 0xA2, /* 0xA4-0xA7 */ + 0xB5, 0xDB, 0x8A, 0xA3, 0x8A, 0xA4, 0x8A, 0xA5, /* 0xA8-0xAB */ + 0xB5, 0xDC, 0x8A, 0xA6, 0x8A, 0xA7, 0x8A, 0xA8, /* 0xAC-0xAF */ + 0x8A, 0xA9, 0x8A, 0xAA, 0x8A, 0xAB, 0x8A, 0xAC, /* 0xB0-0xB3 */ + 0x8A, 0xAD, 0xB5, 0xDD, 0x8A, 0xAE, 0xB5, 0xDE, /* 0xB4-0xB7 */ + 0x8A, 0xAF, 0xB5, 0xDF, 0x8A, 0xB0, 0x8A, 0xB1, /* 0xB8-0xBB */ + 0x8A, 0xB2, 0x8A, 0xB3, 0x8A, 0xB4, 0x8A, 0xB5, /* 0xBC-0xBF */ + 0xB5, 0xE0, 0x8A, 0xB6, 0x8A, 0xB7, 0x8A, 0xB8, /* 0xC0-0xC3 */ + 0xB5, 0xE1, 0x8A, 0xB9, 0x8A, 0xBA, 0x8A, 0xBB, /* 0xC4-0xC7 */ + 0xB5, 0xE2, 0x8A, 0xBC, 0x8A, 0xBD, 0x8A, 0xBE, /* 0xC8-0xCB */ + 0x8A, 0xBF, 0x8A, 0xC0, 0x8A, 0xC1, 0x8A, 0xC2, /* 0xCC-0xCF */ + 0xB5, 0xE3, 0x8A, 0xC3, 0x8A, 0xC4, 0x8A, 0xC5, /* 0xD0-0xD3 */ + 0x8A, 0xC6, 0xB5, 0xE4, 0x8A, 0xC7, 0x8A, 0xC8, /* 0xD4-0xD7 */ + 0x8A, 0xC9, 0x8A, 0xCA, 0x8A, 0xCB, 0x8A, 0xCC, /* 0xD8-0xDB */ + 0xB5, 0xE5, 0xB5, 0xE6, 0x8A, 0xCD, 0x8A, 0xCE, /* 0xDC-0xDF */ + 0xB5, 0xE7, 0x8A, 0xCF, 0x8A, 0xD0, 0xB5, 0xE8, /* 0xE0-0xE3 */ + 0xB5, 0xE9, 0x8A, 0xD1, 0xB5, 0xEA, 0x8A, 0xD2, /* 0xE4-0xE7 */ + 0x8A, 0xD3, 0x8A, 0xD4, 0x8A, 0xD5, 0x8A, 0xD6, /* 0xE8-0xEB */ + 0xB5, 0xEB, 0xB5, 0xEC, 0x8A, 0xD7, 0xB5, 0xED, /* 0xEC-0xEF */ + 0x8A, 0xD8, 0xB5, 0xEE, 0x8A, 0xD9, 0x8A, 0xDA, /* 0xF0-0xF3 */ + 0x8A, 0xDB, 0x8A, 0xDC, 0x8A, 0xDD, 0x8A, 0xDE, /* 0xF4-0xF7 */ + 0xB5, 0xEF, 0x8A, 0xDF, 0x8A, 0xE0, 0x8A, 0xE1, /* 0xF8-0xFB */ + 0x8A, 0xE2, 0x8A, 0xE3, 0x8A, 0xE4, 0x8A, 0xE5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B5[512] = { + 0x8A, 0xE6, 0x8A, 0xE7, 0x8A, 0xE8, 0x8A, 0xE9, /* 0x00-0x03 */ + 0x8A, 0xEA, 0x8A, 0xEB, 0x8A, 0xEC, 0x8A, 0xED, /* 0x04-0x07 */ + 0x8A, 0xEE, 0x8A, 0xEF, 0x8A, 0xF0, 0x8A, 0xF1, /* 0x08-0x0B */ + 0x8A, 0xF2, 0x8A, 0xF3, 0x8A, 0xF4, 0x8A, 0xF5, /* 0x0C-0x0F */ + 0x8A, 0xF6, 0x8A, 0xF7, 0x8A, 0xF8, 0x8A, 0xF9, /* 0x10-0x13 */ + 0xB5, 0xF0, 0xB5, 0xF1, 0x8A, 0xFA, 0x8A, 0xFB, /* 0x14-0x17 */ + 0xB5, 0xF2, 0x8A, 0xFC, 0x8A, 0xFD, 0xB5, 0xF3, /* 0x18-0x1B */ + 0xB5, 0xF4, 0x8A, 0xFE, 0x8B, 0x41, 0x8B, 0x42, /* 0x1C-0x1F */ + 0x8B, 0x43, 0x8B, 0x44, 0x8B, 0x45, 0x8B, 0x46, /* 0x20-0x23 */ + 0xB5, 0xF5, 0xB5, 0xF6, 0x8B, 0x47, 0xB5, 0xF7, /* 0x24-0x27 */ + 0xB5, 0xF8, 0xB5, 0xF9, 0xB5, 0xFA, 0x8B, 0x48, /* 0x28-0x2B */ + 0x8B, 0x49, 0x8B, 0x4A, 0x8B, 0x4B, 0x8B, 0x4C, /* 0x2C-0x2F */ + 0xB5, 0xFB, 0xB5, 0xFC, 0x8B, 0x4D, 0x8B, 0x4E, /* 0x30-0x33 */ + 0xB5, 
0xFD, 0x8B, 0x4F, 0x8B, 0x50, 0x8B, 0x51, /* 0x34-0x37 */ + 0xB5, 0xFE, 0x8B, 0x52, 0x8B, 0x53, 0x8B, 0x54, /* 0x38-0x3B */ + 0x8B, 0x55, 0x8B, 0x56, 0x8B, 0x57, 0x8B, 0x58, /* 0x3C-0x3F */ + 0xB6, 0xA1, 0xB6, 0xA2, 0x8B, 0x59, 0xB6, 0xA3, /* 0x40-0x43 */ + 0xB6, 0xA4, 0xB6, 0xA5, 0x8B, 0x5A, 0x8B, 0x61, /* 0x44-0x47 */ + 0x8B, 0x62, 0x8B, 0x63, 0x8B, 0x64, 0xB6, 0xA6, /* 0x48-0x4B */ + 0xB6, 0xA7, 0xB6, 0xA8, 0x8B, 0x65, 0x8B, 0x66, /* 0x4C-0x4F */ + 0xB6, 0xA9, 0x8B, 0x67, 0x8B, 0x68, 0x8B, 0x69, /* 0x50-0x53 */ + 0xB6, 0xAA, 0x8B, 0x6A, 0x8B, 0x6B, 0x8B, 0x6C, /* 0x54-0x57 */ + 0x8B, 0x6D, 0x8B, 0x6E, 0x8B, 0x6F, 0x8B, 0x70, /* 0x58-0x5B */ + 0xB6, 0xAB, 0xB6, 0xAC, 0x8B, 0x71, 0xB6, 0xAD, /* 0x5C-0x5F */ + 0xB6, 0xAE, 0xB6, 0xAF, 0x8B, 0x72, 0x8B, 0x73, /* 0x60-0x63 */ + 0x8B, 0x74, 0x8B, 0x75, 0x8B, 0x76, 0x8B, 0x77, /* 0x64-0x67 */ + 0x8B, 0x78, 0x8B, 0x79, 0x8B, 0x7A, 0x8B, 0x81, /* 0x68-0x6B */ + 0x8B, 0x82, 0x8B, 0x83, 0x8B, 0x84, 0x8B, 0x85, /* 0x6C-0x6F */ + 0x8B, 0x86, 0x8B, 0x87, 0x8B, 0x88, 0x8B, 0x89, /* 0x70-0x73 */ + 0x8B, 0x8A, 0x8B, 0x8B, 0x8B, 0x8C, 0x8B, 0x8D, /* 0x74-0x77 */ + 0x8B, 0x8E, 0x8B, 0x8F, 0x8B, 0x90, 0x8B, 0x91, /* 0x78-0x7B */ + 0x8B, 0x92, 0x8B, 0x93, 0x8B, 0x94, 0x8B, 0x95, /* 0x7C-0x7F */ + + 0x8B, 0x96, 0x8B, 0x97, 0x8B, 0x98, 0x8B, 0x99, /* 0x80-0x83 */ + 0x8B, 0x9A, 0x8B, 0x9B, 0x8B, 0x9C, 0x8B, 0x9D, /* 0x84-0x87 */ + 0x8B, 0x9E, 0x8B, 0x9F, 0x8B, 0xA0, 0x8B, 0xA1, /* 0x88-0x8B */ + 0x8B, 0xA2, 0x8B, 0xA3, 0x8B, 0xA4, 0x8B, 0xA5, /* 0x8C-0x8F */ + 0x8B, 0xA6, 0x8B, 0xA7, 0x8B, 0xA8, 0x8B, 0xA9, /* 0x90-0x93 */ + 0x8B, 0xAA, 0x8B, 0xAB, 0x8B, 0xAC, 0x8B, 0xAD, /* 0x94-0x97 */ + 0x8B, 0xAE, 0x8B, 0xAF, 0x8B, 0xB0, 0x8B, 0xB1, /* 0x98-0x9B */ + 0x8B, 0xB2, 0x8B, 0xB3, 0x8B, 0xB4, 0x8B, 0xB5, /* 0x9C-0x9F */ + 0xB6, 0xB0, 0xB6, 0xB1, 0x8B, 0xB6, 0x8B, 0xB7, /* 0xA0-0xA3 */ + 0xB6, 0xB2, 0x8B, 0xB8, 0x8B, 0xB9, 0x8B, 0xBA, /* 0xA4-0xA7 */ + 0xB6, 0xB3, 0x8B, 0xBB, 0xB6, 0xB4, 0xB6, 0xB5, /* 0xA8-0xAB */ + 0x8B, 0xBC, 0x8B, 0xBD, 0x8B, 0xBE, 0x8B, 0xBF, /* 0xAC-0xAF */ + 0xB6, 0xB6, 0xB6, 0xB7, 0x8B, 0xC0, 0xB6, 0xB8, /* 0xB0-0xB3 */ + 0xB6, 0xB9, 0xB6, 0xBA, 0x8B, 0xC1, 0x8B, 0xC2, /* 0xB4-0xB7 */ + 0x8B, 0xC3, 0x8B, 0xC4, 0x8B, 0xC5, 0xB6, 0xBB, /* 0xB8-0xBB */ + 0xB6, 0xBC, 0xB6, 0xBD, 0x8B, 0xC6, 0x8B, 0xC7, /* 0xBC-0xBF */ + 0xB6, 0xBE, 0x8B, 0xC8, 0x8B, 0xC9, 0x8B, 0xCA, /* 0xC0-0xC3 */ + 0xB6, 0xBF, 0x8B, 0xCB, 0x8B, 0xCC, 0x8B, 0xCD, /* 0xC4-0xC7 */ + 0x8B, 0xCE, 0x8B, 0xCF, 0x8B, 0xD0, 0x8B, 0xD1, /* 0xC8-0xCB */ + 0xB6, 0xC0, 0xB6, 0xC1, 0x8B, 0xD2, 0xB6, 0xC2, /* 0xCC-0xCF */ + 0xB6, 0xC3, 0xB6, 0xC4, 0x8B, 0xD3, 0x8B, 0xD4, /* 0xD0-0xD3 */ + 0x8B, 0xD5, 0x8B, 0xD6, 0x8B, 0xD7, 0x8B, 0xD8, /* 0xD4-0xD7 */ + 0xB6, 0xC5, 0x8B, 0xD9, 0x8B, 0xDA, 0x8B, 0xDB, /* 0xD8-0xDB */ + 0x8B, 0xDC, 0x8B, 0xDD, 0x8B, 0xDE, 0x8B, 0xDF, /* 0xDC-0xDF */ + 0x8B, 0xE0, 0x8B, 0xE1, 0x8B, 0xE2, 0x8B, 0xE3, /* 0xE0-0xE3 */ + 0x8B, 0xE4, 0x8B, 0xE5, 0x8B, 0xE6, 0x8B, 0xE7, /* 0xE4-0xE7 */ + 0x8B, 0xE8, 0x8B, 0xE9, 0x8B, 0xEA, 0x8B, 0xEB, /* 0xE8-0xEB */ + 0xB6, 0xC6, 0x8B, 0xEC, 0x8B, 0xED, 0x8B, 0xEE, /* 0xEC-0xEF */ + 0x8B, 0xEF, 0x8B, 0xF0, 0x8B, 0xF1, 0x8B, 0xF2, /* 0xF0-0xF3 */ + 0x8B, 0xF3, 0x8B, 0xF4, 0x8B, 0xF5, 0x8B, 0xF6, /* 0xF4-0xF7 */ + 0x8B, 0xF7, 0x8B, 0xF8, 0x8B, 0xF9, 0x8B, 0xFA, /* 0xF8-0xFB */ + 0x8B, 0xFB, 0x8B, 0xFC, 0x8B, 0xFD, 0x8B, 0xFE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B6[512] = { + 0x8C, 0x41, 0x8C, 0x42, 0x8C, 0x43, 0x8C, 0x44, /* 0x00-0x03 */ + 0x8C, 0x45, 0x8C, 0x46, 0x8C, 0x47, 0x8C, 0x48, /* 0x04-0x07 */ + 0x8C, 
0x49, 0x8C, 0x4A, 0x8C, 0x4B, 0x8C, 0x4C, /* 0x08-0x0B */ + 0x8C, 0x4D, 0x8C, 0x4E, 0x8C, 0x4F, 0x8C, 0x50, /* 0x0C-0x0F */ + 0xB6, 0xC7, 0xB6, 0xC8, 0x8C, 0x51, 0x8C, 0x52, /* 0x10-0x13 */ + 0xB6, 0xC9, 0x8C, 0x53, 0x8C, 0x54, 0x8C, 0x55, /* 0x14-0x17 */ + 0xB6, 0xCA, 0x8C, 0x56, 0x8C, 0x57, 0x8C, 0x58, /* 0x18-0x1B */ + 0x8C, 0x59, 0x8C, 0x5A, 0x8C, 0x61, 0x8C, 0x62, /* 0x1C-0x1F */ + 0x8C, 0x63, 0x8C, 0x64, 0x8C, 0x65, 0x8C, 0x66, /* 0x20-0x23 */ + 0x8C, 0x67, 0xB6, 0xCB, 0x8C, 0x68, 0x8C, 0x69, /* 0x24-0x27 */ + 0x8C, 0x6A, 0x8C, 0x6B, 0x8C, 0x6C, 0x8C, 0x6D, /* 0x28-0x2B */ + 0xB6, 0xCC, 0x8C, 0x6E, 0x8C, 0x6F, 0x8C, 0x70, /* 0x2C-0x2F */ + 0x8C, 0x71, 0x8C, 0x72, 0x8C, 0x73, 0x8C, 0x74, /* 0x30-0x33 */ + 0xB6, 0xCD, 0x8C, 0x75, 0x8C, 0x76, 0x8C, 0x77, /* 0x34-0x37 */ + 0x8C, 0x78, 0x8C, 0x79, 0x8C, 0x7A, 0x8C, 0x81, /* 0x38-0x3B */ + 0x8C, 0x82, 0x8C, 0x83, 0x8C, 0x84, 0x8C, 0x85, /* 0x3C-0x3F */ + 0x8C, 0x86, 0x8C, 0x87, 0x8C, 0x88, 0x8C, 0x89, /* 0x40-0x43 */ + 0x8C, 0x8A, 0x8C, 0x8B, 0x8C, 0x8C, 0x8C, 0x8D, /* 0x44-0x47 */ + 0xB6, 0xCE, 0x8C, 0x8E, 0x8C, 0x8F, 0x8C, 0x90, /* 0x48-0x4B */ + 0x8C, 0x91, 0x8C, 0x92, 0x8C, 0x93, 0x8C, 0x94, /* 0x4C-0x4F */ + 0x8C, 0x95, 0x8C, 0x96, 0x8C, 0x97, 0x8C, 0x98, /* 0x50-0x53 */ + 0x8C, 0x99, 0x8C, 0x9A, 0x8C, 0x9B, 0x8C, 0x9C, /* 0x54-0x57 */ + 0x8C, 0x9D, 0x8C, 0x9E, 0x8C, 0x9F, 0x8C, 0xA0, /* 0x58-0x5B */ + 0x8C, 0xA1, 0x8C, 0xA2, 0x8C, 0xA3, 0x8C, 0xA4, /* 0x5C-0x5F */ + 0x8C, 0xA5, 0x8C, 0xA6, 0x8C, 0xA7, 0x8C, 0xA8, /* 0x60-0x63 */ + 0xB6, 0xCF, 0x8C, 0xA9, 0x8C, 0xAA, 0x8C, 0xAB, /* 0x64-0x67 */ + 0xB6, 0xD0, 0x8C, 0xAC, 0x8C, 0xAD, 0x8C, 0xAE, /* 0x68-0x6B */ + 0x8C, 0xAF, 0x8C, 0xB0, 0x8C, 0xB1, 0x8C, 0xB2, /* 0x6C-0x6F */ + 0x8C, 0xB3, 0x8C, 0xB4, 0x8C, 0xB5, 0x8C, 0xB6, /* 0x70-0x73 */ + 0x8C, 0xB7, 0x8C, 0xB8, 0x8C, 0xB9, 0x8C, 0xBA, /* 0x74-0x77 */ + 0x8C, 0xBB, 0x8C, 0xBC, 0x8C, 0xBD, 0x8C, 0xBE, /* 0x78-0x7B */ + 0x8C, 0xBF, 0x8C, 0xC0, 0x8C, 0xC1, 0x8C, 0xC2, /* 0x7C-0x7F */ + + 0x8C, 0xC3, 0x8C, 0xC4, 0x8C, 0xC5, 0x8C, 0xC6, /* 0x80-0x83 */ + 0x8C, 0xC7, 0x8C, 0xC8, 0x8C, 0xC9, 0x8C, 0xCA, /* 0x84-0x87 */ + 0x8C, 0xCB, 0x8C, 0xCC, 0x8C, 0xCD, 0x8C, 0xCE, /* 0x88-0x8B */ + 0x8C, 0xCF, 0x8C, 0xD0, 0x8C, 0xD1, 0x8C, 0xD2, /* 0x8C-0x8F */ + 0x8C, 0xD3, 0x8C, 0xD4, 0x8C, 0xD5, 0x8C, 0xD6, /* 0x90-0x93 */ + 0x8C, 0xD7, 0x8C, 0xD8, 0x8C, 0xD9, 0x8C, 0xDA, /* 0x94-0x97 */ + 0x8C, 0xDB, 0x8C, 0xDC, 0x8C, 0xDD, 0x8C, 0xDE, /* 0x98-0x9B */ + 0xB6, 0xD1, 0xB6, 0xD2, 0x8C, 0xDF, 0x8C, 0xE0, /* 0x9C-0x9F */ + 0xB6, 0xD3, 0x8C, 0xE1, 0x8C, 0xE2, 0x8C, 0xE3, /* 0xA0-0xA3 */ + 0xB6, 0xD4, 0x8C, 0xE4, 0x8C, 0xE5, 0x8C, 0xE6, /* 0xA4-0xA7 */ + 0x8C, 0xE7, 0x8C, 0xE8, 0x8C, 0xE9, 0xB6, 0xD5, /* 0xA8-0xAB */ + 0xB6, 0xD6, 0x8C, 0xEA, 0x8C, 0xEB, 0x8C, 0xEC, /* 0xAC-0xAF */ + 0x8C, 0xED, 0xB6, 0xD7, 0x8C, 0xEE, 0x8C, 0xEF, /* 0xB0-0xB3 */ + 0x8C, 0xF0, 0x8C, 0xF1, 0x8C, 0xF2, 0x8C, 0xF3, /* 0xB4-0xB7 */ + 0x8C, 0xF4, 0x8C, 0xF5, 0x8C, 0xF6, 0x8C, 0xF7, /* 0xB8-0xBB */ + 0x8C, 0xF8, 0x8C, 0xF9, 0x8C, 0xFA, 0x8C, 0xFB, /* 0xBC-0xBF */ + 0x8C, 0xFC, 0x8C, 0xFD, 0x8C, 0xFE, 0x8D, 0x41, /* 0xC0-0xC3 */ + 0x8D, 0x42, 0x8D, 0x43, 0x8D, 0x44, 0x8D, 0x45, /* 0xC4-0xC7 */ + 0x8D, 0x46, 0x8D, 0x47, 0x8D, 0x48, 0x8D, 0x49, /* 0xC8-0xCB */ + 0x8D, 0x4A, 0x8D, 0x4B, 0x8D, 0x4C, 0x8D, 0x4D, /* 0xCC-0xCF */ + 0x8D, 0x4E, 0x8D, 0x4F, 0x8D, 0x50, 0x8D, 0x51, /* 0xD0-0xD3 */ + 0xB6, 0xD8, 0x8D, 0x52, 0x8D, 0x53, 0x8D, 0x54, /* 0xD4-0xD7 */ + 0x8D, 0x55, 0x8D, 0x56, 0x8D, 0x57, 0x8D, 0x58, /* 0xD8-0xDB */ + 0x8D, 0x59, 0x8D, 0x5A, 0x8D, 0x61, 0x8D, 0x62, /* 0xDC-0xDF 
*/ + 0x8D, 0x63, 0x8D, 0x64, 0x8D, 0x65, 0x8D, 0x66, /* 0xE0-0xE3 */ + 0x8D, 0x67, 0x8D, 0x68, 0x8D, 0x69, 0x8D, 0x6A, /* 0xE4-0xE7 */ + 0x8D, 0x6B, 0x8D, 0x6C, 0x8D, 0x6D, 0x8D, 0x6E, /* 0xE8-0xEB */ + 0x8D, 0x6F, 0x8D, 0x70, 0x8D, 0x71, 0x8D, 0x72, /* 0xEC-0xEF */ + 0xB6, 0xD9, 0x8D, 0x73, 0x8D, 0x74, 0x8D, 0x75, /* 0xF0-0xF3 */ + 0xB6, 0xDA, 0x8D, 0x76, 0x8D, 0x77, 0x8D, 0x78, /* 0xF4-0xF7 */ + 0xB6, 0xDB, 0x8D, 0x79, 0x8D, 0x7A, 0x8D, 0x81, /* 0xF8-0xFB */ + 0x8D, 0x82, 0x8D, 0x83, 0x8D, 0x84, 0x8D, 0x85, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B7[512] = { + 0xB6, 0xDC, 0xB6, 0xDD, 0x8D, 0x86, 0x8D, 0x87, /* 0x00-0x03 */ + 0x8D, 0x88, 0xB6, 0xDE, 0x8D, 0x89, 0x8D, 0x8A, /* 0x04-0x07 */ + 0x8D, 0x8B, 0x8D, 0x8C, 0x8D, 0x8D, 0x8D, 0x8E, /* 0x08-0x0B */ + 0x8D, 0x8F, 0x8D, 0x90, 0x8D, 0x91, 0x8D, 0x92, /* 0x0C-0x0F */ + 0x8D, 0x93, 0x8D, 0x94, 0x8D, 0x95, 0x8D, 0x96, /* 0x10-0x13 */ + 0x8D, 0x97, 0x8D, 0x98, 0x8D, 0x99, 0x8D, 0x9A, /* 0x14-0x17 */ + 0x8D, 0x9B, 0x8D, 0x9C, 0x8D, 0x9D, 0x8D, 0x9E, /* 0x18-0x1B */ + 0x8D, 0x9F, 0x8D, 0xA0, 0x8D, 0xA1, 0x8D, 0xA2, /* 0x1C-0x1F */ + 0x8D, 0xA3, 0x8D, 0xA4, 0x8D, 0xA5, 0x8D, 0xA6, /* 0x20-0x23 */ + 0x8D, 0xA7, 0x8D, 0xA8, 0x8D, 0xA9, 0x8D, 0xAA, /* 0x24-0x27 */ + 0xB6, 0xDF, 0xB6, 0xE0, 0x8D, 0xAB, 0x8D, 0xAC, /* 0x28-0x2B */ + 0xB6, 0xE1, 0x8D, 0xAD, 0x8D, 0xAE, 0xB6, 0xE2, /* 0x2C-0x2F */ + 0xB6, 0xE3, 0x8D, 0xAF, 0x8D, 0xB0, 0x8D, 0xB1, /* 0x30-0x33 */ + 0x8D, 0xB2, 0x8D, 0xB3, 0x8D, 0xB4, 0x8D, 0xB5, /* 0x34-0x37 */ + 0xB6, 0xE4, 0xB6, 0xE5, 0x8D, 0xB6, 0xB6, 0xE6, /* 0x38-0x3B */ + 0x8D, 0xB7, 0x8D, 0xB8, 0x8D, 0xB9, 0x8D, 0xBA, /* 0x3C-0x3F */ + 0x8D, 0xBB, 0x8D, 0xBC, 0x8D, 0xBD, 0x8D, 0xBE, /* 0x40-0x43 */ + 0xB6, 0xE7, 0x8D, 0xBF, 0x8D, 0xC0, 0x8D, 0xC1, /* 0x44-0x47 */ + 0xB6, 0xE8, 0x8D, 0xC2, 0x8D, 0xC3, 0x8D, 0xC4, /* 0x48-0x4B */ + 0xB6, 0xE9, 0x8D, 0xC5, 0x8D, 0xC6, 0x8D, 0xC7, /* 0x4C-0x4F */ + 0x8D, 0xC8, 0x8D, 0xC9, 0x8D, 0xCA, 0x8D, 0xCB, /* 0x50-0x53 */ + 0xB6, 0xEA, 0xB6, 0xEB, 0x8D, 0xCC, 0x8D, 0xCD, /* 0x54-0x57 */ + 0x8D, 0xCE, 0x8D, 0xCF, 0x8D, 0xD0, 0x8D, 0xD1, /* 0x58-0x5B */ + 0x8D, 0xD2, 0x8D, 0xD3, 0x8D, 0xD4, 0x8D, 0xD5, /* 0x5C-0x5F */ + 0xB6, 0xEC, 0x8D, 0xD6, 0x8D, 0xD7, 0x8D, 0xD8, /* 0x60-0x63 */ + 0xB6, 0xED, 0x8D, 0xD9, 0x8D, 0xDA, 0x8D, 0xDB, /* 0x64-0x67 */ + 0xB6, 0xEE, 0x8D, 0xDC, 0x8D, 0xDD, 0x8D, 0xDE, /* 0x68-0x6B */ + 0x8D, 0xDF, 0x8D, 0xE0, 0x8D, 0xE1, 0x8D, 0xE2, /* 0x6C-0x6F */ + 0xB6, 0xEF, 0xB6, 0xF0, 0x8D, 0xE3, 0xB6, 0xF1, /* 0x70-0x73 */ + 0x8D, 0xE4, 0xB6, 0xF2, 0x8D, 0xE5, 0x8D, 0xE6, /* 0x74-0x77 */ + 0x8D, 0xE7, 0x8D, 0xE8, 0x8D, 0xE9, 0x8D, 0xEA, /* 0x78-0x7B */ + 0xB6, 0xF3, 0xB6, 0xF4, 0x8D, 0xEB, 0x8D, 0xEC, /* 0x7C-0x7F */ + + 0xB6, 0xF5, 0x8D, 0xED, 0x8D, 0xEE, 0x8D, 0xEF, /* 0x80-0x83 */ + 0xB6, 0xF6, 0x8D, 0xF0, 0x8D, 0xF1, 0x8D, 0xF2, /* 0x84-0x87 */ + 0x8D, 0xF3, 0x8D, 0xF4, 0x8D, 0xF5, 0x8D, 0xF6, /* 0x88-0x8B */ + 0xB6, 0xF7, 0xB6, 0xF8, 0x8D, 0xF7, 0xB6, 0xF9, /* 0x8C-0x8F */ + 0xB6, 0xFA, 0xB6, 0xFB, 0xB6, 0xFC, 0x8D, 0xF8, /* 0x90-0x93 */ + 0x8D, 0xF9, 0x8D, 0xFA, 0xB6, 0xFD, 0xB6, 0xFE, /* 0x94-0x97 */ + 0xB7, 0xA1, 0xB7, 0xA2, 0x8D, 0xFB, 0x8D, 0xFC, /* 0x98-0x9B */ + 0xB7, 0xA3, 0x8D, 0xFD, 0x8D, 0xFE, 0x8E, 0x41, /* 0x9C-0x9F */ + 0xB7, 0xA4, 0x8E, 0x42, 0x8E, 0x43, 0x8E, 0x44, /* 0xA0-0xA3 */ + 0x8E, 0x45, 0x8E, 0x46, 0x8E, 0x47, 0x8E, 0x48, /* 0xA4-0xA7 */ + 0xB7, 0xA5, 0xB7, 0xA6, 0x8E, 0x49, 0xB7, 0xA7, /* 0xA8-0xAB */ + 0xB7, 0xA8, 0xB7, 0xA9, 0x8E, 0x4A, 0x8E, 0x4B, /* 0xAC-0xAF */ + 0x8E, 0x4C, 0x8E, 0x4D, 0x8E, 0x4E, 0x8E, 0x4F, /* 0xB0-0xB3 */ + 
0xB7, 0xAA, 0xB7, 0xAB, 0x8E, 0x50, 0x8E, 0x51, /* 0xB4-0xB7 */ + 0xB7, 0xAC, 0x8E, 0x52, 0x8E, 0x53, 0x8E, 0x54, /* 0xB8-0xBB */ + 0x8E, 0x55, 0x8E, 0x56, 0x8E, 0x57, 0x8E, 0x58, /* 0xBC-0xBF */ + 0x8E, 0x59, 0x8E, 0x5A, 0x8E, 0x61, 0x8E, 0x62, /* 0xC0-0xC3 */ + 0x8E, 0x63, 0x8E, 0x64, 0x8E, 0x65, 0xB7, 0xAD, /* 0xC4-0xC7 */ + 0x8E, 0x66, 0xB7, 0xAE, 0x8E, 0x67, 0x8E, 0x68, /* 0xC8-0xCB */ + 0x8E, 0x69, 0x8E, 0x6A, 0x8E, 0x6B, 0x8E, 0x6C, /* 0xCC-0xCF */ + 0x8E, 0x6D, 0x8E, 0x6E, 0x8E, 0x6F, 0x8E, 0x70, /* 0xD0-0xD3 */ + 0x8E, 0x71, 0x8E, 0x72, 0x8E, 0x73, 0x8E, 0x74, /* 0xD4-0xD7 */ + 0x8E, 0x75, 0x8E, 0x76, 0x8E, 0x77, 0x8E, 0x78, /* 0xD8-0xDB */ + 0x8E, 0x79, 0x8E, 0x7A, 0x8E, 0x81, 0x8E, 0x82, /* 0xDC-0xDF */ + 0x8E, 0x83, 0x8E, 0x84, 0x8E, 0x85, 0x8E, 0x86, /* 0xE0-0xE3 */ + 0x8E, 0x87, 0x8E, 0x88, 0x8E, 0x89, 0x8E, 0x8A, /* 0xE4-0xE7 */ + 0x8E, 0x8B, 0x8E, 0x8C, 0x8E, 0x8D, 0x8E, 0x8E, /* 0xE8-0xEB */ + 0xB7, 0xAF, 0xB7, 0xB0, 0x8E, 0x8F, 0x8E, 0x90, /* 0xEC-0xEF */ + 0xB7, 0xB1, 0x8E, 0x91, 0x8E, 0x92, 0x8E, 0x93, /* 0xF0-0xF3 */ + 0xB7, 0xB2, 0x8E, 0x94, 0x8E, 0x95, 0x8E, 0x96, /* 0xF4-0xF7 */ + 0x8E, 0x97, 0x8E, 0x98, 0x8E, 0x99, 0x8E, 0x9A, /* 0xF8-0xFB */ + 0xB7, 0xB3, 0xB7, 0xB4, 0x8E, 0x9B, 0xB7, 0xB5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B8[512] = { + 0xB7, 0xB6, 0xB7, 0xB7, 0x8E, 0x9C, 0x8E, 0x9D, /* 0x00-0x03 */ + 0x8E, 0x9E, 0x8E, 0x9F, 0x8E, 0xA0, 0xB7, 0xB8, /* 0x04-0x07 */ + 0xB7, 0xB9, 0xB7, 0xBA, 0x8E, 0xA1, 0x8E, 0xA2, /* 0x08-0x0B */ + 0xB7, 0xBB, 0x8E, 0xA3, 0x8E, 0xA4, 0x8E, 0xA5, /* 0x0C-0x0F */ + 0xB7, 0xBC, 0x8E, 0xA6, 0x8E, 0xA7, 0x8E, 0xA8, /* 0x10-0x13 */ + 0x8E, 0xA9, 0x8E, 0xAA, 0x8E, 0xAB, 0x8E, 0xAC, /* 0x14-0x17 */ + 0xB7, 0xBD, 0xB7, 0xBE, 0x8E, 0xAD, 0xB7, 0xBF, /* 0x18-0x1B */ + 0x8E, 0xAE, 0xB7, 0xC0, 0x8E, 0xAF, 0x8E, 0xB0, /* 0x1C-0x1F */ + 0x8E, 0xB1, 0x8E, 0xB2, 0x8E, 0xB3, 0x8E, 0xB4, /* 0x20-0x23 */ + 0xB7, 0xC1, 0xB7, 0xC2, 0x8E, 0xB5, 0x8E, 0xB6, /* 0x24-0x27 */ + 0xB7, 0xC3, 0x8E, 0xB7, 0x8E, 0xB8, 0x8E, 0xB9, /* 0x28-0x2B */ + 0xB7, 0xC4, 0x8E, 0xBA, 0x8E, 0xBB, 0x8E, 0xBC, /* 0x2C-0x2F */ + 0x8E, 0xBD, 0x8E, 0xBE, 0x8E, 0xBF, 0x8E, 0xC0, /* 0x30-0x33 */ + 0xB7, 0xC5, 0xB7, 0xC6, 0x8E, 0xC1, 0xB7, 0xC7, /* 0x34-0x37 */ + 0xB7, 0xC8, 0xB7, 0xC9, 0x8E, 0xC2, 0x8E, 0xC3, /* 0x38-0x3B */ + 0x8E, 0xC4, 0x8E, 0xC5, 0x8E, 0xC6, 0x8E, 0xC7, /* 0x3C-0x3F */ + 0xB7, 0xCA, 0x8E, 0xC8, 0x8E, 0xC9, 0x8E, 0xCA, /* 0x40-0x43 */ + 0xB7, 0xCB, 0x8E, 0xCB, 0x8E, 0xCC, 0x8E, 0xCD, /* 0x44-0x47 */ + 0x8E, 0xCE, 0x8E, 0xCF, 0x8E, 0xD0, 0x8E, 0xD1, /* 0x48-0x4B */ + 0x8E, 0xD2, 0x8E, 0xD3, 0x8E, 0xD4, 0x8E, 0xD5, /* 0x4C-0x4F */ + 0x8E, 0xD6, 0xB7, 0xCC, 0x8E, 0xD7, 0xB7, 0xCD, /* 0x50-0x53 */ + 0x8E, 0xD8, 0x8E, 0xD9, 0x8E, 0xDA, 0x8E, 0xDB, /* 0x54-0x57 */ + 0x8E, 0xDC, 0x8E, 0xDD, 0x8E, 0xDE, 0x8E, 0xDF, /* 0x58-0x5B */ + 0xB7, 0xCE, 0xB7, 0xCF, 0x8E, 0xE0, 0x8E, 0xE1, /* 0x5C-0x5F */ + 0xB7, 0xD0, 0x8E, 0xE2, 0x8E, 0xE3, 0x8E, 0xE4, /* 0x60-0x63 */ + 0xB7, 0xD1, 0x8E, 0xE5, 0x8E, 0xE6, 0x8E, 0xE7, /* 0x64-0x67 */ + 0x8E, 0xE8, 0x8E, 0xE9, 0x8E, 0xEA, 0x8E, 0xEB, /* 0x68-0x6B */ + 0xB7, 0xD2, 0xB7, 0xD3, 0x8E, 0xEC, 0xB7, 0xD4, /* 0x6C-0x6F */ + 0x8E, 0xED, 0xB7, 0xD5, 0x8E, 0xEE, 0x8E, 0xEF, /* 0x70-0x73 */ + 0x8E, 0xF0, 0x8E, 0xF1, 0x8E, 0xF2, 0x8E, 0xF3, /* 0x74-0x77 */ + 0xB7, 0xD6, 0x8E, 0xF4, 0x8E, 0xF5, 0x8E, 0xF6, /* 0x78-0x7B */ + 0xB7, 0xD7, 0x8E, 0xF7, 0x8E, 0xF8, 0x8E, 0xF9, /* 0x7C-0x7F */ + + 0x8E, 0xFA, 0x8E, 0xFB, 0x8E, 0xFC, 0x8E, 0xFD, /* 0x80-0x83 */ + 0x8E, 0xFE, 0x8F, 0x41, 0x8F, 0x42, 0x8F, 0x43, /* 0x84-0x87 */ + 
0x8F, 0x44, 0x8F, 0x45, 0x8F, 0x46, 0x8F, 0x47, /* 0x88-0x8B */ + 0x8F, 0x48, 0xB7, 0xD8, 0x8F, 0x49, 0x8F, 0x4A, /* 0x8C-0x8F */ + 0x8F, 0x4B, 0x8F, 0x4C, 0x8F, 0x4D, 0x8F, 0x4E, /* 0x90-0x93 */ + 0x8F, 0x4F, 0x8F, 0x50, 0x8F, 0x51, 0x8F, 0x52, /* 0x94-0x97 */ + 0x8F, 0x53, 0x8F, 0x54, 0x8F, 0x55, 0x8F, 0x56, /* 0x98-0x9B */ + 0x8F, 0x57, 0x8F, 0x58, 0x8F, 0x59, 0x8F, 0x5A, /* 0x9C-0x9F */ + 0x8F, 0x61, 0x8F, 0x62, 0x8F, 0x63, 0x8F, 0x64, /* 0xA0-0xA3 */ + 0x8F, 0x65, 0x8F, 0x66, 0x8F, 0x67, 0x8F, 0x68, /* 0xA4-0xA7 */ + 0xB7, 0xD9, 0x8F, 0x69, 0x8F, 0x6A, 0x8F, 0x6B, /* 0xA8-0xAB */ + 0x8F, 0x6C, 0x8F, 0x6D, 0x8F, 0x6E, 0x8F, 0x6F, /* 0xAC-0xAF */ + 0xB7, 0xDA, 0x8F, 0x70, 0x8F, 0x71, 0x8F, 0x72, /* 0xB0-0xB3 */ + 0xB7, 0xDB, 0x8F, 0x73, 0x8F, 0x74, 0x8F, 0x75, /* 0xB4-0xB7 */ + 0xB7, 0xDC, 0x8F, 0x76, 0x8F, 0x77, 0x8F, 0x78, /* 0xB8-0xBB */ + 0x8F, 0x79, 0x8F, 0x7A, 0x8F, 0x81, 0x8F, 0x82, /* 0xBC-0xBF */ + 0xB7, 0xDD, 0xB7, 0xDE, 0x8F, 0x83, 0xB7, 0xDF, /* 0xC0-0xC3 */ + 0x8F, 0x84, 0xB7, 0xE0, 0x8F, 0x85, 0x8F, 0x86, /* 0xC4-0xC7 */ + 0x8F, 0x87, 0x8F, 0x88, 0x8F, 0x89, 0x8F, 0x8A, /* 0xC8-0xCB */ + 0xB7, 0xE1, 0x8F, 0x8B, 0x8F, 0x8C, 0x8F, 0x8D, /* 0xCC-0xCF */ + 0xB7, 0xE2, 0x8F, 0x8E, 0x8F, 0x8F, 0x8F, 0x90, /* 0xD0-0xD3 */ + 0xB7, 0xE3, 0x8F, 0x91, 0x8F, 0x92, 0x8F, 0x93, /* 0xD4-0xD7 */ + 0x8F, 0x94, 0x8F, 0x95, 0x8F, 0x96, 0x8F, 0x97, /* 0xD8-0xDB */ + 0x8F, 0x98, 0xB7, 0xE4, 0x8F, 0x99, 0xB7, 0xE5, /* 0xDC-0xDF */ + 0x8F, 0x9A, 0xB7, 0xE6, 0x8F, 0x9B, 0x8F, 0x9C, /* 0xE0-0xE3 */ + 0x8F, 0x9D, 0x8F, 0x9E, 0x8F, 0x9F, 0x8F, 0xA0, /* 0xE4-0xE7 */ + 0xB7, 0xE7, 0xB7, 0xE8, 0x8F, 0xA1, 0x8F, 0xA2, /* 0xE8-0xEB */ + 0xB7, 0xE9, 0x8F, 0xA3, 0x8F, 0xA4, 0x8F, 0xA5, /* 0xEC-0xEF */ + 0xB7, 0xEA, 0x8F, 0xA6, 0x8F, 0xA7, 0x8F, 0xA8, /* 0xF0-0xF3 */ + 0x8F, 0xA9, 0x8F, 0xAA, 0x8F, 0xAB, 0x8F, 0xAC, /* 0xF4-0xF7 */ + 0xB7, 0xEB, 0xB7, 0xEC, 0x8F, 0xAD, 0xB7, 0xED, /* 0xF8-0xFB */ + 0x8F, 0xAE, 0xB7, 0xEE, 0x8F, 0xAF, 0x8F, 0xB0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_B9[512] = { + 0x8F, 0xB1, 0x8F, 0xB2, 0x8F, 0xB3, 0x8F, 0xB4, /* 0x00-0x03 */ + 0xB7, 0xEF, 0x8F, 0xB5, 0x8F, 0xB6, 0x8F, 0xB7, /* 0x04-0x07 */ + 0x8F, 0xB8, 0x8F, 0xB9, 0x8F, 0xBA, 0x8F, 0xBB, /* 0x08-0x0B */ + 0x8F, 0xBC, 0x8F, 0xBD, 0x8F, 0xBE, 0x8F, 0xBF, /* 0x0C-0x0F */ + 0x8F, 0xC0, 0x8F, 0xC1, 0x8F, 0xC2, 0x8F, 0xC3, /* 0x10-0x13 */ + 0x8F, 0xC4, 0x8F, 0xC5, 0x8F, 0xC6, 0x8F, 0xC7, /* 0x14-0x17 */ + 0xB7, 0xF0, 0x8F, 0xC8, 0x8F, 0xC9, 0x8F, 0xCA, /* 0x18-0x1B */ + 0x8F, 0xCB, 0x8F, 0xCC, 0x8F, 0xCD, 0x8F, 0xCE, /* 0x1C-0x1F */ + 0xB7, 0xF1, 0x8F, 0xCF, 0x8F, 0xD0, 0x8F, 0xD1, /* 0x20-0x23 */ + 0x8F, 0xD2, 0x8F, 0xD3, 0x8F, 0xD4, 0x8F, 0xD5, /* 0x24-0x27 */ + 0x8F, 0xD6, 0x8F, 0xD7, 0x8F, 0xD8, 0x8F, 0xD9, /* 0x28-0x2B */ + 0x8F, 0xDA, 0x8F, 0xDB, 0x8F, 0xDC, 0x8F, 0xDD, /* 0x2C-0x2F */ + 0x8F, 0xDE, 0x8F, 0xDF, 0x8F, 0xE0, 0x8F, 0xE1, /* 0x30-0x33 */ + 0x8F, 0xE2, 0x8F, 0xE3, 0x8F, 0xE4, 0x8F, 0xE5, /* 0x34-0x37 */ + 0x8F, 0xE6, 0x8F, 0xE7, 0x8F, 0xE8, 0x8F, 0xE9, /* 0x38-0x3B */ + 0xB7, 0xF2, 0xB7, 0xF3, 0x8F, 0xEA, 0x8F, 0xEB, /* 0x3C-0x3F */ + 0xB7, 0xF4, 0x8F, 0xEC, 0x8F, 0xED, 0x8F, 0xEE, /* 0x40-0x43 */ + 0xB7, 0xF5, 0x8F, 0xEF, 0x8F, 0xF0, 0x8F, 0xF1, /* 0x44-0x47 */ + 0x8F, 0xF2, 0x8F, 0xF3, 0x8F, 0xF4, 0x8F, 0xF5, /* 0x48-0x4B */ + 0xB7, 0xF6, 0x8F, 0xF6, 0x8F, 0xF7, 0xB7, 0xF7, /* 0x4C-0x4F */ + 0x8F, 0xF8, 0xB7, 0xF8, 0x8F, 0xF9, 0x8F, 0xFA, /* 0x50-0x53 */ + 0x8F, 0xFB, 0x8F, 0xFC, 0x8F, 0xFD, 0x8F, 0xFE, /* 0x54-0x57 */ + 0xB7, 0xF9, 0xB7, 0xFA, 0x90, 0x41, 0x90, 0x42, /* 0x58-0x5B */ + 0xB7, 
0xFB, 0x90, 0x43, 0x90, 0x44, 0x90, 0x45, /* 0x5C-0x5F */ + 0xB7, 0xFC, 0x90, 0x46, 0x90, 0x47, 0x90, 0x48, /* 0x60-0x63 */ + 0x90, 0x49, 0x90, 0x4A, 0x90, 0x4B, 0x90, 0x4C, /* 0x64-0x67 */ + 0xB7, 0xFD, 0xB7, 0xFE, 0x90, 0x4D, 0xB8, 0xA1, /* 0x68-0x6B */ + 0x90, 0x4E, 0xB8, 0xA2, 0x90, 0x4F, 0x90, 0x50, /* 0x6C-0x6F */ + 0x90, 0x51, 0x90, 0x52, 0x90, 0x53, 0x90, 0x54, /* 0x70-0x73 */ + 0xB8, 0xA3, 0xB8, 0xA4, 0x90, 0x55, 0x90, 0x56, /* 0x74-0x77 */ + 0xB8, 0xA5, 0x90, 0x57, 0x90, 0x58, 0x90, 0x59, /* 0x78-0x7B */ + 0xB8, 0xA6, 0x90, 0x5A, 0x90, 0x61, 0x90, 0x62, /* 0x7C-0x7F */ + + 0x90, 0x63, 0x90, 0x64, 0x90, 0x65, 0x90, 0x66, /* 0x80-0x83 */ + 0xB8, 0xA7, 0xB8, 0xA8, 0x90, 0x67, 0xB8, 0xA9, /* 0x84-0x87 */ + 0x90, 0x68, 0xB8, 0xAA, 0xB8, 0xAB, 0x90, 0x69, /* 0x88-0x8B */ + 0x90, 0x6A, 0xB8, 0xAC, 0xB8, 0xAD, 0x90, 0x6B, /* 0x8C-0x8F */ + 0x90, 0x6C, 0x90, 0x6D, 0x90, 0x6E, 0x90, 0x6F, /* 0x90-0x93 */ + 0x90, 0x70, 0x90, 0x71, 0x90, 0x72, 0x90, 0x73, /* 0x94-0x97 */ + 0x90, 0x74, 0x90, 0x75, 0x90, 0x76, 0x90, 0x77, /* 0x98-0x9B */ + 0x90, 0x78, 0x90, 0x79, 0x90, 0x7A, 0x90, 0x81, /* 0x9C-0x9F */ + 0x90, 0x82, 0x90, 0x83, 0x90, 0x84, 0x90, 0x85, /* 0xA0-0xA3 */ + 0x90, 0x86, 0x90, 0x87, 0x90, 0x88, 0x90, 0x89, /* 0xA4-0xA7 */ + 0x90, 0x8A, 0x90, 0x8B, 0x90, 0x8C, 0x90, 0x8D, /* 0xA8-0xAB */ + 0xB8, 0xAE, 0xB8, 0xAF, 0x90, 0x8E, 0x90, 0x8F, /* 0xAC-0xAF */ + 0xB8, 0xB0, 0x90, 0x90, 0x90, 0x91, 0x90, 0x92, /* 0xB0-0xB3 */ + 0xB8, 0xB1, 0x90, 0x93, 0x90, 0x94, 0x90, 0x95, /* 0xB4-0xB7 */ + 0x90, 0x96, 0x90, 0x97, 0x90, 0x98, 0x90, 0x99, /* 0xB8-0xBB */ + 0xB8, 0xB2, 0xB8, 0xB3, 0x90, 0x9A, 0xB8, 0xB4, /* 0xBC-0xBF */ + 0x90, 0x9B, 0xB8, 0xB5, 0x90, 0x9C, 0x90, 0x9D, /* 0xC0-0xC3 */ + 0x90, 0x9E, 0x90, 0x9F, 0x90, 0xA0, 0x90, 0xA1, /* 0xC4-0xC7 */ + 0xB8, 0xB6, 0xB8, 0xB7, 0x90, 0xA2, 0x90, 0xA3, /* 0xC8-0xCB */ + 0xB8, 0xB8, 0x90, 0xA4, 0xB8, 0xB9, 0xB8, 0xBA, /* 0xCC-0xCF */ + 0xB8, 0xBB, 0xB8, 0xBC, 0xB8, 0xBD, 0x90, 0xA5, /* 0xD0-0xD3 */ + 0x90, 0xA6, 0x90, 0xA7, 0x90, 0xA8, 0x90, 0xA9, /* 0xD4-0xD7 */ + 0xB8, 0xBE, 0xB8, 0xBF, 0x90, 0xAA, 0xB8, 0xC0, /* 0xD8-0xDB */ + 0x90, 0xAB, 0xB8, 0xC1, 0xB8, 0xC2, 0x90, 0xAC, /* 0xDC-0xDF */ + 0x90, 0xAD, 0xB8, 0xC3, 0x90, 0xAE, 0xB8, 0xC4, /* 0xE0-0xE3 */ + 0xB8, 0xC5, 0xB8, 0xC6, 0x90, 0xAF, 0x90, 0xB0, /* 0xE4-0xE7 */ + 0xB8, 0xC7, 0x90, 0xB1, 0x90, 0xB2, 0x90, 0xB3, /* 0xE8-0xEB */ + 0xB8, 0xC8, 0x90, 0xB4, 0x90, 0xB5, 0x90, 0xB6, /* 0xEC-0xEF */ + 0x90, 0xB7, 0x90, 0xB8, 0x90, 0xB9, 0x90, 0xBA, /* 0xF0-0xF3 */ + 0xB8, 0xC9, 0xB8, 0xCA, 0x90, 0xBB, 0xB8, 0xCB, /* 0xF4-0xF7 */ + 0xB8, 0xCC, 0xB8, 0xCD, 0xB8, 0xCE, 0x90, 0xBC, /* 0xF8-0xFB */ + 0x90, 0xBD, 0x90, 0xBE, 0x90, 0xBF, 0x90, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BA[512] = { + 0xB8, 0xCF, 0xB8, 0xD0, 0x90, 0xC1, 0x90, 0xC2, /* 0x00-0x03 */ + 0x90, 0xC3, 0x90, 0xC4, 0x90, 0xC5, 0x90, 0xC6, /* 0x04-0x07 */ + 0xB8, 0xD1, 0x90, 0xC7, 0x90, 0xC8, 0x90, 0xC9, /* 0x08-0x0B */ + 0x90, 0xCA, 0x90, 0xCB, 0x90, 0xCC, 0x90, 0xCD, /* 0x0C-0x0F */ + 0x90, 0xCE, 0x90, 0xCF, 0x90, 0xD0, 0x90, 0xD1, /* 0x10-0x13 */ + 0x90, 0xD2, 0xB8, 0xD2, 0x90, 0xD3, 0x90, 0xD4, /* 0x14-0x17 */ + 0x90, 0xD5, 0x90, 0xD6, 0x90, 0xD7, 0x90, 0xD8, /* 0x18-0x1B */ + 0x90, 0xD9, 0x90, 0xDA, 0x90, 0xDB, 0x90, 0xDC, /* 0x1C-0x1F */ + 0x90, 0xDD, 0x90, 0xDE, 0x90, 0xDF, 0x90, 0xE0, /* 0x20-0x23 */ + 0x90, 0xE1, 0x90, 0xE2, 0x90, 0xE3, 0x90, 0xE4, /* 0x24-0x27 */ + 0x90, 0xE5, 0x90, 0xE6, 0x90, 0xE7, 0x90, 0xE8, /* 0x28-0x2B */ + 0x90, 0xE9, 0x90, 0xEA, 0x90, 0xEB, 0x90, 0xEC, /* 0x2C-0x2F */ + 0x90, 
0xED, 0x90, 0xEE, 0x90, 0xEF, 0x90, 0xF0, /* 0x30-0x33 */ + 0x90, 0xF1, 0x90, 0xF2, 0x90, 0xF3, 0x90, 0xF4, /* 0x34-0x37 */ + 0xB8, 0xD3, 0xB8, 0xD4, 0x90, 0xF5, 0x90, 0xF6, /* 0x38-0x3B */ + 0xB8, 0xD5, 0x90, 0xF7, 0x90, 0xF8, 0x90, 0xF9, /* 0x3C-0x3F */ + 0xB8, 0xD6, 0x90, 0xFA, 0xB8, 0xD7, 0x90, 0xFB, /* 0x40-0x43 */ + 0x90, 0xFC, 0x90, 0xFD, 0x90, 0xFE, 0x91, 0x41, /* 0x44-0x47 */ + 0xB8, 0xD8, 0xB8, 0xD9, 0x91, 0x42, 0xB8, 0xDA, /* 0x48-0x4B */ + 0x91, 0x43, 0xB8, 0xDB, 0xB8, 0xDC, 0x91, 0x44, /* 0x4C-0x4F */ + 0x91, 0x45, 0x91, 0x46, 0x91, 0x47, 0xB8, 0xDD, /* 0x50-0x53 */ + 0xB8, 0xDE, 0xB8, 0xDF, 0x91, 0x48, 0x91, 0x49, /* 0x54-0x57 */ + 0xB8, 0xE0, 0x91, 0x4A, 0x91, 0x4B, 0x91, 0x4C, /* 0x58-0x5B */ + 0xB8, 0xE1, 0x91, 0x4D, 0x91, 0x4E, 0x91, 0x4F, /* 0x5C-0x5F */ + 0x91, 0x50, 0x91, 0x51, 0x91, 0x52, 0x91, 0x53, /* 0x60-0x63 */ + 0xB8, 0xE2, 0xB8, 0xE3, 0x91, 0x54, 0xB8, 0xE4, /* 0x64-0x67 */ + 0xB8, 0xE5, 0xB8, 0xE6, 0x91, 0x55, 0x91, 0x56, /* 0x68-0x6B */ + 0x91, 0x57, 0x91, 0x58, 0x91, 0x59, 0x91, 0x5A, /* 0x6C-0x6F */ + 0xB8, 0xE7, 0xB8, 0xE8, 0x91, 0x61, 0x91, 0x62, /* 0x70-0x73 */ + 0xB8, 0xE9, 0x91, 0x63, 0x91, 0x64, 0x91, 0x65, /* 0x74-0x77 */ + 0xB8, 0xEA, 0x91, 0x66, 0x91, 0x67, 0x91, 0x68, /* 0x78-0x7B */ + 0x91, 0x69, 0x91, 0x6A, 0x91, 0x6B, 0x91, 0x6C, /* 0x7C-0x7F */ + + 0x91, 0x6D, 0x91, 0x6E, 0x91, 0x6F, 0xB8, 0xEB, /* 0x80-0x83 */ + 0xB8, 0xEC, 0xB8, 0xED, 0x91, 0x70, 0xB8, 0xEE, /* 0x84-0x87 */ + 0x91, 0x71, 0x91, 0x72, 0x91, 0x73, 0x91, 0x74, /* 0x88-0x8B */ + 0xB8, 0xEF, 0x91, 0x75, 0x91, 0x76, 0x91, 0x77, /* 0x8C-0x8F */ + 0x91, 0x78, 0x91, 0x79, 0x91, 0x7A, 0x91, 0x81, /* 0x90-0x93 */ + 0x91, 0x82, 0x91, 0x83, 0x91, 0x84, 0x91, 0x85, /* 0x94-0x97 */ + 0x91, 0x86, 0x91, 0x87, 0x91, 0x88, 0x91, 0x89, /* 0x98-0x9B */ + 0x91, 0x8A, 0x91, 0x8B, 0x91, 0x8C, 0x91, 0x8D, /* 0x9C-0x9F */ + 0x91, 0x8E, 0x91, 0x8F, 0x91, 0x90, 0x91, 0x91, /* 0xA0-0xA3 */ + 0x91, 0x92, 0x91, 0x93, 0x91, 0x94, 0x91, 0x95, /* 0xA4-0xA7 */ + 0xB8, 0xF0, 0xB8, 0xF1, 0x91, 0x96, 0xB8, 0xF2, /* 0xA8-0xAB */ + 0xB8, 0xF3, 0x91, 0x97, 0x91, 0x98, 0x91, 0x99, /* 0xAC-0xAF */ + 0xB8, 0xF4, 0x91, 0x9A, 0xB8, 0xF5, 0x91, 0x9B, /* 0xB0-0xB3 */ + 0x91, 0x9C, 0x91, 0x9D, 0x91, 0x9E, 0x91, 0x9F, /* 0xB4-0xB7 */ + 0xB8, 0xF6, 0xB8, 0xF7, 0x91, 0xA0, 0xB8, 0xF8, /* 0xB8-0xBB */ + 0x91, 0xA1, 0xB8, 0xF9, 0x91, 0xA2, 0x91, 0xA3, /* 0xBC-0xBF */ + 0x91, 0xA4, 0x91, 0xA5, 0x91, 0xA6, 0x91, 0xA7, /* 0xC0-0xC3 */ + 0xB8, 0xFA, 0x91, 0xA8, 0x91, 0xA9, 0x91, 0xAA, /* 0xC4-0xC7 */ + 0xB8, 0xFB, 0x91, 0xAB, 0x91, 0xAC, 0x91, 0xAD, /* 0xC8-0xCB */ + 0x91, 0xAE, 0x91, 0xAF, 0x91, 0xB0, 0x91, 0xB1, /* 0xCC-0xCF */ + 0x91, 0xB2, 0x91, 0xB3, 0x91, 0xB4, 0x91, 0xB5, /* 0xD0-0xD3 */ + 0x91, 0xB6, 0x91, 0xB7, 0x91, 0xB8, 0x91, 0xB9, /* 0xD4-0xD7 */ + 0xB8, 0xFC, 0xB8, 0xFD, 0x91, 0xBA, 0x91, 0xBB, /* 0xD8-0xDB */ + 0x91, 0xBC, 0x91, 0xBD, 0x91, 0xBE, 0x91, 0xBF, /* 0xDC-0xDF */ + 0x91, 0xC0, 0x91, 0xC1, 0x91, 0xC2, 0x91, 0xC3, /* 0xE0-0xE3 */ + 0x91, 0xC4, 0x91, 0xC5, 0x91, 0xC6, 0x91, 0xC7, /* 0xE4-0xE7 */ + 0x91, 0xC8, 0x91, 0xC9, 0x91, 0xCA, 0x91, 0xCB, /* 0xE8-0xEB */ + 0x91, 0xCC, 0x91, 0xCD, 0x91, 0xCE, 0x91, 0xCF, /* 0xEC-0xEF */ + 0x91, 0xD0, 0x91, 0xD1, 0x91, 0xD2, 0x91, 0xD3, /* 0xF0-0xF3 */ + 0x91, 0xD4, 0x91, 0xD5, 0x91, 0xD6, 0x91, 0xD7, /* 0xF4-0xF7 */ + 0x91, 0xD8, 0x91, 0xD9, 0x91, 0xDA, 0x91, 0xDB, /* 0xF8-0xFB */ + 0xB8, 0xFE, 0x91, 0xDC, 0x91, 0xDD, 0x91, 0xDE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BB[512] = { + 0xB9, 0xA1, 0x91, 0xDF, 0x91, 0xE0, 0x91, 0xE1, /* 0x00-0x03 */ + 0xB9, 
0xA2, 0x91, 0xE2, 0x91, 0xE3, 0x91, 0xE4, /* 0x04-0x07 */ + 0x91, 0xE5, 0x91, 0xE6, 0x91, 0xE7, 0x91, 0xE8, /* 0x08-0x0B */ + 0x91, 0xE9, 0xB9, 0xA3, 0x91, 0xEA, 0xB9, 0xA4, /* 0x0C-0x0F */ + 0x91, 0xEB, 0xB9, 0xA5, 0x91, 0xEC, 0x91, 0xED, /* 0x10-0x13 */ + 0x91, 0xEE, 0x91, 0xEF, 0x91, 0xF0, 0x91, 0xF1, /* 0x14-0x17 */ + 0xB9, 0xA6, 0x91, 0xF2, 0x91, 0xF3, 0x91, 0xF4, /* 0x18-0x1B */ + 0xB9, 0xA7, 0x91, 0xF5, 0x91, 0xF6, 0x91, 0xF7, /* 0x1C-0x1F */ + 0xB9, 0xA8, 0x91, 0xF8, 0x91, 0xF9, 0x91, 0xFA, /* 0x20-0x23 */ + 0x91, 0xFB, 0x91, 0xFC, 0x91, 0xFD, 0x91, 0xFE, /* 0x24-0x27 */ + 0x92, 0x41, 0xB9, 0xA9, 0x92, 0x42, 0xB9, 0xAA, /* 0x28-0x2B */ + 0x92, 0x43, 0x92, 0x44, 0x92, 0x45, 0x92, 0x46, /* 0x2C-0x2F */ + 0x92, 0x47, 0x92, 0x48, 0x92, 0x49, 0x92, 0x4A, /* 0x30-0x33 */ + 0xB9, 0xAB, 0xB9, 0xAC, 0xB9, 0xAD, 0x92, 0x4B, /* 0x34-0x37 */ + 0xB9, 0xAE, 0x92, 0x4C, 0x92, 0x4D, 0xB9, 0xAF, /* 0x38-0x3B */ + 0xB9, 0xB0, 0xB9, 0xB1, 0xB9, 0xB2, 0x92, 0x4E, /* 0x3C-0x3F */ + 0x92, 0x4F, 0x92, 0x50, 0x92, 0x51, 0x92, 0x52, /* 0x40-0x43 */ + 0xB9, 0xB3, 0xB9, 0xB4, 0x92, 0x53, 0xB9, 0xB5, /* 0x44-0x47 */ + 0x92, 0x54, 0xB9, 0xB6, 0x92, 0x55, 0x92, 0x56, /* 0x48-0x4B */ + 0x92, 0x57, 0xB9, 0xB7, 0x92, 0x58, 0xB9, 0xB8, /* 0x4C-0x4F */ + 0xB9, 0xB9, 0x92, 0x59, 0x92, 0x5A, 0x92, 0x61, /* 0x50-0x53 */ + 0xB9, 0xBA, 0x92, 0x62, 0x92, 0x63, 0x92, 0x64, /* 0x54-0x57 */ + 0xB9, 0xBB, 0x92, 0x65, 0x92, 0x66, 0x92, 0x67, /* 0x58-0x5B */ + 0x92, 0x68, 0x92, 0x69, 0x92, 0x6A, 0x92, 0x6B, /* 0x5C-0x5F */ + 0x92, 0x6C, 0xB9, 0xBC, 0x92, 0x6D, 0xB9, 0xBD, /* 0x60-0x63 */ + 0x92, 0x6E, 0x92, 0x6F, 0x92, 0x70, 0x92, 0x71, /* 0x64-0x67 */ + 0x92, 0x72, 0x92, 0x73, 0x92, 0x74, 0x92, 0x75, /* 0x68-0x6B */ + 0xB9, 0xBE, 0x92, 0x76, 0x92, 0x77, 0x92, 0x78, /* 0x6C-0x6F */ + 0x92, 0x79, 0x92, 0x7A, 0x92, 0x81, 0x92, 0x82, /* 0x70-0x73 */ + 0x92, 0x83, 0x92, 0x84, 0x92, 0x85, 0x92, 0x86, /* 0x74-0x77 */ + 0x92, 0x87, 0x92, 0x88, 0x92, 0x89, 0x92, 0x8A, /* 0x78-0x7B */ + 0x92, 0x8B, 0x92, 0x8C, 0x92, 0x8D, 0x92, 0x8E, /* 0x7C-0x7F */ + + 0x92, 0x8F, 0x92, 0x90, 0x92, 0x91, 0x92, 0x92, /* 0x80-0x83 */ + 0x92, 0x93, 0x92, 0x94, 0x92, 0x95, 0x92, 0x96, /* 0x84-0x87 */ + 0xB9, 0xBF, 0x92, 0x97, 0x92, 0x98, 0x92, 0x99, /* 0x88-0x8B */ + 0xB9, 0xC0, 0x92, 0x9A, 0x92, 0x9B, 0x92, 0x9C, /* 0x8C-0x8F */ + 0xB9, 0xC1, 0x92, 0x9D, 0x92, 0x9E, 0x92, 0x9F, /* 0x90-0x93 */ + 0x92, 0xA0, 0x92, 0xA1, 0x92, 0xA2, 0x92, 0xA3, /* 0x94-0x97 */ + 0x92, 0xA4, 0x92, 0xA5, 0x92, 0xA6, 0x92, 0xA7, /* 0x98-0x9B */ + 0x92, 0xA8, 0x92, 0xA9, 0x92, 0xAA, 0x92, 0xAB, /* 0x9C-0x9F */ + 0x92, 0xAC, 0x92, 0xAD, 0x92, 0xAE, 0x92, 0xAF, /* 0xA0-0xA3 */ + 0xB9, 0xC2, 0x92, 0xB0, 0x92, 0xB1, 0x92, 0xB2, /* 0xA4-0xA7 */ + 0xB9, 0xC3, 0x92, 0xB3, 0x92, 0xB4, 0x92, 0xB5, /* 0xA8-0xAB */ + 0xB9, 0xC4, 0x92, 0xB6, 0x92, 0xB7, 0x92, 0xB8, /* 0xAC-0xAF */ + 0x92, 0xB9, 0x92, 0xBA, 0x92, 0xBB, 0x92, 0xBC, /* 0xB0-0xB3 */ + 0xB9, 0xC5, 0x92, 0xBD, 0x92, 0xBE, 0xB9, 0xC6, /* 0xB4-0xB7 */ + 0x92, 0xBF, 0x92, 0xC0, 0x92, 0xC1, 0x92, 0xC2, /* 0xB8-0xBB */ + 0x92, 0xC3, 0x92, 0xC4, 0x92, 0xC5, 0x92, 0xC6, /* 0xBC-0xBF */ + 0xB9, 0xC7, 0x92, 0xC7, 0x92, 0xC8, 0x92, 0xC9, /* 0xC0-0xC3 */ + 0xB9, 0xC8, 0x92, 0xCA, 0x92, 0xCB, 0x92, 0xCC, /* 0xC4-0xC7 */ + 0xB9, 0xC9, 0x92, 0xCD, 0x92, 0xCE, 0x92, 0xCF, /* 0xC8-0xCB */ + 0x92, 0xD0, 0x92, 0xD1, 0x92, 0xD2, 0x92, 0xD3, /* 0xCC-0xCF */ + 0xB9, 0xCA, 0x92, 0xD4, 0x92, 0xD5, 0xB9, 0xCB, /* 0xD0-0xD3 */ + 0x92, 0xD6, 0x92, 0xD7, 0x92, 0xD8, 0x92, 0xD9, /* 0xD4-0xD7 */ + 0x92, 0xDA, 0x92, 0xDB, 0x92, 0xDC, 0x92, 0xDD, /* 0xD8-0xDB 
*/ + 0x92, 0xDE, 0x92, 0xDF, 0x92, 0xE0, 0x92, 0xE1, /* 0xDC-0xDF */ + 0x92, 0xE2, 0x92, 0xE3, 0x92, 0xE4, 0x92, 0xE5, /* 0xE0-0xE3 */ + 0x92, 0xE6, 0x92, 0xE7, 0x92, 0xE8, 0x92, 0xE9, /* 0xE4-0xE7 */ + 0x92, 0xEA, 0x92, 0xEB, 0x92, 0xEC, 0x92, 0xED, /* 0xE8-0xEB */ + 0x92, 0xEE, 0x92, 0xEF, 0x92, 0xF0, 0x92, 0xF1, /* 0xEC-0xEF */ + 0x92, 0xF2, 0x92, 0xF3, 0x92, 0xF4, 0x92, 0xF5, /* 0xF0-0xF3 */ + 0x92, 0xF6, 0x92, 0xF7, 0x92, 0xF8, 0x92, 0xF9, /* 0xF4-0xF7 */ + 0xB9, 0xCC, 0xB9, 0xCD, 0x92, 0xFA, 0x92, 0xFB, /* 0xF8-0xFB */ + 0xB9, 0xCE, 0x92, 0xFC, 0x92, 0xFD, 0xB9, 0xCF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BC[512] = { + 0xB9, 0xD0, 0x92, 0xFE, 0xB9, 0xD1, 0x93, 0x41, /* 0x00-0x03 */ + 0x93, 0x42, 0x93, 0x43, 0x93, 0x44, 0x93, 0x45, /* 0x04-0x07 */ + 0xB9, 0xD2, 0xB9, 0xD3, 0x93, 0x46, 0xB9, 0xD4, /* 0x08-0x0B */ + 0xB9, 0xD5, 0xB9, 0xD6, 0x93, 0x47, 0xB9, 0xD7, /* 0x0C-0x0F */ + 0x93, 0x48, 0xB9, 0xD8, 0x93, 0x49, 0x93, 0x4A, /* 0x10-0x13 */ + 0xB9, 0xD9, 0xB9, 0xDA, 0xB9, 0xDB, 0xB9, 0xDC, /* 0x14-0x17 */ + 0xB9, 0xDD, 0x93, 0x4B, 0x93, 0x4C, 0xB9, 0xDE, /* 0x18-0x1B */ + 0xB9, 0xDF, 0xB9, 0xE0, 0xB9, 0xE1, 0xB9, 0xE2, /* 0x1C-0x1F */ + 0x93, 0x4D, 0x93, 0x4E, 0x93, 0x4F, 0x93, 0x50, /* 0x20-0x23 */ + 0xB9, 0xE3, 0xB9, 0xE4, 0x93, 0x51, 0xB9, 0xE5, /* 0x24-0x27 */ + 0x93, 0x52, 0xB9, 0xE6, 0x93, 0x53, 0x93, 0x54, /* 0x28-0x2B */ + 0x93, 0x55, 0xB9, 0xE7, 0x93, 0x56, 0x93, 0x57, /* 0x2C-0x2F */ + 0xB9, 0xE8, 0xB9, 0xE9, 0x93, 0x58, 0x93, 0x59, /* 0x30-0x33 */ + 0xB9, 0xEA, 0x93, 0x5A, 0x93, 0x61, 0x93, 0x62, /* 0x34-0x37 */ + 0xB9, 0xEB, 0x93, 0x63, 0x93, 0x64, 0x93, 0x65, /* 0x38-0x3B */ + 0x93, 0x66, 0x93, 0x67, 0x93, 0x68, 0x93, 0x69, /* 0x3C-0x3F */ + 0xB9, 0xEC, 0xB9, 0xED, 0x93, 0x6A, 0xB9, 0xEE, /* 0x40-0x43 */ + 0xB9, 0xEF, 0xB9, 0xF0, 0x93, 0x6B, 0x93, 0x6C, /* 0x44-0x47 */ + 0x93, 0x6D, 0xB9, 0xF1, 0x93, 0x6E, 0x93, 0x6F, /* 0x48-0x4B */ + 0xB9, 0xF2, 0xB9, 0xF3, 0x93, 0x70, 0x93, 0x71, /* 0x4C-0x4F */ + 0xB9, 0xF4, 0x93, 0x72, 0x93, 0x73, 0x93, 0x74, /* 0x50-0x53 */ + 0x93, 0x75, 0x93, 0x76, 0x93, 0x77, 0x93, 0x78, /* 0x54-0x57 */ + 0x93, 0x79, 0x93, 0x7A, 0x93, 0x81, 0x93, 0x82, /* 0x58-0x5B */ + 0x93, 0x83, 0xB9, 0xF5, 0x93, 0x84, 0x93, 0x85, /* 0x5C-0x5F */ + 0x93, 0x86, 0x93, 0x87, 0x93, 0x88, 0x93, 0x89, /* 0x60-0x63 */ + 0x93, 0x8A, 0x93, 0x8B, 0x93, 0x8C, 0x93, 0x8D, /* 0x64-0x67 */ + 0x93, 0x8E, 0x93, 0x8F, 0x93, 0x90, 0x93, 0x91, /* 0x68-0x6B */ + 0x93, 0x92, 0x93, 0x93, 0x93, 0x94, 0x93, 0x95, /* 0x6C-0x6F */ + 0x93, 0x96, 0x93, 0x97, 0x93, 0x98, 0x93, 0x99, /* 0x70-0x73 */ + 0x93, 0x9A, 0x93, 0x9B, 0x93, 0x9C, 0x93, 0x9D, /* 0x74-0x77 */ + 0x93, 0x9E, 0x93, 0x9F, 0x93, 0xA0, 0x93, 0xA1, /* 0x78-0x7B */ + 0x93, 0xA2, 0x93, 0xA3, 0x93, 0xA4, 0x93, 0xA5, /* 0x7C-0x7F */ + + 0x93, 0xA6, 0x93, 0xA7, 0x93, 0xA8, 0x93, 0xA9, /* 0x80-0x83 */ + 0xB9, 0xF6, 0xB9, 0xF7, 0x93, 0xAA, 0x93, 0xAB, /* 0x84-0x87 */ + 0xB9, 0xF8, 0x93, 0xAC, 0x93, 0xAD, 0xB9, 0xF9, /* 0x88-0x8B */ + 0xB9, 0xFA, 0x93, 0xAE, 0xB9, 0xFB, 0x93, 0xAF, /* 0x8C-0x8F */ + 0x93, 0xB0, 0x93, 0xB1, 0x93, 0xB2, 0x93, 0xB3, /* 0x90-0x93 */ + 0xB9, 0xFC, 0xB9, 0xFD, 0x93, 0xB4, 0xB9, 0xFE, /* 0x94-0x97 */ + 0x93, 0xB5, 0xBA, 0xA1, 0xBA, 0xA2, 0x93, 0xB6, /* 0x98-0x9B */ + 0x93, 0xB7, 0x93, 0xB8, 0x93, 0xB9, 0x93, 0xBA, /* 0x9C-0x9F */ + 0xBA, 0xA3, 0xBA, 0xA4, 0x93, 0xBB, 0x93, 0xBC, /* 0xA0-0xA3 */ + 0xBA, 0xA5, 0x93, 0xBD, 0x93, 0xBE, 0xBA, 0xA6, /* 0xA4-0xA7 */ + 0xBA, 0xA7, 0x93, 0xBF, 0x93, 0xC0, 0x93, 0xC1, /* 0xA8-0xAB */ + 0x93, 0xC2, 0x93, 0xC3, 0x93, 0xC4, 0x93, 0xC5, /* 0xAC-0xAF */ + 
0xBA, 0xA8, 0xBA, 0xA9, 0x93, 0xC6, 0xBA, 0xAA, /* 0xB0-0xB3 */ + 0xBA, 0xAB, 0xBA, 0xAC, 0x93, 0xC7, 0x93, 0xC8, /* 0xB4-0xB7 */ + 0x93, 0xC9, 0x93, 0xCA, 0x93, 0xCB, 0x93, 0xCC, /* 0xB8-0xBB */ + 0xBA, 0xAD, 0xBA, 0xAE, 0x93, 0xCD, 0x93, 0xCE, /* 0xBC-0xBF */ + 0xBA, 0xAF, 0x93, 0xCF, 0x93, 0xD0, 0x93, 0xD1, /* 0xC0-0xC3 */ + 0xBA, 0xB0, 0x93, 0xD2, 0x93, 0xD3, 0x93, 0xD4, /* 0xC4-0xC7 */ + 0x93, 0xD5, 0x93, 0xD6, 0x93, 0xD7, 0x93, 0xD8, /* 0xC8-0xCB */ + 0x93, 0xD9, 0xBA, 0xB1, 0x93, 0xDA, 0xBA, 0xB2, /* 0xCC-0xCF */ + 0xBA, 0xB3, 0xBA, 0xB4, 0x93, 0xDB, 0x93, 0xDC, /* 0xD0-0xD3 */ + 0x93, 0xDD, 0xBA, 0xB5, 0x93, 0xDE, 0x93, 0xDF, /* 0xD4-0xD7 */ + 0xBA, 0xB6, 0x93, 0xE0, 0x93, 0xE1, 0x93, 0xE2, /* 0xD8-0xDB */ + 0xBA, 0xB7, 0x93, 0xE3, 0x93, 0xE4, 0x93, 0xE5, /* 0xDC-0xDF */ + 0x93, 0xE6, 0x93, 0xE7, 0x93, 0xE8, 0x93, 0xE9, /* 0xE0-0xE3 */ + 0x93, 0xEA, 0x93, 0xEB, 0x93, 0xEC, 0x93, 0xED, /* 0xE4-0xE7 */ + 0x93, 0xEE, 0x93, 0xEF, 0x93, 0xF0, 0x93, 0xF1, /* 0xE8-0xEB */ + 0x93, 0xF2, 0x93, 0xF3, 0x93, 0xF4, 0x93, 0xF5, /* 0xEC-0xEF */ + 0x93, 0xF6, 0x93, 0xF7, 0x93, 0xF8, 0x93, 0xF9, /* 0xF0-0xF3 */ + 0xBA, 0xB8, 0xBA, 0xB9, 0xBA, 0xBA, 0x93, 0xFA, /* 0xF4-0xF7 */ + 0xBA, 0xBB, 0x93, 0xFB, 0x93, 0xFC, 0x93, 0xFD, /* 0xF8-0xFB */ + 0xBA, 0xBC, 0x93, 0xFE, 0x94, 0x41, 0x94, 0x42, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BD[512] = { + 0x94, 0x43, 0x94, 0x44, 0x94, 0x45, 0x94, 0x46, /* 0x00-0x03 */ + 0xBA, 0xBD, 0xBA, 0xBE, 0x94, 0x47, 0xBA, 0xBF, /* 0x04-0x07 */ + 0x94, 0x48, 0xBA, 0xC0, 0x94, 0x49, 0x94, 0x4A, /* 0x08-0x0B */ + 0x94, 0x4B, 0x94, 0x4C, 0x94, 0x4D, 0x94, 0x4E, /* 0x0C-0x0F */ + 0xBA, 0xC1, 0x94, 0x4F, 0x94, 0x50, 0x94, 0x51, /* 0x10-0x13 */ + 0xBA, 0xC2, 0x94, 0x52, 0x94, 0x53, 0x94, 0x54, /* 0x14-0x17 */ + 0x94, 0x55, 0x94, 0x56, 0x94, 0x57, 0x94, 0x58, /* 0x18-0x1B */ + 0x94, 0x59, 0x94, 0x5A, 0x94, 0x61, 0x94, 0x62, /* 0x1C-0x1F */ + 0x94, 0x63, 0x94, 0x64, 0x94, 0x65, 0x94, 0x66, /* 0x20-0x23 */ + 0xBA, 0xC3, 0x94, 0x67, 0x94, 0x68, 0x94, 0x69, /* 0x24-0x27 */ + 0x94, 0x6A, 0x94, 0x6B, 0x94, 0x6C, 0x94, 0x6D, /* 0x28-0x2B */ + 0xBA, 0xC4, 0x94, 0x6E, 0x94, 0x6F, 0x94, 0x70, /* 0x2C-0x2F */ + 0x94, 0x71, 0x94, 0x72, 0x94, 0x73, 0x94, 0x74, /* 0x30-0x33 */ + 0x94, 0x75, 0x94, 0x76, 0x94, 0x77, 0x94, 0x78, /* 0x34-0x37 */ + 0x94, 0x79, 0x94, 0x7A, 0x94, 0x81, 0x94, 0x82, /* 0x38-0x3B */ + 0x94, 0x83, 0x94, 0x84, 0x94, 0x85, 0x94, 0x86, /* 0x3C-0x3F */ + 0xBA, 0xC5, 0x94, 0x87, 0x94, 0x88, 0x94, 0x89, /* 0x40-0x43 */ + 0x94, 0x8A, 0x94, 0x8B, 0x94, 0x8C, 0x94, 0x8D, /* 0x44-0x47 */ + 0xBA, 0xC6, 0xBA, 0xC7, 0x94, 0x8E, 0x94, 0x8F, /* 0x48-0x4B */ + 0xBA, 0xC8, 0x94, 0x90, 0x94, 0x91, 0x94, 0x92, /* 0x4C-0x4F */ + 0xBA, 0xC9, 0x94, 0x93, 0x94, 0x94, 0x94, 0x95, /* 0x50-0x53 */ + 0x94, 0x96, 0x94, 0x97, 0x94, 0x98, 0x94, 0x99, /* 0x54-0x57 */ + 0xBA, 0xCA, 0xBA, 0xCB, 0x94, 0x9A, 0x94, 0x9B, /* 0x58-0x5B */ + 0x94, 0x9C, 0x94, 0x9D, 0x94, 0x9E, 0x94, 0x9F, /* 0x5C-0x5F */ + 0x94, 0xA0, 0x94, 0xA1, 0x94, 0xA2, 0x94, 0xA3, /* 0x60-0x63 */ + 0xBA, 0xCC, 0x94, 0xA4, 0x94, 0xA5, 0x94, 0xA6, /* 0x64-0x67 */ + 0xBA, 0xCD, 0x94, 0xA7, 0x94, 0xA8, 0x94, 0xA9, /* 0x68-0x6B */ + 0x94, 0xAA, 0x94, 0xAB, 0x94, 0xAC, 0x94, 0xAD, /* 0x6C-0x6F */ + 0x94, 0xAE, 0x94, 0xAF, 0x94, 0xB0, 0x94, 0xB1, /* 0x70-0x73 */ + 0x94, 0xB2, 0x94, 0xB3, 0x94, 0xB4, 0x94, 0xB5, /* 0x74-0x77 */ + 0x94, 0xB6, 0x94, 0xB7, 0x94, 0xB8, 0x94, 0xB9, /* 0x78-0x7B */ + 0x94, 0xBA, 0x94, 0xBB, 0x94, 0xBC, 0x94, 0xBD, /* 0x7C-0x7F */ + + 0xBA, 0xCE, 0xBA, 0xCF, 0x94, 0xBE, 0x94, 0xBF, /* 0x80-0x83 */ + 
0xBA, 0xD0, 0x94, 0xC0, 0x94, 0xC1, 0xBA, 0xD1, /* 0x84-0x87 */ + 0xBA, 0xD2, 0xBA, 0xD3, 0xBA, 0xD4, 0x94, 0xC2, /* 0x88-0x8B */ + 0x94, 0xC3, 0x94, 0xC4, 0x94, 0xC5, 0x94, 0xC6, /* 0x8C-0x8F */ + 0xBA, 0xD5, 0xBA, 0xD6, 0x94, 0xC7, 0xBA, 0xD7, /* 0x90-0x93 */ + 0x94, 0xC8, 0xBA, 0xD8, 0x94, 0xC9, 0x94, 0xCA, /* 0x94-0x97 */ + 0x94, 0xCB, 0xBA, 0xD9, 0xBA, 0xDA, 0x94, 0xCC, /* 0x98-0x9B */ + 0xBA, 0xDB, 0x94, 0xCD, 0x94, 0xCE, 0x94, 0xCF, /* 0x9C-0x9F */ + 0x94, 0xD0, 0x94, 0xD1, 0x94, 0xD2, 0x94, 0xD3, /* 0xA0-0xA3 */ + 0xBA, 0xDC, 0x94, 0xD4, 0x94, 0xD5, 0x94, 0xD6, /* 0xA4-0xA7 */ + 0x94, 0xD7, 0x94, 0xD8, 0x94, 0xD9, 0x94, 0xDA, /* 0xA8-0xAB */ + 0x94, 0xDB, 0x94, 0xDC, 0x94, 0xDD, 0x94, 0xDE, /* 0xAC-0xAF */ + 0xBA, 0xDD, 0x94, 0xDF, 0x94, 0xE0, 0x94, 0xE1, /* 0xB0-0xB3 */ + 0x94, 0xE2, 0x94, 0xE3, 0x94, 0xE4, 0x94, 0xE5, /* 0xB4-0xB7 */ + 0xBA, 0xDE, 0x94, 0xE6, 0x94, 0xE7, 0x94, 0xE8, /* 0xB8-0xBB */ + 0x94, 0xE9, 0x94, 0xEA, 0x94, 0xEB, 0x94, 0xEC, /* 0xBC-0xBF */ + 0x94, 0xED, 0x94, 0xEE, 0x94, 0xEF, 0x94, 0xF0, /* 0xC0-0xC3 */ + 0x94, 0xF1, 0x94, 0xF2, 0x94, 0xF3, 0x94, 0xF4, /* 0xC4-0xC7 */ + 0x94, 0xF5, 0x94, 0xF6, 0x94, 0xF7, 0x94, 0xF8, /* 0xC8-0xCB */ + 0x94, 0xF9, 0x94, 0xFA, 0x94, 0xFB, 0x94, 0xFC, /* 0xCC-0xCF */ + 0x94, 0xFD, 0x94, 0xFE, 0x95, 0x41, 0x95, 0x42, /* 0xD0-0xD3 */ + 0xBA, 0xDF, 0xBA, 0xE0, 0x95, 0x43, 0x95, 0x44, /* 0xD4-0xD7 */ + 0xBA, 0xE1, 0x95, 0x45, 0x95, 0x46, 0x95, 0x47, /* 0xD8-0xDB */ + 0xBA, 0xE2, 0x95, 0x48, 0x95, 0x49, 0x95, 0x4A, /* 0xDC-0xDF */ + 0x95, 0x4B, 0x95, 0x4C, 0x95, 0x4D, 0x95, 0x4E, /* 0xE0-0xE3 */ + 0x95, 0x4F, 0x95, 0x50, 0x95, 0x51, 0x95, 0x52, /* 0xE4-0xE7 */ + 0x95, 0x53, 0xBA, 0xE3, 0x95, 0x54, 0x95, 0x55, /* 0xE8-0xEB */ + 0x95, 0x56, 0x95, 0x57, 0x95, 0x58, 0x95, 0x59, /* 0xEC-0xEF */ + 0xBA, 0xE4, 0x95, 0x5A, 0x95, 0x61, 0x95, 0x62, /* 0xF0-0xF3 */ + 0xBA, 0xE5, 0x95, 0x63, 0x95, 0x64, 0x95, 0x65, /* 0xF4-0xF7 */ + 0xBA, 0xE6, 0x95, 0x66, 0x95, 0x67, 0x95, 0x68, /* 0xF8-0xFB */ + 0x95, 0x69, 0x95, 0x6A, 0x95, 0x6B, 0x95, 0x6C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BE[512] = { + 0xBA, 0xE7, 0x95, 0x6D, 0x95, 0x6E, 0xBA, 0xE8, /* 0x00-0x03 */ + 0x95, 0x6F, 0xBA, 0xE9, 0x95, 0x70, 0x95, 0x71, /* 0x04-0x07 */ + 0x95, 0x72, 0x95, 0x73, 0x95, 0x74, 0x95, 0x75, /* 0x08-0x0B */ + 0xBA, 0xEA, 0xBA, 0xEB, 0x95, 0x76, 0x95, 0x77, /* 0x0C-0x0F */ + 0xBA, 0xEC, 0x95, 0x78, 0x95, 0x79, 0x95, 0x7A, /* 0x10-0x13 */ + 0xBA, 0xED, 0x95, 0x81, 0x95, 0x82, 0x95, 0x83, /* 0x14-0x17 */ + 0x95, 0x84, 0x95, 0x85, 0x95, 0x86, 0x95, 0x87, /* 0x18-0x1B */ + 0xBA, 0xEE, 0xBA, 0xEF, 0x95, 0x88, 0xBA, 0xF0, /* 0x1C-0x1F */ + 0x95, 0x89, 0x95, 0x8A, 0x95, 0x8B, 0x95, 0x8C, /* 0x20-0x23 */ + 0x95, 0x8D, 0x95, 0x8E, 0x95, 0x8F, 0x95, 0x90, /* 0x24-0x27 */ + 0x95, 0x91, 0x95, 0x92, 0x95, 0x93, 0x95, 0x94, /* 0x28-0x2B */ + 0x95, 0x95, 0x95, 0x96, 0x95, 0x97, 0x95, 0x98, /* 0x2C-0x2F */ + 0x95, 0x99, 0x95, 0x9A, 0x95, 0x9B, 0x95, 0x9C, /* 0x30-0x33 */ + 0x95, 0x9D, 0x95, 0x9E, 0x95, 0x9F, 0x95, 0xA0, /* 0x34-0x37 */ + 0x95, 0xA1, 0x95, 0xA2, 0x95, 0xA3, 0x95, 0xA4, /* 0x38-0x3B */ + 0x95, 0xA5, 0x95, 0xA6, 0x95, 0xA7, 0x95, 0xA8, /* 0x3C-0x3F */ + 0x95, 0xA9, 0x95, 0xAA, 0x95, 0xAB, 0x95, 0xAC, /* 0x40-0x43 */ + 0xBA, 0xF1, 0xBA, 0xF2, 0x95, 0xAD, 0x95, 0xAE, /* 0x44-0x47 */ + 0xBA, 0xF3, 0x95, 0xAF, 0x95, 0xB0, 0x95, 0xB1, /* 0x48-0x4B */ + 0xBA, 0xF4, 0x95, 0xB2, 0xBA, 0xF5, 0x95, 0xB3, /* 0x4C-0x4F */ + 0x95, 0xB4, 0x95, 0xB5, 0x95, 0xB6, 0x95, 0xB7, /* 0x50-0x53 */ + 0xBA, 0xF6, 0xBA, 0xF7, 0x95, 0xB8, 0xBA, 0xF8, /* 0x54-0x57 */ + 0x95, 
0xB9, 0xBA, 0xF9, 0xBA, 0xFA, 0xBA, 0xFB, /* 0x58-0x5B */ + 0x95, 0xBA, 0x95, 0xBB, 0x95, 0xBC, 0x95, 0xBD, /* 0x5C-0x5F */ + 0xBA, 0xFC, 0xBA, 0xFD, 0x95, 0xBE, 0x95, 0xBF, /* 0x60-0x63 */ + 0xBA, 0xFE, 0x95, 0xC0, 0x95, 0xC1, 0x95, 0xC2, /* 0x64-0x67 */ + 0xBB, 0xA1, 0x95, 0xC3, 0xBB, 0xA2, 0x95, 0xC4, /* 0x68-0x6B */ + 0x95, 0xC5, 0x95, 0xC6, 0x95, 0xC7, 0x95, 0xC8, /* 0x6C-0x6F */ + 0xBB, 0xA3, 0xBB, 0xA4, 0x95, 0xC9, 0xBB, 0xA5, /* 0x70-0x73 */ + 0xBB, 0xA6, 0xBB, 0xA7, 0x95, 0xCA, 0x95, 0xCB, /* 0x74-0x77 */ + 0x95, 0xCC, 0x95, 0xCD, 0x95, 0xCE, 0xBB, 0xA8, /* 0x78-0x7B */ + 0xBB, 0xA9, 0xBB, 0xAA, 0x95, 0xCF, 0x95, 0xD0, /* 0x7C-0x7F */ + + 0xBB, 0xAB, 0x95, 0xD1, 0x95, 0xD2, 0x95, 0xD3, /* 0x80-0x83 */ + 0xBB, 0xAC, 0x95, 0xD4, 0x95, 0xD5, 0x95, 0xD6, /* 0x84-0x87 */ + 0x95, 0xD7, 0x95, 0xD8, 0x95, 0xD9, 0x95, 0xDA, /* 0x88-0x8B */ + 0xBB, 0xAD, 0xBB, 0xAE, 0x95, 0xDB, 0xBB, 0xAF, /* 0x8C-0x8F */ + 0xBB, 0xB0, 0xBB, 0xB1, 0x95, 0xDC, 0x95, 0xDD, /* 0x90-0x93 */ + 0x95, 0xDE, 0x95, 0xDF, 0x95, 0xE0, 0x95, 0xE1, /* 0x94-0x97 */ + 0xBB, 0xB2, 0xBB, 0xB3, 0x95, 0xE2, 0x95, 0xE3, /* 0x98-0x9B */ + 0x95, 0xE4, 0x95, 0xE5, 0x95, 0xE6, 0x95, 0xE7, /* 0x9C-0x9F */ + 0x95, 0xE8, 0x95, 0xE9, 0x95, 0xEA, 0x95, 0xEB, /* 0xA0-0xA3 */ + 0x95, 0xEC, 0x95, 0xED, 0x95, 0xEE, 0x95, 0xEF, /* 0xA4-0xA7 */ + 0xBB, 0xB4, 0x95, 0xF0, 0x95, 0xF1, 0x95, 0xF2, /* 0xA8-0xAB */ + 0x95, 0xF3, 0x95, 0xF4, 0x95, 0xF5, 0x95, 0xF6, /* 0xAC-0xAF */ + 0x95, 0xF7, 0x95, 0xF8, 0x95, 0xF9, 0x95, 0xFA, /* 0xB0-0xB3 */ + 0x95, 0xFB, 0x95, 0xFC, 0x95, 0xFD, 0x95, 0xFE, /* 0xB4-0xB7 */ + 0x96, 0x41, 0x96, 0x42, 0x96, 0x43, 0x96, 0x44, /* 0xB8-0xBB */ + 0x96, 0x45, 0x96, 0x46, 0x96, 0x47, 0x96, 0x48, /* 0xBC-0xBF */ + 0x96, 0x49, 0x96, 0x4A, 0x96, 0x4B, 0x96, 0x4C, /* 0xC0-0xC3 */ + 0x96, 0x4D, 0x96, 0x4E, 0x96, 0x4F, 0x96, 0x50, /* 0xC4-0xC7 */ + 0x96, 0x51, 0x96, 0x52, 0x96, 0x53, 0x96, 0x54, /* 0xC8-0xCB */ + 0x96, 0x55, 0x96, 0x56, 0x96, 0x57, 0x96, 0x58, /* 0xCC-0xCF */ + 0xBB, 0xB5, 0xBB, 0xB6, 0x96, 0x59, 0x96, 0x5A, /* 0xD0-0xD3 */ + 0xBB, 0xB7, 0x96, 0x61, 0x96, 0x62, 0xBB, 0xB8, /* 0xD4-0xD7 */ + 0xBB, 0xB9, 0x96, 0x63, 0x96, 0x64, 0x96, 0x65, /* 0xD8-0xDB */ + 0x96, 0x66, 0x96, 0x67, 0x96, 0x68, 0x96, 0x69, /* 0xDC-0xDF */ + 0xBB, 0xBA, 0x96, 0x6A, 0x96, 0x6B, 0xBB, 0xBB, /* 0xE0-0xE3 */ + 0xBB, 0xBC, 0xBB, 0xBD, 0x96, 0x6C, 0x96, 0x6D, /* 0xE4-0xE7 */ + 0x96, 0x6E, 0x96, 0x6F, 0x96, 0x70, 0x96, 0x71, /* 0xE8-0xEB */ + 0xBB, 0xBE, 0x96, 0x72, 0x96, 0x73, 0x96, 0x74, /* 0xEC-0xEF */ + 0x96, 0x75, 0x96, 0x76, 0x96, 0x77, 0x96, 0x78, /* 0xF0-0xF3 */ + 0x96, 0x79, 0x96, 0x7A, 0x96, 0x81, 0x96, 0x82, /* 0xF4-0xF7 */ + 0x96, 0x83, 0x96, 0x84, 0x96, 0x85, 0x96, 0x86, /* 0xF8-0xFB */ + 0x96, 0x87, 0x96, 0x88, 0x96, 0x89, 0x96, 0x8A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_BF[512] = { + 0x96, 0x8B, 0xBB, 0xBF, 0x96, 0x8C, 0x96, 0x8D, /* 0x00-0x03 */ + 0x96, 0x8E, 0x96, 0x8F, 0x96, 0x90, 0x96, 0x91, /* 0x04-0x07 */ + 0xBB, 0xC0, 0xBB, 0xC1, 0x96, 0x92, 0x96, 0x93, /* 0x08-0x0B */ + 0x96, 0x94, 0x96, 0x95, 0x96, 0x96, 0x96, 0x97, /* 0x0C-0x0F */ + 0x96, 0x98, 0x96, 0x99, 0x96, 0x9A, 0x96, 0x9B, /* 0x10-0x13 */ + 0x96, 0x9C, 0x96, 0x9D, 0x96, 0x9E, 0x96, 0x9F, /* 0x14-0x17 */ + 0xBB, 0xC2, 0xBB, 0xC3, 0x96, 0xA0, 0xBB, 0xC4, /* 0x18-0x1B */ + 0xBB, 0xC5, 0xBB, 0xC6, 0x96, 0xA1, 0x96, 0xA2, /* 0x1C-0x1F */ + 0x96, 0xA3, 0x96, 0xA4, 0x96, 0xA5, 0x96, 0xA6, /* 0x20-0x23 */ + 0x96, 0xA7, 0x96, 0xA8, 0x96, 0xA9, 0x96, 0xAA, /* 0x24-0x27 */ + 0x96, 0xAB, 0x96, 0xAC, 0x96, 0xAD, 0x96, 0xAE, /* 0x28-0x2B */ + 0x96, 
0xAF, 0x96, 0xB0, 0x96, 0xB1, 0x96, 0xB2, /* 0x2C-0x2F */ + 0x96, 0xB3, 0x96, 0xB4, 0x96, 0xB5, 0x96, 0xB6, /* 0x30-0x33 */ + 0x96, 0xB7, 0x96, 0xB8, 0x96, 0xB9, 0x96, 0xBA, /* 0x34-0x37 */ + 0x96, 0xBB, 0x96, 0xBC, 0x96, 0xBD, 0x96, 0xBE, /* 0x38-0x3B */ + 0x96, 0xBF, 0x96, 0xC0, 0x96, 0xC1, 0x96, 0xC2, /* 0x3C-0x3F */ + 0xBB, 0xC7, 0xBB, 0xC8, 0x96, 0xC3, 0x96, 0xC4, /* 0x40-0x43 */ + 0xBB, 0xC9, 0x96, 0xC5, 0x96, 0xC6, 0x96, 0xC7, /* 0x44-0x47 */ + 0xBB, 0xCA, 0x96, 0xC8, 0x96, 0xC9, 0x96, 0xCA, /* 0x48-0x4B */ + 0x96, 0xCB, 0x96, 0xCC, 0x96, 0xCD, 0x96, 0xCE, /* 0x4C-0x4F */ + 0xBB, 0xCB, 0xBB, 0xCC, 0x96, 0xCF, 0x96, 0xD0, /* 0x50-0x53 */ + 0x96, 0xD1, 0xBB, 0xCD, 0x96, 0xD2, 0x96, 0xD3, /* 0x54-0x57 */ + 0x96, 0xD4, 0x96, 0xD5, 0x96, 0xD6, 0x96, 0xD7, /* 0x58-0x5B */ + 0x96, 0xD8, 0x96, 0xD9, 0x96, 0xDA, 0x96, 0xDB, /* 0x5C-0x5F */ + 0x96, 0xDC, 0x96, 0xDD, 0x96, 0xDE, 0x96, 0xDF, /* 0x60-0x63 */ + 0x96, 0xE0, 0x96, 0xE1, 0x96, 0xE2, 0x96, 0xE3, /* 0x64-0x67 */ + 0x96, 0xE4, 0x96, 0xE5, 0x96, 0xE6, 0x96, 0xE7, /* 0x68-0x6B */ + 0x96, 0xE8, 0x96, 0xE9, 0x96, 0xEA, 0x96, 0xEB, /* 0x6C-0x6F */ + 0x96, 0xEC, 0x96, 0xED, 0x96, 0xEE, 0x96, 0xEF, /* 0x70-0x73 */ + 0x96, 0xF0, 0x96, 0xF1, 0x96, 0xF2, 0x96, 0xF3, /* 0x74-0x77 */ + 0x96, 0xF4, 0x96, 0xF5, 0x96, 0xF6, 0x96, 0xF7, /* 0x78-0x7B */ + 0x96, 0xF8, 0x96, 0xF9, 0x96, 0xFA, 0x96, 0xFB, /* 0x7C-0x7F */ + + 0x96, 0xFC, 0x96, 0xFD, 0x96, 0xFE, 0x97, 0x41, /* 0x80-0x83 */ + 0x97, 0x42, 0x97, 0x43, 0x97, 0x44, 0x97, 0x45, /* 0x84-0x87 */ + 0x97, 0x46, 0x97, 0x47, 0x97, 0x48, 0x97, 0x49, /* 0x88-0x8B */ + 0x97, 0x4A, 0x97, 0x4B, 0x97, 0x4C, 0x97, 0x4D, /* 0x8C-0x8F */ + 0x97, 0x4E, 0x97, 0x4F, 0x97, 0x50, 0x97, 0x51, /* 0x90-0x93 */ + 0xBB, 0xCE, 0x97, 0x52, 0x97, 0x53, 0x97, 0x54, /* 0x94-0x97 */ + 0x97, 0x55, 0x97, 0x56, 0x97, 0x57, 0x97, 0x58, /* 0x98-0x9B */ + 0x97, 0x59, 0x97, 0x5A, 0x97, 0x61, 0x97, 0x62, /* 0x9C-0x9F */ + 0x97, 0x63, 0x97, 0x64, 0x97, 0x65, 0x97, 0x66, /* 0xA0-0xA3 */ + 0x97, 0x67, 0x97, 0x68, 0x97, 0x69, 0x97, 0x6A, /* 0xA4-0xA7 */ + 0x97, 0x6B, 0x97, 0x6C, 0x97, 0x6D, 0x97, 0x6E, /* 0xA8-0xAB */ + 0x97, 0x6F, 0x97, 0x70, 0x97, 0x71, 0x97, 0x72, /* 0xAC-0xAF */ + 0xBB, 0xCF, 0x97, 0x73, 0x97, 0x74, 0x97, 0x75, /* 0xB0-0xB3 */ + 0x97, 0x76, 0x97, 0x77, 0x97, 0x78, 0x97, 0x79, /* 0xB4-0xB7 */ + 0x97, 0x7A, 0x97, 0x81, 0x97, 0x82, 0x97, 0x83, /* 0xB8-0xBB */ + 0x97, 0x84, 0x97, 0x85, 0x97, 0x86, 0x97, 0x87, /* 0xBC-0xBF */ + 0x97, 0x88, 0x97, 0x89, 0x97, 0x8A, 0x97, 0x8B, /* 0xC0-0xC3 */ + 0x97, 0x8C, 0xBB, 0xD0, 0x97, 0x8D, 0x97, 0x8E, /* 0xC4-0xC7 */ + 0x97, 0x8F, 0x97, 0x90, 0x97, 0x91, 0x97, 0x92, /* 0xC8-0xCB */ + 0xBB, 0xD1, 0xBB, 0xD2, 0x97, 0x93, 0x97, 0x94, /* 0xCC-0xCF */ + 0xBB, 0xD3, 0x97, 0x95, 0x97, 0x96, 0x97, 0x97, /* 0xD0-0xD3 */ + 0xBB, 0xD4, 0x97, 0x98, 0x97, 0x99, 0x97, 0x9A, /* 0xD4-0xD7 */ + 0x97, 0x9B, 0x97, 0x9C, 0x97, 0x9D, 0x97, 0x9E, /* 0xD8-0xDB */ + 0xBB, 0xD5, 0x97, 0x9F, 0x97, 0xA0, 0xBB, 0xD6, /* 0xDC-0xDF */ + 0x97, 0xA1, 0xBB, 0xD7, 0x97, 0xA2, 0x97, 0xA3, /* 0xE0-0xE3 */ + 0x97, 0xA4, 0x97, 0xA5, 0x97, 0xA6, 0x97, 0xA7, /* 0xE4-0xE7 */ + 0x97, 0xA8, 0x97, 0xA9, 0x97, 0xAA, 0x97, 0xAB, /* 0xE8-0xEB */ + 0x97, 0xAC, 0x97, 0xAD, 0x97, 0xAE, 0x97, 0xAF, /* 0xEC-0xEF */ + 0x97, 0xB0, 0x97, 0xB1, 0x97, 0xB2, 0x97, 0xB3, /* 0xF0-0xF3 */ + 0x97, 0xB4, 0x97, 0xB5, 0x97, 0xB6, 0x97, 0xB7, /* 0xF4-0xF7 */ + 0x97, 0xB8, 0x97, 0xB9, 0x97, 0xBA, 0x97, 0xBB, /* 0xF8-0xFB */ + 0x97, 0xBC, 0x97, 0xBD, 0x97, 0xBE, 0x97, 0xBF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C0[512] = { + 0x97, 
0xC0, 0x97, 0xC1, 0x97, 0xC2, 0x97, 0xC3, /* 0x00-0x03 */ + 0x97, 0xC4, 0x97, 0xC5, 0x97, 0xC6, 0x97, 0xC7, /* 0x04-0x07 */ + 0x97, 0xC8, 0x97, 0xC9, 0x97, 0xCA, 0x97, 0xCB, /* 0x08-0x0B */ + 0x97, 0xCC, 0x97, 0xCD, 0x97, 0xCE, 0x97, 0xCF, /* 0x0C-0x0F */ + 0x97, 0xD0, 0x97, 0xD1, 0x97, 0xD2, 0x97, 0xD3, /* 0x10-0x13 */ + 0x97, 0xD4, 0x97, 0xD5, 0x97, 0xD6, 0x97, 0xD7, /* 0x14-0x17 */ + 0x97, 0xD8, 0x97, 0xD9, 0x97, 0xDA, 0x97, 0xDB, /* 0x18-0x1B */ + 0x97, 0xDC, 0x97, 0xDD, 0x97, 0xDE, 0x97, 0xDF, /* 0x1C-0x1F */ + 0x97, 0xE0, 0x97, 0xE1, 0x97, 0xE2, 0x97, 0xE3, /* 0x20-0x23 */ + 0x97, 0xE4, 0x97, 0xE5, 0x97, 0xE6, 0x97, 0xE7, /* 0x24-0x27 */ + 0x97, 0xE8, 0x97, 0xE9, 0x97, 0xEA, 0x97, 0xEB, /* 0x28-0x2B */ + 0x97, 0xEC, 0x97, 0xED, 0x97, 0xEE, 0x97, 0xEF, /* 0x2C-0x2F */ + 0x97, 0xF0, 0x97, 0xF1, 0x97, 0xF2, 0x97, 0xF3, /* 0x30-0x33 */ + 0x97, 0xF4, 0x97, 0xF5, 0x97, 0xF6, 0x97, 0xF7, /* 0x34-0x37 */ + 0x97, 0xF8, 0x97, 0xF9, 0x97, 0xFA, 0x97, 0xFB, /* 0x38-0x3B */ + 0xBB, 0xD8, 0x97, 0xFC, 0x97, 0xFD, 0x97, 0xFE, /* 0x3C-0x3F */ + 0x98, 0x41, 0x98, 0x42, 0x98, 0x43, 0x98, 0x44, /* 0x40-0x43 */ + 0x98, 0x45, 0x98, 0x46, 0x98, 0x47, 0x98, 0x48, /* 0x44-0x47 */ + 0x98, 0x49, 0x98, 0x4A, 0x98, 0x4B, 0x98, 0x4C, /* 0x48-0x4B */ + 0x98, 0x4D, 0x98, 0x4E, 0x98, 0x4F, 0x98, 0x50, /* 0x4C-0x4F */ + 0x98, 0x51, 0xBB, 0xD9, 0x98, 0x52, 0x98, 0x53, /* 0x50-0x53 */ + 0x98, 0x54, 0x98, 0x55, 0x98, 0x56, 0x98, 0x57, /* 0x54-0x57 */ + 0xBB, 0xDA, 0x98, 0x58, 0x98, 0x59, 0x98, 0x5A, /* 0x58-0x5B */ + 0xBB, 0xDB, 0x98, 0x61, 0x98, 0x62, 0x98, 0x63, /* 0x5C-0x5F */ + 0xBB, 0xDC, 0x98, 0x64, 0x98, 0x65, 0x98, 0x66, /* 0x60-0x63 */ + 0x98, 0x67, 0x98, 0x68, 0x98, 0x69, 0x98, 0x6A, /* 0x64-0x67 */ + 0xBB, 0xDD, 0xBB, 0xDE, 0x98, 0x6B, 0x98, 0x6C, /* 0x68-0x6B */ + 0x98, 0x6D, 0x98, 0x6E, 0x98, 0x6F, 0x98, 0x70, /* 0x6C-0x6F */ + 0x98, 0x71, 0x98, 0x72, 0x98, 0x73, 0x98, 0x74, /* 0x70-0x73 */ + 0x98, 0x75, 0x98, 0x76, 0x98, 0x77, 0x98, 0x78, /* 0x74-0x77 */ + 0x98, 0x79, 0x98, 0x7A, 0x98, 0x81, 0x98, 0x82, /* 0x78-0x7B */ + 0x98, 0x83, 0x98, 0x84, 0x98, 0x85, 0x98, 0x86, /* 0x7C-0x7F */ + + 0x98, 0x87, 0x98, 0x88, 0x98, 0x89, 0x98, 0x8A, /* 0x80-0x83 */ + 0x98, 0x8B, 0x98, 0x8C, 0x98, 0x8D, 0x98, 0x8E, /* 0x84-0x87 */ + 0x98, 0x8F, 0x98, 0x90, 0x98, 0x91, 0x98, 0x92, /* 0x88-0x8B */ + 0x98, 0x93, 0x98, 0x94, 0x98, 0x95, 0x98, 0x96, /* 0x8C-0x8F */ + 0xBB, 0xDF, 0xBB, 0xE0, 0x98, 0x97, 0x98, 0x98, /* 0x90-0x93 */ + 0xBB, 0xE1, 0x98, 0x99, 0x98, 0x9A, 0x98, 0x9B, /* 0x94-0x97 */ + 0xBB, 0xE2, 0x98, 0x9C, 0x98, 0x9D, 0x98, 0x9E, /* 0x98-0x9B */ + 0x98, 0x9F, 0x98, 0xA0, 0x98, 0xA1, 0x98, 0xA2, /* 0x9C-0x9F */ + 0xBB, 0xE3, 0xBB, 0xE4, 0x98, 0xA3, 0xBB, 0xE5, /* 0xA0-0xA3 */ + 0x98, 0xA4, 0xBB, 0xE6, 0x98, 0xA5, 0x98, 0xA6, /* 0xA4-0xA7 */ + 0x98, 0xA7, 0x98, 0xA8, 0x98, 0xA9, 0x98, 0xAA, /* 0xA8-0xAB */ + 0xBB, 0xE7, 0xBB, 0xE8, 0x98, 0xAB, 0xBB, 0xE9, /* 0xAC-0xAF */ + 0xBB, 0xEA, 0x98, 0xAC, 0x98, 0xAD, 0xBB, 0xEB, /* 0xB0-0xB3 */ + 0xBB, 0xEC, 0xBB, 0xED, 0xBB, 0xEE, 0x98, 0xAE, /* 0xB4-0xB7 */ + 0x98, 0xAF, 0x98, 0xB0, 0x98, 0xB1, 0x98, 0xB2, /* 0xB8-0xBB */ + 0xBB, 0xEF, 0xBB, 0xF0, 0x98, 0xB3, 0xBB, 0xF1, /* 0xBC-0xBF */ + 0xBB, 0xF2, 0xBB, 0xF3, 0x98, 0xB4, 0x98, 0xB5, /* 0xC0-0xC3 */ + 0x98, 0xB6, 0xBB, 0xF4, 0x98, 0xB7, 0x98, 0xB8, /* 0xC4-0xC7 */ + 0xBB, 0xF5, 0xBB, 0xF6, 0x98, 0xB9, 0x98, 0xBA, /* 0xC8-0xCB */ + 0xBB, 0xF7, 0x98, 0xBB, 0x98, 0xBC, 0x98, 0xBD, /* 0xCC-0xCF */ + 0xBB, 0xF8, 0x98, 0xBE, 0x98, 0xBF, 0x98, 0xC0, /* 0xD0-0xD3 */ + 0x98, 0xC1, 0x98, 0xC2, 0x98, 0xC3, 0x98, 0xC4, /* 0xD4-0xD7 
*/ + 0xBB, 0xF9, 0xBB, 0xFA, 0x98, 0xC5, 0xBB, 0xFB, /* 0xD8-0xDB */ + 0xBB, 0xFC, 0xBB, 0xFD, 0x98, 0xC6, 0x98, 0xC7, /* 0xDC-0xDF */ + 0x98, 0xC8, 0x98, 0xC9, 0x98, 0xCA, 0x98, 0xCB, /* 0xE0-0xE3 */ + 0xBB, 0xFE, 0xBC, 0xA1, 0x98, 0xCC, 0x98, 0xCD, /* 0xE4-0xE7 */ + 0xBC, 0xA2, 0x98, 0xCE, 0x98, 0xCF, 0x98, 0xD0, /* 0xE8-0xEB */ + 0xBC, 0xA3, 0x98, 0xD1, 0x98, 0xD2, 0x98, 0xD3, /* 0xEC-0xEF */ + 0x98, 0xD4, 0x98, 0xD5, 0x98, 0xD6, 0x98, 0xD7, /* 0xF0-0xF3 */ + 0xBC, 0xA4, 0xBC, 0xA5, 0x98, 0xD8, 0xBC, 0xA6, /* 0xF4-0xF7 */ + 0x98, 0xD9, 0xBC, 0xA7, 0x98, 0xDA, 0x98, 0xDB, /* 0xF8-0xFB */ + 0x98, 0xDC, 0x98, 0xDD, 0x98, 0xDE, 0x98, 0xDF, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C1[512] = { + 0xBC, 0xA8, 0x98, 0xE0, 0x98, 0xE1, 0x98, 0xE2, /* 0x00-0x03 */ + 0xBC, 0xA9, 0x98, 0xE3, 0x98, 0xE4, 0x98, 0xE5, /* 0x04-0x07 */ + 0xBC, 0xAA, 0x98, 0xE6, 0x98, 0xE7, 0x98, 0xE8, /* 0x08-0x0B */ + 0x98, 0xE9, 0x98, 0xEA, 0x98, 0xEB, 0x98, 0xEC, /* 0x0C-0x0F */ + 0xBC, 0xAB, 0x98, 0xED, 0x98, 0xEE, 0x98, 0xEF, /* 0x10-0x13 */ + 0x98, 0xF0, 0xBC, 0xAC, 0x98, 0xF1, 0x98, 0xF2, /* 0x14-0x17 */ + 0x98, 0xF3, 0x98, 0xF4, 0x98, 0xF5, 0x98, 0xF6, /* 0x18-0x1B */ + 0xBC, 0xAD, 0xBC, 0xAE, 0xBC, 0xAF, 0xBC, 0xB0, /* 0x1C-0x1F */ + 0xBC, 0xB1, 0x98, 0xF7, 0x98, 0xF8, 0xBC, 0xB2, /* 0x20-0x23 */ + 0xBC, 0xB3, 0x98, 0xF9, 0xBC, 0xB4, 0xBC, 0xB5, /* 0x24-0x27 */ + 0x98, 0xFA, 0x98, 0xFB, 0x98, 0xFC, 0x98, 0xFD, /* 0x28-0x2B */ + 0xBC, 0xB6, 0xBC, 0xB7, 0x98, 0xFE, 0xBC, 0xB8, /* 0x2C-0x2F */ + 0xBC, 0xB9, 0xBC, 0xBA, 0x99, 0x41, 0x99, 0x42, /* 0x30-0x33 */ + 0x99, 0x43, 0x99, 0x44, 0xBC, 0xBB, 0x99, 0x45, /* 0x34-0x37 */ + 0xBC, 0xBC, 0xBC, 0xBD, 0x99, 0x46, 0x99, 0x47, /* 0x38-0x3B */ + 0xBC, 0xBE, 0x99, 0x48, 0x99, 0x49, 0x99, 0x4A, /* 0x3C-0x3F */ + 0xBC, 0xBF, 0x99, 0x4B, 0x99, 0x4C, 0x99, 0x4D, /* 0x40-0x43 */ + 0x99, 0x4E, 0x99, 0x4F, 0x99, 0x50, 0x99, 0x51, /* 0x44-0x47 */ + 0xBC, 0xC0, 0xBC, 0xC1, 0x99, 0x52, 0xBC, 0xC2, /* 0x48-0x4B */ + 0xBC, 0xC3, 0xBC, 0xC4, 0x99, 0x53, 0x99, 0x54, /* 0x4C-0x4F */ + 0x99, 0x55, 0x99, 0x56, 0x99, 0x57, 0x99, 0x58, /* 0x50-0x53 */ + 0xBC, 0xC5, 0xBC, 0xC6, 0x99, 0x59, 0x99, 0x5A, /* 0x54-0x57 */ + 0xBC, 0xC7, 0x99, 0x61, 0x99, 0x62, 0x99, 0x63, /* 0x58-0x5B */ + 0xBC, 0xC8, 0x99, 0x64, 0x99, 0x65, 0x99, 0x66, /* 0x5C-0x5F */ + 0x99, 0x67, 0x99, 0x68, 0x99, 0x69, 0x99, 0x6A, /* 0x60-0x63 */ + 0xBC, 0xC9, 0xBC, 0xCA, 0x99, 0x6B, 0xBC, 0xCB, /* 0x64-0x67 */ + 0xBC, 0xCC, 0xBC, 0xCD, 0x99, 0x6C, 0x99, 0x6D, /* 0x68-0x6B */ + 0x99, 0x6E, 0x99, 0x6F, 0x99, 0x70, 0x99, 0x71, /* 0x6C-0x6F */ + 0xBC, 0xCE, 0x99, 0x72, 0x99, 0x73, 0x99, 0x74, /* 0x70-0x73 */ + 0xBC, 0xCF, 0x99, 0x75, 0x99, 0x76, 0x99, 0x77, /* 0x74-0x77 */ + 0xBC, 0xD0, 0x99, 0x78, 0x99, 0x79, 0x99, 0x7A, /* 0x78-0x7B */ + 0x99, 0x81, 0x99, 0x82, 0x99, 0x83, 0x99, 0x84, /* 0x7C-0x7F */ + + 0x99, 0x85, 0x99, 0x86, 0x99, 0x87, 0x99, 0x88, /* 0x80-0x83 */ + 0x99, 0x89, 0xBC, 0xD1, 0x99, 0x8A, 0x99, 0x8B, /* 0x84-0x87 */ + 0x99, 0x8C, 0x99, 0x8D, 0x99, 0x8E, 0x99, 0x8F, /* 0x88-0x8B */ + 0xBC, 0xD2, 0xBC, 0xD3, 0xBC, 0xD4, 0x99, 0x90, /* 0x8C-0x8F */ + 0xBC, 0xD5, 0x99, 0x91, 0x99, 0x92, 0x99, 0x93, /* 0x90-0x93 */ + 0xBC, 0xD6, 0x99, 0x94, 0xBC, 0xD7, 0x99, 0x95, /* 0x94-0x97 */ + 0x99, 0x96, 0x99, 0x97, 0x99, 0x98, 0x99, 0x99, /* 0x98-0x9B */ + 0xBC, 0xD8, 0xBC, 0xD9, 0x99, 0x9A, 0xBC, 0xDA, /* 0x9C-0x9F */ + 0x99, 0x9B, 0xBC, 0xDB, 0x99, 0x9C, 0x99, 0x9D, /* 0xA0-0xA3 */ + 0x99, 0x9E, 0xBC, 0xDC, 0x99, 0x9F, 0x99, 0xA0, /* 0xA4-0xA7 */ + 0xBC, 0xDD, 0xBC, 0xDE, 0x99, 0xA1, 0x99, 0xA2, /* 0xA8-0xAB */ + 
0xBC, 0xDF, 0x99, 0xA3, 0x99, 0xA4, 0x99, 0xA5, /* 0xAC-0xAF */ + 0xBC, 0xE0, 0x99, 0xA6, 0x99, 0xA7, 0x99, 0xA8, /* 0xB0-0xB3 */ + 0x99, 0xA9, 0x99, 0xAA, 0x99, 0xAB, 0x99, 0xAC, /* 0xB4-0xB7 */ + 0x99, 0xAD, 0x99, 0xAE, 0x99, 0xAF, 0x99, 0xB0, /* 0xB8-0xBB */ + 0x99, 0xB1, 0xBC, 0xE1, 0x99, 0xB2, 0x99, 0xB3, /* 0xBC-0xBF */ + 0x99, 0xB4, 0x99, 0xB5, 0x99, 0xB6, 0x99, 0xB7, /* 0xC0-0xC3 */ + 0xBC, 0xE2, 0x99, 0xB8, 0x99, 0xB9, 0x99, 0xBA, /* 0xC4-0xC7 */ + 0xBC, 0xE3, 0x99, 0xBB, 0x99, 0xBC, 0x99, 0xBD, /* 0xC8-0xCB */ + 0xBC, 0xE4, 0x99, 0xBE, 0x99, 0xBF, 0x99, 0xC0, /* 0xCC-0xCF */ + 0x99, 0xC1, 0x99, 0xC2, 0x99, 0xC3, 0x99, 0xC4, /* 0xD0-0xD3 */ + 0xBC, 0xE5, 0x99, 0xC5, 0x99, 0xC6, 0xBC, 0xE6, /* 0xD4-0xD7 */ + 0xBC, 0xE7, 0x99, 0xC7, 0x99, 0xC8, 0x99, 0xC9, /* 0xD8-0xDB */ + 0x99, 0xCA, 0x99, 0xCB, 0x99, 0xCC, 0x99, 0xCD, /* 0xDC-0xDF */ + 0xBC, 0xE8, 0x99, 0xCE, 0x99, 0xCF, 0x99, 0xD0, /* 0xE0-0xE3 */ + 0xBC, 0xE9, 0x99, 0xD1, 0x99, 0xD2, 0x99, 0xD3, /* 0xE4-0xE7 */ + 0xBC, 0xEA, 0x99, 0xD4, 0x99, 0xD5, 0x99, 0xD6, /* 0xE8-0xEB */ + 0x99, 0xD7, 0x99, 0xD8, 0x99, 0xD9, 0x99, 0xDA, /* 0xEC-0xEF */ + 0xBC, 0xEB, 0xBC, 0xEC, 0x99, 0xDB, 0xBC, 0xED, /* 0xF0-0xF3 */ + 0x99, 0xDC, 0x99, 0xDD, 0x99, 0xDE, 0x99, 0xDF, /* 0xF4-0xF7 */ + 0x99, 0xE0, 0x99, 0xE1, 0x99, 0xE2, 0x99, 0xE3, /* 0xF8-0xFB */ + 0xBC, 0xEE, 0xBC, 0xEF, 0x99, 0xE4, 0x99, 0xE5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C2[512] = { + 0xBC, 0xF0, 0x99, 0xE6, 0x99, 0xE7, 0x99, 0xE8, /* 0x00-0x03 */ + 0xBC, 0xF1, 0x99, 0xE9, 0x99, 0xEA, 0x99, 0xEB, /* 0x04-0x07 */ + 0x99, 0xEC, 0x99, 0xED, 0x99, 0xEE, 0x99, 0xEF, /* 0x08-0x0B */ + 0xBC, 0xF2, 0xBC, 0xF3, 0x99, 0xF0, 0xBC, 0xF4, /* 0x0C-0x0F */ + 0x99, 0xF1, 0xBC, 0xF5, 0x99, 0xF2, 0x99, 0xF3, /* 0x10-0x13 */ + 0x99, 0xF4, 0x99, 0xF5, 0x99, 0xF6, 0x99, 0xF7, /* 0x14-0x17 */ + 0xBC, 0xF6, 0xBC, 0xF7, 0x99, 0xF8, 0x99, 0xF9, /* 0x18-0x1B */ + 0xBC, 0xF8, 0x99, 0xFA, 0x99, 0xFB, 0xBC, 0xF9, /* 0x1C-0x1F */ + 0xBC, 0xFA, 0x99, 0xFC, 0x99, 0xFD, 0x99, 0xFE, /* 0x20-0x23 */ + 0x9A, 0x41, 0x9A, 0x42, 0x9A, 0x43, 0x9A, 0x44, /* 0x24-0x27 */ + 0xBC, 0xFB, 0xBC, 0xFC, 0x9A, 0x45, 0xBC, 0xFD, /* 0x28-0x2B */ + 0x9A, 0x46, 0xBC, 0xFE, 0x9A, 0x47, 0xBD, 0xA1, /* 0x2C-0x2F */ + 0x9A, 0x48, 0xBD, 0xA2, 0xBD, 0xA3, 0x9A, 0x49, /* 0x30-0x33 */ + 0xBD, 0xA4, 0x9A, 0x4A, 0x9A, 0x4B, 0x9A, 0x4C, /* 0x34-0x37 */ + 0x9A, 0x4D, 0x9A, 0x4E, 0x9A, 0x4F, 0x9A, 0x50, /* 0x38-0x3B */ + 0x9A, 0x51, 0x9A, 0x52, 0x9A, 0x53, 0x9A, 0x54, /* 0x3C-0x3F */ + 0x9A, 0x55, 0x9A, 0x56, 0x9A, 0x57, 0x9A, 0x58, /* 0x40-0x43 */ + 0x9A, 0x59, 0x9A, 0x5A, 0x9A, 0x61, 0x9A, 0x62, /* 0x44-0x47 */ + 0xBD, 0xA5, 0x9A, 0x63, 0x9A, 0x64, 0x9A, 0x65, /* 0x48-0x4B */ + 0x9A, 0x66, 0x9A, 0x67, 0x9A, 0x68, 0x9A, 0x69, /* 0x4C-0x4F */ + 0xBD, 0xA6, 0xBD, 0xA7, 0x9A, 0x6A, 0x9A, 0x6B, /* 0x50-0x53 */ + 0xBD, 0xA8, 0x9A, 0x6C, 0x9A, 0x6D, 0x9A, 0x6E, /* 0x54-0x57 */ + 0xBD, 0xA9, 0x9A, 0x6F, 0x9A, 0x70, 0x9A, 0x71, /* 0x58-0x5B */ + 0x9A, 0x72, 0x9A, 0x73, 0x9A, 0x74, 0x9A, 0x75, /* 0x5C-0x5F */ + 0xBD, 0xAA, 0x9A, 0x76, 0x9A, 0x77, 0x9A, 0x78, /* 0x60-0x63 */ + 0x9A, 0x79, 0xBD, 0xAB, 0x9A, 0x7A, 0x9A, 0x81, /* 0x64-0x67 */ + 0x9A, 0x82, 0x9A, 0x83, 0x9A, 0x84, 0x9A, 0x85, /* 0x68-0x6B */ + 0xBD, 0xAC, 0xBD, 0xAD, 0x9A, 0x86, 0x9A, 0x87, /* 0x6C-0x6F */ + 0xBD, 0xAE, 0x9A, 0x88, 0x9A, 0x89, 0x9A, 0x8A, /* 0x70-0x73 */ + 0xBD, 0xAF, 0x9A, 0x8B, 0x9A, 0x8C, 0x9A, 0x8D, /* 0x74-0x77 */ + 0x9A, 0x8E, 0x9A, 0x8F, 0x9A, 0x90, 0x9A, 0x91, /* 0x78-0x7B */ + 0xBD, 0xB0, 0xBD, 0xB1, 0x9A, 0x92, 0xBD, 0xB2, /* 0x7C-0x7F */ + + 
0x9A, 0x93, 0xBD, 0xB3, 0x9A, 0x94, 0x9A, 0x95, /* 0x80-0x83 */ + 0x9A, 0x96, 0x9A, 0x97, 0x9A, 0x98, 0x9A, 0x99, /* 0x84-0x87 */ + 0xBD, 0xB4, 0xBD, 0xB5, 0x9A, 0x9A, 0x9A, 0x9B, /* 0x88-0x8B */ + 0x9A, 0x9C, 0x9A, 0x9D, 0x9A, 0x9E, 0x9A, 0x9F, /* 0x8C-0x8F */ + 0xBD, 0xB6, 0x9A, 0xA0, 0x9A, 0xA1, 0x9A, 0xA2, /* 0x90-0x93 */ + 0x9A, 0xA3, 0x9A, 0xA4, 0x9A, 0xA5, 0x9A, 0xA6, /* 0x94-0x97 */ + 0xBD, 0xB7, 0x9A, 0xA7, 0x9A, 0xA8, 0xBD, 0xB8, /* 0x98-0x9B */ + 0x9A, 0xA9, 0xBD, 0xB9, 0x9A, 0xAA, 0x9A, 0xAB, /* 0x9C-0x9F */ + 0x9A, 0xAC, 0x9A, 0xAD, 0x9A, 0xAE, 0x9A, 0xAF, /* 0xA0-0xA3 */ + 0xBD, 0xBA, 0xBD, 0xBB, 0x9A, 0xB0, 0x9A, 0xB1, /* 0xA4-0xA7 */ + 0xBD, 0xBC, 0x9A, 0xB2, 0x9A, 0xB3, 0x9A, 0xB4, /* 0xA8-0xAB */ + 0xBD, 0xBD, 0xBD, 0xBE, 0x9A, 0xB5, 0x9A, 0xB6, /* 0xAC-0xAF */ + 0x9A, 0xB7, 0x9A, 0xB8, 0x9A, 0xB9, 0x9A, 0xBA, /* 0xB0-0xB3 */ + 0xBD, 0xBF, 0xBD, 0xC0, 0x9A, 0xBB, 0xBD, 0xC1, /* 0xB4-0xB7 */ + 0x9A, 0xBC, 0xBD, 0xC2, 0x9A, 0xBD, 0x9A, 0xBE, /* 0xB8-0xBB */ + 0x9A, 0xBF, 0x9A, 0xC0, 0x9A, 0xC1, 0x9A, 0xC2, /* 0xBC-0xBF */ + 0x9A, 0xC3, 0x9A, 0xC4, 0x9A, 0xC5, 0x9A, 0xC6, /* 0xC0-0xC3 */ + 0x9A, 0xC7, 0x9A, 0xC8, 0x9A, 0xC9, 0x9A, 0xCA, /* 0xC4-0xC7 */ + 0x9A, 0xCB, 0x9A, 0xCC, 0x9A, 0xCD, 0x9A, 0xCE, /* 0xC8-0xCB */ + 0x9A, 0xCF, 0x9A, 0xD0, 0x9A, 0xD1, 0x9A, 0xD2, /* 0xCC-0xCF */ + 0x9A, 0xD3, 0x9A, 0xD4, 0x9A, 0xD5, 0x9A, 0xD6, /* 0xD0-0xD3 */ + 0x9A, 0xD7, 0x9A, 0xD8, 0x9A, 0xD9, 0x9A, 0xDA, /* 0xD4-0xD7 */ + 0x9A, 0xDB, 0x9A, 0xDC, 0x9A, 0xDD, 0x9A, 0xDE, /* 0xD8-0xDB */ + 0xBD, 0xC3, 0xBD, 0xC4, 0x9A, 0xDF, 0x9A, 0xE0, /* 0xDC-0xDF */ + 0xBD, 0xC5, 0x9A, 0xE1, 0x9A, 0xE2, 0xBD, 0xC6, /* 0xE0-0xE3 */ + 0xBD, 0xC7, 0x9A, 0xE3, 0x9A, 0xE4, 0x9A, 0xE5, /* 0xE4-0xE7 */ + 0x9A, 0xE6, 0x9A, 0xE7, 0x9A, 0xE8, 0xBD, 0xC8, /* 0xE8-0xEB */ + 0xBD, 0xC9, 0xBD, 0xCA, 0x9A, 0xE9, 0xBD, 0xCB, /* 0xEC-0xEF */ + 0x9A, 0xEA, 0xBD, 0xCC, 0x9A, 0xEB, 0x9A, 0xEC, /* 0xF0-0xF3 */ + 0x9A, 0xED, 0x9A, 0xEE, 0xBD, 0xCD, 0x9A, 0xEF, /* 0xF4-0xF7 */ + 0xBD, 0xCE, 0xBD, 0xCF, 0x9A, 0xF0, 0xBD, 0xD0, /* 0xF8-0xFB */ + 0xBD, 0xD1, 0x9A, 0xF1, 0x9A, 0xF2, 0x9A, 0xF3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C3[512] = { + 0xBD, 0xD2, 0x9A, 0xF4, 0x9A, 0xF5, 0x9A, 0xF6, /* 0x00-0x03 */ + 0x9A, 0xF7, 0x9A, 0xF8, 0x9A, 0xF9, 0x9A, 0xFA, /* 0x04-0x07 */ + 0xBD, 0xD3, 0xBD, 0xD4, 0x9A, 0xFB, 0x9A, 0xFC, /* 0x08-0x0B */ + 0xBD, 0xD5, 0xBD, 0xD6, 0x9A, 0xFD, 0x9A, 0xFE, /* 0x0C-0x0F */ + 0x9B, 0x41, 0x9B, 0x42, 0x9B, 0x43, 0xBD, 0xD7, /* 0x10-0x13 */ + 0xBD, 0xD8, 0xBD, 0xD9, 0x9B, 0x44, 0x9B, 0x45, /* 0x14-0x17 */ + 0xBD, 0xDA, 0x9B, 0x46, 0x9B, 0x47, 0x9B, 0x48, /* 0x18-0x1B */ + 0xBD, 0xDB, 0x9B, 0x49, 0x9B, 0x4A, 0x9B, 0x4B, /* 0x1C-0x1F */ + 0x9B, 0x4C, 0x9B, 0x4D, 0x9B, 0x4E, 0x9B, 0x4F, /* 0x20-0x23 */ + 0xBD, 0xDC, 0xBD, 0xDD, 0x9B, 0x50, 0x9B, 0x51, /* 0x24-0x27 */ + 0xBD, 0xDE, 0xBD, 0xDF, 0x9B, 0x52, 0x9B, 0x53, /* 0x28-0x2B */ + 0x9B, 0x54, 0x9B, 0x55, 0x9B, 0x56, 0x9B, 0x57, /* 0x2C-0x2F */ + 0x9B, 0x58, 0x9B, 0x59, 0x9B, 0x5A, 0x9B, 0x61, /* 0x30-0x33 */ + 0x9B, 0x62, 0x9B, 0x63, 0x9B, 0x64, 0x9B, 0x65, /* 0x34-0x37 */ + 0x9B, 0x66, 0x9B, 0x67, 0x9B, 0x68, 0x9B, 0x69, /* 0x38-0x3B */ + 0x9B, 0x6A, 0x9B, 0x6B, 0x9B, 0x6C, 0x9B, 0x6D, /* 0x3C-0x3F */ + 0x9B, 0x6E, 0x9B, 0x6F, 0x9B, 0x70, 0x9B, 0x71, /* 0x40-0x43 */ + 0x9B, 0x72, 0xBD, 0xE0, 0x9B, 0x73, 0x9B, 0x74, /* 0x44-0x47 */ + 0x9B, 0x75, 0x9B, 0x76, 0x9B, 0x77, 0x9B, 0x78, /* 0x48-0x4B */ + 0x9B, 0x79, 0x9B, 0x7A, 0x9B, 0x81, 0x9B, 0x82, /* 0x4C-0x4F */ + 0x9B, 0x83, 0x9B, 0x84, 0x9B, 0x85, 0x9B, 0x86, /* 0x50-0x53 */ + 0x9B, 
0x87, 0x9B, 0x88, 0x9B, 0x89, 0x9B, 0x8A, /* 0x54-0x57 */ + 0x9B, 0x8B, 0x9B, 0x8C, 0x9B, 0x8D, 0x9B, 0x8E, /* 0x58-0x5B */ + 0x9B, 0x8F, 0x9B, 0x90, 0x9B, 0x91, 0x9B, 0x92, /* 0x5C-0x5F */ + 0x9B, 0x93, 0x9B, 0x94, 0x9B, 0x95, 0x9B, 0x96, /* 0x60-0x63 */ + 0x9B, 0x97, 0x9B, 0x98, 0x9B, 0x99, 0x9B, 0x9A, /* 0x64-0x67 */ + 0xBD, 0xE1, 0xBD, 0xE2, 0x9B, 0x9B, 0x9B, 0x9C, /* 0x68-0x6B */ + 0xBD, 0xE3, 0x9B, 0x9D, 0x9B, 0x9E, 0x9B, 0x9F, /* 0x6C-0x6F */ + 0xBD, 0xE4, 0x9B, 0xA0, 0xBD, 0xE5, 0x9B, 0xA1, /* 0x70-0x73 */ + 0x9B, 0xA2, 0x9B, 0xA3, 0x9B, 0xA4, 0x9B, 0xA5, /* 0x74-0x77 */ + 0xBD, 0xE6, 0xBD, 0xE7, 0x9B, 0xA6, 0x9B, 0xA7, /* 0x78-0x7B */ + 0xBD, 0xE8, 0xBD, 0xE9, 0x9B, 0xA8, 0x9B, 0xA9, /* 0x7C-0x7F */ + + 0x9B, 0xAA, 0x9B, 0xAB, 0x9B, 0xAC, 0x9B, 0xAD, /* 0x80-0x83 */ + 0xBD, 0xEA, 0x9B, 0xAE, 0x9B, 0xAF, 0x9B, 0xB0, /* 0x84-0x87 */ + 0xBD, 0xEB, 0x9B, 0xB1, 0x9B, 0xB2, 0x9B, 0xB3, /* 0x88-0x8B */ + 0xBD, 0xEC, 0x9B, 0xB4, 0x9B, 0xB5, 0x9B, 0xB6, /* 0x8C-0x8F */ + 0x9B, 0xB7, 0x9B, 0xB8, 0x9B, 0xB9, 0x9B, 0xBA, /* 0x90-0x93 */ + 0x9B, 0xBB, 0x9B, 0xBC, 0x9B, 0xBD, 0x9B, 0xBE, /* 0x94-0x97 */ + 0x9B, 0xBF, 0x9B, 0xC0, 0x9B, 0xC1, 0x9B, 0xC2, /* 0x98-0x9B */ + 0x9B, 0xC3, 0x9B, 0xC4, 0x9B, 0xC5, 0x9B, 0xC6, /* 0x9C-0x9F */ + 0x9B, 0xC7, 0x9B, 0xC8, 0x9B, 0xC9, 0x9B, 0xCA, /* 0xA0-0xA3 */ + 0x9B, 0xCB, 0x9B, 0xCC, 0x9B, 0xCD, 0x9B, 0xCE, /* 0xA4-0xA7 */ + 0x9B, 0xCF, 0x9B, 0xD0, 0x9B, 0xD1, 0x9B, 0xD2, /* 0xA8-0xAB */ + 0x9B, 0xD3, 0x9B, 0xD4, 0x9B, 0xD5, 0x9B, 0xD6, /* 0xAC-0xAF */ + 0x9B, 0xD7, 0x9B, 0xD8, 0x9B, 0xD9, 0x9B, 0xDA, /* 0xB0-0xB3 */ + 0x9B, 0xDB, 0x9B, 0xDC, 0x9B, 0xDD, 0x9B, 0xDE, /* 0xB4-0xB7 */ + 0x9B, 0xDF, 0x9B, 0xE0, 0x9B, 0xE1, 0x9B, 0xE2, /* 0xB8-0xBB */ + 0x9B, 0xE3, 0x9B, 0xE4, 0x9B, 0xE5, 0x9B, 0xE6, /* 0xBC-0xBF */ + 0xBD, 0xED, 0x9B, 0xE7, 0x9B, 0xE8, 0x9B, 0xE9, /* 0xC0-0xC3 */ + 0x9B, 0xEA, 0x9B, 0xEB, 0x9B, 0xEC, 0x9B, 0xED, /* 0xC4-0xC7 */ + 0x9B, 0xEE, 0x9B, 0xEF, 0x9B, 0xF0, 0x9B, 0xF1, /* 0xC8-0xCB */ + 0x9B, 0xF2, 0x9B, 0xF3, 0x9B, 0xF4, 0x9B, 0xF5, /* 0xCC-0xCF */ + 0x9B, 0xF6, 0x9B, 0xF7, 0x9B, 0xF8, 0x9B, 0xF9, /* 0xD0-0xD3 */ + 0x9B, 0xFA, 0x9B, 0xFB, 0x9B, 0xFC, 0x9B, 0xFD, /* 0xD4-0xD7 */ + 0xBD, 0xEE, 0xBD, 0xEF, 0x9B, 0xFE, 0x9C, 0x41, /* 0xD8-0xDB */ + 0xBD, 0xF0, 0x9C, 0x42, 0x9C, 0x43, 0xBD, 0xF1, /* 0xDC-0xDF */ + 0xBD, 0xF2, 0x9C, 0x44, 0xBD, 0xF3, 0x9C, 0x45, /* 0xE0-0xE3 */ + 0x9C, 0x46, 0x9C, 0x47, 0x9C, 0x48, 0x9C, 0x49, /* 0xE4-0xE7 */ + 0xBD, 0xF4, 0xBD, 0xF5, 0x9C, 0x4A, 0x9C, 0x4B, /* 0xE8-0xEB */ + 0x9C, 0x4C, 0xBD, 0xF6, 0x9C, 0x4D, 0x9C, 0x4E, /* 0xEC-0xEF */ + 0x9C, 0x4F, 0x9C, 0x50, 0x9C, 0x51, 0x9C, 0x52, /* 0xF0-0xF3 */ + 0xBD, 0xF7, 0xBD, 0xF8, 0x9C, 0x53, 0x9C, 0x54, /* 0xF4-0xF7 */ + 0xBD, 0xF9, 0x9C, 0x55, 0x9C, 0x56, 0x9C, 0x57, /* 0xF8-0xFB */ + 0x9C, 0x58, 0x9C, 0x59, 0x9C, 0x5A, 0x9C, 0x61, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C4[512] = { + 0x9C, 0x62, 0x9C, 0x63, 0x9C, 0x64, 0x9C, 0x65, /* 0x00-0x03 */ + 0x9C, 0x66, 0x9C, 0x67, 0x9C, 0x68, 0x9C, 0x69, /* 0x04-0x07 */ + 0xBD, 0xFA, 0x9C, 0x6A, 0x9C, 0x6B, 0x9C, 0x6C, /* 0x08-0x0B */ + 0x9C, 0x6D, 0x9C, 0x6E, 0x9C, 0x6F, 0x9C, 0x70, /* 0x0C-0x0F */ + 0xBD, 0xFB, 0x9C, 0x71, 0x9C, 0x72, 0x9C, 0x73, /* 0x10-0x13 */ + 0x9C, 0x74, 0x9C, 0x75, 0x9C, 0x76, 0x9C, 0x77, /* 0x14-0x17 */ + 0x9C, 0x78, 0x9C, 0x79, 0x9C, 0x7A, 0x9C, 0x81, /* 0x18-0x1B */ + 0x9C, 0x82, 0x9C, 0x83, 0x9C, 0x84, 0x9C, 0x85, /* 0x1C-0x1F */ + 0x9C, 0x86, 0x9C, 0x87, 0x9C, 0x88, 0x9C, 0x89, /* 0x20-0x23 */ + 0xBD, 0xFC, 0x9C, 0x8A, 0x9C, 0x8B, 0x9C, 0x8C, /* 0x24-0x27 */ + 0x9C, 
0x8D, 0x9C, 0x8E, 0x9C, 0x8F, 0x9C, 0x90, /* 0x28-0x2B */ + 0xBD, 0xFD, 0x9C, 0x91, 0x9C, 0x92, 0x9C, 0x93, /* 0x2C-0x2F */ + 0xBD, 0xFE, 0x9C, 0x94, 0x9C, 0x95, 0x9C, 0x96, /* 0x30-0x33 */ + 0xBE, 0xA1, 0x9C, 0x97, 0x9C, 0x98, 0x9C, 0x99, /* 0x34-0x37 */ + 0x9C, 0x9A, 0x9C, 0x9B, 0x9C, 0x9C, 0x9C, 0x9D, /* 0x38-0x3B */ + 0xBE, 0xA2, 0xBE, 0xA3, 0x9C, 0x9E, 0x9C, 0x9F, /* 0x3C-0x3F */ + 0x9C, 0xA0, 0x9C, 0xA1, 0x9C, 0xA2, 0x9C, 0xA3, /* 0x40-0x43 */ + 0x9C, 0xA4, 0x9C, 0xA5, 0x9C, 0xA6, 0x9C, 0xA7, /* 0x44-0x47 */ + 0xBE, 0xA4, 0x9C, 0xA8, 0x9C, 0xA9, 0x9C, 0xAA, /* 0x48-0x4B */ + 0x9C, 0xAB, 0x9C, 0xAC, 0x9C, 0xAD, 0x9C, 0xAE, /* 0x4C-0x4F */ + 0x9C, 0xAF, 0x9C, 0xB0, 0x9C, 0xB1, 0x9C, 0xB2, /* 0x50-0x53 */ + 0x9C, 0xB3, 0x9C, 0xB4, 0x9C, 0xB5, 0x9C, 0xB6, /* 0x54-0x57 */ + 0x9C, 0xB7, 0x9C, 0xB8, 0x9C, 0xB9, 0x9C, 0xBA, /* 0x58-0x5B */ + 0x9C, 0xBB, 0x9C, 0xBC, 0x9C, 0xBD, 0x9C, 0xBE, /* 0x5C-0x5F */ + 0x9C, 0xBF, 0x9C, 0xC0, 0x9C, 0xC1, 0x9C, 0xC2, /* 0x60-0x63 */ + 0xBE, 0xA5, 0xBE, 0xA6, 0x9C, 0xC3, 0x9C, 0xC4, /* 0x64-0x67 */ + 0xBE, 0xA7, 0x9C, 0xC5, 0x9C, 0xC6, 0x9C, 0xC7, /* 0x68-0x6B */ + 0xBE, 0xA8, 0x9C, 0xC8, 0x9C, 0xC9, 0x9C, 0xCA, /* 0x6C-0x6F */ + 0x9C, 0xCB, 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, /* 0x70-0x73 */ + 0xBE, 0xA9, 0xBE, 0xAA, 0x9C, 0xCF, 0x9C, 0xD0, /* 0x74-0x77 */ + 0x9C, 0xD1, 0xBE, 0xAB, 0x9C, 0xD2, 0x9C, 0xD3, /* 0x78-0x7B */ + 0x9C, 0xD4, 0x9C, 0xD5, 0x9C, 0xD6, 0x9C, 0xD7, /* 0x7C-0x7F */ + + 0xBE, 0xAC, 0x9C, 0xD8, 0x9C, 0xD9, 0x9C, 0xDA, /* 0x80-0x83 */ + 0x9C, 0xDB, 0x9C, 0xDC, 0x9C, 0xDD, 0x9C, 0xDE, /* 0x84-0x87 */ + 0x9C, 0xDF, 0x9C, 0xE0, 0x9C, 0xE1, 0x9C, 0xE2, /* 0x88-0x8B */ + 0x9C, 0xE3, 0x9C, 0xE4, 0x9C, 0xE5, 0x9C, 0xE6, /* 0x8C-0x8F */ + 0x9C, 0xE7, 0x9C, 0xE8, 0x9C, 0xE9, 0x9C, 0xEA, /* 0x90-0x93 */ + 0xBE, 0xAD, 0x9C, 0xEB, 0x9C, 0xEC, 0x9C, 0xED, /* 0x94-0x97 */ + 0x9C, 0xEE, 0x9C, 0xEF, 0x9C, 0xF0, 0x9C, 0xF1, /* 0x98-0x9B */ + 0xBE, 0xAE, 0x9C, 0xF2, 0x9C, 0xF3, 0x9C, 0xF4, /* 0x9C-0x9F */ + 0x9C, 0xF5, 0x9C, 0xF6, 0x9C, 0xF7, 0x9C, 0xF8, /* 0xA0-0xA3 */ + 0x9C, 0xF9, 0x9C, 0xFA, 0x9C, 0xFB, 0x9C, 0xFC, /* 0xA4-0xA7 */ + 0x9C, 0xFD, 0x9C, 0xFE, 0x9D, 0x41, 0x9D, 0x42, /* 0xA8-0xAB */ + 0x9D, 0x43, 0x9D, 0x44, 0x9D, 0x45, 0x9D, 0x46, /* 0xAC-0xAF */ + 0x9D, 0x47, 0x9D, 0x48, 0x9D, 0x49, 0x9D, 0x4A, /* 0xB0-0xB3 */ + 0x9D, 0x4B, 0x9D, 0x4C, 0x9D, 0x4D, 0x9D, 0x4E, /* 0xB4-0xB7 */ + 0xBE, 0xAF, 0x9D, 0x4F, 0x9D, 0x50, 0x9D, 0x51, /* 0xB8-0xBB */ + 0xBE, 0xB0, 0x9D, 0x52, 0x9D, 0x53, 0x9D, 0x54, /* 0xBC-0xBF */ + 0x9D, 0x55, 0x9D, 0x56, 0x9D, 0x57, 0x9D, 0x58, /* 0xC0-0xC3 */ + 0x9D, 0x59, 0x9D, 0x5A, 0x9D, 0x61, 0x9D, 0x62, /* 0xC4-0xC7 */ + 0x9D, 0x63, 0x9D, 0x64, 0x9D, 0x65, 0x9D, 0x66, /* 0xC8-0xCB */ + 0x9D, 0x67, 0x9D, 0x68, 0x9D, 0x69, 0x9D, 0x6A, /* 0xCC-0xCF */ + 0x9D, 0x6B, 0x9D, 0x6C, 0x9D, 0x6D, 0x9D, 0x6E, /* 0xD0-0xD3 */ + 0x9D, 0x6F, 0x9D, 0x70, 0x9D, 0x71, 0x9D, 0x72, /* 0xD4-0xD7 */ + 0x9D, 0x73, 0x9D, 0x74, 0x9D, 0x75, 0x9D, 0x76, /* 0xD8-0xDB */ + 0x9D, 0x77, 0x9D, 0x78, 0x9D, 0x79, 0x9D, 0x7A, /* 0xDC-0xDF */ + 0x9D, 0x81, 0x9D, 0x82, 0x9D, 0x83, 0x9D, 0x84, /* 0xE0-0xE3 */ + 0x9D, 0x85, 0x9D, 0x86, 0x9D, 0x87, 0x9D, 0x88, /* 0xE4-0xE7 */ + 0x9D, 0x89, 0xBE, 0xB1, 0x9D, 0x8A, 0x9D, 0x8B, /* 0xE8-0xEB */ + 0x9D, 0x8C, 0x9D, 0x8D, 0x9D, 0x8E, 0x9D, 0x8F, /* 0xEC-0xEF */ + 0xBE, 0xB2, 0xBE, 0xB3, 0x9D, 0x90, 0x9D, 0x91, /* 0xF0-0xF3 */ + 0xBE, 0xB4, 0x9D, 0x92, 0x9D, 0x93, 0x9D, 0x94, /* 0xF4-0xF7 */ + 0xBE, 0xB5, 0x9D, 0x95, 0xBE, 0xB6, 0x9D, 0x96, /* 0xF8-0xFB */ + 0x9D, 0x97, 0x9D, 0x98, 0x9D, 0x99, 0xBE, 0xB7, /* 0xFC-0xFF 
*/ +}; + +static const unsigned char u2c_C5[512] = { + 0xBE, 0xB8, 0xBE, 0xB9, 0x9D, 0x9A, 0x9D, 0x9B, /* 0x00-0x03 */ + 0x9D, 0x9C, 0x9D, 0x9D, 0x9D, 0x9E, 0x9D, 0x9F, /* 0x04-0x07 */ + 0x9D, 0xA0, 0x9D, 0xA1, 0x9D, 0xA2, 0x9D, 0xA3, /* 0x08-0x0B */ + 0xBE, 0xBA, 0x9D, 0xA4, 0x9D, 0xA5, 0x9D, 0xA6, /* 0x0C-0x0F */ + 0xBE, 0xBB, 0x9D, 0xA7, 0x9D, 0xA8, 0x9D, 0xA9, /* 0x10-0x13 */ + 0xBE, 0xBC, 0x9D, 0xAA, 0x9D, 0xAB, 0x9D, 0xAC, /* 0x14-0x17 */ + 0x9D, 0xAD, 0x9D, 0xAE, 0x9D, 0xAF, 0x9D, 0xB0, /* 0x18-0x1B */ + 0xBE, 0xBD, 0x9D, 0xB1, 0x9D, 0xB2, 0x9D, 0xB3, /* 0x1C-0x1F */ + 0x9D, 0xB4, 0x9D, 0xB5, 0x9D, 0xB6, 0x9D, 0xB7, /* 0x20-0x23 */ + 0x9D, 0xB8, 0x9D, 0xB9, 0x9D, 0xBA, 0x9D, 0xBB, /* 0x24-0x27 */ + 0xBE, 0xBE, 0xBE, 0xBF, 0x9D, 0xBC, 0x9D, 0xBD, /* 0x28-0x2B */ + 0xBE, 0xC0, 0x9D, 0xBE, 0x9D, 0xBF, 0x9D, 0xC0, /* 0x2C-0x2F */ + 0xBE, 0xC1, 0x9D, 0xC1, 0x9D, 0xC2, 0x9D, 0xC3, /* 0x30-0x33 */ + 0x9D, 0xC4, 0x9D, 0xC5, 0x9D, 0xC6, 0x9D, 0xC7, /* 0x34-0x37 */ + 0xBE, 0xC2, 0xBE, 0xC3, 0x9D, 0xC8, 0xBE, 0xC4, /* 0x38-0x3B */ + 0x9D, 0xC9, 0xBE, 0xC5, 0x9D, 0xCA, 0x9D, 0xCB, /* 0x3C-0x3F */ + 0x9D, 0xCC, 0x9D, 0xCD, 0x9D, 0xCE, 0x9D, 0xCF, /* 0x40-0x43 */ + 0xBE, 0xC6, 0xBE, 0xC7, 0x9D, 0xD0, 0x9D, 0xD1, /* 0x44-0x47 */ + 0xBE, 0xC8, 0xBE, 0xC9, 0xBE, 0xCA, 0x9D, 0xD2, /* 0x48-0x4B */ + 0xBE, 0xCB, 0xBE, 0xCC, 0xBE, 0xCD, 0x9D, 0xD3, /* 0x4C-0x4F */ + 0x9D, 0xD4, 0x9D, 0xD5, 0x9D, 0xD6, 0xBE, 0xCE, /* 0x50-0x53 */ + 0xBE, 0xCF, 0xBE, 0xD0, 0x9D, 0xD7, 0xBE, 0xD1, /* 0x54-0x57 */ + 0xBE, 0xD2, 0xBE, 0xD3, 0x9D, 0xD8, 0x9D, 0xD9, /* 0x58-0x5B */ + 0x9D, 0xDA, 0xBE, 0xD4, 0xBE, 0xD5, 0x9D, 0xDB, /* 0x5C-0x5F */ + 0xBE, 0xD6, 0xBE, 0xD7, 0x9D, 0xDC, 0x9D, 0xDD, /* 0x60-0x63 */ + 0xBE, 0xD8, 0x9D, 0xDE, 0x9D, 0xDF, 0x9D, 0xE0, /* 0x64-0x67 */ + 0xBE, 0xD9, 0x9D, 0xE1, 0x9D, 0xE2, 0x9D, 0xE3, /* 0x68-0x6B */ + 0x9D, 0xE4, 0x9D, 0xE5, 0x9D, 0xE6, 0x9D, 0xE7, /* 0x6C-0x6F */ + 0xBE, 0xDA, 0xBE, 0xDB, 0x9D, 0xE8, 0xBE, 0xDC, /* 0x70-0x73 */ + 0xBE, 0xDD, 0xBE, 0xDE, 0x9D, 0xE9, 0x9D, 0xEA, /* 0x74-0x77 */ + 0x9D, 0xEB, 0x9D, 0xEC, 0x9D, 0xED, 0x9D, 0xEE, /* 0x78-0x7B */ + 0xBE, 0xDF, 0xBE, 0xE0, 0x9D, 0xEF, 0x9D, 0xF0, /* 0x7C-0x7F */ + + 0xBE, 0xE1, 0x9D, 0xF1, 0x9D, 0xF2, 0x9D, 0xF3, /* 0x80-0x83 */ + 0xBE, 0xE2, 0x9D, 0xF4, 0x9D, 0xF5, 0xBE, 0xE3, /* 0x84-0x87 */ + 0x9D, 0xF6, 0x9D, 0xF7, 0x9D, 0xF8, 0x9D, 0xF9, /* 0x88-0x8B */ + 0xBE, 0xE4, 0xBE, 0xE5, 0x9D, 0xFA, 0xBE, 0xE6, /* 0x8C-0x8F */ + 0x9D, 0xFB, 0xBE, 0xE7, 0x9D, 0xFC, 0x9D, 0xFD, /* 0x90-0x93 */ + 0x9D, 0xFE, 0xBE, 0xE8, 0x9E, 0x41, 0xBE, 0xE9, /* 0x94-0x97 */ + 0xBE, 0xEA, 0x9E, 0x42, 0x9E, 0x43, 0x9E, 0x44, /* 0x98-0x9B */ + 0xBE, 0xEB, 0x9E, 0x45, 0x9E, 0x46, 0x9E, 0x47, /* 0x9C-0x9F */ + 0xBE, 0xEC, 0x9E, 0x48, 0x9E, 0x49, 0x9E, 0x4A, /* 0xA0-0xA3 */ + 0x9E, 0x4B, 0x9E, 0x4C, 0x9E, 0x4D, 0x9E, 0x4E, /* 0xA4-0xA7 */ + 0x9E, 0x4F, 0xBE, 0xED, 0x9E, 0x50, 0x9E, 0x51, /* 0xA8-0xAB */ + 0x9E, 0x52, 0x9E, 0x53, 0x9E, 0x54, 0x9E, 0x55, /* 0xAC-0xAF */ + 0x9E, 0x56, 0x9E, 0x57, 0x9E, 0x58, 0x9E, 0x59, /* 0xB0-0xB3 */ + 0xBE, 0xEE, 0xBE, 0xEF, 0x9E, 0x5A, 0x9E, 0x61, /* 0xB4-0xB7 */ + 0xBE, 0xF0, 0xBE, 0xF1, 0x9E, 0x62, 0xBE, 0xF2, /* 0xB8-0xBB */ + 0xBE, 0xF3, 0xBE, 0xF4, 0xBE, 0xF5, 0x9E, 0x63, /* 0xBC-0xBF */ + 0x9E, 0x64, 0x9E, 0x65, 0x9E, 0x66, 0x9E, 0x67, /* 0xC0-0xC3 */ + 0xBE, 0xF6, 0xBE, 0xF7, 0xBE, 0xF8, 0xBE, 0xF9, /* 0xC4-0xC7 */ + 0xBE, 0xFA, 0xBE, 0xFB, 0xBE, 0xFC, 0x9E, 0x68, /* 0xC8-0xCB */ + 0xBE, 0xFD, 0x9E, 0x69, 0xBE, 0xFE, 0x9E, 0x6A, /* 0xCC-0xCF */ + 0xBF, 0xA1, 0xBF, 0xA2, 0x9E, 0x6B, 0x9E, 0x6C, /* 0xD0-0xD3 */ + 
0xBF, 0xA3, 0x9E, 0x6D, 0x9E, 0x6E, 0x9E, 0x6F, /* 0xD4-0xD7 */ + 0xBF, 0xA4, 0x9E, 0x70, 0x9E, 0x71, 0x9E, 0x72, /* 0xD8-0xDB */ + 0x9E, 0x73, 0x9E, 0x74, 0x9E, 0x75, 0x9E, 0x76, /* 0xDC-0xDF */ + 0xBF, 0xA5, 0xBF, 0xA6, 0x9E, 0x77, 0xBF, 0xA7, /* 0xE0-0xE3 */ + 0x9E, 0x78, 0xBF, 0xA8, 0x9E, 0x79, 0x9E, 0x7A, /* 0xE4-0xE7 */ + 0x9E, 0x81, 0x9E, 0x82, 0x9E, 0x83, 0x9E, 0x84, /* 0xE8-0xEB */ + 0xBF, 0xA9, 0xBF, 0xAA, 0xBF, 0xAB, 0x9E, 0x85, /* 0xEC-0xEF */ + 0xBF, 0xAC, 0x9E, 0x86, 0x9E, 0x87, 0x9E, 0x88, /* 0xF0-0xF3 */ + 0xBF, 0xAD, 0x9E, 0x89, 0xBF, 0xAE, 0xBF, 0xAF, /* 0xF4-0xF7 */ + 0x9E, 0x8A, 0x9E, 0x8B, 0x9E, 0x8C, 0x9E, 0x8D, /* 0xF8-0xFB */ + 0xBF, 0xB0, 0xBF, 0xB1, 0xBF, 0xB2, 0xBF, 0xB3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C6[512] = { + 0xBF, 0xB4, 0xBF, 0xB5, 0x9E, 0x8E, 0x9E, 0x8F, /* 0x00-0x03 */ + 0x9E, 0x90, 0xBF, 0xB6, 0xBF, 0xB7, 0xBF, 0xB8, /* 0x04-0x07 */ + 0xBF, 0xB9, 0x9E, 0x91, 0x9E, 0x92, 0x9E, 0x93, /* 0x08-0x0B */ + 0xBF, 0xBA, 0x9E, 0x94, 0x9E, 0x95, 0x9E, 0x96, /* 0x0C-0x0F */ + 0xBF, 0xBB, 0x9E, 0x97, 0x9E, 0x98, 0x9E, 0x99, /* 0x10-0x13 */ + 0x9E, 0x9A, 0x9E, 0x9B, 0x9E, 0x9C, 0x9E, 0x9D, /* 0x14-0x17 */ + 0xBF, 0xBC, 0xBF, 0xBD, 0x9E, 0x9E, 0xBF, 0xBE, /* 0x18-0x1B */ + 0xBF, 0xBF, 0x9E, 0x9F, 0x9E, 0xA0, 0x9E, 0xA1, /* 0x1C-0x1F */ + 0x9E, 0xA2, 0x9E, 0xA3, 0x9E, 0xA4, 0x9E, 0xA5, /* 0x20-0x23 */ + 0xBF, 0xC0, 0xBF, 0xC1, 0x9E, 0xA6, 0x9E, 0xA7, /* 0x24-0x27 */ + 0xBF, 0xC2, 0x9E, 0xA8, 0x9E, 0xA9, 0x9E, 0xAA, /* 0x28-0x2B */ + 0xBF, 0xC3, 0xBF, 0xC4, 0xBF, 0xC5, 0x9E, 0xAB, /* 0x2C-0x2F */ + 0xBF, 0xC6, 0x9E, 0xAC, 0x9E, 0xAD, 0xBF, 0xC7, /* 0x30-0x33 */ + 0xBF, 0xC8, 0xBF, 0xC9, 0x9E, 0xAE, 0xBF, 0xCA, /* 0x34-0x37 */ + 0x9E, 0xAF, 0xBF, 0xCB, 0x9E, 0xB0, 0xBF, 0xCC, /* 0x38-0x3B */ + 0x9E, 0xB1, 0x9E, 0xB2, 0x9E, 0xB3, 0x9E, 0xB4, /* 0x3C-0x3F */ + 0xBF, 0xCD, 0xBF, 0xCE, 0x9E, 0xB5, 0x9E, 0xB6, /* 0x40-0x43 */ + 0xBF, 0xCF, 0x9E, 0xB7, 0x9E, 0xB8, 0x9E, 0xB9, /* 0x44-0x47 */ + 0xBF, 0xD0, 0x9E, 0xBA, 0x9E, 0xBB, 0x9E, 0xBC, /* 0x48-0x4B */ + 0x9E, 0xBD, 0x9E, 0xBE, 0x9E, 0xBF, 0x9E, 0xC0, /* 0x4C-0x4F */ + 0xBF, 0xD1, 0xBF, 0xD2, 0x9E, 0xC1, 0xBF, 0xD3, /* 0x50-0x53 */ + 0xBF, 0xD4, 0xBF, 0xD5, 0x9E, 0xC2, 0x9E, 0xC3, /* 0x54-0x57 */ + 0x9E, 0xC4, 0x9E, 0xC5, 0x9E, 0xC6, 0x9E, 0xC7, /* 0x58-0x5B */ + 0xBF, 0xD6, 0xBF, 0xD7, 0x9E, 0xC8, 0x9E, 0xC9, /* 0x5C-0x5F */ + 0xBF, 0xD8, 0x9E, 0xCA, 0x9E, 0xCB, 0x9E, 0xCC, /* 0x60-0x63 */ + 0x9E, 0xCD, 0x9E, 0xCE, 0x9E, 0xCF, 0x9E, 0xD0, /* 0x64-0x67 */ + 0x9E, 0xD1, 0x9E, 0xD2, 0x9E, 0xD3, 0x9E, 0xD4, /* 0x68-0x6B */ + 0xBF, 0xD9, 0x9E, 0xD5, 0x9E, 0xD6, 0xBF, 0xDA, /* 0x6C-0x6F */ + 0x9E, 0xD7, 0xBF, 0xDB, 0x9E, 0xD8, 0x9E, 0xD9, /* 0x70-0x73 */ + 0x9E, 0xDA, 0x9E, 0xDB, 0x9E, 0xDC, 0x9E, 0xDD, /* 0x74-0x77 */ + 0xBF, 0xDC, 0xBF, 0xDD, 0x9E, 0xDE, 0x9E, 0xDF, /* 0x78-0x7B */ + 0xBF, 0xDE, 0x9E, 0xE0, 0x9E, 0xE1, 0x9E, 0xE2, /* 0x7C-0x7F */ + + 0xBF, 0xDF, 0x9E, 0xE3, 0x9E, 0xE4, 0x9E, 0xE5, /* 0x80-0x83 */ + 0x9E, 0xE6, 0x9E, 0xE7, 0x9E, 0xE8, 0x9E, 0xE9, /* 0x84-0x87 */ + 0xBF, 0xE0, 0xBF, 0xE1, 0x9E, 0xEA, 0xBF, 0xE2, /* 0x88-0x8B */ + 0x9E, 0xEB, 0xBF, 0xE3, 0x9E, 0xEC, 0x9E, 0xED, /* 0x8C-0x8F */ + 0x9E, 0xEE, 0x9E, 0xEF, 0x9E, 0xF0, 0x9E, 0xF1, /* 0x90-0x93 */ + 0xBF, 0xE4, 0xBF, 0xE5, 0x9E, 0xF2, 0x9E, 0xF3, /* 0x94-0x97 */ + 0xBF, 0xE6, 0x9E, 0xF4, 0x9E, 0xF5, 0x9E, 0xF6, /* 0x98-0x9B */ + 0xBF, 0xE7, 0x9E, 0xF7, 0x9E, 0xF8, 0x9E, 0xF9, /* 0x9C-0x9F */ + 0x9E, 0xFA, 0x9E, 0xFB, 0x9E, 0xFC, 0x9E, 0xFD, /* 0xA0-0xA3 */ + 0xBF, 0xE8, 0xBF, 0xE9, 0x9E, 0xFE, 0xBF, 0xEA, /* 0xA4-0xA7 */ + 
0x9F, 0x41, 0xBF, 0xEB, 0x9F, 0x42, 0x9F, 0x43, /* 0xA8-0xAB */ + 0x9F, 0x44, 0x9F, 0x45, 0x9F, 0x46, 0x9F, 0x47, /* 0xAC-0xAF */ + 0xBF, 0xEC, 0xBF, 0xED, 0x9F, 0x48, 0x9F, 0x49, /* 0xB0-0xB3 */ + 0xBF, 0xEE, 0x9F, 0x4A, 0x9F, 0x4B, 0x9F, 0x4C, /* 0xB4-0xB7 */ + 0xBF, 0xEF, 0xBF, 0xF0, 0xBF, 0xF1, 0x9F, 0x4D, /* 0xB8-0xBB */ + 0x9F, 0x4E, 0x9F, 0x4F, 0x9F, 0x50, 0x9F, 0x51, /* 0xBC-0xBF */ + 0xBF, 0xF2, 0xBF, 0xF3, 0x9F, 0x52, 0xBF, 0xF4, /* 0xC0-0xC3 */ + 0x9F, 0x53, 0xBF, 0xF5, 0x9F, 0x54, 0x9F, 0x55, /* 0xC4-0xC7 */ + 0x9F, 0x56, 0x9F, 0x57, 0x9F, 0x58, 0x9F, 0x59, /* 0xC8-0xCB */ + 0xBF, 0xF6, 0xBF, 0xF7, 0x9F, 0x5A, 0x9F, 0x61, /* 0xCC-0xCF */ + 0xBF, 0xF8, 0x9F, 0x62, 0x9F, 0x63, 0x9F, 0x64, /* 0xD0-0xD3 */ + 0xBF, 0xF9, 0x9F, 0x65, 0x9F, 0x66, 0x9F, 0x67, /* 0xD4-0xD7 */ + 0x9F, 0x68, 0x9F, 0x69, 0x9F, 0x6A, 0x9F, 0x6B, /* 0xD8-0xDB */ + 0xBF, 0xFA, 0xBF, 0xFB, 0x9F, 0x6C, 0x9F, 0x6D, /* 0xDC-0xDF */ + 0xBF, 0xFC, 0xBF, 0xFD, 0x9F, 0x6E, 0x9F, 0x6F, /* 0xE0-0xE3 */ + 0x9F, 0x70, 0x9F, 0x71, 0x9F, 0x72, 0x9F, 0x73, /* 0xE4-0xE7 */ + 0xBF, 0xFE, 0xC0, 0xA1, 0x9F, 0x74, 0x9F, 0x75, /* 0xE8-0xEB */ + 0xC0, 0xA2, 0x9F, 0x76, 0x9F, 0x77, 0x9F, 0x78, /* 0xEC-0xEF */ + 0xC0, 0xA3, 0x9F, 0x79, 0x9F, 0x7A, 0x9F, 0x81, /* 0xF0-0xF3 */ + 0x9F, 0x82, 0x9F, 0x83, 0x9F, 0x84, 0x9F, 0x85, /* 0xF4-0xF7 */ + 0xC0, 0xA4, 0xC0, 0xA5, 0x9F, 0x86, 0x9F, 0x87, /* 0xF8-0xFB */ + 0x9F, 0x88, 0xC0, 0xA6, 0x9F, 0x89, 0x9F, 0x8A, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C7[512] = { + 0x9F, 0x8B, 0x9F, 0x8C, 0x9F, 0x8D, 0x9F, 0x8E, /* 0x00-0x03 */ + 0xC0, 0xA7, 0xC0, 0xA8, 0x9F, 0x8F, 0x9F, 0x90, /* 0x04-0x07 */ + 0xC0, 0xA9, 0x9F, 0x91, 0x9F, 0x92, 0x9F, 0x93, /* 0x08-0x0B */ + 0xC0, 0xAA, 0x9F, 0x94, 0x9F, 0x95, 0x9F, 0x96, /* 0x0C-0x0F */ + 0x9F, 0x97, 0x9F, 0x98, 0x9F, 0x99, 0x9F, 0x9A, /* 0x10-0x13 */ + 0xC0, 0xAB, 0xC0, 0xAC, 0x9F, 0x9B, 0xC0, 0xAD, /* 0x14-0x17 */ + 0x9F, 0x9C, 0xC0, 0xAE, 0x9F, 0x9D, 0x9F, 0x9E, /* 0x18-0x1B */ + 0x9F, 0x9F, 0x9F, 0xA0, 0x9F, 0xA1, 0x9F, 0xA2, /* 0x1C-0x1F */ + 0xC0, 0xAF, 0xC0, 0xB0, 0x9F, 0xA3, 0x9F, 0xA4, /* 0x20-0x23 */ + 0xC0, 0xB1, 0x9F, 0xA5, 0x9F, 0xA6, 0x9F, 0xA7, /* 0x24-0x27 */ + 0xC0, 0xB2, 0x9F, 0xA8, 0x9F, 0xA9, 0x9F, 0xAA, /* 0x28-0x2B */ + 0x9F, 0xAB, 0x9F, 0xAC, 0x9F, 0xAD, 0x9F, 0xAE, /* 0x2C-0x2F */ + 0xC0, 0xB3, 0xC0, 0xB4, 0x9F, 0xAF, 0xC0, 0xB5, /* 0x30-0x33 */ + 0x9F, 0xB0, 0xC0, 0xB6, 0x9F, 0xB1, 0xC0, 0xB7, /* 0x34-0x37 */ + 0x9F, 0xB2, 0x9F, 0xB3, 0x9F, 0xB4, 0x9F, 0xB5, /* 0x38-0x3B */ + 0xC0, 0xB8, 0xC0, 0xB9, 0x9F, 0xB6, 0x9F, 0xB7, /* 0x3C-0x3F */ + 0xC0, 0xBA, 0x9F, 0xB8, 0x9F, 0xB9, 0x9F, 0xBA, /* 0x40-0x43 */ + 0xC0, 0xBB, 0x9F, 0xBB, 0x9F, 0xBC, 0x9F, 0xBD, /* 0x44-0x47 */ + 0x9F, 0xBE, 0x9F, 0xBF, 0xC0, 0xBC, 0x9F, 0xC0, /* 0x48-0x4B */ + 0xC0, 0xBD, 0xC0, 0xBE, 0x9F, 0xC1, 0xC0, 0xBF, /* 0x4C-0x4F */ + 0x9F, 0xC2, 0xC0, 0xC0, 0xC0, 0xC1, 0xC0, 0xC2, /* 0x50-0x53 */ + 0xC0, 0xC3, 0xC0, 0xC4, 0xC0, 0xC5, 0xC0, 0xC6, /* 0x54-0x57 */ + 0xC0, 0xC7, 0x9F, 0xC3, 0x9F, 0xC4, 0x9F, 0xC5, /* 0x58-0x5B */ + 0xC0, 0xC8, 0x9F, 0xC6, 0x9F, 0xC7, 0x9F, 0xC8, /* 0x5C-0x5F */ + 0xC0, 0xC9, 0x9F, 0xC9, 0x9F, 0xCA, 0x9F, 0xCB, /* 0x60-0x63 */ + 0x9F, 0xCC, 0x9F, 0xCD, 0x9F, 0xCE, 0x9F, 0xCF, /* 0x64-0x67 */ + 0xC0, 0xCA, 0x9F, 0xD0, 0x9F, 0xD1, 0xC0, 0xCB, /* 0x68-0x6B */ + 0x9F, 0xD2, 0x9F, 0xD3, 0x9F, 0xD4, 0x9F, 0xD5, /* 0x6C-0x6F */ + 0x9F, 0xD6, 0x9F, 0xD7, 0x9F, 0xD8, 0x9F, 0xD9, /* 0x70-0x73 */ + 0xC0, 0xCC, 0xC0, 0xCD, 0x9F, 0xDA, 0x9F, 0xDB, /* 0x74-0x77 */ + 0xC0, 0xCE, 0x9F, 0xDC, 0x9F, 0xDD, 0x9F, 0xDE, /* 0x78-0x7B */ + 0xC0, 
0xCF, 0xC0, 0xD0, 0xC0, 0xD1, 0x9F, 0xDF, /* 0x7C-0x7F */ + + 0x9F, 0xE0, 0x9F, 0xE1, 0x9F, 0xE2, 0xC0, 0xD2, /* 0x80-0x83 */ + 0xC0, 0xD3, 0xC0, 0xD4, 0x9F, 0xE3, 0xC0, 0xD5, /* 0x84-0x87 */ + 0xC0, 0xD6, 0xC0, 0xD7, 0xC0, 0xD8, 0x9F, 0xE4, /* 0x88-0x8B */ + 0x9F, 0xE5, 0x9F, 0xE6, 0xC0, 0xD9, 0x9F, 0xE7, /* 0x8C-0x8F */ + 0xC0, 0xDA, 0xC0, 0xDB, 0x9F, 0xE8, 0x9F, 0xE9, /* 0x90-0x93 */ + 0xC0, 0xDC, 0x9F, 0xEA, 0xC0, 0xDD, 0xC0, 0xDE, /* 0x94-0x97 */ + 0xC0, 0xDF, 0x9F, 0xEB, 0xC0, 0xE0, 0x9F, 0xEC, /* 0x98-0x9B */ + 0x9F, 0xED, 0x9F, 0xEE, 0x9F, 0xEF, 0x9F, 0xF0, /* 0x9C-0x9F */ + 0xC0, 0xE1, 0xC0, 0xE2, 0x9F, 0xF1, 0xC0, 0xE3, /* 0xA0-0xA3 */ + 0xC0, 0xE4, 0xC0, 0xE5, 0xC0, 0xE6, 0x9F, 0xF2, /* 0xA4-0xA7 */ + 0x9F, 0xF3, 0x9F, 0xF4, 0x9F, 0xF5, 0x9F, 0xF6, /* 0xA8-0xAB */ + 0xC0, 0xE7, 0xC0, 0xE8, 0x9F, 0xF7, 0x9F, 0xF8, /* 0xAC-0xAF */ + 0xC0, 0xE9, 0x9F, 0xF9, 0x9F, 0xFA, 0x9F, 0xFB, /* 0xB0-0xB3 */ + 0xC0, 0xEA, 0x9F, 0xFC, 0x9F, 0xFD, 0x9F, 0xFE, /* 0xB4-0xB7 */ + 0xA0, 0x41, 0xA0, 0x42, 0xA0, 0x43, 0xA0, 0x44, /* 0xB8-0xBB */ + 0xC0, 0xEB, 0xC0, 0xEC, 0xA0, 0x45, 0xC0, 0xED, /* 0xBC-0xBF */ + 0xC0, 0xEE, 0xC0, 0xEF, 0xA0, 0x46, 0xA0, 0x47, /* 0xC0-0xC3 */ + 0xA0, 0x48, 0xA0, 0x49, 0xA0, 0x4A, 0xA0, 0x4B, /* 0xC4-0xC7 */ + 0xC0, 0xF0, 0xC0, 0xF1, 0xA0, 0x4C, 0xA0, 0x4D, /* 0xC8-0xCB */ + 0xC0, 0xF2, 0xA0, 0x4E, 0xC0, 0xF3, 0xA0, 0x4F, /* 0xCC-0xCF */ + 0xC0, 0xF4, 0xA0, 0x50, 0xA0, 0x51, 0xA0, 0x52, /* 0xD0-0xD3 */ + 0xA0, 0x53, 0xA0, 0x54, 0xA0, 0x55, 0xA0, 0x56, /* 0xD4-0xD7 */ + 0xC0, 0xF5, 0xA0, 0x57, 0xA0, 0x58, 0xA0, 0x59, /* 0xD8-0xDB */ + 0xA0, 0x5A, 0xC0, 0xF6, 0xA0, 0x61, 0xA0, 0x62, /* 0xDC-0xDF */ + 0xA0, 0x63, 0xA0, 0x64, 0xA0, 0x65, 0xA0, 0x66, /* 0xE0-0xE3 */ + 0xC0, 0xF7, 0xA0, 0x67, 0xA0, 0x68, 0xA0, 0x69, /* 0xE4-0xE7 */ + 0xC0, 0xF8, 0xA0, 0x6A, 0xA0, 0x6B, 0xA0, 0x6C, /* 0xE8-0xEB */ + 0xC0, 0xF9, 0xA0, 0x6D, 0xA0, 0x6E, 0xA0, 0x6F, /* 0xEC-0xEF */ + 0xA0, 0x70, 0xA0, 0x71, 0xA0, 0x72, 0xA0, 0x73, /* 0xF0-0xF3 */ + 0xA0, 0x74, 0xA0, 0x75, 0xA0, 0x76, 0xA0, 0x77, /* 0xF4-0xF7 */ + 0xA0, 0x78, 0xA0, 0x79, 0xA0, 0x7A, 0xA0, 0x81, /* 0xF8-0xFB */ + 0xA0, 0x82, 0xA0, 0x83, 0xA0, 0x84, 0xA0, 0x85, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C8[512] = { + 0xC0, 0xFA, 0xC0, 0xFB, 0xA0, 0x86, 0xA0, 0x87, /* 0x00-0x03 */ + 0xC0, 0xFC, 0xA0, 0x88, 0xA0, 0x89, 0xA0, 0x8A, /* 0x04-0x07 */ + 0xC0, 0xFD, 0xA0, 0x8B, 0xC0, 0xFE, 0xA0, 0x8C, /* 0x08-0x0B */ + 0xA0, 0x8D, 0xA0, 0x8E, 0xA0, 0x8F, 0xA0, 0x90, /* 0x0C-0x0F */ + 0xC1, 0xA1, 0xC1, 0xA2, 0xA0, 0x91, 0xC1, 0xA3, /* 0x10-0x13 */ + 0xA0, 0x92, 0xC1, 0xA4, 0xC1, 0xA5, 0xA0, 0x93, /* 0x14-0x17 */ + 0xA0, 0x94, 0xA0, 0x95, 0xA0, 0x96, 0xA0, 0x97, /* 0x18-0x1B */ + 0xC1, 0xA6, 0xC1, 0xA7, 0xA0, 0x98, 0xA0, 0x99, /* 0x1C-0x1F */ + 0xC1, 0xA8, 0xA0, 0x9A, 0xA0, 0x9B, 0xA0, 0x9C, /* 0x20-0x23 */ + 0xC1, 0xA9, 0xA0, 0x9D, 0xA0, 0x9E, 0xA0, 0x9F, /* 0x24-0x27 */ + 0xA0, 0xA0, 0xA0, 0xA1, 0xA0, 0xA2, 0xA0, 0xA3, /* 0x28-0x2B */ + 0xC1, 0xAA, 0xC1, 0xAB, 0xA0, 0xA4, 0xC1, 0xAC, /* 0x2C-0x2F */ + 0xA0, 0xA5, 0xC1, 0xAD, 0xA0, 0xA6, 0xA0, 0xA7, /* 0x30-0x33 */ + 0xA0, 0xA8, 0xA0, 0xA9, 0xA0, 0xAA, 0xA0, 0xAB, /* 0x34-0x37 */ + 0xC1, 0xAE, 0xA0, 0xAC, 0xA0, 0xAD, 0xA0, 0xAE, /* 0x38-0x3B */ + 0xC1, 0xAF, 0xA0, 0xAF, 0xA0, 0xB0, 0xA0, 0xB1, /* 0x3C-0x3F */ + 0xC1, 0xB0, 0xA0, 0xB2, 0xA0, 0xB3, 0xA0, 0xB4, /* 0x40-0x43 */ + 0xA0, 0xB5, 0xA0, 0xB6, 0xA0, 0xB7, 0xA0, 0xB8, /* 0x44-0x47 */ + 0xC1, 0xB1, 0xC1, 0xB2, 0xA0, 0xB9, 0xA0, 0xBA, /* 0x48-0x4B */ + 0xC1, 0xB3, 0xC1, 0xB4, 0xA0, 0xBB, 0xA0, 0xBC, /* 0x4C-0x4F */ + 0xA0, 
0xBD, 0xA0, 0xBE, 0xA0, 0xBF, 0xA0, 0xC0, /* 0x50-0x53 */ + 0xC1, 0xB5, 0xA0, 0xC1, 0xA0, 0xC2, 0xA0, 0xC3, /* 0x54-0x57 */ + 0xA0, 0xC4, 0xA0, 0xC5, 0xA0, 0xC6, 0xA0, 0xC7, /* 0x58-0x5B */ + 0xA0, 0xC8, 0xA0, 0xC9, 0xA0, 0xCA, 0xA0, 0xCB, /* 0x5C-0x5F */ + 0xA0, 0xCC, 0xA0, 0xCD, 0xA0, 0xCE, 0xA0, 0xCF, /* 0x60-0x63 */ + 0xA0, 0xD0, 0xA0, 0xD1, 0xA0, 0xD2, 0xA0, 0xD3, /* 0x64-0x67 */ + 0xA0, 0xD4, 0xA0, 0xD5, 0xA0, 0xD6, 0xA0, 0xD7, /* 0x68-0x6B */ + 0xA0, 0xD8, 0xA0, 0xD9, 0xA0, 0xDA, 0xA0, 0xDB, /* 0x6C-0x6F */ + 0xC1, 0xB6, 0xC1, 0xB7, 0xA0, 0xDC, 0xA0, 0xDD, /* 0x70-0x73 */ + 0xC1, 0xB8, 0xA0, 0xDE, 0xA0, 0xDF, 0xA0, 0xE0, /* 0x74-0x77 */ + 0xC1, 0xB9, 0xA0, 0xE1, 0xC1, 0xBA, 0xA0, 0xE2, /* 0x78-0x7B */ + 0xA0, 0xE3, 0xA0, 0xE4, 0xA0, 0xE5, 0xA0, 0xE6, /* 0x7C-0x7F */ + + 0xC1, 0xBB, 0xC1, 0xBC, 0xA0, 0xE7, 0xC1, 0xBD, /* 0x80-0x83 */ + 0xA0, 0xE8, 0xC1, 0xBE, 0xC1, 0xBF, 0xC1, 0xC0, /* 0x84-0x87 */ + 0xA0, 0xE9, 0xA0, 0xEA, 0xA0, 0xEB, 0xC1, 0xC1, /* 0x88-0x8B */ + 0xC1, 0xC2, 0xC1, 0xC3, 0xA0, 0xEC, 0xA0, 0xED, /* 0x8C-0x8F */ + 0xA0, 0xEE, 0xA0, 0xEF, 0xA0, 0xF0, 0xA0, 0xF1, /* 0x90-0x93 */ + 0xC1, 0xC4, 0xA0, 0xF2, 0xA0, 0xF3, 0xA0, 0xF4, /* 0x94-0x97 */ + 0xA0, 0xF5, 0xA0, 0xF6, 0xA0, 0xF7, 0xA0, 0xF8, /* 0x98-0x9B */ + 0xA0, 0xF9, 0xC1, 0xC5, 0xA0, 0xFA, 0xC1, 0xC6, /* 0x9C-0x9F */ + 0xA0, 0xFB, 0xC1, 0xC7, 0xA0, 0xFC, 0xA0, 0xFD, /* 0xA0-0xA3 */ + 0xA0, 0xFE, 0xA1, 0x41, 0xA1, 0x42, 0xA1, 0x43, /* 0xA4-0xA7 */ + 0xC1, 0xC8, 0xA1, 0x44, 0xA1, 0x45, 0xA1, 0x46, /* 0xA8-0xAB */ + 0xA1, 0x47, 0xA1, 0x48, 0xA1, 0x49, 0xA1, 0x4A, /* 0xAC-0xAF */ + 0xA1, 0x4B, 0xA1, 0x4C, 0xA1, 0x4D, 0xA1, 0x4E, /* 0xB0-0xB3 */ + 0xA1, 0x4F, 0xA1, 0x50, 0xA1, 0x51, 0xA1, 0x52, /* 0xB4-0xB7 */ + 0xA1, 0x53, 0xA1, 0x54, 0xA1, 0x55, 0xA1, 0x56, /* 0xB8-0xBB */ + 0xC1, 0xC9, 0xC1, 0xCA, 0xA1, 0x57, 0xA1, 0x58, /* 0xBC-0xBF */ + 0xA1, 0x59, 0xA1, 0x5A, 0xA1, 0x61, 0xA1, 0x62, /* 0xC0-0xC3 */ + 0xC1, 0xCB, 0xA1, 0x63, 0xA1, 0x64, 0xA1, 0x65, /* 0xC4-0xC7 */ + 0xC1, 0xCC, 0xA1, 0x66, 0xA1, 0x67, 0xA1, 0x68, /* 0xC8-0xCB */ + 0xC1, 0xCD, 0xA1, 0x69, 0xA1, 0x6A, 0xA1, 0x6B, /* 0xCC-0xCF */ + 0xA1, 0x6C, 0xA1, 0x6D, 0xA1, 0x6E, 0xA1, 0x6F, /* 0xD0-0xD3 */ + 0xC1, 0xCE, 0xC1, 0xCF, 0xA1, 0x70, 0xC1, 0xD0, /* 0xD4-0xD7 */ + 0xA1, 0x71, 0xC1, 0xD1, 0xA1, 0x72, 0xA1, 0x73, /* 0xD8-0xDB */ + 0xA1, 0x74, 0xA1, 0x75, 0xA1, 0x76, 0xA1, 0x77, /* 0xDC-0xDF */ + 0xC1, 0xD2, 0xC1, 0xD3, 0xA1, 0x78, 0xA1, 0x79, /* 0xE0-0xE3 */ + 0xC1, 0xD4, 0xA1, 0x7A, 0xA1, 0x81, 0xA1, 0x82, /* 0xE4-0xE7 */ + 0xA1, 0x83, 0xA1, 0x84, 0xA1, 0x85, 0xA1, 0x86, /* 0xE8-0xEB */ + 0xA1, 0x87, 0xA1, 0x88, 0xA1, 0x89, 0xA1, 0x8A, /* 0xEC-0xEF */ + 0xA1, 0x8B, 0xA1, 0x8C, 0xA1, 0x8D, 0xA1, 0x8E, /* 0xF0-0xF3 */ + 0xA1, 0x8F, 0xC1, 0xD5, 0xA1, 0x90, 0xA1, 0x91, /* 0xF4-0xF7 */ + 0xA1, 0x92, 0xA1, 0x93, 0xA1, 0x94, 0xA1, 0x95, /* 0xF8-0xFB */ + 0xC1, 0xD6, 0xC1, 0xD7, 0xA1, 0x96, 0xA1, 0x97, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_C9[512] = { + 0xC1, 0xD8, 0xA1, 0x98, 0xA1, 0x99, 0xA1, 0x9A, /* 0x00-0x03 */ + 0xC1, 0xD9, 0xC1, 0xDA, 0xC1, 0xDB, 0xA1, 0x9B, /* 0x04-0x07 */ + 0xA1, 0x9C, 0xA1, 0x9D, 0xA1, 0x9E, 0xA1, 0x9F, /* 0x08-0x0B */ + 0xC1, 0xDC, 0xC1, 0xDD, 0xA1, 0xA0, 0xC1, 0xDE, /* 0x0C-0x0F */ + 0xA2, 0x41, 0xC1, 0xDF, 0xA2, 0x42, 0xA2, 0x43, /* 0x10-0x13 */ + 0xA2, 0x44, 0xA2, 0x45, 0xA2, 0x46, 0xA2, 0x47, /* 0x14-0x17 */ + 0xC1, 0xE0, 0xA2, 0x48, 0xA2, 0x49, 0xA2, 0x4A, /* 0x18-0x1B */ + 0xA2, 0x4B, 0xA2, 0x4C, 0xA2, 0x4D, 0xA2, 0x4E, /* 0x1C-0x1F */ + 0xA2, 0x4F, 0xA2, 0x50, 0xA2, 0x51, 0xA2, 0x52, /* 0x20-0x23 */ + 0xA2, 
0x53, 0xA2, 0x54, 0xA2, 0x55, 0xA2, 0x56, /* 0x24-0x27 */ + 0xA2, 0x57, 0xA2, 0x58, 0xA2, 0x59, 0xA2, 0x5A, /* 0x28-0x2B */ + 0xC1, 0xE1, 0xA2, 0x61, 0xA2, 0x62, 0xA2, 0x63, /* 0x2C-0x2F */ + 0xA2, 0x64, 0xA2, 0x65, 0xA2, 0x66, 0xA2, 0x67, /* 0x30-0x33 */ + 0xC1, 0xE2, 0xA2, 0x68, 0xA2, 0x69, 0xA2, 0x6A, /* 0x34-0x37 */ + 0xA2, 0x6B, 0xA2, 0x6C, 0xA2, 0x6D, 0xA2, 0x6E, /* 0x38-0x3B */ + 0xA2, 0x6F, 0xA2, 0x70, 0xA2, 0x71, 0xA2, 0x72, /* 0x3C-0x3F */ + 0xA2, 0x73, 0xA2, 0x74, 0xA2, 0x75, 0xA2, 0x76, /* 0x40-0x43 */ + 0xA2, 0x77, 0xA2, 0x78, 0xA2, 0x79, 0xA2, 0x7A, /* 0x44-0x47 */ + 0xA2, 0x81, 0xA2, 0x82, 0xA2, 0x83, 0xA2, 0x84, /* 0x48-0x4B */ + 0xA2, 0x85, 0xA2, 0x86, 0xA2, 0x87, 0xA2, 0x88, /* 0x4C-0x4F */ + 0xC1, 0xE3, 0xC1, 0xE4, 0xA2, 0x89, 0xA2, 0x8A, /* 0x50-0x53 */ + 0xC1, 0xE5, 0xA2, 0x8B, 0xA2, 0x8C, 0xA2, 0x8D, /* 0x54-0x57 */ + 0xC1, 0xE6, 0xA2, 0x8E, 0xA2, 0x8F, 0xA2, 0x90, /* 0x58-0x5B */ + 0xA2, 0x91, 0xA2, 0x92, 0xA2, 0x93, 0xA2, 0x94, /* 0x5C-0x5F */ + 0xC1, 0xE7, 0xC1, 0xE8, 0xA2, 0x95, 0xC1, 0xE9, /* 0x60-0x63 */ + 0xA2, 0x96, 0xA2, 0x97, 0xA2, 0x98, 0xA2, 0x99, /* 0x64-0x67 */ + 0xA2, 0x9A, 0xA2, 0x9B, 0xA2, 0x9C, 0xA2, 0x9D, /* 0x68-0x6B */ + 0xC1, 0xEA, 0xA2, 0x9E, 0xA2, 0x9F, 0xA2, 0xA0, /* 0x6C-0x6F */ + 0xC1, 0xEB, 0xA3, 0x41, 0xA3, 0x42, 0xA3, 0x43, /* 0x70-0x73 */ + 0xC1, 0xEC, 0xA3, 0x44, 0xA3, 0x45, 0xA3, 0x46, /* 0x74-0x77 */ + 0xA3, 0x47, 0xA3, 0x48, 0xA3, 0x49, 0xA3, 0x4A, /* 0x78-0x7B */ + 0xC1, 0xED, 0xA3, 0x4B, 0xA3, 0x4C, 0xA3, 0x4D, /* 0x7C-0x7F */ + + 0xA3, 0x4E, 0xA3, 0x4F, 0xA3, 0x50, 0xA3, 0x51, /* 0x80-0x83 */ + 0xA3, 0x52, 0xA3, 0x53, 0xA3, 0x54, 0xA3, 0x55, /* 0x84-0x87 */ + 0xC1, 0xEE, 0xC1, 0xEF, 0xA3, 0x56, 0xA3, 0x57, /* 0x88-0x8B */ + 0xC1, 0xF0, 0xA3, 0x58, 0xA3, 0x59, 0xA3, 0x5A, /* 0x8C-0x8F */ + 0xC1, 0xF1, 0xA3, 0x61, 0xA3, 0x62, 0xA3, 0x63, /* 0x90-0x93 */ + 0xA3, 0x64, 0xA3, 0x65, 0xA3, 0x66, 0xA3, 0x67, /* 0x94-0x97 */ + 0xC1, 0xF2, 0xC1, 0xF3, 0xA3, 0x68, 0xC1, 0xF4, /* 0x98-0x9B */ + 0xA3, 0x69, 0xC1, 0xF5, 0xA3, 0x6A, 0xA3, 0x6B, /* 0x9C-0x9F */ + 0xA3, 0x6C, 0xA3, 0x6D, 0xA3, 0x6E, 0xA3, 0x6F, /* 0xA0-0xA3 */ + 0xA3, 0x70, 0xA3, 0x71, 0xA3, 0x72, 0xA3, 0x73, /* 0xA4-0xA7 */ + 0xA3, 0x74, 0xA3, 0x75, 0xA3, 0x76, 0xA3, 0x77, /* 0xA8-0xAB */ + 0xA3, 0x78, 0xA3, 0x79, 0xA3, 0x7A, 0xA3, 0x81, /* 0xAC-0xAF */ + 0xA3, 0x82, 0xA3, 0x83, 0xA3, 0x84, 0xA3, 0x85, /* 0xB0-0xB3 */ + 0xA3, 0x86, 0xA3, 0x87, 0xA3, 0x88, 0xA3, 0x89, /* 0xB4-0xB7 */ + 0xA3, 0x8A, 0xA3, 0x8B, 0xA3, 0x8C, 0xA3, 0x8D, /* 0xB8-0xBB */ + 0xA3, 0x8E, 0xA3, 0x8F, 0xA3, 0x90, 0xA3, 0x91, /* 0xBC-0xBF */ + 0xC1, 0xF6, 0xC1, 0xF7, 0xA3, 0x92, 0xA3, 0x93, /* 0xC0-0xC3 */ + 0xC1, 0xF8, 0xA3, 0x94, 0xA3, 0x95, 0xC1, 0xF9, /* 0xC4-0xC7 */ + 0xC1, 0xFA, 0xA3, 0x96, 0xC1, 0xFB, 0xA3, 0x97, /* 0xC8-0xCB */ + 0xA3, 0x98, 0xA3, 0x99, 0xA3, 0x9A, 0xA3, 0x9B, /* 0xCC-0xCF */ + 0xC1, 0xFC, 0xC1, 0xFD, 0xA3, 0x9C, 0xC1, 0xFE, /* 0xD0-0xD3 */ + 0xA3, 0x9D, 0xC2, 0xA1, 0xC2, 0xA2, 0xA3, 0x9E, /* 0xD4-0xD7 */ + 0xA3, 0x9F, 0xC2, 0xA3, 0xC2, 0xA4, 0xA3, 0xA0, /* 0xD8-0xDB */ + 0xC2, 0xA5, 0xC2, 0xA6, 0xA4, 0x41, 0xA4, 0x42, /* 0xDC-0xDF */ + 0xC2, 0xA7, 0xA4, 0x43, 0xC2, 0xA8, 0xA4, 0x44, /* 0xE0-0xE3 */ + 0xC2, 0xA9, 0xA4, 0x45, 0xA4, 0x46, 0xC2, 0xAA, /* 0xE4-0xE7 */ + 0xA4, 0x47, 0xA4, 0x48, 0xA4, 0x49, 0xA4, 0x4A, /* 0xE8-0xEB */ + 0xC2, 0xAB, 0xC2, 0xAC, 0xA4, 0x4B, 0xC2, 0xAD, /* 0xEC-0xEF */ + 0xC2, 0xAE, 0xC2, 0xAF, 0xA4, 0x4C, 0xA4, 0x4D, /* 0xF0-0xF3 */ + 0xA4, 0x4E, 0xA4, 0x4F, 0xA4, 0x50, 0xA4, 0x51, /* 0xF4-0xF7 */ + 0xC2, 0xB0, 0xC2, 0xB1, 0xA4, 0x52, 0xA4, 0x53, /* 0xF8-0xFB 
*/ + 0xC2, 0xB2, 0xA4, 0x54, 0xA4, 0x55, 0xA4, 0x56, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CA[512] = { + 0xC2, 0xB3, 0xA4, 0x57, 0xA4, 0x58, 0xA4, 0x59, /* 0x00-0x03 */ + 0xA4, 0x5A, 0xA4, 0x61, 0xA4, 0x62, 0xA4, 0x63, /* 0x04-0x07 */ + 0xC2, 0xB4, 0xC2, 0xB5, 0xA4, 0x64, 0xC2, 0xB6, /* 0x08-0x0B */ + 0xC2, 0xB7, 0xC2, 0xB8, 0xA4, 0x65, 0xA4, 0x66, /* 0x0C-0x0F */ + 0xA4, 0x67, 0xA4, 0x68, 0xA4, 0x69, 0xA4, 0x6A, /* 0x10-0x13 */ + 0xC2, 0xB9, 0xA4, 0x6B, 0xA4, 0x6C, 0xA4, 0x6D, /* 0x14-0x17 */ + 0xC2, 0xBA, 0xA4, 0x6E, 0xA4, 0x6F, 0xA4, 0x70, /* 0x18-0x1B */ + 0xA4, 0x71, 0xA4, 0x72, 0xA4, 0x73, 0xA4, 0x74, /* 0x1C-0x1F */ + 0xA4, 0x75, 0xA4, 0x76, 0xA4, 0x77, 0xA4, 0x78, /* 0x20-0x23 */ + 0xA4, 0x79, 0xA4, 0x7A, 0xA4, 0x81, 0xA4, 0x82, /* 0x24-0x27 */ + 0xA4, 0x83, 0xC2, 0xBB, 0xA4, 0x84, 0xA4, 0x85, /* 0x28-0x2B */ + 0xA4, 0x86, 0xA4, 0x87, 0xA4, 0x88, 0xA4, 0x89, /* 0x2C-0x2F */ + 0xA4, 0x8A, 0xA4, 0x8B, 0xA4, 0x8C, 0xA4, 0x8D, /* 0x30-0x33 */ + 0xA4, 0x8E, 0xA4, 0x8F, 0xA4, 0x90, 0xA4, 0x91, /* 0x34-0x37 */ + 0xA4, 0x92, 0xA4, 0x93, 0xA4, 0x94, 0xA4, 0x95, /* 0x38-0x3B */ + 0xA4, 0x96, 0xA4, 0x97, 0xA4, 0x98, 0xA4, 0x99, /* 0x3C-0x3F */ + 0xA4, 0x9A, 0xA4, 0x9B, 0xA4, 0x9C, 0xA4, 0x9D, /* 0x40-0x43 */ + 0xA4, 0x9E, 0xA4, 0x9F, 0xA4, 0xA0, 0xA5, 0x41, /* 0x44-0x47 */ + 0xA5, 0x42, 0xA5, 0x43, 0xA5, 0x44, 0xA5, 0x45, /* 0x48-0x4B */ + 0xC2, 0xBC, 0xC2, 0xBD, 0xA5, 0x46, 0xA5, 0x47, /* 0x4C-0x4F */ + 0xC2, 0xBE, 0xA5, 0x48, 0xA5, 0x49, 0xA5, 0x4A, /* 0x50-0x53 */ + 0xC2, 0xBF, 0xA5, 0x4B, 0xA5, 0x4C, 0xA5, 0x4D, /* 0x54-0x57 */ + 0xA5, 0x4E, 0xA5, 0x4F, 0xA5, 0x50, 0xA5, 0x51, /* 0x58-0x5B */ + 0xC2, 0xC0, 0xC2, 0xC1, 0xA5, 0x52, 0xC2, 0xC2, /* 0x5C-0x5F */ + 0xC2, 0xC3, 0xC2, 0xC4, 0xA5, 0x53, 0xA5, 0x54, /* 0x60-0x63 */ + 0xA5, 0x55, 0xA5, 0x56, 0xA5, 0x57, 0xA5, 0x58, /* 0x64-0x67 */ + 0xC2, 0xC5, 0xA5, 0x59, 0xA5, 0x5A, 0xA5, 0x61, /* 0x68-0x6B */ + 0xA5, 0x62, 0xA5, 0x63, 0xA5, 0x64, 0xA5, 0x65, /* 0x6C-0x6F */ + 0xA5, 0x66, 0xA5, 0x67, 0xA5, 0x68, 0xA5, 0x69, /* 0x70-0x73 */ + 0xA5, 0x6A, 0xA5, 0x6B, 0xA5, 0x6C, 0xA5, 0x6D, /* 0x74-0x77 */ + 0xA5, 0x6E, 0xA5, 0x6F, 0xA5, 0x70, 0xA5, 0x71, /* 0x78-0x7B */ + 0xA5, 0x72, 0xC2, 0xC6, 0xA5, 0x73, 0xA5, 0x74, /* 0x7C-0x7F */ + + 0xA5, 0x75, 0xA5, 0x76, 0xA5, 0x77, 0xA5, 0x78, /* 0x80-0x83 */ + 0xC2, 0xC7, 0xA5, 0x79, 0xA5, 0x7A, 0xA5, 0x81, /* 0x84-0x87 */ + 0xA5, 0x82, 0xA5, 0x83, 0xA5, 0x84, 0xA5, 0x85, /* 0x88-0x8B */ + 0xA5, 0x86, 0xA5, 0x87, 0xA5, 0x88, 0xA5, 0x89, /* 0x8C-0x8F */ + 0xA5, 0x8A, 0xA5, 0x8B, 0xA5, 0x8C, 0xA5, 0x8D, /* 0x90-0x93 */ + 0xA5, 0x8E, 0xA5, 0x8F, 0xA5, 0x90, 0xA5, 0x91, /* 0x94-0x97 */ + 0xC2, 0xC8, 0xA5, 0x92, 0xA5, 0x93, 0xA5, 0x94, /* 0x98-0x9B */ + 0xA5, 0x95, 0xA5, 0x96, 0xA5, 0x97, 0xA5, 0x98, /* 0x9C-0x9F */ + 0xA5, 0x99, 0xA5, 0x9A, 0xA5, 0x9B, 0xA5, 0x9C, /* 0xA0-0xA3 */ + 0xA5, 0x9D, 0xA5, 0x9E, 0xA5, 0x9F, 0xA5, 0xA0, /* 0xA4-0xA7 */ + 0xA6, 0x41, 0xA6, 0x42, 0xA6, 0x43, 0xA6, 0x44, /* 0xA8-0xAB */ + 0xA6, 0x45, 0xA6, 0x46, 0xA6, 0x47, 0xA6, 0x48, /* 0xAC-0xAF */ + 0xA6, 0x49, 0xA6, 0x4A, 0xA6, 0x4B, 0xA6, 0x4C, /* 0xB0-0xB3 */ + 0xA6, 0x4D, 0xA6, 0x4E, 0xA6, 0x4F, 0xA6, 0x50, /* 0xB4-0xB7 */ + 0xA6, 0x51, 0xA6, 0x52, 0xA6, 0x53, 0xA6, 0x54, /* 0xB8-0xBB */ + 0xC2, 0xC9, 0xC2, 0xCA, 0xA6, 0x55, 0xA6, 0x56, /* 0xBC-0xBF */ + 0xC2, 0xCB, 0xA6, 0x57, 0xA6, 0x58, 0xA6, 0x59, /* 0xC0-0xC3 */ + 0xC2, 0xCC, 0xA6, 0x5A, 0xA6, 0x61, 0xA6, 0x62, /* 0xC4-0xC7 */ + 0xA6, 0x63, 0xA6, 0x64, 0xA6, 0x65, 0xA6, 0x66, /* 0xC8-0xCB */ + 0xC2, 0xCD, 0xC2, 0xCE, 0xA6, 0x67, 0xC2, 0xCF, /* 0xCC-0xCF */ + 
0xA6, 0x68, 0xC2, 0xD0, 0xA6, 0x69, 0xC2, 0xD1, /* 0xD0-0xD3 */ + 0xA6, 0x6A, 0xA6, 0x6B, 0xA6, 0x6C, 0xA6, 0x6D, /* 0xD4-0xD7 */ + 0xC2, 0xD2, 0xC2, 0xD3, 0xA6, 0x6E, 0xA6, 0x6F, /* 0xD8-0xDB */ + 0xA6, 0x70, 0xA6, 0x71, 0xA6, 0x72, 0xA6, 0x73, /* 0xDC-0xDF */ + 0xC2, 0xD4, 0xA6, 0x74, 0xA6, 0x75, 0xA6, 0x76, /* 0xE0-0xE3 */ + 0xA6, 0x77, 0xA6, 0x78, 0xA6, 0x79, 0xA6, 0x7A, /* 0xE4-0xE7 */ + 0xA6, 0x81, 0xA6, 0x82, 0xA6, 0x83, 0xA6, 0x84, /* 0xE8-0xEB */ + 0xC2, 0xD5, 0xA6, 0x85, 0xA6, 0x86, 0xA6, 0x87, /* 0xEC-0xEF */ + 0xA6, 0x88, 0xA6, 0x89, 0xA6, 0x8A, 0xA6, 0x8B, /* 0xF0-0xF3 */ + 0xC2, 0xD6, 0xA6, 0x8C, 0xA6, 0x8D, 0xA6, 0x8E, /* 0xF4-0xF7 */ + 0xA6, 0x8F, 0xA6, 0x90, 0xA6, 0x91, 0xA6, 0x92, /* 0xF8-0xFB */ + 0xA6, 0x93, 0xA6, 0x94, 0xA6, 0x95, 0xA6, 0x96, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CB[512] = { + 0xA6, 0x97, 0xA6, 0x98, 0xA6, 0x99, 0xA6, 0x9A, /* 0x00-0x03 */ + 0xA6, 0x9B, 0xA6, 0x9C, 0xA6, 0x9D, 0xA6, 0x9E, /* 0x04-0x07 */ + 0xC2, 0xD7, 0xA6, 0x9F, 0xA6, 0xA0, 0xA7, 0x41, /* 0x08-0x0B */ + 0xA7, 0x42, 0xA7, 0x43, 0xA7, 0x44, 0xA7, 0x45, /* 0x0C-0x0F */ + 0xC2, 0xD8, 0xA7, 0x46, 0xA7, 0x47, 0xA7, 0x48, /* 0x10-0x13 */ + 0xC2, 0xD9, 0xA7, 0x49, 0xA7, 0x4A, 0xA7, 0x4B, /* 0x14-0x17 */ + 0xC2, 0xDA, 0xA7, 0x4C, 0xA7, 0x4D, 0xA7, 0x4E, /* 0x18-0x1B */ + 0xA7, 0x4F, 0xA7, 0x50, 0xA7, 0x51, 0xA7, 0x52, /* 0x1C-0x1F */ + 0xC2, 0xDB, 0xC2, 0xDC, 0xA7, 0x53, 0xA7, 0x54, /* 0x20-0x23 */ + 0xA7, 0x55, 0xA7, 0x56, 0xA7, 0x57, 0xA7, 0x58, /* 0x24-0x27 */ + 0xA7, 0x59, 0xA7, 0x5A, 0xA7, 0x61, 0xA7, 0x62, /* 0x28-0x2B */ + 0xA7, 0x63, 0xA7, 0x64, 0xA7, 0x65, 0xA7, 0x66, /* 0x2C-0x2F */ + 0xA7, 0x67, 0xA7, 0x68, 0xA7, 0x69, 0xA7, 0x6A, /* 0x30-0x33 */ + 0xA7, 0x6B, 0xA7, 0x6C, 0xA7, 0x6D, 0xA7, 0x6E, /* 0x34-0x37 */ + 0xA7, 0x6F, 0xA7, 0x70, 0xA7, 0x71, 0xA7, 0x72, /* 0x38-0x3B */ + 0xA7, 0x73, 0xA7, 0x74, 0xA7, 0x75, 0xA7, 0x76, /* 0x3C-0x3F */ + 0xA7, 0x77, 0xC2, 0xDD, 0xA7, 0x78, 0xA7, 0x79, /* 0x40-0x43 */ + 0xA7, 0x7A, 0xA7, 0x81, 0xA7, 0x82, 0xA7, 0x83, /* 0x44-0x47 */ + 0xC2, 0xDE, 0xC2, 0xDF, 0xA7, 0x84, 0xA7, 0x85, /* 0x48-0x4B */ + 0xC2, 0xE0, 0xA7, 0x86, 0xA7, 0x87, 0xA7, 0x88, /* 0x4C-0x4F */ + 0xC2, 0xE1, 0xA7, 0x89, 0xA7, 0x8A, 0xA7, 0x8B, /* 0x50-0x53 */ + 0xA7, 0x8C, 0xA7, 0x8D, 0xA7, 0x8E, 0xA7, 0x8F, /* 0x54-0x57 */ + 0xC2, 0xE2, 0xC2, 0xE3, 0xA7, 0x90, 0xA7, 0x91, /* 0x58-0x5B */ + 0xA7, 0x92, 0xC2, 0xE4, 0xA7, 0x93, 0xA7, 0x94, /* 0x5C-0x5F */ + 0xA7, 0x95, 0xA7, 0x96, 0xA7, 0x97, 0xA7, 0x98, /* 0x60-0x63 */ + 0xC2, 0xE5, 0xA7, 0x99, 0xA7, 0x9A, 0xA7, 0x9B, /* 0x64-0x67 */ + 0xA7, 0x9C, 0xA7, 0x9D, 0xA7, 0x9E, 0xA7, 0x9F, /* 0x68-0x6B */ + 0xA7, 0xA0, 0xA8, 0x41, 0xA8, 0x42, 0xA8, 0x43, /* 0x6C-0x6F */ + 0xA8, 0x44, 0xA8, 0x45, 0xA8, 0x46, 0xA8, 0x47, /* 0x70-0x73 */ + 0xA8, 0x48, 0xA8, 0x49, 0xA8, 0x4A, 0xA8, 0x4B, /* 0x74-0x77 */ + 0xC2, 0xE6, 0xC2, 0xE7, 0xA8, 0x4C, 0xA8, 0x4D, /* 0x78-0x7B */ + 0xA8, 0x4E, 0xA8, 0x4F, 0xA8, 0x50, 0xA8, 0x51, /* 0x7C-0x7F */ + + 0xA8, 0x52, 0xA8, 0x53, 0xA8, 0x54, 0xA8, 0x55, /* 0x80-0x83 */ + 0xA8, 0x56, 0xA8, 0x57, 0xA8, 0x58, 0xA8, 0x59, /* 0x84-0x87 */ + 0xA8, 0x5A, 0xA8, 0x61, 0xA8, 0x62, 0xA8, 0x63, /* 0x88-0x8B */ + 0xA8, 0x64, 0xA8, 0x65, 0xA8, 0x66, 0xA8, 0x67, /* 0x8C-0x8F */ + 0xA8, 0x68, 0xA8, 0x69, 0xA8, 0x6A, 0xA8, 0x6B, /* 0x90-0x93 */ + 0xA8, 0x6C, 0xA8, 0x6D, 0xA8, 0x6E, 0xA8, 0x6F, /* 0x94-0x97 */ + 0xA8, 0x70, 0xA8, 0x71, 0xA8, 0x72, 0xA8, 0x73, /* 0x98-0x9B */ + 0xC2, 0xE8, 0xA8, 0x74, 0xA8, 0x75, 0xA8, 0x76, /* 0x9C-0x9F */ + 0xA8, 0x77, 0xA8, 0x78, 0xA8, 0x79, 0xA8, 0x7A, /* 0xA0-0xA3 */ + 
0xA8, 0x81, 0xA8, 0x82, 0xA8, 0x83, 0xA8, 0x84, /* 0xA4-0xA7 */ + 0xA8, 0x85, 0xA8, 0x86, 0xA8, 0x87, 0xA8, 0x88, /* 0xA8-0xAB */ + 0xA8, 0x89, 0xA8, 0x8A, 0xA8, 0x8B, 0xA8, 0x8C, /* 0xAC-0xAF */ + 0xA8, 0x8D, 0xA8, 0x8E, 0xA8, 0x8F, 0xA8, 0x90, /* 0xB0-0xB3 */ + 0xA8, 0x91, 0xA8, 0x92, 0xA8, 0x93, 0xA8, 0x94, /* 0xB4-0xB7 */ + 0xC2, 0xE9, 0xA8, 0x95, 0xA8, 0x96, 0xA8, 0x97, /* 0xB8-0xBB */ + 0xA8, 0x98, 0xA8, 0x99, 0xA8, 0x9A, 0xA8, 0x9B, /* 0xBC-0xBF */ + 0xA8, 0x9C, 0xA8, 0x9D, 0xA8, 0x9E, 0xA8, 0x9F, /* 0xC0-0xC3 */ + 0xA8, 0xA0, 0xA9, 0x41, 0xA9, 0x42, 0xA9, 0x43, /* 0xC4-0xC7 */ + 0xA9, 0x44, 0xA9, 0x45, 0xA9, 0x46, 0xA9, 0x47, /* 0xC8-0xCB */ + 0xA9, 0x48, 0xA9, 0x49, 0xA9, 0x4A, 0xA9, 0x4B, /* 0xCC-0xCF */ + 0xA9, 0x4C, 0xA9, 0x4D, 0xA9, 0x4E, 0xA9, 0x4F, /* 0xD0-0xD3 */ + 0xC2, 0xEA, 0xA9, 0x50, 0xA9, 0x51, 0xA9, 0x52, /* 0xD4-0xD7 */ + 0xA9, 0x53, 0xA9, 0x54, 0xA9, 0x55, 0xA9, 0x56, /* 0xD8-0xDB */ + 0xA9, 0x57, 0xA9, 0x58, 0xA9, 0x59, 0xA9, 0x5A, /* 0xDC-0xDF */ + 0xA9, 0x61, 0xA9, 0x62, 0xA9, 0x63, 0xA9, 0x64, /* 0xE0-0xE3 */ + 0xC2, 0xEB, 0xA9, 0x65, 0xA9, 0x66, 0xC2, 0xEC, /* 0xE4-0xE7 */ + 0xA9, 0x67, 0xC2, 0xED, 0xA9, 0x68, 0xA9, 0x69, /* 0xE8-0xEB */ + 0xA9, 0x6A, 0xA9, 0x6B, 0xA9, 0x6C, 0xA9, 0x6D, /* 0xEC-0xEF */ + 0xA9, 0x6E, 0xA9, 0x6F, 0xA9, 0x70, 0xA9, 0x71, /* 0xF0-0xF3 */ + 0xA9, 0x72, 0xA9, 0x73, 0xA9, 0x74, 0xA9, 0x75, /* 0xF4-0xF7 */ + 0xA9, 0x76, 0xA9, 0x77, 0xA9, 0x78, 0xA9, 0x79, /* 0xF8-0xFB */ + 0xA9, 0x7A, 0xA9, 0x81, 0xA9, 0x82, 0xA9, 0x83, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CC[512] = { + 0xA9, 0x84, 0xA9, 0x85, 0xA9, 0x86, 0xA9, 0x87, /* 0x00-0x03 */ + 0xA9, 0x88, 0xA9, 0x89, 0xA9, 0x8A, 0xA9, 0x8B, /* 0x04-0x07 */ + 0xA9, 0x8C, 0xA9, 0x8D, 0xA9, 0x8E, 0xA9, 0x8F, /* 0x08-0x0B */ + 0xC2, 0xEE, 0xC2, 0xEF, 0xA9, 0x90, 0xA9, 0x91, /* 0x0C-0x0F */ + 0xC2, 0xF0, 0xA9, 0x92, 0xA9, 0x93, 0xA9, 0x94, /* 0x10-0x13 */ + 0xC2, 0xF1, 0xA9, 0x95, 0xA9, 0x96, 0xA9, 0x97, /* 0x14-0x17 */ + 0xA9, 0x98, 0xA9, 0x99, 0xA9, 0x9A, 0xA9, 0x9B, /* 0x18-0x1B */ + 0xC2, 0xF2, 0xC2, 0xF3, 0xA9, 0x9C, 0xA9, 0x9D, /* 0x1C-0x1F */ + 0xA9, 0x9E, 0xC2, 0xF4, 0xC2, 0xF5, 0xA9, 0x9F, /* 0x20-0x23 */ + 0xA9, 0xA0, 0xAA, 0x41, 0xAA, 0x42, 0xC2, 0xF6, /* 0x24-0x27 */ + 0xC2, 0xF7, 0xC2, 0xF8, 0xAA, 0x43, 0xAA, 0x44, /* 0x28-0x2B */ + 0xC2, 0xF9, 0xAA, 0x45, 0xC2, 0xFA, 0xAA, 0x46, /* 0x2C-0x2F */ + 0xC2, 0xFB, 0xAA, 0x47, 0xAA, 0x48, 0xAA, 0x49, /* 0x30-0x33 */ + 0xAA, 0x4A, 0xAA, 0x4B, 0xAA, 0x4C, 0xAA, 0x4D, /* 0x34-0x37 */ + 0xC2, 0xFC, 0xC2, 0xFD, 0xAA, 0x4E, 0xC2, 0xFE, /* 0x38-0x3B */ + 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, 0xAA, 0x4F, /* 0x3C-0x3F */ + 0xAA, 0x50, 0xAA, 0x51, 0xAA, 0x52, 0xAA, 0x53, /* 0x40-0x43 */ + 0xC3, 0xA4, 0xC3, 0xA5, 0xAA, 0x54, 0xAA, 0x55, /* 0x44-0x47 */ + 0xC3, 0xA6, 0xAA, 0x56, 0xAA, 0x57, 0xAA, 0x58, /* 0x48-0x4B */ + 0xC3, 0xA7, 0xAA, 0x59, 0xAA, 0x5A, 0xAA, 0x61, /* 0x4C-0x4F */ + 0xAA, 0x62, 0xAA, 0x63, 0xAA, 0x64, 0xAA, 0x65, /* 0x50-0x53 */ + 0xC3, 0xA8, 0xC3, 0xA9, 0xAA, 0x66, 0xC3, 0xAA, /* 0x54-0x57 */ + 0xC3, 0xAB, 0xC3, 0xAC, 0xAA, 0x67, 0xAA, 0x68, /* 0x58-0x5B */ + 0xAA, 0x69, 0xAA, 0x6A, 0xAA, 0x6B, 0xAA, 0x6C, /* 0x5C-0x5F */ + 0xC3, 0xAD, 0xAA, 0x6D, 0xAA, 0x6E, 0xAA, 0x6F, /* 0x60-0x63 */ + 0xC3, 0xAE, 0xAA, 0x70, 0xC3, 0xAF, 0xAA, 0x71, /* 0x64-0x67 */ + 0xC3, 0xB0, 0xAA, 0x72, 0xAA, 0x73, 0xAA, 0x74, /* 0x68-0x6B */ + 0xAA, 0x75, 0xAA, 0x76, 0xAA, 0x77, 0xAA, 0x78, /* 0x6C-0x6F */ + 0xC3, 0xB1, 0xAA, 0x79, 0xAA, 0x7A, 0xAA, 0x81, /* 0x70-0x73 */ + 0xAA, 0x82, 0xC3, 0xB2, 0xAA, 0x83, 0xAA, 0x84, /* 0x74-0x77 */ + 0xAA, 
0x85, 0xAA, 0x86, 0xAA, 0x87, 0xAA, 0x88, /* 0x78-0x7B */ + 0xAA, 0x89, 0xAA, 0x8A, 0xAA, 0x8B, 0xAA, 0x8C, /* 0x7C-0x7F */ + + 0xAA, 0x8D, 0xAA, 0x8E, 0xAA, 0x8F, 0xAA, 0x90, /* 0x80-0x83 */ + 0xAA, 0x91, 0xAA, 0x92, 0xAA, 0x93, 0xAA, 0x94, /* 0x84-0x87 */ + 0xAA, 0x95, 0xAA, 0x96, 0xAA, 0x97, 0xAA, 0x98, /* 0x88-0x8B */ + 0xAA, 0x99, 0xAA, 0x9A, 0xAA, 0x9B, 0xAA, 0x9C, /* 0x8C-0x8F */ + 0xAA, 0x9D, 0xAA, 0x9E, 0xAA, 0x9F, 0xAA, 0xA0, /* 0x90-0x93 */ + 0xAB, 0x41, 0xAB, 0x42, 0xAB, 0x43, 0xAB, 0x44, /* 0x94-0x97 */ + 0xC3, 0xB3, 0xC3, 0xB4, 0xAB, 0x45, 0xAB, 0x46, /* 0x98-0x9B */ + 0xC3, 0xB5, 0xAB, 0x47, 0xAB, 0x48, 0xAB, 0x49, /* 0x9C-0x9F */ + 0xC3, 0xB6, 0xAB, 0x4A, 0xAB, 0x4B, 0xAB, 0x4C, /* 0xA0-0xA3 */ + 0xAB, 0x4D, 0xAB, 0x4E, 0xAB, 0x4F, 0xAB, 0x50, /* 0xA4-0xA7 */ + 0xC3, 0xB7, 0xC3, 0xB8, 0xAB, 0x51, 0xC3, 0xB9, /* 0xA8-0xAB */ + 0xC3, 0xBA, 0xC3, 0xBB, 0xAB, 0x52, 0xAB, 0x53, /* 0xAC-0xAF */ + 0xAB, 0x54, 0xAB, 0x55, 0xAB, 0x56, 0xAB, 0x57, /* 0xB0-0xB3 */ + 0xC3, 0xBC, 0xC3, 0xBD, 0xAB, 0x58, 0xAB, 0x59, /* 0xB4-0xB7 */ + 0xC3, 0xBE, 0xAB, 0x5A, 0xAB, 0x61, 0xAB, 0x62, /* 0xB8-0xBB */ + 0xC3, 0xBF, 0xAB, 0x63, 0xAB, 0x64, 0xAB, 0x65, /* 0xBC-0xBF */ + 0xAB, 0x66, 0xAB, 0x67, 0xAB, 0x68, 0xAB, 0x69, /* 0xC0-0xC3 */ + 0xC3, 0xC0, 0xC3, 0xC1, 0xAB, 0x6A, 0xC3, 0xC2, /* 0xC4-0xC7 */ + 0xAB, 0x6B, 0xC3, 0xC3, 0xAB, 0x6C, 0xAB, 0x6D, /* 0xC8-0xCB */ + 0xAB, 0x6E, 0xAB, 0x6F, 0xAB, 0x70, 0xAB, 0x71, /* 0xCC-0xCF */ + 0xC3, 0xC4, 0xAB, 0x72, 0xAB, 0x73, 0xAB, 0x74, /* 0xD0-0xD3 */ + 0xC3, 0xC5, 0xAB, 0x75, 0xAB, 0x76, 0xAB, 0x77, /* 0xD4-0xD7 */ + 0xAB, 0x78, 0xAB, 0x79, 0xAB, 0x7A, 0xAB, 0x81, /* 0xD8-0xDB */ + 0xAB, 0x82, 0xAB, 0x83, 0xAB, 0x84, 0xAB, 0x85, /* 0xDC-0xDF */ + 0xAB, 0x86, 0xAB, 0x87, 0xAB, 0x88, 0xAB, 0x89, /* 0xE0-0xE3 */ + 0xC3, 0xC6, 0xAB, 0x8A, 0xAB, 0x8B, 0xAB, 0x8C, /* 0xE4-0xE7 */ + 0xAB, 0x8D, 0xAB, 0x8E, 0xAB, 0x8F, 0xAB, 0x90, /* 0xE8-0xEB */ + 0xC3, 0xC7, 0xAB, 0x91, 0xAB, 0x92, 0xAB, 0x93, /* 0xEC-0xEF */ + 0xC3, 0xC8, 0xAB, 0x94, 0xAB, 0x95, 0xAB, 0x96, /* 0xF0-0xF3 */ + 0xAB, 0x97, 0xAB, 0x98, 0xAB, 0x99, 0xAB, 0x9A, /* 0xF4-0xF7 */ + 0xAB, 0x9B, 0xAB, 0x9C, 0xAB, 0x9D, 0xAB, 0x9E, /* 0xF8-0xFB */ + 0xAB, 0x9F, 0xAB, 0xA0, 0xAC, 0x41, 0xAC, 0x42, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CD[512] = { + 0xAC, 0x43, 0xC3, 0xC9, 0xAC, 0x44, 0xAC, 0x45, /* 0x00-0x03 */ + 0xAC, 0x46, 0xAC, 0x47, 0xAC, 0x48, 0xAC, 0x49, /* 0x04-0x07 */ + 0xC3, 0xCA, 0xC3, 0xCB, 0xAC, 0x4A, 0xAC, 0x4B, /* 0x08-0x0B */ + 0xC3, 0xCC, 0xAC, 0x4C, 0xAC, 0x4D, 0xAC, 0x4E, /* 0x0C-0x0F */ + 0xC3, 0xCD, 0xAC, 0x4F, 0xAC, 0x50, 0xAC, 0x51, /* 0x10-0x13 */ + 0xAC, 0x52, 0xAC, 0x53, 0xAC, 0x54, 0xAC, 0x55, /* 0x14-0x17 */ + 0xC3, 0xCE, 0xC3, 0xCF, 0xAC, 0x56, 0xC3, 0xD0, /* 0x18-0x1B */ + 0xAC, 0x57, 0xC3, 0xD1, 0xAC, 0x58, 0xAC, 0x59, /* 0x1C-0x1F */ + 0xAC, 0x5A, 0xAC, 0x61, 0xAC, 0x62, 0xAC, 0x63, /* 0x20-0x23 */ + 0xC3, 0xD2, 0xAC, 0x64, 0xAC, 0x65, 0xAC, 0x66, /* 0x24-0x27 */ + 0xC3, 0xD3, 0xAC, 0x67, 0xAC, 0x68, 0xAC, 0x69, /* 0x28-0x2B */ + 0xC3, 0xD4, 0xAC, 0x6A, 0xAC, 0x6B, 0xAC, 0x6C, /* 0x2C-0x2F */ + 0xAC, 0x6D, 0xAC, 0x6E, 0xAC, 0x6F, 0xAC, 0x70, /* 0x30-0x33 */ + 0xAC, 0x71, 0xAC, 0x72, 0xAC, 0x73, 0xAC, 0x74, /* 0x34-0x37 */ + 0xAC, 0x75, 0xC3, 0xD5, 0xAC, 0x76, 0xAC, 0x77, /* 0x38-0x3B */ + 0xAC, 0x78, 0xAC, 0x79, 0xAC, 0x7A, 0xAC, 0x81, /* 0x3C-0x3F */ + 0xAC, 0x82, 0xAC, 0x83, 0xAC, 0x84, 0xAC, 0x85, /* 0x40-0x43 */ + 0xAC, 0x86, 0xAC, 0x87, 0xAC, 0x88, 0xAC, 0x89, /* 0x44-0x47 */ + 0xAC, 0x8A, 0xAC, 0x8B, 0xAC, 0x8C, 0xAC, 0x8D, /* 0x48-0x4B */ + 0xAC, 
0x8E, 0xAC, 0x8F, 0xAC, 0x90, 0xAC, 0x91, /* 0x4C-0x4F */ + 0xAC, 0x92, 0xAC, 0x93, 0xAC, 0x94, 0xAC, 0x95, /* 0x50-0x53 */ + 0xAC, 0x96, 0xAC, 0x97, 0xAC, 0x98, 0xAC, 0x99, /* 0x54-0x57 */ + 0xAC, 0x9A, 0xAC, 0x9B, 0xAC, 0x9C, 0xAC, 0x9D, /* 0x58-0x5B */ + 0xC3, 0xD6, 0xAC, 0x9E, 0xAC, 0x9F, 0xAC, 0xA0, /* 0x5C-0x5F */ + 0xC3, 0xD7, 0xAD, 0x41, 0xAD, 0x42, 0xAD, 0x43, /* 0x60-0x63 */ + 0xC3, 0xD8, 0xAD, 0x44, 0xAD, 0x45, 0xAD, 0x46, /* 0x64-0x67 */ + 0xAD, 0x47, 0xAD, 0x48, 0xAD, 0x49, 0xAD, 0x4A, /* 0x68-0x6B */ + 0xC3, 0xD9, 0xC3, 0xDA, 0xAD, 0x4B, 0xC3, 0xDB, /* 0x6C-0x6F */ + 0xAD, 0x4C, 0xC3, 0xDC, 0xAD, 0x4D, 0xAD, 0x4E, /* 0x70-0x73 */ + 0xAD, 0x4F, 0xAD, 0x50, 0xAD, 0x51, 0xAD, 0x52, /* 0x74-0x77 */ + 0xC3, 0xDD, 0xAD, 0x53, 0xAD, 0x54, 0xAD, 0x55, /* 0x78-0x7B */ + 0xAD, 0x56, 0xAD, 0x57, 0xAD, 0x58, 0xAD, 0x59, /* 0x7C-0x7F */ + + 0xAD, 0x5A, 0xAD, 0x61, 0xAD, 0x62, 0xAD, 0x63, /* 0x80-0x83 */ + 0xAD, 0x64, 0xAD, 0x65, 0xAD, 0x66, 0xAD, 0x67, /* 0x84-0x87 */ + 0xC3, 0xDE, 0xAD, 0x68, 0xAD, 0x69, 0xAD, 0x6A, /* 0x88-0x8B */ + 0xAD, 0x6B, 0xAD, 0x6C, 0xAD, 0x6D, 0xAD, 0x6E, /* 0x8C-0x8F */ + 0xAD, 0x6F, 0xAD, 0x70, 0xAD, 0x71, 0xAD, 0x72, /* 0x90-0x93 */ + 0xC3, 0xDF, 0xC3, 0xE0, 0xAD, 0x73, 0xAD, 0x74, /* 0x94-0x97 */ + 0xC3, 0xE1, 0xAD, 0x75, 0xAD, 0x76, 0xAD, 0x77, /* 0x98-0x9B */ + 0xC3, 0xE2, 0xAD, 0x78, 0xAD, 0x79, 0xAD, 0x7A, /* 0x9C-0x9F */ + 0xAD, 0x81, 0xAD, 0x82, 0xAD, 0x83, 0xAD, 0x84, /* 0xA0-0xA3 */ + 0xC3, 0xE3, 0xC3, 0xE4, 0xAD, 0x85, 0xC3, 0xE5, /* 0xA4-0xA7 */ + 0xAD, 0x86, 0xC3, 0xE6, 0xAD, 0x87, 0xAD, 0x88, /* 0xA8-0xAB */ + 0xAD, 0x89, 0xAD, 0x8A, 0xAD, 0x8B, 0xAD, 0x8C, /* 0xAC-0xAF */ + 0xC3, 0xE7, 0xAD, 0x8D, 0xAD, 0x8E, 0xAD, 0x8F, /* 0xB0-0xB3 */ + 0xAD, 0x90, 0xAD, 0x91, 0xAD, 0x92, 0xAD, 0x93, /* 0xB4-0xB7 */ + 0xAD, 0x94, 0xAD, 0x95, 0xAD, 0x96, 0xAD, 0x97, /* 0xB8-0xBB */ + 0xAD, 0x98, 0xAD, 0x99, 0xAD, 0x9A, 0xAD, 0x9B, /* 0xBC-0xBF */ + 0xAD, 0x9C, 0xAD, 0x9D, 0xAD, 0x9E, 0xAD, 0x9F, /* 0xC0-0xC3 */ + 0xC3, 0xE8, 0xAD, 0xA0, 0xAE, 0x41, 0xAE, 0x42, /* 0xC4-0xC7 */ + 0xAE, 0x43, 0xAE, 0x44, 0xAE, 0x45, 0xAE, 0x46, /* 0xC8-0xCB */ + 0xC3, 0xE9, 0xAE, 0x47, 0xAE, 0x48, 0xAE, 0x49, /* 0xCC-0xCF */ + 0xC3, 0xEA, 0xAE, 0x4A, 0xAE, 0x4B, 0xAE, 0x4C, /* 0xD0-0xD3 */ + 0xAE, 0x4D, 0xAE, 0x4E, 0xAE, 0x4F, 0xAE, 0x50, /* 0xD4-0xD7 */ + 0xAE, 0x51, 0xAE, 0x52, 0xAE, 0x53, 0xAE, 0x54, /* 0xD8-0xDB */ + 0xAE, 0x55, 0xAE, 0x56, 0xAE, 0x57, 0xAE, 0x58, /* 0xDC-0xDF */ + 0xAE, 0x59, 0xAE, 0x5A, 0xAE, 0x61, 0xAE, 0x62, /* 0xE0-0xE3 */ + 0xAE, 0x63, 0xAE, 0x64, 0xAE, 0x65, 0xAE, 0x66, /* 0xE4-0xE7 */ + 0xC3, 0xEB, 0xAE, 0x67, 0xAE, 0x68, 0xAE, 0x69, /* 0xE8-0xEB */ + 0xC3, 0xEC, 0xAE, 0x6A, 0xAE, 0x6B, 0xAE, 0x6C, /* 0xEC-0xEF */ + 0xC3, 0xED, 0xAE, 0x6D, 0xAE, 0x6E, 0xAE, 0x6F, /* 0xF0-0xF3 */ + 0xAE, 0x70, 0xAE, 0x71, 0xAE, 0x72, 0xAE, 0x73, /* 0xF4-0xF7 */ + 0xC3, 0xEE, 0xC3, 0xEF, 0xAE, 0x74, 0xC3, 0xF0, /* 0xF8-0xFB */ + 0xAE, 0x75, 0xC3, 0xF1, 0xAE, 0x76, 0xAE, 0x77, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CE[512] = { + 0xAE, 0x78, 0xAE, 0x79, 0xAE, 0x7A, 0xAE, 0x81, /* 0x00-0x03 */ + 0xC3, 0xF2, 0xAE, 0x82, 0xAE, 0x83, 0xAE, 0x84, /* 0x04-0x07 */ + 0xC3, 0xF3, 0xAE, 0x85, 0xAE, 0x86, 0xAE, 0x87, /* 0x08-0x0B */ + 0xC3, 0xF4, 0xAE, 0x88, 0xAE, 0x89, 0xAE, 0x8A, /* 0x0C-0x0F */ + 0xAE, 0x8B, 0xAE, 0x8C, 0xAE, 0x8D, 0xAE, 0x8E, /* 0x10-0x13 */ + 0xC3, 0xF5, 0xAE, 0x8F, 0xAE, 0x90, 0xAE, 0x91, /* 0x14-0x17 */ + 0xAE, 0x92, 0xC3, 0xF6, 0xAE, 0x93, 0xAE, 0x94, /* 0x18-0x1B */ + 0xAE, 0x95, 0xAE, 0x96, 0xAE, 0x97, 0xAE, 0x98, /* 0x1C-0x1F */ + 0xC3, 
0xF7, 0xC3, 0xF8, 0xAE, 0x99, 0xAE, 0x9A, /* 0x20-0x23 */ + 0xC3, 0xF9, 0xAE, 0x9B, 0xAE, 0x9C, 0xAE, 0x9D, /* 0x24-0x27 */ + 0xC3, 0xFA, 0xAE, 0x9E, 0xAE, 0x9F, 0xAE, 0xA0, /* 0x28-0x2B */ + 0xAF, 0x41, 0xAF, 0x42, 0xAF, 0x43, 0xAF, 0x44, /* 0x2C-0x2F */ + 0xC3, 0xFB, 0xC3, 0xFC, 0xAF, 0x45, 0xC3, 0xFD, /* 0x30-0x33 */ + 0xAF, 0x46, 0xC3, 0xFE, 0xAF, 0x47, 0xAF, 0x48, /* 0x34-0x37 */ + 0xAF, 0x49, 0xAF, 0x4A, 0xAF, 0x4B, 0xAF, 0x4C, /* 0x38-0x3B */ + 0xAF, 0x4D, 0xAF, 0x4E, 0xAF, 0x4F, 0xAF, 0x50, /* 0x3C-0x3F */ + 0xAF, 0x51, 0xAF, 0x52, 0xAF, 0x53, 0xAF, 0x54, /* 0x40-0x43 */ + 0xAF, 0x55, 0xAF, 0x56, 0xAF, 0x57, 0xAF, 0x58, /* 0x44-0x47 */ + 0xAF, 0x59, 0xAF, 0x5A, 0xAF, 0x61, 0xAF, 0x62, /* 0x48-0x4B */ + 0xAF, 0x63, 0xAF, 0x64, 0xAF, 0x65, 0xAF, 0x66, /* 0x4C-0x4F */ + 0xAF, 0x67, 0xAF, 0x68, 0xAF, 0x69, 0xAF, 0x6A, /* 0x50-0x53 */ + 0xAF, 0x6B, 0xAF, 0x6C, 0xAF, 0x6D, 0xAF, 0x6E, /* 0x54-0x57 */ + 0xC4, 0xA1, 0xC4, 0xA2, 0xAF, 0x6F, 0xAF, 0x70, /* 0x58-0x5B */ + 0xC4, 0xA3, 0xAF, 0x71, 0xAF, 0x72, 0xC4, 0xA4, /* 0x5C-0x5F */ + 0xC4, 0xA5, 0xC4, 0xA6, 0xAF, 0x73, 0xAF, 0x74, /* 0x60-0x63 */ + 0xAF, 0x75, 0xAF, 0x76, 0xAF, 0x77, 0xAF, 0x78, /* 0x64-0x67 */ + 0xC4, 0xA7, 0xC4, 0xA8, 0xAF, 0x79, 0xC4, 0xA9, /* 0x68-0x6B */ + 0xAF, 0x7A, 0xC4, 0xAA, 0xAF, 0x81, 0xAF, 0x82, /* 0x6C-0x6F */ + 0xAF, 0x83, 0xAF, 0x84, 0xAF, 0x85, 0xAF, 0x86, /* 0x70-0x73 */ + 0xC4, 0xAB, 0xC4, 0xAC, 0xAF, 0x87, 0xAF, 0x88, /* 0x74-0x77 */ + 0xC4, 0xAD, 0xAF, 0x89, 0xAF, 0x8A, 0xAF, 0x8B, /* 0x78-0x7B */ + 0xC4, 0xAE, 0xAF, 0x8C, 0xAF, 0x8D, 0xAF, 0x8E, /* 0x7C-0x7F */ + + 0xAF, 0x8F, 0xAF, 0x90, 0xAF, 0x91, 0xAF, 0x92, /* 0x80-0x83 */ + 0xC4, 0xAF, 0xC4, 0xB0, 0xAF, 0x93, 0xC4, 0xB1, /* 0x84-0x87 */ + 0xAF, 0x94, 0xC4, 0xB2, 0xAF, 0x95, 0xAF, 0x96, /* 0x88-0x8B */ + 0xAF, 0x97, 0xAF, 0x98, 0xAF, 0x99, 0xAF, 0x9A, /* 0x8C-0x8F */ + 0xC4, 0xB3, 0xC4, 0xB4, 0xAF, 0x9B, 0xAF, 0x9C, /* 0x90-0x93 */ + 0xC4, 0xB5, 0xAF, 0x9D, 0xAF, 0x9E, 0xAF, 0x9F, /* 0x94-0x97 */ + 0xC4, 0xB6, 0xAF, 0xA0, 0xB0, 0x41, 0xB0, 0x42, /* 0x98-0x9B */ + 0xB0, 0x43, 0xB0, 0x44, 0xB0, 0x45, 0xB0, 0x46, /* 0x9C-0x9F */ + 0xC4, 0xB7, 0xC4, 0xB8, 0xB0, 0x47, 0xC4, 0xB9, /* 0xA0-0xA3 */ + 0xC4, 0xBA, 0xC4, 0xBB, 0xB0, 0x48, 0xB0, 0x49, /* 0xA4-0xA7 */ + 0xB0, 0x4A, 0xB0, 0x4B, 0xB0, 0x4C, 0xB0, 0x4D, /* 0xA8-0xAB */ + 0xC4, 0xBC, 0xC4, 0xBD, 0xB0, 0x4E, 0xB0, 0x4F, /* 0xAC-0xAF */ + 0xB0, 0x50, 0xB0, 0x51, 0xB0, 0x52, 0xB0, 0x53, /* 0xB0-0xB3 */ + 0xB0, 0x54, 0xB0, 0x55, 0xB0, 0x56, 0xB0, 0x57, /* 0xB4-0xB7 */ + 0xB0, 0x58, 0xB0, 0x59, 0xB0, 0x5A, 0xB0, 0x61, /* 0xB8-0xBB */ + 0xB0, 0x62, 0xB0, 0x63, 0xB0, 0x64, 0xB0, 0x65, /* 0xBC-0xBF */ + 0xB0, 0x66, 0xC4, 0xBE, 0xB0, 0x67, 0xB0, 0x68, /* 0xC0-0xC3 */ + 0xB0, 0x69, 0xB0, 0x6A, 0xB0, 0x6B, 0xB0, 0x6C, /* 0xC4-0xC7 */ + 0xB0, 0x6D, 0xB0, 0x6E, 0xB0, 0x6F, 0xB0, 0x70, /* 0xC8-0xCB */ + 0xB0, 0x71, 0xB0, 0x72, 0xB0, 0x73, 0xB0, 0x74, /* 0xCC-0xCF */ + 0xB0, 0x75, 0xB0, 0x76, 0xB0, 0x77, 0xB0, 0x78, /* 0xD0-0xD3 */ + 0xB0, 0x79, 0xB0, 0x7A, 0xB0, 0x81, 0xB0, 0x82, /* 0xD4-0xD7 */ + 0xB0, 0x83, 0xB0, 0x84, 0xB0, 0x85, 0xB0, 0x86, /* 0xD8-0xDB */ + 0xB0, 0x87, 0xB0, 0x88, 0xB0, 0x89, 0xB0, 0x8A, /* 0xDC-0xDF */ + 0xB0, 0x8B, 0xB0, 0x8C, 0xB0, 0x8D, 0xB0, 0x8E, /* 0xE0-0xE3 */ + 0xC4, 0xBF, 0xC4, 0xC0, 0xB0, 0x8F, 0xB0, 0x90, /* 0xE4-0xE7 */ + 0xC4, 0xC1, 0xB0, 0x91, 0xB0, 0x92, 0xC4, 0xC2, /* 0xE8-0xEB */ + 0xC4, 0xC3, 0xB0, 0x93, 0xB0, 0x94, 0xB0, 0x95, /* 0xEC-0xEF */ + 0xB0, 0x96, 0xB0, 0x97, 0xB0, 0x98, 0xB0, 0x99, /* 0xF0-0xF3 */ + 0xC4, 0xC4, 0xC4, 0xC5, 0xB0, 0x9A, 0xC4, 0xC6, /* 0xF4-0xF7 
*/ + 0xC4, 0xC7, 0xC4, 0xC8, 0xB0, 0x9B, 0xB0, 0x9C, /* 0xF8-0xFB */ + 0xB0, 0x9D, 0xB0, 0x9E, 0xB0, 0x9F, 0xB0, 0xA0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_CF[512] = { + 0xC4, 0xC9, 0xC4, 0xCA, 0xB1, 0x41, 0xB1, 0x42, /* 0x00-0x03 */ + 0xC4, 0xCB, 0xB1, 0x43, 0xB1, 0x44, 0xB1, 0x45, /* 0x04-0x07 */ + 0xC4, 0xCC, 0xB1, 0x46, 0xB1, 0x47, 0xB1, 0x48, /* 0x08-0x0B */ + 0xB1, 0x49, 0xB1, 0x4A, 0xB1, 0x4B, 0xB1, 0x4C, /* 0x0C-0x0F */ + 0xC4, 0xCD, 0xC4, 0xCE, 0xB1, 0x4D, 0xC4, 0xCF, /* 0x10-0x13 */ + 0xB1, 0x4E, 0xC4, 0xD0, 0xB1, 0x4F, 0xB1, 0x50, /* 0x14-0x17 */ + 0xB1, 0x51, 0xB1, 0x52, 0xB1, 0x53, 0xB1, 0x54, /* 0x18-0x1B */ + 0xC4, 0xD1, 0xB1, 0x55, 0xB1, 0x56, 0xB1, 0x57, /* 0x1C-0x1F */ + 0xC4, 0xD2, 0xB1, 0x58, 0xB1, 0x59, 0xB1, 0x5A, /* 0x20-0x23 */ + 0xC4, 0xD3, 0xB1, 0x61, 0xB1, 0x62, 0xB1, 0x63, /* 0x24-0x27 */ + 0xB1, 0x64, 0xB1, 0x65, 0xB1, 0x66, 0xB1, 0x67, /* 0x28-0x2B */ + 0xC4, 0xD4, 0xC4, 0xD5, 0xB1, 0x68, 0xC4, 0xD6, /* 0x2C-0x2F */ + 0xC4, 0xD7, 0xC4, 0xD8, 0xB1, 0x69, 0xB1, 0x6A, /* 0x30-0x33 */ + 0xB1, 0x6B, 0xB1, 0x6C, 0xB1, 0x6D, 0xB1, 0x6E, /* 0x34-0x37 */ + 0xC4, 0xD9, 0xB1, 0x6F, 0xB1, 0x70, 0xB1, 0x71, /* 0x38-0x3B */ + 0xB1, 0x72, 0xB1, 0x73, 0xB1, 0x74, 0xB1, 0x75, /* 0x3C-0x3F */ + 0xB1, 0x76, 0xB1, 0x77, 0xB1, 0x78, 0xB1, 0x79, /* 0x40-0x43 */ + 0xB1, 0x7A, 0xB1, 0x81, 0xB1, 0x82, 0xB1, 0x83, /* 0x44-0x47 */ + 0xB1, 0x84, 0xB1, 0x85, 0xB1, 0x86, 0xB1, 0x87, /* 0x48-0x4B */ + 0xB1, 0x88, 0xB1, 0x89, 0xB1, 0x8A, 0xB1, 0x8B, /* 0x4C-0x4F */ + 0xB1, 0x8C, 0xB1, 0x8D, 0xB1, 0x8E, 0xB1, 0x8F, /* 0x50-0x53 */ + 0xC4, 0xDA, 0xC4, 0xDB, 0xB1, 0x90, 0xB1, 0x91, /* 0x54-0x57 */ + 0xC4, 0xDC, 0xB1, 0x92, 0xB1, 0x93, 0xB1, 0x94, /* 0x58-0x5B */ + 0xC4, 0xDD, 0xB1, 0x95, 0xB1, 0x96, 0xB1, 0x97, /* 0x5C-0x5F */ + 0xB1, 0x98, 0xB1, 0x99, 0xB1, 0x9A, 0xB1, 0x9B, /* 0x60-0x63 */ + 0xC4, 0xDE, 0xC4, 0xDF, 0xB1, 0x9C, 0xC4, 0xE0, /* 0x64-0x67 */ + 0xB1, 0x9D, 0xC4, 0xE1, 0xB1, 0x9E, 0xB1, 0x9F, /* 0x68-0x6B */ + 0xB1, 0xA0, 0xB2, 0x41, 0xB2, 0x42, 0xB2, 0x43, /* 0x6C-0x6F */ + 0xC4, 0xE2, 0xC4, 0xE3, 0xB2, 0x44, 0xB2, 0x45, /* 0x70-0x73 */ + 0xC4, 0xE4, 0xB2, 0x46, 0xB2, 0x47, 0xB2, 0x48, /* 0x74-0x77 */ + 0xC4, 0xE5, 0xB2, 0x49, 0xB2, 0x4A, 0xB2, 0x4B, /* 0x78-0x7B */ + 0xB2, 0x4C, 0xB2, 0x4D, 0xB2, 0x4E, 0xB2, 0x4F, /* 0x7C-0x7F */ + + 0xC4, 0xE6, 0xB2, 0x50, 0xB2, 0x51, 0xB2, 0x52, /* 0x80-0x83 */ + 0xB2, 0x53, 0xC4, 0xE7, 0xB2, 0x54, 0xB2, 0x55, /* 0x84-0x87 */ + 0xB2, 0x56, 0xB2, 0x57, 0xB2, 0x58, 0xB2, 0x59, /* 0x88-0x8B */ + 0xC4, 0xE8, 0xB2, 0x5A, 0xB2, 0x61, 0xB2, 0x62, /* 0x8C-0x8F */ + 0xB2, 0x63, 0xB2, 0x64, 0xB2, 0x65, 0xB2, 0x66, /* 0x90-0x93 */ + 0xB2, 0x67, 0xB2, 0x68, 0xB2, 0x69, 0xB2, 0x6A, /* 0x94-0x97 */ + 0xB2, 0x6B, 0xB2, 0x6C, 0xB2, 0x6D, 0xB2, 0x6E, /* 0x98-0x9B */ + 0xB2, 0x6F, 0xB2, 0x70, 0xB2, 0x71, 0xB2, 0x72, /* 0x9C-0x9F */ + 0xB2, 0x73, 0xC4, 0xE9, 0xB2, 0x74, 0xB2, 0x75, /* 0xA0-0xA3 */ + 0xB2, 0x76, 0xB2, 0x77, 0xB2, 0x78, 0xB2, 0x79, /* 0xA4-0xA7 */ + 0xC4, 0xEA, 0xB2, 0x7A, 0xB2, 0x81, 0xB2, 0x82, /* 0xA8-0xAB */ + 0xB2, 0x83, 0xB2, 0x84, 0xB2, 0x85, 0xB2, 0x86, /* 0xAC-0xAF */ + 0xC4, 0xEB, 0xB2, 0x87, 0xB2, 0x88, 0xB2, 0x89, /* 0xB0-0xB3 */ + 0xB2, 0x8A, 0xB2, 0x8B, 0xB2, 0x8C, 0xB2, 0x8D, /* 0xB4-0xB7 */ + 0xB2, 0x8E, 0xB2, 0x8F, 0xB2, 0x90, 0xB2, 0x91, /* 0xB8-0xBB */ + 0xB2, 0x92, 0xB2, 0x93, 0xB2, 0x94, 0xB2, 0x95, /* 0xBC-0xBF */ + 0xB2, 0x96, 0xB2, 0x97, 0xB2, 0x98, 0xB2, 0x99, /* 0xC0-0xC3 */ + 0xC4, 0xEC, 0xB2, 0x9A, 0xB2, 0x9B, 0xB2, 0x9C, /* 0xC4-0xC7 */ + 0xB2, 0x9D, 0xB2, 0x9E, 0xB2, 0x9F, 0xB2, 0xA0, /* 0xC8-0xCB */ + 
0xB3, 0x41, 0xB3, 0x42, 0xB3, 0x43, 0xB3, 0x44, /* 0xCC-0xCF */ + 0xB3, 0x45, 0xB3, 0x46, 0xB3, 0x47, 0xB3, 0x48, /* 0xD0-0xD3 */ + 0xB3, 0x49, 0xB3, 0x4A, 0xB3, 0x4B, 0xB3, 0x4C, /* 0xD4-0xD7 */ + 0xB3, 0x4D, 0xB3, 0x4E, 0xB3, 0x4F, 0xB3, 0x50, /* 0xD8-0xDB */ + 0xB3, 0x51, 0xB3, 0x52, 0xB3, 0x53, 0xB3, 0x54, /* 0xDC-0xDF */ + 0xC4, 0xED, 0xC4, 0xEE, 0xB3, 0x55, 0xB3, 0x56, /* 0xE0-0xE3 */ + 0xC4, 0xEF, 0xB3, 0x57, 0xB3, 0x58, 0xB3, 0x59, /* 0xE4-0xE7 */ + 0xC4, 0xF0, 0xB3, 0x5A, 0xB3, 0x61, 0xB3, 0x62, /* 0xE8-0xEB */ + 0xB3, 0x63, 0xB3, 0x64, 0xB3, 0x65, 0xB3, 0x66, /* 0xEC-0xEF */ + 0xC4, 0xF1, 0xC4, 0xF2, 0xB3, 0x67, 0xC4, 0xF3, /* 0xF0-0xF3 */ + 0xB3, 0x68, 0xC4, 0xF4, 0xB3, 0x69, 0xB3, 0x6A, /* 0xF4-0xF7 */ + 0xB3, 0x6B, 0xB3, 0x6C, 0xB3, 0x6D, 0xB3, 0x6E, /* 0xF8-0xFB */ + 0xC4, 0xF5, 0xB3, 0x6F, 0xB3, 0x70, 0xB3, 0x71, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D0[512] = { + 0xC4, 0xF6, 0xB3, 0x72, 0xB3, 0x73, 0xB3, 0x74, /* 0x00-0x03 */ + 0xC4, 0xF7, 0xB3, 0x75, 0xB3, 0x76, 0xB3, 0x77, /* 0x04-0x07 */ + 0xB3, 0x78, 0xB3, 0x79, 0xB3, 0x7A, 0xB3, 0x81, /* 0x08-0x0B */ + 0xB3, 0x82, 0xB3, 0x83, 0xB3, 0x84, 0xB3, 0x85, /* 0x0C-0x0F */ + 0xB3, 0x86, 0xC4, 0xF8, 0xB3, 0x87, 0xB3, 0x88, /* 0x10-0x13 */ + 0xB3, 0x89, 0xB3, 0x8A, 0xB3, 0x8B, 0xB3, 0x8C, /* 0x14-0x17 */ + 0xC4, 0xF9, 0xB3, 0x8D, 0xB3, 0x8E, 0xB3, 0x8F, /* 0x18-0x1B */ + 0xB3, 0x90, 0xB3, 0x91, 0xB3, 0x92, 0xB3, 0x93, /* 0x1C-0x1F */ + 0xB3, 0x94, 0xB3, 0x95, 0xB3, 0x96, 0xB3, 0x97, /* 0x20-0x23 */ + 0xB3, 0x98, 0xB3, 0x99, 0xB3, 0x9A, 0xB3, 0x9B, /* 0x24-0x27 */ + 0xB3, 0x9C, 0xB3, 0x9D, 0xB3, 0x9E, 0xB3, 0x9F, /* 0x28-0x2B */ + 0xB3, 0xA0, 0xC4, 0xFA, 0xB4, 0x41, 0xB4, 0x42, /* 0x2C-0x2F */ + 0xB4, 0x43, 0xB4, 0x44, 0xB4, 0x45, 0xB4, 0x46, /* 0x30-0x33 */ + 0xC4, 0xFB, 0xC4, 0xFC, 0xB4, 0x47, 0xB4, 0x48, /* 0x34-0x37 */ + 0xC4, 0xFD, 0xB4, 0x49, 0xB4, 0x4A, 0xB4, 0x4B, /* 0x38-0x3B */ + 0xC4, 0xFE, 0xB4, 0x4C, 0xB4, 0x4D, 0xB4, 0x4E, /* 0x3C-0x3F */ + 0xB4, 0x4F, 0xB4, 0x50, 0xB4, 0x51, 0xB4, 0x52, /* 0x40-0x43 */ + 0xC5, 0xA1, 0xC5, 0xA2, 0xB4, 0x53, 0xC5, 0xA3, /* 0x44-0x47 */ + 0xB4, 0x54, 0xC5, 0xA4, 0xB4, 0x55, 0xB4, 0x56, /* 0x48-0x4B */ + 0xB4, 0x57, 0xB4, 0x58, 0xB4, 0x59, 0xB4, 0x5A, /* 0x4C-0x4F */ + 0xC5, 0xA5, 0xB4, 0x61, 0xB4, 0x62, 0xB4, 0x63, /* 0x50-0x53 */ + 0xC5, 0xA6, 0xB4, 0x64, 0xB4, 0x65, 0xB4, 0x66, /* 0x54-0x57 */ + 0xC5, 0xA7, 0xB4, 0x67, 0xB4, 0x68, 0xB4, 0x69, /* 0x58-0x5B */ + 0xB4, 0x6A, 0xB4, 0x6B, 0xB4, 0x6C, 0xB4, 0x6D, /* 0x5C-0x5F */ + 0xC5, 0xA8, 0xB4, 0x6E, 0xB4, 0x6F, 0xB4, 0x70, /* 0x60-0x63 */ + 0xB4, 0x71, 0xB4, 0x72, 0xB4, 0x73, 0xB4, 0x74, /* 0x64-0x67 */ + 0xB4, 0x75, 0xB4, 0x76, 0xB4, 0x77, 0xB4, 0x78, /* 0x68-0x6B */ + 0xC5, 0xA9, 0xC5, 0xAA, 0xB4, 0x79, 0xB4, 0x7A, /* 0x6C-0x6F */ + 0xC5, 0xAB, 0xB4, 0x81, 0xB4, 0x82, 0xB4, 0x83, /* 0x70-0x73 */ + 0xC5, 0xAC, 0xB4, 0x84, 0xB4, 0x85, 0xB4, 0x86, /* 0x74-0x77 */ + 0xB4, 0x87, 0xB4, 0x88, 0xB4, 0x89, 0xB4, 0x8A, /* 0x78-0x7B */ + 0xC5, 0xAD, 0xC5, 0xAE, 0xB4, 0x8B, 0xB4, 0x8C, /* 0x7C-0x7F */ + + 0xB4, 0x8D, 0xC5, 0xAF, 0xB4, 0x8E, 0xB4, 0x8F, /* 0x80-0x83 */ + 0xB4, 0x90, 0xB4, 0x91, 0xB4, 0x92, 0xB4, 0x93, /* 0x84-0x87 */ + 0xB4, 0x94, 0xB4, 0x95, 0xB4, 0x96, 0xB4, 0x97, /* 0x88-0x8B */ + 0xB4, 0x98, 0xB4, 0x99, 0xB4, 0x9A, 0xB4, 0x9B, /* 0x8C-0x8F */ + 0xB4, 0x9C, 0xB4, 0x9D, 0xB4, 0x9E, 0xB4, 0x9F, /* 0x90-0x93 */ + 0xB4, 0xA0, 0xB5, 0x41, 0xB5, 0x42, 0xB5, 0x43, /* 0x94-0x97 */ + 0xB5, 0x44, 0xB5, 0x45, 0xB5, 0x46, 0xB5, 0x47, /* 0x98-0x9B */ + 0xB5, 0x48, 0xB5, 0x49, 0xB5, 0x4A, 0xB5, 0x4B, /* 0x9C-0x9F */ + 
0xB5, 0x4C, 0xB5, 0x4D, 0xB5, 0x4E, 0xB5, 0x4F, /* 0xA0-0xA3 */ + 0xC5, 0xB0, 0xC5, 0xB1, 0xB5, 0x50, 0xB5, 0x51, /* 0xA4-0xA7 */ + 0xC5, 0xB2, 0xB5, 0x52, 0xB5, 0x53, 0xB5, 0x54, /* 0xA8-0xAB */ + 0xC5, 0xB3, 0xB5, 0x55, 0xB5, 0x56, 0xB5, 0x57, /* 0xAC-0xAF */ + 0xB5, 0x58, 0xB5, 0x59, 0xB5, 0x5A, 0xB5, 0x61, /* 0xB0-0xB3 */ + 0xC5, 0xB4, 0xC5, 0xB5, 0xB5, 0x62, 0xC5, 0xB6, /* 0xB4-0xB7 */ + 0xB5, 0x63, 0xC5, 0xB7, 0xB5, 0x64, 0xB5, 0x65, /* 0xB8-0xBB */ + 0xB5, 0x66, 0xB5, 0x67, 0xB5, 0x68, 0xB5, 0x69, /* 0xBC-0xBF */ + 0xC5, 0xB8, 0xC5, 0xB9, 0xB5, 0x6A, 0xB5, 0x6B, /* 0xC0-0xC3 */ + 0xC5, 0xBA, 0xB5, 0x6C, 0xB5, 0x6D, 0xB5, 0x6E, /* 0xC4-0xC7 */ + 0xC5, 0xBB, 0xC5, 0xBC, 0xB5, 0x6F, 0xB5, 0x70, /* 0xC8-0xCB */ + 0xB5, 0x71, 0xB5, 0x72, 0xB5, 0x73, 0xB5, 0x74, /* 0xCC-0xCF */ + 0xC5, 0xBD, 0xC5, 0xBE, 0xB5, 0x75, 0xC5, 0xBF, /* 0xD0-0xD3 */ + 0xC5, 0xC0, 0xC5, 0xC1, 0xB5, 0x76, 0xB5, 0x77, /* 0xD4-0xD7 */ + 0xB5, 0x78, 0xB5, 0x79, 0xB5, 0x7A, 0xB5, 0x81, /* 0xD8-0xDB */ + 0xC5, 0xC2, 0xC5, 0xC3, 0xB5, 0x82, 0xB5, 0x83, /* 0xDC-0xDF */ + 0xC5, 0xC4, 0xB5, 0x84, 0xB5, 0x85, 0xB5, 0x86, /* 0xE0-0xE3 */ + 0xC5, 0xC5, 0xB5, 0x87, 0xB5, 0x88, 0xB5, 0x89, /* 0xE4-0xE7 */ + 0xB5, 0x8A, 0xB5, 0x8B, 0xB5, 0x8C, 0xB5, 0x8D, /* 0xE8-0xEB */ + 0xC5, 0xC6, 0xC5, 0xC7, 0xB5, 0x8E, 0xC5, 0xC8, /* 0xEC-0xEF */ + 0xC5, 0xC9, 0xC5, 0xCA, 0xB5, 0x8F, 0xB5, 0x90, /* 0xF0-0xF3 */ + 0xB5, 0x91, 0xB5, 0x92, 0xB5, 0x93, 0xB5, 0x94, /* 0xF4-0xF7 */ + 0xC5, 0xCB, 0xB5, 0x95, 0xB5, 0x96, 0xB5, 0x97, /* 0xF8-0xFB */ + 0xB5, 0x98, 0xB5, 0x99, 0xB5, 0x9A, 0xB5, 0x9B, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D1[512] = { + 0xB5, 0x9C, 0xB5, 0x9D, 0xB5, 0x9E, 0xB5, 0x9F, /* 0x00-0x03 */ + 0xB5, 0xA0, 0xB6, 0x41, 0xB6, 0x42, 0xB6, 0x43, /* 0x04-0x07 */ + 0xB6, 0x44, 0xB6, 0x45, 0xB6, 0x46, 0xB6, 0x47, /* 0x08-0x0B */ + 0xB6, 0x48, 0xC5, 0xCC, 0xB6, 0x49, 0xB6, 0x4A, /* 0x0C-0x0F */ + 0xB6, 0x4B, 0xB6, 0x4C, 0xB6, 0x4D, 0xB6, 0x4E, /* 0x10-0x13 */ + 0xB6, 0x4F, 0xB6, 0x50, 0xB6, 0x51, 0xB6, 0x52, /* 0x14-0x17 */ + 0xB6, 0x53, 0xB6, 0x54, 0xB6, 0x55, 0xB6, 0x56, /* 0x18-0x1B */ + 0xB6, 0x57, 0xB6, 0x58, 0xB6, 0x59, 0xB6, 0x5A, /* 0x1C-0x1F */ + 0xB6, 0x61, 0xB6, 0x62, 0xB6, 0x63, 0xB6, 0x64, /* 0x20-0x23 */ + 0xB6, 0x65, 0xB6, 0x66, 0xB6, 0x67, 0xB6, 0x68, /* 0x24-0x27 */ + 0xB6, 0x69, 0xB6, 0x6A, 0xB6, 0x6B, 0xB6, 0x6C, /* 0x28-0x2B */ + 0xB6, 0x6D, 0xB6, 0x6E, 0xB6, 0x6F, 0xB6, 0x70, /* 0x2C-0x2F */ + 0xC5, 0xCD, 0xC5, 0xCE, 0xB6, 0x71, 0xB6, 0x72, /* 0x30-0x33 */ + 0xC5, 0xCF, 0xB6, 0x73, 0xB6, 0x74, 0xB6, 0x75, /* 0x34-0x37 */ + 0xC5, 0xD0, 0xB6, 0x76, 0xC5, 0xD1, 0xB6, 0x77, /* 0x38-0x3B */ + 0xB6, 0x78, 0xB6, 0x79, 0xB6, 0x7A, 0xB6, 0x81, /* 0x3C-0x3F */ + 0xC5, 0xD2, 0xC5, 0xD3, 0xB6, 0x82, 0xC5, 0xD4, /* 0x40-0x43 */ + 0xC5, 0xD5, 0xC5, 0xD6, 0xB6, 0x83, 0xB6, 0x84, /* 0x44-0x47 */ + 0xB6, 0x85, 0xB6, 0x86, 0xB6, 0x87, 0xB6, 0x88, /* 0x48-0x4B */ + 0xC5, 0xD7, 0xC5, 0xD8, 0xB6, 0x89, 0xB6, 0x8A, /* 0x4C-0x4F */ + 0xC5, 0xD9, 0xB6, 0x8B, 0xB6, 0x8C, 0xB6, 0x8D, /* 0x50-0x53 */ + 0xC5, 0xDA, 0xB6, 0x8E, 0xB6, 0x8F, 0xB6, 0x90, /* 0x54-0x57 */ + 0xB6, 0x91, 0xB6, 0x92, 0xB6, 0x93, 0xB6, 0x94, /* 0x58-0x5B */ + 0xC5, 0xDB, 0xC5, 0xDC, 0xB6, 0x95, 0xC5, 0xDD, /* 0x5C-0x5F */ + 0xB6, 0x96, 0xC5, 0xDE, 0xB6, 0x97, 0xB6, 0x98, /* 0x60-0x63 */ + 0xB6, 0x99, 0xB6, 0x9A, 0xB6, 0x9B, 0xB6, 0x9C, /* 0x64-0x67 */ + 0xC5, 0xDF, 0xB6, 0x9D, 0xB6, 0x9E, 0xB6, 0x9F, /* 0x68-0x6B */ + 0xC5, 0xE0, 0xB6, 0xA0, 0xB7, 0x41, 0xB7, 0x42, /* 0x6C-0x6F */ + 0xB7, 0x43, 0xB7, 0x44, 0xB7, 0x45, 0xB7, 0x46, /* 0x70-0x73 */ + 0xB7, 
0x47, 0xB7, 0x48, 0xB7, 0x49, 0xB7, 0x4A, /* 0x74-0x77 */ + 0xB7, 0x4B, 0xB7, 0x4C, 0xB7, 0x4D, 0xB7, 0x4E, /* 0x78-0x7B */ + 0xC5, 0xE1, 0xB7, 0x4F, 0xB7, 0x50, 0xB7, 0x51, /* 0x7C-0x7F */ + + 0xB7, 0x52, 0xB7, 0x53, 0xB7, 0x54, 0xB7, 0x55, /* 0x80-0x83 */ + 0xC5, 0xE2, 0xB7, 0x56, 0xB7, 0x57, 0xB7, 0x58, /* 0x84-0x87 */ + 0xC5, 0xE3, 0xB7, 0x59, 0xB7, 0x5A, 0xB7, 0x61, /* 0x88-0x8B */ + 0xB7, 0x62, 0xB7, 0x63, 0xB7, 0x64, 0xB7, 0x65, /* 0x8C-0x8F */ + 0xB7, 0x66, 0xB7, 0x67, 0xB7, 0x68, 0xB7, 0x69, /* 0x90-0x93 */ + 0xB7, 0x6A, 0xB7, 0x6B, 0xB7, 0x6C, 0xB7, 0x6D, /* 0x94-0x97 */ + 0xB7, 0x6E, 0xB7, 0x6F, 0xB7, 0x70, 0xB7, 0x71, /* 0x98-0x9B */ + 0xB7, 0x72, 0xB7, 0x73, 0xB7, 0x74, 0xB7, 0x75, /* 0x9C-0x9F */ + 0xC5, 0xE4, 0xC5, 0xE5, 0xB7, 0x76, 0xB7, 0x77, /* 0xA0-0xA3 */ + 0xC5, 0xE6, 0xB7, 0x78, 0xB7, 0x79, 0xB7, 0x7A, /* 0xA4-0xA7 */ + 0xC5, 0xE7, 0xB7, 0x81, 0xB7, 0x82, 0xB7, 0x83, /* 0xA8-0xAB */ + 0xB7, 0x84, 0xB7, 0x85, 0xB7, 0x86, 0xB7, 0x87, /* 0xAC-0xAF */ + 0xC5, 0xE8, 0xC5, 0xE9, 0xB7, 0x88, 0xC5, 0xEA, /* 0xB0-0xB3 */ + 0xB7, 0x89, 0xC5, 0xEB, 0xB7, 0x8A, 0xB7, 0x8B, /* 0xB4-0xB7 */ + 0xB7, 0x8C, 0xB7, 0x8D, 0xC5, 0xEC, 0xB7, 0x8E, /* 0xB8-0xBB */ + 0xC5, 0xED, 0xB7, 0x8F, 0xB7, 0x90, 0xB7, 0x91, /* 0xBC-0xBF */ + 0xC5, 0xEE, 0xB7, 0x92, 0xB7, 0x93, 0xB7, 0x94, /* 0xC0-0xC3 */ + 0xB7, 0x95, 0xB7, 0x96, 0xB7, 0x97, 0xB7, 0x98, /* 0xC4-0xC7 */ + 0xB7, 0x99, 0xB7, 0x9A, 0xB7, 0x9B, 0xB7, 0x9C, /* 0xC8-0xCB */ + 0xB7, 0x9D, 0xB7, 0x9E, 0xB7, 0x9F, 0xB7, 0xA0, /* 0xCC-0xCF */ + 0xB8, 0x41, 0xB8, 0x42, 0xB8, 0x43, 0xB8, 0x44, /* 0xD0-0xD3 */ + 0xB8, 0x45, 0xB8, 0x46, 0xB8, 0x47, 0xB8, 0x48, /* 0xD4-0xD7 */ + 0xC5, 0xEF, 0xB8, 0x49, 0xB8, 0x4A, 0xB8, 0x4B, /* 0xD8-0xDB */ + 0xB8, 0x4C, 0xB8, 0x4D, 0xB8, 0x4E, 0xB8, 0x4F, /* 0xDC-0xDF */ + 0xB8, 0x50, 0xB8, 0x51, 0xB8, 0x52, 0xB8, 0x53, /* 0xE0-0xE3 */ + 0xB8, 0x54, 0xB8, 0x55, 0xB8, 0x56, 0xB8, 0x57, /* 0xE4-0xE7 */ + 0xB8, 0x58, 0xB8, 0x59, 0xB8, 0x5A, 0xB8, 0x61, /* 0xE8-0xEB */ + 0xB8, 0x62, 0xB8, 0x63, 0xB8, 0x64, 0xB8, 0x65, /* 0xEC-0xEF */ + 0xB8, 0x66, 0xB8, 0x67, 0xB8, 0x68, 0xB8, 0x69, /* 0xF0-0xF3 */ + 0xC5, 0xF0, 0xB8, 0x6A, 0xB8, 0x6B, 0xB8, 0x6C, /* 0xF4-0xF7 */ + 0xC5, 0xF1, 0xB8, 0x6D, 0xB8, 0x6E, 0xB8, 0x6F, /* 0xF8-0xFB */ + 0xB8, 0x70, 0xB8, 0x71, 0xB8, 0x72, 0xB8, 0x73, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D2[512] = { + 0xB8, 0x74, 0xB8, 0x75, 0xB8, 0x76, 0xB8, 0x77, /* 0x00-0x03 */ + 0xB8, 0x78, 0xB8, 0x79, 0xB8, 0x7A, 0xC5, 0xF2, /* 0x04-0x07 */ + 0xB8, 0x81, 0xC5, 0xF3, 0xB8, 0x82, 0xB8, 0x83, /* 0x08-0x0B */ + 0xB8, 0x84, 0xB8, 0x85, 0xB8, 0x86, 0xB8, 0x87, /* 0x0C-0x0F */ + 0xC5, 0xF4, 0xB8, 0x88, 0xB8, 0x89, 0xB8, 0x8A, /* 0x10-0x13 */ + 0xB8, 0x8B, 0xB8, 0x8C, 0xB8, 0x8D, 0xB8, 0x8E, /* 0x14-0x17 */ + 0xB8, 0x8F, 0xB8, 0x90, 0xB8, 0x91, 0xB8, 0x92, /* 0x18-0x1B */ + 0xB8, 0x93, 0xB8, 0x94, 0xB8, 0x95, 0xB8, 0x96, /* 0x1C-0x1F */ + 0xB8, 0x97, 0xB8, 0x98, 0xB8, 0x99, 0xB8, 0x9A, /* 0x20-0x23 */ + 0xB8, 0x9B, 0xB8, 0x9C, 0xB8, 0x9D, 0xB8, 0x9E, /* 0x24-0x27 */ + 0xB8, 0x9F, 0xB8, 0xA0, 0xB9, 0x41, 0xB9, 0x42, /* 0x28-0x2B */ + 0xC5, 0xF5, 0xC5, 0xF6, 0xB9, 0x43, 0xB9, 0x44, /* 0x2C-0x2F */ + 0xC5, 0xF7, 0xB9, 0x45, 0xB9, 0x46, 0xB9, 0x47, /* 0x30-0x33 */ + 0xC5, 0xF8, 0xB9, 0x48, 0xB9, 0x49, 0xB9, 0x4A, /* 0x34-0x37 */ + 0xB9, 0x4B, 0xB9, 0x4C, 0xB9, 0x4D, 0xB9, 0x4E, /* 0x38-0x3B */ + 0xC5, 0xF9, 0xC5, 0xFA, 0xB9, 0x4F, 0xC5, 0xFB, /* 0x3C-0x3F */ + 0xB9, 0x50, 0xC5, 0xFC, 0xB9, 0x51, 0xB9, 0x52, /* 0x40-0x43 */ + 0xB9, 0x53, 0xB9, 0x54, 0xB9, 0x55, 0xB9, 0x56, /* 0x44-0x47 */ + 0xC5, 
0xFD, 0xB9, 0x57, 0xB9, 0x58, 0xB9, 0x59, /* 0x48-0x4B */ + 0xB9, 0x5A, 0xB9, 0x61, 0xB9, 0x62, 0xB9, 0x63, /* 0x4C-0x4F */ + 0xB9, 0x64, 0xB9, 0x65, 0xB9, 0x66, 0xB9, 0x67, /* 0x50-0x53 */ + 0xB9, 0x68, 0xB9, 0x69, 0xB9, 0x6A, 0xB9, 0x6B, /* 0x54-0x57 */ + 0xB9, 0x6C, 0xB9, 0x6D, 0xB9, 0x6E, 0xB9, 0x6F, /* 0x58-0x5B */ + 0xC5, 0xFE, 0xB9, 0x70, 0xB9, 0x71, 0xB9, 0x72, /* 0x5C-0x5F */ + 0xB9, 0x73, 0xB9, 0x74, 0xB9, 0x75, 0xB9, 0x76, /* 0x60-0x63 */ + 0xC6, 0xA1, 0xB9, 0x77, 0xB9, 0x78, 0xB9, 0x79, /* 0x64-0x67 */ + 0xB9, 0x7A, 0xB9, 0x81, 0xB9, 0x82, 0xB9, 0x83, /* 0x68-0x6B */ + 0xB9, 0x84, 0xB9, 0x85, 0xB9, 0x86, 0xB9, 0x87, /* 0x6C-0x6F */ + 0xB9, 0x88, 0xB9, 0x89, 0xB9, 0x8A, 0xB9, 0x8B, /* 0x70-0x73 */ + 0xB9, 0x8C, 0xB9, 0x8D, 0xB9, 0x8E, 0xB9, 0x8F, /* 0x74-0x77 */ + 0xB9, 0x90, 0xB9, 0x91, 0xB9, 0x92, 0xB9, 0x93, /* 0x78-0x7B */ + 0xB9, 0x94, 0xB9, 0x95, 0xB9, 0x96, 0xB9, 0x97, /* 0x7C-0x7F */ + + 0xC6, 0xA2, 0xC6, 0xA3, 0xB9, 0x98, 0xB9, 0x99, /* 0x80-0x83 */ + 0xC6, 0xA4, 0xB9, 0x9A, 0xB9, 0x9B, 0xB9, 0x9C, /* 0x84-0x87 */ + 0xC6, 0xA5, 0xB9, 0x9D, 0xB9, 0x9E, 0xB9, 0x9F, /* 0x88-0x8B */ + 0xB9, 0xA0, 0xBA, 0x41, 0xBA, 0x42, 0xBA, 0x43, /* 0x8C-0x8F */ + 0xC6, 0xA6, 0xC6, 0xA7, 0xBA, 0x44, 0xBA, 0x45, /* 0x90-0x93 */ + 0xBA, 0x46, 0xC6, 0xA8, 0xBA, 0x47, 0xBA, 0x48, /* 0x94-0x97 */ + 0xBA, 0x49, 0xBA, 0x4A, 0xBA, 0x4B, 0xBA, 0x4C, /* 0x98-0x9B */ + 0xC6, 0xA9, 0xBA, 0x4D, 0xBA, 0x4E, 0xBA, 0x4F, /* 0x9C-0x9F */ + 0xC6, 0xAA, 0xBA, 0x50, 0xBA, 0x51, 0xBA, 0x52, /* 0xA0-0xA3 */ + 0xC6, 0xAB, 0xBA, 0x53, 0xBA, 0x54, 0xBA, 0x55, /* 0xA4-0xA7 */ + 0xBA, 0x56, 0xBA, 0x57, 0xBA, 0x58, 0xBA, 0x59, /* 0xA8-0xAB */ + 0xC6, 0xAC, 0xBA, 0x5A, 0xBA, 0x61, 0xBA, 0x62, /* 0xAC-0xAF */ + 0xBA, 0x63, 0xC6, 0xAD, 0xBA, 0x64, 0xBA, 0x65, /* 0xB0-0xB3 */ + 0xBA, 0x66, 0xBA, 0x67, 0xBA, 0x68, 0xBA, 0x69, /* 0xB4-0xB7 */ + 0xC6, 0xAE, 0xC6, 0xAF, 0xBA, 0x6A, 0xBA, 0x6B, /* 0xB8-0xBB */ + 0xC6, 0xB0, 0xBA, 0x6C, 0xBA, 0x6D, 0xC6, 0xB1, /* 0xBC-0xBF */ + 0xC6, 0xB2, 0xBA, 0x6E, 0xC6, 0xB3, 0xBA, 0x6F, /* 0xC0-0xC3 */ + 0xBA, 0x70, 0xBA, 0x71, 0xBA, 0x72, 0xBA, 0x73, /* 0xC4-0xC7 */ + 0xC6, 0xB4, 0xC6, 0xB5, 0xBA, 0x74, 0xC6, 0xB6, /* 0xC8-0xCB */ + 0xBA, 0x75, 0xBA, 0x76, 0xBA, 0x77, 0xBA, 0x78, /* 0xCC-0xCF */ + 0xBA, 0x79, 0xBA, 0x7A, 0xBA, 0x81, 0xBA, 0x82, /* 0xD0-0xD3 */ + 0xC6, 0xB7, 0xBA, 0x83, 0xBA, 0x84, 0xBA, 0x85, /* 0xD4-0xD7 */ + 0xC6, 0xB8, 0xBA, 0x86, 0xBA, 0x87, 0xBA, 0x88, /* 0xD8-0xDB */ + 0xC6, 0xB9, 0xBA, 0x89, 0xBA, 0x8A, 0xBA, 0x8B, /* 0xDC-0xDF */ + 0xBA, 0x8C, 0xBA, 0x8D, 0xBA, 0x8E, 0xBA, 0x8F, /* 0xE0-0xE3 */ + 0xC6, 0xBA, 0xC6, 0xBB, 0xBA, 0x90, 0xBA, 0x91, /* 0xE4-0xE7 */ + 0xBA, 0x92, 0xBA, 0x93, 0xBA, 0x94, 0xBA, 0x95, /* 0xE8-0xEB */ + 0xBA, 0x96, 0xBA, 0x97, 0xBA, 0x98, 0xBA, 0x99, /* 0xEC-0xEF */ + 0xC6, 0xBC, 0xC6, 0xBD, 0xBA, 0x9A, 0xBA, 0x9B, /* 0xF0-0xF3 */ + 0xC6, 0xBE, 0xBA, 0x9C, 0xBA, 0x9D, 0xBA, 0x9E, /* 0xF4-0xF7 */ + 0xC6, 0xBF, 0xBA, 0x9F, 0xBA, 0xA0, 0xBB, 0x41, /* 0xF8-0xFB */ + 0xBB, 0x42, 0xBB, 0x43, 0xBB, 0x44, 0xBB, 0x45, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D3[512] = { + 0xC6, 0xC0, 0xC6, 0xC1, 0xBB, 0x46, 0xC6, 0xC2, /* 0x00-0x03 */ + 0xBB, 0x47, 0xC6, 0xC3, 0xBB, 0x48, 0xBB, 0x49, /* 0x04-0x07 */ + 0xBB, 0x4A, 0xBB, 0x4B, 0xBB, 0x4C, 0xBB, 0x4D, /* 0x08-0x0B */ + 0xC6, 0xC4, 0xC6, 0xC5, 0xC6, 0xC6, 0xBB, 0x4E, /* 0x0C-0x0F */ + 0xC6, 0xC7, 0xBB, 0x4F, 0xBB, 0x50, 0xBB, 0x51, /* 0x10-0x13 */ + 0xC6, 0xC8, 0xBB, 0x52, 0xC6, 0xC9, 0xBB, 0x53, /* 0x14-0x17 */ + 0xBB, 0x54, 0xBB, 0x55, 0xBB, 0x56, 0xBB, 0x57, /* 0x18-0x1B */ + 0xC6, 
0xCA, 0xC6, 0xCB, 0xBB, 0x58, 0xC6, 0xCC, /* 0x1C-0x1F */ + 0xC6, 0xCD, 0xC6, 0xCE, 0xBB, 0x59, 0xBB, 0x5A, /* 0x20-0x23 */ + 0xBB, 0x61, 0xC6, 0xCF, 0xBB, 0x62, 0xBB, 0x63, /* 0x24-0x27 */ + 0xC6, 0xD0, 0xC6, 0xD1, 0xBB, 0x64, 0xBB, 0x65, /* 0x28-0x2B */ + 0xC6, 0xD2, 0xBB, 0x66, 0xBB, 0x67, 0xBB, 0x68, /* 0x2C-0x2F */ + 0xC6, 0xD3, 0xBB, 0x69, 0xBB, 0x6A, 0xBB, 0x6B, /* 0x30-0x33 */ + 0xBB, 0x6C, 0xBB, 0x6D, 0xBB, 0x6E, 0xBB, 0x6F, /* 0x34-0x37 */ + 0xC6, 0xD4, 0xC6, 0xD5, 0xBB, 0x70, 0xC6, 0xD6, /* 0x38-0x3B */ + 0xC6, 0xD7, 0xC6, 0xD8, 0xBB, 0x71, 0xBB, 0x72, /* 0x3C-0x3F */ + 0xBB, 0x73, 0xBB, 0x74, 0xBB, 0x75, 0xBB, 0x76, /* 0x40-0x43 */ + 0xC6, 0xD9, 0xC6, 0xDA, 0xBB, 0x77, 0xBB, 0x78, /* 0x44-0x47 */ + 0xBB, 0x79, 0xBB, 0x7A, 0xBB, 0x81, 0xBB, 0x82, /* 0x48-0x4B */ + 0xBB, 0x83, 0xBB, 0x84, 0xBB, 0x85, 0xBB, 0x86, /* 0x4C-0x4F */ + 0xBB, 0x87, 0xBB, 0x88, 0xBB, 0x89, 0xBB, 0x8A, /* 0x50-0x53 */ + 0xBB, 0x8B, 0xBB, 0x8C, 0xBB, 0x8D, 0xBB, 0x8E, /* 0x54-0x57 */ + 0xBB, 0x8F, 0xBB, 0x90, 0xBB, 0x91, 0xBB, 0x92, /* 0x58-0x5B */ + 0xBB, 0x93, 0xBB, 0x94, 0xBB, 0x95, 0xBB, 0x96, /* 0x5C-0x5F */ + 0xBB, 0x97, 0xBB, 0x98, 0xBB, 0x99, 0xBB, 0x9A, /* 0x60-0x63 */ + 0xBB, 0x9B, 0xBB, 0x9C, 0xBB, 0x9D, 0xBB, 0x9E, /* 0x64-0x67 */ + 0xBB, 0x9F, 0xBB, 0xA0, 0xBC, 0x41, 0xBC, 0x42, /* 0x68-0x6B */ + 0xBC, 0x43, 0xBC, 0x44, 0xBC, 0x45, 0xBC, 0x46, /* 0x6C-0x6F */ + 0xBC, 0x47, 0xBC, 0x48, 0xBC, 0x49, 0xBC, 0x4A, /* 0x70-0x73 */ + 0xBC, 0x4B, 0xBC, 0x4C, 0xBC, 0x4D, 0xBC, 0x4E, /* 0x74-0x77 */ + 0xBC, 0x4F, 0xBC, 0x50, 0xBC, 0x51, 0xBC, 0x52, /* 0x78-0x7B */ + 0xC6, 0xDB, 0xC6, 0xDC, 0xBC, 0x53, 0xBC, 0x54, /* 0x7C-0x7F */ + + 0xC6, 0xDD, 0xBC, 0x55, 0xBC, 0x56, 0xBC, 0x57, /* 0x80-0x83 */ + 0xC6, 0xDE, 0xBC, 0x58, 0xBC, 0x59, 0xBC, 0x5A, /* 0x84-0x87 */ + 0xBC, 0x61, 0xBC, 0x62, 0xBC, 0x63, 0xBC, 0x64, /* 0x88-0x8B */ + 0xC6, 0xDF, 0xC6, 0xE0, 0xBC, 0x65, 0xC6, 0xE1, /* 0x8C-0x8F */ + 0xC6, 0xE2, 0xC6, 0xE3, 0xBC, 0x66, 0xBC, 0x67, /* 0x90-0x93 */ + 0xBC, 0x68, 0xBC, 0x69, 0xBC, 0x6A, 0xBC, 0x6B, /* 0x94-0x97 */ + 0xC6, 0xE4, 0xC6, 0xE5, 0xBC, 0x6C, 0xBC, 0x6D, /* 0x98-0x9B */ + 0xC6, 0xE6, 0xBC, 0x6E, 0xBC, 0x6F, 0xBC, 0x70, /* 0x9C-0x9F */ + 0xC6, 0xE7, 0xBC, 0x71, 0xBC, 0x72, 0xBC, 0x73, /* 0xA0-0xA3 */ + 0xBC, 0x74, 0xBC, 0x75, 0xBC, 0x76, 0xBC, 0x77, /* 0xA4-0xA7 */ + 0xC6, 0xE8, 0xC6, 0xE9, 0xBC, 0x78, 0xC6, 0xEA, /* 0xA8-0xAB */ + 0xBC, 0x79, 0xC6, 0xEB, 0xBC, 0x7A, 0xBC, 0x81, /* 0xAC-0xAF */ + 0xBC, 0x82, 0xBC, 0x83, 0xBC, 0x84, 0xBC, 0x85, /* 0xB0-0xB3 */ + 0xC6, 0xEC, 0xBC, 0x86, 0xBC, 0x87, 0xBC, 0x88, /* 0xB4-0xB7 */ + 0xC6, 0xED, 0xBC, 0x89, 0xBC, 0x8A, 0xBC, 0x8B, /* 0xB8-0xBB */ + 0xC6, 0xEE, 0xBC, 0x8C, 0xBC, 0x8D, 0xBC, 0x8E, /* 0xBC-0xBF */ + 0xBC, 0x8F, 0xBC, 0x90, 0xBC, 0x91, 0xBC, 0x92, /* 0xC0-0xC3 */ + 0xC6, 0xEF, 0xC6, 0xF0, 0xBC, 0x93, 0xBC, 0x94, /* 0xC4-0xC7 */ + 0xC6, 0xF1, 0xC6, 0xF2, 0xBC, 0x95, 0xBC, 0x96, /* 0xC8-0xCB */ + 0xBC, 0x97, 0xBC, 0x98, 0xBC, 0x99, 0xBC, 0x9A, /* 0xCC-0xCF */ + 0xC6, 0xF3, 0xBC, 0x9B, 0xBC, 0x9C, 0xBC, 0x9D, /* 0xD0-0xD3 */ + 0xBC, 0x9E, 0xBC, 0x9F, 0xBC, 0xA0, 0xBD, 0x41, /* 0xD4-0xD7 */ + 0xC6, 0xF4, 0xBD, 0x42, 0xBD, 0x43, 0xBD, 0x44, /* 0xD8-0xDB */ + 0xBD, 0x45, 0xBD, 0x46, 0xBD, 0x47, 0xBD, 0x48, /* 0xDC-0xDF */ + 0xBD, 0x49, 0xC6, 0xF5, 0xBD, 0x4A, 0xC6, 0xF6, /* 0xE0-0xE3 */ + 0xBD, 0x4B, 0xBD, 0x4C, 0xBD, 0x4D, 0xBD, 0x4E, /* 0xE4-0xE7 */ + 0xBD, 0x4F, 0xBD, 0x50, 0xBD, 0x51, 0xBD, 0x52, /* 0xE8-0xEB */ + 0xC6, 0xF7, 0xC6, 0xF8, 0xBD, 0x53, 0xBD, 0x54, /* 0xEC-0xEF */ + 0xC6, 0xF9, 0xBD, 0x55, 0xBD, 0x56, 0xBD, 0x57, /* 0xF0-0xF3 
*/ + 0xC6, 0xFA, 0xBD, 0x58, 0xBD, 0x59, 0xBD, 0x5A, /* 0xF4-0xF7 */ + 0xBD, 0x61, 0xBD, 0x62, 0xBD, 0x63, 0xBD, 0x64, /* 0xF8-0xFB */ + 0xC6, 0xFB, 0xC6, 0xFC, 0xBD, 0x65, 0xC6, 0xFD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D4[512] = { + 0xBD, 0x66, 0xC6, 0xFE, 0xBD, 0x67, 0xBD, 0x68, /* 0x00-0x03 */ + 0xBD, 0x69, 0xBD, 0x6A, 0xBD, 0x6B, 0xBD, 0x6C, /* 0x04-0x07 */ + 0xC7, 0xA1, 0xBD, 0x6D, 0xBD, 0x6E, 0xBD, 0x6F, /* 0x08-0x0B */ + 0xBD, 0x70, 0xBD, 0x71, 0xBD, 0x72, 0xBD, 0x73, /* 0x0C-0x0F */ + 0xBD, 0x74, 0xBD, 0x75, 0xBD, 0x76, 0xBD, 0x77, /* 0x10-0x13 */ + 0xBD, 0x78, 0xBD, 0x79, 0xBD, 0x7A, 0xBD, 0x81, /* 0x14-0x17 */ + 0xBD, 0x82, 0xBD, 0x83, 0xBD, 0x84, 0xBD, 0x85, /* 0x18-0x1B */ + 0xBD, 0x86, 0xC7, 0xA2, 0xBD, 0x87, 0xBD, 0x88, /* 0x1C-0x1F */ + 0xBD, 0x89, 0xBD, 0x8A, 0xBD, 0x8B, 0xBD, 0x8C, /* 0x20-0x23 */ + 0xBD, 0x8D, 0xBD, 0x8E, 0xBD, 0x8F, 0xBD, 0x90, /* 0x24-0x27 */ + 0xBD, 0x91, 0xBD, 0x92, 0xBD, 0x93, 0xBD, 0x94, /* 0x28-0x2B */ + 0xBD, 0x95, 0xBD, 0x96, 0xBD, 0x97, 0xBD, 0x98, /* 0x2C-0x2F */ + 0xBD, 0x99, 0xBD, 0x9A, 0xBD, 0x9B, 0xBD, 0x9C, /* 0x30-0x33 */ + 0xBD, 0x9D, 0xBD, 0x9E, 0xBD, 0x9F, 0xBD, 0xA0, /* 0x34-0x37 */ + 0xBE, 0x41, 0xBE, 0x42, 0xBE, 0x43, 0xBE, 0x44, /* 0x38-0x3B */ + 0xBE, 0x45, 0xBE, 0x46, 0xBE, 0x47, 0xBE, 0x48, /* 0x3C-0x3F */ + 0xC7, 0xA3, 0xBE, 0x49, 0xBE, 0x4A, 0xBE, 0x4B, /* 0x40-0x43 */ + 0xC7, 0xA4, 0xBE, 0x4C, 0xBE, 0x4D, 0xBE, 0x4E, /* 0x44-0x47 */ + 0xBE, 0x4F, 0xBE, 0x50, 0xBE, 0x51, 0xBE, 0x52, /* 0x48-0x4B */ + 0xBE, 0x53, 0xBE, 0x54, 0xBE, 0x55, 0xBE, 0x56, /* 0x4C-0x4F */ + 0xBE, 0x57, 0xBE, 0x58, 0xBE, 0x59, 0xBE, 0x5A, /* 0x50-0x53 */ + 0xBE, 0x61, 0xBE, 0x62, 0xBE, 0x63, 0xBE, 0x64, /* 0x54-0x57 */ + 0xBE, 0x65, 0xBE, 0x66, 0xBE, 0x67, 0xBE, 0x68, /* 0x58-0x5B */ + 0xC7, 0xA5, 0xBE, 0x69, 0xBE, 0x6A, 0xBE, 0x6B, /* 0x5C-0x5F */ + 0xC7, 0xA6, 0xBE, 0x6C, 0xBE, 0x6D, 0xBE, 0x6E, /* 0x60-0x63 */ + 0xC7, 0xA7, 0xBE, 0x6F, 0xBE, 0x70, 0xBE, 0x71, /* 0x64-0x67 */ + 0xBE, 0x72, 0xBE, 0x73, 0xBE, 0x74, 0xBE, 0x75, /* 0x68-0x6B */ + 0xBE, 0x76, 0xC7, 0xA8, 0xBE, 0x77, 0xC7, 0xA9, /* 0x6C-0x6F */ + 0xBE, 0x78, 0xBE, 0x79, 0xBE, 0x7A, 0xBE, 0x81, /* 0x70-0x73 */ + 0xBE, 0x82, 0xBE, 0x83, 0xBE, 0x84, 0xBE, 0x85, /* 0x74-0x77 */ + 0xC7, 0xAA, 0xC7, 0xAB, 0xBE, 0x86, 0xBE, 0x87, /* 0x78-0x7B */ + 0xC7, 0xAC, 0xBE, 0x88, 0xBE, 0x89, 0xC7, 0xAD, /* 0x7C-0x7F */ + + 0xC7, 0xAE, 0xBE, 0x8A, 0xC7, 0xAF, 0xBE, 0x8B, /* 0x80-0x83 */ + 0xBE, 0x8C, 0xBE, 0x8D, 0xBE, 0x8E, 0xBE, 0x8F, /* 0x84-0x87 */ + 0xC7, 0xB0, 0xC7, 0xB1, 0xBE, 0x90, 0xC7, 0xB2, /* 0x88-0x8B */ + 0xBE, 0x91, 0xC7, 0xB3, 0xBE, 0x92, 0xBE, 0x93, /* 0x8C-0x8F */ + 0xBE, 0x94, 0xBE, 0x95, 0xBE, 0x96, 0xBE, 0x97, /* 0x90-0x93 */ + 0xC7, 0xB4, 0xBE, 0x98, 0xBE, 0x99, 0xBE, 0x9A, /* 0x94-0x97 */ + 0xBE, 0x9B, 0xBE, 0x9C, 0xBE, 0x9D, 0xBE, 0x9E, /* 0x98-0x9B */ + 0xBE, 0x9F, 0xBE, 0xA0, 0xBF, 0x41, 0xBF, 0x42, /* 0x9C-0x9F */ + 0xBF, 0x43, 0xBF, 0x44, 0xBF, 0x45, 0xBF, 0x46, /* 0xA0-0xA3 */ + 0xBF, 0x47, 0xBF, 0x48, 0xBF, 0x49, 0xBF, 0x4A, /* 0xA4-0xA7 */ + 0xBF, 0x4B, 0xC7, 0xB5, 0xBF, 0x4C, 0xBF, 0x4D, /* 0xA8-0xAB */ + 0xBF, 0x4E, 0xBF, 0x4F, 0xBF, 0x50, 0xBF, 0x51, /* 0xAC-0xAF */ + 0xBF, 0x52, 0xBF, 0x53, 0xBF, 0x54, 0xBF, 0x55, /* 0xB0-0xB3 */ + 0xBF, 0x56, 0xBF, 0x57, 0xBF, 0x58, 0xBF, 0x59, /* 0xB4-0xB7 */ + 0xBF, 0x5A, 0xBF, 0x61, 0xBF, 0x62, 0xBF, 0x63, /* 0xB8-0xBB */ + 0xBF, 0x64, 0xBF, 0x65, 0xBF, 0x66, 0xBF, 0x67, /* 0xBC-0xBF */ + 0xBF, 0x68, 0xBF, 0x69, 0xBF, 0x6A, 0xBF, 0x6B, /* 0xC0-0xC3 */ + 0xBF, 0x6C, 0xBF, 0x6D, 0xBF, 0x6E, 0xBF, 0x6F, /* 0xC4-0xC7 */ + 
0xBF, 0x70, 0xBF, 0x71, 0xBF, 0x72, 0xBF, 0x73, /* 0xC8-0xCB */ + 0xC7, 0xB6, 0xBF, 0x74, 0xBF, 0x75, 0xBF, 0x76, /* 0xCC-0xCF */ + 0xC7, 0xB7, 0xBF, 0x77, 0xBF, 0x78, 0xBF, 0x79, /* 0xD0-0xD3 */ + 0xC7, 0xB8, 0xBF, 0x7A, 0xBF, 0x81, 0xBF, 0x82, /* 0xD4-0xD7 */ + 0xBF, 0x83, 0xBF, 0x84, 0xBF, 0x85, 0xBF, 0x86, /* 0xD8-0xDB */ + 0xC7, 0xB9, 0xBF, 0x87, 0xBF, 0x88, 0xC7, 0xBA, /* 0xDC-0xDF */ + 0xBF, 0x89, 0xBF, 0x8A, 0xBF, 0x8B, 0xBF, 0x8C, /* 0xE0-0xE3 */ + 0xBF, 0x8D, 0xBF, 0x8E, 0xBF, 0x8F, 0xBF, 0x90, /* 0xE4-0xE7 */ + 0xC7, 0xBB, 0xBF, 0x91, 0xBF, 0x92, 0xBF, 0x93, /* 0xE8-0xEB */ + 0xC7, 0xBC, 0xBF, 0x94, 0xBF, 0x95, 0xBF, 0x96, /* 0xEC-0xEF */ + 0xC7, 0xBD, 0xBF, 0x97, 0xBF, 0x98, 0xBF, 0x99, /* 0xF0-0xF3 */ + 0xBF, 0x9A, 0xBF, 0x9B, 0xBF, 0x9C, 0xBF, 0x9D, /* 0xF4-0xF7 */ + 0xC7, 0xBE, 0xBF, 0x9E, 0xBF, 0x9F, 0xC7, 0xBF, /* 0xF8-0xFB */ + 0xBF, 0xA0, 0xC7, 0xC0, 0xC0, 0x41, 0xC0, 0x42, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D5[512] = { + 0xC0, 0x43, 0xC0, 0x44, 0xC0, 0x45, 0xC0, 0x46, /* 0x00-0x03 */ + 0xC7, 0xC1, 0xC0, 0x47, 0xC0, 0x48, 0xC0, 0x49, /* 0x04-0x07 */ + 0xC7, 0xC2, 0xC0, 0x4A, 0xC0, 0x4B, 0xC0, 0x4C, /* 0x08-0x0B */ + 0xC7, 0xC3, 0xC0, 0x4D, 0xC0, 0x4E, 0xC0, 0x4F, /* 0x0C-0x0F */ + 0xC0, 0x50, 0xC0, 0x51, 0xC0, 0x52, 0xC0, 0x53, /* 0x10-0x13 */ + 0xC7, 0xC4, 0xC7, 0xC5, 0xC0, 0x54, 0xC7, 0xC6, /* 0x14-0x17 */ + 0xC0, 0x55, 0xC0, 0x56, 0xC0, 0x57, 0xC0, 0x58, /* 0x18-0x1B */ + 0xC0, 0x59, 0xC0, 0x5A, 0xC0, 0x61, 0xC0, 0x62, /* 0x1C-0x1F */ + 0xC0, 0x63, 0xC0, 0x64, 0xC0, 0x65, 0xC0, 0x66, /* 0x20-0x23 */ + 0xC0, 0x67, 0xC0, 0x68, 0xC0, 0x69, 0xC0, 0x6A, /* 0x24-0x27 */ + 0xC0, 0x6B, 0xC0, 0x6C, 0xC0, 0x6D, 0xC0, 0x6E, /* 0x28-0x2B */ + 0xC0, 0x6F, 0xC0, 0x70, 0xC0, 0x71, 0xC0, 0x72, /* 0x2C-0x2F */ + 0xC0, 0x73, 0xC0, 0x74, 0xC0, 0x75, 0xC0, 0x76, /* 0x30-0x33 */ + 0xC0, 0x77, 0xC0, 0x78, 0xC0, 0x79, 0xC0, 0x7A, /* 0x34-0x37 */ + 0xC0, 0x81, 0xC0, 0x82, 0xC0, 0x83, 0xC0, 0x84, /* 0x38-0x3B */ + 0xC7, 0xC7, 0xC7, 0xC8, 0xC0, 0x85, 0xC0, 0x86, /* 0x3C-0x3F */ + 0xC7, 0xC9, 0xC0, 0x87, 0xC0, 0x88, 0xC0, 0x89, /* 0x40-0x43 */ + 0xC7, 0xCA, 0xC0, 0x8A, 0xC0, 0x8B, 0xC0, 0x8C, /* 0x44-0x47 */ + 0xC0, 0x8D, 0xC0, 0x8E, 0xC0, 0x8F, 0xC0, 0x90, /* 0x48-0x4B */ + 0xC7, 0xCB, 0xC7, 0xCC, 0xC0, 0x91, 0xC7, 0xCD, /* 0x4C-0x4F */ + 0xC0, 0x92, 0xC7, 0xCE, 0xC0, 0x93, 0xC0, 0x94, /* 0x50-0x53 */ + 0xC0, 0x95, 0xC0, 0x96, 0xC0, 0x97, 0xC0, 0x98, /* 0x54-0x57 */ + 0xC7, 0xCF, 0xC7, 0xD0, 0xC0, 0x99, 0xC0, 0x9A, /* 0x58-0x5B */ + 0xC7, 0xD1, 0xC0, 0x9B, 0xC0, 0x9C, 0xC0, 0x9D, /* 0x5C-0x5F */ + 0xC7, 0xD2, 0xC0, 0x9E, 0xC0, 0x9F, 0xC0, 0xA0, /* 0x60-0x63 */ + 0xC1, 0x41, 0xC7, 0xD3, 0xC1, 0x42, 0xC1, 0x43, /* 0x64-0x67 */ + 0xC7, 0xD4, 0xC7, 0xD5, 0xC1, 0x44, 0xC7, 0xD6, /* 0x68-0x6B */ + 0xC1, 0x45, 0xC7, 0xD7, 0xC1, 0x46, 0xC1, 0x47, /* 0x6C-0x6F */ + 0xC1, 0x48, 0xC1, 0x49, 0xC1, 0x4A, 0xC1, 0x4B, /* 0x70-0x73 */ + 0xC7, 0xD8, 0xC7, 0xD9, 0xC1, 0x4C, 0xC1, 0x4D, /* 0x74-0x77 */ + 0xC7, 0xDA, 0xC1, 0x4E, 0xC1, 0x4F, 0xC1, 0x50, /* 0x78-0x7B */ + 0xC7, 0xDB, 0xC1, 0x51, 0xC1, 0x52, 0xC1, 0x53, /* 0x7C-0x7F */ + + 0xC1, 0x54, 0xC1, 0x55, 0xC1, 0x56, 0xC1, 0x57, /* 0x80-0x83 */ + 0xC7, 0xDC, 0xC7, 0xDD, 0xC1, 0x58, 0xC7, 0xDE, /* 0x84-0x87 */ + 0xC7, 0xDF, 0xC7, 0xE0, 0xC1, 0x59, 0xC1, 0x5A, /* 0x88-0x8B */ + 0xC1, 0x61, 0xC1, 0x62, 0xC1, 0x63, 0xC1, 0x64, /* 0x8C-0x8F */ + 0xC7, 0xE1, 0xC1, 0x65, 0xC1, 0x66, 0xC1, 0x67, /* 0x90-0x93 */ + 0xC1, 0x68, 0xC1, 0x69, 0xC1, 0x6A, 0xC1, 0x6B, /* 0x94-0x97 */ + 0xC1, 0x6C, 0xC1, 0x6D, 0xC1, 0x6E, 0xC1, 0x6F, /* 0x98-0x9B */ + 
0xC1, 0x70, 0xC1, 0x71, 0xC1, 0x72, 0xC1, 0x73, /* 0x9C-0x9F */ + 0xC1, 0x74, 0xC1, 0x75, 0xC1, 0x76, 0xC1, 0x77, /* 0xA0-0xA3 */ + 0xC1, 0x78, 0xC7, 0xE2, 0xC1, 0x79, 0xC1, 0x7A, /* 0xA4-0xA7 */ + 0xC1, 0x81, 0xC1, 0x82, 0xC1, 0x83, 0xC1, 0x84, /* 0xA8-0xAB */ + 0xC1, 0x85, 0xC1, 0x86, 0xC1, 0x87, 0xC1, 0x88, /* 0xAC-0xAF */ + 0xC1, 0x89, 0xC1, 0x8A, 0xC1, 0x8B, 0xC1, 0x8C, /* 0xB0-0xB3 */ + 0xC1, 0x8D, 0xC1, 0x8E, 0xC1, 0x8F, 0xC1, 0x90, /* 0xB4-0xB7 */ + 0xC1, 0x91, 0xC1, 0x92, 0xC1, 0x93, 0xC1, 0x94, /* 0xB8-0xBB */ + 0xC1, 0x95, 0xC1, 0x96, 0xC1, 0x97, 0xC1, 0x98, /* 0xBC-0xBF */ + 0xC1, 0x99, 0xC1, 0x9A, 0xC1, 0x9B, 0xC1, 0x9C, /* 0xC0-0xC3 */ + 0xC1, 0x9D, 0xC1, 0x9E, 0xC1, 0x9F, 0xC1, 0xA0, /* 0xC4-0xC7 */ + 0xC7, 0xE3, 0xC7, 0xE4, 0xC2, 0x41, 0xC2, 0x42, /* 0xC8-0xCB */ + 0xC7, 0xE5, 0xC2, 0x43, 0xC2, 0x44, 0xC2, 0x45, /* 0xCC-0xCF */ + 0xC7, 0xE6, 0xC2, 0x46, 0xC7, 0xE7, 0xC2, 0x47, /* 0xD0-0xD3 */ + 0xC2, 0x48, 0xC2, 0x49, 0xC2, 0x4A, 0xC2, 0x4B, /* 0xD4-0xD7 */ + 0xC7, 0xE8, 0xC7, 0xE9, 0xC2, 0x4C, 0xC7, 0xEA, /* 0xD8-0xDB */ + 0xC2, 0x4D, 0xC7, 0xEB, 0xC2, 0x4E, 0xC2, 0x4F, /* 0xDC-0xDF */ + 0xC2, 0x50, 0xC2, 0x51, 0xC2, 0x52, 0xC2, 0x53, /* 0xE0-0xE3 */ + 0xC7, 0xEC, 0xC7, 0xED, 0xC2, 0x54, 0xC2, 0x55, /* 0xE4-0xE7 */ + 0xC7, 0xEE, 0xC2, 0x56, 0xC2, 0x57, 0xC2, 0x58, /* 0xE8-0xEB */ + 0xC7, 0xEF, 0xC2, 0x59, 0xC2, 0x5A, 0xC2, 0x61, /* 0xEC-0xEF */ + 0xC2, 0x62, 0xC2, 0x63, 0xC2, 0x64, 0xC2, 0x65, /* 0xF0-0xF3 */ + 0xC7, 0xF0, 0xC7, 0xF1, 0xC2, 0x66, 0xC7, 0xF2, /* 0xF4-0xF7 */ + 0xC2, 0x67, 0xC7, 0xF3, 0xC2, 0x68, 0xC2, 0x69, /* 0xF8-0xFB */ + 0xC2, 0x6A, 0xC2, 0x6B, 0xC2, 0x6C, 0xC2, 0x6D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D6[512] = { + 0xC7, 0xF4, 0xC7, 0xF5, 0xC2, 0x6E, 0xC2, 0x6F, /* 0x00-0x03 */ + 0xC7, 0xF6, 0xC2, 0x70, 0xC2, 0x71, 0xC2, 0x72, /* 0x04-0x07 */ + 0xC7, 0xF7, 0xC2, 0x73, 0xC2, 0x74, 0xC2, 0x75, /* 0x08-0x0B */ + 0xC2, 0x76, 0xC2, 0x77, 0xC2, 0x78, 0xC2, 0x79, /* 0x0C-0x0F */ + 0xC7, 0xF8, 0xC7, 0xF9, 0xC2, 0x7A, 0xC7, 0xFA, /* 0x10-0x13 */ + 0xC7, 0xFB, 0xC7, 0xFC, 0xC2, 0x81, 0xC2, 0x82, /* 0x14-0x17 */ + 0xC2, 0x83, 0xC2, 0x84, 0xC2, 0x85, 0xC2, 0x86, /* 0x18-0x1B */ + 0xC7, 0xFD, 0xC2, 0x87, 0xC2, 0x88, 0xC2, 0x89, /* 0x1C-0x1F */ + 0xC7, 0xFE, 0xC2, 0x8A, 0xC2, 0x8B, 0xC2, 0x8C, /* 0x20-0x23 */ + 0xC8, 0xA1, 0xC2, 0x8D, 0xC2, 0x8E, 0xC2, 0x8F, /* 0x24-0x27 */ + 0xC2, 0x90, 0xC2, 0x91, 0xC2, 0x92, 0xC2, 0x93, /* 0x28-0x2B */ + 0xC2, 0x94, 0xC8, 0xA2, 0xC2, 0x95, 0xC2, 0x96, /* 0x2C-0x2F */ + 0xC2, 0x97, 0xC2, 0x98, 0xC2, 0x99, 0xC2, 0x9A, /* 0x30-0x33 */ + 0xC2, 0x9B, 0xC2, 0x9C, 0xC2, 0x9D, 0xC2, 0x9E, /* 0x34-0x37 */ + 0xC8, 0xA3, 0xC8, 0xA4, 0xC2, 0x9F, 0xC2, 0xA0, /* 0x38-0x3B */ + 0xC8, 0xA5, 0xC3, 0x41, 0xC3, 0x42, 0xC3, 0x43, /* 0x3C-0x3F */ + 0xC8, 0xA6, 0xC3, 0x44, 0xC3, 0x45, 0xC3, 0x46, /* 0x40-0x43 */ + 0xC3, 0x47, 0xC8, 0xA7, 0xC3, 0x48, 0xC3, 0x49, /* 0x44-0x47 */ + 0xC8, 0xA8, 0xC8, 0xA9, 0xC3, 0x4A, 0xC8, 0xAA, /* 0x48-0x4B */ + 0xC3, 0x4B, 0xC8, 0xAB, 0xC3, 0x4C, 0xC3, 0x4D, /* 0x4C-0x4F */ + 0xC3, 0x4E, 0xC8, 0xAC, 0xC3, 0x4F, 0xC3, 0x50, /* 0x50-0x53 */ + 0xC8, 0xAD, 0xC8, 0xAE, 0xC3, 0x51, 0xC3, 0x52, /* 0x54-0x57 */ + 0xC8, 0xAF, 0xC3, 0x53, 0xC3, 0x54, 0xC3, 0x55, /* 0x58-0x5B */ + 0xC8, 0xB0, 0xC3, 0x56, 0xC3, 0x57, 0xC3, 0x58, /* 0x5C-0x5F */ + 0xC3, 0x59, 0xC3, 0x5A, 0xC3, 0x61, 0xC3, 0x62, /* 0x60-0x63 */ + 0xC3, 0x63, 0xC3, 0x64, 0xC3, 0x65, 0xC8, 0xB1, /* 0x64-0x67 */ + 0xC3, 0x66, 0xC8, 0xB2, 0xC3, 0x67, 0xC3, 0x68, /* 0x68-0x6B */ + 0xC3, 0x69, 0xC3, 0x6A, 0xC3, 0x6B, 0xC3, 0x6C, /* 0x6C-0x6F */ + 0xC8, 
0xB3, 0xC8, 0xB4, 0xC3, 0x6D, 0xC3, 0x6E, /* 0x70-0x73 */ + 0xC8, 0xB5, 0xC3, 0x6F, 0xC3, 0x70, 0xC3, 0x71, /* 0x74-0x77 */ + 0xC3, 0x72, 0xC3, 0x73, 0xC3, 0x74, 0xC3, 0x75, /* 0x78-0x7B */ + 0xC3, 0x76, 0xC3, 0x77, 0xC3, 0x78, 0xC3, 0x79, /* 0x7C-0x7F */ + + 0xC3, 0x7A, 0xC3, 0x81, 0xC3, 0x82, 0xC8, 0xB6, /* 0x80-0x83 */ + 0xC3, 0x83, 0xC8, 0xB7, 0xC3, 0x84, 0xC3, 0x85, /* 0x84-0x87 */ + 0xC3, 0x86, 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, /* 0x88-0x8B */ + 0xC8, 0xB8, 0xC8, 0xB9, 0xC3, 0x8A, 0xC3, 0x8B, /* 0x8C-0x8F */ + 0xC8, 0xBA, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, /* 0x90-0x93 */ + 0xC8, 0xBB, 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, /* 0x94-0x97 */ + 0xC3, 0x92, 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, /* 0x98-0x9B */ + 0xC3, 0x96, 0xC8, 0xBC, 0xC3, 0x97, 0xC8, 0xBD, /* 0x9C-0x9F */ + 0xC3, 0x98, 0xC8, 0xBE, 0xC3, 0x99, 0xC3, 0x9A, /* 0xA0-0xA3 */ + 0xC3, 0x9B, 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, /* 0xA4-0xA7 */ + 0xC8, 0xBF, 0xC3, 0x9F, 0xC3, 0xA0, 0xC4, 0x41, /* 0xA8-0xAB */ + 0xC8, 0xC0, 0xC4, 0x42, 0xC4, 0x43, 0xC4, 0x44, /* 0xAC-0xAF */ + 0xC8, 0xC1, 0xC4, 0x45, 0xC4, 0x46, 0xC4, 0x47, /* 0xB0-0xB3 */ + 0xC4, 0x48, 0xC4, 0x49, 0xC4, 0x4A, 0xC4, 0x4B, /* 0xB4-0xB7 */ + 0xC4, 0x4C, 0xC8, 0xC2, 0xC4, 0x4D, 0xC8, 0xC3, /* 0xB8-0xBB */ + 0xC4, 0x4E, 0xC4, 0x4F, 0xC4, 0x50, 0xC4, 0x51, /* 0xBC-0xBF */ + 0xC4, 0x52, 0xC4, 0x53, 0xC4, 0x54, 0xC4, 0x55, /* 0xC0-0xC3 */ + 0xC8, 0xC4, 0xC8, 0xC5, 0xC4, 0x56, 0xC4, 0x57, /* 0xC4-0xC7 */ + 0xC8, 0xC6, 0xC4, 0x58, 0xC4, 0x59, 0xC4, 0x5A, /* 0xC8-0xCB */ + 0xC8, 0xC7, 0xC4, 0x61, 0xC4, 0x62, 0xC4, 0x63, /* 0xCC-0xCF */ + 0xC4, 0x64, 0xC8, 0xC8, 0xC4, 0x65, 0xC4, 0x66, /* 0xD0-0xD3 */ + 0xC8, 0xC9, 0xC4, 0x67, 0xC4, 0x68, 0xC8, 0xCA, /* 0xD4-0xD7 */ + 0xC4, 0x69, 0xC8, 0xCB, 0xC4, 0x6A, 0xC4, 0x6B, /* 0xD8-0xDB */ + 0xC4, 0x6C, 0xC4, 0x6D, 0xC4, 0x6E, 0xC4, 0x6F, /* 0xDC-0xDF */ + 0xC8, 0xCC, 0xC4, 0x70, 0xC4, 0x71, 0xC4, 0x72, /* 0xE0-0xE3 */ + 0xC8, 0xCD, 0xC4, 0x73, 0xC4, 0x74, 0xC4, 0x75, /* 0xE4-0xE7 */ + 0xC8, 0xCE, 0xC4, 0x76, 0xC4, 0x77, 0xC4, 0x78, /* 0xE8-0xEB */ + 0xC4, 0x79, 0xC4, 0x7A, 0xC4, 0x81, 0xC4, 0x82, /* 0xEC-0xEF */ + 0xC8, 0xCF, 0xC4, 0x83, 0xC4, 0x84, 0xC4, 0x85, /* 0xF0-0xF3 */ + 0xC4, 0x86, 0xC8, 0xD0, 0xC4, 0x87, 0xC4, 0x88, /* 0xF4-0xF7 */ + 0xC4, 0x89, 0xC4, 0x8A, 0xC4, 0x8B, 0xC4, 0x8C, /* 0xF8-0xFB */ + 0xC8, 0xD1, 0xC8, 0xD2, 0xC4, 0x8D, 0xC4, 0x8E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_D7[512] = { + 0xC8, 0xD3, 0xC4, 0x8F, 0xC4, 0x90, 0xC4, 0x91, /* 0x00-0x03 */ + 0xC8, 0xD4, 0xC4, 0x92, 0xC4, 0x93, 0xC4, 0x94, /* 0x04-0x07 */ + 0xC4, 0x95, 0xC4, 0x96, 0xC4, 0x97, 0xC4, 0x98, /* 0x08-0x0B */ + 0xC4, 0x99, 0xC4, 0x9A, 0xC4, 0x9B, 0xC4, 0x9C, /* 0x0C-0x0F */ + 0xC4, 0x9D, 0xC8, 0xD5, 0xC4, 0x9E, 0xC4, 0x9F, /* 0x10-0x13 */ + 0xC4, 0xA0, 0xC5, 0x41, 0xC5, 0x42, 0xC5, 0x43, /* 0x14-0x17 */ + 0xC8, 0xD6, 0xC8, 0xD7, 0xC5, 0x44, 0xC5, 0x45, /* 0x18-0x1B */ + 0xC8, 0xD8, 0xC5, 0x46, 0xC5, 0x47, 0xC5, 0x48, /* 0x1C-0x1F */ + 0xC8, 0xD9, 0xC5, 0x49, 0xC5, 0x4A, 0xC5, 0x4B, /* 0x20-0x23 */ + 0xC5, 0x4C, 0xC5, 0x4D, 0xC5, 0x4E, 0xC5, 0x4F, /* 0x24-0x27 */ + 0xC8, 0xDA, 0xC8, 0xDB, 0xC5, 0x50, 0xC8, 0xDC, /* 0x28-0x2B */ + 0xC5, 0x51, 0xC8, 0xDD, 0xC5, 0x52, 0xC5, 0x53, /* 0x2C-0x2F */ + 0xC5, 0x54, 0xC5, 0x55, 0xC5, 0x56, 0xC5, 0x57, /* 0x30-0x33 */ + 0xC8, 0xDE, 0xC8, 0xDF, 0xC5, 0x58, 0xC5, 0x59, /* 0x34-0x37 */ + 0xC8, 0xE0, 0xC5, 0x5A, 0xC5, 0x61, 0xC5, 0x62, /* 0x38-0x3B */ + 0xC8, 0xE1, 0xC5, 0x63, 0xC5, 0x64, 0xC5, 0x65, /* 0x3C-0x3F */ + 0xC5, 0x66, 0xC5, 0x67, 0xC5, 0x68, 0xC5, 0x69, /* 0x40-0x43 */ + 0xC8, 
0xE2, 0xC5, 0x6A, 0xC5, 0x6B, 0xC8, 0xE3, /* 0x44-0x47 */ + 0xC5, 0x6C, 0xC8, 0xE4, 0xC5, 0x6D, 0xC5, 0x6E, /* 0x48-0x4B */ + 0xC5, 0x6F, 0xC5, 0x70, 0xC5, 0x71, 0xC5, 0x72, /* 0x4C-0x4F */ + 0xC8, 0xE5, 0xC8, 0xE6, 0xC5, 0x73, 0xC5, 0x74, /* 0x50-0x53 */ + 0xC8, 0xE7, 0xC5, 0x75, 0xC8, 0xE8, 0xC8, 0xE9, /* 0x54-0x57 */ + 0xC8, 0xEA, 0xC8, 0xEB, 0xC5, 0x76, 0xC5, 0x77, /* 0x58-0x5B */ + 0xC5, 0x78, 0xC5, 0x79, 0xC5, 0x7A, 0xC5, 0x81, /* 0x5C-0x5F */ + 0xC8, 0xEC, 0xC8, 0xED, 0xC5, 0x82, 0xC8, 0xEE, /* 0x60-0x63 */ + 0xC5, 0x83, 0xC8, 0xEF, 0xC5, 0x84, 0xC5, 0x85, /* 0x64-0x67 */ + 0xC5, 0x86, 0xC8, 0xF0, 0xC5, 0x87, 0xC5, 0x88, /* 0x68-0x6B */ + 0xC8, 0xF1, 0xC5, 0x89, 0xC5, 0x8A, 0xC5, 0x8B, /* 0x6C-0x6F */ + 0xC8, 0xF2, 0xC5, 0x8C, 0xC5, 0x8D, 0xC5, 0x8E, /* 0x70-0x73 */ + 0xC8, 0xF3, 0xC5, 0x8F, 0xC5, 0x90, 0xC5, 0x91, /* 0x74-0x77 */ + 0xC5, 0x92, 0xC5, 0x93, 0xC5, 0x94, 0xC5, 0x95, /* 0x78-0x7B */ + 0xC8, 0xF4, 0xC8, 0xF5, 0xC5, 0x96, 0xC5, 0x97, /* 0x7C-0x7F */ + + 0xC5, 0x98, 0xC8, 0xF6, 0xC5, 0x99, 0xC5, 0x9A, /* 0x80-0x83 */ + 0xC5, 0x9B, 0xC5, 0x9C, 0xC5, 0x9D, 0xC5, 0x9E, /* 0x84-0x87 */ + 0xC8, 0xF7, 0xC8, 0xF8, 0xC5, 0x9F, 0xC5, 0xA0, /* 0x88-0x8B */ + 0xC8, 0xF9, 0xC6, 0x41, 0xC6, 0x42, 0xC6, 0x43, /* 0x8C-0x8F */ + 0xC8, 0xFA, 0xC6, 0x44, 0xC6, 0x45, 0xC6, 0x46, /* 0x90-0x93 */ + 0xC6, 0x47, 0xC6, 0x48, 0xC6, 0x49, 0xC6, 0x4A, /* 0x94-0x97 */ + 0xC8, 0xFB, 0xC8, 0xFC, 0xC6, 0x4B, 0xC8, 0xFD, /* 0x98-0x9B */ + 0xC6, 0x4C, 0xC8, 0xFE, 0xC6, 0x4D, 0xC6, 0x4E, /* 0x9C-0x9F */ + 0xC6, 0x4F, 0xC6, 0x50, 0xC6, 0x51, 0xC6, 0x52, /* 0xA0-0xA3 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0xCB, 0xD0, 0xCB, 0xD6, 0xCB, 0xE7, 0xCD, 0xCF, /* 0x00-0x03 */ + 0xCD, 0xE8, 0xCE, 0xAD, 0xCF, 0xFB, 0xD0, 0xA2, /* 0x04-0x07 */ + 0xD0, 0xB8, 0xD0, 0xD0, 0xD0, 0xDD, 0xD1, 0xD4, /* 0x08-0x0B */ + 0xD1, 0xD5, 0xD1, 0xD8, 0xD1, 0xDB, 0xD1, 0xDC, /* 0x0C-0x0F */ + 0xD1, 0xDD, 0xD1, 0xDE, 0xD1, 0xDF, 0xD1, 0xE0, /* 0x10-0x13 */ + 0xD1, 0xE2, 0xD1, 0xE3, 0xD1, 0xE4, 0xD1, 0xE5, /* 0x14-0x17 */ + 0xD1, 0xE6, 0xD1, 0xE8, 0xD1, 0xE9, 0xD1, 0xEA, /* 0x18-0x1B */ + 0xD1, 0xEB, 0xD1, 0xED, 0xD1, 0xEF, 0xD1, 0xF0, /* 0x1C-0x1F */ + 0xD1, 0xF2, 0xD1, 0xF6, 0xD1, 0xFA, 0xD1, 0xFC, /* 0x20-0x23 */ + 0xD1, 0xFD, 0xD1, 0xFE, 0xD2, 0xA2, 0xD2, 0xA3, /* 0x24-0x27 */ + 0xD2, 0xA7, 0xD2, 0xA8, 0xD2, 0xA9, 0xD2, 0xAA, /* 0x28-0x2B */ + 0xD2, 0xAB, 0xD2, 0xAD, 0xD2, 0xB2, 0xD2, 0xBE, /* 0x2C-0x2F */ + 0xD2, 0xC2, 0xD2, 0xC3, 0xD2, 0xC4, 0xD2, 0xC6, /* 0x30-0x33 */ + 0xD2, 0xC7, 0xD2, 0xC8, 0xD2, 0xC9, 0xD2, 0xCA, /* 0x34-0x37 */ + 0xD2, 0xCB, 0xD2, 0xCD, 0xD2, 0xCE, 0xD2, 0xCF, /* 0x38-0x3B */ + 0xD2, 0xD0, 0xD2, 0xD1, 0xD2, 0xD2, 0xD2, 0xD3, /* 0x3C-0x3F */ + 0xD2, 0xD4, 0xD2, 0xD5, 0xD2, 0xD6, 0xD2, 0xD7, /* 0x40-0x43 */ + 0xD2, 0xD9, 0xD2, 0xDA, 0xD2, 0xDE, 0xD2, 0xDF, /* 0x44-0x47 */ + 0xD2, 0xE1, 0xD2, 0xE2, 0xD2, 0xE4, 0xD2, 0xE5, /* 0x48-0x4B */ + 0xD2, 0xE6, 0xD2, 0xE7, 0xD2, 0xE8, 0xD2, 0xE9, /* 0x4C-0x4F */ + 0xD2, 0xEA, 0xD2, 0xEB, 0xD2, 0xF0, 0xD2, 0xF1, /* 0x50-0x53 */ + 0xD2, 0xF2, 0xD2, 0xF3, 0xD2, 0xF4, 0xD2, 0xF5, /* 0x54-0x57 */ + 0xD2, 0xF7, 0xD2, 0xF8, 0xD4, 0xE6, 0xD4, 0xFC, /* 0x58-0x5B */ + 0xD5, 0xA5, 0xD5, 0xAB, 0xD5, 0xAE, 0xD6, 0xB8, /* 0x5C-0x5F */ + 0xD6, 0xCD, 0xD7, 0xCB, 0xD7, 0xE4, 0xDB, 0xC5, /* 0x60-0x63 */ + 0xDB, 0xE4, 0xDC, 0xA5, 0xDD, 0xA5, 0xDD, 0xD5, /* 0x64-0x67 */ + 0xDD, 0xF4, 0xDE, 0xFC, 0xDE, 0xFE, 0xDF, 0xB3, /* 0x68-0x6B */ + 0xDF, 0xE1, 0xDF, 0xE8, 
0xE0, 0xF1, 0xE1, 0xAD, /* 0x6C-0x6F */ + 0xE1, 0xED, 0xE3, 0xF5, 0xE4, 0xA1, 0xE4, 0xA9, /* 0x70-0x73 */ + 0xE5, 0xAE, 0xE5, 0xB1, 0xE5, 0xB2, 0xE5, 0xB9, /* 0x74-0x77 */ + 0xE5, 0xBB, 0xE5, 0xBC, 0xE5, 0xC4, 0xE5, 0xCE, /* 0x78-0x7B */ + 0xE5, 0xD0, 0xE5, 0xD2, 0xE5, 0xD6, 0xE5, 0xFA, /* 0x7C-0x7F */ + + 0xE5, 0xFB, 0xE5, 0xFC, 0xE5, 0xFE, 0xE6, 0xA1, /* 0x80-0x83 */ + 0xE6, 0xA4, 0xE6, 0xA7, 0xE6, 0xAD, 0xE6, 0xAF, /* 0x84-0x87 */ + 0xE6, 0xB0, 0xE6, 0xB1, 0xE6, 0xB3, 0xE6, 0xB7, /* 0x88-0x8B */ + 0xE6, 0xB8, 0xE6, 0xBC, 0xE6, 0xC4, 0xE6, 0xC6, /* 0x8C-0x8F */ + 0xE6, 0xC7, 0xE6, 0xCA, 0xE6, 0xD2, 0xE6, 0xD6, /* 0x90-0x93 */ + 0xE6, 0xD9, 0xE6, 0xDC, 0xE6, 0xDF, 0xE6, 0xE1, /* 0x94-0x97 */ + 0xE6, 0xE4, 0xE6, 0xE5, 0xE6, 0xE6, 0xE6, 0xE8, /* 0x98-0x9B */ + 0xE6, 0xEA, 0xE6, 0xEB, 0xE6, 0xEC, 0xE6, 0xEF, /* 0x9C-0x9F */ + 0xE6, 0xF1, 0xE6, 0xF2, 0xE6, 0xF5, 0xE6, 0xF6, /* 0xA0-0xA3 */ + 0xE6, 0xF7, 0xE6, 0xF9, 0xE7, 0xA1, 0xE7, 0xA6, /* 0xA4-0xA7 */ + 0xE7, 0xA9, 0xE7, 0xAA, 0xE7, 0xAC, 0xE7, 0xAD, /* 0xA8-0xAB */ + 0xE7, 0xB0, 0xE7, 0xBF, 0xE7, 0xC1, 0xE7, 0xC6, /* 0xAC-0xAF */ + 0xE7, 0xC7, 0xE7, 0xCB, 0xE7, 0xCD, 0xE7, 0xCF, /* 0xB0-0xB3 */ + 0xE7, 0xD0, 0xE7, 0xD3, 0xE7, 0xDF, 0xE7, 0xE4, /* 0xB4-0xB7 */ + 0xE7, 0xE6, 0xE7, 0xF7, 0xE8, 0xE7, 0xE8, 0xE8, /* 0xB8-0xBB */ + 0xE8, 0xF0, 0xE8, 0xF1, 0xE8, 0xF7, 0xE8, 0xF9, /* 0xBC-0xBF */ + 0xE8, 0xFB, 0xE8, 0xFE, 0xE9, 0xA7, 0xE9, 0xAC, /* 0xC0-0xC3 */ + 0xE9, 0xCC, 0xE9, 0xF7, 0xEA, 0xC1, 0xEA, 0xE5, /* 0xC4-0xC7 */ + 0xEA, 0xF4, 0xEA, 0xF7, 0xEA, 0xFC, 0xEA, 0xFE, /* 0xC8-0xCB */ + 0xEB, 0xA4, 0xEB, 0xA7, 0xEB, 0xA9, 0xEB, 0xAA, /* 0xCC-0xCF */ + 0xEB, 0xBA, 0xEB, 0xBB, 0xEB, 0xBD, 0xEB, 0xC1, /* 0xD0-0xD3 */ + 0xEB, 0xC2, 0xEB, 0xC6, 0xEB, 0xC7, 0xEB, 0xCC, /* 0xD4-0xD7 */ + 0xEB, 0xCF, 0xEB, 0xD0, 0xEB, 0xD1, 0xEB, 0xD2, /* 0xD8-0xDB */ + 0xEB, 0xD8, 0xEC, 0xA6, 0xEC, 0xA7, 0xEC, 0xAA, /* 0xDC-0xDF */ + 0xEC, 0xAF, 0xEC, 0xB0, 0xEC, 0xB1, 0xEC, 0xB2, /* 0xE0-0xE3 */ + 0xEC, 0xB5, 0xEC, 0xB8, 0xEC, 0xBA, 0xEC, 0xC0, /* 0xE4-0xE7 */ + 0xEC, 0xC1, 0xEC, 0xC5, 0xEC, 0xC6, 0xEC, 0xC9, /* 0xE8-0xEB */ + 0xEC, 0xCA, 0xEC, 0xD5, 0xEC, 0xDD, 0xEC, 0xDE, /* 0xEC-0xEF */ + 0xEC, 0xE1, 0xEC, 0xE4, 0xEC, 0xE7, 0xEC, 0xE8, /* 0xF0-0xF3 */ + 0xEC, 0xF7, 0xEC, 0xF8, 0xEC, 0xFA, 0xED, 0xA1, /* 0xF4-0xF7 */ + 0xED, 0xA2, 0xED, 0xA3, 0xED, 0xEE, 0xEE, 0xDB, /* 0xF8-0xFB */ + 0xF2, 0xBD, 0xF2, 0xFA, 0xF3, 0xB1, 0xF4, 0xA7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_FA[512] = { + 0xF4, 0xEE, 0xF6, 0xF4, 0xF6, 0xF6, 0xF7, 0xB8, /* 0x00-0x03 */ + 0xF7, 0xC8, 0xF7, 0xD3, 0xF8, 0xDB, 0xF8, 0xF0, /* 0x04-0x07 */ + 0xFA, 0xA1, 0xFA, 0xA2, 0xFA, 0xE6, 0xFC, 0xA9, /* 0x08-0x0B */ + 0xE8, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF5, 0xC0, 0x00, 0x00, 0xF4, 0xE7, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xFD, 0xEB, 0x00, 0x00, 0xEC, 0xCC, /* 0x14-0x17 */ + 0x00, 0x00, 0xE3, 0xEA, 0xDF, 0xD4, 0xDC, 0xD8, /* 0x18-0x1B */ + 0xEF, 0xFE, 0xEF, 0xF1, 0xE9, 0xE2, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0xB3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xEC, 0xEF, 0xD4, 0xB4, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xF9, 0xDE, 0xF8, /* 0x28-0x2B */ + 0xCE, 0xBD, 0xF9, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, /* 0x00-0x03 */ + 0xA3, 0xA4, 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, /* 0x04-0x07 */ + 0xA3, 0xA8, 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, /* 0x08-0x0B */ + 0xA3, 0xAC, 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 
0xAF, /* 0x0C-0x0F */ + 0xA3, 0xB0, 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, /* 0x10-0x13 */ + 0xA3, 0xB4, 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, /* 0x14-0x17 */ + 0xA3, 0xB8, 0xA3, 0xB9, 0xA3, 0xBA, 0xA3, 0xBB, /* 0x18-0x1B */ + 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBE, 0xA3, 0xBF, /* 0x1C-0x1F */ + 0xA3, 0xC0, 0xA3, 0xC1, 0xA3, 0xC2, 0xA3, 0xC3, /* 0x20-0x23 */ + 0xA3, 0xC4, 0xA3, 0xC5, 0xA3, 0xC6, 0xA3, 0xC7, /* 0x24-0x27 */ + 0xA3, 0xC8, 0xA3, 0xC9, 0xA3, 0xCA, 0xA3, 0xCB, /* 0x28-0x2B */ + 0xA3, 0xCC, 0xA3, 0xCD, 0xA3, 0xCE, 0xA3, 0xCF, /* 0x2C-0x2F */ + 0xA3, 0xD0, 0xA3, 0xD1, 0xA3, 0xD2, 0xA3, 0xD3, /* 0x30-0x33 */ + 0xA3, 0xD4, 0xA3, 0xD5, 0xA3, 0xD6, 0xA3, 0xD7, /* 0x34-0x37 */ + 0xA3, 0xD8, 0xA3, 0xD9, 0xA3, 0xDA, 0xA3, 0xDB, /* 0x38-0x3B */ + 0xA1, 0xAC, 0xA3, 0xDD, 0xA3, 0xDE, 0xA3, 0xDF, /* 0x3C-0x3F */ + 0xA3, 0xE0, 0xA3, 0xE1, 0xA3, 0xE2, 0xA3, 0xE3, /* 0x40-0x43 */ + 0xA3, 0xE4, 0xA3, 0xE5, 0xA3, 0xE6, 0xA3, 0xE7, /* 0x44-0x47 */ + 0xA3, 0xE8, 0xA3, 0xE9, 0xA3, 0xEA, 0xA3, 0xEB, /* 0x48-0x4B */ + 0xA3, 0xEC, 0xA3, 0xED, 0xA3, 0xEE, 0xA3, 0xEF, /* 0x4C-0x4F */ + 0xA3, 0xF0, 0xA3, 0xF1, 0xA3, 0xF2, 0xA3, 0xF3, /* 0x50-0x53 */ + 0xA3, 0xF4, 0xA3, 0xF5, 0xA3, 0xF6, 0xA3, 0xF7, /* 0x54-0x57 */ + 0xA3, 0xF8, 0xA3, 0xF9, 0xA3, 0xFA, 0xA3, 0xFB, /* 0x58-0x5B */ + 0xA3, 0xFC, 0xA3, 0xFD, 0xA2, 0xA6, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA4, 0xD4, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0xA0-0xA3 */ + 0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0xA4-0xA7 */ + 0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0xA8-0xAB */ + 0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0xAC-0xAF */ + 0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0xB0-0xB3 */ + 0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0xB4-0xB7 */ + 0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0xB8-0xBB */ + 0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xBF, 0xA4, 0xC0, /* 0xC0-0xC3 */ + 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, 0xA4, 0xC4, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xC5, 0xA4, 0xC6, /* 0xC8-0xCB */ + 0xA4, 0xC7, 0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xCB, 0xA4, 0xCC, /* 0xD0-0xD3 */ + 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, 0xA4, 0xD0, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xD1, 0xA4, 0xD2, /* 0xD8-0xDB */ + 0xA4, 0xD3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA1, 0xCB, 0xA1, 0xCC, 0xA1, 0xFE, 0xA3, 0xFE, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA1, 0xCD, 0xA3, 
0xDC, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + NULL, u2c_01, u2c_02, u2c_03, u2c_04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_11, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, u2c_31, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_AC, u2c_AD, u2c_AE, u2c_AF, + u2c_B0, u2c_B1, u2c_B2, u2c_B3, u2c_B4, u2c_B5, u2c_B6, u2c_B7, + u2c_B8, u2c_B9, u2c_BA, u2c_BB, u2c_BC, u2c_BD, u2c_BE, u2c_BF, + u2c_C0, u2c_C1, u2c_C2, u2c_C3, u2c_C4, u2c_C5, u2c_C6, u2c_C7, + u2c_C8, u2c_C9, u2c_CA, u2c_CB, u2c_CC, u2c_CD, u2c_CE, u2c_CF, + u2c_D0, u2c_D1, u2c_D2, u2c_D3, u2c_D4, u2c_D5, u2c_D6, u2c_D7, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, NULL, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 
/* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen <= 1) + return -ENAMETOOLONG; + out[0] = uni2charset[cl*2]; + out[1] = uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + n = 2; + } else if (ch==0 && cl) { + out[0] = cl; + n = 1; + } + else + return -EINVAL; + + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + int n; + + if (boundlen <= 0) + return 
-ENAMETOOLONG; + + if (boundlen == 1) { + *uni = rawstring[0]; + return 1; + } + + ch = rawstring[0]; + cl = rawstring[1]; + + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + n = 2; + } else { + *uni = ch; + n = 1; + } + return n; +} + +static struct nls_table table = { + .charset = "cp949", + .alias = "euc-kr", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp949(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp949(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp949) +module_exit(exit_nls_cp949) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(euc-kr); diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c new file mode 100644 index 00000000..ef253682 --- /dev/null +++ b/fs/nls/nls_cp950.c @@ -0,0 +1,9483 @@ +/* + * linux/fs/nls/nls_cp950.c + * + * Charset cp950 translation tables. + * This translation table was generated automatically; the + * original table can be downloaded from the Microsoft website. + * (http://www.microsoft.com/typography/unicode/unicodecp.htm) + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/nls.h> +#include <linux/errno.h> + +static const wchar_t c2u_A1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x3000,0xFF0C,0x3001,0x3002,0xFF0E,0x2027,0xFF1B,0xFF1A,/* 0x40-0x47 */ + 0xFF1F,0xFF01,0xFE30,0x2026,0x2025,0xFE50,0xFE51,0xFE52,/* 0x48-0x4F */ + 0x00B7,0xFE54,0xFE55,0xFE56,0xFE57,0xFF5C,0x2013,0xFE31,/* 0x50-0x57 */ + 0x2014,0xFE33,0x2574,0xFE34,0xFE4F,0xFF08,0xFF09,0xFE35,/* 0x58-0x5F */ + 0xFE36,0xFF5B,0xFF5D,0xFE37,0xFE38,0x3014,0x3015,0xFE39,/* 0x60-0x67 */ + 0xFE3A,0x3010,0x3011,0xFE3B,0xFE3C,0x300A,0x300B,0xFE3D,/* 0x68-0x6F */ + 0xFE3E,0x3008,0x3009,0xFE3F,0xFE40,0x300C,0x300D,0xFE41,/* 0x70-0x77 */ + 0xFE42,0x300E,0x300F,0xFE43,0xFE44,0xFE59,0xFE5A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0xFE5B,0xFE5C,0xFE5D,0xFE5E,0x2018,0x2019,0x201C,/* 0xA0-0xA7 */ + 0x201D,0x301D,0x301E,0x2035,0x2032,0xFF03,0xFF06,0xFF0A,/* 0xA8-0xAF */ + 0x203B,0x00A7,0x3003,0x25CB,0x25CF,0x25B3,0x25B2,0x25CE,/* 0xB0-0xB7 */ + 0x2606,0x2605,0x25C7,0x25C6,0x25A1,0x25A0,0x25BD,0x25BC,/* 0xB8-0xBF */ + 0x32A3,0x2105,0x00AF,0xFFE3,0xFF3F,0x02CD,0xFE49,0xFE4A,/* 0xC0-0xC7 */ + 0xFE4D,0xFE4E,0xFE4B,0xFE4C,0xFE5F,0xFE60,0xFE61,0xFF0B,/* 0xC8-0xCF */ + 0xFF0D,0x00D7,0x00F7,0x00B1,0x221A,0xFF1C,0xFF1E,0xFF1D,/* 0xD0-0xD7 */ + 0x2266,0x2267,0x2260,0x221E,0x2252,0x2261,0xFE62,0xFE63,/* 0xD8-0xDF */ + 0xFE64,0xFE65,0xFE66,0xFF5E,0x2229,0x222A,0x22A5,0x2220,/* 0xE0-0xE7 */ + 0x221F,0x22BF,0x33D2,0x33D1,0x222B,0x222E,0x2235,0x2234,/* 0xE8-0xEF */
+ 0x2640,0x2642,0x2295,0x2299,0x2191,0x2193,0x2190,0x2192,/* 0xF0-0xF7 */ + 0x2196,0x2197,0x2199,0x2198,0x2225,0x2223,0xFF0F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0xFF3C,0x2215,0xFE68,0xFF04,0xFFE5,0x3012,0xFFE0,0xFFE1,/* 0x40-0x47 */ + 0xFF05,0xFF20,0x2103,0x2109,0xFE69,0xFE6A,0xFE6B,0x33D5,/* 0x48-0x4F */ + 0x339C,0x339D,0x339E,0x33CE,0x33A1,0x338E,0x338F,0x33C4,/* 0x50-0x57 */ + 0x00B0,0x5159,0x515B,0x515E,0x515D,0x5161,0x5163,0x55E7,/* 0x58-0x5F */ + 0x74E9,0x7CCE,0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,/* 0x60-0x67 */ + 0x2587,0x2588,0x258F,0x258E,0x258D,0x258C,0x258B,0x258A,/* 0x68-0x6F */ + 0x2589,0x253C,0x2534,0x252C,0x2524,0x251C,0x2594,0x2500,/* 0x70-0x77 */ + 0x2502,0x2595,0x250C,0x2510,0x2514,0x2518,0x256D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x256E,0x2570,0x256F,0x2550,0x255E,0x256A,0x2561,/* 0xA0-0xA7 */ + 0x25E2,0x25E3,0x25E5,0x25E4,0x2571,0x2572,0x2573,0xFF10,/* 0xA8-0xAF */ + 0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,0xFF18,/* 0xB0-0xB7 */ + 0xFF19,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,/* 0xB8-0xBF */ + 0x2167,0x2168,0x2169,0x3021,0x3022,0x3023,0x3024,0x3025,/* 0xC0-0xC7 */ + 0x3026,0x3027,0x3028,0x3029,0x5341,0x5344,0x5345,0xFF21,/* 0xC8-0xCF */ + 0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,0xFF28,0xFF29,/* 0xD0-0xD7 */ + 0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,0xFF30,0xFF31,/* 0xD8-0xDF */ + 0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,0xFF38,0xFF39,/* 0xE0-0xE7 */ + 0xFF3A,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE8-0xEF */ + 0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xF0-0xF7 */ + 0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0xFF57,0xFF58,0xFF59,0xFF5A,0x0391,0x0392,0x0393,0x0394,/* 0x40-0x47 */ + 0x0395,0x0396,0x0397,0x0398,0x0399,0x039A,0x039B,0x039C,/* 0x48-0x4F */ + 0x039D,0x039E,0x039F,0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,/* 0x50-0x57 */ + 0x03A6,0x03A7,0x03A8,0x03A9,0x03B1,0x03B2,0x03B3,0x03B4,/* 0x58-0x5F */ + 0x03B5,0x03B6,0x03B7,0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,/* 
0x60-0x67 */ + 0x03BD,0x03BE,0x03BF,0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,/* 0x68-0x6F */ + 0x03C6,0x03C7,0x03C8,0x03C9,0x3105,0x3106,0x3107,0x3108,/* 0x70-0x77 */ + 0x3109,0x310A,0x310B,0x310C,0x310D,0x310E,0x310F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,/* 0xA0-0xA7 */ + 0x3117,0x3118,0x3119,0x311A,0x311B,0x311C,0x311D,0x311E,/* 0xA8-0xAF */ + 0x311F,0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,/* 0xB0-0xB7 */ + 0x3127,0x3128,0x3129,0x02D9,0x02C9,0x02CA,0x02C7,0x02CB,/* 0xB8-0xBF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD0-0xD7 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */ + 0x0000,0x20AC,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE0-0xE7 */ +}; + +static const wchar_t c2u_A4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E00,0x4E59,0x4E01,0x4E03,0x4E43,0x4E5D,0x4E86,0x4E8C,/* 0x40-0x47 */ + 0x4EBA,0x513F,0x5165,0x516B,0x51E0,0x5200,0x5201,0x529B,/* 0x48-0x4F */ + 0x5315,0x5341,0x535C,0x53C8,0x4E09,0x4E0B,0x4E08,0x4E0A,/* 0x50-0x57 */ + 0x4E2B,0x4E38,0x51E1,0x4E45,0x4E48,0x4E5F,0x4E5E,0x4E8E,/* 0x58-0x5F */ + 0x4EA1,0x5140,0x5203,0x52FA,0x5343,0x53C9,0x53E3,0x571F,/* 0x60-0x67 */ + 0x58EB,0x5915,0x5927,0x5973,0x5B50,0x5B51,0x5B53,0x5BF8,/* 0x68-0x6F */ + 0x5C0F,0x5C22,0x5C38,0x5C71,0x5DDD,0x5DE5,0x5DF1,0x5DF2,/* 0x70-0x77 */ + 0x5DF3,0x5DFE,0x5E72,0x5EFE,0x5F0B,0x5F13,0x624D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x4E11,0x4E10,0x4E0D,0x4E2D,0x4E30,0x4E39,0x4E4B,/* 0xA0-0xA7 */ + 0x5C39,0x4E88,0x4E91,0x4E95,0x4E92,0x4E94,0x4EA2,0x4EC1,/* 0xA8-0xAF */ + 0x4EC0,0x4EC3,0x4EC6,0x4EC7,0x4ECD,0x4ECA,0x4ECB,0x4EC4,/* 0xB0-0xB7 */ + 0x5143,0x5141,0x5167,0x516D,0x516E,0x516C,0x5197,0x51F6,/* 0xB8-0xBF */ + 0x5206,0x5207,0x5208,0x52FB,0x52FE,0x52FF,0x5316,0x5339,/* 0xC0-0xC7 */ + 0x5348,0x5347,0x5345,0x535E,0x5384,0x53CB,0x53CA,0x53CD,/* 0xC8-0xCF */ + 0x58EC,0x5929,0x592B,0x592A,0x592D,0x5B54,0x5C11,0x5C24,/* 0xD0-0xD7 */ + 0x5C3A,0x5C6F,0x5DF4,0x5E7B,0x5EFF,0x5F14,0x5F15,0x5FC3,/* 0xD8-0xDF */ + 0x6208,0x6236,0x624B,0x624E,0x652F,0x6587,0x6597,0x65A4,/* 0xE0-0xE7 */ + 0x65B9,0x65E5,0x66F0,0x6708,0x6728,0x6B20,0x6B62,0x6B79,/* 0xE8-0xEF */ + 0x6BCB,0x6BD4,0x6BDB,0x6C0F,0x6C34,0x706B,0x722A,0x7236,/* 0xF0-0xF7 */ + 
0x723B,0x7247,0x7259,0x725B,0x72AC,0x738B,0x4E19,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E16,0x4E15,0x4E14,0x4E18,0x4E3B,0x4E4D,0x4E4F,0x4E4E,/* 0x40-0x47 */ + 0x4EE5,0x4ED8,0x4ED4,0x4ED5,0x4ED6,0x4ED7,0x4EE3,0x4EE4,/* 0x48-0x4F */ + 0x4ED9,0x4EDE,0x5145,0x5144,0x5189,0x518A,0x51AC,0x51F9,/* 0x50-0x57 */ + 0x51FA,0x51F8,0x520A,0x52A0,0x529F,0x5305,0x5306,0x5317,/* 0x58-0x5F */ + 0x531D,0x4EDF,0x534A,0x5349,0x5361,0x5360,0x536F,0x536E,/* 0x60-0x67 */ + 0x53BB,0x53EF,0x53E4,0x53F3,0x53EC,0x53EE,0x53E9,0x53E8,/* 0x68-0x6F */ + 0x53FC,0x53F8,0x53F5,0x53EB,0x53E6,0x53EA,0x53F2,0x53F1,/* 0x70-0x77 */ + 0x53F0,0x53E5,0x53ED,0x53FB,0x56DB,0x56DA,0x5916,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x592E,0x5931,0x5974,0x5976,0x5B55,0x5B83,0x5C3C,/* 0xA0-0xA7 */ + 0x5DE8,0x5DE7,0x5DE6,0x5E02,0x5E03,0x5E73,0x5E7C,0x5F01,/* 0xA8-0xAF */ + 0x5F18,0x5F17,0x5FC5,0x620A,0x6253,0x6254,0x6252,0x6251,/* 0xB0-0xB7 */ + 0x65A5,0x65E6,0x672E,0x672C,0x672A,0x672B,0x672D,0x6B63,/* 0xB8-0xBF */ + 0x6BCD,0x6C11,0x6C10,0x6C38,0x6C41,0x6C40,0x6C3E,0x72AF,/* 0xC0-0xC7 */ + 0x7384,0x7389,0x74DC,0x74E6,0x7518,0x751F,0x7528,0x7529,/* 0xC8-0xCF */ + 0x7530,0x7531,0x7532,0x7533,0x758B,0x767D,0x76AE,0x76BF,/* 0xD0-0xD7 */ + 0x76EE,0x77DB,0x77E2,0x77F3,0x793A,0x79BE,0x7A74,0x7ACB,/* 0xD8-0xDF */ + 0x4E1E,0x4E1F,0x4E52,0x4E53,0x4E69,0x4E99,0x4EA4,0x4EA6,/* 0xE0-0xE7 */ + 0x4EA5,0x4EFF,0x4F09,0x4F19,0x4F0A,0x4F15,0x4F0D,0x4F10,/* 0xE8-0xEF */ + 0x4F11,0x4F0F,0x4EF2,0x4EF6,0x4EFB,0x4EF0,0x4EF3,0x4EFD,/* 0xF0-0xF7 */ + 0x4F01,0x4F0B,0x5149,0x5147,0x5146,0x5148,0x5168,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5171,0x518D,0x51B0,0x5217,0x5211,0x5212,0x520E,0x5216,/* 0x40-0x47 */ + 0x52A3,0x5308,0x5321,0x5320,0x5370,0x5371,0x5409,0x540F,/* 0x48-0x4F */ + 0x540C,0x540A,0x5410,0x5401,0x540B,0x5404,0x5411,0x540D,/* 0x50-0x57 */ + 0x5408,0x5403,0x540E,0x5406,0x5412,0x56E0,0x56DE,0x56DD,/* 0x58-0x5F */ + 0x5733,0x5730,0x5728,0x572D,0x572C,0x572F,0x5729,0x5919,/* 0x60-0x67 */ + 0x591A,0x5937,0x5938,0x5984,0x5978,0x5983,0x597D,0x5979,/* 
0x68-0x6F */ + 0x5982,0x5981,0x5B57,0x5B58,0x5B87,0x5B88,0x5B85,0x5B89,/* 0x70-0x77 */ + 0x5BFA,0x5C16,0x5C79,0x5DDE,0x5E06,0x5E76,0x5E74,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5F0F,0x5F1B,0x5FD9,0x5FD6,0x620E,0x620C,0x620D,/* 0xA0-0xA7 */ + 0x6210,0x6263,0x625B,0x6258,0x6536,0x65E9,0x65E8,0x65EC,/* 0xA8-0xAF */ + 0x65ED,0x66F2,0x66F3,0x6709,0x673D,0x6734,0x6731,0x6735,/* 0xB0-0xB7 */ + 0x6B21,0x6B64,0x6B7B,0x6C16,0x6C5D,0x6C57,0x6C59,0x6C5F,/* 0xB8-0xBF */ + 0x6C60,0x6C50,0x6C55,0x6C61,0x6C5B,0x6C4D,0x6C4E,0x7070,/* 0xC0-0xC7 */ + 0x725F,0x725D,0x767E,0x7AF9,0x7C73,0x7CF8,0x7F36,0x7F8A,/* 0xC8-0xCF */ + 0x7FBD,0x8001,0x8003,0x800C,0x8012,0x8033,0x807F,0x8089,/* 0xD0-0xD7 */ + 0x808B,0x808C,0x81E3,0x81EA,0x81F3,0x81FC,0x820C,0x821B,/* 0xD8-0xDF */ + 0x821F,0x826E,0x8272,0x827E,0x866B,0x8840,0x884C,0x8863,/* 0xE0-0xE7 */ + 0x897F,0x9621,0x4E32,0x4EA8,0x4F4D,0x4F4F,0x4F47,0x4F57,/* 0xE8-0xEF */ + 0x4F5E,0x4F34,0x4F5B,0x4F55,0x4F30,0x4F50,0x4F51,0x4F3D,/* 0xF0-0xF7 */ + 0x4F3A,0x4F38,0x4F43,0x4F54,0x4F3C,0x4F46,0x4F63,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4F5C,0x4F60,0x4F2F,0x4F4E,0x4F36,0x4F59,0x4F5D,0x4F48,/* 0x40-0x47 */ + 0x4F5A,0x514C,0x514B,0x514D,0x5175,0x51B6,0x51B7,0x5225,/* 0x48-0x4F */ + 0x5224,0x5229,0x522A,0x5228,0x52AB,0x52A9,0x52AA,0x52AC,/* 0x50-0x57 */ + 0x5323,0x5373,0x5375,0x541D,0x542D,0x541E,0x543E,0x5426,/* 0x58-0x5F */ + 0x544E,0x5427,0x5446,0x5443,0x5433,0x5448,0x5442,0x541B,/* 0x60-0x67 */ + 0x5429,0x544A,0x5439,0x543B,0x5438,0x542E,0x5435,0x5436,/* 0x68-0x6F */ + 0x5420,0x543C,0x5440,0x5431,0x542B,0x541F,0x542C,0x56EA,/* 0x70-0x77 */ + 0x56F0,0x56E4,0x56EB,0x574A,0x5751,0x5740,0x574D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5747,0x574E,0x573E,0x5750,0x574F,0x573B,0x58EF,/* 0xA0-0xA7 */ + 0x593E,0x599D,0x5992,0x59A8,0x599E,0x59A3,0x5999,0x5996,/* 0xA8-0xAF */ + 0x598D,0x59A4,0x5993,0x598A,0x59A5,0x5B5D,0x5B5C,0x5B5A,/* 0xB0-0xB7 */ + 0x5B5B,0x5B8C,0x5B8B,0x5B8F,0x5C2C,0x5C40,0x5C41,0x5C3F,/* 0xB8-0xBF */ + 0x5C3E,0x5C90,0x5C91,0x5C94,0x5C8C,0x5DEB,0x5E0C,0x5E8F,/* 0xC0-0xC7 */ + 0x5E87,0x5E8A,0x5EF7,0x5F04,0x5F1F,0x5F64,0x5F62,0x5F77,/* 0xC8-0xCF */ + 0x5F79,0x5FD8,0x5FCC,0x5FD7,0x5FCD,0x5FF1,0x5FEB,0x5FF8,/* 0xD0-0xD7 */ + 0x5FEA,0x6212,0x6211,0x6284,0x6297,0x6296,0x6280,0x6276,/* 0xD8-0xDF */ + 0x6289,0x626D,0x628A,0x627C,0x627E,0x6279,0x6273,0x6292,/* 0xE0-0xE7 */ + 
0x626F,0x6298,0x626E,0x6295,0x6293,0x6291,0x6286,0x6539,/* 0xE8-0xEF */ + 0x653B,0x6538,0x65F1,0x66F4,0x675F,0x674E,0x674F,0x6750,/* 0xF0-0xF7 */ + 0x6751,0x675C,0x6756,0x675E,0x6749,0x6746,0x6760,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6753,0x6757,0x6B65,0x6BCF,0x6C42,0x6C5E,0x6C99,0x6C81,/* 0x40-0x47 */ + 0x6C88,0x6C89,0x6C85,0x6C9B,0x6C6A,0x6C7A,0x6C90,0x6C70,/* 0x48-0x4F */ + 0x6C8C,0x6C68,0x6C96,0x6C92,0x6C7D,0x6C83,0x6C72,0x6C7E,/* 0x50-0x57 */ + 0x6C74,0x6C86,0x6C76,0x6C8D,0x6C94,0x6C98,0x6C82,0x7076,/* 0x58-0x5F */ + 0x707C,0x707D,0x7078,0x7262,0x7261,0x7260,0x72C4,0x72C2,/* 0x60-0x67 */ + 0x7396,0x752C,0x752B,0x7537,0x7538,0x7682,0x76EF,0x77E3,/* 0x68-0x6F */ + 0x79C1,0x79C0,0x79BF,0x7A76,0x7CFB,0x7F55,0x8096,0x8093,/* 0x70-0x77 */ + 0x809D,0x8098,0x809B,0x809A,0x80B2,0x826F,0x8292,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x828B,0x828D,0x898B,0x89D2,0x8A00,0x8C37,0x8C46,/* 0xA0-0xA7 */ + 0x8C55,0x8C9D,0x8D64,0x8D70,0x8DB3,0x8EAB,0x8ECA,0x8F9B,/* 0xA8-0xAF */ + 0x8FB0,0x8FC2,0x8FC6,0x8FC5,0x8FC4,0x5DE1,0x9091,0x90A2,/* 0xB0-0xB7 */ + 0x90AA,0x90A6,0x90A3,0x9149,0x91C6,0x91CC,0x9632,0x962E,/* 0xB8-0xBF */ + 0x9631,0x962A,0x962C,0x4E26,0x4E56,0x4E73,0x4E8B,0x4E9B,/* 0xC0-0xC7 */ + 0x4E9E,0x4EAB,0x4EAC,0x4F6F,0x4F9D,0x4F8D,0x4F73,0x4F7F,/* 0xC8-0xCF */ + 0x4F6C,0x4F9B,0x4F8B,0x4F86,0x4F83,0x4F70,0x4F75,0x4F88,/* 0xD0-0xD7 */ + 0x4F69,0x4F7B,0x4F96,0x4F7E,0x4F8F,0x4F91,0x4F7A,0x5154,/* 0xD8-0xDF */ + 0x5152,0x5155,0x5169,0x5177,0x5176,0x5178,0x51BD,0x51FD,/* 0xE0-0xE7 */ + 0x523B,0x5238,0x5237,0x523A,0x5230,0x522E,0x5236,0x5241,/* 0xE8-0xEF */ + 0x52BE,0x52BB,0x5352,0x5354,0x5353,0x5351,0x5366,0x5377,/* 0xF0-0xF7 */ + 0x5378,0x5379,0x53D6,0x53D4,0x53D7,0x5473,0x5475,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_A9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5496,0x5478,0x5495,0x5480,0x547B,0x5477,0x5484,0x5492,/* 0x40-0x47 */ + 0x5486,0x547C,0x5490,0x5471,0x5476,0x548C,0x549A,0x5462,/* 0x48-0x4F */ + 0x5468,0x548B,0x547D,0x548E,0x56FA,0x5783,0x5777,0x576A,/* 0x50-0x57 */ + 0x5769,0x5761,0x5766,0x5764,0x577C,0x591C,0x5949,0x5947,/* 
0x58-0x5F */ + 0x5948,0x5944,0x5954,0x59BE,0x59BB,0x59D4,0x59B9,0x59AE,/* 0x60-0x67 */ + 0x59D1,0x59C6,0x59D0,0x59CD,0x59CB,0x59D3,0x59CA,0x59AF,/* 0x68-0x6F */ + 0x59B3,0x59D2,0x59C5,0x5B5F,0x5B64,0x5B63,0x5B97,0x5B9A,/* 0x70-0x77 */ + 0x5B98,0x5B9C,0x5B99,0x5B9B,0x5C1A,0x5C48,0x5C45,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5C46,0x5CB7,0x5CA1,0x5CB8,0x5CA9,0x5CAB,0x5CB1,/* 0xA0-0xA7 */ + 0x5CB3,0x5E18,0x5E1A,0x5E16,0x5E15,0x5E1B,0x5E11,0x5E78,/* 0xA8-0xAF */ + 0x5E9A,0x5E97,0x5E9C,0x5E95,0x5E96,0x5EF6,0x5F26,0x5F27,/* 0xB0-0xB7 */ + 0x5F29,0x5F80,0x5F81,0x5F7F,0x5F7C,0x5FDD,0x5FE0,0x5FFD,/* 0xB8-0xBF */ + 0x5FF5,0x5FFF,0x600F,0x6014,0x602F,0x6035,0x6016,0x602A,/* 0xC0-0xC7 */ + 0x6015,0x6021,0x6027,0x6029,0x602B,0x601B,0x6216,0x6215,/* 0xC8-0xCF */ + 0x623F,0x623E,0x6240,0x627F,0x62C9,0x62CC,0x62C4,0x62BF,/* 0xD0-0xD7 */ + 0x62C2,0x62B9,0x62D2,0x62DB,0x62AB,0x62D3,0x62D4,0x62CB,/* 0xD8-0xDF */ + 0x62C8,0x62A8,0x62BD,0x62BC,0x62D0,0x62D9,0x62C7,0x62CD,/* 0xE0-0xE7 */ + 0x62B5,0x62DA,0x62B1,0x62D8,0x62D6,0x62D7,0x62C6,0x62AC,/* 0xE8-0xEF */ + 0x62CE,0x653E,0x65A7,0x65BC,0x65FA,0x6614,0x6613,0x660C,/* 0xF0-0xF7 */ + 0x6606,0x6602,0x660E,0x6600,0x660F,0x6615,0x660A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6607,0x670D,0x670B,0x676D,0x678B,0x6795,0x6771,0x679C,/* 0x40-0x47 */ + 0x6773,0x6777,0x6787,0x679D,0x6797,0x676F,0x6770,0x677F,/* 0x48-0x4F */ + 0x6789,0x677E,0x6790,0x6775,0x679A,0x6793,0x677C,0x676A,/* 0x50-0x57 */ + 0x6772,0x6B23,0x6B66,0x6B67,0x6B7F,0x6C13,0x6C1B,0x6CE3,/* 0x58-0x5F */ + 0x6CE8,0x6CF3,0x6CB1,0x6CCC,0x6CE5,0x6CB3,0x6CBD,0x6CBE,/* 0x60-0x67 */ + 0x6CBC,0x6CE2,0x6CAB,0x6CD5,0x6CD3,0x6CB8,0x6CC4,0x6CB9,/* 0x68-0x6F */ + 0x6CC1,0x6CAE,0x6CD7,0x6CC5,0x6CF1,0x6CBF,0x6CBB,0x6CE1,/* 0x70-0x77 */ + 0x6CDB,0x6CCA,0x6CAC,0x6CEF,0x6CDC,0x6CD6,0x6CE0,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7095,0x708E,0x7092,0x708A,0x7099,0x722C,0x722D,/* 0xA0-0xA7 */ + 0x7238,0x7248,0x7267,0x7269,0x72C0,0x72CE,0x72D9,0x72D7,/* 0xA8-0xAF */ + 0x72D0,0x73A9,0x73A8,0x739F,0x73AB,0x73A5,0x753D,0x759D,/* 0xB0-0xB7 */ + 0x7599,0x759A,0x7684,0x76C2,0x76F2,0x76F4,0x77E5,0x77FD,/* 0xB8-0xBF */ + 0x793E,0x7940,0x7941,0x79C9,0x79C8,0x7A7A,0x7A79,0x7AFA,/* 0xC0-0xC7 */ + 0x7CFE,0x7F54,0x7F8C,0x7F8B,0x8005,0x80BA,0x80A5,0x80A2,/* 0xC8-0xCF */ + 0x80B1,0x80A1,0x80AB,0x80A9,0x80B4,0x80AA,0x80AF,0x81E5,/* 0xD0-0xD7 */ + 
0x81FE,0x820D,0x82B3,0x829D,0x8299,0x82AD,0x82BD,0x829F,/* 0xD8-0xDF */ + 0x82B9,0x82B1,0x82AC,0x82A5,0x82AF,0x82B8,0x82A3,0x82B0,/* 0xE0-0xE7 */ + 0x82BE,0x82B7,0x864E,0x8671,0x521D,0x8868,0x8ECB,0x8FCE,/* 0xE8-0xEF */ + 0x8FD4,0x8FD1,0x90B5,0x90B8,0x90B1,0x90B6,0x91C7,0x91D1,/* 0xF0-0xF7 */ + 0x9577,0x9580,0x961C,0x9640,0x963F,0x963B,0x9644,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9642,0x96B9,0x96E8,0x9752,0x975E,0x4E9F,0x4EAD,0x4EAE,/* 0x40-0x47 */ + 0x4FE1,0x4FB5,0x4FAF,0x4FBF,0x4FE0,0x4FD1,0x4FCF,0x4FDD,/* 0x48-0x4F */ + 0x4FC3,0x4FB6,0x4FD8,0x4FDF,0x4FCA,0x4FD7,0x4FAE,0x4FD0,/* 0x50-0x57 */ + 0x4FC4,0x4FC2,0x4FDA,0x4FCE,0x4FDE,0x4FB7,0x5157,0x5192,/* 0x58-0x5F */ + 0x5191,0x51A0,0x524E,0x5243,0x524A,0x524D,0x524C,0x524B,/* 0x60-0x67 */ + 0x5247,0x52C7,0x52C9,0x52C3,0x52C1,0x530D,0x5357,0x537B,/* 0x68-0x6F */ + 0x539A,0x53DB,0x54AC,0x54C0,0x54A8,0x54CE,0x54C9,0x54B8,/* 0x70-0x77 */ + 0x54A6,0x54B3,0x54C7,0x54C2,0x54BD,0x54AA,0x54C1,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x54C4,0x54C8,0x54AF,0x54AB,0x54B1,0x54BB,0x54A9,/* 0xA0-0xA7 */ + 0x54A7,0x54BF,0x56FF,0x5782,0x578B,0x57A0,0x57A3,0x57A2,/* 0xA8-0xAF */ + 0x57CE,0x57AE,0x5793,0x5955,0x5951,0x594F,0x594E,0x5950,/* 0xB0-0xB7 */ + 0x59DC,0x59D8,0x59FF,0x59E3,0x59E8,0x5A03,0x59E5,0x59EA,/* 0xB8-0xBF */ + 0x59DA,0x59E6,0x5A01,0x59FB,0x5B69,0x5BA3,0x5BA6,0x5BA4,/* 0xC0-0xC7 */ + 0x5BA2,0x5BA5,0x5C01,0x5C4E,0x5C4F,0x5C4D,0x5C4B,0x5CD9,/* 0xC8-0xCF */ + 0x5CD2,0x5DF7,0x5E1D,0x5E25,0x5E1F,0x5E7D,0x5EA0,0x5EA6,/* 0xD0-0xD7 */ + 0x5EFA,0x5F08,0x5F2D,0x5F65,0x5F88,0x5F85,0x5F8A,0x5F8B,/* 0xD8-0xDF */ + 0x5F87,0x5F8C,0x5F89,0x6012,0x601D,0x6020,0x6025,0x600E,/* 0xE0-0xE7 */ + 0x6028,0x604D,0x6070,0x6068,0x6062,0x6046,0x6043,0x606C,/* 0xE8-0xEF */ + 0x606B,0x606A,0x6064,0x6241,0x62DC,0x6316,0x6309,0x62FC,/* 0xF0-0xF7 */ + 0x62ED,0x6301,0x62EE,0x62FD,0x6307,0x62F1,0x62F7,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x62EF,0x62EC,0x62FE,0x62F4,0x6311,0x6302,0x653F,0x6545,/* 0x40-0x47 */ + 0x65AB,0x65BD,0x65E2,0x6625,0x662D,0x6620,0x6627,0x662F,/* 
0x48-0x4F */ + 0x661F,0x6628,0x6631,0x6624,0x66F7,0x67FF,0x67D3,0x67F1,/* 0x50-0x57 */ + 0x67D4,0x67D0,0x67EC,0x67B6,0x67AF,0x67F5,0x67E9,0x67EF,/* 0x58-0x5F */ + 0x67C4,0x67D1,0x67B4,0x67DA,0x67E5,0x67B8,0x67CF,0x67DE,/* 0x60-0x67 */ + 0x67F3,0x67B0,0x67D9,0x67E2,0x67DD,0x67D2,0x6B6A,0x6B83,/* 0x68-0x6F */ + 0x6B86,0x6BB5,0x6BD2,0x6BD7,0x6C1F,0x6CC9,0x6D0B,0x6D32,/* 0x70-0x77 */ + 0x6D2A,0x6D41,0x6D25,0x6D0C,0x6D31,0x6D1E,0x6D17,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6D3B,0x6D3D,0x6D3E,0x6D36,0x6D1B,0x6CF5,0x6D39,/* 0xA0-0xA7 */ + 0x6D27,0x6D38,0x6D29,0x6D2E,0x6D35,0x6D0E,0x6D2B,0x70AB,/* 0xA8-0xAF */ + 0x70BA,0x70B3,0x70AC,0x70AF,0x70AD,0x70B8,0x70AE,0x70A4,/* 0xB0-0xB7 */ + 0x7230,0x7272,0x726F,0x7274,0x72E9,0x72E0,0x72E1,0x73B7,/* 0xB8-0xBF */ + 0x73CA,0x73BB,0x73B2,0x73CD,0x73C0,0x73B3,0x751A,0x752D,/* 0xC0-0xC7 */ + 0x754F,0x754C,0x754E,0x754B,0x75AB,0x75A4,0x75A5,0x75A2,/* 0xC8-0xCF */ + 0x75A3,0x7678,0x7686,0x7687,0x7688,0x76C8,0x76C6,0x76C3,/* 0xD0-0xD7 */ + 0x76C5,0x7701,0x76F9,0x76F8,0x7709,0x770B,0x76FE,0x76FC,/* 0xD8-0xDF */ + 0x7707,0x77DC,0x7802,0x7814,0x780C,0x780D,0x7946,0x7949,/* 0xE0-0xE7 */ + 0x7948,0x7947,0x79B9,0x79BA,0x79D1,0x79D2,0x79CB,0x7A7F,/* 0xE8-0xEF */ + 0x7A81,0x7AFF,0x7AFD,0x7C7D,0x7D02,0x7D05,0x7D00,0x7D09,/* 0xF0-0xF7 */ + 0x7D07,0x7D04,0x7D06,0x7F38,0x7F8E,0x7FBF,0x8004,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8010,0x800D,0x8011,0x8036,0x80D6,0x80E5,0x80DA,0x80C3,/* 0x40-0x47 */ + 0x80C4,0x80CC,0x80E1,0x80DB,0x80CE,0x80DE,0x80E4,0x80DD,/* 0x48-0x4F */ + 0x81F4,0x8222,0x82E7,0x8303,0x8305,0x82E3,0x82DB,0x82E6,/* 0x50-0x57 */ + 0x8304,0x82E5,0x8302,0x8309,0x82D2,0x82D7,0x82F1,0x8301,/* 0x58-0x5F */ + 0x82DC,0x82D4,0x82D1,0x82DE,0x82D3,0x82DF,0x82EF,0x8306,/* 0x60-0x67 */ + 0x8650,0x8679,0x867B,0x867A,0x884D,0x886B,0x8981,0x89D4,/* 0x68-0x6F */ + 0x8A08,0x8A02,0x8A03,0x8C9E,0x8CA0,0x8D74,0x8D73,0x8DB4,/* 0x70-0x77 */ + 0x8ECD,0x8ECC,0x8FF0,0x8FE6,0x8FE2,0x8FEA,0x8FE5,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8FED,0x8FEB,0x8FE4,0x8FE8,0x90CA,0x90CE,0x90C1,/* 0xA0-0xA7 */ + 0x90C3,0x914B,0x914A,0x91CD,0x9582,0x9650,0x964B,0x964C,/* 0xA8-0xAF */ + 0x964D,0x9762,0x9769,0x97CB,0x97ED,0x97F3,0x9801,0x98A8,/* 0xB0-0xB7 */ + 0x98DB,0x98DF,0x9996,0x9999,0x4E58,0x4EB3,0x500C,0x500D,/* 0xB8-0xBF */ + 0x5023,0x4FEF,0x5026,0x5025,0x4FF8,0x5029,0x5016,0x5006,/* 0xC0-0xC7 */ + 
0x503C,0x501F,0x501A,0x5012,0x5011,0x4FFA,0x5000,0x5014,/* 0xC8-0xCF */ + 0x5028,0x4FF1,0x5021,0x500B,0x5019,0x5018,0x4FF3,0x4FEE,/* 0xD0-0xD7 */ + 0x502D,0x502A,0x4FFE,0x502B,0x5009,0x517C,0x51A4,0x51A5,/* 0xD8-0xDF */ + 0x51A2,0x51CD,0x51CC,0x51C6,0x51CB,0x5256,0x525C,0x5254,/* 0xE0-0xE7 */ + 0x525B,0x525D,0x532A,0x537F,0x539F,0x539D,0x53DF,0x54E8,/* 0xE8-0xEF */ + 0x5510,0x5501,0x5537,0x54FC,0x54E5,0x54F2,0x5506,0x54FA,/* 0xF0-0xF7 */ + 0x5514,0x54E9,0x54ED,0x54E1,0x5509,0x54EE,0x54EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54E6,0x5527,0x5507,0x54FD,0x550F,0x5703,0x5704,0x57C2,/* 0x40-0x47 */ + 0x57D4,0x57CB,0x57C3,0x5809,0x590F,0x5957,0x5958,0x595A,/* 0x48-0x4F */ + 0x5A11,0x5A18,0x5A1C,0x5A1F,0x5A1B,0x5A13,0x59EC,0x5A20,/* 0x50-0x57 */ + 0x5A23,0x5A29,0x5A25,0x5A0C,0x5A09,0x5B6B,0x5C58,0x5BB0,/* 0x58-0x5F */ + 0x5BB3,0x5BB6,0x5BB4,0x5BAE,0x5BB5,0x5BB9,0x5BB8,0x5C04,/* 0x60-0x67 */ + 0x5C51,0x5C55,0x5C50,0x5CED,0x5CFD,0x5CFB,0x5CEA,0x5CE8,/* 0x68-0x6F */ + 0x5CF0,0x5CF6,0x5D01,0x5CF4,0x5DEE,0x5E2D,0x5E2B,0x5EAB,/* 0x70-0x77 */ + 0x5EAD,0x5EA7,0x5F31,0x5F92,0x5F91,0x5F90,0x6059,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6063,0x6065,0x6050,0x6055,0x606D,0x6069,0x606F,/* 0xA0-0xA7 */ + 0x6084,0x609F,0x609A,0x608D,0x6094,0x608C,0x6085,0x6096,/* 0xA8-0xAF */ + 0x6247,0x62F3,0x6308,0x62FF,0x634E,0x633E,0x632F,0x6355,/* 0xB0-0xB7 */ + 0x6342,0x6346,0x634F,0x6349,0x633A,0x6350,0x633D,0x632A,/* 0xB8-0xBF */ + 0x632B,0x6328,0x634D,0x634C,0x6548,0x6549,0x6599,0x65C1,/* 0xC0-0xC7 */ + 0x65C5,0x6642,0x6649,0x664F,0x6643,0x6652,0x664C,0x6645,/* 0xC8-0xCF */ + 0x6641,0x66F8,0x6714,0x6715,0x6717,0x6821,0x6838,0x6848,/* 0xD0-0xD7 */ + 0x6846,0x6853,0x6839,0x6842,0x6854,0x6829,0x68B3,0x6817,/* 0xD8-0xDF */ + 0x684C,0x6851,0x683D,0x67F4,0x6850,0x6840,0x683C,0x6843,/* 0xE0-0xE7 */ + 0x682A,0x6845,0x6813,0x6818,0x6841,0x6B8A,0x6B89,0x6BB7,/* 0xE8-0xEF */ + 0x6C23,0x6C27,0x6C28,0x6C26,0x6C24,0x6CF0,0x6D6A,0x6D95,/* 0xF0-0xF7 */ + 0x6D88,0x6D87,0x6D66,0x6D78,0x6D77,0x6D59,0x6D93,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_AF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x6D6C,0x6D89,0x6D6E,0x6D5A,0x6D74,0x6D69,0x6D8C,0x6D8A,/* 0x40-0x47 */ + 0x6D79,0x6D85,0x6D65,0x6D94,0x70CA,0x70D8,0x70E4,0x70D9,/* 0x48-0x4F */ + 0x70C8,0x70CF,0x7239,0x7279,0x72FC,0x72F9,0x72FD,0x72F8,/* 0x50-0x57 */ + 0x72F7,0x7386,0x73ED,0x7409,0x73EE,0x73E0,0x73EA,0x73DE,/* 0x58-0x5F */ + 0x7554,0x755D,0x755C,0x755A,0x7559,0x75BE,0x75C5,0x75C7,/* 0x60-0x67 */ + 0x75B2,0x75B3,0x75BD,0x75BC,0x75B9,0x75C2,0x75B8,0x768B,/* 0x68-0x6F */ + 0x76B0,0x76CA,0x76CD,0x76CE,0x7729,0x771F,0x7720,0x7728,/* 0x70-0x77 */ + 0x77E9,0x7830,0x7827,0x7838,0x781D,0x7834,0x7837,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7825,0x782D,0x7820,0x781F,0x7832,0x7955,0x7950,/* 0xA0-0xA7 */ + 0x7960,0x795F,0x7956,0x795E,0x795D,0x7957,0x795A,0x79E4,/* 0xA8-0xAF */ + 0x79E3,0x79E7,0x79DF,0x79E6,0x79E9,0x79D8,0x7A84,0x7A88,/* 0xB0-0xB7 */ + 0x7AD9,0x7B06,0x7B11,0x7C89,0x7D21,0x7D17,0x7D0B,0x7D0A,/* 0xB8-0xBF */ + 0x7D20,0x7D22,0x7D14,0x7D10,0x7D15,0x7D1A,0x7D1C,0x7D0D,/* 0xC0-0xC7 */ + 0x7D19,0x7D1B,0x7F3A,0x7F5F,0x7F94,0x7FC5,0x7FC1,0x8006,/* 0xC8-0xCF */ + 0x8018,0x8015,0x8019,0x8017,0x803D,0x803F,0x80F1,0x8102,/* 0xD0-0xD7 */ + 0x80F0,0x8105,0x80ED,0x80F4,0x8106,0x80F8,0x80F3,0x8108,/* 0xD8-0xDF */ + 0x80FD,0x810A,0x80FC,0x80EF,0x81ED,0x81EC,0x8200,0x8210,/* 0xE0-0xE7 */ + 0x822A,0x822B,0x8228,0x822C,0x82BB,0x832B,0x8352,0x8354,/* 0xE8-0xEF */ + 0x834A,0x8338,0x8350,0x8349,0x8335,0x8334,0x834F,0x8332,/* 0xF0-0xF7 */ + 0x8339,0x8336,0x8317,0x8340,0x8331,0x8328,0x8343,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8654,0x868A,0x86AA,0x8693,0x86A4,0x86A9,0x868C,0x86A3,/* 0x40-0x47 */ + 0x869C,0x8870,0x8877,0x8881,0x8882,0x887D,0x8879,0x8A18,/* 0x48-0x4F */ + 0x8A10,0x8A0E,0x8A0C,0x8A15,0x8A0A,0x8A17,0x8A13,0x8A16,/* 0x50-0x57 */ + 0x8A0F,0x8A11,0x8C48,0x8C7A,0x8C79,0x8CA1,0x8CA2,0x8D77,/* 0x58-0x5F */ + 0x8EAC,0x8ED2,0x8ED4,0x8ECF,0x8FB1,0x9001,0x9006,0x8FF7,/* 0x60-0x67 */ + 0x9000,0x8FFA,0x8FF4,0x9003,0x8FFD,0x9005,0x8FF8,0x9095,/* 0x68-0x6F */ + 0x90E1,0x90DD,0x90E2,0x9152,0x914D,0x914C,0x91D8,0x91DD,/* 0x70-0x77 */ + 0x91D7,0x91DC,0x91D9,0x9583,0x9662,0x9663,0x9661,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x965B,0x965D,0x9664,0x9658,0x965E,0x96BB,0x98E2,/* 0xA0-0xA7 */ + 0x99AC,0x9AA8,0x9AD8,0x9B25,0x9B32,0x9B3C,0x4E7E,0x507A,/* 0xA8-0xAF */ + 0x507D,0x505C,0x5047,0x5043,0x504C,0x505A,0x5049,0x5065,/* 0xB0-0xB7 */ + 
0x5076,0x504E,0x5055,0x5075,0x5074,0x5077,0x504F,0x500F,/* 0xB8-0xBF */ + 0x506F,0x506D,0x515C,0x5195,0x51F0,0x526A,0x526F,0x52D2,/* 0xC0-0xC7 */ + 0x52D9,0x52D8,0x52D5,0x5310,0x530F,0x5319,0x533F,0x5340,/* 0xC8-0xCF */ + 0x533E,0x53C3,0x66FC,0x5546,0x556A,0x5566,0x5544,0x555E,/* 0xD0-0xD7 */ + 0x5561,0x5543,0x554A,0x5531,0x5556,0x554F,0x5555,0x552F,/* 0xD8-0xDF */ + 0x5564,0x5538,0x552E,0x555C,0x552C,0x5563,0x5533,0x5541,/* 0xE0-0xE7 */ + 0x5557,0x5708,0x570B,0x5709,0x57DF,0x5805,0x580A,0x5806,/* 0xE8-0xEF */ + 0x57E0,0x57E4,0x57FA,0x5802,0x5835,0x57F7,0x57F9,0x5920,/* 0xF0-0xF7 */ + 0x5962,0x5A36,0x5A41,0x5A49,0x5A66,0x5A6A,0x5A40,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5A3C,0x5A62,0x5A5A,0x5A46,0x5A4A,0x5B70,0x5BC7,0x5BC5,/* 0x40-0x47 */ + 0x5BC4,0x5BC2,0x5BBF,0x5BC6,0x5C09,0x5C08,0x5C07,0x5C60,/* 0x48-0x4F */ + 0x5C5C,0x5C5D,0x5D07,0x5D06,0x5D0E,0x5D1B,0x5D16,0x5D22,/* 0x50-0x57 */ + 0x5D11,0x5D29,0x5D14,0x5D19,0x5D24,0x5D27,0x5D17,0x5DE2,/* 0x58-0x5F */ + 0x5E38,0x5E36,0x5E33,0x5E37,0x5EB7,0x5EB8,0x5EB6,0x5EB5,/* 0x60-0x67 */ + 0x5EBE,0x5F35,0x5F37,0x5F57,0x5F6C,0x5F69,0x5F6B,0x5F97,/* 0x68-0x6F */ + 0x5F99,0x5F9E,0x5F98,0x5FA1,0x5FA0,0x5F9C,0x607F,0x60A3,/* 0x70-0x77 */ + 0x6089,0x60A0,0x60A8,0x60CB,0x60B4,0x60E6,0x60BD,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x60C5,0x60BB,0x60B5,0x60DC,0x60BC,0x60D8,0x60D5,/* 0xA0-0xA7 */ + 0x60C6,0x60DF,0x60B8,0x60DA,0x60C7,0x621A,0x621B,0x6248,/* 0xA8-0xAF */ + 0x63A0,0x63A7,0x6372,0x6396,0x63A2,0x63A5,0x6377,0x6367,/* 0xB0-0xB7 */ + 0x6398,0x63AA,0x6371,0x63A9,0x6389,0x6383,0x639B,0x636B,/* 0xB8-0xBF */ + 0x63A8,0x6384,0x6388,0x6399,0x63A1,0x63AC,0x6392,0x638F,/* 0xC0-0xC7 */ + 0x6380,0x637B,0x6369,0x6368,0x637A,0x655D,0x6556,0x6551,/* 0xC8-0xCF */ + 0x6559,0x6557,0x555F,0x654F,0x6558,0x6555,0x6554,0x659C,/* 0xD0-0xD7 */ + 0x659B,0x65AC,0x65CF,0x65CB,0x65CC,0x65CE,0x665D,0x665A,/* 0xD8-0xDF */ + 0x6664,0x6668,0x6666,0x665E,0x66F9,0x52D7,0x671B,0x6881,/* 0xE0-0xE7 */ + 0x68AF,0x68A2,0x6893,0x68B5,0x687F,0x6876,0x68B1,0x68A7,/* 0xE8-0xEF */ + 0x6897,0x68B0,0x6883,0x68C4,0x68AD,0x6886,0x6885,0x6894,/* 0xF0-0xF7 */ + 0x689D,0x68A8,0x689F,0x68A1,0x6882,0x6B32,0x6BBA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6BEB,0x6BEC,0x6C2B,0x6D8E,0x6DBC,0x6DF3,0x6DD9,0x6DB2,/* 0x40-0x47 */ + 0x6DE1,0x6DCC,0x6DE4,0x6DFB,0x6DFA,0x6E05,0x6DC7,0x6DCB,/* 0x48-0x4F */ + 0x6DAF,0x6DD1,0x6DAE,0x6DDE,0x6DF9,0x6DB8,0x6DF7,0x6DF5,/* 0x50-0x57 */ + 0x6DC5,0x6DD2,0x6E1A,0x6DB5,0x6DDA,0x6DEB,0x6DD8,0x6DEA,/* 0x58-0x5F */ + 0x6DF1,0x6DEE,0x6DE8,0x6DC6,0x6DC4,0x6DAA,0x6DEC,0x6DBF,/* 0x60-0x67 */ + 0x6DE6,0x70F9,0x7109,0x710A,0x70FD,0x70EF,0x723D,0x727D,/* 0x68-0x6F */ + 0x7281,0x731C,0x731B,0x7316,0x7313,0x7319,0x7387,0x7405,/* 0x70-0x77 */ + 0x740A,0x7403,0x7406,0x73FE,0x740D,0x74E0,0x74F6,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x74F7,0x751C,0x7522,0x7565,0x7566,0x7562,0x7570,/* 0xA0-0xA7 */ + 0x758F,0x75D4,0x75D5,0x75B5,0x75CA,0x75CD,0x768E,0x76D4,/* 0xA8-0xAF */ + 0x76D2,0x76DB,0x7737,0x773E,0x773C,0x7736,0x7738,0x773A,/* 0xB0-0xB7 */ + 0x786B,0x7843,0x784E,0x7965,0x7968,0x796D,0x79FB,0x7A92,/* 0xB8-0xBF */ + 0x7A95,0x7B20,0x7B28,0x7B1B,0x7B2C,0x7B26,0x7B19,0x7B1E,/* 0xC0-0xC7 */ + 0x7B2E,0x7C92,0x7C97,0x7C95,0x7D46,0x7D43,0x7D71,0x7D2E,/* 0xC8-0xCF */ + 0x7D39,0x7D3C,0x7D40,0x7D30,0x7D33,0x7D44,0x7D2F,0x7D42,/* 0xD0-0xD7 */ + 0x7D32,0x7D31,0x7F3D,0x7F9E,0x7F9A,0x7FCC,0x7FCE,0x7FD2,/* 0xD8-0xDF */ + 0x801C,0x804A,0x8046,0x812F,0x8116,0x8123,0x812B,0x8129,/* 0xE0-0xE7 */ + 0x8130,0x8124,0x8202,0x8235,0x8237,0x8236,0x8239,0x838E,/* 0xE8-0xEF */ + 0x839E,0x8398,0x8378,0x83A2,0x8396,0x83BD,0x83AB,0x8392,/* 0xF0-0xF7 */ + 0x838A,0x8393,0x8389,0x83A0,0x8377,0x837B,0x837C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8386,0x83A7,0x8655,0x5F6A,0x86C7,0x86C0,0x86B6,0x86C4,/* 0x40-0x47 */ + 0x86B5,0x86C6,0x86CB,0x86B1,0x86AF,0x86C9,0x8853,0x889E,/* 0x48-0x4F */ + 0x8888,0x88AB,0x8892,0x8896,0x888D,0x888B,0x8993,0x898F,/* 0x50-0x57 */ + 0x8A2A,0x8A1D,0x8A23,0x8A25,0x8A31,0x8A2D,0x8A1F,0x8A1B,/* 0x58-0x5F */ + 0x8A22,0x8C49,0x8C5A,0x8CA9,0x8CAC,0x8CAB,0x8CA8,0x8CAA,/* 0x60-0x67 */ + 0x8CA7,0x8D67,0x8D66,0x8DBE,0x8DBA,0x8EDB,0x8EDF,0x9019,/* 0x68-0x6F */ + 0x900D,0x901A,0x9017,0x9023,0x901F,0x901D,0x9010,0x9015,/* 0x70-0x77 */ + 0x901E,0x9020,0x900F,0x9022,0x9016,0x901B,0x9014,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x90E8,0x90ED,0x90FD,0x9157,0x91CE,0x91F5,0x91E6,/* 0xA0-0xA7 */ + 
0x91E3,0x91E7,0x91ED,0x91E9,0x9589,0x966A,0x9675,0x9673,/* 0xA8-0xAF */ + 0x9678,0x9670,0x9674,0x9676,0x9677,0x966C,0x96C0,0x96EA,/* 0xB0-0xB7 */ + 0x96E9,0x7AE0,0x7ADF,0x9802,0x9803,0x9B5A,0x9CE5,0x9E75,/* 0xB8-0xBF */ + 0x9E7F,0x9EA5,0x9EBB,0x50A2,0x508D,0x5085,0x5099,0x5091,/* 0xC0-0xC7 */ + 0x5080,0x5096,0x5098,0x509A,0x6700,0x51F1,0x5272,0x5274,/* 0xC8-0xCF */ + 0x5275,0x5269,0x52DE,0x52DD,0x52DB,0x535A,0x53A5,0x557B,/* 0xD0-0xD7 */ + 0x5580,0x55A7,0x557C,0x558A,0x559D,0x5598,0x5582,0x559C,/* 0xD8-0xDF */ + 0x55AA,0x5594,0x5587,0x558B,0x5583,0x55B3,0x55AE,0x559F,/* 0xE0-0xE7 */ + 0x553E,0x55B2,0x559A,0x55BB,0x55AC,0x55B1,0x557E,0x5589,/* 0xE8-0xEF */ + 0x55AB,0x5599,0x570D,0x582F,0x582A,0x5834,0x5824,0x5830,/* 0xF0-0xF7 */ + 0x5831,0x5821,0x581D,0x5820,0x58F9,0x58FA,0x5960,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5A77,0x5A9A,0x5A7F,0x5A92,0x5A9B,0x5AA7,0x5B73,0x5B71,/* 0x40-0x47 */ + 0x5BD2,0x5BCC,0x5BD3,0x5BD0,0x5C0A,0x5C0B,0x5C31,0x5D4C,/* 0x48-0x4F */ + 0x5D50,0x5D34,0x5D47,0x5DFD,0x5E45,0x5E3D,0x5E40,0x5E43,/* 0x50-0x57 */ + 0x5E7E,0x5ECA,0x5EC1,0x5EC2,0x5EC4,0x5F3C,0x5F6D,0x5FA9,/* 0x58-0x5F */ + 0x5FAA,0x5FA8,0x60D1,0x60E1,0x60B2,0x60B6,0x60E0,0x611C,/* 0x60-0x67 */ + 0x6123,0x60FA,0x6115,0x60F0,0x60FB,0x60F4,0x6168,0x60F1,/* 0x68-0x6F */ + 0x610E,0x60F6,0x6109,0x6100,0x6112,0x621F,0x6249,0x63A3,/* 0x70-0x77 */ + 0x638C,0x63CF,0x63C0,0x63E9,0x63C9,0x63C6,0x63CD,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x63D2,0x63E3,0x63D0,0x63E1,0x63D6,0x63ED,0x63EE,/* 0xA0-0xA7 */ + 0x6376,0x63F4,0x63EA,0x63DB,0x6452,0x63DA,0x63F9,0x655E,/* 0xA8-0xAF */ + 0x6566,0x6562,0x6563,0x6591,0x6590,0x65AF,0x666E,0x6670,/* 0xB0-0xB7 */ + 0x6674,0x6676,0x666F,0x6691,0x667A,0x667E,0x6677,0x66FE,/* 0xB8-0xBF */ + 0x66FF,0x671F,0x671D,0x68FA,0x68D5,0x68E0,0x68D8,0x68D7,/* 0xC0-0xC7 */ + 0x6905,0x68DF,0x68F5,0x68EE,0x68E7,0x68F9,0x68D2,0x68F2,/* 0xC8-0xCF */ + 0x68E3,0x68CB,0x68CD,0x690D,0x6912,0x690E,0x68C9,0x68DA,/* 0xD0-0xD7 */ + 0x696E,0x68FB,0x6B3E,0x6B3A,0x6B3D,0x6B98,0x6B96,0x6BBC,/* 0xD8-0xDF */ + 0x6BEF,0x6C2E,0x6C2F,0x6C2C,0x6E2F,0x6E38,0x6E54,0x6E21,/* 0xE0-0xE7 */ + 0x6E32,0x6E67,0x6E4A,0x6E20,0x6E25,0x6E23,0x6E1B,0x6E5B,/* 0xE8-0xEF */ + 0x6E58,0x6E24,0x6E56,0x6E6E,0x6E2D,0x6E26,0x6E6F,0x6E34,/* 0xF0-0xF7 */ + 0x6E4D,0x6E3A,0x6E2C,0x6E43,0x6E1D,0x6E3E,0x6ECB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6E89,0x6E19,0x6E4E,0x6E63,0x6E44,0x6E72,0x6E69,0x6E5F,/* 0x40-0x47 */ + 0x7119,0x711A,0x7126,0x7130,0x7121,0x7136,0x716E,0x711C,/* 0x48-0x4F */ + 0x724C,0x7284,0x7280,0x7336,0x7325,0x7334,0x7329,0x743A,/* 0x50-0x57 */ + 0x742A,0x7433,0x7422,0x7425,0x7435,0x7436,0x7434,0x742F,/* 0x58-0x5F */ + 0x741B,0x7426,0x7428,0x7525,0x7526,0x756B,0x756A,0x75E2,/* 0x60-0x67 */ + 0x75DB,0x75E3,0x75D9,0x75D8,0x75DE,0x75E0,0x767B,0x767C,/* 0x68-0x6F */ + 0x7696,0x7693,0x76B4,0x76DC,0x774F,0x77ED,0x785D,0x786C,/* 0x70-0x77 */ + 0x786F,0x7A0D,0x7A08,0x7A0B,0x7A05,0x7A00,0x7A98,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7A97,0x7A96,0x7AE5,0x7AE3,0x7B49,0x7B56,0x7B46,/* 0xA0-0xA7 */ + 0x7B50,0x7B52,0x7B54,0x7B4D,0x7B4B,0x7B4F,0x7B51,0x7C9F,/* 0xA8-0xAF */ + 0x7CA5,0x7D5E,0x7D50,0x7D68,0x7D55,0x7D2B,0x7D6E,0x7D72,/* 0xB0-0xB7 */ + 0x7D61,0x7D66,0x7D62,0x7D70,0x7D73,0x5584,0x7FD4,0x7FD5,/* 0xB8-0xBF */ + 0x800B,0x8052,0x8085,0x8155,0x8154,0x814B,0x8151,0x814E,/* 0xC0-0xC7 */ + 0x8139,0x8146,0x813E,0x814C,0x8153,0x8174,0x8212,0x821C,/* 0xC8-0xCF */ + 0x83E9,0x8403,0x83F8,0x840D,0x83E0,0x83C5,0x840B,0x83C1,/* 0xD0-0xD7 */ + 0x83EF,0x83F1,0x83F4,0x8457,0x840A,0x83F0,0x840C,0x83CC,/* 0xD8-0xDF */ + 0x83FD,0x83F2,0x83CA,0x8438,0x840E,0x8404,0x83DC,0x8407,/* 0xE0-0xE7 */ + 0x83D4,0x83DF,0x865B,0x86DF,0x86D9,0x86ED,0x86D4,0x86DB,/* 0xE8-0xEF */ + 0x86E4,0x86D0,0x86DE,0x8857,0x88C1,0x88C2,0x88B1,0x8983,/* 0xF0-0xF7 */ + 0x8996,0x8A3B,0x8A60,0x8A55,0x8A5E,0x8A3C,0x8A41,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8A54,0x8A5B,0x8A50,0x8A46,0x8A34,0x8A3A,0x8A36,0x8A56,/* 0x40-0x47 */ + 0x8C61,0x8C82,0x8CAF,0x8CBC,0x8CB3,0x8CBD,0x8CC1,0x8CBB,/* 0x48-0x4F */ + 0x8CC0,0x8CB4,0x8CB7,0x8CB6,0x8CBF,0x8CB8,0x8D8A,0x8D85,/* 0x50-0x57 */ + 0x8D81,0x8DCE,0x8DDD,0x8DCB,0x8DDA,0x8DD1,0x8DCC,0x8DDB,/* 0x58-0x5F */ + 0x8DC6,0x8EFB,0x8EF8,0x8EFC,0x8F9C,0x902E,0x9035,0x9031,/* 0x60-0x67 */ + 0x9038,0x9032,0x9036,0x9102,0x90F5,0x9109,0x90FE,0x9163,/* 0x68-0x6F */ + 0x9165,0x91CF,0x9214,0x9215,0x9223,0x9209,0x921E,0x920D,/* 0x70-0x77 */ + 0x9210,0x9207,0x9211,0x9594,0x958F,0x958B,0x9591,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9593,0x9592,0x958E,0x968A,0x968E,0x968B,0x967D,/* 0xA0-0xA7 */ + 0x9685,0x9686,0x968D,0x9672,0x9684,0x96C1,0x96C5,0x96C4,/* 0xA8-0xAF */ + 0x96C6,0x96C7,0x96EF,0x96F2,0x97CC,0x9805,0x9806,0x9808,/* 0xB0-0xB7 */ + 0x98E7,0x98EA,0x98EF,0x98E9,0x98F2,0x98ED,0x99AE,0x99AD,/* 0xB8-0xBF */ + 0x9EC3,0x9ECD,0x9ED1,0x4E82,0x50AD,0x50B5,0x50B2,0x50B3,/* 0xC0-0xC7 */ + 0x50C5,0x50BE,0x50AC,0x50B7,0x50BB,0x50AF,0x50C7,0x527F,/* 0xC8-0xCF */ + 0x5277,0x527D,0x52DF,0x52E6,0x52E4,0x52E2,0x52E3,0x532F,/* 0xD0-0xD7 */ + 0x55DF,0x55E8,0x55D3,0x55E6,0x55CE,0x55DC,0x55C7,0x55D1,/* 0xD8-0xDF */ + 0x55E3,0x55E4,0x55EF,0x55DA,0x55E1,0x55C5,0x55C6,0x55E5,/* 0xE0-0xE7 */ + 0x55C9,0x5712,0x5713,0x585E,0x5851,0x5858,0x5857,0x585A,/* 0xE8-0xEF */ + 0x5854,0x586B,0x584C,0x586D,0x584A,0x5862,0x5852,0x584B,/* 0xF0-0xF7 */ + 0x5967,0x5AC1,0x5AC9,0x5ACC,0x5ABE,0x5ABD,0x5ABC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5AB3,0x5AC2,0x5AB2,0x5D69,0x5D6F,0x5E4C,0x5E79,0x5EC9,/* 0x40-0x47 */ + 0x5EC8,0x5F12,0x5F59,0x5FAC,0x5FAE,0x611A,0x610F,0x6148,/* 0x48-0x4F */ + 0x611F,0x60F3,0x611B,0x60F9,0x6101,0x6108,0x614E,0x614C,/* 0x50-0x57 */ + 0x6144,0x614D,0x613E,0x6134,0x6127,0x610D,0x6106,0x6137,/* 0x58-0x5F */ + 0x6221,0x6222,0x6413,0x643E,0x641E,0x642A,0x642D,0x643D,/* 0x60-0x67 */ + 0x642C,0x640F,0x641C,0x6414,0x640D,0x6436,0x6416,0x6417,/* 0x68-0x6F */ + 0x6406,0x656C,0x659F,0x65B0,0x6697,0x6689,0x6687,0x6688,/* 0x70-0x77 */ + 0x6696,0x6684,0x6698,0x668D,0x6703,0x6994,0x696D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x695A,0x6977,0x6960,0x6954,0x6975,0x6930,0x6982,/* 0xA0-0xA7 */ + 0x694A,0x6968,0x696B,0x695E,0x6953,0x6979,0x6986,0x695D,/* 0xA8-0xAF */ + 0x6963,0x695B,0x6B47,0x6B72,0x6BC0,0x6BBF,0x6BD3,0x6BFD,/* 0xB0-0xB7 */ + 0x6EA2,0x6EAF,0x6ED3,0x6EB6,0x6EC2,0x6E90,0x6E9D,0x6EC7,/* 0xB8-0xBF */ + 0x6EC5,0x6EA5,0x6E98,0x6EBC,0x6EBA,0x6EAB,0x6ED1,0x6E96,/* 0xC0-0xC7 */ + 0x6E9C,0x6EC4,0x6ED4,0x6EAA,0x6EA7,0x6EB4,0x714E,0x7159,/* 0xC8-0xCF */ + 0x7169,0x7164,0x7149,0x7167,0x715C,0x716C,0x7166,0x714C,/* 0xD0-0xD7 */ + 0x7165,0x715E,0x7146,0x7168,0x7156,0x723A,0x7252,0x7337,/* 0xD8-0xDF */ + 0x7345,0x733F,0x733E,0x746F,0x745A,0x7455,0x745F,0x745E,/* 0xE0-0xE7 */ + 0x7441,0x743F,0x7459,0x745B,0x745C,0x7576,0x7578,0x7600,/* 0xE8-0xEF */ + 0x75F0,0x7601,0x75F2,0x75F1,0x75FA,0x75FF,0x75F4,0x75F3,/* 0xF0-0xF7 */ + 0x76DE,0x76DF,0x775B,0x776B,0x7766,0x775E,0x7763,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7779,0x776A,0x776C,0x775C,0x7765,0x7768,0x7762,0x77EE,/* 0x40-0x47 */ + 0x788E,0x78B0,0x7897,0x7898,0x788C,0x7889,0x787C,0x7891,/* 0x48-0x4F */ + 0x7893,0x787F,0x797A,0x797F,0x7981,0x842C,0x79BD,0x7A1C,/* 0x50-0x57 */ + 0x7A1A,0x7A20,0x7A14,0x7A1F,0x7A1E,0x7A9F,0x7AA0,0x7B77,/* 0x58-0x5F */ + 0x7BC0,0x7B60,0x7B6E,0x7B67,0x7CB1,0x7CB3,0x7CB5,0x7D93,/* 0x60-0x67 */ + 0x7D79,0x7D91,0x7D81,0x7D8F,0x7D5B,0x7F6E,0x7F69,0x7F6A,/* 0x68-0x6F */ + 0x7F72,0x7FA9,0x7FA8,0x7FA4,0x8056,0x8058,0x8086,0x8084,/* 0x70-0x77 */ + 0x8171,0x8170,0x8178,0x8165,0x816E,0x8173,0x816B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8179,0x817A,0x8166,0x8205,0x8247,0x8482,0x8477,/* 0xA0-0xA7 */ + 0x843D,0x8431,0x8475,0x8466,0x846B,0x8449,0x846C,0x845B,/* 0xA8-0xAF */ + 0x843C,0x8435,0x8461,0x8463,0x8469,0x846D,0x8446,0x865E,/* 0xB0-0xB7 */ + 0x865C,0x865F,0x86F9,0x8713,0x8708,0x8707,0x8700,0x86FE,/* 0xB8-0xBF */ + 0x86FB,0x8702,0x8703,0x8706,0x870A,0x8859,0x88DF,0x88D4,/* 0xC0-0xC7 */ + 0x88D9,0x88DC,0x88D8,0x88DD,0x88E1,0x88CA,0x88D5,0x88D2,/* 0xC8-0xCF */ + 0x899C,0x89E3,0x8A6B,0x8A72,0x8A73,0x8A66,0x8A69,0x8A70,/* 0xD0-0xD7 */ + 0x8A87,0x8A7C,0x8A63,0x8AA0,0x8A71,0x8A85,0x8A6D,0x8A62,/* 0xD8-0xDF */ + 0x8A6E,0x8A6C,0x8A79,0x8A7B,0x8A3E,0x8A68,0x8C62,0x8C8A,/* 0xE0-0xE7 */ + 0x8C89,0x8CCA,0x8CC7,0x8CC8,0x8CC4,0x8CB2,0x8CC3,0x8CC2,/* 0xE8-0xEF */ + 0x8CC5,0x8DE1,0x8DDF,0x8DE8,0x8DEF,0x8DF3,0x8DFA,0x8DEA,/* 0xF0-0xF7 */ + 0x8DE4,0x8DE6,0x8EB2,0x8F03,0x8F09,0x8EFE,0x8F0A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_B9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8F9F,0x8FB2,0x904B,0x904A,0x9053,0x9042,0x9054,0x903C,/* 0x40-0x47 */ + 0x9055,0x9050,0x9047,0x904F,0x904E,0x904D,0x9051,0x903E,/* 0x48-0x4F */ + 0x9041,0x9112,0x9117,0x916C,0x916A,0x9169,0x91C9,0x9237,/* 0x50-0x57 */ + 0x9257,0x9238,0x923D,0x9240,0x923E,0x925B,0x924B,0x9264,/* 0x58-0x5F */ + 0x9251,0x9234,0x9249,0x924D,0x9245,0x9239,0x923F,0x925A,/* 0x60-0x67 */ + 0x9598,0x9698,0x9694,0x9695,0x96CD,0x96CB,0x96C9,0x96CA,/* 0x68-0x6F */ + 0x96F7,0x96FB,0x96F9,0x96F6,0x9756,0x9774,0x9776,0x9810,/* 0x70-0x77 */ + 0x9811,0x9813,0x980A,0x9812,0x980C,0x98FC,0x98F4,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x98FD,0x98FE,0x99B3,0x99B1,0x99B4,0x9AE1,0x9CE9,/* 0xA0-0xA7 */ + 0x9E82,0x9F0E,0x9F13,0x9F20,0x50E7,0x50EE,0x50E5,0x50D6,/* 0xA8-0xAF */ + 0x50ED,0x50DA,0x50D5,0x50CF,0x50D1,0x50F1,0x50CE,0x50E9,/* 0xB0-0xB7 */ + 0x5162,0x51F3,0x5283,0x5282,0x5331,0x53AD,0x55FE,0x5600,/* 0xB8-0xBF */ + 0x561B,0x5617,0x55FD,0x5614,0x5606,0x5609,0x560D,0x560E,/* 0xC0-0xC7 */ + 0x55F7,0x5616,0x561F,0x5608,0x5610,0x55F6,0x5718,0x5716,/* 0xC8-0xCF */ + 0x5875,0x587E,0x5883,0x5893,0x588A,0x5879,0x5885,0x587D,/* 0xD0-0xD7 */ + 0x58FD,0x5925,0x5922,0x5924,0x596A,0x5969,0x5AE1,0x5AE6,/* 0xD8-0xDF */ + 0x5AE9,0x5AD7,0x5AD6,0x5AD8,0x5AE3,0x5B75,0x5BDE,0x5BE7,/* 0xE0-0xE7 */ + 0x5BE1,0x5BE5,0x5BE6,0x5BE8,0x5BE2,0x5BE4,0x5BDF,0x5C0D,/* 0xE8-0xEF */ + 0x5C62,0x5D84,0x5D87,0x5E5B,0x5E63,0x5E55,0x5E57,0x5E54,/* 0xF0-0xF7 */ + 0x5ED3,0x5ED6,0x5F0A,0x5F46,0x5F70,0x5FB9,0x6147,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x613F,0x614B,0x6177,0x6162,0x6163,0x615F,0x615A,0x6158,/* 0x40-0x47 */ + 0x6175,0x622A,0x6487,0x6458,0x6454,0x64A4,0x6478,0x645F,/* 0x48-0x4F */ + 0x647A,0x6451,0x6467,0x6434,0x646D,0x647B,0x6572,0x65A1,/* 0x50-0x57 */ + 0x65D7,0x65D6,0x66A2,0x66A8,0x669D,0x699C,0x69A8,0x6995,/* 0x58-0x5F */ + 0x69C1,0x69AE,0x69D3,0x69CB,0x699B,0x69B7,0x69BB,0x69AB,/* 0x60-0x67 */ + 0x69B4,0x69D0,0x69CD,0x69AD,0x69CC,0x69A6,0x69C3,0x69A3,/* 0x68-0x6F */ + 0x6B49,0x6B4C,0x6C33,0x6F33,0x6F14,0x6EFE,0x6F13,0x6EF4,/* 0x70-0x77 */ + 0x6F29,0x6F3E,0x6F20,0x6F2C,0x6F0F,0x6F02,0x6F22,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6EFF,0x6EEF,0x6F06,0x6F31,0x6F38,0x6F32,0x6F23,/* 0xA0-0xA7 */ + 0x6F15,0x6F2B,0x6F2F,0x6F88,0x6F2A,0x6EEC,0x6F01,0x6EF2,/* 0xA8-0xAF */ + 0x6ECC,0x6EF7,0x7194,0x7199,0x717D,0x718A,0x7184,0x7192,/* 0xB0-0xB7 */ + 0x723E,0x7292,0x7296,0x7344,0x7350,0x7464,0x7463,0x746A,/* 0xB8-0xBF */ + 0x7470,0x746D,0x7504,0x7591,0x7627,0x760D,0x760B,0x7609,/* 0xC0-0xC7 */ + 0x7613,0x76E1,0x76E3,0x7784,0x777D,0x777F,0x7761,0x78C1,/* 0xC8-0xCF */ + 0x789F,0x78A7,0x78B3,0x78A9,0x78A3,0x798E,0x798F,0x798D,/* 0xD0-0xD7 */ + 0x7A2E,0x7A31,0x7AAA,0x7AA9,0x7AED,0x7AEF,0x7BA1,0x7B95,/* 0xD8-0xDF */ + 0x7B8B,0x7B75,0x7B97,0x7B9D,0x7B94,0x7B8F,0x7BB8,0x7B87,/* 0xE0-0xE7 */ + 0x7B84,0x7CB9,0x7CBD,0x7CBE,0x7DBB,0x7DB0,0x7D9C,0x7DBD,/* 0xE8-0xEF */ + 0x7DBE,0x7DA0,0x7DCA,0x7DB4,0x7DB2,0x7DB1,0x7DBA,0x7DA2,/* 0xF0-0xF7 */ + 0x7DBF,0x7DB5,0x7DB8,0x7DAD,0x7DD2,0x7DC7,0x7DAC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t 
c2u_BB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7F70,0x7FE0,0x7FE1,0x7FDF,0x805E,0x805A,0x8087,0x8150,/* 0x40-0x47 */ + 0x8180,0x818F,0x8188,0x818A,0x817F,0x8182,0x81E7,0x81FA,/* 0x48-0x4F */ + 0x8207,0x8214,0x821E,0x824B,0x84C9,0x84BF,0x84C6,0x84C4,/* 0x50-0x57 */ + 0x8499,0x849E,0x84B2,0x849C,0x84CB,0x84B8,0x84C0,0x84D3,/* 0x58-0x5F */ + 0x8490,0x84BC,0x84D1,0x84CA,0x873F,0x871C,0x873B,0x8722,/* 0x60-0x67 */ + 0x8725,0x8734,0x8718,0x8755,0x8737,0x8729,0x88F3,0x8902,/* 0x68-0x6F */ + 0x88F4,0x88F9,0x88F8,0x88FD,0x88E8,0x891A,0x88EF,0x8AA6,/* 0x70-0x77 */ + 0x8A8C,0x8A9E,0x8AA3,0x8A8D,0x8AA1,0x8A93,0x8AA4,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8AAA,0x8AA5,0x8AA8,0x8A98,0x8A91,0x8A9A,0x8AA7,/* 0xA0-0xA7 */ + 0x8C6A,0x8C8D,0x8C8C,0x8CD3,0x8CD1,0x8CD2,0x8D6B,0x8D99,/* 0xA8-0xAF */ + 0x8D95,0x8DFC,0x8F14,0x8F12,0x8F15,0x8F13,0x8FA3,0x9060,/* 0xB0-0xB7 */ + 0x9058,0x905C,0x9063,0x9059,0x905E,0x9062,0x905D,0x905B,/* 0xB8-0xBF */ + 0x9119,0x9118,0x911E,0x9175,0x9178,0x9177,0x9174,0x9278,/* 0xC0-0xC7 */ + 0x9280,0x9285,0x9298,0x9296,0x927B,0x9293,0x929C,0x92A8,/* 0xC8-0xCF */ + 0x927C,0x9291,0x95A1,0x95A8,0x95A9,0x95A3,0x95A5,0x95A4,/* 0xD0-0xD7 */ + 0x9699,0x969C,0x969B,0x96CC,0x96D2,0x9700,0x977C,0x9785,/* 0xD8-0xDF */ + 0x97F6,0x9817,0x9818,0x98AF,0x98B1,0x9903,0x9905,0x990C,/* 0xE0-0xE7 */ + 0x9909,0x99C1,0x9AAF,0x9AB0,0x9AE6,0x9B41,0x9B42,0x9CF4,/* 0xE8-0xEF */ + 0x9CF6,0x9CF3,0x9EBC,0x9F3B,0x9F4A,0x5104,0x5100,0x50FB,/* 0xF0-0xF7 */ + 0x50F5,0x50F9,0x5102,0x5108,0x5109,0x5105,0x51DC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5287,0x5288,0x5289,0x528D,0x528A,0x52F0,0x53B2,0x562E,/* 0x40-0x47 */ + 0x563B,0x5639,0x5632,0x563F,0x5634,0x5629,0x5653,0x564E,/* 0x48-0x4F */ + 0x5657,0x5674,0x5636,0x562F,0x5630,0x5880,0x589F,0x589E,/* 0x50-0x57 */ + 0x58B3,0x589C,0x58AE,0x58A9,0x58A6,0x596D,0x5B09,0x5AFB,/* 0x58-0x5F */ + 0x5B0B,0x5AF5,0x5B0C,0x5B08,0x5BEE,0x5BEC,0x5BE9,0x5BEB,/* 0x60-0x67 */ + 0x5C64,0x5C65,0x5D9D,0x5D94,0x5E62,0x5E5F,0x5E61,0x5EE2,/* 0x68-0x6F */ + 0x5EDA,0x5EDF,0x5EDD,0x5EE3,0x5EE0,0x5F48,0x5F71,0x5FB7,/* 0x70-0x77 */ + 
0x5FB5,0x6176,0x6167,0x616E,0x615D,0x6155,0x6182,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x617C,0x6170,0x616B,0x617E,0x61A7,0x6190,0x61AB,/* 0xA0-0xA7 */ + 0x618E,0x61AC,0x619A,0x61A4,0x6194,0x61AE,0x622E,0x6469,/* 0xA8-0xAF */ + 0x646F,0x6479,0x649E,0x64B2,0x6488,0x6490,0x64B0,0x64A5,/* 0xB0-0xB7 */ + 0x6493,0x6495,0x64A9,0x6492,0x64AE,0x64AD,0x64AB,0x649A,/* 0xB8-0xBF */ + 0x64AC,0x6499,0x64A2,0x64B3,0x6575,0x6577,0x6578,0x66AE,/* 0xC0-0xC7 */ + 0x66AB,0x66B4,0x66B1,0x6A23,0x6A1F,0x69E8,0x6A01,0x6A1E,/* 0xC8-0xCF */ + 0x6A19,0x69FD,0x6A21,0x6A13,0x6A0A,0x69F3,0x6A02,0x6A05,/* 0xD0-0xD7 */ + 0x69ED,0x6A11,0x6B50,0x6B4E,0x6BA4,0x6BC5,0x6BC6,0x6F3F,/* 0xD8-0xDF */ + 0x6F7C,0x6F84,0x6F51,0x6F66,0x6F54,0x6F86,0x6F6D,0x6F5B,/* 0xE0-0xE7 */ + 0x6F78,0x6F6E,0x6F8E,0x6F7A,0x6F70,0x6F64,0x6F97,0x6F58,/* 0xE8-0xEF */ + 0x6ED5,0x6F6F,0x6F60,0x6F5F,0x719F,0x71AC,0x71B1,0x71A8,/* 0xF0-0xF7 */ + 0x7256,0x729B,0x734E,0x7357,0x7469,0x748B,0x7483,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x747E,0x7480,0x757F,0x7620,0x7629,0x761F,0x7624,0x7626,/* 0x40-0x47 */ + 0x7621,0x7622,0x769A,0x76BA,0x76E4,0x778E,0x7787,0x778C,/* 0x48-0x4F */ + 0x7791,0x778B,0x78CB,0x78C5,0x78BA,0x78CA,0x78BE,0x78D5,/* 0x50-0x57 */ + 0x78BC,0x78D0,0x7A3F,0x7A3C,0x7A40,0x7A3D,0x7A37,0x7A3B,/* 0x58-0x5F */ + 0x7AAF,0x7AAE,0x7BAD,0x7BB1,0x7BC4,0x7BB4,0x7BC6,0x7BC7,/* 0x60-0x67 */ + 0x7BC1,0x7BA0,0x7BCC,0x7CCA,0x7DE0,0x7DF4,0x7DEF,0x7DFB,/* 0x68-0x6F */ + 0x7DD8,0x7DEC,0x7DDD,0x7DE8,0x7DE3,0x7DDA,0x7DDE,0x7DE9,/* 0x70-0x77 */ + 0x7D9E,0x7DD9,0x7DF2,0x7DF9,0x7F75,0x7F77,0x7FAF,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7FE9,0x8026,0x819B,0x819C,0x819D,0x81A0,0x819A,/* 0xA0-0xA7 */ + 0x8198,0x8517,0x853D,0x851A,0x84EE,0x852C,0x852D,0x8513,/* 0xA8-0xAF */ + 0x8511,0x8523,0x8521,0x8514,0x84EC,0x8525,0x84FF,0x8506,/* 0xB0-0xB7 */ + 0x8782,0x8774,0x8776,0x8760,0x8766,0x8778,0x8768,0x8759,/* 0xB8-0xBF */ + 0x8757,0x874C,0x8753,0x885B,0x885D,0x8910,0x8907,0x8912,/* 0xC0-0xC7 */ + 0x8913,0x8915,0x890A,0x8ABC,0x8AD2,0x8AC7,0x8AC4,0x8A95,/* 0xC8-0xCF */ + 0x8ACB,0x8AF8,0x8AB2,0x8AC9,0x8AC2,0x8ABF,0x8AB0,0x8AD6,/* 0xD0-0xD7 */ + 0x8ACD,0x8AB6,0x8AB9,0x8ADB,0x8C4C,0x8C4E,0x8C6C,0x8CE0,/* 0xD8-0xDF */ + 0x8CDE,0x8CE6,0x8CE4,0x8CEC,0x8CED,0x8CE2,0x8CE3,0x8CDC,/* 0xE0-0xE7 */ + 0x8CEA,0x8CE1,0x8D6D,0x8D9F,0x8DA3,0x8E2B,0x8E10,0x8E1D,/* 0xE8-0xEF */ + 
0x8E22,0x8E0F,0x8E29,0x8E1F,0x8E21,0x8E1E,0x8EBA,0x8F1D,/* 0xF0-0xF7 */ + 0x8F1B,0x8F1F,0x8F29,0x8F26,0x8F2A,0x8F1C,0x8F1E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8F25,0x9069,0x906E,0x9068,0x906D,0x9077,0x9130,0x912D,/* 0x40-0x47 */ + 0x9127,0x9131,0x9187,0x9189,0x918B,0x9183,0x92C5,0x92BB,/* 0x48-0x4F */ + 0x92B7,0x92EA,0x92AC,0x92E4,0x92C1,0x92B3,0x92BC,0x92D2,/* 0x50-0x57 */ + 0x92C7,0x92F0,0x92B2,0x95AD,0x95B1,0x9704,0x9706,0x9707,/* 0x58-0x5F */ + 0x9709,0x9760,0x978D,0x978B,0x978F,0x9821,0x982B,0x981C,/* 0x60-0x67 */ + 0x98B3,0x990A,0x9913,0x9912,0x9918,0x99DD,0x99D0,0x99DF,/* 0x68-0x6F */ + 0x99DB,0x99D1,0x99D5,0x99D2,0x99D9,0x9AB7,0x9AEE,0x9AEF,/* 0x70-0x77 */ + 0x9B27,0x9B45,0x9B44,0x9B77,0x9B6F,0x9D06,0x9D09,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9D03,0x9EA9,0x9EBE,0x9ECE,0x58A8,0x9F52,0x5112,/* 0xA0-0xA7 */ + 0x5118,0x5114,0x5110,0x5115,0x5180,0x51AA,0x51DD,0x5291,/* 0xA8-0xAF */ + 0x5293,0x52F3,0x5659,0x566B,0x5679,0x5669,0x5664,0x5678,/* 0xB0-0xB7 */ + 0x566A,0x5668,0x5665,0x5671,0x566F,0x566C,0x5662,0x5676,/* 0xB8-0xBF */ + 0x58C1,0x58BE,0x58C7,0x58C5,0x596E,0x5B1D,0x5B34,0x5B78,/* 0xC0-0xC7 */ + 0x5BF0,0x5C0E,0x5F4A,0x61B2,0x6191,0x61A9,0x618A,0x61CD,/* 0xC8-0xCF */ + 0x61B6,0x61BE,0x61CA,0x61C8,0x6230,0x64C5,0x64C1,0x64CB,/* 0xD0-0xD7 */ + 0x64BB,0x64BC,0x64DA,0x64C4,0x64C7,0x64C2,0x64CD,0x64BF,/* 0xD8-0xDF */ + 0x64D2,0x64D4,0x64BE,0x6574,0x66C6,0x66C9,0x66B9,0x66C4,/* 0xE0-0xE7 */ + 0x66C7,0x66B8,0x6A3D,0x6A38,0x6A3A,0x6A59,0x6A6B,0x6A58,/* 0xE8-0xEF */ + 0x6A39,0x6A44,0x6A62,0x6A61,0x6A4B,0x6A47,0x6A35,0x6A5F,/* 0xF0-0xF7 */ + 0x6A48,0x6B59,0x6B77,0x6C05,0x6FC2,0x6FB1,0x6FA1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_BF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6FC3,0x6FA4,0x6FC1,0x6FA7,0x6FB3,0x6FC0,0x6FB9,0x6FB6,/* 0x40-0x47 */ + 0x6FA6,0x6FA0,0x6FB4,0x71BE,0x71C9,0x71D0,0x71D2,0x71C8,/* 0x48-0x4F */ + 0x71D5,0x71B9,0x71CE,0x71D9,0x71DC,0x71C3,0x71C4,0x7368,/* 0x50-0x57 */ + 0x749C,0x74A3,0x7498,0x749F,0x749E,0x74E2,0x750C,0x750D,/* 0x58-0x5F */ + 0x7634,0x7638,0x763A,0x76E7,0x76E5,0x77A0,0x779E,0x779F,/* 
0x60-0x67 */ + 0x77A5,0x78E8,0x78DA,0x78EC,0x78E7,0x79A6,0x7A4D,0x7A4E,/* 0x68-0x6F */ + 0x7A46,0x7A4C,0x7A4B,0x7ABA,0x7BD9,0x7C11,0x7BC9,0x7BE4,/* 0x70-0x77 */ + 0x7BDB,0x7BE1,0x7BE9,0x7BE6,0x7CD5,0x7CD6,0x7E0A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7E11,0x7E08,0x7E1B,0x7E23,0x7E1E,0x7E1D,0x7E09,/* 0xA0-0xA7 */ + 0x7E10,0x7F79,0x7FB2,0x7FF0,0x7FF1,0x7FEE,0x8028,0x81B3,/* 0xA8-0xAF */ + 0x81A9,0x81A8,0x81FB,0x8208,0x8258,0x8259,0x854A,0x8559,/* 0xB0-0xB7 */ + 0x8548,0x8568,0x8569,0x8543,0x8549,0x856D,0x856A,0x855E,/* 0xB8-0xBF */ + 0x8783,0x879F,0x879E,0x87A2,0x878D,0x8861,0x892A,0x8932,/* 0xC0-0xC7 */ + 0x8925,0x892B,0x8921,0x89AA,0x89A6,0x8AE6,0x8AFA,0x8AEB,/* 0xC8-0xCF */ + 0x8AF1,0x8B00,0x8ADC,0x8AE7,0x8AEE,0x8AFE,0x8B01,0x8B02,/* 0xD0-0xD7 */ + 0x8AF7,0x8AED,0x8AF3,0x8AF6,0x8AFC,0x8C6B,0x8C6D,0x8C93,/* 0xD8-0xDF */ + 0x8CF4,0x8E44,0x8E31,0x8E34,0x8E42,0x8E39,0x8E35,0x8F3B,/* 0xE0-0xE7 */ + 0x8F2F,0x8F38,0x8F33,0x8FA8,0x8FA6,0x9075,0x9074,0x9078,/* 0xE8-0xEF */ + 0x9072,0x907C,0x907A,0x9134,0x9192,0x9320,0x9336,0x92F8,/* 0xF0-0xF7 */ + 0x9333,0x932F,0x9322,0x92FC,0x932B,0x9304,0x931A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9310,0x9326,0x9321,0x9315,0x932E,0x9319,0x95BB,0x96A7,/* 0x40-0x47 */ + 0x96A8,0x96AA,0x96D5,0x970E,0x9711,0x9716,0x970D,0x9713,/* 0x48-0x4F */ + 0x970F,0x975B,0x975C,0x9766,0x9798,0x9830,0x9838,0x983B,/* 0x50-0x57 */ + 0x9837,0x982D,0x9839,0x9824,0x9910,0x9928,0x991E,0x991B,/* 0x58-0x5F */ + 0x9921,0x991A,0x99ED,0x99E2,0x99F1,0x9AB8,0x9ABC,0x9AFB,/* 0x60-0x67 */ + 0x9AED,0x9B28,0x9B91,0x9D15,0x9D23,0x9D26,0x9D28,0x9D12,/* 0x68-0x6F */ + 0x9D1B,0x9ED8,0x9ED4,0x9F8D,0x9F9C,0x512A,0x511F,0x5121,/* 0x70-0x77 */ + 0x5132,0x52F5,0x568E,0x5680,0x5690,0x5685,0x5687,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x568F,0x58D5,0x58D3,0x58D1,0x58CE,0x5B30,0x5B2A,/* 0xA0-0xA7 */ + 0x5B24,0x5B7A,0x5C37,0x5C68,0x5DBC,0x5DBA,0x5DBD,0x5DB8,/* 0xA8-0xAF */ + 0x5E6B,0x5F4C,0x5FBD,0x61C9,0x61C2,0x61C7,0x61E6,0x61CB,/* 0xB0-0xB7 */ + 0x6232,0x6234,0x64CE,0x64CA,0x64D8,0x64E0,0x64F0,0x64E6,/* 0xB8-0xBF */ + 0x64EC,0x64F1,0x64E2,0x64ED,0x6582,0x6583,0x66D9,0x66D6,/* 0xC0-0xC7 */ + 0x6A80,0x6A94,0x6A84,0x6AA2,0x6A9C,0x6ADB,0x6AA3,0x6A7E,/* 0xC8-0xCF */ + 0x6A97,0x6A90,0x6AA0,0x6B5C,0x6BAE,0x6BDA,0x6C08,0x6FD8,/* 0xD0-0xD7 */ + 0x6FF1,0x6FDF,0x6FE0,0x6FDB,0x6FE4,0x6FEB,0x6FEF,0x6F80,/* 0xD8-0xDF */ + 
0x6FEC,0x6FE1,0x6FE9,0x6FD5,0x6FEE,0x6FF0,0x71E7,0x71DF,/* 0xE0-0xE7 */ + 0x71EE,0x71E6,0x71E5,0x71ED,0x71EC,0x71F4,0x71E0,0x7235,/* 0xE8-0xEF */ + 0x7246,0x7370,0x7372,0x74A9,0x74B0,0x74A6,0x74A8,0x7646,/* 0xF0-0xF7 */ + 0x7642,0x764C,0x76EA,0x77B3,0x77AA,0x77B0,0x77AC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x77A7,0x77AD,0x77EF,0x78F7,0x78FA,0x78F4,0x78EF,0x7901,/* 0x40-0x47 */ + 0x79A7,0x79AA,0x7A57,0x7ABF,0x7C07,0x7C0D,0x7BFE,0x7BF7,/* 0x48-0x4F */ + 0x7C0C,0x7BE0,0x7CE0,0x7CDC,0x7CDE,0x7CE2,0x7CDF,0x7CD9,/* 0x50-0x57 */ + 0x7CDD,0x7E2E,0x7E3E,0x7E46,0x7E37,0x7E32,0x7E43,0x7E2B,/* 0x58-0x5F */ + 0x7E3D,0x7E31,0x7E45,0x7E41,0x7E34,0x7E39,0x7E48,0x7E35,/* 0x60-0x67 */ + 0x7E3F,0x7E2F,0x7F44,0x7FF3,0x7FFC,0x8071,0x8072,0x8070,/* 0x68-0x6F */ + 0x806F,0x8073,0x81C6,0x81C3,0x81BA,0x81C2,0x81C0,0x81BF,/* 0x70-0x77 */ + 0x81BD,0x81C9,0x81BE,0x81E8,0x8209,0x8271,0x85AA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8584,0x857E,0x859C,0x8591,0x8594,0x85AF,0x859B,/* 0xA0-0xA7 */ + 0x8587,0x85A8,0x858A,0x8667,0x87C0,0x87D1,0x87B3,0x87D2,/* 0xA8-0xAF */ + 0x87C6,0x87AB,0x87BB,0x87BA,0x87C8,0x87CB,0x893B,0x8936,/* 0xB0-0xB7 */ + 0x8944,0x8938,0x893D,0x89AC,0x8B0E,0x8B17,0x8B19,0x8B1B,/* 0xB8-0xBF */ + 0x8B0A,0x8B20,0x8B1D,0x8B04,0x8B10,0x8C41,0x8C3F,0x8C73,/* 0xC0-0xC7 */ + 0x8CFA,0x8CFD,0x8CFC,0x8CF8,0x8CFB,0x8DA8,0x8E49,0x8E4B,/* 0xC8-0xCF */ + 0x8E48,0x8E4A,0x8F44,0x8F3E,0x8F42,0x8F45,0x8F3F,0x907F,/* 0xD0-0xD7 */ + 0x907D,0x9084,0x9081,0x9082,0x9080,0x9139,0x91A3,0x919E,/* 0xD8-0xDF */ + 0x919C,0x934D,0x9382,0x9328,0x9375,0x934A,0x9365,0x934B,/* 0xE0-0xE7 */ + 0x9318,0x937E,0x936C,0x935B,0x9370,0x935A,0x9354,0x95CA,/* 0xE8-0xEF */ + 0x95CB,0x95CC,0x95C8,0x95C6,0x96B1,0x96B8,0x96D6,0x971C,/* 0xF0-0xF7 */ + 0x971E,0x97A0,0x97D3,0x9846,0x98B6,0x9935,0x9A01,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x99FF,0x9BAE,0x9BAB,0x9BAA,0x9BAD,0x9D3B,0x9D3F,0x9E8B,/* 0x40-0x47 */ + 0x9ECF,0x9EDE,0x9EDC,0x9EDD,0x9EDB,0x9F3E,0x9F4B,0x53E2,/* 0x48-0x4F */ + 0x5695,0x56AE,0x58D9,0x58D8,0x5B38,0x5F5D,0x61E3,0x6233,/* 
0x50-0x57 */ + 0x64F4,0x64F2,0x64FE,0x6506,0x64FA,0x64FB,0x64F7,0x65B7,/* 0x58-0x5F */ + 0x66DC,0x6726,0x6AB3,0x6AAC,0x6AC3,0x6ABB,0x6AB8,0x6AC2,/* 0x60-0x67 */ + 0x6AAE,0x6AAF,0x6B5F,0x6B78,0x6BAF,0x7009,0x700B,0x6FFE,/* 0x68-0x6F */ + 0x7006,0x6FFA,0x7011,0x700F,0x71FB,0x71FC,0x71FE,0x71F8,/* 0x70-0x77 */ + 0x7377,0x7375,0x74A7,0x74BF,0x7515,0x7656,0x7658,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7652,0x77BD,0x77BF,0x77BB,0x77BC,0x790E,0x79AE,/* 0xA0-0xA7 */ + 0x7A61,0x7A62,0x7A60,0x7AC4,0x7AC5,0x7C2B,0x7C27,0x7C2A,/* 0xA8-0xAF */ + 0x7C1E,0x7C23,0x7C21,0x7CE7,0x7E54,0x7E55,0x7E5E,0x7E5A,/* 0xB0-0xB7 */ + 0x7E61,0x7E52,0x7E59,0x7F48,0x7FF9,0x7FFB,0x8077,0x8076,/* 0xB8-0xBF */ + 0x81CD,0x81CF,0x820A,0x85CF,0x85A9,0x85CD,0x85D0,0x85C9,/* 0xC0-0xC7 */ + 0x85B0,0x85BA,0x85B9,0x85A6,0x87EF,0x87EC,0x87F2,0x87E0,/* 0xC8-0xCF */ + 0x8986,0x89B2,0x89F4,0x8B28,0x8B39,0x8B2C,0x8B2B,0x8C50,/* 0xD0-0xD7 */ + 0x8D05,0x8E59,0x8E63,0x8E66,0x8E64,0x8E5F,0x8E55,0x8EC0,/* 0xD8-0xDF */ + 0x8F49,0x8F4D,0x9087,0x9083,0x9088,0x91AB,0x91AC,0x91D0,/* 0xE0-0xE7 */ + 0x9394,0x938A,0x9396,0x93A2,0x93B3,0x93AE,0x93AC,0x93B0,/* 0xE8-0xEF */ + 0x9398,0x939A,0x9397,0x95D4,0x95D6,0x95D0,0x95D5,0x96E2,/* 0xF0-0xF7 */ + 0x96DC,0x96D9,0x96DB,0x96DE,0x9724,0x97A3,0x97A6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x97AD,0x97F9,0x984D,0x984F,0x984C,0x984E,0x9853,0x98BA,/* 0x40-0x47 */ + 0x993E,0x993F,0x993D,0x992E,0x99A5,0x9A0E,0x9AC1,0x9B03,/* 0x48-0x4F */ + 0x9B06,0x9B4F,0x9B4E,0x9B4D,0x9BCA,0x9BC9,0x9BFD,0x9BC8,/* 0x50-0x57 */ + 0x9BC0,0x9D51,0x9D5D,0x9D60,0x9EE0,0x9F15,0x9F2C,0x5133,/* 0x58-0x5F */ + 0x56A5,0x58DE,0x58DF,0x58E2,0x5BF5,0x9F90,0x5EEC,0x61F2,/* 0x60-0x67 */ + 0x61F7,0x61F6,0x61F5,0x6500,0x650F,0x66E0,0x66DD,0x6AE5,/* 0x68-0x6F */ + 0x6ADD,0x6ADA,0x6AD3,0x701B,0x701F,0x7028,0x701A,0x701D,/* 0x70-0x77 */ + 0x7015,0x7018,0x7206,0x720D,0x7258,0x72A2,0x7378,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x737A,0x74BD,0x74CA,0x74E3,0x7587,0x7586,0x765F,/* 0xA0-0xA7 */ + 0x7661,0x77C7,0x7919,0x79B1,0x7A6B,0x7A69,0x7C3E,0x7C3F,/* 0xA8-0xAF */ + 0x7C38,0x7C3D,0x7C37,0x7C40,0x7E6B,0x7E6D,0x7E79,0x7E69,/* 0xB0-0xB7 */ + 0x7E6A,0x7F85,0x7E73,0x7FB6,0x7FB9,0x7FB8,0x81D8,0x85E9,/* 0xB8-0xBF */ + 0x85DD,0x85EA,0x85D5,0x85E4,0x85E5,0x85F7,0x87FB,0x8805,/* 0xC0-0xC7 */ + 0x880D,0x87F9,0x87FE,0x8960,0x895F,0x8956,0x895E,0x8B41,/* 0xC8-0xCF */ + 
0x8B5C,0x8B58,0x8B49,0x8B5A,0x8B4E,0x8B4F,0x8B46,0x8B59,/* 0xD0-0xD7 */ + 0x8D08,0x8D0A,0x8E7C,0x8E72,0x8E87,0x8E76,0x8E6C,0x8E7A,/* 0xD8-0xDF */ + 0x8E74,0x8F54,0x8F4E,0x8FAD,0x908A,0x908B,0x91B1,0x91AE,/* 0xE0-0xE7 */ + 0x93E1,0x93D1,0x93DF,0x93C3,0x93C8,0x93DC,0x93DD,0x93D6,/* 0xE8-0xEF */ + 0x93E2,0x93CD,0x93D8,0x93E4,0x93D7,0x93E8,0x95DC,0x96B4,/* 0xF0-0xF7 */ + 0x96E3,0x972A,0x9727,0x9761,0x97DC,0x97FB,0x985E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x9858,0x985B,0x98BC,0x9945,0x9949,0x9A16,0x9A19,0x9B0D,/* 0x40-0x47 */ + 0x9BE8,0x9BE7,0x9BD6,0x9BDB,0x9D89,0x9D61,0x9D72,0x9D6A,/* 0x48-0x4F */ + 0x9D6C,0x9E92,0x9E97,0x9E93,0x9EB4,0x52F8,0x56A8,0x56B7,/* 0x50-0x57 */ + 0x56B6,0x56B4,0x56BC,0x58E4,0x5B40,0x5B43,0x5B7D,0x5BF6,/* 0x58-0x5F */ + 0x5DC9,0x61F8,0x61FA,0x6518,0x6514,0x6519,0x66E6,0x6727,/* 0x60-0x67 */ + 0x6AEC,0x703E,0x7030,0x7032,0x7210,0x737B,0x74CF,0x7662,/* 0x68-0x6F */ + 0x7665,0x7926,0x792A,0x792C,0x792B,0x7AC7,0x7AF6,0x7C4C,/* 0x70-0x77 */ + 0x7C43,0x7C4D,0x7CEF,0x7CF0,0x8FAE,0x7E7D,0x7E7C,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7E82,0x7F4C,0x8000,0x81DA,0x8266,0x85FB,0x85F9,/* 0xA0-0xA7 */ + 0x8611,0x85FA,0x8606,0x860B,0x8607,0x860A,0x8814,0x8815,/* 0xA8-0xAF */ + 0x8964,0x89BA,0x89F8,0x8B70,0x8B6C,0x8B66,0x8B6F,0x8B5F,/* 0xB0-0xB7 */ + 0x8B6B,0x8D0F,0x8D0D,0x8E89,0x8E81,0x8E85,0x8E82,0x91B4,/* 0xB8-0xBF */ + 0x91CB,0x9418,0x9403,0x93FD,0x95E1,0x9730,0x98C4,0x9952,/* 0xC0-0xC7 */ + 0x9951,0x99A8,0x9A2B,0x9A30,0x9A37,0x9A35,0x9C13,0x9C0D,/* 0xC8-0xCF */ + 0x9E79,0x9EB5,0x9EE8,0x9F2F,0x9F5F,0x9F63,0x9F61,0x5137,/* 0xD0-0xD7 */ + 0x5138,0x56C1,0x56C0,0x56C2,0x5914,0x5C6C,0x5DCD,0x61FC,/* 0xD8-0xDF */ + 0x61FE,0x651D,0x651C,0x6595,0x66E9,0x6AFB,0x6B04,0x6AFA,/* 0xE0-0xE7 */ + 0x6BB2,0x704C,0x721B,0x72A7,0x74D6,0x74D4,0x7669,0x77D3,/* 0xE8-0xEF */ + 0x7C50,0x7E8F,0x7E8C,0x7FBC,0x8617,0x862D,0x861A,0x8823,/* 0xF0-0xF7 */ + 0x8822,0x8821,0x881F,0x896A,0x896C,0x89BD,0x8B74,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B77,0x8B7D,0x8D13,0x8E8A,0x8E8D,0x8E8B,0x8F5F,0x8FAF,/* 
0x40-0x47 */ + 0x91BA,0x942E,0x9433,0x9435,0x943A,0x9438,0x9432,0x942B,/* 0x48-0x4F */ + 0x95E2,0x9738,0x9739,0x9732,0x97FF,0x9867,0x9865,0x9957,/* 0x50-0x57 */ + 0x9A45,0x9A43,0x9A40,0x9A3E,0x9ACF,0x9B54,0x9B51,0x9C2D,/* 0x58-0x5F */ + 0x9C25,0x9DAF,0x9DB4,0x9DC2,0x9DB8,0x9E9D,0x9EEF,0x9F19,/* 0x60-0x67 */ + 0x9F5C,0x9F66,0x9F67,0x513C,0x513B,0x56C8,0x56CA,0x56C9,/* 0x68-0x6F */ + 0x5B7F,0x5DD4,0x5DD2,0x5F4E,0x61FF,0x6524,0x6B0A,0x6B61,/* 0x70-0x77 */ + 0x7051,0x7058,0x7380,0x74E4,0x758A,0x766E,0x766C,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x79B3,0x7C60,0x7C5F,0x807E,0x807D,0x81DF,0x8972,/* 0xA0-0xA7 */ + 0x896F,0x89FC,0x8B80,0x8D16,0x8D17,0x8E91,0x8E93,0x8F61,/* 0xA8-0xAF */ + 0x9148,0x9444,0x9451,0x9452,0x973D,0x973E,0x97C3,0x97C1,/* 0xB0-0xB7 */ + 0x986B,0x9955,0x9A55,0x9A4D,0x9AD2,0x9B1A,0x9C49,0x9C31,/* 0xB8-0xBF */ + 0x9C3E,0x9C3B,0x9DD3,0x9DD7,0x9F34,0x9F6C,0x9F6A,0x9F94,/* 0xC0-0xC7 */ + 0x56CC,0x5DD6,0x6200,0x6523,0x652B,0x652A,0x66EC,0x6B10,/* 0xC8-0xCF */ + 0x74DA,0x7ACA,0x7C64,0x7C63,0x7C65,0x7E93,0x7E96,0x7E94,/* 0xD0-0xD7 */ + 0x81E2,0x8638,0x863F,0x8831,0x8B8A,0x9090,0x908F,0x9463,/* 0xD8-0xDF */ + 0x9460,0x9464,0x9768,0x986F,0x995C,0x9A5A,0x9A5B,0x9A57,/* 0xE0-0xE7 */ + 0x9AD3,0x9AD4,0x9AD1,0x9C54,0x9C57,0x9C56,0x9DE5,0x9E9F,/* 0xE8-0xEF */ + 0x9EF4,0x56D1,0x58E9,0x652C,0x705E,0x7671,0x7672,0x77D7,/* 0xF0-0xF7 */ + 0x7F50,0x7F88,0x8836,0x8839,0x8862,0x8B93,0x8B92,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_C6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B96,0x8277,0x8D1B,0x91C0,0x946A,0x9742,0x9748,0x9744,/* 0x40-0x47 */ + 0x97C6,0x9870,0x9A5F,0x9B22,0x9B58,0x9C5F,0x9DF9,0x9DFA,/* 0x48-0x4F */ + 0x9E7C,0x9E7D,0x9F07,0x9F77,0x9F72,0x5EF3,0x6B16,0x7063,/* 0x50-0x57 */ + 0x7C6C,0x7C6E,0x883B,0x89C0,0x8EA1,0x91C1,0x9472,0x9470,/* 0x58-0x5F */ + 0x9871,0x995E,0x9AD6,0x9B23,0x9ECC,0x7064,0x77DA,0x8B9A,/* 0x60-0x67 */ + 0x9477,0x97C9,0x9A62,0x9A65,0x7E9C,0x8B9C,0x8EAA,0x91C5,/* 0x68-0x6F */ + 0x947D,0x947E,0x947C,0x9C77,0x9C78,0x9EF7,0x8C54,0x947F,/* 0x70-0x77 */ + 0x9E1A,0x7228,0x9A6A,0x9B31,0x9E1B,0x9E1E,0x7C72,0x0000,/* 0x78-0x7F */ +}; + +static const wchar_t c2u_C9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x4E42,0x4E5C,0x51F5,0x531A,0x5382,0x4E07,0x4E0C,0x4E47,/* 0x40-0x47 */ + 0x4E8D,0x56D7,0xFA0C,0x5C6E,0x5F73,0x4E0F,0x5187,0x4E0E,/* 0x48-0x4F */ + 0x4E2E,0x4E93,0x4EC2,0x4EC9,0x4EC8,0x5198,0x52FC,0x536C,/* 0x50-0x57 */ + 0x53B9,0x5720,0x5903,0x592C,0x5C10,0x5DFF,0x65E1,0x6BB3,/* 0x58-0x5F */ + 0x6BCC,0x6C14,0x723F,0x4E31,0x4E3C,0x4EE8,0x4EDC,0x4EE9,/* 0x60-0x67 */ + 0x4EE1,0x4EDD,0x4EDA,0x520C,0x531C,0x534C,0x5722,0x5723,/* 0x68-0x6F */ + 0x5917,0x592F,0x5B81,0x5B84,0x5C12,0x5C3B,0x5C74,0x5C73,/* 0x70-0x77 */ + 0x5E04,0x5E80,0x5E82,0x5FC9,0x6209,0x6250,0x6C15,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6C36,0x6C43,0x6C3F,0x6C3B,0x72AE,0x72B0,0x738A,/* 0xA0-0xA7 */ + 0x79B8,0x808A,0x961E,0x4F0E,0x4F18,0x4F2C,0x4EF5,0x4F14,/* 0xA8-0xAF */ + 0x4EF1,0x4F00,0x4EF7,0x4F08,0x4F1D,0x4F02,0x4F05,0x4F22,/* 0xB0-0xB7 */ + 0x4F13,0x4F04,0x4EF4,0x4F12,0x51B1,0x5213,0x5209,0x5210,/* 0xB8-0xBF */ + 0x52A6,0x5322,0x531F,0x534D,0x538A,0x5407,0x56E1,0x56DF,/* 0xC0-0xC7 */ + 0x572E,0x572A,0x5734,0x593C,0x5980,0x597C,0x5985,0x597B,/* 0xC8-0xCF */ + 0x597E,0x5977,0x597F,0x5B56,0x5C15,0x5C25,0x5C7C,0x5C7A,/* 0xD0-0xD7 */ + 0x5C7B,0x5C7E,0x5DDF,0x5E75,0x5E84,0x5F02,0x5F1A,0x5F74,/* 0xD8-0xDF */ + 0x5FD5,0x5FD4,0x5FCF,0x625C,0x625E,0x6264,0x6261,0x6266,/* 0xE0-0xE7 */ + 0x6262,0x6259,0x6260,0x625A,0x6265,0x65EF,0x65EE,0x673E,/* 0xE8-0xEF */ + 0x6739,0x6738,0x673B,0x673A,0x673F,0x673C,0x6733,0x6C18,/* 0xF0-0xF7 */ + 0x6C46,0x6C52,0x6C5C,0x6C4F,0x6C4A,0x6C54,0x6C4B,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6C4C,0x7071,0x725E,0x72B4,0x72B5,0x738E,0x752A,0x767F,/* 0x40-0x47 */ + 0x7A75,0x7F51,0x8278,0x827C,0x8280,0x827D,0x827F,0x864D,/* 0x48-0x4F */ + 0x897E,0x9099,0x9097,0x9098,0x909B,0x9094,0x9622,0x9624,/* 0x50-0x57 */ + 0x9620,0x9623,0x4F56,0x4F3B,0x4F62,0x4F49,0x4F53,0x4F64,/* 0x58-0x5F */ + 0x4F3E,0x4F67,0x4F52,0x4F5F,0x4F41,0x4F58,0x4F2D,0x4F33,/* 0x60-0x67 */ + 0x4F3F,0x4F61,0x518F,0x51B9,0x521C,0x521E,0x5221,0x52AD,/* 0x68-0x6F */ + 0x52AE,0x5309,0x5363,0x5372,0x538E,0x538F,0x5430,0x5437,/* 0x70-0x77 */ + 0x542A,0x5454,0x5445,0x5419,0x541C,0x5425,0x5418,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x543D,0x544F,0x5441,0x5428,0x5424,0x5447,0x56EE,/* 0xA0-0xA7 */ + 0x56E7,0x56E5,0x5741,0x5745,0x574C,0x5749,0x574B,0x5752,/* 0xA8-0xAF */ + 
0x5906,0x5940,0x59A6,0x5998,0x59A0,0x5997,0x598E,0x59A2,/* 0xB0-0xB7 */ + 0x5990,0x598F,0x59A7,0x59A1,0x5B8E,0x5B92,0x5C28,0x5C2A,/* 0xB8-0xBF */ + 0x5C8D,0x5C8F,0x5C88,0x5C8B,0x5C89,0x5C92,0x5C8A,0x5C86,/* 0xC0-0xC7 */ + 0x5C93,0x5C95,0x5DE0,0x5E0A,0x5E0E,0x5E8B,0x5E89,0x5E8C,/* 0xC8-0xCF */ + 0x5E88,0x5E8D,0x5F05,0x5F1D,0x5F78,0x5F76,0x5FD2,0x5FD1,/* 0xD0-0xD7 */ + 0x5FD0,0x5FED,0x5FE8,0x5FEE,0x5FF3,0x5FE1,0x5FE4,0x5FE3,/* 0xD8-0xDF */ + 0x5FFA,0x5FEF,0x5FF7,0x5FFB,0x6000,0x5FF4,0x623A,0x6283,/* 0xE0-0xE7 */ + 0x628C,0x628E,0x628F,0x6294,0x6287,0x6271,0x627B,0x627A,/* 0xE8-0xEF */ + 0x6270,0x6281,0x6288,0x6277,0x627D,0x6272,0x6274,0x6537,/* 0xF0-0xF7 */ + 0x65F0,0x65F4,0x65F3,0x65F2,0x65F5,0x6745,0x6747,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6759,0x6755,0x674C,0x6748,0x675D,0x674D,0x675A,0x674B,/* 0x40-0x47 */ + 0x6BD0,0x6C19,0x6C1A,0x6C78,0x6C67,0x6C6B,0x6C84,0x6C8B,/* 0x48-0x4F */ + 0x6C8F,0x6C71,0x6C6F,0x6C69,0x6C9A,0x6C6D,0x6C87,0x6C95,/* 0x50-0x57 */ + 0x6C9C,0x6C66,0x6C73,0x6C65,0x6C7B,0x6C8E,0x7074,0x707A,/* 0x58-0x5F */ + 0x7263,0x72BF,0x72BD,0x72C3,0x72C6,0x72C1,0x72BA,0x72C5,/* 0x60-0x67 */ + 0x7395,0x7397,0x7393,0x7394,0x7392,0x753A,0x7539,0x7594,/* 0x68-0x6F */ + 0x7595,0x7681,0x793D,0x8034,0x8095,0x8099,0x8090,0x8092,/* 0x70-0x77 */ + 0x809C,0x8290,0x828F,0x8285,0x828E,0x8291,0x8293,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x828A,0x8283,0x8284,0x8C78,0x8FC9,0x8FBF,0x909F,/* 0xA0-0xA7 */ + 0x90A1,0x90A5,0x909E,0x90A7,0x90A0,0x9630,0x9628,0x962F,/* 0xA8-0xAF */ + 0x962D,0x4E33,0x4F98,0x4F7C,0x4F85,0x4F7D,0x4F80,0x4F87,/* 0xB0-0xB7 */ + 0x4F76,0x4F74,0x4F89,0x4F84,0x4F77,0x4F4C,0x4F97,0x4F6A,/* 0xB8-0xBF */ + 0x4F9A,0x4F79,0x4F81,0x4F78,0x4F90,0x4F9C,0x4F94,0x4F9E,/* 0xC0-0xC7 */ + 0x4F92,0x4F82,0x4F95,0x4F6B,0x4F6E,0x519E,0x51BC,0x51BE,/* 0xC8-0xCF */ + 0x5235,0x5232,0x5233,0x5246,0x5231,0x52BC,0x530A,0x530B,/* 0xD0-0xD7 */ + 0x533C,0x5392,0x5394,0x5487,0x547F,0x5481,0x5491,0x5482,/* 0xD8-0xDF */ + 0x5488,0x546B,0x547A,0x547E,0x5465,0x546C,0x5474,0x5466,/* 0xE0-0xE7 */ + 0x548D,0x546F,0x5461,0x5460,0x5498,0x5463,0x5467,0x5464,/* 0xE8-0xEF */ + 0x56F7,0x56F9,0x576F,0x5772,0x576D,0x576B,0x5771,0x5770,/* 0xF0-0xF7 */ + 0x5776,0x5780,0x5775,0x577B,0x5773,0x5774,0x5762,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5768,0x577D,0x590C,0x5945,0x59B5,0x59BA,0x59CF,0x59CE,/* 0x40-0x47 */ + 0x59B2,0x59CC,0x59C1,0x59B6,0x59BC,0x59C3,0x59D6,0x59B1,/* 0x48-0x4F */ + 0x59BD,0x59C0,0x59C8,0x59B4,0x59C7,0x5B62,0x5B65,0x5B93,/* 0x50-0x57 */ + 0x5B95,0x5C44,0x5C47,0x5CAE,0x5CA4,0x5CA0,0x5CB5,0x5CAF,/* 0x58-0x5F */ + 0x5CA8,0x5CAC,0x5C9F,0x5CA3,0x5CAD,0x5CA2,0x5CAA,0x5CA7,/* 0x60-0x67 */ + 0x5C9D,0x5CA5,0x5CB6,0x5CB0,0x5CA6,0x5E17,0x5E14,0x5E19,/* 0x68-0x6F */ + 0x5F28,0x5F22,0x5F23,0x5F24,0x5F54,0x5F82,0x5F7E,0x5F7D,/* 0x70-0x77 */ + 0x5FDE,0x5FE5,0x602D,0x6026,0x6019,0x6032,0x600B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6034,0x600A,0x6017,0x6033,0x601A,0x601E,0x602C,/* 0xA0-0xA7 */ + 0x6022,0x600D,0x6010,0x602E,0x6013,0x6011,0x600C,0x6009,/* 0xA8-0xAF */ + 0x601C,0x6214,0x623D,0x62AD,0x62B4,0x62D1,0x62BE,0x62AA,/* 0xB0-0xB7 */ + 0x62B6,0x62CA,0x62AE,0x62B3,0x62AF,0x62BB,0x62A9,0x62B0,/* 0xB8-0xBF */ + 0x62B8,0x653D,0x65A8,0x65BB,0x6609,0x65FC,0x6604,0x6612,/* 0xC0-0xC7 */ + 0x6608,0x65FB,0x6603,0x660B,0x660D,0x6605,0x65FD,0x6611,/* 0xC8-0xCF */ + 0x6610,0x66F6,0x670A,0x6785,0x676C,0x678E,0x6792,0x6776,/* 0xD0-0xD7 */ + 0x677B,0x6798,0x6786,0x6784,0x6774,0x678D,0x678C,0x677A,/* 0xD8-0xDF */ + 0x679F,0x6791,0x6799,0x6783,0x677D,0x6781,0x6778,0x6779,/* 0xE0-0xE7 */ + 0x6794,0x6B25,0x6B80,0x6B7E,0x6BDE,0x6C1D,0x6C93,0x6CEC,/* 0xE8-0xEF */ + 0x6CEB,0x6CEE,0x6CD9,0x6CB6,0x6CD4,0x6CAD,0x6CE7,0x6CB7,/* 0xF0-0xF7 */ + 0x6CD0,0x6CC2,0x6CBA,0x6CC3,0x6CC6,0x6CED,0x6CF2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6CD2,0x6CDD,0x6CB4,0x6C8A,0x6C9D,0x6C80,0x6CDE,0x6CC0,/* 0x40-0x47 */ + 0x6D30,0x6CCD,0x6CC7,0x6CB0,0x6CF9,0x6CCF,0x6CE9,0x6CD1,/* 0x48-0x4F */ + 0x7094,0x7098,0x7085,0x7093,0x7086,0x7084,0x7091,0x7096,/* 0x50-0x57 */ + 0x7082,0x709A,0x7083,0x726A,0x72D6,0x72CB,0x72D8,0x72C9,/* 0x58-0x5F */ + 0x72DC,0x72D2,0x72D4,0x72DA,0x72CC,0x72D1,0x73A4,0x73A1,/* 0x60-0x67 */ + 0x73AD,0x73A6,0x73A2,0x73A0,0x73AC,0x739D,0x74DD,0x74E8,/* 0x68-0x6F */ + 0x753F,0x7540,0x753E,0x758C,0x7598,0x76AF,0x76F3,0x76F1,/* 0x70-0x77 */ + 0x76F0,0x76F5,0x77F8,0x77FC,0x77F9,0x77FB,0x77FA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 
0x0000,0x77F7,0x7942,0x793F,0x79C5,0x7A78,0x7A7B,0x7AFB,/* 0xA0-0xA7 */ + 0x7C75,0x7CFD,0x8035,0x808F,0x80AE,0x80A3,0x80B8,0x80B5,/* 0xA8-0xAF */ + 0x80AD,0x8220,0x82A0,0x82C0,0x82AB,0x829A,0x8298,0x829B,/* 0xB0-0xB7 */ + 0x82B5,0x82A7,0x82AE,0x82BC,0x829E,0x82BA,0x82B4,0x82A8,/* 0xB8-0xBF */ + 0x82A1,0x82A9,0x82C2,0x82A4,0x82C3,0x82B6,0x82A2,0x8670,/* 0xC0-0xC7 */ + 0x866F,0x866D,0x866E,0x8C56,0x8FD2,0x8FCB,0x8FD3,0x8FCD,/* 0xC8-0xCF */ + 0x8FD6,0x8FD5,0x8FD7,0x90B2,0x90B4,0x90AF,0x90B3,0x90B0,/* 0xD0-0xD7 */ + 0x9639,0x963D,0x963C,0x963A,0x9643,0x4FCD,0x4FC5,0x4FD3,/* 0xD8-0xDF */ + 0x4FB2,0x4FC9,0x4FCB,0x4FC1,0x4FD4,0x4FDC,0x4FD9,0x4FBB,/* 0xE0-0xE7 */ + 0x4FB3,0x4FDB,0x4FC7,0x4FD6,0x4FBA,0x4FC0,0x4FB9,0x4FEC,/* 0xE8-0xEF */ + 0x5244,0x5249,0x52C0,0x52C2,0x533D,0x537C,0x5397,0x5396,/* 0xF0-0xF7 */ + 0x5399,0x5398,0x54BA,0x54A1,0x54AD,0x54A5,0x54CF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x54C3,0x830D,0x54B7,0x54AE,0x54D6,0x54B6,0x54C5,0x54C6,/* 0x40-0x47 */ + 0x54A0,0x5470,0x54BC,0x54A2,0x54BE,0x5472,0x54DE,0x54B0,/* 0x48-0x4F */ + 0x57B5,0x579E,0x579F,0x57A4,0x578C,0x5797,0x579D,0x579B,/* 0x50-0x57 */ + 0x5794,0x5798,0x578F,0x5799,0x57A5,0x579A,0x5795,0x58F4,/* 0x58-0x5F */ + 0x590D,0x5953,0x59E1,0x59DE,0x59EE,0x5A00,0x59F1,0x59DD,/* 0x60-0x67 */ + 0x59FA,0x59FD,0x59FC,0x59F6,0x59E4,0x59F2,0x59F7,0x59DB,/* 0x68-0x6F */ + 0x59E9,0x59F3,0x59F5,0x59E0,0x59FE,0x59F4,0x59ED,0x5BA8,/* 0x70-0x77 */ + 0x5C4C,0x5CD0,0x5CD8,0x5CCC,0x5CD7,0x5CCB,0x5CDB,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5CDE,0x5CDA,0x5CC9,0x5CC7,0x5CCA,0x5CD6,0x5CD3,/* 0xA0-0xA7 */ + 0x5CD4,0x5CCF,0x5CC8,0x5CC6,0x5CCE,0x5CDF,0x5CF8,0x5DF9,/* 0xA8-0xAF */ + 0x5E21,0x5E22,0x5E23,0x5E20,0x5E24,0x5EB0,0x5EA4,0x5EA2,/* 0xB0-0xB7 */ + 0x5E9B,0x5EA3,0x5EA5,0x5F07,0x5F2E,0x5F56,0x5F86,0x6037,/* 0xB8-0xBF */ + 0x6039,0x6054,0x6072,0x605E,0x6045,0x6053,0x6047,0x6049,/* 0xC0-0xC7 */ + 0x605B,0x604C,0x6040,0x6042,0x605F,0x6024,0x6044,0x6058,/* 0xC8-0xCF */ + 0x6066,0x606E,0x6242,0x6243,0x62CF,0x630D,0x630B,0x62F5,/* 0xD0-0xD7 */ + 0x630E,0x6303,0x62EB,0x62F9,0x630F,0x630C,0x62F8,0x62F6,/* 0xD8-0xDF */ + 0x6300,0x6313,0x6314,0x62FA,0x6315,0x62FB,0x62F0,0x6541,/* 0xE0-0xE7 */ + 0x6543,0x65AA,0x65BF,0x6636,0x6621,0x6632,0x6635,0x661C,/* 0xE8-0xEF */ + 0x6626,0x6622,0x6633,0x662B,0x663A,0x661D,0x6634,0x6639,/* 0xF0-0xF7 */ + 0x662E,0x670F,0x6710,0x67C1,0x67F2,0x67C8,0x67BA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_CF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x67DC,0x67BB,0x67F8,0x67D8,0x67C0,0x67B7,0x67C5,0x67EB,/* 0x40-0x47 */ + 0x67E4,0x67DF,0x67B5,0x67CD,0x67B3,0x67F7,0x67F6,0x67EE,/* 0x48-0x4F */ + 0x67E3,0x67C2,0x67B9,0x67CE,0x67E7,0x67F0,0x67B2,0x67FC,/* 0x50-0x57 */ + 0x67C6,0x67ED,0x67CC,0x67AE,0x67E6,0x67DB,0x67FA,0x67C9,/* 0x58-0x5F */ + 0x67CA,0x67C3,0x67EA,0x67CB,0x6B28,0x6B82,0x6B84,0x6BB6,/* 0x60-0x67 */ + 0x6BD6,0x6BD8,0x6BE0,0x6C20,0x6C21,0x6D28,0x6D34,0x6D2D,/* 0x68-0x6F */ + 0x6D1F,0x6D3C,0x6D3F,0x6D12,0x6D0A,0x6CDA,0x6D33,0x6D04,/* 0x70-0x77 */ + 0x6D19,0x6D3A,0x6D1A,0x6D11,0x6D00,0x6D1D,0x6D42,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6D01,0x6D18,0x6D37,0x6D03,0x6D0F,0x6D40,0x6D07,/* 0xA0-0xA7 */ + 0x6D20,0x6D2C,0x6D08,0x6D22,0x6D09,0x6D10,0x70B7,0x709F,/* 0xA8-0xAF */ + 0x70BE,0x70B1,0x70B0,0x70A1,0x70B4,0x70B5,0x70A9,0x7241,/* 0xB0-0xB7 */ + 0x7249,0x724A,0x726C,0x7270,0x7273,0x726E,0x72CA,0x72E4,/* 0xB8-0xBF */ + 0x72E8,0x72EB,0x72DF,0x72EA,0x72E6,0x72E3,0x7385,0x73CC,/* 0xC0-0xC7 */ + 0x73C2,0x73C8,0x73C5,0x73B9,0x73B6,0x73B5,0x73B4,0x73EB,/* 0xC8-0xCF */ + 0x73BF,0x73C7,0x73BE,0x73C3,0x73C6,0x73B8,0x73CB,0x74EC,/* 0xD0-0xD7 */ + 0x74EE,0x752E,0x7547,0x7548,0x75A7,0x75AA,0x7679,0x76C4,/* 0xD8-0xDF */ + 0x7708,0x7703,0x7704,0x7705,0x770A,0x76F7,0x76FB,0x76FA,/* 0xE0-0xE7 */ + 0x77E7,0x77E8,0x7806,0x7811,0x7812,0x7805,0x7810,0x780F,/* 0xE8-0xEF */ + 0x780E,0x7809,0x7803,0x7813,0x794A,0x794C,0x794B,0x7945,/* 0xF0-0xF7 */ + 0x7944,0x79D5,0x79CD,0x79CF,0x79D6,0x79CE,0x7A80,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7A7E,0x7AD1,0x7B00,0x7B01,0x7C7A,0x7C78,0x7C79,0x7C7F,/* 0x40-0x47 */ + 0x7C80,0x7C81,0x7D03,0x7D08,0x7D01,0x7F58,0x7F91,0x7F8D,/* 0x48-0x4F */ + 0x7FBE,0x8007,0x800E,0x800F,0x8014,0x8037,0x80D8,0x80C7,/* 0x50-0x57 */ + 0x80E0,0x80D1,0x80C8,0x80C2,0x80D0,0x80C5,0x80E3,0x80D9,/* 0x58-0x5F */ + 0x80DC,0x80CA,0x80D5,0x80C9,0x80CF,0x80D7,0x80E6,0x80CD,/* 0x60-0x67 */ + 0x81FF,0x8221,0x8294,0x82D9,0x82FE,0x82F9,0x8307,0x82E8,/* 0x68-0x6F */ + 0x8300,0x82D5,0x833A,0x82EB,0x82D6,0x82F4,0x82EC,0x82E1,/* 0x70-0x77 */ + 0x82F2,0x82F5,0x830C,0x82FB,0x82F6,0x82F0,0x82EA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x82E4,0x82E0,0x82FA,0x82F3,0x82ED,0x8677,0x8674,/* 0xA0-0xA7 */ + 0x867C,0x8673,0x8841,0x884E,0x8867,0x886A,0x8869,0x89D3,/* 0xA8-0xAF */ + 0x8A04,0x8A07,0x8D72,0x8FE3,0x8FE1,0x8FEE,0x8FE0,0x90F1,/* 0xB0-0xB7 */ + 0x90BD,0x90BF,0x90D5,0x90C5,0x90BE,0x90C7,0x90CB,0x90C8,/* 0xB8-0xBF */ + 0x91D4,0x91D3,0x9654,0x964F,0x9651,0x9653,0x964A,0x964E,/* 0xC0-0xC7 */ + 0x501E,0x5005,0x5007,0x5013,0x5022,0x5030,0x501B,0x4FF5,/* 0xC8-0xCF */ + 0x4FF4,0x5033,0x5037,0x502C,0x4FF6,0x4FF7,0x5017,0x501C,/* 0xD0-0xD7 */ + 0x5020,0x5027,0x5035,0x502F,0x5031,0x500E,0x515A,0x5194,/* 0xD8-0xDF */ + 0x5193,0x51CA,0x51C4,0x51C5,0x51C8,0x51CE,0x5261,0x525A,/* 0xE0-0xE7 */ + 0x5252,0x525E,0x525F,0x5255,0x5262,0x52CD,0x530E,0x539E,/* 0xE8-0xEF */ + 0x5526,0x54E2,0x5517,0x5512,0x54E7,0x54F3,0x54E4,0x551A,/* 0xF0-0xF7 */ + 0x54FF,0x5504,0x5508,0x54EB,0x5511,0x5505,0x54F1,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x550A,0x54FB,0x54F7,0x54F8,0x54E0,0x550E,0x5503,0x550B,/* 0x40-0x47 */ + 0x5701,0x5702,0x57CC,0x5832,0x57D5,0x57D2,0x57BA,0x57C6,/* 0x48-0x4F */ + 0x57BD,0x57BC,0x57B8,0x57B6,0x57BF,0x57C7,0x57D0,0x57B9,/* 0x50-0x57 */ + 0x57C1,0x590E,0x594A,0x5A19,0x5A16,0x5A2D,0x5A2E,0x5A15,/* 0x58-0x5F */ + 0x5A0F,0x5A17,0x5A0A,0x5A1E,0x5A33,0x5B6C,0x5BA7,0x5BAD,/* 0x60-0x67 */ + 0x5BAC,0x5C03,0x5C56,0x5C54,0x5CEC,0x5CFF,0x5CEE,0x5CF1,/* 0x68-0x6F */ + 0x5CF7,0x5D00,0x5CF9,0x5E29,0x5E28,0x5EA8,0x5EAE,0x5EAA,/* 0x70-0x77 */ + 0x5EAC,0x5F33,0x5F30,0x5F67,0x605D,0x605A,0x6067,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6041,0x60A2,0x6088,0x6080,0x6092,0x6081,0x609D,/* 0xA0-0xA7 */ + 0x6083,0x6095,0x609B,0x6097,0x6087,0x609C,0x608E,0x6219,/* 0xA8-0xAF */ + 0x6246,0x62F2,0x6310,0x6356,0x632C,0x6344,0x6345,0x6336,/* 0xB0-0xB7 */ + 0x6343,0x63E4,0x6339,0x634B,0x634A,0x633C,0x6329,0x6341,/* 0xB8-0xBF */ + 0x6334,0x6358,0x6354,0x6359,0x632D,0x6347,0x6333,0x635A,/* 0xC0-0xC7 */ + 0x6351,0x6338,0x6357,0x6340,0x6348,0x654A,0x6546,0x65C6,/* 0xC8-0xCF */ + 0x65C3,0x65C4,0x65C2,0x664A,0x665F,0x6647,0x6651,0x6712,/* 0xD0-0xD7 */ + 0x6713,0x681F,0x681A,0x6849,0x6832,0x6833,0x683B,0x684B,/* 0xD8-0xDF */ + 0x684F,0x6816,0x6831,0x681C,0x6835,0x682B,0x682D,0x682F,/* 0xE0-0xE7 */ + 0x684E,0x6844,0x6834,0x681D,0x6812,0x6814,0x6826,0x6828,/* 0xE8-0xEF */ + 0x682E,0x684D,0x683A,0x6825,0x6820,0x6B2C,0x6B2F,0x6B2D,/* 0xF0-0xF7 */ + 0x6B31,0x6B34,0x6B6D,0x8082,0x6B88,0x6BE6,0x6BE4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6BE8,0x6BE3,0x6BE2,0x6BE7,0x6C25,0x6D7A,0x6D63,0x6D64,/* 0x40-0x47 */ + 0x6D76,0x6D0D,0x6D61,0x6D92,0x6D58,0x6D62,0x6D6D,0x6D6F,/* 0x48-0x4F */ + 0x6D91,0x6D8D,0x6DEF,0x6D7F,0x6D86,0x6D5E,0x6D67,0x6D60,/* 0x50-0x57 */ + 0x6D97,0x6D70,0x6D7C,0x6D5F,0x6D82,0x6D98,0x6D2F,0x6D68,/* 0x58-0x5F */ + 0x6D8B,0x6D7E,0x6D80,0x6D84,0x6D16,0x6D83,0x6D7B,0x6D7D,/* 0x60-0x67 */ + 0x6D75,0x6D90,0x70DC,0x70D3,0x70D1,0x70DD,0x70CB,0x7F39,/* 0x68-0x6F */ + 0x70E2,0x70D7,0x70D2,0x70DE,0x70E0,0x70D4,0x70CD,0x70C5,/* 0x70-0x77 */ + 0x70C6,0x70C7,0x70DA,0x70CE,0x70E1,0x7242,0x7278,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7277,0x7276,0x7300,0x72FA,0x72F4,0x72FE,0x72F6,/* 0xA0-0xA7 */ + 0x72F3,0x72FB,0x7301,0x73D3,0x73D9,0x73E5,0x73D6,0x73BC,/* 0xA8-0xAF */ + 0x73E7,0x73E3,0x73E9,0x73DC,0x73D2,0x73DB,0x73D4,0x73DD,/* 0xB0-0xB7 */ + 0x73DA,0x73D7,0x73D8,0x73E8,0x74DE,0x74DF,0x74F4,0x74F5,/* 0xB8-0xBF */ + 0x7521,0x755B,0x755F,0x75B0,0x75C1,0x75BB,0x75C4,0x75C0,/* 0xC0-0xC7 */ + 0x75BF,0x75B6,0x75BA,0x768A,0x76C9,0x771D,0x771B,0x7710,/* 0xC8-0xCF */ + 0x7713,0x7712,0x7723,0x7711,0x7715,0x7719,0x771A,0x7722,/* 0xD0-0xD7 */ + 0x7727,0x7823,0x782C,0x7822,0x7835,0x782F,0x7828,0x782E,/* 0xD8-0xDF */ + 0x782B,0x7821,0x7829,0x7833,0x782A,0x7831,0x7954,0x795B,/* 0xE0-0xE7 */ + 0x794F,0x795C,0x7953,0x7952,0x7951,0x79EB,0x79EC,0x79E0,/* 0xE8-0xEF */ + 0x79EE,0x79ED,0x79EA,0x79DC,0x79DE,0x79DD,0x7A86,0x7A89,/* 0xF0-0xF7 */ + 0x7A85,0x7A8B,0x7A8C,0x7A8A,0x7A87,0x7AD8,0x7B10,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7B04,0x7B13,0x7B05,0x7B0F,0x7B08,0x7B0A,0x7B0E,0x7B09,/* 0x40-0x47 */ + 0x7B12,0x7C84,0x7C91,0x7C8A,0x7C8C,0x7C88,0x7C8D,0x7C85,/* 0x48-0x4F */ + 0x7D1E,0x7D1D,0x7D11,0x7D0E,0x7D18,0x7D16,0x7D13,0x7D1F,/* 0x50-0x57 */ + 0x7D12,0x7D0F,0x7D0C,0x7F5C,0x7F61,0x7F5E,0x7F60,0x7F5D,/* 0x58-0x5F */ + 0x7F5B,0x7F96,0x7F92,0x7FC3,0x7FC2,0x7FC0,0x8016,0x803E,/* 0x60-0x67 */ + 0x8039,0x80FA,0x80F2,0x80F9,0x80F5,0x8101,0x80FB,0x8100,/* 0x68-0x6F */ + 0x8201,0x822F,0x8225,0x8333,0x832D,0x8344,0x8319,0x8351,/* 0x70-0x77 */ + 0x8325,0x8356,0x833F,0x8341,0x8326,0x831C,0x8322,0x0000,/* 0x78-0x7F */ + + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8342,0x834E,0x831B,0x832A,0x8308,0x833C,0x834D,/* 0xA0-0xA7 */ + 0x8316,0x8324,0x8320,0x8337,0x832F,0x8329,0x8347,0x8345,/* 0xA8-0xAF */ + 0x834C,0x8353,0x831E,0x832C,0x834B,0x8327,0x8348,0x8653,/* 0xB0-0xB7 */ + 0x8652,0x86A2,0x86A8,0x8696,0x868D,0x8691,0x869E,0x8687,/* 0xB8-0xBF */ + 0x8697,0x8686,0x868B,0x869A,0x8685,0x86A5,0x8699,0x86A1,/* 0xC0-0xC7 */ + 0x86A7,0x8695,0x8698,0x868E,0x869D,0x8690,0x8694,0x8843,/* 0xC8-0xCF */ + 0x8844,0x886D,0x8875,0x8876,0x8872,0x8880,0x8871,0x887F,/* 0xD0-0xD7 */ + 0x886F,0x8883,0x887E,0x8874,0x887C,0x8A12,0x8C47,0x8C57,/* 0xD8-0xDF */ + 0x8C7B,0x8CA4,0x8CA3,0x8D76,0x8D78,0x8DB5,0x8DB7,0x8DB6,/* 0xE0-0xE7 */ + 0x8ED1,0x8ED3,0x8FFE,0x8FF5,0x9002,0x8FFF,0x8FFB,0x9004,/* 0xE8-0xEF */ + 0x8FFC,0x8FF6,0x90D6,0x90E0,0x90D9,0x90DA,0x90E3,0x90DF,/* 0xF0-0xF7 */ + 0x90E5,0x90D8,0x90DB,0x90D7,0x90DC,0x90E4,0x9150,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x914E,0x914F,0x91D5,0x91E2,0x91DA,0x965C,0x965F,0x96BC,/* 0x40-0x47 */ + 0x98E3,0x9ADF,0x9B2F,0x4E7F,0x5070,0x506A,0x5061,0x505E,/* 0x48-0x4F */ + 0x5060,0x5053,0x504B,0x505D,0x5072,0x5048,0x504D,0x5041,/* 0x50-0x57 */ + 0x505B,0x504A,0x5062,0x5015,0x5045,0x505F,0x5069,0x506B,/* 0x58-0x5F */ + 0x5063,0x5064,0x5046,0x5040,0x506E,0x5073,0x5057,0x5051,/* 0x60-0x67 */ + 0x51D0,0x526B,0x526D,0x526C,0x526E,0x52D6,0x52D3,0x532D,/* 0x68-0x6F */ + 0x539C,0x5575,0x5576,0x553C,0x554D,0x5550,0x5534,0x552A,/* 0x70-0x77 */ + 0x5551,0x5562,0x5536,0x5535,0x5530,0x5552,0x5545,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x550C,0x5532,0x5565,0x554E,0x5539,0x5548,0x552D,/* 0xA0-0xA7 */ + 0x553B,0x5540,0x554B,0x570A,0x5707,0x57FB,0x5814,0x57E2,/* 0xA8-0xAF */ + 0x57F6,0x57DC,0x57F4,0x5800,0x57ED,0x57FD,0x5808,0x57F8,/* 0xB0-0xB7 */ + 0x580B,0x57F3,0x57CF,0x5807,0x57EE,0x57E3,0x57F2,0x57E5,/* 0xB8-0xBF */ + 0x57EC,0x57E1,0x580E,0x57FC,0x5810,0x57E7,0x5801,0x580C,/* 0xC0-0xC7 */ + 0x57F1,0x57E9,0x57F0,0x580D,0x5804,0x595C,0x5A60,0x5A58,/* 0xC8-0xCF */ + 0x5A55,0x5A67,0x5A5E,0x5A38,0x5A35,0x5A6D,0x5A50,0x5A5F,/* 0xD0-0xD7 */ + 0x5A65,0x5A6C,0x5A53,0x5A64,0x5A57,0x5A43,0x5A5D,0x5A52,/* 0xD8-0xDF */ + 0x5A44,0x5A5B,0x5A48,0x5A8E,0x5A3E,0x5A4D,0x5A39,0x5A4C,/* 0xE0-0xE7 */ + 0x5A70,0x5A69,0x5A47,0x5A51,0x5A56,0x5A42,0x5A5C,0x5B72,/* 0xE8-0xEF */ + 0x5B6E,0x5BC1,0x5BC0,0x5C59,0x5D1E,0x5D0B,0x5D1D,0x5D1A,/* 0xF0-0xF7 */ + 
0x5D20,0x5D0C,0x5D28,0x5D0D,0x5D26,0x5D25,0x5D0F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5D30,0x5D12,0x5D23,0x5D1F,0x5D2E,0x5E3E,0x5E34,0x5EB1,/* 0x40-0x47 */ + 0x5EB4,0x5EB9,0x5EB2,0x5EB3,0x5F36,0x5F38,0x5F9B,0x5F96,/* 0x48-0x4F */ + 0x5F9F,0x608A,0x6090,0x6086,0x60BE,0x60B0,0x60BA,0x60D3,/* 0x50-0x57 */ + 0x60D4,0x60CF,0x60E4,0x60D9,0x60DD,0x60C8,0x60B1,0x60DB,/* 0x58-0x5F */ + 0x60B7,0x60CA,0x60BF,0x60C3,0x60CD,0x60C0,0x6332,0x6365,/* 0x60-0x67 */ + 0x638A,0x6382,0x637D,0x63BD,0x639E,0x63AD,0x639D,0x6397,/* 0x68-0x6F */ + 0x63AB,0x638E,0x636F,0x6387,0x6390,0x636E,0x63AF,0x6375,/* 0x70-0x77 */ + 0x639C,0x636D,0x63AE,0x637C,0x63A4,0x633B,0x639F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6378,0x6385,0x6381,0x6391,0x638D,0x6370,0x6553,/* 0xA0-0xA7 */ + 0x65CD,0x6665,0x6661,0x665B,0x6659,0x665C,0x6662,0x6718,/* 0xA8-0xAF */ + 0x6879,0x6887,0x6890,0x689C,0x686D,0x686E,0x68AE,0x68AB,/* 0xB0-0xB7 */ + 0x6956,0x686F,0x68A3,0x68AC,0x68A9,0x6875,0x6874,0x68B2,/* 0xB8-0xBF */ + 0x688F,0x6877,0x6892,0x687C,0x686B,0x6872,0x68AA,0x6880,/* 0xC0-0xC7 */ + 0x6871,0x687E,0x689B,0x6896,0x688B,0x68A0,0x6889,0x68A4,/* 0xC8-0xCF */ + 0x6878,0x687B,0x6891,0x688C,0x688A,0x687D,0x6B36,0x6B33,/* 0xD0-0xD7 */ + 0x6B37,0x6B38,0x6B91,0x6B8F,0x6B8D,0x6B8E,0x6B8C,0x6C2A,/* 0xD8-0xDF */ + 0x6DC0,0x6DAB,0x6DB4,0x6DB3,0x6E74,0x6DAC,0x6DE9,0x6DE2,/* 0xE0-0xE7 */ + 0x6DB7,0x6DF6,0x6DD4,0x6E00,0x6DC8,0x6DE0,0x6DDF,0x6DD6,/* 0xE8-0xEF */ + 0x6DBE,0x6DE5,0x6DDC,0x6DDD,0x6DDB,0x6DF4,0x6DCA,0x6DBD,/* 0xF0-0xF7 */ + 0x6DED,0x6DF0,0x6DBA,0x6DD5,0x6DC2,0x6DCF,0x6DC9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6DD0,0x6DF2,0x6DD3,0x6DFD,0x6DD7,0x6DCD,0x6DE3,0x6DBB,/* 0x40-0x47 */ + 0x70FA,0x710D,0x70F7,0x7117,0x70F4,0x710C,0x70F0,0x7104,/* 0x48-0x4F */ + 0x70F3,0x7110,0x70FC,0x70FF,0x7106,0x7113,0x7100,0x70F8,/* 0x50-0x57 */ + 0x70F6,0x710B,0x7102,0x710E,0x727E,0x727B,0x727C,0x727F,/* 0x58-0x5F */ + 0x731D,0x7317,0x7307,0x7311,0x7318,0x730A,0x7308,0x72FF,/* 0x60-0x67 */ + 0x730F,0x731E,0x7388,0x73F6,0x73F8,0x73F5,0x7404,0x7401,/* 
0x68-0x6F */ + 0x73FD,0x7407,0x7400,0x73FA,0x73FC,0x73FF,0x740C,0x740B,/* 0x70-0x77 */ + 0x73F4,0x7408,0x7564,0x7563,0x75CE,0x75D2,0x75CF,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x75CB,0x75CC,0x75D1,0x75D0,0x768F,0x7689,0x76D3,/* 0xA0-0xA7 */ + 0x7739,0x772F,0x772D,0x7731,0x7732,0x7734,0x7733,0x773D,/* 0xA8-0xAF */ + 0x7725,0x773B,0x7735,0x7848,0x7852,0x7849,0x784D,0x784A,/* 0xB0-0xB7 */ + 0x784C,0x7826,0x7845,0x7850,0x7964,0x7967,0x7969,0x796A,/* 0xB8-0xBF */ + 0x7963,0x796B,0x7961,0x79BB,0x79FA,0x79F8,0x79F6,0x79F7,/* 0xC0-0xC7 */ + 0x7A8F,0x7A94,0x7A90,0x7B35,0x7B47,0x7B34,0x7B25,0x7B30,/* 0xC8-0xCF */ + 0x7B22,0x7B24,0x7B33,0x7B18,0x7B2A,0x7B1D,0x7B31,0x7B2B,/* 0xD0-0xD7 */ + 0x7B2D,0x7B2F,0x7B32,0x7B38,0x7B1A,0x7B23,0x7C94,0x7C98,/* 0xD8-0xDF */ + 0x7C96,0x7CA3,0x7D35,0x7D3D,0x7D38,0x7D36,0x7D3A,0x7D45,/* 0xE0-0xE7 */ + 0x7D2C,0x7D29,0x7D41,0x7D47,0x7D3E,0x7D3F,0x7D4A,0x7D3B,/* 0xE8-0xEF */ + 0x7D28,0x7F63,0x7F95,0x7F9C,0x7F9D,0x7F9B,0x7FCA,0x7FCB,/* 0xF0-0xF7 */ + 0x7FCD,0x7FD0,0x7FD1,0x7FC7,0x7FCF,0x7FC9,0x801F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x801E,0x801B,0x8047,0x8043,0x8048,0x8118,0x8125,0x8119,/* 0x40-0x47 */ + 0x811B,0x812D,0x811F,0x812C,0x811E,0x8121,0x8115,0x8127,/* 0x48-0x4F */ + 0x811D,0x8122,0x8211,0x8238,0x8233,0x823A,0x8234,0x8232,/* 0x50-0x57 */ + 0x8274,0x8390,0x83A3,0x83A8,0x838D,0x837A,0x8373,0x83A4,/* 0x58-0x5F */ + 0x8374,0x838F,0x8381,0x8395,0x8399,0x8375,0x8394,0x83A9,/* 0x60-0x67 */ + 0x837D,0x8383,0x838C,0x839D,0x839B,0x83AA,0x838B,0x837E,/* 0x68-0x6F */ + 0x83A5,0x83AF,0x8388,0x8397,0x83B0,0x837F,0x83A6,0x8387,/* 0x70-0x77 */ + 0x83AE,0x8376,0x839A,0x8659,0x8656,0x86BF,0x86B7,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x86C2,0x86C1,0x86C5,0x86BA,0x86B0,0x86C8,0x86B9,/* 0xA0-0xA7 */ + 0x86B3,0x86B8,0x86CC,0x86B4,0x86BB,0x86BC,0x86C3,0x86BD,/* 0xA8-0xAF */ + 0x86BE,0x8852,0x8889,0x8895,0x88A8,0x88A2,0x88AA,0x889A,/* 0xB0-0xB7 */ + 0x8891,0x88A1,0x889F,0x8898,0x88A7,0x8899,0x889B,0x8897,/* 0xB8-0xBF */ + 0x88A4,0x88AC,0x888C,0x8893,0x888E,0x8982,0x89D6,0x89D9,/* 0xC0-0xC7 */ + 0x89D5,0x8A30,0x8A27,0x8A2C,0x8A1E,0x8C39,0x8C3B,0x8C5C,/* 0xC8-0xCF */ + 0x8C5D,0x8C7D,0x8CA5,0x8D7D,0x8D7B,0x8D79,0x8DBC,0x8DC2,/* 0xD0-0xD7 */ + 0x8DB9,0x8DBF,0x8DC1,0x8ED8,0x8EDE,0x8EDD,0x8EDC,0x8ED7,/* 0xD8-0xDF */ + 0x8EE0,0x8EE1,0x9024,0x900B,0x9011,0x901C,0x900C,0x9021,/* 0xE0-0xE7 */ + 
0x90EF,0x90EA,0x90F0,0x90F4,0x90F2,0x90F3,0x90D4,0x90EB,/* 0xE8-0xEF */ + 0x90EC,0x90E9,0x9156,0x9158,0x915A,0x9153,0x9155,0x91EC,/* 0xF0-0xF7 */ + 0x91F4,0x91F1,0x91F3,0x91F8,0x91E4,0x91F9,0x91EA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x91EB,0x91F7,0x91E8,0x91EE,0x957A,0x9586,0x9588,0x967C,/* 0x40-0x47 */ + 0x966D,0x966B,0x9671,0x966F,0x96BF,0x976A,0x9804,0x98E5,/* 0x48-0x4F */ + 0x9997,0x509B,0x5095,0x5094,0x509E,0x508B,0x50A3,0x5083,/* 0x50-0x57 */ + 0x508C,0x508E,0x509D,0x5068,0x509C,0x5092,0x5082,0x5087,/* 0x58-0x5F */ + 0x515F,0x51D4,0x5312,0x5311,0x53A4,0x53A7,0x5591,0x55A8,/* 0x60-0x67 */ + 0x55A5,0x55AD,0x5577,0x5645,0x55A2,0x5593,0x5588,0x558F,/* 0x68-0x6F */ + 0x55B5,0x5581,0x55A3,0x5592,0x55A4,0x557D,0x558C,0x55A6,/* 0x70-0x77 */ + 0x557F,0x5595,0x55A1,0x558E,0x570C,0x5829,0x5837,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5819,0x581E,0x5827,0x5823,0x5828,0x57F5,0x5848,/* 0xA0-0xA7 */ + 0x5825,0x581C,0x581B,0x5833,0x583F,0x5836,0x582E,0x5839,/* 0xA8-0xAF */ + 0x5838,0x582D,0x582C,0x583B,0x5961,0x5AAF,0x5A94,0x5A9F,/* 0xB0-0xB7 */ + 0x5A7A,0x5AA2,0x5A9E,0x5A78,0x5AA6,0x5A7C,0x5AA5,0x5AAC,/* 0xB8-0xBF */ + 0x5A95,0x5AAE,0x5A37,0x5A84,0x5A8A,0x5A97,0x5A83,0x5A8B,/* 0xC0-0xC7 */ + 0x5AA9,0x5A7B,0x5A7D,0x5A8C,0x5A9C,0x5A8F,0x5A93,0x5A9D,/* 0xC8-0xCF */ + 0x5BEA,0x5BCD,0x5BCB,0x5BD4,0x5BD1,0x5BCA,0x5BCE,0x5C0C,/* 0xD0-0xD7 */ + 0x5C30,0x5D37,0x5D43,0x5D6B,0x5D41,0x5D4B,0x5D3F,0x5D35,/* 0xD8-0xDF */ + 0x5D51,0x5D4E,0x5D55,0x5D33,0x5D3A,0x5D52,0x5D3D,0x5D31,/* 0xE0-0xE7 */ + 0x5D59,0x5D42,0x5D39,0x5D49,0x5D38,0x5D3C,0x5D32,0x5D36,/* 0xE8-0xEF */ + 0x5D40,0x5D45,0x5E44,0x5E41,0x5F58,0x5FA6,0x5FA5,0x5FAB,/* 0xF0-0xF7 */ + 0x60C9,0x60B9,0x60CC,0x60E2,0x60CE,0x60C4,0x6114,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_D9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x60F2,0x610A,0x6116,0x6105,0x60F5,0x6113,0x60F8,0x60FC,/* 0x40-0x47 */ + 0x60FE,0x60C1,0x6103,0x6118,0x611D,0x6110,0x60FF,0x6104,/* 0x48-0x4F */ + 0x610B,0x624A,0x6394,0x63B1,0x63B0,0x63CE,0x63E5,0x63E8,/* 0x50-0x57 */ + 0x63EF,0x63C3,0x649D,0x63F3,0x63CA,0x63E0,0x63F6,0x63D5,/* 
0x58-0x5F */ + 0x63F2,0x63F5,0x6461,0x63DF,0x63BE,0x63DD,0x63DC,0x63C4,/* 0x60-0x67 */ + 0x63D8,0x63D3,0x63C2,0x63C7,0x63CC,0x63CB,0x63C8,0x63F0,/* 0x68-0x6F */ + 0x63D7,0x63D9,0x6532,0x6567,0x656A,0x6564,0x655C,0x6568,/* 0x70-0x77 */ + 0x6565,0x658C,0x659D,0x659E,0x65AE,0x65D0,0x65D2,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x667C,0x666C,0x667B,0x6680,0x6671,0x6679,0x666A,/* 0xA0-0xA7 */ + 0x6672,0x6701,0x690C,0x68D3,0x6904,0x68DC,0x692A,0x68EC,/* 0xA8-0xAF */ + 0x68EA,0x68F1,0x690F,0x68D6,0x68F7,0x68EB,0x68E4,0x68F6,/* 0xB0-0xB7 */ + 0x6913,0x6910,0x68F3,0x68E1,0x6907,0x68CC,0x6908,0x6970,/* 0xB8-0xBF */ + 0x68B4,0x6911,0x68EF,0x68C6,0x6914,0x68F8,0x68D0,0x68FD,/* 0xC0-0xC7 */ + 0x68FC,0x68E8,0x690B,0x690A,0x6917,0x68CE,0x68C8,0x68DD,/* 0xC8-0xCF */ + 0x68DE,0x68E6,0x68F4,0x68D1,0x6906,0x68D4,0x68E9,0x6915,/* 0xD0-0xD7 */ + 0x6925,0x68C7,0x6B39,0x6B3B,0x6B3F,0x6B3C,0x6B94,0x6B97,/* 0xD8-0xDF */ + 0x6B99,0x6B95,0x6BBD,0x6BF0,0x6BF2,0x6BF3,0x6C30,0x6DFC,/* 0xE0-0xE7 */ + 0x6E46,0x6E47,0x6E1F,0x6E49,0x6E88,0x6E3C,0x6E3D,0x6E45,/* 0xE8-0xEF */ + 0x6E62,0x6E2B,0x6E3F,0x6E41,0x6E5D,0x6E73,0x6E1C,0x6E33,/* 0xF0-0xF7 */ + 0x6E4B,0x6E40,0x6E51,0x6E3B,0x6E03,0x6E2E,0x6E5E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6E68,0x6E5C,0x6E61,0x6E31,0x6E28,0x6E60,0x6E71,0x6E6B,/* 0x40-0x47 */ + 0x6E39,0x6E22,0x6E30,0x6E53,0x6E65,0x6E27,0x6E78,0x6E64,/* 0x48-0x4F */ + 0x6E77,0x6E55,0x6E79,0x6E52,0x6E66,0x6E35,0x6E36,0x6E5A,/* 0x50-0x57 */ + 0x7120,0x711E,0x712F,0x70FB,0x712E,0x7131,0x7123,0x7125,/* 0x58-0x5F */ + 0x7122,0x7132,0x711F,0x7128,0x713A,0x711B,0x724B,0x725A,/* 0x60-0x67 */ + 0x7288,0x7289,0x7286,0x7285,0x728B,0x7312,0x730B,0x7330,/* 0x68-0x6F */ + 0x7322,0x7331,0x7333,0x7327,0x7332,0x732D,0x7326,0x7323,/* 0x70-0x77 */ + 0x7335,0x730C,0x742E,0x742C,0x7430,0x742B,0x7416,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x741A,0x7421,0x742D,0x7431,0x7424,0x7423,0x741D,/* 0xA0-0xA7 */ + 0x7429,0x7420,0x7432,0x74FB,0x752F,0x756F,0x756C,0x75E7,/* 0xA8-0xAF */ + 0x75DA,0x75E1,0x75E6,0x75DD,0x75DF,0x75E4,0x75D7,0x7695,/* 0xB0-0xB7 */ + 0x7692,0x76DA,0x7746,0x7747,0x7744,0x774D,0x7745,0x774A,/* 0xB8-0xBF */ + 0x774E,0x774B,0x774C,0x77DE,0x77EC,0x7860,0x7864,0x7865,/* 0xC0-0xC7 */ + 0x785C,0x786D,0x7871,0x786A,0x786E,0x7870,0x7869,0x7868,/* 0xC8-0xCF */ + 0x785E,0x7862,0x7974,0x7973,0x7972,0x7970,0x7A02,0x7A0A,/* 0xD0-0xD7 */ + 
0x7A03,0x7A0C,0x7A04,0x7A99,0x7AE6,0x7AE4,0x7B4A,0x7B3B,/* 0xD8-0xDF */ + 0x7B44,0x7B48,0x7B4C,0x7B4E,0x7B40,0x7B58,0x7B45,0x7CA2,/* 0xE0-0xE7 */ + 0x7C9E,0x7CA8,0x7CA1,0x7D58,0x7D6F,0x7D63,0x7D53,0x7D56,/* 0xE8-0xEF */ + 0x7D67,0x7D6A,0x7D4F,0x7D6D,0x7D5C,0x7D6B,0x7D52,0x7D54,/* 0xF0-0xF7 */ + 0x7D69,0x7D51,0x7D5F,0x7D4E,0x7F3E,0x7F3F,0x7F65,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7F66,0x7FA2,0x7FA0,0x7FA1,0x7FD7,0x8051,0x804F,0x8050,/* 0x40-0x47 */ + 0x80FE,0x80D4,0x8143,0x814A,0x8152,0x814F,0x8147,0x813D,/* 0x48-0x4F */ + 0x814D,0x813A,0x81E6,0x81EE,0x81F7,0x81F8,0x81F9,0x8204,/* 0x50-0x57 */ + 0x823C,0x823D,0x823F,0x8275,0x833B,0x83CF,0x83F9,0x8423,/* 0x58-0x5F */ + 0x83C0,0x83E8,0x8412,0x83E7,0x83E4,0x83FC,0x83F6,0x8410,/* 0x60-0x67 */ + 0x83C6,0x83C8,0x83EB,0x83E3,0x83BF,0x8401,0x83DD,0x83E5,/* 0x68-0x6F */ + 0x83D8,0x83FF,0x83E1,0x83CB,0x83CE,0x83D6,0x83F5,0x83C9,/* 0x70-0x77 */ + 0x8409,0x840F,0x83DE,0x8411,0x8406,0x83C2,0x83F3,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x83D5,0x83FA,0x83C7,0x83D1,0x83EA,0x8413,0x83C3,/* 0xA0-0xA7 */ + 0x83EC,0x83EE,0x83C4,0x83FB,0x83D7,0x83E2,0x841B,0x83DB,/* 0xA8-0xAF */ + 0x83FE,0x86D8,0x86E2,0x86E6,0x86D3,0x86E3,0x86DA,0x86EA,/* 0xB0-0xB7 */ + 0x86DD,0x86EB,0x86DC,0x86EC,0x86E9,0x86D7,0x86E8,0x86D1,/* 0xB8-0xBF */ + 0x8848,0x8856,0x8855,0x88BA,0x88D7,0x88B9,0x88B8,0x88C0,/* 0xC0-0xC7 */ + 0x88BE,0x88B6,0x88BC,0x88B7,0x88BD,0x88B2,0x8901,0x88C9,/* 0xC8-0xCF */ + 0x8995,0x8998,0x8997,0x89DD,0x89DA,0x89DB,0x8A4E,0x8A4D,/* 0xD0-0xD7 */ + 0x8A39,0x8A59,0x8A40,0x8A57,0x8A58,0x8A44,0x8A45,0x8A52,/* 0xD8-0xDF */ + 0x8A48,0x8A51,0x8A4A,0x8A4C,0x8A4F,0x8C5F,0x8C81,0x8C80,/* 0xE0-0xE7 */ + 0x8CBA,0x8CBE,0x8CB0,0x8CB9,0x8CB5,0x8D84,0x8D80,0x8D89,/* 0xE8-0xEF */ + 0x8DD8,0x8DD3,0x8DCD,0x8DC7,0x8DD6,0x8DDC,0x8DCF,0x8DD5,/* 0xF0-0xF7 */ + 0x8DD9,0x8DC8,0x8DD7,0x8DC5,0x8EEF,0x8EF7,0x8EFA,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8EF9,0x8EE6,0x8EEE,0x8EE5,0x8EF5,0x8EE7,0x8EE8,0x8EF6,/* 0x40-0x47 */ + 0x8EEB,0x8EF1,0x8EEC,0x8EF4,0x8EE9,0x902D,0x9034,0x902F,/* 
0x48-0x4F */ + 0x9106,0x912C,0x9104,0x90FF,0x90FC,0x9108,0x90F9,0x90FB,/* 0x50-0x57 */ + 0x9101,0x9100,0x9107,0x9105,0x9103,0x9161,0x9164,0x915F,/* 0x58-0x5F */ + 0x9162,0x9160,0x9201,0x920A,0x9225,0x9203,0x921A,0x9226,/* 0x60-0x67 */ + 0x920F,0x920C,0x9200,0x9212,0x91FF,0x91FD,0x9206,0x9204,/* 0x68-0x6F */ + 0x9227,0x9202,0x921C,0x9224,0x9219,0x9217,0x9205,0x9216,/* 0x70-0x77 */ + 0x957B,0x958D,0x958C,0x9590,0x9687,0x967E,0x9688,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9689,0x9683,0x9680,0x96C2,0x96C8,0x96C3,0x96F1,/* 0xA0-0xA7 */ + 0x96F0,0x976C,0x9770,0x976E,0x9807,0x98A9,0x98EB,0x9CE6,/* 0xA8-0xAF */ + 0x9EF9,0x4E83,0x4E84,0x4EB6,0x50BD,0x50BF,0x50C6,0x50AE,/* 0xB0-0xB7 */ + 0x50C4,0x50CA,0x50B4,0x50C8,0x50C2,0x50B0,0x50C1,0x50BA,/* 0xB8-0xBF */ + 0x50B1,0x50CB,0x50C9,0x50B6,0x50B8,0x51D7,0x527A,0x5278,/* 0xC0-0xC7 */ + 0x527B,0x527C,0x55C3,0x55DB,0x55CC,0x55D0,0x55CB,0x55CA,/* 0xC8-0xCF */ + 0x55DD,0x55C0,0x55D4,0x55C4,0x55E9,0x55BF,0x55D2,0x558D,/* 0xD0-0xD7 */ + 0x55CF,0x55D5,0x55E2,0x55D6,0x55C8,0x55F2,0x55CD,0x55D9,/* 0xD8-0xDF */ + 0x55C2,0x5714,0x5853,0x5868,0x5864,0x584F,0x584D,0x5849,/* 0xE0-0xE7 */ + 0x586F,0x5855,0x584E,0x585D,0x5859,0x5865,0x585B,0x583D,/* 0xE8-0xEF */ + 0x5863,0x5871,0x58FC,0x5AC7,0x5AC4,0x5ACB,0x5ABA,0x5AB8,/* 0xF0-0xF7 */ + 0x5AB1,0x5AB5,0x5AB0,0x5ABF,0x5AC8,0x5ABB,0x5AC6,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DD[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5AB7,0x5AC0,0x5ACA,0x5AB4,0x5AB6,0x5ACD,0x5AB9,0x5A90,/* 0x40-0x47 */ + 0x5BD6,0x5BD8,0x5BD9,0x5C1F,0x5C33,0x5D71,0x5D63,0x5D4A,/* 0x48-0x4F */ + 0x5D65,0x5D72,0x5D6C,0x5D5E,0x5D68,0x5D67,0x5D62,0x5DF0,/* 0x50-0x57 */ + 0x5E4F,0x5E4E,0x5E4A,0x5E4D,0x5E4B,0x5EC5,0x5ECC,0x5EC6,/* 0x58-0x5F */ + 0x5ECB,0x5EC7,0x5F40,0x5FAF,0x5FAD,0x60F7,0x6149,0x614A,/* 0x60-0x67 */ + 0x612B,0x6145,0x6136,0x6132,0x612E,0x6146,0x612F,0x614F,/* 0x68-0x6F */ + 0x6129,0x6140,0x6220,0x9168,0x6223,0x6225,0x6224,0x63C5,/* 0x70-0x77 */ + 0x63F1,0x63EB,0x6410,0x6412,0x6409,0x6420,0x6424,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6433,0x6443,0x641F,0x6415,0x6418,0x6439,0x6437,/* 0xA0-0xA7 */ + 0x6422,0x6423,0x640C,0x6426,0x6430,0x6428,0x6441,0x6435,/* 0xA8-0xAF */ + 0x642F,0x640A,0x641A,0x6440,0x6425,0x6427,0x640B,0x63E7,/* 0xB0-0xB7 */ + 0x641B,0x642E,0x6421,0x640E,0x656F,0x6592,0x65D3,0x6686,/* 0xB8-0xBF */ + 0x668C,0x6695,0x6690,0x668B,0x668A,0x6699,0x6694,0x6678,/* 0xC0-0xC7 */ + 
0x6720,0x6966,0x695F,0x6938,0x694E,0x6962,0x6971,0x693F,/* 0xC8-0xCF */ + 0x6945,0x696A,0x6939,0x6942,0x6957,0x6959,0x697A,0x6948,/* 0xD0-0xD7 */ + 0x6949,0x6935,0x696C,0x6933,0x693D,0x6965,0x68F0,0x6978,/* 0xD8-0xDF */ + 0x6934,0x6969,0x6940,0x696F,0x6944,0x6976,0x6958,0x6941,/* 0xE0-0xE7 */ + 0x6974,0x694C,0x693B,0x694B,0x6937,0x695C,0x694F,0x6951,/* 0xE8-0xEF */ + 0x6932,0x6952,0x692F,0x697B,0x693C,0x6B46,0x6B45,0x6B43,/* 0xF0-0xF7 */ + 0x6B42,0x6B48,0x6B41,0x6B9B,0xFA0D,0x6BFB,0x6BFC,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6BF9,0x6BF7,0x6BF8,0x6E9B,0x6ED6,0x6EC8,0x6E8F,0x6EC0,/* 0x40-0x47 */ + 0x6E9F,0x6E93,0x6E94,0x6EA0,0x6EB1,0x6EB9,0x6EC6,0x6ED2,/* 0x48-0x4F */ + 0x6EBD,0x6EC1,0x6E9E,0x6EC9,0x6EB7,0x6EB0,0x6ECD,0x6EA6,/* 0x50-0x57 */ + 0x6ECF,0x6EB2,0x6EBE,0x6EC3,0x6EDC,0x6ED8,0x6E99,0x6E92,/* 0x58-0x5F */ + 0x6E8E,0x6E8D,0x6EA4,0x6EA1,0x6EBF,0x6EB3,0x6ED0,0x6ECA,/* 0x60-0x67 */ + 0x6E97,0x6EAE,0x6EA3,0x7147,0x7154,0x7152,0x7163,0x7160,/* 0x68-0x6F */ + 0x7141,0x715D,0x7162,0x7172,0x7178,0x716A,0x7161,0x7142,/* 0x70-0x77 */ + 0x7158,0x7143,0x714B,0x7170,0x715F,0x7150,0x7153,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7144,0x714D,0x715A,0x724F,0x728D,0x728C,0x7291,/* 0xA0-0xA7 */ + 0x7290,0x728E,0x733C,0x7342,0x733B,0x733A,0x7340,0x734A,/* 0xA8-0xAF */ + 0x7349,0x7444,0x744A,0x744B,0x7452,0x7451,0x7457,0x7440,/* 0xB0-0xB7 */ + 0x744F,0x7450,0x744E,0x7442,0x7446,0x744D,0x7454,0x74E1,/* 0xB8-0xBF */ + 0x74FF,0x74FE,0x74FD,0x751D,0x7579,0x7577,0x6983,0x75EF,/* 0xC0-0xC7 */ + 0x760F,0x7603,0x75F7,0x75FE,0x75FC,0x75F9,0x75F8,0x7610,/* 0xC8-0xCF */ + 0x75FB,0x75F6,0x75ED,0x75F5,0x75FD,0x7699,0x76B5,0x76DD,/* 0xD0-0xD7 */ + 0x7755,0x775F,0x7760,0x7752,0x7756,0x775A,0x7769,0x7767,/* 0xD8-0xDF */ + 0x7754,0x7759,0x776D,0x77E0,0x7887,0x789A,0x7894,0x788F,/* 0xE0-0xE7 */ + 0x7884,0x7895,0x7885,0x7886,0x78A1,0x7883,0x7879,0x7899,/* 0xE8-0xEF */ + 0x7880,0x7896,0x787B,0x797C,0x7982,0x797D,0x7979,0x7A11,/* 0xF0-0xF7 */ + 0x7A18,0x7A19,0x7A12,0x7A17,0x7A15,0x7A22,0x7A13,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_DF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x38-0x3F */ + 0x7A1B,0x7A10,0x7AA3,0x7AA2,0x7A9E,0x7AEB,0x7B66,0x7B64,/* 0x40-0x47 */ + 0x7B6D,0x7B74,0x7B69,0x7B72,0x7B65,0x7B73,0x7B71,0x7B70,/* 0x48-0x4F */ + 0x7B61,0x7B78,0x7B76,0x7B63,0x7CB2,0x7CB4,0x7CAF,0x7D88,/* 0x50-0x57 */ + 0x7D86,0x7D80,0x7D8D,0x7D7F,0x7D85,0x7D7A,0x7D8E,0x7D7B,/* 0x58-0x5F */ + 0x7D83,0x7D7C,0x7D8C,0x7D94,0x7D84,0x7D7D,0x7D92,0x7F6D,/* 0x60-0x67 */ + 0x7F6B,0x7F67,0x7F68,0x7F6C,0x7FA6,0x7FA5,0x7FA7,0x7FDB,/* 0x68-0x6F */ + 0x7FDC,0x8021,0x8164,0x8160,0x8177,0x815C,0x8169,0x815B,/* 0x70-0x77 */ + 0x8162,0x8172,0x6721,0x815E,0x8176,0x8167,0x816F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8144,0x8161,0x821D,0x8249,0x8244,0x8240,0x8242,/* 0xA0-0xA7 */ + 0x8245,0x84F1,0x843F,0x8456,0x8476,0x8479,0x848F,0x848D,/* 0xA8-0xAF */ + 0x8465,0x8451,0x8440,0x8486,0x8467,0x8430,0x844D,0x847D,/* 0xB0-0xB7 */ + 0x845A,0x8459,0x8474,0x8473,0x845D,0x8507,0x845E,0x8437,/* 0xB8-0xBF */ + 0x843A,0x8434,0x847A,0x8443,0x8478,0x8432,0x8445,0x8429,/* 0xC0-0xC7 */ + 0x83D9,0x844B,0x842F,0x8442,0x842D,0x845F,0x8470,0x8439,/* 0xC8-0xCF */ + 0x844E,0x844C,0x8452,0x846F,0x84C5,0x848E,0x843B,0x8447,/* 0xD0-0xD7 */ + 0x8436,0x8433,0x8468,0x847E,0x8444,0x842B,0x8460,0x8454,/* 0xD8-0xDF */ + 0x846E,0x8450,0x870B,0x8704,0x86F7,0x870C,0x86FA,0x86D6,/* 0xE0-0xE7 */ + 0x86F5,0x874D,0x86F8,0x870E,0x8709,0x8701,0x86F6,0x870D,/* 0xE8-0xEF */ + 0x8705,0x88D6,0x88CB,0x88CD,0x88CE,0x88DE,0x88DB,0x88DA,/* 0xF0-0xF7 */ + 0x88CC,0x88D0,0x8985,0x899B,0x89DF,0x89E5,0x89E4,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x89E1,0x89E0,0x89E2,0x89DC,0x89E6,0x8A76,0x8A86,0x8A7F,/* 0x40-0x47 */ + 0x8A61,0x8A3F,0x8A77,0x8A82,0x8A84,0x8A75,0x8A83,0x8A81,/* 0x48-0x4F */ + 0x8A74,0x8A7A,0x8C3C,0x8C4B,0x8C4A,0x8C65,0x8C64,0x8C66,/* 0x50-0x57 */ + 0x8C86,0x8C84,0x8C85,0x8CCC,0x8D68,0x8D69,0x8D91,0x8D8C,/* 0x58-0x5F */ + 0x8D8E,0x8D8F,0x8D8D,0x8D93,0x8D94,0x8D90,0x8D92,0x8DF0,/* 0x60-0x67 */ + 0x8DE0,0x8DEC,0x8DF1,0x8DEE,0x8DD0,0x8DE9,0x8DE3,0x8DE2,/* 0x68-0x6F */ + 0x8DE7,0x8DF2,0x8DEB,0x8DF4,0x8F06,0x8EFF,0x8F01,0x8F00,/* 0x70-0x77 */ + 0x8F05,0x8F07,0x8F08,0x8F02,0x8F0B,0x9052,0x903F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9044,0x9049,0x903D,0x9110,0x910D,0x910F,0x9111,/* 0xA0-0xA7 */ + 0x9116,0x9114,0x910B,0x910E,0x916E,0x916F,0x9248,0x9252,/* 0xA8-0xAF */ + 0x9230,0x923A,0x9266,0x9233,0x9265,0x925E,0x9283,0x922E,/* 0xB0-0xB7 */ + 
0x924A,0x9246,0x926D,0x926C,0x924F,0x9260,0x9267,0x926F,/* 0xB8-0xBF */ + 0x9236,0x9261,0x9270,0x9231,0x9254,0x9263,0x9250,0x9272,/* 0xC0-0xC7 */ + 0x924E,0x9253,0x924C,0x9256,0x9232,0x959F,0x959C,0x959E,/* 0xC8-0xCF */ + 0x959B,0x9692,0x9693,0x9691,0x9697,0x96CE,0x96FA,0x96FD,/* 0xD0-0xD7 */ + 0x96F8,0x96F5,0x9773,0x9777,0x9778,0x9772,0x980F,0x980D,/* 0xD8-0xDF */ + 0x980E,0x98AC,0x98F6,0x98F9,0x99AF,0x99B2,0x99B0,0x99B5,/* 0xE0-0xE7 */ + 0x9AAD,0x9AAB,0x9B5B,0x9CEA,0x9CED,0x9CE7,0x9E80,0x9EFD,/* 0xE8-0xEF */ + 0x50E6,0x50D4,0x50D7,0x50E8,0x50F3,0x50DB,0x50EA,0x50DD,/* 0xF0-0xF7 */ + 0x50E4,0x50D3,0x50EC,0x50F0,0x50EF,0x50E3,0x50E0,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x51D8,0x5280,0x5281,0x52E9,0x52EB,0x5330,0x53AC,0x5627,/* 0x40-0x47 */ + 0x5615,0x560C,0x5612,0x55FC,0x560F,0x561C,0x5601,0x5613,/* 0x48-0x4F */ + 0x5602,0x55FA,0x561D,0x5604,0x55FF,0x55F9,0x5889,0x587C,/* 0x50-0x57 */ + 0x5890,0x5898,0x5886,0x5881,0x587F,0x5874,0x588B,0x587A,/* 0x58-0x5F */ + 0x5887,0x5891,0x588E,0x5876,0x5882,0x5888,0x587B,0x5894,/* 0x60-0x67 */ + 0x588F,0x58FE,0x596B,0x5ADC,0x5AEE,0x5AE5,0x5AD5,0x5AEA,/* 0x68-0x6F */ + 0x5ADA,0x5AED,0x5AEB,0x5AF3,0x5AE2,0x5AE0,0x5ADB,0x5AEC,/* 0x70-0x77 */ + 0x5ADE,0x5ADD,0x5AD9,0x5AE8,0x5ADF,0x5B77,0x5BE0,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x5BE3,0x5C63,0x5D82,0x5D80,0x5D7D,0x5D86,0x5D7A,/* 0xA0-0xA7 */ + 0x5D81,0x5D77,0x5D8A,0x5D89,0x5D88,0x5D7E,0x5D7C,0x5D8D,/* 0xA8-0xAF */ + 0x5D79,0x5D7F,0x5E58,0x5E59,0x5E53,0x5ED8,0x5ED1,0x5ED7,/* 0xB0-0xB7 */ + 0x5ECE,0x5EDC,0x5ED5,0x5ED9,0x5ED2,0x5ED4,0x5F44,0x5F43,/* 0xB8-0xBF */ + 0x5F6F,0x5FB6,0x612C,0x6128,0x6141,0x615E,0x6171,0x6173,/* 0xC0-0xC7 */ + 0x6152,0x6153,0x6172,0x616C,0x6180,0x6174,0x6154,0x617A,/* 0xC8-0xCF */ + 0x615B,0x6165,0x613B,0x616A,0x6161,0x6156,0x6229,0x6227,/* 0xD0-0xD7 */ + 0x622B,0x642B,0x644D,0x645B,0x645D,0x6474,0x6476,0x6472,/* 0xD8-0xDF */ + 0x6473,0x647D,0x6475,0x6466,0x64A6,0x644E,0x6482,0x645E,/* 0xE0-0xE7 */ + 0x645C,0x644B,0x6453,0x6460,0x6450,0x647F,0x643F,0x646C,/* 0xE8-0xEF */ + 0x646B,0x6459,0x6465,0x6477,0x6573,0x65A0,0x66A1,0x66A0,/* 0xF0-0xF7 */ + 0x669F,0x6705,0x6704,0x6722,0x69B1,0x69B6,0x69C9,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x69A0,0x69CE,0x6996,0x69B0,0x69AC,0x69BC,0x6991,0x6999,/* 0x40-0x47 */ + 0x698E,0x69A7,0x698D,0x69A9,0x69BE,0x69AF,0x69BF,0x69C4,/* 0x48-0x4F */ + 0x69BD,0x69A4,0x69D4,0x69B9,0x69CA,0x699A,0x69CF,0x69B3,/* 0x50-0x57 */ + 0x6993,0x69AA,0x69A1,0x699E,0x69D9,0x6997,0x6990,0x69C2,/* 0x58-0x5F */ + 0x69B5,0x69A5,0x69C6,0x6B4A,0x6B4D,0x6B4B,0x6B9E,0x6B9F,/* 0x60-0x67 */ + 0x6BA0,0x6BC3,0x6BC4,0x6BFE,0x6ECE,0x6EF5,0x6EF1,0x6F03,/* 0x68-0x6F */ + 0x6F25,0x6EF8,0x6F37,0x6EFB,0x6F2E,0x6F09,0x6F4E,0x6F19,/* 0x70-0x77 */ + 0x6F1A,0x6F27,0x6F18,0x6F3B,0x6F12,0x6EED,0x6F0A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x6F36,0x6F73,0x6EF9,0x6EEE,0x6F2D,0x6F40,0x6F30,/* 0xA0-0xA7 */ + 0x6F3C,0x6F35,0x6EEB,0x6F07,0x6F0E,0x6F43,0x6F05,0x6EFD,/* 0xA8-0xAF */ + 0x6EF6,0x6F39,0x6F1C,0x6EFC,0x6F3A,0x6F1F,0x6F0D,0x6F1E,/* 0xB0-0xB7 */ + 0x6F08,0x6F21,0x7187,0x7190,0x7189,0x7180,0x7185,0x7182,/* 0xB8-0xBF */ + 0x718F,0x717B,0x7186,0x7181,0x7197,0x7244,0x7253,0x7297,/* 0xC0-0xC7 */ + 0x7295,0x7293,0x7343,0x734D,0x7351,0x734C,0x7462,0x7473,/* 0xC8-0xCF */ + 0x7471,0x7475,0x7472,0x7467,0x746E,0x7500,0x7502,0x7503,/* 0xD0-0xD7 */ + 0x757D,0x7590,0x7616,0x7608,0x760C,0x7615,0x7611,0x760A,/* 0xD8-0xDF */ + 0x7614,0x76B8,0x7781,0x777C,0x7785,0x7782,0x776E,0x7780,/* 0xE0-0xE7 */ + 0x776F,0x777E,0x7783,0x78B2,0x78AA,0x78B4,0x78AD,0x78A8,/* 0xE8-0xEF */ + 0x787E,0x78AB,0x789E,0x78A5,0x78A0,0x78AC,0x78A2,0x78A4,/* 0xF0-0xF7 */ + 0x7998,0x798A,0x798B,0x7996,0x7995,0x7994,0x7993,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7997,0x7988,0x7992,0x7990,0x7A2B,0x7A4A,0x7A30,0x7A2F,/* 0x40-0x47 */ + 0x7A28,0x7A26,0x7AA8,0x7AAB,0x7AAC,0x7AEE,0x7B88,0x7B9C,/* 0x48-0x4F */ + 0x7B8A,0x7B91,0x7B90,0x7B96,0x7B8D,0x7B8C,0x7B9B,0x7B8E,/* 0x50-0x57 */ + 0x7B85,0x7B98,0x5284,0x7B99,0x7BA4,0x7B82,0x7CBB,0x7CBF,/* 0x58-0x5F */ + 0x7CBC,0x7CBA,0x7DA7,0x7DB7,0x7DC2,0x7DA3,0x7DAA,0x7DC1,/* 0x60-0x67 */ + 0x7DC0,0x7DC5,0x7D9D,0x7DCE,0x7DC4,0x7DC6,0x7DCB,0x7DCC,/* 0x68-0x6F */ + 0x7DAF,0x7DB9,0x7D96,0x7DBC,0x7D9F,0x7DA6,0x7DAE,0x7DA9,/* 0x70-0x77 */ + 0x7DA1,0x7DC9,0x7F73,0x7FE2,0x7FE3,0x7FE5,0x7FDE,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8024,0x805D,0x805C,0x8189,0x8186,0x8183,0x8187,/* 0xA0-0xA7 */ + 
0x818D,0x818C,0x818B,0x8215,0x8497,0x84A4,0x84A1,0x849F,/* 0xA8-0xAF */ + 0x84BA,0x84CE,0x84C2,0x84AC,0x84AE,0x84AB,0x84B9,0x84B4,/* 0xB0-0xB7 */ + 0x84C1,0x84CD,0x84AA,0x849A,0x84B1,0x84D0,0x849D,0x84A7,/* 0xB8-0xBF */ + 0x84BB,0x84A2,0x8494,0x84C7,0x84CC,0x849B,0x84A9,0x84AF,/* 0xC0-0xC7 */ + 0x84A8,0x84D6,0x8498,0x84B6,0x84CF,0x84A0,0x84D7,0x84D4,/* 0xC8-0xCF */ + 0x84D2,0x84DB,0x84B0,0x8491,0x8661,0x8733,0x8723,0x8728,/* 0xD0-0xD7 */ + 0x876B,0x8740,0x872E,0x871E,0x8721,0x8719,0x871B,0x8743,/* 0xD8-0xDF */ + 0x872C,0x8741,0x873E,0x8746,0x8720,0x8732,0x872A,0x872D,/* 0xE0-0xE7 */ + 0x873C,0x8712,0x873A,0x8731,0x8735,0x8742,0x8726,0x8727,/* 0xE8-0xEF */ + 0x8738,0x8724,0x871A,0x8730,0x8711,0x88F7,0x88E7,0x88F1,/* 0xF0-0xF7 */ + 0x88F2,0x88FA,0x88FE,0x88EE,0x88FC,0x88F6,0x88FB,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x88F0,0x88EC,0x88EB,0x899D,0x89A1,0x899F,0x899E,0x89E9,/* 0x40-0x47 */ + 0x89EB,0x89E8,0x8AAB,0x8A99,0x8A8B,0x8A92,0x8A8F,0x8A96,/* 0x48-0x4F */ + 0x8C3D,0x8C68,0x8C69,0x8CD5,0x8CCF,0x8CD7,0x8D96,0x8E09,/* 0x50-0x57 */ + 0x8E02,0x8DFF,0x8E0D,0x8DFD,0x8E0A,0x8E03,0x8E07,0x8E06,/* 0x58-0x5F */ + 0x8E05,0x8DFE,0x8E00,0x8E04,0x8F10,0x8F11,0x8F0E,0x8F0D,/* 0x60-0x67 */ + 0x9123,0x911C,0x9120,0x9122,0x911F,0x911D,0x911A,0x9124,/* 0x68-0x6F */ + 0x9121,0x911B,0x917A,0x9172,0x9179,0x9173,0x92A5,0x92A4,/* 0x70-0x77 */ + 0x9276,0x929B,0x927A,0x92A0,0x9294,0x92AA,0x928D,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x92A6,0x929A,0x92AB,0x9279,0x9297,0x927F,0x92A3,/* 0xA0-0xA7 */ + 0x92EE,0x928E,0x9282,0x9295,0x92A2,0x927D,0x9288,0x92A1,/* 0xA8-0xAF */ + 0x928A,0x9286,0x928C,0x9299,0x92A7,0x927E,0x9287,0x92A9,/* 0xB0-0xB7 */ + 0x929D,0x928B,0x922D,0x969E,0x96A1,0x96FF,0x9758,0x977D,/* 0xB8-0xBF */ + 0x977A,0x977E,0x9783,0x9780,0x9782,0x977B,0x9784,0x9781,/* 0xC0-0xC7 */ + 0x977F,0x97CE,0x97CD,0x9816,0x98AD,0x98AE,0x9902,0x9900,/* 0xC8-0xCF */ + 0x9907,0x999D,0x999C,0x99C3,0x99B9,0x99BB,0x99BA,0x99C2,/* 0xD0-0xD7 */ + 0x99BD,0x99C7,0x9AB1,0x9AE3,0x9AE7,0x9B3E,0x9B3F,0x9B60,/* 0xD8-0xDF */ + 0x9B61,0x9B5F,0x9CF1,0x9CF2,0x9CF5,0x9EA7,0x50FF,0x5103,/* 0xE0-0xE7 */ + 0x5130,0x50F8,0x5106,0x5107,0x50F6,0x50FE,0x510B,0x510C,/* 0xE8-0xEF */ + 0x50FD,0x510A,0x528B,0x528C,0x52F1,0x52EF,0x5648,0x5642,/* 0xF0-0xF7 */ + 0x564C,0x5635,0x5641,0x564A,0x5649,0x5646,0x5658,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x565A,0x5640,0x5633,0x563D,0x562C,0x563E,0x5638,0x562A,/* 0x40-0x47 */ + 0x563A,0x571A,0x58AB,0x589D,0x58B1,0x58A0,0x58A3,0x58AF,/* 0x48-0x4F */ + 0x58AC,0x58A5,0x58A1,0x58FF,0x5AFF,0x5AF4,0x5AFD,0x5AF7,/* 0x50-0x57 */ + 0x5AF6,0x5B03,0x5AF8,0x5B02,0x5AF9,0x5B01,0x5B07,0x5B05,/* 0x58-0x5F */ + 0x5B0F,0x5C67,0x5D99,0x5D97,0x5D9F,0x5D92,0x5DA2,0x5D93,/* 0x60-0x67 */ + 0x5D95,0x5DA0,0x5D9C,0x5DA1,0x5D9A,0x5D9E,0x5E69,0x5E5D,/* 0x68-0x6F */ + 0x5E60,0x5E5C,0x7DF3,0x5EDB,0x5EDE,0x5EE1,0x5F49,0x5FB2,/* 0x70-0x77 */ + 0x618B,0x6183,0x6179,0x61B1,0x61B0,0x61A2,0x6189,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x619B,0x6193,0x61AF,0x61AD,0x619F,0x6192,0x61AA,/* 0xA0-0xA7 */ + 0x61A1,0x618D,0x6166,0x61B3,0x622D,0x646E,0x6470,0x6496,/* 0xA8-0xAF */ + 0x64A0,0x6485,0x6497,0x649C,0x648F,0x648B,0x648A,0x648C,/* 0xB0-0xB7 */ + 0x64A3,0x649F,0x6468,0x64B1,0x6498,0x6576,0x657A,0x6579,/* 0xB8-0xBF */ + 0x657B,0x65B2,0x65B3,0x66B5,0x66B0,0x66A9,0x66B2,0x66B7,/* 0xC0-0xC7 */ + 0x66AA,0x66AF,0x6A00,0x6A06,0x6A17,0x69E5,0x69F8,0x6A15,/* 0xC8-0xCF */ + 0x69F1,0x69E4,0x6A20,0x69FF,0x69EC,0x69E2,0x6A1B,0x6A1D,/* 0xD0-0xD7 */ + 0x69FE,0x6A27,0x69F2,0x69EE,0x6A14,0x69F7,0x69E7,0x6A40,/* 0xD8-0xDF */ + 0x6A08,0x69E6,0x69FB,0x6A0D,0x69FC,0x69EB,0x6A09,0x6A04,/* 0xE0-0xE7 */ + 0x6A18,0x6A25,0x6A0F,0x69F6,0x6A26,0x6A07,0x69F4,0x6A16,/* 0xE8-0xEF */ + 0x6B51,0x6BA5,0x6BA3,0x6BA2,0x6BA6,0x6C01,0x6C00,0x6BFF,/* 0xF0-0xF7 */ + 0x6C02,0x6F41,0x6F26,0x6F7E,0x6F87,0x6FC6,0x6F92,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6F8D,0x6F89,0x6F8C,0x6F62,0x6F4F,0x6F85,0x6F5A,0x6F96,/* 0x40-0x47 */ + 0x6F76,0x6F6C,0x6F82,0x6F55,0x6F72,0x6F52,0x6F50,0x6F57,/* 0x48-0x4F */ + 0x6F94,0x6F93,0x6F5D,0x6F00,0x6F61,0x6F6B,0x6F7D,0x6F67,/* 0x50-0x57 */ + 0x6F90,0x6F53,0x6F8B,0x6F69,0x6F7F,0x6F95,0x6F63,0x6F77,/* 0x58-0x5F */ + 0x6F6A,0x6F7B,0x71B2,0x71AF,0x719B,0x71B0,0x71A0,0x719A,/* 0x60-0x67 */ + 0x71A9,0x71B5,0x719D,0x71A5,0x719E,0x71A4,0x71A1,0x71AA,/* 0x68-0x6F */ + 0x719C,0x71A7,0x71B3,0x7298,0x729A,0x7358,0x7352,0x735E,/* 0x70-0x77 */ + 0x735F,0x7360,0x735D,0x735B,0x7361,0x735A,0x7359,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7362,0x7487,0x7489,0x748A,0x7486,0x7481,0x747D,/* 0xA0-0xA7 */ + 0x7485,0x7488,0x747C,0x7479,0x7508,0x7507,0x757E,0x7625,/* 0xA8-0xAF */ + 0x761E,0x7619,0x761D,0x761C,0x7623,0x761A,0x7628,0x761B,/* 0xB0-0xB7 */ + 0x769C,0x769D,0x769E,0x769B,0x778D,0x778F,0x7789,0x7788,/* 0xB8-0xBF */ + 0x78CD,0x78BB,0x78CF,0x78CC,0x78D1,0x78CE,0x78D4,0x78C8,/* 0xC0-0xC7 */ + 0x78C3,0x78C4,0x78C9,0x799A,0x79A1,0x79A0,0x799C,0x79A2,/* 0xC8-0xCF */ + 0x799B,0x6B76,0x7A39,0x7AB2,0x7AB4,0x7AB3,0x7BB7,0x7BCB,/* 0xD0-0xD7 */ + 0x7BBE,0x7BAC,0x7BCE,0x7BAF,0x7BB9,0x7BCA,0x7BB5,0x7CC5,/* 0xD8-0xDF */ + 0x7CC8,0x7CCC,0x7CCB,0x7DF7,0x7DDB,0x7DEA,0x7DE7,0x7DD7,/* 0xE0-0xE7 */ + 0x7DE1,0x7E03,0x7DFA,0x7DE6,0x7DF6,0x7DF1,0x7DF0,0x7DEE,/* 0xE8-0xEF */ + 0x7DDF,0x7F76,0x7FAC,0x7FB0,0x7FAD,0x7FED,0x7FEB,0x7FEA,/* 0xF0-0xF7 */ + 0x7FEC,0x7FE6,0x7FE8,0x8064,0x8067,0x81A3,0x819F,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x819E,0x8195,0x81A2,0x8199,0x8197,0x8216,0x824F,0x8253,/* 0x40-0x47 */ + 0x8252,0x8250,0x824E,0x8251,0x8524,0x853B,0x850F,0x8500,/* 0x48-0x4F */ + 0x8529,0x850E,0x8509,0x850D,0x851F,0x850A,0x8527,0x851C,/* 0x50-0x57 */ + 0x84FB,0x852B,0x84FA,0x8508,0x850C,0x84F4,0x852A,0x84F2,/* 0x58-0x5F */ + 0x8515,0x84F7,0x84EB,0x84F3,0x84FC,0x8512,0x84EA,0x84E9,/* 0x60-0x67 */ + 0x8516,0x84FE,0x8528,0x851D,0x852E,0x8502,0x84FD,0x851E,/* 0x68-0x6F */ + 0x84F6,0x8531,0x8526,0x84E7,0x84E8,0x84F0,0x84EF,0x84F9,/* 0x70-0x77 */ + 0x8518,0x8520,0x8530,0x850B,0x8519,0x852F,0x8662,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8756,0x8763,0x8764,0x8777,0x87E1,0x8773,0x8758,/* 0xA0-0xA7 */ + 0x8754,0x875B,0x8752,0x8761,0x875A,0x8751,0x875E,0x876D,/* 0xA8-0xAF */ + 0x876A,0x8750,0x874E,0x875F,0x875D,0x876F,0x876C,0x877A,/* 0xB0-0xB7 */ + 0x876E,0x875C,0x8765,0x874F,0x877B,0x8775,0x8762,0x8767,/* 0xB8-0xBF */ + 0x8769,0x885A,0x8905,0x890C,0x8914,0x890B,0x8917,0x8918,/* 0xC0-0xC7 */ + 0x8919,0x8906,0x8916,0x8911,0x890E,0x8909,0x89A2,0x89A4,/* 0xC8-0xCF */ + 0x89A3,0x89ED,0x89F0,0x89EC,0x8ACF,0x8AC6,0x8AB8,0x8AD3,/* 0xD0-0xD7 */ + 0x8AD1,0x8AD4,0x8AD5,0x8ABB,0x8AD7,0x8ABE,0x8AC0,0x8AC5,/* 0xD8-0xDF */ + 0x8AD8,0x8AC3,0x8ABA,0x8ABD,0x8AD9,0x8C3E,0x8C4D,0x8C8F,/* 0xE0-0xE7 */ + 0x8CE5,0x8CDF,0x8CD9,0x8CE8,0x8CDA,0x8CDD,0x8CE7,0x8DA0,/* 0xE8-0xEF */ + 0x8D9C,0x8DA1,0x8D9B,0x8E20,0x8E23,0x8E25,0x8E24,0x8E2E,/* 0xF0-0xF7 */ + 0x8E15,0x8E1B,0x8E16,0x8E11,0x8E19,0x8E26,0x8E27,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E14,0x8E12,0x8E18,0x8E13,0x8E1C,0x8E17,0x8E1A,0x8F2C,/* 0x40-0x47 */ + 0x8F24,0x8F18,0x8F1A,0x8F20,0x8F23,0x8F16,0x8F17,0x9073,/* 0x48-0x4F */ + 0x9070,0x906F,0x9067,0x906B,0x912F,0x912B,0x9129,0x912A,/* 0x50-0x57 */ + 0x9132,0x9126,0x912E,0x9185,0x9186,0x918A,0x9181,0x9182,/* 0x58-0x5F */ + 0x9184,0x9180,0x92D0,0x92C3,0x92C4,0x92C0,0x92D9,0x92B6,/* 0x60-0x67 */ + 0x92CF,0x92F1,0x92DF,0x92D8,0x92E9,0x92D7,0x92DD,0x92CC,/* 0x68-0x6F */ + 0x92EF,0x92C2,0x92E8,0x92CA,0x92C8,0x92CE,0x92E6,0x92CD,/* 0x70-0x77 */ + 0x92D5,0x92C9,0x92E0,0x92DE,0x92E7,0x92D1,0x92D3,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x92B5,0x92E1,0x92C6,0x92B4,0x957C,0x95AC,0x95AB,/* 0xA0-0xA7 */ + 0x95AE,0x95B0,0x96A4,0x96A2,0x96D3,0x9705,0x9708,0x9702,/* 0xA8-0xAF */ + 0x975A,0x978A,0x978E,0x9788,0x97D0,0x97CF,0x981E,0x981D,/* 0xB0-0xB7 */ + 0x9826,0x9829,0x9828,0x9820,0x981B,0x9827,0x98B2,0x9908,/* 0xB8-0xBF */ + 0x98FA,0x9911,0x9914,0x9916,0x9917,0x9915,0x99DC,0x99CD,/* 0xC0-0xC7 */ + 0x99CF,0x99D3,0x99D4,0x99CE,0x99C9,0x99D6,0x99D8,0x99CB,/* 0xC8-0xCF */ + 0x99D7,0x99CC,0x9AB3,0x9AEC,0x9AEB,0x9AF3,0x9AF2,0x9AF1,/* 0xD0-0xD7 */ + 0x9B46,0x9B43,0x9B67,0x9B74,0x9B71,0x9B66,0x9B76,0x9B75,/* 0xD8-0xDF */ + 0x9B70,0x9B68,0x9B64,0x9B6C,0x9CFC,0x9CFA,0x9CFD,0x9CFF,/* 0xE0-0xE7 */ + 0x9CF7,0x9D07,0x9D00,0x9CF9,0x9CFB,0x9D08,0x9D05,0x9D04,/* 0xE8-0xEF */ + 0x9E83,0x9ED3,0x9F0F,0x9F10,0x511C,0x5113,0x5117,0x511A,/* 0xF0-0xF7 */ + 0x5111,0x51DE,0x5334,0x53E1,0x5670,0x5660,0x566E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_E9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5673,0x5666,0x5663,0x566D,0x5672,0x565E,0x5677,0x571C,/* 0x40-0x47 */ + 0x571B,0x58C8,0x58BD,0x58C9,0x58BF,0x58BA,0x58C2,0x58BC,/* 0x48-0x4F */ + 0x58C6,0x5B17,0x5B19,0x5B1B,0x5B21,0x5B14,0x5B13,0x5B10,/* 0x50-0x57 */ + 0x5B16,0x5B28,0x5B1A,0x5B20,0x5B1E,0x5BEF,0x5DAC,0x5DB1,/* 0x58-0x5F */ + 0x5DA9,0x5DA7,0x5DB5,0x5DB0,0x5DAE,0x5DAA,0x5DA8,0x5DB2,/* 0x60-0x67 */ + 0x5DAD,0x5DAF,0x5DB4,0x5E67,0x5E68,0x5E66,0x5E6F,0x5EE9,/* 0x68-0x6F */ + 0x5EE7,0x5EE6,0x5EE8,0x5EE5,0x5F4B,0x5FBC,0x619D,0x61A8,/* 0x70-0x77 */ + 0x6196,0x61C5,0x61B4,0x61C6,0x61C1,0x61CC,0x61BA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x61BF,0x61B8,0x618C,0x64D7,0x64D6,0x64D0,0x64CF,/* 0xA0-0xA7 */ + 0x64C9,0x64BD,0x6489,0x64C3,0x64DB,0x64F3,0x64D9,0x6533,/* 0xA8-0xAF */ + 0x657F,0x657C,0x65A2,0x66C8,0x66BE,0x66C0,0x66CA,0x66CB,/* 0xB0-0xB7 */ + 0x66CF,0x66BD,0x66BB,0x66BA,0x66CC,0x6723,0x6A34,0x6A66,/* 0xB8-0xBF */ + 0x6A49,0x6A67,0x6A32,0x6A68,0x6A3E,0x6A5D,0x6A6D,0x6A76,/* 0xC0-0xC7 */ + 0x6A5B,0x6A51,0x6A28,0x6A5A,0x6A3B,0x6A3F,0x6A41,0x6A6A,/* 0xC8-0xCF */ + 0x6A64,0x6A50,0x6A4F,0x6A54,0x6A6F,0x6A69,0x6A60,0x6A3C,/* 0xD0-0xD7 */ + 0x6A5E,0x6A56,0x6A55,0x6A4D,0x6A4E,0x6A46,0x6B55,0x6B54,/* 0xD8-0xDF */ + 0x6B56,0x6BA7,0x6BAA,0x6BAB,0x6BC8,0x6BC7,0x6C04,0x6C03,/* 0xE0-0xE7 */ + 0x6C06,0x6FAD,0x6FCB,0x6FA3,0x6FC7,0x6FBC,0x6FCE,0x6FC8,/* 0xE8-0xEF */ + 0x6F5E,0x6FC4,0x6FBD,0x6F9E,0x6FCA,0x6FA8,0x7004,0x6FA5,/* 0xF0-0xF7 */ + 0x6FAE,0x6FBA,0x6FAC,0x6FAA,0x6FCF,0x6FBF,0x6FB8,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EA[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6FA2,0x6FC9,0x6FAB,0x6FCD,0x6FAF,0x6FB2,0x6FB0,0x71C5,/* 0x40-0x47 */ + 0x71C2,0x71BF,0x71B8,0x71D6,0x71C0,0x71C1,0x71CB,0x71D4,/* 0x48-0x4F */ + 0x71CA,0x71C7,0x71CF,0x71BD,0x71D8,0x71BC,0x71C6,0x71DA,/* 0x50-0x57 */ + 0x71DB,0x729D,0x729E,0x7369,0x7366,0x7367,0x736C,0x7365,/* 0x58-0x5F */ + 0x736B,0x736A,0x747F,0x749A,0x74A0,0x7494,0x7492,0x7495,/* 0x60-0x67 */ + 0x74A1,0x750B,0x7580,0x762F,0x762D,0x7631,0x763D,0x7633,/* 0x68-0x6F */ + 0x763C,0x7635,0x7632,0x7630,0x76BB,0x76E6,0x779A,0x779D,/* 0x70-0x77 */ + 0x77A1,0x779C,0x779B,0x77A2,0x77A3,0x7795,0x7799,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7797,0x78DD,0x78E9,0x78E5,0x78EA,0x78DE,0x78E3,/* 0xA0-0xA7 */ + 0x78DB,0x78E1,0x78E2,0x78ED,0x78DF,0x78E0,0x79A4,0x7A44,/* 0xA8-0xAF */ + 0x7A48,0x7A47,0x7AB6,0x7AB8,0x7AB5,0x7AB1,0x7AB7,0x7BDE,/* 0xB0-0xB7 */ + 0x7BE3,0x7BE7,0x7BDD,0x7BD5,0x7BE5,0x7BDA,0x7BE8,0x7BF9,/* 0xB8-0xBF */ + 0x7BD4,0x7BEA,0x7BE2,0x7BDC,0x7BEB,0x7BD8,0x7BDF,0x7CD2,/* 0xC0-0xC7 */ + 0x7CD4,0x7CD7,0x7CD0,0x7CD1,0x7E12,0x7E21,0x7E17,0x7E0C,/* 0xC8-0xCF */ + 0x7E1F,0x7E20,0x7E13,0x7E0E,0x7E1C,0x7E15,0x7E1A,0x7E22,/* 0xD0-0xD7 */ + 0x7E0B,0x7E0F,0x7E16,0x7E0D,0x7E14,0x7E25,0x7E24,0x7F43,/* 0xD8-0xDF */ + 0x7F7B,0x7F7C,0x7F7A,0x7FB1,0x7FEF,0x802A,0x8029,0x806C,/* 0xE0-0xE7 */ + 0x81B1,0x81A6,0x81AE,0x81B9,0x81B5,0x81AB,0x81B0,0x81AC,/* 0xE8-0xEF */ + 0x81B4,0x81B2,0x81B7,0x81A7,0x81F2,0x8255,0x8256,0x8257,/* 0xF0-0xF7 */ + 0x8556,0x8545,0x856B,0x854D,0x8553,0x8561,0x8558,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t 
c2u_EB[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8540,0x8546,0x8564,0x8541,0x8562,0x8544,0x8551,0x8547,/* 0x40-0x47 */ + 0x8563,0x853E,0x855B,0x8571,0x854E,0x856E,0x8575,0x8555,/* 0x48-0x4F */ + 0x8567,0x8560,0x858C,0x8566,0x855D,0x8554,0x8565,0x856C,/* 0x50-0x57 */ + 0x8663,0x8665,0x8664,0x879B,0x878F,0x8797,0x8793,0x8792,/* 0x58-0x5F */ + 0x8788,0x8781,0x8796,0x8798,0x8779,0x8787,0x87A3,0x8785,/* 0x60-0x67 */ + 0x8790,0x8791,0x879D,0x8784,0x8794,0x879C,0x879A,0x8789,/* 0x68-0x6F */ + 0x891E,0x8926,0x8930,0x892D,0x892E,0x8927,0x8931,0x8922,/* 0x70-0x77 */ + 0x8929,0x8923,0x892F,0x892C,0x891F,0x89F1,0x8AE0,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8AE2,0x8AF2,0x8AF4,0x8AF5,0x8ADD,0x8B14,0x8AE4,/* 0xA0-0xA7 */ + 0x8ADF,0x8AF0,0x8AC8,0x8ADE,0x8AE1,0x8AE8,0x8AFF,0x8AEF,/* 0xA8-0xAF */ + 0x8AFB,0x8C91,0x8C92,0x8C90,0x8CF5,0x8CEE,0x8CF1,0x8CF0,/* 0xB0-0xB7 */ + 0x8CF3,0x8D6C,0x8D6E,0x8DA5,0x8DA7,0x8E33,0x8E3E,0x8E38,/* 0xB8-0xBF */ + 0x8E40,0x8E45,0x8E36,0x8E3C,0x8E3D,0x8E41,0x8E30,0x8E3F,/* 0xC0-0xC7 */ + 0x8EBD,0x8F36,0x8F2E,0x8F35,0x8F32,0x8F39,0x8F37,0x8F34,/* 0xC8-0xCF */ + 0x9076,0x9079,0x907B,0x9086,0x90FA,0x9133,0x9135,0x9136,/* 0xD0-0xD7 */ + 0x9193,0x9190,0x9191,0x918D,0x918F,0x9327,0x931E,0x9308,/* 0xD8-0xDF */ + 0x931F,0x9306,0x930F,0x937A,0x9338,0x933C,0x931B,0x9323,/* 0xE0-0xE7 */ + 0x9312,0x9301,0x9346,0x932D,0x930E,0x930D,0x92CB,0x931D,/* 0xE8-0xEF */ + 0x92FA,0x9325,0x9313,0x92F9,0x92F7,0x9334,0x9302,0x9324,/* 0xF0-0xF7 */ + 0x92FF,0x9329,0x9339,0x9335,0x932A,0x9314,0x930C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EC[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x930B,0x92FE,0x9309,0x9300,0x92FB,0x9316,0x95BC,0x95CD,/* 0x40-0x47 */ + 0x95BE,0x95B9,0x95BA,0x95B6,0x95BF,0x95B5,0x95BD,0x96A9,/* 0x48-0x4F */ + 0x96D4,0x970B,0x9712,0x9710,0x9799,0x9797,0x9794,0x97F0,/* 0x50-0x57 */ + 0x97F8,0x9835,0x982F,0x9832,0x9924,0x991F,0x9927,0x9929,/* 0x58-0x5F */ + 0x999E,0x99EE,0x99EC,0x99E5,0x99E4,0x99F0,0x99E3,0x99EA,/* 0x60-0x67 */ + 0x99E9,0x99E7,0x9AB9,0x9ABF,0x9AB4,0x9ABB,0x9AF6,0x9AFA,/* 0x68-0x6F */ + 0x9AF9,0x9AF7,0x9B33,0x9B80,0x9B85,0x9B87,0x9B7C,0x9B7E,/* 0x70-0x77 */ + 
0x9B7B,0x9B82,0x9B93,0x9B92,0x9B90,0x9B7A,0x9B95,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9B7D,0x9B88,0x9D25,0x9D17,0x9D20,0x9D1E,0x9D14,/* 0xA0-0xA7 */ + 0x9D29,0x9D1D,0x9D18,0x9D22,0x9D10,0x9D19,0x9D1F,0x9E88,/* 0xA8-0xAF */ + 0x9E86,0x9E87,0x9EAE,0x9EAD,0x9ED5,0x9ED6,0x9EFA,0x9F12,/* 0xB0-0xB7 */ + 0x9F3D,0x5126,0x5125,0x5122,0x5124,0x5120,0x5129,0x52F4,/* 0xB8-0xBF */ + 0x5693,0x568C,0x568D,0x5686,0x5684,0x5683,0x567E,0x5682,/* 0xC0-0xC7 */ + 0x567F,0x5681,0x58D6,0x58D4,0x58CF,0x58D2,0x5B2D,0x5B25,/* 0xC8-0xCF */ + 0x5B32,0x5B23,0x5B2C,0x5B27,0x5B26,0x5B2F,0x5B2E,0x5B7B,/* 0xD0-0xD7 */ + 0x5BF1,0x5BF2,0x5DB7,0x5E6C,0x5E6A,0x5FBE,0x5FBB,0x61C3,/* 0xD8-0xDF */ + 0x61B5,0x61BC,0x61E7,0x61E0,0x61E5,0x61E4,0x61E8,0x61DE,/* 0xE0-0xE7 */ + 0x64EF,0x64E9,0x64E3,0x64EB,0x64E4,0x64E8,0x6581,0x6580,/* 0xE8-0xEF */ + 0x65B6,0x65DA,0x66D2,0x6A8D,0x6A96,0x6A81,0x6AA5,0x6A89,/* 0xF0-0xF7 */ + 0x6A9F,0x6A9B,0x6AA1,0x6A9E,0x6A87,0x6A93,0x6A8E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_ED[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x6A95,0x6A83,0x6AA8,0x6AA4,0x6A91,0x6A7F,0x6AA6,0x6A9A,/* 0x40-0x47 */ + 0x6A85,0x6A8C,0x6A92,0x6B5B,0x6BAD,0x6C09,0x6FCC,0x6FA9,/* 0x48-0x4F */ + 0x6FF4,0x6FD4,0x6FE3,0x6FDC,0x6FED,0x6FE7,0x6FE6,0x6FDE,/* 0x50-0x57 */ + 0x6FF2,0x6FDD,0x6FE2,0x6FE8,0x71E1,0x71F1,0x71E8,0x71F2,/* 0x58-0x5F */ + 0x71E4,0x71F0,0x71E2,0x7373,0x736E,0x736F,0x7497,0x74B2,/* 0x60-0x67 */ + 0x74AB,0x7490,0x74AA,0x74AD,0x74B1,0x74A5,0x74AF,0x7510,/* 0x68-0x6F */ + 0x7511,0x7512,0x750F,0x7584,0x7643,0x7648,0x7649,0x7647,/* 0x70-0x77 */ + 0x76A4,0x76E9,0x77B5,0x77AB,0x77B2,0x77B7,0x77B6,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x77B4,0x77B1,0x77A8,0x77F0,0x78F3,0x78FD,0x7902,/* 0xA0-0xA7 */ + 0x78FB,0x78FC,0x78F2,0x7905,0x78F9,0x78FE,0x7904,0x79AB,/* 0xA8-0xAF */ + 0x79A8,0x7A5C,0x7A5B,0x7A56,0x7A58,0x7A54,0x7A5A,0x7ABE,/* 0xB0-0xB7 */ + 0x7AC0,0x7AC1,0x7C05,0x7C0F,0x7BF2,0x7C00,0x7BFF,0x7BFB,/* 0xB8-0xBF */ + 0x7C0E,0x7BF4,0x7C0B,0x7BF3,0x7C02,0x7C09,0x7C03,0x7C01,/* 0xC0-0xC7 */ + 0x7BF8,0x7BFD,0x7C06,0x7BF0,0x7BF1,0x7C10,0x7C0A,0x7CE8,/* 0xC8-0xCF */ + 0x7E2D,0x7E3C,0x7E42,0x7E33,0x9848,0x7E38,0x7E2A,0x7E49,/* 0xD0-0xD7 */ + 0x7E40,0x7E47,0x7E29,0x7E4C,0x7E30,0x7E3B,0x7E36,0x7E44,/* 0xD8-0xDF */ + 0x7E3A,0x7F45,0x7F7F,0x7F7E,0x7F7D,0x7FF4,0x7FF2,0x802C,/* 0xE0-0xE7 */ + 0x81BB,0x81C4,0x81CC,0x81CA,0x81C5,0x81C7,0x81BC,0x81E9,/* 0xE8-0xEF */ + 
0x825B,0x825A,0x825C,0x8583,0x8580,0x858F,0x85A7,0x8595,/* 0xF0-0xF7 */ + 0x85A0,0x858B,0x85A3,0x857B,0x85A4,0x859A,0x859E,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EE[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8577,0x857C,0x8589,0x85A1,0x857A,0x8578,0x8557,0x858E,/* 0x40-0x47 */ + 0x8596,0x8586,0x858D,0x8599,0x859D,0x8581,0x85A2,0x8582,/* 0x48-0x4F */ + 0x8588,0x8585,0x8579,0x8576,0x8598,0x8590,0x859F,0x8668,/* 0x50-0x57 */ + 0x87BE,0x87AA,0x87AD,0x87C5,0x87B0,0x87AC,0x87B9,0x87B5,/* 0x58-0x5F */ + 0x87BC,0x87AE,0x87C9,0x87C3,0x87C2,0x87CC,0x87B7,0x87AF,/* 0x60-0x67 */ + 0x87C4,0x87CA,0x87B4,0x87B6,0x87BF,0x87B8,0x87BD,0x87DE,/* 0x68-0x6F */ + 0x87B2,0x8935,0x8933,0x893C,0x893E,0x8941,0x8952,0x8937,/* 0x70-0x77 */ + 0x8942,0x89AD,0x89AF,0x89AE,0x89F2,0x89F3,0x8B1E,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x8B18,0x8B16,0x8B11,0x8B05,0x8B0B,0x8B22,0x8B0F,/* 0xA0-0xA7 */ + 0x8B12,0x8B15,0x8B07,0x8B0D,0x8B08,0x8B06,0x8B1C,0x8B13,/* 0xA8-0xAF */ + 0x8B1A,0x8C4F,0x8C70,0x8C72,0x8C71,0x8C6F,0x8C95,0x8C94,/* 0xB0-0xB7 */ + 0x8CF9,0x8D6F,0x8E4E,0x8E4D,0x8E53,0x8E50,0x8E4C,0x8E47,/* 0xB8-0xBF */ + 0x8F43,0x8F40,0x9085,0x907E,0x9138,0x919A,0x91A2,0x919B,/* 0xC0-0xC7 */ + 0x9199,0x919F,0x91A1,0x919D,0x91A0,0x93A1,0x9383,0x93AF,/* 0xC8-0xCF */ + 0x9364,0x9356,0x9347,0x937C,0x9358,0x935C,0x9376,0x9349,/* 0xD0-0xD7 */ + 0x9350,0x9351,0x9360,0x936D,0x938F,0x934C,0x936A,0x9379,/* 0xD8-0xDF */ + 0x9357,0x9355,0x9352,0x934F,0x9371,0x9377,0x937B,0x9361,/* 0xE0-0xE7 */ + 0x935E,0x9363,0x9367,0x9380,0x934E,0x9359,0x95C7,0x95C0,/* 0xE8-0xEF */ + 0x95C9,0x95C3,0x95C5,0x95B7,0x96AE,0x96B0,0x96AC,0x9720,/* 0xF0-0xF7 */ + 0x971F,0x9718,0x971D,0x9719,0x979A,0x97A1,0x979C,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_EF[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x979E,0x979D,0x97D5,0x97D4,0x97F1,0x9841,0x9844,0x984A,/* 0x40-0x47 */ + 0x9849,0x9845,0x9843,0x9925,0x992B,0x992C,0x992A,0x9933,/* 0x48-0x4F */ + 0x9932,0x992F,0x992D,0x9931,0x9930,0x9998,0x99A3,0x99A1,/* 0x50-0x57 */ + 0x9A02,0x99FA,0x99F4,0x99F7,0x99F9,0x99F8,0x99F6,0x99FB,/* 0x58-0x5F */ + 0x99FD,0x99FE,0x99FC,0x9A03,0x9ABE,0x9AFE,0x9AFD,0x9B01,/* 
0x60-0x67 */ + 0x9AFC,0x9B48,0x9B9A,0x9BA8,0x9B9E,0x9B9B,0x9BA6,0x9BA1,/* 0x68-0x6F */ + 0x9BA5,0x9BA4,0x9B86,0x9BA2,0x9BA0,0x9BAF,0x9D33,0x9D41,/* 0x70-0x77 */ + 0x9D67,0x9D36,0x9D2E,0x9D2F,0x9D31,0x9D38,0x9D30,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9D45,0x9D42,0x9D43,0x9D3E,0x9D37,0x9D40,0x9D3D,/* 0xA0-0xA7 */ + 0x7FF5,0x9D2D,0x9E8A,0x9E89,0x9E8D,0x9EB0,0x9EC8,0x9EDA,/* 0xA8-0xAF */ + 0x9EFB,0x9EFF,0x9F24,0x9F23,0x9F22,0x9F54,0x9FA0,0x5131,/* 0xB0-0xB7 */ + 0x512D,0x512E,0x5698,0x569C,0x5697,0x569A,0x569D,0x5699,/* 0xB8-0xBF */ + 0x5970,0x5B3C,0x5C69,0x5C6A,0x5DC0,0x5E6D,0x5E6E,0x61D8,/* 0xC0-0xC7 */ + 0x61DF,0x61ED,0x61EE,0x61F1,0x61EA,0x61F0,0x61EB,0x61D6,/* 0xC8-0xCF */ + 0x61E9,0x64FF,0x6504,0x64FD,0x64F8,0x6501,0x6503,0x64FC,/* 0xD0-0xD7 */ + 0x6594,0x65DB,0x66DA,0x66DB,0x66D8,0x6AC5,0x6AB9,0x6ABD,/* 0xD8-0xDF */ + 0x6AE1,0x6AC6,0x6ABA,0x6AB6,0x6AB7,0x6AC7,0x6AB4,0x6AAD,/* 0xE0-0xE7 */ + 0x6B5E,0x6BC9,0x6C0B,0x7007,0x700C,0x700D,0x7001,0x7005,/* 0xE8-0xEF */ + 0x7014,0x700E,0x6FFF,0x7000,0x6FFB,0x7026,0x6FFC,0x6FF7,/* 0xF0-0xF7 */ + 0x700A,0x7201,0x71FF,0x71F9,0x7203,0x71FD,0x7376,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F0[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x74B8,0x74C0,0x74B5,0x74C1,0x74BE,0x74B6,0x74BB,0x74C2,/* 0x40-0x47 */ + 0x7514,0x7513,0x765C,0x7664,0x7659,0x7650,0x7653,0x7657,/* 0x48-0x4F */ + 0x765A,0x76A6,0x76BD,0x76EC,0x77C2,0x77BA,0x78FF,0x790C,/* 0x50-0x57 */ + 0x7913,0x7914,0x7909,0x7910,0x7912,0x7911,0x79AD,0x79AC,/* 0x58-0x5F */ + 0x7A5F,0x7C1C,0x7C29,0x7C19,0x7C20,0x7C1F,0x7C2D,0x7C1D,/* 0x60-0x67 */ + 0x7C26,0x7C28,0x7C22,0x7C25,0x7C30,0x7E5C,0x7E50,0x7E56,/* 0x68-0x6F */ + 0x7E63,0x7E58,0x7E62,0x7E5F,0x7E51,0x7E60,0x7E57,0x7E53,/* 0x70-0x77 */ + 0x7FB5,0x7FB3,0x7FF7,0x7FF8,0x8075,0x81D1,0x81D2,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x81D0,0x825F,0x825E,0x85B4,0x85C6,0x85C0,0x85C3,/* 0xA0-0xA7 */ + 0x85C2,0x85B3,0x85B5,0x85BD,0x85C7,0x85C4,0x85BF,0x85CB,/* 0xA8-0xAF */ + 0x85CE,0x85C8,0x85C5,0x85B1,0x85B6,0x85D2,0x8624,0x85B8,/* 0xB0-0xB7 */ + 0x85B7,0x85BE,0x8669,0x87E7,0x87E6,0x87E2,0x87DB,0x87EB,/* 0xB8-0xBF */ + 0x87EA,0x87E5,0x87DF,0x87F3,0x87E4,0x87D4,0x87DC,0x87D3,/* 0xC0-0xC7 */ + 0x87ED,0x87D8,0x87E3,0x87A4,0x87D7,0x87D9,0x8801,0x87F4,/* 0xC8-0xCF */ + 0x87E8,0x87DD,0x8953,0x894B,0x894F,0x894C,0x8946,0x8950,/* 0xD0-0xD7 */ + 0x8951,0x8949,0x8B2A,0x8B27,0x8B23,0x8B33,0x8B30,0x8B35,/* 0xD8-0xDF */ + 
0x8B47,0x8B2F,0x8B3C,0x8B3E,0x8B31,0x8B25,0x8B37,0x8B26,/* 0xE0-0xE7 */ + 0x8B36,0x8B2E,0x8B24,0x8B3B,0x8B3D,0x8B3A,0x8C42,0x8C75,/* 0xE8-0xEF */ + 0x8C99,0x8C98,0x8C97,0x8CFE,0x8D04,0x8D02,0x8D00,0x8E5C,/* 0xF0-0xF7 */ + 0x8E62,0x8E60,0x8E57,0x8E56,0x8E5E,0x8E65,0x8E67,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F1[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8E5B,0x8E5A,0x8E61,0x8E5D,0x8E69,0x8E54,0x8F46,0x8F47,/* 0x40-0x47 */ + 0x8F48,0x8F4B,0x9128,0x913A,0x913B,0x913E,0x91A8,0x91A5,/* 0x48-0x4F */ + 0x91A7,0x91AF,0x91AA,0x93B5,0x938C,0x9392,0x93B7,0x939B,/* 0x50-0x57 */ + 0x939D,0x9389,0x93A7,0x938E,0x93AA,0x939E,0x93A6,0x9395,/* 0x58-0x5F */ + 0x9388,0x9399,0x939F,0x938D,0x93B1,0x9391,0x93B2,0x93A4,/* 0x60-0x67 */ + 0x93A8,0x93B4,0x93A3,0x93A5,0x95D2,0x95D3,0x95D1,0x96B3,/* 0x68-0x6F */ + 0x96D7,0x96DA,0x5DC2,0x96DF,0x96D8,0x96DD,0x9723,0x9722,/* 0x70-0x77 */ + 0x9725,0x97AC,0x97AE,0x97A8,0x97AB,0x97A4,0x97AA,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x97A2,0x97A5,0x97D7,0x97D9,0x97D6,0x97D8,0x97FA,/* 0xA0-0xA7 */ + 0x9850,0x9851,0x9852,0x98B8,0x9941,0x993C,0x993A,0x9A0F,/* 0xA8-0xAF */ + 0x9A0B,0x9A09,0x9A0D,0x9A04,0x9A11,0x9A0A,0x9A05,0x9A07,/* 0xB0-0xB7 */ + 0x9A06,0x9AC0,0x9ADC,0x9B08,0x9B04,0x9B05,0x9B29,0x9B35,/* 0xB8-0xBF */ + 0x9B4A,0x9B4C,0x9B4B,0x9BC7,0x9BC6,0x9BC3,0x9BBF,0x9BC1,/* 0xC0-0xC7 */ + 0x9BB5,0x9BB8,0x9BD3,0x9BB6,0x9BC4,0x9BB9,0x9BBD,0x9D5C,/* 0xC8-0xCF */ + 0x9D53,0x9D4F,0x9D4A,0x9D5B,0x9D4B,0x9D59,0x9D56,0x9D4C,/* 0xD0-0xD7 */ + 0x9D57,0x9D52,0x9D54,0x9D5F,0x9D58,0x9D5A,0x9E8E,0x9E8C,/* 0xD8-0xDF */ + 0x9EDF,0x9F01,0x9F00,0x9F16,0x9F25,0x9F2B,0x9F2A,0x9F29,/* 0xE0-0xE7 */ + 0x9F28,0x9F4C,0x9F55,0x5134,0x5135,0x5296,0x52F7,0x53B4,/* 0xE8-0xEF */ + 0x56AB,0x56AD,0x56A6,0x56A7,0x56AA,0x56AC,0x58DA,0x58DD,/* 0xF0-0xF7 */ + 0x58DB,0x5912,0x5B3D,0x5B3E,0x5B3F,0x5DC3,0x5E70,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F2[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x5FBF,0x61FB,0x6507,0x6510,0x650D,0x6509,0x650C,0x650E,/* 0x40-0x47 */ + 0x6584,0x65DE,0x65DD,0x66DE,0x6AE7,0x6AE0,0x6ACC,0x6AD1,/* 0x48-0x4F */ + 0x6AD9,0x6ACB,0x6ADF,0x6ADC,0x6AD0,0x6AEB,0x6ACF,0x6ACD,/* 
0x50-0x57 */ + 0x6ADE,0x6B60,0x6BB0,0x6C0C,0x7019,0x7027,0x7020,0x7016,/* 0x58-0x5F */ + 0x702B,0x7021,0x7022,0x7023,0x7029,0x7017,0x7024,0x701C,/* 0x60-0x67 */ + 0x702A,0x720C,0x720A,0x7207,0x7202,0x7205,0x72A5,0x72A6,/* 0x68-0x6F */ + 0x72A4,0x72A3,0x72A1,0x74CB,0x74C5,0x74B7,0x74C3,0x7516,/* 0x70-0x77 */ + 0x7660,0x77C9,0x77CA,0x77C4,0x77F1,0x791D,0x791B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x7921,0x791C,0x7917,0x791E,0x79B0,0x7A67,0x7A68,/* 0xA0-0xA7 */ + 0x7C33,0x7C3C,0x7C39,0x7C2C,0x7C3B,0x7CEC,0x7CEA,0x7E76,/* 0xA8-0xAF */ + 0x7E75,0x7E78,0x7E70,0x7E77,0x7E6F,0x7E7A,0x7E72,0x7E74,/* 0xB0-0xB7 */ + 0x7E68,0x7F4B,0x7F4A,0x7F83,0x7F86,0x7FB7,0x7FFD,0x7FFE,/* 0xB8-0xBF */ + 0x8078,0x81D7,0x81D5,0x8264,0x8261,0x8263,0x85EB,0x85F1,/* 0xC0-0xC7 */ + 0x85ED,0x85D9,0x85E1,0x85E8,0x85DA,0x85D7,0x85EC,0x85F2,/* 0xC8-0xCF */ + 0x85F8,0x85D8,0x85DF,0x85E3,0x85DC,0x85D1,0x85F0,0x85E6,/* 0xD0-0xD7 */ + 0x85EF,0x85DE,0x85E2,0x8800,0x87FA,0x8803,0x87F6,0x87F7,/* 0xD8-0xDF */ + 0x8809,0x880C,0x880B,0x8806,0x87FC,0x8808,0x87FF,0x880A,/* 0xE0-0xE7 */ + 0x8802,0x8962,0x895A,0x895B,0x8957,0x8961,0x895C,0x8958,/* 0xE8-0xEF */ + 0x895D,0x8959,0x8988,0x89B7,0x89B6,0x89F6,0x8B50,0x8B48,/* 0xF0-0xF7 */ + 0x8B4A,0x8B40,0x8B53,0x8B56,0x8B54,0x8B4B,0x8B55,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F3[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B51,0x8B42,0x8B52,0x8B57,0x8C43,0x8C77,0x8C76,0x8C9A,/* 0x40-0x47 */ + 0x8D06,0x8D07,0x8D09,0x8DAC,0x8DAA,0x8DAD,0x8DAB,0x8E6D,/* 0x48-0x4F */ + 0x8E78,0x8E73,0x8E6A,0x8E6F,0x8E7B,0x8EC2,0x8F52,0x8F51,/* 0x50-0x57 */ + 0x8F4F,0x8F50,0x8F53,0x8FB4,0x9140,0x913F,0x91B0,0x91AD,/* 0x58-0x5F */ + 0x93DE,0x93C7,0x93CF,0x93C2,0x93DA,0x93D0,0x93F9,0x93EC,/* 0x60-0x67 */ + 0x93CC,0x93D9,0x93A9,0x93E6,0x93CA,0x93D4,0x93EE,0x93E3,/* 0x68-0x6F */ + 0x93D5,0x93C4,0x93CE,0x93C0,0x93D2,0x93E7,0x957D,0x95DA,/* 0x70-0x77 */ + 0x95DB,0x96E1,0x9729,0x972B,0x972C,0x9728,0x9726,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x97B3,0x97B7,0x97B6,0x97DD,0x97DE,0x97DF,0x985C,/* 0xA0-0xA7 */ + 0x9859,0x985D,0x9857,0x98BF,0x98BD,0x98BB,0x98BE,0x9948,/* 0xA8-0xAF */ + 0x9947,0x9943,0x99A6,0x99A7,0x9A1A,0x9A15,0x9A25,0x9A1D,/* 0xB0-0xB7 */ + 0x9A24,0x9A1B,0x9A22,0x9A20,0x9A27,0x9A23,0x9A1E,0x9A1C,/* 0xB8-0xBF */ + 0x9A14,0x9AC2,0x9B0B,0x9B0A,0x9B0E,0x9B0C,0x9B37,0x9BEA,/* 0xC0-0xC7 */ + 0x9BEB,0x9BE0,0x9BDE,0x9BE4,0x9BE6,0x9BE2,0x9BF0,0x9BD4,/* 0xC8-0xCF */ + 
0x9BD7,0x9BEC,0x9BDC,0x9BD9,0x9BE5,0x9BD5,0x9BE1,0x9BDA,/* 0xD0-0xD7 */ + 0x9D77,0x9D81,0x9D8A,0x9D84,0x9D88,0x9D71,0x9D80,0x9D78,/* 0xD8-0xDF */ + 0x9D86,0x9D8B,0x9D8C,0x9D7D,0x9D6B,0x9D74,0x9D75,0x9D70,/* 0xE0-0xE7 */ + 0x9D69,0x9D85,0x9D73,0x9D7B,0x9D82,0x9D6F,0x9D79,0x9D7F,/* 0xE8-0xEF */ + 0x9D87,0x9D68,0x9E94,0x9E91,0x9EC0,0x9EFC,0x9F2D,0x9F40,/* 0xF0-0xF7 */ + 0x9F41,0x9F4D,0x9F56,0x9F57,0x9F58,0x5337,0x56B2,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F4[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x56B5,0x56B3,0x58E3,0x5B45,0x5DC6,0x5DC7,0x5EEE,0x5EEF,/* 0x40-0x47 */ + 0x5FC0,0x5FC1,0x61F9,0x6517,0x6516,0x6515,0x6513,0x65DF,/* 0x48-0x4F */ + 0x66E8,0x66E3,0x66E4,0x6AF3,0x6AF0,0x6AEA,0x6AE8,0x6AF9,/* 0x50-0x57 */ + 0x6AF1,0x6AEE,0x6AEF,0x703C,0x7035,0x702F,0x7037,0x7034,/* 0x58-0x5F */ + 0x7031,0x7042,0x7038,0x703F,0x703A,0x7039,0x7040,0x703B,/* 0x60-0x67 */ + 0x7033,0x7041,0x7213,0x7214,0x72A8,0x737D,0x737C,0x74BA,/* 0x68-0x6F */ + 0x76AB,0x76AA,0x76BE,0x76ED,0x77CC,0x77CE,0x77CF,0x77CD,/* 0x70-0x77 */ + 0x77F2,0x7925,0x7923,0x7927,0x7928,0x7924,0x7929,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x79B2,0x7A6E,0x7A6C,0x7A6D,0x7AF7,0x7C49,0x7C48,/* 0xA0-0xA7 */ + 0x7C4A,0x7C47,0x7C45,0x7CEE,0x7E7B,0x7E7E,0x7E81,0x7E80,/* 0xA8-0xAF */ + 0x7FBA,0x7FFF,0x8079,0x81DB,0x81D9,0x820B,0x8268,0x8269,/* 0xB0-0xB7 */ + 0x8622,0x85FF,0x8601,0x85FE,0x861B,0x8600,0x85F6,0x8604,/* 0xB8-0xBF */ + 0x8609,0x8605,0x860C,0x85FD,0x8819,0x8810,0x8811,0x8817,/* 0xC0-0xC7 */ + 0x8813,0x8816,0x8963,0x8966,0x89B9,0x89F7,0x8B60,0x8B6A,/* 0xC8-0xCF */ + 0x8B5D,0x8B68,0x8B63,0x8B65,0x8B67,0x8B6D,0x8DAE,0x8E86,/* 0xD0-0xD7 */ + 0x8E88,0x8E84,0x8F59,0x8F56,0x8F57,0x8F55,0x8F58,0x8F5A,/* 0xD8-0xDF */ + 0x908D,0x9143,0x9141,0x91B7,0x91B5,0x91B2,0x91B3,0x940B,/* 0xE0-0xE7 */ + 0x9413,0x93FB,0x9420,0x940F,0x9414,0x93FE,0x9415,0x9410,/* 0xE8-0xEF */ + 0x9428,0x9419,0x940D,0x93F5,0x9400,0x93F7,0x9407,0x940E,/* 0xF0-0xF7 */ + 0x9416,0x9412,0x93FA,0x9409,0x93F8,0x940A,0x93FF,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F5[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x93FC,0x940C,0x93F6,0x9411,0x9406,0x95DE,0x95E0,0x95DF,/* 
0x40-0x47 */ + 0x972E,0x972F,0x97B9,0x97BB,0x97FD,0x97FE,0x9860,0x9862,/* 0x48-0x4F */ + 0x9863,0x985F,0x98C1,0x98C2,0x9950,0x994E,0x9959,0x994C,/* 0x50-0x57 */ + 0x994B,0x9953,0x9A32,0x9A34,0x9A31,0x9A2C,0x9A2A,0x9A36,/* 0x58-0x5F */ + 0x9A29,0x9A2E,0x9A38,0x9A2D,0x9AC7,0x9ACA,0x9AC6,0x9B10,/* 0x60-0x67 */ + 0x9B12,0x9B11,0x9C0B,0x9C08,0x9BF7,0x9C05,0x9C12,0x9BF8,/* 0x68-0x6F */ + 0x9C40,0x9C07,0x9C0E,0x9C06,0x9C17,0x9C14,0x9C09,0x9D9F,/* 0x70-0x77 */ + 0x9D99,0x9DA4,0x9D9D,0x9D92,0x9D98,0x9D90,0x9D9B,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9DA0,0x9D94,0x9D9C,0x9DAA,0x9D97,0x9DA1,0x9D9A,/* 0xA0-0xA7 */ + 0x9DA2,0x9DA8,0x9D9E,0x9DA3,0x9DBF,0x9DA9,0x9D96,0x9DA6,/* 0xA8-0xAF */ + 0x9DA7,0x9E99,0x9E9B,0x9E9A,0x9EE5,0x9EE4,0x9EE7,0x9EE6,/* 0xB0-0xB7 */ + 0x9F30,0x9F2E,0x9F5B,0x9F60,0x9F5E,0x9F5D,0x9F59,0x9F91,/* 0xB8-0xBF */ + 0x513A,0x5139,0x5298,0x5297,0x56C3,0x56BD,0x56BE,0x5B48,/* 0xC0-0xC7 */ + 0x5B47,0x5DCB,0x5DCF,0x5EF1,0x61FD,0x651B,0x6B02,0x6AFC,/* 0xC8-0xCF */ + 0x6B03,0x6AF8,0x6B00,0x7043,0x7044,0x704A,0x7048,0x7049,/* 0xD0-0xD7 */ + 0x7045,0x7046,0x721D,0x721A,0x7219,0x737E,0x7517,0x766A,/* 0xD8-0xDF */ + 0x77D0,0x792D,0x7931,0x792F,0x7C54,0x7C53,0x7CF2,0x7E8A,/* 0xE0-0xE7 */ + 0x7E87,0x7E88,0x7E8B,0x7E86,0x7E8D,0x7F4D,0x7FBB,0x8030,/* 0xE8-0xEF */ + 0x81DD,0x8618,0x862A,0x8626,0x861F,0x8623,0x861C,0x8619,/* 0xF0-0xF7 */ + 0x8627,0x862E,0x8621,0x8620,0x8629,0x861E,0x8625,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F6[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8829,0x881D,0x881B,0x8820,0x8824,0x881C,0x882B,0x884A,/* 0x40-0x47 */ + 0x896D,0x8969,0x896E,0x896B,0x89FA,0x8B79,0x8B78,0x8B45,/* 0x48-0x4F */ + 0x8B7A,0x8B7B,0x8D10,0x8D14,0x8DAF,0x8E8E,0x8E8C,0x8F5E,/* 0x50-0x57 */ + 0x8F5B,0x8F5D,0x9146,0x9144,0x9145,0x91B9,0x943F,0x943B,/* 0x58-0x5F */ + 0x9436,0x9429,0x943D,0x943C,0x9430,0x9439,0x942A,0x9437,/* 0x60-0x67 */ + 0x942C,0x9440,0x9431,0x95E5,0x95E4,0x95E3,0x9735,0x973A,/* 0x68-0x6F */ + 0x97BF,0x97E1,0x9864,0x98C9,0x98C6,0x98C0,0x9958,0x9956,/* 0x70-0x77 */ + 0x9A39,0x9A3D,0x9A46,0x9A44,0x9A42,0x9A41,0x9A3A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9A3F,0x9ACD,0x9B15,0x9B17,0x9B18,0x9B16,0x9B3A,/* 0xA0-0xA7 */ + 0x9B52,0x9C2B,0x9C1D,0x9C1C,0x9C2C,0x9C23,0x9C28,0x9C29,/* 0xA8-0xAF */ + 0x9C24,0x9C21,0x9DB7,0x9DB6,0x9DBC,0x9DC1,0x9DC7,0x9DCA,/* 0xB0-0xB7 */ + 0x9DCF,0x9DBE,0x9DC5,0x9DC3,0x9DBB,0x9DB5,0x9DCE,0x9DB9,/* 0xB8-0xBF */ + 
0x9DBA,0x9DAC,0x9DC8,0x9DB1,0x9DAD,0x9DCC,0x9DB3,0x9DCD,/* 0xC0-0xC7 */ + 0x9DB2,0x9E7A,0x9E9C,0x9EEB,0x9EEE,0x9EED,0x9F1B,0x9F18,/* 0xC8-0xCF */ + 0x9F1A,0x9F31,0x9F4E,0x9F65,0x9F64,0x9F92,0x4EB9,0x56C6,/* 0xD0-0xD7 */ + 0x56C5,0x56CB,0x5971,0x5B4B,0x5B4C,0x5DD5,0x5DD1,0x5EF2,/* 0xD8-0xDF */ + 0x6521,0x6520,0x6526,0x6522,0x6B0B,0x6B08,0x6B09,0x6C0D,/* 0xE0-0xE7 */ + 0x7055,0x7056,0x7057,0x7052,0x721E,0x721F,0x72A9,0x737F,/* 0xE8-0xEF */ + 0x74D8,0x74D5,0x74D9,0x74D7,0x766D,0x76AD,0x7935,0x79B4,/* 0xF0-0xF7 */ + 0x7A70,0x7A71,0x7C57,0x7C5C,0x7C59,0x7C5B,0x7C5A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F7[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7CF4,0x7CF1,0x7E91,0x7F4F,0x7F87,0x81DE,0x826B,0x8634,/* 0x40-0x47 */ + 0x8635,0x8633,0x862C,0x8632,0x8636,0x882C,0x8828,0x8826,/* 0x48-0x4F */ + 0x882A,0x8825,0x8971,0x89BF,0x89BE,0x89FB,0x8B7E,0x8B84,/* 0x50-0x57 */ + 0x8B82,0x8B86,0x8B85,0x8B7F,0x8D15,0x8E95,0x8E94,0x8E9A,/* 0x58-0x5F */ + 0x8E92,0x8E90,0x8E96,0x8E97,0x8F60,0x8F62,0x9147,0x944C,/* 0x60-0x67 */ + 0x9450,0x944A,0x944B,0x944F,0x9447,0x9445,0x9448,0x9449,/* 0x68-0x6F */ + 0x9446,0x973F,0x97E3,0x986A,0x9869,0x98CB,0x9954,0x995B,/* 0x70-0x77 */ + 0x9A4E,0x9A53,0x9A54,0x9A4C,0x9A4F,0x9A48,0x9A4A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9A49,0x9A52,0x9A50,0x9AD0,0x9B19,0x9B2B,0x9B3B,/* 0xA0-0xA7 */ + 0x9B56,0x9B55,0x9C46,0x9C48,0x9C3F,0x9C44,0x9C39,0x9C33,/* 0xA8-0xAF */ + 0x9C41,0x9C3C,0x9C37,0x9C34,0x9C32,0x9C3D,0x9C36,0x9DDB,/* 0xB0-0xB7 */ + 0x9DD2,0x9DDE,0x9DDA,0x9DCB,0x9DD0,0x9DDC,0x9DD1,0x9DDF,/* 0xB8-0xBF */ + 0x9DE9,0x9DD9,0x9DD8,0x9DD6,0x9DF5,0x9DD5,0x9DDD,0x9EB6,/* 0xC0-0xC7 */ + 0x9EF0,0x9F35,0x9F33,0x9F32,0x9F42,0x9F6B,0x9F95,0x9FA2,/* 0xC8-0xCF */ + 0x513D,0x5299,0x58E8,0x58E7,0x5972,0x5B4D,0x5DD8,0x882F,/* 0xD0-0xD7 */ + 0x5F4F,0x6201,0x6203,0x6204,0x6529,0x6525,0x6596,0x66EB,/* 0xD8-0xDF */ + 0x6B11,0x6B12,0x6B0F,0x6BCA,0x705B,0x705A,0x7222,0x7382,/* 0xE0-0xE7 */ + 0x7381,0x7383,0x7670,0x77D4,0x7C67,0x7C66,0x7E95,0x826C,/* 0xE8-0xEF */ + 0x863A,0x8640,0x8639,0x863C,0x8631,0x863B,0x863E,0x8830,/* 0xF0-0xF7 */ + 0x8832,0x882E,0x8833,0x8976,0x8974,0x8973,0x89FE,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F8[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 
0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x8B8C,0x8B8E,0x8B8B,0x8B88,0x8C45,0x8D19,0x8E98,0x8F64,/* 0x40-0x47 */ + 0x8F63,0x91BC,0x9462,0x9455,0x945D,0x9457,0x945E,0x97C4,/* 0x48-0x4F */ + 0x97C5,0x9800,0x9A56,0x9A59,0x9B1E,0x9B1F,0x9B20,0x9C52,/* 0x50-0x57 */ + 0x9C58,0x9C50,0x9C4A,0x9C4D,0x9C4B,0x9C55,0x9C59,0x9C4C,/* 0x58-0x5F */ + 0x9C4E,0x9DFB,0x9DF7,0x9DEF,0x9DE3,0x9DEB,0x9DF8,0x9DE4,/* 0x60-0x67 */ + 0x9DF6,0x9DE1,0x9DEE,0x9DE6,0x9DF2,0x9DF0,0x9DE2,0x9DEC,/* 0x68-0x6F */ + 0x9DF4,0x9DF3,0x9DE8,0x9DED,0x9EC2,0x9ED0,0x9EF2,0x9EF3,/* 0x70-0x77 */ + 0x9F06,0x9F1C,0x9F38,0x9F37,0x9F36,0x9F43,0x9F4F,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9F71,0x9F70,0x9F6E,0x9F6F,0x56D3,0x56CD,0x5B4E,/* 0xA0-0xA7 */ + 0x5C6D,0x652D,0x66ED,0x66EE,0x6B13,0x705F,0x7061,0x705D,/* 0xA8-0xAF */ + 0x7060,0x7223,0x74DB,0x74E5,0x77D5,0x7938,0x79B7,0x79B6,/* 0xB0-0xB7 */ + 0x7C6A,0x7E97,0x7F89,0x826D,0x8643,0x8838,0x8837,0x8835,/* 0xB8-0xBF */ + 0x884B,0x8B94,0x8B95,0x8E9E,0x8E9F,0x8EA0,0x8E9D,0x91BE,/* 0xC0-0xC7 */ + 0x91BD,0x91C2,0x946B,0x9468,0x9469,0x96E5,0x9746,0x9743,/* 0xC8-0xCF */ + 0x9747,0x97C7,0x97E5,0x9A5E,0x9AD5,0x9B59,0x9C63,0x9C67,/* 0xD0-0xD7 */ + 0x9C66,0x9C62,0x9C5E,0x9C60,0x9E02,0x9DFE,0x9E07,0x9E03,/* 0xD8-0xDF */ + 0x9E06,0x9E05,0x9E00,0x9E01,0x9E09,0x9DFF,0x9DFD,0x9E04,/* 0xE0-0xE7 */ + 0x9EA0,0x9F1E,0x9F46,0x9F74,0x9F75,0x9F76,0x56D4,0x652E,/* 0xE8-0xEF */ + 0x65B8,0x6B18,0x6B19,0x6B17,0x6B1A,0x7062,0x7226,0x72AA,/* 0xF0-0xF7 */ + 0x77D8,0x77D9,0x7939,0x7C69,0x7C6B,0x7CF6,0x7E9A,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t c2u_F9[256] = { + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */ + 0x7E98,0x7E9B,0x7E99,0x81E0,0x81E1,0x8646,0x8647,0x8648,/* 0x40-0x47 */ + 0x8979,0x897A,0x897C,0x897B,0x89FF,0x8B98,0x8B99,0x8EA5,/* 0x48-0x4F */ + 0x8EA4,0x8EA3,0x946E,0x946D,0x946F,0x9471,0x9473,0x9749,/* 0x50-0x57 */ + 0x9872,0x995F,0x9C68,0x9C6E,0x9C6D,0x9E0B,0x9E0D,0x9E10,/* 0x58-0x5F */ + 0x9E0F,0x9E12,0x9E11,0x9EA1,0x9EF5,0x9F09,0x9F47,0x9F78,/* 0x60-0x67 */ + 0x9F7B,0x9F7A,0x9F79,0x571E,0x7066,0x7C6F,0x883C,0x8DB2,/* 0x68-0x6F */ + 0x8EA6,0x91C3,0x9474,0x9478,0x9476,0x9475,0x9A60,0x9C74,/* 0x70-0x77 */ + 0x9C73,0x9C71,0x9C75,0x9E14,0x9E13,0x9EF6,0x9F0A,0x0000,/* 0x78-0x7F */ + + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */ + 0x0000,0x9FA4,0x7068,0x7065,0x7CF7,0x866A,0x883E,0x883D,/* 0xA0-0xA7 */ + 0x883F,0x8B9E,0x8C9C,0x8EA9,0x8EC9,0x974B,0x9873,0x9874,/* 0xA8-0xAF */ + 
0x98CC,0x9961,0x99AB,0x9A64,0x9A66,0x9A67,0x9B24,0x9E15,/* 0xB0-0xB7 */ + 0x9E17,0x9F48,0x6207,0x6B1E,0x7227,0x864C,0x8EA8,0x9482,/* 0xB8-0xBF */ + 0x9480,0x9481,0x9A69,0x9A68,0x9B2E,0x9E19,0x7229,0x864B,/* 0xC0-0xC7 */ + 0x8B9F,0x9483,0x9C79,0x9EB7,0x7675,0x9A6B,0x9C7A,0x9E1D,/* 0xC8-0xCF */ + 0x7069,0x706A,0x9EA4,0x9F7E,0x9F49,0x9F98,0x7881,0x92B9,/* 0xD0-0xD7 */ + 0x88CF,0x58BB,0x6052,0x7CA7,0x5AFA,0x2554,0x2566,0x2557,/* 0xD8-0xDF */ + 0x2560,0x256C,0x2563,0x255A,0x2569,0x255D,0x2552,0x2564,/* 0xE0-0xE7 */ + 0x2555,0x255E,0x256A,0x2561,0x2558,0x2567,0x255B,0x2553,/* 0xE8-0xEF */ + 0x2565,0x2556,0x255F,0x256B,0x2562,0x2559,0x2568,0x255C,/* 0xF0-0xF7 */ + 0x2551,0x2550,0x256D,0x256E,0x2570,0x256F,0x2593,0x0000,/* 0xF8-0xFF */ +}; + +static const wchar_t *page_charset2uni[256] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, + c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, + c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, + c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, + c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, NULL, + NULL, c2u_C9, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, + c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, + c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, + c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, + c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, + c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, + c2u_F8, c2u_F9, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char u2c_02[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA3, 0xBE, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA1, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xA3, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ +}; + +static const unsigned char u2c_03[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA3, 0x44, 0xA3, 0x45, 0xA3, 0x46, /* 0x90-0x93 */ + 0xA3, 0x47, 0xA3, 0x48, 0xA3, 0x49, 0xA3, 0x4A, /* 0x94-0x97 */ + 0xA3, 0x4B, 0xA3, 0x4C, 0xA3, 0x4D, 0xA3, 0x4E, /* 0x98-0x9B */ + 0xA3, 0x4F, 0xA3, 0x50, 0xA3, 0x51, 0xA3, 0x52, /* 0x9C-0x9F */ + 0xA3, 0x53, 0xA3, 0x54, 0x00, 0x00, 0xA3, 0x55, /* 0xA0-0xA3 */ + 0xA3, 0x56, 0xA3, 0x57, 0xA3, 0x58, 0xA3, 0x59, /* 0xA4-0xA7 */ + 0xA3, 0x5A, 0xA3, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xA3, 0x5C, 0xA3, 0x5D, 0xA3, 0x5E, /* 0xB0-0xB3 */ + 0xA3, 0x5F, 0xA3, 0x60, 0xA3, 0x61, 0xA3, 0x62, /* 0xB4-0xB7 */ + 0xA3, 0x63, 0xA3, 0x64, 0xA3, 0x65, 0xA3, 0x66, /* 0xB8-0xBB */ + 0xA3, 0x67, 0xA3, 0x68, 0xA3, 0x69, 0xA3, 0x6A, /* 0xBC-0xBF */ + 0xA3, 0x6B, 0xA3, 0x6C, 0x00, 0x00, 0xA3, 0x6D, /* 0xC0-0xC3 */ + 0xA3, 0x6E, 0xA3, 0x6F, 0xA3, 0x70, 0xA3, 0x71, /* 0xC4-0xC7 */ + 0xA3, 0x72, 0xA3, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ +}; + +static const unsigned char u2c_20[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x56, /* 0x10-0x13 */ + 0xA1, 0x58, 0xA2, 0x77, 0xA1, 0xFC, 0x00, 0x00, /* 0x14-0x17 */ + 0xA1, 0xA5, 0xA1, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xA7, 0xA1, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0x45, 0x00, 0x00, /* 0x20-0x23 */ + 0xA3, 0xBB, 0xA1, 0x4C, 0xA1, 0x4B, 0xA1, 0x45, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xAC, 0xA1, 0xB2, /* 0x30-0x33 */ + 0x00, 0x00, 0xA1, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0xA1, 0xB0, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC3, 0x00, 0x00, /* 0x3C-0x3F */ +}; + +static const unsigned char u2c_21[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA2, 0x4A, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xC1, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA2, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA2, 0xB9, 0xA2, 0xBA, 0xA2, 0xBB, 0xA2, 0xBC, /* 0x60-0x63 */ + 0xA2, 0xBD, 0xA2, 0xBE, 0xA2, 0xBF, 0xA2, 0xC0, /* 0x64-0x67 */ + 0xA2, 0xC1, 0xA2, 0xC2, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA1, 0xF6, 0xA1, 0xF4, 0xA1, 0xF7, 0xA1, 0xF5, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF9, /* 0x94-0x97 */ + 0xA1, 0xFB, 0xA1, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ +}; + +static const unsigned char u2c_22[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA2, 0x41, 0xA2, 0x42, 0x00, 0x00, /* 0x14-0x17 */ + 0xA2, 0x58, 0x00, 0x00, 0xA1, 0xD4, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDB, 0xA1, 0xE8, /* 0x1C-0x1F */ + 0xA1, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xFD, /* 0x20-0x23 */ + 0x00, 0x00, 0xA1, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xA1, 0xE4, 0xA1, 0xE5, 0xA1, 0xEC, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 
0xED, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA1, 0xEF, 0xA1, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xDC, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA1, 0xDA, 0xA1, 0xDD, 0x00, 0x00, 0xA1, 0xDD, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xD8, 0xA1, 0xD9, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA1, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA1, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xA1, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE9, /* 0xBC-0xBF */ +}; + +static const unsigned char u2c_23[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x5B, /* 0x04-0x07 */ +}; + +static const unsigned char u2c_25[512] = { + 0xA2, 0x77, 0x00, 0x00, 0xA2, 0x78, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xA2, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA2, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xA2, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xA2, 0x7D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA2, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xA2, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xA2, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xA2, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xA2, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF9, 0xF9, 0xF9, 0xF8, 0xF9, 0xE6, 0xF9, 0xEF, /* 0x50-0x53 */ + 0xF9, 0xDD, 0xF9, 0xE8, 0xF9, 0xF1, 0xF9, 0xDF, /* 0x54-0x57 */ + 0xF9, 0xEC, 0xF9, 0xF5, 0xF9, 0xE3, 0xF9, 0xEE, /* 0x58-0x5B */ + 0xF9, 0xF7, 0xF9, 0xE5, 0xF9, 0xE9, 0xF9, 0xF2, /* 0x5C-0x5F */ + 0xF9, 0xE0, 0xF9, 0xEB, 0xF9, 0xF4, 0xF9, 0xE2, /* 0x60-0x63 */ + 0xF9, 0xE7, 0xF9, 0xF0, 0xF9, 0xDE, 0xF9, 0xED, /* 0x64-0x67 */ + 0xF9, 0xF6, 0xF9, 0xE4, 0xF9, 0xEA, 0xF9, 0xF3, /* 0x68-0x6B */ + 0xF9, 0xE1, 0xA2, 0x7E, 0xA2, 0xA1, 0xA2, 0xA3, /* 0x6C-0x6F */ + 0xA2, 0xA2, 0xA2, 0xAC, 0xA2, 0xAD, 0xA2, 0xAE, /* 0x70-0x73 */ + 0xA1, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xA2, 0x62, 0xA2, 0x63, 0xA2, 0x64, /* 0x80-0x83 */ + 0xA2, 0x65, 0xA2, 0x66, 0xA2, 0x67, 0xA2, 0x68, /* 0x84-0x87 */ + 0xA2, 0x69, 0xA2, 0x70, 0xA2, 0x6F, 0xA2, 0x6E, /* 0x88-0x8B */ + 0xA2, 0x6D, 0xA2, 0x6C, 0xA2, 0x6B, 0xA2, 0x6A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xFE, /* 0x90-0x93 */ + 0xA2, 0x76, 0xA2, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xA1, 0xBD, 0xA1, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB6, 0xA1, 0xB5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xA1, 0xBF, 0xA1, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xBB, 0xA1, 0xBA, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB3, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA1, 0xB7, 0xA1, 0xB4, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0xA8, 0xA2, 0xA9, /* 0xE0-0xE3 */ + 0xA2, 0xAB, 0xA2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char u2c_26[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA1, 0xB9, 0xA1, 0xB8, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xA1, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA1, 0xF0, 0xA1, 0xF2, 0xA1, 0xF1, 0x00, 0x00, /* 0x40-0x43 */ +}; + +static const unsigned char u2c_30[512] = { + 0xA1, 0x40, 0xA1, 0x42, 0xA1, 0x43, 0xA1, 0xB2, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA1, 0x71, 0xA1, 0x72, 0xA1, 0x6D, 0xA1, 0x6E, /* 0x08-0x0B */ + 0xA1, 0x75, 0xA1, 0x76, 0xA1, 0x79, 0xA1, 0x7A, /* 0x0C-0x0F */ + 0xA1, 0x69, 0xA1, 0x6A, 0xA2, 0x45, 0x00, 0x00, /* 0x10-0x13 */ + 0xA1, 0x65, 0xA1, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xA1, 0xE3, 0xA1, 0xA9, 0xA1, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xA2, 0xC3, 0xA2, 0xC4, 0xA2, 0xC5, /* 0x20-0x23 */ + 0xA2, 0xC6, 0xA2, 0xC7, 0xA2, 0xC8, 0xA2, 0xC9, /* 0x24-0x27 */ + 0xA2, 0xCA, 0xA2, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA1, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ +}; + +static const unsigned char u2c_31[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA3, 0x74, 0xA3, 0x75, 0xA3, 0x76, /* 0x04-0x07 */ + 0xA3, 0x77, 0xA3, 0x78, 0xA3, 0x79, 0xA3, 0x7A, /* 0x08-0x0B */ + 0xA3, 0x7B, 0xA3, 0x7C, 0xA3, 0x7D, 0xA3, 0x7E, /* 0x0C-0x0F */ + 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, 0xA3, 0xA4, /* 0x10-0x13 */ + 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, 0xA3, 0xA8, /* 0x14-0x17 */ + 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, 0xA3, 0xAC, /* 0x18-0x1B */ + 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, 0xA3, 0xB0, /* 0x1C-0x1F */ + 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, 0xA3, 0xB4, /* 0x20-0x23 */ + 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, 0xA3, 0xB8, /* 0x24-0x27 */ + 0xA3, 0xB9, 0xA3, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0x40, 0xA4, 0x47, /* 0x90-0x93 */ + 0xA4, 0x54, 0xA5, 0x7C, 0xA4, 0x57, 0xA4, 0xA4, /* 0x94-0x97 */ + 0xA4, 0x55, 0xA5, 0xD2, 0xA4, 0x41, 0xA4, 0xFE, /* 0x98-0x9B */ + 0xA4, 0x42, 0xA4, 0xD1, 0xA6, 0x61, 0xA4, 0x48, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_32[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA4, 0x40, 0xA4, 0x47, 0xA4, 0x54, 0xA5, 0x7C, /* 0x20-0x23 */ + 0xA4, 0xAD, 0xA4, 0xBB, 0xA4, 0x43, 0xA4, 0x4B, /* 0x24-0x27 */ + 0xA4, 0x45, 0xA4, 0x51, 0xA4, 0xEB, 0xA4, 0xF5, /* 0x28-0x2B */ + 0xA4, 0xF4, 0xA4, 0xEC, 0xAA, 0xF7, 0xA4, 0x67, /* 0x2C-0x2F */ + 0xA4, 0xE9, 0xAE, 0xE8, 0xA6, 0xB3, 0xAA, 0xC0, /* 0x30-0x33 */ + 0xA6, 0x57, 0xAF, 0x53, 0xB0, 0x5D, 0xAF, 0xAC, /* 0x34-0x37 */ + 0xB3, 0xD2, 0xA5, 0x4E, 0xA9, 0x49, 0xBE, 0xC7, /* 0x38-0x3B */ + 0xBA, 0xCA, 0xA5, 0xF8, 0xB8, 0xEA, 0xA8, 0xF3, /* 0x3C-0x3F */ + 0xB2, 0xBD, 0xA5, 0xF0, 0xA6, 0xDB, 0xA6, 0xDC, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xA4, 0x40, 0xA4, 0x47, 0xA4, 0x54, 0xA5, 0x7C, /* 0x80-0x83 */ + 0xA4, 0xAD, 0xA4, 0xBB, 0xA4, 0x43, 0xA4, 0x4B, /* 0x84-0x87 */ + 0xA4, 0x45, 0xA4, 0x51, 0xA4, 0xEB, 0xA4, 0xF5, /* 0x88-0x8B */ + 0xA4, 0xF4, 0xA4, 0xEC, 0xAA, 0xF7, 0xA4, 0x67, /* 0x8C-0x8F */ + 0xA4, 0xE9, 0xAE, 0xE8, 0xA6, 0xB3, 0xAA, 0xC0, /* 0x90-0x93 */ + 0xA6, 0x57, 0xAF, 0x53, 0xB0, 0x5D, 0xAF, 0xAC, /* 0x94-0x97 */ + 0xB3, 0xD2, 0xAF, 0xB5, 0xA8, 0x6B, 0xA4, 0x6B, /* 0x98-0x9B */ + 0xBE, 0x41, 0xC0, 0x75, 0xA6, 0x4C, 0xAA, 0x60, /* 0x9C-0x9F */ + 0xB6, 0xB5, 0xA5, 0xF0, 0xBC, 0x67, 0xA1, 0xC0, /* 0xA0-0xA3 */ + 0xA4, 0x57, 0xA4, 0xA4, 0xA4, 0x55, 0xA5, 0xAA, /* 0xA4-0xA7 */ + 0xA5, 0x6B, 0xC2, 0xE5, 0xA9, 0x76, 0xBE, 0xC7, /* 0xA8-0xAB */ + 0xBA, 0xCA, 0xA5, 0xF8, 0xB8, 0xEA, 0xA8, 0xF3, /* 0xAC-0xAF */ + 0xA9, 
0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ +}; + +static const unsigned char u2c_33[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0x55, 0xA2, 0x56, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xA2, 0x50, 0xA2, 0x51, 0xA2, 0x52, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA2, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA2, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xA2, 0x53, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 
0x00, 0xA1, 0xEB, 0xA1, 0xEA, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xA2, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ +}; + +static const unsigned char u2c_4E[512] = { + 0xA4, 0x40, 0xA4, 0x42, 0x00, 0x00, 0xA4, 0x43, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x45, /* 0x04-0x07 */ + 0xA4, 0x56, 0xA4, 0x54, 0xA4, 0x57, 0xA4, 0x55, /* 0x08-0x0B */ + 0xC9, 0x46, 0xA4, 0xA3, 0xC9, 0x4F, 0xC9, 0x4D, /* 0x0C-0x0F */ + 0xA4, 0xA2, 0xA4, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xA5, 0x42, 0xA5, 0x41, 0xA5, 0x40, 0x00, 0x00, /* 0x14-0x17 */ + 0xA5, 0x43, 0xA4, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xE0, 0xA5, 0xE1, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA4, 0x58, /* 0x28-0x2B */ + 0x00, 0x00, 0xA4, 0xA4, 0xC9, 0x50, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA4, 0xA5, 0xC9, 0x63, 0xA6, 0xEA, 0xCB, 0xB1, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xA4, 0x59, 0xA4, 0xA6, 0x00, 0x00, 0xA5, 0x44, /* 0x38-0x3B */ + 0xC9, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0x40, 0xA4, 0x44, /* 0x40-0x43 */ + 0x00, 0x00, 0xA4, 0x5B, 0x00, 0x00, 0xC9, 0x47, /* 0x44-0x47 */ + 0xA4, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xA7, /* 0x48-0x4B */ + 0x00, 0x00, 0xA5, 0x45, 0xA5, 0x47, 0xA5, 0x46, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xE2, 0xA5, 0xE3, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC4, 0x00, 0x00, /* 0x54-0x57 */ + 0xAD, 0xBC, 0xA4, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xC9, 0x41, 0xA4, 0x45, 0xA4, 0x5E, 0xA4, 0x5D, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xA5, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC5, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xAE, 0xD4, 0x4B, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xC3, 0xDC, 0xB1, /* 0x80-0x83 */ + 0xDC, 0xB2, 0x00, 0x00, 0xA4, 0x46, 0x00, 0x00, /* 0x84-0x87 */ + 0xA4, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC6, /* 0x88-0x8B */ + 0xA4, 0x47, 0xC9, 0x48, 0xA4, 0x5F, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA4, 0xAA, 0xA4, 0xAC, 0xC9, 0x51, /* 0x90-0x93 */ + 0xA4, 0xAD, 0xA4, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xA5, 0xE5, 0x00, 0x00, 0xA8, 0xC7, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC8, 0xAB, 0x45, /* 0x9C-0x9F */ + 0x00, 0x00, 0xA4, 0x60, 0xA4, 0xAE, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xA5, 0xE6, 0xA5, 0xE8, 0xA5, 0xE7, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xA6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xC9, /* 0xA8-0xAB */ + 0xA8, 0xCA, 0xAB, 0x46, 0xAB, 0x47, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xBD, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xB3, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF6, 0xD6, 0xA4, 0x48, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xA4, 0xB0, 0xA4, 0xAF, 0xC9, 0x52, 0xA4, 0xB1, /* 0xC0-0xC3 */ + 0xA4, 0xB7, 0x00, 0x00, 0xA4, 0xB2, 0xA4, 0xB3, /* 0xC4-0xC7 */ + 0xC9, 0x54, 0xC9, 0x53, 0xA4, 0xB5, 0xA4, 0xB6, /* 0xC8-0xCB */ + 0x00, 
0x00, 0xA4, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xA5, 0x4A, 0xA5, 0x4B, 0xA5, 0x4C, 0xA5, 0x4D, /* 0xD4-0xD7 */ + 0xA5, 0x49, 0xA5, 0x50, 0xC9, 0x6A, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC9, 0x66, 0xC9, 0x69, 0xA5, 0x51, 0xA5, 0x61, /* 0xDC-0xDF */ + 0x00, 0x00, 0xC9, 0x68, 0x00, 0x00, 0xA5, 0x4E, /* 0xE0-0xE3 */ + 0xA5, 0x4F, 0xA5, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xC9, 0x65, 0xC9, 0x67, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA5, 0xF5, 0xC9, 0xB0, 0xA5, 0xF2, 0xA5, 0xF6, /* 0xF0-0xF3 */ + 0xC9, 0xBA, 0xC9, 0xAE, 0xA5, 0xF3, 0xC9, 0xB2, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA5, 0xF4, /* 0xF8-0xFB */ + 0x00, 0x00, 0xA5, 0xF7, 0x00, 0x00, 0xA5, 0xE9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_4F[512] = { + 0xC9, 0xB1, 0xA5, 0xF8, 0xC9, 0xB5, 0x00, 0x00, /* 0x00-0x03 */ + 0xC9, 0xB9, 0xC9, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xC9, 0xB3, 0xA5, 0xEA, 0xA5, 0xEC, 0xA5, 0xF9, /* 0x08-0x0B */ + 0x00, 0x00, 0xA5, 0xEE, 0xC9, 0xAB, 0xA5, 0xF1, /* 0x0C-0x0F */ + 0xA5, 0xEF, 0xA5, 0xF0, 0xC9, 0xBB, 0xC9, 0xB8, /* 0x10-0x13 */ + 0xC9, 0xAF, 0xA5, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xC9, 0xAC, 0xA5, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xC9, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xB7, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xC9, 0xAD, 0xCA, 0x66, 0x00, 0x00, 0xA7, 0x42, /* 0x2C-0x2F */ + 0xA6, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xCA, 0x67, /* 0x30-0x33 */ + 0xA6, 0xF1, 0x00, 0x00, 0xA7, 0x44, 0x00, 0x00, /* 0x34-0x37 */ + 0xA6, 0xF9, 0x00, 0x00, 0xA6, 0xF8, 0xCA, 0x5B, /* 0x38-0x3B */ + 0xA6, 0xFC, 0xA6, 0xF7, 0xCA, 0x60, 0xCA, 0x68, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCA, 0x64, 0x00, 0x00, 0xA6, 0xFA, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0xFD, 0xA6, 0xEE, /* 0x44-0x47 */ + 0xA7, 0x47, 0xCA, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCB, 0xBD, 0xA6, 0xEC, 0xA7, 0x43, 0xA6, 0xED, /* 0x4C-0x4F */ + 0xA6, 0xF5, 0xA6, 0xF6, 0xCA, 0x62, 0xCA, 0x5E, /* 0x50-0x53 */ + 0xA6, 0xFB, 0xA6, 0xF3, 0xCA, 0x5A, 0xA6, 0xEF, /* 0x54-0x57 */ + 0xCA, 0x65, 0xA7, 0x45, 0xA7, 0x48, 0xA6, 0xF2, /* 0x58-0x5B */ + 0xA7, 0x40, 0xA7, 0x46, 0xA6, 0xF0, 0xCA, 0x63, /* 0x5C-0x5F */ + 0xA7, 0x41, 0xCA, 0x69, 0xCA, 0x5C, 0xA6, 0xFE, /* 0x60-0x63 */ + 0xCA, 0x5F, 0x00, 0x00, 0x00, 0x00, 0xCA, 0x61, /* 0x64-0x67 */ + 0x00, 0x00, 0xA8, 0xD8, 0xCB, 0xBF, 0xCB, 0xCB, /* 0x68-0x6B */ + 0xA8, 0xD0, 0x00, 0x00, 0xCB, 0xCC, 0xA8, 0xCB, /* 0x6C-0x6F */ + 0xA8, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xCE, /* 0x70-0x73 */ + 0xCB, 0xB9, 0xA8, 0xD6, 0xCB, 0xB8, 0xCB, 0xBC, /* 0x74-0x77 */ + 0xCB, 0xC3, 0xCB, 0xC1, 0xA8, 0xDE, 0xA8, 0xD9, /* 0x78-0x7B */ + 0xCB, 0xB3, 0xCB, 0xB5, 0xA8, 0xDB, 0xA8, 0xCF, /* 0x7C-0x7F */ + + 0xCB, 0xB6, 0xCB, 0xC2, 0xCB, 0xC9, 0xA8, 0xD4, /* 0x80-0x83 */ + 0xCB, 0xBB, 0xCB, 0xB4, 0xA8, 0xD3, 0xCB, 0xB7, /* 0x84-0x87 */ + 0xA8, 0xD7, 0xCB, 0xBA, 0x00, 0x00, 0xA8, 0xD2, /* 0x88-0x8B */ + 0x00, 0x00, 0xA8, 0xCD, 0x00, 0x00, 0xA8, 0xDC, /* 0x8C-0x8F */ + 0xCB, 0xC4, 0xA8, 0xDD, 0xCB, 0xC8, 0x00, 0x00, /* 0x90-0x93 */ + 0xCB, 0xC6, 0xCB, 0xCA, 0xA8, 0xDA, 0xCB, 0xBE, /* 0x94-0x97 */ + 0xCB, 0xB2, 0x00, 0x00, 0xCB, 0xC0, 0xA8, 0xD1, /* 0x98-0x9B */ + 0xCB, 0xC5, 0xA8, 0xCC, 0xCB, 0xC7, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0x56, 0xAB, 0x4A, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xCD, 0xE0, 0xCD, 0xE8, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xAB, 0x49, 0xAB, 0x51, 0xAB, 0x5D, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xCD, 0xEE, 0xCD, 0xEC, 0xCD, 0xE7, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x4B, /* 0xBC-0xBF */ + 0xCD, 0xED, 0xCD, 0xE3, 0xAB, 0x59, 0xAB, 0x50, /* 0xC0-0xC3 */ + 0xAB, 0x58, 0xCD, 0xDE, 0x00, 0x00, 0xCD, 0xEA, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xCD, 0xE1, 0xAB, 0x54, 0xCD, 0xE2, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCD, 0xDD, 0xAB, 0x5B, 0xAB, 0x4E, /* 0xCC-0xCF */ + 0xAB, 0x57, 0xAB, 0x4D, 0x00, 0x00, 0xCD, 0xDF, /* 0xD0-0xD3 */ + 0xCD, 0xE4, 0x00, 0x00, 0xCD, 0xEB, 0xAB, 0x55, /* 0xD4-0xD7 */ + 0xAB, 0x52, 0xCD, 0xE6, 0xAB, 0x5A, 0xCD, 0xE9, /* 0xD8-0xDB */ + 0xCD, 0xE5, 0xAB, 0x4F, 0xAB, 0x5C, 0xAB, 0x53, /* 0xDC-0xDF */ + 0xAB, 0x4C, 0xAB, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCD, 0xEF, 0x00, 0x00, 0xAD, 0xD7, 0xAD, 0xC1, /* 0xEC-0xEF */ + 0x00, 0x00, 0xAD, 0xD1, 0x00, 0x00, 0xAD, 0xD6, /* 0xF0-0xF3 */ + 0xD0, 0xD0, 0xD0, 0xCF, 0xD0, 0xD4, 0xD0, 0xD5, /* 0xF4-0xF7 */ + 0xAD, 0xC4, 0x00, 0x00, 0xAD, 0xCD, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xDA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_50[512] = { + 0xAD, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xD0, 0xC9, 0xAD, 0xC7, 0xD0, 0xCA, /* 0x04-0x07 */ + 0x00, 0x00, 0xAD, 0xDC, 0x00, 0x00, 0xAD, 0xD3, /* 0x08-0x0B */ + 0xAD, 0xBE, 0xAD, 0xBF, 0xD0, 0xDD, 0xB0, 0xBF, /* 0x0C-0x0F */ + 0x00, 0x00, 0xAD, 0xCC, 0xAD, 0xCB, 0xD0, 0xCB, /* 0x10-0x13 */ + 0xAD, 0xCF, 0xD4, 0x5B, 0xAD, 0xC6, 0xD0, 0xD6, /* 0x14-0x17 */ + 0xAD, 0xD5, 0xAD, 0xD4, 0xAD, 0xCA, 0xD0, 0xCE, /* 0x18-0x1B */ + 0xD0, 0xD7, 0x00, 0x00, 0xD0, 0xC8, 0xAD, 0xC9, /* 0x1C-0x1F */ + 0xD0, 0xD8, 0xAD, 0xD2, 0xD0, 0xCC, 0xAD, 0xC0, /* 0x20-0x23 */ + 0x00, 0x00, 0xAD, 0xC3, 0xAD, 0xC2, 0xD0, 0xD9, /* 0x24-0x27 */ + 0xAD, 0xD0, 0xAD, 0xC5, 0xAD, 0xD9, 0xAD, 0xDB, /* 0x28-0x2B */ + 0xD0, 0xD3, 0xAD, 0xD8, 0x00, 0x00, 0xD0, 0xDB, /* 0x2C-0x2F */ + 0xD0, 0xCD, 0xD0, 0xDC, 0x00, 0x00, 0xD0, 0xD1, /* 0x30-0x33 */ + 0x00, 0x00, 0xD0, 0xDA, 0x00, 0x00, 0xD0, 0xD2, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xAD, 0xC8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD4, 0x63, 0xD4, 0x57, 0x00, 0x00, 0xB0, 0xB3, /* 0x40-0x43 */ + 0x00, 0x00, 0xD4, 0x5C, 0xD4, 0x62, 0xB0, 0xB2, /* 0x44-0x47 */ + 0xD4, 0x55, 0xB0, 0xB6, 0xD4, 0x59, 0xD4, 0x52, /* 0x48-0x4B */ + 0xB0, 0xB4, 0xD4, 0x56, 0xB0, 0xB9, 0xB0, 0xBE, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD4, 0x67, 0x00, 0x00, 0xD4, 0x51, /* 0x50-0x53 */ + 0x00, 0x00, 0xB0, 0xBA, 0x00, 0x00, 0xD4, 0x66, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xB5, 0xD4, 0x58, /* 0x58-0x5B */ + 0xB0, 0xB1, 0xD4, 0x53, 0xD4, 0x4F, 0xD4, 0x5D, /* 0x5C-0x5F */ + 0xD4, 0x50, 0xD4, 0x4E, 0xD4, 0x5A, 0xD4, 0x60, /* 0x60-0x63 */ + 0xD4, 0x61, 0xB0, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xD8, 0x5B, 0xD4, 0x5E, 0xD4, 0x4D, 0xD4, 0x5F, /* 0x68-0x6B */ + 0x00, 0x00, 0xB0, 0xC1, 0xD4, 0x64, 0xB0, 0xC0, /* 0x6C-0x6F */ + 0xD4, 0x4C, 0x00, 0x00, 0xD4, 0x54, 0xD4, 0x65, /* 0x70-0x73 */ + 0xB0, 0xBC, 
0xB0, 0xBB, 0xB0, 0xB8, 0xB0, 0xBD, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xAF, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xB0, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xB3, 0xC8, 0x00, 0x00, 0xD8, 0x5E, 0xD8, 0x57, /* 0x80-0x83 */ + 0x00, 0x00, 0xB3, 0xC5, 0x00, 0x00, 0xD8, 0x5F, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x55, /* 0x88-0x8B */ + 0xD8, 0x58, 0xB3, 0xC4, 0xD8, 0x59, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xB3, 0xC7, 0xD8, 0x5D, 0x00, 0x00, /* 0x90-0x93 */ + 0xD8, 0x53, 0xD8, 0x52, 0xB3, 0xC9, 0x00, 0x00, /* 0x94-0x97 */ + 0xB3, 0xCA, 0xB3, 0xC6, 0xB3, 0xCB, 0xD8, 0x51, /* 0x98-0x9B */ + 0xD8, 0x5C, 0xD8, 0x5A, 0xD8, 0x54, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xC3, 0xD8, 0x56, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xB6, 0xCA, 0xB6, 0xC4, 0xDC, 0xB7, 0xB6, 0xCD, /* 0xAC-0xAF */ + 0xDC, 0xBD, 0xDC, 0xC0, 0xB6, 0xC6, 0xB6, 0xC7, /* 0xB0-0xB3 */ + 0xDC, 0xBA, 0xB6, 0xC5, 0xDC, 0xC3, 0xB6, 0xCB, /* 0xB4-0xB7 */ + 0xDC, 0xC4, 0x00, 0x00, 0xDC, 0xBF, 0xB6, 0xCC, /* 0xB8-0xBB */ + 0x00, 0x00, 0xDC, 0xB4, 0xB6, 0xC9, 0xDC, 0xB5, /* 0xBC-0xBF */ + 0x00, 0x00, 0xDC, 0xBE, 0xDC, 0xBC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDC, 0xB8, 0xB6, 0xC8, 0xDC, 0xB6, 0xB6, 0xCE, /* 0xC4-0xC7 */ + 0xDC, 0xBB, 0xDC, 0xC2, 0xDC, 0xB9, 0xDC, 0xC1, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xB6, 0xB9, 0xB3, /* 0xCC-0xCF */ + 0x00, 0x00, 0xB9, 0xB4, 0x00, 0x00, 0xE0, 0xF9, /* 0xD0-0xD3 */ + 0xE0, 0xF1, 0xB9, 0xB2, 0xB9, 0xAF, 0xE0, 0xF2, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xB1, 0xE0, 0xF5, /* 0xD8-0xDB */ + 0x00, 0x00, 0xE0, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xE0, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xFD, /* 0xE0-0xE3 */ + 0xE0, 0xF8, 0xB9, 0xAE, 0xE0, 0xF0, 0xB9, 0xAC, /* 0xE4-0xE7 */ + 0xE0, 0xF3, 0xB9, 0xB7, 0xE0, 0xF6, 0x00, 0x00, /* 0xE8-0xEB */ + 0xE0, 0xFA, 0xB9, 0xB0, 0xB9, 0xAD, 0xE0, 0xFC, /* 0xEC-0xEF */ + 0xE0, 0xFB, 0xB9, 0xB5, 0x00, 0x00, 0xE0, 0xF4, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xBB, 0xF8, 0xE4, 0xEC, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xE4, 0xE9, 0xBB, 0xF9, 0x00, 0x00, 0xBB, 0xF7, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE4, 0xF0, 0xE4, 0xED, 0xE4, 0xE6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_51[512] = { + 0xBB, 0xF6, 0x00, 0x00, 0xBB, 0xFA, 0xE4, 0xE7, /* 0x00-0x03 */ + 0xBB, 0xF5, 0xBB, 0xFD, 0xE4, 0xEA, 0xE4, 0xEB, /* 0x04-0x07 */ + 0xBB, 0xFB, 0xBB, 0xFC, 0xE4, 0xF1, 0xE4, 0xEE, /* 0x08-0x0B */ + 0xE4, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xBE, 0xAA, 0xE8, 0xF8, 0xBE, 0xA7, 0xE8, 0xF5, /* 0x10-0x13 */ + 0xBE, 0xA9, 0xBE, 0xAB, 0x00, 0x00, 0xE8, 0xF6, /* 0x14-0x17 */ + 0xBE, 0xA8, 0x00, 0x00, 0xE8, 0xF7, 0x00, 0x00, /* 0x18-0x1B */ + 0xE8, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x76, /* 0x1C-0x1F */ + 0xEC, 0xBD, 0xC0, 0x77, 0xEC, 0xBB, 0x00, 0x00, /* 0x20-0x23 */ + 0xEC, 0xBC, 0xEC, 0xBA, 0xEC, 0xB9, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xEC, 0xBE, 0xC0, 0x75, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEF, 0xB8, 0xEF, 0xB9, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE4, 0xE8, 0xEF, 0xB7, 0xC0, 0x78, 0xC3, 0x5F, /* 0x30-0x33 */ + 0xF1, 0xEB, 0xF1, 0xEC, 0x00, 0x00, 0xC4, 0xD7, /* 0x34-0x37 */ + 0xC4, 0xD8, 0xF5, 0xC1, 0xF5, 0xC0, 0xC5, 0x6C, /* 0x38-0x3B */ + 0xC5, 0x6B, 0xF7, 0xD0, 0x00, 0x00, 0xA4, 0x49, /* 0x3C-0x3F */ + 0xA4, 0x61, 0xA4, 0xB9, 0x00, 0x00, 0xA4, 0xB8, /* 0x40-0x43 */ + 0xA5, 0x53, 0xA5, 0x52, 0xA5, 0xFC, 0xA5, 0xFB, /* 0x44-0x47 */ + 0xA5, 0xFD, 
0xA5, 0xFA, 0x00, 0x00, 0xA7, 0x4A, /* 0x48-0x4B */ + 0xA7, 0x49, 0xA7, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xE0, 0x00, 0x00, /* 0x50-0x53 */ + 0xA8, 0xDF, 0xA8, 0xE1, 0x00, 0x00, 0xAB, 0x5E, /* 0x54-0x57 */ + 0x00, 0x00, 0xA2, 0x59, 0xD0, 0xDE, 0xA2, 0x5A, /* 0x58-0x5B */ + 0xB0, 0xC2, 0xA2, 0x5C, 0xA2, 0x5B, 0xD8, 0x60, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA2, 0x5D, 0xB9, 0xB8, 0xA2, 0x5E, /* 0x60-0x63 */ + 0x00, 0x00, 0xA4, 0x4A, 0x00, 0x00, 0xA4, 0xBA, /* 0x64-0x67 */ + 0xA5, 0xFE, 0xA8, 0xE2, 0x00, 0x00, 0xA4, 0x4B, /* 0x68-0x6B */ + 0xA4, 0xBD, 0xA4, 0xBB, 0xA4, 0xBC, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xA6, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xA7, 0x4C, 0xA8, 0xE4, 0xA8, 0xE3, /* 0x74-0x77 */ + 0xA8, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xAD, 0xDD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xBE, 0xAC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x4E, /* 0x84-0x87 */ + 0x00, 0x00, 0xA5, 0x54, 0xA5, 0x55, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xA6, 0x41, 0x00, 0x00, 0xCA, 0x6A, /* 0x8C-0x8F */ + 0x00, 0x00, 0xAB, 0x60, 0xAB, 0x5F, 0xD0, 0xE0, /* 0x90-0x93 */ + 0xD0, 0xDF, 0xB0, 0xC3, 0x00, 0x00, 0xA4, 0xBE, /* 0x94-0x97 */ + 0xC9, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xCD, 0x00, 0x00, /* 0x9C-0x9F */ + 0xAB, 0x61, 0x00, 0x00, 0xAD, 0xE0, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xAD, 0xDE, 0xAD, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xBE, 0xAD, 0x00, 0x00, /* 0xA8-0xAB */ + 0xA5, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xA6, 0x42, 0xC9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0x4D, 0xA7, 0x4E, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xCA, 0x6B, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xCB, 0xCE, 0xA8, 0xE6, 0xCB, 0xCF, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD0, 0xE2, 0xD0, 0xE3, 0xAD, 0xE3, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xD0, 0xE4, 0x00, 0x00, 0xD0, 0xE1, 0xAD, 0xE4, /* 0xC8-0xCB */ + 0xAD, 0xE2, 0xAD, 0xE1, 0xD0, 0xE5, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD4, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xD8, 0x61, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xC5, /* 0xD4-0xD7 */ + 0xE1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xBB, 0xFE, 0xBE, 0xAE, 0xE8, 0xF9, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA4, 0x4C, 0xA4, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xB0, 0xC4, 0xB3, 0xCD, 0x00, 0x00, 0xB9, 0xB9, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xC9, 0x42, 0xA4, 0xBF, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xA5, 0x59, 0xA5, 0x57, 0xA5, 0x58, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xA8, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_52[512] = { + 0xA4, 0x4D, 0xA4, 0x4E, 0x00, 0x00, 0xA4, 0x62, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0xC0, 0xA4, 0xC1, /* 0x04-0x07 */ + 0xA4, 0xC2, 0xC9, 0xBE, 0xA5, 0x5A, 0x00, 0x00, /* 0x08-0x0B */ + 0xC9, 0x6B, 0x00, 0x00, 0xA6, 0x46, 0x00, 0x00, /* 0x0C-0x0F */ + 0xC9, 0xBF, 0xA6, 0x44, 0xA6, 0x45, 0xC9, 0xBD, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0x47, 0xA6, 0x43, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xCA, 0x6C, 
0xAA, 0xEC, 0xCA, 0x6D, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xCA, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xA7, 0x50, 0xA7, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xA7, 0x53, 0xA7, 0x51, 0xA7, 0x52, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xED, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA8, 0xEC, 0xCB, 0xD4, 0xCB, 0xD1, 0xCB, 0xD2, /* 0x30-0x33 */ + 0x00, 0x00, 0xCB, 0xD0, 0xA8, 0xEE, 0xA8, 0xEA, /* 0x34-0x37 */ + 0xA8, 0xE9, 0x00, 0x00, 0xA8, 0xEB, 0xA8, 0xE8, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xA8, 0xEF, 0x00, 0x00, 0xAB, 0x63, /* 0x40-0x43 */ + 0xCD, 0xF0, 0x00, 0x00, 0xCB, 0xD3, 0xAB, 0x68, /* 0x44-0x47 */ + 0x00, 0x00, 0xCD, 0xF1, 0xAB, 0x64, 0xAB, 0x67, /* 0x48-0x4B */ + 0xAB, 0x66, 0xAB, 0x65, 0xAB, 0x62, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE8, 0x00, 0x00, /* 0x50-0x53 */ + 0xAD, 0xE7, 0xD0, 0xEB, 0xAD, 0xE5, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xE7, 0xAD, 0xE8, /* 0x58-0x5B */ + 0xAD, 0xE6, 0xAD, 0xE9, 0xD0, 0xE9, 0xD0, 0xEA, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD0, 0xE6, 0xD0, 0xEC, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xB3, 0xD1, 0xB0, 0xC5, 0xD4, 0x69, /* 0x68-0x6B */ + 0xD4, 0x6B, 0xD4, 0x6A, 0xD4, 0x6C, 0xB0, 0xC6, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xCE, 0x00, 0x00, /* 0x70-0x73 */ + 0xB3, 0xCF, 0xB3, 0xD0, 0x00, 0x00, 0xB6, 0xD0, /* 0x74-0x77 */ + 0xDC, 0xC7, 0x00, 0x00, 0xDC, 0xC6, 0xDC, 0xC8, /* 0x78-0x7B */ + 0xDC, 0xC9, 0xB6, 0xD1, 0x00, 0x00, 0xB6, 0xCF, /* 0x7C-0x7F */ + + 0xE1, 0x41, 0xE1, 0x42, 0xB9, 0xBB, 0xB9, 0xBA, /* 0x80-0x83 */ + 0xE3, 0x5A, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x40, /* 0x84-0x87 */ + 0xBC, 0x41, 0xBC, 0x42, 0xBC, 0x44, 0xE4, 0xF2, /* 0x88-0x8B */ + 0xE4, 0xF3, 0xBC, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBE, 0xAF, 0x00, 0x00, 0xBE, 0xB0, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xED, 0xF5, 0xC3, /* 0x94-0x97 */ + 0xF5, 0xC2, 0xF7, 0xD1, 0x00, 0x00, 0xA4, 0x4F, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA5, 0x5C, /* 0x9C-0x9F */ + 0xA5, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x48, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xC0, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xA7, 0x55, 0xA7, 0x56, 0xA7, 0x54, /* 0xA8-0xAB */ + 0xA7, 0x57, 0xCA, 0x6F, 0xCA, 0x70, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF1, /* 0xB8-0xBB */ + 0xCB, 0xD5, 0x00, 0x00, 0xA8, 0xF0, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCD, 0xF2, 0xAB, 0x6C, 0xCD, 0xF3, 0xAB, 0x6B, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x69, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xAB, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD0, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xC7, 0xD4, 0x6E, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xB0, 0xCA, 0xD4, 0x6D, 0xB1, 0xE5, /* 0xD4-0xD7 */ + 0xB0, 0xC9, 0xB0, 0xC8, 0x00, 0x00, 0xB3, 0xD4, /* 0xD8-0xDB */ + 0x00, 0x00, 0xB3, 0xD3, 0xB3, 0xD2, 0xB6, 0xD2, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xD5, 0xB6, 0xD6, /* 0xE0-0xE3 */ + 0xB6, 0xD4, 0x00, 0x00, 0xB6, 0xD3, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xE1, 0x43, 0x00, 0x00, 0xE1, 0x44, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xF5, /* 0xEC-0xEF */ + 0xBC, 0x45, 0xE4, 0xF4, 0x00, 0x00, 0xBE, 0xB1, /* 0xF0-0xF3 */ + 
0xEC, 0xBF, 0xC0, 0x79, 0x00, 0x00, 0xF1, 0xEE, /* 0xF4-0xF7 */ + 0xC4, 0x55, 0x00, 0x00, 0xA4, 0x63, 0xA4, 0xC3, /* 0xF8-0xFB */ + 0xC9, 0x56, 0x00, 0x00, 0xA4, 0xC4, 0xA4, 0xC5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_53[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xA5, 0x5D, 0xA5, 0x5E, 0x00, 0x00, /* 0x04-0x07 */ + 0xA6, 0x49, 0xCA, 0x71, 0xCB, 0xD6, 0xCB, 0xD7, /* 0x08-0x0B */ + 0x00, 0x00, 0xAB, 0x6D, 0xD0, 0xEE, 0xB0, 0xCC, /* 0x0C-0x0F */ + 0xB0, 0xCB, 0xD8, 0x63, 0xD8, 0x62, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xA4, 0x50, 0xA4, 0xC6, 0xA5, 0x5F, /* 0x14-0x17 */ + 0x00, 0x00, 0xB0, 0xCD, 0xC9, 0x43, 0x00, 0x00, /* 0x18-0x1B */ + 0xC9, 0x6C, 0xA5, 0x60, 0x00, 0x00, 0xC9, 0xC2, /* 0x1C-0x1F */ + 0xA6, 0x4B, 0xA6, 0x4A, 0xC9, 0xC1, 0xA7, 0x58, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEA, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD4, 0x6F, 0x00, 0x00, 0xB6, 0xD7, /* 0x2C-0x2F */ + 0xE1, 0x45, 0xB9, 0xBC, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xE8, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFD, /* 0x34-0x37 */ + 0x00, 0x00, 0xA4, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xCB, 0xD8, 0xCD, 0xF4, 0xB0, 0xD0, 0xB0, 0xCE, /* 0x3C-0x3F */ + 0xB0, 0xCF, 0xA4, 0x51, 0x00, 0x00, 0xA4, 0x64, /* 0x40-0x43 */ + 0xA2, 0xCD, 0xA4, 0xCA, 0x00, 0x00, 0xA4, 0xC9, /* 0x44-0x47 */ + 0xA4, 0xC8, 0xA5, 0x63, 0xA5, 0x62, 0x00, 0x00, /* 0x48-0x4B */ + 0xC9, 0x6D, 0xC9, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xA8, 0xF5, 0xA8, 0xF2, 0xA8, 0xF4, /* 0x50-0x53 */ + 0xA8, 0xF3, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x6E, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xD5, 0x00, 0x00, /* 0x58-0x5B */ + 0xA4, 0x52, 0x00, 0x00, 0xA4, 0xCB, 0x00, 0x00, /* 0x5C-0x5F */ + 0xA5, 0x65, 0xA5, 0x64, 0x00, 0x00, 0xCA, 0x72, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xF6, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xC9, 0x57, 0x00, 0x00, 0xA5, 0x67, 0xA5, 0x66, /* 0x6C-0x6F */ + 0xA6, 0x4C, 0xA6, 0x4D, 0xCA, 0x73, 0xA7, 0x59, /* 0x70-0x73 */ + 0x00, 0x00, 0xA7, 0x5A, 0x00, 0x00, 0xA8, 0xF7, /* 0x74-0x77 */ + 0xA8, 0xF8, 0xA8, 0xF9, 0x00, 0x00, 0xAB, 0x6F, /* 0x78-0x7B */ + 0xCD, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEB, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xC9, 0x44, 0x00, 0x00, /* 0x80-0x83 */ + 0xA4, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xC4, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0x74, 0xCA, 0x75, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xD9, 0x00, 0x00, /* 0x90-0x93 */ + 0xCB, 0xDA, 0x00, 0x00, 0xCD, 0xF7, 0xCD, 0xF6, /* 0x94-0x97 */ + 0xCD, 0xF9, 0xCD, 0xF8, 0xAB, 0x70, 0x00, 0x00, /* 0x98-0x9B */ + 0xD4, 0x70, 0xAD, 0xED, 0xD0, 0xEF, 0xAD, 0xEC, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xD8, 0x64, 0xB3, 0xD6, 0x00, 0x00, 0xD8, 0x65, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE1, 0x46, 0xB9, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0x46, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF1, 0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xC9, 0x58, 0x00, 0x00, 0xA5, 0x68, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xD1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 
0xA4, 0x53, 0xA4, 0x65, 0xA4, 0xCE, 0xA4, 0xCD, /* 0xC8-0xCB */ + 0x00, 0x00, 0xA4, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xA8, 0xFB, 0x00, 0x00, 0xA8, 0xFA, 0xA8, 0xFC, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0x71, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xEE, /* 0xDC-0xDF */ + 0x00, 0x00, 0xE8, 0xFB, 0xC2, 0x4F, 0xA4, 0x66, /* 0xE0-0xE3 */ + 0xA5, 0x6A, 0xA5, 0x79, 0xA5, 0x74, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xA5, 0x6F, 0xA5, 0x6E, 0xA5, 0x75, 0xA5, 0x73, /* 0xE8-0xEB */ + 0xA5, 0x6C, 0xA5, 0x7A, 0xA5, 0x6D, 0xA5, 0x69, /* 0xEC-0xEF */ + 0xA5, 0x78, 0xA5, 0x77, 0xA5, 0x76, 0xA5, 0x6B, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xA5, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xA5, 0x71, 0x00, 0x00, 0x00, 0x00, 0xA5, 0x7B, /* 0xF8-0xFB */ + 0xA5, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_54[512] = { + 0x00, 0x00, 0xA6, 0x53, 0x00, 0x00, 0xA6, 0x59, /* 0x00-0x03 */ + 0xA6, 0x55, 0x00, 0x00, 0xA6, 0x5B, 0xC9, 0xC5, /* 0x04-0x07 */ + 0xA6, 0x58, 0xA6, 0x4E, 0xA6, 0x51, 0xA6, 0x54, /* 0x08-0x0B */ + 0xA6, 0x50, 0xA6, 0x57, 0xA6, 0x5A, 0xA6, 0x4F, /* 0x0C-0x0F */ + 0xA6, 0x52, 0xA6, 0x56, 0xA6, 0x5C, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xCA, 0x7E, 0xCA, 0x7B, 0x00, 0x00, 0xA7, 0x67, /* 0x18-0x1B */ + 0xCA, 0x7C, 0xA7, 0x5B, 0xA7, 0x5D, 0xA7, 0x75, /* 0x1C-0x1F */ + 0xA7, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xCA, 0xA5, 0xCA, 0x7D, 0xA7, 0x5F, 0xA7, 0x61, /* 0x24-0x27 */ + 0xCA, 0xA4, 0xA7, 0x68, 0xCA, 0x78, 0xA7, 0x74, /* 0x28-0x2B */ + 0xA7, 0x76, 0xA7, 0x5C, 0xA7, 0x6D, 0x00, 0x00, /* 0x2C-0x2F */ + 0xCA, 0x76, 0xA7, 0x73, 0x00, 0x00, 0xA7, 0x64, /* 0x30-0x33 */ + 0x00, 0x00, 0xA7, 0x6E, 0xA7, 0x6F, 0xCA, 0x77, /* 0x34-0x37 */ + 0xA7, 0x6C, 0xA7, 0x6A, 0x00, 0x00, 0xA7, 0x6B, /* 0x38-0x3B */ + 0xA7, 0x71, 0xCA, 0xA1, 0xA7, 0x5E, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA7, 0x72, 0xCA, 0xA3, 0xA7, 0x66, 0xA7, 0x63, /* 0x40-0x43 */ + 0x00, 0x00, 0xCA, 0x7A, 0xA7, 0x62, 0xCA, 0xA6, /* 0x44-0x47 */ + 0xA7, 0x65, 0x00, 0x00, 0xA7, 0x69, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0x60, 0xCA, 0xA2, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCA, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xCB, 0xEB, 0xCB, 0xEA, 0xA9, 0x4F, 0xCB, 0xED, /* 0x60-0x63 */ + 0xCB, 0xEF, 0xCB, 0xE4, 0xCB, 0xE7, 0xCB, 0xEE, /* 0x64-0x67 */ + 0xA9, 0x50, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE1, /* 0x68-0x6B */ + 0xCB, 0xE5, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xE9, /* 0x6C-0x6F */ + 0xCE, 0x49, 0xA9, 0x4B, 0xCE, 0x4D, 0xA8, 0xFD, /* 0x70-0x73 */ + 0xCB, 0xE6, 0xA8, 0xFE, 0xA9, 0x4C, 0xA9, 0x45, /* 0x74-0x77 */ + 0xA9, 0x41, 0x00, 0x00, 0xCB, 0xE2, 0xA9, 0x44, /* 0x78-0x7B */ + 0xA9, 0x49, 0xA9, 0x52, 0xCB, 0xE3, 0xCB, 0xDC, /* 0x7C-0x7F */ + + 0xA9, 0x43, 0xCB, 0xDD, 0xCB, 0xDF, 0x00, 0x00, /* 0x80-0x83 */ + 0xA9, 0x46, 0x00, 0x00, 0xA9, 0x48, 0xCB, 0xDB, /* 0x84-0x87 */ + 0xCB, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xA9, 0x51, /* 0x88-0x8B */ + 0xA9, 0x4D, 0xCB, 0xE8, 0xA9, 0x53, 0x00, 0x00, /* 0x8C-0x8F */ + 0xA9, 0x4A, 0xCB, 0xDE, 0xA9, 0x47, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xA9, 0x42, 0xA9, 0x40, 0x00, 0x00, /* 0x94-0x97 */ + 0xCB, 0xEC, 0x00, 0x00, 0xA9, 0x4E, 0x00, 0x00, /* 0x98-0x9B */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCE, 0x48, 0xCD, 0xFB, 0xCE, 0x4B, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCD, 0xFD, 0xAB, 0x78, 0xAB, 0xA8, /* 0xA4-0xA7 */ + 0xAB, 0x74, 0xAB, 0xA7, 0xAB, 0x7D, 0xAB, 0xA4, /* 0xA8-0xAB */ + 0xAB, 0x72, 0xCD, 0xFC, 0xCE, 0x43, 0xAB, 0xA3, /* 0xAC-0xAF */ + 0xCE, 0x4F, 0xAB, 0xA5, 0x00, 0x00, 0xAB, 0x79, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x45, 0xCE, 0x42, /* 0xB4-0xB7 */ + 0xAB, 0x77, 0x00, 0x00, 0xCD, 0xFA, 0xAB, 0xA6, /* 0xB8-0xBB */ + 0xCE, 0x4A, 0xAB, 0x7C, 0xCE, 0x4C, 0xAB, 0xA9, /* 0xBC-0xBF */ + 0xAB, 0x73, 0xAB, 0x7E, 0xAB, 0x7B, 0xCE, 0x40, /* 0xC0-0xC3 */ + 0xAB, 0xA1, 0xCE, 0x46, 0xCE, 0x47, 0xAB, 0x7A, /* 0xC4-0xC7 */ + 0xAB, 0xA2, 0xAB, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0x75, 0xCD, 0xFE, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x4E, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD1, 0x44, 0xAD, 0xFB, 0xD0, 0xF1, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD0, 0xF6, 0xAD, 0xF4, 0xAE, 0x40, 0xD0, 0xF4, /* 0xE4-0xE7 */ + 0xAD, 0xEF, 0xAD, 0xF9, 0xAD, 0xFE, 0xD0, 0xFB, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAD, 0xFA, 0xAD, 0xFD, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD0, 0xFE, 0xAD, 0xF5, 0xD0, 0xF5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x42, /* 0xF4-0xF7 */ + 0xD1, 0x43, 0x00, 0x00, 0xAD, 0xF7, 0xD1, 0x41, /* 0xF8-0xFB */ + 0xAD, 0xF3, 0xAE, 0x43, 0x00, 0x00, 0xD0, 0xF8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_55[512] = { + 0x00, 0x00, 0xAD, 0xF1, 0x00, 0x00, 0xD1, 0x46, /* 0x00-0x03 */ + 0xD0, 0xF9, 0xD0, 0xFD, 0xAD, 0xF6, 0xAE, 0x42, /* 0x04-0x07 */ + 0xD0, 0xFA, 0xAD, 0xFC, 0xD1, 0x40, 0xD1, 0x47, /* 0x08-0x0B */ + 0xD4, 0xA1, 0x00, 0x00, 0xD1, 0x45, 0xAE, 0x44, /* 0x0C-0x0F */ + 0xAD, 0xF0, 0xD0, 0xFC, 0xD0, 0xF3, 0x00, 0x00, /* 0x10-0x13 */ + 0xAD, 0xF8, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF2, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF7, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xF0, 0xAE, 0x41, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0x77, 0x00, 0x00, /* 0x28-0x2B */ + 0xB0, 0xE4, 0xD4, 0xA7, 0xB0, 0xE2, 0xB0, 0xDF, /* 0x2C-0x2F */ + 0xD4, 0x7C, 0xB0, 0xDB, 0xD4, 0xA2, 0xB0, 0xE6, /* 0x30-0x33 */ + 0xD4, 0x76, 0xD4, 0x7B, 0xD4, 0x7A, 0xAD, 0xF2, /* 0x34-0x37 */ + 0xB0, 0xE1, 0xD4, 0xA5, 0x00, 0x00, 0xD4, 0xA8, /* 0x38-0x3B */ + 0xD4, 0x73, 0x00, 0x00, 0xB3, 0xE8, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD4, 0xA9, 0xB0, 0xE7, 0x00, 0x00, 0xB0, 0xD9, /* 0x40-0x43 */ + 0xB0, 0xD6, 0xD4, 0x7E, 0xB0, 0xD3, 0x00, 0x00, /* 0x44-0x47 */ + 0xD4, 0xA6, 0x00, 0x00, 0xB0, 0xDA, 0xD4, 0xAA, /* 0x48-0x4B */ + 0x00, 0x00, 0xD4, 0x74, 0xD4, 0xA4, 0xB0, 0xDD, /* 0x4C-0x4F */ + 0xD4, 0x75, 0xD4, 0x78, 0xD4, 0x7D, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xB0, 0xDE, 0xB0, 0xDC, 0xB0, 0xE8, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xB0, 0xE3, 0x00, 0x00, 0xB0, 0xD7, 0xB1, 0xD2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB0, 0xD8, 0xD4, 0x79, 0xB0, 0xE5, /* 0x60-0x63 */ + 0xB0, 0xE0, 0xD4, 0xA3, 0xB0, 0xD5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xD4, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD4, 0x71, 0xD4, 0x72, 0xD8, 0x6A, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xD7, /* 0x78-0x7B */ + 0xB3, 0xDA, 0xD8, 0x75, 0xB3, 0xEE, 0xD8, 0x78, /* 0x7C-0x7F */ + + 0xB3, 0xD8, 0xD8, 0x71, 0xB3, 0xDE, 0xB3, 0xE4, /* 0x80-0x83 */ + 0xB5, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xE2, /* 0x84-0x87 */ + 0xD8, 0x6E, 0xB3, 0xEF, 0xB3, 0xDB, 0xB3, 0xE3, /* 0x88-0x8B */ + 0xD8, 0x76, 0xDC, 0xD7, 0xD8, 0x7B, 0xD8, 0x6F, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD8, 0x66, 0xD8, 0x73, 0xD8, 0x6D, /* 0x90-0x93 */ + 0xB3, 0xE1, 0xD8, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xB3, 0xDD, 0xB3, 0xF1, 0xB3, 0xEA, 0x00, 0x00, /* 0x98-0x9B */ + 0xB3, 0xDF, 0xB3, 0xDC, 0x00, 0x00, 0xB3, 0xE7, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD8, 0x7A, 0xD8, 0x6C, 0xD8, 0x72, /* 0xA0-0xA3 */ + 0xD8, 0x74, 0xD8, 0x68, 0xD8, 0x77, 0xB3, 0xD9, /* 0xA4-0xA7 */ + 0xD8, 0x67, 0x00, 0x00, 0xB3, 0xE0, 0xB3, 0xF0, /* 0xA8-0xAB */ + 0xB3, 0xEC, 0xD8, 0x69, 0xB3, 0xE6, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xB3, 0xED, 0xB3, 0xE9, 0xB3, 0xE5, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xD8, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xEB, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xD5, /* 0xBC-0xBF */ + 0xDC, 0xD1, 0x00, 0x00, 0xDC, 0xE0, 0xDC, 0xCA, /* 0xC0-0xC3 */ + 0xDC, 0xD3, 0xB6, 0xE5, 0xB6, 0xE6, 0xB6, 0xDE, /* 0xC4-0xC7 */ + 0xDC, 0xDC, 0xB6, 0xE8, 0xDC, 0xCF, 0xDC, 0xCE, /* 0xC8-0xCB */ + 0xDC, 0xCC, 0xDC, 0xDE, 0xB6, 0xDC, 0xDC, 0xD8, /* 0xCC-0xCF */ + 0xDC, 0xCD, 0xB6, 0xDF, 0xDC, 0xD6, 0xB6, 0xDA, /* 0xD0-0xD3 */ + 0xDC, 0xD2, 0xDC, 0xD9, 0xDC, 0xDB, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xDC, 0xDF, 0xB6, 0xE3, 0xDC, 0xCB, /* 0xD8-0xDB */ + 0xB6, 0xDD, 0xDC, 0xD0, 0x00, 0x00, 0xB6, 0xD8, /* 0xDC-0xDF */ + 0x00, 0x00, 0xB6, 0xE4, 0xDC, 0xDA, 0xB6, 0xE0, /* 0xE0-0xE3 */ + 0xB6, 0xE1, 0xB6, 0xE7, 0xB6, 0xDB, 0xA2, 0x5F, /* 0xE4-0xE7 */ + 0xB6, 0xD9, 0xDC, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xE2, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xDC, 0xDD, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xCD, 0xB9, 0xC8, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE1, 0x55, 0xE1, 0x51, 0x00, 0x00, /* 0xF8-0xFB */ + 0xE1, 0x4B, 0xB9, 0xC2, 0xB9, 0xBE, 0xE1, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_56[512] = { + 0xB9, 0xBF, 0xE1, 0x4E, 0xE1, 0x50, 0x00, 0x00, /* 0x00-0x03 */ + 0xE1, 0x53, 0x00, 0x00, 0xB9, 0xC4, 0x00, 0x00, /* 0x04-0x07 */ + 0xB9, 0xCB, 0xB9, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xE1, 0x49, 0xB9, 0xC6, 0xB9, 0xC7, 0xE1, 0x4C, /* 0x0C-0x0F */ + 0xB9, 0xCC, 0x00, 0x00, 0xE1, 0x4A, 0xE1, 0x4F, /* 0x10-0x13 */ + 0xB9, 0xC3, 0xE1, 0x48, 0xB9, 0xC9, 0xB9, 0xC1, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB9, 0xC0, /* 0x18-0x1B */ + 0xE1, 0x4D, 0xE1, 0x52, 0x00, 0x00, 0xB9, 0xCA, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x47, /* 0x24-0x27 */ + 0x00, 0x00, 0xBC, 0x4D, 0xE5, 0x47, 0x00, 0x00, /* 0x28-0x2B */ + 0xE5, 0x44, 0x00, 0x00, 0xBC, 0x47, 0xBC, 0x53, /* 0x2C-0x2F */ + 0xBC, 0x54, 0x00, 0x00, 0xBC, 0x4A, 0xE5, 0x42, /* 0x30-0x33 */ + 0xBC, 0x4C, 0xE4, 0xF9, 0xBC, 0x52, 0x00, 0x00, /* 0x34-0x37 */ + 0xE5, 0x46, 0xBC, 0x49, 0xE5, 0x48, 0xBC, 0x48, /* 0x38-0x3B */ + 0x00, 0x00, 0xE5, 0x43, 0xE5, 0x45, 0xBC, 0x4B, /* 0x3C-0x3F */ + 0xE5, 0x41, 0xE4, 0xFA, 0xE4, 0xF7, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 
0x00, 0xD8, 0x6B, 0xE4, 0xFD, 0x00, 0x00, /* 0x44-0x47 */ + 0xE4, 0xF6, 0xE4, 0xFC, 0xE4, 0xFB, 0x00, 0x00, /* 0x48-0x4B */ + 0xE4, 0xF8, 0x00, 0x00, 0xBC, 0x4F, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x4E, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0x50, /* 0x54-0x57 */ + 0xE4, 0xFE, 0xBE, 0xB2, 0xE5, 0x40, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x45, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE8, 0xFD, 0x00, 0x00, 0xBE, 0xBE, 0xE9, 0x42, /* 0x60-0x63 */ + 0xBE, 0xB6, 0xBE, 0xBA, 0xE9, 0x41, 0x00, 0x00, /* 0x64-0x67 */ + 0xBE, 0xB9, 0xBE, 0xB5, 0xBE, 0xB8, 0xBE, 0xB3, /* 0x68-0x6B */ + 0xBE, 0xBD, 0xE9, 0x43, 0xE8, 0xFE, 0xBE, 0xBC, /* 0x6C-0x6F */ + 0xE8, 0xFC, 0xBE, 0xBB, 0xE9, 0x44, 0xE9, 0x40, /* 0x70-0x73 */ + 0xBC, 0x51, 0x00, 0x00, 0xBE, 0xBF, 0xE9, 0x46, /* 0x74-0x77 */ + 0xBE, 0xB7, 0xBE, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC6, 0xEC, 0xC8, /* 0x7C-0x7F */ + + 0xC0, 0x7B, 0xEC, 0xC9, 0xEC, 0xC7, 0xEC, 0xC5, /* 0x80-0x83 */ + 0xEC, 0xC4, 0xC0, 0x7D, 0xEC, 0xC3, 0xC0, 0x7E, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xEC, 0xC1, 0xEC, 0xC2, 0xC0, 0x7A, 0xC0, 0xA1, /* 0x8C-0x8F */ + 0xC0, 0x7C, 0x00, 0x00, 0x00, 0x00, 0xEC, 0xC0, /* 0x90-0x93 */ + 0x00, 0x00, 0xC2, 0x50, 0x00, 0x00, 0xEF, 0xBC, /* 0x94-0x97 */ + 0xEF, 0xBA, 0xEF, 0xBF, 0xEF, 0xBD, 0x00, 0x00, /* 0x98-0x9B */ + 0xEF, 0xBB, 0xEF, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xC3, 0x60, 0xF1, 0xF2, 0xF1, 0xF3, /* 0xA4-0xA7 */ + 0xC4, 0x56, 0x00, 0x00, 0xF1, 0xF4, 0xF1, 0xF0, /* 0xA8-0xAB */ + 0xF1, 0xF5, 0xF1, 0xF1, 0xC2, 0x51, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xFE, 0xF4, 0x41, /* 0xB0-0xB3 */ + 0xC4, 0x59, 0xF4, 0x40, 0xC4, 0x58, 0xC4, 0x57, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xC4, 0x5A, 0xF5, 0xC5, 0xF5, 0xC6, 0x00, 0x00, /* 0xBC-0xBF */ + 0xC4, 0xDA, 0xC4, 0xD9, 0xC4, 0xDB, 0xF5, 0xC4, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF6, 0xD8, 0xF6, 0xD7, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xC5, 0x6D, 0xC5, 0x6F, 0xC5, 0x6E, 0xF6, 0xD9, /* 0xC8-0xCB */ + 0xC5, 0xC8, 0xF8, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xC5, 0xF1, 0x00, 0x00, 0xF8, 0xA5, /* 0xD0-0xD3 */ + 0xF8, 0xEE, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x49, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0x7D, 0xA5, 0x7C, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA6, 0x5F, 0xA6, 0x5E, 0xC9, 0xC7, /* 0xDC-0xDF */ + 0xA6, 0x5D, 0xC9, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xA7, 0x79, 0xCA, 0xA9, 0x00, 0x00, 0xCA, 0xA8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0x77, 0xA7, 0x7A, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xA7, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA7, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xF0, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCB, 0xF1, 0xA9, 0x54, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xAA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_57[512] = { + 0x00, 0x00, 0xD1, 0x48, 0xD1, 0x49, 0xAE, 0x45, /* 0x00-0x03 */ + 0xAE, 0x46, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xAC, /* 0x04-0x07 */ + 0xB0, 0xE9, 0xB0, 0xEB, 0xD4, 0xAB, 0xB0, 0xEA, /* 0x08-0x0B */ + 0xD8, 0x7C, 0xB3, 0xF2, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xE9, 0xB6, 0xEA, /* 0x10-0x13 */ + 0xDC, 0xE1, 0x00, 0x00, 0xB9, 0xCF, 0x00, 0x00, /* 0x14-0x17 */ + 0xB9, 
0xCE, 0x00, 0x00, 0xE5, 0x49, 0xE9, 0x48, /* 0x18-0x1B */ + 0xE9, 0x47, 0x00, 0x00, 0xF9, 0x6B, 0xA4, 0x67, /* 0x1C-0x1F */ + 0xC9, 0x59, 0x00, 0x00, 0xC9, 0x6E, 0xC9, 0x6F, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xA6, 0x62, 0xA6, 0x66, 0xC9, 0xC9, 0x00, 0x00, /* 0x28-0x2B */ + 0xA6, 0x64, 0xA6, 0x63, 0xC9, 0xC8, 0xA6, 0x65, /* 0x2C-0x2F */ + 0xA6, 0x61, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x60, /* 0x30-0x33 */ + 0xC9, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA6, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA3, 0x00, 0x00, /* 0x3C-0x3F */ + 0xA7, 0x7D, 0xCA, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCA, 0xAB, 0x00, 0x00, 0xA7, 0xA1, /* 0x44-0x47 */ + 0x00, 0x00, 0xCA, 0xAD, 0xA7, 0x7B, 0xCA, 0xAE, /* 0x48-0x4B */ + 0xCA, 0xAC, 0xA7, 0x7E, 0xA7, 0xA2, 0xA7, 0xA5, /* 0x4C-0x4F */ + 0xA7, 0xA4, 0xA7, 0x7C, 0xCA, 0xAF, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xA9, 0x59, 0xCB, 0xFE, 0x00, 0x00, /* 0x60-0x63 */ + 0xA9, 0x5B, 0x00, 0x00, 0xA9, 0x5A, 0x00, 0x00, /* 0x64-0x67 */ + 0xCC, 0x40, 0xA9, 0x58, 0xA9, 0x57, 0xCB, 0xF5, /* 0x68-0x6B */ + 0x00, 0x00, 0xCB, 0xF4, 0x00, 0x00, 0xCB, 0xF2, /* 0x6C-0x6F */ + 0xCB, 0xF7, 0xCB, 0xF6, 0xCB, 0xF3, 0xCB, 0xFC, /* 0x70-0x73 */ + 0xCB, 0xFD, 0xCB, 0xFA, 0xCB, 0xF8, 0xA9, 0x56, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xFB, /* 0x78-0x7B */ + 0xA9, 0x5C, 0xCC, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCB, 0xF9, 0x00, 0x00, 0xAB, 0xAB, 0xA9, 0x55, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xAC, /* 0x88-0x8B */ + 0xCE, 0x54, 0x00, 0x00, 0x00, 0x00, 0xCE, 0x5A, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB2, /* 0x90-0x93 */ + 0xCE, 0x58, 0xCE, 0x5E, 0x00, 0x00, 0xCE, 0x55, /* 0x94-0x97 */ + 0xCE, 0x59, 0xCE, 0x5B, 0xCE, 0x5D, 0xCE, 0x57, /* 0x98-0x9B */ + 0x00, 0x00, 0xCE, 0x56, 0xCE, 0x51, 0xCE, 0x52, /* 0x9C-0x9F */ + 0xAB, 0xAD, 0x00, 0x00, 0xAB, 0xAF, 0xAB, 0xAE, /* 0xA0-0xA3 */ + 0xCE, 0x53, 0xCE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB1, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xCE, 0x50, 0xD1, 0x53, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xD1, 0x52, 0xD1, 0x57, 0xD1, 0x4E, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD1, 0x51, 0xD1, 0x50, 0x00, 0x00, 0xD1, 0x54, /* 0xBC-0xBF */ + 0x00, 0x00, 0xD1, 0x58, 0xAE, 0x47, 0xAE, 0x4A, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0x4F, 0xD1, 0x55, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x49, /* 0xC8-0xCB */ + 0xD1, 0x4A, 0x00, 0x00, 0xAB, 0xB0, 0xD4, 0xBA, /* 0xCC-0xCF */ + 0xD1, 0x56, 0x00, 0x00, 0xD1, 0x4D, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xAE, 0x48, 0xD1, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD4, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xEC, /* 0xDC-0xDF */ + 0xB0, 0xF0, 0xD4, 0xC1, 0xD4, 0xAF, 0xD4, 0xBD, /* 0xE0-0xE3 */ + 0xB0, 0xF1, 0xD4, 0xBF, 0x00, 0x00, 0xD4, 0xC5, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xD4, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xD4, 0xC0, 0xD4, 0xB4, 0xD4, 0xBC, 0x00, 0x00, /* 0xEC-0xEF 
*/ + 0xD4, 0xCA, 0xD4, 0xC8, 0xD4, 0xBE, 0xD4, 0xB9, /* 0xF0-0xF3 */ + 0xD4, 0xB2, 0xD8, 0xA6, 0xD4, 0xB0, 0xB0, 0xF5, /* 0xF4-0xF7 */ + 0xD4, 0xB7, 0xB0, 0xF6, 0xB0, 0xF2, 0xD4, 0xAD, /* 0xF8-0xFB */ + 0xD4, 0xC3, 0xD4, 0xB5, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_58[512] = { + 0xD4, 0xB3, 0xD4, 0xC6, 0xB0, 0xF3, 0x00, 0x00, /* 0x00-0x03 */ + 0xD4, 0xCC, 0xB0, 0xED, 0xB0, 0xEF, 0xD4, 0xBB, /* 0x04-0x07 */ + 0xD4, 0xB6, 0xAE, 0x4B, 0xB0, 0xEE, 0xD4, 0xB8, /* 0x08-0x0B */ + 0xD4, 0xC7, 0xD4, 0xCB, 0xD4, 0xC2, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD4, 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xD4, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xD8, 0xA1, 0x00, 0x00, 0xD8, 0xAA, /* 0x18-0x1B */ + 0xD8, 0xA9, 0xB3, 0xFA, 0xD8, 0xA2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB3, 0xFB, 0xB3, 0xF9, 0x00, 0x00, 0xD8, 0xA4, /* 0x20-0x23 */ + 0xB3, 0xF6, 0xD8, 0xA8, 0x00, 0x00, 0xD8, 0xA3, /* 0x24-0x27 */ + 0xD8, 0xA5, 0xD8, 0x7D, 0xB3, 0xF4, 0x00, 0x00, /* 0x28-0x2B */ + 0xD8, 0xB2, 0xD8, 0xB1, 0xD8, 0xAE, 0xB3, 0xF3, /* 0x2C-0x2F */ + 0xB3, 0xF7, 0xB3, 0xF8, 0xD1, 0x4B, 0xD8, 0xAB, /* 0x30-0x33 */ + 0xB3, 0xF5, 0xB0, 0xF4, 0xD8, 0xAD, 0xD8, 0x7E, /* 0x34-0x37 */ + 0xD8, 0xB0, 0xD8, 0xAF, 0x00, 0x00, 0xD8, 0xB3, /* 0x38-0x3B */ + 0x00, 0x00, 0xDC, 0xEF, 0x00, 0x00, 0xD8, 0xAC, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xD8, 0xA7, 0xDC, 0xE7, 0xB6, 0xF4, 0xB6, 0xF7, /* 0x48-0x4B */ + 0xB6, 0xF2, 0xDC, 0xE6, 0xDC, 0xEA, 0xDC, 0xE5, /* 0x4C-0x4F */ + 0x00, 0x00, 0xB6, 0xEC, 0xB6, 0xF6, 0xDC, 0xE2, /* 0x50-0x53 */ + 0xB6, 0xF0, 0xDC, 0xE9, 0x00, 0x00, 0xB6, 0xEE, /* 0x54-0x57 */ + 0xB6, 0xED, 0xDC, 0xEC, 0xB6, 0xEF, 0xDC, 0xEE, /* 0x58-0x5B */ + 0x00, 0x00, 0xDC, 0xEB, 0xB6, 0xEB, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF5, 0xDC, 0xF0, /* 0x60-0x63 */ + 0xDC, 0xE4, 0xDC, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xDC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF1, /* 0x68-0x6B */ + 0x00, 0x00, 0xB6, 0xF3, 0x00, 0x00, 0xDC, 0xE8, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDC, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE1, 0x5D, 0xB9, 0xD0, 0xE1, 0x63, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xB9, 0xD5, 0xE1, 0x5F, 0xE1, 0x66, /* 0x78-0x7B */ + 0xE1, 0x57, 0xB9, 0xD7, 0xB9, 0xD1, 0xE1, 0x5C, /* 0x7C-0x7F */ + + 0xBC, 0x55, 0xE1, 0x5B, 0xE1, 0x64, 0xB9, 0xD2, /* 0x80-0x83 */ + 0x00, 0x00, 0xB9, 0xD6, 0xE1, 0x5A, 0xE1, 0x60, /* 0x84-0x87 */ + 0xE1, 0x65, 0xE1, 0x56, 0xB9, 0xD4, 0xE1, 0x5E, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0x62, 0xE1, 0x68, /* 0x8C-0x8F */ + 0xE1, 0x58, 0xE1, 0x61, 0x00, 0x00, 0xB9, 0xD3, /* 0x90-0x93 */ + 0xE1, 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xE1, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xBC, 0x59, 0xE5, 0x4B, 0xBC, 0x57, 0xBC, 0x56, /* 0x9C-0x9F */ + 0xE5, 0x4D, 0xE5, 0x52, 0x00, 0x00, 0xE5, 0x4E, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xE5, 0x51, 0xBC, 0x5C, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBE, 0xA5, 0xBC, 0x5B, 0x00, 0x00, 0xE5, 0x4A, /* 0xA8-0xAB */ + 0xE5, 0x50, 0x00, 0x00, 0xBC, 0x5A, 0xE5, 0x4F, /* 0xAC-0xAF */ + 0x00, 0x00, 0xE5, 0x4C, 0x00, 0x00, 0xBC, 0x58, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x4D, 0xF9, 0xD9, /* 0xB8-0xBB */ + 0xE9, 0x4F, 0xE9, 0x4A, 0xBE, 0xC1, 0xE9, 0x4C, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBE, 0xC0, 0xE9, 0x4E, 0x00, 0x00, /* 0xC0-0xC3 */ + 
0x00, 0x00, 0xBE, 0xC3, 0xE9, 0x50, 0xBE, 0xC2, /* 0xC4-0xC7 */ + 0xE9, 0x49, 0xE9, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xA5, 0xEC, 0xCC, /* 0xCC-0xCF */ + 0x00, 0x00, 0xC0, 0xA4, 0xEC, 0xCD, 0xC0, 0xA3, /* 0xD0-0xD3 */ + 0xEC, 0xCB, 0xC0, 0xA2, 0xEC, 0xCA, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xC2, 0x53, 0xC2, 0x52, 0xF1, 0xF6, 0xF1, 0xF8, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF1, 0xF7, 0xC3, 0x61, 0xC3, 0x62, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0x63, 0xF4, 0x42, /* 0xE0-0xE3 */ + 0xC4, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xD3, /* 0xE4-0xE7 */ + 0xF7, 0xD2, 0xC5, 0xF2, 0x00, 0x00, 0xA4, 0x68, /* 0xE8-0xEB */ + 0xA4, 0xD0, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xA7, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xCE, 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xB3, 0xFC, 0xB3, 0xFD, 0x00, 0x00, /* 0xF8-0xFB */ + 0xDC, 0xF2, 0xB9, 0xD8, 0xE1, 0x69, 0xE5, 0x53, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_59[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC9, 0x5A, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xB0, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xCC, 0x42, 0xCE, 0x60, 0xD1, 0x59, 0xAE, 0x4C, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xF9, 0x00, 0x00, /* 0x10-0x13 */ + 0xC4, 0xDC, 0xA4, 0x69, 0xA5, 0x7E, 0xC9, 0x70, /* 0x14-0x17 */ + 0x00, 0x00, 0xA6, 0x67, 0xA6, 0x68, 0x00, 0x00, /* 0x18-0x1B */ + 0xA9, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB0, 0xF7, 0x00, 0x00, 0xB9, 0xDA, 0x00, 0x00, /* 0x20-0x23 */ + 0xB9, 0xDB, 0xB9, 0xD9, 0x00, 0x00, 0xA4, 0x6A, /* 0x24-0x27 */ + 0x00, 0x00, 0xA4, 0xD1, 0xA4, 0xD3, 0xA4, 0xD2, /* 0x28-0x2B */ + 0xC9, 0x5B, 0xA4, 0xD4, 0xA5, 0xA1, 0xC9, 0x71, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA5, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x69, /* 0x34-0x37 */ + 0xA6, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xC9, 0xCB, 0x00, 0x00, 0xA7, 0xA8, 0x00, 0x00, /* 0x3C-0x3F */ + 0xCA, 0xB1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xA9, 0x61, 0xCC, 0x43, 0x00, 0x00, 0xA9, 0x5F, /* 0x44-0x47 */ + 0xA9, 0x60, 0xA9, 0x5E, 0xD1, 0x5A, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xB6, 0xAB, 0xB5, /* 0x4C-0x4F */ + 0xAB, 0xB7, 0xAB, 0xB4, 0x00, 0x00, 0xCE, 0x61, /* 0x50-0x53 */ + 0xA9, 0x62, 0xAB, 0xB3, 0x00, 0x00, 0xAE, 0x4D, /* 0x54-0x57 */ + 0xAE, 0x4E, 0x00, 0x00, 0xAE, 0x4F, 0x00, 0x00, /* 0x58-0x5B */ + 0xD4, 0xCD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB3, 0xFE, 0xD8, 0xB4, 0xB0, 0xF8, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xF8, /* 0x64-0x67 */ + 0x00, 0x00, 0xB9, 0xDD, 0xB9, 0xDC, 0xE1, 0x6A, /* 0x68-0x6B */ + 0x00, 0x00, 0xBC, 0x5D, 0xBE, 0xC4, 0x00, 0x00, /* 0x6C-0x6F */ + 0xEF, 0xC0, 0xF6, 0xDA, 0xF7, 0xD4, 0xA4, 0x6B, /* 0x70-0x73 */ + 0xA5, 0xA3, 0x00, 0x00, 0xA5, 0xA4, 0xC9, 0xD1, /* 0x74-0x77 */ + 0xA6, 0x6C, 0xA6, 0x6F, 0x00, 0x00, 0xC9, 0xCF, /* 0x78-0x7B */ + 0xC9, 0xCD, 0xA6, 0x6E, 0xC9, 0xD0, 0xC9, 0xD2, /* 0x7C-0x7F */ + + 0xC9, 0xCC, 0xA6, 0x71, 0xA6, 0x70, 0xA6, 0x6D, /* 0x80-0x83 */ + 0xA6, 0x6B, 0xC9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xB3, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xA7, 0xB0, 0xCA, 0xB6, 0xCA, 0xB9, /* 0x8C-0x8F */ + 0xCA, 0xB8, 0x00, 0x00, 0xA7, 0xAA, 0xA7, 0xB2, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xAF, 0xCA, 0xB5, /* 0x94-0x97 */ + 
0xCA, 0xB3, 0xA7, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xA7, 0xA9, 0xA7, 0xAC, 0x00, 0x00, /* 0x9C-0x9F */ + 0xCA, 0xB4, 0xCA, 0xBB, 0xCA, 0xB7, 0xA7, 0xAD, /* 0xA0-0xA3 */ + 0xA7, 0xB1, 0xA7, 0xB4, 0xCA, 0xB2, 0xCA, 0xBA, /* 0xA4-0xA7 */ + 0xA7, 0xAB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x67, 0xA9, 0x6F, /* 0xAC-0xAF */ + 0x00, 0x00, 0xCC, 0x4F, 0xCC, 0x48, 0xA9, 0x70, /* 0xB0-0xB3 */ + 0xCC, 0x53, 0xCC, 0x44, 0xCC, 0x4B, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xA9, 0x66, 0xCC, 0x45, 0xA9, 0x64, /* 0xB8-0xBB */ + 0xCC, 0x4C, 0xCC, 0x50, 0xA9, 0x63, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCC, 0x51, 0xCC, 0x4A, 0x00, 0x00, 0xCC, 0x4D, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xA9, 0x72, 0xA9, 0x69, 0xCC, 0x54, /* 0xC4-0xC7 */ + 0xCC, 0x52, 0x00, 0x00, 0xA9, 0x6E, 0xA9, 0x6C, /* 0xC8-0xCB */ + 0xCC, 0x49, 0xA9, 0x6B, 0xCC, 0x47, 0xCC, 0x46, /* 0xCC-0xCF */ + 0xA9, 0x6A, 0xA9, 0x68, 0xA9, 0x71, 0xA9, 0x6D, /* 0xD0-0xD3 */ + 0xA9, 0x65, 0x00, 0x00, 0xCC, 0x4E, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xAB, 0xB9, 0x00, 0x00, 0xAB, 0xC0, 0xCE, 0x6F, /* 0xD8-0xDB */ + 0xAB, 0xB8, 0xCE, 0x67, 0xCE, 0x63, 0x00, 0x00, /* 0xDC-0xDF */ + 0xCE, 0x73, 0xCE, 0x62, 0x00, 0x00, 0xAB, 0xBB, /* 0xE0-0xE3 */ + 0xCE, 0x6C, 0xAB, 0xBE, 0xAB, 0xC1, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAB, 0xBC, 0xCE, 0x70, 0xAB, 0xBF, 0x00, 0x00, /* 0xE8-0xEB */ + 0xAE, 0x56, 0xCE, 0x76, 0xCE, 0x64, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xCE, 0x66, 0xCE, 0x6D, 0xCE, 0x71, /* 0xF0-0xF3 */ + 0xCE, 0x75, 0xCE, 0x72, 0xCE, 0x6B, 0xCE, 0x6E, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0x68, 0xAB, 0xC3, /* 0xF8-0xFB */ + 0xCE, 0x6A, 0xCE, 0x69, 0xCE, 0x74, 0xAB, 0xBA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5A[512] = { + 0xCE, 0x65, 0xAB, 0xC2, 0x00, 0x00, 0xAB, 0xBD, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xAE, 0x5C, 0xD1, 0x62, 0x00, 0x00, /* 0x08-0x0B */ + 0xAE, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x60, /* 0x0C-0x0F */ + 0x00, 0x00, 0xAE, 0x50, 0x00, 0x00, 0xAE, 0x55, /* 0x10-0x13 */ + 0x00, 0x00, 0xD1, 0x5F, 0xD1, 0x5C, 0xD1, 0x61, /* 0x14-0x17 */ + 0xAE, 0x51, 0xD1, 0x5B, 0x00, 0x00, 0xAE, 0x54, /* 0x18-0x1B */ + 0xAE, 0x52, 0x00, 0x00, 0xD1, 0x63, 0xAE, 0x53, /* 0x1C-0x1F */ + 0xAE, 0x57, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x58, /* 0x20-0x23 */ + 0x00, 0x00, 0xAE, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xAE, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD1, 0x5D, 0xD1, 0x5E, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD1, 0x64, /* 0x30-0x33 */ + 0x00, 0x00, 0xD4, 0xD4, 0xB0, 0xF9, 0xD8, 0xC2, /* 0x34-0x37 */ + 0xD4, 0xD3, 0xD4, 0xE6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xB1, 0x40, 0x00, 0x00, 0xD4, 0xE4, 0x00, 0x00, /* 0x3C-0x3F */ + 0xB0, 0xFE, 0xB0, 0xFA, 0xD4, 0xED, 0xD4, 0xDD, /* 0x40-0x43 */ + 0xD4, 0xE0, 0x00, 0x00, 0xB1, 0x43, 0xD4, 0xEA, /* 0x44-0x47 */ + 0xD4, 0xE2, 0xB0, 0xFB, 0xB1, 0x44, 0x00, 0x00, /* 0x48-0x4B */ + 0xD4, 0xE7, 0xD4, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD4, 0xD6, 0xD4, 0xEB, 0xD4, 0xDF, 0xD4, 0xDA, /* 0x50-0x53 */ + 0x00, 0x00, 0xD4, 0xD0, 0xD4, 0xEC, 0xD4, 0xDC, /* 0x54-0x57 */ + 0xD4, 0xCF, 0x00, 0x00, 0xB1, 0x42, 0xD4, 0xE1, /* 0x58-0x5B */ + 0xD4, 0xEE, 0xD4, 0xDE, 0xD4, 0xD2, 0xD4, 0xD7, /* 0x5C-0x5F */ + 0xD4, 0xCE, 0x00, 0x00, 0xB1, 0x41, 0x00, 0x00, /* 0x60-0x63 */ + 0xD4, 0xDB, 0xD4, 0xD8, 0xB0, 0xFC, 0xD4, 0xD1, /* 0x64-0x67 */ + 0x00, 0x00, 0xD4, 0xE9, 0xB0, 0xFD, 0x00, 0x00, /* 0x68-0x6B */ + 0xD4, 
0xD9, 0xD4, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xD4, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x40, /* 0x74-0x77 */ + 0xD8, 0xBB, 0x00, 0x00, 0xD8, 0xB8, 0xD8, 0xC9, /* 0x78-0x7B */ + 0xD8, 0xBD, 0xD8, 0xCA, 0x00, 0x00, 0xB4, 0x42, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC6, /* 0x80-0x83 */ + 0xD8, 0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xC4, 0xD8, 0xC7, /* 0x88-0x8B */ + 0xD8, 0xCB, 0x00, 0x00, 0xD4, 0xE3, 0xD8, 0xCD, /* 0x8C-0x8F */ + 0xDD, 0x47, 0x00, 0x00, 0xB4, 0x43, 0xD8, 0xCE, /* 0x90-0x93 */ + 0xD8, 0xB6, 0xD8, 0xC0, 0x00, 0x00, 0xD8, 0xC5, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xB4, 0x41, 0xB4, 0x44, /* 0x98-0x9B */ + 0xD8, 0xCC, 0xD8, 0xCF, 0xD8, 0xBA, 0xD8, 0xB7, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xB9, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD8, 0xBE, 0xD8, 0xBC, 0xB4, 0x45, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xD8, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD8, 0xBF, 0x00, 0x00, 0xD8, 0xC1, 0xD8, 0xB5, /* 0xAC-0xAF */ + 0xDC, 0xFA, 0xDC, 0xF8, 0xB7, 0x42, 0xB7, 0x40, /* 0xB0-0xB3 */ + 0xDD, 0x43, 0xDC, 0xF9, 0xDD, 0x44, 0xDD, 0x40, /* 0xB4-0xB7 */ + 0xDC, 0xF7, 0xDD, 0x46, 0xDC, 0xF6, 0xDC, 0xFD, /* 0xB8-0xBB */ + 0xB6, 0xFE, 0xB6, 0xFD, 0xB6, 0xFC, 0xDC, 0xFB, /* 0xBC-0xBF */ + 0xDD, 0x41, 0xB6, 0xF9, 0xB7, 0x41, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xDC, 0xF4, 0x00, 0x00, 0xDC, 0xFE, 0xDC, 0xF3, /* 0xC4-0xC7 */ + 0xDC, 0xFC, 0xB6, 0xFA, 0xDD, 0x42, 0xDC, 0xF5, /* 0xC8-0xCB */ + 0xB6, 0xFB, 0xDD, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE1, 0x6E, 0xB9, 0xE2, 0xB9, 0xE1, /* 0xD4-0xD7 */ + 0xB9, 0xE3, 0xE1, 0x7A, 0xE1, 0x70, 0xE1, 0x76, /* 0xD8-0xDB */ + 0xE1, 0x6B, 0xE1, 0x79, 0xE1, 0x78, 0xE1, 0x7C, /* 0xDC-0xDF */ + 0xE1, 0x75, 0xB9, 0xDE, 0xE1, 0x74, 0xB9, 0xE4, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE1, 0x6D, 0xB9, 0xDF, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE1, 0x7B, 0xB9, 0xE0, 0xE1, 0x6F, 0xE1, 0x72, /* 0xE8-0xEB */ + 0xE1, 0x77, 0xE1, 0x71, 0xE1, 0x6C, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0x73, /* 0xF0-0xF3 */ + 0xE5, 0x55, 0xBC, 0x61, 0xE5, 0x58, 0xE5, 0x57, /* 0xF4-0xF7 */ + 0xE5, 0x5A, 0xE5, 0x5C, 0xF9, 0xDC, 0xBC, 0x5F, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE5, 0x56, 0x00, 0x00, 0xE5, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5B[512] = { + 0x00, 0x00, 0xE5, 0x5D, 0xE5, 0x5B, 0xE5, 0x59, /* 0x00-0x03 */ + 0x00, 0x00, 0xE5, 0x5F, 0x00, 0x00, 0xE5, 0x5E, /* 0x04-0x07 */ + 0xBC, 0x63, 0xBC, 0x5E, 0x00, 0x00, 0xBC, 0x60, /* 0x08-0x0B */ + 0xBC, 0x62, 0x00, 0x00, 0x00, 0x00, 0xE5, 0x60, /* 0x0C-0x0F */ + 0xE9, 0x57, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x56, /* 0x10-0x13 */ + 0xE9, 0x55, 0x00, 0x00, 0xE9, 0x58, 0xE9, 0x51, /* 0x14-0x17 */ + 0x00, 0x00, 0xE9, 0x52, 0xE9, 0x5A, 0xE9, 0x53, /* 0x18-0x1B */ + 0x00, 0x00, 0xBE, 0xC5, 0xE9, 0x5C, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE9, 0x5B, 0xE9, 0x54, 0x00, 0x00, 0xEC, 0xD1, /* 0x20-0x23 */ + 0xC0, 0xA8, 0xEC, 0xCF, 0xEC, 0xD4, 0xEC, 0xD3, /* 0x24-0x27 */ + 0xE9, 0x59, 0x00, 0x00, 0xC0, 0xA7, 0x00, 0x00, /* 0x28-0x2B */ + 0xEC, 0xD2, 0xEC, 0xCE, 0xEC, 0xD6, 0xEC, 0xD5, /* 0x2C-0x2F */ + 0xC0, 0xA6, 0x00, 0x00, 0xEC, 0xD0, 0x00, 0x00, /* 0x30-0x33 */ + 0xBE, 0xC6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xC2, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xEF, 0xC1, 0xF1, 0xFA, 0xF1, 0xFB, 0xF1, 0xFC, /* 0x3C-0x3F */ + 0xC4, 
0x5C, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x5D, /* 0x40-0x43 */ + 0x00, 0x00, 0xF4, 0x43, 0x00, 0x00, 0xF5, 0xC8, /* 0x44-0x47 */ + 0xF5, 0xC7, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xDB, /* 0x48-0x4B */ + 0xF6, 0xDC, 0xF7, 0xD5, 0xF8, 0xA7, 0x00, 0x00, /* 0x4C-0x4F */ + 0xA4, 0x6C, 0xA4, 0x6D, 0x00, 0x00, 0xA4, 0x6E, /* 0x50-0x53 */ + 0xA4, 0xD5, 0xA5, 0xA5, 0xC9, 0xD3, 0xA6, 0x72, /* 0x54-0x57 */ + 0xA6, 0x73, 0x00, 0x00, 0xA7, 0xB7, 0xA7, 0xB8, /* 0x58-0x5B */ + 0xA7, 0xB6, 0xA7, 0xB5, 0x00, 0x00, 0xA9, 0x73, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0x55, 0xA9, 0x75, /* 0x60-0x63 */ + 0xA9, 0x74, 0xCC, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xAB, 0xC4, 0x00, 0x00, 0xAE, 0x5D, /* 0x68-0x6B */ + 0xD1, 0x65, 0x00, 0x00, 0xD4, 0xF0, 0x00, 0x00, /* 0x6C-0x6F */ + 0xB1, 0x45, 0xB4, 0x47, 0xD4, 0xEF, 0xB4, 0x46, /* 0x70-0x73 */ + 0x00, 0x00, 0xB9, 0xE5, 0x00, 0x00, 0xE1, 0x7D, /* 0x74-0x77 */ + 0xBE, 0xC7, 0x00, 0x00, 0xC0, 0xA9, 0xEC, 0xD7, /* 0x78-0x7B */ + 0x00, 0x00, 0xC4, 0x5E, 0x00, 0x00, 0xC5, 0x70, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xC9, 0x72, 0x00, 0x00, 0xA5, 0xA6, /* 0x80-0x83 */ + 0xC9, 0x73, 0xA6, 0x76, 0x00, 0x00, 0xA6, 0x74, /* 0x84-0x87 */ + 0xA6, 0x75, 0xA6, 0x77, 0x00, 0x00, 0xA7, 0xBA, /* 0x88-0x8B */ + 0xA7, 0xB9, 0x00, 0x00, 0xCA, 0xBC, 0xA7, 0xBB, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xBD, 0xCC, 0x57, /* 0x90-0x93 */ + 0x00, 0x00, 0xCC, 0x58, 0x00, 0x00, 0xA9, 0x76, /* 0x94-0x97 */ + 0xA9, 0x78, 0xA9, 0x7A, 0xA9, 0x77, 0xA9, 0x7B, /* 0x98-0x9B */ + 0xA9, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xC8, 0xAB, 0xC5, /* 0xA0-0xA3 */ + 0xAB, 0xC7, 0xAB, 0xC9, 0xAB, 0xC6, 0xD1, 0x66, /* 0xA4-0xA7 */ + 0xCE, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xD1, 0x68, 0xD1, 0x67, 0xAE, 0x63, 0x00, 0x00, /* 0xAC-0xAF */ + 0xAE, 0x5F, 0x00, 0x00, 0x00, 0x00, 0xAE, 0x60, /* 0xB0-0xB3 */ + 0xAE, 0x62, 0xAE, 0x64, 0xAE, 0x61, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xAE, 0x66, 0xAE, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x4A, /* 0xBC-0xBF */ + 0xD4, 0xF2, 0xD4, 0xF1, 0xB1, 0x49, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB1, 0x48, 0xB1, 0x47, 0xB1, 0x4B, 0xB1, 0x46, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xD5, 0xD8, 0xD2, /* 0xC8-0xCB */ + 0xB4, 0x49, 0xD8, 0xD1, 0xD8, 0xD6, 0x00, 0x00, /* 0xCC-0xCF */ + 0xB4, 0x4B, 0xD8, 0xD4, 0xB4, 0x48, 0xB4, 0x4A, /* 0xD0-0xD3 */ + 0xD8, 0xD3, 0x00, 0x00, 0xDD, 0x48, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDD, 0x49, 0xDD, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0xE6, 0xB9, 0xEE, /* 0xDC-0xDF */ + 0xE1, 0x7E, 0xB9, 0xE8, 0xB9, 0xEC, 0xE1, 0xA1, /* 0xE0-0xE3 */ + 0xB9, 0xED, 0xB9, 0xE9, 0xB9, 0xEA, 0xB9, 0xE7, /* 0xE4-0xE7 */ + 0xB9, 0xEB, 0xBC, 0x66, 0xD8, 0xD0, 0xBC, 0x67, /* 0xE8-0xEB */ + 0xBC, 0x65, 0x00, 0x00, 0xBC, 0x64, 0xE9, 0x5D, /* 0xEC-0xEF */ + 0xBE, 0xC8, 0xEC, 0xD8, 0xEC, 0xD9, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xC3, 0x64, 0xC4, 0x5F, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xA4, 0x6F, 0x00, 0x00, 0xA6, 0x78, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5C[512] = { + 0x00, 0x00, 0xAB, 0xCA, 0x00, 0x00, 0xD1, 0x69, /* 0x00-0x03 */ + 0xAE, 0x67, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x4E, /* 0x04-0x07 */ + 0xB1, 0x4D, 0xB1, 0x4C, 0xB4, 0x4C, 0xB4, 0x4D, /* 0x08-0x0B */ + 0xD8, 0xD7, 0xB9, 0xEF, 0xBE, 0xC9, 0xA4, 0x70, /* 0x0C-0x0F */ + 0xC9, 0x5C, 0xA4, 0xD6, 0xC9, 0x74, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 
0x00, 0xC9, 0xD4, 0xA6, 0x79, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0x7C, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0x4B, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0x71, 0x00, 0x00, /* 0x20-0x23 */ + 0xA4, 0xD7, 0xC9, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCA, 0xBE, 0x00, 0x00, 0xCA, 0xBF, 0x00, 0x00, /* 0x28-0x2B */ + 0xA7, 0xBC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD8, 0xD8, 0xB4, 0x4E, 0x00, 0x00, 0xDD, 0x4C, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xAA, /* 0x34-0x37 */ + 0xA4, 0x72, 0xA4, 0xA8, 0xA4, 0xD8, 0xC9, 0x75, /* 0x38-0x3B */ + 0xA5, 0xA7, 0x00, 0x00, 0xA7, 0xC0, 0xA7, 0xBF, /* 0x3C-0x3F */ + 0xA7, 0xBD, 0xA7, 0xBE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xCC, 0x59, 0xA9, 0x7E, 0xA9, 0xA1, 0xCC, 0x5A, /* 0x44-0x47 */ + 0xA9, 0x7D, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xCE, /* 0x48-0x4B */ + 0xCE, 0x78, 0xAB, 0xCD, 0xAB, 0xCB, 0xAB, 0xCC, /* 0x4C-0x4F */ + 0xAE, 0x6A, 0xAE, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD1, 0x6B, 0xAE, 0x69, 0xD1, 0x6A, 0x00, 0x00, /* 0x54-0x57 */ + 0xAE, 0x5E, 0xD4, 0xF3, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xB1, 0x50, 0xB1, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB1, 0x4F, 0x00, 0x00, 0xB9, 0xF0, 0xE1, 0xA2, /* 0x60-0x63 */ + 0xBC, 0x68, 0xBC, 0x69, 0x00, 0x00, 0xE5, 0x61, /* 0x64-0x67 */ + 0xC0, 0xAB, 0xEF, 0xC2, 0xEF, 0xC3, 0x00, 0x00, /* 0x68-0x6B */ + 0xC4, 0xDD, 0xF8, 0xA8, 0xC9, 0x4B, 0xA4, 0xD9, /* 0x6C-0x6F */ + 0x00, 0x00, 0xA4, 0x73, 0x00, 0x00, 0xC9, 0x77, /* 0x70-0x73 */ + 0xC9, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xA6, 0x7A, 0xC9, 0xD7, 0xC9, 0xD8, /* 0x78-0x7B */ + 0xC9, 0xD6, 0x00, 0x00, 0xC9, 0xD9, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xC7, 0x00, 0x00, /* 0x84-0x87 */ + 0xCA, 0xC2, 0xCA, 0xC4, 0xCA, 0xC6, 0xCA, 0xC3, /* 0x88-0x8B */ + 0xA7, 0xC4, 0xCA, 0xC0, 0x00, 0x00, 0xCA, 0xC1, /* 0x8C-0x8F */ + 0xA7, 0xC1, 0xA7, 0xC2, 0xCA, 0xC5, 0xCA, 0xC8, /* 0x90-0x93 */ + 0xA7, 0xC3, 0xCA, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xCC, 0x68, 0x00, 0x00, 0xCC, 0x62, /* 0x9C-0x9F */ + 0xCC, 0x5D, 0xA9, 0xA3, 0xCC, 0x65, 0xCC, 0x63, /* 0xA0-0xA3 */ + 0xCC, 0x5C, 0xCC, 0x69, 0xCC, 0x6C, 0xCC, 0x67, /* 0xA4-0xA7 */ + 0xCC, 0x60, 0xA9, 0xA5, 0xCC, 0x66, 0xA9, 0xA6, /* 0xA8-0xAB */ + 0xCC, 0x61, 0xCC, 0x64, 0xCC, 0x5B, 0xCC, 0x5F, /* 0xAC-0xAF */ + 0xCC, 0x6B, 0xA9, 0xA7, 0x00, 0x00, 0xA9, 0xA8, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xCC, 0x5E, 0xCC, 0x6A, 0xA9, 0xA2, /* 0xB4-0xB7 */ + 0xA9, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xAB, 0xCE, 0xA4, /* 0xC4-0xC7 */ + 0xCE, 0xAA, 0xCE, 0xA3, 0xCE, 0xA5, 0xCE, 0x7D, /* 0xC8-0xCB */ + 0xCE, 0x7B, 0x00, 0x00, 0xCE, 0xAC, 0xCE, 0xA9, /* 0xCC-0xCF */ + 0xCE, 0x79, 0x00, 0x00, 0xAB, 0xD0, 0xCE, 0xA7, /* 0xD0-0xD3 */ + 0xCE, 0xA8, 0x00, 0x00, 0xCE, 0xA6, 0xCE, 0x7C, /* 0xD4-0xD7 */ + 0xCE, 0x7A, 0xAB, 0xCF, 0xCE, 0xA2, 0xCE, 0x7E, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCE, 0xA1, 0xCE, 0xAD, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAE, 0x6F, 0x00, 0x00, 0xAE, 0x6E, 0x00, 0x00, /* 0xE8-0xEB 
*/ + 0xD1, 0x6C, 0xAE, 0x6B, 0xD1, 0x6E, 0x00, 0x00, /* 0xEC-0xEF */ + 0xAE, 0x70, 0xD1, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xAE, 0x73, 0x00, 0x00, 0xAE, 0x71, 0xD1, 0x70, /* 0xF4-0xF7 */ + 0xCE, 0xAE, 0xD1, 0x72, 0x00, 0x00, 0xAE, 0x6D, /* 0xF8-0xFB */ + 0x00, 0x00, 0xAE, 0x6C, 0x00, 0x00, 0xD1, 0x6D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5D[512] = { + 0xD1, 0x71, 0xAE, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xB1, 0x53, 0xB1, 0x52, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD4, 0xF5, /* 0x08-0x0B */ + 0xD4, 0xF9, 0xD4, 0xFB, 0xB1, 0x54, 0xD4, 0xFE, /* 0x0C-0x0F */ + 0x00, 0x00, 0xB1, 0x58, 0xD5, 0x41, 0x00, 0x00, /* 0x10-0x13 */ + 0xB1, 0x5A, 0x00, 0x00, 0xB1, 0x56, 0xB1, 0x5E, /* 0x14-0x17 */ + 0x00, 0x00, 0xB1, 0x5B, 0xD4, 0xF7, 0xB1, 0x55, /* 0x18-0x1B */ + 0x00, 0x00, 0xD4, 0xF6, 0xD4, 0xF4, 0xD5, 0x43, /* 0x1C-0x1F */ + 0xD4, 0xF8, 0x00, 0x00, 0xB1, 0x57, 0xD5, 0x42, /* 0x20-0x23 */ + 0xB1, 0x5C, 0xD4, 0xFD, 0xD4, 0xFC, 0xB1, 0x5D, /* 0x24-0x27 */ + 0xD4, 0xFA, 0xB1, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0x44, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD5, 0x40, 0xD8, 0xE7, 0xD8, 0xEE, 0xD8, 0xE3, /* 0x30-0x33 */ + 0xB4, 0x51, 0xD8, 0xDF, 0xD8, 0xEF, 0xD8, 0xD9, /* 0x34-0x37 */ + 0xD8, 0xEC, 0xD8, 0xEA, 0xD8, 0xE4, 0x00, 0x00, /* 0x38-0x3B */ + 0xD8, 0xED, 0xD8, 0xE6, 0x00, 0x00, 0xD8, 0xDE, /* 0x3C-0x3F */ + 0xD8, 0xF0, 0xD8, 0xDC, 0xD8, 0xE9, 0xD8, 0xDA, /* 0x40-0x43 */ + 0x00, 0x00, 0xD8, 0xF1, 0x00, 0x00, 0xB4, 0x52, /* 0x44-0x47 */ + 0x00, 0x00, 0xD8, 0xEB, 0xDD, 0x4F, 0xD8, 0xDD, /* 0x48-0x4B */ + 0xB4, 0x4F, 0x00, 0x00, 0xD8, 0xE1, 0x00, 0x00, /* 0x4C-0x4F */ + 0xB4, 0x50, 0xD8, 0xE0, 0xD8, 0xE5, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xD8, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD8, 0xE8, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x53, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x56, 0xDD, 0x4E, /* 0x60-0x63 */ + 0x00, 0x00, 0xDD, 0x50, 0x00, 0x00, 0xDD, 0x55, /* 0x64-0x67 */ + 0xDD, 0x54, 0xB7, 0x43, 0x00, 0x00, 0xD8, 0xDB, /* 0x68-0x6B */ + 0xDD, 0x52, 0x00, 0x00, 0x00, 0x00, 0xB7, 0x44, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDD, 0x4D, 0xDD, 0x51, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xA9, /* 0x74-0x77 */ + 0x00, 0x00, 0xE1, 0xB0, 0xE1, 0xA7, 0x00, 0x00, /* 0x78-0x7B */ + 0xE1, 0xAE, 0xE1, 0xA5, 0xE1, 0xAD, 0xE1, 0xB1, /* 0x7C-0x7F */ + + 0xE1, 0xA4, 0xE1, 0xA8, 0xE1, 0xA3, 0x00, 0x00, /* 0x80-0x83 */ + 0xB9, 0xF1, 0x00, 0x00, 0xE1, 0xA6, 0xB9, 0xF2, /* 0x84-0x87 */ + 0xE1, 0xAC, 0xE1, 0xAB, 0xE1, 0xAA, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x65, 0xE5, 0x67, /* 0x90-0x93 */ + 0xBC, 0x6B, 0xE5, 0x68, 0x00, 0x00, 0xE5, 0x63, /* 0x94-0x97 */ + 0x00, 0x00, 0xE5, 0x62, 0xE5, 0x6C, 0x00, 0x00, /* 0x98-0x9B */ + 0xE5, 0x6A, 0xBC, 0x6A, 0xE5, 0x6D, 0xE5, 0x64, /* 0x9C-0x9F */ + 0xE5, 0x69, 0xE5, 0x6B, 0xE5, 0x66, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE9, 0x61, /* 0xA4-0xA7 */ + 0xE9, 0x66, 0xE9, 0x60, 0xE9, 0x65, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE9, 0x5E, 0xE9, 0x68, 0xE9, 0x64, 0xE9, 0x69, /* 0xAC-0xAF */ + 0xE9, 0x63, 0xE9, 0x5F, 0xE9, 0x67, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE9, 0x6A, 0xE9, 0x62, 0x00, 0x00, 0xEC, 0xDA, /* 0xB4-0xB7 */ + 0xC0, 0xAF, 0x00, 0x00, 0xC0, 0xAD, 0x00, 0x00, /* 0xB8-0xBB */ + 0xC0, 0xAC, 0xC0, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 
0xEF, 0xC4, 0x00, 0x00, 0xF1, 0x72, 0xF1, 0xFD, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0x44, 0xF4, 0x45, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC4, 0x60, 0x00, 0x00, 0xF5, 0xC9, /* 0xC8-0xCB */ + 0x00, 0x00, 0xC4, 0xDE, 0x00, 0x00, 0xF5, 0xCA, /* 0xCC-0xCF */ + 0x00, 0x00, 0xF6, 0xDE, 0xC5, 0x72, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xC5, 0x71, 0xF6, 0xDD, 0xC5, 0xC9, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xF7, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA4, 0x74, 0xA6, 0x7B, 0xC9, 0xDA, /* 0xDC-0xDF */ + 0xCA, 0xCA, 0xA8, 0xB5, 0xB1, 0x5F, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA4, 0x75, 0xA5, 0xAA, 0xA5, 0xA9, /* 0xE4-0xE7 */ + 0xA5, 0xA8, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xC5, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xAE, 0x74, 0x00, 0x00, /* 0xEC-0xEF */ + 0xDD, 0x57, 0xA4, 0x76, 0xA4, 0x77, 0xA4, 0x78, /* 0xF0-0xF3 */ + 0xA4, 0xDA, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xD1, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCE, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xB4, 0x53, 0xA4, 0x79, 0xC9, 0x5D, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5E[512] = { + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xAB, 0xA5, 0xAC, /* 0x00-0x03 */ + 0xC9, 0x78, 0x00, 0x00, 0xA6, 0x7C, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xCB, 0x00, 0x00, /* 0x08-0x0B */ + 0xA7, 0xC6, 0x00, 0x00, 0xCA, 0xCC, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xA9, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xCC, 0x6E, 0xA9, 0xAC, 0xA9, 0xAB, 0xCC, 0x6D, /* 0x14-0x17 */ + 0xA9, 0xA9, 0xCC, 0x6F, 0xA9, 0xAA, 0xA9, 0xAD, /* 0x18-0x1B */ + 0x00, 0x00, 0xAB, 0xD2, 0x00, 0x00, 0xAB, 0xD4, /* 0x1C-0x1F */ + 0xCE, 0xB3, 0xCE, 0xB0, 0xCE, 0xB1, 0xCE, 0xB2, /* 0x20-0x23 */ + 0xCE, 0xB4, 0xAB, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xD1, 0x74, 0xD1, 0x73, 0x00, 0x00, 0xAE, 0x76, /* 0x28-0x2B */ + 0x00, 0x00, 0xAE, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x62, /* 0x30-0x33 */ + 0xD5, 0x46, 0x00, 0x00, 0xB1, 0x61, 0xB1, 0x63, /* 0x34-0x37 */ + 0xB1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB4, 0x55, 0xD5, 0x45, 0x00, 0x00, /* 0x3C-0x3F */ + 0xB4, 0x56, 0xD8, 0xF3, 0x00, 0x00, 0xB4, 0x57, /* 0x40-0x43 */ + 0xD8, 0xF2, 0xB4, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x5A, 0xDD, 0x5C, /* 0x48-0x4B */ + 0xB7, 0x45, 0xDD, 0x5B, 0xDD, 0x59, 0xDD, 0x58, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xB4, /* 0x50-0x53 */ + 0xB9, 0xF7, 0xB9, 0xF5, 0x00, 0x00, 0xB9, 0xF6, /* 0x54-0x57 */ + 0xE1, 0xB2, 0xE1, 0xB3, 0x00, 0x00, 0xB9, 0xF3, /* 0x58-0x5B */ + 0xE5, 0x71, 0xE5, 0x6F, 0x00, 0x00, 0xBC, 0x6D, /* 0x5C-0x5F */ + 0xE5, 0x70, 0xBC, 0x6E, 0xBC, 0x6C, 0xB9, 0xF4, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0x6D, 0xE9, 0x6B, /* 0x64-0x67 */ + 0xE9, 0x6C, 0xE5, 0x6E, 0xEC, 0xDC, 0xC0, 0xB0, /* 0x68-0x6B */ + 0xEC, 0xDB, 0xEF, 0xC5, 0xEF, 0xC6, 0xE9, 0x6E, /* 0x6C-0x6F */ + 0xF1, 0xFE, 0x00, 0x00, 0xA4, 0x7A, 0xA5, 0xAD, /* 0x70-0x73 */ + 0xA6, 0x7E, 0xC9, 0xDB, 0xA6, 0x7D, 0x00, 0x00, /* 0x74-0x77 */ + 0xA9, 0xAF, 0xB7, 0x46, 0x00, 0x00, 0xA4, 0xDB, /* 0x78-0x7B */ + 0xA5, 0xAE, 0xAB, 0xD5, 0xB4, 0x58, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xC9, 0x79, 0x00, 0x00, 0xC9, 0x7A, 0x00, 0x00, /* 0x80-0x83 */ + 0xC9, 0xDC, 0x00, 0x00, 0x00, 0x00, 0xA7, 0xC8, /* 0x84-0x87 */ + 0xCA, 0xD0, 0xCA, 0xCE, 0xA7, 0xC9, 0xCA, 0xCD, /* 0x88-0x8B */ + 0xCA, 0xCF, 0xCA, 0xD1, 0x00, 0x00, 0xA7, 0xC7, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 
0x00, 0x00, 0xA9, 0xB3, 0xA9, 0xB4, 0xA9, 0xB1, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xB0, 0xCE, 0xB8, /* 0x98-0x9B */ + 0xA9, 0xB2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xAB, 0xD6, 0x00, 0x00, 0xCE, 0xB7, 0xCE, 0xB9, /* 0xA0-0xA3 */ + 0xCE, 0xB6, 0xCE, 0xBA, 0xAB, 0xD7, 0xAE, 0x79, /* 0xA4-0xA7 */ + 0xD1, 0x75, 0x00, 0x00, 0xD1, 0x77, 0xAE, 0x77, /* 0xA8-0xAB */ + 0xD1, 0x78, 0xAE, 0x78, 0xD1, 0x76, 0x00, 0x00, /* 0xAC-0xAF */ + 0xCE, 0xB5, 0xD5, 0x47, 0xD5, 0x4A, 0xD5, 0x4B, /* 0xB0-0xB3 */ + 0xD5, 0x48, 0xB1, 0x67, 0xB1, 0x66, 0xB1, 0x64, /* 0xB4-0xB7 */ + 0xB1, 0x65, 0xD5, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xB1, 0x68, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xB4, 0x5A, 0xB4, 0x5B, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB4, 0x5C, 0xDD, 0x5D, 0xDD, 0x5F, 0xDD, 0x61, /* 0xC4-0xC7 */ + 0xB7, 0x48, 0xB7, 0x47, 0xB4, 0x59, 0xDD, 0x60, /* 0xC8-0xCB */ + 0xDD, 0x5E, 0x00, 0x00, 0xE1, 0xB8, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE1, 0xB6, 0xE1, 0xBC, 0xB9, 0xF8, /* 0xD0-0xD3 */ + 0xE1, 0xBD, 0xE1, 0xBA, 0xB9, 0xF9, 0xE1, 0xB7, /* 0xD4-0xD7 */ + 0xE1, 0xB5, 0xE1, 0xBB, 0xBC, 0x70, 0xE5, 0x73, /* 0xD8-0xDB */ + 0xE1, 0xB9, 0xBC, 0x72, 0xE5, 0x74, 0xBC, 0x71, /* 0xDC-0xDF */ + 0xBC, 0x74, 0xE5, 0x75, 0xBC, 0x6F, 0xBC, 0x73, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE9, 0x73, 0xE9, 0x71, 0xE9, 0x70, /* 0xE4-0xE7 */ + 0xE9, 0x72, 0xE9, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xC3, 0x66, 0x00, 0x00, 0xF4, 0x46, 0xF4, 0x47, /* 0xEC-0xEF */ + 0x00, 0x00, 0xF5, 0xCB, 0xF6, 0xDF, 0xC6, 0x55, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xB5, 0xA7, 0xCA, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xD8, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xA4, 0x7B, 0xA4, 0xDC, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_5F[512] = { + 0x00, 0x00, 0xA5, 0xAF, 0xC9, 0xDD, 0x00, 0x00, /* 0x00-0x03 */ + 0xA7, 0xCB, 0xCA, 0xD2, 0x00, 0x00, 0xCE, 0xBB, /* 0x04-0x07 */ + 0xAB, 0xD9, 0x00, 0x00, 0xB9, 0xFA, 0xA4, 0x7C, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xA1, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0x49, 0xA4, 0x7D, /* 0x10-0x13 */ + 0xA4, 0xDD, 0xA4, 0xDE, 0x00, 0x00, 0xA5, 0xB1, /* 0x14-0x17 */ + 0xA5, 0xB0, 0x00, 0x00, 0xC9, 0xDE, 0xA6, 0xA2, /* 0x18-0x1B */ + 0x00, 0x00, 0xCA, 0xD3, 0x00, 0x00, 0xA7, 0xCC, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0x71, 0xCC, 0x72, /* 0x20-0x23 */ + 0xCC, 0x73, 0x00, 0x00, 0xA9, 0xB6, 0xA9, 0xB7, /* 0x24-0x27 */ + 0xCC, 0x70, 0xA9, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xAB, 0xDA, 0xCE, 0xBC, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD1, 0x7A, 0xAE, 0x7A, 0x00, 0x00, 0xD1, 0x79, /* 0x30-0x33 */ + 0x00, 0x00, 0xB1, 0x69, 0xD5, 0x4C, 0xB1, 0x6A, /* 0x34-0x37 */ + 0xD5, 0x4D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xB4, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDD, 0x62, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xBF, /* 0x40-0x43 */ + 0xE1, 0xBE, 0x00, 0x00, 0xB9, 0xFB, 0x00, 0x00, /* 0x44-0x47 */ + 0xBC, 0x75, 0xE5, 0x76, 0xBE, 0xCA, 0xE9, 0x74, /* 0x48-0x4B */ + 0xC0, 0xB1, 0x00, 0x00, 0xC5, 0x73, 0xF7, 0xD8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xCC, 0x74, 0x00, 0x00, 0xCE, 0xBD, 0xB1, 0x6B, /* 0x54-0x57 */ + 0xD8, 0xF4, 0xB7, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xC2, 0x55, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xA7, 0xCE, 0x00, 0x00, /* 0x60-0x63 */ + 0xA7, 0xCD, 0xAB, 0xDB, 0x00, 0x00, 0xD1, 0x7B, /* 0x64-0x67 */ + 0x00, 
0x00, 0xB1, 0x6D, 0xB3, 0x43, 0xB1, 0x6E, /* 0x68-0x6B */ + 0xB1, 0x6C, 0xB4, 0x5E, 0x00, 0x00, 0xE1, 0xC0, /* 0x6C-0x6F */ + 0xB9, 0xFC, 0xBC, 0x76, 0x00, 0x00, 0xC9, 0x4C, /* 0x70-0x73 */ + 0xC9, 0xDF, 0x00, 0x00, 0xCA, 0xD5, 0xA7, 0xCF, /* 0x74-0x77 */ + 0xCA, 0xD4, 0xA7, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xA9, 0xBC, 0xCC, 0x77, 0xCC, 0x76, 0xA9, 0xBB, /* 0x7C-0x7F */ + + 0xA9, 0xB9, 0xA9, 0xBA, 0xCC, 0x75, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xAB, 0xDD, 0xCE, 0xBE, 0xAB, 0xE0, /* 0x84-0x87 */ + 0xAB, 0xDC, 0xAB, 0xE2, 0xAB, 0xDE, 0xAB, 0xDF, /* 0x88-0x8B */ + 0xAB, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xAE, 0x7D, 0xAE, 0x7C, 0xAE, 0x7B, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0x4F, 0xB1, 0x6F, /* 0x94-0x97 */ + 0xB1, 0x72, 0xB1, 0x70, 0x00, 0x00, 0xD5, 0x4E, /* 0x98-0x9B */ + 0xB1, 0x75, 0x00, 0x00, 0xB1, 0x71, 0xD5, 0x50, /* 0x9C-0x9F */ + 0xB1, 0x74, 0xB1, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xD8, 0xF6, 0xD8, 0xF5, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xB4, 0x61, 0xB4, 0x5F, 0xB4, 0x60, 0xD8, 0xF7, /* 0xA8-0xAB */ + 0xB7, 0x4B, 0xDD, 0x64, 0xB7, 0x4C, 0xDD, 0x63, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0x77, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xBC, 0x78, 0xE1, 0xC1, 0xBC, 0x77, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xB9, 0xFD, 0x00, 0x00, 0xEC, 0xDE, /* 0xB8-0xBB */ + 0xE9, 0x75, 0xC0, 0xB2, 0xEC, 0xDD, 0xF2, 0x40, /* 0xBC-0xBF */ + 0xF4, 0x48, 0xF4, 0x49, 0x00, 0x00, 0xA4, 0xDF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xA5, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC9, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xA7, 0xD2, 0xA7, 0xD4, 0x00, 0x00, 0xC9, 0xE2, /* 0xCC-0xCF */ + 0xCA, 0xD8, 0xCA, 0xD7, 0xCA, 0xD6, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xC9, 0xE1, 0xC9, 0xE0, 0xA6, 0xA4, 0xA7, 0xD3, /* 0xD4-0xD7 */ + 0xA7, 0xD1, 0xA6, 0xA3, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0xA9, 0xBD, 0xCC, 0x78, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA9, 0xBE, 0xCA, 0xDD, 0x00, 0x00, 0xCA, 0xDF, /* 0xE0-0xE3 */ + 0xCA, 0xDE, 0xCC, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCA, 0xDA, 0x00, 0x00, 0xA7, 0xD8, 0xA7, 0xD6, /* 0xE8-0xEB */ + 0x00, 0x00, 0xCA, 0xD9, 0xCA, 0xDB, 0xCA, 0xE1, /* 0xEC-0xEF */ + 0x00, 0x00, 0xA7, 0xD5, 0x00, 0x00, 0xCA, 0xDC, /* 0xF0-0xF3 */ + 0xCA, 0xE5, 0xA9, 0xC0, 0x00, 0x00, 0xCA, 0xE2, /* 0xF4-0xF7 */ + 0xA7, 0xD7, 0x00, 0x00, 0xCA, 0xE0, 0xCA, 0xE3, /* 0xF8-0xFB */ + 0x00, 0x00, 0xA9, 0xBF, 0x00, 0x00, 0xA9, 0xC1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_60[512] = { + 0xCA, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCC, 0xAF, 0xCC, 0xA2, 0xCC, 0x7E, /* 0x08-0x0B */ + 0xCC, 0xAE, 0xCC, 0xA9, 0xAB, 0xE7, 0xA9, 0xC2, /* 0x0C-0x0F */ + 0xCC, 0xAA, 0xCC, 0xAD, 0xAB, 0xE3, 0xCC, 0xAC, /* 0x10-0x13 */ + 0xA9, 0xC3, 0xA9, 0xC8, 0xA9, 0xC6, 0xCC, 0xA3, /* 0x14-0x17 */ + 0x00, 0x00, 0xCC, 0x7C, 0xCC, 0xA5, 0xA9, 0xCD, /* 0x18-0x1B */ + 0xCC, 0xB0, 0xAB, 0xE4, 0xCC, 0xA6, 0x00, 0x00, /* 0x1C-0x1F */ + 0xAB, 0xE5, 0xA9, 0xC9, 0xCC, 0xA8, 0x00, 0x00, /* 0x20-0x23 */ + 0xCE, 0xCD, 0xAB, 0xE6, 0xCC, 0x7B, 0xA9, 0xCA, /* 0x24-0x27 */ + 0xAB, 0xE8, 0xA9, 0xCB, 0xA9, 0xC7, 0xA9, 0xCC, /* 0x28-0x2B */ + 0xCC, 0xA7, 0xCC, 0x7A, 0xCC, 0xAB, 0xA9, 0xC4, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0x7D, 0xCC, 0xA4, /* 0x30-0x33 */ + 0xCC, 0xA1, 0xA9, 0xC5, 0x00, 0x00, 0xCE, 0xBF, /* 0x34-0x37 */ + 0x00, 0x00, 0xCE, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xCE, 0xCA, 0xD1, 0xA1, 0xCE, 0xCB, 0xAB, 0xEE, /* 0x40-0x43 */ + 0xCE, 0xCE, 0xCE, 0xC4, 0xAB, 0xED, 0xCE, 0xC6, /* 0x44-0x47 */ + 0x00, 0x00, 0xCE, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xCE, 0xC9, 0xAB, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xAE, 0xA3, 0x00, 0x00, 0xF9, 0xDA, 0xCE, 0xC5, /* 0x50-0x53 */ + 0xCE, 0xC1, 0xAE, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xCE, 0xCF, 0xAE, 0x7E, 0xD1, 0x7D, 0xCE, 0xC8, /* 0x58-0x5B */ + 0x00, 0x00, 0xD1, 0x7C, 0xCE, 0xC3, 0xCE, 0xCC, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0xEC, 0xAE, 0xA1, /* 0x60-0x63 */ + 0xAB, 0xF2, 0xAE, 0xA2, 0xCE, 0xD0, 0xD1, 0x7E, /* 0x64-0x67 */ + 0xAB, 0xEB, 0xAE, 0xA6, 0xAB, 0xF1, 0xAB, 0xF0, /* 0x68-0x6B */ + 0xAB, 0xEF, 0xAE, 0xA5, 0xCE, 0xD1, 0xAE, 0xA7, /* 0x6C-0x6F */ + 0xAB, 0xEA, 0x00, 0x00, 0xCE, 0xC2, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0x76, /* 0x7C-0x7F */ + + 0xD1, 0xA4, 0xD1, 0xA6, 0x00, 0x00, 0xD1, 0xA8, /* 0x80-0x83 */ + 0xAE, 0xA8, 0xAE, 0xAE, 0xD5, 0x53, 0xD1, 0xAC, /* 0x84-0x87 */ + 0xD1, 0xA3, 0xB1, 0x78, 0xD5, 0x51, 0x00, 0x00, /* 0x88-0x8B */ + 0xAE, 0xAD, 0xAE, 0xAB, 0xD1, 0xAE, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD5, 0x52, 0x00, 0x00, 0xD1, 0xA5, 0x00, 0x00, /* 0x90-0x93 */ + 0xAE, 0xAC, 0xD1, 0xA9, 0xAE, 0xAF, 0xD1, 0xAB, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xAE, 0xAA, 0xD1, 0xAA, /* 0x98-0x9B */ + 0xD1, 0xAD, 0xD1, 0xA7, 0x00, 0x00, 0xAE, 0xA9, /* 0x9C-0x9F */ + 0xB1, 0x79, 0x00, 0x00, 0xD1, 0xA2, 0xB1, 0x77, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xB1, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xD5, 0x55, 0xD5, 0x5E, 0xB4, 0x64, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xB1, 0x7C, 0xB1, 0xA3, 0xB4, 0x65, 0xD5, 0x60, /* 0xB4-0xB7 */ + 0xB1, 0xAA, 0xD8, 0xF9, 0xD5, 0x56, 0xB1, 0xA2, /* 0xB8-0xBB */ + 0xB1, 0xA5, 0xB1, 0x7E, 0xD5, 0x54, 0xD5, 0x62, /* 0xBC-0xBF */ + 0xD5, 0x65, 0xD9, 0x49, 0x00, 0x00, 0xD5, 0x63, /* 0xC0-0xC3 */ + 0xD8, 0xFD, 0xB1, 0xA1, 0xB1, 0xA8, 0xB1, 0xAC, /* 0xC4-0xC7 */ + 0xD5, 0x5D, 0xD8, 0xF8, 0xD5, 0x61, 0xB1, 0x7B, /* 0xC8-0xCB */ + 0xD8, 0xFA, 0xD5, 0x64, 0xD8, 0xFC, 0xD5, 0x59, /* 0xCC-0xCF */ + 0x00, 0x00, 0xB4, 0x62, 0x00, 0x00, 0xD5, 0x57, /* 0xD0-0xD3 */ + 0xD5, 0x58, 0xB1, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xB1, 0xA6, 0xD5, 0x5B, 0xB1, 0xAB, 0xD5, 0x5F, /* 0xD8-0xDB */ + 0xB1, 0xA4, 0xD5, 0x5C, 0x00, 0x00, 0xB1, 0xA9, /* 0xDC-0xDF */ + 0xB4, 0x66, 0xB4, 0x63, 0xD8, 0xFB, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xD5, 0x5A, 0x00, 0x00, 0xB1, 0x7D, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xB4, 0x6B, 0xB4, 0x6F, 0xD9, 0x40, 0xB7, 0x51, /* 0xF0-0xF3 */ + 0xB4, 0x6D, 0xD9, 0x44, 0xB4, 0x71, 0xDD, 0x65, /* 0xF4-0xF7 */ + 0xD9, 0x46, 0xB7, 0x53, 0xB4, 0x69, 0xB4, 0x6C, /* 0xF8-0xFB */ + 0xD9, 0x47, 0x00, 0x00, 0xD9, 0x48, 0xD9, 0x4E, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_61[512] = { + 0xB4, 0x73, 0xB7, 0x54, 0x00, 0x00, 0xD9, 0x4A, /* 0x00-0x03 */ + 0xD9, 0x4F, 0xD9, 0x43, 0xB7, 0x5E, 0x00, 0x00, /* 0x04-0x07 */ + 0xB7, 0x55, 0xB4, 0x72, 0xD9, 0x41, 0xD9, 0x50, /* 0x08-0x0B */ + 0x00, 0x00, 0xB7, 0x5D, 0xB4, 0x70, 0xB7, 0x4E, /* 0x0C-0x0F */ + 0xD9, 
0x4D, 0x00, 0x00, 0xB4, 0x74, 0xD9, 0x45, /* 0x10-0x13 */ + 0xD8, 0xFE, 0xB4, 0x6A, 0xD9, 0x42, 0x00, 0x00, /* 0x14-0x17 */ + 0xD9, 0x4B, 0x00, 0x00, 0xB7, 0x4D, 0xB7, 0x52, /* 0x18-0x1B */ + 0xB4, 0x67, 0xD9, 0x4C, 0x00, 0x00, 0xB7, 0x50, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x68, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB7, 0x5C, /* 0x24-0x27 */ + 0xE1, 0xC3, 0xDD, 0x70, 0x00, 0x00, 0xDD, 0x68, /* 0x28-0x2B */ + 0xE1, 0xC2, 0x00, 0x00, 0xDD, 0x6C, 0xDD, 0x6E, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xDD, 0x6B, 0x00, 0x00, /* 0x30-0x33 */ + 0xB7, 0x5B, 0x00, 0x00, 0xDD, 0x6A, 0xB7, 0x5F, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xD2, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0x5A, 0xBA, 0x40, /* 0x3C-0x3F */ + 0xDD, 0x71, 0xE1, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xB7, 0x58, 0xDD, 0x69, 0xDD, 0x6D, 0xB9, 0xFE, /* 0x44-0x47 */ + 0xB7, 0x4F, 0xDD, 0x66, 0xDD, 0x67, 0xBA, 0x41, /* 0x48-0x4B */ + 0xB7, 0x57, 0xB7, 0x59, 0xB7, 0x56, 0xDD, 0x6F, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xC8, 0xE1, 0xC9, /* 0x50-0x53 */ + 0xE1, 0xCE, 0xBC, 0x7D, 0xE1, 0xD5, 0x00, 0x00, /* 0x54-0x57 */ + 0xBA, 0x47, 0x00, 0x00, 0xBA, 0x46, 0xE1, 0xD0, /* 0x58-0x5B */ + 0x00, 0x00, 0xBC, 0x7C, 0xE1, 0xC5, 0xBA, 0x45, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE1, 0xD4, 0xBA, 0x43, 0xBA, 0x44, /* 0x60-0x63 */ + 0x00, 0x00, 0xE1, 0xD1, 0xE5, 0xAA, 0xBC, 0x7A, /* 0x64-0x67 */ + 0xB4, 0x6E, 0x00, 0x00, 0xE1, 0xD3, 0xBC, 0xA3, /* 0x68-0x6B */ + 0xE1, 0xCB, 0x00, 0x00, 0xBC, 0x7B, 0x00, 0x00, /* 0x6C-0x6F */ + 0xBC, 0xA2, 0xE1, 0xC6, 0xE1, 0xCA, 0xE1, 0xC7, /* 0x70-0x73 */ + 0xE1, 0xCD, 0xBA, 0x48, 0xBC, 0x79, 0xBA, 0x42, /* 0x74-0x77 */ + 0x00, 0x00, 0xE5, 0x7A, 0xE1, 0xCF, 0x00, 0x00, /* 0x78-0x7B */ + 0xBC, 0xA1, 0x00, 0x00, 0xBC, 0xA4, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE1, 0xCC, 0x00, 0x00, 0xBC, 0x7E, 0xE5, 0x79, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xE5, 0x7E, 0xBE, 0xCE, 0xE5, 0x78, /* 0x88-0x8B */ + 0xE9, 0xA3, 0xE5, 0xA9, 0xBC, 0xA8, 0x00, 0x00, /* 0x8C-0x8F */ + 0xBC, 0xA6, 0xBE, 0xCC, 0xE5, 0xA6, 0xE5, 0xA2, /* 0x90-0x93 */ + 0xBC, 0xAC, 0x00, 0x00, 0xE9, 0x78, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0xAA, 0xE5, 0xA1, /* 0x98-0x9B */ + 0x00, 0x00, 0xE9, 0x76, 0x00, 0x00, 0xE5, 0xA5, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE5, 0xA8, 0xE5, 0x7D, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xBC, 0xAB, 0x00, 0x00, 0x00, 0x00, 0xBC, 0xA5, /* 0xA4-0xA7 */ + 0xE9, 0x77, 0xBE, 0xCD, 0xE5, 0xA7, 0xBC, 0xA7, /* 0xA8-0xAB */ + 0xBC, 0xA9, 0xE5, 0xA4, 0xBC, 0xAD, 0xE5, 0xA3, /* 0xAC-0xAF */ + 0xE5, 0x7C, 0xE5, 0x7B, 0xBE, 0xCB, 0xE5, 0xAB, /* 0xB0-0xB3 */ + 0xE9, 0x7A, 0xEC, 0xE0, 0xBE, 0xD0, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE9, 0xA2, 0x00, 0x00, 0xE9, 0x7E, 0x00, 0x00, /* 0xB8-0xBB */ + 0xEC, 0xE1, 0x00, 0x00, 0xBE, 0xD1, 0xE9, 0xA1, /* 0xBC-0xBF */ + 0x00, 0x00, 0xE9, 0x7C, 0xC0, 0xB4, 0xEC, 0xDF, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE9, 0x79, 0xE9, 0x7B, 0xC0, 0xB5, /* 0xC4-0xC7 */ + 0xBE, 0xD3, 0xC0, 0xB3, 0xBE, 0xD2, 0xC0, 0xB7, /* 0xC8-0xCB */ + 0xE9, 0x7D, 0xBE, 0xCF, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0xCF, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEF, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xE7, 0xEF, 0xC8, /* 0xDC-0xDF */ + 0xEC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x56, /* 0xE0-0xE3 */ + 0xEC, 0xE5, 0xEC, 0xE4, 0xC0, 0xB6, 0xEC, 0xE2, /* 0xE4-0xE7 
*/ + 0xEC, 0xE6, 0xEF, 0xD0, 0xEF, 0xCC, 0xEF, 0xCE, /* 0xE8-0xEB */ + 0x00, 0x00, 0xEF, 0xC9, 0xEF, 0xCA, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEF, 0xCD, 0xEF, 0xCB, 0xC3, 0x67, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xC3, 0x6A, 0xC3, 0x69, 0xC3, 0x68, /* 0xF4-0xF7 */ + 0xC4, 0x61, 0xF4, 0x4A, 0xC4, 0x62, 0xF2, 0x41, /* 0xF8-0xFB */ + 0xC4, 0xDF, 0xF5, 0xCC, 0xC4, 0xE0, 0xC5, 0x74, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_62[512] = { + 0xC5, 0xCA, 0xF7, 0xD9, 0x00, 0x00, 0xF7, 0xDA, /* 0x00-0x03 */ + 0xF7, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBA, /* 0x04-0x07 */ + 0xA4, 0xE0, 0xC9, 0x7C, 0xA5, 0xB3, 0x00, 0x00, /* 0x08-0x0B */ + 0xA6, 0xA6, 0xA6, 0xA7, 0xA6, 0xA5, 0x00, 0x00, /* 0x0C-0x0F */ + 0xA6, 0xA8, 0xA7, 0xDA, 0xA7, 0xD9, 0x00, 0x00, /* 0x10-0x13 */ + 0xCC, 0xB1, 0xA9, 0xCF, 0xA9, 0xCE, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xD1, 0xAF, 0xB1, 0xAD, 0xB1, 0xAE, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB4, 0x75, /* 0x1C-0x1F */ + 0xDD, 0x72, 0xB7, 0x60, 0xB7, 0x61, 0xDD, 0x74, /* 0x20-0x23 */ + 0xDD, 0x76, 0xDD, 0x75, 0x00, 0x00, 0xE1, 0xD7, /* 0x24-0x27 */ + 0x00, 0x00, 0xE1, 0xD6, 0xBA, 0x49, 0xE1, 0xD8, /* 0x28-0x2B */ + 0x00, 0x00, 0xE5, 0xAC, 0xBC, 0xAE, 0x00, 0x00, /* 0x2C-0x2F */ + 0xBE, 0xD4, 0x00, 0x00, 0xC0, 0xB8, 0xC2, 0x57, /* 0x30-0x33 */ + 0xC0, 0xB9, 0x00, 0x00, 0xA4, 0xE1, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0xE6, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xCC, 0xB2, 0xA9, 0xD1, 0xA9, 0xD0, /* 0x3C-0x3F */ + 0xA9, 0xD2, 0xAB, 0xF3, 0xCE, 0xD2, 0xCE, 0xD3, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xB0, 0xAE, 0xB0, /* 0x44-0x47 */ + 0xB1, 0xAF, 0xB4, 0x76, 0xD9, 0x51, 0xA4, 0xE2, /* 0x48-0x4B */ + 0x00, 0x00, 0xA4, 0x7E, 0xA4, 0xE3, 0x00, 0x00, /* 0x4C-0x4F */ + 0xC9, 0x7D, 0xA5, 0xB7, 0xA5, 0xB6, 0xA5, 0xB4, /* 0x50-0x53 */ + 0xA5, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xA6, 0xAB, 0xC9, 0xE9, 0xC9, 0xEB, 0xA6, 0xAA, /* 0x58-0x5B */ + 0xC9, 0xE3, 0x00, 0x00, 0xC9, 0xE4, 0x00, 0x00, /* 0x5C-0x5F */ + 0xC9, 0xEA, 0xC9, 0xE6, 0xC9, 0xE8, 0xA6, 0xA9, /* 0x60-0x63 */ + 0xC9, 0xE5, 0xC9, 0xEC, 0xC9, 0xE7, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xA7, 0xE1, 0xA7, 0xEA, 0xA7, 0xE8, /* 0x6C-0x6F */ + 0xCA, 0xF0, 0xCA, 0xED, 0xCA, 0xF5, 0xA7, 0xE6, /* 0x70-0x73 */ + 0xCA, 0xF6, 0x00, 0x00, 0xA7, 0xDF, 0xCA, 0xF3, /* 0x74-0x77 */ + 0x00, 0x00, 0xA7, 0xE5, 0xCA, 0xEF, 0xCA, 0xEE, /* 0x78-0x7B */ + 0xA7, 0xE3, 0xCA, 0xF4, 0xA7, 0xE4, 0xA9, 0xD3, /* 0x7C-0x7F */ + + 0xA7, 0xDE, 0xCA, 0xF1, 0x00, 0x00, 0xCA, 0xE7, /* 0x80-0x83 */ + 0xA7, 0xDB, 0x00, 0x00, 0xA7, 0xEE, 0xCA, 0xEC, /* 0x84-0x87 */ + 0xCA, 0xF2, 0xA7, 0xE0, 0xA7, 0xE2, 0x00, 0x00, /* 0x88-0x8B */ + 0xCA, 0xE8, 0x00, 0x00, 0xCA, 0xE9, 0xCA, 0xEA, /* 0x8C-0x8F */ + 0x00, 0x00, 0xA7, 0xED, 0xA7, 0xE7, 0xA7, 0xEC, /* 0x90-0x93 */ + 0xCA, 0xEB, 0xA7, 0xEB, 0xA7, 0xDD, 0xA7, 0xDC, /* 0x94-0x97 */ + 0xA7, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xA9, 0xE1, 0xCC, 0xBE, 0xCC, 0xB7, 0xA9, 0xDC, /* 0xA8-0xAB */ + 0xA9, 0xEF, 0xCC, 0xB3, 0xCC, 0xBA, 0xCC, 0xBC, /* 0xAC-0xAF */ + 0xCC, 0xBF, 0xA9, 0xEA, 0x00, 0x00, 0xCC, 0xBB, /* 0xB0-0xB3 */ + 0xCC, 0xB4, 0xA9, 0xE8, 0xCC, 0xB8, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xCC, 0xC0, 0xA9, 0xD9, 0x00, 0x00, 0xCC, 0xBD, /* 0xB8-0xBB */ + 
0xA9, 0xE3, 0xA9, 0xE2, 0xCC, 0xB6, 0xA9, 0xD7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xD8, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA9, 0xD6, 0x00, 0x00, 0xA9, 0xEE, 0xA9, 0xE6, /* 0xC4-0xC7 */ + 0xA9, 0xE0, 0xA9, 0xD4, 0xCC, 0xB9, 0xA9, 0xDF, /* 0xC8-0xCB */ + 0xA9, 0xD5, 0xA9, 0xE7, 0xA9, 0xF0, 0xCE, 0xD4, /* 0xCC-0xCF */ + 0xA9, 0xE4, 0xCC, 0xB5, 0xA9, 0xDA, 0xA9, 0xDD, /* 0xD0-0xD3 */ + 0xA9, 0xDE, 0x00, 0x00, 0xA9, 0xEC, 0xA9, 0xED, /* 0xD4-0xD7 */ + 0xA9, 0xEB, 0xA9, 0xE5, 0xA9, 0xE9, 0xA9, 0xDB, /* 0xD8-0xDB */ + 0xAB, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xDA, /* 0xE8-0xEB */ + 0xAC, 0x41, 0xAB, 0xF8, 0xAB, 0xFA, 0xAC, 0x40, /* 0xEC-0xEF */ + 0xCE, 0xE6, 0xAB, 0xFD, 0xD1, 0xB1, 0xAE, 0xB1, /* 0xF0-0xF3 */ + 0xAC, 0x43, 0xCE, 0xD7, 0xCE, 0xDF, 0xAB, 0xFE, /* 0xF4-0xF7 */ + 0xCE, 0xDE, 0xCE, 0xDB, 0xCE, 0xE3, 0xCE, 0xE5, /* 0xF8-0xFB */ + 0xAB, 0xF7, 0xAB, 0xFB, 0xAC, 0x42, 0xAE, 0xB3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_63[512] = { + 0xCE, 0xE0, 0xAB, 0xF9, 0xAC, 0x45, 0xCE, 0xD9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAB, 0xFC, /* 0x04-0x07 */ + 0xAE, 0xB2, 0xAB, 0xF6, 0x00, 0x00, 0xCE, 0xD6, /* 0x08-0x0B */ + 0xCE, 0xDD, 0xCE, 0xD5, 0xCE, 0xD8, 0xCE, 0xDC, /* 0x0C-0x0F */ + 0xD1, 0xB2, 0xAC, 0x44, 0x00, 0x00, 0xCE, 0xE1, /* 0x10-0x13 */ + 0xCE, 0xE2, 0xCE, 0xE4, 0xAB, 0xF5, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xAE, 0xC1, 0xD1, 0xBE, 0xAE, 0xBF, 0xAE, 0xC0, /* 0x28-0x2B */ + 0xD1, 0xB4, 0xD1, 0xC4, 0x00, 0x00, 0xAE, 0xB6, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD5, 0x66, 0xD1, 0xC6, /* 0x30-0x33 */ + 0xD1, 0xC0, 0x00, 0x00, 0xD1, 0xB7, 0x00, 0x00, /* 0x34-0x37 */ + 0xD1, 0xC9, 0xD1, 0xBA, 0xAE, 0xBC, 0xD5, 0x7D, /* 0x38-0x3B */ + 0xD1, 0xBD, 0xAE, 0xBE, 0xAE, 0xB5, 0x00, 0x00, /* 0x3C-0x3F */ + 0xD1, 0xCB, 0xD1, 0xBF, 0xAE, 0xB8, 0xD1, 0xB8, /* 0x40-0x43 */ + 0xD1, 0xB5, 0xD1, 0xB6, 0xAE, 0xB9, 0xD1, 0xC5, /* 0x44-0x47 */ + 0xD1, 0xCC, 0xAE, 0xBB, 0xD1, 0xBC, 0xD1, 0xBB, /* 0x48-0x4B */ + 0xAE, 0xC3, 0xAE, 0xC2, 0xAE, 0xB4, 0xAE, 0xBA, /* 0x4C-0x4F */ + 0xAE, 0xBD, 0xD1, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xD1, 0xC2, 0xAE, 0xB7, 0xD1, 0xB3, 0xD1, 0xCA, /* 0x54-0x57 */ + 0xD1, 0xC1, 0xD1, 0xC3, 0xD1, 0xC7, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xD5, 0x67, 0x00, 0x00, 0xB1, 0xB7, /* 0x64-0x67 */ + 0xB1, 0xCB, 0xB1, 0xCA, 0x00, 0x00, 0xB1, 0xBF, /* 0x68-0x6B */ + 0x00, 0x00, 0xD5, 0x79, 0xD5, 0x75, 0xD5, 0x72, /* 0x6C-0x6F */ + 0xD5, 0xA6, 0xB1, 0xBA, 0xB1, 0xB2, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xD5, 0x77, 0xB4, 0xA8, 0xB1, 0xB6, /* 0x74-0x77 */ + 0xD5, 0xA1, 0x00, 0x00, 0xB1, 0xCC, 0xB1, 0xC9, /* 0x78-0x7B */ + 0xD5, 0x7B, 0xD5, 0x6A, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xB1, 0xC8, 0xD5, 0xA3, 0xD5, 0x69, 0xB1, 0xBD, /* 0x80-0x83 */ + 0xB1, 0xC1, 0xD5, 0xA2, 0x00, 0x00, 0xD5, 0x73, /* 0x84-0x87 */ + 0xB1, 0xC2, 0xB1, 0xBC, 0xD5, 0x68, 0x00, 0x00, /* 0x88-0x8B */ + 0xB4, 0x78, 0xD5, 0xA5, 0xD5, 0x71, 0xB1, 0xC7, /* 0x8C-0x8F */ + 
0xD5, 0x74, 0xD5, 0xA4, 0xB1, 0xC6, 0x00, 0x00, /* 0x90-0x93 */ + 0xD9, 0x52, 0x00, 0x00, 0xB1, 0xB3, 0xD5, 0x6F, /* 0x94-0x97 */ + 0xB1, 0xB8, 0xB1, 0xC3, 0x00, 0x00, 0xB1, 0xBE, /* 0x98-0x9B */ + 0xD5, 0x78, 0xD5, 0x6E, 0xD5, 0x6C, 0xD5, 0x7E, /* 0x9C-0x9F */ + 0xB1, 0xB0, 0xB1, 0xC4, 0xB1, 0xB4, 0xB4, 0x77, /* 0xA0-0xA3 */ + 0xD5, 0x7C, 0xB1, 0xB5, 0x00, 0x00, 0xB1, 0xB1, /* 0xA4-0xA7 */ + 0xB1, 0xC0, 0xB1, 0xBB, 0xB1, 0xB9, 0xD5, 0x70, /* 0xA8-0xAB */ + 0xB1, 0xC5, 0xD5, 0x6D, 0xD5, 0x7A, 0xD5, 0x76, /* 0xAC-0xAF */ + 0xD9, 0x54, 0xD9, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD5, 0x6B, 0xD9, 0x64, 0x00, 0x00, /* 0xBC-0xBF */ + 0xB4, 0x7A, 0x00, 0x00, 0xD9, 0x6A, 0xD9, 0x59, /* 0xC0-0xC3 */ + 0xD9, 0x67, 0xDD, 0x77, 0xB4, 0x7D, 0xD9, 0x6B, /* 0xC4-0xC7 */ + 0xD9, 0x6E, 0xB4, 0x7C, 0xD9, 0x5C, 0xD9, 0x6D, /* 0xC8-0xCB */ + 0xD9, 0x6C, 0xB4, 0x7E, 0xD9, 0x55, 0xB4, 0x79, /* 0xCC-0xCF */ + 0xB4, 0xA3, 0x00, 0x00, 0xB4, 0xA1, 0xD9, 0x69, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xD9, 0x5F, 0xB4, 0xA5, 0xD9, 0x70, /* 0xD4-0xD7 */ + 0xD9, 0x68, 0xD9, 0x71, 0xB4, 0xAD, 0xB4, 0xAB, /* 0xD8-0xDB */ + 0xD9, 0x66, 0xD9, 0x65, 0x00, 0x00, 0xD9, 0x63, /* 0xDC-0xDF */ + 0xD9, 0x5D, 0xB4, 0xA4, 0x00, 0x00, 0xB4, 0xA2, /* 0xE0-0xE3 */ + 0xD1, 0xB9, 0xD9, 0x56, 0x00, 0x00, 0xDD, 0xB7, /* 0xE4-0xE7 */ + 0xD9, 0x57, 0xB4, 0x7B, 0xB4, 0xAA, 0xDD, 0x79, /* 0xE8-0xEB */ + 0x00, 0x00, 0xB4, 0xA6, 0xB4, 0xA7, 0xD9, 0x58, /* 0xEC-0xEF */ + 0xD9, 0x6F, 0xDD, 0x78, 0xD9, 0x60, 0xD9, 0x5B, /* 0xF0-0xF3 */ + 0xB4, 0xA9, 0xD9, 0x61, 0xD9, 0x5E, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xB4, 0xAE, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_64[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0x70, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xDD, 0x7C, 0xDD, 0xB1, 0xDD, 0xB6, /* 0x08-0x0B */ + 0xDD, 0xAA, 0xB7, 0x6C, 0xDD, 0xBB, 0xB7, 0x69, /* 0x0C-0x0F */ + 0xDD, 0x7A, 0x00, 0x00, 0xDD, 0x7B, 0xB7, 0x62, /* 0x10-0x13 */ + 0xB7, 0x6B, 0xDD, 0xA4, 0xB7, 0x6E, 0xB7, 0x6F, /* 0x14-0x17 */ + 0xDD, 0xA5, 0x00, 0x00, 0xDD, 0xB2, 0xDD, 0xB8, /* 0x18-0x1B */ + 0xB7, 0x6A, 0x00, 0x00, 0xB7, 0x64, 0xDD, 0xA3, /* 0x1C-0x1F */ + 0xDD, 0x7D, 0xDD, 0xBA, 0xDD, 0xA8, 0xDD, 0xA9, /* 0x20-0x23 */ + 0xDD, 0x7E, 0xDD, 0xB4, 0xDD, 0xAB, 0xDD, 0xB5, /* 0x24-0x27 */ + 0xDD, 0xAD, 0x00, 0x00, 0xB7, 0x65, 0xE1, 0xD9, /* 0x28-0x2B */ + 0xB7, 0x68, 0xB7, 0x66, 0xDD, 0xB9, 0xDD, 0xB0, /* 0x2C-0x2F */ + 0xDD, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xA1, /* 0x30-0x33 */ + 0xBA, 0x53, 0xDD, 0xAF, 0xB7, 0x6D, 0xDD, 0xA7, /* 0x34-0x37 */ + 0x00, 0x00, 0xDD, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB7, 0x67, 0xB7, 0x63, 0xE1, 0xEE, /* 0x3C-0x3F */ + 0xDD, 0xB3, 0xDD, 0xAE, 0x00, 0x00, 0xDD, 0xA2, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE9, /* 0x48-0x4B */ + 0x00, 0x00, 0xE1, 0xDA, 0xE1, 0xE5, 0x00, 0x00, /* 0x4C-0x4F */ + 0xE1, 0xEC, 0xBA, 0x51, 0xB4, 0xAC, 0xE1, 0xEA, /* 0x50-0x53 */ + 0xBA, 0x4C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xBA, 0x4B, 0xE1, 0xF1, 0x00, 0x00, 0xE1, 0xDB, /* 0x58-0x5B */ + 0xE1, 0xE8, 0xE1, 0xDC, 0xE1, 0xE7, 0xBA, 0x4F, /* 0x5C-0x5F */ + 0xE1, 0xEB, 0xD9, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 
0x00, 0xE1, 0xF2, 0xE1, 0xE3, 0xBA, 0x52, /* 0x64-0x67 */ + 0xE5, 0xBA, 0xBC, 0xAF, 0x00, 0x00, 0xE1, 0xF0, /* 0x68-0x6B */ + 0xE1, 0xEF, 0xBA, 0x54, 0xE5, 0xAD, 0xBC, 0xB0, /* 0x6C-0x6F */ + 0xE5, 0xAE, 0x00, 0x00, 0xE1, 0xDF, 0xE1, 0xE0, /* 0x70-0x73 */ + 0xE1, 0xDD, 0xE1, 0xE2, 0xE1, 0xDE, 0xE1, 0xF3, /* 0x74-0x77 */ + 0xBA, 0x4E, 0xBC, 0xB1, 0xBA, 0x50, 0xBA, 0x55, /* 0x78-0x7B */ + 0x00, 0x00, 0xE1, 0xE1, 0x00, 0x00, 0xE1, 0xED, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE1, 0xE6, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xE5, 0xB1, 0x00, 0x00, 0xBA, 0x4A, /* 0x84-0x87 */ + 0xBC, 0xB4, 0xE9, 0xAA, 0xE5, 0xB6, 0xE5, 0xB5, /* 0x88-0x8B */ + 0xE5, 0xB7, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xB4, /* 0x8C-0x8F */ + 0xBC, 0xB5, 0x00, 0x00, 0xBC, 0xBB, 0xBC, 0xB8, /* 0x90-0x93 */ + 0x00, 0x00, 0xBC, 0xB9, 0xE5, 0xAF, 0xE5, 0xB2, /* 0x94-0x97 */ + 0xE5, 0xBC, 0xBC, 0xC1, 0xBC, 0xBF, 0x00, 0x00, /* 0x98-0x9B */ + 0xE5, 0xB3, 0xD9, 0x5A, 0xBC, 0xB2, 0xE5, 0xB9, /* 0x9C-0x9F */ + 0xE5, 0xB0, 0x00, 0x00, 0xBC, 0xC2, 0xE5, 0xB8, /* 0xA0-0xA3 */ + 0xBA, 0x4D, 0xBC, 0xB7, 0xE1, 0xE4, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xBC, 0xBA, 0x00, 0x00, 0xBC, 0xBE, /* 0xA8-0xAB */ + 0xBC, 0xC0, 0xBC, 0xBD, 0xBC, 0xBC, 0x00, 0x00, /* 0xAC-0xAF */ + 0xBC, 0xB6, 0xE5, 0xBB, 0xBC, 0xB3, 0xBC, 0xC3, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBE, 0xD8, /* 0xB8-0xBB */ + 0xBE, 0xD9, 0xE9, 0xA9, 0xBE, 0xE2, 0xBE, 0xDF, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBE, 0xD6, 0xBE, 0xDD, 0xE9, 0xAB, /* 0xC0-0xC3 */ + 0xBE, 0xDB, 0xBE, 0xD5, 0x00, 0x00, 0xBE, 0xDC, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE9, 0xA8, 0xC0, 0xBB, 0xBE, 0xD7, /* 0xC8-0xCB */ + 0x00, 0x00, 0xBE, 0xDE, 0xC0, 0xBA, 0xE9, 0xA7, /* 0xCC-0xCF */ + 0xE9, 0xA6, 0x00, 0x00, 0xBE, 0xE0, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xBE, 0xE1, 0x00, 0x00, 0xE9, 0xA5, 0xE9, 0xA4, /* 0xD4-0xD7 */ + 0xC0, 0xBC, 0xE9, 0xAE, 0xBE, 0xDA, 0xE9, 0xAC, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xC0, 0xBD, 0x00, 0x00, 0xC0, 0xC2, 0xEC, 0xEA, /* 0xE0-0xE3 */ + 0xEC, 0xEC, 0x00, 0x00, 0xC0, 0xBF, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xEC, 0xED, 0xEC, 0xE9, 0x00, 0x00, 0xEC, 0xEB, /* 0xE8-0xEB */ + 0xC0, 0xC0, 0xC0, 0xC3, 0x00, 0x00, 0xEC, 0xE8, /* 0xEC-0xEF */ + 0xC0, 0xBE, 0xC0, 0xC1, 0xC2, 0x59, 0xE9, 0xAD, /* 0xF0-0xF3 */ + 0xC2, 0x58, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x5E, /* 0xF4-0xF7 */ + 0xEF, 0xD4, 0x00, 0x00, 0xC2, 0x5C, 0xC2, 0x5D, /* 0xF8-0xFB */ + 0xEF, 0xD7, 0xEF, 0xD3, 0xC2, 0x5A, 0xEF, 0xD1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_65[512] = { + 0xC3, 0x6B, 0xEF, 0xD5, 0x00, 0x00, 0xEF, 0xD6, /* 0x00-0x03 */ + 0xEF, 0xD2, 0x00, 0x00, 0xC2, 0x5B, 0xF2, 0x42, /* 0x04-0x07 */ + 0x00, 0x00, 0xF2, 0x45, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xF2, 0x46, 0xF2, 0x44, 0xF2, 0x47, 0xC3, 0x6C, /* 0x0C-0x0F */ + 0xF2, 0x43, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x4E, /* 0x10-0x13 */ + 0xC4, 0x64, 0xF4, 0x4D, 0xF4, 0x4C, 0xF4, 0x4B, /* 0x14-0x17 */ + 0xC4, 0x63, 0xC4, 0x65, 0x00, 0x00, 0xF5, 0xCD, /* 0x18-0x1B */ + 0xC4, 0xE2, 0xC4, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF6, 0xE1, 0xF6, 0xE0, 0xF6, 0xE3, 0xC5, 0xCB, /* 0x20-0x23 */ + 0xC5, 0x75, 0xF7, 0xDD, 0xF6, 0xE2, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xF7, 0xDC, 0xC5, 0xCD, 0xC5, 0xCC, /* 0x28-0x2B */ + 0xC5, 0xF3, 0xF8, 0xA9, 0xF8, 0xEF, 0xA4, 0xE4, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0x72, 0xE9, 0xAF, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0xAC, 0xCA, 0xF7, /* 0x34-0x37 */ + 0xA7, 
0xF1, 0xA7, 0xEF, 0x00, 0x00, 0xA7, 0xF0, /* 0x38-0x3B */ + 0x00, 0x00, 0xCC, 0xC1, 0xA9, 0xF1, 0xAC, 0x46, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCE, 0xE7, 0x00, 0x00, 0xCE, 0xE8, /* 0x40-0x43 */ + 0x00, 0x00, 0xAC, 0x47, 0xD1, 0xCE, 0x00, 0x00, /* 0x44-0x47 */ + 0xAE, 0xC4, 0xAE, 0xC5, 0xD1, 0xCD, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xD3, /* 0x4C-0x4F */ + 0x00, 0x00, 0xB1, 0xCF, 0x00, 0x00, 0xD5, 0xA7, /* 0x50-0x53 */ + 0xB1, 0xD6, 0xB1, 0xD5, 0xB1, 0xCE, 0xB1, 0xD1, /* 0x54-0x57 */ + 0xB1, 0xD4, 0xB1, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xD9, 0x76, 0xB1, 0xCD, 0xB4, 0xAF, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xB4, 0xB1, 0xB4, 0xB2, /* 0x60-0x63 */ + 0xD9, 0x75, 0xD9, 0x78, 0xB4, 0xB0, 0xD9, 0x73, /* 0x64-0x67 */ + 0xD9, 0x77, 0x00, 0x00, 0xD9, 0x74, 0x00, 0x00, /* 0x68-0x6B */ + 0xB7, 0x71, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xBC, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0x56, 0xE1, 0xF4, /* 0x70-0x73 */ + 0xBE, 0xE3, 0xBC, 0xC4, 0xE5, 0xBD, 0xBC, 0xC5, /* 0x74-0x77 */ + 0xBC, 0xC6, 0xE5, 0xBF, 0xE5, 0xBE, 0xE5, 0xC0, /* 0x78-0x7B */ + 0xE9, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB0, /* 0x7C-0x7F */ + + 0xEC, 0xEF, 0xEC, 0xEE, 0xC0, 0xC4, 0xC0, 0xC5, /* 0x80-0x83 */ + 0xF2, 0x48, 0x00, 0x00, 0x00, 0x00, 0xA4, 0xE5, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xD9, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xB4, 0xB4, 0xB4, 0xB3, 0xDD, 0xBD, 0x00, 0x00, /* 0x90-0x93 */ + 0xEF, 0xD8, 0xC4, 0xE3, 0xF7, 0xDE, 0xA4, 0xE6, /* 0x94-0x97 */ + 0x00, 0x00, 0xAE, 0xC6, 0x00, 0x00, 0xB1, 0xD8, /* 0x98-0x9B */ + 0xB1, 0xD7, 0xD9, 0x7A, 0xD9, 0x7B, 0xB7, 0x72, /* 0x9C-0x9F */ + 0xE1, 0xF5, 0xBA, 0x57, 0xE9, 0xB2, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xA4, 0xE7, 0xA5, 0xB8, 0x00, 0x00, 0xA9, 0xF2, /* 0xA4-0xA7 */ + 0xCC, 0xC2, 0x00, 0x00, 0xCE, 0xE9, 0xAC, 0x48, /* 0xA8-0xAB */ + 0xB1, 0xD9, 0x00, 0x00, 0xD9, 0x7C, 0xB4, 0xB5, /* 0xAC-0xAF */ + 0xB7, 0x73, 0x00, 0x00, 0xE5, 0xC1, 0xE5, 0xC2, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF0, 0xC2, 0x5F, /* 0xB4-0xB7 */ + 0xF8, 0xF0, 0xA4, 0xE8, 0x00, 0x00, 0xCC, 0xC3, /* 0xB8-0xBB */ + 0xA9, 0xF3, 0xAC, 0x49, 0x00, 0x00, 0xCE, 0xEA, /* 0xBC-0xBF */ + 0x00, 0x00, 0xAE, 0xC7, 0xD1, 0xD2, 0xD1, 0xD0, /* 0xC0-0xC3 */ + 0xD1, 0xD1, 0xAE, 0xC8, 0xD1, 0xCF, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xDB, /* 0xC8-0xCB */ + 0xB1, 0xDC, 0xD5, 0xA8, 0xB1, 0xDD, 0xB1, 0xDA, /* 0xCC-0xCF */ + 0xD9, 0x7D, 0x00, 0x00, 0xD9, 0x7E, 0xDD, 0xBE, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0x59, 0xBA, 0x58, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF1, 0xEF, 0xD9, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF2, 0x4A, 0xF2, 0x49, 0xF4, 0x4F, /* 0xDC-0xDF */ + 0x00, 0x00, 0xC9, 0x5E, 0xAC, 0x4A, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA4, 0xE9, 0xA5, 0xB9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xA6, 0xAE, 0xA6, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xA6, 0xAF, 0xA6, 0xB0, 0xC9, 0xEE, 0xC9, 0xED, /* 0xEC-0xEF */ + 0xCA, 0xF8, 0xA7, 0xF2, 0xCA, 0xFB, 0xCA, 0xFA, /* 0xF0-0xF3 */ + 0xCA, 0xF9, 0xCA, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xA9, 0xF4, 0xCC, 0xC9, /* 0xF8-0xFB */ + 0xCC, 0xC5, 0xCC, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_66[512] = { + 0xA9, 0xFB, 0x00, 0x00, 0xA9, 0xF9, 0xCC, 0xCA, /* 0x00-0x03 */ + 0xCC, 0xC6, 0xCC, 0xCD, 0xA9, 0xF8, 0xAA, 0x40, /* 0x04-0x07 */ + 0xCC, 0xC8, 0xCC, 0xC4, 0xA9, 0xFE, 0xCC, 0xCB, /* 0x08-0x0B */ + 0xA9, 
0xF7, 0xCC, 0xCC, 0xA9, 0xFA, 0xA9, 0xFC, /* 0x0C-0x0F */ + 0xCC, 0xD0, 0xCC, 0xCF, 0xCC, 0xC7, 0xA9, 0xF6, /* 0x10-0x13 */ + 0xA9, 0xF5, 0xA9, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xCE, 0xEF, 0xCE, 0xF5, 0x00, 0x00, 0xAC, 0x50, /* 0x1C-0x1F */ + 0xAC, 0x4D, 0xCE, 0xEC, 0xCE, 0xF1, 0x00, 0x00, /* 0x20-0x23 */ + 0xAC, 0x53, 0xAC, 0x4B, 0xCE, 0xF0, 0xAC, 0x4E, /* 0x24-0x27 */ + 0xAC, 0x51, 0x00, 0x00, 0x00, 0x00, 0xCE, 0xF3, /* 0x28-0x2B */ + 0x00, 0x00, 0xAC, 0x4C, 0xCE, 0xF8, 0xAC, 0x4F, /* 0x2C-0x2F */ + 0x00, 0x00, 0xAC, 0x52, 0xCE, 0xED, 0xCE, 0xF2, /* 0x30-0x33 */ + 0xCE, 0xF6, 0xCE, 0xEE, 0xCE, 0xEB, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xCE, 0xF7, 0xCE, 0xF4, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xAE, 0xD0, 0xAE, 0xC9, 0xAE, 0xCC, /* 0x40-0x43 */ + 0x00, 0x00, 0xAE, 0xCF, 0x00, 0x00, 0xD1, 0xD5, /* 0x44-0x47 */ + 0x00, 0x00, 0xAE, 0xCA, 0xD1, 0xD3, 0x00, 0x00, /* 0x48-0x4B */ + 0xAE, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xAE, 0xCB, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD1, 0xD6, 0xAE, 0xCD, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD5, 0xAC, 0xB1, 0xDF, 0xD5, 0xAB, /* 0x58-0x5B */ + 0xD5, 0xAD, 0xB1, 0xDE, 0xB1, 0xE3, 0xD1, 0xD4, /* 0x5C-0x5F */ + 0x00, 0x00, 0xD5, 0xAA, 0xD5, 0xAE, 0x00, 0x00, /* 0x60-0x63 */ + 0xB1, 0xE0, 0xD5, 0xA9, 0xB1, 0xE2, 0x00, 0x00, /* 0x64-0x67 */ + 0xB1, 0xE1, 0x00, 0x00, 0xD9, 0xA7, 0x00, 0x00, /* 0x68-0x6B */ + 0xD9, 0xA2, 0x00, 0x00, 0xB4, 0xB6, 0xB4, 0xBA, /* 0x6C-0x6F */ + 0xB4, 0xB7, 0xD9, 0xA5, 0xD9, 0xA8, 0x00, 0x00, /* 0x70-0x73 */ + 0xB4, 0xB8, 0x00, 0x00, 0xB4, 0xB9, 0xB4, 0xBE, /* 0x74-0x77 */ + 0xDD, 0xC7, 0xD9, 0xA6, 0xB4, 0xBC, 0xD9, 0xA3, /* 0x78-0x7B */ + 0xD9, 0xA1, 0x00, 0x00, 0xB4, 0xBD, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xD9, 0xA4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xB7, 0x79, 0x00, 0x00, 0xDD, 0xBF, 0xB7, 0x76, /* 0x84-0x87 */ + 0xB7, 0x77, 0xB7, 0x75, 0xDD, 0xC4, 0xDD, 0xC3, /* 0x88-0x8B */ + 0xDD, 0xC0, 0xB7, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDD, 0xC2, 0xB4, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xDD, 0xC6, 0xDD, 0xC1, 0xB7, 0x78, 0xB7, 0x74, /* 0x94-0x97 */ + 0xB7, 0x7A, 0xDD, 0xC5, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xBA, 0x5C, 0x00, 0x00, 0xE1, 0xF8, /* 0x9C-0x9F */ + 0xE1, 0xF7, 0xE1, 0xF6, 0xBA, 0x5A, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBA, 0x5B, 0xE5, 0xC5, 0xE5, 0xC8, 0xBC, 0xC8, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0xC7, 0xE5, 0xC9, /* 0xAC-0xAF */ + 0xE5, 0xC4, 0xBC, 0xCA, 0xE5, 0xC6, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xBC, 0xC9, 0xE5, 0xC3, 0x00, 0x00, 0xE5, 0xC7, /* 0xB4-0xB7 */ + 0xBE, 0xE9, 0xBE, 0xE6, 0xE9, 0xBB, 0xE9, 0xBA, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE9, 0xB9, 0xE9, 0xB4, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE9, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xBE, 0xE7, 0x00, 0x00, 0xBE, 0xE4, 0xBE, 0xE8, /* 0xC4-0xC7 */ + 0xE9, 0xB3, 0xBE, 0xE5, 0xE9, 0xB6, 0xE9, 0xB7, /* 0xC8-0xCB */ + 0xE9, 0xBC, 0x00, 0x00, 0x00, 0x00, 0xE9, 0xB8, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xF2, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xC7, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEF, 0xDC, 0xC0, 0xC6, 0xEF, 0xDA, 0xEF, 0xDB, /* 0xD8-0xDB */ + 0xC2, 0x60, 0xC3, 0x6E, 0xF2, 0x4B, 0x00, 0x00, /* 0xDC-0xDF */ + 0xC3, 0x6D, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x51, /* 0xE0-0xE3 
*/ + 0xF4, 0x52, 0x00, 0x00, 0xC4, 0x66, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF4, 0x50, 0xC4, 0xE4, 0x00, 0x00, 0xF7, 0xDF, /* 0xE8-0xEB */ + 0xC5, 0xCE, 0xF8, 0xAA, 0xF8, 0xAB, 0x00, 0x00, /* 0xEC-0xEF */ + 0xA4, 0xEA, 0x00, 0x00, 0xA6, 0xB1, 0xA6, 0xB2, /* 0xF0-0xF3 */ + 0xA7, 0xF3, 0x00, 0x00, 0xCC, 0xD1, 0xAC, 0x54, /* 0xF4-0xF7 */ + 0xAE, 0xD1, 0xB1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0xB0, 0xD2, 0x00, 0x00, 0xB4, 0xBF, 0xB4, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_67[512] = { + 0xB3, 0xCC, 0xD9, 0xA9, 0x00, 0x00, 0xB7, 0x7C, /* 0x00-0x03 */ + 0xE1, 0xFA, 0xE1, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xA4, 0xEB, 0xA6, 0xB3, 0xCC, 0xD2, 0xAA, 0x42, /* 0x08-0x0B */ + 0x00, 0x00, 0xAA, 0x41, 0x00, 0x00, 0xCE, 0xF9, /* 0x0C-0x0F */ + 0xCE, 0xFA, 0x00, 0x00, 0xD1, 0xD7, 0xD1, 0xD8, /* 0x10-0x13 */ + 0xAE, 0xD2, 0xAE, 0xD3, 0x00, 0x00, 0xAE, 0xD4, /* 0x14-0x17 */ + 0xD5, 0xAF, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xE6, /* 0x18-0x1B */ + 0x00, 0x00, 0xB4, 0xC2, 0x00, 0x00, 0xB4, 0xC1, /* 0x1C-0x1F */ + 0xDD, 0xC8, 0xDF, 0x7A, 0xE1, 0xFB, 0xE9, 0xBD, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x61, 0xC4, 0x67, /* 0x24-0x27 */ + 0xA4, 0xEC, 0x00, 0x00, 0xA5, 0xBC, 0xA5, 0xBD, /* 0x28-0x2B */ + 0xA5, 0xBB, 0xA5, 0xBE, 0xA5, 0xBA, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xA6, 0xB6, 0x00, 0x00, 0xC9, 0xF6, /* 0x30-0x33 */ + 0xA6, 0xB5, 0xA6, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xC9, 0xF1, 0xC9, 0xF0, 0xC9, 0xF3, 0xC9, 0xF2, /* 0x38-0x3B */ + 0xC9, 0xF5, 0xA6, 0xB4, 0xC9, 0xEF, 0xC9, 0xF4, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xCA, 0xFD, 0xA7, 0xFD, 0xCA, 0xFE, /* 0x44-0x47 */ + 0xCB, 0x43, 0xA7, 0xFC, 0x00, 0x00, 0xCB, 0x47, /* 0x48-0x4B */ + 0xCB, 0x42, 0xCB, 0x45, 0xA7, 0xF5, 0xA7, 0xF6, /* 0x4C-0x4F */ + 0xA7, 0xF7, 0xA7, 0xF8, 0x00, 0x00, 0xA8, 0x40, /* 0x50-0x53 */ + 0x00, 0x00, 0xCB, 0x41, 0xA7, 0xFA, 0xA8, 0x41, /* 0x54-0x57 */ + 0x00, 0x00, 0xCB, 0x40, 0xCB, 0x46, 0x00, 0x00, /* 0x58-0x5B */ + 0xA7, 0xF9, 0xCB, 0x44, 0xA7, 0xFB, 0xA7, 0xF4, /* 0x5C-0x5F */ + 0xA7, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0x57, 0x00, 0x00, /* 0x68-0x6B */ + 0xCC, 0xD4, 0xAA, 0x43, 0x00, 0x00, 0xAA, 0x4D, /* 0x6C-0x6F */ + 0xAA, 0x4E, 0xAA, 0x46, 0xAA, 0x58, 0xAA, 0x48, /* 0x70-0x73 */ + 0xCC, 0xDC, 0xAA, 0x53, 0xCC, 0xD7, 0xAA, 0x49, /* 0x74-0x77 */ + 0xCC, 0xE6, 0xCC, 0xE7, 0xCC, 0xDF, 0xCC, 0xD8, /* 0x78-0x7B */ + 0xAA, 0x56, 0xCC, 0xE4, 0xAA, 0x51, 0xAA, 0x4F, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xCC, 0xE5, 0x00, 0x00, 0xCC, 0xE3, /* 0x80-0x83 */ + 0xCC, 0xDB, 0xCC, 0xD3, 0xCC, 0xDA, 0xAA, 0x4A, /* 0x84-0x87 */ + 0x00, 0x00, 0xAA, 0x50, 0x00, 0x00, 0xAA, 0x44, /* 0x88-0x8B */ + 0xCC, 0xDE, 0xCC, 0xDD, 0xCC, 0xD5, 0x00, 0x00, /* 0x8C-0x8F */ + 0xAA, 0x52, 0xCC, 0xE1, 0xCC, 0xD6, 0xAA, 0x55, /* 0x90-0x93 */ + 0xCC, 0xE8, 0xAA, 0x45, 0x00, 0x00, 0xAA, 0x4C, /* 0x94-0x97 */ + 0xCC, 0xD9, 0xCC, 0xE2, 0xAA, 0x54, 0x00, 0x00, /* 0x98-0x9B */ + 0xAA, 0x47, 0xAA, 0x4B, 0x00, 0x00, 0xCC, 0xE0, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0x5B, 0xAC, 0x5C, /* 0xAC-0xAF */ + 0xAC, 0x69, 0x00, 0x00, 0xCF, 0x56, 0xCF, 0x4C, /* 0xB0-0xB3 */ + 0xAC, 0x62, 0xCF, 0x4A, 0xAC, 0x5B, 0xCF, 0x45, /* 0xB4-0xB7 */ + 
0xAC, 0x65, 0xCF, 0x52, 0xCE, 0xFE, 0xCF, 0x41, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCF, 0x44, 0xCE, 0xFB, 0xCF, 0x51, 0xCF, 0x61, /* 0xC0-0xC3 */ + 0xAC, 0x60, 0xCF, 0x46, 0xCF, 0x58, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xCE, 0xFD, 0xCF, 0x5F, 0xCF, 0x60, 0xCF, 0x63, /* 0xC8-0xCB */ + 0xCF, 0x5A, 0xCF, 0x4B, 0xCF, 0x53, 0xAC, 0x66, /* 0xCC-0xCF */ + 0xAC, 0x59, 0xAC, 0x61, 0xAC, 0x6D, 0xAC, 0x56, /* 0xD0-0xD3 */ + 0xAC, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xCF, 0x43, 0xAC, 0x6A, 0xAC, 0x63, 0xCF, 0x5D, /* 0xD8-0xDB */ + 0xCF, 0x40, 0xAC, 0x6C, 0xAC, 0x67, 0xCF, 0x49, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0x6B, 0xCF, 0x50, /* 0xE0-0xE3 */ + 0xCF, 0x48, 0xAC, 0x64, 0xCF, 0x5C, 0xCF, 0x54, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xAC, 0x5E, 0xCF, 0x62, 0xCF, 0x47, /* 0xE8-0xEB */ + 0xAC, 0x5A, 0xCF, 0x59, 0xCF, 0x4F, 0xAC, 0x5F, /* 0xEC-0xEF */ + 0xCF, 0x55, 0xAC, 0x57, 0xCE, 0xFC, 0xAC, 0x68, /* 0xF0-0xF3 */ + 0xAE, 0xE3, 0xAC, 0x5D, 0xCF, 0x4E, 0xCF, 0x4D, /* 0xF4-0xF7 */ + 0xCF, 0x42, 0x00, 0x00, 0xCF, 0x5E, 0x00, 0x00, /* 0xF8-0xFB */ + 0xCF, 0x57, 0x00, 0x00, 0x00, 0x00, 0xAC, 0x55, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_68[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xEC, 0xAE, 0xEA, /* 0x10-0x13 */ + 0xD1, 0xED, 0x00, 0x00, 0xD1, 0xE1, 0xAE, 0xDF, /* 0x14-0x17 */ + 0xAE, 0xEB, 0x00, 0x00, 0xD1, 0xDA, 0x00, 0x00, /* 0x18-0x1B */ + 0xD1, 0xE3, 0xD1, 0xEB, 0x00, 0x00, 0xD1, 0xD9, /* 0x1C-0x1F */ + 0xD1, 0xF4, 0xAE, 0xD5, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD1, 0xF3, 0xD1, 0xEE, 0x00, 0x00, /* 0x24-0x27 */ + 0xD1, 0xEF, 0xAE, 0xDD, 0xAE, 0xE8, 0xD1, 0xE5, /* 0x28-0x2B */ + 0x00, 0x00, 0xD1, 0xE6, 0xD1, 0xF0, 0xD1, 0xE7, /* 0x2C-0x2F */ + 0x00, 0x00, 0xD1, 0xE2, 0xD1, 0xDC, 0xD1, 0xDD, /* 0x30-0x33 */ + 0xD1, 0xEA, 0xD1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xAE, 0xD6, 0xAE, 0xDA, 0xD1, 0xF2, 0xD1, 0xDE, /* 0x38-0x3B */ + 0xAE, 0xE6, 0xAE, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xAE, 0xE5, 0xAE, 0xEC, 0xAE, 0xDB, 0xAE, 0xE7, /* 0x40-0x43 */ + 0xD1, 0xE9, 0xAE, 0xE9, 0xAE, 0xD8, 0x00, 0x00, /* 0x44-0x47 */ + 0xAE, 0xD7, 0xD1, 0xDB, 0x00, 0x00, 0xD1, 0xDF, /* 0x48-0x4B */ + 0xAE, 0xE0, 0xD1, 0xF1, 0xD1, 0xE8, 0xD1, 0xE0, /* 0x4C-0x4F */ + 0xAE, 0xE4, 0xAE, 0xE1, 0x00, 0x00, 0xAE, 0xD9, /* 0x50-0x53 */ + 0xAE, 0xDC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC4, /* 0x68-0x6B */ + 0x00, 0x00, 0xD5, 0xB4, 0xD5, 0xB5, 0xD5, 0xB9, /* 0x6C-0x6F */ + 0x00, 0x00, 0xD5, 0xC8, 0xD5, 0xC5, 0x00, 0x00, /* 0x70-0x73 */ + 0xD5, 0xBE, 0xD5, 0xBD, 0xB1, 0xED, 0xD5, 0xC1, /* 0x74-0x77 */ + 0xD5, 0xD0, 0xD5, 0xB0, 0x00, 0x00, 0xD5, 0xD1, /* 0x78-0x7B */ + 0xD5, 0xC3, 0xD5, 0xD5, 0xD5, 0xC9, 0xB1, 0xEC, /* 0x7C-0x7F */ + + 0xD5, 0xC7, 0xB1, 0xE7, 0xB1, 0xFC, 0xB1, 0xF2, /* 0x80-0x83 */ + 0x00, 0x00, 0xB1, 0xF6, 0xB1, 0xF5, 0xD5, 0xB1, /* 0x84-0x87 */ + 0x00, 0x00, 0xD5, 0xCE, 0xD5, 0xD4, 0xD5, 0xCC, /* 0x88-0x8B */ + 
0xD5, 0xD3, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xC0, /* 0x8C-0x8F */ + 0xD5, 0xB2, 0xD5, 0xD2, 0xD5, 0xC2, 0xB1, 0xEA, /* 0x90-0x93 */ + 0xB1, 0xF7, 0x00, 0x00, 0xD5, 0xCB, 0xB1, 0xF0, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD5, 0xCA, /* 0x98-0x9B */ + 0xD5, 0xB3, 0xB1, 0xF8, 0x00, 0x00, 0xB1, 0xFA, /* 0x9C-0x9F */ + 0xD5, 0xCD, 0xB1, 0xFB, 0xB1, 0xE9, 0xD5, 0xBA, /* 0xA0-0xA3 */ + 0xD5, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xB1, 0xEF, /* 0xA4-0xA7 */ + 0xB1, 0xF9, 0xD5, 0xBC, 0xD5, 0xC6, 0xD5, 0xB7, /* 0xA8-0xAB */ + 0xD5, 0xBB, 0xB1, 0xF4, 0xD5, 0xB6, 0xB1, 0xE8, /* 0xAC-0xAF */ + 0xB1, 0xF1, 0xB1, 0xEE, 0xD5, 0xBF, 0xAE, 0xDE, /* 0xB0-0xB3 */ + 0xD9, 0xC0, 0xB1, 0xEB, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB1, 0xF3, 0x00, 0x00, 0xD9, 0xC3, 0xD9, 0xD9, /* 0xC4-0xC7 */ + 0xD9, 0xCE, 0xB4, 0xD6, 0x00, 0x00, 0xB4, 0xD1, /* 0xC8-0xCB */ + 0xD9, 0xBD, 0xB4, 0xD2, 0xD9, 0xCD, 0x00, 0x00, /* 0xCC-0xCF */ + 0xD9, 0xC6, 0xD9, 0xD3, 0xB4, 0xCE, 0xD9, 0xAB, /* 0xD0-0xD3 */ + 0xD9, 0xD5, 0xB4, 0xC4, 0xD9, 0xB3, 0xB4, 0xC7, /* 0xD4-0xD7 */ + 0xB4, 0xC6, 0x00, 0x00, 0xB4, 0xD7, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD9, 0xAD, 0xD9, 0xCF, 0xD9, 0xD0, 0xB4, 0xC9, /* 0xDC-0xDF */ + 0xB4, 0xC5, 0xD9, 0xBB, 0x00, 0x00, 0xB4, 0xD0, /* 0xE0-0xE3 */ + 0xD9, 0xB6, 0x00, 0x00, 0xD9, 0xD1, 0xB4, 0xCC, /* 0xE4-0xE7 */ + 0xD9, 0xC9, 0xD9, 0xD6, 0xD9, 0xB0, 0xD9, 0xB5, /* 0xE8-0xEB */ + 0xD9, 0xAF, 0x00, 0x00, 0xB4, 0xCB, 0xD9, 0xC2, /* 0xEC-0xEF */ + 0xDD, 0xDE, 0xD9, 0xB1, 0xB4, 0xCF, 0xD9, 0xBA, /* 0xF0-0xF3 */ + 0xD9, 0xD2, 0xB4, 0xCA, 0xD9, 0xB7, 0xD9, 0xB4, /* 0xF4-0xF7 */ + 0xD9, 0xC5, 0xB4, 0xCD, 0xB4, 0xC3, 0xB4, 0xD9, /* 0xF8-0xFB */ + 0xD9, 0xC8, 0xD9, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_69[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD9, 0xAC, 0xB4, 0xC8, 0xD9, 0xD4, 0xD9, 0xBC, /* 0x04-0x07 */ + 0xD9, 0xBE, 0x00, 0x00, 0xD9, 0xCB, 0xD9, 0xCA, /* 0x08-0x0B */ + 0xD9, 0xAA, 0xB4, 0xD3, 0xB4, 0xD5, 0xD9, 0xB2, /* 0x0C-0x0F */ + 0xD9, 0xB9, 0xD9, 0xC1, 0xB4, 0xD4, 0xD9, 0xB8, /* 0x10-0x13 */ + 0xD9, 0xC4, 0xD9, 0xD7, 0x00, 0x00, 0xD9, 0xCC, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD9, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xD9, 0xAE, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDD, 0xF2, /* 0x2C-0x2F */ + 0xB7, 0xA6, 0x00, 0x00, 0xDD, 0xF0, 0xDD, 0xDB, /* 0x30-0x33 */ + 0xDD, 0xE0, 0xDD, 0xD9, 0x00, 0x00, 0xDD, 0xEC, /* 0x34-0x37 */ + 0xDD, 0xCB, 0xDD, 0xD2, 0x00, 0x00, 0xDD, 0xEA, /* 0x38-0x3B */ + 0xDD, 0xF4, 0xDD, 0xDC, 0x00, 0x00, 0xDD, 0xCF, /* 0x3C-0x3F */ + 0xDD, 0xE2, 0xDD, 0xE7, 0xDD, 0xD3, 0x00, 0x00, /* 0x40-0x43 */ + 0xDD, 0xE4, 0xDD, 0xD0, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xDD, 0xD7, 0xDD, 0xD8, 0xB7, 0xA8, 0xDD, 0xEB, /* 0x48-0x4B */ + 0xDD, 0xE9, 0x00, 0x00, 0xDD, 0xCC, 0xDD, 0xEE, /* 0x4C-0x4F */ + 0x00, 0x00, 0xDD, 0xEF, 0xDD, 0xF1, 0xB7, 0xAC, /* 0x50-0x53 */ + 0xB7, 0xA4, 0x00, 0x00, 0xD5, 0xB8, 0xDD, 0xD4, /* 0x54-0x57 */ + 0xDD, 0xE6, 0xDD, 0xD5, 0xB7, 0xA1, 0xB7, 0xB1, /* 0x58-0x5B */ + 0xDD, 0xED, 0xB7, 0xAF, 0xB7, 0xAB, 0xDD, 0xCA, /* 0x5C-0x5F */ + 0xB7, 
0xA3, 0x00, 0x00, 0xDD, 0xCD, 0xB7, 0xB0, /* 0x60-0x63 */ + 0x00, 0x00, 0xDD, 0xDD, 0xDD, 0xC9, 0x00, 0x00, /* 0x64-0x67 */ + 0xB7, 0xA9, 0xDD, 0xE1, 0xDD, 0xD1, 0xB7, 0xAA, /* 0x68-0x6B */ + 0xDD, 0xDA, 0xB7, 0x7E, 0xB4, 0xD8, 0xDD, 0xE3, /* 0x6C-0x6F */ + 0xD9, 0xBF, 0xDD, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xDD, 0xE8, 0xB7, 0xA5, 0xDD, 0xE5, 0xB7, 0xA2, /* 0x74-0x77 */ + 0xDD, 0xDF, 0xB7, 0xAD, 0xDD, 0xD6, 0xDD, 0xF3, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xA7, 0xDE, 0xC6, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xAE, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xE2, 0x4A, 0xE2, 0x48, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE2, 0x5E, 0xE2, 0x46, 0x00, 0x00, 0xE2, 0x58, /* 0x90-0x93 */ + 0xB7, 0x7D, 0xBA, 0x5F, 0xE2, 0x42, 0xE2, 0x5D, /* 0x94-0x97 */ + 0x00, 0x00, 0xE2, 0x47, 0xE2, 0x55, 0xBA, 0x64, /* 0x98-0x9B */ + 0xBA, 0x5D, 0x00, 0x00, 0xE2, 0x5B, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE2, 0x40, 0xE2, 0x5A, 0x00, 0x00, 0xBA, 0x6F, /* 0xA0-0xA3 */ + 0xE2, 0x51, 0xE2, 0x61, 0xBA, 0x6D, 0xE2, 0x49, /* 0xA4-0xA7 */ + 0xBA, 0x5E, 0xE2, 0x4B, 0xE2, 0x59, 0xBA, 0x67, /* 0xA8-0xAB */ + 0xE2, 0x44, 0xBA, 0x6B, 0xBA, 0x61, 0xE2, 0x4D, /* 0xAC-0xAF */ + 0xE2, 0x43, 0xE1, 0xFC, 0x00, 0x00, 0xE2, 0x57, /* 0xB0-0xB3 */ + 0xBA, 0x68, 0xE2, 0x60, 0xE1, 0xFD, 0xBA, 0x65, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE2, 0x53, 0x00, 0x00, 0xBA, 0x66, /* 0xB8-0xBB */ + 0xE2, 0x45, 0xE2, 0x50, 0xE2, 0x4C, 0xE2, 0x4E, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBA, 0x60, 0xE2, 0x5F, 0xBA, 0x6E, /* 0xC0-0xC3 */ + 0xE2, 0x4F, 0x00, 0x00, 0xE2, 0x62, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE1, 0xFE, 0xE2, 0x54, 0xBA, 0x63, /* 0xC8-0xCB */ + 0xBA, 0x6C, 0xBA, 0x6A, 0xE2, 0x41, 0xE2, 0x56, /* 0xCC-0xCF */ + 0xBA, 0x69, 0x00, 0x00, 0x00, 0x00, 0xBA, 0x62, /* 0xD0-0xD3 */ + 0xE2, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE2, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xE5, 0xD5, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xE5, 0xD1, 0xE5, 0xCD, 0xE5, 0xE1, 0xE5, 0xDE, /* 0xE4-0xE7 */ + 0xBC, 0xCD, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE5, /* 0xE8-0xEB */ + 0xE5, 0xD4, 0xBC, 0xD8, 0xE5, 0xDB, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE5, 0xD0, 0xE5, 0xDA, 0xBC, 0xD5, /* 0xF0-0xF3 */ + 0xE5, 0xEE, 0x00, 0x00, 0xE5, 0xEB, 0xE5, 0xDD, /* 0xF4-0xF7 */ + 0xE5, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xE5, 0xE2, /* 0xF8-0xFB */ + 0xE5, 0xE4, 0xBC, 0xD1, 0xE5, 0xD8, 0xE5, 0xD3, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6A[512] = { + 0xE5, 0xCA, 0xBC, 0xCE, 0xBC, 0xD6, 0x00, 0x00, /* 0x00-0x03 */ + 0xE5, 0xE7, 0xBC, 0xD7, 0xE5, 0xCB, 0xE5, 0xED, /* 0x04-0x07 */ + 0xE5, 0xE0, 0xE5, 0xE6, 0xBC, 0xD4, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE5, 0xE3, 0x00, 0x00, 0xE5, 0xEA, /* 0x0C-0x0F */ + 0x00, 0x00, 0xBC, 0xD9, 0x00, 0x00, 0xBC, 0xD3, /* 0x10-0x13 */ + 0xE5, 0xDC, 0xE5, 0xCF, 0xE5, 0xEF, 0xE5, 0xCC, /* 0x14-0x17 */ + 0xE5, 0xE8, 0xBC, 0xD0, 0x00, 0x00, 0xE5, 0xD6, /* 0x18-0x1B */ + 0x00, 0x00, 0xE5, 0xD7, 0xBC, 0xCF, 0xBC, 0xCC, /* 0x1C-0x1F */ + 0xE5, 0xD2, 0xBC, 0xD2, 0x00, 0x00, 0xBC, 0xCB, /* 0x20-0x23 */ + 0x00, 0x00, 0xE5, 0xE9, 0xE5, 0xEC, 0xE5, 0xD9, /* 0x24-0x27 */ + 0xE9, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC2, 0x00, 0x00, /* 0x30-0x33 */ + 0xE9, 
0xBE, 0xBE, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xBE, 0xEB, 0xBE, 0xF0, 0xBE, 0xEC, 0xE9, 0xCC, /* 0x38-0x3B */ + 0xE9, 0xD7, 0xBE, 0xEA, 0xE9, 0xC4, 0xE9, 0xCD, /* 0x3C-0x3F */ + 0xE5, 0xDF, 0xE9, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xBE, 0xF1, 0x00, 0x00, 0xE9, 0xDD, 0xBE, 0xF5, /* 0x44-0x47 */ + 0xBE, 0xF8, 0xE9, 0xC0, 0x00, 0x00, 0xBE, 0xF4, /* 0x48-0x4B */ + 0x00, 0x00, 0xE9, 0xDB, 0xE9, 0xDC, 0xE9, 0xD2, /* 0x4C-0x4F */ + 0xE9, 0xD1, 0xE9, 0xC9, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE9, 0xD3, 0xE9, 0xDA, 0xE9, 0xD9, 0x00, 0x00, /* 0x54-0x57 */ + 0xBE, 0xEF, 0xBE, 0xED, 0xE9, 0xCB, 0xE9, 0xC8, /* 0x58-0x5B */ + 0x00, 0x00, 0xE9, 0xC5, 0xE9, 0xD8, 0xBE, 0xF7, /* 0x5C-0x5F */ + 0xE9, 0xD6, 0xBE, 0xF3, 0xBE, 0xF2, 0x00, 0x00, /* 0x60-0x63 */ + 0xE9, 0xD0, 0x00, 0x00, 0xE9, 0xBF, 0xE9, 0xC1, /* 0x64-0x67 */ + 0xE9, 0xC3, 0xE9, 0xD5, 0xE9, 0xCF, 0xBE, 0xEE, /* 0x68-0x6B */ + 0x00, 0x00, 0xE9, 0xC6, 0x00, 0x00, 0xE9, 0xD4, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xC7, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xCF, 0xED, 0x45, /* 0x7C-0x7F */ + + 0xC0, 0xC8, 0xEC, 0xF5, 0x00, 0x00, 0xED, 0x41, /* 0x80-0x83 */ + 0xC0, 0xCA, 0xED, 0x48, 0x00, 0x00, 0xEC, 0xFC, /* 0x84-0x87 */ + 0x00, 0x00, 0xEC, 0xF7, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xED, 0x49, 0xEC, 0xF3, 0xEC, 0xFE, 0x00, 0x00, /* 0x8C-0x8F */ + 0xC0, 0xD1, 0xED, 0x44, 0xED, 0x4A, 0xEC, 0xFD, /* 0x90-0x93 */ + 0xC0, 0xC9, 0xED, 0x40, 0xEC, 0xF4, 0xC0, 0xD0, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0x47, 0xEC, 0xF9, /* 0x98-0x9B */ + 0xC0, 0xCC, 0x00, 0x00, 0xEC, 0xFB, 0xEC, 0xF8, /* 0x9C-0x9F */ + 0xC0, 0xD2, 0xEC, 0xFA, 0xC0, 0xCB, 0xC0, 0xCE, /* 0xA0-0xA3 */ + 0xED, 0x43, 0xEC, 0xF6, 0xED, 0x46, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xED, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xC2, 0x63, 0xEF, 0xE7, 0xC2, 0x68, 0xC2, 0x69, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x62, /* 0xB0-0xB3 */ + 0xEF, 0xE6, 0x00, 0x00, 0xEF, 0xE3, 0xEF, 0xE4, /* 0xB4-0xB7 */ + 0xC2, 0x66, 0xEF, 0xDE, 0xEF, 0xE2, 0xC2, 0x65, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEF, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x67, 0xC2, 0x64, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEF, 0xDD, 0xEF, 0xE1, 0xEF, 0xE5, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0x51, /* 0xC8-0xCB */ + 0xF2, 0x4E, 0xF2, 0x57, 0x00, 0x00, 0xF2, 0x56, /* 0xCC-0xCF */ + 0xF2, 0x54, 0xF2, 0x4F, 0x00, 0x00, 0xC3, 0x72, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF2, 0x50, 0xC3, 0x71, 0xC0, 0xCD, /* 0xD8-0xDB */ + 0xF2, 0x53, 0xC3, 0x70, 0xF2, 0x58, 0xF2, 0x52, /* 0xDC-0xDF */ + 0xF2, 0x4D, 0xEF, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xC3, 0x6F, 0x00, 0x00, 0xF2, 0x4C, /* 0xE4-0xE7 */ + 0xF4, 0x56, 0x00, 0x00, 0xF4, 0x55, 0xF2, 0x55, /* 0xE8-0xEB */ + 0xC4, 0x68, 0x00, 0x00, 0xF4, 0x59, 0xF4, 0x5A, /* 0xEC-0xEF */ + 0xF4, 0x54, 0xF4, 0x58, 0x00, 0x00, 0xF4, 0x53, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xF5, 0xD1, 0xF4, 0x57, 0xC4, 0xE7, 0xC4, 0xE5, /* 0xF8-0xFB */ + 0xF5, 0xCF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6B[512] = { + 0xF5, 0xD2, 0x00, 0x00, 0xF5, 0xCE, 0xF5, 0xD0, /* 0x00-0x03 */ + 0xC4, 0xE6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xF6, 
0xE5, 0xF6, 0xE6, 0xC5, 0x76, 0xF6, 0xE4, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE2, /* 0x0C-0x0F */ + 0xC5, 0xCF, 0xF7, 0xE0, 0xF7, 0xE1, 0xF8, 0xAC, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xC6, 0x56, 0xF8, 0xF3, /* 0x14-0x17 */ + 0xF8, 0xF1, 0xF8, 0xF2, 0xF8, 0xF4, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xBB, 0x00, 0x00, /* 0x1C-0x1F */ + 0xA4, 0xED, 0xA6, 0xB8, 0x00, 0x00, 0xAA, 0x59, /* 0x20-0x23 */ + 0x00, 0x00, 0xCC, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCF, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0xD1, 0xF5, 0xD1, 0xF7, 0x00, 0x00, 0xD1, 0xF6, /* 0x2C-0x2F */ + 0x00, 0x00, 0xD1, 0xF8, 0xB1, 0xFD, 0xD5, 0xD7, /* 0x30-0x33 */ + 0xD1, 0xF9, 0x00, 0x00, 0xD5, 0xD6, 0xD5, 0xD8, /* 0x34-0x37 */ + 0xD5, 0xD9, 0xD9, 0xDA, 0xB4, 0xDB, 0xD9, 0xDB, /* 0x38-0x3B */ + 0xD9, 0xDD, 0xB4, 0xDC, 0xB4, 0xDA, 0xD9, 0xDC, /* 0x3C-0x3F */ + 0x00, 0x00, 0xDD, 0xFA, 0xDD, 0xF8, 0xDD, 0xF7, /* 0x40-0x43 */ + 0x00, 0x00, 0xDD, 0xF6, 0xDD, 0xF5, 0xB7, 0xB2, /* 0x44-0x47 */ + 0xDD, 0xF9, 0xBA, 0x70, 0xE2, 0x63, 0xE2, 0x65, /* 0x48-0x4B */ + 0xBA, 0x71, 0xE2, 0x64, 0xBC, 0xDB, 0x00, 0x00, /* 0x4C-0x4F */ + 0xBC, 0xDA, 0xE5, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xE9, 0xDF, 0xE9, 0xDE, 0xE9, 0xE0, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xBE, 0xF9, 0x00, 0x00, 0xED, 0x4B, /* 0x58-0x5B */ + 0xC0, 0xD3, 0x00, 0x00, 0xEF, 0xE8, 0xC2, 0x6A, /* 0x5C-0x5F */ + 0xF2, 0x59, 0xC5, 0x77, 0xA4, 0xEE, 0xA5, 0xBF, /* 0x60-0x63 */ + 0xA6, 0xB9, 0xA8, 0x42, 0xAA, 0x5A, 0xAA, 0x5B, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0x6E, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xD1, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xB3, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD1, 0xBE, 0xFA, /* 0x74-0x77 */ + 0xC2, 0x6B, 0xA4, 0xEF, 0x00, 0x00, 0xA6, 0xBA, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xEB, 0xAA, 0x5C, /* 0x7C-0x7F */ + + 0xCC, 0xEA, 0x00, 0x00, 0xCF, 0x65, 0xAC, 0x6F, /* 0x80-0x83 */ + 0xCF, 0x66, 0x00, 0x00, 0xAC, 0x70, 0x00, 0x00, /* 0x84-0x87 */ + 0xD1, 0xFC, 0xAE, 0xEE, 0xAE, 0xED, 0x00, 0x00, /* 0x88-0x8B */ + 0xD5, 0xDE, 0xD5, 0xDC, 0xD5, 0xDD, 0xD5, 0xDB, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD5, 0xDA, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xD9, 0xDE, 0xD9, 0xE1, 0xB4, 0xDE, 0xD9, 0xDF, /* 0x94-0x97 */ + 0xB4, 0xDD, 0xD9, 0xE0, 0x00, 0x00, 0xDD, 0xFB, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x66, 0xE2, 0x67, /* 0x9C-0x9F */ + 0xE2, 0x68, 0x00, 0x00, 0xE5, 0xF3, 0xE5, 0xF2, /* 0xA0-0xA3 */ + 0xBC, 0xDC, 0xE5, 0xF1, 0xE5, 0xF4, 0xE9, 0xE1, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xE2, 0xE9, 0xE3, /* 0xA8-0xAB */ + 0x00, 0x00, 0xED, 0x4C, 0xC0, 0xD4, 0xC2, 0x6C, /* 0xAC-0xAF */ + 0xF2, 0x5A, 0x00, 0x00, 0xC4, 0xE8, 0xC9, 0x5F, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xAC, 0x71, 0xCF, 0x67, 0xAE, 0xEF, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xB1, 0xFE, 0x00, 0x00, /* 0xB8-0xBB */ + 0xB4, 0xDF, 0xD9, 0xE2, 0x00, 0x00, 0xB7, 0xB5, /* 0xBC-0xBF */ + 0xB7, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xE2, 0x69, /* 0xC0-0xC3 */ + 0xE2, 0x6A, 0xBC, 0xDD, 0xBC, 0xDE, 0xE9, 0xE5, /* 0xC4-0xC7 */ + 0xE9, 0xE4, 0xEF, 0xE9, 0xF7, 0xE3, 0xA4, 0xF0, /* 0xC8-0xCB */ + 0xC9, 0x60, 0xA5, 0xC0, 0x00, 0x00, 0xA8, 0x43, /* 0xCC-0xCF */ + 0xCB, 0x48, 0x00, 0x00, 0xAC, 0x72, 0xB7, 0xB6, /* 0xD0-0xD3 */ + 0xA4, 0xF1, 0x00, 0x00, 0xCF, 0x68, 0xAC, 0x73, /* 0xD4-0xD7 */ + 0xCF, 0x69, 0x00, 0x00, 0xC0, 0xD5, 0xA4, 0xF2, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0xCC, 0xEC, 0x00, 0x00, /* 0xDC-0xDF 
*/ + 0xCF, 0x6A, 0x00, 0x00, 0xD2, 0x42, 0xD2, 0x41, /* 0xE0-0xE3 */ + 0xD1, 0xFE, 0x00, 0x00, 0xD1, 0xFD, 0xD2, 0x43, /* 0xE4-0xE7 */ + 0xD2, 0x40, 0x00, 0x00, 0x00, 0x00, 0xB2, 0x40, /* 0xE8-0xEB */ + 0xB2, 0x41, 0x00, 0x00, 0x00, 0x00, 0xB4, 0xE0, /* 0xEC-0xEF */ + 0xD9, 0xE3, 0x00, 0x00, 0xD9, 0xE4, 0xD9, 0xE5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDE, 0x41, /* 0xF4-0xF7 */ + 0xDE, 0x42, 0xDE, 0x40, 0x00, 0x00, 0xDD, 0xFD, /* 0xF8-0xFB */ + 0xDD, 0xFE, 0xB7, 0xB7, 0xE2, 0x6B, 0xE5, 0xF7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6C[512] = { + 0xE5, 0xF6, 0xE5, 0xF5, 0xE5, 0xF8, 0xE9, 0xE7, /* 0x00-0x03 */ + 0xE9, 0xE6, 0xBE, 0xFB, 0xE9, 0xE8, 0x00, 0x00, /* 0x04-0x07 */ + 0xC0, 0xD6, 0xED, 0x4D, 0x00, 0x00, 0xEF, 0xEA, /* 0x08-0x0B */ + 0xF2, 0x5B, 0xF6, 0xE7, 0x00, 0x00, 0xA4, 0xF3, /* 0x0C-0x0F */ + 0xA5, 0xC2, 0xA5, 0xC1, 0x00, 0x00, 0xAA, 0x5D, /* 0x10-0x13 */ + 0xC9, 0x61, 0xC9, 0x7E, 0xA6, 0xBB, 0x00, 0x00, /* 0x14-0x17 */ + 0xC9, 0xF7, 0xCB, 0x49, 0xCB, 0x4A, 0xAA, 0x5E, /* 0x18-0x1B */ + 0x00, 0x00, 0xCC, 0xED, 0x00, 0x00, 0xAC, 0x74, /* 0x1C-0x1F */ + 0xCF, 0x6B, 0xCF, 0x6C, 0x00, 0x00, 0xAE, 0xF0, /* 0x20-0x23 */ + 0xAE, 0xF4, 0xD2, 0x44, 0xAE, 0xF3, 0xAE, 0xF1, /* 0x24-0x27 */ + 0xAE, 0xF2, 0x00, 0x00, 0xD5, 0xDF, 0xB2, 0x42, /* 0x28-0x2B */ + 0xB4, 0xE3, 0x00, 0x00, 0xB4, 0xE1, 0xB4, 0xE2, /* 0x2C-0x2F */ + 0xD9, 0xE6, 0x00, 0x00, 0x00, 0x00, 0xBA, 0x72, /* 0x30-0x33 */ + 0xA4, 0xF4, 0x00, 0x00, 0xC9, 0xA1, 0x00, 0x00, /* 0x34-0x37 */ + 0xA5, 0xC3, 0x00, 0x00, 0x00, 0x00, 0xC9, 0xA4, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xA5, 0xC6, 0xC9, 0xA3, /* 0x3C-0x3F */ + 0xA5, 0xC5, 0xA5, 0xC4, 0xA8, 0x44, 0xC9, 0xA2, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xF8, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xC9, 0xFC, 0xC9, 0xFE, /* 0x48-0x4B */ + 0xCA, 0x40, 0xA6, 0xC5, 0xA6, 0xC6, 0xC9, 0xFB, /* 0x4C-0x4F */ + 0xA6, 0xC1, 0x00, 0x00, 0xC9, 0xF9, 0x00, 0x00, /* 0x50-0x53 */ + 0xC9, 0xFD, 0xA6, 0xC2, 0x00, 0x00, 0xA6, 0xBD, /* 0x54-0x57 */ + 0x00, 0x00, 0xA6, 0xBE, 0x00, 0x00, 0xA6, 0xC4, /* 0x58-0x5B */ + 0xC9, 0xFA, 0xA6, 0xBC, 0xA8, 0x45, 0xA6, 0xBF, /* 0x5C-0x5F */ + 0xA6, 0xC0, 0xA6, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xCB, 0x5B, 0xCB, 0x59, 0xCB, 0x4C, /* 0x64-0x67 */ + 0xA8, 0x51, 0xCB, 0x53, 0xA8, 0x4C, 0xCB, 0x4D, /* 0x68-0x6B */ + 0x00, 0x00, 0xCB, 0x55, 0x00, 0x00, 0xCB, 0x52, /* 0x6C-0x6F */ + 0xA8, 0x4F, 0xCB, 0x51, 0xA8, 0x56, 0xCB, 0x5A, /* 0x70-0x73 */ + 0xA8, 0x58, 0x00, 0x00, 0xA8, 0x5A, 0x00, 0x00, /* 0x74-0x77 */ + 0xCB, 0x4B, 0x00, 0x00, 0xA8, 0x4D, 0xCB, 0x5C, /* 0x78-0x7B */ + 0x00, 0x00, 0xA8, 0x54, 0xA8, 0x57, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xCD, 0x45, 0xA8, 0x47, 0xA8, 0x5E, 0xA8, 0x55, /* 0x80-0x83 */ + 0xCB, 0x4E, 0xA8, 0x4A, 0xA8, 0x59, 0xCB, 0x56, /* 0x84-0x87 */ + 0xA8, 0x48, 0xA8, 0x49, 0xCD, 0x43, 0xCB, 0x4F, /* 0x88-0x8B */ + 0xA8, 0x50, 0xA8, 0x5B, 0xCB, 0x5D, 0xCB, 0x50, /* 0x8C-0x8F */ + 0xA8, 0x4E, 0x00, 0x00, 0xA8, 0x53, 0xCC, 0xEE, /* 0x90-0x93 */ + 0xA8, 0x5C, 0xCB, 0x57, 0xA8, 0x52, 0x00, 0x00, /* 0x94-0x97 */ + 0xA8, 0x5D, 0xA8, 0x46, 0xCB, 0x54, 0xA8, 0x4B, /* 0x98-0x9B */ + 0xCB, 0x58, 0xCD, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0x6A, /* 0xA8-0xAB */ + 0xAA, 0x7A, 0xCC, 0xF5, 0xAA, 0x71, 0x00, 0x00, /* 0xAC-0xAF */ + 0xCD, 0x4B, 0xAA, 0x62, 0x00, 0x00, 0xAA, 0x65, /* 0xB0-0xB3 */ + 
0xCD, 0x42, 0x00, 0x00, 0xCC, 0xF3, 0xCC, 0xF7, /* 0xB4-0xB7 */ + 0xAA, 0x6D, 0xAA, 0x6F, 0xCC, 0xFA, 0xAA, 0x76, /* 0xB8-0xBB */ + 0xAA, 0x68, 0xAA, 0x66, 0xAA, 0x67, 0xAA, 0x75, /* 0xBC-0xBF */ + 0xCD, 0x47, 0xAA, 0x70, 0xCC, 0xF9, 0xCC, 0xFB, /* 0xC0-0xC3 */ + 0xAA, 0x6E, 0xAA, 0x73, 0xCC, 0xFC, 0xCD, 0x4A, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xAC, 0x75, 0xAA, 0x79, 0x00, 0x00, /* 0xC8-0xCB */ + 0xAA, 0x63, 0xCD, 0x49, 0x00, 0x00, 0xCD, 0x4D, /* 0xCC-0xCF */ + 0xCC, 0xF8, 0xCD, 0x4F, 0xCD, 0x40, 0xAA, 0x6C, /* 0xD0-0xD3 */ + 0xCC, 0xF4, 0xAA, 0x6B, 0xAA, 0x7D, 0xAA, 0x72, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xCC, 0xF2, 0xCF, 0x75, 0xAA, 0x78, /* 0xD8-0xDB */ + 0xAA, 0x7C, 0xCD, 0x41, 0xCD, 0x46, 0x00, 0x00, /* 0xDC-0xDF */ + 0xAA, 0x7E, 0xAA, 0x77, 0xAA, 0x69, 0xAA, 0x5F, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xAA, 0x64, 0x00, 0x00, 0xCC, 0xF6, /* 0xE4-0xE7 */ + 0xAA, 0x60, 0xCD, 0x4E, 0x00, 0x00, 0xCC, 0xF0, /* 0xE8-0xEB */ + 0xCC, 0xEF, 0xCC, 0xFD, 0xCC, 0xF1, 0xAA, 0x7B, /* 0xEC-0xEF */ + 0xAE, 0xF5, 0xAA, 0x74, 0xCC, 0xFE, 0xAA, 0x61, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xAC, 0xA6, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xCD, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_6D[512] = { + 0xCF, 0x7C, 0xCF, 0xA1, 0x00, 0x00, 0xCF, 0xA4, /* 0x00-0x03 */ + 0xCF, 0x77, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xA7, /* 0x04-0x07 */ + 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0x74, 0xAC, 0x76, /* 0x08-0x0B */ + 0xAC, 0x7B, 0xD2, 0x49, 0xAC, 0xAD, 0xCF, 0xA5, /* 0x0C-0x0F */ + 0xCF, 0xAD, 0xCF, 0x7B, 0xCF, 0x73, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0x64, 0xAC, 0x7E, /* 0x14-0x17 */ + 0xCF, 0xA2, 0xCF, 0x78, 0xCF, 0x7A, 0xAC, 0xA5, /* 0x18-0x1B */ + 0x00, 0x00, 0xCF, 0x7D, 0xAC, 0x7D, 0xCF, 0x70, /* 0x1C-0x1F */ + 0xCF, 0xA8, 0x00, 0x00, 0xCF, 0xAB, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xAC, 0x7A, 0x00, 0x00, 0xAC, 0xA8, /* 0x24-0x27 */ + 0xCF, 0x6D, 0xAC, 0xAA, 0xAC, 0x78, 0xAC, 0xAE, /* 0x28-0x2B */ + 0xCF, 0xA9, 0xCF, 0x6F, 0xAC, 0xAB, 0xD2, 0x5E, /* 0x2C-0x2F */ + 0xCD, 0x48, 0xAC, 0x7C, 0xAC, 0x77, 0xCF, 0x76, /* 0x30-0x33 */ + 0xCF, 0x6E, 0xAC, 0xAC, 0xAC, 0xA4, 0xCF, 0xA3, /* 0x34-0x37 */ + 0xAC, 0xA9, 0xAC, 0xA7, 0xCF, 0x79, 0xAC, 0xA1, /* 0x38-0x3B */ + 0xCF, 0x71, 0xAC, 0xA2, 0xAC, 0xA3, 0xCF, 0x72, /* 0x3C-0x3F */ + 0xCF, 0xA6, 0xAC, 0x79, 0xCF, 0x7E, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD2, 0x4C, 0xAE, 0xFD, 0xAF, 0x43, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0x55, 0xD2, 0x5B, /* 0x5C-0x5F */ + 0xD2, 0x57, 0xD2, 0x4A, 0xD2, 0x4D, 0xD2, 0x46, /* 0x60-0x63 */ + 0xD2, 0x47, 0xAF, 0x4A, 0xAE, 0xFA, 0xD2, 0x56, /* 0x64-0x67 */ + 0xD2, 0x5F, 0xAF, 0x45, 0xAE, 0xF6, 0x00, 0x00, /* 0x68-0x6B */ + 0xAF, 0x40, 0xD2, 0x4E, 0xAF, 0x42, 0xD2, 0x4F, /* 0x6C-0x6F */ + 0xD2, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xAF, 0x44, 0xD2, 0x68, 0xD2, 0x48, 0xAE, 0xFC, /* 0x74-0x77 */ + 0xAE, 0xFB, 0xAF, 0x48, 0xD2, 0x45, 0xD2, 0x66, /* 0x78-0x7B */ + 0xD2, 0x5A, 0xD2, 0x67, 0xD2, 0x61, 0xD2, 0x53, /* 0x7C-0x7F */ + + 0xD2, 0x62, 0x00, 0x00, 0xD2, 0x5C, 0xD2, 0x65, /* 0x80-0x83 */ + 0xD2, 0x63, 0xAF, 0x49, 0xD2, 0x54, 0xAE, 0xF9, /* 0x84-0x87 */ + 0xAE, 0xF8, 0xAF, 0x41, 0xAF, 0x47, 0xD2, 0x60, /* 0x88-0x8B */ + 
0xAF, 0x46, 0xD2, 0x51, 0xB2, 0x43, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD2, 0x69, 0xD2, 0x50, 0xD2, 0x4B, 0xAE, 0xFE, /* 0x90-0x93 */ + 0xAF, 0x4B, 0xAE, 0xF7, 0x00, 0x00, 0xD2, 0x58, /* 0x94-0x97 */ + 0xD2, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0x65, 0xD5, 0xE1, /* 0xA8-0xAB */ + 0xD5, 0xE5, 0x00, 0x00, 0xB2, 0x52, 0xB2, 0x50, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0x47, 0xD5, 0xE3, /* 0xB0-0xB3 */ + 0xD5, 0xE2, 0xB2, 0x5B, 0x00, 0x00, 0xD5, 0xE8, /* 0xB4-0xB7 */ + 0xB2, 0x55, 0x00, 0x00, 0xD5, 0xFA, 0xD6, 0x47, /* 0xB8-0xBB */ + 0xB2, 0x44, 0xD5, 0xF7, 0xD5, 0xF0, 0xB2, 0x67, /* 0xBC-0xBF */ + 0xD5, 0xE0, 0x00, 0x00, 0xD5, 0xFC, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xB2, 0x64, 0xB2, 0x58, 0xB2, 0x63, 0xB2, 0x4E, /* 0xC4-0xC7 */ + 0xD5, 0xEC, 0xD5, 0xFE, 0xD5, 0xF6, 0xB2, 0x4F, /* 0xC8-0xCB */ + 0xB2, 0x49, 0xD6, 0x45, 0x00, 0x00, 0xD5, 0xFD, /* 0xCC-0xCF */ + 0xD6, 0x40, 0xB2, 0x51, 0xB2, 0x59, 0xD6, 0x42, /* 0xD0-0xD3 */ + 0xD5, 0xEA, 0xD5, 0xFB, 0xD5, 0xEF, 0xD6, 0x44, /* 0xD4-0xD7 */ + 0xB2, 0x5E, 0xB2, 0x46, 0xB2, 0x5C, 0xD5, 0xF4, /* 0xD8-0xDB */ + 0xD5, 0xF2, 0xD5, 0xF3, 0xB2, 0x53, 0xD5, 0xEE, /* 0xDC-0xDF */ + 0xD5, 0xED, 0xB2, 0x48, 0xD5, 0xE7, 0xD6, 0x46, /* 0xE0-0xE3 */ + 0xB2, 0x4A, 0xD5, 0xF1, 0xB2, 0x68, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xB2, 0x62, 0xD5, 0xE6, 0xB2, 0x5F, 0xB2, 0x5D, /* 0xE8-0xEB */ + 0xB2, 0x66, 0xD5, 0xF8, 0xB2, 0x61, 0xD2, 0x52, /* 0xEC-0xEF */ + 0xD5, 0xF9, 0xB2, 0x60, 0xD6, 0x41, 0xB2, 0x45, /* 0xF0-0xF3 */ + 0xD5, 0xF5, 0xB2, 0x57, 0xD5, 0xE9, 0xB2, 0x56, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xB2, 0x54, 0xB2, 0x4C, 0xB2, 0x4B, /* 0xF8-0xFB */ + 0xD9, 0xE7, 0xD6, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6E[512] = { + 0xD5, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xFC, /* 0x00-0x03 */ + 0x00, 0x00, 0xB2, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xB5, 0x41, 0xB2, 0x5A, 0xB4, 0xEE, /* 0x18-0x1B */ + 0xD9, 0xF6, 0xB4, 0xFC, 0x00, 0x00, 0xD9, 0xEA, /* 0x1C-0x1F */ + 0xB4, 0xEB, 0xB4, 0xE7, 0xDA, 0x49, 0xB4, 0xED, /* 0x20-0x23 */ + 0xB4, 0xF1, 0xB4, 0xEC, 0xB4, 0xF5, 0xDA, 0x4D, /* 0x24-0x27 */ + 0xDA, 0x44, 0x00, 0x00, 0x00, 0x00, 0xD9, 0xF1, /* 0x28-0x2B */ + 0xB4, 0xFA, 0xB4, 0xF4, 0xD9, 0xFD, 0xB4, 0xE4, /* 0x2C-0x2F */ + 0xDA, 0x4A, 0xDA, 0x43, 0xB4, 0xE8, 0xD9, 0xF7, /* 0x30-0x33 */ + 0xB4, 0xF7, 0xDA, 0x55, 0xDA, 0x56, 0x00, 0x00, /* 0x34-0x37 */ + 0xB4, 0xE5, 0xDA, 0x48, 0xB4, 0xF9, 0xD9, 0xFB, /* 0x38-0x3B */ + 0xD9, 0xED, 0xD9, 0xEE, 0xB4, 0xFD, 0xD9, 0xF2, /* 0x3C-0x3F */ + 0xD9, 0xF9, 0xD9, 0xF3, 0x00, 0x00, 0xB4, 0xFB, /* 0x40-0x43 */ + 0xB5, 0x44, 0xD9, 0xEF, 0xD9, 0xE8, 0xD9, 0xE9, /* 0x44-0x47 */ + 0x00, 0x00, 0xD9, 0xEB, 0xB4, 0xEA, 0xD9, 0xF8, /* 0x48-0x4B */ + 0x00, 0x00, 0xB4, 0xF8, 0xB5, 0x42, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xD9, 0xFA, 0xDA, 0x53, 0xDA, 0x4B, /* 0x50-0x53 */ + 0xB4, 0xE6, 0xDA, 0x51, 0xB4, 0xF2, 0x00, 0x00, /* 0x54-0x57 */ + 0xB4, 0xF0, 0x00, 0x00, 0xDA, 0x57, 0xB4, 0xEF, /* 0x58-0x5B */ + 0xDA, 0x41, 0xD9, 0xF4, 0xD9, 0xFE, 0xB5, 0x47, /* 0x5C-0x5F */ + 0xDA, 
0x45, 0xDA, 0x42, 0xD9, 0xF0, 0xB5, 0x43, /* 0x60-0x63 */ + 0xDA, 0x4F, 0xDA, 0x4C, 0xDA, 0x54, 0xB4, 0xE9, /* 0x64-0x67 */ + 0xDA, 0x40, 0xB5, 0x46, 0x00, 0x00, 0xDA, 0x47, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xB4, 0xF3, 0xB4, 0xF6, /* 0x6C-0x6F */ + 0x00, 0x00, 0xDA, 0x46, 0xB5, 0x45, 0xD9, 0xF5, /* 0x70-0x73 */ + 0xD5, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xDA, 0x50, /* 0x74-0x77 */ + 0xDA, 0x4E, 0xDA, 0x52, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD9, 0xEC, 0xB5, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xDE, 0x61, 0xDE, 0x60, 0xDE, 0x46, /* 0x8C-0x8F */ + 0xB7, 0xBD, 0x00, 0x00, 0xDE, 0x5F, 0xDE, 0x49, /* 0x90-0x93 */ + 0xDE, 0x4A, 0x00, 0x00, 0xB7, 0xC7, 0xDE, 0x68, /* 0x94-0x97 */ + 0xB7, 0xC2, 0xDE, 0x5E, 0x00, 0x00, 0xDE, 0x43, /* 0x98-0x9B */ + 0xB7, 0xC8, 0xB7, 0xBE, 0xDE, 0x52, 0xDE, 0x48, /* 0x9C-0x9F */ + 0xDE, 0x4B, 0xDE, 0x63, 0xB7, 0xB8, 0xDE, 0x6A, /* 0xA0-0xA3 */ + 0xDE, 0x62, 0xB7, 0xC1, 0xDE, 0x57, 0xB7, 0xCC, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xCB, 0xB7, 0xC5, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0x69, 0xB7, 0xB9, /* 0xAC-0xAF */ + 0xDE, 0x55, 0xDE, 0x4C, 0xDE, 0x59, 0xDE, 0x65, /* 0xB0-0xB3 */ + 0xB7, 0xCD, 0x00, 0x00, 0xB7, 0xBB, 0xDE, 0x54, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xDE, 0x4D, 0xB7, 0xC4, 0x00, 0x00, /* 0xB8-0xBB */ + 0xB7, 0xC3, 0xDE, 0x50, 0xDE, 0x5A, 0xDE, 0x64, /* 0xBC-0xBF */ + 0xDE, 0x47, 0xDE, 0x51, 0xB7, 0xBC, 0xDE, 0x5B, /* 0xC0-0xC3 */ + 0xB7, 0xC9, 0xB7, 0xC0, 0xDE, 0x4E, 0xB7, 0xBF, /* 0xC4-0xC7 */ + 0xDE, 0x45, 0xDE, 0x53, 0xDE, 0x67, 0xB4, 0xFE, /* 0xC8-0xCB */ + 0xBA, 0xB0, 0xDE, 0x56, 0xE2, 0x6C, 0xDE, 0x58, /* 0xCC-0xCF */ + 0xDE, 0x66, 0xB7, 0xC6, 0xDE, 0x4F, 0xB7, 0xBA, /* 0xD0-0xD3 */ + 0xB7, 0xCA, 0xBC, 0xF0, 0xDE, 0x44, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xDE, 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xDE, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xAA, /* 0xE8-0xEB */ + 0xBA, 0xAD, 0xE2, 0x7D, 0xE2, 0xA4, 0xBA, 0xA2, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE2, 0x6E, 0xBA, 0xAF, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xBA, 0x77, 0xE2, 0x6D, 0xE2, 0xB0, 0xBA, 0xB1, /* 0xF4-0xF7 */ + 0xE2, 0x71, 0xE2, 0xA3, 0x00, 0x00, 0xE2, 0x73, /* 0xF8-0xFB */ + 0xE2, 0xB3, 0xE2, 0xAF, 0xBA, 0x75, 0xBA, 0xA1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_6F[512] = { + 0xE6, 0x53, 0xBA, 0xAE, 0xBA, 0x7D, 0xE2, 0x6F, /* 0x00-0x03 */ + 0x00, 0x00, 0xE2, 0xAE, 0xBA, 0xA3, 0xE2, 0xAB, /* 0x04-0x07 */ + 0xE2, 0xB8, 0xE2, 0x75, 0xE2, 0x7E, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE2, 0xB6, 0xE2, 0xAC, 0xBA, 0x7C, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x7C, 0xBA, 0x76, /* 0x10-0x13 */ + 0xBA, 0x74, 0xBA, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xE2, 0x7A, 0xE2, 0x77, 0xE2, 0x78, 0x00, 0x00, /* 0x18-0x1B */ + 0xE2, 0xB2, 0x00, 0x00, 0xE2, 0xB7, 0xE2, 0xB5, /* 0x1C-0x1F */ + 0xBA, 0x7A, 0xE2, 0xB9, 0xBA, 0x7E, 0xBA, 0xA7, /* 0x20-0x23 */ + 0x00, 0x00, 0xE2, 0x70, 0xE5, 0xFA, 0xE2, 0x79, /* 0x24-0x27 */ + 0x00, 0x00, 0xBA, 0x78, 0xBA, 0xAC, 0xBA, 0xA9, /* 0x28-0x2B */ + 0xBA, 0x7B, 0xE2, 0xA5, 0xE2, 0x74, 0xBA, 0xAA, /* 0x2C-0x2F */ + 0xE2, 0xA7, 0xBA, 0xA4, 0xBA, 0xA6, 0xBA, 0x73, /* 0x30-0x33 */ + 0x00, 
0x00, 0xE2, 0xA9, 0xE2, 0xA1, 0xE2, 0x72, /* 0x34-0x37 */ + 0xBA, 0xA5, 0xE2, 0xB1, 0xE2, 0xB4, 0xE2, 0x7B, /* 0x38-0x3B */ + 0xE2, 0xA8, 0x00, 0x00, 0xBA, 0x79, 0xBC, 0xDF, /* 0x3C-0x3F */ + 0xE2, 0xA6, 0xE5, 0xF9, 0x00, 0x00, 0xE2, 0xAD, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0x76, 0xE6, 0x44, /* 0x4C-0x4F */ + 0xE6, 0x4E, 0xBC, 0xE2, 0xE6, 0x4D, 0xE6, 0x59, /* 0x50-0x53 */ + 0xBC, 0xE4, 0xE6, 0x4B, 0x00, 0x00, 0xE6, 0x4F, /* 0x54-0x57 */ + 0xBC, 0xEF, 0x00, 0x00, 0xE6, 0x46, 0xBC, 0xE7, /* 0x58-0x5B */ + 0x00, 0x00, 0xE6, 0x52, 0xE9, 0xF0, 0xBC, 0xF3, /* 0x5C-0x5F */ + 0xBC, 0xF2, 0xE6, 0x54, 0xE6, 0x43, 0xE6, 0x5E, /* 0x60-0x63 */ + 0xBC, 0xED, 0x00, 0x00, 0xBC, 0xE3, 0xE6, 0x57, /* 0x64-0x67 */ + 0x00, 0x00, 0xE6, 0x5B, 0xE6, 0x60, 0xE6, 0x55, /* 0x68-0x6B */ + 0xE6, 0x49, 0xBC, 0xE6, 0xBC, 0xE9, 0xBC, 0xF1, /* 0x6C-0x6F */ + 0xBC, 0xEC, 0x00, 0x00, 0xE6, 0x4C, 0xE2, 0xA2, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0x48, 0xE6, 0x5F, /* 0x74-0x77 */ + 0xBC, 0xE8, 0x00, 0x00, 0xBC, 0xEB, 0xE6, 0x61, /* 0x78-0x7B */ + 0xBC, 0xE0, 0xE6, 0x56, 0xE5, 0xFB, 0xE6, 0x5C, /* 0x7C-0x7F */ + + 0xC0, 0xDF, 0x00, 0x00, 0xE6, 0x4A, 0x00, 0x00, /* 0x80-0x83 */ + 0xBC, 0xE1, 0xE6, 0x45, 0xBC, 0xE5, 0xE5, 0xFC, /* 0x84-0x87 */ + 0xBA, 0xAB, 0xE6, 0x41, 0x00, 0x00, 0xE6, 0x5A, /* 0x88-0x8B */ + 0xE6, 0x42, 0xE6, 0x40, 0xBC, 0xEA, 0x00, 0x00, /* 0x8C-0x8F */ + 0xE6, 0x58, 0x00, 0x00, 0xE5, 0xFE, 0xE6, 0x51, /* 0x90-0x93 */ + 0xE6, 0x50, 0xE6, 0x5D, 0xE6, 0x47, 0xBC, 0xEE, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE9, 0xF3, 0x00, 0x00, /* 0x9C-0x9F */ + 0xBF, 0x49, 0xBE, 0xFE, 0xEA, 0x40, 0xE9, 0xEB, /* 0xA0-0xA3 */ + 0xBF, 0x41, 0xE9, 0xF7, 0xBF, 0x48, 0xBF, 0x43, /* 0xA4-0xA7 */ + 0xE9, 0xF5, 0xED, 0x4F, 0xE9, 0xFB, 0xEA, 0x42, /* 0xA8-0xAB */ + 0xE9, 0xFA, 0xE9, 0xE9, 0xE9, 0xF8, 0xEA, 0x44, /* 0xAC-0xAF */ + 0xEA, 0x46, 0xBE, 0xFD, 0xEA, 0x45, 0xBF, 0x44, /* 0xB0-0xB3 */ + 0xBF, 0x4A, 0x00, 0x00, 0xBF, 0x47, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE9, 0xFE, 0xBF, 0x46, 0xE9, 0xF9, 0x00, 0x00, /* 0xB8-0xBB */ + 0xE9, 0xED, 0xE9, 0xF2, 0x00, 0x00, 0xE9, 0xFD, /* 0xBC-0xBF */ + 0xBF, 0x45, 0xBF, 0x42, 0xBE, 0xFC, 0xBF, 0x40, /* 0xC0-0xC3 */ + 0xE9, 0xF1, 0x00, 0x00, 0xE5, 0xFD, 0xE9, 0xEC, /* 0xC4-0xC7 */ + 0xE9, 0xEF, 0xEA, 0x41, 0xE9, 0xF4, 0xE9, 0xEA, /* 0xC8-0xCB */ + 0xED, 0x4E, 0xEA, 0x43, 0xE9, 0xEE, 0xE9, 0xFC, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xED, 0x51, 0xC0, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xC0, 0xD7, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xDB, /* 0xD8-0xDB */ + 0xED, 0x53, 0xED, 0x59, 0xED, 0x57, 0xC0, 0xD9, /* 0xDC-0xDF */ + 0xC0, 0xDA, 0xC0, 0xE1, 0xED, 0x5A, 0xED, 0x52, /* 0xE0-0xE3 */ + 0xC0, 0xDC, 0x00, 0x00, 0xED, 0x56, 0xED, 0x55, /* 0xE4-0xE7 */ + 0xED, 0x5B, 0xC0, 0xE2, 0x00, 0x00, 0xC0, 0xDD, /* 0xE8-0xEB */ + 0xC0, 0xE0, 0xED, 0x54, 0xC0, 0xE4, 0xC0, 0xDE, /* 0xEC-0xEF */ + 0xC0, 0xE5, 0xC0, 0xD8, 0xED, 0x58, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xED, 0x50, 0x00, 0x00, 0x00, 0x00, 0xEF, 0xF7, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x71, 0xEF, 0xF4, /* 0xF8-0xFB */ + 0xEF, 0xF6, 0x00, 0x00, 0xC2, 0x6F, 0xEF, 0xF2, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_70[512] = { + 0xEF, 0xF3, 0xEF, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xE9, 0xF6, 0xEF, 0xEF, 0xC2, 0x70, 0xEF, 0xEB, /* 0x04-0x07 */ + 0x00, 
0x00, 0xC2, 0x6D, 0xEF, 0xF8, 0xC2, 0x6E, /* 0x08-0x0B */ + 0xEF, 0xEC, 0xEF, 0xED, 0xEF, 0xF1, 0xC2, 0x73, /* 0x0C-0x0F */ + 0x00, 0x00, 0xC2, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xEF, 0xF0, 0xC3, 0x78, 0xF2, 0x5F, 0xF2, 0x65, /* 0x14-0x17 */ + 0xC3, 0x79, 0xF2, 0x5C, 0xC3, 0x76, 0xC3, 0x73, /* 0x18-0x1B */ + 0xF2, 0x67, 0xC3, 0x77, 0x00, 0x00, 0xC3, 0x74, /* 0x1C-0x1F */ + 0xF2, 0x5E, 0xF2, 0x61, 0xF2, 0x62, 0xF2, 0x63, /* 0x20-0x23 */ + 0xF2, 0x66, 0x00, 0x00, 0xEF, 0xF5, 0xF2, 0x5D, /* 0x24-0x27 */ + 0xC3, 0x75, 0xF2, 0x64, 0xF2, 0x68, 0xF2, 0x60, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x5D, /* 0x2C-0x2F */ + 0xC4, 0x6A, 0xF4, 0x60, 0xC4, 0x6B, 0xF4, 0x68, /* 0x30-0x33 */ + 0xF4, 0x5F, 0xF4, 0x5C, 0x00, 0x00, 0xF4, 0x5E, /* 0x34-0x37 */ + 0xF4, 0x62, 0xF4, 0x65, 0xF4, 0x64, 0xF4, 0x67, /* 0x38-0x3B */ + 0xF4, 0x5B, 0x00, 0x00, 0xC4, 0x69, 0xF4, 0x63, /* 0x3C-0x3F */ + 0xF4, 0x66, 0xF4, 0x69, 0xF4, 0x61, 0xF5, 0xD3, /* 0x40-0x43 */ + 0xF5, 0xD4, 0xF5, 0xD8, 0xF5, 0xD9, 0x00, 0x00, /* 0x44-0x47 */ + 0xF5, 0xD6, 0xF5, 0xD7, 0xF5, 0xD5, 0x00, 0x00, /* 0x48-0x4B */ + 0xC4, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xC5, 0x78, 0xF6, 0xEB, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF6, 0xE8, 0xF6, 0xE9, 0xF6, 0xEA, /* 0x54-0x57 */ + 0xC5, 0x79, 0x00, 0x00, 0xF7, 0xE5, 0xF7, 0xE4, /* 0x58-0x5B */ + 0x00, 0x00, 0xF8, 0xAF, 0xC5, 0xF4, 0xF8, 0xAD, /* 0x5C-0x5F */ + 0xF8, 0xB0, 0xF8, 0xAE, 0xF8, 0xF5, 0xC6, 0x57, /* 0x60-0x63 */ + 0xC6, 0x65, 0xF9, 0xA3, 0xF9, 0x6C, 0x00, 0x00, /* 0x64-0x67 */ + 0xF9, 0xA2, 0xF9, 0xD0, 0xF9, 0xD1, 0xA4, 0xF5, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xA6, 0xC7, 0xCA, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xCB, 0x5E, 0x00, 0x00, 0xA8, 0x5F, 0x00, 0x00, /* 0x74-0x77 */ + 0xA8, 0x62, 0x00, 0x00, 0xCB, 0x5F, 0x00, 0x00, /* 0x78-0x7B */ + 0xA8, 0x60, 0xA8, 0x61, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xCD, 0x58, 0xCD, 0x5A, /* 0x80-0x83 */ + 0xCD, 0x55, 0xCD, 0x52, 0xCD, 0x54, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0xA4, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0xA2, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xCD, 0x56, 0xAA, 0xA3, 0xCD, 0x53, /* 0x90-0x93 */ + 0xCD, 0x50, 0xAA, 0xA1, 0xCD, 0x57, 0x00, 0x00, /* 0x94-0x97 */ + 0xCD, 0x51, 0xAA, 0xA5, 0xCD, 0x59, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xAF, /* 0x9C-0x9F */ + 0x00, 0x00, 0xCF, 0xB3, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xAC, 0xB7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xCF, 0xB6, 0x00, 0x00, 0xAC, 0xAF, /* 0xA8-0xAB */ + 0xAC, 0xB2, 0xAC, 0xB4, 0xAC, 0xB6, 0xAC, 0xB3, /* 0xAC-0xAF */ + 0xCF, 0xB2, 0xCF, 0xB1, 0x00, 0x00, 0xAC, 0xB1, /* 0xB0-0xB3 */ + 0xCF, 0xB4, 0xCF, 0xB5, 0x00, 0x00, 0xCF, 0xAE, /* 0xB4-0xB7 */ + 0xAC, 0xB5, 0x00, 0x00, 0xAC, 0xB0, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xB0, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD2, 0x77, 0xD2, 0x78, 0xD2, 0x79, /* 0xC4-0xC7 */ + 0xAF, 0x50, 0x00, 0x00, 0xAF, 0x4C, 0xD2, 0x6E, /* 0xC8-0xCB */ + 0x00, 0x00, 0xD2, 0x76, 0xD2, 0x7B, 0xAF, 0x51, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD2, 0x6C, 0xD2, 0x72, 0xD2, 0x6B, /* 0xD0-0xD3 */ + 0xD2, 0x75, 0x00, 0x00, 0x00, 0x00, 0xD2, 0x71, /* 0xD4-0xD7 */ + 0xAF, 0x4D, 0xAF, 0x4F, 0xD2, 0x7A, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD2, 0x6A, 0xD2, 0x6D, 0xD2, 0x73, 0x00, 0x00, /* 0xDC-0xDF 
*/ + 0xD2, 0x74, 0xD2, 0x7C, 0xD2, 0x70, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xAF, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB2, 0x6D, /* 0xEC-0xEF */ + 0xD6, 0x4E, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x50, /* 0xF0-0xF3 */ + 0xD6, 0x4C, 0x00, 0x00, 0xD6, 0x58, 0xD6, 0x4A, /* 0xF4-0xF7 */ + 0xD6, 0x57, 0xB2, 0x69, 0xD6, 0x48, 0xDA, 0x5B, /* 0xF8-0xFB */ + 0xD6, 0x52, 0xB2, 0x6C, 0x00, 0x00, 0xD6, 0x53, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_71[512] = { + 0xD6, 0x56, 0x00, 0x00, 0xD6, 0x5A, 0x00, 0x00, /* 0x00-0x03 */ + 0xD6, 0x4F, 0x00, 0x00, 0xD6, 0x54, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xB2, 0x6A, 0xB2, 0x6B, 0xD6, 0x59, /* 0x08-0x0B */ + 0xD6, 0x4D, 0xD6, 0x49, 0xD6, 0x5B, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD6, 0x51, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x55, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x4B, /* 0x14-0x17 */ + 0x00, 0x00, 0xB5, 0x48, 0xB5, 0x49, 0xDA, 0x65, /* 0x18-0x1B */ + 0xB5, 0x4F, 0x00, 0x00, 0xDA, 0x59, 0xDA, 0x62, /* 0x1C-0x1F */ + 0xDA, 0x58, 0xB5, 0x4C, 0xDA, 0x60, 0xDA, 0x5E, /* 0x20-0x23 */ + 0x00, 0x00, 0xDA, 0x5F, 0xB5, 0x4A, 0x00, 0x00, /* 0x24-0x27 */ + 0xDA, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x5C, 0xDA, 0x5A, /* 0x2C-0x2F */ + 0xB5, 0x4B, 0xDA, 0x5D, 0xDA, 0x61, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0x4D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x64, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xDE, 0x70, 0xDE, 0x77, 0xDE, 0x79, /* 0x40-0x43 */ + 0xDE, 0xA1, 0x00, 0x00, 0xB7, 0xDA, 0xDE, 0x6B, /* 0x44-0x47 */ + 0x00, 0x00, 0xB7, 0xD2, 0x00, 0x00, 0xDE, 0x7A, /* 0x48-0x4B */ + 0xB7, 0xD7, 0xDE, 0xA2, 0xB7, 0xCE, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDE, 0x7D, 0x00, 0x00, 0xDE, 0x6D, 0xDE, 0x7E, /* 0x50-0x53 */ + 0xDE, 0x6C, 0x00, 0x00, 0xB7, 0xDC, 0x00, 0x00, /* 0x54-0x57 */ + 0xDE, 0x78, 0xB7, 0xCF, 0xDE, 0xA3, 0x00, 0x00, /* 0x58-0x5B */ + 0xB7, 0xD4, 0xDE, 0x71, 0xB7, 0xD9, 0xDE, 0x7C, /* 0x5C-0x5F */ + 0xDE, 0x6F, 0xDE, 0x76, 0xDE, 0x72, 0xDE, 0x6E, /* 0x60-0x63 */ + 0xB7, 0xD1, 0xB7, 0xD8, 0xB7, 0xD6, 0xB7, 0xD3, /* 0x64-0x67 */ + 0xB7, 0xDB, 0xB7, 0xD0, 0xDE, 0x75, 0x00, 0x00, /* 0x68-0x6B */ + 0xB7, 0xD5, 0x00, 0x00, 0xB5, 0x4E, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDE, 0x7B, 0x00, 0x00, 0xDE, 0x73, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xDE, 0x74, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC1, /* 0x78-0x7B */ + 0x00, 0x00, 0xBA, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE2, 0xBD, 0xE2, 0xC3, 0xE2, 0xBF, 0x00, 0x00, /* 0x80-0x83 */ + 0xBA, 0xB6, 0xE2, 0xBE, 0xE2, 0xC2, 0xE2, 0xBA, /* 0x84-0x87 */ + 0x00, 0x00, 0xE2, 0xBC, 0xBA, 0xB5, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC0, /* 0x8C-0x8F */ + 0xE2, 0xBB, 0x00, 0x00, 0xBA, 0xB7, 0x00, 0x00, /* 0x90-0x93 */ + 0xBA, 0xB2, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xC4, /* 0x94-0x97 */ + 0x00, 0x00, 0xBA, 0xB3, 0xE6, 0x67, 0xE6, 0x64, /* 0x98-0x9B */ + 0xE6, 0x70, 0xE6, 0x6A, 0xE6, 0x6C, 0xBC, 0xF4, /* 0x9C-0x9F */ + 0xE6, 0x66, 0xE6, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE6, 0x6D, 0xE6, 0x6B, 0x00, 0x00, 0xE6, 0x71, /* 0xA4-0xA7 */ + 0xBC, 0xF7, 0xE6, 0x68, 0xE6, 0x6F, 0x00, 0x00, /* 0xA8-0xAB */ + 0xBC, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xE6, 0x63, /* 0xAC-0xAF */ + 0xE6, 0x65, 0xBC, 0xF6, 0xE6, 0x62, 0xE6, 0x72, /* 0xB0-0xB3 */ + 
0x00, 0x00, 0xE6, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xEA, 0x4A, 0xBF, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0xEA, 0x55, 0xEA, 0x53, 0xBF, 0x4B, 0xEA, 0x49, /* 0xBC-0xBF */ + 0xEA, 0x4C, 0xEA, 0x4D, 0xEA, 0x48, 0xBF, 0x55, /* 0xC0-0xC3 */ + 0xBF, 0x56, 0xEA, 0x47, 0xEA, 0x56, 0xEA, 0x51, /* 0xC4-0xC7 */ + 0xBF, 0x4F, 0xBF, 0x4C, 0xEA, 0x50, 0xEA, 0x4E, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0x52, 0xEA, 0x52, /* 0xCC-0xCF */ + 0xBF, 0x4D, 0x00, 0x00, 0xBF, 0x4E, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0x4F, 0xBF, 0x50, 0xEA, 0x4B, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEA, 0x54, 0xBF, 0x53, 0xEA, 0x57, 0xEA, 0x58, /* 0xD8-0xDB */ + 0xBF, 0x54, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xE7, /* 0xDC-0xDF */ + 0xC0, 0xEE, 0xED, 0x5C, 0xED, 0x62, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xED, 0x60, 0xC0, 0xEA, 0xC0, 0xE9, 0xC0, 0xE6, /* 0xE4-0xE7 */ + 0xED, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xC0, 0xEC, 0xC0, 0xEB, 0xC0, 0xE8, 0x00, 0x00, /* 0xEC-0xEF */ + 0xED, 0x61, 0xED, 0x5D, 0xED, 0x5F, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xC0, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xC2, 0x77, 0xEF, 0xFB, 0x00, 0x00, 0xC2, 0x74, /* 0xF8-0xFB */ + 0xC2, 0x75, 0xEF, 0xFD, 0xC2, 0x76, 0xEF, 0xFA, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_72[512] = { + 0x00, 0x00, 0xEF, 0xF9, 0xF2, 0x6C, 0xEF, 0xFC, /* 0x00-0x03 */ + 0x00, 0x00, 0xF2, 0x6D, 0xC3, 0x7A, 0xF2, 0x6B, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0x6A, 0x00, 0x00, /* 0x08-0x0B */ + 0xF2, 0x69, 0xC3, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xC4, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xF4, 0x6A, /* 0x10-0x13 */ + 0xF4, 0x6B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF5, 0xDC, 0xF5, 0xDB, 0xC4, 0xEA, /* 0x18-0x1B */ + 0x00, 0x00, 0xF5, 0xDA, 0xF6, 0xEC, 0xF6, 0xED, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xF7, 0xE6, 0xF8, 0xB1, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xF6, 0xF9, 0xBC, /* 0x24-0x27 */ + 0xC6, 0x79, 0xF9, 0xC6, 0xA4, 0xF6, 0x00, 0x00, /* 0x28-0x2B */ + 0xAA, 0xA6, 0xAA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xAC, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xC0, 0xEF, 0xA4, 0xF7, 0x00, 0x00, /* 0x34-0x37 */ + 0xAA, 0xA8, 0xAF, 0x52, 0xB7, 0xDD, 0xA4, 0xF8, /* 0x38-0x3B */ + 0x00, 0x00, 0xB2, 0x6E, 0xBA, 0xB8, 0xC9, 0x62, /* 0x3C-0x3F */ + 0x00, 0x00, 0xCF, 0xB7, 0xD2, 0x7D, 0x00, 0x00, /* 0x40-0x43 */ + 0xE2, 0xC5, 0x00, 0x00, 0xC0, 0xF0, 0xA4, 0xF9, /* 0x44-0x47 */ + 0xAA, 0xA9, 0xCF, 0xB8, 0xCF, 0xB9, 0xDA, 0x66, /* 0x48-0x4B */ + 0xB5, 0x50, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xA4, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xDE, 0xE2, 0xC6, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xBC, 0xF8, 0x00, 0x00, /* 0x54-0x57 */ + 0xC3, 0x7C, 0xA4, 0xFA, 0xDA, 0x67, 0xA4, 0xFB, /* 0x58-0x5B */ + 0x00, 0x00, 0xA6, 0xC9, 0xCA, 0x42, 0xA6, 0xC8, /* 0x5C-0x5F */ + 0xA8, 0x65, 0xA8, 0x64, 0xA8, 0x63, 0xCB, 0x60, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0xAA, /* 0x64-0x67 */ + 0x00, 0x00, 0xAA, 0xAB, 0xCD, 0x5B, 0x00, 0x00, /* 0x68-0x6B */ + 0xCF, 0xBA, 0x00, 0x00, 0xCF, 0xBD, 0xAC, 0xBA, /* 0x6C-0x6F */ + 0xCF, 0xBB, 0x00, 0x00, 0xAC, 0xB9, 0xCF, 0xBC, /* 0x70-0x73 */ + 0xAC, 0xBB, 0x00, 0x00, 0xD2, 0xA2, 0xD2, 0xA1, /* 0x74-0x77 */ + 0xD2, 0x7E, 0xAF, 0x53, 0x00, 0x00, 0xD6, 0x5D, /* 0x78-0x7B */ + 0xD6, 0x5E, 0xB2, 0x6F, 0xD6, 0x5C, 0xD6, 0x5F, /* 0x7C-0x7F */ + + 0xB5, 0x52, 0xB2, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xB5, 0x51, 0xDA, 0x6B, 0xDA, 0x6A, 0x00, 0x00, /* 0x84-0x87 */ + 
0xDA, 0x68, 0xDA, 0x69, 0x00, 0x00, 0xDA, 0x6C, /* 0x88-0x8B */ + 0xDE, 0xA6, 0xDE, 0xA5, 0xDE, 0xA9, 0x00, 0x00, /* 0x8C-0x8F */ + 0xDE, 0xA8, 0xDE, 0xA7, 0xBA, 0xB9, 0xE2, 0xC9, /* 0x90-0x93 */ + 0x00, 0x00, 0xE2, 0xC8, 0xBA, 0xBA, 0xE2, 0xC7, /* 0x94-0x97 */ + 0xE6, 0x73, 0x00, 0x00, 0xE6, 0x74, 0xBC, 0xF9, /* 0x98-0x9B */ + 0x00, 0x00, 0xEA, 0x59, 0xEA, 0x5A, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xF2, 0x72, 0xC3, 0x7D, 0xF2, 0x71, /* 0xA0-0xA3 */ + 0xF2, 0x70, 0xF2, 0x6E, 0xF2, 0x6F, 0xC4, 0xEB, /* 0xA4-0xA7 */ + 0xF4, 0x6C, 0xF6, 0xEE, 0xF8, 0xF7, 0x00, 0x00, /* 0xA8-0xAB */ + 0xA4, 0xFC, 0x00, 0x00, 0xC9, 0xA5, 0xA5, 0xC7, /* 0xAC-0xAF */ + 0xC9, 0xA6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xCA, 0x43, 0xCA, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0x66, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xCB, 0x62, 0x00, 0x00, 0xCB, 0x61, /* 0xBC-0xBF */ + 0xAA, 0xAC, 0xCB, 0x65, 0xA8, 0x67, 0xCB, 0x63, /* 0xC0-0xC3 */ + 0xA8, 0x66, 0xCB, 0x67, 0xCB, 0x64, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xCD, 0x5F, 0xCF, 0xBE, 0xCD, 0x5D, /* 0xC8-0xCB */ + 0xCD, 0x64, 0x00, 0x00, 0xAA, 0xAD, 0x00, 0x00, /* 0xCC-0xCF */ + 0xAA, 0xB0, 0xCD, 0x65, 0xCD, 0x61, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xCD, 0x62, 0x00, 0x00, 0xCD, 0x5C, 0xAA, 0xAF, /* 0xD4-0xD7 */ + 0xCD, 0x5E, 0xAA, 0xAE, 0xCD, 0x63, 0x00, 0x00, /* 0xD8-0xDB */ + 0xCD, 0x60, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xC2, /* 0xDC-0xDF */ + 0xAC, 0xBD, 0xAC, 0xBE, 0x00, 0x00, 0xCF, 0xC5, /* 0xE0-0xE3 */ + 0xCF, 0xBF, 0x00, 0x00, 0xCF, 0xC4, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCF, 0xC0, 0xAC, 0xBC, 0xCF, 0xC3, 0xCF, 0xC1, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xA8, /* 0xF0-0xF3 */ + 0xD2, 0xA5, 0x00, 0x00, 0xD2, 0xA7, 0xAF, 0x58, /* 0xF4-0xF7 */ + 0xAF, 0x57, 0xAF, 0x55, 0xD2, 0xA4, 0xD2, 0xA9, /* 0xF8-0xFB */ + 0xAF, 0x54, 0xAF, 0x56, 0xD2, 0xA6, 0xD6, 0x67, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_73[512] = { + 0xD2, 0xA3, 0xD2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x62, /* 0x04-0x07 */ + 0xD6, 0x66, 0x00, 0x00, 0xD6, 0x65, 0xDA, 0x6E, /* 0x08-0x0B */ + 0xDA, 0x79, 0x00, 0x00, 0x00, 0x00, 0xD6, 0x68, /* 0x0C-0x0F */ + 0x00, 0x00, 0xD6, 0x63, 0xDA, 0x6D, 0xB2, 0x74, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0x73, 0xD6, 0x61, /* 0x14-0x17 */ + 0xD6, 0x64, 0xB2, 0x75, 0x00, 0x00, 0xB2, 0x72, /* 0x18-0x1B */ + 0xB2, 0x71, 0xD6, 0x60, 0xD6, 0x69, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x70, 0xDA, 0x77, /* 0x20-0x23 */ + 0x00, 0x00, 0xB5, 0x54, 0xDA, 0x76, 0xDA, 0x73, /* 0x24-0x27 */ + 0x00, 0x00, 0xB5, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xDA, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xDA, 0x6F, 0xDA, 0x71, 0xDA, 0x74, 0xDA, 0x72, /* 0x30-0x33 */ + 0xB5, 0x55, 0xDA, 0x78, 0xB5, 0x53, 0xB7, 0xDF, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAD, 0xDE, 0xAC, /* 0x38-0x3B */ + 0xDE, 0xAA, 0x00, 0x00, 0xB7, 0xE2, 0xB7, 0xE1, /* 0x3C-0x3F */ + 0xDE, 0xAE, 0x00, 0x00, 0xDE, 0xAB, 0xE2, 0xCA, /* 0x40-0x43 */ + 0xBA, 0xBB, 0xB7, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xDE, 0xB0, 0xDE, 0xAF, 0x00, 0x00, /* 0x48-0x4B */ + 0xE2, 0xCD, 0xE2, 0xCB, 0xBC, 0xFA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xBA, 0xBC, 0xE2, 0xCC, 0xE6, 0x76, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBC, 0xFB, /* 0x54-0x57 */ + 0xE6, 0x75, 0xE6, 0x7E, 0xE6, 0x7D, 0xE6, 0x7B, /* 0x58-0x5B */ + 0x00, 
0x00, 0xE6, 0x7A, 0xE6, 0x77, 0xE6, 0x78, /* 0x5C-0x5F */ + 0xE6, 0x79, 0xE6, 0x7C, 0xE6, 0xA1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0xEA, 0x5F, 0xEA, 0x5C, 0xEA, 0x5D, /* 0x64-0x67 */ + 0xBF, 0x57, 0xEA, 0x5B, 0xEA, 0x61, 0xEA, 0x60, /* 0x68-0x6B */ + 0xEA, 0x5E, 0x00, 0x00, 0xED, 0x64, 0xED, 0x65, /* 0x6C-0x6F */ + 0xC0, 0xF1, 0x00, 0x00, 0xC0, 0xF2, 0xED, 0x63, /* 0x70-0x73 */ + 0x00, 0x00, 0xC2, 0x79, 0xEF, 0xFE, 0xC2, 0x78, /* 0x74-0x77 */ + 0xC3, 0x7E, 0x00, 0x00, 0xC3, 0xA1, 0xC4, 0x6D, /* 0x78-0x7B */ + 0xF4, 0x6E, 0xF4, 0x6D, 0xF5, 0xDD, 0xF6, 0xEF, /* 0x7C-0x7F */ + + 0xC5, 0x7A, 0xF7, 0xE8, 0xF7, 0xE7, 0xF7, 0xE9, /* 0x80-0x83 */ + 0xA5, 0xC8, 0xCF, 0xC6, 0xAF, 0x59, 0xB2, 0x76, /* 0x84-0x87 */ + 0xD6, 0x6A, 0xA5, 0xC9, 0xC9, 0xA7, 0xA4, 0xFD, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xCA, 0x45, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0x6C, 0xCB, 0x6A, /* 0x90-0x93 */ + 0xCB, 0x6B, 0xCB, 0x68, 0xA8, 0x68, 0xCB, 0x69, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xCD, 0x6D, 0x00, 0x00, 0xAA, 0xB3, /* 0x9C-0x9F */ + 0xCD, 0x6B, 0xCD, 0x67, 0xCD, 0x6A, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xCD, 0x66, 0xAA, 0xB5, 0xCD, 0x69, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xAA, 0xB2, 0xAA, 0xB1, 0x00, 0x00, 0xAA, 0xB4, /* 0xA8-0xAB */ + 0xCD, 0x6C, 0xCD, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0xC2, 0xAC, 0xC5, /* 0xB0-0xB3 */ + 0xCF, 0xCE, 0xCF, 0xCD, 0xCF, 0xCC, 0xAC, 0xBF, /* 0xB4-0xB7 */ + 0xCF, 0xD5, 0xCF, 0xCB, 0x00, 0x00, 0xAC, 0xC1, /* 0xB8-0xBB */ + 0xD2, 0xAF, 0x00, 0x00, 0xCF, 0xD2, 0xCF, 0xD0, /* 0xBC-0xBF */ + 0xAC, 0xC4, 0x00, 0x00, 0xCF, 0xC8, 0xCF, 0xD3, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xCF, 0xCA, 0xCF, 0xD4, 0xCF, 0xD1, /* 0xC4-0xC7 */ + 0xCF, 0xC9, 0x00, 0x00, 0xAC, 0xC0, 0xCF, 0xD6, /* 0xC8-0xCB */ + 0xCF, 0xC7, 0xAC, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB4, 0xD2, 0xAB, /* 0xD0-0xD3 */ + 0xD2, 0xB6, 0x00, 0x00, 0xD2, 0xAE, 0xD2, 0xB9, /* 0xD4-0xD7 */ + 0xD2, 0xBA, 0xD2, 0xAC, 0xD2, 0xB8, 0xD2, 0xB5, /* 0xD8-0xDB */ + 0xD2, 0xB3, 0xD2, 0xB7, 0xAF, 0x5F, 0x00, 0x00, /* 0xDC-0xDF */ + 0xAF, 0x5D, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xB1, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD2, 0xAD, 0x00, 0x00, 0xD2, 0xB0, /* 0xE4-0xE7 */ + 0xD2, 0xBB, 0xD2, 0xB2, 0xAF, 0x5E, 0xCF, 0xCF, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAF, 0x5A, 0xAF, 0x5C, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD6, 0x78, 0xD6, 0x6D, 0xD6, 0x6B, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xD6, 0x6C, 0x00, 0x00, 0xD6, 0x73, 0x00, 0x00, /* 0xF8-0xFB */ + 0xD6, 0x74, 0xD6, 0x70, 0xB2, 0x7B, 0xD6, 0x75, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_74[512] = { + 0xD6, 0x72, 0xD6, 0x6F, 0x00, 0x00, 0xB2, 0x79, /* 0x00-0x03 */ + 0xD6, 0x6E, 0xB2, 0x77, 0xB2, 0x7A, 0xD6, 0x71, /* 0x04-0x07 */ + 0xD6, 0x79, 0xAF, 0x5B, 0xB2, 0x78, 0xD6, 0x77, /* 0x08-0x0B */ + 0xD6, 0x76, 0xB2, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0x7E, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xA1, 0xB5, 0x60, /* 0x18-0x1B */ + 0x00, 0x00, 0xDA, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xDA, 0xA9, 0xDA, 0xA2, 0xB5, 0x5A, 0xDA, 0xA6, /* 0x20-0x23 */ + 0xDA, 0xA5, 0xB5, 0x5B, 0xB5, 0x61, 0x00, 0x00, /* 0x24-0x27 */ + 0xB5, 0x62, 0xDA, 0xA8, 0xB5, 0x58, 0xDA, 0x7D, /* 0x28-0x2B */ + 0xDA, 0x7B, 0xDA, 0xA3, 0xDA, 0x7A, 0xB5, 0x5F, /* 0x2C-0x2F */ + 0xDA, 
0x7C, 0xDA, 0xA4, 0xDA, 0xAA, 0xB5, 0x59, /* 0x30-0x33 */ + 0xB5, 0x5E, 0xB5, 0x5C, 0xB5, 0x5D, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0x57, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB7, 0xE9, /* 0x3C-0x3F */ + 0xDE, 0xB7, 0xB7, 0xE8, 0xDE, 0xBB, 0x00, 0x00, /* 0x40-0x43 */ + 0xDE, 0xB1, 0x00, 0x00, 0xDE, 0xBC, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xB2, 0xDE, 0xB3, /* 0x48-0x4B */ + 0x00, 0x00, 0xDE, 0xBD, 0xDE, 0xBA, 0xDE, 0xB8, /* 0x4C-0x4F */ + 0xDE, 0xB9, 0xDE, 0xB5, 0xDE, 0xB4, 0x00, 0x00, /* 0x50-0x53 */ + 0xDE, 0xBE, 0xB7, 0xE5, 0x00, 0x00, 0xDE, 0xB6, /* 0x54-0x57 */ + 0x00, 0x00, 0xB7, 0xEA, 0xB7, 0xE4, 0xB7, 0xEB, /* 0x58-0x5B */ + 0xB7, 0xEC, 0x00, 0x00, 0xB7, 0xE7, 0xB7, 0xE6, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xCE, 0xBA, 0xBE, /* 0x60-0x63 */ + 0xBA, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xE2, 0xD3, /* 0x64-0x67 */ + 0x00, 0x00, 0xBC, 0xFC, 0xBA, 0xBF, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xBA, 0xC1, 0xE2, 0xD4, 0xB7, 0xE3, /* 0x6C-0x6F */ + 0xBA, 0xC0, 0xE2, 0xD0, 0xE2, 0xD2, 0xE2, 0xCF, /* 0x70-0x73 */ + 0x00, 0x00, 0xE2, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xE6, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE6, 0xAA, 0xE6, 0xA7, 0xBD, 0x40, 0xEA, 0x62, /* 0x7C-0x7F */ + + 0xBD, 0x41, 0xE6, 0xA6, 0x00, 0x00, 0xBC, 0xFE, /* 0x80-0x83 */ + 0x00, 0x00, 0xE6, 0xA8, 0xE6, 0xA5, 0xE6, 0xA2, /* 0x84-0x87 */ + 0xE6, 0xA9, 0xE6, 0xA3, 0xE6, 0xA4, 0xBC, 0xFD, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xED, 0x69, 0x00, 0x00, 0xEA, 0x66, 0x00, 0x00, /* 0x90-0x93 */ + 0xEA, 0x65, 0xEA, 0x67, 0x00, 0x00, 0xED, 0x66, /* 0x94-0x97 */ + 0xBF, 0x5A, 0x00, 0x00, 0xEA, 0x63, 0x00, 0x00, /* 0x98-0x9B */ + 0xBF, 0x58, 0x00, 0x00, 0xBF, 0x5C, 0xBF, 0x5B, /* 0x9C-0x9F */ + 0xEA, 0x64, 0xEA, 0x68, 0x00, 0x00, 0xBF, 0x59, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xED, 0x6D, 0xC0, 0xF5, 0xC2, 0x7A, /* 0xA4-0xA7 */ + 0xC0, 0xF6, 0xC0, 0xF3, 0xED, 0x6A, 0xED, 0x68, /* 0xA8-0xAB */ + 0x00, 0x00, 0xED, 0x6B, 0x00, 0x00, 0xED, 0x6E, /* 0xAC-0xAF */ + 0xC0, 0xF4, 0xED, 0x6C, 0xED, 0x67, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF0, 0x42, 0xF0, 0x45, 0xF2, 0x75, /* 0xB4-0xB7 */ + 0xF0, 0x40, 0x00, 0x00, 0xF4, 0x6F, 0xF0, 0x46, /* 0xB8-0xBB */ + 0x00, 0x00, 0xC3, 0xA2, 0xF0, 0x44, 0xC2, 0x7B, /* 0xBC-0xBF */ + 0xF0, 0x41, 0xF0, 0x43, 0xF0, 0x47, 0xF2, 0x76, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF2, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA3, 0xF2, 0x73, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x6E, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xC4, 0xED, 0xF6, 0xF1, 0xC4, 0xEC, 0xF6, 0xF3, /* 0xD4-0xD7 */ + 0xF6, 0xF0, 0xF6, 0xF2, 0xC5, 0xD0, 0xF8, 0xB2, /* 0xD8-0xDB */ + 0xA5, 0xCA, 0xCD, 0x6E, 0xD2, 0xBC, 0xD2, 0xBD, /* 0xDC-0xDF */ + 0xB2, 0x7D, 0xDE, 0xBF, 0xBF, 0x5D, 0xC3, 0xA4, /* 0xE0-0xE3 */ + 0xC5, 0x7B, 0xF8, 0xB3, 0xA5, 0xCB, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xCD, 0x6F, 0xA2, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xCF, 0xD7, 0x00, 0x00, 0xCF, 0xD8, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xD2, 0xBE, 0xD2, 0xBF, 0xB2, 0x7E, 0xB2, 0xA1, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAB, /* 0xF8-0xFB */ + 0x00, 0x00, 0xDE, 0xC2, 0xDE, 0xC1, 0xDE, 0xC0, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_75[512] = { + 0xE2, 0xD5, 0x00, 0x00, 0xE2, 0xD6, 0xE2, 0xD7, /* 0x00-0x03 */ + 0xBA, 
0xC2, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xAD, /* 0x04-0x07 */ + 0xE6, 0xAC, 0x00, 0x00, 0x00, 0x00, 0xEA, 0x69, /* 0x08-0x0B */ + 0xBF, 0x5E, 0xBF, 0x5F, 0x00, 0x00, 0xED, 0x72, /* 0x0C-0x0F */ + 0xED, 0x6F, 0xED, 0x70, 0xED, 0x71, 0xF0, 0x49, /* 0x10-0x13 */ + 0xF0, 0x48, 0xC2, 0x7C, 0xF2, 0x77, 0xF5, 0xDE, /* 0x14-0x17 */ + 0xA5, 0xCC, 0x00, 0x00, 0xAC, 0xC6, 0x00, 0x00, /* 0x18-0x1B */ + 0xB2, 0xA2, 0xDE, 0xC3, 0x00, 0x00, 0xA5, 0xCD, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD2, 0xC0, 0xB2, 0xA3, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xB5, 0x63, 0xB5, 0x64, 0x00, 0x00, /* 0x24-0x27 */ + 0xA5, 0xCE, 0xA5, 0xCF, 0xCA, 0x46, 0xA8, 0x6A, /* 0x28-0x2B */ + 0xA8, 0x69, 0xAC, 0xC7, 0xCF, 0xD9, 0xDA, 0xAC, /* 0x2C-0x2F */ + 0xA5, 0xD0, 0xA5, 0xD1, 0xA5, 0xD2, 0xA5, 0xD3, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x6B, /* 0x34-0x37 */ + 0xA8, 0x6C, 0xCB, 0x6E, 0xCB, 0x6D, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xAA, 0xB6, 0xCD, 0x72, 0xCD, 0x70, /* 0x3C-0x3F */ + 0xCD, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDA, /* 0x44-0x47 */ + 0xCF, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xAC, 0xCB, /* 0x48-0x4B */ + 0xAC, 0xC9, 0x00, 0x00, 0xAC, 0xCA, 0xAC, 0xC8, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xAF, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xAF, 0x64, 0xAF, 0x63, 0xD2, 0xC1, /* 0x58-0x5B */ + 0xAF, 0x62, 0xAF, 0x61, 0x00, 0x00, 0xD2, 0xC2, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xA6, 0xD6, 0x7B, /* 0x60-0x63 */ + 0xD6, 0x7A, 0xB2, 0xA4, 0xB2, 0xA5, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0x66, 0xB5, 0x65, /* 0x68-0x6B */ + 0xDA, 0xAE, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xAD, /* 0x6C-0x6F */ + 0xB2, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xB7, 0xED, 0xDE, 0xC5, /* 0x74-0x77 */ + 0xB7, 0xEE, 0xDE, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xE2, 0xD8, 0xE6, 0xAE, 0xBD, 0x42, /* 0x7C-0x7F */ + + 0xEA, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xED, 0x73, 0x00, 0x00, 0xC3, 0xA6, 0xC3, 0xA5, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0x7C, 0xA5, 0xD4, /* 0x88-0x8B */ + 0xCD, 0x73, 0x00, 0x00, 0x00, 0x00, 0xB2, 0xA8, /* 0x8C-0x8F */ + 0xE2, 0xD9, 0xBA, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCB, 0x6F, 0xCB, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCD, 0x74, 0xAA, 0xB8, 0xAA, 0xB9, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xAA, 0xB7, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xAC, 0xCF, 0xAC, 0xD0, /* 0xA0-0xA3 */ + 0xAC, 0xCD, 0xAC, 0xCE, 0x00, 0x00, 0xCF, 0xDC, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xCF, 0xDD, 0xAC, 0xCC, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xD2, 0xC3, 0x00, 0x00, 0xAF, 0x68, 0xAF, 0x69, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xB2, 0xAB, 0xD2, 0xC9, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xAF, 0x6E, 0xAF, 0x6C, 0xD2, 0xCA, 0xD2, 0xC5, /* 0xB8-0xBB */ + 0xAF, 0x6B, 0xAF, 0x6A, 0xAF, 0x65, 0xD2, 0xC8, /* 0xBC-0xBF */ + 0xD2, 0xC7, 0xD2, 0xC4, 0xAF, 0x6D, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xD2, 0xC6, 0xAF, 0x66, 0x00, 0x00, 0xAF, 0x67, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xAC, 0xD6, 0xA1, /* 0xC8-0xCB */ + 0xD6, 0xA2, 0xB2, 0xAD, 0xD6, 0x7C, 0xD6, 0x7E, /* 0xCC-0xCF */ + 0xD6, 0xA4, 0xD6, 0xA3, 0xD6, 0x7D, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB2, 0xA9, 0xB2, 0xAA, 0x00, 0x00, 0xDA, 0xB6, /* 0xD4-0xD7 */ + 0xB5, 0x6B, 0xB5, 0x6A, 0xDA, 0xB0, 0xB5, 0x68, /* 0xD8-0xDB 
*/ + 0x00, 0x00, 0xDA, 0xB3, 0xB5, 0x6C, 0xDA, 0xB4, /* 0xDC-0xDF */ + 0xB5, 0x6D, 0xDA, 0xB1, 0xB5, 0x67, 0xB5, 0x69, /* 0xE0-0xE3 */ + 0xDA, 0xB5, 0x00, 0x00, 0xDA, 0xB2, 0xDA, 0xAF, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xDE, 0xD2, 0x00, 0x00, 0xDE, 0xC7, /* 0xEC-0xEF */ + 0xB7, 0xF0, 0xB7, 0xF3, 0xB7, 0xF2, 0xB7, 0xF7, /* 0xF0-0xF3 */ + 0xB7, 0xF6, 0xDE, 0xD3, 0xDE, 0xD1, 0xDE, 0xCA, /* 0xF4-0xF7 */ + 0xDE, 0xCE, 0xDE, 0xCD, 0xB7, 0xF4, 0xDE, 0xD0, /* 0xF8-0xFB */ + 0xDE, 0xCC, 0xDE, 0xD4, 0xDE, 0xCB, 0xB7, 0xF5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_76[512] = { + 0xB7, 0xEF, 0xB7, 0xF1, 0x00, 0x00, 0xDE, 0xC9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xE2, 0xDB, 0xBA, 0xC7, 0xE2, 0xDF, 0xBA, 0xC6, /* 0x08-0x0B */ + 0xE2, 0xDC, 0xBA, 0xC5, 0x00, 0x00, 0xDE, 0xC8, /* 0x0C-0x0F */ + 0xDE, 0xCF, 0xE2, 0xDE, 0x00, 0x00, 0xBA, 0xC8, /* 0x10-0x13 */ + 0xE2, 0xE0, 0xE2, 0xDD, 0xE2, 0xDA, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xE6, 0xB1, 0xE6, 0xB5, 0xE6, 0xB7, /* 0x18-0x1B */ + 0xE6, 0xB3, 0xE6, 0xB2, 0xE6, 0xB0, 0xBD, 0x45, /* 0x1C-0x1F */ + 0xBD, 0x43, 0xBD, 0x48, 0xBD, 0x49, 0xE6, 0xB4, /* 0x20-0x23 */ + 0xBD, 0x46, 0xE6, 0xAF, 0xBD, 0x47, 0xBA, 0xC4, /* 0x24-0x27 */ + 0xE6, 0xB6, 0xBD, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEA, 0x6C, 0x00, 0x00, 0xEA, 0x6B, /* 0x2C-0x2F */ + 0xEA, 0x73, 0xEA, 0x6D, 0xEA, 0x72, 0xEA, 0x6F, /* 0x30-0x33 */ + 0xBF, 0x60, 0xEA, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xBF, 0x61, 0x00, 0x00, 0xBF, 0x62, 0x00, 0x00, /* 0x38-0x3B */ + 0xEA, 0x70, 0xEA, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xF8, 0xED, 0x74, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0xF7, 0xED, 0x77, /* 0x44-0x47 */ + 0xED, 0x75, 0xED, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xC0, 0xF9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF0, 0x4D, 0x00, 0x00, 0xC2, 0xA1, 0xF0, 0x4E, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0x7D, 0xF0, 0x4F, /* 0x54-0x57 */ + 0xC2, 0x7E, 0xF0, 0x4C, 0xF0, 0x50, 0x00, 0x00, /* 0x58-0x5B */ + 0xF0, 0x4A, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA7, /* 0x5C-0x5F */ + 0xF2, 0x78, 0xC3, 0xA8, 0xC4, 0x6F, 0x00, 0x00, /* 0x60-0x63 */ + 0xF0, 0x4B, 0xC4, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xC4, 0xEE, 0xF5, 0xDF, 0x00, 0x00, /* 0x68-0x6B */ + 0xC5, 0x7E, 0xF6, 0xF4, 0xC5, 0x7D, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF7, 0xEA, 0xC5, 0xF5, 0xC5, 0xF6, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xF9, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xAC, 0xD1, 0xCF, 0xDE, 0x00, 0x00, 0xB5, 0x6E, /* 0x78-0x7B */ + 0xB5, 0x6F, 0xA5, 0xD5, 0xA6, 0xCA, 0xCA, 0x47, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xCB, 0x71, 0xA8, 0x6D, 0x00, 0x00, /* 0x80-0x83 */ + 0xAA, 0xBA, 0x00, 0x00, 0xAC, 0xD2, 0xAC, 0xD3, /* 0x84-0x87 */ + 0xAC, 0xD4, 0xD6, 0xA6, 0xD2, 0xCB, 0xAF, 0x6F, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xAE, 0xD6, 0xA5, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB8, 0xB5, 0x71, /* 0x90-0x93 */ + 0x00, 0x00, 0xDA, 0xB7, 0xB5, 0x70, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xDE, 0xD5, 0xBD, 0x4A, 0xE6, 0xBB, /* 0x98-0x9B */ + 0xE6, 0xB8, 0xE6, 0xB9, 0xE6, 0xBA, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xED, 0x78, 0x00, 0x00, 0xF0, 0x51, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0x71, 0xF4, 0x70, /* 0xA8-0xAB */ + 0x00, 0x00, 0xF6, 0xF5, 0xA5, 0xD6, 0xCD, 0x75, /* 0xAC-0xAF */ + 
0xAF, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xB5, 0x72, 0xDE, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE2, 0xE1, 0x00, 0x00, 0xBD, 0x4B, 0xEA, 0x74, /* 0xB8-0xBB */ + 0x00, 0x00, 0xF0, 0x52, 0xF4, 0x72, 0xA5, 0xD7, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xAA, 0xBB, 0xAC, 0xD7, /* 0xC0-0xC3 */ + 0xCF, 0xDF, 0xAC, 0xD8, 0xAC, 0xD6, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xAC, 0xD5, 0xD2, 0xCC, 0xAF, 0x71, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xAF, 0x72, 0xAF, 0x73, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xB0, 0xD6, 0xA7, /* 0xD0-0xD3 */ + 0xB2, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xB9, 0xB2, 0xB1, /* 0xD8-0xDB */ + 0xB5, 0x73, 0xDE, 0xD7, 0xB7, 0xF8, 0xB7, 0xF9, /* 0xDC-0xDF */ + 0x00, 0x00, 0xBA, 0xC9, 0x00, 0x00, 0xBA, 0xCA, /* 0xE0-0xE3 */ + 0xBD, 0x4C, 0xBF, 0x64, 0xEA, 0x75, 0xBF, 0x63, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xED, 0x79, 0xC0, 0xFA, 0x00, 0x00, /* 0xE8-0xEB */ + 0xF0, 0x53, 0xF4, 0x73, 0xA5, 0xD8, 0xA8, 0x6E, /* 0xEC-0xEF */ + 0xCD, 0x78, 0xCD, 0x77, 0xAA, 0xBC, 0xCD, 0x76, /* 0xF0-0xF3 */ + 0xAA, 0xBD, 0xCD, 0x79, 0x00, 0x00, 0xCF, 0xE5, /* 0xF4-0xF7 */ + 0xAC, 0xDB, 0xAC, 0xDA, 0xCF, 0xE7, 0xCF, 0xE6, /* 0xF8-0xFB */ + 0xAC, 0xDF, 0x00, 0x00, 0xAC, 0xDE, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_77[512] = { + 0x00, 0x00, 0xAC, 0xD9, 0x00, 0x00, 0xCF, 0xE1, /* 0x00-0x03 */ + 0xCF, 0xE2, 0xCF, 0xE3, 0x00, 0x00, 0xAC, 0xE0, /* 0x04-0x07 */ + 0xCF, 0xE0, 0xAC, 0xDC, 0xCF, 0xE4, 0xAC, 0xDD, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xD2, 0xCF, 0xD2, 0xD3, 0xD2, 0xD1, 0xD2, 0xD0, /* 0x10-0x13 */ + 0x00, 0x00, 0xD2, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xD2, 0xD5, 0xD2, 0xD6, 0xD2, 0xCE, /* 0x18-0x1B */ + 0x00, 0x00, 0xD2, 0xCD, 0x00, 0x00, 0xAF, 0x75, /* 0x1C-0x1F */ + 0xAF, 0x76, 0x00, 0x00, 0xD2, 0xD7, 0xD2, 0xD2, /* 0x20-0x23 */ + 0x00, 0x00, 0xD6, 0xB0, 0x00, 0x00, 0xD2, 0xD8, /* 0x24-0x27 */ + 0xAF, 0x77, 0xAF, 0x74, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xD6, 0xAA, 0x00, 0x00, 0xD6, 0xA9, /* 0x2C-0x2F */ + 0x00, 0x00, 0xD6, 0xAB, 0xD6, 0xAC, 0xD6, 0xAE, /* 0x30-0x33 */ + 0xD6, 0xAD, 0xD6, 0xB2, 0xB2, 0xB5, 0xB2, 0xB2, /* 0x34-0x37 */ + 0xB2, 0xB6, 0xD6, 0xA8, 0xB2, 0xB7, 0xD6, 0xB1, /* 0x38-0x3B */ + 0xB2, 0xB4, 0xD6, 0xAF, 0xB2, 0xB3, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xDA, 0xBC, 0xDA, 0xBE, 0xDA, 0xBA, 0xDA, 0xBB, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xBF, 0xDA, 0xC1, /* 0x48-0x4B */ + 0xDA, 0xC2, 0xDA, 0xBD, 0xDA, 0xC0, 0xB5, 0x74, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xDE, 0xDB, 0x00, 0x00, /* 0x50-0x53 */ + 0xDE, 0xE0, 0xDE, 0xD8, 0xDE, 0xDC, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xDE, 0xE1, 0xDE, 0xDD, 0xB7, 0xFA, /* 0x58-0x5B */ + 0xB8, 0x43, 0x00, 0x00, 0xB7, 0xFD, 0xDE, 0xD9, /* 0x5C-0x5F */ + 0xDE, 0xDA, 0xBA, 0xCE, 0xB8, 0x46, 0xB7, 0xFE, /* 0x60-0x63 */ + 0x00, 0x00, 0xB8, 0x44, 0xB7, 0xFC, 0xDE, 0xDF, /* 0x64-0x67 */ + 0xB8, 0x45, 0xDE, 0xDE, 0xB8, 0x41, 0xB7, 0xFB, /* 0x68-0x6B */ + 0xB8, 0x42, 0xDE, 0xE2, 0xE2, 0xE6, 0xE2, 0xE8, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xB8, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xE2, 0xE3, 0xBA, 0xCC, 0xE2, 0xE9, 0xBA, 0xCD, /* 0x7C-0x7F */ + + 0xE2, 0xE7, 0xE2, 0xE2, 0xE2, 0xE5, 0xE2, 0xEA, /* 0x80-0x83 */ + 
0xBA, 0xCB, 0xE2, 0xE4, 0x00, 0x00, 0xBD, 0x4E, /* 0x84-0x87 */ + 0xE6, 0xBF, 0xE6, 0xBE, 0x00, 0x00, 0xBD, 0x51, /* 0x88-0x8B */ + 0xBD, 0x4F, 0xE6, 0xBC, 0xBD, 0x4D, 0xE6, 0xBD, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBD, 0x50, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xEA, 0x7D, 0x00, 0x00, 0xEA, 0xA1, /* 0x94-0x97 */ + 0x00, 0x00, 0xEA, 0x7E, 0xEA, 0x76, 0xEA, 0x7A, /* 0x98-0x9B */ + 0xEA, 0x79, 0xEA, 0x77, 0xBF, 0x66, 0xBF, 0x67, /* 0x9C-0x9F */ + 0xBF, 0x65, 0xEA, 0x78, 0xEA, 0x7B, 0xEA, 0x7C, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xBF, 0x68, 0x00, 0x00, 0xC1, 0x40, /* 0xA4-0xA7 */ + 0xED, 0xA3, 0x00, 0x00, 0xC0, 0xFC, 0xED, 0x7B, /* 0xA8-0xAB */ + 0xC0, 0xFE, 0xC1, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xC0, 0xFD, 0xED, 0xA2, 0xED, 0x7C, 0xC0, 0xFB, /* 0xB0-0xB3 */ + 0xED, 0xA1, 0xED, 0x7A, 0xED, 0x7E, 0xED, 0x7D, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0x55, 0xC2, 0xA4, /* 0xB8-0xBB */ + 0xC2, 0xA5, 0xC2, 0xA2, 0x00, 0x00, 0xC2, 0xA3, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xF0, 0x54, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xF2, 0x7B, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xA9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF2, 0x79, 0xF2, 0x7A, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF4, 0x74, 0xF4, 0x77, 0xF4, 0x75, 0xF4, 0x76, /* 0xCC-0xCF */ + 0xF5, 0xE0, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xEF, /* 0xD0-0xD3 */ + 0xF7, 0xEB, 0xF8, 0xB4, 0x00, 0x00, 0xC5, 0xF7, /* 0xD4-0xD7 */ + 0xF8, 0xF8, 0xF8, 0xF9, 0xC6, 0x66, 0xA5, 0xD9, /* 0xD8-0xDB */ + 0xAC, 0xE1, 0x00, 0x00, 0xDA, 0xC3, 0x00, 0x00, /* 0xDC-0xDF */ + 0xDE, 0xE3, 0x00, 0x00, 0xA5, 0xDA, 0xA8, 0x6F, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xAA, 0xBE, 0x00, 0x00, 0xCF, 0xE8, /* 0xE4-0xE7 */ + 0xCF, 0xE9, 0xAF, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xDA, 0xC4, 0xB5, 0x75, 0xB8, 0x47, 0xC1, 0x42, /* 0xEC-0xEF */ + 0xED, 0xA4, 0xF2, 0x7C, 0xF4, 0x78, 0xA5, 0xDB, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xA1, /* 0xF4-0xF7 */ + 0xCD, 0x7A, 0xCD, 0x7C, 0xCD, 0x7E, 0xCD, 0x7D, /* 0xF8-0xFB */ + 0xCD, 0x7B, 0xAA, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_78[512] = { + 0x00, 0x00, 0x00, 0x00, 0xAC, 0xE2, 0xCF, 0xF2, /* 0x00-0x03 */ + 0x00, 0x00, 0xCF, 0xED, 0xCF, 0xEA, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xCF, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xAC, 0xE4, 0xAC, 0xE5, 0xCF, 0xF0, 0xCF, 0xEF, /* 0x0C-0x0F */ + 0xCF, 0xEE, 0xCF, 0xEB, 0xCF, 0xEC, 0xCF, 0xF3, /* 0x10-0x13 */ + 0xAC, 0xE3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0xAF, 0x7C, 0x00, 0x00, 0xAF, 0xA4, /* 0x1C-0x1F */ + 0xAF, 0xA3, 0xD2, 0xE1, 0xD2, 0xDB, 0xD2, 0xD9, /* 0x20-0x23 */ + 0x00, 0x00, 0xAF, 0xA1, 0xD6, 0xB9, 0xAF, 0x7A, /* 0x24-0x27 */ + 0xD2, 0xDE, 0xD2, 0xE2, 0xD2, 0xE4, 0xD2, 0xE0, /* 0x28-0x2B */ + 0xD2, 0xDA, 0xAF, 0xA2, 0xD2, 0xDF, 0xD2, 0xDD, /* 0x2C-0x2F */ + 0xAF, 0x79, 0xD2, 0xE5, 0xAF, 0xA5, 0xD2, 0xE3, /* 0x30-0x33 */ + 0xAF, 0x7D, 0xD2, 0xDC, 0x00, 0x00, 0xAF, 0x7E, /* 0x34-0x37 */ + 0xAF, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB2, 0xB9, /* 0x40-0x43 */ + 0x00, 0x00, 0xD6, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xD6, 0xB3, 0xD6, 0xB5, 0xD6, 0xB7, 0x00, 0x00, /* 0x48-0x4B */ + 0xD6, 0xB8, 0xD6, 0xB6, 0xB2, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xD6, 0xBB, 0x00, 0x00, 0xD6, 0xB4, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0xDA, 0xC8, 0xB5, 0x76, 0xDA, 0xD0, 0x00, 0x00, /* 0x5C-0x5F */ + 0xDA, 0xC5, 0x00, 0x00, 0xDA, 0xD1, 0x00, 0x00, /* 0x60-0x63 */ + 0xDA, 0xC6, 0xDA, 0xC7, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xDA, 0xCF, 0xDA, 0xCE, 0xDA, 0xCB, 0xB2, 0xB8, /* 0x68-0x6B */ + 0xB5, 0x77, 0xDA, 0xC9, 0xDA, 0xCC, 0xB5, 0x78, /* 0x6C-0x6F */ + 0xDA, 0xCD, 0xDA, 0xCA, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xDE, 0xEE, 0x00, 0x00, 0xDE, 0xF2, /* 0x78-0x7B */ + 0xB8, 0x4E, 0x00, 0x00, 0xE2, 0xF0, 0xB8, 0x51, /* 0x7C-0x7F */ + + 0xDE, 0xF0, 0xF9, 0xD6, 0x00, 0x00, 0xDE, 0xED, /* 0x80-0x83 */ + 0xDE, 0xE8, 0xDE, 0xEA, 0xDE, 0xEB, 0xDE, 0xE4, /* 0x84-0x87 */ + 0x00, 0x00, 0xB8, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xB8, 0x4C, 0x00, 0x00, 0xB8, 0x48, 0xDE, 0xE7, /* 0x8C-0x8F */ + 0x00, 0x00, 0xB8, 0x4F, 0x00, 0x00, 0xB8, 0x50, /* 0x90-0x93 */ + 0xDE, 0xE6, 0xDE, 0xE9, 0xDE, 0xF1, 0xB8, 0x4A, /* 0x94-0x97 */ + 0xB8, 0x4B, 0xDE, 0xEF, 0xDE, 0xE5, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xE2, 0xF2, 0xBA, 0xD0, /* 0x9C-0x9F */ + 0xE2, 0xF4, 0xDE, 0xEC, 0xE2, 0xF6, 0xBA, 0xD4, /* 0xA0-0xA3 */ + 0xE2, 0xF7, 0xE2, 0xF3, 0x00, 0x00, 0xBA, 0xD1, /* 0xA4-0xA7 */ + 0xE2, 0xEF, 0xBA, 0xD3, 0xE2, 0xEC, 0xE2, 0xF1, /* 0xA8-0xAB */ + 0xE2, 0xF5, 0xE2, 0xEE, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xB8, 0x49, 0x00, 0x00, 0xE2, 0xEB, 0xBA, 0xD2, /* 0xB0-0xB3 */ + 0xE2, 0xED, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0x54, 0xE6, 0xC1, /* 0xB8-0xBB */ + 0xBD, 0x58, 0x00, 0x00, 0xBD, 0x56, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBA, 0xCF, 0x00, 0x00, 0xE6, 0xC8, /* 0xC0-0xC3 */ + 0xE6, 0xC9, 0xBD, 0x53, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE6, 0xC7, 0xE6, 0xCA, 0xBD, 0x55, 0xBD, 0x52, /* 0xC8-0xCB */ + 0xE6, 0xC3, 0xE6, 0xC0, 0xE6, 0xC5, 0xE6, 0xC2, /* 0xCC-0xCF */ + 0xBD, 0x59, 0xE6, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xE6, 0xC6, 0xBD, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0x6A, 0xEA, 0xA8, /* 0xD8-0xDB */ + 0x00, 0x00, 0xEA, 0xA2, 0xEA, 0xA6, 0xEA, 0xAC, /* 0xDC-0xDF */ + 0xEA, 0xAD, 0xEA, 0xA9, 0xEA, 0xAA, 0xEA, 0xA7, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xEA, 0xA4, 0x00, 0x00, 0xBF, 0x6C, /* 0xE4-0xE7 */ + 0xBF, 0x69, 0xEA, 0xA3, 0xEA, 0xA5, 0x00, 0x00, /* 0xE8-0xEB */ + 0xBF, 0x6B, 0xEA, 0xAB, 0x00, 0x00, 0xC1, 0x46, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xAA, 0xED, 0xA5, /* 0xF0-0xF3 */ + 0xC1, 0x45, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x43, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xED, 0xAC, 0xC1, 0x44, 0xED, 0xA8, /* 0xF8-0xFB */ + 0xED, 0xA9, 0xED, 0xA6, 0xED, 0xAD, 0xF0, 0x56, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_79[512] = { + 0x00, 0x00, 0xC1, 0x47, 0xED, 0xA7, 0x00, 0x00, /* 0x00-0x03 */ + 0xED, 0xAE, 0xED, 0xAB, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0xF0, 0x5A, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xF0, 0x57, 0x00, 0x00, 0xC2, 0xA6, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF0, 0x5B, 0xF0, 0x5D, 0xF0, 0x5C, 0xF0, 0x58, /* 0x10-0x13 */ + 0xF0, 0x59, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA3, /* 0x14-0x17 */ + 0x00, 0x00, 0xC3, 0xAA, 0x00, 0x00, 0xF2, 0x7E, /* 0x18-0x1B */ + 0xF2, 0xA2, 0xF2, 0x7D, 0xF2, 0xA4, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF2, 0xA1, 0x00, 0x00, 0xF4, 0x7A, /* 0x20-0x23 */ + 0xF4, 0x7D, 0xF4, 0x79, 0xC4, 0x71, 0xF4, 0x7B, /* 0x24-0x27 */ + 0xF4, 0x7C, 0xF4, 0x7E, 0xC4, 0x72, 0xC4, 0x74, /* 0x28-0x2B */ + 0xC4, 
0x73, 0xF5, 0xE1, 0x00, 0x00, 0xF5, 0xE3, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF5, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF6, 0xF6, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xF8, 0xB5, 0xF8, 0xFA, 0xA5, 0xDC, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xCB, 0x72, 0xAA, 0xC0, 0xCD, 0xA3, /* 0x3C-0x3F */ + 0xAA, 0xC1, 0xAA, 0xC2, 0xCD, 0xA2, 0x00, 0x00, /* 0x40-0x43 */ + 0xCF, 0xF8, 0xCF, 0xF7, 0xAC, 0xE6, 0xAC, 0xE9, /* 0x44-0x47 */ + 0xAC, 0xE8, 0xAC, 0xE7, 0xCF, 0xF4, 0xCF, 0xF6, /* 0x48-0x4B */ + 0xCF, 0xF5, 0x00, 0x00, 0x00, 0x00, 0xD2, 0xE8, /* 0x4C-0x4F */ + 0xAF, 0xA7, 0xD2, 0xEC, 0xD2, 0xEB, 0xD2, 0xEA, /* 0x50-0x53 */ + 0xD2, 0xE6, 0xAF, 0xA6, 0xAF, 0xAA, 0xAF, 0xAD, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xAF, 0xAE, 0xD2, 0xE7, /* 0x58-0x5B */ + 0xD2, 0xE9, 0xAF, 0xAC, 0xAF, 0xAB, 0xAF, 0xA9, /* 0x5C-0x5F */ + 0xAF, 0xA8, 0xD6, 0xC2, 0x00, 0x00, 0xD6, 0xC0, /* 0x60-0x63 */ + 0xD6, 0xBC, 0xB2, 0xBB, 0x00, 0x00, 0xD6, 0xBD, /* 0x64-0x67 */ + 0xB2, 0xBC, 0xD6, 0xBE, 0xD6, 0xBF, 0xD6, 0xC1, /* 0x68-0x6B */ + 0x00, 0x00, 0xB2, 0xBD, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDA, 0xD5, 0x00, 0x00, 0xDA, 0xD4, 0xDA, 0xD3, /* 0x70-0x73 */ + 0xDA, 0xD2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xDE, 0xF6, 0xB8, 0x52, 0x00, 0x00, /* 0x78-0x7B */ + 0xDE, 0xF3, 0xDE, 0xF5, 0x00, 0x00, 0xB8, 0x53, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xB8, 0x54, 0xDE, 0xF4, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE3, 0x41, 0x00, 0x00, 0xE2, 0xF9, 0xE2, 0xFA, /* 0x88-0x8B */ + 0x00, 0x00, 0xBA, 0xD7, 0xBA, 0xD5, 0xBA, 0xD6, /* 0x8C-0x8F */ + 0xE3, 0x43, 0x00, 0x00, 0xE3, 0x42, 0xE2, 0xFE, /* 0x90-0x93 */ + 0xE2, 0xFD, 0xE2, 0xFC, 0xE2, 0xFB, 0xE3, 0x40, /* 0x94-0x97 */ + 0xE2, 0xF8, 0x00, 0x00, 0xE6, 0xCB, 0xE6, 0xD0, /* 0x98-0x9B */ + 0xE6, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE6, 0xCD, 0xE6, 0xCC, 0xE6, 0xCF, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEA, 0xAE, 0x00, 0x00, 0xBF, 0x6D, 0xC1, 0x48, /* 0xA4-0xA7 */ + 0xED, 0xB0, 0x00, 0x00, 0xC1, 0x49, 0xED, 0xAF, /* 0xA8-0xAB */ + 0xF0, 0x5F, 0xF0, 0x5E, 0xC2, 0xA7, 0x00, 0x00, /* 0xAC-0xAF */ + 0xF2, 0xA5, 0xC3, 0xAB, 0xF4, 0xA1, 0xC5, 0xA1, /* 0xB0-0xB3 */ + 0xF6, 0xF7, 0x00, 0x00, 0xF8, 0xB7, 0xF8, 0xB6, /* 0xB4-0xB7 */ + 0xC9, 0xA8, 0xAC, 0xEA, 0xAC, 0xEB, 0xD6, 0xC3, /* 0xB8-0xBB */ + 0x00, 0x00, 0xB8, 0x56, 0xA5, 0xDD, 0xA8, 0x72, /* 0xBC-0xBF */ + 0xA8, 0x71, 0xA8, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xCD, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xAA, 0xC4, 0xAA, 0xC3, 0x00, 0x00, 0xAC, 0xEE, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCF, 0xFA, 0xCF, 0xFD, 0xCF, 0xFB, /* 0xCC-0xCF */ + 0x00, 0x00, 0xAC, 0xEC, 0xAC, 0xED, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xCF, 0xF9, 0xCF, 0xFC, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xAF, 0xB5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xD2, 0xF3, 0xD2, 0xF5, 0xD2, 0xF4, 0xAF, 0xB2, /* 0xDC-0xDF */ + 0xD2, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xAF, 0xB0, /* 0xE0-0xE3 */ + 0xAF, 0xAF, 0x00, 0x00, 0xAF, 0xB3, 0xAF, 0xB1, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xAF, 0xB4, 0xD2, 0xF2, 0xD2, 0xED, /* 0xE8-0xEB */ + 0xD2, 0xEE, 0xD2, 0xF1, 0xD2, 0xF0, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC6, 0xD6, 0xC7, /* 0xF4-0xF7 */ + 0xD6, 0xC5, 0x00, 0x00, 0xD6, 0xC4, 0xB2, 0xBE, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7A[512] = { + 0xB5, 
0x7D, 0x00, 0x00, 0xDA, 0xD6, 0xDA, 0xD8, /* 0x00-0x03 */ + 0xDA, 0xDA, 0xB5, 0x7C, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xB5, 0x7A, 0x00, 0x00, 0xDA, 0xD7, 0xB5, 0x7B, /* 0x08-0x0B */ + 0xDA, 0xD9, 0xB5, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xDF, 0x41, 0xDE, 0xF7, 0xDE, 0xFA, 0xDE, 0xFE, /* 0x10-0x13 */ + 0xB8, 0x5A, 0xDE, 0xFC, 0x00, 0x00, 0xDE, 0xFB, /* 0x14-0x17 */ + 0xDE, 0xF8, 0xDE, 0xF9, 0xB8, 0x58, 0xDF, 0x40, /* 0x18-0x1B */ + 0xB8, 0x57, 0x00, 0x00, 0xB8, 0x5C, 0xB8, 0x5B, /* 0x1C-0x1F */ + 0xB8, 0x59, 0x00, 0x00, 0xDE, 0xFD, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x49, 0x00, 0x00, /* 0x24-0x27 */ + 0xE3, 0x48, 0x00, 0x00, 0x00, 0x00, 0xE3, 0x44, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0xD8, 0xE3, 0x47, /* 0x2C-0x2F */ + 0xE3, 0x46, 0xBA, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBD, 0x5E, /* 0x34-0x37 */ + 0x00, 0x00, 0xE6, 0xD2, 0x00, 0x00, 0xBD, 0x5F, /* 0x38-0x3B */ + 0xBD, 0x5B, 0xBD, 0x5D, 0x00, 0x00, 0xBD, 0x5A, /* 0x3C-0x3F */ + 0xBD, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xEA, 0xAF, 0x00, 0x00, 0xBF, 0x70, 0xEA, 0xB1, /* 0x44-0x47 */ + 0xEA, 0xB0, 0x00, 0x00, 0xE3, 0x45, 0xBF, 0x72, /* 0x48-0x4B */ + 0xBF, 0x71, 0xBF, 0x6E, 0xBF, 0x6F, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xED, 0xB5, 0x00, 0x00, 0xED, 0xB3, 0xC1, 0x4A, /* 0x54-0x57 */ + 0xED, 0xB4, 0x00, 0x00, 0xED, 0xB6, 0xED, 0xB2, /* 0x58-0x5B */ + 0xED, 0xB1, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x60, /* 0x5C-0x5F */ + 0xC2, 0xAA, 0xC2, 0xA8, 0xC2, 0xA9, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA6, /* 0x64-0x67 */ + 0xF2, 0xA7, 0xC3, 0xAD, 0x00, 0x00, 0xC3, 0xAC, /* 0x68-0x6B */ + 0xF4, 0xA3, 0xF4, 0xA4, 0xF4, 0xA2, 0x00, 0x00, /* 0x6C-0x6F */ + 0xF6, 0xF8, 0xF6, 0xF9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xA5, 0xDE, 0xCA, 0x48, 0xA8, 0x73, 0x00, 0x00, /* 0x74-0x77 */ + 0xCD, 0xA5, 0xAA, 0xC6, 0xAA, 0xC5, 0xCD, 0xA6, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0x40, 0xAC, 0xEF, /* 0x7C-0x7F */ + + 0xCF, 0xFE, 0xAC, 0xF0, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xAF, 0xB6, 0xD2, 0xF8, 0xD2, 0xF6, 0xD2, 0xFC, /* 0x84-0x87 */ + 0xAF, 0xB7, 0xD2, 0xF7, 0xD2, 0xFB, 0xD2, 0xF9, /* 0x88-0x8B */ + 0xD2, 0xFA, 0x00, 0x00, 0x00, 0x00, 0xD6, 0xC8, /* 0x8C-0x8F */ + 0xD6, 0xCA, 0x00, 0x00, 0xB2, 0xBF, 0x00, 0x00, /* 0x90-0x93 */ + 0xD6, 0xC9, 0xB2, 0xC0, 0xB5, 0xA2, 0xB5, 0xA1, /* 0x94-0x97 */ + 0xB5, 0x7E, 0xDA, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0x44, 0xB8, 0x5D, /* 0x9C-0x9F */ + 0xB8, 0x5E, 0x00, 0x00, 0xDF, 0x43, 0xDF, 0x42, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xE3, 0x4A, 0xBA, 0xDB, 0xBA, 0xDA, 0xE3, 0x4B, /* 0xA8-0xAB */ + 0xE3, 0x4C, 0x00, 0x00, 0xBD, 0x61, 0xBD, 0x60, /* 0xAC-0xAF */ + 0x00, 0x00, 0xEA, 0xB5, 0xE6, 0xD3, 0xE6, 0xD5, /* 0xB0-0xB3 */ + 0xE6, 0xD4, 0xEA, 0xB4, 0xEA, 0xB2, 0xEA, 0xB6, /* 0xB4-0xB7 */ + 0xEA, 0xB3, 0x00, 0x00, 0xBF, 0x73, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xED, 0xB7, 0xC1, 0x4B, /* 0xBC-0xBF */ + 0xED, 0xB8, 0xED, 0xB9, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xC2, 0xAB, 0xC2, 0xAC, 0x00, 0x00, 0xC4, 0x75, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0xD1, 0xA5, 0xDF, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD0, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 
*/ + 0xD2, 0xFD, 0xAF, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xBA, /* 0xDC-0xDF */ + 0xB3, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xB5, 0xA4, /* 0xE0-0xE3 */ + 0xDA, 0xDD, 0xB5, 0xA3, 0xDA, 0xDC, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x45, /* 0xE8-0xEB */ + 0x00, 0x00, 0xBA, 0xDC, 0xE3, 0x4D, 0xBA, 0xDD, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xC4, 0x76, 0xF4, 0xA5, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xA6, 0xCB, 0xAA, 0xC7, 0xCD, 0xA7, /* 0xF8-0xFB */ + 0x00, 0x00, 0xAC, 0xF2, 0x00, 0x00, 0xAC, 0xF1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7B[512] = { + 0xD0, 0x42, 0xD0, 0x43, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xD3, 0x40, 0xD3, 0x42, 0xAF, 0xB9, 0x00, 0x00, /* 0x04-0x07 */ + 0xD3, 0x44, 0xD3, 0x47, 0xD3, 0x45, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0x46, 0xD3, 0x43, /* 0x0C-0x0F */ + 0xD2, 0xFE, 0xAF, 0xBA, 0xD3, 0x48, 0xD3, 0x41, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xD6, 0xD3, 0xB2, 0xC6, 0xD6, 0xDC, 0xB2, 0xC3, /* 0x18-0x1B */ + 0x00, 0x00, 0xD6, 0xD5, 0xB2, 0xC7, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB2, 0xC1, 0x00, 0x00, 0xD6, 0xD0, 0xD6, 0xDD, /* 0x20-0x23 */ + 0xD6, 0xD1, 0xD6, 0xCE, 0xB2, 0xC5, 0x00, 0x00, /* 0x24-0x27 */ + 0xB2, 0xC2, 0x00, 0x00, 0xD6, 0xD4, 0xD6, 0xD7, /* 0x28-0x2B */ + 0xB2, 0xC4, 0xD6, 0xD8, 0xB2, 0xC8, 0xD6, 0xD9, /* 0x2C-0x2F */ + 0xD6, 0xCF, 0xD6, 0xD6, 0xD6, 0xDA, 0xD6, 0xD2, /* 0x30-0x33 */ + 0xD6, 0xCD, 0xD6, 0xCB, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xD6, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xDA, 0xDF, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0xDA, 0xE4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xDA, 0xE0, 0xDA, 0xE6, 0xB5, 0xA7, 0xD6, 0xCC, /* 0x44-0x47 */ + 0xDA, 0xE1, 0xB5, 0xA5, 0xDA, 0xDE, 0xB5, 0xAC, /* 0x48-0x4B */ + 0xDA, 0xE2, 0xB5, 0xAB, 0xDA, 0xE3, 0xB5, 0xAD, /* 0x4C-0x4F */ + 0xB5, 0xA8, 0xB5, 0xAE, 0xB5, 0xA9, 0x00, 0x00, /* 0x50-0x53 */ + 0xB5, 0xAA, 0x00, 0x00, 0xB5, 0xA6, 0x00, 0x00, /* 0x54-0x57 */ + 0xDA, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB8, 0x61, 0xDF, 0x50, 0x00, 0x00, 0xDF, 0x53, /* 0x60-0x63 */ + 0xDF, 0x47, 0xDF, 0x4C, 0xDF, 0x46, 0xB8, 0x63, /* 0x64-0x67 */ + 0x00, 0x00, 0xDF, 0x4A, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xDF, 0x48, 0xB8, 0x62, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDF, 0x4F, 0xDF, 0x4E, 0xDF, 0x4B, 0xDF, 0x4D, /* 0x70-0x73 */ + 0xDF, 0x49, 0xBA, 0xE1, 0xDF, 0x52, 0xB8, 0x5F, /* 0x74-0x77 */ + 0xDF, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x5D, 0x00, 0x00, /* 0x80-0x83 */ + 0xBA, 0xE8, 0xE3, 0x58, 0x00, 0x00, 0xBA, 0xE7, /* 0x84-0x87 */ + 0xE3, 0x4E, 0x00, 0x00, 0xE3, 0x50, 0xBA, 0xE0, /* 0x88-0x8B */ + 0xE3, 0x55, 0xE3, 0x54, 0xE3, 0x57, 0xBA, 0xE5, /* 0x8C-0x8F */ + 0xE3, 0x52, 0xE3, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xBA, 0xE4, 0xBA, 0xDF, 0xE3, 0x53, 0xBA, 0xE2, /* 0x94-0x97 */ + 0xE3, 0x59, 0xE3, 0x5B, 0x00, 0x00, 0xE3, 0x56, /* 0x98-0x9B */ + 0xE3, 0x4F, 0xBA, 0xE3, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xBD, 0x69, 0xBA, 0xDE, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE3, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 
0xE6, 0xD9, 0xBD, 0x62, 0x00, 0x00, 0xE6, 0xDB, /* 0xAC-0xAF */ + 0x00, 0x00, 0xBD, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xBD, 0x65, 0xE6, 0xDE, 0x00, 0x00, 0xE6, 0xD6, /* 0xB4-0xB7 */ + 0xBA, 0xE6, 0xE6, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xD8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xB8, 0x60, 0xBD, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xBD, 0x64, 0x00, 0x00, 0xBD, 0x66, 0xBD, 0x67, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xBF, 0x76, 0xE6, 0xDD, 0xE6, 0xD7, /* 0xC8-0xCB */ + 0xBD, 0x6A, 0x00, 0x00, 0xE6, 0xDA, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0xC0, 0xEA, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xEA, 0xC5, 0xBF, 0x74, 0xEA, 0xBD, 0xBF, 0x78, /* 0xD8-0xDB */ + 0xEA, 0xC3, 0xEA, 0xBA, 0xEA, 0xB7, 0xEA, 0xC6, /* 0xDC-0xDF */ + 0xC1, 0x51, 0xBF, 0x79, 0xEA, 0xC2, 0xEA, 0xB8, /* 0xE0-0xE3 */ + 0xBF, 0x77, 0xEA, 0xBC, 0xBF, 0x7B, 0xEA, 0xB9, /* 0xE4-0xE7 */ + 0xEA, 0xBE, 0xBF, 0x7A, 0xEA, 0xC1, 0xEA, 0xC4, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xED, 0xCB, 0xED, 0xCC, 0xED, 0xBC, 0xED, 0xC3, /* 0xF0-0xF3 */ + 0xED, 0xC1, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x4F, /* 0xF4-0xF7 */ + 0xED, 0xC8, 0xEA, 0xBF, 0x00, 0x00, 0xED, 0xBF, /* 0xF8-0xFB */ + 0x00, 0x00, 0xED, 0xC9, 0xC1, 0x4E, 0xED, 0xBE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7C[512] = { + 0xED, 0xBD, 0xED, 0xC7, 0xED, 0xC4, 0xED, 0xC6, /* 0x00-0x03 */ + 0x00, 0x00, 0xED, 0xBA, 0xED, 0xCA, 0xC1, 0x4C, /* 0x04-0x07 */ + 0x00, 0x00, 0xED, 0xC5, 0xED, 0xCE, 0xED, 0xC2, /* 0x08-0x0B */ + 0xC1, 0x50, 0xC1, 0x4D, 0xED, 0xC0, 0xED, 0xBB, /* 0x0C-0x0F */ + 0xED, 0xCD, 0xBF, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xF0, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xF0, 0x61, 0xF0, 0x67, 0xC2, 0xB0, 0xF0, 0x65, /* 0x1C-0x1F */ + 0xF0, 0x64, 0xC2, 0xB2, 0xF0, 0x6A, 0xC2, 0xB1, /* 0x20-0x23 */ + 0x00, 0x00, 0xF0, 0x6B, 0xF0, 0x68, 0xC2, 0xAE, /* 0x24-0x27 */ + 0xF0, 0x69, 0xF0, 0x62, 0xC2, 0xAF, 0xC2, 0xAD, /* 0x28-0x2B */ + 0xF2, 0xAB, 0xF0, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF0, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xA8, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC3, 0xB2, /* 0x34-0x37 */ + 0xC3, 0xB0, 0xF2, 0xAA, 0x00, 0x00, 0xF2, 0xAC, /* 0x38-0x3B */ + 0xF2, 0xA9, 0xC3, 0xB1, 0xC3, 0xAE, 0xC3, 0xAF, /* 0x3C-0x3F */ + 0xC3, 0xB3, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x78, /* 0x40-0x43 */ + 0x00, 0x00, 0xF4, 0xAA, 0x00, 0x00, 0xF4, 0xA9, /* 0x44-0x47 */ + 0xF4, 0xA7, 0xF4, 0xA6, 0xF4, 0xA8, 0x00, 0x00, /* 0x48-0x4B */ + 0xC4, 0x77, 0xC4, 0x79, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xC4, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xF5, 0xE5, /* 0x50-0x53 */ + 0xF5, 0xE4, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xFA, /* 0x54-0x57 */ + 0x00, 0x00, 0xF6, 0xFC, 0xF6, 0xFE, 0xF6, 0xFD, /* 0x58-0x5B */ + 0xF6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xA3, /* 0x5C-0x5F */ + 0xC5, 0xA2, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xD3, /* 0x60-0x63 */ + 0xC5, 0xD2, 0xC5, 0xD4, 0xF7, 0xED, 0xF7, 0xEC, /* 0x64-0x67 */ + 0x00, 0x00, 0xF8, 0xFB, 0xF8, 0xB8, 0xF8, 0xFC, /* 0x68-0x6B */ + 0xC6, 0x58, 0x00, 0x00, 0xC6, 0x59, 0xF9, 0x6D, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xC6, 0x7E, 0xA6, 0xCC, /* 0x70-0x73 */ + 0x00, 0x00, 0xCD, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0xD0, 0x45, 0xD0, 0x46, 0xD0, 0x44, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xAC, 0xF3, 0x00, 0x00, 0xD0, 0x47, /* 0x7C-0x7F */ + + 
0xD0, 0x48, 0xD0, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xD3, 0x49, 0xD3, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xD3, 0x4D, 0xAF, 0xBB, 0xD3, 0x4B, 0x00, 0x00, /* 0x88-0x8B */ + 0xD3, 0x4C, 0xD3, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD3, 0x4A, 0xB2, 0xC9, 0x00, 0x00, /* 0x90-0x93 */ + 0xD6, 0xDE, 0xB2, 0xCB, 0xD6, 0xE0, 0xB2, 0xCA, /* 0x94-0x97 */ + 0xD6, 0xDF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xE8, 0xB5, 0xAF, /* 0x9C-0x9F */ + 0x00, 0x00, 0xDA, 0xEA, 0xDA, 0xE7, 0xD6, 0xE1, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xB5, 0xB0, 0x00, 0x00, 0xF9, 0xDB, /* 0xA4-0xA7 */ + 0xDA, 0xE9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x56, /* 0xAC-0xAF */ + 0x00, 0x00, 0xB8, 0x64, 0xDF, 0x54, 0xB8, 0x65, /* 0xB0-0xB3 */ + 0xDF, 0x55, 0xB8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xBA, 0xE9, 0xE3, 0x61, 0xE3, 0x5E, /* 0xB8-0xBB */ + 0xE3, 0x60, 0xBA, 0xEA, 0xBA, 0xEB, 0xE3, 0x5F, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xE6, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xE6, 0xE0, 0x00, 0x00, 0xBD, 0x6B, 0xE6, 0xE2, /* 0xC8-0xCB */ + 0xE6, 0xE1, 0x00, 0x00, 0xA2, 0x61, 0x00, 0x00, /* 0xCC-0xCF */ + 0xEA, 0xCA, 0xEA, 0xCB, 0xEA, 0xC7, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xEA, 0xC8, 0xBF, 0x7C, 0xBF, 0x7D, 0xEA, 0xC9, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xC1, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC1, 0x53, 0xC1, 0x58, 0xC1, 0x54, 0xC1, 0x56, /* 0xDC-0xDF */ + 0xC1, 0x52, 0x00, 0x00, 0xC1, 0x55, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC2, 0xB3, /* 0xE4-0xE7 */ + 0xED, 0xCF, 0x00, 0x00, 0xF2, 0xAE, 0x00, 0x00, /* 0xE8-0xEB */ + 0xF2, 0xAD, 0x00, 0x00, 0xF4, 0xAB, 0xC4, 0x7A, /* 0xEC-0xEF */ + 0xC4, 0x7B, 0xF7, 0x41, 0xF5, 0xE6, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xF7, 0x40, 0x00, 0x00, 0xF8, 0xFD, 0xF9, 0xA4, /* 0xF4-0xF7 */ + 0xA6, 0xCD, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x74, /* 0xF8-0xFB */ + 0x00, 0x00, 0xCD, 0xA9, 0xAA, 0xC8, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_7D[512] = { + 0xAC, 0xF6, 0xD0, 0x4C, 0xAC, 0xF4, 0xD0, 0x4A, /* 0x00-0x03 */ + 0xAC, 0xF9, 0xAC, 0xF5, 0xAC, 0xFA, 0xAC, 0xF8, /* 0x04-0x07 */ + 0xD0, 0x4B, 0xAC, 0xF7, 0xAF, 0xBF, 0xAF, 0xBE, /* 0x08-0x0B */ + 0xD3, 0x5A, 0xAF, 0xC7, 0xD3, 0x53, 0xD3, 0x59, /* 0x0C-0x0F */ + 0xAF, 0xC3, 0xD3, 0x52, 0xD3, 0x58, 0xD3, 0x56, /* 0x10-0x13 */ + 0xAF, 0xC2, 0xAF, 0xC4, 0xD3, 0x55, 0xAF, 0xBD, /* 0x14-0x17 */ + 0xD3, 0x54, 0xAF, 0xC8, 0xAF, 0xC5, 0xAF, 0xC9, /* 0x18-0x1B */ + 0xAF, 0xC6, 0xD3, 0x51, 0xD3, 0x50, 0xD3, 0x57, /* 0x1C-0x1F */ + 0xAF, 0xC0, 0xAF, 0xBC, 0xAF, 0xC1, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xD6, 0xF0, 0xD6, 0xE9, 0x00, 0x00, 0xB5, 0xB5, /* 0x28-0x2B */ + 0xD6, 0xE8, 0x00, 0x00, 0xB2, 0xCF, 0xB2, 0xD6, /* 0x2C-0x2F */ + 0xB2, 0xD3, 0xB2, 0xD9, 0xB2, 0xD8, 0xB2, 0xD4, /* 0x30-0x33 */ + 0x00, 0x00, 0xD6, 0xE2, 0xD6, 0xE5, 0x00, 0x00, /* 0x34-0x37 */ + 0xD6, 0xE4, 0xB2, 0xD0, 0xD6, 0xE6, 0xD6, 0xEF, /* 0x38-0x3B */ + 0xB2, 0xD1, 0xD6, 0xE3, 0xD6, 0xEC, 0xD6, 0xED, /* 0x3C-0x3F */ + 0xB2, 0xD2, 0xD6, 0xEA, 0xB2, 0xD7, 0xB2, 0xCD, /* 0x40-0x43 */ + 0xB2, 0xD5, 0xD6, 0xE7, 0xB2, 0xCC, 0xD6, 0xEB, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xD6, 0xEE, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xDA, 0xFB, 0xDA, 0xF2, /* 0x4C-0x4F */ + 0xB5, 0xB2, 0xDA, 0xF9, 0xDA, 0xF6, 0xDA, 0xEE, /* 0x50-0x53 */ + 0xDA, 
0xF7, 0xB5, 0xB4, 0xDA, 0xEF, 0x00, 0x00, /* 0x54-0x57 */ + 0xDA, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xB8, 0x6C, /* 0x58-0x5B */ + 0xDA, 0xF4, 0x00, 0x00, 0xB5, 0xB1, 0xDA, 0xFA, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB5, 0xB8, 0xB5, 0xBA, 0xDA, 0xED, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0xB9, 0xDA, 0xF0, /* 0x64-0x67 */ + 0xB5, 0xB3, 0xDA, 0xF8, 0xDA, 0xF1, 0xDA, 0xF5, /* 0x68-0x6B */ + 0x00, 0x00, 0xDA, 0xF3, 0xB5, 0xB6, 0xDA, 0xEC, /* 0x6C-0x6F */ + 0xB5, 0xBB, 0xB2, 0xCE, 0xB5, 0xB7, 0xB5, 0xBC, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xB8, 0x68, 0xDF, 0x5D, 0xDF, 0x5F, /* 0x78-0x7B */ + 0xDF, 0x61, 0xDF, 0x65, 0x00, 0x00, 0xDF, 0x5B, /* 0x7C-0x7F */ + + 0xDF, 0x59, 0xB8, 0x6A, 0x00, 0x00, 0xDF, 0x60, /* 0x80-0x83 */ + 0xDF, 0x64, 0xDF, 0x5C, 0xDF, 0x58, 0x00, 0x00, /* 0x84-0x87 */ + 0xDF, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0xDF, 0x62, 0xDF, 0x5A, 0xDF, 0x5E, 0xB8, 0x6B, /* 0x8C-0x8F */ + 0x00, 0x00, 0xB8, 0x69, 0xDF, 0x66, 0xB8, 0x67, /* 0x90-0x93 */ + 0xDF, 0x63, 0x00, 0x00, 0xE3, 0x72, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xBA, 0xEE, 0xE3, 0x6A, 0xBD, 0x78, 0xE3, 0x74, /* 0x9C-0x9F */ + 0xBA, 0xF1, 0xE3, 0x78, 0xBA, 0xF7, 0xE3, 0x65, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0x75, 0xE3, 0x62, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xE3, 0x77, 0xE3, 0x66, 0x00, 0x00, /* 0xA8-0xAB */ + 0xBA, 0xFE, 0xBA, 0xFB, 0xE3, 0x76, 0xE3, 0x70, /* 0xAC-0xAF */ + 0xBA, 0xED, 0xBA, 0xF5, 0xBA, 0xF4, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xBA, 0xF3, 0xBA, 0xF9, 0x00, 0x00, 0xE3, 0x63, /* 0xB4-0xB7 */ + 0xBA, 0xFA, 0xE3, 0x71, 0xBA, 0xF6, 0xBA, 0xEC, /* 0xB8-0xBB */ + 0xE3, 0x73, 0xBA, 0xEF, 0xBA, 0xF0, 0xBA, 0xF8, /* 0xBC-0xBF */ + 0xE3, 0x68, 0xE3, 0x67, 0xE3, 0x64, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xE3, 0x6C, 0xE3, 0x69, 0xE3, 0x6D, 0xBA, 0xFD, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE3, 0x79, 0xBA, 0xF2, 0xE3, 0x6E, /* 0xC8-0xCB */ + 0xE3, 0x6F, 0x00, 0x00, 0xE3, 0x6B, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xBA, 0xFC, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE7, /* 0xD4-0xD7 */ + 0xBD, 0x70, 0xBD, 0x79, 0xBD, 0x75, 0xE6, 0xE4, /* 0xD8-0xDB */ + 0x00, 0x00, 0xBD, 0x72, 0xBD, 0x76, 0xE6, 0xF0, /* 0xDC-0xDF */ + 0xBD, 0x6C, 0xE6, 0xE8, 0x00, 0x00, 0xBD, 0x74, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xE6, 0xEB, 0xE6, 0xE6, /* 0xE4-0xE7 */ + 0xBD, 0x73, 0xBD, 0x77, 0xE6, 0xE5, 0x00, 0x00, /* 0xE8-0xEB */ + 0xBD, 0x71, 0x00, 0x00, 0xE6, 0xEF, 0xBD, 0x6E, /* 0xEC-0xEF */ + 0xE6, 0xEE, 0xE6, 0xED, 0xBD, 0x7A, 0xE5, 0x72, /* 0xF0-0xF3 */ + 0xBD, 0x6D, 0x00, 0x00, 0xE6, 0xEC, 0xE6, 0xE3, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xBD, 0x7B, 0xE6, 0xEA, 0xBD, 0x6F, /* 0xF8-0xFB */ +}; + +static const unsigned char u2c_7E[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xE9, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0xBF, 0xA2, 0xBF, 0xA7, 0xBF, 0x7E, 0xEA, 0xD8, /* 0x08-0x0B */ + 0xEA, 0xCF, 0xEA, 0xDB, 0xEA, 0xD3, 0xEA, 0xD9, /* 0x0C-0x0F */ + 0xBF, 0xA8, 0xBF, 0xA1, 0xEA, 0xCC, 0xEA, 0xD2, /* 0x10-0x13 */ + 0xEA, 0xDC, 0xEA, 0xD5, 0xEA, 0xDA, 0xEA, 0xCE, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xD6, 0xBF, 0xA3, /* 0x18-0x1B */ + 0xEA, 0xD4, 0xBF, 0xA6, 0xBF, 0xA5, 0xEA, 0xD0, /* 0x1C-0x1F */ + 0xEA, 0xD1, 0xEA, 0xCD, 0xEA, 0xD7, 0xBF, 0xA4, /* 0x20-0x23 */ + 0xEA, 0xDE, 0xEA, 0xDD, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xED, 0xDA, 0xED, 0xD6, 0xC1, 0x5F, /* 0x28-0x2B */ + 0x00, 
0x00, 0xED, 0xD0, 0xC1, 0x59, 0xC1, 0x69, /* 0x2C-0x2F */ + 0xED, 0xDC, 0xC1, 0x61, 0xC1, 0x5D, 0xED, 0xD3, /* 0x30-0x33 */ + 0xC1, 0x64, 0xC1, 0x67, 0xED, 0xDE, 0xC1, 0x5C, /* 0x34-0x37 */ + 0xED, 0xD5, 0xC1, 0x65, 0xED, 0xE0, 0xED, 0xDD, /* 0x38-0x3B */ + 0xED, 0xD1, 0xC1, 0x60, 0xC1, 0x5A, 0xC1, 0x68, /* 0x3C-0x3F */ + 0xED, 0xD8, 0xC1, 0x63, 0xED, 0xD2, 0xC1, 0x5E, /* 0x40-0x43 */ + 0xED, 0xDF, 0xC1, 0x62, 0xC1, 0x5B, 0xED, 0xD9, /* 0x44-0x47 */ + 0xC1, 0x66, 0xED, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xED, 0xDB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF0, 0x6E, 0xF0, 0x74, 0xC2, 0xB9, 0xF0, 0x77, /* 0x50-0x53 */ + 0xC2, 0xB4, 0xC2, 0xB5, 0xF0, 0x6F, 0xF0, 0x76, /* 0x54-0x57 */ + 0xF0, 0x71, 0xC2, 0xBA, 0xC2, 0xB7, 0x00, 0x00, /* 0x58-0x5B */ + 0xF0, 0x6D, 0x00, 0x00, 0xC2, 0xB6, 0xF0, 0x73, /* 0x5C-0x5F */ + 0xF0, 0x75, 0xC2, 0xB8, 0xF0, 0x72, 0xF0, 0x70, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF2, 0xB8, 0xC3, 0xB7, 0xC3, 0xB8, 0xC3, 0xB4, /* 0x68-0x6B */ + 0x00, 0x00, 0xC3, 0xB5, 0x00, 0x00, 0xF2, 0xB4, /* 0x6C-0x6F */ + 0xF2, 0xB2, 0x00, 0x00, 0xF2, 0xB6, 0xC3, 0xBA, /* 0x70-0x73 */ + 0xF2, 0xB7, 0xF2, 0xB0, 0xF2, 0xAF, 0xF2, 0xB3, /* 0x74-0x77 */ + 0xF2, 0xB1, 0xC3, 0xB6, 0xF2, 0xB5, 0xF4, 0xAC, /* 0x78-0x7B */ + 0xC4, 0x7E, 0xC4, 0x7D, 0xF4, 0xAD, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xF4, 0xAF, 0xF4, 0xAE, 0xC4, 0xA1, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0xEB, 0xF5, 0xE8, /* 0x84-0x87 */ + 0xF5, 0xE9, 0x00, 0x00, 0xF5, 0xE7, 0xF5, 0xEA, /* 0x88-0x8B */ + 0xC4, 0xF2, 0xF5, 0xEC, 0x00, 0x00, 0xC4, 0xF1, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF7, 0x42, 0x00, 0x00, 0xC5, 0xD5, /* 0x90-0x93 */ + 0xC5, 0xD7, 0xF7, 0xEE, 0xC5, 0xD6, 0xF8, 0xB9, /* 0x94-0x97 */ + 0xF9, 0x40, 0xF9, 0x42, 0xF8, 0xFE, 0xF9, 0x41, /* 0x98-0x9B */ + 0xC6, 0x6C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_7F[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xA6, 0xCE, 0x00, 0x00, /* 0x34-0x37 */ + 0xAC, 0xFB, 0xD2, 0x6F, 0xAF, 0xCA, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB2, 0xDA, 0xDA, 0xFC, 0xDA, 0xFD, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEA, 0xDF, /* 0x40-0x43 */ + 0xC1, 0x6A, 0xED, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xC2, 0xBB, 0x00, 0x00, 0xF2, 0xBA, 0xF2, 0xB9, /* 0x48-0x4B */ + 0xC4, 0xA2, 0xF5, 0xED, 0x00, 0x00, 0xF7, 0x43, /* 0x4C-0x4F */ + 0xC5, 0xF8, 0xCA, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xAA, 0xC9, 0xA8, 0x75, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xD0, 0x4D, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x60, /* 0x58-0x5B */ + 0xD3, 0x5B, 0xD3, 0x5F, 0xD3, 0x5D, 0xAF, 0xCB, /* 0x5C-0x5F */ + 0xD3, 
0x5E, 0xD3, 0x5C, 0x00, 0x00, 0xD6, 0xF1, /* 0x60-0x63 */ + 0x00, 0x00, 0xDA, 0xFE, 0xDB, 0x40, 0xDF, 0x69, /* 0x64-0x67 */ + 0xDF, 0x6A, 0xB8, 0x6E, 0xB8, 0x6F, 0xDF, 0x68, /* 0x68-0x6B */ + 0xDF, 0x6B, 0xDF, 0x67, 0xB8, 0x6D, 0x00, 0x00, /* 0x6C-0x6F */ + 0xBB, 0x40, 0x00, 0x00, 0xB8, 0x70, 0xE3, 0x7A, /* 0x70-0x73 */ + 0x00, 0x00, 0xBD, 0x7C, 0xE6, 0xF1, 0xBD, 0x7D, /* 0x74-0x77 */ + 0x00, 0x00, 0xBF, 0xA9, 0xEA, 0xE2, 0xEA, 0xE0, /* 0x78-0x7B */ + 0xEA, 0xE1, 0xED, 0xE4, 0xED, 0xE3, 0xED, 0xE2, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0xBB, /* 0x80-0x83 */ + 0x00, 0x00, 0xC3, 0xB9, 0xF2, 0xBC, 0xF7, 0x44, /* 0x84-0x87 */ + 0xC5, 0xF9, 0xF8, 0xBA, 0xA6, 0xCF, 0xAA, 0xCB, /* 0x88-0x8B */ + 0xAA, 0xCA, 0xD0, 0x4F, 0xAC, 0xFC, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD0, 0x4E, 0xD3, 0x62, 0x00, 0x00, /* 0x90-0x93 */ + 0xAF, 0xCC, 0xD6, 0xF2, 0xD3, 0x61, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xDC, 0xD6, 0xF5, /* 0x98-0x9B */ + 0xD6, 0xF3, 0xD6, 0xF4, 0xB2, 0xDB, 0x00, 0x00, /* 0x9C-0x9F */ + 0xDB, 0x42, 0xDB, 0x43, 0xDB, 0x41, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xB8, 0x73, 0xDF, 0x6D, 0xDF, 0x6C, 0xDF, 0x6E, /* 0xA4-0xA7 */ + 0xB8, 0x72, 0xB8, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE6, 0xF2, 0xE6, 0xF4, 0x00, 0x00, 0xBD, 0x7E, /* 0xAC-0xAF */ + 0xE6, 0xF3, 0xEA, 0xE3, 0xBF, 0xAA, 0xF0, 0x79, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF0, 0x78, 0xC3, 0xBB, 0xF2, 0xBD, /* 0xB4-0xB7 */ + 0xC3, 0xBD, 0xC3, 0xBC, 0xF4, 0xB0, 0xF5, 0xEE, /* 0xB8-0xBB */ + 0xC4, 0xF3, 0xA6, 0xD0, 0xD0, 0x50, 0xAC, 0xFD, /* 0xBC-0xBF */ + 0xD3, 0x65, 0xAF, 0xCE, 0xD3, 0x64, 0xD3, 0x63, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xAF, 0xCD, 0x00, 0x00, 0xD6, 0xFB, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xD6, 0xFD, 0xD6, 0xF6, 0xD6, 0xF7, /* 0xC8-0xCB */ + 0xB2, 0xDD, 0xD6, 0xF8, 0xB2, 0xDE, 0xD6, 0xFC, /* 0xCC-0xCF */ + 0xD6, 0xF9, 0xD6, 0xFA, 0xB2, 0xDF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB5, 0xBE, 0xB5, 0xBF, 0x00, 0x00, 0xDB, 0x44, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x6F, /* 0xD8-0xDB */ + 0xDF, 0x70, 0x00, 0x00, 0xE3, 0x7E, 0xBB, 0x43, /* 0xDC-0xDF */ + 0xBB, 0x41, 0xBB, 0x42, 0xE3, 0x7B, 0xE3, 0x7C, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xE3, 0x7D, 0xE6, 0xF9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE6, 0xFA, 0xBD, 0xA1, 0xE6, 0xF7, 0xE6, 0xF6, /* 0xE8-0xEB */ + 0xE6, 0xF8, 0xE6, 0xF5, 0xBF, 0xAD, 0xEA, 0xE4, /* 0xEC-0xEF */ + 0xBF, 0xAB, 0xBF, 0xAC, 0xED, 0xE6, 0xC1, 0x6B, /* 0xF0-0xF3 */ + 0xED, 0xE5, 0xEF, 0xA8, 0x00, 0x00, 0xF0, 0x7A, /* 0xF4-0xF7 */ + 0xF0, 0x7B, 0xC2, 0xBC, 0x00, 0x00, 0xC2, 0xBD, /* 0xF8-0xFB */ + 0xC1, 0x6C, 0xF2, 0xBE, 0xF2, 0xBF, 0xF4, 0xB1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_80[512] = { + 0xC4, 0xA3, 0xA6, 0xD1, 0x00, 0x00, 0xA6, 0xD2, /* 0x00-0x03 */ + 0xAC, 0xFE, 0xAA, 0xCC, 0xAF, 0xCF, 0xD0, 0x51, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB5, 0xC0, /* 0x08-0x0B */ + 0xA6, 0xD3, 0xAD, 0x41, 0xD0, 0x52, 0xD0, 0x53, /* 0x0C-0x0F */ + 0xAD, 0x40, 0xAD, 0x42, 0xA6, 0xD4, 0x00, 0x00, /* 0x10-0x13 */ + 0xD0, 0x54, 0xAF, 0xD1, 0xD3, 0x66, 0xAF, 0xD3, /* 0x14-0x17 */ + 0xAF, 0xD0, 0xAF, 0xD2, 0x00, 0x00, 0xD7, 0x41, /* 0x18-0x1B */ + 0xB2, 0xE0, 0x00, 0x00, 0xD7, 0x40, 0xD6, 0xFE, /* 0x1C-0x1F */ + 0x00, 0x00, 0xDF, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xE3, 0xA1, 0x00, 0x00, 0xBD, 0xA2, 0x00, 0x00, /* 0x24-0x27 */ + 0xBF, 0xAE, 0xEA, 0xE6, 0xEA, 0xE5, 0x00, 0x00, /* 0x28-0x2B */ + 0xED, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF5, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xD5, /* 0x30-0x33 */ + 0xCB, 
0x73, 0xCD, 0xAA, 0xAD, 0x43, 0xD0, 0x55, /* 0x34-0x37 */ + 0x00, 0x00, 0xD3, 0x68, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xAF, 0xD4, 0xD3, 0x67, 0xAF, 0xD5, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0x43, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xB2, 0xE2, 0xD7, 0x42, /* 0x44-0x47 */ + 0xD7, 0x44, 0x00, 0x00, 0xB2, 0xE1, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x46, /* 0x4C-0x4F */ + 0xDB, 0x47, 0xDB, 0x45, 0xB5, 0xC1, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0x74, 0x00, 0x00, /* 0x54-0x57 */ + 0xB8, 0x75, 0x00, 0x00, 0xBB, 0x45, 0x00, 0x00, /* 0x58-0x5B */ + 0xE3, 0xA3, 0xE3, 0xA2, 0xBB, 0x44, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE6, 0xFB, 0x00, 0x00, 0x00, 0x00, 0xE6, 0xFC, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xEA, 0xE7, 0x00, 0x00, 0x00, 0x00, 0xC1, 0x70, /* 0x6C-0x6F */ + 0xC1, 0x6F, 0xC1, 0x6D, 0xC1, 0x6E, 0xC1, 0x71, /* 0x70-0x73 */ + 0x00, 0x00, 0xF0, 0x7C, 0xC2, 0xBF, 0xC2, 0xBE, /* 0x74-0x77 */ + 0xF2, 0xC0, 0xF4, 0xB2, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xC5, 0xA5, 0xC5, 0xA4, 0xA6, 0xD6, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xD1, 0xFB, 0x00, 0x00, /* 0x80-0x83 */ + 0xB8, 0x77, 0xB5, 0xC2, 0xB8, 0x76, 0xBB, 0x46, /* 0x84-0x87 */ + 0x00, 0x00, 0xA6, 0xD7, 0xC9, 0xA9, 0xA6, 0xD8, /* 0x88-0x8B */ + 0xA6, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xAB, /* 0x8C-0x8F */ + 0xCB, 0x76, 0x00, 0x00, 0xCB, 0x77, 0xA8, 0x77, /* 0x90-0x93 */ + 0x00, 0x00, 0xCB, 0x74, 0xA8, 0x76, 0x00, 0x00, /* 0x94-0x97 */ + 0xA8, 0x79, 0xCB, 0x75, 0xA8, 0x7B, 0xA8, 0x7A, /* 0x98-0x9B */ + 0xCB, 0x78, 0xA8, 0x78, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xAA, 0xD1, 0xAA, 0xCF, 0xCD, 0xAD, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xAA, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xAA, 0xD3, 0xAA, 0xD5, 0xAA, 0xD2, /* 0xA8-0xAB */ + 0x00, 0x00, 0xCD, 0xB0, 0xCD, 0xAC, 0xAA, 0xD6, /* 0xAC-0xAF */ + 0x00, 0x00, 0xAA, 0xD0, 0xA8, 0x7C, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xAA, 0xD4, 0xCD, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xCD, 0xAE, 0x00, 0x00, 0xAA, 0xCD, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0x5B, 0xAD, 0x47, /* 0xC0-0xC3 */ + 0xAD, 0x48, 0xD0, 0x5D, 0x00, 0x00, 0xD0, 0x57, /* 0xC4-0xC7 */ + 0xD0, 0x5A, 0xD0, 0x63, 0xD0, 0x61, 0x00, 0x00, /* 0xC8-0xCB */ + 0xAD, 0x49, 0xD0, 0x67, 0xAD, 0x4C, 0xD0, 0x64, /* 0xCC-0xCF */ + 0xD0, 0x5C, 0xD0, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xDB, 0x49, 0xD0, 0x62, 0xAD, 0x44, 0xD0, 0x65, /* 0xD4-0xD7 */ + 0xD0, 0x56, 0xD0, 0x5F, 0xAD, 0x46, 0xAD, 0x4B, /* 0xD8-0xDB */ + 0xD0, 0x60, 0xAD, 0x4F, 0xAD, 0x4D, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD0, 0x58, 0xAD, 0x4A, 0x00, 0x00, 0xD0, 0x5E, /* 0xE0-0xE3 */ + 0xAD, 0x4E, 0xAD, 0x45, 0xD0, 0x66, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAF, 0xDA, 0x00, 0x00, 0xAF, 0xE3, /* 0xEC-0xEF */ + 0xAF, 0xD8, 0xAF, 0xD6, 0xD3, 0x6A, 0xAF, 0xDE, /* 0xF0-0xF3 */ + 0xAF, 0xDB, 0xD3, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xAF, 0xDD, 0xD3, 0x6B, 0xD3, 0x69, 0xD3, 0x6E, /* 0xF8-0xFB */ + 0xAF, 0xE2, 0xAF, 0xE0, 0xDB, 0x48, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_81[512] = { + 0xD3, 0x6F, 0xD3, 0x6D, 0xAF, 0xD7, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xAF, 0xD9, 0xAF, 0xDC, 0x00, 0x00, /* 0x04-0x07 */ + 0xAF, 
0xDF, 0x00, 0x00, 0xAF, 0xE1, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xD7, 0x4E, 0xB2, 0xE4, 0x00, 0x00, /* 0x14-0x17 */ + 0xD7, 0x45, 0xD7, 0x47, 0x00, 0x00, 0xD7, 0x48, /* 0x18-0x1B */ + 0x00, 0x00, 0xD7, 0x50, 0xD7, 0x4C, 0xD7, 0x4A, /* 0x1C-0x1F */ + 0x00, 0x00, 0xD7, 0x4D, 0xD7, 0x51, 0xB2, 0xE5, /* 0x20-0x23 */ + 0xB2, 0xE9, 0xD7, 0x46, 0x00, 0x00, 0xD7, 0x4F, /* 0x24-0x27 */ + 0x00, 0x00, 0xB2, 0xE7, 0x00, 0x00, 0xB2, 0xE6, /* 0x28-0x2B */ + 0xD7, 0x4B, 0xD7, 0x49, 0x00, 0x00, 0xB2, 0xE3, /* 0x2C-0x2F */ + 0xB2, 0xE8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xB5, 0xC8, 0xDB, 0x51, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xDB, 0x4F, 0xB5, 0xCA, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x4A, /* 0x40-0x43 */ + 0xDF, 0xA1, 0x00, 0x00, 0xB5, 0xC9, 0xDB, 0x4E, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0x4B, 0xB5, 0xC5, /* 0x48-0x4B */ + 0xB5, 0xCB, 0xDB, 0x50, 0xB5, 0xC7, 0xDB, 0x4D, /* 0x4C-0x4F */ + 0xBB, 0x47, 0xB5, 0xC6, 0xDB, 0x4C, 0xB5, 0xCC, /* 0x50-0x53 */ + 0xB5, 0xC4, 0xB5, 0xC3, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDF, 0x77, /* 0x58-0x5B */ + 0xDF, 0x75, 0x00, 0x00, 0xDF, 0x7B, 0x00, 0x00, /* 0x5C-0x5F */ + 0xDF, 0x73, 0xDF, 0xA2, 0xDF, 0x78, 0x00, 0x00, /* 0x60-0x63 */ + 0xDF, 0x72, 0xB8, 0x7B, 0xB8, 0xA3, 0xDF, 0x7D, /* 0x64-0x67 */ + 0x00, 0x00, 0xDF, 0x76, 0x00, 0x00, 0xB8, 0x7E, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0x7C, 0xDF, 0x7E, /* 0x6C-0x6F */ + 0xB8, 0x79, 0xB8, 0x78, 0xDF, 0x79, 0xB8, 0x7D, /* 0x70-0x73 */ + 0xB5, 0xCD, 0x00, 0x00, 0xDF, 0x7C, 0xDF, 0x74, /* 0x74-0x77 */ + 0xB8, 0x7A, 0xB8, 0xA1, 0xB8, 0xA2, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBB, 0x4C, /* 0x7C-0x7F */ + + 0xBB, 0x48, 0x00, 0x00, 0xBB, 0x4D, 0xE3, 0xA6, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xA5, 0xE3, 0xA7, /* 0x84-0x87 */ + 0xBB, 0x4A, 0xE3, 0xA4, 0xBB, 0x4B, 0xE3, 0xAA, /* 0x88-0x8B */ + 0xE3, 0xA9, 0xE3, 0xA8, 0x00, 0x00, 0xBB, 0x49, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0xE7, 0x41, 0x00, 0x00, 0xE7, 0x44, /* 0x94-0x97 */ + 0xBD, 0xA8, 0xE7, 0x43, 0xBD, 0xA7, 0xBD, 0xA3, /* 0x98-0x9B */ + 0xBD, 0xA4, 0xBD, 0xA5, 0xE7, 0x40, 0xE6, 0xFE, /* 0x9C-0x9F */ + 0xBD, 0xA6, 0x00, 0x00, 0xE7, 0x42, 0xE6, 0xFD, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xE9, 0xEA, 0xF3, /* 0xA4-0xA7 */ + 0xBF, 0xB1, 0xBF, 0xB0, 0x00, 0x00, 0xEA, 0xED, /* 0xA8-0xAB */ + 0xEA, 0xEF, 0x00, 0x00, 0xEA, 0xEA, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEA, 0xEE, 0xEA, 0xE8, 0xEA, 0xF1, 0xBF, 0xAF, /* 0xB0-0xB3 */ + 0xEA, 0xF0, 0xEA, 0xEC, 0x00, 0x00, 0xEA, 0xF2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xEA, 0xEB, 0xC1, 0x74, 0xED, 0xE8, /* 0xB8-0xBB */ + 0xED, 0xEE, 0xC1, 0x78, 0xC1, 0x7A, 0xC1, 0x77, /* 0xBC-0xBF */ + 0xC1, 0x76, 0x00, 0x00, 0xC1, 0x75, 0xC1, 0x73, /* 0xC0-0xC3 */ + 0xED, 0xE9, 0xED, 0xEC, 0xC1, 0x72, 0xED, 0xED, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC1, 0x79, 0xED, 0xEB, 0x00, 0x00, /* 0xC8-0xCB */ + 0xED, 0xEA, 0xC2, 0xC0, 0x00, 0x00, 0xC2, 0xC1, /* 0xCC-0xCF */ + 0xF0, 0xA1, 0xF0, 0x7D, 0xF0, 0x7E, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF2, 0xC2, 0x00, 0x00, 0xF2, 0xC1, /* 0xD4-0xD7 */ + 0xC3, 0xBE, 0xF4, 0xB4, 0xC4, 0xA4, 0xF4, 0xB3, /* 0xD8-0xDB */ + 0x00, 0x00, 0xF5, 0xF0, 0xF7, 0x45, 0xC5, 0xA6, /* 0xDC-0xDF 
*/ + 0xF9, 0x43, 0xF9, 0x44, 0xC5, 0xD8, 0xA6, 0xDA, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xAA, 0xD7, 0xDB, 0x52, 0xBB, 0x4E, /* 0xE4-0xE7 */ + 0xC1, 0x7B, 0xED, 0xEF, 0xA6, 0xDB, 0x00, 0x00, /* 0xE8-0xEB */ + 0xAF, 0xE5, 0xAF, 0xE4, 0xDB, 0x53, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xEA, 0xF4, 0xA6, 0xDC, /* 0xF0-0xF3 */ + 0xAD, 0x50, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x54, /* 0xF4-0xF7 */ + 0xDB, 0x55, 0xDB, 0x56, 0xBB, 0x4F, 0xBF, 0xB2, /* 0xF8-0xFB */ + 0xA6, 0xDD, 0x00, 0x00, 0xAA, 0xD8, 0xD0, 0x68, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_82[512] = { + 0xAF, 0xE6, 0xD3, 0x70, 0xB2, 0xEA, 0x00, 0x00, /* 0x00-0x03 */ + 0xDB, 0x57, 0xB8, 0xA4, 0x00, 0x00, 0xBB, 0x50, /* 0x04-0x07 */ + 0xBF, 0xB3, 0xC1, 0x7C, 0xC2, 0xC2, 0xF4, 0xB5, /* 0x08-0x0B */ + 0xA6, 0xDE, 0xAA, 0xD9, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xAF, 0xE7, 0xD7, 0x52, 0xB5, 0xCE, 0x00, 0x00, /* 0x10-0x13 */ + 0xBB, 0x51, 0xE3, 0xAB, 0xE7, 0x45, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0xDF, /* 0x18-0x1B */ + 0xB5, 0xCF, 0xDF, 0xA3, 0xBB, 0x52, 0xA6, 0xE0, /* 0x1C-0x1F */ + 0xCD, 0xB1, 0xD0, 0x69, 0xAD, 0x51, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xD3, 0x72, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xAF, 0xEA, 0x00, 0x00, 0xAF, 0xE8, 0xAF, 0xE9, /* 0x28-0x2B */ + 0xAF, 0xEB, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x71, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0x57, 0xD7, 0x54, /* 0x30-0x33 */ + 0xD7, 0x56, 0xB2, 0xEB, 0xB2, 0xED, 0xB2, 0xEC, /* 0x34-0x37 */ + 0xD7, 0x53, 0xB2, 0xEE, 0xD7, 0x55, 0x00, 0x00, /* 0x38-0x3B */ + 0xDB, 0x58, 0xDB, 0x59, 0x00, 0x00, 0xDB, 0x5A, /* 0x3C-0x3F */ + 0xDF, 0xA6, 0x00, 0x00, 0xDF, 0xA7, 0x00, 0x00, /* 0x40-0x43 */ + 0xDF, 0xA5, 0xDF, 0xA8, 0x00, 0x00, 0xB8, 0xA5, /* 0x44-0x47 */ + 0x00, 0x00, 0xDF, 0xA4, 0x00, 0x00, 0xBB, 0x53, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4A, 0xE7, 0x46, /* 0x4C-0x4F */ + 0xE7, 0x49, 0xE7, 0x4B, 0xE7, 0x48, 0xE7, 0x47, /* 0x50-0x53 */ + 0x00, 0x00, 0xEA, 0xF5, 0xEA, 0xF6, 0xEA, 0xF7, /* 0x54-0x57 */ + 0xBF, 0xB4, 0xBF, 0xB5, 0xED, 0xF1, 0xED, 0xF0, /* 0x58-0x5B */ + 0xED, 0xF2, 0x00, 0x00, 0xF0, 0xA3, 0xF0, 0xA2, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF2, 0xC4, 0x00, 0x00, 0xF2, 0xC5, /* 0x60-0x63 */ + 0xF2, 0xC3, 0x00, 0x00, 0xC4, 0xA5, 0x00, 0x00, /* 0x64-0x67 */ + 0xF4, 0xB6, 0xF4, 0xB7, 0x00, 0x00, 0xF7, 0x46, /* 0x68-0x6B */ + 0xF7, 0xEF, 0xF8, 0xBB, 0xA6, 0xE1, 0xA8, 0x7D, /* 0x6C-0x6F */ + 0x00, 0x00, 0xC1, 0x7D, 0xA6, 0xE2, 0x00, 0x00, /* 0x70-0x73 */ + 0xD7, 0x58, 0xDB, 0x5B, 0x00, 0x00, 0xC6, 0x41, /* 0x74-0x77 */ + 0xCA, 0x4A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xCA, 0x4B, 0xCA, 0x4D, 0xA6, 0xE3, 0xCA, 0x4E, /* 0x7C-0x7F */ + + 0xCA, 0x4C, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA2, /* 0x80-0x83 */ + 0xCB, 0xA3, 0xCB, 0x7B, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA1, 0xA8, 0xA1, /* 0x88-0x8B */ + 0x00, 0x00, 0xA8, 0xA2, 0xCB, 0x7C, 0xCB, 0x7A, /* 0x8C-0x8F */ + 0xCB, 0x79, 0xCB, 0x7D, 0xA8, 0x7E, 0xCB, 0x7E, /* 0x90-0x93 */ + 0xD0, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xCD, 0xB6, 0xAA, 0xDC, 0xCD, 0xB5, 0xCD, 0xB7, /* 0x98-0x9B */ + 0x00, 0x00, 0xAA, 0xDB, 0xCD, 0xBC, 0xAA, 0xDF, /* 0x9C-0x9F */ + 0xCD, 0xB2, 0xCD, 0xC0, 0xCD, 0xC6, 0xAA, 0xE6, /* 0xA0-0xA3 */ + 0xCD, 0xC3, 0xAA, 0xE3, 0x00, 0x00, 0xCD, 0xB9, /* 0xA4-0xA7 */ + 0xCD, 0xBF, 0xCD, 0xC1, 0x00, 0x00, 0xCD, 0xB4, /* 0xA8-0xAB */ + 0xAA, 0xE2, 0xAA, 0xDD, 0xCD, 0xBA, 0xAA, 0xE4, /* 0xAC-0xAF */ + 0xAA, 0xE7, 0xAA, 0xE1, 0x00, 0x00, 0xAA, 0xDA, /* 0xB0-0xB3 */ + 
0xCD, 0xBE, 0xCD, 0xB8, 0xCD, 0xC5, 0xAA, 0xE9, /* 0xB4-0xB7 */ + 0xAA, 0xE5, 0xAA, 0xE0, 0xCD, 0xBD, 0xAF, 0xEC, /* 0xB8-0xBB */ + 0xCD, 0xBB, 0xAA, 0xDE, 0xAA, 0xE8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xCD, 0xB3, 0x00, 0x00, 0xCD, 0xC2, 0xCD, 0xC4, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xAD, 0x62, 0xAD, 0x5C, 0xAD, 0x64, /* 0xD0-0xD3 */ + 0xAD, 0x61, 0xD0, 0x71, 0xD0, 0x74, 0xAD, 0x5D, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xD0, 0x6B, 0x00, 0x00, 0xAD, 0x56, /* 0xD8-0xDB */ + 0xAD, 0x60, 0x00, 0x00, 0xAD, 0x63, 0xAD, 0x65, /* 0xDC-0xDF */ + 0xD0, 0xA2, 0xD0, 0x77, 0x00, 0x00, 0xAD, 0x55, /* 0xE0-0xE3 */ + 0xD0, 0xA1, 0xAD, 0x59, 0xAD, 0x57, 0xAD, 0x52, /* 0xE4-0xE7 */ + 0xD0, 0x6F, 0x00, 0x00, 0xD0, 0x7E, 0xD0, 0x73, /* 0xE8-0xEB */ + 0xD0, 0x76, 0xD0, 0xA5, 0x00, 0x00, 0xAD, 0x66, /* 0xEC-0xEF */ + 0xD0, 0x7D, 0xAD, 0x5E, 0xD0, 0x78, 0xD0, 0xA4, /* 0xF0-0xF3 */ + 0xD0, 0x75, 0xD0, 0x79, 0xD0, 0x7C, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xD0, 0x6D, 0xD0, 0xA3, 0xD0, 0x7B, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0x6C, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_83[512] = { + 0xD0, 0x70, 0xAD, 0x5F, 0xAD, 0x5A, 0xAD, 0x53, /* 0x00-0x03 */ + 0xAD, 0x58, 0xAD, 0x54, 0xAD, 0x67, 0xD0, 0x6E, /* 0x04-0x07 */ + 0xD3, 0xA5, 0xAD, 0x5B, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0xD0, 0x7A, 0xCE, 0x41, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xD3, 0xA8, 0xAF, 0xFA, /* 0x14-0x17 */ + 0x00, 0x00, 0xD3, 0x76, 0x00, 0x00, 0xD3, 0xA3, /* 0x18-0x1B */ + 0xD3, 0x7D, 0x00, 0x00, 0xD3, 0xB2, 0x00, 0x00, /* 0x1C-0x1F */ + 0xD3, 0xAA, 0x00, 0x00, 0xD3, 0x7E, 0x00, 0x00, /* 0x20-0x23 */ + 0xD3, 0xA9, 0xD3, 0x78, 0xD3, 0x7C, 0xD3, 0xB5, /* 0x24-0x27 */ + 0xAF, 0xFD, 0xD3, 0xAD, 0xD3, 0xA4, 0xAF, 0xED, /* 0x28-0x2B */ + 0xD3, 0xB3, 0xD3, 0x74, 0x00, 0x00, 0xD3, 0xAC, /* 0x2C-0x2F */ + 0x00, 0x00, 0xAF, 0xFC, 0xAF, 0xF7, 0xD3, 0x73, /* 0x30-0x33 */ + 0xAF, 0xF5, 0xAF, 0xF4, 0xAF, 0xF9, 0xD3, 0xAB, /* 0x34-0x37 */ + 0xAF, 0xF1, 0xAF, 0xF8, 0xD0, 0x72, 0xDB, 0x5C, /* 0x38-0x3B */ + 0xD3, 0xA6, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x7A, /* 0x3C-0x3F */ + 0xAF, 0xFB, 0xD3, 0x7B, 0xD3, 0xA1, 0xAF, 0xFE, /* 0x40-0x43 */ + 0xD3, 0x75, 0xD3, 0xAF, 0x00, 0x00, 0xD3, 0xAE, /* 0x44-0x47 */ + 0xD3, 0xB6, 0xAF, 0xF3, 0xAF, 0xF0, 0xD3, 0xB4, /* 0x48-0x4B */ + 0xD3, 0xB0, 0xD3, 0xA7, 0xD3, 0xA2, 0xAF, 0xF6, /* 0x4C-0x4F */ + 0xAF, 0xF2, 0xD3, 0x77, 0xAF, 0xEE, 0xD3, 0xB1, /* 0x50-0x53 */ + 0xAF, 0xEF, 0x00, 0x00, 0xD3, 0x79, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0x5E, /* 0x70-0x73 */ + 0xD7, 0x60, 0xD7, 0x65, 0xD7, 0x79, 0xB2, 0xFC, /* 0x74-0x77 */ + 0xB2, 0xF2, 0x00, 0x00, 0xD7, 0x5D, 0xB2, 0xFD, /* 0x78-0x7B */ + 0xB2, 0xFE, 0xD7, 0x68, 0xD7, 0x6F, 0xD7, 0x75, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xD7, 0x62, 0x00, 0x00, 0xD7, 0x69, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x40, 0xD7, 0x77, /* 0x84-0x87 */ + 
0xD7, 0x72, 0xB2, 0xFA, 0xB2, 0xF8, 0xD7, 0x6E, /* 0x88-0x8B */ + 0xD7, 0x6A, 0xD7, 0x5C, 0xB2, 0xEF, 0xD7, 0x61, /* 0x8C-0x8F */ + 0xD7, 0x59, 0x00, 0x00, 0xB2, 0xF7, 0xB2, 0xF9, /* 0x90-0x93 */ + 0xD7, 0x66, 0xD7, 0x63, 0xB2, 0xF4, 0xD7, 0x73, /* 0x94-0x97 */ + 0xB2, 0xF1, 0xD7, 0x64, 0xD7, 0x7A, 0xD7, 0x6C, /* 0x98-0x9B */ + 0x00, 0x00, 0xD7, 0x6B, 0xB2, 0xF0, 0x00, 0x00, /* 0x9C-0x9F */ + 0xB2, 0xFB, 0x00, 0x00, 0xB2, 0xF3, 0xD7, 0x5A, /* 0xA0-0xA3 */ + 0xD7, 0x5F, 0xD7, 0x70, 0xD7, 0x76, 0xB3, 0x41, /* 0xA4-0xA7 */ + 0xD7, 0x5B, 0xD7, 0x67, 0xD7, 0x6D, 0xB2, 0xF6, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0x78, 0xD7, 0x71, /* 0xAC-0xAF */ + 0xD7, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xB2, 0xF5, 0x00, 0x00, 0xDB, 0x6C, /* 0xBC-0xBF */ + 0xDB, 0x60, 0xB5, 0xD7, 0xDB, 0x7D, 0xDB, 0xA7, /* 0xC0-0xC3 */ + 0xDB, 0xAA, 0xB5, 0xD5, 0xDB, 0x68, 0xDB, 0xA3, /* 0xC4-0xC7 */ + 0xDB, 0x69, 0xDB, 0x77, 0xB5, 0xE2, 0xDB, 0x73, /* 0xC8-0xCB */ + 0xB5, 0xDF, 0x00, 0x00, 0xDB, 0x74, 0xDB, 0x5D, /* 0xCC-0xCF */ + 0x00, 0x00, 0xDB, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB5, 0xE8, 0xDB, 0xA1, 0xDB, 0x75, 0xDB, 0xAC, /* 0xD4-0xD7 */ + 0xDB, 0x70, 0xDF, 0xC8, 0x00, 0x00, 0xDB, 0xAF, /* 0xD8-0xDB */ + 0xB5, 0xE6, 0xDB, 0x6E, 0xDB, 0x7A, 0xB5, 0xE9, /* 0xDC-0xDF */ + 0xB5, 0xD4, 0xDB, 0x72, 0xDB, 0xAD, 0xDB, 0x6B, /* 0xE0-0xE3 */ + 0xDB, 0x64, 0xDB, 0x6F, 0x00, 0x00, 0xDB, 0x63, /* 0xE4-0xE7 */ + 0xDB, 0x61, 0xB5, 0xD0, 0xDB, 0xA5, 0xDB, 0x6A, /* 0xE8-0xEB */ + 0xDB, 0xA8, 0x00, 0x00, 0xDB, 0xA9, 0xB5, 0xD8, /* 0xEC-0xEF */ + 0xB5, 0xDD, 0xB5, 0xD9, 0xB5, 0xE1, 0xDB, 0x7E, /* 0xF0-0xF3 */ + 0xB5, 0xDA, 0xDB, 0x76, 0xDB, 0x66, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xB5, 0xD2, 0xDB, 0x5E, 0xDB, 0xA2, 0xDB, 0xAB, /* 0xF8-0xFB */ + 0xDB, 0x65, 0xB5, 0xE0, 0xDB, 0xB0, 0xDB, 0x71, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_84[512] = { + 0x00, 0x00, 0xDB, 0x6D, 0x00, 0x00, 0xB5, 0xD1, /* 0x00-0x03 */ + 0xB5, 0xE5, 0x00, 0x00, 0xDB, 0x7C, 0xB5, 0xE7, /* 0x04-0x07 */ + 0x00, 0x00, 0xDB, 0x78, 0xB5, 0xDC, 0xB5, 0xD6, /* 0x08-0x0B */ + 0xB5, 0xDE, 0xB5, 0xD3, 0xB5, 0xE4, 0xDB, 0x79, /* 0x0C-0x0F */ + 0xDB, 0x67, 0xDB, 0x7B, 0xDB, 0x62, 0xDB, 0xA6, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0xAE, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDB, 0x5F, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xDF, 0xC7, 0x00, 0x00, 0xDF, 0xDD, /* 0x28-0x2B */ + 0xB8, 0x55, 0xDF, 0xCC, 0x00, 0x00, 0xDF, 0xCA, /* 0x2C-0x2F */ + 0xDF, 0xB5, 0xB8, 0xA9, 0xDF, 0xC5, 0xDF, 0xD9, /* 0x30-0x33 */ + 0xDF, 0xC1, 0xB8, 0xB1, 0xDF, 0xD8, 0xDF, 0xBF, /* 0x34-0x37 */ + 0xB5, 0xE3, 0xDF, 0xCF, 0xDF, 0xC0, 0xDF, 0xD6, /* 0x38-0x3B */ + 0xB8, 0xB0, 0xB8, 0xA8, 0x00, 0x00, 0xDF, 0xAA, /* 0x3C-0x3F */ + 0xDF, 0xB2, 0x00, 0x00, 0xDF, 0xCB, 0xDF, 0xC3, /* 0x40-0x43 */ + 0xDF, 0xDC, 0xDF, 0xC6, 0xB8, 0xB6, 0xDF, 0xD7, /* 0x44-0x47 */ + 0x00, 0x00, 0xB8, 0xAD, 0x00, 0x00, 0xDF, 0xC9, /* 0x48-0x4B */ + 0xDF, 0xD1, 0xDF, 0xB6, 0xDF, 0xD0, 0x00, 0x00, /* 0x4C-0x4F */ + 0xDF, 0xE1, 0xDF, 0xB1, 0xDF, 0xD2, 0x00, 0x00, /* 0x50-0x53 */ + 0xDF, 0xDF, 0x00, 0x00, 0xDF, 0xAB, 0xB5, 0xDB, /* 0x54-0x57 */ + 0x00, 0x00, 0xDF, 0xB9, 0xDF, 0xB8, 0xB8, 0xAF, /* 0x58-0x5B */ + 0x00, 
0x00, 0xDF, 0xBC, 0xDF, 0xBE, 0xDF, 0xCD, /* 0x5C-0x5F */ + 0xDF, 0xDE, 0xB8, 0xB2, 0x00, 0x00, 0xB8, 0xB3, /* 0x60-0x63 */ + 0x00, 0x00, 0xDF, 0xB0, 0xB8, 0xAB, 0xDF, 0xB4, /* 0x64-0x67 */ + 0xDF, 0xDA, 0xB8, 0xB4, 0x00, 0x00, 0xB8, 0xAC, /* 0x68-0x6B */ + 0xB8, 0xAE, 0xB8, 0xB5, 0xDF, 0xE0, 0xDF, 0xD3, /* 0x6C-0x6F */ + 0xDF, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xBB, /* 0x70-0x73 */ + 0xDF, 0xBA, 0xB8, 0xAA, 0xDF, 0xAC, 0xB8, 0xA7, /* 0x74-0x77 */ + 0xDF, 0xC4, 0xDF, 0xAD, 0xDF, 0xC2, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0xDF, 0xB7, 0xDF, 0xDB, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xA6, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xDF, 0xB3, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xDF, 0xAF, 0xDF, 0xD5, 0xDF, 0xAE, /* 0x8C-0x8F */ + 0xBB, 0x60, 0xE3, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xE3, 0xC2, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xAC, /* 0x94-0x97 */ + 0xE3, 0xCA, 0xBB, 0x58, 0xE3, 0xBB, 0xE3, 0xC5, /* 0x98-0x9B */ + 0xBB, 0x5B, 0xE3, 0xBE, 0xBB, 0x59, 0xE3, 0xAF, /* 0x9C-0x9F */ + 0xE3, 0xCD, 0xE3, 0xAE, 0xE3, 0xC1, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE3, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xBF, /* 0xA4-0xA7 */ + 0xE3, 0xC8, 0xE3, 0xC6, 0xE3, 0xBA, 0xE3, 0xB5, /* 0xA8-0xAB */ + 0xE3, 0xB3, 0x00, 0x00, 0xE3, 0xB4, 0xE3, 0xC7, /* 0xAC-0xAF */ + 0xE3, 0xD2, 0xE3, 0xBC, 0xBB, 0x5A, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xE3, 0xB7, 0x00, 0x00, 0xE3, 0xCB, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xBB, 0x5D, 0xE3, 0xB6, 0xE3, 0xB0, 0xE3, 0xC0, /* 0xB8-0xBB */ + 0xBB, 0x61, 0x00, 0x00, 0x00, 0x00, 0xBB, 0x55, /* 0xBC-0xBF */ + 0xBB, 0x5E, 0xE3, 0xB8, 0xE3, 0xB2, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xBB, 0x57, 0xDF, 0xD4, 0xBB, 0x56, 0xE3, 0xC3, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xBB, 0x54, 0xBB, 0x63, 0xBB, 0x5C, /* 0xC8-0xCB */ + 0xE3, 0xC4, 0xE3, 0xB9, 0xE3, 0xB1, 0xE3, 0xCC, /* 0xCC-0xCF */ + 0xE3, 0xBD, 0xBB, 0x62, 0xE3, 0xD0, 0xBB, 0x5F, /* 0xD0-0xD3 */ + 0xE3, 0xCF, 0x00, 0x00, 0xE3, 0xC9, 0xE3, 0xCE, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xD1, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x73, /* 0xE4-0xE7 */ + 0xE7, 0x74, 0xE7, 0x67, 0xE7, 0x66, 0xE7, 0x62, /* 0xE8-0xEB */ + 0xBD, 0xB4, 0x00, 0x00, 0xBD, 0xAC, 0xE7, 0x76, /* 0xEC-0xEF */ + 0xE7, 0x75, 0xDF, 0xA9, 0xE7, 0x5F, 0xE7, 0x63, /* 0xF0-0xF3 */ + 0xE7, 0x5D, 0x00, 0x00, 0xE7, 0x70, 0xE7, 0x61, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE7, 0x77, 0xE7, 0x5A, 0xE7, 0x58, /* 0xF8-0xFB */ + 0xE7, 0x64, 0xE7, 0x6E, 0xE7, 0x69, 0xBD, 0xB6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_85[512] = { + 0xE7, 0x4F, 0x00, 0x00, 0xE7, 0x6D, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xB7, 0xDF, 0xBD, /* 0x04-0x07 */ + 0xE7, 0x5B, 0xE7, 0x52, 0xE7, 0x55, 0xE7, 0x7B, /* 0x08-0x0B */ + 0xE7, 0x5C, 0xE7, 0x53, 0xE7, 0x51, 0xE7, 0x4E, /* 0x0C-0x0F */ + 0x00, 0x00, 0xBD, 0xB0, 0xE7, 0x65, 0xBD, 0xAF, /* 0x10-0x13 */ + 0xBD, 0xB3, 0xE7, 0x60, 0xE7, 0x68, 0xBD, 0xA9, /* 0x14-0x17 */ + 0xE7, 0x78, 0xE7, 0x7C, 0xBD, 0xAB, 0x00, 0x00, /* 0x18-0x1B */ + 0xE7, 0x57, 0xE7, 0x6B, 0xE7, 0x6F, 0xE7, 0x54, /* 0x1C-0x1F */ + 0xE7, 0x79, 0xBD, 0xB2, 0x00, 0x00, 0xBD, 0xB1, /* 0x20-0x23 */ + 0xE7, 0x4C, 0xBD, 0xB5, 0xE7, 0x72, 0xE7, 0x56, /* 0x24-0x27 */ + 0xE7, 0x6A, 0xE7, 0x50, 0xE7, 0x5E, 0xE7, 0x59, /* 0x28-0x2B */ + 0xBD, 0xAD, 0xBD, 0xAE, 0xE7, 0x6C, 0xE7, 0x7D, /* 0x2C-0x2F */ + 0xE7, 
0x7A, 0xE7, 0x71, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0x4D, /* 0x38-0x3B */ + 0x00, 0x00, 0xBD, 0xAA, 0xEB, 0x49, 0x00, 0x00, /* 0x3C-0x3F */ + 0xEB, 0x40, 0xEB, 0x43, 0x00, 0x00, 0xBF, 0xBB, /* 0x40-0x43 */ + 0xEB, 0x45, 0xEA, 0xF9, 0xEB, 0x41, 0xEB, 0x47, /* 0x44-0x47 */ + 0xBF, 0xB8, 0xBF, 0xBC, 0xBF, 0xB6, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0xEA, 0xFB, 0xEB, 0x4C, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0xEB, 0x46, 0x00, 0x00, 0xEA, 0xFC, /* 0x50-0x53 */ + 0xEB, 0x55, 0xEB, 0x4F, 0xEA, 0xF8, 0xEE, 0x46, /* 0x54-0x57 */ + 0xEA, 0xFE, 0xBF, 0xB7, 0x00, 0x00, 0xEB, 0x4A, /* 0x58-0x5B */ + 0x00, 0x00, 0xEB, 0x54, 0xBF, 0xBF, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEB, 0x51, 0xEA, 0xFD, 0xEB, 0x44, 0xEB, 0x48, /* 0x60-0x63 */ + 0xEB, 0x42, 0xEB, 0x56, 0xEB, 0x53, 0xEB, 0x50, /* 0x64-0x67 */ + 0xBF, 0xB9, 0xBF, 0xBA, 0xBF, 0xBE, 0xEA, 0xFA, /* 0x68-0x6B */ + 0xEB, 0x57, 0xBF, 0xBD, 0xEB, 0x4D, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xEB, 0x4B, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xEB, 0x4E, 0xEE, 0x53, 0xEE, 0x40, /* 0x74-0x77 */ + 0xEE, 0x45, 0xEE, 0x52, 0xEE, 0x44, 0xED, 0xFB, /* 0x78-0x7B */ + 0xEE, 0x41, 0x00, 0x00, 0xC1, 0xA2, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xED, 0xF4, 0xEE, 0x4D, 0xEE, 0x4F, 0xED, 0xF3, /* 0x80-0x83 */ + 0xC1, 0xA1, 0xEE, 0x51, 0xEE, 0x49, 0xC1, 0xA8, /* 0x84-0x87 */ + 0xEE, 0x50, 0xEE, 0x42, 0xC1, 0xAA, 0xED, 0xF9, /* 0x88-0x8B */ + 0xEB, 0x52, 0xEE, 0x4A, 0xEE, 0x47, 0xED, 0xF5, /* 0x8C-0x8F */ + 0xEE, 0x55, 0xC1, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xC1, 0xA5, 0xED, 0xF7, 0xEE, 0x48, 0x00, 0x00, /* 0x94-0x97 */ + 0xEE, 0x54, 0xEE, 0x4B, 0xED, 0xFD, 0xC1, 0xA7, /* 0x98-0x9B */ + 0xC1, 0xA3, 0xEE, 0x4C, 0xED, 0xFE, 0xEE, 0x56, /* 0x9C-0x9F */ + 0xED, 0xF8, 0xEE, 0x43, 0xEE, 0x4E, 0xED, 0xFA, /* 0xA0-0xA3 */ + 0xED, 0xFC, 0x00, 0x00, 0xC2, 0xCB, 0xED, 0xF6, /* 0xA4-0xA7 */ + 0xC1, 0xA9, 0xC2, 0xC4, 0xC1, 0x7E, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xA6, /* 0xAC-0xAF */ + 0xC2, 0xC8, 0xF0, 0xB3, 0x00, 0x00, 0xF0, 0xA9, /* 0xB0-0xB3 */ + 0xF0, 0xA4, 0xF0, 0xAA, 0xF0, 0xB4, 0xF0, 0xB8, /* 0xB4-0xB7 */ + 0xF0, 0xB7, 0xC2, 0xCA, 0xC2, 0xC9, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xF0, 0xAB, 0xF0, 0xB9, 0xF0, 0xAE, /* 0xBC-0xBF */ + 0xF0, 0xA6, 0x00, 0x00, 0xF0, 0xA8, 0xF0, 0xA7, /* 0xC0-0xC3 */ + 0xF0, 0xAD, 0xF0, 0xB2, 0xF0, 0xA5, 0xF0, 0xAC, /* 0xC4-0xC7 */ + 0xF0, 0xB1, 0xC2, 0xC7, 0x00, 0x00, 0xF0, 0xAF, /* 0xC8-0xCB */ + 0x00, 0x00, 0xC2, 0xC5, 0xF0, 0xB0, 0xC2, 0xC3, /* 0xCC-0xCF */ + 0xC2, 0xC6, 0xF2, 0xD5, 0xF0, 0xB5, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xC3, 0xC2, 0x00, 0x00, 0xF2, 0xCD, /* 0xD4-0xD7 */ + 0xF2, 0xD1, 0xF2, 0xC9, 0xF2, 0xCC, 0x00, 0x00, /* 0xD8-0xDB */ + 0xF2, 0xD4, 0xC3, 0xC0, 0xF2, 0xD9, 0xF2, 0xD2, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF2, 0xCA, 0xF2, 0xDA, 0xF2, 0xD3, /* 0xE0-0xE3 */ + 0xC3, 0xC3, 0xC3, 0xC4, 0xF2, 0xD7, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF2, 0xCB, 0xC3, 0xBF, 0xC3, 0xC1, 0xF2, 0xC6, /* 0xE8-0xEB */ + 0xF2, 0xCE, 0xF2, 0xC8, 0x00, 0x00, 0xF2, 0xD8, /* 0xEC-0xEF */ + 0xF2, 0xD6, 0xF2, 0xC7, 0xF2, 0xCF, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xF4, 0xBE, 0xC3, 0xC5, /* 0xF4-0xF7 */ + 0xF2, 0xD0, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xA6, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF4, 0xC3, 0xF4, 0xBB, 0xF4, 0xB9, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_86[512] = { + 0xF4, 0xBD, 0xF4, 0xBA, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0xF4, 
0xBF, 0xF4, 0xC1, 0xC4, 0xAA, 0xC4, 0xAC, /* 0x04-0x07 */ + 0x00, 0x00, 0xF4, 0xC0, 0xC4, 0xAD, 0xC4, 0xAB, /* 0x08-0x0B */ + 0xF4, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xC4, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xF4, /* 0x14-0x17 */ + 0xF5, 0xF1, 0xF5, 0xF7, 0xC4, 0xF6, 0xF4, 0xBC, /* 0x18-0x1B */ + 0xF5, 0xF6, 0x00, 0x00, 0xF5, 0xFD, 0xF5, 0xF4, /* 0x1C-0x1F */ + 0xF5, 0xFB, 0xF5, 0xFA, 0xF4, 0xB8, 0xF5, 0xF5, /* 0x20-0x23 */ + 0xF0, 0xB6, 0xF5, 0xFE, 0xF5, 0xF3, 0xF5, 0xF8, /* 0x24-0x27 */ + 0x00, 0x00, 0xF5, 0xFC, 0xF5, 0xF2, 0x00, 0x00, /* 0x28-0x2B */ + 0xF7, 0x4A, 0xC4, 0xF5, 0xF5, 0xF9, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xF7, 0xF4, 0xF7, 0x4B, 0xF7, 0x49, /* 0x30-0x33 */ + 0xF7, 0x47, 0xF7, 0x48, 0xF7, 0x4C, 0x00, 0x00, /* 0x34-0x37 */ + 0xC5, 0xD9, 0xF7, 0xF2, 0xF7, 0xF0, 0xF7, 0xF5, /* 0x38-0x3B */ + 0xF7, 0xF3, 0x00, 0x00, 0xF7, 0xF6, 0xC5, 0xDA, /* 0x3C-0x3F */ + 0xF7, 0xF1, 0x00, 0x00, 0x00, 0x00, 0xF8, 0xBC, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0x45, 0xF9, 0x46, /* 0x44-0x47 */ + 0xF9, 0x47, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC7, /* 0x48-0x4B */ + 0xF9, 0xBD, 0xCA, 0x4F, 0xAA, 0xEA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xAD, 0x68, 0x00, 0x00, 0xD3, 0xB8, 0xD3, 0xB7, /* 0x50-0x53 */ + 0xB0, 0x40, 0xB3, 0x42, 0xD7, 0x7C, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0xD7, 0x7B, 0x00, 0x00, 0xB5, 0xEA, /* 0x58-0x5B */ + 0xB8, 0xB8, 0x00, 0x00, 0xB8, 0xB7, 0xB8, 0xB9, /* 0x5C-0x5F */ + 0x00, 0x00, 0xE3, 0xD4, 0xE7, 0x7E, 0xEB, 0x58, /* 0x60-0x63 */ + 0xEB, 0x5A, 0xEB, 0x59, 0x00, 0x00, 0xC1, 0xAB, /* 0x64-0x67 */ + 0xEE, 0x57, 0xF0, 0xBA, 0xF9, 0xA5, 0xA6, 0xE4, /* 0x68-0x6B */ + 0x00, 0x00, 0xCD, 0xC9, 0xCD, 0xCA, 0xCD, 0xC8, /* 0x6C-0x6F */ + 0xCD, 0xC7, 0xAA, 0xEB, 0x00, 0x00, 0xD0, 0xA9, /* 0x70-0x73 */ + 0xD0, 0xA7, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xA6, /* 0x74-0x77 */ + 0x00, 0x00, 0xAD, 0x69, 0xAD, 0x6B, 0xAD, 0x6A, /* 0x78-0x7B */ + 0xD0, 0xA8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xD3, 0xC4, 0xD3, 0xC1, 0xD3, 0xBF, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0x41, 0xD3, 0xC2, /* 0x88-0x8B */ + 0xB0, 0x46, 0xD3, 0xBC, 0xD3, 0xCB, 0x00, 0x00, /* 0x8C-0x8F */ + 0xD3, 0xCD, 0xD3, 0xBD, 0x00, 0x00, 0xB0, 0x43, /* 0x90-0x93 */ + 0xD3, 0xCE, 0xD3, 0xC9, 0xD3, 0xBB, 0xD3, 0xC0, /* 0x94-0x97 */ + 0xD3, 0xCA, 0xD3, 0xC6, 0xD3, 0xC3, 0x00, 0x00, /* 0x98-0x9B */ + 0xB0, 0x48, 0xD3, 0xCC, 0xD3, 0xBE, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD3, 0xC7, 0xD3, 0xB9, 0xB0, 0x47, /* 0xA0-0xA3 */ + 0xB0, 0x44, 0xD3, 0xC5, 0x00, 0x00, 0xD3, 0xC8, /* 0xA4-0xA7 */ + 0xD3, 0xBA, 0xB0, 0x45, 0xB0, 0x42, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x4C, /* 0xAC-0xAF */ + 0xD7, 0xA5, 0xB3, 0x4B, 0x00, 0x00, 0xD7, 0xA8, /* 0xB0-0xB3 */ + 0xD7, 0xAB, 0xB3, 0x48, 0xB3, 0x46, 0xD7, 0x7E, /* 0xB4-0xB7 */ + 0xD7, 0xA9, 0xD7, 0xA7, 0xD7, 0xA4, 0xD7, 0xAC, /* 0xB8-0xBB */ + 0xD7, 0xAD, 0xD7, 0xAF, 0xD7, 0xB0, 0xD7, 0x7D, /* 0xBC-0xBF */ + 0xB3, 0x45, 0xD7, 0xA2, 0xD7, 0xA1, 0xD7, 0xAE, /* 0xC0-0xC3 */ + 0xB3, 0x47, 0xD7, 0xA3, 0xB3, 0x49, 0xB3, 0x44, /* 0xC4-0xC7 */ + 0xD7, 0xA6, 0xB3, 0x4D, 0x00, 0x00, 0xB3, 0x4A, /* 0xC8-0xCB */ + 0xD7, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xB5, 0xF1, 0xDB, 0xBF, 0x00, 0x00, 0xDB, 0xB4, /* 0xD0-0xD3 */ + 0xB5, 0xEE, 0x00, 0x00, 0xDF, 0xE7, 0xDB, 0xBD, /* 0xD4-0xD7 */ + 0xDB, 0xB1, 0xB5, 0xEC, 0xDB, 0xB6, 0xB5, 0xEF, /* 0xD8-0xDB 
*/ + 0xDB, 0xBA, 0xDB, 0xB8, 0xB5, 0xF2, 0xB5, 0xEB, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xB2, 0xDB, 0xB5, /* 0xE0-0xE3 */ + 0xB5, 0xF0, 0x00, 0x00, 0xDB, 0xB3, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xDB, 0xBE, 0xDB, 0xBC, 0xDB, 0xB7, 0xDB, 0xB9, /* 0xE8-0xEB */ + 0xDB, 0xBB, 0xB5, 0xED, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xDF, 0xE8, 0xDF, 0xEE, 0xDF, 0xE4, /* 0xF4-0xF7 */ + 0xDF, 0xEA, 0xB8, 0xBA, 0xDF, 0xE6, 0xB8, 0xC0, /* 0xF8-0xFB */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xBF, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_87[512] = { + 0xB8, 0xBE, 0xDF, 0xED, 0xB8, 0xC1, 0xB8, 0xC2, /* 0x00-0x03 */ + 0xDF, 0xE3, 0xDF, 0xF0, 0xB8, 0xC3, 0xB8, 0xBD, /* 0x04-0x07 */ + 0xB8, 0xBC, 0xDF, 0xEC, 0xB8, 0xC4, 0xDF, 0xE2, /* 0x08-0x0B */ + 0xDF, 0xE5, 0xDF, 0xEF, 0xDF, 0xEB, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0xE3, 0xF4, 0xE3, 0xE9, 0xB8, 0xBB, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0xBB, 0x6A, 0xE3, 0xDD, 0xE3, 0xF2, 0xE3, 0xDE, /* 0x18-0x1B */ + 0xBB, 0x65, 0x00, 0x00, 0xE3, 0xDB, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE3, 0xE4, 0xE3, 0xDC, 0xBB, 0x67, 0xE3, 0xD6, /* 0x20-0x23 */ + 0xE3, 0xF1, 0xBB, 0x68, 0xE3, 0xEE, 0xE3, 0xEF, /* 0x24-0x27 */ + 0xE3, 0xD7, 0xBB, 0x6D, 0xE3, 0xE6, 0x00, 0x00, /* 0x28-0x2B */ + 0xE3, 0xE0, 0xE3, 0xE7, 0xE3, 0xDA, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE3, 0xF3, 0xE3, 0xEB, 0xE3, 0xE5, 0xE3, 0xD5, /* 0x30-0x33 */ + 0xBB, 0x69, 0xE3, 0xEC, 0x00, 0x00, 0xBB, 0x6C, /* 0x34-0x37 */ + 0xE3, 0xF0, 0x00, 0x00, 0xE3, 0xEA, 0xBB, 0x66, /* 0x38-0x3B */ + 0xE3, 0xE8, 0x00, 0x00, 0xE3, 0xE2, 0xBB, 0x64, /* 0x3C-0x3F */ + 0xE3, 0xD9, 0xE3, 0xE1, 0xE3, 0xED, 0xE3, 0xDF, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xE3, 0xE3, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0xBD, 0xC1, 0xDF, 0xE9, 0xE7, 0xB2, 0xE7, 0xBB, /* 0x4C-0x4F */ + 0xE7, 0xB1, 0xE7, 0xAD, 0xE7, 0xAA, 0xBD, 0xC2, /* 0x50-0x53 */ + 0xE7, 0xA8, 0xBB, 0x6B, 0xE7, 0xA1, 0xBD, 0xC0, /* 0x54-0x57 */ + 0xE7, 0xA7, 0xBD, 0xBF, 0xE7, 0xAC, 0xE7, 0xA9, /* 0x58-0x5B */ + 0xE7, 0xB9, 0xE7, 0xB4, 0xE7, 0xAE, 0xE7, 0xB3, /* 0x5C-0x5F */ + 0xBD, 0xBB, 0xE7, 0xAB, 0xE7, 0xBE, 0xE7, 0xA2, /* 0x60-0x63 */ + 0xE7, 0xA3, 0xE7, 0xBA, 0xBD, 0xBC, 0xE7, 0xBF, /* 0x64-0x67 */ + 0xBD, 0xBE, 0xE7, 0xC0, 0xE7, 0xB0, 0xE3, 0xD8, /* 0x68-0x6B */ + 0xE7, 0xB6, 0xE7, 0xAF, 0xE7, 0xB8, 0xE7, 0xB5, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE7, 0xA6, /* 0x70-0x73 */ + 0xBD, 0xB9, 0xE7, 0xBD, 0xBD, 0xBA, 0xE7, 0xA4, /* 0x74-0x77 */ + 0xBD, 0xBD, 0xEB, 0x64, 0xE7, 0xB7, 0xE7, 0xBC, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xEB, 0x61, 0xBD, 0xB8, 0xBF, 0xC0, /* 0x80-0x83 */ + 0xEB, 0x6B, 0xEB, 0x67, 0x00, 0x00, 0xEB, 0x65, /* 0x84-0x87 */ + 0xEB, 0x60, 0xEB, 0x6F, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xBF, 0xC4, 0x00, 0x00, 0xEB, 0x5C, /* 0x8C-0x8F */ + 0xEB, 0x68, 0xEB, 0x69, 0xEB, 0x5F, 0xEB, 0x5E, /* 0x90-0x93 */ + 0xEB, 0x6C, 0x00, 0x00, 0xEB, 0x62, 0xEB, 0x5D, /* 0x94-0x97 */ + 0xEB, 0x63, 0x00, 0x00, 0xEB, 0x6E, 0xEB, 0x5B, /* 0x98-0x9B */ + 0xEB, 0x6D, 0xEB, 0x6A, 0xBF, 0xC2, 0xBF, 0xC1, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0xC3, 0xEB, 0x66, /* 0xA0-0xA3 */ + 0xF0, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0x59, 0xC1, 0xB1, /* 0xA8-0xAB */ + 0xEE, 0x5D, 0xEE, 0x5A, 0xEE, 0x61, 0xEE, 0x67, /* 0xAC-0xAF */ + 
0xEE, 0x5C, 0x00, 0x00, 0xEE, 0x70, 0xC1, 0xAE, /* 0xB0-0xB3 */ + 0xEE, 0x6A, 0xEE, 0x5F, 0xEE, 0x6B, 0xEE, 0x66, /* 0xB4-0xB7 */ + 0xEE, 0x6D, 0xEE, 0x5E, 0xC1, 0xB3, 0xC1, 0xB2, /* 0xB8-0xBB */ + 0xEE, 0x60, 0xEE, 0x6E, 0xEE, 0x58, 0xEE, 0x6C, /* 0xBC-0xBF */ + 0xC1, 0xAC, 0x00, 0x00, 0xEE, 0x64, 0xEE, 0x63, /* 0xC0-0xC3 */ + 0xEE, 0x68, 0xEE, 0x5B, 0xC1, 0xB0, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xC1, 0xB4, 0xEE, 0x62, 0xEE, 0x69, 0xC1, 0xB5, /* 0xC8-0xCB */ + 0xEE, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xC1, 0xAD, 0xC1, 0xAF, 0xF0, 0xC7, /* 0xD0-0xD3 */ + 0xF0, 0xC5, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xCC, /* 0xD4-0xD7 */ + 0xF0, 0xC9, 0xF0, 0xCD, 0x00, 0x00, 0xF0, 0xBE, /* 0xD8-0xDB */ + 0xF0, 0xC6, 0xF0, 0xD1, 0xEE, 0x6F, 0xF0, 0xC2, /* 0xDC-0xDF */ + 0xC2, 0xCF, 0xE7, 0xA5, 0xF0, 0xBD, 0xF0, 0xCA, /* 0xE0-0xE3 */ + 0xF0, 0xC4, 0xF0, 0xC1, 0xF0, 0xBC, 0xF0, 0xBB, /* 0xE4-0xE7 */ + 0xF0, 0xD0, 0x00, 0x00, 0xF0, 0xC0, 0xF0, 0xBF, /* 0xE8-0xEB */ + 0xC2, 0xCD, 0xF0, 0xC8, 0x00, 0x00, 0xC2, 0xCC, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0xCE, 0xF0, 0xC3, /* 0xF0-0xF3 */ + 0xF0, 0xCF, 0x00, 0x00, 0xF2, 0xDE, 0xF2, 0xDF, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xC3, 0xC9, 0xF2, 0xDC, 0xC3, 0xC6, /* 0xF8-0xFB */ + 0xF2, 0xE4, 0x00, 0x00, 0xC3, 0xCA, 0xF2, 0xE6, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_88[512] = { + 0xF2, 0xDB, 0xF0, 0xCE, 0xF2, 0xE8, 0xF2, 0xDD, /* 0x00-0x03 */ + 0x00, 0x00, 0xC3, 0xC7, 0xF2, 0xE3, 0x00, 0x00, /* 0x04-0x07 */ + 0xF2, 0xE5, 0xF2, 0xE0, 0xF2, 0xE7, 0xF2, 0xE2, /* 0x08-0x0B */ + 0xF2, 0xE1, 0xC3, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF4, 0xC5, 0xF4, 0xC6, 0x00, 0x00, 0xF4, 0xC8, /* 0x10-0x13 */ + 0xC4, 0xAE, 0xC4, 0xAF, 0xF4, 0xC9, 0xF4, 0xC7, /* 0x14-0x17 */ + 0x00, 0x00, 0xF4, 0xC4, 0x00, 0x00, 0xF6, 0x42, /* 0x18-0x1B */ + 0xF6, 0x45, 0xF6, 0x41, 0x00, 0x00, 0xC4, 0xFA, /* 0x1C-0x1F */ + 0xF6, 0x43, 0xC4, 0xF9, 0xC4, 0xF8, 0xC4, 0xF7, /* 0x20-0x23 */ + 0xF6, 0x44, 0xF7, 0x51, 0xF7, 0x4F, 0x00, 0x00, /* 0x24-0x27 */ + 0xF7, 0x4E, 0xF6, 0x40, 0xF7, 0x50, 0xF6, 0x46, /* 0x28-0x2B */ + 0xF7, 0x4D, 0x00, 0x00, 0xF7, 0xF9, 0xF7, 0xD7, /* 0x2C-0x2F */ + 0xF7, 0xF7, 0xC5, 0xDB, 0xF7, 0xF8, 0xF7, 0xFA, /* 0x30-0x33 */ + 0x00, 0x00, 0xF8, 0xBF, 0xC5, 0xFA, 0xF8, 0xBE, /* 0x34-0x37 */ + 0xF8, 0xBD, 0xC5, 0xFB, 0x00, 0x00, 0xC6, 0x5A, /* 0x38-0x3B */ + 0xF9, 0x6E, 0xF9, 0xA7, 0xF9, 0xA6, 0xF9, 0xA8, /* 0x3C-0x3F */ + 0xA6, 0xE5, 0xD0, 0xAA, 0x00, 0x00, 0xD3, 0xCF, /* 0x40-0x43 */ + 0xD3, 0xD0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0xDB, 0xC0, 0x00, 0x00, 0xF6, 0x47, 0xF8, 0xC0, /* 0x48-0x4B */ + 0xA6, 0xE6, 0xAD, 0x6C, 0xD0, 0xAB, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xD7, 0xB1, 0xB3, 0x4E, /* 0x50-0x53 */ + 0x00, 0x00, 0xDB, 0xC2, 0xDB, 0xC1, 0xB5, 0xF3, /* 0x54-0x57 */ + 0x00, 0x00, 0xB8, 0xC5, 0xE7, 0xC1, 0xBD, 0xC3, /* 0x58-0x5B */ + 0x00, 0x00, 0xBD, 0xC4, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0xBF, 0xC5, 0xC5, 0xFC, 0xA6, 0xE7, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xAC, /* 0x64-0x67 */ + 0xAA, 0xED, 0xD0, 0xAE, 0xD0, 0xAD, 0xAD, 0x6D, /* 0x68-0x6B */ + 0x00, 0x00, 0xD3, 0xD1, 0x00, 0x00, 0xD3, 0xD8, /* 0x6C-0x6F */ + 0xB0, 0x49, 0xD3, 0xD6, 0xD3, 0xD4, 0x00, 0x00, /* 0x70-0x73 */ + 0xD3, 0xDB, 0xD3, 0xD2, 0xD3, 0xD3, 0xB0, 0x4A, /* 0x74-0x77 */ + 0x00, 0x00, 0xB0, 0x4E, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xD3, 0xDC, 0xB0, 0x4D, 0xD3, 0xDA, 0xD3, 0xD7, /* 0x7C-0x7F */ + + 0xD3, 0xD5, 0xB0, 0x4B, 0xB0, 0x4C, 0xD3, 0xD9, /* 0x80-0x83 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xB3, 0x50, 0xD7, 0xB2, 0x00, 0x00, 0xB3, 0x55, /* 0x88-0x8B */ + 0xD7, 0xC2, 0xB3, 0x54, 0xD7, 0xC4, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xD7, 0xB8, 0xB3, 0x52, 0xD7, 0xC3, /* 0x90-0x93 */ + 0x00, 0x00, 0xD7, 0xB3, 0xB3, 0x53, 0xD7, 0xBF, /* 0x94-0x97 */ + 0xD7, 0xBB, 0xD7, 0xBD, 0xD7, 0xB7, 0xD7, 0xBE, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x4F, 0xD7, 0xBA, /* 0x9C-0x9F */ + 0x00, 0x00, 0xD7, 0xB9, 0xD7, 0xB5, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xD7, 0xC0, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xBC, /* 0xA4-0xA7 */ + 0xD7, 0xB4, 0x00, 0x00, 0xD7, 0xB6, 0xB3, 0x51, /* 0xA8-0xAB */ + 0xD7, 0xC1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0xB5, 0xF6, 0xDB, 0xCD, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xDB, 0xC9, 0xDB, 0xCB, /* 0xB4-0xB7 */ + 0xDB, 0xC6, 0xDB, 0xC5, 0xDB, 0xC3, 0x00, 0x00, /* 0xB8-0xBB */ + 0xDB, 0xCA, 0xDB, 0xCC, 0xDB, 0xC8, 0x00, 0x00, /* 0xBC-0xBF */ + 0xDB, 0xC7, 0xB5, 0xF4, 0xB5, 0xF5, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xDB, 0xCF, 0xB8, 0xCD, 0xDF, 0xF2, /* 0xC8-0xCB */ + 0xDF, 0xF8, 0xDF, 0xF3, 0xDF, 0xF4, 0xF9, 0xD8, /* 0xCC-0xCF */ + 0xDF, 0xF9, 0x00, 0x00, 0xB8, 0xCF, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xB8, 0xC7, 0xB8, 0xCE, 0xDF, 0xF1, 0xDB, 0xC4, /* 0xD4-0xD7 */ + 0xB8, 0xCA, 0xB8, 0xC8, 0xDF, 0xF7, 0xDF, 0xF6, /* 0xD8-0xDB */ + 0xB8, 0xC9, 0xB8, 0xCB, 0xDF, 0xF5, 0xB8, 0xC6, /* 0xDC-0xDF */ + 0x00, 0x00, 0xB8, 0xCC, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE3, 0xF6, /* 0xE4-0xE7 */ + 0xBB, 0x74, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x42, /* 0xE8-0xEB */ + 0xE4, 0x41, 0x00, 0x00, 0xE3, 0xFB, 0xBB, 0x76, /* 0xEC-0xEF */ + 0xE4, 0x40, 0xE3, 0xF7, 0xE3, 0xF8, 0xBB, 0x6E, /* 0xF0-0xF3 */ + 0xBB, 0x70, 0x00, 0x00, 0xE3, 0xFD, 0xE3, 0xF5, /* 0xF4-0xF7 */ + 0xBB, 0x72, 0xBB, 0x71, 0xE3, 0xF9, 0xE3, 0xFE, /* 0xF8-0xFB */ + 0xE3, 0xFC, 0xBB, 0x73, 0xE3, 0xFA, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_89[512] = { + 0x00, 0x00, 0xDB, 0xCE, 0xBB, 0x6F, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xE7, 0xC2, 0xE7, 0xC9, 0xBD, 0xC6, /* 0x04-0x07 */ + 0x00, 0x00, 0xE7, 0xCD, 0xBD, 0xCA, 0xE7, 0xC5, /* 0x08-0x0B */ + 0xE7, 0xC3, 0x00, 0x00, 0xE7, 0xCC, 0x00, 0x00, /* 0x0C-0x0F */ + 0xBD, 0xC5, 0xE7, 0xCB, 0xBD, 0xC7, 0xBD, 0xC8, /* 0x10-0x13 */ + 0xE7, 0xC4, 0xBD, 0xC9, 0xE7, 0xCA, 0xE7, 0xC6, /* 0x14-0x17 */ + 0xE7, 0xC7, 0xE7, 0xC8, 0xBB, 0x75, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0x70, 0xEB, 0x7C, /* 0x1C-0x1F */ + 0x00, 0x00, 0xBF, 0xCA, 0xEB, 0x77, 0xEB, 0x79, /* 0x20-0x23 */ + 0x00, 0x00, 0xBF, 0xC8, 0xEB, 0x71, 0xEB, 0x75, /* 0x24-0x27 */ + 0x00, 0x00, 0xEB, 0x78, 0xBF, 0xC6, 0xBF, 0xC9, /* 0x28-0x2B */ + 0xEB, 0x7B, 0xEB, 0x73, 0xEB, 0x74, 0xEB, 0x7A, /* 0x2C-0x2F */ + 0xEB, 0x72, 0xEB, 0x76, 0xBF, 0xC7, 0xEE, 0x72, /* 0x30-0x33 */ + 0x00, 0x00, 0xEE, 0x71, 0xC1, 0xB7, 0xEE, 0x77, /* 0x34-0x37 */ + 0xC1, 0xB9, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xB6, /* 0x38-0x3B */ + 0xEE, 0x73, 0xC1, 0xBA, 0xEE, 0x74, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xEE, 0x75, 0xEE, 0x78, 0x00, 0x00, /* 0x40-0x43 */ + 0xC1, 0xB8, 0x00, 0x00, 0xF0, 0xD6, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xF0, 0xD9, 0x00, 0x00, 0xF0, 0xD3, /* 0x48-0x4B */ + 0xF0, 0xD5, 0x00, 0x00, 0x00, 0x00, 0xF0, 0xD4, /* 0x4C-0x4F */ + 0xF0, 0xD7, 0xF0, 0xD8, 0xEE, 0x76, 0xF0, 0xD2, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xCD, 0xF2, 0xEC, /* 0x54-0x57 */ + 0xF2, 
0xEF, 0xF2, 0xF1, 0xF2, 0xEA, 0xF2, 0xEB, /* 0x58-0x5B */ + 0xF2, 0xEE, 0xF2, 0xF0, 0xC3, 0xCE, 0xC3, 0xCC, /* 0x5C-0x5F */ + 0xC3, 0xCB, 0xF2, 0xED, 0xF2, 0xE9, 0xF4, 0xCA, /* 0x60-0x63 */ + 0xC4, 0xB0, 0x00, 0x00, 0xF4, 0xCB, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0xF6, 0x49, 0xC4, 0xFB, 0xF6, 0x4B, /* 0x68-0x6B */ + 0xC4, 0xFC, 0xF6, 0x48, 0xF6, 0x4A, 0xC5, 0xA8, /* 0x6C-0x6F */ + 0x00, 0x00, 0xF7, 0x52, 0xC5, 0xA7, 0xF7, 0xFD, /* 0x70-0x73 */ + 0xF7, 0xFC, 0x00, 0x00, 0xF7, 0xFB, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xF9, 0x48, 0xF9, 0x49, 0xF9, 0x4B, /* 0x78-0x7B */ + 0xF9, 0x4A, 0x00, 0x00, 0xCA, 0x50, 0xA6, 0xE8, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xAD, 0x6E, 0xD7, 0xC5, 0xB5, 0xF7, /* 0x80-0x83 */ + 0x00, 0x00, 0xDF, 0xFA, 0xC2, 0xD0, 0x00, 0x00, /* 0x84-0x87 */ + 0xF2, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA3, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x57, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x56, /* 0x90-0x93 */ + 0x00, 0x00, 0xDB, 0xD0, 0xB5, 0xF8, 0xDB, 0xD2, /* 0x94-0x97 */ + 0xDB, 0xD1, 0x00, 0x00, 0x00, 0x00, 0xDF, 0xFB, /* 0x98-0x9B */ + 0xB8, 0xD0, 0xE4, 0x43, 0xE4, 0x46, 0xE4, 0x45, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0x44, 0xE7, 0xCE, 0xE7, 0xD0, /* 0xA0-0xA3 */ + 0xE7, 0xCF, 0x00, 0x00, 0xBF, 0xCC, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0xCB, 0x00, 0x00, /* 0xA8-0xAB */ + 0xC1, 0xBB, 0xEE, 0x79, 0xEE, 0x7B, 0xEE, 0x7A, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xC2, 0xD1, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xF2, 0xF4, 0xF2, 0xF3, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF4, 0xCC, 0xC4, 0xB1, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xC4, 0xFD, 0xF7, 0x54, 0xF7, 0x53, /* 0xBC-0xBF */ + 0xC6, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA4, 0xD0, 0xAF, /* 0xD0-0xD3 */ + 0xAD, 0x6F, 0xD7, 0xC8, 0xD7, 0xC6, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xD7, 0xC7, 0xDB, 0xD4, 0xDB, 0xD5, /* 0xD8-0xDB */ + 0xE0, 0x43, 0xDB, 0xD3, 0x00, 0x00, 0xDF, 0xFC, /* 0xDC-0xDF */ + 0xE0, 0x41, 0xE0, 0x40, 0xE0, 0x42, 0xB8, 0xD1, /* 0xE0-0xE3 */ + 0xDF, 0xFE, 0xDF, 0xFD, 0xE0, 0x44, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xE4, 0x49, 0xE4, 0x47, 0x00, 0x00, 0xE4, 0x48, /* 0xE8-0xEB */ + 0xE7, 0xD3, 0xE7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xE7, 0xD2, 0xEB, 0x7D, 0xEE, 0x7C, 0xEE, 0x7D, /* 0xF0-0xF3 */ + 0xC2, 0xD2, 0x00, 0x00, 0xF2, 0xF5, 0xF4, 0xCD, /* 0xF4-0xF7 */ + 0xC4, 0xB2, 0x00, 0x00, 0xF6, 0x4C, 0xF7, 0x55, /* 0xF8-0xFB */ + 0xC5, 0xA9, 0x00, 0x00, 0xF7, 0xFE, 0xF9, 0x4C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8A[512] = { + 0xA8, 0xA5, 0x00, 0x00, 0xAD, 0x71, 0xAD, 0x72, /* 0x00-0x03 */ + 0xD0, 0xB0, 0x00, 0x00, 0x00, 0x00, 0xD0, 0xB1, /* 0x04-0x07 */ + 0xAD, 0x70, 0x00, 0x00, 0xB0, 0x54, 0x00, 0x00, /* 0x08-0x0B */ + 0xB0, 0x52, 0x00, 0x00, 0xB0, 0x51, 0xB0, 0x58, /* 0x0C-0x0F */ + 0xB0, 0x50, 0xB0, 0x59, 0xD3, 0xDD, 0xB0, 0x56, /* 0x10-0x13 */ + 0x00, 0x00, 0xB0, 0x53, 0xB0, 0x57, 0xB0, 0x55, /* 0x14-0x17 */ + 0xB0, 0x4F, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x5F, /* 0x18-0x1B */ + 0x00, 0x00, 0xB3, 0x59, 0xD7, 0xCC, 0xB3, 0x5E, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x60, 0xB3, 0x5A, /* 0x20-0x23 */ + 0x00, 0x00, 0xB3, 0x5B, 0x00, 0x00, 0xD7, 0xCA, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x58, 0x00, 0x00, /* 0x28-0x2B */ + 0xD7, 
0xCB, 0xB3, 0x5D, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xD7, 0xC9, 0xB3, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0xB6, 0x44, 0x00, 0x00, 0xB6, 0x46, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xDB, 0xD8, 0xB6, 0x45, 0xB5, 0xF9, /* 0x38-0x3B */ + 0xB5, 0xFD, 0x00, 0x00, 0xB8, 0xE4, 0xE0, 0x49, /* 0x3C-0x3F */ + 0xDB, 0xDA, 0xB5, 0xFE, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xDB, 0xDD, 0xDB, 0xDE, 0xB6, 0x43, 0x00, 0x00, /* 0x44-0x47 */ + 0xDB, 0xE0, 0x00, 0x00, 0xDB, 0xE2, 0x00, 0x00, /* 0x48-0x4B */ + 0xDB, 0xE3, 0xDB, 0xD7, 0xDB, 0xD6, 0xDB, 0xE4, /* 0x4C-0x4F */ + 0xB6, 0x42, 0xDB, 0xE1, 0xDB, 0xDF, 0x00, 0x00, /* 0x50-0x53 */ + 0xB6, 0x40, 0xB5, 0xFB, 0xB6, 0x47, 0xDB, 0xDB, /* 0x54-0x57 */ + 0xDB, 0xDC, 0xDB, 0xD9, 0x00, 0x00, 0xB6, 0x41, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xB5, 0xFC, 0x00, 0x00, /* 0x5C-0x5F */ + 0xB5, 0xFA, 0xE0, 0x48, 0xB8, 0xDF, 0xB8, 0xDA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xD5, 0x00, 0x00, /* 0x64-0x67 */ + 0xB8, 0xE5, 0xB8, 0xD6, 0x00, 0x00, 0xB8, 0xD2, /* 0x68-0x6B */ + 0xB8, 0xE1, 0xB8, 0xDE, 0xB8, 0xE0, 0x00, 0x00, /* 0x6C-0x6F */ + 0xB8, 0xD7, 0xB8, 0xDC, 0xB8, 0xD3, 0xB8, 0xD4, /* 0x70-0x73 */ + 0xE0, 0x50, 0xE0, 0x4D, 0xE0, 0x45, 0xE0, 0x4A, /* 0x74-0x77 */ + 0x00, 0x00, 0xB8, 0xE2, 0xE0, 0x51, 0xB8, 0xE3, /* 0x78-0x7B */ + 0xB8, 0xD9, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x47, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xE0, 0x4F, 0xE0, 0x4B, 0xE0, 0x4E, /* 0x80-0x83 */ + 0xE0, 0x4C, 0xB8, 0xDD, 0xE0, 0x46, 0xB8, 0xD8, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x4C, /* 0x88-0x8B */ + 0xBB, 0x78, 0xBB, 0x7B, 0x00, 0x00, 0xE4, 0x4E, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBB, 0xA5, 0xE4, 0x4D, 0xBB, 0x7D, /* 0x90-0x93 */ + 0x00, 0x00, 0xBD, 0xCF, 0xE4, 0x4F, 0x00, 0x00, /* 0x94-0x97 */ + 0xBB, 0xA4, 0xE4, 0x4B, 0xBB, 0xA6, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0x79, 0x00, 0x00, /* 0x9C-0x9F */ + 0xB8, 0xDB, 0xBB, 0x7C, 0x00, 0x00, 0xBB, 0x7A, /* 0xA0-0xA3 */ + 0xBB, 0x7E, 0xBB, 0xA2, 0xBB, 0x77, 0xBB, 0xA7, /* 0xA4-0xA7 */ + 0xBB, 0xA3, 0x00, 0x00, 0xBB, 0xA1, 0xE4, 0x4A, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0xBD, 0xD6, 0x00, 0x00, 0xBD, 0xD2, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xD9, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xE7, 0xD6, 0xBD, 0xDA, 0xE7, 0xE2, 0xE7, 0xDB, /* 0xB8-0xBB */ + 0xBD, 0xCB, 0xE7, 0xE3, 0xE7, 0xDD, 0xBD, 0xD5, /* 0xBC-0xBF */ + 0xE7, 0xDE, 0x00, 0x00, 0xBD, 0xD4, 0xE7, 0xE1, /* 0xC0-0xC3 */ + 0xBD, 0xCE, 0xE7, 0xDF, 0xE7, 0xD5, 0xBD, 0xCD, /* 0xC4-0xC7 */ + 0xEB, 0xAA, 0xBD, 0xD3, 0x00, 0x00, 0xBD, 0xD0, /* 0xC8-0xCB */ + 0x00, 0x00, 0xBD, 0xD8, 0x00, 0x00, 0xE7, 0xD4, /* 0xCC-0xCF */ + 0x00, 0x00, 0xE7, 0xD8, 0xBD, 0xCC, 0xE7, 0xD7, /* 0xD0-0xD3 */ + 0xE7, 0xD9, 0xE7, 0xDA, 0xBD, 0xD7, 0xE7, 0xDC, /* 0xD4-0xD7 */ + 0xE7, 0xE0, 0xE7, 0xE4, 0x00, 0x00, 0xBD, 0xDB, /* 0xD8-0xDB */ + 0xBF, 0xD2, 0xEB, 0xA5, 0xEB, 0xAB, 0xEB, 0xA8, /* 0xDC-0xDF */ + 0xEB, 0x7E, 0xEB, 0xAC, 0xEB, 0xA1, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xEB, 0xA7, 0x00, 0x00, 0xBF, 0xCD, 0xBF, 0xD3, /* 0xE4-0xE7 */ + 0xEB, 0xAD, 0x00, 0x00, 0x00, 0x00, 0xBF, 0xCF, /* 0xE8-0xEB */ + 0x00, 0x00, 0xBF, 0xD9, 0xBF, 0xD4, 0xEB, 0xAF, /* 0xEC-0xEF */ + 0xEB, 0xA9, 0xBF, 0xD0, 0xEB, 0xA2, 0xBF, 0xDA, /* 0xF0-0xF3 */ + 0xEB, 0xA3, 0xEB, 0xA4, 0xBF, 0xDB, 0xBF, 0xD8, /* 0xF4-0xF7 */ + 0xBD, 0xD1, 0x00, 0x00, 0xBF, 0xCE, 0xEB, 0xB0, /* 0xF8-0xFB */ + 0xBF, 0xDC, 0x00, 0x00, 0xBF, 0xD5, 0xEB, 0xAE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8B[512] = { + 0xBF, 
0xD1, 0xBF, 0xD6, 0xBF, 0xD7, 0x00, 0x00, /* 0x00-0x03 */ + 0xC1, 0xC3, 0xEE, 0xA4, 0xEE, 0xAD, 0xEE, 0xAA, /* 0x04-0x07 */ + 0xEE, 0xAC, 0x00, 0x00, 0xC1, 0xC0, 0xEE, 0xA5, /* 0x08-0x0B */ + 0x00, 0x00, 0xEE, 0xAB, 0xC1, 0xBC, 0xEE, 0xA7, /* 0x0C-0x0F */ + 0xC1, 0xC4, 0xEE, 0xA3, 0xEE, 0xA8, 0xEE, 0xAF, /* 0x10-0x13 */ + 0xEB, 0xA6, 0xEE, 0xA9, 0xEE, 0xA2, 0xC1, 0xBD, /* 0x14-0x17 */ + 0xEE, 0xA1, 0xC1, 0xBE, 0xEE, 0xB0, 0xC1, 0xBF, /* 0x18-0x1B */ + 0xEE, 0xAE, 0xC1, 0xC2, 0xEE, 0x7E, 0x00, 0x00, /* 0x1C-0x1F */ + 0xC1, 0xC1, 0x00, 0x00, 0xEE, 0xA6, 0xF0, 0xDC, /* 0x20-0x23 */ + 0xF0, 0xEA, 0xF0, 0xE5, 0xF0, 0xE7, 0xF0, 0xDB, /* 0x24-0x27 */ + 0xC2, 0xD3, 0x00, 0x00, 0xF0, 0xDA, 0xC2, 0xD6, /* 0x28-0x2B */ + 0xC2, 0xD5, 0x00, 0x00, 0xF0, 0xE9, 0xF0, 0xE1, /* 0x2C-0x2F */ + 0xF0, 0xDE, 0xF0, 0xE4, 0x00, 0x00, 0xF0, 0xDD, /* 0x30-0x33 */ + 0x00, 0x00, 0xF0, 0xDF, 0xF0, 0xE8, 0xF0, 0xE6, /* 0x34-0x37 */ + 0x00, 0x00, 0xC2, 0xD4, 0xF0, 0xED, 0xF0, 0xEB, /* 0x38-0x3B */ + 0xF0, 0xE2, 0xF0, 0xEC, 0xF0, 0xE3, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF2, 0xF9, 0xC3, 0xCF, 0xF3, 0x41, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xF6, 0x4F, 0xC3, 0xD6, 0xF0, 0xE0, /* 0x44-0x47 */ + 0xF2, 0xF7, 0xC3, 0xD2, 0xF2, 0xF8, 0xF2, 0xFD, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xD4, 0xC3, 0xD5, /* 0x4C-0x4F */ + 0xF2, 0xF6, 0xF3, 0x40, 0xF3, 0x42, 0xF2, 0xFA, /* 0x50-0x53 */ + 0xF2, 0xFC, 0xF2, 0xFE, 0xF2, 0xFB, 0xF3, 0x43, /* 0x54-0x57 */ + 0xC3, 0xD1, 0xC3, 0xD7, 0xC3, 0xD3, 0x00, 0x00, /* 0x58-0x5B */ + 0xC3, 0xD0, 0xF4, 0xD0, 0x00, 0x00, 0xC4, 0xB7, /* 0x5C-0x5F */ + 0xF4, 0xCE, 0x00, 0x00, 0x00, 0x00, 0xF4, 0xD2, /* 0x60-0x63 */ + 0x00, 0x00, 0xF4, 0xD3, 0xC4, 0xB5, 0xF4, 0xD4, /* 0x64-0x67 */ + 0xF4, 0xD1, 0x00, 0x00, 0xF4, 0xCF, 0xC4, 0xB8, /* 0x68-0x6B */ + 0xC4, 0xB4, 0xF4, 0xD5, 0x00, 0x00, 0xC4, 0xB6, /* 0x6C-0x6F */ + 0xC4, 0xB3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xC4, 0xFE, 0x00, 0x00, 0x00, 0x00, 0xC5, 0x40, /* 0x74-0x77 */ + 0xF6, 0x4E, 0xF6, 0x4D, 0xF6, 0x50, 0xF6, 0x51, /* 0x78-0x7B */ + 0x00, 0x00, 0xC5, 0x41, 0xF7, 0x56, 0xF7, 0x5B, /* 0x7C-0x7F */ + + 0xC5, 0xAA, 0x00, 0x00, 0xF7, 0x58, 0x00, 0x00, /* 0x80-0x83 */ + 0xF7, 0x57, 0xF7, 0x5A, 0xF7, 0x59, 0x00, 0x00, /* 0x84-0x87 */ + 0xF8, 0x43, 0x00, 0x00, 0xC5, 0xDC, 0xF8, 0x42, /* 0x88-0x8B */ + 0xF8, 0x40, 0x00, 0x00, 0xF8, 0x41, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0xFE, 0xC5, 0xFD, /* 0x90-0x93 */ + 0xF8, 0xC1, 0xF8, 0xC2, 0xC6, 0x40, 0x00, 0x00, /* 0x94-0x97 */ + 0xF9, 0x4D, 0xF9, 0x4E, 0xC6, 0x67, 0x00, 0x00, /* 0x98-0x9B */ + 0xC6, 0x6D, 0x00, 0x00, 0xF9, 0xA9, 0xF9, 0xC8, /* 0x9C-0x9F */ +}; + +static const unsigned char u2c_8C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xA6, /* 0x34-0x37 */ + 0x00, 0x00, 0xD7, 0xCD, 0x00, 0x00, 0xD7, 0xCE, /* 0x38-0x3B */ + 0xE0, 0x52, 0xE4, 0x50, 0xE7, 0xE5, 0xC1, 0xC6, /* 0x3C-0x3F */ + 0x00, 0x00, 0xC1, 0xC5, 0xF0, 0xEE, 0xF3, 0x44, /* 0x40-0x43 */ + 0x00, 0x00, 0xF8, 0x44, 0xA8, 0xA7, 0xD3, 0xDE, /* 0x44-0x47 */ + 0xB0, 0x5A, 0xB3, 0x61, 0xE0, 0x54, 0xE0, 0x53, /* 0x48-0x4B */ + 0xBD, 0xDC, 0xE7, 0xE6, 0xBD, 0xDD, 0xEE, 0xB1, /* 0x4C-0x4F */ + 0xC2, 0xD7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0xC6, 0x76, 0xA8, 0xA8, 0xCD, 0xCB, 0xD3, 0xDF, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0x62, 0x00, 0x00, /* 0x58-0x5B */ + 0xD7, 0xCF, 0xD7, 0xD0, 0x00, 0x00, 0xDB, 0xE5, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB6, 0x48, 0xB8, 0xE6, 0x00, 0x00, /* 0x60-0x63 */ + 0xE0, 0x56, 0xE0, 0x55, 0xE0, 0x57, 0x00, 0x00, /* 0x64-0x67 */ + 0xE4, 0x51, 0xE4, 0x52, 0xBB, 0xA8, 0xBF, 0xDD, /* 0x68-0x6B */ + 0xBD, 0xDE, 0xBF, 0xDE, 0x00, 0x00, 0xEE, 0xB5, /* 0x6C-0x6F */ + 0xEE, 0xB2, 0xEE, 0xB4, 0xEE, 0xB3, 0xC1, 0xC7, /* 0x70-0x73 */ + 0x00, 0x00, 0xF0, 0xEF, 0xF3, 0x46, 0xF3, 0x45, /* 0x74-0x77 */ + 0xCB, 0xA4, 0xB0, 0x5C, 0xB0, 0x5B, 0xD3, 0xE0, /* 0x78-0x7B */ + 0x00, 0x00, 0xD7, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDB, 0xE7, 0xDB, 0xE6, 0xB6, 0x49, 0x00, 0x00, /* 0x80-0x83 */ + 0xE0, 0x59, 0xE0, 0x5A, 0xE0, 0x58, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xB8, 0xE8, 0xB8, 0xE7, 0x00, 0x00, /* 0x88-0x8B */ + 0xBB, 0xAA, 0xBB, 0xA9, 0x00, 0x00, 0xE7, 0xE7, /* 0x8C-0x8F */ + 0xEB, 0xB3, 0xEB, 0xB1, 0xEB, 0xB2, 0xBF, 0xDF, /* 0x90-0x93 */ + 0xEE, 0xB7, 0xEE, 0xB6, 0x00, 0x00, 0xF0, 0xF2, /* 0x94-0x97 */ + 0xF0, 0xF1, 0xF0, 0xF0, 0xF3, 0x47, 0x00, 0x00, /* 0x98-0x9B */ + 0xF9, 0xAA, 0xA8, 0xA9, 0xAD, 0x73, 0x00, 0x00, /* 0x9C-0x9F */ + 0xAD, 0x74, 0xB0, 0x5D, 0xB0, 0x5E, 0xD3, 0xE2, /* 0xA0-0xA3 */ + 0xD3, 0xE1, 0xD7, 0xD2, 0x00, 0x00, 0xB3, 0x68, /* 0xA4-0xA7 */ + 0xB3, 0x66, 0xB3, 0x63, 0xB3, 0x67, 0xB3, 0x65, /* 0xA8-0xAB */ + 0xB3, 0x64, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x4A, /* 0xAC-0xAF */ + 0xDB, 0xEA, 0x00, 0x00, 0xB8, 0xED, 0xB6, 0x4C, /* 0xB0-0xB3 */ + 0xB6, 0x51, 0xDB, 0xEC, 0xB6, 0x53, 0xB6, 0x52, /* 0xB4-0xB7 */ + 0xB6, 0x55, 0xDB, 0xEB, 0xDB, 0xE8, 0xB6, 0x4F, /* 0xB8-0xBB */ + 0xB6, 0x4B, 0xB6, 0x4D, 0xDB, 0xE9, 0xB6, 0x54, /* 0xBC-0xBF */ + 0xB6, 0x50, 0xB6, 0x4E, 0xB8, 0xEF, 0xB8, 0xEE, /* 0xC0-0xC3 */ + 0xB8, 0xEC, 0xB8, 0xF0, 0x00, 0x00, 0xB8, 0xEA, /* 0xC4-0xC7 */ + 0xB8, 0xEB, 0x00, 0x00, 0xB8, 0xE9, 0x00, 0x00, /* 0xC8-0xCB */ + 0xE0, 0x5B, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x54, /* 0xCC-0xCF */ + 0x00, 0x00, 0xBB, 0xAC, 0xBB, 0xAD, 0xBB, 0xAB, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE4, 0x53, 0x00, 0x00, 0xE4, 0x55, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xE7, 0xEA, 0xE7, 0xEC, 0x00, 0x00, /* 0xD8-0xDB */ + 0xBD, 0xE7, 0xE7, 0xED, 0xBD, 0xE0, 0xE7, 0xE9, /* 0xDC-0xDF */ + 0xBD, 0xDF, 0xBD, 0xE9, 0xBD, 0xE5, 0xBD, 0xE6, /* 0xE0-0xE3 */ + 0xBD, 0xE2, 0xE7, 0xE8, 0xBD, 0xE1, 0xE7, 0xEE, /* 0xE4-0xE7 */ + 0xE7, 0xEB, 0x00, 0x00, 0xBD, 0xE8, 0x00, 0x00, /* 0xE8-0xEB */ + 0xBD, 0xE3, 0xBD, 0xE4, 0xEB, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEB, 0xB7, 0xEB, 0xB6, 0x00, 0x00, 0xEB, 0xB8, /* 0xF0-0xF3 */ + 0xBF, 0xE0, 0xEB, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xC1, 0xCB, 0xEE, 0xB8, 0xC1, 0xC8, 0xC1, 0xCC, /* 0xF8-0xFB */ + 0xC1, 0xCA, 0xC1, 0xC9, 0xF0, 0xF3, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8D[512] = { + 0xF0, 0xF6, 0x00, 0x00, 0xF0, 0xF5, 0x00, 0x00, /* 0x00-0x03 */ + 0xF0, 0xF4, 0xC2, 0xD8, 0xF3, 0x48, 0xF3, 0x49, /* 0x04-0x07 */ + 0xC3, 
0xD8, 0xF3, 0x4A, 0xC3, 0xD9, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xC4, 0xBA, 0x00, 0x00, 0xC4, 0xB9, /* 0x0C-0x0F */ + 0xF6, 0x52, 0x00, 0x00, 0x00, 0x00, 0xC5, 0x42, /* 0x10-0x13 */ + 0xF6, 0x53, 0xF7, 0x5C, 0xC5, 0xAB, 0xC5, 0xAC, /* 0x14-0x17 */ + 0x00, 0x00, 0xF8, 0x45, 0x00, 0x00, 0xC6, 0x42, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA8, 0xAA, 0x00, 0x00, 0xB3, 0x6A, 0xB3, 0x69, /* 0x64-0x67 */ + 0xE0, 0x5C, 0xE0, 0x5D, 0x00, 0x00, 0xBB, 0xAE, /* 0x68-0x6B */ + 0xEB, 0xB9, 0xBD, 0xEA, 0xEB, 0xBA, 0xEE, 0xB9, /* 0x6C-0x6F */ + 0xA8, 0xAB, 0x00, 0x00, 0xD0, 0xB2, 0xAD, 0x76, /* 0x70-0x73 */ + 0xAD, 0x75, 0x00, 0x00, 0xD3, 0xE3, 0xB0, 0x5F, /* 0x74-0x77 */ + 0xD3, 0xE4, 0xD7, 0xD5, 0x00, 0x00, 0xD7, 0xD4, /* 0x78-0x7B */ + 0x00, 0x00, 0xD7, 0xD3, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDB, 0xEE, 0xB6, 0x58, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0xDB, 0xED, 0xB6, 0x57, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0xDB, 0xEF, 0xB6, 0x56, 0x00, 0x00, /* 0x88-0x8B */ + 0xE0, 0x5F, 0xE0, 0x62, 0xE0, 0x60, 0xE0, 0x61, /* 0x8C-0x8F */ + 0xE0, 0x65, 0xE0, 0x5E, 0xE0, 0x66, 0xE0, 0x63, /* 0x90-0x93 */ + 0xE0, 0x64, 0xBB, 0xB0, 0xE4, 0x56, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xBB, 0xAF, 0x00, 0x00, 0xE7, 0xF2, /* 0x98-0x9B */ + 0xE7, 0xF0, 0x00, 0x00, 0x00, 0x00, 0xBD, 0xEB, /* 0x9C-0x9F */ + 0xE7, 0xEF, 0xE7, 0xF1, 0x00, 0x00, 0xBD, 0xEC, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xEB, 0xBB, 0x00, 0x00, 0xEB, 0xBC, /* 0xA4-0xA7 */ + 0xC1, 0xCD, 0x00, 0x00, 0xF3, 0x4C, 0xF3, 0x4E, /* 0xA8-0xAB */ + 0xF3, 0x4B, 0xF3, 0x4D, 0xF4, 0xD6, 0xF6, 0x54, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0x6F, 0xA8, 0xAC, /* 0xB0-0xB3 */ + 0xAD, 0x77, 0xD3, 0xE5, 0xD3, 0xE7, 0xD3, 0xE6, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xD7, 0xD8, 0xB3, 0x6C, 0x00, 0x00, /* 0xB8-0xBB */ + 0xD7, 0xD6, 0x00, 0x00, 0xB3, 0x6B, 0xD7, 0xD9, /* 0xBC-0xBF */ + 0x00, 0x00, 0xD7, 0xDA, 0xD7, 0xD7, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xDB, 0xFB, 0xB6, 0x60, 0xDB, 0xF3, /* 0xC4-0xC7 */ + 0xDB, 0xF9, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x5B, /* 0xC8-0xCB */ + 0xB6, 0x5E, 0xDB, 0xF2, 0xB6, 0x59, 0xDB, 0xF6, /* 0xCC-0xCF */ + 0xE0, 0x6C, 0xB6, 0x5D, 0x00, 0x00, 0xDB, 0xF1, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xDB, 0xF7, 0xDB, 0xF4, 0xDB, 0xFA, /* 0xD4-0xD7 */ + 0xDB, 0xF0, 0xDB, 0xF8, 0xB6, 0x5C, 0xB6, 0x5F, /* 0xD8-0xDB */ + 0xDB, 0xF5, 0xB6, 0x5A, 0x00, 0x00, 0xB8, 0xF2, /* 0xDC-0xDF 
*/ + 0xE0, 0x68, 0xB8, 0xF1, 0xE0, 0x6F, 0xE0, 0x6E, /* 0xE0-0xE3 */ + 0xB8, 0xF8, 0x00, 0x00, 0xB8, 0xF9, 0xE0, 0x70, /* 0xE4-0xE7 */ + 0xB8, 0xF3, 0xE0, 0x6D, 0xB8, 0xF7, 0xE0, 0x72, /* 0xE8-0xEB */ + 0xE0, 0x69, 0x00, 0x00, 0xE0, 0x6B, 0xB8, 0xF4, /* 0xEC-0xEF */ + 0xE0, 0x67, 0xE0, 0x6A, 0xE0, 0x71, 0xB8, 0xF5, /* 0xF0-0xF3 */ + 0xE0, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xF6, 0x00, 0x00, /* 0xF8-0xFB */ + 0xBB, 0xB1, 0xE4, 0x5B, 0xE4, 0x61, 0xE4, 0x59, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8E[512] = { + 0xE4, 0x62, 0x00, 0x00, 0xE4, 0x58, 0xE4, 0x5D, /* 0x00-0x03 */ + 0xE4, 0x63, 0xE4, 0x60, 0xE4, 0x5F, 0xE4, 0x5E, /* 0x04-0x07 */ + 0x00, 0x00, 0xE4, 0x57, 0xE4, 0x5C, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0xE4, 0x5A, 0x00, 0x00, 0xBD, 0xF1, /* 0x0C-0x0F */ + 0xBD, 0xEE, 0xE7, 0xFB, 0xE8, 0x41, 0xE8, 0x43, /* 0x10-0x13 */ + 0xE8, 0x40, 0xE7, 0xF8, 0xE7, 0xFA, 0xE8, 0x45, /* 0x14-0x17 */ + 0xE8, 0x42, 0xE7, 0xFC, 0xE8, 0x46, 0xE7, 0xF9, /* 0x18-0x1B */ + 0xE8, 0x44, 0xBD, 0xEF, 0xBD, 0xF5, 0xBD, 0xF3, /* 0x1C-0x1F */ + 0xE7, 0xF3, 0xBD, 0xF4, 0xBD, 0xF0, 0xE7, 0xF4, /* 0x20-0x23 */ + 0xE7, 0xF6, 0xE7, 0xF5, 0xE7, 0xFD, 0xE7, 0xFE, /* 0x24-0x27 */ + 0x00, 0x00, 0xBD, 0xF2, 0x00, 0x00, 0xBD, 0xED, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xE7, 0xF7, 0x00, 0x00, /* 0x2C-0x2F */ + 0xEB, 0xC6, 0xBF, 0xE2, 0x00, 0x00, 0xEB, 0xBD, /* 0x30-0x33 */ + 0xBF, 0xE3, 0xBF, 0xE6, 0xEB, 0xC2, 0x00, 0x00, /* 0x34-0x37 */ + 0xEB, 0xBF, 0xBF, 0xE5, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xEB, 0xC3, 0xEB, 0xC4, 0xEB, 0xBE, 0xEB, 0xC7, /* 0x3C-0x3F */ + 0xEB, 0xC0, 0xEB, 0xC5, 0xBF, 0xE4, 0x00, 0x00, /* 0x40-0x43 */ + 0xBF, 0xE1, 0xEB, 0xC1, 0x00, 0x00, 0xEE, 0xBF, /* 0x44-0x47 */ + 0xC1, 0xD0, 0xC1, 0xCE, 0xC1, 0xD1, 0xC1, 0xCF, /* 0x48-0x4B */ + 0xEE, 0xBE, 0xEE, 0xBB, 0xEE, 0xBA, 0x00, 0x00, /* 0x4C-0x4F */ + 0xEE, 0xBD, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xBC, /* 0x50-0x53 */ + 0xF1, 0x45, 0xC2, 0xDE, 0xF0, 0xFB, 0xF0, 0xFA, /* 0x54-0x57 */ + 0x00, 0x00, 0xC2, 0xD9, 0xF1, 0x41, 0xF1, 0x40, /* 0x58-0x5B */ + 0xF0, 0xF7, 0xF1, 0x43, 0xF0, 0xFC, 0xC2, 0xDD, /* 0x5C-0x5F */ + 0xF0, 0xF9, 0xF1, 0x42, 0xF0, 0xF8, 0xC2, 0xDA, /* 0x60-0x63 */ + 0xC2, 0xDC, 0xF0, 0xFD, 0xC2, 0xDB, 0xF0, 0xFE, /* 0x64-0x67 */ + 0x00, 0x00, 0xF1, 0x44, 0xF3, 0x52, 0x00, 0x00, /* 0x68-0x6B */ + 0xC3, 0xDE, 0xF3, 0x4F, 0x00, 0x00, 0xF3, 0x53, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xC3, 0xDB, 0xF3, 0x51, /* 0x70-0x73 */ + 0xC3, 0xE0, 0x00, 0x00, 0xC3, 0xDD, 0x00, 0x00, /* 0x74-0x77 */ + 0xF3, 0x50, 0x00, 0x00, 0xC3, 0xDF, 0xF3, 0x54, /* 0x78-0x7B */ + 0xC3, 0xDA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0xC4, 0xBC, 0xC4, 0xBE, 0x00, 0x00, /* 0x80-0x83 */ + 0xF4, 0xD9, 0xC4, 0xBD, 0xF4, 0xD7, 0xC3, 0xDC, /* 0x84-0x87 */ + 0xF4, 0xD8, 0xC4, 0xBB, 0xC5, 0x43, 0xC5, 0x45, /* 0x88-0x8B */ + 0xF6, 0x56, 0xC5, 0x44, 0xF6, 0x55, 0x00, 0x00, /* 0x8C-0x8F */ + 0xF7, 0x61, 0xC5, 0xAD, 0xF7, 0x60, 0xC5, 0xAE, /* 0x90-0x93 */ + 0xF7, 0x5E, 0xF7, 0x5D, 0xF7, 0x62, 0xF7, 0x63, /* 0x94-0x97 */ + 0xF8, 0x46, 0x00, 0x00, 0xF7, 0x5F, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0xF8, 0xC6, 0xF8, 0xC3, 0xF8, 0xC4, /* 0x9C-0x9F */ + 0xF8, 0xC5, 0xC6, 0x5C, 0x00, 0x00, 0xF9, 0x51, /* 0xA0-0xA3 */ + 0xF9, 0x50, 0xF9, 0x4F, 0xF9, 0x70, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF9, 0xBE, 0xF9, 0xAB, 0xC6, 0x6E, 0xA8, 0xAD, /* 0xA8-0xAB */ + 0xB0, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xB8, 0xFA, 0x00, 0x00, /* 0xB0-0xB3 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xF6, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xEB, 0xC8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xC2, 0xDF, 0x00, 0x00, 0xF3, 0x55, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF9, 0xAC, 0xA8, 0xAE, 0xAA, 0xEE, /* 0xC8-0xCB */ + 0xAD, 0x79, 0xAD, 0x78, 0x00, 0x00, 0xB0, 0x63, /* 0xCC-0xCF */ + 0x00, 0x00, 0xD3, 0xE8, 0xB0, 0x61, 0xD3, 0xE9, /* 0xD0-0xD3 */ + 0xB0, 0x62, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xDF, /* 0xD4-0xD7 */ + 0xD7, 0xDB, 0x00, 0x00, 0x00, 0x00, 0xB3, 0x6D, /* 0xD8-0xDB */ + 0xD7, 0xDE, 0xD7, 0xDD, 0xD7, 0xDC, 0xB3, 0x6E, /* 0xDC-0xDF */ + 0xD7, 0xE0, 0xD7, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xDC, 0x43, 0xDC, 0x41, 0xDC, 0x45, /* 0xE4-0xE7 */ + 0xDC, 0x46, 0xDC, 0x4C, 0x00, 0x00, 0xDC, 0x48, /* 0xE8-0xEB */ + 0xDC, 0x4A, 0x00, 0x00, 0xDC, 0x42, 0xDB, 0xFC, /* 0xEC-0xEF */ + 0x00, 0x00, 0xDC, 0x49, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xDC, 0x4B, 0xDC, 0x44, 0xDC, 0x47, 0xDB, 0xFD, /* 0xF4-0xF7 */ + 0xB6, 0x62, 0xDC, 0x40, 0xDB, 0xFE, 0xB6, 0x61, /* 0xF8-0xFB */ + 0xB6, 0x63, 0x00, 0x00, 0xB8, 0xFD, 0xE0, 0x75, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_8F[512] = { + 0xE0, 0x77, 0xE0, 0x76, 0xE0, 0x7B, 0xB8, 0xFB, /* 0x00-0x03 */ + 0x00, 0x00, 0xE0, 0x78, 0xE0, 0x74, 0xE0, 0x79, /* 0x04-0x07 */ + 0xE0, 0x7A, 0xB8, 0xFC, 0xB8, 0xFE, 0xE0, 0x7C, /* 0x08-0x0B */ + 0x00, 0x00, 0xE4, 0x67, 0xE4, 0x66, 0x00, 0x00, /* 0x0C-0x0F */ + 0xE4, 0x64, 0xE4, 0x65, 0xBB, 0xB3, 0xBB, 0xB5, /* 0x10-0x13 */ + 0xBB, 0xB2, 0xBB, 0xB4, 0xE8, 0x4D, 0xE8, 0x4E, /* 0x14-0x17 */ + 0xE8, 0x49, 0x00, 0x00, 0xE8, 0x4A, 0xBD, 0xF8, /* 0x18-0x1B */ + 0xBD, 0xFD, 0xBD, 0xF7, 0xBD, 0xFE, 0xBD, 0xF9, /* 0x1C-0x1F */ + 0xE8, 0x4B, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x4C, /* 0x20-0x23 */ + 0xE8, 0x48, 0xBE, 0x40, 0xBD, 0xFB, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0xBD, 0xFA, 0xBD, 0xFC, 0x00, 0x00, /* 0x28-0x2B */ + 0xE8, 0x47, 0x00, 0x00, 0xEB, 0xCA, 0xBF, 0xE8, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xCC, 0xBF, 0xEA, /* 0x30-0x33 */ + 0xEB, 0xCF, 0xEB, 0xCB, 0xEB, 0xC9, 0xEB, 0xCE, /* 0x34-0x37 */ + 0xBF, 0xE9, 0xEB, 0xCD, 0x00, 0x00, 0xBF, 0xE7, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xC1, 0xD3, 0xC1, 0xD6, /* 0x3C-0x3F */ + 0xEE, 0xC1, 0x00, 0x00, 0xC1, 0xD4, 0xEE, 0xC0, /* 0x40-0x43 */ + 0xC1, 0xD2, 0xC1, 0xD5, 0xF1, 0x46, 0xF1, 0x47, /* 0x44-0x47 */ + 0xF1, 0x48, 0xC2, 0xE0, 0x00, 0x00, 0xF1, 0x49, /* 0x48-0x4B */ + 0x00, 0x00, 0xC2, 0xE1, 0xC3, 0xE2, 0xF3, 0x58, /* 0x4C-0x4F */ + 0xF3, 0x59, 0xF3, 0x57, 0xF3, 0x56, 0xF3, 0x5A, /* 0x50-0x53 */ + 0xC3, 0xE1, 0xF4, 0xDD, 0xF4, 0xDB, 0xF4, 0xDC, /* 0x54-0x57 */ + 0xF4, 0xDE, 0xF4, 0xDA, 0xF4, 0xDF, 0xF6, 0x58, /* 0x58-0x5B */ + 0x00, 0x00, 0xF6, 0x59, 0xF6, 0x57, 0xC5, 0x46, /* 0x5C-0x5F */ + 0xF7, 0x64, 0xC5, 0xAF, 0xF7, 0x65, 0xF8, 0x48, /* 0x60-0x63 */ + 0xF8, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0xAF, /* 0x98-0x9B */ + 0xB6, 0x64, 0x00, 0x00, 0x00, 0x00, 0xB9, 0x40, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBB, 0xB6, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0xBF, 0xEC, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBF, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xC3, 0xE3, 0xC4, 0x7C, 0xC5, 0x47, /* 0xAC-0xAF */ + 0xA8, 0xB0, 0xB0, 0x64, 0xB9, 0x41, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xF3, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCB, 0xA6, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xA8, 0xB4, 0xA8, 0xB3, 0xA8, 0xB2, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xCB, 0xA5, 0x00, 0x00, 0xCD, 0xCD, /* 0xC8-0xCB */ + 0x00, 0x00, 0xCD, 0xCF, 0xAA, 0xEF, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0xAA, 0xF1, 0xCD, 0xCC, 0xCD, 0xCE, /* 0xD0-0xD3 */ + 0xAA, 0xF0, 0xCD, 0xD1, 0xCD, 0xD0, 0xCD, 0xD2, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xD0, 0xB6, 0xD0, 0xB4, 0xAD, 0x7C, 0xD0, 0xB3, /* 0xE0-0xE3 */ + 0xAD, 0xA3, 0xAD, 0x7E, 0xAD, 0x7B, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAD, 0xA4, 0x00, 0x00, 0xAD, 0x7D, 0xAD, 0xA2, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAD, 0xA1, 0xD0, 0xB5, 0x00, 0x00, /* 0xEC-0xEF */ + 0xAD, 0x7A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xB0, 0x6A, 0xD3, 0xEB, 0xD3, 0xF1, 0xB0, 0x67, /* 0xF4-0xF7 */ + 0xB0, 0x6E, 0x00, 0x00, 0xB0, 0x69, 0xD3, 0xEE, /* 0xF8-0xFB */ + 0xD3, 0xF0, 0xB0, 0x6C, 0xD3, 0xEA, 0xD3, 0xED, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_90[512] = { + 0xB0, 0x68, 0xB0, 0x65, 0xD3, 0xEC, 0xB0, 0x6B, /* 0x00-0x03 */ + 0xD3, 0xEF, 0xB0, 0x6D, 0xB0, 0x66, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD7, 0xE3, /* 0x08-0x0B */ + 0xD7, 0xE6, 0xB3, 0x70, 0x00, 0x00, 0xB3, 0x7A, /* 0x0C-0x0F */ + 0xB3, 0x76, 0xD7, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xB3, 0x7E, 0xB3, 0x77, 0xB3, 0x7C, 0xB3, 0x72, /* 0x14-0x17 */ + 0x00, 0x00, 0xB3, 0x6F, 0xB3, 0x71, 0xB3, 0x7D, /* 0x18-0x1B */ + 0xD7, 0xE5, 0xB3, 0x75, 0xB3, 0x78, 0xB3, 0x74, /* 0x1C-0x1F */ + 0xB3, 0x79, 0xD7, 0xE7, 0xB3, 0x7B, 0xB3, 0x73, /* 0x20-0x23 */ + 0xD7, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xDC, 0x4D, 0xB6, 0x65, 0xDC, 0x4F, /* 0x2C-0x2F */ + 0x00, 0x00, 0xB6, 0x67, 0xB6, 0x69, 0x00, 0x00, /* 0x30-0x33 */ + 0xDC, 0x4E, 0xB6, 0x66, 0xB6, 0x6A, 0x00, 0x00, /* 0x34-0x37 */ + 0xB6, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xB9, 0x47, 0xE0, 0xA3, 0xB9, 0x4F, 0xE0, 0x7E, /* 0x3C-0x3F */ + 0x00, 0x00, 0xB9, 0x50, 0xB9, 0x45, 0x00, 0x00, /* 0x40-0x43 */ + 0xE0, 0xA1, 0x00, 0x00, 0x00, 0x00, 0xB9, 0x4A, /* 0x44-0x47 */ + 0x00, 0x00, 0xE0, 0xA2, 0xB9, 0x43, 0xB9, 0x42, /* 0x48-0x4B */ + 0x00, 0x00, 0xB9, 0x4D, 0xB9, 0x4C, 0xB9, 0x4B, /* 0x4C-0x4F */ + 0xB9, 0x49, 0xB9, 0x4E, 0xE0, 0x7D, 0xB9, 0x44, /* 0x50-0x53 */ + 0xB9, 0x46, 0xB9, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xBB, 0xB8, 0xBB, 0xBB, 0x00, 0x00, 0xBB, 0xBF, /* 0x58-0x5B */ + 0xBB, 
0xB9, 0xBB, 0xBE, 0xBB, 0xBC, 0x00, 0x00, /* 0x5C-0x5F */ + 0xBB, 0xB7, 0x00, 0x00, 0xBB, 0xBD, 0xBB, 0xBA, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x52, /* 0x64-0x67 */ + 0xBE, 0x43, 0xBE, 0x41, 0x00, 0x00, 0xE8, 0x53, /* 0x68-0x6B */ + 0x00, 0x00, 0xBE, 0x44, 0xBE, 0x42, 0xE8, 0x51, /* 0x6C-0x6F */ + 0xE8, 0x50, 0x00, 0x00, 0xBF, 0xF0, 0xE8, 0x4F, /* 0x70-0x73 */ + 0xBF, 0xEE, 0xBF, 0xED, 0xEB, 0xD0, 0xBE, 0x45, /* 0x74-0x77 */ + 0xBF, 0xEF, 0xEB, 0xD1, 0xBF, 0xF2, 0xEB, 0xD2, /* 0x78-0x7B */ + 0xBF, 0xF1, 0xC1, 0xD8, 0xEE, 0xC3, 0xC1, 0xD7, /* 0x7C-0x7F */ + + 0xC1, 0xDC, 0xC1, 0xDA, 0xC1, 0xDB, 0xC2, 0xE3, /* 0x80-0x83 */ + 0xC1, 0xD9, 0xEE, 0xC2, 0xEB, 0xD3, 0xC2, 0xE2, /* 0x84-0x87 */ + 0xC2, 0xE4, 0x00, 0x00, 0xC3, 0xE4, 0xC3, 0xE5, /* 0x88-0x8B */ + 0x00, 0x00, 0xF4, 0xE0, 0x00, 0x00, 0xC5, 0xDE, /* 0x8C-0x8F */ + 0xC5, 0xDD, 0xA8, 0xB6, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xCA, 0x55, 0xB0, 0x6F, 0x00, 0x00, 0xCA, 0x52, /* 0x94-0x97 */ + 0xCA, 0x53, 0xCA, 0x51, 0x00, 0x00, 0xCA, 0x54, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xCB, 0xAA, 0xCB, 0xA7, /* 0x9C-0x9F */ + 0xCB, 0xAC, 0xCB, 0xA8, 0xA8, 0xB7, 0xA8, 0xBA, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xCB, 0xA9, 0xA8, 0xB9, 0xCB, 0xAB, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0xA8, 0xB8, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xD5, /* 0xAC-0xAF */ + 0xCD, 0xD7, 0xAA, 0xF4, 0xCD, 0xD3, 0xCD, 0xD6, /* 0xB0-0xB3 */ + 0xCD, 0xD4, 0xAA, 0xF2, 0xAA, 0xF5, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xAA, 0xF3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0xD0, 0xB8, 0xD0, 0xBC, 0xD0, 0xB9, /* 0xBC-0xBF */ + 0x00, 0x00, 0xAD, 0xA7, 0x00, 0x00, 0xAD, 0xA8, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xD0, 0xBB, 0x00, 0x00, 0xD0, 0xBD, /* 0xC4-0xC7 */ + 0xD0, 0xBF, 0x00, 0x00, 0xAD, 0xA5, 0xD0, 0xBE, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xA6, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xD7, 0xEE, 0xD0, 0xBA, 0xD3, 0xF2, 0xD3, 0xFB, /* 0xD4-0xD7 */ + 0xD3, 0xF9, 0xD3, 0xF4, 0xD3, 0xF5, 0xD3, 0xFA, /* 0xD8-0xDB */ + 0xD3, 0xFC, 0xB0, 0x71, 0x00, 0x00, 0xD3, 0xF7, /* 0xDC-0xDF */ + 0xD3, 0xF3, 0xB0, 0x70, 0xB0, 0x72, 0xD3, 0xF6, /* 0xE0-0xE3 */ + 0xD3, 0xFD, 0xD3, 0xF8, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xB3, 0xA1, 0xD7, 0xF1, 0xD7, 0xE9, 0xD7, 0xEF, /* 0xE8-0xEB */ + 0xD7, 0xF0, 0xB3, 0xA2, 0x00, 0x00, 0xD7, 0xE8, /* 0xEC-0xEF */ + 0xD7, 0xEA, 0xD0, 0xB7, 0xD7, 0xEC, 0xD7, 0xED, /* 0xF0-0xF3 */ + 0xD7, 0xEB, 0xB6, 0x6C, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDC, 0x56, 0xEB, 0xD4, 0xDC, 0x57, /* 0xF8-0xFB */ + 0xDC, 0x54, 0xB3, 0xA3, 0xB6, 0x6E, 0xDC, 0x53, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_91[512] = { + 0xDC, 0x59, 0xDC, 0x58, 0xB6, 0x6B, 0xDC, 0x5C, /* 0x00-0x03 */ + 0xDC, 0x52, 0xDC, 0x5B, 0xDC, 0x50, 0xDC, 0x5A, /* 0x04-0x07 */ + 0xDC, 0x55, 0xB6, 0x6D, 0x00, 0x00, 0xE0, 0xAA, /* 0x08-0x0B */ + 0x00, 0x00, 0xE0, 0xA5, 0xE0, 0xAB, 0xE0, 0xA6, /* 0x0C-0x0F */ + 0xE0, 0xA4, 0xE0, 0xA7, 0xB9, 0x51, 0x00, 0x00, /* 0x10-0x13 */ + 0xE0, 0xA9, 0x00, 0x00, 0xE0, 0xA8, 0xB9, 0x52, /* 0x14-0x17 */ + 0xBB, 0xC1, 0xBB, 0xC0, 0xE4, 0x6E, 0xE4, 0x71, /* 0x18-0x1B */ + 0xE4, 0x69, 0xE4, 0x6D, 0xBB, 0xC2, 0xE4, 0x6C, /* 0x1C-0x1F */ + 0xE4, 0x6A, 0xE4, 0x70, 0xE4, 0x6B, 0xE4, 0x68, /* 0x20-0x23 */ + 0xE4, 0x6F, 0x00, 0x00, 0xE8, 0x59, 0xBE, 0x48, /* 0x24-0x27 */ + 0xF1, 0x4A, 0xE8, 0x56, 0xE8, 0x57, 0xE8, 0x55, /* 0x28-0x2B */ + 0xDC, 0x51, 0xBE, 0x47, 0xE8, 0x5A, 0xE8, 0x54, /* 0x2C-0x2F */ + 0xBE, 
0x46, 0xBE, 0x49, 0xE8, 0x58, 0xEB, 0xD5, /* 0x30-0x33 */ + 0xBF, 0xF3, 0xEB, 0xD6, 0xEB, 0xD7, 0x00, 0x00, /* 0x34-0x37 */ + 0xEE, 0xC4, 0xC1, 0xDD, 0xF1, 0x4B, 0xF1, 0x4C, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0x4D, 0xF3, 0x5D, /* 0x3C-0x3F */ + 0xF3, 0x5C, 0xF4, 0xE2, 0x00, 0x00, 0xF4, 0xE1, /* 0x40-0x43 */ + 0xF6, 0x5B, 0xF6, 0x5C, 0xF6, 0x5A, 0xF7, 0x66, /* 0x44-0x47 */ + 0xC5, 0xB0, 0xA8, 0xBB, 0xAD, 0xAA, 0xAD, 0xA9, /* 0x48-0x4B */ + 0xB0, 0x75, 0xB0, 0x74, 0xD4, 0x40, 0xD4, 0x41, /* 0x4C-0x4F */ + 0xD3, 0xFE, 0x00, 0x00, 0xB0, 0x73, 0xD7, 0xF5, /* 0x50-0x53 */ + 0x00, 0x00, 0xD7, 0xF6, 0xD7, 0xF2, 0xB3, 0xA4, /* 0x54-0x57 */ + 0xD7, 0xF3, 0x00, 0x00, 0xD7, 0xF4, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xDC, 0x5F, /* 0x5C-0x5F */ + 0xDC, 0x61, 0xDC, 0x5D, 0xDC, 0x60, 0xB6, 0x6F, /* 0x60-0x63 */ + 0xDC, 0x5E, 0xB6, 0x70, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xDD, 0x73, 0xB9, 0x55, 0xB9, 0x54, 0x00, 0x00, /* 0x68-0x6B */ + 0xB9, 0x53, 0x00, 0x00, 0xE0, 0xAC, 0xE0, 0xAD, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x73, 0xE4, 0x75, /* 0x70-0x73 */ + 0xBB, 0xC6, 0xBB, 0xC3, 0x00, 0x00, 0xBB, 0xC5, /* 0x74-0x77 */ + 0xBB, 0xC4, 0xE4, 0x74, 0xE4, 0x72, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xE8, 0x61, 0xE8, 0x5E, 0xE8, 0x5F, 0xBE, 0x4D, /* 0x80-0x83 */ + 0xE8, 0x60, 0xE8, 0x5B, 0xE8, 0x5C, 0xBE, 0x4A, /* 0x84-0x87 */ + 0x00, 0x00, 0xBE, 0x4B, 0xE8, 0x5D, 0xBE, 0x4C, /* 0x88-0x8B */ + 0x00, 0x00, 0xEB, 0xDB, 0x00, 0x00, 0xEB, 0xDC, /* 0x8C-0x8F */ + 0xEB, 0xD9, 0xEB, 0xDA, 0xBF, 0xF4, 0xEB, 0xD8, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0xEE, 0xC8, 0xEE, 0xC5, 0xEE, 0xC7, /* 0x98-0x9B */ + 0xC1, 0xE0, 0xEE, 0xCB, 0xC1, 0xDF, 0xEE, 0xC9, /* 0x9C-0x9F */ + 0xEE, 0xCC, 0xEE, 0xCA, 0xEE, 0xC6, 0xC1, 0xDE, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xF1, 0x4F, 0x00, 0x00, 0xF1, 0x50, /* 0xA4-0xA7 */ + 0xF1, 0x4E, 0x00, 0x00, 0xF1, 0x52, 0xC2, 0xE5, /* 0xA8-0xAB */ + 0xC2, 0xE6, 0xF3, 0x5F, 0xC3, 0xE7, 0xF1, 0x51, /* 0xAC-0xAF */ + 0xF3, 0x5E, 0xC3, 0xE6, 0xF4, 0xE5, 0xF4, 0xE6, /* 0xB0-0xB3 */ + 0xC4, 0xBF, 0xF4, 0xE4, 0x00, 0x00, 0xF4, 0xE3, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF6, 0x5D, 0xC5, 0x48, 0x00, 0x00, /* 0xB8-0xBB */ + 0xF8, 0x49, 0xF8, 0xC8, 0xF8, 0xC7, 0x00, 0x00, /* 0xBC-0xBF */ + 0xC6, 0x43, 0xC6, 0x5D, 0xF8, 0xC9, 0xF9, 0x71, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xC6, 0x6F, 0xA8, 0xBC, 0xAA, 0xF6, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xB9, 0x56, 0x00, 0x00, 0xC4, 0xC0, /* 0xC8-0xCB */ + 0xA8, 0xBD, 0xAD, 0xAB, 0xB3, 0xA5, 0xB6, 0x71, /* 0xCC-0xCF */ + 0xC2, 0xE7, 0xAA, 0xF7, 0x00, 0x00, 0xD0, 0xC1, /* 0xD0-0xD3 */ + 0xD0, 0xC0, 0xD4, 0x42, 0x00, 0x00, 0xB0, 0x78, /* 0xD4-0xD7 */ + 0xB0, 0x76, 0xB0, 0x7A, 0xD4, 0x44, 0x00, 0x00, /* 0xD8-0xDB */ + 0xB0, 0x79, 0xB0, 0x77, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xD4, 0x43, 0xB3, 0xA8, /* 0xE0-0xE3 */ + 0xD7, 0xFC, 0x00, 0x00, 0xB3, 0xA7, 0xB3, 0xA9, /* 0xE4-0xE7 */ + 0xD8, 0x42, 0xB3, 0xAB, 0xD7, 0xFE, 0xD8, 0x40, /* 0xE8-0xEB */ + 0xD7, 0xF7, 0xB3, 0xAA, 0xD8, 0x43, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xD7, 0xF9, 0x00, 0x00, 0xD7, 0xFA, /* 0xF0-0xF3 */ + 0xD7, 0xF8, 0xB3, 0xA6, 0x00, 0x00, 0xD8, 0x41, /* 0xF4-0xF7 */ + 0xD7, 0xFB, 0xD7, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xDC, 0x6D, 0x00, 0x00, 0xDC, 0x6C, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_92[512] = { + 0xDC, 0x6A, 0xDC, 0x62, 0xDC, 0x71, 0xDC, 0x65, /* 0x00-0x03 */ + 0xDC, 
0x6F, 0xDC, 0x76, 0xDC, 0x6E, 0xB6, 0x79, /* 0x04-0x07 */ + 0x00, 0x00, 0xB6, 0x75, 0xDC, 0x63, 0x00, 0x00, /* 0x08-0x0B */ + 0xDC, 0x69, 0xB6, 0x77, 0x00, 0x00, 0xDC, 0x68, /* 0x0C-0x0F */ + 0xB6, 0x78, 0xB6, 0x7A, 0xDC, 0x6B, 0x00, 0x00, /* 0x10-0x13 */ + 0xB6, 0x72, 0xB6, 0x73, 0xDC, 0x77, 0xDC, 0x75, /* 0x14-0x17 */ + 0x00, 0x00, 0xDC, 0x74, 0xDC, 0x66, 0x00, 0x00, /* 0x18-0x1B */ + 0xDC, 0x72, 0x00, 0x00, 0xB6, 0x76, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0x74, /* 0x20-0x23 */ + 0xDC, 0x73, 0xDC, 0x64, 0xDC, 0x67, 0xDC, 0x70, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xE4, 0xBA, 0xE0, 0xB7, 0x00, 0x00, /* 0x2C-0x2F */ + 0xE0, 0xB0, 0xE0, 0xC3, 0xE0, 0xCC, 0xE0, 0xB3, /* 0x30-0x33 */ + 0xB9, 0x61, 0x00, 0x00, 0xE0, 0xC0, 0xB9, 0x57, /* 0x34-0x37 */ + 0xB9, 0x59, 0xB9, 0x65, 0xE0, 0xB1, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xB9, 0x5A, 0xB9, 0x5C, 0xB9, 0x66, /* 0x3C-0x3F */ + 0xB9, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0xB9, 0x64, 0xE0, 0xB9, 0x00, 0x00, /* 0x44-0x47 */ + 0xE0, 0xAE, 0xB9, 0x62, 0xE0, 0xB8, 0xB9, 0x5E, /* 0x48-0x4B */ + 0xE0, 0xCA, 0xB9, 0x63, 0xE0, 0xC8, 0xE0, 0xBC, /* 0x4C-0x4F */ + 0xE0, 0xC6, 0xB9, 0x60, 0xE0, 0xAF, 0xE0, 0xC9, /* 0x50-0x53 */ + 0xE0, 0xC4, 0x00, 0x00, 0xE0, 0xCB, 0xB9, 0x58, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0x67, 0xB9, 0x5D, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xE0, 0xB5, 0x00, 0x00, /* 0x5C-0x5F */ + 0xE0, 0xBD, 0xE0, 0xC1, 0x00, 0x00, 0xE0, 0xC5, /* 0x60-0x63 */ + 0xB9, 0x5F, 0xE0, 0xB4, 0xE0, 0xB2, 0xE0, 0xBE, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE0, 0xBB, 0xE0, 0xBA, 0x00, 0x00, 0xE0, 0xBF, /* 0x6C-0x6F */ + 0xE0, 0xC2, 0x00, 0x00, 0xE0, 0xC7, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0x78, 0x00, 0x00, /* 0x74-0x77 */ + 0xBB, 0xC7, 0xE4, 0xA4, 0xE4, 0x7A, 0xBB, 0xCC, /* 0x78-0x7B */ + 0xBB, 0xD0, 0xE4, 0xAD, 0xE4, 0xB5, 0xE4, 0xA6, /* 0x7C-0x7F */ + + 0xBB, 0xC8, 0x00, 0x00, 0xE4, 0xAA, 0xE0, 0xB6, /* 0x80-0x83 */ + 0x00, 0x00, 0xBB, 0xC9, 0xE4, 0xB1, 0xE4, 0xB6, /* 0x84-0x87 */ + 0xE4, 0xAE, 0x00, 0x00, 0xE4, 0xB0, 0xE4, 0xB9, /* 0x88-0x8B */ + 0xE4, 0xB2, 0xE4, 0x7E, 0xE4, 0xA9, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xBB, 0xD1, 0x00, 0x00, 0xBB, 0xCD, /* 0x90-0x93 */ + 0xE4, 0x7C, 0xE4, 0xAB, 0xBB, 0xCB, 0xE4, 0xA5, /* 0x94-0x97 */ + 0xBB, 0xCA, 0xE4, 0xB3, 0xE4, 0xA2, 0xE4, 0x79, /* 0x98-0x9B */ + 0xBB, 0xCE, 0xE4, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xE4, 0x7B, 0xE4, 0xAF, 0xE4, 0xAC, 0xE4, 0xA7, /* 0xA0-0xA3 */ + 0xE4, 0x77, 0xE4, 0x76, 0xE4, 0xA1, 0xE4, 0xB4, /* 0xA4-0xA7 */ + 0xBB, 0xCF, 0xE4, 0xB7, 0xE4, 0x7D, 0xE4, 0xA3, /* 0xA8-0xAB */ + 0xBE, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0xBE, 0x5A, 0xBE, 0x55, /* 0xB0-0xB3 */ + 0xE8, 0xA4, 0xE8, 0xA1, 0xE8, 0x67, 0xBE, 0x50, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF9, 0xD7, 0x00, 0x00, 0xBE, 0x4F, /* 0xB8-0xBB */ + 0xBE, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xE8, 0x65, 0xBE, 0x54, 0xE8, 0x71, 0xE8, 0x63, /* 0xC0-0xC3 */ + 0xE8, 0x64, 0xBE, 0x4E, 0xE8, 0xA3, 0xBE, 0x58, /* 0xC4-0xC7 */ + 0xE8, 0x74, 0xE8, 0x79, 0xE8, 0x73, 0xEB, 0xEE, /* 0xC8-0xCB */ + 0xE8, 0x6F, 0xE8, 0x77, 0xE8, 0x75, 0xE8, 0x68, /* 0xCC-0xCF */ + 0xE8, 0x62, 0xE8, 0x7D, 0xBE, 0x57, 0xE8, 0x7E, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xE8, 0x78, 0x00, 0x00, 0xE8, 0x6D, /* 0xD4-0xD7 */ + 0xE8, 0x6B, 0xE8, 0x66, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB 
*/ + 0x00, 0x00, 0xE8, 0x6E, 0xE8, 0x7B, 0xE8, 0x6A, /* 0xDC-0xDF */ + 0xE8, 0x7A, 0xE8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xBE, 0x53, 0x00, 0x00, 0xE8, 0x76, 0xE8, 0x7C, /* 0xE4-0xE7 */ + 0xE8, 0x72, 0xE8, 0x6C, 0xBE, 0x51, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xA8, 0xE8, 0x70, /* 0xEC-0xEF */ + 0xBE, 0x59, 0xE8, 0x69, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xEB, 0xF4, /* 0xF4-0xF7 */ + 0xBF, 0xF7, 0xEB, 0xF3, 0xEB, 0xF0, 0xEC, 0x44, /* 0xF8-0xFB */ + 0xBF, 0xFB, 0x00, 0x00, 0xEC, 0x41, 0xEB, 0xF8, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_93[512] = { + 0xEC, 0x43, 0xEB, 0xE9, 0xEB, 0xF6, 0x00, 0x00, /* 0x00-0x03 */ + 0xBF, 0xFD, 0x00, 0x00, 0xEB, 0xE1, 0x00, 0x00, /* 0x04-0x07 */ + 0xEB, 0xDF, 0xEC, 0x42, 0x00, 0x00, 0xEC, 0x40, /* 0x08-0x0B */ + 0xEB, 0xFE, 0xEB, 0xED, 0xEB, 0xEC, 0xEB, 0xE2, /* 0x0C-0x0F */ + 0xC0, 0x40, 0x00, 0x00, 0xEB, 0xE8, 0xEB, 0xF2, /* 0x10-0x13 */ + 0xEB, 0xFD, 0xC0, 0x43, 0xEC, 0x45, 0x00, 0x00, /* 0x14-0x17 */ + 0xC1, 0xE8, 0xC0, 0x45, 0xBF, 0xFE, 0xEB, 0xE6, /* 0x18-0x1B */ + 0x00, 0x00, 0xEB, 0xEF, 0xEB, 0xDE, 0xEB, 0xE0, /* 0x1C-0x1F */ + 0xBF, 0xF5, 0xC0, 0x42, 0xBF, 0xFA, 0xEB, 0xE7, /* 0x20-0x23 */ + 0xEB, 0xF7, 0xEB, 0xF1, 0xC0, 0x41, 0xEB, 0xDD, /* 0x24-0x27 */ + 0xC1, 0xE3, 0xEB, 0xF9, 0xEB, 0xFC, 0xBF, 0xFC, /* 0x28-0x2B */ + 0x00, 0x00, 0xEB, 0xEB, 0xC0, 0x44, 0xBF, 0xF9, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xBF, 0xF8, /* 0x30-0x33 */ + 0xEB, 0xF5, 0xEB, 0xFB, 0xBF, 0xF6, 0x00, 0x00, /* 0x34-0x37 */ + 0xEB, 0xE4, 0xEB, 0xFA, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0xEB, 0xE5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xEB, 0xEA, 0xEE, 0xD2, /* 0x44-0x47 */ + 0x00, 0x00, 0xEE, 0xD7, 0xC1, 0xE5, 0xC1, 0xE7, /* 0x48-0x4B */ + 0xEE, 0xDD, 0xC1, 0xE1, 0xEE, 0xEC, 0xEE, 0xE3, /* 0x4C-0x4F */ + 0xEE, 0xD8, 0xEE, 0xD9, 0xEE, 0xE2, 0x00, 0x00, /* 0x50-0x53 */ + 0xC1, 0xEE, 0xEE, 0xE1, 0xEE, 0xD1, 0xEE, 0xE0, /* 0x54-0x57 */ + 0xEE, 0xD4, 0xEE, 0xED, 0xC1, 0xED, 0xC1, 0xEB, /* 0x58-0x5B */ + 0xEE, 0xD5, 0x00, 0x00, 0xEE, 0xE8, 0x00, 0x00, /* 0x5C-0x5F */ + 0xEE, 0xDA, 0xEE, 0xE7, 0x00, 0x00, 0xEE, 0xE9, /* 0x60-0x63 */ + 0xEE, 0xD0, 0xC1, 0xE6, 0x00, 0x00, 0xEE, 0xEA, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xEE, 0xDE, 0x00, 0x00, /* 0x68-0x6B */ + 0xC1, 0xEA, 0xEE, 0xDB, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0xC1, 0xEC, 0xEE, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xC1, 0xE4, 0xEE, 0xD6, 0xEE, 0xE5, /* 0x74-0x77 */ + 0x00, 0x00, 0xEE, 0xDF, 0xEB, 0xE3, 0xEE, 0xE6, /* 0x78-0x7B */ + 0xEE, 0xD3, 0x00, 0x00, 0xC1, 0xE9, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEE, 0xEB, 0x00, 0x00, 0xC1, 0xE2, 0xEE, 0xCE, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xF1, 0x60, 0xF1, 0x59, 0xC2, 0xE9, 0x00, 0x00, /* 0x88-0x8B */ + 0xF1, 0x54, 0xF1, 0x63, 0xF1, 0x5B, 0xEE, 0xDC, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF1, 0x65, 0xF1, 0x55, 0x00, 0x00, /* 0x90-0x93 */ + 0xC2, 0xE8, 0xF1, 0x5F, 0xC2, 0xEA, 0xC2, 0xF2, /* 0x94-0x97 */ + 0xC2, 0xF0, 0xF1, 0x61, 0xC2, 0xF1, 0xF1, 0x57, /* 0x98-0x9B */ + 0x00, 0x00, 0xF1, 0x58, 0xF1, 0x5D, 0xF1, 0x62, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEE, 0xCD, 0xC2, 0xEB, 0xF1, 0x6A, /* 0xA0-0xA3 */ + 0xF1, 0x67, 0xF1, 0x6B, 0xF1, 0x5E, 0xF1, 0x5A, /* 0xA4-0xA7 */ + 0xF1, 0x68, 0xF3, 0x6A, 0xF1, 0x5C, 0x00, 0x00, /* 0xA8-0xAB */ + 0xC2, 0xEE, 0x00, 0x00, 0xC2, 0xED, 0xEE, 0xCF, /* 0xAC-0xAF */ + 
0xC2, 0xEF, 0xF1, 0x64, 0xF1, 0x66, 0xC2, 0xEC, /* 0xB0-0xB3 */ + 0xF1, 0x69, 0xF1, 0x53, 0x00, 0x00, 0xF1, 0x56, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0xF3, 0x73, 0x00, 0x00, 0xF3, 0x63, 0xC3, 0xEB, /* 0xC0-0xC3 */ + 0xF3, 0x71, 0x00, 0x00, 0x00, 0x00, 0xF3, 0x61, /* 0xC4-0xC7 */ + 0xC3, 0xEC, 0x00, 0x00, 0xF3, 0x6C, 0x00, 0x00, /* 0xC8-0xCB */ + 0xF3, 0x68, 0xC3, 0xF1, 0xF3, 0x72, 0xF3, 0x62, /* 0xCC-0xCF */ + 0xF3, 0x65, 0xC3, 0xE9, 0xF3, 0x74, 0x00, 0x00, /* 0xD0-0xD3 */ + 0xF3, 0x6D, 0xF3, 0x70, 0xC3, 0xEF, 0xC3, 0xF4, /* 0xD4-0xD7 */ + 0xC3, 0xF2, 0xF3, 0x69, 0xF3, 0x64, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC3, 0xED, 0xC3, 0xEE, 0xF3, 0x60, 0xC3, 0xEA, /* 0xDC-0xDF */ + 0x00, 0x00, 0xC3, 0xE8, 0xC3, 0xF0, 0xF3, 0x6F, /* 0xE0-0xE3 */ + 0xC3, 0xF3, 0x00, 0x00, 0xF3, 0x6B, 0xF3, 0x75, /* 0xE4-0xE7 */ + 0xC3, 0xF5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0xF3, 0x67, 0x00, 0x00, 0xF3, 0x6E, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xF4, 0xF3, 0xF5, 0x42, 0xF4, 0xF5, /* 0xF4-0xF7 */ + 0xF4, 0xFC, 0xF3, 0x66, 0xF4, 0xFA, 0xF4, 0xE9, /* 0xF8-0xFB */ + 0xF5, 0x40, 0xC4, 0xC3, 0xF4, 0xED, 0xF4, 0xFE, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_94[512] = { + 0xF4, 0xF4, 0x00, 0x00, 0x00, 0x00, 0xC4, 0xC2, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x44, 0xF4, 0xF6, /* 0x04-0x07 */ + 0x00, 0x00, 0xF4, 0xFB, 0xF4, 0xFD, 0xF4, 0xE7, /* 0x08-0x0B */ + 0xF5, 0x41, 0xF4, 0xF2, 0xF4, 0xF7, 0xF4, 0xEB, /* 0x0C-0x0F */ + 0xF4, 0xEF, 0xF5, 0x43, 0xF4, 0xF9, 0xF4, 0xE8, /* 0x10-0x13 */ + 0xF4, 0xEC, 0xF4, 0xEE, 0xF4, 0xF8, 0x00, 0x00, /* 0x14-0x17 */ + 0xC4, 0xC1, 0xF4, 0xF1, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF4, 0xEA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xF4, 0xF0, 0xF6, 0x61, 0xF6, 0x66, 0xC5, 0x4F, /* 0x28-0x2B */ + 0xF6, 0x68, 0x00, 0x00, 0xC5, 0x49, 0x00, 0x00, /* 0x2C-0x2F */ + 0xF6, 0x64, 0xF6, 0x6A, 0xC5, 0x4E, 0xC5, 0x4A, /* 0x30-0x33 */ + 0x00, 0x00, 0xC5, 0x4B, 0xF6, 0x60, 0xF6, 0x67, /* 0x34-0x37 */ + 0xC5, 0x4D, 0xF6, 0x65, 0xC5, 0x4C, 0xF6, 0x5F, /* 0x38-0x3B */ + 0xF6, 0x63, 0xF6, 0x62, 0x00, 0x00, 0xF6, 0x5E, /* 0x3C-0x3F */ + 0xF6, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xC5, 0xB1, 0xF7, 0x6D, 0xF7, 0x70, 0xF7, 0x6C, /* 0x44-0x47 */ + 0xF7, 0x6E, 0xF7, 0x6F, 0xF7, 0x69, 0xF7, 0x6A, /* 0x48-0x4B */ + 0xF7, 0x67, 0x00, 0x00, 0x00, 0x00, 0xF7, 0x6B, /* 0x4C-0x4F */ + 0xF7, 0x68, 0xC5, 0xB2, 0xC5, 0xB3, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0xF8, 0x4B, 0x00, 0x00, 0xF8, 0x4D, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0xF8, 0x4C, 0xF8, 0x4E, 0x00, 0x00, /* 0x5C-0x5F */ + 0xC5, 0xE0, 0x00, 0x00, 0xF8, 0x4A, 0xC5, 0xDF, /* 0x60-0x63 */ + 0xC5, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0xF8, 0xCB, 0xF8, 0xCC, 0xC6, 0x44, 0xF8, 0xCA, /* 0x68-0x6B */ + 0x00, 0x00, 0xF9, 0x53, 0xF9, 0x52, 0xF9, 0x54, /* 0x6C-0x6F */ + 0xC6, 0x5F, 0xF9, 0x55, 0xC6, 0x5E, 0xF9, 0x56, /* 0x70-0x73 */ + 0xF9, 0x72, 0xF9, 0x75, 0xF9, 0x74, 0xC6, 0x68, /* 0x74-0x77 */ + 0xF9, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xC6, 0x72, 0xC6, 0x70, 0xC6, 0x71, 0xC6, 0x77, /* 0x7C-0x7F */ + + 0xF9, 0xC0, 0xF9, 0xC1, 0xF9, 0xBF, 0xF9, 0xC9, /* 0x80-0x83 */ +}; + 
+static const unsigned char u2c_95[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAA, 0xF8, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0x44, 0xDC, 0x78, /* 0x78-0x7B */ + 0xE8, 0xA5, 0xF3, 0x76, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xAA, 0xF9, 0x00, 0x00, 0xAD, 0xAC, 0xB0, 0x7B, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xD8, 0x45, 0x00, 0x00, /* 0x84-0x87 */ + 0xD8, 0x46, 0xB3, 0xAC, 0x00, 0x00, 0xB6, 0x7D, /* 0x88-0x8B */ + 0xDC, 0x7A, 0xDC, 0x79, 0xB6, 0xA3, 0xB6, 0x7C, /* 0x8C-0x8F */ + 0xDC, 0x7B, 0xB6, 0x7E, 0xB6, 0xA2, 0xB6, 0xA1, /* 0x90-0x93 */ + 0xB6, 0x7B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xB9, 0x68, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xD0, /* 0x98-0x9B */ + 0xE0, 0xCE, 0x00, 0x00, 0xE0, 0xCF, 0xE0, 0xCD, /* 0x9C-0x9F */ + 0x00, 0x00, 0xBB, 0xD2, 0x00, 0x00, 0xBB, 0xD5, /* 0xA0-0xA3 */ + 0xBB, 0xD7, 0xBB, 0xD6, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xBB, 0xD3, 0xBB, 0xD4, 0x00, 0x00, 0xE8, 0xA7, /* 0xA8-0xAB */ + 0xE8, 0xA6, 0xBE, 0x5B, 0xE8, 0xA8, 0x00, 0x00, /* 0xAC-0xAF */ + 0xE8, 0xA9, 0xBE, 0x5C, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xEC, 0x4D, 0xEC, 0x4B, 0xEE, 0xF3, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xEC, 0x49, 0xEC, 0x4A, 0xC0, 0x46, /* 0xB8-0xBB */ + 0xEC, 0x46, 0xEC, 0x4E, 0xEC, 0x48, 0xEC, 0x4C, /* 0xBC-0xBF */ + 0xEE, 0xEF, 0x00, 0x00, 0x00, 0x00, 0xEE, 0xF1, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xEE, 0xF2, 0xC1, 0xF3, 0xEE, 0xEE, /* 0xC4-0xC7 */ + 0xC1, 0xF2, 0xEE, 0xF0, 0xC1, 0xEF, 0xC1, 0xF0, /* 0xC8-0xCB */ + 0xC1, 0xF1, 0xEC, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0xC2, 0xF5, 0xF1, 0x6E, 0xF1, 0x6C, 0xF1, 0x6D, /* 0xD0-0xD3 */ + 0xC2, 
0xF3, 0xC2, 0xF6, 0xC2, 0xF4, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0x77, 0xF3, 0x78, /* 0xD8-0xDB */ + 0xC3, 0xF6, 0x00, 0x00, 0xF5, 0x45, 0xF5, 0x47, /* 0xDC-0xDF */ + 0xF5, 0x46, 0xC4, 0xC4, 0xC5, 0x50, 0xF6, 0x6D, /* 0xE0-0xE3 */ + 0xF6, 0x6C, 0xF6, 0x6B, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char u2c_96[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xAA, 0xFA, 0x00, 0x00, 0xC9, 0xAA, 0x00, 0x00, /* 0x1C-0x1F */ + 0xCA, 0x58, 0xA6, 0xE9, 0xCA, 0x56, 0xCA, 0x59, /* 0x20-0x23 */ + 0xCA, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xCB, 0xAE, 0x00, 0x00, 0xA8, 0xC1, 0x00, 0x00, /* 0x28-0x2B */ + 0xA8, 0xC2, 0xCB, 0xB0, 0xA8, 0xBF, 0xCB, 0xAF, /* 0x2C-0x2F */ + 0xCB, 0xAD, 0xA8, 0xC0, 0xA8, 0xBE, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0xCD, 0xD8, 0xCD, 0xDB, 0xAA, 0xFD, /* 0x38-0x3B */ + 0xCD, 0xDA, 0xCD, 0xD9, 0x00, 0x00, 0xAA, 0xFC, /* 0x3C-0x3F */ + 0xAA, 0xFB, 0x00, 0x00, 0xAB, 0x40, 0xCD, 0xDC, /* 0x40-0x43 */ + 0xAA, 0xFE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xD0, 0xC6, 0xAD, 0xAE, /* 0x48-0x4B */ + 0xAD, 0xAF, 0xAD, 0xB0, 0xD0, 0xC7, 0xD0, 0xC3, /* 0x4C-0x4F */ + 0xAD, 0xAD, 0xD0, 0xC4, 0x00, 0x00, 0xD0, 0xC5, /* 0x50-0x53 */ + 0xD0, 0xC2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0xB0, 0xA4, 0x00, 0x00, 0x00, 0x00, 0xB0, 0xA1, /* 0x58-0x5B */ + 0xD4, 0x45, 0xB0, 0xA2, 0xB0, 0xA5, 0xD4, 0x46, /* 0x5C-0x5F */ + 0x00, 0x00, 0xB0, 0x7E, 0xB0, 0x7C, 0xB0, 0x7D, /* 0x60-0x63 */ + 0xB0, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xB3, 0xAD, 0xD8, 0x49, /* 0x68-0x6B */ + 0xB3, 0xB5, 0xD8, 0x48, 0x00, 0x00, 0xD8, 0x4B, /* 0x6C-0x6F */ + 0xB3, 0xB1, 0xD8, 0x4A, 0xB6, 0xAB, 0xB3, 0xAF, /* 0x70-0x73 */ + 0xB3, 0xB2, 0xB3, 0xAE, 0xB3, 0xB3, 0xB3, 0xB4, /* 0x74-0x77 */ + 0xB3, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0xD8, 0x47, 0xB6, 0xA7, 0xDC, 0x7D, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xDC, 0xA3, 0x00, 0x00, 0x00, 0x00, 0xDC, 0xA2, /* 0x80-0x83 */ + 0xB6, 0xAC, 0xB6, 0xA8, 0xB6, 0xA9, 0xDC, 0x7C, /* 0x84-0x87 */ + 0xDC, 0x7E, 0xDC, 0xA1, 0xB6, 0xA4, 0xB6, 0xA6, /* 0x88-0x8B */ + 0x00, 0x00, 0xB6, 0xAA, 0xB6, 0xA5, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xE0, 0xD3, 0xE0, 0xD1, 0xE0, 0xD2, /* 0x90-0x93 */ + 0xB9, 0x6A, 0xB9, 0x6B, 0x00, 0x00, 0xE0, 0xD4, /* 0x94-0x97 */ + 0xB9, 0x69, 0xBB, 0xD8, 0x00, 0x00, 0xBB, 0xDA, /* 0x98-0x9B */ + 0xBB, 0xD9, 0x00, 0x00, 0xE4, 0xBB, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xE4, 0xBC, 0xE8, 0xAB, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xE8, 0xAA, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x47, /* 0xA4-0xA7 */ + 0xC0, 0x48, 0xEC, 0x4F, 0xC0, 0x49, 0x00, 0x00, /* 0xA8-0xAB */ + 0xEE, 0xF6, 0x00, 0x00, 0xEE, 0xF4, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEE, 0xF5, 0xC1, 0xF4, 0x00, 0x00, 0xF1, 0x6F, /* 0xB0-0xB3 */ + 0xC3, 0xF7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xC1, 0xF5, 0xAB, 0x41, 0x00, 0x00, 0xB0, 0xA6, /* 0xB8-0xBB */ + 0xD4, 0x47, 0x00, 0x00, 0x00, 0x00, 0xD8, 0x4C, /* 0xBC-0xBF */ + 0xB3, 
0xB6, 0xB6, 0xAD, 0xDC, 0xA4, 0xDC, 0xA6, /* 0xC0-0xC3 */ + 0xB6, 0xAF, 0xB6, 0xAE, 0xB6, 0xB0, 0xB6, 0xB1, /* 0xC4-0xC7 */ + 0xDC, 0xA5, 0xB9, 0x6E, 0xB9, 0x6F, 0xB9, 0x6D, /* 0xC8-0xCB */ + 0xBB, 0xDB, 0xB9, 0x6C, 0xE0, 0xD5, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0xDC, 0xE8, 0xAC, /* 0xD0-0xD3 */ + 0xEC, 0x50, 0xC0, 0x4A, 0xC1, 0xF6, 0xF1, 0x70, /* 0xD4-0xD7 */ + 0xF1, 0x74, 0xC2, 0xF9, 0xF1, 0x71, 0xC2, 0xFA, /* 0xD8-0xDB */ + 0xC2, 0xF8, 0xF1, 0x75, 0xC2, 0xFB, 0xF1, 0x73, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF3, 0x79, 0xC2, 0xF7, 0xC3, 0xF8, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF8, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xAB, 0x42, 0xB3, 0xB8, 0xB3, 0xB7, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB6, 0xB2, /* 0xEC-0xEF */ + 0xDC, 0xA8, 0xDC, 0xA7, 0xB6, 0xB3, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0xE0, 0xD9, 0xB9, 0x73, 0xB9, 0x70, /* 0xF4-0xF7 */ + 0xE0, 0xD8, 0xB9, 0x72, 0xE0, 0xD6, 0xB9, 0x71, /* 0xF8-0xFB */ + 0x00, 0x00, 0xE0, 0xD7, 0x00, 0x00, 0xE4, 0xBD, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_97[512] = { + 0xBB, 0xDD, 0x00, 0x00, 0xE8, 0xAF, 0x00, 0x00, /* 0x00-0x03 */ + 0xBE, 0x5D, 0xE8, 0xAD, 0xBE, 0x5E, 0xBE, 0x5F, /* 0x04-0x07 */ + 0xE8, 0xAE, 0xBE, 0x60, 0x00, 0x00, 0xEC, 0x51, /* 0x08-0x0B */ + 0x00, 0x00, 0xC0, 0x4E, 0xC0, 0x4B, 0xC0, 0x50, /* 0x0C-0x0F */ + 0xEC, 0x53, 0xC0, 0x4C, 0xEC, 0x52, 0xC0, 0x4F, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x4D, 0x00, 0x00, /* 0x14-0x17 */ + 0xEE, 0xF9, 0xEE, 0xFB, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xC1, 0xF7, 0xEE, 0xFA, 0xC1, 0xF8, 0xEE, 0xF8, /* 0x1C-0x1F */ + 0xEE, 0xF7, 0x00, 0x00, 0xF1, 0x77, 0xF1, 0x76, /* 0x20-0x23 */ + 0xC2, 0xFC, 0xF1, 0x78, 0xF3, 0x7E, 0xC3, 0xFA, /* 0x24-0x27 */ + 0xF3, 0x7D, 0xF3, 0x7A, 0xC3, 0xF9, 0xF3, 0x7B, /* 0x28-0x2B */ + 0xF3, 0x7C, 0x00, 0x00, 0xF5, 0x48, 0xF5, 0x49, /* 0x2C-0x2F */ + 0xC4, 0xC5, 0x00, 0x00, 0xC5, 0x53, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xF6, 0x6E, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0xC5, 0x51, 0xC5, 0x52, 0xF6, 0x6F, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xC5, 0xB4, 0xC5, 0xB5, 0xF7, 0x71, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0xC6, 0x45, 0xF8, 0xCF, /* 0x40-0x43 */ + 0xC6, 0x47, 0x00, 0x00, 0xF8, 0xCE, 0xF8, 0xD0, /* 0x44-0x47 */ + 0xC6, 0x46, 0xF9, 0x57, 0x00, 0x00, 0xF9, 0xAD, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xAB, 0x43, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0xB9, 0x74, 0x00, 0x00, /* 0x54-0x57 */ + 0xE4, 0xBE, 0x00, 0x00, 0xE8, 0xB0, 0xC0, 0x51, /* 0x58-0x5B */ + 0xC0, 0x52, 0x00, 0x00, 0xAB, 0x44, 0x00, 0x00, /* 0x5C-0x5F */ + 0xBE, 0x61, 0xC3, 0xFB, 0xAD, 0xB1, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x53, 0x00, 0x00, /* 0x64-0x67 */ + 0xC5, 0xE2, 0xAD, 0xB2, 0xD8, 0x4D, 0x00, 0x00, /* 0x68-0x6B */ + 0xDC, 0xA9, 0x00, 0x00, 0xDC, 0xAB, 0x00, 0x00, /* 0x6C-0x6F */ + 0xDC, 0xAA, 0x00, 0x00, 0xE0, 0xDD, 0xE0, 0xDA, /* 0x70-0x73 */ + 0xB9, 0x75, 0x00, 0x00, 0xB9, 0x76, 0xE0, 0xDB, /* 0x74-0x77 */ + 0xE0, 0xDC, 0x00, 0x00, 0xE4, 0xC0, 0xE4, 0xC5, /* 0x78-0x7B */ + 0xBB, 0xDE, 0xE4, 0xBF, 0xE4, 0xC1, 0xE4, 0xC8, /* 0x7C-0x7F */ + + 0xE4, 0xC3, 0xE4, 0xC7, 0xE4, 0xC4, 0xE4, 0xC2, /* 0x80-0x83 */ + 0xE4, 0xC6, 0xBB, 0xDF, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0xE8, 0xB3, 0x00, 0x00, 0xE8, 0xB1, 0xBE, 0x63, /* 0x88-0x8B */ + 0x00, 0x00, 0xBE, 0x62, 0xE8, 0xB2, 0xBE, 0x64, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0xEC, 
0x56, 0x00, 0x00, 0x00, 0x00, 0xEC, 0x55, /* 0x94-0x97 */ + 0xC0, 0x54, 0xEC, 0x54, 0xEE, 0xFC, 0x00, 0x00, /* 0x98-0x9B */ + 0xEE, 0xFE, 0xEF, 0x41, 0xEF, 0x40, 0x00, 0x00, /* 0x9C-0x9F */ + 0xC1, 0xF9, 0xEE, 0xFD, 0xF1, 0xA1, 0xC2, 0xFD, /* 0xA0-0xA3 */ + 0xF1, 0x7D, 0xF1, 0xA2, 0xC2, 0xFE, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xF1, 0x7B, 0x00, 0x00, 0xF1, 0x7E, 0xF1, 0x7C, /* 0xA8-0xAB */ + 0xF1, 0x79, 0xC3, 0x40, 0xF1, 0x7A, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA1, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xF3, 0xA3, 0xF3, 0xA2, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xF5, 0x4A, 0x00, 0x00, 0xF5, 0x4B, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF6, 0x70, /* 0xBC-0xBF */ + 0x00, 0x00, 0xC5, 0xB7, 0x00, 0x00, 0xC5, 0xB6, /* 0xC0-0xC3 */ + 0xF8, 0x4F, 0xF8, 0x50, 0xC6, 0x48, 0xF8, 0xD1, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xC6, 0x69, 0x00, 0x00, 0xAD, 0xB3, /* 0xC8-0xCB */ + 0xB6, 0xB4, 0xE4, 0xCA, 0xE4, 0xC9, 0xE8, 0xB5, /* 0xCC-0xCF */ + 0xE8, 0xB4, 0x00, 0x00, 0x00, 0x00, 0xC1, 0xFA, /* 0xD0-0xD3 */ + 0xEF, 0x43, 0xEF, 0x42, 0xF1, 0xA5, 0xF1, 0xA3, /* 0xD4-0xD7 */ + 0xF1, 0xA6, 0xF1, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xC3, 0xFC, 0xF3, 0xA4, 0xF3, 0xA5, 0xF3, 0xA6, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF6, 0x71, 0x00, 0x00, 0xF7, 0x72, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xF8, 0xD2, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xAD, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEC, 0x57, 0xEF, 0x44, 0x00, 0x00, 0xAD, 0xB5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0xE0, 0x00, 0x00, /* 0xF4-0xF7 */ + 0xEC, 0x58, 0xC3, 0x41, 0xF1, 0xA7, 0xC3, 0xFD, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF5, 0x4C, 0xF5, 0x4D, 0xC5, 0x54, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_98[512] = { + 0xF8, 0x51, 0xAD, 0xB6, 0xB3, 0xBB, 0xB3, 0xBC, /* 0x00-0x03 */ + 0xD8, 0x4E, 0xB6, 0xB5, 0xB6, 0xB6, 0xDC, 0xAC, /* 0x04-0x07 */ + 0xB6, 0xB7, 0x00, 0x00, 0xB9, 0x7A, 0x00, 0x00, /* 0x08-0x0B */ + 0xB9, 0x7C, 0xE0, 0xDF, 0xE0, 0xE0, 0xE0, 0xDE, /* 0x0C-0x0F */ + 0xB9, 0x77, 0xB9, 0x78, 0xB9, 0x7B, 0xB9, 0x79, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0xE4, 0xCB, 0xBB, 0xE1, /* 0x14-0x17 */ + 0xBB, 0xE2, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xBC, /* 0x18-0x1B */ + 0xBE, 0x67, 0xE8, 0xB7, 0xE8, 0xB6, 0x00, 0x00, /* 0x1C-0x1F */ + 0xE8, 0xBB, 0xBE, 0x65, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xC0, 0x5B, 0x00, 0x00, 0xE8, 0xB8, 0xE8, 0xBD, /* 0x24-0x27 */ + 0xE8, 0xBA, 0xE8, 0xB9, 0x00, 0x00, 0xBE, 0x66, /* 0x28-0x2B */ + 0x00, 0x00, 0xC0, 0x59, 0x00, 0x00, 0xEC, 0x5A, /* 0x2C-0x2F */ + 0xC0, 0x55, 0x00, 0x00, 0xEC, 0x5B, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0xEC, 0x59, 0x00, 0x00, 0xC0, 0x58, /* 0x34-0x37 */ + 0xC0, 0x56, 0xC0, 0x5A, 0x00, 0x00, 0xC0, 0x57, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0xEF, 0x45, 0x00, 0x00, 0xEF, 0x4A, /* 0x40-0x43 */ + 0xEF, 0x46, 0xEF, 0x49, 0xC1, 0xFB, 0x00, 0x00, /* 0x44-0x47 */ + 0xED, 0xD4, 0xEF, 0x48, 0xEF, 0x47, 0x00, 0x00, /* 0x48-0x4B */ + 0xC3, 0x44, 0xC3, 0x42, 0xC3, 0x45, 0xC3, 0x43, /* 0x4C-0x4F */ + 0xF1, 0xA8, 0xF1, 0xA9, 0xF1, 0xAA, 0xC3, 0x46, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xAA, /* 0x54-0x57 */ + 0xC4, 0x40, 0xF3, 0xA8, 0x00, 0x00, 0xC4, 0x41, /* 0x58-0x5B */ + 0xF3, 0xA7, 0xF3, 0xA9, 0xC3, 0xFE, 0xF5, 0x51, /* 0x5C-0x5F */ + 0xF5, 0x4E, 0x00, 0x00, 0xF5, 0x4F, 0xF5, 0x50, /* 0x60-0x63 */ + 0xF6, 0x72, 0xC5, 0x56, 0x00, 0x00, 0xC5, 0x55, /* 0x64-0x67 */ + 0x00, 0x00, 
0xF7, 0x74, 0xF7, 0x73, 0xC5, 0xB8, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC5, 0xE3, /* 0x6C-0x6F */ + 0xC6, 0x49, 0xC6, 0x60, 0xF9, 0x58, 0xF9, 0xAE, /* 0x70-0x73 */ + 0xF9, 0xAF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xAD, 0xB7, 0xDC, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0xE0, 0xE1, 0xE4, 0xCC, 0xE4, 0xCD, 0xBB, 0xE3, /* 0xAC-0xAF */ + 0x00, 0x00, 0xBB, 0xE4, 0xE8, 0xBE, 0xBE, 0x68, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0xC1, 0xFC, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF1, 0xAB, 0x00, 0x00, 0xC3, 0x47, 0xF3, 0xAD, /* 0xB8-0xBB */ + 0xC4, 0x42, 0xF3, 0xAC, 0xF3, 0xAE, 0xF3, 0xAB, /* 0xBC-0xBF */ + 0xF6, 0x75, 0xF5, 0x52, 0xF5, 0x53, 0x00, 0x00, /* 0xC0-0xC3 */ + 0xC4, 0xC6, 0x00, 0x00, 0xF6, 0x74, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xF6, 0x73, 0x00, 0x00, 0xF7, 0x75, /* 0xC8-0xCB */ + 0xF9, 0xB0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xB8, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAD, 0xB9, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xB0, 0xA7, 0xD4, 0x48, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xD8, 0x4F, 0x00, 0x00, 0xB6, 0xB8, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xB6, 0xBB, 0xB6, 0xB9, 0xDC, 0xAE, /* 0xE8-0xEB */ + 0x00, 0x00, 0xB6, 0xBD, 0x00, 0x00, 0xB6, 0xBA, /* 0xEC-0xEF */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xBC, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xB9, 0x7E, 0x00, 0x00, 0xE0, 0xE2, 0x00, 0x00, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE0, 0xE3, 0xE8, 0xC0, 0x00, 0x00, /* 0xF8-0xFB */ + 0xB9, 0x7D, 0xB9, 0xA1, 0xB9, 0xA2, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_99[512] = { + 0xE4, 0xCF, 0x00, 0x00, 0xE4, 0xCE, 0xBB, 0xE5, /* 0x00-0x03 */ + 0x00, 0x00, 0xBB, 0xE6, 0x00, 0x00, 0xE4, 0xD0, /* 0x04-0x07 */ + 0xE8, 0xBF, 0xBB, 0xE8, 0xBE, 0x69, 0x00, 0x00, /* 0x08-0x0B */ + 0xBB, 0xE7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xC0, 0x5C, 0xE8, 0xC1, 0xBE, 0x6B, 0xBE, 0x6A, /* 0x10-0x13 */ + 0xE8, 0xC2, 0xE8, 0xC5, 0xE8, 0xC3, 0xE8, 0xC4, /* 0x14-0x17 */ + 0xBE, 0x6C, 0x00, 0x00, 0xC0, 0x61, 0xC0, 0x5F, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x5E, 0xEC, 0x5D, /* 0x1C-0x1F */ + 0x00, 0x00, 0xC0, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0xEC, 0x5C, 0xEF, 0x4B, 0x00, 0x00, 0xEC, 0x5E, /* 0x24-0x27 */ + 0xC0, 0x5D, 0xEC, 0x5F, 0xEF, 0x4E, 0xEF, 0x4C, /* 0x28-0x2B */ + 0xEF, 0x4D, 0xEF, 0x52, 0xC3, 0x4B, 0xEF, 0x51, /* 0x2C-0x2F */ + 0xEF, 0x54, 0xEF, 0x53, 0xEF, 0x50, 0xEF, 0x4F, /* 0x30-0x33 */ + 0x00, 0x00, 0xC1, 0xFD, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xAE, 0x00, 0x00, /* 0x38-0x3B */ + 0xF1, 0xAD, 
0xC3, 0x4A, 0xC3, 0x48, 0xC3, 0x49, /* 0x3C-0x3F */ + 0x00, 0x00, 0xF1, 0xAC, 0x00, 0x00, 0xF3, 0xB1, /* 0x40-0x43 */ + 0x00, 0x00, 0xC4, 0x43, 0x00, 0x00, 0xF3, 0xB0, /* 0x44-0x47 */ + 0xF3, 0xAF, 0xC4, 0x44, 0x00, 0x00, 0xF5, 0x58, /* 0x48-0x4B */ + 0xF5, 0x57, 0x00, 0x00, 0xF5, 0x55, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF5, 0x54, 0xC4, 0xC8, 0xC4, 0xC7, 0xF5, 0x59, /* 0x50-0x53 */ + 0xF7, 0x76, 0xC5, 0xB9, 0xF6, 0x77, 0xC5, 0x57, /* 0x54-0x57 */ + 0xF6, 0x76, 0xF5, 0x56, 0x00, 0x00, 0xF7, 0x77, /* 0x58-0x5B */ + 0xC5, 0xE4, 0x00, 0x00, 0xC6, 0x61, 0xF9, 0x59, /* 0x5C-0x5F */ + 0x00, 0x00, 0xF9, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0xAD, 0xBA, 0xD8, 0x50, /* 0x94-0x97 */ + 0xEF, 0x55, 0xAD, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xE4, 0xD2, 0xE4, 0xD1, 0xEC, 0x60, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0xEF, 0x57, 0x00, 0x00, 0xEF, 0x56, /* 0xA0-0xA3 */ + 0x00, 0x00, 0xC3, 0x4C, 0xF3, 0xB2, 0xF3, 0xB3, /* 0xA4-0xA7 */ + 0xC4, 0xC9, 0x00, 0x00, 0x00, 0x00, 0xF9, 0xB2, /* 0xA8-0xAB */ + 0xB0, 0xA8, 0xB6, 0xBF, 0xB6, 0xBE, 0xE0, 0xE4, /* 0xAC-0xAF */ + 0xE0, 0xE6, 0xB9, 0xA4, 0xE0, 0xE5, 0xB9, 0xA3, /* 0xB0-0xB3 */ + 0xB9, 0xA5, 0xE0, 0xE7, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0xE4, 0xD4, 0xE4, 0xD6, 0xE4, 0xD5, /* 0xB8-0xBB */ + 0x00, 0x00, 0xE4, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0xBB, 0xE9, 0xE4, 0xD7, 0xE4, 0xD3, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xD9, /* 0xC4-0xC7 */ + 0x00, 0x00, 0xE8, 0xCC, 0x00, 0x00, 0xE8, 0xCF, /* 0xC8-0xCB */ + 0xE8, 0xD1, 0xE8, 0xC7, 0xE8, 0xCB, 0xE8, 0xC8, /* 0xCC-0xCF */ + 0xBE, 0x6E, 0xBE, 0x71, 0xBE, 0x73, 0xE8, 0xC9, /* 0xD0-0xD3 */ + 0xE8, 0xCA, 0xBE, 0x72, 0xE8, 0xCD, 0xE8, 0xD0, /* 0xD4-0xD7 */ + 0xE8, 0xCE, 0xBE, 0x74, 0x00, 0x00, 0xBE, 0x70, /* 0xD8-0xDB */ + 0xE8, 0xC6, 0xBE, 0x6D, 0x00, 0x00, 0xBE, 0x6F, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0xC0, 0x63, 0xEC, 0x66, /* 0xE0-0xE3 */ + 0xEC, 0x64, 0xEC, 0x63, 0x00, 0x00, 0xEC, 0x69, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xEC, 0x68, 0xEC, 0x67, 0x00, 0x00, /* 0xE8-0xEB */ + 0xEC, 0x62, 0xC0, 0x62, 0xEC, 0x61, 0x00, 0x00, /* 0xEC-0xEF */ + 0xEC, 0x65, 0xC0, 0x64, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0xEF, 0x5A, 0x00, 0x00, 0xEF, 0x5E, 0xEF, 0x5B, /* 0xF4-0xF7 */ + 0xEF, 0x5D, 0xEF, 0x5C, 0xEF, 0x59, 0xEF, 0x5F, /* 0xF8-0xFB */ + 0xEF, 0x62, 0xEF, 0x60, 0xEF, 0x61, 0xC2, 0x40, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9A[512] = { + 0x00, 0x00, 0xC1, 0xFE, 0xEF, 0x58, 0xEF, 0x63, /* 0x00-0x03 */ + 0xF1, 0xB3, 0xF1, 0xB6, 0xF1, 0xB8, 0xF1, 0xB7, /* 0x04-0x07 */ + 0x00, 0x00, 0xF1, 0xB1, 0xF1, 0xB5, 0xF1, 0xB0, /* 0x08-0x0B */ + 0x00, 0x00, 0xF1, 0xB2, 0xC3, 0x4D, 0xF1, 0xAF, /* 0x0C-0x0F */ + 0x00, 0x00, 
0xF1, 0xB4, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0xF3, 0xC0, 0xF3, 0xB5, 0xC4, 0x45, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0xC4, 0x46, 0xF3, 0xB4, 0xF3, 0xB9, /* 0x18-0x1B */ + 0xF3, 0xBF, 0xF3, 0xB7, 0xF3, 0xBE, 0x00, 0x00, /* 0x1C-0x1F */ + 0xF3, 0xBB, 0x00, 0x00, 0xF3, 0xBA, 0xF3, 0xBD, /* 0x20-0x23 */ + 0xF3, 0xB8, 0xF3, 0xB6, 0x00, 0x00, 0xF3, 0xBC, /* 0x24-0x27 */ + 0x00, 0x00, 0xF5, 0x60, 0xF5, 0x5E, 0xC4, 0xCA, /* 0x28-0x2B */ + 0xF5, 0x5D, 0xF5, 0x63, 0xF5, 0x61, 0x00, 0x00, /* 0x2C-0x2F */ + 0xC4, 0xCB, 0xF5, 0x5C, 0xF5, 0x5A, 0x00, 0x00, /* 0x30-0x33 */ + 0xF5, 0x5B, 0xC4, 0xCD, 0xF5, 0x5F, 0xC4, 0xCC, /* 0x34-0x37 */ + 0xF5, 0x62, 0xF6, 0x78, 0xF6, 0x7E, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0xF6, 0x79, 0xC5, 0x5B, 0xF6, 0xA1, /* 0x3C-0x3F */ + 0xC5, 0x5A, 0xF6, 0x7D, 0xF6, 0x7C, 0xC5, 0x59, /* 0x40-0x43 */ + 0xF6, 0x7B, 0xC5, 0x58, 0xF6, 0x7A, 0x00, 0x00, /* 0x44-0x47 */ + 0xF7, 0x7D, 0xF7, 0xA1, 0xF7, 0x7E, 0x00, 0x00, /* 0x48-0x4B */ + 0xF7, 0x7B, 0xC5, 0xBB, 0xF7, 0x78, 0xF7, 0x7C, /* 0x4C-0x4F */ + 0xF7, 0xA3, 0x00, 0x00, 0xF7, 0xA2, 0xF7, 0x79, /* 0x50-0x53 */ + 0xF7, 0x7A, 0xC5, 0xBA, 0xF8, 0x52, 0xC5, 0xE7, /* 0x54-0x57 */ + 0x00, 0x00, 0xF8, 0x53, 0xC5, 0xE5, 0xC5, 0xE6, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD3, 0xC6, 0x4A, /* 0x5C-0x5F */ + 0xF9, 0x76, 0x00, 0x00, 0xC6, 0x6A, 0x00, 0x00, /* 0x60-0x63 */ + 0xF9, 0xB3, 0xC6, 0x6B, 0xF9, 0xB4, 0xF9, 0xB5, /* 0x64-0x67 */ + 0xF9, 0xC3, 0xF9, 0xC2, 0xC6, 0x7A, 0xF9, 0xCD, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xB0, 0xA9, 0x00, 0x00, 0x00, 0x00, 0xE0, 0xE9, /* 0xA8-0xAB */ + 0x00, 0x00, 0xE0, 0xE8, 0x00, 0x00, 0xBB, 0xEA, /* 0xAC-0xAF */ + 0xBB, 0xEB, 0xE4, 0xDA, 0x00, 0x00, 0xE8, 0xD2, /* 0xB0-0xB3 */ + 0xEC, 0x6C, 0x00, 0x00, 0x00, 0x00, 0xBE, 0x75, /* 0xB4-0xB7 */ + 0xC0, 0x65, 0xEC, 0x6A, 0x00, 0x00, 0xEC, 0x6D, /* 0xB8-0xBB */ + 0xC0, 0x66, 0x00, 0x00, 0xEF, 0x64, 0xEC, 0x6B, /* 0xBC-0xBF */ + 0xF1, 0xB9, 0xC3, 0x4E, 0xF3, 0xC1, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x66, 0xF5, 0x64, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x65, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0xF6, 0xA2, 0x00, 0x00, 0xC5, 0x5C, /* 0xCC-0xCF */ + 0xF7, 0xA4, 0xC5, 0xEA, 0xC5, 0xBC, 0xC5, 0xE8, /* 0xD0-0xD3 */ + 0xC5, 0xE9, 0xF8, 0xD4, 0xC6, 0x62, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xB0, 0xAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0xF1, 0xBA, 0x00, 0x00, 0x00, 0x00, 0xD4, 0x49, /* 0xDC-0xDF */ + 0x00, 0x00, 0xB9, 0xA6, 0x00, 0x00, 0xE4, 0xDB, /* 0xE0-0xE3 */ + 0x00, 0x00, 0x00, 0x00, 0xBB, 0xEC, 0xE4, 0xDC, /* 0xE4-0xE7 */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0xD4, /* 0xE8-0xEB */ + 0xE8, 0xD3, 0xC0, 0x68, 0xBE, 0x76, 0xBE, 0x77, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE8, 0xD7, 0xE8, 0xD6, 0xE8, 0xD5, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0x6E, 0xEC, 0x71, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xEC, 0x70, 0xEC, 0x6F, 0xC0, 0x67, /* 0xF8-0xFB */ + 0xEF, 0x68, 0xEF, 0x66, 0xEF, 0x65, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9B[512] = { + 0x00, 0x00, 0xEF, 0x67, 0x00, 0x00, 0xC3, 0x4F, /* 0x00-0x03 */ + 0xF1, 0xBC, 0xF1, 0xBD, 0xC3, 0x50, 0x00, 0x00, /* 0x04-0x07 */ + 0xF1, 0xBB, 0x00, 0x00, 0xF3, 0xC3, 0xF3, 0xC2, /* 0x08-0x0B */ + 0xF3, 0xC5, 0xC4, 0x47, 0xF3, 0xC4, 0x00, 0x00, /* 0x0C-0x0F */ + 0xF5, 0x67, 0xF5, 0x69, 0xF5, 0x68, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xF6, 0xA3, 0xF6, 0xA6, 0xF6, 0xA4, /* 0x14-0x17 */ + 0xF6, 0xA5, 0xF7, 0xA5, 0xC5, 0xBD, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0x54, 0xF8, 0x55, /* 0x1C-0x1F */ + 0xF8, 0x56, 0x00, 0x00, 0xC6, 0x4B, 0xC6, 0x63, /* 0x20-0x23 */ + 0xF9, 0xB6, 0xB0, 0xAB, 0x00, 0x00, 0xBE, 0x78, /* 0x24-0x27 */ + 0xC0, 0x69, 0xF1, 0xBE, 0x00, 0x00, 0xF7, 0xA6, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xC4, 0xD4, 0x4A, /* 0x2C-0x2F */ + 0x00, 0x00, 0xC6, 0x7B, 0xB0, 0xAC, 0xEC, 0x72, /* 0x30-0x33 */ + 0x00, 0x00, 0xF1, 0xBF, 0x00, 0x00, 0xF3, 0xC6, /* 0x34-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0xF6, 0xA7, 0xF7, 0xA7, /* 0x38-0x3B */ + 0xB0, 0xAD, 0x00, 0x00, 0xE4, 0xDD, 0xE4, 0xDE, /* 0x3C-0x3F */ + 0x00, 0x00, 0xBB, 0xED, 0xBB, 0xEE, 0xE8, 0xD9, /* 0x40-0x43 */ + 0xBE, 0x7A, 0xBE, 0x79, 0xE8, 0xD8, 0x00, 0x00, /* 0x44-0x47 */ + 0xEF, 0x69, 0x00, 0x00, 0xF1, 0xC0, 0xF1, 0xC2, /* 0x48-0x4B */ + 0xF1, 0xC1, 0xC3, 0x53, 0xC3, 0x52, 0xC3, 0x51, /* 0x4C-0x4F */ + 0x00, 0x00, 0xC5, 0x5E, 0xF6, 0xA8, 0x00, 0x00, /* 0x50-0x53 */ + 0xC5, 0x5D, 0xF7, 0xA9, 0xF7, 0xA8, 0x00, 0x00, /* 0x54-0x57 */ + 0xC6, 0x4C, 0xF8, 0xD5, 0xB3, 0xBD, 0xE0, 0xEA, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0xE1, /* 0x5C-0x5F */ + 0xE4, 0xDF, 0xE4, 0xE0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xE8, 0xE2, 0x00, 0x00, 0xE8, 0xDD, 0xE8, 0xDA, /* 0x64-0x67 */ + 0xE8, 0xE1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0xE8, 0xE3, 0x00, 0x00, 0x00, 0x00, 0xBE, 0x7C, /* 0x6C-0x6F */ + 0xE8, 0xE0, 0xE8, 0xDC, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0xE8, 0xDB, 0xE8, 0xDF, 0xE8, 0xDE, 0xBE, 0x7B, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0x7D, 0xEC, 0x78, /* 0x78-0x7B */ + 0xEC, 0x76, 0xEC, 0xA1, 0xEC, 0x77, 0x00, 0x00, /* 0x7C-0x7F */ + + 0xEC, 0x73, 0x00, 0x00, 0xEC, 0x79, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0xEC, 0x74, 0xEF, 0x72, 0xEC, 0x75, /* 0x84-0x87 */ + 0xEC, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xEC, 0x7C, 0xC0, 0x6A, 0xEC, 0x7B, 0xEC, 0x7A, /* 0x90-0x93 */ + 0x00, 0x00, 0xEC, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0x6A, 0xEF, 0x6D, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0x6C, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEF, 0x74, 0xEF, 0x6F, 0xEF, 0x73, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xEF, 0x71, 0xEF, 0x70, 0xEF, 0x6E, 0x00, 0x00, /* 0xA4-0xA7 */ + 0xEF, 0x6B, 0x00, 0x00, 0xC2, 0x43, 0xC2, 0x42, /* 0xA8-0xAB */ + 0x00, 0x00, 0xC2, 0x44, 0xC2, 0x41, 0xEF, 0x75, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0xF1, 0xC8, 0xF1, 0xCB, 0x00, 0x00, /* 0xB4-0xB7 */ + 0xF1, 0xC9, 0xF1, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 
0x00, 0x00, 0xF1, 0xCE, 0x00, 0x00, 0xF1, 0xC6, /* 0xBC-0xBF */ + 0xC3, 0x58, 0xF1, 0xC7, 0x00, 0x00, 0xF1, 0xC5, /* 0xC0-0xC3 */ + 0xF1, 0xCC, 0x00, 0x00, 0xF1, 0xC4, 0xF1, 0xC3, /* 0xC4-0xC7 */ + 0xC3, 0x57, 0xC3, 0x55, 0xC3, 0x54, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xCA, /* 0xD0-0xD3 */ + 0xF3, 0xCF, 0xF3, 0xD5, 0xC4, 0x4A, 0xF3, 0xD0, /* 0xD4-0xD7 */ + 0x00, 0x00, 0xF3, 0xD3, 0xF3, 0xD7, 0xC4, 0x4B, /* 0xD8-0xDB */ + 0xF3, 0xD2, 0x00, 0x00, 0xF3, 0xCA, 0x00, 0x00, /* 0xDC-0xDF */ + 0xF3, 0xC9, 0xF3, 0xD6, 0xF3, 0xCD, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF3, 0xCB, 0xF3, 0xD4, 0xF3, 0xCC, 0xC4, 0x49, /* 0xE4-0xE7 */ + 0xC4, 0x48, 0x00, 0x00, 0xF3, 0xC7, 0xF3, 0xC8, /* 0xE8-0xEB */ + 0xF3, 0xD1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0xF3, 0xCE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF5, 0x6C, /* 0xF4-0xF7 */ + 0xF5, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */ + 0x00, 0x00, 0xC3, 0x56, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9C[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0xF5, 0x6D, 0xF5, 0x73, 0xF5, 0x71, /* 0x04-0x07 */ + 0xF5, 0x6B, 0xF5, 0x76, 0x00, 0x00, 0xF5, 0x6A, /* 0x08-0x0B */ + 0x00, 0x00, 0xC4, 0xCF, 0xF5, 0x72, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0xF5, 0x6E, 0xC4, 0xCE, /* 0x10-0x13 */ + 0xF5, 0x75, 0x00, 0x00, 0x00, 0x00, 0xF5, 0x74, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0xF6, 0xAB, 0xF6, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0xF6, 0xB1, 0x00, 0x00, 0xF6, 0xAD, /* 0x20-0x23 */ + 0xF6, 0xB0, 0xC5, 0x60, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xF6, 0xAE, 0xF6, 0xAF, 0x00, 0x00, 0xF6, 0xA9, /* 0x28-0x2B */ + 0xF6, 0xAC, 0xC5, 0x5F, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0xC5, 0xBF, 0xF7, 0xB4, 0xF7, 0xAF, /* 0x30-0x33 */ + 0xF7, 0xB3, 0x00, 0x00, 0xF7, 0xB6, 0xF7, 0xB2, /* 0x34-0x37 */ + 0x00, 0x00, 0xF7, 0xAE, 0x00, 0x00, 0xC5, 0xC1, /* 0x38-0x3B */ + 0xF7, 0xB1, 0xF7, 0xB5, 0xC5, 0xC0, 0xF7, 0xAC, /* 0x3C-0x3F */ + 0xF5, 0x70, 0xF7, 0xB0, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0xF7, 0xAD, 0x00, 0x00, 0xF7, 0xAA, 0x00, 0x00, /* 0x44-0x47 */ + 0xF7, 0xAB, 0xC5, 0xBE, 0xF8, 0x5A, 0xF8, 0x5C, /* 0x48-0x4B */ + 0xF8, 0x5F, 0xF8, 0x5B, 0xF8, 0x60, 0x00, 0x00, /* 0x4C-0x4F */ + 0xF8, 0x59, 0x00, 0x00, 0xF8, 0x57, 0x00, 0x00, /* 0x50-0x53 */ + 0xC5, 0xEB, 0xF8, 0x5D, 0xC5, 0xED, 0xC5, 0xEC, /* 0x54-0x57 */ + 0xF8, 0x58, 0xF8, 0x5E, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xDA, 0xC6, 0x4D, /* 0x5C-0x5F */ + 0xF8, 0xDB, 0x00, 0x00, 0xF8, 0xD9, 0xF8, 0xD6, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xD8, 0xF8, 0xD7, /* 0x64-0x67 */ + 0xF9, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0xF9, 0x5C, 0xF9, 0x5B, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0xF9, 0x79, 0x00, 0x00, 0xF9, 0x78, /* 0x70-0x73 */ + 0xF9, 0x77, 0xF9, 0x7A, 0x00, 0x00, 0xC6, 0x73, /* 0x74-0x77 */ + 0xC6, 0x74, 0xF9, 0xCA, 0xF9, 0xCE, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xB3, 0xBE, 0xDC, 0xAF, 0xE0, 0xED, /* 0xE4-0xE7 */ + 0x00, 0x00, 0xB9, 0xA7, 0xE0, 0xEB, 0x00, 0x00, /* 0xE8-0xEB */ + 0x00, 0x00, 0xE0, 0xEC, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */ + 0x00, 0x00, 0xE4, 0xE2, 0xE4, 0xE3, 0xBB, 0xF1, /* 0xF0-0xF3 */ + 0xBB, 0xEF, 0xE4, 0xE4, 0xBB, 0xF0, 0xE8, 0xE8, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xE8, 0xEB, 0xE8, 0xE5, 0xE8, 0xEC, /* 0xF8-0xFB */ + 0xE8, 0xE4, 0xE8, 0xE6, 0x00, 0x00, 0xE8, 0xE7, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9D[512] = { + 0xE8, 0xEA, 0x00, 0x00, 0x00, 0x00, 0xBE, 0xA1, /* 0x00-0x03 */ + 0xE8, 0xEF, 0xE8, 0xEE, 0xBE, 0x7D, 0xE8, 0xE9, /* 0x04-0x07 */ + 0xE8, 0xED, 0xBE, 0x7E, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xEC, 0xAC, 0x00, 0x00, 0xC0, 0x6F, 0x00, 0x00, /* 0x10-0x13 */ + 0xEC, 0xA7, 0xC0, 0x6B, 0x00, 0x00, 0xEC, 0xA4, /* 0x14-0x17 */ + 0xEC, 0xAA, 0xEC, 0xAD, 0x00, 0x00, 0xC0, 0x70, /* 0x18-0x1B */ + 0x00, 0x00, 0xEC, 0xA9, 0xEC, 0xA6, 0xEC, 0xAE, /* 0x1C-0x1F */ + 0xEC, 0xA5, 0x00, 0x00, 0xEC, 0xAB, 0xC0, 0x6C, /* 0x20-0x23 */ + 0x00, 0x00, 0xEC, 0xA3, 0xC0, 0x6D, 0x00, 0x00, /* 0x24-0x27 */ + 0xC0, 0x6E, 0xEC, 0xA8, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0xEF, 0xA9, 0xEF, 0x7A, 0xEF, 0x7B, /* 0x2C-0x2F */ + 0xEF, 0x7E, 0xEF, 0x7C, 0x00, 0x00, 0xEF, 0x76, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0xEF, 0x79, 0xEF, 0xA5, /* 0x34-0x37 */ + 0xEF, 0x7D, 0x00, 0x00, 0x00, 0x00, 0xC2, 0x45, /* 0x38-0x3B */ + 0x00, 0x00, 0xEF, 0xA7, 0xEF, 0xA4, 0xC2, 0x46, /* 0x3C-0x3F */ + 0xEF, 0xA6, 0xEF, 0x77, 0xEF, 0xA2, 0xEF, 0xA3, /* 0x40-0x43 */ + 0x00, 0x00, 0xEF, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD2, 0xF1, 0xD4, /* 0x48-0x4B */ + 0xF1, 0xD7, 0x00, 0x00, 0x00, 0x00, 0xF1, 0xD1, /* 0x4C-0x4F */ + 0x00, 0x00, 0xC3, 0x59, 0xF1, 0xD9, 0xF1, 0xD0, /* 0x50-0x53 */ + 0xF1, 0xDA, 0x00, 0x00, 0xF1, 0xD6, 0xF1, 0xD8, /* 0x54-0x57 */ + 0xF1, 0xDC, 0xF1, 0xD5, 0xF1, 0xDD, 0xF1, 0xD3, /* 0x58-0x5B */ + 0xF1, 0xCF, 0xC3, 0x5A, 0x00, 0x00, 0xF1, 0xDB, /* 0x5C-0x5F */ + 0xC3, 0x5B, 0xC4, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xEF, 0x78, /* 0x64-0x67 */ + 0xF3, 0xF1, 0xF3, 0xE8, 0xC4, 0x4F, 0xF3, 0xE4, /* 0x68-0x6B */ + 0xC4, 0x50, 0x00, 0x00, 0x00, 0x00, 0xF3, 0xED, /* 0x6C-0x6F */ + 0xF3, 0xE7, 0xF3, 0xDD, 0xC4, 0x4E, 0xF3, 0xEA, /* 0x70-0x73 */ + 0xF3, 0xE5, 0xF3, 0xE6, 0x00, 0x00, 0xF3, 0xD8, /* 0x74-0x77 */ + 0xF3, 0xDF, 0xF3, 0xEE, 0x00, 0x00, 0xF3, 0xEB, /* 0x78-0x7B */ + 0x00, 0x00, 0xF3, 0xE3, 0x00, 0x00, 0xF3, 0xEF, /* 0x7C-0x7F */ + + 0xF3, 0xDE, 0xF3, 0xD9, 0xF3, 0xEC, 0x00, 0x00, /* 0x80-0x83 */ + 0xF3, 0xDB, 0xF3, 0xE9, 0xF3, 0xE0, 0xF3, 0xF0, /* 0x84-0x87 */ + 0xF3, 0xDC, 0xC4, 0x4C, 0xF3, 0xDA, 0xF3, 0xE1, /* 0x88-0x8B */ + 0xF3, 0xE2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xF5, 0x7D, 0x00, 0x00, 0xF5, 0x7B, 0x00, 0x00, /* 0x90-0x93 */ + 0xF5, 0xA2, 0x00, 0x00, 0xF5, 0xAE, 0xF5, 0xA5, /* 0x94-0x97 */ + 0xF5, 0x7C, 0xF5, 0x78, 0xF5, 0xA7, 0xF5, 0x7E, /* 0x98-0x9B */ + 0xF5, 0xA3, 0xF5, 0x7A, 0xF5, 0xAA, 0xF5, 0x77, /* 0x9C-0x9F */ + 0xF5, 0xA1, 0xF5, 0xA6, 0xF5, 0xA8, 0xF5, 0xAB, /* 0xA0-0xA3 */ + 0xF5, 0x79, 0x00, 0x00, 0xF5, 0xAF, 0xF5, 0xB0, /* 0xA4-0xA7 */ + 0xF5, 0xA9, 0xF5, 0xAD, 0xF5, 0xA4, 0x00, 0x00, /* 0xA8-0xAB */ + 0xF6, 0xC1, 0xF6, 0xC4, 0x00, 0x00, 0xC5, 0x61, /* 0xAC-0xAF */ + 0x00, 0x00, 0xF6, 0xC3, 0xF6, 0xC8, 0xF6, 0xC6, /* 0xB0-0xB3 */ + 0xC5, 0x62, 0xF6, 0xBD, 0xF6, 0xB3, 0xF6, 0xB2, /* 0xB4-0xB7 */ + 0xC5, 0x64, 0xF6, 0xBF, 0xF6, 0xC0, 0xF6, 0xBC, /* 0xB8-0xBB */ + 0xF6, 0xB4, 0x00, 0x00, 0xF6, 0xB9, 0xF5, 0xAC, /* 0xBC-0xBF */ + 0x00, 0x00, 0xF6, 0xB5, 0xC5, 0x63, 0xF6, 0xBB, /* 0xC0-0xC3 */ + 0x00, 0x00, 0xF6, 0xBA, 0x00, 0x00, 0xF6, 0xB6, /* 0xC4-0xC7 */ + 0xF6, 0xC2, 0x00, 0x00, 0xF6, 0xB7, 0xF7, 0xBB, /* 0xC8-0xCB */ + 0xF6, 0xC5, 0xF6, 0xC7, 0xF6, 0xBE, 0xF6, 0xB8, /* 0xCC-0xCF */ + 0xF7, 0xBC, 0xF7, 0xBE, 0xF7, 0xB8, 0xC5, 0xC2, /* 0xD0-0xD3 */ + 0x00, 0x00, 0xF7, 0xC5, 0xF7, 0xC3, 0xC5, 0xC3, /* 0xD4-0xD7 */ + 0xF7, 0xC2, 0xF7, 0xC1, 0xF7, 0xBA, 0xF7, 0xB7, /* 0xD8-0xDB */ + 0xF7, 0xBD, 0xF7, 0xC6, 0xF7, 0xB9, 0xF7, 0xBF, /* 0xDC-0xDF */ + 0x00, 0x00, 0xF8, 0x69, 0xF8, 0x6E, 0xF8, 0x64, /* 0xE0-0xE3 */ + 0xF8, 0x67, 0xC5, 0xEE, 0xF8, 0x6B, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xF8, 0x72, 0xF7, 0xC0, 0x00, 0x00, 0xF8, 0x65, /* 0xE8-0xEB */ + 0xF8, 0x6F, 0xF8, 0x73, 0xF8, 0x6A, 0xF8, 0x63, /* 0xEC-0xEF */ + 0xF8, 0x6D, 0x00, 0x00, 0xF8, 0x6C, 0xF8, 0x71, /* 0xF0-0xF3 */ + 0xF8, 0x70, 0xF7, 0xC4, 0xF8, 0x68, 0xF8, 0x62, /* 0xF4-0xF7 */ + 0xF8, 0x66, 0xC6, 0x4E, 0xC6, 0x4F, 0xF8, 0x61, /* 0xF8-0xFB */ + 0x00, 0x00, 0xF8, 0xE6, 0xF8, 0xDD, 0xF8, 0xE5, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9E[512] = { + 0xF8, 0xE2, 0xF8, 0xE3, 0xF8, 0xDC, 0xF8, 0xDF, /* 0x00-0x03 */ + 0xF8, 0xE7, 0xF8, 0xE1, 0xF8, 0xE0, 0xF8, 0xDE, /* 0x04-0x07 */ + 0x00, 0x00, 0xF8, 0xE4, 0x00, 0x00, 0xF9, 0x5D, /* 0x08-0x0B */ + 0x00, 0x00, 0xF9, 0x5E, 0x00, 0x00, 0xF9, 0x60, /* 0x0C-0x0F */ + 0xF9, 0x5F, 0xF9, 0x62, 0xF9, 0x61, 0xF9, 0x7C, /* 0x10-0x13 */ + 0xF9, 0x7B, 0xF9, 0xB7, 0x00, 0x00, 0xF9, 0xB8, /* 0x14-0x17 */ + 0x00, 0x00, 0xF9, 0xC5, 0xC6, 0x78, 0xC6, 0x7C, /* 0x18-0x1B */ + 0x00, 0x00, 0xF9, 0xCF, 0xC6, 0x7D, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0xB3, 0xBF, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0xC4, 0xD0, 0xF6, 0xC9, 0x00, 0x00, /* 0x78-0x7B */ + 0xC6, 0x50, 0xC6, 0x51, 0x00, 0x00, 0xB3, 0xC0, /* 0x7C-0x7F */ + + 0xE0, 0xEE, 0x00, 0x00, 0xB9, 0xA8, 0xE8, 0xF0, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0xEC, 0xB0, 0xEC, 0xB1, /* 0x84-0x87 */ + 0xEC, 0xAF, 0xEF, 0xAB, 0xEF, 0xAA, 0xC2, 0x47, /* 0x88-0x8B */ + 0xF1, 0xDF, 0xEF, 0xAC, 0xF1, 0xDE, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0xF3, 0xF3, 0xC4, 0x51, 0xC4, 0x53, /* 0x90-0x93 */ + 0xF3, 0xF2, 0x00, 0x00, 0x00, 0x00, 0xC4, 0x52, /* 0x94-0x97 */ + 0x00, 0x00, 0xF5, 0xB1, 0xF5, 0xB3, 0xF5, 0xB2, /* 0x98-0x9B */ + 0xF6, 0xCA, 0xC5, 0x65, 0x00, 0x00, 0xC5, 0xEF, /* 0x9C-0x9F */ + 0xF8, 0xE8, 0xF9, 0x63, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF9, 0xD2, 0xB3, 0xC1, 0x00, 0x00, 0xE4, 0xE5, /* 0xA4-0xA7 */ + 0x00, 0x00, 0xBE, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0xEC, 0xB3, 0xEC, 0xB2, 0x00, 0x00, /* 0xAC-0xAF */ + 0xEF, 0xAD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0xC4, 0x54, 0xC4, 0xD1, 0xF7, 0xC7, 0xF9, 0xCB, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xB3, 0xC2, /* 0xB8-0xBB */ + 0xBB, 0xF2, 0x00, 0x00, 0xBE, 0xA3, 0x00, 0x00, /* 0xBC-0xBF */ + 0xF3, 0xF4, 0x00, 0x00, 0xF8, 0x74, 0xB6, 0xC0, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0xEF, 0xAE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0xC6, 0x64, 0xB6, 0xC1, 0xBE, 0xA4, 0xC2, 0x48, /* 0xCC-0xCF */ + 0xF8, 0x75, 0xB6, 0xC2, 0x00, 0x00, 0xE8, 0xF1, /* 0xD0-0xD3 */ + 0xC0, 0x72, 0xEC, 0xB4, 0xEC, 0xB5, 0x00, 0x00, /* 0xD4-0xD7 */ + 0xC0, 0x71, 0x00, 0x00, 0xEF, 0xAF, 0xC2, 0x4C, /* 0xD8-0xDB */ + 0xC2, 0x4A, 0xC2, 0x4B, 0xC2, 0x49, 0xF1, 0xE0, /* 0xDC-0xDF */ + 0xC3, 0x5C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */ + 0xF5, 0xB5, 0xF5, 0xB4, 0xF5, 0xB7, 0xF5, 0xB6, /* 0xE4-0xE7 */ + 0xC4, 0xD2, 0x00, 0x00, 0x00, 0x00, 0xF6, 0xCB, /* 0xE8-0xEB */ + 0x00, 0x00, 0xF6, 0xCD, 0xF6, 0xCC, 0xC5, 0x66, /* 0xEC-0xEF */ + 0xF7, 0xC8, 0x00, 0x00, 0xF8, 0x76, 0xF8, 0x77, /* 0xF0-0xF3 */ + 0xC5, 0xF0, 0xF9, 0x64, 0xF9, 0x7D, 0xC6, 0x75, /* 0xF4-0xF7 */ + 0x00, 0x00, 0xDC, 0xB0, 0xEC, 0xB6, 0xEF, 0xB0, /* 0xF8-0xFB */ + 0xF3, 0xF5, 0xE0, 0xEF, 0x00, 0x00, 0xEF, 0xB1, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_9F[512] = { + 0xF1, 0xE2, 0xF1, 0xE1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0x78, 0xC6, 0x52, /* 0x04-0x07 */ + 0x00, 0x00, 0xF9, 0x65, 0xF9, 0x7E, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 
0x00, 0x00, 0x00, 0xB9, 0xA9, 0xE8, 0xF2, /* 0x0C-0x0F */ + 0xE8, 0xF3, 0x00, 0x00, 0xEC, 0xB7, 0xB9, 0xAA, /* 0x10-0x13 */ + 0x00, 0x00, 0xC3, 0x5D, 0xF1, 0xE3, 0x00, 0x00, /* 0x14-0x17 */ + 0xF6, 0xCF, 0xC5, 0x67, 0xF6, 0xD0, 0xF6, 0xCE, /* 0x18-0x1B */ + 0xF8, 0x79, 0x00, 0x00, 0xF8, 0xE9, 0x00, 0x00, /* 0x1C-0x1F */ + 0xB9, 0xAB, 0x00, 0x00, 0xEF, 0xB4, 0xEF, 0xB3, /* 0x20-0x23 */ + 0xEF, 0xB2, 0xF1, 0xE4, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0xF1, 0xE8, 0xF1, 0xE7, 0xF1, 0xE6, 0xF1, 0xE5, /* 0x28-0x2B */ + 0xC3, 0x5E, 0xF3, 0xF6, 0xF5, 0xB9, 0xC4, 0xD3, /* 0x2C-0x2F */ + 0xF5, 0xB8, 0xF6, 0xD1, 0xF7, 0xCB, 0xF7, 0xCA, /* 0x30-0x33 */ + 0xC5, 0xC4, 0xF7, 0xC9, 0xF8, 0x7C, 0xF8, 0x7B, /* 0x34-0x37 */ + 0xF8, 0x7A, 0x00, 0x00, 0x00, 0x00, 0xBB, 0xF3, /* 0x38-0x3B */ + 0x00, 0x00, 0xEC, 0xB8, 0xC2, 0x4D, 0x00, 0x00, /* 0x3C-0x3F */ + 0xF3, 0xF7, 0xF3, 0xF8, 0xF7, 0xCC, 0xF8, 0x7D, /* 0x40-0x43 */ + 0x00, 0x00, 0x00, 0x00, 0xF8, 0xEA, 0xF9, 0x66, /* 0x44-0x47 */ + 0xF9, 0xB9, 0xF9, 0xD4, 0xBB, 0xF4, 0xC2, 0x4E, /* 0x48-0x4B */ + 0xF1, 0xE9, 0xF3, 0xF9, 0xF6, 0xD2, 0xF8, 0x7E, /* 0x4C-0x4F */ + 0x00, 0x00, 0x00, 0x00, 0xBE, 0xA6, 0x00, 0x00, /* 0x50-0x53 */ + 0xEF, 0xB5, 0xF1, 0xEA, 0xF3, 0xFA, 0xF3, 0xFB, /* 0x54-0x57 */ + 0xF3, 0xFC, 0xF5, 0xBE, 0x00, 0x00, 0xF5, 0xBA, /* 0x58-0x5B */ + 0xC5, 0x68, 0xF5, 0xBD, 0xF5, 0xBC, 0xC4, 0xD4, /* 0x5C-0x5F */ + 0xF5, 0xBB, 0xC4, 0xD6, 0x00, 0x00, 0xC4, 0xD5, /* 0x60-0x63 */ + 0xF6, 0xD4, 0xF6, 0xD3, 0xC5, 0x69, 0xC5, 0x6A, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xC5, 0xC6, 0xF7, 0xCD, /* 0x68-0x6B */ + 0xC5, 0xC5, 0x00, 0x00, 0xF8, 0xA3, 0xF8, 0xA4, /* 0x6C-0x6F */ + 0xF8, 0xA2, 0xF8, 0xA1, 0xC6, 0x54, 0x00, 0x00, /* 0x70-0x73 */ + 0xF8, 0xEB, 0xF8, 0xEC, 0xF8, 0xED, 0xC6, 0x53, /* 0x74-0x77 */ + 0xF9, 0x67, 0xF9, 0x6A, 0xF9, 0x69, 0xF9, 0x68, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0xF9, 0xD3, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0xC0, 0x73, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0xC3, 0x65, 0xF5, 0xBF, 0xF6, 0xD5, 0x00, 0x00, /* 0x90-0x93 */ + 0xC5, 0xC7, 0xF7, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0xF9, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0xC0, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0xEF, 0xB6, 0x00, 0x00, 0xF7, 0xCF, 0x00, 0x00, /* 0xA0-0xA3 */ + 0xF9, 0xA1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ +}; + +static const unsigned char u2c_DC[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ +}; + +static const unsigned char u2c_F9[512] = { + 0xB0, 0x5A, 0xA7, 0xF3, 0xA8, 0xAE, 0xB8, 0xEB, /* 0x00-0x03 */ + 0xB7, 0xC6, 0xA6, 0xEA, 0xA5, 0x79, 0xC0, 0x74, /* 0x04-0x07 */ + 0xC0, 0x74, 0xAB, 0xB4, 0xAA, 0xF7, 0xB3, 0xE2, /* 0x08-0x0B */ + 0xA9, 0x60, 0xC3, 0x69, 0xC4, 0xEE, 0xC3, 0xB9, /* 0x0C-0x0F */ + 0xC5, 0xDA, 0xC1, 0xB3, 0xBB, 0x72, 0xC5, 0xDE, /* 0x10-0x13 */ + 0xBC, 0xD6, 0xAC, 0xA5, 0xAF, 0x4F, 0xAF, 0x5F, /* 0x14-0x17 */ + 0xB8, 0xA8, 0xB9, 0x54, 0xC0, 0x64, 0xB6, 0xC3, /* 0x18-0x1B */ + 0xA7, 0x5A, 0xC4, 0xE6, 0xC4, 0xEA, 0xC4, 0xF5, /* 0x1C-0x1F */ + 0xC6, 0x7D, 0xB4, 0x50, 0xC0, 0xDD, 0xC2, 0xC5, /* 0x20-0x23 */ + 0xC4, 0xB0, 0xA9, 0xD4, 0xC3, 0xBE, 0xC4, 0xFA, /* 0x24-0x27 */ + 0xB4, 0x59, 0xAE, 0xD4, 0xAE, 0xF6, 0xAF, 0x54, /* 0x28-0x2B */ + 0x00, 0x00, 0xA8, 0xD3, 0xA7, 0x4E, 0xB3, 0xD2, /* 0x2C-0x2F */ + 0xBE, 0xDB, 0xC3, 0x72, 
0xC4, 0x6C, 0xBF, 0x63, /* 0x30-0x33 */ + 0xA6, 0xD1, 0xC4, 0xAA, 0xB8, 0xB8, 0xB8, 0xF4, /* 0x34-0x37 */ + 0xC5, 0x53, 0xBE, 0x7C, 0xC6, 0x4F, 0xB8, 0x4C, /* 0x38-0x3B */ + 0xB8, 0x53, 0xBA, 0xF1, 0xDB, 0x77, 0xBF, 0xFD, /* 0x3C-0x3F */ + 0xB3, 0xC0, 0xBD, 0xD7, 0xC3, 0x62, 0xA7, 0xCB, /* 0x40-0x43 */ + 0xC5, 0xA2, 0xC5, 0xA4, 0xA8, 0x63, 0xBD, 0x55, /* 0x44-0x47 */ + 0xB8, 0xEF, 0xB9, 0x70, 0xC2, 0x53, 0xB9, 0xF0, /* 0x48-0x4B */ + 0xBC, 0xD3, 0xB2, 0x5C, 0xBA, 0x7C, 0xB2, 0xD6, /* 0x4C-0x4F */ + 0xC1, 0x5C, 0xAD, 0xAE, 0xB0, 0xC7, 0xA6, 0xD8, /* 0x50-0x53 */ + 0xBB, 0xFE, 0xAD, 0xE2, 0xB8, 0x57, 0xBA, 0xF0, /* 0x54-0x57 */ + 0xB5, 0xD9, 0xB3, 0xAE, 0xC5, 0xAA, 0xCE, 0xD4, /* 0x58-0x5B */ + 0xBC, 0xD6, 0xBF, 0xD5, 0xA4, 0xA6, 0xB9, 0xE7, /* 0x5C-0x5F */ + 0xAB, 0xE3, 0xB2, 0x76, 0xB2, 0xA7, 0xA5, 0x5F, /* 0x60-0x63 */ + 0xED, 0xA8, 0xAB, 0x4B, 0xB4, 0x5F, 0xA4, 0xA3, /* 0x64-0x67 */ + 0xAA, 0x63, 0xBC, 0xC6, 0xAF, 0xC1, 0xB0, 0xD1, /* 0x68-0x6B */ + 0xB6, 0xEB, 0xAC, 0xD9, 0xB8, 0xAD, 0xBB, 0xA1, /* 0x6C-0x6F */ + 0xB1, 0xFE, 0xA8, 0xB0, 0xA8, 0x48, 0xAC, 0x42, /* 0x70-0x73 */ + 0xAD, 0x59, 0xB1, 0xB0, 0xB2, 0xA4, 0xAB, 0x47, /* 0x74-0x77 */ + 0xA8, 0xE2, 0x00, 0x00, 0xB1, 0xE7, 0xC2, 0xB3, /* 0x78-0x7B */ + 0xA8, 0x7D, 0xBD, 0xCC, 0xB6, 0x71, 0xC0, 0x79, /* 0x7C-0x7F */ + + 0xA7, 0x66, 0xA4, 0x6B, 0xC3, 0x66, 0xAE, 0xC8, /* 0x80-0x83 */ + 0xC2, 0x6F, 0xC4, 0x72, 0xBE, 0x5B, 0xC6, 0x7A, /* 0x84-0x87 */ + 0xC4, 0x52, 0xBE, 0xA4, 0xA4, 0x4F, 0xBE, 0xE4, /* 0x88-0x8B */ + 0xBE, 0xFA, 0xF7, 0x65, 0xA6, 0x7E, 0xBC, 0xA6, /* 0x8C-0x8F */ + 0xC5, 0xCA, 0xBC, 0xBF, 0xBA, 0xA7, 0xB7, 0xD2, /* 0x90-0x93 */ + 0xE6, 0xA3, 0x00, 0x00, 0xBD, 0x6D, 0xC1, 0x70, /* 0x94-0x97 */ + 0xBD, 0xFB, 0xBD, 0xAC, 0xB3, 0x73, 0xC1, 0xE5, /* 0x98-0x9B */ + 0xA6, 0x43, 0xA6, 0x48, 0xAB, 0x7C, 0xAF, 0x50, /* 0x9C-0x9F */ + 0xB5, 0xF5, 0xBB, 0xA1, 0xB7, 0x47, 0xA9, 0xC0, /* 0xA0-0xA3 */ + 0xB1, 0xC9, 0xC0, 0xD4, 0xC3, 0xAE, 0xC2, 0x79, /* 0xA4-0xA7 */ + 0xA5, 0x4F, 0xCB, 0xF1, 0xB9, 0xE7, 0xC0, 0xAD, /* 0xA8-0xAB */ + 0xCC, 0xB0, 0xAC, 0xC2, 0xBC, 0xFC, 0xB2, 0xDC, /* 0xAC-0xAF */ + 0xB2, 0xE2, 0xB9, 0x61, 0xB9, 0x73, 0xC6, 0x46, /* 0xB0-0xB3 */ + 0xBB, 0xE2, 0xA8, 0xD2, 0xC2, 0xA7, 0xC4, 0xBF, /* 0xB4-0xB7 */ + 0xC1, 0xF5, 0xB4, 0x63, 0xA4, 0x46, 0xB9, 0xB1, /* 0xB8-0xBB */ + 0xBC, 0x64, 0xA7, 0xBF, 0xAE, 0xC6, 0xBC, 0xD6, /* 0xBC-0xBF */ + 0xBF, 0x52, 0xC0, 0xF8, 0xE7, 0x64, 0xBF, 0xF1, /* 0xC0-0xC3 */ + 0xC0, 0x73, 0xB7, 0x77, 0xA8, 0xBF, 0xBC, 0x42, /* 0xC4-0xC7 */ + 0xCC, 0xD8, 0xAC, 0x68, 0xAC, 0x79, 0xB7, 0xC8, /* 0xC8-0xCB */ + 0xAF, 0x5B, 0xAF, 0x64, 0xB2, 0xB8, 0xAF, 0xC3, /* 0xCC-0xCF */ + 0xC3, 0xFE, 0xA4, 0xBB, 0xBC, 0xAE, 0xB3, 0xB0, /* 0xD0-0xD3 */ + 0xAD, 0xDB, 0xB1, 0x5B, 0xB2, 0x5F, 0xBD, 0xFC, /* 0xD4-0xD7 */ + 0xAB, 0xDF, 0xB7, 0x58, 0xAE, 0xDF, 0xB2, 0x76, /* 0xD8-0xDB */ + 0xB6, 0xA9, 0xA7, 0x51, 0xA6, 0x4F, 0xBC, 0x69, /* 0xDC-0xDF */ + 0xA9, 0xF6, 0xA7, 0xF5, 0xB1, 0xF9, 0xAA, 0x64, /* 0xE0-0xE3 */ + 0xB2, 0x7A, 0xB5, 0x67, 0xBF, 0xA9, 0x00, 0x00, /* 0xE4-0xE7 */ + 0xB8, 0xCC, 0xA8, 0xBD, 0xC2, 0xF7, 0xB0, 0xCE, /* 0xE8-0xEB */ + 0xB7, 0xC4, 0xA7, 0x5B, 0xBF, 0x4D, 0xBF, 0x5A, /* 0xEC-0xEF */ + 0xC4, 0xA9, 0x00, 0x00, 0xC5, 0xEC, 0xC5, 0xEF, /* 0xF0-0xF3 */ + 0xAA, 0x4C, 0xB2, 0x4F, 0xC1, 0x7B, 0xA5, 0xDF, /* 0xF4-0xF7 */ + 0xB2, 0xC1, 0xB2, 0xC9, 0xAA, 0xAC, 0xAA, 0xA5, /* 0xF8-0xFB */ + 0xC3, 0xD1, 0xA4, 0xB0, 0xAF, 0xF9, 0xA8, 0xEB, /* 0xFC-0xFF */ +}; + +static const unsigned char u2c_FA[512] = { + 0xA4, 0xC1, 0xAB, 0xD7, 0xA9, 0xDD, 0xBF, 0x7D, /* 0x00-0x03 */ + 0xA6, 0x76, 0xAC, 0x7D, 
0xBC, 0xC9, 0xBF, 0xE7, /* 0x04-0x07 */ + 0xA6, 0xE6, 0xAD, 0xB0, 0xA8, 0xA3, 0xB9, 0xF8, /* 0x08-0x0B */ + 0xC9, 0x4A, 0xDD, 0xFC, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0xB6, 0xEF, 0x00, 0x00, 0xB4, 0xB8, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0xE8, 0xF9, 0xBD, 0xDE, 0xAF, 0x71, /* 0x14-0x17 */ + 0x00, 0x00, 0xAF, 0xAB, 0xB2, 0xBB, 0xBA, 0xD6, /* 0x18-0x1B */ + 0xB9, 0x74, 0xBA, 0xEB, 0xA6, 0xD0, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0xBD, 0xD1, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0xB6, 0x68, 0xB3, 0xA3, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0xB6, 0xBA, 0xB9, 0x7D, /* 0x28-0x2B */ + 0xC0, 0x5D, 0xC5, 0x62, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ +}; + +static const unsigned char u2c_FE[512] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */ + 0xA1, 0x4A, 0xA1, 0x57, 0x00, 0x00, 0xA1, 0x59, /* 0x30-0x33 */ + 0xA1, 0x5B, 0xA1, 0x5F, 0xA1, 0x60, 0xA1, 0x63, /* 0x34-0x37 */ + 0xA1, 0x64, 0xA1, 0x67, 0xA1, 0x68, 0xA1, 0x6B, /* 0x38-0x3B */ + 0xA1, 0x6C, 0xA1, 0x6F, 0xA1, 0x70, 0xA1, 0x73, /* 0x3C-0x3F */ + 0xA1, 0x74, 0xA1, 0x77, 0xA1, 0x78, 0xA1, 0x7B, /* 0x40-0x43 */ + 0xA1, 0x7C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */ + 0x00, 0x00, 0xA1, 0xC6, 0xA1, 0xC7, 0xA1, 0xCA, /* 0x48-0x4B */ + 0xA1, 0xCB, 0xA1, 0xC8, 0xA1, 0xC9, 0xA1, 0x5C, /* 0x4C-0x4F */ + 0xA1, 0x4D, 0xA1, 0x4E, 0xA1, 0x4F, 0x00, 0x00, /* 0x50-0x53 */ + 0xA1, 0x51, 0xA1, 0x52, 0xA1, 0x53, 0xA1, 0x54, /* 0x54-0x57 */ + 0x00, 0x00, 0xA1, 0x7D, 0xA1, 0x7E, 0xA1, 0xA1, /* 0x58-0x5B */ + 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA4, 0xA1, 0xCC, /* 0x5C-0x5F */ + 0xA1, 0xCD, 0xA1, 0xCE, 0xA1, 0xDE, 0xA1, 0xDF, /* 0x60-0x63 */ + 0xA1, 0xE0, 0xA1, 0xE1, 0xA1, 0xE2, 0x00, 0x00, /* 0x64-0x67 */ + 0xA2, 0x42, 0xA2, 0x4C, 0xA2, 0x4D, 0xA2, 0x4E, /* 0x68-0x6B */ +}; + +static const unsigned char u2c_FF[512] = { + 0x00, 0x00, 0xA1, 0x49, 0xA1, 0xA8, 0xA1, 0xAD, /* 0x00-0x03 */ + 0xA2, 0x43, 0xA2, 0x48, 0xA1, 0xAE, 0xA1, 0xA6, /* 0x04-0x07 */ + 0xA1, 0x5D, 0xA1, 0x5E, 0xA1, 0xAF, 0xA1, 0xCF, /* 0x08-0x0B */ + 0xA1, 0x41, 0xA1, 0xD0, 0xA1, 0x44, 0xA1, 0xFE, /* 0x0C-0x0F */ + 0xA2, 0xAF, 0xA2, 0xB0, 0xA2, 0xB1, 0xA2, 0xB2, /* 0x10-0x13 */ + 0xA2, 0xB3, 0xA2, 0xB4, 0xA2, 0xB5, 0xA2, 0xB6, /* 0x14-0x17 */ + 0xA2, 0xB7, 0xA2, 0xB8, 0xA1, 0x47, 0xA1, 0x46, /* 0x18-0x1B */ + 0xA1, 0xD5, 0xA1, 0xD7, 0xA1, 0xD6, 0xA1, 0x48, /* 0x1C-0x1F */ + 0xA2, 0x49, 0xA2, 0xCF, 0xA2, 0xD0, 0xA2, 0xD1, /* 0x20-0x23 */ + 0xA2, 0xD2, 0xA2, 0xD3, 0xA2, 0xD4, 0xA2, 0xD5, /* 0x24-0x27 */ + 0xA2, 0xD6, 0xA2, 0xD7, 0xA2, 0xD8, 0xA2, 0xD9, /* 0x28-0x2B */ + 0xA2, 0xDA, 0xA2, 0xDB, 0xA2, 0xDC, 0xA2, 0xDD, /* 0x2C-0x2F */ + 0xA2, 0xDE, 0xA2, 0xDF, 0xA2, 0xE0, 0xA2, 0xE1, /* 0x30-0x33 */ + 0xA2, 0xE2, 0xA2, 0xE3, 0xA2, 0xE4, 0xA2, 0xE5, /* 0x34-0x37 */ + 0xA2, 0xE6, 0xA2, 0xE7, 0xA2, 0xE8, 0xA1, 
0x65, /* 0x38-0x3B */ + 0xA2, 0x40, 0xA1, 0x66, 0xA1, 0x73, 0xA1, 0xC4, /* 0x3C-0x3F */ + 0xA1, 0xA5, 0xA2, 0xE9, 0xA2, 0xEA, 0xA2, 0xEB, /* 0x40-0x43 */ + 0xA2, 0xEC, 0xA2, 0xED, 0xA2, 0xEE, 0xA2, 0xEF, /* 0x44-0x47 */ + 0xA2, 0xF0, 0xA2, 0xF1, 0xA2, 0xF2, 0xA2, 0xF3, /* 0x48-0x4B */ + 0xA2, 0xF4, 0xA2, 0xF5, 0xA2, 0xF6, 0xA2, 0xF7, /* 0x4C-0x4F */ + 0xA2, 0xF8, 0xA2, 0xF9, 0xA2, 0xFA, 0xA2, 0xFB, /* 0x50-0x53 */ + 0xA2, 0xFC, 0xA2, 0xFD, 0xA2, 0xFE, 0xA3, 0x40, /* 0x54-0x57 */ + 0xA3, 0x41, 0xA3, 0x42, 0xA3, 0x43, 0xA1, 0x61, /* 0x58-0x5B */ + 0xA1, 0x55, 0xA1, 0x62, 0xA1, 0xE3, 0x00, 0x00, /* 0x5C-0x5F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */ + 0xA1, 0x4E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */ + 0xA2, 0x46, 0xA2, 0x47, 0x00, 0x00, 0xA1, 0xC3, /* 0xE0-0xE3 */ + 0x00, 0x00, 0xA2, 0x44, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + NULL, NULL, u2c_02, u2c_03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_20, u2c_21, u2c_22, u2c_23, NULL, u2c_25, u2c_26, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + u2c_30, u2c_31, u2c_32, u2c_33, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, u2c_4E, u2c_4F, + u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, + u2c_58, u2c_59, u2c_5A, 
u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, + u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, + u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, + u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, + u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, + u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, + u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, + u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, + u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, u2c_DC, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, u2c_F9, u2c_FA, NULL, NULL, NULL, u2c_FE, u2c_FF, }; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 
0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni&0xFF; + unsigned char ch = (uni>>8)&0xFF; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + + uni2charset = page_uni2charset[ch]; + if (uni2charset) { + if (boundlen <= 1) + return -ENAMETOOLONG; + out[0] = uni2charset[cl*2]; + out[1] = uni2charset[cl*2+1]; + if (out[0] == 0x00 && out[1] == 0x00) + return -EINVAL; + n = 2; + } else if (ch==0 && cl) { + out[0] = cl; + n = 1; + } + else + return -EINVAL; + + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char ch, cl; + const wchar_t *charset2uni; + int n; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (boundlen == 1) { + *uni = rawstring[0]; + return 1; + } + + ch = rawstring[0]; + cl = rawstring[1]; + + charset2uni = page_charset2uni[ch]; + if (charset2uni && cl) { + *uni = charset2uni[cl]; + if (*uni == 0x0000) + return -EINVAL; + n = 2; + } else{ + *uni = ch; + n = 1; + } + return n; +} + +static struct nls_table table = { + .charset = "cp950", + .alias = "big5", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_cp950(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_cp950(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_cp950) +module_exit(exit_nls_cp950) + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NLS(big5); diff --git 
a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c new file mode 100644 index 00000000..7424929a --- /dev/null +++ b/fs/nls/nls_euc-jp.c @@ -0,0 +1,581 @@ +/* + * linux/fs/nls/nls_euc-jp.c + * + * Added `OSF/JVC Recommended Code Set Conversion Specification + * between Japanese EUC and Shift-JIS' support: + * (http://www.opengroup.or.jp/jvc/cde/sjis-euc-e.html) + */ + +#include +#include +#include +#include +#include + +static struct nls_table *p_nls; + +#define IS_SJIS_LOW_BYTE(l) ((0x40 <= (l)) && ((l) <= 0xFC) && ((l) != 0x7F)) +/* JIS X 0208 (include NEC spesial characters) */ +#define IS_SJIS_JISX0208(h, l) ((((0x81 <= (h)) && ((h) <= 0x9F)) \ + || ((0xE0 <= (h)) && ((h) <= 0xEA))) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_JISX0201KANA(c) ((0xA1 <= (c)) && ((c) <= 0xDF)) +#define IS_SJIS_UDC_LOW(h, l) (((0xF0 <= (h)) && ((h) <= 0xF4)) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_UDC_HI(h, l) (((0xF5 <= (h)) && ((h) <= 0xF9)) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_IBM(h, l) (((0xFA <= (h)) && ((h) <= 0xFC)) \ + && IS_SJIS_LOW_BYTE(l)) +#define IS_SJIS_NECIBM(h, l) (((0xED <= (h)) && ((h) <= 0xEE)) \ + && IS_SJIS_LOW_BYTE(l)) +#define MAP_SJIS2EUC(sjis_hi, sjis_lo, sjis_p, euc_hi, euc_lo, euc_p) { \ + if ((sjis_lo) >= 0x9F) { \ + (euc_hi) = (sjis_hi) * 2 - (((sjis_p) * 2 - (euc_p)) - 1); \ + (euc_lo) = (sjis_lo) + 2; \ + } else { \ + (euc_hi) = (sjis_hi) * 2 - ((sjis_p) * 2 - (euc_p)); \ + (euc_lo) = (sjis_lo) + ((sjis_lo) >= 0x7F ? 0x60 : 0x61); \ + } \ +} while(0) + +#define SS2 (0x8E) /* Single Shift 2 */ +#define SS3 (0x8F) /* Single Shift 3 */ +#define IS_EUC_BYTE(c) ((0xA1 <= (c)) && ((c) <= 0xFE)) +#define IS_EUC_JISX0208(h, l) (IS_EUC_BYTE(h) && IS_EUC_BYTE(l)) +#define IS_EUC_JISX0201KANA(h, l) (((h) == SS2) && (0xA1 <= (l) && (l) <= 0xDF)) +#define IS_EUC_UDC_LOW(h, l) (((0xF5 <= (h)) && ((h) <= 0xFE)) \ + && IS_EUC_BYTE(l)) +#define IS_EUC_UDC_HI(h, l) IS_EUC_UDC_LOW(h, l) /* G3 block */ +#define MAP_EUC2SJIS(euc_hi, euc_lo, euc_p, sjis_hi, sjis_lo, sjis_p) { \ + if ((euc_hi) & 1) { \ + (sjis_hi) = (euc_hi) / 2 + ((sjis_p) - (euc_p) / 2); \ + (sjis_lo) = (euc_lo) - ((euc_lo) >= 0xE0 ? 
0x60 : 0x61); \ + } else { \ + (sjis_hi) = (euc_hi) / 2 + (((sjis_p) - (euc_p) / 2) - 1); \ + (sjis_lo) = (euc_lo) - 2; \ + } \ +} while(0) + +/* SJIS IBM extended characters to EUC map */ +static const unsigned char sjisibm2euc_map[][2] = { + {0xF3, 0xF3}, {0xF3, 0xF4}, {0xF3, 0xF5}, {0xF3, 0xF6}, {0xF3, 0xF7}, + {0xF3, 0xF8}, {0xF3, 0xF9}, {0xF3, 0xFA}, {0xF3, 0xFB}, {0xF3, 0xFC}, + {0xF3, 0xFD}, {0xF3, 0xFE}, {0xF4, 0xA1}, {0xF4, 0xA2}, {0xF4, 0xA3}, + {0xF4, 0xA4}, {0xF4, 0xA5}, {0xF4, 0xA6}, {0xF4, 0xA7}, {0xF4, 0xA8}, + {0xA2, 0xCC}, {0xA2, 0xC3}, {0xF4, 0xA9}, {0xF4, 0xAA}, {0xF4, 0xAB}, + {0xF4, 0xAC}, {0xF4, 0xAD}, {0xA2, 0xE8}, {0xD4, 0xE3}, {0xDC, 0xDF}, + {0xE4, 0xE9}, {0xE3, 0xF8}, {0xD9, 0xA1}, {0xB1, 0xBB}, {0xF4, 0xAE}, + {0xC2, 0xAD}, {0xC3, 0xFC}, {0xE4, 0xD0}, {0xC2, 0xBF}, {0xBC, 0xF4}, + {0xB0, 0xA9}, {0xB0, 0xC8}, {0xF4, 0xAF}, {0xB0, 0xD2}, {0xB0, 0xD4}, + {0xB0, 0xE3}, {0xB0, 0xEE}, {0xB1, 0xA7}, {0xB1, 0xA3}, {0xB1, 0xAC}, + {0xB1, 0xA9}, {0xB1, 0xBE}, {0xB1, 0xDF}, {0xB1, 0xD8}, {0xB1, 0xC8}, + {0xB1, 0xD7}, {0xB1, 0xE3}, {0xB1, 0xF4}, {0xB1, 0xE1}, {0xB2, 0xA3}, + {0xF4, 0xB0}, {0xB2, 0xBB}, {0xB2, 0xE6}, {0x00, 0x00}, {0xB2, 0xED}, + {0xB2, 0xF5}, {0xB2, 0xFC}, {0xF4, 0xB1}, {0xB3, 0xB5}, {0xB3, 0xD8}, + {0xB3, 0xDB}, {0xB3, 0xE5}, {0xB3, 0xEE}, {0xB3, 0xFB}, {0xF4, 0xB2}, + {0xF4, 0xB3}, {0xB4, 0xC0}, {0xB4, 0xC7}, {0xB4, 0xD0}, {0xB4, 0xDE}, + {0xF4, 0xB4}, {0xB5, 0xAA}, {0xF4, 0xB5}, {0xB5, 0xAF}, {0xB5, 0xC4}, + {0xB5, 0xE8}, {0xF4, 0xB6}, {0xB7, 0xC2}, {0xB7, 0xE4}, {0xB7, 0xE8}, + {0xB7, 0xE7}, {0xF4, 0xB7}, {0xF4, 0xB8}, {0xF4, 0xB9}, {0xB8, 0xCE}, + {0xB8, 0xE1}, {0xB8, 0xF5}, {0xB8, 0xF7}, {0xB8, 0xF8}, {0xB8, 0xFC}, + {0xB9, 0xAF}, {0xB9, 0xB7}, {0xBA, 0xBE}, {0xBA, 0xDB}, {0xCD, 0xAA}, + {0xBA, 0xE1}, {0xF4, 0xBA}, {0xBA, 0xEB}, {0xBB, 0xB3}, {0xBB, 0xB8}, + {0xF4, 0xBB}, {0xBB, 0xCA}, {0xF4, 0xBC}, {0xF4, 0xBD}, {0xBB, 0xD0}, + {0xBB, 0xDE}, {0xBB, 0xF4}, {0xBB, 0xF5}, {0xBB, 0xF9}, {0xBC, 0xE4}, + {0xBC, 0xED}, {0xBC, 0xFE}, {0xF4, 0xBE}, {0xBD, 0xC2}, {0xBD, 0xE7}, + {0xF4, 0xBF}, {0xBD, 0xF0}, {0xBE, 0xB0}, {0xBE, 0xAC}, {0xF4, 0xC0}, + {0xBE, 0xB3}, {0xBE, 0xBD}, {0xBE, 0xCD}, {0xBE, 0xC9}, {0xBE, 0xE4}, + {0xBF, 0xA8}, {0xBF, 0xC9}, {0xC0, 0xC4}, {0xC0, 0xE4}, {0xC0, 0xF4}, + {0xC1, 0xA6}, {0xF4, 0xC1}, {0xC1, 0xF5}, {0xC1, 0xFC}, {0xF4, 0xC2}, + {0xC1, 0xF8}, {0xC2, 0xAB}, {0xC2, 0xA1}, {0xC2, 0xA5}, {0xF4, 0xC3}, + {0xC2, 0xB8}, {0xC2, 0xBA}, {0xF4, 0xC4}, {0xC2, 0xC4}, {0xC2, 0xD2}, + {0xC2, 0xD7}, {0xC2, 0xDB}, {0xC2, 0xDE}, {0xC2, 0xED}, {0xC2, 0xF0}, + {0xF4, 0xC5}, {0xC3, 0xA1}, {0xC3, 0xB5}, {0xC3, 0xC9}, {0xC3, 0xB9}, + {0xF4, 0xC6}, {0xC3, 0xD8}, {0xC3, 0xFE}, {0xF4, 0xC7}, {0xC4, 0xCC}, + {0xF4, 0xC8}, {0xC4, 0xD9}, {0xC4, 0xEA}, {0xC4, 0xFD}, {0xF4, 0xC9}, + {0xC5, 0xA7}, {0xC5, 0xB5}, {0xC5, 0xB6}, {0xF4, 0xCA}, {0xC5, 0xD5}, + {0xC6, 0xB8}, {0xC6, 0xD7}, {0xC6, 0xE0}, {0xC6, 0xEA}, {0xC6, 0xE3}, + {0xC7, 0xA1}, {0xC7, 0xAB}, {0xC7, 0xC7}, {0xC7, 0xC3}, {0xC7, 0xCB}, + {0xC7, 0xCF}, {0xC7, 0xD9}, {0xF4, 0xCB}, {0xF4, 0xCC}, {0xC7, 0xE6}, + {0xC7, 0xEE}, {0xC7, 0xFC}, {0xC7, 0xEB}, {0xC7, 0xF0}, {0xC8, 0xB1}, + {0xC8, 0xE5}, {0xC8, 0xF8}, {0xC9, 0xA6}, {0xC9, 0xAB}, {0xC9, 0xAD}, + {0xF4, 0xCD}, {0xC9, 0xCA}, {0xC9, 0xD3}, {0xC9, 0xE9}, {0xC9, 0xE3}, + {0xC9, 0xFC}, {0xC9, 0xF4}, {0xC9, 0xF5}, {0xF4, 0xCE}, {0xCA, 0xB3}, + {0xCA, 0xBD}, {0xCA, 0xEF}, {0xCA, 0xF1}, {0xCB, 0xAE}, {0xF4, 0xCF}, + {0xCB, 0xCA}, {0xCB, 0xE6}, {0xCB, 0xEA}, {0xCB, 0xF0}, {0xCB, 0xF4}, + {0xCB, 0xEE}, {0xCC, 0xA5}, {0xCB, 0xF9}, {0xCC, 0xAB}, {0xCC, 0xAE}, 
+ {0xCC, 0xAD}, {0xCC, 0xB2}, {0xCC, 0xC2}, {0xCC, 0xD0}, {0xCC, 0xD9}, + {0xF4, 0xD0}, {0xCD, 0xBB}, {0xF4, 0xD1}, {0xCE, 0xBB}, {0xF4, 0xD2}, + {0xCE, 0xBA}, {0xCE, 0xC3}, {0xF4, 0xD3}, {0xCE, 0xF2}, {0xB3, 0xDD}, + {0xCF, 0xD5}, {0xCF, 0xE2}, {0xCF, 0xE9}, {0xCF, 0xED}, {0xF4, 0xD4}, + {0xF4, 0xD5}, {0xF4, 0xD6}, {0x00, 0x00}, {0xF4, 0xD7}, {0xD0, 0xE5}, + {0xF4, 0xD8}, {0xD0, 0xE9}, {0xD1, 0xE8}, {0xF4, 0xD9}, {0xF4, 0xDA}, + {0xD1, 0xEC}, {0xD2, 0xBB}, {0xF4, 0xDB}, {0xD3, 0xE1}, {0xD3, 0xE8}, + {0xD4, 0xA7}, {0xF4, 0xDC}, {0xF4, 0xDD}, {0xD4, 0xD4}, {0xD4, 0xF2}, + {0xD5, 0xAE}, {0xF4, 0xDE}, {0xD7, 0xDE}, {0xF4, 0xDF}, {0xD8, 0xA2}, + {0xD8, 0xB7}, {0xD8, 0xC1}, {0xD8, 0xD1}, {0xD8, 0xF4}, {0xD9, 0xC6}, + {0xD9, 0xC8}, {0xD9, 0xD1}, {0xF4, 0xE0}, {0xF4, 0xE1}, {0xF4, 0xE2}, + {0xF4, 0xE3}, {0xF4, 0xE4}, {0xDC, 0xD3}, {0xDD, 0xC8}, {0xDD, 0xD4}, + {0xDD, 0xEA}, {0xDD, 0xFA}, {0xDE, 0xA4}, {0xDE, 0xB0}, {0xF4, 0xE5}, + {0xDE, 0xB5}, {0xDE, 0xCB}, {0xF4, 0xE6}, {0xDF, 0xB9}, {0xF4, 0xE7}, + {0xDF, 0xC3}, {0xF4, 0xE8}, {0xF4, 0xE9}, {0xE0, 0xD9}, {0xF4, 0xEA}, + {0xF4, 0xEB}, {0xE1, 0xE2}, {0xF4, 0xEC}, {0xF4, 0xED}, {0xF4, 0xEE}, + {0xE2, 0xC7}, {0xE3, 0xA8}, {0xE3, 0xA6}, {0xE3, 0xA9}, {0xE3, 0xAF}, + {0xE3, 0xB0}, {0xE3, 0xAA}, {0xE3, 0xAB}, {0xE3, 0xBC}, {0xE3, 0xC1}, + {0xE3, 0xBF}, {0xE3, 0xD5}, {0xE3, 0xD8}, {0xE3, 0xD6}, {0xE3, 0xDF}, + {0xE3, 0xE3}, {0xE3, 0xE1}, {0xE3, 0xD4}, {0xE3, 0xE9}, {0xE4, 0xA6}, + {0xE3, 0xF1}, {0xE3, 0xF2}, {0xE4, 0xCB}, {0xE4, 0xC1}, {0xE4, 0xC3}, + {0xE4, 0xBE}, {0xF4, 0xEF}, {0xE4, 0xC0}, {0xE4, 0xC7}, {0xE4, 0xBF}, + {0xE4, 0xE0}, {0xE4, 0xDE}, {0xE4, 0xD1}, {0xF4, 0xF0}, {0xE4, 0xDC}, + {0xE4, 0xD2}, {0xE4, 0xDB}, {0xE4, 0xD4}, {0xE4, 0xFA}, {0xE4, 0xEF}, + {0xE5, 0xB3}, {0xE5, 0xBF}, {0xE5, 0xC9}, {0xE5, 0xD0}, {0xE5, 0xE2}, + {0xE5, 0xEA}, {0xE5, 0xEB}, {0xF4, 0xF1}, {0xF4, 0xF2}, {0xF4, 0xF3}, + {0xE6, 0xE8}, {0xE6, 0xEF}, {0xE7, 0xAC}, {0xF4, 0xF4}, {0xE7, 0xAE}, + {0xF4, 0xF5}, {0xE7, 0xB1}, {0xF4, 0xF6}, {0xE7, 0xB2}, {0xE8, 0xB1}, + {0xE8, 0xB6}, {0xF4, 0xF7}, {0xF4, 0xF8}, {0xE8, 0xDD}, {0xF4, 0xF9}, + {0xF4, 0xFA}, {0xE9, 0xD1}, {0xF4, 0xFB}, {0xE9, 0xED}, {0xEA, 0xCD}, + {0xF4, 0xFC}, {0xEA, 0xDB}, {0xEA, 0xE6}, {0xEA, 0xEA}, {0xEB, 0xA5}, + {0xEB, 0xFB}, {0xEB, 0xFA}, {0xF4, 0xFD}, {0xEC, 0xD6}, {0xF4, 0xFE}, +}; + +#define IS_EUC_IBM2JISX0208(h, l) \ + (((h) == 0xA2 && (l) == 0xCC) || ((h) == 0xA2 && (l) == 0xE8)) + +/* EUC to SJIS IBM extended characters map (G3 JIS X 0212 block) */ +static struct { + unsigned short euc; + unsigned char sjis[2]; +} euc2sjisibm_jisx0212_map[] = { + {0xA2C3, {0xFA, 0x55}}, {0xB0A9, {0xFA, 0x68}}, {0xB0C8, {0xFA, 0x69}}, + {0xB0D2, {0xFA, 0x6B}}, {0xB0D4, {0xFA, 0x6C}}, {0xB0E3, {0xFA, 0x6D}}, + {0xB0EE, {0xFA, 0x6E}}, {0xB1A3, {0xFA, 0x70}}, {0xB1A7, {0xFA, 0x6F}}, + {0xB1A9, {0xFA, 0x72}}, {0xB1AC, {0xFA, 0x71}}, {0xB1BB, {0xFA, 0x61}}, + {0xB1BE, {0xFA, 0x73}}, {0xB1C8, {0xFA, 0x76}}, {0xB1D7, {0xFA, 0x77}}, + {0xB1D8, {0xFA, 0x75}}, {0xB1DF, {0xFA, 0x74}}, {0xB1E1, {0xFA, 0x7A}}, + {0xB1E3, {0xFA, 0x78}}, {0xB1F4, {0xFA, 0x79}}, {0xB2A3, {0xFA, 0x7B}}, + {0xB2BB, {0xFA, 0x7D}}, {0xB2E6, {0xFA, 0x7E}}, {0xB2ED, {0xFA, 0x80}}, + {0xB2F5, {0xFA, 0x81}}, {0xB2FC, {0xFA, 0x82}}, {0xB3B5, {0xFA, 0x84}}, + {0xB3D8, {0xFA, 0x85}}, {0xB3DB, {0xFA, 0x86}}, {0xB3DD, {0xFB, 0x77}}, + {0xB3E5, {0xFA, 0x87}}, {0xB3EE, {0xFA, 0x88}}, {0xB3FB, {0xFA, 0x89}}, + {0xB4C0, {0xFA, 0x8C}}, {0xB4C7, {0xFA, 0x8D}}, {0xB4D0, {0xFA, 0x8E}}, + {0xB4DE, {0xFA, 0x8F}}, {0xB5AA, {0xFA, 0x91}}, {0xB5AF, {0xFA, 0x93}}, + {0xB5C4, 
{0xFA, 0x94}}, {0xB5E8, {0xFA, 0x95}}, {0xB7C2, {0xFA, 0x97}}, + {0xB7E4, {0xFA, 0x98}}, {0xB7E7, {0xFA, 0x9A}}, {0xB7E8, {0xFA, 0x99}}, + {0xB8CE, {0xFA, 0x9E}}, {0xB8E1, {0xFA, 0x9F}}, {0xB8F5, {0xFA, 0xA0}}, + {0xB8F7, {0xFA, 0xA1}}, {0xB8F8, {0xFA, 0xA2}}, {0xB8FC, {0xFA, 0xA3}}, + {0xB9AF, {0xFA, 0xA4}}, {0xB9B7, {0xFA, 0xA5}}, {0xBABE, {0xFA, 0xA6}}, + {0xBADB, {0xFA, 0xA7}}, {0xBAE1, {0xFA, 0xA9}}, {0xBAEB, {0xFA, 0xAB}}, + {0xBBB3, {0xFA, 0xAC}}, {0xBBB8, {0xFA, 0xAD}}, {0xBBCA, {0xFA, 0xAF}}, + {0xBBD0, {0xFA, 0xB2}}, {0xBBDE, {0xFA, 0xB3}}, {0xBBF4, {0xFA, 0xB4}}, + {0xBBF5, {0xFA, 0xB5}}, {0xBBF9, {0xFA, 0xB6}}, {0xBCE4, {0xFA, 0xB7}}, + {0xBCED, {0xFA, 0xB8}}, {0xBCF4, {0xFA, 0x67}}, {0xBCFE, {0xFA, 0xB9}}, + {0xBDC2, {0xFA, 0xBB}}, {0xBDE7, {0xFA, 0xBC}}, {0xBDF0, {0xFA, 0xBE}}, + {0xBEAC, {0xFA, 0xC0}}, {0xBEB0, {0xFA, 0xBF}}, {0xBEB3, {0xFA, 0xC2}}, + {0xBEBD, {0xFA, 0xC3}}, {0xBEC9, {0xFA, 0xC5}}, {0xBECD, {0xFA, 0xC4}}, + {0xBEE4, {0xFA, 0xC6}}, {0xBFA8, {0xFA, 0xC7}}, {0xBFC9, {0xFA, 0xC8}}, + {0xC0C4, {0xFA, 0xC9}}, {0xC0E4, {0xFA, 0xCA}}, {0xC0F4, {0xFA, 0xCB}}, + {0xC1A6, {0xFA, 0xCC}}, {0xC1F5, {0xFA, 0xCE}}, {0xC1F8, {0xFA, 0xD1}}, + {0xC1FC, {0xFA, 0xCF}}, {0xC2A1, {0xFA, 0xD3}}, {0xC2A5, {0xFA, 0xD4}}, + {0xC2AB, {0xFA, 0xD2}}, {0xC2AD, {0xFA, 0x63}}, {0xC2B8, {0xFA, 0xD6}}, + {0xC2BA, {0xFA, 0xD7}}, {0xC2BF, {0xFA, 0x66}}, {0xC2C4, {0xFA, 0xD9}}, + {0xC2D2, {0xFA, 0xDA}}, {0xC2D7, {0xFA, 0xDB}}, {0xC2DB, {0xFA, 0xDC}}, + {0xC2DE, {0xFA, 0xDD}}, {0xC2ED, {0xFA, 0xDE}}, {0xC2F0, {0xFA, 0xDF}}, + {0xC3A1, {0xFA, 0xE1}}, {0xC3B5, {0xFA, 0xE2}}, {0xC3B9, {0xFA, 0xE4}}, + {0xC3C9, {0xFA, 0xE3}}, {0xC3D8, {0xFA, 0xE6}}, {0xC3FC, {0xFA, 0x64}}, + {0xC3FE, {0xFA, 0xE7}}, {0xC4CC, {0xFA, 0xE9}}, {0xC4D9, {0xFA, 0xEB}}, + {0xC4EA, {0xFA, 0xEC}}, {0xC4FD, {0xFA, 0xED}}, {0xC5A7, {0xFA, 0xEF}}, + {0xC5B5, {0xFA, 0xF0}}, {0xC5B6, {0xFA, 0xF1}}, {0xC5D5, {0xFA, 0xF3}}, + {0xC6B8, {0xFA, 0xF4}}, {0xC6D7, {0xFA, 0xF5}}, {0xC6E0, {0xFA, 0xF6}}, + {0xC6E3, {0xFA, 0xF8}}, {0xC6EA, {0xFA, 0xF7}}, {0xC7A1, {0xFA, 0xF9}}, + {0xC7AB, {0xFA, 0xFA}}, {0xC7C3, {0xFA, 0xFC}}, {0xC7C7, {0xFA, 0xFB}}, + {0xC7CB, {0xFB, 0x40}}, {0xC7CF, {0xFB, 0x41}}, {0xC7D9, {0xFB, 0x42}}, + {0xC7E6, {0xFB, 0x45}}, {0xC7EB, {0xFB, 0x48}}, {0xC7EE, {0xFB, 0x46}}, + {0xC7F0, {0xFB, 0x49}}, {0xC7FC, {0xFB, 0x47}}, {0xC8B1, {0xFB, 0x4A}}, + {0xC8E5, {0xFB, 0x4B}}, {0xC8F8, {0xFB, 0x4C}}, {0xC9A6, {0xFB, 0x4D}}, + {0xC9AB, {0xFB, 0x4E}}, {0xC9AD, {0xFB, 0x4F}}, {0xC9CA, {0xFB, 0x51}}, + {0xC9D3, {0xFB, 0x52}}, {0xC9E3, {0xFB, 0x54}}, {0xC9E9, {0xFB, 0x53}}, + {0xC9F4, {0xFB, 0x56}}, {0xC9F5, {0xFB, 0x57}}, {0xC9FC, {0xFB, 0x55}}, + {0xCAB3, {0xFB, 0x59}}, {0xCABD, {0xFB, 0x5A}}, {0xCAEF, {0xFB, 0x5B}}, + {0xCAF1, {0xFB, 0x5C}}, {0xCBAE, {0xFB, 0x5D}}, {0xCBCA, {0xFB, 0x5F}}, + {0xCBE6, {0xFB, 0x60}}, {0xCBEA, {0xFB, 0x61}}, {0xCBEE, {0xFB, 0x64}}, + {0xCBF0, {0xFB, 0x62}}, {0xCBF4, {0xFB, 0x63}}, {0xCBF9, {0xFB, 0x66}}, + {0xCCA5, {0xFB, 0x65}}, {0xCCAB, {0xFB, 0x67}}, {0xCCAD, {0xFB, 0x69}}, + {0xCCAE, {0xFB, 0x68}}, {0xCCB2, {0xFB, 0x6A}}, {0xCCC2, {0xFB, 0x6B}}, + {0xCCD0, {0xFB, 0x6C}}, {0xCCD9, {0xFB, 0x6D}}, {0xCDAA, {0xFA, 0xA8}}, + {0xCDBB, {0xFB, 0x6F}}, {0xCEBA, {0xFB, 0x73}}, {0xCEBB, {0xFB, 0x71}}, + {0xCEC3, {0xFB, 0x74}}, {0xCEF2, {0xFB, 0x76}}, {0xCFD5, {0xFB, 0x78}}, + {0xCFE2, {0xFB, 0x79}}, {0xCFE9, {0xFB, 0x7A}}, {0xCFED, {0xFB, 0x7B}}, + {0xD0E5, {0xFB, 0x81}}, {0xD0E9, {0xFB, 0x83}}, {0xD1E8, {0xFB, 0x84}}, + {0xD1EC, {0xFB, 0x87}}, {0xD2BB, {0xFB, 0x88}}, {0xD3E1, {0xFB, 0x8A}}, + {0xD3E8, 
{0xFB, 0x8B}}, {0xD4A7, {0xFB, 0x8C}}, {0xD4D4, {0xFB, 0x8F}}, + {0xD4E3, {0xFA, 0x5C}}, {0xD4F2, {0xFB, 0x90}}, {0xD5AE, {0xFB, 0x91}}, + {0xD7DE, {0xFB, 0x93}}, {0xD8A2, {0xFB, 0x95}}, {0xD8B7, {0xFB, 0x96}}, + {0xD8C1, {0xFB, 0x97}}, {0xD8D1, {0xFB, 0x98}}, {0xD8F4, {0xFB, 0x99}}, + {0xD9A1, {0xFA, 0x60}}, {0xD9C6, {0xFB, 0x9A}}, {0xD9C8, {0xFB, 0x9B}}, + {0xD9D1, {0xFB, 0x9C}}, {0xDCD3, {0xFB, 0xA2}}, {0xDCDF, {0xFA, 0x5D}}, + {0xDDC8, {0xFB, 0xA3}}, {0xDDD4, {0xFB, 0xA4}}, {0xDDEA, {0xFB, 0xA5}}, + {0xDDFA, {0xFB, 0xA6}}, {0xDEA4, {0xFB, 0xA7}}, {0xDEB0, {0xFB, 0xA8}}, + {0xDEB5, {0xFB, 0xAA}}, {0xDECB, {0xFB, 0xAB}}, {0xDFB9, {0xFB, 0xAD}}, + {0xDFC3, {0xFB, 0xAF}}, {0xE0D9, {0xFB, 0xB2}}, {0xE1E2, {0xFB, 0xB5}}, + {0xE2C7, {0xFB, 0xB9}}, {0xE3A6, {0xFB, 0xBB}}, {0xE3A8, {0xFB, 0xBA}}, + {0xE3A9, {0xFB, 0xBC}}, {0xE3AA, {0xFB, 0xBF}}, {0xE3AB, {0xFB, 0xC0}}, + {0xE3AF, {0xFB, 0xBD}}, {0xE3B0, {0xFB, 0xBE}}, {0xE3BC, {0xFB, 0xC1}}, + {0xE3BF, {0xFB, 0xC3}}, {0xE3C1, {0xFB, 0xC2}}, {0xE3D4, {0xFB, 0xCA}}, + {0xE3D5, {0xFB, 0xC4}}, {0xE3D6, {0xFB, 0xC6}}, {0xE3D8, {0xFB, 0xC5}}, + {0xE3DF, {0xFB, 0xC7}}, {0xE3E1, {0xFB, 0xC9}}, {0xE3E3, {0xFB, 0xC8}}, + {0xE3E9, {0xFB, 0xCB}}, {0xE3F1, {0xFB, 0xCD}}, {0xE3F2, {0xFB, 0xCE}}, + {0xE3F8, {0xFA, 0x5F}}, {0xE4A6, {0xFB, 0xCC}}, {0xE4BE, {0xFB, 0xD2}}, + {0xE4BF, {0xFB, 0xD6}}, {0xE4C0, {0xFB, 0xD4}}, {0xE4C1, {0xFB, 0xD0}}, + {0xE4C3, {0xFB, 0xD1}}, {0xE4C7, {0xFB, 0xD5}}, {0xE4CB, {0xFB, 0xCF}}, + {0xE4D0, {0xFA, 0x65}}, {0xE4D1, {0xFB, 0xD9}}, {0xE4D2, {0xFB, 0xDC}}, + {0xE4D4, {0xFB, 0xDE}}, {0xE4DB, {0xFB, 0xDD}}, {0xE4DC, {0xFB, 0xDB}}, + {0xE4DE, {0xFB, 0xD8}}, {0xE4E0, {0xFB, 0xD7}}, {0xE4E9, {0xFA, 0x5E}}, + {0xE4EF, {0xFB, 0xE0}}, {0xE4FA, {0xFB, 0xDF}}, {0xE5B3, {0xFB, 0xE1}}, + {0xE5BF, {0xFB, 0xE2}}, {0xE5C9, {0xFB, 0xE3}}, {0xE5D0, {0xFB, 0xE4}}, + {0xE5E2, {0xFB, 0xE5}}, {0xE5EA, {0xFB, 0xE6}}, {0xE5EB, {0xFB, 0xE7}}, + {0xE6E8, {0xFB, 0xEB}}, {0xE6EF, {0xFB, 0xEC}}, {0xE7AC, {0xFB, 0xED}}, + {0xE7AE, {0xFB, 0xEF}}, {0xE7B1, {0xFB, 0xF1}}, {0xE7B2, {0xFB, 0xF3}}, + {0xE8B1, {0xFB, 0xF4}}, {0xE8B6, {0xFB, 0xF5}}, {0xE8DD, {0xFB, 0xF8}}, + {0xE9D1, {0xFB, 0xFB}}, {0xE9ED, {0xFC, 0x40}}, {0xEACD, {0xFC, 0x41}}, + {0xEADB, {0xFC, 0x43}}, {0xEAE6, {0xFC, 0x44}}, {0xEAEA, {0xFC, 0x45}}, + {0xEBA5, {0xFC, 0x46}}, {0xEBFA, {0xFC, 0x48}}, {0xEBFB, {0xFC, 0x47}}, + {0xECD6, {0xFC, 0x4A}}, +}; + +/* EUC to SJIS IBM extended characters map (G3 Upper block) */ +static const unsigned char euc2sjisibm_g3upper_map[][2] = { + {0xFA, 0x40}, {0xFA, 0x41}, {0xFA, 0x42}, {0xFA, 0x43}, {0xFA, 0x44}, + {0xFA, 0x45}, {0xFA, 0x46}, {0xFA, 0x47}, {0xFA, 0x48}, {0xFA, 0x49}, + {0xFA, 0x4A}, {0xFA, 0x4B}, {0xFA, 0x4C}, {0xFA, 0x4D}, {0xFA, 0x4E}, + {0xFA, 0x4F}, {0xFA, 0x50}, {0xFA, 0x51}, {0xFA, 0x52}, {0xFA, 0x53}, + {0xFA, 0x56}, {0xFA, 0x57}, {0xFA, 0x58}, {0xFA, 0x59}, {0xFA, 0x5A}, + {0xFA, 0x62}, {0xFA, 0x6A}, {0xFA, 0x7C}, {0xFA, 0x83}, {0xFA, 0x8A}, + {0xFA, 0x8B}, {0xFA, 0x90}, {0xFA, 0x92}, {0xFA, 0x96}, {0xFA, 0x9B}, + {0xFA, 0x9C}, {0xFA, 0x9D}, {0xFA, 0xAA}, {0xFA, 0xAE}, {0xFA, 0xB0}, + {0xFA, 0xB1}, {0xFA, 0xBA}, {0xFA, 0xBD}, {0xFA, 0xC1}, {0xFA, 0xCD}, + {0xFA, 0xD0}, {0xFA, 0xD5}, {0xFA, 0xD8}, {0xFA, 0xE0}, {0xFA, 0xE5}, + {0xFA, 0xE8}, {0xFA, 0xEA}, {0xFA, 0xEE}, {0xFA, 0xF2}, {0xFB, 0x43}, + {0xFB, 0x44}, {0xFB, 0x50}, {0xFB, 0x58}, {0xFB, 0x5E}, {0xFB, 0x6E}, + {0xFB, 0x70}, {0xFB, 0x72}, {0xFB, 0x75}, {0xFB, 0x7C}, {0xFB, 0x7D}, + {0xFB, 0x7E}, {0xFB, 0x80}, {0xFB, 0x82}, {0xFB, 0x85}, {0xFB, 0x86}, + {0xFB, 0x89}, {0xFB, 0x8D}, 
{0xFB, 0x8E}, {0xFB, 0x92}, {0xFB, 0x94}, + {0xFB, 0x9D}, {0xFB, 0x9E}, {0xFB, 0x9F}, {0xFB, 0xA0}, {0xFB, 0xA1}, + {0xFB, 0xA9}, {0xFB, 0xAC}, {0xFB, 0xAE}, {0xFB, 0xB0}, {0xFB, 0xB1}, + {0xFB, 0xB3}, {0xFB, 0xB4}, {0xFB, 0xB6}, {0xFB, 0xB7}, {0xFB, 0xB8}, + {0xFB, 0xD3}, {0xFB, 0xDA}, {0xFB, 0xE8}, {0xFB, 0xE9}, {0xFB, 0xEA}, + {0xFB, 0xEE}, {0xFB, 0xF0}, {0xFB, 0xF2}, {0xFB, 0xF6}, {0xFB, 0xF7}, + {0xFB, 0xF9}, {0xFB, 0xFA}, {0xFB, 0xFC}, {0xFC, 0x42}, {0xFC, 0x49}, + {0xFC, 0x4B}, +}; + +static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, + const unsigned char sjis_lo); +static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo); +static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo); +static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo); +static inline int sjisnec2sjisibm(unsigned char *sjisibm, + const unsigned char sjisnec_hi, + const unsigned char sjisnec_lo); + +/* SJIS IBM extended characters to EUC */ +static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, + const unsigned char sjis_lo) +{ + int index; + + index = ((sjis_hi - 0xFA) * (0xFD - 0x40)) + (sjis_lo - 0x40); + if (IS_EUC_IBM2JISX0208(sjisibm2euc_map[index][0], + sjisibm2euc_map[index][1])) { + euc[0] = sjisibm2euc_map[index][0]; + euc[1] = sjisibm2euc_map[index][1]; + return 2; + } else { + euc[0] = SS3; + euc[1] = sjisibm2euc_map[index][0]; + euc[2] = sjisibm2euc_map[index][1]; + return 3; + } +} + +/* EUC to SJIS IBM extended characters (G3 JIS X 0212 block) */ +static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo) +{ + int index, min_index, max_index; + unsigned short euc; + + min_index = 0; + max_index = ARRAY_SIZE(euc2sjisibm_jisx0212_map) - 1; + euc = (euc_hi << 8) | euc_lo; + + while (min_index <= max_index) { + index = (min_index + max_index) / 2; + if (euc < euc2sjisibm_jisx0212_map[index].euc) + max_index = index - 1; + else + min_index = index + 1; + if (euc == euc2sjisibm_jisx0212_map[index].euc) { + sjis[0] = euc2sjisibm_jisx0212_map[index].sjis[0]; + sjis[1] = euc2sjisibm_jisx0212_map[index].sjis[1]; + return 3; + } + } + return 0; +} + +/* EUC to SJIS IBM extended characters (G3 Upper block) */ +static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo) +{ + int index; + + if (euc_hi == 0xF3) + index = ((euc_hi << 8) | euc_lo) - 0xF3F3; + else + index = ((euc_hi << 8) | euc_lo) - 0xF4A1 + 12; + + if ((index < 0) || (index >= ARRAY_SIZE(euc2sjisibm_g3upper_map))) + return 0; + + sjis[0] = euc2sjisibm_g3upper_map[index][0]; + sjis[1] = euc2sjisibm_g3upper_map[index][1]; + + return 3; +} + +/* EUC to SJIS IBM extended characters (G3 block) */ +static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, + const unsigned char euc_lo) +{ + int n; + +#if 0 + if ((euc_hi == 0xA2) && (euc_lo == 0xCC)) { + sjis[0] = 0xFA; + sjis[1] = 0x54; + return 2; + } else if ((euc_hi == 0xA2) && (euc_lo == 0xE8)) { + sjis[0] = 0xFA; + sjis[1] = 0x5B; + return 2; + } +#endif + if ((n = euc2sjisibm_g3upper(sjis, euc_hi, euc_lo))) { + return n; + } else if ((n = euc2sjisibm_jisx0212(sjis, euc_hi, euc_lo))) { + return n; + } + + return 0; +} + +/* NEC/IBM extended characters to IBM extended characters */ +static inline int sjisnec2sjisibm(unsigned char *sjisibm, + 
const unsigned char sjisnec_hi, + const unsigned char sjisnec_lo) +{ + int count; + + if (! IS_SJIS_NECIBM(sjisnec_hi, sjisnec_lo)) + return 0; + + if ((sjisnec_hi == 0xEE) && (sjisnec_lo == 0xF9)) { + sjisibm[0] = 0x81; + sjisibm[1] = 0xCA; + return 2; + } + + if ((sjisnec_hi == 0xEE) && (sjisnec_lo >= 0xEF)) { + count = (sjisnec_hi << 8 | sjisnec_lo) + - (sjisnec_lo <= 0xF9 ? 0xEEEF : (0xEEEF - 10)); + } else { + count = (sjisnec_hi - 0xED) * (0xFC - 0x40) + + (sjisnec_lo - 0x40) + (0x5C - 0x40); + if (sjisnec_lo >= 0x7F) + count--; + } + + sjisibm[0] = 0xFA + (count / (0xFC - 0x40)); + sjisibm[1] = 0x40 + (count % (0xFC - 0x40)); + if (sjisibm[1] >= 0x7F) + sjisibm[1]++; + + return 2; +} + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + int n; + + if (!p_nls) + return -EINVAL; + if ((n = p_nls->uni2char(uni, out, boundlen)) < 0) + return n; + + /* translate SJIS into EUC-JP */ + if (n == 1) { + if (IS_SJIS_JISX0201KANA(out[0])) { + /* JIS X 0201 KANA */ + if (boundlen < 2) + return -ENAMETOOLONG; + + out[1] = out[0]; + out[0] = SS2; + return 2; + } + } else if (n == 2) { + /* NEC/IBM extended characters to IBM extended characters */ + sjisnec2sjisibm(out, out[0], out[1]); + + if (IS_SJIS_UDC_LOW(out[0], out[1])) { + /* User defined characters half low */ + MAP_SJIS2EUC(out[0], out[1], 0xF0, out[0], out[1], 0xF5); + } else if (IS_SJIS_UDC_HI(out[0], out[1])) { + /* User defined characters half high */ + unsigned char ch, cl; + + if (boundlen < 3) + return -ENAMETOOLONG; + + n = 3; ch = out[0]; cl = out[1]; + out[0] = SS3; + MAP_SJIS2EUC(ch, cl, 0xF5, out[1], out[2], 0xF5); + } else if (IS_SJIS_IBM(out[0], out[1])) { + /* IBM extended characters */ + unsigned char euc[3], i; + + n = sjisibm2euc(euc, out[0], out[1]); + if (boundlen < n) + return -ENAMETOOLONG; + for (i = 0; i < n; i++) + out[i] = euc[i]; + } else if (IS_SJIS_JISX0208(out[0], out[1])) { + /* JIS X 0208 (include NEC special characters) */ + out[0] = (out[0]^0xA0)*2 + 0x5F; + if (out[1] > 0x9E) + out[0]++; + + if (out[1] < 0x7F) + out[1] = out[1] + 0x61; + else if (out[1] < 0x9F) + out[1] = out[1] + 0x60; + else + out[1] = out[1] + 0x02; + } else { + /* Invalid characters */ + return -EINVAL; + } + } + else + return -EINVAL; + + return n; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + unsigned char sjis_temp[2]; + int euc_offset, n; + + if ( !p_nls ) + return -EINVAL; + if (boundlen <= 0) + return -ENAMETOOLONG; + + /* translate EUC-JP into SJIS */ + if (rawstring[0] > 0x7F) { + if (rawstring[0] == SS3) { + if (boundlen < 3) + return -EINVAL; + euc_offset = 3; + + if (IS_EUC_UDC_HI(rawstring[1], rawstring[2])) { + /* User defined characters half high */ + MAP_EUC2SJIS(rawstring[1], rawstring[2], 0xF5, + sjis_temp[0], sjis_temp[1], 0xF5); + } else if (euc2sjisibm(sjis_temp,rawstring[1],rawstring[2])) { + /* IBM extended characters */ + } else { + /* JIS X 0212 and Invalid characters*/ + return -EINVAL; + + /* 'GETA' with SJIS coding */ + /* sjis_temp[0] = 0x81; */ + /* sjis_temp[1] = 0xAC; */ + } + } else { + if (boundlen < 2) + return -EINVAL; + euc_offset = 2; + + if (IS_EUC_JISX0201KANA(rawstring[0], rawstring[1])) { + /* JIS X 0201 KANA */ + sjis_temp[0] = rawstring[1]; + sjis_temp[1] = 0x00; + } else if (IS_EUC_UDC_LOW(rawstring[0], rawstring[1])) { + /* User defined characters half low */ + MAP_EUC2SJIS(rawstring[0], rawstring[1], 0xF5, + sjis_temp[0], sjis_temp[1], 0xF0); + } else if (IS_EUC_JISX0208(rawstring[0], rawstring[1])) { + /* JIS X 
0208 (include NEC spesial characters) */ + sjis_temp[0] = ((rawstring[0]-0x5f)/2) ^ 0xA0; + if (!(rawstring[0] & 1)) + sjis_temp[1] = rawstring[1] - 0x02; + else if (rawstring[1] < 0xE0) + sjis_temp[1] = rawstring[1] - 0x61; + else + sjis_temp[1] = rawstring[1] - 0x60; + } else { + /* Invalid characters */ + return -EINVAL; + } + } + } else { + euc_offset = 1; + + /* JIS X 0201 ROMAJI */ + sjis_temp[0] = rawstring[0]; + sjis_temp[1] = 0x00; + } + + if ( (n = p_nls->char2uni(sjis_temp, sizeof(sjis_temp), uni)) < 0) + return n; + + return euc_offset; +} + +static struct nls_table table = { + .charset = "euc-jp", + .uni2char = uni2char, + .char2uni = char2uni, + .owner = THIS_MODULE, +}; + +static int __init init_nls_euc_jp(void) +{ + p_nls = load_nls("cp932"); + + if (p_nls) { + table.charset2upper = p_nls->charset2upper; + table.charset2lower = p_nls->charset2lower; + return register_nls(&table); + } + + return -EINVAL; +} + +static void __exit exit_nls_euc_jp(void) +{ + unregister_nls(&table); + unload_nls(p_nls); +} + +module_init(init_nls_euc_jp) +module_exit(exit_nls_euc_jp) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c new file mode 100644 index 00000000..7b951bb5 --- /dev/null +++ b/fs/nls/nls_iso8859-1.c @@ -0,0 +1,258 @@ +/* + * linux/fs/nls/nls_iso8859-1.c + * + * Charset iso8859-1 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 
0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x00d0, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x00de, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x00f0, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x00fe, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 
0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xf8-0xff */ +}; 
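/*
 * A minimal stand-alone sketch of the two-level page lookup that the NLS
 * uni2char() helpers in this patch rely on: the high byte of a Unicode
 * code point selects a 256-entry page from page_uni2charset[], the low
 * byte indexes into that page, and a 0x00 entry (or a missing page) means
 * the code point has no mapping in the charset.  For the double-byte
 * charsets (e.g. cp950 above) each page entry holds two output bytes
 * instead of one.  Everything named demo_* below is purely illustrative
 * and is not part of the charset tables in this patch; the block is a
 * user-space demo, not module code.
 */
#if 0	/* illustrative user-space sketch, never built with the module */
#include <stdio.h>

/* Tiny hypothetical page: identity-map just 'A' and 'B' for the demo. */
static const unsigned char demo_page00[256] = {
	['A'] = 'A', ['B'] = 'B',
};

/* Page directory: only page 0x00 exists, all other pages are unmapped. */
static const unsigned char *const demo_pages[256] = { demo_page00, };

/* Same shape as uni2char() below, with kernel error codes replaced by -1. */
static int demo_uni2char(unsigned int uni, unsigned char *out)
{
	const unsigned char *page = demo_pages[(uni >> 8) & 0xFF];
	unsigned char cl = uni & 0xFF;

	if (!page || !page[cl])
		return -1;	/* no mapping (the real helper returns -EINVAL) */
	*out = page[cl];
	return 1;		/* number of output bytes produced */
}

int main(void)
{
	unsigned char c;

	if (demo_uni2char(0x0041, &c) == 1)	/* U+0041 LATIN CAPITAL LETTER A */
		printf("U+0041 -> 0x%02X\n", c);
	if (demo_uni2char(0x20AC, &c) < 0)	/* U+20AC EURO SIGN: page 0x20 missing */
		printf("U+20AC has no mapping in this demo charset\n");
	return 0;
}
#endif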
+ +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-1", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_1(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_1(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_1) +module_exit(exit_nls_iso8859_1) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c new file mode 100644 index 00000000..c4d52ea9 --- /dev/null +++ b/fs/nls/nls_iso8859-13.c @@ -0,0 +1,286 @@ +/* + * linux/fs/nls/nls_iso8859-13.c + * + * Charset iso8859-13 translation tables. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x201d, 0x00a2, 0x00a3, + 0x00a4, 0x201e, 0x00a6, 0x00a7, + 0x00d8, 0x00a9, 0x0156, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00c6, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x201c, 0x00b5, 0x00b6, 0x00b7, + 0x00f8, 0x00b9, 0x0157, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00e6, + /* 0xc0*/ + 0x0104, 0x012e, 0x0100, 0x0106, + 0x00c4, 0x00c5, 0x0118, 0x0112, + 0x010c, 0x00c9, 0x0179, 0x0116, + 0x0122, 0x0136, 0x012a, 0x013b, + /* 0xd0*/ + 0x0160, 0x0143, 0x0145, 0x00d3, + 0x014c, 0x00d5, 0x00d6, 0x00d7, + 0x0172, 0x0141, 0x015a, 0x016a, + 0x00dc, 0x017b, 0x017d, 0x00df, + /* 0xe0*/ + 0x0105, 0x012f, 0x0101, 
0x0107, + 0x00e4, 0x00e5, 0x0119, 0x0113, + 0x010d, 0x00e9, 0x017a, 0x0117, + 0x0123, 0x0137, 0x012b, 0x013c, + /* 0xf0*/ + 0x0161, 0x0144, 0x0146, 0x00f3, + 0x014d, 0x00f5, 0x00f6, 0x00f7, + 0x0173, 0x0142, 0x015b, 0x016b, + 0x00fc, 0x017c, 0x017e, 0x2019, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0xc4, 0xc5, 0xaf, 0x00, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xd3, 0x00, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xa8, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0xe4, 0xe5, 0xbf, 0x00, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xf3, 0x00, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xb8, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0xc2, 0xe2, 0x00, 0x00, 0xc0, 0xe0, 0xc3, 0xe3, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0xc7, 0xe7, 0x00, 0x00, 0xcb, 0xeb, /* 0x10-0x17 */ + 0xc6, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0xce, 0xee, 0x00, 0x00, 0xc1, 0xe1, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xcd, 0xed, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0xcf, 0xef, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0xd9, 0xf9, 0xd1, 0xf1, 0xd2, 0xf2, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0xd4, 0xf4, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0xba, /* 0x50-0x57 */ + 0x00, 0x00, 0xda, 0xfa, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0xd8, 0xf8, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0xca, 0xea, 0xdd, 0xfd, 0xde, 0xfe, 0x00, /* 0x78-0x7f */ +}; + +static const 
unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0xff, 0x00, 0x00, 0xb4, 0xa1, 0xa5, 0x00, /* 0x18-0x1f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 
/* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-13", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_13(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_13(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_13) +module_exit(exit_nls_iso8859_13) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c new file mode 100644 index 00000000..dc02600c --- /dev/null +++ b/fs/nls/nls_iso8859-14.c @@ -0,0 +1,342 @@ +/* + * linux/fs/nls/nls_iso8859-14.c + * + * Charset iso8859-14 translation tables. + * + * Generated automatically from the Unicode and charset table + * provided by the Unicode Organisation at + * http://www.unicode.org/ + * The Unicode to charset table has only exact mappings. 
+ * + * Rhys Jones, Swansea University Computer Society + * rhys@sucs.swan.ac.uk + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x1e02, 0x1e03, 0x00a3, + 0x010a, 0x010b, 0x1e0a, 0x00a7, + 0x1e80, 0x00a9, 0x1e82, 0x1e0b, + 0x1ef2, 0x00ad, 0x00ae, 0x0178, + /* 0xb0*/ + 0x1e1e, 0x1e1f, 0x0120, 0x0121, + 0x1e40, 0x1e41, 0x00b6, 0x1e56, + 0x1e81, 0x1e57, 0x1e83, 0x1e60, + 0x1ef3, 0x1e84, 0x1e85, 0x1e61, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x0174, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x1e6a, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x0176, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x0175, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x1e6b, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x0177, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 
0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0x00, 0x00, 0x00, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb6, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0x00, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0x00, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0x00, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xa1, 0xa2, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0xa6, 0xab, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0xb1, /* 0x18-0x1f */ + 0xb2, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xb4, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, 0xb9, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xbb, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xd7, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, 0xde, 0xfe, /* 0x70-0x77 */ + 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xa8, 0xb8, 0xaa, 0xba, 0xbd, 0xbe, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page1e[256] = { + 0x00, 0x00, 0xa1, 0xa2, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0xa6, 0xab, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0xb1, /* 0x18-0x1f */ + 0xb2, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xb4, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, 0xb9, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xbb, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0xd7, 0xf7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, 0xde, 0xfe, /* 0x70-0x77 */ + 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0xa8, 0xb8, 0xaa, 0xba, 0xbd, 0xbe, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, page1e, NULL, + + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 
0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa2, 0xa2, 0xa3, 0xab, 0xab, 0xab, 0xa7, /* 0xa0-0xa7 */ + 0xb8, 0xa9, 0xba, 0xab, 0xbc, 0xad, 0xae, 0xff, /* 0xa8-0xaf */ + 0xb1, 0xb1, 0xb3, 0xb3, 0xb5, 0xb5, 0xb6, 0xb9, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbf, 0xbc, 0xbe, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa1, 0xa3, 0xa6, 0xa6, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xa6, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb0, 0xb2, 0xb2, 0xb4, 0xb4, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xa8, 0xb7, 0xaa, 0xbb, 0xac, 0xbd, 0xbd, 0xbb, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xaf, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int 
boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-14", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_14(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_14(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_14) +module_exit(exit_nls_iso8859_14) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c new file mode 100644 index 00000000..3c7dfc83 --- /dev/null +++ b/fs/nls/nls_iso8859-15.c @@ -0,0 +1,308 @@ +/* + * linux/fs/nls/nls_iso8859-15.c + * + * Charset iso8859-15 translation tables. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x20ac, 0x00a5, 0x0160, 0x00a7, + 0x0161, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x017d, 0x00b5, 0x00b6, 0x00b7, + 0x017e, 0x00b9, 0x00ba, 0x00bb, + 0x0152, 0x0153, 0x0178, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x00d0, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x00dd, 0x00de, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x00f0, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x00fd, 0x00fe, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 
0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0xa5, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0x00, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0xb9, 0xba, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0xbc, 0xbd, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xa6, 0xa8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0xbe, 0x00, 0x00, 0x00, 0x00, 0xb4, 0xb8, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb8, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbd, 0xbd, 0xff, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 
0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa6, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb4, 0xb9, 0xba, 0xbb, 0xbc, 0xbc, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xbe, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-15", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_15(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_15(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_15) +module_exit(exit_nls_iso8859_15) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c new file mode 100644 index 00000000..a2d2197e --- /dev/null +++ b/fs/nls/nls_iso8859-2.c @@ -0,0 +1,309 @@ +/* + * linux/fs/nls/nls_iso8859-2.c + * + * Charset iso8859-2 translation tables. 
+ * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0104, 0x02d8, 0x0141, + 0x00a4, 0x013d, 0x015a, 0x00a7, + 0x00a8, 0x0160, 0x015e, 0x0164, + 0x0179, 0x00ad, 0x017d, 0x017b, + /* 0xb0*/ + 0x00b0, 0x0105, 0x02db, 0x0142, + 0x00b4, 0x013e, 0x015b, 0x02c7, + 0x00b8, 0x0161, 0x015f, 0x0165, + 0x017a, 0x02dd, 0x017e, 0x017c, + /* 0xc0*/ + 0x0154, 0x00c1, 0x00c2, 0x0102, + 0x00c4, 0x0139, 0x0106, 0x00c7, + 0x010c, 0x00c9, 0x0118, 0x00cb, + 0x011a, 0x00cd, 0x00ce, 0x010e, + /* 0xd0*/ + 0x0110, 0x0143, 0x0147, 0x00d3, + 0x00d4, 0x0150, 0x00d6, 0x00d7, + 0x0158, 0x016e, 0x00da, 0x0170, + 0x00dc, 0x00dd, 0x0162, 0x00df, + /* 0xe0*/ + 0x0155, 0x00e1, 0x00e2, 0x0103, + 0x00e4, 0x013a, 0x0107, 0x00e7, + 0x010d, 0x00e9, 0x0119, 0x00eb, + 0x011b, 0x00ed, 0x00ee, 0x010f, + /* 0xf0*/ + 0x0111, 0x0144, 0x0148, 0x00f3, + 0x00f4, 0x0151, 0x00f6, 0x00f7, + 0x0159, 0x016f, 0x00fa, 0x0171, + 0x00fc, 0x00fd, 0x0163, 0x02d9, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0xc3, 0xe3, 0xa1, 0xb1, 0xc6, 0xe6, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */ + 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0xc5, 0xe5, 0x00, 0x00, 0xa5, 0xb5, 0x00, /* 0x38-0x3f */ + 0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */ + 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */ + 0xd8, 0xf8, 0xa6, 0xb6, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */ + 0xa9, 0xb9, 0xde, 0xfe, 0xab, 0xbb, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */ + 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0xac, 0xbc, 0xaf, 0xbf, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 
0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-2", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_2(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_2(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_2) +module_exit(exit_nls_iso8859_2) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c new file mode 100644 index 00000000..a61e0daa --- /dev/null +++ b/fs/nls/nls_iso8859-3.c @@ -0,0 +1,309 @@ +/* + * linux/fs/nls/nls_iso8859-3.c + * + * Charset iso8859-3 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0126, 0x02d8, 0x00a3, + 0x00a4, 0x0000, 0x0124, 0x00a7, + 0x00a8, 0x0130, 0x015e, 0x011e, + 0x0134, 0x00ad, 0x0000, 0x017b, + /* 0xb0*/ + 0x00b0, 0x0127, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x0125, 0x00b7, + 0x00b8, 0x0131, 0x015f, 0x011f, + 0x0135, 0x00bd, 0x0000, 0x017c, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x0000, + 0x00c4, 0x010a, 0x0108, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x0000, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x0120, 0x00d6, 0x00d7, + 0x011c, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x016c, 0x015c, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x0000, + 0x00e4, 0x010b, 0x0109, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x0000, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x0121, 0x00f6, 0x00f7, + 0x011d, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x016d, 0x015d, 0x02d9, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0xa3, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0xb0, 0x00, 0xb2, 0xb3, 0xb4, 0xb5, 0x00, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0x00, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0x00, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0xc6, 0xe6, 0xc5, 0xe5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0xd8, 0xf8, 0xab, 0xbb, /* 0x18-0x1f */ + 0xd5, 0xf5, 0x00, 0x00, 0xa6, 0xb6, 0xa1, 0xb1, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xa9, 0xb9, 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xaa, 0xba, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0xdd, 0xfd, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0xaf, 0xbf, 0x00, 0x00, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0xa2, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xa3, 0xa4, 0x00, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x69, 0xba, 0xbb, 0xbc, 0xad, 0x00, 0xbf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0x00, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 
0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xb3, 0xb4, 0x00, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0x49, 0xaa, 0xab, 0xac, 0xbd, 0x00, 0xaf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-3", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_3(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_3(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_3) +module_exit(exit_nls_iso8859_3) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c new file mode 100644 index 00000000..e8ff5554 --- /dev/null +++ b/fs/nls/nls_iso8859-4.c @@ -0,0 +1,309 @@ +/* + * linux/fs/nls/nls_iso8859-4.c + * + * Charset iso8859-4 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0104, 0x0138, 0x0156, + 0x00a4, 0x0128, 0x013b, 0x00a7, + 0x00a8, 0x0160, 0x0112, 0x0122, + 0x0166, 0x00ad, 0x017d, 0x00af, + /* 0xb0*/ + 0x00b0, 0x0105, 0x02db, 0x0157, + 0x00b4, 0x0129, 0x013c, 0x02c7, + 0x00b8, 0x0161, 0x0113, 0x0123, + 0x0167, 0x014a, 0x017e, 0x014b, + /* 0xc0*/ + 0x0100, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x012e, + 0x010c, 0x00c9, 0x0118, 0x00cb, + 0x0116, 0x00cd, 0x00ce, 0x012a, + /* 0xd0*/ + 0x0110, 0x0145, 0x014c, 0x0136, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x0172, 0x00da, 0x00db, + 0x00dc, 0x0168, 0x016a, 0x00df, + /* 0xe0*/ + 0x0101, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x012f, + 0x010d, 0x00e9, 0x0119, 0x00eb, + 0x0117, 0x00ed, 0x00ee, 0x012b, + /* 0xf0*/ + 0x0111, 0x0146, 0x014d, 0x0137, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x0173, 0x00fa, 0x00fb, + 0x00fc, 0x0169, 0x016b, 0x02d9, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0x00, /* 0xc0-0xc7 */ + 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0x00, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0x00, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0x00, /* 0xe0-0xe7 */ + 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0x00, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0xc0, 0xe0, 0x00, 0x00, 0xa1, 0xb1, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ + 0xd0, 0xf0, 0xaa, 0xba, 0x00, 0x00, 0xcc, 0xec, /* 0x10-0x17 */ + 0xca, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0xab, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0xa5, 0xb5, 0xcf, 0xef, 0x00, 0x00, 0xc7, 0xe7, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd3, 0xf3, /* 0x30-0x37 */ + 0xa2, 0x00, 0x00, 0xa6, 0xb6, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xd1, 0xf1, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0xbd, 0xbf, 0xd2, 0xf2, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa3, 0xb3, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0xa9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0xac, 0xbc, /* 0x60-0x67 */ + 0xdd, 0xfd, 0xde, 0xfe, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0xd9, 0xf9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f 
*/ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0xff, 0x00, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, page02, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 
0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-4", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_4(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_4(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_4) +module_exit(exit_nls_iso8859_4) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c new file mode 100644 index 00000000..4721e893 --- /dev/null +++ b/fs/nls/nls_iso8859-5.c @@ -0,0 +1,273 @@ +/* + * linux/fs/nls/nls_iso8859-5.c + * + * Charset iso8859-5 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0401, 0x0402, 0x0403, + 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, + 0x040c, 0x00ad, 0x040e, 0x040f, + /* 0xb0*/ + 0x0410, 0x0411, 0x0412, 0x0413, + 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, + 0x041c, 0x041d, 0x041e, 0x041f, + /* 0xc0*/ + 0x0420, 0x0421, 0x0422, 0x0423, + 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, + 0x042c, 0x042d, 0x042e, 0x042f, + /* 0xd0*/ + 0x0430, 0x0431, 0x0432, 0x0433, + 0x0434, 0x0435, 0x0436, 0x0437, + 0x0438, 0x0439, 0x043a, 0x043b, + 0x043c, 0x043d, 0x043e, 0x043f, + /* 0xe0*/ + 0x0440, 0x0441, 0x0442, 0x0443, + 0x0444, 0x0445, 0x0446, 0x0447, + 0x0448, 0x0449, 0x044a, 0x044b, + 0x044c, 0x044d, 0x044e, 0x044f, + /* 0xf0*/ + 0x2116, 0x0451, 0x0452, 0x0453, + 0x0454, 0x0455, 0x0456, 0x0457, + 0x0458, 0x0459, 0x045a, 0x045b, + 0x045c, 0x00a7, 0x045e, 0x045f, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, /* 0x08-0x0f */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0xfe, 0xff, /* 0x58-0x5f */ +}; + +static const unsigned char page21[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, page21, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xa0-0xa7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xad, 0xfe, 0xff, /* 0xa8-0xaf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xb0-0xb7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 
0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xd0-0xd7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xf0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xf0-0xf7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xfd, 0xae, 0xaf, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-5", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_5(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_5(void) +{ + unregister_nls(&table); +} + 
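+/*
+ * Minimal usage sketch (assuming the kernel's standard load_nls()/
+ * unload_nls() helpers and the nls_table ops defined above; "name",
+ * "len" and "i" stand for a hypothetical caller's byte buffer and
+ * offset): a filesystem looks the table up by the "iso8859-5" name
+ * registered by this module and converts one character at a time, e.g.
+ *
+ *	struct nls_table *nls = load_nls("iso8859-5");
+ *	unsigned char out[NLS_MAX_CHARSET_SIZE];
+ *	wchar_t uc;
+ *
+ *	if (nls) {
+ *		if (nls->char2uni(&name[i], len - i, &uc) > 0)
+ *			nls->uni2char(uc, out, sizeof(out));
+ *		unload_nls(nls);
+ *	}
+ */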
+module_init(init_nls_iso8859_5) +module_exit(exit_nls_iso8859_5) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c new file mode 100644 index 00000000..01a517d6 --- /dev/null +++ b/fs/nls/nls_iso8859-6.c @@ -0,0 +1,264 @@ +/* + * linux/fs/nls/nls_iso8859-6.c + * + * Charset iso8859-6 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0660, 0x0661, 0x0662, 0x0663, + 0x0664, 0x0665, 0x0666, 0x0667, + 0x0668, 0x0669, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x0000, 0x0000, 0x0000, + 0x00a4, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x060c, 0x00ad, 0x0000, 0x0000, + /* 0xb0*/ + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x061b, + 0x0000, 0x0000, 0x0000, 0x061f, + /* 0xc0*/ + 0x0000, 0x0621, 0x0622, 0x0623, + 0x0624, 0x0625, 0x0626, 0x0627, + 0x0628, 0x0629, 0x062a, 0x062b, + 0x062c, 0x062d, 0x062e, 0x062f, + /* 0xd0*/ + 0x0630, 0x0631, 0x0632, 0x0633, + 0x0634, 0x0635, 0x0636, 0x0637, + 0x0638, 0x0639, 0x063a, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + /* 0xe0*/ + 0x0640, 0x0641, 0x0642, 0x0643, + 0x0644, 0x0645, 0x0646, 0x0647, + 0x0648, 0x0649, 0x064a, 0x064b, + 0x064c, 0x064d, 0x064e, 0x064f, + /* 0xf0*/ + 0x0650, 0x0651, 0x0652, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 
0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ +}; + +static const unsigned char page06[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ + 0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x60-0x67 */ + 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, NULL, NULL, page06, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ 
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */ + 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-6", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init 
init_nls_iso8859_6(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_6(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_6) +module_exit(exit_nls_iso8859_6) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c new file mode 100644 index 00000000..2d27b93e --- /dev/null +++ b/fs/nls/nls_iso8859-7.c @@ -0,0 +1,318 @@ +/* + * linux/fs/nls/nls_iso8859-7.c + * + * Charset iso8859-7 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x02bd, 0x02bc, 0x00a3, + 0x0000, 0x0000, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x0000, 0x00ab, + 0x00ac, 0x00ad, 0x0000, 0x2015, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x0384, 0x0385, 0x0386, 0x00b7, + 0x0388, 0x0389, 0x038a, 0x00bb, + 0x038c, 0x00bd, 0x038e, 0x038f, + /* 0xc0*/ + 0x0390, 0x0391, 0x0392, 0x0393, + 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, + 0x039c, 0x039d, 0x039e, 0x039f, + /* 0xd0*/ + 0x03a0, 0x03a1, 0x0000, 0x03a3, + 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, + 0x03ac, 0x03ad, 0x03ae, 0x03af, + /* 0xe0*/ + 0x03b0, 0x03b1, 0x03b2, 0x03b3, + 0x03b4, 0x03b5, 0x03b6, 0x03b7, + 0x03b8, 0x03b9, 0x03ba, 0x03bb, + 0x03bc, 0x03bd, 0x03be, 0x03bf, + /* 0xf0*/ + 0x03c0, 0x03c1, 0x03c2, 0x03c3, + 0x03c4, 0x03c5, 0x03c6, 0x03c7, + 0x03c8, 0x03c9, 0x03ca, 0x03cb, + 0x03cc, 0x03cd, 0x03ce, 0x0000, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 
0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0x00, 0x00, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0x00, 0x00, 0xb7, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0xbb, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page02[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0xa2, 0xa1, 0x00, 0x00, /* 0xb8-0xbf */ +}; + +static const unsigned char page03[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0xb4, 0xb5, 0xb6, 0x00, /* 0x80-0x87 */ + 0xb8, 0xb9, 0xba, 0x00, 0xbc, 0x00, 0xbe, 0xbf, /* 0x88-0x8f */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x90-0x97 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x98-0x9f */ + 0xd0, 0xd1, 0x00, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xa0-0xa7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xa8-0xaf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xb0-0xb7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xb8-0xbf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xc0-0xc7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xc8-0xcf */ +}; + +static const unsigned char page20[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, /* 0x10-0x17 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, page02, page03, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xdc, 0xb7, /* 0xb0-0xb7 */ + 0xdd, 0xde, 0xdf, 0xbb, 0xfc, 0xbd, 0xfd, 0xfe, /* 0xb8-0xbf */ + 0xc0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0x00, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0x00, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xb6, 0xb8, 0xb9, 0xba, /* 0xd8-0xdf */ + 0xe0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd3, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xbc, 0xbe, 0xbf, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-7", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_7(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_7(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_7) +module_exit(exit_nls_iso8859_7) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c new file mode 100644 index 
00000000..694bf070 --- /dev/null +++ b/fs/nls/nls_iso8859-9.c @@ -0,0 +1,273 @@ +/* + * linux/fs/nls/nls_iso8859-9.c + * + * Charset iso8859-9 translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, + 0x008c, 0x008d, 0x008e, 0x008f, + /* 0x90*/ + 0x0090, 0x0091, 0x0092, 0x0093, + 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, + 0x009c, 0x009d, 0x009e, 0x009f, + /* 0xa0*/ + 0x00a0, 0x00a1, 0x00a2, 0x00a3, + 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, + 0x00ac, 0x00ad, 0x00ae, 0x00af, + /* 0xb0*/ + 0x00b0, 0x00b1, 0x00b2, 0x00b3, + 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, + 0x00bc, 0x00bd, 0x00be, 0x00bf, + /* 0xc0*/ + 0x00c0, 0x00c1, 0x00c2, 0x00c3, + 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, + 0x00cc, 0x00cd, 0x00ce, 0x00cf, + /* 0xd0*/ + 0x011e, 0x00d1, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, + 0x00dc, 0x0130, 0x015e, 0x00df, + /* 0xe0*/ + 0x00e0, 0x00e1, 0x00e2, 0x00e3, + 0x00e4, 0x00e5, 0x00e6, 0x00e7, + 0x00e8, 0x00e9, 0x00ea, 0x00eb, + 0x00ec, 0x00ed, 0x00ee, 0x00ef, + /* 0xf0*/ + 0x011f, 0x00f1, 0x00f2, 0x00f3, + 0x00f4, 0x00f5, 0x00f6, 0x00f7, + 0x00f8, 0x00f9, 0x00fa, 0x00fb, + 0x00fc, 0x0131, 0x015f, 0x00ff, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 
0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char page01[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0xdd, 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, /* 0x58-0x5f */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 
0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x69, 0xfe, 0xdf, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x49, 0xde, 0x00, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; 
+} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "iso8859-9", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_iso8859_9(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_iso8859_9(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_iso8859_9) +module_exit(exit_nls_iso8859_9) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c new file mode 100644 index 00000000..43875310 --- /dev/null +++ b/fs/nls/nls_koi8-r.c @@ -0,0 +1,324 @@ +/* + * linux/fs/nls/nls_koi8-r.c + * + * Charset koi8-r translation tables. + * Generated automatically from the Unicode and charset + * tables from the Unicode Organization (www.unicode.org). + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x2500, 0x2502, 0x250c, 0x2510, + 0x2514, 0x2518, 0x251c, 0x2524, + 0x252c, 0x2534, 0x253c, 0x2580, + 0x2584, 0x2588, 0x258c, 0x2590, + /* 0x90*/ + 0x2591, 0x2592, 0x2593, 0x2320, + 0x25a0, 0x2219, 0x221a, 0x2248, + 0x2264, 0x2265, 0x00a0, 0x2321, + 0x00b0, 0x00b2, 0x00b7, 0x00f7, + /* 0xa0*/ + 0x2550, 0x2551, 0x2552, 0x0451, + 0x2553, 0x2554, 0x2555, 0x2556, + 0x2557, 0x2558, 0x2559, 0x255a, + 0x255b, 0x255c, 0x255d, 0x255e, + /* 0xb0*/ + 0x255f, 0x2560, 0x2561, 0x0401, + 0x2562, 0x2563, 0x2564, 0x2565, + 0x2566, 0x2567, 0x2568, 0x2569, + 0x256a, 0x256b, 0x256c, 0x00a9, + /* 0xc0*/ + 0x044e, 0x0430, 0x0431, 0x0446, + 0x0434, 0x0435, 0x0444, 0x0433, + 0x0445, 0x0438, 0x0439, 0x043a, + 0x043b, 0x043c, 0x043d, 0x043e, + /* 0xd0*/ + 0x043f, 0x044f, 0x0440, 0x0441, + 0x0442, 0x0443, 0x0436, 0x0432, + 0x044c, 0x044b, 0x0437, 0x0448, + 0x044d, 0x0449, 0x0447, 0x044a, + /* 0xe0*/ + 0x042e, 0x0410, 0x0411, 0x0426, + 0x0414, 0x0415, 0x0424, 0x0413, + 0x0425, 0x0418, 0x0419, 0x041a, + 0x041b, 0x041c, 0x041d, 0x041e, + /* 0xf0*/ + 0x041f, 0x042f, 0x0420, 0x0421, + 0x0422, 0x0423, 0x0416, 0x0412, + 0x042c, 0x042b, 0x0417, 0x0428, + 0x042d, 0x0429, 0x0427, 0x042a, +}; + +static const 
unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ + 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ + 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ + 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ + 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ + 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ + 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ + 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ + 0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xa0, 0xa1, 0xa2, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, /* 0x50-0x57 */ + 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ + 0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, /* 0x60-0x67 */ + 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 
0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int 
char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "koi8-r", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_koi8_r(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_koi8_r(void) +{ + unregister_nls(&table); +} + +module_init(init_nls_koi8_r) +module_exit(exit_nls_koi8_r) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c new file mode 100644 index 00000000..e7bc1d75 --- /dev/null +++ b/fs/nls/nls_koi8-ru.c @@ -0,0 +1,83 @@ +/* + * linux/fs/nls/nls_koi8-ru.c + * + * Charset koi8-ru translation based on charset koi8-u. + * The Unicode to charset table has only exact mappings. + */ + +#include +#include +#include +#include +#include + +static struct nls_table *p_nls; + +static int uni2char(const wchar_t uni, + unsigned char *out, int boundlen) +{ + if (boundlen <= 0) + return -ENAMETOOLONG; + + if ((uni & 0xffaf) == 0x040e || (uni & 0xffce) == 0x254c) { + /* koi8-ru and koi8-u differ only on two characters */ + if (uni == 0x040e) + out[0] = 0xbe; + else if (uni == 0x045e) + out[0] = 0xae; + else if (uni == 0x255d || uni == 0x256c) + return 0; + else + return p_nls->uni2char(uni, out, boundlen); + return 1; + } + else + /* fast path */ + return p_nls->uni2char(uni, out, boundlen); +} + +static int char2uni(const unsigned char *rawstring, int boundlen, + wchar_t *uni) +{ + int n; + + if ((*rawstring & 0xef) != 0xae) { + /* koi8-ru and koi8-u differ only on two characters */ + *uni = (*rawstring & 0x10) ? 0x040e : 0x045e; + return 1; + } + + n = p_nls->char2uni(rawstring, boundlen, uni); + return n; +} + +static struct nls_table table = { + .charset = "koi8-ru", + .uni2char = uni2char, + .char2uni = char2uni, + .owner = THIS_MODULE, +}; + +static int __init init_nls_koi8_ru(void) +{ + p_nls = load_nls("koi8-u"); + + if (p_nls) { + table.charset2upper = p_nls->charset2upper; + table.charset2lower = p_nls->charset2lower; + return register_nls(&table); + } + + return -EINVAL; +} + +static void __exit exit_nls_koi8_ru(void) +{ + unregister_nls(&table); + unload_nls(p_nls); +} + +module_init(init_nls_koi8_ru) +module_exit(exit_nls_koi8_ru) + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c new file mode 100644 index 00000000..8c9f0292 --- /dev/null +++ b/fs/nls/nls_koi8-u.c @@ -0,0 +1,331 @@ +/* + * linux/fs/nls/nls_koi8-u.c + * + * Charset koi8-u translation tables. + * The Unicode to charset table has only exact mappings. 
+ */ + +#include +#include +#include +#include +#include + +static const wchar_t charset2uni[256] = { + /* 0x00*/ + 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, + 0x000c, 0x000d, 0x000e, 0x000f, + /* 0x10*/ + 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, + 0x001c, 0x001d, 0x001e, 0x001f, + /* 0x20*/ + 0x0020, 0x0021, 0x0022, 0x0023, + 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, + 0x002c, 0x002d, 0x002e, 0x002f, + /* 0x30*/ + 0x0030, 0x0031, 0x0032, 0x0033, + 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, + 0x003c, 0x003d, 0x003e, 0x003f, + /* 0x40*/ + 0x0040, 0x0041, 0x0042, 0x0043, + 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, + 0x004c, 0x004d, 0x004e, 0x004f, + /* 0x50*/ + 0x0050, 0x0051, 0x0052, 0x0053, + 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, + 0x005c, 0x005d, 0x005e, 0x005f, + /* 0x60*/ + 0x0060, 0x0061, 0x0062, 0x0063, + 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, + 0x006c, 0x006d, 0x006e, 0x006f, + /* 0x70*/ + 0x0070, 0x0071, 0x0072, 0x0073, + 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, + 0x007c, 0x007d, 0x007e, 0x007f, + /* 0x80*/ + 0x2500, 0x2502, 0x250c, 0x2510, + 0x2514, 0x2518, 0x251c, 0x2524, + 0x252c, 0x2534, 0x253c, 0x2580, + 0x2584, 0x2588, 0x258c, 0x2590, + /* 0x90*/ + 0x2591, 0x2592, 0x2593, 0x2320, + 0x25a0, 0x2219, 0x221a, 0x2248, + 0x2264, 0x2265, 0x00a0, 0x2321, + 0x00b0, 0x00b2, 0x00b7, 0x00f7, + /* 0xa0*/ + 0x2550, 0x2551, 0x2552, 0x0451, + 0x0454, 0x2554, 0x0456, 0x0457, + 0x2557, 0x2558, 0x2559, 0x255a, + 0x255b, 0x0491, 0x255d, 0x255e, + /* 0xb0*/ + 0x255f, 0x2560, 0x2561, 0x0401, + 0x0404, 0x2563, 0x0406, 0x0407, + 0x2566, 0x2567, 0x2568, 0x2569, + 0x256a, 0x0490, 0x256c, 0x00a9, + /* 0xc0*/ + 0x044e, 0x0430, 0x0431, 0x0446, + 0x0434, 0x0435, 0x0444, 0x0433, + 0x0445, 0x0438, 0x0439, 0x043a, + 0x043b, 0x043c, 0x043d, 0x043e, + /* 0xd0*/ + 0x043f, 0x044f, 0x0440, 0x0441, + 0x0442, 0x0443, 0x0436, 0x0432, + 0x044c, 0x044b, 0x0437, 0x0448, + 0x044d, 0x0449, 0x0447, 0x044a, + /* 0xe0*/ + 0x042e, 0x0410, 0x0411, 0x0426, + 0x0414, 0x0415, 0x0424, 0x0413, + 0x0425, 0x0418, 0x0419, 0x041a, + 0x041b, 0x041c, 0x041d, 0x041e, + /* 0xf0*/ + 0x041f, 0x042f, 0x0420, 0x0421, + 0x0422, 0x0423, 0x0416, 0x0412, + 0x042c, 0x042b, 0x0417, 0x0428, + 0x042d, 0x0429, 0x0427, 0x042a, +}; + +static const unsigned char page00[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 
0x7f, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ + 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ + 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ +}; + +static const unsigned char page04[256] = { + 0x00, 0xb3, 0x00, 0x00, 0xb4, 0x00, 0xb6, 0xb7, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ + 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ + 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ + 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ + 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ + 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ + 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ + 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ + 0x00, 0xa3, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0xbd, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ +}; + +static const unsigned char page22[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ + 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ +}; + +static const unsigned char page23[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x93, 
0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ +}; + +static const unsigned char page25[256] = { + 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ + 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ + 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ + 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ + 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ + 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ + 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ + 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ + 0xa0, 0xa1, 0xa2, 0x00, 0xa5, 0x00, 0x00, 0xa8, /* 0x50-0x57 */ + 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ + 0xb1, 0xb2, 0x00, 0xb5, 0x00, 0x00, 0xb8, 0xb9, /* 0x60-0x67 */ + 0xba, 0xbb, 0xbc, 0x00, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ + + 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ + 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ + 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ + 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ +}; + +static const unsigned char *const page_uni2charset[256] = { + page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, page22, page23, NULL, page25, NULL, NULL, +}; + +static const unsigned char charset2lower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xa3, 0xa4, 0xb5, 0xa6, 0xa7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ + 0xc8, 0xc9, 0xca, 
0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ +}; + +static const unsigned char charset2upper[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ + + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ + 0xa0, 0xa1, 0xa2, 0xb3, 0xb4, 0xa5, 0xb6, 0xb7, /* 0xa0-0xa7 */ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xaf, /* 0xa8-0xaf */ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ +}; + +static int uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + const unsigned char *uni2charset; + unsigned char cl = uni & 0x00ff; + unsigned char ch = (uni & 0xff00) >> 8; + + if (boundlen <= 0) + return -ENAMETOOLONG; + + uni2charset = page_uni2charset[ch]; + if (uni2charset && uni2charset[cl]) + out[0] = uni2charset[cl]; + else + return -EINVAL; + return 1; +} + +static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) +{ + *uni = charset2uni[*rawstring]; + if (*uni == 0x0000) + return -EINVAL; + return 1; +} + +static struct nls_table table = { + .charset = "koi8-u", + .uni2char = uni2char, + .char2uni = char2uni, + .charset2lower = charset2lower, + .charset2upper = charset2upper, + .owner = THIS_MODULE, +}; + +static int __init init_nls_koi8_u(void) +{ + return register_nls(&table); +} + +static void __exit exit_nls_koi8_u(void) +{ + unregister_nls(&table); +} + 
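The koi8-u module above, like every single-byte nls_*.c table in this patch, resolves the Unicode-to-charset direction with a two-level lookup: the high byte of the code point selects a page array through page_uni2charset[], the low byte indexes into that page, and a zero entry means "no mapping" (uni2char() then returns -EINVAL); the charset-to-Unicode direction is a flat charset2uni[] lookup. A minimal standalone sketch of that scheme, written as ordinary userspace C with two illustrative entries instead of the real generated tables, could look like this:

#include <stdio.h>

/* Illustrative pages only; the real tables are generated from unicode.org data. */
static const unsigned char demo_page00[256] = { [0x41] = 0x41 };  /* U+0041 'A' -> 0x41 */
static const unsigned char demo_page04[256] = { [0x10] = 0xe1 };  /* U+0410 -> 0xe1 in koi8 */

static const unsigned char *const demo_uni2charset[256] = {
	[0x00] = demo_page00,
	[0x04] = demo_page04,		/* all other pages: no mapping */
};

/* Same shape as the kernel uni2char(): a zero table entry means "no mapping". */
static int demo_uni2char(unsigned int uni, unsigned char *out)
{
	const unsigned char *page = demo_uni2charset[(uni >> 8) & 0xff];

	if (page && page[uni & 0xff]) {
		*out = page[uni & 0xff];
		return 1;
	}
	return -1;			/* the kernel code returns -EINVAL here */
}

int main(void)
{
	unsigned char c;

	if (demo_uni2char(0x0410, &c) == 1)
		printf("U+0410 -> 0x%02x\n", c);	/* prints 0xe1 */
	return 0;
}

Compiled on its own, the sketch prints "U+0410 -> 0xe1", which matches the page04[0x10] entry and the charset2uni[0xe1] entry in the tables above.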
+module_init(init_nls_koi8_u)
+module_exit(exit_nls_koi8_u)
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
new file mode 100644
index 00000000..0d60a44a
--- /dev/null
+++ b/fs/nls/nls_utf8.c
@@ -0,0 +1,68 @@
+/*
+ * Module for handling utf8 just like any other charset.
+ * By Urban Widmark 2000
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/errno.h>
+
+static unsigned char identity[256];
+
+static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	n = utf32_to_utf8(uni, out, boundlen);
+	if (n < 0) {
+		*out = '?';
+		return -EINVAL;
+	}
+	return n;
+}
+
+static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
+{
+	int n;
+	unicode_t u;
+
+	n = utf8_to_utf32(rawstring, boundlen, &u);
+	if (n < 0 || u > MAX_WCHAR_T) {
+		*uni = 0x003f;	/* ? */
+		return -EINVAL;
+	}
+	*uni = (wchar_t) u;
+	return n;
+}
+
+static struct nls_table table = {
+	.charset = "utf8",
+	.uni2char = uni2char,
+	.char2uni = char2uni,
+	.charset2lower = identity,	/* no conversion */
+	.charset2upper = identity,
+	.owner = THIS_MODULE,
+};
+
+static int __init init_nls_utf8(void)
+{
+	int i;
+	for (i = 0; i < 256; i++)
+		identity[i] = i;
+
+	return register_nls(&table);
+}
+
+static void __exit exit_nls_utf8(void)
+{
+	unregister_nls(&table);
+}
+
+module_init(init_nls_utf8)
+module_exit(exit_nls_utf8)
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/no-block.c b/fs/no-block.c
new file mode 100644
index 00000000..6e40e42a
--- /dev/null
+++ b/fs/no-block.c
@@ -0,0 +1,23 @@
+/* no-block.c: implementation of routines required for non-BLOCK configuration
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+
+static int no_blkdev_open(struct inode * inode, struct file * filp)
+{
+	return -ENODEV;
+}
+
+const struct file_operations def_blk_fops = {
+	.open = no_blkdev_open,
+	.llseek = noop_llseek,
+};
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 00000000..22c629ee
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,6 @@
+config FSNOTIFY
+	def_bool n
+
+source "fs/notify/dnotify/Kconfig"
+source "fs/notify/inotify/Kconfig"
+source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 00000000..ae5f33a6
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
+ mark.o vfsmount_mark.o
+
+obj-y += dnotify/
+obj-y += inotify/
+obj-y += fanotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 00000000..f9c1ca13
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,11 @@
+config DNOTIFY
+	bool "Dnotify support"
+	select FSNOTIFY
+	default y
+	help
+	  Dnotify is a directory-based per-fd file change notification system
+	  that uses signals to communicate events to user-space. There exist
+	  superior alternatives, but some applications may still rely on
+	  dnotify.
+
+	  If unsure, say Y.
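The DNOTIFY option above gates the kernel half of the old fcntl()-based directory-notification interface, which fs/notify/dnotify/dnotify.c below reimplements on top of fsnotify. As a point of reference, a userspace consumer of that interface looks roughly like the following standalone example (the watched directory "/tmp" and the chosen event mask are only illustrative):

#define _GNU_SOURCE		/* for F_NOTIFY and the DN_* flags */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_event;

static void on_sigio(int sig)
{
	(void)sig;
	got_event = 1;
}

int main(void)
{
	int fd = open("/tmp", O_RDONLY);	/* example directory only */

	if (fd < 0)
		return 1;
	signal(SIGIO, on_sigio);		/* dnotify signals SIGIO by default */
	/* Ask for create and delete events, repeatedly (DN_MULTISHOT). */
	if (fcntl(fd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_MULTISHOT) < 0)
		return 1;
	while (!got_event)
		pause();
	printf("directory contents changed\n");
	close(fd);
	return 0;
}

Each such fcntl(F_NOTIFY) call lands in fcntl_dirnotify() below, which allocates a dnotify_struct for the caller and chains it onto the directory inode's fsnotify mark; DN_MULTISHOT keeps the watch armed after the first SIGIO instead of tearing it down.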
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile new file mode 100644 index 00000000..f145251d --- /dev/null +++ b/fs/notify/dnotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_DNOTIFY) += dnotify.o diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c new file mode 100644 index 00000000..3344bdd5 --- /dev/null +++ b/fs/notify/dnotify/dnotify.c @@ -0,0 +1,411 @@ +/* + * Directory notifications for Linux. + * + * Copyright (C) 2000,2001,2002 Stephen Rothwell + * + * Copyright (C) 2009 Eric Paris + * dnotify was largly rewritten to use the new fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int dir_notify_enable __read_mostly = 1; + +static struct kmem_cache *dnotify_struct_cache __read_mostly; +static struct kmem_cache *dnotify_mark_cache __read_mostly; +static struct fsnotify_group *dnotify_group __read_mostly; +static DEFINE_MUTEX(dnotify_mark_mutex); + +/* + * dnotify will attach one of these to each inode (i_fsnotify_marks) which + * is being watched by dnotify. If multiple userspace applications are watching + * the same directory with dnotify their information is chained in dn + */ +struct dnotify_mark { + struct fsnotify_mark fsn_mark; + struct dnotify_struct *dn; +}; + +/* + * When a process starts or stops watching an inode the set of events which + * dnotify cares about for that inode may change. This function runs the + * list of everything receiving dnotify events about this directory and calculates + * the set of all those events. After it updates what dnotify is interested in + * it calls the fsnotify function so it can update the set of all events relevant + * to this inode. + */ +static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) +{ + __u32 new_mask, old_mask; + struct dnotify_struct *dn; + struct dnotify_mark *dn_mark = container_of(fsn_mark, + struct dnotify_mark, + fsn_mark); + + assert_spin_locked(&fsn_mark->lock); + + old_mask = fsn_mark->mask; + new_mask = 0; + for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) + new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); + fsnotify_set_mark_mask_locked(fsn_mark, new_mask); + + if (old_mask == new_mask) + return; + + if (fsn_mark->i.inode) + fsnotify_recalc_inode_mask(fsn_mark->i.inode); +} + +/* + * Mains fsnotify call where events are delivered to dnotify. + * Find the dnotify mark on the relevant inode, run the list of dnotify structs + * on that mark and determine which of them has expressed interest in receiving + * events of this type. When found send the correct process and signal and + * destroy the dnotify struct if it was not registered to receive multiple + * events. 
+ */ +static int dnotify_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) +{ + struct dnotify_mark *dn_mark; + struct inode *to_tell; + struct dnotify_struct *dn; + struct dnotify_struct **prev; + struct fown_struct *fown; + __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; + + BUG_ON(vfsmount_mark); + + to_tell = event->to_tell; + + dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); + + spin_lock(&inode_mark->lock); + prev = &dn_mark->dn; + while ((dn = *prev) != NULL) { + if ((dn->dn_mask & test_mask) == 0) { + prev = &dn->dn_next; + continue; + } + fown = &dn->dn_filp->f_owner; + send_sigio(fown, dn->dn_fd, POLL_MSG); + if (dn->dn_mask & FS_DN_MULTISHOT) + prev = &dn->dn_next; + else { + *prev = dn->dn_next; + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(inode_mark); + } + } + + spin_unlock(&inode_mark->lock); + + return 0; +} + +/* + * Given an inode and mask determine if dnotify would be interested in sending + * userspace notification for that pair. + */ +static bool dnotify_should_send_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return false; + + return true; +} + +static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct dnotify_mark *dn_mark = container_of(fsn_mark, + struct dnotify_mark, + fsn_mark); + + BUG_ON(dn_mark->dn); + + kmem_cache_free(dnotify_mark_cache, dn_mark); +} + +static struct fsnotify_ops dnotify_fsnotify_ops = { + .handle_event = dnotify_handle_event, + .should_send_event = dnotify_should_send_event, + .free_group_priv = NULL, + .freeing_mark = NULL, + .free_event_priv = NULL, +}; + +/* + * Called every time a file is closed. Looks first for a dnotify mark on the + * inode. If one is found run all of the ->dn structures attached to that + * mark for one relevant to this process closing the file and remove that + * dnotify_struct. If that was the last dnotify_struct also remove the + * fsnotify_mark. 
+ */ +void dnotify_flush(struct file *filp, fl_owner_t id) +{ + struct fsnotify_mark *fsn_mark; + struct dnotify_mark *dn_mark; + struct dnotify_struct *dn; + struct dnotify_struct **prev; + struct inode *inode; + + inode = filp->f_path.dentry->d_inode; + if (!S_ISDIR(inode->i_mode)) + return; + + fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + if (!fsn_mark) + return; + dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); + + mutex_lock(&dnotify_mark_mutex); + + spin_lock(&fsn_mark->lock); + prev = &dn_mark->dn; + while ((dn = *prev) != NULL) { + if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { + *prev = dn->dn_next; + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(fsn_mark); + break; + } + prev = &dn->dn_next; + } + + spin_unlock(&fsn_mark->lock); + + /* nothing else could have found us thanks to the dnotify_mark_mutex */ + if (dn_mark->dn == NULL) + fsnotify_destroy_mark(fsn_mark); + + mutex_unlock(&dnotify_mark_mutex); + + fsnotify_put_mark(fsn_mark); +} + +/* this conversion is done only at watch creation */ +static __u32 convert_arg(unsigned long arg) +{ + __u32 new_mask = FS_EVENT_ON_CHILD; + + if (arg & DN_MULTISHOT) + new_mask |= FS_DN_MULTISHOT; + if (arg & DN_DELETE) + new_mask |= (FS_DELETE | FS_MOVED_FROM); + if (arg & DN_MODIFY) + new_mask |= FS_MODIFY; + if (arg & DN_ACCESS) + new_mask |= FS_ACCESS; + if (arg & DN_ATTRIB) + new_mask |= FS_ATTRIB; + if (arg & DN_RENAME) + new_mask |= FS_DN_RENAME; + if (arg & DN_CREATE) + new_mask |= (FS_CREATE | FS_MOVED_TO); + + return new_mask; +} + +/* + * If multiple processes watch the same inode with dnotify there is only one + * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct + * onto that mark. This function either attaches the new dnotify_struct onto + * that list, or it |= the mask onto an existing dnofiy_struct. + */ +static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, + fl_owner_t id, int fd, struct file *filp, __u32 mask) +{ + struct dnotify_struct *odn; + + odn = dn_mark->dn; + while (odn != NULL) { + /* adding more events to existing dnofiy_struct? */ + if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { + odn->dn_fd = fd; + odn->dn_mask |= mask; + return -EEXIST; + } + odn = odn->dn_next; + } + + dn->dn_mask = mask; + dn->dn_fd = fd; + dn->dn_filp = filp; + dn->dn_owner = id; + dn->dn_next = dn_mark->dn; + dn_mark->dn = dn; + + return 0; +} + +/* + * When a process calls fcntl to attach a dnotify watch to a directory it ends + * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be + * attached to the fsnotify_mark. 
+ */ +int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) +{ + struct dnotify_mark *new_dn_mark, *dn_mark; + struct fsnotify_mark *new_fsn_mark, *fsn_mark; + struct dnotify_struct *dn; + struct inode *inode; + fl_owner_t id = current->files; + struct file *f; + int destroy = 0, error = 0; + __u32 mask; + + /* we use these to tell if we need to kfree */ + new_fsn_mark = NULL; + dn = NULL; + + if (!dir_notify_enable) { + error = -EINVAL; + goto out_err; + } + + /* a 0 mask means we are explicitly removing the watch */ + if ((arg & ~DN_MULTISHOT) == 0) { + dnotify_flush(filp, id); + error = 0; + goto out_err; + } + + /* dnotify only works on directories */ + inode = filp->f_path.dentry->d_inode; + if (!S_ISDIR(inode->i_mode)) { + error = -ENOTDIR; + goto out_err; + } + + /* expect most fcntl to add new rather than augment old */ + dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); + if (!dn) { + error = -ENOMEM; + goto out_err; + } + + /* new fsnotify mark, we expect most fcntl calls to add a new mark */ + new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL); + if (!new_dn_mark) { + error = -ENOMEM; + goto out_err; + } + + /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ + mask = convert_arg(arg); + + /* set up the new_fsn_mark and new_dn_mark */ + new_fsn_mark = &new_dn_mark->fsn_mark; + fsnotify_init_mark(new_fsn_mark, dnotify_free_mark); + new_fsn_mark->mask = mask; + new_dn_mark->dn = NULL; + + /* this is needed to prevent the fcntl/close race described below */ + mutex_lock(&dnotify_mark_mutex); + + /* add the new_fsn_mark or find an old one. */ + fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + if (fsn_mark) { + dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); + spin_lock(&fsn_mark->lock); + } else { + fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0); + spin_lock(&new_fsn_mark->lock); + fsn_mark = new_fsn_mark; + dn_mark = new_dn_mark; + /* we used new_fsn_mark, so don't free it */ + new_fsn_mark = NULL; + } + + rcu_read_lock(); + f = fcheck(fd); + rcu_read_unlock(); + + /* if (f != filp) means that we lost a race and another task/thread + * actually closed the fd we are still playing with before we grabbed + * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the + * only time we clean up the marks we need to get our mark off + * the list. */ + if (f != filp) { + /* if we added ourselves, shoot ourselves, it's possible that + * the flush actually did shoot this fsn_mark. That's fine too + * since multiple calls to destroy_mark is perfectly safe, if + * we found a dn_mark already attached to the inode, just sod + * off silently as the flush at close time dealt with it. + */ + if (dn_mark == new_dn_mark) + destroy = 1; + goto out; + } + + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + if (error) { + /* if we added, we must shoot */ + if (dn_mark == new_dn_mark) + destroy = 1; + goto out; + } + + error = attach_dn(dn, dn_mark, id, fd, filp, mask); + /* !error means that we attached the dn to the dn_mark, so don't free it */ + if (!error) + dn = NULL; + /* -EEXIST means that we didn't add this new dn and used an old one. 
+ * that isn't an error (and the unused dn should be freed) */ + else if (error == -EEXIST) + error = 0; + + dnotify_recalc_inode_mask(fsn_mark); +out: + spin_unlock(&fsn_mark->lock); + + if (destroy) + fsnotify_destroy_mark(fsn_mark); + + mutex_unlock(&dnotify_mark_mutex); + fsnotify_put_mark(fsn_mark); +out_err: + if (new_fsn_mark) + fsnotify_put_mark(new_fsn_mark); + if (dn) + kmem_cache_free(dnotify_struct_cache, dn); + return error; +} + +static int __init dnotify_init(void) +{ + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); + dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC); + + dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); + if (IS_ERR(dnotify_group)) + panic("unable to allocate fsnotify group for dnotify\n"); + return 0; +} + +module_init(dnotify_init) diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig new file mode 100644 index 00000000..7dceff00 --- /dev/null +++ b/fs/notify/fanotify/Kconfig @@ -0,0 +1,26 @@ +config FANOTIFY + bool "Filesystem wide access notification" + select FSNOTIFY + select ANON_INODES + default n + ---help--- + Say Y here to enable fanotify suport. fanotify is a file access + notification system which differs from inotify in that it sends + an open file descriptor to the userspace listener along with + the event. + + If unsure, say Y. + +config FANOTIFY_ACCESS_PERMISSIONS + bool "fanotify permissions checking" + depends on FANOTIFY + depends on SECURITY + default n + ---help--- + Say Y here is you want fanotify listeners to be able to make permissions + decisions concerning filesystem events. This is used by some fanotify + listeners which need to scan files before allowing the system access to + use those files. This is used by some anti-malware vendors and by some + hierarchical storage managent systems. + + If unsure, say N. diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile new file mode 100644 index 00000000..0999213e --- /dev/null +++ b/fs/notify/fanotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_FANOTIFY) += fanotify.o fanotify_user.o diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c new file mode 100644 index 00000000..f35794b9 --- /dev/null +++ b/fs/notify/fanotify/fanotify.c @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include /* UINT_MAX */ +#include +#include +#include +#include + +static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) +{ + pr_debug("%s: old=%p new=%p\n", __func__, old, new); + + if (old->to_tell == new->to_tell && + old->data_type == new->data_type && + old->tgid == new->tgid) { + switch (old->data_type) { + case (FSNOTIFY_EVENT_PATH): + if ((old->path.mnt == new->path.mnt) && + (old->path.dentry == new->path.dentry)) + return true; + case (FSNOTIFY_EVENT_NONE): + return true; + default: + BUG(); + }; + } + return false; +} + +/* and the list better be locked by something too! 
*/ +static struct fsnotify_event *fanotify_merge(struct list_head *list, + struct fsnotify_event *event) +{ + struct fsnotify_event_holder *test_holder; + struct fsnotify_event *test_event = NULL; + struct fsnotify_event *new_event; + + pr_debug("%s: list=%p event=%p\n", __func__, list, event); + + + list_for_each_entry_reverse(test_holder, list, event_list) { + if (should_merge(test_holder->event, event)) { + test_event = test_holder->event; + break; + } + } + + if (!test_event) + return NULL; + + fsnotify_get_event(test_event); + + /* if they are exactly the same we are done */ + if (test_event->mask == event->mask) + return test_event; + + /* + * if the refcnt == 2 this is the only queue + * for this event and so we can update the mask + * in place. + */ + if (atomic_read(&test_event->refcnt) == 2) { + test_event->mask |= event->mask; + return test_event; + } + + new_event = fsnotify_clone_event(test_event); + + /* done with test_event */ + fsnotify_put_event(test_event); + + /* couldn't allocate memory, merge was not possible */ + if (unlikely(!new_event)) + return ERR_PTR(-ENOMEM); + + /* build new event and replace it on the list */ + new_event->mask = (test_event->mask | event->mask); + fsnotify_replace_event(test_holder, new_event); + + /* we hold a reference on new_event from clone_event */ + return new_event; +} + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +static int fanotify_get_response_from_access(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + int ret; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + wait_event(group->fanotify_data.access_waitq, event->response || + atomic_read(&group->fanotify_data.bypass_perm)); + + if (!event->response) /* bypass_perm set */ + return 0; + + /* userspace responded, convert to something usable */ + spin_lock(&event->lock); + switch (event->response) { + case FAN_ALLOW: + ret = 0; + break; + case FAN_DENY: + default: + ret = -EPERM; + } + event->response = 0; + spin_unlock(&event->lock); + + pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, + group, event, ret); + + return ret; +} +#endif + +static int fanotify_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *fanotify_mark, + struct fsnotify_event *event) +{ + int ret = 0; + struct fsnotify_event *notify_event = NULL; + + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); + if (IS_ERR(notify_event)) + return PTR_ERR(notify_event); + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) { + /* if we merged we need to wait on the new event */ + if (notify_event) + event = notify_event; + ret = fanotify_get_response_from_access(group, event); + } +#endif + + if (notify_event) + fsnotify_put_event(notify_event); + + return ret; +} + +static bool fanotify_should_send_event(struct fsnotify_group *group, + struct inode *to_tell, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmnt_mark, + __u32 
event_mask, void *data, int data_type) +{ + __u32 marks_mask, marks_ignored_mask; + struct path *path = data; + + pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " + "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, + inode_mark, vfsmnt_mark, event_mask, data, data_type); + + /* if we don't have enough info to send an event to userspace say no */ + if (data_type != FSNOTIFY_EVENT_PATH) + return false; + + /* sorry, fanotify only gives a damn about files and dirs */ + if (!S_ISREG(path->dentry->d_inode->i_mode) && + !S_ISDIR(path->dentry->d_inode->i_mode)) + return false; + + if (inode_mark && vfsmnt_mark) { + marks_mask = (vfsmnt_mark->mask | inode_mark->mask); + marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); + } else if (inode_mark) { + /* + * if the event is for a child and this inode doesn't care about + * events on the child, don't send it! + */ + if ((event_mask & FS_EVENT_ON_CHILD) && + !(inode_mark->mask & FS_EVENT_ON_CHILD)) + return false; + marks_mask = inode_mark->mask; + marks_ignored_mask = inode_mark->ignored_mask; + } else if (vfsmnt_mark) { + marks_mask = vfsmnt_mark->mask; + marks_ignored_mask = vfsmnt_mark->ignored_mask; + } else { + BUG(); + } + + if (S_ISDIR(path->dentry->d_inode->i_mode) && + (marks_ignored_mask & FS_ISDIR)) + return false; + + if (event_mask & marks_mask & ~marks_ignored_mask) + return true; + + return false; +} + +static void fanotify_free_group_priv(struct fsnotify_group *group) +{ + struct user_struct *user; + + user = group->fanotify_data.user; + atomic_dec(&user->fanotify_listeners); + free_uid(user); +} + +const struct fsnotify_ops fanotify_fsnotify_ops = { + .handle_event = fanotify_handle_event, + .should_send_event = fanotify_should_send_event, + .free_group_priv = fanotify_free_group_priv, + .free_event_priv = NULL, + .freeing_mark = NULL, +}; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c new file mode 100644 index 00000000..9fde1c00 --- /dev/null +++ b/fs/notify/fanotify/fanotify_user.c @@ -0,0 +1,891 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 +#define FANOTIFY_DEFAULT_MAX_MARKS 8192 +#define FANOTIFY_DEFAULT_MAX_LISTENERS 128 + +extern const struct fsnotify_ops fanotify_fsnotify_ops; + +static struct kmem_cache *fanotify_mark_cache __read_mostly; +static struct kmem_cache *fanotify_response_event_cache __read_mostly; + +struct fanotify_response_event { + struct list_head list; + __s32 fd; + struct fsnotify_event *event; +}; + +/* + * Get an fsnotify notification event if one exists and is small + * enough to fit in "count". Return an error pointer if the count + * is not large enough. + * + * Called with the group->notification_mutex held. 
+ */ +static struct fsnotify_event *get_one_event(struct fsnotify_group *group, + size_t count) +{ + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + pr_debug("%s: group=%p count=%zd\n", __func__, group, count); + + if (fsnotify_notify_queue_is_empty(group)) + return NULL; + + if (FAN_EVENT_METADATA_LEN > count) + return ERR_PTR(-EINVAL); + + /* held the notification_mutex the whole time, so this is the + * same event we peeked above */ + return fsnotify_remove_notify_event(group); +} + +static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) +{ + int client_fd; + struct dentry *dentry; + struct vfsmount *mnt; + struct file *new_file; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + client_fd = get_unused_fd(); + if (client_fd < 0) + return client_fd; + + if (event->data_type != FSNOTIFY_EVENT_PATH) { + WARN_ON(1); + put_unused_fd(client_fd); + return -EINVAL; + } + + /* + * we need a new file handle for the userspace program so it can read even if it was + * originally opened O_WRONLY. + */ + dentry = dget(event->path.dentry); + mnt = mntget(event->path.mnt); + /* it's possible this event was an overflow event. in that case dentry and mnt + * are NULL; That's fine, just don't call dentry open */ + if (dentry && mnt) + new_file = dentry_open(dentry, mnt, + group->fanotify_data.f_flags | FMODE_NONOTIFY, + current_cred()); + else + new_file = ERR_PTR(-EOVERFLOW); + if (IS_ERR(new_file)) { + /* + * we still send an event even if we can't open the file. this + * can happen when say tasks are gone and we try to open their + * /proc files or we try to open a WRONLY file like in sysfs + * we just send the errno to userspace since there isn't much + * else we can do. + */ + put_unused_fd(client_fd); + client_fd = PTR_ERR(new_file); + } else { + fd_install(client_fd, new_file); + } + + return client_fd; +} + +static int fill_event_metadata(struct fsnotify_group *group, + struct fanotify_event_metadata *metadata, + struct fsnotify_event *event) +{ + int ret = 0; + + pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, + group, metadata, event); + + metadata->event_len = FAN_EVENT_METADATA_LEN; + metadata->metadata_len = FAN_EVENT_METADATA_LEN; + metadata->vers = FANOTIFY_METADATA_VERSION; + metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; + metadata->pid = pid_vnr(event->tgid); + if (unlikely(event->mask & FAN_Q_OVERFLOW)) + metadata->fd = FAN_NOFD; + else { + metadata->fd = create_fd(group, event); + if (metadata->fd < 0) + ret = metadata->fd; + } + + return ret; +} + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group, + __s32 fd) +{ + struct fanotify_response_event *re, *return_re = NULL; + + mutex_lock(&group->fanotify_data.access_mutex); + list_for_each_entry(re, &group->fanotify_data.access_list, list) { + if (re->fd != fd) + continue; + + list_del_init(&re->list); + return_re = re; + break; + } + mutex_unlock(&group->fanotify_data.access_mutex); + + pr_debug("%s: found return_re=%p\n", __func__, return_re); + + return return_re; +} + +static int process_access_response(struct fsnotify_group *group, + struct fanotify_response *response_struct) +{ + struct fanotify_response_event *re; + __s32 fd = response_struct->fd; + __u32 response = response_struct->response; + + pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, + fd, response); + /* + * make sure the response is valid, if invalid we do nothing and either + * userspace can send a valid response 
or we will clean it up after the + * timeout + */ + switch (response) { + case FAN_ALLOW: + case FAN_DENY: + break; + default: + return -EINVAL; + } + + if (fd < 0) + return -EINVAL; + + re = dequeue_re(group, fd); + if (!re) + return -ENOENT; + + re->event->response = response; + + wake_up(&group->fanotify_data.access_waitq); + + kmem_cache_free(fanotify_response_event_cache, re); + + return 0; +} + +static int prepare_for_access_response(struct fsnotify_group *group, + struct fsnotify_event *event, + __s32 fd) +{ + struct fanotify_response_event *re; + + if (!(event->mask & FAN_ALL_PERM_EVENTS)) + return 0; + + re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL); + if (!re) + return -ENOMEM; + + re->event = event; + re->fd = fd; + + mutex_lock(&group->fanotify_data.access_mutex); + + if (atomic_read(&group->fanotify_data.bypass_perm)) { + mutex_unlock(&group->fanotify_data.access_mutex); + kmem_cache_free(fanotify_response_event_cache, re); + event->response = FAN_ALLOW; + return 0; + } + + list_add_tail(&re->list, &group->fanotify_data.access_list); + mutex_unlock(&group->fanotify_data.access_mutex); + + return 0; +} + +static void remove_access_response(struct fsnotify_group *group, + struct fsnotify_event *event, + __s32 fd) +{ + struct fanotify_response_event *re; + + if (!(event->mask & FAN_ALL_PERM_EVENTS)) + return; + + re = dequeue_re(group, fd); + if (!re) + return; + + BUG_ON(re->event != event); + + kmem_cache_free(fanotify_response_event_cache, re); + + return; +} +#else +static int prepare_for_access_response(struct fsnotify_group *group, + struct fsnotify_event *event, + __s32 fd) +{ + return 0; +} + +static void remove_access_response(struct fsnotify_group *group, + struct fsnotify_event *event, + __s32 fd) +{ + return; +} +#endif + +static ssize_t copy_event_to_user(struct fsnotify_group *group, + struct fsnotify_event *event, + char __user *buf) +{ + struct fanotify_event_metadata fanotify_event_metadata; + int fd, ret; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + ret = fill_event_metadata(group, &fanotify_event_metadata, event); + if (ret < 0) + goto out; + + fd = fanotify_event_metadata.fd; + ret = prepare_for_access_response(group, event, fd); + if (ret) + goto out_close_fd; + + ret = -EFAULT; + if (copy_to_user(buf, &fanotify_event_metadata, + fanotify_event_metadata.event_len)) + goto out_kill_access_response; + + return fanotify_event_metadata.event_len; + +out_kill_access_response: + remove_access_response(group, event, fd); +out_close_fd: + if (fd != FAN_NOFD) + sys_close(fd); +out: +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) { + event->response = FAN_DENY; + wake_up(&group->fanotify_data.access_waitq); + } +#endif + return ret; +} + +/* intofiy userspace file descriptor functions */ +static unsigned int fanotify_poll(struct file *file, poll_table *wait) +{ + struct fsnotify_group *group = file->private_data; + int ret = 0; + + poll_wait(file, &group->notification_waitq, wait); + mutex_lock(&group->notification_mutex); + if (!fsnotify_notify_queue_is_empty(group)) + ret = POLLIN | POLLRDNORM; + mutex_unlock(&group->notification_mutex); + + return ret; +} + +static ssize_t fanotify_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct fsnotify_group *group; + struct fsnotify_event *kevent; + char __user *start; + int ret; + DEFINE_WAIT(wait); + + start = buf; + group = file->private_data; + + pr_debug("%s: group=%p\n", __func__, group); + + while (1) { + 
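As an aside (the kernel read loop continues below): for permission events (FAN_OPEN_PERM/FAN_ACCESS_PERM), prepare_for_access_response() above keeps the event queued until userspace answers by writing a struct fanotify_response back to the fanotify file descriptor; the write() handler later in this file feeds that into process_access_response() shown above. A small sketch of the userspace side, not part of the patch (the helper name is invented; assumes the exported <linux/fanotify.h> header):

#include <stdint.h>
#include <unistd.h>
#include <linux/fanotify.h>

/*
 * Answer one FAN_*_PERM event: event_fd is the fd field from the
 * fanotify_event_metadata record read from fanotify_fd, and verdict is
 * FAN_ALLOW or FAN_DENY; anything else is rejected with -EINVAL.
 */
static int respond(int fanotify_fd, int32_t event_fd, uint32_t verdict)
{
	struct fanotify_response r = {
		.fd = event_fd,
		.response = verdict,
	};

	if (write(fanotify_fd, &r, sizeof(r)) != sizeof(r))
		return -1;
	return 0;
}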
prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); + + mutex_lock(&group->notification_mutex); + kevent = get_one_event(group, count); + mutex_unlock(&group->notification_mutex); + + if (kevent) { + ret = PTR_ERR(kevent); + if (IS_ERR(kevent)) + break; + ret = copy_event_to_user(group, kevent, buf); + fsnotify_put_event(kevent); + if (ret < 0) + break; + buf += ret; + count -= ret; + continue; + } + + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + break; + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + if (start != buf) + break; + + schedule(); + } + + finish_wait(&group->notification_waitq, &wait); + if (start != buf && ret != -EFAULT) + ret = buf - start; + return ret; +} + +static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) +{ +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + struct fanotify_response response = { .fd = -1, .response = -1 }; + struct fsnotify_group *group; + int ret; + + group = file->private_data; + + if (count > sizeof(response)) + count = sizeof(response); + + pr_debug("%s: group=%p count=%zu\n", __func__, group, count); + + if (copy_from_user(&response, buf, count)) + return -EFAULT; + + ret = process_access_response(group, &response); + if (ret < 0) + count = ret; + + return count; +#else + return -EINVAL; +#endif +} + +static int fanotify_release(struct inode *ignored, struct file *file) +{ + struct fsnotify_group *group = file->private_data; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + struct fanotify_response_event *re, *lre; + + mutex_lock(&group->fanotify_data.access_mutex); + + atomic_inc(&group->fanotify_data.bypass_perm); + + list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { + pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, + re, re->event); + + list_del_init(&re->list); + re->event->response = FAN_ALLOW; + + kmem_cache_free(fanotify_response_event_cache, re); + } + mutex_unlock(&group->fanotify_data.access_mutex); + + wake_up(&group->fanotify_data.access_waitq); +#endif + /* matches the fanotify_init->fsnotify_alloc_group */ + fsnotify_put_group(group); + + return 0; +} + +static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct fsnotify_group *group; + struct fsnotify_event_holder *holder; + void __user *p; + int ret = -ENOTTY; + size_t send_len = 0; + + group = file->private_data; + + p = (void __user *) arg; + + switch (cmd) { + case FIONREAD: + mutex_lock(&group->notification_mutex); + list_for_each_entry(holder, &group->notification_list, event_list) + send_len += FAN_EVENT_METADATA_LEN; + mutex_unlock(&group->notification_mutex); + ret = put_user(send_len, (int __user *) p); + break; + } + + return ret; +} + +static const struct file_operations fanotify_fops = { + .poll = fanotify_poll, + .read = fanotify_read, + .write = fanotify_write, + .fasync = NULL, + .release = fanotify_release, + .unlocked_ioctl = fanotify_ioctl, + .compat_ioctl = fanotify_ioctl, + .llseek = noop_llseek, +}; + +static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + kmem_cache_free(fanotify_mark_cache, fsn_mark); +} + +static int fanotify_find_path(int dfd, const char __user *filename, + struct path *path, unsigned int flags) +{ + int ret; + + pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__, + dfd, filename, flags); + + if (filename == NULL) { + struct file *file; + int fput_needed; + + ret = -EBADF; + file = fget_light(dfd, &fput_needed); + if (!file) + goto out; + + ret = -ENOTDIR; + if 
((flags & FAN_MARK_ONLYDIR) && + !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) { + fput_light(file, fput_needed); + goto out; + } + + *path = file->f_path; + path_get(path); + fput_light(file, fput_needed); + } else { + unsigned int lookup_flags = 0; + + if (!(flags & FAN_MARK_DONT_FOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + if (flags & FAN_MARK_ONLYDIR) + lookup_flags |= LOOKUP_DIRECTORY; + + ret = user_path_at(dfd, filename, lookup_flags, path); + if (ret) + goto out; + } + + /* you can only watch an inode if you have read permissions on it */ + ret = inode_permission(path->dentry->d_inode, MAY_READ); + if (ret) + path_put(path); +out: + return ret; +} + +static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, + unsigned int flags) +{ + __u32 oldmask; + + spin_lock(&fsn_mark->lock); + if (!(flags & FAN_MARK_IGNORED_MASK)) { + oldmask = fsn_mark->mask; + fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask)); + } else { + oldmask = fsn_mark->ignored_mask; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask)); + } + spin_unlock(&fsn_mark->lock); + + if (!(oldmask & ~mask)) + fsnotify_destroy_mark(fsn_mark); + + return mask & oldmask; +} + +static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, + struct vfsmount *mnt, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark = NULL; + __u32 removed; + + fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + if (!fsn_mark) + return -ENOENT; + + removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); + fsnotify_put_mark(fsn_mark); + if (removed & mnt->mnt_fsnotify_mask) + fsnotify_recalc_vfsmount_mask(mnt); + + return 0; +} + +static int fanotify_remove_inode_mark(struct fsnotify_group *group, + struct inode *inode, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark = NULL; + __u32 removed; + + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) + return -ENOENT; + + removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); + /* matches the fsnotify_find_inode_mark() */ + fsnotify_put_mark(fsn_mark); + if (removed & inode->i_fsnotify_mask) + fsnotify_recalc_inode_mask(inode); + + return 0; +} + +static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, + unsigned int flags) +{ + __u32 oldmask = -1; + + spin_lock(&fsn_mark->lock); + if (!(flags & FAN_MARK_IGNORED_MASK)) { + oldmask = fsn_mark->mask; + fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); + } else { + __u32 tmask = fsn_mark->ignored_mask | mask; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + if (flags & FAN_MARK_IGNORED_SURV_MODIFY) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; + } + + if (!(flags & FAN_MARK_ONDIR)) { + __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + } + + spin_unlock(&fsn_mark->lock); + + return mask & ~oldmask; +} + +static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, + struct vfsmount *mnt, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark; + __u32 added; + int ret = 0; + + fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + if (!fsn_mark) { + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return -ENOSPC; + + fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!fsn_mark) + return -ENOMEM; + + fsnotify_init_mark(fsn_mark, fanotify_free_mark); + ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); + if (ret) + goto err; + } + 
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + + if (added & ~mnt->mnt_fsnotify_mask) + fsnotify_recalc_vfsmount_mask(mnt); +err: + fsnotify_put_mark(fsn_mark); + return ret; +} + +static int fanotify_add_inode_mark(struct fsnotify_group *group, + struct inode *inode, __u32 mask, + unsigned int flags) +{ + struct fsnotify_mark *fsn_mark; + __u32 added; + int ret = 0; + + pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); + + /* + * If some other task has this inode open for write we should not add + * an ignored mark, unless that ignored mark is supposed to survive + * modification changes anyway. + */ + if ((flags & FAN_MARK_IGNORED_MASK) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && + (atomic_read(&inode->i_writecount) > 0)) + return 0; + + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) { + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return -ENOSPC; + + fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!fsn_mark) + return -ENOMEM; + + fsnotify_init_mark(fsn_mark, fanotify_free_mark); + ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); + if (ret) + goto err; + } + added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + + if (added & ~inode->i_fsnotify_mask) + fsnotify_recalc_inode_mask(inode); +err: + fsnotify_put_mark(fsn_mark); + return ret; +} + +/* fanotify syscalls */ +SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) +{ + struct fsnotify_group *group; + int f_flags, fd; + struct user_struct *user; + + pr_debug("%s: flags=%d event_f_flags=%d\n", + __func__, flags, event_f_flags); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (flags & ~FAN_ALL_INIT_FLAGS) + return -EINVAL; + + user = get_current_user(); + if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { + free_uid(user); + return -EMFILE; + } + + f_flags = O_RDWR | FMODE_NONOTIFY; + if (flags & FAN_CLOEXEC) + f_flags |= O_CLOEXEC; + if (flags & FAN_NONBLOCK) + f_flags |= O_NONBLOCK; + + /* fsnotify_alloc_group takes a ref. 
Dropped in fanotify_release */ + group = fsnotify_alloc_group(&fanotify_fsnotify_ops); + if (IS_ERR(group)) { + free_uid(user); + return PTR_ERR(group); + } + + group->fanotify_data.user = user; + atomic_inc(&user->fanotify_listeners); + + group->fanotify_data.f_flags = event_f_flags; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + mutex_init(&group->fanotify_data.access_mutex); + init_waitqueue_head(&group->fanotify_data.access_waitq); + INIT_LIST_HEAD(&group->fanotify_data.access_list); + atomic_set(&group->fanotify_data.bypass_perm, 0); +#endif + switch (flags & FAN_ALL_CLASS_BITS) { + case FAN_CLASS_NOTIF: + group->priority = FS_PRIO_0; + break; + case FAN_CLASS_CONTENT: + group->priority = FS_PRIO_1; + break; + case FAN_CLASS_PRE_CONTENT: + group->priority = FS_PRIO_2; + break; + default: + fd = -EINVAL; + goto out_put_group; + } + + if (flags & FAN_UNLIMITED_QUEUE) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_put_group; + group->max_events = UINT_MAX; + } else { + group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; + } + + if (flags & FAN_UNLIMITED_MARKS) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_put_group; + group->fanotify_data.max_marks = UINT_MAX; + } else { + group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; + } + + fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); + if (fd < 0) + goto out_put_group; + + return fd; + +out_put_group: + fsnotify_put_group(group); + return fd; +} + +SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, + __u64 mask, int dfd, + const char __user * pathname) +{ + struct inode *inode = NULL; + struct vfsmount *mnt = NULL; + struct fsnotify_group *group; + struct file *filp; + struct path path; + int ret, fput_needed; + + pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", + __func__, fanotify_fd, flags, dfd, pathname, mask); + + /* we only use the lower 32 bits as of right now. */ + if (mask & ((__u64)0xffffffff << 32)) + return -EINVAL; + + if (flags & ~FAN_ALL_MARK_FLAGS) + return -EINVAL; + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + case FAN_MARK_ADD: /* fallthrough */ + case FAN_MARK_REMOVE: + if (!mask) + return -EINVAL; + case FAN_MARK_FLUSH: + break; + default: + return -EINVAL; + } + + if (mask & FAN_ONDIR) { + flags |= FAN_MARK_ONDIR; + mask &= ~FAN_ONDIR; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) +#else + if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD)) +#endif + return -EINVAL; + + filp = fget_light(fanotify_fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an fanotify instance */ + ret = -EINVAL; + if (unlikely(filp->f_op != &fanotify_fops)) + goto fput_and_out; + group = filp->private_data; + + /* + * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not + * allowed to set permissions events. 
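As an aside (the fanotify_mark() syscall body continues below): putting the two syscalls implemented here together from the userspace side, a notification-class listener marks a mount and reads events, each of which carries an open file descriptor. This is a sketch only, assuming a libc that exposes fanotify_init()/fanotify_mark() plus the FAN_EVENT_OK/FAN_EVENT_NEXT iteration macros; the watched path is arbitrary. A FAN_CLASS_NOTIF group like this one maps to FS_PRIO_0 and therefore, per the check just below, may not request FAN_OPEN_PERM or FAN_ACCESS_PERM.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	char buf[4096] __attribute__((aligned(8)));
	ssize_t len;
	int fd;

	/* FAN_CLASS_NOTIF selects FS_PRIO_0 in fanotify_init() above */
	fd = fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF, O_RDONLY);
	if (fd < 0)
		return 1;

	/* watch everything reachable through the mount containing /tmp */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
			  FAN_OPEN | FAN_CLOSE_WRITE, AT_FDCWD, "/tmp") < 0)
		return 1;

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		struct fanotify_event_metadata *md = (void *)buf;

		while (FAN_EVENT_OK(md, len)) {
			printf("mask=%llx pid=%d\n",
			       (unsigned long long)md->mask, (int)md->pid);
			if (md->fd >= 0)
				close(md->fd);	/* every event carries an open fd */
			md = FAN_EVENT_NEXT(md, len);
		}
	}
	return 0;
}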
+ */ + ret = -EINVAL; + if (mask & FAN_ALL_PERM_EVENTS && + group->priority == FS_PRIO_0) + goto fput_and_out; + + ret = fanotify_find_path(dfd, pathname, &path, flags); + if (ret) + goto fput_and_out; + + /* inode held in place by reference to path; group by fget on fd */ + if (!(flags & FAN_MARK_MOUNT)) + inode = path.dentry->d_inode; + else + mnt = path.mnt; + + /* create/update an inode mark */ + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + case FAN_MARK_ADD: + if (flags & FAN_MARK_MOUNT) + ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); + else + ret = fanotify_add_inode_mark(group, inode, mask, flags); + break; + case FAN_MARK_REMOVE: + if (flags & FAN_MARK_MOUNT) + ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags); + else + ret = fanotify_remove_inode_mark(group, inode, mask, flags); + break; + case FAN_MARK_FLUSH: + if (flags & FAN_MARK_MOUNT) + fsnotify_clear_vfsmount_marks_by_group(group); + else + fsnotify_clear_inode_marks_by_group(group); + break; + default: + ret = -EINVAL; + } + + path_put(&path); +fput_and_out: + fput_light(filp, fput_needed); + return ret; +} + +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask, + long dfd, long pathname) +{ + return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags, + mask, (int) dfd, + (const char __user *) pathname); +} +SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark); +#endif + +/* + * fanotify_user_setup - Our initialization function. Note that we cannot return + * error because we have compiled-in VFS hooks. So an (unlikely) failure here + * must result in panic(). + */ +static int __init fanotify_user_setup(void) +{ + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); + fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, + SLAB_PANIC); + + return 0; +} +device_initcall(fanotify_user_setup); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c new file mode 100644 index 00000000..79b47cbb --- /dev/null +++ b/fs/notify/fsnotify.c @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" + +/* + * Clear all of the marks on an inode when it is being evicted from core + */ +void __fsnotify_inode_delete(struct inode *inode) +{ + fsnotify_clear_marks_by_inode(inode); +} +EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); + +void __fsnotify_vfsmount_delete(struct vfsmount *mnt) +{ + fsnotify_clear_marks_by_mount(mnt); +} + +/* + * Given an inode, first check if we care what happens to our children. Inotify + * and dnotify both tell their parents about events. 
If we care about any event + * on a child we run all of our children and set a dentry flag saying that the + * parent cares. Thus when an event happens on a child it can quickly tell if + * if there is a need to find a parent and send the event to the parent. + */ +void __fsnotify_update_child_dentry_flags(struct inode *inode) +{ + struct dentry *alias; + int watched; + + if (!S_ISDIR(inode->i_mode)) + return; + + /* determine if the children should tell inode about their events */ + watched = fsnotify_inode_watches_children(inode); + + spin_lock(&inode->i_lock); + /* run all of the dentries associated with this inode. Since this is a + * directory, there damn well better only be one item on this list */ + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + struct dentry *child; + + /* run all of the children of the original inode and fix their + * d_flags to indicate parental interest (their parent is the + * original inode) */ + spin_lock(&alias->d_lock); + list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { + if (!child->d_inode) + continue; + + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (watched) + child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; + else + child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; + spin_unlock(&child->d_lock); + } + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); +} + +/* Notify this dentry's parent about a child's events. */ +int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +{ + struct dentry *parent; + struct inode *p_inode; + int ret = 0; + + if (!dentry) + dentry = path->dentry; + + if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) + return 0; + + parent = dget_parent(dentry); + p_inode = parent->d_inode; + + if (unlikely(!fsnotify_inode_watches_children(p_inode))) + __fsnotify_update_child_dentry_flags(p_inode); + else if (p_inode->i_fsnotify_mask & mask) { + /* we are notifying a parent so come up with the new mask which + * specifies these are events which came from a child. */ + mask |= FS_EVENT_ON_CHILD; + + if (path) + ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, + dentry->d_name.name, 0); + else + ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, + dentry->d_name.name, 0); + } + + dput(parent); + + return ret; +} +EXPORT_SYMBOL_GPL(__fsnotify_parent); + +static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, + int data_is, u32 cookie, + const unsigned char *file_name, + struct fsnotify_event **event) +{ + struct fsnotify_group *group = NULL; + __u32 inode_test_mask = 0; + __u32 vfsmount_test_mask = 0; + + if (unlikely(!inode_mark && !vfsmount_mark)) { + BUG(); + return 0; + } + + /* clear ignored on inode modification */ + if (mask & FS_MODIFY) { + if (inode_mark && + !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) + inode_mark->ignored_mask = 0; + if (vfsmount_mark && + !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) + vfsmount_mark->ignored_mask = 0; + } + + /* does the inode mark tell us to do something? */ + if (inode_mark) { + group = inode_mark->group; + inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); + inode_test_mask &= inode_mark->mask; + inode_test_mask &= ~inode_mark->ignored_mask; + } + + /* does the vfsmount_mark tell us to do something? 
*/ + if (vfsmount_mark) { + vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); + group = vfsmount_mark->group; + vfsmount_test_mask &= vfsmount_mark->mask; + vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; + if (inode_mark) + vfsmount_test_mask &= ~inode_mark->ignored_mask; + } + + pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p" + " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" + " data=%p data_is=%d cookie=%d event=%p\n", + __func__, group, to_tell, mnt, mask, inode_mark, + inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, + data_is, cookie, *event); + + if (!inode_test_mask && !vfsmount_test_mask) + return 0; + + if (group->ops->should_send_event(group, to_tell, inode_mark, + vfsmount_mark, mask, data, + data_is) == false) + return 0; + + if (!*event) { + *event = fsnotify_create_event(to_tell, mask, data, + data_is, file_name, + cookie, GFP_KERNEL); + if (!*event) + return -ENOMEM; + } + return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); +} + +/* + * This is the main call to fsnotify. The VFS calls into hook specific functions + * in linux/fsnotify.h. Those functions then in turn call here. Here will call + * out to all of the registered fsnotify_group. Those groups can then use the + * notification event in whatever means they feel necessary. + */ +int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, + const unsigned char *file_name, u32 cookie) +{ + struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; + struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; + struct fsnotify_group *inode_group, *vfsmount_group; + struct fsnotify_event *event = NULL; + struct vfsmount *mnt; + int idx, ret = 0; + /* global tests shouldn't care about events on child only the specific event */ + __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + + if (data_is == FSNOTIFY_EVENT_PATH) + mnt = ((struct path *)data)->mnt; + else + mnt = NULL; + + /* + * if this is a modify event we may need to clear the ignored masks + * otherwise return if neither the inode nor the vfsmount care about + * this type of event. 
+ */ + if (!(mask & FS_MODIFY) && + !(test_mask & to_tell->i_fsnotify_mask) && + !(mnt && test_mask & mnt->mnt_fsnotify_mask)) + return 0; + + idx = srcu_read_lock(&fsnotify_mark_srcu); + + if ((mask & FS_MODIFY) || + (test_mask & to_tell->i_fsnotify_mask)) + inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + &fsnotify_mark_srcu); + + if (mnt && ((mask & FS_MODIFY) || + (test_mask & mnt->mnt_fsnotify_mask))) { + vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, + &fsnotify_mark_srcu); + inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + &fsnotify_mark_srcu); + } + + while (inode_node || vfsmount_node) { + inode_group = vfsmount_group = NULL; + + if (inode_node) { + inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), + struct fsnotify_mark, i.i_list); + inode_group = inode_mark->group; + } + + if (vfsmount_node) { + vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), + struct fsnotify_mark, m.m_list); + vfsmount_group = vfsmount_mark->group; + } + + if (inode_group > vfsmount_group) { + /* handle inode */ + ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, + data_is, cookie, file_name, &event); + /* we didn't use the vfsmount_mark */ + vfsmount_group = NULL; + } else if (vfsmount_group > inode_group) { + ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, + data_is, cookie, file_name, &event); + inode_group = NULL; + } else { + ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, + mask, data, data_is, cookie, file_name, + &event); + } + + if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) + goto out; + + if (inode_group) + inode_node = srcu_dereference(inode_node->next, + &fsnotify_mark_srcu); + if (vfsmount_group) + vfsmount_node = srcu_dereference(vfsmount_node->next, + &fsnotify_mark_srcu); + } + ret = 0; +out: + srcu_read_unlock(&fsnotify_mark_srcu, idx); + /* + * fsnotify_create_event() took a reference so the event can't be cleaned + * up while we are still trying to add it to lists, drop that one. 
+ */ + if (event) + fsnotify_put_event(event); + + return ret; +} +EXPORT_SYMBOL_GPL(fsnotify); + +static __init int fsnotify_init(void) +{ + int ret; + + BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23); + + ret = init_srcu_struct(&fsnotify_mark_srcu); + if (ret) + panic("initializing fsnotify_mark_srcu"); + + return 0; +} +core_initcall(fsnotify_init); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h new file mode 100644 index 00000000..85e7d2b4 --- /dev/null +++ b/fs/notify/fsnotify.h @@ -0,0 +1,47 @@ +#ifndef __FS_NOTIFY_FSNOTIFY_H_ +#define __FS_NOTIFY_FSNOTIFY_H_ + +#include +#include +#include +#include + +/* destroy all events sitting in this groups notification queue */ +extern void fsnotify_flush_notify(struct fsnotify_group *group); + +/* protects reads of inode and vfsmount marks list */ +extern struct srcu_struct fsnotify_mark_srcu; + +extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, + __u32 mask); +/* add a mark to an inode */ +extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + int allow_dups); +/* add a mark to a vfsmount */ +extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct vfsmount *mnt, + int allow_dups); + +/* final kfree of a group */ +extern void fsnotify_final_destroy_group(struct fsnotify_group *group); + +/* vfsmount specific destruction of a mark */ +extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); +/* inode specific destruction of a mark */ +extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); +/* run the list of all marks associated with inode and flag them to be freed */ +extern void fsnotify_clear_marks_by_inode(struct inode *inode); +/* run the list of all marks associated with vfsmount and flag them to be freed */ +extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt); +/* + * update the dentry->d_flags of all of inode's children to indicate if inode cares + * about events that happen to its children. + */ +extern void __fsnotify_update_child_dentry_flags(struct inode *inode); + +/* allocate and destroy and event holder to attach events to notification/access queues */ +extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); +extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); + +#endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/group.c b/fs/notify/group.c new file mode 100644 index 00000000..d309f384 --- /dev/null +++ b/fs/notify/group.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" + +#include + +/* + * Final freeing of a group + */ +void fsnotify_final_destroy_group(struct fsnotify_group *group) +{ + /* clear the notification queue of all events */ + fsnotify_flush_notify(group); + + if (group->ops->free_group_priv) + group->ops->free_group_priv(group); + + kfree(group); +} + +/* + * Trying to get rid of a group. We need to first get rid of any outstanding + * allocations and then free the group. Remember that fsnotify_clear_marks_by_group + * could miss marks that are being freed by inode and those marks could still + * hold a reference to this group (via group->num_marks) If we get into that + * situtation, the fsnotify_final_destroy_group will get called when that final + * mark is freed. + */ +static void fsnotify_destroy_group(struct fsnotify_group *group) +{ + /* clear all inode marks for this group */ + fsnotify_clear_marks_by_group(group); + + synchronize_srcu(&fsnotify_mark_srcu); + + /* past the point of no return, matches the initial value of 1 */ + if (atomic_dec_and_test(&group->num_marks)) + fsnotify_final_destroy_group(group); +} + +/* + * Drop a reference to a group. Free it if it's through. + */ +void fsnotify_put_group(struct fsnotify_group *group) +{ + if (atomic_dec_and_test(&group->refcnt)) + fsnotify_destroy_group(group); +} + +/* + * Create a new fsnotify_group and hold a reference for the group returned. + */ +struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) +{ + struct fsnotify_group *group; + + group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + /* set to 0 when there a no external references to this group */ + atomic_set(&group->refcnt, 1); + /* + * hits 0 when there are no external references AND no marks for + * this group + */ + atomic_set(&group->num_marks, 1); + + mutex_init(&group->notification_mutex); + INIT_LIST_HEAD(&group->notification_list); + init_waitqueue_head(&group->notification_waitq); + group->max_events = UINT_MAX; + + spin_lock_init(&group->mark_lock); + INIT_LIST_HEAD(&group->marks_list); + + group->ops = ops; + + return group; +} diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c new file mode 100644 index 00000000..07ea8d3e --- /dev/null +++ b/fs/notify/inode_mark.c @@ -0,0 +1,315 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +#include "../internal.h" + +/* + * Recalculate the mask of events relevant to a given inode locked. 
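As an aside (inode_mark.c continues below): the group API just shown in group.c (fsnotify_alloc_group() hands back a group holding one reference and an initial num_marks of 1; fsnotify_put_group() drops the reference and tears the group down) is consumed by backends the way dnotify_init() does earlier in this patch. A schematic, non-authoritative skeleton of such a backend, with stub callbacks using the signatures that send_to_group() invokes; the <linux/fsnotify_backend.h> header name is assumed:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fsnotify_backend.h>	/* assumed location of these declarations */

static struct fsnotify_group *example_group;

static int example_handle_event(struct fsnotify_group *group,
				struct fsnotify_mark *inode_mark,
				struct fsnotify_mark *vfsmount_mark,
				struct fsnotify_event *event)
{
	return 0;		/* consume the event and do nothing */
}

static bool example_should_send_event(struct fsnotify_group *group,
				      struct inode *to_tell,
				      struct fsnotify_mark *inode_mark,
				      struct fsnotify_mark *vfsmount_mark,
				      __u32 mask, void *data, int data_type)
{
	return false;		/* a real backend tests its marks' masks here */
}

static const struct fsnotify_ops example_ops = {
	.handle_event		= example_handle_event,
	.should_send_event	= example_should_send_event,
	.free_group_priv	= NULL,
	.free_event_priv	= NULL,
	.freeing_mark		= NULL,
};

static int __init example_init(void)
{
	/* returned with one reference held, like dnotify_group above */
	example_group = fsnotify_alloc_group(&example_ops);
	if (IS_ERR(example_group))
		return PTR_ERR(example_group);
	return 0;
}

static void __exit example_exit(void)
{
	/* drop the initial reference; the group is destroyed once unused */
	fsnotify_put_group(example_group);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");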
+ */ +static void fsnotify_recalc_inode_mask_locked(struct inode *inode) +{ + struct fsnotify_mark *mark; + struct hlist_node *pos; + __u32 new_mask = 0; + + assert_spin_locked(&inode->i_lock); + + hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) + new_mask |= mark->mask; + inode->i_fsnotify_mask = new_mask; +} + +/* + * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types + * any notifier is interested in hearing for this inode. + */ +void fsnotify_recalc_inode_mask(struct inode *inode) +{ + spin_lock(&inode->i_lock); + fsnotify_recalc_inode_mask_locked(inode); + spin_unlock(&inode->i_lock); + + __fsnotify_update_child_dentry_flags(inode); +} + +void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) +{ + struct inode *inode = mark->i.inode; + + assert_spin_locked(&mark->lock); + assert_spin_locked(&mark->group->mark_lock); + + spin_lock(&inode->i_lock); + + hlist_del_init_rcu(&mark->i.i_list); + mark->i.inode = NULL; + + /* + * this mark is now off the inode->i_fsnotify_marks list and we + * hold the inode->i_lock, so this is the perfect time to update the + * inode->i_fsnotify_mask + */ + fsnotify_recalc_inode_mask_locked(inode); + + spin_unlock(&inode->i_lock); +} + +/* + * Given an inode, destroy all of the marks associated with that inode. + */ +void fsnotify_clear_marks_by_inode(struct inode *inode) +{ + struct fsnotify_mark *mark, *lmark; + struct hlist_node *pos, *n; + LIST_HEAD(free_list); + + spin_lock(&inode->i_lock); + hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) { + list_add(&mark->i.free_i_list, &free_list); + hlist_del_init_rcu(&mark->i.i_list); + fsnotify_get_mark(mark); + } + spin_unlock(&inode->i_lock); + + list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { + fsnotify_destroy_mark(mark); + fsnotify_put_mark(mark); + } +} + +/* + * Given a group clear all of the inode marks associated with that group. + */ +void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); +} + +/* + * given a group and inode, find the mark associated with that combination. + * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group, + struct inode *inode) +{ + struct fsnotify_mark *mark; + struct hlist_node *pos; + + assert_spin_locked(&inode->i_lock); + + hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) { + if (mark->group == group) { + fsnotify_get_mark(mark); + return mark; + } + } + return NULL; +} + +/* + * given a group and inode, find the mark associated with that combination. + * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, + struct inode *inode) +{ + struct fsnotify_mark *mark; + + spin_lock(&inode->i_lock); + mark = fsnotify_find_inode_mark_locked(group, inode); + spin_unlock(&inode->i_lock); + + return mark; +} + +/* + * If we are setting a mark mask on an inode mark we should pin the inode + * in memory. 
+ */ +void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, + __u32 mask) +{ + struct inode *inode; + + assert_spin_locked(&mark->lock); + + if (mask && + mark->i.inode && + !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { + mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; + inode = igrab(mark->i.inode); + /* + * we shouldn't be able to get here if the inode wasn't + * already safely held in memory. But bug in case it + * ever is wrong. + */ + BUG_ON(!inode); + } +} + +/* + * Attach an initialized mark to a given inode. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which group and for which inodes. These + * marks are ordered according to priority, highest number first, and then by + * the group's location in memory. + */ +int fsnotify_add_inode_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + int allow_dups) +{ + struct fsnotify_mark *lmark; + struct hlist_node *node, *last = NULL; + int ret = 0; + + mark->flags |= FSNOTIFY_MARK_FLAG_INODE; + + assert_spin_locked(&mark->lock); + assert_spin_locked(&group->mark_lock); + + spin_lock(&inode->i_lock); + + mark->i.inode = inode; + + /* is mark the first mark? */ + if (hlist_empty(&inode->i_fsnotify_marks)) { + hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks); + goto out; + } + + /* should mark be in the middle of the current list? */ + hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) { + last = node; + + if ((lmark->group == group) && !allow_dups) { + ret = -EEXIST; + goto out; + } + + if (mark->group->priority < lmark->group->priority) + continue; + + if ((mark->group->priority == lmark->group->priority) && + (mark->group < lmark->group)) + continue; + + hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); + goto out; + } + + BUG_ON(last == NULL); + /* mark should be the last entry. last is the current last entry */ + hlist_add_after_rcu(last, &mark->i.i_list); +out: + fsnotify_recalc_inode_mask_locked(inode); + spin_unlock(&inode->i_lock); + + return ret; +} + +/** + * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @list: list of inodes being unmounted (sb->s_inodes) + * + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. + */ +void fsnotify_unmount_inodes(struct list_head *list) +{ + struct inode *inode, *next_i, *need_iput = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + struct inode *need_iput_tmp; + + /* + * We cannot __iget() an inode in state I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } + + need_iput_tmp = need_iput; + need_iput = NULL; + + /* In case fsnotify_inode_delete() drops a reference. 
*/ + if (inode != need_iput_tmp) + __iget(inode); + else + need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); + + /* In case the dropping of a reference would nuke next_i. */ + if ((&next_i->i_sb_list != list) && + atomic_read(&next_i->i_count)) { + spin_lock(&next_i->i_lock); + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + spin_unlock(&next_i->i_lock); + } + + /* + * We can safely drop inode_sb_list_lock here because we hold + * references on both inode and next_i. Also no new inodes + * will be added since the umount has begun. + */ + spin_unlock(&inode_sb_list_lock); + + if (need_iput_tmp) + iput(need_iput_tmp); + + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + + fsnotify_inode_delete(inode); + + iput(inode); + + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); +} diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig new file mode 100644 index 00000000..b981fc0c --- /dev/null +++ b/fs/notify/inotify/Kconfig @@ -0,0 +1,17 @@ +config INOTIFY_USER + bool "Inotify support for userspace" + select ANON_INODES + select FSNOTIFY + default y + ---help--- + Say Y here to enable inotify support for userspace, including the + associated system calls. Inotify allows monitoring of both files and + directories via a single open fd. Events are read from the file + descriptor, which is also select()- and poll()-able. + Inotify fixes numerous shortcomings in dnotify and introduces several + new features including multiple file events, one-shot support, and + unmount notification. + + For more information, see + + If unsure, say Y. diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile new file mode 100644 index 00000000..a380dabe --- /dev/null +++ b/fs/notify/inotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h new file mode 100644 index 00000000..b6642e4d --- /dev/null +++ b/fs/notify/inotify/inotify.h @@ -0,0 +1,21 @@ +#include +#include +#include /* struct kmem_cache */ + +extern struct kmem_cache *event_priv_cachep; + +struct inotify_event_private_data { + struct fsnotify_event_private_data fsnotify_event_priv_data; + int wd; +}; + +struct inotify_inode_mark { + struct fsnotify_mark fsn_mark; + int wd; +}; + +extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, + struct fsnotify_group *group); +extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); + +extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c new file mode 100644 index 00000000..e3cbd746 --- /dev/null +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -0,0 +1,222 @@ +/* + * fs/inotify_user.c - inotify support for userspace + * + * Authors: + * John McCutchan + * Robert Love + * + * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. + * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include /* d_unlinked */ +#include /* struct inode */ +#include +#include +#include /* struct path */ +#include /* kmem_* */ +#include +#include + +#include "inotify.h" + +/* + * Check if 2 events contain the same information. We do not compare private data + * but at this moment that isn't a problem for any know fsnotify listeners. + */ +static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +{ + if ((old->mask == new->mask) && + (old->to_tell == new->to_tell) && + (old->data_type == new->data_type) && + (old->name_len == new->name_len)) { + switch (old->data_type) { + case (FSNOTIFY_EVENT_INODE): + /* remember, after old was put on the wait_q we aren't + * allowed to look at the inode any more, only thing + * left to check was if the file_name is the same */ + if (!old->name_len || + !strcmp(old->file_name, new->file_name)) + return true; + break; + case (FSNOTIFY_EVENT_PATH): + if ((old->path.mnt == new->path.mnt) && + (old->path.dentry == new->path.dentry)) + return true; + break; + case (FSNOTIFY_EVENT_NONE): + if (old->mask & FS_Q_OVERFLOW) + return true; + else if (old->mask & FS_IN_IGNORED) + return false; + return true; + }; + } + return false; +} + +static struct fsnotify_event *inotify_merge(struct list_head *list, + struct fsnotify_event *event) +{ + struct fsnotify_event_holder *last_holder; + struct fsnotify_event *last_event; + + /* and the list better be locked by something too */ + spin_lock(&event->lock); + + last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); + last_event = last_holder->event; + if (event_compare(last_event, event)) + fsnotify_get_event(last_event); + else + last_event = NULL; + + spin_unlock(&event->lock); + + return last_event; +} + +static int inotify_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) +{ + struct inotify_inode_mark *i_mark; + struct inode *to_tell; + struct inotify_event_private_data *event_priv; + struct fsnotify_event_private_data *fsn_event_priv; + struct fsnotify_event *added_event; + int wd, ret = 0; + + BUG_ON(vfsmount_mark); + + pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, + event, event->to_tell, event->mask); + + to_tell = event->to_tell; + + i_mark = container_of(inode_mark, struct inotify_inode_mark, + fsn_mark); + wd = i_mark->wd; + + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + if (unlikely(!event_priv)) + return -ENOMEM; + + fsn_event_priv = &event_priv->fsnotify_event_priv_data; + + fsn_event_priv->group = group; + event_priv->wd = wd; + + added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); + if (added_event) { + inotify_free_event_priv(fsn_event_priv); + if (!IS_ERR(added_event)) + fsnotify_put_event(added_event); + else + ret = PTR_ERR(added_event); + } + + if (inode_mark->mask & IN_ONESHOT) + fsnotify_destroy_mark(inode_mark); + + return ret; +} + +static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) +{ + inotify_ignored_and_remove_idr(fsn_mark, group); +} + +static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + 
struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; + + if (d_unlinked(path->dentry)) + return false; + } + + return true; +} + +/* + * This is NEVER supposed to be called. Inotify marks should either have been + * removed from the idr when the watch was removed or in the + * fsnotify_destroy_mark_by_group() call when the inotify instance was being + * torn down. This is only called if the idr is about to be freed but there + * are still marks in it. + */ +static int idr_callback(int id, void *p, void *data) +{ + struct fsnotify_mark *fsn_mark; + struct inotify_inode_mark *i_mark; + static bool warned = false; + + if (warned) + return 0; + + warned = true; + fsn_mark = p; + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in " + "idr. Probably leaking memory\n", id, p, data); + + /* + * I'm taking the liberty of assuming that the mark in question is a + * valid address and I'm dereferencing it. This might help to figure + * out why we got here and the panic is no worse than the original + * BUG() that was here. + */ + if (fsn_mark) + printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", + fsn_mark->group, fsn_mark->i.inode, i_mark->wd); + return 0; +} + +static void inotify_free_group_priv(struct fsnotify_group *group) +{ + /* ideally the idr is empty and we won't hit the BUG in the callback */ + idr_for_each(&group->inotify_data.idr, idr_callback, group); + idr_remove_all(&group->inotify_data.idr); + idr_destroy(&group->inotify_data.idr); + atomic_dec(&group->inotify_data.user->inotify_devs); + free_uid(group->inotify_data.user); +} + +void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +{ + struct inotify_event_private_data *event_priv; + + + event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, + fsnotify_event_priv_data); + + kmem_cache_free(event_priv_cachep, event_priv); +} + +const struct fsnotify_ops inotify_fsnotify_ops = { + .handle_event = inotify_handle_event, + .should_send_event = inotify_should_send_event, + .free_group_priv = inotify_free_group_priv, + .free_event_priv = inotify_free_event_priv, + .freeing_mark = inotify_freeing_mark, +}; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c new file mode 100644 index 00000000..8445fbc8 --- /dev/null +++ b/fs/notify/inotify/inotify_user.c @@ -0,0 +1,867 @@ +/* + * fs/inotify_user.c - inotify support for userspace + * + * Authors: + * John McCutchan + * Robert Love + * + * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. + * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
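inotify_merge() above only ever compares the incoming event against the most recently queued one (via event_compare()) and reuses that event when they match, so back-to-back identical events collapse into a single queue entry. A simplified sketch of that tail-coalescing idea, with hypothetical types and without the refcounting and locking the real code does:

#include <string.h>

struct ev {
	int wd;
	unsigned int mask;
	const char *name;		/* NULL when the event has no name */
	struct ev *next;
};

static int ev_equal(const struct ev *a, const struct ev *b)
{
	if (a->wd != b->wd || a->mask != b->mask)
		return 0;
	if (!a->name || !b->name)
		return a->name == b->name;
	return strcmp(a->name, b->name) == 0;
}

/* returns 1 if @new duplicated the tail event and was coalesced away */
static int enqueue_or_merge(struct ev **tailp, struct ev *new)
{
	struct ev *tail = *tailp;

	if (tail && ev_equal(tail, new))
		return 1;		/* identical to the last queued event */
	new->next = NULL;
	if (tail)
		tail->next = new;
	*tailp = new;
	return 0;
}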
+ */ + +#include +#include /* struct inode */ +#include +#include +#include /* module_init */ +#include +#include /* roundup() */ +#include /* LOOKUP_FOLLOW */ +#include /* struct user */ +#include /* struct kmem_cache */ +#include +#include +#include +#include +#include +#include + +#include "inotify.h" + +#include + +/* these are configurable via /proc/sys/fs/inotify/ */ +static int inotify_max_user_instances __read_mostly; +static int inotify_max_queued_events __read_mostly; +static int inotify_max_user_watches __read_mostly; + +static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *event_priv_cachep __read_mostly; + +#ifdef CONFIG_SYSCTL + +#include + +static int zero; + +ctl_table inotify_table[] = { + { + .procname = "max_user_instances", + .data = &inotify_max_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_user_watches", + .data = &inotify_max_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_queued_events", + .data = &inotify_max_queued_events, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static inline __u32 inotify_arg_to_mask(u32 arg) +{ + __u32 mask; + + /* + * everything should accept their own ignored, cares about children, + * and should receive events when the inode is unmounted + */ + mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT); + + /* mask off the flags used to open the fd */ + mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)); + + return mask; +} + +static inline u32 inotify_mask_to_arg(__u32 mask) +{ + return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | + IN_Q_OVERFLOW); +} + +/* intofiy userspace file descriptor functions */ +static unsigned int inotify_poll(struct file *file, poll_table *wait) +{ + struct fsnotify_group *group = file->private_data; + int ret = 0; + + poll_wait(file, &group->notification_waitq, wait); + mutex_lock(&group->notification_mutex); + if (!fsnotify_notify_queue_is_empty(group)) + ret = POLLIN | POLLRDNORM; + mutex_unlock(&group->notification_mutex); + + return ret; +} + +/* + * Get an inotify_kernel_event if one exists and is small + * enough to fit in "count". Return an error pointer if + * not large enough. + * + * Called with the group->notification_mutex held. + */ +static struct fsnotify_event *get_one_event(struct fsnotify_group *group, + size_t count) +{ + size_t event_size = sizeof(struct inotify_event); + struct fsnotify_event *event; + + if (fsnotify_notify_queue_is_empty(group)) + return NULL; + + event = fsnotify_peek_notify_event(group); + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + if (event->name_len) + event_size += roundup(event->name_len + 1, event_size); + + if (event_size > count) + return ERR_PTR(-EINVAL); + + /* held the notification_mutex the whole time, so this is the + * same event we peeked above */ + fsnotify_remove_notify_event(group); + + return event; +} + +/* + * Copy an event to user space, returning how much we copied. + * + * We already checked that the event size is smaller than the + * buffer we had in "get_one_event()" above. 
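copy_event_to_user() below NUL-terminates the file name and pads it out to a multiple of sizeof(struct inotify_event) (16 bytes: wd, mask, cookie, len), and that padded size is what userspace sees in the len field. A small worked example of the arithmetic, assuming only that fixed 16-byte record size:

#include <stdio.h>

#define EVENT_SIZE 16U	/* sizeof(struct inotify_event): wd + mask + cookie + len */

static unsigned int padded_name_len(unsigned int name_len)
{
	if (!name_len)
		return 0;
	/* name_len + 1 for the '\0', rounded up to a multiple of EVENT_SIZE */
	return ((name_len + 1 + EVENT_SIZE - 1) / EVENT_SIZE) * EVENT_SIZE;
}

int main(void)
{
	/* "foo" -> len = 16, record = 32 bytes; a 17-char name -> len = 32, record = 48 */
	printf("%u %u\n", padded_name_len(3), padded_name_len(17));
	return 0;
}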
+ */ +static ssize_t copy_event_to_user(struct fsnotify_group *group, + struct fsnotify_event *event, + char __user *buf) +{ + struct inotify_event inotify_event; + struct fsnotify_event_private_data *fsn_priv; + struct inotify_event_private_data *priv; + size_t event_size = sizeof(struct inotify_event); + size_t name_len = 0; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + /* we get the inotify watch descriptor from the event private data */ + spin_lock(&event->lock); + fsn_priv = fsnotify_remove_priv_from_event(group, event); + spin_unlock(&event->lock); + + if (!fsn_priv) + inotify_event.wd = -1; + else { + priv = container_of(fsn_priv, struct inotify_event_private_data, + fsnotify_event_priv_data); + inotify_event.wd = priv->wd; + inotify_free_event_priv(fsn_priv); + } + + /* + * round up event->name_len so it is a multiple of event_size + * plus an extra byte for the terminating '\0'. + */ + if (event->name_len) + name_len = roundup(event->name_len + 1, event_size); + inotify_event.len = name_len; + + inotify_event.mask = inotify_mask_to_arg(event->mask); + inotify_event.cookie = event->sync_cookie; + + /* send the main event */ + if (copy_to_user(buf, &inotify_event, event_size)) + return -EFAULT; + + buf += event_size; + + /* + * fsnotify only stores the pathname, so here we have to send the pathname + * and then pad that pathname out to a multiple of sizeof(inotify_event) + * with zeros. I get my zeros from the nul_inotify_event. + */ + if (name_len) { + unsigned int len_to_zero = name_len - event->name_len; + /* copy the path name */ + if (copy_to_user(buf, event->file_name, event->name_len)) + return -EFAULT; + buf += event->name_len; + + /* fill userspace with 0's */ + if (clear_user(buf, len_to_zero)) + return -EFAULT; + buf += len_to_zero; + event_size += name_len; + } + + return event_size; +} + +static ssize_t inotify_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct fsnotify_group *group; + struct fsnotify_event *kevent; + char __user *start; + int ret; + DEFINE_WAIT(wait); + + start = buf; + group = file->private_data; + + while (1) { + prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); + + mutex_lock(&group->notification_mutex); + kevent = get_one_event(group, count); + mutex_unlock(&group->notification_mutex); + + pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent); + + if (kevent) { + ret = PTR_ERR(kevent); + if (IS_ERR(kevent)) + break; + ret = copy_event_to_user(group, kevent, buf); + fsnotify_put_event(kevent); + if (ret < 0) + break; + buf += ret; + count -= ret; + continue; + } + + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + break; + ret = -EINTR; + if (signal_pending(current)) + break; + + if (start != buf) + break; + + schedule(); + } + + finish_wait(&group->notification_waitq, &wait); + if (start != buf && ret != -EFAULT) + ret = buf - start; + return ret; +} + +static int inotify_fasync(int fd, struct file *file, int on) +{ + struct fsnotify_group *group = file->private_data; + + return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 
0 : -EIO; +} + +static int inotify_release(struct inode *ignored, struct file *file) +{ + struct fsnotify_group *group = file->private_data; + + pr_debug("%s: group=%p\n", __func__, group); + + fsnotify_clear_marks_by_group(group); + + /* free this group, matching get was inotify_init->fsnotify_obtain_group */ + fsnotify_put_group(group); + + return 0; +} + +static long inotify_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fsnotify_group *group; + struct fsnotify_event_holder *holder; + struct fsnotify_event *event; + void __user *p; + int ret = -ENOTTY; + size_t send_len = 0; + + group = file->private_data; + p = (void __user *) arg; + + pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd); + + switch (cmd) { + case FIONREAD: + mutex_lock(&group->notification_mutex); + list_for_each_entry(holder, &group->notification_list, event_list) { + event = holder->event; + send_len += sizeof(struct inotify_event); + if (event->name_len) + send_len += roundup(event->name_len + 1, + sizeof(struct inotify_event)); + } + mutex_unlock(&group->notification_mutex); + ret = put_user(send_len, (int __user *) p); + break; + } + + return ret; +} + +static const struct file_operations inotify_fops = { + .poll = inotify_poll, + .read = inotify_read, + .fasync = inotify_fasync, + .release = inotify_release, + .unlocked_ioctl = inotify_ioctl, + .compat_ioctl = inotify_ioctl, + .llseek = noop_llseek, +}; + + +/* + * find_inode - resolve a user-given path to a specific inode + */ +static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) +{ + int error; + + error = user_path_at(AT_FDCWD, dirname, flags, path); + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ + error = inode_permission(path->dentry->d_inode, MAY_READ); + if (error) + path_put(path); + return error; +} + +static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, + int *last_wd, + struct inotify_inode_mark *i_mark) +{ + int ret; + + do { + if (unlikely(!idr_pre_get(idr, GFP_KERNEL))) + return -ENOMEM; + + spin_lock(idr_lock); + ret = idr_get_new_above(idr, i_mark, *last_wd + 1, + &i_mark->wd); + /* we added the mark to the idr, take a reference */ + if (!ret) { + *last_wd = i_mark->wd; + fsnotify_get_mark(&i_mark->fsn_mark); + } + spin_unlock(idr_lock); + } while (ret == -EAGAIN); + + return ret; +} + +static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group, + int wd) +{ + struct idr *idr = &group->inotify_data.idr; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + struct inotify_inode_mark *i_mark; + + assert_spin_locked(idr_lock); + + i_mark = idr_find(idr, wd); + if (i_mark) { + struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark; + + fsnotify_get_mark(fsn_mark); + /* One ref for being in the idr, one ref we just took */ + BUG_ON(atomic_read(&fsn_mark->refcnt) < 2); + } + + return i_mark; +} + +static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group, + int wd) +{ + struct inotify_inode_mark *i_mark; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + + spin_lock(idr_lock); + i_mark = inotify_idr_find_locked(group, wd); + spin_unlock(idr_lock); + + return i_mark; +} + +static void do_inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark *i_mark) +{ + struct idr *idr = &group->inotify_data.idr; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + int wd = i_mark->wd; + + assert_spin_locked(idr_lock); + + idr_remove(idr, 
wd); + + /* removed from the idr, drop that ref */ + fsnotify_put_mark(&i_mark->fsn_mark); +} + +/* + * Remove the mark from the idr (if present) and drop the reference + * on the mark because it was in the idr. + */ +static void inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark *i_mark) +{ + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + struct inotify_inode_mark *found_i_mark = NULL; + int wd; + + spin_lock(idr_lock); + wd = i_mark->wd; + + /* + * does this i_mark think it is in the idr? we shouldn't get called + * if it wasn't.... + */ + if (wd == -1) { + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" + " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, + i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + goto out; + } + + /* Lets look in the idr to see if we find it */ + found_i_mark = inotify_idr_find_locked(group, wd); + if (unlikely(!found_i_mark)) { + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" + " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, + i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + goto out; + } + + /* + * We found an mark in the idr at the right wd, but it's + * not the mark we were told to remove. eparis seriously + * fucked up somewhere. + */ + if (unlikely(found_i_mark != i_mark)) { + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p " + "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " + "found_i_mark->group=%p found_i_mark->inode=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, + i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd, + found_i_mark->fsn_mark.group, + found_i_mark->fsn_mark.i.inode); + goto out; + } + + /* + * One ref for being in the idr + * one ref held by the caller trying to kill us + * one ref grabbed by inotify_idr_find + */ + if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { + printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" + " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, + i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + /* we can't really recover with bad ref cnting.. */ + BUG(); + } + + do_inotify_remove_from_idr(group, i_mark); +out: + /* match the ref taken by inotify_idr_find_locked() */ + if (found_i_mark) + fsnotify_put_mark(&found_i_mark->fsn_mark); + i_mark->wd = -1; + spin_unlock(idr_lock); +} + +/* + * Send IN_IGNORED for this wd, remove this wd from the idr. 
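The function that follows is what makes watch removal visible to userspace: the last event delivered for a watch descriptor carries IN_IGNORED, whether the watch was dropped explicitly with inotify_rm_watch() or implicitly (the inode going away, unmount). An illustrative userspace sketch of consuming that final event:

#include <sys/inotify.h>
#include <stdio.h>
#include <unistd.h>

/* read until the IN_IGNORED event for @wd arrives, e.g. after inotify_rm_watch() */
static void drain_until_ignored(int fd, int wd)
{
	char buf[4096];
	ssize_t len;

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		char *p = buf;

		while (p < buf + len) {
			struct inotify_event *ev = (struct inotify_event *)p;

			if (ev->wd == wd && (ev->mask & IN_IGNORED)) {
				printf("watch %d is gone\n", wd);
				return;
			}
			p += sizeof(*ev) + ev->len;
		}
	}
}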
+ */ +void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, + struct fsnotify_group *group) +{ + struct inotify_inode_mark *i_mark; + struct fsnotify_event *ignored_event, *notify_event; + struct inotify_event_private_data *event_priv; + struct fsnotify_event_private_data *fsn_event_priv; + int ret; + + ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_NOFS); + if (!ignored_event) + return; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); + if (unlikely(!event_priv)) + goto skip_send_ignore; + + fsn_event_priv = &event_priv->fsnotify_event_priv_data; + + fsn_event_priv->group = group; + event_priv->wd = i_mark->wd; + + notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); + if (notify_event) { + if (IS_ERR(notify_event)) + ret = PTR_ERR(notify_event); + else + fsnotify_put_event(notify_event); + inotify_free_event_priv(fsn_event_priv); + } + +skip_send_ignore: + + /* matches the reference taken when the event was created */ + fsnotify_put_event(ignored_event); + + /* remove this mark from the idr */ + inotify_remove_from_idr(group, i_mark); + + atomic_dec(&group->inotify_data.user->inotify_watches); +} + +/* ding dong the mark is dead */ +static void inotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct inotify_inode_mark *i_mark; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + kmem_cache_free(inotify_inode_mark_cachep, i_mark); +} + +static int inotify_update_existing_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct fsnotify_mark *fsn_mark; + struct inotify_inode_mark *i_mark; + __u32 old_mask, new_mask; + __u32 mask; + int add = (arg & IN_MASK_ADD); + int ret; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!(mask & IN_ALL_EVENTS))) + return -EINVAL; + + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) + return -ENOENT; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + spin_lock(&fsn_mark->lock); + + old_mask = fsn_mark->mask; + if (add) + fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask)); + else + fsnotify_set_mark_mask_locked(fsn_mark, mask); + new_mask = fsn_mark->mask; + + spin_unlock(&fsn_mark->lock); + + if (old_mask != new_mask) { + /* more bits in old than in new? */ + int dropped = (old_mask & ~new_mask); + /* more bits in this fsn_mark than the inode's mask? 
*/ + int do_inode = (new_mask & ~inode->i_fsnotify_mask); + + /* update the inode with this new fsn_mark */ + if (dropped || do_inode) + fsnotify_recalc_inode_mask(inode); + + } + + /* return the wd */ + ret = i_mark->wd; + + /* match the get from fsnotify_find_mark() */ + fsnotify_put_mark(fsn_mark); + + return ret; +} + +static int inotify_new_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct inotify_inode_mark *tmp_i_mark; + __u32 mask; + int ret; + struct idr *idr = &group->inotify_data.idr; + spinlock_t *idr_lock = &group->inotify_data.idr_lock; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!(mask & IN_ALL_EVENTS))) + return -EINVAL; + + tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_i_mark)) + return -ENOMEM; + + fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark); + tmp_i_mark->fsn_mark.mask = mask; + tmp_i_mark->wd = -1; + + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) + goto out_err; + + ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd, + tmp_i_mark); + if (ret) + goto out_err; + + /* we are on the idr, now get on the inode */ + ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); + if (ret) { + /* we failed to get on the inode, get off the idr */ + inotify_remove_from_idr(group, tmp_i_mark); + goto out_err; + } + + /* increment the number of watches the user has */ + atomic_inc(&group->inotify_data.user->inotify_watches); + + /* return the watch descriptor for this new mark */ + ret = tmp_i_mark->wd; + +out_err: + /* match the ref from fsnotify_init_mark() */ + fsnotify_put_mark(&tmp_i_mark->fsn_mark); + + return ret; +} + +static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +{ + int ret = 0; + +retry: + /* try to update and existing watch with the new arg */ + ret = inotify_update_existing_watch(group, inode, arg); + /* no mark present, try to add a new one */ + if (ret == -ENOENT) + ret = inotify_new_watch(group, inode, arg); + /* + * inotify_new_watch could race with another thread which did an + * inotify_new_watch between the update_existing and the add watch + * here, go back and try to update an existing mark again. + */ + if (ret == -EEXIST) + goto retry; + + return ret; +} + +static struct fsnotify_group *inotify_new_group(unsigned int max_events) +{ + struct fsnotify_group *group; + + group = fsnotify_alloc_group(&inotify_fsnotify_ops); + if (IS_ERR(group)) + return group; + + group->max_events = max_events; + + spin_lock_init(&group->inotify_data.idr_lock); + idr_init(&group->inotify_data.idr); + group->inotify_data.last_wd = 0; + group->inotify_data.fa = NULL; + group->inotify_data.user = get_current_user(); + + if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > + inotify_max_user_instances) { + fsnotify_put_group(group); + return ERR_PTR(-EMFILE); + } + + return group; +} + + +/* inotify syscalls */ +SYSCALL_DEFINE1(inotify_init1, int, flags) +{ + struct fsnotify_group *group; + int ret; + + /* Check the IN_* constants for consistency. 
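Seen from userspace, inotify_update_existing_watch() above gives inotify_add_watch() its update semantics: a second call on the same inode returns the same watch descriptor and replaces the mask, unless IN_MASK_ADD is set, in which case the new bits are OR-ed into the existing mask. An illustrative snippet (the path and masks are arbitrary):

#include <sys/inotify.h>

static void update_watch_example(int fd)
{
	/* first watch on /tmp: reports IN_CREATE */
	int wd1 = inotify_add_watch(fd, "/tmp", IN_CREATE);
	/* same inode, no IN_MASK_ADD: wd2 == wd1, mask replaced, now only IN_DELETE */
	int wd2 = inotify_add_watch(fd, "/tmp", IN_DELETE);
	/* IN_MASK_ADD: wd3 == wd1, mask extended to IN_DELETE | IN_MODIFY */
	int wd3 = inotify_add_watch(fd, "/tmp", IN_MODIFY | IN_MASK_ADD);

	(void)wd1; (void)wd2; (void)wd3;
}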
*/ + BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK); + + if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) + return -EINVAL; + + /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ + group = inotify_new_group(inotify_max_queued_events); + if (IS_ERR(group)) + return PTR_ERR(group); + + ret = anon_inode_getfd("inotify", &inotify_fops, group, + O_RDONLY | flags); + if (ret < 0) + fsnotify_put_group(group); + + return ret; +} + +SYSCALL_DEFINE0(inotify_init) +{ + return sys_inotify_init1(0); +} + +SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, + u32, mask) +{ + struct fsnotify_group *group; + struct inode *inode; + struct path path; + struct file *filp; + int ret, fput_needed; + unsigned flags = 0; + + filp = fget_light(fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + if (unlikely(filp->f_op != &inotify_fops)) { + ret = -EINVAL; + goto fput_and_out; + } + + if (!(mask & IN_DONT_FOLLOW)) + flags |= LOOKUP_FOLLOW; + if (mask & IN_ONLYDIR) + flags |= LOOKUP_DIRECTORY; + + ret = inotify_find_inode(pathname, &path, flags); + if (ret) + goto fput_and_out; + + /* inode held in place by reference to path; group by fget on fd */ + inode = path.dentry->d_inode; + group = filp->private_data; + + /* create/update an inode mark */ + ret = inotify_update_watch(group, inode, mask); + path_put(&path); +fput_and_out: + fput_light(filp, fput_needed); + return ret; +} + +SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) +{ + struct fsnotify_group *group; + struct inotify_inode_mark *i_mark; + struct file *filp; + int ret = 0, fput_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + ret = -EINVAL; + if (unlikely(filp->f_op != &inotify_fops)) + goto out; + + group = filp->private_data; + + ret = -EINVAL; + i_mark = inotify_idr_find(group, wd); + if (unlikely(!i_mark)) + goto out; + + ret = 0; + + fsnotify_destroy_mark(&i_mark->fsn_mark); + + /* match ref taken by inotify_idr_find */ + fsnotify_put_mark(&i_mark->fsn_mark); + +out: + fput_light(filp, fput_needed); + return ret; +} + +/* + * inotify_user_setup - Our initialization function. Note that we cannot return + * error because we have compiled-in VFS hooks. So an (unlikely) failure here + * must result in panic(). 
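The setup code that follows installs the default limits (128 instances, 8192 watches per user, 16384 queued events), and the sysctl table registered earlier exposes them under /proc/sys/fs/inotify/. Exceeding them shows up in userspace as EMFILE from inotify_init() or ENOSPC from inotify_add_watch(). A small illustrative reader of the current values, using the paths as exported by this patch:

#include <stdio.h>

int main(void)
{
	static const char *names[] = {
		"max_user_instances", "max_user_watches", "max_queued_events",
	};
	char path[128];
	int i, val;

	for (i = 0; i < 3; i++) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/sys/fs/inotify/%s", names[i]);
		f = fopen(path, "r");
		if (f) {
			if (fscanf(f, "%d", &val) == 1)
				printf("%s = %d\n", names[i], val);
			fclose(f);
		}
	}
	return 0;
}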
+ */ +static int __init inotify_user_setup(void) +{ + BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); + BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(IN_OPEN != FS_OPEN); + BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); + BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); + BUILD_BUG_ON(IN_CREATE != FS_CREATE); + BUILD_BUG_ON(IN_DELETE != FS_DELETE); + BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); + BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); + BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); + BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); + BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); + BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); + BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); + + BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); + + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); + event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); + + inotify_max_queued_events = 16384; + inotify_max_user_instances = 128; + inotify_max_user_watches = 8192; + + return 0; +} +module_init(inotify_user_setup); diff --git a/fs/notify/mark.c b/fs/notify/mark.c new file mode 100644 index 00000000..42ed1957 --- /dev/null +++ b/fs/notify/mark.c @@ -0,0 +1,372 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * fsnotify inode mark locking/lifetime/and refcnting + * + * REFCNT: + * The mark->refcnt tells how many "things" in the kernel currently are + * referencing this object. The object typically will live inside the kernel + * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task + * which can find this object holding the appropriete locks, can take a reference + * and the object itself is guaranteed to survive until the reference is dropped. + * + * LOCKING: + * There are 3 spinlocks involved with fsnotify inode marks and they MUST + * be taken in order as follows: + * + * mark->lock + * group->mark_lock + * inode->i_lock + * + * mark->lock protects 2 things, mark->group and mark->inode. You must hold + * that lock to dereference either of these things (they could be NULL even with + * the lock) + * + * group->mark_lock protects the marks_list anchored inside a given group + * and each mark is hooked via the g_list. It also sorta protects the + * free_g_list, which when used is anchored by a private list on the stack of the + * task which held the group->mark_lock. + * + * inode->i_lock protects the i_fsnotify_marks list anchored inside a + * given inode and each mark is hooked via the i_list. (and sorta the + * free_i_list) + * + * + * LIFETIME: + * Inode marks survive between when they are added to an inode and when their + * refcnt==0. 
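The locking rules spelled out in the comment above boil down to one fixed order: mark->lock, then group->mark_lock, then inode->i_lock. A self-contained userspace sketch of that ordering with pthread mutexes and hypothetical types; the real code uses spinlocks, but the deadlock-avoidance argument is the same:

#include <pthread.h>

struct group { pthread_mutex_t mark_lock; };
struct inode { pthread_mutex_t i_lock; };
struct mark {
	pthread_mutex_t lock;
	struct group *group;
	struct inode *inode;
};

static void detach_mark(struct mark *m)
{
	pthread_mutex_lock(&m->lock);			/* 1: mark->lock */
	pthread_mutex_lock(&m->group->mark_lock);	/* 2: group->mark_lock */
	pthread_mutex_lock(&m->inode->i_lock);		/* 3: inode->i_lock */

	/* ... unhook the mark from the group's and the inode's lists ... */

	pthread_mutex_unlock(&m->inode->i_lock);
	pthread_mutex_unlock(&m->group->mark_lock);
	pthread_mutex_unlock(&m->lock);
}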
+ * + * The inode mark can be cleared for a number of different reasons including: + * - The inode is unlinked for the last time. (fsnotify_inode_remove) + * - The inode is being evicted from cache. (fsnotify_inode_delete) + * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) + * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) + * - The fsnotify_group associated with the mark is going away and all such marks + * need to be cleaned up. (fsnotify_clear_marks_by_group) + * + * Worst case we are given an inode and need to clean up all the marks on that + * inode. We take i_lock and walk the i_fsnotify_marks safely. For each + * mark on the list we take a reference (so the mark can't disappear under us). + * We remove that mark form the inode's list of marks and we add this mark to a + * private list anchored on the stack using i_free_list; At this point we no + * longer fear anything finding the mark using the inode's list of marks. + * + * We can safely and locklessly run the private list on the stack of everything + * we just unattached from the original inode. For each mark on the private list + * we grab the mark-> and can thus dereference mark->group and mark->inode. If + * we see the group and inode are not NULL we take those locks. Now holding all + * 3 locks we can completely remove the mark from other tasks finding it in the + * future. Remember, 10 things might already be referencing this mark, but they + * better be holding a ref. We drop our reference we took before we unhooked it + * from the inode. When the ref hits 0 we can free the mark. + * + * Very similarly for freeing by group, except we use free_g_list. + * + * This has the very interesting property of being able to run concurrently with + * any (or all) other directions. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +struct srcu_struct fsnotify_mark_srcu; +static DEFINE_SPINLOCK(destroy_lock); +static LIST_HEAD(destroy_list); +static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq); + +void fsnotify_get_mark(struct fsnotify_mark *mark) +{ + atomic_inc(&mark->refcnt); +} + +void fsnotify_put_mark(struct fsnotify_mark *mark) +{ + if (atomic_dec_and_test(&mark->refcnt)) + mark->free_mark(mark); +} + +/* + * Any time a mark is getting freed we end up here. + * The caller had better be holding a reference to this mark so we don't actually + * do the final put under the mark->lock + */ +void fsnotify_destroy_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_group *group; + struct inode *inode = NULL; + + spin_lock(&mark->lock); + + group = mark->group; + + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + spin_unlock(&mark->lock); + return; + } + + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + + spin_lock(&group->mark_lock); + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { + inode = mark->i.inode; + fsnotify_destroy_inode_mark(mark); + } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) + fsnotify_destroy_vfsmount_mark(mark); + else + BUG(); + + list_del_init(&mark->g_list); + + spin_unlock(&group->mark_lock); + spin_unlock(&mark->lock); + + spin_lock(&destroy_lock); + list_add(&mark->destroy_list, &destroy_list); + spin_unlock(&destroy_lock); + wake_up(&destroy_waitq); + + /* + * Some groups like to know that marks are being freed. 
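fsnotify_get_mark()/fsnotify_put_mark() above are the plain reference-count pattern with a destructor supplied at init time, so whichever backend embeds the mark gets to free its containing object on the final put. A generic userspace sketch of that pattern (hypothetical names, C11 atomics standing in for the kernel's atomic_t):

#include <stdatomic.h>

struct obj {
	atomic_int refcnt;
	void (*free_obj)(struct obj *obj);
};

static void obj_init(struct obj *obj, void (*free_obj)(struct obj *))
{
	atomic_init(&obj->refcnt, 1);	/* the creator holds the first reference */
	obj->free_obj = free_obj;
}

static void obj_get(struct obj *obj)
{
	atomic_fetch_add(&obj->refcnt, 1);
}

static void obj_put(struct obj *obj)
{
	if (atomic_fetch_sub(&obj->refcnt, 1) == 1)
		obj->free_obj(obj);	/* last reference dropped: destructor frees it */
}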
This is a + * callback to the group function to let it know that this mark + * is being freed. + */ + if (group->ops->freeing_mark) + group->ops->freeing_mark(mark, group); + + /* + * __fsnotify_update_child_dentry_flags(inode); + * + * I really want to call that, but we can't, we have no idea if the inode + * still exists the second we drop the mark->lock. + * + * The next time an event arrive to this inode from one of it's children + * __fsnotify_parent will see that the inode doesn't care about it's + * children and will update all of these flags then. So really this + * is just a lazy update (and could be a perf win...) + */ + + if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) + iput(inode); + + /* + * We don't necessarily have a ref on mark from caller so the above iput + * may have already destroyed it. Don't touch from now on. + */ + + /* + * it's possible that this group tried to destroy itself, but this + * this mark was simultaneously being freed by inode. If that's the + * case, we finish freeing the group here. + */ + if (unlikely(atomic_dec_and_test(&group->num_marks))) + fsnotify_final_destroy_group(group); +} + +void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) +{ + assert_spin_locked(&mark->lock); + + mark->mask = mask; + + if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) + fsnotify_set_inode_mark_mask_locked(mark, mask); +} + +void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) +{ + assert_spin_locked(&mark->lock); + + mark->ignored_mask = mask; +} + +/* + * Attach an initialized mark to a given group and fs object. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which group. + */ +int fsnotify_add_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct inode *inode, + struct vfsmount *mnt, int allow_dups) +{ + int ret = 0; + + BUG_ON(inode && mnt); + BUG_ON(!inode && !mnt); + + /* + * LOCKING ORDER!!!! 
+ * mark->lock + * group->mark_lock + * inode->i_lock + */ + spin_lock(&mark->lock); + spin_lock(&group->mark_lock); + + mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; + + mark->group = group; + list_add(&mark->g_list, &group->marks_list); + atomic_inc(&group->num_marks); + fsnotify_get_mark(mark); /* for i_list and g_list */ + + if (inode) { + ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups); + if (ret) + goto err; + } else if (mnt) { + ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups); + if (ret) + goto err; + } else { + BUG(); + } + + spin_unlock(&group->mark_lock); + + /* this will pin the object if appropriate */ + fsnotify_set_mark_mask_locked(mark, mark->mask); + + spin_unlock(&mark->lock); + + if (inode) + __fsnotify_update_child_dentry_flags(inode); + + return ret; +err: + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + list_del_init(&mark->g_list); + mark->group = NULL; + atomic_dec(&group->num_marks); + + spin_unlock(&group->mark_lock); + spin_unlock(&mark->lock); + + spin_lock(&destroy_lock); + list_add(&mark->destroy_list, &destroy_list); + spin_unlock(&destroy_lock); + wake_up(&destroy_waitq); + + return ret; +} + +/* + * clear any marks in a group in which mark->flags & flags is true + */ +void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, + unsigned int flags) +{ + struct fsnotify_mark *lmark, *mark; + LIST_HEAD(free_list); + + spin_lock(&group->mark_lock); + list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { + if (mark->flags & flags) { + list_add(&mark->free_g_list, &free_list); + list_del_init(&mark->g_list); + fsnotify_get_mark(mark); + } + } + spin_unlock(&group->mark_lock); + + list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) { + fsnotify_destroy_mark(mark); + fsnotify_put_mark(mark); + } +} + +/* + * Given a group, destroy all of the marks associated with that group. + */ +void fsnotify_clear_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1); +} + +void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) +{ + assert_spin_locked(&old->lock); + new->i.inode = old->i.inode; + new->m.mnt = old->m.mnt; + new->group = old->group; + new->mask = old->mask; + new->free_mark = old->free_mark; +} + +/* + * Nothing fancy, just initialize lists and locks and counters. 
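fsnotify_clear_marks_by_group_flags() above uses a common teardown shape: while holding the lock, move every matching entry onto a private list, then drop the lock and do the expensive destruction from that private list. A userspace sketch of the shape, with hypothetical types and a mutex standing in for the group's spinlock:

#include <pthread.h>

struct node {
	unsigned int flags;
	struct node *next;
};

static void clear_matching(pthread_mutex_t *lock, struct node **head,
			   unsigned int flags, void (*destroy)(struct node *))
{
	struct node *keep = NULL, *doomed = NULL, *n, *next;

	pthread_mutex_lock(lock);
	for (n = *head; n; n = next) {
		next = n->next;
		if (n->flags & flags) {
			n->next = doomed;	/* unhook onto the private list */
			doomed = n;
		} else {
			n->next = keep;
			keep = n;
		}
	}
	*head = keep;
	pthread_mutex_unlock(lock);

	for (n = doomed; n; n = next) {		/* teardown runs without the lock */
		next = n->next;
		destroy(n);
	}
}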
+ */ +void fsnotify_init_mark(struct fsnotify_mark *mark, + void (*free_mark)(struct fsnotify_mark *mark)) +{ + memset(mark, 0, sizeof(*mark)); + spin_lock_init(&mark->lock); + atomic_set(&mark->refcnt, 1); + mark->free_mark = free_mark; +} + +static int fsnotify_mark_destroy(void *ignored) +{ + struct fsnotify_mark *mark, *next; + LIST_HEAD(private_destroy_list); + + for (;;) { + spin_lock(&destroy_lock); + /* exchange the list head */ + list_replace_init(&destroy_list, &private_destroy_list); + spin_unlock(&destroy_lock); + + synchronize_srcu(&fsnotify_mark_srcu); + + list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) { + list_del_init(&mark->destroy_list); + fsnotify_put_mark(mark); + } + + wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list)); + } + + return 0; +} + +static int __init fsnotify_mark_init(void) +{ + struct task_struct *thread; + + thread = kthread_run(fsnotify_mark_destroy, NULL, + "fsnotify_mark"); + if (IS_ERR(thread)) + panic("unable to start fsnotify mark destruction thread."); + + return 0; +} +device_initcall(fsnotify_mark_init); diff --git a/fs/notify/notification.c b/fs/notify/notification.c new file mode 100644 index 00000000..f39260f8 --- /dev/null +++ b/fs/notify/notification.c @@ -0,0 +1,464 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * Basic idea behind the notification queue: An fsnotify group (like inotify) + * sends the userspace notification about events asyncronously some time after + * the event happened. When inotify gets an event it will need to add that + * event to the group notify queue. Since a single event might need to be on + * multiple group's notification queues we can't add the event directly to each + * queue and instead add a small "event_holder" to each queue. This event_holder + * has a pointer back to the original event. Since the majority of events are + * going to end up on one, and only one, notification queue we embed one + * event_holder into each event. This means we have a single allocation instead + * of always needing two. If the embedded event_holder is already in use by + * another group a new event_holder (from fsnotify_event_holder_cachep) will be + * allocated and used. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +static struct kmem_cache *fsnotify_event_cachep; +static struct kmem_cache *fsnotify_event_holder_cachep; +/* + * This is a magic event we send when the q is too full. Since it doesn't + * hold real event information we just keep one system wide and use it any time + * it is needed. 
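The introductory comment in this file explains why each event embeds one event_holder: the common case of an event landing on exactly one group's queue then needs no extra allocation, and a separate holder is allocated only when the embedded one is already in use. A simplified userspace sketch of that idea (hypothetical types; the real code detects "in use" via list_empty() on the embedded holder and serializes on the queue mutex):

#include <stdlib.h>

struct event;

struct holder {
	struct event *event;		/* NULL while the embedded holder is unused */
	struct holder *next;
};

struct event {
	struct holder embedded;		/* zeroed when the event is allocated */
	int refcnt;
};

/* return the holder to link @ev into a queue with, or NULL on allocation failure */
static struct holder *holder_for(struct event *ev)
{
	struct holder *h;

	if (!ev->embedded.event) {
		ev->embedded.event = ev;	/* common case: reuse the embedded hook */
		return &ev->embedded;
	}
	h = malloc(sizeof(*h));			/* event already queued elsewhere */
	if (h)
		h->event = ev;
	return h;
}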
It's refcnt is set 1 at kernel init time and will never + * get set to 0 so it will never get 'freed' + */ +static struct fsnotify_event *q_overflow_event; +static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); + +/** + * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. + * Called from fsnotify_move, which is inlined into filesystem modules. + */ +u32 fsnotify_get_cookie(void) +{ + return atomic_inc_return(&fsnotify_sync_cookie); +} +EXPORT_SYMBOL_GPL(fsnotify_get_cookie); + +/* return true if the notify queue is empty, false otherwise */ +bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) +{ + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + return list_empty(&group->notification_list) ? true : false; +} + +void fsnotify_get_event(struct fsnotify_event *event) +{ + atomic_inc(&event->refcnt); +} + +void fsnotify_put_event(struct fsnotify_event *event) +{ + if (!event) + return; + + if (atomic_dec_and_test(&event->refcnt)) { + pr_debug("%s: event=%p\n", __func__, event); + + if (event->data_type == FSNOTIFY_EVENT_PATH) + path_put(&event->path); + + BUG_ON(!list_empty(&event->private_data_list)); + + kfree(event->file_name); + put_pid(event->tgid); + kmem_cache_free(fsnotify_event_cachep, event); + } +} + +struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) +{ + return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); +} + +void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) +{ + if (holder) + kmem_cache_free(fsnotify_event_holder_cachep, holder); +} + +/* + * Find the private data that the group previously attached to this event when + * the group added the event to the notification queue (fsnotify_add_notify_event) + */ +struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) +{ + struct fsnotify_event_private_data *lpriv; + struct fsnotify_event_private_data *priv = NULL; + + assert_spin_locked(&event->lock); + + list_for_each_entry(lpriv, &event->private_data_list, event_list) { + if (lpriv->group == group) { + priv = lpriv; + list_del(&priv->event_list); + break; + } + } + return priv; +} + +/* + * Add an event to the group notification queue. The group can later pull this + * event off the queue to deal with. If the event is successfully added to the + * group's notification queue, a reference is taken on event. + */ +struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, + struct fsnotify_event_private_data *priv, + struct fsnotify_event *(*merge)(struct list_head *, + struct fsnotify_event *)) +{ + struct fsnotify_event *return_event = NULL; + struct fsnotify_event_holder *holder = NULL; + struct list_head *list = &group->notification_list; + + pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); + + /* + * There is one fsnotify_event_holder embedded inside each fsnotify_event. + * Check if we expect to be able to use that holder. If not alloc a new + * holder. + * For the overflow event it's possible that something will use the in + * event holder before we get the lock so we may need to jump back and + * alloc a new holder, this can't happen for most events... 
+ */ + if (!list_empty(&event->holder.event_list)) { +alloc_holder: + holder = fsnotify_alloc_event_holder(); + if (!holder) + return ERR_PTR(-ENOMEM); + } + + mutex_lock(&group->notification_mutex); + + if (group->q_len >= group->max_events) { + event = q_overflow_event; + + /* + * we need to return the overflow event + * which means we need a ref + */ + fsnotify_get_event(event); + return_event = event; + + /* sorry, no private data on the overflow event */ + priv = NULL; + } + + if (!list_empty(list) && merge) { + struct fsnotify_event *tmp; + + tmp = merge(list, event); + if (tmp) { + mutex_unlock(&group->notification_mutex); + + if (return_event) + fsnotify_put_event(return_event); + if (holder != &event->holder) + fsnotify_destroy_event_holder(holder); + return tmp; + } + } + + spin_lock(&event->lock); + + if (list_empty(&event->holder.event_list)) { + if (unlikely(holder)) + fsnotify_destroy_event_holder(holder); + holder = &event->holder; + } else if (unlikely(!holder)) { + /* between the time we checked above and got the lock the in + * event holder was used, go back and get a new one */ + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + + if (return_event) { + fsnotify_put_event(return_event); + return_event = NULL; + } + + goto alloc_holder; + } + + group->q_len++; + holder->event = event; + + fsnotify_get_event(event); + list_add_tail(&holder->event_list, list); + if (priv) + list_add_tail(&priv->event_list, &event->private_data_list); + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + + wake_up(&group->notification_waitq); + return return_event; +} + +/* + * Remove and return the first event from the notification list. There is a + * reference held on this event since it was on the list. It is the responsibility + * of the caller to drop this reference. + */ +struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + struct fsnotify_event_holder *holder; + + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + pr_debug("%s: group=%p\n", __func__, group); + + holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); + + event = holder->event; + + spin_lock(&event->lock); + holder->event = NULL; + list_del_init(&holder->event_list); + spin_unlock(&event->lock); + + /* event == holder means we are referenced through the in event holder */ + if (holder != &event->holder) + fsnotify_destroy_event_holder(holder); + + group->q_len--; + + return event; +} + +/* + * This will not remove the event, that must be done with fsnotify_remove_notify_event() + */ +struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + struct fsnotify_event_holder *holder; + + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); + event = holder->event; + + return event; +} + +/* + * Called when a group is being torn down to clean up any outstanding + * event notifications. 
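The overflow branch above caps the queue at group->max_events (16384 by default in this patch) and substitutes the shared q_overflow_event, which reaches userspace as IN_Q_OVERFLOW with wd == -1 since the overflow event carries no private data. An illustrative userspace handler:

#include <sys/inotify.h>
#include <stdio.h>

static void handle_event(const struct inotify_event *ev)
{
	if (ev->mask & IN_Q_OVERFLOW) {
		/* events were dropped: resynchronize by rescanning watched paths */
		fprintf(stderr, "inotify queue overflowed (wd=%d), rescanning\n",
			ev->wd);
		return;
	}
	/* ... normal per-wd handling ... */
}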
+ */ +void fsnotify_flush_notify(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + struct fsnotify_event_private_data *priv; + + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + event = fsnotify_remove_notify_event(group); + /* if they don't implement free_event_priv they better not have attached any */ + if (group->ops->free_event_priv) { + spin_lock(&event->lock); + priv = fsnotify_remove_priv_from_event(group, event); + spin_unlock(&event->lock); + if (priv) + group->ops->free_event_priv(priv); + } + fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + } + mutex_unlock(&group->notification_mutex); +} + +static void initialize_event(struct fsnotify_event *event) +{ + INIT_LIST_HEAD(&event->holder.event_list); + atomic_set(&event->refcnt, 1); + + spin_lock_init(&event->lock); + + INIT_LIST_HEAD(&event->private_data_list); +} + +/* + * Caller damn well better be holding whatever mutex is protecting the + * old_holder->event_list and the new_event must be a clean event which + * cannot be found anywhere else in the kernel. + */ +int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, + struct fsnotify_event *new_event) +{ + struct fsnotify_event *old_event = old_holder->event; + struct fsnotify_event_holder *new_holder = &new_event->holder; + + enum event_spinlock_class { + SPINLOCK_OLD, + SPINLOCK_NEW, + }; + + pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); + + /* + * if the new_event's embedded holder is in use someone + * screwed up and didn't give us a clean new event. + */ + BUG_ON(!list_empty(&new_holder->event_list)); + + spin_lock_nested(&old_event->lock, SPINLOCK_OLD); + spin_lock_nested(&new_event->lock, SPINLOCK_NEW); + + new_holder->event = new_event; + list_replace_init(&old_holder->event_list, &new_holder->event_list); + + spin_unlock(&new_event->lock); + spin_unlock(&old_event->lock); + + /* event == holder means we are referenced through the in event holder */ + if (old_holder != &old_event->holder) + fsnotify_destroy_event_holder(old_holder); + + fsnotify_get_event(new_event); /* on the list take reference */ + fsnotify_put_event(old_event); /* off the list, drop reference */ + + return 0; +} + +struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) +{ + struct fsnotify_event *event; + + event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; + + pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); + + memcpy(event, old_event, sizeof(*event)); + initialize_event(event); + + if (event->name_len) { + event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); + if (!event->file_name) { + kmem_cache_free(fsnotify_event_cachep, event); + return NULL; + } + } + event->tgid = get_pid(old_event->tgid); + if (event->data_type == FSNOTIFY_EVENT_PATH) + path_get(&event->path); + + return event; +} + +/* + * fsnotify_create_event - Allocate a new event which will be sent to each + * group's handle_event function if the group was interested in this + * particular event. + * + * @to_tell the inode which is supposed to receive the event (sometimes a + * parent of the inode to which the event happened. + * @mask what actually happened. + * @data pointer to the object which was actually affected + * @data_type flag indication if the data is a file, path, inode, nothing... 
+ * @name the filename, if available + */ +struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, + int data_type, const unsigned char *name, + u32 cookie, gfp_t gfp) +{ + struct fsnotify_event *event; + + event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); + if (!event) + return NULL; + + pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", + __func__, event, to_tell, mask, data, data_type); + + initialize_event(event); + + if (name) { + event->file_name = kstrdup(name, gfp); + if (!event->file_name) { + kmem_cache_free(fsnotify_event_cachep, event); + return NULL; + } + event->name_len = strlen(event->file_name); + } + + event->tgid = get_pid(task_tgid(current)); + event->sync_cookie = cookie; + event->to_tell = to_tell; + event->data_type = data_type; + + switch (data_type) { + case FSNOTIFY_EVENT_PATH: { + struct path *path = data; + event->path.dentry = path->dentry; + event->path.mnt = path->mnt; + path_get(&event->path); + break; + } + case FSNOTIFY_EVENT_INODE: + event->inode = data; + break; + case FSNOTIFY_EVENT_NONE: + event->inode = NULL; + event->path.dentry = NULL; + event->path.mnt = NULL; + break; + default: + BUG(); + } + + event->mask = mask; + + return event; +} + +__init int fsnotify_notification_init(void) +{ + fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); + fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); + + q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_KERNEL); + if (!q_overflow_event) + panic("unable to allocate fsnotify q_overflow_event\n"); + + return 0; +} +subsys_initcall(fsnotify_notification_init); + diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c new file mode 100644 index 00000000..e86577d6 --- /dev/null +++ b/fs/notify/vfsmount_mark.c @@ -0,0 +1,190 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) +{ + struct fsnotify_mark *mark, *lmark; + struct hlist_node *pos, *n; + LIST_HEAD(free_list); + + spin_lock(&mnt->mnt_root->d_lock); + hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) { + list_add(&mark->m.free_m_list, &free_list); + hlist_del_init_rcu(&mark->m.m_list); + fsnotify_get_mark(mark); + } + spin_unlock(&mnt->mnt_root->d_lock); + + list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { + fsnotify_destroy_mark(mark); + fsnotify_put_mark(mark); + } +} + +void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT); +} + +/* + * Recalculate the mask of events relevant to a given vfsmount locked. + */ +static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + struct hlist_node *pos; + __u32 new_mask = 0; + + assert_spin_locked(&mnt->mnt_root->d_lock); + + hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) + new_mask |= mark->mask; + mnt->mnt_fsnotify_mask = new_mask; +} + +/* + * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types + * any notifier is interested in hearing for this mount point + */ +void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) +{ + spin_lock(&mnt->mnt_root->d_lock); + fsnotify_recalc_vfsmount_mask_locked(mnt); + spin_unlock(&mnt->mnt_root->d_lock); +} + +void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) +{ + struct vfsmount *mnt = mark->m.mnt; + + assert_spin_locked(&mark->lock); + assert_spin_locked(&mark->group->mark_lock); + + spin_lock(&mnt->mnt_root->d_lock); + + hlist_del_init_rcu(&mark->m.m_list); + mark->m.mnt = NULL; + + fsnotify_recalc_vfsmount_mask_locked(mnt); + + spin_unlock(&mnt->mnt_root->d_lock); +} + +static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group, + struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + struct hlist_node *pos; + + assert_spin_locked(&mnt->mnt_root->d_lock); + + hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) { + if (mark->group == group) { + fsnotify_get_mark(mark); + return mark; + } + } + return NULL; +} + +/* + * given a group and vfsmount, find the mark associated with that combination. + * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, + struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + + spin_lock(&mnt->mnt_root->d_lock); + mark = fsnotify_find_vfsmount_mark_locked(group, mnt); + spin_unlock(&mnt->mnt_root->d_lock); + + return mark; +} + +/* + * Attach an initialized mark to a given group and vfsmount. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which groups. + */ +int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, struct vfsmount *mnt, + int allow_dups) +{ + struct fsnotify_mark *lmark; + struct hlist_node *node, *last = NULL; + int ret = 0; + + mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; + + assert_spin_locked(&mark->lock); + assert_spin_locked(&group->mark_lock); + + spin_lock(&mnt->mnt_root->d_lock); + + mark->m.mnt = mnt; + + /* is mark the first mark? 
*/ + if (hlist_empty(&mnt->mnt_fsnotify_marks)) { + hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks); + goto out; + } + + /* should mark be in the middle of the current list? */ + hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) { + last = node; + + if ((lmark->group == group) && !allow_dups) { + ret = -EEXIST; + goto out; + } + + if (mark->group->priority < lmark->group->priority) + continue; + + if ((mark->group->priority == lmark->group->priority) && + (mark->group < lmark->group)) + continue; + + hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); + goto out; + } + + BUG_ON(last == NULL); + /* mark should be the last entry. last is the current last entry */ + hlist_add_after_rcu(last, &mark->m.m_list); +out: + fsnotify_recalc_vfsmount_mask_locked(mnt); + spin_unlock(&mnt->mnt_root->d_lock); + + return ret; +} diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig new file mode 100644 index 00000000..f5a868cc --- /dev/null +++ b/fs/ntfs/Kconfig @@ -0,0 +1,78 @@ +config NTFS_FS + tristate "NTFS file system support" + select NLS + help + NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. + + Saying Y or M here enables read support. There is partial, but + safe, write support available. For write support you must also + say Y to "NTFS write support" below. + + There are also a number of user-space tools available, called + ntfsprogs. These include ntfsundelete and ntfsresize, that work + without NTFS support enabled in the kernel. + + This is a rewrite from scratch of Linux NTFS support and replaced + the old NTFS code starting with Linux 2.5.11. A backport to + the Linux 2.4 kernel series is separately available as a patch + from the project web site. + + For more information see + and . + + To compile this file system support as a module, choose M here: the + module will be called ntfs. + + If you are not using Windows NT, 2000, XP or 2003 in addition to + Linux on your computer it is safe to say N. + +config NTFS_DEBUG + bool "NTFS debugging support" + depends on NTFS_FS + help + If you are experiencing any problems with the NTFS file system, say + Y here. This will result in additional consistency checks to be + performed by the driver as well as additional debugging messages to + be written to the system log. Note that debugging messages are + disabled by default. To enable them, supply the option debug_msgs=1 + at the kernel command line when booting the kernel or as an option + to insmod when loading the ntfs module. Once the driver is active, + you can enable debugging messages by doing (as root): + echo 1 > /proc/sys/fs/ntfs-debug + Replacing the "1" with "0" would disable debug messages. + + If you leave debugging messages disabled, this results in little + overhead, but enabling debug messages results in very significant + slowdown of the system. + + When reporting bugs, please try to have available a full dump of + debugging messages while the misbehaviour was occurring. + +config NTFS_RW + bool "NTFS write support" + depends on NTFS_FS + help + This enables the partial, but safe, write support in the NTFS driver. + + The only supported operation is overwriting existing files, without + changing the file length. No file or directory creation, deletion or + renaming is possible. Note only non-resident files can be written to + so you may find that some very small files (<500 bytes or so) cannot + be written to. 
+ + While we cannot guarantee that it will not damage any data, we have + so far not received a single report where the driver would have + damaged someones data so we assume it is perfectly safe to use. + + Note: While write support is safe in this version (a rewrite from + scratch of the NTFS support), it should be noted that the old NTFS + write support, included in Linux 2.5.10 and before (since 1997), + is not safe. + + This is currently useful with TopologiLinux. TopologiLinux is run + on top of any DOS/Microsoft Windows system without partitioning your + hard disk. Unlike other Linux distributions TopologiLinux does not + need its own partition. For more information see + + + It is perfectly safe to say N here. diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile new file mode 100644 index 00000000..30206b23 --- /dev/null +++ b/fs/ntfs/Makefile @@ -0,0 +1,14 @@ +# Rules for making the NTFS driver. + +obj-$(CONFIG_NTFS_FS) += ntfs.o + +ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ + index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ + unistr.o upcase.o + +ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o + +ccflags-y := -DNTFS_VERSION=\"2.1.30\" +ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG +ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW + diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c new file mode 100644 index 00000000..0b1e885b --- /dev/null +++ b/fs/ntfs/aops.c @@ -0,0 +1,1638 @@ +/** + * aops.c - NTFS kernel address space operations and page cache handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aops.h" +#include "attrib.h" +#include "debug.h" +#include "inode.h" +#include "mft.h" +#include "runlist.h" +#include "types.h" +#include "ntfs.h" + +/** + * ntfs_end_buffer_async_read - async io completion for reading attributes + * @bh: buffer head on which io is completed + * @uptodate: whether @bh is now uptodate or not + * + * Asynchronous I/O completion handler for reading pages belonging to the + * attribute address space of an inode. The inodes can either be files or + * directories or they can be fake inodes describing some attribute. + * + * If NInoMstProtected(), perform the post read mst fixups when all IO on the + * page has been completed and mark the page uptodate or set the error bit on + * the page. 
To determine the size of the records that need fixing up, we + * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs + * record size, and index_block_size_bits, to the log(base 2) of the ntfs + * record size. + */ +static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first, *tmp; + struct page *page; + struct inode *vi; + ntfs_inode *ni; + int page_uptodate = 1; + + page = bh->b_page; + vi = page->mapping->host; + ni = NTFS_I(vi); + + if (likely(uptodate)) { + loff_t i_size; + s64 file_ofs, init_size; + + set_buffer_uptodate(bh); + + file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); + read_lock_irqsave(&ni->size_lock, flags); + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + /* Check for the current buffer head overflowing. */ + if (unlikely(file_ofs + bh->b_size > init_size)) { + int ofs; + void *kaddr; + + ofs = 0; + if (file_ofs < init_size) + ofs = init_size - file_ofs; + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); + memset(kaddr + bh_offset(bh) + ofs, 0, + bh->b_size - ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); + local_irq_restore(flags); + } + } else { + clear_buffer_uptodate(bh); + SetPageError(page); + ntfs_error(ni->vol->sb, "Buffer I/O error, logical block " + "0x%llx.", (unsigned long long)bh->b_blocknr); + } + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + if (likely(buffer_locked(tmp))) + goto still_busy; + /* Async buffers must be locked. */ + BUG(); + } + tmp = tmp->b_this_page; + } while (tmp != bh); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + /* + * If none of the buffers had errors then we can set the page uptodate, + * but we first have to perform the post read mst fixups, if the + * attribute is mst protected, i.e. if NInoMstProteced(ni) is true. + * Note we ignore fixup errors as those are detected when + * map_mft_record() is called which gives us per record granularity + * rather than per page granularity. + */ + if (!NInoMstProtected(ni)) { + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } else { + u8 *kaddr; + unsigned int i, recs; + u32 rec_size; + + rec_size = ni->itype.index.block_size; + recs = PAGE_CACHE_SIZE / rec_size; + /* Should have been verified before we got here... */ + BUG_ON(!recs); + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); + for (i = 0; i < recs; i++) + post_read_mst_fixup((NTFS_RECORD*)(kaddr + + i * rec_size), rec_size); + kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); + local_irq_restore(flags); + flush_dcache_page(page); + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } + unlock_page(page); + return; +still_busy: + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + return; +} + +/** + * ntfs_read_block - fill a @page of an address space with data + * @page: page cache page to fill with data + * + * Fill the page @page of the address space belonging to the @page->host inode. 
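+ * The page is split into buffers of device block size and each buffer is looked up in the runlist of the inode to find its location on disk.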
+ * We read each buffer asynchronously and when all buffers are read in, our io + * completion handler ntfs_end_buffer_read_async(), if required, automatically + * applies the mst fixups to the page before finally marking it uptodate and + * unlocking it. + * + * We only enforce allocated_size limit because i_size is checked for in + * generic_file_read(). + * + * Return 0 on success and -errno on error. + * + * Contains an adapted version of fs/buffer.c::block_read_full_page(). + */ +static int ntfs_read_block(struct page *page) +{ + loff_t i_size; + VCN vcn; + LCN lcn; + s64 init_size; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + sector_t iblock, lblock, zblock; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int i, nr; + unsigned char blocksize_bits; + + vi = page->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + /* $MFT/$DATA must have its complete runlist in memory at all times. */ + BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); + + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, 0); + if (unlikely(!page_has_buffers(page))) { + unlock_page(page); + return -ENOMEM; + } + } + bh = head = page_buffers(page); + BUG_ON(!bh); + + /* + * We may be racing with truncate. To avoid some of the problems we + * now take a snapshot of the various sizes and use those for the whole + * of the function. In case of an extending truncate it just means we + * may leave some buffers unmapped which are now allocated. This is + * not a problem since these buffers will just get mapped when a write + * occurs. In case of a shrinking truncate, we will detect this later + * on due to the runlist being incomplete and if the page is being + * fully truncated, truncate will throw it away as soon as we unlock + * it so no need to worry what we do with it. + */ + iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + read_lock_irqsave(&ni->size_lock, flags); + lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + zblock = (init_size + blocksize - 1) >> blocksize_bits; + + /* Loop through all the buffers in the page. */ + rl = NULL; + nr = i = 0; + do { + int err = 0; + + if (unlikely(buffer_uptodate(bh))) + continue; + if (unlikely(buffer_mapped(bh))) { + arr[nr++] = bh; + continue; + } + bh->b_bdev = vol->sb->s_bdev; + /* Is the block within the allowed limits? */ + if (iblock < lblock) { + bool is_retry = false; + + /* Convert iblock into corresponding vcn and offset. */ + vcn = (VCN)iblock << blocksize_bits >> + vol->cluster_size_bits; + vcn_ofs = ((VCN)iblock << blocksize_bits) & + vol->cluster_size_mask; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + /* Only read initialized data blocks. 
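Blocks beyond the initialized size must read as zeroes, so they are zero-filled in memory (at handle_zblock below) instead of being read from disk.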
*/ + if (iblock < zblock) { + arr[nr++] = bh; + continue; + } + /* Fully non-initialized data block, zero it. */ + goto handle_zblock; + } + /* It is a hole, need to zero it. */ + if (lcn == LCN_HOLE) + goto handle_hole; + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, treat it as a + * hole. This can happen due to concurrent truncate + * for example. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + err = 0; + goto handle_hole; + } + /* Hard error, zero out region. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + SetPageError(page); + ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "offset 0x%x because its location on " + "disk could not be determined%s " + "(error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + } + /* + * Either iblock was outside lblock limits or + * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion + * of the page and set the buffer uptodate. + */ +handle_hole: + bh->b_blocknr = -1UL; + clear_buffer_mapped(bh); +handle_zblock: + zero_user(page, i * blocksize, blocksize); + if (likely(!err)) + set_buffer_uptodate(bh); + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Check we have at least one buffer ready for i/o. */ + if (nr) { + struct buffer_head *tbh; + + /* Lock the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + lock_buffer(tbh); + tbh->b_end_io = ntfs_end_buffer_async_read; + set_buffer_async_read(tbh); + } + /* Finally, start i/o on the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + if (likely(!buffer_uptodate(tbh))) + submit_bh(READ, tbh); + else + ntfs_end_buffer_async_read(tbh, 1); + } + return 0; + } + /* No i/o was scheduled on any of the buffers. */ + if (likely(!PageError(page))) + SetPageUptodate(page); + else /* Signal synchronous i/o error. */ + nr = -EIO; + unlock_page(page); + return nr; +} + +/** + * ntfs_readpage - fill a @page of a @file with data from the device + * @file: open file to which the page @page belongs or NULL + * @page: page cache page to fill with data + * + * For non-resident attributes, ntfs_readpage() fills the @page of the open + * file @file by calling the ntfs version of the generic block_read_full_page() + * function, ntfs_read_block(), which in turn creates and reads in the buffers + * associated with the page asynchronously. + * + * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the + * data from the mft record (which at this stage is most likely in memory) and + * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as + * even if the mft record is not cached at this point in time, we need to wait + * for it to be read in before we can do the copy. + * + * Return 0 on success and -errno on error. 
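+ * + * Note that access to encrypted attributes is denied with -EACCES and that compressed, non-resident data streams are handed off to ntfs_read_compressed_block().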
+ */ +static int ntfs_readpage(struct file *file, struct page *page) +{ + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + u8 *addr; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + unsigned long flags; + u32 attr_len; + int err = 0; + +retry_readpage: + BUG_ON(!PageLocked(page)); + vi = page->mapping->host; + i_size = i_size_read(vi); + /* Is the page fully outside i_size? (truncate in progress) */ + if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + ntfs_debug("Read outside i_size - truncated?"); + goto done; + } + /* + * This can potentially happen because we clear PageUptodate() during + * ntfs_writepage() of MstProtected() attributes. + */ + if (PageUptodate(page)) { + unlock_page(page); + return 0; + } + ni = NTFS_I(vi); + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If attribute is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + BUG_ON(ni->type != AT_DATA); + err = -EACCES; + goto err_out; + } + /* Compressed data streams are handled in compress.c. */ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + return ntfs_read_compressed_block(page); + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* Normal, non-resident data stream. */ + return ntfs_read_block(page); + } + /* + * Attribute is resident, implying it is not compressed or encrypted. + * This also means the attribute is smaller than an mft record and + * hence smaller than a page, so can simply zero out any pages with + * index above 0. Note the attribute can actually be marked compressed + * but if it is resident the actual data is not compressed so we are + * ok to ignore the compressed flag here. + */ + if (unlikely(page->index > 0)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + goto done; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + mrec = map_mft_record(base_ni); + if (IS_ERR(mrec)) { + err = PTR_ERR(mrec); + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the readpage. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_readpage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, mrec); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto put_unm_err_out; + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + read_lock_irqsave(&ni->size_lock, flags); + if (unlikely(attr_len > ni->initialized_size)) + attr_len = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate. */ + attr_len = i_size; + } + addr = kmap_atomic(page, KM_USER0); + /* Copy the data to the page. */ + memcpy(addr, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + attr_len); + /* Zero the remainder of the page. 
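The attribute value is smaller than a page, so the area past attr_len has to be cleared explicitly.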
*/ + memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + flush_dcache_page(page); + kunmap_atomic(addr, KM_USER0); +put_unm_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(base_ni); +done: + SetPageUptodate(page); +err_out: + unlock_page(page); + return err; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_block - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This function is for writing pages belonging to non-resident, non-mst + * protected attributes to their backing store. + * + * For a page with buffers, map and write the dirty buffers asynchronously + * under page writeback. For a page without buffers, create buffers for the + * page, then proceed as above. + * + * If a page doesn't have buffers the page dirty state is definitive. If a page + * does have buffers, the page dirty state is just a hint, and the buffer dirty + * state is definitive. (A hint which has rules: dirty buffers against a clean + * page is illegal. Other combinations are legal and need to be handled. In + * particular a dirty page containing clean buffers for example.) + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_read_block() and __block_write_full_page(). + */ +static int ntfs_write_block(struct page *page, struct writeback_control *wbc) +{ + VCN vcn; + LCN lcn; + s64 initialized_size; + loff_t i_size; + sector_t block, dblock, iblock; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int err; + bool need_end_writeback; + unsigned char blocksize_bits; + + vi = page->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", ni->mft_no, ni->type, page->index); + + BUG_ON(!NInoNonResident(ni)); + BUG_ON(NInoMstProtected(ni)); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + if (!page_has_buffers(page)) { + BUG_ON(!PageUptodate(page)); + create_empty_buffers(page, blocksize, + (1 << BH_Uptodate) | (1 << BH_Dirty)); + if (unlikely(!page_has_buffers(page))) { + ntfs_warning(vol->sb, "Error allocating page " + "buffers. Redirtying page so we try " + "again later."); + /* + * Put the page back on mapping->dirty_pages, but leave + * its buffers' dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } + bh = head = page_buffers(page); + BUG_ON(!bh); + + /* NOTE: Different naming scheme to ntfs_read_block()! */ + + /* The first block in the page. */ + block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(vi); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + + /* The first out of bounds block for the data size. */ + dblock = (i_size + blocksize - 1) >> blocksize_bits; + + /* The last (fully or partially) initialized block. */ + iblock = initialized_size >> blocksize_bits; + + /* + * Be very careful. We have no exclusion from __set_page_dirty_buffers + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * handle that here by just cleaning them. 
+ */ + + /* + * Loop through all the buffers in the page, mapping all the dirty + * buffers to disk addresses and handling any aliases from the + * underlying block device's mapping. + */ + rl = NULL; + err = 0; + do { + bool is_retry = false; + + if (unlikely(block >= dblock)) { + /* + * Mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. The contents of such buffers + * were zeroed by ntfs_writepage(). + * + * FIXME: What about the small race window where + * ntfs_writepage() has not done any clearing because + * the page was within i_size but before we get here, + * vmtruncate() modifies i_size? + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + + /* Clean buffers are not written out, so no need to map them. */ + if (!buffer_dirty(bh)) + continue; + + /* Make sure we have enough initialized size. */ + if (unlikely((block >= iblock) && + (initialized_size < i_size))) { + /* + * If this page is fully outside initialized size, zero + * out all pages between the current initialized size + * and the current page. Just use ntfs_readpage() to do + * the zeroing transparently. + */ + if (block > iblock) { + // TODO: + // For each page do: + // - read_cache_page() + // Again for each page do: + // - wait_on_page_locked() + // - Check (PageUptodate(page) && + // !PageError(page)) + // Update initialized size in the attribute and + // in the inode. + // Again, for each page do: + // __set_page_dirty_buffers(); + // page_cache_release() + // We don't need to wait on the writes. + // Update iblock. + } + /* + * The current page straddles initialized size. Zero + * all non-uptodate buffers and set them uptodate (and + * dirty?). Note, there aren't any non-uptodate buffers + * if the page is uptodate. + * FIXME: For an uptodate page, the buffers may need to + * be written out because they were not initialized on + * disk before. + */ + if (!PageUptodate(page)) { + // TODO: + // Zero any non-uptodate buffers up to i_size. + // Set them uptodate and dirty. + } + // TODO: + // Update initialized size in the attribute and in the + // inode (up to i_size). + // Update iblock. + // FIXME: This is inefficient. Try to batch the two + // size changes to happen in one go. + ntfs_error(vol->sb, "Writing beyond initialized size " + "is not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + // Do NOT set_buffer_new() BUT DO clear buffer range + // outside write request range. + // set_buffer_uptodate() on complete buffers as well as + // set_buffer_dirty(). + } + + /* No need to map buffers that are already mapped. */ + if (buffer_mapped(bh)) + continue; + + /* Unmapped, dirty buffer. Need to map it. */ + bh->b_bdev = vol->sb->s_bdev; + + /* Convert block into corresponding vcn and offset. */ + vcn = (VCN)block << blocksize_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to point to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + continue; + } + /* It is a hole, need to instantiate it. 
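Unless the buffer contains nothing but zeroes, in which case it can stay sparse and does not need to be written at all.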
*/ + if (lcn == LCN_HOLE) { + u8 *kaddr; + unsigned long *bpos, *bend; + + /* Check if the buffer is zero. */ + kaddr = kmap_atomic(page, KM_USER0); + bpos = (unsigned long *)(kaddr + bh_offset(bh)); + bend = (unsigned long *)((u8*)bpos + blocksize); + do { + if (unlikely(*bpos)) + break; + } while (likely(++bpos < bend)); + kunmap_atomic(kaddr, KM_USER0); + if (bpos == bend) { + /* + * Buffer is zero and sparse, no need to write + * it. + */ + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + continue; + } + // TODO: Instantiate the hole. + // clear_buffer_new(bh); + // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + ntfs_error(vol->sb, "Writing into sparse regions is " + "not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + } + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, truncate has cut it out + * of the runlist. Just clean and clear the buffer and set it + * uptodate so it can get discarded by the VM. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + zero_user(page, bh_offset(bh), blocksize); + set_buffer_uptodate(bh); + err = 0; + continue; + } + /* Failed to map the buffer, even after retrying. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, offset 0x%x " + "because its location on disk could not be " + "determined%s (error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + break; + } while (block++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* For the error case, need to reset bh to the beginning. */ + bh = head; + + /* Just an optimization, so ->readpage() is not called later. */ + if (unlikely(!PageUptodate(page))) { + int uptodate = 1; + do { + if (!buffer_uptodate(bh)) { + uptodate = 0; + bh = head; + break; + } + } while ((bh = bh->b_this_page) != head); + if (uptodate) + SetPageUptodate(page); + } + + /* Setup all mapped, dirty buffers for async write i/o. */ + do { + if (buffer_mapped(bh) && buffer_dirty(bh)) { + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + BUG_ON(!buffer_uptodate(bh)); + mark_buffer_async_write(bh); + } else + unlock_buffer(bh); + } else if (unlikely(err)) { + /* + * For the error case. The buffer may have been set + * dirty during attachment to a dirty page. + */ + if (err != -ENOMEM) + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (unlikely(err)) { + // TODO: Remove the -EOPNOTSUPP check later on... + if (unlikely(err == -EOPNOTSUPP)) + err = 0; + else if (err == -ENOMEM) { + ntfs_warning(vol->sb, "Error allocating memory. " + "Redirtying page so we try again " + "later."); + /* + * Put the page back on mapping->dirty_pages, but + * leave its buffer's dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + err = 0; + } else + SetPageError(page); + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); /* Keeps try_to_free_buffers() away. */ + + /* Submit the prepared buffers for i/o. 
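If none of them end up being submitted for async write, writeback is ended by hand further down.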
*/ + need_end_writeback = true; + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + need_end_writeback = false; + } + bh = next; + } while (bh != head); + unlock_page(page); + + /* If no i/o was started, need to end_page_writeback(). */ + if (unlikely(need_end_writeback)) + end_page_writeback(page); + + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_write_mst_block - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This function is for writing pages belonging to non-resident, mst protected + * attributes to their backing store. The only supported attributes are index + * allocation and $MFT/$DATA. Both directory inodes and index inodes are + * supported for the index allocation case. + * + * The page must remain locked for the duration of the write because we apply + * the mst fixups, write, and then undo the fixups, so if we were to unlock the + * page before undoing the fixups, any other user of the page will see the + * page contents as corrupt. + * + * We clear the page uptodate flag for the duration of the function to ensure + * exclusion for the $MFT/$DATA case against someone mapping an mft record we + * are about to apply the mst fixups to. + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_write_block(), ntfs_mft_writepage(), and + * write_mft_record_nolock(). + */ +static int ntfs_write_mst_block(struct page *page, + struct writeback_control *wbc) +{ + sector_t block, dblock, rec_block; + struct inode *vi = page->mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + u8 *kaddr; + unsigned int rec_size = ni->itype.index.block_size; + ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size]; + struct buffer_head *bh, *head, *tbh, *rec_start_bh; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + runlist_element *rl; + int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2; + unsigned bh_size, rec_size_bits; + bool sync, is_mft, page_is_dirty, rec_is_dirty; + unsigned char bh_size_bits; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", vi->i_ino, ni->type, page->index); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(!NInoMstProtected(ni)); + is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino); + /* + * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page + * in its page cache were to be marked dirty. However this should + * never happen with the current driver and considering we do not + * handle this case here we do want to BUG(), at least for now. + */ + BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || + (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); + bh_size = vol->sb->s_blocksize; + bh_size_bits = vol->sb->s_blocksize_bits; + max_bhs = PAGE_CACHE_SIZE / bh_size; + BUG_ON(!max_bhs); + BUG_ON(max_bhs > MAX_BUF_PER_PAGE); + + /* Were we called for sync purposes? */ + sync = (wbc->sync_mode == WB_SYNC_ALL); + + /* Make sure we have mapped buffers. */ + bh = head = page_buffers(page); + BUG_ON(!bh); + + rec_size_bits = ni->itype.index.block_size_bits; + BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits)); + bhs_per_rec = rec_size >> bh_size_bits; + BUG_ON(!bhs_per_rec); + + /* The first block in the page. */ + rec_block = block = (sector_t)page->index << + (PAGE_CACHE_SHIFT - bh_size_bits); + + /* The first out of bounds block for the data size. 
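Blocks from this one onwards lie beyond i_size and are only cleaned, never written out.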
*/ + dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; + + rl = NULL; + err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0; + page_is_dirty = rec_is_dirty = false; + rec_start_bh = NULL; + do { + bool is_retry = false; + + if (likely(block < rec_block)) { + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + /* + * This block is not the first one in the record. We + * ignore the buffer's dirty state because we could + * have raced with a parallel mark_ntfs_record_dirty(). + */ + if (!rec_is_dirty) + continue; + if (unlikely(err2)) { + if (err2 != -ENOMEM) + clear_buffer_dirty(bh); + continue; + } + } else /* if (block == rec_block) */ { + BUG_ON(block > rec_block); + /* This block is the first one in the record. */ + rec_block += bhs_per_rec; + err2 = 0; + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + continue; + } + if (!buffer_dirty(bh)) { + /* Clean records are not written out. */ + rec_is_dirty = false; + continue; + } + rec_is_dirty = true; + rec_start_bh = bh; + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = (VCN)block << bh_size_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> bh_size_bits; + set_buffer_mapped(bh); + } else { + /* + * Remap failed. Retry to map the runlist once + * unless we are working on $MFT which always + * has the whole of its runlist in memory. + */ + if (!is_mft && !is_retry && + lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping + * lock for the duration. + */ + up_read(&ni->runlist.lock); + err2 = ntfs_map_runlist(ni, vcn); + if (likely(!err2)) + goto lock_retry_remap; + if (err2 == -ENOMEM) + page_is_dirty = true; + lcn = err2; + } else { + err2 = -EIO; + if (!rl) + up_read(&ni->runlist.lock); + } + /* Hard error. Abort writing this record. */ + if (!err || err == -ENOMEM) + err = err2; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write ntfs record " + "0x%llx (inode 0x%lx, " + "attribute type 0x%x) because " + "its location on disk could " + "not be determined (error " + "code %lli).", + (long long)block << + bh_size_bits >> + vol->mft_record_size_bits, + ni->mft_no, ni->type, + (long long)lcn); + /* + * If this is not the first buffer, remove the + * buffers in this record from the list of + * buffers to write and clear their dirty bit + * if not error -ENOMEM. + */ + if (rec_start_bh != bh) { + while (bhs[--nr_bhs] != rec_start_bh) + ; + if (err2 != -ENOMEM) { + do { + clear_buffer_dirty( + rec_start_bh); + } while ((rec_start_bh = + rec_start_bh-> + b_this_page) != + bh); + } + } + continue; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + } while (block++, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&ni->runlist.lock); + /* If there were no dirty buffers, we are done. 
*/ + if (!nr_bhs) + goto done; + /* Map the page so we can access its contents. */ + kaddr = kmap(page); + /* Clear the page uptodate flag whilst the mst fixups are applied. */ + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + for (i = 0; i < nr_bhs; i++) { + unsigned int ofs; + + /* Skip buffers which are not at the beginning of records. */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + ofs = bh_offset(tbh); + if (is_mft) { + ntfs_inode *tni; + unsigned long mft_no; + + /* Get the mft record number. */ + mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs) + >> rec_size_bits; + /* Check whether to write this mft record. */ + tni = NULL; + if (!ntfs_may_write_mft_record(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), &tni)) { + /* + * The record should not be written. This + * means we need to redirty the page before + * returning. + */ + page_is_dirty = true; + /* + * Remove the buffers in this mft record from + * the list of buffers to write. + */ + do { + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + /* + * The record should be written. If a locked ntfs + * inode was returned, add it to the array of locked + * ntfs inodes. + */ + if (tni) + locked_nis[nr_locked_nis++] = tni; + } + /* Apply the mst protection fixups. */ + err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs), + rec_size); + if (unlikely(err2)) { + if (!err || err == -ENOMEM) + err = -EIO; + ntfs_error(vol->sb, "Failed to apply mst fixups " + "(inode 0x%lx, attribute type 0x%x, " + "page index 0x%lx, page offset 0x%x)!" + " Unmount and run chkdsk.", vi->i_ino, + ni->type, page->index, ofs); + /* + * Mark all the buffers in this record clean as we do + * not want to write corrupt data to disk. + */ + do { + clear_buffer_dirty(bhs[i]); + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + nr_recs++; + } + /* If no records are to be written out, we are done. */ + if (!nr_recs) + goto unm_done; + flush_dcache_page(page); + /* Lock buffers and start synchronous write i/o on them. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + if (!trylock_buffer(tbh)) + BUG(); + /* The buffer dirty state is now irrelevant, just clean it. */ + clear_buffer_dirty(tbh); + BUG_ON(!buffer_uptodate(tbh)); + BUG_ON(!buffer_mapped(tbh)); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (is_mft && !sync) + goto do_mirror; +do_wait: + /* Wait on i/o completion of buffers. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_error(vol->sb, "I/O error while writing ntfs " + "record buffer (inode 0x%lx, " + "attribute type 0x%x, page index " + "0x%lx, page offset 0x%lx)! Unmount " + "and run chkdsk.", vi->i_ino, ni->type, + page->index, bh_offset(tbh)); + if (!err || err == -ENOMEM) + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (is_mft && sync) { +do_mirror: + for (i = 0; i < nr_bhs; i++) { + unsigned long mft_no; + unsigned int ofs; + + /* + * Skip buffers which are not at the beginning of + * records. + */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + /* Skip removed buffers (and hence records). */ + if (!tbh) + continue; + ofs = bh_offset(tbh); + /* Get the mft record number. 
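It is the byte offset of the record within the attribute shifted right by the record size bits; only records that fall inside the mft mirror are synced to it.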
*/ + mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs) + >> rec_size_bits; + if (mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), + sync); + } + if (!sync) + goto do_wait; + } + /* Remove the mst protection fixups again. */ + for (i = 0; i < nr_bhs; i++) { + if (!(i % bhs_per_rec)) { + tbh = bhs[i]; + if (!tbh) + continue; + post_write_mst_fixup((NTFS_RECORD*)(kaddr + + bh_offset(tbh))); + } + } + flush_dcache_page(page); +unm_done: + /* Unlock any locked inodes. */ + while (nr_locked_nis-- > 0) { + ntfs_inode *tni, *base_tni; + + tni = locked_nis[nr_locked_nis]; + /* Get the base inode. */ + mutex_lock(&tni->extent_lock); + if (tni->nr_extents >= 0) + base_tni = tni; + else { + base_tni = tni->ext.base_ntfs_ino; + BUG_ON(!base_tni); + } + mutex_unlock(&tni->extent_lock); + ntfs_debug("Unlocking %s inode 0x%lx.", + tni == base_tni ? "base" : "extent", + tni->mft_no); + mutex_unlock(&tni->mrec_lock); + atomic_dec(&tni->count); + iput(VFS_I(base_tni)); + } + SetPageUptodate(page); + kunmap(page); +done: + if (unlikely(err && err != -ENOMEM)) { + /* + * Set page error if there is only one ntfs record in the page. + * Otherwise we would loose per-record granularity. + */ + if (ni->itype.index.block_size == PAGE_CACHE_SIZE) + SetPageError(page); + NVolSetErrors(vol); + } + if (page_is_dirty) { + ntfs_debug("Page still contains one or more dirty ntfs " + "records. Redirtying the page starting at " + "record 0x%lx.", page->index << + (PAGE_CACHE_SHIFT - rec_size_bits)); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + } else { + /* + * Keep the VM happy. This must be done otherwise the + * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though + * the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + } + if (likely(!err)) + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_writepage - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This is called from the VM when it wants to have a dirty ntfs page cache + * page cleaned. The VM has already locked the page and marked it clean. + * + * For non-resident attributes, ntfs_writepage() writes the @page by calling + * the ntfs version of the generic block_write_full_page() function, + * ntfs_write_block(), which in turn if necessary creates and writes the + * buffers associated with the page asynchronously. + * + * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying + * the data to the mft record (which at this stage is most likely in memory). + * The mft record is then marked dirty and written out asynchronously via the + * vfs inode dirty code path for the inode the mft record belongs to or via the + * vm page dirty code path for the page the mft record is in. + * + * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_writepage(struct page *page, struct writeback_control *wbc) +{ + loff_t i_size; + struct inode *vi = page->mapping->host; + ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); + char *addr; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + u32 attr_len; + int err; + +retry_writepage: + BUG_ON(!PageLocked(page)); + i_size = i_size_read(vi); + /* Is the page fully outside i_size? 
(truncate in progress) */ + if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) { + /* + * The page may have dirty, unmapped buffers. Make them + * freeable here, so the page does not leak. + */ + block_invalidatepage(page, 0); + unlock_page(page); + ntfs_debug("Write outside i_size - truncated?"); + return 0; + } + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + unlock_page(page); + BUG_ON(ni->type != AT_DATA); + ntfs_debug("Denying write access to encrypted file."); + return -EACCES; + } + /* Compressed data streams are handled in compress.c. */ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + // TODO: Implement and replace this with + // return ntfs_write_compressed_block(page); + unlock_page(page); + ntfs_error(vi->i_sb, "Writing to compressed files is " + "not supported yet. Sorry."); + return -EOPNOTSUPP; + } + // TODO: Implement and remove this check. + if (NInoNonResident(ni) && NInoSparse(ni)) { + unlock_page(page); + ntfs_error(vi->i_sb, "Writing to sparse files is not " + "supported yet. Sorry."); + return -EOPNOTSUPP; + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* We have to zero every time due to mmap-at-end-of-file. */ + if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) { + /* The page straddles i_size. */ + unsigned int ofs = i_size & ~PAGE_CACHE_MASK; + zero_user_segment(page, ofs, PAGE_CACHE_SIZE); + } + /* Handle mst protected attributes. */ + if (NInoMstProtected(ni)) + return ntfs_write_mst_block(page, wbc); + /* Normal, non-resident data stream. */ + return ntfs_write_block(page, wbc); + } + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * mst protected. This also means the attribute is smaller than an mft + * record and hence smaller than a page, so can simply return error on + * any pages with index above 0. Note the attribute can actually be + * marked compressed but if it is resident the actual data is not + * compressed so we are ok to ignore the compressed flag here. + */ + BUG_ON(page_has_buffers(page)); + BUG_ON(!PageUptodate(page)); + if (unlikely(page->index > 0)) { + ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. " + "Aborting write.", page->index); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + return -EIO; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the writepage. 
+ */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_writepage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto err_out; + /* + * Keep the VM happy. This must be done otherwise the radix-tree tag + * PAGECACHE_TAG_DIRTY remains set even though the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + i_size = i_size_read(vi); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate or a failed truncate. */ + attr_len = i_size; + /* + * If the truncate failed, fix it up now. If a concurrent + * truncate, we do its job, so it does not have to do anything. + */ + err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, + attr_len); + /* Shrinking cannot fail. */ + BUG_ON(err); + } + addr = kmap_atomic(page, KM_USER0); + /* Copy the data from the page to the mft record. */ + memcpy((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + addr, attr_len); + /* Zero out of bounds area in the page cache page. */ + memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + kunmap_atomic(addr, KM_USER0); + flush_dcache_page(page); + flush_dcache_mft_record_page(ctx->ntfs_ino); + /* We are done with the page. */ + end_page_writeback(page); + /* Finally, mark the mft record dirty, so it gets written back. */ + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " + "page so we try again later."); + /* + * Put the page back on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + err = 0; + } else { + ntfs_error(vi->i_sb, "Resident attribute write failed with " + "error %i.", err); + SetPageError(page); + NVolSetErrors(ni->vol); + } + unlock_page(page); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +#endif /* NTFS_RW */ + +/** + * ntfs_aops - general address space operations for inodes and attributes + */ +const struct address_space_operations ntfs_aops = { + .readpage = ntfs_readpage, /* Fill page with data. */ +#ifdef NTFS_RW + .writepage = ntfs_writepage, /* Write dirty page to disk. */ +#endif /* NTFS_RW */ + .migratepage = buffer_migrate_page, /* Move a page cache page from + one physical page to an + other. */ + .error_remove_page = generic_error_remove_page, +}; + +/** + * ntfs_mst_aops - general address space operations for mst protecteed inodes + * and attributes + */ +const struct address_space_operations ntfs_mst_aops = { + .readpage = ntfs_readpage, /* Fill page with data. */ +#ifdef NTFS_RW + .writepage = ntfs_writepage, /* Write dirty page to disk. */ + .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty + without touching the buffers + belonging to the page. */ +#endif /* NTFS_RW */ + .migratepage = buffer_migrate_page, /* Move a page cache page from + one physical page to an + other. 
*/ + .error_remove_page = generic_error_remove_page, +}; + +#ifdef NTFS_RW + +/** + * mark_ntfs_record_dirty - mark an ntfs record dirty + * @page: page containing the ntfs record to mark dirty + * @ofs: byte offset within @page at which the ntfs record begins + * + * Set the buffers and the page in which the ntfs record is located dirty. + * + * The latter also marks the vfs inode the ntfs record belongs to dirty + * (I_DIRTY_PAGES only). + * + * If the page does not have buffers, we create them and set them uptodate. + * The page may not be locked which is why we need to handle the buffers under + * the mapping->private_lock. Once the buffers are marked dirty we no longer + * need the lock since try_to_free_buffers() does not free dirty buffers. + */ +void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + struct buffer_head *bh, *head, *buffers_to_free = NULL; + unsigned int end, bh_size, bh_ofs; + + BUG_ON(!PageUptodate(page)); + end = ofs + ni->itype.index.block_size; + bh_size = VFS_I(ni)->i_sb->s_blocksize; + spin_lock(&mapping->private_lock); + if (unlikely(!page_has_buffers(page))) { + spin_unlock(&mapping->private_lock); + bh = head = alloc_page_buffers(page, bh_size, 1); + spin_lock(&mapping->private_lock); + if (likely(!page_has_buffers(page))) { + struct buffer_head *tail; + + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); + } else + buffers_to_free = bh; + } + bh = head = page_buffers(page); + BUG_ON(!bh); + do { + bh_ofs = bh_offset(bh); + if (bh_ofs + bh_size <= ofs) + continue; + if (unlikely(bh_ofs >= end)) + break; + set_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + spin_unlock(&mapping->private_lock); + __set_page_dirty_nobuffers(page); + if (unlikely(buffers_to_free)) { + do { + bh = buffers_to_free->b_this_page; + free_buffer_head(buffers_to_free); + buffers_to_free = bh; + } while (buffers_to_free); + } +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h new file mode 100644 index 00000000..caecc58f --- /dev/null +++ b/fs/ntfs/aops.h @@ -0,0 +1,107 @@ +/** + * aops.h - Defines for NTFS kernel address space operations and page cache + * handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_AOPS_H +#define _LINUX_NTFS_AOPS_H + +#include +#include +#include +#include + +#include "inode.h" + +/** + * ntfs_unmap_page - release a page that was mapped using ntfs_map_page() + * @page: the page to release + * + * Unpin, unmap and release a page that was obtained from ntfs_map_page(). + */ +static inline void ntfs_unmap_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +/** + * ntfs_map_page - map a page into accessible memory, reading it if necessary + * @mapping: address space for which to obtain the page + * @index: index into the page cache for @mapping of the page to map + * + * Read a page from the page cache of the address space @mapping at position + * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes. + * + * If the page is not in memory it is loaded from disk first using the readpage + * method defined in the address space operations of @mapping and the page is + * added to the page cache of @mapping in the process. + * + * If the page belongs to an mst protected attribute and it is marked as such + * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no + * error checking is performed. This means the caller has to verify whether + * the ntfs record(s) contained in the page are valid or not using one of the + * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are + * expecting to see. (For details of the macros, see fs/ntfs/layout.h.) + * + * If the page is in high memory it is mapped into memory directly addressible + * by the kernel. + * + * Finally the page count is incremented, thus pinning the page into place. + * + * The above means that page_address(page) can be used on all pages obtained + * with ntfs_map_page() to get the kernel virtual address of the page. + * + * When finished with the page, the caller has to call ntfs_unmap_page() to + * unpin, unmap and release the page. + * + * Note this does not grant exclusive access. If such is desired, the caller + * must provide it independently of the ntfs_{un}map_page() calls by using + * a {rw_}semaphore or other means of serialization. A spin lock cannot be + * used as ntfs_map_page() can block. + * + * The unlocked and uptodate page is returned on success or an encoded error + * on failure. Caller has to test for error using the IS_ERR() macro on the + * return value. If that evaluates to 'true', the negative error code can be + * obtained using PTR_ERR() on the return value of ntfs_map_page(). + */ +static inline struct page *ntfs_map_page(struct address_space *mapping, + unsigned long index) +{ + struct page *page = read_mapping_page(mapping, index, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageError(page)) + return page; + ntfs_unmap_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +#ifdef NTFS_RW + +extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_AOPS_H */ diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c new file mode 100644 index 00000000..f14fde2b --- /dev/null +++ b/fs/ntfs/attrib.c @@ -0,0 +1,2615 @@ +/** + * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. 
+ * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "debug.h" +#include "layout.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" +#include "types.h" + +/** + * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * @ctx: active attribute search context if present or NULL if not + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and + * restores it before returning. Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_map_runlist_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Note the runlist can be NULL after this function returns if @vcn is zero and + * the attribute has zero allocated size, i.e. there simply is no runlist. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist will be modified. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. 
+ * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) +{ + VCN end_vcn; + unsigned long flags; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + runlist_element *rl; + struct page *put_this_page = NULL; + int err = 0; + bool ctx_is_temporary, ctx_needs_reset; + ntfs_attr_search_ctx old_ctx = { NULL, }; + + ntfs_debug("Mapping runlist part containing vcn 0x%llx.", + (unsigned long long)vcn); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + if (!ctx) { + ctx_is_temporary = ctx_needs_reset = true; + m = map_mft_record(base_ni); + if (IS_ERR(m)) + return PTR_ERR(m); + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + } else { + VCN allocated_size_vcn; + + BUG_ON(IS_ERR(ctx->mrec)); + a = ctx->attr; + BUG_ON(!a->non_resident); + ctx_is_temporary = false; + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + read_lock_irqsave(&ni->size_lock, flags); + allocated_size_vcn = ni->allocated_size >> + ni->vol->cluster_size_bits; + read_unlock_irqrestore(&ni->size_lock, flags); + if (!a->data.non_resident.lowest_vcn && end_vcn <= 0) + end_vcn = allocated_size_vcn - 1; + /* + * If we already have the attribute extent containing @vcn in + * @ctx, no need to look it up again. We slightly cheat in + * that if vcn exceeds the allocated size, we will refuse to + * map the runlist below, so there is definitely no need to get + * the right attribute extent. + */ + if (vcn >= allocated_size_vcn || (a->type == ni->type && + a->name_length == ni->name_len && + !memcmp((u8*)a + le16_to_cpu(a->name_offset), + ni->name, ni->name_len) && + sle64_to_cpu(a->data.non_resident.lowest_vcn) + <= vcn && end_vcn >= vcn)) + ctx_needs_reset = false; + else { + /* Save the old search context. */ + old_ctx = *ctx; + /* + * If the currently mapped (extent) inode is not the + * base inode we will unmap it when we reinitialize the + * search context which means we need to get a + * reference to the page containing the mapped mft + * record so we do not accidentally drop changes to the + * mft record when it has not been marked dirty yet. + */ + if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { + put_this_page = old_ctx.ntfs_ino->page; + page_cache_get(put_this_page); + } + /* + * Reinitialize the search context so we can lookup the + * needed attribute extent. + */ + ntfs_attr_reinit_search_ctx(ctx); + ctx_needs_reset = true; + } + } + if (ctx_needs_reset) { + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + BUG_ON(!ctx->attr->non_resident); + } + a = ctx->attr; + /* + * Only decompress the mapping pairs if @vcn is inside it. Otherwise + * we get into problems when we try to map an out of bounds vcn because + * we then try to map the already mapped runlist fragment and + * ntfs_mapping_pairs_decompress() fails. 
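+	 * Note, a->data.non_resident.highest_vcn is the last vcn described by
+	 * this attribute extent, so the first out of bounds vcn is
+	 * highest_vcn + 1, which is what end_vcn is set to below.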
+ */ + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; + if (unlikely(vcn && vcn >= end_vcn)) { + err = -ENOENT; + goto err_out; + } + rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); + if (IS_ERR(rl)) + err = PTR_ERR(rl); + else + ni->runlist.rl = rl; +err_out: + if (ctx_is_temporary) { + if (likely(ctx)) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } else if (ctx_needs_reset) { + /* + * If there is no attribute list, restoring the search context + * is accomplished simply by copying the saved context back over + * the caller supplied context. If there is an attribute list, + * things are more complicated as we need to deal with mapping + * of mft records and resulting potential changes in pointers. + */ + if (NInoAttrList(base_ni)) { + /* + * If the currently mapped (extent) inode is not the + * one we had before, we need to unmap it and map the + * old one. + */ + if (ctx->ntfs_ino != old_ctx.ntfs_ino) { + /* + * If the currently mapped inode is not the + * base inode, unmap it. + */ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != + ctx->base_ntfs_ino) { + unmap_extent_mft_record(ctx->ntfs_ino); + ctx->mrec = ctx->base_mrec; + BUG_ON(!ctx->mrec); + } + /* + * If the old mapped inode is not the base + * inode, map it. + */ + if (old_ctx.base_ntfs_ino && + old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { +retry_map: + ctx->mrec = map_mft_record( + old_ctx.ntfs_ino); + /* + * Something bad has happened. If out + * of memory retry till it succeeds. + * Any other errors are fatal and we + * return the error code in ctx->mrec. + * Let the caller deal with it... We + * just need to fudge things so the + * caller can reinit and/or put the + * search context safely. + */ + if (IS_ERR(ctx->mrec)) { + if (PTR_ERR(ctx->mrec) == + -ENOMEM) { + schedule(); + goto retry_map; + } else + old_ctx.ntfs_ino = + old_ctx. + base_ntfs_ino; + } + } + } + /* Update the changed pointers in the saved context. */ + if (ctx->mrec != old_ctx.mrec) { + if (!IS_ERR(ctx->mrec)) + old_ctx.attr = (ATTR_RECORD*)( + (u8*)ctx->mrec + + ((u8*)old_ctx.attr - + (u8*)old_ctx.mrec)); + old_ctx.mrec = ctx->mrec; + } + } + /* Restore the search context to the saved one. */ + *ctx = old_ctx; + /* + * We drop the reference on the page we took earlier. In the + * case that IS_ERR(ctx->mrec) is true this means we might lose + * some changes to the mft record that had been made between + * the last time it was marked dirty/written out and now. This + * at this stage is not a problem as the mapping error is fatal + * enough that the mft record cannot be written out anyway and + * the caller is very likely to shutdown the whole inode + * immediately and mark the volume dirty for chkdsk to pick up + * the pieces anyway. + */ + if (put_this_page) + page_cache_release(put_this_page); + } + return err; +} + +/** + * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Locking: - The runlist must be unlocked on entry and is unlocked on return. + * - This function takes the runlist lock for writing and may modify + * the runlist. 
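+ *
+ * For illustration only, a minimal caller sketch (error handling trimmed)
+ * looks like:
+ *	err = ntfs_map_runlist(ni, vcn);
+ *	if (err && err != -ENOENT)
+ *		return err;
+ * with -ENOENT then treated as "@vcn is beyond the end of the runlist"
+ * rather than as a failure.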
+ */ +int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) +{ + int err = 0; + + down_write(&ni->runlist.lock); + /* Make sure someone else didn't do the work while we were sleeping. */ + if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= + LCN_RL_NOT_MAPPED)) + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + up_write(&ni->runlist.lock); + return err; +} + +/** + * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode + * @ni: ntfs inode of the attribute whose runlist to search + * @vcn: vcn to convert + * @write_locked: true if the runlist is locked for writing + * + * Find the virtual cluster number @vcn in the runlist of the ntfs attribute + * described by the ntfs inode @ni and return the corresponding logical cluster + * number (lcn). + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @write_locked is true the caller has locked the runlist for writing and + * if false for reading. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ========================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds. + * LCN_ENOMEM Not enough memory to map runlist. + * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc). + * + * Locking: - The runlist must be locked on entry and is left locked on return. + * - If @write_locked is 'false', i.e. the runlist is locked for reading, + * the lock may be dropped inside the function so you cannot rely on + * the runlist still being the same when this function returns. + */ +LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const bool write_locked) +{ + LCN lcn; + unsigned long flags; + bool is_retry = false; + + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", + ni->mft_no, (unsigned long long)vcn, + write_locked ? "write" : "read"); + BUG_ON(!ni); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return LCN_ENOENT; + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + /* Convert vcn to lcn. If that fails map the runlist and retry once. 
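+	 * Note, if the caller only holds the runlist lock for reading, the
+	 * lock is temporarily upgraded to a write lock around the mapping and
+	 * downgraded again afterwards, and the conversion is redone in case
+	 * another task mapped the needed fragment while the lock was dropped.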
*/ + lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn); + if (likely(lcn >= LCN_HOLE)) { + ntfs_debug("Done, lcn 0x%llx.", (long long)lcn); + return lcn; + } + if (lcn != LCN_RL_NOT_MAPPED) { + if (lcn != LCN_ENOENT) + lcn = LCN_EIO; + } else if (!is_retry) { + int err; + + if (!write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != + LCN_RL_NOT_MAPPED)) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + goto retry_remap; + } + } + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + if (!write_locked) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + } + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + if (err == -ENOENT) + lcn = LCN_ENOENT; + else if (err == -ENOMEM) + lcn = LCN_ENOMEM; + else + lcn = LCN_EIO; + } + if (lcn != LCN_ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %lli.", + (long long)lcn); + return lcn; +} + +/** + * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode + * @ni: ntfs inode describing the runlist to search + * @vcn: vcn to find + * @ctx: active attribute search context if present or NULL if not + * + * Find the virtual cluster number @vcn in the runlist described by the ntfs + * inode @ni and return the address of the runlist element containing the @vcn. + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and + * restores it before returning. Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_attr_find_vcn_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * Note you need to distinguish between the lcn of the returned runlist element + * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on + * read and allocate clusters on write. + * + * Return the runlist element containing the @vcn on success and + * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR() + * to decide if the return is success or failure and PTR_ERR() to get to the + * error code if IS_ERR() is true. + * + * The possible error return codes are: + * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds. + * -ENOMEM - Not enough memory to map runlist. + * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. 
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, + ntfs_attr_search_ctx *ctx) +{ + unsigned long flags; + runlist_element *rl; + int err = 0; + bool is_retry = false; + + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", + ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); + BUG_ON(!ni); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return ERR_PTR(-ENOENT); + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + rl = ni->runlist.rl; + if (likely(rl && vcn >= rl[0].vcn)) { + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) { + ntfs_debug("Done."); + return rl; + } + break; + } + rl++; + } + if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) { + if (likely(rl->lcn == LCN_ENOENT)) + err = -ENOENT; + else + err = -EIO; + } + } + if (!err && !is_retry) { + /* + * If the search context is invalid we cannot map the unmapped + * region. + */ + if (IS_ERR(ctx->mrec)) + err = PTR_ERR(ctx->mrec); + else { + /* + * The @vcn is in an unmapped region, map the runlist + * and retry. + */ + err = ntfs_map_runlist_nolock(ni, vcn, ctx); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + } + if (err == -EINVAL) + err = -EIO; + } else if (!err) + err = -EIO; + if (err != -ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %i.", err); + return ERR_PTR(err); +} + +/** + * ntfs_attr_find - find (next) attribute in mft record + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * ntfs_attr_find() takes a search context @ctx as parameter and searches the + * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an + * attribute of @type, optionally @name and @val. + * + * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will + * point to the found attribute. + * + * If the attribute is not found, ntfs_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute before which the attribute being + * searched for would need to be inserted if such an action were to be desired. + * + * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is + * undefined and in particular do not rely on it not changing. + * + * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it + * is 'false', the search begins after @ctx->attr. 
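+ *
+ * This makes it possible to enumerate all matching attributes by calling
+ * ntfs_attr_find() repeatedly with the same @ctx until it returns -ENOENT,
+ * for example (a sketch only; the same loop is used at the end of
+ * ntfs_external_attr_find() below):
+ *	do {
+ *		err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
+ *				ctx);
+ *	} while (!err);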
+ * + * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and + * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record + * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at + * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case + * sensitive. When @name is present, @name_len is the @name length in Unicode + * characters. + * + * If @name is not present (NULL), we assume that the unnamed attribute is + * being searched for. + * + * Finally, the resident attribute value @val is looked for, if present. If + * @val is not present (NULL), @val_len is ignored. + * + * ntfs_attr_find() only searches the specified mft record and it ignores the + * presence of an attribute list attribute (unless it is the one being searched + * for, obviously). If you need to take attribute lists into consideration, + * use ntfs_attr_lookup() instead (see below). This also means that you cannot + * use ntfs_attr_find() to search for extent records of non-resident + * attributes, as extents with lowest_vcn != 0 are usually described by the + * attribute list attribute only. - Note that it is possible that the first + * extent is only in the attribute list while the last extent is in the base + * mft record, so do not rely on being able to find the first extent in the + * base mft record. + * + * Warning: Never use @val when looking for attribute types which can be + * non-resident as this most likely will result in a crash! + */ +static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ATTR_RECORD *a; + ntfs_volume *vol = ctx->ntfs_ino->vol; + ntfschar *upcase = vol->upcase; + u32 upcase_len = vol->upcase_len; + + /* + * Iterate over attributes in mft record starting at @ctx->attr, or the + * attribute following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + a = ctx->attr; + ctx->is_first = false; + } else + a = (ATTR_RECORD*)((u8*)ctx->attr + + le32_to_cpu(ctx->attr->length)); + for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated)) + break; + ctx->attr = a; + if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || + a->type == AT_END)) + return -ENOENT; + if (unlikely(!a->length)) + break; + if (a->type != type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + if (!name) { + /* The search failed if the found attribute is named. */ + if (a->name_length) + return -ENOENT; + } else if (!ntfs_are_names_equal(name, name_len, + (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)), + a->name_length, ic, upcase, upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, IGNORE_CASE, + upcase, upcase_len); + /* + * If @name collates before a->name, there is no + * matching attribute. + */ + if (rc == -1) + return -ENOENT; + /* If the strings are not equal, continue search. */ + if (rc) + continue; + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, CASE_SENSITIVE, + upcase, upcase_len); + if (rc == -1) + return -ENOENT; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. 
If no @val specified, we have found the attribute + * and are done. + */ + if (!val) + return 0; + /* @val is present; compare values. */ + else { + register int rc; + + rc = memcmp(val, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + min_t(u32, val_len, le32_to_cpu( + a->data.resident.value_length))); + /* + * If @val collates before the current attribute's + * value, there is no matching attribute. + */ + if (!rc) { + register u32 avl; + + avl = le32_to_cpu( + a->data.resident.value_length); + if (val_len == avl) + return 0; + if (val_len < avl) + return -ENOENT; + } else if (rc < 0) + return -ENOENT; + } + } + ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk."); + NVolSetErrors(vol); + return -EIO; +} + +/** + * load_attribute_list - load an attribute list into memory + * @vol: ntfs volume from which to read + * @runlist: runlist of the attribute list + * @al_start: destination buffer + * @size: size of the destination buffer in bytes + * @initialized_size: initialized size of the attribute list + * + * Walk the runlist @runlist and load all clusters from it copying them into + * the linear buffer @al. The maximum number of bytes copied to @al is @size + * bytes. Note, @size does not need to be a multiple of the cluster size. If + * @initialized_size is less than @size, the region in @al between + * @initialized_size and @size will be zeroed and not read from disk. + * + * Return 0 on success or -errno on error. + */ +int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start, + const s64 size, const s64 initialized_size) +{ + LCN lcn; + u8 *al = al_start; + u8 *al_end = al + initialized_size; + runlist_element *rl; + struct buffer_head *bh; + struct super_block *sb; + unsigned long block_size; + unsigned long block, max_block; + int err = 0; + unsigned char block_size_bits; + + ntfs_debug("Entering."); + if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 || + initialized_size > size) + return -EINVAL; + if (!initialized_size) { + memset(al, 0, size); + return 0; + } + sb = vol->sb; + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + down_read(&runlist->lock); + rl = runlist->rl; + if (!rl) { + ntfs_error(sb, "Cannot read attribute list since runlist is " + "missing."); + goto err_out; + } + /* Read all clusters specified by the runlist one run at a time. */ + while (rl->length) { + lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn); + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)rl->vcn, + (unsigned long long)lcn); + /* The attribute list cannot be sparse. */ + if (lcn < 0) { + ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot " + "read attribute list."); + goto err_out; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the run from device in chunks of block_size bytes. */ + max_block = block + (rl->length << vol->cluster_size_bits >> + block_size_bits); + ntfs_debug("max_block = 0x%lx.", max_block); + do { + ntfs_debug("Reading block = 0x%lx.", block); + bh = sb_bread(sb, block); + if (!bh) { + ntfs_error(sb, "sb_bread() failed. Cannot " + "read attribute list."); + goto err_out; + } + if (al + block_size >= al_end) + goto do_final; + memcpy(al, bh->b_data, block_size); + brelse(bh); + al += block_size; + } while (++block < max_block); + rl++; + } + if (initialized_size < size) { +initialize: + memset(al_start + initialized_size, 0, size - initialized_size); + } +done: + up_read(&runlist->lock); + return err; +do_final: + if (al < al_end) { + /* + * Partial block. 
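+		 * Only al_end - al bytes of this last block belong to the
+		 * initialized part of the attribute list, so only those are
+		 * copied below.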
+ * + * Note: The attribute list can be smaller than its allocation + * by multiple clusters. This has been encountered by at least + * two people running Windows XP, thus we cannot do any + * truncation sanity checking here. (AIA) + */ + memcpy(al, bh->b_data, al_end - al); + brelse(bh); + if (initialized_size < size) + goto initialize; + goto done; + } + brelse(bh); + /* Real overflow! */ + ntfs_error(sb, "Attribute list buffer overflow. Read attribute list " + "is truncated."); +err_out: + err = -EIO; + goto done; +} + +/** + * ntfs_external_attr_find - find an attribute in the attribute list of an inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * Find an attribute by searching the attribute list for the corresponding + * attribute list entry. Having found the entry, map the mft record if the + * attribute is in a different mft record/inode, ntfs_attr_find() the attribute + * in there and return it. + * + * On first search @ctx->ntfs_ino must be the base mft record and @ctx must + * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent + * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is + * then the base inode). + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * If the attribute is found, ntfs_external_attr_find() returns 0 and + * @ctx->attr will point to the found attribute. @ctx->mrec will point to the + * mft record in which @ctx->attr is located and @ctx->al_entry will point to + * the attribute list entry for the attribute. + * + * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute in the base mft record before which + * the attribute being searched for would need to be inserted if such an action + * were to be desired. @ctx->mrec will point to the mft record in which + * @ctx->attr is located and @ctx->al_entry will point to the attribute list + * entry of the attribute before which the attribute being searched for would + * need to be inserted if such an action were to be desired. + * + * Thus to insert the not found attribute, one wants to add the attribute to + * @ctx->mrec (the base mft record) and if there is not enough space, the + * attribute should be placed in a newly allocated extent mft record. The + * attribute list entry for the inserted attribute should be inserted in the + * attribute list attribute at @ctx->al_entry. + * + * On actual error, ntfs_external_attr_find() returns -EIO. In this case + * @ctx->attr is undefined and in particular do not rely on it not changing. 
+ */ +static int ntfs_external_attr_find(const ATTR_TYPE type, + const ntfschar *name, const u32 name_len, + const IGNORE_CASE_BOOL ic, const VCN lowest_vcn, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni, *ni; + ntfs_volume *vol; + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_start, *al_end; + ATTR_RECORD *a; + ntfschar *al_name; + u32 al_name_len; + int err = 0; + static const char *es = " Unmount and run chkdsk."; + + ni = ctx->ntfs_ino; + base_ni = ctx->base_ntfs_ino; + ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type); + if (!base_ni) { + /* First call happens with the base mft record. */ + base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino; + ctx->base_mrec = ctx->mrec; + } + if (ni == base_ni) + ctx->base_attr = ctx->attr; + if (type == AT_END) + goto not_found; + vol = base_ni->vol; + al_start = base_ni->attr_list; + al_end = al_start + base_ni->attr_list_size; + if (!ctx->al_entry) + ctx->al_entry = (ATTR_LIST_ENTRY*)al_start; + /* + * Iterate over entries in attribute list starting at @ctx->al_entry, + * or the entry following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + al_entry = ctx->al_entry; + ctx->is_first = false; + } else + al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry + + le16_to_cpu(ctx->al_entry->length)); + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < base_ni->attr_list || + (u8*)al_entry > al_end) + break; /* Inode is corrupt. */ + ctx->al_entry = al_entry; + /* Catch the end of the attribute list. */ + if ((u8*)al_entry == al_end) + goto not_found; + if (!al_entry->length) + break; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + break; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) + goto not_found; + if (type != al_entry->type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + al_name_len = al_entry->name_length; + al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset); + if (!name) { + if (al_name_len) + goto not_found; + } else if (!ntfs_are_names_equal(al_name, al_name_len, name, + name_len, ic, vol->upcase, vol->upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, IGNORE_CASE, + vol->upcase, vol->upcase_len); + /* + * If @name collates before al_name, there is no + * matching attribute. + */ + if (rc == -1) + goto not_found; + /* If the strings are not equal, continue search. */ + if (rc) + continue; + /* + * FIXME: Reverse engineering showed 0, IGNORE_CASE but + * that is inconsistent with ntfs_attr_find(). The + * subsequent rc checks were also different. Perhaps I + * made a mistake in one of the two. Need to recheck + * which is correct or at least see what is going on... + * (AIA) + */ + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, CASE_SENSITIVE, + vol->upcase, vol->upcase_len); + if (rc == -1) + goto not_found; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. Now check @lowest_vcn. Continue search if the + * next attribute list entry still fits @lowest_vcn. Otherwise + * we have reached the right one or the search has failed. 
+ */ + if (lowest_vcn && (u8*)next_al_entry >= al_start && + (u8*)next_al_entry + 6 < al_end && + (u8*)next_al_entry + le16_to_cpu( + next_al_entry->length) <= al_end && + sle64_to_cpu(next_al_entry->lowest_vcn) <= + lowest_vcn && + next_al_entry->type == al_entry->type && + next_al_entry->name_length == al_name_len && + ntfs_are_names_equal((ntfschar*)((u8*) + next_al_entry + + next_al_entry->name_offset), + next_al_entry->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + continue; + if (MREF_LE(al_entry->mft_reference) == ni->mft_no) { + if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) { + ntfs_error(vol->sb, "Found stale mft " + "reference in attribute list " + "of base inode 0x%lx.%s", + base_ni->mft_no, es); + err = -EIO; + break; + } + } else { /* Mft references do not match. */ + /* If there is a mapped record unmap it first. */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + /* Do we want the base record back? */ + if (MREF_LE(al_entry->mft_reference) == + base_ni->mft_no) { + ni = ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + } else { + /* We want an extent record. */ + ctx->mrec = map_extent_mft_record(base_ni, + le64_to_cpu( + al_entry->mft_reference), &ni); + if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to map " + "extent mft record " + "0x%lx of base inode " + "0x%lx.%s", + MREF_LE(al_entry-> + mft_reference), + base_ni->mft_no, es); + err = PTR_ERR(ctx->mrec); + if (err == -ENOENT) + err = -EIO; + /* Cause @ctx to be sanitized below. */ + ni = NULL; + break; + } + ctx->ntfs_ino = ni; + } + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + } + /* + * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the + * mft record containing the attribute represented by the + * current al_entry. + */ + /* + * We could call into ntfs_attr_find() to find the right + * attribute in this mft record but this would be less + * efficient and not quite accurate as ntfs_attr_find() ignores + * the attribute instance numbers for example which become + * important when one plays with attribute lists. Also, + * because a proper match has been found in the attribute list + * entry above, the comparison can now be optimized. So it is + * worth re-implementing a simplified ntfs_attr_find() here. + */ + a = ctx->attr; + /* + * Use a manual loop so we can still use break and continue + * with the same meanings as above. + */ +do_next_attr_loop: + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated)) + break; + if (a->type == AT_END) + break; + if (!a->length) + break; + if (al_entry->instance != a->instance) + goto do_next_attr; + /* + * If the type and/or the name are mismatched between the + * attribute list entry and the attribute record, there is + * corruption so we break and return error EIO. + */ + if (al_entry->type != a->type) + break; + if (!ntfs_are_names_equal((ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), a->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + break; + ctx->attr = a; + /* + * If no @val specified or @val specified and it matches, we + * have found it! + */ + if (!val || (!a->non_resident && le32_to_cpu( + a->data.resident.value_length) == val_len && + !memcmp((u8*)a + + le16_to_cpu(a->data.resident.value_offset), + val, val_len))) { + ntfs_debug("Done, found."); + return 0; + } +do_next_attr: + /* Proceed to the next attribute in the current mft record. 
*/ + a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length)); + goto do_next_attr_loop; + } + if (!err) { + ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt " + "attribute list attribute.%s", base_ni->mft_no, + es); + err = -EIO; + } + if (ni != base_ni) { + if (ni) + unmap_extent_mft_record(ni); + ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + ctx->attr = ctx->base_attr; + } + if (err != -ENOMEM) + NVolSetErrors(vol); + return err; +not_found: + /* + * If we were looking for AT_END, we reset the search context @ctx and + * use ntfs_attr_find() to seek to the end of the base mft record. + */ + if (type == AT_END) { + ntfs_attr_reinit_search_ctx(ctx); + return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len, + ctx); + } + /* + * The attribute was not found. Before we return, we want to ensure + * @ctx->mrec and @ctx->attr indicate the position at which the + * attribute should be inserted in the base mft record. Since we also + * want to preserve @ctx->al_entry we cannot reinitialize the search + * context using ntfs_attr_reinit_search_ctx() as this would set + * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see + * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve + * @ctx->al_entry as the remaining fields (base_*) are identical to + * their non base_ counterparts and we cannot set @ctx->base_attr + * correctly yet as we do not know what @ctx->attr will be set to by + * the call to ntfs_attr_find() below. + */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + ctx->mrec = ctx->base_mrec; + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + ctx->is_first = true; + ctx->ntfs_ino = base_ni; + ctx->base_ntfs_ino = NULL; + ctx->base_mrec = NULL; + ctx->base_attr = NULL; + /* + * In case there are multiple matches in the base mft record, need to + * keep enumerating until we get an attribute not found response (or + * another error), otherwise we would keep returning the same attribute + * over and over again and all programs using us for enumeration would + * lock up in a tight loop. + */ + do { + err = ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + } while (!err); + ntfs_debug("Done, not found."); + return err; +} + +/** + * ntfs_attr_lookup - find an attribute in an ntfs inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must + * be the base mft record and @ctx must have been obtained from a call to + * ntfs_attr_get_search_ctx(). + * + * This function transparently handles attribute lists and @ctx is used to + * continue searches where they were left off at. + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * Return 0 if the search was successful and -errno if not. + * + * When 0, @ctx->attr is the found attribute and it is in mft record + * @ctx->mrec. 
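+ *
+ * For illustration, a typical single lookup (a sketch only, with @ni being
+ * the base ntfs inode and error handling reduced to a minimum) looks like:
+ *	m = map_mft_record(ni);
+ *	if (IS_ERR(m))
+ *		return PTR_ERR(m);
+ *	ctx = ntfs_attr_get_search_ctx(ni, m);
+ *	if (!ctx) {
+ *		unmap_mft_record(ni);
+ *		return -ENOMEM;
+ *	}
+ *	err = ntfs_attr_lookup(type, name, name_len, CASE_SENSITIVE, 0, NULL,
+ *			0, ctx);
+ *	if (!err)
+ *		... use ctx->attr and ctx->mrec here ...
+ *	ntfs_attr_put_search_ctx(ctx);
+ *	unmap_mft_record(ni);
+ *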
If an attribute list attribute is present, @ctx->al_entry is + * the attribute list entry of the found attribute. + * + * When -ENOENT, @ctx->attr is the attribute which collates just after the + * attribute being searched for, i.e. if one wants to add the attribute to the + * mft record this is the correct place to insert it into. If an attribute + * list attribute is present, @ctx->al_entry is the attribute list entry which + * collates just after the attribute list entry of the attribute being searched + * for, i.e. if one wants to add the attribute to the mft record this is the + * correct place to insert its attribute list entry into. + * + * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is + * then undefined and in particular you should not rely on it not changing. + */ +int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const VCN lowest_vcn, const u8 *val, const u32 val_len, + ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering."); + BUG_ON(IS_ERR(ctx->mrec)); + if (ctx->base_ntfs_ino) + base_ni = ctx->base_ntfs_ino; + else + base_ni = ctx->ntfs_ino; + /* Sanity check, just for debugging really. */ + BUG_ON(!base_ni); + if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) + return ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn, + val, val_len, ctx); +} + +/** + * ntfs_attr_init_search_ctx - initialize an attribute search context + * @ctx: attribute search context to initialize + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Initialize the attribute search context @ctx with @ni and @mrec. + */ +static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, + ntfs_inode *ni, MFT_RECORD *mrec) +{ + *ctx = (ntfs_attr_search_ctx) { + .mrec = mrec, + /* Sanity checks are performed elsewhere. */ + .attr = (ATTR_RECORD*)((u8*)mrec + + le16_to_cpu(mrec->attrs_offset)), + .is_first = true, + .ntfs_ino = ni, + }; +} + +/** + * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context + * @ctx: attribute search context to reinitialize + * + * Reinitialize the attribute search context @ctx, unmapping an associated + * extent mft record if present, and initialize the search context again. + * + * This is used when a search for a new attribute is being started to reset + * the search context to the beginning. + */ +void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (likely(!ctx->base_ntfs_ino)) { + /* No attribute list. */ + ctx->is_first = true; + /* Sanity checks are performed elsewhere. */ + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + /* + * This needs resetting due to ntfs_external_attr_find() which + * can leave it set despite having zeroed ctx->base_ntfs_ino. + */ + ctx->al_entry = NULL; + return; + } /* Attribute list. */ + if (ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec); + return; +} + +/** + * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Allocate a new attribute search context, initialize it with @ni and @mrec, + * and return it. 
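+ * When no longer needed, the returned context should be released with
+ * ntfs_attr_put_search_ctx().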
Return NULL if allocation failed. + */ +ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) +{ + ntfs_attr_search_ctx *ctx; + + ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS); + if (ctx) + ntfs_attr_init_search_ctx(ctx, ni, mrec); + return ctx; +} + +/** + * ntfs_attr_put_search_ctx - release an attribute search context + * @ctx: attribute search context to free + * + * Release the attribute search context @ctx, unmapping an associated extent + * mft record if present. + */ +void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + kmem_cache_free(ntfs_attr_ctx_cache, ctx); + return; +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to find + * + * Search for the attribute definition record corresponding to the attribute + * @type in the $AttrDef system file. + * + * Return the attribute type definition record if found and NULL if not found. + */ +static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, + const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + BUG_ON(!vol->attrdef); + BUG_ON(!type); + for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef < + vol->attrdef_size && ad->type; ++ad) { + /* We have not found it yet, carry on searching. */ + if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type))) + continue; + /* We found the attribute; return it. */ + if (likely(ad->type == type)) + return ad; + /* We have gone too far already. No point in continuing. */ + break; + } + /* Attribute not found. */ + ntfs_debug("Attribute type 0x%x not found in $AttrDef.", + le32_to_cpu(type)); + return NULL; +} + +/** + * ntfs_attr_size_bounds_check - check a size of an attribute type for validity + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * @size: size which to check + * + * Check whether the @size in bytes is valid for an attribute of @type on the + * ntfs volume @vol. This information is obtained from $AttrDef system file. + * + * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not + * listed in $AttrDef. + */ +int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, + const s64 size) +{ + ATTR_DEF *ad; + + BUG_ON(size < 0); + /* + * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not + * listed in $AttrDef. + */ + if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024)) + return -ERANGE; + /* Get the $AttrDef entry for the attribute @type. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Do the bounds check. */ + if (((sle64_to_cpu(ad->min_size) > 0) && + size < sle64_to_cpu(ad->min_size)) || + ((sle64_to_cpu(ad->max_size) > 0) && size > + sle64_to_cpu(ad->max_size))) + return -ERANGE; + return 0; +} + +/** + * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be non-resident. This information is obtained from $AttrDef system file. + * + * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and + * -ENOENT if the attribute is not listed in $AttrDef. 
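+ *
+ * For example (an illustrative sketch only, mirroring the check done at the
+ * start of ntfs_attr_make_non_resident() below):
+ *	err = ntfs_attr_can_be_non_resident(vol, ni->type);
+ *	if (err == -EPERM)
+ *		... the attribute has to stay resident ...
+ *	else if (err)
+ *		... the attribute type is not defined on this volume ...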
+ */ +int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + /* Find the attribute definition record in $AttrDef. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Check the flags and return the result. */ + if (ad->flags & ATTR_DEF_RESIDENT) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_can_be_resident - check if an attribute can be resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be resident. This information is derived from our ntfs knowledge and may + * not be completely accurate, especially when user defined attributes are + * present. Basically we allow everything to be resident except for index + * allocation and $EA attributes. + * + * Return 0 if the attribute is allowed to be non-resident and -EPERM if not. + * + * Warning: In the system file $MFT the attribute $Bitmap must be non-resident + * otherwise windows will not boot (blue screen of death)! We cannot + * check for this here as we do not know which inode's $Bitmap is + * being asked about so the caller needs to special case this. + */ +int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + if (type == AT_INDEX_ALLOCATION) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_record_resize - resize an attribute record + * @m: mft record containing attribute record + * @a: attribute record to resize + * @new_size: new size in bytes to which to resize the attribute record @a + * + * Resize the attribute record @a, i.e. the resident part of the attribute, in + * the mft record @m to @new_size bytes. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) +{ + ntfs_debug("Entering for new_size %u.", new_size); + /* Align to 8 bytes if it is not already done. */ + if (new_size & 7) + new_size = (new_size + 7) & ~7; + /* If the actual attribute length has changed, move things around. */ + if (new_size != le32_to_cpu(a->length)) { + u32 new_muse = le32_to_cpu(m->bytes_in_use) - + le32_to_cpu(a->length) + new_size; + /* Not enough space in this mft record. */ + if (new_muse > le32_to_cpu(m->bytes_allocated)) + return -ENOSPC; + /* Move attributes following @a to their new location. */ + memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length), + le32_to_cpu(m->bytes_in_use) - ((u8*)a - + (u8*)m) - le32_to_cpu(a->length)); + /* Adjust @m to reflect the change in used space. */ + m->bytes_in_use = cpu_to_le32(new_muse); + /* Adjust @a to reflect the new size. */ + if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length)) + a->length = cpu_to_le32(new_size); + } + return 0; +} + +/** + * ntfs_resident_attr_value_resize - resize the value of a resident attribute + * @m: mft record containing attribute record + * @a: attribute record whose value to resize + * @new_size: new size in bytes to which to resize the attribute value of @a + * + * Resize the value of the attribute @a in the mft record @m to @new_size bytes. 
+ * If the value is made bigger, the newly allocated space is cleared. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, + const u32 new_size) +{ + u32 old_size; + + /* Resize the resident part of the attribute record. */ + if (ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + new_size)) + return -ENOSPC; + /* + * The resize succeeded! If we made the attribute value bigger, clear + * the area between the old size and @new_size. + */ + old_size = le32_to_cpu(a->data.resident.value_length); + if (new_size > old_size) + memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + old_size, 0, new_size - old_size); + /* Finally update the length of the attribute value. */ + a->data.resident.value_length = cpu_to_le32(new_size); + return 0; +} + +/** + * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute + * @ni: ntfs inode describing the attribute to convert + * @data_size: size of the resident data to copy to the non-resident attribute + * + * Convert the resident ntfs attribute described by the ntfs inode @ni to a + * non-resident one. + * + * @data_size must be equal to the attribute value size. This is needed since + * we need to know the size before we can map the mft record and our callers + * always know it. The reason we cannot simply read the size from the vfs + * inode i_size is that this is not necessarily uptodate. This happens when + * ntfs_attr_make_non_resident() is called in the ->truncate call path(s). + * + * Return 0 on success and -errno on error. The following error return codes + * are defined: + * -EPERM - The attribute is not allowed to be non-resident. + * -ENOMEM - Not enough memory. + * -ENOSPC - Not enough disk space. + * -EINVAL - Attribute not defined on the volume. + * -EIO - I/o error or other error. + * Note that -ENOSPC is also returned in the case that there is not enough + * space in the mft record to do the conversion. This can happen when the mft + * record is already very full. The caller is responsible for trying to make + * space in the mft record and trying again. FIXME: Do we need a separate + * error return code for this kind of -ENOSPC or is it always worth trying + * again in case the attribute may then fit in a resident state so no need to + * make it non-resident at all? Ho-hum... (AIA) + * + * NOTE to self: No changes in the attribute list are required to move from + * a resident to a non-resident attribute. + * + * Locking: - The caller must hold i_mutex on the inode. + */ +int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) +{ + s64 new_size; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + struct page *page; + runlist_element *rl; + u8 *kaddr; + unsigned long flags; + int mp_size, mp_ofs, name_ofs, arec_size, err, err2; + u32 attr_size; + u8 old_res_attr_flags; + + /* Check that the attribute is allowed to be non-resident. 
*/ + err = ntfs_attr_can_be_non_resident(vol, ni->type); + if (unlikely(err)) { + if (err == -EPERM) + ntfs_debug("Attribute is not allowed to be " + "non-resident."); + else + ntfs_debug("Attribute not defined on the NTFS " + "volume!"); + return err; + } + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + /* + * The size needs to be aligned to a cluster boundary for allocation + * purposes. + */ + new_size = (data_size + vol->cluster_size - 1) & + ~(vol->cluster_size - 1); + if (new_size > 0) { + /* + * Will need the page later and since the page lock nests + * outside all ntfs locks, we need to get the page now. + */ + page = find_or_create_page(vi->i_mapping, 0, + mapping_gfp_mask(vi->i_mapping)); + if (unlikely(!page)) + return -ENOMEM; + /* Start by allocating clusters to hold the attribute value. */ + rl = ntfs_cluster_alloc(vol, 0, new_size >> + vol->cluster_size_bits, -1, DATA_ZONE, true); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + ntfs_debug("Failed to allocate cluster%s, error code " + "%i.", (new_size >> + vol->cluster_size_bits) > 1 ? "s" : "", + err); + goto page_err_out; + } + } else { + rl = NULL; + page = NULL; + } + /* Determine the size of the mapping pairs array. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); + if (unlikely(mp_size < 0)) { + err = mp_size; + ntfs_debug("Failed to get size for mapping pairs array, error " + "code %i.", err); + goto rl_err_out; + } + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(NInoNonResident(ni)); + BUG_ON(a->non_resident); + /* + * Calculate new offsets for the name and the mapping pairs array. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + + sizeof(a->data.non_resident.compressed_size) + + 7) & ~7; + else + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + 7) & ~7; + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + /* + * Determine the size of the resident part of the now non-resident + * attribute record. + */ + arec_size = (mp_ofs + mp_size + 7) & ~7; + /* + * If the page is not uptodate bring it uptodate by copying from the + * attribute value. + */ + attr_size = le32_to_cpu(a->data.resident.value_length); + BUG_ON(attr_size != data_size); + if (page && !PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, (u8*)a + + le16_to_cpu(a->data.resident.value_offset), + attr_size); + memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); + SetPageUptodate(page); + } + /* Backup the attribute flag. */ + old_res_attr_flags = a->data.resident.flags; + /* Resize the resident part of the attribute record. 
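+	 * arec_size was computed above to hold the non-resident attribute
+	 * header, the (possibly moved) attribute name and the mapping pairs
+	 * array, so resizing the record to it makes room for the conversion
+	 * done below.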
*/ + err = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err)) + goto err_out; + /* + * Convert the resident part of the attribute record to describe a + * non-resident attribute. + */ + a->non_resident = 1; + /* Move the attribute name if it exists and update the offset. */ + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + a->name_offset = cpu_to_le16(name_ofs); + /* Setup the fields specific to non-resident attributes. */ + a->data.non_resident.lowest_vcn = 0; + a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> + vol->cluster_size_bits); + a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); + memset(&a->data.non_resident.reserved, 0, + sizeof(a->data.non_resident.reserved)); + a->data.non_resident.allocated_size = cpu_to_sle64(new_size); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size = + cpu_to_sle64(attr_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + a->data.non_resident.compression_unit = 0; + if (NInoCompressed(ni) || vol->major_ver < 3) + a->data.non_resident.compression_unit = 4; + a->data.non_resident.compressed_size = + a->data.non_resident.allocated_size; + } else + a->data.non_resident.compression_unit = 0; + /* Generate the mapping pairs array into the attribute record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, + arec_size - mp_ofs, rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_debug("Failed to build mapping pairs, error code %i.", + err); + goto undo_err_out; + } + /* Setup the in-memory attribute structure to be non-resident. */ + ni->runlist.rl = rl; + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_size; + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size = ni->allocated_size; + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << (a->data. + non_resident.compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed.block_size) - + 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident.compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = ni->allocated_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * This needs to be last since the address space operations ->readpage + * and ->writepage can run concurrently with us as they are not + * serialized on i_mutex. Note, we are not allowed to fail once we flip + * this switch, which is another reason to do this last. + */ + NInoSetNonResident(ni); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + if (page) { + set_page_dirty(page); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + } + ntfs_debug("Done."); + return 0; +undo_err_out: + /* Convert the attribute back into a resident attribute. */ + a->non_resident = 0; + /* Move the attribute name if it exists and update the offset. 
*/ + name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + + sizeof(a->data.resident.reserved) + 7) & ~7; + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + a->name_offset = cpu_to_le16(name_ofs); + arec_size = (mp_ofs + attr_size + 7) & ~7; + /* Resize the resident part of the attribute record. */ + err2 = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err2)) { + /* + * This cannot happen (well if memory corruption is at work it + * could happen in theory), but deal with it as well as we can. + * If the old size is too small, truncate the attribute, + * otherwise simply give it a larger allocated size. + * FIXME: Should check whether chkdsk complains when the + * allocated size is much bigger than the resident value size. + */ + arec_size = le32_to_cpu(a->length); + if ((mp_ofs + attr_size) > arec_size) { + err2 = attr_size; + attr_size = arec_size - mp_ofs; + ntfs_error(vol->sb, "Failed to undo partial resident " + "to non-resident attribute " + "conversion. Truncating inode 0x%lx, " + "attribute type 0x%x from %i bytes to " + "%i bytes to maintain metadata " + "consistency. THIS MEANS YOU ARE " + "LOSING %i BYTES DATA FROM THIS %s.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err2, attr_size, err2 - attr_size, + ((ni->type == AT_DATA) && + !ni->name_len) ? "FILE": "ATTRIBUTE"); + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = attr_size; + i_size_write(vi, attr_size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + } + /* Setup the fields specific to resident attributes. */ + a->data.resident.value_length = cpu_to_le32(attr_size); + a->data.resident.value_offset = cpu_to_le16(mp_ofs); + a->data.resident.flags = old_res_attr_flags; + memset(&a->data.resident.reserved, 0, + sizeof(a->data.resident.reserved)); + /* Copy the data from the page back to the attribute value. */ + if (page) { + kaddr = kmap_atomic(page, KM_USER0); + memcpy((u8*)a + mp_ofs, kaddr, attr_size); + kunmap_atomic(kaddr, KM_USER0); + } + /* Setup the allocated size in the ntfs inode in case it changed. */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = arec_size - mp_ofs; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ni->runlist.rl = NULL; + up_write(&ni->runlist.lock); +rl_err_out: + if (rl) { + if (ntfs_cluster_free_from_rl(vol, rl) < 0) { + ntfs_error(vol->sb, "Failed to release allocated " + "cluster(s) in error code path. Run " + "chkdsk to recover the lost " + "cluster(s)."); + NVolSetErrors(vol); + } + ntfs_free(rl); +page_err_out: + unlock_page(page); + page_cache_release(page); + } + if (err == -EINVAL) + err = -EIO; + return err; +} + +/** + * ntfs_attr_extend_allocation - extend the allocated space of an attribute + * @ni: ntfs inode of the attribute whose allocation to extend + * @new_alloc_size: new size in bytes to which to extend the allocation to + * @new_data_size: new size in bytes to which to extend the data to + * @data_start: beginning of region which is required to be non-sparse + * + * Extend the allocated space of an attribute described by the ntfs inode @ni + * to @new_alloc_size bytes. 
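Both the resident-to-non-resident conversion above and the allocation extension described here lean on the same power-of-two arithmetic: sizes are rounded up to the next cluster boundary and in-record offsets up to 8-byte alignment with mask expressions such as (x + size - 1) & ~(size - 1). A minimal stand-alone sketch of that idiom (the concrete numbers are illustrative only, not taken from the driver):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const int64_t cluster_size = 4096;	/* must be a power of two */
	int64_t data_size = 5000;

	/* Round a size up to the next cluster boundary. */
	int64_t new_size = (data_size + cluster_size - 1) & ~(cluster_size - 1);
	assert(new_size == 8192);

	/* Round an offset within the attribute record up to 8-byte alignment. */
	unsigned name_ofs = (66 + 7) & ~7;
	assert(name_ofs == 72);

	/* Round a byte position down to the start of its cluster. */
	int64_t start = 6000 & ~(cluster_size - 1);
	assert(start == 4096);
	return 0;
}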
If @data_start is -1, the whole extension may be + * implemented as a hole in the file (as long as both the volume and the ntfs + * inode @ni have sparse support enabled). If @data_start is >= 0, then the + * region between the old allocated size and @data_start - 1 may be made sparse + * but the regions between @data_start and @new_alloc_size must be backed by + * actual clusters. + * + * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size + * of the attribute is extended to @new_data_size. Note that the i_size of the + * vfs inode is not updated. Only the data size in the base attribute record + * is updated. The caller has to update i_size separately if this is required. + * WARNING: It is a BUG() for @new_data_size to be smaller than the old data + * size as well as for @new_data_size to be greater than @new_alloc_size. + * + * For resident attributes this involves resizing the attribute record and if + * necessary moving it and/or other attributes into extent mft records and/or + * converting the attribute to a non-resident attribute which in turn involves + * extending the allocation of a non-resident attribute as described below. + * + * For non-resident attributes this involves allocating clusters in the data + * zone on the volume (except for regions that are being made sparse) and + * extending the run list to describe the allocated clusters as well as + * updating the mapping pairs array of the attribute. This in turn involves + * resizing the attribute record and if necessary moving it and/or other + * attributes into extent mft records and/or splitting the attribute record + * into multiple extent attribute records. + * + * Also, the attribute list attribute is updated if present and in some of the + * above cases (the ones where extent mft records/attributes come into play), + * an attribute list attribute is created if not already present. + * + * Return the new allocated size on success and -errno on error. In the case + * that an error is encountered but a partial extension at least up to + * @data_start (if present) is possible, the allocation is partially extended + * and this is returned. This means the caller must check the returned size to + * determine if the extension was partial. If @data_start is -1 then partial + * allocations are not performed. + * + * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA. + * + * Locking: This function takes the runlist lock of @ni for writing as well as + * locking the mft record of the base ntfs inode. These locks are maintained + * throughout execution of the function. These locks are required so that the + * attribute can be resized safely and so that it can for example be converted + * from resident to non-resident safely. + * + * TODO: At present attribute list attribute handling is not implemented. + * + * TODO: At present it is not safe to call this function for anything other + * than the $DATA attribute(s) of an uncompressed and unencrypted file. + */ +s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, + const s64 new_data_size, const s64 data_start) +{ + VCN vcn; + s64 ll, allocated_size, start = data_start; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + runlist_element *rl, *rl2; + unsigned long flags; + int err, mp_size; + u32 attr_len = 0; /* Silence stupid gcc warning. 
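Because the function returns the new allocated size rather than zero on success, and may extend only partially when @data_start is not -1, callers are expected to compare the return value with the size they requested. A hedged caller-side sketch (the helper name example_extend_for_write and its write-path context are invented for illustration):

/*
 * Sketch only: extend the allocation before writing @count bytes at @pos.
 * On a partial extension, shrink the write to what was actually allocated.
 */
static int example_extend_for_write(ntfs_inode *ni, loff_t pos, size_t count)
{
	s64 want = pos + count;
	s64 got = ntfs_attr_extend_allocation(ni, want, -1, pos);

	if (got < 0)
		return got;		/* -errno, nothing usable was allocated */
	if (got < want)
		count = got - pos;	/* partial extension, write less */
	/* ... go on to write at most @count bytes at @pos ... */
	return 0;
}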
*/ + bool mp_rebuilt; + +#ifdef DEBUG + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_allocated_size 0x%llx, " + "new_allocated_size 0x%llx, new_data_size 0x%llx, " + "data_start 0x%llx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)allocated_size, + (unsigned long long)new_alloc_size, + (unsigned long long)new_data_size, + (unsigned long long)start); +#endif +retry_extend: + /* + * For non-resident attributes, @start and @new_size need to be aligned + * to cluster boundaries for allocation purposes. + */ + if (NInoNonResident(ni)) { + if (start > 0) + start &= ~(s64)vol->cluster_size_mask; + new_alloc_size = (new_alloc_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + } + BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size); + /* Check if new size is allowed in $AttrDef. */ + err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size); + if (unlikely(err)) { + /* Only emit errors when the write will fail completely. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because the new " + "allocation would exceed the " + "maximum allowed size for " + "this attribute type.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } else { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because this " + "attribute type is not " + "defined on the NTFS volume. " + "Possible corruption! You " + "should run chkdsk!", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } + } + /* Translate error code to be POSIX conformant for write(2). */ + if (err == -ERANGE) + err = -EFBIG; + else + err = -EIO; + return err; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* + * We will be modifying both the runlist (if non-resident) and the mft + * record so lock them both down. + */ + down_write(&ni->runlist.lock); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * If non-resident, seek to the last extent. If resident, there is + * only one extent, so seek to that. + */ + vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits : + 0; + /* + * Abort if someone did the work whilst we waited for the locks. If we + * just converted the attribute from resident to non-resident it is + * likely that exactly this has happened already. We cannot quite + * abort if we need to update the data size. + */ + if (unlikely(new_alloc_size <= allocated_size)) { + ntfs_debug("Allocated size already exceeds requested size."); + new_alloc_size = allocated_size; + if (new_data_size < 0) + goto done; + /* + * We want the first attribute extent so that we can update the + * data size. 
+ */ + vcn = 0; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + /* Use goto to reduce indentation. */ + if (a->non_resident) + goto do_non_resident_extend; + BUG_ON(NInoNonResident(ni)); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + /* + * Extend the attribute record to be able to store the new attribute + * size. ntfs_attr_record_resize() will not do anything if the size is + * not changing. + */ + if (new_alloc_size < vol->mft_record_size && + !ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + + new_alloc_size)) { + /* The resize succeeded! */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + write_unlock_irqrestore(&ni->size_lock, flags); + if (new_data_size >= 0) { + BUG_ON(new_data_size < attr_len); + a->data.resident.value_length = + cpu_to_le32((u32)new_data_size); + } + goto flush_done; + } + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the extension process. + */ + err = ntfs_attr_make_non_resident(ni, attr_len); + if (likely(!err)) + goto retry_extend; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + /* Only emit errors when the write will fail completely. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the conversion from resident " + "to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft " + "record/on disk for the non-resident " + "attribute value. This case is not " + "implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not " + "implemented yet."); + } + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. 
Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_extend: + BUG_ON(!NInoNonResident(ni)); + if (new_alloc_size == allocated_size) { + BUG_ON(vcn); + goto alloc_done; + } + /* + * If the data starts after the end of the old allocation, this is a + * $DATA attribute and sparse attributes are enabled on the volume and + * for this inode, then create a sparse region between the old + * allocated size and the start of the data. Otherwise simply proceed + * with filling the whole space between the old allocated size and the + * new allocated size with clusters. + */ + if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA || + !NVolSparseEnabled(vol) || NInoSparseDisabled(ni)) + goto skip_sparse; + // TODO: This is not implemented yet. We just fill in with real + // clusters for now... + ntfs_debug("Inserting holes is not-implemented yet. Falling back to " + "allocating real clusters instead."); +skip_sparse: + rl = ni->runlist.rl; + if (likely(rl)) { + /* Seek to the end of the runlist. */ + while (rl->length) + rl++; + } + /* If this attribute extent is not mapped, map it now. */ + if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED || + (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl && + (rl-1)->lcn == LCN_RL_NOT_MAPPED))) { + if (!rl && !allocated_size) + goto first_alloc; + rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because the " + "mapping of a runlist " + "fragment failed with error " + "code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err); + if (err != -ENOMEM) + err = -EIO; + goto err_out; + } + ni->runlist.rl = rl; + /* Seek to the end of the runlist. */ + while (rl->length) + rl++; + } + /* + * We now know the runlist of the last extent is mapped and @rl is at + * the end of the runlist. We want to begin allocating clusters + * starting at the last allocated cluster to reduce fragmentation. If + * there are no valid LCNs in the attribute we let the cluster + * allocator choose the starting cluster. + */ + /* If the last LCN is a hole or simillar seek back to last real LCN. */ + while (rl->lcn < 0 && rl > ni->runlist.rl) + rl--; +first_alloc: + // FIXME: Need to implement partial allocations so at least part of the + // write can be performed when start >= 0. (Needed for POSIX write(2) + // conformance.) + rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits, + (new_alloc_size - allocated_size) >> + vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ? 
+ rl->lcn + rl->length : -1, DATA_ZONE, true); + if (IS_ERR(rl2)) { + err = PTR_ERR(rl2); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the allocation of clusters " + "failed with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM && err != -ENOSPC) + err = -EIO; + goto err_out; + } + rl = ntfs_runlists_merge(ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the runlist merge failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to release allocated " + "cluster(s) in error code path. Run " + "chkdsk to recover the lost " + "cluster(s)."); + NVolSetErrors(vol); + } + ntfs_free(rl2); + goto err_out; + } + ni->runlist.rl = rl; + ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size - + allocated_size) >> vol->cluster_size_bits); + /* Find the runlist element with which the attribute extent starts. */ + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + rl2 = ntfs_rl_find_vcn_nolock(rl, ll); + BUG_ON(!rl2); + BUG_ON(!rl2->length); + BUG_ON(rl2->lcn < LCN_HOLE); + mp_rebuilt = false; + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + err = mp_size; + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because determining the size for the " + "mapping pairs failed with error code " + "%i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + goto undo_alloc; + } + /* Extend the attribute record to fit the bigger mapping pairs array. */ + attr_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(err)) { + BUG_ON(err != -ENOSPC); + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record, + // possibly by extending this extent partially and filling it + // and creating a new extent for the remainder, or by making + // other attributes non-resident and/or by moving other + // attributes out of this mft record. + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Not enough space in the mft " + "record for the extended attribute " + "record. This case is not " + "implemented yet."); + err = -EOPNOTSUPP; + goto undo_alloc; + } + mp_rebuilt = true; + /* Generate the mapping pairs array directly into the attr record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(err)) { + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because building the mapping pairs " + "failed with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> + vol->cluster_size_bits) - 1); + /* + * We now have extended the allocated size of the attribute. 
Reflect + * this in the ntfs_inode structure and the attribute record. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto restore_undo_alloc; + /* @m is not used any more so no need to set it. */ + a = ctx->attr; + } + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); + /* + * FIXME: This would fail if @ni is a directory, $MFT, or an index, + * since those can have sparse/compressed set. For example can be + * set compressed even though it is not compressed itself and in that + * case the bit means that files are to be created compressed in the + * directory... At present this is ok as this code is only called for + * regular files, and only for their $DATA attribute(s). + * FIXME: The calculation is wrong if we created a hole above. For now + * it does not matter as we never create holes. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size += new_alloc_size - allocated_size; + a->data.non_resident.compressed_size = + cpu_to_sle64(ni->itype.compressed.size); + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); +alloc_done: + if (new_data_size >= 0) { + BUG_ON(new_data_size < + sle64_to_cpu(a->data.non_resident.data_size)); + a->data.non_resident.data_size = cpu_to_sle64(new_data_size); + } +flush_done: + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + ntfs_debug("Done, new_allocated_size 0x%llx.", + (unsigned long long)new_alloc_size); + return new_alloc_size; +restore_undo_alloc: + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot complete extension of allocation " + "of inode 0x%lx, attribute type 0x%x, because " + "lookup of first attribute extent failed with " + "error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err == -ENOENT) + err = -EIO; + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, + allocated_size >> vol->cluster_size_bits, NULL, 0, + ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "attribute in error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + /* + * FIXME: This would fail if @ni is a directory... See above. + * FIXME: The calculation is wrong if we created a hole above. + * For now it does not matter as we never create holes. 
+ */ + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size += new_alloc_size - + allocated_size; + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * The only thing that is now wrong is the allocated size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return err; + } + ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64( + (allocated_size >> vol->cluster_size_bits) - 1); +undo_alloc: + ll = allocated_size >> vol->cluster_size_bits; + if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to release allocated cluster(s) " + "in error code path. Run chkdsk to recover " + "the lost cluster(s)."); + NVolSetErrors(vol); + } + m = ctx->mrec; + a = ctx->attr; + /* + * If the runlist truncation fails and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to %s in error code path. Run " + "chkdsk to recover.", IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist"); + NVolSetErrors(vol); + } else if (mp_rebuilt) { + if (ntfs_attr_record_resize(m, a, attr_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident. + mapping_pairs_offset), attr_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), rl2, ll, -1, + NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +conv_err_out: + ntfs_debug("Failed. Returning error code %i.", err); + return err; +} + +/** + * ntfs_attr_set - fill (a part of) an attribute with a byte + * @ni: ntfs inode describing the attribute to fill + * @ofs: offset inside the attribute at which to start to fill + * @cnt: number of bytes to fill + * @val: the unsigned 8-bit value with which to fill the attribute + * + * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at + * byte offset @ofs inside the attribute with the constant byte @val. + * + * This function is effectively like memset() applied to an ntfs attribute. + * Note thie function actually only operates on the page cache pages belonging + * to the ntfs attribute and it marks them dirty after doing the memset(). + * Thus it relies on the vm dirty page write code paths to cause the modified + * pages to be written to the mft record/disk. + * + * Return 0 on success and -errno on error. An error code of -ESPIPE means + * that @ofs + @cnt were outside the end of the attribute and no write was + * performed. 
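Given that ntfs_attr_set() is effectively memset() over the attribute's page cache pages, the typical caller zeroes a range that has just been allocated but not yet initialized. A minimal hedged sketch (the helper name and the two size variables are hypothetical):

/* Sketch: zero the region between the old and the new initialized size. */
static int example_zero_gap(ntfs_inode *ni, s64 old_init_size, s64 new_init_size)
{
	int err = ntfs_attr_set(ni, old_init_size,
			new_init_size - old_init_size, 0);
	if (err == -ESPIPE)
		ntfs_debug("Range extends beyond the end of the attribute, "
				"nothing was written.");
	return err;
}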
+ */ +int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) +{ + ntfs_volume *vol = ni->vol; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + pgoff_t idx, end; + unsigned start_ofs, end_ofs, size; + + ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", + (long long)ofs, (long long)cnt, val); + BUG_ON(ofs < 0); + BUG_ON(cnt < 0); + if (!cnt) + goto done; + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + mapping = VFS_I(ni)->i_mapping; + /* Work out the starting index and page offset. */ + idx = ofs >> PAGE_CACHE_SHIFT; + start_ofs = ofs & ~PAGE_CACHE_MASK; + /* Work out the ending index and page offset. */ + end = ofs + cnt; + end_ofs = end & ~PAGE_CACHE_MASK; + /* If the end is outside the inode size return -ESPIPE. */ + if (unlikely(end > i_size_read(VFS_I(ni)))) { + ntfs_error(vol->sb, "Request exceeds end of attribute."); + return -ESPIPE; + } + end >>= PAGE_CACHE_SHIFT; + /* If there is a first partial page, need to do it the slow way. */ + if (start_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read first partial " + "page (error, index 0x%lx).", idx); + return PTR_ERR(page); + } + /* + * If the last page is the same as the first page, need to + * limit the write to the end offset. + */ + size = PAGE_CACHE_SIZE; + if (idx == end) + size = end_ofs; + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + start_ofs, val, size - start_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + page_cache_release(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + if (idx == end) + goto done; + idx++; + } + /* Do the whole pages the fast way. */ + for (; idx < end; idx++) { + /* Find or create the current page. (The page is locked.) */ + page = grab_cache_page(mapping, idx); + if (unlikely(!page)) { + ntfs_error(vol->sb, "Insufficient memory to grab " + "page (index 0x%lx).", idx); + return -ENOMEM; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, val, PAGE_CACHE_SIZE); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + /* + * If the page has buffers, mark them uptodate since buffer + * state and not page state is definitive in 2.6 kernels. + */ + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + set_buffer_uptodate(bh); + } while ((bh = bh->b_this_page) != head); + } + /* Now that buffers are uptodate, set the page uptodate, too. */ + SetPageUptodate(page); + /* + * Set the page and all its buffers dirty and mark the inode + * dirty, too. The VM will write the page later on. + */ + set_page_dirty(page); + /* Finally unlock and release the page. */ + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } + /* If there is a last partial page, need to do it the slow way. 
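The body of ntfs_attr_set() splits the byte range into a partial first page, a run of whole pages, and a partial last page using the page-cache shift and mask. A small user-space model of that split, assuming 4096-byte pages (the numbers are made up for illustration):

#include <assert.h>
#include <stdint.h>

#define MODEL_PAGE_SHIFT	12
#define MODEL_PAGE_SIZE		(1UL << MODEL_PAGE_SHIFT)
#define MODEL_PAGE_MASK		(~(MODEL_PAGE_SIZE - 1))

int main(void)
{
	int64_t ofs = 5000, cnt = 10000;
	int64_t end = ofs + cnt;

	unsigned long idx = ofs >> MODEL_PAGE_SHIFT;	  /* first page index */
	unsigned start_ofs = ofs & ~MODEL_PAGE_MASK;	  /* offset into first page */
	unsigned end_ofs = end & ~MODEL_PAGE_MASK;	  /* bytes used in last page */
	unsigned long end_idx = end >> MODEL_PAGE_SHIFT;  /* last page index */

	assert(idx == 1 && start_ofs == 904);
	assert(end_idx == 3 && end_ofs == 2712);
	/* Pages idx + 1 .. end_idx - 1 (here just page 2) are filled whole. */
	return 0;
}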
*/ + if (end_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read last partial page " + "(error, index 0x%lx).", idx); + return PTR_ERR(page); + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, val, end_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + page_cache_release(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } +done: + ntfs_debug("Done."); + return 0; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h new file mode 100644 index 00000000..3c8b74c9 --- /dev/null +++ b/fs/ntfs/attrib.h @@ -0,0 +1,116 @@ +/* + * attrib.h - Defines for attribute handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_ATTRIB_H +#define _LINUX_NTFS_ATTRIB_H + +#include "endian.h" +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +/** + * ntfs_attr_search_ctx - used in attribute search functions + * @mrec: buffer containing mft record to search + * @attr: attribute record in @mrec where to begin/continue search + * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after + * + * Structure must be initialized to zero before the first call to one of the + * attribute search functions. Initialize @mrec to point to the mft record to + * search, and @attr to point to the first attribute within @mrec (not necessary + * if calling the _first() functions), and set @is_first to 'true' (not necessary + * if calling the _first() functions). + * + * If @is_first is 'true', the search begins with @attr. If @is_first is 'false', + * the search begins after @attr. This is so that, after the first call to one + * of the search attribute functions, we can call the function again, without + * any modification of the search context, to automagically get the next + * matching attribute. 
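The "call it again to get the next match" convention translates into a simple enumeration loop. A hedged sketch of that pattern, using the declarations below and assuming (as the error handling elsewhere in this patch suggests) that -ENOENT signals the end of the matches; mapping and unmapping of the mft record are left out for brevity, and passing a NULL name is taken here to mean "match on type only":

/* Sketch: visit every attribute of type @type described by @m / @ni. */
static int example_enumerate_attrs(ntfs_inode *ni, MFT_RECORD *m, ATTR_TYPE type)
{
	ntfs_attr_search_ctx *ctx;
	int err;

	ctx = ntfs_attr_get_search_ctx(ni, m);
	if (!ctx)
		return -ENOMEM;
	while (!(err = ntfs_attr_lookup(type, NULL, 0, CASE_SENSITIVE, 0,
			NULL, 0, ctx))) {
		ATTR_RECORD *a = ctx->attr;

		ntfs_debug("Found attribute, length 0x%x.",
				(unsigned)le32_to_cpu(a->length));
	}
	if (err == -ENOENT)
		err = 0;	/* ran off the end of the matching attributes */
	ntfs_attr_put_search_ctx(ctx);
	return err;
}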
+ */ +typedef struct { + MFT_RECORD *mrec; + ATTR_RECORD *attr; + bool is_first; + ntfs_inode *ntfs_ino; + ATTR_LIST_ENTRY *al_entry; + ntfs_inode *base_ntfs_ino; + MFT_RECORD *base_mrec; + ATTR_RECORD *base_attr; +} ntfs_attr_search_ctx; + +extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, + ntfs_attr_search_ctx *ctx); +extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); + +extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const bool write_locked); + +extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, + const VCN vcn, ntfs_attr_search_ctx *ctx); + +int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const VCN lowest_vcn, const u8 *val, const u32 val_len, + ntfs_attr_search_ctx *ctx); + +extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start, + const s64 size, const s64 initialized_size); + +static inline s64 ntfs_attr_size(const ATTR_RECORD *a) +{ + if (!a->non_resident) + return (s64)le32_to_cpu(a->data.resident.value_length); + return sle64_to_cpu(a->data.non_resident.data_size); +} + +extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx); +extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, + MFT_RECORD *mrec); +extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx); + +#ifdef NTFS_RW + +extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol, + const ATTR_TYPE type, const s64 size); +extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, + const ATTR_TYPE type); +extern int ntfs_attr_can_be_resident(const ntfs_volume *vol, + const ATTR_TYPE type); + +extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size); +extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, + const u32 new_size); + +extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size); + +extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, + const s64 new_data_size, const s64 data_start); + +extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, + const u8 val); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_ATTRIB_H */ diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c new file mode 100644 index 00000000..0809cf87 --- /dev/null +++ b/fs/ntfs/bitmap.c @@ -0,0 +1,193 @@ +/* + * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include + +#include "bitmap.h" +#include "debug.h" +#include "aops.h" +#include "ntfs.h" + +/** + * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * @value: value to set the bits to (i.e. 0 or 1) + * @is_rollback: if 'true' this is a rollback operation + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi to @value, where @value is either 0 or 1. + * + * @is_rollback should always be 'false', it is for internal use to rollback + * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead. + * + * Return 0 on success and -errno on error. + */ +int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, + const s64 count, const u8 value, const bool is_rollback) +{ + s64 cnt = count; + pgoff_t index, end_index; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + int pos, len; + u8 bit; + + BUG_ON(!vi); + ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, " + "value %u.%s", vi->i_ino, (unsigned long long)start_bit, + (unsigned long long)cnt, (unsigned int)value, + is_rollback ? " (rollback)" : ""); + BUG_ON(start_bit < 0); + BUG_ON(cnt < 0); + BUG_ON(value > 1); + /* + * Calculate the indices for the pages containing the first and last + * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. + */ + index = start_bit >> (3 + PAGE_CACHE_SHIFT); + end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT); + + /* Get the page containing the first bit (@start_bit). */ + mapping = vi->i_mapping; + page = ntfs_map_page(mapping, index); + if (IS_ERR(page)) { + if (!is_rollback) + ntfs_error(vi->i_sb, "Failed to map first page (error " + "%li), aborting.", PTR_ERR(page)); + return PTR_ERR(page); + } + kaddr = page_address(page); + + /* Set @pos to the position of the byte containing @start_bit. */ + pos = (start_bit >> 3) & ~PAGE_CACHE_MASK; + + /* Calculate the position of @start_bit in the first byte. */ + bit = start_bit & 7; + + /* If the first byte is partial, modify the appropriate bits in it. */ + if (bit) { + u8 *byte = kaddr + pos; + while ((bit & 7) && cnt) { + cnt--; + if (value) + *byte |= 1 << bit++; + else + *byte &= ~(1 << bit++); + } + /* If we are done, unmap the page and return success. */ + if (!cnt) + goto done; + + /* Update @pos to the new position. */ + pos++; + } + /* + * Depending on @value, modify all remaining whole bytes in the page up + * to @cnt. + */ + len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos); + memset(kaddr + pos, value ? 0xff : 0, len); + cnt -= len << 3; + + /* Update @len to point to the first not-done byte in the page. */ + if (cnt < 8) + len += pos; + + /* If we are not in the last page, deal with all subsequent pages. */ + while (index < end_index) { + BUG_ON(cnt <= 0); + + /* Update @index and get the next page. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, ++index); + if (IS_ERR(page)) + goto rollback; + kaddr = page_address(page); + /* + * Depending on @value, modify all remaining whole bytes in the + * page up to @cnt. 
+ */ + len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE); + memset(kaddr, value ? 0xff : 0, len); + cnt -= len << 3; + } + /* + * The currently mapped page is the last one. If the last byte is + * partial, modify the appropriate bits in it. Note, @len is the + * position of the last byte inside the page. + */ + if (cnt) { + u8 *byte; + + BUG_ON(cnt > 7); + + bit = cnt; + byte = kaddr + len; + while (bit--) { + if (value) + *byte |= 1 << bit; + else + *byte &= ~(1 << bit); + } + } +done: + /* We are done. Unmap the page and return success. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +rollback: + /* + * Current state: + * - no pages are mapped + * - @count - @cnt is the number of bits that have been modified + */ + if (is_rollback) + return PTR_ERR(page); + if (count != cnt) + pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt, + value ? 0 : 1, true); + else + pos = 0; + if (!pos) { + /* Rollback was successful. */ + ntfs_error(vi->i_sb, "Failed to map subsequent page (error " + "%li), aborting.", PTR_ERR(page)); + } else { + /* Rollback failed. */ + ntfs_error(vi->i_sb, "Failed to map subsequent page (error " + "%li) and rollback failed (error %i). " + "Aborting and leaving inconsistent metadata. " + "Unmount and run chkdsk.", PTR_ERR(page), pos); + NVolSetErrors(NTFS_SB(vi->i_sb)); + } + return PTR_ERR(page); +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h new file mode 100644 index 00000000..72c9ad8b --- /dev/null +++ b/fs/ntfs/bitmap.h @@ -0,0 +1,118 @@ +/* + * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_BITMAP_H +#define _LINUX_NTFS_BITMAP_H + +#ifdef NTFS_RW + +#include + +#include "types.h" + +extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, + const s64 count, const u8 value, const bool is_rollback); + +/** + * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * @value: value to set the bits to (i.e. 0 or 1) + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi to @value, where @value is either 0 or 1. + * + * Return 0 on success and -errno on error. 
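Locating the run comes down to splitting the bit number three ways, exactly as at the top of __ntfs_bitmap_set_bits_in_run(): a page index (each page holds 8 * PAGE_CACHE_SIZE bits), a byte offset within that page, and a bit offset within that byte. A stand-alone model assuming 4096-byte pages (the start bit is an arbitrary example):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const unsigned page_shift = 12;		/* 4096-byte pages */
	int64_t start_bit = 40005;

	int64_t index = start_bit >> (3 + page_shift);	/* page holding the bit */
	unsigned pos = (start_bit >> 3) & ((1u << page_shift) - 1); /* byte in page */
	unsigned bit = start_bit & 7;			/* bit within that byte */

	/* 40005 = 1 * 32768 + 904 * 8 + 5 */
	assert(index == 1 && pos == 904 && bit == 5);
	return 0;
}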
+ */ +static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi, + const s64 start_bit, const s64 count, const u8 value) +{ + return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value, + false); +} + +/** + * ntfs_bitmap_set_run - set a run of bits in a bitmap + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit, + const s64 count) +{ + return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1); +} + +/** + * ntfs_bitmap_clear_run - clear a run of bits in a bitmap + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to clear + * @count: number of bits to clear + * + * Clear @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit, + const s64 count) +{ + return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0); +} + +/** + * ntfs_bitmap_set_bit - set a bit in a bitmap + * @vi: vfs inode describing the bitmap + * @bit: bit to set + * + * Set bit @bit in the bitmap described by the vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit) +{ + return ntfs_bitmap_set_run(vi, bit, 1); +} + +/** + * ntfs_bitmap_clear_bit - clear a bit in a bitmap + * @vi: vfs inode describing the bitmap + * @bit: bit to clear + * + * Clear bit @bit in the bitmap described by the vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit) +{ + return ntfs_bitmap_clear_run(vi, bit, 1); +} + +#endif /* NTFS_RW */ + +#endif /* defined _LINUX_NTFS_BITMAP_H */ diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c new file mode 100644 index 00000000..4a28ab38 --- /dev/null +++ b/fs/ntfs/collate.c @@ -0,0 +1,124 @@ +/* + * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "collate.h" +#include "debug.h" +#include "ntfs.h" + +static int ntfs_collate_binary(ntfs_volume *vol, + const void *data1, const int data1_len, + const void *data2, const int data2_len) +{ + int rc; + + ntfs_debug("Entering."); + rc = memcmp(data1, data2, min(data1_len, data2_len)); + if (!rc && (data1_len != data2_len)) { + if (data1_len < data2_len) + rc = -1; + else + rc = 1; + } + ntfs_debug("Done, returning %i", rc); + return rc; +} + +static int ntfs_collate_ntofs_ulong(ntfs_volume *vol, + const void *data1, const int data1_len, + const void *data2, const int data2_len) +{ + int rc; + u32 d1, d2; + + ntfs_debug("Entering."); + // FIXME: We don't really want to bug here. + BUG_ON(data1_len != data2_len); + BUG_ON(data1_len != 4); + d1 = le32_to_cpup(data1); + d2 = le32_to_cpup(data2); + if (d1 < d2) + rc = -1; + else { + if (d1 == d2) + rc = 0; + else + rc = 1; + } + ntfs_debug("Done, returning %i", rc); + return rc; +} + +typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int, + const void *, const int); + +static ntfs_collate_func_t ntfs_do_collate0x0[3] = { + ntfs_collate_binary, + NULL/*ntfs_collate_file_name*/, + NULL/*ntfs_collate_unicode_string*/, +}; + +static ntfs_collate_func_t ntfs_do_collate0x1[4] = { + ntfs_collate_ntofs_ulong, + NULL/*ntfs_collate_ntofs_sid*/, + NULL/*ntfs_collate_ntofs_security_hash*/, + NULL/*ntfs_collate_ntofs_ulongs*/, +}; + +/** + * ntfs_collate - collate two data items using a specified collation rule + * @vol: ntfs volume to which the data items belong + * @cr: collation rule to use when comparing the items + * @data1: first data item to collate + * @data1_len: length in bytes of @data1 + * @data2: second data item to collate + * @data2_len: length in bytes of @data2 + * + * Collate the two data items @data1 and @data2 using the collation rule @cr + * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before, + * to match, or to collate after @data2. + * + * For speed we use the collation rule @cr as an index into two tables of + * function pointers to call the appropriate collation function. + */ +int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, + const void *data1, const int data1_len, + const void *data2, const int data2_len) { + int i; + + ntfs_debug("Entering."); + /* + * FIXME: At the moment we only support COLLATION_BINARY and + * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now. + */ + BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG); + i = le32_to_cpu(cr); + BUG_ON(i < 0); + if (i <= 0x02) + return ntfs_do_collate0x0[i](vol, data1, data1_len, + data2, data2_len); + BUG_ON(i < 0x10); + i -= 0x10; + if (likely(i <= 3)) + return ntfs_do_collate0x1[i](vol, data1, data1_len, + data2, data2_len); + BUG(); + return 0; +} diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h new file mode 100644 index 00000000..aba83347 --- /dev/null +++ b/fs/ntfs/collate.h @@ -0,0 +1,50 @@ +/* + * collate.h - Defines for NTFS kernel collation handling. Part of the + * Linux-NTFS project. 
+ * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_COLLATE_H +#define _LINUX_NTFS_COLLATE_H + +#include "types.h" +#include "volume.h" + +static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) { + int i; + + /* + * FIXME: At the moment we only support COLLATION_BINARY and + * COLLATION_NTOFS_ULONG, so we return false for everything else for + * now. + */ + if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG)) + return false; + i = le32_to_cpu(cr); + if (likely(((i >= 0) && (i <= 0x02)) || + ((i >= 0x10) && (i <= 0x13)))) + return true; + return false; +} + +extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, + const void *data1, const int data1_len, + const void *data2, const int data2_len); + +#endif /* _LINUX_NTFS_COLLATE_H */ diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c new file mode 100644 index 00000000..ee4144ce --- /dev/null +++ b/fs/ntfs/compress.c @@ -0,0 +1,969 @@ +/** + * compress.c - NTFS kernel compressed attributes handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "inode.h" +#include "debug.h" +#include "ntfs.h" + +/** + * ntfs_compression_constants - enum of constants used in the compression code + */ +typedef enum { + /* Token types and access mask. */ + NTFS_SYMBOL_TOKEN = 0, + NTFS_PHRASE_TOKEN = 1, + NTFS_TOKEN_MASK = 1, + + /* Compression sub-block constants. */ + NTFS_SB_SIZE_MASK = 0x0fff, + NTFS_SB_SIZE = 0x1000, + NTFS_SB_IS_COMPRESSED = 0x8000, + + /* + * The maximum compression block size is by definition 16 * the cluster + * size, with the maximum supported cluster size being 4kiB. 
Thus the + * maximum compression buffer size is 64kiB, so we use this when + * initializing the compression buffer. + */ + NTFS_MAX_CB_SIZE = 64 * 1024, +} ntfs_compression_constants; + +/** + * ntfs_compression_buffer - one buffer for the decompression engine + */ +static u8 *ntfs_compression_buffer = NULL; + +/** + * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer + */ +static DEFINE_SPINLOCK(ntfs_cb_lock); + +/** + * allocate_compression_buffers - allocate the decompression buffers + * + * Caller has to hold the ntfs_lock mutex. + * + * Return 0 on success or -ENOMEM if the allocations failed. + */ +int allocate_compression_buffers(void) +{ + BUG_ON(ntfs_compression_buffer); + + ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE); + if (!ntfs_compression_buffer) + return -ENOMEM; + return 0; +} + +/** + * free_compression_buffers - free the decompression buffers + * + * Caller has to hold the ntfs_lock mutex. + */ +void free_compression_buffers(void) +{ + BUG_ON(!ntfs_compression_buffer); + vfree(ntfs_compression_buffer); + ntfs_compression_buffer = NULL; +} + +/** + * zero_partial_compressed_page - zero out of bounds compressed page region + */ +static void zero_partial_compressed_page(struct page *page, + const s64 initialized_size) +{ + u8 *kp = page_address(page); + unsigned int kp_ofs; + + ntfs_debug("Zeroing page region outside initialized size."); + if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) { + /* + * FIXME: Using clear_page() will become wrong when we get + * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem. + */ + clear_page(kp); + return; + } + kp_ofs = initialized_size & ~PAGE_CACHE_MASK; + memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs); + return; +} + +/** + * handle_bounds_compressed_page - test for&handle out of bounds compressed page + */ +static inline void handle_bounds_compressed_page(struct page *page, + const loff_t i_size, const s64 initialized_size) +{ + if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) && + (initialized_size < i_size)) + zero_partial_compressed_page(page, initialized_size); + return; +} + +/** + * ntfs_decompress - decompress a compression block into an array of pages + * @dest_pages: destination array of pages + * @dest_index: current index into @dest_pages (IN/OUT) + * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT) + * @dest_max_index: maximum index into @dest_pages (IN) + * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN) + * @xpage: the target page (-1 if none) (IN) + * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT) + * @cb_start: compression block to decompress (IN) + * @cb_size: size of compression block @cb_start in bytes (IN) + * @i_size: file size when we started the read (IN) + * @initialized_size: initialized file size when we started the read (IN) + * + * The caller must have disabled preemption. ntfs_decompress() reenables it when + * the critical section is finished. + * + * This decompresses the compression block @cb_start into the array of + * destination pages @dest_pages starting at index @dest_index into @dest_pages + * and at offset @dest_pos into the page @dest_pages[@dest_index]. + * + * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1. + * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified. + * + * @cb_start is a pointer to the compression block which needs decompressing + * and @cb_size is the size of @cb_start in bytes (8-64kiB). 
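Each sub-block begins with a little-endian 16-bit header: the low 12 bits (NTFS_SB_SIZE_MASK) hold the stored payload size minus one, and the NTFS_SB_IS_COMPRESSED bit marks a compressed sub-block, so the whole sub-block including the header spans (header & NTFS_SB_SIZE_MASK) + 3 bytes, as the range setup further down computes. A stand-alone sketch of that decoding (the header values are invented examples):

#include <assert.h>
#include <stdint.h>

#define NTFS_SB_SIZE_MASK	0x0fff
#define NTFS_SB_SIZE		0x1000
#define NTFS_SB_IS_COMPRESSED	0x8000

int main(void)
{
	/* Compressed sub-block whose stored payload is 1000 bytes. */
	uint16_t hdr = NTFS_SB_IS_COMPRESSED | (1000 - 1);

	unsigned sb_total = (hdr & NTFS_SB_SIZE_MASK) + 3; /* header + payload */
	assert(sb_total == 1002);
	assert(hdr & NTFS_SB_IS_COMPRESSED);

	/* An uncompressed sub-block always carries a full NTFS_SB_SIZE payload. */
	uint16_t raw_hdr = NTFS_SB_SIZE - 1;		   /* 0x0fff, flag clear */
	assert((raw_hdr & NTFS_SB_SIZE_MASK) + 3 - 2 == NTFS_SB_SIZE);
	return 0;
}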
+ * + * Return 0 if success or -EOVERFLOW on error in the compressed stream. + * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was + * completed during the decompression of the compression block (@cb_start). + * + * Warning: This function *REQUIRES* PAGE_CACHE_SIZE >= 4096 or it will blow up + * unpredicatbly! You have been warned! + * + * Note to hackers: This function may not sleep until it has finished accessing + * the compression block @cb_start as it is a per-CPU buffer. + */ +static int ntfs_decompress(struct page *dest_pages[], int *dest_index, + int *dest_ofs, const int dest_max_index, const int dest_max_ofs, + const int xpage, char *xpage_done, u8 *const cb_start, + const u32 cb_size, const loff_t i_size, + const s64 initialized_size) +{ + /* + * Pointers into the compressed data, i.e. the compression block (cb), + * and the therein contained sub-blocks (sb). + */ + u8 *cb_end = cb_start + cb_size; /* End of cb. */ + u8 *cb = cb_start; /* Current position in cb. */ + u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */ + u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ + + /* Variables for uncompressed data / destination. */ + struct page *dp; /* Current destination page being worked on. */ + u8 *dp_addr; /* Current pointer into dp. */ + u8 *dp_sb_start; /* Start of current sub-block in dp. */ + u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + + NTFS_SB_SIZE). */ + u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ + u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + + NTFS_SB_SIZE). */ + + /* Variables for tag and token parsing. */ + u8 tag; /* Current tag. */ + int token; /* Loop counter for the eight tokens in tag. */ + + /* Need this because we can't sleep, so need two stages. */ + int completed_pages[dest_max_index - *dest_index + 1]; + int nr_completed_pages = 0; + + /* Default error code. */ + int err = -EOVERFLOW; + + ntfs_debug("Entering, cb_size = 0x%x.", cb_size); +do_next_sb: + ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.", + cb - cb_start); + /* + * Have we reached the end of the compression block or the end of the + * decompressed data? The latter can happen for example if the current + * position in the compression block is one byte before its end so the + * first two checks do not detect it. + */ + if (cb == cb_end || !le16_to_cpup((le16*)cb) || + (*dest_index == dest_max_index && + *dest_ofs == dest_max_ofs)) { + int i; + + ntfs_debug("Completed. Returning success (0)."); + err = 0; +return_error: + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize completed pages. */ + if (nr_completed_pages > 0) { + for (i = 0; i < nr_completed_pages; i++) { + int di = completed_pages[i]; + + dp = dest_pages[di]; + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(dp, i_size, + initialized_size); + flush_dcache_page(dp); + kunmap(dp); + SetPageUptodate(dp); + unlock_page(dp); + if (di == xpage) + *xpage_done = 1; + else + page_cache_release(dp); + dest_pages[di] = NULL; + } + } + return err; + } + + /* Setup offsets for the current sub-block destination. */ + do_sb_start = *dest_ofs; + do_sb_end = do_sb_start + NTFS_SB_SIZE; + + /* Check that we are still within allowed boundaries. */ + if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) + goto return_overflow; + + /* Does the minimum size of a compressed sb overflow valid range? 
*/ + if (cb + 6 > cb_end) + goto return_overflow; + + /* Setup the current sub-block source pointers and validate range. */ + cb_sb_start = cb; + cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) + + 3; + if (cb_sb_end > cb_end) + goto return_overflow; + + /* Get the current destination page. */ + dp = dest_pages[*dest_index]; + if (!dp) { + /* No page present. Skip decompression of this sub-block. */ + cb = cb_sb_end; + + /* Advance destination position to next sub-block. */ + *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK; + if (!*dest_ofs && (++*dest_index > dest_max_index)) + goto return_overflow; + goto do_next_sb; + } + + /* We have a valid destination page. Setup the destination pointers. */ + dp_addr = (u8*)page_address(dp) + do_sb_start; + + /* Now, we are ready to process the current sub-block (sb). */ + if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { + ntfs_debug("Found uncompressed sub-block."); + /* This sb is not compressed, just copy it into destination. */ + + /* Advance source position to first data byte. */ + cb += 2; + + /* An uncompressed sb must be full size. */ + if (cb_sb_end - cb != NTFS_SB_SIZE) + goto return_overflow; + + /* Copy the block and advance the source position. */ + memcpy(dp_addr, cb, NTFS_SB_SIZE); + cb += NTFS_SB_SIZE; + + /* Advance destination position to next sub-block. */ + *dest_ofs += NTFS_SB_SIZE; + if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) { +finalize_page: + /* + * First stage: add current page index to array of + * completed pages. + */ + completed_pages[nr_completed_pages++] = *dest_index; + if (++*dest_index > dest_max_index) + goto return_overflow; + } + goto do_next_sb; + } + ntfs_debug("Found compressed sub-block."); + /* This sb is compressed, decompress it into destination. */ + + /* Setup destination pointers. */ + dp_sb_start = dp_addr; + dp_sb_end = dp_sb_start + NTFS_SB_SIZE; + + /* Forward to the first tag in the sub-block. */ + cb += 2; +do_next_tag: + if (cb == cb_sb_end) { + /* Check if the decompressed sub-block was not full-length. */ + if (dp_addr < dp_sb_end) { + int nr_bytes = do_sb_end - *dest_ofs; + + ntfs_debug("Filling incomplete sub-block with " + "zeroes."); + /* Zero remainder and update destination position. */ + memset(dp_addr, 0, nr_bytes); + *dest_ofs += nr_bytes; + } + /* We have finished the current sub-block. */ + if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) + goto finalize_page; + goto do_next_sb; + } + + /* Check we are still in range. */ + if (cb > cb_sb_end || dp_addr > dp_sb_end) + goto return_overflow; + + /* Get the next tag and advance to first token. */ + tag = *cb++; + + /* Parse the eight tokens described by the tag. */ + for (token = 0; token < 8; token++, tag >>= 1) { + u16 lg, pt, length, max_non_overlap; + register u16 i; + u8 *dp_back_addr; + + /* Check if we are done / still in range. */ + if (cb >= cb_sb_end || dp_addr > dp_sb_end) + break; + + /* Determine token type and parse appropriately.*/ + if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { + /* + * We have a symbol token, copy the symbol across, and + * advance the source and destination positions. + */ + *dp_addr++ = *cb++; + ++*dest_ofs; + + /* Continue with the next token. */ + continue; + } + + /* + * We have a phrase token. Make sure it is not the first tag in + * the sb as this is illegal and would confuse the code below. + */ + if (dp_addr == dp_sb_start) + goto return_overflow; + + /* + * Determine the number of bytes to go back (p) and the number + * of bytes to copy (l). 
We use an optimized algorithm in which + * we first calculate log2(current destination position in sb), + * which allows determination of l and p in O(1) rather than + * O(n). We just need an arch-optimized log2() function now. + */ + lg = 0; + for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) + lg++; + + /* Get the phrase token into i. */ + pt = le16_to_cpup((le16*)cb); + + /* + * Calculate starting position of the byte sequence in + * the destination using the fact that p = (pt >> (12 - lg)) + 1 + * and make sure we don't go too far back. + */ + dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; + if (dp_back_addr < dp_sb_start) + goto return_overflow; + + /* Now calculate the length of the byte sequence. */ + length = (pt & (0xfff >> lg)) + 3; + + /* Advance destination position and verify it is in range. */ + *dest_ofs += length; + if (*dest_ofs > do_sb_end) + goto return_overflow; + + /* The number of non-overlapping bytes. */ + max_non_overlap = dp_addr - dp_back_addr; + + if (length <= max_non_overlap) { + /* The byte sequence doesn't overlap, just copy it. */ + memcpy(dp_addr, dp_back_addr, length); + + /* Advance destination pointer. */ + dp_addr += length; + } else { + /* + * The byte sequence does overlap, copy non-overlapping + * part and then do a slow byte by byte copy for the + * overlapping part. Also, advance the destination + * pointer. + */ + memcpy(dp_addr, dp_back_addr, max_non_overlap); + dp_addr += max_non_overlap; + dp_back_addr += max_non_overlap; + length -= max_non_overlap; + while (length--) + *dp_addr++ = *dp_back_addr++; + } + + /* Advance source position and continue with the next token. */ + cb += 2; + } + + /* No tokens left in the current tag. Continue with the next tag. */ + goto do_next_tag; + +return_overflow: + ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); + goto return_error; +} + +/** + * ntfs_read_compressed_block - read a compressed block into the page cache + * @page: locked page in the compression block(s) we need to read + * + * When we are called the page has already been verified to be locked and the + * attribute is known to be non-resident, not encrypted, but compressed. + * + * 1. Determine which compression block(s) @page is in. + * 2. Get hold of all pages corresponding to this/these compression block(s). + * 3. Read the (first) compression block. + * 4. Decompress it into the corresponding pages. + * 5. Throw the compressed data away and proceed to 3. for the next compression + * block or return success if no more compression blocks left. + * + * Warning: We have to be careful what we do about existing pages. They might + * have been written to so that we would lose data if we were to just overwrite + * them with the out-of-date uncompressed data. + * + * FIXME: For PAGE_CACHE_SIZE > cb_size we are not doing the Right Thing(TM) at + * the end of the file I think. We need to detect this case and zero the out + * of bounds remainder of the page in question and mark it as handled. At the + * moment we would just return -EIO on such a page. This bug will only become + * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte + * clusters so is probably not going to be seen by anyone. Still this should + * be fixed. (AIA) + * + * FIXME: Again for PAGE_CACHE_SIZE > cb_size we are screwing up both in + * handling sparse and compressed cbs. (AIA) + * + * FIXME: At the moment we don't do any zeroing out in the case that + * initialized_size is less than data_size. 
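The phrase-token arithmetic in ntfs_decompress() above packs both the back distance and the copy length into a single 16-bit token, and the bit split between the two fields depends on how many bytes of the current sub-block have already been produced. A stand-alone sketch of just that calculation; the token value 0x2004 and the 256-byte position are made up for illustration, and the token is assumed to be already in CPU byte order.

#include <stdint.h>
#include <stdio.h>

/*
 * Split phrase token @pt into (distance back, length), given that @pos bytes
 * of the current sub-block have already been decompressed.  @pos must be at
 * least 1, since a phrase token can never be the first token of a sub-block.
 */
static void decode_phrase_token(unsigned int pos, uint16_t pt,
		unsigned int *back, unsigned int *len)
{
	unsigned int lg = 0, i;

	for (i = pos - 1; i >= 0x10; i >>= 1)
		lg++;
	*back = (pt >> (12 - lg)) + 1;		/* bytes to go back */
	*len = (pt & (0xfff >> lg)) + 3;	/* bytes to copy */
}

int main(void)
{
	unsigned int back, len;

	decode_phrase_token(256, 0x2004, &back, &len);
	printf("go back %u bytes, copy %u bytes\n", back, len);	/* 33 and 7 */
	return 0;
}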
This should be safe because of the + * nature of the compression algorithm used. Just in case we check and output + * an error message in read inode if the two sizes are not equal for a + * compressed file. (AIA) + */ +int ntfs_read_compressed_block(struct page *page) +{ + loff_t i_size; + s64 initialized_size; + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + ntfs_volume *vol = ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags, block_size = sb->s_blocksize; + unsigned char block_size_bits = sb->s_blocksize_bits; + u8 *cb, *cb_pos, *cb_end; + struct buffer_head **bhs; + unsigned long offset, index = page->index; + u32 cb_size = ni->itype.compressed.block_size; + u64 cb_size_mask = cb_size - 1UL; + VCN vcn; + LCN lcn; + /* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */ + VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >> + vol->cluster_size_bits; + /* + * The first vcn after the last wanted vcn (minimum alignment is again + * PAGE_CACHE_SIZE. + */ + VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1) + & ~cb_size_mask) >> vol->cluster_size_bits; + /* Number of compression blocks (cbs) in the wanted vcn range. */ + unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits + >> ni->itype.compressed.block_size_bits; + /* + * Number of pages required to store the uncompressed data from all + * compression blocks (cbs) overlapping @page. Due to alignment + * guarantees of start_vcn and end_vcn, no need to round up here. + */ + unsigned int nr_pages = (end_vcn - start_vcn) << + vol->cluster_size_bits >> PAGE_CACHE_SHIFT; + unsigned int xpage, max_page, cur_page, cur_ofs, i; + unsigned int cb_clusters, cb_max_ofs; + int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; + struct page **pages; + unsigned char xpage_done = 0; + + ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " + "%i.", index, cb_size, nr_pages); + /* + * Bad things happen if we get here for anything that is not an + * unnamed $DATA attribute. + */ + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + + pages = kmalloc(nr_pages * sizeof(struct page *), GFP_NOFS); + + /* Allocate memory to store the buffer heads we need. */ + bhs_size = cb_size / block_size * sizeof(struct buffer_head *); + bhs = kmalloc(bhs_size, GFP_NOFS); + + if (unlikely(!pages || !bhs)) { + kfree(bhs); + kfree(pages); + unlock_page(page); + ntfs_error(vol->sb, "Failed to allocate internal buffers."); + return -ENOMEM; + } + + /* + * We have already been given one page, this is the one we must do. + * Once again, the alignment guarantees keep it simple. + */ + offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT; + xpage = index - offset; + pages[xpage] = page; + /* + * The remaining pages need to be allocated and inserted into the page + * cache, alignment guarantees keep all the below much simpler. (-8 + */ + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(VFS_I(ni)); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + offset; + /* Is the page fully outside i_size? 
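Before any I/O is issued above, the requested page index is converted into the VCN range of the compression blocks that overlap it, and from that range the number of destination pages and the position of the target page. A quick sketch of the same arithmetic with invented geometry (512-byte clusters, 16-cluster / 8 KiB compression blocks, 4 KiB pages, a read of page index 5); none of these values come from a real volume.

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12, cluster_bits = 9;	/* 4 KiB pages, 512-byte clusters */
	unsigned long cb_size = 8192, cb_mask = cb_size - 1;	/* 8 KiB compression blocks */
	unsigned long index = 5;				/* page being read */

	long long start_vcn = (((long long)index << page_shift) &
			~(long long)cb_mask) >> cluster_bits;
	long long end_vcn = ((((long long)(index + 1) << page_shift) + cb_size - 1) &
			~(long long)cb_mask) >> cluster_bits;
	unsigned long nr_pages = (unsigned long)((end_vcn - start_vcn) <<
			cluster_bits >> page_shift);
	unsigned long xpage = index - (unsigned long)(start_vcn <<
			cluster_bits >> page_shift);

	/* Prints: vcn 32..48, 2 pages, xpage 1 */
	printf("vcn %lld..%lld, %lu pages, xpage %lu\n",
			start_vcn, end_vcn, nr_pages, xpage);
	return 0;
}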
(truncate in progress) */ + if (xpage >= max_page) { + kfree(bhs); + kfree(pages); + zero_user(page, 0, PAGE_CACHE_SIZE); + ntfs_debug("Compressed read outside i_size - truncated?"); + SetPageUptodate(page); + unlock_page(page); + return 0; + } + if (nr_pages < max_page) + max_page = nr_pages; + for (i = 0; i < max_page; i++, offset++) { + if (i != xpage) + pages[i] = grab_cache_page_nowait(mapping, offset); + page = pages[i]; + if (page) { + /* + * We only (re)read the page if it isn't already read + * in and/or dirty or we would be losing data or at + * least wasting our time. + */ + if (!PageDirty(page) && (!PageUptodate(page) || + PageError(page))) { + ClearPageError(page); + kmap(page); + continue; + } + unlock_page(page); + page_cache_release(page); + pages[i] = NULL; + } + } + + /* + * We have the runlist, and all the destination pages we need to fill. + * Now read the first compression block. + */ + cur_page = 0; + cur_ofs = 0; + cb_clusters = ni->itype.compressed.block_clusters; +do_next_cb: + nr_cbs--; + nr_bhs = 0; + + /* Read all cb buffer heads one cluster at a time. */ + rl = NULL; + for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; + vcn++) { + bool is_retry = false; + + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)vcn, + (unsigned long long)lcn); + if (lcn < 0) { + /* + * When we reach the first sparse cluster we have + * finished with the cb. + */ + if (lcn == LCN_HOLE) + break; + if (is_retry || lcn != LCN_RL_NOT_MAPPED) + goto rl_err; + is_retry = true; + /* + * Attempt to map runlist, dropping lock for the + * duration. + */ + up_read(&ni->runlist.lock); + if (!ntfs_map_runlist(ni, vcn)) + goto lock_retry_remap; + goto map_rl_err; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the lcn from device in chunks of block_size bytes. */ + max_block = block + (vol->cluster_size >> block_size_bits); + do { + ntfs_debug("block = 0x%x.", block); + if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) + goto getblk_err; + nr_bhs++; + } while (++block < max_block); + } + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Setup and initiate io on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (!trylock_buffer(tbh)) + continue; + if (unlikely(buffer_uptodate(tbh))) { + unlock_buffer(tbh); + continue; + } + get_bh(tbh); + tbh->b_end_io = end_buffer_read_sync; + submit_bh(READ, tbh); + } + + /* Wait for io completion on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (buffer_uptodate(tbh)) + continue; + wait_on_buffer(tbh); + /* + * We need an optimization barrier here, otherwise we start + * hitting the below fixup code when accessing a loopback + * mounted ntfs partition. This indicates either there is a + * race condition in the loop driver or, more likely, gcc + * overoptimises the code without the barrier and it doesn't + * do the Right Thing(TM). + */ + barrier(); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_warning(vol->sb, "Buffer is unlocked but not " + "uptodate! 
Unplugging the disk queue " + "and rescheduling."); + get_bh(tbh); + io_schedule(); + put_bh(tbh); + if (unlikely(!buffer_uptodate(tbh))) + goto read_err; + ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); + } + } + + /* + * Get the compression buffer. We must not sleep any more + * until we are finished with it. + */ + spin_lock(&ntfs_cb_lock); + cb = ntfs_compression_buffer; + + BUG_ON(!cb); + + cb_pos = cb; + cb_end = cb + cb_size; + + /* Copy the buffer heads into the contiguous buffer. */ + for (i = 0; i < nr_bhs; i++) { + memcpy(cb_pos, bhs[i]->b_data, block_size); + cb_pos += block_size; + } + + /* Just a precaution. */ + if (cb_pos + 2 <= cb + cb_size) + *(u16*)cb_pos = 0; + + /* Reset cb_pos back to the beginning. */ + cb_pos = cb; + + /* We now have both source (if present) and destination. */ + ntfs_debug("Successfully read the compression block."); + + /* The last page and maximum offset within it for the current cb. */ + cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size; + cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK; + cb_max_page >>= PAGE_CACHE_SHIFT; + + /* Catch end of file inside a compression block. */ + if (cb_max_page > max_page) + cb_max_page = max_page; + + if (vcn == start_vcn - cb_clusters) { + /* Sparse cb, zero out page range overlapping the cb. */ + ntfs_debug("Found sparse compression block."); + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + if (cb_max_ofs) + cb_max_page--; + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + /* + * FIXME: Using clear_page() will become wrong + * when we get PAGE_CACHE_SIZE != PAGE_SIZE but + * for now there is no problem. + */ + if (likely(!cur_ofs)) + clear_page(page_address(page)); + else + memset(page_address(page) + cur_ofs, 0, + PAGE_CACHE_SIZE - + cur_ofs); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur_page == xpage) + xpage_done = 1; + else + page_cache_release(page); + pages[cur_page] = NULL; + } + cb_pos += PAGE_CACHE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memset(page_address(page) + cur_ofs, 0, + cb_max_ofs - cur_ofs); + /* + * No need to update cb_pos at this stage: + * cb_pos += cb_max_ofs - cur_ofs; + */ + cur_ofs = cb_max_ofs; + } + } else if (vcn == start_vcn) { + /* We can't sleep so we need two stages. */ + unsigned int cur2_page = cur_page; + unsigned int cur_ofs2 = cur_ofs; + u8 *cb_pos2 = cb_pos; + + ntfs_debug("Found uncompressed compression block."); + /* Uncompressed cb, copy it to the destination pages. */ + /* + * TODO: As a big optimization, we could detect this case + * before we read all the pages and use block_read_full_page() + * on all full pages instead (we still have to treat partial + * pages especially but at least we are getting rid of the + * synchronous io for the majority of pages. + * Or if we choose not to do the read-ahead/-behind stuff, we + * could just return block_read_full_page(pages[xpage]) as long + * as PAGE_CACHE_SIZE <= cb_size. + */ + if (cb_max_ofs) + cb_max_page--; + /* First stage: copy data into destination pages. 
*/ + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + PAGE_CACHE_SIZE - cur_ofs); + cb_pos += PAGE_CACHE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + cb_max_ofs - cur_ofs); + cb_pos += cb_max_ofs - cur_ofs; + cur_ofs = cb_max_ofs; + } + /* We can sleep from now on, so drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize pages. */ + for (; cur2_page < cb_max_page; cur2_page++) { + page = pages[cur2_page]; + if (page) { + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(page, i_size, + initialized_size); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur2_page == xpage) + xpage_done = 1; + else + page_cache_release(page); + pages[cur2_page] = NULL; + } + cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2; + cur_ofs2 = 0; + if (cb_pos2 >= cb_end) + break; + } + } else { + /* Compressed cb, decompress it into the destination page(s). */ + unsigned int prev_cur_page = cur_page; + + ntfs_debug("Found compressed compression block."); + err = ntfs_decompress(pages, &cur_page, &cur_ofs, + cb_max_page, cb_max_ofs, xpage, &xpage_done, + cb_pos, cb_size - (cb_pos - cb), i_size, + initialized_size); + /* + * We can sleep from now on, lock already dropped by + * ntfs_decompress(). + */ + if (err) { + ntfs_error(vol->sb, "ntfs_decompress() failed in inode " + "0x%lx with error code %i. Skipping " + "this compression block.", + ni->mft_no, -err); + /* Release the unfinished pages. */ + for (; prev_cur_page < cur_page; prev_cur_page++) { + page = pages[prev_cur_page]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (prev_cur_page != xpage) + page_cache_release(page); + pages[prev_cur_page] = NULL; + } + } + } + } + + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + + /* Do we have more work to do? */ + if (nr_cbs) + goto do_next_cb; + + /* We no longer need the list of buffer heads. */ + kfree(bhs); + + /* Clean up if we have any pages left. Should never happen. */ + for (cur_page = 0; cur_page < max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + ntfs_error(vol->sb, "Still have pages left! " + "Terminating them with extreme " + "prejudice. Inode 0x%lx, page index " + "0x%lx.", ni->mft_no, page->index); + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (cur_page != xpage) + page_cache_release(page); + pages[cur_page] = NULL; + } + } + + /* We no longer need the list of pages. */ + kfree(pages); + + /* If we have completed the requested page, we return success. */ + if (likely(xpage_done)) + return 0; + + ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? + "EOVERFLOW" : (!err ? "EIO" : "unknown error")); + return err < 0 ? err : -EIO; + +read_err: + ntfs_error(vol->sb, "IO error while reading compressed data."); + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + goto err_out; + +map_rl_err: + ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " + "compression block."); + goto err_out; + +rl_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. 
Cannot read " + "compression block."); + goto err_out; + +getblk_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); + +err_out: + kfree(bhs); + for (i = cur_page; i < max_page; i++) { + page = pages[i]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (i != xpage) + page_cache_release(page); + } + } + kfree(pages); + return -EIO; +} diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c new file mode 100644 index 00000000..807150e2 --- /dev/null +++ b/fs/ntfs/debug.c @@ -0,0 +1,183 @@ +/* + * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "debug.h" + +/* + * A static buffer to hold the error string being displayed and a spinlock + * to protect concurrent accesses to it. + */ +static char err_buf[1024]; +static DEFINE_SPINLOCK(err_buf_lock); + +/** + * __ntfs_warning - output a warning to the syslog + * @function: name of function outputting the warning + * @sb: super block of mounted ntfs filesystem + * @fmt: warning string containing format specifications + * @...: a variable number of arguments specified in @fmt + * + * Outputs a warning to the syslog for the mounted ntfs filesystem described + * by @sb. + * + * @fmt and the corresponding @... is printf style format string containing + * the warning string and the corresponding format arguments, respectively. + * + * @function is the name of the function from which __ntfs_warning is being + * called. + * + * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead + * as this provides the @function parameter automatically. + */ +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...) +{ + va_list args; + int flen = 0; + +#ifndef DEBUG + if (!printk_ratelimit()) + return; +#endif + if (function) + flen = strlen(function); + spin_lock(&err_buf_lock); + va_start(args, fmt); + vsnprintf(err_buf, sizeof(err_buf), fmt, args); + va_end(args); + if (sb) + printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n", + sb->s_id, flen ? function : "", err_buf); + else + printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", + flen ? function : "", err_buf); + spin_unlock(&err_buf_lock); +} + +/** + * __ntfs_error - output an error to the syslog + * @function: name of function outputting the error + * @sb: super block of mounted ntfs filesystem + * @fmt: error string containing format specifications + * @...: a variable number of arguments specified in @fmt + * + * Outputs an error to the syslog for the mounted ntfs filesystem described + * by @sb. + * + * @fmt and the corresponding @... 
is printf style format string containing + * the error string and the corresponding format arguments, respectively. + * + * @function is the name of the function from which __ntfs_error is being + * called. + * + * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead + * as this provides the @function parameter automatically. + */ +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...) +{ + va_list args; + int flen = 0; + +#ifndef DEBUG + if (!printk_ratelimit()) + return; +#endif + if (function) + flen = strlen(function); + spin_lock(&err_buf_lock); + va_start(args, fmt); + vsnprintf(err_buf, sizeof(err_buf), fmt, args); + va_end(args); + if (sb) + printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n", + sb->s_id, flen ? function : "", err_buf); + else + printk(KERN_ERR "NTFS-fs error: %s(): %s\n", + flen ? function : "", err_buf); + spin_unlock(&err_buf_lock); +} + +#ifdef DEBUG + +/* If 1, output debug messages, and if 0, don't. */ +int debug_msgs = 0; + +void __ntfs_debug (const char *file, int line, const char *function, + const char *fmt, ...) +{ + va_list args; + int flen = 0; + + if (!debug_msgs) + return; + if (function) + flen = strlen(function); + spin_lock(&err_buf_lock); + va_start(args, fmt); + vsnprintf(err_buf, sizeof(err_buf), fmt, args); + va_end(args); + printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line, + flen ? function : "", err_buf); + spin_unlock(&err_buf_lock); +} + +/* Dump a runlist. Caller has to provide synchronisation for @rl. */ +void ntfs_debug_dump_runlist(const runlist_element *rl) +{ + int i; + const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED", + "LCN_ENOENT ", "LCN_unknown " }; + + if (!debug_msgs) + return; + printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n"); + if (!rl) { + printk(KERN_DEBUG "Run list not present.\n"); + return; + } + printk(KERN_DEBUG "VCN LCN Run length\n"); + for (i = 0; ; i++) { + LCN lcn = (rl + i)->lcn; + + if (lcn < (LCN)0) { + int index = -lcn - 1; + + if (index > -LCN_ENOENT - 1) + index = 3; + printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", + (long long)(rl + i)->vcn, lcn_str[index], + (long long)(rl + i)->length, + (rl + i)->length ? "" : + " (runlist end)"); + } else + printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", + (long long)(rl + i)->vcn, + (long long)(rl + i)->lcn, + (long long)(rl + i)->length, + (rl + i)->length ? "" : + " (runlist end)"); + if (!(rl + i)->length) + break; + } +} + +#endif diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h new file mode 100644 index 00000000..2142b1c6 --- /dev/null +++ b/fs/ntfs/debug.h @@ -0,0 +1,63 @@ +/* + * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
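Both logging helpers above format into a single static buffer, so the spinlock is what keeps two concurrent callers from interleaving their messages, while the printk_ratelimit() check merely drops excess output in non-debug builds. A user-space analogue of the locked-shared-buffer part of that pattern, with a pthread mutex standing in for the spinlock; all names here are invented for the sketch.

#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>

static char err_buf[1024];
static pthread_mutex_t err_buf_lock = PTHREAD_MUTEX_INITIALIZER;

/* All callers share err_buf, so it is formatted and printed under one lock. */
static void my_warning(const char *function, const char *fmt, ...)
{
	va_list args;

	pthread_mutex_lock(&err_buf_lock);
	va_start(args, fmt);
	vsnprintf(err_buf, sizeof(err_buf), fmt, args);
	va_end(args);
	fprintf(stderr, "fs warning: %s(): %s\n", function, err_buf);
	pthread_mutex_unlock(&err_buf_lock);
}

int main(void)
{
	my_warning(__func__, "falling back to read-only mount (error %d)", -5);
	return 0;
}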
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_DEBUG_H +#define _LINUX_NTFS_DEBUG_H + +#include + +#include "runlist.h" + +#ifdef DEBUG + +extern int debug_msgs; + +extern void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...) __attribute__ ((format (printf, 4, 5))); +/** + * ntfs_debug - write a debug level message to syslog + * @f: a printf format string containing the message + * @...: the variables to substitute into @f + * + * ntfs_debug() writes a DEBUG level message to the syslog but only if the + * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP. + */ +#define ntfs_debug(f, a...) \ + __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) + +extern void ntfs_debug_dump_runlist(const runlist_element *rl); + +#else /* !DEBUG */ + +#define ntfs_debug(f, a...) do {} while (0) +#define ntfs_debug_dump_runlist(rl) do {} while (0) + +#endif /* !DEBUG */ + +extern void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) + +extern void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) + +#endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c new file mode 100644 index 00000000..0f48e7c5 --- /dev/null +++ b/fs/ntfs/dir.c @@ -0,0 +1,1575 @@ +/** + * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include + +#include "dir.h" +#include "aops.h" +#include "attrib.h" +#include "mft.h" +#include "debug.h" +#include "ntfs.h" + +/** + * The little endian Unicode string $I30 as a global constant. + */ +ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), + cpu_to_le16('3'), cpu_to_le16('0'), 0 }; + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * @res: return the found file name if necessary (see below) + * + * Look for an inode with name @uname in the directory with inode @dir_ni. 
+ * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). + * + * Note, @uname_len does not include the (optional) terminating NULL character. + * + * Note, we look for a case sensitive match first but we also look for a case + * insensitive match at the same time. If we find a case insensitive match, we + * save that for the case that we don't find an exact match, where we return + * the case insensitive match and setup @res (which we allocate!) with the mft + * reference, the file name type, length and with a copy of the little endian + * Unicode file name itself. If we match a file name which is in the DOS name + * space, we only return the mft reference and file name type in @res. + * ntfs_lookup() then uses this to find the long file name in the inode itself. + * This is to avoid polluting the dcache with short file names. We want them to + * work but we don't care for how quickly one can access them. This also fixes + * the dcache aliasing issues. + * + * Locking: - Caller must hold i_mutex on the directory. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len, ntfs_name **res) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + ntfs_name *name = NULL; + + BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode)); + BUG_ON(NInoAttr(dir_ni)); + /* Get hold of the mft record for the directory. */ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. 
*/ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. + * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. + * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). + */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 1. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." 
+ "sourceforge.net."); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node and if not present return -ENOENT, unless + * we have got a matching name cached in name in which case return the + * mft reference associated with it. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + if (name) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + /* Bounds checks. 
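Descending into a child node above means reading the VCN stored in the last eight bytes of the index entry and turning it into a page-cache page index plus an offset inside that page. A minimal sketch of that conversion, assuming 512-byte index VCNs and 4 KiB pages purely for illustration.

#include <stdio.h>

int main(void)
{
	unsigned int vcn_size_bits = 9;	/* 512-byte index vcns (example value) */
	unsigned int page_shift = 12;	/* 4 KiB pages */
	long long vcn = 11;		/* child node VCN taken from the entry */

	unsigned long page_index = (unsigned long)(vcn << vcn_size_bits >> page_shift);
	unsigned long page_ofs = (unsigned long)(vcn << vcn_size_bits) &
			((1UL << page_shift) - 1);

	/* Prints: index block at page 1, offset 0x600 */
	printf("index block at page %lu, offset 0x%lx\n", page_index, page_ofs);
	return 0;
}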
*/ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. + * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it2: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. 
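Every index allocation block is treated as untrusted until it passes the checks above. A condensed user-space sketch of the same validations against a heavily simplified header layout; the real INDEX_ALLOCATION layout differs, and 0x18 is the header overhead used in the size check above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct idx_block_hdr {			/* simplified, not the on-disk layout */
	char	 magic[4];		/* "INDX" for a valid block */
	uint64_t block_vcn;		/* VCN this block claims to be */
	uint32_t index_length;		/* bytes of entries in use */
	uint32_t allocated_size;	/* bytes available for entries */
};

static bool index_block_ok(const struct idx_block_hdr *ia,
		uint64_t expected_vcn, uint32_t dir_block_size)
{
	if (memcmp(ia->magic, "INDX", 4))
		return false;			/* bad magic / fixup damage */
	if (ia->block_vcn != expected_vcn)
		return false;			/* wrong block was read */
	if (ia->allocated_size + 0x18 != dir_block_size)
		return false;			/* size disagrees with the directory */
	if (ia->index_length > ia->allocated_size)
		return false;			/* entries would overflow the block */
	return true;
}

int main(void)
{
	struct idx_block_hdr ia = {
		.magic = { 'I', 'N', 'D', 'X' },
		.block_vcn = 11,
		.index_length = 0x100,
		.allocated_size = 0x1000 - 0x18,
	};

	printf("%s\n", index_block_ok(&ia, 11, 0x1000) ? "looks valid" : "corrupt");
	return 0;
}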
+ * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). + */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 2. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." + "sourceforge.net."); + unlock_page(page); + ntfs_unmap_page(page); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node. 
+ */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_CACHE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* + * No child node present, return -ENOENT, unless we have got a matching + * name cached in name in which case return the mft reference + * associated with it. + */ + if (name) { + unlock_page(page); + ntfs_unmap_page(page); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + if (name) { + kfree(name); + *res = NULL; + } + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#if 0 + +// TODO: (AIA) +// The algorithm embedded in this code will be required for the time when we +// want to support adding of entries to directories, where we require correct +// collation of file names in order not to cause corruption of the filesystem. + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * + * Look for an inode with name @uname in the directory with inode @dir_ni. + * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). + * + * Note, @uname_len does not include the (optional) terminating NULL character. + */ +u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + IGNORE_CASE_BOOL ic; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + + /* Get hold of the mft record for the directory. 
*/ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive and not otherwise. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking. We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it: + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. 
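Because the entries in a node are kept in collation order, the loops above never look past the first entry that the wanted name collates before: at that point the name either does not exist in this node or lives in that entry's child subtree. The same control flow in a stand-alone sketch, with strcmp() standing in for ntfs_collate_names() and invented entry names.

#include <stdio.h>
#include <string.h>

/*
 * Scan one sorted node.  Returns the matching slot, or -1 - slot for the
 * slot whose child subtree has to be searched next, mirroring the break-out
 * on rc == -1 in the loops above.
 */
static int search_node(const char *names[], int nr, const char *want)
{
	int i;

	for (i = 0; i < nr; i++) {
		int rc = strcmp(want, names[i]);

		if (rc == 0)
			return i;	/* perfect match */
		if (rc < 0)
			break;		/* want collates before this entry: descend */
		/* rc > 0: keep scanning this node */
	}
	return -1 - i;
}

int main(void)
{
	const char *node[] = { "alpha", "kappa", "omega" };

	/* Prints 1 (match) and -2 (descend into the child of entry 1). */
	printf("%d %d\n", search_node(node, 3, "kappa"),
			search_node(node, 3, "beta"));
	return 0;
}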
+ */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + /* No child node, return -ENOENT. */ + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! 
This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive and not otherwise. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking. We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it2: + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. 
Check for + * the presence of a child node. + */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_CACHE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* No child node, return -ENOENT. */ + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#endif + +/** + * ntfs_filldir - ntfs specific filldir method + * @vol: current ntfs volume + * @fpos: position in the directory + * @ndir: ntfs inode of current directory + * @ia_page: page in which the index allocation buffer @ie is in resides + * @ie: current index entry + * @name: buffer to use for the converted name + * @dirent: vfs filldir callback context + * @filldir: vfs filldir callback + * + * Convert the Unicode @name to the loaded NLS and pass it to the @filldir + * callback. + * + * If @ia_page is not NULL it is the locked page containing the index + * allocation block containing the index entry @ie. + * + * Note, we drop (and then reacquire) the page lock on @ia_page across the + * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup + * since ntfs_lookup() will lock the same page. As an optimization, we do not + * retake the lock if we are returning a non-zero value as ntfs_readdir() + * would need to drop the lock immediately anyway. 
+ */ +static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, + ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, + u8 *name, void *dirent, filldir_t filldir) +{ + unsigned long mref; + int name_len, rc; + unsigned dt_type; + FILE_NAME_TYPE_FLAGS name_type; + + name_type = ie->key.file_name.file_name_type; + if (name_type == FILE_NAME_DOS) { + ntfs_debug("Skipping DOS name space entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { + ntfs_debug("Skipping root directory self reference entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && + !NVolShowSystemFiles(vol)) { + ntfs_debug("Skipping system file."); + return 0; + } + name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, &name, + NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); + if (name_len <= 0) { + ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", + (long long)MREF_LE(ie->data.dir.indexed_file)); + return 0; + } + if (ie->key.file_name.file_attributes & + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) + dt_type = DT_DIR; + else + dt_type = DT_REG; + mref = MREF_LE(ie->data.dir.indexed_file); + /* + * Drop the page lock otherwise we deadlock with NFS when it calls + * ->lookup since ntfs_lookup() will lock the same page. + */ + if (ia_page) + unlock_page(ia_page); + ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " + "0x%lx, DT_%s.", name, name_len, fpos, mref, + dt_type == DT_DIR ? "DIR" : "REG"); + rc = filldir(dirent, name, name_len, fpos, mref, dt_type); + /* Relock the page but not if we are aborting ->readdir. */ + if (!rc && ia_page) + lock_page(ia_page); + return rc; +} + +/* + * We use the same basic approach as the old NTFS driver, i.e. we parse the + * index root entries and then the index allocation entries that are marked + * as in use in the index bitmap. + * + * While this will return the names in random order this doesn't matter for + * ->readdir but OTOH results in a faster ->readdir. + * + * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS + * parts (e.g. ->f_pos and ->i_size, and it also protects against directory + * modifications). + * + * Locking: - Caller must hold i_mutex on the directory. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; + loff_t fpos, i_size; + struct inode *bmp_vi, *vdir = filp->f_path.dentry->d_inode; + struct super_block *sb = vdir->i_sb; + ntfs_inode *ndir = NTFS_I(vdir); + ntfs_volume *vol = NTFS_SB(sb); + MFT_RECORD *m; + INDEX_ROOT *ir = NULL; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *name = NULL; + int rc, err, ir_pos, cur_bmp_pos; + struct address_space *ia_mapping, *bmp_mapping; + struct page *bmp_page = NULL, *ia_page = NULL; + u8 *kaddr, *bmp, *index_end; + ntfs_attr_search_ctx *ctx; + + fpos = filp->f_pos; + ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", + vdir->i_ino, fpos); + rc = err = 0; + /* Are we at end of dir yet? */ + i_size = i_size_read(vdir); + if (fpos >= i_size + vol->mft_record_size) + goto done; + /* Emulate . and .. for all directories. 
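
/*
 * [Editorial aside: illustrative sketch, not part of the driver source.]
 * ntfs_readdir() encodes its directory position as follows: 0 and 1 are the
 * emulated "." and ".." entries, positions below the mft record size are
 * byte offsets into the INDEX_ROOT value, and larger positions are byte
 * offsets into the INDEX_ALLOCATION attribute shifted up by the mft record
 * size.  The helper below only names the region a given position falls
 * into; the 1024-byte mft record size is an assumption for the sketch (it
 * is a per-volume value in reality).
 */
#include <stdint.h>

#define SKETCH_MFT_RECORD_SIZE	1024u	/* assumed for illustration */

static const char *readdir_pos_region(uint64_t fpos, uint64_t ia_size)
{
	if (fpos == 0)
		return ". entry";
	if (fpos == 1)
		return ".. entry";
	if (fpos < SKETCH_MFT_RECORD_SIZE)
		return "offset fpos into the index root";
	if (fpos < ia_size + SKETCH_MFT_RECORD_SIZE)
		return "offset (fpos - mft_record_size) into the index allocation";
	return "end of directory";
}
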
*/ + if (!fpos) { + ntfs_debug("Calling filldir for . with len 1, fpos 0x0, " + "inode 0x%lx, DT_DIR.", vdir->i_ino); + rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR); + if (rc) + goto done; + fpos++; + } + if (fpos == 1) { + ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, " + "inode 0x%lx, DT_DIR.", + (unsigned long)parent_ino(filp->f_path.dentry)); + rc = filldir(dirent, "..", 2, fpos, + parent_ino(filp->f_path.dentry), DT_DIR); + if (rc) + goto done; + fpos++; + } + m = NULL; + ctx = NULL; + /* + * Allocate a buffer to store the current name being processed + * converted to format determined by current NLS. + */ + name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); + if (unlikely(!name)) { + err = -ENOMEM; + goto err_out; + } + /* Are we jumping straight into the index allocation attribute? */ + if (fpos >= vol->mft_record_size) + goto skip_index_root; + /* Get hold of the mft record for the directory. */ + m = map_mft_record(ndir); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ndir, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Get the offset into the index root attribute. */ + ir_pos = (s64)fpos; + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_error(sb, "Index root attribute missing in directory " + "inode 0x%lx.", vdir->i_ino); + goto err_out; + } + /* + * Copy the index root attribute value to a buffer so that we can put + * the search context and unmap the mft record before calling the + * filldir() callback. We need to do this because of NFSd which calls + * ->lookup() from its filldir callback() and this causes NTFS to + * deadlock as ntfs_lookup() maps the mft record of the directory and + * we have got it mapped here already. The only solution is for us to + * unmap the mft record here so that a call to ntfs_lookup() is able to + * map the mft record without deadlocking. + */ + rc = le32_to_cpu(ctx->attr->data.resident.value_length); + ir = kmalloc(rc, GFP_NOFS); + if (unlikely(!ir)) { + err = -ENOMEM; + goto err_out; + } + /* Copy the index root value (it has been verified in read_inode). */ + memcpy(ir, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), rc); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ndir); + ctx = NULL; + m = NULL; + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir); + /* Bounds checks. */ + if (unlikely((u8*)ie < (u8*)ir || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index root entry if continuing previous readdir. */ + if (ir_pos > (u8*)ie - (u8*)ir) + continue; + /* Advance the position even if going to skip the entry. */ + fpos = (u8*)ie - (u8*)ir; + /* Submit the name to the filldir callback. 
*/ + rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent, + filldir); + if (rc) { + kfree(ir); + goto abort; + } + } + /* We are done with the index root and can free the buffer. */ + kfree(ir); + ir = NULL; + /* If there is no index allocation attribute we are finished. */ + if (!NInoIndexAllocPresent(ndir)) + goto EOD; + /* Advance fpos to the beginning of the index allocation. */ + fpos = vol->mft_record_size; +skip_index_root: + kaddr = NULL; + prev_ia_pos = -1LL; + /* Get the offset into the index allocation attribute. */ + ia_pos = (s64)fpos - vol->mft_record_size; + ia_mapping = vdir->i_mapping; + ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); + bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); + if (IS_ERR(bmp_vi)) { + ntfs_error(sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bmp_vi); + goto err_out; + } + bmp_mapping = bmp_vi->i_mapping; + /* Get the starting bitmap bit position and sanity check it. */ + bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; + if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { + ntfs_error(sb, "Current index allocation position exceeds " + "index bitmap size."); + goto iput_err_out; + } + /* Get the starting bit position in the current bitmap page. */ + cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1); + bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1); +get_next_bmp_page: + ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", + (unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT), + (unsigned long long)bmp_pos & + (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1)); + bmp_page = ntfs_map_page(bmp_mapping, + bmp_pos >> (3 + PAGE_CACHE_SHIFT)); + if (IS_ERR(bmp_page)) { + ntfs_error(sb, "Reading index bitmap failed."); + err = PTR_ERR(bmp_page); + bmp_page = NULL; + goto iput_err_out; + } + bmp = (u8*)page_address(bmp_page); + /* Find next index block in use. */ + while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) { +find_next_index_buffer: + cur_bmp_pos++; + /* + * If we have reached the end of the bitmap page, get the next + * page, and put away the old one. + */ + if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) { + ntfs_unmap_page(bmp_page); + bmp_pos += PAGE_CACHE_SIZE * 8; + cur_bmp_pos = 0; + goto get_next_bmp_page; + } + /* If we have reached the end of the bitmap, we are done. */ + if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) + goto unm_EOD; + ia_pos = (bmp_pos + cur_bmp_pos) << + ndir->itype.index.block_size_bits; + } + ntfs_debug("Handling index buffer 0x%llx.", + (unsigned long long)bmp_pos + cur_bmp_pos); + /* If the current index buffer is in the same page we reuse the page. */ + if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) != + (ia_pos & (s64)PAGE_CACHE_MASK)) { + prev_ia_pos = ia_pos; + if (likely(ia_page != NULL)) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + /* + * Map the page cache page containing the current ia_pos, + * reading it from disk if necessary. + */ + ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR(ia_page)) { + ntfs_error(sb, "Reading index allocation data failed."); + err = PTR_ERR(ia_page); + ia_page = NULL; + goto err_out; + } + lock_page(ia_page); + kaddr = (u8*)page_address(ia_page); + } + /* Get the current index buffer. */ + ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK & + ~(s64)(ndir->itype.index.block_size - 1))); + /* Bounds checks. */ + if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) { + ntfs_error(sb, "Out of bounds check failed. 
Corrupt directory " + "inode 0x%lx or driver bug.", vdir->i_ino); + goto err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos & + ~(s64)(ndir->itype.index.block_size - 1)) >> + ndir->itype.index.vcn_size_bits)) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug. ", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 != + ndir->itype.index.block_size)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino, + le32_to_cpu(ia->index.allocated_size) + 0x18, + ndir->itype.index.block_size); + goto err_out; + } + index_end = (u8*)ia + ndir->itype.index.block_size; + if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1); + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + /* The first index entry in this index buffer. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index allocation, offset 0x%llx.", + (unsigned long long)ia_start + + (unsigned long long)((u8*)ie - (u8*)ia)); + /* Bounds checks. */ + if (unlikely((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index block entry if continuing previous readdir. */ + if (ia_pos - ia_start > (u8*)ie - (u8*)ia) + continue; + /* Advance the position even if going to skip the entry. */ + fpos = (u8*)ie - (u8*)ia + + (sle64_to_cpu(ia->index_block_vcn) << + ndir->itype.index.vcn_size_bits) + + vol->mft_record_size; + /* + * Submit the name to the @filldir callback. Note, + * ntfs_filldir() drops the lock on @ia_page but it retakes it + * before returning, unless a non-zero value is returned in + * which case the page is left unlocked. 
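
/*
 * [Editorial aside: illustrative sketch, not part of the driver source.]
 * The deadlock avoidance described above boils down to: release the lock
 * before invoking a callback that may re-enter code taking the same lock,
 * and re-acquire it only if the callback asks us to keep going.  Sketched
 * here with a pthread mutex standing in for the page lock and an arbitrary
 * callback standing in for filldir.
 */
#include <pthread.h>

static int emit_locked_entry(pthread_mutex_t *lock,
			     int (*callback)(void *ctx), void *ctx)
{
	int rc;

	pthread_mutex_unlock(lock);	/* callback may re-enter and take it */
	rc = callback(ctx);
	if (!rc)			/* keep going: retake the lock */
		pthread_mutex_lock(lock);
	return rc;			/* non-zero: abort, lock stays dropped */
}
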
+ */ + rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent, + filldir); + if (rc) { + /* @ia_page is already unlocked in this case. */ + ntfs_unmap_page(ia_page); + ntfs_unmap_page(bmp_page); + iput(bmp_vi); + goto abort; + } + } + goto find_next_index_buffer; +unm_EOD: + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + ntfs_unmap_page(bmp_page); + iput(bmp_vi); +EOD: + /* We are finished, set fpos to EOD. */ + fpos = i_size + vol->mft_record_size; +abort: + kfree(name); +done: +#ifdef DEBUG + if (!rc) + ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos); + else + ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.", + rc, fpos); +#endif + filp->f_pos = fpos; + return 0; +err_out: + if (bmp_page) { + ntfs_unmap_page(bmp_page); +iput_err_out: + iput(bmp_vi); + } + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + kfree(ir); + kfree(name); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ndir); + if (!err) + err = -EIO; + ntfs_debug("Failed. Returning error code %i.", -err); + filp->f_pos = fpos; + return err; +} + +/** + * ntfs_dir_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit directory size to the page cache limit on architectures where unsigned + * long is 32-bits. This is the most we can do for now without overflowing the + * page cache page index. Doing it this way means we don't run into problems + * because of existing too large directories. It would be better to allow the + * user to read the accessible part of the directory but I doubt very much + * anyone is going to hit this check on a 32-bit architecture, so there is no + * point in adding the extra complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + */ +static int ntfs_dir_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EFBIG; + } + return 0; +} + +#ifdef NTFS_RW + +/** + * ntfs_dir_fsync - sync a directory to disk + * @filp: directory to be synced + * @dentry: dentry describing the directory to sync + * @datasync: if non-zero only flush user data and not metadata + * + * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and + * msync system calls. This function is based on file.c::ntfs_file_fsync(). + * + * Write the mft record and all associated extent mft records as well as the + * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device. + * + * If @datasync is true, we do not wait on the inode(s) to be written out + * but we always wait on the page cache pages to be written out. + * + * Note: In the past @filp could be NULL so we ignore it as we don't need it + * anyway. + * + * Locking: Caller must hold i_mutex on the inode. + * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. We do write the $BITMAP attribute if it is present + * which is the important one for a directory so things are not too bad. + */ +static int ntfs_dir_fsync(struct file *filp, int datasync) +{ + struct inode *bmp_vi, *vi = filp->f_mapping->host; + int err, ret; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + BUG_ON(!S_ISDIR(vi->i_mode)); + /* If the bitmap attribute inode is in memory sync it, too. 
*/ + na.mft_no = vi->i_ino; + na.type = AT_BITMAP; + na.name = I30; + na.name_len = 4; + bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na); + if (bmp_vi) { + write_inode_now(bmp_vi, !datasync); + iput(bmp_vi); + } + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? "data" : "", vi->i_ino, -ret); + return ret; +} + +#endif /* NTFS_RW */ + +const struct file_operations ntfs_dir_ops = { + .llseek = generic_file_llseek, /* Seek inside directory. */ + .read = generic_read_dir, /* Return -EISDIR. */ + .readdir = ntfs_readdir, /* Read directory contents. */ +#ifdef NTFS_RW + .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ + /*.aio_fsync = ,*/ /* Sync all outstanding async + i/o operations on a kiocb. */ +#endif /* NTFS_RW */ + /*.ioctl = ,*/ /* Perform function on the + mounted filesystem. */ + .open = ntfs_dir_open, /* Open directory. */ +}; diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h new file mode 100644 index 00000000..aea7582d --- /dev/null +++ b/fs/ntfs/dir.h @@ -0,0 +1,48 @@ +/* + * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2002-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_DIR_H +#define _LINUX_NTFS_DIR_H + +#include "layout.h" +#include "inode.h" +#include "types.h" + +/* + * ntfs_name is used to return the file name to the caller of + * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup()) + * to be able to deal with dcache aliasing issues. + */ +typedef struct { + MFT_REF mref; + FILE_NAME_TYPE_FLAGS type; + u8 len; + ntfschar name[0]; +} __attribute__ ((__packed__)) ntfs_name; + +/* The little endian Unicode string $I30 as a global constant. */ +extern ntfschar I30[5]; + +extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, + const ntfschar *uname, const int uname_len, ntfs_name **res); + +#endif /* _LINUX_NTFS_FS_DIR_H */ diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h new file mode 100644 index 00000000..927b5bf0 --- /dev/null +++ b/fs/ntfs/endian.h @@ -0,0 +1,93 @@ +/* + * endian.h - Defines for endianness handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. 
+ * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_ENDIAN_H +#define _LINUX_NTFS_ENDIAN_H + +#include +#include "types.h" + +/* + * Signed endianness conversion functions. + */ + +static inline s16 sle16_to_cpu(sle16 x) +{ + return le16_to_cpu((__force le16)x); +} + +static inline s32 sle32_to_cpu(sle32 x) +{ + return le32_to_cpu((__force le32)x); +} + +static inline s64 sle64_to_cpu(sle64 x) +{ + return le64_to_cpu((__force le64)x); +} + +static inline s16 sle16_to_cpup(sle16 *x) +{ + return le16_to_cpu(*(__force le16*)x); +} + +static inline s32 sle32_to_cpup(sle32 *x) +{ + return le32_to_cpu(*(__force le32*)x); +} + +static inline s64 sle64_to_cpup(sle64 *x) +{ + return le64_to_cpu(*(__force le64*)x); +} + +static inline sle16 cpu_to_sle16(s16 x) +{ + return (__force sle16)cpu_to_le16(x); +} + +static inline sle32 cpu_to_sle32(s32 x) +{ + return (__force sle32)cpu_to_le32(x); +} + +static inline sle64 cpu_to_sle64(s64 x) +{ + return (__force sle64)cpu_to_le64(x); +} + +static inline sle16 cpu_to_sle16p(s16 *x) +{ + return (__force sle16)cpu_to_le16(*x); +} + +static inline sle32 cpu_to_sle32p(s32 *x) +{ + return (__force sle32)cpu_to_le32(*x); +} + +static inline sle64 cpu_to_sle64p(s64 *x) +{ + return (__force sle64)cpu_to_le64(*x); +} + +#endif /* _LINUX_NTFS_ENDIAN_H */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c new file mode 100644 index 00000000..f4b1057a --- /dev/null +++ b/fs/ntfs/file.c @@ -0,0 +1,2227 @@ +/* + * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "attrib.h" +#include "bitmap.h" +#include "inode.h" +#include "debug.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_file_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit file size to the page cache limit on architectures where unsigned long + * is 32-bits. This is the most we can do for now without overflowing the page + * cache page index. Doing it this way means we don't run into problems because + * of existing too large files. It would be better to allow the user to read + * the beginning of the file but I doubt very much anyone is going to hit this + * check on a 32-bit architecture, so there is no point in adding the extra + * complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + * + * After the check passes, just call generic_file_open() to do its work. + */ +static int ntfs_file_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EOVERFLOW; + } + return generic_file_open(vi, filp); +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_extend_initialized - extend the initialized size of an attribute + * @ni: ntfs inode of the attribute to extend + * @new_init_size: requested new initialized size in bytes + * @cached_page: store any allocated but unused page here + * @lru_pvec: lru-buffering pagevec of the caller + * + * Extend the initialized size of an attribute described by the ntfs inode @ni + * to @new_init_size bytes. This involves zeroing any non-sparse space between + * the old initialized size and @new_init_size both in the page cache and on + * disk (if relevant complete pages are already uptodate in the page cache then + * these are simply marked dirty). + * + * As a side-effect, the file size (vfs inode->i_size) may be incremented as, + * in the resident attribute case, it is tied to the initialized size and, in + * the non-resident attribute case, it may not fall below the initialized size. + * + * Note that if the attribute is resident, we do not need to touch the page + * cache at all. This is because if the page cache page is not uptodate we + * bring it uptodate later, when doing the write to the mft record since we + * then already have the page mapped. And if the page is uptodate, the + * non-initialized region will already have been zeroed when the page was + * brought uptodate and the region may in fact already have been overwritten + * with new data via mmap() based writes, so we cannot just zero it. And since + * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped + * is unspecified, we choose not to do zeroing and thus we do not need to touch + * the page at all. For a more detailed explanation see ntfs_truncate() in + * fs/ntfs/inode.c. + * + * Return 0 on success and -errno on error. 
In the case that an error is + * encountered it is possible that the initialized size will already have been + * incremented some way towards @new_init_size but it is guaranteed that if + * this is the case, the necessary zeroing will also have happened and that all + * metadata is self-consistent. + * + * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be + * held by the caller. + */ +static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) +{ + s64 old_init_size; + loff_t old_i_size; + pgoff_t index, end_index; + unsigned long flags; + struct inode *vi = VFS_I(ni); + ntfs_inode *base_ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *kattr; + int err; + u32 attr_len; + + read_lock_irqsave(&ni->size_lock, flags); + old_init_size = ni->initialized_size; + old_i_size = i_size_read(vi); + BUG_ON(new_init_size > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_initialized_size 0x%llx, " + "new_initialized_size 0x%llx, i_size 0x%llx.", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + (unsigned long long)old_init_size, + (unsigned long long)new_init_size, old_i_size); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Use goto to reduce indentation and we need the label below anyway. */ + if (NInoNonResident(ni)) + goto do_non_resident_extend; + BUG_ON(old_init_size != old_i_size); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + BUG_ON(old_i_size != (loff_t)attr_len); + /* + * Do the zeroing in the mft record and update the attribute size in + * the mft record. + */ + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + memset(kattr + attr_len, 0, new_init_size - attr_len); + a->data.resident.value_length = cpu_to_le32((u32)new_init_size); + /* Finally, update the sizes in the vfs and ntfs inodes. */ + write_lock_irqsave(&ni->size_lock, flags); + i_size_write(vi, new_init_size); + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto done; +do_non_resident_extend: + /* + * If the new initialized size @new_init_size exceeds the current file + * size (vfs inode->i_size), we need to extend the file size to the + * new initialized size. 
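
/*
 * [Editorial aside: illustrative sketch, not part of the driver source.]
 * A toy model of the sizes discussed above.  Per the surrounding comments,
 * the initialized size never exceeds the allocated size, the data size
 * (i_size) never falls below the initialized size, and every byte between
 * the initialized size and the data size must read back as zero.  Extending
 * the initialized size therefore means zeroing the gap and, if necessary,
 * pushing the data size up with it.
 */
#include <stdint.h>
#include <string.h>

struct toy_attr {
	uint8_t	data[4096];		/* stands in for the attribute data */
	size_t	allocated_size;
	size_t	data_size;		/* what i_size_read() would report */
	size_t	initialized_size;
};

static int toy_extend_initialized(struct toy_attr *a, size_t new_init)
{
	if (new_init > a->allocated_size)
		return -1;			/* never beyond allocated size */
	memset(a->data + a->initialized_size, 0,
	       new_init - a->initialized_size);	/* zero the gap */
	a->initialized_size = new_init;
	if (a->data_size < new_init)
		a->data_size = new_init;	/* i_size >= initialized size */
	return 0;
}
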
+ */ + if (new_init_size > old_i_size) { + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + BUG_ON(old_i_size != (loff_t) + sle64_to_cpu(a->data.non_resident.data_size)); + a->data.non_resident.data_size = cpu_to_sle64(new_init_size); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* Update the file size in the vfs inode. */ + i_size_write(vi, new_init_size); + ntfs_attr_put_search_ctx(ctx); + ctx = NULL; + unmap_mft_record(base_ni); + m = NULL; + } + mapping = vi->i_mapping; + index = old_init_size >> PAGE_CACHE_SHIFT; + end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + do { + /* + * Read the page. If the page is not present, this will zero + * the uninitialized regions for us. + */ + page = read_mapping_page(mapping, index, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto init_err_out; + } + if (unlikely(PageError(page))) { + page_cache_release(page); + err = -EIO; + goto init_err_out; + } + /* + * Update the initialized size in the ntfs inode. This is + * enough to make ntfs_writepage() work. + */ + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT; + if (ni->initialized_size > new_init_size) + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Set the page dirty so it gets written out. */ + set_page_dirty(page); + page_cache_release(page); + /* + * Play nice with the vm and the rest of the system. This is + * very much needed as we can potentially be modifying the + * initialised size from a very small value to a really huge + * value, e.g. + * f = open(somefile, O_TRUNC); + * truncate(f, 10GiB); + * seek(f, 10GiB); + * write(f, 1); + * And this would mean we would be marking dirty hundreds of + * thousands of pages or as in the above example more than + * two and a half million pages! + * + * TODO: For sparse pages could optimize this workload by using + * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This + * would be set in readpage for sparse pages and here we would + * not need to mark dirty any pages which have this bit set. + * The only caveat is that we have to clear the bit everywhere + * where we allocate any clusters that lie in the page or that + * contain the page. + * + * TODO: An even greater optimization would be for us to only + * call readpage() on pages which are not in sparse regions as + * determined from the runlist. This would greatly reduce the + * number of pages we read and make dirty in the case of sparse + * files. + */ + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (++index < end_index); + read_lock_irqsave(&ni->size_lock, flags); + BUG_ON(ni->initialized_size != new_init_size); + read_unlock_irqrestore(&ni->size_lock, flags); + /* Now bring in sync the initialized_size in the mft record. 
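
/*
 * [Editorial aside: illustrative sketch, not part of the driver source.]
 * The workload sketched in the comment further up, written out as a
 * runnable userspace program: a one-byte write far past the current end of
 * file forces the filesystem to bring the initialized size forward over the
 * whole gap, dirtying a huge number of pages.  File name and size are
 * arbitrary examples.
 */
#define _FILE_OFFSET_BITS 64
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	off_t far = 10LL << 30;			/* 10 GiB */
	int fd = open("somefile", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	ftruncate(fd, far);			/* extend i_size, still sparse */
	lseek(fd, far, SEEK_SET);
	write(fd, "x", 1);			/* extends the initialized size */
	close(fd);
	return 0;
}
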
*/ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto init_err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto init_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto init_err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); +done: + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", + (unsigned long long)new_init_size, i_size_read(vi)); + return 0; +init_err_out: + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = old_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Failed. Returning error code %i.", err); + return err; +} + +/** + * ntfs_fault_in_pages_readable - + * + * Fault a number of userspace pages into pagetables. + * + * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes + * with more than two userspace pages as well as handling the single page case + * elegantly. + * + * If you find this difficult to understand, then think of the while loop being + * the following code, except that we do without the integer variable ret: + * + * do { + * ret = __get_user(c, uaddr); + * uaddr += PAGE_SIZE; + * } while (!ret && uaddr < end); + * + * Note, the final __get_user() may well run out-of-bounds of the user buffer, + * but _not_ out-of-bounds of the page the user buffer belongs to, and since + * this is only a read and not a write, and since it is still in the same page, + * it should not matter and this makes the code much simpler. + */ +static inline void ntfs_fault_in_pages_readable(const char __user *uaddr, + int bytes) +{ + const char __user *end; + volatile char c; + + /* Set @end to the first byte outside the last page we care about. */ + end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes); + + while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end)) + ; +} + +/** + * ntfs_fault_in_pages_readable_iovec - + * + * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs. + */ +static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, + size_t iov_ofs, int bytes) +{ + do { + const char __user *buf; + unsigned len; + + buf = iov->iov_base + iov_ofs; + len = iov->iov_len - iov_ofs; + if (len > bytes) + len = bytes; + ntfs_fault_in_pages_readable(buf, len); + bytes -= len; + iov++; + iov_ofs = 0; + } while (bytes); +} + +/** + * __ntfs_grab_cache_pages - obtain a number of locked pages + * @mapping: address space mapping from which to obtain page cache pages + * @index: starting index in @mapping at which to begin obtaining pages + * @nr_pages: number of page cache pages to obtain + * @pages: array of pages in which to return the obtained page cache pages + * @cached_page: allocated but as yet unused page + * @lru_pvec: lru-buffering pagevec of caller + * + * Obtain @nr_pages locked page cache pages from the mapping @mapping and + * starting at index @index. + * + * If a page is newly created, add it to lru list + * + * Note, the page locks are obtained in ascending page index order. 
+ */ +static inline int __ntfs_grab_cache_pages(struct address_space *mapping, + pgoff_t index, const unsigned nr_pages, struct page **pages, + struct page **cached_page) +{ + int err, nr; + + BUG_ON(!nr_pages); + err = nr = 0; + do { + pages[nr] = find_lock_page(mapping, index); + if (!pages[nr]) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (unlikely(!*cached_page)) { + err = -ENOMEM; + goto err_out; + } + } + err = add_to_page_cache_lru(*cached_page, mapping, index, + GFP_KERNEL); + if (unlikely(err)) { + if (err == -EEXIST) + continue; + goto err_out; + } + pages[nr] = *cached_page; + *cached_page = NULL; + } + index++; + nr++; + } while (nr < nr_pages); +out: + return err; +err_out: + while (nr > 0) { + unlock_page(pages[--nr]); + page_cache_release(pages[nr]); + } + goto out; +} + +static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) +{ + lock_buffer(bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + return submit_bh(READ, bh); +} + +/** + * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * This is called for non-resident attributes from ntfs_file_buffered_write() + * with i_mutex held on the inode (@pages[0]->mapping->host). There are + * @nr_pages pages in @pages which are locked but not kmap()ped. The source + * data has not yet been copied into the @pages. + * + * Need to fill any holes with actual clusters, allocate buffers if necessary, + * ensure all the buffers are mapped, and bring uptodate any buffers that are + * only partially being written to. + * + * If @nr_pages is greater than one, we are guaranteed that the cluster size is + * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside + * the same cluster and that they are the entirety of that cluster, and that + * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. + * + * i_size is not to be modified yet. + * + * Return 0 on success or -errno on error. 
+ */ +static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, + unsigned nr_pages, s64 pos, size_t bytes) +{ + VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; + LCN lcn; + s64 bh_pos, vcn_len, end, initialized_size; + sector_t lcn_block; + struct page *page; + struct inode *vi; + ntfs_inode *ni, *base_ni = NULL; + ntfs_volume *vol; + runlist_element *rl, *rl2; + struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + ATTR_RECORD *a = NULL; + unsigned long flags; + u32 attr_rec_len = 0; + unsigned blocksize, u; + int err, mp_size; + bool rl_write_locked, was_hole, is_retry; + unsigned char blocksize_bits; + struct { + u8 runlist_merged:1; + u8 mft_attr_mapped:1; + u8 mp_rebuilt:1; + u8 attr_switched:1; + } status = { 0, 0, 0, 0 }; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + BUG_ON(!*pages); + vi = pages[0]->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, pages[0]->index, nr_pages, + (long long)pos, bytes); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + u = 0; + do { + page = pages[u]; + BUG_ON(!page); + /* + * create_empty_buffers() will create uptodate/dirty buffers if + * the page is uptodate/dirty. + */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, 0); + if (unlikely(!page_has_buffers(page))) + return -ENOMEM; + } + } while (++u < nr_pages); + rl_write_locked = false; + rl = NULL; + err = 0; + vcn = lcn = -1; + vcn_len = 0; + lcn_block = -1; + was_hole = false; + cpos = pos >> vol->cluster_size_bits; + end = pos + bytes; + cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; + /* + * Loop over each page and for each page over each buffer. Use goto to + * reduce indentation. + */ + u = 0; +do_next_page: + page = pages[u]; + bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; + bh = head = page_buffers(page); + do { + VCN cdelta; + s64 bh_end; + unsigned bh_cofs; + + /* Clear buffer_new on all buffers to reinitialise state. */ + if (buffer_new(bh)) + clear_buffer_new(bh); + bh_end = bh_pos + blocksize; + bh_cpos = bh_pos >> vol->cluster_size_bits; + bh_cofs = bh_pos & vol->cluster_size_mask; + if (buffer_mapped(bh)) { + /* + * The buffer is already mapped. If it is uptodate, + * ignore it. + */ + if (buffer_uptodate(bh)) + continue; + /* + * The buffer is not uptodate. If the page is uptodate + * set the buffer uptodate and otherwise ignore it. + */ + if (PageUptodate(page)) { + set_buffer_uptodate(bh); + continue; + } + /* + * Neither the page nor the buffer are uptodate. If + * the buffer is only partially being written to, we + * need to read it in before the write, i.e. now. + */ + if ((bh_pos < pos && bh_end > pos) || + (bh_pos < end && bh_end > end)) { + /* + * If the buffer is fully or partially within + * the initialized size, do an actual read. + * Otherwise, simply zero the buffer. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* Unmapped buffer. Need to map it. 
*/ + bh->b_bdev = vol->sb->s_bdev; + /* + * If the current buffer is in the same clusters as the map + * cache, there is no need to check the runlist again. The + * map cache is made up of @vcn, which is the first cached file + * cluster, @vcn_len which is the number of cached file + * clusters, @lcn is the device cluster corresponding to @vcn, + * and @lcn_block is the block number corresponding to @lcn. + */ + cdelta = bh_cpos - vcn; + if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { +map_buffer_cached: + BUG_ON(lcn < 0); + bh->b_blocknr = lcn_block + + (cdelta << (vol->cluster_size_bits - + blocksize_bits)) + + (bh_cofs >> blocksize_bits); + set_buffer_mapped(bh); + /* + * If the page is uptodate so is the buffer. If the + * buffer is fully outside the write, we ignore it if + * it was already allocated and we mark it dirty so it + * gets written out if we allocated it. On the other + * hand, if we allocated the buffer but we are not + * marking it dirty we set buffer_new so we can do + * error recovery. + */ + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (unlikely(was_hole)) { + /* We allocated the buffer. */ + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (bh_end <= pos || bh_pos >= end) + mark_buffer_dirty(bh); + else + set_buffer_new(bh); + } + continue; + } + /* Page is _not_ uptodate. */ + if (likely(!was_hole)) { + /* + * Buffer was already allocated. If it is not + * uptodate and is only partially being written + * to, we need to read it in before the write, + * i.e. now. + */ + if (!buffer_uptodate(bh) && bh_pos < end && + bh_end > pos && + (bh_pos < pos || + bh_end > end)) { + /* + * If the buffer is fully or partially + * within the initialized size, do an + * actual read. Otherwise, simply zero + * the buffer. + */ + read_lock_irqsave(&ni->size_lock, + flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, + flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* We allocated the buffer. */ + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + /* + * If the buffer is fully outside the write, zero it, + * set it uptodate, and mark it dirty so it gets + * written out. If it is partially being written to, + * zero region surrounding the write but leave it to + * commit write to do anything else. Finally, if the + * buffer is fully being overwritten, do nothing. + */ + if (bh_end <= pos || bh_pos >= end) { + if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + mark_buffer_dirty(bh); + continue; + } + set_buffer_new(bh); + if (!buffer_uptodate(bh) && + (bh_pos < pos || bh_end > end)) { + u8 *kaddr; + unsigned pofs; + + kaddr = kmap_atomic(page, KM_USER0); + if (bh_pos < pos) { + pofs = bh_pos & ~PAGE_CACHE_MASK; + memset(kaddr + pofs, 0, pos - bh_pos); + } + if (bh_end > end) { + pofs = end & ~PAGE_CACHE_MASK; + memset(kaddr + pofs, 0, bh_end - end); + } + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); + } + continue; + } + /* + * Slow path: this is the first buffer in the cluster. If it + * is outside allocated size and is not uptodate, zero it and + * set it uptodate. 
+ */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos > initialized_size) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), blocksize); + set_buffer_uptodate(bh); + } + continue; + } + is_retry = false; + if (!rl) { + down_read(&ni->runlist.lock); +retry_remap: + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target cluster. */ + while (rl->length && rl[1].vcn <= bh_cpos) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); + if (likely(lcn >= 0)) { + /* + * Successful remap, setup the map cache and + * use that to deal with the buffer. + */ + was_hole = false; + vcn = bh_cpos; + vcn_len = rl[1].vcn - vcn; + lcn_block = lcn << (vol->cluster_size_bits - + blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters touched + * by the write is smaller or equal to the + * number of cached clusters, unlock the + * runlist as the map cache will be used from + * now on. + */ + if (likely(vcn + vcn_len >= cend)) { + if (rl_write_locked) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else + up_read(&ni->runlist.lock); + rl = NULL; + } + goto map_buffer_cached; + } + } else + lcn = LCN_RL_NOT_MAPPED; + /* + * If it is not a hole and not out of bounds, the runlist is + * probably unmapped so try to map it now. + */ + if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { + if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { + /* Attempt to map runlist. */ + if (!rl_write_locked) { + /* + * We need the runlist locked for + * writing, so if it is locked for + * reading relock it now and retry in + * case it changed whilst we dropped + * the lock. + */ + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + err = ntfs_map_runlist_nolock(ni, bh_cpos, + NULL); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + /* + * If @vcn is out of bounds, pretend @lcn is + * LCN_ENOENT. As long as the buffer is out + * of bounds this will work fine. + */ + if (err == -ENOENT) { + lcn = LCN_ENOENT; + err = 0; + goto rl_not_mapped_enoent; + } + } else + err = -EIO; + /* Failed to map the buffer, even after retrying. */ + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "vcn offset 0x%x, because its " + "location on disk could not be " + "determined%s (error code %i).", + ni->mft_no, ni->type, + (unsigned long long)bh_cpos, + (unsigned)bh_pos & + vol->cluster_size_mask, + is_retry ? " even after retrying" : "", + err); + break; + } +rl_not_mapped_enoent: + /* + * The buffer is in a hole or out of bounds. We need to fill + * the hole, unless the buffer is in a cluster which is not + * touched by the write, in which case we just leave the buffer + * unmapped. This can only happen when the cluster size is + * less than the page cache size. + */ + if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) { + bh_cend = (bh_end + vol->cluster_size - 1) >> + vol->cluster_size_bits; + if ((bh_cend <= cpos || bh_cpos >= cend)) { + bh->b_blocknr = -1; + /* + * If the buffer is uptodate we skip it. If it + * is not but the page is uptodate, we can set + * the buffer uptodate. If the page is not + * uptodate, we can clear the buffer and set it + * uptodate. Whether this is worthwhile is + * debatable and this could be removed. 
+ */ + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + continue; + } + } + /* + * Out of bounds buffer is invalid if it was not really out of + * bounds. + */ + BUG_ON(lcn != LCN_HOLE); + /* + * We need the runlist locked for writing, so if it is locked + * for reading relock it now and retry in case it changed + * whilst we dropped the lock. + */ + BUG_ON(!rl); + if (!rl_write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + /* Find the previous last allocated cluster. */ + BUG_ON(rl->lcn != LCN_HOLE); + lcn = -1; + rl2 = rl; + while (--rl2 >= ni->runlist.rl) { + if (rl2->lcn >= 0) { + lcn = rl2->lcn + rl2->length; + break; + } + } + rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, + false); + if (IS_ERR(rl2)) { + err = PTR_ERR(rl2); + ntfs_debug("Failed to allocate cluster, error code %i.", + err); + break; + } + lcn = rl2->lcn; + rl = ntfs_runlists_merge(ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (err != -ENOMEM) + err = -EIO; + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + ntfs_free(rl2); + break; + } + ni->runlist.rl = rl; + status.runlist_merged = 1; + ntfs_debug("Allocated cluster, lcn 0x%llx.", + (unsigned long long)lcn); + /* Map and lock the mft record and get the attribute record. */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + break; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + unmap_mft_record(base_ni); + break; + } + status.mft_attr_mapped = 1; + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + break; + } + m = ctx->mrec; + a = ctx->attr; + /* + * Find the runlist element with which the attribute extent + * starts. Note, we cannot use the _attr_ version because we + * have mapped the mft record. That is ok because we know the + * runlist fragment must be mapped already to have ever gotten + * here, so we can just use the _rl_ version. + */ + vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); + rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); + BUG_ON(!rl2); + BUG_ON(!rl2->length); + BUG_ON(rl2->lcn < LCN_HOLE); + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + /* + * If @highest_vcn is zero, calculate the real highest_vcn + * (which can really be zero). + */ + if (!highest_vcn) + highest_vcn = (sle64_to_cpu( + a->data.non_resident.allocated_size) >> + vol->cluster_size_bits) - 1; + /* + * Determine the size of the mapping pairs array for the new + * extent, i.e. the old extent with the hole filled. + */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, + highest_vcn); + if (unlikely(mp_size <= 0)) { + if (!(err = mp_size)) + err = -EIO; + ntfs_debug("Failed to get size for mapping pairs " + "array, error code %i.", err); + break; + } + /* + * Resize the attribute record to fit the new mapping pairs + * array. 
+ */ + attr_rec_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)); + if (unlikely(err)) { + BUG_ON(err != -ENOSPC); + // TODO: Deal with this by using the current attribute + // and fill it with as much of the mapping pairs + // array as possible. Then loop over each attribute + // extent rewriting the mapping pairs arrays as we go + // along and if when we reach the end we have not + // enough space, try to resize the last attribute + // extent and if even that fails, add a new attribute + // extent. + // We could also try to resize at each step in the hope + // that we will not need to rewrite every single extent. + // Note, we may need to decompress some extents to fill + // the runlist as we are walking the extents... + ntfs_error(vol->sb, "Not enough space in the mft " + "record for the extended attribute " + "record. This case is not " + "implemented yet."); + err = -EOPNOTSUPP; + break ; + } + status.mp_rebuilt = 1; + /* + * Generate the mapping pairs array directly into the attribute + * record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, vcn, highest_vcn, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " + "attribute type 0x%x, because building " + "the mapping pairs failed with error " + "code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + break; + } + /* Update the highest_vcn but only if it was not set. */ + if (unlikely(!a->data.non_resident.highest_vcn)) + a->data.non_resident.highest_vcn = + cpu_to_sle64(highest_vcn); + /* + * If the attribute is sparse/compressed, update the compressed + * size in the ntfs_inode structure and the attribute record. + */ + if (likely(NInoSparse(ni) || NInoCompressed(ni))) { + /* + * If we are not in the first attribute extent, switch + * to it, but first ensure the changes will make it to + * disk later. + */ + if (a->data.non_resident.lowest_vcn) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, + ni->name_len, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + status.attr_switched = 1; + break; + } + /* @m is not used any more so do not set it. */ + a = ctx->attr; + } + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + a->data.non_resident.compressed_size = + cpu_to_sle64(ni->itype.compressed.size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + /* Successfully filled the hole. */ + status.runlist_merged = 0; + status.mft_attr_mapped = 0; + status.mp_rebuilt = 0; + /* Setup the map cache and use that to deal with the buffer. */ + was_hole = true; + vcn = bh_cpos; + vcn_len = 1; + lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters in the @pages is smaller + * or equal to the number of cached clusters, unlock the + * runlist as the map cache will be used from now on. 
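+	 *
+	 * Concretely, the cached mapping set up just above covers the single
+	 * cluster [vcn, vcn + 1) (vcn_len is 1 here), so once
+	 * vcn + vcn_len >= cend every cluster still to be written lies
+	 * within the cache and the runlist lock can be dropped.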
+ */ + if (likely(vcn + vcn_len >= cend)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + rl = NULL; + } + goto map_buffer_cached; + } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); + /* If there are no errors, do the next page. */ + if (likely(!err && ++u < nr_pages)) + goto do_next_page; + /* If there are no errors, release the runlist lock if we took it. */ + if (likely(!err)) { + if (unlikely(rl_write_locked)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else if (unlikely(rl)) + up_read(&ni->runlist.lock); + rl = NULL; + } + /* If we issued read requests, let them complete. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + while (wait_bh > wait) { + bh = *--wait_bh; + wait_on_buffer(bh); + if (likely(buffer_uptodate(bh))) { + page = bh->b_page; + bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); + /* + * If the buffer overflows the initialized size, need + * to zero the overflowing region. + */ + if (unlikely(bh_pos + blocksize > initialized_size)) { + int ofs = 0; + + if (likely(bh_pos < initialized_size)) + ofs = initialized_size - bh_pos; + zero_user_segment(page, bh_offset(bh) + ofs, + blocksize); + } + } else /* if (unlikely(!buffer_uptodate(bh))) */ + err = -EIO; + } + if (likely(!err)) { + /* Clear buffer_new on all buffers. */ + u = 0; + do { + bh = head = page_buffers(pages[u]); + do { + if (buffer_new(bh)) + clear_buffer_new(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u < nr_pages); + ntfs_debug("Done."); + return err; + } + if (status.attr_switched) { + /* Get back to the attribute extent we modified. */ + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find required " + "attribute extent of attribute in " + "error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* + * The only thing that is now wrong is the compressed + * size of the base attribute extent which chkdsk + * should be able to fix. + */ + NVolSetErrors(vol); + } else { + m = ctx->mrec; + a = ctx->attr; + status.attr_switched = 0; + } + } + /* + * If the runlist has been modified, need to restore it by punching a + * hole into it and we then need to deallocate the on-disk cluster as + * well. Note, we only modify the runlist if we are able to generate a + * new mapping pairs array, i.e. only when the mapped attribute extent + * is not switched. + */ + if (status.runlist_merged && !status.attr_switched) { + BUG_ON(!rl_write_locked); + /* Make the file cluster we allocated sparse in the runlist. */ + if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { + ntfs_error(vol->sb, "Failed to punch hole into " + "attribute runlist in error code " + "path. Run chkdsk to recover the " + "lost cluster."); + NVolSetErrors(vol); + } else /* if (success) */ { + status.runlist_merged = 0; + /* + * Deallocate the on-disk cluster we allocated but only + * if we succeeded in punching its vcn out of the + * runlist. + */ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. 
Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + } + } + /* + * Resize the attribute record to its old size and rebuild the mapping + * pairs array. Note, we only can do this if the runlist has been + * restored to its old state which also implies that the mapped + * attribute extent is not switched. + */ + if (status.mp_rebuilt && !status.runlist_merged) { + if (ntfs_attr_record_resize(m, a, attr_rec_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), attr_rec_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), ni->runlist.rl, + vcn, highest_vcn, NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } + /* Release the mft record and the attribute. */ + if (status.mft_attr_mapped) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } + /* Release the runlist lock. */ + if (rl_write_locked) + up_write(&ni->runlist.lock); + else if (rl) + up_read(&ni->runlist.lock); + /* + * Zero out any newly allocated blocks to avoid exposing stale data. + * If BH_New is set, we know that the block was newly allocated above + * and that it has not been fully zeroed and marked dirty yet. + */ + nr_pages = u; + u = 0; + end = bh_cpos << vol->cluster_size_bits; + do { + page = pages[u]; + bh = head = page_buffers(page); + do { + if (u == nr_pages && + ((s64)page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh) >= end) + break; + if (!buffer_new(bh)) + continue; + clear_buffer_new(bh); + if (!buffer_uptodate(bh)) { + if (PageUptodate(page)) + set_buffer_uptodate(bh); + else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + mark_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u <= nr_pages); + ntfs_error(vol->sb, "Failed. Returning error code %i.", err); + return err; +} + +/* + * Copy as much as we can into the pages and return the number of bytes which + * were successfully copied. If a fault is encountered then clear the pages + * out to (ofs + bytes) and return the number of bytes which were copied. + */ +static inline size_t ntfs_copy_from_user(struct page **pages, + unsigned nr_pages, unsigned ofs, const char __user *buf, + size_t bytes) +{ + struct page **last_page = pages + nr_pages; + char *addr; + size_t total = 0; + unsigned len; + int left; + + do { + len = PAGE_CACHE_SIZE - ofs; + if (len > bytes) + len = bytes; + addr = kmap_atomic(*pages, KM_USER0); + left = __copy_from_user_inatomic(addr + ofs, buf, len); + kunmap_atomic(addr, KM_USER0); + if (unlikely(left)) { + /* Do it the slow way. */ + addr = kmap(*pages); + left = __copy_from_user(addr + ofs, buf, len); + kunmap(*pages); + if (unlikely(left)) + goto err_out; + } + total += len; + bytes -= len; + if (!bytes) + break; + buf += len; + ofs = 0; + } while (++pages < last_page); +out: + return total; +err_out: + total += len - left; + /* Zero the rest of the target like __copy_from_user(). 
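+ *
+ * To spell out the fault handling below: the faulting page keeps the
+ * len - left bytes that did copy, every later page in @pages is zeroed in
+ * full (up to the remaining byte count), and the returned total still only
+ * counts bytes actually copied from user space.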
*/ + while (++pages < last_page) { + bytes -= len; + if (!bytes) + break; + len = PAGE_CACHE_SIZE; + if (len > bytes) + len = bytes; + zero_user(*pages, 0, len); + } + goto out; +} + +static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr, + const struct iovec *iov, size_t iov_ofs, size_t bytes) +{ + size_t total = 0; + + while (1) { + const char __user *buf = iov->iov_base + iov_ofs; + unsigned len; + size_t left; + + len = iov->iov_len - iov_ofs; + if (len > bytes) + len = bytes; + left = __copy_from_user_inatomic(vaddr, buf, len); + total += len; + bytes -= len; + vaddr += len; + if (unlikely(left)) { + total -= left; + break; + } + if (!bytes) + break; + iov++; + iov_ofs = 0; + } + return total; +} + +static inline void ntfs_set_next_iovec(const struct iovec **iovp, + size_t *iov_ofsp, size_t bytes) +{ + const struct iovec *iov = *iovp; + size_t iov_ofs = *iov_ofsp; + + while (bytes) { + unsigned len; + + len = iov->iov_len - iov_ofs; + if (len > bytes) + len = bytes; + bytes -= len; + iov_ofs += len; + if (iov->iov_len == iov_ofs) { + iov++; + iov_ofs = 0; + } + } + *iovp = iov; + *iov_ofsp = iov_ofs; +} + +/* + * This has the same side-effects and return value as ntfs_copy_from_user(). + * The difference is that on a fault we need to memset the remainder of the + * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s + * single-segment behaviour. + * + * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when + * atomic and when not atomic. This is ok because it calls + * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In + * fact, the only difference between __copy_from_user_inatomic() and + * __copy_from_user() is that the latter calls might_sleep() and the former + * should not zero the tail of the buffer on error. And on many architectures + * __copy_from_user_inatomic() is just defined to __copy_from_user() so it + * makes no difference at all on those architectures. + */ +static inline size_t ntfs_copy_from_user_iovec(struct page **pages, + unsigned nr_pages, unsigned ofs, const struct iovec **iov, + size_t *iov_ofs, size_t bytes) +{ + struct page **last_page = pages + nr_pages; + char *addr; + size_t copied, len, total = 0; + + do { + len = PAGE_CACHE_SIZE - ofs; + if (len > bytes) + len = bytes; + addr = kmap_atomic(*pages, KM_USER0); + copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, + *iov, *iov_ofs, len); + kunmap_atomic(addr, KM_USER0); + if (unlikely(copied != len)) { + /* Do it the slow way. */ + addr = kmap(*pages); + copied = __ntfs_copy_from_user_iovec_inatomic(addr + + ofs, *iov, *iov_ofs, len); + if (unlikely(copied != len)) + goto err_out; + kunmap(*pages); + } + total += len; + ntfs_set_next_iovec(iov, iov_ofs, len); + bytes -= len; + if (!bytes) + break; + ofs = 0; + } while (++pages < last_page); +out: + return total; +err_out: + BUG_ON(copied > len); + /* Zero the rest of the target like __copy_from_user(). 
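+ *
+ * Unlike the single-segment variant above, the tail of the faulting page is
+ * cleared here as well (the memset below), and the iovec cursor is advanced
+ * only by the bytes that really copied, so the caller's position stays
+ * consistent with the returned total.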
*/ + memset(addr + ofs + copied, 0, len - copied); + kunmap(*pages); + total += copied; + ntfs_set_next_iovec(iov, iov_ofs, copied); + while (++pages < last_page) { + bytes -= len; + if (!bytes) + break; + len = PAGE_CACHE_SIZE; + if (len > bytes) + len = bytes; + zero_user(*pages, 0, len); + } + goto out; +} + +static inline void ntfs_flush_dcache_pages(struct page **pages, + unsigned nr_pages) +{ + BUG_ON(!nr_pages); + /* + * Warning: Do not do the decrement at the same time as the call to + * flush_dcache_page() because it is a NULL macro on i386 and hence the + * decrement never happens so the loop never terminates. + */ + do { + --nr_pages; + flush_dcache_page(pages[nr_pages]); + } while (nr_pages > 0); +} + +/** + * ntfs_commit_pages_after_non_resident_write - commit the received data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * See description of ntfs_commit_pages_after_write(), below. + */ +static inline int ntfs_commit_pages_after_non_resident_write( + struct page **pages, const unsigned nr_pages, + s64 pos, size_t bytes) +{ + s64 end, initialized_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + struct buffer_head *bh, *head; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + unsigned long flags; + unsigned blocksize, u; + int err; + + vi = pages[0]->mapping->host; + ni = NTFS_I(vi); + blocksize = vi->i_sb->s_blocksize; + end = pos + bytes; + u = 0; + do { + s64 bh_pos; + struct page *page; + bool partial; + + page = pages[u]; + bh_pos = (s64)page->index << PAGE_CACHE_SHIFT; + bh = head = page_buffers(page); + partial = false; + do { + s64 bh_end; + + bh_end = bh_pos + blocksize; + if (bh_end <= pos || bh_pos >= end) { + if (!buffer_uptodate(bh)) + partial = true; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); + /* + * If all buffers are now uptodate but the page is not, set the + * page uptodate. + */ + if (!partial && !PageUptodate(page)) + SetPageUptodate(page); + } while (++u < nr_pages); + /* + * Finally, if we do not need to update initialized_size or i_size we + * are finished. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end <= initialized_size) { + ntfs_debug("Done."); + return 0; + } + /* + * Update initialized_size/i_size as appropriate, both in the inode and + * the mft record. + */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. 
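+	 * (For attribute inodes the attribute to update is reached via the
+	 * base inode, so it is base_ni, chosen just above, whose mft record
+	 * is mapped and searched here.)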
*/ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + BUG_ON(!NInoNonResident(ni)); + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + BUG_ON(!a->non_resident); + write_lock_irqsave(&ni->size_lock, flags); + BUG_ON(end > ni->allocated_size); + ni->initialized_size = end; + a->data.non_resident.initialized_size = cpu_to_sle64(end); + if (end > i_size_read(vi)) { + i_size_write(vi, end); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size; + } + write_unlock_irqrestore(&ni->size_lock, flags); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + ntfs_debug("Done."); + return 0; +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " + "code %i).", err); + if (err != -ENOMEM) + NVolSetErrors(ni->vol); + return err; +} + +/** + * ntfs_commit_pages_after_write - commit the received data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * This is called from ntfs_file_buffered_write() with i_mutex held on the inode + * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are + * locked but not kmap()ped. The source data has already been copied into the + * @page. ntfs_prepare_pages_for_non_resident_write() has been called before + * the data was copied (for non-resident attributes only) and it returned + * success. + * + * Need to set uptodate and mark dirty all buffers within the boundary of the + * write. If all buffers in a page are uptodate we set the page uptodate, too. + * + * Setting the buffers dirty ensures that they get written out later when + * ntfs_writepage() is invoked by the VM. + * + * Finally, we need to update i_size and initialized_size as appropriate both + * in the inode and the mft record. + * + * This is modelled after fs/buffer.c::generic_commit_write(), which marks + * buffers uptodate and dirty, sets the page uptodate if all buffers in the + * page are uptodate, and updates i_size if the end of io is beyond i_size. In + * that case, it also marks the inode dirty. + * + * If things have gone as outlined in + * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page + * content modifications here for non-resident attributes. For resident + * attributes we need to do the uptodate bringing here which we combine with + * the copying into the mft record which means we save one atomic kmap. + * + * Return 0 on success or -errno on error. 
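+ *
+ * For orientation, a rough sketch of how the buffered write path below
+ * drives this function (names as used in ntfs_file_buffered_write(); error
+ * handling and the iovec variants omitted):
+ *
+ *	__ntfs_grab_cache_pages(mapping, start_idx, do_pages, pages,
+ *			&cached_page);
+ *	if (NInoNonResident(ni))
+ *		ntfs_prepare_pages_for_non_resident_write(pages, do_pages,
+ *				pos, bytes);
+ *	copied = ntfs_copy_from_user(pages + u, do_pages - u, ofs, buf,
+ *			bytes);
+ *	ntfs_flush_dcache_pages(pages + u, do_pages - u);
+ *	ntfs_commit_pages_after_write(pages, do_pages, pos, bytes);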
+ */ +static int ntfs_commit_pages_after_write(struct page **pages, + const unsigned nr_pages, s64 pos, size_t bytes) +{ + s64 end, initialized_size; + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + struct page *page; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + char *kattr, *kaddr; + unsigned long flags; + u32 attr_len; + int err; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + page = pages[0]; + BUG_ON(!page); + vi = page->mapping->host; + ni = NTFS_I(vi); + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, page->index, nr_pages, + (long long)pos, bytes); + if (NInoNonResident(ni)) + return ntfs_commit_pages_after_non_resident_write(pages, + nr_pages, pos, bytes); + BUG_ON(nr_pages > 1); + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * sparse. + */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + BUG_ON(NInoNonResident(ni)); + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + i_size = i_size_read(vi); + BUG_ON(attr_len != i_size); + BUG_ON(pos > attr_len); + end = pos + bytes; + BUG_ON(end > le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset)); + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + kaddr = kmap_atomic(page, KM_USER0); + /* Copy the received data from the page to the mft record. */ + memcpy(kattr + pos, kaddr + pos, bytes); + /* Update the attribute length if necessary. */ + if (end > attr_len) { + attr_len = end; + a->data.resident.value_length = cpu_to_le32(attr_len); + } + /* + * If the page is not uptodate, bring the out of bounds area(s) + * uptodate by copying data from the mft record to the page. + */ + if (!PageUptodate(page)) { + if (pos > 0) + memcpy(kaddr, kattr, pos); + if (end < attr_len) + memcpy(kaddr + end, kattr + end, attr_len - end); + /* Zero the region outside the end of the attribute value. */ + memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); + flush_dcache_page(page); + SetPageUptodate(page); + } + kunmap_atomic(kaddr, KM_USER0); + /* Update initialized_size/i_size if necessary. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + BUG_ON(end > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + BUG_ON(initialized_size != i_size); + if (end > initialized_size) { + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = end; + i_size_write(vi, end); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Mark the mft record dirty, so it gets written back. 
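+	 * (For a resident attribute this is what actually persists the
+	 * write: the user data was copied into the attribute value inside
+	 * the mft record above, so it is the mft record page, not the data
+	 * page, whose dirtying gets the data to disk.)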
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + ntfs_debug("Done."); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory required to " + "commit the write."); + if (PageUptodate(page)) { + ntfs_warning(vi->i_sb, "Page is uptodate, setting " + "dirty so the write will be retried " + "later on by the VM."); + /* + * Put the page on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + __set_page_dirty_nobuffers(page); + err = 0; + } else + ntfs_error(vi->i_sb, "Page is not uptodate. Written " + "data has been lost."); + } else { + ntfs_error(vi->i_sb, "Resident attribute commit write failed " + "with error %i.", err); + NVolSetErrors(ni->vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +/** + * ntfs_file_buffered_write - + * + * Locking: The vfs is holding ->i_mutex on the inode. + */ +static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, + loff_t pos, loff_t *ppos, size_t count) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *vi = mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; + struct page *cached_page = NULL; + char __user *buf = NULL; + s64 end, ll; + VCN last_vcn; + LCN lcn; + unsigned long flags; + size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */ + ssize_t status, written; + unsigned nr_pages; + int err; + + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "pos 0x%llx, count 0x%lx.", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + (unsigned long long)pos, (unsigned long)count); + if (unlikely(!count)) + return 0; + BUG_ON(NInoMstProtected(ni)); + /* + * If the attribute is not an index root and it is encrypted or + * compressed, we cannot write to it yet. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + /* + * Reminder for later: Encrypted files are _always_ + * non-resident so that the content can always be + * encrypted. + */ + ntfs_debug("Denying write access to encrypted file."); + return -EACCES; + } + if (NInoCompressed(ni)) { + /* Only unnamed $DATA attribute can be compressed. */ + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + /* + * Reminder for later: If resident, the data is not + * actually compressed. Only on the switch to non- + * resident does compression kick in. This is in + * contrast to encrypted files (see above). + */ + ntfs_error(vi->i_sb, "Writing to compressed files is " + "not implemented yet. Sorry."); + return -EOPNOTSUPP; + } + } + /* + * If a previous ntfs_truncate() failed, repeat it and abort if it + * fails again. + */ + if (unlikely(NInoTruncateFailed(ni))) { + down_write(&vi->i_alloc_sem); + err = ntfs_truncate(vi); + up_write(&vi->i_alloc_sem); + if (err || NInoTruncateFailed(ni)) { + if (!err) + err = -EIO; + ntfs_error(vol->sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "ntfs_truncate() failed (error code " + "%i).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + return err; + } + } + /* The first byte after the write. 
*/ + end = pos + count; + /* + * If the write goes beyond the allocated size, extend the allocation + * to cover the whole of the write, rounded up to the nearest cluster. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end > ll) { + /* Extend the allocation without changing the data size. */ + ll = ntfs_attr_extend_allocation(ni, end, -1, pos); + if (likely(ll >= 0)) { + BUG_ON(pos >= ll); + /* If the extension was partial truncate the write. */ + if (end > ll) { + ntfs_debug("Truncating write to inode 0x%lx, " + "attribute type 0x%x, because " + "the allocation was only " + "partially extended.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + end = ll; + count = ll - pos; + } + } else { + err = ll; + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* Perform a partial write if possible or fail. */ + if (pos < ll) { + ntfs_debug("Truncating write to inode 0x%lx, " + "attribute type 0x%x, because " + "extending the allocation " + "failed (error code %i).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), err); + end = ll; + count = ll - pos; + } else { + ntfs_error(vol->sb, "Cannot perform write to " + "inode 0x%lx, attribute type " + "0x%x, because extending the " + "allocation failed (error " + "code %i).", vi->i_ino, + (unsigned) + le32_to_cpu(ni->type), err); + return err; + } + } + } + written = 0; + /* + * If the write starts beyond the initialized size, extend it up to the + * beginning of the write and initialize all non-sparse space between + * the old initialized size and the new one. This automatically also + * increments the vfs inode->i_size to keep it above or equal to the + * initialized_size. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (pos > ll) { + err = ntfs_attr_extend_initialized(ni, pos); + if (err < 0) { + ntfs_error(vol->sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "extending the initialized size " + "failed (error code %i).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + status = err; + goto err_out; + } + } + /* + * Determine the number of pages per cluster for non-resident + * attributes. + */ + nr_pages = 1; + if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni)) + nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT; + /* Finally, perform the actual write. */ + last_vcn = -1; + if (likely(nr_segs == 1)) + buf = iov->iov_base; + do { + VCN vcn; + pgoff_t idx, start_idx; + unsigned ofs, do_pages, u; + size_t copied; + + start_idx = idx = pos >> PAGE_CACHE_SHIFT; + ofs = pos & ~PAGE_CACHE_MASK; + bytes = PAGE_CACHE_SIZE - ofs; + do_pages = 1; + if (nr_pages > 1) { + vcn = pos >> vol->cluster_size_bits; + if (vcn != last_vcn) { + last_vcn = vcn; + /* + * Get the lcn of the vcn the write is in. If + * it is a hole, need to lock down all pages in + * the cluster. 
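+			 *
+			 * Illustrative numbers: with 64kiB clusters and 4kiB
+			 * pages, nr_pages is 16 above, so a write landing in
+			 * a hole grabs and prepares all 16 pages of that
+			 * cluster (do_pages is set to nr_pages and start_idx
+			 * is rounded down to the cluster boundary below).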
+ */ + down_read(&ni->runlist.lock); + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> + vol->cluster_size_bits, false); + up_read(&ni->runlist.lock); + if (unlikely(lcn < LCN_HOLE)) { + status = -EIO; + if (lcn == LCN_ENOMEM) + status = -ENOMEM; + else + ntfs_error(vol->sb, "Cannot " + "perform write to " + "inode 0x%lx, " + "attribute type 0x%x, " + "because the attribute " + "is corrupt.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + break; + } + if (lcn == LCN_HOLE) { + start_idx = (pos & ~(s64) + vol->cluster_size_mask) + >> PAGE_CACHE_SHIFT; + bytes = vol->cluster_size - (pos & + vol->cluster_size_mask); + do_pages = nr_pages; + } + } + } + if (bytes > count) + bytes = count; + /* + * Bring in the user page(s) that we will copy from _first_. + * Otherwise there is a nasty deadlock on copying from the same + * page(s) as we are writing to, without it/them being marked + * up-to-date. Note, at present there is nothing to stop the + * pages being swapped out between us bringing them into memory + * and doing the actual copying. + */ + if (likely(nr_segs == 1)) + ntfs_fault_in_pages_readable(buf, bytes); + else + ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); + /* Get and lock @do_pages starting at index @start_idx. */ + status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, + pages, &cached_page); + if (unlikely(status)) + break; + /* + * For non-resident attributes, we need to fill any holes with + * actual clusters and ensure all bufferes are mapped. We also + * need to bring uptodate any buffers that are only partially + * being written to. + */ + if (NInoNonResident(ni)) { + status = ntfs_prepare_pages_for_non_resident_write( + pages, do_pages, pos, bytes); + if (unlikely(status)) { + loff_t i_size; + + do { + unlock_page(pages[--do_pages]); + page_cache_release(pages[do_pages]); + } while (do_pages); + /* + * The write preparation may have instantiated + * allocated space outside i_size. Trim this + * off again. We can ignore any errors in this + * case as we will just be waisting a bit of + * allocated space, which is not a disaster. + */ + i_size = i_size_read(vi); + if (pos + bytes > i_size) + vmtruncate(vi, i_size); + break; + } + } + u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index; + if (likely(nr_segs == 1)) { + copied = ntfs_copy_from_user(pages + u, do_pages - u, + ofs, buf, bytes); + buf += copied; + } else + copied = ntfs_copy_from_user_iovec(pages + u, + do_pages - u, ofs, &iov, &iov_ofs, + bytes); + ntfs_flush_dcache_pages(pages + u, do_pages - u); + status = ntfs_commit_pages_after_write(pages, do_pages, pos, + bytes); + if (likely(!status)) { + written += copied; + count -= copied; + pos += copied; + if (unlikely(copied != bytes)) + status = -EFAULT; + } + do { + unlock_page(pages[--do_pages]); + mark_page_accessed(pages[do_pages]); + page_cache_release(pages[do_pages]); + } while (do_pages); + if (unlikely(status)) + break; + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (count); +err_out: + *ppos = pos; + if (cached_page) + page_cache_release(cached_page); + ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", + written ? "written" : "status", (unsigned long)written, + (long)status); + return written ? 
written : status; +} + +/** + * ntfs_file_aio_write_nolock - + */ +static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + loff_t pos; + size_t count; /* after file limit checks */ + ssize_t written, err; + + count = 0; + err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (err) + return err; + pos = *ppos; + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + /* We can write back this queue in page reclaim. */ + current->backing_dev_info = mapping->backing_dev_info; + written = 0; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + if (!count) + goto out; + err = file_remove_suid(file); + if (err) + goto out; + file_update_time(file); + written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos, + count); +out: + current->backing_dev_info = NULL; + return written ? written : err; +} + +/** + * ntfs_file_aio_write - + */ +static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + mutex_lock(&inode->i_mutex); + ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + if (ret > 0) { + int err = generic_write_sync(file, pos, ret); + if (err < 0) + ret = err; + } + return ret; +} + +/** + * ntfs_file_fsync - sync a file to disk + * @filp: file to be synced + * @datasync: if non-zero only flush user data and not metadata + * + * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync + * system calls. This function is inspired by fs/buffer.c::file_fsync(). + * + * If @datasync is false, write the mft record and all associated extent mft + * records as well as the $DATA attribute and then sync the block device. + * + * If @datasync is true and the attribute is non-resident, we skip the writing + * of the mft record and all associated extent mft records (this might still + * happen due to the write_inode_now() call). + * + * Also, if @datasync is true, we do not wait on the inode to be written out + * but we always wait on the page cache pages to be written out. + * + * Locking: Caller must hold i_mutex on the inode. + * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. + */ +static int ntfs_file_fsync(struct file *filp, int datasync) +{ + struct inode *vi = filp->f_mapping->host; + int err, ret = 0; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + BUG_ON(S_ISDIR(vi->i_mode)); + if (!datasync || !NInoNonResident(NTFS_I(vi))) + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + /* + * NOTE: If we were to use mapping->private_list (see ext2 and + * fs/buffer.c) for dirty blocks then we could optimize the below to be + * sync_mapping_buffers(vi->i_mapping). + */ + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? 
"data" : "", vi->i_ino, -ret); + return ret; +} + +#endif /* NTFS_RW */ + +const struct file_operations ntfs_file_ops = { + .llseek = generic_file_llseek, /* Seek inside file. */ + .read = do_sync_read, /* Read from file. */ + .aio_read = generic_file_aio_read, /* Async read from file. */ +#ifdef NTFS_RW + .write = do_sync_write, /* Write to file. */ + .aio_write = ntfs_file_aio_write, /* Async write to file. */ + /*.release = ,*/ /* Last file is closed. See + fs/ext2/file.c:: + ext2_release_file() for + how to use this to discard + preallocated space for + write opened files. */ + .fsync = ntfs_file_fsync, /* Sync a file to disk. */ + /*.aio_fsync = ,*/ /* Sync all outstanding async + i/o operations on a + kiocb. */ +#endif /* NTFS_RW */ + /*.ioctl = ,*/ /* Perform function on the + mounted filesystem. */ + .mmap = generic_file_mmap, /* Mmap file. */ + .open = ntfs_file_open, /* Open file. */ + .splice_read = generic_file_splice_read /* Zero-copy data send with + the data source being on + the ntfs partition. We do + not need to care about the + data destination. */ + /*.sendpage = ,*/ /* Zero-copy data send with + the data destination being + on the ntfs partition. We + do not need to care about + the data source. */ +}; + +const struct inode_operations ntfs_file_inode_ops = { +#ifdef NTFS_RW + .truncate = ntfs_truncate_vfs, + .setattr = ntfs_setattr, +#endif /* NTFS_RW */ +}; + +const struct file_operations ntfs_empty_file_ops = {}; + +const struct inode_operations ntfs_empty_inode_ops = {}; diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c new file mode 100644 index 00000000..096c1356 --- /dev/null +++ b/fs/ntfs/index.c @@ -0,0 +1,454 @@ +/* + * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +#include "aops.h" +#include "collate.h" +#include "debug.h" +#include "index.h" +#include "ntfs.h" + +/** + * ntfs_index_ctx_get - allocate and initialize a new index context + * @idx_ni: ntfs index inode with which to initialize the context + * + * Allocate a new index context, initialize it with @idx_ni and return it. + * Return NULL if allocation failed. + * + * Locking: Caller must hold i_mutex on the index inode. + */ +ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) +{ + ntfs_index_context *ictx; + + ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); + if (ictx) + *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; + return ictx; +} + +/** + * ntfs_index_ctx_put - release an index context + * @ictx: index context to free + * + * Release the index context @ictx, releasing all associated resources. + * + * Locking: Caller must hold i_mutex on the index inode. 
+ */ +void ntfs_index_ctx_put(ntfs_index_context *ictx) +{ + if (ictx->entry) { + if (ictx->is_in_root) { + if (ictx->actx) + ntfs_attr_put_search_ctx(ictx->actx); + if (ictx->base_ni) + unmap_mft_record(ictx->base_ni); + } else { + struct page *page = ictx->page; + if (page) { + BUG_ON(!PageLocked(page)); + unlock_page(page); + ntfs_unmap_page(page); + } + } + } + kmem_cache_free(ntfs_index_ctx_cache, ictx); + return; +} + +/** + * ntfs_index_lookup - find a key in an index and return its index entry + * @key: [IN] key for which to search in the index + * @key_len: [IN] length of @key in bytes + * @ictx: [IN/OUT] context describing the index and the returned entry + * + * Before calling ntfs_index_lookup(), @ictx must have been obtained from a + * call to ntfs_index_ctx_get(). + * + * Look for the @key in the index specified by the index lookup context @ictx. + * ntfs_index_lookup() walks the contents of the index looking for the @key. + * + * If the @key is found in the index, 0 is returned and @ictx is setup to + * describe the index entry containing the matching @key. @ictx->entry is the + * index entry and @ictx->data and @ictx->data_len are the index entry data and + * its length in bytes, respectively. + * + * If the @key is not found in the index, -ENOENT is returned and @ictx is + * setup to describe the index entry whose key collates immediately after the + * search @key, i.e. this is the position in the index at which an index entry + * with a key of @key would need to be inserted. + * + * If an error occurs return the negative error code and @ictx is left + * untouched. + * + * When finished with the entry and its data, call ntfs_index_ctx_put() to free + * the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. + * + * Locking: - Caller must hold i_mutex on the index inode. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx) +{ + VCN vcn, old_vcn; + ntfs_inode *idx_ni = ictx->idx_ni; + ntfs_volume *vol = idx_ni->vol; + struct super_block *sb = vol->sb; + ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end, *kaddr; + ntfs_attr_search_ctx *actx; + struct address_space *ia_mapping; + struct page *page; + int rc, err = 0; + + ntfs_debug("Entering."); + BUG_ON(!NInoAttr(idx_ni)); + BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); + BUG_ON(idx_ni->nr_extents != -1); + BUG_ON(!base_ni); + BUG_ON(!key); + BUG_ON(key_len <= 0); + if (!ntfs_is_collation_rule_supported( + idx_ni->itype.index.collation_rule)) { + ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " + "Aborting lookup.", le32_to_cpu( + idx_ni->itype.index.collation_rule)); + return -EOPNOTSUPP; + } + /* Get hold of the mft record for the index inode. 
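+	 * (The lookup starts from the AT_INDEX_ROOT attribute, which is
+	 * searched for via the base inode, so it is base_ni's mft record
+	 * that gets mapped here.)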
*/ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return PTR_ERR(m); + } + actx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!actx)) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, actx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in inode " + "0x%lx.", idx_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it has been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)actx->attr + + le16_to_cpu(actx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)actx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->length) > index_end) + goto idx_err_out; + /* + * The last entry cannot contain a key. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Further bounds checks. */ + if ((u32)sizeof(INDEX_ENTRY_HEADER) + + le16_to_cpu(ie->key_length) > + le16_to_cpu(ie->data.vi.data_offset) || + (u32)le16_to_cpu(ie->data.vi.data_offset) + + le16_to_cpu(ie->data.vi.data_length) > + le16_to_cpu(ie->length)) + goto idx_err_out; + /* If the keys match perfectly, we setup @ictx and return 0. */ + if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, + &ie->key, key_len)) { +ir_done: + ictx->is_in_root = true; + ictx->ir = ir; + ictx->actx = actx; + ictx->base_ni = base_ni; + ictx->ia = NULL; + ictx->page = NULL; +done: + ictx->entry = ie; + ictx->data = (u8*)ie + + le16_to_cpu(ie->data.vi.data_offset); + ictx->data_len = le16_to_cpu(ie->data.vi.data_length); + ntfs_debug("Done."); + return err; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, + key_len, &ie->key, le16_to_cpu(ie->key_length)); + /* + * If @key collates before the key of the current entry, there + * is definitely no such key in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* + * A match should never happen as the memcmp() call should have + * cought it, but we still treat it correctly. + */ + if (!rc) + goto ir_done; + /* The keys are not equal, continue the search. */ + } + /* + * We have finished with this index without success. Check for the + * presence of a child node and if not present setup @ictx and return + * -ENOENT. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + ntfs_debug("Entry not found."); + err = -ENOENT; + goto ir_done; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(idx_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. 
Inode 0x%lx is corrupt or " + "driver bug.", idx_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(idx_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(actx); + unmap_mft_record(base_ni); + m = NULL; + actx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_CACHE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt inode " + "0x%lx or driver bug.", idx_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " + "Corrupt inode 0x%lx. Run chkdsk.", + (long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). Inode " + "0x%lx is corrupt or driver bug.", + (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " + "a size (%u) differing from the index " + "specified size (%u). Inode is corrupt or " + "driver bug.", (unsigned long long)vcn, + idx_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + idx_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + idx_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_CACHE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " + "crosses page boundary. Impossible! Cannot " + "access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + idx_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " + "0x%lx exceeds maximum size.", + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. 
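+		 * (Essentially the same three checks as in the index root
+		 * loop above: the entry must start inside the index block,
+		 * its fixed-size header must fit before index_end, and its
+		 * stated length must not run past index_end.)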
*/ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->length) > index_end) { + ntfs_error(sb, "Index entry out of bounds in inode " + "0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a key. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Further bounds checks. */ + if ((u32)sizeof(INDEX_ENTRY_HEADER) + + le16_to_cpu(ie->key_length) > + le16_to_cpu(ie->data.vi.data_offset) || + (u32)le16_to_cpu(ie->data.vi.data_offset) + + le16_to_cpu(ie->data.vi.data_length) > + le16_to_cpu(ie->length)) { + ntfs_error(sb, "Index entry out of bounds in inode " + "0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* If the keys match perfectly, we setup @ictx and return 0. */ + if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, + &ie->key, key_len)) { +ia_done: + ictx->is_in_root = false; + ictx->actx = NULL; + ictx->base_ni = NULL; + ictx->ia = ia; + ictx->page = page; + goto done; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, + key_len, &ie->key, le16_to_cpu(ie->key_length)); + /* + * If @key collates before the key of the current entry, there + * is definitely no such key in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* + * A match should never happen as the memcmp() call should have + * cought it, but we still treat it correctly. + */ + if (!rc) + goto ia_done; + /* The keys are not equal, continue the search. */ + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node and if not present return -ENOENT. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + ntfs_debug("Entry not found."); + err = -ENOENT; + goto ia_done; + } + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in a leaf " + "node in inode 0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* + * If vcn is in the same page cache page as old_vcn we recycle + * the mapped page. + */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_CACHE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in inode 0x%lx.", + idx_ni->mft_no); +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (actx) + ntfs_attr_put_search_ctx(actx); + if (m) + unmap_mft_record(base_ni); + return err; +idx_err_out: + ntfs_error(sb, "Corrupt index. Aborting lookup."); + goto err_out; +} diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h new file mode 100644 index 00000000..8745469c --- /dev/null +++ b/fs/ntfs/index.h @@ -0,0 +1,148 @@ +/* + * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS + * project. 
+ * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_INDEX_H +#define _LINUX_NTFS_INDEX_H + +#include + +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "attrib.h" +#include "mft.h" +#include "aops.h" + +/** + * @idx_ni: index inode containing the @entry described by this context + * @entry: index entry (points into @ir or @ia) + * @data: index entry data (points into @entry) + * @data_len: length in bytes of @data + * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia + * @ir: index root if @is_in_root and NULL otherwise + * @actx: attribute search context if @is_in_root and NULL otherwise + * @base_ni: base inode if @is_in_root and NULL otherwise + * @ia: index block if @is_in_root is 'false' and NULL otherwise + * @page: page if @is_in_root is 'false' and NULL otherwise + * + * @idx_ni is the index inode this context belongs to. + * + * @entry is the index entry described by this context. @data and @data_len + * are the index entry data and its length in bytes, respectively. @data + * simply points into @entry. This is probably what the user is interested in. + * + * If @is_in_root is 'true', @entry is in the index root attribute @ir described + * by the attribute search context @actx and the base inode @base_ni. @ia and + * @page are NULL in this case. + * + * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia + * and @page point to the index allocation block and the mapped, locked page it + * is in, respectively. @ir, @actx and @base_ni are NULL in this case. + * + * To obtain a context call ntfs_index_ctx_get(). + * + * We use this context to allow ntfs_index_lookup() to return the found index + * @entry and its @data without having to allocate a buffer and copy the @entry + * and/or its @data into it. + * + * When finished with the @entry and its @data, call ntfs_index_ctx_put() to + * free the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. 
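+ *
+ * A minimal usage sketch (assuming @idx_ni and a key/key_len are already at
+ * hand; error handling omitted):
+ *
+ *	ictx = ntfs_index_ctx_get(idx_ni);
+ *	err = ntfs_index_lookup(key, key_len, ictx);
+ *	if (!err) {
+ *		... use ictx->entry, ictx->data and ictx->data_len ...
+ *	}
+ *	ntfs_index_ctx_put(ictx);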
+ */ +typedef struct { + ntfs_inode *idx_ni; + INDEX_ENTRY *entry; + void *data; + u16 data_len; + bool is_in_root; + INDEX_ROOT *ir; + ntfs_attr_search_ctx *actx; + ntfs_inode *base_ni; + INDEX_ALLOCATION *ia; + struct page *page; +} ntfs_index_context; + +extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni); +extern void ntfs_index_ctx_put(ntfs_index_context *ictx); + +extern int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx); + +#ifdef NTFS_RW + +/** + * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries + * @ictx: ntfs index context describing the index entry + * + * Call flush_dcache_page() for the page in which an index entry resides. + * + * This must be called every time an index entry is modified, just after the + * modification. + * + * If the index entry is in the index root attribute, simply flush the page + * containing the mft record containing the index root attribute. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, simply flush the page cache page containing the index block. + */ +static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + flush_dcache_mft_record_page(ictx->actx->ntfs_ino); + else + flush_dcache_page(ictx->page); +} + +/** + * ntfs_index_entry_mark_dirty - mark an index entry dirty + * @ictx: ntfs index context describing the index entry + * + * Mark the index entry described by the index entry context @ictx dirty. + * + * If the index entry is in the index root attribute, simply mark the mft + * record containing the index root attribute dirty. This ensures the mft + * record, and hence the index root attribute, will be written out to disk + * later. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, mark the buffers belonging to the index record as well as the + * page cache page the index block is in dirty. This automatically marks the + * VFS inode of the ntfs index inode to which the index entry belongs dirty, + * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the + * dirty index block, will be written out to disk later. + */ +static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + mark_mft_record_dirty(ictx->actx->ntfs_ino); + else + mark_ntfs_record_dirty(ictx->page, + (u8*)ictx->ia - (u8*)page_address(ictx->page)); +} + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_INDEX_H */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c new file mode 100644 index 00000000..c05d6dcf --- /dev/null +++ b/fs/ntfs/inode.c @@ -0,0 +1,3112 @@ +/** + * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aops.h" +#include "attrib.h" +#include "bitmap.h" +#include "dir.h" +#include "debug.h" +#include "inode.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "time.h" +#include "ntfs.h" + +/** + * ntfs_test_inode - compare two (possibly fake) inodes for equality + * @vi: vfs inode which to test + * @na: ntfs attribute which is being tested with + * + * Compare the ntfs attribute embedded in the ntfs specific part of the vfs + * inode @vi for equality with the ntfs attribute @na. + * + * If searching for the normal file/directory inode, set @na->type to AT_UNUSED. + * @na->name and @na->name_len are then ignored. + * + * Return 1 if the attributes match and 0 if not. + * + * NOTE: This function runs with the inode->i_lock spin lock held so it is not + * allowed to sleep. + */ +int ntfs_test_inode(struct inode *vi, ntfs_attr *na) +{ + ntfs_inode *ni; + + if (vi->i_ino != na->mft_no) + return 0; + ni = NTFS_I(vi); + /* If !NInoAttr(ni), @vi is a normal file or directory inode. */ + if (likely(!NInoAttr(ni))) { + /* If not looking for a normal inode this is a mismatch. */ + if (unlikely(na->type != AT_UNUSED)) + return 0; + } else { + /* A fake inode describing an attribute. */ + if (ni->type != na->type) + return 0; + if (ni->name_len != na->name_len) + return 0; + if (na->name_len && memcmp(ni->name, na->name, + na->name_len * sizeof(ntfschar))) + return 0; + } + /* Match! */ + return 1; +} + +/** + * ntfs_init_locked_inode - initialize an inode + * @vi: vfs inode to initialize + * @na: ntfs attribute which to initialize @vi to + * + * Initialize the vfs inode @vi with the values from the ntfs attribute @na in + * order to enable ntfs_test_inode() to do its work. + * + * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. + * In that case, @na->name and @na->name_len should be set to NULL and 0, + * respectively. Although that is not strictly necessary as + * ntfs_read_locked_inode() will fill them in later. + * + * Return 0 on success and -errno on error. + * + * NOTE: This function runs with the inode->i_lock spin lock held so it is not + * allowed to sleep. (Hence the GFP_ATOMIC allocation.) + */ +static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) +{ + ntfs_inode *ni = NTFS_I(vi); + + vi->i_ino = na->mft_no; + + ni->type = na->type; + if (na->type == AT_INDEX_ALLOCATION) + NInoSetMstProtected(ni); + + ni->name = na->name; + ni->name_len = na->name_len; + + /* If initializing a normal inode, we are done. */ + if (likely(na->type == AT_UNUSED)) { + BUG_ON(na->name); + BUG_ON(na->name_len); + return 0; + } + + /* It is a fake inode. */ + NInoSetAttr(ni); + + /* + * We have I30 global constant as an optimization as it is the name + * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC + * allocation but that is ok. And most attributes are unnamed anyway, + * thus the fraction of named attributes with name != I30 is actually + * absolutely tiny. 
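+ * (I30 is the global constant holding the Unicode name "$I30" under which
+ * the directory index attributes are stored, which is why it covers nearly
+ * all named attributes encountered in practice.)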
+ */ + if (na->name_len && na->name != I30) { + unsigned int i; + + BUG_ON(!na->name); + i = na->name_len * sizeof(ntfschar); + ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); + if (!ni->name) + return -ENOMEM; + memcpy(ni->name, na->name, i); + ni->name[na->name_len] = 0; + } + return 0; +} + +typedef int (*set_t)(struct inode *, void *); +static int ntfs_read_locked_inode(struct inode *vi); +static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi); +static int ntfs_read_locked_index_inode(struct inode *base_vi, + struct inode *vi); + +/** + * ntfs_iget - obtain a struct inode corresponding to a specific normal inode + * @sb: super block of mounted volume + * @mft_no: mft record number / inode number to obtain + * + * Obtain the struct inode corresponding to a specific normal inode (i.e. a + * file or directory). + * + * If the inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and initialized, + * and finally ntfs_read_locked_inode() is called to read in the inode and + * fill in the remainder of the inode structure. + * + * Return the struct inode on success. Check the return value with IS_ERR() and + * if true, the function failed and the error code is obtained from PTR_ERR(). + */ +struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = mft_no; + na.type = AT_UNUSED; + na.name = NULL; + na.name_len = 0; + + vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode, + (set_t)ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_inode(vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad inodes around if the failure was + * due to ENOMEM. We want to be able to retry again later. + */ + if (unlikely(err == -ENOMEM)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_attr_iget - obtain a struct inode corresponding to an attribute + * @base_vi: vfs base inode containing the attribute + * @type: attribute type + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * + * Obtain the (fake) struct inode corresponding to the attribute specified by + * @type, @name, and @name_len, which is present in the base mft record + * specified by the vfs inode @base_vi. + * + * If the attribute inode is in the cache, it is just returned with an + * increased reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_attr_inode() is called to read the + * attribute and fill in the inode structure. + * + * Note, for index allocation attributes, you need to use ntfs_index_iget() + * instead of ntfs_attr_iget() as working with indices is a lot more complex. + * + * Return the struct inode of the attribute inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, + ntfschar *name, u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + /* Make sure no one calls ntfs_attr_iget() for indices. 
*/ + BUG_ON(type == AT_INDEX_ALLOCATION); + + na.mft_no = base_vi->i_ino; + na.type = type; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, + (set_t)ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_attr_inode(base_vi, vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad attribute inodes around. This also + * simplifies things in that we never need to check for bad attribute + * inodes elsewhere. + */ + if (unlikely(err)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_index_iget - obtain a struct inode corresponding to an index + * @base_vi: vfs base inode containing the index related attributes + * @name: Unicode name of the index + * @name_len: length of @name in Unicode characters + * + * Obtain the (fake) struct inode corresponding to the index specified by @name + * and @name_len, which is present in the base mft record specified by the vfs + * inode @base_vi. + * + * If the index inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_index_inode() is called to read + * the index related attributes and fill in the inode structure. + * + * Return the struct inode of the index inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, + u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = base_vi->i_ino; + na.type = AT_INDEX_ALLOCATION; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, + (set_t)ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_index_inode(base_vi, vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad index inodes around. This also + * simplifies things in that we never need to check for bad index + * inodes elsewhere. 
+ */ + if (unlikely(err)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +struct inode *ntfs_alloc_big_inode(struct super_block *sb) +{ + ntfs_inode *ni; + + ntfs_debug("Entering."); + ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS); + if (likely(ni != NULL)) { + ni->state = 0; + return VFS_I(ni); + } + ntfs_error(sb, "Allocation of NTFS big inode structure failed."); + return NULL; +} + +static void ntfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); +} + +void ntfs_destroy_big_inode(struct inode *inode) +{ + ntfs_inode *ni = NTFS_I(inode); + + ntfs_debug("Entering."); + BUG_ON(ni->page); + if (!atomic_dec_and_test(&ni->count)) + BUG(); + call_rcu(&inode->i_rcu, ntfs_i_callback); +} + +static inline ntfs_inode *ntfs_alloc_extent_inode(void) +{ + ntfs_inode *ni; + + ntfs_debug("Entering."); + ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS); + if (likely(ni != NULL)) { + ni->state = 0; + return ni; + } + ntfs_error(NULL, "Allocation of NTFS inode structure failed."); + return NULL; +} + +static void ntfs_destroy_extent_inode(ntfs_inode *ni) +{ + ntfs_debug("Entering."); + BUG_ON(ni->page); + if (!atomic_dec_and_test(&ni->count)) + BUG(); + kmem_cache_free(ntfs_inode_cache, ni); +} + +/* + * The attribute runlist lock has separate locking rules from the + * normal runlist lock, so split the two lock-classes: + */ +static struct lock_class_key attr_list_rl_lock_class; + +/** + * __ntfs_init_inode - initialize ntfs specific part of an inode + * @sb: super block of mounted volume + * @ni: freshly allocated ntfs inode which to initialize + * + * Initialize an ntfs inode to defaults. + * + * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left + * untouched. Make sure to initialize them elsewhere. + * + * Return zero on success and -ENOMEM on error. + */ +void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni) +{ + ntfs_debug("Entering."); + rwlock_init(&ni->size_lock); + ni->initialized_size = ni->allocated_size = 0; + ni->seq_no = 0; + atomic_set(&ni->count, 1); + ni->vol = NTFS_SB(sb); + ntfs_init_runlist(&ni->runlist); + mutex_init(&ni->mrec_lock); + ni->page = NULL; + ni->page_ofs = 0; + ni->attr_list_size = 0; + ni->attr_list = NULL; + ntfs_init_runlist(&ni->attr_list_rl); + lockdep_set_class(&ni->attr_list_rl.lock, + &attr_list_rl_lock_class); + ni->itype.index.block_size = 0; + ni->itype.index.vcn_size = 0; + ni->itype.index.collation_rule = 0; + ni->itype.index.block_size_bits = 0; + ni->itype.index.vcn_size_bits = 0; + mutex_init(&ni->extent_lock); + ni->nr_extents = 0; + ni->ext.base_ntfs_ino = NULL; +} + +/* + * Extent inodes get MFT-mapped in a nested way, while the base inode + * is still mapped. 
Teach this nesting to the lock validator by creating + * a separate class for nested inode's mrec_lock's: + */ +static struct lock_class_key extent_inode_mrec_lock_key; + +inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, + unsigned long mft_no) +{ + ntfs_inode *ni = ntfs_alloc_extent_inode(); + + ntfs_debug("Entering."); + if (likely(ni != NULL)) { + __ntfs_init_inode(sb, ni); + lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key); + ni->mft_no = mft_no; + ni->type = AT_UNUSED; + ni->name = NULL; + ni->name_len = 0; + } + return ni; +} + +/** + * ntfs_is_extended_system_file - check if a file is in the $Extend directory + * @ctx: initialized attribute search context + * + * Search all file name attributes in the inode described by the attribute + * search context @ctx and check if any of the names are in the $Extend system + * directory. + * + * Return values: + * 1: file is in $Extend directory + * 0: file is not in $Extend directory + * -errno: failed to determine if the file is in the $Extend directory + */ +static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx) +{ + int nr_links, err; + + /* Restart search. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Get number of hard links. */ + nr_links = le16_to_cpu(ctx->mrec->link_count); + + /* Loop through all hard links. */ + while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, + ctx))) { + FILE_NAME_ATTR *file_name_attr; + ATTR_RECORD *attr = ctx->attr; + u8 *p, *p2; + + nr_links--; + /* + * Maximum sanity checking as we are called on an inode that + * we suspect might be corrupt. + */ + p = (u8*)attr + le32_to_cpu(attr->length); + if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_in_use)) { +err_corrupt_attr: + ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name " + "attribute. You should run chkdsk."); + return -EIO; + } + if (attr->non_resident) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file " + "name. You should run chkdsk."); + return -EIO; + } + if (attr->flags) { + ntfs_error(ctx->ntfs_ino->vol->sb, "File name with " + "invalid flags. You should run " + "chkdsk."); + return -EIO; + } + if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file " + "name. You should run chkdsk."); + return -EIO; + } + file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + + le16_to_cpu(attr->data.resident.value_offset)); + p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length); + if (p2 < (u8*)attr || p2 > p) + goto err_corrupt_attr; + /* This attribute is ok, but is it in the $Extend directory? */ + if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend) + return 1; /* YES, it's an extended system file. */ + } + if (unlikely(err != -ENOENT)) + return err; + if (unlikely(nr_links)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count " + "doesn't match number of name attributes. You " + "should run chkdsk."); + return -EIO; + } + return 0; /* NO, it is not an extended system file. */ +} + +/** + * ntfs_read_locked_inode - read an inode from its device + * @vi: inode to read + * + * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode + * described by @vi into memory from the device. + * + * The only fields in @vi that we need to/can look at when the function is + * called are i_sb, pointing to the mounted device's super block, and i_ino, + * the number of the inode to load. 
+ * + * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino + * for reading and sets up the necessary @vi fields as well as initializing + * the ntfs inode. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * i_flags is set to 0 and we have no business touching it. Only an ioctl() + * is allowed to write to them. We should of course be honouring them but + * we need to do that using the IS_* macros defined in include/linux/fs.h. + * In any case ntfs_read_locked_inode() has nothing to do with i_flags. + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + */ +static int ntfs_read_locked_inode(struct inode *vi) +{ + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + STANDARD_INFORMATION *si; + ntfs_attr_search_ctx *ctx; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + + /* Setup the generic vfs inode parts now. */ + + /* + * This is for checking whether an inode has changed w.r.t. a file so + * that the file can be updated if necessary (compare with f_version). + */ + vi->i_version = 1; + + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + vi->i_mode = 0; + + /* + * Initialize the ntfs specific part of @vi special casing + * FILE_MFT which we need to do at mount time. + */ + if (vi->i_ino != FILE_MFT) + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + + if (!(m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vi->i_sb, "Inode is not in use!"); + goto unm_err_out; + } + if (m->base_mft_record) { + ntfs_error(vi->i_sb, "Inode is an extent inode!"); + goto unm_err_out; + } + + /* Transfer information from mft record into vfs and ntfs inodes. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* + * FIXME: Keep in mind that link_count is two for files which have both + * a long file name and a short file name as separate entries, so if + * we are hiding short file names this will be too high. Either we need + * to account for the short file names by subtracting them or we need + * to make sure we delete files even though i_nlink is not zero which + * might be tricky due to vfs interactions. Need to think about this + * some more when implementing the unlink command. + */ + vi->i_nlink = le16_to_cpu(m->link_count); + /* + * FIXME: Reparse points can have the directory bit set even though + * they would be S_IFLNK. Need to deal with this further below when we + * implement reparse points / symbolic links but it will do for now. + * Also if not a directory, it could be something else, rather than + * a regular file. But again, will do for now. + */ + /* Everyone gets all permissions. */ + vi->i_mode |= S_IRWXUGO; + /* If read-only, no one gets write permissions. */ + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + if (m->flags & MFT_RECORD_IS_DIRECTORY) { + vi->i_mode |= S_IFDIR; + /* + * Apply the directory permissions mask set in the mount + * options. + */ + vi->i_mode &= ~vol->dmask; + /* Things break without this kludge! */ + if (vi->i_nlink > 1) + vi->i_nlink = 1; + } else { + vi->i_mode |= S_IFREG; + /* Apply the file permissions mask set in the mount options. 
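+ * (Purely as a worked example: starting from the S_IRWXUGO base mode of 0777
+ * set above, a hypothetical mount with fmask=022 would leave a regular file
+ * with mode 0777 & ~022 = 0755.)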
*/ + vi->i_mode &= ~vol->fmask; + } + /* + * Find the standard information attribute in the mft record. At this + * stage we haven't setup the attribute list stuff yet, so this could + * in fact fail if the standard information is in an extent record, but + * I don't think this actually ever happens. + */ + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + /* + * TODO: We should be performing a hot fix here (if the + * recover mount option is set) by creating a new + * attribute. + */ + ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Get the standard information attribute value. */ + si = (STANDARD_INFORMATION*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + + /* Transfer information from the standard information into vi. */ + /* + * Note: The i_?times do not quite map perfectly onto the NTFS times, + * but they are close enough, and in the end it doesn't really matter + * that much... + */ + /* + * mtime is the last change of the data within the file. Not changed + * when only metadata is changed, e.g. a rename doesn't affect mtime. + */ + vi->i_mtime = ntfs2utc(si->last_data_change_time); + /* + * ctime is the last change of the metadata of the file. This obviously + * always changes, when mtime is changed. ctime can be changed on its + * own, mtime is then not changed, e.g. when a file is renamed. + */ + vi->i_ctime = ntfs2utc(si->last_mft_change_time); + /* + * Last access to the data within the file. Not changed during a rename + * for example but changed whenever the file is written to. + */ + vi->i_atime = ntfs2utc(si->last_access_time); + + /* Find the attribute list attribute if present. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(vi->i_sb, "Failed to lookup attribute list " + "attribute."); + goto unm_err_out; + } + } else /* if (!err) */ { + if (vi->i_ino == FILE_MFT) + goto skip_attr_list_load; + ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Attribute list attribute is " + "compressed."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(vi->i_sb, "Non-resident attribute " + "list attribute is encrypted/" + "sparse."); + goto unm_err_out; + } + ntfs_warning(vi->i_sb, "Resident attribute list " + "attribute in inode 0x%lx is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set.", + vi->i_ino); + } + /* Now allocate memory for the attribute list. */ + ni->attr_list_size = (u32)ntfs_attr_size(a); + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(vi->i_sb, "Not enough memory to allocate " + "buffer for attribute list."); + err = -ENOMEM; + goto unm_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "Attribute list has non " + "zero lowest_vcn."); + goto unm_err_out; + } + /* + * Setup the runlist. No need for locking as we have + * exclusive access to the inode at this time. 
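+ * Loosely speaking, ntfs_mapping_pairs_decompress() below turns the on-disk
+ * mapping pairs array of this attribute into an in-memory runlist, i.e. an
+ * array of elements each mapping a contiguous run of the attribute's virtual
+ * clusters to logical clusters on the volume.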
+ */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(vi->i_sb, "Mapping pairs " + "decompression failed."); + goto unm_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data.non_resident. + initialized_size)))) { + ntfs_error(vi->i_sb, "Failed to load " + "attribute list attribute."); + goto unm_err_out; + } + } else /* if (!a->non_resident) */ { + if ((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt attribute list " + "in inode."); + goto unm_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + } +skip_attr_list_load: + /* + * If an attribute list is present we now have the attribute list value + * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes. + */ + if (S_ISDIR(vi->i_mode)) { + loff_t bvi_size; + ntfs_inode *bni; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + + /* It is a directory, find index root attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + // FIXME: File is corrupt! Hot-fix with empty + // index root attribute if recovery option is + // set. + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not " + "resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is " + "placed after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted index root just means that the newly + * created files in that directory should be created compressed/ + * encrypted. However index root cannot be both compressed and + * encrypted. 
+ */ + if (a->flags & ATTR_COMPRESSION_MASK) + NInoSetCompressed(ni); + if (a->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + ir = (INDEX_ROOT*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Directory index is corrupt."); + goto unm_err_out; + } + if (ir->type != AT_FILE_NAME) { + ntfs_error(vi->i_sb, "Indexed attribute is not " + "$FILE_NAME."); + goto unm_err_out; + } + if (ir->collation_rule != COLLATION_FILE_NAME) { + ntfs_error(vi->i_sb, "Index collation rule is not " + "COLLATION_FILE_NAME."); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (ni->itype.index.block_size & + (ni->itype.index.block_size - 1)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a " + "power of two.", + ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_CACHE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > " + "PAGE_CACHE_SIZE (%ld) is not " + "supported. Sorry.", + ni->itype.index.block_size, + PAGE_CACHE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < " + "NTFS_BLOCK_SIZE (%i) is not " + "supported. Sorry.", + ni->itype.index.block_size, + NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = + ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the directory index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + + /* Setup the index allocation attribute, even if not present. */ + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + goto skip_large_dir_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. 
*/ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION " + "attribute is not present but " + "$INDEX_ROOT indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION " + "attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs + * array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name " + "is placed after the mapping pairs " + "array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of " + "$INDEX_ALLOCATION attribute has non " + "zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. */ + bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4); + if (IS_ERR(bvi)) { + ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bvi); + goto unm_err_out; + } + bni = NTFS_I(bvi); + if (NInoCompressed(bni) || NInoEncrypted(bni) || + NInoSparse(bni)) { + ntfs_error(vi->i_sb, "$BITMAP attribute is compressed " + "and/or encrypted and/or sparse."); + goto iput_unm_err_out; + } + /* Consistency check bitmap size vs. index allocation size. */ + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> + ni->itype.index.block_size_bits)) { + ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " + "for index allocation (0x%llx).", + bvi_size << 3, vi->i_size); + goto iput_unm_err_out; + } + /* No longer need the bitmap attribute inode. */ + iput(bvi); +skip_large_dir_stuff: + /* Setup the operations for this inode. */ + vi->i_op = &ntfs_dir_inode_ops; + vi->i_fop = &ntfs_dir_ops; + } else { + /* It is a file. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Setup the data attribute, even if not present. */ + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + + /* Find first extent of the unnamed data attribute. */ + err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx); + if (unlikely(err)) { + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + if (err != -ENOENT) { + ntfs_error(vi->i_sb, "Failed to lookup $DATA " + "attribute."); + goto unm_err_out; + } + /* + * FILE_Secure does not have an unnamed $DATA + * attribute, so we special case it here. 
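+ * (Assumed background, not stated here: $Secure keeps its data in the named
+ * $SDS data stream and in the $SDH and $SII indexes rather than in an
+ * unnamed $DATA attribute.)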
+ */ + if (vi->i_ino == FILE_Secure) + goto no_data_attr_special_case; + /* + * Most if not all the system files in the $Extend + * system directory do not have unnamed data + * attributes so we need to check if the parent + * directory of the file is FILE_Extend and if it is + * ignore this error. To do this we need to get the + * name of this inode from the mft record as the name + * contains the back reference to the parent directory. + */ + if (ntfs_is_extended_system_file(ctx) > 0) + goto no_data_attr_special_case; + // FIXME: File is corrupt! Hot-fix with empty data + // attribute if recovery option is set. + ntfs_error(vi->i_sb, "$DATA attribute is missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Setup the state. */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found " + "compressed data but " + "compression is " + "disabled due to " + "cluster size (%i) > " + "4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) + != ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method " + "or corrupt file."); + goto unm_err_out; + } + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed data."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->non_resident) { + NInoSetNonResident(ni); + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found " + "non-standard " + "compression unit (%u " + "instead of 4). " + "Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype. + compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = + 1U << a->data. + non_resident. + compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = + 0; + ni->itype.compressed.block_clusters = + 0; + } + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident. + compressed_size); + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + } else { /* Resident attribute. */ + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu( + a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident data attribute " + "is corrupt (size exceeds " + "allocation)."); + goto unm_err_out; + } + } +no_data_attr_special_case: + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Setup the operations for this inode. 
*/ + vi->i_op = &ntfs_file_inode_ops; + vi->i_fop = &ntfs_file_ops; + } + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else + vi->i_mapping->a_ops = &ntfs_aops; + /* + * The number of 512-byte blocks used on disk (for stat). This is in so + * far inaccurate as it doesn't account for any named streams or other + * special non-resident attributes, but that is how Windows works, too, + * so we are at least consistent with Windows, if not entirely + * consistent with the Linux Way. Doing it the Linux Way would cause a + * significant slowdown as it would involve iterating over all + * attributes in the mft record and adding the allocated/compressed + * sizes of all non-resident attributes present to give us the Linux + * correct size that should go into i_blocks (after division by 512). + */ + if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni))) + vi->i_blocks = ni->itype.compressed.size >> 9; + else + vi->i_blocks = ni->allocated_size >> 9; + ntfs_debug("Done."); + return 0; +iput_unm_err_out: + iput(bvi); +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt " + "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino); + make_bad_inode(vi); + if (err != -EOPNOTSUPP && err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_read_locked_attr_inode - read an attribute inode from its base inode + * @base_vi: base inode + * @vi: attribute inode to read + * + * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the + * attribute inode described by @vi into memory from the base mft record + * described by @base_ni. + * + * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for + * reading and looks up the attribute described by @vi before setting up the + * necessary fields in @vi as well as initializing the ntfs inode. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + * + * Note this cannot be called for AT_INDEX_ALLOCATION. + */ +static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) +{ + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni, *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + + ntfs_init_big_inode(vi); + + ni = NTFS_I(vi); + base_ni = NTFS_I(base_vi); + + /* Just mirror the values from the base inode. */ + vi->i_version = base_vi->i_version; + vi->i_uid = base_vi->i_uid; + vi->i_gid = base_vi->i_gid; + vi->i_nlink = base_vi->i_nlink; + vi->i_mtime = base_vi->i_mtime; + vi->i_ctime = base_vi->i_ctime; + vi->i_atime = base_vi->i_atime; + vi->i_generation = ni->seq_no = base_ni->seq_no; + + /* Set inode type to zero but preserve permissions. */ + vi->i_mode = base_vi->i_mode & ~S_IFMT; + + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + /* Find the attribute. 
*/ + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto unm_err_out; + a = ctx->attr; + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if ((ni->type != AT_DATA) || (ni->type == AT_DATA && + ni->name_len)) { + ntfs_error(vi->i_sb, "Found compressed " + "non-data or named data " + "attribute. Please report " + "you saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto unm_err_out; + } + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found compressed " + "attribute but compression is " + "disabled due to cluster size " + "(%i) > 4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) != + ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method."); + goto unm_err_out; + } + } + /* + * The compressed/sparse flag set in an index root just means + * to compress all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is %s. Please " + "report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net", + NInoCompressed(ni) ? "compressed" : + "sparse"); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and compressed " + "data."); + goto unm_err_out; + } + /* + * The encryption flag set in an index root just means to + * encrypt all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is encrypted. " + "Please report you saw this message " + "to linux-ntfs-dev@lists.sourceforge." + "net"); + goto unm_err_out; + } + if (ni->type != AT_DATA) { + ntfs_error(vi->i_sb, "Found encrypted non-data " + "attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (!a->non_resident) { + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the attribute value."); + goto unm_err_out; + } + if (NInoMstProtected(ni)) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is resident. " + "Please report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net"); + goto unm_err_out; + } + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident attribute is corrupt " + "(size exceeds allocation)."); + goto unm_err_out; + } + } else { + NInoSetNonResident(ni); + /* + * Ensure the attribute name is placed before the mapping pairs + * array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the mapping pairs array."); + goto unm_err_out; + } + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found non-standard " + "compression unit (%u instead " + "of 4). 
Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident. + compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident.compressed_size); + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of attribute has " + "non-zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + } + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else + vi->i_mapping->a_ops = &ntfs_aops; + if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT) + vi->i_blocks = ni->itype.compressed.size >> 9; + else + vi->i_blocks = ni->allocated_size >> 9; + /* + * Make sure the base inode does not go away and attach it to the + * attribute inode. + */ + igrab(base_vi); + ni->ext.base_ntfs_ino = base_ni; + ni->nr_extents = -1; + + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + + ntfs_debug("Done."); + return 0; + +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i while reading attribute " + "inode (mft_no 0x%lx, type 0x%x, name_len %i). " + "Marking corrupt inode and base inode 0x%lx as bad. " + "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len, + base_vi->i_ino); + make_bad_inode(vi); + if (err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_read_locked_index_inode - read an index inode from its base inode + * @base_vi: base inode + * @vi: index inode to read + * + * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the + * index inode described by @vi into memory from the base mft record described + * by @base_ni. + * + * ntfs_read_locked_index_inode() maps, pins and locks the base inode for + * reading and looks up the attributes relating to the index described by @vi + * before setting up the necessary fields in @vi as well as initializing the + * ntfs inode. + * + * Note, index inodes are essentially attribute inodes (NInoAttr() is true) + * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they + * are setup like directory inodes since directories are a special case of + * indices ao they need to be treated in much the same way. Most importantly, + * for small indices the index allocation attribute might not actually exist. + * However, the index root attribute always exists but this does not need to + * have an inode associated with it and this is why we define a new inode type + * index. Also, like for directories, we need to have an attribute inode for + * the bitmap attribute corresponding to the index allocation attribute and we + * can store this in the appropriate field of the inode, just like we do for + * normal directory inodes. + * + * Q: What locks are held when the function is called? 
+ * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + */ +static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) +{ + loff_t bvi_size; + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni, *base_ni, *bni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + base_ni = NTFS_I(base_vi); + /* Just mirror the values from the base inode. */ + vi->i_version = base_vi->i_version; + vi->i_uid = base_vi->i_uid; + vi->i_gid = base_vi->i_gid; + vi->i_nlink = base_vi->i_nlink; + vi->i_mtime = base_vi->i_mtime; + vi->i_ctime = base_vi->i_ctime; + vi->i_atime = base_vi->i_atime; + vi->i_generation = ni->seq_no = base_ni->seq_no; + /* Set inode type to zero but preserve permissions. */ + vi->i_mode = base_vi->i_mode & ~S_IFMT; + /* Map the mft record for the base inode. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + /* Find the index root attribute. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed " + "after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted/sparse index root is not allowed, except for + * directories of course but those are not dealt with here. 
+ */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | + ATTR_IS_SPARSE)) { + ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index " + "root attribute."); + goto unm_err_out; + } + ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Index is corrupt."); + goto unm_err_out; + } + if (ir->type) { + ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).", + le32_to_cpu(ir->type)); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ntfs_debug("Index collation rule is 0x%x.", + le32_to_cpu(ir->collation_rule)); + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (!is_power_of_2(ni->itype.index.block_size)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " + "two.", ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_CACHE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE " + "(%ld) is not supported. Sorry.", + ni->itype.index.block_size, PAGE_CACHE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE " + "(%i) is not supported. Sorry.", + ni->itype.index.block_size, NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + /* Check for presence of index allocation attribute. */ + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + goto skip_large_index_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "not present but $INDEX_ROOT " + "indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs array. 
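+ * (In a well-formed non-resident attribute record the Unicode name, if any,
+ * sits between the attribute header and the mapping pairs array, so a name
+ * starting at or beyond mapping_pairs_offset indicates corruption.)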
+ */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is " + "placed after the mapping pairs array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION " + "attribute has non zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. */ + bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len); + if (IS_ERR(bvi)) { + ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bvi); + goto unm_err_out; + } + bni = NTFS_I(bvi); + if (NInoCompressed(bni) || NInoEncrypted(bni) || + NInoSparse(bni)) { + ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or " + "encrypted and/or sparse."); + goto iput_unm_err_out; + } + /* Consistency check bitmap size vs. index allocation size. */ + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) { + ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for " + "index allocation (0x%llx).", bvi_size << 3, + vi->i_size); + goto iput_unm_err_out; + } + iput(bvi); +skip_large_index_stuff: + /* Setup the operations for this index inode. */ + vi->i_op = NULL; + vi->i_fop = NULL; + vi->i_mapping->a_ops = &ntfs_mst_aops; + vi->i_blocks = ni->allocated_size >> 9; + /* + * Make sure the base inode doesn't go away and attach it to the + * index inode. + */ + igrab(base_vi); + ni->ext.base_ntfs_ino = base_ni; + ni->nr_extents = -1; + + ntfs_debug("Done."); + return 0; +iput_unm_err_out: + iput(bvi); +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); +err_out: + ntfs_error(vi->i_sb, "Failed with error code %i while reading index " + "inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino, + ni->name_len); + make_bad_inode(vi); + if (err != -EOPNOTSUPP && err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/* + * The MFT inode has special locking, so teach the lock validator + * about this by splitting off the locking rules of the MFT from + * the locking rules of other inodes. The MFT inode can never be + * accessed from the VFS side (or even internally), only by the + * map_mft functions. + */ +static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key; + +/** + * ntfs_read_inode_mount - special read_inode for mount time use only + * @vi: inode to read + * + * Read inode FILE_MFT at mount time, only called with super_block lock + * held from within the read_super() code path. 
+ * + * This function exists because when it is called the page cache for $MFT/$DATA + * is not initialized and hence we cannot get at the contents of mft records + * by calling map_mft_record*(). + * + * Further it needs to cope with the circular references problem, i.e. cannot + * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because + * we do not know where the other extent mft records are yet and again, because + * we cannot call map_mft_record*() yet. Obviously this applies only when an + * attribute list is actually present in $MFT inode. + * + * We solve these problems by starting with the $DATA attribute before anything + * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each + * extent is found, we ntfs_mapping_pairs_decompress() including the implied + * ntfs_runlists_merge(). Each step of the iteration necessarily provides + * sufficient information for the next step to complete. + * + * This should work but there are two possible pit falls (see inline comments + * below), but only time will tell if they are real pits or just smoke... + */ +int ntfs_read_inode_mount(struct inode *vi) +{ + VCN next_vcn, last_vcn, highest_vcn; + s64 block; + struct super_block *sb = vi->i_sb; + ntfs_volume *vol = NTFS_SB(sb); + struct buffer_head *bh; + ntfs_inode *ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + unsigned int i, nr_blocks; + int err; + + ntfs_debug("Entering."); + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + + ni = NTFS_I(vi); + + /* Setup the data attribute. It is special as it is mst protected. */ + NInoSetNonResident(ni); + NInoSetMstProtected(ni); + NInoSetSparseDisabled(ni); + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + /* + * This sets up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + ni->itype.index.block_size = vol->mft_record_size; + ni->itype.index.block_size_bits = vol->mft_record_size_bits; + + /* Very important! Needed to be able to call map_mft_record*(). */ + vol->mft_ino = vi; + + /* Allocate enough memory to read the first mft record. */ + if (vol->mft_record_size > 64 * 1024) { + ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).", + vol->mft_record_size); + goto err_out; + } + i = vol->mft_record_size; + if (i < sb->s_blocksize) + i = sb->s_blocksize; + m = (MFT_RECORD*)ntfs_malloc_nofs(i); + if (!m) { + ntfs_error(sb, "Failed to allocate buffer for $MFT record 0."); + goto err_out; + } + + /* Determine the first block of the $MFT/$DATA attribute. */ + block = vol->mft_lcn << vol->cluster_size_bits >> + sb->s_blocksize_bits; + nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits; + if (!nr_blocks) + nr_blocks = 1; + + /* Load $MFT/$DATA's first mft record. */ + for (i = 0; i < nr_blocks; i++) { + bh = sb_bread(sb, block++); + if (!bh) { + ntfs_error(sb, "Device read failed."); + goto err_out; + } + memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data, + sb->s_blocksize); + brelse(bh); + } + + /* Apply the mst fixups. */ + if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { + /* FIXME: Try to use the $MFTMirr now. */ + ntfs_error(sb, "MST fixup failed. $MFT is corrupt."); + goto err_out; + } + + /* Need this to sanity check attribute list references to $MFT. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* Provides readpage() and sync_page() for map_mft_record(). 
*/ + vi->i_mapping->a_ops = &ntfs_mst_aops; + + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + + /* Find the attribute list attribute if present. */ + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(sb, "Failed to lookup attribute list " + "attribute. You should run chkdsk."); + goto put_err_out; + } + } else /* if (!err) */ { + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_end; + static const char *es = " Not allowed. $MFT is corrupt. " + "You should run chkdsk."; + + ntfs_debug("Attribute list attribute found in $MFT."); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(sb, "Attribute list attribute is " + "compressed.%s", es); + goto put_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(sb, "Non-resident attribute list " + "attribute is encrypted/" + "sparse.%s", es); + goto put_err_out; + } + ntfs_warning(sb, "Resident attribute list attribute " + "in $MFT system file is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set."); + } + /* Now allocate memory for the attribute list. */ + ni->attr_list_size = (u32)ntfs_attr_size(a); + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(sb, "Not enough memory to allocate buffer " + "for attribute list."); + goto put_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "Attribute list has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Setup the runlist. */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(sb, "Mapping pairs decompression " + "failed with error code %i.", + -err); + goto put_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data. + non_resident.initialized_size)))) { + ntfs_error(sb, "Failed to load attribute list " + "attribute with error code %i.", + -err); + goto put_err_out; + } + } else /* if (!ctx.attr->non_resident) */ { + if ((u8*)a + le16_to_cpu( + a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(sb, "Corrupt attribute list " + "attribute."); + goto put_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + /* The attribute list is now setup in memory. */ + /* + * FIXME: I don't know if this case is actually possible. + * According to logic it is not possible but I have seen too + * many weird things in MS software to rely on logic... Thus we + * perform a manual search and make sure the first $MFT/$DATA + * extent is in the base inode. If it is not we abort with an + * error and if we ever see a report of this error we will need + * to do some magic in order to have the necessary mft record + * loaded and in the right place in the page cache. 
But + * hopefully logic will prevail and this never happens... + */ + al_entry = (ATTR_LIST_ENTRY*)ni->attr_list; + al_end = (u8*)al_entry + ni->attr_list_size; + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < ni->attr_list || + (u8*)al_entry > al_end) + goto em_put_err_out; + /* Catch the end of the attribute list. */ + if ((u8*)al_entry == al_end) + goto em_put_err_out; + if (!al_entry->length) + goto em_put_err_out; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + goto em_put_err_out; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) + goto em_put_err_out; + if (AT_DATA != al_entry->type) + continue; + /* We want an unnamed attribute. */ + if (al_entry->name_length) + goto em_put_err_out; + /* Want the first entry, i.e. lowest_vcn == 0. */ + if (al_entry->lowest_vcn) + goto em_put_err_out; + /* First entry has to be in the base mft record. */ + if (MREF_LE(al_entry->mft_reference) != vi->i_ino) { + /* MFT references do not match, logic fails. */ + ntfs_error(sb, "BUG: The first $DATA extent " + "of $MFT is not in the base " + "mft record. Please report " + "you saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto put_err_out; + } else { + /* Sequence numbers must match. */ + if (MSEQNO_LE(al_entry->mft_reference) != + ni->seq_no) + goto em_put_err_out; + /* Got it. All is ok. We can stop now. */ + break; + } + } + } + + ntfs_attr_reinit_search_ctx(ctx); + + /* Now load all attribute extents. */ + a = NULL; + next_vcn = last_vcn = highest_vcn = 0; + while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0, + ctx))) { + runlist_element *nrl; + + /* Cache the current attribute. */ + a = ctx->attr; + /* $MFT must be non-resident. */ + if (!a->non_resident) { + ntfs_error(sb, "$MFT must be non-resident but a " + "resident extent was found. $MFT is " + "corrupt. Run chkdsk."); + goto put_err_out; + } + /* $MFT must be uncompressed and unencrypted. */ + if (a->flags & ATTR_COMPRESSION_MASK || + a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + ntfs_error(sb, "$MFT must be uncompressed, " + "non-sparse, and unencrypted but a " + "compressed/sparse/encrypted extent " + "was found. $MFT is corrupt. Run " + "chkdsk."); + goto put_err_out; + } + /* + * Decompress the mapping pairs array of this extent and merge + * the result into the existing runlist. No need for locking + * as we have exclusive access to the inode at this time and we + * are a mount in progress task, too. + */ + nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); + if (IS_ERR(nrl)) { + ntfs_error(sb, "ntfs_mapping_pairs_decompress() " + "failed with error code %ld. $MFT is " + "corrupt.", PTR_ERR(nrl)); + goto put_err_out; + } + ni->runlist.rl = nrl; + + /* Are we in the first extent? */ + if (!next_vcn) { + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Get the last vcn in the $DATA attribute. */ + last_vcn = sle64_to_cpu( + a->data.non_resident.allocated_size) + >> vol->cluster_size_bits; + /* Fill in the inode size. 
*/ + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * Verify the number of mft records does not exceed + * 2^32 - 1. + */ + if ((vi->i_size >> vol->mft_record_size_bits) >= + (1ULL << 32)) { + ntfs_error(sb, "$MFT is too big! Aborting."); + goto put_err_out; + } + /* + * We have got the first extent of the runlist for + * $MFT which means it is now relatively safe to call + * the normal ntfs_read_inode() function. + * Complete reading the inode, this will actually + * re-read the mft record for $MFT, this time entering + * it into the page cache with which we complete the + * kick start of the volume. It should be safe to do + * this now as the first extent of $MFT/$DATA is + * already known and we would hope that we don't need + * further extents in order to find the other + * attributes belonging to $MFT. Only time will tell if + * this is really the case. If not we will have to play + * magic at this point, possibly duplicating a lot of + * ntfs_read_inode() at this point. We will need to + * ensure we do enough of its work to be able to call + * ntfs_read_inode() on extents of $MFT/$DATA. But lets + * hope this never happens... + */ + ntfs_read_locked_inode(vi); + if (is_bad_inode(vi)) { + ntfs_error(sb, "ntfs_read_inode() of $MFT " + "failed. BUG or corrupt $MFT. " + "Run chkdsk and if no errors " + "are found, please report you " + "saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + ntfs_attr_put_search_ctx(ctx); + /* Revert to the safe super operations. */ + ntfs_free(m); + return -1; + } + /* + * Re-initialize some specifics about $MFT's inode as + * ntfs_read_inode() will have set up the default ones. + */ + /* Set uid and gid to root. */ + vi->i_uid = vi->i_gid = 0; + /* Regular file. No access for anyone. */ + vi->i_mode = S_IFREG; + /* No VFS initiated operations allowed for $MFT. */ + vi->i_op = &ntfs_empty_inode_ops; + vi->i_fop = &ntfs_empty_file_ops; + } + + /* Get the lowest vcn for the next extent. */ + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + next_vcn = highest_vcn + 1; + + /* Only one extent or error, which we catch below. */ + if (next_vcn <= 0) + break; + + /* Avoid endless loops due to corruption. */ + if (next_vcn < sle64_to_cpu( + a->data.non_resident.lowest_vcn)) { + ntfs_error(sb, "$MFT has corrupt attribute list " + "attribute. Run chkdsk."); + goto put_err_out; + } + } + if (err != -ENOENT) { + ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. " + "$MFT is corrupt. Run chkdsk."); + goto put_err_out; + } + if (!a) { + ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is " + "corrupt. Run chkdsk."); + goto put_err_out; + } + if (highest_vcn && highest_vcn != last_vcn - 1) { + ntfs_error(sb, "Failed to load the complete runlist for " + "$MFT/$DATA. Driver bug or corrupt $MFT. 
" + "Run chkdsk."); + ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx", + (unsigned long long)highest_vcn, + (unsigned long long)last_vcn - 1); + goto put_err_out; + } + ntfs_attr_put_search_ctx(ctx); + ntfs_debug("Done."); + ntfs_free(m); + + /* + * Split the locking rules of the MFT inode from the + * locking rules of other inodes: + */ + lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key); + lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key); + + return 0; + +em_put_err_out: + ntfs_error(sb, "Couldn't find first extent of $DATA attribute in " + "attribute list. $MFT is corrupt. Run chkdsk."); +put_err_out: + ntfs_attr_put_search_ctx(ctx); +err_out: + ntfs_error(sb, "Failed. Marking inode as bad."); + make_bad_inode(vi); + ntfs_free(m); + return -1; +} + +static void __ntfs_clear_inode(ntfs_inode *ni) +{ + /* Free all alocated memory. */ + down_write(&ni->runlist.lock); + if (ni->runlist.rl) { + ntfs_free(ni->runlist.rl); + ni->runlist.rl = NULL; + } + up_write(&ni->runlist.lock); + + if (ni->attr_list) { + ntfs_free(ni->attr_list); + ni->attr_list = NULL; + } + + down_write(&ni->attr_list_rl.lock); + if (ni->attr_list_rl.rl) { + ntfs_free(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + } + up_write(&ni->attr_list_rl.lock); + + if (ni->name_len && ni->name != I30) { + /* Catch bugs... */ + BUG_ON(!ni->name); + kfree(ni->name); + } +} + +void ntfs_clear_extent_inode(ntfs_inode *ni) +{ + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + + BUG_ON(NInoAttr(ni)); + BUG_ON(ni->nr_extents != -1); + +#ifdef NTFS_RW + if (NInoDirty(ni)) { + if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino))) + ntfs_error(ni->vol->sb, "Clearing dirty extent inode! " + "Losing data! This is a BUG!!!"); + // FIXME: Do something!!! + } +#endif /* NTFS_RW */ + + __ntfs_clear_inode(ni); + + /* Bye, bye... */ + ntfs_destroy_extent_inode(ni); +} + +/** + * ntfs_evict_big_inode - clean up the ntfs specific part of an inode + * @vi: vfs inode pending annihilation + * + * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode() + * is called, which deallocates all memory belonging to the NTFS specific part + * of the inode and returns. + * + * If the MFT record is dirty, we commit it before doing anything else. + */ +void ntfs_evict_big_inode(struct inode *vi) +{ + ntfs_inode *ni = NTFS_I(vi); + + truncate_inode_pages(&vi->i_data, 0); + end_writeback(vi); + +#ifdef NTFS_RW + if (NInoDirty(ni)) { + bool was_bad = (is_bad_inode(vi)); + + /* Committing the inode also commits all extent inodes. */ + ntfs_commit_inode(vi); + + if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) { + ntfs_error(vi->i_sb, "Failed to commit dirty inode " + "0x%lx. Losing data!", vi->i_ino); + // FIXME: Do something!!! + } + } +#endif /* NTFS_RW */ + + /* No need to lock at this stage as no one else has a reference. */ + if (ni->nr_extents > 0) { + int i; + + for (i = 0; i < ni->nr_extents; i++) + ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]); + kfree(ni->ext.extent_ntfs_inos); + } + + __ntfs_clear_inode(ni); + + if (NInoAttr(ni)) { + /* Release the base inode if we are holding it. 
*/ + if (ni->nr_extents == -1) { + iput(VFS_I(ni->ext.base_ntfs_ino)); + ni->nr_extents = 0; + ni->ext.base_ntfs_ino = NULL; + } + } + return; +} + +/** + * ntfs_show_options - show mount options in /proc/mounts + * @sf: seq_file in which to write our mount options + * @mnt: vfs mount whose mount options to display + * + * Called by the VFS once for each mounted ntfs volume when someone reads + * /proc/mounts in order to display the NTFS specific mount options of each + * mount. The mount options of the vfs mount @mnt are written to the seq file + * @sf and success is returned. + */ +int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt) +{ + ntfs_volume *vol = NTFS_SB(mnt->mnt_sb); + int i; + + seq_printf(sf, ",uid=%i", vol->uid); + seq_printf(sf, ",gid=%i", vol->gid); + if (vol->fmask == vol->dmask) + seq_printf(sf, ",umask=0%o", vol->fmask); + else { + seq_printf(sf, ",fmask=0%o", vol->fmask); + seq_printf(sf, ",dmask=0%o", vol->dmask); + } + seq_printf(sf, ",nls=%s", vol->nls_map->charset); + if (NVolCaseSensitive(vol)) + seq_printf(sf, ",case_sensitive"); + if (NVolShowSystemFiles(vol)) + seq_printf(sf, ",show_sys_files"); + if (!NVolSparseEnabled(vol)) + seq_printf(sf, ",disable_sparse"); + for (i = 0; on_errors_arr[i].val; i++) { + if (on_errors_arr[i].val & vol->on_errors) + seq_printf(sf, ",errors=%s", on_errors_arr[i].str); + } + seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier); + return 0; +} + +#ifdef NTFS_RW + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_truncate - called when the i_size of an ntfs inode is changed + * @vi: inode for which the i_size was changed + * + * We only support i_size changes for normal files at present, i.e. not + * compressed and not encrypted. This is enforced in ntfs_setattr(), see + * below. + * + * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and + * that the change is allowed. + * + * This implies for us that @vi is a file inode rather than a directory, index, + * or attribute inode as well as that @vi is a base inode. + * + * Returns 0 on success or -errno on error. + * + * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for + * writing. The only case in the kernel where ->i_alloc_sem is not held is + * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called + * with the current i_size as the offset. The analogous place in NTFS is in + * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again + * without holding ->i_alloc_sem. + */ +int ntfs_truncate(struct inode *vi) +{ + s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size; + VCN highest_vcn; + unsigned long flags; + ntfs_inode *base_ni, *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + const char *te = " Leaving file length out of sync with i_size."; + int err, mp_size, size_change, alloc_change; + u32 attr_len; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + BUG_ON(NInoAttr(ni)); + BUG_ON(S_ISDIR(vi->i_mode)); + BUG_ON(NInoMstProtected(ni)); + BUG_ON(ni->nr_extents < 0); +retry_truncate: + /* + * Lock the runlist for writing and map the mft record to ensure it is + * safe to mess with the attribute runlist and sizes. 
+ */ + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " + "(error code %d).%s", vi->i_ino, err, te); + ctx = NULL; + m = NULL; + goto old_bad_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vi->i_sb, "Failed to allocate a search context for " + "inode 0x%lx (not enough memory).%s", + vi->i_ino, te); + err = -ENOMEM; + goto old_bad_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(vi->i_sb, "Open attribute is missing from " + "mft record. Inode 0x%lx is corrupt. " + "Run chkdsk.%s", vi->i_ino, te); + err = -EIO; + } else + ntfs_error(vi->i_sb, "Failed to lookup attribute in " + "inode 0x%lx (error code %d).%s", + vi->i_ino, err, te); + goto old_bad_out; + } + m = ctx->mrec; + a = ctx->attr; + /* + * The i_size of the vfs inode is the new size for the attribute value. + */ + new_size = i_size_read(vi); + /* The current size of the attribute value is the old size. */ + old_size = ntfs_attr_size(a); + /* Calculate the new allocated size. */ + if (NInoNonResident(ni)) + new_alloc_size = (new_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + else + new_alloc_size = (new_size + 7) & ~7; + /* The current allocated size is the old allocated size. */ + read_lock_irqsave(&ni->size_lock, flags); + old_alloc_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * The change in the file size. This will be 0 if no change, >0 if the + * size is growing, and <0 if the size is shrinking. + */ + size_change = -1; + if (new_size - old_size >= 0) { + size_change = 1; + if (new_size == old_size) + size_change = 0; + } + /* As above for the allocated size. */ + alloc_change = -1; + if (new_alloc_size - old_alloc_size >= 0) { + alloc_change = 1; + if (new_alloc_size == old_alloc_size) + alloc_change = 0; + } + /* + * If neither the size nor the allocation are being changed there is + * nothing to do. + */ + if (!size_change && !alloc_change) + goto unm_done; + /* If the size is changing, check if new size is allowed in $AttrDef. */ + if (size_change) { + err = ntfs_attr_size_bounds_check(vol, ni->type, new_size); + if (unlikely(err)) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Truncate would cause the " + "inode 0x%lx to %simum size " + "for its attribute type " + "(0x%x). Aborting truncate.", + vi->i_ino, + new_size > old_size ? "exceed " + "the max" : "go under the min", + le32_to_cpu(ni->type)); + err = -EFBIG; + } else { + ntfs_error(vol->sb, "Inode 0x%lx has unknown " + "attribute type 0x%x. " + "Aborting truncate.", + vi->i_ino, + le32_to_cpu(ni->type)); + err = -EIO; + } + /* Reset the vfs inode size to the old size. */ + i_size_write(vi, old_size); + goto err_out; + } + } + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size are not " + "supported yet for %s files, ignoring.", + NInoCompressed(ni) ? "compressed" : + "encrypted"); + err = -EOPNOTSUPP; + goto bad_out; + } + if (a->non_resident) + goto do_non_resident_truncate; + BUG_ON(NInoNonResident(ni)); + /* Resize the attribute record to best fit the new attribute size. */ + if (new_size < vol->mft_record_size && + !ntfs_resident_attr_value_resize(m, a, new_size)) { + /* The resize succeeded! 
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + write_lock_irqsave(&ni->size_lock, flags); + /* Update the sizes in the ntfs inode and all is done. */ + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + /* + * Note ntfs_resident_attr_value_resize() has already done any + * necessary data clearing in the attribute record. When the + * file is being shrunk vmtruncate() will already have cleared + * the top part of the last partial page, i.e. since this is + * the resident case this is the page with index 0. However, + * when the file is being expanded, the page cache page data + * between the old data_size, i.e. old_size, and the new_size + * has not been zeroed. Fortunately, we do not need to zero it + * either since on one hand it will either already be zero due + * to both readpage and writepage clearing partial page data + * beyond i_size in which case there is nothing to do or in the + * case of the file being mmap()ped at the same time, POSIX + * specifies that the behaviour is unspecified thus we do not + * have to do anything. This means that in our implementation + * in the rare case that the file is mmap()ped and a write + * occurred into the mmap()ped region just beyond the file size + * and writepage has not yet been called to write out the page + * (which would clear the area beyond the file size) and we now + * extend the file size to incorporate this dirty region + * outside the file size, a write of the page would result in + * this data being written to disk instead of being cleared. + * Given both POSIX and the Linux mmap(2) man page specify that + * this corner case is undefined, we choose to leave it like + * that as this is much simpler for us as we cannot lock the + * relevant page now since we are holding too many ntfs locks + * which would result in a lock reversal deadlock. + */ + ni->initialized_size = new_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto unm_done; + } + /* If the above resize failed, this must be an attribute extension. */ + BUG_ON(size_change < 0); + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path as it only ever can happen + * once for any given file. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the truncation process. + */ + err = ntfs_attr_make_non_resident(ni, old_size); + if (likely(!err)) + goto retry_truncate; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute " + "type 0x%x, because the conversion from " + "resident to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. 
*/ + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft record/on " + "disk for the non-resident attribute value. " + "This case is not implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not implemented " + "yet."); + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_truncate: + BUG_ON(!NInoNonResident(ni)); + if (alloc_change < 0) { + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + if (highest_vcn > 0 && + old_alloc_size >> vol->cluster_size_bits > + highest_vcn + 1) { + /* + * This attribute has multiple extents. Not yet + * supported. + */ + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, " + "attribute type 0x%x, because the " + "attribute is highly fragmented (it " + "consists of multiple extents) and " + "this case is not implemented yet.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type)); + err = -EOPNOTSUPP; + goto bad_out; + } + } + /* + * If the size is shrinking, need to reduce the initialized_size and + * the data_size before reducing the allocation. + */ + if (size_change < 0) { + /* + * Make the valid size smaller (i_size is already up-to-date). + */ + write_lock_irqsave(&ni->size_lock, flags); + if (new_size < ni->initialized_size) { + ni->initialized_size = new_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(new_size); + } + a->data.non_resident.data_size = cpu_to_sle64(new_size); + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* If the allocated size is not changing, we are done. */ + if (!alloc_change) + goto unm_done; + /* + * If the size is shrinking it makes no sense for the + * allocation to be growing. + */ + BUG_ON(alloc_change > 0); + } else /* if (size_change >= 0) */ { + /* + * The file size is growing or staying the same but the + * allocation can be shrinking, growing or staying the same. + */ + if (alloc_change > 0) { + /* + * We need to extend the allocation and possibly update + * the data size. If we are updating the data size, + * since we are not touching the initialized_size we do + * not need to worry about the actual data on disk. + * And as far as the page cache is concerned, there + * will be no pages beyond the old data size and any + * partial region in the last page between the old and + * new data size (or the end of the page if the new + * data size is outside the page) does not need to be + * modified as explained above for the resident + * attribute truncate case. 
To do this, we simply drop + * the locks we hold and leave all the work to our + * friendly helper ntfs_attr_extend_allocation(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + err = ntfs_attr_extend_allocation(ni, new_size, + size_change > 0 ? new_size : -1, -1); + /* + * ntfs_attr_extend_allocation() will have done error + * output already. + */ + goto done; + } + if (!alloc_change) + goto alloc_done; + } + /* alloc_change < 0 */ + /* Free the clusters. */ + nr_freed = ntfs_cluster_free(ni, new_alloc_size >> + vol->cluster_size_bits, -1, ctx); + m = ctx->mrec; + a = ctx->attr; + if (unlikely(nr_freed < 0)) { + ntfs_error(vol->sb, "Failed to release cluster(s) (error code " + "%lli). Unmount and run chkdsk to recover " + "the lost cluster(s).", (long long)nr_freed); + NVolSetErrors(vol); + nr_freed = 0; + } + /* Truncate the runlist. */ + err = ntfs_rl_truncate_nolock(vol, &ni->runlist, + new_alloc_size >> vol->cluster_size_bits); + /* + * If the runlist truncation failed and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (unlikely(err || IS_ERR(m))) { + ntfs_error(vol->sb, "Failed to %s (error code %li).%s", + IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist", + IS_ERR(m) ? PTR_ERR(m) : err, es); + err = -EIO; + goto bad_out; + } + /* Get the size for the shrunk mapping pairs array for the runlist. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because determining the " + "size for the mapping pairs failed with error " + "code %i.%s", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), mp_size, es); + err = -EIO; + goto bad_out; + } + /* + * Shrink the attribute record for the new mapping pairs array. Note, + * this cannot fail since we are making the attribute smaller thus by + * definition there is enough space to do so. + */ + attr_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + BUG_ON(err); + /* + * Generate the mapping pairs array directly into the attribute record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, ni->runlist.rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because building the " + "mapping pairs failed with error code %i.%s", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + err, es); + err = -EIO; + goto bad_out; + } + /* Update the allocated/compressed size as well as the highest vcn. 
*/ + a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> + vol->cluster_size_bits) - 1); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + if (nr_freed) { + ni->itype.compressed.size -= nr_freed << + vol->cluster_size_bits; + BUG_ON(ni->itype.compressed.size < 0); + a->data.non_resident.compressed_size = cpu_to_sle64( + ni->itype.compressed.size); + vi->i_blocks = ni->itype.compressed.size >> 9; + } + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * We have shrunk the allocation. If this is a shrinking truncate we + * have already dealt with the initialized_size and the data_size above + * and we are done. If the truncate is only changing the allocation + * and not the data_size, we are also done. If this is an extending + * truncate, need to extend the data_size now which is ensured by the + * fact that @size_change is positive. + */ +alloc_done: + /* + * If the size is growing, need to update it now. If it is shrinking, + * we have already updated it above (before the allocation change). + */ + if (size_change > 0) + a->data.non_resident.data_size = cpu_to_sle64(new_size); + /* Ensure the modified mft record is written out. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +unm_done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +done: + /* Update the mtime and ctime on the base inode. */ + /* normally ->truncate shouldn't update ctime or mtime, + * but ntfs did before so it got a copy & paste version + * of file_update_time. one day someone should fix this + * for real. + */ + if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { + struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb); + int sync_it = 0; + + if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) || + !timespec_equal(&VFS_I(base_ni)->i_ctime, &now)) + sync_it = 1; + VFS_I(base_ni)->i_mtime = now; + VFS_I(base_ni)->i_ctime = now; + + if (sync_it) + mark_inode_dirty_sync(VFS_I(base_ni)); + } + + if (likely(!err)) { + NInoClearTruncateFailed(ni); + ntfs_debug("Done."); + } + return err; +old_bad_out: + old_size = -1; +bad_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else if (old_size >= 0) + i_size_write(vi, old_size); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +out: + ntfs_debug("Failed. Returning error code %i.", err); + return err; +conv_err_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else + i_size_write(vi, old_size); + goto out; +} + +/** + * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value + * @vi: inode for which the i_size was changed + * + * Wrapper for ntfs_truncate() that has no return value. + * + * See ntfs_truncate() description above for details. 
+ */ +void ntfs_truncate_vfs(struct inode *vi) { + ntfs_truncate(vi); +} + +/** + * ntfs_setattr - called from notify_change() when an attribute is being changed + * @dentry: dentry whose attributes to change + * @attr: structure describing the attributes and the changes + * + * We have to trap VFS attempts to truncate the file described by @dentry as + * soon as possible, because we do not implement changes in i_size yet. So we + * abort all i_size changes here. + * + * We also abort all changes of user, group, and mode as we do not implement + * the NTFS ACLs yet. + * + * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also + * called with ->i_alloc_sem held for writing. + */ +int ntfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *vi = dentry->d_inode; + int err; + unsigned int ia_valid = attr->ia_valid; + + err = inode_change_ok(vi, attr); + if (err) + goto out; + /* We do not support NTFS ACLs yet. */ + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { + ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " + "supported yet, ignoring."); + err = -EOPNOTSUPP; + goto out; + } + if (ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(vi)) { + ntfs_inode *ni = NTFS_I(vi); + /* + * FIXME: For now we do not support resizing of + * compressed or encrypted files yet. + */ + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size " + "are not supported yet for " + "%s files, ignoring.", + NInoCompressed(ni) ? + "compressed" : "encrypted"); + err = -EOPNOTSUPP; + } else + err = vmtruncate(vi, attr->ia_size); + if (err || ia_valid == ATTR_SIZE) + goto out; + } else { + /* + * We skipped the truncate but must still update + * timestamps. + */ + ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + } + if (ia_valid & ATTR_ATIME) + vi->i_atime = timespec_trunc(attr->ia_atime, + vi->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + vi->i_mtime = timespec_trunc(attr->ia_mtime, + vi->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + vi->i_ctime = timespec_trunc(attr->ia_ctime, + vi->i_sb->s_time_gran); + mark_inode_dirty(vi); +out: + return err; +} + +/** + * ntfs_write_inode - write out a dirty inode + * @vi: inode to write out + * @sync: if true, write out synchronously + * + * Write out a dirty inode to disk including any extent inodes if present. + * + * If @sync is true, commit the inode to disk and wait for io completion. This + * is done using write_mft_record(). + * + * If @sync is false, just schedule the write to happen but do not wait for i/o + * completion. In 2.6 kernels, scheduling usually happens just by virtue of + * marking the page (and in this case mft record) dirty but we do not implement + * this yet as write_mft_record() largely ignores the @sync parameter and + * always performs synchronous writes. + * + * Return 0 on success and -errno on error. + */ +int __ntfs_write_inode(struct inode *vi, int sync) +{ + sle64 nt; + ntfs_inode *ni = NTFS_I(vi); + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + STANDARD_INFORMATION *si; + int err = 0; + bool modified = false; + + ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "", + vi->i_ino); + /* + * Dirty attribute inodes are written via their real inodes so just + * clean them here. Access time updates are taken care off when the + * real inode is written. + */ + if (NInoAttr(ni)) { + NInoClearDirty(ni); + ntfs_debug("Done."); + return 0; + } + /* Map, pin, and lock the mft record belonging to the inode. 
*/ + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + /* Update the access times in the standard information attribute. */ + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + goto unm_err_out; + } + si = (STANDARD_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Update the access times if they have changed. */ + nt = utc2ntfs(vi->i_mtime); + if (si->last_data_change_time != nt) { + ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_data_change_time), + (long long)sle64_to_cpu(nt)); + si->last_data_change_time = nt; + modified = true; + } + nt = utc2ntfs(vi->i_ctime); + if (si->last_mft_change_time != nt) { + ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_mft_change_time), + (long long)sle64_to_cpu(nt)); + si->last_mft_change_time = nt; + modified = true; + } + nt = utc2ntfs(vi->i_atime); + if (si->last_access_time != nt) { + ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, + (long long)sle64_to_cpu(si->last_access_time), + (long long)sle64_to_cpu(nt)); + si->last_access_time = nt; + modified = true; + } + /* + * If we just modified the standard information attribute we need to + * mark the mft record it is in dirty. We do this manually so that + * mark_inode_dirty() is not called which would redirty the inode and + * hence result in an infinite loop of trying to write the inode. + * There is no need to mark the base inode nor the base mft record + * dirty, since we are going to write this mft record below in any case + * and the base mft record may actually not have been modified so it + * might not need to be written out. + * NOTE: It is not a problem when the inode for $MFT itself is being + * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES + * on the $MFT inode and hence ntfs_write_inode() will not be + * re-invoked because of it which in turn is ok since the dirtied mft + * record will be cleaned and written out to disk below, i.e. before + * this function returns. + */ + if (modified) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + if (!NInoTestSetDirty(ctx->ntfs_ino)) + mark_ntfs_record_dirty(ctx->ntfs_ino->page, + ctx->ntfs_ino->page_ofs); + } + ntfs_attr_put_search_ctx(ctx); + /* Now the access times are updated, write the base mft record. */ + if (NInoDirty(ni)) + err = write_mft_record(ni, m, sync); + /* Write all attached extent mft records. 
*/ + mutex_lock(&ni->extent_lock); + if (ni->nr_extents > 0) { + ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos; + int i; + + ntfs_debug("Writing %i extent inodes.", ni->nr_extents); + for (i = 0; i < ni->nr_extents; i++) { + ntfs_inode *tni = extent_nis[i]; + + if (NInoDirty(tni)) { + MFT_RECORD *tm = map_mft_record(tni); + int ret; + + if (IS_ERR(tm)) { + if (!err || err == -ENOMEM) + err = PTR_ERR(tm); + continue; + } + ret = write_mft_record(tni, tm, sync); + unmap_mft_record(tni); + if (unlikely(ret)) { + if (!err || err == -ENOMEM) + err = ret; + } + } + } + } + mutex_unlock(&ni->extent_lock); + unmap_mft_record(ni); + if (unlikely(err)) + goto err_out; + ntfs_debug("Done."); + return 0; +unm_err_out: + unmap_mft_record(ni); +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Not enough memory to write inode. " + "Marking the inode dirty again, so the VFS " + "retries later."); + mark_inode_dirty(vi); + } else { + ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err); + NVolSetErrors(ni->vol); + } + return err; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h new file mode 100644 index 00000000..2dabf813 --- /dev/null +++ b/fs/ntfs/inode.h @@ -0,0 +1,321 @@ +/* + * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_INODE_H +#define _LINUX_NTFS_INODE_H + +#include + +#include +#include +#include +#include +#include + +#include "layout.h" +#include "volume.h" +#include "types.h" +#include "runlist.h" +#include "debug.h" + +typedef struct _ntfs_inode ntfs_inode; + +/* + * The NTFS in-memory inode structure. It is just used as an extension to the + * fields already provided in the VFS inode. + */ +struct _ntfs_inode { + rwlock_t size_lock; /* Lock serializing access to inode sizes. */ + s64 initialized_size; /* Copy from the attribute record. */ + s64 allocated_size; /* Copy from the attribute record. */ + unsigned long state; /* NTFS specific flags describing this inode. + See ntfs_inode_state_bits below. */ + unsigned long mft_no; /* Number of the mft record / inode. */ + u16 seq_no; /* Sequence number of the mft record. */ + atomic_t count; /* Inode reference count for book keeping. */ + ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ + /* + * If NInoAttr() is true, the below fields describe the attribute which + * this fake inode belongs to. The actual inode of this attribute is + * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see + * below). 
For real inodes, we also set the type (AT_DATA for files and + * AT_INDEX_ALLOCATION for directories), with the name = NULL and + * name_len = 0 for files and name = I30 (global constant) and + * name_len = 4 for directories. + */ + ATTR_TYPE type; /* Attribute type of this fake inode. */ + ntfschar *name; /* Attribute name of this fake inode. */ + u32 name_len; /* Attribute name length of this fake inode. */ + runlist runlist; /* If state has the NI_NonResident bit set, + the runlist of the unnamed data attribute + (if a file) or of the index allocation + attribute (directory) or of the attribute + described by the fake inode (if NInoAttr()). + If runlist.rl is NULL, the runlist has not + been read in yet or has been unmapped. If + NI_NonResident is clear, the attribute is + resident (file and fake inode) or there is + no $I30 index allocation attribute + (small directory). In the latter case + runlist.rl is always NULL.*/ + /* + * The following fields are only valid for real inodes and extent + * inodes. + */ + struct mutex mrec_lock; /* Lock for serializing access to the + mft record belonging to this inode. */ + struct page *page; /* The page containing the mft record of the + inode. This should only be touched by the + (un)map_mft_record*() functions. */ + int page_ofs; /* Offset into the page at which the mft record + begins. This should only be touched by the + (un)map_mft_record*() functions. */ + /* + * Attribute list support (only for use by the attribute lookup + * functions). Setup during read_inode for all inodes with attribute + * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is + * further only valid if NI_AttrListNonResident is set. + */ + u32 attr_list_size; /* Length of attribute list value in bytes. */ + u8 *attr_list; /* Attribute list value itself. */ + runlist attr_list_rl; /* Run list for the attribute list value. */ + union { + struct { /* It is a directory, $MFT, or an index inode. */ + u32 block_size; /* Size of an index block. */ + u32 vcn_size; /* Size of a vcn in this + index. */ + COLLATION_RULE collation_rule; /* The collation rule + for the index. */ + u8 block_size_bits; /* Log2 of the above. */ + u8 vcn_size_bits; /* Log2 of the above. */ + } index; + struct { /* It is a compressed/sparse file/attribute inode. */ + s64 size; /* Copy of compressed_size from + $DATA. */ + u32 block_size; /* Size of a compression block + (cb). */ + u8 block_size_bits; /* Log2 of the size of a cb. */ + u8 block_clusters; /* Number of clusters per cb. */ + } compressed; + } itype; + struct mutex extent_lock; /* Lock for accessing/modifying the + below . */ + s32 nr_extents; /* For a base mft record, the number of attached extent + inodes (0 if none), for extent records and for fake + inodes describing an attribute this is -1. */ + union { /* This union is only used if nr_extents != 0. */ + ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of + the ntfs inodes of the extent + mft records belonging to + this base inode which have + been loaded. */ + ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the + ntfs inode of the base mft + record. For fake inodes, the + real (base) inode to which + the attribute belongs. */ + } ext; +}; + +/* + * Defined bits for the state field in the ntfs_inode structure. + * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only + */ +typedef enum { + NI_Dirty, /* 1: Mft record needs to be written to disk. */ + NI_AttrList, /* 1: Mft record contains an attribute list. 
*/ + NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies + NI_AttrList is set. */ + + NI_Attr, /* 1: Fake inode for attribute i/o. + 0: Real inode or extent inode. */ + + NI_MstProtected, /* 1: Attribute is protected by MST fixups. + 0: Attribute is not protected by fixups. */ + NI_NonResident, /* 1: Unnamed data attr is non-resident (f). + 1: Attribute is non-resident (a). */ + NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is + present (d). */ + NI_Compressed, /* 1: Unnamed data attr is compressed (f). + 1: Create compressed files by default (d). + 1: Attribute is compressed (a). */ + NI_Encrypted, /* 1: Unnamed data attr is encrypted (f). + 1: Create encrypted files by default (d). + 1: Attribute is encrypted (a). */ + NI_Sparse, /* 1: Unnamed data attr is sparse (f). + 1: Create sparse files by default (d). + 1: Attribute is sparse (a). */ + NI_SparseDisabled, /* 1: May not create sparse regions. */ + NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */ +} ntfs_inode_state_bits; + +/* + * NOTE: We should be adding dirty mft records to a list somewhere and they + * should be independent of the (ntfs/vfs) inode structure so that an inode can + * be removed but the record can be left dirty for syncing later. + */ + +/* + * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo() + * functions. + */ +#define NINO_FNS(flag) \ +static inline int NIno##flag(ntfs_inode *ni) \ +{ \ + return test_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoSet##flag(ntfs_inode *ni) \ +{ \ + set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoClear##flag(ntfs_inode *ni) \ +{ \ + clear_bit(NI_##flag, &(ni)->state); \ +} + +/* + * As above for NInoTestSetFoo() and NInoTestClearFoo(). + */ +#define TAS_NINO_FNS(flag) \ +static inline int NInoTestSet##flag(ntfs_inode *ni) \ +{ \ + return test_and_set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline int NInoTestClear##flag(ntfs_inode *ni) \ +{ \ + return test_and_clear_bit(NI_##flag, &(ni)->state); \ +} + +/* Emit the ntfs inode bitops functions. */ +NINO_FNS(Dirty) +TAS_NINO_FNS(Dirty) +NINO_FNS(AttrList) +NINO_FNS(AttrListNonResident) +NINO_FNS(Attr) +NINO_FNS(MstProtected) +NINO_FNS(NonResident) +NINO_FNS(IndexAllocPresent) +NINO_FNS(Compressed) +NINO_FNS(Encrypted) +NINO_FNS(Sparse) +NINO_FNS(SparseDisabled) +NINO_FNS(TruncateFailed) + +/* + * The full structure containing a ntfs_inode and a vfs struct inode. Used for + * all real and fake inodes but not for extent inodes which lack the vfs struct + * inode. + */ +typedef struct { + ntfs_inode ntfs_inode; + struct inode vfs_inode; /* The vfs inode structure. */ +} big_ntfs_inode; + +/** + * NTFS_I - return the ntfs inode given a vfs inode + * @inode: VFS inode + * + * NTFS_I() returns the ntfs inode associated with the VFS @inode. + */ +static inline ntfs_inode *NTFS_I(struct inode *inode) +{ + return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode); +} + +static inline struct inode *VFS_I(ntfs_inode *ni) +{ + return &((big_ntfs_inode *)ni)->vfs_inode; +} + +/** + * ntfs_attr - ntfs in memory attribute structure + * @mft_no: mft record number of the base mft record of this attribute + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * @type: attribute type (see layout.h) + * + * This structure exists only to provide a small structure for the + * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism. 
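+ *
+ * As a purely illustrative sketch (not part of this header), a lookup key
+ * for the unnamed $DATA attribute of mft record 0x10 could be filled in as:
+ *
+ *	ntfs_attr na = {
+ *		.mft_no   = 0x10,
+ *		.name     = NULL,
+ *		.name_len = 0,
+ *		.type     = AT_DATA,
+ *	};
+ *
+ * ntfs_test_inode() would then be expected to match these fields against an
+ * existing inode during an ntfs_{attr_}iget() lookup; the initializer above
+ * only shows the shape of the key and is not code taken from the driver.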
+ * + * NOTE: Elements are ordered by size to make the structure as compact as + * possible on all architectures. + */ +typedef struct { + unsigned long mft_no; + ntfschar *name; + u32 name_len; + ATTR_TYPE type; +} ntfs_attr; + +typedef int (*test_t)(struct inode *, void *); + +extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na); + +extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); +extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, + ntfschar *name, u32 name_len); +extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, + u32 name_len); + +extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); +extern void ntfs_destroy_big_inode(struct inode *inode); +extern void ntfs_evict_big_inode(struct inode *vi); + +extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); + +static inline void ntfs_init_big_inode(struct inode *vi) +{ + ntfs_inode *ni = NTFS_I(vi); + + ntfs_debug("Entering."); + __ntfs_init_inode(vi->i_sb, ni); + ni->mft_no = vi->i_ino; +} + +extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, + unsigned long mft_no); +extern void ntfs_clear_extent_inode(ntfs_inode *ni); + +extern int ntfs_read_inode_mount(struct inode *vi); + +extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt); + +#ifdef NTFS_RW + +extern int ntfs_truncate(struct inode *vi); +extern void ntfs_truncate_vfs(struct inode *vi); + +extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); + +extern int __ntfs_write_inode(struct inode *vi, int sync); + +static inline void ntfs_commit_inode(struct inode *vi) +{ + if (!is_bad_inode(vi)) + __ntfs_write_inode(vi, 1); + return; +} + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_INODE_H */ diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h new file mode 100644 index 00000000..faece719 --- /dev/null +++ b/fs/ntfs/layout.h @@ -0,0 +1,2435 @@ +/* + * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_LAYOUT_H +#define _LINUX_NTFS_LAYOUT_H + +#include +#include +#include +#include + +#include "types.h" + +/* The NTFS oem_id "NTFS " */ +#define magicNTFS cpu_to_le64(0x202020205346544eULL) + +/* + * Location of bootsector on partition: + * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition. + * On NT4 and above there is one backup copy of the boot sector to + * be found on the last sector of the partition (not normally accessible + * from within Windows as the bootsector contained number of sectors + * value is one less than the actual value!). 
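+ * (Worked illustration, added here and not part of the original comment: if
+ * the boot sector reports number_of_sectors == 0x1000, the volume really
+ * spans 0x1001 sectors and the backup boot sector is the one at LBA 0x1000,
+ * i.e. at the reported count itself.)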
+ * On versions of NT 3.51 and earlier, the backup copy was located at + * number of sectors/2 (integer divide), i.e. in the middle of the volume. + */ + +/* + * BIOS parameter block (bpb) structure. + */ +typedef struct { + le16 bytes_per_sector; /* Size of a sector in bytes. */ + u8 sectors_per_cluster; /* Size of a cluster in sectors. */ + le16 reserved_sectors; /* zero */ + u8 fats; /* zero */ + le16 root_entries; /* zero */ + le16 sectors; /* zero */ + u8 media_type; /* 0xf8 = hard disk */ + le16 sectors_per_fat; /* zero */ + le16 sectors_per_track; /* irrelevant */ + le16 heads; /* irrelevant */ + le32 hidden_sectors; /* zero */ + le32 large_sectors; /* zero */ +} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; + +/* + * NTFS boot sector structure. + */ +typedef struct { + u8 jump[3]; /* Irrelevant (jump to boot up code).*/ + le64 oem_id; /* Magic "NTFS ". */ + BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ + u8 unused[4]; /* zero, NTFS diskedit.exe states that + this is actually: + __u8 physical_drive; // 0x80 + __u8 current_head; // zero + __u8 extended_boot_signature; + // 0x80 + __u8 unused; // zero + */ +/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives + maximum volume size of 2^63 sectors. + Assuming standard sector size of 512 + bytes, the maximum byte size is + approx. 4.7x10^21 bytes. (-; */ + sle64 mft_lcn; /* Cluster location of mft data. */ + sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ + s8 clusters_per_mft_record; /* Mft record size in clusters. */ + u8 reserved0[3]; /* zero */ + s8 clusters_per_index_record; /* Index block size in clusters. */ + u8 reserved1[3]; /* zero */ + le64 volume_serial_number; /* Irrelevant (serial number). */ + le32 checksum; /* Boot sector checksum. */ +/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ + le16 end_of_sector_marker; /* End of bootsector magic. Always is + 0xaa55 in little endian. */ +/* sizeof() = 512 (0x200) bytes */ +} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; + +/* + * Magic identifiers present at the beginning of all ntfs record containing + * records (like mft records for example). + */ +enum { + /* Found in $MFT/$DATA. */ + magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ + magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ + magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ + + /* Found in $LogFile/$DATA. */ + magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ + magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ + + /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ + magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ + + /* Found in all ntfs record containing records. */ + magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector + transfer was detected. */ + /* + * Found in $LogFile/$DATA when a page is full of 0xff bytes and is + * thus not initialized. Page must be initialized before using it. + */ + magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ +}; + +typedef le32 NTFS_RECORD_TYPE; + +/* + * Generic magic comparison macros. Finally found a use for the ## preprocessor + * operator! 
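+ * As an added illustration: writing ntfs_is_magic(x, FILE) expands, via
+ * token pasting, to __ntfs_is_magic(x, magic_FILE), so callers can name a
+ * record type without spelling out the magic_* constant.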
(-8 + */ + +static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) +{ + return (x == r); +} +#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) + +static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) +{ + return (*p == r); +} +#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m) + +/* + * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. + */ +#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) +#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) +#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) +#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) +#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) +#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) +#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) +#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) + +#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) +#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) +#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) +#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) + +#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) +#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) + +#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) +#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) + +#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) +#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) + +/* + * The Update Sequence Array (usa) is an array of the le16 values which belong + * to the end of each sector protected by the update sequence record in which + * this array is contained. Note that the first entry is the Update Sequence + * Number (usn), a cyclic counter of how many times the protected record has + * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All + * last le16's of each sector have to be equal to the usn (during reading) or + * are set to it (during writing). If they are not, an incomplete multi sector + * transfer has occurred when the data was written. + * The maximum size for the update sequence array is fixed to: + * maximum size = usa_ofs + (usa_count * 2) = 510 bytes + * The 510 bytes comes from the fact that the last le16 in the array has to + * (obviously) finish before the last le16 of the first 512-byte sector. + * This formula can be used as a consistency check in that usa_ofs + + * (usa_count * 2) has to be less than or equal to 510. + */ +typedef struct { + NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record + type and/or status. */ + le16 usa_ofs; /* Offset to the Update Sequence Array (usa) + from the start of the ntfs record. */ + le16 usa_count; /* Number of le16 sized entries in the usa + including the Update Sequence Number (usn), + thus the number of fixups is the usa_count + minus 1. */ +} __attribute__ ((__packed__)) NTFS_RECORD; + +/* + * System files mft record numbers. All these files are always marked as used + * in the bitmap attribute of the mft; presumably in order to avoid accidental + * allocation for random other mft records. Also, the sequence number for each + * of the system files is always equal to their mft record number and it is + * never modified. + */ +typedef enum { + FILE_MFT = 0, /* Master file table (mft). Data attribute + contains the entries and bitmap attribute + records which ones are in use (bit==1). */ + FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records + in data attribute. 
If cluster size > 4kiB, + copy of first N mft records, with + N = cluster_size / mft_record_size. */ + FILE_LogFile = 2, /* Journalling log in data attribute. */ + FILE_Volume = 3, /* Volume name attribute and volume information + attribute (flags and ntfs version). Windows + refers to this file as volume DASD (Direct + Access Storage Device). */ + FILE_AttrDef = 4, /* Array of attribute definitions in data + attribute. */ + FILE_root = 5, /* Root directory. */ + FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in + data attribute. */ + FILE_Boot = 7, /* Boot sector (always at cluster 0) in data + attribute. */ + FILE_BadClus = 8, /* Contains all bad clusters in the non-resident + data attribute. */ + FILE_Secure = 9, /* Shared security descriptors in data attribute + and two indexes into the descriptors. + Appeared in Windows 2000. Before that, this + file was named $Quota but was unused. */ + FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode + characters in data attribute. */ + FILE_Extend = 11, /* Directory containing other system files (eg. + $ObjId, $Quota, $Reparse and $UsnJrnl). This + is new to NTFS3.0. */ + FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ + FILE_reserved13 = 13, + FILE_reserved14 = 14, + FILE_reserved15 = 15, + FILE_first_user = 16, /* First user file, used as test limit for + whether to allow opening a file or not. */ +} NTFS_SYSTEM_FILES; + +/* + * These are the so far known MFT_RECORD_* flags (16-bit) which contain + * information about the mft record in which they are present. + */ +enum { + MFT_RECORD_IN_USE = cpu_to_le16(0x0001), + MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), +} __attribute__ ((__packed__)); + +typedef le16 MFT_RECORD_FLAGS; + +/* + * mft references (aka file references or file record segment references) are + * used whenever a structure needs to refer to a record in the mft. + * + * A reference consists of a 48-bit index into the mft and a 16-bit sequence + * number used to detect stale references. + * + * For error reporting purposes we treat the 48-bit index as a signed quantity. + * + * The sequence number is a circular counter (skipping 0) describing how many + * times the referenced mft record has been (re)used. This has to match the + * sequence number of the mft record being referenced, otherwise the reference + * is considered stale and removed (FIXME: only ntfsck or the driver itself?). + * + * If the sequence number is zero it is assumed that no sequence number + * consistency checking should be performed. + * + * FIXME: Since inodes are 32-bit as of now, the driver needs to always check + * for high_part being 0 and if not either BUG(), cause a panic() or handle + * the situation in some other way. This shouldn't be a problem as a volume has + * to become HUGE in order to need more than 32-bits worth of mft records. + * Assuming the standard mft record size of 1kb only the records (never mind + * the non-resident attributes, etc.) would require 4Tb of space on their own + * for the first 32 bits worth of records. This is only if some strange person + * doesn't decide to foul play and make the mft sparse which would be a really + * horrible thing to do as it would trash our current driver implementation. )-: + * Do I hear screams "we want 64-bit inodes!" ?!? (-; + * + * FIXME: The mft zone is defined as the first 12% of the volume. This space is + * reserved so that the mft can grow contiguously and hence doesn't become + * fragmented. 
Volume free space includes the empty part of the mft zone and + * when the volume's free 88% are used up, the mft zone is shrunk by a factor + * of 2, thus making more space available for more files/data. This process is + * repeated every time there is no more free space except for the mft zone until + * there really is no more free space. + */ + +/* + * Typedef the MFT_REF as a 64-bit value for easier handling. + * Also define two unpacking macros to get to the reference (MREF) and + * sequence number (MSEQNO) respectively. + * The _LE versions are to be applied on little endian MFT_REFs. + * Note: The _LE versions will return a CPU endian formatted value! + */ +#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL +#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) + +typedef u64 MFT_REF; +typedef le64 leMFT_REF; + +#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ + ((MFT_REF)(m) & MFT_REF_MASK_CPU))) +#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) + +#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) +#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) +#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) +#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) + +#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) +#define ERR_MREF(x) ((u64)((s64)(x))) +#define MREF_ERR(x) ((int)((s64)(x))) + +/* + * The mft record header present at the beginning of every record in the mft. + * This is followed by a sequence of variable length attribute records which + * is terminated by an attribute of type AT_END which is a truncated attribute + * in that it only consists of the attribute type code AT_END and none of the + * other members of the attribute structure are present. + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
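+					   NOTE: As a worked example of the
+					   MREF packing above (the numbers are
+					   purely illustrative):
+						MK_MREF(0x1234, 2)
+							== 0x0002000000001234
+						MREF(0x0002000000001234)  == 0x1234
+						MSEQNO(0x0002000000001234) == 2
+					   i.e. the low 48 bits carry the mft
+					   record number and the high 16 bits
+					   the sequence number.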
+ When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ +/* 42*/ le16 reserved; /* Reserved/alignment. */ +/* 44*/ le32 mft_record_number; /* Number of this mft record. */ +/* sizeof() = 48 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD; + +/* This is the version without the NTFS 3.1+ specific fields. */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
+ When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* sizeof() = 42 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD_OLD; + +/* + * System defined attributes (32-bit). Each attribute type has a corresponding + * attribute name (Unicode string of maximum 64 character length) as described + * by the attribute definitions present in the data attribute of the $AttrDef + * system file. On NTFS 3.0 volumes the names are just as the types are named + * in the below defines exchanging AT_ for the dollar sign ($). If that is not + * a revealing choice of symbol I do not know what is... (-; + */ +enum { + AT_UNUSED = cpu_to_le32( 0), + AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), + AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), + AT_FILE_NAME = cpu_to_le32( 0x30), + AT_OBJECT_ID = cpu_to_le32( 0x40), + AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), + AT_VOLUME_NAME = cpu_to_le32( 0x60), + AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), + AT_DATA = cpu_to_le32( 0x80), + AT_INDEX_ROOT = cpu_to_le32( 0x90), + AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), + AT_BITMAP = cpu_to_le32( 0xb0), + AT_REPARSE_POINT = cpu_to_le32( 0xc0), + AT_EA_INFORMATION = cpu_to_le32( 0xd0), + AT_EA = cpu_to_le32( 0xe0), + AT_PROPERTY_SET = cpu_to_le32( 0xf0), + AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), + AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), + AT_END = cpu_to_le32(0xffffffff) +}; + +typedef le32 ATTR_TYPE; + +/* + * The collation rules for sorting views/indexes/etc (32-bit). + * + * COLLATION_BINARY - Collate by binary compare where the first byte is most + * significant. + * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary + * Unicode values, except that when a character can be uppercased, the + * upper case value collates before the lower case one. + * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation + * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea + * what the difference is. Perhaps the difference is that file names + * would treat some special characters in an odd way (see + * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] + * for what I mean but COLLATION_UNICODE_STRING would not give any special + * treatment to any characters at all, but this is speculation. 
+ * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key + * values. E.g. used for $SII index in FILE_Secure, which sorts by + * security_id (le32). + * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. + * E.g. used for $O index in FILE_Extend/$Quota. + * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash + * values and second by ascending security_id values. E.g. used for $SDH + * index in FILE_Secure. + * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending + * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which + * sorts by object_id (16-byte), by splitting up the object_id in four + * le32 values and using them as individual keys. E.g. take the following + * two security_ids, stored as follows on disk: + * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 + * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 + * To compare them, they are split into four le32 values each, like so: + * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 + * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 + * Now, it is apparent why the 2nd object_id collates after the 1st: the + * first le32 value of the 1st object_id is less than the first le32 of + * the 2nd object_id. If the first le32 values of both object_ids were + * equal then the second le32 values would be compared, etc. + */ +enum { + COLLATION_BINARY = cpu_to_le32(0x00), + COLLATION_FILE_NAME = cpu_to_le32(0x01), + COLLATION_UNICODE_STRING = cpu_to_le32(0x02), + COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), + COLLATION_NTOFS_SID = cpu_to_le32(0x11), + COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), + COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), +}; + +typedef le32 COLLATION_RULE; + +/* + * The flags (32-bit) describing attribute properties in the attribute + * definition structure. FIXME: This information is based on Regis's + * information and, according to him, it is not certain and probably + * incomplete. The INDEXABLE flag is fairly certainly correct as only the file + * name attribute has this flag set and this is the only attribute indexed in + * NT4. + */ +enum { + ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be + indexed. */ + ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type + can be present multiple times in the + mft records of an inode. */ + ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value + must contain at least one non-zero + byte. */ + ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be + indexed and the attribute value must be + unique for the attribute type in all of + the mft records of an inode. */ + ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be + named and the name must be unique for + the attribute type in all of the mft + records of an inode. */ + ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be + resident. */ + ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log + modifications to this attribute, + regardless of whether it is resident or + non-resident. Without this, only log + modifications if the attribute is + resident. */ +}; + +typedef le32 ATTR_DEF_FLAGS; + +/* + * The data attribute of FILE_AttrDef contains a sequence of attribute + * definitions for the NTFS volume. With this, it is supposed to be safe for an + * older NTFS driver to mount a volume containing a newer NTFS version without + * damaging it (that's the theory. In practice it's: not damaging it too much). + * Entries are sorted by attribute type. 
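+ * As an illustration, a driver could check an attribute's value size against
+ * its definition entry "ad" roughly like this (sketch only; it assumes that a
+ * non-positive max_size means "no upper limit", which the format does not
+ * spell out):
+ *	if (size < sle64_to_cpu(ad->min_size) ||
+ *	    (sle64_to_cpu(ad->max_size) > 0 &&
+ *	     size > sle64_to_cpu(ad->max_size)))
+ *		the attribute value size is illegal for this attribute type;
+ * with "size" being the attribute value size in bytes in cpu byte order.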
The flags describe whether the + * attribute can be resident/non-resident and possibly other things, but the + * actual bits are unknown. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero + terminated. */ +/* 80*/ ATTR_TYPE type; /* Type of the attribute. */ +/* 84*/ le32 display_rule; /* Default display rule. + FIXME: What does it mean? (AIA) */ +/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ +/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ +/* 90*/ sle64 min_size; /* Optional minimum attribute size. */ +/* 98*/ sle64 max_size; /* Maximum size of attribute. */ +/* sizeof() = 0xa0 or 160 bytes */ +} __attribute__ ((__packed__)) ATTR_DEF; + +/* + * Attribute flags (16-bit). + */ +enum { + ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), + ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method + mask. Also, first + illegal value. */ + ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), + ATTR_IS_SPARSE = cpu_to_le16(0x8000), +} __attribute__ ((__packed__)); + +typedef le16 ATTR_FLAGS; + +/* + * Attribute compression. + * + * Only the data attribute is ever compressed in the current ntfs driver in + * Windows. Further, compression is only applied when the data attribute is + * non-resident. Finally, to use compression, the maximum allowed cluster size + * on a volume is 4kib. + * + * The compression method is based on independently compressing blocks of X + * clusters, where X is determined from the compression_unit value found in the + * non-resident attribute record header (more precisely: X = 2^compression_unit + * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4). + * + * There are three different cases of how a compression block of X clusters + * can be stored: + * + * 1) The data in the block is all zero (a sparse block): + * This is stored as a sparse block in the runlist, i.e. the runlist + * entry has length = X and lcn = -1. The mapping pairs array actually + * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at + * all, which is then interpreted by the driver as lcn = -1. + * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then + * the same principles apply as above, except that the length is not + * restricted to being any particular value. + * + * 2) The data in the block is not compressed: + * This happens when compression doesn't reduce the size of the block + * in clusters. I.e. if compression has a small effect so that the + * compressed data still occupies X clusters, then the uncompressed data + * is stored in the block. + * This case is recognised by the fact that the runlist entry has + * length = X and lcn >= 0. The mapping pairs array stores this as + * normal with a run length of X and some specific delta_lcn, i.e. + * delta_lcn has to be present. + * + * 3) The data in the block is compressed: + * The common case. This case is recognised by the fact that the run + * list entry has length L < X and lcn >= 0. The mapping pairs array + * stores this as normal with a run length of X and some specific + * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is + * immediately followed by a sparse entry with length = X - L and + * lcn = -1. The latter entry is to make up the vcn counting to the + * full compression block size X. + * + * In fact, life is more complicated because adjacent entries of the same type + * can be coalesced. 
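+ * To put concrete numbers on the above: with compression_unit = 4 and a
+ * cluster size of 4096 bytes, one compression block spans (1 << 4) = 16
+ * clusters, i.e. 65536 bytes.  A simplified classification of a single,
+ * non-coalesced runlist entry (length "len" in clusters, starting lcn "lcn")
+ * describing such a block would then be:
+ *	lcn == -1 && len == 16	-> case 1, a sparse block
+ *	lcn >= 0  && len == 16	-> case 2, stored uncompressed
+ *	lcn >= 0  && len <  16	-> case 3, compressed, followed by a sparse
+ *				   entry of length 16 - len to pad the vcns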
This means that one has to keep track of the number of + * clusters handled and work on a basis of X clusters at a time being one + * block. An example: if length L > X this means that this particular runlist + * entry contains a block of length X and part of one or more blocks of length + * L - X. Another example: if length L < X, this does not necessarily mean that + * the block is compressed as it might be that the lcn changes inside the block + * and hence the following runlist entry describes the continuation of the + * potentially compressed block. The block would be compressed if the + * following runlist entry describes at least X - L sparse clusters, thus + * making up the compression block length as described in point 3 above. (Of + * course, there can be several runlist entries with small lengths so that the + * sparse entry does not follow the first data containing entry with + * length < X.) + * + * NOTE: At the end of the compressed attribute value, there most likely is not + * just the right amount of data to make up a compression block, thus this data + * is not even attempted to be compressed. It is just stored as is, unless + * the number of clusters it occupies is reduced when compressed in which case + * it is stored as a compressed compression block, complete with sparse + * clusters at the end. + */ + +/* + * Flags of resident attributes (8-bit). + */ +enum { + RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index + (has implications for deleting and + modifying the attribute). */ +} __attribute__ ((__packed__)); + +typedef u8 RESIDENT_ATTR_FLAGS; + +/* + * Attribute record header. Always aligned to 8-byte boundary. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ +/* 4*/ le32 length; /* Byte size of the resident part of the + attribute (aligned to 8-byte boundary). + Used to get to the next attribute. */ +/* 8*/ u8 non_resident; /* If 0, attribute is resident. + If 1, attribute is non-resident. */ +/* 9*/ u8 name_length; /* Unicode character size of name of attribute. + 0 if unnamed. */ +/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the + beginning of the name from the attribute + record. Note that the name is stored as a + Unicode string. When creating, place offset + just at the end of the record header. Then, + follow with attribute value or mapping pairs + array, resident and non-resident attributes + respectively, aligning to an 8-byte + boundary. */ +/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ +/* 14*/ le16 instance; /* The instance of this attribute record. This + number is unique within this mft record (see + MFT_RECORD/next_attribute_instance notes in + in mft.h for more details). */ +/* 16*/ union { + /* Resident attributes. */ + struct { +/* 16 */ le32 value_length;/* Byte size of attribute value. */ +/* 20 */ le16 value_offset;/* Byte offset of the attribute + value from the start of the + attribute record. When creating, + align to 8-byte boundary if we + have a name present as this might + not have a length of a multiple + of 8-bytes. */ +/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ +/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte + boundary. */ + } __attribute__ ((__packed__)) resident; + /* Non-resident attributes. */ + struct { +/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number + for this portion of the attribute value or + 0 if this is the only extent (usually the + case). 
- Only when an attribute list is used + does lowest_vcn != 0 ever occur. */ +/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of + the attribute value. - Usually there is only one + portion, so this usually equals the attribute + value size in clusters minus 1. Can be -1 for + zero length files. Can be 0 for "single extent" + attributes. */ +/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the + beginning of the structure to the mapping pairs + array which contains the mappings between the + vcns and the logical cluster numbers (lcns). + When creating, place this at the end of this + record header aligned to 8-byte boundary. */ +/* 34*/ u8 compression_unit; /* The compression unit expressed + as the log to the base 2 of the number of + clusters in a compression unit. 0 means not + compressed. (This effectively limits the + compression unit size to be a power of two + clusters.) WinNT4 only uses a value of 4. + Sparse files have this set to 0 on XPSP2. */ +/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ +/* The sizes below are only used when lowest_vcn is zero, as otherwise it would + be difficult to keep them up-to-date.*/ +/* 40*/ sle64 allocated_size; /* Byte size of disk space + allocated to hold the attribute value. Always + is a multiple of the cluster size. When a file + is compressed, this field is a multiple of the + compression block size (2^compression_unit) and + it represents the logically allocated space + rather than the actual on disk usage. For this + use the compressed_size (see below). */ +/* 48*/ sle64 data_size; /* Byte size of the attribute + value. Can be larger than allocated_size if + attribute value is compressed or sparse. */ +/* 56*/ sle64 initialized_size; /* Byte size of initialized + portion of the attribute value. Usually equals + data_size. */ +/* sizeof(uncompressed attr) = 64*/ +/* 64*/ sle64 compressed_size; /* Byte size of the attribute + value after compression. Only present when + compressed or sparse. Always is a multiple of + the cluster size. Represents the actual amount + of disk space being used on the disk. */ +/* sizeof(compressed attr) = 72*/ + } __attribute__ ((__packed__)) non_resident; + } __attribute__ ((__packed__)) data; +} __attribute__ ((__packed__)) ATTR_RECORD; + +typedef ATTR_RECORD ATTR_REC; + +/* + * File attribute flags (32-bit) appearing in the file_attributes fields of the + * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR + * attributes of MFT_RECORDs and directory index entries. + * + * All of the below flags appear in the directory index entries but only some + * appear in the STANDARD_INFORMATION attribute whilst only some others appear + * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the + * flags appear in all of the above. + */ +enum { + FILE_ATTR_READONLY = cpu_to_le32(0x00000001), + FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), + FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), + /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ + + FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), + /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is + reserved for the DOS SUBDIRECTORY flag. 
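+	   NOTE, digressing back to the ATTR_RECORD defined above: to walk the
+	   attributes of an mft record "m" one starts at attrs_offset and
+	   steps by each attribute's length until AT_END is reached, roughly
+	   (sketch only):
+		a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
+		while (a->type != AT_END) {
+			process a here;
+			a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
+		}
+	   bounding every step by bytes_in_use to guard against corrupt
+	   records.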
*/ + FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), + FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), + FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), + + FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), + FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), + FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), + FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), + + FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), + FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), + FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), + + FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), + /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the + FILE_ATTR_DEVICE and preserves everything else. This mask is used + to obtain all flags that are valid for reading. */ + FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), + /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the + F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, + F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask + is used to obtain all flags that are valid for setting. */ + /* + * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all + * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION + * attribute of an mft record. + */ + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this is a directory or not, i.e. whether it has + an index root attribute or not. */ + FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this file has a view index present (eg. object id + index, quota index, one of the security indexes or the encrypting + filesystem related indexes). */ +}; + +typedef le32 FILE_ATTR_FLAGS; + +/* + * NOTE on times in NTFS: All times are in MS standard time format, i.e. they + * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 + * universal coordinated time (UTC). (In Linux time starts 1st January 1970, + * 00:00:00 UTC and is stored as the number of 1-second intervals since then.) + */ + +/* + * Attribute: Standard information (0x10). + * + * NOTE: Always resident. + * NOTE: Present in all base file records on a volume. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*Ofs*/ +/* 0*/ sle64 creation_time; /* Time file was created. Updated when + a filename is changed(?). */ +/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 24*/ sle64 last_access_time; /* Approximate time when the file was + last accessed (obviously this is not + updated on read-only volumes). In + Windows this is only updated when + accessed if some time delta has + passed since the last update. Also, + last access time updates can be + disabled altogether for speed. */ +/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 36*/ union { + /* NTFS 1.2 */ + struct { + /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte + boundary. 
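+				(As a sketch of the conversion implied by the
+				NOTE on times above: NTFS counts 100-nanosecond
+				units from 1601-01-01, Linux counts seconds
+				from 1970-01-01, and the two epochs are
+				11644473600 seconds apart, so, ignoring
+				sub-second resolution:
+					unix = ntfs / 10000000 - 11644473600;
+					ntfs = (unix + 11644473600) * 10000000;
+				with "ntfs" being the sle64 time value in cpu
+				byte order.)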
*/ + } __attribute__ ((__packed__)) v1; + /* sizeof() = 48 bytes */ + /* NTFS 3.x */ + struct { +/* + * If a volume has been upgraded from a previous NTFS version, then these + * fields are present only if the file has been accessed since the upgrade. + * Recognize the difference by comparing the length of the resident attribute + * value. If it is 48, then the following fields are missing. If it is 72 then + * the fields are present. Maybe just check like this: + * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { + * Assume NTFS 1.2- format. + * If (volume version is 3.x) + * Upgrade attribute to NTFS 3.x format. + * else + * Use NTFS 1.2- format for access. + * } else + * Use NTFS 3.x format for access. + * Only problem is that it might be legal to set the length of the value to + * arbitrarily large values thus spoiling this check. - But chkdsk probably + * views that as a corruption, assuming that it behaves like this for all + * attributes. + */ + /* 36*/ le32 maximum_versions; /* Maximum allowed versions for + file. Zero if version numbering is disabled. */ + /* 40*/ le32 version_number; /* This file's version (if any). + Set to zero if maximum_versions is zero. */ + /* 44*/ le32 class_id; /* Class id from bidirectional + class id index (?). */ + /* 48*/ le32 owner_id; /* Owner_id of the user owning + the file. Translate via $Q index in FILE_Extend + /$Quota to the quota control entry for the user + owning the file. Zero if quotas are disabled. */ + /* 52*/ le32 security_id; /* Security_id for the file. + Translate via $SII index and $SDS data stream + in FILE_Secure to the security descriptor. */ + /* 56*/ le64 quota_charged; /* Byte size of the charge to + the quota for all streams of the file. Note: Is + zero if quotas are disabled. */ + /* 64*/ leUSN usn; /* Last update sequence number + of the file. This is a direct index into the + transaction log file ($UsnJrnl). It is zero if + the usn journal is disabled or this file has + not been subject to logging yet. See usnjrnl.h + for details. */ + } __attribute__ ((__packed__)) v3; + /* sizeof() = 72 bytes (NTFS 3.x) */ + } __attribute__ ((__packed__)) ver; +} __attribute__ ((__packed__)) STANDARD_INFORMATION; + +/* + * Attribute: Attribute list (0x20). + * + * - Can be either resident or non-resident. + * - Value consists of a sequence of variable length, 8-byte aligned, + * ATTR_LIST_ENTRY records. + * - The list is not terminated by anything at all! The only way to know when + * the end is reached is to keep track of the current offset and compare it to + * the attribute value size. + * - The attribute list attribute contains one entry for each attribute of + * the file in which the list is located, except for the list attribute + * itself. The list is sorted: first by attribute type, second by attribute + * name (if present), third by instance number. The extents of one + * non-resident attribute (if present) immediately follow after the initial + * extent. They are ordered by lowest_vcn and have their instace set to zero. + * It is not allowed to have two attributes with all sorting keys equal. + * - Further restrictions: + * - If not resident, the vcn to lcn mapping array has to fit inside the + * base mft record. + * - The attribute list attribute value has a maximum size of 256kb. This + * is imposed by the Windows cache manager. 
+ * - Attribute lists are only used when the attributes of mft record do not + * fit inside the mft record despite all attributes (that can be made + * non-resident) having been made non-resident. This can happen e.g. when: + * - File has a large number of hard links (lots of file name + * attributes present). + * - The mapping pairs array of some non-resident attribute becomes so + * large due to fragmentation that it overflows the mft record. + * - The security descriptor is very complex (not applicable to + * NTFS 3.0 volumes). + * - There are many named streams. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ +/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ +/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the + attribute or 0 if unnamed. */ +/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name + (always set this to where the name would + start even if unnamed). */ +/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion + of the attribute value. This is usually 0. It + is non-zero for the case where one attribute + does not fit into one mft record and thus + several mft records are allocated to hold + this attribute. In the latter case, each mft + record holds one extent of the attribute and + there is one attribute list entry for each + extent. NOTE: This is DEFINITELY a signed + value! The windows driver uses cmp, followed + by jg when comparing this, thus it treats it + as signed. */ +/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding + the ATTR_RECORD for this portion of the + attribute value. */ +/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the + attribute being referenced; otherwise 0. */ +/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use + name_offset to determine the location of the + name. */ +/* sizeof() = 26 + (attribute_name_length * 2) bytes */ +} __attribute__ ((__packed__)) ATTR_LIST_ENTRY; + +/* + * The maximum allowed length for a file name. + */ +#define MAXIMUM_FILE_NAME_LENGTH 255 + +/* + * Possible namespaces for filenames in ntfs (8-bit). + */ +enum { + FILE_NAME_POSIX = 0x00, + /* This is the largest namespace. It is case sensitive and allows all + Unicode characters except for: '\0' and '/'. Beware that in + WinNT/2k/2003 by default files which eg have the same name except + for their case will not be distinguished by the standard utilities + and thus a "del filename" will delete both "filename" and "fileName" + without warning. However if for example Services For Unix (SFU) are + installed and the case sensitive option was enabled at installation + time, then you can create/access/delete such files. + Note that even SFU places restrictions on the filenames beyond the + '\0' and '/' and in particular the following set of characters is + not allowed: '"', '/', '<', '>', '\'. All other characters, + including the ones no allowed in WIN32 namespace are allowed. + Tested with SFU 3.5 (this is now free) running on Windows XP. */ + FILE_NAME_WIN32 = 0x01, + /* The standard WinNT/2k NTFS long filenames. Case insensitive. All + Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', + and '|'. Further, names cannot end with a '.' or a space. */ + FILE_NAME_DOS = 0x02, + /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit + characters greater space, except: '"', '*', '+', ',', '/', ':', ';', + '<', '=', '>', '?', and '\'. 
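+	   NOTE, looping back to the ATTR_LIST_ENTRY defined above: as the
+	   list has no terminator, a reader has to bound the walk by the
+	   attribute list value size, roughly (sketch only, with "value" and
+	   "value_len" being the attribute list attribute value and its byte
+	   size):
+		ofs = 0;
+		while (ofs < value_len) {
+			e = (ATTR_LIST_ENTRY*)(value + ofs);
+			process e here;
+			ofs += le16_to_cpu(e->length);
+		}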
*/ + FILE_NAME_WIN32_AND_DOS = 0x03, + /* 3 means that both the Win32 and the DOS filenames are identical and + hence have been saved in this single filename record. */ +} __attribute__ ((__packed__)); + +typedef u8 FILE_NAME_TYPE_FLAGS; + +/* + * Attribute: Filename (0x30). + * + * NOTE: Always resident. + * NOTE: All fields, except the parent_directory, are only updated when the + * filename is changed. Until then, they just become out of sync with + * reality and the more up to date values are present in the standard + * information attribute. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ leMFT_REF parent_directory; /* Directory this filename is + referenced from. */ +/* 8*/ sle64 creation_time; /* Time file was created. */ +/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 20*/ sle64 last_access_time; /* Time this mft record was last + accessed. */ +/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space + for the unnamed data attribute. So + for normal $DATA, this is the + allocated_size from the unnamed + $DATA attribute and for compressed + and/or sparse $DATA, this is the + compressed_size from the unnamed + $DATA attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. NOTE: + This is a multiple of the cluster + size. */ +/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed + data attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. */ +/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 3c*/ union { + /* 3c*/ struct { + /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to + pack the extended attributes + (EAs), if such are present.*/ + /* 3e*/ le16 reserved; /* Reserved for alignment. */ + } __attribute__ ((__packed__)) ea; + /* 3c*/ struct { + /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, + present only in reparse + points and only if there are + no EAs. */ + } __attribute__ ((__packed__)) rp; + } __attribute__ ((__packed__)) type; +/* 40*/ u8 file_name_length; /* Length of file name in + (Unicode) characters. */ +/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ +/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ +} __attribute__ ((__packed__)) FILE_NAME_ATTR; + +/* + * GUID structures store globally unique identifiers (GUID). A GUID is a + * 128-bit value consisting of one group of eight hexadecimal digits, followed + * by three groups of four hexadecimal digits each, followed by one group of + * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the + * distributed computing environment (DCE) universally unique identifier (UUID). + * Example of a GUID: + * 1F010768-5A73-BC91-0010A52216A7 + */ +typedef struct { + le32 data1; /* The first eight hexadecimal digits of the GUID. */ + le16 data2; /* The first group of four hexadecimal digits. */ + le16 data3; /* The second group of four hexadecimal digits. */ + u8 data4[8]; /* The first two bytes are the third group of four + hexadecimal digits. The remaining six bytes are the + final 12 hexadecimal digits. 
*/ +} __attribute__ ((__packed__)) GUID; + +/* + * FILE_Extend/$ObjId contains an index named $O. This index contains all + * object_ids present on the volume as the index keys and the corresponding + * mft_record numbers as the index entry data parts. The data part (defined + * below) also contains three other object_ids: + * birth_volume_id - object_id of FILE_Volume on which the file was first + * created. Optional (i.e. can be zero). + * birth_object_id - object_id of file when it was first created. Usually + * equals the object_id. Optional (i.e. can be zero). + * domain_id - Reserved (always zero). + */ +typedef struct { + leMFT_REF mft_reference;/* Mft record containing the object_id in + the index entry key. */ + union { + struct { + GUID birth_volume_id; + GUID birth_object_id; + GUID domain_id; + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; + +/* + * Attribute: Object id (NTFS 3.0+) (0x40). + * + * NOTE: Always resident. + */ +typedef struct { + GUID object_id; /* Unique id assigned to the + file.*/ + /* The following fields are optional. The attribute value size is 16 + bytes, i.e. sizeof(GUID), if these are not present at all. Note, + the entries can be present but one or more (or all) can be zero + meaning that that particular value(s) is(are) not defined. */ + union { + struct { + GUID birth_volume_id; /* Unique id of volume on which + the file was first created.*/ + GUID birth_object_id; /* Unique id of file when it was + first created. */ + GUID domain_id; /* Reserved, zero. */ + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJECT_ID_ATTR; + +/* + * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in + * the SID structure (see below). + */ +//typedef enum { /* SID string prefix. */ +// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ +// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ +// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ +// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ +// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ +// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ +//} IDENTIFIER_AUTHORITIES; + +/* + * These relative identifiers (RIDs) are used with the above identifier + * authorities to make up universal well-known SIDs. + * + * Note: The relative identifier (RID) refers to the portion of a SID, which + * identifies a user or group in relation to the authority that issued the SID. + * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is + * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and + * the relative identifier SECURITY_CREATOR_OWNER_RID (0). + */ +typedef enum { /* Identifier authority. 
*/ + SECURITY_NULL_RID = 0, /* S-1-0 */ + SECURITY_WORLD_RID = 0, /* S-1-1 */ + SECURITY_LOCAL_RID = 0, /* S-1-2 */ + + SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ + SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ + + SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ + SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ + + SECURITY_DIALUP_RID = 1, + SECURITY_NETWORK_RID = 2, + SECURITY_BATCH_RID = 3, + SECURITY_INTERACTIVE_RID = 4, + SECURITY_SERVICE_RID = 6, + SECURITY_ANONYMOUS_LOGON_RID = 7, + SECURITY_PROXY_RID = 8, + SECURITY_ENTERPRISE_CONTROLLERS_RID=9, + SECURITY_SERVER_LOGON_RID = 9, + SECURITY_PRINCIPAL_SELF_RID = 0xa, + SECURITY_AUTHENTICATED_USER_RID = 0xb, + SECURITY_RESTRICTED_CODE_RID = 0xc, + SECURITY_TERMINAL_SERVER_RID = 0xd, + + SECURITY_LOGON_IDS_RID = 5, + SECURITY_LOGON_IDS_RID_COUNT = 3, + + SECURITY_LOCAL_SYSTEM_RID = 0x12, + + SECURITY_NT_NON_UNIQUE = 0x15, + + SECURITY_BUILTIN_DOMAIN_RID = 0x20, + + /* + * Well-known domain relative sub-authority values (RIDs). + */ + + /* Users. */ + DOMAIN_USER_RID_ADMIN = 0x1f4, + DOMAIN_USER_RID_GUEST = 0x1f5, + DOMAIN_USER_RID_KRBTGT = 0x1f6, + + /* Groups. */ + DOMAIN_GROUP_RID_ADMINS = 0x200, + DOMAIN_GROUP_RID_USERS = 0x201, + DOMAIN_GROUP_RID_GUESTS = 0x202, + DOMAIN_GROUP_RID_COMPUTERS = 0x203, + DOMAIN_GROUP_RID_CONTROLLERS = 0x204, + DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, + DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, + DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, + DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, + + /* Aliases. */ + DOMAIN_ALIAS_RID_ADMINS = 0x220, + DOMAIN_ALIAS_RID_USERS = 0x221, + DOMAIN_ALIAS_RID_GUESTS = 0x222, + DOMAIN_ALIAS_RID_POWER_USERS = 0x223, + + DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, + DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, + DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, + DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, + + DOMAIN_ALIAS_RID_REPLICATOR = 0x228, + DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, + DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, +} RELATIVE_IDENTIFIERS; + +/* + * The universal well-known SIDs: + * + * NULL_SID S-1-0-0 + * WORLD_SID S-1-1-0 + * LOCAL_SID S-1-2-0 + * CREATOR_OWNER_SID S-1-3-0 + * CREATOR_GROUP_SID S-1-3-1 + * CREATOR_OWNER_SERVER_SID S-1-3-2 + * CREATOR_GROUP_SERVER_SID S-1-3-3 + * + * (Non-unique IDs) S-1-4 + * + * NT well-known SIDs: + * + * NT_AUTHORITY_SID S-1-5 + * DIALUP_SID S-1-5-1 + * + * NETWORD_SID S-1-5-2 + * BATCH_SID S-1-5-3 + * INTERACTIVE_SID S-1-5-4 + * SERVICE_SID S-1-5-6 + * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session) + * PROXY_SID S-1-5-8 + * SERVER_LOGON_SID S-1-5-9 (aka domain controller account) + * SELF_SID S-1-5-10 (self RID) + * AUTHENTICATED_USER_SID S-1-5-11 + * RESTRICTED_CODE_SID S-1-5-12 (running restricted code) + * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server) + * + * (Logon IDs) S-1-5-5-X-Y + * + * (NT non-unique IDs) S-1-5-0x15-... + * + * (Built-in domain) S-1-5-0x20 + */ + +/* + * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. + * + * NOTE: This is stored as a big endian number, hence the high_part comes + * before the low_part. + */ +typedef union { + struct { + u16 high_part; /* High 16-bits. */ + u32 low_part; /* Low 32-bits. */ + } __attribute__ ((__packed__)) parts; + u8 value[6]; /* Value as individual bytes. */ +} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; + +/* + * The SID structure is a variable-length structure used to uniquely identify + * users or groups. SID stands for security identifier. + * + * The standard textual representation of the SID is of the form: + * S-R-I-S-S... 
+ * Where: + * - The first "S" is the literal character 'S' identifying the following + * digits as a SID. + * - R is the revision level of the SID expressed as a sequence of digits + * either in decimal or hexadecimal (if the later, prefixed by "0x"). + * - I is the 48-bit identifier_authority, expressed as digits as R above. + * - S... is one or more sub_authority values, expressed as digits as above. + * + * Example SID; the domain-relative SID of the local Administrators group on + * Windows NT/2k: + * S-1-5-32-544 + * This translates to a SID with: + * revision = 1, + * sub_authority_count = 2, + * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY + * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID + * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS + */ +typedef struct { + u8 revision; + u8 sub_authority_count; + SID_IDENTIFIER_AUTHORITY identifier_authority; + le32 sub_authority[1]; /* At least one sub_authority. */ +} __attribute__ ((__packed__)) SID; + +/* + * Current constants for SIDs. + */ +typedef enum { + SID_REVISION = 1, /* Current revision level. */ + SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ + SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in + a future revision. */ +} SID_CONSTANTS; + +/* + * The predefined ACE types (8-bit, see below). + */ +enum { + ACCESS_MIN_MS_ACE_TYPE = 0, + ACCESS_ALLOWED_ACE_TYPE = 0, + ACCESS_DENIED_ACE_TYPE = 1, + SYSTEM_AUDIT_ACE_TYPE = 2, + SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ + ACCESS_MAX_MS_V2_ACE_TYPE = 3, + + ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, + ACCESS_MAX_MS_V3_ACE_TYPE = 4, + + /* The following are Win2k only. */ + ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, + ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, + ACCESS_DENIED_OBJECT_ACE_TYPE = 6, + SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, + SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, + ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, + + ACCESS_MAX_MS_V4_ACE_TYPE = 8, + + /* This one is for WinNT/2k. */ + ACCESS_MAX_MS_ACE_TYPE = 8, +} __attribute__ ((__packed__)); + +typedef u8 ACE_TYPES; + +/* + * The ACE flags (8-bit) for audit and inheritance (see below). + * + * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE + * types to indicate that a message is generated (in Windows!) for successful + * accesses. + * + * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types + * to indicate that a message is generated (in Windows!) for failed accesses. + */ +enum { + /* The inheritance flags. */ + OBJECT_INHERIT_ACE = 0x01, + CONTAINER_INHERIT_ACE = 0x02, + NO_PROPAGATE_INHERIT_ACE = 0x04, + INHERIT_ONLY_ACE = 0x08, + INHERITED_ACE = 0x10, /* Win2k only. */ + VALID_INHERIT_FLAGS = 0x1f, + + /* The audit flags. */ + SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, + FAILED_ACCESS_ACE_FLAG = 0x80, +} __attribute__ ((__packed__)); + +typedef u8 ACE_FLAGS; + +/* + * An ACE is an access-control entry in an access-control list (ACL). + * An ACE defines access to an object for a specific user or group or defines + * the types of access that generate system-administration messages or alarms + * for a specific user or group. The user or group is identified by a security + * identifier (SID). + * + * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), + * which specifies the type and size of the ACE. The format of the subsequent + * data depends on the ACE type. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ACE_TYPES type; /* Type of the ACE. */ +/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. 
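+				(A small worked example for the SID layout
+				above: a SID occupies
+					1 + 1 + 6 + 4 * sub_authority_count
+				bytes on disk, so the S-1-5-32-544 example,
+				having two sub-authorities, takes 8 + 4 * 2 =
+				16 bytes.)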
*/ +/* 2*/ le16 size; /* Size in bytes of the ACE. */ +} __attribute__ ((__packed__)) ACE_HEADER; + +/* + * The access mask (32-bit). Defines the access rights. + * + * The specific rights (bits 0 to 15). These depend on the type of the object + * being secured by the ACE. + */ +enum { + /* Specific rights for files and directories are as follows: */ + + /* Right to read data from the file. (FILE) */ + FILE_READ_DATA = cpu_to_le32(0x00000001), + /* Right to list contents of a directory. (DIRECTORY) */ + FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), + + /* Right to write data to the file. (FILE) */ + FILE_WRITE_DATA = cpu_to_le32(0x00000002), + /* Right to create a file in the directory. (DIRECTORY) */ + FILE_ADD_FILE = cpu_to_le32(0x00000002), + + /* Right to append data to the file. (FILE) */ + FILE_APPEND_DATA = cpu_to_le32(0x00000004), + /* Right to create a subdirectory. (DIRECTORY) */ + FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), + + /* Right to read extended attributes. (FILE/DIRECTORY) */ + FILE_READ_EA = cpu_to_le32(0x00000008), + + /* Right to write extended attributes. (FILE/DIRECTORY) */ + FILE_WRITE_EA = cpu_to_le32(0x00000010), + + /* Right to execute a file. (FILE) */ + FILE_EXECUTE = cpu_to_le32(0x00000020), + /* Right to traverse the directory. (DIRECTORY) */ + FILE_TRAVERSE = cpu_to_le32(0x00000020), + + /* + * Right to delete a directory and all the files it contains (its + * children), even if the files are read-only. (DIRECTORY) + */ + FILE_DELETE_CHILD = cpu_to_le32(0x00000040), + + /* Right to read file attributes. (FILE/DIRECTORY) */ + FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), + + /* Right to change file attributes. (FILE/DIRECTORY) */ + FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), + + /* + * The standard rights (bits 16 to 23). These are independent of the + * type of object being secured. + */ + + /* Right to delete the object. */ + DELETE = cpu_to_le32(0x00010000), + + /* + * Right to read the information in the object's security descriptor, + * not including the information in the SACL, i.e. right to read the + * security descriptor and owner. + */ + READ_CONTROL = cpu_to_le32(0x00020000), + + /* Right to modify the DACL in the object's security descriptor. */ + WRITE_DAC = cpu_to_le32(0x00040000), + + /* Right to change the owner in the object's security descriptor. */ + WRITE_OWNER = cpu_to_le32(0x00080000), + + /* + * Right to use the object for synchronization. Enables a process to + * wait until the object is in the signalled state. Some object types + * do not support this access right. + */ + SYNCHRONIZE = cpu_to_le32(0x00100000), + + /* + * The following STANDARD_RIGHTS_* are combinations of the above for + * convenience and are defined by the Win32 API. + */ + + /* These are currently defined to READ_CONTROL. */ + STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), + + /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ + STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), + + /* + * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and + * SYNCHRONIZE access. + */ + STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), + + /* + * The access system ACL and maximum allowed access types (bits 24 to + * 25, bits 26 to 27 are reserved). + */ + ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), + MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), + + /* + * The generic rights (bits 28 to 31). 
These map onto the standard and + * specific rights. + */ + + /* Read, write, and execute access. */ + GENERIC_ALL = cpu_to_le32(0x10000000), + + /* Execute access. */ + GENERIC_EXECUTE = cpu_to_le32(0x20000000), + + /* + * Write access. For files, this maps onto: + * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA | + * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE + * For directories, the mapping has the same numerical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_WRITE = cpu_to_le32(0x40000000), + + /* + * Read access. For files, this maps onto: + * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA | + * STANDARD_RIGHTS_READ | SYNCHRONIZE + * For directories, the mapping has the same numberical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_READ = cpu_to_le32(0x80000000), +}; + +typedef le32 ACCESS_MASK; + +/* + * The generic mapping array. Used to denote the mapping of each generic + * access right to a specific access mask. + * + * FIXME: What exactly is this and what is it for? (AIA) + */ +typedef struct { + ACCESS_MASK generic_read; + ACCESS_MASK generic_write; + ACCESS_MASK generic_execute; + ACCESS_MASK generic_all; +} __attribute__ ((__packed__)) GENERIC_MAPPING; + +/* + * The predefined ACE type structures are as defined below. + */ + +/* + * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE + */ +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, + SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE; + +/* + * The object ACE flags (32-bit). + */ +enum { + ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), + ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), +}; + +typedef le32 OBJECT_ACE_FLAGS; + +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */ +/* 12*/ GUID object_type; +/* 28*/ GUID inherited_object_type; + +/* 44*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE, + ACCESS_DENIED_OBJECT_ACE, + SYSTEM_AUDIT_OBJECT_ACE, + SYSTEM_ALARM_OBJECT_ACE; + +/* + * An ACL is an access-control list (ACL). + * An ACL starts with an ACL header structure, which specifies the size of + * the ACL and the number of ACEs it contains. The ACL header is followed by + * zero or more access control entries (ACEs). The ACL as well as each ACE + * are aligned on 4-byte boundaries. + */ +typedef struct { + u8 revision; /* Revision of this ACL. */ + u8 alignment1; + le16 size; /* Allocated space in bytes for ACL. Includes this + header, the ACEs and the remaining free space. */ + le16 ace_count; /* Number of ACEs in the ACL. */ + le16 alignment2; +/* sizeof() = 8 bytes */ +} __attribute__ ((__packed__)) ACL; + +/* + * Current constants for ACLs. + */ +typedef enum { + /* Current revision. */ + ACL_REVISION = 2, + ACL_REVISION_DS = 4, + + /* History of revisions. 
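+	NOTE, as an illustration of the ACL layout above: the ACEs follow the
+	8-byte ACL header, each starting with an ACE_HEADER whose size field
+	gives the offset to the next one (assuming, as is usual, that size
+	already includes any alignment padding), so a walk can be sketched as:
+		ace = (ACE_HEADER*)((u8*)acl + sizeof(ACL));
+		for (i = 0; i < le16_to_cpu(acl->ace_count); i++) {
+			process ace here;
+			ace = (ACE_HEADER*)((u8*)ace + le16_to_cpu(ace->size));
+		}
+	keeping each step within le16_to_cpu(acl->size) bytes of the start of
+	the ACL.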
*/ + ACL_REVISION1 = 1, + MIN_ACL_REVISION = 2, + ACL_REVISION2 = 2, + ACL_REVISION3 = 3, + ACL_REVISION4 = 4, + MAX_ACL_REVISION = 4, +} ACL_CONSTANTS; + +/* + * The security descriptor control flags (16-bit). + * + * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID + * pointed to by the Owner field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the SID with + * respect to inheritance of an owner. + * + * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in + * the Group field was provided by a defaulting mechanism rather than + * explicitly provided by the original provider of the security + * descriptor. This may affect the treatment of the SID with respect to + * inheritance of a primary group. + * + * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a discretionary ACL. If this flag is set and the + * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is + * explicitly being specified. + * + * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Dacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * DaclPresent flag is not set. + * + * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a system ACL pointed to by the Sacl field. If this + * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then + * an empty (but present) ACL is being specified. + * + * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Sacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * SaclPresent flag is not set. + * + * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security + * descriptor is in self-relative form. In this form, all fields of the + * security descriptor are contiguous in memory and all pointer fields are + * expressed as offsets from the beginning of the security descriptor. + */ +enum { + SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), + SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), + SE_DACL_PRESENT = cpu_to_le16(0x0004), + SE_DACL_DEFAULTED = cpu_to_le16(0x0008), + + SE_SACL_PRESENT = cpu_to_le16(0x0010), + SE_SACL_DEFAULTED = cpu_to_le16(0x0020), + + SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), + SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), + SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), + SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), + + SE_DACL_PROTECTED = cpu_to_le16(0x1000), + SE_SACL_PROTECTED = cpu_to_le16(0x2000), + SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), + SE_SELF_RELATIVE = cpu_to_le16(0x8000) +} __attribute__ ((__packed__)); + +typedef le16 SECURITY_DESCRIPTOR_CONTROL; + +/* + * Self-relative security descriptor. Contains the owner and group SIDs as well + * as the sacl and dacl ACLs inside the security descriptor itself. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. 
*/ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + le32 owner; /* Byte offset to a SID representing an object's + owner. If this is NULL, no owner SID is present in + the descriptor. */ + le32 group; /* Byte offset to a SID representing an object's + primary group. If this is NULL, no primary group + SID is present in the descriptor. */ + le32 sacl; /* Byte offset to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +/* sizeof() = 0x14 bytes */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE; + +/* + * Absolute security descriptor. Does not contain the owner and group SIDs, nor + * the sacl and dacl ACLs inside the security descriptor. Instead, it contains + * pointers to these structures in memory. Obviously, absolute security + * descriptors are only useful for in memory representations of security + * descriptors. On disk, a self-relative security descriptor is used. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. */ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + SID *owner; /* Points to a SID representing an object's owner. If + this is NULL, no owner SID is present in the + descriptor. */ + SID *group; /* Points to a SID representing an object's primary + group. If this is NULL, no primary group SID is + present in the descriptor. */ + ACL *sacl; /* Points to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + ACL *dacl; /* Points to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; + +/* + * Current constants for security descriptors. + */ +typedef enum { + /* Current revision. */ + SECURITY_DESCRIPTOR_REVISION = 1, + SECURITY_DESCRIPTOR_REVISION1 = 1, + + /* The sizes of both the absolute and relative security descriptors is + the same as pointers, at least on ia32 architecture are 32-bit. */ + SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR), +} SECURITY_DESCRIPTOR_CONSTANTS; + +/* + * Attribute: Security descriptor (0x50). A standard self-relative security + * descriptor. + * + * NOTE: Can be resident or non-resident. + * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally + * in FILE_Secure and the correct descriptor is found using the security_id + * from the standard information attribute. + */ +typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; + +/* + * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one + * referenced instance of each unique security descriptor is stored. + * + * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It + * does, however, contain two indexes ($SDH and $SII) as well as a named data + * stream ($SDS). 
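(Illustrative aside, not part of the original header: because a self-relative security descriptor stores byte offsets rather than pointers, the owner, group, sacl and dacl fields are resolved by adding them to the start of the descriptor; a zero offset means the field is absent. The helper name below is hypothetical.)

	static const SID *sd_rel_owner(const SECURITY_DESCRIPTOR_RELATIVE *sd)
	{
		u32 ofs = le32_to_cpu(sd->owner);

		if (!ofs)		// Zero offset: no owner SID present.
			return NULL;
		return (const SID *)((const u8 *)sd + ofs);
	}

	// The group, sacl and dacl offsets are resolved the same way, but the
	// sacl/dacl results are only meaningful when SE_SACL_PRESENT /
	// SE_DACL_PRESENT are set in sd->control.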
+ * + * Every unique security descriptor is assigned a unique security identifier + * (security_id, not to be confused with a SID). The security_id is unique for + * the NTFS volume and is used as an index into the $SII index, which maps + * security_ids to the security descriptor's storage location within the $SDS + * data attribute. The $SII index is sorted by ascending security_id. + * + * A simple hash is computed from each security descriptor. This hash is used + * as an index into the $SDH index, which maps security descriptor hashes to + * the security descriptor's storage location within the $SDS data attribute. + * The $SDH index is sorted by security descriptor hash and is stored in a B+ + * tree. When searching $SDH (with the intent of determining whether or not a + * new security descriptor is already present in the $SDS data stream), if a + * matching hash is found, but the security descriptors do not match, the + * search in the $SDH index is continued, searching for a next matching hash. + * + * When a precise match is found, the security_id coresponding to the security + * descriptor in the $SDS attribute is read from the found $SDH index entry and + * is stored in the $STANDARD_INFORMATION attribute of the file/directory to + * which the security descriptor is being applied. The $STANDARD_INFORMATION + * attribute is present in all base mft records (i.e. in all files and + * directories). + * + * If a match is not found, the security descriptor is assigned a new unique + * security_id and is added to the $SDS data attribute. Then, entries + * referencing the this security descriptor in the $SDS data attribute are + * added to the $SDH and $SII indexes. + * + * Note: Entries are never deleted from FILE_Secure, even if nothing + * references an entry any more. + */ + +/* + * This header precedes each security descriptor in the $SDS data stream. + * This is also the index entry data part of both the $SII and $SDH indexes. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; + +/* + * The $SDS data stream contains the security descriptors, aligned on 16-byte + * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot + * cross 256kib boundaries (this restriction is imposed by the Windows cache + * manager). Each security descriptor is contained in a SDS_ENTRY structure. + * Also, each security descriptor is stored twice in the $SDS stream with a + * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) + * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the + * the first copy of the security descriptor will be at offset 0x51d0 in the + * $SDS data stream and the second copy will be at offset 0x451d0. + */ +typedef struct { +/*Ofs*/ +/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like + unnamed structs. */ + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security + descriptor. 
*/ +} __attribute__ ((__packed__)) SDS_ENTRY; + +/* + * The index entry key used in the $SII index. The collation type is + * COLLATION_NTOFS_ULONG. + */ +typedef struct { + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SII_INDEX_KEY; + +/* + * The index entry key used in the $SDH index. The keys are sorted first by + * hash and then by security_id. The collation rule is + * COLLATION_NTOFS_SECURITY_HASH. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SDH_INDEX_KEY; + +/* + * Attribute: Volume name (0x60). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + */ +typedef struct { + ntfschar name[0]; /* The name of the volume in Unicode. */ +} __attribute__ ((__packed__)) VOLUME_NAME; + +/* + * Possible flags for the volume (16-bit). + */ +enum { + VOLUME_IS_DIRTY = cpu_to_le16(0x0001), + VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), + VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), + VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), + + VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), + VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), + + VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), + VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), + + VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), + + /* To make our life easier when checking if we must mount read-only. */ + VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), +} __attribute__ ((__packed__)); + +typedef le16 VOLUME_FLAGS; + +/* + * Attribute: Volume information (0x70). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses + * NTFS 1.2. I haven't personally seen other values yet. + */ +typedef struct { + le64 reserved; /* Not used (yet?). */ + u8 major_ver; /* Major version of the ntfs format. */ + u8 minor_ver; /* Minor version of the ntfs format. */ + VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ +} __attribute__ ((__packed__)) VOLUME_INFORMATION; + +/* + * Attribute: Data attribute (0x80). + * + * NOTE: Can be resident or non-resident. + * + * Data contents of a file (i.e. the unnamed stream) or of a named stream. + */ +typedef struct { + u8 data[0]; /* The file's data contents. */ +} __attribute__ ((__packed__)) DATA_ATTR; + +/* + * Index header flags (8-bit). + */ +enum { + /* + * When index header is in an index root attribute: + */ + SMALL_INDEX = 0, /* The index is small enough to fit inside the index + root attribute and there is no index allocation + attribute present. */ + LARGE_INDEX = 1, /* The index is too large to fit in the index root + attribute and/or an index allocation attribute is + present. */ + /* + * When index header is in an index block, i.e. is part of index + * allocation attribute: + */ + LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes + branching off it. */ + INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf + node. */ + NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ +} __attribute__ ((__packed__)); + +typedef u8 INDEX_HEADER_FLAGS; + +/* + * This is the header for indexes, describing the INDEX_ENTRY records, which + * follow the INDEX_HEADER. Together the index header and the index entries + * make up a complete index. 
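(Illustrative aside, not from the original source: the VOLUME_MUST_MOUNT_RO_MASK convenience mask defined above is intended for exactly this kind of check against the flags field of the VOLUME_INFORMATION attribute; the helper name is hypothetical.)

	static inline bool volume_needs_ro(const VOLUME_INFORMATION *vi)
	{
		// Any of the "must mount read-only" flag bits forces a
		// read-only mount.
		return (vi->flags & VOLUME_MUST_MOUNT_RO_MASK) != 0;
	}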
+ * + * IMPORTANT NOTE: The offset, length and size structure members are counted + * relative to the start of the index header structure and not relative to the + * start of the index root or index allocation structures themselves. + */ +typedef struct { + le32 entries_offset; /* Byte offset to first INDEX_ENTRY + aligned to 8-byte boundary. */ + le32 index_length; /* Data size of the index in bytes, + i.e. bytes used from allocated + size, aligned to 8-byte boundary. */ + le32 allocated_size; /* Byte size of this index (block), + multiple of 8 bytes. */ + /* NOTE: For the index root attribute, the above two numbers are always + equal, as the attribute is resident and it is resized as needed. In + the case of the index allocation attribute the attribute is not + resident and hence the allocated_size is a fixed value and must + equal the index_block_size specified by the INDEX_ROOT attribute + corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK + belongs to. */ + INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ +} __attribute__ ((__packed__)) INDEX_HEADER; + +/* + * Attribute: Index root (0x90). + * + * NOTE: Always resident. + * + * This is followed by a sequence of index entries (INDEX_ENTRY structures) + * as described by the index header. + * + * When a directory is small enough to fit inside the index root then this + * is the only attribute describing the directory. When the directory is too + * large to fit in the index root, on the other hand, two aditional attributes + * are present: an index allocation attribute, containing sub-nodes of the B+ + * directory tree (see below), and a bitmap attribute, describing which virtual + * cluster numbers (vcns) in the index allocation attribute are in use by an + * index block. + * + * NOTE: The root directory (FILE_root) contains an entry for itself. Other + * dircetories do not contain entries for themselves, though. + */ +typedef struct { + ATTR_TYPE type; /* Type of the indexed attribute. Is + $FILE_NAME for directories, zero + for view indexes. No other values + allowed. */ + COLLATION_RULE collation_rule; /* Collation rule used to sort the + index entries. If type is $FILE_NAME, + this must be COLLATION_FILE_NAME. */ + le32 index_block_size; /* Size of each index block in bytes (in + the index allocation attribute). */ + u8 clusters_per_index_block; /* Cluster size of each index block (in + the index allocation attribute), when + an index block is >= than a cluster, + otherwise this will be the log of + the size (like how the encoding of + the mft record size and the index + record size found in the boot sector + work). Has to be a power of 2. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ + INDEX_HEADER index; /* Index header describing the + following index entries. */ +} __attribute__ ((__packed__)) INDEX_ROOT; + +/* + * Attribute: Index allocation (0xa0). + * + * NOTE: Always non-resident (doesn't make sense to be resident anyway!). + * + * This is an array of index blocks. Each index block starts with an + * INDEX_BLOCK structure containing an index header, followed by a sequence of + * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. + */ +typedef struct { +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ + le16 usa_ofs; /* See NTFS_RECORD definition. */ + le16 usa_count; /* See NTFS_RECORD definition. 
*/ + +/* 8*/ sle64 lsn; /* $LogFile sequence number of the last + modification of this index block. */ +/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. + If the cluster_size on the volume is <= the + index_block_size of the directory, + index_block_vcn counts in units of clusters, + and in units of sectors otherwise. */ +/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ +/* sizeof()= 40 (0x28) bytes */ +/* + * When creating the index block, we place the update sequence array at this + * offset, i.e. before we start with the index entries. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) INDEX_BLOCK; + +typedef INDEX_BLOCK INDEX_ALLOCATION; + +/* + * The system file FILE_Extend/$Reparse contains an index named $R listing + * all reparse points on the volume. The index entry keys are as defined + * below. Note, that there is no index data associated with the index entries. + * + * The index entries are sorted by the index key file_id. The collation rule is + * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the + * primary key / is not a key at all. (AIA) + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + leMFT_REF file_id; /* Mft record of the file containing the + reparse point attribute. */ +} __attribute__ ((__packed__)) REPARSE_INDEX_KEY; + +/* + * Quota flags (32-bit). + * + * The user quota flags. Names explain meaning. + */ +enum { + QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), + QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), + QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), + + QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), + /* This is a bit mask for the user quota flags. */ + + /* + * These flags are only present in the quota defaults index entry, i.e. + * in the entry where owner_id = QUOTA_DEFAULTS_ID. + */ + QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), + QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), + QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), + QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), + + QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), + QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), + QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), + QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), +}; + +typedef le32 QUOTA_FLAGS; + +/* + * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas + * are on a per volume and per user basis. + * + * The $Q index contains one entry for each existing user_id on the volume. The + * index key is the user_id of the user/group owning this quota control entry, + * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the + * owner_id, is found in the standard information attribute. The collation rule + * for $Q is COLLATION_NTOFS_ULONG. + * + * The $O index contains one entry for each user/group who has been assigned + * a quota on that volume. The index key holds the SID of the user_id the + * entry belongs to, i.e. the owner_id. The collation rule for $O is + * COLLATION_NTOFS_SID. + * + * The $O index entry data is the user_id of the user corresponding to the SID. 
+ * This user_id is used as an index into $Q to find the quota control entry + * associated with the SID. + * + * The $Q index entry data is the quota control entry and is defined below. + */ +typedef struct { + le32 version; /* Currently equals 2. */ + QUOTA_FLAGS flags; /* Flags describing this quota entry. */ + le64 bytes_used; /* How many bytes of the quota are in use. */ + sle64 change_time; /* Last time this quota entry was changed. */ + sle64 threshold; /* Soft quota (-1 if not limited). */ + sle64 limit; /* Hard quota (-1 if not limited). */ + sle64 exceeded_time; /* How long the soft quota has been exceeded. */ + SID sid; /* The SID of the user/object associated with + this quota entry. Equals zero for the quota + defaults entry (and in fact on a WinXP + volume, it is not present at all). */ +} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY; + +/* + * Predefined owner_id values (32-bit). + */ +enum { + QUOTA_INVALID_ID = cpu_to_le32(0x00000000), + QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), + QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), +}; + +/* + * Current constants for quota control entries. + */ +typedef enum { + /* Current version. */ + QUOTA_VERSION = 2, +} QUOTA_CONTROL_ENTRY_CONSTANTS; + +/* + * Index entry flags (16-bit). + */ +enum { + INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a + sub-node, i.e. a reference to an index block in form of + a virtual cluster number (see below). */ + INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last + entry in an index block. The index entry does not + represent a file but it can point to a sub-node. */ + + INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force + enum bit width to 16-bit. */ +} __attribute__ ((__packed__)); + +typedef le16 INDEX_ENTRY_FLAGS; + +/* + * This the index entry header (see below). + */ +typedef struct { +/* 0*/ union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. */ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; +/* 8*/ le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ +/* 10*/ le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ +/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ +/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */ +/* sizeof() = 16 bytes */ +} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER; + +/* + * This is an index entry. A sequence of such entries follows each INDEX_HEADER + * structure. Together they make up a complete index. The index follows either + * an index root attribute or an index allocation attribute. + * + * NOTE: Before NTFS 3.0 only filename attributes were indexed. + */ +typedef struct { +/*Ofs*/ +/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */ + union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. 
*/ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; + le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ + le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ + INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ + le16 reserved; /* Reserved/align to 8-byte boundary. */ + +/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present + if INDEX_ENTRY_END bit in flags is not set. NOTE: On + NTFS versions before 3.0 the only valid key is the + FILE_NAME_ATTR. On NTFS 3.0+ the following + additional index keys are defined: */ + FILE_NAME_ATTR file_name;/* $I30 index in directories. */ + SII_INDEX_KEY sii; /* $SII index in $Secure. */ + SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */ + GUID object_id; /* $O index in FILE_Extend/$ObjId: The + object_id of the mft record found in + the data part of the index. */ + REPARSE_INDEX_KEY reparse; /* $R index in + FILE_Extend/$Reparse. */ + SID sid; /* $O index in FILE_Extend/$Quota: + SID of the owner of the user_id. */ + le32 owner_id; /* $Q index in FILE_Extend/$Quota: + user_id of the owner of the quota + control entry in the data part of + the index. */ + } __attribute__ ((__packed__)) key; + /* The (optional) index data is inserted here when creating. */ + // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last + // eight bytes of this index entry contain the virtual + // cluster number of the index block that holds the + // entries immediately preceding the current entry (the + // vcn references the corresponding cluster in the data + // of the non-resident index allocation attribute). If + // the key_length is zero, then the vcn immediately + // follows the INDEX_ENTRY_HEADER. Regardless of + // key_length, the address of the 8-byte boundary + // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by + // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), + // where sizeof(VCN) can be hardcoded as 8 if wanted. */ +} __attribute__ ((__packed__)) INDEX_ENTRY; + +/* + * Attribute: Bitmap (0xb0). + * + * Contains an array of bits (aka a bitfield). + * + * When used in conjunction with the index allocation attribute, each bit + * corresponds to one index block within the index allocation attribute. Thus + * the number of bits in the bitmap * index block size / cluster size is the + * number of clusters in the index allocation attribute. + */ +typedef struct { + u8 bitmap[0]; /* Array of bits. */ +} __attribute__ ((__packed__)) BITMAP_ATTR; + +/* + * The reparse point tag defines the type of the reparse point. It also + * includes several flags, which further describe the reparse point. + * + * The reparse point tag is an unsigned 32-bit value divided in three parts: + * + * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of + * the reparse point. + * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use. + * 3. The most significant three bits are flags describing the reparse point. + * They are defined as follows: + * bit 29: Name surrogate bit. If set, the filename is an alias for + * another object in the system. + * bit 30: High-latency bit. 
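(Illustrative aside, not part of the original header: walking the INDEX_ENTRY records that follow an INDEX_HEADER, using the entry length field, the INDEX_ENTRY_END terminator and the trailing sub-node VCN described in the commented-out note above. The function name is hypothetical and the body assumes the driver's usual endian helpers and an already de-protected index buffer.)

	static void walk_index_entries(const INDEX_HEADER *ih)
	{
		const u8 *start = (const u8 *)ih;
		const INDEX_ENTRY *ie;

		// entries_offset is relative to the start of the INDEX_HEADER.
		ie = (const INDEX_ENTRY *)(start +
				le32_to_cpu(ih->entries_offset));
		for (;;) {
			if (ie->flags & INDEX_ENTRY_NODE) {
				// The sub-node VCN occupies the last eight
				// bytes of the entry.
				VCN vcn = sle64_to_cpu(*(const leVCN *)
						((const u8 *)ie +
						le16_to_cpu(ie->length) -
						sizeof(VCN)));
				// ... descend into the index block at @vcn ...
			}
			if (ie->flags & INDEX_ENTRY_END)
				break;	// Last entry; it carries no key.
			// ... examine ie->key here, e.g. ie->key.file_name ...
			ie = (const INDEX_ENTRY *)((const u8 *)ie +
					le16_to_cpu(ie->length));
		}
	}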
If set, accessing the first byte of data will + * be slow. (E.g. the data is stored on a tape drive.) + * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User + * defined tags have to use zero here. + * + * These are the predefined reparse point tags: + */ +enum { + IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), + IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), + IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), + + IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), + IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), + IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), + + IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), + IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), + IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), + + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), + + IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), + + IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), + + IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), +}; + +/* + * Attribute: Reparse point (0xc0). + * + * NOTE: Can be resident or non-resident. + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + le16 reparse_data_length; /* Byte size of reparse data. */ + le16 reserved; /* Align to 8-byte boundary. */ + u8 reparse_data[0]; /* Meaning depends on reparse_tag. */ +} __attribute__ ((__packed__)) REPARSE_POINT; + +/* + * Attribute: Extended attribute (EA) information (0xd0). + * + * NOTE: Always resident. (Is this true???) + */ +typedef struct { + le16 ea_length; /* Byte size of the packed extended + attributes. */ + le16 need_ea_count; /* The number of extended attributes which have + the NEED_EA bit set. */ + le32 ea_query_length; /* Byte size of the buffer required to query + the extended attributes when calling + ZwQueryEaFile() in Windows NT/2k. I.e. the + byte size of the unpacked extended + attributes. */ +} __attribute__ ((__packed__)) EA_INFORMATION; + +/* + * Extended attribute flags (8-bit). + */ +enum { + NEED_EA = 0x80 /* If set the file to which the EA belongs + cannot be interpreted without understanding + the associates extended attributes. */ +} __attribute__ ((__packed__)); + +typedef u8 EA_FLAGS; + +/* + * Attribute: Extended attribute (EA) (0xe0). + * + * NOTE: Can be resident or non-resident. + * + * Like the attribute list and the index buffer list, the EA attribute value is + * a sequence of EA_ATTR variable length records. + */ +typedef struct { + le32 next_entry_offset; /* Offset to the next EA_ATTR. */ + EA_FLAGS flags; /* Flags describing the EA. */ + u8 ea_name_length; /* Length of the name of the EA in bytes + excluding the '\0' byte terminator. */ + le16 ea_value_length; /* Byte size of the EA's value. */ + u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not + Unicode and it is zero terminated. */ + u8 ea_value[0]; /* The value of the EA. Immediately follows + the name. */ +} __attribute__ ((__packed__)) EA_ATTR; + +/* + * Attribute: Property set (0xf0). + * + * Intended to support Native Structure Storage (NSS) - a feature removed from + * NTFS 3.0 during beta testing. + */ +typedef struct { + /* Irrelevant as feature unused. */ +} __attribute__ ((__packed__)) PROPERTY_SET; + +/* + * Attribute: Logged utility stream (0x100). + * + * NOTE: Can be resident or non-resident. + * + * Operations on this attribute are logged to the journal ($LogFile) like + * normal metadata changes. + * + * Used by the Encrypting File System (EFS). 
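(Illustrative aside, not from the original source: pulling a reparse point tag apart along the bit layout described above; the helper names are hypothetical.)

	static inline bool reparse_tag_is_microsoft(le32 tag)
	{
		return (tag & IO_REPARSE_TAG_IS_MICROSOFT) != 0;	// Bit 31.
	}

	static inline u16 reparse_tag_type(le32 tag)
	{
		// Bits 0 to 15 hold the reparse point type; the reserved and
		// flag bits are masked off.
		return (u16)(le32_to_cpu(tag) & 0xffff);
	}

	// Example: IO_REPARSE_TAG_MOUNT_POINT above has the Microsoft bit set
	// and a type value of 0x0003.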
All encrypted files have this + * attribute with the name $EFS. + */ +typedef struct { + /* Can be anything the creator chooses. */ + /* EFS uses it as follows: */ + // FIXME: Type this info, verifying it along the way. (AIA) +} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; + +#endif /* _LINUX_NTFS_LAYOUT_H */ diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c new file mode 100644 index 00000000..1711b710 --- /dev/null +++ b/fs/ntfs/lcnalloc.c @@ -0,0 +1,1014 @@ +/* + * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include + +#include "lcnalloc.h" +#include "debug.h" +#include "bitmap.h" +#include "inode.h" +#include "volume.h" +#include "attrib.h" +#include "malloc.h" +#include "aops.h" +#include "ntfs.h" + +/** + * ntfs_cluster_free_from_rl_nolock - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - The volume lcn bitmap must be locked for writing on entry and is + * left locked on return. + */ +int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl) +{ + struct inode *lcnbmp_vi = vol->lcnbmp_ino; + int ret = 0; + + ntfs_debug("Entering."); + if (!rl) + return 0; + for (; rl->length; rl++) { + int err; + + if (rl->lcn < 0) + continue; + err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); + if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) + ret = err; + } + ntfs_debug("Done."); + return ret; +} + +/** + * ntfs_cluster_alloc - allocate clusters on an ntfs volume + * @vol: mounted ntfs volume on which to allocate the clusters + * @start_vcn: vcn to use for the first allocated cluster + * @count: number of clusters to allocate + * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) + * @zone: zone from which to allocate the clusters + * @is_extension: if 'true', this is an attribute extension + * + * Allocate @count clusters preferably starting at cluster @start_lcn or at the + * current allocator position if @start_lcn is -1, on the mounted ntfs volume + * @vol. @zone is either DATA_ZONE for allocation of normal clusters or + * MFT_ZONE for allocation of clusters for the master file table, i.e. the + * $MFT/$DATA attribute. + * + * @start_vcn specifies the vcn of the first allocated cluster. This makes + * merging the resulting runlist with the old runlist easier. 
+ * + * If @is_extension is 'true', the caller is allocating clusters to extend an + * attribute and if it is 'false', the caller is allocating clusters to fill a + * hole in an attribute. Practically the difference is that if @is_extension + * is 'true' the returned runlist will be terminated with LCN_ENOENT and if + * @is_extension is 'false' the runlist will be terminated with + * LCN_RL_NOT_MAPPED. + * + * You need to check the return value with IS_ERR(). If this is false, the + * function was successful and the return value is a runlist describing the + * allocated cluster(s). If IS_ERR() is true, the function failed and + * PTR_ERR() gives you the error code. + * + * Notes on the allocation algorithm + * ================================= + * + * There are two data zones. First is the area between the end of the mft zone + * and the end of the volume, and second is the area between the start of the + * volume and the start of the mft zone. On unmodified/standard NTFS 1.x + * volumes, the second data zone does not exist due to the mft zone being + * expanded to cover the start of the volume in order to reserve space for the + * mft bitmap attribute. + * + * This is not the prettiest function but the complexity stems from the need of + * implementing the mft vs data zoned approach and from the fact that we have + * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we + * need to cope with crossing over boundaries of two buffers. Further, the + * fact that the allocator allows for caller supplied hints as to the location + * of where allocation should begin and the fact that the allocator keeps track + * of where in the data zones the next natural allocation should occur, + * contribute to the complexity of the function. But it should all be + * worthwhile, because this allocator should: 1) be a full implementation of + * the MFT zone approach used by Windows NT, 2) cause reduction in + * fragmentation, and 3) be speedy in allocations (the code is not optimized + * for speed, but the algorithm is, so further speed improvements are probably + * possible). + * + * FIXME: We should be monitoring cluster allocation and increment the MFT zone + * size dynamically but this is something for the future. We will just cause + * heavier fragmentation by not doing it and I am not even sure Windows would + * grow the MFT zone dynamically, so it might even be correct not to do this. + * The overhead in doing dynamic MFT zone expansion would be very large and + * unlikely worth the effort. (AIA) + * + * TODO: I have added in double the required zone position pointer wrap around + * logic which can be optimized to having only one of the two logic sets. + * However, having the double logic will work fine, but if we have only one of + * the sets and we get it wrong somewhere, then we get into trouble, so + * removing the duplicate logic requires _very_ careful consideration of _all_ + * possible code paths. So at least for now, I am leaving the double logic - + * better safe than sorry... (AIA) + * + * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. 
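(Illustrative usage sketch, not part of the original kernel-doc: a typical call allocating @nr clusters from the data zone to extend an attribute, following the IS_ERR()/PTR_ERR() contract described above.)

	runlist_element *rl;

	rl = ntfs_cluster_alloc(vol, 0, nr, -1, DATA_ZONE, true);
	if (IS_ERR(rl))
		return PTR_ERR(rl);
	// On success @rl describes the allocated clusters and, because
	// @is_extension was true, is terminated with LCN_ENOENT.  Merging it
	// into the attribute's runlist and eventually freeing it (e.g. with
	// ntfs_free()) is up to the caller and is not shown here.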
+ */ +runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, + const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension) +{ + LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; + LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; + s64 clusters; + loff_t i_size; + struct inode *lcnbmp_vi; + runlist_element *rl = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *buf, *byte; + int err = 0, rlpos, rlsize, buf_size; + u8 pass, done_zones, search_zone, need_writeback = 0, bit; + + ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn " + "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn, + (unsigned long long)count, + (unsigned long long)start_lcn, + zone == MFT_ZONE ? "MFT" : "DATA"); + BUG_ON(!vol); + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < 0); + BUG_ON(start_lcn < -1); + BUG_ON(zone < FIRST_ZONE); + BUG_ON(zone > LAST_ZONE); + + /* Return NULL if @count is zero. */ + if (!count) + return NULL; + /* Take the lcnbmp lock for writing. */ + down_write(&vol->lcnbmp_lock); + /* + * If no specific @start_lcn was requested, use the current data zone + * position, otherwise use the requested @start_lcn but make sure it + * lies outside the mft zone. Also set done_zones to 0 (no zones done) + * and pass depending on whether we are starting inside a zone (1) or + * at the beginning of a zone (2). If requesting from the MFT_ZONE, + * we either start at the current position within the mft zone or at + * the specified position. If the latter is out of bounds then we start + * at the beginning of the MFT_ZONE. + */ + done_zones = 0; + pass = 1; + /* + * zone_start and zone_end are the current search range. search_zone + * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of + * volume) and 4 for data zone 2 (start of volume till start of mft + * zone). + */ + zone_start = start_lcn; + if (zone_start < 0) { + if (zone == DATA_ZONE) + zone_start = vol->data1_zone_pos; + else + zone_start = vol->mft_zone_pos; + if (!zone_start) { + /* + * Zone starts at beginning of volume which means a + * single pass is sufficient. + */ + pass = 2; + } + } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start && + zone_start < vol->mft_zone_end) { + zone_start = vol->mft_zone_end; + /* + * Starting at beginning of data1_zone which means a single + * pass in this zone is sufficient. + */ + pass = 2; + } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start || + zone_start >= vol->mft_zone_end)) { + zone_start = vol->mft_lcn; + if (!vol->mft_zone_end) + zone_start = 0; + /* + * Starting at beginning of volume which means a single pass + * is sufficient. + */ + pass = 2; + } + if (zone == MFT_ZONE) { + zone_end = vol->mft_zone_end; + search_zone = 1; + } else /* if (zone == DATA_ZONE) */ { + /* Skip searching the mft zone. */ + done_zones |= 1; + if (zone_start >= vol->mft_zone_end) { + zone_end = vol->nr_clusters; + search_zone = 2; + } else { + zone_end = vol->mft_zone_start; + search_zone = 4; + } + } + /* + * bmp_pos is the current bit position inside the bitmap. We use + * bmp_initial_pos to determine whether or not to do a zone switch. + */ + bmp_pos = bmp_initial_pos = zone_start; + + /* Loop until all clusters are allocated, i.e. clusters == 0. 
*/ + clusters = count; + rlpos = rlsize = 0; + mapping = lcnbmp_vi->i_mapping; + i_size = i_size_read(lcnbmp_vi); + while (1) { + ntfs_debug("Start of outer while loop: done_zones 0x%x, " + "search_zone %i, pass %i, zone_start 0x%llx, " + "zone_end 0x%llx, bmp_initial_pos 0x%llx, " + "bmp_pos 0x%llx, rlpos %i, rlsize %i.", + done_zones, search_zone, pass, + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_initial_pos, + (unsigned long long)bmp_pos, rlpos, rlsize); + /* Loop until we run out of free clusters. */ + last_read_pos = bmp_pos >> 3; + ntfs_debug("last_read_pos 0x%llx.", + (unsigned long long)last_read_pos); + if (last_read_pos > i_size) { + ntfs_debug("End of attribute reached. " + "Skipping to zone_pass_done."); + goto zone_pass_done; + } + if (likely(page)) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + page = ntfs_map_page(mapping, last_read_pos >> + PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + err = PTR_ERR(page); + ntfs_error(vol->sb, "Failed to map page."); + goto out; + } + buf_size = last_read_pos & ~PAGE_CACHE_MASK; + buf = page_address(page) + buf_size; + buf_size = PAGE_CACHE_SIZE - buf_size; + if (unlikely(last_read_pos + buf_size > i_size)) + buf_size = i_size - last_read_pos; + buf_size <<= 3; + lcn = bmp_pos & 7; + bmp_pos &= ~(LCN)7; + ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " + "bmp_pos 0x%llx, need_writeback %i.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + while (lcn < buf_size && lcn + bmp_pos < zone_end) { + byte = buf + (lcn >> 3); + ntfs_debug("In inner while loop: buf_size %i, " + "lcn 0x%llx, bmp_pos 0x%llx, " + "need_writeback %i, byte ofs 0x%x, " + "*byte 0x%x.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + need_writeback, + (unsigned int)(lcn >> 3), + (unsigned int)*byte); + /* Skip full bytes. */ + if (*byte == 0xff) { + lcn = (lcn + 8) & ~(LCN)7; + ntfs_debug("Continuing while loop 1."); + continue; + } + bit = 1 << (lcn & 7); + ntfs_debug("bit 0x%x.", bit); + /* If the bit is already set, go onto the next one. */ + if (*byte & bit) { + lcn++; + ntfs_debug("Continuing while loop 2."); + continue; + } + /* + * Allocate more memory if needed, including space for + * the terminator element. + * ntfs_malloc_nofs() operates on whole pages only. + */ + if ((rlpos + 2) * sizeof(*rl) > rlsize) { + runlist_element *rl2; + + ntfs_debug("Reallocating memory."); + if (!rl) + ntfs_debug("First free bit is at LCN " + "0x%llx.", + (unsigned long long) + (lcn + bmp_pos)); + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + err = -ENOMEM; + ntfs_error(vol->sb, "Failed to " + "allocate memory."); + goto out; + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + ntfs_debug("Reallocated memory, rlsize 0x%x.", + rlsize); + } + /* Allocate the bitmap bit. */ + *byte |= bit; + /* We need to write this bitmap page to disk. */ + need_writeback = 1; + ntfs_debug("*byte 0x%x, need_writeback is set.", + (unsigned int)*byte); + /* + * Coalesce with previous run if adjacent LCNs. + * Otherwise, append a new run. 
+ */ + ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), " + "prev_lcn 0x%llx, lcn 0x%llx, " + "bmp_pos 0x%llx, prev_run_len 0x%llx, " + "rlpos %i.", + (unsigned long long)(lcn + bmp_pos), + 1ULL, (unsigned long long)prev_lcn, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + (unsigned long long)prev_run_len, + rlpos); + if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) { + ntfs_debug("Coalescing to run (lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos - 1].length = ++prev_run_len; + ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), " + "prev_run_len 0x%llx.", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length, + (unsigned long long) + prev_run_len); + } else { + if (likely(rlpos)) { + ntfs_debug("Adding new run, (previous " + "run lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos].vcn = rl[rlpos - 1].vcn + + prev_run_len; + } else { + ntfs_debug("Adding new run, is first " + "run."); + rl[rlpos].vcn = start_vcn; + } + rl[rlpos].lcn = prev_lcn = lcn + bmp_pos; + rl[rlpos].length = prev_run_len = 1; + rlpos++; + } + /* Done? */ + if (!--clusters) { + LCN tc; + /* + * Update the current zone position. Positions + * of already scanned zones have been updated + * during the respective zone switches. + */ + tc = lcn + bmp_pos + 1; + ntfs_debug("Done. Updating current zone " + "position, tc 0x%llx, " + "search_zone %i.", + (unsigned long long)tc, + search_zone); + switch (search_zone) { + case 1: + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + break; + case 2: + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + break; + case 4: + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + break; + default: + BUG(); + } + ntfs_debug("Finished. 
Going to out."); + goto out; + } + lcn++; + } + bmp_pos += buf_size; + ntfs_debug("After inner while loop: buf_size 0x%x, lcn " + "0x%llx, bmp_pos 0x%llx, need_writeback %i.", + buf_size, (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + if (bmp_pos < zone_end) { + ntfs_debug("Continuing outer while loop, " + "bmp_pos 0x%llx, zone_end 0x%llx.", + (unsigned long long)bmp_pos, + (unsigned long long)zone_end); + continue; + } +zone_pass_done: /* Finished with the current zone pass. */ + ntfs_debug("At zone_pass_done, pass %i.", pass); + if (pass == 1) { + /* + * Now do pass 2, scanning the first part of the zone + * we omitted in pass 1. + */ + pass = 2; + zone_end = zone_start; + switch (search_zone) { + case 1: /* mft_zone */ + zone_start = vol->mft_zone_start; + break; + case 2: /* data1_zone */ + zone_start = vol->mft_zone_end; + break; + case 4: /* data2_zone */ + zone_start = 0; + break; + default: + BUG(); + } + /* Sanity check. */ + if (zone_end < zone_start) + zone_end = zone_start; + bmp_pos = zone_start; + ntfs_debug("Continuing outer while loop, pass 2, " + "zone_start 0x%llx, zone_end 0x%llx, " + "bmp_pos 0x%llx.", + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_pos); + continue; + } /* pass == 2 */ +done_zones_check: + ntfs_debug("At done_zones_check, search_zone %i, done_zones " + "before 0x%x, done_zones after 0x%x.", + search_zone, done_zones, + done_zones | search_zone); + done_zones |= search_zone; + if (done_zones < 7) { + ntfs_debug("Switching zone."); + /* Now switch to the next zone we haven't done yet. */ + pass = 1; + switch (search_zone) { + case 1: + ntfs_debug("Switching from mft zone to data1 " + "zone."); + /* Update mft zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + } + /* Switch from mft zone to data1 zone. */ +switch_to_data1_zone: search_zone = 2; + zone_start = bmp_initial_pos = + vol->data1_zone_pos; + zone_end = vol->nr_clusters; + if (zone_start == vol->mft_zone_end) + pass = 2; + if (zone_start >= zone_end) { + vol->data1_zone_pos = zone_start = + vol->mft_zone_end; + pass = 2; + } + break; + case 2: + ntfs_debug("Switching from data1 zone to " + "data2 zone."); + /* Update data1 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + } + /* Switch from data1 zone to data2 zone. 
*/ + search_zone = 4; + zone_start = bmp_initial_pos = + vol->data2_zone_pos; + zone_end = vol->mft_zone_start; + if (!zone_start) + pass = 2; + if (zone_start >= zone_end) { + vol->data2_zone_pos = zone_start = + bmp_initial_pos = 0; + pass = 2; + } + break; + case 4: + ntfs_debug("Switching from data2 zone to " + "data1 zone."); + /* Update data2 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + } + /* Switch from data2 zone to data1 zone. */ + goto switch_to_data1_zone; + default: + BUG(); + } + ntfs_debug("After zone switch, search_zone %i, " + "pass %i, bmp_initial_pos 0x%llx, " + "zone_start 0x%llx, zone_end 0x%llx.", + search_zone, pass, + (unsigned long long)bmp_initial_pos, + (unsigned long long)zone_start, + (unsigned long long)zone_end); + bmp_pos = zone_start; + if (zone_start == zone_end) { + ntfs_debug("Empty zone, going to " + "done_zones_check."); + /* Empty zone. Don't bother searching it. */ + goto done_zones_check; + } + ntfs_debug("Continuing outer while loop."); + continue; + } /* done_zones == 7 */ + ntfs_debug("All zones are finished."); + /* + * All zones are finished! If DATA_ZONE, shrink mft zone. If + * MFT_ZONE, we have really run out of space. + */ + mft_zone_size = vol->mft_zone_end - vol->mft_zone_start; + ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end " + "0x%llx, mft_zone_size 0x%llx.", + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)mft_zone_size); + if (zone == MFT_ZONE || mft_zone_size <= 0) { + ntfs_debug("No free clusters left, going to out."); + /* Really no more space left on device. */ + err = -ENOSPC; + goto out; + } /* zone == DATA_ZONE && mft_zone_size > 0 */ + ntfs_debug("Shrinking mft zone."); + zone_end = vol->mft_zone_end; + mft_zone_size >>= 1; + if (mft_zone_size > 0) + vol->mft_zone_end = vol->mft_zone_start + mft_zone_size; + else /* mft zone and data2 zone no longer exist. */ + vol->data2_zone_pos = vol->mft_zone_start = + vol->mft_zone_end = 0; + if (vol->mft_zone_pos >= vol->mft_zone_end) { + vol->mft_zone_pos = vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } + bmp_pos = zone_start = bmp_initial_pos = + vol->data1_zone_pos = vol->mft_zone_end; + search_zone = 2; + pass = 2; + done_zones &= ~2; + ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, " + "vol->mft_zone_start 0x%llx, " + "vol->mft_zone_end 0x%llx, " + "vol->mft_zone_pos 0x%llx, search_zone 2, " + "pass 2, dones_zones 0x%x, zone_start 0x%llx, " + "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, " + "continuing outer while loop.", + (unsigned long long)mft_zone_size, + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)vol->mft_zone_pos, + done_zones, (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)vol->data1_zone_pos); + } + ntfs_debug("After outer while loop."); +out: + ntfs_debug("At out."); + /* Add runlist terminator element. */ + if (likely(rl)) { + rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length; + rl[rlpos].lcn = is_extension ? 
LCN_ENOENT : LCN_RL_NOT_MAPPED; + rl[rlpos].length = 0; + } + if (likely(page && !IS_ERR(page))) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + if (likely(!err)) { + up_write(&vol->lcnbmp_lock); + ntfs_debug("Done."); + return rl; + } + ntfs_error(vol->sb, "Failed to allocate clusters, aborting " + "(error %i).", err); + if (rl) { + int err2; + + if (err == -ENOSPC) + ntfs_debug("Not enough space to complete allocation, " + "err -ENOSPC, first free lcn 0x%llx, " + "could allocate up to 0x%llx " + "clusters.", + (unsigned long long)rl[0].lcn, + (unsigned long long)(count - clusters)); + /* Deallocate all allocated clusters. */ + ntfs_debug("Attempting rollback..."); + err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); + if (err2) { + ntfs_error(vol->sb, "Failed to rollback (error %i). " + "Leaving inconsistent metadata! " + "Unmount and run chkdsk.", err2); + NVolSetErrors(vol); + } + /* Free the runlist. */ + ntfs_free(rl); + } else if (err == -ENOSPC) + ntfs_debug("No space left at all, err = -ENOSPC, first free " + "lcn = 0x%llx.", + (long long)vol->data1_zone_pos); + up_write(&vol->lcnbmp_lock); + return ERR_PTR(err); +} + +/** + * __ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * @is_rollback: true if this is a rollback operation + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the vfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when __ntfs_cluster_free() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will + * perform the necessary mapping and unmapping. + * + * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * @is_rollback should always be 'false', it is for internal use to rollback + * errors. You probably want to use ntfs_cluster_free() instead. + * + * Note, __ntfs_cluster_free() does not modify the runlist, so you have to + * remove from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. 
you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, + ntfs_attr_search_ctx *ctx, const bool is_rollback) +{ + s64 delta, to_free, total_freed, real_freed; + ntfs_volume *vol; + struct inode *lcnbmp_vi; + runlist_element *rl; + int err; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count " + "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn, + (unsigned long long)count, + is_rollback ? " (rollback)" : ""); + vol = ni->vol; + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < -1); + /* + * Lock the lcn bitmap for writing but only if not rolling back. We + * must hold the lock all the way including through rollback otherwise + * rollback is not possible because once we have cleared a bit and + * dropped the lock, anyone could have set the bit again, thus + * allocating the cluster for another use. + */ + if (likely(!is_rollback)) + down_write(&vol->lcnbmp_lock); + + total_freed = real_freed = 0; + + rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx); + if (IS_ERR(rl)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to find first runlist " + "element (error %li), aborting.", + PTR_ERR(rl)); + err = PTR_ERR(rl); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "First runlist element has " + "invalid lcn, aborting."); + err = -EIO; + goto err_out; + } + /* Find the starting cluster inside the run that needs freeing. */ + delta = start_vcn - rl->vcn; + + /* The number of clusters in this run that need freeing. */ + to_free = rl->length - delta; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in this run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear first run " + "(error %i), aborting.", err); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed = to_free; + }; + /* Go to the next run and adjust the number of clusters left to free. */ + ++rl; + if (count >= 0) + count -= to_free; + + /* Keep track of the total "freed" clusters, including sparse ones. */ + total_freed = to_free; + /* + * Loop over the remaining runs, using @count as a capping value, and + * free them. + */ + for (; rl->length && count != 0; ++rl) { + if (unlikely(rl->lcn < LCN_HOLE)) { + VCN vcn; + + /* Attempt to map runlist. 
*/ + vcn = rl->vcn; + rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (!is_rollback) + ntfs_error(vol->sb, "Failed to map " + "runlist fragment or " + "failed to find " + "subsequent runlist " + "element."); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "Runlist element " + "has invalid lcn " + "(0x%llx).", + (unsigned long long) + rl->lcn); + err = -EIO; + goto err_out; + } + } + /* The number of clusters in this run that need freeing. */ + to_free = rl->length; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in the run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear " + "subsequent run."); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed += to_free; + } + /* Adjust the number of clusters left to free. */ + if (count >= 0) + count -= to_free; + + /* Update the total done clusters. */ + total_freed += to_free; + } + if (likely(!is_rollback)) + up_write(&vol->lcnbmp_lock); + + BUG_ON(count > 0); + + /* We are done. Return the number of actually freed clusters. */ + ntfs_debug("Done."); + return real_freed; +err_out: + if (is_rollback) + return err; + /* If no real clusters were freed, no need to rollback. */ + if (!real_freed) { + up_write(&vol->lcnbmp_lock); + return err; + } + /* + * Attempt to rollback and if that succeeds just return the error code. + * If rollback fails, set the volume errors flag, emit an error + * message, and return the error code. + */ + delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true); + if (delta < 0) { + ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " + "inconsistent metadata! Unmount and run " + "chkdsk.", (int)delta); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + ntfs_error(vol->sb, "Aborting (error %i).", err); + return err; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h new file mode 100644 index 00000000..2adb0431 --- /dev/null +++ b/fs/ntfs/lcnalloc.h @@ -0,0 +1,145 @@ +/* + * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_LCNALLOC_H +#define _LINUX_NTFS_LCNALLOC_H + +#ifdef NTFS_RW + +#include + +#include "attrib.h" +#include "types.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +typedef enum { + FIRST_ZONE = 0, /* For sanity checking. 
*/ + MFT_ZONE = 0, /* Allocate from $MFT zone. */ + DATA_ZONE = 1, /* Allocate from $DATA zone. */ + LAST_ZONE = 1, /* For sanity checking. */ +} NTFS_CLUSTER_ALLOCATION_ZONES; + +extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, + const VCN start_vcn, const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension); + +extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback); + +/** + * ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the ntfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_cluster_free() encounters unmapped runlist + * fragments and allows their mapping. If you do not have the mft record + * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform + * the necessary mapping and unmapping. + * + * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove + * from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. 
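The calling convention documented above can be condensed into a minimal caller sketch; this is an illustrative fragment that assumes @ni, @ctx, @m and @a have been set up by the caller as described, not code from the patch:

        s64 nr;

        nr = ntfs_cluster_free(ni, 0, -1, ctx);  /* free all clusters */
        if (IS_ERR(ctx->mrec))                   /* @ctx is no longer valid */
                return PTR_ERR(ctx->mrec);
        if (nr < 0)                              /* -errno from the free */
                return nr;
        m = ctx->mrec;                           /* re-cache moved pointers */
        a = ctx->attr;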
+ */ +static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx) +{ + return __ntfs_cluster_free(ni, start_vcn, count, ctx, false); +} + +extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl); + +/** + * ntfs_cluster_free_from_rl - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - The caller must have locked the runlist @rl for reading or + * writing. + */ +static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol, + const runlist_element *rl) +{ + int ret; + + down_write(&vol->lcnbmp_lock); + ret = ntfs_cluster_free_from_rl_nolock(vol, rl); + up_write(&vol->lcnbmp_lock); + return ret; +} + +#endif /* NTFS_RW */ + +#endif /* defined _LINUX_NTFS_LCNALLOC_H */ diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c new file mode 100644 index 00000000..c71de292 --- /dev/null +++ b/fs/ntfs/logfile.c @@ -0,0 +1,863 @@ +/* + * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2002-2007 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "aops.h" +#include "debug.h" +#include "logfile.h" +#include "malloc.h" +#include "volume.h" +#include "ntfs.h" + +/** + * ntfs_check_restart_page_header - check the page header for consistency + * @vi: $LogFile inode to which the restart page header belongs + * @rp: restart page header to check + * @pos: position in @vi at which the restart page header resides + * + * Check the restart page header @rp for consistency and return 'true' if it is + * consistent and 'false' otherwise. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + */ +static bool ntfs_check_restart_page_header(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos) +{ + u32 logfile_system_page_size, logfile_log_page_size; + u16 ra_ofs, usa_count, usa_ofs, usa_end = 0; + bool have_usa = true; + + ntfs_debug("Entering."); + /* + * If the system or log page sizes are smaller than the ntfs block size + * or either is not a power of 2 we cannot handle this log file. 
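The power-of-two requirement stated above is enforced just below in two equivalent spellings; as a quick illustration (not part of the patch), both forms reject 6144 and accept 4096:

        /* Non-zero result means "not a power of two", e.g. 6144 & 6143 != 0. */
        not_pow2 = logfile_system_page_size & (logfile_system_page_size - 1);
        /* Equivalent helper form, e.g. is_power_of_2(4096) is true. */
        pow2 = is_power_of_2(logfile_log_page_size);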
+ */ + logfile_system_page_size = le32_to_cpu(rp->system_page_size); + logfile_log_page_size = le32_to_cpu(rp->log_page_size); + if (logfile_system_page_size < NTFS_BLOCK_SIZE || + logfile_log_page_size < NTFS_BLOCK_SIZE || + logfile_system_page_size & + (logfile_system_page_size - 1) || + !is_power_of_2(logfile_log_page_size)) { + ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); + return false; + } + /* + * We must be either at !pos (1st restart page) or at pos = system page + * size (2nd restart page). + */ + if (pos && pos != logfile_system_page_size) { + ntfs_error(vi->i_sb, "Found restart area in incorrect " + "position in $LogFile."); + return false; + } + /* We only know how to handle version 1.1. */ + if (sle16_to_cpu(rp->major_ver) != 1 || + sle16_to_cpu(rp->minor_ver) != 1) { + ntfs_error(vi->i_sb, "$LogFile version %i.%i is not " + "supported. (This driver supports version " + "1.1 only.)", (int)sle16_to_cpu(rp->major_ver), + (int)sle16_to_cpu(rp->minor_ver)); + return false; + } + /* + * If chkdsk has been run the restart page may not be protected by an + * update sequence array. + */ + if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) { + have_usa = false; + goto skip_usa_checks; + } + /* Verify the size of the update sequence array. */ + usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS); + if (usa_count != le16_to_cpu(rp->usa_count)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array count."); + return false; + } + /* Verify the position of the update sequence array. */ + usa_ofs = le16_to_cpu(rp->usa_ofs); + usa_end = usa_ofs + usa_count * sizeof(u16); + if (usa_ofs < sizeof(RESTART_PAGE_HEADER) || + usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array offset."); + return false; + } +skip_usa_checks: + /* + * Verify the position of the restart area. It must be: + * - aligned to 8-byte boundary, + * - after the update sequence array, and + * - within the system page size. + */ + ra_ofs = le16_to_cpu(rp->restart_area_offset); + if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end : + ra_ofs < sizeof(RESTART_PAGE_HEADER)) || + ra_ofs > logfile_system_page_size) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent restart area offset."); + return false; + } + /* + * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn + * set. + */ + if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) { + ntfs_error(vi->i_sb, "$LogFile restart page is not modified " + "by chkdsk but a chkdsk LSN is specified."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_restart_area - check the restart area for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose restart area to check + * + * Check the restart area of the restart page @rp for consistency and return + * 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header has already been + * consistency checked. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. 
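As a worked example of the update sequence array sizing rules above, assuming the usual 512-byte NTFS block (NTFS_BLOCK_SIZE_BITS == 9) and a 4096-byte system page (illustrative numbers only):

        usa_count = 1 + (4096 >> 9);            /* = 1 + 8 = 9 entries */
        usa_end   = usa_ofs + 9 * sizeof(u16);  /* = usa_ofs + 18 bytes */
        /* ...which must not reach past NTFS_BLOCK_SIZE - sizeof(u16) = 510. */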
+ */ +static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) +{ + u64 file_size; + RESTART_AREA *ra; + u16 ra_ofs, ra_len, ca_ofs; + u8 fs_bits; + + ntfs_debug("Entering."); + ra_ofs = le16_to_cpu(rp->restart_area_offset); + ra = (RESTART_AREA*)((u8*)rp + ra_ofs); + /* + * Everything before ra->file_size must be before the first word + * protected by an update sequence number. This ensures that it is + * safe to access ra->client_array_offset. + */ + if (ra_ofs + offsetof(RESTART_AREA, file_size) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent file offset."); + return false; + } + /* + * Now that we can access ra->client_array_offset, make sure everything + * up to the log client array is before the first word protected by an + * update sequence number. This ensures we can access all of the + * restart area elements safely. Also, the client array offset must be + * aligned to an 8-byte boundary. + */ + ca_ofs = le16_to_cpu(ra->client_array_offset); + if (((ca_ofs + 7) & ~7) != ca_ofs || + ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent client array offset."); + return false; + } + /* + * The restart area must end within the system page size both when + * calculated manually and as specified by ra->restart_area_length. + * Also, the calculated length must not exceed the specified length. + */ + ra_len = ca_ofs + le16_to_cpu(ra->log_clients) * + sizeof(LOG_CLIENT_RECORD); + if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) || + ra_ofs + le16_to_cpu(ra->restart_area_length) > + le32_to_cpu(rp->system_page_size) || + ra_len > le16_to_cpu(ra->restart_area_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds " + "of the system page size specified by the " + "restart page header and/or the specified " + "restart area length is inconsistent."); + return false; + } + /* + * The ra->client_free_list and ra->client_in_use_list must be either + * LOGFILE_NO_CLIENT or less than ra->log_clients or they are + * overflowing the client array. + */ + if ((ra->client_free_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_free_list) >= + le16_to_cpu(ra->log_clients)) || + (ra->client_in_use_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_in_use_list) >= + le16_to_cpu(ra->log_clients))) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "overflowing client free and/or in use lists."); + return false; + } + /* + * Check ra->seq_number_bits against ra->file_size for consistency. + * We cannot just use ffs() because the file size is not a power of 2. + */ + file_size = (u64)sle64_to_cpu(ra->file_size); + fs_bits = 0; + while (file_size) { + file_size >>= 1; + fs_bits++; + } + if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent sequence number bits."); + return false; + } + /* The log record header length must be a multiple of 8. */ + if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) != + le16_to_cpu(ra->log_record_header_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log record header length."); + return false; + } + /* Dito for the log page data offset. 
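The sequence-number-bits check above becomes clearer with concrete numbers; an illustrative calculation for a hypothetical 4 MiB $LogFile:

        u64 file_size = 0x400000;       /* 4 MiB, i.e. 1 << 22 */
        u8 fs_bits = 0;

        while (file_size) {             /* same loop as above */
                file_size >>= 1;
                fs_bits++;
        }
        /* fs_bits is now 23, so the restart area must specify
           seq_number_bits == 67 - 23 == 44 to be accepted. */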
*/ + if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) != + le16_to_cpu(ra->log_page_data_offset)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log page data offset."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_log_client_array - check the log client array for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose log client array to check + * + * Check the log client array of the restart page @rp for consistency and + * return 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header and the restart area have + * already been consistency checked. + * + * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this + * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full + * restart page and the page must be multi sector transfer deprotected. + */ +static bool ntfs_check_log_client_array(struct inode *vi, + RESTART_PAGE_HEADER *rp) +{ + RESTART_AREA *ra; + LOG_CLIENT_RECORD *ca, *cr; + u16 nr_clients, idx; + bool in_free_list, idx_is_first; + + ntfs_debug("Entering."); + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + ca = (LOG_CLIENT_RECORD*)((u8*)ra + + le16_to_cpu(ra->client_array_offset)); + /* + * Check the ra->client_free_list first and then check the + * ra->client_in_use_list. Check each of the log client records in + * each of the lists and check that the array does not overflow the + * ra->log_clients value. Also keep track of the number of records + * visited as there cannot be more than ra->log_clients records and + * that way we detect eventual loops in within a list. + */ + nr_clients = le16_to_cpu(ra->log_clients); + idx = le16_to_cpu(ra->client_free_list); + in_free_list = true; +check_list: + for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--, + idx = le16_to_cpu(cr->next_client)) { + if (!nr_clients || idx >= le16_to_cpu(ra->log_clients)) + goto err_out; + /* Set @cr to the current log client record. */ + cr = ca + idx; + /* The first log client record must not have a prev_client. */ + if (idx_is_first) { + if (cr->prev_client != LOGFILE_NO_CLIENT) + goto err_out; + idx_is_first = false; + } + } + /* Switch to and check the in use list if we just did the free list. */ + if (in_free_list) { + in_free_list = false; + idx = le16_to_cpu(ra->client_in_use_list); + goto check_list; + } + ntfs_debug("Done."); + return true; +err_out: + ntfs_error(vi->i_sb, "$LogFile log client array is corrupt."); + return false; +} + +/** + * ntfs_check_and_load_restart_page - check the restart page for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page to check + * @pos: position in @vi at which the restart page resides + * @wrp: [OUT] copy of the multi sector transfer deprotected restart page + * @lsn: [OUT] set to the current logfile lsn on success + * + * Check the restart page @rp for consistency and return 0 if it is consistent + * and -errno otherwise. The restart page may have been modified by chkdsk in + * which case its magic is CHKD instead of RSTR. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + * + * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a + * copy of the complete multi sector transfer deprotected page. On failure, + * *@wrp is undefined. 
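The log client array walk above avoids infinite loops by bounding the traversal with the declared number of clients rather than by marking visited records; in isolation the pattern looks like this (illustrative sketch, the names head, next[], nr_entries and END_MARKER are hypothetical):

        u16 idx = head;
        u16 budget = nr_entries;

        while (idx != END_MARKER) {
                if (!budget-- || idx >= nr_entries)
                        return false;   /* cycle or out-of-range index */
                idx = next[idx];
        }
        return true;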
+ * + * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current + * logfile lsn according to this restart page. On failure, *@lsn is undefined. + * + * The following error codes are defined: + * -EINVAL - The restart page is inconsistent. + * -ENOMEM - Not enough memory to load the restart page. + * -EIO - Failed to reading from $LogFile. + */ +static int ntfs_check_and_load_restart_page(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp, + LSN *lsn) +{ + RESTART_AREA *ra; + RESTART_PAGE_HEADER *trp; + int size, err; + + ntfs_debug("Entering."); + /* Check the restart page header for consistency. */ + if (!ntfs_check_restart_page_header(vi, rp, pos)) { + /* Error output already done inside the function. */ + return -EINVAL; + } + /* Check the restart area for consistency. */ + if (!ntfs_check_restart_area(vi, rp)) { + /* Error output already done inside the function. */ + return -EINVAL; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * Allocate a buffer to store the whole restart page so we can multi + * sector transfer deprotect it. + */ + trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size)); + if (!trp) { + ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile " + "restart page buffer."); + return -ENOMEM; + } + /* + * Read the whole of the restart page into the buffer. If it fits + * completely inside @rp, just copy it from there. Otherwise map all + * the required pages and copy the data from them. + */ + size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK); + if (size >= le32_to_cpu(rp->system_page_size)) { + memcpy(trp, rp, le32_to_cpu(rp->system_page_size)); + } else { + pgoff_t idx; + struct page *page; + int have_read, to_read; + + /* First copy what we already have in @rp. */ + memcpy(trp, rp, size); + /* Copy the remaining data one page at a time. */ + have_read = size; + to_read = le32_to_cpu(rp->system_page_size) - size; + idx = (pos + size) >> PAGE_CACHE_SHIFT; + BUG_ON((pos + size) & ~PAGE_CACHE_MASK); + do { + page = ntfs_map_page(vi->i_mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vi->i_sb, "Error mapping $LogFile " + "page (index %lu).", idx); + err = PTR_ERR(page); + if (err != -EIO && err != -ENOMEM) + err = -EIO; + goto err_out; + } + size = min_t(int, to_read, PAGE_CACHE_SIZE); + memcpy((u8*)trp + have_read, page_address(page), size); + ntfs_unmap_page(page); + have_read += size; + to_read -= size; + idx++; + } while (to_read > 0); + } + /* + * Perform the multi sector transfer deprotection on the buffer if the + * restart page is protected. + */ + if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count)) + && post_read_mst_fixup((NTFS_RECORD*)trp, + le32_to_cpu(rp->system_page_size))) { + /* + * A multi sector tranfer error was detected. We only need to + * abort if the restart page contents exceed the multi sector + * transfer fixup of the first sector. + */ + if (le16_to_cpu(rp->restart_area_offset) + + le16_to_cpu(ra->restart_area_length) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "Multi sector transfer error " + "detected in $LogFile restart page."); + err = -EINVAL; + goto err_out; + } + } + /* + * If the restart page is modified by chkdsk or there are no active + * logfile clients, the logfile is consistent. Otherwise, need to + * check the log client records for consistency, too. 
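A concrete trace of the copy-across-pages path above may help (illustrative numbers: 4096-byte page cache pages, an 8192-byte system page, and the second restart page at pos == 8192):

        size = 4096 - (8192 & 4095);    /* = 4096 bytes available in @rp */
        /* 4096 < 8192, so the first 4096 bytes are memcpy()d from @rp,
           then page index (8192 + 4096) >> 12 == 3 is mapped and the
           remaining 4096 bytes are copied from it. */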
+ */ + err = 0; + if (ntfs_is_rstr_record(rp->magic) && + ra->client_in_use_list != LOGFILE_NO_CLIENT) { + if (!ntfs_check_log_client_array(vi, trp)) { + err = -EINVAL; + goto err_out; + } + } + if (lsn) { + if (ntfs_is_rstr_record(rp->magic)) + *lsn = sle64_to_cpu(ra->current_lsn); + else /* if (ntfs_is_chkd_record(rp->magic)) */ + *lsn = sle64_to_cpu(rp->chkdsk_lsn); + } + ntfs_debug("Done."); + if (wrp) + *wrp = trp; + else { +err_out: + ntfs_free(trp); + } + return err; +} + +/** + * ntfs_check_logfile - check the journal for consistency + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: [OUT] on success this is a copy of the current restart page + * + * Check the $LogFile journal for consistency and return 'true' if it is + * consistent and 'false' if not. On success, the current restart page is + * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it. + * + * At present we only check the two restart pages and ignore the log record + * pages. + * + * Note that the MstProtected flag is not set on the $LogFile inode and hence + * when reading pages they are not deprotected. This is because we do not know + * if the $LogFile was created on a system with a different page size to ours + * yet and mst deprotection would fail if our page size is smaller. + */ +bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) +{ + s64 size, pos; + LSN rstr1_lsn, rstr2_lsn; + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + struct address_space *mapping = log_vi->i_mapping; + struct page *page = NULL; + u8 *kaddr = NULL; + RESTART_PAGE_HEADER *rstr1_ph = NULL; + RESTART_PAGE_HEADER *rstr2_ph = NULL; + int log_page_size, log_page_mask, err; + bool logfile_is_empty = true; + u8 log_page_bits; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) + goto is_empty; + size = i_size_read(log_vi); + /* Make sure the file doesn't exceed the maximum allowed size. */ + if (size > MaxLogFileSize) + size = MaxLogFileSize; + /* + * Truncate size to a multiple of the page cache size or the default + * log page size if the page cache size is between the default log page + * log page size if the page cache size is between the default log page + * size and twice that. + */ + if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <= + DefaultLogPageSize * 2) + log_page_size = DefaultLogPageSize; + else + log_page_size = PAGE_CACHE_SIZE; + log_page_mask = log_page_size - 1; + /* + * Use ntfs_ffs() instead of ffs() to enable the compiler to + * optimize log_page_size and log_page_bits into constants. + */ + log_page_bits = ntfs_ffs(log_page_size) - 1; + size &= ~(s64)(log_page_size - 1); + /* + * Ensure the log file is big enough to store at least the two restart + * pages and the minimum number of log record pages. + */ + if (size < log_page_size * 2 || (size - log_page_size * 2) >> + log_page_bits < MinLogRecordPages) { + ntfs_error(vol->sb, "$LogFile is too small."); + return false; + } + /* + * Read through the file looking for a restart page. Since the restart + * page header is at the beginning of a page we only need to search at + * what could be the beginning of a page (for each page size) rather + * than scanning the whole file byte by byte. If all potential places + * contain empty and uninitialzed records, the log file can be assumed + * to be empty. 
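Plugging the defaults into the minimum-size check above makes it concrete (DefaultLogPageSize and MinLogRecordPages are defined in logfile.h later in this patch; illustrative arithmetic only):

        /* log_page_size == DefaultLogPageSize == 4096 and
           MinLogRecordPages == 48, so the smallest acceptable $LogFile is
           (2 restart pages + 48 log record pages) * 4096 == 204800 bytes,
           i.e. 200 KiB. */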
+ */ + for (pos = 0; pos < size; pos <<= 1) { + pgoff_t idx = pos >> PAGE_CACHE_SHIFT; + if (!page || page->index != idx) { + if (page) + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Error mapping $LogFile " + "page (index %lu).", idx); + goto err_out; + } + } + kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK); + /* + * A non-empty block means the logfile is not empty while an + * empty block after a non-empty block has been encountered + * means we are done. + */ + if (!ntfs_is_empty_recordp((le32*)kaddr)) + logfile_is_empty = false; + else if (!logfile_is_empty) + break; + /* + * A log record page means there cannot be a restart page after + * this so no need to continue searching. + */ + if (ntfs_is_rcrd_recordp((le32*)kaddr)) + break; + /* If not a (modified by chkdsk) restart page, continue. */ + if (!ntfs_is_rstr_recordp((le32*)kaddr) && + !ntfs_is_chkd_recordp((le32*)kaddr)) { + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * Check the (modified by chkdsk) restart page for consistency + * and get a copy of the complete multi sector transfer + * deprotected restart page. + */ + err = ntfs_check_and_load_restart_page(log_vi, + (RESTART_PAGE_HEADER*)kaddr, pos, + !rstr1_ph ? &rstr1_ph : &rstr2_ph, + !rstr1_ph ? &rstr1_lsn : &rstr2_lsn); + if (!err) { + /* + * If we have now found the first (modified by chkdsk) + * restart page, continue looking for the second one. + */ + if (!pos) { + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * We have now found the second (modified by chkdsk) + * restart page, so we can stop looking. + */ + break; + } + /* + * Error output already done inside the function. Note, we do + * not abort if the restart page was invalid as we might still + * find a valid one further in the file. + */ + if (err != -EINVAL) { + ntfs_unmap_page(page); + goto err_out; + } + /* Continue looking. */ + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + } + if (page) + ntfs_unmap_page(page); + if (logfile_is_empty) { + NVolSetLogFileEmpty(vol); +is_empty: + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + if (!rstr1_ph) { + BUG_ON(rstr2_ph); + ntfs_error(vol->sb, "Did not find any restart pages in " + "$LogFile and it was not empty."); + return false; + } + /* If both restart pages were found, use the more recent one. */ + if (rstr2_ph) { + /* + * If the second restart area is more recent, switch to it. + * Otherwise just throw it away. + */ + if (rstr2_lsn > rstr1_lsn) { + ntfs_debug("Using second restart page as it is more " + "recent."); + ntfs_free(rstr1_ph); + rstr1_ph = rstr2_ph; + /* rstr1_lsn = rstr2_lsn; */ + } else { + ntfs_debug("Using first restart page as it is more " + "recent."); + ntfs_free(rstr2_ph); + } + rstr2_ph = NULL; + } + /* All consistency checks passed. */ + if (rp) + *rp = rstr1_ph; + else + ntfs_free(rstr1_ph); + ntfs_debug("Done."); + return true; +err_out: + if (rstr1_ph) + ntfs_free(rstr1_ph); + return false; +} + +/** + * ntfs_is_logfile_clean - check in the journal if the volume is clean + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: copy of the current restart page + * + * Analyze the $LogFile journal and return 'true' if it indicates the volume was + * shutdown cleanly and 'false' if not. + * + * At present we only look at the two restart pages and ignore the log record + * pages. This is a little bit crude in that there will be a very small number + * of cases where we think that a volume is dirty when in fact it is clean. 
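Written out, the probe positions visited by the scan loop above form a short, sparse sequence (illustrative note):

        /* pos starts at 0; it is then seeded with NTFS_BLOCK_SIZE >> 1 (256)
           and doubled by the loop update, so the offsets examined are
           0, 512, 1024, 2048, 4096, 8192, ... (while < size), which covers
           both legal restart page positions, 0 and rp->system_page_size,
           for any supported system page size. */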
+ * This should only affect volumes that have not been shutdown cleanly but did + * not have any pending, non-check-pointed i/o, i.e. they were completely idle + * at least for the five seconds preceding the unclean shutdown. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and in particular if the $LogFile + * is empty this function requires that NVolLogFileEmpty() is true otherwise an + * empty volume will be reported as dirty. + */ +bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp) +{ + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + RESTART_AREA *ra; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + BUG_ON(!rp); + if (!ntfs_is_rstr_record(rp->magic) && + !ntfs_is_chkd_record(rp->magic)) { + ntfs_error(vol->sb, "Restart page buffer is invalid. This is " + "probably a bug in that the $LogFile should " + "have been consistency checked before calling " + "this function."); + return false; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * If the $LogFile has active clients, i.e. it is open, and we do not + * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags, + * we assume there was an unclean shutdown. + */ + if (ra->client_in_use_list != LOGFILE_NO_CLIENT && + !(ra->flags & RESTART_VOLUME_IS_CLEAN)) { + ntfs_debug("Done. $LogFile indicates a dirty shutdown."); + return false; + } + /* $LogFile indicates a clean shutdown. */ + ntfs_debug("Done. $LogFile indicates a clean shutdown."); + return true; +} + +/** + * ntfs_empty_logfile - empty the contents of the $LogFile journal + * @log_vi: struct inode of loaded journal $LogFile to empty + * + * Empty the contents of the $LogFile journal @log_vi and return 'true' on + * success and 'false' on error. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean() + * has been used to ensure that the $LogFile is clean. + */ +bool ntfs_empty_logfile(struct inode *log_vi) +{ + VCN vcn, end_vcn; + ntfs_inode *log_ni = NTFS_I(log_vi); + ntfs_volume *vol = log_ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags; + unsigned block_size, block_size_bits; + int err; + bool should_wait = true; + + ntfs_debug("Entering."); + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done."); + return true; + } + /* + * We cannot use ntfs_attr_set() because we may be still in the middle + * of a mount operation. Thus we do the emptying by hand by first + * zapping the page cache pages for the $LogFile/$DATA attribute and + * then emptying each of the buffers in each of the clusters specified + * by the runlist by hand. 
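The run-to-block conversion used in the emptying loop that follows is easier to see with concrete numbers (illustrative: 4096-byte clusters, 512-byte device blocks):

        /* cluster_size_bits == 12, block_size_bits == 9:
           a run at lcn 100 of length 3 clusters becomes
             block     = 100 << 12 >> 9       = 800
             end_block = (100 + 3) << 12 >> 9 = 824
           i.e. 24 device blocks, each of which is filled with 0xff bytes. */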
+ */ + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + vcn = 0; + read_lock_irqsave(&log_ni->size_lock, flags); + end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >> + vol->cluster_size_bits; + read_unlock_irqrestore(&log_ni->size_lock, flags); + truncate_inode_pages(log_vi->i_mapping, 0); + down_write(&log_ni->runlist.lock); + rl = log_ni->runlist.rl; + if (unlikely(!rl || vcn < rl->vcn || !rl->length)) { +map_vcn: + err = ntfs_map_runlist_nolock(log_ni, vcn, NULL); + if (err) { + ntfs_error(sb, "Failed to map runlist fragment (error " + "%d).", -err); + goto err; + } + rl = log_ni->runlist.rl; + BUG_ON(!rl || vcn < rl->vcn || !rl->length); + } + /* Seek to the runlist element containing @vcn. */ + while (rl->length && vcn >= rl[1].vcn) + rl++; + do { + LCN lcn; + sector_t block, end_block; + s64 len; + + /* + * If this run is not mapped map it now and start again as the + * runlist will have been updated. + */ + lcn = rl->lcn; + if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { + vcn = rl->vcn; + goto map_vcn; + } + /* If this run is not valid abort with an error. */ + if (unlikely(!rl->length || lcn < LCN_HOLE)) + goto rl_err; + /* Skip holes. */ + if (lcn == LCN_HOLE) + continue; + block = lcn << vol->cluster_size_bits >> block_size_bits; + len = rl->length; + if (rl[1].vcn > end_vcn) + len = end_vcn - rl->vcn; + end_block = (lcn + len) << vol->cluster_size_bits >> + block_size_bits; + /* Iterate over the blocks in the run and empty them. */ + do { + struct buffer_head *bh; + + /* Obtain the buffer, possibly not uptodate. */ + bh = sb_getblk(sb, block); + BUG_ON(!bh); + /* Setup buffer i/o submission. */ + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + /* Set the entire contents of the buffer to 0xff. */ + memset(bh->b_data, -1, block_size); + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_dirty(bh)) + clear_buffer_dirty(bh); + /* + * Submit the buffer and wait for i/o to complete but + * only for the first buffer so we do not miss really + * serious i/o errors. Once the first buffer has + * completed ignore errors afterwards as we can assume + * that if one buffer worked all of them will work. + */ + submit_bh(WRITE, bh); + if (should_wait) { + should_wait = false; + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + goto io_err; + } + brelse(bh); + } while (++block < end_block); + } while ((++rl)->vcn < end_vcn); + up_write(&log_ni->runlist.lock); + /* + * Zap the pages again just in case any got instantiated whilst we were + * emptying the blocks by hand. FIXME: We may not have completed + * writing to all the buffer heads yet so this may happen too early. + * We really should use a kernel thread to do the emptying + * asynchronously and then we can also set the volume dirty and output + * an error message if emptying should fail. + */ + truncate_inode_pages(log_vi->i_mapping, 0); + /* Set the flag so we do not have to do it again on remount. */ + NVolSetLogFileEmpty(vol); + ntfs_debug("Done."); + return true; +io_err: + ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk."); + goto dirty_err; +rl_err: + ntfs_error(sb, "Runlist is corrupt. 
Unmount and run chkdsk."); +dirty_err: + NVolSetErrors(vol); + err = -EIO; +err: + up_write(&log_ni->runlist.lock); + ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).", + -err); + return false; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h new file mode 100644 index 00000000..aa2b6ac3 --- /dev/null +++ b/fs/ntfs/logfile.h @@ -0,0 +1,309 @@ +/* + * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2000-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_LOGFILE_H +#define _LINUX_NTFS_LOGFILE_H + +#ifdef NTFS_RW + +#include + +#include "types.h" +#include "endian.h" +#include "layout.h" + +/* + * Journal ($LogFile) organization: + * + * Two restart areas present in the first two pages (restart pages, one restart + * area in each page). When the volume is dismounted they should be identical, + * except for the update sequence array which usually has a different update + * sequence number. + * + * These are followed by log records organized in pages headed by a log record + * header going up to log file size. Not all pages contain log records when a + * volume is first formatted, but as the volume ages, all records will be used. + * When the log file fills up, the records at the beginning are purged (by + * modifying the oldest_lsn to a higher value presumably) and writing begins + * at the beginning of the file. Effectively, the log file is viewed as a + * circular entity. + * + * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept + * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We + * probably only want to support 1.1 as this seems to be the current version + * and we don't know how that differs from the older versions. The only + * exception is if the journal is clean as marked by the two restart pages + * then it doesn't matter whether we are on an earlier version. We can just + * reinitialize the logfile and start again with version 1.1. + */ + +/* Some $LogFile related constants. */ +#define MaxLogFileSize 0x100000000ULL +#define DefaultLogPageSize 4096 +#define MinLogRecordPages 48 + +/* + * Log file restart page header (begins the restart area). + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ +/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */ +/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h. + When creating, set this to be immediately + after this header structure (without any + alignment). */ +/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. 
*/ + +/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by + chkdsk. Only used when the magic is changed + to "CHKD". Otherwise this is zero. */ +/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file + was created, has to be >= 512 and a power of + 2. Use this to calculate the required size + of the usa (usa_count) and add it to usa_ofs. + Then verify that the result is less than the + value of the restart_area_offset. */ +/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >= + 512 and a power of 2. The default is 4096 + and is used when the system page size is + between 4096 and 8192. Otherwise this is + set to the system page size instead. */ +/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to + the RESTART_AREA. Value has to be aligned + to 8-byte boundary. When creating, set this + to be after the usa. */ +/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major + version is 1. */ +/* 28*/ sle16 major_ver; /* Log file major version. We only support + version 1.1. */ +/* sizeof() = 30 (0x1e) bytes */ +} __attribute__ ((__packed__)) RESTART_PAGE_HEADER; + +/* + * Constant for the log client indices meaning that there are no client records + * in this particular client array. Also inside the client records themselves, + * this means that there are no client records preceding or following this one. + */ +#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff) +#define LOGFILE_NO_CLIENT_CPU 0xffff + +/* + * These are the so far known RESTART_AREA_* flags (16-bit) which contain + * information about the log file in which they are present. + */ +enum { + RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), + RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ +} __attribute__ ((__packed__)); + +typedef le16 RESTART_AREA_FLAGS; + +/* + * Log file restart area record. The offset of this record is found by adding + * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found + * in it. See notes at restart_area_offset above. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log + when the restart area was last written. + This happens often but what is the interval? + Is it just fixed time or is it every time a + check point is written or somethine else? + On create set to 0. */ +/* 8*/ le16 log_clients; /* Number of log client records in the array of + log client records which follows this + restart area. Must be 1. */ +/* 10*/ le16 client_free_list; /* The index of the first free log client record + in the array of log client records. + LOGFILE_NO_CLIENT means that there are no + free log client records in the array. + If != LOGFILE_NO_CLIENT, check that + log_clients > client_free_list. On Win2k + and presumably earlier, on a clean volume + this is != LOGFILE_NO_CLIENT, and it should + be 0, i.e. the first (and only) client + record is free and thus the logfile is + closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be LOGFILE_NO_CLIENT. On WinXP + and presumably later, the logfile is always + open, even on clean shutdown so this should + always be LOGFILE_NO_CLIENT. */ +/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client + record in the array of log client records. + LOGFILE_NO_CLIENT means that there are no + in-use log client records in the array. If + != LOGFILE_NO_CLIENT check that log_clients + > client_in_use_list. 
On Win2k and + presumably earlier, on a clean volume this + is LOGFILE_NO_CLIENT, i.e. there are no + client records in use and thus the logfile + is closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be != LOGFILE_NO_CLIENT, and it + should be 0, i.e. the first (and only) + client record is in use. On WinXP and + presumably later, the logfile is always + open, even on clean shutdown so this should + always be 0. */ +/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k + and presumably earlier this is always 0. On + WinXP and presumably later, if the logfile + was shutdown cleanly, the second bit, + RESTART_VOLUME_IS_CLEAN, is set. This bit + is cleared when the volume is mounted by + WinXP and set when the volume is dismounted, + thus if the logfile is dirty, this bit is + clear. Thus we don't need to check the + Windows version to determine if the logfile + is clean. Instead if the logfile is closed, + we know it must be clean. If it is open and + this bit is set, we also know it must be + clean. If on the other hand the logfile is + open and this bit is clear, we can be almost + certain that the logfile is dirty. */ +/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence + number. This is calculated as 67 - the + number of bits required to store the logfile + size in bytes and this can be used in with + the specified file_size as a consistency + check. */ +/* 20*/ le16 restart_area_length;/* Length of the restart area including the + client array. Following checks required if + version matches. Otherwise, skip them. + restart_area_offset + restart_area_length + has to be <= system_page_size. Also, + restart_area_length has to be >= + client_array_offset + (log_clients * + sizeof(log client record)). */ +/* 22*/ le16 client_array_offset;/* Offset from the start of this record to + the first log client record if versions are + matched. When creating, set this to be + after this restart area structure, aligned + to 8-bytes boundary. If the versions do not + match, this is ignored and the offset is + assumed to be (sizeof(RESTART_AREA) + 7) & + ~7, i.e. rounded up to first 8-byte + boundary. Either way, client_array_offset + has to be aligned to an 8-byte boundary. + Also, restart_area_offset + + client_array_offset has to be <= 510. + Finally, client_array_offset + (log_clients + * sizeof(log client record)) has to be <= + system_page_size. On Win2k and presumably + earlier, this is 0x30, i.e. immediately + following this record. On WinXP and + presumably later, this is 0x40, i.e. there + are 16 extra bytes between this record and + the client array. This probably means that + the RESTART_AREA record is actually bigger + in WinXP and later. */ +/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the + restart_area_offset + the offset of the + file_size are > 510 then corruption has + occurred. This is the very first check when + starting with the restart_area as if it + fails it means that some of the above values + will be corrupted by the multi sector + transfer protection. The file_size has to + be rounded down to be a multiple of the + log_page_size in the RESTART_PAGE_HEADER and + then it has to be at least big enough to + store the two restart pages and 48 (0x30) + log record pages. */ +/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including + the log record header. On create set to + 0. 
*/ +/* 36*/ le16 log_record_header_length;/* Byte size of the log record header. + If the version matches then check that the + value of log_record_header_length is a + multiple of 8, i.e. + (log_record_header_length + 7) & ~7 == + log_record_header_length. When creating set + it to sizeof(LOG_RECORD_HEADER), aligned to + 8 bytes. */ +/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record + page. Must be a multiple of 8. On create + set it to immediately after the update + sequence array of the log record page. */ +/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every + time the logfile is restarted which happens + at mount time when the logfile is opened. + When creating set to a random value. Win2k + sets it to the low 32 bits of the current + system time in NTFS format (see time.h). */ +/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */ +/* sizeof() = 48 (0x30) bytes */ +} __attribute__ ((__packed__)) RESTART_AREA; + +/* + * Log client record. The offset of this record is found by adding the offset + * of the RESTART_AREA to the client_array_offset value found in it. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create + set to 0. */ +/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart + the volume, i.e. the current position within + the log file. At present, if clean this + should = current_lsn in restart area but it + probably also = current_lsn when dirty most + of the time. At create set to 0. */ +/* 16*/ le16 prev_client; /* The offset to the previous log client record + in the array of log client records. + LOGFILE_NO_CLIENT means there is no previous + client record, i.e. this is the first one. + This is always LOGFILE_NO_CLIENT. */ +/* 18*/ le16 next_client; /* The offset to the next log client record in + the array of log client records. + LOGFILE_NO_CLIENT means there are no next + client records, i.e. this is the last one. + This is always LOGFILE_NO_CLIENT. */ +/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set + to zero every time the logfile is restarted + and it is incremented when the logfile is + closed at dismount time. Thus it is 0 when + dirty and 1 when clean. On WinXP and + presumably later, this is always 0. */ +/* 22*/ u8 reserved[6]; /* Reserved/alignment. */ +/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should + always be 8. */ +/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should + always be "NTFS" with the remaining bytes + set to 0. */ +/* sizeof() = 160 (0xa0) bytes */ +} __attribute__ ((__packed__)) LOG_CLIENT_RECORD; + +extern bool ntfs_check_logfile(struct inode *log_vi, + RESTART_PAGE_HEADER **rp); + +extern bool ntfs_is_logfile_clean(struct inode *log_vi, + const RESTART_PAGE_HEADER *rp); + +extern bool ntfs_empty_logfile(struct inode *log_vi); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_LOGFILE_H */ diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h new file mode 100644 index 00000000..a44b14cb --- /dev/null +++ b/fs/ntfs/malloc.h @@ -0,0 +1,96 @@ +/* + * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_MALLOC_H +#define _LINUX_NTFS_MALLOC_H + +#include +#include +#include + +/** + * __ntfs_malloc - allocate memory in multiples of pages + * @size: number of bytes to allocate + * @gfp_mask: extra flags for the allocator + * + * Internal function. You probably want ntfs_malloc_nofs()... + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + * Depending on @gfp_mask the allocation may be guaranteed to succeed. + */ +static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) +{ + if (likely(size <= PAGE_SIZE)) { + BUG_ON(!size); + /* kmalloc() has per-CPU caches so is faster for now. */ + return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); + /* return (void *)__get_free_page(gfp_mask); */ + } + if (likely((size >> PAGE_SHIFT) < totalram_pages)) + return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return NULL; +} + +/** + * ntfs_malloc_nofs - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); +} + +/** + * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs_nofail(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); +} + +static inline void ntfs_free(void *addr) +{ + if (!is_vmalloc_addr(addr)) { + kfree(addr); + /* free_page((unsigned long)addr); */ + return; + } + vfree(addr); +} + +#endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c new file mode 100644 index 00000000..382857f9 --- /dev/null +++ b/fs/ntfs/mft.c @@ -0,0 +1,2917 @@ +/** + * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +#include "attrib.h" +#include "aops.h" +#include "bitmap.h" +#include "debug.h" +#include "dir.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +/** + * map_mft_record_page - map the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to map + * + * This maps the page in which the mft record of the ntfs inode @ni is situated + * and returns a pointer to the mft record within the mapped page. + * + * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR() + * contains the negative error code returned. + */ +static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) +{ + loff_t i_size; + ntfs_volume *vol = ni->vol; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + unsigned long index, end_index; + unsigned ofs; + + BUG_ON(ni->page); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. FIXME: We need to check for + * overflowing the unsigned long, but I don't think we would ever get + * here if the volume was that big... + */ + index = (u64)ni->mft_no << vol->mft_record_size_bits >> + PAGE_CACHE_SHIFT; + ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + + i_size = i_size_read(mft_vi); + /* The maximum valid index into the page cache for $MFT's data. */ + end_index = i_size >> PAGE_CACHE_SHIFT; + + /* If the wanted index is out of bounds the mft record doesn't exist. */ + if (unlikely(index >= end_index)) { + if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs + + vol->mft_record_size) { + page = ERR_PTR(-ENOENT); + ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " + "which is beyond the end of the mft. " + "This is probably a bug in the ntfs " + "driver.", ni->mft_no); + goto err_out; + } + } + /* Read, map, and pin the page. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (likely(!IS_ERR(page))) { + /* Catch multi sector transfer fixup errors. */ + if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + + ofs)))) { + ni->page = page; + ni->page_ofs = ofs; + return page_address(page) + ofs; + } + ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. " + "Run chkdsk.", ni->mft_no); + ntfs_unmap_page(page); + page = ERR_PTR(-EIO); + NVolSetErrors(vol); + } +err_out: + ni->page = NULL; + ni->page_ofs = 0; + return (void*)page; +} + +/** + * map_mft_record - map, pin and lock an mft record + * @ni: ntfs inode whose MFT record to map + * + * First, take the mrec_lock mutex. We might now be sleeping, while waiting + * for the mutex if it was already locked by someone else. + * + * The page of the record is mapped using map_mft_record_page() before being + * returned to the caller. 
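+ *
+ * For illustration, a typical caller that modifies a record does:
+ *
+ *	m = map_mft_record(ni);
+ *	if (IS_ERR(m))
+ *		return PTR_ERR(m);
+ *	... modify the mft record ...
+ *	mark_mft_record_dirty(ni);
+ *	unmap_mft_record(ni);
+ *
+ * The page containing the record is obtained via map_mft_record_page().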
+ * + * This in turn uses ntfs_map_page() to get the page containing the wanted mft + * record (it in turn calls read_cache_page() which reads it in from disk if + * necessary, increments the use count on the page so that it cannot disappear + * under us and returns a reference to the page cache page). + * + * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it + * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed + * and the post-read mst fixups on each mft record in the page have been + * performed, the page gets PG_uptodate set and PG_locked cleared (this is done + * in our asynchronous I/O completion handler end_buffer_read_mft_async()). + * ntfs_map_page() waits for PG_locked to become clear and checks if + * PG_uptodate is set and returns an error code if not. This provides + * sufficient protection against races when reading/using the page. + * + * However there is the write mapping to think about. Doing the above described + * checking here will be fine, because when initiating the write we will set + * PG_locked and clear PG_uptodate making sure nobody is touching the page + * contents. Doing the locking this way means that the commit to disk code in + * the page cache code paths is automatically sufficiently locked with us as + * we will not touch a page that has been locked or is not uptodate. The only + * locking problem then is them locking the page while we are accessing it. + * + * So that code will end up having to own the mrec_lock of all mft + * records/inodes present in the page before I/O can proceed. In that case we + * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be + * accessing anything without owning the mrec_lock mutex. But we do need to + * use them because of the read_cache_page() invocation and the code becomes so + * much simpler this way that it is well worth it. + * + * The mft record is now ours and we return a pointer to it. You need to check + * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return + * the error code. + * + * NOTE: Caller is responsible for setting the mft record dirty before calling + * unmap_mft_record(). This is obviously only necessary if the caller really + * modified the mft record... + * Q: Do we want to recycle one of the VFS inode state bits instead? + * A: No, the inode ones mean we want to change the mft record, not we want to + * write it out. + */ +MFT_RECORD *map_mft_record(ntfs_inode *ni) +{ + MFT_RECORD *m; + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + + /* Serialize access to this mft record. */ + mutex_lock(&ni->mrec_lock); + + m = map_mft_record_page(ni); + if (likely(!IS_ERR(m))) + return m; + + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m)); + return m; +} + +/** + * unmap_mft_record_page - unmap the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to unmap + * + * This unmaps the page in which the mft record of the ntfs inode @ni is + * situated and returns. This is a NOOP if highmem is not configured. + * + * The unmap happens via ntfs_unmap_page() which in turn decrements the use + * count on the page thus releasing it from the pinned state. + * + * We do not actually unmap the page from memory of course, as that will be + * done by the page cache code itself when memory pressure increases or + * whatever. 
+ */ +static inline void unmap_mft_record_page(ntfs_inode *ni) +{ + BUG_ON(!ni->page); + + // TODO: If dirty, blah... + ntfs_unmap_page(ni->page); + ni->page = NULL; + ni->page_ofs = 0; + return; +} + +/** + * unmap_mft_record - release a mapped mft record + * @ni: ntfs inode whose MFT record to unmap + * + * We release the page mapping and the mrec_lock mutex which unmaps the mft + * record and releases it for others to get hold of. We also release the ntfs + * inode by decrementing the ntfs inode reference count. + * + * NOTE: If caller has modified the mft record, it is imperative to set the mft + * record dirty BEFORE calling unmap_mft_record(). + */ +void unmap_mft_record(ntfs_inode *ni) +{ + struct page *page = ni->page; + + BUG_ON(!page); + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + unmap_mft_record_page(ni); + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + /* + * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to + * ntfs_clear_extent_inode() in the extent inode case, and to the + * caller in the non-extent, yet pure ntfs inode case, to do the actual + * tear down of all structures and freeing of all allocated memory. + */ + return; +} + +/** + * map_extent_mft_record - load an extent inode and attach it to its base + * @base_ni: base ntfs inode + * @mref: mft reference of the extent inode to load + * @ntfs_ino: on successful return, pointer to the ntfs_inode structure + * + * Load the extent mft record @mref and attach it to its base inode @base_ni. + * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise + * PTR_ERR(result) gives the negative error code. + * + * On successful return, @ntfs_ino contains a pointer to the ntfs_inode + * structure of the mapped extent inode. + */ +MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino) +{ + MFT_RECORD *m; + ntfs_inode *ni = NULL; + ntfs_inode **extent_nis = NULL; + int i; + unsigned long mft_no = MREF(mref); + u16 seq_no = MSEQNO(mref); + bool destroy_ni = false; + + ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).", + mft_no, base_ni->mft_no); + /* Make sure the base ntfs inode doesn't go away. */ + atomic_inc(&base_ni->count); + /* + * Check if this extent inode has already been added to the base inode, + * in which case just return it. If not found, add it to the base + * inode before returning it. + */ + mutex_lock(&base_ni->extent_lock); + if (base_ni->nr_extents > 0) { + extent_nis = base_ni->ext.extent_ntfs_inos; + for (i = 0; i < base_ni->nr_extents; i++) { + if (mft_no != extent_nis[i]->mft_no) + continue; + ni = extent_nis[i]; + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + break; + } + } + if (likely(ni != NULL)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* We found the record; just have to map and return it. */ + m = map_mft_record(ni); + /* map_mft_record() has incremented this on success. */ + atomic_dec(&ni->count); + if (likely(!IS_ERR(m))) { + /* Verify the sequence number. */ + if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { + ntfs_debug("Done 1."); + *ntfs_ino = ni; + return m; + } + unmap_mft_record(ni); + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. " + "Run chkdsk."); + return ERR_PTR(-EIO); + } +map_err_out: + ntfs_error(base_ni->vol->sb, "Failed to map extent " + "mft record, error code %ld.", -PTR_ERR(m)); + return m; + } + /* Record wasn't there. 
Get a new ntfs inode and initialize it. */ + ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no); + if (unlikely(!ni)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + return ERR_PTR(-ENOMEM); + } + ni->vol = base_ni->vol; + ni->seq_no = seq_no; + ni->nr_extents = -1; + ni->ext.base_ntfs_ino = base_ni; + /* Now map the record. */ + m = map_mft_record(ni); + if (IS_ERR(m)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_clear_extent_inode(ni); + goto map_err_out; + } + /* Verify the sequence number if it is present. */ + if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. Run chkdsk."); + destroy_ni = true; + m = ERR_PTR(-EIO); + goto unm_err_out; + } + /* Attach extent inode to base inode, reallocating memory if needed. */ + if (!(base_ni->nr_extents & 3)) { + ntfs_inode **tmp; + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); + + tmp = kmalloc(new_size, GFP_NOFS); + if (unlikely(!tmp)) { + ntfs_error(base_ni->vol->sb, "Failed to allocate " + "internal buffer."); + destroy_ni = true; + m = ERR_PTR(-ENOMEM); + goto unm_err_out; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size - + 4 * sizeof(ntfs_inode *)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = tmp; + } + base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_debug("Done 2."); + *ntfs_ino = ni; + return m; +unm_err_out: + unmap_mft_record(ni); + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* + * If the extent inode was not attached to the base inode we need to + * release it or we will leak memory. + */ + if (destroy_ni) + ntfs_clear_extent_inode(ni); + return m; +} + +#ifdef NTFS_RW + +/** + * __mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Internal function. Users should call mark_mft_record_dirty() instead. + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) + * on the base vfs inode, because even though file data may have been modified, + * it is dirty in the inode meta data rather than the data page cache of the + * inode, and thus there are no data pages that need writing out. Therefore, a + * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. + */ +void __mark_mft_record_dirty(ntfs_inode *ni) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + mark_ntfs_record_dirty(ni->page, ni->page_ofs); + /* Determine the base vfs inode and mark it dirty, too. 
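+	 * (For an extent ntfs inode nr_extents is negative and
+	 * ext.base_ntfs_ino points back at the base inode, as set up by
+	 * map_extent_mft_record().)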
*/ + mutex_lock(&ni->extent_lock); + if (likely(ni->nr_extents >= 0)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC); +} + +static const char *ntfs_please_email = "Please email " + "linux-ntfs-dev@lists.sourceforge.net and say that you saw " + "this message. Thank you."; + +/** + * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol, + * bypassing the page cache and the $MFTMirr inode itself. + * + * This function is only for use at umount time when the mft mirror inode has + * already been disposed off. We BUG() if we are called while the mft mirror + * inode is still attached to the volume. + * + * On success return 0. On error return -errno. + * + * NOTE: This function is not implemented yet as I am not convinced it can + * actually be triggered considering the sequence of commits we do in super.c:: + * ntfs_put_super(). But just in case we provide this place holder as the + * alternative would be either to BUG() or to get a NULL pointer dereference + * and Oops. + */ +static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol, + const unsigned long mft_no, MFT_RECORD *m) +{ + BUG_ON(vol->mftmirr_ino); + ntfs_error(vol->sb, "Umount time mft mirror syncing is not " + "implemented yet. %s", ntfs_please_email); + return -EOPNOTSUPP; +} + +/** + * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * @sync: if true, wait for i/o completion + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol. + * + * On success return 0. On error return -errno and set the volume errors flag + * in the ntfs volume @vol. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync) +{ + struct page *page; + unsigned int blocksize = vol->sb->s_blocksize; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[max_bhs]; + struct buffer_head *bh, *head; + u8 *kmirr; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end, page_ofs; + int i_bhs, nr_bhs, err = 0; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + BUG_ON(!max_bhs); + if (unlikely(!vol->mftmirr_ino)) { + /* This could happen during umount... */ + err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); + if (likely(!err)) + return err; + goto err_out; + } + /* Get the page containing the mirror copy of the mft record @m. 
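+	 * For example, with the usual 1024-byte mft records and 4096-byte
+	 * pages there are four records per page, so mft record 0x23 lives
+	 * at page index 0x23 >> 2 = 0x8 and page offset
+	 * (0x23 << 10) & ~PAGE_CACHE_MASK = 0xc00.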
*/ + page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> + (PAGE_CACHE_SHIFT - vol->mft_record_size_bits)); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map mft mirror page."); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + /* Offset of the mft mirror record inside the page. */ + page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + /* The address in the page of the mirror copy of the mft record @m. */ + kmirr = page_address(page) + page_ofs; + /* Copy the mst protected mft record to the mirror. */ + memcpy(kmirr, m, vol->mft_record_size); + /* Create uptodate buffers if not present. */ + if (unlikely(!page_has_buffers(page))) { + struct buffer_head *tail; + + bh = head = alloc_page_buffers(page, blocksize, 1); + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); + } + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = kmirr - (u8*)page_address(page); + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. */ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mftmirr_ino)-> + runlist.lock); + rl = NTFS_I(vol->mftmirr_ino)->runlist.rl; + /* + * $MFTMirr always has the whole of its runlist + * in memory. + */ + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFTMirr, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft mirror " + "record 0x%lx because its " + "location on disk could not " + "be determined (error code " + "%lli).", mft_no, + (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock); + if (likely(!err)) { + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and + * buffer states do not become out of sync. 
+ */ + set_buffer_uptodate(tbh); + } + } + } else /* if (unlikely(err)) */ { + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); + } + /* Current state: all buffers are clean, unlocked, and uptodate. */ + /* Remove the mst protection fixups again. */ + post_write_mst_fixup((NTFS_RECORD*)kmirr); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + if (likely(!err)) { + ntfs_debug("Done."); + } else { + ntfs_error(vol->sb, "I/O error while writing mft mirror " + "record 0x%lx!", mft_no); +err_out: + ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error " + "code %i). Volume will be left marked dirty " + "on umount. Run ntfsfix on the partition " + "after umounting to correct this.", -err); + NVolSetErrors(vol); + } + return err; +} + +/** + * write_mft_record_nolock - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * Write the mapped (extent) mft record @m described by the (regular or extent) + * ntfs inode @ni to backing store. If the mft record @m has a counterpart in + * the mft mirror, that is also updated. + * + * We only write the mft record if the ntfs inode @ni is dirty and the first + * buffer belonging to its mft record is dirty, too. We ignore the dirty state + * of subsequent buffers because we could have raced with + * fs/ntfs/aops.c::mark_ntfs_record_dirty(). + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * However, if the mft record has a counterpart in the mft mirror and @sync is + * true, we write the mft record, wait for i/o completion, and only then write + * the mft mirror copy. This ensures that if the system crashes either the mft + * or the mft mirror will contain a self-consistent mft record @m. If @sync is + * false on the other hand, we start i/o on both and then wait for completion + * on them. This provides a speedup but no longer guarantees that you will end + * up with a self-consistent mft record in the case of a crash but if you asked + * for asynchronous writing you probably do not care about that anyway. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + ntfs_volume *vol = ni->vol; + struct page *page = ni->page; + unsigned int blocksize = vol->sb->s_blocksize; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[max_bhs]; + struct buffer_head *bh, *head; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end; + int i_bhs, nr_bhs, err = 0; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + BUG_ON(!max_bhs); + BUG_ON(!PageLocked(page)); + /* + * If the ntfs_inode is clean no need to do anything. If it is dirty, + * mark it as clean now so that it can be redirtied later on if needed. + * There is no danger of races since the caller is holding the locks + * for the mft record @m and the page it is in. 
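+	 * (Should the write fail with -ENOMEM further down, the record is
+	 * simply redirtied and the write is retried later.)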
+ */ + if (!NInoTestClearDirty(ni)) + goto done; + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = ni->page_ofs; + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. */ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* + * If this block is not the first one in the record, we ignore + * the buffer's dirty state because we could have raced with a + * parallel mark_ntfs_record_dirty(). + */ + if (block_start == m_start) { + /* This block is the first one in the record. */ + if (!buffer_dirty(bh)) { + BUG_ON(nr_bhs); + /* Clean records are not written out. */ + break; + } + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mft_ino)->runlist.lock); + rl = NTFS_I(vol->mft_ino)->runlist.rl; + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFT, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft record " + "0x%lx because its location " + "on disk could not be " + "determined (error code %lli).", + ni->mft_no, (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mft_ino)->runlist.lock); + if (!nr_bhs) + goto done; + if (unlikely(err)) + goto cleanup_out; + /* Apply the mst protection fixups. */ + err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); + if (err) { + ntfs_error(vol->sb, "Failed to apply mst fixups!"); + goto cleanup_out; + } + flush_dcache_mft_record_page(ni); + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (!sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + if (PageUptodate(page)) + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Remove the mst protection fixups again. 
*/ + post_write_mst_fixup((NTFS_RECORD*)m); + flush_dcache_mft_record_page(ni); + if (unlikely(err)) { + /* I/O error during writing. This is really bad! */ + ntfs_error(vol->sb, "I/O error while writing mft record " + "0x%lx! Marking base inode as bad. You " + "should unmount the volume and run chkdsk.", + ni->mft_no); + goto err_out; + } +done: + ntfs_debug("Done."); + return 0; +cleanup_out: + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); +err_out: + /* + * Current state: all buffers are clean, unlocked, and uptodate. + * The caller should mark the base inode as bad so that no more i/o + * happens. ->clear_inode() will still be invoked so all extent inodes + * and other allocated memory will be freed. + */ + if (err == -ENOMEM) { + ntfs_error(vol->sb, "Not enough memory to write mft record. " + "Redirtying so the write is retried later."); + mark_mft_record_dirty(ni); + err = 0; + } else + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_may_write_mft_record - check if an mft record may be written out + * @vol: [IN] ntfs volume on which the mft record to check resides + * @mft_no: [IN] mft record number of the mft record to check + * @m: [IN] mapped mft record to check + * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned + * + * Check if the mapped (base or extent) mft record @m with mft record number + * @mft_no belonging to the ntfs volume @vol may be written out. If necessary + * and possible the ntfs inode of the mft record is locked and the base vfs + * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The + * caller is responsible for unlocking the ntfs inode and unpinning the base + * vfs inode. + * + * Return 'true' if the mft record may be written out and 'false' if not. + * + * The caller has locked the page and cleared the uptodate flag on it which + * means that we can safely write out any dirty mft records that do not have + * their inodes in icache as determined by ilookup5() as anyone + * opening/creating such an inode would block when attempting to map the mft + * record in read_cache_page() until we are finished with the write out. + * + * Here is a description of the tests we perform: + * + * If the inode is found in icache we know the mft record must be a base mft + * record. If it is dirty, we do not write it and return 'false' as the vfs + * inode write paths will result in the access times being updated which would + * cause the base mft record to be redirtied and written out again. (We know + * the access time update will modify the base mft record because Windows + * chkdsk complains if the standard information attribute is not in the base + * mft record.) + * + * If the inode is in icache and not dirty, we attempt to lock the mft record + * and if we find the lock was already taken, it is not safe to write the mft + * record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the mft record, + * which also allows us safe writeout of the mft record. We then set + * @locked_ni to the locked ntfs inode and return 'true'. + * + * Note we cannot just lock the mft record and sleep while waiting for the lock + * because this would deadlock due to lock reversal (normally the mft record is + * locked before the page is locked but we already have the page locked here + * when we try to lock the mft record). + * + * If the inode is not in icache we need to perform further checks. 
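+ *
+ * In short: we return 'false' when an ntfs inode for the record is present
+ * in memory and either is dirty (base records) or has its mrec_lock already
+ * held, we return 'true' and hand the locked inode back in @locked_ni when
+ * we do manage to trylock its mrec_lock, and we return 'true' with
+ * @locked_ni left NULL whenever no matching ntfs inode is in memory at all.
+ * The checks for the not-in-icache case are as follows.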
+ * + * If the mft record is not a FILE record or it is a base mft record, we can + * safely write it and return 'true'. + * + * We now know the mft record is an extent mft record. We check if the inode + * corresponding to its base mft record is in icache and obtain a reference to + * it if it is. If it is not, we can safely write it and return 'true'. + * + * We now have the base inode for the extent mft record. We check if it has an + * ntfs inode for the extent mft record attached and if not it is safe to write + * the extent mft record and we return 'true'. + * + * The ntfs inode for the extent mft record is attached to the base inode so we + * attempt to lock the extent mft record and if we find the lock was already + * taken, it is not safe to write the extent mft record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the extent mft + * record, which also allows us safe writeout of the extent mft record. We + * set the ntfs inode of the extent mft record clean and then set @locked_ni to + * the now locked ntfs inode and return 'true'. + * + * Note, the reason for actually writing dirty mft records here and not just + * relying on the vfs inode dirty code paths is that we can have mft records + * modified without them ever having actual inodes in memory. Also we can have + * dirty mft records with clean ntfs inodes in memory. None of the described + * cases would result in the dirty mft records being written out if we only + * relied on the vfs inode dirty code paths. And these cases can really occur + * during allocation of new mft records and in particular when the + * initialized_size of the $MFT/$DATA attribute is extended and the new space + * is initialized using ntfs_mft_record_format(). The clean inode can then + * appear if the mft record is reused for a new inode before it got written + * out. + */ +bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, + const MFT_RECORD *m, ntfs_inode **locked_ni) +{ + struct super_block *sb = vol->sb; + struct inode *mft_vi = vol->mft_ino; + struct inode *vi; + ntfs_inode *ni, *eni, **extent_nis; + int i; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + /* + * Normally we do not return a locked inode so set @locked_ni to NULL. + */ + BUG_ON(!locked_ni); + *locked_ni = NULL; + /* + * Check if the inode corresponding to this mft record is in the VFS + * inode cache and obtain a reference to it if it is. + */ + ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); + na.mft_no = mft_no; + na.name = NULL; + na.name_len = 0; + na.type = AT_UNUSED; + /* + * Optimize inode 0, i.e. $MFT itself, since we have it in memory and + * we get here for it rather often. + */ + if (!mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else { + /* + * Have to use ilookup5_nowait() since ilookup5() waits for the + * inode lock which causes ntfs to deadlock when a concurrent + * inode write via the inode dirty code paths and the page + * dirty code path of the inode dirty code path when writing + * $MFT occurs. + */ + vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na); + } + if (vi) { + ntfs_debug("Base inode 0x%lx is in icache.", mft_no); + /* The inode is in icache. */ + ni = NTFS_I(vi); + /* Take a reference to the ntfs inode. */ + atomic_inc(&ni->count); + /* If the inode is dirty, do not write this record. 
*/ + if (NInoDirty(ni)) { + ntfs_debug("Inode 0x%lx is dirty, do not write it.", + mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Inode 0x%lx is not dirty.", mft_no); + /* The inode is not dirty, try to take the mft record lock. */ + if (unlikely(!mutex_trylock(&ni->mrec_lock))) { + ntfs_debug("Mft record 0x%lx is already locked, do " + "not write it.", mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Managed to lock mft record 0x%lx, write it.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so + * return the locked ntfs inode. + */ + *locked_ni = ni; + return true; + } + ntfs_debug("Inode 0x%lx is not in icache.", mft_no); + /* The inode is not in icache. */ + /* Write the record if it is not a mft record (type "FILE"). */ + if (!ntfs_is_mft_record(m->magic)) { + ntfs_debug("Mft record 0x%lx is not a FILE record, write it.", + mft_no); + return true; + } + /* Write the mft record if it is a base inode. */ + if (!m->base_mft_record) { + ntfs_debug("Mft record 0x%lx is a base record, write it.", + mft_no); + return true; + } + /* + * This is an extent mft record. Check if the inode corresponding to + * its base mft record is in icache and obtain a reference to it if it + * is. + */ + na.mft_no = MREF_LE(m->base_mft_record); + ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " + "inode 0x%lx in icache.", mft_no, na.mft_no); + if (!na.mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else + vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode, + &na); + if (!vi) { + /* + * The base inode is not in icache, write this extent mft + * record. + */ + ntfs_debug("Base inode 0x%lx is not in icache, write the " + "extent record.", na.mft_no); + return true; + } + ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); + /* + * The base inode is in icache. Check if it has the extent inode + * corresponding to this extent mft record attached. + */ + ni = NTFS_I(vi); + mutex_lock(&ni->extent_lock); + if (ni->nr_extents <= 0) { + /* + * The base inode has no attached extent inodes, write this + * extent mft record. + */ + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Base inode 0x%lx has no attached extent inodes, " + "write the extent record.", na.mft_no); + return true; + } + /* Iterate over the attached extent inodes. */ + extent_nis = ni->ext.extent_ntfs_inos; + for (eni = NULL, i = 0; i < ni->nr_extents; ++i) { + if (mft_no == extent_nis[i]->mft_no) { + /* + * Found the extent inode corresponding to this extent + * mft record. + */ + eni = extent_nis[i]; + break; + } + } + /* + * If the extent inode was not attached to the base inode, write this + * extent mft record. + */ + if (!eni) { + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Extent inode 0x%lx is not attached to its base " + "inode 0x%lx, write the extent record.", + mft_no, na.mft_no); + return true; + } + ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.", + mft_no, na.mft_no); + /* Take a reference to the extent ntfs inode. */ + atomic_inc(&eni->count); + mutex_unlock(&ni->extent_lock); + /* + * Found the extent inode coresponding to this extent mft record. + * Try to take the mft record lock. 
+ */ + if (unlikely(!mutex_trylock(&eni->mrec_lock))) { + atomic_dec(&eni->count); + iput(vi); + ntfs_debug("Extent mft record 0x%lx is already locked, do " + "not write it.", mft_no); + return false; + } + ntfs_debug("Managed to lock extent mft record 0x%lx, write it.", + mft_no); + if (NInoTestClearDirty(eni)) + ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so return + * the locked extent ntfs inode. + */ + *locked_ni = eni; + return true; +} + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name + * @vol: volume on which to search for a free mft record + * @base_ni: open base inode if allocating an extent mft record or NULL + * + * Search for a free mft record in the mft bitmap attribute on the ntfs volume + * @vol. + * + * If @base_ni is NULL start the search at the default allocator position. + * + * If @base_ni is not NULL start the search at the mft record after the base + * mft record @base_ni. + * + * Return the free mft record on success and -errno on error. An error code of + * -ENOSPC means that there are no free mft records in the currently + * initialized mft bitmap. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, + ntfs_inode *base_ni) +{ + s64 pass_end, ll, data_pos, pass_start, ofs, bit; + unsigned long flags; + struct address_space *mftbmp_mapping; + u8 *buf, *byte; + struct page *page; + unsigned int page_ofs, size; + u8 pass, b; + + ntfs_debug("Searching for free mft record in the currently " + "initialized mft bitmap."); + mftbmp_mapping = vol->mftbmp_ino->i_mapping; + /* + * Set the end of the pass making sure we do not overflow the mft + * bitmap. + */ + read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags); + pass_end = NTFS_I(vol->mft_ino)->allocated_size >> + vol->mft_record_size_bits; + read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags); + read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; + read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + if (pass_end > ll) + pass_end = ll; + pass = 1; + if (!base_ni) + data_pos = vol->mft_data_pos; + else + data_pos = base_ni->mft_no + 1; + if (data_pos < 24) + data_pos = 24; + if (data_pos >= pass_end) { + data_pos = 24; + pass = 2; + /* This happens on a freshly formatted volume. */ + if (data_pos >= pass_end) + return -ENOSPC; + } + pass_start = data_pos; + ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, " + "pass_end 0x%llx, data_pos 0x%llx.", pass, + (long long)pass_start, (long long)pass_end, + (long long)data_pos); + /* Loop until a free mft record is found. */ + for (; pass <= 2;) { + /* Cap size to pass_end. */ + ofs = data_pos >> 3; + page_ofs = ofs & ~PAGE_CACHE_MASK; + size = PAGE_CACHE_SIZE - page_ofs; + ll = ((pass_end + 7) >> 3) - ofs; + if (size > ll) + size = ll; + size <<= 3; + /* + * If we are still within the active pass, search the next page + * for a zero bit. 
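+		 * For example, if the byte under inspection is 0xf7 (bit 3
+		 * clear), ffz() yields 3 and, subject to the range checks
+		 * below, mft record data_pos + (bit & ~7) + 3 is allocated
+		 * by setting that bit and dirtying the bitmap page.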
+ */ + if (size) { + page = ntfs_map_page(mftbmp_mapping, + ofs >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read mft " + "bitmap, aborting."); + return PTR_ERR(page); + } + buf = (u8*)page_address(page) + page_ofs; + bit = data_pos & 7; + data_pos &= ~7ull; + ntfs_debug("Before inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + for (; bit < size && data_pos + bit < pass_end; + bit &= ~7ull, bit += 8) { + byte = buf + (bit >> 3); + if (*byte == 0xff) + continue; + b = ffz((unsigned long)*byte); + if (b < 8 && b >= (bit & 7)) { + ll = data_pos + (bit & ~7ull) + b; + if (unlikely(ll > (1ll << 32))) { + ntfs_unmap_page(page); + return -ENOSPC; + } + *byte |= 1 << b; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done. (Found and " + "allocated mft record " + "0x%llx.)", + (long long)ll); + return ll; + } + } + ntfs_debug("After inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + data_pos += size; + ntfs_unmap_page(page); + /* + * If the end of the pass has not been reached yet, + * continue searching the mft bitmap for a zero bit. + */ + if (data_pos < pass_end) + continue; + } + /* Do the next pass. */ + if (++pass == 2) { + /* + * Starting the second pass, in which we scan the first + * part of the zone which we omitted earlier. + */ + pass_end = pass_start; + data_pos = pass_start = 24; + ntfs_debug("pass %i, pass_start 0x%llx, pass_end " + "0x%llx.", pass, (long long)pass_start, + (long long)pass_end); + if (data_pos >= pass_end) + break; + } + } + /* No free mft records in currently initialized mft bitmap. */ + ntfs_debug("Done. (No free mft records left in currently initialized " + "mft bitmap.)"); + return -ENOSPC; +} + +/** + * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for + * writing and releases it before returning. + * - This function takes vol->lcnbmp_lock for writing and releases it + * before returning. + */ +static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + s64 ll; + unsigned long flags; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni; + runlist_element *rl, *rl2 = NULL; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + u8 *b, tb; + struct { + u8 added_cluster:1; + u8 added_run:1; + u8 mp_rebuilt:1; + } status = { 0, 0, 0 }; + + ntfs_debug("Extending mft bitmap allocation."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + /* + * Determine the last lcn of the mft bitmap. The allocated size of the + * mft bitmap cannot be zero so we are ok to do this. 
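+	 * For example, with 4096-byte clusters and an allocated_size of
+	 * 8192 bytes the last vcn is (8192 - 1) >> 12 = 1; the run
+	 * containing that vcn then yields the candidate cluster
+	 * rl->lcn + rl->length tried below.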
+ */ + down_write(&mftbmp_ni->runlist.lock); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ll = mftbmp_ni->allocated_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft bitmap attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.", + (long long)lcn); + /* + * Attempt to get the cluster following the last allocated cluster by + * hand as it may be in the MFT zone so the allocator would not give it + * to us. + */ + ll = lcn >> 3; + page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, + ll >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to read from lcn bitmap."); + return PTR_ERR(page); + } + b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK); + tb = 1 << (lcn & 7ull); + down_write(&vol->lcnbmp_lock); + if (*b != 0xff && !(*b & tb)) { + /* Next cluster is free, allocate it. */ + *b |= tb; + flush_dcache_page(page); + set_page_dirty(page); + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Update the mft bitmap runlist. */ + rl->length++; + rl[1].vcn++; + status.added_cluster = 1; + ntfs_debug("Appending one cluster to mft bitmap."); + } else { + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Allocate a cluster from the DATA_ZONE. */ + rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE, + true); + if (IS_ERR(rl2)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to allocate a cluster for " + "the mft bitmap."); + return PTR_ERR(rl2); + } + rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft " + "bitmap."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to dealocate " + "allocated cluster.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mftbmp_ni->runlist.rl = rl; + status.added_run = 1; + ntfs_debug("Adding one run to mft bitmap."); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + } + /* + * Update the attribute record as well. Note: @rl is the last + * (non-terminator) runlist element of mft bitmap. + */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft bitmap. 
*/ + for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft bitmap attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft bitmap attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. + // Note: It will need to be a special mft record and if none of + // those are available it gets rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft bitmap attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + status.mp_rebuilt = 1; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array for " + "mft bitmap attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft bitmap allocated_size by one cluster. + * Reflect this in the ntfs_inode structure and the attribute record. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft bitmap attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + a->data.non_resident.allocated_size = + cpu_to_sle64(mftbmp_ni->allocated_size); + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. 
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute.%s", es); + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + a = ctx->attr; + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); +undo_alloc: + if (status.added_cluster) { + /* Truncate the last run in the runlist by one cluster. */ + rl->length--; + rl[1].vcn--; + } else if (status.added_run) { + lcn = rl->lcn; + /* Remove the last run from the runlist. */ + rl->lcn = rl[1].lcn; + rl->length = 0; + } + /* Deallocate the cluster. */ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + if (status.mp_rebuilt) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the initialized portion of the mft bitmap attribute on the ntfs + * volume @vol by 8 bytes. + * + * Note: Only changes initialized_size and data_size, i.e. requires that + * allocated_size is big enough to fit the new initialized_size. + * + * Return 0 on success and -error on error. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) +{ + s64 old_data_size, old_initialized_size; + unsigned long flags; + struct inode *mftbmp_vi; + ntfs_inode *mft_ni, *mftbmp_ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + ATTR_RECORD *a; + int ret; + + ntfs_debug("Extending mft bitmap initiailized (and data) size."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_vi = vol->mftbmp_ino; + mftbmp_ni = NTFS_I(mftbmp_vi); + /* Get the attribute record. 
*/ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + return PTR_ERR(mrec); + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto unm_err_out; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto put_err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = i_size_read(mftbmp_vi); + old_initialized_size = mftbmp_ni->initialized_size; + /* + * We can simply update the initialized_size before filling the space + * with zeroes because the caller is holding the mft bitmap lock for + * writing which ensures that no one else is trying to access the data. + */ + mftbmp_ni->initialized_size += 8; + a->data.non_resident.initialized_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + if (mftbmp_ni->initialized_size > old_data_size) { + i_size_write(mftbmp_vi, mftbmp_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + /* Initialize the mft bitmap attribute value with zeroes. */ + ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0); + if (likely(!ret)) { + ntfs_debug("Done. (Wrote eight initialized bytes to mft " + "bitmap."); + return 0; + } + ntfs_error(vol->sb, "Failed to write to mft bitmap."); + /* Try to recover from the error. 
*/ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record.%s", es); + NVolSetErrors(vol); + return ret; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context.%s", es); + NVolSetErrors(vol); + goto unm_err_out; + } + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute.%s", es); + NVolSetErrors(vol); +put_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(mft_ni); + goto err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->initialized_size = old_initialized_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(old_initialized_size); + if (i_size_read(mftbmp_vi) != old_data_size) { + i_size_write(mftbmp_vi, old_data_size); + a->data.non_resident.data_size = cpu_to_sle64(old_data_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(mftbmp_vi), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ +err_out: + return ret; +} + +/** + * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute + * @vol: volume on which to extend the mft data attribute + * + * Extend the mft data attribute on the ntfs volume @vol by 16 mft records + * worth of clusters or if not enough space for this by one mft record worth + * of clusters. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for + * writing and releases it before returning. + * - This function calls functions which take vol->lcnbmp_lock for + * writing and release it before returning. + */ +static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + VCN old_last_vcn; + s64 min_nr, nr, ll; + unsigned long flags; + ntfs_inode *mft_ni; + runlist_element *rl, *rl2; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + bool mp_rebuilt = false; + + ntfs_debug("Extending mft data allocation."); + mft_ni = NTFS_I(vol->mft_ino); + /* + * Determine the preferred allocation location, i.e. the last lcn of + * the mft data attribute. The allocated size of the mft data + * attribute cannot be zero so we are ok to do this. 
+ */ + down_write(&mft_ni->runlist.lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mft_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft data attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); + /* Minimum allocation is one mft record worth of clusters. */ + min_nr = vol->mft_record_size >> vol->cluster_size_bits; + if (!min_nr) + min_nr = 1; + /* Want to allocate 16 mft records worth of clusters. */ + nr = vol->mft_record_size << 4 >> vol->cluster_size_bits; + if (!nr) + nr = min_nr; + /* Ensure we do not go above 2^32-1 mft records. */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + nr = min_nr; + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + ntfs_warning(vol->sb, "Cannot allocate mft record " + "because the maximum number of inodes " + "(2^32) has already been reached."); + up_write(&mft_ni->runlist.lock); + return -ENOSPC; + } + } + ntfs_debug("Trying mft data allocation with %s cluster count %lli.", + nr > min_nr ? "default" : "minimal", (long long)nr); + old_last_vcn = rl[1].vcn; + do { + rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, + true); + if (likely(!IS_ERR(rl2))) + break; + if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { + ntfs_error(vol->sb, "Failed to allocate the minimal " + "number of clusters (%lli) for the " + "mft data attribute.", (long long)nr); + up_write(&mft_ni->runlist.lock); + return PTR_ERR(rl2); + } + /* + * There is not enough space to do the allocation, but there + * might be enough space to do a minimal allocation so try that + * before failing. + */ + nr = min_nr; + ntfs_debug("Retrying mft data allocation with minimal cluster " + "count %lli.", (long long)nr); + } while (1); + rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft data " + "attribute."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to dealocate clusters " + "from the mft data attribute.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mft_ni->runlist.rl = rl; + ntfs_debug("Allocated %lli clusters.", (long long)nr); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + /* Update the attribute record as well. 
*/ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft bitmap. */ + for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft data attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft data attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. + // Note: Use the special reserved mft records and ensure that + // this extent is not required to find the mft record in + // question. If no free special records left we would need to + // move an existing record away, insert ours in its place, and + // then place the moved record into the newly allocated space + // and we would then need to update all references to this mft + // record appropriately. This is rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft data attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + mp_rebuilt = true; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array of " + "mft data attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft data allocated_size by nr clusters. + * Reflect this in the ntfs_inode structure and the attribute record. + * @rl is the last (non-terminator) runlist element of mft data + * attribute. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. 
+ */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, + mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, + ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft data attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + a->data.non_resident.allocated_size = + cpu_to_sle64(mft_ni->allocated_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute.%s", es); + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + ctx->attr->data.non_resident.highest_vcn = + cpu_to_sle64(old_last_vcn - 1); +undo_alloc: + if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to free clusters from mft data " + "attribute.%s", es); + NVolSetErrors(vol); + } + a = ctx->attr; + if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { + ntfs_error(vol->sb, "Failed to truncate mft data attribute " + "runlist.%s", es); + NVolSetErrors(vol); + } + if (mp_rebuilt && !IS_ERR(ctx->mrec)) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } else if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to restore attribute search " + "context.%s", es); + NVolSetErrors(vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_record_layout - layout an mft record into a memory buffer + * @vol: volume to which the mft record will belong + * @mft_no: mft reference specifying the mft record number + * @m: destination buffer of size >= @vol->mft_record_size bytes + * + * Layout an empty, unused mft record with the mft record number @mft_no into + * the buffer @m. The volume @vol is needed because the mft record structure + * was modified in NTFS 3.1 so we need to know which volume version this mft + * record will be used on. + * + * Return 0 on success and -errno on error. 
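+ * + * For illustration, assuming the NTFS 3.1+ record header (sizeof(MFT_RECORD)) + * is 48 bytes, 1024-byte mft records, and a 512-byte NTFS_BLOCK_SIZE, the + * layout below produces usa_ofs = (48 + 1) & ~1 = 48, usa_count = 1024 / 512 + + * 1 = 3, attrs_offset = (48 + 3 * 2 + 7) & ~7 = 56, and bytes_in_use = 56 + 8 = + * 64 for the empty record.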
+ */ +static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, + MFT_RECORD *m) +{ + ATTR_RECORD *a; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + if (mft_no >= (1ll << 32)) { + ntfs_error(vol->sb, "Mft record number 0x%llx exceeds " + "maximum of 2^32.", (long long)mft_no); + return -ERANGE; + } + /* Start by clearing the whole mft record to gives us a clean slate. */ + memset(m, 0, vol->mft_record_size); + /* Aligned to 2-byte boundary. */ + if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver)) + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1); + else { + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1); + /* + * Set the NTFS 3.1+ specific fields while we know that the + * volume version is 3.1+. + */ + m->reserved = 0; + m->mft_record_number = cpu_to_le32((u32)mft_no); + } + m->magic = magic_FILE; + if (vol->mft_record_size >= NTFS_BLOCK_SIZE) + m->usa_count = cpu_to_le16(vol->mft_record_size / + NTFS_BLOCK_SIZE + 1); + else { + m->usa_count = cpu_to_le16(1); + ntfs_warning(vol->sb, "Sector size is bigger than mft record " + "size. Setting usa_count to 1. If chkdsk " + "reports this as corruption, please email " + "linux-ntfs-dev@lists.sourceforge.net stating " + "that you saw this message and that the " + "modified filesystem created was corrupt. " + "Thank you."); + } + /* Set the update sequence number to 1. */ + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1); + m->lsn = 0; + m->sequence_number = cpu_to_le16(1); + m->link_count = 0; + /* + * Place the attributes straight after the update sequence array, + * aligned to 8-byte boundary. + */ + m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) + + (le16_to_cpu(m->usa_count) << 1) + 7) & ~7); + m->flags = 0; + /* + * Using attrs_offset plus eight bytes (for the termination attribute). + * attrs_offset is already aligned to 8-byte boundary, so no need to + * align again. + */ + m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8); + m->bytes_allocated = cpu_to_le32(vol->mft_record_size); + m->base_mft_record = 0; + m->next_attr_instance = 0; + /* Add the termination attribute. */ + a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); + a->type = AT_END; + a->length = 0; + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_format - format an mft record on an ntfs volume + * @vol: volume on which to format the mft record + * @mft_no: mft record number to format + * + * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused + * mft record into the appropriate place of the mft data attribute. This is + * used when extending the mft data attribute. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) +{ + loff_t i_size; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + MFT_RECORD *m; + pgoff_t index, end_index; + unsigned int ofs; + int err; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. + */ + index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; + ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + /* The maximum valid index into the page cache for $MFT's data. 
*/ + i_size = i_size_read(mft_vi); + end_index = i_size >> PAGE_CACHE_SHIFT; + if (unlikely(index >= end_index)) { + if (unlikely(index > end_index || ofs + vol->mft_record_size >= + (i_size & ~PAGE_CACHE_MASK))) { + ntfs_error(vol->sb, "Tried to format non-existing mft " + "record 0x%llx.", (long long)mft_no); + return -ENOENT; + } + } + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing mft record " + "to format 0x%llx.", (long long)mft_no); + return PTR_ERR(page); + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + err = ntfs_mft_record_layout(vol, mft_no, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.", + (long long)mft_no); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + return err; + } + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + /* + * Make sure the mft record is written out to disk. We could use + * ilookup5() to check if an inode is in icache and so on but this is + * unnecessary as ntfs_writepage() will write the dirty record anyway. + */ + mark_ntfs_record_dirty(page, ofs); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume + * @vol: [IN] volume on which to allocate the mft record + * @mode: [IN] mode if want a file or directory, i.e. base inode or 0 + * @base_ni: [IN] open base inode if allocating an extent mft record or NULL + * @mrec: [OUT] on successful return this is the mapped mft record + * + * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol. + * + * If @base_ni is NULL make the mft record a base mft record, i.e. a file or + * directory inode, and allocate it at the default allocator position. In + * this case @mode is the file mode as given to us by the caller. We in + * particular use @mode to distinguish whether a file or a directory is being + * created (S_ISDIR(mode) and S_ISREG(mode), respectively). + * + * If @base_ni is not NULL make the allocated mft record an extent record, + * allocate it starting at the mft record after the base mft record and attach + * the allocated and opened ntfs inode to the base inode @base_ni. In this + * case @mode must be 0 as it is meaningless for extent inodes. + * + * You need to check the return value with IS_ERR(). If false, the function + * was successful and the return value is the now opened ntfs inode of the + * allocated mft record. *@mrec is then set to the allocated, mapped, pinned, + * and locked mft record. If IS_ERR() is true, the function failed and the + * error code is obtained from PTR_ERR(return value). *@mrec is undefined in + * this case. + * + * Allocation strategy: + * + * To find a free mft record, we scan the mft bitmap for a zero bit. To + * optimize this we start scanning at the place specified by @base_ni or if + * @base_ni is NULL we start where we last stopped and we perform wrap around + * when we reach the end. Note, we do not try to allocate mft records below + * number 24 because numbers 0 to 15 are the defined system files anyway and 16 + * to 24 are special in that they are used for storing extension mft records + * for the $DATA attribute of $MFT.
This is required to avoid the possibility + * of creating a runlist with a circular dependency which once written to disk + * can never be read in again. Windows will only use records 16 to 24 for + * normal files if the volume is completely out of space. We never use them + * which means that when the volume is really out of space we cannot create any + * more files while Windows can still create up to 8 small files. We can start + * doing this at some later time, it does not matter much for now. + * + * When scanning the mft bitmap, we only search up to the last allocated mft + * record. If there are no free records left in the range 24 to the number of + * allocated mft records, then we extend the $MFT/$DATA attribute in order to + * create free mft records. We extend the allocated size of $MFT/$DATA by 16 + * records at a time or one cluster, if cluster size is above 16kiB. If there + * is not sufficient space to do this, we try to extend by a single mft record + * or one cluster, if cluster size is above the mft record size. + * + * No matter how many mft records we allocate, we initialize only the first + * allocated mft record, incrementing mft data size and initialized size + * accordingly, open an ntfs_inode for it and return it to the caller, unless + * there are fewer than 24 mft records, in which case we allocate and initialize + * mft records until we reach record 24 which we consider as the first free mft + * record for use by normal files. + * + * If during any stage we overflow the initialized data in the mft bitmap, we + * extend the initialized size (and data size) by 8 bytes, allocating another + * cluster if required. The bitmap data size has to be at least equal to the + * number of mft records in the mft, but it can be bigger, in which case the + * superfluous bits are padded with zeroes. + * + * Thus, when we return successfully (IS_ERR() is false), we will have: + * - initialized / extended the mft bitmap if necessary, + * - initialized / extended the mft data if necessary, + * - set the bit corresponding to the mft record being allocated in the + * mft bitmap, + * - opened an ntfs_inode for the allocated mft record, and we will have + * - returned the ntfs_inode as well as the allocated, mapped, pinned, and + * locked mft record. + * + * On error, the volume will be left in a consistent state and no record will + * be allocated. If rolling back a partial operation fails, we may leave some + * inconsistent metadata in which case we call NVolSetErrors() so the volume is + * left dirty when unmounted. + * + * Note, this function cannot make use of most of the normal functions, like + * for example for attribute resizing, etc, because when the run list overflows + * the base mft record and an attribute list is used, it is very important that + * the extension mft records used to store the $DATA attribute of $MFT can be + * reached without having to read the information contained inside them, as + * this would make it impossible to find them in the first place after the + * volume is unmounted. $MFT/$BITMAP probably does not need to follow this + * rule because the bitmap is not essential for finding the mft records, but on + * the other hand, handling the bitmap in this special way would make life + * easier because otherwise there might be circular invocations of functions + * when reading the bitmap.
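+ * + * For illustration, assuming 1024-byte mft records and 4kiB clusters, one such + * extension of $MFT/$DATA by 16 records adds 16 * 1024 / 4096 = 4 clusters, + * while extending the initialized size of the bitmap by 8 bytes covers 8 * 8 = + * 64 further mft records, since the bitmap tracks one bit per record.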
+ */ +ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec) +{ + s64 ll, bit, old_data_initialized, old_data_size; + unsigned long flags; + struct inode *vi; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni, *ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + pgoff_t index; + unsigned int ofs; + int err; + le16 seq_no, usn; + bool record_formatted = false; + + if (base_ni) { + ntfs_debug("Entering (allocating an extent mft record for " + "base mft record 0x%llx).", + (long long)base_ni->mft_no); + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(mode); + } else + ntfs_debug("Entering (allocating a base mft record)."); + if (mode) { + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(base_ni); + /* We only support creation of normal files and directories. */ + if (!S_ISREG(mode) && !S_ISDIR(mode)) + return ERR_PTR(-EOPNOTSUPP); + } + BUG_ON(!mrec); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + down_write(&vol->mftbmp_lock); + bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni); + if (bit >= 0) { + ntfs_debug("Found and allocated free record (#1), bit 0x%llx.", + (long long)bit); + goto have_alloc_rec; + } + if (bit != -ENOSPC) { + up_write(&vol->mftbmp_lock); + return ERR_PTR(bit); + } + /* + * No free mft records left. If the mft bitmap already covers more + * than the currently used mft records, the next records are all free, + * so we can simply allocate the first unused mft record. + * Note: We also have to make sure that the mft bitmap at least covers + * the first 24 mft records as they are special and whilst they may not + * be in use, we do not allocate from them. + */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->initialized_size >> vol->mft_record_size_bits; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_initialized = mftbmp_ni->initialized_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized << 3 > ll && old_data_initialized > 3) { + bit = ll; + if (bit < 24) + bit = 24; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + ntfs_debug("Found free record (#2), bit 0x%llx.", + (long long)bit); + goto found_free_rec; + } + /* + * The mft bitmap needs to be expanded until it covers the first unused + * mft record that we can allocate. + * Note: The smallest mft record we allocate is mft record 24. + */ + bit = old_data_initialized << 3; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = mftbmp_ni->allocated_size; + ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)old_data_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)old_data_initialized); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized + 8 > old_data_size) { + /* Need to extend bitmap by one more cluster. 
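+		 * For illustration, the bitmap tracks one bit per mft record, so +		 * 0x800 initialized bytes (old_data_initialized << 3) cover 0x4000 +		 * records; as initialized_size + 8 would exceed allocated_size here, +		 * the bitmap allocation itself has to grow below before it can be +		 * zero-extended.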
*/ + ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); + err = ntfs_mft_bitmap_extend_allocation_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + } + /* + * We now have sufficient allocated space, extend the initialized_size + * as well as the data_size if necessary and fill the new space with + * zeroes. + */ + err = ntfs_mft_bitmap_extend_initialized_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after initialized extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); +found_free_rec: + /* @bit is the found free mft record, allocate it in the mft bitmap. */ + ntfs_debug("At found_free_rec."); + err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap."); + up_write(&vol->mftbmp_lock); + goto err_out; + } + ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit); +have_alloc_rec: + /* + * The mft bitmap is now uptodate. Deal with mft data attribute now. + * Note, we keep hold of the mft bitmap lock for writing until all + * modifications to the mft data attribute are complete, too, as they + * will impact decisions for mft bitmap and mft record allocation done + * by a parallel allocation and if the lock is not maintained a + * parallel allocation could allocate the same mft record as this one. + */ + ll = (bit + 1) << vol->mft_record_size_bits; + read_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (ll <= old_data_initialized) { + ntfs_debug("Allocated mft record already initialized."); + goto mft_rec_already_initialized; + } + ntfs_debug("Initializing allocated mft record."); + /* + * The mft record is outside the initialized data. Extend the mft data + * attribute until it covers the allocated record. The loop is only + * actually traversed more than once when a freshly formatted volume is + * first written to so it optimizes away nicely in the common case. 
+ */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data before extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + while (ll > mft_ni->allocated_size) { + read_unlock_irqrestore(&mft_ni->size_lock, flags); + err = ntfs_mft_data_extend_allocation_nolock(vol); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to extend mft data " + "allocation."); + goto undo_mftbmp_alloc_nolock; + } + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + } + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* + * Extend mft data initialized size (and data size of course) to reach + * the allocated mft record, formatting the mft records allong the way. + * Note: We only modify the ntfs_inode structure as that is all that is + * needed by ntfs_mft_record_format(). We will update the attribute + * record itself in one fell swoop later on. + */ + write_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + old_data_size = vol->mft_ino->i_size; + while (ll > mft_ni->initialized_size) { + s64 new_initialized_size, mft_no; + + new_initialized_size = mft_ni->initialized_size + + vol->mft_record_size; + mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits; + if (new_initialized_size > i_size_read(vol->mft_ino)) + i_size_write(vol->mft_ino, new_initialized_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_debug("Initializing mft record 0x%llx.", + (long long)mft_no); + err = ntfs_mft_record_format(vol, mft_no); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to format mft record."); + goto undo_data_init; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = new_initialized_size; + } + write_unlock_irqrestore(&mft_ni->size_lock, flags); + record_formatted = true; + /* Update the mft data attribute record to reflect the new sizes. */ + m = map_mft_record(mft_ni); + if (IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to map mft record."); + err = PTR_ERR(m); + goto undo_data_init; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + err = -ENOMEM; + unmap_mft_record(mft_ni); + goto undo_data_init; + } + err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft data attribute."); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + goto undo_data_init; + } + a = ctx->attr; + read_lock_irqsave(&mft_ni->size_lock, flags); + a->data.non_resident.initialized_size = + cpu_to_sle64(mft_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. 
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after mft record initialization: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); + BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); +mft_rec_already_initialized: + /* + * We can finally drop the mft bitmap lock as the mft data attribute + * has been fully updated. The only disparity left is that the + * allocated mft record still needs to be marked as in use to match the + * set bit in the mft bitmap but this is actually not a problem since + * this mft record is not referenced from anywhere yet and the fact + * that it is allocated in the mft bitmap means that no-one will try to + * allocate it either. + */ + up_write(&vol->mftbmp_lock); + /* + * We now have allocated and initialized the mft record. Calculate the + * index of and the offset within the page cache page the record is in. + */ + index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; + ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(vol->mft_ino->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing allocated " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(page); + goto undo_mftbmp_alloc; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + /* If we just formatted the mft record no need to do it again. */ + if (!record_formatted) { + /* Sanity check that the mft record is really not in use. */ + if (ntfs_is_file_record(m->magic) && + (m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vol->sb, "Mft record 0x%llx was marked " + "free in mft bitmap but is marked " + "used itself. Corrupt filesystem. " + "Unmount and run chkdsk.", + (long long)bit); + err = -EIO; + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + NVolSetErrors(vol); + goto undo_mftbmp_alloc; + } + /* + * We need to (re-)format the mft record, preserving the + * sequence number if it is not zero as well as the update + * sequence number if it is not zero or -1 (0xffff). This + * means we do not need to care whether or not something went + * wrong with the previous mft record. + */ + seq_no = m->sequence_number; + usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)); + err = ntfs_mft_record_layout(vol, bit, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout allocated mft " + "record 0x%llx.", (long long)bit); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + if (seq_no) + m->sequence_number = seq_no; + if (usn && le16_to_cpu(usn) != 0xffff) + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn; + } + /* Set the mft record itself in use. */ + m->flags |= MFT_RECORD_IN_USE; + if (S_ISDIR(mode)) + m->flags |= MFT_RECORD_IS_DIRECTORY; + flush_dcache_page(page); + SetPageUptodate(page); + if (base_ni) { + MFT_RECORD *m_tmp; + + /* + * Setup the base mft record in the extent mft record. 
This + * completes initialization of the allocated extent mft record + * and we can simply use it with map_extent_mft_record(). + */ + m->base_mft_record = MK_LE_MREF(base_ni->mft_no, + base_ni->seq_no); + /* + * Allocate an extent inode structure for the new mft record, + * attach it to the base inode @base_ni and map, pin, and lock + * its, i.e. the allocated, mft record. + */ + m_tmp = map_extent_mft_record(base_ni, bit, &ni); + if (IS_ERR(m_tmp)) { + ntfs_error(vol->sb, "Failed to map allocated extent " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(m_tmp); + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + BUG_ON(m != m_tmp); + /* + * Make sure the allocated mft record is written out to disk. + * No need to set the inode dirty because the caller is going + * to do that anyway after finishing with the new extent mft + * record (e.g. at a minimum a new attribute will be added to + * the mft record. + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + /* + * Need to unmap the page since map_extent_mft_record() mapped + * it as well so we have it mapped twice at the moment. + */ + ntfs_unmap_page(page); + } else { + /* + * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink + * is set to 1 but the mft record->link_count is 0. The caller + * needs to bear this in mind. + */ + vi = new_inode(vol->sb); + if (unlikely(!vi)) { + err = -ENOMEM; + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + vi->i_ino = bit; + /* + * This is for checking whether an inode has changed w.r.t. a + * file so that the file can be updated if necessary (compare + * with f_version). + */ + vi->i_version = 1; + + /* The owner and group come from the ntfs volume. */ + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + /* + * Set the appropriate mode, attribute type, and name. For + * directories, also setup the index values to the defaults. + */ + if (S_ISDIR(mode)) { + vi->i_mode = S_IFDIR | S_IRWXUGO; + vi->i_mode &= ~vol->dmask; + + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + ni->itype.index.block_size = 4096; + ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1; + ni->itype.index.collation_rule = COLLATION_FILE_NAME; + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = + vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = + vol->sector_size_bits; + } + } else { + vi->i_mode = S_IFREG | S_IRWXUGO; + vi->i_mode &= ~vol->fmask; + + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + } + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + + /* Set the inode times to the current time. */ + vi->i_atime = vi->i_mtime = vi->i_ctime = + current_fs_time(vi->i_sb); + /* + * Set the file size to 0, the ntfs inode sizes are set to 0 by + * the call to ntfs_init_big_inode() below. 
+ */ + vi->i_size = 0; + vi->i_blocks = 0; + + /* Set the sequence number. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + /* + * Manually map, pin, and lock the mft record as we already + * have its page mapped and it is very easy to do. + */ + atomic_inc(&ni->count); + mutex_lock(&ni->mrec_lock); + ni->page = page; + ni->page_ofs = ofs; + /* + * Make sure the allocated mft record is written out to disk. + * NOTE: We do not set the ntfs inode dirty because this would + * fail in ntfs_write_inode() because the inode does not have a + * standard information attribute yet. Also, there is no need + * to set the inode dirty because the caller is going to do + * that anyway after finishing with the new mft record (e.g. at + * a minimum some new attributes will be added to the mft + * record. + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + + /* Add the inode to the inode hash for the superblock. */ + insert_inode_hash(vi); + + /* Update the default mft allocation position. */ + vol->mft_data_pos = bit + 1; + } + /* + * Return the opened, allocated inode of the allocated mft record as + * well as the mapped, pinned, and locked mft record. + */ + ntfs_debug("Returning opened, allocated %sinode 0x%llx.", + base_ni ? "extent " : "", (long long)bit); + *mrec = m; + return ni; +undo_data_init: + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = old_data_initialized; + i_size_write(vol->mft_ino, old_data_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + goto undo_mftbmp_alloc_nolock; +undo_mftbmp_alloc: + down_write(&vol->mftbmp_lock); +undo_mftbmp_alloc_nolock: + if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) { + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->mftbmp_lock); +err_out: + return ERR_PTR(err); +max_err_out: + ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum " + "number of inodes (2^32) has already been reached."); + up_write(&vol->mftbmp_lock); + return ERR_PTR(-ENOSPC); +} + +/** + * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume + * @ni: ntfs inode of the mapped extent mft record to free + * @m: mapped extent mft record of the ntfs inode @ni + * + * Free the mapped extent mft record @m of the extent ntfs inode @ni. + * + * Note that this function unmaps the mft record and closes and destroys @ni + * internally and hence you cannot use either @ni nor @m any more after this + * function returns success. + * + * On success return 0 and on error return -errno. @ni and @m are still valid + * in this case and have not been freed. + * + * For some errors an error message is displayed and the success code 0 is + * returned and the volume is then left dirty on umount. This makes sense in + * case we could not rollback the changes that were already done since the + * caller no longer wants to reference this mft record so it does not matter to + * the caller if something is wrong with it as long as it is properly detached + * from the base inode. 
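+ * + * For illustration, an extent mft record freed with sequence number 5 is + * written back with sequence number 6, a record at 0xffff wraps around to 1, + * and a zero sequence number is left untouched, matching the update done below + * before the record is written out and its bit is cleared in $MFT/$BITMAP.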
+ */ +int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) +{ + unsigned long mft_no = ni->mft_no; + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + ntfs_inode **extent_nis; + int i, err; + le16 old_seq_no; + u16 seq_no; + + BUG_ON(NInoAttr(ni)); + BUG_ON(ni->nr_extents != -1); + + mutex_lock(&ni->extent_lock); + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + + BUG_ON(base_ni->nr_extents <= 0); + + ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n", + mft_no, base_ni->mft_no); + + mutex_lock(&base_ni->extent_lock); + + /* Make sure we are holding the only reference to the extent inode. */ + if (atomic_read(&ni->count) > 2) { + ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, " + "not freeing.", base_ni->mft_no); + mutex_unlock(&base_ni->extent_lock); + return -EBUSY; + } + + /* Dissociate the ntfs inode from the base inode. */ + extent_nis = base_ni->ext.extent_ntfs_inos; + err = -ENOENT; + for (i = 0; i < base_ni->nr_extents; i++) { + if (ni != extent_nis[i]) + continue; + extent_nis += i; + base_ni->nr_extents--; + memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) * + sizeof(ntfs_inode*)); + err = 0; + break; + } + + mutex_unlock(&base_ni->extent_lock); + + if (unlikely(err)) { + ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to " + "its base inode 0x%lx.", mft_no, + base_ni->mft_no); + BUG(); + } + + /* + * The extent inode is no longer attached to the base inode so no one + * can get a reference to it any more. + */ + + /* Mark the mft record as not in use. */ + m->flags &= ~MFT_RECORD_IN_USE; + + /* Increment the sequence number, skipping zero, if it is not zero. */ + old_seq_no = m->sequence_number; + seq_no = le16_to_cpu(old_seq_no); + if (seq_no == 0xffff) + seq_no = 1; + else if (seq_no) + seq_no++; + m->sequence_number = cpu_to_le16(seq_no); + + /* + * Set the ntfs inode dirty and write it out. We do not need to worry + * about the base inode here since whatever caused the extent mft + * record to be freed is guaranteed to do it already. + */ + NInoSetDirty(ni); + err = write_mft_record(ni, m, 0); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not " + "freeing.", mft_no); + goto rollback; + } +rollback_error: + /* Unmap and throw away the now freed extent inode. */ + unmap_extent_mft_record(ni); + ntfs_clear_extent_inode(ni); + + /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */ + down_write(&vol->mftbmp_lock); + err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no); + up_write(&vol->mftbmp_lock); + if (unlikely(err)) { + /* + * The extent inode is gone but we failed to deallocate it in + * the mft bitmap. Just emit a warning and leave the volume + * dirty on umount. + */ + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + return 0; +rollback: + /* Rollback what we did... 
*/ + mutex_lock(&base_ni->extent_lock); + extent_nis = base_ni->ext.extent_ntfs_inos; + if (!(base_ni->nr_extents & 3)) { + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); + + extent_nis = kmalloc(new_size, GFP_NOFS); + if (unlikely(!extent_nis)) { + ntfs_error(vol->sb, "Failed to allocate internal " + "buffer during rollback.%s", es); + mutex_unlock(&base_ni->extent_lock); + NVolSetErrors(vol); + goto rollback_error; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(extent_nis, base_ni->ext.extent_ntfs_inos, + new_size - 4 * sizeof(ntfs_inode*)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = extent_nis; + } + m->flags |= MFT_RECORD_IN_USE; + m->sequence_number = old_seq_no; + extent_nis[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + mark_mft_record_dirty(ni); + return err; +} +#endif /* NTFS_RW */ diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h new file mode 100644 index 00000000..b52bf87b --- /dev/null +++ b/fs/ntfs/mft.h @@ -0,0 +1,124 @@ +/* + * mft.h - Defines for mft record handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_MFT_H +#define _LINUX_NTFS_MFT_H + +#include +#include +#include + +#include "inode.h" + +extern MFT_RECORD *map_mft_record(ntfs_inode *ni); +extern void unmap_mft_record(ntfs_inode *ni); + +extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino); + +static inline void unmap_extent_mft_record(ntfs_inode *ni) +{ + unmap_mft_record(ni); + return; +} + +#ifdef NTFS_RW + +/** + * flush_dcache_mft_record_page - flush_dcache_page() for mft records + * @ni: ntfs inode structure of mft record + * + * Call flush_dcache_page() for the page in which an mft record resides. + * + * This must be called every time an mft record is modified, just after the + * modification. + */ +static inline void flush_dcache_mft_record_page(ntfs_inode *ni) +{ + flush_dcache_page(ni->page); +} + +extern void __mark_mft_record_dirty(ntfs_inode *ni); + +/** + * mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: Do not do anything if the mft record is already marked dirty. 
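+ * + * The usual pattern in mft.c is to modify the mapped mft record and then call + * flush_dcache_mft_record_page() followed by mark_mft_record_dirty() so that + * the change makes it to disk.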
+ */ +static inline void mark_mft_record_dirty(ntfs_inode *ni) +{ + if (!NInoTestSetDirty(ni)) + __mark_mft_record_dirty(ni); +} + +extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync); + +extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); + +/** + * write_mft_record - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * This is just a wrapper for write_mft_record_nolock() (see mft.c), which + * locks the page for the duration of the write. This ensures that there are + * no race conditions between writing the mft record via the dirty inode code + * paths and via the page cache write back code paths or between writing + * neighbouring mft records residing in the same page. + * + * Locking the page also serializes us against ->readpage() if the page is not + * uptodate. + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + */ +static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + struct page *page = ni->page; + int err; + + BUG_ON(!page); + lock_page(page); + err = write_mft_record_nolock(ni, m, sync); + unlock_page(page); + return err; +} + +extern bool ntfs_may_write_mft_record(ntfs_volume *vol, + const unsigned long mft_no, const MFT_RECORD *m, + ntfs_inode **locked_ni); + +extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec); +extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_MFT_H */ diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c new file mode 100644 index 00000000..5a858d83 --- /dev/null +++ b/fs/ntfs/mst.c @@ -0,0 +1,203 @@ +/* + * mst.c - NTFS multi sector transfer protection handling code. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "ntfs.h" + +/** + * post_read_mst_fixup - deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * @size: size in bytes of @b + * + * Perform the necessary post read multi sector transfer fixup and detect the + * presence of incomplete multi sector transfers. - In that case, overwrite the + * magic of the ntfs record header being processed with "BAAD" (in memory only!) + * and abort processing. + * + * Return 0 on success and -EINVAL on error ("BAAD" magic will be present). 
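+ * + * For illustration, a 1024-byte mft record is protected as two 512-byte NTFS + * blocks: usa_count is 3 (the update sequence number plus one fixup per + * block), on disk the last le16 of each block holds the usn and the displaced + * values live in the update sequence array, so deprotection verifies both + * trailing words against the usn and then restores the saved words.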
+ * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not protected at all and hence doesn't need to + * be fixed up. Thus, we return success and not failure in this case. This is + * in contrast to pre_write_mst_fixup(), see below. + */ +int post_read_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + u16 usa_ofs, usa_count, usn; + u16 *usa_pos, *data_pos; + + /* Setup the variables. */ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return 0; + /* Position of usn in update sequence array. */ + usa_pos = (u16*)b + usa_ofs/sizeof(u16); + /* + * The update sequence number which has to be equal to each of the + * u16 values before they are fixed up. Note no need to care for + * endianness since we are comparing and moving data for on disk + * structures which means the data is consistent. - If it is + * consistenty the wrong endianness it doesn't make any difference. + */ + usn = *usa_pos; + /* + * Position in protected data of first u16 that needs fixing up. + */ + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* + * Check for incomplete multi sector transfer(s). + */ + while (usa_count--) { + if (*data_pos != usn) { + /* + * Incomplete multi sector transfer detected! )-: + * Set the magic to "BAAD" and return failure. + * Note that magic_BAAD is already converted to le32. + */ + b->magic = magic_BAAD; + return -EINVAL; + } + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + /* Re-setup the variables. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + return 0; +} + +/** + * pre_write_mst_fixup - apply multi sector transfer protection + * @b: pointer to the data to protect + * @size: size in bytes of @b + * + * Perform the necessary pre write multi sector transfer fixup on the data + * pointer to by @b of @size. + * + * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed + * (assumed not needed). This is in contrast to post_read_mst_fixup() above. + * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not subject to protection and hence doesn't need + * to be fixed up. This means that you have to create a valid update sequence + * array header in the ntfs record before calling this function, otherwise it + * will fail (the header needs to contain the position of the update sequence + * array together with the number of elements in the array). You also need to + * initialise the update sequence number before calling this function + * otherwise a random word will be used (whatever was in the record at that + * position at that time). + */ +int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + le16 *usa_pos, *data_pos; + u16 usa_ofs, usa_count, usn; + le16 le_usn; + + /* Sanity check + only fixup if it makes sense. */ + if (!b || ntfs_is_baad_record(b->magic) || + ntfs_is_hole_record(b->magic)) + return -EINVAL; + /* Setup the variables. 
*/ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return -EINVAL; + /* Position of usn in update sequence array. */ + usa_pos = (le16*)((u8*)b + usa_ofs); + /* + * Cyclically increment the update sequence number + * (skipping 0 and -1, i.e. 0xffff). + */ + usn = le16_to_cpup(usa_pos) + 1; + if (usn == 0xffff || !usn) + usn = 1; + le_usn = cpu_to_le16(usn); + *usa_pos = le_usn; + /* Position in data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment the position in the usa and save the + * original data from the data buffer into the usa. + */ + *(++usa_pos) = *data_pos; + /* Apply fixup to data. */ + *data_pos = le_usn; + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } + return 0; +} + +/** + * post_write_mst_fixup - fast deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * + * Perform the necessary post write multi sector transfer fixup, not checking + * for any errors, because we assume we have just used pre_write_mst_fixup(), + * thus the data will be fine or we would never have gotten here. + */ +void post_write_mst_fixup(NTFS_RECORD *b) +{ + le16 *usa_pos, *data_pos; + + u16 usa_ofs = le16_to_cpu(b->usa_ofs); + u16 usa_count = le16_to_cpu(b->usa_count) - 1; + + /* Position of usn in update sequence array. */ + usa_pos = (le16*)b + usa_ofs/sizeof(le16); + + /* Position in protected data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } +} diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c new file mode 100644 index 00000000..358273e5 --- /dev/null +++ b/fs/ntfs/namei.c @@ -0,0 +1,405 @@ +/* + * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include + +#include "attrib.h" +#include "debug.h" +#include "dir.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_lookup - find the inode represented by a dentry in a directory inode + * @dir_ino: directory inode in which to look for the inode + * @dent: dentry representing the inode to look for + * @nd: lookup nameidata + * + * In short, ntfs_lookup() looks for the inode represented by the dentry @dent + * in the directory inode @dir_ino and if found attaches the inode to the + * dentry @dent. + * + * In more detail, the dentry @dent specifies which inode to look for by + * supplying the name of the inode in @dent->d_name.name. ntfs_lookup() + * converts the name to Unicode and walks the contents of the directory inode + * @dir_ino looking for the converted Unicode name. If the name is found in the + * directory, the corresponding inode is loaded by calling ntfs_iget() on its + * inode number and the inode is associated with the dentry @dent via a call to + * d_splice_alias(). + * + * If the name is not found in the directory, a NULL inode is inserted into the + * dentry @dent via a call to d_add(). The dentry is then termed a negative + * dentry. + * + * Only if an actual error occurs, do we return an error via ERR_PTR(). + * + * In order to handle the case insensitivity issues of NTFS with regards to the + * dcache and the dcache requiring only one dentry per directory, we deal with + * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining + * a case sensitive dcache. This means that we get the full benefit of dcache + * speed when the file/directory is looked up with the same case as returned by + * ->ntfs_readdir() but that a lookup for any other case (or for the short file + * name) will not find anything in dcache and will enter ->ntfs_lookup() + * instead, where we search the directory for a fully matching file name + * (including case) and if that is not found, we search for a file name that + * matches with different case and if that has non-POSIX semantics we return + * that. We actually do only one search (case sensitive) and keep tabs on + * whether we have found a case insensitive match in the process. + * + * To simplify matters for us, we do not treat the short vs long filenames as + * two hard links but instead if the lookup matches a short filename, we + * return the dentry for the corresponding long filename instead. + * + * There are three cases we need to distinguish here: + * + * 1) @dent perfectly matches (i.e. including case) a directory entry with a + * file name in the WIN32 or POSIX namespaces. In this case + * ntfs_lookup_inode_by_name() will return with name set to NULL and we + * just d_splice_alias() @dent. + * 2) @dent matches (not including case) a directory entry with a file name in + * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return + * with name set to point to a kmalloc()ed ntfs_name structure containing + * the properly cased little endian Unicode name. We convert the name to the + * current NLS code page, search if a dentry with this name already exists + * and if so return that instead of @dent. 
At this point things are + * complicated by the possibility of 'disconnected' dentries due to NFS + * which we deal with appropriately (see the code comments). The VFS will + * then destroy the old @dent and use the one we returned. If a dentry is + * not found, we allocate a new one, d_splice_alias() it, and return it as + * above. + * 3) @dent matches either perfectly or not (i.e. we don't care about case) a + * directory entry with a file name in the DOS namespace. In this case + * ntfs_lookup_inode_by_name() will return with name set to point to a + * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian) + * of the inode. We use the mft reference to read the inode and to find the + * file name in the WIN32 namespace corresponding to the matched short file + * name. We then convert the name to the current NLS code page, and proceed + * searching for a dentry with this name, etc, as in case 2), above. + * + * Locking: Caller must hold i_mutex on the directory. + */ +static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, + struct nameidata *nd) +{ + ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); + struct inode *dent_inode; + ntfschar *uname; + ntfs_name *name = NULL; + MFT_REF mref; + unsigned long dent_ino; + int uname_len; + + ntfs_debug("Looking up %s in directory inode 0x%lx.", + dent->d_name.name, dir_ino->i_ino); + /* Convert the name of the dentry to Unicode. */ + uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len, + &uname); + if (uname_len < 0) { + if (uname_len != -ENAMETOOLONG) + ntfs_error(vol->sb, "Failed to convert name to " + "Unicode."); + return ERR_PTR(uname_len); + } + mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len, + &name); + kmem_cache_free(ntfs_name_cache, uname); + if (!IS_ERR_MREF(mref)) { + dent_ino = MREF(mref); + ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); + dent_inode = ntfs_iget(vol->sb, dent_ino); + if (likely(!IS_ERR(dent_inode))) { + /* Consistency check. */ + if (is_bad_inode(dent_inode) || MSEQNO(mref) == + NTFS_I(dent_inode)->seq_no || + dent_ino == FILE_MFT) { + /* Perfect WIN32/POSIX match. -- Case 1. */ + if (!name) { + ntfs_debug("Done. (Case 1.)"); + return d_splice_alias(dent_inode, dent); + } + /* + * We are too indented. Handle imperfect + * matches and short file names further below. + */ + goto handle_name; + } + ntfs_error(vol->sb, "Found stale reference to inode " + "0x%lx (reference sequence number = " + "0x%x, inode sequence number = 0x%x), " + "returning -EIO. Run chkdsk.", + dent_ino, MSEQNO(mref), + NTFS_I(dent_inode)->seq_no); + iput(dent_inode); + dent_inode = ERR_PTR(-EIO); + } else + ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with " + "error code %li.", dent_ino, + PTR_ERR(dent_inode)); + kfree(name); + /* Return the error code. */ + return (struct dentry *)dent_inode; + } + /* It is guaranteed that @name is no longer allocated at this point. */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("Entry was not found, adding negative dentry."); + /* The dcache will handle negative entries. */ + d_add(dent, NULL); + ntfs_debug("Done."); + return NULL; + } + ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error " + "code %i.", -MREF_ERR(mref)); + return ERR_PTR(MREF_ERR(mref)); + // TODO: Consider moving this lot to a separate function! 
(AIA) +handle_name: + { + MFT_RECORD *m; + ntfs_attr_search_ctx *ctx; + ntfs_inode *ni = NTFS_I(dent_inode); + int err; + struct qstr nls_name; + + nls_name.name = NULL; + if (name->type != FILE_NAME_DOS) { /* Case 2. */ + ntfs_debug("Case 2."); + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&name->name, name->len, + (unsigned char**)&nls_name.name, 0); + kfree(name); + } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */ + FILE_NAME_ATTR *fn; + + ntfs_debug("Case 3."); + kfree(name); + + /* Find the WIN32 name corresponding to the matched DOS name. */ + ni = NTFS_I(dent_inode); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + do { + ATTR_RECORD *a; + u32 val_len; + + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, + NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Inode corrupt: No WIN32 " + "namespace counterpart to DOS " + "file name. Run chkdsk."); + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + /* Consistency checks. */ + a = ctx->attr; + if (a->non_resident || a->flags) + goto eio_err_out; + val_len = le32_to_cpu(a->data.resident.value_length); + if (le16_to_cpu(a->data.resident.value_offset) + + val_len > le32_to_cpu(a->length)) + goto eio_err_out; + fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu( + ctx->attr->data.resident.value_offset)); + if ((u32)(fn->file_name_length * sizeof(ntfschar) + + sizeof(FILE_NAME_ATTR)) > val_len) + goto eio_err_out; + } while (fn->file_name_type != FILE_NAME_WIN32); + + /* Convert the found WIN32 name to current NLS code page. */ + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&fn->file_name, fn->file_name_length, + (unsigned char**)&nls_name.name, 0); + + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + } + m = NULL; + ctx = NULL; + + /* Check if a conversion error occurred. */ + if ((signed)nls_name.len < 0) { + err = (signed)nls_name.len; + goto err_out; + } + nls_name.hash = full_name_hash(nls_name.name, nls_name.len); + + dent = d_add_ci(dent, dent_inode, &nls_name); + kfree(nls_name.name); + return dent; + +eio_err_out: + ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); + err = -EIO; +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ni); + iput(dent_inode); + ntfs_error(vol->sb, "Failed, returning error code %i.", err); + return ERR_PTR(err); + } +} + +/** + * Inode operations for directories. + */ +const struct inode_operations ntfs_dir_inode_ops = { + .lookup = ntfs_lookup, /* VFS: Lookup directory. */ +}; + +/** + * ntfs_get_parent - find the dentry of the parent of a given directory dentry + * @child_dent: dentry of the directory whose parent directory to find + * + * Find the dentry for the parent directory of the directory specified by the + * dentry @child_dent. This function is called from + * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the + * default ->decode_fh() which is export_decode_fh() in the same file. + * + * The code is based on the ext3 ->get_parent() implementation found in + * fs/ext3/namei.c::ext3_get_parent(). + * + * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down. + * + * Return the dentry of the parent directory on success or the error code on + * error (IS_ERR() is true). 
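The MREF()/MSEQNO() helpers used in ntfs_lookup() above, and MREF_LE() used in ntfs_get_parent() below, unpack an NTFS file reference: the mft record number sits in the low 48 bits and the sequence number in the high 16 bits. A minimal userspace sketch with illustrative macro names (the driver's real definitions live in layout.h):

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the driver's MK_MREF()/MREF()/MSEQNO() macros. */
#define MREF_MASK	0x0000ffffffffffffULL
#define MK_MREF_X(m, s)	(((uint64_t)(s) << 48) | ((uint64_t)(m) & MREF_MASK))
#define MREF_X(x)	((unsigned long)((x) & MREF_MASK))
#define MSEQNO_X(x)	((uint16_t)((x) >> 48))

int main(void)
{
	uint64_t mref = MK_MREF_X(0x1234, 5);	/* record 0x1234, sequence number 5 */

	printf("mft record 0x%lx, seq_no %u\n",
			MREF_X(mref), (unsigned)MSEQNO_X(mref));
	return 0;
}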
+ */ +static struct dentry *ntfs_get_parent(struct dentry *child_dent) +{ + struct inode *vi = child_dent->d_inode; + ntfs_inode *ni = NTFS_I(vi); + MFT_RECORD *mrec; + ntfs_attr_search_ctx *ctx; + ATTR_RECORD *attr; + FILE_NAME_ATTR *fn; + unsigned long parent_ino; + int err; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + /* Get the mft record of the inode belonging to the child dentry. */ + mrec = map_mft_record(ni); + if (IS_ERR(mrec)) + return (struct dentry *)mrec; + /* Find the first file name attribute in the mft record. */ + ctx = ntfs_attr_get_search_ctx(ni, mrec); + if (unlikely(!ctx)) { + unmap_mft_record(ni); + return ERR_PTR(-ENOMEM); + } +try_next: + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + if (err == -ENOENT) + ntfs_error(vi->i_sb, "Inode 0x%lx does not have a " + "file name attribute. Run chkdsk.", + vi->i_ino); + return ERR_PTR(err); + } + attr = ctx->attr; + if (unlikely(attr->non_resident)) + goto try_next; + fn = (FILE_NAME_ATTR *)((u8 *)attr + + le16_to_cpu(attr->data.resident.value_offset)); + if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) > + (u8*)attr + le32_to_cpu(attr->length))) + goto try_next; + /* Get the inode number of the parent directory. */ + parent_ino = MREF_LE(fn->parent_directory); + /* Release the search context and the mft record of the child. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + + return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino)); +} + +static struct inode *ntfs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + inode = ntfs_iget(sb, ino); + if (!IS_ERR(inode)) { + if (is_bad_inode(inode) || inode->i_generation != generation) { + iput(inode); + inode = ERR_PTR(-ESTALE); + } + } + + return inode; +} + +static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +/** + * Export operations allowing NFS exporting of mounted NTFS partitions. + * + * We use the default ->encode_fh() for now. Note that they + * use 32 bits to store the inode number which is an unsigned long so on 64-bit + * architectures is usually 64 bits so it would all fail horribly on huge + * volumes. I guess we need to define our own encode and decode fh functions + * that store 64-bit inode numbers at some point but for now we will ignore the + * problem... + * + * We also use the default ->get_name() helper (used by ->decode_fh() via + * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs + * independent. + * + * The default ->get_parent() just returns -EACCES so we have to provide our + * own and the default ->get_dentry() is incompatible with NTFS due to not + * allowing the inode number 0 which is used in NTFS for the system file $MFT + * and due to using iget() whereas NTFS needs ntfs_iget(). + */ +const struct export_operations ntfs_export_ops = { + .get_parent = ntfs_get_parent, /* Find the parent of a given + directory. 
*/ + .fh_to_dentry = ntfs_fh_to_dentry, + .fh_to_parent = ntfs_fh_to_parent, +}; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h new file mode 100644 index 00000000..d6a340bf --- /dev/null +++ b/fs/ntfs/ntfs.h @@ -0,0 +1,164 @@ +/* + * ntfs.h - Defines for NTFS Linux kernel driver. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (C) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_H +#define _LINUX_NTFS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "volume.h" +#include "layout.h" + +typedef enum { + NTFS_BLOCK_SIZE = 512, + NTFS_BLOCK_SIZE_BITS = 9, + NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ + NTFS_MAX_NAME_LEN = 255, + NTFS_MAX_ATTR_NAME_LEN = 255, + NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ + NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE, +} NTFS_CONSTANTS; + +/* Global variables. */ + +/* Slab caches (from super.c). */ +extern struct kmem_cache *ntfs_name_cache; +extern struct kmem_cache *ntfs_inode_cache; +extern struct kmem_cache *ntfs_big_inode_cache; +extern struct kmem_cache *ntfs_attr_ctx_cache; +extern struct kmem_cache *ntfs_index_ctx_cache; + +/* The various operations structs defined throughout the driver files. */ +extern const struct address_space_operations ntfs_aops; +extern const struct address_space_operations ntfs_mst_aops; + +extern const struct file_operations ntfs_file_ops; +extern const struct inode_operations ntfs_file_inode_ops; + +extern const struct file_operations ntfs_dir_ops; +extern const struct inode_operations ntfs_dir_inode_ops; + +extern const struct file_operations ntfs_empty_file_ops; +extern const struct inode_operations ntfs_empty_inode_ops; + +extern const struct export_operations ntfs_export_ops; + +/** + * NTFS_SB - return the ntfs volume given a vfs super block + * @sb: VFS super block + * + * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb. + */ +static inline ntfs_volume *NTFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* Declarations of functions and global variables. 
*/ + +/* From fs/ntfs/compress.c */ +extern int ntfs_read_compressed_block(struct page *page); +extern int allocate_compression_buffers(void); +extern void free_compression_buffers(void); + +/* From fs/ntfs/super.c */ +#define default_upcase_len 0x10000 +extern struct mutex ntfs_lock; + +typedef struct { + int val; + char *str; +} option_t; +extern const option_t on_errors_arr[]; + +/* From fs/ntfs/mst.c */ +extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size); +extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size); +extern void post_write_mst_fixup(NTFS_RECORD *b); + +/* From fs/ntfs/unistr.c */ +extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, + const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size); +extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n); +extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size); +extern void ntfs_upcase_name(ntfschar *name, u32 name_len, + const ntfschar *upcase, const u32 upcase_len); +extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs); +extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len); + +/* From fs/ntfs/upcase.c */ +extern ntfschar *generate_default_upcase(void); + +static inline int ntfs_ffs(int x) +{ + int r = 1; + + if (!x) + return 0; + if (!(x & 0xffff)) { + x >>= 16; + r += 16; + } + if (!(x & 0xff)) { + x >>= 8; + r += 8; + } + if (!(x & 0xf)) { + x >>= 4; + r += 4; + } + if (!(x & 3)) { + x >>= 2; + r += 2; + } + if (!(x & 1)) { + x >>= 1; + r += 1; + } + return r; +} + +#endif /* _LINUX_NTFS_H */ diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c new file mode 100644 index 00000000..d80e3315 --- /dev/null +++ b/fs/ntfs/quota.c @@ -0,0 +1,117 @@ +/* + * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include "index.h" +#include "quota.h" +#include "debug.h" +#include "ntfs.h" + +/** + * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume + * @vol: ntfs volume on which to mark the quotas out of date + * + * Mark the quotas out of date on the ntfs volume @vol and return 'true' on + * success and 'false' on error. + */ +bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) +{ + ntfs_index_context *ictx; + QUOTA_CONTROL_ENTRY *qce; + const le32 qid = QUOTA_DEFAULTS_ID; + int err; + + ntfs_debug("Entering."); + if (NVolQuotaOutOfDate(vol)) + goto done; + if (!vol->quota_ino || !vol->quota_q_ino) { + ntfs_error(vol->sb, "Quota inodes are not open."); + return false; + } + mutex_lock(&vol->quota_q_ino->i_mutex); + ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); + if (!ictx) { + ntfs_error(vol->sb, "Failed to get index context."); + goto err_out; + } + err = ntfs_index_lookup(&qid, sizeof(qid), ictx); + if (err) { + if (err == -ENOENT) + ntfs_error(vol->sb, "Quota defaults entry is not " + "present."); + else + ntfs_error(vol->sb, "Lookup of quota defaults entry " + "failed."); + goto err_out; + } + if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) { + ntfs_error(vol->sb, "Quota defaults entry size is invalid. " + "Run chkdsk."); + goto err_out; + } + qce = (QUOTA_CONTROL_ENTRY*)ictx->data; + if (le32_to_cpu(qce->version) != QUOTA_VERSION) { + ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not " + "supported.", le32_to_cpu(qce->version)); + goto err_out; + } + ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags)); + /* If quotas are already marked out of date, no need to do anything. */ + if (qce->flags & QUOTA_FLAG_OUT_OF_DATE) + goto set_done; + /* + * If quota tracking is neither requested, nor enabled and there are no + * pending deletes, no need to mark the quotas out of date. + */ + if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED | + QUOTA_FLAG_TRACKING_REQUESTED | + QUOTA_FLAG_PENDING_DELETES))) + goto set_done; + /* + * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date. + * This is verified on WinXP to be sufficient to cause windows to + * rescan the volume on boot and update all quota entries. + */ + qce->flags |= QUOTA_FLAG_OUT_OF_DATE; + /* Ensure the modified flags are written to disk. */ + ntfs_index_entry_flush_dcache_page(ictx); + ntfs_index_entry_mark_dirty(ictx); +set_done: + ntfs_index_ctx_put(ictx); + mutex_unlock(&vol->quota_q_ino->i_mutex); + /* + * We set the flag so we do not try to mark the quotas out of date + * again on remount. + */ + NVolSetQuotaOutOfDate(vol); +done: + ntfs_debug("Done."); + return true; +err_out: + if (ictx) + ntfs_index_ctx_put(ictx); + mutex_unlock(&vol->quota_q_ino->i_mutex); + return false; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h new file mode 100644 index 00000000..4cbe5594 --- /dev/null +++ b/fs/ntfs/quota.h @@ -0,0 +1,35 @@ +/* + * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the + * Linux-NTFS project. 
+ * + * Copyright (c) 2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_QUOTA_H +#define _LINUX_NTFS_QUOTA_H + +#ifdef NTFS_RW + +#include "types.h" +#include "volume.h" + +extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_QUOTA_H */ diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c new file mode 100644 index 00000000..eac7d678 --- /dev/null +++ b/fs/ntfs/runlist.c @@ -0,0 +1,1907 @@ +/** + * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002-2005 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "debug.h" +#include "dir.h" +#include "endian.h" +#include "malloc.h" +#include "ntfs.h" + +/** + * ntfs_rl_mm - runlist memmove + * + * It is up to the caller to serialize access to the runlist @base. + */ +static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, + int size) +{ + if (likely((dst != src) && (size > 0))) + memmove(base + dst, base + src, size * sizeof(*base)); +} + +/** + * ntfs_rl_mc - runlist memory copy + * + * It is up to the caller to serialize access to the runlists @dstbase and + * @srcbase. + */ +static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, + runlist_element *srcbase, int src, int size) +{ + if (likely(size > 0)) + memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase)); +} + +/** + * ntfs_rl_realloc - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * It is up to the caller to serialize access to the runlist @rl. 
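For readers new to the structure the helpers below manipulate: a runlist is an array of (vcn, lcn, length) triples, one per run, terminated by an element whose length is zero and whose lcn is one of the negative special values. A simplified, self-contained sketch follows; the types and constants are placeholders, the real definitions live in runlist.h.

#include <stdio.h>

/* Placeholder types/constants; the driver's are in runlist.h (LCN_HOLE etc.). */
typedef long long VCN_X;
typedef long long LCN_X;
#define LCN_HOLE_X	(-1LL)	/* sparse run, no clusters allocated */
#define LCN_ENOENT_X	(-3LL)	/* terminator: no such vcn */

struct rl_x {
	VCN_X vcn;		/* first virtual cluster of the run */
	LCN_X lcn;		/* first logical cluster, or a special value */
	long long length;	/* run length in clusters, 0 terminates */
};

int main(void)
{
	/* 8 allocated clusters, a 4 cluster hole, 4 more clusters, terminator. */
	struct rl_x rl[] = {
		{ 0, 100, 8 },
		{ 8, LCN_HOLE_X, 4 },
		{ 12, 200, 4 },
		{ 16, LCN_ENOENT_X, 0 },
	};
	VCN_X vcn = 13;
	int i;

	/* Same walk as ntfs_rl_vcn_to_lcn(): find the run containing @vcn. */
	for (i = 0; rl[i].length; i++)
		if (vcn < rl[i + 1].vcn)
			break;
	if (rl[i].length && rl[i].lcn >= 0)
		printf("vcn %lld maps to lcn %lld\n", vcn,
				rl[i].lcn + (vcn - rl[i].vcn));
	return 0;
}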
+ * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs(new_size); + if (unlikely(!new_rl)) + return ERR_PTR(-ENOMEM); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_rl_realloc_nofail - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs_nofail(new_size); + BUG_ON(!new_rl); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_are_rl_mergeable - test if two runlists can be joined together + * @dst: original runlist + * @src: new runlist to test for mergeability with @dst + * + * Test if two runlists can be joined together. For this, their VCNs and LCNs + * must be adjacent. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * Return: true Success, the runlists can be merged. + * false Failure, the runlists cannot be merged. + */ +static inline bool ntfs_are_rl_mergeable(runlist_element *dst, + runlist_element *src) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* We can merge unmapped regions even if they are misaligned. */ + if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED)) + return true; + /* If the runs are misaligned, we cannot merge them. */ + if ((dst->vcn + dst->length) != src->vcn) + return false; + /* If both runs are non-sparse and contiguous, we can merge them. 
*/ + if ((dst->lcn >= 0) && (src->lcn >= 0) && + ((dst->lcn + dst->length) == src->lcn)) + return true; + /* If we are merging two holes, we can merge them. */ + if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE)) + return true; + /* Cannot merge. */ + return false; +} + +/** + * __ntfs_rl_merge - merge two runlists without testing if they can be merged + * @dst: original, destination runlist + * @src: new runlist to merge with @dst + * + * Merge the two runlists, writing into the destination runlist @dst. The + * caller must make sure the runlists can be merged or this will corrupt the + * destination runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + */ +static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) +{ + dst->length += src->length; +} + +/** + * ntfs_rl_append - append a runlist after a given element + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: runlist to be inserted into @dst + * @ssize: number of elements in @src (excluding end marker) + * @loc: append the new runlist @src after this element in @dst + * + * Append the runlist @src after element @loc in @dst. Merge the right end of + * the new runlist, if necessary. Adjust the size of the hole before the + * appended runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_append(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool right = false; /* Right end of @src needs merging. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, check if the right hand end needs merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + + /* Space required: @dst size + @src size, less one if we merged. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* First, merge the right hand end, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + + /* First run after the @src runs that have been inserted. */ + marker = loc + ssize + 1; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right)); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the preceding hole. 
*/ + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + + /* We may have changed the length of the file, so fix the end marker */ + if (dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + + return dst; +} + +/** + * ntfs_rl_insert - insert a runlist into another + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: insert the new runlist @src before this element in @dst + * + * Insert the runlist @src before element @loc in the runlist @dst. Merge the + * left end of the new runlist, if necessary. Adjust the size of the hole + * after the inserted runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_insert(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool left = false; /* Left end of @src needs merging. */ + bool disc = false; /* Discontinuity between @dst and @src. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* + * disc => Discontinuity between the end of @dst and the start of @src. + * This means we might need to insert a "not mapped" run. + */ + if (loc == 0) + disc = (src[0].vcn > 0); + else { + s64 merged_length; + + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + + merged_length = dst[loc - 1].length; + if (left) + merged_length += src->length; + + disc = (src[0].vcn > dst[loc - 1].vcn + merged_length); + } + /* + * Space required: @dst size + @src size, less one if we merged, plus + * one if there was a discontinuity. + */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlist. + */ + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * First run after the @src runs that have been inserted. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. And if @disc, then @dst and @src do + * not meet and we need an extra run to fill the gap. + */ + marker = loc + ssize - left + disc; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc, dsize - loc); + ntfs_rl_mc(dst, loc + disc, src, left, ssize - left); + + /* Adjust the VCN of the first run after the insertion... */ + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + /* ... and the length. */ + if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED) + dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn; + + /* Writing beyond the end of the file and there is a discontinuity. 
*/ + if (disc) { + if (loc > 0) { + dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length; + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + } else { + dst[loc].vcn = 0; + dst[loc].length = dst[loc + 1].vcn; + } + dst[loc].lcn = LCN_RL_NOT_MAPPED; + } + return dst; +} + +/** + * ntfs_rl_replace - overwrite a runlist element with another runlist + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst to overwrite with @src + * + * Replace the runlist element @dst at @loc with @src. Merge the left and + * right ends of the inserted runlist, if necessary. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_replace(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + signed delta; + bool left = false; /* Left end of @src needs merging. */ + bool right = false; /* Right end of @src needs merging. */ + int tail; /* Start of tail of @dst. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, see if the left and right ends need merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + if (loc > 0) + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + /* + * Allocate some space. We will need less if the left, right, or both + * ends get merged. The -1 accounts for the run being replaced. + */ + delta = ssize - 1 - left - right; + if (delta > 0) { + dst = ntfs_rl_realloc(dst, dsize, dsize + delta); + if (IS_ERR(dst)) + return dst; + } + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* First, merge the left and right ends, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * Offset of the tail of @dst. This needs to be moved out of the way + * to make space for the runs to be copied from @src, i.e. the first + * run of the tail of @dst. + * Nominally, @tail equals @loc + 1, i.e. location, skipping the + * replaced run. However, if @right, then one of @dst's runs is + * already merged into @src. + */ + tail = loc + right + 1; + /* + * First run after the @src runs that have been inserted, i.e. where + * the tail of @dst needs to be moved to. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. + */ + marker = loc + ssize - left; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, tail, dsize - tail); + ntfs_rl_mc(dst, loc, src, left, ssize - left); + + /* We may have changed the length of the file, so fix the end marker. 
*/ + if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + return dst; +} + +/** + * ntfs_rl_split - insert a runlist into the centre of a hole + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst at which to split and insert @src + * + * Split the runlist @dst at @loc into two and insert @new in between the two + * fragments. No merging of runlists is necessary. Adjust the size of the + * holes either side. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize, + runlist_element *src, int ssize, int loc) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* Space required: @dst size + @src size + one new hole. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the holes either size of @src. */ + dst[loc].length = dst[loc+1].vcn - dst[loc].vcn; + dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length; + dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn; + + return dst; +} + +/** + * ntfs_runlists_merge - merge two runlists into one + * @drl: original runlist to be worked on + * @srl: new runlist to be merged into @drl + * + * First we sanity check the two runlists @srl and @drl to make sure that they + * are sensible and can be merged. The runlist @srl must be either after the + * runlist @drl or completely within a hole (or unmapped region) in @drl. + * + * It is up to the caller to serialize access to the runlists @drl and @srl. + * + * Merging of runlists is necessary in two cases: + * 1. When attribute lists are used and a further extent is being mapped. + * 2. When new clusters are allocated to fill a hole or extend a file. + * + * There are four possible ways @srl can be merged. It can: + * - be inserted at the beginning of a hole, + * - split the hole in two and be inserted between the two fragments, + * - be appended at the end of a hole, or it can + * - replace the whole hole. + * It can also be appended to the end of the runlist, which is just a variant + * of the insert case. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @drl and @srl are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. 
The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The runlists overlap and cannot be merged. + */ +runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl) +{ + int di, si; /* Current index into @[ds]rl. */ + int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */ + int dins; /* Index into @drl at which to insert @srl. */ + int dend, send; /* Last index into @[ds]rl. */ + int dfinal, sfinal; /* The last index into @[ds]rl with + lcn >= LCN_HOLE. */ + int marker = 0; + VCN marker_vcn = 0; + +#ifdef DEBUG + ntfs_debug("dst:"); + ntfs_debug_dump_runlist(drl); + ntfs_debug("src:"); + ntfs_debug_dump_runlist(srl); +#endif + + /* Check for silly calling... */ + if (unlikely(!srl)) + return drl; + if (IS_ERR(srl) || IS_ERR(drl)) + return ERR_PTR(-EINVAL); + + /* Check for the case where the first mapping is being done now. */ + if (unlikely(!drl)) { + drl = srl; + /* Complete the source runlist if necessary. */ + if (unlikely(drl[0].vcn)) { + /* Scan to the end of the source runlist. */ + for (dend = 0; likely(drl[dend].length); dend++) + ; + dend++; + drl = ntfs_rl_realloc(drl, dend, dend + 1); + if (IS_ERR(drl)) + return drl; + /* Insert start element at the front of the runlist. */ + ntfs_rl_mm(drl, 1, 0, dend); + drl[0].vcn = 0; + drl[0].lcn = LCN_RL_NOT_MAPPED; + drl[0].length = drl[1].vcn; + } + goto finished; + } + + si = di = 0; + + /* Skip any unmapped start element(s) in the source runlist. */ + while (srl[si].length && srl[si].lcn < LCN_HOLE) + si++; + + /* Can't have an entirely unmapped source runlist. */ + BUG_ON(!srl[si].length); + + /* Record the starting points. */ + sstart = si; + + /* + * Skip forward in @drl until we reach the position where @srl needs to + * be inserted. If we reach the end of @drl, @srl just needs to be + * appended to @drl. + */ + for (; drl[di].length; di++) { + if (drl[di].vcn + drl[di].length > srl[sstart].vcn) + break; + } + dins = di; + + /* Sanity check for illegal overlaps. */ + if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) && + (srl[si].lcn >= 0)) { + ntfs_error(NULL, "Run lists overlap. Cannot merge!"); + return ERR_PTR(-ERANGE); + } + + /* Scan to the end of both runlists in order to know their sizes. */ + for (send = si; srl[send].length; send++) + ; + for (dend = di; drl[dend].length; dend++) + ; + + if (srl[send].lcn == LCN_ENOENT) + marker_vcn = srl[marker = send].vcn; + + /* Scan to the last element with lcn >= LCN_HOLE. */ + for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--) + ; + for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--) + ; + + { + bool start; + bool finish; + int ds = dend + 1; /* Number of elements in drl & srl */ + int ss = sfinal - sstart + 1; + + start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */ + (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */ + finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */ + ((drl[dins].vcn + drl[dins].length) <= /* End of hole */ + (srl[send - 1].vcn + srl[send - 1].length))); + + /* Or we will lose an end marker. 
*/ + if (finish && !drl[dins].length) + ss++; + if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn)) + finish = false; +#if 0 + ntfs_debug("dfinal = %i, dend = %i", dfinal, dend); + ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send); + ntfs_debug("start = %i, finish = %i", start, finish); + ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins); +#endif + if (start) { + if (finish) + drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins); + } else { + if (finish) + drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins); + } + if (IS_ERR(drl)) { + ntfs_error(NULL, "Merge failed."); + return drl; + } + ntfs_free(srl); + if (marker) { + ntfs_debug("Triggering marker code."); + for (ds = dend; drl[ds].length; ds++) + ; + /* We only need to care if @srl ended after @drl. */ + if (drl[ds].vcn <= marker_vcn) { + int slots = 0; + + if (drl[ds].vcn == marker_vcn) { + ntfs_debug("Old marker = 0x%llx, replacing " + "with LCN_ENOENT.", + (unsigned long long) + drl[ds].lcn); + drl[ds].lcn = LCN_ENOENT; + goto finished; + } + /* + * We need to create an unmapped runlist element in + * @drl or extend an existing one before adding the + * ENOENT terminator. + */ + if (drl[ds].lcn == LCN_ENOENT) { + ds--; + slots = 1; + } + if (drl[ds].lcn != LCN_RL_NOT_MAPPED) { + /* Add an unmapped runlist element. */ + if (!slots) { + drl = ntfs_rl_realloc_nofail(drl, ds, + ds + 2); + slots = 2; + } + ds++; + /* Need to set vcn if it isn't set already. */ + if (slots != 1) + drl[ds].vcn = drl[ds - 1].vcn + + drl[ds - 1].length; + drl[ds].lcn = LCN_RL_NOT_MAPPED; + /* We now used up a slot. */ + slots--; + } + drl[ds].length = marker_vcn - drl[ds].vcn; + /* Finally add the ENOENT terminator. */ + ds++; + if (!slots) + drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1); + drl[ds].vcn = marker_vcn; + drl[ds].lcn = LCN_ENOENT; + drl[ds].length = (s64)0; + } + } + } + +finished: + /* The merge was completed successfully. */ + ntfs_debug("Merged runlist:"); + ntfs_debug_dump_runlist(drl); + return drl; +} + +/** + * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist + * @vol: ntfs volume on which the attribute resides + * @attr: attribute record whose mapping pairs array to decompress + * @old_rl: optional runlist in which to insert @attr's runlist + * + * It is up to the caller to serialize access to the runlist @old_rl. + * + * Decompress the attribute @attr's mapping pairs array into a runlist. On + * success, return the decompressed runlist. + * + * If @old_rl is not NULL, decompressed runlist is inserted into the + * appropriate place in @old_rl and the resultant, combined runlist is + * returned. The original @old_rl is deallocated. + * + * On error, return -errno. @old_rl is left unmodified in that case. + * + * The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EIO - Corrupt runlist. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The two runlists overlap. + * + * FIXME: For now we take the conceptionally simplest approach of creating the + * new runlist disregarding the already existing one and then splicing the + * two into one, if that is possible (we check for overlap and discard the new + * runlist if overlap present before returning ERR_PTR(-ERANGE)). 
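To make the mapping pairs format concrete before the decompression code below: each pair starts with a header byte whose low nibble is the byte count of the run length and whose high nibble is the byte count of the signed lcn delta (zero for a sparse run on NTFS 3.0+); a zero header byte terminates the array. A minimal userspace decoding sketch over a hand-built buffer (illustrative values only, and unlike ntfs_mapping_pairs_decompress() it omits all corruption checking):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/*
	 * Hand-built example: 0x20 clusters starting at lcn 0x120, then a
	 * 0x10 cluster sparse run, then the terminating zero header byte.
	 */
	uint8_t buf[] = { 0x21, 0x20, 0x20, 0x01, 0x01, 0x10, 0x00 };
	uint8_t *p = buf;
	int64_t vcn = 0, lcn = 0;

	while (*p) {
		int len_len = *p & 0xf;		/* bytes holding the run length */
		int lcn_len = (*p >> 4) & 0xf;	/* bytes holding the signed lcn delta */
		int64_t length = 0, delta;
		int i;

		/* Little endian; the driver reads this as a signed value, which
		   makes no difference for these sample values. */
		for (i = 0; i < len_len; i++)
			length |= (int64_t)p[1 + i] << (8 * i);
		if (!lcn_len) {
			printf("vcn 0x%llx: sparse, 0x%llx clusters\n",
					(unsigned long long)vcn,
					(unsigned long long)length);
		} else {
			/* Sign extend from the most significant delta byte. */
			delta = (int8_t)p[len_len + lcn_len];
			for (i = lcn_len - 1; i > 0; i--)
				delta = (delta << 8) | p[len_len + i];
			lcn += delta;
			printf("vcn 0x%llx: lcn 0x%llx, 0x%llx clusters\n",
					(unsigned long long)vcn,
					(unsigned long long)lcn,
					(unsigned long long)length);
		}
		vcn += length;
		p += 1 + len_len + lcn_len;
	}
	return 0;
}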
+ */ +runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl) +{ + VCN vcn; /* Current vcn. */ + LCN lcn; /* Current lcn. */ + s64 deltaxcn; /* Change in [vl]cn. */ + runlist_element *rl; /* The output runlist. */ + u8 *buf; /* Current position in mapping pairs array. */ + u8 *attr_end; /* End of attribute. */ + int rlsize; /* Size of runlist buffer. */ + u16 rlpos; /* Current runlist position in units of + runlist_elements. */ + u8 b; /* Current byte offset in buf. */ + +#ifdef DEBUG + /* Make sure attr exists and is non-resident. */ + if (!attr || !attr->non_resident || sle64_to_cpu( + attr->data.non_resident.lowest_vcn) < (VCN)0) { + ntfs_error(vol->sb, "Invalid arguments."); + return ERR_PTR(-EINVAL); + } +#endif + /* Start at vcn = lowest_vcn and lcn 0. */ + vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn); + lcn = 0; + /* Get start of the mapping pairs array. */ + buf = (u8*)attr + le16_to_cpu( + attr->data.non_resident.mapping_pairs_offset); + attr_end = (u8*)attr + le32_to_cpu(attr->length); + if (unlikely(buf < (u8*)attr || buf > attr_end)) { + ntfs_error(vol->sb, "Corrupt attribute."); + return ERR_PTR(-EIO); + } + /* If the mapping pairs array is valid but empty, nothing to do. */ + if (!vcn && !*buf) + return old_rl; + /* Current position in runlist array. */ + rlpos = 0; + /* Allocate first page and set current runlist size to one page. */ + rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE); + if (unlikely(!rl)) + return ERR_PTR(-ENOMEM); + /* Insert unmapped starting element if necessary. */ + if (vcn) { + rl->vcn = 0; + rl->lcn = LCN_RL_NOT_MAPPED; + rl->length = vcn; + rlpos++; + } + while (buf < attr_end && *buf) { + /* + * Allocate more memory if needed, including space for the + * not-mapped and terminator elements. ntfs_malloc_nofs() + * operates on whole pages only. + */ + if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) { + runlist_element *rl2; + + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + ntfs_free(rl); + return ERR_PTR(-ENOMEM); + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + } + /* Enter the current vcn into the current runlist element. */ + rl[rlpos].vcn = vcn; + /* + * Get the change in vcn, i.e. the run length in clusters. + * Doing it this way ensures that we signextend negative values. + * A negative run length doesn't make any sense, but hey, I + * didn't make up the NTFS specs and Windows NT4 treats the run + * length as a signed value so that's how it is... + */ + b = *buf & 0xf; + if (b) { + if (unlikely(buf + b > attr_end)) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + } else { /* The length entry is compulsory. */ + ntfs_error(vol->sb, "Missing length entry in mapping " + "pairs array."); + deltaxcn = (s64)-1; + } + /* + * Assume a negative length to indicate data corruption and + * hence clean-up and return NULL. + */ + if (unlikely(deltaxcn < 0)) { + ntfs_error(vol->sb, "Invalid length in mapping pairs " + "array."); + goto err_out; + } + /* + * Enter the current run length into the current runlist + * element. + */ + rl[rlpos].length = deltaxcn; + /* Increment the current vcn by the current run length. */ + vcn += deltaxcn; + /* + * There might be no lcn change at all, as is the case for + * sparse clusters on NTFS 3.0+, in which case we set the lcn + * to LCN_HOLE. 
+ */ + if (!(*buf & 0xf0)) + rl[rlpos].lcn = LCN_HOLE; + else { + /* Get the lcn change which really can be negative. */ + u8 b2 = *buf & 0xf; + b = b2 + ((*buf >> 4) & 0xf); + if (buf + b > attr_end) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b > b2; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + /* Change the current lcn to its new value. */ + lcn += deltaxcn; +#ifdef DEBUG + /* + * On NTFS 1.2-, apparently can have lcn == -1 to + * indicate a hole. But we haven't verified ourselves + * whether it is really the lcn or the deltaxcn that is + * -1. So if either is found give us a message so we + * can investigate it further! + */ + if (vol->major_ver < 3) { + if (unlikely(deltaxcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn delta == -1"); + if (unlikely(lcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn == -1"); + } +#endif + /* Check lcn is not below -1. */ + if (unlikely(lcn < (LCN)-1)) { + ntfs_error(vol->sb, "Invalid LCN < -1 in " + "mapping pairs array."); + goto err_out; + } + /* Enter the current lcn into the runlist element. */ + rl[rlpos].lcn = lcn; + } + /* Get to the next runlist element. */ + rlpos++; + /* Increment the buffer position to the next mapping pair. */ + buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1; + } + if (unlikely(buf >= attr_end)) + goto io_error; + /* + * If there is a highest_vcn specified, it must be equal to the final + * vcn in the runlist - 1, or something has gone badly wrong. + */ + deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); + if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) { +mpa_err: + ntfs_error(vol->sb, "Corrupt mapping pairs array in " + "non-resident attribute."); + goto err_out; + } + /* Setup not mapped runlist element if this is the base extent. */ + if (!attr->data.non_resident.lowest_vcn) { + VCN max_cluster; + + max_cluster = ((sle64_to_cpu( + attr->data.non_resident.allocated_size) + + vol->cluster_size - 1) >> + vol->cluster_size_bits) - 1; + /* + * A highest_vcn of zero means this is a single extent + * attribute so simply terminate the runlist with LCN_ENOENT). + */ + if (deltaxcn) { + /* + * If there is a difference between the highest_vcn and + * the highest cluster, the runlist is either corrupt + * or, more likely, there are more extents following + * this one. + */ + if (deltaxcn < max_cluster) { + ntfs_debug("More extents to follow; deltaxcn " + "= 0x%llx, max_cluster = " + "0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + rl[rlpos].vcn = vcn; + vcn += rl[rlpos].length = max_cluster - + deltaxcn; + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + rlpos++; + } else if (unlikely(deltaxcn > max_cluster)) { + ntfs_error(vol->sb, "Corrupt attribute. " + "deltaxcn = 0x%llx, " + "max_cluster = 0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + goto mpa_err; + } + } + rl[rlpos].lcn = LCN_ENOENT; + } else /* Not the base extent. There may be more extents to follow. */ + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + + /* Setup terminating runlist element. */ + rl[rlpos].vcn = vcn; + rl[rlpos].length = (s64)0; + /* If no existing runlist was specified, we are done. */ + if (!old_rl) { + ntfs_debug("Mapping pairs array successfully decompressed:"); + ntfs_debug_dump_runlist(rl); + return rl; + } + /* Now combine the new and old runlists checking for overlaps. 
*/ + old_rl = ntfs_runlists_merge(old_rl, rl); + if (likely(!IS_ERR(old_rl))) + return old_rl; + ntfs_free(rl); + ntfs_error(vol->sb, "Failed to merge runlists."); + return old_rl; +io_error: + ntfs_error(vol->sb, "Corrupt attribute."); +err_out: + ntfs_free(rl); + return ERR_PTR(-EIO); +} + +/** + * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist + * @rl: runlist to use for conversion + * @vcn: vcn to convert + * + * Convert the virtual cluster number @vcn of an attribute into a logical + * cluster number (lcn) of a device using the runlist @rl to map vcns to their + * corresponding lcns. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ================================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_RL_NOT_MAPPED This is part of the runlist which has not been + * inserted into the runlist yet. + * LCN_ENOENT There is no such vcn in the attribute. + * + * Locking: - The caller must have locked the runlist (for reading or writing). + * - This function does not touch the lock, nor does it modify the + * runlist. + */ +LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) +{ + int i; + + BUG_ON(vcn < 0); + /* + * If rl is NULL, assume that we have found an unmapped runlist. The + * caller can then attempt to map it and fail appropriately if + * necessary. + */ + if (unlikely(!rl)) + return LCN_RL_NOT_MAPPED; + + /* Catch out of lower bounds vcn. */ + if (unlikely(vcn < rl[0].vcn)) + return LCN_ENOENT; + + for (i = 0; likely(rl[i].length); i++) { + if (unlikely(vcn < rl[i+1].vcn)) { + if (likely(rl[i].lcn >= (LCN)0)) + return rl[i].lcn + (vcn - rl[i].vcn); + return rl[i].lcn; + } + } + /* + * The terminator element is setup to the correct value, i.e. one of + * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT. + */ + if (likely(rl[i].lcn < (LCN)0)) + return rl[i].lcn; + /* Just in case... We could replace this with BUG() some day. */ + return LCN_ENOENT; +} + +#ifdef NTFS_RW + +/** + * ntfs_rl_find_vcn_nolock - find a vcn in a runlist + * @rl: runlist to search + * @vcn: vcn to find + * + * Find the virtual cluster number @vcn in the runlist @rl and return the + * address of the runlist element containing the @vcn on success. + * + * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of + * the runlist. + * + * Locking: The runlist must be locked on entry. + */ +runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn) +{ + BUG_ON(vcn < 0); + if (unlikely(!rl || vcn < rl[0].vcn)) + return NULL; + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) + return rl; + return NULL; + } + rl++; + } + if (likely(rl->lcn == LCN_ENOENT)) + return rl; + return NULL; +} + +/** + * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number + * @n: number for which to get the number of bytes for + * + * Return the number of bytes required to store @n unambiguously as + * a signed number. + * + * This is used in the context of the mapping pairs array to determine how + * many bytes will be needed in the array to store a given logical cluster + * number (lcn) or a specific run length. + * + * Return the number of bytes written. This function cannot fail. 
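A quick numeric illustration of the byte-count rule documented above, as a userspace restatement of the logic in the helper that follows (the sample values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Same logic as ntfs_get_nr_significant_bytes(), restated for userspace. */
static int nr_significant_bytes(int64_t n)
{
	int64_t l = n;
	int i = 0;
	int8_t j;

	do {
		l >>= 8;
		i++;
	} while (l != 0 && l != -1);
	j = (n >> 8 * (i - 1)) & 0xff;
	/* If the top stored byte has the wrong sign, an extra byte is needed. */
	if ((n < 0 && j >= 0) || (n > 0 && j < 0))
		i++;
	return i;
}

int main(void)
{
	/* 0x50 fits in one signed byte; 0x80 does not (it would read as -128). */
	printf("%d %d %d %d\n",
			nr_significant_bytes(0x50),	/* 1 */
			nr_significant_bytes(0x80),	/* 2 */
			nr_significant_bytes(-1),	/* 1 */
			nr_significant_bytes(-0x100));	/* 2 */
	return 0;
}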
+ */ +static inline int ntfs_get_nr_significant_bytes(const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. */ + if ((n < 0 && j >= 0) || (n > 0 && j < 0)) + i++; + return i; +} + +/** + * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array + * @vol: ntfs volume (needed for the ntfs version) + * @rl: locked runlist to determine the size of the mapping pairs of + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * + * Walk the locked runlist @rl and calculate the size in bytes of the mapping + * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and + * finishing with vcn @last_vcn. + * + * A @last_vcn of -1 means end of runlist and in that case the size of the + * mapping pairs array corresponding to the runlist starting at vcn @first_vcn + * and finishing at the end of the runlist is determined. + * + * This for example allows us to allocate a buffer of the right size when + * building the mapping pairs array. + * + * If @rl is NULL, just return 1 (for the single terminator byte). + * + * Return the calculated size in bytes on success. On error, return -errno. + * The following error codes are defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn) +{ + LCN prev_lcn; + int rls; + bool the_end = false; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + return 1; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + prev_lcn = 0; + /* Always need the termining zero byte. */ + rls = 1; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. */ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length - delta); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(prev_lcn); + } + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. 
*/ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(rl->lcn - + prev_lcn); + prev_lcn = rl->lcn; + } + } + return rls; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + rls = -EINVAL; + else + rls = -EIO; + return rls; +} + +/** + * ntfs_write_significant_bytes - write the significant bytes of a number + * @dst: destination buffer to write to + * @dst_max: pointer to last byte of destination buffer for bounds checking + * @n: number whose significant bytes to write + * + * Store in @dst, the minimum bytes of the number @n which are required to + * identify @n unambiguously as a signed number, taking care not to exceed + * @dest_max, the maximum position within @dst to which we are allowed to + * write. + * + * This is used when building the mapping pairs array of a runlist to compress + * a given logical cluster number (lcn) or a specific run length to the minimum + * size possible. + * + * Return the number of bytes written on success. On error, i.e. the + * destination buffer @dst is too small, return -ENOSPC. + */ +static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, + const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + if (unlikely(dst > dst_max)) + goto err_out; + *dst++ = l & 0xffll; + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. */ + if (n < 0 && j >= 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)-1; + } else if (n > 0 && j < 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)0; + } + return i; +err_out: + return -ENOSPC; +} + +/** + * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist + * @vol: ntfs volume (needed for the ntfs version) + * @dst: destination buffer to which to write the mapping pairs array + * @dst_len: size of destination buffer @dst in bytes + * @rl: locked runlist for which to build the mapping pairs array + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC + * + * Create the mapping pairs array from the locked runlist @rl, starting at vcn + * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. + * @dst_len is the size of @dst in bytes and it should be at least equal to the + * value obtained by calling ntfs_get_size_for_mapping_pairs(). + * + * A @last_vcn of -1 means end of runlist and in that case the mapping pairs + * array corresponding to the runlist starting at vcn @first_vcn and finishing + * at the end of the runlist is created. 
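+ *
+ * Each mapping pair is stored as a header byte whose low nibble is the
+ * number of length bytes and whose high nibble is the number of lcn change
+ * bytes, followed by those bytes as sign-extended, little-endian values
+ * (see ntfs_write_significant_bytes() above).  As an illustrative example,
+ * a fully mapped runlist consisting of the single run { vcn 0, lcn 0x30,
+ * length 0x20 } is encoded as the bytes 0x11 0x20 0x30 followed by the
+ * terminator byte 0x00.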
+ * + * If @rl is NULL, just write a single terminator byte to @dst. + * + * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to + * the first vcn outside the destination buffer. Note that on error, @dst has + * been filled with all the mapping pairs that will fit, thus it can be treated + * as partial success, in that a new attribute extent needs to be created or + * the next extent has to be used and the mapping pairs build has to be + * continued with @first_vcn set to *@stop_vcn. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * -ENOSPC - The destination buffer is too small. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) +{ + LCN prev_lcn; + s8 *dst_max, *dst_next; + int err = -ENOSPC; + bool the_end = false; + s8 len_len, lcn_len; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + BUG_ON(dst_len < 1); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + if (stop_vcn) + *stop_vcn = 0; + /* Terminator byte. */ + *dst = 0; + return 0; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + /* + * @dst_max is used for bounds checking in + * ntfs_write_significant_bytes(). + */ + dst_max = dst + dst_len - 1; + prev_lcn = 0; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. */ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length - delta); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. 
*/ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, rl->lcn - prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + prev_lcn = rl->lcn; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + } + /* Success. */ + err = 0; +size_err: + /* Set stop vcn. */ + if (stop_vcn) + *stop_vcn = rl->vcn; + /* Add terminator byte. */ + *dst = 0; + return err; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + err = -EINVAL; + else + err = -EIO; + return err; +} + +/** + * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to truncate + * @new_length: the new length of the runlist in VCNs + * + * Truncate the runlist described by @runlist as well as the memory buffer + * holding the runlist elements to a length of @new_length VCNs. + * + * If @new_length lies within the runlist, the runlist elements with VCNs of + * @new_length and above are discarded. As a special case if @new_length is + * zero, the runlist is discarded and set to NULL. + * + * If @new_length lies beyond the runlist, a sparse runlist element is added to + * the end of the runlist @runlist or if the last runlist element is a sparse + * one already, this is extended. + * + * Note, no checking is done for unmapped runlist elements. It is assumed that + * the caller has mapped any elements that need to be mapped already. + * + * Return 0 on success and -errno on error. + * + * Locking: The caller must hold @runlist->lock for writing. + */ +int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, + const s64 new_length) +{ + runlist_element *rl; + int old_size; + + ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length); + BUG_ON(!runlist); + BUG_ON(new_length < 0); + rl = runlist->rl; + if (!new_length) { + ntfs_debug("Freeing runlist."); + runlist->rl = NULL; + if (rl) + ntfs_free(rl); + return 0; + } + if (unlikely(!rl)) { + /* + * Create a runlist consisting of a sparse runlist element of + * length @new_length followed by a terminator runlist element. 
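+ *
+ * The layout set up below is:
+ *   rl[0] = { .vcn = 0,          .lcn = LCN_HOLE,   .length = new_length }
+ *   rl[1] = { .vcn = new_length, .lcn = LCN_ENOENT, .length = 0 }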
+ */ + rl = ntfs_malloc_nofs(PAGE_SIZE); + if (unlikely(!rl)) { + ntfs_error(vol->sb, "Not enough memory to allocate " + "runlist element buffer."); + return -ENOMEM; + } + runlist->rl = rl; + rl[1].length = rl->vcn = 0; + rl->lcn = LCN_HOLE; + rl[1].vcn = rl->length = new_length; + rl[1].lcn = LCN_ENOENT; + return 0; + } + BUG_ON(new_length < rl->vcn); + /* Find @new_length in the runlist. */ + while (likely(rl->length && new_length >= rl[1].vcn)) + rl++; + /* + * If not at the end of the runlist we need to shrink it. + * If at the end of the runlist we need to expand it. + */ + if (rl->length) { + runlist_element *trl; + bool is_end; + + ntfs_debug("Shrinking runlist."); + /* Determine the runlist size. */ + trl = rl + 1; + while (likely(trl->length)) + trl++; + old_size = trl - runlist->rl + 1; + /* Truncate the run. */ + rl->length = new_length - rl->vcn; + /* + * If a run was partially truncated, make the following runlist + * element a terminator. + */ + is_end = false; + if (rl->length) { + rl++; + if (!rl->length) + is_end = true; + rl->vcn = new_length; + rl->length = 0; + } + rl->lcn = LCN_ENOENT; + /* Reallocate memory if necessary. */ + if (!is_end) { + int new_size = rl - runlist->rl + 1; + rl = ntfs_rl_realloc(runlist->rl, old_size, new_size); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + } else if (likely(/* !rl->length && */ new_length > rl->vcn)) { + ntfs_debug("Expanding runlist."); + /* + * If there is a previous runlist element and it is a sparse + * one, extend it. Otherwise need to add a new, sparse runlist + * element. + */ + if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE)) + (rl - 1)->length = new_length - (rl - 1)->vcn; + else { + /* Determine the runlist size. */ + old_size = rl - runlist->rl + 1; + /* Reallocate memory if necessary. */ + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size + 1); + if (IS_ERR(rl)) { + ntfs_error(vol->sb, "Failed to expand runlist " + "buffer, aborting."); + return PTR_ERR(rl); + } + runlist->rl = rl; + /* + * Set @rl to the same runlist element in the new + * runlist as before in the old runlist. + */ + rl += old_size - 1; + /* Add a new, sparse runlist element. */ + rl->lcn = LCN_HOLE; + rl->length = new_length - rl->vcn; + /* Add a new terminator runlist element. */ + rl++; + rl->length = 0; + } + rl->vcn = new_length; + rl->lcn = LCN_ENOENT; + } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ { + /* Runlist already has same size as requested. */ + rl->lcn = LCN_ENOENT; + } + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_rl_punch_nolock - punch a hole into a runlist + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to punch a hole into + * @start: starting VCN of the hole to be created + * @length: size of the hole to be created in units of clusters + * + * Punch a hole into the runlist @runlist starting at VCN @start and of size + * @length clusters. + * + * Return 0 on success and -errno on error, in which case @runlist has not been + * modified. + * + * If @start and/or @start + @length are outside the runlist return error code + * -ENOENT. + * + * If the runlist contains unmapped or error elements between @start and @start + * + @length return error code -EINVAL. + * + * Locking: The caller must hold @runlist->lock for writing. 
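+ *
+ * As an illustrative example, punching a hole with @start 3 and @length 4
+ * into a fully mapped runlist holding the single run { vcn 0, lcn 500,
+ * length 10 } splits it into the three runs { vcn 0, lcn 500, length 3 },
+ * { vcn 3, lcn LCN_HOLE, length 4 } and { vcn 7, lcn 507, length 3 },
+ * followed by the LCN_ENOENT terminator.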
+ */ +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length) +{ + const VCN end = start + length; + s64 delta; + runlist_element *rl, *rl_end, *rl_real_end, *trl; + int old_size; + bool lcn_fixup = false; + + ntfs_debug("Entering for start 0x%llx, length 0x%llx.", + (long long)start, (long long)length); + BUG_ON(!runlist); + BUG_ON(start < 0); + BUG_ON(length < 0); + BUG_ON(end < 0); + rl = runlist->rl; + if (unlikely(!rl)) { + if (likely(!start && !length)) + return 0; + return -ENOENT; + } + /* Find @start in the runlist. */ + while (likely(rl->length && start >= rl[1].vcn)) + rl++; + rl_end = rl; + /* Find @end in the runlist. */ + while (likely(rl_end->length && end >= rl_end[1].vcn)) { + /* Verify there are no unmapped or error elements. */ + if (unlikely(rl_end->lcn < LCN_HOLE)) + return -EINVAL; + rl_end++; + } + /* Check the last element. */ + if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE)) + return -EINVAL; + /* This covers @start being out of bounds, too. */ + if (!rl_end->length && end > rl_end->vcn) + return -ENOENT; + if (!length) + return 0; + if (!rl->length) + return -ENOENT; + rl_real_end = rl_end; + /* Determine the runlist size. */ + while (likely(rl_real_end->length)) + rl_real_end++; + old_size = rl_real_end - runlist->rl + 1; + /* If @start is in a hole simply extend the hole. */ + if (rl->lcn == LCN_HOLE) { + /* + * If both @start and @end are in the same sparse run, we are + * done. + */ + if (end <= rl[1].vcn) { + ntfs_debug("Done (requested hole is already sparse)."); + return 0; + } +extend_hole: + /* Extend the hole. */ + rl->length = end - rl->vcn; + /* If @end is in a hole, merge it with the current one. */ + if (rl_end->lcn == LCN_HOLE) { + rl_end++; + rl->length = rl_end->vcn - rl->vcn; + } + /* We have done the hole. Now deal with the remaining tail. */ + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Adjust the beginning of the tail if necessary. */ + if (end > rl->vcn) { + delta = end - rl->vcn; + rl->vcn = end; + rl->length -= delta; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0) + rl->lcn += delta; + } +shrink_allocation: + /* Reallocate memory if the allocation changed. */ + if (rl < rl_end) { + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size - (rl_end - rl)); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + ntfs_debug("Done (extend hole)."); + return 0; + } + /* + * If @start is at the beginning of a run things are easier as there is + * no need to split the first run. + */ + if (start == rl->vcn) { + /* + * @start is at the beginning of a run. + * + * If the previous run is sparse, extend its hole. + * + * If @end is not in the same run, switch the run to be sparse + * and extend the newly created hole. + * + * Thus both of these cases reduce the problem to the above + * case of "@start is in a hole". + */ + if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) { + rl--; + goto extend_hole; + } + if (end >= rl[1].vcn) { + rl->lcn = LCN_HOLE; + goto extend_hole; + } + /* + * The final case is when @end is in the same run as @start. + * For this need to split the run into two. One run for the + * sparse region between the beginning of the old run, i.e. 
+ * @start, and @end and one for the remaining non-sparse + * region, i.e. between @end and the end of the old run. + */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } +split_end: + /* Shift all the runs up by one. */ + memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the two split runs. */ + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + rl->vcn += length; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0 || lcn_fixup) + rl->lcn += length; + rl->length -= length; + ntfs_debug("Done (split one)."); + return 0; + } + /* + * @start is neither in a hole nor at the beginning of a run. + * + * If @end is in a hole, things are easier as simply truncating the run + * @start is in to end at @start - 1, deleting all runs after that up + * to @end, and finally extending the beginning of the run @end is in + * to be @start is all that is needed. + */ + if (rl_end->lcn == LCN_HOLE) { + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Extend the beginning of the run @end is in to be @start. */ + rl->vcn = start; + rl->length = rl[1].vcn - start; + goto shrink_allocation; + } + /* + * If @end is not in a hole there are still two cases to distinguish. + * Either @end is or is not in the same run as @start. + * + * The second case is easier as it can be reduced to an already solved + * problem by truncating the run @start is in to end at @start - 1. + * Then, if @end is in the next run need to split the run into a sparse + * run followed by a non-sparse run (already covered above) and if @end + * is not in the next run switching it to be sparse, again reduces the + * problem to the already covered case of "@start is in a hole". + */ + if (end >= rl[1].vcn) { + /* + * If @end is not in the next run, reduce the problem to the + * case of "@start is in a hole". + */ + if (rl[1].length && end >= rl[2].vcn) { + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + goto extend_hole; + } + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* + * @end is in the next run, reduce the problem to the case + * where "@start is at the beginning of a run and @end is in + * the same run as @start". + */ + delta = rl->vcn - start; + rl->vcn = start; + if (rl->lcn >= 0) { + rl->lcn -= delta; + /* Need this in case the lcn just became negative. */ + lcn_fixup = true; + } + rl->length += delta; + goto split_end; + } + /* + * The first case from above, i.e. @end is in the same run as @start. + * We need to split the run into three. One run for the non-sparse + * region between the beginning of the old run and @start, one for the + * sparse region between @start and @end, and one for the remaining + * non-sparse region, i.e. between @end and the end of the old run. 
+ */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2); + if (IS_ERR(trl)) + goto enomem_out; + old_size += 2; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Shift all the runs up by two. */ + memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the three split runs. */ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + delta = end - rl->vcn; + rl->vcn = end; + rl->lcn += delta; + rl->length -= delta; + ntfs_debug("Done (split both)."); + return 0; +enomem_out: + ntfs_error(vol->sb, "Not enough memory to extend runlist buffer."); + return -ENOMEM; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h new file mode 100644 index 00000000..47728fbb --- /dev/null +++ b/fs/ntfs/runlist.h @@ -0,0 +1,102 @@ +/* + * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_RUNLIST_H +#define _LINUX_NTFS_RUNLIST_H + +#include "types.h" +#include "layout.h" +#include "volume.h" + +/** + * runlist_element - in memory vcn to lcn mapping array element + * @vcn: starting vcn of the current array element + * @lcn: starting lcn of the current array element + * @length: length in clusters of the current array element + * + * The last vcn (in fact the last vcn + 1) is reached when length == 0. + * + * When lcn == -1 this means that the count vcns starting at vcn are not + * physically allocated (i.e. this is a hole / data is sparse). + */ +typedef struct { /* In memory vcn to lcn mapping structure element. */ + VCN vcn; /* vcn = Starting virtual cluster number. */ + LCN lcn; /* lcn = Starting logical cluster number. */ + s64 length; /* Run length in clusters. */ +} runlist_element; + +/** + * runlist - in memory vcn to lcn mapping array including a read/write lock + * @rl: pointer to an array of runlist elements + * @lock: read/write spinlock for serializing access to @rl + * + */ +typedef struct { + runlist_element *rl; + struct rw_semaphore lock; +} runlist; + +static inline void ntfs_init_runlist(runlist *rl) +{ + rl->rl = NULL; + init_rwsem(&rl->lock); +} + +typedef enum { + LCN_HOLE = -1, /* Keep this as highest value or die! 
*/ + LCN_RL_NOT_MAPPED = -2, + LCN_ENOENT = -3, + LCN_ENOMEM = -4, + LCN_EIO = -5, +} LCN_SPECIAL_VALUES; + +extern runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl); + +extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl); + +extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); + +#ifdef NTFS_RW + +extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, + const VCN vcn); + +extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn); + +extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); + +extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, + runlist *const runlist, const s64 new_length); + +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c new file mode 100644 index 00000000..b52706da --- /dev/null +++ b/fs/ntfs/super.c @@ -0,0 +1,3206 @@ +/* + * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001,2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include /* For bdev_logical_block_size(). */ +#include +#include +#include +#include +#include + +#include "sysctl.h" +#include "logfile.h" +#include "quota.h" +#include "usnjrnl.h" +#include "dir.h" +#include "debug.h" +#include "index.h" +#include "inode.h" +#include "aops.h" +#include "layout.h" +#include "malloc.h" +#include "ntfs.h" + +/* Number of mounted filesystems which have compression enabled. */ +static unsigned long ntfs_nr_compression_users; + +/* A global default upcase table and a corresponding reference count. */ +static ntfschar *default_upcase = NULL; +static unsigned long ntfs_nr_upcase_users = 0; + +/* Error constants/strings used in inode.c::ntfs_show_options(). */ +typedef enum { + /* One of these must be present, default is ON_ERRORS_CONTINUE. */ + ON_ERRORS_PANIC = 0x01, + ON_ERRORS_REMOUNT_RO = 0x02, + ON_ERRORS_CONTINUE = 0x04, + /* Optional, can be combined with any of the above. 
*/ + ON_ERRORS_RECOVER = 0x10, +} ON_ERRORS_ACTIONS; + +const option_t on_errors_arr[] = { + { ON_ERRORS_PANIC, "panic" }, + { ON_ERRORS_REMOUNT_RO, "remount-ro", }, + { ON_ERRORS_CONTINUE, "continue", }, + { ON_ERRORS_RECOVER, "recover" }, + { 0, NULL } +}; + +/** + * simple_getbool - + * + * Copied from old ntfs driver (which copied from vfat driver). + */ +static int simple_getbool(char *s, bool *setval) +{ + if (s) { + if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true")) + *setval = true; + else if (!strcmp(s, "0") || !strcmp(s, "no") || + !strcmp(s, "false")) + *setval = false; + else + return 0; + } else + *setval = true; + return 1; +} + +/** + * parse_options - parse the (re)mount options + * @vol: ntfs volume + * @opt: string containing the (re)mount options + * + * Parse the recognized options in @opt for the ntfs volume described by @vol. + */ +static bool parse_options(ntfs_volume *vol, char *opt) +{ + char *p, *v, *ov; + static char *utf8 = "utf8"; + int errors = 0, sloppy = 0; + uid_t uid = (uid_t)-1; + gid_t gid = (gid_t)-1; + mode_t fmask = (mode_t)-1, dmask = (mode_t)-1; + int mft_zone_multiplier = -1, on_errors = -1; + int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; + struct nls_table *nls_map = NULL, *old_nls; + + /* I am lazy... (-8 */ +#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + variable = default_value; \ + else { \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } \ + } +#define NTFS_GETOPT(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_OCTAL(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 8); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_BOOL(option, variable) \ + if (!strcmp(p, option)) { \ + bool val; \ + if (!simple_getbool(v, &val)) \ + goto needs_bool; \ + variable = val; \ + } +#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \ + if (!strcmp(p, option)) { \ + int _i; \ + if (!v || !*v) \ + goto needs_arg; \ + ov = v; \ + if (variable == -1) \ + variable = 0; \ + for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \ + if (!strcmp(opt_array[_i].str, v)) { \ + variable |= opt_array[_i].val; \ + break; \ + } \ + if (!opt_array[_i].str || !*opt_array[_i].str) \ + goto needs_val; \ + } + if (!opt || !*opt) + goto no_mount_options; + ntfs_debug("Entering with mount options string: %s", opt); + while ((p = strsep(&opt, ","))) { + if ((v = strchr(p, '='))) + *v++ = 0; + NTFS_GETOPT("uid", uid) + else NTFS_GETOPT("gid", gid) + else NTFS_GETOPT_OCTAL("umask", fmask = dmask) + else NTFS_GETOPT_OCTAL("fmask", fmask) + else NTFS_GETOPT_OCTAL("dmask", dmask) + else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier) + else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true) + else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) + else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) + else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) + else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, + on_errors_arr) + else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) + ntfs_warning(vol->sb, "Ignoring obsolete option %s.", + p); + else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) { + if (!strcmp(p, "iocharset")) + ntfs_warning(vol->sb, "Option iocharset is " + "deprecated. 
Please use " + "option nls= in " + "the future."); + if (!v || !*v) + goto needs_arg; +use_utf8: + old_nls = nls_map; + nls_map = load_nls(v); + if (!nls_map) { + if (!old_nls) { + ntfs_error(vol->sb, "NLS character set " + "%s not found.", v); + return false; + } + ntfs_error(vol->sb, "NLS character set %s not " + "found. Using previous one %s.", + v, old_nls->charset); + nls_map = old_nls; + } else /* nls_map */ { + unload_nls(old_nls); + } + } else if (!strcmp(p, "utf8")) { + bool val = false; + ntfs_warning(vol->sb, "Option utf8 is no longer " + "supported, using option nls=utf8. Please " + "use option nls=utf8 in the future and " + "make sure utf8 is compiled either as a " + "module or into the kernel."); + if (!v || !*v) + val = true; + else if (!simple_getbool(v, &val)) + goto needs_bool; + if (val) { + v = utf8; + goto use_utf8; + } + } else { + ntfs_error(vol->sb, "Unrecognized mount option %s.", p); + if (errors < INT_MAX) + errors++; + } +#undef NTFS_GETOPT_OPTIONS_ARRAY +#undef NTFS_GETOPT_BOOL +#undef NTFS_GETOPT +#undef NTFS_GETOPT_WITH_DEFAULT + } +no_mount_options: + if (errors && !sloppy) + return false; + if (sloppy) + ntfs_warning(vol->sb, "Sloppy option given. Ignoring " + "unrecognized mount option(s) and continuing."); + /* Keep this first! */ + if (on_errors != -1) { + if (!on_errors) { + ntfs_error(vol->sb, "Invalid errors option argument " + "or bug in options parser."); + return false; + } + } + if (nls_map) { + if (vol->nls_map && vol->nls_map != nls_map) { + ntfs_error(vol->sb, "Cannot change NLS character set " + "on remount."); + return false; + } /* else (!vol->nls_map) */ + ntfs_debug("Using NLS character set %s.", nls_map->charset); + vol->nls_map = nls_map; + } else /* (!nls_map) */ { + if (!vol->nls_map) { + vol->nls_map = load_nls_default(); + if (!vol->nls_map) { + ntfs_error(vol->sb, "Failed to load default " + "NLS character set."); + return false; + } + ntfs_debug("Using default NLS character set (%s).", + vol->nls_map->charset); + } + } + if (mft_zone_multiplier != -1) { + if (vol->mft_zone_multiplier && vol->mft_zone_multiplier != + mft_zone_multiplier) { + ntfs_error(vol->sb, "Cannot change mft_zone_multiplier " + "on remount."); + return false; + } + if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) { + ntfs_error(vol->sb, "Invalid mft_zone_multiplier. " + "Using default value, i.e. 
1."); + mft_zone_multiplier = 1; + } + vol->mft_zone_multiplier = mft_zone_multiplier; + } + if (!vol->mft_zone_multiplier) + vol->mft_zone_multiplier = 1; + if (on_errors != -1) + vol->on_errors = on_errors; + if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) + vol->on_errors |= ON_ERRORS_CONTINUE; + if (uid != (uid_t)-1) + vol->uid = uid; + if (gid != (gid_t)-1) + vol->gid = gid; + if (fmask != (mode_t)-1) + vol->fmask = fmask; + if (dmask != (mode_t)-1) + vol->dmask = dmask; + if (show_sys_files != -1) { + if (show_sys_files) + NVolSetShowSystemFiles(vol); + else + NVolClearShowSystemFiles(vol); + } + if (case_sensitive != -1) { + if (case_sensitive) + NVolSetCaseSensitive(vol); + else + NVolClearCaseSensitive(vol); + } + if (disable_sparse != -1) { + if (disable_sparse) + NVolClearSparseEnabled(vol); + else { + if (!NVolSparseEnabled(vol) && + vol->major_ver && vol->major_ver < 3) + ntfs_warning(vol->sb, "Not enabling sparse " + "support due to NTFS volume " + "version %i.%i (need at least " + "version 3.0).", vol->major_ver, + vol->minor_ver); + else + NVolSetSparseEnabled(vol); + } + } + return true; +needs_arg: + ntfs_error(vol->sb, "The %s option requires an argument.", p); + return false; +needs_bool: + ntfs_error(vol->sb, "The %s option requires a boolean argument.", p); + return false; +needs_val: + ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov); + return false; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_volume_flags - write new flags to the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: new flags value for the volume information flags + * + * Internal function. You probably want to use ntfs_{set,clear}_volume_flags() + * instead (see below). + * + * Replace the volume information flags on the volume @vol with the value + * supplied in @flags. Note, this overwrites the volume information flags, so + * make sure to combine the flags you want to modify with the old flags and use + * the result when calling ntfs_write_volume_flags(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) +{ + ntfs_inode *ni = NTFS_I(vol->vol_ino); + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; + int err; + + ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", + le16_to_cpu(vol->vol_flags), le16_to_cpu(flags)); + if (vol->vol_flags == flags) + goto done; + BUG_ON(!ni); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto put_unm_err_out; + } + err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (err) + goto put_unm_err_out; + vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + vol->vol_flags = vi->flags = flags; + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +done: + ntfs_debug("Done."); + return 0; +put_unm_err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i.", -err); + return err; +} + +/** + * ntfs_set_volume_flags - set bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to set on the volume + * + * Set the bits in @flags in the volume information flags on the volume @vol. 
+ * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + return ntfs_write_volume_flags(vol, vol->vol_flags | flags); +} + +/** + * ntfs_clear_volume_flags - clear bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to clear on the volume + * + * Clear the bits in @flags in the volume information flags on the volume @vol. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags)); + return ntfs_write_volume_flags(vol, flags); +} + +#endif /* NTFS_RW */ + +/** + * ntfs_remount - change the mount options of a mounted ntfs filesystem + * @sb: superblock of mounted ntfs filesystem + * @flags: remount flags + * @opt: remount options string + * + * Change the mount options of an already mounted ntfs filesystem. + * + * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after + * ntfs_remount() returns successfully (i.e. returns 0). Otherwise, + * @sb->s_flags are not changed. + */ +static int ntfs_remount(struct super_block *sb, int *flags, char *opt) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering with remount options string: %s", opt); + +#ifndef NTFS_RW + /* For read-only compiled driver, enforce read-only flag. */ + *flags |= MS_RDONLY; +#else /* NTFS_RW */ + /* + * For the read-write compiled driver, if we are remounting read-write, + * make sure there are no volume errors and that no unsupported volume + * flags are set. Also, empty the logfile journal as it would become + * stale as soon as something is written to the volume and mark the + * volume dirty so that chkdsk is run if the volume is not umounted + * cleanly. Finally, mark the quotas out of date so Windows rescans + * the volume on boot and updates them. + * + * When remounting read-only, mark the volume clean if no volume errors + * have occurred. + */ + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + static const char *es = ". Cannot remount read-write."; + + /* Remounting read-write. */ + if (NVolErrors(vol)) { + ntfs_error(sb, "Volume has errors and is read-only%s", + es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_IS_DIRTY) { + ntfs_error(sb, "Volume is dirty and read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + ntfs_error(sb, "Volume has been modified by chkdsk " + "and is read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + ntfs_error(sb, "Volume has unsupported flags set " + "(0x%x) and is read-only%s", + (unsigned)le16_to_cpu(vol->vol_flags), + es); + return -EROFS; + } + if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + ntfs_error(sb, "Failed to set dirty bit in volume " + "information flags%s", es); + return -EROFS; + } +#if 0 + // TODO: Enable this code once we start modifying anything that + // is different between NTFS 1.2 and 3.x... + /* Set NT4 compatibility flag on newer NTFS version volumes. 
*/ + if ((vol->major_ver > 1)) { + if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + ntfs_error(sb, "Failed to set NT4 " + "compatibility flag%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } +#endif + if (!ntfs_empty_logfile(vol->logfile_ino)) { + ntfs_error(sb, "Failed to empty journal $LogFile%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_mark_quotas_out_of_date(vol)) { + ntfs_error(sb, "Failed to mark quotas out of date%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_stamp_usnjrnl(vol)) { + ntfs_error(sb, "Failed to stamp transation log " + "($UsnJrnl)%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { + /* Remounting read-only. */ + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + // TODO: Deal with *flags. + + if (!parse_options(vol, opt)) + return -EINVAL; + + ntfs_debug("Done."); + return 0; +} + +/** + * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector + * @sb: Super block of the device to which @b belongs. + * @b: Boot sector of device @sb to check. + * @silent: If 'true', all output will be silenced. + * + * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot + * sector. Returns 'true' if it is valid and 'false' if not. + * + * @sb is only needed for warning/error output, i.e. it can be NULL when silent + * is 'true'. + */ +static bool is_boot_sector_ntfs(const struct super_block *sb, + const NTFS_BOOT_SECTOR *b, const bool silent) +{ + /* + * Check that checksum == sum of u32 values from b to the checksum + * field. If checksum is zero, no checking is done. We will work when + * the checksum test fails, since some utilities update the boot sector + * ignoring the checksum which leaves the checksum out-of-date. We + * report a warning if this is the case. + */ + if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { + le32 *u; + u32 i; + + for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) + i += le32_to_cpup(u); + if (le32_to_cpu(b->checksum) != i) + ntfs_warning(sb, "Invalid boot sector checksum."); + } + /* Check OEMidentifier is "NTFS " */ + if (b->oem_id != magicNTFS) + goto not_ntfs; + /* Check bytes per sector value is between 256 and 4096. */ + if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 || + le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) + goto not_ntfs; + /* Check sectors per cluster value is valid. */ + switch (b->bpb.sectors_per_cluster) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: + break; + default: + goto not_ntfs; + } + /* Check the cluster size is not above the maximum (64kiB). */ + if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * + b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) + goto not_ntfs; + /* Check reserved/unused fields are really zero. */ + if (le16_to_cpu(b->bpb.reserved_sectors) || + le16_to_cpu(b->bpb.root_entries) || + le16_to_cpu(b->bpb.sectors) || + le16_to_cpu(b->bpb.sectors_per_fat) || + le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats) + goto not_ntfs; + /* Check clusters per file mft record value is valid. 
*/ + if ((u8)b->clusters_per_mft_record < 0xe1 || + (u8)b->clusters_per_mft_record > 0xf7) + switch (b->clusters_per_mft_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* Check clusters per index block value is valid. */ + if ((u8)b->clusters_per_index_record < 0xe1 || + (u8)b->clusters_per_index_record > 0xf7) + switch (b->clusters_per_index_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* + * Check for valid end of sector marker. We will work without it, but + * many BIOSes will refuse to boot from a bootsector if the magic is + * incorrect, so we emit a warning. + */ + if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) + ntfs_warning(sb, "Invalid end of sector marker."); + return true; +not_ntfs: + return false; +} + +/** + * read_ntfs_boot_sector - read the NTFS boot sector of a device + * @sb: super block of device to read the boot sector from + * @silent: if true, suppress all output + * + * Reads the boot sector from the device and validates it. If that fails, tries + * to read the backup boot sector, first from the end of the device a-la NT4 and + * later and then from the middle of the device a-la NT3.51 and before. + * + * If a valid boot sector is found but it is not the primary boot sector, we + * repair the primary boot sector silently (unless the device is read-only or + * the primary boot sector is not accessible). + * + * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super + * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized + * to their respective values. + * + * Return the unlocked buffer head containing the boot sector or NULL on error. + */ +static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, + const int silent) +{ + const char *read_err_str = "Unable to read %s boot sector."; + struct buffer_head *bh_primary, *bh_backup; + sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; + + /* Try to read primary boot sector. */ + if ((bh_primary = sb_bread(sb, 0))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_primary->b_data, silent)) + return bh_primary; + if (!silent) + ntfs_error(sb, "Primary boot sector is invalid."); + } else if (!silent) + ntfs_error(sb, read_err_str, "primary"); + if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) { + if (bh_primary) + brelse(bh_primary); + if (!silent) + ntfs_error(sb, "Mount option errors=recover not used. " + "Aborting without trying to recover."); + return NULL; + } + /* Try to read NT4+ backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks - 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* Try to read NT3.51- backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + if (!silent) + ntfs_error(sb, "Could not find a valid backup boot " + "sector."); + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* We failed. Cleanup and return. 
*/ + if (bh_primary) + brelse(bh_primary); + return NULL; +hotfix_primary_boot_sector: + if (bh_primary) { + /* + * If we managed to read sector zero and the volume is not + * read-only, copy the found, valid backup boot sector to the + * primary boot sector. Note we only copy the actual boot + * sector structure, not the actual whole device sector as that + * may be bigger and would potentially damage the $Boot system + * file (FIXME: Would be nice to know if the backup boot sector + * on a large sector device contains the whole boot loader or + * just the first 512 bytes). + */ + if (!(sb->s_flags & MS_RDONLY)) { + ntfs_warning(sb, "Hot-fix: Recovering invalid primary " + "boot sector from backup copy."); + memcpy(bh_primary->b_data, bh_backup->b_data, + NTFS_BLOCK_SIZE); + mark_buffer_dirty(bh_primary); + sync_dirty_buffer(bh_primary); + if (buffer_uptodate(bh_primary)) { + brelse(bh_backup); + return bh_primary; + } + ntfs_error(sb, "Hot-fix: Device write error while " + "recovering primary boot sector."); + } else { + ntfs_warning(sb, "Hot-fix: Recovery of primary boot " + "sector failed: Read-only mount."); + } + brelse(bh_primary); + } + ntfs_warning(sb, "Using backup boot sector."); + return bh_backup; +} + +/** + * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol + * @vol: volume structure to initialise with data from boot sector + * @b: boot sector to parse + * + * Parse the ntfs boot sector @b and store all imporant information therein in + * the ntfs super block @vol. Return 'true' on success and 'false' on error. + */ +static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) +{ + unsigned int sectors_per_cluster_bits, nr_hidden_sects; + int clusters_per_mft_record, clusters_per_index_record; + s64 ll; + + vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector); + vol->sector_size_bits = ffs(vol->sector_size) - 1; + ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size, + vol->sector_size); + ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, + vol->sector_size_bits); + if (vol->sector_size < vol->sb->s_blocksize) { + ntfs_error(vol->sb, "Sector size (%i) is smaller than the " + "device block size (%lu). This is not " + "supported. Sorry.", vol->sector_size, + vol->sb->s_blocksize); + return false; + } + ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); + sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; + ntfs_debug("sectors_per_cluster_bits = 0x%x", + sectors_per_cluster_bits); + nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors); + ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects); + vol->cluster_size = vol->sector_size << sectors_per_cluster_bits; + vol->cluster_size_mask = vol->cluster_size - 1; + vol->cluster_size_bits = ffs(vol->cluster_size) - 1; + ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, + vol->cluster_size); + ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); + ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); + if (vol->cluster_size < vol->sector_size) { + ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " + "sector size (%i). This is not supported. 
" + "Sorry.", vol->cluster_size, vol->sector_size); + return false; + } + clusters_per_mft_record = b->clusters_per_mft_record; + ntfs_debug("clusters_per_mft_record = %i (0x%x)", + clusters_per_mft_record, clusters_per_mft_record); + if (clusters_per_mft_record > 0) + vol->mft_record_size = vol->cluster_size << + (ffs(clusters_per_mft_record) - 1); + else + /* + * When mft_record_size < cluster_size, clusters_per_mft_record + * = -log2(mft_record_size) bytes. mft_record_size normaly is + * 1024 bytes, which is encoded as 0xF6 (-10 in decimal). + */ + vol->mft_record_size = 1 << -clusters_per_mft_record; + vol->mft_record_size_mask = vol->mft_record_size - 1; + vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1; + ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size, + vol->mft_record_size); + ntfs_debug("vol->mft_record_size_mask = 0x%x", + vol->mft_record_size_mask); + ntfs_debug("vol->mft_record_size_bits = %i (0x%x)", + vol->mft_record_size_bits, vol->mft_record_size_bits); + /* + * We cannot support mft record sizes above the PAGE_CACHE_SIZE since + * we store $MFT/$DATA, the table of mft records in the page cache. + */ + if (vol->mft_record_size > PAGE_CACHE_SIZE) { + ntfs_error(vol->sb, "Mft record size (%i) exceeds the " + "PAGE_CACHE_SIZE on your system (%lu). " + "This is not supported. Sorry.", + vol->mft_record_size, PAGE_CACHE_SIZE); + return false; + } + /* We cannot support mft record sizes below the sector size. */ + if (vol->mft_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->mft_record_size, + vol->sector_size); + return false; + } + clusters_per_index_record = b->clusters_per_index_record; + ntfs_debug("clusters_per_index_record = %i (0x%x)", + clusters_per_index_record, clusters_per_index_record); + if (clusters_per_index_record > 0) + vol->index_record_size = vol->cluster_size << + (ffs(clusters_per_index_record) - 1); + else + /* + * When index_record_size < cluster_size, + * clusters_per_index_record = -log2(index_record_size) bytes. + * index_record_size normaly equals 4096 bytes, which is + * encoded as 0xF4 (-12 in decimal). + */ + vol->index_record_size = 1 << -clusters_per_index_record; + vol->index_record_size_mask = vol->index_record_size - 1; + vol->index_record_size_bits = ffs(vol->index_record_size) - 1; + ntfs_debug("vol->index_record_size = %i (0x%x)", + vol->index_record_size, vol->index_record_size); + ntfs_debug("vol->index_record_size_mask = 0x%x", + vol->index_record_size_mask); + ntfs_debug("vol->index_record_size_bits = %i (0x%x)", + vol->index_record_size_bits, + vol->index_record_size_bits); + /* We cannot support index record sizes below the sector size. */ + if (vol->index_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Index record size (%i) is smaller than " + "the sector size (%i). This is not " + "supported. Sorry.", vol->index_record_size, + vol->sector_size); + return false; + } + /* + * Get the size of the volume in clusters and check for 64-bit-ness. + * Windows currently only uses 32 bits to save the clusters so we do + * the same as it is much faster on 32-bit CPUs. + */ + ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; + if ((u64)ll >= 1ULL << 32) { + ntfs_error(vol->sb, "Cannot handle 64-bit clusters. 
Sorry."); + return false; + } + vol->nr_clusters = ll; + ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters); + /* + * On an architecture where unsigned long is 32-bits, we restrict the + * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler + * will hopefully optimize the whole check away. + */ + if (sizeof(unsigned long) < 8) { + if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) { + ntfs_error(vol->sb, "Volume size (%lluTiB) is too " + "large for this architecture. " + "Maximum supported is 2TiB. Sorry.", + (unsigned long long)ll >> (40 - + vol->cluster_size_bits)); + return false; + } + } + ll = sle64_to_cpu(b->mft_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " + "volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mft_lcn = ll; + ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); + ll = sle64_to_cpu(b->mftmirr_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " + "of volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mftmirr_lcn = ll; + ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn); +#ifdef NTFS_RW + /* + * Work out the size of the mft mirror in number of mft records. If the + * cluster size is less than or equal to the size taken by four mft + * records, the mft mirror stores the first four mft records. If the + * cluster size is bigger than the size taken by four mft records, the + * mft mirror contains as many mft records as will fit into one + * cluster. + */ + if (vol->cluster_size <= (4 << vol->mft_record_size_bits)) + vol->mftmirr_size = 4; + else + vol->mftmirr_size = vol->cluster_size >> + vol->mft_record_size_bits; + ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size); +#endif /* NTFS_RW */ + vol->serial_no = le64_to_cpu(b->volume_serial_number); + ntfs_debug("vol->serial_no = 0x%llx", + (unsigned long long)vol->serial_no); + return true; +} + +/** + * ntfs_setup_allocators - initialize the cluster and mft allocators + * @vol: volume structure for which to setup the allocators + * + * Setup the cluster (lcn) and mft allocators to the starting values. + */ +static void ntfs_setup_allocators(ntfs_volume *vol) +{ +#ifdef NTFS_RW + LCN mft_zone_size, mft_lcn; +#endif /* NTFS_RW */ + + ntfs_debug("vol->mft_zone_multiplier = 0x%x", + vol->mft_zone_multiplier); +#ifdef NTFS_RW + /* Determine the size of the MFT zone. */ + mft_zone_size = vol->nr_clusters; + switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */ + case 4: + mft_zone_size >>= 1; /* 50% */ + break; + case 3: + mft_zone_size = (mft_zone_size + + (mft_zone_size >> 1)) >> 2; /* 37.5% */ + break; + case 2: + mft_zone_size >>= 2; /* 25% */ + break; + /* case 1: */ + default: + mft_zone_size >>= 3; /* 12.5% */ + break; + } + /* Setup the mft zone. */ + vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn; + ntfs_debug("vol->mft_zone_pos = 0x%llx", + (unsigned long long)vol->mft_zone_pos); + /* + * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs + * source) and if the actual mft_lcn is in the expected place or even + * further to the front of the volume, extend the mft_zone to cover the + * beginning of the volume as well. This is in order to protect the + * area reserved for the mft bitmap as well within the mft_zone itself. 
+ * On non-standard volumes we do not protect it as the overhead would + * be higher than the speed increase we would get by doing it. + */ + mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size; + if (mft_lcn * vol->cluster_size < 16 * 1024) + mft_lcn = (16 * 1024 + vol->cluster_size - 1) / + vol->cluster_size; + if (vol->mft_zone_start <= mft_lcn) + vol->mft_zone_start = 0; + ntfs_debug("vol->mft_zone_start = 0x%llx", + (unsigned long long)vol->mft_zone_start); + /* + * Need to cap the mft zone on non-standard volumes so that it does + * not point outside the boundaries of the volume. We do this by + * halving the zone size until we are inside the volume. + */ + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + while (vol->mft_zone_end >= vol->nr_clusters) { + mft_zone_size >>= 1; + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + } + ntfs_debug("vol->mft_zone_end = 0x%llx", + (unsigned long long)vol->mft_zone_end); + /* + * Set the current position within each data zone to the start of the + * respective zone. + */ + vol->data1_zone_pos = vol->mft_zone_end; + ntfs_debug("vol->data1_zone_pos = 0x%llx", + (unsigned long long)vol->data1_zone_pos); + vol->data2_zone_pos = 0; + ntfs_debug("vol->data2_zone_pos = 0x%llx", + (unsigned long long)vol->data2_zone_pos); + + /* Set the mft data allocation position to mft record 24. */ + vol->mft_data_pos = 24; + ntfs_debug("vol->mft_data_pos = 0x%llx", + (unsigned long long)vol->mft_data_pos); +#endif /* NTFS_RW */ +} + +#ifdef NTFS_RW + +/** + * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume + * @vol: ntfs super block describing device whose mft mirror to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_mft_mirror(ntfs_volume *vol) +{ + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + + ntfs_debug("Entering."); + /* Get mft mirror inode. */ + tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + /* + * Re-initialize some specifics about $MFTMirr's inode as + * ntfs_read_inode() will have set up the default ones. + */ + /* Set uid and gid to root. */ + tmp_ino->i_uid = tmp_ino->i_gid = 0; + /* Regular file. No access for anyone. */ + tmp_ino->i_mode = S_IFREG; + /* No VFS initiated operations allowed for $MFTMirr. */ + tmp_ino->i_op = &ntfs_empty_inode_ops; + tmp_ino->i_fop = &ntfs_empty_file_ops; + /* Put in our special address space operations. */ + tmp_ino->i_mapping->a_ops = &ntfs_mst_aops; + tmp_ni = NTFS_I(tmp_ino); + /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ + NInoSetMstProtected(tmp_ni); + NInoSetSparseDisabled(tmp_ni); + /* + * Set up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + tmp_ni->itype.index.block_size = vol->mft_record_size; + tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits; + vol->mftmirr_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * check_mft_mirror - compare contents of the mft mirror with the mft + * @vol: ntfs super block describing device whose mft mirror to check + * + * Return 'true' on success or 'false' on error. + * + * Note, this function also results in the mft mirror runlist being completely + * mapped into memory. The mft mirror write code requires this and will BUG() + * should it find an unmapped runlist element. 
+ */ +static bool check_mft_mirror(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + ntfs_inode *mirr_ni; + struct page *mft_page, *mirr_page; + u8 *kmft, *kmirr; + runlist_element *rl, rl2[2]; + pgoff_t index; + int mrecs_per_page, i; + + ntfs_debug("Entering."); + /* Compare contents of $MFT and $MFTMirr. */ + mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size; + BUG_ON(!mrecs_per_page); + BUG_ON(!vol->mftmirr_size); + mft_page = mirr_page = NULL; + kmft = kmirr = NULL; + index = i = 0; + do { + u32 bytes; + + /* Switch pages if necessary. */ + if (!(i % mrecs_per_page)) { + if (index) { + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + } + /* Get the $MFT page. */ + mft_page = ntfs_map_page(vol->mft_ino->i_mapping, + index); + if (IS_ERR(mft_page)) { + ntfs_error(sb, "Failed to read $MFT."); + return false; + } + kmft = page_address(mft_page); + /* Get the $MFTMirr page. */ + mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping, + index); + if (IS_ERR(mirr_page)) { + ntfs_error(sb, "Failed to read $MFTMirr."); + goto mft_unmap_out; + } + kmirr = page_address(mirr_page); + ++index; + } + /* Do not check the record if it is not in use. */ + if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) { + /* Make sure the record is ok. */ + if (ntfs_is_baad_recordp((le32*)kmft)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "record %i.", i); +mm_unmap_out: + ntfs_unmap_page(mirr_page); +mft_unmap_out: + ntfs_unmap_page(mft_page); + return false; + } + } + /* Do not check the mirror record if it is not in use. */ + if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) { + if (ntfs_is_baad_recordp((le32*)kmirr)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "mirror record %i.", i); + goto mm_unmap_out; + } + } + /* Get the amount of data in the current record. */ + bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmft)) { + bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmirr)) + bytes = vol->mft_record_size; + } + /* Compare the two records. */ + if (memcmp(kmft, kmirr, bytes)) { + ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not " + "match. Run ntfsfix or chkdsk.", i); + goto mm_unmap_out; + } + kmft += vol->mft_record_size; + kmirr += vol->mft_record_size; + } while (++i < vol->mftmirr_size); + /* Release the last pages. */ + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + + /* Construct the mft mirror runlist by hand. */ + rl2[0].vcn = 0; + rl2[0].lcn = vol->mftmirr_lcn; + rl2[0].length = (vol->mftmirr_size * vol->mft_record_size + + vol->cluster_size - 1) / vol->cluster_size; + rl2[1].vcn = rl2[0].length; + rl2[1].lcn = LCN_ENOENT; + rl2[1].length = 0; + /* + * Because we have just read all of the mft mirror, we know we have + * mapped the full runlist for it. + */ + mirr_ni = NTFS_I(vol->mftmirr_ino); + down_read(&mirr_ni->runlist.lock); + rl = mirr_ni->runlist.rl; + /* Compare the two runlists. They must be identical. */ + i = 0; + do { + if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn || + rl2[i].length != rl[i].length) { + ntfs_error(sb, "$MFTMirr location mismatch. 
" + "Run chkdsk."); + up_read(&mirr_ni->runlist.lock); + return false; + } + } while (rl2[i++].length); + up_read(&mirr_ni->runlist.lock); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_check_logfile - load and check the logfile inode for a volume + * @vol: ntfs super block describing device whose logfile to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_check_logfile(ntfs_volume *vol, + RESTART_PAGE_HEADER **rp) +{ + struct inode *tmp_ino; + + ntfs_debug("Entering."); + tmp_ino = ntfs_iget(vol->sb, FILE_LogFile); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + if (!ntfs_check_logfile(tmp_ino, rp)) { + iput(tmp_ino); + /* ntfs_check_logfile() will have displayed error output. */ + return false; + } + NInoSetSparseDisabled(NTFS_I(tmp_ino)); + vol->logfile_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +#define NTFS_HIBERFIL_HEADER_SIZE 4096 + +/** + * check_windows_hibernation_status - check if Windows is suspended on a volume + * @vol: ntfs super block of device to check + * + * Check if Windows is hibernated on the ntfs volume @vol. This is done by + * looking for the file hiberfil.sys in the root directory of the volume. If + * the file is not present Windows is definitely not suspended. + * + * If hiberfil.sys exists and is less than 4kiB in size it means Windows is + * definitely suspended (this volume is not the system volume). Caveat: on a + * system with many volumes it is possible that the < 4kiB check is bogus but + * for now this should do fine. + * + * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the + * hiberfil header (which is the first 4kiB). If this begins with "hibr", + * Windows is definitely suspended. If it is completely full of zeroes, + * Windows is definitely not hibernated. Any other case is treated as if + * Windows is suspended. This caters for the above mentioned caveat of a + * system with many volumes where no "hibr" magic would be present and there is + * no zero header. + * + * Return 0 if Windows is not hibernated on the volume, >0 if Windows is + * hibernated on the volume, and -errno on error. + */ +static int check_windows_hibernation_status(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *vi; + ntfs_inode *ni; + struct page *page; + u32 *kaddr, *kend; + ntfs_name *name = NULL; + int ret = 1; + static const ntfschar hiberfil[13] = { cpu_to_le16('h'), + cpu_to_le16('i'), cpu_to_le16('b'), + cpu_to_le16('e'), cpu_to_le16('r'), + cpu_to_le16('f'), cpu_to_le16('i'), + cpu_to_le16('l'), cpu_to_le16('.'), + cpu_to_le16('s'), cpu_to_le16('y'), + cpu_to_le16('s'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the hibernation file by looking up the + * filename hiberfil.sys in the root directory. + */ + mutex_lock(&vol->root_ino->i_mutex); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, + &name); + mutex_unlock(&vol->root_ino->i_mutex); + if (IS_ERR_MREF(mref)) { + ret = MREF_ERR(mref); + /* If the file does not exist, Windows is not hibernated. */ + if (ret == -ENOENT) { + ntfs_debug("hiberfil.sys not present. Windows is not " + "hibernated on the volume."); + return 0; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "hiberfil.sys."); + return ret; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. 
*/ + vi = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(vi) || is_bad_inode(vi)) { + if (!IS_ERR(vi)) + iput(vi); + ntfs_error(vol->sb, "Failed to load hiberfil.sys."); + return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; + } + if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { + ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " + "Windows is hibernated on the volume. This " + "is not the system volume.", i_size_read(vi)); + goto iput_out; + } + ni = NTFS_I(vi); + page = ntfs_map_page(vi->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); + ret = PTR_ERR(page); + goto iput_out; + } + kaddr = (u32*)page_address(page); + if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { + ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " + "hibernated on the volume. This is the " + "system volume."); + goto unm_iput_out; + } + kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); + do { + if (unlikely(*kaddr)) { + ntfs_debug("hiberfil.sys is larger than 4kiB " + "(0x%llx), does not contain the " + "\"hibr\" magic, and does not have a " + "zero header. Windows is hibernated " + "on the volume. This is not the " + "system volume.", i_size_read(vi)); + goto unm_iput_out; + } + } while (++kaddr < kend); + ntfs_debug("hiberfil.sys contains a zero header. Windows is not " + "hibernated on the volume. This is the system " + "volume."); + ret = 0; +unm_iput_out: + ntfs_unmap_page(page); +iput_out: + iput(vi); + return ret; +} + +/** + * load_and_init_quota - load and setup the quota file for a volume if present + * @vol: ntfs super block describing device whose quota file to load + * + * Return 'true' on success or 'false' on error. If $Quota is not present, we + * leave vol->quota_ino as NULL and return success. + */ +static bool load_and_init_quota(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_name *name = NULL; + static const ntfschar Quota[7] = { cpu_to_le16('$'), + cpu_to_le16('Q'), cpu_to_le16('u'), + cpu_to_le16('o'), cpu_to_le16('t'), + cpu_to_le16('a'), 0 }; + static ntfschar Q[3] = { cpu_to_le16('$'), + cpu_to_le16('Q'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the quota file by looking up the filename + * $Quota in the extended system files directory $Extend. + */ + mutex_lock(&vol->extend_ino->i_mutex); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, + &name); + mutex_unlock(&vol->extend_ino->i_mutex); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, quotas are disabled and have + * never been enabled on this volume, just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$Quota not present. Volume does not have " + "quotas enabled."); + /* + * No need to try to set quotas out of date if they are + * not enabled. + */ + NVolSetQuotaOutOfDate(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for $Quota."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $Quota."); + return false; + } + vol->quota_ino = tmp_ino; + /* Get the $Q index allocation attribute. 
*/ + tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $Quota/$Q index."); + return false; + } + vol->quota_q_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_usnjrnl - load and setup the transaction log if present + * @vol: ntfs super block describing device whose usnjrnl file to load + * + * Return 'true' on success or 'false' on error. + * + * If $UsnJrnl is not present or in the process of being disabled, we set + * NVolUsnJrnlStamped() and return success. + * + * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, + * i.e. transaction logging has only just been enabled or the journal has been + * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() + * and return success. + */ +static bool load_and_init_usnjrnl(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + struct page *page; + ntfs_name *name = NULL; + USN_HEADER *uh; + static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), + cpu_to_le16('U'), cpu_to_le16('s'), + cpu_to_le16('n'), cpu_to_le16('J'), + cpu_to_le16('r'), cpu_to_le16('n'), + cpu_to_le16('l'), 0 }; + static ntfschar Max[5] = { cpu_to_le16('$'), + cpu_to_le16('M'), cpu_to_le16('a'), + cpu_to_le16('x'), 0 }; + static ntfschar J[3] = { cpu_to_le16('$'), + cpu_to_le16('J'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the transaction log file by looking up the + * filename $UsnJrnl in the extended system files directory $Extend. + */ + mutex_lock(&vol->extend_ino->i_mutex); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, + &name); + mutex_unlock(&vol->extend_ino->i_mutex); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, transaction logging is disabled, + * just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$UsnJrnl not present. Volume does not " + "have transaction logging enabled."); +not_enabled: + /* + * No need to try to stamp the transaction log if + * transaction logging is not enabled. + */ + NVolSetUsnJrnlStamped(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "$UsnJrnl."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $UsnJrnl."); + return false; + } + vol->usnjrnl_ino = tmp_ino; + /* + * If the transaction log is in the process of being deleted, we can + * ignore it. + */ + if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { + ntfs_debug("$UsnJrnl in the process of being disabled. " + "Volume does not have transaction logging " + "enabled."); + goto not_enabled; + } + /* Get the $DATA/$Max attribute. */ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + vol->usnjrnl_max_ino = tmp_ino; + if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { + ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " + "attribute (size is 0x%llx but should be at " + "least 0x%zx bytes).", i_size_read(tmp_ino), + sizeof(USN_HEADER)); + return false; + } + /* Get the $DATA/$J attribute. 
*/ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " + "attribute."); + return false; + } + vol->usnjrnl_j_ino = tmp_ino; + /* Verify $J is non-resident and sparse. */ + tmp_ni = NTFS_I(vol->usnjrnl_j_ino); + if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { + ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " + "and/or not sparse."); + return false; + } + /* Read the USN_HEADER from $DATA/$Max. */ + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + uh = (USN_HEADER*)page_address(page); + /* Sanity check the $Max. */ + if (unlikely(sle64_to_cpu(uh->allocation_delta) > + sle64_to_cpu(uh->maximum_size))) { + ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " + "maximum size (0x%llx). $UsnJrnl is corrupt.", + (long long)sle64_to_cpu(uh->allocation_delta), + (long long)sle64_to_cpu(uh->maximum_size)); + ntfs_unmap_page(page); + return false; + } + /* + * If the transaction log has been stamped and nothing has been written + * to it since, we do not need to stamp it. + */ + if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= + i_size_read(vol->usnjrnl_j_ino))) { + if (likely(sle64_to_cpu(uh->lowest_valid_usn) == + i_size_read(vol->usnjrnl_j_ino))) { + ntfs_unmap_page(page); + ntfs_debug("$UsnJrnl is enabled but nothing has been " + "logged since it was last stamped. " + "Treating this as if the volume does " + "not have transaction logging " + "enabled."); + goto not_enabled; + } + ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " + "which is out of bounds (0x%llx). $UsnJrnl " + "is corrupt.", + (long long)sle64_to_cpu(uh->lowest_valid_usn), + i_size_read(vol->usnjrnl_j_ino)); + ntfs_unmap_page(page); + return false; + } + ntfs_unmap_page(page); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_attrdef - load the attribute definitions table for a volume + * @vol: ntfs super block describing device whose attrdef to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_attrdef(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + + ntfs_debug("Entering."); + /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */ + ino = ntfs_iget(sb, FILE_AttrDef); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto failed; + } + NInoSetSparseDisabled(NTFS_I(ino)); + /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */ + i_size = i_size_read(ino); + if (i_size <= 0 || i_size > 0x7fffffff) + goto iput_failed; + vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); + if (!vol->attrdef) + goto iput_failed; + index = 0; + max_index = i_size >> PAGE_CACHE_SHIFT; + size = PAGE_CACHE_SIZE; + while (index < max_index) { + /* Read the attrdef table and copy it into the linear buffer. 
*/ +read_partial_attrdef_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto free_iput_failed; + memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + }; + if (size == PAGE_CACHE_SIZE) { + size = i_size & ~PAGE_CACHE_MASK; + if (size) + goto read_partial_attrdef_page; + } + vol->attrdef_size = i_size; + ntfs_debug("Read %llu bytes from $AttrDef.", i_size); + iput(ino); + return true; +free_iput_failed: + ntfs_free(vol->attrdef); + vol->attrdef = NULL; +iput_failed: + iput(ino); +failed: + ntfs_error(sb, "Failed to initialize attribute definition table."); + return false; +} + +#endif /* NTFS_RW */ + +/** + * load_and_init_upcase - load the upcase table for an ntfs volume + * @vol: ntfs super block describing device whose upcase to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_upcase(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + int i, max; + + ntfs_debug("Entering."); + /* Read upcase table and setup vol->upcase and vol->upcase_len. */ + ino = ntfs_iget(sb, FILE_UpCase); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto upcase_failed; + } + /* + * The upcase size must not be above 64k Unicode characters, must not + * be zero and must be a multiple of sizeof(ntfschar). + */ + i_size = i_size_read(ino); + if (!i_size || i_size & (sizeof(ntfschar) - 1) || + i_size > 64ULL * 1024 * sizeof(ntfschar)) + goto iput_upcase_failed; + vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); + if (!vol->upcase) + goto iput_upcase_failed; + index = 0; + max_index = i_size >> PAGE_CACHE_SHIFT; + size = PAGE_CACHE_SIZE; + while (index < max_index) { + /* Read the upcase table and copy it into the linear buffer. */ +read_partial_upcase_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto iput_upcase_failed; + memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + }; + if (size == PAGE_CACHE_SIZE) { + size = i_size & ~PAGE_CACHE_MASK; + if (size) + goto read_partial_upcase_page; + } + vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; + ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", + i_size, 64 * 1024 * sizeof(ntfschar)); + iput(ino); + mutex_lock(&ntfs_lock); + if (!default_upcase) { + ntfs_debug("Using volume specified $UpCase since default is " + "not present."); + mutex_unlock(&ntfs_lock); + return true; + } + max = default_upcase_len; + if (max > vol->upcase_len) + max = vol->upcase_len; + for (i = 0; i < max; i++) + if (vol->upcase[i] != default_upcase[i]) + break; + if (i == max) { + ntfs_free(vol->upcase); + vol->upcase = default_upcase; + vol->upcase_len = max; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_debug("Volume specified $UpCase matches default. Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_debug("Using volume specified $UpCase since it does not match " + "the default."); + return true; +iput_upcase_failed: + iput(ino); + ntfs_free(vol->upcase); + vol->upcase = NULL; +upcase_failed: + mutex_lock(&ntfs_lock); + if (default_upcase) { + vol->upcase = default_upcase; + vol->upcase_len = default_upcase_len; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to load $UpCase from the volume. 
Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to initialize upcase table."); + return false; +} + +/* + * The lcn and mft bitmap inodes are NTFS-internal inodes with + * their own special locking rules: + */ +static struct lock_class_key + lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key, + mftbmp_runlist_lock_key, mftbmp_mrec_lock_key; + +/** + * load_system_files - open the system files using normal functions + * @vol: ntfs super block describing device whose system files to load + * + * Open the system files with normal access functions and complete setting up + * the ntfs super block @vol. + * + * Return 'true' on success or 'false' on error. + */ +static bool load_system_files(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; +#ifdef NTFS_RW + RESTART_PAGE_HEADER *rp; + int err; +#endif /* NTFS_RW */ + + ntfs_debug("Entering."); +#ifdef NTFS_RW + /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */ + if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) { + static const char *es1 = "Failed to load $MFTMirr"; + static const char *es2 = "$MFTMirr does not match $MFT"; + static const char *es3 = ". Run ntfsfix and/or chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + !vol->mftmirr_ino ? es1 : es2, + es3); + goto iput_mirr_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", + !vol->mftmirr_ino ? es1 : es2, es3); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", + !vol->mftmirr_ino ? es1 : es2, es3); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* Get mft bitmap attribute inode. */ + vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0); + if (IS_ERR(vol->mftbmp_ino)) { + ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute."); + goto iput_mirr_err_out; + } + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock, + &mftbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock, + &mftbmp_mrec_lock_key); + /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ + if (!load_and_init_upcase(vol)) + goto iput_mftbmp_err_out; +#ifdef NTFS_RW + /* + * Read attribute definitions table and setup @vol->attrdef and + * @vol->attrdef_size. + */ + if (!load_and_init_attrdef(vol)) + goto iput_upcase_err_out; +#endif /* NTFS_RW */ + /* + * Get the cluster allocation bitmap inode and verify the size, no + * need for any locking at this stage as we are already running + * exclusively as we are mount in progress task. 
+ */ + vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap); + if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) { + if (!IS_ERR(vol->lcnbmp_ino)) + iput(vol->lcnbmp_ino); + goto bitmap_failed; + } + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock, + &lcnbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock, + &lcnbmp_mrec_lock_key); + + NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); + if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { + iput(vol->lcnbmp_ino); +bitmap_failed: + ntfs_error(sb, "Failed to load $Bitmap."); + goto iput_attrdef_err_out; + } + /* + * Get the volume inode and setup our cache of the volume flags and + * version. + */ + vol->vol_ino = ntfs_iget(sb, FILE_Volume); + if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) { + if (!IS_ERR(vol->vol_ino)) + iput(vol->vol_ino); +volume_failed: + ntfs_error(sb, "Failed to load $Volume."); + goto iput_lcnbmp_err_out; + } + m = map_mft_record(NTFS_I(vol->vol_ino)); + if (IS_ERR(m)) { +iput_volume_failed: + iput(vol->vol_ino); + goto volume_failed; + } + if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) { + ntfs_error(sb, "Failed to get attribute search context."); + goto get_ctx_vol_failed; + } + if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx) || ctx->attr->non_resident || ctx->attr->flags) { +err_put_vol: + ntfs_attr_put_search_ctx(ctx); +get_ctx_vol_failed: + unmap_mft_record(NTFS_I(vol->vol_ino)); + goto iput_volume_failed; + } + vi = (VOLUME_INFORMATION*)((char*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Some bounds checks. */ + if ((u8*)vi < (u8*)ctx->attr || (u8*)vi + + le32_to_cpu(ctx->attr->data.resident.value_length) > + (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) + goto err_put_vol; + /* Copy the volume flags and version to the ntfs_volume structure. */ + vol->vol_flags = vi->flags; + vol->major_ver = vi->major_ver; + vol->minor_ver = vi->minor_ver; + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(NTFS_I(vol->vol_ino)); + printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, + vol->minor_ver); + if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { + ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " + "volume version %i.%i (need at least version " + "3.0).", vol->major_ver, vol->minor_ver); + NVolClearSparseEnabled(vol); + } +#ifdef NTFS_RW + /* Make sure that no unsupported volume flags are set. */ + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + static const char *es1a = "Volume is dirty"; + static const char *es1b = "Volume has been modified by chkdsk"; + static const char *es1c = "Volume has unsupported flags set"; + static const char *es2a = ". Run chkdsk and mount in Windows."; + static const char *es2b = ". Mount in Windows."; + const char *es1, *es2; + + es2 = es2a; + if (vol->vol_flags & VOLUME_IS_DIRTY) + es1 = es1a; + else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + es1 = es1b; + es2 = es2b; + } else { + es1 = es1c; + ntfs_warning(sb, "Unsupported volume flags 0x%x " + "encountered.", + (unsigned)le16_to_cpu(vol->vol_flags)); + } + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_vol_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. 
Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* + * Do not set NVolErrors() because ntfs_remount() re-checks the + * flags which we need to do in case any flags have changed. + */ + } + /* + * Get the inode for the logfile, check it and determine if the volume + * was shutdown cleanly. + */ + rp = NULL; + if (!load_and_check_logfile(vol, &rp) || + !ntfs_is_logfile_clean(vol->logfile_ino, rp)) { + static const char *es1a = "Failed to load $LogFile"; + static const char *es1b = "$LogFile is not clean"; + static const char *es2 = ". Mount in Windows."; + const char *es1; + + es1 = !vol->logfile_ino ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + if (vol->logfile_ino) { + BUG_ON(!rp); + ntfs_free(rp); + } + goto iput_logfile_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + ntfs_free(rp); +#endif /* NTFS_RW */ + /* Get the root directory inode so we can do path lookups. */ + vol->root_ino = ntfs_iget(sb, FILE_root); + if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { + if (!IS_ERR(vol->root_ino)) + iput(vol->root_ino); + ntfs_error(sb, "Failed to load root directory."); + goto iput_logfile_err_out; + } +#ifdef NTFS_RW + /* + * Check if Windows is suspended to disk on the target volume. If it + * is hibernated, we must not write *anything* to the disk so set + * NVolErrors() without setting the dirty volume flag and mount + * read-only. This will prevent read-write remounting and it will also + * prevent all writes. + */ + err = check_windows_hibernation_status(vol); + if (unlikely(err)) { + static const char *es1a = "Failed to determine if Windows is " + "hibernated"; + static const char *es1b = "Windows is hibernated"; + static const char *es2 = ". Run chkdsk."; + const char *es1; + + es1 = err < 0 ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the volume dirty. */ + if (!(sb->s_flags & MS_RDONLY) && + ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + static const char *es1 = "Failed to set dirty bit in volume " + "information flags"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. 
Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + /* + * Do not set NVolErrors() because ntfs_remount() might manage + * to set the dirty flag in which case all would be well. + */ + } +#if 0 + // TODO: Enable this code once we start modifying anything that is + // different between NTFS 1.2 and 3.x... + /* + * If (still) a read-write mount, set the NT4 compatibility flag on + * newer NTFS version volumes. + */ + if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) && + ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + static const char *es1 = "Failed to set NT4 compatibility flag"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } +#endif + /* If (still) a read-write mount, empty the logfile. */ + if (!(sb->s_flags & MS_RDONLY) && + !ntfs_empty_logfile(vol->logfile_ino)) { + static const char *es1 = "Failed to empty $LogFile"; + static const char *es2 = ". Mount in Windows."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* If on NTFS versions before 3.0, we are done. */ + if (unlikely(vol->major_ver < 3)) + return true; + /* NTFS 3.0+ specific initialization. */ + /* Get the security descriptors inode. */ + vol->secure_ino = ntfs_iget(sb, FILE_Secure); + if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) { + if (!IS_ERR(vol->secure_ino)) + iput(vol->secure_ino); + ntfs_error(sb, "Failed to load $Secure."); + goto iput_root_err_out; + } + // TODO: Initialize security. + /* Get the extended system files' directory inode. */ + vol->extend_ino = ntfs_iget(sb, FILE_Extend); + if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) { + if (!IS_ERR(vol->extend_ino)) + iput(vol->extend_ino); + ntfs_error(sb, "Failed to load $Extend."); + goto iput_sec_err_out; + } +#ifdef NTFS_RW + /* Find the quota file, load it if present, and set it up. */ + if (!load_and_init_quota(vol)) { + static const char *es1 = "Failed to load $Quota"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the quotas out of date. */ + if (!(sb->s_flags & MS_RDONLY) && + !ntfs_mark_quotas_out_of_date(vol)) { + static const char *es1 = "Failed to mark quotas out of date"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. 
*/ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } + /* + * Find the transaction log file ($UsnJrnl), load it if present, check + * it, and set it up. + */ + if (!load_and_init_usnjrnl(vol)) { + static const char *es1 = "Failed to load $UsnJrnl"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + sb->s_flags |= MS_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, stamp the transaction log. */ + if (!(sb->s_flags & MS_RDONLY) && !ntfs_stamp_usnjrnl(vol)) { + static const char *es1 = "Failed to stamp transaction log " + "($UsnJrnl)"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + return true; +#ifdef NTFS_RW +iput_usnjrnl_err_out: + if (vol->usnjrnl_j_ino) + iput(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + iput(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + iput(vol->usnjrnl_ino); +iput_quota_err_out: + if (vol->quota_q_ino) + iput(vol->quota_q_ino); + if (vol->quota_ino) + iput(vol->quota_ino); + iput(vol->extend_ino); +#endif /* NTFS_RW */ +iput_sec_err_out: + iput(vol->secure_ino); +iput_root_err_out: + iput(vol->root_ino); +iput_logfile_err_out: +#ifdef NTFS_RW + if (vol->logfile_ino) + iput(vol->logfile_ino); +iput_vol_err_out: +#endif /* NTFS_RW */ + iput(vol->vol_ino); +iput_lcnbmp_err_out: + iput(vol->lcnbmp_ino); +iput_attrdef_err_out: + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } +#ifdef NTFS_RW +iput_upcase_err_out: +#endif /* NTFS_RW */ + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } +iput_mftbmp_err_out: + iput(vol->mftbmp_ino); +iput_mirr_err_out: +#ifdef NTFS_RW + if (vol->mftmirr_ino) + iput(vol->mftmirr_ino); +#endif /* NTFS_RW */ + return false; +} + +/** + * ntfs_put_super - called by the vfs to unmount a volume + * @sb: vfs superblock of volume to unmount + * + * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when + * the volume is being unmounted (umount system call has been invoked) and it + * releases all inodes and memory belonging to the NTFS specific part of the + * super block. 
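+ *
+ * On a read-write mount all open system inodes are committed first and, if
+ * no volume errors have been recorded, the dirty bit is cleared in the
+ * volume information flags before the inodes are finally released.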
+ */ +static void ntfs_put_super(struct super_block *sb) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering."); + +#ifdef NTFS_RW + /* + * Commit all inodes while they are still open in case some of them + * cause others to be dirtied. + */ + ntfs_commit_inode(vol->vol_ino); + + /* NTFS 3.0+ specific. */ + if (vol->major_ver >= 3) { + if (vol->usnjrnl_j_ino) + ntfs_commit_inode(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + ntfs_commit_inode(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + ntfs_commit_inode(vol->usnjrnl_ino); + if (vol->quota_q_ino) + ntfs_commit_inode(vol->quota_q_ino); + if (vol->quota_ino) + ntfs_commit_inode(vol->quota_ino); + if (vol->extend_ino) + ntfs_commit_inode(vol->extend_ino); + if (vol->secure_ino) + ntfs_commit_inode(vol->secure_ino); + } + + ntfs_commit_inode(vol->root_ino); + + down_write(&vol->lcnbmp_lock); + ntfs_commit_inode(vol->lcnbmp_ino); + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + ntfs_commit_inode(vol->mftbmp_ino); + up_write(&vol->mftbmp_lock); + + if (vol->logfile_ino) + ntfs_commit_inode(vol->logfile_ino); + + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + + /* + * If a read-write mount and no volume errors have occurred, mark the + * volume clean. Also, re-commit all affected inodes. + */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + ntfs_commit_inode(vol->vol_ino); + ntfs_commit_inode(vol->root_ino); + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + } else { + ntfs_warning(sb, "Volume has errors. Leaving volume " + "marked dirty. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + iput(vol->vol_ino); + vol->vol_ino = NULL; + + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + + iput(vol->root_ino); + vol->root_ino = NULL; + + down_write(&vol->lcnbmp_lock); + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; + up_write(&vol->mftbmp_lock); + +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + /* Re-commit the mft mirror and mft just in case. */ + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } + /* + * We should have no dirty inodes left, due to + * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as + * the underlying mft records are written out and cleaned. 
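+ * A last commit followed by a synchronous write_inode_now() call below
+ * makes sure $MFT itself has reached disk before its inode is put.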
+ */ + ntfs_commit_inode(vol->mft_ino); + write_inode_now(vol->mft_ino, 1); +#endif /* NTFS_RW */ + + iput(vol->mft_ino); + vol->mft_ino = NULL; + + /* Throw away the table of attribute definitions. */ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + /* + * Destroy the global default upcase table if necessary. Also decrease + * the number of upcase users if we are a user. + */ + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + if (!ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + + unload_nls(vol->nls_map); + + sb->s_fs_info = NULL; + kfree(vol); +} + +/** + * get_nr_free_clusters - return the number of free clusters on a volume + * @vol: ntfs volume for which to obtain free cluster count + * + * Calculate the number of free clusters on the mounted NTFS volume @vol. We + * actually calculate the number of clusters in use instead because this + * allows us to not care about partial pages as these will be just zero filled + * and hence not be counted as allocated clusters. + * + * The only particularity is that clusters beyond the end of the logical ntfs + * volume will be marked as allocated to prevent errors which means we have to + * discount those at the end. This is important as the cluster bitmap always + * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside + * the logical volume and marked in use when they are not as they do not exist. + * + * If any pages cannot be read we assume all clusters in the erroring pages are + * in use. This means we return an underestimate on errors which is better than + * an overestimate. + */ +static s64 get_nr_free_clusters(ntfs_volume *vol) +{ + s64 nr_free = vol->nr_clusters; + struct address_space *mapping = vol->lcnbmp_ino->i_mapping; + struct page *page; + pgoff_t index, max_index; + + ntfs_debug("Entering."); + /* Serialize accesses to the cluster bitmap. */ + down_read(&vol->lcnbmp_lock); + /* + * Convert the number of bits into bytes rounded up, then convert into + * multiples of PAGE_CACHE_SIZE, rounding up so that if we have one + * full and one partial page max_index = 2. + */ + max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ + ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", + max_index, PAGE_CACHE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_CACHE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page, KM_USER0); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). 
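+ * Each bitmap page accounts for PAGE_CACHE_SIZE * 8 clusters, i.e. 32768
+ * clusters per page with 4kiB pages.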
+ */ + nr_free -= bitmap_weight(kaddr, + PAGE_CACHE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); + /* + * Fixup for eventual bits outside logical ntfs volume (see function + * description above). + */ + if (vol->nr_clusters & 63) + nr_free += 64 - (vol->nr_clusters & 63); + up_read(&vol->lcnbmp_lock); + /* If errors occurred we may well have gone below zero, fix this. */ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * __get_nr_free_mft_records - return the number of free inodes on a volume + * @vol: ntfs volume for which to obtain free inode count + * @nr_free: number of mft records in filesystem + * @max_index: maximum number of pages containing set bits + * + * Calculate the number of free mft records (inodes) on the mounted NTFS + * volume @vol. We actually calculate the number of mft records in use instead + * because this allows us to not care about partial pages as these will be just + * zero filled and hence not be counted as allocated mft record. + * + * If any pages cannot be read we assume all mft records in the erroring pages + * are in use. This means we return an underestimate on errors which is better + * than an overestimate. + * + * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. + */ +static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, + s64 nr_free, const pgoff_t max_index) +{ + struct address_space *mapping = vol->mftbmp_ino->i_mapping; + struct page *page; + pgoff_t index; + + ntfs_debug("Entering."); + /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ + ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " + "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_CACHE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page, KM_USER0); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). + */ + nr_free -= bitmap_weight(kaddr, + PAGE_CACHE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", + index - 1); + /* If errors occurred we may well have gone below zero, fix this. */ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * ntfs_statfs - return information about mounted NTFS volume + * @dentry: dentry from mounted volume + * @sfs: statfs structure in which to return the information + * + * Return information about the mounted NTFS volume @dentry in the statfs structure + * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is + * called). We interpret the values to be correct of the moment in time at + * which we are called. Most values are variable otherwise and this isn't just + * the free values but the totals as well. 
For example we can increase the + * total number of file nodes if we run out and we can keep doing this until + * there is no more space on the volume left at all. + * + * Called from vfs_statfs which is used to handle the statfs, fstatfs, and + * ustat system calls. + * + * Return 0 on success or -errno on error. + */ +static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) +{ + struct super_block *sb = dentry->d_sb; + s64 size; + ntfs_volume *vol = NTFS_SB(sb); + ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); + pgoff_t max_index; + unsigned long flags; + + ntfs_debug("Entering."); + /* Type of filesystem. */ + sfs->f_type = NTFS_SB_MAGIC; + /* Optimal transfer block size. */ + sfs->f_bsize = PAGE_CACHE_SIZE; + /* + * Total data blocks in filesystem in units of f_bsize and since + * inodes are also stored in data blocs ($MFT is a file) this is just + * the total clusters. + */ + sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT; + /* Free data blocks in filesystem in units of f_bsize. */ + size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> + PAGE_CACHE_SHIFT; + if (size < 0LL) + size = 0LL; + /* Free blocks avail to non-superuser, same as above on NTFS. */ + sfs->f_bavail = sfs->f_bfree = size; + /* Serialize accesses to the inode bitmap. */ + down_read(&vol->mftbmp_lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; + /* + * Convert the maximum number of set bits into bytes rounded up, then + * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we + * have one full and one partial page max_index = 2. + */ + max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) + + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Number of inodes in filesystem (at this point in time). */ + sfs->f_files = size; + /* Free inodes in fs (based on current total count). */ + sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); + up_read(&vol->mftbmp_lock); + /* + * File system id. This is extremely *nix flavour dependent and even + * within Linux itself all fs do their own thing. I interpret this to + * mean a unique id associated with the mounted fs and not the id + * associated with the filesystem driver, the latter is already given + * by the filesystem type in sfs->f_type. Thus we use the 64-bit + * volume serial number splitting it into two 32-bit parts. We enter + * the least significant 32-bits in f_fsid[0] and the most significant + * 32-bits in f_fsid[1]. + */ + sfs->f_fsid.val[0] = vol->serial_no & 0xffffffff; + sfs->f_fsid.val[1] = (vol->serial_no >> 32) & 0xffffffff; + /* Maximum length of filenames. */ + sfs->f_namelen = NTFS_MAX_NAME_LEN; + return 0; +} + +#ifdef NTFS_RW +static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) +{ + return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); +} +#endif + +/** + * The complete super operations. + */ +static const struct super_operations ntfs_sops = { + .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ + .destroy_inode = ntfs_destroy_big_inode, /* VFS: Deallocate inode. */ +#ifdef NTFS_RW + //.dirty_inode = NULL, /* VFS: Called from + // __mark_inode_dirty(). */ + .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to + disk. */ + //.drop_inode = NULL, /* VFS: Called just after the + // inode reference count has + // been decreased to zero. + // NOTE: The inode lock is + // held. 
See fs/inode.c:: + // generic_drop_inode(). */ + //.delete_inode = NULL, /* VFS: Delete inode from disk. + // Called when i_count becomes + // 0 and i_nlink is also 0. */ + //.write_super = NULL, /* Flush dirty super block to + // disk. */ + //.sync_fs = NULL, /* ? */ + //.write_super_lockfs = NULL, /* ? */ + //.unlockfs = NULL, /* ? */ +#endif /* NTFS_RW */ + .put_super = ntfs_put_super, /* Syscall: umount. */ + .statfs = ntfs_statfs, /* Syscall: statfs */ + .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ + .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is + removed from memory. */ + //.umount_begin = NULL, /* Forced umount. */ + .show_options = ntfs_show_options, /* Show mount options in + proc. */ +}; + +/** + * ntfs_fill_super - mount an ntfs filesystem + * @sb: super block of ntfs filesystem to mount + * @opt: string containing the mount options + * @silent: silence error output + * + * ntfs_fill_super() is called by the VFS to mount the device described by @sb + * with the mount otions in @data with the NTFS filesystem. + * + * If @silent is true, remain silent even if errors are detected. This is used + * during bootup, when the kernel tries to mount the root filesystem with all + * registered filesystems one after the other until one succeeds. This implies + * that all filesystems except the correct one will quite correctly and + * expectedly return an error, but nobody wants to see error messages when in + * fact this is what is supposed to happen. + * + * NOTE: @sb->s_flags contains the mount options flags. + */ +static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) +{ + ntfs_volume *vol; + struct buffer_head *bh; + struct inode *tmp_ino; + int blocksize, result; + + /* + * We do a pretty difficult piece of bootstrap by reading the + * MFT (and other metadata) from disk into memory. We'll only + * release this metadata during umount, so the locking patterns + * observed during bootstrap do not count. So turn off the + * observation of locking patterns (strictly for this context + * only) while mounting NTFS. [The validator is still active + * otherwise, even for this context: it will for example record + * lock class registrations.] + */ + lockdep_off(); + ntfs_debug("Entering."); +#ifndef NTFS_RW + sb->s_flags |= MS_RDONLY; +#endif /* ! NTFS_RW */ + /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ + sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); + vol = NTFS_SB(sb); + if (!vol) { + if (!silent) + ntfs_error(sb, "Allocation of NTFS volume structure " + "failed. Aborting mount..."); + lockdep_on(); + return -ENOMEM; + } + /* Initialize ntfs_volume structure. */ + *vol = (ntfs_volume) { + .sb = sb, + /* + * Default is group and other don't have any access to files or + * directories while owner has full access. Further, files by + * default are not executable but directories are of course + * browseable. + */ + .fmask = 0177, + .dmask = 0077, + }; + init_rwsem(&vol->mftbmp_lock); + init_rwsem(&vol->lcnbmp_lock); + + /* By default, enable sparse support. */ + NVolSetSparseEnabled(vol); + + /* Important to get the mount options dealt with now. */ + if (!parse_options(vol, (char*)opt)) + goto err_out_now; + + /* We support sector sizes up to the PAGE_CACHE_SIZE. */ + if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + if (!silent) + ntfs_error(sb, "Device has unsupported sector size " + "(%i). 
The maximum supported sector " + "size on this architecture is %lu " + "bytes.", + bdev_logical_block_size(sb->s_bdev), + PAGE_CACHE_SIZE); + goto err_out_now; + } + /* + * Setup the device access block size to NTFS_BLOCK_SIZE or the hard + * sector size, whichever is bigger. + */ + blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); + if (blocksize < NTFS_BLOCK_SIZE) { + if (!silent) + ntfs_error(sb, "Unable to set device block size."); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + ntfs_debug("Set device block size to %i bytes (block size bits %i).", + blocksize, sb->s_blocksize_bits); + /* Determine the size of the device in units of block_size bytes. */ + if (!i_size_read(sb->s_bdev->bd_inode)) { + if (!silent) + ntfs_error(sb, "Unable to determine device size."); + goto err_out_now; + } + vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; + /* Read the boot sector and return unlocked buffer head to it. */ + if (!(bh = read_ntfs_boot_sector(sb, silent))) { + if (!silent) + ntfs_error(sb, "Not an NTFS volume."); + goto err_out_now; + } + /* + * Extract the data from the boot sector and setup the ntfs volume + * using it. + */ + result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); + brelse(bh); + if (!result) { + if (!silent) + ntfs_error(sb, "Unsupported NTFS filesystem."); + goto err_out_now; + } + /* + * If the boot sector indicates a sector size bigger than the current + * device block size, switch the device block size to the sector size. + * TODO: It may be possible to support this case even when the set + * below fails, we would just be breaking up the i/o for each sector + * into multiple blocks for i/o purposes but otherwise it should just + * work. However it is safer to leave disabled until someone hits this + * error message and then we can get them to try it without the setting + * so we know for sure that it works. + */ + if (vol->sector_size > blocksize) { + blocksize = sb_set_blocksize(sb, vol->sector_size); + if (blocksize != vol->sector_size) { + if (!silent) + ntfs_error(sb, "Unable to set device block " + "size to sector size (%i).", + vol->sector_size); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; + ntfs_debug("Changed device block size to %i bytes (block size " + "bits %i) to match volume sector size.", + blocksize, sb->s_blocksize_bits); + } + /* Initialize the cluster and mft allocators. */ + ntfs_setup_allocators(vol); + /* Setup remaining fields in the super block. */ + sb->s_magic = NTFS_SB_MAGIC; + /* + * Ntfs allows 63 bits for the file size, i.e. correct would be: + * sb->s_maxbytes = ~0ULL >> 1; + * But the kernel uses a long as the page cache page index which on + * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel + * defined to the maximum the page cache page index can cope with + * without overflowing the index or to 2^63 - 1, whichever is smaller. + */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + /* Ntfs measures time in 100ns intervals. */ + sb->s_time_gran = 100; + /* + * Now load the metadata required for the page cache and our address + * space operations to function. We do this by setting up a specialised + * read_inode method and then just calling the normal iget() to obtain + * the inode for $MFT which is sufficient to allow our normal inode + * operations and associated address space operations to function. 
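+ * (In the code below the $MFT inode is in fact set up by hand using
+ * new_inode() and insert_inode_hash() and then read via
+ * ntfs_read_inode_mount() rather than through a plain iget() call.)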
+ */ + sb->s_op = &ntfs_sops; + tmp_ino = new_inode(sb); + if (!tmp_ino) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto err_out_now; + } + tmp_ino->i_ino = FILE_MFT; + insert_inode_hash(tmp_ino); + if (ntfs_read_inode_mount(tmp_ino) < 0) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto iput_tmp_ino_err_out_now; + } + mutex_lock(&ntfs_lock); + /* + * The current mount is a compression user if the cluster size is + * less than or equal 4kiB. + */ + if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) { + result = allocate_compression_buffers(); + if (result) { + ntfs_error(NULL, "Failed to allocate buffers " + "for compression engine."); + ntfs_nr_compression_users--; + mutex_unlock(&ntfs_lock); + goto iput_tmp_ino_err_out_now; + } + } + /* + * Generate the global default upcase table if necessary. Also + * temporarily increment the number of upcase users to avoid race + * conditions with concurrent (u)mounts. + */ + if (!default_upcase) + default_upcase = generate_default_upcase(); + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + /* + * From now on, ignore @silent parameter. If we fail below this line, + * it will be due to a corrupt fs or a system error, so we report it. + */ + /* + * Open the system files with normal access functions and complete + * setting up the ntfs super block. + */ + if (!load_system_files(vol)) { + ntfs_error(sb, "Failed to load system files."); + goto unl_upcase_iput_tmp_ino_err_out_now; + } + if ((sb->s_root = d_alloc_root(vol->root_ino))) { + /* We grab a reference, simulating an ntfs_iget(). */ + ihold(vol->root_ino); + ntfs_debug("Exiting, status successful."); + /* Release the default upcase if it has no users. */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + mutex_unlock(&ntfs_lock); + sb->s_export_op = &ntfs_export_ops; + lockdep_on(); + return 0; + } + ntfs_error(sb, "Failed to allocate root directory."); + /* Clean up after the successful load_system_files() call from above. */ + // TODO: Use ntfs_put_super() instead of repeating all this code... + // FIXME: Should mark the volume clean as the error is most likely + // -ENOMEM. + iput(vol->vol_ino); + vol->vol_ino = NULL; + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + iput(vol->root_ino); + vol->root_ino = NULL; + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } +#endif /* NTFS_RW */ + /* Throw away the table of attribute definitions. 
*/ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + if (vol->nls_map) { + unload_nls(vol->nls_map); + vol->nls_map = NULL; + } + /* Error exit code path. */ +unl_upcase_iput_tmp_ino_err_out_now: + /* + * Decrease the number of upcase users and destroy the global default + * upcase table if necessary. + */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); +iput_tmp_ino_err_out_now: + iput(tmp_ino); + if (vol->mft_ino && vol->mft_ino != tmp_ino) + iput(vol->mft_ino); + vol->mft_ino = NULL; + /* Errors at this stage are irrelevant. */ +err_out_now: + sb->s_fs_info = NULL; + kfree(vol); + ntfs_debug("Failed, returning -EINVAL."); + lockdep_on(); + return -EINVAL; +} + +/* + * This is a slab cache to optimize allocations and deallocations of Unicode + * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN + * (255) Unicode characters + a terminating NULL Unicode character. + */ +struct kmem_cache *ntfs_name_cache; + +/* Slab caches for efficient allocation/deallocation of inodes. */ +struct kmem_cache *ntfs_inode_cache; +struct kmem_cache *ntfs_big_inode_cache; + +/* Init once constructor for the inode slab cache. */ +static void ntfs_big_inode_init_once(void *foo) +{ + ntfs_inode *ni = (ntfs_inode *)foo; + + inode_init_once(VFS_I(ni)); +} + +/* + * Slab caches to optimize allocations and deallocations of attribute search + * contexts and index contexts, respectively. + */ +struct kmem_cache *ntfs_attr_ctx_cache; +struct kmem_cache *ntfs_index_ctx_cache; + +/* Driver wide mutex. */ +DEFINE_MUTEX(ntfs_lock); + +static struct dentry *ntfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); +} + +static struct file_system_type ntfs_fs_type = { + .owner = THIS_MODULE, + .name = "ntfs", + .mount = ntfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* Stable names for the slab caches. */ +static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; +static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache"; +static const char ntfs_name_cache_name[] = "ntfs_name_cache"; +static const char ntfs_inode_cache_name[] = "ntfs_inode_cache"; +static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache"; + +static int __init init_ntfs_fs(void) +{ + int err = 0; + + /* This may be ugly but it results in pretty output so who cares. 
(-8 */ + printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/" +#ifdef NTFS_RW + "W" +#else + "O" +#endif +#ifdef DEBUG + " DEBUG" +#endif +#ifdef MODULE + " MODULE" +#endif + "].\n"); + + ntfs_debug("Debug messages are enabled."); + + ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, + sizeof(ntfs_index_context), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_index_ctx_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_index_ctx_cache_name); + goto ictx_err_out; + } + ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, + sizeof(ntfs_attr_search_ctx), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_attr_ctx_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_attr_ctx_cache_name); + goto actx_err_out; + } + + ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, + (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ntfs_name_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_name_cache_name); + goto name_err_out; + } + + ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, + sizeof(ntfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!ntfs_inode_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_inode_cache_name); + goto inode_err_out; + } + + ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, + sizeof(big_ntfs_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + ntfs_big_inode_init_once); + if (!ntfs_big_inode_cache) { + printk(KERN_CRIT "NTFS: Failed to create %s!\n", + ntfs_big_inode_cache_name); + goto big_inode_err_out; + } + + /* Register the ntfs sysctls. */ + err = ntfs_sysctl(1); + if (err) { + printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n"); + goto sysctl_err_out; + } + + err = register_filesystem(&ntfs_fs_type); + if (!err) { + ntfs_debug("NTFS driver registered successfully."); + return 0; /* Success! */ + } + printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); + +sysctl_err_out: + kmem_cache_destroy(ntfs_big_inode_cache); +big_inode_err_out: + kmem_cache_destroy(ntfs_inode_cache); +inode_err_out: + kmem_cache_destroy(ntfs_name_cache); +name_err_out: + kmem_cache_destroy(ntfs_attr_ctx_cache); +actx_err_out: + kmem_cache_destroy(ntfs_index_ctx_cache); +ictx_err_out: + if (!err) { + printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " + "registration...\n"); + err = -ENOMEM; + } + return err; +} + +static void __exit exit_ntfs_fs(void) +{ + ntfs_debug("Unregistering NTFS driver."); + + unregister_filesystem(&ntfs_fs_type); + kmem_cache_destroy(ntfs_big_inode_cache); + kmem_cache_destroy(ntfs_inode_cache); + kmem_cache_destroy(ntfs_name_cache); + kmem_cache_destroy(ntfs_attr_ctx_cache); + kmem_cache_destroy(ntfs_index_ctx_cache); + /* Unregister the ntfs sysctls. */ + ntfs_sysctl(0); +} + +MODULE_AUTHOR("Anton Altaparmakov "); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); +MODULE_VERSION(NTFS_VERSION); +MODULE_LICENSE("GPL"); +#ifdef DEBUG +module_param(debug_msgs, bool, 0); +MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); +#endif + +module_init(init_ntfs_fs) +module_exit(exit_ntfs_fs) diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c new file mode 100644 index 00000000..79a89184 --- /dev/null +++ b/fs/ntfs/sysctl.c @@ -0,0 +1,83 @@ +/* + * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. 
Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef DEBUG + +#include + +#ifdef CONFIG_SYSCTL + +#include +#include + +#include "sysctl.h" +#include "debug.h" + +/* Definition of the ntfs sysctl. */ +static ctl_table ntfs_sysctls[] = { + { + .procname = "ntfs-debug", + .data = &debug_msgs, /* Data pointer and size. */ + .maxlen = sizeof(debug_msgs), + .mode = 0644, /* Mode, proc handler. */ + .proc_handler = proc_dointvec + }, + {} +}; + +/* Define the parent directory /proc/sys/fs. */ +static ctl_table sysctls_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = ntfs_sysctls + }, + {} +}; + +/* Storage for the sysctls header. */ +static struct ctl_table_header *sysctls_root_table = NULL; + +/** + * ntfs_sysctl - add or remove the debug sysctl + * @add: add (1) or remove (0) the sysctl + * + * Add or remove the debug sysctl. Return 0 on success or -errno on error. + */ +int ntfs_sysctl(int add) +{ + if (add) { + BUG_ON(sysctls_root_table); + sysctls_root_table = register_sysctl_table(sysctls_root); + if (!sysctls_root_table) + return -ENOMEM; + } else { + BUG_ON(!sysctls_root_table); + unregister_sysctl_table(sysctls_root_table); + sysctls_root_table = NULL; + } + return 0; +} + +#endif /* CONFIG_SYSCTL */ +#endif /* DEBUG */ diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h new file mode 100644 index 00000000..d4f8ce92 --- /dev/null +++ b/fs/ntfs/sysctl.h @@ -0,0 +1,41 @@ +/* + * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2004 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_SYSCTL_H +#define _LINUX_NTFS_SYSCTL_H + + +#if defined(DEBUG) && defined(CONFIG_SYSCTL) + +extern int ntfs_sysctl(int add); + +#else + +/* Just return success. */ +static inline int ntfs_sysctl(int add) +{ + return 0; +} + +#endif /* DEBUG && CONFIG_SYSCTL */ +#endif /* _LINUX_NTFS_SYSCTL_H */ diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h new file mode 100644 index 00000000..01233989 --- /dev/null +++ b/fs/ntfs/time.h @@ -0,0 +1,100 @@ +/* + * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_TIME_H +#define _LINUX_NTFS_TIME_H + +#include /* For current_kernel_time(). */ +#include /* For do_div(). */ + +#include "endian.h" + +#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000) + +/** + * utc2ntfs - convert Linux UTC time to NTFS time + * @ts: Linux UTC time to convert to NTFS time + * + * Convert the Linux UTC time @ts to its corresponding NTFS time and return + * that in little endian format. + * + * Linux stores time in a struct timespec consisting of a time_t (long at + * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second + * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of + * 1-nano-second intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100-nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline sle64 utc2ntfs(const struct timespec ts) +{ + /* + * Convert the seconds to 100ns intervals, add the nano-seconds + * converted to 100ns intervals, and then add the NTFS time offset. + */ + return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + + NTFS_TIME_OFFSET); +} + +/** + * get_current_ntfs_time - get the current time in little endian NTFS format + * + * Get the current time from the Linux kernel, convert it to its corresponding + * NTFS time and return that in little endian format. + */ +static inline sle64 get_current_ntfs_time(void) +{ + return utc2ntfs(current_kernel_time()); +} + +/** + * ntfs2utc - convert NTFS time to Linux time + * @time: NTFS time (little endian) to convert to Linux UTC + * + * Convert the little endian NTFS time @time to its corresponding Linux UTC + * time and return that in cpu format. 
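+ *
+ * As a rough worked example (plain arithmetic, nothing new is defined
+ * here): NTFS_TIME_OFFSET evaluates to (369 * 365 + 89) days * 86400 s/day
+ * * 10^7 hundred-nanosecond intervals per second = 116444736000000000, so
+ * the Linux epoch (tv_sec = 0, tv_nsec = 0) converts via utc2ntfs() to the
+ * NTFS time 116444736000000000, and feeding that value back through
+ * ntfs2utc() yields 1st January 1970, 00:00:00 UTC again.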
+ * + * Linux stores time in a struct timespec consisting of a time_t (long at + * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second + * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of + * 1-nano-second intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100 nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline struct timespec ntfs2utc(const sle64 time) +{ + struct timespec ts; + + /* Subtract the NTFS time offset. */ + u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); + /* + * Convert the time to 1-second intervals and the remainder to + * 1-nano-second intervals. + */ + ts.tv_nsec = do_div(t, 10000000) * 100; + ts.tv_sec = t; + return ts; +} + +#endif /* _LINUX_NTFS_TIME_H */ diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h new file mode 100644 index 00000000..8c8053b6 --- /dev/null +++ b/fs/ntfs/types.h @@ -0,0 +1,69 @@ +/* + * types.h - Defines for NTFS Linux kernel driver specific types. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_TYPES_H +#define _LINUX_NTFS_TYPES_H + +#include + +typedef __le16 le16; +typedef __le32 le32; +typedef __le64 le64; +typedef __u16 __bitwise sle16; +typedef __u32 __bitwise sle32; +typedef __u64 __bitwise sle64; + +/* 2-byte Unicode character type. */ +typedef le16 ntfschar; +#define UCHAR_T_SIZE_BITS 1 + +/* + * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN + * and VCN, to allow for type checking and better code readability. + */ +typedef s64 VCN; +typedef sle64 leVCN; +typedef s64 LCN; +typedef sle64 leLCN; + +/* + * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit + * values. We define our own type LSN, to allow for type checking and better + * code readability. + */ +typedef s64 LSN; +typedef sle64 leLSN; + +/* + * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values. + * We define our own type USN, to allow for type checking and better code + * readability. + */ +typedef s64 USN; +typedef sle64 leUSN; + +typedef enum { + CASE_SENSITIVE = 0, + IGNORE_CASE = 1, +} IGNORE_CASE_BOOL; + +#endif /* _LINUX_NTFS_TYPES_H */ diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c new file mode 100644 index 00000000..005ca4b0 --- /dev/null +++ b/fs/ntfs/unistr.c @@ -0,0 +1,398 @@ +/* + * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project. 
+ * + * Copyright (c) 2001-2006 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +#include "types.h" +#include "debug.h" +#include "ntfs.h" + +/* + * IMPORTANT + * ========= + * + * All these routines assume that the Unicode characters are in little endian + * encoding inside the strings!!! + */ + +/* + * This is used by the name collation functions to quickly determine what + * characters are (in)valid. + */ +static const u8 legal_ansi_char_array[0x40] = { + 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, + + 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, +}; + +/** + * ntfs_are_names_equal - compare two Unicode names for equality + * @s1: name to compare to @s2 + * @s1_len: length in Unicode characters of @s1 + * @s2: name to compare to @s1 + * @s2_len: length in Unicode characters of @s2 + * @ic: ignore case bool + * @upcase: upcase table (only if @ic == IGNORE_CASE) + * @upcase_size: length in Unicode characters of @upcase (if present) + * + * Compare the names @s1 and @s2 and return 'true' (1) if the names are + * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, + * the @upcase table is used to performa a case insensitive comparison. + */ +bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size) +{ + if (s1_len != s2_len) + return false; + if (ic == CASE_SENSITIVE) + return !ntfs_ucsncmp(s1, s2, s1_len); + return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size); +} + +/** + * ntfs_collate_names - collate two Unicode names + * @name1: first Unicode name to compare + * @name2: second Unicode name to compare + * @err_val: if @name1 contains an invalid character return this value + * @ic: either CASE_SENSITIVE or IGNORE_CASE + * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) + * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) + * + * ntfs_collate_names collates two Unicode names and returns: + * + * -1 if the first name collates before the second one, + * 0 if the names match, + * 1 if the second name collates before the first one, or + * @err_val if an invalid character is found in @name1 during the comparison. + * + * The following characters are considered invalid: '"', '*', '<', '>' and '?'. 
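+ *
+ * For example (using the values in legal_ansi_char_array below): collating
+ * the two character name { 'a', '*' } against { 'a', 'b' } returns @err_val
+ * because '*' (0x002a) has bit 3 set in that table, while collating
+ * { 'a', 'b' } against { 'a', 'c' } simply returns -1.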
+ */ +int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + u32 cnt, min_len; + u16 c1, c2; + + min_len = name1_len; + if (name1_len > name2_len) + min_len = name2_len; + for (cnt = 0; cnt < min_len; ++cnt) { + c1 = le16_to_cpu(*name1++); + c2 = le16_to_cpu(*name2++); + if (ic) { + if (c1 < upcase_len) + c1 = le16_to_cpu(upcase[c1]); + if (c2 < upcase_len) + c2 = le16_to_cpu(upcase[c2]); + } + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + } + if (name1_len < name2_len) + return -1; + if (name1_len == name2_len) + return 0; + /* name1_len > name2_len */ + c1 = le16_to_cpu(*name1); + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + return 1; +} + +/** + * ntfs_ucsncmp - compare two little endian Unicode strings + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * + * Compare the first @n characters of the Unicode strings @s1 and @s2, + * The strings in little endian format and appropriate le16_to_cpu() + * conversion is performed on non-little endian machines. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2. + */ +int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) +{ + u16 c1, c2; + size_t i; + + for (i = 0; i < n; ++i) { + c1 = le16_to_cpu(s1[i]); + c2 = le16_to_cpu(s2[i]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +/** + * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * @upcase: upcase table + * @upcase_size: upcase table size in Unicode characters + * + * Compare the first @n characters of the Unicode strings @s1 and @s2, + * ignoring case. The strings in little endian format and appropriate + * le16_to_cpu() conversion is performed on non-little endian machines. + * + * Each character is uppercased using the @upcase table before the comparison. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2. 
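+ *
+ * For instance, with the default upcase table built by
+ * generate_default_upcase(), 'a' (0x0061) is folded to 'A' (0x0041), so a
+ * one character comparison of "A" and "a" returns 0 here whereas the case
+ * sensitive ntfs_ucsncmp() returns -1 for the same strings.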
+ */ +int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size) +{ + size_t i; + u16 c1, c2; + + for (i = 0; i < n; ++i) { + if ((c1 = le16_to_cpu(s1[i])) < upcase_size) + c1 = le16_to_cpu(upcase[c1]); + if ((c2 = le16_to_cpu(s2[i])) < upcase_size) + c2 = le16_to_cpu(upcase[c2]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, + const u32 upcase_len) +{ + u32 i; + u16 u; + + for (i = 0; i < name_len; i++) + if ((u = le16_to_cpu(name[i])) < upcase_len) + name[i] = upcase[u]; +} + +void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len) +{ + ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, + file_name_attr->file_name_length, upcase, upcase_len); +} + +int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, + file_name_attr1->file_name_length, + (ntfschar*)&file_name_attr2->file_name, + file_name_attr2->file_name_length, + err_val, ic, upcase, upcase_len); +} + +/** + * ntfs_nlstoucs - convert NLS string to little endian Unicode string + * @vol: ntfs volume which we are working with + * @ins: input NLS string buffer + * @ins_len: length of input string in bytes + * @outs: on return contains the allocated output Unicode string buffer + * + * Convert the input string @ins, which is in whatever format the loaded NLS + * map dictates, into a little endian, 2-byte Unicode string. + * + * This function allocates the string and the caller is responsible for + * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it. + * + * On success the function returns the number of Unicode characters written to + * the output string *@outs (>= 0), not counting the terminating Unicode NULL + * character. *@outs is set to the allocated output string buffer. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. Both *@outs and *@outs_len + * are then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs) +{ + struct nls_table *nls = vol->nls_map; + ntfschar *ucs; + wchar_t wc; + int i, o, wc_len; + + /* We do not trust outside sources. 
*/ + if (likely(ins)) { + ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); + if (likely(ucs)) { + for (i = o = 0; i < ins_len; i += wc_len) { + wc_len = nls->char2uni(ins + i, ins_len - i, + &wc); + if (likely(wc_len >= 0 && + o < NTFS_MAX_NAME_LEN)) { + if (likely(wc)) { + ucs[o++] = cpu_to_le16(wc); + continue; + } /* else if (!wc) */ + break; + } /* else if (wc_len < 0 || + o >= NTFS_MAX_NAME_LEN) */ + goto name_err; + } + ucs[o] = 0; + *outs = ucs; + return o; + } /* else if (!ucs) */ + ntfs_error(vol->sb, "Failed to allocate buffer for converted " + "name from ntfs_name_cache."); + return -ENOMEM; + } /* else if (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +name_err: + kmem_cache_free(ntfs_name_cache, ucs); + if (wc_len < 0) { + ntfs_error(vol->sb, "Name using character set %s contains " + "characters that cannot be converted to " + "Unicode.", nls->charset); + i = -EILSEQ; + } else /* if (o >= NTFS_MAX_NAME_LEN) */ { + ntfs_error(vol->sb, "Name is too long (maximum length for a " + "name on NTFS is %d Unicode characters.", + NTFS_MAX_NAME_LEN); + i = -ENAMETOOLONG; + } + return i; +} + +/** + * ntfs_ucstonls - convert little endian Unicode string to NLS string + * @vol: ntfs volume which we are working with + * @ins: input Unicode string buffer + * @ins_len: length of input string in Unicode characters + * @outs: on return contains the (allocated) output NLS string buffer + * @outs_len: length of output string buffer in bytes + * + * Convert the input little endian, 2-byte Unicode string @ins, of length + * @ins_len into the string format dictated by the loaded NLS. + * + * If *@outs is NULL, this function allocates the string and the caller is + * responsible for calling kfree(*@outs); when finished with it. In this case + * @outs_len is ignored and can be 0. + * + * On success the function returns the number of bytes written to the output + * string *@outs (>= 0), not counting the terminating NULL byte. If the output + * string buffer was allocated, *@outs is set to it. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. The contents of *@outs are + * then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len) +{ + struct nls_table *nls = vol->nls_map; + unsigned char *ns; + int i, o, ns_len, wc; + + /* We don't trust outside sources. */ + if (ins) { + ns = *outs; + ns_len = outs_len; + if (ns && !ns_len) { + wc = -ENAMETOOLONG; + goto conversion_err; + } + if (!ns) { + ns_len = ins_len * NLS_MAX_CHARSET_SIZE; + ns = kmalloc(ns_len + 1, GFP_NOFS); + if (!ns) + goto mem_err_out; + } + for (i = o = 0; i < ins_len; i++) { +retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, + ns_len - o); + if (wc > 0) { + o += wc; + continue; + } else if (!wc) + break; + else if (wc == -ENAMETOOLONG && ns != *outs) { + unsigned char *tc; + /* Grow in multiples of 64 bytes. */ + tc = kmalloc((ns_len + 64) & + ~63, GFP_NOFS); + if (tc) { + memcpy(tc, ns, ns_len); + ns_len = ((ns_len + 64) & ~63) - 1; + kfree(ns); + ns = tc; + goto retry; + } /* No memory so goto conversion_error; */ + } /* wc < 0, real error. 
*/ + goto conversion_err; + } + ns[o] = 0; + *outs = ns; + return o; + } /* else (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +conversion_err: + ntfs_error(vol->sb, "Unicode name contains characters that cannot be " + "converted to character set %s. You might want to " + "try to use the mount option nls=utf8.", nls->charset); + if (ns != *outs) + kfree(ns); + if (wc != -ENAMETOOLONG) + wc = -EILSEQ; + return wc; +mem_err_out: + ntfs_error(vol->sb, "Failed to allocate name!"); + return -ENOMEM; +} diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c new file mode 100644 index 00000000..e2f72ca9 --- /dev/null +++ b/fs/ntfs/upcase.c @@ -0,0 +1,87 @@ +/* + * upcase.c - Generate the full NTFS Unicode upcase table in little endian. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001 Richard Russon + * Copyright (c) 2001-2006 Anton Altaparmakov + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS source + * in the file COPYING); if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "malloc.h" +#include "ntfs.h" + +ntfschar *generate_default_upcase(void) +{ + static const int uc_run_table[][3] = { /* Start, End, Add */ + {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, + {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, + {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, + {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, + {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, + {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, + {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, + {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, + {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, + {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, + {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, + {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, + {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, + {0} + }; + + static const int uc_dup_table[][2] = { /* Start, End */ + {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, + {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, + {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, + {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, + {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, + {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, + {0} + }; + + static const int uc_word_table[][2] = { /* Offset, Value */ + {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, + {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, + {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, + 
{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, + {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, + {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, + {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, + {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, + {0} + }; + + int i, r; + ntfschar *uc; + + uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar)); + if (!uc) + return uc; + memset(uc, 0, default_upcase_len * sizeof(ntfschar)); + /* Generate the little endian Unicode upcase table used by ntfs. */ + for (i = 0; i < default_upcase_len; i++) + uc[i] = cpu_to_le16(i); + for (r = 0; uc_run_table[r][0]; r++) + for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) + le16_add_cpu(&uc[i], uc_run_table[r][2]); + for (r = 0; uc_dup_table[r][0]; r++) + for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) + le16_add_cpu(&uc[i + 1], -1); + for (r = 0; uc_word_table[r][0]; r++) + uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); + return uc; +} diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c new file mode 100644 index 00000000..b2bc0d55 --- /dev/null +++ b/fs/ntfs/usnjrnl.c @@ -0,0 +1,84 @@ +/* + * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include +#include +#include + +#include "aops.h" +#include "debug.h" +#include "endian.h" +#include "time.h" +#include "types.h" +#include "usnjrnl.h" +#include "volume.h" + +/** + * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume + * @vol: ntfs volume on which to stamp the transaction log + * + * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return + * 'true' on success and 'false' on error. + * + * This function assumes that the transaction log has already been loaded and + * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl(). 
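+ *
+ * Concretely, stamping (see the body below) writes the current NTFS time
+ * into the journal_id field of $UsnJrnl/$DATA/$Max and the current byte
+ * size of $UsnJrnl/$DATA/$J into its lowest_valid_usn field, so if $DATA/$J
+ * is currently 0x1000 bytes long the next USN_RECORD appended to it would
+ * start at usn 0x1000.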
+ */ +bool ntfs_stamp_usnjrnl(ntfs_volume *vol) +{ + ntfs_debug("Entering."); + if (likely(!NVolUsnJrnlStamped(vol))) { + sle64 stamp; + struct page *page; + USN_HEADER *uh; + + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from " + "$UsnJrnl/$DATA/$Max attribute."); + return false; + } + uh = (USN_HEADER*)page_address(page); + stamp = get_current_ntfs_time(); + ntfs_debug("Stamping transaction log ($UsnJrnl): old " + "journal_id 0x%llx, old lowest_valid_usn " + "0x%llx, new journal_id 0x%llx, new " + "lowest_valid_usn 0x%llx.", + (long long)sle64_to_cpu(uh->journal_id), + (long long)sle64_to_cpu(uh->lowest_valid_usn), + (long long)sle64_to_cpu(stamp), + i_size_read(vol->usnjrnl_j_ino)); + uh->lowest_valid_usn = + cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino)); + uh->journal_id = stamp; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + /* Set the flag so we do not have to do it again on remount. */ + NVolSetUsnJrnlStamped(vol); + } + ntfs_debug("Done."); + return true; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h new file mode 100644 index 00000000..00d8e6bd --- /dev/null +++ b/fs/ntfs/usnjrnl.h @@ -0,0 +1,205 @@ +/* + * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_USNJRNL_H +#define _LINUX_NTFS_USNJRNL_H + +#ifdef NTFS_RW + +#include "types.h" +#include "endian.h" +#include "layout.h" +#include "volume.h" + +/* + * Transaction log ($UsnJrnl) organization: + * + * The transaction log records whenever a file is modified in any way. So for + * example it will record that file "blah" was written to at a particular time + * but not what was written. If will record that a file was deleted or + * created, that a file was truncated, etc. See below for all the reason + * codes used. + * + * The transaction log is in the $Extend directory which is in the root + * directory of each volume. If it is not present it means transaction + * logging is disabled. If it is present it means transaction logging is + * either enabled or in the process of being disabled in which case we can + * ignore it as it will go away as soon as Windows gets its hands on it. + * + * To determine whether the transaction logging is enabled or in the process + * of being disabled, need to check the volume flags in the + * $VOLUME_INFORMATION attribute in the $Volume system file (which is present + * in the root directory and has a fixed mft record number, see layout.h). 
+ * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log + * is in the process of being disabled and if this flag is clear it means the + * transaction log is enabled. + * + * The transaction log consists of two parts; the $DATA/$Max attribute as well + * as the $DATA/$J attribute. $Max is a header describing the transaction + * log whilst $J is the transaction log data itself as a sequence of variable + * sized USN_RECORDs (see below for all the structures). + * + * We do not care about transaction logging at this point in time but we still + * need to let windows know that the transaction log is out of date. To do + * this we need to stamp the transaction log. This involves setting the + * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used + * for the next added USN_RECORD to the $DATA/$J attribute as well as + * generating a new journal_id in $DATA/$Max. + * + * The journal_id is as of the current version (2.0) of the transaction log + * simply the 64-bit timestamp of when the journal was either created or last + * stamped. + * + * To determine the next usn there are two ways. The first is to parse + * $DATA/$J and to find the last USN_RECORD in it and to add its record_length + * to its usn (which is the byte offset in the $DATA/$J attribute). The + * second is simply to take the data size of the attribute. Since the usns + * are simply byte offsets into $DATA/$J, this is exactly the next usn. For + * obvious reasons we use the second method as it is much simpler and faster. + * + * As an aside, note that to actually disable the transaction log, one would + * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go + * through all the mft records on the volume and set the usn field in their + * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need + * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally, + * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag. + * + * Note that if a volume is unmounted whilst the transaction log is being + * disabled, the process will continue the next time the volume is mounted. + * This is why we can safely mount read-write when we see a transaction log + * in the process of being deleted. + */ + +/* Some $UsnJrnl related constants. */ +#define UsnJrnlMajorVer 2 +#define UsnJrnlMinorVer 0 + +/* + * $DATA/$Max attribute. This is (always?) resident and has a fixed size of + * 32 bytes. It contains the header describing the transaction log. + */ +typedef struct { +/*Ofs*/ +/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J + attribute. */ +/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the + size of the $DATA/$J attribute. */ +/*0x10*/sle64 journal_id; /* Current id of the transaction log. */ +/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the + current journal_id. */ +/* sizeof() = 32 (0x20) bytes */ +} __attribute__ ((__packed__)) USN_HEADER; + +/* + * Reason flags (32-bit). Cumulative flags describing the change(s) to the + * file since it was last opened. 
I think the names speak for themselves but + * if you disagree check out the descriptions in the Linux NTFS project NTFS + * documentation: http://www.linux-ntfs.org/ + */ +enum { + USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001), + USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002), + USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004), + USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010), + USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020), + USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040), + USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100), + USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200), + USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400), + USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800), + USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000), + USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000), + USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000), + USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000), + USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000), + USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000), + USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000), + USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000), + USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000), + USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000), + USN_REASON_CLOSE = cpu_to_le32(0x80000000), +}; + +typedef le32 USN_REASON_FLAGS; + +/* + * Source info flags (32-bit). Information about the source of the change(s) + * to the file. For detailed descriptions of what these mean, see the Linux + * NTFS project NTFS documentation: + * http://www.linux-ntfs.org/ + */ +enum { + USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001), + USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002), + USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004), +}; + +typedef le32 USN_SOURCE_INFO_FLAGS; + +/* + * $DATA/$J attribute. This is always non-resident, is marked as sparse, and + * is of variabled size. It consists of a sequence of variable size + * USN_RECORDS. The minimum allocated_size is allocation_delta as + * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is + * exceeded by more than allocation_delta bytes, allocation_delta bytes are + * allocated and appended to the $DATA/$J attribute and an equal number of + * bytes at the beginning of the attribute are freed and made sparse. Note the + * making sparse only happens at volume checkpoints and hence the actual + * $DATA/$J size can exceed maximum_size + allocation_delta temporarily. + */ +typedef struct { +/*Ofs*/ +/* 0*/le32 length; /* Byte size of this record (8-byte + aligned). */ +/* 4*/le16 major_ver; /* Major version of the transaction log used + for this record. */ +/* 6*/le16 minor_ver; /* Minor version of the transaction log used + for this record. */ +/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or + directory) described by this record. */ +/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent + directory of the file described by this + record. */ +/*0x18*/leUSN usn; /* The usn of this record. Equals the offset + within the $DATA/$J attribute. */ +/*0x20*/sle64 time; /* Time when this record was created. */ +/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */ +/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */ +/*0x30*/le32 security_id; /* File security_id copied from + $STANDARD_INFORMATION. 
*/ +/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from + $STANDARD_INFORMATION or $FILE_NAME (not + sure which). */ +/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */ +/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the + start of this record. */ +/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use + file_name_offset to determine the location + of the name. */ +/* sizeof() = 60 (0x3c) bytes */ +} __attribute__ ((__packed__)) USN_RECORD; + +extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_USNJRNL_H */ diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h new file mode 100644 index 00000000..406ab55d --- /dev/null +++ b/fs/ntfs/volume.h @@ -0,0 +1,177 @@ +/* + * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part + * of the Linux-NTFS project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_VOLUME_H +#define _LINUX_NTFS_VOLUME_H + +#include + +#include "types.h" +#include "layout.h" + +/* + * The NTFS in memory super block structure. + */ +typedef struct { + /* + * FIXME: Reorder to have commonly used together element within the + * same cache line, aiming at a cache line size of 32 bytes. Aim for + * 64 bytes for less commonly used together elements. Put most commonly + * used elements to front of structure. Obviously do this only when the + * structure has stabilized... (AIA) + */ + /* Device specifics. */ + struct super_block *sb; /* Pointer back to the super_block. */ + LCN nr_blocks; /* Number of sb->s_blocksize bytes + sized blocks on the device. */ + /* Configuration provided by user at mount time. */ + unsigned long flags; /* Miscellaneous flags, see below. */ + uid_t uid; /* uid that files will be mounted as. */ + gid_t gid; /* gid that files will be mounted as. */ + mode_t fmask; /* The mask for file permissions. */ + mode_t dmask; /* The mask for directory + permissions. */ + u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ + u8 on_errors; /* What to do on filesystem errors. */ + /* NTFS bootsector provided information. 
*/ + u16 sector_size; /* in bytes */ + u8 sector_size_bits; /* log2(sector_size) */ + u32 cluster_size; /* in bytes */ + u32 cluster_size_mask; /* cluster_size - 1 */ + u8 cluster_size_bits; /* log2(cluster_size) */ + u32 mft_record_size; /* in bytes */ + u32 mft_record_size_mask; /* mft_record_size - 1 */ + u8 mft_record_size_bits; /* log2(mft_record_size) */ + u32 index_record_size; /* in bytes */ + u32 index_record_size_mask; /* index_record_size - 1 */ + u8 index_record_size_bits; /* log2(index_record_size) */ + LCN nr_clusters; /* Volume size in clusters == number of + bits in lcn bitmap. */ + LCN mft_lcn; /* Cluster location of mft data. */ + LCN mftmirr_lcn; /* Cluster location of copy of mft. */ + u64 serial_no; /* The volume serial number. */ + /* Mount specific NTFS information. */ + u32 upcase_len; /* Number of entries in upcase[]. */ + ntfschar *upcase; /* The upcase table. */ + + s32 attrdef_size; /* Size of the attribute definition + table in bytes. */ + ATTR_DEF *attrdef; /* Table of attribute definitions. + Obtained from FILE_AttrDef. */ + +#ifdef NTFS_RW + /* Variables used by the cluster and mft allocators. */ + s64 mft_data_pos; /* Mft record number at which to + allocate the next mft record. */ + LCN mft_zone_start; /* First cluster of the mft zone. */ + LCN mft_zone_end; /* First cluster beyond the mft zone. */ + LCN mft_zone_pos; /* Current position in the mft zone. */ + LCN data1_zone_pos; /* Current position in the first data + zone. */ + LCN data2_zone_pos; /* Current position in the second data + zone. */ +#endif /* NTFS_RW */ + + struct inode *mft_ino; /* The VFS inode of $MFT. */ + + struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */ + struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the + mft record bitmap ($MFT/$BITMAP). */ +#ifdef NTFS_RW + struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */ + int mftmirr_size; /* Size of mft mirror in mft records. */ + + struct inode *logfile_ino; /* The VFS inode of $LogFile. */ +#endif /* NTFS_RW */ + + struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */ + struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the + cluster bitmap ($Bitmap/$DATA). */ + + struct inode *vol_ino; /* The VFS inode of $Volume. */ + VOLUME_FLAGS vol_flags; /* Volume flags. */ + u8 major_ver; /* Ntfs major version of volume. */ + u8 minor_ver; /* Ntfs minor version of volume. */ + + struct inode *root_ino; /* The VFS inode of the root + directory. */ + struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+ + only, otherwise NULL). */ + struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+ + only, otherwise NULL). */ +#ifdef NTFS_RW + /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *quota_ino; /* The VFS inode of $Quota. */ + struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ + /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ + struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ + struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */ +#endif /* NTFS_RW */ + struct nls_table *nls_map; +} ntfs_volume; + +/* + * Defined bits for the flags field in the ntfs_volume structure. + */ +typedef enum { + NV_Errors, /* 1: Volume has errors, prevent remount rw. */ + NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). 
*/ + NV_CaseSensitive, /* 1: Treat file names as case sensitive and + create filenames in the POSIX namespace. + Otherwise be case insensitive but still + create file names in POSIX namespace. */ + NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ + NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ + NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ + NV_SparseEnabled, /* 1: May create sparse files. */ +} ntfs_volume_flags; + +/* + * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() + * functions. + */ +#define DEFINE_NVOL_BIT_OPS(flag) \ +static inline int NVol##flag(ntfs_volume *vol) \ +{ \ + return test_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolSet##flag(ntfs_volume *vol) \ +{ \ + set_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolClear##flag(ntfs_volume *vol) \ +{ \ + clear_bit(NV_##flag, &(vol)->flags); \ +} + +/* Emit the ntfs volume bitops functions. */ +DEFINE_NVOL_BIT_OPS(Errors) +DEFINE_NVOL_BIT_OPS(ShowSystemFiles) +DEFINE_NVOL_BIT_OPS(CaseSensitive) +DEFINE_NVOL_BIT_OPS(LogFileEmpty) +DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) +DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) +DEFINE_NVOL_BIT_OPS(SparseEnabled) + +#endif /* _LINUX_NTFS_VOLUME_H */ diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig new file mode 100644 index 00000000..77a8de5f --- /dev/null +++ b/fs/ocfs2/Kconfig @@ -0,0 +1,76 @@ +config OCFS2_FS + tristate "OCFS2 file system support" + depends on NET && SYSFS && CONFIGFS_FS + select JBD2 + select CRC32 + select QUOTA + select QUOTA_TREE + select FS_POSIX_ACL + help + OCFS2 is a general purpose extent based shared disk cluster file + system with many similarities to ext3. It supports 64 bit inode + numbers, and has automatically extending metadata groups which may + also make it attractive for non-clustered use. + + You'll want to install the ocfs2-tools package in order to at least + get "mount.ocfs2". + + Project web page: http://oss.oracle.com/projects/ocfs2 + Tools web page: http://oss.oracle.com/projects/ocfs2-tools + OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ + + For more information on OCFS2, see the file + . + +config OCFS2_FS_O2CB + tristate "O2CB Kernelspace Clustering" + depends on OCFS2_FS + default y + help + OCFS2 includes a simple kernelspace clustering package, the OCFS2 + Cluster Base. It only requires a very small userspace component + to configure it. This comes with the standard ocfs2-tools package. + O2CB is limited to maintaining a cluster for OCFS2 file systems. + It cannot manage any other cluster applications. + + It is always safe to say Y here, as the clustering method is + run-time selectable. + +config OCFS2_FS_USERSPACE_CLUSTER + tristate "OCFS2 Userspace Clustering" + depends on OCFS2_FS && DLM + default y + help + This option will allow OCFS2 to use userspace clustering services + in conjunction with the DLM in fs/dlm. If you are using a + userspace cluster manager, say Y here. + + It is safe to say Y, as the clustering method is run-time + selectable. + +config OCFS2_FS_STATS + bool "OCFS2 statistics" + depends on OCFS2_FS && DEBUG_FS + default y + help + This option allows some fs statistics to be captured. Enabling + this option may increase the memory consumption. + +config OCFS2_DEBUG_MASKLOG + bool "OCFS2 logging support" + depends on OCFS2_FS + default y + help + The ocfs2 filesystem has an extensive logging system. The system + allows selection of events to log via files in /sys/o2cb/logmask/. 
+ This option will enlarge your kernel, but it allows debugging of + ocfs2 filesystem issues. + +config OCFS2_DEBUG_FS + bool "OCFS2 expensive checks" + depends on OCFS2_FS + default n + help + This option will enable expensive consistency checks. Enable + this option for debugging only as it is likely to decrease + performance of the filesystem. diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile new file mode 100644 index 00000000..f17e58b3 --- /dev/null +++ b/fs/ocfs2/Makefile @@ -0,0 +1,54 @@ +ccflags-y := -Ifs/ocfs2 + +ccflags-y += -DCATCH_BH_JBD_RACES + +obj-$(CONFIG_OCFS2_FS) += \ + ocfs2.o \ + ocfs2_stackglue.o + +obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o +obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o + +ocfs2-objs := \ + alloc.o \ + aops.o \ + blockcheck.o \ + buffer_head_io.o \ + dcache.o \ + dir.o \ + dlmglue.o \ + export.o \ + extent_map.o \ + file.o \ + heartbeat.o \ + inode.o \ + ioctl.o \ + journal.o \ + localalloc.o \ + locks.o \ + mmap.o \ + namei.o \ + refcounttree.o \ + reservations.o \ + move_extents.o \ + resize.o \ + slot_map.o \ + suballoc.o \ + super.o \ + symlink.o \ + sysfile.o \ + uptodate.o \ + ver.o \ + quota_local.o \ + quota_global.o \ + xattr.o \ + acl.o + +ocfs2_stackglue-objs := stackglue.o +ocfs2_stack_o2cb-objs := stack_o2cb.o +ocfs2_stack_user-objs := stack_user.o + +obj-$(CONFIG_OCFS2_FS) += dlmfs/ +# cluster/ is always needed when OCFS2_FS for masklog support +obj-$(CONFIG_OCFS2_FS) += cluster/ +obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c new file mode 100644 index 00000000..e913ad13 --- /dev/null +++ b/fs/ocfs2/acl.c @@ -0,0 +1,535 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.c + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is copy from linux/fs/ext3/acl.c. + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" + +#include "xattr.h" +#include "acl.h" + +/* + * Convert from xattr value to acl struct. + */ +static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) +{ + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(struct posix_acl_entry)) + return ERR_PTR(-EINVAL); + + count = size / sizeof(struct posix_acl_entry); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + struct ocfs2_acl_entry *entry = + (struct ocfs2_acl_entry *)value; + + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + value += sizeof(struct posix_acl_entry); + + } + return acl; +} + +/* + * Convert acl struct to xattr value. 
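+ *
+ * Each ACL entry is written out as a struct ocfs2_acl_entry with its
+ * e_tag, e_perm and e_id fields stored little-endian; *size is set to
+ * the byte length of the returned buffer.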
+ */ +static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size) +{ + struct ocfs2_acl_entry *entry = NULL; + char *ocfs2_acl; + size_t n; + + *size = acl->a_count * sizeof(struct posix_acl_entry); + + ocfs2_acl = kmalloc(*size, GFP_NOFS); + if (!ocfs2_acl) + return ERR_PTR(-ENOMEM); + + entry = (struct ocfs2_acl_entry *)ocfs2_acl; + for (n = 0; n < acl->a_count; n++, entry++) { + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + } + return ocfs2_acl; +} + +static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, + int type, + struct buffer_head *di_bh) +{ + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + "", value, retval); + } + + if (retval > 0) + acl = ocfs2_acl_from_xattr(value, retval); + else if (retval == -ENODATA || retval == 0) + acl = NULL; + else + acl = ERR_PTR(retval); + + kfree(value); + + return acl; +} + + +/* + * Get posix acl. + */ +static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + acl = ERR_PTR(ret); + return acl; + } + + acl = ocfs2_get_acl_nolock(inode, type, di_bh); + + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return acl; +} + +/* + * Helper function to set i_mode in memory and disk. Some call paths + * will not have di_bh or a journal handle to pass, in which case it + * will create it's own. + */ +static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, + handle_t *handle, umode_t new_mode) +{ + int ret, commit_handle = 0; + struct ocfs2_dinode *di; + + if (di_bh == NULL) { + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } else + get_bh(di_bh); + + if (handle == NULL) { + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_brelse; + } + + commit_handle = 1; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_mode = new_mode; + inode->i_ctime = CURRENT_TIME; + di->i_mode = cpu_to_le16(inode->i_mode); + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + if (commit_handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out_brelse: + brelse(di_bh); +out: + return ret; +} + +/* + * Set the access or default ACL of an inode. 
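+ *
+ * For ACL_TYPE_ACCESS, an ACL that is equivalent to the file mode is
+ * dropped and only i_mode is updated via ocfs2_acl_set_mode().
+ * ACL_TYPE_DEFAULT is only meaningful on directories.  When a journal
+ * handle is passed in, the xattr is written with
+ * ocfs2_xattr_set_handle(); otherwise ocfs2_xattr_set() is used.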
+ */ +static int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + else { + if (ret == 0) + acl = NULL; + + ret = ocfs2_acl_set_mode(inode, di_bh, + handle, mode); + if (ret) + return ret; + + } + } + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = ocfs2_acl_to_xattr(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + if (handle) + ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index, + "", value, size, 0, + meta_ac, data_ac); + else + ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0); + + kfree(value); + + return ret; +} + +int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct ocfs2_super *osb; + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret = -EAGAIN; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + osb = OCFS2_SB(inode->i_sb); + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return ret; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh); + + brelse(di_bh); + + if (IS_ERR(acl)) { + mlog_errno(PTR_ERR(acl)); + return PTR_ERR(acl); + } + if (acl) { + ret = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return ret; + } + + return -EAGAIN; +} + +int ocfs2_acl_chmod(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl, *clone; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + clone, NULL, NULL); + posix_acl_release(clone); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. 
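+ *
+ * When the directory carries no default ACL (or ACL support is not
+ * enabled on the mount), the new inode's mode is simply masked with
+ * the current umask.  Otherwise the default ACL is copied onto new
+ * directories and posix_acl_create_masq() derives the access ACL and
+ * the adjusted i_mode for the new inode.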
+ */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0, ret2; + mode_t mode; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) { + mode = inode->i_mode & ~current_umask(); + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + struct posix_acl *clone; + + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + clone, meta_ac, data_ac); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return ret; +} + +static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, + char *list, + size_t list_len, + const char *name, + size_t name_len, + int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, + char *list, + size_t list_len, + const char *name, + size_t name_len, + int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct posix_acl *acl; + int ret; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ocfs2_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return ret; +} + +static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret = 0; + + if (strcmp(name, "") != 0) + return -EINVAL; + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return -EOPNOTSUPP; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto cleanup; + } + } else + acl = NULL; + + ret = ocfs2_set_acl(NULL, inode, NULL, type, 
acl, NULL, NULL); + +cleanup: + posix_acl_release(acl); + return ret; +} + +const struct xattr_handler ocfs2_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = ocfs2_xattr_list_acl_access, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, +}; + +const struct xattr_handler ocfs2_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = ocfs2_xattr_list_acl_default, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, +}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h new file mode 100644 index 00000000..4fe7c9cf --- /dev/null +++ b/fs/ocfs2/acl.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.h + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_ACL_H +#define OCFS2_ACL_H + +#include + +struct ocfs2_acl_entry { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +}; + +extern int ocfs2_check_acl(struct inode *, int, unsigned int); +extern int ocfs2_acl_chmod(struct inode *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); + +#endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c new file mode 100644 index 00000000..76c8165d --- /dev/null +++ b/fs/ocfs2/alloc.c @@ -0,0 +1,7352 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.c + * + * Extent allocs and frees + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "sysfile.h" +#include "file.h" +#include "super.h" +#include "uptodate.h" +#include "xattr.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT, + CONTIG_LEFTRIGHT, +}; + +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); +/* + * Operations for a specific extent tree type. + * + * To implement an on-disk btree (extent tree) type in ocfs2, add + * an ocfs2_extent_tree_operations structure and the matching + * ocfs2_init__extent_tree() function. That's pretty much it + * for the allocation portion of the extent tree. + */ +struct ocfs2_extent_tree_operations { + /* + * last_eb_blk is the block number of the right most leaf extent + * block. Most on-disk structures containing an extent tree store + * this value for fast access. The ->eo_set_last_eb_blk() and + * ->eo_get_last_eb_blk() operations access this value. They are + * both required. + */ + void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, + u64 blkno); + u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); + + /* + * The on-disk structure usually keeps track of how many total + * clusters are stored in this extent tree. This function updates + * that value. new_clusters is the delta, and must be + * added to the total. Required. + */ + void (*eo_update_clusters)(struct ocfs2_extent_tree *et, + u32 new_clusters); + + /* + * If this extent tree is supported by an extent map, insert + * a record into the map. + */ + void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + + /* + * If this extent tree is supported by an extent map, truncate the + * map to clusters, + */ + void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et, + u32 clusters); + + /* + * If ->eo_insert_check() exists, it is called before rec is + * inserted into the extent tree. It is optional. + */ + int (*eo_insert_check)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + int (*eo_sanity_check)(struct ocfs2_extent_tree *et); + + /* + * -------------------------------------------------------------- + * The remaining are internal to ocfs2_extent_tree and don't have + * accessor functions + */ + + /* + * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. + * It is required. + */ + void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); + + /* + * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if + * it exists. If it does not, et->et_max_leaf_clusters is set + * to 0 (unlimited). Optional. + */ + void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et); + + /* + * ->eo_extent_contig test whether the 2 ocfs2_extent_rec + * are contiguous or not. Optional. Don't need to set it if use + * ocfs2_extent_rec as the tree leaf. + */ + enum ocfs2_contig_type + (*eo_extent_contig)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); +}; + + +/* + * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check + * in the methods. 
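+ *
+ * Several of the ocfs2_dinode_* callbacks BUG_ON() when
+ * et->et_ops != &ocfs2_dinode_et_ops, which catches extent trees that
+ * were initialized with the wrong operations vector.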
+ */ +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno); +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters); +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); +static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { + .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, + .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_extent_map_insert = ocfs2_dinode_extent_map_insert, + .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate, + .eo_insert_check = ocfs2_dinode_insert_check, + .eo_sanity_check = ocfs2_dinode_sanity_check, + .eo_fill_root_el = ocfs2_dinode_fill_root_el, +}; + +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + di->i_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + return le64_to_cpu(di->i_last_eb_blk); +} + +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_dinode *di = et->et_object; + + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&oi->ip_lock); + oi->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&oi->ip_lock); +} + +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_insert_rec(inode, rec); +} + +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_trunc(inode, clusters); +} + +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb); + + BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL); + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (oi->ip_clusters != le32_to_cpu(rec->e_cpos)), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)oi->ip_blkno, + rec->e_cpos, oi->ip_clusters); + + return 0; +} + +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + return 0; +} + +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + et->et_root_el = &di->id2.i_list; +} + + +static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + et->et_root_el = &vb->vb_xv->xr_list; +} + +static 
void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); +} + +static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_value_update_clusters, + .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, +}; + +static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + et->et_root_el = &xb->xb_attrs.xb_root.xt_list; +} + +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et) +{ + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + et->et_max_leaf_clusters = + ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE); +} + +static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + xt->xt_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + return le64_to_cpu(xt->xt_last_eb_blk); +} + +static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_tree_update_clusters, + .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, + .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, +}; + +static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + dx_root->dr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + return le64_to_cpu(dx_root->dr_last_eb_blk); +} + +static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + le32_add_cpu(&dx_root->dr_clusters, clusters); +} + +static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root)); + + return 0; +} + +static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + et->et_root_el = &dx_root->dr_list; +} + +static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { + .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, + .eo_update_clusters 
= ocfs2_dx_root_update_clusters, + .eo_sanity_check = ocfs2_dx_root_sanity_check, + .eo_fill_root_el = ocfs2_dx_root_fill_root_el, +}; + +static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + et->et_root_el = &rb->rf_list; +} + +static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + rb->rf_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + return le64_to_cpu(rb->rf_last_eb_blk); +} + +static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + le32_add_cpu(&rb->rf_clusters, clusters); +} + +static enum ocfs2_contig_type +ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + return CONTIG_NONE; +} + +static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_refcount_tree_update_clusters, + .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el, + .eo_extent_contig = ocfs2_refcount_tree_extent_contig, +}; + +static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, + ocfs2_journal_access_func access, + void *obj, + struct ocfs2_extent_tree_operations *ops) +{ + et->et_ops = ops; + et->et_root_bh = bh; + et->et_ci = ci; + et->et_root_journal_access = access; + if (!obj) + obj = (void *)bh->b_data; + et->et_object = obj; + + et->et_ops->eo_fill_root_el(et); + if (!et->et_ops->eo_fill_max_leaf_clusters) + et->et_max_leaf_clusters = 0; + else + et->et_ops->eo_fill_max_leaf_clusters(et); +} + +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di, + NULL, &ocfs2_dinode_et_ops); +} + +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb, + NULL, &ocfs2_xattr_tree_et_ops); +} + +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct ocfs2_xattr_value_buf *vb) +{ + __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb, + &ocfs2_xattr_value_et_ops); +} + +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr, + NULL, &ocfs2_dx_root_et_ops); +} + +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb, + NULL, &ocfs2_refcount_tree_et_ops); +} + +static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 new_last_eb_blk) +{ + et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); +} + +static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + return et->et_ops->eo_get_last_eb_blk(et); +} + +static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et, + 
u32 clusters) +{ + et->et_ops->eo_update_clusters(et, clusters); +} + +static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + if (et->et_ops->eo_extent_map_insert) + et->et_ops->eo_extent_map_insert(et, rec); +} + +static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + if (et->et_ops->eo_extent_map_truncate) + et->et_ops->eo_extent_map_truncate(et, clusters); +} + +static inline int ocfs2_et_root_journal_access(handle_t *handle, + struct ocfs2_extent_tree *et, + int type) +{ + return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh, + type); +} + +static inline enum ocfs2_contig_type + ocfs2_et_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *insert_rec) +{ + if (et->et_ops->eo_extent_contig) + return et->et_ops->eo_extent_contig(et, rec, insert_rec); + + return ocfs2_extent_rec_contig( + ocfs2_metadata_cache_get_super(et->et_ci), + rec, insert_rec); +} + +static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + int ret = 0; + + if (et->et_ops->eo_insert_check) + ret = et->et_ops->eo_insert_check(et, rec); + return ret; +} + +static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) +{ + int ret = 0; + + if (et->et_ops->eo_sanity_check) + ret = et->et_ops->eo_sanity_check(et); + return ret; +} + +static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, + struct ocfs2_extent_block *eb); +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec); +/* + * Reset the actual path elements so that we can re-use the structure + * to build another path. Generally, this involves freeing the buffer + * heads. + */ +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +{ + int i, start = 0, depth = 0; + struct ocfs2_path_item *node; + + if (keep_root) + start = 1; + + for(i = start; i < path_num_items(path); i++) { + node = &path->p_node[i]; + + brelse(node->bh); + node->bh = NULL; + node->el = NULL; + } + + /* + * Tree depth may change during truncate, or insert. If we're + * keeping the root extent list, then make sure that our path + * structure reflects the proper depth. + */ + if (keep_root) + depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + else + path_root_access(path) = NULL; + + path->p_tree_depth = depth; +} + +void ocfs2_free_path(struct ocfs2_path *path) +{ + if (path) { + ocfs2_reinit_path(path, 0); + kfree(path); + } +} + +/* + * All the elements of src into dest. After this call, src could be freed + * without affecting dest. + * + * Both paths should have the same root. Any non-root elements of dest + * will be freed. + */ +static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_el(dest) != path_root_el(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); + + ocfs2_reinit_path(dest, 1); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + if (dest->p_node[i].bh) + get_bh(dest->p_node[i].bh); + } +} + +/* + * Make the *dest path the same as src and re-initialize src path to + * have a root only. 
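+ *
+ * Unlike ocfs2_cp_path(), the buffer_head references of the non-root
+ * elements are transferred to dest rather than duplicated, so no extra
+ * get_bh()/brelse() pair is needed.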
+ */ +static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + brelse(dest->p_node[i].bh); + + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + src->p_node[i].bh = NULL; + src->p_node[i].el = NULL; + } +} + +/* + * Insert an extent block at given index. + * + * This will not take an additional reference on eb_bh. + */ +static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, + struct buffer_head *eb_bh) +{ + struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Right now, no root bh is an extent block, so this helps + * catch code errors with dinode trees. The assertion can be + * safely removed if we ever need to insert extent block + * structures at the root. + */ + BUG_ON(index == 0); + + path->p_node[index].bh = eb_bh; + path->p_node[index].el = &eb->h_list; +} + +static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, + struct ocfs2_extent_list *root_el, + ocfs2_journal_access_func access) +{ + struct ocfs2_path *path; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); + + path = kzalloc(sizeof(*path), GFP_NOFS); + if (path) { + path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); + get_bh(root_bh); + path_root_bh(path) = root_bh; + path_root_el(path) = root_el; + path_root_access(path) = access; + } + + return path; +} + +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) +{ + return ocfs2_new_path(path_root_bh(path), path_root_el(path), + path_root_access(path)); +} + +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) +{ + return ocfs2_new_path(et->et_root_bh, et->et_root_el, + et->et_root_journal_access); +} + +/* + * Journal the buffer at depth idx. All idx>0 are extent_blocks, + * otherwise it's the root_access function. + * + * I don't like the way this function's name looks next to + * ocfs2_journal_access_path(), but I don't have a better one. + */ +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx) +{ + ocfs2_journal_access_func access = path_root_access(path); + + if (!access) + access = ocfs2_journal_access; + + if (idx) + access = ocfs2_journal_access_eb; + + return access(handle, ci, path->p_node[idx].bh, + OCFS2_JOURNAL_ACCESS_WRITE); +} + +/* + * Convenience function to journal all components in a path. + */ +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path) +{ + int i, ret = 0; + + if (!path) + goto out; + + for(i = 0; i < path_num_items(path); i++) { + ret = ocfs2_path_bh_journal_access(handle, ci, path, i); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + +out: + return ret; +} + +/* + * Return the index of the extent record which contains cluster #v_cluster. + * -1 is returned if it was not found. + * + * Should work fine on interior and exterior nodes. 
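+ *
+ * For example, with two records covering clusters [0, 8) and [8, 16),
+ * a lookup of v_cluster 10 returns index 1 while a lookup of
+ * v_cluster 20 returns -1.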
+ */ +int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) +{ + int ret = -1; + int i; + struct ocfs2_extent_rec *rec; + u32 rec_end, rec_start, clusters; + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + rec_start = le32_to_cpu(rec->e_cpos); + clusters = ocfs2_rec_clusters(el, rec); + + rec_end = rec_start + clusters; + + if (v_cluster >= rec_start && v_cluster < rec_end) { + ret = i; + break; + } + } + + return ret; +} + +/* + * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and + * ocfs2_extent_rec_contig only work properly against leaf nodes! + */ +static int ocfs2_block_extent_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + u64 blkno) +{ + u64 blk_end = le64_to_cpu(ext->e_blkno); + + blk_end += ocfs2_clusters_to_blocks(sb, + le16_to_cpu(ext->e_leaf_clusters)); + + return blkno == blk_end; +} + +static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, + struct ocfs2_extent_rec *right) +{ + u32 left_range; + + left_range = le32_to_cpu(left->e_cpos) + + le16_to_cpu(left->e_leaf_clusters); + + return (left_range == le32_to_cpu(right->e_cpos)); +} + +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + u64 blkno = le64_to_cpu(insert_rec->e_blkno); + + /* + * Refuse to coalesce extent records with different flag + * fields - we don't want to mix unwritten extents with user + * data. + */ + if (ext->e_flags != insert_rec->e_flags) + return CONTIG_NONE; + + if (ocfs2_extents_adjacent(ext, insert_rec) && + ocfs2_block_extent_contig(sb, ext, blkno)) + return CONTIG_RIGHT; + + blkno = le64_to_cpu(ext->e_blkno); + if (ocfs2_extents_adjacent(insert_rec, ext) && + ocfs2_block_extent_contig(sb, insert_rec, blkno)) + return CONTIG_LEFT; + + return CONTIG_NONE; +} + +/* + * NOTE: We can have pretty much any combination of contiguousness and + * appending. + * + * The usefulness of APPEND_TAIL is more in that it lets us know that + * we'll have to update the path to that leaf. + */ +enum ocfs2_append_type { + APPEND_NONE = 0, + APPEND_TAIL, +}; + +enum ocfs2_split_type { + SPLIT_NONE = 0, + SPLIT_LEFT, + SPLIT_RIGHT, +}; + +struct ocfs2_insert_type { + enum ocfs2_split_type ins_split; + enum ocfs2_append_type ins_appending; + enum ocfs2_contig_type ins_contig; + int ins_contig_index; + int ins_tree_depth; +}; + +struct ocfs2_merge_ctxt { + enum ocfs2_contig_type c_contig_type; + int c_has_empty_extent; + int c_split_covers_rec; +}; + +static int ocfs2_validate_extent_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_extent_block *eb = + (struct ocfs2_extent_block *)bh->b_data; + + trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for extent block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + /* + * Errors after here are fatal. 
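+ *
+ * The block must carry OCFS2_EXTENT_BLOCK_SIGNATURE, its h_blkno must
+ * match the block number it was read from, and h_fs_generation must
+ * match the mounted filesystem; each mismatch is reported with
+ * ocfs2_error() and fails with -EINVAL.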
+ */ + + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + return -EINVAL; + } + + if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extent block #%llu has an invalid " + "h_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + return -EINVAL; + } + + return 0; +} + +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, eb_blkno, &tmp, + ocfs2_validate_extent_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + + +/* + * How many free extents have we got before we need more meta data? + */ +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et) +{ + int retval; + struct ocfs2_extent_list *el = NULL; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL; + u64 last_eb_blk = 0; + + el = et->et_root_el; + last_eb_blk = ocfs2_et_get_last_eb_blk(et); + + if (last_eb_blk) { + retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk, + &eb_bh); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } + + BUG_ON(el->l_tree_depth != 0); + + retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); +bail: + brelse(eb_bh); + + trace_ocfs2_num_free_extents(retval); + return retval; +} + +/* expects array to already be allocated + * + * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and + * l_count for you + */ +static int ocfs2_create_new_meta_bhs(handle_t *handle, + struct ocfs2_extent_tree *et, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]) +{ + int count, status, i; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, first_blkno; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + struct ocfs2_extent_block *eb; + + count = 0; + while (count < wanted) { + status = ocfs2_claim_metadata(handle, + meta_ac, + wanted - count, + &suballoc_loc, + &suballoc_bit_start, + &num_got, + &first_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = count; i < (num_got + count); i++) { + bhs[i] = sb_getblk(osb->sb, first_blkno); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]); + + status = ocfs2_journal_access_eb(handle, et->et_ci, + bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) bhs[i]->b_data; + /* Ok, setup the minimal stuff here. 
*/ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(first_blkno); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + eb->h_suballoc_slot = + cpu_to_le16(meta_ac->ac_alloc_slot); + eb->h_suballoc_loc = cpu_to_le64(suballoc_loc); + eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + suballoc_bit_start++; + first_blkno++; + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. */ + ocfs2_journal_dirty(handle, bhs[i]); + } + + count += num_got; + } + + status = 0; +bail: + if (status < 0) { + for(i = 0; i < wanted; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + mlog_errno(status); + } + return status; +} + +/* + * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). + * + * Returns the sum of the rightmost extent rec logical offset and + * cluster count. + * + * ocfs2_add_branch() uses this to determine what logical cluster + * value should be populated into the leftmost new branch records. + * + * ocfs2_shift_tree_depth() uses this to determine the # clusters + * value for the new topmost tree record. + */ +static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) +{ + int i; + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + return le32_to_cpu(el->l_recs[i].e_cpos) + + ocfs2_rec_clusters(el, &el->l_recs[i]); +} + +/* + * Change range of the branches in the right most path according to the leaf + * extent block's rightmost record. + */ +static int ocfs2_adjust_rightmost_branch(handle_t *handle, + struct ocfs2_extent_tree *et) +{ + int status; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + path = ocfs2_new_path_from_et(et); + if (!path) { + status = -ENOMEM; + return status; + } + + status = ocfs2_find_path(et->et_ci, path, UINT_MAX); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_extend_trans(handle, path_num_items(path)); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_path(et->et_ci, handle, path); + if (status < 0) { + mlog_errno(status); + goto out; + } + + el = path_leaf_el(path); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1]; + + ocfs2_adjust_rightmost_records(handle, et, path, rec); + +out: + ocfs2_free_path(path); + return status; +} + +/* + * Add an entire tree branch to our inode. eb_bh is the extent block + * to start at, if we don't want to start the branch at the root + * structure. + * + * last_eb_bh is required as we have to update it's next_leaf pointer + * for the new last extent block. + * + * the new branch will be 'empty' in the sense that every block will + * contain a single record with cluster count == 0. + */ +static int ocfs2_add_branch(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head *eb_bh, + struct buffer_head **last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int status, new_blocks, i; + u64 next_blkno, new_last_eb_blk; + struct buffer_head *bh; + struct buffer_head **new_eb_bhs = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *eb_el; + struct ocfs2_extent_list *el; + u32 new_cpos, root_end; + + BUG_ON(!last_eb_bh || !*last_eb_bh); + + if (eb_bh) { + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = et->et_root_el; + + /* we never add a branch to a leaf. 
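+ *
+ * One new extent block is needed for every tree level below this
+ * point, so new_blocks below is simply the tree depth recorded in el.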
*/ + BUG_ON(!el->l_tree_depth); + + new_blocks = le16_to_cpu(el->l_tree_depth); + + eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + root_end = ocfs2_sum_rightmost_rec(et->et_root_el); + + /* + * If there is a gap before the root end and the real end + * of the righmost leaf block, we need to remove the gap + * between new_cpos and root_end first so that the tree + * is consistent after we add a new branch(it will start + * from new_cpos). + */ + if (root_end > new_cpos) { + trace_ocfs2_adjust_rightmost_branch( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + root_end, new_cpos); + + status = ocfs2_adjust_rightmost_branch(handle, et); + if (status) { + mlog_errno(status); + goto bail; + } + } + + /* allocate the number of new eb blocks we need */ + new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!new_eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, + meta_ac, new_eb_bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be + * linked with the rest of the tree. + * conversly, new_eb_bhs[0] is the new bottommost leaf. + * + * when we leave the loop, new_last_eb_blk will point to the + * newest leaf, and next_blkno will point to the topmost extent + * block. */ + next_blkno = new_last_eb_blk = 0; + for(i = 0; i < new_blocks; i++) { + bh = new_eb_bhs[i]; + eb = (struct ocfs2_extent_block *) bh->b_data; + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); + eb_el = &eb->h_list; + + status = ocfs2_journal_access_eb(handle, et->et_ci, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb->h_next_leaf_blk = 0; + eb_el->l_tree_depth = cpu_to_le16(i); + eb_el->l_next_free_rec = cpu_to_le16(1); + /* + * This actually counts as an empty extent as + * c_clusters == 0 + */ + eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); + eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); + /* + * eb_el isn't always an interior node, but even leaf + * nodes want a zero'd flags and reserved field so + * this gets the whole 32 bits regardless of use. + */ + eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); + if (!eb_el->l_tree_depth) + new_last_eb_blk = le64_to_cpu(eb->h_blkno); + + ocfs2_journal_dirty(handle, bh); + next_blkno = le64_to_cpu(eb->h_blkno); + } + + /* This is a bit hairy. We want to update up to three blocks + * here without leaving any of them in an inconsistent state + * in case of error. We don't have to worry about + * journal_dirty erroring as it won't unless we've aborted the + * handle (in which case we would never be here) so reserving + * the write with journal_access is all we need to do. */ + status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (eb_bh) { + status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Link the new branch into the rest of the tree (el will + * either be on the root_bh, or the extent block passed in. 
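+ *
+ * The record added here points at the topmost block of the new branch
+ * (next_blkno) starting at new_cpos; afterwards last_eb_blk and the
+ * old rightmost leaf's h_next_leaf_blk are updated to point at the
+ * new rightmost leaf.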
*/ + i = le16_to_cpu(el->l_next_free_rec); + el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + el->l_recs[i].e_int_clusters = 0; + le16_add_cpu(&el->l_next_free_rec, 1); + + /* fe needs a new last extent block pointer, as does the + * next_leaf on the previously last-extent-block. */ + ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); + + eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; + eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); + + ocfs2_journal_dirty(handle, *last_eb_bh); + ocfs2_journal_dirty(handle, et->et_root_bh); + if (eb_bh) + ocfs2_journal_dirty(handle, eb_bh); + + /* + * Some callers want to track the rightmost leaf so pass it + * back here. + */ + brelse(*last_eb_bh); + get_bh(new_eb_bhs[0]); + *last_eb_bh = new_eb_bhs[0]; + + status = 0; +bail: + if (new_eb_bhs) { + for (i = 0; i < new_blocks; i++) + brelse(new_eb_bhs[i]); + kfree(new_eb_bhs); + } + + return status; +} + +/* + * adds another level to the allocation tree. + * returns back the new extent block so you can add a branch to it + * after this call. + */ +static int ocfs2_shift_tree_depth(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh) +{ + int status, i; + u32 new_clusters; + struct buffer_head *new_eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *root_el; + struct ocfs2_extent_list *eb_el; + + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, + &new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); + + eb_el = &eb->h_list; + root_el = et->et_root_el; + + status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* copy the root extent list data into the new extent block */ + eb_el->l_tree_depth = root_el->l_tree_depth; + eb_el->l_next_free_rec = root_el->l_next_free_rec; + for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) + eb_el->l_recs[i] = root_el->l_recs[i]; + + ocfs2_journal_dirty(handle, new_eb_bh); + + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + new_clusters = ocfs2_sum_rightmost_rec(eb_el); + + /* update root_bh now */ + le16_add_cpu(&root_el->l_tree_depth, 1); + root_el->l_recs[0].e_cpos = 0; + root_el->l_recs[0].e_blkno = eb->h_blkno; + root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + root_el->l_next_free_rec = cpu_to_le16(1); + + /* If this is our 1st tree depth shift, then last_eb_blk + * becomes the allocated extent block */ + if (root_el->l_tree_depth == cpu_to_le16(1)) + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); + + ocfs2_journal_dirty(handle, et->et_root_bh); + + *ret_new_eb_bh = new_eb_bh; + new_eb_bh = NULL; + status = 0; +bail: + brelse(new_eb_bh); + + return status; +} + +/* + * Should only be called when there is no space left in any of the + * leaf nodes. What we want to do is find the lowest tree depth + * non-leaf extent block with room for new records. 
There are three + * valid results of this search: + * + * 1) a lowest extent block is found, then we pass it back in + * *lowest_eb_bh and return '0' + * + * 2) the search fails to find anything, but the root_el has room. We + * pass NULL back in *lowest_eb_bh, but still return '0' + * + * 3) the search fails to find anything AND the root_el is full, in + * which case we return > 0 + * + * return status < 0 indicates an error. + */ +static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, + struct buffer_head **target_bh) +{ + int status = 0, i; + u64 blkno; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + struct buffer_head *lowest_bh = NULL; + + *target_bh = NULL; + + el = et->et_root_el; + + while(le16_to_cpu(el->l_tree_depth) > 1) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty " + "extent list (next_free_rec == 0)", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + status = -EIO; + goto bail; + } + i = le16_to_cpu(el->l_next_free_rec) - 1; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (!blkno) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has extent " + "list where extent # %d has no physical " + "block start", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); + status = -EIO; + goto bail; + } + + brelse(bh); + bh = NULL; + + status = ocfs2_read_extent_block(et->et_ci, blkno, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count)) { + brelse(lowest_bh); + lowest_bh = bh; + get_bh(lowest_bh); + } + } + + /* If we didn't find one and the fe doesn't have any room, + * then return '1' */ + el = et->et_root_el; + if (!lowest_bh && (el->l_next_free_rec == el->l_count)) + status = 1; + + *target_bh = lowest_bh; +bail: + brelse(bh); + + return status; +} + +/* + * Grow a b-tree so that it has more records. + * + * We might shift the tree depth in which case existing paths should + * be considered invalid. + * + * Tree depth after the grow is returned via *final_depth. + * + * *last_eb_bh will be updated by ocfs2_add_branch(). + */ +static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, + int *final_depth, struct buffer_head **last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret, shift; + struct ocfs2_extent_list *el = et->et_root_el; + int depth = le16_to_cpu(el->l_tree_depth); + struct buffer_head *bh = NULL; + + BUG_ON(meta_ac == NULL); + + shift = ocfs2_find_branch_target(et, &bh); + if (shift < 0) { + ret = shift; + mlog_errno(ret); + goto out; + } + + /* We traveled all the way to the bottom of the allocation tree + * and didn't find room for any more extents - we need to add + * another tree level */ + if (shift) { + BUG_ON(bh); + trace_ocfs2_grow_tree( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + depth); + + /* ocfs2_shift_tree_depth will return us a buffer with + * the new extent block (so we can pass that to + * ocfs2_add_branch). */ + ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + depth++; + if (depth == 1) { + /* + * Special case: we have room now if we shifted from + * tree_depth 0, so no more work needs to be done. + * + * We won't be calling add_branch, so pass + * back *last_eb_bh as the new leaf. 
At depth + * zero, it should always be null so there's + * no reason to brelse. + */ + BUG_ON(*last_eb_bh); + get_bh(bh); + *last_eb_bh = bh; + goto out; + } + } + + /* call ocfs2_add_branch to add the final part of the tree with + * the new data. */ + ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, + meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (final_depth) + *final_depth = depth; + brelse(bh); + return ret; +} + +/* + * This function will discard the rightmost extent record. + */ +static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + int count = le16_to_cpu(el->l_count); + unsigned int num_bytes; + + BUG_ON(!next_free); + /* This will cause us to go off the end of our extent list. */ + BUG_ON(next_free >= count); + + num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; + + memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); +} + +static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i, insert_index, next_free, has_empty, num_bytes; + u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); + + BUG_ON(!next_free); + + /* The tree code before us didn't allow enough room in the leaf. */ + BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); + + /* + * The easiest way to approach this is to just remove the + * empty extent and temporarily decrement next_free. + */ + if (has_empty) { + /* + * If next_free was 1 (only an empty extent), this + * loop won't execute, which is fine. We still want + * the decrement above to happen. + */ + for(i = 0; i < (next_free - 1); i++) + el->l_recs[i] = el->l_recs[i+1]; + + next_free--; + } + + /* + * Figure out what the new record index should be. + */ + for(i = 0; i < next_free; i++) { + rec = &el->l_recs[i]; + + if (insert_cpos < le32_to_cpu(rec->e_cpos)) + break; + } + insert_index = i; + + trace_ocfs2_rotate_leaf(insert_cpos, insert_index, + has_empty, next_free, + le16_to_cpu(el->l_count)); + + BUG_ON(insert_index < 0); + BUG_ON(insert_index >= le16_to_cpu(el->l_count)); + BUG_ON(insert_index > next_free); + + /* + * No need to memmove if we're just adding to the tail. + */ + if (insert_index != next_free) { + BUG_ON(next_free >= le16_to_cpu(el->l_count)); + + num_bytes = next_free - insert_index; + num_bytes *= sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[insert_index + 1], + &el->l_recs[insert_index], + num_bytes); + } + + /* + * Either we had an empty extent, and need to re-increment or + * there was no empty extent on a non full rightmost leaf node, + * in which case we still need to increment. + */ + next_free++; + el->l_next_free_rec = cpu_to_le16(next_free); + /* + * Make sure none of the math above just messed up our tree. + */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); + + el->l_recs[insert_index] = *insert_rec; + +} + +static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el) +{ + int size, num_recs = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(num_recs == 0); + + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + num_recs--; + size = num_recs * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[0], &el->l_recs[1], size); + memset(&el->l_recs[num_recs], 0, + sizeof(struct ocfs2_extent_rec)); + el->l_next_free_rec = cpu_to_le16(num_recs); + } +} + +/* + * Create an empty extent record . 
+ * + * l_next_free_rec may be updated. + * + * If an empty extent already exists do nothing. + */ +static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (next_free == 0) + goto set_and_inc; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) + return; + + mlog_bug_on_msg(el->l_count == el->l_next_free_rec, + "Asked to create an empty extent in a full list:\n" + "count = %u, tree depth = %u", + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_tree_depth)); + + ocfs2_shift_records_right(el); + +set_and_inc: + le16_add_cpu(&el->l_next_free_rec, 1); + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); +} + +/* + * For a rotation which involves two leaf nodes, the "root node" is + * the lowest level tree node which contains a path to both leafs. This + * resulting set of information can be used to form a complete "subtree" + * + * This function is passed two full paths from the dinode down to a + * pair of adjacent leaves. It's task is to figure out which path + * index contains the subtree root - this can be the root index itself + * in a worst-case rotation. + * + * The array index of the subtree root is passed back. + */ +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right) +{ + int i = 0; + + /* + * Check that the caller passed in two paths from the same tree. + */ + BUG_ON(path_root_bh(left) != path_root_bh(right)); + + do { + i++; + + /* + * The caller didn't pass two adjacent paths. + */ + mlog_bug_on_msg(i > left->p_tree_depth, + "Owner %llu, left depth %u, right depth %u\n" + "left leaf blk %llu, right leaf blk %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + left->p_tree_depth, right->p_tree_depth, + (unsigned long long)path_leaf_bh(left)->b_blocknr, + (unsigned long long)path_leaf_bh(right)->b_blocknr); + } while (left->p_node[i].bh->b_blocknr == + right->p_node[i].bh->b_blocknr); + + return i - 1; +} + +typedef void (path_insert_t)(void *, struct buffer_head *); + +/* + * Traverse a btree path in search of cpos, starting at root_el. + * + * This code can be called with a cpos larger than the tree, in which + * case it will return the rightmost path. + */ +static int __ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + path_insert_t *func, void *data) +{ + int i, ret = 0; + u32 range; + u64 blkno; + struct buffer_head *bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + el = root_el; + while (el->l_tree_depth) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has empty extent list at " + "depth %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; + + } + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { + rec = &el->l_recs[i]; + + /* + * In the case that cpos is off the allocation + * tree, this should just wind up returning the + * rightmost record. 
+ */ + range = le32_to_cpu(rec->e_cpos) + + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + break; + } + + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (blkno == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad blkno in extent list " + "at depth %u (index %d)\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth), i); + ret = -EROFS; + goto out; + } + + brelse(bh); + bh = NULL; + ret = ocfs2_read_extent_block(ci, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) > + le16_to_cpu(el->l_count)) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad count in extent list " + "at block %llu (next free=%u, count=%u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + + if (func) + func(data, bh); + } + +out: + /* + * Catch any trailing bh that the loop didn't handle. + */ + brelse(bh); + + return ret; +} + +/* + * Given an initialized path (that is, it has a valid root extent + * list), this function will traverse the btree in search of the path + * which would contain cpos. + * + * The path traveled is recorded in the path structure. + * + * Note that this will not do any comparisons on leaf node extent + * records, so it will work fine in the case that we just added a tree + * branch. + */ +struct find_path_data { + int index; + struct ocfs2_path *path; +}; +static void find_path_ins(void *data, struct buffer_head *bh) +{ + struct find_path_data *fp = data; + + get_bh(bh); + ocfs2_path_insert_eb(fp->path, fp->index, bh); + fp->index++; +} +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, u32 cpos) +{ + struct find_path_data data; + + data.index = 1; + data.path = path; + return __ocfs2_find_path(ci, path_root_el(path), cpos, + find_path_ins, &data); +} + +static void find_leaf_ins(void *data, struct buffer_head *bh) +{ + struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; + struct ocfs2_extent_list *el = &eb->h_list; + struct buffer_head **ret = data; + + /* We want to retain only the leaf block. */ + if (le16_to_cpu(el->l_tree_depth) == 0) { + get_bh(bh); + *ret = bh; + } +} +/* + * Find the leaf block in the tree which would contain cpos. No + * checking of the actual leaf is done. + * + * Some paths want to call this instead of allocating a path structure + * and calling ocfs2_find_path(). + * + * This function doesn't handle non btree extent lists. + */ +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *leaf_bh = bh; +out: + return ret; +} + +/* + * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. + * + * Basically, we've moved stuff around at the bottom of the tree and + * we need to fix up the extent records above the changes to reflect + * the new changes. + * + * left_rec: the record on the left. 
+ * left_child_el: is the child list pointed to by left_rec + * right_rec: the record to the right of left_rec + * right_child_el: is the child list pointed to by right_rec + * + * By definition, this only works on interior nodes. + */ +static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, + struct ocfs2_extent_list *left_child_el, + struct ocfs2_extent_rec *right_rec, + struct ocfs2_extent_list *right_child_el) +{ + u32 left_clusters, right_end; + + /* + * Interior nodes never have holes. Their cpos is the cpos of + * the leftmost record in their child list. Their cluster + * count covers the full theoretical range of their child list + * - the range between their cpos and the cpos of the record + * immediately to their right. + */ + left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); + if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { + BUG_ON(right_child_el->l_tree_depth); + BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); + left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); + } + left_clusters -= le32_to_cpu(left_rec->e_cpos); + left_rec->e_int_clusters = cpu_to_le32(left_clusters); + + /* + * Calculate the rightmost cluster count boundary before + * moving cpos - we will need to adjust clusters after + * updating e_cpos to keep the same highest cluster count. + */ + right_end = le32_to_cpu(right_rec->e_cpos); + right_end += le32_to_cpu(right_rec->e_int_clusters); + + right_rec->e_cpos = left_rec->e_cpos; + le32_add_cpu(&right_rec->e_cpos, left_clusters); + + right_end -= le32_to_cpu(right_rec->e_cpos); + right_rec->e_int_clusters = cpu_to_le32(right_end); +} + +/* + * Adjust the adjacent root node records involved in a + * rotation. left_el_blkno is passed in as a key so that we can easily + * find it's index in the root list. + */ +static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, + struct ocfs2_extent_list *left_el, + struct ocfs2_extent_list *right_el, + u64 left_el_blkno) +{ + int i; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= + le16_to_cpu(left_el->l_tree_depth)); + + for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { + if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) + break; + } + + /* + * The path walking code should have never returned a root and + * two paths which are not adjacent. + */ + BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); + + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + &root_el->l_recs[i + 1], right_el); +} + +/* + * We've changed a leaf block (in right_path) and need to reflect that + * change back up the subtree. + * + * This happens in multiple places: + * - When we've moved an extent record from the left path leaf to the right + * path leaf to make room for an empty extent in the left path leaf. + * - When our insert into the right path leaf is at the leftmost edge + * and requires an update of the path immediately to it's left. This + * can occur at the end of some types of rotation and appending inserts. + * - When we've adjusted the last extent record in the left path leaf and the + * 1st extent record in the right path leaf during cross extent block merge. 
+ */ +static void ocfs2_complete_edge_insert(handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int i, idx; + struct ocfs2_extent_list *el, *left_el, *right_el; + struct ocfs2_extent_rec *left_rec, *right_rec; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + + /* + * Update the counts and position values within all the + * interior nodes to reflect the leaf rotation we just did. + * + * The root node is handled below the loop. + * + * We begin the loop with right_el and left_el pointing to the + * leaf lists and work our way up. + * + * NOTE: within this loop, left_el and right_el always refer + * to the *child* lists. + */ + left_el = path_leaf_el(left_path); + right_el = path_leaf_el(right_path); + for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { + trace_ocfs2_complete_edge_insert(i); + + /* + * One nice property of knowing that all of these + * nodes are below the root is that we only deal with + * the leftmost right node record and the rightmost + * left node record. + */ + el = left_path->p_node[i].el; + idx = le16_to_cpu(left_el->l_next_free_rec) - 1; + left_rec = &el->l_recs[idx]; + + el = right_path->p_node[i].el; + right_rec = &el->l_recs[0]; + + ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, + right_el); + + ocfs2_journal_dirty(handle, left_path->p_node[i].bh); + ocfs2_journal_dirty(handle, right_path->p_node[i].bh); + + /* + * Setup our list pointers now so that the current + * parents become children in the next iteration. + */ + left_el = left_path->p_node[i].el; + right_el = right_path->p_node[i].el; + } + + /* + * At the root node, adjust the two adjacent records which + * begin our path to the leaves. + */ + + el = left_path->p_node[subtree_index].el; + left_el = left_path->p_node[subtree_index + 1].el; + right_el = right_path->p_node[subtree_index + 1].el; + + ocfs2_adjust_root_records(el, left_el, right_el, + left_path->p_node[subtree_index + 1].bh->b_blocknr); + + root_bh = left_path->p_node[subtree_index].bh; + + ocfs2_journal_dirty(handle, root_bh); +} + +static int ocfs2_rotate_subtree_right(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i; + struct buffer_head *right_leaf_bh; + struct buffer_head *left_leaf_bh = NULL; + struct buffer_head *root_bh; + struct ocfs2_extent_list *right_el, *left_el; + struct ocfs2_extent_rec move_rec; + + left_leaf_bh = path_leaf_bh(left_path); + left_el = path_leaf_el(left_path); + + if (left_el->l_next_free_rec != left_el->l_count) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Inode %llu has non-full interior leaf node %llu" + "(next free = %u)", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)left_leaf_bh->b_blocknr, + le16_to_cpu(left_el->l_next_free_rec)); + return -EROFS; + } + + /* + * This extent block may already have an empty record, so we + * return early if so. 
+ */ + if (ocfs2_is_empty_extent(&left_el->l_recs[0])) + return 0; + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + right_leaf_bh = path_leaf_bh(right_path); + right_el = path_leaf_el(right_path); + + /* This is a code error, not a disk corruption. */ + mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " + "because rightmost leaf block %llu is empty\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)right_leaf_bh->b_blocknr); + + ocfs2_create_empty_extent(right_el); + + ocfs2_journal_dirty(handle, right_leaf_bh); + + /* Do the copy now. */ + i = le16_to_cpu(left_el->l_next_free_rec) - 1; + move_rec = left_el->l_recs[i]; + right_el->l_recs[0] = move_rec; + + /* + * Clear out the record we just copied and shift everything + * over, leaving an empty extent in the left leaf. + * + * We temporarily subtract from next_free_rec so that the + * shift will lose the tail record (which is now defunct). + */ + le16_add_cpu(&left_el->l_next_free_rec, -1); + ocfs2_shift_records_right(left_el); + memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&left_el->l_next_free_rec, 1); + + ocfs2_journal_dirty(handle, left_leaf_bh); + + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the left of the current one. + * + * Will return zero if the path passed in is already the leftmost path. + */ +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + BUG_ON(path->p_tree_depth == 0); + + *cpos = 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. */ + i = path->p_tree_depth - 1; + while (i >= 0) { + el = path->p_node[i].el; + + /* + * Find the extent record just before the one in our + * path. + */ + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == 0) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the leftmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The leftmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); + *cpos = *cpos + ocfs2_rec_clusters(el, + &el->l_recs[j - 1]); + *cpos = *cpos - 1; + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. 
+ */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +/* + * Extend the transaction by enough credits to complete the rotation, + * and still leave at least the original number of credits allocated + * to this transaction. + */ +static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, + int op_credits, + struct ocfs2_path *path) +{ + int ret = 0; + int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; + + if (handle->h_buffer_credits < credits) + ret = ocfs2_extend_trans(handle, + credits - handle->h_buffer_credits); + + return ret; +} + +/* + * Trap the case where we're inserting into the theoretical range past + * the _actual_ left leaf range. Otherwise, we'll rotate a record + * whose cpos is less than ours into the right leaf. + * + * It's only necessary to look at the rightmost record of the left + * leaf because the logic that calls us should ensure that the + * theoretical ranges in the path components above the leaves are + * correct. + */ +static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, + u32 insert_cpos) +{ + struct ocfs2_extent_list *left_el; + struct ocfs2_extent_rec *rec; + int next_free; + + left_el = path_leaf_el(left_path); + next_free = le16_to_cpu(left_el->l_next_free_rec); + rec = &left_el->l_recs[next_free - 1]; + + if (insert_cpos > le32_to_cpu(rec->e_cpos)) + return 1; + return 0; +} + +static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + unsigned int range; + struct ocfs2_extent_rec *rec; + + if (next_free == 0) + return 0; + + rec = &el->l_recs[0]; + if (ocfs2_is_empty_extent(rec)) { + /* Empty list. */ + if (next_free == 1) + return 0; + rec = &el->l_recs[1]; + } + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + return 1; + return 0; +} + +/* + * Rotate all the records in a btree right one record, starting at insert_cpos. + * + * The path to the rightmost leaf should be passed in. + * + * The array is assumed to be large enough to hold an entire path (tree depth). + * + * Upon successful return from this function: + * + * - The 'right_path' array will contain a path to the leaf block + * whose range contains e_cpos. + * - That leaf block will have a single empty extent in list index 0. + * - In the case that the rotation requires a post-insert update, + * *ret_left_path will contain a valid path which can be passed to + * ocfs2_insert_path(). 
+ */ +static int ocfs2_rotate_tree_right(handle_t *handle, + struct ocfs2_extent_tree *et, + enum ocfs2_split_type split, + u32 insert_cpos, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, start, orig_credits = handle->h_buffer_credits; + u32 cpos; + struct ocfs2_path *left_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + *ret_left_path = NULL; + + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_rotate_tree_right( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); + + /* + * What we want to do here is: + * + * 1) Start with the rightmost path. + * + * 2) Determine a path to the leaf block directly to the left + * of that leaf. + * + * 3) Determine the 'subtree root' - the lowest level tree node + * which contains a path to both leaves. + * + * 4) Rotate the subtree. + * + * 5) Find the next subtree by considering the left path to be + * the new right path. + * + * The check at the top of this while loop also accepts + * insert_cpos == cpos because cpos is only a _theoretical_ + * value to get us the left path - insert_cpos might very well + * be filling that hole. + * + * Stop at a cpos of '0' because we either started at the + * leftmost branch (i.e., a tree with one branch and a + * rotation inside of it), or we've gone as far as we can in + * rotating subtrees. + */ + while (cpos && insert_cpos <= cpos) { + trace_ocfs2_rotate_tree_right( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); + + ret = ocfs2_find_path(et->et_ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog_bug_on_msg(path_leaf_bh(left_path) == + path_leaf_bh(right_path), + "Owner %llu: error during insert of %u " + "(left path cpos %u) results in two identical " + "paths ending at %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos, + (unsigned long long) + path_leaf_bh(left_path)->b_blocknr); + + if (split == SPLIT_NONE && + ocfs2_rotate_requires_path_adjustment(left_path, + insert_cpos)) { + + /* + * We've rotated the tree as much as we + * should. The rest is up to + * ocfs2_insert_path() to complete, after the + * record insertion. We indicate this + * situation by returning the left path. + * + * The reason we don't adjust the records here + * before the record insert is that an error + * later might break the rule where a parent + * record e_cpos will reflect the actual + * e_cpos of the 1st nonempty record of the + * child list. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + + start = ocfs2_find_subtree_root(et, left_path, right_path); + + trace_ocfs2_rotate_subtree(start, + (unsigned long long) + right_path->p_node[start].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, start, + orig_credits, right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_right(handle, et, left_path, + right_path, start); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (split != SPLIT_NONE && + ocfs2_leftmost_rec_contains(path_leaf_el(right_path), + insert_cpos)) { + /* + * A rotate moves the rightmost left leaf + * record over to the leftmost right leaf + * slot. 
If we're doing an extent split + * instead of a real insert, then we have to + * check that the extent to be split wasn't + * just moved over. If it was, then we can + * exit here, passing left_path back - + * ocfs2_split_extent() is smart enough to + * search both leaves. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + + /* + * There is no need to re-read the next right path + * as we know that it'll be our current left + * path. Optimize by copying values instead. + */ + ocfs2_mv_path(right_path, left_path); + + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(left_path); + +out_ret_path: + return ret; +} + +static int ocfs2_update_edge_lengths(handle_t *handle, + struct ocfs2_extent_tree *et, + int subtree_index, struct ocfs2_path *path) +{ + int i, idx, ret; + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + u32 range; + + /* + * In normal tree rotation process, we will never touch the + * tree branch above subtree_index and ocfs2_extend_rotate_transaction + * doesn't reserve the credits for them either. + * + * But we do have a special case here which will update the rightmost + * records for all the bh in the path. + * So we have to allocate extra credits and access them. + */ + ret = ocfs2_extend_trans(handle, subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Path should always be rightmost. */ + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + BUG_ON(eb->h_next_leaf_blk != 0ULL); + + el = &eb->h_list; + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + for (i = 0; i < path->p_tree_depth; i++) { + el = path->p_node[i].el; + idx = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[idx]; + + rec->e_int_clusters = cpu_to_le32(range); + le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); + + ocfs2_journal_dirty(handle, path->p_node[i].bh); + } +out: + return ret; +} + +static void ocfs2_unlink_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path *path, int unlink_start) +{ + int ret, i; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh; + + for(i = unlink_start; i < path_num_items(path); i++) { + bh = path->p_node[i].bh; + + eb = (struct ocfs2_extent_block *)bh->b_data; + /* + * Not all nodes might have had their final count + * decremented by the caller - handle this here. 
+ */ + el = &eb->h_list; + if (le16_to_cpu(el->l_next_free_rec) > 1) { + mlog(ML_ERROR, + "Inode %llu, attempted to remove extent block " + "%llu with %u records\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(el->l_next_free_rec)); + + ocfs2_journal_dirty(handle, bh); + ocfs2_remove_from_cache(et->et_ci, bh); + continue; + } + + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + ocfs2_journal_dirty(handle, bh); + + ret = ocfs2_cache_extent_block_free(dealloc, eb); + if (ret) + mlog_errno(ret); + + ocfs2_remove_from_cache(et->et_ci, bh); + } +} + +static void ocfs2_unlink_subtree(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int i; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + + el = path_leaf_el(left_path); + + eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; + + for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + if (root_el->l_recs[i].e_blkno == eb->h_blkno) + break; + + BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); + + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&root_el->l_next_free_rec, -1); + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + eb->h_next_leaf_blk = 0; + + ocfs2_journal_dirty(handle, root_bh); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + + ocfs2_unlink_path(handle, et, dealloc, right_path, + subtree_index + 1); +} + +static int ocfs2_rotate_subtree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int *deleted) +{ + int ret, i, del_right_subtree = 0, right_has_empty = 0; + struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); + struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; + struct ocfs2_extent_block *eb; + + *deleted = 0; + + right_leaf_el = path_leaf_el(right_path); + left_leaf_el = path_leaf_el(left_path); + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) + return 0; + + eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; + if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { + /* + * It's legal for us to proceed if the right leaf is + * the rightmost one and it has an empty extent. There + * are two cases to handle - whether the leaf will be + * empty after removal or not. If the leaf isn't empty + * then just remove the empty extent up front. The + * next block will handle empty leaves by flagging + * them for unlink. + * + * Non rightmost leaves will throw -EAGAIN and the + * caller can manually move the subtree and retry. 
+ */ + + if (eb->h_next_leaf_blk != 0ULL) + return -EAGAIN; + + if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { + ret = ocfs2_journal_access_eb(handle, et->et_ci, + path_leaf_bh(right_path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_empty_extent(right_leaf_el); + } else + right_has_empty = 1; + } + + if (eb->h_next_leaf_blk == 0ULL && + le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { + /* + * We have to update i_last_eb_blk during the meta + * data delete. + */ + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + del_right_subtree = 1; + } + + /* + * Getting here with an empty extent in the right path implies + * that it's the rightmost path and will be deleted. + */ + BUG_ON(right_has_empty && !del_right_subtree); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (!right_has_empty) { + /* + * Only do this if we're moving a real + * record. Otherwise, the action is delayed until + * after removal of the right path in which case we + * can do a simple shift to remove the empty extent. + */ + ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); + memset(&right_leaf_el->l_recs[0], 0, + sizeof(struct ocfs2_extent_rec)); + } + if (eb->h_next_leaf_blk == 0ULL) { + /* + * Move recs over to get rid of empty extent, decrease + * next_free. This is allowed to remove the last + * extent in our leaf (setting l_next_free_rec to + * zero) - the delete code below won't care. + */ + ocfs2_remove_empty_extent(right_leaf_el); + } + + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + + if (del_right_subtree) { + ocfs2_unlink_subtree(handle, et, left_path, right_path, + subtree_index, dealloc); + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); + + /* + * Removal of the extent in the left leaf was skipped + * above so we could delete the right path + * 1st. + */ + if (right_has_empty) + ocfs2_remove_empty_extent(left_leaf_el); + + ocfs2_journal_dirty(handle, et_root_bh); + + *deleted = 1; + } else + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the right of the current one. + * + * Will return zero if the path passed in is already the rightmost path. + * + * This looks similar, but is subtly different to + * ocfs2_find_cpos_for_left_leaf(). + */ +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + *cpos = 0; + + if (path->p_tree_depth == 0) + return 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. 
*/ + i = path->p_tree_depth - 1; + while (i >= 0) { + int next_free; + + el = path->p_node[i].el; + + /* + * Find the extent record just after the one in our + * path. + */ + next_free = le16_to_cpu(el->l_next_free_rec); + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == (next_free - 1)) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the rightmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The rightmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. + */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path) +{ + int ret; + struct buffer_head *bh = path_leaf_bh(path); + struct ocfs2_extent_list *el = path_leaf_el(path); + + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_empty_extent(el); + ocfs2_journal_dirty(handle, bh); + +out: + return ret; +} + +static int __ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + int orig_credits, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_path **empty_extent_path) +{ + int ret, subtree_root, deleted; + u32 right_cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_path *right_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + + *empty_extent_path = NULL; + + ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + left_path = ocfs2_new_path_from_path(path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_cp_path(left_path, path); + + right_path = ocfs2_new_path_from_path(path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (right_cpos) { + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(et, left_path, + right_path); + + trace_ocfs2_rotate_subtree(subtree_root, + (unsigned long long) + right_path->p_node[subtree_root].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + orig_credits, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Caller might still want to make changes to the + * tree root, so re-add it to the journal here. + */ + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_left(handle, et, left_path, + right_path, subtree_root, + dealloc, &deleted); + if (ret == -EAGAIN) { + /* + * The rotation has to temporarily stop due to + * the right subtree having an empty + * extent. Pass it back to the caller for a + * fixup. 
+ */ + *empty_extent_path = right_path; + right_path = NULL; + goto out; + } + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The subtree rotate might have removed records on + * the rightmost edge. If so, then rotation is + * complete. + */ + if (deleted) + break; + + ocfs2_mv_path(left_path, right_path); + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, + &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(right_path); + ocfs2_free_path(left_path); + + return ret; +} + +static int ocfs2_remove_rightmost_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, subtree_index; + u32 cpos; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + + ret = ocfs2_et_sanity_check(et); + if (ret) + goto out; + /* + * There's two ways we handle this depending on + * whether path is the only existing one. + */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos) { + /* + * We have a path to the left of this one - it needs + * an update too. + */ + left_path = ocfs2_new_path_from_path(path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_index = ocfs2_find_subtree_root(et, left_path, path); + + ocfs2_unlink_subtree(handle, et, left_path, path, + subtree_index, dealloc); + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); + } else { + /* + * 'path' is also the leftmost path which + * means it must be the only one. This gets + * handled differently because we want to + * revert the root back to having extents + * in-line. + */ + ocfs2_unlink_path(handle, et, dealloc, path, 1); + + el = et->et_root_el; + el->l_tree_depth = 0; + el->l_next_free_rec = 0; + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + + ocfs2_et_set_last_eb_blk(et, 0); + } + + ocfs2_journal_dirty(handle, path_root_bh(path)); + +out: + ocfs2_free_path(left_path); + return ret; +} + +/* + * Left rotation of btree records. + * + * In many ways, this is (unsurprisingly) the opposite of right + * rotation. We start at some non-rightmost path containing an empty + * extent in the leaf block. The code works its way to the rightmost + * path by rotating records to the left in every subtree. + * + * This is used by any code which reduces the number of extent records + * in a leaf. After removal, an empty record should be placed in the + * leftmost list position. + * + * This won't handle a length update of the rightmost path records if + * the rightmost tree leaf record is removed so the caller is + * responsible for detecting and correcting that. 
+ */ +static int ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, orig_credits = handle->h_buffer_credits; + struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + el = path_leaf_el(path); + if (!ocfs2_is_empty_extent(&el->l_recs[0])) + return 0; + + if (path->p_tree_depth == 0) { +rightmost_no_delete: + /* + * Inline extents. This is trivially handled, so do + * it up front. + */ + ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Handle rightmost branch now. There's several cases: + * 1) simple rotation leaving records in there. That's trivial. + * 2) rotation requiring a branch delete - there's no more + * records left. Two cases of this: + * a) There are branches to the left. + * b) This is also the leftmost (the only) branch. + * + * 1) is handled via ocfs2_rotate_rightmost_leaf_left() + * 2a) we need the left branch so that we can update it with the unlink + * 2b) we need to bring the root back to inline extents. + */ + + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + el = &eb->h_list; + if (eb->h_next_leaf_blk == 0) { + /* + * This gets a bit tricky if we're going to delete the + * rightmost path. Get the other cases out of the way + * 1st. + */ + if (le16_to_cpu(el->l_next_free_rec) > 1) + goto rightmost_no_delete; + + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ret = -EIO; + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent block at %llu", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto out; + } + + /* + * XXX: The caller can not trust "path" any more after + * this as it will have been deleted. What do we do? + * + * In theory the rotate-for-merge code will never get + * here because it'll always ask for a rotate in a + * nonempty list. + */ + + ret = ocfs2_remove_rightmost_path(handle, et, path, + dealloc); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Now we can loop, remembering the path we get from -EAGAIN + * and restarting from there. + */ +try_rotate: + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path, + dealloc, &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + while (ret == -EAGAIN) { + tmp_path = restart_path; + restart_path = NULL; + + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, + tmp_path, dealloc, + &restart_path); + if (ret && ret != -EAGAIN) { + mlog_errno(ret); + goto out; + } + + ocfs2_free_path(tmp_path); + tmp_path = NULL; + + if (ret == 0) + goto try_rotate; + } + +out: + ocfs2_free_path(tmp_path); + ocfs2_free_path(restart_path); + return ret; +} + +static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, + int index) +{ + struct ocfs2_extent_rec *rec = &el->l_recs[index]; + unsigned int size; + + if (rec->e_leaf_clusters == 0) { + /* + * We consumed all of the merged-from record. An empty + * extent cannot exist anywhere but the 1st array + * position, so move things over if the merged-from + * record doesn't occupy that position. + * + * This creates a new empty extent so the caller + * should be smart enough to have removed any existing + * ones. 
+ */ + if (index > 0) { + BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); + size = index * sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[1], &el->l_recs[0], size); + } + + /* + * Always memset - the caller doesn't check whether it + * created an empty extent, so there could be junk in + * the other fields. + */ + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + } +} + +static int ocfs2_get_right_path(struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path **ret_right_path) +{ + int ret; + u32 right_cpos; + struct ocfs2_path *right_path = NULL; + struct ocfs2_extent_list *left_el; + + *ret_right_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(left_path->p_tree_depth == 0); + + left_el = path_leaf_el(left_path); + BUG_ON(left_el->l_next_free_rec != left_el->l_count); + + ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + left_path, &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the rightmost leaf. */ + BUG_ON(right_cpos == 0); + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_right_path = right_path; +out: + if (ret) + ocfs2_free_path(right_path); + return ret; +} + +/* + * Remove split_rec clusters from the record at index and merge them + * onto the beginning of the record "next" to it. + * For index < l_count - 1, the next means the extent rec at index + 1. + * For index == l_count - 1, the "next" means the 1st extent rec of the + * next extent block. + */ +static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, + handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *split_rec, + int index) +{ + int ret, next_free, i; + unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); + struct ocfs2_extent_rec *left_rec; + struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *right_el; + struct ocfs2_path *right_path = NULL; + int subtree_index = 0; + struct ocfs2_extent_list *el = path_leaf_el(left_path); + struct buffer_head *bh = path_leaf_bh(left_path); + struct buffer_head *root_bh = NULL; + + BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); + left_rec = &el->l_recs[index]; + + if (index == le16_to_cpu(el->l_next_free_rec) - 1 && + le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { + /* we meet with a cross extent block merge. 
*/ + ret = ocfs2_get_right_path(et, left_path, &right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_el = path_leaf_el(right_path); + next_free = le16_to_cpu(right_el->l_next_free_rec); + BUG_ON(next_free <= 0); + right_rec = &right_el->l_recs[0]; + if (ocfs2_is_empty_extent(right_rec)) { + BUG_ON(next_free <= 1); + right_rec = &right_el->l_recs[1]; + } + + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(right_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + } else { + BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); + right_rec = &el->l_recs[index + 1]; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, + path_num_items(left_path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); + + le32_add_cpu(&right_rec->e_cpos, -split_clusters); + le64_add_cpu(&right_rec->e_blkno, + -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); + le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); + + ocfs2_cleanup_merge(el, index); + + ocfs2_journal_dirty(handle, bh); + if (right_path) { + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + } +out: + if (right_path) + ocfs2_free_path(right_path); + return ret; +} + +static int ocfs2_get_left_path(struct ocfs2_extent_tree *et, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret; + u32 left_cpos; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(right_path->p_tree_depth == 0); + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the leftmost leaf. */ + BUG_ON(left_cpos == 0); + + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_left_path = left_path; +out: + if (ret) + ocfs2_free_path(left_path); + return ret; +} + +/* + * Remove split_rec clusters from the record at index and merge them + * onto the tail of the record "before" it. + * For index > 0, the "before" means the extent rec at index - 1. + * + * For index == 0, the "before" means the last record of the previous + * extent block. And there is also a situation that we may need to + * remove the rightmost leaf extent block in the right_path and change + * the right path to indicate the new rightmost path. 
+ */ +static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, + handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int index) +{ + int ret, i, subtree_index = 0, has_empty_extent = 0; + unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); + struct ocfs2_extent_rec *left_rec; + struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *el = path_leaf_el(right_path); + struct buffer_head *bh = path_leaf_bh(right_path); + struct buffer_head *root_bh = NULL; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *left_el; + + BUG_ON(index < 0); + + right_rec = &el->l_recs[index]; + if (index == 0) { + /* we meet with a cross extent block merge. */ + ret = ocfs2_get_left_path(et, right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + left_el = path_leaf_el(left_path); + BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != + le16_to_cpu(left_el->l_count)); + + left_rec = &left_el->l_recs[ + le16_to_cpu(left_el->l_next_free_rec) - 1]; + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(split_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } else { + left_rec = &el->l_recs[index - 1]; + if (ocfs2_is_empty_extent(&el->l_recs[0])) + has_empty_extent = 1; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + path_num_items(right_path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (has_empty_extent && index == 1) { + /* + * The easy case - we can just plop the record right in. + */ + *left_rec = *split_rec; + + has_empty_extent = 0; + } else + le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); + + le32_add_cpu(&right_rec->e_cpos, split_clusters); + le64_add_cpu(&right_rec->e_blkno, + ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); + le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); + + ocfs2_cleanup_merge(el, index); + + ocfs2_journal_dirty(handle, bh); + if (left_path) { + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + + /* + * In the situation that the right_rec is empty and the extent + * block is empty also, ocfs2_complete_edge_insert can't handle + * it and we need to delete the right extent block. + */ + if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && + le16_to_cpu(el->l_next_free_rec) == 1) { + + ret = ocfs2_remove_rightmost_path(handle, et, + right_path, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Now the rightmost extent block has been deleted. + * So we use the new rightmost path. 
+ */ + ocfs2_mv_path(right_path, left_path); + left_path = NULL; + } else + ocfs2_complete_edge_insert(handle, left_path, + right_path, subtree_index); + } +out: + if (left_path) + ocfs2_free_path(left_path); + return ret; +} + +static int ocfs2_try_to_merge_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_merge_ctxt *ctxt) +{ + int ret = 0; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; + + BUG_ON(ctxt->c_contig_type == CONTIG_NONE); + + if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { + /* + * The merge code will need to create an empty + * extent to take the place of the newly + * emptied slot. Remove any pre-existing empty + * extents - having more than one in a leaf is + * illegal. + */ + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + split_index--; + rec = &el->l_recs[split_index]; + } + + if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { + /* + * Left-right contig implies this. + */ + BUG_ON(!ctxt->c_split_covers_rec); + + /* + * Since the leftright insert always covers the entire + * extent, this call will delete the insert record + * entirely, resulting in an empty extent record added to + * the extent block. + * + * Since the adding of an empty extent shifts + * everything back to the right, there's no need to + * update split_index here. + * + * When the split_index is zero, we need to merge it to the + * prevoius extent block. It is more efficient and easier + * if we do merge_right first and merge_left later. + */ + ret = ocfs2_merge_rec_right(path, handle, et, split_rec, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We can only get this from logic error above. + */ + BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + + /* The merge left us with an empty extent, remove it. */ + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + rec = &el->l_recs[split_index]; + + /* + * Note that we don't pass split_rec here on purpose - + * we've merged it into the rec already. + */ + ret = ocfs2_merge_rec_left(path, handle, et, rec, + dealloc, split_index); + + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + /* + * Error from this last rotate is not critical, so + * print but don't bubble it up. + */ + if (ret) + mlog_errno(ret); + ret = 0; + } else { + /* + * Merge a record to the left or right. + * + * 'contig_type' is relative to the existing record, + * so for example, if we're "right contig", it's to + * the record on the left (hence the left merge). + */ + if (ctxt->c_contig_type == CONTIG_RIGHT) { + ret = ocfs2_merge_rec_left(path, handle, et, + split_rec, dealloc, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + ret = ocfs2_merge_rec_right(path, handle, + et, split_rec, + split_index); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (ctxt->c_split_covers_rec) { + /* + * The merge may have left an empty extent in + * our leaf. Try to rotate it away. 
+ */ + ret = ocfs2_rotate_tree_left(handle, et, path, + dealloc); + if (ret) + mlog_errno(ret); + ret = 0; + } + } + +out: + return ret; +} + +static void ocfs2_subtract_from_rec(struct super_block *sb, + enum ocfs2_split_type split, + struct ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *split_rec) +{ + u64 len_blocks; + + len_blocks = ocfs2_clusters_to_blocks(sb, + le16_to_cpu(split_rec->e_leaf_clusters)); + + if (split == SPLIT_LEFT) { + /* + * Region is on the left edge of the existing + * record. + */ + le32_add_cpu(&rec->e_cpos, + le16_to_cpu(split_rec->e_leaf_clusters)); + le64_add_cpu(&rec->e_blkno, len_blocks); + le16_add_cpu(&rec->e_leaf_clusters, + -le16_to_cpu(split_rec->e_leaf_clusters)); + } else { + /* + * Region is on the right edge of the existing + * record. + */ + le16_add_cpu(&rec->e_leaf_clusters, + -le16_to_cpu(split_rec->e_leaf_clusters)); + } +} + +/* + * Do the final bits of extent record insertion at the target leaf + * list. If this leaf is part of an allocation tree, it is assumed + * that the tree above has been prepared. + */ +static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_list *el, + struct ocfs2_insert_type *insert) +{ + int i = insert->ins_contig_index; + unsigned int range; + struct ocfs2_extent_rec *rec; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (insert->ins_split != SPLIT_NONE) { + i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); + BUG_ON(i == -1); + rec = &el->l_recs[i]; + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + insert->ins_split, rec, + insert_rec); + goto rotate; + } + + /* + * Contiguous insert - either left or right. + */ + if (insert->ins_contig != CONTIG_NONE) { + rec = &el->l_recs[i]; + if (insert->ins_contig == CONTIG_LEFT) { + rec->e_blkno = insert_rec->e_blkno; + rec->e_cpos = insert_rec->e_cpos; + } + le16_add_cpu(&rec->e_leaf_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + return; + } + + /* + * Handle insert into an empty leaf. + */ + if (le16_to_cpu(el->l_next_free_rec) == 0 || + ((le16_to_cpu(el->l_next_free_rec) == 1) && + ocfs2_is_empty_extent(&el->l_recs[0]))) { + el->l_recs[0] = *insert_rec; + el->l_next_free_rec = cpu_to_le16(1); + return; + } + + /* + * Appending insert. + */ + if (insert->ins_appending == APPEND_TAIL) { + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + range = le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters); + BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); + + mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count), + "owner %llu, depth %u, count %u, next free %u, " + "rec.cpos %u, rec.clusters %u, " + "insert.cpos %u, insert.clusters %u\n", + ocfs2_metadata_cache_owner(et->et_ci), + le16_to_cpu(el->l_tree_depth), + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_next_free_rec), + le32_to_cpu(el->l_recs[i].e_cpos), + le16_to_cpu(el->l_recs[i].e_leaf_clusters), + le32_to_cpu(insert_rec->e_cpos), + le16_to_cpu(insert_rec->e_leaf_clusters)); + i++; + el->l_recs[i] = *insert_rec; + le16_add_cpu(&el->l_next_free_rec, 1); + return; + } + +rotate: + /* + * Ok, we have to rotate. + * + * At this point, it is safe to assume that inserting into an + * empty leaf and appending to a leaf have both been handled + * above. + * + * This leaf needs to have space, either by the empty 1st + * extent record, or by virtue of an l_next_rec < l_count. 
+ */ + ocfs2_rotate_leaf(el, insert_rec); +} + +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec) +{ + int ret, i, next_free; + struct buffer_head *bh; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + /* + * Update everything except the leaf block. + */ + for (i = 0; i < path->p_tree_depth; i++) { + bh = path->p_node[i].bh; + el = path->p_node[i].el; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has a bad extent list", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + ret = -EIO; + return; + } + + rec = &el->l_recs[next_free - 1]; + + rec->e_int_clusters = insert_rec->e_cpos; + le32_add_cpu(&rec->e_int_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + le32_add_cpu(&rec->e_int_clusters, + -le32_to_cpu(rec->e_cpos)); + + ocfs2_journal_dirty(handle, bh); + } +} + +static int ocfs2_append_rec_to_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, next_free; + struct ocfs2_extent_list *el; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* + * This shouldn't happen for non-trees. The extent rec cluster + * count manipulation below only works for interior nodes. + */ + BUG_ON(right_path->p_tree_depth == 0); + + /* + * If our appending insert is at the leftmost edge of a leaf, + * then we might need to update the rightmost records of the + * neighboring path. + */ + el = path_leaf_el(right_path); + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { + u32 left_cpos; + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_append_rec_to_path( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + le32_to_cpu(insert_rec->e_cpos), + left_cpos); + + /* + * No need to worry if the append is already in the + * leftmost leaf. + */ + if (left_cpos) { + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_insert_path() will pass the left_path to the + * journal for us. 
+ */ + } + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec); + + *ret_left_path = left_path; + ret = 0; +out: + if (ret != 0) + ocfs2_free_path(left_path); + + return ret; +} + +static void ocfs2_split_record(struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *split_rec, + enum ocfs2_split_type split) +{ + int index; + u32 cpos = le32_to_cpu(split_rec->e_cpos); + struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; + struct ocfs2_extent_rec *rec, *tmprec; + + right_el = path_leaf_el(right_path); + if (left_path) + left_el = path_leaf_el(left_path); + + el = right_el; + insert_el = right_el; + index = ocfs2_search_extent_list(el, cpos); + if (index != -1) { + if (index == 0 && left_path) { + BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); + + /* + * This typically means that the record + * started in the left path but moved to the + * right as a result of rotation. We either + * move the existing record to the left, or we + * do the later insert there. + * + * In this case, the left path should always + * exist as the rotate code will have passed + * it back for a post-insert update. + */ + + if (split == SPLIT_LEFT) { + /* + * It's a left split. Since we know + * that the rotate code gave us an + * empty extent in the left path, we + * can just do the insert there. + */ + insert_el = left_el; + } else { + /* + * Right split - we have to move the + * existing record over to the left + * leaf. The insert will be into the + * newly created empty extent in the + * right leaf. + */ + tmprec = &right_el->l_recs[index]; + ocfs2_rotate_leaf(left_el, tmprec); + el = left_el; + + memset(tmprec, 0, sizeof(*tmprec)); + index = ocfs2_search_extent_list(left_el, cpos); + BUG_ON(index == -1); + } + } + } else { + BUG_ON(!left_path); + BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0])); + /* + * Left path is easy - we can just allow the insert to + * happen. + */ + el = left_el; + insert_el = left_el; + index = ocfs2_search_extent_list(el, cpos); + BUG_ON(index == -1); + } + + rec = &el->l_recs[index]; + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + split, rec, split_rec); + ocfs2_rotate_leaf(insert_el, split_rec); +} + +/* + * This function only does inserts on an allocation b-tree. For tree + * depth = 0, ocfs2_insert_at_leaf() is called directly. + * + * right_path is the path we want to do the actual insert + * in. left_path should only be passed in if we need to update that + * portion of the tree after an edge insert. + */ +static int ocfs2_insert_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *insert) +{ + int ret, subtree_index; + struct buffer_head *leaf_bh = path_leaf_bh(right_path); + + if (left_path) { + /* + * There's a chance that left_path got passed back to + * us without being accounted for in the + * journal. Extend our transaction here to be sure we + * can change those blocks. + */ + ret = ocfs2_extend_trans(handle, left_path->p_tree_depth); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + /* + * Pass both paths to the journal. 
The majority of inserts
+	 * will be touching all components anyway.
+	 */
+	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (insert->ins_split != SPLIT_NONE) {
+		/*
+		 * We could call ocfs2_insert_at_leaf() for some types
+		 * of splits, but it's easier to just let one separate
+		 * function sort it all out.
+		 */
+		ocfs2_split_record(et, left_path, right_path,
+				   insert_rec, insert->ins_split);
+
+		/*
+		 * Split might have modified either leaf and we don't
+		 * have a guarantee that the later edge insert will
+		 * dirty this for us.
+		 */
+		if (left_path)
+			ocfs2_journal_dirty(handle,
+					    path_leaf_bh(left_path));
+	} else
+		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
+				     insert);
+
+	ocfs2_journal_dirty(handle, leaf_bh);
+
+	if (left_path) {
+		/*
+		 * The rotate code has indicated that we need to fix
+		 * up portions of the tree after the insert.
+		 *
+		 * XXX: Should we extend the transaction here?
+		 */
+		subtree_index = ocfs2_find_subtree_root(et, left_path,
+							right_path);
+		ocfs2_complete_edge_insert(handle, left_path, right_path,
+					   subtree_index);
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int ocfs2_do_insert_extent(handle_t *handle,
+				  struct ocfs2_extent_tree *et,
+				  struct ocfs2_extent_rec *insert_rec,
+				  struct ocfs2_insert_type *type)
+{
+	int ret, rotate = 0;
+	u32 cpos;
+	struct ocfs2_path *right_path = NULL;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_list *el;
+
+	el = et->et_root_el;
+
+	ret = ocfs2_et_root_journal_access(handle, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (le16_to_cpu(el->l_tree_depth) == 0) {
+		ocfs2_insert_at_leaf(et, insert_rec, el, type);
+		goto out_update_clusters;
+	}
+
+	right_path = ocfs2_new_path_from_et(et);
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Determine the path to start with. Rotations need the
+	 * rightmost path, everything else can go directly to the
+	 * target leaf.
+	 */
+	cpos = le32_to_cpu(insert_rec->e_cpos);
+	if (type->ins_appending == APPEND_NONE &&
+	    type->ins_contig == CONTIG_NONE) {
+		rotate = 1;
+		cpos = UINT_MAX;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, right_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Rotations and appends need special treatment - they modify
+	 * parts of the tree above them.
+	 *
+	 * Both might pass back a path immediately to the left of the
+	 * one being inserted to. This will cause
+	 * ocfs2_insert_path() to modify the rightmost records of
+	 * left_path to account for an edge insert.
+	 *
+	 * XXX: When modifying this code, keep in mind that an insert
+	 * can wind up skipping both of these two special cases...
+	 */
+	if (rotate) {
+		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
+					      le32_to_cpu(insert_rec->e_cpos),
+					      right_path, &left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * ocfs2_rotate_tree_right() might have extended the
+		 * transaction without re-journaling our tree root.
+ */ + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (type->ins_appending == APPEND_TAIL + && type->ins_contig != CONTIG_LEFT) { + ret = ocfs2_append_rec_to_path(handle, et, insert_rec, + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_insert_path(handle, et, left_path, right_path, + insert_rec, type); + if (ret) { + mlog_errno(ret); + goto out; + } + +out_update_clusters: + if (type->ins_split == SPLIT_NONE) + ocfs2_et_update_clusters(et, + le16_to_cpu(insert_rec->e_leaf_clusters)); + + ocfs2_journal_dirty(handle, et->et_root_bh); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + + return ret; +} + +static enum ocfs2_contig_type +ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, int index, + struct ocfs2_extent_rec *split_rec) +{ + int status; + enum ocfs2_contig_type ret = CONTIG_NONE; + u32 left_cpos, right_cpos; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_list *new_el; + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct buffer_head *bh; + struct ocfs2_extent_block *eb; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + if (index > 0) { + rec = &el->l_recs[index - 1]; + } else if (path->p_tree_depth > 0) { + status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); + if (status) + goto out; + + if (left_cpos != 0) { + left_path = ocfs2_new_path_from_path(path); + if (!left_path) + goto out; + + status = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (status) + goto out; + + new_el = path_leaf_el(left_path); + + if (le16_to_cpu(new_el->l_next_free_rec) != + le16_to_cpu(new_el->l_count)) { + bh = path_leaf_bh(left_path); + eb = (struct ocfs2_extent_block *)bh->b_data; + ocfs2_error(sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of " + "%d. It should have " + "matched the l_count of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec), + le16_to_cpu(new_el->l_count)); + status = -EINVAL; + goto out; + } + rec = &new_el->l_recs[ + le16_to_cpu(new_el->l_next_free_rec) - 1]; + } + } + + /* + * We're careful to check for an empty extent record here - + * the merge code will know what to do if it sees one. 
+	 */
+	if (rec) {
+		if (index == 1 && ocfs2_is_empty_extent(rec)) {
+			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
+				ret = CONTIG_RIGHT;
+		} else {
+			ret = ocfs2_et_extent_contig(et, rec, split_rec);
+		}
+	}
+
+	rec = NULL;
+	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
+		rec = &el->l_recs[index + 1];
+	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
+		 path->p_tree_depth > 0) {
+		status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
+		if (status)
+			goto out;
+
+		if (right_cpos == 0)
+			goto out;
+
+		right_path = ocfs2_new_path_from_path(path);
+		if (!right_path)
+			goto out;
+
+		status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
+		if (status)
+			goto out;
+
+		new_el = path_leaf_el(right_path);
+		rec = &new_el->l_recs[0];
+		if (ocfs2_is_empty_extent(rec)) {
+			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
+				bh = path_leaf_bh(right_path);
+				eb = (struct ocfs2_extent_block *)bh->b_data;
+				ocfs2_error(sb,
+					    "Extent block #%llu has an "
+					    "invalid l_next_free_rec of %d",
+					    (unsigned long long)le64_to_cpu(eb->h_blkno),
+					    le16_to_cpu(new_el->l_next_free_rec));
+				status = -EINVAL;
+				goto out;
+			}
+			rec = &new_el->l_recs[1];
+		}
+	}
+
+	if (rec) {
+		enum ocfs2_contig_type contig_type;
+
+		contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
+
+		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
+			ret = CONTIG_LEFTRIGHT;
+		else if (ret == CONTIG_NONE)
+			ret = contig_type;
+	}
+
+out:
+	if (left_path)
+		ocfs2_free_path(left_path);
+	if (right_path)
+		ocfs2_free_path(right_path);
+
+	return ret;
+}
+
+static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
+				     struct ocfs2_insert_type *insert,
+				     struct ocfs2_extent_list *el,
+				     struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	enum ocfs2_contig_type contig_type = CONTIG_NONE;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
+						     insert_rec);
+		if (contig_type != CONTIG_NONE) {
+			insert->ins_contig_index = i;
+			break;
+		}
+	}
+	insert->ins_contig = contig_type;
+
+	if (insert->ins_contig != CONTIG_NONE) {
+		struct ocfs2_extent_rec *rec =
+				&el->l_recs[insert->ins_contig_index];
+		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
+				   le16_to_cpu(insert_rec->e_leaf_clusters);
+
+		/*
+		 * Caller might want us to limit the size of extents, don't
+		 * calculate contiguousness if we might exceed that limit.
+		 */
+		if (et->et_max_leaf_clusters &&
+		    (len > et->et_max_leaf_clusters))
+			insert->ins_contig = CONTIG_NONE;
+	}
+}
+
+/*
+ * This should only be called against the rightmost leaf extent list.
+ *
+ * ocfs2_figure_appending_type() will figure out whether we'll have to
+ * insert at the tail of the rightmost leaf.
+ *
+ * This should also work against the root extent list for trees with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
+ * then the logic here makes sense.
+ */
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+					struct ocfs2_extent_list *el,
+					struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
+	struct ocfs2_extent_rec *rec;
+
+	insert->ins_appending = APPEND_NONE;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	if (!el->l_next_free_rec)
+		goto set_tail_append;
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+		/* Were all records empty?
*/ + if (le16_to_cpu(el->l_next_free_rec) == 1) + goto set_tail_append; + } + + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + + if (cpos >= + (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) + goto set_tail_append; + + return; + +set_tail_append: + insert->ins_appending = APPEND_TAIL; +} + +/* + * Helper function called at the beginning of an insert. + * + * This computes a few things that are commonly used in the process of + * inserting into the btree: + * - Whether the new extent is contiguous with an existing one. + * - The current tree depth. + * - Whether the insert is an appending one. + * - The total # of free records in the tree. + * + * All of the information is stored on the ocfs2_insert_type + * structure. + */ +static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et, + struct buffer_head **last_eb_bh, + struct ocfs2_extent_rec *insert_rec, + int *free_records, + struct ocfs2_insert_type *insert) +{ + int ret; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct ocfs2_path *path = NULL; + struct buffer_head *bh = NULL; + + insert->ins_split = SPLIT_NONE; + + el = et->et_root_el; + insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); + + if (el->l_tree_depth) { + /* + * If we have tree depth, we read in the + * rightmost extent block ahead of time as + * ocfs2_figure_insert_type() and ocfs2_add_branch() + * may want it later. + */ + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + } + + /* + * Unless we have a contiguous insert, we'll need to know if + * there is room left in our allocation tree for another + * extent record. + * + * XXX: This test is simplistic, we can search for empty + * extent records too. + */ + *free_records = le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec); + + if (!insert->ins_tree_depth) { + ocfs2_figure_contig_type(et, insert, el, insert_rec); + ocfs2_figure_appending_type(insert, el, insert_rec); + return 0; + } + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * In the case that we're inserting past what the tree + * currently accounts for, ocfs2_find_path() will return for + * us the rightmost tree path. This is accounted for below in + * the appending code. + */ + ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos)); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + /* + * Now that we have the path, there's two things we want to determine: + * 1) Contiguousness (also set contig_index if this is so) + * + * 2) Are we doing an append? We can trivially break this up + * into two types of appends: simple record append, or a + * rotate inside the tail leaf. + */ + ocfs2_figure_contig_type(et, insert, el, insert_rec); + + /* + * The insert code isn't quite ready to deal with all cases of + * left contiguousness. Specifically, if it's an insert into + * the 1st record in a leaf, it will require the adjustment of + * cluster count on the last record of the path directly to it's + * left. For now, just catch that case and fool the layers + * above us. This works just fine for tree_depth == 0, which + * is why we allow that above. 
+	 */
+	if (insert->ins_contig == CONTIG_LEFT &&
+	    insert->ins_contig_index == 0)
+		insert->ins_contig = CONTIG_NONE;
+
+	/*
+	 * Ok, so we can simply compare against last_eb to figure out
+	 * whether the path doesn't exist. This will only happen in
+	 * the case that we're doing a tail append, so maybe we can
+	 * take advantage of that information somehow.
+	 */
+	if (ocfs2_et_get_last_eb_blk(et) ==
+	    path_leaf_bh(path)->b_blocknr) {
+		/*
+		 * Ok, ocfs2_find_path() returned us the rightmost
+		 * tree path. This might be an appending insert. There are
+		 * two cases:
+		 *    1) We're doing a true append at the tail:
+		 *	-This might even be off the end of the leaf
+		 *    2) We're "appending" by rotating in the tail
+		 */
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+	}
+
+out:
+	ocfs2_free_path(path);
+
+	if (ret == 0)
+		*last_eb_bh = bh;
+	else
+		brelse(bh);
+	return ret;
+}
+
+/*
+ * Insert an extent into a btree.
+ *
+ * The caller needs to update the owning btree's cluster count.
+ */
+int ocfs2_insert_extent(handle_t *handle,
+			struct ocfs2_extent_tree *et,
+			u32 cpos,
+			u64 start_blk,
+			u32 new_clusters,
+			u8 flags,
+			struct ocfs2_alloc_context *meta_ac)
+{
+	int status;
+	int uninitialized_var(free_records);
+	struct buffer_head *last_eb_bh = NULL;
+	struct ocfs2_insert_type insert = {0, };
+	struct ocfs2_extent_rec rec;
+
+	trace_ocfs2_insert_extent_start(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, new_clusters);
+
+	memset(&rec, 0, sizeof(rec));
+	rec.e_cpos = cpu_to_le32(cpos);
+	rec.e_blkno = cpu_to_le64(start_blk);
+	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+	rec.e_flags = flags;
+	status = ocfs2_et_insert_check(et, &rec);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
+					  &free_records, &insert);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
+				  insert.ins_contig_index, free_records,
+				  insert.ins_tree_depth);
+
+	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
+		status = ocfs2_grow_tree(handle, et,
+					 &insert.ins_tree_depth, &last_eb_bh,
+					 meta_ac);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Finally, we can add clusters. This might rotate the tree for us. */
+	status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
+	if (status < 0)
+		mlog_errno(status);
+	else
+		ocfs2_et_extent_map_insert(et, &rec);
+
+bail:
+	brelse(last_eb_bh);
+
+	return status;
+}
+
+/*
+ * Allocate and add clusters into the extent b-tree.
+ * The new clusters (clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is specified by et, and
+ * it is not limited to file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ */ +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0, err = 0; + int free_extents; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + u8 flags = 0; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + + BUG_ON(!clusters_to_add); + + if (mark_unwritten) + flags = OCFS2_EXT_UNWRITTEN; + + free_extents = ocfs2_num_free_extents(osb, et); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + err = -1; + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(et->et_root_el))) { + err = -2; + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = __ocfs2_claim_clusters(handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + /* reserve our write early -- insert_extent may update the tree root */ + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_add_clusters_in_btree( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + bit_off, num_bits); + status = ocfs2_insert_extent(handle, et, *logical_offset, block, + num_bits, flags, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_journal_dirty(handle, et->et_root_bh); + + clusters_to_add -= num_bits; + *logical_offset += num_bits; + + if (clusters_to_add) { + err = clusters_to_add; + status = -EAGAIN; + reason = RESTART_TRANS; + } + +leave: + if (reason_ret) + *reason_ret = reason; + trace_ocfs2_add_clusters_in_btree_ret(status, reason, err); + return status; +} + +static void ocfs2_make_right_split_rec(struct super_block *sb, + struct ocfs2_extent_rec *split_rec, + u32 cpos, + struct ocfs2_extent_rec *rec) +{ + u32 rec_cpos = le32_to_cpu(rec->e_cpos); + u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); + + memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); + + split_rec->e_cpos = cpu_to_le32(cpos); + split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); + + split_rec->e_blkno = rec->e_blkno; + le64_add_cpu(&split_rec->e_blkno, + ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); + + split_rec->e_flags = rec->e_flags; +} + +static int ocfs2_split_and_insert(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct buffer_head **last_eb_bh, + int split_index, + struct ocfs2_extent_rec *orig_split_rec, + struct ocfs2_alloc_context *meta_ac) +{ + int ret = 0, depth; + unsigned int insert_range, rec_range, do_leftright = 0; + struct ocfs2_extent_rec tmprec; + struct ocfs2_extent_list *rightmost_el; + struct ocfs2_extent_rec rec; + struct ocfs2_extent_rec split_rec = *orig_split_rec; + struct ocfs2_insert_type insert; + struct ocfs2_extent_block *eb; + +leftright: + /* + * 
Store a copy of the record on the stack - it might move + * around as the tree is manipulated below. + */ + rec = path_leaf_el(path)->l_recs[split_index]; + + rightmost_el = et->et_root_el; + + depth = le16_to_cpu(rightmost_el->l_tree_depth); + if (depth) { + BUG_ON(!(*last_eb_bh)); + eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; + rightmost_el = &eb->h_list; + } + + if (le16_to_cpu(rightmost_el->l_next_free_rec) == + le16_to_cpu(rightmost_el->l_count)) { + ret = ocfs2_grow_tree(handle, et, + &depth, last_eb_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + memset(&insert, 0, sizeof(struct ocfs2_insert_type)); + insert.ins_appending = APPEND_NONE; + insert.ins_contig = CONTIG_NONE; + insert.ins_tree_depth = depth; + + insert_range = le32_to_cpu(split_rec.e_cpos) + + le16_to_cpu(split_rec.e_leaf_clusters); + rec_range = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + + if (split_rec.e_cpos == rec.e_cpos) { + insert.ins_split = SPLIT_LEFT; + } else if (insert_range == rec_range) { + insert.ins_split = SPLIT_RIGHT; + } else { + /* + * Left/right split. We fake this as a right split + * first and then make a second pass as a left split. + */ + insert.ins_split = SPLIT_RIGHT; + + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &tmprec, insert_range, &rec); + + split_rec = tmprec; + + BUG_ON(do_leftright); + do_leftright = 1; + } + + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (do_leftright == 1) { + u32 cpos; + struct ocfs2_extent_list *el; + + do_leftright++; + split_rec = *orig_split_rec; + + ocfs2_reinit_path(path, 1); + + cpos = le32_to_cpu(split_rec.e_cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + split_index = ocfs2_search_extent_list(el, cpos); + goto leftright; + } +out: + + return ret; +} + +static int ocfs2_replace_extent_rec(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, + int split_index, + struct ocfs2_extent_rec *split_rec) +{ + int ret; + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + el->l_recs[split_index] = *split_rec; + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); +out: + return ret; +} + +/* + * Split part or all of the extent record at split_index in the leaf + * pointed to by path. Merge with the contiguous extent record if needed. + * + * Care is taken to handle contiguousness so as to not grow the tree. + * + * meta_ac is not strictly necessary - we only truly need it if growth + * of the tree is required. All other cases will degrade into a less + * optimal tree layout. + * + * last_eb_bh should be the rightmost leaf block for any extent + * btree. Since a split may grow the tree or a merge might shrink it, + * the caller cannot trust the contents of that buffer after this call. + * + * This code is optimized for readability - several passes might be + * made over certain portions of the tree. All of those blocks will + * have been brought into cache (and pinned via the journal), so the + * extra overhead is not expressed in terms of disk reads. 
+ */ +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct buffer_head *last_eb_bh = NULL; + struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; + struct ocfs2_merge_ctxt ctxt; + struct ocfs2_extent_list *rightmost_el; + + if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || + ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < + (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, + split_index, + split_rec); + + /* + * The core merge / split code wants to know how much room is + * left in this allocation tree, so we pass the + * rightmost extent list. + */ + if (path->p_tree_depth) { + struct ocfs2_extent_block *eb; + + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + rightmost_el = &eb->h_list; + } else + rightmost_el = path_root_el(path); + + if (rec->e_cpos == split_rec->e_cpos && + rec->e_leaf_clusters == split_rec->e_leaf_clusters) + ctxt.c_split_covers_rec = 1; + else + ctxt.c_split_covers_rec = 0; + + ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); + + trace_ocfs2_split_extent(split_index, ctxt.c_contig_type, + ctxt.c_has_empty_extent, + ctxt.c_split_covers_rec); + + if (ctxt.c_contig_type == CONTIG_NONE) { + if (ctxt.c_split_covers_rec) + ret = ocfs2_replace_extent_rec(handle, et, path, el, + split_index, split_rec); + else + ret = ocfs2_split_and_insert(handle, et, path, + &last_eb_bh, split_index, + split_rec, meta_ac); + if (ret) + mlog_errno(ret); + } else { + ret = ocfs2_try_to_merge_extent(handle, et, path, + split_index, split_rec, + dealloc, &ctxt); + if (ret) + mlog_errno(ret); + } + +out: + brelse(last_eb_bh); + return ret; +} + +/* + * Change the flags of the already-existing extent at cpos for len clusters. + * + * new_flags: the flags we want to set. + * clear_flags: the flags we want to clear. + * phys: the new physical offset we want this new extent starts from. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. 
+ */ +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags) +{ + int ret, index; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys); + struct ocfs2_extent_rec split_rec; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + left_path = ocfs2_new_path_from_et(et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + el = path_leaf_el(left_path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(sb, + "Owner %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), cpos); + ret = -EROFS; + goto out; + } + + ret = -EIO; + rec = &el->l_recs[index]; + if (new_flags && (rec->e_flags & new_flags)) { + mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " + "extent that already had them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + new_flags); + goto out; + } + + if (clear_flags && !(rec->e_flags & clear_flags)) { + mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " + "extent that didn't have them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + clear_flags); + goto out; + } + + memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); + split_rec.e_cpos = cpu_to_le32(cpos); + split_rec.e_leaf_clusters = cpu_to_le16(len); + split_rec.e_blkno = cpu_to_le64(start_blkno); + split_rec.e_flags = rec->e_flags; + if (new_flags) + split_rec.e_flags |= new_flags; + if (clear_flags) + split_rec.e_flags &= ~clear_flags; + + ret = ocfs2_split_extent(handle, et, left_path, + index, &split_rec, meta_ac, + dealloc); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(left_path); + return ret; + +} + +/* + * Mark the already-existing extent at cpos as written for len clusters. + * This removes the unwritten extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + trace_ocfs2_mark_extent_written( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + cpos, len, phys); + + if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " + "that are being written to, but the feature bit " + "is not set in the super block.", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EROFS; + goto out; + } + + /* + * XXX: This should be fixed up so that we just re-insert the + * next extent records. 
+ */ + ocfs2_et_extent_map_truncate(et, 0); + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + 0, OCFS2_EXT_UNWRITTEN); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int index, u32 new_range, + struct ocfs2_alloc_context *meta_ac) +{ + int ret, depth, credits; + struct buffer_head *last_eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *rightmost_el, *el; + struct ocfs2_extent_rec split_rec; + struct ocfs2_extent_rec *rec; + struct ocfs2_insert_type insert; + + /* + * Setup the record to split before we grow the tree. + */ + el = path_leaf_el(path); + rec = &el->l_recs[index]; + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &split_rec, new_range, rec); + + depth = path->p_tree_depth; + if (depth > 0) { + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + rightmost_el = &eb->h_list; + } else + rightmost_el = path_leaf_el(path); + + credits = path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le16_to_cpu(rightmost_el->l_next_free_rec) == + le16_to_cpu(rightmost_el->l_count)) { + ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + memset(&insert, 0, sizeof(struct ocfs2_insert_type)); + insert.ins_appending = APPEND_NONE; + insert.ins_contig = CONTIG_NONE; + insert.ins_split = SPLIT_RIGHT; + insert.ins_tree_depth = depth; + + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); + if (ret) + mlog_errno(ret); + +out: + brelse(last_eb_bh); + return ret; +} + +static int ocfs2_truncate_rec(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, int index, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u32 cpos, u32 len) +{ + int ret; + u32 left_cpos, rec_range, trunc_range; + int wants_rotate = 0, is_rightmost_tree_rec = 0; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_block *eb; + + if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + index--; + } + + if (index == (le16_to_cpu(el->l_next_free_rec) - 1) && + path->p_tree_depth) { + /* + * Check whether this is the rightmost tree record. If + * we remove all of this record or part of its right + * edge then an update of the record lengths above it + * will be required. + */ + eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; + if (eb->h_next_leaf_blk == 0) + is_rightmost_tree_rec = 1; + } + + rec = &el->l_recs[index]; + if (index == 0 && path->p_tree_depth && + le32_to_cpu(rec->e_cpos) == cpos) { + /* + * Changing the leftmost offset (via partial or whole + * record truncate) of an interior (or rightmost) path + * means we have to update the subtree that is formed + * by this leaf and the one to it's left. + * + * There are two cases we can skip: + * 1) Path is the leftmost one in our btree. 
+ * 2) The leaf is rightmost and will be empty after + * we remove the extent record - the rotate code + * knows how to update the newly formed edge. + */ + + ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { + left_path = ocfs2_new_path_from_path(path); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } + + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + trunc_range = cpos + len; + + if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) { + int next_free; + + memset(rec, 0, sizeof(*rec)); + ocfs2_cleanup_merge(el, index); + wants_rotate = 1; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (is_rightmost_tree_rec && next_free > 1) { + /* + * We skip the edge update if this path will + * be deleted by the rotate code. + */ + rec = &el->l_recs[next_free - 1]; + ocfs2_adjust_rightmost_records(handle, et, path, + rec); + } + } else if (le32_to_cpu(rec->e_cpos) == cpos) { + /* Remove leftmost portion of the record. */ + le32_add_cpu(&rec->e_cpos, len); + le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); + le16_add_cpu(&rec->e_leaf_clusters, -len); + } else if (rec_range == trunc_range) { + /* Remove rightmost portion of the record */ + le16_add_cpu(&rec->e_leaf_clusters, -len); + if (is_rightmost_tree_rec) + ocfs2_adjust_rightmost_records(handle, et, path, rec); + } else { + /* Caller should have trapped this. */ + mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) " + "(%u, %u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + le32_to_cpu(rec->e_cpos), + le16_to_cpu(rec->e_leaf_clusters), cpos, len); + BUG(); + } + + if (left_path) { + int subtree_index; + + subtree_index = ocfs2_find_subtree_root(et, left_path, path); + ocfs2_complete_edge_insert(handle, left_path, path, + subtree_index); + } + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); + + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + ocfs2_free_path(left_path); + return ret; +} + +int ocfs2_remove_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + u32 rec_range, trunc_range; + struct ocfs2_extent_rec *rec; + struct ocfs2_extent_list *el; + struct ocfs2_path *path = NULL; + + /* + * XXX: Why are we truncating to 0 instead of wherever this + * affects us? 
+	 */
+	ocfs2_et_extent_map_truncate(et, 0);
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+			    "Owner %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+			    cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * We have 3 cases of extent removal:
+	 * 1) Range covers the entire extent rec
+	 * 2) Range begins or ends on one edge of the extent rec
+	 * 3) Range is in the middle of the extent rec (no shared edges)
+	 *
+	 * For case 1 we remove the extent rec and left rotate to
+	 * fill the hole.
+	 *
+	 * For case 2 we just shrink the existing extent rec, with a
+	 * tree update if the shrinking edge is also the edge of an
+	 * extent block.
+	 *
+	 * For case 3 we do a right split to turn the extent rec into
+	 * something case 2 can handle.
+	 */
+	rec = &el->l_recs[index];
+	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+	trunc_range = cpos + len;
+
+	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
+
+	trace_ocfs2_remove_extent(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, len, index, le32_to_cpu(rec->e_cpos),
+		ocfs2_rec_clusters(el, rec));
+
+	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
+		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+					 cpos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else {
+		ret = ocfs2_split_tree(handle, et, path, index,
+				       trunc_range, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * The split could have manipulated the tree enough to
+		 * move the record location, so we have to look for it again.
+		 */
+		ocfs2_reinit_path(path, 1);
+
+		ret = ocfs2_find_path(et->et_ci, path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		el = path_leaf_el(path);
+		index = ocfs2_search_extent_list(el, cpos);
+		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu: split at cpos %u lost record.",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    cpos);
+			ret = -EROFS;
+			goto out;
+		}
+
+		/*
+		 * Double check our values here. If anything is fishy,
+		 * it's easier to catch it at the top level.
+		 */
+		rec = &el->l_recs[index];
+		rec_range = le32_to_cpu(rec->e_cpos) +
+			ocfs2_rec_clusters(el, rec);
+		if (rec_range != trunc_range) {
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu: error after split at cpos %u "
+				    "trunc len %u, existing record is (%u,%u)",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+				    cpos, len, le32_to_cpu(rec->e_cpos),
+				    ocfs2_rec_clusters(el, rec));
+			ret = -EROFS;
+			goto out;
+		}
+
+		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+					 cpos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+/*
+ * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
+ * same as ocfs2_lock_allocators(), except that it accepts a block
+ * count to reserve some extra blocks, and it only handles metadata
+ * allocations.
+ *
+ * Currently, only ocfs2_remove_btree_range() uses it for truncating
+ * and punching holes.
+ */ +static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 extents_to_split, + struct ocfs2_alloc_context **ac, + int extra_blocks) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *ac = NULL; + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + if (extra_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + +out: + if (ret) { + if (*ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + } + + return ret; +} + +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc) +{ + int ret, credits = 0, extra_blocks = 0; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_refcount_tree *ref_tree = NULL; + + if ((flags & OCFS2_EXT_REFCOUNTED) && len) { + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac, + extra_blocks); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, + ocfs2_remove_extent_credits(osb->sb) + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(inode->i_sb, len)); + + ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_et_update_clusters(et, -len); + + ocfs2_journal_dirty(handle, et->et_root_bh); + + if (phys_blkno) { + if (flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + phys_blkno), + len, meta_ac, + dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + phys_blkno, len); + if (ret) + mlog_errno(ret); + + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) +{ + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = 
&di->id2.i_dealloc; + + mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), + "slot %d, invalid truncate log parameters: used = " + "%u, count = %u\n", osb->slot_num, + le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); + return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); +} + +static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, + unsigned int new_start) +{ + unsigned int tail_index; + unsigned int current_tail; + + /* No records, nothing to coalesce */ + if (!le16_to_cpu(tl->tl_used)) + return 0; + + tail_index = le16_to_cpu(tl->tl_used) - 1; + current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); + current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); + + return current_tail == new_start; +} + +int ocfs2_truncate_log_append(struct ocfs2_super *osb, + handle_t *handle, + u64 start_blk, + unsigned int num_clusters) +{ + int status, index; + unsigned int start_cluster, tl_count; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + + start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; + tl_count = le16_to_cpu(tl->tl_count); + mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0, + "Truncate record count on #%llu invalid " + "wanted %u, actual %u\n", + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + ocfs2_truncate_recs_per_inode(osb->sb), + le16_to_cpu(tl->tl_count)); + + /* Caller should have known to flush before calling us. */ + index = le16_to_cpu(tl->tl_used); + if (index >= tl_count) { + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index, + start_cluster, num_clusters); + if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { + /* + * Move index back to the record we are coalescing with. 
+ * ocfs2_truncate_log_can_coalesce() guarantees nonzero + */ + index--; + + num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + index, le32_to_cpu(tl->tl_recs[index].t_start), + num_clusters); + } else { + tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); + tl->tl_used = cpu_to_le16(index + 1); + } + tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); + + ocfs2_journal_dirty(handle, tl_bh); + + osb->truncated_clusters += num_clusters; +bail: + return status; +} + +static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, + handle_t *handle, + struct inode *data_alloc_inode, + struct buffer_head *data_alloc_bh) +{ + int status = 0; + int i; + unsigned int num_clusters; + u64 start_blk; + struct ocfs2_truncate_rec rec; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + i = le16_to_cpu(tl->tl_used) - 1; + while (i >= 0) { + /* Caller has given us at least enough credits to + * update the truncate log dinode */ + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + tl->tl_used = cpu_to_le16(i); + + ocfs2_journal_dirty(handle, tl_bh); + + /* TODO: Perhaps we can calculate the bulk of the + * credits up front rather than extending like + * this. */ + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + rec = tl->tl_recs[i]; + start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, + le32_to_cpu(rec.t_start)); + num_clusters = le32_to_cpu(rec.t_clusters); + + /* if start_blk is not set, we ignore the record as + * invalid. */ + if (start_blk) { + trace_ocfs2_replay_truncate_records( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + i, le32_to_cpu(rec.t_start), num_clusters); + + status = ocfs2_free_clusters(handle, data_alloc_inode, + data_alloc_bh, start_blk, + num_clusters); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i--; + } + + osb->truncated_clusters = 0; + +bail: + return status; +} + +/* Expects you to already be holding tl_inode->i_mutex */ +int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + unsigned int num_to_flush; + handle_t *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *data_alloc_inode = NULL; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct buffer_head *data_alloc_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + + /* tl_bh is loaded from ocfs2_truncate_log_init(). 
It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; + num_to_flush = le16_to_cpu(tl->tl_used); + trace_ocfs2_flush_truncate_log( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + num_to_flush); + if (!num_to_flush) { + status = 0; + goto out; + } + + data_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!data_alloc_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto out; + } + + mutex_lock(&data_alloc_inode->i_mutex); + + status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_unlock; + } + + status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + data_alloc_bh); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +out_unlock: + brelse(data_alloc_bh); + ocfs2_inode_unlock(data_alloc_inode, 1); + +out_mutex: + mutex_unlock(&data_alloc_inode->i_mutex); + iput(data_alloc_inode); + +out: + return status; +} + +int ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + mutex_lock(&tl_inode->i_mutex); + status = __ocfs2_flush_truncate_log(osb); + mutex_unlock(&tl_inode->i_mutex); + + return status; +} + +static void ocfs2_truncate_log_worker(struct work_struct *work) +{ + int status; + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + osb_truncate_log_wq.work); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + else + ocfs2_init_steal_slots(osb); +} + +#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel) +{ + if (osb->osb_tl_inode) { + /* We want to push off log flushes while truncates are + * still running. */ + if (cancel) + cancel_delayed_work(&osb->osb_truncate_log_wq); + + queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); + } +} + +static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, + int slot_num, + struct inode **tl_inode, + struct buffer_head **tl_bh) +{ + int status; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + + inode = ocfs2_get_system_file_inode(osb, + TRUNCATE_LOG_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get load truncate log inode!\n"); + goto bail; + } + + status = ocfs2_read_inode_block(inode, &bh); + if (status < 0) { + iput(inode); + mlog_errno(status); + goto bail; + } + + *tl_inode = inode; + *tl_bh = bh; +bail: + return status; +} + +/* called during the 1st stage of node recovery. we stamp a clean + * truncate log and pass back a copy for processing later. if the + * truncate log does not require processing, a *tl_copy is set to + * NULL. 
*/ +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + *tl_copy = NULL; + + trace_ocfs2_begin_truncate_log_recovery(slot_num); + + status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) tl_bh->b_data; + + /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's + * validated by the underlying call to ocfs2_read_inode_block(), + * so any corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; + if (le16_to_cpu(tl->tl_used)) { + trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used)); + + *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + if (!(*tl_copy)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* Assuming the write-out below goes well, this copy + * will be passed back to recovery for processing. */ + memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); + + /* All we need to do to clear the truncate log is set + * tl_used. */ + tl->tl_used = 0; + + ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); + status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode)); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (tl_inode) + iput(tl_inode); + brelse(tl_bh); + + if (status < 0 && (*tl_copy)) { + kfree(*tl_copy); + *tl_copy = NULL; + mlog_errno(status); + } + + return status; +} + +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy) +{ + int status = 0; + int i; + unsigned int clusters, num_recs, start_cluster; + u64 start_blk; + handle_t *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_truncate_log *tl; + + if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { + mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); + return -EINVAL; + } + + tl = &tl_copy->id2.i_dealloc; + num_recs = le16_to_cpu(tl->tl_used); + trace_ocfs2_complete_truncate_log_recovery( + (unsigned long long)le64_to_cpu(tl_copy->i_blkno), + num_recs); + + mutex_lock(&tl_inode->i_mutex); + for(i = 0; i < num_recs; i++) { + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_up; + } + + clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); + start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); + start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); + + status = ocfs2_truncate_log_append(osb, handle, + start_blk, clusters); + ocfs2_commit_trans(osb, handle); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + +bail_up: + mutex_unlock(&tl_inode->i_mutex); + + return status; +} + +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + if (tl_inode) { + cancel_delayed_work(&osb->osb_truncate_log_wq); + flush_workqueue(ocfs2_wq); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + brelse(osb->osb_tl_bh); + iput(osb->osb_tl_inode); + } +} + +int ocfs2_truncate_log_init(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = NULL; + struct 
buffer_head *tl_bh = NULL; + + status = ocfs2_get_truncate_log_info(osb, + osb->slot_num, + &tl_inode, + &tl_bh); + if (status < 0) + mlog_errno(status); + + /* ocfs2_truncate_log_shutdown keys on the existence of + * osb->osb_tl_inode so we don't set any of the osb variables + * until we're sure all is well. */ + INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, + ocfs2_truncate_log_worker); + osb->osb_tl_bh = tl_bh; + osb->osb_tl_inode = tl_inode; + + return status; +} + +/* + * Delayed de-allocation of suballocator blocks. + * + * Some sets of block de-allocations might involve multiple suballocator inodes. + * + * The locking for this can get extremely complicated, especially when + * the suballocator inodes to delete from aren't known until deep + * within an unrelated codepath. + * + * ocfs2_extent_block structures are a good example of this - an inode + * btree could have been grown by any number of nodes each allocating + * out of their own suballoc inode. + * + * These structures allow the delay of block de-allocation until a + * later time, when locking of multiple cluster inodes won't cause + * deadlock. + */ + +/* + * Describe a single bit freed from a suballocator. For the block + * suballocators, it represents one block. For the global cluster + * allocator, it represents some clusters and free_bit indicates + * clusters number. + */ +struct ocfs2_cached_block_free { + struct ocfs2_cached_block_free *free_next; + u64 free_bg; + u64 free_blk; + unsigned int free_bit; +}; + +struct ocfs2_per_slot_free_list { + struct ocfs2_per_slot_free_list *f_next_suballocator; + int f_inode_type; + int f_slot; + struct ocfs2_cached_block_free *f_first; +}; + +static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, + int sysfile_type, + int slot, + struct ocfs2_cached_block_free *head) +{ + int ret; + u64 bg_blkno; + handle_t *handle; + struct inode *inode; + struct buffer_head *di_bh = NULL; + struct ocfs2_cached_block_free *tmp; + + inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); + if (!inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&inode->i_mutex); + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + while (head) { + if (head->free_bg) + bg_blkno = head->free_bg; + else + bg_blkno = ocfs2_which_suballoc_group(head->free_blk, + head->free_bit); + trace_ocfs2_free_cached_blocks( + (unsigned long long)head->free_blk, head->free_bit); + + ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, + head->free_bit, bg_blkno, 1); + if (ret) { + mlog_errno(ret); + goto out_journal; + } + + ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); + if (ret) { + mlog_errno(ret); + goto out_journal; + } + + tmp = head; + head = head->free_next; + kfree(tmp); + } + +out_journal: + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); +out_mutex: + mutex_unlock(&inode->i_mutex); + iput(inode); +out: + while(head) { + /* Premature exit may have left some dangling items. 
*/ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit) +{ + int ret = 0; + struct ocfs2_cached_block_free *item; + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit); + + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = ctxt->c_global_allocator; + + ctxt->c_global_allocator = item; + return ret; +} + +static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, + struct ocfs2_cached_block_free *head) +{ + struct ocfs2_cached_block_free *tmp; + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + int ret = 0; + + mutex_lock(&tl_inode->i_mutex); + + while (head) { + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_truncate_log_append(osb, handle, head->free_blk, + head->free_bit); + + ocfs2_commit_trans(osb, handle); + tmp = head; + head = head->free_next; + kfree(tmp); + + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + mutex_unlock(&tl_inode->i_mutex); + + while (head) { + /* Premature exit may have left some dangling items. */ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + +int ocfs2_run_deallocs(struct ocfs2_super *osb, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + int ret = 0, ret2; + struct ocfs2_per_slot_free_list *fl; + + if (!ctxt) + return 0; + + while (ctxt->c_first_suballocator) { + fl = ctxt->c_first_suballocator; + + if (fl->f_first) { + trace_ocfs2_run_deallocs(fl->f_inode_type, + fl->f_slot); + ret2 = ocfs2_free_cached_blocks(osb, + fl->f_inode_type, + fl->f_slot, + fl->f_first); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + } + + ctxt->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + + if (ctxt->c_global_allocator) { + ret2 = ocfs2_free_cached_clusters(osb, + ctxt->c_global_allocator); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + + ctxt->c_global_allocator = NULL; + } + + return ret; +} + +static struct ocfs2_per_slot_free_list * +ocfs2_find_per_slot_free_list(int type, + int slot, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + + while (fl) { + if (fl->f_inode_type == type && fl->f_slot == slot) + return fl; + + fl = fl->f_next_suballocator; + } + + fl = kmalloc(sizeof(*fl), GFP_NOFS); + if (fl) { + fl->f_inode_type = type; + fl->f_slot = slot; + fl->f_first = NULL; + fl->f_next_suballocator = ctxt->c_first_suballocator; + + ctxt->c_first_suballocator = fl; + } + return fl; +} + +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 suballoc, + u64 blkno, unsigned int bit) +{ + int ret; + struct ocfs2_per_slot_free_list *fl; + struct ocfs2_cached_block_free *item; + + fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); + if (fl == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + trace_ocfs2_cache_block_dealloc(type, slot, + (unsigned long long)suballoc, + (unsigned long long)blkno, bit); + + item->free_bg = 
suballoc; + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = fl->f_first; + + fl->f_first = item; + + ret = 0; +out: + return ret; +} + +static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, + struct ocfs2_extent_block *eb) +{ + return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(eb->h_suballoc_slot), + le64_to_cpu(eb->h_suballoc_loc), + le64_to_cpu(eb->h_blkno), + le16_to_cpu(eb->h_suballoc_bit)); +} + +static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) +{ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + return 0; +} + +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys) +{ + int ret, partial = 0; + + ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); + if (ret) + mlog_errno(ret); + + if (zero) + zero_user_segment(page, from, to); + + /* + * Need to set the buffers we zero'd into uptodate + * here if they aren't - ocfs2_map_page_blocks() + * might've skipped some + */ + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_zero_func); + if (ret < 0) + mlog_errno(ret); + else if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) + mlog_errno(ret); + } + + if (!partial) + SetPageUptodate(page); + + flush_dcache_page(page); +} + +static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, + loff_t end, struct page **pages, + int numpages, u64 phys, handle_t *handle) +{ + int i; + struct page *page; + unsigned int from, to = PAGE_CACHE_SIZE; + struct super_block *sb = inode->i_sb; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + + if (numpages == 0) + goto out; + + to = PAGE_CACHE_SIZE; + for(i = 0; i < numpages; i++) { + page = pages[i]; + + from = start & (PAGE_CACHE_SIZE - 1); + if ((end >> PAGE_CACHE_SHIFT) == page->index) + to = end & (PAGE_CACHE_SIZE - 1); + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + + ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, + &phys); + + start = (page->index + 1) << PAGE_CACHE_SHIFT; + } +out: + if (pages) + ocfs2_unlock_and_free_pages(pages, numpages); +} + +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + int numpages, ret = 0; + struct address_space *mapping = inode->i_mapping; + unsigned long index; + loff_t last_page_bytes; + + BUG_ON(start > end); + + numpages = 0; + last_page_bytes = PAGE_ALIGN(end); + index = start >> PAGE_CACHE_SHIFT; + do { + pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); + if (!pages[numpages]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + numpages++; + index++; + } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); + +out: + if (ret != 0) { + if (pages) + ocfs2_unlock_and_free_pages(pages, numpages); + numpages = 0; + } + + *num = numpages; + + return ret; +} + +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + struct super_block *sb = inode->i_sb; + + BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != + (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); + + return ocfs2_grab_pages(inode, start, end, pages, num); +} + +/* + * Zero the area past i_size but still within an allocated + * cluster. This avoids exposing nonzero data on subsequent file + * extends. 
+ * + * We need to call this before i_size is updated on the inode because + * otherwise block_write_full_page() will skip writeout of pages past + * i_size. The new_i_size parameter is passed for this reason. + */ +int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, + u64 range_start, u64 range_end) +{ + int ret = 0, numpages; + struct page **pages = NULL; + u64 phys; + unsigned int ext_flags; + struct super_block *sb = inode->i_sb; + + /* + * File systems which don't support sparse files zero on every + * extend. + */ + if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) + return 0; + + pages = kcalloc(ocfs2_pages_per_cluster(sb), + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + if (range_start == range_end) + goto out; + + ret = ocfs2_extent_map_get_blocks(inode, + range_start >> sb->s_blocksize_bits, + &phys, NULL, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Tail is a hole, or is marked unwritten. In either case, we + * can count on read and write to return/push zero's. + */ + if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) + goto out; + + ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, + &numpages); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, + numpages, phys, handle); + + /* + * Initiate writeout of the pages we zero'd here. We don't + * wait on them - the truncate_inode_pages() call later will + * do that for us. + */ + ret = filemap_fdatawrite_range(inode->i_mapping, range_start, + range_end - 1); + if (ret) + mlog_errno(ret); + +out: + if (pages) + kfree(pages); + + return ret; +} + +static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, + struct ocfs2_dinode *di) +{ + unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2) - + xattrsize); + else + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2)); +} + +void ocfs2_dinode_new_extent_list(struct inode *inode, + struct ocfs2_dinode *di) +{ + ocfs2_zero_dinode_id2_with_xattr(inode, di); + di->id2.i_list.l_tree_depth = 0; + di->id2.i_list.l_next_free_rec = 0; + di->id2.i_list.l_count = cpu_to_le16( + ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); +} + +void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_inline_data *idata = &di->id2.i_data; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + /* + * We clear the entire i_data structure here so that all + * fields can be properly initialized. 
+ */ + ocfs2_zero_dinode_id2_with_xattr(inode, di); + + idata->id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); +} + +int ocfs2_convert_inline_data_to_extents(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret, i, has_data, num_pages = 0; + handle_t *handle; + u64 uninitialized_var(block); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_alloc_context *data_ac = NULL; + struct page **pages = NULL; + loff_t end = osb->s_clustersize; + struct ocfs2_extent_tree et; + int did_quota = 0; + + has_data = i_size_read(inode) ? 1 : 0; + + if (has_data) { + pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, + ocfs2_inline_to_extents_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (has_data) { + u32 bit_off, num; + unsigned int page_end; + u64 phys; + + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, + &num); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Save two copies, one for insert, and one that can + * be changed by ocfs2_map_and_dirty_page() below. + */ + block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + /* + * Non sparse file systems zero on extend, so no need + * to do that now. + */ + if (!ocfs2_sparse_alloc(osb) && + PAGE_CACHE_SIZE < osb->s_clustersize) + end = PAGE_CACHE_SIZE; + + ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * This should populate the 1st page for us and mark + * it up to date. + */ + ret = ocfs2_read_inline_data(inode, pages[0], di_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + page_end = PAGE_CACHE_SIZE; + if (PAGE_CACHE_SIZE > osb->s_clustersize) + page_end = osb->s_clustersize; + + for (i = 0; i < num_pages; i++) + ocfs2_map_and_dirty_page(inode, handle, 0, page_end, + pages[i], i > 0, &phys); + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_dinode_new_extent_list(inode, di); + + ocfs2_journal_dirty(handle, di_bh); + + if (has_data) { + /* + * An error at this point should be extremely rare. If + * this proves to be false, we could always re-build + * the in-inode data from our pages. 
+ */ + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_blocks = ocfs2_inode_sector_count(inode); + } + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out_unlock: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + +out: + if (pages) { + ocfs2_unlock_and_free_pages(pages, num_pages); + kfree(pages); + } + + return ret; +} + +/* + * It is expected, that by the time you call this function, + * inode->i_size and fe->i_size have been adjusted. + * + * WARNING: This will kfree the truncate context + */ +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *di_bh) +{ + int status = 0, i, flags = 0; + u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff; + u64 blkno = 0; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct ocfs2_path *path = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *root_el = &(di->id2.i_list); + u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); + struct ocfs2_extent_tree et; + struct ocfs2_cached_dealloc_ctxt dealloc; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); + + new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + + path = ocfs2_new_path(di_bh, &di->id2.i_list, + ocfs2_journal_access_di); + if (!path) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + ocfs2_extent_map_trunc(inode, new_highest_cpos); + +start: + /* + * Check that we still have allocation to delete. + */ + if (OCFS2_I(inode)->ip_clusters == 0) { + status = 0; + goto bail; + } + + /* + * Truncate always works against the rightmost tree branch. + */ + status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX); + if (status) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_commit_truncate( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + new_highest_cpos, + OCFS2_I(inode)->ip_clusters, + path->p_tree_depth); + + /* + * By now, el will point to the extent list on the bottom most + * portion of this tree. Only the tail record is considered in + * each pass. + * + * We handle the following cases, in order: + * - empty extent: delete the remaining branch + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (truncate has completed) + */ + el = path_leaf_el(path); + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent block at %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)path_leaf_bh(path)->b_blocknr); + status = -EROFS; + goto bail; + } + + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + flags = rec->e_flags; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (i == 0 && ocfs2_is_empty_extent(rec)) { + /* + * Lower levels depend on this never happening, but it's best + * to check it up here before changing the tree. 
+ */ + if (root_el->l_tree_depth && rec->e_int_clusters == 0) { + ocfs2_error(inode->i_sb, "Inode %lu has an empty " + "extent record, depth %u\n", inode->i_ino, + le16_to_cpu(root_el->l_tree_depth)); + status = -EROFS; + goto bail; + } + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; + } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { + /* + * Truncate entire record. + */ + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = ocfs2_rec_clusters(el, rec); + blkno = le64_to_cpu(rec->e_blkno); + } else if (range > new_highest_cpos) { + /* + * Partial truncate. it also should be + * the last truncate we're doing. + */ + trunc_cpos = new_highest_cpos; + trunc_len = range - new_highest_cpos; + coff = new_highest_cpos - le32_to_cpu(rec->e_cpos); + blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); + } else { + /* + * Truncate completed, leave happily. + */ + status = 0; + goto bail; + } + + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + status = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, &dealloc, + refcount_loc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + + /* + * The check above will catch the case where we've truncated + * away all allocation. + */ + goto start; + +bail: + + ocfs2_schedule_truncate_log_flush(osb, 1); + + ocfs2_run_deallocs(osb, &dealloc); + + ocfs2_free_path(path); + + return status; +} + +/* + * 'start' is inclusive, 'end' is not. + */ +int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, + unsigned int start, unsigned int end, int trunc) +{ + int ret; + unsigned int numbytes; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inline_data *idata = &di->id2.i_data; + + if (end > i_size_read(inode)) + end = i_size_read(inode); + + BUG_ON(start >= end); + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || + !ocfs2_supports_inline_data(osb)) { + ocfs2_error(inode->i_sb, + "Inline data flags for inode %llu don't agree! " + "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + le16_to_cpu(di->i_dyn_features), + OCFS2_I(inode)->ip_dyn_features, + osb->s_feature_incompat); + ret = -EROFS; + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + numbytes = end - start; + memset(idata->id_data + start, 0, numbytes); + + /* + * No need to worry about the data page here - it's been + * truncated already and inline data doesn't need it for + * pushing zero's to disk, so we'll let readpage pick it up + * later. 
+ */ + if (trunc) { + i_size_write(inode, start); + di->i_size = cpu_to_le64(start); + } + + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + return ret; +} + +static int ocfs2_trim_extent(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 count) +{ + u64 discard, bcount; + + bcount = ocfs2_clusters_to_blocks(sb, count); + discard = le64_to_cpu(gd->bg_blkno) + + ocfs2_clusters_to_blocks(sb, start); + + trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); + + return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0); +} + +static int ocfs2_trim_group(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 max, u32 minbits) +{ + int ret = 0, count = 0, next; + void *bitmap = gd->bg_bitmap; + + if (le16_to_cpu(gd->bg_free_bits_count) < minbits) + return 0; + + trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno), + start, max, minbits); + + while (start < max) { + start = ocfs2_find_next_zero_bit(bitmap, max, start); + if (start >= max) + break; + next = ocfs2_find_next_bit(bitmap, max, start); + + if ((next - start) >= minbits) { + ret = ocfs2_trim_extent(sb, gd, + start, next - start); + if (ret < 0) { + mlog_errno(ret); + break; + } + count += next - start; + } + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits) + break; + } + + if (ret < 0) + count = ret; + + return count; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 start, len, trimmed, first_group, last_group, group; + int ret, cnt; + u32 first_bit, last_bit, minlen; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct buffer_head *gd_bh = NULL; + struct ocfs2_dinode *main_bm; + struct ocfs2_group_desc *gd = NULL; + + start = range->start >> osb->s_clustersize_bits; + len = range->len >> osb->s_clustersize_bits; + minlen = range->minlen >> osb->s_clustersize_bits; + trimmed = 0; + + if (!len) { + range->len = 0; + return 0; + } + + if (minlen >= osb->bitmap_cpg) + return -EINVAL; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; + goto out_unlock; + } + + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; + + trace_ocfs2_trim_fs(start, len, minlen); + + /* Determine first and last group to examine based on start and len */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); + last_bit = osb->bitmap_cpg; + + for (group = first_group; group <= last_group;) { + if (first_bit + 
len >= osb->bitmap_cpg) + last_bit = osb->bitmap_cpg; + else + last_bit = first_bit + len; + + ret = ocfs2_read_group_descriptor(main_bm_inode, + main_bm, group, + &gd_bh); + if (ret < 0) { + mlog_errno(ret); + break; + } + + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); + brelse(gd_bh); + gd_bh = NULL; + if (cnt < 0) { + ret = cnt; + mlog_errno(ret); + break; + } + + trimmed += cnt; + len -= osb->bitmap_cpg - first_bit; + first_bit = 0; + if (group == osb->first_cluster_group_blkno) + group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + else + group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + } + range->len = trimmed * sb->s_blocksize; +out_unlock: + ocfs2_inode_unlock(main_bm_inode, 0); + brelse(main_bm_bh); +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); +out: + return ret; +} diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h new file mode 100644 index 00000000..ca381c58 --- /dev/null +++ b/fs/ocfs2/alloc.h @@ -0,0 +1,324 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_ALLOC_H +#define OCFS2_ALLOC_H + + +/* + * For xattr tree leaf, we limit the leaf byte size to be 64K. + */ +#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536 + +/* + * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract + * the b-tree operations in ocfs2. Now all the b-tree operations are not + * limited to ocfs2_dinode only. Any data which need to allocate clusters + * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree + * and operation. + * + * ocfs2_extent_tree becomes the first-class object for extent tree + * manipulation. Callers of the alloc.c code need to fill it via one of + * the ocfs2_init_*_extent_tree() operations below. + * + * ocfs2_extent_tree contains info for the root of the b-tree, it must have a + * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree + * functions. It needs the ocfs2_caching_info structure associated with + * I/O on the tree. With metadata ecc, we now call different journal_access + * functions for each type of metadata, so it must have the + * root_journal_access function. + * ocfs2_extent_tree_operations abstract the normal operations we do for + * the root of extent b-tree. 
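+ *
+ * For example (a rough usage sketch only -- the local names di_bh, cpos,
+ * start_blk and meta_ac below are illustrative), a caller working against
+ * an inode's allocation btree would fill the tree and then operate on it:
+ *
+ *	struct ocfs2_extent_tree et;
+ *
+ *	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ *	ret = ocfs2_insert_extent(handle, &et, cpos, start_blk, 1, 0, meta_ac);
+ *
+ * using the ocfs2_init_*_extent_tree() and extent manipulation helpers
+ * declared below.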
+ */ +struct ocfs2_extent_tree_operations; +struct ocfs2_extent_tree { + struct ocfs2_extent_tree_operations *et_ops; + struct buffer_head *et_root_bh; + struct ocfs2_extent_list *et_root_el; + struct ocfs2_caching_info *et_ci; + ocfs2_journal_access_func et_root_journal_access; + void *et_object; + unsigned int et_max_leaf_clusters; +}; + +/* + * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the + * specified object buffer. + */ +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); +struct ocfs2_xattr_value_buf; +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct ocfs2_xattr_value_buf *vb); +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); + +/* + * Read an extent block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The extent block will be validated + * with ocfs2_validate_extent_block(). + */ +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, + struct buffer_head **bh); + +struct ocfs2_alloc_context; +int ocfs2_insert_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, + u64 start_blk, + u32 new_clusters, + u8 flags, + struct ocfs2_alloc_context *meta_ac); + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); +struct ocfs2_cached_dealloc_ctxt; +struct ocfs2_path; +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags); +int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc); + +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et); + +/* + * how many new metadata chunks would an allocation need at maximum? + * + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. 
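+ *
+ * As a rough worked example, an extent tree currently at depth 2 would
+ * reserve 2 + 2 = 4 metadata blocks under this worst-case estimate.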
+ */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el) +{ + /* + * Rather than do all the work of determining how much we need + * (involves a ton of reads and locks), just ask for the + * maximal limit. That's a tree depth shift. So, one block for + * level of the tree (current l_tree_depth), one block for the + * new tree_depth==0 extent_block, and one block at the new + * top-of-the tree. + */ + return le16_to_cpu(root_el->l_tree_depth) + 2; +} + +void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); +void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di); +int ocfs2_convert_inline_data_to_extents(struct inode *inode, + struct buffer_head *di_bh); + +int ocfs2_truncate_log_init(struct ocfs2_super *osb); +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb); +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel); +int ocfs2_flush_truncate_log(struct ocfs2_super *osb); +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy); +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy); +int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb); +int ocfs2_truncate_log_append(struct ocfs2_super *osb, + handle_t *handle, + u64 start_blk, + unsigned int num_clusters); +int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); + +/* + * Process local structure which describes the block unlinks done + * during an operation. This is populated via + * ocfs2_cache_block_dealloc(). + * + * ocfs2_run_deallocs() should be called after the potentially + * de-allocating routines. No journal handles should be open, and most + * locks should have been dropped. + */ +struct ocfs2_cached_dealloc_ctxt { + struct ocfs2_per_slot_free_list *c_first_suballocator; + struct ocfs2_cached_block_free *c_global_allocator; +}; +static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) +{ + c->c_first_suballocator = NULL; + c->c_global_allocator = NULL; +} +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit); +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 suballoc, u64 blkno, + unsigned int bit); +static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) +{ + return c->c_global_allocator != NULL; +} +int ocfs2_run_deallocs(struct ocfs2_super *osb, + struct ocfs2_cached_dealloc_ctxt *ctxt); + +struct ocfs2_truncate_context { + struct ocfs2_cached_dealloc_ctxt tc_dealloc; + int tc_ext_alloc_locked; /* is it cluster locked? */ + /* these get destroyed once it's passed to ocfs2_commit_truncate. */ + struct buffer_head *tc_last_eb_bh; +}; + +int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, + u64 range_start, u64 range_end); +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *di_bh); +int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, + unsigned int start, unsigned int end, int trunc); + +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh); +int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range); +/* + * Helper function to look at the # of clusters in an extent record. 
+ */ +static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec) +{ + /* + * Cluster count in extent records is slightly different + * between interior nodes and leaf nodes. This is to support + * unwritten extents which need a flags field in leaf node + * records, thus shrinking the available space for a clusters + * field. + */ + if (el->l_tree_depth) + return le32_to_cpu(rec->e_int_clusters); + else + return le16_to_cpu(rec->e_leaf_clusters); +} + +/* + * This is only valid for leaf nodes, which are the only ones that can + * have empty extents anyway. + */ +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) +{ + return !rec->e_leaf_clusters; +} + +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num); +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; + +#define OCFS2_MAX_PATH_DEPTH 5 + +struct ocfs2_path { + int p_tree_depth; + ocfs2_journal_access_func p_root_access; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; + +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_root_access(_path)((_path)->p_root_access) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) + +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root); +void ocfs2_free_path(struct ocfs2_path *path); +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + u32 cpos); +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path); +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et); +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx); +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path); +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right); +#endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c new file mode 100644 index 00000000..ac97bca2 --- /dev/null +++ b/fs/ocfs2/aops.c @@ -0,0 +1,2048 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int status; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *buffer_cache_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + void *kaddr; + + trace_ocfs2_symlink_get_block( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); + + BUG_ON(ocfs2_inode_is_fast_symlink(inode)); + + if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { + mlog(ML_ERROR, "block offset > PATH_MAX: %llu", + (unsigned long long)iblock); + goto bail; + } + + status = ocfs2_read_inode_block(inode, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + fe = (struct ocfs2_dinode *) bh->b_data; + + if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(fe->i_clusters))) { + mlog(ML_ERROR, "block offset is outside the allocated size: " + "%llu\n", (unsigned long long)iblock); + goto bail; + } + + /* We don't use the page cache to create symlink data, so if + * need be, copy it over from the buffer cache. */ + if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { + u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + + iblock; + buffer_cache_bh = sb_getblk(osb->sb, blkno); + if (!buffer_cache_bh) { + mlog(ML_ERROR, "couldn't getblock for symlink!\n"); + goto bail; + } + + /* we haven't locked out transactions, so a commit + * could've happened. Since we've got a reference on + * the bh, even if it commits while we're doing the + * copy, the data is still good. */ + if (buffer_jbd(buffer_cache_bh) + && ocfs2_inode_is_new(inode)) { + kaddr = kmap_atomic(bh_result->b_page, KM_USER0); + if (!kaddr) { + mlog(ML_ERROR, "couldn't kmap!\n"); + goto bail; + } + memcpy(kaddr + (bh_result->b_size * iblock), + buffer_cache_bh->b_data, + bh_result->b_size); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh_result); + } + brelse(buffer_cache_bh); + } + + map_bh(bh_result, inode->i_sb, + le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); + + err = 0; + +bail: + brelse(bh); + + return err; +} + +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = 0; + unsigned int ext_flags; + u64 max_blocks = bh_result->b_size >> inode->i_blkbits; + u64 p_blkno, count, past_eof; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); + + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", + inode, inode->i_ino); + + if (S_ISLNK(inode->i_mode)) { + /* this always does I/O for some reason. 
*/ + err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); + goto bail; + } + + err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count, + &ext_flags); + if (err) { + mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " + "%llu, NULL)\n", err, inode, (unsigned long long)iblock, + (unsigned long long)p_blkno); + goto bail; + } + + if (max_blocks < count) + count = max_blocks; + + /* + * ocfs2 never allocates in this function - the only time we + * need to use BH_New is when we're extending i_size on a file + * system which doesn't support holes, in which case BH_New + * allows __block_write_begin() to zero. + * + * If we see this on a sparse file system, then a truncate has + * raced us and removed the cluster. In this case, we clear + * the buffers dirty and uptodate bits and let the buffer code + * ignore it as a hole. + */ + if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) { + clear_buffer_dirty(bh_result); + clear_buffer_uptodate(bh_result); + goto bail; + } + + /* Treat the unwritten extent as a hole for zeroing purposes. */ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + + bh_result->b_size = count << inode->i_blkbits; + + if (!ocfs2_sparse_alloc(osb)) { + if (p_blkno == 0) { + err = -EIO; + mlog(ML_ERROR, + "iblock = %llu p_blkno = %llu blkno=(%llu)\n", + (unsigned long long)iblock, + (unsigned long long)p_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); + dump_stack(); + goto bail; + } + } + + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + + trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)past_eof); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + +bail: + if (err < 0) + err = -EIO; + + return err; +} + +int ocfs2_read_inline_data(struct inode *inode, struct page *page, + struct buffer_head *di_bh) +{ + void *kaddr; + loff_t size; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + return -EROFS; + } + + size = i_size_read(inode); + + if (size > PAGE_CACHE_SIZE || + size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { + ocfs2_error(inode->i_sb, + "Inode %llu has with inline data has bad size: %Lu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)size); + return -EROFS; + } + + kaddr = kmap_atomic(page, KM_USER0); + if (size) + memcpy(kaddr, di->id2.i_data.id_data, size); + /* Clear the remaining part of the page */ + memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + SetPageUptodate(page); + + return 0; +} + +static int ocfs2_readpage_inline(struct inode *inode, struct page *page) +{ + int ret; + struct buffer_head *di_bh = NULL; + + BUG_ON(!PageLocked(page)); + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_inline_data(inode, page, di_bh); +out: + unlock_page(page); + + brelse(di_bh); + return ret; +} + +static int ocfs2_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + loff_t start 
= (loff_t)page->index << PAGE_CACHE_SHIFT; + int ret, unlock = 1; + + trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, + (page ? page->index : 0)); + + ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); + if (ret != 0) { + if (ret == AOP_TRUNCATED_PAGE) + unlock = 0; + mlog_errno(ret); + goto out; + } + + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + ret = AOP_TRUNCATED_PAGE; + goto out_inode_unlock; + } + + /* + * i_size might have just been updated as we grabed the meta lock. We + * might now be discovering a truncate that hit on another node. + * block_read_full_page->get_block freaks out if it is asked to read + * beyond the end of a file, so we check here. Callers + * (generic_file_read, vm_ops->fault) are clever enough to check i_size + * and notice that the page they just read isn't needed. + * + * XXX sys_readahead() seems to get that wrong? + */ + if (start >= i_size_read(inode)) { + zero_user(page, 0, PAGE_SIZE); + SetPageUptodate(page); + ret = 0; + goto out_alloc; + } + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + ret = ocfs2_readpage_inline(inode, page); + else + ret = block_read_full_page(page, ocfs2_get_block); + unlock = 0; + +out_alloc: + up_read(&OCFS2_I(inode)->ip_alloc_sem); +out_inode_unlock: + ocfs2_inode_unlock(inode, 0); +out: + if (unlock) + unlock_page(page); + return ret; +} + +/* + * This is used only for read-ahead. Failures or difficult to handle + * situations are safe to ignore. + * + * Right now, we don't bother with BH_Boundary - in-inode extent lists + * are quite large (243 extents on 4k blocks), so most inodes don't + * grow out to a tree. If need be, detecting boundary extents could + * trivially be added in a future version of ocfs2_get_block(). + */ +static int ocfs2_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret, err = -EIO; + struct inode *inode = mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + loff_t start; + struct page *last; + + /* + * Use the nonblocking flag for the dlm code to avoid page + * lock inversion, but don't bother with retrying. + */ + ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); + if (ret) + return err; + + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + ocfs2_inode_unlock(inode, 0); + return err; + } + + /* + * Don't bother with inline-data. There isn't anything + * to read-ahead in that case anyway... + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto out_unlock; + + /* + * Check whether a remote node truncated this file - we just + * drop out in that case as it's not worth handling here. + */ + last = list_entry(pages->prev, struct page, lru); + start = (loff_t)last->index << PAGE_CACHE_SHIFT; + if (start >= i_size_read(inode)) + goto out_unlock; + + err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + +out_unlock: + up_read(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 0); + + return err; +} + +/* Note: Because we don't support holes, our allocation has + * already happened (allocation writes zeros to the file data) + * so we don't have to worry about ordered writes in + * ocfs2_writepage. + * + * ->writepage is called during the process of invalidating the page cache + * during blocked lock processing. It can't block on any cluster locks + * to during block mapping. It's relying on the fact that the block + * mapping can't have disappeared under the dirty pages that it is + * being asked to write back. 
+ */ +static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) +{ + trace_ocfs2_writepage( + (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno, + page->index); + + return block_write_full_page(page, ocfs2_get_block, wbc); +} + +/* Taken from ext3. We don't necessarily need the full blown + * functionality yet, but IMHO it's better to cut and paste the whole + * thing so we can avoid introducing our own bugs (and easily pick up + * their fixes when they happen) --Mark */ +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) +{ + sector_t status; + u64 p_blkno = 0; + int err = 0; + struct inode *inode = mapping->host; + + trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)block); + + /* We don't need to lock journal system files, since they aren't + * accessed concurrently from multiple nodes. + */ + if (!INODE_JOURNAL(inode)) { + err = ocfs2_inode_lock(inode, NULL, 0); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); + + if (!INODE_JOURNAL(inode)) { + up_read(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_inode_unlock(inode, 0); + } + + if (err) { + mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", + (unsigned long long)block); + mlog_errno(err); + goto bail; + } + +bail: + status = err ? 0 : p_blkno; + + return status; +} + +/* + * TODO: Make this into a generic get_blocks function. + * + * From do_direct_io in direct-io.c: + * "So what we do is to permit the ->get_blocks function to populate + * bh.b_size with the size of IO which is permitted at this offset and + * this i_blkbits." + * + * This function is called directly from get_more_blocks in direct-io.c. + * + * called like this: dio->get_blocks(dio->inode, fs_startblk, + * fs_count, map_bh, dio->rw == WRITE); + * + * Note that we never bother to allocate blocks here, and thus ignore the + * create argument. + */ +static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + u64 p_blkno, inode_blocks, contig_blocks; + unsigned int ext_flags; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + + /* This function won't even be called if the request isn't all + * nicely aligned and of the right size, so there's no need + * for us to check any of that. 
*/ + + inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + + /* This figures out the size of the next contiguous block, and + * our logical offset */ + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); + if (ret) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + + /* We should already CoW the refcounted extent in case of create. */ + BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); + + /* + * get_more_blocks() expects us to describe a hole by clearing + * the mapped bit on bh_result(). + * + * Consider an unwritten extent as a hole. + */ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + else + clear_buffer_mapped(bh_result); + + /* make sure we don't map more than max_blocks blocks here as + that's all the kernel will handle at this point. */ + if (max_blocks < contig_blocks) + contig_blocks = max_blocks; + bh_result->b_size = contig_blocks << blocksize_bits; +bail: + return ret; +} + +/* + * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're + * particularly interested in the aio/dio case. Like the core uses + * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from + * truncation on another. + */ +static void ocfs2_dio_end_io(struct kiocb *iocb, + loff_t offset, + ssize_t bytes, + void *private, + int ret, + bool is_async) +{ + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int level; + + /* this io's submitter should not have unlocked this before we could */ + BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + + if (ocfs2_iocb_is_sem_locked(iocb)) { + up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } + + ocfs2_iocb_clear_rw_locked(iocb); + + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); + + if (is_async) + aio_complete(iocb, ret, 0); +} + +/* + * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen + * from ext3. PageChecked() bits have been removed as OCFS2 does not + * do journalled data. + */ +static void ocfs2_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; + + jbd2_journal_invalidatepage(journal, page, offset); +} + +static int ocfs2_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; + + if (!page_has_buffers(page)) + return 0; + return jbd2_journal_try_to_free_buffers(journal, page, wait); +} + +static ssize_t ocfs2_direct_IO(int rw, + struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; + + /* + * Fallback to buffered I/O if we see an inode without + * extents. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + /* Fallback to buffered I/O if we are appending. 
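+ * Returning 0 without consuming anything is the "fall back" signal:
+ * the write path above us then pushes the data through the normal
+ * buffered write_begin/write_end route, which is also where inline
+ * data and allocation are handled.  (Sketched behaviour of the caller,
+ * not something enforced in this function.)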
*/ + if (i_size_read(inode) <= offset) + return 0; + + return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); +} + +static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, + u32 cpos, + unsigned int *start, + unsigned int *end) +{ + unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { + unsigned int cpp; + + cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + + cluster_start = cpos % cpp; + cluster_start = cluster_start << osb->s_clustersize_bits; + + cluster_end = cluster_start + osb->s_clustersize; + } + + BUG_ON(cluster_start > PAGE_SIZE); + BUG_ON(cluster_end > PAGE_SIZE); + + if (start) + *start = cluster_start; + if (end) + *end = cluster_end; +} + +/* + * 'from' and 'to' are the region in the page to avoid zeroing. + * + * If pagesize > clustersize, this function will avoid zeroing outside + * of the cluster boundary. + * + * from == to == 0 is code for "zero the entire cluster region" + */ +static void ocfs2_clear_page_regions(struct page *page, + struct ocfs2_super *osb, u32 cpos, + unsigned from, unsigned to) +{ + void *kaddr; + unsigned int cluster_start, cluster_end; + + ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); + + kaddr = kmap_atomic(page, KM_USER0); + + if (from || to) { + if (from > cluster_start) + memset(kaddr + cluster_start, 0, from - cluster_start); + if (to < cluster_end) + memset(kaddr + to, 0, cluster_end - to); + } else { + memset(kaddr + cluster_start, 0, cluster_end - cluster_start); + } + + kunmap_atomic(kaddr, KM_USER0); +} + +/* + * Nonsparse file systems fully allocate before we get to the write + * code. This prevents ocfs2_write() from tagging the write as an + * allocating one, which means ocfs2_map_page_blocks() might try to + * read-in the blocks at the tail of our file. Avoid reading them by + * testing i_size against each block offset. + */ +static int ocfs2_should_read_blk(struct inode *inode, struct page *page, + unsigned int block_start) +{ + u64 offset = page_offset(page) + block_start; + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + return 1; + + if (i_size_read(inode) > offset) + return 1; + + return 0; +} + +/* + * Some of this taken from __block_write_begin(). We already have our + * mapping by now though, and the entire write will be allocating or + * it won't, so not much need to use BH_New. + * + * This will also skip zeroing, which is handled externally. + */ +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new) +{ + int ret = 0; + struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; + unsigned int block_end, block_start; + unsigned int bsize = 1 << inode->i_blkbits; + + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + head = page_buffers(page); + for (bh = head, block_start = 0; bh != head || !block_start; + bh = bh->b_this_page, block_start += bsize) { + block_end = block_start + bsize; + + clear_buffer_new(bh); + + /* + * Ignore blocks outside of our i/o range - + * they may belong to unallocated clusters. + */ + if (block_start >= to || block_end <= from) { + if (PageUptodate(page)) + set_buffer_uptodate(bh); + continue; + } + + /* + * For an allocating write with cluster size >= page + * size, we always write the entire page. 
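+ * (For instance, with 4K pages and a 32K cluster size an allocating
+ *  write of just a few bytes is still mapped from cluster_start to
+ *  cluster_end - which for page size <= cluster size is simply
+ *  0..PAGE_CACHE_SIZE - so every page of the cluster is written out
+ *  in full.  Illustrative numbers only.)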
+ */ + if (new) + set_buffer_new(bh); + + if (!buffer_mapped(bh)) { + map_bh(bh, inode->i_sb, *p_blkno); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_new(bh) && + ocfs2_should_read_blk(inode, page, block_start) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + + *p_blkno = *p_blkno + 1; + } + + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + ret = -EIO; + } + + if (ret == 0 || !new) + return ret; + + /* + * If we get -EIO above, zero out any newly allocated blocks + * to avoid exposing stale data. + */ + bh = head; + block_start = 0; + do { + block_end = block_start + bsize; + if (block_end <= from) + goto next_bh; + if (block_start >= to) + break; + + zero_user(page, block_start, bh->b_size); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + +next_bh: + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} + +#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#define OCFS2_MAX_CTXT_PAGES 1 +#else +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#endif + +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) + +/* + * Describe the state of a single cluster to be written to. + */ +struct ocfs2_write_cluster_desc { + u32 c_cpos; + u32 c_phys; + /* + * Give this a unique field because c_phys eventually gets + * filled. + */ + unsigned c_new; + unsigned c_unwritten; + unsigned c_needs_zero; +}; + +struct ocfs2_write_ctxt { + /* Logical cluster position / len of write */ + u32 w_cpos; + u32 w_clen; + + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; + + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; + + /* + * This is true if page_size > cluster_size. + * + * It triggers a set of special cases during write which might + * have to deal with allocating writes to partial pages. + */ + unsigned int w_large_pages; + + /* + * Pages involved in this write. + * + * w_target_page is the page being written to by the user. + * + * w_pages is an array of pages which always contains + * w_target_page, and in the case of an allocating write with + * page_size < cluster size, it will contain zero'd and mapped + * pages adjacent to w_target_page which need to be written + * out in so that future reads from that region will get + * zero's. + */ + unsigned int w_num_pages; + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; + struct page *w_target_page; + + /* + * ocfs2_write_end() uses this to know what the real range to + * write in the target should be. 
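+ *
+ * Roughly speaking, for a plain overwrite of len bytes at pos these
+ * come out as:
+ *
+ *	w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+ *	w_target_to   = w_target_from + len;
+ *
+ * while an allocating or zeroing write widens them out towards the
+ * cluster boundaries inside the page - see
+ * ocfs2_set_target_boundaries().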
+ */ + unsigned int w_target_from; + unsigned int w_target_to; + + /* + * We could use journal_current_handle() but this is cleaner, + * IMHO -Mark + */ + handle_t *w_handle; + + struct buffer_head *w_di_bh; + + struct ocfs2_cached_dealloc_ctxt w_dealloc; +}; + +void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) +{ + int i; + + for(i = 0; i < num_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } + } +} + +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +{ + ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); + + brelse(wc->w_di_bh); + kfree(wc); +} + +static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, + struct ocfs2_super *osb, loff_t pos, + unsigned len, struct buffer_head *di_bh) +{ + u32 cend; + struct ocfs2_write_ctxt *wc; + + wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); + if (!wc) + return -ENOMEM; + + wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; + cend = (pos + len - 1) >> osb->s_clustersize_bits; + wc->w_clen = cend - wc->w_cpos + 1; + get_bh(di_bh); + wc->w_di_bh = di_bh; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + wc->w_large_pages = 1; + else + wc->w_large_pages = 0; + + ocfs2_init_dealloc_ctxt(&wc->w_dealloc); + + *wcp = wc; + + return 0; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. + */ +static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, end; + + start = max(from, block_start); + end = min(to, block_end); + + zero_user_segment(page, start, end); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} + +/* + * Only called when we have a failure during allocating write to write + * zero's to the newly allocated region. + */ +static void ocfs2_write_failure(struct inode *inode, + struct ocfs2_write_ctxt *wc, + loff_t user_pos, unsigned user_len) +{ + int i; + unsigned from = user_pos & (PAGE_CACHE_SIZE - 1), + to = user_pos + user_len; + struct page *tmppage; + + ocfs2_zero_new_buffers(wc->w_target_page, from, to); + + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; + + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); + + block_commit_write(tmppage, from, to); + } + } +} + +static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, + struct page *page, u32 cpos, + loff_t user_pos, unsigned user_len, + int new) +{ + int ret; + unsigned int map_from = 0, map_to = 0; + unsigned int cluster_start, cluster_end; + unsigned int user_data_from = 0, user_data_to = 0; + + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, + &cluster_start, &cluster_end); + + /* treat the write as new if the a hole/lseek spanned across + * the page boundary. 
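+ * (Illustrative example: i_size is 1000 bytes and the user writes at
+ *  pos 9000.  The page at offset 8192 begins past i_size but at or
+ *  before user_pos, so it is treated as new here and will be mapped
+ *  and zero'd rather than read in from disk.)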
+ */ + new = new | ((i_size_read(inode) <= page_offset(page)) && + (page_offset(page) <= user_pos)); + + if (page == wc->w_target_page) { + map_from = user_pos & (PAGE_CACHE_SIZE - 1); + map_to = map_from + user_len; + + if (new) + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, + new); + else + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + map_from, map_to, new); + if (ret) { + mlog_errno(ret); + goto out; + } + + user_data_from = map_from; + user_data_to = map_to; + if (new) { + map_from = cluster_start; + map_to = cluster_end; + } + } else { + /* + * If we haven't allocated the new page yet, we + * shouldn't be writing it out without copying user + * data. This is likely a math error from the caller. + */ + BUG_ON(!new); + + map_from = cluster_start; + map_to = cluster_end; + + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, new); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Parts of newly allocated pages need to be zero'd. + * + * Above, we have also rewritten 'to' and 'from' - as far as + * the rest of the function is concerned, the entire cluster + * range inside of a page needs to be written. + * + * We can skip this if the page is up to date - it's already + * been zero'd from being read in as a hole. + */ + if (new && !PageUptodate(page)) + ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), + cpos, user_data_from, user_data_to); + + flush_dcache_page(page); + +out: + return ret; +} + +/* + * This function will only grab one clusters worth of pages. + */ +static int ocfs2_grab_pages_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, + u32 cpos, loff_t user_pos, + unsigned user_len, int new, + struct page *mmap_page) +{ + int ret = 0, i; + unsigned long start, target_index, end_index, index; + struct inode *inode = mapping->host; + loff_t last_byte; + + target_index = user_pos >> PAGE_CACHE_SHIFT; + + /* + * Figure out how many pages we'll be manipulating here. For + * non allocating write, we just change the one + * page. Otherwise, we'll need a whole clusters worth. If we're + * writing past i_size, we only need enough pages to cover the + * last page of the write. + */ + if (new) { + wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); + /* + * We need the index *past* the last page we could possibly + * touch. This is the page past the end of the write or + * i_size, whichever is greater. + */ + last_byte = max(user_pos + user_len, i_size_read(inode)); + BUG_ON(last_byte < 1); + end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + if ((start + wc->w_num_pages) > end_index) + wc->w_num_pages = end_index - start; + } else { + wc->w_num_pages = 1; + start = target_index; + } + + for(i = 0; i < wc->w_num_pages; i++) { + index = start + i; + + if (index == target_index && mmap_page) { + /* + * ocfs2_pagemkwrite() is a little different + * and wants us to directly use the page + * passed in. + */ + lock_page(mmap_page); + + if (mmap_page->mapping != mapping) { + unlock_page(mmap_page); + /* + * Sanity check - the locking in + * ocfs2_pagemkwrite() should ensure + * that this code doesn't trigger. 
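+ * (The situation being guarded against is the page having been
+ *  truncated or re-pointed at another mapping between the fault and
+ *  this lock_page().  The locking in ocfs2_pagemkwrite() should
+ *  already rule that out, which is why this is reported as a hard
+ *  -EINVAL rather than retried.)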
+ */ + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + page_cache_get(mmap_page); + wc->w_pages[i] = mmap_page; + } else { + wc->w_pages[i] = find_or_create_page(mapping, index, + GFP_NOFS); + if (!wc->w_pages[i]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + + if (index == target_index) + wc->w_target_page = wc->w_pages[i]; + } +out: + return ret; +} + +/* + * Prepare a single cluster for write one cluster into the file. + */ +static int ocfs2_write_cluster(struct address_space *mapping, + u32 phys, unsigned int unwritten, + unsigned int should_zero, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, u32 cpos, + loff_t user_pos, unsigned user_len) +{ + int ret, i, new; + u64 v_blkno, p_blkno; + struct inode *inode = mapping->host; + struct ocfs2_extent_tree et; + + new = phys == 0 ? 1 : 0; + if (new) { + u32 tmp_pos; + + /* + * This is safe to call with the page locks - it won't take + * any additional semaphores or cluster locks. + */ + tmp_pos = cpos; + ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, 0, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); + /* + * This shouldn't happen because we must have already + * calculated the correct meta data allocation required. The + * internal tree allocation code should know how to increase + * transaction credits itself. + * + * If need be, we could handle -EAGAIN for a + * RESTART_TRANS here. + */ + mlog_bug_on_msg(ret == -EAGAIN, + "Inode %llu: EAGAIN return during allocation.\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } else if (unwritten) { + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); + ret = ocfs2_mark_extent_written(inode, &et, + wc->w_handle, cpos, 1, phys, + meta_ac, &wc->w_dealloc); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (should_zero) + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); + else + v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; + + /* + * The only reason this should fail is due to an inability to + * find the extent added. + */ + ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, + NULL); + if (ret < 0) { + ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " + "at logical block %llu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_blkno); + goto out; + } + + BUG_ON(p_blkno == 0); + + for(i = 0; i < wc->w_num_pages; i++) { + int tmpret; + + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, + wc->w_pages[i], cpos, + user_pos, user_len, + should_zero); + if (tmpret) { + mlog_errno(tmpret); + if (ret == 0) + ret = tmpret; + } + } + + /* + * We only have cleanup to do in case of allocating write. + */ + if (ret && new) + ocfs2_write_failure(inode, wc, user_pos, user_len); + +out: + + return ret; +} + +static int ocfs2_write_cluster_by_desc(struct address_space *mapping, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len) +{ + int ret, i; + loff_t cluster_off; + unsigned int local_len = len; + struct ocfs2_write_cluster_desc *desc; + struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb); + + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + + /* + * We have to make sure that the total write passed in + * doesn't extend past a single cluster. 
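+ *
+ * Worked example (illustrative numbers): cluster size 32K, pos 30K,
+ * len 8K.  cluster_off comes out as 30K, so local_len is clipped to
+ * 2K for this descriptor; the remaining 6K is handled on the next
+ * loop iteration once pos has advanced into the following cluster.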
+ */ + local_len = len; + cluster_off = pos & (osb->s_clustersize - 1); + if ((cluster_off + local_len) > osb->s_clustersize) + local_len = osb->s_clustersize - cluster_off; + + ret = ocfs2_write_cluster(mapping, desc->c_phys, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, + wc, desc->c_cpos, pos, local_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + len -= local_len; + pos += local_len; + } + + ret = 0; +out: + return ret; +} + +/* + * ocfs2_write_end() wants to know which parts of the target page it + * should complete the write on. It's easiest to compute them ahead of + * time when a more complete view of the write is available. + */ +static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len, int alloc) +{ + struct ocfs2_write_cluster_desc *desc; + + wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_to = wc->w_target_from + len; + + if (alloc == 0) + return; + + /* + * Allocating write - we may have different boundaries based + * on page size and cluster size. + * + * NOTE: We can no longer compute one value from the other as + * the actual write length and user provided length may be + * different. + */ + + if (wc->w_large_pages) { + /* + * We only care about the 1st and last cluster within + * our range and whether they should be zero'd or not. Either + * value may be extended out to the start/end of a + * newly allocated cluster. + */ + desc = &wc->w_desc[0]; + if (desc->c_needs_zero) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + &wc->w_target_from, + NULL); + + desc = &wc->w_desc[wc->w_clen - 1]; + if (desc->c_needs_zero) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + NULL, + &wc->w_target_to); + } else { + wc->w_target_from = 0; + wc->w_target_to = PAGE_CACHE_SIZE; + } +} + +/* + * Populate each single-cluster write descriptor in the write context + * with information about the i/o to be done. + * + * Returns the number of clusters that will have to be allocated, as + * well as a worst case estimate of the number of extent records that + * would have to be created during a write to an unwritten region. + */ +static int ocfs2_populate_write_desc(struct inode *inode, + struct ocfs2_write_ctxt *wc, + unsigned int *clusters_to_alloc, + unsigned int *extents_to_split) +{ + int ret; + struct ocfs2_write_cluster_desc *desc; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + u32 phys = 0; + int i; + + *clusters_to_alloc = 0; + *extents_to_split = 0; + + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + desc->c_cpos = wc->w_cpos + i; + + if (num_clusters == 0) { + /* + * Need to look up the next extent record. + */ + ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* We should already CoW the refcountd extent. */ + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + + /* + * Assume worst case - that we're writing in + * the middle of the extent. + * + * We can assume that the write proceeds from + * left to right, in which case the extent + * insert code is smart enough to coalesce the + * next splits into the previous records created. + */ + if (ext_flags & OCFS2_EXT_UNWRITTEN) + *extents_to_split = *extents_to_split + 2; + } else if (phys) { + /* + * Only increment phys if it doesn't describe + * a hole. + */ + phys++; + } + + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. 
w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + + desc->c_phys = phys; + if (phys == 0) { + desc->c_new = 1; + desc->c_needs_zero = 1; + *clusters_to_alloc = *clusters_to_alloc + 1; + } + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { + desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } + + num_clusters--; + } + + ret = 0; +out: + return ret; +} + +static int ocfs2_write_begin_inline(struct address_space *mapping, + struct inode *inode, + struct ocfs2_write_ctxt *wc) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct page *page; + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + + page = find_or_create_page(mapping, 0, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + /* + * If we don't set w_num_pages then this page won't get unlocked + * and freed on cleanup of the write context. + */ + wc->w_pages[0] = wc->w_target_page = page; + wc->w_num_pages = 1; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + ocfs2_commit_trans(osb, handle); + + mlog_errno(ret); + goto out; + } + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + ocfs2_set_inode_data_inline(inode, di); + + if (!PageUptodate(page)) { + ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); + if (ret) { + ocfs2_commit_trans(osb, handle); + + goto out; + } + } + + wc->w_handle = handle; +out: + return ret; +} + +int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + if (new_size <= le16_to_cpu(di->id2.i_data.id_count)) + return 1; + return 0; +} + +static int ocfs2_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, loff_t pos, + unsigned len, struct page *mmap_page, + struct ocfs2_write_ctxt *wc) +{ + int ret, written = 0; + loff_t end = pos + len; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = NULL; + + trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno, + len, (unsigned long long)pos, + oi->ip_dyn_features); + + /* + * Handle inodes which already have inline data 1st. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (mmap_page == NULL && + ocfs2_size_fits_inline_data(wc->w_di_bh, end)) + goto do_inline_write; + + /* + * The write won't fit - we have to give this inode an + * inline extent list now. + */ + ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Check whether the inode can accept inline data. + */ + if (oi->ip_clusters != 0 || i_size_read(inode) != 0) + return 0; + + /* + * Check whether the write can fit. + */ + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + if (mmap_page || + end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) + return 0; + +do_inline_write: + ret = ocfs2_write_begin_inline(mapping, inode, wc); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * This signals to the caller that the data can be written + * inline. + */ + written = 1; +out: + return written ? written : ret; +} + +/* + * This function only does anything for file systems which can't + * handle sparse files. 
+ * + * What we want to do here is fill in any hole between the current end + * of allocation and the end of our write. That way the rest of the + * write path can treat it as an non-allocating write, which has no + * special case code for sparse/nonsparse files. + */ +static int ocfs2_expand_nonsparse_inode(struct inode *inode, + struct buffer_head *di_bh, + loff_t pos, unsigned len, + struct ocfs2_write_ctxt *wc) +{ + int ret; + loff_t newsize = pos + len; + + BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + + if (newsize <= i_size_read(inode)) + return 0; + + ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos); + if (ret) + mlog_errno(ret); + + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + + return ret; +} + +static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, + loff_t pos) +{ + int ret = 0; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + if (pos > i_size_read(inode)) + ret = ocfs2_zero_extend(inode, di_bh, pos); + + return ret; +} + +/* + * Try to flush truncate logs if we can free enough clusters from it. + * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed) +{ + tid_t target; + int ret = 0; + unsigned int truncated_clusters; + + mutex_lock(&osb->osb_tl_inode->i_mutex); + truncated_clusters = osb->truncated_clusters; + mutex_unlock(&osb->osb_tl_inode->i_mutex); + + /* + * Check whether we can succeed in allocating if we free + * the truncate log. + */ + if (truncated_clusters < needed) + goto out; + + ret = ocfs2_flush_truncate_log(osb); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { + jbd2_log_wait_commit(osb->journal->j_journal, target); + ret = 1; + } +out: + return ret; +} + +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page) +{ + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; + unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; + struct ocfs2_write_ctxt *wc; + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle; + struct ocfs2_extent_tree et; + int try_free = 1, ret1; + +try_again: + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (ocfs2_supports_inline_data(osb)) { + ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, + mmap_page, wc); + if (ret == 1) { + ret = 0; + goto success; + } + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, + wc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_check_range_for_refcount(inode, pos, len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } else if (ret == 1) { + clusters_need = wc->w_clen; + ret = ocfs2_refcount_cow(inode, filp, di_bh, + wc->w_cpos, wc->w_clen, UINT_MAX); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, + 
&extents_to_split); + if (ret) { + mlog_errno(ret); + goto out; + } + clusters_need += clusters_to_alloc; + + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + + trace_ocfs2_write_begin_nolock( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), + le32_to_cpu(di->i_clusters), + pos, len, flags, mmap_page, + clusters_to_alloc, extents_to_split); + + /* + * We set w_target_from, w_target_to here so that + * ocfs2_write_end() knows which range in the target page to + * write out. An allocation requires that we write the entire + * cluster range. + */ + if (clusters_to_alloc || extents_to_split) { + /* + * XXX: We are stretching the limits of + * ocfs2_lock_allocators(). It greatly over-estimates + * the work to be done. + */ + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); + ret = ocfs2_lock_allocators(inode, &et, + clusters_to_alloc, extents_to_split, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + credits = ocfs2_calc_extend_credits(inode->i_sb, + &di->id2.i_list, + clusters_to_alloc); + + } + + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero))) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + wc->w_handle = handle; + + if (clusters_to_alloc) { + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); + if (ret) + goto out_commit; + } + /* + * We don't want this to fail in ocfs2_write_end(), so do it + * here. + */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_quota; + } + + /* + * Fill our page array first. That way we've grabbed enough so + * that we can zero and flush if we error after adding the + * extent. + */ + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, + cluster_of_pages, mmap_page); + if (ret) { + mlog_errno(ret); + goto out_quota; + } + + ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, + len); + if (ret) { + mlog_errno(ret); + goto out_quota; + } + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + +success: + *pagep = wc->w_target_page; + *fsdata = wc; + return 0; +out_quota: + if (clusters_to_alloc) + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_write_ctxt(wc); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (ret == -ENOSPC && try_free) { + /* + * Try to free some truncate log so that we can have enough + * clusters to allocate. 
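+ * We only ever make one extra pass: try_free is cleared before
+ * retrying, and ocfs2_try_to_free_truncate_log() only returns 1 after
+ * the truncate log has been flushed and the resulting journal commit
+ * has been waited on, so the goto try_again below cannot loop.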
+ */ + try_free = 0; + + ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need); + if (ret1 == 1) + goto try_again; + + if (ret1 < 0) + mlog_errno(ret1); + } + + return ret; +} + +static int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct inode *inode = mapping->host; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* + * Take alloc sem here to prevent concurrent lookups. That way + * the mapping, zeroing and tree manipulation within + * ocfs2_write() will be safe against ->readpage(). This + * should also serve to lock out allocation from a shared + * writeable region. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, + fsdata, di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto out_fail; + } + + brelse(di_bh); + + return 0; + +out_fail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); + + return ret; +} + +static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, + unsigned len, unsigned *copied, + struct ocfs2_dinode *di, + struct ocfs2_write_ctxt *wc) +{ + void *kaddr; + + if (unlikely(*copied < len)) { + if (!PageUptodate(wc->w_target_page)) { + *copied = 0; + return; + } + } + + kaddr = kmap_atomic(wc->w_target_page, KM_USER0); + memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); + kunmap_atomic(kaddr, KM_USER0); + + trace_ocfs2_write_end_inline( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)pos, *copied, + le16_to_cpu(di->id2.i_data.id_count), + le16_to_cpu(di->i_dyn_features)); +} + +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i; + unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_write_ctxt *wc = fsdata; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + handle_t *handle = wc->w_handle; + struct page *tmppage; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); + goto out_write_size; + } + + if (unlikely(copied < len)) { + if (!PageUptodate(wc->w_target_page)) + copied = 0; + + ocfs2_zero_new_buffers(wc->w_target_page, start+copied, + start+len); + } + flush_dcache_page(wc->w_target_page); + + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; + + if (tmppage == wc->w_target_page) { + from = wc->w_target_from; + to = wc->w_target_to; + + BUG_ON(from > PAGE_CACHE_SIZE || + to > PAGE_CACHE_SIZE || + to < from); + } else { + /* + * Pages adjacent to the target (if any) imply + * a hole-filling write in which case we want + * to flush their entire range. 
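+ * (E.g. a small write into a freshly allocated 32K cluster with 4K
+ *  pages: the pages other than w_target_page were only mapped and
+ *  zero'd by write_begin, so they are committed over
+ *  0..PAGE_CACHE_SIZE here, while the target page is committed over
+ *  w_target_from..w_target_to.  Illustrative geometry only.)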
+ */ + from = 0; + to = PAGE_CACHE_SIZE; + } + + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); + block_commit_write(tmppage, from, to); + } + } + +out_write_size: + pos += copied; + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_journal_dirty(handle, wc->w_di_bh); + + ocfs2_commit_trans(osb, handle); + + ocfs2_run_deallocs(osb, &wc->w_dealloc); + + ocfs2_free_write_ctxt(wc); + + return copied; +} + +static int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + struct inode *inode = mapping->host; + + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + + return ret; +} + +const struct address_space_operations ocfs2_aops = { + .readpage = ocfs2_readpage, + .readpages = ocfs2_readpages, + .writepage = ocfs2_writepage, + .write_begin = ocfs2_write_begin, + .write_end = ocfs2_write_end, + .bmap = ocfs2_bmap, + .direct_IO = ocfs2_direct_IO, + .invalidatepage = ocfs2_invalidatepage, + .releasepage = ocfs2_releasepage, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h new file mode 100644 index 00000000..75cf3ad9 --- /dev/null +++ b/fs/ocfs2/aops.h @@ -0,0 +1,94 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_AOPS_H +#define OCFS2_AOPS_H + +handle_t *ocfs2_start_walk_page_trans(struct inode *inode, + struct page *page, + unsigned from, + unsigned to); + +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new); + +void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages); + +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)); + +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page); + +int ocfs2_read_inline_data(struct inode *inode, struct page *page, + struct buffer_head *di_bh); +int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); + +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +/* all ocfs2_dio_end_io()'s fault */ +#define ocfs2_iocb_is_rw_locked(iocb) \ + test_bit(0, (unsigned long *)&iocb->private) +static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) +{ + set_bit(0, (unsigned long *)&iocb->private); + if (level) + set_bit(1, (unsigned long *)&iocb->private); + else + clear_bit(1, (unsigned long *)&iocb->private); +} + +/* + * Using a named enum representing lock types in terms of #N bit stored in + * iocb->private, which is going to be used for communication between + * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + */ +enum ocfs2_iocb_lock_bits { + OCFS2_IOCB_RW_LOCK = 0, + OCFS2_IOCB_RW_LOCK_LEVEL, + OCFS2_IOCB_SEM, + OCFS2_IOCB_NUM_LOCKS +}; + +#define ocfs2_iocb_clear_rw_locked(iocb) \ + clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) +#define ocfs2_iocb_rw_locked_level(iocb) \ + test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) +#define ocfs2_iocb_set_sem_locked(iocb) \ + set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_sem_locked(iocb) \ + clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_is_sem_locked(iocb) \ + test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c new file mode 100644 index 00000000..c7ee03c2 --- /dev/null +++ b/fs/ocfs2/blockcheck.c @@ -0,0 +1,645 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.c + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2006, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "blockcheck.h" + + +/* + * We use the following conventions: + * + * d = # data bits + * p = # parity bits + * c = # total code bits (d + p) + */ + + +/* + * Calculate the bit offset in the hamming code buffer based on the bit's + * offset in the data buffer. Since the hamming code reserves all + * power-of-two bits for parity, the data bit number and the code bit + * number are offset by all the parity bits beforehand. + * + * Recall that bit numbers in hamming code are 1-based. This function + * takes the 0-based data bit from the caller. + * + * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0), + * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit. + * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3 + * in the code buffer. + * + * The caller can pass in *p if it wants to keep track of the most recent + * number of parity bits added. This allows the function to start the + * calculation at the last place. + */ +static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache) +{ + unsigned int b, p = 0; + + /* + * Data bits are 0-based, but we're talking code bits, which + * are 1-based. + */ + b = i + 1; + + /* Use the cache if it is there */ + if (p_cache) + p = *p_cache; + b += p; + + /* + * For every power of two below our bit number, bump our bit. + * + * We compare with (b + 1) because we have to compare with what b + * would be _if_ it were bumped up by the parity bit. Capice? + * + * p is set above. + */ + for (; (1 << p) < (b + 1); p++) + b++; + + if (p_cache) + *p_cache = p; + + return b; +} + +/* + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr) +{ + unsigned int i, b, p = 0; + + BUG_ON(!d); + + /* + * b is the hamming code bit number. Hamming code specifies a + * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is + * for the algorithm. + * + * The i++ in the for loop is so that the start offset passed + * to ocfs2_find_next_bit_set() is one greater than the previously + * found bit. + */ + for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++) + { + /* + * i is the offset in this hunk, nr + i is the total bit + * offset. + */ + b = calc_code_bit(nr + i, &p); + + /* + * Data bits in the resultant code are checked by + * parity bits that are part of the bit number + * representation. Huh? + * + * + * In other words, the parity bit at position 2^k + * checks bits in positions having bit k set in + * their binary representation. Conversely, for + * instance, bit 13, i.e. 1101(2), is checked by + * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1. + * + * + * Note that 'k' is the _code_ bit number. 'b' in + * our loop. + */ + parity ^= b; + } + + /* While the data buffer was treated as little endian, the + * return value is in host endian. 
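+ *
+ * (Tiny worked example: a single set data bit at offset 0 maps to code
+ *  bit 3 - calc_code_bit() skips the parity positions 1 and 2 - so the
+ *  value returned here would be 3, i.e. that bit is covered by parity
+ *  bits 1 and 2.)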
*/ + return parity; +} + +u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize) +{ + return ocfs2_hamming_encode(0, data, blocksize * 8, 0); +} + +/* + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one hunk, use ocfs2_hamming_fix_block(). + */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix) +{ + unsigned int i, b; + + BUG_ON(!d); + + /* + * If the bit to fix has an hweight of 1, it's a parity bit. One + * busted parity bit is its own error. Nothing to do here. + */ + if (hweight32(fix) == 1) + return; + + /* + * nr + d is the bit right past the data hunk we're looking at. + * If fix after that, nothing to do + */ + if (fix >= calc_code_bit(nr + d, NULL)) + return; + + /* + * nr is the offset in the data hunk we're starting at. Let's + * start b at the offset in the code buffer. See hamming_encode() + * for a more detailed description of 'b'. + */ + b = calc_code_bit(nr, NULL); + /* If the fix is before this hunk, nothing to do */ + if (fix < b) + return; + + for (i = 0; i < d; i++, b++) + { + /* Skip past parity bits */ + while (hweight32(b) == 1) + b++; + + /* + * i is the offset in this data hunk. + * nr + i is the offset in the total data buffer. + * b is the offset in the total code buffer. + * + * Thus, when b == fix, bit i in the current hunk needs + * fixing. + */ + if (b == fix) + { + if (ocfs2_test_bit(i, data)) + ocfs2_clear_bit(i, data); + else + ocfs2_set_bit(i, data); + break; + } + } +} + +void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix) +{ + ocfs2_hamming_fix(data, blocksize * 8, 0, fix); +} + + +/* + * Debugfs handling. 
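+ *
+ * With CONFIG_DEBUG_FS the code below creates, under the parent dentry
+ * handed to ocfs2_blockcheck_stats_debugfs_install(), a "blockcheck"
+ * directory containing three read-only u64 files:
+ *
+ *	blockcheck/blocks_checked	<- stats->b_check_count
+ *	blockcheck/checksums_failed	<- stats->b_failure_count
+ *	blockcheck/ecc_recoveries	<- stats->b_recover_count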
+ */ + +#ifdef CONFIG_DEBUG_FS + +static int blockcheck_u64_get(void *data, u64 *val) +{ + *val = *(u64 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); + +static struct dentry *blockcheck_debugfs_create(const char *name, + struct dentry *parent, + u64 *value) +{ + return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, + &blockcheck_fops); +} + +static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ + if (stats) { + debugfs_remove(stats->b_debug_check); + stats->b_debug_check = NULL; + debugfs_remove(stats->b_debug_failure); + stats->b_debug_failure = NULL; + debugfs_remove(stats->b_debug_recover); + stats->b_debug_recover = NULL; + debugfs_remove(stats->b_debug_dir); + stats->b_debug_dir = NULL; + } +} + +static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + int rc = -EINVAL; + + if (!stats) + goto out; + + stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); + if (!stats->b_debug_dir) + goto out; + + stats->b_debug_check = + blockcheck_debugfs_create("blocks_checked", + stats->b_debug_dir, + &stats->b_check_count); + + stats->b_debug_failure = + blockcheck_debugfs_create("checksums_failed", + stats->b_debug_dir, + &stats->b_failure_count); + + stats->b_debug_recover = + blockcheck_debugfs_create("ecc_recoveries", + stats->b_debug_dir, + &stats->b_recover_count); + if (stats->b_debug_check && stats->b_debug_failure && + stats->b_debug_recover) + rc = 0; + +out: + if (rc) + ocfs2_blockcheck_debug_remove(stats); + return rc; +} +#else +static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return 0; +} + +static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +/* Always-called wrappers for starting and stopping the debugfs files */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return ocfs2_blockcheck_debug_install(stats, parent); +} + +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats) +{ + ocfs2_blockcheck_debug_remove(stats); +} + +static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_check_count++; + new_count = stats->b_check_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Block check count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_failure_count++; + new_count = stats->b_failure_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Checksum failure count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_recover_count++; + new_count = stats->b_recover_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "ECC recovery count has wrapped\n"); +} + + + +/* + * These are the low-level APIs for using the ocfs2_block_check structure. + */ + +/* + * This function generates check information for a block. + * data is the block to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. 
+ * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc) +{ + u32 crc; + u32 ecc; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + crc = crc32_le(~0, data, blocksize); + ecc = ocfs2_hamming_encode_block(data, blocksize); + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHRT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information. Like _compute, + * the function will take care of zeroing bc before calculating check codes. + * If bc is not a pointer inside data, the caller must have zeroed any + * inline ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) +{ + int rc = 0; + struct ocfs2_block_check check; + u32 crc, ecc; + + ocfs2_blockcheck_inc_check(stats); + + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); + check.bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + crc = crc32_le(~0, data, blocksize); + if (crc == check.bc_crc32e) + goto out; + + ocfs2_blockcheck_inc_failure(stats); + mlog(ML_ERROR, + "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + ecc = ocfs2_hamming_encode_block(data, blocksize); + ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc); + + /* And check the crc32 again */ + crc = crc32_le(~0, data, blocksize); + if (crc == check.bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); + goto out; + } + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); + bc->bc_ecc = cpu_to_le16(check.bc_ecc); + + return rc; +} + +/* + * This function generates check information for a list of buffer_heads. + * bhs is the blocks to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. + * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int i; + u32 crc, ecc; + + BUG_ON(nr < 0); + + if (!nr) + return; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + for (i = 0, crc = ~0, ecc = 0; i < nr; i++) { + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + /* + * The number of bits in a buffer is obviously b_size*8. 
+ * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHRT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information on a list of + * buffer_heads. Like _compute_bhs, the function will take care of + * zeroing bc before calculating check codes. If bc is not a pointer + * inside data, the caller must have zeroed any inline + * ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) +{ + int i, rc = 0; + struct ocfs2_block_check check; + u32 crc, ecc, fix; + + BUG_ON(nr < 0); + + if (!nr) + return 0; + + ocfs2_blockcheck_inc_check(stats); + + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); + check.bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == check.bc_crc32e) + goto out; + + ocfs2_blockcheck_inc_failure(stats); + mlog(ML_ERROR, + "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + for (i = 0, ecc = 0; i < nr; i++) { + /* + * The number of bits in a buffer is obviously b_size*8. + * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + fix = ecc ^ check.bc_ecc; + for (i = 0; i < nr; i++) { + /* + * Try the fix against each buffer. It will only affect + * one of them. + */ + ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i, fix); + } + + /* And check the crc32 again */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == check.bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); + goto out; + } + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); + bc->bc_ecc = cpu_to_le16(check.bc_ecc); + + return rc; +} + +/* + * These are the main API. They check the superblock flag before + * calling the underlying operations. + * + * They expect the buffer(s) to be in disk format. 
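+ *
+ * A sketched calling pattern (the dinode case, not code from this
+ * file) would be something like:
+ *
+ *	ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
+ *	... submit the buffer for write ...
+ *
+ * on the way out to disk, and ocfs2_validate_meta_ecc(sb, bh->b_data,
+ * &di->i_check) from the block validate callback once a read
+ * completes, before any field of the structure is trusted.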
+ */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute(data, sb->s_blocksize, bc); +} + +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc, + &osb->osb_ecc_stats); + + return rc; +} + +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute_bhs(bhs, nr, bc); +} + +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate_bhs(bhs, nr, bc, + &osb->osb_ecc_stats); + + return rc; +} + diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h new file mode 100644 index 00000000..d4b69feb --- /dev/null +++ b/fs/ocfs2/blockcheck.h @@ -0,0 +1,107 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.h + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#ifndef OCFS2_BLOCKCHECK_H +#define OCFS2_BLOCKCHECK_H + + +/* Count errors and error correction from blockcheck.c */ +struct ocfs2_blockcheck_stats { + spinlock_t b_lock; + u64 b_check_count; /* Number of blocks we've checked */ + u64 b_failure_count; /* Number of failed checksums */ + u64 b_recover_count; /* Number of blocks fixed by ecc */ + + /* + * debugfs entries, used if this is passed to + * ocfs2_blockcheck_stats_debugfs_install() + */ + struct dentry *b_debug_dir; /* Parent of the debugfs files */ + struct dentry *b_debug_check; /* Exposes b_check_count */ + struct dentry *b_debug_failure; /* Exposes b_failure_count */ + struct dentry *b_debug_recover; /* Exposes b_recover_count */ +}; + + +/* High level block API */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); + +/* Lower level API */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); + +/* Debug Initialization */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent); +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats); + +/* + * Hamming code functions + */ + +/* + * Encoding hamming code parity bits for a buffer. + * + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, + unsigned int nr); +/* + * Fix a buffer with a bit error. The 'fix' is the original parity + * xor'd with the parity calculated now. + * + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one buffer, use ocfs2_hamming_fix_block(). 
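+ *
+ * Mirroring the two-buffer encode example above, a fix pass over the
+ * same hunks looks like:
+ *
+ *	fix = stored_parity ^ computed_parity;
+ *	ocfs2_hamming_fix(buf1, 512 * 8, 0, fix);
+ *	ocfs2_hamming_fix(buf2, 512 * 8, 512 * 8, fix);
+ *
+ * Only the hunk that actually contains the flipped bit is modified.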
+ */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix); + +/* Convenience wrappers for a single buffer of data */ +extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize); +extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix); +#endif diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c new file mode 100644 index 00000000..5d18ad10 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.c @@ -0,0 +1,429 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * io.c + * + * Buffer cache handling + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "inode.h" +#include "journal.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "ocfs2_trace.h" + +/* + * Bits on bh->b_state used by ocfs2. + * + * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart. + */ +enum ocfs2_state_bits { + BH_NeedsValidate = BH_JBDPrivateStart, +}; + +/* Expand the magic b_state functions */ +BUFFER_FNS(NeedsValidate, needs_validate); + +int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, + struct ocfs2_caching_info *ci) +{ + int ret = 0; + + trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci); + + BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); + BUG_ON(buffer_jbd(bh)); + + /* No need to check for a soft readonly file system here. non + * journalled writes are only ever done on system files which + * can get modified during recovery even if read-only. */ + if (ocfs2_is_hard_readonly(osb)) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } + + ocfs2_metadata_cache_io_lock(ci); + + lock_buffer(bh); + set_buffer_uptodate(bh); + + /* remove from dirty list before I/O. */ + clear_buffer_dirty(bh); + + get_bh(bh); /* for end_buffer_write_sync() */ + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, bh); + + wait_on_buffer(bh); + + if (buffer_uptodate(bh)) { + ocfs2_set_buffer_uptodate(ci, bh); + } else { + /* We don't need to remove the clustered uptodate + * information for this bh as it's not marked locally + * uptodate. 
*/ + ret = -EIO; + put_bh(bh); + mlog_errno(ret); + } + + ocfs2_metadata_cache_io_unlock(ci); +out: + return ret; +} + +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]) +{ + int status = 0; + unsigned int i; + struct buffer_head *bh; + + trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); + + if (!nr) + goto bail; + + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(osb->sb, block++); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + + if (buffer_jbd(bh)) { + trace_ocfs2_read_blocks_sync_jbd( + (unsigned long long)bh->b_blocknr); + continue; + } + + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + mlog(ML_ERROR, + "trying to sync read a dirty " + "buffer! (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + } + + for (i = nr; i > 0; i--) { + bh = bhs[i - 1]; + + /* No need to wait on the buffer if it's managed by JBD. */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. */ + status = -EIO; + put_bh(bh); + bhs[i - 1] = NULL; + } + } + +bail: + return status; +} + +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + int i, ignore_cache = 0; + struct buffer_head *bh; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + + trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); + + BUG_ON(!ci); + BUG_ON((flags & OCFS2_BH_READAHEAD) && + (flags & OCFS2_BH_IGNORE_CACHE)); + + if (bhs == NULL) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr < 0) { + mlog(ML_ERROR, "asked to read %d blocks!\n", nr); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr == 0) { + status = 0; + goto bail; + } + + ocfs2_metadata_cache_io_lock(ci); + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(sb, block++); + if (bhs[i] == NULL) { + ocfs2_metadata_cache_io_unlock(ci); + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); + + /* There are three read-ahead cases here which we need to + * be concerned with. All three assume a buffer has + * previously been submitted with OCFS2_BH_READAHEAD + * and it hasn't yet completed I/O. + * + * 1) The current request is sync to disk. This rarely + * happens these days, and never when performance + * matters - the code can just wait on the buffer + * lock and re-submit. + * + * 2) The current request is cached, but not + * readahead. ocfs2_buffer_uptodate() will return + * false anyway, so we'll wind up waiting on the + * buffer lock to do I/O. We re-check the request + * with after getting the lock to avoid a re-submit. + * + * 3) The current request is readahead (and so must + * also be a caching one). 
We short circuit if the + * buffer is locked (under I/O) and if it's in the + * uptodate cache. The re-check from #2 catches the + * case that the previous read-ahead completes just + * before our is-it-in-flight check. + */ + + if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { + trace_ocfs2_read_blocks_from_disk( + (unsigned long long)bh->b_blocknr, + (unsigned long long)ocfs2_metadata_cache_owner(ci)); + /* We're using ignore_cache here to say + * "go to disk" */ + ignore_cache = 1; + } + + trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr, + ignore_cache, buffer_jbd(bh), buffer_dirty(bh)); + + if (buffer_jbd(bh)) { + continue; + } + + if (ignore_cache) { + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + continue; + } + + /* A read-ahead request was made - if the + * buffer is already under read-ahead from a + * previously submitted request than we are + * done here. */ + if ((flags & OCFS2_BH_READAHEAD) + && ocfs2_buffer_read_ahead(ci, bh)) + continue; + + lock_buffer(bh); + if (buffer_jbd(bh)) { +#ifdef CATCH_BH_JBD_RACES + mlog(ML_ERROR, "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); +#else + unlock_buffer(bh); + continue; +#endif + } + + /* Re-check ocfs2_buffer_uptodate() as a + * previously read-ahead buffer may have + * completed I/O while we were waiting for the + * buffer lock. */ + if (!(flags & OCFS2_BH_IGNORE_CACHE) + && !(flags & OCFS2_BH_READAHEAD) + && ocfs2_buffer_uptodate(ci, bh)) { + unlock_buffer(bh); + continue; + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + if (validate) + set_buffer_needs_validate(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + continue; + } + } + + status = 0; + + for (i = (nr - 1); i >= 0; i--) { + bh = bhs[i]; + + if (!(flags & OCFS2_BH_READAHEAD)) { + /* We know this can't have changed as we hold the + * owner sem. Avoid doing any work on the bh if the + * journal has it. */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. Don't need to + * remove the clustered uptodate information + * for this bh as it's not marked locally + * uptodate. */ + status = -EIO; + put_bh(bh); + bhs[i] = NULL; + continue; + } + + if (buffer_needs_validate(bh)) { + /* We never set NeedsValidate if the + * buffer was held by the journal, so + * that better not have changed */ + BUG_ON(buffer_jbd(bh)); + clear_buffer_needs_validate(bh); + status = validate(sb, bh); + if (status) { + put_bh(bh); + bhs[i] = NULL; + continue; + } + } + } + + /* Always set the buffer in the cache, even if it was + * a forced read, or read-ahead which hasn't yet + * completed. */ + ocfs2_set_buffer_uptodate(ci, bh); + } + ocfs2_metadata_cache_io_unlock(ci); + + trace_ocfs2_read_blocks_end((unsigned long long)block, nr, + flags, ignore_cache); + +bail: + + return status; +} + +/* Check whether the blkno is the super block or one of the backups. 
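+ * Anything else is a caller bug, so this BUG()s instead of returning
+ * an error.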
*/ +static void ocfs2_check_super_or_backup(struct super_block *sb, + sector_t blkno) +{ + int i; + u64 backup_blkno; + + if (blkno == OCFS2_SUPER_BLOCK_BLKNO) + return; + + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + backup_blkno = ocfs2_backup_super_blkno(sb, i); + if (backup_blkno == blkno) + return; + } + + BUG(); +} + +/* + * Write super block and backups doesn't need to collaborate with journal, + * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed + * into this function. + */ +int ocfs2_write_super_or_backup(struct ocfs2_super *osb, + struct buffer_head *bh) +{ + int ret = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + BUG_ON(buffer_jbd(bh)); + ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } + + lock_buffer(bh); + set_buffer_uptodate(bh); + + /* remove from dirty list before I/O. */ + clear_buffer_dirty(bh); + + get_bh(bh); /* for end_buffer_write_sync() */ + bh->b_end_io = end_buffer_write_sync; + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); + submit_bh(WRITE, bh); + + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + ret = -EIO; + put_bh(bh); + mlog_errno(ret); + } + +out: + return ret; +} diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h new file mode 100644 index 00000000..b97bcc6d --- /dev/null +++ b/fs/ocfs2/buffer_head_io.h @@ -0,0 +1,77 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_buffer_head.h + * + * Buffer cache handling functions defined + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_BUFFER_HEAD_IO_H +#define OCFS2_BUFFER_HEAD_IO_H + +#include + +void ocfs2_end_buffer_io_sync(struct buffer_head *bh, + int uptodate); + +int ocfs2_write_block(struct ocfs2_super *osb, + struct buffer_head *bh, + struct ocfs2_caching_info *ci); +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]); + +/* + * If not NULL, validate() will be called on a buffer that is freshly + * read from disk. It will not be called if the buffer was in cache. + * Note that if validate() is being used for this buffer, it needs to + * be set even for a READAHEAD call, as it marks the buffer for later + * validation. 
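+ *
+ * A typical single-block caller looks roughly like this (my_validate
+ * is a stand-in for a real validator):
+ *
+ *	status = ocfs2_read_blocks(ci, blkno, 1, &bh, 0, my_validate);
+ *	if (status)
+ *		goto out;
+ *
+ * The ocfs2_read_block() wrapper below wraps exactly this call for
+ * the one-buffer case.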
+ */ +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); + +int ocfs2_write_super_or_backup(struct ocfs2_super *osb, + struct buffer_head *bh); + +#define OCFS2_BH_IGNORE_CACHE 1 +#define OCFS2_BH_READAHEAD 8 + +static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off, + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate); + +bail: + return status; +} + +#endif /* OCFS2_BUFFER_HEAD_IO_H */ diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile new file mode 100644 index 00000000..bc8c5e7d --- /dev/null +++ b/fs/ocfs2/cluster/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o + +ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ + quorum.o tcp.o netdebug.o ver.o diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c new file mode 100644 index 00000000..9a3e6bbf --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.c @@ -0,0 +1,2628 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#include "quorum.h" + +#include "masklog.h" + + +/* + * The first heartbeat pass had one global thread that would serialize all hb + * callback calls. This global serializing sem should only be removed once + * we've made sure that all callees can deal with being called concurrently + * from multiple hb region threads. + */ +static DECLARE_RWSEM(o2hb_callback_sem); + +/* + * multiple hb threads are watching multiple regions. A node is live + * whenever any of the threads sees activity from the node in its region. + */ +static DEFINE_SPINLOCK(o2hb_live_lock); +static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; +static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +static LIST_HEAD(o2hb_node_events); +static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); + +/* + * In global heartbeat, we maintain a series of region bitmaps. + * - o2hb_region_bitmap allows us to limit the region number to max region. + * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). + * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes + * heartbeat on it. 
+ * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. + */ +static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; + +#define O2HB_DB_TYPE_LIVENODES 0 +#define O2HB_DB_TYPE_LIVEREGIONS 1 +#define O2HB_DB_TYPE_QUORUMREGIONS 2 +#define O2HB_DB_TYPE_FAILEDREGIONS 3 +#define O2HB_DB_TYPE_REGION_LIVENODES 4 +#define O2HB_DB_TYPE_REGION_NUMBER 5 +#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 +#define O2HB_DB_TYPE_REGION_PINNED 7 +struct o2hb_debug_buf { + int db_type; + int db_size; + int db_len; + void *db_data; +}; + +static struct o2hb_debug_buf *o2hb_db_livenodes; +static struct o2hb_debug_buf *o2hb_db_liveregions; +static struct o2hb_debug_buf *o2hb_db_quorumregions; +static struct o2hb_debug_buf *o2hb_db_failedregions; + +#define O2HB_DEBUG_DIR "o2hb" +#define O2HB_DEBUG_LIVENODES "livenodes" +#define O2HB_DEBUG_LIVEREGIONS "live_regions" +#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" +#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" +#define O2HB_DEBUG_REGION_NUMBER "num" +#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" +#define O2HB_DEBUG_REGION_PINNED "pinned" + +static struct dentry *o2hb_debug_dir; +static struct dentry *o2hb_debug_livenodes; +static struct dentry *o2hb_debug_liveregions; +static struct dentry *o2hb_debug_quorumregions; +static struct dentry *o2hb_debug_failedregions; + +static LIST_HEAD(o2hb_all_regions); + +static struct o2hb_callback { + struct list_head list; +} o2hb_callbacks[O2HB_NUM_CB]; + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); + +#define O2HB_DEFAULT_BLOCK_BITS 9 + +enum o2hb_heartbeat_modes { + O2HB_HEARTBEAT_LOCAL = 0, + O2HB_HEARTBEAT_GLOBAL, + O2HB_HEARTBEAT_NUM_MODES, +}; + +char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ +}; + +unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; +unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; + +/* + * o2hb_dependent_users tracks the number of registered callbacks that depend + * on heartbeat. o2net and o2dlm are two entities that register this callback. + * However only o2dlm depends on the heartbeat. It does not want the heartbeat + * to stop while a dlm domain is still active. + */ +unsigned int o2hb_dependent_users; + +/* + * In global heartbeat mode, all regions are pinned if there are one or more + * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All + * regions are unpinned if the region count exceeds the cut off or the number + * of dependent users falls to zero. + */ +#define O2HB_PIN_CUT_OFF 3 + +/* + * In local heartbeat mode, we assume the dlm domain name to be the same as + * region uuid. This is true for domains created for the file system but not + * necessarily true for userdlm domains. This is a known limitation. + * + * In global heartbeat mode, we pin/unpin all o2hb regions. This solution + * works for both file system and userdlm domains. + */ +static int o2hb_region_pin(const char *region_uuid); +static void o2hb_region_unpin(const char *region_uuid); + +/* Only sets a new threshold if there are no active regions. 
+ * + * No locking or otherwise interesting code is required for reading + * o2hb_dead_threshold as it can't change once regions are active and + * it's not interesting to anyone until then anyway. */ +static void o2hb_dead_threshold_set(unsigned int threshold) +{ + if (threshold > O2HB_MIN_DEAD_THRESHOLD) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) + o2hb_dead_threshold = threshold; + spin_unlock(&o2hb_live_lock); + } +} + +static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) +{ + int ret = -1; + + if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) { + o2hb_heartbeat_mode = hb_mode; + ret = 0; + } + spin_unlock(&o2hb_live_lock); + } + + return ret; +} + +struct o2hb_node_event { + struct list_head hn_item; + enum o2hb_callback_type hn_event_type; + struct o2nm_node *hn_node; + int hn_node_num; +}; + +struct o2hb_disk_slot { + struct o2hb_disk_heartbeat_block *ds_raw_block; + u8 ds_node_num; + u64 ds_last_time; + u64 ds_last_generation; + u16 ds_equal_samples; + u16 ds_changed_samples; + struct list_head ds_live_item; +}; + +/* each thread owns a region.. when we're asked to tear down the region + * we ask the thread to stop, who cleans up the region */ +struct o2hb_region { + struct config_item hr_item; + + struct list_head hr_all_item; + unsigned hr_unclean_stop:1, + hr_item_pinned:1, + hr_item_dropped:1; + + /* protected by the hr_callback_sem */ + struct task_struct *hr_task; + + unsigned int hr_blocks; + unsigned long long hr_start_block; + + unsigned int hr_block_bits; + unsigned int hr_block_bytes; + + unsigned int hr_slots_per_page; + unsigned int hr_num_pages; + + struct page **hr_slot_data; + struct block_device *hr_bdev; + struct o2hb_disk_slot *hr_slots; + + /* live node map of this region */ + unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned int hr_region_num; + + struct dentry *hr_debug_dir; + struct dentry *hr_debug_livenodes; + struct dentry *hr_debug_regnum; + struct dentry *hr_debug_elapsed_time; + struct dentry *hr_debug_pinned; + struct o2hb_debug_buf *hr_db_livenodes; + struct o2hb_debug_buf *hr_db_regnum; + struct o2hb_debug_buf *hr_db_elapsed_time; + struct o2hb_debug_buf *hr_db_pinned; + + /* let the person setting up hb wait for it to return until it + * has reached a 'steady' state. This will be fixed when we have + * a more complete api that doesn't lead to this sort of fragility. */ + atomic_t hr_steady_iterations; + + char hr_dev_name[BDEVNAME_SIZE]; + + unsigned int hr_timeout_ms; + + /* randomized as the region goes up and down so that a node + * recognizes a node going up and down in one iteration */ + u64 hr_generation; + + struct delayed_work hr_write_timeout_work; + unsigned long hr_last_timeout_start; + + /* Used during o2hb_check_slot to hold a copy of the block + * being checked because we temporarily have to zero out the + * crc field. 
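+	 * (o2hb_compute_block_crc_le() zeroes and restores hb_cksum on
+	 * this copy rather than on the live slot data.)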
*/ + struct o2hb_disk_heartbeat_block *hr_tmp_block; +}; + +struct o2hb_bio_wait_ctxt { + atomic_t wc_num_reqs; + struct completion wc_io_complete; + int wc_error; +}; + +static int o2hb_pop_count(void *map, int count) +{ + int i = -1, pop = 0; + + while ((i = find_next_bit(map, count, i + 1)) < count) + pop++; + return pop; +} + +static void o2hb_write_timeout(struct work_struct *work) +{ + int failed, quorum; + unsigned long flags; + struct o2hb_region *reg = + container_of(work, struct o2hb_region, + hr_write_timeout_work.work); + + mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " + "milliseconds\n", reg->hr_dev_name, + jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + + if (o2hb_global_heartbeat_active()) { + spin_lock_irqsave(&o2hb_live_lock, flags); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + failed = o2hb_pop_count(&o2hb_failed_region_bitmap, + O2NM_MAX_REGIONS); + quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + + mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", + quorum, failed); + + /* + * Fence if the number of failed regions >= half the number + * of quorum regions + */ + if ((failed << 1) < quorum) + return; + } + + o2quo_disk_timeout(); +} + +static void o2hb_arm_write_timeout(struct o2hb_region *reg) +{ + mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", + O2HB_MAX_WRITE_TIMEOUT_MS); + + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + spin_unlock(&o2hb_live_lock); + } + cancel_delayed_work(®->hr_write_timeout_work); + reg->hr_last_timeout_start = jiffies; + schedule_delayed_work(®->hr_write_timeout_work, + msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); +} + +static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +{ + cancel_delayed_work_sync(®->hr_write_timeout_work); +} + +static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) +{ + atomic_set(&wc->wc_num_reqs, 1); + init_completion(&wc->wc_io_complete); + wc->wc_error = 0; +} + +/* Used in error paths too */ +static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, + unsigned int num) +{ + /* sadly atomic_sub_and_test() isn't available on all platforms. The + * good news is that the fast path only completes one at a time */ + while(num--) { + if (atomic_dec_and_test(&wc->wc_num_reqs)) { + BUG_ON(num > 0); + complete(&wc->wc_io_complete); + } + } +} + +static void o2hb_wait_on_io(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc) +{ + o2hb_bio_wait_dec(wc, 1); + wait_for_completion(&wc->wc_io_complete); +} + +static void o2hb_bio_end_io(struct bio *bio, + int error) +{ + struct o2hb_bio_wait_ctxt *wc = bio->bi_private; + + if (error) { + mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } + + o2hb_bio_wait_dec(wc, 1); + bio_put(bio); +} + +/* Setup a Bio to cover I/O against num_slots slots starting at + * start_slot. 
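+ *
+ * The bio works in 512 byte sectors, so slot N starts at sector
+ * (hr_start_block + N) << (hr_block_bits - 9); with 4K heartbeat
+ * blocks that is a left shift by 3.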
*/ +static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc, + unsigned int *current_slot, + unsigned int max_slots) +{ + int len, current_page; + unsigned int vec_len, vec_start; + unsigned int bits = reg->hr_block_bits; + unsigned int spp = reg->hr_slots_per_page; + unsigned int cs = *current_slot; + struct bio *bio; + struct page *page; + + /* Testing has shown this allocation to take long enough under + * GFP_KERNEL that the local node can get fenced. It would be + * nicest if we could pre-allocate these bios and avoid this + * all together. */ + bio = bio_alloc(GFP_ATOMIC, 16); + if (!bio) { + mlog(ML_ERROR, "Could not alloc slots BIO!\n"); + bio = ERR_PTR(-ENOMEM); + goto bail; + } + + /* Must put everything in 512 byte sectors for the bio... */ + bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); + bio->bi_bdev = reg->hr_bdev; + bio->bi_private = wc; + bio->bi_end_io = o2hb_bio_end_io; + + vec_start = (cs << bits) % PAGE_CACHE_SIZE; + while(cs < max_slots) { + current_page = cs / spp; + page = reg->hr_slot_data[current_page]; + + vec_len = min(PAGE_CACHE_SIZE - vec_start, + (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); + + mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", + current_page, vec_len, vec_start); + + len = bio_add_page(bio, page, vec_len, vec_start); + if (len != vec_len) break; + + cs += vec_len / (PAGE_CACHE_SIZE/spp); + vec_start = 0; + } + +bail: + *current_slot = cs; + return bio; +} + +static int o2hb_read_slots(struct o2hb_region *reg, + unsigned int max_slots) +{ + unsigned int current_slot=0; + int status; + struct o2hb_bio_wait_ctxt wc; + struct bio *bio; + + o2hb_bio_wait_init(&wc); + + while(current_slot < max_slots) { + bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); + if (IS_ERR(bio)) { + status = PTR_ERR(bio); + mlog_errno(status); + goto bail_and_wait; + } + + atomic_inc(&wc.wc_num_reqs); + submit_bio(READ, bio); + } + + status = 0; + +bail_and_wait: + o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; + + return status; +} + +static int o2hb_issue_node_write(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *write_wc) +{ + int status; + unsigned int slot; + struct bio *bio; + + o2hb_bio_wait_init(write_wc); + + slot = o2nm_this_node(); + + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); + if (IS_ERR(bio)) { + status = PTR_ERR(bio); + mlog_errno(status); + goto bail; + } + + atomic_inc(&write_wc->wc_num_reqs); + submit_bio(WRITE, bio); + + status = 0; +bail: + return status; +} + +static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + __le32 old_cksum; + u32 ret; + + /* We want to compute the block crc with a 0 value in the + * hb_cksum field. Save it off here and replace after the + * crc. 
*/ + old_cksum = hb_block->hb_cksum; + hb_block->hb_cksum = 0; + + ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes); + + hb_block->hb_cksum = old_cksum; + + return ret; +} + +static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block) +{ + mlog(ML_ERROR, "Dump slot information: seq = 0x%llx, node = %u, " + "cksum = 0x%x, generation 0x%llx\n", + (long long)le64_to_cpu(hb_block->hb_seq), + hb_block->hb_node, le32_to_cpu(hb_block->hb_cksum), + (long long)le64_to_cpu(hb_block->hb_generation)); +} + +static int o2hb_verify_crc(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + u32 read, computed; + + read = le32_to_cpu(hb_block->hb_cksum); + computed = o2hb_compute_block_crc_le(reg, hb_block); + + return read == computed; +} + +/* We want to make sure that nobody is heartbeating on top of us -- + * this will help detect an invalid configuration. */ +static void o2hb_check_last_timestamp(struct o2hb_region *reg) +{ + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + char *errstr; + + slot = ®->hr_slots[o2nm_this_node()]; + /* Don't check on our 1st timestamp */ + if (!slot->ds_last_time) + return; + + hb_block = slot->ds_raw_block; + if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && + le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && + hb_block->hb_node == slot->ds_node_num) + return; + +#define ERRSTR1 "Another node is heartbeating on device" +#define ERRSTR2 "Heartbeat generation mismatch on device" +#define ERRSTR3 "Heartbeat sequence mismatch on device" + + if (hb_block->hb_node != slot->ds_node_num) + errstr = ERRSTR1; + else if (le64_to_cpu(hb_block->hb_generation) != + slot->ds_last_generation) + errstr = ERRSTR2; + else + errstr = ERRSTR3; + + mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), " + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name, + slot->ds_node_num, (unsigned long long)slot->ds_last_generation, + (unsigned long long)slot->ds_last_time, hb_block->hb_node, + (unsigned long long)le64_to_cpu(hb_block->hb_generation), + (unsigned long long)le64_to_cpu(hb_block->hb_seq)); +} + +static inline void o2hb_prepare_block(struct o2hb_region *reg, + u64 generation) +{ + int node_num; + u64 cputime; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + node_num = o2nm_this_node(); + slot = ®->hr_slots[node_num]; + + hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; + memset(hb_block, 0, reg->hr_block_bytes); + /* TODO: time stuff */ + cputime = CURRENT_TIME.tv_sec; + if (!cputime) + cputime = 1; + + hb_block->hb_seq = cpu_to_le64(cputime); + hb_block->hb_node = node_num; + hb_block->hb_generation = cpu_to_le64(generation); + hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); + + /* This step must always happen last! 
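+	 * The checksum covers every other field of the block, so
+	 * anything written after it would make the block fail
+	 * o2hb_verify_crc() on the readers.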
*/ + hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, + hb_block)); + + mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n", + (long long)generation, + le32_to_cpu(hb_block->hb_cksum)); +} + +static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, + struct o2nm_node *node, + int idx) +{ + struct list_head *iter; + struct o2hb_callback_func *f; + + list_for_each(iter, &hbcall->list) { + f = list_entry(iter, struct o2hb_callback_func, hc_item); + mlog(ML_HEARTBEAT, "calling funcs %p\n", f); + (f->hc_func)(node, idx, f->hc_data); + } +} + +/* Will run the list in order until we process the passed event */ +static void o2hb_run_event_list(struct o2hb_node_event *queued_event) +{ + int empty; + struct o2hb_callback *hbcall; + struct o2hb_node_event *event; + + spin_lock(&o2hb_live_lock); + empty = list_empty(&queued_event->hn_item); + spin_unlock(&o2hb_live_lock); + if (empty) + return; + + /* Holding callback sem assures we don't alter the callback + * lists when doing this, and serializes ourselves with other + * processes wanting callbacks. */ + down_write(&o2hb_callback_sem); + + spin_lock(&o2hb_live_lock); + while (!list_empty(&o2hb_node_events) + && !list_empty(&queued_event->hn_item)) { + event = list_entry(o2hb_node_events.next, + struct o2hb_node_event, + hn_item); + list_del_init(&event->hn_item); + spin_unlock(&o2hb_live_lock); + + mlog(ML_HEARTBEAT, "Node %s event for %d\n", + event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN", + event->hn_node_num); + + hbcall = hbcall_from_type(event->hn_event_type); + + /* We should *never* have gotten on to the list with a + * bad type... This isn't something that we should try + * to recover from. */ + BUG_ON(IS_ERR(hbcall)); + + o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num); + + spin_lock(&o2hb_live_lock); + } + spin_unlock(&o2hb_live_lock); + + up_write(&o2hb_callback_sem); +} + +static void o2hb_queue_node_event(struct o2hb_node_event *event, + enum o2hb_callback_type type, + struct o2nm_node *node, + int node_num) +{ + assert_spin_locked(&o2hb_live_lock); + + BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); + + event->hn_event_type = type; + event->hn_node = node; + event->hn_node_num = node_num; + + mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n", + type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num); + + list_add_tail(&event->hn_item, &o2hb_node_events); +} + +static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) +{ + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) + return; + + spin_lock(&o2hb_live_lock); + if (!list_empty(&slot->ds_live_item)) { + mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n", + slot->ds_node_num); + + list_del_init(&slot->ds_live_item); + + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, + slot->ds_node_num); + } + } + spin_unlock(&o2hb_live_lock); + + o2hb_run_event_list(&event); + + o2nm_node_put(node); +} + +static void o2hb_set_quorum_device(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + assert_spin_locked(&o2hb_live_lock); + + if (!o2hb_global_heartbeat_active()) + return; + + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + return; + + /* + * A region can be added to the quorum only when it sees all + * live nodes heartbeat on it. 
In other words, the region has been + * added to all nodes. + */ + if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + sizeof(o2hb_live_node_bitmap))) + return; + + if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) + return; + + printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", + config_item_name(®->hr_item)); + + set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + + /* + * If global heartbeat active, unpin all regions if the + * region count > CUT_OFF + */ + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) + o2hb_region_unpin(NULL); +} + +static int o2hb_check_slot(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + int changed = 0, gen_changed = 0; + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; + u64 cputime; + unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; + unsigned int slot_dead_ms; + int tmp; + + memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); + + /* + * If a node is no longer configured but is still in the livemap, we + * may need to clear that bit from the livemap. + */ + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) { + spin_lock(&o2hb_live_lock); + tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); + spin_unlock(&o2hb_live_lock); + if (!tmp) + return 0; + } + + if (!o2hb_verify_crc(reg, hb_block)) { + /* all paths from here will drop o2hb_live_lock for + * us. */ + spin_lock(&o2hb_live_lock); + + /* Don't print an error on the console in this case - + * a freshly formatted heartbeat area will not have a + * crc set on it. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* The node is live but pushed out a bad crc. We + * consider it a transient miss but don't populate any + * other values as they may be junk. */ + mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", + slot->ds_node_num, reg->hr_dev_name); + o2hb_dump_slot(hb_block); + + slot->ds_equal_samples++; + goto fire_callbacks; + } + + /* we don't care if these wrap.. the state transitions below + * clear at the right places */ + cputime = le64_to_cpu(hb_block->hb_seq); + if (slot->ds_last_time != cputime) + slot->ds_changed_samples++; + else + slot->ds_equal_samples++; + slot->ds_last_time = cputime; + + /* The node changed heartbeat generations. We assume this to + * mean it dropped off but came back before we timed out. We + * want to consider it down for the time being but don't want + * to lose any changed_samples state we might build up to + * considering it live again. 
*/ + if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) { + gen_changed = 1; + slot->ds_equal_samples = 0; + mlog(ML_HEARTBEAT, "Node %d changed generation (0x%llx " + "to 0x%llx)\n", slot->ds_node_num, + (long long)slot->ds_last_generation, + (long long)le64_to_cpu(hb_block->hb_generation)); + } + + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + + mlog(ML_HEARTBEAT, "Slot %d gen 0x%llx cksum 0x%x " + "seq %llu last %llu changed %u equal %u\n", + slot->ds_node_num, (long long)slot->ds_last_generation, + le32_to_cpu(hb_block->hb_cksum), + (unsigned long long)le64_to_cpu(hb_block->hb_seq), + (unsigned long long)slot->ds_last_time, slot->ds_changed_samples, + slot->ds_equal_samples); + + spin_lock(&o2hb_live_lock); + +fire_callbacks: + /* dead nodes only come to life after some number of + * changes at any time during their dead time */ + if (list_empty(&slot->ds_live_item) && + slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) { + mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", + slot->ds_node_num, (long long)slot->ds_last_generation); + + set_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + + /* first on the list generates a callback */ + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes " + "bitmap\n", slot->ds_node_num); + set_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, + slot->ds_node_num); + + changed = 1; + } + + list_add_tail(&slot->ds_live_item, + &o2hb_live_slots[slot->ds_node_num]); + + slot->ds_equal_samples = 0; + + /* We want to be sure that all nodes agree on the + * number of milliseconds before a node will be + * considered dead. The self-fencing timeout is + * computed from this value, and a discrepancy might + * result in heartbeat calling a node dead when it + * hasn't self-fenced yet. */ + slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); + if (slot_dead_ms && slot_dead_ms != dead_ms) { + /* TODO: Perhaps we can fail the region here. */ + mlog(ML_ERROR, "Node %d on device %s has a dead count " + "of %u ms, but our count is %u ms.\n" + "Please double check your configuration values " + "for 'O2CB_HEARTBEAT_THRESHOLD'\n", + slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, + dead_ms); + } + goto out; + } + + /* if the list is dead, we're done.. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* live nodes only go dead after enough consequtive missed + * samples.. reset the missed counter whenever we see + * activity */ + if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { + mlog(ML_HEARTBEAT, "Node %d left my region\n", + slot->ds_node_num); + + clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + + /* last off the live_slot generates a callback */ + list_del_init(&slot->ds_live_item); + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live " + "nodes bitmap\n", slot->ds_node_num); + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + /* node can be null */ + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, + node, slot->ds_node_num); + + changed = 1; + } + + /* We don't clear this because the node is still + * actually writing new blocks. 
*/ + if (!gen_changed) + slot->ds_changed_samples = 0; + goto out; + } + if (slot->ds_changed_samples) { + slot->ds_changed_samples = 0; + slot->ds_equal_samples = 0; + } +out: + o2hb_set_quorum_device(reg, slot); + + spin_unlock(&o2hb_live_lock); + + o2hb_run_event_list(&event); + + if (node) + o2nm_node_put(node); + return changed; +} + +/* This could be faster if we just implmented a find_last_bit, but I + * don't think the circumstances warrant it. */ +static int o2hb_highest_node(unsigned long *nodes, + int numbits) +{ + int highest, node; + + highest = numbits; + node = -1; + while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) { + if (node >= numbits) + break; + + highest = node; + } + + return highest; +} + +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) +{ + int i, ret, highest_node, change = 0; + unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct o2hb_bio_wait_ctxt write_wc; + + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* + * If a node is not configured but is in the livemap, we still need + * to read the slot so as to be able to remove it from the livemap. + */ + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + set_bit(i, configured_nodes); + } + + highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); + if (highest_node >= O2NM_MAX_NODES) { + mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); + return -EINVAL; + } + + /* No sense in reading the slots of nodes that don't exist + * yet. Of course, if the node definitions have holes in them + * then we're reading an empty slot anyway... Consider this + * best-effort. */ + ret = o2hb_read_slots(reg, highest_node + 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* With an up to date view of the slots, we can check that no + * other node has been improperly configured to heartbeat in + * our slot. */ + o2hb_check_last_timestamp(reg); + + /* fill in the proper info for our next heartbeat */ + o2hb_prepare_block(reg, reg->hr_generation); + + /* And fire off the write. Note that we don't wait on this I/O + * until later. */ + ret = o2hb_issue_node_write(reg, &write_wc); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + i = -1; + while((i = find_next_bit(configured_nodes, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + change |= o2hb_check_slot(reg, ®->hr_slots[i]); + } + + /* + * We have to be sure we've advertised ourselves on disk + * before we can go to steady state. This ensures that + * people we find in our steady state have seen us. + */ + o2hb_wait_on_io(reg, &write_wc); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + + o2hb_arm_write_timeout(reg); + + /* let the person who launched us know when things are steady */ + if (!change && (atomic_read(®->hr_steady_iterations) != 0)) { + if (atomic_dec_and_test(®->hr_steady_iterations)) + wake_up(&o2hb_steady_queue); + } + + return 0; +} + +/* Subtract b from a, storing the result in a. a *must* have a larger + * value than b. 
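+ * (In practice the function is defensive: if a is earlier than b the
+ * result is clamped to zero rather than going negative.)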
*/ +static void o2hb_tv_subtract(struct timeval *a, + struct timeval *b) +{ + /* just return 0 when a is after b */ + if (a->tv_sec < b->tv_sec || + (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { + a->tv_sec = 0; + a->tv_usec = 0; + return; + } + + a->tv_sec -= b->tv_sec; + a->tv_usec -= b->tv_usec; + while ( a->tv_usec < 0 ) { + a->tv_sec--; + a->tv_usec += 1000000; + } +} + +static unsigned int o2hb_elapsed_msecs(struct timeval *start, + struct timeval *end) +{ + struct timeval res = *end; + + o2hb_tv_subtract(&res, start); + + return res.tv_sec * 1000 + res.tv_usec / 1000; +} + +/* + * we ride the region ref that the region dir holds. before the region + * dir is removed and drops it ref it will wait to tear down this + * thread. + */ +static int o2hb_thread(void *data) +{ + int i, ret; + struct o2hb_region *reg = data; + struct o2hb_bio_wait_ctxt write_wc; + struct timeval before_hb, after_hb; + unsigned int elapsed_msec; + + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); + + set_user_nice(current, -20); + + /* Pin node */ + o2nm_depend_this_node(); + + while (!kthread_should_stop() && !reg->hr_unclean_stop) { + /* We track the time spent inside + * o2hb_do_disk_heartbeat so that we avoid more than + * hr_timeout_ms between disk writes. On busy systems + * this should result in a heartbeat which is less + * likely to time itself out. */ + do_gettimeofday(&before_hb); + + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); + + do_gettimeofday(&after_hb); + elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + + mlog(ML_HEARTBEAT, + "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + before_hb.tv_sec, (unsigned long) before_hb.tv_usec, + after_hb.tv_sec, (unsigned long) after_hb.tv_usec, + elapsed_msec); + + if (elapsed_msec < reg->hr_timeout_ms) { + /* the kthread api has blocked signals for us so no + * need to record the return value. */ + msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); + } + } + + o2hb_disarm_write_timeout(reg); + + /* unclean stop is only used in very bad situation */ + for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) + o2hb_shutdown_slot(®->hr_slots[i]); + + /* Explicit down notification - avoid forcing the other nodes + * to timeout on this region when we could just as easily + * write a clear generation - thus indicating to them that + * this node has left this region. + * + * XXX: Should we skip this on unclean_stop? 
*/ + o2hb_prepare_block(reg, 0); + ret = o2hb_issue_node_write(reg, &write_wc); + if (ret == 0) { + o2hb_wait_on_io(reg, &write_wc); + } else { + mlog_errno(ret); + } + + /* Unpin node */ + o2nm_undepend_this_node(); + + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); + + return 0; +} + +#ifdef CONFIG_DEBUG_FS +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + struct o2hb_debug_buf *db = inode->i_private; + struct o2hb_region *reg; + unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + char *buf = NULL; + int i = -1; + int out = 0; + + /* max_nodes should be the largest bitmap we pass here */ + BUG_ON(sizeof(map) < db->db_size); + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + switch (db->db_type) { + case O2HB_DB_TYPE_LIVENODES: + case O2HB_DB_TYPE_LIVEREGIONS: + case O2HB_DB_TYPE_QUORUMREGIONS: + case O2HB_DB_TYPE_FAILEDREGIONS: + spin_lock(&o2hb_live_lock); + memcpy(map, db->db_data, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_LIVENODES: + spin_lock(&o2hb_live_lock); + reg = (struct o2hb_region *)db->db_data; + memcpy(map, reg->hr_live_node_bitmap, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_NUMBER: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + reg->hr_region_num); + goto done; + + case O2HB_DB_TYPE_REGION_ELAPSED_TIME: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + jiffies_to_msecs(jiffies - + reg->hr_last_timeout_start)); + goto done; + + case O2HB_DB_TYPE_REGION_PINNED: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + !!reg->hr_item_pinned); + goto done; + + default: + goto done; + } + + while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) + out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + +done: + i_size_write(inode, out); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int o2hb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int o2hb_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t o2hb_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static const struct file_operations o2hb_debug_fops = { + .open = o2hb_debug_open, + .release = o2hb_debug_release, + .read = o2hb_debug_read, + .llseek = generic_file_llseek, +}; + +void o2hb_exit(void) +{ + kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); + debugfs_remove(o2hb_debug_failedregions); + debugfs_remove(o2hb_debug_quorumregions); + debugfs_remove(o2hb_debug_liveregions); + debugfs_remove(o2hb_debug_livenodes); + debugfs_remove(o2hb_debug_dir); +} + +static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, + int type, int size, int len, void *data) +{ + *db = kmalloc(db_len, GFP_KERNEL); + if (!*db) + return NULL; + + (*db)->db_type = type; 
+ (*db)->db_size = size; + (*db)->db_len = len; + (*db)->db_data = data; + + return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, + &o2hb_debug_fops); +} + +static int o2hb_debug_init(void) +{ + int ret = -ENOMEM; + + o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); + if (!o2hb_debug_dir) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, + o2hb_debug_dir, + &o2hb_db_livenodes, + sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, + sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, + o2hb_live_node_bitmap); + if (!o2hb_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, + o2hb_debug_dir, + &o2hb_db_liveregions, + sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); + if (!o2hb_debug_liveregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_quorumregions = + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, + o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); + if (!o2hb_debug_quorumregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_failedregions = + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, + o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); + if (!o2hb_debug_failedregions) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + if (ret) + o2hb_exit(); + + return ret; +} + +int o2hb_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++) + INIT_LIST_HEAD(&o2hb_callbacks[i].list); + + for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) + INIT_LIST_HEAD(&o2hb_live_slots[i]); + + INIT_LIST_HEAD(&o2hb_node_events); + + memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); + memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); + memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); + memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); + memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + + o2hb_dependent_users = 0; + + return o2hb_debug_init(); +} + +/* if we're already in a callback then we're already serialized by the sem */ +static void o2hb_fill_node_map_from_callback(unsigned long *map, + unsigned bytes) +{ + BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); + + memcpy(map, &o2hb_live_node_bitmap, bytes); +} + +/* + * get a map of all nodes that are heartbeating in any regions + */ +void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +{ + /* callers want to serialize this map and callbacks so that they + * can trust that they don't miss nodes coming to the party */ + down_read(&o2hb_callback_sem); + spin_lock(&o2hb_live_lock); + o2hb_fill_node_map_from_callback(map, bytes); + spin_unlock(&o2hb_live_lock); + up_read(&o2hb_callback_sem); +} +EXPORT_SYMBOL_GPL(o2hb_fill_node_map); + +/* + * heartbeat configfs bits. The heartbeat set is a default set under + * the cluster set in nodemanager.c. + */ + +static struct o2hb_region *to_o2hb_region(struct config_item *item) +{ + return item ? 
container_of(item, struct o2hb_region, hr_item) : NULL; +} + +/* drop_item only drops its ref after killing the thread, nothing should + * be using the region anymore. this has to clean up any state that + * attributes might have built up. */ +static void o2hb_region_release(struct config_item *item) +{ + int i; + struct page *page; + struct o2hb_region *reg = to_o2hb_region(item); + + if (reg->hr_tmp_block) + kfree(reg->hr_tmp_block); + + if (reg->hr_slot_data) { + for (i = 0; i < reg->hr_num_pages; i++) { + page = reg->hr_slot_data[i]; + if (page) + __free_page(page); + } + kfree(reg->hr_slot_data); + } + + if (reg->hr_bdev) + blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + + if (reg->hr_slots) + kfree(reg->hr_slots); + + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_livenodes); + debugfs_remove(reg->hr_debug_livenodes); + debugfs_remove(reg->hr_debug_regnum); + debugfs_remove(reg->hr_debug_elapsed_time); + debugfs_remove(reg->hr_debug_pinned); + debugfs_remove(reg->hr_debug_dir); + + spin_lock(&o2hb_live_lock); + list_del(®->hr_all_item); + spin_unlock(&o2hb_live_lock); + + kfree(reg); +} + +static int o2hb_read_block_input(struct o2hb_region *reg, + const char *page, + size_t count, + unsigned long *ret_bytes, + unsigned int *ret_bits) +{ + unsigned long bytes; + char *p = (char *)page; + + bytes = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* Heartbeat and fs min / max block sizes are the same. */ + if (bytes > 4096 || bytes < 512) + return -ERANGE; + if (hweight16(bytes) != 1) + return -EINVAL; + + if (ret_bytes) + *ret_bytes = bytes; + if (ret_bits) + *ret_bits = ffs(bytes) - 1; + + return 0; +} + +static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%u\n", reg->hr_block_bytes); +} + +static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + int status; + unsigned long block_bytes; + unsigned int block_bits; + + if (reg->hr_bdev) + return -EINVAL; + + status = o2hb_read_block_input(reg, page, count, + &block_bytes, &block_bits); + if (status) + return status; + + reg->hr_block_bytes = (unsigned int)block_bytes; + reg->hr_block_bits = block_bits; + + return count; +} + +static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%llu\n", reg->hr_start_block); +} + +static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoull(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + reg->hr_start_block = tmp; + + return count; +} + +static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%d\n", reg->hr_blocks); +} + +static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp > O2NM_MAX_NODES || tmp == 0) + return -ERANGE; + + reg->hr_blocks = (unsigned int)tmp; + + return count; +} + +static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, + char *page) +{ + unsigned int ret = 0; + + if (reg->hr_bdev) + ret = sprintf(page, "%s\n", reg->hr_dev_name); + + return ret; +} + +static void o2hb_init_region_params(struct o2hb_region *reg) +{ 
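+ /* Each heartbeat slot occupies one hr_block_bytes block, so a page of slot data holds PAGE_CACHE_SIZE >> hr_block_bits slots. */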
+ reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; + + mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", + reg->hr_start_block, reg->hr_blocks); + mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n", + reg->hr_block_bytes, reg->hr_block_bits); + mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms); + mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold); +} + +static int o2hb_map_slot_data(struct o2hb_region *reg) +{ + int i, j; + unsigned int last_slot; + unsigned int spp = reg->hr_slots_per_page; + struct page *page; + char *raw; + struct o2hb_disk_slot *slot; + + reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); + if (reg->hr_tmp_block == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slots = kcalloc(reg->hr_blocks, + sizeof(struct o2hb_disk_slot), GFP_KERNEL); + if (reg->hr_slots == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_blocks; i++) { + slot = ®->hr_slots[i]; + slot->ds_node_num = i; + INIT_LIST_HEAD(&slot->ds_live_item); + slot->ds_raw_block = NULL; + } + + reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp; + mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks " + "at %u blocks per page\n", + reg->hr_num_pages, reg->hr_blocks, spp); + + reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), + GFP_KERNEL); + if (!reg->hr_slot_data) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_num_pages; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slot_data[i] = page; + + last_slot = i * spp; + raw = page_address(page); + for (j = 0; + (j < spp) && ((j + last_slot) < reg->hr_blocks); + j++) { + BUG_ON((j + last_slot) >= reg->hr_blocks); + + slot = ®->hr_slots[j + last_slot]; + slot->ds_raw_block = + (struct o2hb_disk_heartbeat_block *) raw; + + raw += reg->hr_block_bytes; + } + } + + return 0; +} + +/* Read in all the slots available and populate the tracking + * structures so that we can start with a baseline idea of what's + * there. */ +static int o2hb_populate_slot_data(struct o2hb_region *reg) +{ + int ret, i; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + ret = o2hb_read_slots(reg, reg->hr_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* We only want to get an idea of the values initially in each + * slot, so we do no verification - o2hb_check_slot will + * actually determine if each configured slot is valid and + * whether any values have changed. */ + for(i = 0; i < reg->hr_blocks; i++) { + slot = ®->hr_slots[i]; + hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block; + + /* Only fill the values that o2hb_check_slot uses to + * determine changing slots */ + slot->ds_last_time = le64_to_cpu(hb_block->hb_seq); + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + } + +out: + return ret; +} + +/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ +static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + struct task_struct *hb_task; + long fd; + int sectsize; + char *p = (char *)page; + struct file *filp = NULL; + struct inode *inode = NULL; + ssize_t ret = -EINVAL; + int live_threshold; + + if (reg->hr_bdev) + goto out; + + /* We can't heartbeat without having had our node number + * configured yet. 
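+ * o2nm_this_node() returns O2NM_MAX_NODES until a local node has been configured.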
*/ + if (o2nm_this_node() == O2NM_MAX_NODES) + goto out; + + fd = simple_strtol(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + goto out; + + if (fd < 0 || fd >= INT_MAX) + goto out; + + filp = fget(fd); + if (filp == NULL) + goto out; + + if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || + reg->hr_block_bytes == 0) + goto out; + + inode = igrab(filp->f_mapping->host); + if (inode == NULL) + goto out; + + if (!S_ISBLK(inode->i_mode)) + goto out; + + reg->hr_bdev = I_BDEV(filp->f_mapping->host); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); + if (ret) { + reg->hr_bdev = NULL; + goto out; + } + inode = NULL; + + bdevname(reg->hr_bdev, reg->hr_dev_name); + + sectsize = bdev_logical_block_size(reg->hr_bdev); + if (sectsize != reg->hr_block_bytes) { + mlog(ML_ERROR, + "blocksize %u incorrect for device, expected %d", + reg->hr_block_bytes, sectsize); + ret = -EINVAL; + goto out; + } + + o2hb_init_region_params(reg); + + /* Generation of zero is invalid */ + do { + get_random_bytes(®->hr_generation, + sizeof(reg->hr_generation)); + } while (reg->hr_generation == 0); + + ret = o2hb_map_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = o2hb_populate_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out; + } + + INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); + + /* + * A node is considered live after it has beat LIVE_THRESHOLD + * times. We're not steady until we've given them a chance + * _after_ our first read. + * The default threshold is bare minimum so as to limit the delay + * during mounts. For global heartbeat, the threshold doubled for the + * first region. + */ + live_threshold = O2HB_LIVE_THRESHOLD; + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) + live_threshold <<= 1; + spin_unlock(&o2hb_live_lock); + } + atomic_set(®->hr_steady_iterations, live_threshold + 1); + + hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", + reg->hr_item.ci_name); + if (IS_ERR(hb_task)) { + ret = PTR_ERR(hb_task); + mlog_errno(ret); + goto out; + } + + spin_lock(&o2hb_live_lock); + reg->hr_task = hb_task; + spin_unlock(&o2hb_live_lock); + + ret = wait_event_interruptible(o2hb_steady_queue, + atomic_read(®->hr_steady_iterations) == 0); + if (ret) { + /* We got interrupted (hello ptrace!). Clean up */ + spin_lock(&o2hb_live_lock); + hb_task = reg->hr_task; + reg->hr_task = NULL; + spin_unlock(&o2hb_live_lock); + + if (hb_task) + kthread_stop(hb_task); + goto out; + } + + /* Ok, we were woken. 
Make sure it wasn't by drop_item() */ + spin_lock(&o2hb_live_lock); + hb_task = reg->hr_task; + if (o2hb_global_heartbeat_active()) + set_bit(reg->hr_region_num, o2hb_live_region_bitmap); + spin_unlock(&o2hb_live_lock); + + if (hb_task) + ret = count; + else + ret = -EIO; + + if (hb_task && o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", + config_item_name(®->hr_item)); + +out: + if (filp) + fput(filp); + if (inode) + iput(inode); + if (ret < 0) { + if (reg->hr_bdev) { + blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + reg->hr_bdev = NULL; + } + } + return ret; +} + +static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, + char *page) +{ + pid_t pid = 0; + + spin_lock(&o2hb_live_lock); + if (reg->hr_task) + pid = task_pid_nr(reg->hr_task); + spin_unlock(&o2hb_live_lock); + + if (!pid) + return 0; + + return sprintf(page, "%u\n", pid); +} + +struct o2hb_region_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_region *, char *); + ssize_t (*store)(struct o2hb_region *, const char *, size_t); +}; + +static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "block_bytes", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_block_bytes_read, + .store = o2hb_region_block_bytes_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_start_block = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "start_block", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_start_block_read, + .store = o2hb_region_start_block_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_blocks = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "blocks", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_blocks_read, + .store = o2hb_region_blocks_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_dev = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dev", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_dev_read, + .store = o2hb_region_dev_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_pid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "pid", + .ca_mode = S_IRUGO | S_IRUSR }, + .show = o2hb_region_pid_read, +}; + +static struct configfs_attribute *o2hb_region_attrs[] = { + &o2hb_region_attr_block_bytes.attr, + &o2hb_region_attr_start_block.attr, + &o2hb_region_attr_blocks.attr, + &o2hb_region_attr_dev.attr, + &o2hb_region_attr_pid.attr, + NULL, +}; + +static ssize_t o2hb_region_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = 0; + + if (o2hb_region_attr->show) + ret = o2hb_region_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_region_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = -EINVAL; + + if (o2hb_region_attr->store) + ret = o2hb_region_attr->store(reg, page, count); + return ret; +} + +static struct configfs_item_operations o2hb_region_item_ops = { + .release = o2hb_region_release, + .show_attribute = o2hb_region_show, + .store_attribute = o2hb_region_store, +}; + +static struct config_item_type o2hb_region_type = { + .ct_item_ops 
= &o2hb_region_item_ops, + .ct_attrs = o2hb_region_attrs, + .ct_owner = THIS_MODULE, +}; + +/* heartbeat set */ + +struct o2hb_heartbeat_group { + struct config_group hs_group; + /* some stuff? */ +}; + +static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group) +{ + return group ? + container_of(group, struct o2hb_heartbeat_group, hs_group) + : NULL; +} + +static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +{ + int ret = -ENOMEM; + + reg->hr_debug_dir = + debugfs_create_dir(config_item_name(®->hr_item), dir); + if (!reg->hr_debug_dir) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_livenodes = + o2hb_debug_create(O2HB_DEBUG_LIVENODES, + reg->hr_debug_dir, + &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), + O2NM_MAX_NODES, reg); + if (!reg->hr_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_regnum = + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, + reg->hr_debug_dir, + &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, + 0, O2NM_MAX_NODES, reg); + if (!reg->hr_debug_regnum) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_elapsed_time = + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, + reg->hr_debug_dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, + 0, 0, reg); + if (!reg->hr_debug_elapsed_time) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_pinned = + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, + reg->hr_debug_dir, + &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, + 0, 0, reg); + if (!reg->hr_debug_pinned) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + return ret; +} + +static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, + const char *name) +{ + struct o2hb_region *reg = NULL; + int ret; + + reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); + if (reg == NULL) + return ERR_PTR(-ENOMEM); + + if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) { + ret = -ENAMETOOLONG; + goto free; + } + + spin_lock(&o2hb_live_lock); + reg->hr_region_num = 0; + if (o2hb_global_heartbeat_active()) { + reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap, + O2NM_MAX_REGIONS); + if (reg->hr_region_num >= O2NM_MAX_REGIONS) { + spin_unlock(&o2hb_live_lock); + ret = -EFBIG; + goto free; + } + set_bit(reg->hr_region_num, o2hb_region_bitmap); + } + list_add_tail(®->hr_all_item, &o2hb_all_regions); + spin_unlock(&o2hb_live_lock); + + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + + ret = o2hb_debug_region_init(reg, o2hb_debug_dir); + if (ret) { + config_item_put(®->hr_item); + goto free; + } + + return ®->hr_item; +free: + kfree(reg); + return ERR_PTR(ret); +} + +static void o2hb_heartbeat_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct task_struct *hb_task; + struct o2hb_region *reg = to_o2hb_region(item); + int quorum_region = 0; + + /* stop the thread when the user removes the region dir */ + spin_lock(&o2hb_live_lock); + if (o2hb_global_heartbeat_active()) { + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + quorum_region = 1; + clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + } + hb_task = reg->hr_task; + reg->hr_task = NULL; + reg->hr_item_dropped = 1; + 
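/* with hr_item_dropped set, o2hb_region_pin() will skip this region from now on */ +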
spin_unlock(&o2hb_live_lock); + + if (hb_task) + kthread_stop(hb_task); + + /* + * If we're racing a dev_write(), we need to wake them. They will + * check reg->hr_task + */ + if (atomic_read(®->hr_steady_iterations) != 0) { + atomic_set(®->hr_steady_iterations, 0); + wake_up(&o2hb_steady_queue); + } + + if (o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", + config_item_name(®->hr_item)); + + config_item_put(item); + + if (!o2hb_global_heartbeat_active() || !quorum_region) + return; + + /* + * If global heartbeat active and there are dependent users, + * pin all regions if quorum region count <= CUT_OFF + */ + spin_lock(&o2hb_live_lock); + + if (!o2hb_dependent_users) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); +} + +struct o2hb_heartbeat_group_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_heartbeat_group *, char *); + ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); +}; + +static ssize_t o2hb_heartbeat_group_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); + ssize_t ret = 0; + + if (o2hb_heartbeat_group_attr->show) + ret = o2hb_heartbeat_group_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_heartbeat_group_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); + ssize_t ret = -EINVAL; + + if (o2hb_heartbeat_group_attr->store) + ret = o2hb_heartbeat_group_attr->store(reg, page, count); + return ret; +} + +static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%u\n", o2hb_dead_threshold); +} + +static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 10); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* this will validate ranges for us. */ + o2hb_dead_threshold_set((unsigned int) tmp); + + return count; +} + +static +ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%s\n", + o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); +} + +static +ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, + const char *page, size_t count) +{ + unsigned int i; + int ret; + size_t len; + + len = (page[count - 1] == '\n') ? 
count - 1 : count; + if (!len) + return -EINVAL; + + for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { + if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) + continue; + + ret = o2hb_global_hearbeat_mode_set(i); + if (!ret) + printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", + o2hb_heartbeat_mode_desc[i]); + return count; + } + + return -EINVAL; + +} + +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dead_threshold", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_threshold_show, + .store = o2hb_heartbeat_group_threshold_store, +}; + +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "mode", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_mode_show, + .store = o2hb_heartbeat_group_mode_store, +}; + +static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { + &o2hb_heartbeat_group_attr_threshold.attr, + &o2hb_heartbeat_group_attr_mode.attr, + NULL, +}; + +static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { + .show_attribute = o2hb_heartbeat_group_show, + .store_attribute = o2hb_heartbeat_group_store, +}; + +static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { + .make_item = o2hb_heartbeat_group_make_item, + .drop_item = o2hb_heartbeat_group_drop_item, +}; + +static struct config_item_type o2hb_heartbeat_group_type = { + .ct_group_ops = &o2hb_heartbeat_group_group_ops, + .ct_item_ops = &o2hb_hearbeat_group_item_ops, + .ct_attrs = o2hb_heartbeat_group_attrs, + .ct_owner = THIS_MODULE, +}; + +/* this is just here to avoid touching group in heartbeat.h which the + * entire damn world #includes */ +struct config_group *o2hb_alloc_hb_set(void) +{ + struct o2hb_heartbeat_group *hs = NULL; + struct config_group *ret = NULL; + + hs = kzalloc(sizeof(struct o2hb_heartbeat_group), GFP_KERNEL); + if (hs == NULL) + goto out; + + config_group_init_type_name(&hs->hs_group, "heartbeat", + &o2hb_heartbeat_group_type); + + ret = &hs->hs_group; +out: + if (ret == NULL) + kfree(hs); + return ret; +} + +void o2hb_free_hb_set(struct config_group *group) +{ + struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group); + kfree(hs); +} + +/* hb callback registration and issuing */ + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type) +{ + if (type == O2HB_NUM_CB) + return ERR_PTR(-EINVAL); + + return &o2hb_callbacks[type]; +} + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority) +{ + INIT_LIST_HEAD(&hc->hc_item); + hc->hc_func = func; + hc->hc_data = data; + hc->hc_priority = priority; + hc->hc_type = type; + hc->hc_magic = O2HB_CB_MAGIC; +} +EXPORT_SYMBOL_GPL(o2hb_setup_callback); + +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only pin the matching region. In global we pin all the active + * regions. 
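+ * In both cases the caller must already hold o2hb_live_lock.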
+ */ +static int o2hb_region_pin(const char *region_uuid) +{ + int ret = 0, found = 0; + struct o2hb_region *reg; + char *uuid; + + assert_spin_locked(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(®->hr_item); + + /* local heartbeat */ + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } + + if (reg->hr_item_pinned || reg->hr_item_dropped) + goto skip_pin; + + /* Ignore ENOENT only for local hb (userdlm domain) */ + ret = o2nm_depend_item(®->hr_item); + if (!ret) { + mlog(ML_CLUSTER, "Pin region %s\n", uuid); + reg->hr_item_pinned = 1; + } else { + if (ret == -ENOENT && found) + ret = 0; + else { + mlog(ML_ERROR, "Pin region %s fails with %d\n", + uuid, ret); + break; + } + } +skip_pin: + if (found) + break; + } + + return ret; +} + +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only unpin the matching region. In global we unpin all the + * active regions. + */ +static void o2hb_region_unpin(const char *region_uuid) +{ + struct o2hb_region *reg; + char *uuid; + int found = 0; + + assert_spin_locked(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(®->hr_item); + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } + + if (reg->hr_item_pinned) { + mlog(ML_CLUSTER, "Unpin region %s\n", uuid); + o2nm_undepend_item(®->hr_item); + reg->hr_item_pinned = 0; + } + if (found) + break; + } +} + +static int o2hb_region_inc_user(const char *region_uuid) +{ + int ret = 0; + + spin_lock(&o2hb_live_lock); + + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + ret = o2hb_region_pin(region_uuid); + goto unlock; + } + + /* + * if global heartbeat active and this is the first dependent user, + * pin all regions if quorum region count <= CUT_OFF + */ + o2hb_dependent_users++; + if (o2hb_dependent_users > 1) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + ret = o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); + return ret; +} + +void o2hb_region_dec_user(const char *region_uuid) +{ + spin_lock(&o2hb_live_lock); + + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + o2hb_region_unpin(region_uuid); + goto unlock; + } + + /* + * if global heartbeat active and there are no dependent users, + * unpin all quorum regions + */ + o2hb_dependent_users--; + if (!o2hb_dependent_users) + o2hb_region_unpin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); +} + +int o2hb_register_callback(const char *region_uuid, + struct o2hb_callback_func *hc) +{ + struct o2hb_callback_func *tmp; + struct list_head *iter; + struct o2hb_callback *hbcall; + int ret; + + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + BUG_ON(!list_empty(&hc->hc_item)); + + hbcall = hbcall_from_type(hc->hc_type); + if (IS_ERR(hbcall)) { + ret = PTR_ERR(hbcall); + goto out; + } + + if (region_uuid) { + ret = o2hb_region_inc_user(region_uuid); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + down_write(&o2hb_callback_sem); + + list_for_each(iter, &hbcall->list) { + tmp = list_entry(iter, struct o2hb_callback_func, hc_item); + if (hc->hc_priority < tmp->hc_priority) { + list_add_tail(&hc->hc_item, iter); + break; + } + } + if (list_empty(&hc->hc_item)) + list_add_tail(&hc->hc_item, &hbcall->list); + + up_write(&o2hb_callback_sem); + ret = 0; +out: + mlog(ML_CLUSTER, 
"returning %d on behalf of %p for funcs %p\n", + ret, __builtin_return_address(0), hc); + return ret; +} +EXPORT_SYMBOL_GPL(o2hb_register_callback); + +void o2hb_unregister_callback(const char *region_uuid, + struct o2hb_callback_func *hc) +{ + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + + mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n", + __builtin_return_address(0), hc); + + /* XXX Can this happen _with_ a region reference? */ + if (list_empty(&hc->hc_item)) + return; + + if (region_uuid) + o2hb_region_dec_user(region_uuid); + + down_write(&o2hb_callback_sem); + + list_del_init(&hc->hc_item); + + up_write(&o2hb_callback_sem); +} +EXPORT_SYMBOL_GPL(o2hb_unregister_callback); + +int o2hb_check_node_heartbeating(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); + +int o2hb_check_node_heartbeating_from_callback(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); + +/* Makes sure our local node is configured with a node number, and is + * heartbeating. */ +int o2hb_check_local_node_heartbeating(void) +{ + u8 node_num; + + /* if this node was set then we have networking */ + node_num = o2nm_this_node(); + if (node_num == O2NM_MAX_NODES) { + mlog(ML_HEARTBEAT, "this node has not been configured.\n"); + return 0; + } + + return o2hb_check_node_heartbeating(node_num); +} +EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); + +/* + * this is just a hack until we get the plumbing which flips file systems + * read only and drops the hb ref instead of killing the node dead. + */ +void o2hb_stop_all_regions(void) +{ + struct o2hb_region *reg; + + mlog(ML_ERROR, "stopping heartbeat on all active regions.\n"); + + spin_lock(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) + reg->hr_unclean_stop = 1; + + spin_unlock(&o2hb_live_lock); +} +EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); + +int o2hb_get_all_regions(char *region_uuids, u8 max_regions) +{ + struct o2hb_region *reg; + int numregs = 0; + char *p; + + spin_lock(&o2hb_live_lock); + + p = region_uuids; + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + mlog(0, "Region: %s\n", config_item_name(®->hr_item)); + if (numregs < max_regions) { + memcpy(p, config_item_name(®->hr_item), + O2HB_MAX_REGION_NAME_LEN); + p += O2HB_MAX_REGION_NAME_LEN; + } + numregs++; + } + + spin_unlock(&o2hb_live_lock); + + return numregs; +} +EXPORT_SYMBOL_GPL(o2hb_get_all_regions); + +int o2hb_global_heartbeat_active(void) +{ + return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL); +} +EXPORT_SYMBOL(o2hb_global_heartbeat_active); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h new file mode 100644 index 00000000..00ad8e8f --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.h @@ -0,0 +1,89 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_HEARTBEAT_H +#define O2CLUSTER_HEARTBEAT_H + +#include "ocfs2_heartbeat.h" + +#define O2HB_REGION_TIMEOUT_MS 2000 + +#define O2HB_MAX_REGION_NAME_LEN 32 + +/* number of changes to be seen as live */ +#define O2HB_LIVE_THRESHOLD 2 +/* number of equal samples to be seen as dead */ +extern unsigned int o2hb_dead_threshold; +#define O2HB_DEFAULT_DEAD_THRESHOLD 31 +/* Otherwise MAX_WRITE_TIMEOUT will be zero... */ +#define O2HB_MIN_DEAD_THRESHOLD 2 +#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) + +#define O2HB_CB_MAGIC 0x51d1e4ec + +/* callback stuff */ +enum o2hb_callback_type { + O2HB_NODE_DOWN_CB = 0, + O2HB_NODE_UP_CB, + O2HB_NUM_CB +}; + +struct o2nm_node; +typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *); + +struct o2hb_callback_func { + u32 hc_magic; + struct list_head hc_item; + o2hb_cb_func *hc_func; + void *hc_data; + int hc_priority; + enum o2hb_callback_type hc_type; +}; + +struct config_group *o2hb_alloc_hb_set(void); +void o2hb_free_hb_set(struct config_group *group); + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority); +int o2hb_register_callback(const char *region_uuid, + struct o2hb_callback_func *hc); +void o2hb_unregister_callback(const char *region_uuid, + struct o2hb_callback_func *hc); +void o2hb_fill_node_map(unsigned long *map, + unsigned bytes); +void o2hb_exit(void); +int o2hb_init(void); +int o2hb_check_node_heartbeating(u8 node_num); +int o2hb_check_node_heartbeating_from_callback(u8 node_num); +int o2hb_check_local_node_heartbeating(void); +void o2hb_stop_all_regions(void); +int o2hb_get_all_regions(char *region_uuids, u8 numregions); +int o2hb_global_heartbeat_active(void); + +#endif /* O2CLUSTER_HEARTBEAT_H */ diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c new file mode 100644 index 00000000..07ac24fd --- /dev/null +++ b/fs/ocfs2/cluster/masklog.c @@ -0,0 +1,155 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include "masklog.h" + +struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); +EXPORT_SYMBOL_GPL(mlog_and_bits); +struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0); +EXPORT_SYMBOL_GPL(mlog_not_bits); + +static ssize_t mlog_mask_show(u64 mask, char *buf) +{ + char *state; + + if (__mlog_test_u64(mask, mlog_and_bits)) + state = "allow"; + else if (__mlog_test_u64(mask, mlog_not_bits)) + state = "deny"; + else + state = "off"; + + return snprintf(buf, PAGE_SIZE, "%s\n", state); +} + +static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) +{ + if (!strnicmp(buf, "allow", 5)) { + __mlog_set_u64(mask, mlog_and_bits); + __mlog_clear_u64(mask, mlog_not_bits); + } else if (!strnicmp(buf, "deny", 4)) { + __mlog_set_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else if (!strnicmp(buf, "off", 3)) { + __mlog_clear_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else + return -EINVAL; + + return count; +} + +struct mlog_attribute { + struct attribute attr; + u64 mask; +}; + +#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr) + +#define define_mask(_name) { \ + .attr = { \ + .name = #_name, \ + .mode = S_IRUGO | S_IWUSR, \ + }, \ + .mask = ML_##_name, \ +} + +static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { + define_mask(TCP), + define_mask(MSG), + define_mask(SOCKET), + define_mask(HEARTBEAT), + define_mask(HB_BIO), + define_mask(DLMFS), + define_mask(DLM), + define_mask(DLM_DOMAIN), + define_mask(DLM_THREAD), + define_mask(DLM_MASTER), + define_mask(DLM_RECOVERY), + define_mask(DLM_GLUE), + define_mask(VOTE), + define_mask(CONN), + define_mask(QUORUM), + define_mask(BASTS), + define_mask(CLUSTER), + define_mask(ERROR), + define_mask(NOTICE), + define_mask(KTHREAD), +}; + +static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; + +static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, + char *buf) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_show(mlog_attr->mask, buf); +} + +static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, + const char *buf, size_t count) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_store(mlog_attr->mask, buf, count); +} + +static const struct sysfs_ops mlog_attr_ops = { + .show = mlog_show, + .store = mlog_store, +}; + +static struct kobj_type mlog_ktype = { + .default_attrs = mlog_attr_ptrs, + .sysfs_ops = &mlog_attr_ops, +}; + +static struct kset mlog_kset = { + .kobj = {.ktype = &mlog_ktype}, +}; + +int mlog_sys_init(struct kset *o2cb_kset) +{ + int i = 0; + + while (mlog_attrs[i].attr.mode) { + mlog_attr_ptrs[i] = &mlog_attrs[i].attr; + i++; + } + mlog_attr_ptrs[i] = NULL; + + kobject_set_name(&mlog_kset.kobj, "logmask"); + mlog_kset.kobj.kset = o2cb_kset; + return kset_register(&mlog_kset); +} + +void mlog_sys_shutdown(void) +{ + kset_unregister(&mlog_kset); +} diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h new file mode 100644 index 00000000..baa2b9ef --- /dev/null +++ b/fs/ocfs2/cluster/masklog.h @@ -0,0 +1,219 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. 
All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_MASKLOG_H +#define O2CLUSTER_MASKLOG_H + +/* + * For now this is a trivial wrapper around printk() that gives the critical + * ability to enable sets of debugging output at run-time. In the future this + * will almost certainly be redirected to relayfs so that it can pay a + * substantially lower heisenberg tax. + * + * Callers associate the message with a bitmask and a global bitmask is + * maintained with help from /proc. If any of the bits match the message is + * output. + * + * We must have efficient bit tests on i386 and it seems gcc still emits crazy + * code for the 64bit compare. It emits very good code for the dual unsigned + * long tests, though, completely avoiding tests that can never pass if the + * caller gives a constant bitmask that fills one of the longs with all 0s. So + * the desire is to have almost all of the calls decided on by comparing just + * one of the longs. This leads to having infrequently given bits that are + * frequently matched in the high bits. + * + * _ERROR and _NOTICE are used for messages that always go to the console and + * have appropriate KERN_ prefixes. We wrap these in our function instead of + * just calling printk() so that this can eventually make its way through + * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. + * The inline tests and macro dance give GCC the opportunity to quite cleverly + * only emit the appropriage printk() when the caller passes in a constant + * mask, as is almost always the case. + * + * All this bitmask nonsense is managed from the files under + * /sys/fs/o2cb/logmask/. Reading the files gives a straightforward + * indication of which bits are allowed (allow) or denied (off/deny). + * ENTRY deny + * EXIT deny + * TCP off + * MSG off + * SOCKET off + * ERROR allow + * NOTICE allow + * + * Writing changes the state of a given bit and requires a strictly formatted + * single write() call: + * + * write(fd, "allow", 5); + * + * Echoing allow/deny/off string into the logmask files can flip the bits + * on or off as expected; here is the bash script for example: + * + * log_mask="/sys/fs/o2cb/log_mask" + * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do + * echo allow >"$log_mask"/"$node" + * done + * + * The debugfs.ocfs2 tool can also flip the bits with the -l option: + * + * debugfs.ocfs2 -l TCP allow + */ + +/* for task_struct */ +#include + +/* bits that are frequently given and infrequently matched in the low word */ +/* NOTE: If you add a flag, you need to also update masklog.c! 
*/ +#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */ +#define ML_MSG 0x0000000000000002ULL /* net network messages */ +#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */ +#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */ +#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */ +#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */ +#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */ +#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */ +#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */ +#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */ +#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */ +#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */ +#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */ +#define ML_CONN 0x0000000000002000ULL /* net connection management */ +#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */ +#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */ +#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */ + +/* bits that are infrequently given and frequently matched in the high word */ +#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ +#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */ +#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ + +#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) +#ifndef MLOG_MASK_PREFIX +#define MLOG_MASK_PREFIX 0 +#endif + +/* + * When logging is disabled, force the bit test to 0 for anything other + * than errors and notices, allowing gcc to remove the code completely. + * When enabled, allow all masks. + */ +#if defined(CONFIG_OCFS2_DEBUG_MASKLOG) +#define ML_ALLOWED_BITS ~0 +#else +#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE) +#endif + +#define MLOG_MAX_BITS 64 + +struct mlog_bits { + unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG]; +}; + +extern struct mlog_bits mlog_and_bits, mlog_not_bits; + +#if BITS_PER_LONG == 32 + +#define __mlog_test_u64(mask, bits) \ + ( (u32)(mask & 0xffffffff) & bits.words[0] || \ + ((u64)(mask) >> 32) & bits.words[1] ) +#define __mlog_set_u64(mask, bits) do { \ + bits.words[0] |= (u32)(mask & 0xffffffff); \ + bits.words[1] |= (u64)(mask) >> 32; \ +} while (0) +#define __mlog_clear_u64(mask, bits) do { \ + bits.words[0] &= ~((u32)(mask & 0xffffffff)); \ + bits.words[1] &= ~((u64)(mask) >> 32); \ +} while (0) +#define MLOG_BITS_RHS(mask) { \ + { \ + [0] = (u32)(mask & 0xffffffff), \ + [1] = (u64)(mask) >> 32, \ + } \ +} + +#else /* 32bit long above, 64bit long below */ + +#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0]) +#define __mlog_set_u64(mask, bits) do { \ + bits.words[0] |= (mask); \ +} while (0) +#define __mlog_clear_u64(mask, bits) do { \ + bits.words[0] &= ~(mask); \ +} while (0) +#define MLOG_BITS_RHS(mask) { { (mask) } } + +#endif + +/* + * smp_processor_id() "helpfully" screams when called outside preemptible + * regions in current kernels. sles doesn't have the variants that don't + * scream. just do this instead of trying to guess which we're building + * against.. *sigh*. + */ +#define __mlog_cpu_guess ({ \ + unsigned long _cpu = get_cpu(); \ + put_cpu(); \ + _cpu; \ +}) + +/* In the following two macros, the whitespace after the ',' just + * before ##args is intentional. Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define __mlog_printk(level, fmt, args...) 
\ + printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ + task_pid_nr(current), __mlog_cpu_guess, \ + __PRETTY_FUNCTION__, __LINE__ , ##args) + +#define mlog(mask, fmt, args...) do { \ + u64 __m = MLOG_MASK_PREFIX | (mask); \ + if ((__m & ML_ALLOWED_BITS) && \ + __mlog_test_u64(__m, mlog_and_bits) && \ + !__mlog_test_u64(__m, mlog_not_bits)) { \ + if (__m & ML_ERROR) \ + __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ + else if (__m & ML_NOTICE) \ + __mlog_printk(KERN_NOTICE, fmt , ##args); \ + else __mlog_printk(KERN_INFO, fmt , ##args); \ + } \ +} while (0) + +#define mlog_errno(st) do { \ + int _st = (st); \ + if (_st != -ERESTARTSYS && _st != -EINTR && \ + _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \ + mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ +} while (0) + +#define mlog_bug_on_msg(cond, fmt, args...) do { \ + if (cond) { \ + mlog(ML_ERROR, "bug expression: " #cond "\n"); \ + mlog(ML_ERROR, fmt, ##args); \ + BUG(); \ + } \ +} while (0) + +#include +#include +int mlog_sys_init(struct kset *o2cb_subsys); +void mlog_sys_shutdown(void); + +#endif /* O2CLUSTER_MASKLOG_H */ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c new file mode 100644 index 00000000..3a583590 --- /dev/null +++ b/fs/ocfs2/cluster/netdebug.c @@ -0,0 +1,543 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * netdebug.c + * + * debug functionality for o2net + * + * Copyright (C) 2005, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifdef CONFIG_DEBUG_FS + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "tcp.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_TCP +#include "masklog.h" + +#include "tcp_internal.h" + +#define O2NET_DEBUG_DIR "o2net" +#define SC_DEBUG_NAME "sock_containers" +#define NST_DEBUG_NAME "send_tracking" +#define STATS_DEBUG_NAME "stats" + +#define SHOW_SOCK_CONTAINERS 0 +#define SHOW_SOCK_STATS 1 + +static struct dentry *o2net_dentry; +static struct dentry *sc_dentry; +static struct dentry *nst_dentry; +static struct dentry *stats_dentry; + +static DEFINE_SPINLOCK(o2net_debug_lock); + +static LIST_HEAD(sock_containers); +static LIST_HEAD(send_tracking); + +void o2net_debug_add_nst(struct o2net_send_tracking *nst) +{ + spin_lock(&o2net_debug_lock); + list_add(&nst->st_net_debug_item, &send_tracking); + spin_unlock(&o2net_debug_lock); +} + +void o2net_debug_del_nst(struct o2net_send_tracking *nst) +{ + spin_lock(&o2net_debug_lock); + if (!list_empty(&nst->st_net_debug_item)) + list_del_init(&nst->st_net_debug_item); + spin_unlock(&o2net_debug_lock); +} + +static struct o2net_send_tracking + *next_nst(struct o2net_send_tracking *nst_start) +{ + struct o2net_send_tracking *nst, *ret = NULL; + + assert_spin_locked(&o2net_debug_lock); + + list_for_each_entry(nst, &nst_start->st_net_debug_item, + st_net_debug_item) { + /* discover the head of the list */ + if (&nst->st_net_debug_item == &send_tracking) + break; + + /* use st_task to detect real nsts in the list */ + if (nst->st_task != NULL) { + ret = nst; + break; + } + } + + return ret; +} + +static void *nst_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + spin_unlock(&o2net_debug_lock); + + return nst; +} + +static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + list_del_init(&dummy_nst->st_net_debug_item); + if (nst) + list_add(&dummy_nst->st_net_debug_item, + &nst->st_net_debug_item); + spin_unlock(&o2net_debug_lock); + + return nst; /* unused, just needs to be null when done */ +} + +static int nst_seq_show(struct seq_file *seq, void *v) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + ktime_t now; + s64 sock, send, status; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + if (!nst) + goto out; + + now = ktime_get(); + sock = ktime_to_us(ktime_sub(now, nst->st_sock_time)); + send = ktime_to_us(ktime_sub(now, nst->st_send_time)); + status = ktime_to_us(ktime_sub(now, nst->st_status_time)); + + /* get_task_comm isn't exported. oh well. 
*/ + seq_printf(seq, "%p:\n" + " pid: %lu\n" + " tgid: %lu\n" + " process name: %s\n" + " node: %u\n" + " sc: %p\n" + " message id: %d\n" + " message type: %u\n" + " message key: 0x%08x\n" + " sock acquiry: %lld usecs ago\n" + " send start: %lld usecs ago\n" + " wait start: %lld usecs ago\n", + nst, (unsigned long)task_pid_nr(nst->st_task), + (unsigned long)nst->st_task->tgid, + nst->st_task->comm, nst->st_node, + nst->st_sc, nst->st_id, nst->st_msg_type, + nst->st_msg_key, + (long long)sock, + (long long)send, + (long long)status); + +out: + spin_unlock(&o2net_debug_lock); + + return 0; +} + +static void nst_seq_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations nst_seq_ops = { + .start = nst_seq_start, + .next = nst_seq_next, + .stop = nst_seq_stop, + .show = nst_seq_show, +}; + +static int nst_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_send_tracking *dummy_nst; + struct seq_file *seq; + int ret; + + dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); + if (dummy_nst == NULL) { + ret = -ENOMEM; + goto out; + } + dummy_nst->st_task = NULL; + + ret = seq_open(file, &nst_seq_ops); + if (ret) + goto out; + + seq = file->private_data; + seq->private = dummy_nst; + o2net_debug_add_nst(dummy_nst); + + dummy_nst = NULL; + +out: + kfree(dummy_nst); + return ret; +} + +static int nst_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct o2net_send_tracking *dummy_nst = seq->private; + + o2net_debug_del_nst(dummy_nst); + return seq_release_private(inode, file); +} + +static const struct file_operations nst_seq_fops = { + .open = nst_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = nst_fop_release, +}; + +void o2net_debug_add_sc(struct o2net_sock_container *sc) +{ + spin_lock(&o2net_debug_lock); + list_add(&sc->sc_net_debug_item, &sock_containers); + spin_unlock(&o2net_debug_lock); +} + +void o2net_debug_del_sc(struct o2net_sock_container *sc) +{ + spin_lock(&o2net_debug_lock); + list_del_init(&sc->sc_net_debug_item); + spin_unlock(&o2net_debug_lock); +} + +struct o2net_sock_debug { + int dbg_ctxt; + struct o2net_sock_container *dbg_sock; +}; + +static struct o2net_sock_container + *next_sc(struct o2net_sock_container *sc_start) +{ + struct o2net_sock_container *sc, *ret = NULL; + + assert_spin_locked(&o2net_debug_lock); + + list_for_each_entry(sc, &sc_start->sc_net_debug_item, + sc_net_debug_item) { + /* discover the head of the list miscast as a sc */ + if (&sc->sc_net_debug_item == &sock_containers) + break; + + /* use sc_page to detect real scs in the list */ + if (sc->sc_page != NULL) { + ret = sc; + break; + } + } + + return ret; +} + +static void *sc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + spin_unlock(&o2net_debug_lock); + + return sc; +} + +static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + list_del_init(&dummy_sc->sc_net_debug_item); + if (sc) + list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); + spin_unlock(&o2net_debug_lock); + + return sc; /* unused, just needs to be null when done */ +} + +#ifdef CONFIG_OCFS2_FS_STATS +# define sc_send_count(_s) ((_s)->sc_send_count) +# 
define sc_recv_count(_s) ((_s)->sc_recv_count) +# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total)) +# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total)) +# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total)) +# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total)) +#else +# define sc_send_count(_s) (0U) +# define sc_recv_count(_s) (0U) +# define sc_tv_acquiry_total_ns(_s) (0LL) +# define sc_tv_send_total_ns(_s) (0LL) +# define sc_tv_status_total_ns(_s) (0LL) +# define sc_tv_process_total_ns(_s) (0LL) +#endif + +/* So that debugfs.ocfs2 can determine which format is being used */ +#define O2NET_STATS_STR_VERSION 1 +static void sc_show_sock_stats(struct seq_file *seq, + struct o2net_sock_container *sc) +{ + if (!sc) + return; + + seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION, + sc->sc_node->nd_num, (unsigned long)sc_send_count(sc), + (long long)sc_tv_acquiry_total_ns(sc), + (long long)sc_tv_send_total_ns(sc), + (long long)sc_tv_status_total_ns(sc), + (unsigned long)sc_recv_count(sc), + (long long)sc_tv_process_total_ns(sc)); +} + +static void sc_show_sock_container(struct seq_file *seq, + struct o2net_sock_container *sc) +{ + struct inet_sock *inet = NULL; + __be32 saddr = 0, daddr = 0; + __be16 sport = 0, dport = 0; + + if (!sc) + return; + + if (sc->sc_sock) { + inet = inet_sk(sc->sc_sock->sk); + /* the stack's structs aren't sparse endian clean */ + saddr = (__force __be32)inet->inet_saddr; + daddr = (__force __be32)inet->inet_daddr; + sport = (__force __be16)inet->inet_sport; + dport = (__force __be16)inet->inet_dport; + } + + /* XXX sigh, inet-> doesn't have sparse annotation so any + * use of it here generates a warning with -Wbitwise */ + seq_printf(seq, "%p:\n" + " krefs: %d\n" + " sock: %pI4:%u -> " + "%pI4:%u\n" + " remote node: %s\n" + " page off: %zu\n" + " handshake ok: %u\n" + " timer: %lld usecs\n" + " data ready: %lld usecs\n" + " advance start: %lld usecs\n" + " advance stop: %lld usecs\n" + " func start: %lld usecs\n" + " func stop: %lld usecs\n" + " func key: 0x%08x\n" + " func type: %u\n", + sc, + atomic_read(&sc->sc_kref.refcount), + &saddr, inet ? ntohs(sport) : 0, + &daddr, inet ? 
ntohs(dport) : 0, + sc->sc_node->nd_name, + sc->sc_page_off, + sc->sc_handshake_ok, + (long long)ktime_to_us(sc->sc_tv_timer), + (long long)ktime_to_us(sc->sc_tv_data_ready), + (long long)ktime_to_us(sc->sc_tv_advance_start), + (long long)ktime_to_us(sc->sc_tv_advance_stop), + (long long)ktime_to_us(sc->sc_tv_func_start), + (long long)ktime_to_us(sc->sc_tv_func_stop), + sc->sc_msg_key, + sc->sc_msg_type); +} + +static int sc_seq_show(struct seq_file *seq, void *v) +{ + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + + if (sc) { + if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS) + sc_show_sock_container(seq, sc); + else + sc_show_sock_stats(seq, sc); + } + + spin_unlock(&o2net_debug_lock); + + return 0; +} + +static void sc_seq_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations sc_seq_ops = { + .start = sc_seq_start, + .next = sc_seq_next, + .stop = sc_seq_stop, + .show = sc_seq_show, +}; + +static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) +{ + struct o2net_sock_container *dummy_sc; + struct seq_file *seq; + int ret; + + dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); + if (dummy_sc == NULL) { + ret = -ENOMEM; + goto out; + } + dummy_sc->sc_page = NULL; + + ret = seq_open(file, &sc_seq_ops); + if (ret) + goto out; + + seq = file->private_data; + seq->private = sd; + sd->dbg_sock = dummy_sc; + o2net_debug_add_sc(dummy_sc); + + dummy_sc = NULL; + +out: + kfree(dummy_sc); + return ret; +} + +static int sc_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *dummy_sc = sd->dbg_sock; + + o2net_debug_del_sc(dummy_sc); + return seq_release_private(inode, file); +} + +static int stats_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_sock_debug *sd; + + sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); + if (sd == NULL) + return -ENOMEM; + + sd->dbg_ctxt = SHOW_SOCK_STATS; + sd->dbg_sock = NULL; + + return sc_common_open(file, sd); +} + +static const struct file_operations stats_seq_fops = { + .open = stats_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sc_fop_release, +}; + +static int sc_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_sock_debug *sd; + + sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); + if (sd == NULL) + return -ENOMEM; + + sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; + sd->dbg_sock = NULL; + + return sc_common_open(file, sd); +} + +static const struct file_operations sc_seq_fops = { + .open = sc_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sc_fop_release, +}; + +int o2net_debugfs_init(void) +{ + o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); + if (!o2net_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &nst_seq_fops); + if (!nst_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &sc_seq_fops); + if (!sc_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &stats_seq_fops); + if (!stats_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + return 0; +bail: + debugfs_remove(stats_dentry); + 
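/* debugfs_remove() is a no-op on a NULL dentry, so entries that were never created are skipped safely */ +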
debugfs_remove(sc_dentry); + debugfs_remove(nst_dentry); + debugfs_remove(o2net_dentry); + return -ENOMEM; +} + +void o2net_debugfs_exit(void) +{ + debugfs_remove(stats_dentry); + debugfs_remove(sc_dentry); + debugfs_remove(nst_dentry); + debugfs_remove(o2net_dentry); +} + +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c new file mode 100644 index 00000000..bb240647 --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.c @@ -0,0 +1,989 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include "tcp.h" +#include "nodemanager.h" +#include "heartbeat.h" +#include "masklog.h" +#include "sys.h" +#include "ver.h" + +/* for now we operate under the assertion that there can be only one + * cluster active at a time. Changing this will require trickling + * cluster references throughout where nodes are looked up */ +struct o2nm_cluster *o2nm_single_cluster = NULL; + +char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ +}; + +struct o2nm_node *o2nm_get_node_by_num(u8 node_num) +{ + struct o2nm_node *node = NULL; + + if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL) + goto out; + + read_lock(&o2nm_single_cluster->cl_nodes_lock); + node = o2nm_single_cluster->cl_nodes[node_num]; + if (node) + config_item_get(&node->nd_item); + read_unlock(&o2nm_single_cluster->cl_nodes_lock); +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_num); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes) +{ + struct o2nm_cluster *cluster = o2nm_single_cluster; + + BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap))); + + if (cluster == NULL) + return -EINVAL; + + read_lock(&cluster->cl_nodes_lock); + memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + read_unlock(&cluster->cl_nodes_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(o2nm_configured_node_map); + +static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster, + __be32 ip_needle, + struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &cluster->cl_node_ip_tree.rb_node; + struct rb_node *parent = NULL; + struct o2nm_node *node, *ret = NULL; + + while (*p) { + int cmp; + + parent = *p; + node = rb_entry(parent, struct o2nm_node, nd_ip_node); + + cmp = memcmp(&ip_needle, &node->nd_ipv4_address, + sizeof(ip_needle)); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + ret = node; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +struct o2nm_node *o2nm_get_node_by_ip(__be32 
addr) +{ + struct o2nm_node *node = NULL; + struct o2nm_cluster *cluster = o2nm_single_cluster; + + if (cluster == NULL) + goto out; + + read_lock(&cluster->cl_nodes_lock); + node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL); + if (node) + config_item_get(&node->nd_item); + read_unlock(&cluster->cl_nodes_lock); + +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip); + +void o2nm_node_put(struct o2nm_node *node) +{ + config_item_put(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_put); + +void o2nm_node_get(struct o2nm_node *node) +{ + config_item_get(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_get); + +u8 o2nm_this_node(void) +{ + u8 node_num = O2NM_MAX_NODES; + + if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local) + node_num = o2nm_single_cluster->cl_local_node; + + return node_num; +} +EXPORT_SYMBOL_GPL(o2nm_this_node); + +/* node configfs bits */ + +static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item) +{ + return item ? + container_of(to_config_group(item), struct o2nm_cluster, + cl_group) + : NULL; +} + +static struct o2nm_node *to_o2nm_node(struct config_item *item) +{ + return item ? container_of(item, struct o2nm_node, nd_item) : NULL; +} + +static void o2nm_node_release(struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + kfree(node); +} + +static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_num); +} + +static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) +{ + /* through the first node_set .parent + * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); +} + +enum { + O2NM_NODE_ATTR_NUM = 0, + O2NM_NODE_ATTR_PORT, + O2NM_NODE_ATTR_ADDRESS, + O2NM_NODE_ATTR_LOCAL, +}; + +static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp >= O2NM_MAX_NODES) + return -ERANGE; + + /* once we're in the cl_nodes tree networking can look us up by + * node number and try to use our address and port attributes + * to connect to this node.. make sure that they've been set + * before writing the node attribute? 
*/ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + write_lock(&cluster->cl_nodes_lock); + if (cluster->cl_nodes[tmp]) + p = NULL; + else { + cluster->cl_nodes[tmp] = node; + node->nd_num = tmp; + set_bit(tmp, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + if (p == NULL) + return -EEXIST; + + return count; +} +static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); +} + +static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, + const char *page, size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u16)-1) + return -ERANGE; + + node->nd_ipv4_port = htons(tmp); + + return count; +} + +static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%pI4\n", &node->nd_ipv4_address); +} + +static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, + const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + int ret, i; + struct rb_node **p, *parent; + unsigned int octets[4]; + __be32 ipv4_addr = 0; + + ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2], + &octets[1], &octets[0]); + if (ret != 4) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(octets); i++) { + if (octets[i] > 255) + return -ERANGE; + be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); + } + + ret = 0; + write_lock(&cluster->cl_nodes_lock); + if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) + ret = -EEXIST; + else { + rb_link_node(&node->nd_ip_node, parent, p); + rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); + } + write_unlock(&cluster->cl_nodes_lock); + if (ret) + return ret; + + memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr)); + + return count; +} + +static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_local); +} + +static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + ssize_t ret; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + tmp = !!tmp; /* boolean of whether this node wants to be local */ + + /* setting local turns on networking rx for now so we require having + * set everything else first */ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + /* the only failure case is trying to set a new local node + * when a different one is already set */ + if (tmp && tmp == cluster->cl_has_local && + cluster->cl_local_node != node->nd_num) + return -EBUSY; + + /* bring up the rx thread if we're setting the new local node. 
*/ + if (tmp && !cluster->cl_has_local) { + ret = o2net_start_listening(node); + if (ret) + return ret; + } + + if (!tmp && cluster->cl_has_local && + cluster->cl_local_node == node->nd_num) { + o2net_stop_listening(node); + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + } + + node->nd_local = tmp; + if (node->nd_local) { + cluster->cl_has_local = tmp; + cluster->cl_local_node = node->nd_num; + } + + return count; +} + +struct o2nm_node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_node *, char *); + ssize_t (*store)(struct o2nm_node *, const char *, size_t); +}; + +static struct o2nm_node_attribute o2nm_node_attr_num = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "num", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_num_read, + .store = o2nm_node_num_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_port", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_port_read, + .store = o2nm_node_ipv4_port_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_address", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_address_read, + .store = o2nm_node_ipv4_address_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_local = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "local", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_local_read, + .store = o2nm_node_local_write, +}; + +static struct configfs_attribute *o2nm_node_attrs[] = { + [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, + [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, + [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, + [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, + NULL, +}; + +static int o2nm_attr_index(struct configfs_attribute *attr) +{ + int i; + for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { + if (attr == o2nm_node_attrs[i]) + return i; + } + BUG(); + return 0; +} + +static ssize_t o2nm_node_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret = 0; + + if (o2nm_node_attr->show) + ret = o2nm_node_attr->show(node, page); + return ret; +} + +static ssize_t o2nm_node_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret; + int attr_index = o2nm_attr_index(attr); + + if (o2nm_node_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + if (test_bit(attr_index, &node->nd_set_attributes)) + return -EBUSY; + + ret = o2nm_node_attr->store(node, page, count); + if (ret < count) + goto out; + + set_bit(attr_index, &node->nd_set_attributes); +out: + return ret; +} + +static struct configfs_item_operations o2nm_node_item_ops = { + .release = o2nm_node_release, + .show_attribute = o2nm_node_show, + .store_attribute = o2nm_node_store, +}; + +static struct config_item_type o2nm_node_type = { + .ct_item_ops = &o2nm_node_item_ops, + .ct_attrs = o2nm_node_attrs, + .ct_owner = THIS_MODULE, +}; + +/* node set */ + +struct o2nm_node_group { + struct config_group ns_group; + /* some stuff? 
*/ +}; + +#if 0 +static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) +{ + return group ? + container_of(group, struct o2nm_node_group, ns_group) + : NULL; +} +#endif + +struct o2nm_cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_cluster *, char *); + ssize_t (*store)(struct o2nm_cluster *, const char *, size_t); +}; + +static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, + unsigned int *val) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u32)-1) + return -ERANGE; + + *val = tmp; + + return count; +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_idle_timeout_ms != val + && o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change idle timeout after " + "the first peer has agreed to it." + " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val <= cluster->cl_keepalive_delay_ms) { + mlog(ML_NOTICE, "o2net: idle timeout must be larger " + "than keepalive delay\n"); + ret = -EINVAL; + } else { + cluster->cl_idle_timeout_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_keepalive_delay_ms != val + && o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change keepalive delay after" + " the first peer has agreed to it." 
+ " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val >= cluster->cl_idle_timeout_ms) { + mlog(ML_NOTICE, "o2net: keepalive delay must be " + "smaller than idle timeout\n"); + ret = -EINVAL; + } else { + cluster->cl_keepalive_delay_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + return o2nm_cluster_attr_write(page, count, + &cluster->cl_reconnect_delay_ms); +} + +static ssize_t o2nm_cluster_attr_fence_method_read( + struct o2nm_cluster *cluster, char *page) +{ + ssize_t ret = 0; + + if (cluster) + ret = sprintf(page, "%s\n", + o2nm_fence_method_desc[cluster->cl_fence_method]); + return ret; +} + +static ssize_t o2nm_cluster_attr_fence_method_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + unsigned int i; + + if (page[count - 1] != '\n') + goto bail; + + for (i = 0; i < O2NM_FENCE_METHODS; ++i) { + if (count != strlen(o2nm_fence_method_desc[i]) + 1) + continue; + if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) + continue; + if (cluster->cl_fence_method != i) { + printk(KERN_INFO "ocfs2: Changing fence method to %s\n", + o2nm_fence_method_desc[i]); + cluster->cl_fence_method = i; + } + return count; + } + +bail: + return -EINVAL; +} + +static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "idle_timeout_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_idle_timeout_ms_read, + .store = o2nm_cluster_attr_idle_timeout_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "keepalive_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_keepalive_delay_ms_read, + .store = o2nm_cluster_attr_keepalive_delay_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "reconnect_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_reconnect_delay_ms_read, + .store = o2nm_cluster_attr_reconnect_delay_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "fence_method", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_fence_method_read, + .store = o2nm_cluster_attr_fence_method_write, +}; + +static struct configfs_attribute *o2nm_cluster_attrs[] = { + &o2nm_cluster_attr_idle_timeout_ms.attr, + &o2nm_cluster_attr_keepalive_delay_ms.attr, + &o2nm_cluster_attr_reconnect_delay_ms.attr, + &o2nm_cluster_attr_fence_method.attr, + NULL, +}; +static ssize_t o2nm_cluster_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute *o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret = 0; + + if (o2nm_cluster_attr->show) + ret = o2nm_cluster_attr->show(cluster, page); + return ret; +} + +static ssize_t o2nm_cluster_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute 
*o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret; + + if (o2nm_cluster_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_cluster_attr->store(cluster, page, count); + if (ret < count) + goto out; +out: + return ret; +} + +static struct config_item *o2nm_node_group_make_item(struct config_group *group, + const char *name) +{ + struct o2nm_node *node = NULL; + + if (strlen(name) > O2NM_MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); + if (node == NULL) + return ERR_PTR(-ENOMEM); + + strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ + config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); + spin_lock_init(&node->nd_lock); + + mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name); + + return &node->nd_item; +} + +static void o2nm_node_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); + + o2net_disconnect_node(node); + + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } + + /* XXX call into net to stop this node from trading messages */ + + write_lock(&cluster->cl_nodes_lock); + + /* XXX sloppy */ + if (node->nd_ipv4_address) + rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree); + + /* nd_num might be 0 if the node number hasn't been set.. */ + if (cluster->cl_nodes[node->nd_num] == node) { + cluster->cl_nodes[node->nd_num] = NULL; + clear_bit(node->nd_num, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + + mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n", + config_item_name(&node->nd_item)); + + config_item_put(item); +} + +static struct configfs_group_operations o2nm_node_group_group_ops = { + .make_item = o2nm_node_group_make_item, + .drop_item = o2nm_node_group_drop_item, +}; + +static struct config_item_type o2nm_node_group_type = { + .ct_group_ops = &o2nm_node_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +/* cluster */ + +static void o2nm_cluster_release(struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + + kfree(cluster->cl_group.default_groups); + kfree(cluster); +} + +static struct configfs_item_operations o2nm_cluster_item_ops = { + .release = o2nm_cluster_release, + .show_attribute = o2nm_cluster_show, + .store_attribute = o2nm_cluster_store, +}; + +static struct config_item_type o2nm_cluster_type = { + .ct_item_ops = &o2nm_cluster_item_ops, + .ct_attrs = o2nm_cluster_attrs, + .ct_owner = THIS_MODULE, +}; + +/* cluster set */ + +struct o2nm_cluster_group { + struct configfs_subsystem cs_subsys; + /* some stuff? */ +}; + +#if 0 +static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group) +{ + return group ? 
+ container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys) + : NULL; +} +#endif + +static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, + const char *name) +{ + struct o2nm_cluster *cluster = NULL; + struct o2nm_node_group *ns = NULL; + struct config_group *o2hb_group = NULL, *ret = NULL; + void *defs = NULL; + + /* this runs under the parent dir's i_mutex; there can be only + * one caller in here at a time */ + if (o2nm_single_cluster) + return ERR_PTR(-ENOSPC); + + cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); + ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); + defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); + o2hb_group = o2hb_alloc_hb_set(); + if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + goto out; + + config_group_init_type_name(&cluster->cl_group, name, + &o2nm_cluster_type); + config_group_init_type_name(&ns->ns_group, "node", + &o2nm_node_group_type); + + cluster->cl_group.default_groups = defs; + cluster->cl_group.default_groups[0] = &ns->ns_group; + cluster->cl_group.default_groups[1] = o2hb_group; + cluster->cl_group.default_groups[2] = NULL; + rwlock_init(&cluster->cl_nodes_lock); + cluster->cl_node_ip_tree = RB_ROOT; + cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; + cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; + cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; + cluster->cl_fence_method = O2NM_FENCE_RESET; + + ret = &cluster->cl_group; + o2nm_single_cluster = cluster; + +out: + if (ret == NULL) { + kfree(cluster); + kfree(ns); + o2hb_free_hb_set(o2hb_group); + kfree(defs); + ret = ERR_PTR(-ENOMEM); + } + + return ret; +} + +static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + int i; + struct config_item *killme; + + BUG_ON(o2nm_single_cluster != cluster); + o2nm_single_cluster = NULL; + + for (i = 0; cluster->cl_group.default_groups[i]; i++) { + killme = &cluster->cl_group.default_groups[i]->cg_item; + cluster->cl_group.default_groups[i] = NULL; + config_item_put(killme); + } + + config_item_put(item); +} + +static struct configfs_group_operations o2nm_cluster_group_group_ops = { + .make_group = o2nm_cluster_group_make_group, + .drop_item = o2nm_cluster_group_drop_item, +}; + +static struct config_item_type o2nm_cluster_group_type = { + .ct_group_ops = &o2nm_cluster_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct o2nm_cluster_group o2nm_cluster_group = { + .cs_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "cluster", + .ci_type = &o2nm_cluster_group_type, + }, + }, + }, +}; + +int o2nm_depend_item(struct config_item *item) +{ + return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); +} + +void o2nm_undepend_item(struct config_item *item) +{ + configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); +} + +int o2nm_depend_this_node(void) +{ + int ret = 0; + struct o2nm_node *local_node; + + local_node = o2nm_get_node_by_num(o2nm_this_node()); + if (!local_node) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_depend_item(&local_node->nd_item); + o2nm_node_put(local_node); + +out: + return ret; +} + +void o2nm_undepend_this_node(void) +{ + struct o2nm_node *local_node; + + local_node = o2nm_get_node_by_num(o2nm_this_node()); + BUG_ON(!local_node); + + o2nm_undepend_item(&local_node->nd_item); + o2nm_node_put(local_node); +} + + +static void __exit 
exit_o2nm(void) +{ + /* XXX sync with hb callbacks and shut down hb? */ + o2net_unregister_hb_callbacks(); + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); + o2cb_sys_shutdown(); + + o2net_exit(); + o2hb_exit(); +} + +static int __init init_o2nm(void) +{ + int ret = -1; + + cluster_print_version(); + + ret = o2hb_init(); + if (ret) + goto out; + + ret = o2net_init(); + if (ret) + goto out_o2hb; + + ret = o2net_register_hb_callbacks(); + if (ret) + goto out_o2net; + + config_group_init(&o2nm_cluster_group.cs_subsys.su_group); + mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex); + ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); + if (ret) { + printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); + goto out_callbacks; + } + + ret = o2cb_sys_init(); + if (!ret) + goto out; + + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); +out_callbacks: + o2net_unregister_hb_callbacks(); +out_o2net: + o2net_exit(); +out_o2hb: + o2hb_exit(); +out: + return ret; +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(init_o2nm) +module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h new file mode 100644 index 00000000..09ea2d38 --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.h @@ -0,0 +1,88 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * nodemanager.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_NODEMANAGER_H +#define O2CLUSTER_NODEMANAGER_H + +#include "ocfs2_nodemanager.h" + +/* This totally doesn't belong here. */ +#include +#include + +enum o2nm_fence_method { + O2NM_FENCE_RESET = 0, + O2NM_FENCE_PANIC, + O2NM_FENCE_METHODS, /* Number of fence methods */ +}; + +struct o2nm_node { + spinlock_t nd_lock; + struct config_item nd_item; + char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */ + __u8 nd_num; + /* only one address per node, as attributes, for now. */ + __be32 nd_ipv4_address; + __be16 nd_ipv4_port; + struct rb_node nd_ip_node; + /* there can be only one local node for now */ + int nd_local; + + unsigned long nd_set_attributes; +}; + +struct o2nm_cluster { + struct config_group cl_group; + unsigned cl_has_local:1; + u8 cl_local_node; + rwlock_t cl_nodes_lock; + struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; + struct rb_root cl_node_ip_tree; + unsigned int cl_idle_timeout_ms; + unsigned int cl_keepalive_delay_ms; + unsigned int cl_reconnect_delay_ms; + enum o2nm_fence_method cl_fence_method; + + /* this bitmap is part of a hack for disk bitmap.. will go eventually. 
- zab */ + unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +extern struct o2nm_cluster *o2nm_single_cluster; + +u8 o2nm_this_node(void); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes); +struct o2nm_node *o2nm_get_node_by_num(u8 node_num); +struct o2nm_node *o2nm_get_node_by_ip(__be32 addr); +void o2nm_node_get(struct o2nm_node *node); +void o2nm_node_put(struct o2nm_node *node); + +int o2nm_depend_item(struct config_item *item); +void o2nm_undepend_item(struct config_item *item); +int o2nm_depend_this_node(void); +void o2nm_undepend_this_node(void); + +#endif /* O2CLUSTER_NODEMANAGER_H */ diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h new file mode 100644 index 00000000..3f4151da --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_heartbeat.h + * + * On-disk structures for ocfs2_heartbeat + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_HEARTBEAT_H +#define _OCFS2_HEARTBEAT_H + +struct o2hb_disk_heartbeat_block { + __le64 hb_seq; + __u8 hb_node; + __u8 hb_pad1[3]; + __le32 hb_cksum; + __le64 hb_generation; + __le32 hb_dead_ms; +}; + +#endif /* _OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h new file mode 100644 index 00000000..49b59432 --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_nodemanager.h + * + * Header describing the interface between userspace and the kernel + * for the ocfs2_nodemanager module. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
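The lookup helpers declared just above hand back a node with an extra config_item reference (o2nm_get_node_by_num() and o2nm_get_node_by_ip() call config_item_get() under cl_nodes_lock), so every successful lookup must be paired with o2nm_node_put() once the caller is done. A minimal in-kernel sketch of that contract; example_describe_local_node() is a hypothetical caller and is not part of the patch.

static int example_describe_local_node(void)
{
	struct o2nm_node *node;

	node = o2nm_get_node_by_num(o2nm_this_node());
	if (!node)
		return -EINVAL;		/* no local node configured yet */

	printk(KERN_INFO "o2nm example: local node %u is %s\n",
	       node->nd_num, node->nd_name);

	o2nm_node_put(node);		/* drop the reference taken by the lookup */
	return 0;
}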
+ * + */ + +#ifndef _OCFS2_NODEMANAGER_H +#define _OCFS2_NODEMANAGER_H + +#define O2NM_API_VERSION 5 + +#define O2NM_MAX_NODES 255 +#define O2NM_INVALID_NODE_NUM 255 + +/* host name, group name, cluster name all 64 bytes */ +#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN + +/* + * Maximum number of global heartbeat regions allowed. + * **CAUTION** Changing this number will break dlm compatibility. + */ +#define O2NM_MAX_REGIONS 32 + +#endif /* _OCFS2_NODEMANAGER_H */ diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c new file mode 100644 index 00000000..8f9cea15 --- /dev/null +++ b/fs/ocfs2/cluster/quorum.c @@ -0,0 +1,331 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* This quorum hack is only here until we transition to some more rational + * approach that is driven from userspace. Honest. No foolin'. + * + * Imagine two nodes lose network connectivity to each other but they're still + * up and operating in every other way. Presumably a network timeout indicates + * that a node is broken and should be recovered. They can't both recover each + * other and both carry on without serialising their access to the file system. + * They need to decide who is authoritative. Now extend that problem to + * arbitrary groups of nodes losing connectivity between each other. + * + * So we declare that a node which has given up on connecting to a majority + * of nodes who are still heartbeating will fence itself. + * + * There are huge opportunities for races here. After we give up on a node's + * connection we need to wait long enough to give heartbeat an opportunity + * to declare the node as truly dead. We also need to be careful with the + * race between when we see a node start heartbeating and when we connect + * to it. + * + * So nodes that are in this transtion put a hold on the quorum decision + * with a counter. As they fall out of this transition they drop the count + * and if they're the last, they fire off the decision. + */ +#include +#include +#include + +#include "heartbeat.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_QUORUM +#include "masklog.h" +#include "quorum.h" + +static struct o2quo_state { + spinlock_t qs_lock; + struct work_struct qs_work; + int qs_pending; + int qs_heartbeating; + unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_connected; + unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_holds; + unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +} o2quo_state; + +/* this is horribly heavy-handed. It should instead flip the file + * system RO and call some userspace script. */ +static void o2quo_fence_self(void) +{ + /* panic spins with interrupts enabled. 
with preempt + * threads can still schedule, etc, etc */ + o2hb_stop_all_regions(); + + switch (o2nm_single_cluster->cl_fence_method) { + case O2NM_FENCE_PANIC: + panic("*** ocfs2 is very sorry to be fencing this system by " + "panicing ***\n"); + break; + default: + WARN_ON(o2nm_single_cluster->cl_fence_method >= + O2NM_FENCE_METHODS); + case O2NM_FENCE_RESET: + printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " + "system by restarting ***\n"); + emergency_restart(); + break; + }; +} + +/* Indicate that a timeout occurred on a hearbeat region write. The + * other nodes in the cluster may consider us dead at that time so we + * want to "fence" ourselves so that we don't scribble on the disk + * after they think they've recovered us. This can't solve all + * problems related to writeout after recovery but this hack can at + * least close some of those gaps. When we have real fencing, this can + * go away as our node would be fenced externally before other nodes + * begin recovery. */ +void o2quo_disk_timeout(void) +{ + o2quo_fence_self(); +} + +static void o2quo_make_decision(struct work_struct *work) +{ + int quorum; + int lowest_hb, lowest_reachable = 0, fence = 0; + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); + if (lowest_hb != O2NM_MAX_NODES) + lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm); + + mlog(0, "heartbeating: %d, connected: %d, " + "lowest: %d (%sreachable)\n", qs->qs_heartbeating, + qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); + + if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) || + qs->qs_heartbeating == 1) + goto out; + + if (qs->qs_heartbeating & 1) { + /* the odd numbered cluster case is straight forward -- + * if we can't talk to the majority we're hosed */ + quorum = (qs->qs_heartbeating + 1)/2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + } else { + /* the even numbered cluster adds the possibility of each half + * of the cluster being able to talk amongst themselves.. 
in + * that case we're hosed if we can't talk to the group that has + * the lowest numbered node */ + quorum = qs->qs_heartbeating / 2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + else if ((qs->qs_connected == quorum) && + !lowest_reachable) { + mlog(ML_ERROR, "fencing this node because it is " + "connected to a half-quorum of %u out of %u " + "nodes which doesn't include the lowest active " + "node %u\n", quorum, qs->qs_heartbeating, + lowest_hb); + fence = 1; + } + } + +out: + spin_unlock(&qs->qs_lock); + if (fence) + o2quo_fence_self(); +} + +static void o2quo_set_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (!test_and_set_bit(node, qs->qs_hold_bm)) { + qs->qs_holds++; + mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES, + "node %u\n", node); + mlog(0, "node %u, %d total\n", node, qs->qs_holds); + } +} + +static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (test_and_clear_bit(node, qs->qs_hold_bm)) { + mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1); + if (--qs->qs_holds == 0) { + if (qs->qs_pending) { + qs->qs_pending = 0; + schedule_work(&qs->qs_work); + } + } + mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n", + node, qs->qs_holds); + } +} + +/* as a node comes up we delay the quorum decision until we know the fate of + * the connection. the hold will be droped in conn_up or hb_down. it might be + * perpetuated by con_err until hb_down. if we already have a conn, we might + * be dropping a hold that conn_up got. */ +void o2quo_hb_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating++; + mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node); + set_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + if (!test_bit(node, qs->qs_conn_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* hb going down releases any holds we might have had due to this node from + * conn_up, conn_err, or hb_up */ +void o2quo_hb_down(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating--; + mlog_bug_on_msg(qs->qs_heartbeating < 0, + "node %u, %d heartbeating\n", + node, qs->qs_heartbeating); + mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node); + clear_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* this tells us that we've decided that the node is still heartbeating + * even though we've lost it's conn. it must only be called after conn_err + * and indicates that we must now make a quorum decision in the future, + * though we might be doing so after waiting for holds to drain. Here + * we'll be dropping the hold from conn_err. */ +void o2quo_hb_still_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + mlog(0, "node %u\n", node); + + qs->qs_pending = 1; + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* This is analogous to hb_up. as a node's connection comes up we delay the + * quorum decision until we see it heartbeating. 
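The arithmetic in o2quo_make_decision() above can be summarised on its own: with an odd number of heartbeating nodes, a strict majority of them must be connected; with an even number, exactly half is only good enough if that half includes the lowest-numbered heartbeating node, which acts as the tie breaker. (The function also returns without fencing when the local node is not in the heartbeat bitmap or is the only node heartbeating.) A stand-alone sketch of just the decision rule follows; o2quo_would_fence() is a hypothetical helper, not part of the patch.

static int o2quo_would_fence(int heartbeating, int connected,
			     int lowest_reachable)
{
	int quorum;

	if (heartbeating & 1) {
		/* odd-sized cluster: need a strict majority connected */
		quorum = (heartbeating + 1) / 2;
		return connected < quorum;
	}

	/* even-sized cluster: half is enough only when the reachable half
	 * includes the lowest-numbered heartbeating node */
	quorum = heartbeating / 2;
	if (connected < quorum)
		return 1;
	return connected == quorum && !lowest_reachable;
}

For example, with four nodes heartbeating the quorum is two: a node connected to only one peer fences itself, and a node connected to exactly two peers survives only if the lowest-numbered heartbeating node is reachable per qs_conn_bm.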
the hold will be droped in + * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if + * it's already heartbeating we we might be dropping a hold that conn_up got. + * */ +void o2quo_conn_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_connected++; + mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node); + set_bit(node, qs->qs_conn_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (!test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* we've decided that we won't ever be connecting to the node again. if it's + * still heartbeating we grab a hold that will delay decisions until either the + * node stops heartbeating from hb_down or the caller decides that the node is + * still up and calls still_up */ +void o2quo_conn_err(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + if (test_bit(node, qs->qs_conn_bm)) { + qs->qs_connected--; + mlog_bug_on_msg(qs->qs_connected < 0, + "node %u, connected %d\n", + node, qs->qs_connected); + + clear_bit(node, qs->qs_conn_bm); + } + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +void o2quo_init(void) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock_init(&qs->qs_lock); + INIT_WORK(&qs->qs_work, o2quo_make_decision); +} + +void o2quo_exit(void) +{ + struct o2quo_state *qs = &o2quo_state; + + flush_work_sync(&qs->qs_work); +} diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h new file mode 100644 index 00000000..6649cc6f --- /dev/null +++ b/fs/ocfs2/cluster/quorum.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_QUORUM_H +#define O2CLUSTER_QUORUM_H + +void o2quo_init(void); +void o2quo_exit(void); + +void o2quo_hb_up(u8 node); +void o2quo_hb_down(u8 node); +void o2quo_hb_still_up(u8 node); +void o2quo_conn_up(u8 node); +void o2quo_conn_err(u8 node); +void o2quo_disk_timeout(void); + +#endif /* O2CLUSTER_QUORUM_H */ diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c new file mode 100644 index 00000000..a4b07730 --- /dev/null +++ b/fs/ocfs2/cluster/sys.c @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.c + * + * OCFS2 cluster sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved. 
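The hold counter described in the comments above serializes the quorum decision against nodes that are between states. A hypothetical trace for a single peer whose connection dies while its heartbeat is still seen, matching the functions defined above:

/*
 * o2quo_conn_err(node)     - connection lost; the node is still in qs_hb_bm,
 *                            so a hold is taken and the decision deferred
 * o2quo_hb_still_up(node)  - caller concludes the node really is alive;
 *                            qs_pending is set and conn_err's hold dropped
 * (last hold drains)       - o2quo_clear_hold() sees qs_pending and schedules
 *                            qs_work, which runs o2quo_make_decision()
 * o2quo_hb_down(node)      - if the heartbeat dies instead, this drops any
 *                            remaining hold for the node
 */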
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include + +#include "ocfs2_nodemanager.h" +#include "masklog.h" +#include "sys.h" + + +static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); +} +static struct kobj_attribute attr_version = + __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); + +static struct attribute *o2cb_attrs[] = { + &attr_version.attr, + NULL, +}; + +static struct attribute_group o2cb_attr_group = { + .attrs = o2cb_attrs, +}; + +static struct kset *o2cb_kset; + +void o2cb_sys_shutdown(void) +{ + mlog_sys_shutdown(); + kset_unregister(o2cb_kset); +} + +int o2cb_sys_init(void) +{ + int ret; + + o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj); + if (!o2cb_kset) + return -ENOMEM; + + ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); + if (ret) + goto error; + + ret = mlog_sys_init(o2cb_kset); + if (ret) + goto error; + return 0; +error: + kset_unregister(o2cb_kset); + return ret; +} diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h new file mode 100644 index 00000000..d66b8ab0 --- /dev/null +++ b/fs/ocfs2/cluster/sys.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.h + * + * Function prototypes for o2cb sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_SYS_H +#define O2CLUSTER_SYS_H + +void o2cb_sys_shutdown(void); +int o2cb_sys_init(void); + +#endif /* O2CLUSTER_SYS_H */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c new file mode 100644 index 00000000..db5ee4b4 --- /dev/null +++ b/fs/ocfs2/cluster/tcp.c @@ -0,0 +1,2143 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
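Because o2cb_kset is registered under fs_kobj with the name "o2cb", the attribute defined above shows up as /sys/fs/o2cb/interface_revision and reads back O2NM_API_VERSION. A small userspace sketch, assuming sysfs is mounted at /sys and the o2cb module is loaded; main() here is illustrative and not part of the patch.

#include <stdio.h>

int main(void)
{
	char buf[16];
	FILE *f = fopen("/sys/fs/o2cb/interface_revision", "r");

	if (!f)
		return 1;	/* o2cb not loaded or sysfs not mounted */
	if (fgets(buf, sizeof(buf), f))
		printf("o2cb interface revision: %s", buf);
	fclose(f);
	return 0;
}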
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * ---- + * + * Callers for this were originally written against a very simple synchronus + * API. This implementation reflects those simple callers. Some day I'm sure + * we'll need to move to a more robust posting/callback mechanism. + * + * Transmit calls pass in kernel virtual addresses and block copying this into + * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting + * for a failed socket to timeout. TX callers can also pass in a poniter to an + * 'int' which gets filled with an errno off the wire in response to the + * message they send. + * + * Handlers for unsolicited messages are registered. Each socket has a page + * that incoming data is copied into. First the header, then the data. + * Handlers are called from only one thread with a reference to this per-socket + * page. This page is destroyed after the handler call, so it can't be + * referenced beyond the call. Handlers may block but are discouraged from + * doing so. + * + * Any framing errors (bad magic, large payload lengths) close a connection. + * + * Our sock_container holds the state we associate with a socket. It's current + * framing state is held there as well as the refcounting we do around when it + * is safe to tear down the socket. The socket is only finally torn down from + * the container when the container loses all of its references -- so as long + * as you hold a ref on the container you can trust that the socket is valid + * for use with kernel socket APIs. + * + * Connections are initiated between a pair of nodes when the node with the + * higher node number gets a heartbeat callback which indicates that the lower + * numbered node has started heartbeating. The lower numbered node is passive + * and only accepts the connection if the higher numbered node is heartbeating. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_TCP +#include "masklog.h" +#include "quorum.h" + +#include "tcp_internal.h" + +#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u" +#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ + &sc->sc_node->nd_ipv4_address, \ + ntohs(sc->sc_node->nd_ipv4_port) + +/* + * In the following two log macros, the whitespace after the ',' just + * before ##args is intentional. Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define msglog(hdr, fmt, args...) do { \ + typeof(hdr) __hdr = (hdr); \ + mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \ + "key %08x num %u] " fmt, \ + be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \ + be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \ + be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \ + be32_to_cpu(__hdr->msg_num) , ##args); \ +} while (0) + +#define sclog(sc, fmt, args...) 
do { \ + typeof(sc) __sc = (sc); \ + mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ + "pg_off %zu] " fmt, __sc, \ + atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ + __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ + ##args); \ +} while (0) + +static DEFINE_RWLOCK(o2net_handler_lock); +static struct rb_root o2net_handler_tree = RB_ROOT; + +static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; + +/* XXX someday we'll need better accounting */ +static struct socket *o2net_listen_sock = NULL; + +/* + * listen work is only queued by the listening socket callbacks on the + * o2net_wq. teardown detaches the callbacks before destroying the workqueue. + * quorum work is queued as sock containers are shutdown.. stop_listening + * tears down all the node's sock containers, preventing future shutdowns + * and queued quroum work, before canceling delayed quorum work and + * destroying the work queue. + */ +static struct workqueue_struct *o2net_wq; +static struct work_struct o2net_listen_work; + +static struct o2hb_callback_func o2net_hb_up, o2net_hb_down; +#define O2NET_HB_PRI 0x1 + +static struct o2net_handshake *o2net_hand; +static struct o2net_msg *o2net_keep_req, *o2net_keep_resp; + +static int o2net_sys_err_translations[O2NET_ERR_MAX] = + {[O2NET_ERR_NONE] = 0, + [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT, + [O2NET_ERR_OVERFLOW] = -EOVERFLOW, + [O2NET_ERR_DIED] = -EHOSTDOWN,}; + +/* can't quite avoid *all* internal declarations :/ */ +static void o2net_sc_connect_completed(struct work_struct *work); +static void o2net_rx_until_empty(struct work_struct *work); +static void o2net_shutdown_sc(struct work_struct *work); +static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_sc_send_keep_req(struct work_struct *work); +static void o2net_idle_timer(unsigned long data); +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); + +#ifdef CONFIG_DEBUG_FS +static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ + INIT_LIST_HEAD(&nst->st_net_debug_item); + nst->st_task = task; + nst->st_msg_type = msgtype; + nst->st_msg_key = msgkey; + nst->st_node = node; +} + +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ + nst->st_sock_time = ktime_get(); +} + +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ + nst->st_send_time = ktime_get(); +} + +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ + nst->st_status_time = ktime_get(); +} + +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ + nst->st_sc = sc; +} + +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) +{ + nst->st_id = msg_id; +} + +static inline void o2net_set_sock_timer(struct o2net_sock_container *sc) +{ + sc->sc_tv_timer = ktime_get(); +} + +static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_data_ready = ktime_get(); +} + +static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_advance_start = ktime_get(); +} + +static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_advance_stop = ktime_get(); +} + +static inline void o2net_set_func_start_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_func_start = ktime_get(); +} + +static inline 
void o2net_set_func_stop_time(struct o2net_sock_container *sc) +{ + sc->sc_tv_func_stop = ktime_get(); +} + +#else /* CONFIG_DEBUG_FS */ +# define o2net_init_nst(a, b, c, d, e) +# define o2net_set_nst_sock_time(a) +# define o2net_set_nst_send_time(a) +# define o2net_set_nst_status_time(a) +# define o2net_set_nst_sock_container(a, b) +# define o2net_set_nst_msg_id(a, b) +# define o2net_set_sock_timer(a) +# define o2net_set_data_ready_time(a) +# define o2net_set_advance_start_time(a) +# define o2net_set_advance_stop_time(a) +# define o2net_set_func_start_time(a) +# define o2net_set_func_stop_time(a) +#endif /* CONFIG_DEBUG_FS */ + +#ifdef CONFIG_OCFS2_FS_STATS +static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) +{ + return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); +} + +static void o2net_update_send_stats(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ + sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total, + ktime_sub(ktime_get(), + nst->st_status_time)); + sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total, + ktime_sub(nst->st_status_time, + nst->st_send_time)); + sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total, + ktime_sub(nst->st_send_time, + nst->st_sock_time)); + sc->sc_send_count++; +} + +static void o2net_update_recv_stats(struct o2net_sock_container *sc) +{ + sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total, + o2net_get_func_run_time(sc)); + sc->sc_recv_count++; +} + +#else + +# define o2net_update_send_stats(a, b) + +# define o2net_update_recv_stats(sc) + +#endif /* CONFIG_OCFS2_FS_STATS */ + +static inline int o2net_reconnect_delay(void) +{ + return o2nm_single_cluster->cl_reconnect_delay_ms; +} + +static inline int o2net_keepalive_delay(void) +{ + return o2nm_single_cluster->cl_keepalive_delay_ms; +} + +static inline int o2net_idle_timeout(void) +{ + return o2nm_single_cluster->cl_idle_timeout_ms; +} + +static inline int o2net_sys_err_to_errno(enum o2net_system_error err) +{ + int trans; + BUG_ON(err >= O2NET_ERR_MAX); + trans = o2net_sys_err_translations[err]; + + /* Just in case we mess up the translation table above */ + BUG_ON(err != O2NET_ERR_NONE && trans == 0); + return trans; +} + +static struct o2net_node * o2net_nn_from_num(u8 node_num) +{ + BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes)); + return &o2net_nodes[node_num]; +} + +static u8 o2net_num_from_nn(struct o2net_node *nn) +{ + BUG_ON(nn == NULL); + return nn - o2net_nodes; +} + +/* ------------------------------------------------------------ */ + +static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) +{ + int ret = 0; + + do { + if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { + ret = -EAGAIN; + break; + } + spin_lock(&nn->nn_lock); + ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); + if (ret == 0) + list_add_tail(&nsw->ns_node_item, + &nn->nn_status_list); + spin_unlock(&nn->nn_lock); + } while (ret == -EAGAIN); + + if (ret == 0) { + init_waitqueue_head(&nsw->ns_wq); + nsw->ns_sys_status = O2NET_ERR_NONE; + nsw->ns_status = 0; + } + + return ret; +} + +static void o2net_complete_nsw_locked(struct o2net_node *nn, + struct o2net_status_wait *nsw, + enum o2net_system_error sys_status, + s32 status) +{ + assert_spin_locked(&nn->nn_lock); + + if (!list_empty(&nsw->ns_node_item)) { + list_del_init(&nsw->ns_node_item); + nsw->ns_sys_status = sys_status; + nsw->ns_status = status; + idr_remove(&nn->nn_status_idr, nsw->ns_id); + wake_up(&nsw->ns_wq); + } +} + +static void o2net_complete_nsw(struct 
o2net_node *nn, + struct o2net_status_wait *nsw, + u64 id, enum o2net_system_error sys_status, + s32 status) +{ + spin_lock(&nn->nn_lock); + if (nsw == NULL) { + if (id > INT_MAX) + goto out; + + nsw = idr_find(&nn->nn_status_idr, id); + if (nsw == NULL) + goto out; + } + + o2net_complete_nsw_locked(nn, nsw, sys_status, status); + +out: + spin_unlock(&nn->nn_lock); + return; +} + +static void o2net_complete_nodes_nsw(struct o2net_node *nn) +{ + struct o2net_status_wait *nsw, *tmp; + unsigned int num_kills = 0; + + assert_spin_locked(&nn->nn_lock); + + list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) { + o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); + num_kills++; + } + + mlog(0, "completed %d messages for node %u\n", num_kills, + o2net_num_from_nn(nn)); +} + +static int o2net_nsw_completed(struct o2net_node *nn, + struct o2net_status_wait *nsw) +{ + int completed; + spin_lock(&nn->nn_lock); + completed = list_empty(&nsw->ns_node_item); + spin_unlock(&nn->nn_lock); + return completed; +} + +/* ------------------------------------------------------------ */ + +static void sc_kref_release(struct kref *kref) +{ + struct o2net_sock_container *sc = container_of(kref, + struct o2net_sock_container, sc_kref); + BUG_ON(timer_pending(&sc->sc_idle_timeout)); + + sclog(sc, "releasing\n"); + + if (sc->sc_sock) { + sock_release(sc->sc_sock); + sc->sc_sock = NULL; + } + + o2nm_undepend_item(&sc->sc_node->nd_item); + o2nm_node_put(sc->sc_node); + sc->sc_node = NULL; + + o2net_debug_del_sc(sc); + kfree(sc); +} + +static void sc_put(struct o2net_sock_container *sc) +{ + sclog(sc, "put\n"); + kref_put(&sc->sc_kref, sc_kref_release); +} +static void sc_get(struct o2net_sock_container *sc) +{ + sclog(sc, "get\n"); + kref_get(&sc->sc_kref); +} +static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) +{ + struct o2net_sock_container *sc, *ret = NULL; + struct page *page = NULL; + int status = 0; + + page = alloc_page(GFP_NOFS); + sc = kzalloc(sizeof(*sc), GFP_NOFS); + if (sc == NULL || page == NULL) + goto out; + + kref_init(&sc->sc_kref); + o2nm_node_get(node); + sc->sc_node = node; + + /* pin the node item of the remote node */ + status = o2nm_depend_item(&node->nd_item); + if (status) { + mlog_errno(status); + o2nm_node_put(node); + goto out; + } + INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); + INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); + INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); + INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); + + init_timer(&sc->sc_idle_timeout); + sc->sc_idle_timeout.function = o2net_idle_timer; + sc->sc_idle_timeout.data = (unsigned long)sc; + + sclog(sc, "alloced\n"); + + ret = sc; + sc->sc_page = page; + o2net_debug_add_sc(sc); + sc = NULL; + page = NULL; + +out: + if (page) + __free_page(page); + kfree(sc); + + return ret; +} + +/* ------------------------------------------------------------ */ + +static void o2net_sc_queue_work(struct o2net_sock_container *sc, + struct work_struct *work) +{ + sc_get(sc); + if (!queue_work(o2net_wq, work)) + sc_put(sc); +} +static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc, + struct delayed_work *work, + int delay) +{ + sc_get(sc); + if (!queue_delayed_work(o2net_wq, work, delay)) + sc_put(sc); +} +static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc, + struct delayed_work *work) +{ + if (cancel_delayed_work(work)) + sc_put(sc); +} + +static atomic_t o2net_connected_peers = ATOMIC_INIT(0); + +int 
o2net_num_connected_peers(void) +{ + return atomic_read(&o2net_connected_peers); +} + +static void o2net_set_nn_state(struct o2net_node *nn, + struct o2net_sock_container *sc, + unsigned valid, int err) +{ + int was_valid = nn->nn_sc_valid; + int was_err = nn->nn_persistent_error; + struct o2net_sock_container *old_sc = nn->nn_sc; + + assert_spin_locked(&nn->nn_lock); + + if (old_sc && !sc) + atomic_dec(&o2net_connected_peers); + else if (!old_sc && sc) + atomic_inc(&o2net_connected_peers); + + /* the node num comparison and single connect/accept path should stop + * an non-null sc from being overwritten with another */ + BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); + mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); + mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); + + if (was_valid && !valid && err == 0) + err = -ENOTCONN; + + mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n", + o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid, + nn->nn_persistent_error, err); + + nn->nn_sc = sc; + nn->nn_sc_valid = valid ? 1 : 0; + nn->nn_persistent_error = err; + + /* mirrors o2net_tx_can_proceed() */ + if (nn->nn_persistent_error || nn->nn_sc_valid) + wake_up(&nn->nn_sc_wq); + + if (!was_err && nn->nn_persistent_error) { + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + } + + if (was_valid && !valid) { + printk(KERN_NOTICE "o2net: no longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); + o2net_complete_nodes_nsw(nn); + } + + if (!was_valid && valid) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_connect_expired); + printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", + o2nm_this_node() > sc->sc_node->nd_num ? + "connected to" : "accepted connection from", + SC_NODEF_ARGS(sc)); + } + + /* trigger the connecting worker func as long as we're not valid, + * it will back off if it shouldn't connect. This can be called + * from node config teardown and so needs to be careful about + * the work queue actually being up. */ + if (!valid && o2net_wq) { + unsigned long delay; + /* delay if we're within a RECONNECT_DELAY of the + * last attempt */ + delay = (nn->nn_last_connect_attempt + + msecs_to_jiffies(o2net_reconnect_delay())) + - jiffies; + if (delay > msecs_to_jiffies(o2net_reconnect_delay())) + delay = 0; + mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); + queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); + + /* + * Delay the expired work after idle timeout. + * + * We might have lots of failed connection attempts that run + * through here but we only cancel the connect_expired work when + * a connection attempt succeeds. So only the first enqueue of + * the connect_expired work will do anything. The rest will see + * that it's already queued and do nothing. 
+ */ + delay += msecs_to_jiffies(o2net_idle_timeout()); + queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay); + } + + /* keep track of the nn's sc ref for the caller */ + if ((old_sc == NULL) && sc) + sc_get(sc); + if (old_sc && (old_sc != sc)) { + o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work); + sc_put(old_sc); + } +} + +/* see o2net_register_callbacks() */ +static void o2net_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + + read_lock(&sk->sk_callback_lock); + if (sk->sk_user_data) { + struct o2net_sock_container *sc = sk->sk_user_data; + sclog(sc, "data_ready hit\n"); + o2net_set_data_ready_time(sc); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + ready = sc->sc_data_ready; + } else { + ready = sk->sk_data_ready; + } + read_unlock(&sk->sk_callback_lock); + + ready(sk, bytes); +} + +/* see o2net_register_callbacks() */ +static void o2net_state_change(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct o2net_sock_container *sc; + + read_lock(&sk->sk_callback_lock); + sc = sk->sk_user_data; + if (sc == NULL) { + state_change = sk->sk_state_change; + goto out; + } + + sclog(sc, "state_change to %d\n", sk->sk_state); + + state_change = sc->sc_state_change; + + switch(sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + o2net_sc_queue_work(sc, &sc->sc_connect_work); + break; + default: + printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT + " shutdown, state %d\n", + SC_NODEF_ARGS(sc), sk->sk_state); + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); + break; + } +out: + read_unlock(&sk->sk_callback_lock); + state_change(sk); +} + +/* + * we register callbacks so we can queue work on events before calling + * the original callbacks. our callbacks our careful to test user_data + * to discover when they've reaced with o2net_unregister_callbacks(). + */ +static void o2net_register_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + write_lock_bh(&sk->sk_callback_lock); + + /* accepted sockets inherit the old listen socket data ready */ + if (sk->sk_data_ready == o2net_listen_data_ready) { + sk->sk_data_ready = sk->sk_user_data; + sk->sk_user_data = NULL; + } + + BUG_ON(sk->sk_user_data != NULL); + sk->sk_user_data = sc; + sc_get(sc); + + sc->sc_data_ready = sk->sk_data_ready; + sc->sc_state_change = sk->sk_state_change; + sk->sk_data_ready = o2net_data_ready; + sk->sk_state_change = o2net_state_change; + + mutex_init(&sc->sc_send_lock); + + write_unlock_bh(&sk->sk_callback_lock); +} + +static int o2net_unregister_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + int ret = 0; + + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_user_data == sc) { + ret = 1; + sk->sk_user_data = NULL; + sk->sk_data_ready = sc->sc_data_ready; + sk->sk_state_change = sc->sc_state_change; + } + write_unlock_bh(&sk->sk_callback_lock); + + return ret; +} + +/* + * this is a little helper that is called by callers who have seen a problem + * with an sc and want to detach it from the nn if someone already hasn't beat + * them to it. if an error is given then the shutdown will be persistent + * and pending transmits will be canceled. 
+ */ +static void o2net_ensure_shutdown(struct o2net_node *nn, + struct o2net_sock_container *sc, + int err) +{ + spin_lock(&nn->nn_lock); + if (nn->nn_sc == sc) + o2net_set_nn_state(nn, NULL, 0, err); + spin_unlock(&nn->nn_lock); +} + +/* + * This work queue function performs the blocking parts of socket shutdown. A + * few paths lead here. set_nn_state will trigger this callback if it sees an + * sc detached from the nn. state_change will also trigger this callback + * directly when it sees errors. In that case we need to call set_nn_state + * ourselves as state_change couldn't get the nn_lock and call set_nn_state + * itself. + */ +static void o2net_shutdown_sc(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, + sc_shutdown_work); + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + sclog(sc, "shutting down\n"); + + /* drop the callbacks ref and call shutdown only once */ + if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { + /* we shouldn't flush as we're in the thread, the + * races with pending sc work structs are harmless */ + del_timer_sync(&sc->sc_idle_timeout); + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + sc_put(sc); + kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); + } + + /* not fatal so failed connects before the other guy has our + * heartbeat can be retried */ + o2net_ensure_shutdown(nn, sc, 0); + sc_put(sc); +} + +/* ------------------------------------------------------------ */ + +static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type, + u32 key) +{ + int ret = memcmp(&nmh->nh_key, &key, sizeof(key)); + + if (ret == 0) + ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type)); + + return ret; +} + +static struct o2net_msg_handler * +o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; + struct o2net_msg_handler *nmh, *ret = NULL; + int cmp; + + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + cmp = o2net_handler_cmp(nmh, msg_type, key); + + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + ret = nmh; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +static void o2net_handler_kref_release(struct kref *kref) +{ + struct o2net_msg_handler *nmh; + nmh = container_of(kref, struct o2net_msg_handler, nh_kref); + + kfree(nmh); +} + +static void o2net_handler_put(struct o2net_msg_handler *nmh) +{ + kref_put(&nmh->nh_kref, o2net_handler_kref_release); +} + +/* max_len is protection for the handler func. incoming messages won't + * be given to the handler if their payload is longer than the max. 
*/ +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, + struct list_head *unreg_list) +{ + struct o2net_msg_handler *nmh = NULL; + struct rb_node **p, *parent; + int ret = 0; + + if (max_len > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "max_len for message handler out of range: %u\n", + max_len); + ret = -EINVAL; + goto out; + } + + if (!msg_type) { + mlog(0, "no message type provided: %u, %p\n", msg_type, func); + ret = -EINVAL; + goto out; + + } + if (!func) { + mlog(0, "no message handler provided: %u, %p\n", + msg_type, func); + ret = -EINVAL; + goto out; + } + + nmh = kzalloc(sizeof(struct o2net_msg_handler), GFP_NOFS); + if (nmh == NULL) { + ret = -ENOMEM; + goto out; + } + + nmh->nh_func = func; + nmh->nh_func_data = data; + nmh->nh_post_func = post_func; + nmh->nh_msg_type = msg_type; + nmh->nh_max_len = max_len; + nmh->nh_key = key; + /* the tree and list get this ref.. they're both removed in + * unregister when this ref is dropped */ + kref_init(&nmh->nh_kref); + INIT_LIST_HEAD(&nmh->nh_unregister_item); + + write_lock(&o2net_handler_lock); + if (o2net_handler_tree_lookup(msg_type, key, &p, &parent)) + ret = -EEXIST; + else { + rb_link_node(&nmh->nh_node, parent, p); + rb_insert_color(&nmh->nh_node, &o2net_handler_tree); + list_add_tail(&nmh->nh_unregister_item, unreg_list); + + mlog(ML_TCP, "registered handler func %p type %u key %08x\n", + func, msg_type, key); + /* we've had some trouble with handlers seemingly vanishing. */ + mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, + &parent) == NULL, + "couldn't find handler we *just* registerd " + "for type %u key %08x\n", msg_type, key); + } + write_unlock(&o2net_handler_lock); + if (ret) + goto out; + +out: + if (ret) + kfree(nmh); + + return ret; +} +EXPORT_SYMBOL_GPL(o2net_register_handler); + +void o2net_unregister_handler_list(struct list_head *list) +{ + struct o2net_msg_handler *nmh, *n; + + write_lock(&o2net_handler_lock); + list_for_each_entry_safe(nmh, n, list, nh_unregister_item) { + mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", + nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); + rb_erase(&nmh->nh_node, &o2net_handler_tree); + list_del_init(&nmh->nh_unregister_item); + kref_put(&nmh->nh_kref, o2net_handler_kref_release); + } + write_unlock(&o2net_handler_lock); +} +EXPORT_SYMBOL_GPL(o2net_unregister_handler_list); + +static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) +{ + struct o2net_msg_handler *nmh; + + read_lock(&o2net_handler_lock); + nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL); + if (nmh) + kref_get(&nmh->nh_kref); + read_unlock(&o2net_handler_lock); + + return nmh; +} + +/* ------------------------------------------------------------ */ + +static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) +{ + int ret; + mm_segment_t oldfs; + struct kvec vec = { + .iov_len = len, + .iov_base = data, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&vec, + .msg_flags = MSG_DONTWAIT, + }; + + oldfs = get_fs(); + set_fs(get_ds()); + ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); + set_fs(oldfs); + + return ret; +} + +static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, + size_t veclen, size_t total) +{ + int ret; + mm_segment_t oldfs; + struct msghdr msg = { + .msg_iov = (struct iovec *)vec, + .msg_iovlen = veclen, + }; + + if (sock == NULL) { + ret = -EINVAL; + goto out; + } + + oldfs = get_fs(); + 
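+	/*
+	 * Widen the address-space limit so sock_sendmsg() accepts the
+	 * kernel-space kvec built above; the saved limit in oldfs is
+	 * restored immediately after the call returns.
+	 */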
set_fs(get_ds()); + ret = sock_sendmsg(sock, &msg, total); + set_fs(oldfs); + if (ret != total) { + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, + total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ + goto out; + } + + ret = 0; +out: + if (ret < 0) + mlog(0, "returning error: %d\n", ret); + return ret; +} + +static void o2net_sendpage(struct o2net_sock_container *sc, + void *kmalloced_virt, + size_t size) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + ssize_t ret; + + while (1) { + mutex_lock(&sc->sc_send_lock); + ret = sc->sc_sock->ops->sendpage(sc->sc_sock, + virt_to_page(kmalloced_virt), + (long)kmalloced_virt & ~PAGE_MASK, + size, MSG_DONTWAIT); + mutex_unlock(&sc->sc_send_lock); + if (ret == size) + break; + if (ret == (ssize_t)-EAGAIN) { + mlog(0, "sendpage of size %zu to " SC_NODEF_FMT + " returned EAGAIN\n", size, SC_NODEF_ARGS(sc)); + cond_resched(); + continue; + } + mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT + " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); + o2net_ensure_shutdown(nn, sc, 0); + break; + } +} + +static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key) +{ + memset(msg, 0, sizeof(struct o2net_msg)); + msg->magic = cpu_to_be16(O2NET_MSG_MAGIC); + msg->data_len = cpu_to_be16(data_len); + msg->msg_type = cpu_to_be16(msg_type); + msg->sys_status = cpu_to_be32(O2NET_ERR_NONE); + msg->status = 0; + msg->key = cpu_to_be32(key); +} + +static int o2net_tx_can_proceed(struct o2net_node *nn, + struct o2net_sock_container **sc_ret, + int *error) +{ + int ret = 0; + + spin_lock(&nn->nn_lock); + if (nn->nn_persistent_error) { + ret = 1; + *sc_ret = NULL; + *error = nn->nn_persistent_error; + } else if (nn->nn_sc_valid) { + kref_get(&nn->nn_sc->sc_kref); + + ret = 1; + *sc_ret = nn->nn_sc; + *error = 0; + } + spin_unlock(&nn->nn_lock); + + return ret; +} + +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, + size_t caller_veclen, u8 target_node, int *status) +{ + int ret = 0; + struct o2net_msg *msg = NULL; + size_t veclen, caller_bytes = 0; + struct kvec *vec = NULL; + struct o2net_sock_container *sc = NULL; + struct o2net_node *nn = o2net_nn_from_num(target_node); + struct o2net_status_wait nsw = { + .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), + }; + struct o2net_send_tracking nst; + + o2net_init_nst(&nst, msg_type, key, current, target_node); + + if (o2net_wq == NULL) { + mlog(0, "attempt to tx without o2netd running\n"); + ret = -ESRCH; + goto out; + } + + if (caller_veclen == 0) { + mlog(0, "bad kvec array length\n"); + ret = -EINVAL; + goto out; + } + + caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen); + if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "total payload len %zu too large\n", caller_bytes); + ret = -EINVAL; + goto out; + } + + if (target_node == o2nm_this_node()) { + ret = -ELOOP; + goto out; + } + + o2net_debug_add_nst(&nst); + + o2net_set_nst_sock_time(&nst); + + wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret)); + if (ret) + goto out; + + o2net_set_nst_sock_container(&nst, sc); + + veclen = caller_veclen + 1; + vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); + if (vec == NULL) { + mlog(0, "failed to %zu element kvec!\n", veclen); + ret = -ENOMEM; + goto out; + } + + msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC); + if (!msg) { + mlog(0, "failed to allocate a o2net_msg!\n"); + ret = -ENOMEM; + goto out; + } + + o2net_init_msg(msg, caller_bytes, msg_type, key); + + 
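+	/*
+	 * Wire format assembled below: vec[0] carries the o2net_msg header,
+	 * vec[1..caller_veclen] carry the caller's payload.  msg_num is
+	 * filled in from the status-wait id so the remote node's status
+	 * reply can be matched back to this sender in o2net_complete_nsw().
+	 */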
vec[0].iov_len = sizeof(struct o2net_msg); + vec[0].iov_base = msg; + memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); + + ret = o2net_prep_nsw(nn, &nsw); + if (ret) + goto out; + + msg->msg_num = cpu_to_be32(nsw.ns_id); + o2net_set_nst_msg_id(&nst, nsw.ns_id); + + o2net_set_nst_send_time(&nst); + + /* finally, convert the message header to network byte-order + * and send */ + mutex_lock(&sc->sc_send_lock); + ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, + sizeof(struct o2net_msg) + caller_bytes); + mutex_unlock(&sc->sc_send_lock); + msglog(msg, "sending returned %d\n", ret); + if (ret < 0) { + mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); + goto out; + } + + /* wait on other node's handler */ + o2net_set_nst_status_time(&nst); + wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); + + o2net_update_send_stats(&nst, sc); + + /* Note that we avoid overwriting the callers status return + * variable if a system error was reported on the other + * side. Callers beware. */ + ret = o2net_sys_err_to_errno(nsw.ns_sys_status); + if (status && !ret) + *status = nsw.ns_status; + + mlog(0, "woken, returning system status %d, user status %d\n", + ret, nsw.ns_status); +out: + o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ + if (sc) + sc_put(sc); + if (vec) + kfree(vec); + if (msg) + kfree(msg); + o2net_complete_nsw(nn, &nsw, 0, 0, 0); + return ret; +} +EXPORT_SYMBOL_GPL(o2net_send_message_vec); + +int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, + u8 target_node, int *status) +{ + struct kvec vec = { + .iov_base = data, + .iov_len = len, + }; + return o2net_send_message_vec(msg_type, key, &vec, 1, + target_node, status); +} +EXPORT_SYMBOL_GPL(o2net_send_message); + +static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr, + enum o2net_system_error syserr, int err) +{ + struct kvec vec = { + .iov_base = hdr, + .iov_len = sizeof(struct o2net_msg), + }; + + BUG_ON(syserr >= O2NET_ERR_MAX); + + /* leave other fields intact from the incoming message, msg_num + * in particular */ + hdr->sys_status = cpu_to_be32(syserr); + hdr->status = cpu_to_be32(err); + hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); // twiddle the magic + hdr->data_len = 0; + + msglog(hdr, "about to send status magic %d\n", err); + /* hdr has been in host byteorder this whole time */ + return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg)); +} + +/* this returns -errno if the header was unknown or too large, etc. 
+ * after this is called the buffer us reused for the next message */ +static int o2net_process_message(struct o2net_sock_container *sc, + struct o2net_msg *hdr) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + int ret = 0, handler_status; + enum o2net_system_error syserr; + struct o2net_msg_handler *nmh = NULL; + void *ret_data = NULL; + + msglog(hdr, "processing message\n"); + + o2net_sc_postpone_idle(sc); + + switch(be16_to_cpu(hdr->magic)) { + case O2NET_MSG_STATUS_MAGIC: + /* special type for returning message status */ + o2net_complete_nsw(nn, NULL, + be32_to_cpu(hdr->msg_num), + be32_to_cpu(hdr->sys_status), + be32_to_cpu(hdr->status)); + goto out; + case O2NET_MSG_KEEP_REQ_MAGIC: + o2net_sendpage(sc, o2net_keep_resp, + sizeof(*o2net_keep_resp)); + goto out; + case O2NET_MSG_KEEP_RESP_MAGIC: + goto out; + case O2NET_MSG_MAGIC: + break; + default: + msglog(hdr, "bad magic\n"); + ret = -EINVAL; + goto out; + break; + } + + /* find a handler for it */ + handler_status = 0; + nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type), + be32_to_cpu(hdr->key)); + if (!nmh) { + mlog(ML_TCP, "couldn't find handler for type %u key %08x\n", + be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key)); + syserr = O2NET_ERR_NO_HNDLR; + goto out_respond; + } + + syserr = O2NET_ERR_NONE; + + if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len) + syserr = O2NET_ERR_OVERFLOW; + + if (syserr != O2NET_ERR_NONE) + goto out_respond; + + o2net_set_func_start_time(sc); + sc->sc_msg_key = be32_to_cpu(hdr->key); + sc->sc_msg_type = be16_to_cpu(hdr->msg_type); + handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + + be16_to_cpu(hdr->data_len), + nmh->nh_func_data, &ret_data); + o2net_set_func_stop_time(sc); + + o2net_update_recv_stats(sc); + +out_respond: + /* this destroys the hdr, so don't use it after this */ + mutex_lock(&sc->sc_send_lock); + ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, + handler_status); + mutex_unlock(&sc->sc_send_lock); + hdr = NULL; + mlog(0, "sending handler status %d, syserr %d returned %d\n", + handler_status, syserr, ret); + + if (nmh) { + BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL); + if (nmh->nh_post_func) + (nmh->nh_post_func)(handler_status, nmh->nh_func_data, + ret_data); + } + +out: + if (nmh) + o2net_handler_put(nmh); + return ret; +} + +static int o2net_check_handshake(struct o2net_sock_container *sc) +{ + struct o2net_handshake *hand = page_address(sc->sc_page); + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { + mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " + "version %llu but %llu is required, disconnecting\n", + SC_NODEF_ARGS(sc), + (unsigned long long)be64_to_cpu(hand->protocol_version), + O2NET_PROTOCOL_VERSION); + + /* don't bother reconnecting if its the wrong version. */ + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + /* + * Ensure timeouts are consistent with other nodes, otherwise + * we can end up with one node thinking that the other must be down, + * but isn't. This can ultimately cause corruption. + */ + if (be32_to_cpu(hand->o2net_idle_timeout_ms) != + o2net_idle_timeout()) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " + "%u ms, but we use %u ms locally. 
disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_idle_timeout_ms), + o2net_idle_timeout()); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != + o2net_keepalive_delay()) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_keepalive_delay_ms), + o2net_keepalive_delay()); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != + O2HB_MAX_WRITE_TIMEOUT_MS) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + sc->sc_handshake_ok = 1; + + spin_lock(&nn->nn_lock); + /* set valid and queue the idle timers only if it hasn't been + * shut down already */ + if (nn->nn_sc == sc) { + o2net_sc_reset_idle_timer(sc); + atomic_set(&nn->nn_timeout, 0); + o2net_set_nn_state(nn, sc, 1, 0); + } + spin_unlock(&nn->nn_lock); + + /* shift everything up as though it wasn't there */ + sc->sc_page_off -= sizeof(struct o2net_handshake); + if (sc->sc_page_off) + memmove(hand, hand + 1, sc->sc_page_off); + + return 0; +} + +/* this demuxes the queued rx bytes into header or payload bits and calls + * handlers as each full message is read off the socket. it returns -error, + * == 0 eof, or > 0 for progress made.*/ +static int o2net_advance_rx(struct o2net_sock_container *sc) +{ + struct o2net_msg *hdr; + int ret = 0; + void *data; + size_t datalen; + + sclog(sc, "receiving\n"); + o2net_set_advance_start_time(sc); + + if (unlikely(sc->sc_handshake_ok == 0)) { + if(sc->sc_page_off < sizeof(struct o2net_handshake)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_handshake) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + } + + if (sc->sc_page_off == sizeof(struct o2net_handshake)) { + o2net_check_handshake(sc); + if (unlikely(sc->sc_handshake_ok == 0)) + ret = -EPROTO; + } + goto out; + } + + /* do we need more header? */ + if (sc->sc_page_off < sizeof(struct o2net_msg)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_msg) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) { + sc->sc_page_off += ret; + /* only swab incoming here.. we can + * only get here once as we cross from + * being under to over */ + if (sc->sc_page_off == sizeof(struct o2net_msg)) { + hdr = page_address(sc->sc_page); + if (be16_to_cpu(hdr->data_len) > + O2NET_MAX_PAYLOAD_BYTES) + ret = -EOVERFLOW; + } + } + if (ret <= 0) + goto out; + } + + if (sc->sc_page_off < sizeof(struct o2net_msg)) { + /* oof, still don't have a header */ + goto out; + } + + /* this was swabbed above when we first read it */ + hdr = page_address(sc->sc_page); + + msglog(hdr, "at page_off %zu\n", sc->sc_page_off); + + /* do we need more payload? 
*/ + if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) { + /* need more payload */ + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) - + sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + if (ret <= 0) + goto out; + } + + if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) { + /* we can only get here once, the first time we read + * the payload.. so set ret to progress if the handler + * works out. after calling this the message is toast */ + ret = o2net_process_message(sc, hdr); + if (ret == 0) + ret = 1; + sc->sc_page_off = 0; + } + +out: + sclog(sc, "ret = %d\n", ret); + o2net_set_advance_stop_time(sc); + return ret; +} + +/* this work func is triggerd by data ready. it reads until it can read no + * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing + * our work the work struct will be marked and we'll be called again. */ +static void o2net_rx_until_empty(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, sc_rx_work); + int ret; + + do { + ret = o2net_advance_rx(sc); + } while (ret > 0); + + if (ret <= 0 && ret != -EAGAIN) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + sclog(sc, "saw error %d, closing\n", ret); + /* not permanent so read failed handshake can retry */ + o2net_ensure_shutdown(nn, sc, 0); + } + + sc_put(sc); +} + +static int o2net_set_nodelay(struct socket *sock) +{ + int ret, val = 1; + mm_segment_t oldfs; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Dear unsuspecting programmer, + * + * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level + * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will + * silently turn into SO_DEBUG. + * + * Yours, + * Keeper of hilariously fragile interfaces. + */ + ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char __user *)&val, sizeof(val)); + + set_fs(oldfs); + return ret; +} + +static void o2net_initialize_handshake(void) +{ + o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout()); + o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( + o2net_keepalive_delay()); + o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( + o2net_reconnect_delay()); +} + +/* ------------------------------------------------------------ */ + +/* called when a connect completes and after a sock is accepted. the + * rx path will see the response and mark the sc valid */ +static void o2net_sc_connect_completed(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, + sc_connect_work); + + mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n", + (unsigned long long)O2NET_PROTOCOL_VERSION, + (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); + + o2net_initialize_handshake(); + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + sc_put(sc); +} + +/* this is called as a work_struct func. */ +static void o2net_sc_send_keep_req(struct work_struct *work) +{ + struct o2net_sock_container *sc = + container_of(work, struct o2net_sock_container, + sc_keepalive_work.work); + + o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req)); + sc_put(sc); +} + +/* socket shutdown does a del_timer_sync against this as it tears down. 
+ * we can't start this timer until we've got to the point in sc buildup + * where shutdown is going to be involved */ +static void o2net_idle_timer(unsigned long data) +{ + struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + +#ifdef CONFIG_DEBUG_FS + ktime_t now = ktime_get(); +#endif + + printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " + "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), + o2net_idle_timeout() / 1000, + o2net_idle_timeout() % 1000); + +#ifdef CONFIG_DEBUG_FS + mlog(ML_NOTICE, "Here are some times that might help debug the " + "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " + "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", + (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), + (long long)ktime_to_us(sc->sc_tv_data_ready), + (long long)ktime_to_us(sc->sc_tv_advance_start), + (long long)ktime_to_us(sc->sc_tv_advance_stop), + sc->sc_msg_key, sc->sc_msg_type, + (long long)ktime_to_us(sc->sc_tv_func_start), + (long long)ktime_to_us(sc->sc_tv_func_stop)); +#endif + + /* + * Initialize the nn_timeout so that the next connection attempt + * will continue in o2net_start_connect. + */ + atomic_set(&nn->nn_timeout, 1); + + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); +} + +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) +{ + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, + msecs_to_jiffies(o2net_keepalive_delay())); + o2net_set_sock_timer(sc); + mod_timer(&sc->sc_idle_timeout, + jiffies + msecs_to_jiffies(o2net_idle_timeout())); +} + +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) +{ + /* Only push out an existing timer */ + if (timer_pending(&sc->sc_idle_timeout)) + o2net_sc_reset_idle_timer(sc); +} + +/* this work func is kicked whenever a path sets the nn state which doesn't + * have valid set. This includes seeing hb come up, losing a connection, + * having a connect attempt fail, etc. This centralizes the logic which decides + * if a connect attempt should be made or if we should give up and all future + * transmit attempts should fail */ +static void o2net_start_connect(struct work_struct *work) +{ + struct o2net_node *nn = + container_of(work, struct o2net_node, nn_connect_work.work); + struct o2net_sock_container *sc = NULL; + struct o2nm_node *node = NULL, *mynode = NULL; + struct socket *sock = NULL; + struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; + int ret = 0, stop; + unsigned int timeout; + + /* if we're greater we initiate tx, otherwise we accept */ + if (o2nm_this_node() <= o2net_num_from_nn(nn)) + goto out; + + /* watch for racing with tearing a node down */ + node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); + if (node == NULL) { + ret = 0; + goto out; + } + + mynode = o2nm_get_node_by_num(o2nm_this_node()); + if (mynode == NULL) { + ret = 0; + goto out; + } + + spin_lock(&nn->nn_lock); + /* + * see if we already have one pending or have given up. + * For nn_timeout, it is set when we close the connection + * because of the idle time out. So it means that we have + * at least connected to that node successfully once, + * now try to connect to it again. 
+ */ + timeout = atomic_read(&nn->nn_timeout); + stop = (nn->nn_sc || + (nn->nn_persistent_error && + (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); + spin_unlock(&nn->nn_lock); + if (stop) + goto out; + + nn->nn_last_connect_attempt = jiffies; + + sc = sc_alloc(node); + if (sc == NULL) { + mlog(0, "couldn't allocate sc\n"); + ret = -ENOMEM; + goto out; + } + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + mlog(0, "can't create socket: %d\n", ret); + goto out; + } + sc->sc_sock = sock; /* freed by sc_kref_release */ + + sock->sk->sk_allocation = GFP_ATOMIC; + + myaddr.sin_family = AF_INET; + myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; + myaddr.sin_port = htons(0); /* any port */ + + ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + sizeof(myaddr)); + if (ret) { + mlog(ML_ERROR, "bind failed with %d at address %pI4\n", + ret, &mynode->nd_ipv4_address); + goto out; + } + + ret = o2net_set_nodelay(sc->sc_sock); + if (ret) { + mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); + goto out; + } + + o2net_register_callbacks(sc->sc_sock->sk, sc); + + spin_lock(&nn->nn_lock); + /* handshake completion will set nn->nn_sc_valid */ + o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + remoteaddr.sin_family = AF_INET; + remoteaddr.sin_addr.s_addr = node->nd_ipv4_address; + remoteaddr.sin_port = node->nd_ipv4_port; + + ret = sc->sc_sock->ops->connect(sc->sc_sock, + (struct sockaddr *)&remoteaddr, + sizeof(remoteaddr), + O_NONBLOCK); + if (ret == -EINPROGRESS) + ret = 0; + +out: + if (ret) { + mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " + "with errno %d\n", SC_NODEF_ARGS(sc), ret); + /* 0 err so that another will be queued and attempted + * from set_nn_state */ + if (sc) + o2net_ensure_shutdown(nn, sc, 0); + } + if (sc) + sc_put(sc); + if (node) + o2nm_node_put(node); + if (mynode) + o2nm_node_put(mynode); + + return; +} + +static void o2net_connect_expired(struct work_struct *work) +{ + struct o2net_node *nn = + container_of(work, struct o2net_node, nn_connect_expired.work); + + spin_lock(&nn->nn_lock); + if (!nn->nn_sc_valid) { + mlog(ML_ERROR, "no connection established with node %u after " + "%u.%u seconds, giving up and returning errors.\n", + o2net_num_from_nn(nn), + o2net_idle_timeout() / 1000, + o2net_idle_timeout() % 1000); + + o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + } + spin_unlock(&nn->nn_lock); +} + +static void o2net_still_up(struct work_struct *work) +{ + struct o2net_node *nn = + container_of(work, struct o2net_node, nn_still_up.work); + + o2quo_hb_still_up(o2net_num_from_nn(nn)); +} + +/* ------------------------------------------------------------ */ + +void o2net_disconnect_node(struct o2nm_node *node) +{ + struct o2net_node *nn = o2net_nn_from_num(node->nd_num); + + /* don't reconnect until it's heartbeating again */ + spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); + o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + spin_unlock(&nn->nn_lock); + + if (o2net_wq) { + cancel_delayed_work(&nn->nn_connect_expired); + cancel_delayed_work(&nn->nn_connect_work); + cancel_delayed_work(&nn->nn_still_up); + flush_workqueue(o2net_wq); + } +} + +static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, + void *data) +{ + o2quo_hb_down(node_num); + + if (!node) + return; + + if (node_num != o2nm_this_node()) + o2net_disconnect_node(node); + + BUG_ON(atomic_read(&o2net_connected_peers) < 0); +} + +static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, + void 
*data) +{ + struct o2net_node *nn = o2net_nn_from_num(node_num); + + o2quo_hb_up(node_num); + + BUG_ON(!node); + + /* ensure an immediate connect attempt */ + nn->nn_last_connect_attempt = jiffies - + (msecs_to_jiffies(o2net_reconnect_delay()) + 1); + + if (node_num != o2nm_this_node()) { + /* believe it or not, accept and node hearbeating testing + * can succeed for this node before we got here.. so + * only use set_nn_state to clear the persistent error + * if that hasn't already happened */ + spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); + if (nn->nn_persistent_error) + o2net_set_nn_state(nn, NULL, 0, 0); + spin_unlock(&nn->nn_lock); + } +} + +void o2net_unregister_hb_callbacks(void) +{ + o2hb_unregister_callback(NULL, &o2net_hb_up); + o2hb_unregister_callback(NULL, &o2net_hb_down); +} + +int o2net_register_hb_callbacks(void) +{ + int ret; + + o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB, + o2net_hb_node_down_cb, NULL, O2NET_HB_PRI); + o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, + o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); + + ret = o2hb_register_callback(NULL, &o2net_hb_up); + if (ret == 0) + ret = o2hb_register_callback(NULL, &o2net_hb_down); + + if (ret) + o2net_unregister_hb_callbacks(); + + return ret; +} + +/* ------------------------------------------------------------ */ + +static int o2net_accept_one(struct socket *sock) +{ + int ret, slen; + struct sockaddr_in sin; + struct socket *new_sock = NULL; + struct o2nm_node *node = NULL; + struct o2nm_node *local_node = NULL; + struct o2net_sock_container *sc = NULL; + struct o2net_node *nn; + + BUG_ON(sock == NULL); + ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, + sock->sk->sk_protocol, &new_sock); + if (ret) + goto out; + + new_sock->type = sock->type; + new_sock->ops = sock->ops; + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + if (ret < 0) + goto out; + + new_sock->sk->sk_allocation = GFP_ATOMIC; + + ret = o2net_set_nodelay(new_sock); + if (ret) { + mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); + goto out; + } + + slen = sizeof(sin); + ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, + &slen, 1); + if (ret < 0) + goto out; + + node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); + if (node == NULL) { + mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", + &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + ret = -EINVAL; + goto out; + } + + if (o2nm_this_node() >= node->nd_num) { + local_node = o2nm_get_node_by_num(o2nm_this_node()); + mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" + "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", + local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), + node->nd_name, node->nd_num, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + ret = -EINVAL; + goto out; + } + + /* this happens all the time when the other node sees our heartbeat + * and tries to connect before we see their heartbeat */ + if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) { + mlog(ML_CONN, "attempt to connect from node '%s' at " + "%pI4:%d but it isn't heartbeating\n", + node->nd_name, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + ret = -EINVAL; + goto out; + } + + nn = o2net_nn_from_num(node->nd_num); + + spin_lock(&nn->nn_lock); + if (nn->nn_sc) + ret = -EBUSY; + else + ret = 0; + spin_unlock(&nn->nn_lock); + if (ret) { + mlog(ML_NOTICE, "attempt to connect from node '%s' at " + "%pI4:%d but it already has an open connection\n", + node->nd_name, 
&sin.sin_addr.s_addr, + ntohs(sin.sin_port)); + goto out; + } + + sc = sc_alloc(node); + if (sc == NULL) { + ret = -ENOMEM; + goto out; + } + + sc->sc_sock = new_sock; + new_sock = NULL; + + spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); + o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + o2net_register_callbacks(sc->sc_sock->sk, sc); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + + o2net_initialize_handshake(); + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + +out: + if (new_sock) + sock_release(new_sock); + if (node) + o2nm_node_put(node); + if (local_node) + o2nm_node_put(local_node); + if (sc) + sc_put(sc); + return ret; +} + +static void o2net_accept_many(struct work_struct *work) +{ + struct socket *sock = o2net_listen_sock; + while (o2net_accept_one(sock) == 0) + cond_resched(); +} + +static void o2net_listen_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + + read_lock(&sk->sk_callback_lock); + ready = sk->sk_user_data; + if (ready == NULL) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + /* ->sk_data_ready is also called for a newly established child socket + * before it has been accepted and the acceptor has set up their + * data_ready.. we only want to queue listen work for our listening + * socket */ + if (sk->sk_state == TCP_LISTEN) { + mlog(ML_TCP, "bytes: %d\n", bytes); + queue_work(o2net_wq, &o2net_listen_work); + } + +out: + read_unlock(&sk->sk_callback_lock); + ready(sk, bytes); +} + +static int o2net_open_listening_sock(__be32 addr, __be16 port) +{ + struct socket *sock = NULL; + int ret; + struct sockaddr_in sin = { + .sin_family = PF_INET, + .sin_addr = { .s_addr = addr }, + .sin_port = port, + }; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); + goto out; + } + + sock->sk->sk_allocation = GFP_ATOMIC; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = o2net_listen_data_ready; + write_unlock_bh(&sock->sk->sk_callback_lock); + + o2net_listen_sock = sock; + INIT_WORK(&o2net_listen_work, o2net_accept_many); + + sock->sk->sk_reuse = 1; + ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + if (ret < 0) { + mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " + "ret=%d\n", &addr, ntohs(port), ret); + goto out; + } + + ret = sock->ops->listen(sock, 64); + if (ret < 0) { + mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", + &addr, ntohs(port), ret); + } + +out: + if (ret) { + o2net_listen_sock = NULL; + if (sock) + sock_release(sock); + } + return ret; +} + +/* + * called from node manager when we should bring up our network listening + * socket. node manager handles all the serialization to only call this + * once and to match it with o2net_stop_listening(). note, + * o2nm_this_node() doesn't work yet as we're being called while it + * is being set up. + */ +int o2net_start_listening(struct o2nm_node *node) +{ + int ret = 0; + + BUG_ON(o2net_wq != NULL); + BUG_ON(o2net_listen_sock != NULL); + + mlog(ML_KTHREAD, "starting o2net thread...\n"); + o2net_wq = create_singlethread_workqueue("o2net"); + if (o2net_wq == NULL) { + mlog(ML_ERROR, "unable to launch o2net thread\n"); + return -ENOMEM; /* ? 
*/ + } + + ret = o2net_open_listening_sock(node->nd_ipv4_address, + node->nd_ipv4_port); + if (ret) { + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + } else + o2quo_conn_up(node->nd_num); + + return ret; +} + +/* again, o2nm_this_node() doesn't work here as we're involved in + * tearing it down */ +void o2net_stop_listening(struct o2nm_node *node) +{ + struct socket *sock = o2net_listen_sock; + size_t i; + + BUG_ON(o2net_wq == NULL); + BUG_ON(o2net_listen_sock == NULL); + + /* stop the listening socket from generating work */ + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_data_ready = sock->sk->sk_user_data; + sock->sk->sk_user_data = NULL; + write_unlock_bh(&sock->sk->sk_callback_lock); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2nm_node *node = o2nm_get_node_by_num(i); + if (node) { + o2net_disconnect_node(node); + o2nm_node_put(node); + } + } + + /* finish all work and tear down the work queue */ + mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n"); + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + + sock_release(o2net_listen_sock); + o2net_listen_sock = NULL; + + o2quo_conn_err(node->nd_num); +} + +/* ------------------------------------------------------------ */ + +int o2net_init(void) +{ + unsigned long i; + + o2quo_init(); + + if (o2net_debugfs_init()) + return -ENOMEM; + + o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); + o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); + o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); + if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + return -ENOMEM; + } + + o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); + o2net_hand->connector_id = cpu_to_be64(1); + + o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC); + o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2net_node *nn = o2net_nn_from_num(i); + + atomic_set(&nn->nn_timeout, 0); + spin_lock_init(&nn->nn_lock); + INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); + INIT_DELAYED_WORK(&nn->nn_connect_expired, + o2net_connect_expired); + INIT_DELAYED_WORK(&nn->nn_still_up, o2net_still_up); + /* until we see hb from a node we'll return einval */ + nn->nn_persistent_error = -ENOTCONN; + init_waitqueue_head(&nn->nn_sc_wq); + idr_init(&nn->nn_status_idr); + INIT_LIST_HEAD(&nn->nn_status_list); + } + + return 0; +} + +void o2net_exit(void) +{ + o2quo_exit(); + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + o2net_debugfs_exit(); +} diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h new file mode 100644 index 00000000..fd6179eb --- /dev/null +++ b/fs/ocfs2/cluster/tcp.h @@ -0,0 +1,152 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * tcp.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_TCP_H +#define O2CLUSTER_TCP_H + +#include +#ifdef __KERNEL__ +#include +#include +#else +#include +#endif +#include +#include + +struct o2net_msg +{ + __be16 magic; + __be16 data_len; + __be16 msg_type; + __be16 pad1; + __be32 sys_status; + __be32 status; + __be32 key; + __be32 msg_num; + __u8 buf[0]; +}; + +typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +typedef void (o2net_post_msg_handler_func)(int status, void *data, + void *ret_data); + +#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) + +/* same as hb delay, we're waiting for another node to recognize our hb */ +#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 + +#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 +#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 + + +/* TODO: figure this out.... */ +static inline int o2net_link_down(int err, struct socket *sock) +{ + if (sock) { + if (sock->sk->sk_state != TCP_ESTABLISHED && + sock->sk->sk_state != TCP_CLOSE_WAIT) + return 1; + } + + if (err >= 0) + return 0; + switch (err) { + /* ????????????????????????? */ + case -ERESTARTSYS: + case -EBADF: + /* When the server has died, an ICMP port unreachable + * message prompts ECONNREFUSED. */ + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + return 1; + } + return 0; +} + +enum { + O2NET_DRIVER_UNINITED, + O2NET_DRIVER_READY, +}; + +int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, + u8 target_node, int *status); +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, + size_t veclen, u8 target_node, int *status); + +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, + struct list_head *unreg_list); +void o2net_unregister_handler_list(struct list_head *list); + +struct o2nm_node; +int o2net_register_hb_callbacks(void); +void o2net_unregister_hb_callbacks(void); +int o2net_start_listening(struct o2nm_node *node); +void o2net_stop_listening(struct o2nm_node *node); +void o2net_disconnect_node(struct o2nm_node *node); +int o2net_num_connected_peers(void); + +int o2net_init(void); +void o2net_exit(void); + +struct o2net_send_tracking; +struct o2net_sock_container; + +#ifdef CONFIG_DEBUG_FS +int o2net_debugfs_init(void); +void o2net_debugfs_exit(void); +void o2net_debug_add_nst(struct o2net_send_tracking *nst); +void o2net_debug_del_nst(struct o2net_send_tracking *nst); +void o2net_debug_add_sc(struct o2net_sock_container *sc); +void o2net_debug_del_sc(struct o2net_sock_container *sc); +#else +static inline int o2net_debugfs_init(void) +{ + return 0; +} +static inline void o2net_debugfs_exit(void) +{ +} +static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_debug_add_sc(struct o2net_sock_container *sc) +{ +} +static inline void o2net_debug_del_sc(struct o2net_sock_container *sc) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +#endif /* O2CLUSTER_TCP_H */ diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h new file mode 100644 index 00000000..4cbcb657 --- /dev/null +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -0,0 +1,242 @@ +/* -*- mode: c; 
c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_TCP_INTERNAL_H +#define O2CLUSTER_TCP_INTERNAL_H + +#define O2NET_MSG_MAGIC ((u16)0xfa55) +#define O2NET_MSG_STATUS_MAGIC ((u16)0xfa56) +#define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57) +#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58) + +/* we're delaying our quorum decision so that heartbeat will have timed + * out truly dead nodes by the time we come around to making decisions + * on their number */ +#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) + +/* + * This version number represents quite a lot, unfortunately. It not + * only represents the raw network message protocol on the wire but also + * locking semantics of the file system using the protocol. It should + * be somewhere else, I'm sure, but right now it isn't. + * + * With version 11, we separate out the filesystem locking portion. The + * filesystem now has a major.minor version it negotiates. Version 11 + * introduces this negotiation to the o2dlm protocol, and as such the + * version here in tcp_internal.h should not need to be bumped for + * filesystem locking changes. + * + * New in version 11 + * - Negotiation of filesystem locking in the dlm join. + * + * New in version 10: + * - Meta/data locks combined + * + * New in version 9: + * - All votes removed + * + * New in version 8: + * - Replace delete inode votes with a cluster lock + * + * New in version 7: + * - DLM join domain includes the live nodemap + * + * New in version 6: + * - DLM lockres remote refcount fixes. + * + * New in version 5: + * - Network timeout checking protocol + * + * New in version 4: + * - Remove i_generation from lock names for better stat performance. + * + * New in version 3: + * - Replace dentry votes with a cluster lock + * + * New in version 2: + * - full 64 bit i_size in the metadata lock lvbs + * - introduction of "rw" lock and pushing meta/data locking down + */ +#define O2NET_PROTOCOL_VERSION 11ULL +struct o2net_handshake { + __be64 protocol_version; + __be64 connector_id; + __be32 o2hb_heartbeat_timeout_ms; + __be32 o2net_idle_timeout_ms; + __be32 o2net_keepalive_delay_ms; + __be32 o2net_reconnect_delay_ms; +}; + +struct o2net_node { + /* this is never called from int/bh */ + spinlock_t nn_lock; + + /* set the moment an sc is allocated and a connect is started */ + struct o2net_sock_container *nn_sc; + /* _valid is only set after the handshake passes and tx can happen */ + unsigned nn_sc_valid:1; + /* if this is set tx just returns it */ + int nn_persistent_error; + /* It is only set to 1 after the idle time out. */ + atomic_t nn_timeout; + + /* threads waiting for an sc to arrive wait on the wq for generation + * to increase. 
it is increased when a connecting socket succeeds + * or fails or when an accepted socket is attached. */ + wait_queue_head_t nn_sc_wq; + + struct idr nn_status_idr; + struct list_head nn_status_list; + + /* connects are attempted from when heartbeat comes up until either hb + * goes down, the node is unconfigured, no connect attempts succeed + * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work + * is queued from set_nn_state both from hb up and from itself if a + * connect attempt fails and so can be self-arming. shutdown is + * careful to first mark the nn such that no connects will be attempted + * before canceling delayed connect work and flushing the queue. */ + struct delayed_work nn_connect_work; + unsigned long nn_last_connect_attempt; + + /* this is queued as nodes come up and is canceled when a connection is + * established. this expiring gives up on the node and errors out + * transmits */ + struct delayed_work nn_connect_expired; + + /* after we give up on a socket we wait a while before deciding + * that it is still heartbeating and that we should do some + * quorum work */ + struct delayed_work nn_still_up; +}; + +struct o2net_sock_container { + struct kref sc_kref; + /* the next two are valid for the life time of the sc */ + struct socket *sc_sock; + struct o2nm_node *sc_node; + + /* all of these sc work structs hold refs on the sc while they are + * queued. they should not be able to ref a freed sc. the teardown + * race is with o2net_wq destruction in o2net_stop_listening() */ + + /* rx and connect work are generated from socket callbacks. sc + * shutdown removes the callbacks and then flushes the work queue */ + struct work_struct sc_rx_work; + struct work_struct sc_connect_work; + /* shutdown work is triggered in two ways. the simple way is + * for a code path calls ensure_shutdown which gets a lock, removes + * the sc from the nn, and queues the work. in this case the + * work is single-shot. the work is also queued from a sock + * callback, though, and in this case the work will find the sc + * still on the nn and will call ensure_shutdown itself.. this + * ends up triggering the shutdown work again, though nothing + * will be done in that second iteration. so work queue teardown + * has to be careful to remove the sc from the nn before waiting + * on the work queue so that the shutdown work doesn't remove the + * sc and rearm itself. 
+ */ + struct work_struct sc_shutdown_work; + + struct timer_list sc_idle_timeout; + struct delayed_work sc_keepalive_work; + + unsigned sc_handshake_ok:1; + + struct page *sc_page; + size_t sc_page_off; + + /* original handlers for the sockets */ + void (*sc_state_change)(struct sock *sk); + void (*sc_data_ready)(struct sock *sk, int bytes); + + u32 sc_msg_key; + u16 sc_msg_type; + +#ifdef CONFIG_DEBUG_FS + struct list_head sc_net_debug_item; + ktime_t sc_tv_timer; + ktime_t sc_tv_data_ready; + ktime_t sc_tv_advance_start; + ktime_t sc_tv_advance_stop; + ktime_t sc_tv_func_start; + ktime_t sc_tv_func_stop; +#endif +#ifdef CONFIG_OCFS2_FS_STATS + ktime_t sc_tv_acquiry_total; + ktime_t sc_tv_send_total; + ktime_t sc_tv_status_total; + u32 sc_send_count; + u32 sc_recv_count; + ktime_t sc_tv_process_total; +#endif + struct mutex sc_send_lock; +}; + +struct o2net_msg_handler { + struct rb_node nh_node; + u32 nh_max_len; + u32 nh_msg_type; + u32 nh_key; + o2net_msg_handler_func *nh_func; + o2net_msg_handler_func *nh_func_data; + o2net_post_msg_handler_func + *nh_post_func; + struct kref nh_kref; + struct list_head nh_unregister_item; +}; + +enum o2net_system_error { + O2NET_ERR_NONE = 0, + O2NET_ERR_NO_HNDLR, + O2NET_ERR_OVERFLOW, + O2NET_ERR_DIED, + O2NET_ERR_MAX +}; + +struct o2net_status_wait { + enum o2net_system_error ns_sys_status; + s32 ns_status; + int ns_id; + wait_queue_head_t ns_wq; + struct list_head ns_node_item; +}; + +#ifdef CONFIG_DEBUG_FS +/* just for state dumps */ +struct o2net_send_tracking { + struct list_head st_net_debug_item; + struct task_struct *st_task; + struct o2net_sock_container *st_sc; + u32 st_id; + u32 st_msg_type; + u32 st_msg_key; + u8 st_node; + ktime_t st_sock_time; + ktime_t st_send_time; + ktime_t st_status_time; +}; +#else +struct o2net_send_tracking { + u32 dummy; +}; +#endif /* CONFIG_DEBUG_FS */ + +#endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c new file mode 100644 index 00000000..a56eee6a --- /dev/null +++ b/fs/ocfs2/cluster/ver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include + +#include "ver.h" + +#define CLUSTER_BUILD_VERSION "1.5.0" + +#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION + +void cluster_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h new file mode 100644 index 00000000..32554c33 --- /dev/null +++ b/fs/ocfs2/cluster/ver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_VER_H +#define O2CLUSTER_VER_H + +void cluster_print_version(void); + +#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c new file mode 100644 index 00000000..e5ba3481 --- /dev/null +++ b/fs/ocfs2/dcache.c @@ -0,0 +1,537 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.c + * + * dentry cache handling code + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "super.h" +#include "ocfs2_trace.h" + +void ocfs2_dentry_attach_gen(struct dentry *dentry) +{ + unsigned long gen = + OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + BUG_ON(dentry->d_inode); + dentry->d_fsdata = (void *)gen; +} + + +static int ocfs2_dentry_revalidate(struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + int ret = 0; /* if all else fails, just return false */ + struct ocfs2_super *osb; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + osb = OCFS2_SB(dentry->d_sb); + + trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, + dentry->d_name.name); + + /* For a negative dentry - + * check the generation number of the parent and compare with the + * one stored in the inode. + */ + if (inode == NULL) { + unsigned long gen = (unsigned long) dentry->d_fsdata; + unsigned long pgen = + OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + + trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, + dentry->d_name.name, + pgen, gen); + if (gen != pgen) + goto bail; + goto valid; + } + + BUG_ON(!osb); + + if (inode == osb->root_inode || is_bad_inode(inode)) + goto bail; + + spin_lock(&OCFS2_I(inode)->ip_lock); + /* did we or someone else delete this inode? */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + trace_ocfs2_dentry_revalidate_delete( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* + * We don't need a cluster lock to test this because once an + * inode nlink hits zero, it never goes back. + */ + if (inode->i_nlink == 0) { + trace_ocfs2_dentry_revalidate_orphaned( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + S_ISDIR(inode->i_mode)); + goto bail; + } + + /* + * If the last lookup failed to create dentry lock, let us + * redo it. + */ + if (!dentry->d_fsdata) { + trace_ocfs2_dentry_revalidate_nofsdata( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + +valid: + ret = 1; + +bail: + trace_ocfs2_dentry_revalidate_ret(ret); + return ret; +} + +static int ocfs2_match_dentry(struct dentry *dentry, + u64 parent_blkno, + int skip_unhashed) +{ + struct inode *parent; + + /* + * ocfs2_lookup() does a d_splice_alias() _before_ attaching + * to the lock data, so we skip those here, otherwise + * ocfs2_dentry_attach_lock() will get its original dentry + * back. + */ + if (!dentry->d_fsdata) + return 0; + + if (!dentry->d_parent) + return 0; + + if (skip_unhashed && d_unhashed(dentry)) + return 0; + + parent = dentry->d_parent->d_inode; + /* Negative parent dentry? */ + if (!parent) + return 0; + + /* Name is in a different directory. */ + if (OCFS2_I(parent)->ip_blkno != parent_blkno) + return 0; + + return 1; +} + +/* + * Walk the inode alias list, and find a dentry which has a given + * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it + * is looking for a dentry_lock reference. The downconvert thread is + * looking to unhash aliases, so we allow it to skip any that already + * have that property. 
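ocfs2_dentry_attach_gen() and the negative-dentry branch of ocfs2_dentry_revalidate() above implement a simple versioning scheme: a negative dentry remembers the parent directory's lock generation, and becomes stale once that generation moves on. A self-contained sketch of the idea (userspace, hypothetical names, not the kernel structures):

#include <stdio.h>

/* Per-directory generation; the kernel keeps this in ip_dir_lock_gen and
 * bumps it when the directory's cluster lock is dropped, invalidating any
 * cached negative lookups made under the old lock. */
struct dir {
        unsigned long lock_gen;
};

struct negative_dentry {
        unsigned long cached_gen;       /* generation seen at lookup time */
};

static void attach_gen(struct negative_dentry *d, const struct dir *parent)
{
        d->cached_gen = parent->lock_gen;
}

static int still_valid(const struct negative_dentry *d, const struct dir *parent)
{
        return d->cached_gen == parent->lock_gen;
}

int main(void)
{
        struct dir parent = { .lock_gen = 1 };
        struct negative_dentry nd;

        attach_gen(&nd, &parent);
        printf("valid before: %d\n", still_valid(&nd, &parent));
        parent.lock_gen++;              /* another node changed the directory */
        printf("valid after:  %d\n", still_valid(&nd, &parent));
        return 0;
}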
+ */ +struct dentry *ocfs2_find_local_alias(struct inode *inode, + u64 parent_blkno, + int skip_unhashed) +{ + struct list_head *p; + struct dentry *dentry = NULL; + + spin_lock(&inode->i_lock); + list_for_each(p, &inode->i_dentry) { + dentry = list_entry(p, struct dentry, d_alias); + + spin_lock(&dentry->d_lock); + if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { + trace_ocfs2_find_local_alias(dentry->d_name.len, + dentry->d_name.name); + + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + break; + } + spin_unlock(&dentry->d_lock); + + dentry = NULL; + } + + spin_unlock(&inode->i_lock); + + return dentry; +} + +DEFINE_SPINLOCK(dentry_attach_lock); + +/* + * Attach this dentry to a cluster lock. + * + * Dentry locks cover all links in a given directory to a particular + * inode. We do this so that ocfs2 can build a lock name which all + * nodes in the cluster can agree on at all times. Shoving full names + * in the cluster lock won't work due to size restrictions. Covering + * links inside of a directory is a good compromise because it still + * allows us to use the parent directory lock to synchronize + * operations. + * + * Call this function with the parent dir semaphore and the parent dir + * cluster lock held. + * + * The dir semaphore will protect us from having to worry about + * concurrent processes on our node trying to attach a lock at the + * same time. + * + * The dir cluster lock (held at either PR or EX mode) protects us + * from unlink and rename on other nodes. + * + * A dput() can happen asynchronously due to pruning, so we cover + * attaching and detaching the dentry lock with a + * dentry_attach_lock. + * + * A node which has done lookup on a name retains a protected read + * lock until final dput. If the user requests and unlink or rename, + * the protected read is upgraded to an exclusive lock. Other nodes + * who have seen the dentry will then be informed that they need to + * downgrade their lock, which will involve d_delete on the + * dentry. This happens in ocfs2_dentry_convert_worker(). + */ +int ocfs2_dentry_attach_lock(struct dentry *dentry, + struct inode *inode, + u64 parent_blkno) +{ + int ret; + struct dentry *alias; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, dl); + + /* + * Negative dentry. We ignore these for now. + * + * XXX: Could we can improve ocfs2_dentry_revalidate() by + * tracking these? + */ + if (!inode) + return 0; + + if (!dentry->d_inode && dentry->d_fsdata) { + /* Converting a negative dentry to positive + Clear dentry->d_fsdata */ + dentry->d_fsdata = dl = NULL; + } + + if (dl) { + mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, + " \"%.*s\": old parent: %llu, new: %llu\n", + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, + (unsigned long long)dl->dl_parent_blkno); + return 0; + } + + alias = ocfs2_find_local_alias(inode, parent_blkno, 0); + if (alias) { + /* + * Great, an alias exists, which means we must have a + * dentry lock already. We can just grab the lock off + * the alias and add it to the list. + * + * We're depending here on the fact that this dentry + * was found and exists in the dcache and so must have + * a reference to the dentry_lock because we can't + * race creates. Final dput() cannot happen on it + * since we have it pinned, so our reference is safe. 
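The alias-reuse path described above is what lets every hard link to an inode within one directory share a single cluster lock: the first name allocates the ocfs2_dentry_lock, later names just take a reference on it. A toy refcount model of that sharing (userspace sketch, hypothetical names, no locking):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* One lock object shared by every name in a given directory that points
 * at the same inode (loosely mirrors ocfs2_dentry_lock / dl_count). */
struct dentry_lock {
        uint64_t parent_blkno;
        unsigned int count;
};

/* Attach: reuse an existing alias's lock if one exists, else allocate. */
static struct dentry_lock *attach_lock(struct dentry_lock *existing,
                                       uint64_t parent_blkno)
{
        struct dentry_lock *dl = existing;

        if (!dl) {
                dl = calloc(1, sizeof(*dl));
                if (!dl)
                        exit(1);
                dl->parent_blkno = parent_blkno;
        }
        dl->count++;
        return dl;
}

/* Put: drop the cluster lock only when the last name goes away. */
static void put_lock(struct dentry_lock *dl)
{
        if (--dl->count == 0) {
                printf("last ref gone, dropping lock for dir %llu\n",
                       (unsigned long long)dl->parent_blkno);
                free(dl);
        }
}

int main(void)
{
        struct dentry_lock *a = attach_lock(NULL, 500); /* first lookup */
        struct dentry_lock *b = attach_lock(a, 500);    /* hard link, same dir */

        printf("shared: %d, count: %u\n", a == b, a->count);
        put_lock(b);
        put_lock(a);
        return 0;
}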
+ */ + dl = alias->d_fsdata; + mlog_bug_on_msg(!dl, "parent %llu, ino %llu\n", + (unsigned long long)parent_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, + " \"%.*s\": old parent: %llu, new: %llu\n", + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, + (unsigned long long)dl->dl_parent_blkno); + + trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name, + (unsigned long long)parent_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + goto out_attach; + } + + /* + * There are no other aliases + */ + dl = kmalloc(sizeof(*dl), GFP_NOFS); + if (!dl) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + dl->dl_count = 0; + /* + * Does this have to happen below, for all attaches, in case + * the struct inode gets blown away by the downconvert thread? + */ + dl->dl_inode = igrab(inode); + dl->dl_parent_blkno = parent_blkno; + ocfs2_dentry_lock_res_init(dl, parent_blkno, inode); + +out_attach: + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = dl; + dl->dl_count++; + spin_unlock(&dentry_attach_lock); + + /* + * This actually gets us our PRMODE level lock. From now on, + * we'll have a notification if one of these names is + * destroyed on another node. + */ + ret = ocfs2_dentry_lock(dentry, 0); + if (!ret) + ocfs2_dentry_unlock(dentry, 0); + else + mlog_errno(ret); + + /* + * In case of error, manually free the allocation and do the iput(). + * We need to do this because error here means no d_instantiate(), + * which means iput() will not be called during dput(dentry). + */ + if (ret < 0 && !alias) { + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); + } + + dput(alias); + + return ret; +} + +DEFINE_SPINLOCK(dentry_list_lock); + +/* We limit the number of dentry locks to drop in one go. We have + * this limit so that we don't starve other users of ocfs2_wq. */ +#define DL_INODE_DROP_COUNT 64 + +/* Drop inode references from dentry locks */ +static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) +{ + struct ocfs2_dentry_lock *dl; + + spin_lock(&dentry_list_lock); + while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { + dl = osb->dentry_lock_list; + osb->dentry_lock_list = dl->dl_next; + spin_unlock(&dentry_list_lock); + iput(dl->dl_inode); + kfree(dl); + spin_lock(&dentry_list_lock); + } + spin_unlock(&dentry_list_lock); +} + +void ocfs2_drop_dl_inodes(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dentry_lock_work); + + __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); + /* + * Don't queue dropping if umount is in progress. We flush the + * list in ocfs2_dismount_volume + */ + spin_lock(&dentry_list_lock); + if (osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) + queue_work(ocfs2_wq, &osb->dentry_lock_work); + spin_unlock(&dentry_list_lock); +} + +/* Flush the whole work queue */ +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) +{ + __ocfs2_drop_dl_inodes(osb, -1); +} + +/* + * ocfs2_dentry_iput() and friends. + * + * At this point, our particular dentry is detached from the inodes + * alias list, so there's no way that the locking code can find it. + * + * The interesting stuff happens when we determine that our lock needs + * to go away because this is the last subdir alias in the + * system. 
This function needs to handle a couple things: + * + * 1) Synchronizing lock shutdown with the downconvert threads. This + * is already handled for us via the lockres release drop function + * called in ocfs2_release_dentry_lock() + * + * 2) A race may occur when we're doing our lock shutdown and + * another process wants to create a new dentry lock. Right now we + * let them race, which means that for a very short while, this + * node might have two locks on a lock resource. This should be a + * problem though because one of them is in the process of being + * thrown out. + */ +static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl) +{ + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + + /* We leave dropping of inode reference to ocfs2_wq as that can + * possibly lead to inode deletion which gets tricky */ + spin_lock(&dentry_list_lock); + if (!osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) + queue_work(ocfs2_wq, &osb->dentry_lock_work); + dl->dl_next = osb->dentry_lock_list; + osb->dentry_lock_list = dl; + spin_unlock(&dentry_list_lock); +} + +void ocfs2_dentry_lock_put(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl) +{ + int unlock; + + BUG_ON(dl->dl_count == 0); + + spin_lock(&dentry_attach_lock); + dl->dl_count--; + unlock = !dl->dl_count; + spin_unlock(&dentry_attach_lock); + + if (unlock) + ocfs2_drop_dentry_lock(osb, dl); +} + +static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + if (!dl) { + /* + * No dentry lock is ok if we're disconnected or + * unhashed. + */ + if (!(dentry->d_flags & DCACHE_DISCONNECTED) && + !d_unhashed(dentry)) { + unsigned long long ino = 0ULL; + if (inode) + ino = (unsigned long long)OCFS2_I(inode)->ip_blkno; + mlog(ML_ERROR, "Dentry is missing cluster lock. " + "inode: %llu, d_flags: 0x%x, d_name: %.*s\n", + ino, dentry->d_flags, dentry->d_name.len, + dentry->d_name.name); + } + + goto out; + } + + mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n", + dentry->d_name.len, dentry->d_name.name, + dl->dl_count); + + ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl); + +out: + iput(inode); +} + +/* + * d_move(), but keep the locks in sync. + * + * When we are done, "dentry" will have the parent dir and name of + * "target", which will be thrown away. + * + * We manually update the lock of "dentry" if need be. + * + * "target" doesn't have it's dentry lock touched - we allow the later + * dput() to handle this for us. + * + * This is called during ocfs2_rename(), while holding parent + * directory locks. The dentries have already been deleted on other + * nodes via ocfs2_remote_dentry_delete(). + * + * Normally, the VFS handles the d_move() for the file system, after + * the ->rename() callback. OCFS2 wants to handle this internally, so + * the new lock can be created atomically with respect to the cluster. + */ +void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, + struct inode *old_dir, struct inode *new_dir) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb); + struct inode *inode = dentry->d_inode; + + /* + * Move within the same directory, so the actual lock info won't + * change. + * + * XXX: Is there any advantage to dropping the lock here? 
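ocfs2_drop_dentry_lock() above defers the final iput() to a workqueue by chaining the lock onto osb->dentry_lock_list, and __ocfs2_drop_dl_inodes() drains that list in bounded batches so it does not starve other users of ocfs2_wq. A single-threaded sketch of the same push/drain pattern (hypothetical names, no spinlock or workqueue, purely illustrative):

#include <stdio.h>
#include <stdlib.h>

#define DROP_BATCH 64   /* plays the role of DL_INODE_DROP_COUNT */

struct deferred {
        struct deferred *next;
        int inode_id;           /* stands in for the inode reference to put */
};

static struct deferred *pending;        /* head of the deferred-drop list */

static void defer_drop(int inode_id)
{
        struct deferred *d = malloc(sizeof(*d));

        if (!d)
                exit(1);
        d->inode_id = inode_id;
        d->next = pending;      /* push onto the head, like dl_next */
        pending = d;
}

/* One "work item": process at most 'batch' entries, then report whether
 * more remain so the caller can requeue itself. */
static int drop_some(int batch)
{
        while (pending && batch--) {
                struct deferred *d = pending;

                pending = d->next;
                printf("iput(inode %d)\n", d->inode_id);
                free(d);
        }
        return pending != NULL;
}

int main(void)
{
        for (int i = 0; i < 150; i++)
                defer_drop(i);
        while (drop_some(DROP_BATCH))
                ;       /* the kernel requeues the work on ocfs2_wq instead */
        return 0;
}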
+ */ + if (old_dir == new_dir) + goto out_move; + + ocfs2_dentry_lock_put(osb, dentry->d_fsdata); + + dentry->d_fsdata = NULL; + ret = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(new_dir)->ip_blkno); + if (ret) + mlog_errno(ret); + +out_move: + d_move(dentry, target); +} + +const struct dentry_operations ocfs2_dentry_ops = { + .d_revalidate = ocfs2_dentry_revalidate, + .d_iput = ocfs2_dentry_iput, +}; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h new file mode 100644 index 00000000..b79eff70 --- /dev/null +++ b/fs/ocfs2/dcache.h @@ -0,0 +1,69 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_DCACHE_H +#define OCFS2_DCACHE_H + +extern const struct dentry_operations ocfs2_dentry_ops; + +struct ocfs2_dentry_lock { + /* Use count of dentry lock */ + unsigned int dl_count; + union { + /* Linked list of dentry locks to release */ + struct ocfs2_dentry_lock *dl_next; + u64 dl_parent_blkno; + }; + + /* + * The ocfs2_dentry_lock keeps an inode reference until + * dl_lockres has been destroyed. This is usually done in + * ->d_iput() anyway, so there should be minimal impact. + */ + struct inode *dl_inode; + struct ocfs2_lock_res dl_lockres; +}; + +int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, + u64 parent_blkno); + +extern spinlock_t dentry_list_lock; + +void ocfs2_dentry_lock_put(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl); + +void ocfs2_drop_dl_inodes(struct work_struct *work); +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); + +struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, + int skip_unhashed); + +void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, + struct inode *old_dir, struct inode *new_dir); + +extern spinlock_t dentry_attach_lock; +void ocfs2_dentry_attach_gen(struct dentry *dentry); + +#endif /* OCFS2_DCACHE_H */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c new file mode 100644 index 00000000..8582e3f4 --- /dev/null +++ b/fs/ocfs2/dir.c @@ -0,0 +1,4555 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c + * + * Creates, reads, walks and deletes directory-nodes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linux Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static unsigned char ocfs2_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ocfs2_do_extend_dir(struct super_block *sb, + handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh); +static int ocfs2_dir_indexed(struct inode *inode); + +/* + * These are distinct checks because future versions of the file system will + * want to have a trailing dirent structure independent of indexing. + */ +static int ocfs2_supports_dir_trailer(struct inode *dir) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir); +} + +/* + * "new' here refers to the point at which we're creating a new + * directory via "mkdir()", but also when we're expanding an inline + * directory. In either case, we don't yet have the indexing bit set + * on the directory, so the standard checks will fail in when metaecc + * is turned off. Only directory-initialization type functions should + * use this then. Everything else wants ocfs2_supports_dir_trailer() + */ +static int ocfs2_new_dir_wants_trailer(struct inode *dir) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + return ocfs2_meta_ecc(osb) || + ocfs2_supports_indexed_dirs(osb); +} + +static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) +{ + return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer); +} + +#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb)))) + +/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make + * them more consistent? 
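The helpers above pin the trailer to the last sizeof(struct ocfs2_dir_block_trailer) bytes of every directory block, and the dirent walk simply treats an entry that starts at that offset as the end of usable space. A compact sketch of that placement arithmetic (userspace; the 64-byte trailer size is only an illustrative stand-in for the real struct size):

#include <stdio.h>
#include <stddef.h>

/* The trailer occupies the last 'trailer_size' bytes of a directory block. */
static size_t trailer_off(size_t blocksize, size_t trailer_size)
{
        return blocksize - trailer_size;
}

/* A dirent walk treats an entry starting at the trailer offset as the
 * end of the block rather than a real name. */
static int is_trailer(size_t dirent_off, size_t blocksize, size_t trailer_size)
{
        return dirent_off == trailer_off(blocksize, trailer_size);
}

int main(void)
{
        size_t bs = 4096, tsz = 64;     /* illustrative sizes only */

        printf("trailer begins at byte %zu of a %zu-byte block\n",
               trailer_off(bs, tsz), bs);
        printf("dirent at that offset is the trailer: %d\n",
               is_trailer(trailer_off(bs, tsz), bs, tsz));
        return 0;
}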
*/ +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data) +{ + char *p = data; + + p += blocksize - sizeof(struct ocfs2_dir_block_trailer); + return (struct ocfs2_dir_block_trailer *)p; +} + +/* + * XXX: This is executed once on every dirent. We should consider optimizing + * it. + */ +static int ocfs2_skip_dir_trailer(struct inode *dir, + struct ocfs2_dir_entry *de, + unsigned long offset, + unsigned long blklen) +{ + unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); + + if (!ocfs2_supports_dir_trailer(dir)) + return 0; + + if (offset != toff) + return 0; + + return 1; +} + +static void ocfs2_init_dir_trailer(struct inode *inode, + struct buffer_head *bh, u16 rec_len) +{ + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, inode->i_sb); + strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE); + trailer->db_compat_rec_len = + cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); + trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + trailer->db_blkno = cpu_to_le64(bh->b_blocknr); + trailer->db_free_rec_len = cpu_to_le16(rec_len); +} +/* + * Link an unindexed block with a dir trailer structure into the index free + * list. This function will modify dirdata_bh, but assumes you've already + * passed it to the journal. + */ +static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle, + struct buffer_head *dx_root_bh, + struct buffer_head *dirdata_bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer; + + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res) +{ + return res->dl_prev_leaf_bh == NULL; +} + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) +{ + brelse(res->dl_dx_root_bh); + brelse(res->dl_leaf_bh); + brelse(res->dl_dx_leaf_bh); + brelse(res->dl_prev_leaf_bh); +} + +static int ocfs2_dir_indexed(struct inode *inode) +{ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL) + return 1; + return 0; +} + +static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root) +{ + return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE; +} + +/* + * Hashing code adapted from ext3 + */ +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + +static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void 
ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len, + struct ocfs2_dx_hinfo *hinfo) +{ + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + const char *p; + __u32 in[8], buf[4]; + + /* + * XXX: Is this really necessary, if the index is never looked + * at by readdir? Is a hash value of '0' a bad idea? + */ + if ((len == 1 && !strncmp(".", name, 1)) || + (len == 2 && !strncmp("..", name, 2))) { + buf[0] = buf[1] = 0; + goto out; + } + +#ifdef OCFS2_DEBUG_DX_DIRS + /* + * This makes it very easy to debug indexing problems. We + * should never allow this to be selected without hand editing + * this file though. + */ + buf[0] = buf[1] = len; + goto out; +#endif + + memcpy(buf, osb->osb_dx_seed, sizeof(buf)); + + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + +out: + hinfo->major_hash = buf[0]; + hinfo->minor_hash = buf[1]; +} + +/* + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. + */ +static int ocfs2_check_dir_entry(struct inode * dir, + struct ocfs2_dir_entry * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char *error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + + if (unlikely(rlen < OCFS2_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; + else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely( + ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)) + error_msg = "directory entry across blocks"; + + if (unlikely(error_msg != NULL)) + mlog(ML_ERROR, "bad entry in directory #%llu: %s - " + "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg, + offset, (unsigned long long)le64_to_cpu(de->inode), rlen, + de->name_len); + + return error_msg == NULL ? 
1 : 0; +} + +static inline int ocfs2_match(int len, + const char * const name, + struct ocfs2_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int ocfs2_search_dirblock(struct buffer_head *bh, + struct inode *dir, + const char *name, int namelen, + unsigned long offset, + char *first_de, + unsigned int bytes, + struct ocfs2_dir_entry **res_dir) +{ + struct ocfs2_dir_entry *de; + char *dlimit, *de_buf; + int de_len; + int ret = 0; + + de_buf = first_de; + dlimit = de_buf + bytes; + + while (de_buf < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + de = (struct ocfs2_dir_entry *) de_buf; + + if (de_buf + namelen <= dlimit && + ocfs2_match(namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + ret = -1; + goto bail; + } + *res_dir = de; + ret = 1; + goto bail; + } + + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) { + ret = -1; + goto bail; + } + + de_buf += de_len; + offset += de_len; + } + +bail: + trace_ocfs2_search_dirblock(ret); + return ret; +} + +static struct buffer_head *ocfs2_find_entry_id(const char *name, + int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir) +{ + int ret, found; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_inline_data *data; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + data = &di->id2.i_data; + + found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0, + data->id_data, i_size_read(dir), res_dir); + if (found == 1) + return di_bh; + + brelse(di_bh); +out: + return NULL; +} + +static int ocfs2_validate_dir_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(bh, sb); + + + /* + * We don't validate dirents here, that's handled + * in-place when the code walks them. + */ + trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + * + * Note that we are safe to call this even if the directory + * doesn't have a trailer. Filesystems without metaecc will do + * nothing, and filesystems with it will have one. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check); + if (rc) + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + + return rc; +} + +/* + * Validate a directory trailer. + * + * We check the trailer here rather than in ocfs2_validate_dir_block() + * because that function doesn't have the inode to test. 
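The comment above introduces ocfs2_check_dir_trailer(), which verifies three things about each trailer: its signature, that the block's self-recorded address matches where it was actually read from, and that it belongs to the directory being walked. A userspace sketch of those checks (field layout and the signature string shown are illustrative, not the on-disk format):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Just the fields the trailer check cares about. */
struct trailer {
        char     signature[8];
        uint64_t blkno;         /* block's own address, as recorded on disk */
        uint64_t parent_dinode; /* inode number of the owning directory */
};

static int check_trailer(const struct trailer *t, uint64_t read_from_blkno,
                         uint64_t dir_ino)
{
        /* the real signature is OCFS2_DIR_TRAILER_SIGNATURE */
        if (memcmp(t->signature, "DIRTRL1", 8) != 0)
                return -1;      /* not a trailer at all */
        if (t->blkno != read_from_blkno)
                return -1;      /* block claims to live somewhere else */
        if (t->parent_dinode != dir_ino)
                return -1;      /* block belongs to a different directory */
        return 0;
}

int main(void)
{
        struct trailer t = { "DIRTRL1", 4096, 42 };

        printf("good:        %d\n", check_trailer(&t, 4096, 42));
        printf("wrong block: %d\n", check_trailer(&t, 4097, 42));
        return 0;
}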
+ */ +static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); + if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: " + "signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); + goto out; + } + if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid " + "db_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } + if (le64_to_cpu(trailer->db_parent_dinode) != + OCFS2_I(dir)->ip_blkno) { + rc = -EINVAL; + ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode " + "#%llu has an invalid parent_dinode " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } +out: + return rc; +} + +/* + * This function forces all errors to -EIO for consistency with its + * predecessor, ocfs2_bread(). We haven't audited what returning the + * real error codes would do to callers. We log the real codes with + * mlog_errno() before we squash them. + */ +static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, int flags) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, + ocfs2_validate_dir_block); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (!(flags & OCFS2_BH_READAHEAD) && + ocfs2_supports_dir_trailer(inode)) { + rc = ocfs2_check_dir_trailer(inode, tmp); + if (rc) { + if (!*bh) + brelse(tmp); + mlog_errno(rc); + goto out; + } + } + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ + if (!*bh) + *bh = tmp; + +out: + return rc ? -EIO : 0; +} + +/* + * Read the block at 'phys' which belongs to this directory + * inode. This function does no virtual->physical block translation - + * what's passed in is assumed to be a valid directory block. 
+ */ +static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys, + struct buffer_head **bh) +{ + int ret; + struct buffer_head *tmp = *bh; + + ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp, + ocfs2_validate_dir_block); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ocfs2_supports_dir_trailer(dir)) { + ret = ocfs2_check_dir_trailer(dir, tmp); + if (ret) { + if (!*bh) + brelse(tmp); + mlog_errno(ret); + goto out; + } + } + + if (!ret && !*bh) + *bh = tmp; +out: + return ret; +} + +static int ocfs2_validate_dx_root(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_root_block *dx_root; + + BUG_ON(!buffer_uptodate(bh)); + + dx_root = (struct ocfs2_dx_root_block *) bh->b_data; + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index root block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { + ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, + struct buffer_head **dx_root_bh) +{ + int ret; + u64 blkno = le64_to_cpu(di->i_dx_root); + struct buffer_head *tmp = *dx_root_bh; + + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_root); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!ret && !*dx_root_bh) + *dx_root_bh = tmp; + + return ret; +} + +static int ocfs2_validate_dx_leaf(struct super_block *sb, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check); + if (ret) { + mlog(ML_ERROR, + "Checksum failed for dir index leaf block %llu\n", + (unsigned long long)bh->b_blocknr); + return ret; + } + + if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { + ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", + 7, dx_leaf->dl_signature); + return -EROFS; + } + + return 0; +} + +static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, + struct buffer_head **dx_leaf_bh) +{ + int ret; + struct buffer_head *tmp = *dx_leaf_bh; + + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_leaf); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!ret && !*dx_leaf_bh) + *dx_leaf_bh = tmp; + + return ret; +} + +/* + * Read a series of dx_leaf blocks. This expects all buffer_head + * pointers to be NULL on function entry. 
+ */ +static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num, + struct buffer_head **dx_leaf_bhs) +{ + int ret; + + ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0, + ocfs2_validate_dx_leaf); + if (ret) + mlog_errno(ret); + + return ret; +} + +static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + + sb = dir->i_sb; + + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + start = OCFS2_I(dir)->ip_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; + +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + + bh = NULL; + err = ocfs2_read_dir_block(dir, b++, &bh, + OCFS2_BH_READAHEAD); + bh_use[ra_max] = bh; + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + if (ocfs2_read_dir_block(dir, block, &bh, 0)) { + /* read error, skip block & hope for the best. + * ocfs2_read_dir_block() has released the bh. */ + ocfs2_error(dir->i_sb, "reading directory %llu, " + "offset %lu\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, + block); + goto next; + } + i = ocfs2_search_dirblock(bh, dir, name, namelen, + block << sb->s_blocksize_bits, + bh->b_data, sb->s_blocksize, + res_dir); + if (i == 1) { + OCFS2_I(dir)->ip_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. 
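ocfs2_find_entry_el() above walks the directory's blocks as a ring: it starts at the block cached in ip_dir_start_lookup, wraps past the end, stops when it returns to where it began, and then re-checks any blocks appended while it was searching. Stripped of readahead and that final grow re-check, the core walk reduces to a sketch like this (hypothetical data, userspace):

#include <stdio.h>
#include <string.h>

#define NBLOCKS 8

/* One fake "directory block" per slot; empty string means no match there. */
static const char *blocks[NBLOCKS] = {
        "", "", "alpha", "", "", "target", "", ""
};

/* Remembered position, like ip_dir_start_lookup: searches begin where the
 * last successful one left off, on the theory that lookups cluster. */
static int start_lookup;

static int find_block(const char *name)
{
        int block = start_lookup % NBLOCKS;
        int scanned;

        for (scanned = 0; scanned < NBLOCKS; scanned++) {
                if (strcmp(blocks[block], name) == 0) {
                        start_lookup = block;   /* remember for next time */
                        return block;
                }
                block = (block + 1) % NBLOCKS;  /* wrap past the end */
        }
        return -1;
}

int main(void)
{
        printf("'target' found in block %d\n", find_block("target"));
        printf("'alpha'  found in block %d\n", find_block("alpha"));
        printf("'nope'   found in block %d\n", find_block("nope"));
        return 0;
}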
+ */ + block = nblocks; + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + + trace_ocfs2_find_entry_el(ret); + return ret; +} + +static int ocfs2_dx_dir_lookup_rec(struct inode *inode, + struct ocfs2_extent_list *el, + u32 major_hash, + u32 *ret_cpos, + u64 *ret_phys_blkno, + unsigned int *ret_clen) +{ + int ret = 0, i, found; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "btree tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= major_hash) { + found = 1; + break; + } + } + + if (!found) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in btree", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + if (ret_phys_blkno) + *ret_phys_blkno = le64_to_cpu(rec->e_blkno); + if (ret_cpos) + *ret_cpos = le32_to_cpu(rec->e_cpos); + if (ret_clen) + *ret_clen = le16_to_cpu(rec->e_leaf_clusters); + +out: + brelse(eb_bh); + return ret; +} + +/* + * Returns the block index, from the start of the cluster which this + * hash belongs too. + */ +static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + u32 minor_hash) +{ + return minor_hash & osb->osb_dx_mask; +} + +static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, + struct ocfs2_dx_hinfo *hinfo) +{ + return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash); +} + +static int ocfs2_dx_dir_lookup(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_dx_hinfo *hinfo, + u32 *ret_cpos, + u64 *ret_phys_blkno) +{ + int ret = 0; + unsigned int cend, uninitialized_var(clen); + u32 uninitialized_var(cpos); + u64 uninitialized_var(blkno); + u32 name_hash = hinfo->major_hash; + + ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno, + &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + cend = cpos + clen; + if (name_hash >= cend) { + /* We want the last cluster */ + blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1); + cpos += clen - 1; + } else { + blkno += ocfs2_clusters_to_blocks(inode->i_sb, + name_hash - cpos); + cpos = name_hash; + } + + /* + * We now have the cluster which should hold our entry. To + * find the exact block from the start of the cluster to + * search, we take the lower bits of the hash. 
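ocfs2_dx_dir_lookup() above turns a name hash into a block address in two steps: the major hash picks a cluster out of the index tree's extent record (clamped to the record's last cluster), and the low bits of the minor hash pick a block within that cluster. Under assumed geometry of 32 blocks per cluster and invented record values, the arithmetic looks like:

#include <stdio.h>
#include <stdint.h>

/* Illustrative geometry; the kernel derives these from the superblock,
 * and osb_dx_mask is blocks-per-cluster minus one. */
#define BLOCKS_PER_CLUSTER 32u
#define DX_MASK (BLOCKS_PER_CLUSTER - 1)

/* One extent record of the dx tree: index clusters [cpos, cpos+clen)
 * live at physical block 'phys_blkno'. */
struct dx_rec {
        uint32_t cpos;
        uint32_t clen;
        uint64_t phys_blkno;
};

static uint64_t dx_leaf_blkno(const struct dx_rec *rec,
                              uint32_t major_hash, uint32_t minor_hash)
{
        uint64_t blkno = rec->phys_blkno;
        uint32_t cluster_off;

        /* Clamp to the record's last cluster when the hash runs past it. */
        if (major_hash >= rec->cpos + rec->clen)
                cluster_off = rec->clen - 1;
        else
                cluster_off = major_hash - rec->cpos;

        blkno += (uint64_t)cluster_off * BLOCKS_PER_CLUSTER;

        /* Low bits of the minor hash select the block inside the cluster. */
        return blkno + (minor_hash & DX_MASK);
}

int main(void)
{
        struct dx_rec rec = { .cpos = 16, .clen = 4, .phys_blkno = 8192 };

        printf("leaf block: %llu\n",
               (unsigned long long)dx_leaf_blkno(&rec, 17, 0x1234abcd));
        return 0;
}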
+ */ + blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo); + + if (ret_phys_blkno) + *ret_phys_blkno = blkno; + if (ret_cpos) + *ret_cpos = cpos; + +out: + + return ret; +} + +static int ocfs2_dx_dir_search(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dx_root_block *dx_root, + struct ocfs2_dir_lookup_result *res) +{ + int ret, i, found; + u64 uninitialized_var(phys); + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = NULL; + struct buffer_head *dir_ent_bh = NULL; + struct ocfs2_dir_entry *dir_ent = NULL; + struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo; + struct ocfs2_extent_list *dr_el; + struct ocfs2_dx_entry_list *entry_list; + + ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo); + + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + goto search; + } + + dr_el = &dx_root->dr_list; + + ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno, + namelen, name, hinfo->major_hash, + hinfo->minor_hash, (unsigned long long)phys); + + ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; + + trace_ocfs2_dx_dir_search_leaf_info( + le16_to_cpu(dx_leaf->dl_list.de_num_used), + le16_to_cpu(dx_leaf->dl_list.de_count)); + + entry_list = &dx_leaf->dl_list; + +search: + /* + * Empty leaf is legal, so no need to check for that. + */ + found = 0; + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash) + || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash)) + continue; + + /* + * Search unindexed leaf block now. We're not + * guaranteed to find anything. + */ + ret = ocfs2_read_dir_block_direct(dir, + le64_to_cpu(dx_entry->dx_dirent_blk), + &dir_ent_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * XXX: We should check the unindexed block here, + * before using it. + */ + + found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen, + 0, dir_ent_bh->b_data, + dir->i_sb->s_blocksize, &dir_ent); + if (found == 1) + break; + + if (found == -1) { + /* This means we found a bad directory entry. 
*/ + ret = -EIO; + mlog_errno(ret); + goto out; + } + + brelse(dir_ent_bh); + dir_ent_bh = NULL; + } + + if (found <= 0) { + ret = -ENOENT; + goto out; + } + + res->dl_leaf_bh = dir_ent_bh; + res->dl_entry = dir_ent; + res->dl_dx_leaf_bh = dx_leaf_bh; + res->dl_dx_entry = dx_entry; + + ret = 0; +out: + if (ret) { + brelse(dx_leaf_bh); + brelse(dir_ent_bh); + } + return ret; +} + +static int ocfs2_find_entry_dx(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup); + if (ret) { + if (ret != -ENOENT) + mlog_errno(ret); + goto out; + } + + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; +} + +/* + * Try to find an entry of the provided name within 'dir'. + * + * If nothing was found, -ENOENT is returned. Otherwise, zero is + * returned and the struct 'res' will contain information useful to + * other directory manipulation functions. + * + * Caller can NOT assume anything about the contents of the + * buffer_heads - they are passed back only so that it can be passed + * into any one of the manipulation functions (add entry, delete + * entry, etc). As an example, bh in the extent directory case is a + * data block, in the inline-data case it actually points to an inode, + * in the indexed directory case, multiple buffers are involved. + */ +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, struct ocfs2_dir_lookup_result *lookup) +{ + struct buffer_head *bh; + struct ocfs2_dir_entry *res_dir = NULL; + + if (ocfs2_dir_indexed(dir)) + return ocfs2_find_entry_dx(name, namelen, dir, lookup); + + /* + * The unindexed dir code only uses part of the lookup + * structure, so there's no reason to push it down further + * than this. + */ + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); + else + bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + + if (bh == NULL) + return -ENOENT; + + lookup->dl_leaf_bh = bh; + lookup->dl_entry = res_dir; + return 0; +} + +/* + * Update inode number and type of a previously found directory entry. + */ +int ocfs2_update_entry(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *res, + struct inode *new_entry_inode) +{ + int ret; + ocfs2_journal_access_func access = ocfs2_journal_access_db; + struct ocfs2_dir_entry *de = res->dl_entry; + struct buffer_head *de_bh = res->dl_leaf_bh; + + /* + * The same code works fine for both inline-data and extent + * based directories, so no need to split this up. The only + * difference is the journal_access function. 
+ */ + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + + ret = access(handle, INODE_CACHE(dir), de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno); + ocfs2_set_de_type(de, new_entry_inode->i_mode); + + ocfs2_journal_dirty(handle, de_bh); + +out: + return ret; +} + +/* + * __ocfs2_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh, char *first_de, + unsigned int bytes) +{ + struct ocfs2_dir_entry *de, *pde; + int i, status = -ENOENT; + ocfs2_journal_access_func access = ocfs2_journal_access_db; + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + + i = 0; + pde = NULL; + de = (struct ocfs2_dir_entry *) first_de; + while (i < bytes) { + if (!ocfs2_check_dir_entry(dir, de, bh, i)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (de == de_del) { + status = access(handle, INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (pde) + le16_add_cpu(&pde->rec_len, + le16_to_cpu(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + ocfs2_journal_dirty(handle, bh); + goto bail; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); + } +bail: + return status; +} + +static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de) +{ + unsigned int hole; + + if (le64_to_cpu(de->inode) == 0) + hole = le16_to_cpu(de->rec_len); + else + hole = le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len); + + return hole; +} + +static int ocfs2_find_max_rec_len(struct super_block *sb, + struct buffer_head *dirblock_bh) +{ + int size, this_hole, largest_hole = 0; + char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data; + struct ocfs2_dir_entry *de; + + trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb); + size = ocfs2_dir_trailer_blk_off(sb); + limit = start + size; + de_buf = start; + de = (struct ocfs2_dir_entry *)de_buf; + do { + if (de_buf != trailer) { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + } + + de_buf += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)de_buf; + } while (de_buf < limit); + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; +} + +static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list, + int index) +{ + int num_used = le16_to_cpu(entry_list->de_num_used); + + if (num_used == 1 || index == (num_used - 1)) + goto clear; + + memmove(&entry_list->de_entries[index], + &entry_list->de_entries[index + 1], + (num_used - index - 1)*sizeof(struct ocfs2_dx_entry)); +clear: + num_used--; + memset(&entry_list->de_entries[num_used], 0, + sizeof(struct ocfs2_dx_entry)); + entry_list->de_num_used = cpu_to_le16(num_used); +} + +static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, index, max_rec_len, add_to_free_list = 0; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + struct buffer_head *leaf_bh = lookup->dl_leaf_bh; + struct ocfs2_dx_leaf *dx_leaf; + struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry; + struct ocfs2_dir_block_trailer *trailer; + 
struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * This function gets a bit messy because we might have to + * modify the root block, regardless of whether the indexed + * entries are stored inline. + */ + + /* + * *Only* set 'entry_list' here, based on where we're looking + * for the indexed entries. Later, we might still want to + * journal both blocks, based on free list state. + */ + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + entry_list = &dx_root->dr_entries; + } else { + dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data; + entry_list = &dx_leaf->dl_list; + } + + /* Neither of these are a disk corruption - that should have + * been caught by lookup, before we got here. */ + BUG_ON(le16_to_cpu(entry_list->de_count) <= 0); + BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0); + + index = (char *)dx_entry - (char *)entry_list->de_entries; + index /= sizeof(*dx_entry); + + if (index >= le16_to_cpu(entry_list->de_num_used)) { + mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n", + (unsigned long long)OCFS2_I(dir)->ip_blkno, index, + entry_list, dx_entry); + return -EIO; + } + + /* + * We know that removal of this dirent will leave enough room + * for a new one, so add this block to the free list if it + * isn't already there. + */ + trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (trailer->db_free_rec_len == 0) + add_to_free_list = 1; + + /* + * Add the block holding our index into the journal before + * removing the unindexed entry. If we get an error return + * from __ocfs2_delete_entry(), then it hasn't removed the + * entry yet. Likewise, successful return means we *must* + * remove the indexed entry. + * + * We're also careful to journal the root tree block here as + * the entry count needs to be updated. Also, we might be + * adding to the start of the free list. 
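ocfs2_delete_entry_dx() above keeps two pieces of bookkeeping in sync after unlinking a name: it recomputes the largest hole in the leaf block (stored in the trailer's db_free_rec_len) and, if the block previously advertised no free space at all, links it back onto the dx root's free list. A toy model of that recomputation (slot sizes invented; the kernel derives the "used" size from each entry's name length):

#include <stdio.h>
#include <stdint.h>

/* Minimal dirent view: rec_len is the on-disk slot size, used is how much
 * of it the current name needs (0 for an unused slot). */
struct slot {
        uint16_t rec_len;
        uint16_t used;
};

/* Largest reusable hole in one block, as in ocfs2_find_max_rec_len(). */
static uint16_t max_hole(const struct slot *s, int n)
{
        uint16_t best = 0;

        for (int i = 0; i < n; i++) {
                uint16_t hole = s[i].rec_len - s[i].used;

                if (hole > best)
                        best = hole;
        }
        return best;
}

int main(void)
{
        struct slot block[] = { { 16, 16 }, { 40, 40 }, { 64, 64 } };
        uint16_t before, after;

        before = max_hole(block, 3);

        /* "Delete" the middle name: its space becomes a hole in the slot. */
        block[1].used = 0;
        after = max_hole(block, 3);

        printf("free_rec_len: %u -> %u\n", before, after);
        if (before == 0 && after > 0)
                printf("block was full, link it onto the dx free list\n");
        return 0;
}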
+ */ + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + lookup->dl_dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno, + index); + + ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, + leaf_bh, leaf_bh->b_data, leaf_bh->b_size); + if (ret) { + mlog_errno(ret); + goto out; + } + + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + if (add_to_free_list) { + trailer->db_free_next = dx_root->dr_free_blk; + dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr); + ocfs2_journal_dirty(handle, dx_root_bh); + } + + /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */ + ocfs2_journal_dirty(handle, leaf_bh); + + le32_add_cpu(&dx_root->dr_num_entries, -1); + ocfs2_journal_dirty(handle, dx_root_bh); + + ocfs2_dx_list_remove_entry(entry_list, index); + + if (!ocfs2_dx_root_inline(dx_root)) + ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh); + +out: + return ret; +} + +static inline int ocfs2_delete_entry_id(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_inline_data *data; + + ret = ocfs2_read_inode_block(dir, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + data = &di->id2.i_data; + + ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data, + i_size_read(dir)); + + brelse(di_bh); +out: + return ret; +} + +static inline int ocfs2_delete_entry_el(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh) +{ + return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data, + bh->b_size); +} + +/* + * Delete a directory entry. Hide the details of directory + * implementation from the caller. + */ +int ocfs2_delete_entry(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_lookup_result *res) +{ + if (ocfs2_dir_indexed(dir)) + return ocfs2_delete_entry_dx(handle, dir, res); + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return ocfs2_delete_entry_id(handle, dir, res->dl_entry, + res->dl_leaf_bh); + + return ocfs2_delete_entry_el(handle, dir, res->dl_entry, + res->dl_leaf_bh); +} + +/* + * Check whether 'de' has enough room to hold an entry of + * 'new_rec_len' bytes. + */ +static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de, + unsigned int new_rec_len) +{ + unsigned int de_really_used; + + /* Check whether this is an empty record with enough space */ + if (le64_to_cpu(de->inode) == 0 && + le16_to_cpu(de->rec_len) >= new_rec_len) + return 1; + + /* + * Record might have free space at the end which we can + * use. 
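ocfs2_dirent_would_fit() above accepts either an unused slot that is simply big enough, or a live slot whose rec_len has enough slack left over after its own entry. A standalone sketch of that test, with OCFS2_DIR_REC_LEN approximated as a 12-byte fixed header plus the name, rounded up to 4 bytes (an assumption based on the ext3-style dirent layout):

#include <stdio.h>
#include <stdint.h>

/* Space a name of 'name_len' bytes needs on disk. */
static uint16_t rec_len_needed(uint8_t name_len)
{
        return (uint16_t)((12 + name_len + 3) & ~3);
}

/* Can a new name needing 'new_len' bytes be carved out of this slot? */
static int would_fit(uint64_t slot_inode, uint16_t slot_rec_len,
                     uint8_t slot_name_len, uint16_t new_len)
{
        /* An unused slot only has to be big enough. */
        if (slot_inode == 0 && slot_rec_len >= new_len)
                return 1;

        /* A live slot must keep its own entry and still have room left. */
        return slot_rec_len >= rec_len_needed(slot_name_len) + new_len;
}

int main(void)
{
        uint16_t need = rec_len_needed(9);      /* e.g. a 9-character name */

        printf("need %u bytes\n", need);
        printf("fits in empty 32-byte slot:      %d\n", would_fit(0, 32, 0, need));
        printf("fits behind 'a' in 64-byte slot: %d\n", would_fit(7, 64, 1, need));
        printf("fits behind 'a' in 32-byte slot: %d\n", would_fit(7, 32, 1, need));
        return 0;
}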
+ */ + de_really_used = OCFS2_DIR_REC_LEN(de->name_len); + if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len)) + return 1; + + return 0; +} + +static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf, + struct ocfs2_dx_entry *dx_new_entry) +{ + int i; + + i = le16_to_cpu(dx_leaf->dl_list.de_num_used); + dx_leaf->dl_list.de_entries[i] = *dx_new_entry; + + le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1); +} + +static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk) +{ + int i; + struct ocfs2_dx_entry *dx_entry; + + i = le16_to_cpu(entry_list->de_num_used); + dx_entry = &entry_list->de_entries[i]; + + memset(dx_entry, 0, sizeof(*dx_entry)); + dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash); + dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash); + dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk); + + le16_add_cpu(&entry_list->de_num_used, 1); +} + +static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct buffer_head *dx_leaf_bh) +{ + int ret; + struct ocfs2_dx_leaf *dx_leaf; + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk); + ocfs2_journal_dirty(handle, dx_leaf_bh); + +out: + return ret; +} + +static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dx_hinfo *hinfo, + u64 dirent_blk, + struct ocfs2_dx_root_block *dx_root) +{ + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk); +} + +static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data; + if (ocfs2_dx_root_inline(dx_root)) { + ocfs2_dx_inline_root_insert(dir, handle, + &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + dx_root); + } else { + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo, + lookup->dl_leaf_bh->b_blocknr, + lookup->dl_dx_leaf_bh); + if (ret) + goto out; + } + + le32_add_cpu(&dx_root->dr_num_entries, 1); + ocfs2_journal_dirty(handle, dx_root_bh); + +out: + return ret; +} + +static void ocfs2_remove_block_from_free_list(struct inode *dir, + handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + struct ocfs2_dir_block_trailer *trailer, *prev; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *bh; + + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + dx_root = (struct ocfs2_dx_root_block *)bh->b_data; + dx_root->dr_free_blk = trailer->db_free_next; + } else { + bh = lookup->dl_prev_leaf_bh; + prev = ocfs2_trailer_from_bh(bh, dir->i_sb); + prev->db_free_next = trailer->db_free_next; + } + + trailer->db_free_rec_len = cpu_to_le16(0); + trailer->db_free_next = cpu_to_le64(0); + + ocfs2_journal_dirty(handle, bh); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); +} + +/* + * This expects that a journal write has been reserved on + * lookup->dl_prev_leaf_bh 
or lookup->dl_dx_root_bh + */ +static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *lookup) +{ + int max_rec_len; + struct ocfs2_dir_block_trailer *trailer; + + /* Walk dl_leaf_bh to figure out what the new free rec_len is. */ + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh); + if (max_rec_len) { + /* + * There's still room in this block, so no need to remove it + * from the free list. In this case, we just want to update + * the rec len accounting. + */ + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); + } else { + ocfs2_remove_block_from_free_list(dir, handle, lookup); + } +} + +/* we don't always have a dentry for what we want to add, so people + * like orphan dir can call this instead. + * + * The lookup context must have been filled from + * ocfs2_prepare_dir_for_insert. + */ +int __ocfs2_add_entry(handle_t *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct ocfs2_dir_lookup_result *lookup) +{ + unsigned long offset; + unsigned short rec_len; + struct ocfs2_dir_entry *de, *de1; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; + struct super_block *sb = dir->i_sb; + int retval, status; + unsigned int size = sb->s_blocksize; + struct buffer_head *insert_bh = lookup->dl_leaf_bh; + char *data_start = insert_bh->b_data; + + if (!namelen) + return -EINVAL; + + if (ocfs2_dir_indexed(dir)) { + struct buffer_head *bh; + + /* + * An indexed dir may require that we update the free space + * list. Reserve a write to the previous node in the list so + * that we don't fail later. + * + * XXX: This can be either a dx_root_block, or an unindexed + * directory tree leaf block. + */ + if (ocfs2_free_list_at_root(lookup)) { + bh = lookup->dl_dx_root_bh; + retval = ocfs2_journal_access_dr(handle, + INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } else { + bh = lookup->dl_prev_leaf_bh; + retval = ocfs2_journal_access_db(handle, + INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + } + if (retval) { + mlog_errno(retval); + return retval; + } + } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + data_start = di->id2.i_data.id_data; + size = i_size_read(dir); + + BUG_ON(insert_bh != parent_fe_bh); + } + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) data_start; + while (1) { + BUG_ON((char *)de >= (size + data_start)); + + /* These checks should've already been passed by the + * prepare function, but I guess we can leave them + * here anyway. */ + if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { + retval = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + retval = -EEXIST; + goto bail; + } + + /* We're guaranteed that we should have space, so we + * can't possibly have hit the trailer...right? */ + mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size), + "Hit dir trailer trying to insert %.*s " + "(namelen %d) into directory %llu. 
" + "offset is %lu, trailer offset is %d\n", + namelen, name, namelen, + (unsigned long long)parent_fe_bh->b_blocknr, + offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); + + if (ocfs2_dirent_would_fit(de, rec_len)) { + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + + if (insert_bh == parent_fe_bh) + status = ocfs2_journal_access_di(handle, + INODE_CACHE(dir), + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + else { + status = ocfs2_journal_access_db(handle, + INODE_CACHE(dir), + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_insert(dir, + handle, + lookup); + if (status) { + mlog_errno(status); + goto bail; + } + } + } + + /* By now the buffer is marked for journaling */ + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + de1 = (struct ocfs2_dir_entry *)((char *) de + + OCFS2_DIR_REC_LEN(de->name_len)); + de1->rec_len = + cpu_to_le16(le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len)); + de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + de = de1; + } + de->file_type = OCFS2_FT_UNKNOWN; + if (blkno) { + de->inode = cpu_to_le64(blkno); + ocfs2_set_de_type(de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + + if (ocfs2_dir_indexed(dir)) + ocfs2_recalc_free_list(dir, handle, lookup); + + dir->i_version++; + ocfs2_journal_dirty(handle, insert_bh); + retval = 0; + goto bail; + } + + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); + } + + /* when you think about it, the assert above should prevent us + * from ever getting here. */ + retval = -ENOSPC; +bail: + if (retval) + mlog_errno(retval); + + return retval; +} + +static int ocfs2_dir_foreach_blk_id(struct inode *inode, + u64 *f_version, + loff_t *f_pos, void *priv, + filldir_t filldir, int *filldir_err) +{ + int ret, i, filldir_ret; + unsigned long offset = *f_pos; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_inline_data *data; + struct ocfs2_dir_entry *de; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto out; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + data = &di->id2.i_data; + + while (*f_pos < i_size_read(inode)) { +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (*f_version != inode->i_version) { + for (i = 0; i < i_size_read(inode) && i < offset; ) { + de = (struct ocfs2_dir_entry *) + (data->id_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + OCFS2_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + *f_pos = offset = i; + *f_version = inode->i_version; + } + + de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos); + if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) { + /* On error, skip the f_pos to the end. 
*/ + *f_pos = i_size_read(inode); + goto out; + } + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = *f_version; + unsigned char d_type = DT_UNKNOWN; + + if (de->file_type < OCFS2_FT_MAX) + d_type = ocfs2_filetype_table[de->file_type]; + + filldir_ret = filldir(priv, de->name, + de->name_len, + *f_pos, + le64_to_cpu(de->inode), + d_type); + if (filldir_ret) { + if (filldir_err) + *filldir_err = filldir_ret; + break; + } + if (version != *f_version) + goto revalidate; + } + *f_pos += le16_to_cpu(de->rec_len); + } + +out: + brelse(di_bh); + + return 0; +} + +/* + * NOTE: This function can be called against unindexed directories, + * and indexed ones. + */ +static int ocfs2_dir_foreach_blk_el(struct inode *inode, + u64 *f_version, + loff_t *f_pos, void *priv, + filldir_t filldir, int *filldir_err) +{ + int error = 0; + unsigned long offset, blk, last_ra_blk = 0; + int i, stored; + struct buffer_head * bh, * tmp; + struct ocfs2_dir_entry * de; + struct super_block * sb = inode->i_sb; + unsigned int ra_sectors = 16; + + stored = 0; + bh = NULL; + + offset = (*f_pos) & (sb->s_blocksize - 1); + + while (!error && !stored && *f_pos < i_size_read(inode)) { + blk = (*f_pos) >> sb->s_blocksize_bits; + if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { + /* Skip the corrupt dirblock and keep trying */ + *f_pos += sb->s_blocksize - offset; + continue; + } + + /* The idea here is to begin with 8k read-ahead and to stay + * 4k ahead of our current position. + * + * TODO: Use the pagecache for this. We just need to + * make sure it's cluster-safe... */ + if (!last_ra_blk + || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { + for (i = ra_sectors >> (sb->s_blocksize_bits - 9); + i > 0; i--) { + tmp = NULL; + if (!ocfs2_read_dir_block(inode, ++blk, &tmp, + OCFS2_BH_READAHEAD)) + brelse(tmp); + } + last_ra_blk = blk; + ra_sectors = 8; + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (*f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ocfs2_dir_entry *) (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + OCFS2_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1)) + | offset; + *f_version = inode->i_version; + } + + while (!error && *f_pos < i_size_read(inode) + && offset < sb->s_blocksize) { + de = (struct ocfs2_dir_entry *) (bh->b_data + offset); + if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1; + brelse(bh); + goto out; + } + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. 
+ */ + unsigned long version = *f_version; + unsigned char d_type = DT_UNKNOWN; + + if (de->file_type < OCFS2_FT_MAX) + d_type = ocfs2_filetype_table[de->file_type]; + error = filldir(priv, de->name, + de->name_len, + *f_pos, + le64_to_cpu(de->inode), + d_type); + if (error) { + if (filldir_err) + *filldir_err = error; + break; + } + if (version != *f_version) + goto revalidate; + stored ++; + } + *f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse(bh); + bh = NULL; + } + + stored = 0; +out: + return stored; +} + +static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, + loff_t *f_pos, void *priv, filldir_t filldir, + int *filldir_err) +{ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv, + filldir, filldir_err); + + return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir, + filldir_err); +} + +/* + * This is intended to be called from inside other kernel functions, + * so we fake some arguments. + */ +int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, + filldir_t filldir) +{ + int ret = 0, filldir_err = 0; + u64 version = inode->i_version; + + while (*f_pos < i_size_read(inode)) { + ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv, + filldir, &filldir_err); + if (ret || filldir_err) + break; + } + + if (ret > 0) + ret = -EIO; + + return 0; +} + +/* + * ocfs2_readdir() + * + */ +int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + int error = 0; + struct inode *inode = filp->f_path.dentry->d_inode; + int lock_level = 0; + + trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); + + error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); + if (lock_level && error >= 0) { + /* We release EX lock which used to update atime + * and get PR lock again to reduce contention + * on commonly accessed directories. */ + ocfs2_inode_unlock(inode, 1); + lock_level = 0; + error = ocfs2_inode_lock(inode, NULL, 0); + } + if (error < 0) { + if (error != -ENOENT) + mlog_errno(error); + /* we haven't got any yet, so propagate the error. */ + goto bail_nolock; + } + + error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, + dirent, filldir, NULL); + + ocfs2_inode_unlock(inode, lock_level); + if (error) + mlog_errno(error); + +bail_nolock: + + return error; +} + +/* + * NOTE: this should always be called with parent dir i_mutex taken. + */ +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct ocfs2_dir_lookup_result *lookup) +{ + int status = -ENOENT; + + trace_ocfs2_find_files_on_disk(namelen, name, blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_find_entry(name, namelen, inode, lookup); + if (status) + goto leave; + + *blkno = le64_to_cpu(lookup->dl_entry->inode); + + status = 0; +leave: + + return status; +} + +/* + * Convenience function for callers which just want the block number + * mapped to a name and don't require the full dirent info, etc. + */ +int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, + int namelen, u64 *blkno) +{ + int ret; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup); + ocfs2_free_dir_lookup_result(&lookup); + + return ret; +} + +/* Check for a name within a directory. 
+ * + * Return 0 if the name does not exist + * Return -EEXIST if the directory contains the name + * + * Callers should have i_mutex + a cluster lock on dir + */ +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen) +{ + int ret; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + trace_ocfs2_check_dir_for_entry( + (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); + + ret = -EEXIST; + if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) + goto bail; + + ret = 0; +bail: + ocfs2_free_dir_lookup_result(&lookup); + + if (ret) + mlog_errno(ret); + return ret; +} + +struct ocfs2_empty_dir_priv { + unsigned seen_dot; + unsigned seen_dot_dot; + unsigned seen_other; + unsigned dx_dir; +}; +static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, + loff_t pos, u64 ino, unsigned type) +{ + struct ocfs2_empty_dir_priv *p = priv; + + /* + * Check the positions of "." and ".." records to be sure + * they're in the correct place. + * + * Indexed directories don't need to proceed past the first + * two entries, so we end the scan after seeing '..'. Despite + * that, we allow the scan to proceed In the event that we + * have a corrupted indexed directory (no dot or dot dot + * entries). This allows us to double check for existing + * entries which might not have been found in the index. + */ + if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { + p->seen_dot = 1; + return 0; + } + + if (name_len == 2 && !strncmp("..", name, 2) && + pos == OCFS2_DIR_REC_LEN(1)) { + p->seen_dot_dot = 1; + + if (p->dx_dir && p->seen_dot) + return 1; + + return 0; + } + + p->seen_other = 1; + return 1; +} + +static int ocfs2_empty_dir_dx(struct inode *inode, + struct ocfs2_empty_dir_priv *priv) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_dx_root_block *dx_root; + + priv->dx_dir = 1; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_read_dx_root(inode, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (le32_to_cpu(dx_root->dr_num_entries) != 2) + priv->seen_other = 1; + +out: + brelse(di_bh); + brelse(dx_root_bh); + return ret; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + * + * Returns 1 if dir is empty, zero otherwise. + * + * XXX: This is a performance problem for unindexed directories. + */ +int ocfs2_empty_dir(struct inode *inode) +{ + int ret; + loff_t start = 0; + struct ocfs2_empty_dir_priv priv; + + memset(&priv, 0, sizeof(priv)); + + if (ocfs2_dir_indexed(inode)) { + ret = ocfs2_empty_dir_dx(inode, &priv); + if (ret) + mlog_errno(ret); + /* + * We still run ocfs2_dir_foreach to get the checks + * for "." and "..". + */ + } + + ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); + if (ret) + mlog_errno(ret); + + if (!priv.seen_dot || !priv.seen_dot_dot) { + mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + /* + * XXX: Is it really safe to allow an unlink to continue? + */ + return 1; + } + + return !priv.seen_other; +} + +/* + * Fills "." and ".." dirents in a new directory block. Returns dirent for + * "..", which might be used during creation of a directory with a trailing + * header. It is otherwise safe to ignore the return code. 
+ */ +static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode, + struct inode *parent, + char *start, + unsigned int size) +{ + struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; + + de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + de->name_len = 1; + de->rec_len = + cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + strcpy(de->name, "."); + ocfs2_set_de_type(de, S_IFDIR); + + de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); + de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy(de->name, ".."); + ocfs2_set_de_type(de, S_IFDIR); + + return de; +} + +/* + * This works together with code in ocfs2_mknod_locked() which sets + * the inline-data flag and initializes the inline-data section. + */ +static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inline_data *data = &di->id2.i_data; + unsigned int size = le16_to_cpu(data->id_count); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); + ocfs2_journal_dirty(handle, di_bh); + + i_size_write(inode, size); + inode->i_nlink = 2; + inode->i_blocks = ocfs2_inode_sector_count(inode); + + ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac, + struct buffer_head **ret_new_bh) +{ + int status; + unsigned int size = osb->sb->s_blocksize; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry *de; + + if (ocfs2_new_dir_wants_trailer(inode)) + size = ocfs2_dir_trailer_blk_off(parent->i_sb); + + status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, + data_ac, NULL, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, osb->sb->s_blocksize); + + de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); + if (ocfs2_new_dir_wants_trailer(inode)) { + int size = le16_to_cpu(de->rec_len); + + /* + * Figure out the size of the hole left over after + * insertion of '.' and '..'. The trailer wants this + * information. 
+ */ + size -= OCFS2_DIR_REC_LEN(2); + size -= sizeof(struct ocfs2_dir_block_trailer); + + ocfs2_init_dir_trailer(inode, new_bh, size); + } + + ocfs2_journal_dirty(handle, new_bh); + + i_size_write(inode, inode->i_sb->s_blocksize); + inode->i_nlink = 2; + inode->i_blocks = ocfs2_inode_sector_count(inode); + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; + if (ret_new_bh) { + *ret_new_bh = new_bh; + new_bh = NULL; + } +bail: + brelse(new_bh); + + return status; +} + +static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dirdata_bh, + struct ocfs2_alloc_context *meta_ac, + int dx_inline, u32 num_entries, + struct buffer_head **ret_dx_root_bh) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + u16 dr_suballoc_bit; + u64 suballoc_loc, dr_blkno; + unsigned int num_bits; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &dr_suballoc_bit, &num_bits, &dr_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_dx_dir_attach_index( + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)dr_blkno); + + dx_root_bh = sb_getblk(osb->sb, dr_blkno); + if (dx_root_bh == NULL) { + ret = -EIO; + goto out; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); + + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + memset(dx_root, 0, osb->sb->s_blocksize); + strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); + dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc); + dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); + dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); + dx_root->dr_blkno = cpu_to_le64(dr_blkno); + dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno); + dx_root->dr_num_entries = cpu_to_le32(num_entries); + if (le16_to_cpu(trailer->db_free_rec_len)) + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); + else + dx_root->dr_free_blk = cpu_to_le64(0); + + if (dx_inline) { + dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE; + dx_root->dr_entries.de_count = + cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb)); + } else { + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + } + ocfs2_journal_dirty(handle, dx_root_bh); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + di->i_dx_root = cpu_to_le64(dr_blkno); + + spin_lock(&OCFS2_I(dir)->ip_lock); + OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); + + ocfs2_journal_dirty(handle, di_bh); + + *ret_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + +out: + brelse(dx_root_bh); + return ret; +} + +static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, + handle_t *handle, struct inode *dir, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 start_blk) +{ + int ret, i; + struct ocfs2_dx_leaf *dx_leaf; + struct 
buffer_head *bh; + + for (i = 0; i < num_dx_leaves; i++) { + bh = sb_getblk(osb->sb, start_blk + i); + if (bh == NULL) { + ret = -EIO; + goto out; + } + dx_leaves[i] = bh; + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh); + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data; + + memset(dx_leaf, 0, osb->sb->s_blocksize); + strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE); + dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation); + dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr); + dx_leaf->dl_list.de_count = + cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); + + trace_ocfs2_dx_dir_format_cluster( + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(dx_leaf->dl_list.de_count)); + + ocfs2_journal_dirty(handle, bh); + } + + ret = 0; +out: + return ret; +} + +/* + * Allocates and formats a new cluster for use in an indexed dir + * leaf. This version will not do the extent insert, so that it can be + * used by operations which need careful ordering. + */ +static int __ocfs2_dx_dir_new_cluster(struct inode *dir, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves, u64 *ret_phys_blkno) +{ + int ret; + u32 phys, num; + u64 phys_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + /* + * XXX: For create, this should claim cluster for the index + * *before* the unindexed insert so that we have a better + * chance of contiguousness as the directory grows in number + * of entries. + */ + ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Format the new cluster first. That way, we're inserting + * valid data. 
+ */ + phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys); + ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves, + num_dx_leaves, phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_phys_blkno = phys_blkno; +out: + return ret; +} + +static int ocfs2_dx_dir_new_cluster(struct inode *dir, + struct ocfs2_extent_tree *et, + u32 cpos, handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **dx_leaves, + int num_dx_leaves) +{ + int ret; + u64 phys_blkno; + + ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, + num_dx_leaves, &phys_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0, + meta_ac); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb, + int *ret_num_leaves) +{ + int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1); + struct buffer_head **dx_leaves; + + dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *), + GFP_NOFS); + if (dx_leaves && ret_num_leaves) + *ret_num_leaves = num_dx_leaves; + + return dx_leaves; +} + +static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *leaf_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_hinfo hinfo; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + /* + * Our strategy is to create the directory as though it were + * unindexed, then add the index block. This works with very + * little complication since the state of a new directory is a + * very well known quantity. + * + * Essentially, we have two dirents ("." and ".."), in the 1st + * block which need indexing. These are easily inserted into + * the index block. 
+ */ + + ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh, + data_ac, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh, + meta_ac, 1, 2, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */ + ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + + ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo); + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); + +out: + brelse(dx_root_bh); + brelse(leaf_bh); + return ret; +} + +int ocfs2_fill_new_dir(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac) + +{ + BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); + + if (ocfs2_supports_indexed_dirs(osb)) + return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh, + data_ac, meta_ac); + + return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, + data_ac, NULL); +} + +static int ocfs2_dx_dir_index_block(struct inode *dir, + handle_t *handle, + struct buffer_head **dx_leaves, + int num_dx_leaves, + u32 *num_dx_entries, + struct buffer_head *dirent_bh) +{ + int ret = 0, namelen, i; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct buffer_head *dx_leaf_bh; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + namelen = de->name_len; + if (!namelen || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo); + + i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo); + dx_leaf_bh = dx_leaves[i]; + + ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo, + dirent_blk, dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *num_dx_entries = *num_dx_entries + 1; + +inc: + de_buf += le16_to_cpu(de->rec_len); + } + +out: + return ret; +} + +/* + * XXX: This expects dx_root_bh to already be part of the transaction. 
+ */ +static void ocfs2_dx_dir_index_root_block(struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dirent_bh) +{ + char *de_buf, *limit; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dir_entry *de; + struct ocfs2_dx_hinfo hinfo; + u64 dirent_blk = dirent_bh->b_blocknr; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + de_buf = dirent_bh->b_data; + limit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (!de->name_len || !de->inode) + goto inc; + + ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); + + trace_ocfs2_dx_dir_index_root_block( + (unsigned long long)dir->i_ino, + hinfo.major_hash, hinfo.minor_hash, + de->name_len, de->name, + le16_to_cpu(dx_root->dr_entries.de_num_used)); + + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, + dirent_blk); + + le32_add_cpu(&dx_root->dr_num_entries, 1); +inc: + de_buf += le16_to_cpu(de->rec_len); + } +} + +/* + * Count the number of inline directory entries in di_bh and compare + * them against the number of entries we can hold in an inline dx root + * block. + */ +static int ocfs2_new_dx_should_be_inline(struct inode *dir, + struct buffer_head *di_bh) +{ + int dirent_count = 0; + char *de_buf, *limit; + struct ocfs2_dir_entry *de; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + de_buf = di->id2.i_data.id_data; + limit = de_buf + i_size_read(dir); + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (de->name_len && de->inode) + dirent_count++; + + de_buf += le16_to_cpu(de->rec_len); + } + + /* We are careful to leave room for one extra record. */ + return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb); +} + +/* + * Expand rec_len of the rightmost dirent in a directory block so that it + * contains the end of our valid space for dirents. We do this during + * expansion from an inline directory to one with extents. The first dir block + * in that case is taken from the inline data portion of the inode block. + * + * This will also return the largest amount of contiguous space for a dirent + * in the block. That value is *not* necessarily the last dirent, even after + * expansion. The directory indexing code wants this value for free space + * accounting. We do this here since we're already walking the entire dir + * block. + * + * We add the dir trailer if this filesystem wants it. + */ +static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size, + struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + struct ocfs2_dir_entry *de; + struct ocfs2_dir_entry *prev_de; + char *de_buf, *limit; + unsigned int new_size = sb->s_blocksize; + unsigned int bytes, this_hole; + unsigned int largest_hole = 0; + + if (ocfs2_new_dir_wants_trailer(dir)) + new_size = ocfs2_dir_trailer_blk_off(sb); + + bytes = new_size - old_size; + + limit = start + old_size; + de_buf = start; + de = (struct ocfs2_dir_entry *)de_buf; + do { + this_hole = ocfs2_figure_dirent_hole(de); + if (this_hole > largest_hole) + largest_hole = this_hole; + + prev_de = de; + de_buf += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)de_buf; + } while (de_buf < limit); + + le16_add_cpu(&prev_de->rec_len, bytes); + + /* We need to double check this after modification of the final + * dirent. 
*/ + this_hole = ocfs2_figure_dirent_hole(prev_de); + if (this_hole > largest_hole) + largest_hole = this_hole; + + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) + return largest_hole; + return 0; +} + +/* + * We allocate enough clusters to fulfill "blocks_wanted", but set + * i_size to exactly one block. Ocfs2_extend_dir() will handle the + * rest automatically for us. + * + * *first_block_bh is a pointer to the 1st data block allocated to the + * directory. + */ +static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, + unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, + struct buffer_head **first_block_bh) +{ + u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0; + struct super_block *sb = dir->i_sb; + int ret, i, num_dx_leaves = 0, dx_inline = 0, + credits = ocfs2_inline_to_extents_credits(sb); + u64 dx_insert_blkno, blkno, + bytes = blocks_wanted << sb->s_blocksize_bits; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(dir); + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct buffer_head *dirdata_bh = NULL; + struct buffer_head *dx_root_bh = NULL; + struct buffer_head **dx_leaves = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + handle_t *handle; + struct ocfs2_extent_tree et; + struct ocfs2_extent_tree dx_et; + int did_quota = 0, bytes_allocated = 0; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh); + + alloc = ocfs2_clusters_for_bytes(sb, bytes); + dx_alloc = 0; + + down_write(&oi->ip_alloc_sem); + + if (ocfs2_supports_indexed_dirs(osb)) { + credits += ocfs2_add_dir_index_credits(sb); + + dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh); + if (!dx_inline) { + /* Add one more cluster for an index leaf */ + dx_alloc++; + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, + &num_dx_leaves); + if (!dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + + /* This gets us the dx_root */ + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * We should never need more than 2 clusters for the unindexed + * tree - maximum dirent size is far less than one block. In + * fact, the only time we'd need more than one cluster is if + * blocksize == clustersize and the dirent won't fit in the + * extra space that the expansion to a single block gives. As + * of today, that only happens on 4k/4k file systems. + */ + BUG_ON(alloc > 2); + + ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Prepare for worst case allocation scenario of two separate + * extents in the unindexed tree. + */ + if (alloc == 2) + credits += OCFS2_SUBALLOC_ALLOC; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); + if (ret) + goto out_commit; + did_quota = 1; + + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Allocate our index cluster first, to maximize the + * possibility that unindexed leaves grow + * contiguously. 
+ */ + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, + dx_leaves, num_dx_leaves, + &dx_insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + } + + /* + * Try to claim as many clusters as the bitmap can give though + * if we only get one now, that's enough to continue. The rest + * will be claimed after the conversion to extents. + */ + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &oi->ip_la_data_resv; + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + + /* + * Operations are carefully ordered so that we set up the new + * data block first. The conversion from inline data to + * extents follows. + */ + blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); + dirdata_bh = sb_getblk(sb, blkno); + if (!dirdata_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh); + + ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); + memset(dirdata_bh->b_data + i_size_read(dir), 0, + sb->s_blocksize - i_size_read(dir)); + i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir); + if (ocfs2_new_dir_wants_trailer(dir)) { + /* + * Prepare the dir trailer up front. It will otherwise look + * like a valid dirent. Even if inserting the index fails + * (unlikely), then all we'll have done is given first dir + * block a small amount of fragmentation. + */ + ocfs2_init_dir_trailer(dir, dirdata_bh, i); + } + + ocfs2_journal_dirty(handle, dirdata_bh); + + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { + /* + * Dx dirs with an external cluster need to do this up + * front. Inline dx root's get handled later, after + * we've allocated our root block. We get passed back + * a total number of items so that dr_num_entries can + * be correctly set once the dx_root has been + * allocated. + */ + ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves, + num_dx_leaves, &num_dx_entries, + dirdata_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Set extent, i_size, etc on the directory. After this, the + * inode should contain the same exact dirents as before and + * be fully accessible from system calls. + * + * We let the later dirent insert modify c/mtime - to the user + * the data hasn't changed. + */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_dinode_new_extent_list(dir, di); + + i_size_write(dir, sb->s_blocksize); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + + di->i_size = cpu_to_le64(sb->s_blocksize); + di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + + /* + * This should never fail as our extent list is empty and all + * related blocks have been journaled already. 
+ */ + ret = ocfs2_insert_extent(handle, &et, 0, blkno, len, + 0, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Set i_blocks after the extent insert for the most up to + * date ip_clusters value. + */ + dir->i_blocks = ocfs2_inode_sector_count(dir); + + ocfs2_journal_dirty(handle, di_bh); + + if (ocfs2_supports_indexed_dirs(osb)) { + ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, + dirdata_bh, meta_ac, dx_inline, + num_dx_entries, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (dx_inline) { + ocfs2_dx_dir_index_root_block(dir, dx_root_bh, + dirdata_bh); + } else { + ocfs2_init_dx_root_extent_tree(&dx_et, + INODE_CACHE(dir), + dx_root_bh); + ret = ocfs2_insert_extent(handle, &dx_et, 0, + dx_insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + } + } + + /* + * We asked for two clusters, but only got one in the 1st + * pass. Claim the 2nd cluster as a separate extent. + */ + if (alloc > len) { + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, + &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); + + ret = ocfs2_insert_extent(handle, &et, 1, + blkno, len, 0, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); + } + + *first_block_bh = dirdata_bh; + dirdata_bh = NULL; + if (ocfs2_supports_indexed_dirs(osb)) { + unsigned int off; + + if (!dx_inline) { + /* + * We need to return the correct block within the + * cluster which should hold our entry. + */ + off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), + &lookup->dl_hinfo); + get_bh(dx_leaves[off]); + lookup->dl_dx_leaf_bh = dx_leaves[off]; + } + lookup->dl_dx_root_bh = dx_root_bh; + dx_root_bh = NULL; + } + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(dir, bytes_allocated); + + ocfs2_commit_trans(osb, handle); + +out: + up_write(&oi->ip_alloc_sem); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } + + brelse(dirdata_bh); + brelse(dx_root_bh); + + return ret; +} + +/* returns a bh of the 1st new block in the allocation. 
*/ +static int ocfs2_do_extend_dir(struct super_block *sb, + handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh) +{ + int status; + int extend, did_quota = 0; + u64 p_blkno, v_blkno; + + spin_lock(&OCFS2_I(dir)->ip_lock); + extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); + spin_unlock(&OCFS2_I(dir)->ip_lock); + + if (extend) { + u32 offset = OCFS2_I(dir)->ip_clusters; + + status = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1)); + if (status) + goto bail; + did_quota = 1; + + status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, + 1, 0, parent_fe_bh, handle, + data_ac, meta_ac, NULL); + BUG_ON(status == -EAGAIN); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir)); + status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *new_bh = sb_getblk(sb, p_blkno); + if (!*new_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + status = 0; +bail: + if (did_quota && status < 0) + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); + return status; +} + +/* + * Assumes you already have a cluster lock on the directory. + * + * 'blocks_wanted' is only used if we have an inline directory which + * is to be turned into an extent based one. The size of the dirent to + * insert might be larger than the space gained by growing to just one + * block, so we may have to grow the inode by two blocks in that case. + * + * If the directory is already indexed, dx_root_bh must be provided. + */ +static int ocfs2_extend_dir(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + unsigned int blocks_wanted, + struct ocfs2_dir_lookup_result *lookup, + struct buffer_head **new_de_bh) +{ + int status = 0; + int credits, num_free_extents, drop_alloc_sem = 0; + loff_t dir_i_size; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_extent_list *el = &fe->id2.i_list; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry * de; + struct super_block *sb = osb->sb; + struct ocfs2_extent_tree et; + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * This would be a code error as an inline directory should + * never have an index root. + */ + BUG_ON(dx_root_bh); + + status = ocfs2_expand_inline_dir(dir, parent_fe_bh, + blocks_wanted, lookup, + &new_bh); + if (status) { + mlog_errno(status); + goto bail; + } + + /* Expansion from inline to an indexed directory will + * have given us this. */ + dx_root_bh = lookup->dl_dx_root_bh; + + if (blocks_wanted == 1) { + /* + * If the new dirent will fit inside the space + * created by pushing out to one block, then + * we can complete the operation + * here. Otherwise we have to expand i_size + * and format the 2nd block below. + */ + BUG_ON(new_bh == NULL); + goto bail_bh; + } + + /* + * Get rid of 'new_bh' - we want to format the 2nd + * data block and return that instead. 
+ */ + brelse(new_bh); + new_bh = NULL; + + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; + dir_i_size = i_size_read(dir); + credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; + goto do_extend; + } + + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; + dir_i_size = i_size_read(dir); + trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno, + dir_i_size); + + /* dir->i_size is always block aligned. */ + spin_lock(&OCFS2_I(dir)->ip_lock); + if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { + spin_unlock(&OCFS2_I(dir)->ip_lock); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), + parent_fe_bh); + num_free_extents = ocfs2_num_free_extents(osb, &et); + if (num_free_extents < 0) { + status = num_free_extents; + mlog_errno(status); + goto bail; + } + + if (!num_free_extents) { + status = ocfs2_reserve_new_metadata(osb, el, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; + + credits = ocfs2_calc_extend_credits(sb, el, 1); + } else { + spin_unlock(&OCFS2_I(dir)->ip_lock); + credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; + } + +do_extend: + if (ocfs2_dir_indexed(dir)) + credits++; /* For attaching the new dirent block to the + * dx_root */ + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, + data_ac, meta_ac, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh); + + status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, sb->s_blocksize); + + de = (struct ocfs2_dir_entry *) new_bh->b_data; + de->inode = 0; + if (ocfs2_supports_dir_trailer(dir)) { + de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); + + ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len)); + + if (ocfs2_dir_indexed(dir)) { + status = ocfs2_dx_dir_link_trailer(dir, handle, + dx_root_bh, new_bh); + if (status) { + mlog_errno(status); + goto bail; + } + } + } else { + de->rec_len = cpu_to_le16(sb->s_blocksize); + } + ocfs2_journal_dirty(handle, new_bh); + + dir_i_size += dir->i_sb->s_blocksize; + i_size_write(dir, dir_i_size); + dir->i_blocks = ocfs2_inode_sector_count(dir); + status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail_bh: + *new_de_bh = new_bh; + get_bh(*new_de_bh); +bail: + if (handle) + ocfs2_commit_trans(osb, handle); + if (drop_alloc_sem) + up_write(&OCFS2_I(dir)->ip_alloc_sem); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + brelse(new_bh); + + return status; +} + +static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, + const char *name, int namelen, + struct buffer_head **ret_de_bh, + unsigned int *blocks_wanted) +{ + int ret; + struct super_block *sb = dir->i_sb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dir_entry *de, *last_de = NULL; + char *de_buf, *limit; + unsigned long offset = 0; + unsigned 
int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; + + /* + * This calculates how many free bytes we'd have in block zero, should + * this function force expansion to an extent tree. + */ + if (ocfs2_new_dir_wants_trailer(dir)) + free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); + else + free_space = dir->i_sb->s_blocksize - i_size_read(dir); + + de_buf = di->id2.i_data.id_data; + limit = de_buf + i_size_read(dir); + rec_len = OCFS2_DIR_REC_LEN(namelen); + + while (de_buf < limit) { + de = (struct ocfs2_dir_entry *)de_buf; + + if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) { + ret = -ENOENT; + goto out; + } + if (ocfs2_match(namelen, name, de)) { + ret = -EEXIST; + goto out; + } + /* + * No need to check for a trailing dirent record here as + * they're not used for inline dirs. + */ + + if (ocfs2_dirent_would_fit(de, rec_len)) { + /* Ok, we found a spot. Return this bh and let + * the caller actually fill it in. */ + *ret_de_bh = di_bh; + get_bh(*ret_de_bh); + ret = 0; + goto out; + } + + last_de = de; + de_buf += le16_to_cpu(de->rec_len); + offset += le16_to_cpu(de->rec_len); + } + + /* + * We're going to require expansion of the directory - figure + * out how many blocks we'll need so that a place for the + * dirent can be found. + */ + *blocks_wanted = 1; + new_rec_len = le16_to_cpu(last_de->rec_len) + free_space; + if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) + *blocks_wanted = 2; + + ret = -ENOSPC; +out: + return ret; +} + +static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, + int namelen, struct buffer_head **ret_de_bh) +{ + unsigned long offset; + struct buffer_head *bh = NULL; + unsigned short rec_len; + struct ocfs2_dir_entry *de; + struct super_block *sb = dir->i_sb; + int status; + int blocksize = dir->i_sb->s_blocksize; + + status = ocfs2_read_dir_block(dir, 0, &bh, 0); + if (status) { + mlog_errno(status); + goto bail; + } + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) bh->b_data; + while (1) { + if ((char *)de >= sb->s_blocksize + bh->b_data) { + brelse(bh); + bh = NULL; + + if (i_size_read(dir) <= offset) { + /* + * Caller will have to expand this + * directory. + */ + status = -ENOSPC; + goto bail; + } + status = ocfs2_read_dir_block(dir, + offset >> sb->s_blocksize_bits, + &bh, 0); + if (status) { + mlog_errno(status); + goto bail; + } + /* move to next block */ + de = (struct ocfs2_dir_entry *) bh->b_data; + } + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + status = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + status = -EEXIST; + goto bail; + } + + if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize, + blocksize)) + goto next; + + if (ocfs2_dirent_would_fit(de, rec_len)) { + /* Ok, we found a spot. Return this bh and let + * the caller actually fill it in. 
*/ + *ret_de_bh = bh; + get_bh(*ret_de_bh); + status = 0; + goto bail; + } +next: + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); + } + + status = 0; +bail: + brelse(bh); + if (status) + mlog_errno(status); + + return status; +} + +static int dx_leaf_sort_cmp(const void *a, const void *b) +{ + const struct ocfs2_dx_entry *entry1 = a; + const struct ocfs2_dx_entry *entry2 = b; + u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash); + u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash); + u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash); + u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash); + + if (major_hash1 > major_hash2) + return 1; + if (major_hash1 < major_hash2) + return -1; + + /* + * It is not strictly necessary to sort by minor + */ + if (minor_hash1 > minor_hash2) + return 1; + if (minor_hash1 < minor_hash2) + return -1; + return 0; +} + +static void dx_leaf_sort_swap(void *a, void *b, int size) +{ + struct ocfs2_dx_entry *entry1 = a; + struct ocfs2_dx_entry *entry2 = b; + struct ocfs2_dx_entry tmp; + + BUG_ON(size != sizeof(*entry1)); + + tmp = *entry1; + *entry1 = *entry2; + *entry2 = tmp; +} + +static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num = le16_to_cpu(dl_list->de_num_used); + + for (i = 0; i < (num - 1); i++) { + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) != + le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash)) + return 0; + } + + return 1; +} + +/* + * Find the optimal value to split this leaf on. This expects the leaf + * entries to be in sorted order. + * + * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is + * the hash we want to insert. + * + * This function is only concerned with the major hash - that which + * determines which cluster an item belongs to. + */ +static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf, + u32 leaf_cpos, u32 insert_hash, + u32 *split_hash) +{ + struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; + int i, num_used = le16_to_cpu(dl_list->de_num_used); + int allsame; + + /* + * There's a couple rare, but nasty corner cases we have to + * check for here. All of them involve a leaf where all value + * have the same hash, which is what we look for first. + * + * Most of the time, all of the above is false, and we simply + * pick the median value for a split. + */ + allsame = ocfs2_dx_leaf_same_major(dx_leaf); + if (allsame) { + u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash); + + if (val == insert_hash) { + /* + * No matter where we would choose to split, + * the new entry would want to occupy the same + * block as these. Since there's no space left + * in their existing block, we know there + * won't be space after the split. + */ + return -ENOSPC; + } + + if (val == leaf_cpos) { + /* + * Because val is the same as leaf_cpos (which + * is the smallest value this leaf can have), + * yet is not equal to insert_hash, then we + * know that insert_hash *must* be larger than + * val (and leaf_cpos). At least cpos+1 in value. + * + * We also know then, that there cannot be an + * adjacent extent (otherwise we'd be looking + * at it). Choosing this value gives us a + * chance to get some contiguousness. + */ + *split_hash = leaf_cpos + 1; + return 0; + } + + if (val > insert_hash) { + /* + * val can not be the same as insert hash, and + * also must be larger than leaf_cpos. 
Also, + * we know that there can't be a leaf between + * cpos and val, otherwise the entries with + * hash 'val' would be there. + */ + *split_hash = val; + return 0; + } + + *split_hash = insert_hash; + return 0; + } + + /* + * Since the records are sorted and the checks above + * guaranteed that not all records in this block are the same, + * we simple travel forward, from the median, and pick the 1st + * record whose value is larger than leaf_cpos. + */ + for (i = (num_used / 2); i < num_used; i++) + if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) > + leaf_cpos) + break; + + BUG_ON(i == num_used); /* Should be impossible */ + *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash); + return 0; +} + +/* + * Transfer all entries in orig_dx_leaves whose major hash is equal to or + * larger than split_hash into new_dx_leaves. We use a temporary + * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks. + * + * Since the block offset inside a leaf (cluster) is a constant mask + * of minor_hash, we can optimize - an item at block offset X within + * the original cluster, will be at offset X within the new cluster. + */ +static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, + handle_t *handle, + struct ocfs2_dx_leaf *tmp_dx_leaf, + struct buffer_head **orig_dx_leaves, + struct buffer_head **new_dx_leaves, + int num_dx_leaves) +{ + int i, j, num_used; + u32 major_hash; + struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; + struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry *dx_entry; + + tmp_list = &tmp_dx_leaf->dl_list; + + for (i = 0; i < num_dx_leaves; i++) { + orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; + orig_list = &orig_dx_leaf->dl_list; + new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; + new_list = &new_dx_leaf->dl_list; + + num_used = le16_to_cpu(orig_list->de_num_used); + + memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize); + tmp_list->de_num_used = cpu_to_le16(0); + memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used); + + for (j = 0; j < num_used; j++) { + dx_entry = &orig_list->de_entries[j]; + major_hash = le32_to_cpu(dx_entry->dx_major_hash); + if (major_hash >= split_hash) + ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf, + dx_entry); + else + ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf, + dx_entry); + } + memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize); + + ocfs2_journal_dirty(handle, orig_dx_leaves[i]); + ocfs2_journal_dirty(handle, new_dx_leaves[i]); + } +} + +static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb, + struct ocfs2_dx_root_block *dx_root) +{ + int credits = ocfs2_clusters_to_blocks(osb->sb, 2); + + credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1); + credits += ocfs2_quota_trans_credits(osb->sb); + return credits; +} + +/* + * Find the median value in dx_leaf_bh and allocate a new leaf to move + * half our entries into. 
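+ * + * (Roughly: sort the leaf entries by hash, pick a split point via + * ocfs2_dx_dir_find_leaf_split(), grow the index tree by one cluster + * at cpos == split_hash, then move every entry whose major hash is at + * least split_hash over into the new cluster's leaves.)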
+ */ +static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *dx_root_bh, + struct buffer_head *dx_leaf_bh, + struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos, + u64 leaf_blkno) +{ + struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + int credits, ret, i, num_used, did_quota = 0; + u32 cpos, split_hash, insert_hash = hinfo->major_hash; + u64 orig_leaves_start; + int num_dx_leaves; + struct buffer_head **orig_dx_leaves = NULL; + struct buffer_head **new_dx_leaves = NULL; + struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL; + struct ocfs2_extent_tree et; + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; + + trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, + insert_hash); + + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + /* + * XXX: This is a rather large limit. We should use a more + * realistic value. + */ + if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX) + return -ENOSPC; + + num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used); + if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) { + mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: " + "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, num_used); + ret = -EIO; + goto out; + } + + orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!orig_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL); + if (!new_dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * This block is changing anyway, so we can sort it in place. + */ + sort(dx_leaf->dl_list.de_entries, num_used, + sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, + dx_leaf_sort_swap); + + ocfs2_journal_dirty(handle, dx_leaf_bh); + + ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, + &split_hash); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash); + + /* + * We have to carefully order operations here. There are items + * which want to be in the new cluster before insert, but in + * order to put those items in the new cluster, we alter the + * old cluster. A failure to insert gets nasty. + * + * So, start by reserving writes to the old + * cluster. ocfs2_dx_dir_new_cluster will reserve writes on + * the new cluster for us, before inserting it. The insert + * won't happen if there's an error before that. Once the + * insert is done then, we can transfer from one leaf into the + * other without fear of hitting any error. 
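+ * + * (Concretely, the order below is: read the original leaves, allocate + * and insert the new cluster via ocfs2_dx_dir_new_cluster(), reserve + * journal access on both sets of leaves, and only then call + * ocfs2_dx_dir_transfer_leaf().)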
+ */ + + /* + * The leaf transfer wants some scratch space so that we don't + * wind up doing a bunch of expensive memmove(). + */ + tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS); + if (!tmp_dx_leaf) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno); + ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves, + orig_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + cpos = split_hash; + ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, + data_ac, meta_ac, new_dx_leaves, + num_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + for (i = 0; i < num_dx_leaves; i++) { + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + orig_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + new_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, + orig_dx_leaves, new_dx_leaves, num_dx_leaves); + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out: + if (orig_dx_leaves || new_dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) { + if (orig_dx_leaves) + brelse(orig_dx_leaves[i]); + if (new_dx_leaves) + brelse(new_dx_leaves[i]); + } + kfree(orig_dx_leaves); + kfree(new_dx_leaves); + } + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + kfree(tmp_dx_leaf); + return ret; +} + +static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh, + const char *name, int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, rebalanced = 0; + struct ocfs2_dx_root_block *dx_root; + struct buffer_head *dx_leaf_bh = NULL; + struct ocfs2_dx_leaf *dx_leaf; + u64 blkno; + u32 leaf_cpos; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + +restart_search: + ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, + &leaf_cpos, &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; + + if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >= + le16_to_cpu(dx_leaf->dl_list.de_count)) { + if (rebalanced) { + /* + * Rebalancing should have provided us with + * space in an appropriate leaf. + * + * XXX: Is this an abnormal condition then? + * Should we print a message here? + */ + ret = -ENOSPC; + goto out; + } + + ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh, + &lookup->dl_hinfo, leaf_cpos, + blkno); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + /* + * Restart the lookup. The rebalance might have + * changed which block our item fits into. Mark our + * progress, so we only execute this once. 
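+ * + * (A single rebalance pass is expected to be enough here - after + * splitting at split_hash, the leaf covering insert_hash has room + * again unless every entry shared our hash, in which case the + * rebalance already returned -ENOSPC.)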
+ */ + brelse(dx_leaf_bh); + dx_leaf_bh = NULL; + rebalanced = 1; + goto restart_search; + } + + lookup->dl_dx_leaf_bh = dx_leaf_bh; + dx_leaf_bh = NULL; + +out: + brelse(dx_leaf_bh); + return ret; +} + +static int ocfs2_search_dx_free_list(struct inode *dir, + struct buffer_head *dx_root_bh, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret = -ENOSPC; + struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL; + struct ocfs2_dir_block_trailer *db; + u64 next_block; + int rec_len = OCFS2_DIR_REC_LEN(namelen); + struct ocfs2_dx_root_block *dx_root; + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + next_block = le64_to_cpu(dx_root->dr_free_blk); + + while (next_block) { + brelse(prev_leaf_bh); + prev_leaf_bh = leaf_bh; + leaf_bh = NULL; + + ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); + if (rec_len <= le16_to_cpu(db->db_free_rec_len)) { + lookup->dl_leaf_bh = leaf_bh; + lookup->dl_prev_leaf_bh = prev_leaf_bh; + leaf_bh = NULL; + prev_leaf_bh = NULL; + break; + } + + next_block = le64_to_cpu(db->db_free_next); + } + + if (!next_block) + ret = -ENOSPC; + +out: + + brelse(leaf_bh); + brelse(prev_leaf_bh); + return ret; +} + +static int ocfs2_expand_inline_dx_root(struct inode *dir, + struct buffer_head *dx_root_bh) +{ + int ret, num_dx_leaves, i, j, did_quota = 0; + struct buffer_head **dx_leaves = NULL; + struct ocfs2_extent_tree et; + u64 insert_blkno; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + handle_t *handle = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + struct ocfs2_dx_entry *dx_entry; + struct ocfs2_dx_leaf *target_leaf; + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); + if (!dx_leaves) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + /* + * We do this up front, before the allocation, so that a + * failure to add the dx_root_bh to the journal won't result + * us losing clusters. 
+ */ + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves, + num_dx_leaves, &insert_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Transfer the entries from our dx_root into the appropriate + * block + */ + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { + dx_entry = &entry_list->de_entries[i]; + + j = __ocfs2_dx_dir_hash_idx(osb, + le32_to_cpu(dx_entry->dx_minor_hash)); + target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data; + + ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry); + + /* Each leaf has been passed to the journal already + * via __ocfs2_dx_dir_new_cluster() */ + } + + dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; + memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list)); + dx_root->dr_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); + + /* This should never fail considering we start with an empty + * dx_root. */ + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL); + if (ret) + mlog_errno(ret); + did_quota = 0; + + ocfs2_journal_dirty(handle, dx_root_bh); + +out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + + ocfs2_commit_trans(osb, handle); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + if (dx_leaves) { + for (i = 0; i < num_dx_leaves; i++) + brelse(dx_leaves[i]); + kfree(dx_leaves); + } + return ret; +} + +static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh) +{ + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dx_entry_list *entry_list; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + entry_list = &dx_root->dr_entries; + + if (le16_to_cpu(entry_list->de_num_used) >= + le16_to_cpu(entry_list->de_count)) + return -ENOSPC; + + return 0; +} + +static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir, + struct buffer_head *di_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret, free_dx_root = 1; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct buffer_head *leaf_bh = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) { + ret = -ENOSPC; + mlog_errno(ret); + goto out; + } + + if (ocfs2_dx_root_inline(dx_root)) { + ret = ocfs2_inline_dx_has_space(dx_root_bh); + + if (ret == 0) + goto search_el; + + /* + * We ran out of room in the root block. Expand it to + * an extent, then allow ocfs2_find_dir_space_dx to do + * the rest. + */ + ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Insert preparation for an indexed directory is split into two + * steps. The call to find_dir_space_dx reserves room in the index for + * an additional item. If we run out of space there, it's a real error + * we can't continue on. 
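+ * + * (The second step happens below at search_el: find room for the + * name in the unindexed blocks, extending the directory if the free + * list has nothing large enough.)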
+ */ + ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name, + namelen, lookup); + if (ret) { + mlog_errno(ret); + goto out; + } + +search_el: + /* + * Next, we need to find space in the unindexed tree. This call + * searches using the free space linked list. If the unindexed tree + * lacks sufficient space, we'll expand it below. The expansion code + * is smart enough to add any new blocks to the free space list. + */ + ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup); + if (ret && ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Do this up here - ocfs2_extend_dir might need the dx_root */ + lookup->dl_dx_root_bh = dx_root_bh; + free_dx_root = 0; + + if (ret == -ENOSPC) { + ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh); + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We make the assumption here that new leaf blocks are added + * to the front of our free list. + */ + lookup->dl_prev_leaf_bh = NULL; + lookup->dl_leaf_bh = leaf_bh; + } + +out: + if (free_dx_root) + brelse(dx_root_bh); + return ret; +} + +/* + * Get a directory ready for insert. Any directory allocation required + * happens here. Success returns zero, and enough context in the dir + * lookup result that ocfs2_add_entry() will be able complete the task + * with minimal performance impact. + */ +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + unsigned int blocks_wanted = 1; + struct buffer_head *bh = NULL; + + trace_ocfs2_prepare_dir_for_insert( + (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen); + + if (!namelen) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + /* + * Do this up front to reduce confusion. + * + * The directory might start inline, then be turned into an + * indexed one, in which case we'd need to hash deep inside + * ocfs2_find_dir_space_id(). Since + * ocfs2_prepare_dx_dir_for_insert() also needs this hash + * done, there seems no point in spreading out the calls. We + * can optimize away the case where the file system doesn't + * support indexing. + */ + if (ocfs2_supports_indexed_dirs(osb)) + ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo); + + if (ocfs2_dir_indexed(dir)) { + ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh, + name, namelen, lookup); + if (ret) + mlog_errno(ret); + goto out; + } + + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, + namelen, &bh, &blocks_wanted); + } else + ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh); + + if (ret && ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + if (ret == -ENOSPC) { + /* + * We have to expand the directory to add this name. 
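+ * + * (blocks_wanted was set by ocfs2_find_dir_space_id() in the inline + * case - 1 block, or 2 when the new dirent would not also fit in the + * first block after expansion; the extent-list path leaves it at 1.)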
+ */ + BUG_ON(bh); + + ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, + lookup, &bh); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + BUG_ON(!bh); + } + + lookup->dl_leaf_bh = bh; + bh = NULL; +out: + brelse(bh); + return ret; +} + +static int ocfs2_dx_dir_remove_index(struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dx_root_bh) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_dx_root_block *dx_root; + struct inode *dx_alloc_inode = NULL; + struct buffer_head *dx_alloc_bh = NULL; + handle_t *handle; + u64 blk; + u16 bit; + u64 bg_blkno; + + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; + + dx_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(dx_root->dr_suballoc_slot)); + if (!dx_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&dx_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(dir)->ip_lock); + OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); + di->i_dx_root = cpu_to_le64(0ULL); + + ocfs2_journal_dirty(handle, di_bh); + + blk = le64_to_cpu(dx_root->dr_blkno); + bit = le16_to_cpu(dx_root->dr_suballoc_bit); + if (dx_root->dr_suballoc_loc) + bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, + bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(dx_alloc_inode, 1); + +out_mutex: + mutex_unlock(&dx_alloc_inode->i_mutex); + brelse(dx_alloc_bh); +out: + iput(dx_alloc_inode); + return ret; +} + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) +{ + int ret; + unsigned int uninitialized_var(clen); + u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos); + u64 uninitialized_var(blkno); + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct buffer_head *dx_root_bh = NULL; + struct ocfs2_dx_root_block *dx_root; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!ocfs2_dir_indexed(dir)) + return 0; + + ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; + + if (ocfs2_dx_root_inline(dx_root)) + goto remove_index; + + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + + /* XXX: What if dr_clusters is too large? 
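+ * + * (The loop below walks from the largest major hash (UINT_MAX) + * downward, removing one extent record per pass until dr_clusters + * reaches zero, so a very large value presumably just means many + * passes.)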
*/ + while (le32_to_cpu(dx_root->dr_clusters)) { + ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list, + major_hash, &cpos, &blkno, &clen); + if (ret) { + mlog_errno(ret); + goto out; + } + + p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); + + ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0, + &dealloc, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos == 0) + break; + + major_hash = cpos - 1; + } + +remove_index: + ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + brelse(dx_root_bh); + return ret; +} diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h new file mode 100644 index 00000000..e683f3de --- /dev/null +++ b/fs/ocfs2/dir.h @@ -0,0 +1,117 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_DIR_H +#define OCFS2_DIR_H + +struct ocfs2_dx_hinfo { + u32 major_hash; + u32 minor_hash; +}; + +struct ocfs2_dir_lookup_result { + struct buffer_head *dl_leaf_bh; /* Unindexed leaf + * block */ + struct ocfs2_dir_entry *dl_entry; /* Target dirent in + * unindexed leaf */ + + struct buffer_head *dl_dx_root_bh; /* Root of indexed + * tree */ + + struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */ + struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in + * indexed leaf */ + struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */ + + struct buffer_head *dl_prev_leaf_bh;/* Previous entry in + * dir free space + * list. NULL if + * previous entry is + * dx root block. 
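+ * + * (Any buffer_head set in this structure holds a reference; callers + * release everything through ocfs2_free_dir_lookup_result().)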
*/ +}; + +void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res); + +int ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_lookup_result *lookup); +int ocfs2_delete_entry(handle_t *handle, + struct inode *dir, + struct ocfs2_dir_lookup_result *res); +int __ocfs2_add_entry(handle_t *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct ocfs2_dir_lookup_result *lookup); +static inline int ocfs2_add_entry(handle_t *handle, + struct dentry *dentry, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct ocfs2_dir_lookup_result *lookup) +{ + return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, + dentry->d_name.name, dentry->d_name.len, + inode, blkno, parent_fe_bh, lookup); +} +int ocfs2_update_entry(struct inode *dir, handle_t *handle, + struct ocfs2_dir_lookup_result *res, + struct inode *new_entry_inode); + +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen); +int ocfs2_empty_dir(struct inode *inode); + +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct ocfs2_dir_lookup_result *res); +int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, + int namelen, u64 *blkno); +int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); +int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, + filldir_t filldir); +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct ocfs2_dir_lookup_result *lookup); +struct ocfs2_alloc_context; +int ocfs2_fill_new_dir(struct ocfs2_super *osb, + handle_t *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac); + +int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh); + +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data); +#endif /* OCFS2_DIR_H */ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile new file mode 100644 index 00000000..c8a044ef --- /dev/null +++ b/fs/ocfs2/dlm/Makefile @@ -0,0 +1,7 @@ +ccflags-y := -Ifs/ocfs2 + +obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o + +ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o + diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h new file mode 100644 index 00000000..3cfa114a --- /dev/null +++ b/fs/ocfs2/dlm/dlmapi.h @@ -0,0 +1,220 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmapi.h + * + * externally exported dlm interfaces + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMAPI_H +#define DLMAPI_H + +struct dlm_lock; +struct dlm_ctxt; + +/* NOTE: changes made to this enum should be reflected in dlmdebug.c */ +enum dlm_status { + DLM_NORMAL = 0, /* 0: request in progress */ + DLM_GRANTED, /* 1: request granted */ + DLM_DENIED, /* 2: request denied */ + DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */ + DLM_WORKING, /* 4: async request in progress */ + DLM_BLOCKED, /* 5: lock request blocked */ + DLM_BLOCKED_ORPHAN, /* 6: lock request blocked by a orphan lock*/ + DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */ + DLM_SYSERR, /* 8: system error */ + DLM_NOSUPPORT, /* 9: unsupported */ + DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */ + DLM_IVLOCKID, /* 11: bad lockid */ + DLM_SYNC, /* 12: synchronous request granted */ + DLM_BADTYPE, /* 13: bad resource type */ + DLM_BADRESOURCE, /* 14: bad resource handle */ + DLM_MAXHANDLES, /* 15: no more resource handles */ + DLM_NOCLINFO, /* 16: can't contact cluster manager */ + DLM_NOLOCKMGR, /* 17: can't contact lock manager */ + DLM_NOPURGED, /* 18: can't contact purge daemon */ + DLM_BADARGS, /* 19: bad api args */ + DLM_VOID, /* 20: no status */ + DLM_NOTQUEUED, /* 21: NOQUEUE was specified and request failed */ + DLM_IVBUFLEN, /* 22: invalid resource name length */ + DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */ + DLM_BADPARAM, /* 24: invalid lock mode specified */ + DLM_VALNOTVALID, /* 25: value block has been invalidated */ + DLM_REJECTED, /* 26: request rejected, unrecognized client */ + DLM_ABORT, /* 27: blocked lock request cancelled */ + DLM_CANCEL, /* 28: conversion request cancelled */ + DLM_IVRESHANDLE, /* 29: invalid resource handle */ + DLM_DEADLOCK, /* 30: deadlock recovery refused this request */ + DLM_DENIED_NOASTS, /* 31: failed to allocate AST */ + DLM_FORWARD, /* 32: request must wait for primary's response */ + DLM_TIMEOUT, /* 33: timeout value for lock has expired */ + DLM_IVGROUPID, /* 34: invalid group specification */ + DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */ + DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */ + DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient pers for device */ + DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */ + + DLM_RECOVERING, /* 39: extension, allows caller to fail a lock + request if it is being recovered */ + DLM_MIGRATING, /* 40: extension, allows caller to fail a lock + request if it is being migrated */ + DLM_MAXSTATS, /* 41: upper limit for return code validation */ +}; + +/* for pretty-printing dlm_status error messages */ +const char *dlm_errmsg(enum dlm_status err); +/* for pretty-printing dlm_status error names */ +const char *dlm_errname(enum dlm_status err); + +/* Eventually the DLM will use standard errno values, but in the + * meantime this lets us track dlm errors as they bubble up. When we + * bring its error reporting into line with the rest of the stack, + * these can just be replaced with calls to mlog_errno. 
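+ * + * (The macro below deliberately skips DLM_RECOVERING, DLM_MIGRATING + * and DLM_FORWARD - those are transient statuses that callers are + * generally expected to retry rather than log as errors.)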
*/ +#define dlm_error(st) do { \ + if ((st) != DLM_RECOVERING && \ + (st) != DLM_MIGRATING && \ + (st) != DLM_FORWARD) \ + mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ +} while (0) + +#define DLM_LKSB_UNUSED1 0x01 +#define DLM_LKSB_PUT_LVB 0x02 +#define DLM_LKSB_GET_LVB 0x04 +#define DLM_LKSB_UNUSED2 0x08 +#define DLM_LKSB_UNUSED3 0x10 +#define DLM_LKSB_UNUSED4 0x20 +#define DLM_LKSB_UNUSED5 0x40 +#define DLM_LKSB_UNUSED6 0x80 + +#define DLM_LVB_LEN 64 + +/* Callers are only allowed access to the lvb and status members of + * this struct. */ +struct dlm_lockstatus { + enum dlm_status status; + u32 flags; + struct dlm_lock *lockid; + char lvb[DLM_LVB_LEN]; +}; + +/* Valid lock modes. */ +#define LKM_IVMODE (-1) /* invalid mode */ +#define LKM_NLMODE 0 /* null lock */ +#define LKM_CRMODE 1 /* concurrent read unsupported */ +#define LKM_CWMODE 2 /* concurrent write unsupported */ +#define LKM_PRMODE 3 /* protected read */ +#define LKM_PWMODE 4 /* protected write unsupported */ +#define LKM_EXMODE 5 /* exclusive */ +#define LKM_MAXMODE 5 +#define LKM_MODEMASK 0xff + +/* Flags passed to dlmlock and dlmunlock: + * reserved: flags used by the "real" dlm + * only a few are supported by this dlm + * (U) = unsupported by ocfs2 dlm */ +#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */ +#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */ +#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */ +#define LKM_LOCAL 0x00000080 /* local lock request */ +#define LKM_VALBLK 0x00000100 /* lock value block request */ +#define LKM_NOQUEUE 0x00000200 /* non blocking request */ +#define LKM_CONVERT 0x00000400 /* conversion request */ +#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */ +#define LKM_UNLOCK 0x00001000 /* deallocate this lock */ +#define LKM_CANCEL 0x00002000 /* cancel conversion request */ +#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */ +#define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */ +#define LKM_SYNCSTS 0x00010000 /* return synchronous status if poss (U) */ +#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */ +#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */ +#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */ +#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */ +#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */ +#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */ +#define LKM_FORCE 0x00800000 /* force unlock flag */ +#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate + lock value block (U) */ +/* unused */ +#define LKM_UNUSED1 0x00000001 /* unused */ +#define LKM_UNUSED2 0x00000002 /* unused */ +#define LKM_UNUSED3 0x00000004 /* unused */ +#define LKM_UNUSED4 0x00000008 /* unused */ +#define LKM_UNUSED5 0x02000000 /* unused */ +#define LKM_UNUSED6 0x04000000 /* unused */ +#define LKM_UNUSED7 0x08000000 /* unused */ + +/* ocfs2 extensions: internal only + * should never be used by caller */ +#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated + to another node */ +#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed + should be applied to lockres */ +#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied + from lockres when lock is granted */ +#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock + used to avoid recovery rwsem */ + + +typedef void (dlm_astlockfunc_t)(void *); +typedef void 
(dlm_bastlockfunc_t)(void *, int); +typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status); + +enum dlm_status dlmlock(struct dlm_ctxt *dlm, + int mode, + struct dlm_lockstatus *lksb, + int flags, + const char *name, + int namelen, + dlm_astlockfunc_t *ast, + void *data, + dlm_bastlockfunc_t *bast); + +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, + struct dlm_lockstatus *lksb, + int flags, + dlm_astunlockfunc_t *unlockast, + void *data); + +struct dlm_protocol_version { + u8 pv_major; + u8 pv_minor; +}; +struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key, + struct dlm_protocol_version *fs_proto); + +void dlm_unregister_domain(struct dlm_ctxt *dlm); + +void dlm_print_one_lock(struct dlm_lock *lockid); + +typedef void (dlm_eviction_func)(int, void *); +struct dlm_eviction_cb { + struct list_head ec_item; + dlm_eviction_func *ec_func; + void *ec_data; +}; +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data); +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb); +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb); + +#endif /* DLMAPI_H */ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c new file mode 100644 index 00000000..3a3ed4bb --- /dev/null +++ b/fs/ocfs2/dlm/dlmast.c @@ -0,0 +1,502 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmast.c + * + * AST and BAST functionality for local and remote nodes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock); +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); + +/* Should be called as an ast gets queued to see if the new + * lock level will obsolete a pending bast. + * For example, if dlm_thread queued a bast for an EX lock that + * was blocking another EX, but before sending the bast the + * lock owner downconverted to NL, the bast is now obsolete. + * Only the ast should be sent. + * This is needed because the lock and convert paths can queue + * asts out-of-band (not waiting for dlm_thread) in order to + * allow for LKM_NOQUEUE to get immediate responses. 
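+ * + * (Summary of the checks below: if the old bast already went out + * there is nothing to cancel; a new EX level keeps its bast; dropping + * to NL cancels it; dropping to PR cancels it unless the blocked + * request was for EX.)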
*/ +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + assert_spin_locked(&dlm->ast_lock); + assert_spin_locked(&lock->spinlock); + + if (lock->ml.highest_blocked == LKM_IVMODE) + return 0; + BUG_ON(lock->ml.highest_blocked == LKM_NLMODE); + + if (lock->bast_pending && + list_empty(&lock->bast_list)) + /* old bast already sent, ok */ + return 0; + + if (lock->ml.type == LKM_EXMODE) + /* EX blocks anything left, any bast still valid */ + return 0; + else if (lock->ml.type == LKM_NLMODE) + /* NL blocks nothing, no reason to send any bast, cancel it */ + return 1; + else if (lock->ml.highest_blocked != LKM_EXMODE) + /* PR only blocks EX */ + return 1; + + return 0; +} + +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + BUG_ON(!dlm); + BUG_ON(!lock); + + res = lock->lockres; + + assert_spin_locked(&dlm->ast_lock); + + if (!list_empty(&lock->ast_list)) { + mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, " + "AST list not empty, pending %d, newlevel %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ast_pending, lock->ml.type); + BUG(); + } + if (lock->ast_pending) + mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + + /* check to see if this ast obsoletes the bast */ + if (dlm_should_cancel_bast(dlm, lock)) { + mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + lock->bast_pending = 0; + list_del_init(&lock->bast_list); + lock->ml.highest_blocked = LKM_IVMODE; + /* removing lock from list, remove a ref. guaranteed + * this won't be the last ref because of the get above, + * so res->spinlock will not be taken here */ + dlm_lock_put(lock); + /* free up the reserved bast that we are cancelling. + * guaranteed that this will not be the last reserved + * ast because *both* an ast and a bast were reserved + * to get to this point. 
the res->spinlock will not be + * taken here */ + dlm_lockres_release_ast(dlm, res); + } + list_add_tail(&lock->ast_list, &dlm->pending_asts); + lock->ast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_ast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + + +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + BUG_ON(!dlm); + BUG_ON(!lock); + + assert_spin_locked(&dlm->ast_lock); + + res = lock->lockres; + + BUG_ON(!list_empty(&lock->bast_list)); + if (lock->bast_pending) + mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + list_add_tail(&lock->bast_list, &dlm->pending_basts); + lock->bast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_bast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct dlm_lockstatus *lksb = lock->lksb; + BUG_ON(!lksb); + + /* only updates if this node masters the lockres */ + spin_lock(&res->spinlock); + if (res->owner == dlm->node_num) { + /* check the lksb flags for the direction */ + if (lksb->flags & DLM_LKSB_GET_LVB) { + mlog(0, "getting lvb from lockres for %s node\n", + lock->ml.node == dlm->node_num ? "master" : + "remote"); + memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); + } + /* Do nothing for lvb put requests - they should be done in + * place when the lock is downconverted - otherwise we risk + * racing gets and puts which could result in old lvb data + * being propagated. We leave the put flag set and clear it + * here. In the future we might want to clear it at the time + * the put is actually done. 
+ */ + } + spin_unlock(&res->spinlock); + + /* reset any lvb flags on the lksb */ + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); +} + +void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + dlm_astlockfunc_t *fn; + struct dlm_lockstatus *lksb; + + mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + lksb = lock->lksb; + fn = lock->ast; + BUG_ON(lock->ml.node != dlm->node_num); + + dlm_update_lvb(dlm, res, lock); + (*fn)(lock->astdata); +} + + +int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + int ret; + struct dlm_lockstatus *lksb; + int lksbflags; + + mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); + + lksb = lock->lksb; + BUG_ON(lock->ml.node == dlm->node_num); + + lksbflags = lksb->flags; + dlm_update_lvb(dlm, res, lock); + + /* lock request came from another node + * go do the ast over there */ + ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags); + return ret; +} + +void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int blocked_type) +{ + dlm_bastlockfunc_t *fn = lock->bast; + + BUG_ON(lock->ml.node != dlm->node_num); + + mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + blocked_type); + + (*fn)(lock->astdata, blocked_type); +} + + + +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + int ret; + unsigned int locklen; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; + char *name; + struct list_head *iter, *head=NULL; + u64 cookie; + u32 flags; + u8 node; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + name = past->name; + locklen = past->namelen; + cookie = past->cookie; + flags = be32_to_cpu(past->flags); + node = past->node_idx; + + if (locklen > DLM_LOCKID_NAME_MAX) { + ret = DLM_IVBUFLEN; + mlog(ML_ERROR, "Invalid name length (%d) in proxy ast " + "handler!\n", locklen); + goto leave; + } + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n", + flags); + ret = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? "get lvb" : "none")); + + mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type); + + if (past->type != DLM_AST && + past->type != DLM_BAST) { + mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" + "name=%.*s, node=%u\n", past->type, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); + ret = DLM_IVLOCKID; + goto leave; + } + + res = dlm_lookup_lockres(dlm, name, locklen); + if (!res) { + mlog(0, "Got %sast for unknown lockres! 
cookie=%u:%llu, " + "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); + ret = DLM_IVLOCKID; + goto leave; + } + + /* cannot get a proxy ast message if this node owns it */ + BUG_ON(res->owner == dlm->node_num); + + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "Responding with DLM_RECOVERING!\n"); + ret = DLM_RECOVERING; + goto unlock_out; + } + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Responding with DLM_MIGRATING!\n"); + ret = DLM_MIGRATING; + goto unlock_out; + } + /* try convert queue for both ast/bast */ + head = &res->converting; + lock = NULL; + list_for_each(iter, head) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.cookie == cookie) + goto do_ast; + } + + /* if not on convert, try blocked for ast, granted for bast */ + if (past->type == DLM_AST) + head = &res->blocked; + else + head = &res->granted; + + list_for_each(iter, head) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.cookie == cookie) + goto do_ast; + } + + mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " + "node=%u\n", past->type == DLM_AST ? "" : "b", + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); + + ret = DLM_NORMAL; +unlock_out: + spin_unlock(&res->spinlock); + goto leave; + +do_ast: + ret = DLM_NORMAL; + if (past->type == DLM_AST) { + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->granted); + mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + lock->ml.type, lock->ml.convert_type); + + if (lock->ml.convert_type != LKM_IVMODE) { + lock->ml.type = lock->ml.convert_type; + lock->ml.convert_type = LKM_IVMODE; + } else { + // should already be there.... 
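+ // (no convert was pending, so this ast grants the original + // request and ml.type is already correct)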
+ } + + lock->lksb->status = DLM_NORMAL; + + /* if we requested the lvb, fetch it into our lksb now */ + if (flags & LKM_GET_LVB) { + BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB)); + memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN); + } + } + spin_unlock(&res->spinlock); + + if (past->type == DLM_AST) + dlm_do_local_ast(dlm, res, lock); + else + dlm_do_local_bast(dlm, res, lock, past->blocked_type); + +leave: + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + return ret; +} + + + +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int msg_type, + int blocked_type, int flags) +{ + int ret = 0; + struct dlm_proxy_ast past; + struct kvec vec[2]; + size_t veclen = 1; + int status; + + mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name, + res->lockname.len, res->lockname.name, lock->ml.node, msg_type, + blocked_type); + + memset(&past, 0, sizeof(struct dlm_proxy_ast)); + past.node_idx = dlm->node_num; + past.type = msg_type; + past.blocked_type = blocked_type; + past.namelen = res->lockname.len; + memcpy(past.name, res->lockname.name, past.namelen); + past.cookie = lock->ml.cookie; + + vec[0].iov_len = sizeof(struct dlm_proxy_ast); + vec[0].iov_base = &past; + if (flags & DLM_LKSB_GET_LVB) { + be32_add_cpu(&past.flags, LKM_GET_LVB); + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, + lock->ml.node, &status); + if (ret < 0) + mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n", + dlm->name, res->lockname.len, res->lockname.name, ret, + lock->ml.node); + else { + if (status == DLM_RECOVERING) { + mlog(ML_ERROR, "sent AST to node %u, it thinks this " + "node is dead!\n", lock->ml.node); + BUG(); + } else if (status == DLM_MIGRATING) { + mlog(ML_ERROR, "sent AST to node %u, it returned " + "DLM_MIGRATING!\n", lock->ml.node); + BUG(); + } else if (status != DLM_NORMAL && status != DLM_IVLOCKID) { + mlog(ML_ERROR, "AST to node %u returned %d!\n", + lock->ml.node, status); + /* ignore it */ + } + ret = 0; + } + return ret; +} diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h new file mode 100644 index 00000000..d602abb5 --- /dev/null +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -0,0 +1,1181 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmcommon.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef DLMCOMMON_H +#define DLMCOMMON_H + +#include + +#define DLM_HB_NODE_DOWN_PRI (0xf000000) +#define DLM_HB_NODE_UP_PRI (0x8000000) + +#define DLM_LOCKID_NAME_MAX 32 + +#define DLM_DOMAIN_NAME_MAX_LEN 255 +#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES +#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes +#define DLM_THREAD_MS 200 // flush at least every 200 ms + +#define DLM_HASH_SIZE_DEFAULT (1 << 17) +#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE +# define DLM_HASH_PAGES 1 +#else +# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE) +#endif +#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) + +/* Intended to make it easier for us to switch out hash functions */ +#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) + +enum dlm_mle_type { + DLM_MLE_BLOCK = 0, + DLM_MLE_MASTER = 1, + DLM_MLE_MIGRATION = 2, + DLM_MLE_NUM_TYPES = 3, +}; + +struct dlm_master_list_entry { + struct hlist_node master_hash_node; + struct list_head hb_events; + struct dlm_ctxt *dlm; + spinlock_t spinlock; + wait_queue_head_t wq; + atomic_t woken; + struct kref mle_refs; + int inuse; + unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + u8 master; + u8 new_master; + enum dlm_mle_type type; + struct o2hb_callback_func mle_hb_up; + struct o2hb_callback_func mle_hb_down; + struct dlm_lock_resource *mleres; + unsigned char mname[DLM_LOCKID_NAME_MAX]; + unsigned int mnamelen; + unsigned int mnamehash; +}; + +enum dlm_ast_type { + DLM_AST = 0, + DLM_BAST = 1, + DLM_ASTUNLOCK = 2, +}; + + +#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \ + LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \ + LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE) + +#define DLM_RECOVERY_LOCK_NAME "$RECOVERY" +#define DLM_RECOVERY_LOCK_NAME_LEN 9 + +static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) +{ + if (name_len == DLM_RECOVERY_LOCK_NAME_LEN && + memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0) + return 1; + return 0; +} + +#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_FINALIZE 0x0002 + +struct dlm_recovery_ctxt +{ + struct list_head resources; + struct list_head received; + struct list_head node_data; + u8 new_master; + u8 dead_node; + u16 state; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + wait_queue_head_t event; +}; + +enum dlm_ctxt_state { + DLM_CTXT_NEW = 0, + DLM_CTXT_JOINED = 1, + DLM_CTXT_IN_SHUTDOWN = 2, + DLM_CTXT_LEAVING = 3, +}; + +struct dlm_ctxt +{ + struct list_head list; + struct hlist_head **lockres_hash; + struct list_head dirty_list; + struct list_head purge_list; + struct list_head pending_asts; + struct list_head pending_basts; + struct list_head tracking_list; + unsigned int purge_count; + spinlock_t spinlock; + spinlock_t ast_lock; + spinlock_t track_lock; + char *name; + u8 node_num; + u32 key; + u8 joining_node; + wait_queue_head_t dlm_join_events; + unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct dlm_recovery_ctxt reco; + spinlock_t master_lock; + struct hlist_head **master_hash; + struct list_head mle_hb_events; + + /* these give a really vague idea 
of the system load */ + atomic_t mle_tot_count[DLM_MLE_NUM_TYPES]; + atomic_t mle_cur_count[DLM_MLE_NUM_TYPES]; + atomic_t res_tot_count; + atomic_t res_cur_count; + + struct dlm_debug_ctxt *dlm_debug_ctxt; + struct dentry *dlm_debugfs_subroot; + + /* NOTE: Next three are protected by dlm_domain_lock */ + struct kref dlm_refs; + enum dlm_ctxt_state dlm_state; + unsigned int num_joins; + + struct o2hb_callback_func dlm_hb_up; + struct o2hb_callback_func dlm_hb_down; + struct task_struct *dlm_thread_task; + struct task_struct *dlm_reco_thread_task; + struct workqueue_struct *dlm_worker; + wait_queue_head_t dlm_thread_wq; + wait_queue_head_t dlm_reco_thread_wq; + wait_queue_head_t ast_wq; + wait_queue_head_t migration_wq; + + struct work_struct dispatched_work; + struct list_head work_list; + spinlock_t work_lock; + struct list_head dlm_domain_handlers; + struct list_head dlm_eviction_callbacks; + + /* The filesystem specifies this at domain registration. We + * cache it here to know what to tell other nodes. */ + struct dlm_protocol_version fs_locking_proto; + /* This is the inter-dlm communication version */ + struct dlm_protocol_version dlm_locking_proto; +}; + +static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i) +{ + return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); +} + +static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm, + unsigned i) +{ + return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + + (i % DLM_BUCKETS_PER_PAGE); +} + +/* these keventd work queue items are for less-frequently + * called functions that cannot be directly called from the + * net message handlers for some reason, usually because + * they need to send net messages of their own. */ +void dlm_dispatch_work(struct work_struct *work); + +struct dlm_lock_resource; +struct dlm_work_item; + +typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *); + +struct dlm_request_all_locks_priv +{ + u8 reco_master; + u8 dead_node; +}; + +struct dlm_mig_lockres_priv +{ + struct dlm_lock_resource *lockres; + u8 real_master; + u8 extra_ref; +}; + +struct dlm_assert_master_priv +{ + struct dlm_lock_resource *lockres; + u8 request_from; + u32 flags; + unsigned ignore_higher:1; +}; + +struct dlm_deref_lockres_priv +{ + struct dlm_lock_resource *deref_res; + u8 deref_node; +}; + +struct dlm_work_item +{ + struct list_head list; + dlm_workfunc_t *func; + struct dlm_ctxt *dlm; + void *data; + union { + struct dlm_request_all_locks_priv ral; + struct dlm_mig_lockres_priv ml; + struct dlm_assert_master_priv am; + struct dlm_deref_lockres_priv dl; + } u; +}; + +static inline void dlm_init_work_item(struct dlm_ctxt *dlm, + struct dlm_work_item *i, + dlm_workfunc_t *f, void *data) +{ + memset(i, 0, sizeof(*i)); + i->func = f; + INIT_LIST_HEAD(&i->list); + i->data = data; + i->dlm = dlm; /* must have already done a dlm_grab on this! 
*/ +} + + + +static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, + u8 node) +{ + assert_spin_locked(&dlm->spinlock); + + dlm->joining_node = node; + wake_up(&dlm->dlm_join_events); +} + +#define DLM_LOCK_RES_UNINITED 0x00000001 +#define DLM_LOCK_RES_RECOVERING 0x00000002 +#define DLM_LOCK_RES_READY 0x00000004 +#define DLM_LOCK_RES_DIRTY 0x00000008 +#define DLM_LOCK_RES_IN_PROGRESS 0x00000010 +#define DLM_LOCK_RES_MIGRATING 0x00000020 +#define DLM_LOCK_RES_DROPPING_REF 0x00000040 +#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 +#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 + +/* max milliseconds to wait to sync up a network failure with a node death */ +#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) + +#define DLM_PURGE_INTERVAL_MS (8 * 1000) + +struct dlm_lock_resource +{ + /* WARNING: Please see the comment in dlm_init_lockres before + * adding fields here. */ + struct hlist_node hash_node; + struct qstr lockname; + struct kref refs; + + /* + * Please keep granted, converting, and blocked in this order, + * as some funcs want to iterate over all lists. + * + * All four lists are protected by the hash's reference. + */ + struct list_head granted; + struct list_head converting; + struct list_head blocked; + struct list_head purge; + + /* + * These two lists require you to hold an additional reference + * while they are on the list. + */ + struct list_head dirty; + struct list_head recovering; // dlm_recovery_ctxt.resources list + + /* Added during init and removed during release */ + struct list_head tracking; /* dlm->tracking_list */ + + /* unused lock resources have their last_used stamped and are + * put on a list for the dlm thread to run. */ + unsigned long last_used; + + struct dlm_ctxt *dlm; + + unsigned migration_pending:1; + atomic_t asts_reserved; + spinlock_t spinlock; + wait_queue_head_t wq; + u8 owner; //node which owns the lock resource, or unknown + u16 state; + char lvb[DLM_LVB_LEN]; + unsigned int inflight_locks; + unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +struct dlm_migratable_lock +{ + __be64 cookie; + + /* these 3 are just padding for the in-memory structure, but + * list and flags are actually used when sent over the wire */ + __be16 pad1; + u8 list; // 0=granted, 1=converting, 2=blocked + u8 flags; + + s8 type; + s8 convert_type; + s8 highest_blocked; + u8 node; +}; // 16 bytes + +struct dlm_lock +{ + struct dlm_migratable_lock ml; + + struct list_head list; + struct list_head ast_list; + struct list_head bast_list; + struct dlm_lock_resource *lockres; + spinlock_t spinlock; + struct kref lock_refs; + + // ast and bast must be callable while holding a spinlock! 
+	dlm_astlockfunc_t *ast;
+	dlm_bastlockfunc_t *bast;
+	void *astdata;
+	struct dlm_lockstatus *lksb;
+	unsigned ast_pending:1,
+		 bast_pending:1,
+		 convert_pending:1,
+		 lock_pending:1,
+		 cancel_pending:1,
+		 unlock_pending:1,
+		 lksb_kernel_allocated:1;
+};
+
+
+#define DLM_LKSB_UNUSED1 0x01
+#define DLM_LKSB_PUT_LVB 0x02
+#define DLM_LKSB_GET_LVB 0x04
+#define DLM_LKSB_UNUSED2 0x08
+#define DLM_LKSB_UNUSED3 0x10
+#define DLM_LKSB_UNUSED4 0x20
+#define DLM_LKSB_UNUSED5 0x40
+#define DLM_LKSB_UNUSED6 0x80
+
+
+enum dlm_lockres_list {
+	DLM_GRANTED_LIST = 0,
+	DLM_CONVERTING_LIST = 1,
+	DLM_BLOCKED_LIST = 2,
+};
+
+static inline int dlm_lvb_is_empty(char *lvb)
+{
+	int i;
+	for (i=0; i<DLM_LVB_LEN; i++)
+		if (lvb[i])
+			return 0;
+	return 1;
+}
+
+static inline struct list_head *
+dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
+{
+	struct list_head *ret = NULL;
+
+	if (idx == DLM_GRANTED_LIST)
+		ret = &res->granted;
+	else if (idx == DLM_CONVERTING_LIST)
+		ret = &res->converting;
+	else if (idx == DLM_BLOCKED_LIST)
+		ret = &res->blocked;
+	else
+		BUG();
+	return ret;
+}
+
+
+
+
+struct dlm_node_iter
+{
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int curnode;
+};
+
+
+enum {
+	DLM_MASTER_REQUEST_MSG = 500,
+	DLM_UNUSED_MSG1 = 501,
+	DLM_ASSERT_MASTER_MSG = 502,
+	DLM_CREATE_LOCK_MSG = 503,
+	DLM_CONVERT_LOCK_MSG = 504,
+	DLM_PROXY_AST_MSG = 505,
+	DLM_UNLOCK_LOCK_MSG = 506,
+	DLM_DEREF_LOCKRES_MSG = 507,
+	DLM_MIGRATE_REQUEST_MSG = 508,
+	DLM_MIG_LOCKRES_MSG = 509,
+	DLM_QUERY_JOIN_MSG = 510,
+	DLM_ASSERT_JOINED_MSG = 511,
+	DLM_CANCEL_JOIN_MSG = 512,
+	DLM_EXIT_DOMAIN_MSG = 513,
+	DLM_MASTER_REQUERY_MSG = 514,
+	DLM_LOCK_REQUEST_MSG = 515,
+	DLM_RECO_DATA_DONE_MSG = 516,
+	DLM_BEGIN_RECO_MSG = 517,
+	DLM_FINALIZE_RECO_MSG = 518,
+	DLM_QUERY_REGION = 519,
+	DLM_QUERY_NODEINFO = 520,
+	DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
+};
+
+struct dlm_reco_node_data
+{
+	int state;
+	u8 node_num;
+	struct list_head list;
+};
+
+enum {
+	DLM_RECO_NODE_DATA_DEAD = -1,
+	DLM_RECO_NODE_DATA_INIT = 0,
+	DLM_RECO_NODE_DATA_REQUESTING = 1,
+	DLM_RECO_NODE_DATA_REQUESTED = 2,
+	DLM_RECO_NODE_DATA_RECEIVING = 3,
+	DLM_RECO_NODE_DATA_DONE = 4,
+	DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
+};
+
+
+enum {
+	DLM_MASTER_RESP_NO = 0,
+	DLM_MASTER_RESP_YES = 1,
+	DLM_MASTER_RESP_MAYBE = 2,
+	DLM_MASTER_RESP_ERROR = 3,
+};
+
+
+struct dlm_master_request
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_ASSERT_RESPONSE_REASSERT 0x00000001
+#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002
+
+#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001
+#define DLM_ASSERT_MASTER_REQUERY 0x00000002
+#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
+struct dlm_assert_master
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001
+
+struct dlm_migrate_request
+{
+	u8 master;
+	u8 new_master;
+	u8 namelen;
+	u8 pad1;
+	__be32 pad2;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_master_requery
+{
+	u8 pad1;
+	u8 pad2;
+	u8 node_idx;
+	u8 namelen;
+	__be32 pad3;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_MRES_RECOVERY 0x01
+#define DLM_MRES_MIGRATION 0x02
+#define DLM_MRES_ALL_DONE 0x04
+
+/*
+ * We would like to get one whole lockres into a single network
+ * message whenever possible.  Generally speaking, there will be
+ * at most one dlm_lock on a lockres for each node in the cluster,
+ * plus (infrequently) any additional locks coming in from userdlm.
+ * + * struct _dlm_lockres_page + * { + * dlm_migratable_lockres mres; + * dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS]; + * u8 pad[DLM_MIG_LOCKRES_RESERVED]; + * }; + * + * from ../cluster/tcp.h + * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * (roughly 4080 bytes) + * and sizeof(dlm_migratable_lockres) = 112 bytes + * and sizeof(dlm_migratable_lock) = 16 bytes + * + * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and + * DLM_MIG_LOCKRES_RESERVED=128 means we have this: + * + * (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) + + * sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED = + * NET_MAX_PAYLOAD_BYTES + * (240 * 16) + 112 + 128 = 4080 + * + * So a lockres would need more than 240 locks before it would + * use more than one network packet to recover. Not too bad. + */ +#define DLM_MAX_MIGRATABLE_LOCKS 240 + +struct dlm_migratable_lockres +{ + u8 master; + u8 lockname_len; + u8 num_locks; // locks sent in this structure + u8 flags; + __be32 total_locks; // locks to be sent for this migration cookie + __be64 mig_cookie; // cookie for this lockres migration + // or zero if not needed + // 16 bytes + u8 lockname[DLM_LOCKID_NAME_MAX]; + // 48 bytes + u8 lvb[DLM_LVB_LEN]; + // 112 bytes + struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 +}; +#define DLM_MIG_LOCKRES_MAX_LEN \ + (sizeof(struct dlm_migratable_lockres) + \ + (sizeof(struct dlm_migratable_lock) * \ + DLM_MAX_MIGRATABLE_LOCKS) ) + +/* from above, 128 bytes + * for some undetermined future use */ +#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ + DLM_MIG_LOCKRES_MAX_LEN) + +struct dlm_create_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_convert_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN) + +struct dlm_unlock_lock +{ + __be64 cookie; + + __be32 flags; + __be16 pad1; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN) + +struct dlm_proxy_ast +{ + __be64 cookie; + + __be32 flags; + u8 node_idx; + u8 type; + u8 blocked_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN) + +#define DLM_MOD_KEY (0x666c6172) +enum dlm_query_join_response_code { + JOIN_DISALLOW = 0, + JOIN_OK = 1, + JOIN_OK_NO_MAP = 2, + JOIN_PROTOCOL_MISMATCH = 3, +}; + +struct dlm_query_join_packet { + u8 code; /* Response code. dlm_minor and fs_minor + are only valid if this is JOIN_OK */ + u8 dlm_minor; /* The minor version of the protocol the + dlm is speaking. */ + u8 fs_minor; /* The minor version of the protocol the + filesystem is speaking. 
*/ + u8 reserved; +}; + +union dlm_query_join_response { + u32 intval; + struct dlm_query_join_packet packet; +}; + +struct dlm_lock_request +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + +struct dlm_reco_data_done +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; + + /* unused for now */ + /* eventually we can use this to attempt + * lvb recovery based on each node's info */ + u8 reco_lvb[DLM_LVB_LEN]; +}; + +struct dlm_begin_reco +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + + +#define BITS_PER_BYTE 8 +#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) + +struct dlm_query_join_request +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + struct dlm_protocol_version dlm_proto; + struct dlm_protocol_version fs_proto; + u8 domain[O2NM_MAX_NAME_LEN]; + u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)]; +}; + +struct dlm_assert_joined +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_cancel_join +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_query_region { + u8 qr_node; + u8 qr_numregions; + u8 qr_namelen; + u8 pad1; + u8 qr_domain[O2NM_MAX_NAME_LEN]; + u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS]; +}; + +struct dlm_node_info { + u8 ni_nodenum; + u8 pad1; + u16 ni_ipv4_port; + u32 ni_ipv4_address; +}; + +struct dlm_query_nodeinfo { + u8 qn_nodenum; + u8 qn_numnodes; + u8 qn_namelen; + u8 pad1; + u8 qn_domain[O2NM_MAX_NAME_LEN]; + struct dlm_node_info qn_nodes[O2NM_MAX_NODES]; +}; + +struct dlm_exit_domain +{ + u8 node_idx; + u8 pad1[3]; +}; + +struct dlm_finalize_reco +{ + u8 node_idx; + u8 dead_node; + u8 flags; + u8 pad1; + __be32 pad2; +}; + +struct dlm_deref_lockres +{ + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +static inline enum dlm_status +__dlm_lockres_state_to_status(struct dlm_lock_resource *res) +{ + enum dlm_status status = DLM_NORMAL; + + assert_spin_locked(&res->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) + status = DLM_RECOVERING; + else if (res->state & DLM_LOCK_RES_MIGRATING) + status = DLM_MIGRATING; + else if (res->state & DLM_LOCK_RES_IN_PROGRESS) + status = DLM_FORWARD; + + return status; +} + +static inline u8 dlm_get_lock_cookie_node(u64 cookie) +{ + u8 ret; + cookie >>= 56; + ret = (u8)(cookie & 0xffULL); + return ret; +} + +static inline unsigned long long dlm_get_lock_cookie_seq(u64 cookie) +{ + unsigned long long ret; + ret = ((unsigned long long)cookie) & 0x00ffffffffffffffULL; + return ret; +} + +struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb); +void dlm_lock_get(struct dlm_lock *lock); +void dlm_lock_put(struct dlm_lock *lock); + +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res); + +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_commit_pending_unlock(struct 
dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_launch_thread(struct dlm_ctxt *dlm); +void dlm_complete_thread(struct dlm_ctxt *dlm); +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); +void dlm_wait_for_recovery(struct dlm_ctxt *dlm); +void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); + +void dlm_put(struct dlm_ctxt *dlm); +struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); +int dlm_domain_fully_joined(struct dlm_ctxt *dlm); + +void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static inline void dlm_lockres_get(struct dlm_lock_resource *res) +{ + /* This is called on every lookup, so it might be worth + * inlining. */ + kref_get(&res->refs); +} +void dlm_lockres_put(struct dlm_lock_resource *res); +void __dlm_unhash_lockres(struct dlm_lock_resource *res); +void __dlm_insert_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash); +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash); +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len); + +int dlm_is_host_down(int errno); + +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int namelen, + int flags); +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen); + +#define dlm_lockres_set_refmap_bit(bit,res) \ + __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) +#define dlm_lockres_clear_refmap_bit(bit,res) \ + __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) + +static inline void __dlm_lockres_set_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: setting bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + set_bit(bit, res->refmap); +} + +static inline void __dlm_lockres_clear_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: clearing bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + clear_bit(bit, res->refmap); +} + +void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line); +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line); +#define dlm_lockres_drop_inflight_ref(d,r) \ + __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref_new(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_do_local_ast(struct 
dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +int dlm_do_remote_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_do_local_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int blocked_type); +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int msg_type, + int blocked_type, int flags); +static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int blocked_type) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST, + blocked_type, 0); +} + +static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int flags) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST, + 0, flags); +} + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res); +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); + +u8 dlm_nm_this_node(struct dlm_ctxt *dlm); +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); + + +int dlm_nm_init(struct dlm_ctxt *dlm); +int dlm_heartbeat_init(struct dlm_ctxt *dlm); +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); +void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); + +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +int dlm_finish_migration(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 old_master); +void dlm_lockres_release_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); + +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +void dlm_assert_master_post_handler(int status, void *data, void *ret_data); +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 nodenum, u8 *real_master); + + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, + u8 request_from, + u32 flags); + + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, + u8 flags); +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + +/* will exit holding res->spinlock, but may drop in function */ +void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); +void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); + 
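/*
 * Illustrative caller sketch, condensed from dlmconvert_master() later in
 * this patch (call_ast, lock and dlm stand in for the caller's locals; error
 * handling omitted).  The wait helpers above are called with res->spinlock
 * held and exit still holding it, though they may drop it internally:
 *
 *	spin_lock(&res->spinlock);
 *	__dlm_wait_on_lockres(res);
 *	__dlm_lockres_reserve_ast(res);
 *	res->state |= DLM_LOCK_RES_IN_PROGRESS;
 *
 *	... operate on the lockres ...
 *
 *	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
 *	spin_unlock(&res->spinlock);
 *	wake_up(&res->wq);
 *
 *	the reservation must always be consumed: either queue the ast
 *	or release it
 *	if (call_ast)
 *		dlm_queue_ast(dlm, lock);
 *	else
 *		dlm_lockres_release_ast(dlm, res);
 */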
+/* will exit holding res->spinlock, but may drop in function */ +static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) +{ + __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| + DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)); +} + +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle); + +/* create/destroy slab caches */ +int dlm_init_master_caches(void); +void dlm_destroy_master_caches(void); + +int dlm_init_lock_cache(void); +void dlm_destroy_lock_cache(void); + +int dlm_init_mle_cache(void); +void dlm_destroy_mle_cache(void); + +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_clean_master_list(struct dlm_ctxt *dlm, + u8 dead_node); +void dlm_force_free_mles(struct dlm_ctxt *dlm); +int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); +int __dlm_lockres_has_locks(struct dlm_lock_resource *res); +int __dlm_lockres_unused(struct dlm_lock_resource *res); + +static inline const char * dlm_lock_mode_name(int mode) +{ + switch (mode) { + case LKM_EXMODE: + return "EX"; + case LKM_PRMODE: + return "PR"; + case LKM_NLMODE: + return "NL"; + } + return "UNKNOWN"; +} + + +static inline int dlm_lock_compatible(int existing, int request) +{ + /* NO_LOCK compatible with all */ + if (request == LKM_NLMODE || + existing == LKM_NLMODE) + return 1; + + /* EX incompatible with all non-NO_LOCK */ + if (request == LKM_EXMODE) + return 0; + + /* request must be PR, which is compatible with PR */ + if (existing == LKM_PRMODE) + return 1; + + return 0; +} + +static inline int dlm_lock_on_list(struct list_head *head, + struct dlm_lock *lock) +{ + struct list_head *iter; + struct dlm_lock *tmplock; + + list_for_each(iter, head) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (tmplock == lock) + return 1; + } + return 0; +} + + +static inline enum dlm_status dlm_err_to_dlm_status(int err) +{ + enum dlm_status ret; + if (err == -ENOMEM) + ret = DLM_SYSERR; + else if (err == -ETIMEDOUT || o2net_link_down(err, NULL)) + ret = DLM_NOLOCKMGR; + else if (err == -EINVAL) + ret = DLM_BADPARAM; + else if (err == -ENAMETOOLONG) + ret = DLM_IVBUFLEN; + else + ret = DLM_BADARGS; + return ret; +} + + +static inline void dlm_node_iter_init(unsigned long *map, + struct dlm_node_iter *iter) +{ + memcpy(iter->node_map, map, sizeof(iter->node_map)); + iter->curnode = -1; +} + +static inline int dlm_node_iter_next(struct dlm_node_iter *iter) +{ + int bit; + bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + iter->curnode = bit; + return bit; +} + +static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + res->owner = owner; +} + +static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + if (owner != res->owner) + dlm_set_lockres_owner(dlm, res, owner); +} + +#endif /* DLMCOMMON_H */ diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c new file mode 100644 index 00000000..29a886d1 --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -0,0 +1,548 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * 
dlmconvert.c + * + * underlying calls for lock conversion + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +/* NOTE: __dlmconvert_master is the only function in here that + * needs a spinlock held on entry (res->spinlock) and it is the + * only one that holds a lock on exit (res->spinlock). + * All other functions in here need no locks and drop all of + * the locks that they acquire. */ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread); +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +/* + * this is only called directly by dlmlock(), and only when the + * local node is the owner of the lockres + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: see __dlmconvert_master + */ +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status; + + spin_lock(&res->spinlock); + /* we are not in a network handler, this is fine */ + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + status = __dlmconvert_master(dlm, res, lock, flags, type, + &call_ast, &kick_thread); + + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + if (status != DLM_NORMAL && status != DLM_NOTQUEUED) + dlm_error(status); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +/* performs lock conversion at the lockres master site + * locking: + * caller needs: res->spinlock + * taken: takes and drops lock->spinlock + * held on exit: res->spinlock + * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED + * call_ast: whether ast should be called for this lock + * kick_thread: whether dlm_kick_thread should be called + */ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread) +{ + enum dlm_status status = DLM_NORMAL; + struct list_head *iter; + 
struct dlm_lock *tmplock=NULL; + + assert_spin_locked(&res->spinlock); + + mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n", + lock->ml.type, lock->ml.convert_type, type); + + spin_lock(&lock->spinlock); + + /* already converting? */ + if (lock->ml.convert_type != LKM_IVMODE) { + mlog(ML_ERROR, "attempted to convert a lock with a lock " + "conversion pending\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + /* must be on grant queue to convert */ + if (!dlm_lock_on_list(&res->granted, lock)) { + mlog(ML_ERROR, "attempted to convert a lock not on grant " + "queue\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + if (flags & LKM_VALBLK) { + switch (lock->ml.type) { + case LKM_EXMODE: + /* EX + LKM_VALBLK + convert == set lvb */ + mlog(0, "will set lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + break; + case LKM_PRMODE: + case LKM_NLMODE: + /* refetch if new level is not NL */ + if (type > LKM_NLMODE) { + mlog(0, "will fetch new value into " + "lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } else { + mlog(0, "will NOT fetch new value " + "into lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + flags &= ~(LKM_VALBLK); + } + break; + } + } + + + /* in-place downconvert? */ + if (type <= lock->ml.type) + goto grant; + + /* upconvert from here on */ + status = DLM_NORMAL; + list_for_each(iter, &res->granted) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (tmplock == lock) + continue; + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + } + + list_for_each(iter, &res->converting) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + /* existing conversion requests take precedence */ + if (!dlm_lock_compatible(tmplock->ml.convert_type, type)) + goto switch_queues; + } + + /* fall thru to grant */ + +grant: + mlog(0, "res %.*s, granting %s lock\n", res->lockname.len, + res->lockname.name, dlm_lock_mode_name(type)); + /* immediately grant the new lock type */ + lock->lksb->status = DLM_NORMAL; + if (lock->ml.node == dlm->node_num) + mlog(0, "doing in-place convert for nonlocal lock\n"); + lock->ml.type = type; + if (lock->lksb->flags & DLM_LKSB_PUT_LVB) + memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + + status = DLM_NORMAL; + *call_ast = 1; + goto unlock_exit; + +switch_queues: + if (flags & LKM_NOQUEUE) { + mlog(0, "failed to convert NOQUEUE lock %.*s from " + "%d to %d...\n", res->lockname.len, res->lockname.name, + lock->ml.type, type); + status = DLM_NOTQUEUED; + goto unlock_exit; + } + mlog(0, "res %.*s, queueing...\n", res->lockname.len, + res->lockname.name); + + lock->ml.convert_type = type; + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->converting); + +unlock_exit: + spin_unlock(&lock->spinlock); + if (status == DLM_DENIED) { + __dlm_print_one_lock_resource(res); + } + if (status == DLM_NORMAL) + *kick_thread = 1; + return status; +} + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* do not alter lock refcount. switching lists. 
*/ + list_move_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; + lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); +} + +/* messages the master site to do lock conversion + * locking: + * caller needs: none + * taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS + * held on exit: none + * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node + */ +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + enum dlm_status status; + + mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, + lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "bailing out early since res is RECOVERING " + "on secondary queue\n"); + /* __dlm_print_one_lock_resource(res); */ + status = DLM_RECOVERING; + goto bail; + } + /* will exit this call with spinlock held */ + __dlm_wait_on_lockres(res); + + if (lock->ml.convert_type != LKM_IVMODE) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "converting a remote lock that is already " + "converting! (cookie=%u:%llu, conv=%d)\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.convert_type); + status = DLM_DENIED; + goto bail; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; + /* move lock to local convert queue */ + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, &res->converting); + lock->convert_pending = 1; + lock->ml.convert_type = type; + + if (flags & LKM_VALBLK) { + if (lock->ml.type == LKM_EXMODE) { + flags |= LKM_PUT_LVB; + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + } else { + if (lock->ml.convert_type == LKM_NLMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + } + spin_unlock(&res->spinlock); + + /* no locks held here. + * need to wait for a reply as to whether it got queued or not. */ + status = dlm_send_remote_convert_request(dlm, res, lock, flags, type); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->convert_pending = 0; + /* if it failed, move it back to granted queue */ + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + dlm_revert_pending_convert(res, lock); + } +bail: + spin_unlock(&res->spinlock); + + /* TODO: should this be a wake_one? 
*/ + /* wake up any IN_PROGRESS waiters */ + wake_up(&res->wq); + + return status; +} + +/* sends DLM_CONVERT_LOCK_MSG to master site + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, status from remote node + */ +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + struct dlm_convert_lock convert; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog(0, "%.*s\n", res->lockname.len, res->lockname.name); + + memset(&convert, 0, sizeof(struct dlm_convert_lock)); + convert.node_idx = dlm->node_num; + convert.requested_type = type; + convert.cookie = lock->ml.cookie; + convert.namelen = res->lockname.len; + convert.flags = cpu_to_be32(flags); + memcpy(convert.name, res->lockname.name, convert.namelen); + + vec[0].iov_len = sizeof(struct dlm_convert_lock); + vec[0].iov_base = &convert; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key, + vec, veclen, res->owner, &status); + if (tmpret >= 0) { + // successfully sent and received + ret = status; // this is already a dlm_status + if (ret == DLM_RECOVERING) { + mlog(0, "node %u returned DLM_RECOVERING from convert " + "message!\n", res->owner); + } else if (ret == DLM_MIGRATING) { + mlog(0, "node %u returned DLM_MIGRATING from convert " + "message!\n", res->owner); + } else if (ret == DLM_FORWARD) { + mlog(0, "node %u returned DLM_FORWARD from convert " + "message!\n", res->owner); + } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) + dlm_error(ret); + } else { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key, + res->owner); + if (dlm_is_host_down(tmpret)) { + /* instead of logging the same network error over + * and over, sleep here and wait for the heartbeat + * to notice the node is dead. times out after 5s. 
*/ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + ret = DLM_RECOVERING; + mlog(0, "node %u died so returning DLM_RECOVERING " + "from convert message!\n", res->owner); + } else { + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +/* handler for DLM_CONVERT_LOCK_MSG on master site + * locking: + * caller needs: none + * taken: takes and drop res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, + * status from __dlmconvert_master + */ +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct list_head *iter; + struct dlm_lock *lock = NULL; + struct dlm_lockstatus *lksb; + enum dlm_status status = DLM_NORMAL; + u32 flags; + int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + if (cnv->namelen > DLM_LOCKID_NAME_MAX) { + status = DLM_IVBUFLEN; + dlm_error(status); + goto leave; + } + + flags = be32_to_cpu(cnv->flags); + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "both PUT and GET lvb specified\n"); + status = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? "get lvb" : "none")); + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL) { + spin_unlock(&res->spinlock); + dlm_error(status); + goto leave; + } + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.cookie == cnv->cookie && + lock->ml.node == cnv->node_idx) { + dlm_lock_get(lock); + break; + } + lock = NULL; + } + spin_unlock(&res->spinlock); + if (!lock) { + status = DLM_IVLOCKID; + mlog(ML_ERROR, "did not find lock to convert on grant queue! 
" + "cookie=%u:%llu\n", + dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie))); + dlm_print_one_lock_resource(res); + goto leave; + } + + /* found the lock */ + lksb = lock->lksb; + + /* see if caller needed to get/put lvb */ + if (flags & LKM_PUT_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN); + } else if (flags & LKM_GET_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_GET_LVB; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status == DLM_NORMAL) { + __dlm_lockres_reserve_ast(res); + ast_reserved = 1; + res->state |= DLM_LOCK_RES_IN_PROGRESS; + status = __dlmconvert_master(dlm, res, lock, flags, + cnv->requested_type, + &call_ast, &kick_thread); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + wake = 1; + } + spin_unlock(&res->spinlock); + if (wake) + wake_up(&res->wq); + + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); + } + +leave: + if (lock) + dlm_lock_put(lock); + + /* either queue the ast or release it, if reserved */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else if (ast_reserved) + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h new file mode 100644 index 00000000..b2e3677d --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.h @@ -0,0 +1,35 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmconvert.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMCONVERT_H +#define DLMCONVERT_H + +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +#endif diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c new file mode 100644 index 00000000..56f82cb9 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -0,0 +1,1017 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.c + * + * debug functionality for the dlm + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static int stringify_lockname(const char *lockname, int locklen, char *buf, + int len); + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); +} + +static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) +{ + int bit; + assert_spin_locked(&res->spinlock); + + printk(" refmap nodes: [ "); + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + printk("%u ", bit); + bit++; + } + printk("], inflight=%u\n", res->inflight_locks); +} + +static void __dlm_print_lock(struct dlm_lock *lock) +{ + spin_lock(&lock->spinlock); + + printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, " + "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), " + "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n", + lock->ml.type, lock->ml.convert_type, lock->ml.node, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + atomic_read(&lock->lock_refs.refcount), + (list_empty(&lock->ast_list) ? 'y' : 'n'), + (lock->ast_pending ? 'y' : 'n'), + (list_empty(&lock->bast_list) ? 'y' : 'n'), + (lock->bast_pending ? 'y' : 'n'), + (lock->convert_pending ? 'y' : 'n'), + (lock->lock_pending ? 'y' : 'n'), + (lock->cancel_pending ? 'y' : 'n'), + (lock->unlock_pending ? 'y' : 'n')); + + spin_unlock(&lock->spinlock); +} + +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + struct list_head *iter2; + struct dlm_lock *lock; + char buf[DLM_LOCKID_NAME_MAX]; + + assert_spin_locked(&res->spinlock); + + stringify_lockname(res->lockname.name, res->lockname.len, + buf, sizeof(buf)); + printk("lockres: %s, owner=%u, state=%u\n", + buf, res->owner, res->state); + printk(" last used: %lu, refcnt: %u, on purge list: %s\n", + res->last_used, atomic_read(&res->refs.refcount), + list_empty(&res->purge) ? "no" : "yes"); + printk(" on dirty list: %s, on reco list: %s, " + "migrating pending: %s\n", + list_empty(&res->dirty) ? "no" : "yes", + list_empty(&res->recovering) ? "no" : "yes", + res->migration_pending ? 
"yes" : "no"); + printk(" inflight locks: %d, asts reserved: %d\n", + res->inflight_locks, atomic_read(&res->asts_reserved)); + dlm_print_lockres_refmap(res); + printk(" granted queue:\n"); + list_for_each(iter2, &res->granted) { + lock = list_entry(iter2, struct dlm_lock, list); + __dlm_print_lock(lock); + } + printk(" converting queue:\n"); + list_for_each(iter2, &res->converting) { + lock = list_entry(iter2, struct dlm_lock, list); + __dlm_print_lock(lock); + } + printk(" blocked queue:\n"); + list_for_each(iter2, &res->blocked) { + lock = list_entry(iter2, struct dlm_lock, list); + __dlm_print_lock(lock); + } +} + +void dlm_print_one_lock(struct dlm_lock *lockid) +{ + dlm_print_one_lock_resource(lockid->lockres); +} +EXPORT_SYMBOL_GPL(dlm_print_one_lock); + +static const char *dlm_errnames[] = { + [DLM_NORMAL] = "DLM_NORMAL", + [DLM_GRANTED] = "DLM_GRANTED", + [DLM_DENIED] = "DLM_DENIED", + [DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS", + [DLM_WORKING] = "DLM_WORKING", + [DLM_BLOCKED] = "DLM_BLOCKED", + [DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN", + [DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD", + [DLM_SYSERR] = "DLM_SYSERR", + [DLM_NOSUPPORT] = "DLM_NOSUPPORT", + [DLM_CANCELGRANT] = "DLM_CANCELGRANT", + [DLM_IVLOCKID] = "DLM_IVLOCKID", + [DLM_SYNC] = "DLM_SYNC", + [DLM_BADTYPE] = "DLM_BADTYPE", + [DLM_BADRESOURCE] = "DLM_BADRESOURCE", + [DLM_MAXHANDLES] = "DLM_MAXHANDLES", + [DLM_NOCLINFO] = "DLM_NOCLINFO", + [DLM_NOLOCKMGR] = "DLM_NOLOCKMGR", + [DLM_NOPURGED] = "DLM_NOPURGED", + [DLM_BADARGS] = "DLM_BADARGS", + [DLM_VOID] = "DLM_VOID", + [DLM_NOTQUEUED] = "DLM_NOTQUEUED", + [DLM_IVBUFLEN] = "DLM_IVBUFLEN", + [DLM_CVTUNGRANT] = "DLM_CVTUNGRANT", + [DLM_BADPARAM] = "DLM_BADPARAM", + [DLM_VALNOTVALID] = "DLM_VALNOTVALID", + [DLM_REJECTED] = "DLM_REJECTED", + [DLM_ABORT] = "DLM_ABORT", + [DLM_CANCEL] = "DLM_CANCEL", + [DLM_IVRESHANDLE] = "DLM_IVRESHANDLE", + [DLM_DEADLOCK] = "DLM_DEADLOCK", + [DLM_DENIED_NOASTS] = "DLM_DENIED_NOASTS", + [DLM_FORWARD] = "DLM_FORWARD", + [DLM_TIMEOUT] = "DLM_TIMEOUT", + [DLM_IVGROUPID] = "DLM_IVGROUPID", + [DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT", + [DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH", + [DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION", + [DLM_NO_CONTROL_DEVICE ] = "DLM_NO_CONTROL_DEVICE ", + [DLM_RECOVERING] = "DLM_RECOVERING", + [DLM_MIGRATING] = "DLM_MIGRATING", + [DLM_MAXSTATS] = "DLM_MAXSTATS", +}; + +static const char *dlm_errmsgs[] = { + [DLM_NORMAL] = "request in progress", + [DLM_GRANTED] = "request granted", + [DLM_DENIED] = "request denied", + [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", + [DLM_WORKING] = "async request in progress", + [DLM_BLOCKED] = "lock request blocked", + [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock", + [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", + [DLM_SYSERR] = "system error", + [DLM_NOSUPPORT] = "unsupported", + [DLM_CANCELGRANT] = "can't cancel convert: already granted", + [DLM_IVLOCKID] = "bad lockid", + [DLM_SYNC] = "synchronous request granted", + [DLM_BADTYPE] = "bad resource type", + [DLM_BADRESOURCE] = "bad resource handle", + [DLM_MAXHANDLES] = "no more resource handles", + [DLM_NOCLINFO] = "can't contact cluster manager", + [DLM_NOLOCKMGR] = "can't contact lock manager", + [DLM_NOPURGED] = "can't contact purge daemon", + [DLM_BADARGS] = "bad api args", + [DLM_VOID] = "no status", + [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", + [DLM_IVBUFLEN] = "invalid resource name length", + [DLM_CVTUNGRANT] = "attempted to 
convert ungranted lock", + [DLM_BADPARAM] = "invalid lock mode specified", + [DLM_VALNOTVALID] = "value block has been invalidated", + [DLM_REJECTED] = "request rejected, unrecognized client", + [DLM_ABORT] = "blocked lock request cancelled", + [DLM_CANCEL] = "conversion request cancelled", + [DLM_IVRESHANDLE] = "invalid resource handle", + [DLM_DEADLOCK] = "deadlock recovery refused this request", + [DLM_DENIED_NOASTS] = "failed to allocate AST", + [DLM_FORWARD] = "request must wait for primary's response", + [DLM_TIMEOUT] = "timeout value for lock has expired", + [DLM_IVGROUPID] = "invalid group specification", + [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", + [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", + [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", + [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ", + [DLM_RECOVERING] = "lock resource being recovered", + [DLM_MIGRATING] = "lock resource being migrated", + [DLM_MAXSTATS] = "invalid error number", +}; + +const char *dlm_errmsg(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errmsgs[DLM_MAXSTATS]; + return dlm_errmsgs[err]; +} +EXPORT_SYMBOL_GPL(dlm_errmsg); + +const char *dlm_errname(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errnames[DLM_MAXSTATS]; + return dlm_errnames[err]; +} +EXPORT_SYMBOL_GPL(dlm_errname); + +/* NOTE: This function converts a lockname into a string. It uses knowledge + * of the format of the lockname that should be outside the purview of the dlm. + * We are adding only to make dlm debugging slightly easier. + * + * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h. + */ +static int stringify_lockname(const char *lockname, int locklen, char *buf, + int len) +{ + int out = 0; + __be64 inode_blkno_be; + +#define OCFS2_DENTRY_LOCK_INO_START 18 + if (*lockname == 'N') { + memcpy((__be64 *)&inode_blkno_be, + (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START], + sizeof(__be64)); + out += snprintf(buf + out, len - out, "%.*s%08x", + OCFS2_DENTRY_LOCK_INO_START - 1, lockname, + (unsigned int)be64_to_cpu(inode_blkno_be)); + } else + out += snprintf(buf + out, len - out, "%.*s", + locklen, lockname); + return out; +} + +static int stringify_nodemap(unsigned long *nodemap, int maxnodes, + char *buf, int len) +{ + int out = 0; + int i = -1; + + while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes) + out += snprintf(buf + out, len - out, "%d ", i); + + return out; +} + +static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) +{ + int out = 0; + char *mle_type; + + if (mle->type == DLM_MLE_BLOCK) + mle_type = "BLK"; + else if (mle->type == DLM_MLE_MASTER) + mle_type = "MAS"; + else + mle_type = "MIG"; + + out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out); + out += snprintf(buf + out, len - out, + "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", + mle_type, mle->master, mle->new_master, + !list_empty(&mle->hb_events), + !!mle->inuse, + atomic_read(&mle->mle_refs.refcount)); + + out += snprintf(buf + out, len - out, "Maybe="); + out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Vote="); + out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Response="); + out += 
stringify_nodemap(mle->response_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Node="); + out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "\n"); + + return out; +} + +void dlm_print_one_mle(struct dlm_master_list_entry *mle) +{ + char *buf; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (buf) { + dump_mle(mle, buf, PAGE_SIZE - 1); + free_page((unsigned long)buf); + } +} + +#ifdef CONFIG_DEBUG_FS + +static struct dentry *dlm_debugfs_root = NULL; + +#define DLM_DEBUGFS_DIR "o2dlm" +#define DLM_DEBUGFS_DLM_STATE "dlm_state" +#define DLM_DEBUGFS_LOCKING_STATE "locking_state" +#define DLM_DEBUGFS_MLE_STATE "mle_state" +#define DLM_DEBUGFS_PURGE_LIST "purge_list" + +/* begin - utils funcs */ +static void dlm_debug_free(struct kref *kref) +{ + struct dlm_debug_ctxt *dc; + + dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); + + kfree(dc); +} + +static void dlm_debug_put(struct dlm_debug_ctxt *dc) +{ + if (dc) + kref_put(&dc->debug_refcnt, dlm_debug_free); +} + +static void dlm_debug_get(struct dlm_debug_ctxt *dc) +{ + kref_get(&dc->debug_refcnt); +} + +static int debug_release(struct inode *inode, struct file *file) +{ + free_page((unsigned long)file->private_data); + return 0; +} + +static ssize_t debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +/* end - util funcs */ + +/* begin - purge list funcs */ +static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) +{ + struct dlm_lock_resource *res; + int out = 0; + unsigned long total = 0; + + out += snprintf(buf + out, len - out, + "Dumping Purgelist for Domain: %s\n", dlm->name); + + spin_lock(&dlm->spinlock); + list_for_each_entry(res, &dlm->purge_list, purge) { + ++total; + if (len - out < 100) + continue; + spin_lock(&res->spinlock); + out += stringify_lockname(res->lockname.name, + res->lockname.len, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\t%ld\n", + (jiffies - res->last_used)/HZ); + spin_unlock(&res->spinlock); + } + spin_unlock(&dlm->spinlock); + + out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); + + return out; +} + +static int debug_purgelist_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + char *buf = NULL; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) + goto bail; + + i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static const struct file_operations debug_purgelist_fops = { + .open = debug_purgelist_open, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, +}; +/* end - purge list funcs */ + +/* begin - debug mle funcs */ +static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) +{ + struct dlm_master_list_entry *mle; + struct hlist_head *bucket; + struct hlist_node *list; + int i, out = 0; + unsigned long total = 0, longest = 0, bucket_count = 0; + + out += snprintf(buf + out, len - out, + "Dumping MLEs for Domain: %s\n", dlm->name); + + spin_lock(&dlm->master_lock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each(list, bucket) { + mle = hlist_entry(list, struct 
dlm_master_list_entry, + master_hash_node); + ++total; + ++bucket_count; + if (len - out < 200) + continue; + out += dump_mle(mle, buf + out, len - out); + } + longest = max(longest, bucket_count); + bucket_count = 0; + } + spin_unlock(&dlm->master_lock); + + out += snprintf(buf + out, len - out, + "Total: %ld, Longest: %ld\n", total, longest); + return out; +} + +static int debug_mle_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + char *buf = NULL; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) + goto bail; + + i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static const struct file_operations debug_mle_fops = { + .open = debug_mle_open, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, +}; + +/* end - debug mle funcs */ + +/* begin - debug lockres funcs */ +static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) +{ + int out; + +#define DEBUG_LOCK_VERSION 1 + spin_lock(&lock->spinlock); + out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," + "%d,%d,%d,%d\n", + DEBUG_LOCK_VERSION, + list_type, lock->ml.type, lock->ml.convert_type, + lock->ml.node, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + !list_empty(&lock->ast_list), + !list_empty(&lock->bast_list), + lock->ast_pending, lock->bast_pending, + lock->convert_pending, lock->lock_pending, + lock->cancel_pending, lock->unlock_pending, + atomic_read(&lock->lock_refs.refcount)); + spin_unlock(&lock->spinlock); + + return out; +} + +static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) +{ + struct dlm_lock *lock; + int i; + int out = 0; + + out += snprintf(buf + out, len - out, "NAME:"); + out += stringify_lockname(res->lockname.name, res->lockname.len, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + +#define DEBUG_LRES_VERSION 1 + out += snprintf(buf + out, len - out, + "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n", + DEBUG_LRES_VERSION, + res->owner, res->state, res->last_used, + !list_empty(&res->purge), + !list_empty(&res->dirty), + !list_empty(&res->recovering), + res->inflight_locks, res->migration_pending, + atomic_read(&res->asts_reserved), + atomic_read(&res->refs.refcount)); + + /* refmap */ + out += snprintf(buf + out, len - out, "RMAP:"); + out += stringify_nodemap(res->refmap, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* lvb */ + out += snprintf(buf + out, len - out, "LVBX:"); + for (i = 0; i < DLM_LVB_LEN; i++) + out += snprintf(buf + out, len - out, + "%02x", (unsigned char)res->lvb[i]); + out += snprintf(buf + out, len - out, "\n"); + + /* granted */ + list_for_each_entry(lock, &res->granted, list) + out += dump_lock(lock, 0, buf + out, len - out); + + /* converting */ + list_for_each_entry(lock, &res->converting, list) + out += dump_lock(lock, 1, buf + out, len - out); + + /* blocked */ + list_for_each_entry(lock, &res->blocked, list) + out += dump_lock(lock, 2, buf + out, len - out); + + out += snprintf(buf + out, len - out, "\n"); + + return out; +} + +static void *lockres_seq_start(struct seq_file *m, loff_t *pos) +{ + struct debug_lockres *dl = m->private; + struct dlm_ctxt *dlm = dl->dl_ctxt; + struct dlm_lock_resource *oldres = dl->dl_res; + struct dlm_lock_resource *res = NULL; + struct list_head *track_list; + + 
spin_lock(&dlm->track_lock); + if (oldres) + track_list = &oldres->tracking; + else { + track_list = &dlm->tracking_list; + if (list_empty(track_list)) { + dl = NULL; + spin_unlock(&dlm->track_lock); + goto bail; + } + } + + list_for_each_entry(res, track_list, tracking) { + if (&res->tracking == &dlm->tracking_list) + res = NULL; + else + dlm_lockres_get(res); + break; + } + spin_unlock(&dlm->track_lock); + + if (oldres) + dlm_lockres_put(oldres); + + dl->dl_res = res; + + if (res) { + spin_lock(&res->spinlock); + dump_lockres(res, dl->dl_buf, dl->dl_len - 1); + spin_unlock(&res->spinlock); + } else + dl = NULL; + +bail: + /* passed to seq_show */ + return dl; +} + +static void lockres_seq_stop(struct seq_file *m, void *v) +{ +} + +static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return NULL; +} + +static int lockres_seq_show(struct seq_file *s, void *v) +{ + struct debug_lockres *dl = (struct debug_lockres *)v; + + seq_printf(s, "%s", dl->dl_buf); + + return 0; +} + +static const struct seq_operations debug_lockres_ops = { + .start = lockres_seq_start, + .stop = lockres_seq_stop, + .next = lockres_seq_next, + .show = lockres_seq_show, +}; + +static int debug_lockres_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + int ret = -ENOMEM; + struct seq_file *seq; + struct debug_lockres *dl = NULL; + + dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); + if (!dl) { + mlog_errno(ret); + goto bail; + } + + dl->dl_len = PAGE_SIZE; + dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); + if (!dl->dl_buf) { + mlog_errno(ret); + goto bail; + } + + ret = seq_open(file, &debug_lockres_ops); + if (ret) { + mlog_errno(ret); + goto bail; + } + + seq = file->private_data; + seq->private = dl; + + dlm_grab(dlm); + dl->dl_ctxt = dlm; + + return 0; +bail: + if (dl) + kfree(dl->dl_buf); + kfree(dl); + return ret; +} + +static int debug_lockres_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct debug_lockres *dl = (struct debug_lockres *)seq->private; + + if (dl->dl_res) + dlm_lockres_put(dl->dl_res); + dlm_put(dl->dl_ctxt); + kfree(dl->dl_buf); + return seq_release_private(inode, file); +} + +static const struct file_operations debug_lockres_fops = { + .open = debug_lockres_open, + .release = debug_lockres_release, + .read = seq_read, + .llseek = seq_lseek, +}; +/* end - debug lockres funcs */ + +/* begin - debug state funcs */ +static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) +{ + int out = 0; + struct dlm_reco_node_data *node; + char *state; + int cur_mles = 0, tot_mles = 0; + int i; + + spin_lock(&dlm->spinlock); + + switch (dlm->dlm_state) { + case DLM_CTXT_NEW: + state = "NEW"; break; + case DLM_CTXT_JOINED: + state = "JOINED"; break; + case DLM_CTXT_IN_SHUTDOWN: + state = "SHUTDOWN"; break; + case DLM_CTXT_LEAVING: + state = "LEAVING"; break; + default: + state = "UNKNOWN"; break; + } + + /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ + out += snprintf(buf + out, len - out, + "Domain: %s Key: 0x%08x Protocol: %d.%d\n", + dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + + /* Thread Pid: xxx Node: xxx State: xxxxx */ + out += snprintf(buf + out, len - out, + "Thread Pid: %d Node: %d State: %s\n", + task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state); + + /* Number of Joins: xxx Joining Node: xxx */ + out += snprintf(buf + out, len - out, + "Number of Joins: %d Joining Node: %d\n", + dlm->num_joins, dlm->joining_node); + + 
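+ /*
+  * A sketch of the text built so far, with invented example values:
+  *
+  *   Domain: myfs Key: 0xdfbac769 Protocol: 1.2
+  *   Thread Pid: 1234 Node: 0 State: JOINED
+  *   Number of Joins: 1 Joining Node: 255
+  *
+  * Every line below is appended the same way, out += snprintf(buf + out,
+  * len - out, ...), into the single zeroed page allocated by
+  * debug_state_open().
+  */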
/* Domain Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Domain Map: "); + out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Exit Domain Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Exit Domain Map: "); + out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Live Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Live Map: "); + out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Lock Resources: xxx (xxx) */ + out += snprintf(buf + out, len - out, + "Lock Resources: %d (%d)\n", + atomic_read(&dlm->res_cur_count), + atomic_read(&dlm->res_tot_count)); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + tot_mles += atomic_read(&dlm->mle_tot_count[i]); + + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) + cur_mles += atomic_read(&dlm->mle_cur_count[i]); + + /* MLEs: xxx (xxx) */ + out += snprintf(buf + out, len - out, + "MLEs: %d (%d)\n", cur_mles, tot_mles); + + /* Blocking: xxx (xxx) */ + out += snprintf(buf + out, len - out, + " Blocking: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); + + /* Mastery: xxx (xxx) */ + out += snprintf(buf + out, len - out, + " Mastery: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); + + /* Migration: xxx (xxx) */ + out += snprintf(buf + out, len - out, + " Migration: %d (%d)\n", + atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), + atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); + + /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ + out += snprintf(buf + out, len - out, + "Lists: Dirty=%s Purge=%s PendingASTs=%s " + "PendingBASTs=%s\n", + (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), + (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), + (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), + (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); + + /* Purge Count: xxx Refs: xxx */ + out += snprintf(buf + out, len - out, + "Purge Count: %d Refs: %d\n", dlm->purge_count, + atomic_read(&dlm->dlm_refs.refcount)); + + /* Dead Node: xxx */ + out += snprintf(buf + out, len - out, + "Dead Node: %d\n", dlm->reco.dead_node); + + /* What about DLM_RECO_STATE_FINALIZE? 
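+ * (with the == test below, a domain whose recovery state is in the
+ * FINALIZE stage is simply reported as INACTIVE)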
*/ + if (dlm->reco.state == DLM_RECO_STATE_ACTIVE) + state = "ACTIVE"; + else + state = "INACTIVE"; + + /* Recovery Pid: xxxx Master: xxx State: xxxx */ + out += snprintf(buf + out, len - out, + "Recovery Pid: %d Master: %d State: %s\n", + task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.new_master, state); + + /* Recovery Map: xx xx */ + out += snprintf(buf + out, len - out, "Recovery Map: "); + out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* Recovery Node State: */ + out += snprintf(buf + out, len - out, "Recovery Node State:\n"); + list_for_each_entry(node, &dlm->reco.node_data, list) { + switch (node->state) { + case DLM_RECO_NODE_DATA_INIT: + state = "INIT"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + state = "REQUESTING"; + break; + case DLM_RECO_NODE_DATA_DEAD: + state = "DEAD"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + state = "RECEIVING"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + state = "REQUESTED"; + break; + case DLM_RECO_NODE_DATA_DONE: + state = "DONE"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + state = "FINALIZE-SENT"; + break; + default: + state = "BAD"; + break; + } + out += snprintf(buf + out, len - out, "\t%u - %s\n", + node->node_num, state); + } + + spin_unlock(&dlm->spinlock); + + return out; +} + +static int debug_state_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + char *buf = NULL; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) + goto bail; + + i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static const struct file_operations debug_state_fops = { + .open = debug_state_open, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, +}; +/* end - debug state funcs */ + +/* files in subroot */ +int dlm_debug_init(struct dlm_ctxt *dlm) +{ + struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; + + /* for dumping dlm_ctxt */ + dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_state_fops); + if (!dc->debug_state_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping lockres */ + dc->debug_lockres_dentry = + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_lockres_fops); + if (!dc->debug_lockres_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping mles */ + dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_mle_fops); + if (!dc->debug_mle_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping lockres on the purge list */ + dc->debug_purgelist_dentry = + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_purgelist_fops); + if (!dc->debug_purgelist_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + dlm_debug_get(dc); + return 0; + +bail: + dlm_debug_shutdown(dlm); + return -ENOMEM; +} + +void dlm_debug_shutdown(struct dlm_ctxt *dlm) +{ + struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; + + if (dc) { + debugfs_remove(dc->debug_purgelist_dentry); + debugfs_remove(dc->debug_mle_dentry); + debugfs_remove(dc->debug_lockres_dentry); + debugfs_remove(dc->debug_state_dentry); + dlm_debug_put(dc); + } +} + +/* subroot - domain dir */ +int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +{ 
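+ /*
+  * Sketch of the hierarchy this builds, assuming debugfs is mounted at
+  * the usual /sys/kernel/debug:
+  *
+  *   o2dlm/<domain>/dlm_state      - debug_state_fops
+  *   o2dlm/<domain>/locking_state  - debug_lockres_fops
+  *   o2dlm/<domain>/mle_state      - debug_mle_fops
+  *   o2dlm/<domain>/purge_list     - debug_purgelist_fops
+  *
+  * Only the per-domain directory and the debug context are created
+  * here; the four files are added later by dlm_debug_init().
+  */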
+ dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, + dlm_debugfs_root); + if (!dlm->dlm_debugfs_subroot) { + mlog_errno(-ENOMEM); + goto bail; + } + + dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), + GFP_KERNEL); + if (!dlm->dlm_debug_ctxt) { + mlog_errno(-ENOMEM); + goto bail; + } + kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); + + return 0; +bail: + dlm_destroy_debugfs_subroot(dlm); + return -ENOMEM; +} + +void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +{ + debugfs_remove(dlm->dlm_debugfs_subroot); +} + +/* debugfs root */ +int dlm_create_debugfs_root(void) +{ + dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL); + if (!dlm_debugfs_root) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + return 0; +} + +void dlm_destroy_debugfs_root(void) +{ + debugfs_remove(dlm_debugfs_root); +} +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h new file mode 100644 index 00000000..1f27c481 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -0,0 +1,81 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.h + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMDEBUG_H +#define DLMDEBUG_H + +void dlm_print_one_mle(struct dlm_master_list_entry *mle); + +#ifdef CONFIG_DEBUG_FS + +struct dlm_debug_ctxt { + struct kref debug_refcnt; + struct dentry *debug_state_dentry; + struct dentry *debug_lockres_dentry; + struct dentry *debug_mle_dentry; + struct dentry *debug_purgelist_dentry; +}; + +struct debug_lockres { + int dl_len; + char *dl_buf; + struct dlm_ctxt *dl_ctxt; + struct dlm_lock_resource *dl_res; +}; + +int dlm_debug_init(struct dlm_ctxt *dlm); +void dlm_debug_shutdown(struct dlm_ctxt *dlm); + +int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); +void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); + +int dlm_create_debugfs_root(void); +void dlm_destroy_debugfs_root(void); + +#else + +static inline int dlm_debug_init(struct dlm_ctxt *dlm) +{ + return 0; +} +static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) +{ +} +static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +{ + return 0; +} +static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +{ +} +static inline int dlm_create_debugfs_root(void) +{ + return 0; +} +static inline void dlm_destroy_debugfs_root(void) +{ +} + +#endif /* CONFIG_DEBUG_FS */ +#endif /* DLMDEBUG_H */ diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c new file mode 100644 index 00000000..6ed6b95d --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -0,0 +1,2397 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.c + * + * defines domain join / leave apis + * + * Copyright (C) 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" +#include "dlmdebug.h" + +#include "dlmver.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) +#include "cluster/masklog.h" + +/* + * ocfs2 node maps are array of long int, which limits to send them freely + * across the wire due to endianness issues. To workaround this, we convert + * long ints to byte arrays. Following 3 routines are helper functions to + * set/test/copy bits within those array of bytes + */ +static inline void byte_set_bit(u8 nr, u8 map[]) +{ + map[nr >> 3] |= (1UL << (nr & 7)); +} + +static inline int byte_test_bit(u8 nr, u8 map[]) +{ + return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0; +} + +static inline void byte_copymap(u8 dmap[], unsigned long smap[], + unsigned int sz) +{ + unsigned int nn; + + if (!sz) + return; + + memset(dmap, 0, ((sz + 7) >> 3)); + for (nn = 0 ; nn < sz; nn++) + if (test_bit(nn, smap)) + byte_set_bit(nn, dmap); +} + +static void dlm_free_pagevec(void **vec, int pages) +{ + while (pages--) + free_page((unsigned long)vec[pages]); + kfree(vec); +} + +static void **dlm_alloc_pagevec(int pages) +{ + void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + int i; + + if (!vec) + return NULL; + + for (i = 0; i < pages; i++) + if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL))) + goto out_free; + + mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n", + pages, (unsigned long)DLM_HASH_PAGES, + (unsigned long)DLM_BUCKETS_PER_PAGE); + return vec; +out_free: + dlm_free_pagevec(vec, i); + return NULL; +} + +/* + * + * spinlock lock ordering: if multiple locks are needed, obey this ordering: + * dlm_domain_lock + * struct dlm_ctxt->spinlock + * struct dlm_lock_resource->spinlock + * struct dlm_ctxt->master_lock + * struct dlm_ctxt->ast_lock + * dlm_master_list_entry->spinlock + * dlm_lock->spinlock + * + */ + +DEFINE_SPINLOCK(dlm_domain_lock); +LIST_HEAD(dlm_domains); +static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); + +/* + * The supported protocol version for DLM communication. Running domains + * will have a negotiated version with the same major number and a minor + * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should + * be used to determine what a running domain is actually using. 
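+ * For example, a node built for the 1.2 protocol defined below that
+ * joins a domain already running 1.1 adopts the smaller minor and
+ * speaks 1.1 for the life of that domain.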
+ * + * New in version 1.1: + * - Message DLM_QUERY_REGION added to support global heartbeat + * - Message DLM_QUERY_NODEINFO added to allow online node removes + * New in version 1.2: + * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain + */ +static const struct dlm_protocol_version dlm_protocol = { + .pv_major = 1, + .pv_minor = 2, +}; + +#define DLM_DOMAIN_BACKOFF_MS 200 + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data); +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_protocol_compare(struct dlm_protocol_version *existing, + struct dlm_protocol_version *request); + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); + +void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) +{ + if (!hlist_unhashed(&lockres->hash_node)) { + hlist_del_init(&lockres->hash_node); + dlm_lockres_put(lockres); + } +} + +void __dlm_insert_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct hlist_head *bucket; + struct qstr *q; + + assert_spin_locked(&dlm->spinlock); + + q = &res->lockname; + bucket = dlm_lockres_hash(dlm, q->hash); + + /* get a reference for our hashtable */ + dlm_lockres_get(res); + + hlist_add_head(&res->hash_node, bucket); +} + +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct hlist_head *bucket; + struct hlist_node *list; + + mlog(0, "%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + bucket = dlm_lockres_hash(dlm, hash); + + hlist_for_each(list, bucket) { + struct dlm_lock_resource *res = hlist_entry(list, + struct dlm_lock_resource, hash_node); + if (res->lockname.name[0] != name[0]) + continue; + if (unlikely(res->lockname.len != len)) + continue; + if (memcmp(res->lockname.name + 1, name + 1, len - 1)) + continue; + dlm_lockres_get(res); + return res; + } + return NULL; +} + +/* intended to be called by functions which do not care about lock + * resources which are being purged (most net _handler functions). + * this will return NULL for any lock resource which is found but + * currently in the process of dropping its mastery reference. + * use __dlm_lookup_lockres_full when you need the lock resource + * regardless (e.g. 
dlm_get_lock_resource) */ +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct dlm_lock_resource *res = NULL; + + mlog(0, "%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + res = __dlm_lookup_lockres_full(dlm, name, len, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + return NULL; + } + spin_unlock(&res->spinlock); + } + + return res; +} + +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len) +{ + struct dlm_lock_resource *res; + unsigned int hash = dlm_lockid_hash(name, len); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, len, hash); + spin_unlock(&dlm->spinlock); + return res; +} + +static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) +{ + struct dlm_ctxt *tmp = NULL; + struct list_head *iter; + + assert_spin_locked(&dlm_domain_lock); + + /* tmp->name here is always NULL terminated, + * but domain may not be! */ + list_for_each(iter, &dlm_domains) { + tmp = list_entry (iter, struct dlm_ctxt, list); + if (strlen(tmp->name) == len && + memcmp(tmp->name, domain, len)==0) + break; + tmp = NULL; + } + + return tmp; +} + +/* For null terminated domain strings ONLY */ +static struct dlm_ctxt * __dlm_lookup_domain(const char *domain) +{ + assert_spin_locked(&dlm_domain_lock); + + return __dlm_lookup_domain_full(domain, strlen(domain)); +} + + +/* returns true on one of two conditions: + * 1) the domain does not exist + * 2) the domain exists and it's state is "joined" */ +static int dlm_wait_on_domain_helper(const char *domain) +{ + int ret = 0; + struct dlm_ctxt *tmp = NULL; + + spin_lock(&dlm_domain_lock); + + tmp = __dlm_lookup_domain(domain); + if (!tmp) + ret = 1; + else if (tmp->dlm_state == DLM_CTXT_JOINED) + ret = 1; + + spin_unlock(&dlm_domain_lock); + return ret; +} + +static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) +{ + dlm_destroy_debugfs_subroot(dlm); + + if (dlm->lockres_hash) + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + + if (dlm->master_hash) + dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); + + if (dlm->name) + kfree(dlm->name); + + kfree(dlm); +} + +/* A little strange - this function will be called while holding + * dlm_domain_lock and is expected to be holding it on the way out. We + * will however drop and reacquire it multiple times */ +static void dlm_ctxt_release(struct kref *kref) +{ + struct dlm_ctxt *dlm; + + dlm = container_of(kref, struct dlm_ctxt, dlm_refs); + + BUG_ON(dlm->num_joins); + BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED); + + /* we may still be in the list if we hit an error during join. */ + list_del_init(&dlm->list); + + spin_unlock(&dlm_domain_lock); + + mlog(0, "freeing memory from domain %s\n", dlm->name); + + wake_up(&dlm_domain_events); + + dlm_free_ctxt_mem(dlm); + + spin_lock(&dlm_domain_lock); +} + +void dlm_put(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm_domain_lock); + kref_put(&dlm->dlm_refs, dlm_ctxt_release); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_get(struct dlm_ctxt *dlm) +{ + kref_get(&dlm->dlm_refs); +} + +/* given a questionable reference to a dlm object, gets a reference if + * it can find it in the list, otherwise returns NULL in which case + * you shouldn't trust your pointer. 
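+ * Typical caller pattern, as in the message handlers further down:
+ *
+ *	if (!dlm_grab(dlm))
+ *		return 0;
+ *	...
+ *	dlm_put(dlm);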
*/ +struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) +{ + struct list_head *iter; + struct dlm_ctxt *target = NULL; + + spin_lock(&dlm_domain_lock); + + list_for_each(iter, &dlm_domains) { + target = list_entry (iter, struct dlm_ctxt, list); + + if (target == dlm) { + __dlm_get(target); + break; + } + + target = NULL; + } + + spin_unlock(&dlm_domain_lock); + + return target; +} + +int dlm_domain_fully_joined(struct dlm_ctxt *dlm) +{ + int ret; + + spin_lock(&dlm_domain_lock); + ret = (dlm->dlm_state == DLM_CTXT_JOINED) || + (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN); + spin_unlock(&dlm_domain_lock); + + return ret; +} + +static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_worker) { + flush_workqueue(dlm->dlm_worker); + destroy_workqueue(dlm->dlm_worker); + dlm->dlm_worker = NULL; + } +} + +static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) +{ + dlm_unregister_domain_handlers(dlm); + dlm_debug_shutdown(dlm); + dlm_complete_thread(dlm); + dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); + + /* We've left the domain. Now we can take ourselves out of the + * list and allow the kref stuff to help us free the + * memory. */ + spin_lock(&dlm_domain_lock); + list_del_init(&dlm->list); + spin_unlock(&dlm_domain_lock); + + /* Wake up anyone waiting for us to remove this domain */ + wake_up(&dlm_domain_events); +} + +static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) +{ + int i, num, n, ret = 0; + struct dlm_lock_resource *res; + struct hlist_node *iter; + struct hlist_head *bucket; + int dropped; + + mlog(0, "Migrating locks from domain %s\n", dlm->name); + + num = 0; + spin_lock(&dlm->spinlock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { +redo_bucket: + n = 0; + bucket = dlm_lockres_hash(dlm, i); + iter = bucket->first; + while (iter) { + n++; + res = hlist_entry(iter, struct dlm_lock_resource, + hash_node); + dlm_lockres_get(res); + /* migrate, if necessary. this will drop the dlm + * spinlock and retake it if it does migration. */ + dropped = dlm_empty_lockres(dlm, res); + + spin_lock(&res->spinlock); + if (dropped) + __dlm_lockres_calc_usage(dlm, res); + else + iter = res->hash_node.next; + spin_unlock(&res->spinlock); + + dlm_lockres_put(res); + + if (dropped) { + cond_resched_lock(&dlm->spinlock); + goto redo_bucket; + } + } + cond_resched_lock(&dlm->spinlock); + num += n; + } + spin_unlock(&dlm->spinlock); + wake_up(&dlm->dlm_thread_wq); + + /* let the dlm thread take care of purging, keep scanning until + * nothing remains in the hash */ + if (num) { + mlog(0, "%s: %d lock resources in hash last pass\n", + dlm->name, num); + ret = -EAGAIN; + } + mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); + return ret; +} + +static int dlm_no_joining_node(struct dlm_ctxt *dlm) +{ + int ret; + + spin_lock(&dlm->spinlock); + ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN; + spin_unlock(&dlm->spinlock); + + return ret; +} + +static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_ctxt *dlm = data; + unsigned int node; + struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; + + if (!dlm_grab(dlm)) + return 0; + + node = exit_msg->node_idx; + mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node); + + spin_lock(&dlm->spinlock); + set_bit(node, dlm->exit_domain_map); + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + + return 0; +} + +static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) +{ + /* Yikes, a double spinlock! 
I need domain_lock for the dlm + * state and the dlm spinlock for join state... Sorry! */ +again: + spin_lock(&dlm_domain_lock); + spin_lock(&dlm->spinlock); + + if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "Node %d is joining, we wait on it.\n", + dlm->joining_node); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm)); + goto again; + } + + dlm->dlm_state = DLM_CTXT_LEAVING; + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_print_nodes(struct dlm_ctxt *dlm) +{ + int node = -1; + + assert_spin_locked(&dlm->spinlock); + + printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); + + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + printk("%d ", node); + } + printk("\n"); +} + +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + unsigned int node; + struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; + + mlog(0, "%p %u %p", msg, len, data); + + if (!dlm_grab(dlm)) + return 0; + + node = exit_msg->node_idx; + + printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name); + + spin_lock(&dlm->spinlock); + clear_bit(node, dlm->domain_map); + clear_bit(node, dlm->exit_domain_map); + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, node, 0); + + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + + return 0; +} + +static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type, + unsigned int node) +{ + int status; + struct dlm_exit_domain leave_msg; + + mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name, + msg_type, node); + + memset(&leave_msg, 0, sizeof(leave_msg)); + leave_msg.node_idx = dlm->node_num; + + status = o2net_send_message(msg_type, dlm->key, &leave_msg, + sizeof(leave_msg), node, NULL); + if (status < 0) + mlog(ML_ERROR, "Error %d sending domain exit message %u " + "to node %u on domain %s\n", status, msg_type, node, + dlm->name); + + return status; +} + +static void dlm_begin_exit_domain(struct dlm_ctxt *dlm) +{ + int node = -1; + + /* Support for begin exit domain was added in 1.2 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor < 2) + return; + + /* + * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely + * informational. Meaning if a node does not receive the message, + * so be it. + */ + spin_lock(&dlm->spinlock); + while (1) { + node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1); + if (node >= O2NM_MAX_NODES) + break; + if (node == dlm->node_num) + continue; + + spin_unlock(&dlm->spinlock); + dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node); + spin_lock(&dlm->spinlock); + } + spin_unlock(&dlm->spinlock); +} + +static void dlm_leave_domain(struct dlm_ctxt *dlm) +{ + int node, clear_node, status; + + /* At this point we've migrated away all our locks and won't + * accept mastership of new ones. The dlm is responsible for + * almost nothing now. We make sure not to confuse any joining + * nodes and then commence shutdown procedure. */ + + spin_lock(&dlm->spinlock); + /* Clear ourselves from the domain map */ + clear_bit(dlm->node_num, dlm->domain_map); + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + 0)) < O2NM_MAX_NODES) { + /* Drop the dlm spinlock. This is safe wrt the domain_map. 
+ * -nodes cannot be added now as the + * query_join_handlers knows to respond with OK_NO_MAP + * -we catch the right network errors if a node is + * removed from the map while we're sending him the + * exit message. */ + spin_unlock(&dlm->spinlock); + + clear_node = 1; + + status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG, + node); + if (status < 0 && + status != -ENOPROTOOPT && + status != -ENOTCONN) { + mlog(ML_NOTICE, "Error %d sending domain exit message " + "to node %d\n", status, node); + + /* Not sure what to do here but lets sleep for + * a bit in case this was a transient + * error... */ + msleep(DLM_DOMAIN_BACKOFF_MS); + clear_node = 0; + } + + spin_lock(&dlm->spinlock); + /* If we're not clearing the node bit then we intend + * to loop back around to try again. */ + if (clear_node) + clear_bit(node, dlm->domain_map); + } + spin_unlock(&dlm->spinlock); +} + +int dlm_joined(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + + if (dlm->dlm_state == DLM_CTXT_JOINED) + ret = 1; + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +void dlm_unregister_domain(struct dlm_ctxt *dlm) +{ + int leave = 0; + struct dlm_lock_resource *res; + + spin_lock(&dlm_domain_lock); + BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); + BUG_ON(!dlm->num_joins); + + dlm->num_joins--; + if (!dlm->num_joins) { + /* We mark it "in shutdown" now so new register + * requests wait until we've completely left the + * domain. Don't use DLM_CTXT_LEAVING yet as we still + * want new domain joins to communicate with us at + * least until we've completed migration of our + * resources. */ + dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN; + leave = 1; + } + spin_unlock(&dlm_domain_lock); + + if (leave) { + mlog(0, "shutting down domain %s\n", dlm->name); + dlm_begin_exit_domain(dlm); + + /* We changed dlm state, notify the thread */ + dlm_kick_thread(dlm, NULL); + + while (dlm_migrate_all_locks(dlm)) { + /* Give dlm_thread time to purge the lockres' */ + msleep(500); + mlog(0, "%s: more migration to do\n", dlm->name); + } + + /* This list should be empty. 
If not, print remaining lockres */ + if (!list_empty(&dlm->tracking_list)) { + mlog(ML_ERROR, "Following lockres' are still on the " + "tracking list:\n"); + list_for_each_entry(res, &dlm->tracking_list, tracking) + dlm_print_one_lock_resource(res); + } + + dlm_mark_domain_leaving(dlm); + dlm_leave_domain(dlm); + dlm_force_free_mles(dlm); + dlm_complete_dlm_shutdown(dlm); + } + dlm_put(dlm); +} +EXPORT_SYMBOL_GPL(dlm_unregister_domain); + +static int dlm_query_join_proto_check(char *proto_type, int node, + struct dlm_protocol_version *ours, + struct dlm_protocol_version *request) +{ + int rc; + struct dlm_protocol_version proto = *request; + + if (!dlm_protocol_compare(ours, &proto)) { + mlog(0, + "node %u wanted to join with %s locking protocol " + "%u.%u, we respond with %u.%u\n", + node, proto_type, + request->pv_major, + request->pv_minor, + proto.pv_major, proto.pv_minor); + request->pv_minor = proto.pv_minor; + rc = 0; + } else { + mlog(ML_NOTICE, + "Node %u wanted to join with %s locking " + "protocol %u.%u, but we have %u.%u, disallowing\n", + node, proto_type, + request->pv_major, + request->pv_minor, + ours->pv_major, + ours->pv_minor); + rc = 1; + } + + return rc; +} + +/* + * struct dlm_query_join_packet is made up of four one-byte fields. They + * are effectively in big-endian order already. However, little-endian + * machines swap them before putting the packet on the wire (because + * query_join's response is a status, and that status is treated as a u32 + * on the wire). Thus, a big-endian and little-endian machines will treat + * this structure differently. + * + * The solution is to have little-endian machines swap the structure when + * converting from the structure to the u32 representation. This will + * result in the structure having the correct format on the wire no matter + * the host endian format. + */ +static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet, + u32 *wire) +{ + union dlm_query_join_response response; + + response.packet = *packet; + *wire = cpu_to_be32(response.intval); +} + +static void dlm_query_join_wire_to_packet(u32 wire, + struct dlm_query_join_packet *packet) +{ + union dlm_query_join_response response; + + response.intval = cpu_to_be32(wire); + *packet = response.packet; +} + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_query_join_request *query; + struct dlm_query_join_packet packet = { + .code = JOIN_DISALLOW, + }; + struct dlm_ctxt *dlm = NULL; + u32 response; + u8 nodenum; + + query = (struct dlm_query_join_request *) msg->buf; + + mlog(0, "node %u wants to join domain %s\n", query->node_idx, + query->domain); + + /* + * If heartbeat doesn't consider the node live, tell it + * to back off and try again. This gives heartbeat a chance + * to catch up. + */ + if (!o2hb_check_node_heartbeating(query->node_idx)) { + mlog(0, "node %u is not in our live map yet\n", + query->node_idx); + + packet.code = JOIN_DISALLOW; + goto respond; + } + + packet.code = JOIN_OK_NO_MAP; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(query->domain, query->name_len); + if (!dlm) + goto unlock_respond; + + /* + * There is a small window where the joining node may not see the + * node(s) that just left but still part of the cluster. DISALLOW + * join request if joining node has different node map. 
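+ * (for instance, if our domain_map still contains node 3 but the
+ * joiner's copied-over live map does not, we answer JOIN_DISALLOW and
+ * let the joiner back off and retry)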
+ */ + nodenum=0; + while (nodenum < O2NM_MAX_NODES) { + if (test_bit(nodenum, dlm->domain_map)) { + if (!byte_test_bit(nodenum, query->node_map)) { + mlog(0, "disallow join as node %u does not " + "have node %u in its nodemap\n", + query->node_idx, nodenum); + packet.code = JOIN_DISALLOW; + goto unlock_respond; + } + } + nodenum++; + } + + /* Once the dlm ctxt is marked as leaving then we don't want + * to be put in someone's domain map. + * Also, explicitly disallow joining at certain troublesome + * times (ie. during recovery). */ + if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + int bit = query->node_idx; + spin_lock(&dlm->spinlock); + + if (dlm->dlm_state == DLM_CTXT_NEW && + dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) { + /*If this is a brand new context and we + * haven't started our join process yet, then + * the other node won the race. */ + packet.code = JOIN_OK_NO_MAP; + } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* Disallow parallel joins. */ + packet.code = JOIN_DISALLOW; + } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "node %u trying to join, but recovery " + "is ongoing.\n", bit); + packet.code = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->recovery_map)) { + mlog(0, "node %u trying to join, but it " + "still needs recovery.\n", bit); + packet.code = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->domain_map)) { + mlog(0, "node %u trying to join, but it " + "is still in the domain! needs recovery?\n", + bit); + packet.code = JOIN_DISALLOW; + } else { + /* Alright we're fully a part of this domain + * so we keep some state as to who's joining + * and indicate to him that needs to be fixed + * up. */ + + /* Make sure we speak compatible locking protocols. */ + if (dlm_query_join_proto_check("DLM", bit, + &dlm->dlm_locking_proto, + &query->dlm_proto)) { + packet.code = JOIN_PROTOCOL_MISMATCH; + } else if (dlm_query_join_proto_check("fs", bit, + &dlm->fs_locking_proto, + &query->fs_proto)) { + packet.code = JOIN_PROTOCOL_MISMATCH; + } else { + packet.dlm_minor = query->dlm_proto.pv_minor; + packet.fs_minor = query->fs_proto.pv_minor; + packet.code = JOIN_OK; + __dlm_set_joining_node(dlm, query->node_idx); + } + } + + spin_unlock(&dlm->spinlock); + } +unlock_respond: + spin_unlock(&dlm_domain_lock); + +respond: + mlog(0, "We respond with %u\n", packet.code); + + dlm_query_join_packet_to_wire(&packet, &response); + return response; +} + +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_assert_joined *assert; + struct dlm_ctxt *dlm = NULL; + + assert = (struct dlm_assert_joined *) msg->buf; + + mlog(0, "node %u asserts join on domain %s\n", assert->node_idx, + assert->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len); + /* XXX should we consider no dlm ctxt an error? */ + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Alright, this node has officially joined our + * domain. Set him in the map and clean up our + * leftover join state. 
*/ + BUG_ON(dlm->joining_node != assert->node_idx); + set_bit(assert->node_idx, dlm->domain_map); + clear_bit(assert->node_idx, dlm->exit_domain_map); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", + assert->node_idx, dlm->name); + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, assert->node_idx, 1); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_match_regions(struct dlm_ctxt *dlm, + struct dlm_query_region *qr, + char *local, int locallen) +{ + char *remote = qr->qr_regions; + char *l, *r; + int localnr, i, j, foundit; + int status = 0; + + if (!o2hb_global_heartbeat_active()) { + if (qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Joining node %d has global " + "heartbeat enabled but local node %d does not\n", + qr->qr_domain, qr->qr_node, dlm->node_num); + status = -EINVAL; + } + goto bail; + } + + if (o2hb_global_heartbeat_active() && !qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Local node %d has global " + "heartbeat enabled but joining node %d does not\n", + qr->qr_domain, dlm->node_num, qr->qr_node); + status = -EINVAL; + goto bail; + } + + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r); + r += O2HB_MAX_REGION_NAME_LEN; + } + + localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN); + localnr = o2hb_get_all_regions(local, (u8)localnr); + + /* compare local regions with remote */ + l = local; + for (i = 0; i < localnr; ++i) { + foundit = 0; + r = remote; + for (j = 0; j <= qr->qr_numregions; ++j) { + if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in local node %d but not in joining node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l, + dlm->node_num, qr->qr_node); + goto bail; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + + /* compare remote with local regions */ + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + foundit = 0; + l = local; + for (j = 0; j < localnr; ++j) { + if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in joining node %d but not in local node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r, + qr->qr_node, dlm->node_num); + goto bail; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + +bail: + return status; +} + +static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_region *qr = NULL; + int status, ret = 0, i; + char *p; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); + if (!qr) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + qr->qr_node = dlm->node_num; + qr->qr_namelen = strlen(dlm->name); + memcpy(qr->qr_domain, dlm->name, qr->qr_namelen); + /* if local hb, the numregions will be zero */ + if (o2hb_global_heartbeat_active()) + qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions, + O2NM_MAX_REGIONS); + + p = qr->qr_regions; + for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN) + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p); + + i = -1; + while ((i = find_next_bit(node_map, 
O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending regions to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr, + sizeof(struct dlm_query_region), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "Region mismatch %d, node %d\n", + ret, i); + break; + } + } + +bail: + kfree(qr); + return ret; +} + +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_region *qr; + struct dlm_ctxt *dlm = NULL; + char *local = NULL; + int status = 0; + int locked = 0; + + qr = (struct dlm_query_region *) msg->buf; + + mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, + qr->qr_domain); + + /* buffer used in dlm_mast_regions() */ + local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); + if (!local) { + status = -ENOMEM; + goto bail; + } + + status = -EINVAL; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "before join domain\n", qr->qr_node, qr->qr_domain); + goto bail; + } + + spin_lock(&dlm->spinlock); + locked = 1; + if (dlm->joining_node != qr->qr_node) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but joining node is %d\n", qr->qr_node, qr->qr_domain, + dlm->joining_node); + goto bail; + } + + /* Support for global heartbeat was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but active dlm protocol is %d.%d\n", qr->qr_node, + qr->qr_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto bail; + } + + status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); + +bail: + if (locked) + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + kfree(local); + + return status; +} + +static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn) +{ + struct o2nm_node *local; + struct dlm_node_info *remote; + int i, j; + int status = 0; + + for (j = 0; j < qn->qn_numnodes; ++j) + mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum, + &(qn->qn_nodes[j].ni_ipv4_address), + ntohs(qn->qn_nodes[j].ni_ipv4_port)); + + for (i = 0; i < O2NM_MAX_NODES && !status; ++i) { + local = o2nm_get_node_by_num(i); + remote = NULL; + for (j = 0; j < qn->qn_numnodes; ++j) { + if (qn->qn_nodes[j].ni_nodenum == i) { + remote = &(qn->qn_nodes[j]); + break; + } + } + + if (!local && !remote) + continue; + + if ((local && !remote) || (!local && remote)) + status = -EINVAL; + + if (!status && + ((remote->ni_nodenum != local->nd_num) || + (remote->ni_ipv4_port != local->nd_ipv4_port) || + (remote->ni_ipv4_address != local->nd_ipv4_address))) + status = -EINVAL; + + if (status) { + if (remote && !local) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in joining node %d but not in " + "local node %d\n", qn->qn_domain, + remote->ni_nodenum, + &(remote->ni_ipv4_address), + ntohs(remote->ni_ipv4_port), + qn->qn_nodenum, dlm->node_num); + if (local && !remote) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in local node %d but not in " + "joining node %d\n", qn->qn_domain, + local->nd_num, &(local->nd_ipv4_address), + ntohs(local->nd_ipv4_port), + dlm->node_num, qn->qn_nodenum); + BUG_ON((!local && !remote)); + } + + if (local) + o2nm_node_put(local); + } + + return status; +} + +static int 
dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_nodeinfo *qn = NULL; + struct o2nm_node *node; + int ret = 0, status, count, i; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); + if (!qn) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) { + node = o2nm_get_node_by_num(i); + if (!node) + continue; + qn->qn_nodes[count].ni_nodenum = node->nd_num; + qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port; + qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address; + mlog(0, "Node %3d, %pI4:%u\n", node->nd_num, + &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port)); + ++count; + o2nm_node_put(node); + } + + qn->qn_nodenum = dlm->node_num; + qn->qn_numnodes = count; + qn->qn_namelen = strlen(dlm->name); + memcpy(qn->qn_domain, dlm->name, qn->qn_namelen); + + i = -1; + while ((i = find_next_bit(node_map, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending nodeinfo to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + qn, sizeof(struct dlm_query_nodeinfo), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i); + break; + } + } + +bail: + kfree(qn); + return ret; +} + +static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_nodeinfo *qn; + struct dlm_ctxt *dlm = NULL; + int locked = 0, status = -EINVAL; + + qn = (struct dlm_query_nodeinfo *) msg->buf; + + mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum, + qn->qn_domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s before " + "join domain\n", qn->qn_nodenum, qn->qn_domain); + goto bail; + } + + spin_lock(&dlm->spinlock); + locked = 1; + if (dlm->joining_node != qn->qn_nodenum) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s but " + "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, + dlm->joining_node); + goto bail; + } + + /* Support for node query was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s " + "but active dlm protocol is %d.%d\n", qn->qn_nodenum, + qn->qn_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto bail; + } + + status = dlm_match_nodes(dlm, qn); + +bail: + if (locked) + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + return status; +} + +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_cancel_join *cancel; + struct dlm_ctxt *dlm = NULL; + + cancel = (struct dlm_cancel_join *) msg->buf; + + mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx, + cancel->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len); + + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Yikes, this guy wants to cancel his join. No + * problem, we simply cleanup our join state. 
*/ + BUG_ON(dlm->joining_node != cancel->node_idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_cancel_join cancel_msg; + + memset(&cancel_msg, 0, sizeof(cancel_msg)); + cancel_msg.node_idx = dlm->node_num; + cancel_msg.name_len = strlen(dlm->name); + memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len); + + status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + &cancel_msg, sizeof(cancel_msg), node, + NULL); + if (status < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + node); + goto bail; + } + +bail: + return status; +} + +/* map_size should be in bytes. */ +static int dlm_send_join_cancels(struct dlm_ctxt *dlm, + unsigned long *node_map, + unsigned int map_size) +{ + int status, tmpstat; + unsigned int node; + + if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * + sizeof(unsigned long))) { + mlog(ML_ERROR, + "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n", + map_size, (unsigned)BITS_TO_LONGS(O2NM_MAX_NODES)); + return -EINVAL; + } + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + tmpstat = dlm_send_one_join_cancel(dlm, node); + if (tmpstat) { + mlog(ML_ERROR, "Error return %d cancelling join on " + "node %d\n", tmpstat, node); + if (!status) + status = tmpstat; + } + } + + if (status) + mlog_errno(status); + return status; +} + +static int dlm_request_join(struct dlm_ctxt *dlm, + int node, + enum dlm_query_join_response_code *response) +{ + int status; + struct dlm_query_join_request join_msg; + struct dlm_query_join_packet packet; + u32 join_resp; + + mlog(0, "querying node %d\n", node); + + memset(&join_msg, 0, sizeof(join_msg)); + join_msg.node_idx = dlm->node_num; + join_msg.name_len = strlen(dlm->name); + memcpy(join_msg.domain, dlm->name, join_msg.name_len); + join_msg.dlm_proto = dlm->dlm_locking_proto; + join_msg.fs_proto = dlm->fs_locking_proto; + + /* copy live node map to join message */ + byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); + + status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, + sizeof(join_msg), node, &join_resp); + if (status < 0 && status != -ENOPROTOOPT) { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + node); + goto bail; + } + dlm_query_join_wire_to_packet(join_resp, &packet); + + /* -ENOPROTOOPT from the net code means the other side isn't + listening for our message type -- that's fine, it means + his dlm isn't up, so we can consider him a 'yes' but not + joined into the domain. */ + if (status == -ENOPROTOOPT) { + status = 0; + *response = JOIN_OK_NO_MAP; + } else if (packet.code == JOIN_DISALLOW || + packet.code == JOIN_OK_NO_MAP) { + *response = packet.code; + } else if (packet.code == JOIN_PROTOCOL_MISMATCH) { + mlog(ML_NOTICE, + "This node requested DLM locking protocol %u.%u and " + "filesystem locking protocol %u.%u. 
At least one of " + "the protocol versions on node %d is not compatible, " + "disconnecting\n", + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor, + node); + status = -EPROTO; + *response = packet.code; + } else if (packet.code == JOIN_OK) { + *response = packet.code; + /* Use the same locking protocol as the remote node */ + dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; + dlm->fs_locking_proto.pv_minor = packet.fs_minor; + mlog(0, + "Node %d responds JOIN_OK with DLM locking protocol " + "%u.%u and fs locking protocol %u.%u\n", + node, + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor); + } else { + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", + packet.code, node); + } + + mlog(0, "status %d, node %d response is %d\n", status, node, + *response); + +bail: + return status; +} + +static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_assert_joined assert_msg; + + mlog(0, "Sending join assert to node %u\n", node); + + memset(&assert_msg, 0, sizeof(assert_msg)); + assert_msg.node_idx = dlm->node_num; + assert_msg.name_len = strlen(dlm->name); + memcpy(assert_msg.domain, dlm->name, assert_msg.name_len); + + status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + &assert_msg, sizeof(assert_msg), node, + NULL); + if (status < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + node); + + return status; +} + +static void dlm_send_join_asserts(struct dlm_ctxt *dlm, + unsigned long *node_map) +{ + int status, node, live; + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + do { + /* It is very important that this message be + * received so we spin until either the node + * has died or it gets the message. */ + status = dlm_send_one_join_assert(dlm, node); + + spin_lock(&dlm->spinlock); + live = test_bit(node, dlm->live_nodes_map); + spin_unlock(&dlm->spinlock); + + if (status) { + mlog(ML_ERROR, "Error return %d asserting " + "join on node %d\n", status, node); + + /* give us some time between errors... 
*/ + if (live) + msleep(DLM_DOMAIN_BACKOFF_MS); + } + } while (status && live); + } +} + +struct domain_join_ctxt { + unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +static int dlm_should_restart_join(struct dlm_ctxt *dlm, + struct domain_join_ctxt *ctxt, + enum dlm_query_join_response_code response) +{ + int ret; + + if (response == JOIN_DISALLOW) { + mlog(0, "Latest response of disallow -- should restart\n"); + return 1; + } + + spin_lock(&dlm->spinlock); + /* For now, we restart the process if the node maps have + * changed at all */ + ret = memcmp(ctxt->live_map, dlm->live_nodes_map, + sizeof(dlm->live_nodes_map)); + spin_unlock(&dlm->spinlock); + + if (ret) + mlog(0, "Node maps changed -- should restart\n"); + + return ret; +} + +static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) +{ + int status = 0, tmpstat, node; + struct domain_join_ctxt *ctxt; + enum dlm_query_join_response_code response = JOIN_DISALLOW; + + mlog(0, "%p", dlm); + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* group sem locking should work for us here -- we're already + * registered for heartbeat events so filling this should be + * atomic wrt getting those handlers called. */ + o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + + spin_lock(&dlm->spinlock); + memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); + + __dlm_set_joining_node(dlm, dlm->node_num); + + spin_unlock(&dlm->spinlock); + + node = -1; + while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + status = dlm_request_join(dlm, node, &response); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Ok, either we got a response or the node doesn't have a + * dlm up. */ + if (response == JOIN_OK) + set_bit(node, ctxt->yes_resp_map); + + if (dlm_should_restart_join(dlm, ctxt, response)) { + status = -EAGAIN; + goto bail; + } + } + + mlog(0, "Yay, done querying nodes!\n"); + + /* Yay, everyone agree's we can join the domain. My domain is + * comprised of all nodes who were put in the + * yes_resp_map. Copy that into our domain map and send a join + * assert message to clean up everyone elses state. */ + spin_lock(&dlm->spinlock); + memcpy(dlm->domain_map, ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + set_bit(dlm->node_num, dlm->domain_map); + spin_unlock(&dlm->spinlock); + + /* Support for global heartbeat and node info was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major > 1 || + dlm->dlm_locking_proto.pv_minor > 0) { + status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } + status = dlm_send_regions(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } + } + + dlm_send_join_asserts(dlm, ctxt->yes_resp_map); + + /* Joined state *must* be set before the joining node + * information, otherwise the query_join handler may read no + * current joiner but a state of NEW and tell joining nodes + * we're not in the domain. */ + spin_lock(&dlm_domain_lock); + dlm->dlm_state = DLM_CTXT_JOINED; + dlm->num_joins++; + spin_unlock(&dlm_domain_lock); + +bail: + spin_lock(&dlm->spinlock); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + if (!status) + __dlm_print_nodes(dlm); + spin_unlock(&dlm->spinlock); + + if (ctxt) { + /* Do we need to send a cancel message to any nodes? 
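+ * Only if the join failed partway through: every node that already
+ * answered JOIN_OK still has us recorded as its joining node, and the
+ * cancel below tells it to clear that state.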
*/ + if (status < 0) { + tmpstat = dlm_send_join_cancels(dlm, + ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + if (tmpstat < 0) + mlog_errno(tmpstat); + } + kfree(ctxt); + } + + mlog(0, "returning %d\n", status); + return status; +} + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) +{ + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up); + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down); + o2net_unregister_handler_list(&dlm->dlm_domain_handlers); +} + +static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) +{ + int status; + + mlog(0, "registering handlers.\n"); + + o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, + dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); + if (status) + goto bail; + + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, + sizeof(struct dlm_master_request), + dlm_master_request_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, + sizeof(struct dlm_assert_master), + dlm_assert_master_handler, + dlm, dlm_assert_master_post_handler, + &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, + sizeof(struct dlm_create_lock), + dlm_create_lock_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, + DLM_CONVERT_LOCK_MAX_LEN, + dlm_convert_lock_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, + DLM_UNLOCK_LOCK_MAX_LEN, + dlm_unlock_lock_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, + DLM_PROXY_AST_MAX_LEN, + dlm_proxy_ast_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, + sizeof(struct dlm_exit_domain), + dlm_exit_domain_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key, + sizeof(struct dlm_deref_lockres), + dlm_deref_lockres_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, + sizeof(struct dlm_migrate_request), + dlm_migrate_request_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, + DLM_MIG_LOCKRES_MAX_LEN, + dlm_mig_lockres_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, + sizeof(struct dlm_master_requery), + dlm_master_requery_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, + sizeof(struct dlm_lock_request), + dlm_request_all_locks_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, + sizeof(struct dlm_reco_data_done), + dlm_reco_data_done_handler, + dlm, 
NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, + sizeof(struct dlm_begin_reco), + dlm_begin_reco_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, + sizeof(struct dlm_finalize_reco), + dlm_finalize_reco_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key, + sizeof(struct dlm_exit_domain), + dlm_begin_exit_domain_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + +bail: + if (status) + dlm_unregister_domain_handlers(dlm); + + return status; +} + +static int dlm_join_domain(struct dlm_ctxt *dlm) +{ + int status; + unsigned int backoff; + unsigned int total_backoff = 0; + + BUG_ON(!dlm); + + mlog(0, "Join domain %s\n", dlm->name); + + status = dlm_register_domain_handlers(dlm); + if (status) { + mlog_errno(status); + goto bail; + } + + status = dlm_debug_init(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = dlm_launch_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = dlm_launch_recovery_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + if (!dlm->dlm_worker) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + do { + status = dlm_try_to_join_domain(dlm); + + /* If we're racing another node to the join, then we + * need to back off temporarily and let them + * complete. */ +#define DLM_JOIN_TIMEOUT_MSECS 90000 + if (status == -EAGAIN) { + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + if (total_backoff > + msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + status = -ERESTARTSYS; + mlog(ML_NOTICE, "Timed out joining dlm domain " + "%s after %u msecs\n", dlm->name, + jiffies_to_msecs(total_backoff)); + goto bail; + } + + /* + * After you! + * No, after you! + * I insist! + * But you first! + * ... 
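+ * (each contender sleeps a jiffies-derived 0-3x multiple of DLM_DOMAIN_BACKOFF_MS before retrying)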
+ */ + backoff = (unsigned int)(jiffies & 0x3); + backoff *= DLM_DOMAIN_BACKOFF_MS; + total_backoff += backoff; + mlog(0, "backoff %d\n", backoff); + msleep(backoff); + } + } while (status == -EAGAIN); + + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + wake_up(&dlm_domain_events); + + if (status) { + dlm_unregister_domain_handlers(dlm); + dlm_debug_shutdown(dlm); + dlm_complete_thread(dlm); + dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); + } + + return status; +} + +static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, + u32 key) +{ + int i; + int ret; + struct dlm_ctxt *dlm = NULL; + + dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); + if (!dlm) { + mlog_errno(-ENOMEM); + goto leave; + } + + dlm->name = kstrdup(domain, GFP_KERNEL); + if (dlm->name == NULL) { + mlog_errno(-ENOMEM); + kfree(dlm); + dlm = NULL; + goto leave; + } + + dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); + if (!dlm->lockres_hash) { + mlog_errno(-ENOMEM); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); + + dlm->master_hash = (struct hlist_head **) + dlm_alloc_pagevec(DLM_HASH_PAGES); + if (!dlm->master_hash) { + mlog_errno(-ENOMEM); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); + + dlm->key = key; + dlm->node_num = o2nm_this_node(); + + ret = dlm_create_debugfs_subroot(dlm); + if (ret < 0) { + dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + + spin_lock_init(&dlm->spinlock); + spin_lock_init(&dlm->master_lock); + spin_lock_init(&dlm->ast_lock); + spin_lock_init(&dlm->track_lock); + INIT_LIST_HEAD(&dlm->list); + INIT_LIST_HEAD(&dlm->dirty_list); + INIT_LIST_HEAD(&dlm->reco.resources); + INIT_LIST_HEAD(&dlm->reco.received); + INIT_LIST_HEAD(&dlm->reco.node_data); + INIT_LIST_HEAD(&dlm->purge_list); + INIT_LIST_HEAD(&dlm->dlm_domain_handlers); + INIT_LIST_HEAD(&dlm->tracking_list); + dlm->reco.state = 0; + + INIT_LIST_HEAD(&dlm->pending_asts); + INIT_LIST_HEAD(&dlm->pending_basts); + + mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", + dlm->recovery_map, &(dlm->recovery_map[0])); + + memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); + memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); + memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + + dlm->dlm_thread_task = NULL; + dlm->dlm_reco_thread_task = NULL; + dlm->dlm_worker = NULL; + init_waitqueue_head(&dlm->dlm_thread_wq); + init_waitqueue_head(&dlm->dlm_reco_thread_wq); + init_waitqueue_head(&dlm->reco.event); + init_waitqueue_head(&dlm->ast_wq); + init_waitqueue_head(&dlm->migration_wq); + INIT_LIST_HEAD(&dlm->mle_hb_events); + + dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; + init_waitqueue_head(&dlm->dlm_join_events); + + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + + atomic_set(&dlm->res_tot_count, 0); + atomic_set(&dlm->res_cur_count, 0); + for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) { + atomic_set(&dlm->mle_tot_count[i], 0); + atomic_set(&dlm->mle_cur_count[i], 0); + } + + spin_lock_init(&dlm->work_lock); + INIT_LIST_HEAD(&dlm->work_list); + INIT_WORK(&dlm->dispatched_work, 
dlm_dispatch_work); + + kref_init(&dlm->dlm_refs); + dlm->dlm_state = DLM_CTXT_NEW; + + INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); + + mlog(0, "context init: refcount %u\n", + atomic_read(&dlm->dlm_refs.refcount)); + +leave: + return dlm; +} + +/* + * Compare a requested locking protocol version against the current one. + * + * If the major numbers are different, they are incompatible. + * If the current minor is greater than the request, they are incompatible. + * If the current minor is less than or equal to the request, they are + * compatible, and the requester should run at the current minor version. + */ +static int dlm_protocol_compare(struct dlm_protocol_version *existing, + struct dlm_protocol_version *request) +{ + if (existing->pv_major != request->pv_major) + return 1; + + if (existing->pv_minor > request->pv_minor) + return 1; + + if (existing->pv_minor < request->pv_minor) + request->pv_minor = existing->pv_minor; + + return 0; +} + +/* + * dlm_register_domain: one-time setup per "domain". + * + * The filesystem passes in the requested locking version via proto. + * If registration was successful, proto will contain the negotiated + * locking protocol. + */ +struct dlm_ctxt * dlm_register_domain(const char *domain, + u32 key, + struct dlm_protocol_version *fs_proto) +{ + int ret; + struct dlm_ctxt *dlm = NULL; + struct dlm_ctxt *new_ctxt = NULL; + + if (strlen(domain) >= O2NM_MAX_NAME_LEN) { + ret = -ENAMETOOLONG; + mlog(ML_ERROR, "domain name length too long\n"); + goto leave; + } + + if (!o2hb_check_local_node_heartbeating()) { + mlog(ML_ERROR, "the local node has not been configured, or is " + "not heartbeating\n"); + ret = -EPROTO; + goto leave; + } + + mlog(0, "register called for domain \"%s\"\n", domain); + +retry: + dlm = NULL; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + mlog_errno(ret); + goto leave; + } + + spin_lock(&dlm_domain_lock); + + dlm = __dlm_lookup_domain(domain); + if (dlm) { + if (dlm->dlm_state != DLM_CTXT_JOINED) { + spin_unlock(&dlm_domain_lock); + + mlog(0, "This ctxt is not joined yet!\n"); + wait_event_interruptible(dlm_domain_events, + dlm_wait_on_domain_helper( + domain)); + goto retry; + } + + if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) { + spin_unlock(&dlm_domain_lock); + mlog(ML_ERROR, + "Requested locking protocol version is not " + "compatible with already registered domain " + "\"%s\"\n", domain); + ret = -EPROTO; + goto leave; + } + + __dlm_get(dlm); + dlm->num_joins++; + + spin_unlock(&dlm_domain_lock); + + ret = 0; + goto leave; + } + + /* doesn't exist */ + if (!new_ctxt) { + spin_unlock(&dlm_domain_lock); + + new_ctxt = dlm_alloc_ctxt(domain, key); + if (new_ctxt) + goto retry; + + ret = -ENOMEM; + mlog_errno(ret); + goto leave; + } + + /* a little variable switch-a-roo here... */ + dlm = new_ctxt; + new_ctxt = NULL; + + /* add the new domain */ + list_add_tail(&dlm->list, &dlm_domains); + spin_unlock(&dlm_domain_lock); + + /* + * Pass the locking protocol version into the join. If the join + * succeeds, it will have the negotiated protocol set. 
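+ * (dlm_request_join() adopts the minor version reported by each responding node)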
+ */ + dlm->dlm_locking_proto = dlm_protocol; + dlm->fs_locking_proto = *fs_proto; + + ret = dlm_join_domain(dlm); + if (ret) { + mlog_errno(ret); + dlm_put(dlm); + goto leave; + } + + /* Tell the caller what locking protocol we negotiated */ + *fs_proto = dlm->fs_locking_proto; + + ret = 0; +leave: + if (new_ctxt) + dlm_free_ctxt_mem(new_ctxt); + + if (ret < 0) + dlm = ERR_PTR(ret); + + return dlm; +} +EXPORT_SYMBOL_GPL(dlm_register_domain); + +static LIST_HEAD(dlm_join_handlers); + +static void dlm_unregister_net_handlers(void) +{ + o2net_unregister_handler_list(&dlm_join_handlers); +} + +static int dlm_register_net_handlers(void) +{ + int status = 0; + + status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_query_join_request), + dlm_query_join_handler, + NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + sizeof(struct dlm_assert_joined), + dlm_assert_joined_handler, + NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_cancel_join), + dlm_cancel_join_handler, + NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY, + sizeof(struct dlm_query_region), + dlm_query_region_handler, + NULL, NULL, &dlm_join_handlers); + + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + sizeof(struct dlm_query_nodeinfo), + dlm_query_nodeinfo_handler, + NULL, NULL, &dlm_join_handlers); +bail: + if (status < 0) + dlm_unregister_net_handlers(); + + return status; +} + +/* Domain eviction callback handling. + * + * The file system requires notification of node death *before* the + * dlm completes its recovery work, otherwise it may be able to + * acquire locks on resources requiring recovery. Since the dlm can + * evict a node from its domain *before* heartbeat fires, a similar + * mechanism is required. */ + +/* Eviction is not expected to happen often, so a per-domain lock is + * not necessary. Eviction callbacks are allowed to sleep for short + * periods of time. 
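+ * A single file-scope rwsem (dlm_callback_sem) protects the callback lists of all domains.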
*/ +static DECLARE_RWSEM(dlm_callback_sem); + +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num) +{ + struct list_head *iter; + struct dlm_eviction_cb *cb; + + down_read(&dlm_callback_sem); + list_for_each(iter, &dlm->dlm_eviction_callbacks) { + cb = list_entry(iter, struct dlm_eviction_cb, ec_item); + + cb->ec_func(node_num, cb->ec_data); + } + up_read(&dlm_callback_sem); +} + +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data) +{ + INIT_LIST_HEAD(&cb->ec_item); + cb->ec_func = f; + cb->ec_data = data; +} +EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb); + +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_register_eviction_cb); + +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_del_init(&cb->ec_item); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb); + +static int __init dlm_init(void) +{ + int status; + + dlm_print_version(); + + status = dlm_init_mle_cache(); + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); + goto error; + } + + status = dlm_init_master_caches(); + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_lockres and " + "o2dlm_lockname slabcaches\n"); + goto error; + } + + status = dlm_init_lock_cache(); + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_lock slabcache\n"); + goto error; + } + + status = dlm_register_net_handlers(); + if (status) { + mlog(ML_ERROR, "Unable to register network handlers\n"); + goto error; + } + + status = dlm_create_debugfs_root(); + if (status) + goto error; + + return 0; +error: + dlm_unregister_net_handlers(); + dlm_destroy_lock_cache(); + dlm_destroy_master_caches(); + dlm_destroy_mle_cache(); + return -1; +} + +static void __exit dlm_exit (void) +{ + dlm_destroy_debugfs_root(); + dlm_unregister_net_handlers(); + dlm_destroy_lock_cache(); + dlm_destroy_master_caches(); + dlm_destroy_mle_cache(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(dlm_init); +module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h new file mode 100644 index 00000000..2f7f60bf --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef DLMDOMAIN_H +#define DLMDOMAIN_H + +extern spinlock_t dlm_domain_lock; +extern struct list_head dlm_domains; + +int dlm_joined(struct dlm_ctxt *dlm); +int dlm_shutting_down(struct dlm_ctxt *dlm); +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num); + +#endif diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c new file mode 100644 index 00000000..8d39e0fd --- /dev/null +++ b/fs/ocfs2/dlm/dlmlock.c @@ -0,0 +1,767 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmlock.c + * + * underlying calls for lock creation + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static struct kmem_cache *dlm_lock_cache = NULL; + +static DEFINE_SPINLOCK(dlm_cookie_lock); +static u64 dlm_next_cookie = 1; + +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags); +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie); +static void dlm_lock_release(struct kref *kref); +static void dlm_lock_detach_lockres(struct dlm_lock *lock); + +int dlm_init_lock_cache(void) +{ + dlm_lock_cache = kmem_cache_create("o2dlm_lock", + sizeof(struct dlm_lock), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (dlm_lock_cache == NULL) + return -ENOMEM; + return 0; +} + +void dlm_destroy_lock_cache(void) +{ + if (dlm_lock_cache) + kmem_cache_destroy(dlm_lock_cache); +} + +/* Tell us whether we can grant a new lock request. + * locking: + * caller needs: res->spinlock + * taken: none + * held on exit: none + * returns: 1 if the lock can be granted, 0 otherwise. 
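+ * (grantable only when compatible with every lock already on the granted and converting queues)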
+ */ +static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct list_head *iter; + struct dlm_lock *tmplock; + + list_for_each(iter, &res->granted) { + tmplock = list_entry(iter, struct dlm_lock, list); + + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + } + + list_for_each(iter, &res->converting) { + tmplock = list_entry(iter, struct dlm_lock, list); + + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + if (!dlm_lock_compatible(tmplock->ml.convert_type, + lock->ml.type)) + return 0; + } + + return 1; +} + +/* performs lock creation at the lockres master site + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_NOTQUEUED + */ +static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status = DLM_NORMAL; + + mlog(0, "type=%d\n", lock->ml.type); + + spin_lock(&res->spinlock); + /* if called from dlm_create_lock_handler, need to + * ensure it will not sleep in dlm_wait_on_lockres */ + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL && + lock->ml.node != dlm->node_num) { + /* erf. state changed after lock was dropped. */ + spin_unlock(&res->spinlock); + dlm_error(status); + return status; + } + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + + if (dlm_can_grant_new_lock(res, lock)) { + mlog(0, "I can grant this lock right away\n"); + /* got it right away */ + lock->lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + + /* for the recovery lock, we can't allow the ast + * to be queued since the dlmthread is already + * frozen. 
but the recovery lock is always locked + * with LKM_NOQUEUE so we do not need the ast in + * this special case */ + if (!dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + kick_thread = 1; + call_ast = 1; + } else { + mlog(0, "%s: returning DLM_NORMAL to " + "node %u for reco lock\n", dlm->name, + lock->ml.node); + } + } else { + /* for NOQUEUE request, unless we get the + * lock right away, return DLM_NOTQUEUED */ + if (flags & LKM_NOQUEUE) { + status = DLM_NOTQUEUED; + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + mlog(0, "%s: returning NOTQUEUED to " + "node %u for reco lock\n", dlm->name, + lock->ml.node); + } + } else { + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + kick_thread = 1; + } + } + /* reduce the inflight count, this may result in the lockres + * being purged below during calc_usage */ + if (lock->ml.node == dlm->node_num) + dlm_lockres_drop_inflight_ref(dlm, res); + + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + dlm_lockres_calc_usage(dlm, res); + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* remove from local queue if it failed */ + list_del_init(&lock->list); + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; +} + + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_DENIED, DLM_RECOVERING, or net status + */ +static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + enum dlm_status status = DLM_DENIED; + int lockres_changed = 1; + + mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n", + lock->ml.type, res->lockname.len, + res->lockname.name, flags); + + spin_lock(&res->spinlock); + + /* will exit this call with spinlock held */ + __dlm_wait_on_lockres(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + /* add lock to local (secondary) queue */ + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + lock->lock_pending = 1; + spin_unlock(&res->spinlock); + + /* spec seems to say that you will get DLM_NORMAL when the lock + * has been queued, meaning we need to wait for a reply here. */ + status = dlm_send_remote_lock_request(dlm, res, lock, flags); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->lock_pending = 0; + if (status != DLM_NORMAL) { + if (status == DLM_RECOVERING && + dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* recovery lock was mastered by dead node. + * we need to have calc_usage shoot down this + * lockres and completely remaster it. */ + mlog(0, "%s: recovery lock was owned by " + "dead node %u, remaster it now.\n", + dlm->name, res->owner); + } else if (status != DLM_NOTQUEUED) { + /* + * DO NOT call calc_usage, as this would unhash + * the remote lockres before we ever get to use + * it. treat as if we never made any change to + * the lockres. + */ + lockres_changed = 0; + dlm_error(status); + } + dlm_revert_pending_lock(res, lock); + dlm_lock_put(lock); + } else if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* special case for the $RECOVERY lock. + * there will never be an AST delivered to put + * this lock on the proper secondary queue + * (granted), so do it manually. 
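+ * (the lock is moved onto res->granted immediately below)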
*/ + mlog(0, "%s: $RECOVERY lock for this node (%u) is " + "mastered by %u; got lock, manually granting (no ast)\n", + dlm->name, dlm->node_num, res->owner); + list_move_tail(&lock->list, &res->granted); + } + spin_unlock(&res->spinlock); + + if (lockres_changed) + dlm_lockres_calc_usage(dlm, res); + + wake_up(&res->wq); + return status; +} + + +/* for remote lock creation. + * locking: + * caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, or net status + */ +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + struct dlm_create_lock create; + int tmpret, status = 0; + enum dlm_status ret; + + memset(&create, 0, sizeof(create)); + create.node_idx = dlm->node_num; + create.requested_type = lock->ml.type; + create.cookie = lock->ml.cookie; + create.namelen = res->lockname.len; + create.flags = cpu_to_be32(flags); + memcpy(create.name, res->lockname.name, create.namelen); + + tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, + sizeof(create), res->owner, &status); + if (tmpret >= 0) { + // successfully sent and received + ret = status; // this is already a dlm_status + if (ret == DLM_REJECTED) { + mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " + "no longer owned by %u. that node is coming back " + "up currently.\n", dlm->name, create.namelen, + create.name, res->owner); + dlm_print_one_lock_resource(res); + BUG(); + } + } else { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, + res->owner); + if (dlm_is_host_down(tmpret)) { + ret = DLM_RECOVERING; + mlog(0, "node %u died so returning DLM_RECOVERING " + "from lock message!\n", res->owner); + } else { + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +void dlm_lock_get(struct dlm_lock *lock) +{ + kref_get(&lock->lock_refs); +} + +void dlm_lock_put(struct dlm_lock *lock) +{ + kref_put(&lock->lock_refs, dlm_lock_release); +} + +static void dlm_lock_release(struct kref *kref) +{ + struct dlm_lock *lock; + + lock = container_of(kref, struct dlm_lock, lock_refs); + + BUG_ON(!list_empty(&lock->list)); + BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + + dlm_lock_detach_lockres(lock); + + if (lock->lksb_kernel_allocated) { + mlog(0, "freeing kernel-allocated lksb\n"); + kfree(lock->lksb); + } + kmem_cache_free(dlm_lock_cache, lock); +} + +/* associate a lock with its lockres, getting a ref on the lockres */ +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res) +{ + dlm_lockres_get(res); + lock->lockres = res; +} + +/* drop ref on lockres, if there is still one associated with lock */ +static void dlm_lock_detach_lockres(struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + res = lock->lockres; + if (res) { + lock->lockres = NULL; + mlog(0, "removing lock's lockres reference\n"); + dlm_lockres_put(res); + } +} + +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie) +{ + INIT_LIST_HEAD(&newlock->list); + INIT_LIST_HEAD(&newlock->ast_list); + INIT_LIST_HEAD(&newlock->bast_list); + spin_lock_init(&newlock->spinlock); + newlock->ml.type = type; + newlock->ml.convert_type = LKM_IVMODE; + newlock->ml.highest_blocked = LKM_IVMODE; + newlock->ml.node = node; + newlock->ml.pad1 = 0; + newlock->ml.list = 0; + 
newlock->ml.flags = 0; + newlock->ast = NULL; + newlock->bast = NULL; + newlock->astdata = NULL; + newlock->ml.cookie = cpu_to_be64(cookie); + newlock->ast_pending = 0; + newlock->bast_pending = 0; + newlock->convert_pending = 0; + newlock->lock_pending = 0; + newlock->unlock_pending = 0; + newlock->cancel_pending = 0; + newlock->lksb_kernel_allocated = 0; + + kref_init(&newlock->lock_refs); +} + +struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb) +{ + struct dlm_lock *lock; + int kernel_allocated = 0; + + lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); + if (!lock) + return NULL; + + if (!lksb) { + /* zero memory only if kernel-allocated */ + lksb = kzalloc(sizeof(*lksb), GFP_NOFS); + if (!lksb) { + kfree(lock); + return NULL; + } + kernel_allocated = 1; + } + + dlm_init_lock(lock, type, node, cookie); + if (kernel_allocated) + lock->lksb_kernel_allocated = 1; + lock->lksb = lksb; + lksb->lockid = lock; + return lock; +} + +/* handler for lock creation net message + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED + */ +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + enum dlm_status status = DLM_NORMAL; + char *name; + unsigned int namelen; + + BUG_ON(!dlm); + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + name = create->name; + namelen = create->namelen; + status = DLM_REJECTED; + if (!dlm_domain_fully_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not fully joined, but node %u is " + "sending a create_lock message for lock %.*s!\n", + dlm->name, create->node_idx, namelen, name); + dlm_error(status); + goto leave; + } + + status = DLM_IVBUFLEN; + if (namelen > DLM_LOCKID_NAME_MAX) { + dlm_error(status); + goto leave; + } + + status = DLM_SYSERR; + newlock = dlm_new_lock(create->requested_type, + create->node_idx, + be64_to_cpu(create->cookie), NULL); + if (!newlock) { + dlm_error(status); + goto leave; + } + + lksb = newlock->lksb; + + if (be32_to_cpu(create->flags) & LKM_GET_LVB) { + lksb->flags |= DLM_LKSB_GET_LVB; + mlog(0, "set DLM_LKSB_GET_LVB flag\n"); + } + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, name, namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + spin_unlock(&res->spinlock); + + if (status != DLM_NORMAL) { + mlog(0, "lockres recovering/migrating/in-progress\n"); + goto leave; + } + + dlm_lock_attach_lockres(newlock, res); + + status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags)); +leave: + if (status != DLM_NORMAL) + if (newlock) + dlm_lock_put(newlock); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */ +static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie) +{ + u64 tmpnode = node_num; + + /* shift single byte of node num into top 8 bits */ + tmpnode <<= 56; + + spin_lock(&dlm_cookie_lock); + *cookie = (dlm_next_cookie | tmpnode); + if (++dlm_next_cookie & 0xff00000000000000ull) { + mlog(0, "This node's cookie will now wrap!\n"); + dlm_next_cookie = 1; + } + spin_unlock(&dlm_cookie_lock); +} + +enum dlm_status dlmlock(struct dlm_ctxt 
*dlm, int mode, + struct dlm_lockstatus *lksb, int flags, + const char *name, int namelen, dlm_astlockfunc_t *ast, + void *data, dlm_bastlockfunc_t *bast) +{ + enum dlm_status status; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + int convert = 0, recovery = 0; + + /* yes this function is a mess. + * TODO: clean this up. lots of common code in the + * lock and convert paths, especially in the retry blocks */ + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + status = DLM_BADPARAM; + if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) { + dlm_error(status); + goto error; + } + + if (flags & ~LKM_VALID_FLAGS) { + dlm_error(status); + goto error; + } + + convert = (flags & LKM_CONVERT); + recovery = (flags & LKM_RECOVERY); + + if (recovery && + (!dlm_is_recovery_lock(name, namelen) || convert) ) { + dlm_error(status); + goto error; + } + if (convert && (flags & LKM_LOCAL)) { + mlog(ML_ERROR, "strange LOCAL convert request!\n"); + goto error; + } + + if (convert) { + /* CONVERT request */ + + /* if converting, must pass in a valid dlm_lock */ + lock = lksb->lockid; + if (!lock) { + mlog(ML_ERROR, "NULL lock pointer in convert " + "request\n"); + goto error; + } + + res = lock->lockres; + if (!res) { + mlog(ML_ERROR, "NULL lockres pointer in convert " + "request\n"); + goto error; + } + dlm_lockres_get(res); + + /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are + * static after the original lock call. convert requests will + * ensure that everything is the same, or return DLM_BADARGS. + * this means that DLM_DENIED_NOASTS will never be returned. + */ + if (lock->lksb != lksb || lock->ast != ast || + lock->bast != bast || lock->astdata != data) { + status = DLM_BADARGS; + mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lksb, ast, bast, data); + mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lock->lksb, lock->ast, + lock->bast, lock->astdata); + goto error; + } +retry_convert: + dlm_wait_for_recovery(dlm); + + if (res->owner == dlm->node_num) + status = dlmconvert_master(dlm, res, lock, flags, mode); + else + status = dlmconvert_remote(dlm, res, lock, flags, mode); + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* for now, see how this works without sleeping + * and just retry right away. 
I suspect the reco + * or migration will complete fast enough that + * no waiting will be necessary */ + mlog(0, "retrying convert with migration/recovery/" + "in-progress\n"); + msleep(100); + goto retry_convert; + } + } else { + u64 tmpcookie; + + /* LOCK request */ + status = DLM_BADARGS; + if (!name) { + dlm_error(status); + goto error; + } + + status = DLM_IVBUFLEN; + if (namelen > DLM_LOCKID_NAME_MAX || namelen < 1) { + dlm_error(status); + goto error; + } + + dlm_get_next_cookie(dlm->node_num, &tmpcookie); + lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb); + if (!lock) { + dlm_error(status); + goto error; + } + + if (!recovery) + dlm_wait_for_recovery(dlm); + + /* find or create the lock resource */ + res = dlm_get_lock_resource(dlm, name, namelen, flags); + if (!res) { + status = DLM_IVLOCKID; + dlm_error(status); + goto error; + } + + mlog(0, "type=%d, flags = 0x%x\n", mode, flags); + mlog(0, "creating lock: lock=%p res=%p\n", lock, res); + + dlm_lock_attach_lockres(lock, res); + lock->ast = ast; + lock->bast = bast; + lock->astdata = data; + +retry_lock: + if (flags & LKM_VALBLK) { + mlog(0, "LKM_VALBLK passed by caller\n"); + + /* LVB requests for non PR, PW or EX locks are + * ignored. */ + if (mode < LKM_PRMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + + if (res->owner == dlm->node_num) + status = dlmlock_master(dlm, res, lock, flags); + else + status = dlmlock_remote(dlm, res, lock, flags); + + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + mlog(0, "retrying lock with migration/" + "recovery/in progress\n"); + msleep(100); + /* no waiting for dlm_reco_thread */ + if (recovery) { + if (status != DLM_RECOVERING) + goto retry_lock; + + mlog(0, "%s: got RECOVERING " + "for $RECOVERY lock, master " + "was %u\n", dlm->name, + res->owner); + /* wait to see the node go down, then + * drop down and allow the lockres to + * get cleaned up. need to remaster. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + } else { + dlm_wait_for_recovery(dlm); + goto retry_lock; + } + } + + if (status != DLM_NORMAL) { + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; + if (status != DLM_NOTQUEUED) + dlm_error(status); + goto error; + } + } + +error: + if (status != DLM_NORMAL) { + if (lock && !convert) + dlm_lock_put(lock); + // this is kind of unnecessary + lksb->status = status; + } + + /* put lockres ref from the convert path + * or from dlm_get_lock_resource */ + if (res) + dlm_lockres_put(res); + + return status; +} +EXPORT_SYMBOL_GPL(dlmlock); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c new file mode 100644 index 00000000..11eefb8c --- /dev/null +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -0,0 +1,3413 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmmod.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) +#include "cluster/masklog.h" + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); +static void dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); + +static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags); +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); + +static inline int dlm_mle_equal(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + const char *name, + unsigned int namelen) +{ + if (dlm != mle->dlm) + return 0; + + if (namelen != mle->mnamelen || + memcmp(name, mle->mname, namelen) != 0) + return 0; + + return 1; +} + +static struct kmem_cache *dlm_lockres_cache = NULL; +static struct kmem_cache *dlm_lockname_cache = NULL; +static struct kmem_cache *dlm_mle_cache = NULL; + +static void dlm_mle_release(struct kref *kref); +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen); +static void dlm_put_mle(struct dlm_master_list_entry *mle); +static void __dlm_put_mle(struct dlm_master_list_entry *mle); +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen); + +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to); + + +static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked); +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked); +static int dlm_add_migration_mle(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + struct dlm_master_list_entry **oldmle, + const char *name, unsigned int namelen, + u8 new_master, u8 master); + +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target); +static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + + +int dlm_is_host_down(int errno) +{ + switch (errno) { + case -EBADF: + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ETIMEDOUT: + case -ECONNABORTED: + case -ENETDOWN: + case -ENETUNREACH: + case -ENETRESET: + case -ESHUTDOWN: + case -ENOPROTOOPT: + case -EINVAL: /* if returned from our tcp code, + this means there is no socket */ + 
return 1; + } + return 0; +} + + +/* + * MASTER LIST FUNCTIONS + */ + + +/* + * regarding master list entries and heartbeat callbacks: + * + * in order to avoid sleeping and allocation that occurs in + * heartbeat, master list entries are simply attached to the + * dlm's established heartbeat callbacks. the mle is attached + * when it is created, and since the dlm->spinlock is held at + * that time, any heartbeat event will be properly discovered + * by the mle. the mle needs to be detached from the + * dlm->mle_hb_events list as soon as heartbeat events are no + * longer useful to the mle, and before the mle is freed. + * + * as a general rule, heartbeat events are no longer needed by + * the mle once an "answer" regarding the lock master has been + * received. + */ +static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + + list_add_tail(&mle->hb_events, &dlm->mle_hb_events); +} + + +static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + if (!list_empty(&mle->hb_events)) + list_del_init(&mle->hb_events); +} + + +static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + spin_lock(&dlm->spinlock); + __dlm_mle_detach_hb_events(dlm, mle); + spin_unlock(&dlm->spinlock); +} + +static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + mle->inuse++; + kref_get(&mle->mle_refs); +} + +static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + mle->inuse--; + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +} + +/* remove from list and free */ +static void __dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + if (!atomic_read(&mle->mle_refs.refcount)) { + /* this may or may not crash, but who cares. + * it's a BUG. 
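+ * (a refcount of zero here means an unbalanced put)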
*/ + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + BUG(); + } else + kref_put(&mle->mle_refs, dlm_mle_release); +} + + +/* must not have any spinlocks coming in */ +static void dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} + +static inline void dlm_get_mle(struct dlm_master_list_entry *mle) +{ + kref_get(&mle->mle_refs); +} + +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen) +{ + assert_spin_locked(&dlm->spinlock); + + mle->dlm = dlm; + mle->type = type; + INIT_HLIST_NODE(&mle->master_hash_node); + INIT_LIST_HEAD(&mle->hb_events); + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + spin_lock_init(&mle->spinlock); + init_waitqueue_head(&mle->wq); + atomic_set(&mle->woken, 0); + kref_init(&mle->mle_refs); + memset(mle->response_map, 0, sizeof(mle->response_map)); + mle->master = O2NM_MAX_NODES; + mle->new_master = O2NM_MAX_NODES; + mle->inuse = 0; + + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + + if (mle->type == DLM_MLE_MASTER) { + BUG_ON(!res); + mle->mleres = res; + memcpy(mle->mname, res->lockname.name, res->lockname.len); + mle->mnamelen = res->lockname.len; + mle->mnamehash = res->lockname.hash; + } else { + BUG_ON(!name); + mle->mleres = NULL; + memcpy(mle->mname, name, namelen); + mle->mnamelen = namelen; + mle->mnamehash = dlm_lockid_hash(name, namelen); + } + + atomic_inc(&dlm->mle_tot_count[mle->type]); + atomic_inc(&dlm->mle_cur_count[mle->type]); + + /* copy off the node_map and register hb callbacks on our copy */ + memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); + memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + clear_bit(dlm->node_num, mle->vote_map); + clear_bit(dlm->node_num, mle->node_map); + + /* attach the mle to the domain node up/down events */ + __dlm_mle_attach_hb_events(dlm, mle); +} + +void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + if (!hlist_unhashed(&mle->master_hash_node)) + hlist_del_init(&mle->master_hash_node); +} + +void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) +{ + struct hlist_head *bucket; + + assert_spin_locked(&dlm->master_lock); + + bucket = dlm_master_hash(dlm, mle->mnamehash); + hlist_add_head(&mle->master_hash_node, bucket); +} + +/* returns 1 if found, 0 if not */ +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen) +{ + struct dlm_master_list_entry *tmpmle; + struct hlist_head *bucket; + struct hlist_node *list; + unsigned int hash; + + assert_spin_locked(&dlm->master_lock); + + hash = dlm_lockid_hash(name, namelen); + bucket = dlm_master_hash(dlm, hash); + hlist_for_each(list, bucket) { + tmpmle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) + continue; + dlm_get_mle(tmpmle); + *mle = tmpmle; + return 1; + } + return 0; +} + +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) +{ + struct dlm_master_list_entry *mle; + + assert_spin_locked(&dlm->spinlock); + + list_for_each_entry(mle, 
&dlm->mle_hb_events, hb_events) { + if (node_up) + dlm_mle_node_up(dlm, mle, NULL, idx); + else + dlm_mle_node_down(dlm, mle, NULL, idx); + } +} + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (!test_bit(idx, mle->node_map)) + mlog(0, "node %u already removed from nodemap!\n", idx); + else + clear_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + +static void dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (test_bit(idx, mle->node_map)) + mlog(0, "node %u already in node map!\n", idx); + else + set_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + + +int dlm_init_mle_cache(void) +{ + dlm_mle_cache = kmem_cache_create("o2dlm_mle", + sizeof(struct dlm_master_list_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (dlm_mle_cache == NULL) + return -ENOMEM; + return 0; +} + +void dlm_destroy_mle_cache(void) +{ + if (dlm_mle_cache) + kmem_cache_destroy(dlm_mle_cache); +} + +static void dlm_mle_release(struct kref *kref) +{ + struct dlm_master_list_entry *mle; + struct dlm_ctxt *dlm; + + mle = container_of(kref, struct dlm_master_list_entry, mle_refs); + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, + mle->type); + + /* remove from list if not already */ + __dlm_unlink_mle(dlm, mle); + + /* detach the mle from the domain node up/down events */ + __dlm_mle_detach_hb_events(dlm, mle); + + atomic_dec(&dlm->mle_cur_count[mle->type]); + + /* NOTE: kfree under spinlock here. + * if this is bad, we can move this to a freelist. */ + kmem_cache_free(dlm_mle_cache, mle); +} + + +/* + * LOCK RESOURCE FUNCTIONS + */ + +int dlm_init_master_caches(void) +{ + dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", + sizeof(struct dlm_lock_resource), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!dlm_lockres_cache) + goto bail; + + dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", + DLM_LOCKID_NAME_MAX, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!dlm_lockname_cache) + goto bail; + + return 0; +bail: + dlm_destroy_master_caches(); + return -ENOMEM; +} + +void dlm_destroy_master_caches(void) +{ + if (dlm_lockname_cache) + kmem_cache_destroy(dlm_lockname_cache); + + if (dlm_lockres_cache) + kmem_cache_destroy(dlm_lockres_cache); +} + +static void dlm_lockres_release(struct kref *kref) +{ + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; + + res = container_of(kref, struct dlm_lock_resource, refs); + dlm = res->dlm; + + /* This should not happen -- all lockres' have a name + * associated with them at init time. */ + BUG_ON(!res->lockname.name); + + mlog(0, "destroying lockres %.*s\n", res->lockname.len, + res->lockname.name); + + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + + atomic_dec(&dlm->res_cur_count); + + if (!hlist_unhashed(&res->hash_node) || + !list_empty(&res->granted) || + !list_empty(&res->converting) || + !list_empty(&res->blocked) || + !list_empty(&res->dirty) || + !list_empty(&res->recovering) || + !list_empty(&res->purge)) { + mlog(ML_ERROR, + "Going to BUG for resource %.*s." + " We're on a list! 
[%c%c%c%c%c%c%c]\n", + res->lockname.len, res->lockname.name, + !hlist_unhashed(&res->hash_node) ? 'H' : ' ', + !list_empty(&res->granted) ? 'G' : ' ', + !list_empty(&res->converting) ? 'C' : ' ', + !list_empty(&res->blocked) ? 'B' : ' ', + !list_empty(&res->dirty) ? 'D' : ' ', + !list_empty(&res->recovering) ? 'R' : ' ', + !list_empty(&res->purge) ? 'P' : ' '); + + dlm_print_one_lock_resource(res); + } + + /* By the time we're ready to blow this guy away, we shouldn't + * be on any lists. */ + BUG_ON(!hlist_unhashed(&res->hash_node)); + BUG_ON(!list_empty(&res->granted)); + BUG_ON(!list_empty(&res->converting)); + BUG_ON(!list_empty(&res->blocked)); + BUG_ON(!list_empty(&res->dirty)); + BUG_ON(!list_empty(&res->recovering)); + BUG_ON(!list_empty(&res->purge)); + + kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); + + kmem_cache_free(dlm_lockres_cache, res); +} + +void dlm_lockres_put(struct dlm_lock_resource *res) +{ + kref_put(&res->refs, dlm_lockres_release); +} + +static void dlm_init_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, unsigned int namelen) +{ + char *qname; + + /* If we memset here, we lose our reference to the kmalloc'd + * res->lockname.name, so be sure to init every field + * correctly! */ + + qname = (char *) res->lockname.name; + memcpy(qname, name, namelen); + + res->lockname.len = namelen; + res->lockname.hash = dlm_lockid_hash(name, namelen); + + init_waitqueue_head(&res->wq); + spin_lock_init(&res->spinlock); + INIT_HLIST_NODE(&res->hash_node); + INIT_LIST_HEAD(&res->granted); + INIT_LIST_HEAD(&res->converting); + INIT_LIST_HEAD(&res->blocked); + INIT_LIST_HEAD(&res->dirty); + INIT_LIST_HEAD(&res->recovering); + INIT_LIST_HEAD(&res->purge); + INIT_LIST_HEAD(&res->tracking); + atomic_set(&res->asts_reserved, 0); + res->migration_pending = 0; + res->inflight_locks = 0; + + res->dlm = dlm; + + kref_init(&res->refs); + + atomic_inc(&dlm->res_tot_count); + atomic_inc(&dlm->res_cur_count); + + /* just for consistency */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + spin_unlock(&res->spinlock); + + res->state = DLM_LOCK_RES_IN_PROGRESS; + + res->last_used = 0; + + spin_lock(&dlm->spinlock); + list_add_tail(&res->tracking, &dlm->tracking_list); + spin_unlock(&dlm->spinlock); + + memset(res->lvb, 0, DLM_LVB_LEN); + memset(res->refmap, 0, sizeof(res->refmap)); +} + +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen) +{ + struct dlm_lock_resource *res = NULL; + + res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); + if (!res) + goto error; + + res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); + if (!res->lockname.name) + goto error; + + dlm_init_lockres(dlm, res, name, namelen); + return res; + +error: + if (res && res->lockname.name) + kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); + + if (res) + kmem_cache_free(dlm_lockres_cache, res); + return NULL; +} + +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line) +{ + if (!new_lockres) + assert_spin_locked(&res->spinlock); + + if (!test_bit(dlm->node_num, res->refmap)) { + BUG_ON(res->inflight_locks != 0); + dlm_lockres_set_refmap_bit(dlm->node_num, res); + } + res->inflight_locks++; + mlog(0, "%s:%.*s: inflight++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); +} + +void 
__dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + assert_spin_locked(&res->spinlock); + + BUG_ON(res->inflight_locks == 0); + res->inflight_locks--; + mlog(0, "%s:%.*s: inflight--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); + if (res->inflight_locks == 0) + dlm_lockres_clear_refmap_bit(dlm->node_num, res); + wake_up(&res->wq); +} + +/* + * lookup a lock resource by name. + * may already exist in the hashtable. + * lockid is null terminated + * + * if not, allocate enough for the lockres and for + * the temporary structure used in doing the mastering. + * + * also, do a lookup in the dlm->master_list to see + * if another node has begun mastering the same lock. + * if so, there should be a block entry in there + * for this name, and we should *not* attempt to master + * the lock here. need to wait around for that node + * to assert_master (or die). + * + */ +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int namelen, + int flags) +{ + struct dlm_lock_resource *tmpres=NULL, *res=NULL; + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *alloc_mle = NULL; + int blocked = 0; + int ret, nodenum; + struct dlm_node_iter iter; + unsigned int hash; + int tries = 0; + int bit, wait_on_recovery = 0; + int drop_inflight_if_nonlocal = 0; + + BUG_ON(!lockid); + + hash = dlm_lockid_hash(lockid, namelen); + + mlog(0, "get lockres %s (len %d)\n", lockid, namelen); + +lookup: + spin_lock(&dlm->spinlock); + tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); + if (tmpres) { + int dropping_ref = 0; + + spin_unlock(&dlm->spinlock); + + spin_lock(&tmpres->spinlock); + /* We wait for the other thread that is mastering the resource */ + if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + __dlm_wait_on_lockres(tmpres); + BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); + } + + if (tmpres->owner == dlm->node_num) { + BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); + dlm_lockres_grab_inflight_ref(dlm, tmpres); + } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) + dropping_ref = 1; + spin_unlock(&tmpres->spinlock); + + /* wait until done messaging the master, drop our ref to allow + * the lockres to be purged, start over. */ + if (dropping_ref) { + spin_lock(&tmpres->spinlock); + __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + + mlog(0, "found in hash!\n"); + if (res) + dlm_lockres_put(res); + res = tmpres; + goto leave; + } + + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(0, "allocating a new resource\n"); + /* nothing found and we need to allocate one. */ + alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + if (!alloc_mle) + goto leave; + res = dlm_new_lockres(dlm, lockid, namelen); + if (!res) + goto leave; + goto lookup; + } + + mlog(0, "no lockres found, allocated our own: %p\n", res); + + if (flags & LKM_LOCAL) { + /* caller knows it's safe to assume it's not mastered elsewhere + * DONE! 
return right away */ + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, dlm->node_num); + __dlm_insert_lockres(dlm, res); + dlm_lockres_grab_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + /* lockres still marked IN_PROGRESS */ + goto wake_waiters; + } + + /* check master list to see if another node has started mastering it */ + spin_lock(&dlm->master_lock); + + /* if we found a block, wait for lock to be mastered by another node */ + blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); + if (blocked) { + int mig; + if (mle->type == DLM_MLE_MASTER) { + mlog(ML_ERROR, "master entry for nonexistent lock!\n"); + BUG(); + } + mig = (mle->type == DLM_MLE_MIGRATION); + /* if there is a migration in progress, let the migration + * finish before continuing. we can wait for the absence + * of the MIGRATION mle: either the migrate finished or + * one of the nodes died and the mle was cleaned up. + * if there is a BLOCK here, but it already has a master + * set, we are too late. the master does not have a ref + * for us in the refmap. detach the mle and drop it. + * either way, go back to the top and start over. */ + if (mig || mle->master != O2NM_MAX_NODES) { + BUG_ON(mig && mle->master == dlm->node_num); + /* we arrived too late. the master does not + * have a ref for us. retry. */ + mlog(0, "%s:%.*s: late on %s\n", + dlm->name, namelen, lockid, + mig ? "MIGRATION" : "BLOCK"); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* master is known, detach */ + if (!mig) + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + mle = NULL; + /* this is lame, but we can't wait on either + * the mle or lockres waitqueue here */ + if (mig) + msleep(100); + goto lookup; + } + } else { + /* go ahead and try to master lock on this node */ + mle = alloc_mle; + /* make sure this does not get freed below */ + alloc_mle = NULL; + dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); + set_bit(dlm->node_num, mle->maybe_map); + __dlm_insert_mle(dlm, mle); + + /* still holding the dlm spinlock, check the recovery map + * to see if there are any nodes that still need to be + * considered. these will not appear in the mle nodemap + * but they might own this lockres. wait on them. */ + bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) { + mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " + "recover before lock mastery can begin\n", + dlm->name, namelen, (char *)lockid, bit); + wait_on_recovery = 1; + } + } + + /* at this point there is either a DLM_MLE_BLOCK or a + * DLM_MLE_MASTER on the master list, so it's safe to add the + * lockres to the hashtable. anyone who finds the lock will + * still have to wait on the IN_PROGRESS. */ + + /* finally add the lockres to its hash bucket */ + __dlm_insert_lockres(dlm, res); + /* since this lockres is new it doesn't not require the spinlock */ + dlm_lockres_grab_inflight_ref_new(dlm, res); + + /* if this node does not become the master make sure to drop + * this inflight reference below */ + drop_inflight_if_nonlocal = 1; + + /* get an extra ref on the mle in case this is a BLOCK + * if so, the creator of the BLOCK may try to put the last + * ref at this time in the assert master handler, so we + * need an extra one to keep from a bad ptr deref. 
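+ * the extra ref taken by dlm_get_mle_inuse() below is dropped via
+ * dlm_put_mle_inuse() once mastery has been resolved, just before
+ * the wake_waiters path runs.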
*/ + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +redo_request: + while (wait_on_recovery) { + /* any cluster changes that occurred after dropping the + * dlm spinlock would be detectable be a change on the mle, + * so we only need to clear out the recovery map once. */ + if (dlm_is_recovery_lock(lockid, namelen)) { + mlog(ML_NOTICE, "%s: recovery map is not empty, but " + "must master $RECOVERY lock now\n", dlm->name); + if (!dlm_pre_master_reco_lockres(dlm, res)) + wait_on_recovery = 0; + else { + mlog(0, "%s: waiting 500ms for heartbeat state " + "change\n", dlm->name); + msleep(500); + } + continue; + } + + dlm_kick_recovery_thread(dlm); + msleep(1000); + dlm_wait_for_recovery(dlm); + + spin_lock(&dlm->spinlock); + bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) { + mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " + "recover before lock mastery can begin\n", + dlm->name, namelen, (char *)lockid, bit); + wait_on_recovery = 1; + } else + wait_on_recovery = 0; + spin_unlock(&dlm->spinlock); + + if (wait_on_recovery) + dlm_wait_for_node_recovery(dlm, bit, 10000); + } + + /* must wait for lock to be mastered elsewhere */ + if (blocked) + goto wait; + + ret = -EINVAL; + dlm_node_iter_init(mle->vote_map, &iter); + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = dlm_do_master_request(res, mle, nodenum); + if (ret < 0) + mlog_errno(ret); + if (mle->master != O2NM_MAX_NODES) { + /* found a master ! */ + if (mle->master <= nodenum) + break; + /* if our master request has not reached the master + * yet, keep going until it does. this is how the + * master will know that asserts are needed back to + * the lower nodes. */ + mlog(0, "%s:%.*s: requests only up to %u but master " + "is %u, keep going\n", dlm->name, namelen, + lockid, nodenum, mle->master); + } + } + +wait: + /* keep going until the response map includes all nodes */ + ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); + if (ret < 0) { + wait_on_recovery = 1; + mlog(0, "%s:%.*s: node map changed, redo the " + "master request now, blocked=%d\n", + dlm->name, res->lockname.len, + res->lockname.name, blocked); + if (++tries > 20) { + mlog(ML_ERROR, "%s:%.*s: spinning on " + "dlm_wait_for_lock_mastery, blocked=%d\n", + dlm->name, res->lockname.len, + res->lockname.name, blocked); + dlm_print_one_lock_resource(res); + dlm_print_one_mle(mle); + tries = 0; + } + goto redo_request; + } + + mlog(0, "lockres mastered by %u\n", res->owner); + /* make sure we never continue without this */ + BUG_ON(res->owner == O2NM_MAX_NODES); + + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + /* put the extra ref */ + dlm_put_mle_inuse(mle); + +wake_waiters: + spin_lock(&res->spinlock); + if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) + dlm_lockres_drop_inflight_ref(dlm, res); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + +leave: + /* need to free the unused mle */ + if (alloc_mle) + kmem_cache_free(dlm_mle_cache, alloc_mle); + + return res; +} + + +#define DLM_MASTERY_TIMEOUT_MS 5000 + +static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked) +{ + u8 m; + int ret, bit; + int map_changed, voting_done; + int assert, sleep; + +recheck: + ret = 0; + assert = 0; + + /* check if another node has already become the owner */ + 
spin_lock(&res->spinlock); + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name, + res->lockname.len, res->lockname.name, res->owner); + spin_unlock(&res->spinlock); + /* this will cause the master to re-assert across + * the whole cluster, freeing up mles */ + if (res->owner != dlm->node_num) { + ret = dlm_do_master_request(res, mle, res->owner); + if (ret < 0) { + /* give recovery a chance to run */ + mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); + msleep(500); + goto recheck; + } + } + ret = 0; + goto leave; + } + spin_unlock(&res->spinlock); + + spin_lock(&mle->spinlock); + m = mle->master; + map_changed = (memcmp(mle->vote_map, mle->node_map, + sizeof(mle->vote_map)) != 0); + voting_done = (memcmp(mle->vote_map, mle->response_map, + sizeof(mle->vote_map)) == 0); + + /* restart if we hit any errors */ + if (map_changed) { + int b; + mlog(0, "%s: %.*s: node map changed, restarting\n", + dlm->name, res->lockname.len, res->lockname.name); + ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); + b = (mle->type == DLM_MLE_BLOCK); + if ((*blocked && !b) || (!*blocked && b)) { + mlog(0, "%s:%.*s: status change: old=%d new=%d\n", + dlm->name, res->lockname.len, res->lockname.name, + *blocked, b); + *blocked = b; + } + spin_unlock(&mle->spinlock); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + mlog(0, "%s:%.*s: restart lock mastery succeeded, " + "rechecking now\n", dlm->name, res->lockname.len, + res->lockname.name); + goto recheck; + } else { + if (!voting_done) { + mlog(0, "map not changed and voting not done " + "for %s:%.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + } + } + + if (m != O2NM_MAX_NODES) { + /* another node has done an assert! + * all done! */ + sleep = 0; + } else { + sleep = 1; + /* have all nodes responded? */ + if (voting_done && !*blocked) { + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (dlm->node_num <= bit) { + /* my node number is lowest. + * now tell other nodes that I am + * mastering this. */ + mle->master = dlm->node_num; + /* ref was grabbed in get_lock_resource + * will be dropped in dlmlock_master */ + assert = 1; + sleep = 0; + } + /* if voting is done, but we have not received + * an assert master yet, we must sleep */ + } + } + + spin_unlock(&mle->spinlock); + + /* sleep if we haven't finished voting yet */ + if (sleep) { + unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); + + /* + if (atomic_read(&mle->mle_refs.refcount) < 2) + mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, + atomic_read(&mle->mle_refs.refcount), + res->lockname.len, res->lockname.name); + */ + atomic_set(&mle->woken, 0); + (void)wait_event_timeout(mle->wq, + (atomic_read(&mle->woken) == 1), + timeo); + if (res->owner == O2NM_MAX_NODES) { + mlog(0, "%s:%.*s: waiting again\n", dlm->name, + res->lockname.len, res->lockname.name); + goto recheck; + } + mlog(0, "done waiting, master is %u\n", res->owner); + ret = 0; + goto leave; + } + + ret = 0; /* done */ + if (assert) { + m = dlm->node_num; + mlog(0, "about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, m); + ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); + if (ret) { + /* This is a failure in the network path, + * not in the response to the assert_master + * (any nonzero response is a BUG on this node). + * Most likely a socket just got disconnected + * due to node death. */ + mlog_errno(ret); + } + /* no longer need to restart lock mastery. 
+ * all living nodes have been contacted. */ + ret = 0; + } + + /* set the lockres owner */ + spin_lock(&res->spinlock); + /* mastery reference obtained either during + * assert_master_handler or in get_lock_resource */ + dlm_change_lockres_owner(dlm, res, m); + spin_unlock(&res->spinlock); + +leave: + return ret; +} + +struct dlm_bitmap_diff_iter +{ + int curnode; + unsigned long *orig_bm; + unsigned long *cur_bm; + unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +enum dlm_node_state_change +{ + NODE_DOWN = -1, + NODE_NO_CHANGE = 0, + NODE_UP +}; + +static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, + unsigned long *orig_bm, + unsigned long *cur_bm) +{ + unsigned long p1, p2; + int i; + + iter->curnode = -1; + iter->orig_bm = orig_bm; + iter->cur_bm = cur_bm; + + for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { + p1 = *(iter->orig_bm + i); + p2 = *(iter->cur_bm + i); + iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); + } +} + +static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, + enum dlm_node_state_change *state) +{ + int bit; + + if (iter->curnode >= O2NM_MAX_NODES) + return -ENOENT; + + bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, + iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + + /* if it was there in the original then this node died */ + if (test_bit(bit, iter->orig_bm)) + *state = NODE_DOWN; + else + *state = NODE_UP; + + iter->curnode = bit; + return bit; +} + + +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked) +{ + struct dlm_bitmap_diff_iter bdi; + enum dlm_node_state_change sc; + int node; + int ret = 0; + + mlog(0, "something happened such that the " + "master process may need to be restarted!\n"); + + assert_spin_locked(&mle->spinlock); + + dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + while (node >= 0) { + if (sc == NODE_UP) { + /* a node came up. clear any old vote from + * the response map and set it in the vote map + * then restart the mastery. */ + mlog(ML_NOTICE, "node %d up while restarting\n", node); + + /* redo the master request, but only for the new node */ + mlog(0, "sending request to new node\n"); + clear_bit(node, mle->response_map); + set_bit(node, mle->vote_map); + } else { + mlog(ML_ERROR, "node down! %d\n", node); + if (blocked) { + int lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, 0); + + /* act like it was never there */ + clear_bit(node, mle->maybe_map); + + if (node == lowest) { + mlog(0, "expected master %u died" + " while this node was blocked " + "waiting on it!\n", node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "%s:%.*s:still " + "blocked. waiting on %u " + "now\n", dlm->name, + res->lockname.len, + res->lockname.name, + lowest); + } else { + /* mle is an MLE_BLOCK, but + * there is now nothing left to + * block on. we need to return + * all the way back out and try + * again with an MLE_MASTER. + * dlm_do_local_recovery_cleanup + * has already run, so the mle + * refcount is ok */ + mlog(0, "%s:%.*s: no " + "longer blocking. 
try to " + "master this here\n", + dlm->name, + res->lockname.len, + res->lockname.name); + mle->type = DLM_MLE_MASTER; + mle->mleres = res; + } + } + } + + /* now blank out everything, as if we had never + * contacted anyone */ + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + memset(mle->response_map, 0, sizeof(mle->response_map)); + /* reset the vote_map to the current node_map */ + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + /* put myself into the maybe map */ + if (mle->type != DLM_MLE_BLOCK) + set_bit(dlm->node_num, mle->maybe_map); + } + ret = -EAGAIN; + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + } + return ret; +} + + +/* + * DLM_MASTER_REQUEST_MSG + * + * returns: 0 on success, + * -errno on a network error + * + * on error, the caller should assume the target node is "dead" + * + */ + +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to) +{ + struct dlm_ctxt *dlm = mle->dlm; + struct dlm_master_request request; + int ret, response=0, resend; + + memset(&request, 0, sizeof(request)); + request.node_idx = dlm->node_num; + + BUG_ON(mle->type == DLM_MLE_MIGRATION); + + request.namelen = (u8)mle->mnamelen; + memcpy(request.name, mle->mname, request.namelen); + +again: + ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, + sizeof(request), to, &response); + if (ret < 0) { + if (ret == -ESRCH) { + /* should never happen */ + mlog(ML_ERROR, "TCP stack not ready!\n"); + BUG(); + } else if (ret == -EINVAL) { + mlog(ML_ERROR, "bad args passed to o2net!\n"); + BUG(); + } else if (ret == -ENOMEM) { + mlog(ML_ERROR, "out of memory while trying to send " + "network message! retrying\n"); + /* this is totally crude */ + msleep(50); + goto again; + } else if (!dlm_is_host_down(ret)) { + /* not a network error. bad. */ + mlog_errno(ret); + mlog(ML_ERROR, "unhandled error!"); + BUG(); + } + /* all other errors should be network errors, + * and likely indicate node death */ + mlog(ML_ERROR, "link to %d went down!\n", to); + goto out; + } + + ret = 0; + resend = 0; + spin_lock(&mle->spinlock); + switch (response) { + case DLM_MASTER_RESP_YES: + set_bit(to, mle->response_map); + mlog(0, "node %u is the master, response=YES\n", to); + mlog(0, "%s:%.*s: master node %u now knows I have a " + "reference\n", dlm->name, res->lockname.len, + res->lockname.name, to); + mle->master = to; + break; + case DLM_MASTER_RESP_NO: + mlog(0, "node %u not master, response=NO\n", to); + set_bit(to, mle->response_map); + break; + case DLM_MASTER_RESP_MAYBE: + mlog(0, "node %u not master, response=MAYBE\n", to); + set_bit(to, mle->response_map); + set_bit(to, mle->maybe_map); + break; + case DLM_MASTER_RESP_ERROR: + mlog(0, "node %u hit an error, resending\n", to); + resend = 1; + response = 0; + break; + default: + mlog(ML_ERROR, "bad response! %u\n", response); + BUG(); + } + spin_unlock(&mle->spinlock); + if (resend) { + /* this is also totally crude */ + msleep(50); + goto again; + } + +out: + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! 
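+ *
+ * responds DLM_MASTER_RESP_YES when this node owns (or is about to own)
+ * the lockres, NO when another node owns it or this node is merely
+ * blocked, MAYBE while mastery is still being resolved, and
+ * DLM_MASTER_RESP_ERROR when the lockres is recovering/migrating or an
+ * mle cannot be allocated.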
+ */ +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + u8 response = DLM_MASTER_RESP_MAYBE; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; + char *name; + unsigned int namelen, hash; + int found, ret; + int set_maybe; + int dispatch_assert = 0; + + if (!dlm_grab(dlm)) + return DLM_MASTER_RESP_NO; + + if (!dlm_domain_fully_joined(dlm)) { + response = DLM_MASTER_RESP_NO; + goto send_response; + } + + name = request->name; + namelen = request->namelen; + hash = dlm_lockid_hash(name, namelen); + + if (namelen > DLM_LOCKID_NAME_MAX) { + response = DLM_IVBUFLEN; + goto send_response; + } + +way_up_top: + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); + if (res) { + spin_unlock(&dlm->spinlock); + + /* take care of the easy cases up front */ + spin_lock(&res->spinlock); + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " + "being recovered/migrated\n"); + response = DLM_MASTER_RESP_ERROR; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + if (res->owner == dlm->node_num) { + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, request->node_idx); + dlm_lockres_set_refmap_bit(request->node_idx, res); + spin_unlock(&res->spinlock); + response = DLM_MASTER_RESP_YES; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + + /* this node is the owner. + * there is some extra work that needs to + * happen now. the requesting node has + * caused all nodes up to this one to + * create mles. this node now needs to + * go back and clean those up. */ + dispatch_assert = 1; + goto send_response; + } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + spin_unlock(&res->spinlock); + // mlog(0, "node %u is the master\n", res->owner); + response = DLM_MASTER_RESP_NO; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* ok, there is no owner. either this node is + * being blocked, or it is actively trying to + * master this lock. */ + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "lock with no owner should be " + "in-progress!\n"); + BUG(); + } + + // mlog(0, "lockres is in progress...\n"); + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + mlog(ML_ERROR, "no mle found for this lock!\n"); + BUG(); + } + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->type == DLM_MLE_BLOCK) { + // mlog(0, "this node is waiting for " + // "lockres to be mastered\n"); + response = DLM_MASTER_RESP_NO; + } else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "node %u is master, but trying to migrate to " + "node %u.\n", tmpmle->master, tmpmle->new_master); + if (tmpmle->master == dlm->node_num) { + mlog(ML_ERROR, "no owner on lockres, but this " + "node is trying to migrate it to %u?!\n", + tmpmle->new_master); + BUG(); + } else { + /* the real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } + } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { + set_maybe = 0; + if (tmpmle->master == dlm->node_num) { + response = DLM_MASTER_RESP_YES; + /* this node will be the owner. 
+ * go back and clean the mles on any + * other nodes */ + dispatch_assert = 1; + dlm_lockres_set_refmap_bit(request->node_idx, res); + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, + request->node_idx); + } else + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "this node is attempting to " + // "master lockres\n"); + response = DLM_MASTER_RESP_MAYBE; + } + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + + spin_unlock(&dlm->master_lock); + spin_unlock(&res->spinlock); + + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* + * lockres doesn't exist on this node + * if there is an MLE_BLOCK, return NO + * if there is an MLE_MASTER, return MAYBE + * otherwise, add an MLE_BLOCK, return NO + */ + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + /* this lockid has never been seen on this node yet */ + // mlog(0, "no mle found\n"); + if (!mle) { + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + if (!mle) { + response = DLM_MASTER_RESP_ERROR; + mlog_errno(-ENOMEM); + goto send_response; + } + goto way_up_top; + } + + // mlog(0, "this is second time thru, already allocated, " + // "add the block.\n"); + dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); + set_bit(request->node_idx, mle->maybe_map); + __dlm_insert_mle(dlm, mle); + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "mle was found\n"); + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->master == dlm->node_num) { + mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); + BUG(); + } + if (tmpmle->type == DLM_MLE_BLOCK) + response = DLM_MASTER_RESP_NO; + else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "migration mle was found (%u->%u)\n", + tmpmle->master, tmpmle->new_master); + /* real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } else + response = DLM_MASTER_RESP_MAYBE; + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (found) { + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + } +send_response: + /* + * __dlm_lookup_lockres() grabbed a reference to this lockres. + * The reference is released by dlm_assert_master_worker() under + * the call to dlm_dispatch_assert_master(). If + * dlm_assert_master_worker() isn't called, we drop it here. + */ + if (dispatch_assert) { + if (response != DLM_MASTER_RESP_YES) + mlog(ML_ERROR, "invalid response %d\n", response); + if (!res) { + mlog(ML_ERROR, "bad lockres while trying to assert!\n"); + BUG(); + } + mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", + dlm->node_num, res->lockname.len, res->lockname.name); + ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, + DLM_ASSERT_MASTER_MLE_CLEANUP); + if (ret < 0) { + mlog(ML_ERROR, "failed to dispatch assert master work\n"); + response = DLM_MASTER_RESP_ERROR; + dlm_lockres_put(res); + } + } else { + if (res) + dlm_lockres_put(res); + } + + dlm_put(dlm); + return response; +} + +/* + * DLM_ASSERT_MASTER_MSG + */ + + +/* + * NOTE: this can be used for debugging + * can periodically run all locks owned by this node + * and re-assert across the cluster... 
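+ *
+ * sends DLM_ASSERT_MASTER_MSG to every node in the nodemap; any node
+ * replying with DLM_ASSERT_RESPONSE_REASSERT causes the whole pass to
+ * be retried, and nodes replying with DLM_ASSERT_RESPONSE_MASTERY_REF
+ * get their bit set in the lockres refmap.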
+ */ +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags) +{ + struct dlm_assert_master assert; + int to, tmpret; + struct dlm_node_iter iter; + int ret = 0; + int reassert; + const char *lockname = res->lockname.name; + unsigned int namelen = res->lockname.len; + + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + +again: + reassert = 0; + + /* note that if this nodemap is empty, it returns 0 */ + dlm_node_iter_init(nodemap, &iter); + while ((to = dlm_node_iter_next(&iter)) >= 0) { + int r = 0; + struct dlm_master_list_entry *mle = NULL; + + mlog(0, "sending assert master to %d (%.*s)\n", to, + namelen, lockname); + memset(&assert, 0, sizeof(assert)); + assert.node_idx = dlm->node_num; + assert.namelen = namelen; + memcpy(assert.name, lockname, namelen); + assert.flags = cpu_to_be32(flags); + + tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, + &assert, sizeof(assert), to, &r); + if (tmpret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", tmpret, + DLM_ASSERT_MASTER_MSG, dlm->key, to); + if (!dlm_is_host_down(tmpret)) { + mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); + BUG(); + } + /* a node died. finish out the rest of the nodes. */ + mlog(0, "link to %d went down!\n", to); + /* any nonzero status return will do */ + ret = tmpret; + r = 0; + } else if (r < 0) { + /* ok, something horribly messed. kill thyself. */ + mlog(ML_ERROR,"during assert master of %.*s to %u, " + "got %d.\n", namelen, lockname, to, r); + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + if (dlm_find_mle(dlm, &mle, (char *)lockname, + namelen)) { + dlm_print_one_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + BUG(); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT && + !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { + mlog(ML_ERROR, "%.*s: very strange, " + "master MLE but no lockres on %u\n", + namelen, lockname, to); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT) { + mlog(0, "%.*s: node %u create mles on other " + "nodes and requests a re-assert\n", + namelen, lockname, to); + reassert = 1; + } + if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { + mlog(0, "%.*s: node %u has a reference to this " + "lockres, set the bit in the refmap\n", + namelen, lockname, to); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(to, res); + spin_unlock(&res->spinlock); + } + } + + if (reassert) + goto again; + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! 
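+ *
+ * validates the assert against any mle and lockres found for this name,
+ * updates the lockres owner, and returns a bitmask of
+ * DLM_ASSERT_RESPONSE_REASSERT / DLM_ASSERT_RESPONSE_MASTERY_REF
+ * (or -EINVAL if the asserting node has to be killed).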
+ */ +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_master_list_entry *mle = NULL; + struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen, hash; + u32 flags; + int master_request = 0, have_lockres_ref = 0; + int ret = 0; + + if (!dlm_grab(dlm)) + return 0; + + name = assert->name; + namelen = assert->namelen; + hash = dlm_lockid_hash(name, namelen); + flags = be32_to_cpu(assert->flags); + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + + spin_lock(&dlm->spinlock); + + if (flags) + mlog(0, "assert_master with flags: %u\n", flags); + + /* find the MLE */ + spin_lock(&dlm->master_lock); + if (!dlm_find_mle(dlm, &mle, name, namelen)) { + /* not an error, could be master just re-asserting */ + mlog(0, "just got an assert_master from %u, but no " + "MLE for it! (%.*s)\n", assert->node_idx, + namelen, name); + } else { + int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES) { + /* not necessarily an error, though less likely. + * could be master just re-asserting. */ + mlog(0, "no bits set in the maybe_map, but %u " + "is asserting! (%.*s)\n", assert->node_idx, + namelen, name); + } else if (bit != assert->node_idx) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "master %u was found, %u should " + "back off\n", assert->node_idx, bit); + } else { + /* with the fix for bug 569, a higher node + * number winning the mastery will respond + * YES to mastery requests, but this node + * had no way of knowing. let it pass. */ + mlog(0, "%u is the lowest node, " + "%u is asserting. (%.*s) %u must " + "have begun after %u won.\n", bit, + assert->node_idx, namelen, name, bit, + assert->node_idx); + } + } + if (mle->type == DLM_MLE_MIGRATION) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "%s:%.*s: got cleanup assert" + " from %u for migration\n", + dlm->name, namelen, name, + assert->node_idx); + } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { + mlog(0, "%s:%.*s: got unrelated assert" + " from %u for migration, ignoring\n", + dlm->name, namelen, name, + assert->node_idx); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + goto done; + } + } + } + spin_unlock(&dlm->master_lock); + + /* ok everything checks out with the MLE + * now check to see if there is a lockres */ + res = __dlm_lookup_lockres(dlm, name, namelen, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(ML_ERROR, "%u asserting but %.*s is " + "RECOVERING!\n", assert->node_idx, namelen, name); + goto kill; + } + if (!mle) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && + res->owner != assert->node_idx) { + mlog(ML_ERROR, "DIE! Mastery assert from %u, " + "but current owner is %u! (%.*s)\n", + assert->node_idx, res->owner, namelen, + name); + __dlm_print_one_lock_resource(res); + BUG(); + } + } else if (mle->type != DLM_MLE_MIGRATION) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* owner is just re-asserting */ + if (res->owner == assert->node_idx) { + mlog(0, "owner %u re-asserting on " + "lock %.*s\n", assert->node_idx, + namelen, name); + goto ok; + } + mlog(ML_ERROR, "got assert_master from " + "node %u, but %u is the owner! 
" + "(%.*s)\n", assert->node_idx, + res->owner, namelen, name); + goto kill; + } + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "got assert from %u, but lock " + "with no owner should be " + "in-progress! (%.*s)\n", + assert->node_idx, + namelen, name); + goto kill; + } + } else /* mle->type == DLM_MLE_MIGRATION */ { + /* should only be getting an assert from new master */ + if (assert->node_idx != mle->new_master) { + mlog(ML_ERROR, "got assert from %u, but " + "new master is %u, and old master " + "was %u (%.*s)\n", + assert->node_idx, mle->new_master, + mle->master, namelen, name); + goto kill; + } + + } +ok: + spin_unlock(&res->spinlock); + } + + // mlog(0, "woo! got an assert_master from node %u!\n", + // assert->node_idx); + if (mle) { + int extra_ref = 0; + int nn = -1; + int rr, err = 0; + + spin_lock(&mle->spinlock); + if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) + extra_ref = 1; + else { + /* MASTER mle: if any bits set in the response map + * then the calling node needs to re-assert to clear + * up nodes that this node contacted */ + while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, + nn+1)) < O2NM_MAX_NODES) { + if (nn != dlm->node_num && nn != assert->node_idx) + master_request = 1; + } + } + mle->master = assert->node_idx; + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + spin_unlock(&mle->spinlock); + + if (res) { + int wake = 0; + spin_lock(&res->spinlock); + if (mle->type == DLM_MLE_MIGRATION) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + } else { + dlm_change_lockres_owner(dlm, res, mle->master); + } + spin_unlock(&res->spinlock); + have_lockres_ref = 1; + if (wake) + wake_up(&res->wq); + } + + /* master is known, detach if not already detached. + * ensures that only one assert_master call will happen + * on this mle. */ + spin_lock(&dlm->master_lock); + + rr = atomic_read(&mle->mle_refs.refcount); + if (mle->inuse > 0) { + if (extra_ref && rr < 3) + err = 1; + else if (!extra_ref && rr < 2) + err = 1; + } else { + if (extra_ref && rr < 2) + err = 1; + else if (!extra_ref && rr < 1) + err = 1; + } + if (err) { + mlog(ML_ERROR, "%s:%.*s: got assert master from %u " + "that will mess up this node, refs=%d, extra=%d, " + "inuse=%d\n", dlm->name, namelen, name, + assert->node_idx, rr, extra_ref, mle->inuse); + dlm_print_one_mle(mle); + } + __dlm_unlink_mle(dlm, mle); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + if (extra_ref) { + /* the assert master message now balances the extra + * ref given by the master / migration request message. + * if this is the last put, it will be removed + * from the list. */ + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + } else if (res) { + if (res->owner != assert->node_idx) { + mlog(0, "assert_master from %u, but current " + "owner is %u (%.*s), no mle\n", assert->node_idx, + res->owner, namelen, name); + } + } + spin_unlock(&dlm->spinlock); + +done: + ret = 0; + if (res) { + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + *ret_data = (void *)res; + } + dlm_put(dlm); + if (master_request) { + mlog(0, "need to tell master to reassert\n"); + /* positive. negative would shoot down the node. 
*/ + ret |= DLM_ASSERT_RESPONSE_REASSERT; + if (!have_lockres_ref) { + mlog(ML_ERROR, "strange, got assert from %u, MASTER " + "mle present here for %s:%.*s, but no lockres!\n", + assert->node_idx, dlm->name, namelen, name); + } + } + if (have_lockres_ref) { + /* let the master know we have a reference to the lockres */ + ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: got assert from %u, need a ref\n", + dlm->name, namelen, name, assert->node_idx); + } + return ret; + +kill: + /* kill the caller! */ + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + *ret_data = (void *)res; + dlm_put(dlm); + return -EINVAL; +} + +void dlm_assert_master_post_handler(int status, void *data, void *ret_data) +{ + struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; + + if (ret_data) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + } + return; +} + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, u8 request_from, u32 flags) +{ + struct dlm_work_item *item; + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) + return -ENOMEM; + + + /* queue up work for dlm_assert_master_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); + item->u.am.lockres = res; /* already have a ref */ + /* can optionally ignore node numbers higher than this node */ + item->u.am.ignore_higher = ignore_higher; + item->u.am.request_from = request_from; + item->u.am.flags = flags; + + if (ignore_higher) + mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, + res->lockname.name); + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + return 0; +} + +static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm = data; + int ret = 0; + struct dlm_lock_resource *res; + unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int ignore_higher; + int bit; + u8 request_from; + u32 flags; + + dlm = item->dlm; + res = item->u.am.lockres; + ignore_higher = item->u.am.ignore_higher; + request_from = item->u.am.request_from; + flags = item->u.am.flags; + + spin_lock(&dlm->spinlock); + memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + spin_unlock(&dlm->spinlock); + + clear_bit(dlm->node_num, nodemap); + if (ignore_higher) { + /* if is this just to clear up mles for nodes below + * this node, do not send the message to the original + * caller or any node number higher than this */ + clear_bit(request_from, nodemap); + bit = dlm->node_num; + while (1) { + bit = find_next_bit(nodemap, O2NM_MAX_NODES, + bit+1); + if (bit >= O2NM_MAX_NODES) + break; + clear_bit(bit, nodemap); + } + } + + /* + * If we're migrating this lock to someone else, we are no + * longer allowed to assert out own mastery. OTOH, we need to + * prevent migration from starting while we're still asserting + * our dominance. The reserved ast delays migration. + */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Someone asked us to assert mastery, but we're " + "in the middle of migration. 
Skipping assert, " + "the new master will handle that.\n"); + spin_unlock(&res->spinlock); + goto put; + } else + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + + /* this call now finishes out the nodemap + * even if one or more nodes die */ + mlog(0, "worker about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, dlm->node_num); + ret = dlm_do_assert_master(dlm, res, nodemap, flags); + if (ret < 0) { + /* no need to restart, we are done */ + if (!dlm_is_host_down(ret)) + mlog_errno(ret); + } + + /* Ok, we've asserted ourselves. Let's let migration start. */ + dlm_lockres_release_ast(dlm, res); + +put: + dlm_lockres_put(res); + + mlog(0, "finished with dlm_assert_master_worker\n"); +} + +/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. + * We cannot wait for node recovery to complete to begin mastering this + * lockres because this lockres is used to kick off recovery! ;-) + * So, do a pre-check on all living nodes to see if any of those nodes + * think that $RECOVERY is currently mastered by a dead node. If so, + * we wait a short time to allow that node to get notified by its own + * heartbeat stack, then check again. All $RECOVERY lock resources + * mastered by dead nodes are purged when the hearbeat callback is + * fired, so we can know for sure that it is safe to continue once + * the node returns a live node or no node. */ +static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct dlm_node_iter iter; + int nodenum; + int ret = 0; + u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + /* do not send to self */ + if (nodenum == dlm->node_num) + continue; + ret = dlm_do_master_requery(dlm, res, nodenum, &master); + if (ret < 0) { + mlog_errno(ret); + if (!dlm_is_host_down(ret)) + BUG(); + /* host is down, so answer for that node would be + * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ + ret = 0; + } + + if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* check to see if this master is in the recovery map */ + spin_lock(&dlm->spinlock); + if (test_bit(master, dlm->recovery_map)) { + mlog(ML_NOTICE, "%s: node %u has not seen " + "node %u go down yet, and thinks the " + "dead node is mastering the recovery " + "lock. must wait.\n", dlm->name, + nodenum, master); + ret = -EAGAIN; + } + spin_unlock(&dlm->spinlock); + mlog(0, "%s: reco lock master is %u\n", dlm->name, + master); + break; + } + } + return ret; +} + +/* + * DLM_DEREF_LOCKRES_MSG + */ + +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + struct dlm_deref_lockres deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + mlog(0, "%s:%.*s: sending deref to %d\n", + dlm->name, namelen, lockname, res->owner); + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, + &deref, sizeof(deref), res->owner, &r); + if (ret < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, + res->owner); + else if (r < 0) { + /* BAD. other node says I did not have a ref. 
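+ * the master's refmap accounting disagrees with ours, so dump the
+ * lockres state and BUG.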
*/ + mlog(ML_ERROR,"while dropping ref on %s:%.*s " + "(master=%u) got %d.\n", dlm->name, namelen, + lockname, res->owner, r); + dlm_print_one_lock_resource(res); + BUG(); + } + return ret; +} + +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + struct dlm_work_item *item; + int cleared = 0; + int dispatch = 0; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_SETREF_INPROG) + dispatch = 1; + else { + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + } + spin_unlock(&res->spinlock); + + if (!dispatch) { + if (cleared) + dlm_lockres_calc_usage(dlm, res); + else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + dlm_print_one_lock_resource(res); + } + ret = 0; + goto done; + } + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) { + ret = -ENOMEM; + mlog_errno(ret); + goto done; + } + + dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); + item->u.dl.deref_res = res; + item->u.dl.deref_node = node; + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + return 0; + +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + + return ret; +} + +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_lock_resource *res; + u8 node; + u8 cleared = 0; + + dlm = item->dlm; + res = item->u.dl.deref_res; + node = item->u.dl.deref_node; + + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + spin_unlock(&res->spinlock); + + if (cleared) { + mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", + dlm->name, res->lockname.len, res->lockname.name, node); + dlm_lockres_calc_usage(dlm, res); + } else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + dlm_print_one_lock_resource(res); + } + + dlm_lockres_put(res); +} + +/* + * A migrateable resource is one that is : + * 1. locally mastered, and, + * 2. zero local locks, and, + * 3. one or more non-local locks, or, one or more references + * Returns 1 if yes, 0 if not. 
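+ * Must be called with res->spinlock held.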
+ */ +static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + enum dlm_lockres_list idx; + int nonlocal = 0, node_ref; + struct list_head *queue; + struct dlm_lock *lock; + u64 cookie; + + assert_spin_locked(&res->spinlock); + + if (res->owner != dlm->node_num) + return 0; + + for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { + queue = dlm_list_idx_to_ptr(res, idx); + list_for_each_entry(lock, queue, list) { + if (lock->ml.node != dlm->node_num) { + nonlocal++; + continue; + } + cookie = be64_to_cpu(lock->ml.cookie); + mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " + "%s list\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(cookie), + dlm_get_lock_cookie_seq(cookie), + dlm_list_in_text(idx)); + return 0; + } + } + + if (!nonlocal) { + node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (node_ref >= O2NM_MAX_NODES) + return 0; + } + + mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, + res->lockname.name); + + return 1; +} + +/* + * DLM_MIGRATE_LOCKRES + */ + + +static int dlm_migrate_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 target) +{ + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *oldmle = NULL; + struct dlm_migratable_lockres *mres = NULL; + int ret = 0; + const char *name; + unsigned int namelen; + int mle_added = 0; + int wake = 0; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(target == O2NM_MAX_NODES); + + name = res->lockname.name; + namelen = res->lockname.len; + + mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, + target); + + /* preallocate up front. if this fails, abort */ + ret = -ENOMEM; + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); + if (!mres) { + mlog_errno(ret); + goto leave; + } + + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + if (!mle) { + mlog_errno(ret); + goto leave; + } + ret = 0; + + /* + * clear any existing master requests and + * add the migration mle to the list + */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, + namelen, target, dlm->node_num); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (ret == -EEXIST) { + mlog(0, "another process is already migrating it\n"); + goto fail; + } + mle_added = 1; + + /* + * set the MIGRATING flag and flush asts + * if we fail after this we need to re-dirty the lockres + */ + if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { + mlog(ML_ERROR, "tried to migrate %.*s to %u, but " + "the target went down.\n", res->lockname.len, + res->lockname.name, target); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + ret = -EINVAL; + } + +fail: + if (oldmle) { + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, oldmle); + dlm_put_mle(oldmle); + } + + if (ret < 0) { + if (mle_added) { + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + } else if (mle) { + kmem_cache_free(dlm_mle_cache, mle); + mle = NULL; + } + goto leave; + } + + /* + * at this point, we have a migration target, an mle + * in the master list, and the MIGRATING flag set on + * the lockres + */ + + /* now that remote nodes are spinning on the MIGRATING flag, + * ensure that all assert_master work is flushed. */ + flush_workqueue(dlm->dlm_worker); + + /* get an extra reference on the mle. 
+ * otherwise the assert_master from the new + * master will destroy this. + * also, make sure that all callers of dlm_get_mle + * take both dlm->spinlock and dlm->master_lock */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* notify new node and send all lock state */ + /* call send_one_lockres with migration flag. + * this serves as notice to the target node that a + * migration is starting. */ + ret = dlm_send_one_lockres(dlm, res, mres, target, + DLM_MRES_MIGRATION); + + if (ret < 0) { + mlog(0, "migration to node %u failed with %d\n", + target, ret); + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + if (dlm_is_host_down(ret)) + dlm_wait_for_node_death(dlm, target, + DLM_NODE_DEATH_WAIT_MAX); + goto leave; + } + + /* at this point, the target sends a message to all nodes, + * (using dlm_do_migrate_request). this node is skipped since + * we had to put an mle in the list to begin the process. this + * node now waits for target to do an assert master. this node + * will be the last one notified, ensuring that the migration + * is complete everywhere. if the target dies while this is + * going on, some nodes could potentially see the target as the + * master, so it is important that my recovery finds the migration + * mle and sets the master to UNKNOWN. */ + + + /* wait for new node to assert master */ + while (1) { + ret = wait_event_interruptible_timeout(mle->wq, + (atomic_read(&mle->woken) == 1), + msecs_to_jiffies(5000)); + + if (ret >= 0) { + if (atomic_read(&mle->woken) == 1 || + res->owner == target) + break; + + mlog(0, "%s:%.*s: timed out during migration\n", + dlm->name, res->lockname.len, res->lockname.name); + /* avoid hang during shutdown when migrating lockres + * to a node which also goes down */ + if (dlm_is_node_dead(dlm, target)) { + mlog(0, "%s:%.*s: expected migration " + "target %u is no longer up, restarting\n", + dlm->name, res->lockname.len, + res->lockname.name, target); + ret = -EINVAL; + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + goto leave; + } + } else + mlog(0, "%s:%.*s: caught signal during migration\n", + dlm->name, res->lockname.len, res->lockname.name); + } + + /* all done, set the owner, clear the flag */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, target); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_remove_nonlocal_locks(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle_inuse(mle); + ret = 0; + + dlm_lockres_calc_usage(dlm, res); + +leave: + /* re-dirty the lockres if we failed */ + if (ret < 0) + dlm_kick_thread(dlm, res); + + /* wake up waiters if the MIGRATING flag got set + * but migration failed */ + if (wake) + wake_up(&res->wq); + + if (mres) + free_page((unsigned long)mres); + + dlm_put(dlm); + + mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, + name, target, ret); + return ret; +} + +#define DLM_MIGRATION_RETRY_MS 100 + +/* + * Should be called only after beginning the domain 
leave process. + * There should not be any remaining locks on nonlocal lock resources, + * and there should be no local locks left on locally mastered resources. + * + * Called with the dlm spinlock held, may drop it to do migration, but + * will re-acquire before exit. + * + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped + */ +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + int ret; + int lock_dropped = 0; + u8 target = O2NM_MAX_NODES; + + assert_spin_locked(&dlm->spinlock); + + spin_lock(&res->spinlock); + if (dlm_is_lockres_migrateable(dlm, res)) + target = dlm_pick_migration_target(dlm, res); + spin_unlock(&res->spinlock); + + if (target == O2NM_MAX_NODES) + goto leave; + + /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ + spin_unlock(&dlm->spinlock); + lock_dropped = 1; + ret = dlm_migrate_lockres(dlm, res, target); + if (ret) + mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", + dlm->name, res->lockname.len, res->lockname.name, + target, ret); + spin_lock(&dlm->spinlock); +leave: + return lock_dropped; +} + +int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + int ret; + spin_lock(&dlm->ast_lock); + spin_lock(&lock->spinlock); + ret = (list_empty(&lock->bast_list) && !lock->bast_pending); + spin_unlock(&lock->spinlock); + spin_unlock(&dlm->ast_lock); + return ret; +} + +static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 mig_target) +{ + int can_proceed; + spin_lock(&res->spinlock); + can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); + spin_unlock(&res->spinlock); + + /* target has died, so make the caller break out of the + * wait_event, but caller must recheck the domain_map */ + spin_lock(&dlm->spinlock); + if (!test_bit(mig_target, dlm->domain_map)) + can_proceed = 1; + spin_unlock(&dlm->spinlock); + return can_proceed; +} + +static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int ret; + spin_lock(&res->spinlock); + ret = !!(res->state & DLM_LOCK_RES_DIRTY); + spin_unlock(&res->spinlock); + return ret; +} + + +static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target) +{ + int ret = 0; + + mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", + res->lockname.len, res->lockname.name, dlm->node_num, + target); + /* need to set MIGRATING flag on lockres. this is done by + * ensuring that all asts have been flushed for this lockres. */ + spin_lock(&res->spinlock); + BUG_ON(res->migration_pending); + res->migration_pending = 1; + /* strategy is to reserve an extra ast then release + * it below, letting the release do all of the work */ + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + + /* now flush all the pending asts */ + dlm_kick_thread(dlm, res); + /* before waiting on DIRTY, block processes which may + * try to dirty the lockres before MIGRATING is set */ + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); + res->state |= DLM_LOCK_RES_BLOCK_DIRTY; + spin_unlock(&res->spinlock); + /* now wait on any pending asts and the DIRTY state */ + wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); + dlm_lockres_release_ast(dlm, res); + + mlog(0, "about to wait on migration_wq, dirty=%s\n", + res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + /* if the extra ref we just put was the final one, this + * will pass thru immediately. otherwise, we need to wait + * for the last ast to finish. 
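+ * either way, the loop below waits until dlm_migration_can_proceed()
+ * sees the MIGRATING flag set or the target leaves the domain map.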
*/ +again: + ret = wait_event_interruptible_timeout(dlm->migration_wq, + dlm_migration_can_proceed(dlm, res, target), + msecs_to_jiffies(1000)); + if (ret < 0) { + mlog(0, "woken again: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? "no":"yes"); + } else { + mlog(0, "all is well: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? "no":"yes"); + } + if (!dlm_migration_can_proceed(dlm, res, target)) { + mlog(0, "trying again...\n"); + goto again; + } + + ret = 0; + /* did the target go down or die? */ + spin_lock(&dlm->spinlock); + if (!test_bit(target, dlm->domain_map)) { + mlog(ML_ERROR, "aha. migration target %u just went down\n", + target); + ret = -EHOSTDOWN; + } + spin_unlock(&dlm->spinlock); + + /* + * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for + * another try; otherwise, we are sure the MIGRATING state is there, + * drop the unneded state which blocked threads trying to DIRTY + */ + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; + if (!ret) + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + spin_unlock(&res->spinlock); + + /* + * at this point: + * + * o the DLM_LOCK_RES_MIGRATING flag is set if target not down + * o there are no pending asts on this lockres + * o all processes trying to reserve an ast on this + * lockres must wait for the MIGRATING flag to clear + */ + return ret; +} + +/* last step in the migration process. + * original master calls this to free all of the dlm_lock + * structures that used to be for other nodes. */ +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct list_head *queue = &res->granted; + int i, bit; + struct dlm_lock *lock, *next; + + assert_spin_locked(&res->spinlock); + + BUG_ON(res->owner == dlm->node_num); + + for (i=0; i<3; i++) { + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node != dlm->node_num) { + mlog(0, "putting lock for node %u\n", + lock->ml.node); + /* be extra careful */ + BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + dlm_lockres_clear_refmap_bit(lock->ml.node, res); + list_del_init(&lock->list); + dlm_lock_put(lock); + /* In a normal unlock, we would have added a + * DLM_UNLOCK_FREE_LOCK action. Force it. */ + dlm_lock_put(lock); + } + } + queue++; + } + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + /* do not clear the local node reference, if there is a + * process holding this, let it drop the ref itself */ + if (bit != dlm->node_num) { + mlog(0, "%s:%.*s: node %u had a ref to this " + "migrating lockres, clearing\n", dlm->name, + res->lockname.len, res->lockname.name, bit); + dlm_lockres_clear_refmap_bit(bit, res); + } + bit++; + } +} + +/* + * Pick a node to migrate the lock resource to. This function selects a + * potential target based first on the locks and then on refmap. It skips + * nodes that are in the process of exiting the domain. 
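+ * Returns O2NM_MAX_NODES when no suitable target exists, which
+ * dlm_empty_lockres() treats as "do not migrate".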
+ */ +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + enum dlm_lockres_list idx; + struct list_head *queue = &res->granted; + struct dlm_lock *lock; + int noderef; + u8 nodenum = O2NM_MAX_NODES; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* Go through all the locks */ + for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { + queue = dlm_list_idx_to_ptr(res, idx); + list_for_each_entry(lock, queue, list) { + if (lock->ml.node == dlm->node_num) + continue; + if (test_bit(lock->ml.node, dlm->exit_domain_map)) + continue; + nodenum = lock->ml.node; + goto bail; + } + } + + /* Go thru the refmap */ + noderef = -1; + while (1) { + noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, + noderef + 1); + if (noderef >= O2NM_MAX_NODES) + break; + if (noderef == dlm->node_num) + continue; + if (test_bit(noderef, dlm->exit_domain_map)) + continue; + nodenum = noderef; + goto bail; + } + +bail: + return nodenum; +} + +/* this is called by the new master once all lockres + * data has been received */ +static int dlm_do_migrate_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 master, u8 new_master, + struct dlm_node_iter *iter) +{ + struct dlm_migrate_request migrate; + int ret, skip, status = 0; + int nodenum; + + memset(&migrate, 0, sizeof(migrate)); + migrate.namelen = res->lockname.len; + memcpy(migrate.name, res->lockname.name, migrate.namelen); + migrate.new_master = new_master; + migrate.master = master; + + ret = 0; + + /* send message to all nodes, except the master and myself */ + while ((nodenum = dlm_node_iter_next(iter)) >= 0) { + if (nodenum == master || + nodenum == new_master) + continue; + + /* We could race exit domain. If exited, skip. */ + spin_lock(&dlm->spinlock); + skip = (!test_bit(nodenum, dlm->domain_map)); + spin_unlock(&dlm->spinlock); + if (skip) { + clear_bit(nodenum, iter->node_map); + continue; + } + + ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, + &migrate, sizeof(migrate), nodenum, + &status); + if (ret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, + dlm->key, nodenum); + if (!dlm_is_host_down(ret)) { + mlog(ML_ERROR, "unhandled error=%d!\n", ret); + BUG(); + } + clear_bit(nodenum, iter->node_map); + ret = 0; + } else if (status < 0) { + mlog(0, "migrate request (node %u) returned %d!\n", + nodenum, status); + ret = status; + } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { + /* during the migration request we short-circuited + * the mastery of the lockres. make sure we have + * a mastery ref for nodenum */ + mlog(0, "%s:%.*s: need ref for node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + nodenum); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(nodenum, res); + spin_unlock(&res->spinlock); + } + } + + if (ret < 0) + mlog_errno(ret); + + mlog(0, "returning ret=%d\n", ret); + return ret; +} + + +/* if there is an existing mle for this lockres, we now know who the master is. + * (the one who sent us *this* message) we can clear it up right away. + * since the process that put the mle on the list still has a reference to it, + * we can unhash it now, set the master and wake the process. as a result, + * we will have no mle in the list to start with. now we can add an mle for + * the migration and this should be the only one found for those scanning the + * list. 
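+ *
+ * the handler below preallocates the new mle before taking any
+ * spinlocks, refuses the request with -EINVAL if the lockres is
+ * currently marked RECOVERING, and otherwise sets
+ * DLM_LOCK_RES_MIGRATING on the lockres before inserting the
+ * migration mle under the master_lock.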
*/ +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; + const char *name; + unsigned int namelen, hash; + int ret = 0; + + if (!dlm_grab(dlm)) + return -EINVAL; + + name = migrate->name; + namelen = migrate->namelen; + hash = dlm_lockid_hash(name, namelen); + + /* preallocate.. if this fails, abort */ + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + + if (!mle) { + ret = -ENOMEM; + goto leave; + } + + /* check for pre-existing lock */ + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + /* if all is working ok, this can only mean that we got + * a migrate request from a node that we now see as + * dead. what can we do here? drop it to the floor? */ + spin_unlock(&res->spinlock); + mlog(ML_ERROR, "Got a migrate request, but the " + "lockres is marked as recovering!"); + kmem_cache_free(dlm_mle_cache, mle); + ret = -EINVAL; /* need a better solution */ + goto unlock; + } + res->state |= DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + } + + spin_lock(&dlm->master_lock); + /* ignore status. only nonzero status would BUG. */ + ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, + name, namelen, + migrate->new_master, + migrate->master); + + spin_unlock(&dlm->master_lock); +unlock: + spin_unlock(&dlm->spinlock); + + if (oldmle) { + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, oldmle); + dlm_put_mle(oldmle); + } + + if (res) + dlm_lockres_put(res); +leave: + dlm_put(dlm); + return ret; +} + +/* must be holding dlm->spinlock and dlm->master_lock + * when adding a migration mle, we can clear any other mles + * in the master list because we know with certainty that + * the master is "master". so we remove any old mle from + * the list after setting it's master field, and then add + * the new migration mle. this way we can hold with the rule + * of having only one mle for a given lock name at all times. */ +static int dlm_add_migration_mle(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + struct dlm_master_list_entry **oldmle, + const char *name, unsigned int namelen, + u8 new_master, u8 master) +{ + int found; + int ret = 0; + + *oldmle = NULL; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + /* caller is responsible for any ref taken here on oldmle */ + found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); + if (found) { + struct dlm_master_list_entry *tmp = *oldmle; + spin_lock(&tmp->spinlock); + if (tmp->type == DLM_MLE_MIGRATION) { + if (master == dlm->node_num) { + /* ah another process raced me to it */ + mlog(0, "tried to migrate %.*s, but some " + "process beat me to it\n", + namelen, name); + ret = -EEXIST; + } else { + /* bad. 2 NODES are trying to migrate! 
*/ + mlog(ML_ERROR, "migration error mle: " + "master=%u new_master=%u // request: " + "master=%u new_master=%u // " + "lockres=%.*s\n", + tmp->master, tmp->new_master, + master, new_master, + namelen, name); + BUG(); + } + } else { + /* this is essentially what assert_master does */ + tmp->master = master; + atomic_set(&tmp->woken, 1); + wake_up(&tmp->wq); + /* remove it so that only one mle will be found */ + __dlm_unlink_mle(dlm, tmp); + __dlm_mle_detach_hb_events(dlm, tmp); + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref for cleared out mle " + "during migration\n", dlm->name, namelen, name, + master, new_master); + } + spin_unlock(&tmp->spinlock); + } + + /* now add a migration mle to the tail of the list */ + dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); + mle->new_master = new_master; + /* the new master will be sending an assert master for this. + * at that point we will get the refmap reference */ + mle->master = master; + /* do this for consistency with other mle types */ + set_bit(new_master, mle->maybe_map); + __dlm_insert_mle(dlm, mle); + + return ret; +} + +/* + * Sets the owner of the lockres, associated to the mle, to UNKNOWN + */ +static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + struct dlm_lock_resource *res; + + /* Find the lockres associated to the mle and set its owner to UNK */ + res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, + mle->mnamehash); + if (res) { + spin_unlock(&dlm->master_lock); + + /* move lockres onto recovery list */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + dlm_move_lockres_to_recovery_list(dlm, res); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + + /* about to get rid of mle, detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + + /* dump the mle */ + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + } + + return res; +} + +static void dlm_clean_migration_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + __dlm_mle_detach_hb_events(dlm, mle); + + spin_lock(&mle->spinlock); + __dlm_unlink_mle(dlm, mle); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + + wake_up(&mle->wq); +} + +static void dlm_clean_block_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, u8 dead_node) +{ + int bit; + + BUG_ON(mle->type != DLM_MLE_BLOCK); + + spin_lock(&mle->spinlock); + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit != dead_node) { + mlog(0, "mle found, but dead node %u would not have been " + "master\n", dead_node); + spin_unlock(&mle->spinlock); + } else { + /* Must drop the refcount by one since the assert_master will + * never arrive. This may result in the mle being unlinked and + * freed, but there may still be a process waiting in the + * dlmlock path which is fine. 
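+ * the path below marks the mle woken, wakes any waiter, detaches the
+ * mle from heartbeat events and drops the reference that the dead
+ * node's assert_master would otherwise have released.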
*/ + mlog(0, "node %u was expected master\n", dead_node); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + wake_up(&mle->wq); + + /* Do not need events any longer, so detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } +} + +void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_master_list_entry *mle; + struct dlm_lock_resource *res; + struct hlist_head *bucket; + struct hlist_node *list; + unsigned int i; + + mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); +top: + assert_spin_locked(&dlm->spinlock); + + /* clean the master list */ + spin_lock(&dlm->master_lock); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each(list, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + + /* MASTER mles are initiated locally. The waiting + * process will notice the node map change shortly. + * Let that happen as normal. */ + if (mle->type == DLM_MLE_MASTER) + continue; + + /* BLOCK mles are initiated by other nodes. Need to + * clean up if the dead node would have been the + * master. */ + if (mle->type == DLM_MLE_BLOCK) { + dlm_clean_block_mle(dlm, mle, dead_node); + continue; + } + + /* Everything else is a MIGRATION mle */ + + /* The rule for MIGRATION mles is that the master + * becomes UNKNOWN if *either* the original or the new + * master dies. All UNKNOWN lockres' are sent to + * whichever node becomes the recovery master. The new + * master is responsible for determining if there is + * still a master for this lockres, or if he needs to + * take over mastery. Either way, this node should + * expect another message to resolve this. */ + + if (mle->master != dead_node && + mle->new_master != dead_node) + continue; + + /* If we have reached this point, this mle needs to be + * removed from the list and freed. */ + dlm_clean_migration_mle(dlm, mle); + + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, + mle->new_master); + + /* If we find a lockres associated with the mle, we've + * hit this rare case that messes up our lock ordering. + * If so, we need to drop the master lock so that we can + * take the lockres lock, meaning that we will have to + * restart from the head of list. */ + res = dlm_reset_mleres_owner(dlm, mle); + if (res) + /* restart */ + goto top; + + /* This may be the last reference */ + __dlm_put_mle(mle); + } + } + spin_unlock(&dlm->master_lock); +} + +int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 old_master) +{ + struct dlm_node_iter iter; + int ret = 0; + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + clear_bit(old_master, iter.node_map); + clear_bit(dlm->node_num, iter.node_map); + spin_unlock(&dlm->spinlock); + + /* ownership of the lockres is changing. 
account for the + * mastery reference here since old_master will briefly have + * a reference after the migration completes */ + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(old_master, res); + spin_unlock(&res->spinlock); + + mlog(0, "now time to do a migrate request to other nodes\n"); + ret = dlm_do_migrate_request(dlm, res, old_master, + dlm->node_num, &iter); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + mlog(0, "doing assert master of %.*s to all except the original node\n", + res->lockname.len, res->lockname.name); + /* this call now finishes out the nodemap + * even if one or more nodes die */ + ret = dlm_do_assert_master(dlm, res, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + /* no longer need to retry. all living nodes contacted. */ + mlog_errno(ret); + ret = 0; + } + + memset(iter.node_map, 0, sizeof(iter.node_map)); + set_bit(old_master, iter.node_map); + mlog(0, "doing assert master of %.*s back to %u\n", + res->lockname.len, res->lockname.name, old_master); + ret = dlm_do_assert_master(dlm, res, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + mlog(0, "assert master to original master failed " + "with %d.\n", ret); + /* the only nonzero status here would be because of + * a dead original node. we're done. */ + ret = 0; + } + + /* all done, set the owner, clear the flag */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, dlm->node_num); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + /* re-dirty it on the new master */ + dlm_kick_thread(dlm, res); + wake_up(&res->wq); +leave: + return ret; +} + +/* + * LOCKRES AST REFCOUNT + * this is integral to migration + */ + +/* for future intent to call an ast, reserve one ahead of time. + * this should be called only after waiting on the lockres + * with dlm_wait_on_lockres, and while still holding the + * spinlock after the call. */ +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + __dlm_print_one_lock_resource(res); + } + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + + atomic_inc(&res->asts_reserved); +} + +/* + * used to drop the reserved ast, either because it went unused, + * or because the ast/bast was actually called. + * + * also, if there is a pending migration on this lockres, + * and this was the last pending ast on the lockres, + * atomically set the MIGRATING flag before we drop the lock. + * this is how we ensure that migration can proceed with no + * asts in progress. note that it is ok if the state of the + * queues is such that a lock should be granted in the future + * or that a bast should be fired, because the new master will + * shuffle the lists on this lockres as soon as it is migrated. 
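+ *
+ * the reserve/release pairing used by dlm_mark_lockres_migrating()
+ * above looks roughly like this:
+ *
+ *   spin_lock(&res->spinlock);
+ *   res->migration_pending = 1;
+ *   __dlm_lockres_reserve_ast(res);      <-- holds migration off
+ *   spin_unlock(&res->spinlock);
+ *   dlm_kick_thread(dlm, res);           <-- flush pending asts
+ *   ... wait for DLM_LOCK_RES_DIRTY to clear ...
+ *   dlm_lockres_release_ast(dlm, res);   <-- may set MIGRATING
+ *
+ * once asts_reserved drops to zero here with migration_pending set,
+ * MIGRATING is set under res->spinlock and both res->wq and
+ * dlm->migration_wq are woken.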
+ */ +void dlm_lockres_release_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) + return; + + if (!res->migration_pending) { + spin_unlock(&res->spinlock); + return; + } + + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + res->migration_pending = 0; + res->state |= DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + wake_up(&dlm->migration_wq); +} + +void dlm_force_free_mles(struct dlm_ctxt *dlm) +{ + int i; + struct hlist_head *bucket; + struct dlm_master_list_entry *mle; + struct hlist_node *tmp, *list; + + /* + * We notified all other nodes that we are exiting the domain and + * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still + * around we force free them and wake any processes that are waiting + * on the mles + */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); + BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each_safe(list, tmp, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + if (mle->type != DLM_MLE_BLOCK) { + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + } + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + + __dlm_unlink_mle(dlm, mle); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c new file mode 100644 index 00000000..7efab6d2 --- /dev/null +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -0,0 +1,2886 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmrecovery.c + * + * recovery stuff + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) +#include "cluster/masklog.h" + +static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); + +static int dlm_recovery_thread(void *data); +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); +void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); +static int dlm_do_recovery(struct dlm_ctxt *dlm); + +static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); +static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); +static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); +static int dlm_request_all_locks(struct dlm_ctxt *dlm, + u8 request_from, u8 dead_node); +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); + +static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); +static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, + const char *lockname, int namelen, + int total_locks, u64 cookie, + u8 flags, u8 master); +static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres, + u8 send_to, + struct dlm_lock_resource *res, + int total_locks); +static int dlm_process_recovery_data(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres); +static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm); +static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, + u8 dead_node, u8 send_to); +static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node); +static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, + struct list_head *list, u8 dead_node); +static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, + u8 dead_node, u8 new_master); +static void dlm_reco_ast(void *astdata); +static void dlm_reco_bast(void *astdata, int blocked_type); +static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st); +static void dlm_request_all_locks_worker(struct dlm_work_item *item, + void *data); +static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master); + +static u64 dlm_get_next_mig_cookie(void); + +static DEFINE_SPINLOCK(dlm_reco_state_lock); +static DEFINE_SPINLOCK(dlm_mig_cookie_lock); +static u64 dlm_mig_cookie = 1; + +static u64 dlm_get_next_mig_cookie(void) +{ + u64 c; + spin_lock(&dlm_mig_cookie_lock); + c = dlm_mig_cookie; + if (dlm_mig_cookie == (~0ULL)) + dlm_mig_cookie = 1; + else + dlm_mig_cookie++; + spin_unlock(&dlm_mig_cookie_lock); + return c; +} + +static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, + u8 dead_node) +{ + assert_spin_locked(&dlm->spinlock); + if (dlm->reco.dead_node != dead_node) + mlog(0, "%s: changing dead_node from %u to %u\n", + dlm->name, dlm->reco.dead_node, dead_node); + dlm->reco.dead_node = dead_node; +} + +static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, + u8 master) +{ + assert_spin_locked(&dlm->spinlock); + mlog(0, "%s: changing new_master from %u to %u\n", + dlm->name, dlm->reco.new_master, master); + dlm->reco.new_master = master; +} + +static inline void 
__dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + assert_spin_locked(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); +} + +static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + __dlm_reset_recovery(dlm); + spin_unlock(&dlm->spinlock); +} + +/* Worker function used during recovery. */ +void dlm_dispatch_work(struct work_struct *work) +{ + struct dlm_ctxt *dlm = + container_of(work, struct dlm_ctxt, dispatched_work); + LIST_HEAD(tmp_list); + struct dlm_work_item *item, *next; + dlm_workfunc_t *workfunc; + int tot=0; + + spin_lock(&dlm->work_lock); + list_splice_init(&dlm->work_list, &tmp_list); + spin_unlock(&dlm->work_lock); + + list_for_each_entry(item, &tmp_list, list) { + tot++; + } + mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); + + list_for_each_entry_safe(item, next, &tmp_list, list) { + workfunc = item->func; + list_del_init(&item->list); + + /* already have ref on dlm to avoid having + * it disappear. just double-check. */ + BUG_ON(item->dlm != dlm); + + /* this is allowed to sleep and + * call network stuff */ + workfunc(item, item->data); + + dlm_put(dlm); + kfree(item); + } +} + +/* + * RECOVERY THREAD + */ + +void dlm_kick_recovery_thread(struct dlm_ctxt *dlm) +{ + /* wake the recovery thread + * this will wake the reco thread in one of three places + * 1) sleeping with no recovery happening + * 2) sleeping with recovery mastered elsewhere + * 3) recovery mastered here, waiting on reco data */ + + wake_up(&dlm->dlm_reco_thread_wq); +} + +/* Launch the recovery thread */ +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) +{ + mlog(0, "starting dlm recovery thread...\n"); + + dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, + "dlm_reco_thread"); + if (IS_ERR(dlm->dlm_reco_thread_task)) { + mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); + dlm->dlm_reco_thread_task = NULL; + return -EINVAL; + } + + return 0; +} + +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_reco_thread_task) { + mlog(0, "waiting for dlm recovery thread to exit\n"); + kthread_stop(dlm->dlm_reco_thread_task); + dlm->dlm_reco_thread_task = NULL; + } +} + + + +/* + * this is lame, but here's how recovery works... + * 1) all recovery threads cluster wide will work on recovering + * ONE node at a time + * 2) negotiate who will take over all the locks for the dead node. + * thats right... ALL the locks. + * 3) once a new master is chosen, everyone scans all locks + * and moves aside those mastered by the dead guy + * 4) each of these locks should be locked until recovery is done + * 5) the new master collects up all of secondary lock queue info + * one lock at a time, forcing each node to communicate back + * before continuing + * 6) each secondary lock queue responds with the full known lock info + * 7) once the new master has run all its locks, it sends a ALLDONE! 
+ * message to everyone + * 8) upon receiving this message, the secondary queue node unlocks + * and responds to the ALLDONE + * 9) once the new master gets responses from everyone, he unlocks + * everything and recovery for this dead node is done + *10) go back to 2) while there are still dead nodes + * + */ + +static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) +{ + struct dlm_reco_node_data *ndata; + struct dlm_lock_resource *res; + + mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", + dlm->reco.dead_node, dlm->reco.new_master); + + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + char *st = "unknown"; + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + st = "init"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + st = "requesting"; + break; + case DLM_RECO_NODE_DATA_DEAD: + st = "dead"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + st = "receiving"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + st = "requested"; + break; + case DLM_RECO_NODE_DATA_DONE: + st = "done"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + st = "finalize-sent"; + break; + default: + st = "bad"; + break; + } + mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", + dlm->name, ndata->node_num, st); + } + list_for_each_entry(res, &dlm->reco.resources, recovering) { + mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", + dlm->name, res->lockname.len, res->lockname.name); + } +} + +#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) + +static int dlm_recovery_thread(void *data) +{ + int status; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + if (dlm_domain_fully_joined(dlm)) { + status = dlm_do_recovery(dlm); + if (status == -EAGAIN) { + /* do not sleep, recheck immediately. 
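+ * -EAGAIN from dlm_do_recovery() means this node just finished
+ * mastering recovery for one dead node and other nodes may still be
+ * sitting in the recovery map, so loop straight back around.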
*/ + continue; + } + if (status < 0) + mlog_errno(status); + } + + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM recovery thread\n"); + return 0; +} + +/* returns true when the recovery master has contacted us */ +static int dlm_reco_master_ready(struct dlm_ctxt *dlm) +{ + int ready; + spin_lock(&dlm->spinlock); + ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM); + spin_unlock(&dlm->spinlock); + return ready; +} + +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) +{ + int dead; + spin_lock(&dlm->spinlock); + dead = !test_bit(node, dlm->domain_map); + spin_unlock(&dlm->spinlock); + return dead; +} + +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) +{ + int recovered; + spin_lock(&dlm->spinlock); + recovered = !test_bit(node, dlm->recovery_map); + spin_unlock(&dlm->spinlock); + return recovered; +} + + +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(ML_NOTICE, "%s: waiting %dms for notification of " + "death of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(ML_NOTICE, "%s: waiting indefinitely for notification " + "of death of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(0, "%s: waiting %dms for notification of " + "recovery of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(0, "%s: waiting indefinitely for notification " + "of recovery of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + +/* callers of the top-level api calls (dlmlock/dlmunlock) should + * block on the dlm->reco.event when recovery is in progress. 
+ * the dlm recovery thread will set this state when it begins + * recovering a dead node (as the new master or not) and clear + * the state and wake as soon as all affected lock resources have + * been marked with the RECOVERY flag */ +static int dlm_in_recovery(struct dlm_ctxt *dlm) +{ + int in_recovery; + spin_lock(&dlm->spinlock); + in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + spin_unlock(&dlm->spinlock); + return in_recovery; +} + + +void dlm_wait_for_recovery(struct dlm_ctxt *dlm) +{ + if (dlm_in_recovery(dlm)) { + mlog(0, "%s: reco thread %d in recovery: " + "state=%d, master=%u, dead=%u\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.state, dlm->reco.new_master, + dlm->reco.dead_node); + } + wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); +} + +static void dlm_begin_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + dlm->reco.state |= DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); +} + +static void dlm_end_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); + dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); + wake_up(&dlm->reco.event); +} + +static int dlm_do_recovery(struct dlm_ctxt *dlm) +{ + int status = 0; + int ret; + + spin_lock(&dlm->spinlock); + + /* check to see if the new master has died */ + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && + test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "new master %u died while recovering %u!\n", + dlm->reco.new_master, dlm->reco.dead_node); + /* unset the new_master, leave dead_node */ + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); + } + + /* select a target to recover */ + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + int bit; + + bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES || bit < 0) + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + else + dlm_set_reco_dead_node(dlm, bit); + } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { + /* BUG? */ + mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", + dlm->reco.dead_node); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + } + + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + // mlog(0, "nothing to recover! sleeping now!\n"); + spin_unlock(&dlm->spinlock); + /* return to main thread loop and sleep. */ + return 0; + } + mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), + dlm->reco.dead_node); + spin_unlock(&dlm->spinlock); + + /* take write barrier */ + /* (stops the list reshuffling thread, proxy ast handling) */ + dlm_begin_recovery(dlm); + + if (dlm->reco.new_master == dlm->node_num) + goto master_here; + + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + /* choose a new master, returns 0 if this node + * is the master, -EEXIST if it's another node. + * this does not return until a new master is chosen + * or recovery completes entirely. */ + ret = dlm_pick_recovery_master(dlm); + if (!ret) { + /* already notified everyone. go. 
*/ + goto master_here; + } + mlog(0, "another node will master this recovery session.\n"); + } + mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, + dlm->node_num, dlm->reco.dead_node); + + /* it is safe to start everything back up here + * because all of the dead node's lock resources + * have been marked as in-recovery */ + dlm_end_recovery(dlm); + + /* sleep out in main dlm_recovery_thread loop. */ + return 0; + +master_here: + mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " + "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task), + dlm->node_num, dlm->reco.dead_node, dlm->name); + + status = dlm_remaster_locks(dlm, dlm->reco.dead_node); + if (status < 0) { + /* we should never hit this anymore */ + mlog(ML_ERROR, "error %d remastering locks for node %u, " + "retrying.\n", status, dlm->reco.dead_node); + /* yield a bit to allow any final network messages + * to get handled on remaining nodes */ + msleep(100); + } else { + /* success! see if any other nodes need recovery */ + mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", + dlm->name, dlm->reco.dead_node, dlm->node_num); + dlm_reset_recovery(dlm); + } + dlm_end_recovery(dlm); + + /* continue and look for another dead node */ + return -EAGAIN; +} + +static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) +{ + int status = 0; + struct dlm_reco_node_data *ndata; + int all_nodes_done; + int destroy = 0; + int pass = 0; + + do { + /* we have become recovery master. there is no escaping + * this, so just keep trying until we get it. */ + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) { + mlog(ML_ERROR, "%s: failed to alloc recovery area, " + "retrying\n", dlm->name); + msleep(1000); + } + } while (status != 0); + + /* safe to access the node data list without a lock, since this + * process is the only one to change the list */ + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); + ndata->state = DLM_RECO_NODE_DATA_REQUESTING; + + mlog(0, "requesting lock info from node %u\n", + ndata->node_num); + + if (ndata->node_num == dlm->node_num) { + ndata->state = DLM_RECO_NODE_DATA_DONE; + continue; + } + + do { + status = dlm_request_all_locks(dlm, ndata->node_num, + dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) { + /* node died, ignore it for recovery */ + status = 0; + ndata->state = DLM_RECO_NODE_DATA_DEAD; + /* wait for the domain map to catch up + * with the network state. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); + } else { + /* -ENOMEM on the other node */ + mlog(0, "%s: node %u returned " + "%d during recovery, retrying " + "after a short wait\n", + dlm->name, ndata->node_num, + status); + msleep(100); + } + } + } while (status != 0); + + spin_lock(&dlm_reco_state_lock); + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + case DLM_RECO_NODE_DATA_REQUESTED: + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after requesting " + "recovery info for node %u\n", + ndata->node_num, dead_node); + /* fine. don't need this node's info. + * continue without it. 
*/ + break; + case DLM_RECO_NODE_DATA_REQUESTING: + ndata->state = DLM_RECO_NODE_DATA_REQUESTED; + mlog(0, "now receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + mlog(0, "already receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + case DLM_RECO_NODE_DATA_DONE: + mlog(0, "already DONE receiving recovery data " + "from node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + } + spin_unlock(&dlm_reco_state_lock); + } + + mlog(0, "done requesting all lock info\n"); + + /* nodes should be sending reco data now + * just need to wait */ + + while (1) { + /* check all the nodes now to see if we are + * done, or if anyone died */ + all_nodes_done = 1; + spin_lock(&dlm_reco_state_lock); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + mlog(0, "checking recovery state of node %u\n", + ndata->node_num); + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(ML_ERROR, "bad ndata state for " + "node %u: state=%d\n", + ndata->node_num, ndata->state); + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after " + "requesting recovery info for " + "node %u\n", ndata->node_num, + dead_node); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + mlog(0, "%s: node %u still in state %s\n", + dlm->name, ndata->node_num, + ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? + "receiving" : "requested"); + all_nodes_done = 0; + break; + case DLM_RECO_NODE_DATA_DONE: + mlog(0, "%s: node %u state is done\n", + dlm->name, ndata->node_num); + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(0, "%s: node %u state is finalize\n", + dlm->name, ndata->node_num); + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, + all_nodes_done?"yes":"no"); + if (all_nodes_done) { + int ret; + + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state + * just send a finalize message to everyone and + * clean up */ + mlog(0, "all nodes are done! 
send finalize\n"); + ret = dlm_send_finalize_reco_message(dlm); + if (ret < 0) + mlog_errno(ret); + + spin_lock(&dlm->spinlock); + dlm_finish_local_lockres_recovery(dlm, dead_node, + dlm->node_num); + spin_unlock(&dlm->spinlock); + mlog(0, "should be done with recovery!\n"); + + mlog(0, "finishing recovery of %s at %lu, " + "dead=%u, this=%u, new=%u\n", dlm->name, + jiffies, dlm->reco.dead_node, + dlm->node_num, dlm->reco.new_master); + destroy = 1; + status = 0; + /* rescan everything marked dirty along the way */ + dlm_kick_thread(dlm, NULL); + break; + } + /* wait to be signalled, with periodic timeout + * to check for node death */ + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS)); + + } + + if (destroy) + dlm_destroy_recovery_area(dlm, dead_node); + + return status; +} + +static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + int num=0; + struct dlm_reco_node_data *ndata; + + spin_lock(&dlm->spinlock); + memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + /* nodes can only be removed (by dying) after dropping + * this lock, and death will be trapped later, so this should do */ + spin_unlock(&dlm->spinlock); + + while (1) { + num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num); + if (num >= O2NM_MAX_NODES) { + break; + } + BUG_ON(num == dead_node); + + ndata = kzalloc(sizeof(*ndata), GFP_NOFS); + if (!ndata) { + dlm_destroy_recovery_area(dlm, dead_node); + return -ENOMEM; + } + ndata->node_num = num; + ndata->state = DLM_RECO_NODE_DATA_INIT; + spin_lock(&dlm_reco_state_lock); + list_add_tail(&ndata->list, &dlm->reco.node_data); + spin_unlock(&dlm_reco_state_lock); + num++; + } + + return 0; +} + +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_reco_node_data *ndata, *next; + LIST_HEAD(tmplist); + + spin_lock(&dlm_reco_state_lock); + list_splice_init(&dlm->reco.node_data, &tmplist); + spin_unlock(&dlm_reco_state_lock); + + list_for_each_entry_safe(ndata, next, &tmplist, list) { + list_del_init(&ndata->list); + kfree(ndata); + } +} + +static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, + u8 dead_node) +{ + struct dlm_lock_request lr; + enum dlm_status ret; + + mlog(0, "\n"); + + + mlog(0, "dlm_request_all_locks: dead node is %u, sending request " + "to %u\n", dead_node, request_from); + + memset(&lr, 0, sizeof(lr)); + lr.node_idx = dlm->node_num; + lr.dead_node = dead_node; + + // send message + ret = DLM_NOLOCKMGR; + ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, + &lr, sizeof(lr), request_from, NULL); + + /* negative status is handled by caller */ + if (ret < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, + dlm->key, request_from); + + // return from here, then + // sleep until all received or error + return ret; + +} + +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; + char *buf = NULL; + struct dlm_work_item *item = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + if (lr->dead_node != dlm->reco.dead_node) { + mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " + "dead_node is %u\n", dlm->name, lr->node_idx, + lr->dead_node, dlm->reco.dead_node); + dlm_print_reco_node_status(dlm); + /* this is a hack */ + dlm_put(dlm); + return -ENOMEM; + } + 
BUG_ON(lr->dead_node != dlm->reco.dead_node); + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) { + dlm_put(dlm); + return -ENOMEM; + } + + /* this will get freed by dlm_request_all_locks_worker */ + buf = (char *) __get_free_page(GFP_NOFS); + if (!buf) { + kfree(item); + dlm_put(dlm); + return -ENOMEM; + } + + /* queue up work for dlm_request_all_locks_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf); + item->u.ral.reco_master = lr->node_idx; + item->u.ral.dead_node = lr->dead_node; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + + dlm_put(dlm); + return 0; +} + +static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_migratable_lockres *mres; + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; + LIST_HEAD(resources); + int ret; + u8 dead_node, reco_master; + int skip_all_done = 0; + + dlm = item->dlm; + dead_node = item->u.ral.dead_node; + reco_master = item->u.ral.reco_master; + mres = (struct dlm_migratable_lockres *)data; + + mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", + dlm->name, dead_node, reco_master); + + if (dead_node != dlm->reco.dead_node || + reco_master != dlm->reco.new_master) { + /* worker could have been created before the recovery master + * died. if so, do not continue, but do not error. */ + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: will not send recovery state, " + "recovery master %u died, thread=(dead=%u,mas=%u)" + " current=(dead=%u,mas=%u)\n", dlm->name, + reco_master, dead_node, reco_master, + dlm->reco.dead_node, dlm->reco.new_master); + } else { + mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " + "master=%u), request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, + dlm->reco.new_master, dead_node, reco_master); + } + goto leave; + } + + /* lock resources should have already been moved to the + * dlm->reco.resources list. now move items from that list + * to a temp list if the dead owner matches. note that the + * whole cluster recovers only one node at a time, so we + * can safely move UNKNOWN lock resources for each recovery + * session. 
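+ *
+ * each lockres on the temporary list is then sent to the recovery
+ * master with dlm_send_one_lockres(..., DLM_MRES_RECOVERY), the list
+ * is spliced back onto dlm->reco.resources under the dlm spinlock,
+ * and the DATA DONE message is only sent if none of the sends failed.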
*/ + dlm_move_reco_locks_to_list(dlm, &resources, dead_node); + + /* now we can begin blasting lockreses without the dlm lock */ + + /* any errors returned will be due to the new_master dying, + * the dlm_reco_thread should detect this */ + list_for_each_entry(res, &resources, recovering) { + ret = dlm_send_one_lockres(dlm, res, mres, reco_master, + DLM_MRES_RECOVERY); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery state for dead node %u, ret=%d\n", dlm->name, + reco_master, dead_node, ret); + skip_all_done = 1; + break; + } + } + + /* move the resources back to the list */ + spin_lock(&dlm->spinlock); + list_splice_init(&resources, &dlm->reco.resources); + spin_unlock(&dlm->spinlock); + + if (!skip_all_done) { + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery all-done for dead node %u, ret=%d\n", + dlm->name, reco_master, dead_node, ret); + } + } +leave: + free_page((unsigned long)data); +} + + +static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) +{ + int ret, tmpret; + struct dlm_reco_data_done done_msg; + + memset(&done_msg, 0, sizeof(done_msg)); + done_msg.node_idx = dlm->node_num; + done_msg.dead_node = dead_node; + mlog(0, "sending DATA DONE message to %u, " + "my node=%u, dead node=%u\n", send_to, done_msg.node_idx, + done_msg.dead_node); + + ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, + sizeof(done_msg), send_to, &tmpret); + if (ret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, + dlm->key, send_to); + if (!dlm_is_host_down(ret)) { + BUG(); + } + } else + ret = tmpret; + return ret; +} + + +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; + struct dlm_reco_node_data *ndata = NULL; + int ret = -EINVAL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); + + mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), + "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); + + spin_lock(&dlm_reco_state_lock); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + if (ndata->node_num != done->node_idx) + continue; + + switch (ndata->state) { + /* should have moved beyond INIT but not to FINALIZE yet */ + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_DEAD: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(ML_ERROR, "bad ndata state for node %u:" + " state=%d\n", ndata->node_num, + ndata->state); + BUG(); + break; + /* these states are possible at this point, anywhere along + * the line of recovery */ + case DLM_RECO_NODE_DATA_DONE: + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(0, "node %u is DONE sending " + "recovery data!\n", + ndata->node_num); + + ndata->state = DLM_RECO_NODE_DATA_DONE; + ret = 0; + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + /* wake the recovery thread, some node is done */ + if (!ret) + dlm_kick_recovery_thread(dlm); + + if (ret < 0) + mlog(ML_ERROR, "failed to find recovery node data for node " + "%u\n", 
done->node_idx); + dlm_put(dlm); + + mlog(0, "leaving reco data done handler, ret=%d\n", ret); + return ret; +} + +static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, + struct list_head *list, + u8 dead_node) +{ + struct dlm_lock_resource *res, *next; + struct dlm_lock *lock; + + spin_lock(&dlm->spinlock); + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); + continue; + } + + if (res->owner == dead_node) { + mlog(0, "found lockres owned by dead node while " + "doing recovery for node %u. sending it.\n", + dead_node); + list_move_tail(&res->recovering, list); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "found UNKNOWN owner while doing recovery " + "for node %u. sending it.\n", dead_node); + list_move_tail(&res->recovering, list); + } + } + spin_unlock(&dlm->spinlock); +} + +static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res) +{ + int total_locks = 0; + struct list_head *iter, *queue = &res->granted; + int i; + + for (i=0; i<3; i++) { + list_for_each(iter, queue) + total_locks++; + queue++; + } + return total_locks; +} + + +static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres, + u8 send_to, + struct dlm_lock_resource *res, + int total_locks) +{ + u64 mig_cookie = be64_to_cpu(mres->mig_cookie); + int mres_total_locks = be32_to_cpu(mres->total_locks); + int sz, ret = 0, status = 0; + u8 orig_flags = mres->flags, + orig_master = mres->master; + + BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS); + if (!mres->num_locks) + return 0; + + sz = sizeof(struct dlm_migratable_lockres) + + (mres->num_locks * sizeof(struct dlm_migratable_lock)); + + /* add an all-done flag if we reached the last lock */ + orig_flags = mres->flags; + BUG_ON(total_locks > mres_total_locks); + if (total_locks == mres_total_locks) + mres->flags |= DLM_MRES_ALL_DONE; + + mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", + send_to); + + /* send it */ + ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, + sz, send_to, &status); + if (ret < 0) { + /* XXX: negative status is not handled. + * this will end up killing this node. 
*/ + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, + dlm->key, send_to); + } else { + /* might get an -ENOMEM back here */ + ret = status; + if (ret < 0) { + mlog_errno(ret); + + if (ret == -EFAULT) { + mlog(ML_ERROR, "node %u told me to kill " + "myself!\n", send_to); + BUG(); + } + } + } + + /* zero and reinit the message buffer */ + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, mres_total_locks, + mig_cookie, orig_flags, orig_master); + return ret; +} + +static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, + const char *lockname, int namelen, + int total_locks, u64 cookie, + u8 flags, u8 master) +{ + /* mres here is one full page */ + clear_page(mres); + mres->lockname_len = namelen; + memcpy(mres->lockname, lockname, namelen); + mres->num_locks = 0; + mres->total_locks = cpu_to_be32(total_locks); + mres->mig_cookie = cpu_to_be64(cookie); + mres->flags = flags; + mres->master = master; +} + +static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock, + struct dlm_migratable_lockres *mres, + int queue) +{ + if (!lock->lksb) + return; + + /* Ignore lvb in all locks in the blocked list */ + if (queue == DLM_BLOCKED_LIST) + return; + + /* Only consider lvbs in locks with granted EX or PR lock levels */ + if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE) + return; + + if (dlm_lvb_is_empty(mres->lvb)) { + memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); + return; + } + + /* Ensure the lvb copied for migration matches in other valid locks */ + if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN)) + return; + + mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, " + "node=%u\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->lockres->lockname.len, lock->lockres->lockname.name, + lock->ml.node); + dlm_print_one_lock_resource(lock->lockres); + BUG(); +} + +/* returns 1 if this lock fills the network structure, + * 0 otherwise */ +static int dlm_add_lock_to_array(struct dlm_lock *lock, + struct dlm_migratable_lockres *mres, int queue) +{ + struct dlm_migratable_lock *ml; + int lock_num = mres->num_locks; + + ml = &(mres->ml[lock_num]); + ml->cookie = lock->ml.cookie; + ml->type = lock->ml.type; + ml->convert_type = lock->ml.convert_type; + ml->highest_blocked = lock->ml.highest_blocked; + ml->list = queue; + if (lock->lksb) { + ml->flags = lock->lksb->flags; + dlm_prepare_lvb_for_migration(lock, mres, queue); + } + ml->node = lock->ml.node; + mres->num_locks++; + /* we reached the max, send this network message */ + if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS) + return 1; + return 0; +} + +static void dlm_add_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres) +{ + struct dlm_lock dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.ml.cookie = 0; + dummy.ml.type = LKM_IVMODE; + dummy.ml.convert_type = LKM_IVMODE; + dummy.ml.highest_blocked = LKM_IVMODE; + dummy.lksb = NULL; + dummy.ml.node = dlm->node_num; + dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST); +} + +static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lock *ml, + u8 *nodenum) +{ + if (unlikely(ml->cookie == 0 && + ml->type == LKM_IVMODE && + ml->convert_type == LKM_IVMODE && + ml->highest_blocked == LKM_IVMODE && + ml->list == DLM_BLOCKED_LIST)) { + *nodenum = ml->node; + return 1; + } + return 0; +} + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct 
dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, u8 flags) +{ + struct list_head *queue; + int total_locks, i; + u64 mig_cookie = 0; + struct dlm_lock *lock; + int ret = 0; + + BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + mlog(0, "sending to %u\n", send_to); + + total_locks = dlm_num_locks_in_lockres(res); + if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) { + /* rare, but possible */ + mlog(0, "argh. lockres has %d locks. this will " + "require more than one network packet to " + "migrate\n", total_locks); + mig_cookie = dlm_get_next_mig_cookie(); + } + + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, total_locks, + mig_cookie, flags, res->owner); + + total_locks = 0; + for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry(lock, queue, list) { + /* add another lock. */ + total_locks++; + if (!dlm_add_lock_to_array(lock, mres, i)) + continue; + + /* this filled the lock message, + * we must send it immediately. */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, + res, total_locks); + if (ret < 0) + goto error; + } + } + if (total_locks == 0) { + /* send a dummy lock to indicate a mastery reference only */ + mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n", + dlm->name, res->lockname.len, res->lockname.name, + send_to, flags & DLM_MRES_RECOVERY ? "recovery" : + "migration"); + dlm_add_dummy_lock(dlm, mres); + } + /* flush any remaining locks */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); + if (ret < 0) + goto error; + return ret; + +error: + mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", + dlm->name, ret); + if (!dlm_is_host_down(ret)) + BUG(); + mlog(0, "%s: node %u went down while sending %s " + "lockres %.*s\n", dlm->name, send_to, + flags & DLM_MRES_RECOVERY ? "recovery" : "migration", + res->lockname.len, res->lockname.name); + return ret; +} + + + +/* + * this message will contain no more than one page worth of + * recovery data, and it will work on only one lockres. + * there may be many locks in this page, and we may need to wait + * for additional packets to complete all the locks (rare, but + * possible). + */ +/* + * NOTE: the allocation error cases here are scary + * we really cannot afford to fail an alloc in recovery + * do we spin? returning an error only delays the problem really + */ + +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_migratable_lockres *mres = + (struct dlm_migratable_lockres *)msg->buf; + int ret = 0; + u8 real_master; + u8 extra_refs = 0; + char *buf = NULL; + struct dlm_work_item *item = NULL; + struct dlm_lock_resource *res = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + real_master = mres->master; + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* cannot migrate a lockres with no master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + } + + mlog(0, "%s message received from node %u\n", + (mres->flags & DLM_MRES_RECOVERY) ? + "recovery" : "migration", mres->master); + if (mres->flags & DLM_MRES_ALL_DONE) + mlog(0, "all done flag. all lockres data received!\n"); + + ret = -ENOMEM; + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!buf || !item) + goto leave; + + /* lookup the lock to see if we have a secondary queue for this + * already... 
just add the locks in and this will have its owner + * and RECOVERY flag changed when it completes. */ + res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + if (res) { + /* this will get a ref on res */ + /* mark it as recovering/migrating and hash it */ + spin_lock(&res->spinlock); + if (mres->flags & DLM_MRES_RECOVERY) { + res->state |= DLM_LOCK_RES_RECOVERING; + } else { + if (res->state & DLM_LOCK_RES_MIGRATING) { + /* this is at least the second + * lockres message */ + mlog(0, "lock %.*s is already migrating\n", + mres->lockname_len, + mres->lockname); + } else if (res->state & DLM_LOCK_RES_RECOVERING) { + /* caller should BUG */ + mlog(ML_ERROR, "node is attempting to migrate " + "lock %.*s, but marked as recovering!\n", + mres->lockname_len, mres->lockname); + ret = -EFAULT; + spin_unlock(&res->spinlock); + goto leave; + } + res->state |= DLM_LOCK_RES_MIGRATING; + } + spin_unlock(&res->spinlock); + } else { + /* need to allocate, just like if it was + * mastered here normally */ + res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); + if (!res) + goto leave; + + /* to match the ref that we would have gotten if + * dlm_lookup_lockres had succeeded */ + dlm_lockres_get(res); + + /* mark it as recovering/migrating and hash it */ + if (mres->flags & DLM_MRES_RECOVERY) + res->state |= DLM_LOCK_RES_RECOVERING; + else + res->state |= DLM_LOCK_RES_MIGRATING; + + spin_lock(&dlm->spinlock); + __dlm_insert_lockres(dlm, res); + spin_unlock(&dlm->spinlock); + + /* Add an extra ref for this lock-less lockres lest the + * dlm_thread purges it before we get the chance to add + * locks to it */ + dlm_lockres_get(res); + + /* There are three refs that need to be put. + * 1. Taken above. + * 2. kref_init in dlm_new_lockres()->dlm_init_lockres(). + * 3. dlm_lookup_lockres() + * The first one is handled at the end of this function. The + * other two are handled in the worker thread after locks have + * been attached. Yes, we don't wait for purge time to match + * kref_init. The lockres will still have atleast one ref + * added because it is in the hash __dlm_insert_lockres() */ + extra_refs++; + + /* now that the new lockres is inserted, + * make it usable by other processes */ + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + + /* at this point we have allocated everything we need, + * and we have a hashed lockres with an extra ref and + * the proper res->state flags. */ + ret = 0; + spin_lock(&res->spinlock); + /* drop this either when master requery finds a different master + * or when a lock is added by the recovery worker */ + dlm_lockres_grab_inflight_ref(dlm, res); + if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* migration cannot have an unknown master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + mlog(0, "recovery has passed me a lockres with an " + "unknown owner.. 
will need to requery: " + "%.*s\n", mres->lockname_len, mres->lockname); + } else { + /* take a reference now to pin the lockres, drop it + * when locks are added in the worker */ + dlm_change_lockres_owner(dlm, res, dlm->node_num); + } + spin_unlock(&res->spinlock); + + /* queue up work for dlm_mig_lockres_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */ + dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf); + item->u.ml.lockres = res; /* already have a ref */ + item->u.ml.real_master = real_master; + item->u.ml.extra_ref = extra_refs; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + +leave: + /* One extra ref taken needs to be put here */ + if (extra_refs) + dlm_lockres_put(res); + + dlm_put(dlm); + if (ret < 0) { + if (buf) + kfree(buf); + if (item) + kfree(item); + mlog_errno(ret); + } + + return ret; +} + + +static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_migratable_lockres *mres; + int ret = 0; + struct dlm_lock_resource *res; + u8 real_master; + u8 extra_ref; + + dlm = item->dlm; + mres = (struct dlm_migratable_lockres *)data; + + res = item->u.ml.lockres; + real_master = item->u.ml.real_master; + extra_ref = item->u.ml.extra_ref; + + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* this case is super-rare. only occurs if + * node death happens during migration. */ +again: + ret = dlm_lockres_master_requery(dlm, res, &real_master); + if (ret < 0) { + mlog(0, "dlm_lockres_master_requery ret=%d\n", + ret); + goto again; + } + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lockres %.*s not claimed. " + "this node will take it.\n", + res->lockname.len, res->lockname.name); + } else { + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + mlog(0, "master needs to respond to sender " + "that node %u still owns %.*s\n", + real_master, res->lockname.len, + res->lockname.name); + /* cannot touch this lockres */ + goto leave; + } + } + + ret = dlm_process_recovery_data(dlm, res, mres); + if (ret < 0) + mlog(0, "dlm_process_recovery_data returned %d\n", ret); + else + mlog(0, "dlm_process_recovery_data succeeded\n"); + + if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) == + (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) { + ret = dlm_finish_migration(dlm, res, mres->master); + if (ret < 0) + mlog_errno(ret); + } + +leave: + /* See comment in dlm_mig_lockres_handler() */ + if (res) { + if (extra_ref) + dlm_lockres_put(res); + dlm_lockres_put(res); + } + kfree(data); +} + + + +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master) +{ + struct dlm_node_iter iter; + int nodenum; + int ret = 0; + + *real_master = DLM_LOCK_RES_OWNER_UNKNOWN; + + /* we only reach here if one of the two nodes in a + * migration died while the migration was in progress. + * at this point we need to requery the master. we + * know that the new_master got as far as creating + * an mle on at least one node, but we do not know + * if any nodes had actually cleared the mle and set + * the master to the new_master. 
the old master + * is supposed to set the owner to UNKNOWN in the + * event of a new_master death, so the only possible + * responses that we can get from nodes here are + * that the master is new_master, or that the master + * is UNKNOWN. + * if all nodes come back with UNKNOWN then we know + * the lock needs remastering here. + * if any node comes back with a valid master, check + * to see if that master is the one that we are + * recovering. if so, then the new_master died and + * we need to remaster this lock. if not, then the + * new_master survived and that node will respond to + * other nodes about the owner. + * if there is an owner, this node needs to dump this + * lockres and alert the sender that this lockres + * was rejected. */ + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + /* do not send to self */ + if (nodenum == dlm->node_num) + continue; + ret = dlm_do_master_requery(dlm, res, nodenum, real_master); + if (ret < 0) { + mlog_errno(ret); + if (!dlm_is_host_down(ret)) + BUG(); + /* host is down, so answer for that node would be + * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ + } + if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lock master is %u\n", *real_master); + break; + } + } + return ret; +} + + +int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 nodenum, u8 *real_master) +{ + int ret = -EINVAL; + struct dlm_master_requery req; + int status = DLM_LOCK_RES_OWNER_UNKNOWN; + + memset(&req, 0, sizeof(req)); + req.node_idx = dlm->node_num; + req.namelen = res->lockname.len; + memcpy(req.name, res->lockname.name, res->lockname.len); + + ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, + &req, sizeof(req), nodenum, &status); + /* XXX: negative status not handled properly here. */ + if (ret < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, + dlm->key, nodenum); + else { + BUG_ON(status < 0); + BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); + *real_master = (u8) (status & 0xff); + mlog(0, "node %u responded to master requery with %u\n", + nodenum, *real_master); + ret = 0; + } + return ret; +} + + +/* this function cannot error, so unless the sending + * or receiving of the message failed, the owner can + * be trusted */ +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; + struct dlm_lock_resource *res = NULL; + unsigned int hash; + int master = DLM_LOCK_RES_OWNER_UNKNOWN; + u32 flags = DLM_ASSERT_MASTER_REQUERY; + + if (!dlm_grab(dlm)) { + /* since the domain has gone away on this + * node, the proper response is UNKNOWN */ + return master; + } + + hash = dlm_lockid_hash(req->name, req->namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); + if (res) { + spin_lock(&res->spinlock); + master = res->owner; + if (master == dlm->node_num) { + int ret = dlm_dispatch_assert_master(dlm, res, + 0, 0, flags); + if (ret < 0) { + mlog_errno(-ENOMEM); + /* retry!? */ + BUG(); + } + } else /* put.. 
incase we are not the master */ + dlm_lockres_put(res); + spin_unlock(&res->spinlock); + } + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + return master; +} + +static inline struct list_head * +dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num) +{ + struct list_head *ret; + BUG_ON(list_num < 0); + BUG_ON(list_num > 2); + ret = &(res->granted); + ret += list_num; + return ret; +} +/* TODO: do ast flush business + * TODO: do MIGRATING and RECOVERING spinning + */ + +/* +* NOTE about in-flight requests during migration: +* +* Before attempting the migrate, the master has marked the lockres as +* MIGRATING and then flushed all of its pending ASTS. So any in-flight +* requests either got queued before the MIGRATING flag got set, in which +* case the lock data will reflect the change and a return message is on +* the way, or the request failed to get in before MIGRATING got set. In +* this case, the caller will be told to spin and wait for the MIGRATING +* flag to be dropped, then recheck the master. +* This holds true for the convert, cancel and unlock cases, and since lvb +* updates are tied to these same messages, it applies to lvb updates as +* well. For the lock case, there is no way a lock can be on the master +* queue and not be on the secondary queue since the lock is always added +* locally first. This means that the new target node will never be sent +* a lock that he doesn't already have on the list. +* In total, this means that the local lock is correct and should not be +* updated to match the one sent by the master. Any messages sent back +* from the master before the MIGRATING flag will bring the lock properly +* up-to-date, and the change will be ordered properly for the waiter. +* We will *not* attempt to modify the lock underneath the waiter. +*/ + +static int dlm_process_recovery_data(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres) +{ + struct dlm_migratable_lock *ml; + struct list_head *queue; + struct list_head *tmpq = NULL; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + int ret = 0; + int i, j, bad; + struct dlm_lock *lock = NULL; + u8 from = O2NM_MAX_NODES; + unsigned int added = 0; + __be64 c; + + mlog(0, "running %d locks for this lockres\n", mres->num_locks); + for (i=0; i<mres->num_locks; i++) { + ml = &(mres->ml[i]); + + if (dlm_is_dummy_lock(dlm, ml, &from)) { + /* placeholder, just need to set the refmap bit */ + BUG_ON(mres->num_locks != 1); + mlog(0, "%s:%.*s: dummy lock for %u\n", + dlm->name, mres->lockname_len, mres->lockname, + from); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(from, res); + spin_unlock(&res->spinlock); + added++; + break; + } + BUG_ON(ml->highest_blocked != LKM_IVMODE); + newlock = NULL; + lksb = NULL; + + queue = dlm_list_num_to_pointer(res, ml->list); + tmpq = NULL; + + /* if the lock is for the local node it needs to + * be moved to the proper location within the queue. + * do not allocate a new lock structure. */ + if (ml->node == dlm->node_num) { + /* MIGRATION ONLY! */ + BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + + spin_lock(&res->spinlock); + for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { + tmpq = dlm_list_idx_to_ptr(res, j); + list_for_each_entry(lock, tmpq, list) { + if (lock->ml.cookie != ml->cookie) + lock = NULL; + else + break; + } + if (lock) + break; + } + + /* lock is always created locally first, and + * destroyed locally last. 
it must be on the list */ + if (!lock) { + c = ml->cookie; + mlog(ML_ERROR, "Could not find local lock " + "with cookie %u:%llu, node %u, " + "list %u, flags 0x%x, type %d, " + "conv %d, highest blocked %d\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + ml->node, ml->list, ml->flags, ml->type, + ml->convert_type, ml->highest_blocked); + __dlm_print_one_lock_resource(res); + BUG(); + } + + if (lock->ml.node != ml->node) { + c = lock->ml.cookie; + mlog(ML_ERROR, "Mismatched node# in lock " + "cookie %u:%llu, name %.*s, node %u\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + res->lockname.len, res->lockname.name, + lock->ml.node); + c = ml->cookie; + mlog(ML_ERROR, "Migrate lock cookie %u:%llu, " + "node %u, list %u, flags 0x%x, type %d, " + "conv %d, highest blocked %d\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + ml->node, ml->list, ml->flags, ml->type, + ml->convert_type, ml->highest_blocked); + __dlm_print_one_lock_resource(res); + BUG(); + } + + if (tmpq != queue) { + c = ml->cookie; + mlog(0, "Lock cookie %u:%llu was on list %u " + "instead of list %u for %.*s\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + j, ml->list, res->lockname.len, + res->lockname.name); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); + continue; + } + + /* see NOTE above about why we do not update + * to match the master here */ + + /* move the lock to its proper place */ + /* do not alter lock refcount. switching lists. */ + list_move_tail(&lock->list, queue); + spin_unlock(&res->spinlock); + added++; + + mlog(0, "just reordered a local lock!\n"); + continue; + } + + /* lock is for another node. */ + newlock = dlm_new_lock(ml->type, ml->node, + be64_to_cpu(ml->cookie), NULL); + if (!newlock) { + ret = -ENOMEM; + goto leave; + } + lksb = newlock->lksb; + dlm_lock_attach_lockres(newlock, res); + + if (ml->convert_type != LKM_IVMODE) { + BUG_ON(queue != &res->converting); + newlock->ml.convert_type = ml->convert_type; + } + lksb->flags |= (ml->flags & + (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + + if (ml->type == LKM_NLMODE) + goto skip_lvb; + + if (!dlm_lvb_is_empty(mres->lvb)) { + if (lksb->flags & DLM_LKSB_PUT_LVB) { + /* other node was trying to update + * lvb when node died. recreate the + * lksb with the updated lvb. */ + memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + /* the lock resource lvb update must happen + * NOW, before the spinlock is dropped. + * we no longer wait for the AST to update + * the lvb. */ + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); + } else { + /* otherwise, the node is sending its + * most recent valid lvb info */ + BUG_ON(ml->type != LKM_EXMODE && + ml->type != LKM_PRMODE); + if (!dlm_lvb_is_empty(res->lvb) && + (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + int i; + mlog(ML_ERROR, "%s:%.*s: received bad " + "lvb! type=%d\n", dlm->name, + res->lockname.len, + res->lockname.name, ml->type); + printk("lockres lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", res->lvb[i]); + printk("]\nmigrated lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", mres->lvb[i]); + printk("]\n"); + dlm_print_one_lock_resource(res); + BUG(); + } + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); + } + } +skip_lvb: + + /* NOTE: + * wrt lock queue ordering and recovery: + * 1. order of locks on granted queue is + * meaningless. + * 2. order of locks on converting queue is + * LOST with the node death. sorry charlie. + * 3. 
order of locks on the blocked queue is + * also LOST. + * order of locks does not affect integrity, it + * just means that a lock request may get pushed + * back in line as a result of the node death. + * also note that for a given node the lock order + * for its secondary queue locks is preserved + * relative to each other, but clearly *not* + * preserved relative to locks from other nodes. + */ + bad = 0; + spin_lock(&res->spinlock); + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == ml->cookie) { + c = lock->ml.cookie; + mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " + "exists on this lockres!\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c))); + + mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " + "node=%u, cookie=%u:%llu, queue=%d\n", + ml->type, ml->convert_type, ml->node, + dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)), + ml->list); + + __dlm_print_one_lock_resource(res); + bad = 1; + break; + } + } + if (!bad) { + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + mlog(0, "%s:%.*s: added lock for node %u, " + "setting refmap bit\n", dlm->name, + res->lockname.len, res->lockname.name, ml->node); + dlm_lockres_set_refmap_bit(ml->node, res); + added++; + } + spin_unlock(&res->spinlock); + } + mlog(0, "done running all the locks\n"); + +leave: + /* balance the ref taken when the work was queued */ + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + + if (ret < 0) { + mlog_errno(ret); + if (newlock) + dlm_lock_put(newlock); + } + + return ret; +} + +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int i; + struct list_head *queue; + struct dlm_lock *lock, *next; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + res->state |= DLM_LOCK_RES_RECOVERING; + if (!list_empty(&res->recovering)) { + mlog(0, + "Recovering res %s:%.*s, is already on recovery list!\n", + dlm->name, res->lockname.len, res->lockname.name); + list_del_init(&res->recovering); + dlm_lockres_put(res); + } + /* We need to hold a reference while on the recovery list */ + dlm_lockres_get(res); + list_add_tail(&res->recovering, &dlm->reco.resources); + + /* find any pending locks and put them back on proper list */ + for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + dlm_lock_get(lock); + if (lock->convert_pending) { + /* move converting lock back to granted */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with convert pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_revert_pending_convert(res, lock); + lock->convert_pending = 0; + } else if (lock->lock_pending) { + /* remove pending lock requests completely */ + BUG_ON(i != DLM_BLOCKED_LIST); + mlog(0, "node died with lock pending " + "on %.*s. remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + /* lock will be floating until ref in + * dlmlock_remote is freed after the network + * call returns. ok for it to not be on any + * list since no ast can be called + * (the master is dead). 
*/ + dlm_revert_pending_lock(res, lock); + lock->lock_pending = 0; + } else if (lock->unlock_pending) { + /* if an unlock was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master. note that the dlm_unlock + * call is still responsible for calling + * the unlockast. that will happen after + * the network call times out. for now, + * just move lists to prepare the new + * recovery master. */ + BUG_ON(i != DLM_GRANTED_LIST); + mlog(0, "node died with unlock pending " + "on %.*s. remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_unlock(res, lock); + lock->unlock_pending = 0; + } else if (lock->cancel_pending) { + /* if a cancel was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with cancel pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_cancel(res, lock); + lock->cancel_pending = 0; + } + dlm_lock_put(lock); + } + } +} + + + +/* removes all recovered locks from the recovery list. + * sets the res->owner to the new master. + * unsets the RECOVERY flag and wakes waiters. */ +static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, + u8 dead_node, u8 new_master) +{ + int i; + struct hlist_node *hash_iter; + struct hlist_head *bucket; + struct dlm_lock_resource *res, *next; + + assert_spin_locked(&dlm->spinlock); + + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { + if (res->owner == dead_node) { + list_del_init(&res->recovering); + spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + if (__dlm_lockres_has_locks(res)) + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + } + } + + /* this will become unnecessary eventually, but + * for now we need to run the whole hash, clear + * the RECOVERING state and set the owner + * if necessary */ + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_lockres_hash(dlm, i); + hlist_for_each_entry(res, hash_iter, bucket, hash_node) { + if (res->state & DLM_LOCK_RES_RECOVERING) { + if (res->owner == dead_node) { + mlog(0, "(this=%u) res %.*s owner=%u " + "was not on recovering list, but " + "clearing state anyway\n", + dlm->node_num, res->lockname.len, + res->lockname.name, new_master); + } else if (res->owner == dlm->node_num) { + mlog(0, "(this=%u) res %.*s owner=%u " + "was not on recovering list, " + "owner is THIS node, clearing\n", + dlm->node_num, res->lockname.len, + res->lockname.name, new_master); + } else + continue; + + if (!list_empty(&res->recovering)) { + mlog(0, "%s:%.*s: lockres was " + "marked RECOVERING, owner=%u\n", + dlm->name, res->lockname.len, + res->lockname.name, res->owner); + list_del_init(&res->recovering); + dlm_lockres_put(res); + } + spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + if (__dlm_lockres_has_locks(res)) + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + } + } +} + +static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) +{ + if (local) { + if 
(lock->ml.type != LKM_EXMODE && + lock->ml.type != LKM_PRMODE) + return 1; + } else if (lock->ml.type == LKM_EXMODE) + return 1; + return 0; +} + +static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct list_head *queue; + struct dlm_lock *lock; + int blank_lvb = 0, local = 0; + int i; + u8 search_node; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (res->owner == dlm->node_num) + /* if this node owned the lockres, and if the dead node + * had an EX when he died, blank out the lvb */ + search_node = dead_node; + else { + /* if this is a secondary lockres, and we had no EX or PR + * locks granted, we can no longer trust the lvb */ + search_node = dlm->node_num; + local = 1; /* check local state for valid lvb */ + } + + for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry(lock, queue, list) { + if (lock->ml.node == search_node) { + if (dlm_lvb_needs_invalidation(lock, local)) { + /* zero the lksb lvb and lockres lvb */ + blank_lvb = 1; + memset(lock->lksb->lvb, 0, DLM_LVB_LEN); + } + } + } + } + + if (blank_lvb) { + mlog(0, "clearing %.*s lvb, dead node %u had EX\n", + res->lockname.len, res->lockname.name, dead_node); + memset(res->lvb, 0, DLM_LVB_LEN); + } +} + +static void dlm_free_dead_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct dlm_lock *lock, *next; + unsigned int freed = 0; + + /* this node is the lockres master: + * 1) remove any stale locks for the dead node + * 2) if the dead node had an EX when he died, blank out the lvb + */ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* We do two dlm_lock_put(). One for removing from list and the other is + * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ + + /* TODO: check pending_asts, pending_basts here */ + list_for_each_entry_safe(lock, next, &res->granted, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ + dlm_lock_put(lock); + freed++; + } + } + list_for_each_entry_safe(lock, next, &res->converting, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ + dlm_lock_put(lock); + freed++; + } + } + list_for_each_entry_safe(lock, next, &res->blocked, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ + dlm_lock_put(lock); + freed++; + } + } + + if (freed) { + mlog(0, "%s:%.*s: freed %u locks for dead node %u, " + "dropping ref from lockres\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + if(!test_bit(dead_node, res->refmap)) { + mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, " + "but ref was not set\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + __dlm_print_one_lock_resource(res); + } + dlm_lockres_clear_refmap_bit(dead_node, res); + } else if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", dlm->name, + res->lockname.len, res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dead_node, res); + } + + /* do not kick thread yet */ + __dlm_dirty_lockres(dlm, res); +} + +/* if this node is the recovery master, and there are no 
+ * locks for a given lockres owned by this node that are in + * either PR or EX mode, zero out the lvb before requesting. + * + */ + + +static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct hlist_node *iter; + struct dlm_lock_resource *res; + int i; + struct hlist_head *bucket; + struct dlm_lock *lock; + + + /* purge any stale mles */ + dlm_clean_master_list(dlm, dead_node); + + /* + * now clean up all lock resources. there are two rules: + * + * 1) if the dead node was the master, move the lockres + * to the recovering list. set the RECOVERING flag. + * this lockres needs to be cleaned up before it can + * be used further. + * + * 2) if this node was the master, remove all locks from + * each of the lockres queues that were owned by the + * dead node. once recovery finishes, the dlm thread + * can be kicked again to see if any ASTs or BASTs + * need to be fired as a result. + */ + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_lockres_hash(dlm, i); + hlist_for_each_entry(res, iter, bucket, hash_node) { + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); + continue; + } + spin_lock(&res->spinlock); + /* zero the lvb if necessary */ + dlm_revalidate_lvb(dlm, res, dead_node); + if (res->owner == dead_node) { + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(ML_NOTICE, "Ignore %.*s for " + "recovery as it is being freed\n", + res->lockname.len, + res->lockname.name); + } else + dlm_move_lockres_to_recovery_list(dlm, + res); + + } else if (res->owner == dlm->node_num) { + dlm_free_dead_locks(dlm, res, dead_node); + __dlm_lockres_calc_usage(dlm, res); + } + spin_unlock(&res->spinlock); + } + } + +} + +static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) +{ + assert_spin_locked(&dlm->spinlock); + + if (dlm->reco.new_master == idx) { + mlog(0, "%s: recovery master %d just died\n", + dlm->name, idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + /* finalize1 was reached, so it is safe to clear + * the new_master and dead_node. that recovery + * is complete. */ + mlog(0, "%s: dead master %d had reached " + "finalize1 state, clearing\n", dlm->name, idx); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + } + } + + /* Clean up join state on node death. */ + if (dlm->joining_node == idx) { + mlog(0, "Clearing join state for node %u\n", idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + } + + /* check to see if the node is already considered dead */ + if (!test_bit(idx, dlm->live_nodes_map)) { + mlog(0, "for domain %s, node %d is already dead. " + "another node likely did recovery already.\n", + dlm->name, idx); + return; + } + + /* check to see if we do not care about this node */ + if (!test_bit(idx, dlm->domain_map)) { + /* This also catches the case that we get a node down + * but haven't joined the domain yet. 
*/ + mlog(0, "node %u already removed from domain!\n", idx); + return; + } + + clear_bit(idx, dlm->live_nodes_map); + + /* make sure local cleanup occurs before the heartbeat events */ + if (!test_bit(idx, dlm->recovery_map)) + dlm_do_local_recovery_cleanup(dlm, idx); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, idx, 0); + + mlog(0, "node %u being removed from domain map!\n", idx); + clear_bit(idx, dlm->domain_map); + clear_bit(idx, dlm->exit_domain_map); + /* wake up migration waiters if a node goes down. + * perhaps later we can genericize this for other waiters. */ + wake_up(&dlm->migration_wq); + + if (test_bit(idx, dlm->recovery_map)) + mlog(0, "domain %s, node %u already added " + "to recovery map!\n", dlm->name, idx); + else + set_bit(idx, dlm->recovery_map); +} + +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + /* + * This will notify any dlm users that a node in our domain + * went away without notifying us first. + */ + if (test_bit(idx, dlm->domain_map)) + dlm_fire_domain_eviction_callbacks(dlm, idx); + + spin_lock(&dlm->spinlock); + __dlm_hb_node_down(dlm, idx); + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + spin_lock(&dlm->spinlock); + set_bit(idx, dlm->live_nodes_map); + /* do NOT notify mle attached to the heartbeat events. + * new nodes are not interesting in mastery until joined. */ + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +static void dlm_reco_ast(void *astdata) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_bast(void *astdata, int blocked_type) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) +{ + mlog(0, "unlockast for recovery lock fired!\n"); +} + +/* + * dlm_pick_recovery_master will continually attempt to use + * dlmlock() on the special "$RECOVERY" lockres with the + * LKM_NOQUEUE flag to get an EX. every thread that enters + * this function on each node racing to become the recovery + * master will not stop attempting this until either: + * a) this node gets the EX (and becomes the recovery master), + * or b) dlm->reco.new_master gets set to some nodenum + * != O2NM_INVALID_NODE_NUM (another node will do the reco). + * so each time a recovery master is needed, the entire cluster + * will sync at this point. if the new master dies, that will + * be detected in dlm_do_recovery */ +static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) +{ + enum dlm_status ret; + struct dlm_lockstatus lksb; + int status = -EINVAL; + + mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", + dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); +again: + memset(&lksb, 0, sizeof(lksb)); + + ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, + DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN, + dlm_reco_ast, dlm, dlm_reco_bast); + + mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", + dlm->name, ret, lksb.status); + + if (ret == DLM_NORMAL) { + mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", + dlm->name, dlm->node_num); + + /* got the EX lock. 
check to see if another node + * just became the reco master */ + if (dlm_reco_master_ready(dlm)) { + mlog(0, "%s: got reco EX lock, but %u will " + "do the recovery\n", dlm->name, + dlm->reco.new_master); + status = -EEXIST; + } else { + status = 0; + + /* see if recovery was already finished elsewhere */ + spin_lock(&dlm->spinlock); + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + status = -EINVAL; + mlog(0, "%s: got reco EX lock, but " + "node got recovered already\n", dlm->name); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + mlog(ML_ERROR, "%s: new master is %u " + "but no dead node!\n", + dlm->name, dlm->reco.new_master); + BUG(); + } + } + spin_unlock(&dlm->spinlock); + } + + /* if this node has actually become the recovery master, + * set the master and send the messages to begin recovery */ + if (!status) { + mlog(0, "%s: dead=%u, this=%u, sending " + "begin_reco now\n", dlm->name, + dlm->reco.dead_node, dlm->node_num); + status = dlm_send_begin_reco_message(dlm, + dlm->reco.dead_node); + /* this always succeeds */ + BUG_ON(status); + + /* set the new_master to this node */ + spin_lock(&dlm->spinlock); + dlm_set_reco_master(dlm, dlm->node_num); + spin_unlock(&dlm->spinlock); + } + + /* recovery lock is a special case. ast will not get fired, + * so just go ahead and unlock it. */ + ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); + if (ret == DLM_DENIED) { + mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n"); + ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm); + } + if (ret != DLM_NORMAL) { + /* this would really suck. this could only happen + * if there was a network error during the unlock + * because of node death. this means the unlock + * is actually "done" and the lock structure is + * even freed. we can continue, but only + * because this specific lock name is special. */ + mlog(ML_ERROR, "dlmunlock returned %d\n", ret); + } + } else if (ret == DLM_NOTQUEUED) { + mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", + dlm->name, dlm->node_num); + /* another node is master. 
wait on + * reco.new_master != O2NM_INVALID_NODE_NUM + * for at most one second */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_reco_master_ready(dlm), + msecs_to_jiffies(1000)); + if (!dlm_reco_master_ready(dlm)) { + mlog(0, "%s: reco master taking awhile\n", + dlm->name); + goto again; + } + /* another node has informed this one that it is reco master */ + mlog(0, "%s: reco master %u is ready to recover %u\n", + dlm->name, dlm->reco.new_master, dlm->reco.dead_node); + status = -EEXIST; + } else if (ret == DLM_RECOVERING) { + mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", + dlm->name, dlm->node_num); + goto again; + } else { + struct dlm_lock_resource *res; + + /* dlmlock returned something other than NOTQUEUED or NORMAL */ + mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), " + "lksb.status=%s\n", dlm->name, dlm_errname(ret), + dlm_errname(lksb.status)); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + BUG(); + } + + return status; +} + +static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_begin_reco br; + int ret = 0; + struct dlm_node_iter iter; + int nodenum; + int status; + + mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + clear_bit(dead_node, iter.node_map); + + memset(&br, 0, sizeof(br)); + br.node_idx = dlm->node_num; + br.dead_node = dead_node; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = 0; + if (nodenum == dead_node) { + mlog(0, "not sending begin reco to dead node " + "%u\n", dead_node); + continue; + } + if (nodenum == dlm->node_num) { + mlog(0, "not sending begin reco to self\n"); + continue; + } +retry: + ret = -EINVAL; + mlog(0, "attempting to send begin reco msg to %d\n", + nodenum); + ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, + &br, sizeof(br), nodenum, &status); + /* negative status is handled ok by caller here */ + if (ret >= 0) + ret = status; + if (dlm_is_host_down(ret)) { + /* node is down. not involved in recovery + * so just keep going */ + mlog(ML_NOTICE, "%s: node %u was down when sending " + "begin reco msg (%d)\n", dlm->name, nodenum, ret); + ret = 0; + } + + /* + * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8, + * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN. + * We are handling both for compatibility reasons. + */ + if (ret == -EAGAIN || ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + msleep(100); + goto retry; + } + if (ret < 0) { + struct dlm_lock_resource *res; + + /* this is now a serious problem, possibly ENOMEM + * in the network stack. 
must retry */ + mlog_errno(ret); + mlog(ML_ERROR, "begin reco of dlm %s to node %u " + "returned %d\n", dlm->name, nodenum, ret); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + /* sleep for a bit in hopes that we can avoid + * another ENOMEM */ + msleep(100); + goto retry; + } + } + + return ret; +} + +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + spin_lock(&dlm->spinlock); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(0, "%s: node %u wants to recover node %u (%u:%u) " + "but this node is in finalize state, waiting on finalize2\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + spin_unlock(&dlm->spinlock); + return -EAGAIN; + } + spin_unlock(&dlm->spinlock); + + mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + + dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); + + spin_lock(&dlm->spinlock); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + if (test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "%s: new_master %u died, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + } else { + mlog(0, "%s: new_master %u NOT DEAD, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + /* may not have seen the new master as dead yet */ + } + } + if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: dead_node previously set to %u, " + "node %u changing it to %u\n", dlm->name, + dlm->reco.dead_node, br->node_idx, br->dead_node); + } + dlm_set_reco_master(dlm, br->node_idx); + dlm_set_reco_dead_node(dlm, br->dead_node); + if (!test_bit(br->dead_node, dlm->recovery_map)) { + mlog(0, "recovery master %u sees %u as dead, but this " + "node has not yet. 
marking %u as dead\n", + br->node_idx, br->dead_node, br->dead_node); + if (!test_bit(br->dead_node, dlm->domain_map) || + !test_bit(br->dead_node, dlm->live_nodes_map)) + mlog(0, "%u not in domain/live_nodes map " + "so setting it in reco map manually\n", + br->dead_node); + /* force the recovery cleanup in __dlm_hb_node_down + * both of these will be cleared in a moment */ + set_bit(br->dead_node, dlm->domain_map); + set_bit(br->dead_node, dlm->live_nodes_map); + __dlm_hb_node_down(dlm, br->dead_node); + } + spin_unlock(&dlm->spinlock); + + dlm_kick_recovery_thread(dlm); + + mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + + dlm_put(dlm); + return 0; +} + +#define DLM_FINALIZE_STAGE2 0x01 +static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) +{ + int ret = 0; + struct dlm_finalize_reco fr; + struct dlm_node_iter iter; + int nodenum; + int status; + int stage = 1; + + mlog(0, "finishing recovery for node %s:%u, " + "stage %d\n", dlm->name, dlm->reco.dead_node, stage); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + +stage2: + memset(&fr, 0, sizeof(fr)); + fr.node_idx = dlm->node_num; + fr.dead_node = dlm->reco.dead_node; + if (stage == 2) + fr.flags |= DLM_FINALIZE_STAGE2; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + if (nodenum == dlm->node_num) + continue; + ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, + &fr, sizeof(fr), nodenum, &status); + if (ret >= 0) + ret = status; + if (ret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG, + dlm->key, nodenum); + if (dlm_is_host_down(ret)) { + /* this has no effect on this recovery + * session, so set the status to zero to + * finish out the last recovery */ + mlog(ML_ERROR, "node %u went down after this " + "node finished recovery.\n", nodenum); + ret = 0; + continue; + } + break; + } + } + if (stage == 1) { + /* reset the node_iter back to the top and send finalize2 */ + iter.curnode = -1; + stage = 2; + goto stage2; + } + + return ret; +} + +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + int stage = 1; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + if (fr->flags & DLM_FINALIZE_STAGE2) + stage = 2; + + mlog(0, "%s: node %u finalizing recovery stage%d of " + "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, + fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); + + spin_lock(&dlm->spinlock); + + if (dlm->reco.new_master != fr->node_idx) { + mlog(ML_ERROR, "node %u sent recovery finalize msg, but node " + "%u is supposed to be the new master, dead=%u\n", + fr->node_idx, dlm->reco.new_master, fr->dead_node); + BUG(); + } + if (dlm->reco.dead_node != fr->dead_node) { + mlog(ML_ERROR, "node %u sent recovery finalize msg for dead " + "node %u, but node %u is supposed to be dead\n", + fr->node_idx, fr->dead_node, dlm->reco.dead_node); + BUG(); + } + + switch (stage) { + case 1: + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(ML_ERROR, "%s: received finalize1 from " + "new master %u for dead node %u, but " + "this node has already received it!\n", + dlm->name, fr->node_idx, fr->dead_node); + 
dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + break; + case 2: + if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { + mlog(ML_ERROR, "%s: received finalize2 from " + "new master %u for dead node %u, but " + "this node did not have finalize1!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + dlm_reset_recovery(dlm); + dlm_kick_recovery_thread(dlm); + break; + default: + BUG(); + } + + mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", + dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); + + dlm_put(dlm); + return 0; +} diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c new file mode 100644 index 00000000..1d6d1d22 --- /dev/null +++ b/fs/ocfs2/dlm/dlmthread.c @@ -0,0 +1,762 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmthread.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) +#include "cluster/masklog.h" + +static int dlm_thread(void *data); +static void dlm_flush_asts(struct dlm_ctxt *dlm); + +#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) + +/* will exit holding res->spinlock, but may drop in function */ +/* waits until flags are cleared on res->state */ +void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + + assert_spin_locked(&res->spinlock); + + add_wait_queue(&res->wq, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (res->state & flags) { + spin_unlock(&res->spinlock); + schedule(); + spin_lock(&res->spinlock); + goto repeat; + } + remove_wait_queue(&res->wq, &wait); + __set_current_state(TASK_RUNNING); +} + +int __dlm_lockres_has_locks(struct dlm_lock_resource *res) +{ + if (list_empty(&res->granted) && + list_empty(&res->converting) && + list_empty(&res->blocked)) + return 0; + return 1; +} + +/* "unused": the lockres has no locks, is not on the dirty list, + * has no inflight locks (in the gap between mastery and acquiring + * the first lock), and has no bits in its refmap. + * truly ready to be freed. 
*/ +int __dlm_lockres_unused(struct dlm_lock_resource *res) +{ + int bit; + + if (__dlm_lockres_has_locks(res)) + return 0; + + if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) + return 0; + + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) + return 0; + + /* + * since the bit for dlm->node_num is not set, inflight_locks better + * be zero + */ + BUG_ON(res->inflight_locks != 0); + return 1; +} + + +/* Call whenever you may have added or deleted something from one of + * the lockres queue's. This will figure out whether it belongs on the + * unused list or not and does the appropriate thing. */ +void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (__dlm_lockres_unused(res)){ + if (list_empty(&res->purge)) { + mlog(0, "%s: Adding res %.*s to purge list\n", + dlm->name, res->lockname.len, res->lockname.name); + + res->last_used = jiffies; + dlm_lockres_get(res); + list_add_tail(&res->purge, &dlm->purge_list); + dlm->purge_count++; + } + } else if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purge list\n", + dlm->name, res->lockname.len, res->lockname.name); + + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } +} + +void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + + __dlm_lockres_calc_usage(dlm, res); + + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); +} + +static void dlm_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int master; + int ret = 0; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + master = (res->owner == dlm->node_num); + + mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name, + res->lockname.len, res->lockname.name, master); + + if (!master) { + res->state |= DLM_LOCK_RES_DROPPING_REF; + /* drop spinlock... retake below */ + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + /* This ensures that clear refmap is sent after the set */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + spin_unlock(&res->spinlock); + + /* clear our bit from the master's refmap, ignore errors */ + ret = dlm_drop_lockres_ref(dlm, res); + if (ret < 0) { + mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, + res->lockname.len, res->lockname.name, ret); + if (!dlm_is_host_down(ret)) + BUG(); + } + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + } + + if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purgelist, master %d\n", + dlm->name, res->lockname.len, res->lockname.name, master); + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } + + __dlm_unhash_lockres(res); + + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. 
*/ + if (!master) { + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } else + spin_unlock(&res->spinlock); +} + +static void dlm_run_purge_list(struct dlm_ctxt *dlm, + int purge_now) +{ + unsigned int run_max, unused; + unsigned long purge_jiffies; + struct dlm_lock_resource *lockres; + + spin_lock(&dlm->spinlock); + run_max = dlm->purge_count; + + while(run_max && !list_empty(&dlm->purge_list)) { + run_max--; + + lockres = list_entry(dlm->purge_list.next, + struct dlm_lock_resource, purge); + + spin_lock(&lockres->spinlock); + + purge_jiffies = lockres->last_used + + msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); + + /* Make sure that we want to be processing this guy at + * this time. */ + if (!purge_now && time_after(purge_jiffies, jiffies)) { + /* Since resources are added to the purge list + * in tail order, we can stop at the first + * unpurgable resource -- anyone added after + * him will have a greater last_used value */ + spin_unlock(&lockres->spinlock); + break; + } + + /* Status of the lockres *might* change so double + * check. If the lockres is unused, holding the dlm + * spinlock will prevent people from getting and more + * refs on it. */ + unused = __dlm_lockres_unused(lockres); + if (!unused || + (lockres->state & DLM_LOCK_RES_MIGRATING)) { + mlog(0, "%s: res %.*s is in use or being remastered, " + "used %d, state %d\n", dlm->name, + lockres->lockname.len, lockres->lockname.name, + !unused, lockres->state); + list_move_tail(&dlm->purge_list, &lockres->purge); + spin_unlock(&lockres->spinlock); + continue; + } + + dlm_lockres_get(lockres); + + dlm_purge_lockres(dlm, lockres); + + dlm_lockres_put(lockres); + + /* Avoid adding any scheduling latencies */ + cond_resched_lock(&dlm->spinlock); + } + + spin_unlock(&dlm->spinlock); +} + +static void dlm_shuffle_lists(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct dlm_lock *lock, *target; + struct list_head *iter; + struct list_head *head; + int can_grant = 1; + + /* + * Because this function is called with the lockres + * spinlock, and because we know that it is not migrating/ + * recovering/in-progress, it is fine to reserve asts and + * basts right before queueing them all throughout + */ + assert_spin_locked(&dlm->ast_lock); + assert_spin_locked(&res->spinlock); + BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| + DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_IN_PROGRESS))); + +converting: + if (list_empty(&res->converting)) + goto blocked; + mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name, + res->lockname.len, res->lockname.name); + + target = list_entry(res->converting.next, struct dlm_lock, list); + if (target->ml.convert_type == LKM_IVMODE) { + mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n", + dlm->name, res->lockname.len, res->lockname.name); + BUG(); + } + head = &res->granted; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + /* queue the BAST if not already */ + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + /* update the highest_blocked if needed */ + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + head = &res->converting; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if 
(!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + + /* we can convert the lock */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type " + "%d => %d, node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), + target->ml.type, + target->ml.convert_type, target->ml.node); + + target->ml.type = target->ml.convert_type; + target->ml.convert_type = LKM_IVMODE; + list_move_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + __dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +blocked: + if (list_empty(&res->blocked)) + goto leave; + target = list_entry(res->blocked.next, struct dlm_lock, list); + + head = &res->granted; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + head = &res->converting; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + __dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + /* we can grant the blocked lock (only + * possible if converting list empty) */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), + target->ml.type, target->ml.node); + + /* target->ml.type is already correct */ + list_move_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + __dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +leave: + return; +} + +/* must have NO locks when calling this with res !=NULL * */ +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + if (res) { + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + } + wake_up(&dlm->dlm_thread_wq); +} + +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* don't shuffle secondary queues */ + if ((res->owner == 
dlm->node_num)) { + if (res->state & (DLM_LOCK_RES_MIGRATING | + DLM_LOCK_RES_BLOCK_DIRTY)) + return; + + if (list_empty(&res->dirty)) { + /* ref for dirty_list */ + dlm_lockres_get(res); + list_add_tail(&res->dirty, &dlm->dirty_list); + res->state |= DLM_LOCK_RES_DIRTY; + } + } + + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); +} + + +/* Launch the NM thread for the mounted volume */ +int dlm_launch_thread(struct dlm_ctxt *dlm) +{ + mlog(0, "Starting dlm_thread...\n"); + + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + if (IS_ERR(dlm->dlm_thread_task)) { + mlog_errno(PTR_ERR(dlm->dlm_thread_task)); + dlm->dlm_thread_task = NULL; + return -EINVAL; + } + + return 0; +} + +void dlm_complete_thread(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_thread_task) { + mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n"); + kthread_stop(dlm->dlm_thread_task); + dlm->dlm_thread_task = NULL; + } +} + +static int dlm_dirty_list_empty(struct dlm_ctxt *dlm) +{ + int empty; + + spin_lock(&dlm->spinlock); + empty = list_empty(&dlm->dirty_list); + spin_unlock(&dlm->spinlock); + + return empty; +} + +static void dlm_flush_asts(struct dlm_ctxt *dlm) +{ + int ret; + struct dlm_lock *lock; + struct dlm_lock_resource *res; + u8 hi; + + spin_lock(&dlm->ast_lock); + while (!list_empty(&dlm->pending_asts)) { + lock = list_entry(dlm->pending_asts.next, + struct dlm_lock, ast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.type, lock->ml.node); + + BUG_ON(!lock->ast_pending); + + /* remove from list (including ref) */ + list_del_init(&lock->ast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_do_remote_ast(dlm, res, lock); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_ast(dlm, res, lock); + + spin_lock(&dlm->ast_lock); + + /* possible that another ast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->ast_list)) { + mlog(0, "%s: res %.*s, AST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); + } else + lock->ast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. 
*/ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + + while (!list_empty(&dlm->pending_basts)) { + lock = list_entry(dlm->pending_basts.next, + struct dlm_lock, bast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + + BUG_ON(!lock->bast_pending); + + /* get the highest blocked lock, and reset */ + spin_lock(&lock->spinlock); + BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE); + hi = lock->ml.highest_blocked; + lock->ml.highest_blocked = LKM_IVMODE; + spin_unlock(&lock->spinlock); + + /* remove from list (including ref) */ + list_del_init(&lock->bast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, " + "blocked %d, node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + hi, lock->ml.node); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_send_proxy_bast(dlm, res, lock, hi); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_bast(dlm, res, lock, hi); + + spin_lock(&dlm->ast_lock); + + /* possible that another bast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->bast_list)) { + mlog(0, "%s: res %.*s, BAST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); + } else + lock->bast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. */ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + wake_up(&dlm->ast_wq); + spin_unlock(&dlm->ast_lock); +} + + +#define DLM_THREAD_TIMEOUT_MS (4 * 1000) +#define DLM_THREAD_MAX_DIRTY 100 +#define DLM_THREAD_MAX_ASTS 10 + +static int dlm_thread(void *data) +{ + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + int n = DLM_THREAD_MAX_DIRTY; + + /* dlm_shutting_down is very point-in-time, but that + * doesn't matter as we'll just loop back around if we + * get false on the leading edge of a state + * transition. */ + dlm_run_purge_list(dlm, dlm_shutting_down(dlm)); + + /* We really don't want to hold dlm->spinlock while + * calling dlm_shuffle_lists on each lockres that + * needs to have its queues adjusted and AST/BASTs + * run. So let's pull each entry off the dirty_list + * and drop dlm->spinlock ASAP. Once off the list, + * res->spinlock needs to be taken again to protect + * the queues while calling dlm_shuffle_lists. 
*/ + spin_lock(&dlm->spinlock); + while (!list_empty(&dlm->dirty_list)) { + int delay = 0; + res = list_entry(dlm->dirty_list.next, + struct dlm_lock_resource, dirty); + + /* peel a lockres off, remove it from the list, + * unset the dirty flag and drop the dlm lock */ + BUG_ON(!res); + dlm_lockres_get(res); + + spin_lock(&res->spinlock); + /* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */ + list_del_init(&res->dirty); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + /* Drop dirty_list ref */ + dlm_lockres_put(res); + + /* lockres can be re-dirtied/re-added to the + * dirty_list in this gap, but that is ok */ + + spin_lock(&dlm->ast_lock); + spin_lock(&res->spinlock); + if (res->owner != dlm->node_num) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d," + " dirty %d\n", dlm->name, + !!(res->state & DLM_LOCK_RES_IN_PROGRESS), + !!(res->state & DLM_LOCK_RES_MIGRATING), + !!(res->state & DLM_LOCK_RES_RECOVERING), + !!(res->state & DLM_LOCK_RES_DIRTY)); + } + BUG_ON(res->owner != dlm->node_num); + + /* it is now ok to move lockreses in these states + * to the dirty list, assuming that they will only be + * dirty for a short while. */ + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + if (res->state & (DLM_LOCK_RES_IN_PROGRESS | + DLM_LOCK_RES_RECOVERING)) { + /* move it to the tail and keep going */ + res->state &= ~DLM_LOCK_RES_DIRTY; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); + mlog(0, "%s: res %.*s, inprogress, delay list " + "shuffle, state %d\n", dlm->name, + res->lockname.len, res->lockname.name, + res->state); + delay = 1; + goto in_progress; + } + + /* at this point the lockres is not migrating/ + * recovering/in-progress. we have the lockres + * spinlock and do NOT have the dlm lock. + * safe to reserve/queue asts and run the lists. */ + + /* called while holding lockres lock */ + dlm_shuffle_lists(dlm, res); + res->state &= ~DLM_LOCK_RES_DIRTY; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); + + dlm_lockres_calc_usage(dlm, res); + +in_progress: + + spin_lock(&dlm->spinlock); + /* if the lock was in-progress, stick + * it on the back of the list */ + if (delay) { + spin_lock(&res->spinlock); + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + } + dlm_lockres_put(res); + + /* unlikely, but we may need to give time to + * other tasks */ + if (!--n) { + mlog(0, "%s: Throttling dlm thread\n", + dlm->name); + break; + } + } + + spin_unlock(&dlm->spinlock); + dlm_flush_asts(dlm); + + /* yield and continue right away if there is more work to do */ + if (!n) { + cond_resched(); + continue; + } + + wait_event_interruptible_timeout(dlm->dlm_thread_wq, + !dlm_dirty_list_empty(dlm) || + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM thread\n"); + return 0; +} diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c new file mode 100644 index 00000000..850aa7e8 --- /dev/null +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -0,0 +1,692 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmunlock.c + * + * underlying calls for unlocking locks + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +#define DLM_UNLOCK_FREE_LOCK 0x00000001 +#define DLM_UNLOCK_CALL_AST 0x00000002 +#define DLM_UNLOCK_REMOVE_LOCK 0x00000004 +#define DLM_UNLOCK_REGRANT_LOCK 0x00000008 +#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010 + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); +static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); + +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner); + + +/* + * according to the spec: + * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf + * + * flags & LKM_CANCEL != 0: must be converting or blocked + * flags & LKM_CANCEL == 0: must be granted + * + * So to unlock a converting lock, you must first cancel the + * convert (passing LKM_CANCEL in flags), then call the unlock + * again (with no LKM_CANCEL in flags). + */ + + +/* + * locking: + * caller needs: none + * taken: res->spinlock and lock->spinlock taken and dropped + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + * all callers should have taken an extra ref on lock coming in + */ +static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast, + int master_node) +{ + enum dlm_status status; + int actions = 0; + int in_use; + u8 owner; + + mlog(0, "master_node = %d, valblk = %d\n", master_node, + flags & LKM_VALBLK); + + if (master_node) + BUG_ON(res->owner != dlm->node_num); + else + BUG_ON(res->owner == dlm->node_num); + + spin_lock(&dlm->ast_lock); + /* We want to be sure that we're not freeing a lock + * that still has AST's pending... 
*/ + in_use = !list_empty(&lock->ast_list); + spin_unlock(&dlm->ast_lock); + if (in_use && !(flags & LKM_CANCEL)) { + mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " + "while waiting for an ast!", res->lockname.len, + res->lockname.name); + return DLM_BADPARAM; + } + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_IN_PROGRESS) { + if (master_node && !(flags & LKM_CANCEL)) { + mlog(ML_ERROR, "lockres in progress!\n"); + spin_unlock(&res->spinlock); + return DLM_FORWARD; + } + /* ok for this to sleep if not in a network handler */ + __dlm_wait_on_lockres(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + } + spin_lock(&lock->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) { + status = DLM_RECOVERING; + goto leave; + } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + status = DLM_MIGRATING; + goto leave; + } + + /* see above for what the spec says about + * LKM_CANCEL and the lock queue state */ + if (flags & LKM_CANCEL) + status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions); + else + status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); + + if (status != DLM_NORMAL && (status != DLM_CANCELGRANT || !master_node)) + goto leave; + + /* By now this has been masked out of cancel requests. */ + if (flags & LKM_VALBLK) { + /* make the final update to the lvb */ + if (master_node) + memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); + else + flags |= LKM_PUT_LVB; /* let the send function + * handle it. */ + } + + if (!master_node) { + owner = res->owner; + /* drop locks and send message */ + if (flags & LKM_CANCEL) + lock->cancel_pending = 1; + else + lock->unlock_pending = 1; + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, + flags, owner); + spin_lock(&res->spinlock); + spin_lock(&lock->spinlock); + /* if the master told us the lock was already granted, + * let the ast handle all of these actions */ + if (status == DLM_CANCELGRANT) { + actions &= ~(DLM_UNLOCK_REMOVE_LOCK| + DLM_UNLOCK_REGRANT_LOCK| + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* must clear the actions because this unlock + * is about to be retried. cannot free or do + * any list manipulation. */ + mlog(0, "%s:%.*s: clearing actions, %s\n", + dlm->name, res->lockname.len, + res->lockname.name, + status==DLM_RECOVERING?"recovering": + (status==DLM_MIGRATING?"migrating": + "forward")); + actions = 0; + } + if (flags & LKM_CANCEL) + lock->cancel_pending = 0; + else + lock->unlock_pending = 0; + + } + + /* get an extra ref on lock. if we are just switching + * lists here, we dont want the lock to go away. */ + dlm_lock_get(lock); + + if (actions & DLM_UNLOCK_REMOVE_LOCK) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_REGRANT_LOCK) { + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + } + if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) { + mlog(0, "clearing convert_type at %smaster node\n", + master_node ? 
"" : "non-"); + lock->ml.convert_type = LKM_IVMODE; + } + + /* remove the extra ref on lock */ + dlm_lock_put(lock); + +leave: + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + if (!dlm_lock_on_list(&res->converting, lock)) + BUG_ON(lock->ml.convert_type != LKM_IVMODE); + else + BUG_ON(lock->ml.convert_type == LKM_IVMODE); + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* let the caller's final dlm_lock_put handle the actual kfree */ + if (actions & DLM_UNLOCK_FREE_LOCK) { + /* this should always be coupled with list removal */ + BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); + mlog(0, "lock %u:%llu should be gone now! refs=%d\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + atomic_read(&lock->lock_refs.refcount)-1); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_CALL_AST) + *call_ast = 1; + + /* if cancel or unlock succeeded, lvb work is done */ + if (status == DLM_NORMAL) + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); + + return status; +} + +void dlm_commit_pending_unlock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* leave DLM_LKSB_PUT_LVB on the lksb so any final + * update of the lvb will be sent to the new master */ + list_del_init(&lock->list); +} + +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + list_move_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; +} + + +static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1); +} + +static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0); +} + +/* + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + */ +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner) +{ + struct dlm_unlock_lock unlock; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog(0, "%.*s\n", res->lockname.len, res->lockname.name); + + if (owner == dlm->node_num) { + /* ended up trying to contact ourself. this means + * that the lockres had been remote but became local + * via a migration. 
just retry it, now as local */ + mlog(0, "%s:%.*s: this node became the master due to a " + "migration, re-evaluate now\n", dlm->name, + res->lockname.len, res->lockname.name); + return DLM_FORWARD; + } + + memset(&unlock, 0, sizeof(unlock)); + unlock.node_idx = dlm->node_num; + unlock.flags = cpu_to_be32(flags); + unlock.cookie = lock->ml.cookie; + unlock.namelen = res->lockname.len; + memcpy(unlock.name, res->lockname.name, unlock.namelen); + + vec[0].iov_len = sizeof(struct dlm_unlock_lock); + vec[0].iov_base = &unlock; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key, + vec, veclen, owner, &status); + if (tmpret >= 0) { + // successfully sent and received + if (status == DLM_FORWARD) + mlog(0, "master was in-progress. retry\n"); + ret = status; + } else { + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner); + if (dlm_is_host_down(tmpret)) { + /* NOTE: this seems strange, but it is what we want. + * when the master goes down during a cancel or + * unlock, the recovery code completes the operation + * as if the master had not died, then passes the + * updated state to the recovery master. this thread + * just needs to finish out the operation and call + * the unlockast. */ + ret = DLM_NORMAL; + } else { + /* something bad. this will BUG in ocfs2 */ + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, + * return value from dlmunlock_master + */ +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct list_head *iter; + struct dlm_lock *lock = NULL; + enum dlm_status status = DLM_NORMAL; + int found = 0, i; + struct dlm_lockstatus *lksb = NULL; + int ignore; + u32 flags; + struct list_head *queue; + + flags = be32_to_cpu(unlock->flags); + + if (flags & LKM_GET_LVB) { + mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n"); + return DLM_BADARGS; + } + + if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) { + mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL " + "request!\n"); + return DLM_BADARGS; + } + + if (unlock->namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length in unlock handler!\n"); + return DLM_IVBUFLEN; + } + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none"); + + res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen); + if (!res) { + /* We assume here that a no lock resource simply means + * it was migrated away and destroyed before the other + * node could detect it. 
*/ + mlog(0, "returning DLM_FORWARD -- res no longer exists\n"); + status = DLM_FORWARD; + goto not_found; + } + + queue=&res->granted; + found = 0; + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_RECOVERING\n"); + status = DLM_RECOVERING; + goto leave; + } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MIGRATING\n"); + status = DLM_MIGRATING; + goto leave; + } + + if (res->owner != dlm->node_num) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_FORWARD -- not master\n"); + status = DLM_FORWARD; + goto leave; + } + + for (i=0; i<3; i++) { + list_for_each(iter, queue) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.cookie == unlock->cookie && + lock->ml.node == unlock->node_idx) { + dlm_lock_get(lock); + found = 1; + break; + } + } + if (found) + break; + /* scan granted -> converting -> blocked queues */ + queue++; + } + spin_unlock(&res->spinlock); + if (!found) { + status = DLM_IVLOCKID; + goto not_found; + } + + /* lock was found on queue */ + lksb = lock->lksb; + if (flags & (LKM_VALBLK|LKM_PUT_LVB) && + lock->ml.type != LKM_EXMODE) + flags &= ~(LKM_VALBLK|LKM_PUT_LVB); + + /* unlockast only called on originating node */ + if (flags & LKM_PUT_LVB) { + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN); + } + + /* if this is in-progress, propagate the DLM_FORWARD + * all the way back out */ + status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore); + if (status == DLM_FORWARD) + mlog(0, "lockres is in progress\n"); + + if (flags & LKM_PUT_LVB) + lksb->flags &= ~DLM_LKSB_PUT_LVB; + + dlm_lockres_calc_usage(dlm, res); + dlm_kick_thread(dlm, res); + +not_found: + if (!found) + mlog(ML_ERROR, "failed to find lock to unlock! " + "cookie=%u:%llu\n", + dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie))); + else + dlm_lock_put(lock); + +leave: + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + if (dlm_lock_on_list(&res->blocked, lock)) { + /* cancel this outright */ + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } else if (dlm_lock_on_list(&res->converting, lock)) { + /* cancel the request, put back on granted */ + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK | + DLM_UNLOCK_REGRANT_LOCK | + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (dlm_lock_on_list(&res->granted, lock)) { + /* too late, already granted. 
*/ + status = DLM_CANCELGRANT; + *actions = DLM_UNLOCK_CALL_AST; + } else { + mlog(ML_ERROR, "lock to cancel is not on any list!\n"); + status = DLM_IVLOCKID; + *actions = 0; + } + return status; +} + +static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + /* unlock request */ + if (!dlm_lock_on_list(&res->granted, lock)) { + status = DLM_DENIED; + dlm_error(status); + *actions = 0; + } else { + /* unlock granted lock */ + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_FREE_LOCK | + DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } + return status; +} + +/* there seems to be no point in doing this async + * since (even for the remote case) there is really + * no work to queue up... so just do it and fire the + * unlockast by hand when done... */ +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb, + int flags, dlm_astunlockfunc_t *unlockast, void *data) +{ + enum dlm_status status; + struct dlm_lock_resource *res; + struct dlm_lock *lock = NULL; + int call_ast, is_master; + + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) { + mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n"); + flags &= ~LKM_VALBLK; + } + + if (!lksb->lockid || !lksb->lockid->lockres) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + lock = lksb->lockid; + BUG_ON(!lock); + dlm_lock_get(lock); + + res = lock->lockres; + BUG_ON(!res); + dlm_lockres_get(res); +retry: + call_ast = 0; + /* need to retry up here because owner may have changed */ + mlog(0, "lock=%p res=%p\n", lock, res); + + spin_lock(&res->spinlock); + is_master = (res->owner == dlm->node_num); + if (flags & LKM_VALBLK && lock->ml.type != LKM_EXMODE) + flags &= ~LKM_VALBLK; + spin_unlock(&res->spinlock); + + if (is_master) { + status = dlmunlock_master(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_master: returned %d, " + "call_ast is %d\n", status, call_ast); + } else { + status = dlmunlock_remote(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_remote: returned %d, " + "call_ast is %d\n", status, call_ast); + } + + if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* We want to go away for a tiny bit to allow recovery + * / migration to complete on this resource. I don't + * know of any wait queue we could sleep on as this + * may be happening on another node. Perhaps the + * proper solution is to queue up requests on the + * other end? */ + + /* do we want to yield(); ?? */ + msleep(50); + + mlog(0, "retrying unlock due to pending recovery/" + "migration/in-progress\n"); + goto retry; + } + + if (call_ast) { + mlog(0, "calling unlockast(%p, %d)\n", data, status); + if (is_master) { + /* it is possible that there is one last bast + * pending. make sure it is flushed, then + * call the unlockast. + * not an issue if this is a mastered remotely, + * since this lock has been removed from the + * lockres queues and cannot be found. 
*/ + dlm_kick_thread(dlm, NULL); + wait_event(dlm->ast_wq, + dlm_lock_basts_flushed(dlm, lock)); + } + (*unlockast)(data, status); + } + + if (status == DLM_CANCELGRANT) + status = DLM_NORMAL; + + if (status == DLM_NORMAL) { + mlog(0, "kicking the thread\n"); + dlm_kick_thread(dlm, res); + } else + dlm_error(status); + + dlm_lockres_calc_usage(dlm, res); + dlm_lockres_put(res); + dlm_lock_put(lock); + + mlog(0, "returning status=%d!\n", status); + return status; +} +EXPORT_SYMBOL_GPL(dlmunlock); + diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c new file mode 100644 index 00000000..dfc0da4d --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include "dlmver.h" + +#define DLM_BUILD_VERSION "1.5.0" + +#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION + +void dlm_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h new file mode 100644 index 00000000..f674aee7 --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef DLM_VER_H +#define DLM_VER_H + +void dlm_print_version(void); + +#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile new file mode 100644 index 00000000..f14be89a --- /dev/null +++ b/fs/ocfs2/dlmfs/Makefile @@ -0,0 +1,5 @@ +ccflags-y := -Ifs/ocfs2 + +obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o + +ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c new file mode 100644 index 00000000..b4207679 --- /dev/null +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -0,0 +1,725 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfs.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. This file handles the virtual file system + * used for communication with userspace. Credit should go to ramfs, + * which was a template for the fs side of this module. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* Simple VFS hooks based on: */ +/* + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "stackglue.h" +#include "userdlm.h" +#include "dlmfsver.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + + +static const struct super_operations dlmfs_ops; +static const struct file_operations dlmfs_file_operations; +static const struct inode_operations dlmfs_dir_inode_operations; +static const struct inode_operations dlmfs_root_inode_operations; +static const struct inode_operations dlmfs_file_inode_operations; +static struct kmem_cache *dlmfs_inode_cache; + +struct workqueue_struct *user_dlm_worker; + + + +/* + * These are the ABI capabilities of dlmfs. + * + * Over time, dlmfs has added some features that were not part of the + * initial ABI. Unfortunately, some of these features are not detectable + * via standard usage. For example, Linux's default poll always returns + * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs + * added poll support. Instead, we provide this list of new capabilities. + * + * Capabilities is a read-only attribute. We do it as a module parameter + * so we can discover it whether dlmfs is built in, loaded, or even not + * loaded. + * + * The ABI features are local to this machine's dlmfs mount. This is + * distinct from the locking protocol, which is concerned with inter-node + * interaction. + * + * Capabilities: + * - bast : POLLIN against the file descriptor of a held lock + * signifies a bast fired on the lock. 
+ */ +#define DLMFS_CAPABILITIES "bast stackglue" +static int param_set_dlmfs_capabilities(const char *val, + struct kernel_param *kp) +{ + printk(KERN_ERR "%s: readonly parameter\n", kp->name); + return -EINVAL; +} +static int param_get_dlmfs_capabilities(char *buffer, + struct kernel_param *kp) +{ + return strlcpy(buffer, DLMFS_CAPABILITIES, + strlen(DLMFS_CAPABILITIES) + 1); +} +module_param_call(capabilities, param_set_dlmfs_capabilities, + param_get_dlmfs_capabilities, NULL, 0444); +MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES); + + +/* + * decodes a set of open flags into a valid lock level and a set of flags. + * returns < 0 if we have invalid flags + * flags which mean something to us: + * O_RDONLY -> PRMODE level + * O_WRONLY -> EXMODE level + * + * O_NONBLOCK -> NOQUEUE + */ +static int dlmfs_decode_open_flags(int open_flags, + int *level, + int *flags) +{ + if (open_flags & (O_WRONLY|O_RDWR)) + *level = DLM_LOCK_EX; + else + *level = DLM_LOCK_PR; + + *flags = 0; + if (open_flags & O_NONBLOCK) + *flags |= DLM_LKF_NOQUEUE; + + return 0; +} + +static int dlmfs_file_open(struct inode *inode, + struct file *file) +{ + int status, level, flags; + struct dlmfs_filp_private *fp = NULL; + struct dlmfs_inode_private *ip; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, + file->f_flags); + + status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); + if (status < 0) + goto bail; + + /* We don't want to honor O_APPEND at read/write time as it + * doesn't make sense for LVB writes. */ + file->f_flags &= ~O_APPEND; + + fp = kmalloc(sizeof(*fp), GFP_NOFS); + if (!fp) { + status = -ENOMEM; + goto bail; + } + fp->fp_lock_level = level; + + ip = DLMFS_I(inode); + + status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags); + if (status < 0) { + /* this is a strange error to return here but I want + * to be able userspace to be able to distinguish a + * valid lock request from one that simply couldn't be + * granted. */ + if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN) + status = -ETXTBSY; + kfree(fp); + goto bail; + } + + file->private_data = fp; +bail: + return status; +} + +static int dlmfs_file_release(struct inode *inode, + struct file *file) +{ + int level, status; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + struct dlmfs_filp_private *fp = file->private_data; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "close called on inode %lu\n", inode->i_ino); + + status = 0; + if (fp) { + level = fp->fp_lock_level; + if (level != DLM_LOCK_IV) + user_dlm_cluster_unlock(&ip->ip_lockres, level); + + kfree(fp); + file->private_data = NULL; + } + + return 0; +} + +/* + * We do ->setattr() just to override size changes. Our size is the size + * of the LVB and nothing else. 
+ */ +static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = dentry->d_inode; + + attr->ia_valid &= ~ATTR_SIZE; + error = inode_change_ok(inode, attr); + if (error) + return error; + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) +{ + int event = 0; + struct inode *inode = file->f_path.dentry->d_inode; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + + poll_wait(file, &ip->ip_lockres.l_event, wait); + + spin_lock(&ip->ip_lockres.l_lock); + if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED) + event = POLLIN | POLLRDNORM; + spin_unlock(&ip->ip_lockres.l_lock); + + return event; +} + +static ssize_t dlmfs_file_read(struct file *filp, + char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t readlen, got; + char *lvb_buf; + struct inode *inode = filp->f_path.dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return 0; + + if (!count) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + /* don't read past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + readlen = i_size_read(inode) - *ppos; + else + readlen = count; + + lvb_buf = kmalloc(readlen, GFP_NOFS); + if (!lvb_buf) + return -ENOMEM; + + got = user_dlm_read_lvb(inode, lvb_buf, readlen); + if (got) { + BUG_ON(got != readlen); + bytes_left = __copy_to_user(buf, lvb_buf, readlen); + readlen -= bytes_left; + } else + readlen = 0; + + kfree(lvb_buf); + + *ppos = *ppos + readlen; + + mlog(0, "read %zd bytes\n", readlen); + return readlen; +} + +static ssize_t dlmfs_file_write(struct file *filp, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t writelen; + char *lvb_buf; + struct inode *inode = filp->f_path.dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return -ENOSPC; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* don't write past the lvb; mirror the read path above */ + if ((count + *ppos) > i_size_read(inode)) + writelen = i_size_read(inode) - *ppos; + else + writelen = count; + + lvb_buf = kmalloc(writelen, GFP_NOFS); + if (!lvb_buf) + return -ENOMEM; + + bytes_left = copy_from_user(lvb_buf, buf, writelen); + writelen -= bytes_left; + if (writelen) + user_dlm_write_lvb(inode, lvb_buf, writelen); + + kfree(lvb_buf); + + *ppos = *ppos + writelen; + mlog(0, "wrote %zd bytes\n", writelen); + return writelen; +} + +static void dlmfs_init_once(void *foo) +{ + struct dlmfs_inode_private *ip = + (struct dlmfs_inode_private *) foo; + + ip->ip_conn = NULL; + ip->ip_parent = NULL; + + inode_init_once(&ip->ip_vfs_inode); +} + +static struct inode *dlmfs_alloc_inode(struct super_block *sb) +{ + struct dlmfs_inode_private *ip; + + ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); + if (!ip) + return NULL; + + return &ip->ip_vfs_inode; +} + +static void dlmfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); +} + +static void dlmfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, dlmfs_i_callback); +} + +static void dlmfs_evict_inode(struct inode *inode) +{ + int status; + struct dlmfs_inode_private *ip; + + end_writeback(inode); + 
mlog(0, "inode %lu\n", inode->i_ino); + + ip = DLMFS_I(inode); + + if (S_ISREG(inode->i_mode)) { + status = user_dlm_destroy_lock(&ip->ip_lockres); + if (status < 0) + mlog_errno(status); + iput(ip->ip_parent); + goto clear_fields; + } + + mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn); + /* we must be a directory. If required, lets unregister the + * dlm context now. */ + if (ip->ip_conn) + user_dlm_unregister(ip->ip_conn); +clear_fields: + ip->ip_parent = NULL; + ip->ip_conn = NULL; +} + +static struct backing_dev_info dlmfs_backing_dev_info = { + .name = "ocfs2-dlmfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static struct inode *dlmfs_get_root_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + int mode = S_IFDIR | 0755; + struct dlmfs_inode_private *ip; + + if (inode) { + ip = DLMFS_I(inode); + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inc_nlink(inode); + + inode->i_fop = &simple_dir_operations; + inode->i_op = &dlmfs_root_inode_operations; + } + + return inode; +} + +static struct inode *dlmfs_get_inode(struct inode *parent, + struct dentry *dentry, + int mode) +{ + struct super_block *sb = parent->i_sb; + struct inode * inode = new_inode(sb); + struct dlmfs_inode_private *ip; + + if (!inode) + return NULL; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ip = DLMFS_I(inode); + ip->ip_conn = DLMFS_I(parent)->ip_conn; + + switch (mode & S_IFMT) { + default: + /* for now we don't support anything other than + * directories and regular files. */ + BUG(); + break; + case S_IFREG: + inode->i_op = &dlmfs_file_inode_operations; + inode->i_fop = &dlmfs_file_operations; + + i_size_write(inode, DLM_LVB_LEN); + + user_dlm_lock_res_init(&ip->ip_lockres, dentry); + + /* released at clear_inode time, this insures that we + * get to drop the dlm reference on each lock *before* + * we call the unregister code for releasing parent + * directories. */ + ip->ip_parent = igrab(parent); + BUG_ON(!ip->ip_parent); + break; + case S_IFDIR: + inode->i_op = &dlmfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == + * 2 (for "." entry) */ + inc_nlink(inode); + break; + } + + if (parent->i_mode & S_ISGID) { + inode->i_gid = parent->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. 
+ */ +/* SMP-safe */ +static int dlmfs_mkdir(struct inode * dir, + struct dentry * dentry, + int mode) +{ + int status; + struct inode *inode = NULL; + struct qstr *domain = &dentry->d_name; + struct dlmfs_inode_private *ip; + struct ocfs2_cluster_connection *conn; + + mlog(0, "mkdir %.*s\n", domain->len, domain->name); + + /* verify that we have a proper domain */ + if (domain->len >= GROUP_NAME_MAX) { + status = -EINVAL; + mlog(ML_ERROR, "invalid domain name for directory.\n"); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + ip = DLMFS_I(inode); + + conn = user_dlm_register(domain); + if (IS_ERR(conn)) { + status = PTR_ERR(conn); + mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", + status, domain->len, domain->name); + goto bail; + } + ip->ip_conn = conn; + + inc_nlink(dir); + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + + status = 0; +bail: + if (status < 0) + iput(inode); + return status; +} + +static int dlmfs_create(struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int status = 0; + struct inode *inode; + struct qstr *name = &dentry->d_name; + + mlog(0, "create %.*s\n", name->len, name->name); + + /* verify name is valid and doesn't contain any dlm reserved + * characters */ + if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || + name->name[0] == '$') { + status = -EINVAL; + mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, + name->name); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ +bail: + return status; +} + +static int dlmfs_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + struct inode *inode = dentry->d_inode; + + mlog(0, "unlink inode %lu\n", inode->i_ino); + + /* if there are no current holders, or none that are waiting + * to acquire a lock, this basically destroys our lockres. */ + status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); + if (status < 0) { + mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", + dentry->d_name.len, dentry->d_name.name, status); + goto bail; + } + status = simple_unlink(dir, dentry); +bail: + return status; +} + +static int dlmfs_fill_super(struct super_block * sb, + void * data, + int silent) +{ + struct inode * inode; + struct dentry * root; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = DLMFS_MAGIC; + sb->s_op = &dlmfs_ops; + inode = dlmfs_get_root_inode(sb); + if (!inode) + return -ENOMEM; + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + sb->s_root = root; + return 0; +} + +static const struct file_operations dlmfs_file_operations = { + .open = dlmfs_file_open, + .release = dlmfs_file_release, + .poll = dlmfs_file_poll, + .read = dlmfs_file_read, + .write = dlmfs_file_write, + .llseek = default_llseek, +}; + +static const struct inode_operations dlmfs_dir_inode_operations = { + .create = dlmfs_create, + .lookup = simple_lookup, + .unlink = dlmfs_unlink, +}; + +/* this way we can restrict mkdir to only the toplevel of the fs. 
*/ +static const struct inode_operations dlmfs_root_inode_operations = { + .lookup = simple_lookup, + .mkdir = dlmfs_mkdir, + .rmdir = simple_rmdir, +}; + +static const struct super_operations dlmfs_ops = { + .statfs = simple_statfs, + .alloc_inode = dlmfs_alloc_inode, + .destroy_inode = dlmfs_destroy_inode, + .evict_inode = dlmfs_evict_inode, + .drop_inode = generic_delete_inode, +}; + +static const struct inode_operations dlmfs_file_inode_operations = { + .getattr = simple_getattr, + .setattr = dlmfs_file_setattr, +}; + +static struct dentry *dlmfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, dlmfs_fill_super); +} + +static struct file_system_type dlmfs_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2_dlmfs", + .mount = dlmfs_mount, + .kill_sb = kill_litter_super, +}; + +static int __init init_dlmfs_fs(void) +{ + int status; + int cleanup_inode = 0, cleanup_worker = 0; + + dlmfs_print_version(); + + status = bdi_init(&dlmfs_backing_dev_info); + if (status) + return status; + + dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", + sizeof(struct dlmfs_inode_private), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + dlmfs_init_once); + if (!dlmfs_inode_cache) { + status = -ENOMEM; + goto bail; + } + cleanup_inode = 1; + + user_dlm_worker = create_singlethread_workqueue("user_dlm"); + if (!user_dlm_worker) { + status = -ENOMEM; + goto bail; + } + cleanup_worker = 1; + + user_dlm_set_locking_protocol(); + status = register_filesystem(&dlmfs_fs_type); +bail: + if (status) { + if (cleanup_inode) + kmem_cache_destroy(dlmfs_inode_cache); + if (cleanup_worker) + destroy_workqueue(user_dlm_worker); + bdi_destroy(&dlmfs_backing_dev_info); + } else + printk("OCFS2 User DLM kernel interface loaded\n"); + return status; +} + +static void __exit exit_dlmfs_fs(void) +{ + unregister_filesystem(&dlmfs_fs_type); + + flush_workqueue(user_dlm_worker); + destroy_workqueue(user_dlm_worker); + + kmem_cache_destroy(dlmfs_inode_cache); + + bdi_destroy(&dlmfs_backing_dev_info); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(init_dlmfs_fs) +module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c new file mode 100644 index 00000000..a733b332 --- /dev/null +++ b/fs/ocfs2/dlmfs/dlmfsver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include + +#include "dlmfsver.h" + +#define DLM_BUILD_VERSION "1.5.0" + +#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION + +void dlmfs_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h new file mode 100644 index 00000000..f35eadbe --- /dev/null +++ b/fs/ocfs2/dlmfs/dlmfsver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef DLMFS_VER_H +#define DLMFS_VER_H + +void dlmfs_print_version(void); + +#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c new file mode 100644 index 00000000..0499e3fb --- /dev/null +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -0,0 +1,688 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. + * + * Many of the functions here are pared down versions of dlmglue.c + * functions. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include + +#include +#include +#include +#include + +#include "ocfs2_lockingver.h" +#include "stackglue.h" +#include "userdlm.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + + +static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) +{ + return container_of(lksb, struct user_lock_res, l_lksb); +} + +static inline int user_check_wait_flag(struct user_lock_res *lockres, + int flag) +{ + int ret; + + spin_lock(&lockres->l_lock); + ret = lockres->l_flags & flag; + spin_unlock(&lockres->l_lock); + + return ret; +} + +static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BUSY)); +} + +static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); +} + +/* I heart container_of... */ +static inline struct ocfs2_cluster_connection * +cluster_connection_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return ip->ip_conn; +} + +static struct inode * +user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return &ip->ip_vfs_inode; +} + +static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) +{ + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); +} + +#define user_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error %d while calling %s on " \ + "resource %.*s\n", _stat, _func, \ + _lockres->l_namelen, _lockres->l_name); \ +} while (0) + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. */ +static inline int user_highest_compat_lock_level(int level) +{ + int new_level = DLM_LOCK_EX; + + if (level == DLM_LOCK_EX) + new_level = DLM_LOCK_NL; + else if (level == DLM_LOCK_PR) + new_level = DLM_LOCK_PR; + return new_level; +} + +static void user_ast(struct ocfs2_dlm_lksb *lksb) +{ + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + int status; + + mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n", + lockres->l_namelen, lockres->l_name, lockres->l_level, + lockres->l_requested); + + spin_lock(&lockres->l_lock); + + status = ocfs2_dlm_lock_status(&lockres->l_lksb); + if (status) { + mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", + status, lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + return; + } + + mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV, + "Lockres %.*s, requested ivmode. flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + /* we're downconverting. 
*/ + if (lockres->l_requested < lockres->l_level) { + if (lockres->l_requested <= + user_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = DLM_LOCK_NL; + lockres->l_flags &= ~USER_LOCK_BLOCKED; + } + } + + lockres->l_level = lockres->l_requested; + lockres->l_requested = DLM_LOCK_IV; + lockres->l_flags |= USER_LOCK_ATTACHED; + lockres->l_flags &= ~USER_LOCK_BUSY; + + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + if (!igrab(inode)) + BUG(); +} + +static void user_dlm_unblock_lock(struct work_struct *work); + +static void __user_dlm_queue_lockres(struct user_lock_res *lockres) +{ + if (!(lockres->l_flags & USER_LOCK_QUEUED)) { + user_dlm_grab_inode_ref(lockres); + + INIT_WORK(&lockres->l_work, user_dlm_unblock_lock); + + queue_work(user_dlm_worker, &lockres->l_work); + lockres->l_flags |= USER_LOCK_QUEUED; + } +} + +static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) +{ + int queue = 0; + + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) + return; + + switch (lockres->l_blocking) { + case DLM_LOCK_EX: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + queue = 1; + break; + case DLM_LOCK_PR: + if (!lockres->l_ex_holders) + queue = 1; + break; + default: + BUG(); + } + + if (queue) + __user_dlm_queue_lockres(lockres); +} + +static void user_bast(struct ocfs2_dlm_lksb *lksb, int level) +{ + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + + mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n", + lockres->l_namelen, lockres->l_name, level, lockres->l_level); + + spin_lock(&lockres->l_lock); + lockres->l_flags |= USER_LOCK_BLOCKED; + if (level > lockres->l_blocking) + lockres->l_blocking = level; + + __user_dlm_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status) +{ + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + + mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + if (status) + mlog(ML_ERROR, "dlm returns status %d\n", status); + + spin_lock(&lockres->l_lock); + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { + lockres->l_level = DLM_LOCK_IV; + } else if (status == DLM_CANCELGRANT) { + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + lockres->l_requested = DLM_LOCK_IV; /* cancel an + * upconvert + * request. */ + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + /* we want the unblock thread to look at it again + * now. */ + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); + } + + lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +/* + * This is the userdlmfs locking protocol version. 
+ * + * See fs/ocfs2/dlmglue.c for more details on locking versions. + */ +static struct ocfs2_locking_protocol user_dlm_lproto = { + .lp_max_version = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, + }, + .lp_lock_ast = user_ast, + .lp_blocking_ast = user_bast, + .lp_unlock_ast = user_unlock_ast, +}; + +static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + iput(inode); +} + +static void user_dlm_unblock_lock(struct work_struct *work) +{ + int new_level, status; + struct user_lock_res *lockres = + container_of(work, struct user_lock_res, l_work); + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); + + mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); + + spin_lock(&lockres->l_lock); + + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %.*s, flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ + lockres->l_flags &= ~USER_LOCK_QUEUED; + + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. */ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n", + lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n", + lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_BUSY) { + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n", + lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + lockres->l_flags |= USER_LOCK_IN_CANCEL; + spin_unlock(&lockres->l_lock); + + status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, + DLM_LKF_CANCEL); + if (status) + user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); + goto drop_ref; + } + + /* If there are still incompat holders, we can exit safely + * without worrying about re-queueing this lock as that will + * happen on the last call to user_cluster_unlock. */ + if ((lockres->l_blocking == DLM_LOCK_EX) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + spin_unlock(&lockres->l_lock); + mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n", + lockres->l_namelen, lockres->l_name, + lockres->l_ex_holders, lockres->l_ro_holders); + goto drop_ref; + } + + if ((lockres->l_blocking == DLM_LOCK_PR) + && lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n", + lockres->l_namelen, lockres->l_name, + lockres->l_ex_holders); + goto drop_ref; + } + + /* yay, we can downconvert now. */ + new_level = user_highest_compat_lock_level(lockres->l_blocking); + lockres->l_requested = new_level; + lockres->l_flags |= USER_LOCK_BUSY; + mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n", + lockres->l_namelen, lockres->l_name, lockres->l_level, new_level); + spin_unlock(&lockres->l_lock); + + /* need lock downconvert request now... 
*/ + status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb, + DLM_LKF_CONVERT|DLM_LKF_VALBLK, + lockres->l_name, + lockres->l_namelen); + if (status) { + user_log_dlm_error("ocfs2_dlm_lock", status, lockres); + user_recover_from_dlm_error(lockres); + } + +drop_ref: + user_dlm_drop_inode_ref(lockres); +} + +static inline void user_dlm_inc_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case DLM_LOCK_EX: + lockres->l_ex_holders++; + break; + case DLM_LOCK_PR: + lockres->l_ro_holders++; + break; + default: + BUG(); + } +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. */ +static inline int +user_may_continue_on_blocked_lock(struct user_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); + + return wanted <= user_highest_compat_lock_level(lockres->l_blocking); +} + +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags) +{ + int status, local_flags; + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); + + if (level != DLM_LOCK_EX && + level != DLM_LOCK_PR) { + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); + status = -EINVAL; + goto bail; + } + + mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n", + lockres->l_namelen, lockres->l_name, level, lkm_flags); + +again: + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + spin_lock(&lockres->l_lock); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if ((lockres->l_flags & USER_LOCK_BUSY) && + (level > lockres->l_level)) { + /* is someone sitting in dlm_lock? If so, wait on + * them. 
*/ + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + goto again; + } + + if ((lockres->l_flags & USER_LOCK_BLOCKED) && + (!user_may_continue_on_blocked_lock(lockres, level))) { + /* is the lock is currently blocked on behalf of + * another node */ + spin_unlock(&lockres->l_lock); + + user_wait_on_blocked_lock(lockres); + goto again; + } + + if (level > lockres->l_level) { + local_flags = lkm_flags | DLM_LKF_VALBLK; + if (lockres->l_level != DLM_LOCK_IV) + local_flags |= DLM_LKF_CONVERT; + + lockres->l_requested = level; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + BUG_ON(level == DLM_LOCK_IV); + BUG_ON(level == DLM_LOCK_NL); + + /* call dlm_lock to upgrade lock now */ + status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb, + local_flags, lockres->l_name, + lockres->l_namelen); + if (status) { + if ((lkm_flags & DLM_LKF_NOQUEUE) && + (status != -EAGAIN)) + user_log_dlm_error("ocfs2_dlm_lock", + status, lockres); + user_recover_from_dlm_error(lockres); + goto bail; + } + + user_wait_on_busy_lock(lockres); + goto again; + } + + user_dlm_inc_holders(lockres, level); + spin_unlock(&lockres->l_lock); + + status = 0; +bail: + return status; +} + +static inline void user_dlm_dec_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case DLM_LOCK_EX: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case DLM_LOCK_PR: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } +} + +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level) +{ + if (level != DLM_LOCK_EX && + level != DLM_LOCK_PR) { + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); + return; + } + + spin_lock(&lockres->l_lock); + user_dlm_dec_holders(lockres, level); + __user_dlm_cond_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); +} + +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < DLM_LOCK_EX); + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + memcpy(lvb, val, len); + + spin_unlock(&lockres->l_lock); +} + +ssize_t user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb; + ssize_t ret = len; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < DLM_LOCK_PR); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + memcpy(val, lvb, len); + } else + ret = 0; + + spin_unlock(&lockres->l_lock); + return ret; +} + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry) +{ + memset(lockres, 0, sizeof(*lockres)); + + spin_lock_init(&lockres->l_lock); + init_waitqueue_head(&lockres->l_event); + lockres->l_level = DLM_LOCK_IV; + lockres->l_requested = DLM_LOCK_IV; + lockres->l_blocking = DLM_LOCK_IV; + + /* should have been checked before getting here. 
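The helpers above (user_dlm_cluster_lock(), user_dlm_write_lvb(), user_dlm_read_lvb() and user_dlm_cluster_unlock()) are the whole per-lock API that dlmfs builds on. A minimal sketch of how a caller is expected to combine them, using the lock resource embedded in the dlmfs inode; the helper name is hypothetical and error handling is abbreviated:

	static int example_update_lvb(struct inode *inode,
				      const char *val, unsigned int len)
	{
		struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
		int status;

		/* Take an exclusive cluster lock; this may sleep while a
		 * busy or blocked lock is resolved. */
		status = user_dlm_cluster_lock(lockres, DLM_LOCK_EX, 0);
		if (status)
			return status;

		/* The LVB may only be written while holding EX. */
		user_dlm_write_lvb(inode, val, len);

		/* Dropping the last holder lets the unblock worker
		 * downconvert on behalf of any blocked node. */
		user_dlm_cluster_unlock(lockres, DLM_LOCK_EX);
		return 0;
	}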
*/ + BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); + + memcpy(lockres->l_name, + dentry->d_name.name, + dentry->d_name.len); + lockres->l_namelen = dentry->d_name.len; +} + +int user_dlm_destroy_lock(struct user_lock_res *lockres) +{ + int status = -EBUSY; + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); + + mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); + + spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + + while (lockres->l_flags & USER_LOCK_BUSY) { + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + + spin_lock(&lockres->l_lock); + } + + if (lockres->l_ro_holders || lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + goto bail; + } + + status = 0; + if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + spin_unlock(&lockres->l_lock); + goto bail; + } + + lockres->l_flags &= ~USER_LOCK_ATTACHED; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); + if (status) { + user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); + goto bail; + } + + user_wait_on_busy_lock(lockres); + + status = 0; +bail: + return status; +} + +static void user_dlm_recovery_handler_noop(int node_num, + void *recovery_data) +{ + /* We ignore recovery events */ + return; +} + +void user_dlm_set_locking_protocol(void) +{ + ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version); +} + +struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name) +{ + int rc; + struct ocfs2_cluster_connection *conn; + + rc = ocfs2_cluster_connect_agnostic(name->name, name->len, + &user_dlm_lproto, + user_dlm_recovery_handler_noop, + NULL, &conn); + if (rc) + mlog_errno(rc); + + return rc ? ERR_PTR(rc) : conn; +} + +void user_dlm_unregister(struct ocfs2_cluster_connection *conn) +{ + ocfs2_cluster_disconnect(conn, 0); +} diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h new file mode 100644 index 00000000..3b42d795 --- /dev/null +++ b/fs/ocfs2/dlmfs/userdlm.h @@ -0,0 +1,113 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.h + * + * Userspace dlm defines + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef USERDLM_H +#define USERDLM_H + +#include +#include +#include +#include + +/* user_lock_res->l_flags flags. 
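Together with user_dlm_lock_res_init() above, the registration helpers give dlmfs its setup and teardown sequence. A rough sketch of the expected lifecycle, assuming the caller owns the qstr and dentry (the function name is illustrative only):

	static int example_domain_lifecycle(struct qstr *domain,
					    struct dentry *dentry,
					    struct user_lock_res *lockres)
	{
		struct ocfs2_cluster_connection *conn;
		int status;

		/* Join the fs-agnostic cluster domain named by the qstr. */
		conn = user_dlm_register(domain);
		if (IS_ERR(conn))
			return PTR_ERR(conn);

		/* Name the lock after the dentry; the name must fit in
		 * USER_DLM_LOCK_ID_MAX_LEN. */
		user_dlm_lock_res_init(lockres, dentry);

		/* ... user_dlm_cluster_lock()/unlock() traffic ... */

		/* Returns -EBUSY while holders remain, 0 once torn down. */
		status = user_dlm_destroy_lock(lockres);

		user_dlm_unregister(conn);
		return status;
	}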
*/ +#define USER_LOCK_ATTACHED (0x00000001) /* we have initialized + * the lvb */ +#define USER_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently + * destroying this + * lock. */ +#define USER_LOCK_QUEUED (0x00000010) /* lock is on the + * workqueue */ +#define USER_LOCK_IN_CANCEL (0x00000020) + +struct user_lock_res { + spinlock_t l_lock; + + int l_flags; + +#define USER_DLM_LOCK_ID_MAX_LEN 32 + char l_name[USER_DLM_LOCK_ID_MAX_LEN]; + int l_namelen; + int l_level; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + struct ocfs2_dlm_lksb l_lksb; + + int l_requested; + int l_blocking; + + wait_queue_head_t l_event; + + struct work_struct l_work; +}; + +extern struct workqueue_struct *user_dlm_worker; + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry); +int user_dlm_destroy_lock(struct user_lock_res *lockres); +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags); +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level); +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len); +ssize_t user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len); +struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name); +void user_dlm_unregister(struct ocfs2_cluster_connection *conn); +void user_dlm_set_locking_protocol(void); + +struct dlmfs_inode_private { + struct ocfs2_cluster_connection *ip_conn; + + struct user_lock_res ip_lockres; /* unused for directories. */ + struct inode *ip_parent; + + struct inode ip_vfs_inode; +}; + +static inline struct dlmfs_inode_private * +DLMFS_I(struct inode *inode) +{ + return container_of(inode, + struct dlmfs_inode_private, + ip_vfs_inode); +} + +struct dlmfs_filp_private { + int fp_lock_level; +}; + +#define DLMFS_MAGIC 0x76a9f425 + +#endif /* USERDLM_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c new file mode 100644 index 00000000..7642d7ca --- /dev/null +++ b/fs/ocfs2/dlmglue.c @@ -0,0 +1,4033 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.c + * + * Code which implements an OCFS2 specific interface to our DLM. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_DLM_GLUE +#include + +#include "ocfs2.h" +#include "ocfs2_lockingver.h" + +#include "alloc.h" +#include "dcache.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "stackglue.h" +#include "slot_map.h" +#include "super.h" +#include "uptodate.h" +#include "quota.h" +#include "refcounttree.h" + +#include "buffer_head_io.h" + +struct ocfs2_mask_waiter { + struct list_head mw_item; + int mw_status; + struct completion mw_complete; + unsigned long mw_mask; + unsigned long mw_goal; +#ifdef CONFIG_OCFS2_FS_STATS + ktime_t mw_lock_start; +#endif +}; + +static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres); + +/* + * Return value from ->downconvert_worker functions. + * + * These control the precise actions of ocfs2_unblock_lock() + * and ocfs2_process_blocked_lock() + * + */ +enum ocfs2_unblock_action { + UNBLOCK_CONTINUE = 0, /* Continue downconvert */ + UNBLOCK_CONTINUE_POST = 1, /* Continue downconvert, fire + * ->post_unlock callback */ + UNBLOCK_STOP_POST = 2, /* Do not downconvert, fire + * ->post_unlock() callback. */ +}; + +struct ocfs2_unblock_ctl { + int requeue; + enum ocfs2_unblock_action unblock_action; +}; + +/* Lockdep class keys */ +struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; + +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres); + +static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); + +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) + +/* This aids in debugging situations where a bad LVB might be involved. 
*/ +static void ocfs2_dump_meta_lvb_info(u64 level, + const char *function, + unsigned int line, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + mlog(level, "LVB information for %s (called from %s:%u):\n", + lockres->l_name, function, line); + mlog(level, "version: %u, clusters: %u, generation: 0x%x\n", + lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters), + be32_to_cpu(lvb->lvb_igeneration)); + mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n", + (unsigned long long)be64_to_cpu(lvb->lvb_isize), + be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid), + be16_to_cpu(lvb->lvb_imode)); + mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, " + "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink), + (long long)be64_to_cpu(lvb->lvb_iatime_packed), + (long long)be64_to_cpu(lvb->lvb_ictime_packed), + (long long)be64_to_cpu(lvb->lvb_imtime_packed), + be32_to_cpu(lvb->lvb_iattr)); +} + + +/* + * OCFS2 Lock Resource Operations + * + * These fine tune the behavior of the generic dlmglue locking infrastructure. + * + * The most basic of lock types can point ->l_priv to their respective + * struct ocfs2_super and allow the default actions to manage things. + * + * Right now, each lock type also needs to implement an init function, + * and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres() + * should be called when the lock is no longer needed (i.e., object + * destruction time). + */ +struct ocfs2_lock_res_ops { + /* + * Translate an ocfs2_lock_res * into an ocfs2_super *. Define + * this callback if ->l_priv is not an ocfs2_super pointer + */ + struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); + + /* + * Optionally called in the downconvert thread after a + * successful downconvert. The lockres will not be referenced + * after this callback is called, so it is safe to free + * memory, etc. + * + * The exact semantics of when this is called are controlled + * by ->downconvert_worker() + */ + void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *); + + /* + * Allow a lock type to add checks to determine whether it is + * safe to downconvert a lock. Return 0 to re-queue the + * downconvert at a later time, nonzero to continue. + * + * For most locks, the default checks that there are no + * incompatible holders are sufficient. + * + * Called with the lockres spinlock held. + */ + int (*check_downconvert)(struct ocfs2_lock_res *, int); + + /* + * Allows a lock type to populate the lock value block. This + * is called on downconvert, and when we drop a lock. + * + * Locks that want to use this should set LOCK_TYPE_USES_LVB + * in the flags field. + * + * Called with the lockres spinlock held. + */ + void (*set_lvb)(struct ocfs2_lock_res *); + + /* + * Called from the downconvert thread when it is determined + * that a lock will be downconverted. This is called without + * any locks held so the function can do work that might + * schedule (syncing out data, etc). + * + * This should return any one of the ocfs2_unblock_action + * values, depending on what it wants the thread to do. + */ + int (*downconvert_worker)(struct ocfs2_lock_res *, int); + + /* + * LOCK_TYPE_* flags which describe the specific requirements + * of a lock type. Descriptions of each individual flag follow. + */ + int flags; +}; + +/* + * Some locks want to "refresh" potentially stale data when a + * meaningful (PRMODE or EXMODE) lock level is first obtained. 
If this + * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the + * individual lockres l_flags member from the ast function. It is + * expected that the locking wrapper will clear the + * OCFS2_LOCK_NEEDS_REFRESH flag when done. + */ +#define LOCK_TYPE_REQUIRES_REFRESH 0x1 + +/* + * Indicate that a lock type makes use of the lock value block. The + * ->set_lvb lock type callback must be defined. + */ +#define LOCK_TYPE_USES_LVB 0x2 + +static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { + .get_osb = ocfs2_get_inode_osb, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { + .get_osb = ocfs2_get_inode_osb, + .check_downconvert = ocfs2_check_meta_downconvert, + .set_lvb = ocfs2_set_meta_lvb, + .downconvert_worker = ocfs2_data_convert_worker, + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + +static struct ocfs2_lock_res_ops ocfs2_super_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH, +}; + +static struct ocfs2_lock_res_ops ocfs2_rename_lops = { + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + +static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { + .get_osb = ocfs2_get_dentry_osb, + .post_unlock = ocfs2_dentry_post_unlock, + .downconvert_worker = ocfs2_dentry_convert_worker, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { + .get_osb = ocfs2_get_inode_osb, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_flock_lops = { + .get_osb = ocfs2_get_file_osb, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { + .set_lvb = ocfs2_set_qinfo_lvb, + .get_osb = ocfs2_get_qinfo_osb, + .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, +}; + +static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { + .check_downconvert = ocfs2_check_refcount_downconvert, + .downconvert_worker = ocfs2_refcount_convert_worker, + .flags = 0, +}; + +static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_META || + lockres->l_type == OCFS2_LOCK_TYPE_RW || + lockres->l_type == OCFS2_LOCK_TYPE_OPEN; +} + +static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) +{ + return container_of(lksb, struct ocfs2_lock_res, l_lksb); +} + +static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + return (struct inode *) lockres->l_priv; +} + +static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres) +{ + BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY); + + return (struct ocfs2_dentry_lock *)lockres->l_priv; +} + +static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres) +{ + BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO); + + return (struct ocfs2_mem_dqinfo *)lockres->l_priv; +} + +static inline struct ocfs2_refcount_tree * +ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res) +{ + return container_of(res, struct ocfs2_refcount_tree, rf_lockres); +} + +static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) +{ + if (lockres->l_ops->get_osb) + return lockres->l_ops->get_osb(lockres); + + return (struct ocfs2_super *)lockres->l_priv; +} + +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 dlm_flags); +static inline 
int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted); +static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, unsigned long caller_ip); +static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level) +{ + __ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_); +} + +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert); +#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ + if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ + _err, _func, _lockres->l_name); \ + else \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \ + _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \ + (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \ +} while (0) +static int ocfs2_downconvert_thread(void *arg); +static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_inode_lock_update(struct inode *inode, + struct buffer_head **bh); +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); +static inline int ocfs2_highest_compat_lock_level(int level); +static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_downconvert_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int new_level, + int lvb, + unsigned int generation); +static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + + +static void ocfs2_build_lock_name(enum ocfs2_lock_type type, + u64 blkno, + u32 generation, + char *name) +{ + int len; + + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); + + len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x", + ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, + (long long)blkno, generation); + + BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); + + mlog(0, "built lock resource with name: %s\n", name); +} + +static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); + +static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, + struct ocfs2_dlm_debug *dlm_debug) +{ + mlog(0, "Add tracking for lockres %s\n", res->l_name); + + spin_lock(&ocfs2_dlm_tracking_lock); + list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) +{ + spin_lock(&ocfs2_dlm_tracking_lock); + if (!list_empty(&res->l_debug_list)) + list_del_init(&res->l_debug_list); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +#ifdef CONFIG_OCFS2_FS_STATS +static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ + res->l_lock_refresh = 0; + memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats)); + memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats)); +} + +static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int 
level, + struct ocfs2_mask_waiter *mw, int ret) +{ + u32 usec; + ktime_t kt; + struct ocfs2_lock_stats *stats; + + if (level == LKM_PRMODE) + stats = &res->l_lock_prmode; + else if (level == LKM_EXMODE) + stats = &res->l_lock_exmode; + else + return; + + kt = ktime_sub(ktime_get(), mw->mw_lock_start); + usec = ktime_to_us(kt); + + stats->ls_gets++; + stats->ls_total += ktime_to_ns(kt); + /* overflow */ + if (unlikely(stats->ls_gets) == 0) { + stats->ls_gets++; + stats->ls_total = ktime_to_ns(kt); + } + + if (stats->ls_max < usec) + stats->ls_max = usec; + + if (ret) + stats->ls_fail++; +} + +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ + lockres->l_lock_refresh++; +} + +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ + mw->mw_lock_start = ktime_get(); +} +#else +static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ +} +static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, + int level, struct ocfs2_mask_waiter *mw, int ret) +{ +} +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ +} +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ +} +#endif + +static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, + struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + struct ocfs2_lock_res_ops *ops, + void *priv) +{ + res->l_type = type; + res->l_ops = ops; + res->l_priv = priv; + + res->l_level = DLM_LOCK_IV; + res->l_requested = DLM_LOCK_IV; + res->l_blocking = DLM_LOCK_IV; + res->l_action = OCFS2_AST_INVALID; + res->l_unlock_action = OCFS2_UNLOCK_INVALID; + + res->l_flags = OCFS2_LOCK_INITIALIZED; + + ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); + + ocfs2_init_lock_stats(res); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (type != OCFS2_LOCK_TYPE_OPEN) + lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type], + &lockdep_keys[type], 0); + else + res->l_lockdep_map.key = NULL; +#endif +} + +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) +{ + /* This also clears out the lock status block */ + memset(res, 0, sizeof(struct ocfs2_lock_res)); + spin_lock_init(&res->l_lock); + init_waitqueue_head(&res->l_event); + INIT_LIST_HEAD(&res->l_blocked_list); + INIT_LIST_HEAD(&res->l_mask_waiters); +} + +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + unsigned int generation, + struct inode *inode) +{ + struct ocfs2_lock_res_ops *ops; + + switch(type) { + case OCFS2_LOCK_TYPE_RW: + ops = &ocfs2_inode_rw_lops; + break; + case OCFS2_LOCK_TYPE_META: + ops = &ocfs2_inode_inode_lops; + break; + case OCFS2_LOCK_TYPE_OPEN: + ops = &ocfs2_inode_open_lops; + break; + default: + mlog_bug_on_msg(1, "type: %d\n", type); + ops = NULL; /* thanks, gcc */ + break; + }; + + ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, + generation, res->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode); +} + +static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return OCFS2_SB(inode->i_sb); +} + +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_mem_dqinfo *info = lockres->l_priv; + + return OCFS2_SB(info->dqi_gi.dqi_sb); +} + +static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_file_private *fp = lockres->l_priv; + + return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb); +} + +static __u64 
ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) +{ + __be64 inode_blkno_be; + + memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], + sizeof(__be64)); + + return be64_to_cpu(inode_blkno_be); +} + +static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_dentry_lock *dl = lockres->l_priv; + + return OCFS2_SB(dl->dl_inode->i_sb); +} + +void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, + u64 parent, struct inode *inode) +{ + int len; + u64 inode_blkno = OCFS2_I(inode)->ip_blkno; + __be64 inode_blkno_be = cpu_to_be64(inode_blkno); + struct ocfs2_lock_res *lockres = &dl->dl_lockres; + + ocfs2_lock_res_init_once(lockres); + + /* + * Unfortunately, the standard lock naming scheme won't work + * here because we have two 16 byte values to use. Instead, + * we'll stuff the inode number as a binary value. We still + * want error prints to show something without garbling the + * display, so drop a null byte in there before the inode + * number. A future version of OCFS2 will likely use all + * binary lock names. The stringified names have been a + * tremendous aid in debugging, but now that the debugfs + * interface exists, we can mangle things there if need be. + * + * NOTE: We also drop the standard "pad" value (the total lock + * name size stays the same though - the last part is all + * zeros due to the memset in ocfs2_lock_res_init_once() + */ + len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START, + "%c%016llx", + ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY), + (long long)parent); + + BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1)); + + memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be, + sizeof(__be64)); + + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, + OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops, + dl); +} + +static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Superblock lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO, + 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, + &ocfs2_super_lops, osb); +} + +static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Rename lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, + &ocfs2_rename_lops, osb); +} + +static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* nfs_sync lockres doesn't come from a slab so we call init + * once on it manually. 
*/ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC, + &ocfs2_nfs_sync_lops, osb); +} + +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN, + &ocfs2_orphan_scan_lops, osb); +} + +void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_file_private *fp) +{ + struct inode *inode = fp->fp_file->f_mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno, + inode->i_generation, lockres->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, + OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops, + fp); + lockres->l_flags |= OCFS2_LOCK_NOCACHE; +} + +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type, + 0, lockres->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres, + OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops, + info); +} + +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno, + generation, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT, + &ocfs2_refcount_block_lops, osb); +} + +void ocfs2_lock_res_free(struct ocfs2_lock_res *res) +{ + if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) + return; + + ocfs2_remove_lockres_tracking(res); + + mlog_bug_on_msg(!list_empty(&res->l_blocked_list), + "Lockres %s is on the blocked list\n", + res->l_name); + mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), + "Lockres %s has mask waiters pending\n", + res->l_name); + mlog_bug_on_msg(spin_is_locked(&res->l_lock), + "Lockres %s is locked\n", + res->l_name); + mlog_bug_on_msg(res->l_ro_holders, + "Lockres %s has %u ro holders\n", + res->l_name, res->l_ro_holders); + mlog_bug_on_msg(res->l_ex_holders, + "Lockres %s has %u ex holders\n", + res->l_name, res->l_ex_holders); + + /* Need to clear out the lock status block for the dlm */ + memset(&res->l_lksb, 0, sizeof(res->l_lksb)); + + res->l_flags = 0UL; +} + +static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, + int level) +{ + BUG_ON(!lockres); + + switch(level) { + case DLM_LOCK_EX: + lockres->l_ex_holders++; + break; + case DLM_LOCK_PR: + lockres->l_ro_holders++; + break; + default: + BUG(); + } +} + +static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, + int level) +{ + BUG_ON(!lockres); + + switch(level) { + case DLM_LOCK_EX: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case DLM_LOCK_PR: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } +} + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. 
*/ +static inline int ocfs2_highest_compat_lock_level(int level) +{ + int new_level = DLM_LOCK_EX; + + if (level == DLM_LOCK_EX) + new_level = DLM_LOCK_NL; + else if (level == DLM_LOCK_PR) + new_level = DLM_LOCK_PR; + return new_level; +} + +static void lockres_set_flags(struct ocfs2_lock_res *lockres, + unsigned long newflags) +{ + struct ocfs2_mask_waiter *mw, *tmp; + + assert_spin_locked(&lockres->l_lock); + + lockres->l_flags = newflags; + + list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) { + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + continue; + + list_del_init(&mw->mw_item); + mw->mw_status = 0; + complete(&mw->mw_complete); + } +} +static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) +{ + lockres_set_flags(lockres, lockres->l_flags | or); +} +static void lockres_clear_flags(struct ocfs2_lock_res *lockres, + unsigned long clear) +{ + lockres_set_flags(lockres, lockres->l_flags & ~clear); +} + +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); + + lockres->l_level = lockres->l_requested; + if (lockres->l_level <= + ocfs2_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = DLM_LOCK_NL; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + } + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +} + +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + + /* Convert from RO to EX doesn't really need anything as our + * information is already up to data. Convert from NL to + * *anything* however should mark ourselves as needing an + * update */ + if (lockres->l_level == DLM_LOCK_NL && + lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + + /* + * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing + * the OCFS2_LOCK_BUSY flag to prevent the dc thread from + * downconverting the lock before the upconvert has fully completed. + */ + lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +} + +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) +{ + BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + + if (lockres->l_requested > DLM_LOCK_NL && + !(lockres->l_flags & OCFS2_LOCK_LOCAL) && + lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +} + +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, + int level) +{ + int needs_downconvert = 0; + + assert_spin_locked(&lockres->l_lock); + + if (level > lockres->l_blocking) { + /* only schedule a downconvert if we haven't already scheduled + * one that goes low enough to satisfy the level we're + * blocking. 
this also catches the case where we get + * duplicate BASTs */ + if (ocfs2_highest_compat_lock_level(level) < + ocfs2_highest_compat_lock_level(lockres->l_blocking)) + needs_downconvert = 1; + + lockres->l_blocking = level; + } + + mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n", + lockres->l_name, level, lockres->l_level, lockres->l_blocking, + needs_downconvert); + + if (needs_downconvert) + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + mlog(0, "needs_downconvert = %d\n", needs_downconvert); + return needs_downconvert; +} + +/* + * OCFS2_LOCK_PENDING and l_pending_gen. + * + * Why does OCFS2_LOCK_PENDING exist? To close a race between setting + * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock() + * for more details on the race. + * + * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces + * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock() + * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear + * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns, + * the caller is going to try to clear PENDING again. If nothing else is + * happening, __lockres_clear_pending() sees PENDING is unset and does + * nothing. + * + * But what if another path (eg downconvert thread) has just started a + * new locking action? The other path has re-set PENDING. Our path + * cannot clear PENDING, because that will re-open the original race + * window. + * + * [Example] + * + * ocfs2_meta_lock() + * ocfs2_cluster_lock() + * set BUSY + * set PENDING + * drop l_lock + * ocfs2_dlm_lock() + * ocfs2_locking_ast() ocfs2_downconvert_thread() + * clear PENDING ocfs2_unblock_lock() + * take_l_lock + * !BUSY + * ocfs2_prepare_downconvert() + * set BUSY + * set PENDING + * drop l_lock + * take l_lock + * clear PENDING + * drop l_lock + * + * ocfs2_dlm_lock() + * + * So as you can see, we now have a window where l_lock is not held, + * PENDING is not set, and ocfs2_dlm_lock() has not been called. + * + * The core problem is that ocfs2_cluster_lock() has cleared the PENDING + * set by ocfs2_prepare_downconvert(). That wasn't nice. + * + * To solve this we introduce l_pending_gen. A call to + * lockres_clear_pending() will only do so when it is passed a generation + * number that matches the lockres. lockres_set_pending() will return the + * current generation number. When ocfs2_cluster_lock() goes to clear + * PENDING, it passes the generation it got from set_pending(). In our + * example above, the generation numbers will *not* match. Thus, + * ocfs2_cluster_lock() will not clear the PENDING set by + * ocfs2_prepare_downconvert(). + */ + +/* Unlocked version for ocfs2_locking_ast() */ +static void __lockres_clear_pending(struct ocfs2_lock_res *lockres, + unsigned int generation, + struct ocfs2_super *osb) +{ + assert_spin_locked(&lockres->l_lock); + + /* + * The ast and locking functions can race us here. The winner + * will clear pending, the loser will not. + */ + if (!(lockres->l_flags & OCFS2_LOCK_PENDING) || + (lockres->l_pending_gen != generation)) + return; + + lockres_clear_flags(lockres, OCFS2_LOCK_PENDING); + lockres->l_pending_gen++; + + /* + * The downconvert thread may have skipped us because we + * were PENDING. Wake it up. 
+ */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(osb); +} + +/* Locked version for callers of ocfs2_dlm_lock() */ +static void lockres_clear_pending(struct ocfs2_lock_res *lockres, + unsigned int generation, + struct ocfs2_super *osb) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + __lockres_clear_pending(lockres, generation, osb); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + + lockres_or_flags(lockres, OCFS2_LOCK_PENDING); + + return lockres->l_pending_gen; +} + +static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + int needs_downconvert; + unsigned long flags; + + BUG_ON(level <= DLM_LOCK_NL); + + mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, " + "type %s\n", lockres->l_name, level, lockres->l_level, + ocfs2_lock_type_string(lockres->l_type)); + + /* + * We can skip the bast for locks which don't enable caching - + * they'll be dropped at the earliest possible time anyway. + */ + if (lockres->l_flags & OCFS2_LOCK_NOCACHE) + return; + + spin_lock_irqsave(&lockres->l_lock, flags); + needs_downconvert = ocfs2_generic_handle_bast(lockres, level); + if (needs_downconvert) + ocfs2_schedule_blocked_lock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + + ocfs2_wake_downconvert_thread(osb); +} + +static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + unsigned long flags; + int status; + + spin_lock_irqsave(&lockres->l_lock, flags); + + status = ocfs2_dlm_lock_status(&lockres->l_lksb); + + if (status == -EAGAIN) { + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + goto out; + } + + if (status) { + mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n", + lockres->l_name, status); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, " + "level %d => %d\n", lockres->l_name, lockres->l_action, + lockres->l_unlock_action, lockres->l_level, lockres->l_requested); + + switch(lockres->l_action) { + case OCFS2_AST_ATTACH: + ocfs2_generic_handle_attach_action(lockres); + lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); + break; + case OCFS2_AST_CONVERT: + ocfs2_generic_handle_convert_action(lockres); + break; + case OCFS2_AST_DOWNCONVERT: + ocfs2_generic_handle_downconvert_action(lockres); + break; + default: + mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, " + "flags 0x%lx, unlock: %u\n", + lockres->l_name, lockres->l_action, lockres->l_flags, + lockres->l_unlock_action); + BUG(); + } +out: + /* set it to something invalid so if we get called again we + * can catch it. */ + lockres->l_action = OCFS2_AST_INVALID; + + /* Did we try to cancel this lock? Clear that state */ + if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + + /* + * We may have beaten the locking functions here. We certainly + * know that dlm_lock() has been called :-) + * Because we can't have two lock calls in flight at once, we + * can use lockres->l_pending_gen. 
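The set/clear bracket described above looks the same at every ocfs2_dlm_lock() call site; ocfs2_lock_create() and __ocfs2_cluster_lock() further down follow it exactly. A condensed sketch of the pattern, with the l_action/l_requested bookkeeping omitted and a hypothetical function name:

	static int example_locking_action(struct ocfs2_super *osb,
					  struct ocfs2_lock_res *lockres,
					  int level, u32 dlm_flags)
	{
		unsigned long flags;
		unsigned int gen;
		int ret;

		/* Under l_lock: mark the lock busy and record the
		 * generation of this locking action. */
		spin_lock_irqsave(&lockres->l_lock, flags);
		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
		gen = lockres_set_pending(lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb,
				     dlm_flags, lockres->l_name,
				     OCFS2_LOCK_ID_MAX_LEN - 1);

		/* Only clears PENDING if no newer locking action (e.g. one
		 * started by the downconvert thread) has bumped
		 * l_pending_gen since we set it above. */
		lockres_clear_pending(lockres, gen, osb);
		return ret;
	}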
+ */ + __lockres_clear_pending(lockres, lockres->l_pending_gen, osb); + + wake_up(&lockres->l_event); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + unsigned long flags; + + mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n", + lockres->l_name, lockres->l_unlock_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + if (error) { + mlog(ML_ERROR, "Dlm passes error %d for lock %s, " + "unlock_action %d\n", error, lockres->l_name, + lockres->l_unlock_action); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + switch(lockres->l_unlock_action) { + case OCFS2_UNLOCK_CANCEL_CONVERT: + mlog(0, "Cancel convert success for %s\n", lockres->l_name); + lockres->l_action = OCFS2_AST_INVALID; + /* Downconvert thread may have requeued this lock, we + * need to wake it. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres)); + break; + case OCFS2_UNLOCK_DROP_LOCK: + lockres->l_level = DLM_LOCK_IV; + break; + default: + BUG(); + } + + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + wake_up(&lockres->l_event); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +/* + * This is the filesystem locking protocol. It provides the lock handling + * hooks for the underlying DLM. It has a maximum version number. + * The version number allows interoperability with systems running at + * the same major number and an equal or smaller minor number. + * + * Whenever the filesystem does new things with locks (adds or removes a + * lock, orders them differently, does different things underneath a lock), + * the version must be changed. The protocol is negotiated when joining + * the dlm domain. A node may join the domain if its major version is + * identical to all other nodes and its minor version is greater than + * or equal to all other nodes. When its minor version is greater than + * the other nodes, it will run at the minor version specified by the + * other nodes. + * + * If a locking change is made that will not be compatible with older + * versions, the major number must be increased and the minor version set + * to zero. If a change merely adds a behavior that can be disabled when + * speaking to older versions, the minor version must be increased. If a + * change adds a fully backwards compatible change (eg, LVB changes that + * are just ignored by older versions), the version does not need to be + * updated. 
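To make the negotiation rule above concrete, it boils down to a check along these lines; the helper name is hypothetical, and the version pair type is assumed to be the one stackglue uses for lp_max_version:

	static int example_can_join(struct ocfs2_protocol_version *domain,
				    struct ocfs2_protocol_version *ours)
	{
		/* Major versions must match exactly. */
		if (domain->pv_major != ours->pv_major)
			return 0;

		/* We may join if our maximum minor is at least the domain's;
		 * when ours is higher, we run at the domain's minor. */
		return ours->pv_minor >= domain->pv_minor;
	}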
+ */ +static struct ocfs2_locking_protocol lproto = { + .lp_max_version = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, + }, + .lp_lock_ast = ocfs2_locking_ast, + .lp_blocking_ast = ocfs2_blocking_ast, + .lp_unlock_ast = ocfs2_unlock_ast, +}; + +void ocfs2_set_locking_protocol(void) +{ + ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version); +} + +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + if (convert) + lockres->l_action = OCFS2_AST_INVALID; + else + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); +} + +/* Note: If we detect another process working on the lock (i.e., + * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller + * to do the right thing in that case. + */ +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 dlm_flags) +{ + int ret = 0; + unsigned long flags; + unsigned int gen; + + mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, + dlm_flags); + + spin_lock_irqsave(&lockres->l_lock, flags); + if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || + (lockres->l_flags & OCFS2_LOCK_BUSY)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + lockres->l_action = OCFS2_AST_ATTACH; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + gen = lockres_set_pending(lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_dlm_lock(osb->cconn, + level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1); + lockres_clear_pending(lockres, gen, osb); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); + ocfs2_recover_from_dlm_error(lockres, 1); + } + + mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); + +bail: + return ret; +} + +static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, + int flag) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = lockres->l_flags & flag; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; +} + +static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); +} + +static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. 
*/ +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + + return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); +} + +static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) +{ + INIT_LIST_HEAD(&mw->mw_item); + init_completion(&mw->mw_complete); + ocfs2_init_start_time(mw); +} + +static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) +{ + wait_for_completion(&mw->mw_complete); + /* Re-arm the completion in case we want to wait on it again */ + INIT_COMPLETION(mw->mw_complete); + return mw->mw_status; +} + +static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw, + unsigned long mask, + unsigned long goal) +{ + BUG_ON(!list_empty(&mw->mw_item)); + + assert_spin_locked(&lockres->l_lock); + + list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); + mw->mw_mask = mask; + mw->mw_goal = goal; +} + +/* returns 0 if the mw that was removed was already satisfied, -EBUSY + * if the mask still hadn't reached its goal */ +static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + if (!list_empty(&mw->mw_item)) { + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + ret = -EBUSY; + + list_del_init(&mw->mw_item); + init_completion(&mw->mw_complete); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; + +} + +static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ret = wait_for_completion_interruptible(&mw->mw_complete); + if (ret) + lockres_remove_mask_waiter(lockres, mw); + else + ret = mw->mw_status; + /* Re-arm the completion in case we want to wait on it again */ + INIT_COMPLETION(mw->mw_complete); + return ret; +} + +static int __ocfs2_cluster_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 lkm_flags, + int arg_flags, + int l_subclass, + unsigned long caller_ip) +{ + struct ocfs2_mask_waiter mw; + int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); + int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ + unsigned long flags; + unsigned int gen; + int noqueue_attempted = 0; + + ocfs2_init_mask_waiter(&mw); + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lkm_flags |= DLM_LKF_VALBLK; + +again: + wait = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + + if (catch_signals && signal_pending(current)) { + ret = -ERESTARTSYS; + goto unlock; + } + + mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, + "Cluster lock called on freeing lockres %s! flags " + "0x%lx\n", lockres->l_name, lockres->l_flags); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if (lockres->l_flags & OCFS2_LOCK_BUSY && + level > lockres->l_level) { + /* is someone sitting in dlm_lock? If so, wait on + * them. */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + wait = 1; + goto unlock; + } + + if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) { + /* + * We've upconverted. If the lock now has a level we can + * work with, we take it. If, however, the lock is not at the + * required level, we go thru the full cycle. 
One way this could + * happen is if a process requesting an upconvert to PR is + * closely followed by another requesting upconvert to an EX. + * If the process requesting EX lands here, we want it to + * continue attempting to upconvert and let the process + * requesting PR take the lock. + * If multiple processes request upconvert to PR, the first one + * here will take the lock. The others will have to go thru the + * OCFS2_LOCK_BLOCKED check to ensure that there is no pending + * downconvert request. + */ + if (level <= lockres->l_level) + goto update_holders; + } + + if (lockres->l_flags & OCFS2_LOCK_BLOCKED && + !ocfs2_may_continue_on_blocked_lock(lockres, level)) { + /* is the lock is currently blocked on behalf of + * another node */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); + wait = 1; + goto unlock; + } + + if (level > lockres->l_level) { + if (noqueue_attempted > 0) { + ret = -EAGAIN; + goto unlock; + } + if (lkm_flags & DLM_LKF_NOQUEUE) + noqueue_attempted = 1; + + if (lockres->l_action != OCFS2_AST_INVALID) + mlog(ML_ERROR, "lockres %s has action %u pending\n", + lockres->l_name, lockres->l_action); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + lockres->l_action = OCFS2_AST_ATTACH; + lkm_flags &= ~DLM_LKF_CONVERT; + } else { + lockres->l_action = OCFS2_AST_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; + } + + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + gen = lockres_set_pending(lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + BUG_ON(level == DLM_LOCK_IV); + BUG_ON(level == DLM_LOCK_NL); + + mlog(ML_BASTS, "lockres %s, convert from %d to %d\n", + lockres->l_name, lockres->l_level, level); + + /* call dlm_lock to upgrade lock now */ + ret = ocfs2_dlm_lock(osb->cconn, + level, + &lockres->l_lksb, + lkm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1); + lockres_clear_pending(lockres, gen, osb); + if (ret) { + if (!(lkm_flags & DLM_LKF_NOQUEUE) || + (ret != -EAGAIN)) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", + ret, lockres); + } + ocfs2_recover_from_dlm_error(lockres, 1); + goto out; + } + + mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", + lockres->l_name); + + /* At this point we've gone inside the dlm and need to + * complete our work regardless. */ + catch_signals = 0; + + /* wait for busy to clear and carry on */ + goto again; + } + +update_holders: + /* Ok, if we get here then we're good to go. */ + ocfs2_inc_holders(lockres, level); + + ret = 0; +unlock: + lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + + spin_unlock_irqrestore(&lockres->l_lock, flags); +out: + /* + * This is helping work around a lock inversion between the page lock + * and dlm locks. One path holds the page lock while calling aops + * which block acquiring dlm locks. The voting thread holds dlm + * locks while acquiring page locks while down converting data locks. + * This block is helping an aop path notice the inversion and back + * off to unlock its page lock before trying the dlm lock again. 
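+ *
+ * (Cross-reference: ocfs2_inode_lock_with_page(), later in this
+ * file, is the aop-side half of this dance -- it turns the -EAGAIN
+ * generated here into AOP_TRUNCATED_PAGE after unlocking the page.)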
+ */ + if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && + mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { + wait = 0; + if (lockres_remove_mask_waiter(lockres, &mw)) + ret = -EAGAIN; + else + goto again; + } + if (wait) { + ret = ocfs2_wait_for_mask(&mw); + if (ret == 0) + goto again; + mlog_errno(ret); + } + ocfs2_update_lock_stats(lockres, level, &mw, ret); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!ret && lockres->l_lockdep_map.key != NULL) { + if (level == DLM_LOCK_PR) + rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass, + !!(arg_flags & OCFS2_META_LOCK_NOQUEUE), + caller_ip); + else + rwsem_acquire(&lockres->l_lockdep_map, l_subclass, + !!(arg_flags & OCFS2_META_LOCK_NOQUEUE), + caller_ip); + } +#endif + return ret; +} + +static inline int ocfs2_cluster_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + u32 lkm_flags, + int arg_flags) +{ + return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags, + 0, _RET_IP_); +} + + +static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + unsigned long caller_ip) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + ocfs2_dec_holders(lockres, level); + ocfs2_downconvert_on_unlock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (lockres->l_lockdep_map.key != NULL) + rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); +#endif +} + +static int ocfs2_create_new_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int ex, + int local) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + unsigned long flags; + u32 lkm_flags = local ? DLM_LKF_LOCAL : 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ocfs2_lock_create(osb, lockres, level, lkm_flags); +} + +/* Grants us an EX lock on the data and metadata resources, skipping + * the normal cluster directory lookup. Use this ONLY on newly created + * inodes which other nodes can't possibly see, and which haven't been + * hashed in the inode hash yet. This can give us a good performance + * increase as it'll skip the network broadcast normally associated + * with creating a new lock resource. */ +int ocfs2_create_new_inode_locks(struct inode *inode) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + BUG_ON(!ocfs2_inode_is_new(inode)); + + mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); + + /* NOTE: That we don't increment any of the holder counts, nor + * do we add anything to a journal handle. Since this is + * supposed to be a new inode which the cluster doesn't know + * about yet, there is no need to. As far as the LVB handling + * is concerned, this is basically like acquiring an EX lock + * on a resource which has an invalid one -- we'll set it + * valid when we release the EX. */ + + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1); + if (ret) { + mlog_errno(ret); + goto bail; + } + + /* + * We don't want to use DLM_LKF_LOCAL on a meta data lock as they + * don't use a generation in their lock names. 
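+ *
+ * (Added for clarity: the rw lock above was created at EX with
+ * DLM_LKF_LOCAL, the inode/metadata lock below is created at EX
+ * without DLM_LKF_LOCAL, and the open lock is created at PR.)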
+ */ + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); + if (ret) { + mlog_errno(ret); + goto bail; + } + +bail: + return ret; +} + +int ocfs2_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog(0, "inode %llu take %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, + 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_rw_unlock(struct inode *inode, int write) +{ + int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu drop %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); +} + +/* + * ocfs2_open_lock always get PR mode lock. + */ +int ocfs2_open_lock(struct inode *inode) +{ + int status = 0; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog(0, "inode %llu take PRMODE open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + DLM_LOCK_PR, 0, 0); + if (status < 0) + mlog_errno(status); + +out: + return status; +} + +int ocfs2_try_open_lock(struct inode *inode, int write) +{ + int status = 0, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog(0, "inode %llu try to take %s open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + /* + * The file system may already holding a PRMODE/EXMODE open lock. + * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on + * other nodes and the -EAGAIN will indicate to the caller that + * this inode is still in use. + */ + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + level, DLM_LKF_NOQUEUE, 0); + +out: + return status; +} + +/* + * ocfs2_open_unlock unlock PR and EX mode open locks. 
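+ *
+ * (Illustrative usage, not part of the original comment: a caller
+ * that wants to know whether some other node still has this inode
+ * open can do roughly
+ *
+ *	if (ocfs2_try_open_lock(inode, 1) == -EAGAIN)
+ *		... another node still holds an open lock ...
+ *
+ * because the DLM_LKF_NOQUEUE request above fails instead of
+ * blocking. ocfs2_open_unlock() below then has to drop whichever of
+ * the PR or EX holds were taken.)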
+ */ +void ocfs2_open_unlock(struct inode *inode) +{ + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu drop open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + if(lockres->l_ro_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + DLM_LOCK_PR); + if(lockres->l_ex_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + DLM_LOCK_EX); + +out: + return; +} + +static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres, + int level) +{ + int ret; + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + unsigned long flags; + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + +retry_cancel: + spin_lock_irqsave(&lockres->l_lock, flags); + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + ret = ocfs2_prepare_cancel_convert(osb, lockres); + if (ret) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + goto retry_cancel; + } + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_wait_for_mask(&mw); + goto retry_cancel; + } + + ret = -ERESTARTSYS; + /* + * We may still have gotten the lock, in which case there's no + * point to restarting the syscall. + */ + if (lockres->l_level == level) + ret = 0; + + mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret, + lockres->l_flags, lockres->l_level, lockres->l_action); + + spin_unlock_irqrestore(&lockres->l_lock, flags); + +out: + return ret; +} + +/* + * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of + * flock() calls. The locking approach this requires is sufficiently + * different from all other cluster lock types that we implement a + * separate path to the "low-level" dlm calls. In particular: + * + * - No optimization of lock levels is done - we take at exactly + * what's been requested. + * + * - No lock caching is employed. We immediately downconvert to + * no-lock at unlock time. This also means flock locks never go on + * the blocking list). + * + * - Since userspace can trivially deadlock itself with flock, we make + * sure to allow cancellation of a misbehaving applications flock() + * request. + * + * - Access to any flock lockres doesn't require concurrency, so we + * can simplify the code by requiring the caller to guarantee + * serialization of dlmglue flock calls. + */ +int ocfs2_file_lock(struct file *file, int ex, int trylock) +{ + int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + unsigned int lkm_flags = trylock ? 
DLM_LKF_NOQUEUE : 0; + unsigned long flags; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb); + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + + if ((lockres->l_flags & OCFS2_LOCK_BUSY) || + (lockres->l_level > DLM_LOCK_NL)) { + mlog(ML_ERROR, + "File lock \"%s\" has busy or locked state: flags: 0x%lx, " + "level: %u\n", lockres->l_name, lockres->l_flags, + lockres->l_level); + return -EINVAL; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + /* + * Get the lock at NLMODE to start - that way we + * can cancel the upconvert request if need be. + */ + ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_wait_for_mask(&mw); + if (ret) { + mlog_errno(ret); + goto out; + } + spin_lock_irqsave(&lockres->l_lock, flags); + } + + lockres->l_action = OCFS2_AST_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, + lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1); + if (ret) { + if (!trylock || (ret != -EAGAIN)) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); + ret = -EINVAL; + } + + ocfs2_recover_from_dlm_error(lockres, 1); + lockres_remove_mask_waiter(lockres, &mw); + goto out; + } + + ret = ocfs2_wait_for_mask_interruptible(&mw, lockres); + if (ret == -ERESTARTSYS) { + /* + * Userspace can cause deadlock itself with + * flock(). Current behavior locally is to allow the + * deadlock, but abort the system call if a signal is + * received. We follow this example, otherwise a + * poorly written program could sit in kernel until + * reboot. + * + * Handling this is a bit more complicated for Ocfs2 + * though. We can't exit this function with an + * outstanding lock request, so a cancel convert is + * required. We intentionally overwrite 'ret' - if the + * cancel fails and the lock was granted, it's easier + * to just bubble success back up to the user. + */ + ret = ocfs2_flock_handle_signal(lockres, level); + } else if (!ret && (level > lockres->l_level)) { + /* Trylock failed asynchronously */ + BUG_ON(!trylock); + ret = -EAGAIN; + } + +out: + + mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n", + lockres->l_name, ex, trylock, ret); + return ret; +} + +void ocfs2_file_unlock(struct file *file) +{ + int ret; + unsigned int gen; + unsigned long flags; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb); + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) + return; + + if (lockres->l_level == DLM_LOCK_NL) + return; + + mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", + lockres->l_name, lockres->l_flags, lockres->l_level, + lockres->l_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + /* + * Fake a blocking ast for the downconvert code. 
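+ *
+ * Setting OCFS2_LOCK_BLOCKED with l_blocking = DLM_LOCK_EX below
+ * makes the normal downconvert path treat this as if a remote node
+ * had asked for EX, so ocfs2_prepare_downconvert() and
+ * ocfs2_downconvert_lock() take us straight down to DLM_LOCK_NL.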
+ */ + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + lockres->l_blocking = DLM_LOCK_EX; + + gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL); + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen); + if (ret) { + mlog_errno(ret); + return; + } + + ret = ocfs2_wait_for_mask(&mw); + if (ret) + mlog_errno(ret); +} + +static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int kick = 0; + + /* If we know that another node is waiting on our lock, kick + * the downconvert thread * pre-emptively when we reach a release + * condition. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { + switch(lockres->l_blocking) { + case DLM_LOCK_EX: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + kick = 1; + break; + case DLM_LOCK_PR: + if (!lockres->l_ex_holders) + kick = 1; + break; + default: + BUG(); + } + } + + if (kick) + ocfs2_wake_downconvert_thread(osb); +} + +#define OCFS2_SEC_BITS 34 +#define OCFS2_SEC_SHIFT (64 - 34) +#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) + +/* LVB only has room for 64 bits of time here so we pack it for + * now. */ +static u64 ocfs2_pack_timespec(struct timespec *spec) +{ + u64 res; + u64 sec = spec->tv_sec; + u32 nsec = spec->tv_nsec; + + res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); + + return res; +} + +/* Call this with the lockres locked. I am reasonably sure we don't + * need ip_lock in this function as anyone who would be changing those + * values is supposed to be blocked in ocfs2_inode_lock right now. */ +static void __ocfs2_stuff_meta_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; + struct ocfs2_meta_lvb *lvb; + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + /* + * Invalidate the LVB of a deleted inode - this way other + * nodes are forced to go to disk and discover the new inode + * status. + */ + if (oi->ip_flags & OCFS2_INODE_DELETED) { + lvb->lvb_version = 0; + goto out; + } + + lvb->lvb_version = OCFS2_LVB_VERSION; + lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); + lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); + lvb->lvb_iuid = cpu_to_be32(inode->i_uid); + lvb->lvb_igid = cpu_to_be32(inode->i_gid); + lvb->lvb_imode = cpu_to_be16(inode->i_mode); + lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + lvb->lvb_iatime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); + lvb->lvb_ictime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + lvb->lvb_imtime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); + lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); + lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); + +out: + mlog_meta_lvb(0, lockres); +} + +static void ocfs2_unpack_timespec(struct timespec *spec, + u64 packed_time) +{ + spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; + spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; +} + +static void ocfs2_refresh_inode_from_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; + struct ocfs2_meta_lvb *lvb; + + mlog_meta_lvb(0, lockres); + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + /* We're safe here without the lockres lock... 
*/ + spin_lock(&oi->ip_lock); + oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters); + i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); + + oi->ip_attr = be32_to_cpu(lvb->lvb_iattr); + oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures); + ocfs2_set_inode_flags(inode); + + /* fast-symlinks are a special case */ + if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_inode_sector_count(inode); + + inode->i_uid = be32_to_cpu(lvb->lvb_iuid); + inode->i_gid = be32_to_cpu(lvb->lvb_igid); + inode->i_mode = be16_to_cpu(lvb->lvb_imode); + inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); + ocfs2_unpack_timespec(&inode->i_atime, + be64_to_cpu(lvb->lvb_iatime_packed)); + ocfs2_unpack_timespec(&inode->i_mtime, + be64_to_cpu(lvb->lvb_imtime_packed)); + ocfs2_unpack_timespec(&inode->i_ctime, + be64_to_cpu(lvb->lvb_ictime_packed)); + spin_unlock(&oi->ip_lock); +} + +static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) + && lvb->lvb_version == OCFS2_LVB_VERSION + && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) + return 1; + return 0; +} + +/* Determine whether a lock resource needs to be refreshed, and + * arbitrate who gets to refresh it. + * + * 0 means no refresh needed. + * + * > 0 means you need to refresh this and you MUST call + * ocfs2_complete_lock_res_refresh afterwards. */ +static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres) +{ + unsigned long flags; + int status = 0; + +refresh_check: + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_wait_on_refreshing_lock(lockres); + goto refresh_check; + } + + /* Ok, I'll be the one to refresh this lock. */ + lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = 1; +bail: + mlog(0, "status %d\n", status); + return status; +} + +/* If status is non zero, I'll mark it as not being in refresh + * anymroe, but i won't clear the needs refresh flag. */ +static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres, + int status) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); + if (!status) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); +} + +/* may or may not return a bh if it went to disk. */ +static int ocfs2_inode_lock_update(struct inode *inode, + struct buffer_head **bh) +{ + int status = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; + struct ocfs2_dinode *fe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_mount_local(osb)) + goto bail; + + spin_lock(&oi->ip_lock); + if (oi->ip_flags & OCFS2_INODE_DELETED) { + mlog(0, "Orphaned inode %llu was deleted while we " + "were waiting on a lock. 
ip_flags = 0x%x\n", + (unsigned long long)oi->ip_blkno, oi->ip_flags); + spin_unlock(&oi->ip_lock); + status = -ENOENT; + goto bail; + } + spin_unlock(&oi->ip_lock); + + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + + /* This will discard any caching information we might have had + * for the inode metadata. */ + ocfs2_metadata_cache_purge(INODE_CACHE(inode)); + + ocfs2_extent_map_trunc(inode, 0); + + if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { + mlog(0, "Trusting LVB on inode %llu\n", + (unsigned long long)oi->ip_blkno); + ocfs2_refresh_inode_from_lvb(inode); + } else { + /* Boo, we have to go to disk. */ + /* read bh, cast, ocfs2_refresh_inode */ + status = ocfs2_read_inode_block(inode, bh); + if (status < 0) { + mlog_errno(status); + goto bail_refresh; + } + fe = (struct ocfs2_dinode *) (*bh)->b_data; + + /* This is a good chance to make sure we're not + * locking an invalid object. ocfs2_read_inode_block() + * already checked that the inode block is sane. + * + * We bug on a stale inode here because we checked + * above whether it was wiped from disk. The wiping + * node provides a guarantee that we receive that + * message and can mark the inode before dropping any + * locks associated with it. */ + mlog_bug_on_msg(inode->i_generation != + le32_to_cpu(fe->i_generation), + "Invalid dinode %llu disk generation: %u " + "inode->i_generation: %u\n", + (unsigned long long)oi->ip_blkno, + le32_to_cpu(fe->i_generation), + inode->i_generation); + mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), + "Stale dinode %llu dtime: %llu flags: 0x%x\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_dtime), + le32_to_cpu(fe->i_flags)); + + ocfs2_refresh_inode(inode, fe); + ocfs2_track_lock_refresh(lockres); + } + + status = 0; +bail_refresh: + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + return status; +} + +static int ocfs2_assign_bh(struct inode *inode, + struct buffer_head **ret_bh, + struct buffer_head *passed_bh) +{ + int status; + + if (passed_bh) { + /* Ok, the update went to disk for us, use the + * returned bh. */ + *ret_bh = passed_bh; + get_bh(*ret_bh); + + return 0; + } + + status = ocfs2_read_inode_block(inode, ret_bh); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * returns < 0 error if the callback will never be called, otherwise + * the result of the lock will be communicated via the callback. + */ +int ocfs2_inode_lock_full_nested(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + int arg_flags, + int subclass) +{ + int status, level, acquired; + u32 dlm_flags; + struct ocfs2_lock_res *lockres = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *local_bh = NULL; + + BUG_ON(!inode); + + mlog(0, "inode %llu, take %s META lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + status = 0; + acquired = 0; + /* We'll allow faking a readonly metadata lock for + * rodevices. */ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto bail; + } + + if (ocfs2_mount_local(osb)) + goto local; + + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + ocfs2_wait_for_recovery(osb); + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + level = ex ? 
DLM_LOCK_EX : DLM_LOCK_PR; + dlm_flags = 0; + if (arg_flags & OCFS2_META_LOCK_NOQUEUE) + dlm_flags |= DLM_LKF_NOQUEUE; + + status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, + arg_flags, subclass, _RET_IP_); + if (status < 0) { + if (status != -EAGAIN && status != -EIOCBRETRY) + mlog_errno(status); + goto bail; + } + + /* Notify the error cleanup path to drop the cluster lock. */ + acquired = 1; + + /* We wait twice because a node may have died while we were in + * the lower dlm layers. The second time though, we've + * committed to owning this lock so we don't allow signals to + * abort the operation. */ + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + ocfs2_wait_for_recovery(osb); + +local: + /* + * We only see this flag if we're being called from + * ocfs2_read_locked_inode(). It means we're locking an inode + * which hasn't been populated yet, so clear the refresh flag + * and let the caller handle it. + */ + if (inode->i_state & I_NEW) { + status = 0; + if (lockres) + ocfs2_complete_lock_res_refresh(lockres, 0); + goto bail; + } + + /* This is fun. The caller may want a bh back, or it may + * not. ocfs2_inode_lock_update definitely wants one in, but + * may or may not read one, depending on what's in the + * LVB. The result of all of this is that we've *only* gone to + * disk if we have to, so the complexity is worthwhile. */ + status = ocfs2_inode_lock_update(inode, &local_bh); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + if (ret_bh) { + status = ocfs2_assign_bh(inode, ret_bh, local_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (status < 0) { + if (ret_bh && (*ret_bh)) { + brelse(*ret_bh); + *ret_bh = NULL; + } + if (acquired) + ocfs2_inode_unlock(inode, ex); + } + + if (local_bh) + brelse(local_bh); + + return status; +} + +/* + * This is working around a lock inversion between tasks acquiring DLM + * locks while holding a page lock and the downconvert thread which + * blocks dlm lock acquiry while acquiring page locks. + * + * ** These _with_page variantes are only intended to be called from aop + * methods that hold page locks and return a very specific *positive* error + * code that aop methods pass up to the VFS -- test for errors with != 0. ** + * + * The DLM is called such that it returns -EAGAIN if it would have + * blocked waiting for the downconvert thread. In that case we unlock + * our page so the downconvert thread can make progress. Once we've + * done this we have to return AOP_TRUNCATED_PAGE so the aop method + * that called us can bubble that back up into the VFS who will then + * immediately retry the aop call. + * + * We do a blocking lock and immediate unlock before returning, though, so that + * the lock has a great chance of being cached on this node by the time the VFS + * calls back to retry the aop. This has a potential to livelock as nodes + * ping locks back and forth, but that's a risk we're willing to take to avoid + * the lock inversion simply. 
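+ *
+ * (Illustrative aop-side pattern, not from the original comment:
+ *
+ *	ret = ocfs2_inode_lock_with_page(inode, &di_bh, 0, page);
+ *	if (ret == AOP_TRUNCATED_PAGE)
+ *		return ret;	-- the VFS re-locks the page and retries
+ *	else if (ret)
+ *		-- a real error --
+ *
+ * "di_bh" here is just a hypothetical buffer_head pointer owned by
+ * the caller.)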
+ */ +int ocfs2_inode_lock_with_page(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct page *page) +{ + int ret; + + ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); + if (ret == -EAGAIN) { + unlock_page(page); + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) + ocfs2_inode_unlock(inode, ex); + ret = AOP_TRUNCATED_PAGE; + } + + return ret; +} + +int ocfs2_inode_lock_atime(struct inode *inode, + struct vfsmount *vfsmnt, + int *level) +{ + int ret; + + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* + * If we should update atime, we will get EX lock, + * otherwise we just get PR lock. + */ + if (ocfs2_should_update_atime(inode, vfsmnt)) { + struct buffer_head *bh = NULL; + + ocfs2_inode_unlock(inode, 0); + ret = ocfs2_inode_lock(inode, &bh, 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + *level = 1; + if (ocfs2_should_update_atime(inode, vfsmnt)) + ocfs2_update_inode_atime(inode, bh); + if (bh) + brelse(bh); + } else + *level = 0; + + return ret; +} + +void ocfs2_inode_unlock(struct inode *inode, + int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu drop %s META lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + !ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); +} + +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + int status = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &osb->osb_orphan_scan.os_lockres; + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0); + if (status < 0) + return status; + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION) + *seqno = be32_to_cpu(lvb->lvb_os_seqno); + else + *seqno = osb->osb_orphan_scan.os_seqno + 1; + + return status; +} + +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) { + lockres = &osb->osb_orphan_scan.os_lockres; + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION; + lvb->lvb_os_seqno = cpu_to_be32(seqno); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); + } +} + +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex) +{ + int status = 0; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + goto bail; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* The super block lock path is really in the best position to + * know when resources covered by the lock need to be + * refreshed, so we do it here. 
Of course, making sense of + * everything is up to the caller :) */ + status = ocfs2_should_refresh_lock_res(lockres); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (status) { + status = ocfs2_refresh_slot_info(osb); + + ocfs2_complete_lock_res_refresh(lockres, status); + + if (status < 0) + mlog_errno(status); + ocfs2_track_lock_refresh(lockres); + } +bail: + return status; +} + +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + +int ocfs2_rename_lock(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_rename_unlock(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); +} + +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE, + 0, 0); + if (status < 0) + mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status); + + return status; +} + +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) +{ + struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, + ex ? LKM_EXMODE : LKM_PRMODE); +} + +int ocfs2_dentry_lock(struct dentry *dentry, int ex) +{ + int ret; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + BUG_ON(!dl); + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0); + if (ret < 0) + mlog_errno(ret); + + return ret; +} + +void ocfs2_dentry_unlock(struct dentry *dentry, int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); +} + +/* Reference counting of the dlm debug structure. We want this because + * open references on the debug inodes can live on after a mount, so + * we can't rely on the ocfs2_super to always exist. 
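+ * (ocfs2_get_dlm_debug()/ocfs2_put_dlm_debug() below are thin
+ * kref_get()/kref_put() wrappers, with ocfs2_dlm_debug_free() as the
+ * kref release function, so the structure lives until the last
+ * debugfs reference goes away.)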
*/ +static void ocfs2_dlm_debug_free(struct kref *kref) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt); + + kfree(dlm_debug); +} + +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug) +{ + if (dlm_debug) + kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free); +} + +static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug) +{ + kref_get(&debug->d_refcnt); +} + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL); + if (!dlm_debug) { + mlog_errno(-ENOMEM); + goto out; + } + + kref_init(&dlm_debug->d_refcnt); + INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); + dlm_debug->d_locking_state = NULL; +out: + return dlm_debug; +} + +/* Access to this is arbitrated for us via seq_file->sem. */ +struct ocfs2_dlm_seq_priv { + struct ocfs2_dlm_debug *p_dlm_debug; + struct ocfs2_lock_res p_iter_res; + struct ocfs2_lock_res p_tmp_res; +}; + +static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start, + struct ocfs2_dlm_seq_priv *priv) +{ + struct ocfs2_lock_res *iter, *ret = NULL; + struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug; + + assert_spin_locked(&ocfs2_dlm_tracking_lock); + + list_for_each_entry(iter, &start->l_debug_list, l_debug_list) { + /* discover the head of the list */ + if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) { + mlog(0, "End of list found, %p\n", ret); + break; + } + + /* We track our "dummy" iteration lockres' by a NULL + * l_ops field. */ + if (iter->l_ops != NULL) { + ret = iter; + break; + } + } + + return ret; +} + +static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv); + if (iter) { + /* Since lockres' have the lifetime of their container + * (which can be inodes, ocfs2_supers, etc) we want to + * copy this out to a temporary lockres while still + * under the spinlock. Obviously after this we can't + * trust any pointers on the copy returned, but that's + * ok as the information we want isn't typically held + * in them. 
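+ *
+ * (How the iteration works, for reference: the dummy p_iter_res has
+ * l_ops == NULL, which is how ocfs2_dlm_next_res() above skips it,
+ * and ocfs2_dlm_seq_next() re-links the dummy right after the entry
+ * it just returned so that the next call resumes from there.)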
*/ + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v) +{ +} + +static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter = v; + struct ocfs2_lock_res *dummy = &priv->p_iter_res; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(iter, priv); + list_del_init(&dummy->l_debug_list); + if (iter) { + list_add(&dummy->l_debug_list, &iter->l_debug_list); + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +/* + * Version is used by debugfs.ocfs2 to determine the format being used + * + * New in version 2 + * - Lock stats printed + * New in version 3 + * - Max time in lock stats is in usecs (instead of nsecs) + */ +#define OCFS2_DLM_DEBUG_STR_VERSION 3 +static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) +{ + int i; + char *lvb; + struct ocfs2_lock_res *lockres = v; + + if (!lockres) + return -EINVAL; + + seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION); + + if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY) + seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1, + lockres->l_name, + (unsigned int)ocfs2_get_dentry_lock_ino(lockres)); + else + seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name); + + seq_printf(m, "%d\t" + "0x%lx\t" + "0x%x\t" + "0x%x\t" + "%u\t" + "%u\t" + "%d\t" + "%d\t", + lockres->l_level, + lockres->l_flags, + lockres->l_action, + lockres->l_unlock_action, + lockres->l_ro_holders, + lockres->l_ex_holders, + lockres->l_requested, + lockres->l_blocking); + + /* Dump the raw LVB */ + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + for(i = 0; i < DLM_LVB_LEN; i++) + seq_printf(m, "0x%x\t", lvb[i]); + +#ifdef CONFIG_OCFS2_FS_STATS +# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets) +# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets) +# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail) +# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail) +# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total) +# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total) +# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max) +# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max) +# define lock_refresh(_l) ((_l)->l_lock_refresh) +#else +# define lock_num_prmode(_l) (0) +# define lock_num_exmode(_l) (0) +# define lock_num_prmode_failed(_l) (0) +# define lock_num_exmode_failed(_l) (0) +# define lock_total_prmode(_l) (0ULL) +# define lock_total_exmode(_l) (0ULL) +# define lock_max_prmode(_l) (0) +# define lock_max_exmode(_l) (0) +# define lock_refresh(_l) (0) +#endif + /* The following seq_print was added in version 2 of this output */ + seq_printf(m, "%u\t" + "%u\t" + "%u\t" + "%u\t" + "%llu\t" + "%llu\t" + "%u\t" + "%u\t" + "%u\t", + lock_num_prmode(lockres), + lock_num_exmode(lockres), + lock_num_prmode_failed(lockres), + lock_num_exmode_failed(lockres), + lock_total_prmode(lockres), + lock_total_exmode(lockres), + lock_max_prmode(lockres), + lock_max_exmode(lockres), + lock_refresh(lockres)); + + /* End the line */ + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations ocfs2_dlm_seq_ops = { + .start = ocfs2_dlm_seq_start, + .stop = ocfs2_dlm_seq_stop, + .next = ocfs2_dlm_seq_next, + .show = ocfs2_dlm_seq_show, +}; + +static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) +{ 
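+	/* Undo ocfs2_dlm_debug_open(): pull the dummy iteration lockres
+	 * back off the tracking list and drop our reference on the dlm
+	 * debug structure before the seq_file private data is freed. */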
+ struct seq_file *seq = file->private_data; + struct ocfs2_dlm_seq_priv *priv = seq->private; + struct ocfs2_lock_res *res = &priv->p_iter_res; + + ocfs2_remove_lockres_tracking(res); + ocfs2_put_dlm_debug(priv->p_dlm_debug); + return seq_release_private(inode, file); +} + +static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) +{ + int ret; + struct ocfs2_dlm_seq_priv *priv; + struct seq_file *seq; + struct ocfs2_super *osb; + + priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + osb = inode->i_private; + ocfs2_get_dlm_debug(osb->osb_dlm_debug); + priv->p_dlm_debug = osb->osb_dlm_debug; + INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); + + ret = seq_open(file, &ocfs2_dlm_seq_ops); + if (ret) { + kfree(priv); + mlog_errno(ret); + goto out; + } + + seq = file->private_data; + seq->private = priv; + + ocfs2_add_lockres_tracking(&priv->p_iter_res, + priv->p_dlm_debug); + +out: + return ret; +} + +static const struct file_operations ocfs2_dlm_debug_fops = { + .open = ocfs2_dlm_debug_open, + .release = ocfs2_dlm_debug_release, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) +{ + int ret = 0; + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + dlm_debug->d_locking_state = debugfs_create_file("locking_state", + S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_dlm_debug_fops); + if (!dlm_debug->d_locking_state) { + ret = -EINVAL; + mlog(ML_ERROR, + "Unable to create locking state debugfs file.\n"); + goto out; + } + + ocfs2_get_dlm_debug(dlm_debug); +out: + return ret; +} + +static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) +{ + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + if (dlm_debug) { + debugfs_remove(dlm_debug->d_locking_state); + ocfs2_put_dlm_debug(dlm_debug); + } +} + +int ocfs2_dlm_init(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_cluster_connection *conn = NULL; + + if (ocfs2_mount_local(osb)) { + osb->node_num = 0; + goto local; + } + + status = ocfs2_dlm_init_debug(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* launch downconvert thread */ + osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); + if (IS_ERR(osb->dc_task)) { + status = PTR_ERR(osb->dc_task); + osb->dc_task = NULL; + mlog_errno(status); + goto bail; + } + + /* for now, uuid == domain */ + status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->uuid_str, + strlen(osb->uuid_str), + &lproto, ocfs2_do_node_down, osb, + &conn); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_cluster_this_node(&osb->node_num); + if (status < 0) { + mlog_errno(status); + mlog(ML_ERROR, + "could not find this host's node number\n"); + ocfs2_cluster_disconnect(conn, 0); + goto bail; + } + +local: + ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); + ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); + ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); + + osb->cconn = conn; + + status = 0; +bail: + if (status < 0) { + ocfs2_dlm_shutdown_debug(osb); + if (osb->dc_task) + kthread_stop(osb->dc_task); + } + + return status; +} + +void ocfs2_dlm_shutdown(struct ocfs2_super *osb, + int hangup_pending) +{ + ocfs2_drop_osb_locks(osb); + + /* + * Now that we have dropped all locks and ocfs2_dismount_volume() + * has disabled recovery, the DLM won't be talking to us. 
It's + * safe to tear things down before disconnecting the cluster. + */ + + if (osb->dc_task) { + kthread_stop(osb->dc_task); + osb->dc_task = NULL; + } + + ocfs2_lock_res_free(&osb->osb_super_lockres); + ocfs2_lock_res_free(&osb->osb_rename_lockres); + ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres); + + ocfs2_cluster_disconnect(osb->cconn, hangup_pending); + osb->cconn = NULL; + + ocfs2_dlm_shutdown_debug(osb); +} + +static int ocfs2_drop_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + unsigned long flags; + u32 lkm_flags = 0; + + /* We didn't get anywhere near actually using this lockres. */ + if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) + goto out; + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lkm_flags |= DLM_LKF_VALBLK; + + spin_lock_irqsave(&lockres->l_lock, flags); + + mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING), + "lockres %s, flags 0x%lx\n", + lockres->l_name, lockres->l_flags); + + while (lockres->l_flags & OCFS2_LOCK_BUSY) { + mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = " + "%u, unlock_action = %u\n", + lockres->l_name, lockres->l_flags, lockres->l_action, + lockres->l_unlock_action); + + spin_unlock_irqrestore(&lockres->l_lock, flags); + + /* XXX: Today we just wait on any busy + * locks... Perhaps we need to cancel converts in the + * future? */ + ocfs2_wait_on_busy_lock(lockres); + + spin_lock_irqsave(&lockres->l_lock, flags); + } + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level == DLM_LOCK_EX && + !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + lockres->l_ops->set_lvb(lockres); + } + + if (lockres->l_flags & OCFS2_LOCK_BUSY) + mlog(ML_ERROR, "destroying busy lock: \"%s\"\n", + lockres->l_name); + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto out; + } + + lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED); + + /* make sure we never get here while waiting for an ast to + * fire. */ + BUG_ON(lockres->l_action != OCFS2_AST_INVALID); + + /* is this necessary? */ + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "lock %s\n", lockres->l_name); + + ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); + mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); + ocfs2_dlm_dump_lksb(&lockres->l_lksb); + BUG(); + } + mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n", + lockres->l_name); + + ocfs2_wait_on_busy_lock(lockres); +out: + return 0; +} + +/* Mark the lockres as being dropped. It will no longer be + * queued if blocking, but we still may have to wait on it + * being dequeued from the downconvert thread before we can consider + * it safe to drop. + * + * You can *not* attempt to call cluster_lock on this lockres anymore. 
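+ *
+ * (The usual pairing, shown by ocfs2_simple_drop_lockres() below, is
+ * ocfs2_mark_lockres_freeing() followed by ocfs2_drop_lock().)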
*/ +void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +{ + int status; + struct ocfs2_mask_waiter mw; + unsigned long flags; + + ocfs2_init_mask_waiter(&mw); + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres->l_flags |= OCFS2_LOCK_FREEING; + while (lockres->l_flags & OCFS2_LOCK_QUEUED) { + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "Waiting on lockres %s\n", lockres->l_name); + + status = ocfs2_wait_for_mask(&mw); + if (status) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ocfs2_mark_lockres_freeing(lockres); + ret = ocfs2_drop_lock(osb, lockres); + if (ret) + mlog_errno(ret); +} + +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) +{ + ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres); +} + +int ocfs2_drop_inode_locks(struct inode *inode) +{ + int status, err; + + /* No need to call ocfs2_mark_lockres_freeing here - + * ocfs2_clear_inode has done it for us. */ + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_open_lockres); + if (err < 0) + mlog_errno(err); + + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_inode_lockres); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_rw_lockres); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + return status; +} + +static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + assert_spin_locked(&lockres->l_lock); + + BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); + + if (lockres->l_level <= new_level) { + mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, " + "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, " + "block %d, pgen %d\n", lockres->l_name, lockres->l_level, + new_level, list_empty(&lockres->l_blocked_list), + list_empty(&lockres->l_mask_waiters), lockres->l_type, + lockres->l_flags, lockres->l_ro_holders, + lockres->l_ex_holders, lockres->l_action, + lockres->l_unlock_action, lockres->l_requested, + lockres->l_blocking, lockres->l_pending_gen); + BUG(); + } + + mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n", + lockres->l_name, lockres->l_level, new_level, lockres->l_blocking); + + lockres->l_action = OCFS2_AST_DOWNCONVERT; + lockres->l_requested = new_level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + return lockres_set_pending(lockres); +} + +static int ocfs2_downconvert_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int new_level, + int lvb, + unsigned int generation) +{ + int ret; + u32 dlm_flags = DLM_LKF_CONVERT; + + mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, + lockres->l_level, new_level); + + if (lvb) + dlm_flags |= DLM_LKF_VALBLK; + + ret = ocfs2_dlm_lock(osb->cconn, + new_level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1); + lockres_clear_pending(lockres, generation, osb); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); + ocfs2_recover_from_dlm_error(lockres, 1); + goto bail; + } + + ret = 0; 
+bail: + return ret; +} + +/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */ +static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + + if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { + /* If we're already trying to cancel a lock conversion + * then just drop the spinlock and allow the caller to + * requeue this lock. */ + mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name); + return 0; + } + + /* were we in a convert when we got the bast fire? */ + BUG_ON(lockres->l_action != OCFS2_AST_CONVERT && + lockres->l_action != OCFS2_AST_DOWNCONVERT); + /* set things up for the unlockast to know to just + * clear out the ast_action and unset busy, etc. */ + lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT; + + mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY), + "lock %s, invalid flags: 0x%lx\n", + lockres->l_name, lockres->l_flags); + + mlog(ML_BASTS, "lockres %s\n", lockres->l_name); + + return 1; +} + +static int ocfs2_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, + DLM_LKF_CANCEL); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); + ocfs2_recover_from_dlm_error(lockres, 0); + } + + mlog(ML_BASTS, "lockres %s\n", lockres->l_name); + + return ret; +} + +static int ocfs2_unblock_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + struct ocfs2_unblock_ctl *ctl) +{ + unsigned long flags; + int blocking; + int new_level; + int level; + int ret = 0; + int set_lvb = 0; + unsigned int gen; + + spin_lock_irqsave(&lockres->l_lock, flags); + +recheck: + /* + * Is it still blocking? If not, we have no more work to do. + */ + if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) { + BUG_ON(lockres->l_blocking != DLM_LOCK_NL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = 0; + goto leave; + } + + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + /* XXX + * This is a *big* race. The OCFS2_LOCK_PENDING flag + * exists entirely for one reason - another thread has set + * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock(). + * + * If we do ocfs2_cancel_convert() before the other thread + * calls dlm_lock(), our cancel will do nothing. We will + * get no ast, and we will have no way of knowing the + * cancel failed. Meanwhile, the other thread will call + * into dlm_lock() and wait...forever. + * + * Why forever? Because another node has asked for the + * lock first; that's why we're here in unblock_lock(). + * + * The solution is OCFS2_LOCK_PENDING. When PENDING is + * set, we just requeue the unblock. Only when the other + * thread has called dlm_lock() and cleared PENDING will + * we then cancel their request. + * + * All callers of dlm_lock() must set OCFS2_DLM_PENDING + * at the same time they set OCFS2_DLM_BUSY. They must + * clear OCFS2_DLM_PENDING after dlm_lock() returns. + */ + if (lockres->l_flags & OCFS2_LOCK_PENDING) { + mlog(ML_BASTS, "lockres %s, ReQ: Pending\n", + lockres->l_name); + goto leave_requeue; + } + + ctl->requeue = 1; + ret = ocfs2_prepare_cancel_convert(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (ret) { + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) + mlog_errno(ret); + } + goto leave; + } + + /* + * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is + * set when the ast is received for an upconvert just before the + * OCFS2_LOCK_BUSY flag is cleared. 
Now if the fs received a bast + * on the heels of the ast, we want to delay the downconvert just + * enough to allow the up requestor to do its task. Because this + * lock is in the blocked queue, the lock will be downconverted + * as soon as the requestor is done with the lock. + */ + if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) + goto leave_requeue; + + /* + * How can we block and yet be at NL? We were trying to upconvert + * from NL and got canceled. The code comes back here, and now + * we notice and clear BLOCKING. + */ + if (lockres->l_level == DLM_LOCK_NL) { + BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders); + mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name); + lockres->l_blocking = DLM_LOCK_NL; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto leave; + } + + /* if we're blocking an exclusive and we have *any* holders, + * then requeue. */ + if ((lockres->l_blocking == DLM_LOCK_EX) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n", + lockres->l_name, lockres->l_ex_holders, + lockres->l_ro_holders); + goto leave_requeue; + } + + /* If it's a PR we're blocking, then only + * requeue if we've got any EX holders */ + if (lockres->l_blocking == DLM_LOCK_PR && + lockres->l_ex_holders) { + mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n", + lockres->l_name, lockres->l_ex_holders); + goto leave_requeue; + } + + /* + * Can we get a lock in this state if the holder counts are + * zero? The meta data unblock code used to check this. + */ + if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) { + mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n", + lockres->l_name); + goto leave_requeue; + } + + new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); + + if (lockres->l_ops->check_downconvert + && !lockres->l_ops->check_downconvert(lockres, new_level)) { + mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n", + lockres->l_name); + goto leave_requeue; + } + + /* If we get here, then we know that there are no more + * incompatible holders (and anyone asking for an incompatible + * lock is blocked). We can now downconvert the lock */ + if (!lockres->l_ops->downconvert_worker) + goto downconvert; + + /* Some lockres types want to do a bit of work before + * downconverting a lock. Allow that here. The worker function + * may sleep, so we save off a copy of what we're blocking as + * it may change while we're not holding the spin lock. */ + blocking = lockres->l_blocking; + level = lockres->l_level; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); + + if (ctl->unblock_action == UNBLOCK_STOP_POST) { + mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n", + lockres->l_name); + goto leave; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) { + /* If this changed underneath us, then we can't drop + * it just yet. */ + mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, " + "Recheck\n", lockres->l_name, blocking, + lockres->l_blocking, level, lockres->l_level); + goto recheck; + } + +downconvert: + ctl->requeue = 0; + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { + if (lockres->l_level == DLM_LOCK_EX) + set_lvb = 1; + + /* + * We only set the lvb if the lock has been fully + * refreshed - otherwise we risk setting stale + * data. 
Otherwise, there's no need to actually clear + * out the lvb here as it's value is still valid. + */ + if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + lockres->l_ops->set_lvb(lockres); + } + + gen = ocfs2_prepare_downconvert(lockres, new_level); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb, + gen); + +leave: + if (ret) + mlog_errno(ret); + return ret; + +leave_requeue: + spin_unlock_irqrestore(&lockres->l_lock, flags); + ctl->requeue = 1; + + return 0; +} + +static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct inode *inode; + struct address_space *mapping; + struct ocfs2_inode_info *oi; + + inode = ocfs2_lock_res_inode(lockres); + mapping = inode->i_mapping; + + if (S_ISDIR(inode->i_mode)) { + oi = OCFS2_I(inode); + oi->ip_dir_lock_gen++; + mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); + goto out; + } + + if (!S_ISREG(inode->i_mode)) + goto out; + + /* + * We need this before the filemap_fdatawrite() so that it can + * transfer the dirty bit from the PTE to the + * page. Unfortunately this means that even for EX->PR + * downconverts, we'll lose our mappings and have to build + * them up again. + */ + unmap_mapping_range(mapping, 0, 0, 0); + + if (filemap_fdatawrite(mapping)) { + mlog(ML_ERROR, "Could not sync inode %llu for downconvert!", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + sync_mapping_buffers(mapping); + if (blocking == DLM_LOCK_EX) { + truncate_inode_pages(mapping, 0); + } else { + /* We only need to wait on the I/O if we're not also + * truncating pages because truncate_inode_pages waits + * for us above. We don't truncate pages if we're + * blocking anything < EXMODE because we want to keep + * them around in that case. */ + filemap_fdatawait(mapping); + } + +out: + return UNBLOCK_CONTINUE; +} + +static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci, + struct ocfs2_lock_res *lockres, + int new_level) +{ + int checkpointed = ocfs2_ci_fully_checkpointed(ci); + + BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); + BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); + + if (checkpointed) + return 1; + + ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci))); + return 0; +} + +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level); +} + +static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + __ocfs2_stuff_meta_lvb(inode); +} + +/* + * Does the final reference drop on our dentry lock. Right now this + * happens in the downconvert thread, but we could choose to simplify the + * dlmglue API and push these off to the ocfs2_wq in the future. + */ +static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres); + ocfs2_dentry_lock_put(osb, dl); +} + +/* + * d_delete() matching dentries before the lock downconvert. + * + * At this point, any process waiting to destroy the + * dentry_lock due to last ref count is stopped by the + * OCFS2_LOCK_QUEUED flag. + * + * We have two potential problems + * + * 1) If we do the last reference drop on our dentry_lock (via dput) + * we'll wind up in ocfs2_release_dentry_lock(), waiting on + * the downconvert to finish. 
Instead we take an elevated + * reference and push the drop until after we've completed our + * unblock processing. + * + * 2) There might be another process with a final reference, + * waiting on us to finish processing. If this is the case, we + * detect it and exit out - there's no more dentries anyway. + */ +static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres); + struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode); + struct dentry *dentry; + unsigned long flags; + int extra_ref = 0; + + /* + * This node is blocking another node from getting a read + * lock. This happens when we've renamed within a + * directory. We've forced the other nodes to d_delete(), but + * we never actually dropped our lock because it's still + * valid. The downconvert code will retain a PR for this node, + * so there's no further work to do. + */ + if (blocking == DLM_LOCK_PR) + return UNBLOCK_CONTINUE; + + /* + * Mark this inode as potentially orphaned. The code in + * ocfs2_delete_inode() will figure out whether it actually + * needs to be freed or not. + */ + spin_lock(&oi->ip_lock); + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + + /* + * Yuck. We need to make sure however that the check of + * OCFS2_LOCK_FREEING and the extra reference are atomic with + * respect to a reference decrement or the setting of that + * flag. + */ + spin_lock_irqsave(&lockres->l_lock, flags); + spin_lock(&dentry_attach_lock); + if (!(lockres->l_flags & OCFS2_LOCK_FREEING) + && dl->dl_count) { + dl->dl_count++; + extra_ref = 1; + } + spin_unlock(&dentry_attach_lock); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "extra_ref = %d\n", extra_ref); + + /* + * We have a process waiting on us in ocfs2_dentry_iput(), + * which means we can't have any more outstanding + * aliases. There's no need to do any more work. + */ + if (!extra_ref) + return UNBLOCK_CONTINUE; + + spin_lock(&dentry_attach_lock); + while (1) { + dentry = ocfs2_find_local_alias(dl->dl_inode, + dl->dl_parent_blkno, 1); + if (!dentry) + break; + spin_unlock(&dentry_attach_lock); + + mlog(0, "d_delete(%.*s);\n", dentry->d_name.len, + dentry->d_name.name); + + /* + * The following dcache calls may do an + * iput(). Normally we don't want that from the + * downconverting thread, but in this case it's ok + * because the requesting node already has an + * exclusive lock on the inode, so it can't be queued + * for a downconvert. + */ + d_delete(dentry); + dput(dentry); + + spin_lock(&dentry_attach_lock); + } + spin_unlock(&dentry_attach_lock); + + /* + * If we are the last holder of this dentry lock, there is no + * reason to downconvert so skip straight to the unlock. 
+ */ + if (dl->dl_count == 1) + return UNBLOCK_STOP_POST; + + return UNBLOCK_CONTINUE_POST; +} + +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level); +} + +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + ocfs2_metadata_cache_purge(&tree->rf_ci); + + return UNBLOCK_CONTINUE; +} + +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_qinfo_lvb *lvb; + struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres); + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; + lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); + lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace); + lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms); + lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks); + lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk); + lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); +} + +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + +static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) +{ + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + struct buffer_head *bh = NULL; + struct ocfs2_global_disk_dqinfo *gdinfo; + int status = 0; + + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) { + info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace); + info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace); + oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms); + oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks); + oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk); + oinfo->dqi_gi.dqi_free_entry = + be32_to_cpu(lvb->lvb_free_entry); + } else { + status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode, + oinfo->dqi_giblk, &bh); + if (status) { + mlog_errno(status); + goto bail; + } + gdinfo = (struct ocfs2_global_disk_dqinfo *) + (bh->b_data + OCFS2_GLOBAL_INFO_OFF); + info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace); + info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = + le32_to_cpu(gdinfo->dqi_free_entry); + brelse(bh); + ocfs2_track_lock_refresh(lockres); + } + +bail: + return status; +} + +/* Lock quota info, this function expects at least shared lock on the quota file + * so that we can safely refresh quota info from disk. */ +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? 
DLM_LOCK_EX : DLM_LOCK_PR; + int status = 0; + + /* On RO devices, locking really isn't needed... */ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto bail; + } + if (ocfs2_mount_local(osb)) + goto bail; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + /* OK, we have the lock but we need to refresh the quota info */ + status = ocfs2_refresh_qinfo(oinfo); + if (status) + ocfs2_qinfo_unlock(oinfo, ex); + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + return status; +} + +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int status; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int status; + struct ocfs2_unblock_ctl ctl = {0, 0,}; + unsigned long flags; + + /* Our reference to the lockres in this function can be + * considered valid until we remove the OCFS2_LOCK_QUEUED + * flag. */ + + BUG_ON(!lockres); + BUG_ON(!lockres->l_ops); + + mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name); + + /* Detect whether a lock has been marked as going away while + * the downconvert thread was processing other things. A lock can + * still be marked with OCFS2_LOCK_FREEING after this check, + * but short circuiting here will still save us some + * performance. */ + spin_lock_irqsave(&lockres->l_lock, flags); + if (lockres->l_flags & OCFS2_LOCK_FREEING) + goto unqueue; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = ocfs2_unblock_lock(osb, lockres, &ctl); + if (status < 0) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); +unqueue: + if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) { + lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); + } else + ocfs2_schedule_blocked_lock(osb, lockres); + + mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name, + ctl.requeue ? "yes" : "no"); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + if (ctl.unblock_action != UNBLOCK_CONTINUE + && lockres->l_ops->post_unlock) + lockres->l_ops->post_unlock(osb, lockres); +} + +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + + if (lockres->l_flags & OCFS2_LOCK_FREEING) { + /* Do not schedule a lock for downconvert when it's on + * the way to destruction - any nodes wanting access + * to the resource will get it soon. 
*/ + mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n", + lockres->l_name, lockres->l_flags); + return; + } + + lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); + + spin_lock(&osb->dc_task_lock); + if (list_empty(&lockres->l_blocked_list)) { + list_add_tail(&lockres->l_blocked_list, + &osb->blocked_lock_list); + osb->blocked_lock_count++; + } + spin_unlock(&osb->dc_task_lock); +} + +static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) +{ + unsigned long processed; + struct ocfs2_lock_res *lockres; + + spin_lock(&osb->dc_task_lock); + /* grab this early so we know to try again if a state change and + * wake happens part-way through our work */ + osb->dc_work_sequence = osb->dc_wake_sequence; + + processed = osb->blocked_lock_count; + while (processed) { + BUG_ON(list_empty(&osb->blocked_lock_list)); + + lockres = list_entry(osb->blocked_lock_list.next, + struct ocfs2_lock_res, l_blocked_list); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock(&osb->dc_task_lock); + + BUG_ON(!processed); + processed--; + + ocfs2_process_blocked_lock(osb, lockres); + + spin_lock(&osb->dc_task_lock); + } + spin_unlock(&osb->dc_task_lock); +} + +static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb) +{ + int empty = 0; + + spin_lock(&osb->dc_task_lock); + if (list_empty(&osb->blocked_lock_list)) + empty = 1; + + spin_unlock(&osb->dc_task_lock); + return empty; +} + +static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb) +{ + int should_wake = 0; + + spin_lock(&osb->dc_task_lock); + if (osb->dc_work_sequence != osb->dc_wake_sequence) + should_wake = 1; + spin_unlock(&osb->dc_task_lock); + + return should_wake; +} + +static int ocfs2_downconvert_thread(void *arg) +{ + int status = 0; + struct ocfs2_super *osb = arg; + + /* only quit once we've been asked to stop and there is no more + * work available */ + while (!(kthread_should_stop() && + ocfs2_downconvert_thread_lists_empty(osb))) { + + wait_event_interruptible(osb->dc_event, + ocfs2_downconvert_thread_should_wake(osb) || + kthread_should_stop()); + + mlog(0, "downconvert_thread: awoken\n"); + + ocfs2_downconvert_thread_do_work(osb); + } + + osb->dc_task = NULL; + return status; +} + +void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb) +{ + spin_lock(&osb->dc_task_lock); + /* make sure the downconvert thread gets a swipe at whatever changes + * the caller may have made to the lock state */ + osb->dc_wake_sequence++; + spin_unlock(&osb->dc_task_lock); + wake_up(&osb->dc_event); +} diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h new file mode 100644 index 00000000..1d596d8c --- /dev/null +++ b/fs/ocfs2/dlmglue.h @@ -0,0 +1,172 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef DLMGLUE_H +#define DLMGLUE_H + +#include "dcache.h" + +#define OCFS2_LVB_VERSION 5 + +struct ocfs2_meta_lvb { + __u8 lvb_version; + __u8 lvb_reserved0; + __be16 lvb_idynfeatures; + __be32 lvb_iclusters; + __be32 lvb_iuid; + __be32 lvb_igid; + __be64 lvb_iatime_packed; + __be64 lvb_ictime_packed; + __be64 lvb_imtime_packed; + __be64 lvb_isize; + __be16 lvb_imode; + __be16 lvb_inlink; + __be32 lvb_iattr; + __be32 lvb_igeneration; + __be32 lvb_reserved2; +}; + +#define OCFS2_QINFO_LVB_VERSION 1 + +struct ocfs2_qinfo_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_bgrace; + __be32 lvb_igrace; + __be32 lvb_syncms; + __be32 lvb_blocks; + __be32 lvb_free_blk; + __be32 lvb_free_entry; +}; + +#define OCFS2_ORPHAN_LVB_VERSION 1 + +struct ocfs2_orphan_scan_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_os_seqno; +}; + +/* ocfs2_inode_lock_full() 'arg_flags' flags */ +/* don't wait on recovery. */ +#define OCFS2_META_LOCK_RECOVERY (0x01) +/* Instruct the dlm not to queue ourselves on the other node. */ +#define OCFS2_META_LOCK_NOQUEUE (0x02) +/* don't block waiting for the downconvert thread, instead return -EAGAIN */ +#define OCFS2_LOCK_NONBLOCK (0x04) + +/* Locking subclasses of inode cluster lock */ +enum { + OI_LS_NORMAL = 0, + OI_LS_PARENT, + OI_LS_RENAME1, + OI_LS_RENAME2, + OI_LS_REFLINK_TARGET, +}; + +int ocfs2_dlm_init(struct ocfs2_super *osb); +void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending); +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + unsigned int generation, + struct inode *inode); +void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, + u64 parent, struct inode *inode); +struct ocfs2_file_private; +void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_file_private *fp); +struct ocfs2_mem_dqinfo; +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info); +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation); +void ocfs2_lock_res_free(struct ocfs2_lock_res *res); +int ocfs2_create_new_inode_locks(struct inode *inode); +int ocfs2_drop_inode_locks(struct inode *inode); +int ocfs2_rw_lock(struct inode *inode, int write); +void ocfs2_rw_unlock(struct inode *inode, int write); +int ocfs2_open_lock(struct inode *inode); +int ocfs2_try_open_lock(struct inode *inode, int write); +void ocfs2_open_unlock(struct inode *inode); +int ocfs2_inode_lock_atime(struct inode *inode, + struct vfsmount *vfsmnt, + int *level); +int ocfs2_inode_lock_full_nested(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + int arg_flags, + int subclass); +int ocfs2_inode_lock_with_page(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct page *page); +/* Variants without special locking class or flags */ +#define ocfs2_inode_lock_full(i, r, e, f)\ + ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL) +#define ocfs2_inode_lock_nested(i, b, e, s)\ + ocfs2_inode_lock_full_nested(i, b, e, 0, s) +/* 99% of the time we don't want to supply any additional flags -- + * those are for very specific cases only. 
*/ +#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL) +void ocfs2_inode_unlock(struct inode *inode, + int ex); +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex); +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex); +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno); +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno); + +int ocfs2_rename_lock(struct ocfs2_super *osb); +void ocfs2_rename_unlock(struct ocfs2_super *osb); +int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); +void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); +int ocfs2_dentry_lock(struct dentry *dentry, int ex); +void ocfs2_dentry_unlock(struct dentry *dentry, int ex); +int ocfs2_file_lock(struct file *file, int ex, int trylock); +void ocfs2_file_unlock(struct file *file); +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); +struct ocfs2_refcount_tree; +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); + + +void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +/* for the downconvert thread */ +void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb); + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); + +/* To set the locking protocol on module initialization */ +void ocfs2_set_locking_protocol(void); +#endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c new file mode 100644 index 00000000..745db425 --- /dev/null +++ b/fs/ocfs2/export.c @@ -0,0 +1,276 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.c + * + * Functions to facilitate NFS exporting + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "dlmglue.h" +#include "dcache.h" +#include "export.h" +#include "inode.h" + +#include "buffer_head_io.h" +#include "suballoc.h" +#include "ocfs2_trace.h" + +struct ocfs2_inode_handle +{ + u64 ih_blkno; + u32 ih_generation; +}; + +static struct dentry *ocfs2_get_dentry(struct super_block *sb, + struct ocfs2_inode_handle *handle) +{ + struct inode *inode; + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 blkno = handle->ih_blkno; + int status, set; + struct dentry *result; + + trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno); + + if (blkno == 0) { + result = ERR_PTR(-ESTALE); + goto bail; + } + + inode = ocfs2_ilookup(sb, blkno); + /* + * If the inode exists in memory, we only need to check it's + * generation number + */ + if (inode) + goto check_gen; + + /* + * This will synchronize us against ocfs2_delete_inode() on + * all nodes + */ + status = ocfs2_nfs_sync_lock(osb, 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + goto check_err; + } + + status = ocfs2_test_inode_bit(osb, blkno, &set); + trace_ocfs2_get_dentry_test_bit(status, set); + if (status < 0) { + if (status == -EINVAL) { + /* + * The blkno NFS gave us doesn't even show up + * as an inode, we return -ESTALE to be + * nice + */ + status = -ESTALE; + } else + mlog(ML_ERROR, "test inode bit failed %d\n", status); + goto unlock_nfs_sync; + } + + /* If the inode allocator bit is clear, this inode must be stale */ + if (!set) { + status = -ESTALE; + goto unlock_nfs_sync; + } + + inode = ocfs2_iget(osb, blkno, 0, 0); + +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(osb, 1); + +check_err: + if (status < 0) { + if (status == -ESTALE) { + trace_ocfs2_get_dentry_stale((unsigned long long)blkno, + handle->ih_generation); + } + result = ERR_PTR(status); + goto bail; + } + + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + result = (void *)inode; + goto bail; + } + +check_gen: + if (handle->ih_generation != inode->i_generation) { + iput(inode); + trace_ocfs2_get_dentry_generation((unsigned long long)blkno, + handle->ih_generation, + inode->i_generation); + result = ERR_PTR(-ESTALE); + goto bail; + } + + result = d_obtain_alias(inode); + if (IS_ERR(result)) + mlog_errno(PTR_ERR(result)); + +bail: + trace_ocfs2_get_dentry_end(result); + return result; +} + +static struct dentry *ocfs2_get_parent(struct dentry *child) +{ + int status; + u64 blkno; + struct dentry *parent; + struct inode *dir = child->d_inode; + + trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno); + + status = ocfs2_inode_lock(dir, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + parent = ERR_PTR(status); + goto bail; + } + + status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno); + if (status < 0) { + parent = ERR_PTR(-ENOENT); + goto bail_unlock; + } + + parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); + +bail_unlock: + ocfs2_inode_unlock(dir, 0); + +bail: + trace_ocfs2_get_parent_end(parent); + + return parent; +} + +static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type = 1; + u64 blkno; + u32 generation; + __le32 *fh = (__force __le32 *) fh_in; + + trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len, + dentry->d_name.name, + fh, len, connectable); + + if (connectable && 
(len < 6)) { + *max_len = 6; + type = 255; + goto bail; + } else if (len < 3) { + *max_len = 3; + type = 255; + goto bail; + } + + blkno = OCFS2_I(inode)->ip_blkno; + generation = inode->i_generation; + + trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation); + + len = 3; + fh[0] = cpu_to_le32((u32)(blkno >> 32)); + fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff)); + fh[2] = cpu_to_le32(generation); + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + blkno = OCFS2_I(parent)->ip_blkno; + generation = parent->i_generation; + + fh[3] = cpu_to_le32((u32)(blkno >> 32)); + fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff)); + fh[5] = cpu_to_le32(generation); + + spin_unlock(&dentry->d_lock); + + len = 6; + type = 2; + + trace_ocfs2_encode_fh_parent((unsigned long long)blkno, + generation); + } + + *max_len = len; + +bail: + trace_ocfs2_encode_fh_type(type); + return type; +} + +static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct ocfs2_inode_handle handle; + + if (fh_len < 3 || fh_type > 2) + return NULL; + + handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32; + handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]); + handle.ih_generation = le32_to_cpu(fid->raw[2]); + return ocfs2_get_dentry(sb, &handle); +} + +static struct dentry *ocfs2_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct ocfs2_inode_handle parent; + + if (fh_type != 2 || fh_len < 6) + return NULL; + + parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32; + parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]); + parent.ih_generation = le32_to_cpu(fid->raw[5]); + return ocfs2_get_dentry(sb, &parent); +} + +const struct export_operations ocfs2_export_ops = { + .encode_fh = ocfs2_encode_fh, + .fh_to_dentry = ocfs2_fh_to_dentry, + .fh_to_parent = ocfs2_fh_to_parent, + .get_parent = ocfs2_get_parent, +}; diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h new file mode 100644 index 00000000..41a73867 --- /dev/null +++ b/fs/ocfs2/export.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.h + * + * Function prototypes + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_EXPORT_H +#define OCFS2_EXPORT_H + +#include + +extern const struct export_operations ocfs2_export_ops; + +#endif /* OCFS2_EXPORT_H */ diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c new file mode 100644 index 00000000..23457b49 --- /dev/null +++ b/fs/ocfs2/extent_map.c @@ -0,0 +1,902 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.c + * + * Block/Cluster mapping functions + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "super.h" +#include "symlink.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +/* + * The extent caching implementation is intentionally trivial. + * + * We only cache a small number of extents stored directly on the + * inode, so linear order operations are acceptable. If we ever want + * to increase the size of the extent map, then these algorithms must + * get smarter. + */ + +void ocfs2_extent_map_init(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + oi->ip_extent_map.em_num_items = 0; + INIT_LIST_HEAD(&oi->ip_extent_map.em_list); +} + +static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + unsigned int cpos, + struct ocfs2_extent_map_item **ret_emi) +{ + unsigned int range; + struct ocfs2_extent_map_item *emi; + + *ret_emi = NULL; + + list_for_each_entry(emi, &em->em_list, ei_list) { + range = emi->ei_cpos + emi->ei_clusters; + + if (cpos >= emi->ei_cpos && cpos < range) { + list_move(&emi->ei_list, &em->em_list); + + *ret_emi = emi; + break; + } + } +} + +static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, + unsigned int *phys, unsigned int *len, + unsigned int *flags) +{ + unsigned int coff; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map_item *emi; + + spin_lock(&oi->ip_lock); + + __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); + if (emi) { + coff = cpos - emi->ei_cpos; + *phys = emi->ei_phys + coff; + if (len) + *len = emi->ei_clusters - coff; + if (flags) + *flags = emi->ei_flags; + } + + spin_unlock(&oi->ip_lock); + + if (emi == NULL) + return -ENOENT; + + return 0; +} + +/* + * Forget about all clusters equal to or greater than cpos. + */ +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) +{ + struct ocfs2_extent_map_item *emi, *n; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + LIST_HEAD(tmp_list); + unsigned int range; + + spin_lock(&oi->ip_lock); + list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { + if (emi->ei_cpos >= cpos) { + /* Full truncate of this record. 
*/ + list_move(&emi->ei_list, &tmp_list); + BUG_ON(em->em_num_items == 0); + em->em_num_items--; + continue; + } + + range = emi->ei_cpos + emi->ei_clusters; + if (range > cpos) { + /* Partial truncate */ + emi->ei_clusters = cpos - emi->ei_cpos; + } + } + spin_unlock(&oi->ip_lock); + + list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { + list_del(&emi->ei_list); + kfree(emi); + } +} + +/* + * Is any part of emi2 contained within emi1 + */ +static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, + struct ocfs2_extent_map_item *emi2) +{ + unsigned int range1, range2; + + /* + * Check if logical start of emi2 is inside emi1 + */ + range1 = emi1->ei_cpos + emi1->ei_clusters; + if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) + return 1; + + /* + * Check if logical end of emi2 is inside emi1 + */ + range2 = emi2->ei_cpos + emi2->ei_clusters; + if (range2 > emi1->ei_cpos && range2 <= range1) + return 1; + + return 0; +} + +static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, + struct ocfs2_extent_map_item *src) +{ + dest->ei_cpos = src->ei_cpos; + dest->ei_phys = src->ei_phys; + dest->ei_clusters = src->ei_clusters; + dest->ei_flags = src->ei_flags; +} + +/* + * Try to merge emi with ins. Returns 1 if merge succeeds, zero + * otherwise. + */ +static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, + struct ocfs2_extent_map_item *ins) +{ + /* + * Handle contiguousness + */ + if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && + ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && + ins->ei_flags == emi->ei_flags) { + emi->ei_clusters += ins->ei_clusters; + return 1; + } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && + (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos && + ins->ei_flags == emi->ei_flags) { + emi->ei_phys = ins->ei_phys; + emi->ei_cpos = ins->ei_cpos; + emi->ei_clusters += ins->ei_clusters; + return 1; + } + + /* + * Overlapping extents - this shouldn't happen unless we've + * split an extent to change it's flags. That is exceedingly + * rare, so there's no sense in trying to optimize it yet. + */ + if (ocfs2_ei_is_contained(emi, ins) || + ocfs2_ei_is_contained(ins, emi)) { + ocfs2_copy_emi_fields(emi, ins); + return 1; + } + + /* No merge was possible. */ + return 0; +} + +/* + * In order to reduce complexity on the caller, this insert function + * is intentionally liberal in what it will accept. + * + * The only rule is that the truncate call *must* be used whenever + * records have been deleted. This avoids inserting overlapping + * records with different physical mappings. + */ +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + struct ocfs2_extent_map_item *emi, *new_emi = NULL; + struct ocfs2_extent_map_item ins; + + ins.ei_cpos = le32_to_cpu(rec->e_cpos); + ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); + ins.ei_flags = rec->e_flags; + +search: + spin_lock(&oi->ip_lock); + + list_for_each_entry(emi, &em->em_list, ei_list) { + if (ocfs2_try_to_merge_extent_map(emi, &ins)) { + list_move(&emi->ei_list, &em->em_list); + spin_unlock(&oi->ip_lock); + goto out; + } + } + + /* + * No item could be merged. + * + * Either allocate and add a new item, or overwrite the last recently + * inserted. 
+ */ + + if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { + if (new_emi == NULL) { + spin_unlock(&oi->ip_lock); + + new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); + if (new_emi == NULL) + goto out; + + goto search; + } + + ocfs2_copy_emi_fields(new_emi, &ins); + list_add(&new_emi->ei_list, &em->em_list); + em->em_num_items++; + new_emi = NULL; + } else { + BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); + emi = list_entry(em->em_list.prev, + struct ocfs2_extent_map_item, ei_list); + list_move(&emi->ei_list, &em->em_list); + ocfs2_copy_emi_fields(emi, &ins); + } + + spin_unlock(&oi->ip_lock); + +out: + if (new_emi) + kfree(new_emi); +} + +static int ocfs2_last_eb_is_empty(struct inode *inode, + struct ocfs2_dinode *di) +{ + int ret, next_free; + u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk); + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + + next_free = le16_to_cpu(el->l_next_free_rec); + + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) + ret = 1; + +out: + brelse(eb_bh); + return ret; +} + +/* + * Return the 1st index within el which contains an extent start + * larger than v_cluster. + */ +static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, + u32 v_cluster) +{ + int i; + struct ocfs2_extent_rec *rec; + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (v_cluster < le32_to_cpu(rec->e_cpos)) + break; + } + + return i; +} + +/* + * Figure out the size of a hole which starts at v_cluster within the given + * extent list. + * + * If there is no more allocation past v_cluster, we return the maximum + * cluster size minus v_cluster. + * + * If we have in-inode extents, then el points to the dinode list and + * eb_bh is NULL. Otherwise, eb_bh should point to the extent block + * containing el. + */ +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) +{ + int ret, i; + struct buffer_head *next_eb_bh = NULL; + struct ocfs2_extent_block *eb, *next_eb; + + i = ocfs2_search_for_hole_index(el, v_cluster); + + if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Check the next leaf for any extents. + */ + + if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) + goto no_more_extents; + + ret = ocfs2_read_extent_block(ci, + le64_to_cpu(eb->h_next_leaf_blk), + &next_eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; + el = &next_eb->h_list; + i = ocfs2_search_for_hole_index(el, v_cluster); + } + +no_more_extents: + if (i == le16_to_cpu(el->l_next_free_rec)) { + /* + * We're at the end of our existing allocation. Just + * return the maximum number of clusters we could + * possibly allocate. 
+ */ + *num_clusters = UINT_MAX - v_cluster; + } else { + *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; + } + + ret = 0; +out: + brelse(next_eb_bh); + return ret; +} + +static int ocfs2_get_clusters_nocache(struct inode *inode, + struct buffer_head *di_bh, + u32 v_cluster, unsigned int *hole_len, + struct ocfs2_extent_rec *ret_rec, + unsigned int *is_last) +{ + int i, ret, tree_height, len; + struct ocfs2_dinode *di; + struct ocfs2_extent_block *uninitialized_var(eb); + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct buffer_head *eb_bh = NULL; + + memset(ret_rec, 0, sizeof(*ret_rec)); + if (is_last) + *is_last = 0; + + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; + tree_height = le16_to_cpu(el->l_tree_depth); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + /* + * Holes can be larger than the maximum size of an + * extent, so we return their lengths in a separate + * field. + */ + if (hole_len) { + ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode), + el, eb_bh, + v_cluster, &len); + if (ret) { + mlog_errno(ret); + goto out; + } + + *hole_len = len; + } + goto out_hole; + } + + rec = &el->l_recs[i]; + + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *ret_rec = *rec; + + /* + * Checking for last extent is potentially expensive - we + * might have to look at the next leaf over to see if it's + * empty. + * + * The first two checks are to see whether the caller even + * cares for this information, and if the extent is at least + * the last in it's list. 
+ * + * If those hold true, then the extent is last if any of the + * additional conditions hold true: + * - Extent list is in-inode + * - Extent list is right-most + * - Extent list is 2nd to rightmost, with empty right-most + */ + if (is_last) { + if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) { + if (tree_height == 0) + *is_last = 1; + else if (eb->h_blkno == di->i_last_eb_blk) + *is_last = 1; + else if (eb->h_next_leaf_blk == di->i_last_eb_blk) { + ret = ocfs2_last_eb_is_empty(inode, di); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + if (ret == 1) + *is_last = 1; + } + } + } + +out_hole: + ret = 0; +out: + brelse(eb_bh); + return ret; +} + +static void ocfs2_relative_extent_offsets(struct super_block *sb, + u32 v_cluster, + struct ocfs2_extent_rec *rec, + u32 *p_cluster, u32 *num_clusters) + +{ + u32 coff = v_cluster - le32_to_cpu(rec->e_cpos); + + *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + + if (num_clusters) + *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; +} + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el, + unsigned int *extent_flags) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec; + u32 coff; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } else { + rec = &el->l_recs[i]; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + coff = v_cluster - le32_to_cpu(rec->e_cpos); + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + if (num_clusters) + *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + + if (extent_flags) + *extent_flags = rec->e_flags; + } +out: + if (eb_bh) + brelse(eb_bh); + return ret; +} + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + unsigned int *extent_flags) +{ + int ret; + unsigned int uninitialized_var(hole_len), flags = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = -ERANGE; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, + num_clusters, extent_flags); + if (ret == 0) + goto out; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len, + &rec, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) { + /* + * A hole was found. Return some canned values that + * callers can key on. If asked for, num_clusters will + * be populated with the size of the hole. 
+ */ + *p_cluster = 0; + if (num_clusters) { + *num_clusters = hole_len; + } + } else { + ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec, + p_cluster, num_clusters); + flags = rec.e_flags; + + ocfs2_extent_map_insert_rec(inode, &rec); + } + + if (extent_flags) + *extent_flags = flags; + +out: + brelse(di_bh); + return ret; +} + +/* + * This expects alloc_sem to be held. The allocation cannot change at + * all while the map is in the process of being updated. + */ +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags) +{ + int ret; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 cpos, num_clusters, p_cluster; + u64 boff = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); + + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, + extent_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * p_cluster == 0 indicates a hole. + */ + if (p_cluster) { + boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + boff += (v_blkno & (u64)(bpc - 1)); + } + + *p_blkno = boff; + + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + *ret_count -= v_blkno & (u64)(bpc - 1); + } + +out: + return ret; +} + +/* + * The ocfs2_fiemap_inline() may be a little bit misleading, since + * it not only handles the fiemap for inlined files, but also deals + * with the fast symlink, cause they have no difference for extent + * mapping per se. + */ +static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, + struct fiemap_extent_info *fieinfo, + u64 map_start) +{ + int ret; + unsigned int id_count; + struct ocfs2_dinode *di; + u64 phys; + u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + di = (struct ocfs2_dinode *)di_bh->b_data; + if (ocfs2_inode_is_fast_symlink(inode)) + id_count = ocfs2_fast_symlink_chars(inode->i_sb); + else + id_count = le16_to_cpu(di->id2.i_data.id_count); + + if (map_start < id_count) { + phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; + if (ocfs2_inode_is_fast_symlink(inode)) + phys += offsetof(struct ocfs2_dinode, id2.i_symlink); + else + phys += offsetof(struct ocfs2_dinode, + id2.i_data.id_data); + + ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, + flags); + if (ret < 0) + return ret; + } + + return 0; +} + +#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len) +{ + int ret, is_last; + u32 mapping_end, cpos; + unsigned int hole_size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u64 len_bytes, phys_bytes, virt_bytes; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + if (ret) + return ret; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + /* + * Handle inline-data and fast symlink separately. 
+ */ + if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + ocfs2_inode_is_fast_symlink(inode)) { + ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); + goto out_unlock; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + mapping_end -= cpos; + is_last = 0; + while (cpos < mapping_end && !is_last) { + u32 fe_flags; + + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + &hole_size, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + if (rec.e_blkno == 0ULL) { + cpos += hole_size; + continue; + } + + fe_flags = 0; + if (rec.e_flags & OCFS2_EXT_UNWRITTEN) + fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + fe_flags |= FIEMAP_EXTENT_SHARED; + if (is_last) + fe_flags |= FIEMAP_EXTENT_LAST; + len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; + phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; + virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; + + ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, + len_bytes, fe_flags); + if (ret) + break; + + cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters); + } + + if (ret > 0) + ret = 0; + +out_unlock: + brelse(di_bh); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_inode_unlock(inode, 0); +out: + + return ret; +} + +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int rc = 0; + u64 p_block, p_count; + int i, count, done = 0; + + trace_ocfs2_read_virt_blocks( + inode, (unsigned long long)v_block, nr, bhs, flags, + validate); + + if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!(flags & OCFS2_BH_READAHEAD)); + goto out; + } + + while (done < nr) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); + rc = ocfs2_extent_map_get_blocks(inode, v_block + done, + &p_block, &p_count, NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (rc) { + mlog_errno(rc); + break; + } + + if (!p_block) { + rc = -EIO; + mlog(ML_ERROR, + "Inode #%llu contains a hole at offset %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)(v_block + done) << + inode->i_sb->s_blocksize_bits); + break; + } + + count = nr - done; + if (p_count < count) + count = p_count; + + /* + * If the caller passed us bhs, they should have come + * from a previous readahead call to this function. Thus, + * they should have the right b_blocknr. + */ + for (i = 0; i < count; i++) { + if (!bhs[done + i]) + continue; + BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); + } + + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count, + bhs + done, flags, validate); + if (rc) { + mlog_errno(rc); + break; + } + done += count; + } + +out: + return rc; +} + + diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h new file mode 100644 index 00000000..e79d41c2 --- /dev/null +++ b/fs/ocfs2/extent_map.h @@ -0,0 +1,90 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.h + * + * In-memory file extent mappings for OCFS2. + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _EXTENT_MAP_H +#define _EXTENT_MAP_H + +struct ocfs2_extent_map_item { + unsigned int ei_cpos; + unsigned int ei_phys; + unsigned int ei_clusters; + unsigned int ei_flags; + + struct list_head ei_list; +}; + +#define OCFS2_MAX_EXTENT_MAP_ITEMS 3 +struct ocfs2_extent_map { + unsigned int em_num_items; + struct list_head em_list; +}; + +void ocfs2_extent_map_init(struct inode *inode); +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster); +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec); + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, unsigned int *extent_flags); +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags); + +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len); + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el, + unsigned int *extent_flags); + +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters); +static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate); + +bail: + return status; +} + + +#endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c new file mode 100644 index 00000000..b1e35a39 --- /dev/null +++ b/fs/ocfs2/file.c @@ -0,0 +1,2688 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c + * + * File open, close, extend, truncate + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "sysfile.h" +#include "inode.h" +#include "ioctl.h" +#include "journal.h" +#include "locks.h" +#include "mmap.h" +#include "suballoc.h" +#include "super.h" +#include "xattr.h" +#include "acl.h" +#include "quota.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static int ocfs2_init_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp; + + fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + fp->fp_file = file; + mutex_init(&fp->fp_mutex); + ocfs2_file_lock_res_init(&fp->fp_flock, fp); + file->private_data = fp; + + return 0; +} + +static void ocfs2_free_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (fp) { + ocfs2_simple_drop_lockres(osb, &fp->fp_flock); + ocfs2_lock_res_free(&fp->fp_flock); + kfree(fp); + file->private_data = NULL; + } +} + +static int ocfs2_file_open(struct inode *inode, struct file *file) +{ + int status; + int mode = file->f_flags; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + trace_ocfs2_file_open(inode, file, file->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, mode); + + if (file->f_mode & FMODE_WRITE) + dquot_initialize(inode); + + spin_lock(&oi->ip_lock); + + /* Check that the inode hasn't been wiped from disk by another + * node. If it hasn't then we're safe as long as we hold the + * spin lock until our increment of open count. */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&oi->ip_lock); + + status = -ENOENT; + goto leave; + } + + if (mode & O_DIRECT) + oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; + + oi->ip_open_count++; + spin_unlock(&oi->ip_lock); + + status = ocfs2_init_file_private(inode, file); + if (status) { + /* + * We want to set open count back if we're failing the + * open. 
+ */ + spin_lock(&oi->ip_lock); + oi->ip_open_count--; + spin_unlock(&oi->ip_lock); + } + +leave: + return status; +} + +static int ocfs2_file_release(struct inode *inode, struct file *file) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + spin_lock(&oi->ip_lock); + if (!--oi->ip_open_count) + oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; + + trace_ocfs2_file_release(inode, file, file->f_path.dentry, + oi->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + oi->ip_open_count); + spin_unlock(&oi->ip_lock); + + ocfs2_free_file_private(inode, file); + + return 0; +} + +static int ocfs2_dir_open(struct inode *inode, struct file *file) +{ + return ocfs2_init_file_private(inode, file); +} + +static int ocfs2_dir_release(struct inode *inode, struct file *file) +{ + ocfs2_free_file_private(inode, file); + return 0; +} + +static int ocfs2_sync_file(struct file *file, int datasync) +{ + int err = 0; + journal_t *journal; + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + trace_ocfs2_sync_file(inode, file, file->f_path.dentry, + OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + (unsigned long long)datasync); + + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { + /* + * We still have to flush drive's caches to get data to the + * platter + */ + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + goto bail; + } + + journal = osb->journal->j_journal; + err = jbd2_journal_force_commit(journal); + +bail: + if (err) + mlog_errno(err); + + return (err < 0) ? -EIO : 0; +} + +int ocfs2_should_update_atime(struct inode *inode, + struct vfsmount *vfsmnt) +{ + struct timespec now; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return 0; + + if ((inode->i_flags & S_NOATIME) || + ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + return 0; + + /* + * We can be called with no vfsmnt structure - NFSD will + * sometimes do this. + * + * Note that our action here is different than touch_atime() - + * if we can't tell whether this is a noatime mount, then we + * don't know whether to trust the value of s_atime_quantum. + */ + if (vfsmnt == NULL) + return 0; + + if ((vfsmnt->mnt_flags & MNT_NOATIME) || + ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + return 0; + + if (vfsmnt->mnt_flags & MNT_RELATIME) { + if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || + (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + return 1; + + return 0; + } + + now = CURRENT_TIME; + if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) + return 0; + else + return 1; +} + +int ocfs2_update_inode_atime(struct inode *inode, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Don't use ocfs2_mark_inode_dirty() here as we don't always + * have i_mutex to guard against concurrent changes to other + * inode fields. 
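+ * Only the atime fields are updated here, in both the in-core inode
+ * and the on-disk dinode, under the journal handle started above.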
+ */ + inode->i_atime = CURRENT_TIME; + di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ocfs2_journal_dirty(handle, bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + + i_size_write(inode, new_i_size); + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + return status; +} + +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_set_inode_size(handle, inode, di_bh, + new_i_size); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_cow_file_pos(struct inode *inode, + struct buffer_head *fe_bh, + u64 offset) +{ + int status; + u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + /* + * If the new offset is aligned to the range of the cluster, there is + * no space for ocfs2_zero_range_for_truncate to fill, so no need to + * CoW either. + */ + if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) + return 0; + + status = ocfs2_get_clusters(inode, cpos, &phys, + &num_clusters, &ext_flags); + if (status) { + mlog_errno(status); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); + +out: + return status; +} + +static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + handle_t *handle; + struct ocfs2_dinode *di; + u64 cluster_bytes; + + /* + * We need to CoW the cluster contains the offset if it is reflinked + * since we will call ocfs2_zero_range_for_truncate later which will + * write "0" from offset to the end of the cluster. + */ + status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); + if (status) { + mlog_errno(status); + return status; + } + + /* TODO: This needs to actually orphan the inode in this + * transaction. */ + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + /* + * Do this before setting i_size. 
+ */ + cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); + status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, + cluster_bytes); + if (status) { + mlog_errno(status); + goto out_commit; + } + + i_size_write(inode, new_i_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di = (struct ocfs2_dinode *) fe_bh->b_data; + di->i_size = cpu_to_le64(new_i_size); + di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, fe_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return status; +} + +static int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int status = 0; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* We trust di_bh because it comes from ocfs2_inode_lock(), which + * already validated it */ + fe = (struct ocfs2_dinode *) di_bh->b_data; + + trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); + + mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), + "Inode %llu, inode i_size = %lld != di " + "i_size = %llu, i_flags = 0x%x\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + i_size_read(inode), + (unsigned long long)le64_to_cpu(fe->i_size), + le32_to_cpu(fe->i_flags)); + + if (new_i_size > le64_to_cpu(fe->i_size)) { + trace_ocfs2_truncate_file_error( + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + /* lets handle the simple truncate cases before doing any more + * cluster locking. */ + if (new_i_size == le64_to_cpu(fe->i_size)) + goto bail; + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_resv_discard(&osb->osb_la_resmap, + &OCFS2_I(inode)->ip_la_data_resv); + + /* + * The inode lock forced other nodes to sync and drop their + * pages, which (correctly) happens even if we have a truncate + * without allocation change - ocfs2 cluster sizes can be much + * greater than page size, so we have to truncate them + * anyway. + */ + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + status = ocfs2_truncate_inline(inode, di_bh, new_i_size, + i_size_read(inode), 1); + if (status) + mlog_errno(status); + + goto bail_unlock_sem; + } + + /* alright, we're going to need to do a full blown alloc size + * change. Orphan the inode so that recovery can complete the + * truncate if necessary. This does the task of marking + * i_size. */ + status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_sem; + } + + status = ocfs2_commit_truncate(osb, inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_sem; + } + + /* TODO: orphan dir cleanup here. */ +bail_unlock_sem: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + +bail: + if (!status && OCFS2_I(inode)->ip_clusters == 0) + status = ocfs2_try_remove_refcount_tree(inode, di_bh); + + return status; +} + +/* + * extend file allocation only here. + * we'll update all the disk stuff, and oip->alloc_size + * + * expect stuff to be locked, a transaction started and enough data / + * metadata reservations in the contexts. 
+ * + * Will return -EAGAIN, and a reason if a restart is needed. + * If passed in, *reason will always be set, even in error. + */ +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int ret; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); + ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); + + return ret; +} + +static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + int status = 0; + int restart_func = 0; + int credits; + u32 prev_clusters; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe = NULL; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + int did_quota = 0; + + /* + * This function only exists for file systems which don't + * support holes. + */ + BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); + + status = ocfs2_read_inode_block(inode, &bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + fe = (struct ocfs2_dinode *) bh->b_data; + +restart_all: + BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (status) { + mlog_errno(status); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + +restarted_transaction: + trace_ocfs2_extend_allocation( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)i_size_read(inode), + le32_to_cpu(fe->i_clusters), clusters_to_add, + why, restart_func); + + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (status) + goto leave; + did_quota = 1; + + /* reserve a write to the file entry early on - that we if we + * run out of credits in the allocation path, we can still + * update i_size. 
*/ + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + prev_clusters = OCFS2_I(inode)->ip_clusters; + + status = ocfs2_add_inode_data(osb, + inode, + &logical_start, + clusters_to_add, + mark_unwritten, + bh, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + ocfs2_journal_dirty(handle, bh); + + spin_lock(&OCFS2_I(inode)->ip_lock); + clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + /* Release unused quota reservation */ + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + did_quota = 0; + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + restart_func = 1; + status = 0; + } else { + BUG_ON(why != RESTART_TRANS); + + /* TODO: This can be more intelligent. */ + credits = ocfs2_calc_extend_credits(osb->sb, + &fe->id2.i_list, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + /* handle still has to be committed at + * this point. */ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + + trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno, + le32_to_cpu(fe->i_clusters), + (unsigned long long)le64_to_cpu(fe->i_size), + OCFS2_I(inode)->ip_clusters, + (unsigned long long)i_size_read(inode)); + +leave: + if (status < 0 && did_quota) + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (handle) { + ocfs2_commit_trans(osb, handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + brelse(bh); + bh = NULL; + + return status; +} + +/* + * While a write will already be ordering the data, a truncate will not. + * Thus, we need to explicitly order the zeroed pages. + */ +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + int ret = 0; + + if (!ocfs2_should_order_data(inode)) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) + mlog_errno(ret); + +out: + if (ret) { + if (!IS_ERR(handle)) + ocfs2_commit_trans(osb, handle); + handle = ERR_PTR(ret); + } + return handle; +} + +/* Some parts of this taken from generic_cont_expand, which turned out + * to be too fragile to do exactly what we need without us having to + * worry about recursive locking in ->write_begin() and ->write_end(). 
*/ +static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, + u64 abs_to) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index = abs_from >> PAGE_CACHE_SHIFT; + handle_t *handle = NULL; + int ret = 0; + unsigned zero_from, zero_to, block_start, block_end; + + BUG_ON(abs_from >= abs_to); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_from & (inode->i_blkbits - 1)); + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* Get the offsets within the page that we want to zero */ + zero_from = abs_from & (PAGE_CACHE_SIZE - 1); + zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + if (!zero_to) + zero_to = PAGE_CACHE_SIZE; + + trace_ocfs2_write_zero_page( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)abs_from, + (unsigned long long)abs_to, + index, zero_from, zero_to); + + /* We know that zero_from is block aligned */ + for (block_start = zero_from; block_start < zero_to; + block_start = block_end) { + block_end = block_start + (1 << inode->i_blkbits); + + /* + * block_start is block-aligned. Bump it by one to force + * __block_write_begin and block_commit_write to zero the + * whole block. + */ + ret = __block_write_begin(page, block_start + 1, 0, + ocfs2_get_block); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + if (!handle) { + handle = ocfs2_zero_start_ordered_transaction(inode); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + break; + } + } + + /* must not update i_size! */ + ret = block_commit_write(page, block_start + 1, + block_start + 1); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + } + + if (handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +/* + * Find the next range to zero. We do this in terms of bytes because + * that's what ocfs2_zero_extend() wants, and it is dealing with the + * pagecache. We may return multiple extents. + * + * zero_start and zero_end are ocfs2_zero_extend()s current idea of what + * needs to be zeroed. range_start and range_end return the next zeroing + * range. A subsequent call should pass the previous range_end as its + * zero_start. If range_end is 0, there's nothing to do. + * + * Unwritten extents are skipped over. Refcounted extents are CoWd. 
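+ *
+ * A typical caller loops the way ocfs2_zero_extend() below does: get the
+ * next range, zero it with ocfs2_zero_extend_range(), then call again with
+ * the returned range_end as the new zero_start, until range_end comes back
+ * as 0 or the end of the region to zero has been reached.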
+ */ +static int ocfs2_zero_extend_get_range(struct inode *inode, + struct buffer_head *di_bh, + u64 zero_start, u64 zero_end, + u64 *range_start, u64 *range_end) +{ + int rc = 0, needs_cow = 0; + u32 p_cpos, zero_clusters = 0; + u32 zero_cpos = + zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + while (zero_cpos < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + zero_clusters = num_clusters; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + break; + } + + zero_cpos += num_clusters; + } + if (!zero_clusters) { + *range_end = 0; + goto out; + } + + while ((zero_cpos + zero_clusters) < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, + &p_cpos, &num_clusters, + &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) + break; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + zero_clusters += num_clusters; + } + if ((zero_cpos + zero_clusters) > last_cpos) + zero_clusters = last_cpos - zero_cpos; + + if (needs_cow) { + rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, + zero_clusters, UINT_MAX); + if (rc) { + mlog_errno(rc); + goto out; + } + } + + *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); + *range_end = ocfs2_clusters_to_bytes(inode->i_sb, + zero_cpos + zero_clusters); + +out: + return rc; +} + +/* + * Zero one range returned from ocfs2_zero_extend_get_range(). The caller + * has made sure that the entire range needs zeroing. + */ +static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, + u64 range_end) +{ + int rc = 0; + u64 next_pos; + u64 zero_pos = range_start; + + trace_ocfs2_zero_extend_range( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)range_start, + (unsigned long long)range_end); + BUG_ON(range_start >= range_end); + + while (zero_pos < range_end) { + next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + if (next_pos > range_end) + next_pos = range_end; + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + if (rc < 0) { + mlog_errno(rc); + break; + } + zero_pos = next_pos; + + /* + * Very large extends have the potential to lock up + * the cpu for extended periods of time. 
+ */ + cond_resched(); + } + + return rc; +} + +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to_size) +{ + int ret = 0; + u64 zero_start, range_start = 0, range_end = 0; + struct super_block *sb = inode->i_sb; + + zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)zero_start, + (unsigned long long)i_size_read(inode)); + while (zero_start < zero_to_size) { + ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, + zero_to_size, + &range_start, + &range_end); + if (ret) { + mlog_errno(ret); + break; + } + if (!range_end) + break; + /* Trim the ends */ + if (range_start < zero_start) + range_start = zero_start; + if (range_end > zero_to_size) + range_end = zero_to_size; + + ret = ocfs2_zero_extend_range(inode, range_start, + range_end); + if (ret) { + mlog_errno(ret); + break; + } + zero_start = range_end; + } + + return ret; +} + +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to) +{ + int ret; + u32 clusters_to_add; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + /* + * Only quota files call this without a bh, and they can't be + * refcounted. + */ + BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); + + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); + if (clusters_to_add < oi->ip_clusters) + clusters_to_add = 0; + else + clusters_to_add -= oi->ip_clusters; + + if (clusters_to_add) { + ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Call this even if we don't add any clusters to the tree. We + * still need to zero the area between the old i_size and the + * new i_size. + */ + ret = ocfs2_zero_extend(inode, di_bh, zero_to); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_extend_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + BUG_ON(!di_bh); + + /* setattr sometimes calls us like this. */ + if (new_i_size == 0) + goto out; + + if (i_size_read(inode) == new_i_size) + goto out; + BUG_ON(new_i_size < i_size_read(inode)); + + /* + * The alloc sem blocks people in read/write from reading our + * allocation until we're done changing it. We depend on + * i_mutex to block other extend/truncate calls while we're + * here. We even have to hold it for sparse files because there + * might be some tail zeroing. + */ + down_write(&oi->ip_alloc_sem); + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * We can optimize small extends by keeping the inodes + * inline data. 
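+ * If the new size still fits in the dinode, no allocation is needed
+ * and we only update i_size below.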
+ */ + if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { + up_write(&oi->ip_alloc_sem); + goto out_update_size; + } + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + up_write(&oi->ip_alloc_sem); + mlog_errno(ret); + goto out; + } + } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, new_i_size); + else + ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, + new_i_size); + + up_write(&oi->ip_alloc_sem); + + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out_update_size: + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + int status = 0, size_change; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *bh = NULL; + handle_t *handle = NULL; + struct dquot *transfer_to[MAXQUOTAS] = { }; + int qtype; + + trace_ocfs2_setattr(inode, dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + dentry->d_name.len, dentry->d_name.name, + attr->ia_valid, attr->ia_mode, + attr->ia_uid, attr->ia_gid); + + /* ensuring we don't even attempt to truncate a symlink */ + if (S_ISLNK(inode->i_mode)) + attr->ia_valid &= ~ATTR_SIZE; + +#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ + | ATTR_GID | ATTR_UID | ATTR_MODE) + if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) + return 0; + + status = inode_change_ok(inode, attr); + if (status) + return status; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; + if (size_change) { + status = ocfs2_rw_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail_unlock_rw; + } + + if (size_change && attr->ia_size != i_size_read(inode)) { + status = inode_newsize_ok(inode, attr->ia_size); + if (status) + goto bail_unlock; + + if (i_size_read(inode) > attr->ia_size) { + if (ocfs2_should_order_data(inode)) { + status = ocfs2_begin_ordered_truncate(inode, + attr->ia_size); + if (status) + goto bail_unlock; + } + status = ocfs2_truncate_file(inode, bh, attr->ia_size); + } else + status = ocfs2_extend_file(inode, bh, attr->ia_size); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + status = -ENOSPC; + goto bail_unlock; + } + } + + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + /* + * Gather pointers to quota structures so that allocation / + * freeing of quota structures happens here and not inside + * dquot_transfer() where we have problems with lock ordering + */ + if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, + USRQUOTA); + if (!transfer_to[USRQUOTA]) { + status = -ESRCH; + goto bail_unlock; + } + } + if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, + GRPQUOTA); + if (!transfer_to[GRPQUOTA]) { + status = -ESRCH; + goto bail_unlock; + } + } + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + + 2 * ocfs2_quota_trans_credits(sb)); + 
if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + status = __dquot_transfer(inode, transfer_to); + if (status < 0) + goto bail_commit; + } else { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + } + + /* + * This will intentionally not wind up calling truncate_setsize(), + * since all the work for a size change has been done above. + * Otherwise, we could get into problems with truncate as + * ip_alloc_sem is used there to protect against i_size + * changes. + * + * XXX: this means the conditional below can probably be removed. + */ + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + status = vmtruncate(inode, attr->ia_size); + if (status) { + mlog_errno(status); + goto bail_commit; + } + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + + status = ocfs2_mark_inode_dirty(handle, inode, bh); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(osb, handle); +bail_unlock: + ocfs2_inode_unlock(inode, 1); +bail_unlock_rw: + if (size_change) + ocfs2_rw_unlock(inode, 1); +bail: + brelse(bh); + + /* Release quota pointers in case we acquired them */ + for (qtype = 0; qtype < MAXQUOTAS; qtype++) + dqput(transfer_to[qtype]); + + if (!status && attr->ia_valid & ATTR_MODE) { + status = ocfs2_acl_chmod(inode); + if (status < 0) + mlog_errno(status); + } + + return status; +} + +int ocfs2_getattr(struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = dentry->d_inode->i_sb; + struct ocfs2_super *osb = sb->s_fs_info; + int err; + + err = ocfs2_inode_revalidate(dentry); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + generic_fillattr(inode, stat); + + /* We set the blksize from the cluster size for performance */ + stat->blksize = osb->s_clustersize; + +bail: + return err; +} + +int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) +{ + int ret; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret) { + if (ret != -ENOENT) + mlog_errno(ret); + goto out; + } + + ret = generic_permission(inode, mask, flags, ocfs2_check_acl); + + ocfs2_inode_unlock(inode, 0); +out: + return ret; +} + +static int __ocfs2_write_remove_suid(struct inode *inode, + struct buffer_head *bh) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + + trace_ocfs2_write_remove_suid( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + inode->i_mode); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_trans; + } + + inode->i_mode &= ~S_ISUID; + if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) + inode->i_mode &= ~S_ISGID; + + di = (struct ocfs2_dinode *) bh->b_data; + di->i_mode = cpu_to_le16(inode->i_mode); + + ocfs2_journal_dirty(handle, bh); + +out_trans: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Will look for holes and unwritten extents in the range starting at + * pos for count bytes (inclusive). 
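+ *
+ * Returns 1 if a hole or unwritten extent is found in the range, 0 if
+ * the whole range is allocated and written, and a negative error code
+ * on failure.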
+ */ +static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + +static int ocfs2_write_remove_suid(struct inode *inode) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = ocfs2_read_inode_block(inode, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_write_remove_suid(inode, bh); +out: + brelse(bh); + return ret; +} + +/* + * Allocate enough extents to cover the region starting at byte offset + * start for len bytes. Existing extents are skipped, any extents + * added are marked as "unwritten". + */ +static int ocfs2_allocate_unwritten_extents(struct inode *inode, + u64 start, u64 len) +{ + int ret; + u32 cpos, phys_cpos, clusters, alloc_size; + u64 end = start + len; + struct buffer_head *di_bh = NULL; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Nothing to do if the requested reservation range + * fits within the inode. + */ + if (ocfs2_size_fits_inline_data(di_bh, end)) + goto out; + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * We consider both start and len to be inclusive. + */ + cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); + clusters -= cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Hole or existing extent len can be arbitrary, so + * cap it to our own allocation request. + */ + if (alloc_size > clusters) + alloc_size = clusters; + + if (phys_cpos) { + /* + * We already have an allocation at this + * region so we can safely skip it. + */ + goto next; + } + + ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +next: + cpos += alloc_size; + clusters -= alloc_size; + } + + ret = 0; +out: + + brelse(di_bh); + return ret; +} + +/* + * Truncate a byte range, avoiding pages within partial clusters. This + * preserves those pages for the zeroing code to write to. 
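+ *
+ * For example, with a 64KB cluster size, removing bytes 4096-131071
+ * only drops the pages in [65536, 131072); the pages over the partial
+ * first cluster are kept for the zeroing code in
+ * ocfs2_zero_partial_clusters() to write to.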
+ */ +static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, + u64 byte_len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + loff_t start, end; + struct address_space *mapping = inode->i_mapping; + + start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); + end = byte_start + byte_len; + end = end & ~(osb->s_clustersize - 1); + + if (start < end) { + unmap_mapping_range(mapping, start, end - start, 0); + truncate_inode_pages_range(mapping, start, end - 1); + } +} + +static int ocfs2_zero_partial_clusters(struct inode *inode, + u64 start, u64 len) +{ + int ret = 0; + u64 tmpend, end = start + len; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int csize = osb->s_clustersize; + handle_t *handle; + + /* + * The "start" and "end" values are NOT necessarily part of + * the range whose allocation is being deleted. Rather, this + * is what the user passed in with the request. We must zero + * partial clusters here. There's no need to worry about + * physical allocation - the zeroing code knows to skip holes. + */ + trace_ocfs2_zero_partial_clusters( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)start, (unsigned long long)end); + + /* + * If both edges are on a cluster boundary then there's no + * zeroing required as the region is part of the allocation to + * be truncated. + */ + if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + /* + * We want to get the byte offset of the end of the 1st cluster. + */ + tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; + + trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, + (unsigned long long)tmpend); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); + if (ret) + mlog_errno(ret); + + if (tmpend < end) { + /* + * This may make start and end equal, but the zeroing + * code will skip any work in that case so there's no + * need to catch it up here. + */ + start = end & ~(osb->s_clustersize - 1); + + trace_ocfs2_zero_partial_clusters_range2( + (unsigned long long)start, (unsigned long long)end); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); + if (ret) + mlog_errno(ret); + } + + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) +{ + int i; + struct ocfs2_extent_rec *rec = NULL; + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) < pos) + break; + } + + return i; +} + +/* + * Helper to calculate the punching pos and length in one run, we handle the + * following three cases in order: + * + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (hole-punching completed) +*/ +static void ocfs2_calc_trunc_pos(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec, + u32 trunc_start, u32 *trunc_cpos, + u32 *trunc_len, u32 *trunc_end, + u64 *blkno, int *done) +{ + int ret = 0; + u32 coff, range; + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (le32_to_cpu(rec->e_cpos) >= trunc_start) { + /* + * remove an entire extent record. + */ + *trunc_cpos = le32_to_cpu(rec->e_cpos); + /* + * Skip holes if any. 
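+ * (If the record ends below the current trunc_end, the clusters
+ * between its end and trunc_end are a hole, so trunc_end is clamped
+ * down to the record's end.)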
+ */ + if (range < *trunc_end) + *trunc_end = range; + *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno); + *trunc_end = le32_to_cpu(rec->e_cpos); + } else if (range > trunc_start) { + /* + * remove a partial extent record, which means we're + * removing the last extent record. + */ + *trunc_cpos = trunc_start; + /* + * skip hole if any. + */ + if (range < *trunc_end) + *trunc_end = range; + *trunc_len = *trunc_end - trunc_start; + coff = trunc_start - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); + *trunc_end = trunc_start; + } else { + /* + * It may have two following possibilities: + * + * - last record has been removed + * - trunc_start was within a hole + * + * both two cases mean the completion of hole punching. + */ + ret = 1; + } + + *done = ret; +} + +static int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) +{ + int ret = 0, flags = 0, done = 0, i; + u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; + u32 cluster_in_el; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cached_dealloc_ctxt dealloc; + struct address_space *mapping = inode->i_mapping; + struct ocfs2_extent_tree et; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el = NULL; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); + + trace_ocfs2_remove_inode_range( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)byte_start, + (unsigned long long)byte_len); + + if (byte_len == 0) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, + byte_start + byte_len, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + /* + * There's no need to get fancy with the page cache + * truncate of an inline-data inode. We're talking + * about less than a page here, which will be cached + * in the dinode buffer anyway. + */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); + goto out; + } + + /* + * For reflinks, we may need to CoW 2 clusters which might be + * partially zero'd later, if hole's start and end offset were + * within one cluster(means is not exactly aligned to clustersize). + */ + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); + trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; + cluster_in_el = trunc_end; + + ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (trunc_end > trunc_start) { + + ret = ocfs2_find_path(INODE_CACHE(inode), path, + cluster_in_el); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + i = ocfs2_find_rec(el, trunc_end); + /* + * Need to go to previous extent block. 
+ */ + if (i < 0) { + if (path->p_tree_depth == 0) + break; + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, + path, + &cluster_in_el); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We've reached the leftmost extent block, + * it's safe to leave. + */ + if (cluster_in_el == 0) + break; + + /* + * The 'pos' searched for previous extent block is + * always one cluster less than actual trunc_end. + */ + trunc_end = cluster_in_el + 1; + + ocfs2_reinit_path(path, 1); + + continue; + + } else + rec = &el->l_recs[i]; + + ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, + &trunc_len, &trunc_end, &blkno, &done); + if (done) + break; + + flags = rec->e_flags; + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, + &dealloc, refcount_loc); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + cluster_in_el = trunc_end; + + ocfs2_reinit_path(path, 1); + } + + ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); + +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +/* + * Parts of this function taken from xfs_change_file_space() + */ +static int __ocfs2_change_file_space(struct file *file, struct inode *inode, + loff_t f_pos, unsigned int cmd, + struct ocfs2_space_resv *sr, + int change_size) +{ + int ret; + s64 llen; + loff_t size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + handle_t *handle; + unsigned long long max_off = inode->i_sb->s_maxbytes; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* + * This prevents concurrent writes on other nodes + */ + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_rw_unlock; + } + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + ret = -EPERM; + goto out_inode_unlock; + } + + switch (sr->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + sr->l_start += f_pos; + break; + case 2: /*SEEK_END*/ + sr->l_start += i_size_read(inode); + break; + default: + ret = -EINVAL; + goto out_inode_unlock; + } + sr->l_whence = 0; + + llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; + + if (sr->l_start < 0 + || sr->l_start > max_off + || (sr->l_start + llen) < 0 + || (sr->l_start + llen) > max_off) { + ret = -EINVAL; + goto out_inode_unlock; + } + size = sr->l_start + sr->l_len; + + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (sr->l_len <= 0) { + ret = -EINVAL; + goto out_inode_unlock; + } + } + + if (file && should_remove_suid(file->f_path.dentry)) { + ret = __ocfs2_write_remove_suid(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out_inode_unlock; + } + } + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + switch (cmd) { + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + /* + * This takes unsigned offsets, but the signed ones we + * pass have been checked against overflow above. 
+ */ + ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, + sr->l_len); + break; + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, + sr->l_len); + break; + default: + ret = -EINVAL; + } + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_inode_unlock; + } + + /* + * We update c/mtime for these changes + */ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_inode_unlock; + } + + if (change_size && i_size_read(inode) < size) + i_size_write(inode, size); + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + +out_inode_unlock: + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); +out_rw_unlock: + ocfs2_rw_unlock(inode, 1); + +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +int ocfs2_change_file_space(struct file *file, unsigned int cmd, + struct ocfs2_space_resv *sr) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && + !ocfs2_writes_unwritten_extents(osb)) + return -ENOTTY; + else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && + !ocfs2_sparse_alloc(osb)) + return -ENOTTY; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + + return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); +} + +static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_space_resv sr; + int change_size = 1; + int cmd = OCFS2_IOC_RESVSP64; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (!ocfs2_writes_unwritten_extents(osb)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_KEEP_SIZE) + change_size = 0; + + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = OCFS2_IOC_UNRESVSP64; + + sr.l_whence = 0; + sr.l_start = (s64)offset; + sr.l_len = (s64)len; + + return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr, + change_size); +} + +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || + !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + +static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + struct file *file, + loff_t pos, size_t count, + int *meta_level) +{ + int ret; + struct buffer_head *di_bh = NULL; + u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 clusters = + 
ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + *meta_level = 1; + + ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); + if (ret) + mlog_errno(ret); +out: + brelse(di_bh); + return ret; +} + +static int ocfs2_prepare_inode_for_write(struct file *file, + loff_t *ppos, + size_t count, + int appending, + int *direct_io, + int *has_refcount) +{ + int ret = 0, meta_level = 0; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + loff_t saved_pos = 0, end; + + /* + * We start with a read level meta lock and only jump to an ex + * if we need to make modifications here. + */ + for(;;) { + ret = ocfs2_inode_lock(inode, NULL, meta_level); + if (ret < 0) { + meta_level = -1; + mlog_errno(ret); + goto out; + } + + /* Clear suid / sgid if necessary. We do this here + * instead of later in the write path because + * remove_suid() calls ->setattr without any hint that + * we may have already done our cluster locking. Since + * ocfs2_setattr() *must* take cluster locks to + * proceeed, this will lead us to recursively lock the + * inode. There's also the dinode i_size state which + * can be lost via setattr during extending writes (we + * set inode->i_size at the end of a write. */ + if (should_remove_suid(dentry)) { + if (meta_level == 0) { + ocfs2_inode_unlock(inode, meta_level); + meta_level = 1; + continue; + } + + ret = ocfs2_write_remove_suid(inode); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + } + + /* work on a copy of ppos until we're sure that we won't have + * to recalculate it due to relocking. */ + if (appending) + saved_pos = i_size_read(inode); + else + saved_pos = *ppos; + + end = saved_pos + count; + + ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); + if (ret == 1) { + ocfs2_inode_unlock(inode, meta_level); + meta_level = -1; + + ret = ocfs2_prepare_inode_for_refcount(inode, + file, + saved_pos, + count, + &meta_level); + if (has_refcount) + *has_refcount = 1; + if (direct_io) + *direct_io = 0; + } + + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + /* + * Skip the O_DIRECT checks if we don't need + * them. + */ + if (!direct_io || !(*direct_io)) + break; + + /* + * There's no sane way to do direct writes to an inode + * with inline data. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + *direct_io = 0; + break; + } + + /* + * Allowing concurrent direct writes means + * i_size changes wouldn't be synchronized, so + * one node could wind up truncating another + * nodes writes. + */ + if (end > i_size_read(inode)) { + *direct_io = 0; + break; + } + + /* + * We don't fill holes during direct io, so + * check for them here. If any are found, the + * caller will have to retake some cluster + * locks and initiate the io as buffered. 
+ */ + ret = ocfs2_check_range_for_holes(inode, saved_pos, count); + if (ret == 1) { + *direct_io = 0; + ret = 0; + } else if (ret < 0) + mlog_errno(ret); + break; + } + + if (appending) + *ppos = saved_pos; + +out_unlock: + trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, + saved_pos, appending, count, + direct_io, has_refcount); + + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); + +out: + return ret; +} + +static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + int ret, direct_io, appending, rw_level, have_alloc_sem = 0; + int can_do_direct, has_refcount = 0; + ssize_t written = 0; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + loff_t old_size, *ppos = &iocb->ki_pos; + u32 old_clusters; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); + + trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + (unsigned int)nr_segs); + + if (iocb->ki_left == 0) + return 0; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + appending = file->f_flags & O_APPEND ? 1 : 0; + direct_io = file->f_flags & O_DIRECT ? 1 : 0; + + mutex_lock(&inode->i_mutex); + + ocfs2_iocb_clear_sem_locked(iocb); + +relock: + /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ + if (direct_io) { + down_read(&inode->i_alloc_sem); + have_alloc_sem = 1; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_sem_locked(iocb); + } + + /* + * Concurrent O_DIRECT writes are allowed with + * mount_option "coherency=buffered". + */ + rw_level = (!direct_io || full_coherency); + + ret = ocfs2_rw_lock(inode, rw_level); + if (ret < 0) { + mlog_errno(ret); + goto out_sems; + } + + /* + * O_DIRECT writes with "coherency=full" need to take EX cluster + * inode_lock to guarantee coherency. + */ + if (direct_io && full_coherency) { + /* + * We need to take and drop the inode lock to force + * other nodes to drop their caches. Buffered I/O + * already does this in write_begin(). + */ + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_sems; + } + + ocfs2_inode_unlock(inode, 1); + } + + can_do_direct = direct_io; + ret = ocfs2_prepare_inode_for_write(file, ppos, + iocb->ki_left, appending, + &can_do_direct, &has_refcount); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* + * We can't complete the direct I/O as requested, fall back to + * buffered I/O. + */ + if (direct_io && !can_do_direct) { + ocfs2_rw_unlock(inode, rw_level); + up_read(&inode->i_alloc_sem); + + have_alloc_sem = 0; + rw_level = -1; + + direct_io = 0; + goto relock; + } + + /* + * To later detect whether a journal commit for sync writes is + * necessary, we sample i_size, and cluster count here. 
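+ * If either changed by the time the write finishes (or the write hit a
+ * refcounted extent), the sync path below forces a journal commit so
+ * the updated metadata makes it to disk.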
+ */ + old_size = i_size_read(inode); + old_clusters = OCFS2_I(inode)->ip_clusters; + + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb, rw_level); + + ret = generic_segment_checks(iov, &nr_segs, &ocount, + VERIFY_READ); + if (ret) + goto out_dio; + + count = ocount; + ret = generic_write_checks(file, ppos, &count, + S_ISBLK(inode->i_mode)); + if (ret) + goto out_dio; + + if (direct_io) { + written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, + ppos, count, ocount); + if (written < 0) { + ret = written; + goto out_dio; + } + } else { + current->backing_dev_info = file->f_mapping->backing_dev_info; + written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, + ppos, count, 0); + current->backing_dev_info = NULL; + } + +out_dio: + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); + + if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || + ((file->f_flags & O_DIRECT) && !direct_io)) { + ret = filemap_fdatawrite_range(file->f_mapping, pos, + pos + count - 1); + if (ret < 0) + written = ret; + + if (!ret && ((old_size != i_size_read(inode)) || + (old_clusters != OCFS2_I(inode)->ip_clusters) || + has_refcount)) { + ret = jbd2_journal_force_commit(osb->journal->j_journal); + if (ret < 0) + written = ret; + } + + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, pos, + pos + count - 1); + } + + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. (it's the clustered equivalent of + * i_alloc_sem; protects truncate from racing with pending ios). + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. 
+ */ + if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { + rw_level = -1; + have_alloc_sem = 0; + } + +out: + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + +out_sems: + if (have_alloc_sem) { + up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } + + mutex_unlock(&inode->i_mutex); + + if (written) + ret = written; + return ret; +} + +static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, + struct file *out, + struct splice_desc *sd) +{ + int ret; + + ret = ocfs2_prepare_inode_for_write(out, &sd->pos, + sd->total_len, 0, NULL, NULL); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return splice_from_pipe_feed(pipe, sd, pipe_to_file); +} + +static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + int ret; + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + + + trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + out->f_path.dentry->d_name.len, + out->f_path.dentry->d_name.name, len); + + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); + + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) + mlog_errno(ret); + else { + ret = ocfs2_splice_to_file(pipe, out, &sd); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); + + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + unsigned long nr_pages; + int err; + + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; + + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + } + + return ret; +} + +static ssize_t ocfs2_file_splice_read(struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + int ret = 0, lock_level = 0; + struct inode *inode = in->f_path.dentry->d_inode; + + trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + in->f_path.dentry->d_name.len, + in->f_path.dentry->d_name.name, len); + + /* + * See the comment in ocfs2_file_aio_read() + */ + ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_inode_unlock(inode, lock_level); + + ret = generic_file_splice_read(in, ppos, pipe, len, flags); + +bail: + return ret; +} + +static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_path.dentry->d_inode; + + trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + filp->f_path.dentry->d_name.len, + filp->f_path.dentry->d_name.name, nr_segs); + + + if (!inode) { + ret = -EINVAL; + mlog_errno(ret); + goto bail; + } + + ocfs2_iocb_clear_sem_locked(iocb); + + /* + * buffered reads protect themselves in ->readpage(). 
O_DIRECT reads + * need locks to protect pending reads from racing with truncate. + */ + if (filp->f_flags & O_DIRECT) { + down_read(&inode->i_alloc_sem); + have_alloc_sem = 1; + ocfs2_iocb_set_sem_locked(iocb); + + ret = ocfs2_rw_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + rw_level = 0; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb, rw_level); + } + + /* + * We're fine letting folks race truncates and extending + * writes with read across the cluster, just like they can + * locally. Hence no rw_lock during read. + * + * Take and drop the meta data lock to update inode fields + * like i_size. This allows the checks down below + * generic_file_aio_read() a chance of actually working. + */ + ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_inode_unlock(inode, lock_level); + + ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); + trace_generic_file_aio_read_ret(ret); + + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + + /* see ocfs2_file_aio_write */ + if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { + rw_level = -1; + have_alloc_sem = 0; + } + +bail: + if (have_alloc_sem) { + up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + + return ret; +} + +const struct inode_operations ocfs2_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, + .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, +}; + +const struct inode_operations ocfs2_special_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, + .permission = ocfs2_permission, +}; + +/* + * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with + * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! + */ +const struct file_operations ocfs2_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, +}; + +const struct file_operations ocfs2_dops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, +}; + +/* + * POSIX-lockless variants of our file_operations. + * + * These will be used if the underlying cluster stack does not support + * posix file locking, if the user passes the "localflocks" mount + * option, or if we have a local-only fs. + * + * ocfs2_flock is in here because all stacks handle UNIX file locks, + * so we still want it in the case of no stack support for + * plocks. Internally, it will do the right thing when asked to ignore + * the cluster. 
+ */ +const struct file_operations ocfs2_fops_no_plocks = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, +}; + +const struct file_operations ocfs2_dops_no_plocks = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, +}; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h new file mode 100644 index 00000000..f5afbbef --- /dev/null +++ b/fs/ocfs2/file.h @@ -0,0 +1,76 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_FILE_H +#define OCFS2_FILE_H + +extern const struct file_operations ocfs2_fops; +extern const struct file_operations ocfs2_dops; +extern const struct file_operations ocfs2_fops_no_plocks; +extern const struct file_operations ocfs2_dops_no_plocks; +extern const struct inode_operations ocfs2_file_iops; +extern const struct inode_operations ocfs2_special_file_iops; +struct ocfs2_alloc_context; +enum ocfs2_alloc_restarted; + +struct ocfs2_file_private { + struct file *fp_file; + struct mutex fp_mutex; + struct ocfs2_lock_res fp_flock; +}; + +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to); +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to); +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); +int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int ocfs2_permission(struct inode *inode, int mask, unsigned int flags); + +int ocfs2_should_update_atime(struct inode *inode, + struct vfsmount *vfsmnt); +int ocfs2_update_inode_atime(struct inode *inode, + struct buffer_head *bh); + +int ocfs2_change_file_space(struct file *file, unsigned int cmd, + struct ocfs2_space_resv *sr); + +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count); +#endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c new file mode 100644 index 00000000..d8208b20 --- /dev/null +++ b/fs/ocfs2/heartbeat.c @@ -0,0 +1,134 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.c + * + * Register ourselves with the heartbaet service, keep our node maps + * up to date, and fire off recovery when needed. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit); +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit); + +/* special case -1 for now + * TODO: should *really* make sure the calling func never passes -1!! 
*/ +static void ocfs2_node_map_init(struct ocfs2_node_map *map) +{ + map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; + memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * + sizeof(unsigned long)); +} + +void ocfs2_init_node_maps(struct ocfs2_super *osb) +{ + spin_lock_init(&osb->node_map_lock); + ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); +} + +void ocfs2_do_node_down(int node_num, void *data) +{ + struct ocfs2_super *osb = data; + + BUG_ON(osb->node_num == node_num); + + trace_ocfs2_do_node_down(node_num); + + if (!osb->cconn) { + /* + * No cluster connection means we're not even ready to + * participate yet. We check the slots after the cluster + * comes up, so we will notice the node death then. We + * can safely ignore it here. + */ + return; + } + + ocfs2_recovery_thread(osb, node_num); +} + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit) +{ + set_bit(bit, map->map); +} + +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_set_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit) +{ + clear_bit(bit, map->map); +} + +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_clear_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + int ret; + if (bit >= map->num_nodes) { + mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes); + BUG(); + } + spin_lock(&osb->node_map_lock); + ret = test_bit(bit, map->map); + spin_unlock(&osb->node_map_lock); + return ret; +} + diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h new file mode 100644 index 00000000..74b9c5dd --- /dev/null +++ b/fs/ocfs2/heartbeat.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_HEARTBEAT_H +#define OCFS2_HEARTBEAT_H + +void ocfs2_init_node_maps(struct ocfs2_super *osb); + +void ocfs2_do_node_down(int node_num, void *data); + +/* node map functions - used to keep track of mounted and in-recovery + * nodes. 
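The node-map helpers in heartbeat.c above boil down to a fixed-size bitmap guarded by a spinlock, with bit -1 quietly ignored (per the TODO). A self-contained user-space sketch of the same structure, using a pthread mutex and invented limits rather than the kernel primitives:

        #include <limits.h>
        #include <pthread.h>
        #include <string.h>

        #define MAX_NODES     256
        #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

        struct node_map {
                pthread_mutex_t lock;
                int num_nodes;
                unsigned long map[(MAX_NODES + BITS_PER_LONG - 1) / BITS_PER_LONG];
        };

        static void node_map_init(struct node_map *m)
        {
                pthread_mutex_init(&m->lock, NULL);
                m->num_nodes = MAX_NODES;
                memset(m->map, 0, sizeof(m->map));
        }

        /* A real version would also check bit < m->num_nodes, as the kernel
         * code does with BUG_ON(). */
        static void node_map_set_bit(struct node_map *m, int bit)
        {
                if (bit == -1)          /* tolerated for now, as the TODO notes */
                        return;
                pthread_mutex_lock(&m->lock);
                m->map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
                pthread_mutex_unlock(&m->lock);
        }

        static void node_map_clear_bit(struct node_map *m, int bit)
        {
                if (bit == -1)
                        return;
                pthread_mutex_lock(&m->lock);
                m->map[bit / BITS_PER_LONG] &= ~(1UL << (bit % BITS_PER_LONG));
                pthread_mutex_unlock(&m->lock);
        }

        static int node_map_test_bit(struct node_map *m, int bit)
        {
                int ret;

                pthread_mutex_lock(&m->lock);
                ret = !!(m->map[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG)));
                pthread_mutex_unlock(&m->lock);
                return ret;
        }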
*/ +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); + +#endif /* OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c new file mode 100644 index 00000000..b4c8bb6b --- /dev/null +++ b/fs/ocfs2/inode.c @@ -0,0 +1,1447 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c + * + * vfs' aops, fops, dops and iops + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "xattr.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +struct ocfs2_find_inode_args +{ + u64 fi_blkno; + unsigned long fi_ino; + unsigned int fi_flags; + unsigned int fi_sysfile_type; +}; + +static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args); +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); +static int ocfs2_find_actor(struct inode *inode, void *opaque); +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh); + +void ocfs2_set_inode_flags(struct inode *inode) +{ + unsigned int flags = OCFS2_I(inode)->ip_attr; + + inode->i_flags &= ~(S_IMMUTABLE | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + + if (flags & OCFS2_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + + if (flags & OCFS2_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & OCFS2_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & OCFS2_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & OCFS2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */ +void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) +{ + unsigned int flags = oi->vfs_inode.i_flags; + + oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| + OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); + if (flags & S_SYNC) + oi->ip_attr |= OCFS2_SYNC_FL; + if (flags & S_APPEND) + oi->ip_attr |= OCFS2_APPEND_FL; + if (flags & S_IMMUTABLE) + oi->ip_attr |= OCFS2_IMMUTABLE_FL; + if (flags & S_NOATIME) + oi->ip_attr |= OCFS2_NOATIME_FL; + if (flags & S_DIRSYNC) + 
oi->ip_attr |= OCFS2_DIRSYNC_FL; +} + +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) +{ + struct ocfs2_find_inode_args args; + + args.fi_blkno = blkno; + args.fi_flags = 0; + args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = 0; + + return ilookup5(sb, blkno, ocfs2_find_actor, &args); +} +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, + int sysfile_type) +{ + struct inode *inode = NULL; + struct super_block *sb = osb->sb; + struct ocfs2_find_inode_args args; + + trace_ocfs2_iget_begin((unsigned long long)blkno, flags, + sysfile_type); + + /* Ok. By now we've either got the offsets passed to us by the + * caller, or we just pulled them off the bh. Lets do some + * sanity checks to make sure they're OK. */ + if (blkno == 0) { + inode = ERR_PTR(-EINVAL); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + + args.fi_blkno = blkno; + args.fi_flags = flags; + args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = sysfile_type; + + inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, + ocfs2_init_locked_inode, &args); + /* inode was *not* in the inode cache. 2.6.x requires + * us to do our own read_inode call and unlock it + * afterwards. */ + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + trace_ocfs2_iget5_locked(inode->i_state); + if (inode->i_state & I_NEW) { + ocfs2_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + goto bail; + } + +bail: + if (!IS_ERR(inode)) { + trace_ocfs2_iget_end(inode, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + + return inode; +} + + +/* + * here's how inodes get read from disk: + * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR + * found? : return the in-memory inode + * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE + */ + +static int ocfs2_find_actor(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + args = opaque; + + mlog_bug_on_msg(!inode, "No inode in find actor!\n"); + + trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno); + + if (oi->ip_blkno != args->fi_blkno) + goto bail; + + ret = 1; +bail: + return ret; +} + +/* + * initialize the new inode, but don't do anything that would cause + * us to sleep. 
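ocfs2_iget() above hashes inodes by an ino derived from the block number and relies on ocfs2_find_actor() to compare the full 64-bit blkno. A tiny sketch of why both steps exist — assuming, purely for illustration, that the hash key may be narrower than the block number (the actual ino_from_blkno() mapping lives outside this hunk):

        #include <stdint.h>

        struct cached_inode {
                uint64_t      blkno;     /* authoritative identity */
                unsigned long hash_key;  /* what the inode hash is keyed on */
        };

        /* The key is derived from the block number and may lose bits on a
         * 32-bit host, so it is only good enough to find candidates quickly. */
        static unsigned long key_from_blkno(uint64_t blkno)
        {
                return (unsigned long)blkno;
        }

        /* The comparator therefore re-checks the full block number, which is
         * exactly the job ocfs2_find_actor() does above. */
        static int is_the_inode_we_want(const struct cached_inode *c, uint64_t blkno)
        {
                return c->blkno == blkno;
        }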
+ * return 0 on success, 1 on failure + */ +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = opaque; + static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, + ocfs2_file_ip_alloc_sem_key; + + inode->i_ino = args->fi_ino; + OCFS2_I(inode)->ip_blkno = args->fi_blkno; + if (args->fi_sysfile_type != 0) + lockdep_set_class(&inode->i_mutex, + &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); + if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || + args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || + args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || + args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE) + lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, + &ocfs2_quota_ip_alloc_sem_key); + else + lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, + &ocfs2_file_ip_alloc_sem_key); + + return 0; +} + +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino) +{ + struct super_block *sb; + struct ocfs2_super *osb; + int use_plocks = 1; + + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) + use_plocks = 0; + + /* + * These have all been checked by ocfs2_read_inode_block() or set + * by ocfs2_mknod_locked(), so a failure is a code bug. + */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode + cannot create a superblock + inode today. change if + that is needed. */ + BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); + BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); + + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); + OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); + + inode->i_version = 1; + inode->i_generation = le32_to_cpu(fe->i_generation); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_mode = le16_to_cpu(fe->i_mode); + inode->i_uid = le32_to_cpu(fe->i_uid); + inode->i_gid = le32_to_cpu(fe->i_gid); + + /* Fast symlinks will have i_size but no allocated clusters. 
*/ + if (S_ISLNK(inode->i_mode) && !fe->i_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_mapping->a_ops = &ocfs2_aops; + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) + mlog(ML_ERROR, + "ip_blkno %llu != i_blkno %llu!\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_blkno)); + + inode->i_nlink = ocfs2_read_links_count(fe); + + trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, + le32_to_cpu(fe->i_flags)); + if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; + inode->i_flags |= S_NOQUOTA; + } + + if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { + inode->i_flags |= S_NOQUOTA; + } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { + /* we can't actually hit this as read_inode can't + * handle superblocks today ;-) */ + BUG(); + } + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + if (use_plocks) + inode->i_fop = &ocfs2_fops; + else + inode->i_fop = &ocfs2_fops_no_plocks; + inode->i_op = &ocfs2_file_iops; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + case S_IFDIR: + inode->i_op = &ocfs2_dir_iops; + if (use_plocks) + inode->i_fop = &ocfs2_dops; + else + inode->i_fop = &ocfs2_dops_no_plocks; + i_size_write(inode, le64_to_cpu(fe->i_size)); + OCFS2_I(inode)->ip_dir_lock_gen = 1; + break; + case S_IFLNK: + if (ocfs2_inode_is_fast_symlink(inode)) + inode->i_op = &ocfs2_fast_symlink_inode_operations; + else + inode->i_op = &ocfs2_symlink_inode_operations; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + default: + inode->i_op = &ocfs2_special_file_iops; + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + break; + } + + if (create_ino) { + inode->i_ino = ino_from_blkno(inode->i_sb, + le64_to_cpu(fe->i_blkno)); + + /* + * If we ever want to create system files from kernel, + * the generation argument to + * ocfs2_inode_lock_res_init() will have to change. 
+ */ + BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, + OCFS2_LOCK_TYPE_META, 0, inode); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, 0, inode); + } + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, + OCFS2_LOCK_TYPE_RW, inode->i_generation, + inode); + + ocfs2_set_inode_flags(inode); + + OCFS2_I(inode)->ip_last_used_slot = 0; + OCFS2_I(inode)->ip_last_used_group = 0; + + if (S_ISDIR(inode->i_mode)) + ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, + OCFS2_RESV_FLAG_DIR); +} + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args) +{ + struct super_block *sb; + struct ocfs2_super *osb; + struct ocfs2_dinode *fe; + struct buffer_head *bh = NULL; + int status, can_lock; + u32 generation = 0; + + status = -EINVAL; + if (inode == NULL || inode->i_sb == NULL) { + mlog(ML_ERROR, "bad inode\n"); + return status; + } + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + if (!args) { + mlog(ML_ERROR, "bad inode args\n"); + make_bad_inode(inode); + return status; + } + + /* + * To improve performance of cold-cache inode stats, we take + * the cluster lock here if possible. + * + * Generally, OCFS2 never trusts the contents of an inode + * unless it's holding a cluster lock, so taking it here isn't + * a correctness issue as much as it is a performance + * improvement. + * + * There are three times when taking the lock is not a good idea: + * + * 1) During startup, before we have initialized the DLM. + * + * 2) If we are reading certain system files which never get + * cluster locks (local alloc, truncate log). + * + * 3) If the process doing the iget() is responsible for + * orphan dir recovery. We're holding the orphan dir lock and + * can get into a deadlock with another process on another + * node in ->delete_inode(). + * + * #1 and #2 can be simply solved by never taking the lock + * here for system files (which are the only type we read + * during mount). It's a heavier approach, but our main + * concern is user-accessible files anyway. + * + * #3 works itself out because we'll eventually take the + * cluster lock before trusting anything anyway. + */ + can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) + && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) + && !ocfs2_mount_local(osb); + + trace_ocfs2_read_locked_inode( + (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); + + /* + * To maintain backwards compatibility with older versions of + * ocfs2-tools, we still store the generation value for system + * files. The only ones that actually matter to userspace are + * the journals, but it's easier and inexpensive to just flag + * all system files similarly. 
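The three exceptions listed in the cold-cache locking comment a little further up (system files read during mount, orphan-dir recovery, local mounts) collapse into the single can_lock test. As a small sketch, with the flag values mirroring the OCFS2_FI_FLAG_* definitions in inode.h:

        #include <stdbool.h>

        #define FI_FLAG_SYSFILE          0x1
        #define FI_FLAG_ORPHAN_RECOVERY  0x2

        /* Cluster-lock the inode while reading it only when none of the three
         * exceptions apply. */
        static bool can_cluster_lock_for_read(unsigned int fi_flags, bool local_mount)
        {
                return !(fi_flags & FI_FLAG_SYSFILE) &&
                       !(fi_flags & FI_FLAG_ORPHAN_RECOVERY) &&
                       !local_mount;
        }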
+ */ + if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) + generation = osb->fs_generation; + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, + OCFS2_LOCK_TYPE_META, + generation, inode); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, + 0, inode); + + if (can_lock) { + status = ocfs2_open_lock(inode); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } + status = ocfs2_inode_lock(inode, NULL, 0); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } + } + + if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { + status = ocfs2_try_open_lock(inode, 0); + if (status) { + make_bad_inode(inode); + return status; + } + } + + if (can_lock) { + status = ocfs2_read_inode_block_full(inode, &bh, + OCFS2_BH_IGNORE_CACHE); + } else { + status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) + status = ocfs2_validate_inode_block(osb->sb, bh); + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = -EINVAL; + fe = (struct ocfs2_dinode *) bh->b_data; + + /* + * This is a code bug. Right now the caller needs to + * understand whether it is asking for a system file inode or + * not so the proper lock names can be built. + */ + mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != + !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), + "Inode %llu: system file state is ambigous\n", + (unsigned long long)args->fi_blkno); + + if (S_ISCHR(le16_to_cpu(fe->i_mode)) || + S_ISBLK(le16_to_cpu(fe->i_mode))) + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + + ocfs2_populate_inode(inode, fe, 0); + + BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + + status = 0; + +bail: + if (can_lock) + ocfs2_inode_unlock(inode, 0); + + if (status < 0) + make_bad_inode(inode); + + if (args && bh) + brelse(bh); + + return status; +} + +void ocfs2_sync_blockdev(struct super_block *sb) +{ + sync_blockdev(sb->s_bdev); +} + +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh) +{ + int status = 0; + struct ocfs2_dinode *fe; + handle_t *handle = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + /* + * This check will also skip truncate of inodes with inline + * data and fast symlinks. 
+ */ + if (fe->i_clusters) { + if (ocfs2_should_order_data(inode)) + ocfs2_begin_ordered_truncate(inode, 0); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } + + i_size_write(inode, 0); + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out; + } + + ocfs2_commit_trans(osb, handle); + handle = NULL; + + status = ocfs2_commit_truncate(osb, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out; + } + } + +out: + if (handle) + ocfs2_commit_trans(osb, handle); + return status; +} + +static int ocfs2_remove_inode(struct inode *inode, + struct buffer_head *di_bh, + struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh) +{ + int status; + struct inode *inode_alloc_inode = NULL; + struct buffer_head *inode_alloc_bh = NULL; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + le16_to_cpu(di->i_suballoc_slot)); + if (!inode_alloc_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + + ocfs2_quota_trans_credits(inode->i_sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + } + + /* set the inodes dtime */ + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); + ocfs2_journal_dirty(handle, di_bh); + + ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); + dquot_free_inode(inode); + + status = ocfs2_free_dinode(handle, inode_alloc_inode, + inode_alloc_bh, di); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(osb, handle); +bail_unlock: + ocfs2_inode_unlock(inode_alloc_inode, 1); + mutex_unlock(&inode_alloc_inode->i_mutex); + brelse(inode_alloc_bh); +bail: + iput(inode_alloc_inode); + + return status; +} + +/* + * Serialize with orphan dir recovery. If the process doing + * recovery on this orphan dir does an iget() with the dir + * i_mutex held, we'll deadlock here. Instead we detect this + * and exit early - recovery will wipe this inode for us. + */ +static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, + int slot) +{ + int ret = 0; + + spin_lock(&osb->osb_lock); + if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { + ret = -EDEADLK; + goto out; + } + /* This signals to the orphan recovery process that it should + * wait for us to handle the wipe. 
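The counter bumped just below, together with ocfs2_signal_wipe_completion(), forms a small handshake: a wiper registers itself per slot unless recovery already owns the slot, and recovery waits for the count to drain. A user-space sketch of that pattern — names are invented, and a condition variable stands in for the osb_wipe_event wait queue:

        #include <pthread.h>
        #include <stdbool.h>

        struct orphan_slot {
                pthread_mutex_t lock;
                pthread_cond_t  drained;
                bool            recovering;       /* recovery has claimed this slot */
                int             wipes_in_flight;  /* role of osb_orphan_wipes[slot] */
        };

        /* A wiper backs off if recovery already owns the slot; otherwise it
         * makes itself visible so recovery will wait for it. */
        static int begin_wipe(struct orphan_slot *s)
        {
                int ret = 0;

                pthread_mutex_lock(&s->lock);
                if (s->recovering)
                        ret = -1;               /* let recovery wipe the inode instead */
                else
                        s->wipes_in_flight++;
                pthread_mutex_unlock(&s->lock);
                return ret;
        }

        static void end_wipe(struct orphan_slot *s)
        {
                pthread_mutex_lock(&s->lock);
                if (--s->wipes_in_flight == 0)
                        pthread_cond_broadcast(&s->drained);
                pthread_mutex_unlock(&s->lock);
        }

        /* Recovery claims the slot, then waits until every in-flight wipe is done. */
        static void recovery_wait_for_wipes(struct orphan_slot *s)
        {
                pthread_mutex_lock(&s->lock);
                s->recovering = true;
                while (s->wipes_in_flight > 0)
                        pthread_cond_wait(&s->drained, &s->lock);
                pthread_mutex_unlock(&s->lock);
        }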
*/ + osb->osb_orphan_wipes[slot]++; +out: + spin_unlock(&osb->osb_lock); + trace_ocfs2_check_orphan_recovery_state(slot, ret); + return ret; +} + +static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + osb->osb_orphan_wipes[slot]--; + spin_unlock(&osb->osb_lock); + + wake_up(&osb->osb_wipe_event); +} + +static int ocfs2_wipe_inode(struct inode *inode, + struct buffer_head *di_bh) +{ + int status, orphaned_slot = -1; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + orphaned_slot = le16_to_cpu(di->i_orphaned_slot); + + status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); + if (status) + return status; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + orphaned_slot); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + /* Lock the orphan dir. The lock will be held for the entire + * delete_inode operation. We do this now to avoid races with + * recovery completion on other nodes. */ + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + + mlog_errno(status); + goto bail; + } + } + + /* we do this while holding the orphan dir lock because we + * don't want recovery being run from another node to try an + * inode delete underneath us -- this will result in two nodes + * truncating the same file! */ + status = ocfs2_truncate_for_delete(osb, inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + /* Remove any dir index tree */ + if (S_ISDIR(inode->i_mode)) { + status = ocfs2_dx_dir_truncate(inode, di_bh); + if (status) { + mlog_errno(status); + goto bail_unlock_dir; + } + } + + /*Free extended attribute resources associated with this inode.*/ + status = ocfs2_xattr_remove(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + status = ocfs2_remove_refcount_tree(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, + orphan_dir_bh); + if (status < 0) + mlog_errno(status); + +bail_unlock_dir: + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR) + return status; + + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); +bail: + iput(orphan_dir_inode); + ocfs2_signal_wipe_completion(osb, orphaned_slot); + + return status; +} + +/* There is a series of simple checks that should be done before a + * trylock is even considered. Encapsulate those in this function. */ +static int ocfs2_inode_is_valid_to_delete(struct inode *inode) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task, + (unsigned long long)oi->ip_blkno, + oi->ip_flags); + + /* We shouldn't be getting here for the root directory + * inode.. */ + if (inode == osb->root_inode) { + mlog(ML_ERROR, "Skipping delete of root inode.\n"); + goto bail; + } + + /* If we're coming from downconvert_thread we can't go into our own + * voting [hello, deadlock city!], so unforuntately we just + * have to skip deleting this guy. 
That's OK though because + * the node who's doing the actual deleting should handle it + * anyway. */ + if (current == osb->dc_task) + goto bail; + + spin_lock(&oi->ip_lock); + /* OCFS2 *never* deletes system files. This should technically + * never get here as system file inodes should always have a + * positive link count. */ + if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + mlog(ML_ERROR, "Skipping delete of system file %llu\n", + (unsigned long long)oi->ip_blkno); + goto bail_unlock; + } + + /* If we have allowd wipe of this inode for another node, it + * will be marked here so we can safely skip it. Recovery will + * cleanup any inodes we might inadvertently skip here. */ + if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) + goto bail_unlock; + + ret = 1; +bail_unlock: + spin_unlock(&oi->ip_lock); +bail: + return ret; +} + +/* Query the cluster to determine whether we should wipe an inode from + * disk or not. + * + * Requires the inode to have the cluster lock. */ +static int ocfs2_query_inode_wipe(struct inode *inode, + struct buffer_head *di_bh, + int *wipe) +{ + int status = 0, reason = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di; + + *wipe = 0; + + trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno, + inode->i_nlink); + + /* While we were waiting for the cluster lock in + * ocfs2_delete_inode, another node might have asked to delete + * the inode. Recheck our flags to catch this. */ + if (!ocfs2_inode_is_valid_to_delete(inode)) { + reason = 1; + goto bail; + } + + /* Now that we have an up to date inode, we can double check + * the link count. */ + if (inode->i_nlink) + goto bail; + + /* Do some basic inode verification... */ + di = (struct ocfs2_dinode *) di_bh->b_data; + if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) && + !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + /* + * Inodes in the orphan dir must have ORPHANED_FL. The only + * inodes that come back out of the orphan dir are reflink + * targets. A reflink target may be moved out of the orphan + * dir between the time we scan the directory and the time we + * process it. This would lead to HAS_REFCOUNT_FL being set but + * ORPHANED_FL not. + */ + if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { + reason = 2; + goto bail; + } + + /* for lack of a better error? */ + status = -EEXIST; + mlog(ML_ERROR, + "Inode %llu (on-disk %llu) not orphaned! " + "Disk flags 0x%x, inode flags 0x%x\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)le64_to_cpu(di->i_blkno), + le32_to_cpu(di->i_flags), oi->ip_flags); + goto bail; + } + + /* has someone already deleted us?! baaad... */ + if (di->i_dtime) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + /* + * This is how ocfs2 determines whether an inode is still live + * within the cluster. Every node takes a shared read lock on + * the inode open lock in ocfs2_read_locked_inode(). When we + * get to ->delete_inode(), each node tries to convert it's + * lock to an exclusive. Trylocks are serialized by the inode + * meta data lock. If the upconvert succeeds, we know the inode + * is no longer live and can be deleted. + * + * Though we call this with the meta data lock held, the + * trylock keeps us from ABBA deadlock. 
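The comment above describes the cluster-wide liveness check: every node with the inode in memory holds the open lock shared, and a node that wants to wipe it tries to take the lock exclusively; failure means someone still has it open. A loose user-space analogue with a plain rwlock, offered only as an illustration (the real code converts its own existing cluster lock rather than taking a fresh one):

        #include <pthread.h>

        /* Returns 1 (keeping the lock held) when no other holder exists, so
         * the caller may wipe the inode; returns 0 when some node still has it
         * open. Unlike the cluster lock, a thread that still holds the shared
         * lock itself would simply see "busy" here. */
        static int inode_dead_cluster_wide(pthread_rwlock_t *open_lock)
        {
                if (pthread_rwlock_trywrlock(open_lock) == 0)
                        return 1;
                return 0;
        }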
+ */ + status = ocfs2_try_open_lock(inode, 1); + if (status == -EAGAIN) { + status = 0; + reason = 3; + goto bail; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *wipe = 1; + trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot)); + +bail: + trace_ocfs2_query_inode_wipe_end(status, reason); + return status; +} + +/* Support function for ocfs2_delete_inode. Will help us keep the + * inode data in a consistent state for clear_inode. Always truncates + * pages, optionally sync's them first. */ +static void ocfs2_cleanup_delete_inode(struct inode *inode, + int sync_data) +{ + trace_ocfs2_cleanup_delete_inode( + (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); + if (sync_data) + write_inode_now(inode, 1); + truncate_inode_pages(&inode->i_data, 0); +} + +static void ocfs2_delete_inode(struct inode *inode) +{ + int wipe, status; + sigset_t oldset; + struct buffer_head *di_bh = NULL; + + trace_ocfs2_delete_inode(inode->i_ino, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + is_bad_inode(inode)); + + /* When we fail in read_inode() we mark inode as bad. The second test + * catches the case when inode allocation fails before allocating + * a block for inode. */ + if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) + goto bail; + + dquot_initialize(inode); + + if (!ocfs2_inode_is_valid_to_delete(inode)) { + /* It's probably not necessary to truncate_inode_pages + * here but we do it for safety anyway (it will most + * likely be a no-op anyway) */ + ocfs2_cleanup_delete_inode(inode, 0); + goto bail; + } + + /* We want to block signals in delete_inode as the lock and + * messaging paths may return us -ERESTARTSYS. Which would + * cause us to exit early, resulting in inodes being orphaned + * forever. */ + ocfs2_block_signals(&oldset); + + /* + * Synchronize us against ocfs2_get_dentry. We take this in + * shared mode so that all nodes can still concurrently + * process deletes. + */ + status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unblock; + } + /* Lock down the inode. This gives us an up to date view of + * it's metadata (for verification), and allows us to + * serialize delete_inode on multiple nodes. + * + * Even though we might be doing a truncate, we don't take the + * allocation lock here as it won't be needed - nobody will + * have the file open. + */ + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unlock_nfs_sync; + } + + /* Query the cluster. This will be the final decision made + * before we go ahead and wipe the inode. */ + status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); + if (!wipe || status < 0) { + /* Error and remote inode busy both mean we won't be + * removing the inode, so they take almost the same + * path. */ + if (status < 0) + mlog_errno(status); + + /* Someone in the cluster has disallowed a wipe of + * this inode, or it was never completely + * orphaned. Write out the pages and exit now. */ + ocfs2_cleanup_delete_inode(inode, 1); + goto bail_unlock_inode; + } + + ocfs2_cleanup_delete_inode(inode, 0); + + status = ocfs2_wipe_inode(inode, di_bh); + if (status < 0) { + if (status != -EDEADLK) + mlog_errno(status); + goto bail_unlock_inode; + } + + /* + * Mark the inode as successfully deleted. 
+ * + * This is important for ocfs2_clear_inode() as it will check + * this flag and skip any checkpointing work + * + * ocfs2_stuff_meta_lvb() also uses this flag to invalidate + * the LVB for other nodes. + */ + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail_unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); + +bail_unblock: + ocfs2_unblock_signals(&oldset); +bail: + return; +} + +static void ocfs2_clear_inode(struct inode *inode) +{ + int status; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + end_writeback(inode); + trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, + inode->i_nlink); + + mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + "Inode=%lu\n", inode->i_ino); + + dquot_drop(inode); + + /* To preven remote deletes we hold open lock before, now it + * is time to unlock PR and EX open locks. */ + ocfs2_open_unlock(inode); + + /* Do these before all the other work so that we don't bounce + * the downconvert thread while waiting to destroy the locks. */ + ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + + ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + &oi->ip_la_data_resv); + ocfs2_resv_init_once(&oi->ip_la_data_resv); + + /* We very well may get a clear_inode before all an inodes + * metadata has hit disk. Of course, we can't drop any cluster + * locks until the journal has finished with it. The only + * exception here are successfully wiped inodes - their + * metadata can now be considered to be part of the system + * inodes from which it came. */ + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + ocfs2_checkpoint_inode(inode); + + mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), + "Clear inode of %llu, inode has io markers\n", + (unsigned long long)oi->ip_blkno); + + ocfs2_extent_map_trunc(inode, 0); + + status = ocfs2_drop_inode_locks(inode); + if (status < 0) + mlog_errno(status); + + ocfs2_lock_res_free(&oi->ip_rw_lockres); + ocfs2_lock_res_free(&oi->ip_inode_lockres); + ocfs2_lock_res_free(&oi->ip_open_lockres); + + ocfs2_metadata_cache_exit(INODE_CACHE(inode)); + + mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached, + "Clear inode of %llu, inode has %u cache items\n", + (unsigned long long)oi->ip_blkno, + INODE_CACHE(inode)->ci_num_cached); + + mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE), + "Clear inode of %llu, inode has a bad flag\n", + (unsigned long long)oi->ip_blkno); + + mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), + "Clear inode of %llu, inode is locked\n", + (unsigned long long)oi->ip_blkno); + + mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), + "Clear inode of %llu, io_mutex is locked\n", + (unsigned long long)oi->ip_blkno); + mutex_unlock(&oi->ip_io_mutex); + + /* + * down_trylock() returns 0, down_write_trylock() returns 1 + * kernel 1, world 0 + */ + mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), + "Clear inode of %llu, alloc_sem is locked\n", + (unsigned long long)oi->ip_blkno); + up_write(&oi->ip_alloc_sem); + + mlog_bug_on_msg(oi->ip_open_count, + "Clear inode of %llu has open count %d\n", + (unsigned long long)oi->ip_blkno, oi->ip_open_count); + + /* Clear all other flags. */ + oi->ip_flags = 0; + oi->ip_dir_start_lookup = 0; + oi->ip_blkno = 0ULL; + + /* + * ip_jinode is used to track txns against this inode. We ensure that + * the journal is flushed before journal shutdown. 
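ocfs2_clear_inode() above asserts that nobody still holds ip_io_mutex or ip_alloc_sem by try-acquiring each lock and releasing it again on success. The same trick in plain user-space form, as a sketch:

        #include <assert.h>
        #include <pthread.h>

        /* "Prove" a mutex is free: a successful trylock means nobody held it,
         * so we give it straight back; a held mutex trips the assertion. */
        static void assert_mutex_unlocked(pthread_mutex_t *m)
        {
                int rc = pthread_mutex_trylock(m);

                assert(rc == 0);        /* rc != 0 means somebody still holds it */
                if (rc == 0)
                        pthread_mutex_unlock(m);
        }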
Thus it is safe to + * have inodes get cleaned up after journal shutdown. + */ + jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + &oi->ip_jinode); +} + +void ocfs2_evict_inode(struct inode *inode) +{ + if (!inode->i_nlink || + (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { + ocfs2_delete_inode(inode); + } else { + truncate_inode_pages(&inode->i_data, 0); + } + ocfs2_clear_inode(inode); +} + +/* Called under inode_lock, with no more references on the + * struct inode, so it's safe here to check the flags field + * and to manipulate i_nlink without any other locks. */ +int ocfs2_drop_inode(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int res; + + trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, + inode->i_nlink, oi->ip_flags); + + if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) + res = 1; + else + res = generic_drop_inode(inode); + + return res; +} + +/* + * This is called from our getattr. + */ +int ocfs2_inode_revalidate(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int status = 0; + + trace_ocfs2_inode_revalidate(inode, + inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL, + inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0); + + if (!inode) { + status = -ENOENT; + goto bail; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + status = -ENOENT; + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* Let ocfs2_inode_lock do the work of updating our struct + * inode for us. */ + status = ocfs2_inode_lock(inode, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + ocfs2_inode_unlock(inode, 0); +bail: + return status; +} + +/* + * Updates a disk inode from a + * struct inode. + * Only takes ip_lock. + */ +int ocfs2_mark_inode_dirty(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + int status; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + + trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); + ocfs2_get_inode_flags(OCFS2_I(inode)); + fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); + fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + fe->i_size = cpu_to_le64(i_size_read(inode)); + ocfs2_set_links_count(fe, inode->i_nlink); + fe->i_uid = cpu_to_le32(inode->i_uid); + fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_mode = cpu_to_le16(inode->i_mode); + fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + + ocfs2_journal_dirty(handle, bh); +leave: + return status; +} + +/* + * + * Updates a struct inode from a disk inode. + * does no i/o, only takes ip_lock. 
+ */ +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe) +{ + spin_lock(&OCFS2_I(inode)->ip_lock); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); + OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); + ocfs2_set_inode_flags(inode); + i_size_write(inode, le64_to_cpu(fe->i_size)); + inode->i_nlink = ocfs2_read_links_count(fe); + inode->i_uid = le32_to_cpu(fe->i_uid); + inode->i_gid = le32_to_cpu(fe->i_gid); + inode->i_mode = le16_to_cpu(fe->i_mode); + if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_inode_sector_count(inode); + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + /* + * Errors after here are fatal. + */ + + rc = -EINVAL; + + if (!OCFS2_IS_VALID_DINODE(di)) { + ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); + goto bail; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + goto bail; + } + + rc = 0; + +bail: + return rc; +} + +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, ocfs2_validate_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. 
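ocfs2_validate_inode_block() above is a ladder of cheap sanity checks run in order, with the first failure winning: signature, self-referencing block number, the valid flag, and the filesystem generation. A stripped-down sketch over an invented on-disk layout — field names and the signature string are placeholders, and endianness conversion is assumed to have happened already:

        #include <stdint.h>
        #include <string.h>

        struct disk_inode {
                char     signature[8];     /* placeholder; the real magic differs */
                uint64_t blkno;            /* where this inode says it lives */
                uint32_t flags;
                uint32_t fs_generation;
        };

        #define DI_VALID_FL 0x1u

        static int validate_inode_block(const struct disk_inode *di,
                                        uint64_t expected_blkno,
                                        uint32_t expected_generation)
        {
                if (memcmp(di->signature, "INODE01", 8) != 0)
                        return -1;      /* not an inode block at all */
                if (di->blkno != expected_blkno)
                        return -1;      /* block claims to live somewhere else */
                if (!(di->flags & DI_VALID_FL))
                        return -1;      /* allocated but never initialised, or wiped */
                if (di->fs_generation != expected_generation)
                        return -1;      /* stale block left over from an older mkfs */
                return 0;
        }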
*/ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) +{ + return ocfs2_read_inode_block_full(inode, bh, 0); +} + + +static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->ip_blkno; +} + +static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->vfs_inode.i_sb; +} + +static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_lock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_unlock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_lock(&oi->ip_io_mutex); +} + +static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_unlock(&oi->ip_io_mutex); +} + +const struct ocfs2_caching_operations ocfs2_inode_caching_ops = { + .co_owner = ocfs2_inode_cache_owner, + .co_get_super = ocfs2_inode_cache_get_super, + .co_cache_lock = ocfs2_inode_cache_lock, + .co_cache_unlock = ocfs2_inode_cache_unlock, + .co_io_lock = ocfs2_inode_cache_io_lock, + .co_io_unlock = ocfs2_inode_cache_io_unlock, +}; + diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h new file mode 100644 index 00000000..1c508b14 --- /dev/null +++ b/fs/ocfs2/inode.h @@ -0,0 +1,180 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_INODE_H +#define OCFS2_INODE_H + +#include "extent_map.h" + +/* OCFS2 Inode Private Data */ +struct ocfs2_inode_info +{ + u64 ip_blkno; + + struct ocfs2_lock_res ip_rw_lockres; + struct ocfs2_lock_res ip_inode_lockres; + struct ocfs2_lock_res ip_open_lockres; + + /* protects allocation changes on this inode. */ + struct rw_semaphore ip_alloc_sem; + + /* protects extended attribute changes on this inode */ + struct rw_semaphore ip_xattr_sem; + + /* These fields are protected by ip_lock */ + spinlock_t ip_lock; + u32 ip_open_count; + struct list_head ip_io_markers; + u32 ip_clusters; + + u16 ip_dyn_features; + struct mutex ip_io_mutex; + u32 ip_flags; /* see below */ + u32 ip_attr; /* inode attributes */ + + /* protected by recovery_lock. 
*/ + struct inode *ip_next_orphan; + + struct ocfs2_caching_info ip_metadata_cache; + struct ocfs2_extent_map ip_extent_map; + struct inode vfs_inode; + struct jbd2_inode ip_jinode; + + u32 ip_dir_start_lookup; + + /* Only valid if the inode is the dir. */ + u32 ip_last_used_slot; + u64 ip_last_used_group; + u32 ip_dir_lock_gen; + + struct ocfs2_alloc_reservation ip_la_data_resv; +}; + +/* + * Flags for the ip_flags field + */ +/* System file inodes */ +#define OCFS2_INODE_SYSTEM_FILE 0x00000001 +#define OCFS2_INODE_JOURNAL 0x00000002 +#define OCFS2_INODE_BITMAP 0x00000004 +/* This inode has been wiped from disk */ +#define OCFS2_INODE_DELETED 0x00000008 +/* Another node is deleting, so our delete is a nop */ +#define OCFS2_INODE_SKIP_DELETE 0x00000010 +/* Has the inode been orphaned on another node? + * + * This hints to ocfs2_drop_inode that it should clear i_nlink before + * continuing. + * + * We *only* set this on unlink vote from another node. If the inode + * was locally orphaned, then we're sure of the state and don't need + * to twiddle i_nlink later - it's either zero or not depending on + * whether our unlink succeeded. Otherwise we got this from a node + * whose intention was to orphan the inode, however he may have + * crashed, failed etc, so we let ocfs2_drop_inode zero the value and + * rely on ocfs2_delete_inode to sort things out under the proper + * cluster locks. + */ +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +/* Does someone have the file open O_DIRECT */ +#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +/* Tell the inode wipe code it's not in orphan dir */ +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 + +static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) +{ + return container_of(inode, struct ocfs2_inode_info, vfs_inode); +} + +#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) +#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) + +extern struct kmem_cache *ocfs2_inode_cache; + +extern const struct address_space_operations ocfs2_aops; +extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops; + +static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) +{ + return &OCFS2_I(inode)->ip_metadata_cache; +} + +void ocfs2_evict_inode(struct inode *inode); +int ocfs2_drop_inode(struct inode *inode); + +/* Flags for ocfs2_iget() */ +#define OCFS2_FI_FLAG_SYSFILE 0x1 +#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, + int sysfile_type); +int ocfs2_inode_init_private(struct inode *inode); +int ocfs2_inode_revalidate(struct dentry *dentry); +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino); +void ocfs2_read_inode(struct inode *inode); +void ocfs2_read_inode2(struct inode *inode, void *opaque); +ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, + size_t size, loff_t *offp); +void ocfs2_sync_blockdev(struct super_block *sb); +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe); +int ocfs2_mark_inode_dirty(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); +int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); +int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); +struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada); + +void ocfs2_set_inode_flags(struct inode *inode); +void 
ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); + +static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) +{ + int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; + + return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); +} + +/* Validate that a bh contains a valid inode */ +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +/* + * Read an inode block into *bh. If *bh is NULL, a bh will be allocated. + * This is a cached read. The inode will be validated with + * ocfs2_validate_inode_block(). + */ +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh); +/* The same, but can be passed OCFS2_BH_* flags */ +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags); + +static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); +} + +#endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c new file mode 100644 index 00000000..bc91072b --- /dev/null +++ b/fs/ocfs2/ioctl.c @@ -0,0 +1,1031 @@ +/* + * linux/fs/ocfs2/ioctl.c + * + * Copyright (C) 2006 Herbert Poetzl + * adapted from Remy Card's ext2/ioctl.c + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "journal.h" + +#include "ocfs2_fs.h" +#include "ioctl.h" +#include "resize.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "suballoc.h" +#include "move_extents.h" + +#include + +#define o2info_from_user(a, b) \ + copy_from_user(&(a), (b), sizeof(a)) +#define o2info_to_user(a, b) \ + copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) + +/* + * This call is void because we are already reporting an error that may + * be -EFAULT. The error will be returned from the ioctl(2) call. It's + * just a best-effort to tell userspace that this request caused the error. 
+ */ +static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, + struct ocfs2_info_request __user *req) +{ + kreq->ir_flags |= OCFS2_INFO_FL_ERROR; + (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); +} + +static inline void o2info_set_request_filled(struct ocfs2_info_request *req) +{ + req->ir_flags |= OCFS2_INFO_FL_FILLED; +} + +static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) +{ + req->ir_flags &= ~OCFS2_INFO_FL_FILLED; +} + +static inline int o2info_coherent(struct ocfs2_info_request *req) +{ + return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); +} + +static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) +{ + int status; + + status = ocfs2_inode_lock(inode, NULL, 0); + if (status < 0) { + mlog_errno(status); + return status; + } + ocfs2_get_inode_flags(OCFS2_I(inode)); + *flags = OCFS2_I(inode)->ip_attr; + ocfs2_inode_unlock(inode, 0); + + return status; +} + +static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, + unsigned mask) +{ + struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + struct buffer_head *bh = NULL; + unsigned oldflags; + int status; + + mutex_lock(&inode->i_mutex); + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = -EACCES; + if (!inode_owner_or_capable(inode)) + goto bail_unlock; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~OCFS2_DIRSYNC_FL; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + oldflags = ocfs2_inode->ip_attr; + flags = flags & mask; + flags |= oldflags & ~mask; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. 
+ */ + status = -EPERM; + if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & + (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto bail_unlock; + } + + ocfs2_inode->ip_attr = flags; + ocfs2_set_inode_flags(inode); + + status = ocfs2_mark_inode_dirty(handle, inode, bh); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); +bail_unlock: + ocfs2_inode_unlock(inode, 1); +bail: + mutex_unlock(&inode->i_mutex); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_blocksize oib; + + if (o2info_from_user(oib, req)) + goto bail; + + oib.ib_blocksize = inode->i_sb->s_blocksize; + + o2info_set_request_filled(&oib.ib_req); + + if (o2info_to_user(oib, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oib.ib_req, req); + + return status; +} + +int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_clustersize oic; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oic, req)) + goto bail; + + oic.ic_clustersize = osb->s_clustersize; + + o2info_set_request_filled(&oic.ic_req); + + if (o2info_to_user(oic, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oic.ic_req, req); + + return status; +} + +int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_maxslots oim; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oim, req)) + goto bail; + + oim.im_max_slots = osb->max_slots; + + o2info_set_request_filled(&oim.im_req); + + if (o2info_to_user(oim, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oim.im_req, req); + + return status; +} + +int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_label oil; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oil, req)) + goto bail; + + memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); + + o2info_set_request_filled(&oil.il_req); + + if (o2info_to_user(oil, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oil.il_req, req); + + return status; +} + +int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_uuid oiu; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oiu, req)) + goto bail; + + memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); + + o2info_set_request_filled(&oiu.iu_req); + + if (o2info_to_user(oiu, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oiu.iu_req, req); + + return status; +} + +int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_fs_features oif; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oif, req)) + goto bail; + + oif.if_compat_features = osb->s_feature_compat; + oif.if_incompat_features = osb->s_feature_incompat; + oif.if_ro_compat_features = osb->s_feature_ro_compat; + + o2info_set_request_filled(&oif.if_req); + + if (o2info_to_user(oif, req)) + goto bail; + + status = 0; +bail: + if (status) + 
o2info_set_request_error(&oif.if_req, req); + + return status; +} + +int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_journal_size oij; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oij, req)) + goto bail; + + oij.ij_journal_size = osb->journal->j_inode->i_size; + + o2info_set_request_filled(&oij.ij_req); + + if (o2info_to_user(oij, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oij.ij_req, req); + + return status; +} + +int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, + struct inode *inode_alloc, u64 blkno, + struct ocfs2_info_freeinode *fi, u32 slot) +{ + int status = 0, unlock = 0; + + struct buffer_head *bh = NULL; + struct ocfs2_dinode *dinode_alloc = NULL; + + if (inode_alloc) + mutex_lock(&inode_alloc->i_mutex); + + if (o2info_coherent(&fi->ifi_req)) { + status = ocfs2_inode_lock(inode_alloc, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + dinode_alloc = (struct ocfs2_dinode *)bh->b_data; + + fi->ifi_stat[slot].lfi_total = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); + fi->ifi_stat[slot].lfi_free = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - + le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); + +bail: + if (unlock) + ocfs2_inode_unlock(inode_alloc, 0); + + if (inode_alloc) + mutex_unlock(&inode_alloc->i_mutex); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_freeinode(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u32 i; + u64 blkno = -1; + char namebuf[40]; + int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + struct ocfs2_info_freeinode *oifi = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *inode_alloc = NULL; + + oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); + if (!oifi) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + if (o2info_from_user(*oifi, req)) + goto bail; + + oifi->ifi_slotnum = osb->max_slots; + + for (i = 0; i < oifi->ifi_slotnum; i++) { + if (o2info_coherent(&oifi->ifi_req)) { + inode_alloc = ocfs2_get_system_file_inode(osb, type, i); + if (!inode_alloc) { + mlog(ML_ERROR, "unable to get alloc inode in " + "slot %u\n", i); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); + if (status < 0) + goto bail; + + iput(inode_alloc); + inode_alloc = NULL; + } + + o2info_set_request_filled(&oifi->ifi_req); + + if (o2info_to_user(*oifi, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oifi->ifi_req, req); + + kfree(oifi); + + return status; +} + +static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, + unsigned int chunksize) +{ + int index; + + index = __ilog2_u32(chunksize); + if (index >= OCFS2_INFO_MAX_HIST) + index = OCFS2_INFO_MAX_HIST - 1; + + hist->fc_chunks[index]++; + hist->fc_clusters[index] += chunksize; +} + +static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, + unsigned int chunksize) +{ + if (chunksize > stats->ffs_max) + stats->ffs_max = chunksize; 
+ + if (chunksize < stats->ffs_min) + stats->ffs_min = chunksize; + + stats->ffs_avg += chunksize; + stats->ffs_free_chunks_real++; +} + +void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, + unsigned int chunksize) +{ + o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); + o2ffg_update_stats(&(ffg->iff_ffs), chunksize); +} + +int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, + struct inode *gb_inode, + struct ocfs2_dinode *gb_dinode, + struct ocfs2_chain_rec *rec, + struct ocfs2_info_freefrag *ffg, + u32 chunks_in_group) +{ + int status = 0, used; + u64 blkno; + + struct buffer_head *bh = NULL; + struct ocfs2_group_desc *bg = NULL; + + unsigned int max_bits, num_clusters; + unsigned int offset = 0, cluster, chunk; + unsigned int chunk_free, last_chunksize = 0; + + if (!le32_to_cpu(rec->c_free)) + goto bail; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (bh) { + brelse(bh); + bh = NULL; + } + + if (o2info_coherent(&ffg->iff_req)) + status = ocfs2_read_group_descriptor(gb_inode, + gb_dinode, + blkno, &bh); + else + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + + if (status < 0) { + mlog(ML_ERROR, "Can't read the group descriptor # " + "%llu from device.", (unsigned long long)blkno); + status = -EIO; + goto bail; + } + + bg = (struct ocfs2_group_desc *)bh->b_data; + + if (!le16_to_cpu(bg->bg_free_bits_count)) + continue; + + max_bits = le16_to_cpu(bg->bg_bits); + offset = 0; + + for (chunk = 0; chunk < chunks_in_group; chunk++) { + /* + * last chunk may be not an entire one. + */ + if ((offset + ffg->iff_chunksize) > max_bits) + num_clusters = max_bits - offset; + else + num_clusters = ffg->iff_chunksize; + + chunk_free = 0; + for (cluster = 0; cluster < num_clusters; cluster++) { + used = ocfs2_test_bit(offset, + (unsigned long *)bg->bg_bitmap); + /* + * - chunk_free counts free clusters in #N chunk. + * - last_chunksize records the size(in) clusters + * for the last real free chunk being counted. + */ + if (!used) { + last_chunksize++; + chunk_free++; + } + + if (used && last_chunksize) { + ocfs2_info_update_ffg(ffg, + last_chunksize); + last_chunksize = 0; + } + + offset++; + } + + if (chunk_free == ffg->iff_chunksize) + ffg->iff_ffs.ffs_free_chunks++; + } + + /* + * need to update the info for last free chunk. + */ + if (last_chunksize) + ocfs2_info_update_ffg(ffg, last_chunksize); + + } while (le64_to_cpu(bg->bg_next_group)); + +bail: + brelse(bh); + + return status; +} + +int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, + struct inode *gb_inode, u64 blkno, + struct ocfs2_info_freefrag *ffg) +{ + u32 chunks_in_group; + int status = 0, unlock = 0, i; + + struct buffer_head *bh = NULL; + struct ocfs2_chain_list *cl = NULL; + struct ocfs2_chain_rec *rec = NULL; + struct ocfs2_dinode *gb_dinode = NULL; + + if (gb_inode) + mutex_lock(&gb_inode->i_mutex); + + if (o2info_coherent(&ffg->iff_req)) { + status = ocfs2_inode_lock(gb_inode, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + gb_dinode = (struct ocfs2_dinode *)bh->b_data; + cl = &(gb_dinode->id2.i_chain); + + /* + * Chunksize(in) clusters from userspace should be + * less than clusters in a group. 
+ */ + if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { + status = -EINVAL; + goto bail; + } + + memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); + + ffg->iff_ffs.ffs_min = ~0U; + ffg->iff_ffs.ffs_clusters = + le32_to_cpu(gb_dinode->id1.bitmap1.i_total); + ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - + le32_to_cpu(gb_dinode->id1.bitmap1.i_used); + + chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + rec = &(cl->cl_recs[i]); + status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, + gb_dinode, + rec, ffg, + chunks_in_group); + if (status) + goto bail; + } + + if (ffg->iff_ffs.ffs_free_chunks_real) + ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / + ffg->iff_ffs.ffs_free_chunks_real); +bail: + if (unlock) + ocfs2_inode_unlock(gb_inode, 0); + + if (gb_inode) + mutex_unlock(&gb_inode->i_mutex); + + if (gb_inode) + iput(gb_inode); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_freefrag(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u64 blkno = -1; + char namebuf[40]; + int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + + struct ocfs2_info_freefrag *oiff; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *gb_inode = NULL; + + oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); + if (!oiff) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + if (o2info_from_user(*oiff, req)) + goto bail; + /* + * chunksize from userspace should be power of 2. + */ + if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || + (!oiff->iff_chunksize)) { + status = -EINVAL; + goto bail; + } + + if (o2info_coherent(&oiff->iff_req)) { + gb_inode = ocfs2_get_system_file_inode(osb, type, + OCFS2_INVALID_SLOT); + if (!gb_inode) { + mlog(ML_ERROR, "unable to get global_bitmap inode\n"); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, + OCFS2_INVALID_SLOT); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); + if (status < 0) + goto bail; + + o2info_set_request_filled(&oiff->iff_req); + + if (o2info_to_user(*oiff, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oiff->iff_req, req); + + kfree(oiff); + + return status; +} + +int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + o2info_clear_request_filled(&oir); + + if (o2info_to_user(oir, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oir, req); + + return status; +} + +/* + * Validate and distinguish OCFS2_IOC_INFO requests. + * + * - validate the magic number. + * - distinguish different requests. + * - validate size of different requests. 
+ */ +int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + status = -EINVAL; + if (oir.ir_magic != OCFS2_INFO_MAGIC) + goto bail; + + switch (oir.ir_code) { + case OCFS2_INFO_BLOCKSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) + status = ocfs2_info_handle_blocksize(inode, req); + break; + case OCFS2_INFO_CLUSTERSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) + status = ocfs2_info_handle_clustersize(inode, req); + break; + case OCFS2_INFO_MAXSLOTS: + if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) + status = ocfs2_info_handle_maxslots(inode, req); + break; + case OCFS2_INFO_LABEL: + if (oir.ir_size == sizeof(struct ocfs2_info_label)) + status = ocfs2_info_handle_label(inode, req); + break; + case OCFS2_INFO_UUID: + if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) + status = ocfs2_info_handle_uuid(inode, req); + break; + case OCFS2_INFO_FS_FEATURES: + if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) + status = ocfs2_info_handle_fs_features(inode, req); + break; + case OCFS2_INFO_JOURNAL_SIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) + status = ocfs2_info_handle_journal_size(inode, req); + break; + case OCFS2_INFO_FREEINODE: + if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) + status = ocfs2_info_handle_freeinode(inode, req); + break; + case OCFS2_INFO_FREEFRAG: + if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) + status = ocfs2_info_handle_freefrag(inode, req); + break; + default: + status = ocfs2_info_handle_unknown(inode, req); + break; + } + +bail: + return status; +} + +int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) +{ + int status = -EFAULT; + u64 __user *bp = NULL; + + if (compat_flag) { +#ifdef CONFIG_COMPAT + /* + * pointer bp stores the base address of a pointers array, + * which collects all addresses of separate request. + */ + bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); +#else + BUG(); +#endif + } else + bp = (u64 __user *)(unsigned long)(info->oi_requests); + + if (o2info_from_user(*req_addr, bp + idx)) + goto bail; + + status = 0; +bail: + return status; +} + +/* + * OCFS2_IOC_INFO handles an array of requests passed from userspace. + * + * ocfs2_info_handle() recevies a large info aggregation, grab and + * validate the request count from header, then break it into small + * pieces, later specific handlers can handle them one by one. + * + * Idea here is to make each separate request small enough to ensure + * a better backward&forward compatibility, since a small piece of + * request will be less likely to be broken if disk layout get changed. 
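+ *
+ * A minimal, hypothetical userspace sketch of one such request (not part
+ * of this file; it only reuses the structure, flag and ioctl names seen
+ * above, and the fd/ioctl plumbing is assumed):
+ *
+ *	struct ocfs2_info_blocksize oib = {
+ *		.ib_req = {
+ *			.ir_magic = OCFS2_INFO_MAGIC,
+ *			.ir_code  = OCFS2_INFO_BLOCKSIZE,
+ *			.ir_size  = sizeof(oib),
+ *		},
+ *	};
+ *	u64 reqs[1] = { (u64)(unsigned long)&oib };
+ *	struct ocfs2_info info = {
+ *		.oi_requests = (u64)(unsigned long)reqs,
+ *		.oi_count    = 1,
+ *	};
+ *	if (ioctl(fd, OCFS2_IOC_INFO, &info) == 0 &&
+ *	    (oib.ib_req.ir_flags & OCFS2_INFO_FL_FILLED))
+ *		printf("blocksize: %u\n", oib.ib_blocksize);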
+ */ +int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, + int compat_flag) +{ + int i, status = 0; + u64 req_addr; + struct ocfs2_info_request __user *reqp; + + if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || + (!info->oi_requests)) { + status = -EINVAL; + goto bail; + } + + for (i = 0; i < info->oi_count; i++) { + + status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); + if (status) + break; + + reqp = (struct ocfs2_info_request *)(unsigned long)req_addr; + if (!reqp) { + status = -EINVAL; + goto bail; + } + + status = ocfs2_info_handle_request(inode, reqp); + if (status) + break; + } + +bail: + return status; +} + +long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int flags; + int new_clusters; + int status; + struct ocfs2_space_resv sr; + struct ocfs2_new_group_input input; + struct reflink_arguments args; + const char *old_path, *new_path; + bool preserve; + struct ocfs2_info info; + + switch (cmd) { + case OCFS2_IOC_GETFLAGS: + status = ocfs2_get_inode_attr(inode, &flags); + if (status < 0) + return status; + + flags &= OCFS2_FL_VISIBLE; + return put_user(flags, (int __user *) arg); + case OCFS2_IOC_SETFLAGS: + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + status = mnt_want_write(filp->f_path.mnt); + if (status) + return status; + status = ocfs2_set_inode_attr(inode, flags, + OCFS2_FL_MODIFIABLE); + mnt_drop_write(filp->f_path.mnt); + return status; + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) + return -EFAULT; + + return ocfs2_change_file_space(filp, cmd, &sr); + case OCFS2_IOC_GROUP_EXTEND: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (get_user(new_clusters, (int __user *)arg)) + return -EFAULT; + + return ocfs2_group_extend(inode, new_clusters); + case OCFS2_IOC_GROUP_ADD: + case OCFS2_IOC_GROUP_ADD64: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (copy_from_user(&input, (int __user *) arg, sizeof(input))) + return -EFAULT; + + return ocfs2_group_add(inode, &input); + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, (struct reflink_arguments *)arg, + sizeof(args))) + return -EFAULT; + old_path = (const char *)(unsigned long)args.old_path; + new_path = (const char *)(unsigned long)args.new_path; + preserve = (args.preserve != 0); + + return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 0); + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ocfs2_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + case OCFS2_IOC_MOVE_EXT: + return ocfs2_ioctl_move_extents(filp, (void __user *)arg); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + bool preserve; + struct reflink_arguments args; + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_info info; + + switch (cmd) { + case OCFS2_IOC32_GETFLAGS: + cmd = 
OCFS2_IOC_GETFLAGS; + break; + case OCFS2_IOC32_SETFLAGS: + cmd = OCFS2_IOC_SETFLAGS; + break; + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + case OCFS2_IOC_GROUP_EXTEND: + case OCFS2_IOC_GROUP_ADD: + case OCFS2_IOC_GROUP_ADD64: + case FITRIM: + break; + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, (struct reflink_arguments *)arg, + sizeof(args))) + return -EFAULT; + preserve = (args.preserve != 0); + + return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), + compat_ptr(args.new_path), preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 1); + case OCFS2_IOC_MOVE_EXT: + break; + default: + return -ENOIOCTLCMD; + } + + return ocfs2_ioctl(file, cmd, arg); +} +#endif diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h new file mode 100644 index 00000000..0cd5323b --- /dev/null +++ b/fs/ocfs2/ioctl.h @@ -0,0 +1,16 @@ +/* + * ioctl.h + * + * Function prototypes + * + * Copyright (C) 2006 Herbert Poetzl + * + */ + +#ifndef OCFS2_IOCTL_PROTO_H +#define OCFS2_IOCTL_PROTO_H + +long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); + +#endif /* OCFS2_IOCTL_PROTO_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c new file mode 100644 index 00000000..295d5645 --- /dev/null +++ b/fs/ocfs2/journal.c @@ -0,0 +1,2192 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.c + * + * Defines functions of journalling api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "quota.h" + +#include "buffer_head_io.h" +#include "ocfs2_trace.h" + +DEFINE_SPINLOCK(trans_inc_lock); + +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000 + +static int ocfs2_force_read_journal(struct inode *inode); +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num, int slot_num); +static int __ocfs2_recovery_thread(void *arg); +static int ocfs2_commit_cache(struct ocfs2_super *osb); +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota); +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty, int replayed); +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num); +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot); +static int ocfs2_commit_thread(void *arg); +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec); + +static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 0); +} + +static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 1); +} + +/* + * This replay_map is to track online/offline slots, so we could recover + * offline slots during recovery and mount + */ + +enum ocfs2_replay_state { + REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */ + REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */ + REPLAY_DONE /* Replay was already queued */ +}; + +struct ocfs2_replay_map { + unsigned int rm_slots; + enum ocfs2_replay_state rm_state; + unsigned char rm_replay_slots[0]; +}; + +void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) +{ + if (!osb->replay_map) + return; + + /* If we've already queued the replay, we don't have any more to do */ + if (osb->replay_map->rm_state == REPLAY_DONE) + return; + + osb->replay_map->rm_state = state; +} + +int ocfs2_compute_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map; + int i, node_num; + + /* If replay map is already set, we don't do it again */ + if (osb->replay_map) + return 0; + + replay_map = kzalloc(sizeof(struct ocfs2_replay_map) + + (osb->max_slots * sizeof(char)), GFP_KERNEL); + + if (!replay_map) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + spin_lock(&osb->osb_lock); + + replay_map->rm_slots = osb->max_slots; + replay_map->rm_state = REPLAY_UNNEEDED; + + /* set rm_replay_slots for offline slot(s) */ + for (i = 0; i < replay_map->rm_slots; i++) { + if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT) + replay_map->rm_replay_slots[i] = 1; + } + + osb->replay_map = replay_map; + spin_unlock(&osb->osb_lock); + return 0; +} + +void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + int i; + + if (!replay_map) + return; + + if (replay_map->rm_state != REPLAY_NEEDED) + return; + + for (i = 0; i < replay_map->rm_slots; i++) + if (replay_map->rm_replay_slots[i]) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, + NULL, NULL); + replay_map->rm_state = REPLAY_DONE; +} + 
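+
+/*
+ * Rough sketch of the intended replay-map lifecycle, as used later in
+ * this file by the recovery thread and the mount-time recovery path
+ * (for orientation only, not an additional API):
+ *
+ *	ocfs2_compute_replay_slots(osb);		mark offline slots
+ *	...node/journal recovery runs...
+ *	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+ *	ocfs2_queue_replay_slots(osb);			queue completions
+ *	ocfs2_free_replay_slots(osb);
+ */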
+void ocfs2_free_replay_slots(struct ocfs2_super *osb) +{ + struct ocfs2_replay_map *replay_map = osb->replay_map; + + if (!osb->replay_map) + return; + + kfree(replay_map); + osb->replay_map = NULL; +} + +int ocfs2_recovery_init(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + mutex_init(&osb->recovery_lock); + osb->disable_recovery = 0; + osb->recovery_thread_task = NULL; + init_waitqueue_head(&osb->recovery_event); + + rm = kzalloc(sizeof(struct ocfs2_recovery_map) + + osb->max_slots * sizeof(unsigned int), + GFP_KERNEL); + if (!rm) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + rm->rm_entries = (unsigned int *)((char *)rm + + sizeof(struct ocfs2_recovery_map)); + osb->recovery_map = rm; + + return 0; +} + +/* we can't grab the goofy sem lock from inside wait_event, so we use + * memory barriers to make sure that we'll see the null task before + * being woken up */ +static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) +{ + mb(); + return osb->recovery_thread_task != NULL; +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + mutex_lock(&osb->recovery_lock); + osb->disable_recovery = 1; + mutex_unlock(&osb->recovery_lock); + wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); + + /* At this point, we know that no more recovery threads can be + * launched, so wait for any recovery completion work to + * complete. */ + flush_workqueue(ocfs2_wq); + + /* + * Now that recovery is shut down, and the osb is about to be + * freed, the osb_lock is not taken here. + */ + rm = osb->recovery_map; + /* XXX: Should we bug if there are dirty entries? */ + + kfree(rm); +} + +static int __ocfs2_recovery_map_test(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + assert_spin_locked(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + return 1; + } + + return 0; +} + +/* Behaves like test-and-set. Returns the previous value */ +static int ocfs2_recovery_map_set(struct ocfs2_super *osb, + unsigned int node_num) +{ + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); + return 1; + } + + /* XXX: Can this be exploited? Not from o2dlm... */ + BUG_ON(rm->rm_used >= osb->max_slots); + + rm->rm_entries[rm->rm_used] = node_num; + rm->rm_used++; + spin_unlock(&osb->osb_lock); + + return 0; +} + +static void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + break; + } + + if (i < rm->rm_used) { + /* XXX: be careful with the pointer math */ + memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]), + (rm->rm_used - i - 1) * sizeof(unsigned int)); + rm->rm_used--; + } + + spin_unlock(&osb->osb_lock); +} + +static int ocfs2_commit_cache(struct ocfs2_super *osb) +{ + int status = 0; + unsigned int flushed; + struct ocfs2_journal *journal = NULL; + + journal = osb->journal; + + /* Flush all pending commits and checkpoint the journal. 
*/ + down_write(&journal->j_trans_barrier); + + flushed = atomic_read(&journal->j_num_trans); + trace_ocfs2_commit_cache_begin(flushed); + if (flushed == 0) { + up_write(&journal->j_trans_barrier); + goto finally; + } + + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) { + up_write(&journal->j_trans_barrier); + mlog_errno(status); + goto finally; + } + + ocfs2_inc_trans_id(journal); + + flushed = atomic_read(&journal->j_num_trans); + atomic_set(&journal->j_num_trans, 0); + up_write(&journal->j_trans_barrier); + + trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed); + + ocfs2_wake_downconvert_thread(osb); + wake_up(&journal->j_checkpointed); +finally: + return status; +} + +handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) +{ + journal_t *journal = osb->journal->j_journal; + handle_t *handle; + + BUG_ON(!osb || !osb->journal->j_journal); + + if (ocfs2_is_hard_readonly(osb)) + return ERR_PTR(-EROFS); + + BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); + BUG_ON(max_buffs <= 0); + + /* Nested transaction? Just return the handle... */ + if (journal_current_handle()) + return jbd2_journal_start(journal, max_buffs); + + down_read(&osb->journal->j_trans_barrier); + + handle = jbd2_journal_start(journal, max_buffs); + if (IS_ERR(handle)) { + up_read(&osb->journal->j_trans_barrier); + + mlog_errno(PTR_ERR(handle)); + + if (is_journal_aborted(journal)) { + ocfs2_abort(osb->sb, "Detected aborted journal"); + handle = ERR_PTR(-EROFS); + } + } else { + if (!ocfs2_mount_local(osb)) + atomic_inc(&(osb->journal->j_num_trans)); + } + + return handle; +} + +int ocfs2_commit_trans(struct ocfs2_super *osb, + handle_t *handle) +{ + int ret, nested; + struct ocfs2_journal *journal = osb->journal; + + BUG_ON(!handle); + + nested = handle->h_ref > 1; + ret = jbd2_journal_stop(handle); + if (ret < 0) + mlog_errno(ret); + + if (!nested) + up_read(&journal->j_trans_barrier); + + return ret; +} + +/* + * 'nblocks' is what you want to add to the current transaction. + * + * This might call jbd2_journal_restart() which will commit dirty buffers + * and then restart the transaction. Before calling + * ocfs2_extend_trans(), any changed blocks should have been + * dirtied. After calling it, all blocks which need to be changed must + * go through another set of journal_access/journal_dirty calls. + * + * WARNING: This will not release any semaphores or disk locks taken + * during the transaction, so make sure they were taken *before* + * start_trans or we'll have ordering deadlocks. + * + * WARNING2: Note that we do *not* drop j_trans_barrier here. This is + * good because transaction ids haven't yet been recorded on the + * cluster locks associated with this handle. 
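+ *
+ * Rough caller-side sketch (illustrative only; 'extra_credits', 'di_bh'
+ * and 'inode' are placeholder names): since a restart invalidates any
+ * journal access obtained earlier, callers re-request access before
+ * dirtying blocks again:
+ *
+ *	status = ocfs2_extend_trans(handle, extra_credits);
+ *	if (status < 0)
+ *		goto bail;
+ *	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ *					 OCFS2_JOURNAL_ACCESS_WRITE);
+ *	if (status < 0)
+ *		goto bail;
+ *	... modify di_bh->b_data ...
+ *	ocfs2_journal_dirty(handle, di_bh);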
+ */ +int ocfs2_extend_trans(handle_t *handle, int nblocks) +{ + int status, old_nblocks; + + BUG_ON(!handle); + BUG_ON(nblocks < 0); + + if (!nblocks) + return 0; + + old_nblocks = handle->h_buffer_credits; + + trace_ocfs2_extend_trans(old_nblocks, nblocks); + +#ifdef CONFIG_OCFS2_DEBUG_FS + status = 1; +#else + status = jbd2_journal_extend(handle, nblocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } +#endif + + if (status > 0) { + trace_ocfs2_extend_trans_restart(old_nblocks + nblocks); + status = jbd2_journal_restart(handle, + old_nblocks + nblocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + return status; +} + +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; +}; + +static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) +{ + return container_of(triggers, struct ocfs2_triggers, ot_triggers); +} + +static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, data + ot->ot_offset); +} + +/* + * Quota blocks have their own trigger because the struct ocfs2_block_check + * offset depends on the blocksize. + */ +static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &dqt->dq_check); +} + +/* + * Directory blocks also have their own trigger because the + * struct ocfs2_block_check offset depends on the blocksize. + */ +static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_dir_block_trailer *trailer = + ocfs2_dir_trailer_from_size(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &trailer->db_check); +} + +static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh) +{ + mlog(ML_ERROR, + "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " + "bh->b_blocknr = %llu\n", + (unsigned long)bh, + (unsigned long long)bh->b_blocknr); + + /* We aren't guaranteed to have the superblock here - but if we + * don't, it'll just crash. 
*/ + ocfs2_error(bh->b_assoc_map->host->i_sb, + "JBD2 has aborted our journal, ocfs2 cannot continue\n"); +} + +static struct ocfs2_triggers di_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dinode, i_check), +}; + +static struct ocfs2_triggers eb_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_extent_block, h_check), +}; + +static struct ocfs2_triggers rb_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), +}; + +static struct ocfs2_triggers gd_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), +}; + +static struct ocfs2_triggers db_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_db_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static struct ocfs2_triggers xb_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), +}; + +static struct ocfs2_triggers dq_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_dq_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static struct ocfs2_triggers dr_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), +}; + +static struct ocfs2_triggers dl_triggers = { + .ot_triggers = { + .t_frozen = ocfs2_frozen_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), +}; + +static int __ocfs2_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, + struct ocfs2_triggers *triggers, + int type) +{ + int status; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); + + BUG_ON(!ci || !ci->ci_ops); + BUG_ON(!handle); + BUG_ON(!bh); + + trace_ocfs2_journal_access( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr, type, bh->b_size); + + /* we can safely remove this assertion after testing. */ + if (!buffer_uptodate(bh)) { + mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); + mlog(ML_ERROR, "b_blocknr=%llu\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + /* Set the current transaction information on the ci so + * that the locking code knows whether it can drop it's locks + * on this ci or not. We're protected from the commit + * thread updating the current transaction id until + * ocfs2_commit_trans() because ocfs2_start_trans() took + * j_trans_barrier for us. 
*/ + ocfs2_set_ci_lock_trans(osb->journal, ci); + + ocfs2_metadata_cache_io_lock(ci); + switch (type) { + case OCFS2_JOURNAL_ACCESS_CREATE: + case OCFS2_JOURNAL_ACCESS_WRITE: + status = jbd2_journal_get_write_access(handle, bh); + break; + + case OCFS2_JOURNAL_ACCESS_UNDO: + status = jbd2_journal_get_undo_access(handle, bh); + break; + + default: + status = -EINVAL; + mlog(ML_ERROR, "Unknown access type!\n"); + } + if (!status && ocfs2_meta_ecc(osb) && triggers) + jbd2_journal_set_triggers(bh, &triggers->ot_triggers); + ocfs2_metadata_cache_io_unlock(ci); + + if (status < 0) + mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", + status, type); + + return status; +} + +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); +} + +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); +} + +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, + type); +} + +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); +} + +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); +} + +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); +} + +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); +} + +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); +} + +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); +} + +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, NULL, type); +} + +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) +{ + int status; + + trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); + + status = jbd2_journal_dirty_metadata(handle, bh); + BUG_ON(status); +} + +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) + +void ocfs2_set_journal_params(struct ocfs2_super *osb) +{ + journal_t *journal = osb->journal->j_journal; + unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + + if (osb->osb_commit_interval) + commit_interval = osb->osb_commit_interval; + + write_lock(&journal->j_state_lock); + journal->j_commit_interval = commit_interval; + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + journal->j_flags |= JBD2_BARRIER; + else + journal->j_flags &= ~JBD2_BARRIER; + write_unlock(&journal->j_state_lock); +} + +int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t 
*j_journal = NULL; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_super *osb; + int inode_lock = 0; + + BUG_ON(!journal); + + osb = journal->j_osb; + + /* already have the inode for our journal */ + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + osb->slot_num); + if (inode == NULL) { + status = -EACCES; + mlog_errno(status); + goto done; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto done; + } + + SET_INODE_JOURNAL(inode); + OCFS2_I(inode)->ip_open_count++; + + /* Skip recovery waits here - journal inode metadata never + * changes in a live cluster so it can be considered an + * exception to the rule. */ + status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not get lock on journal!\n"); + goto done; + } + + inode_lock = 1; + di = (struct ocfs2_dinode *)bh->b_data; + + if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { + mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", + inode->i_size); + status = -EINVAL; + goto done; + } + + trace_ocfs2_journal_init(inode->i_size, + (unsigned long long)inode->i_blocks, + OCFS2_I(inode)->ip_clusters); + + /* call the kernels journal init function now */ + j_journal = jbd2_journal_init_inode(inode); + if (j_journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EINVAL; + goto done; + } + + trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); + + *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL); + + journal->j_journal = j_journal; + journal->j_inode = inode; + journal->j_bh = bh; + + ocfs2_set_journal_params(osb); + + journal->j_state = OCFS2_JOURNAL_LOADED; + + status = 0; +done: + if (status < 0) { + if (inode_lock) + ocfs2_inode_unlock(inode, 1); + brelse(bh); + if (inode) { + OCFS2_I(inode)->ip_open_count--; + iput(inode); + } + } + + return status; +} + +static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di) +{ + le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1); +} + +static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di) +{ + return le32_to_cpu(di->id1.journal1.ij_recovery_generation); +} + +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty, int replayed) +{ + int status; + unsigned int flags; + struct ocfs2_journal *journal = osb->journal; + struct buffer_head *bh = journal->j_bh; + struct ocfs2_dinode *fe; + + fe = (struct ocfs2_dinode *)bh->b_data; + + /* The journal bh on the osb always comes from ocfs2_journal_init() + * and was validated there inside ocfs2_inode_lock_full(). It's a + * code bug if we mess it up. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + if (dirty) + flags |= OCFS2_JOURNAL_DIRTY_FL; + else + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + if (replayed) + ocfs2_bump_recovery_generation(fe); + + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); + status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode)); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * If the journal has been kmalloc'd it needs to be freed after this + * call. 
+ */ +void ocfs2_journal_shutdown(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = NULL; + int status = 0; + struct inode *inode = NULL; + int num_running_trans = 0; + + BUG_ON(!osb); + + journal = osb->journal; + if (!journal) + goto done; + + inode = journal->j_inode; + + if (journal->j_state != OCFS2_JOURNAL_LOADED) + goto done; + + /* need to inc inode use count - jbd2_journal_destroy will iput. */ + if (!igrab(inode)) + BUG(); + + num_running_trans = atomic_read(&(osb->journal->j_num_trans)); + trace_ocfs2_journal_shutdown(num_running_trans); + + /* Do a commit_cache here. It will flush our journal, *and* + * release any locks that are still held. + * set the SHUTDOWN flag and release the trans lock. + * the commit thread will take the trans lock for us below. */ + journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; + + /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not + * drop the trans_lock (which we want to hold until we + * completely destroy the journal. */ + if (osb->commit_task) { + /* Wait for the commit thread */ + trace_ocfs2_journal_shutdown_wait(osb->commit_task); + kthread_stop(osb->commit_task); + osb->commit_task = NULL; + } + + BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); + + if (ocfs2_mount_local(osb)) { + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) + mlog_errno(status); + } + + if (status == 0) { + /* + * Do not toggle if flush was unsuccessful otherwise + * will leave dirty metadata in a "clean" journal + */ + status = ocfs2_journal_toggle_dirty(osb, 0, 0); + if (status < 0) + mlog_errno(status); + } + + /* Shutdown the kernel journal system */ + jbd2_journal_destroy(journal->j_journal); + journal->j_journal = NULL; + + OCFS2_I(inode)->ip_open_count--; + + /* unlock our journal */ + ocfs2_inode_unlock(inode, 1); + + brelse(journal->j_bh); + journal->j_bh = NULL; + + journal->j_state = OCFS2_JOURNAL_FREE; + +// up_write(&journal->j_trans_barrier); +done: + if (inode) + iput(inode); +} + +static void ocfs2_clear_journal_error(struct super_block *sb, + journal_t *journal, + int slot) +{ + int olderr; + + olderr = jbd2_journal_errno(journal); + if (olderr) { + mlog(ML_ERROR, "File system error %d recorded in " + "journal %u.\n", olderr, slot); + mlog(ML_ERROR, "File system on device %s needs checking.\n", + sb->s_id); + + jbd2_journal_ack_err(journal); + jbd2_journal_clear_err(journal); + } +} + +int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) +{ + int status = 0; + struct ocfs2_super *osb; + + BUG_ON(!journal); + + osb = journal->j_osb; + + status = jbd2_journal_load(journal->j_journal); + if (status < 0) { + mlog(ML_ERROR, "Failed to load journal!\n"); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + + status = ocfs2_journal_toggle_dirty(osb, 1, replayed); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* Launch the commit thread */ + if (!local) { + osb->commit_task = kthread_run(ocfs2_commit_thread, osb, + "ocfs2cmt"); + if (IS_ERR(osb->commit_task)) { + status = PTR_ERR(osb->commit_task); + osb->commit_task = NULL; + mlog(ML_ERROR, "unable to launch ocfs2commit thread, " + "error=%d", status); + goto done; + } + } else + osb->commit_task = NULL; + +done: + return status; +} + + +/* 'full' flag tells us whether we clear out all blocks or if we just + * mark the journal clean */ +int ocfs2_journal_wipe(struct 
ocfs2_journal *journal, int full) +{ + int status; + + BUG_ON(!journal); + + status = jbd2_journal_wipe(journal->j_journal, full); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0); + if (status < 0) + mlog_errno(status); + +bail: + return status; +} + +static int ocfs2_recovery_completed(struct ocfs2_super *osb) +{ + int empty; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + empty = (rm->rm_used == 0); + spin_unlock(&osb->osb_lock); + + return empty; +} + +void ocfs2_wait_for_recovery(struct ocfs2_super *osb) +{ + wait_event(osb->recovery_event, ocfs2_recovery_completed(osb)); +} + +/* + * JBD Might read a cached version of another nodes journal file. We + * don't want this as this file changes often and we get no + * notification on those changes. The only way to be sure that we've + * got the most up to date version of those blocks then is to force + * read them off disk. Just searching through the buffer cache won't + * work as there may be pages backing this file which are still marked + * up to date. We know things can't change on this file underneath us + * as we have the lock by now :) + */ +static int ocfs2_force_read_journal(struct inode *inode) +{ + int status = 0; + int i; + u64 v_blkno, p_blkno, p_blocks, num_blocks; +#define CONCURRENT_JOURNAL_FILL 32ULL + struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; + + memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + + num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); + v_blkno = 0; + while (v_blkno < num_blocks) { + status = ocfs2_extent_map_get_blocks(inode, v_blkno, + &p_blkno, &p_blocks, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (p_blocks > CONCURRENT_JOURNAL_FILL) + p_blocks = CONCURRENT_JOURNAL_FILL; + + /* We are reading journal data which should not + * be put in the uptodate cache */ + status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = 0; i < p_blocks; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + + v_blkno += p_blocks; + } + +bail: + for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) + brelse(bhs[i]); + return status; +} + +struct ocfs2_la_recovery_item { + struct list_head lri_list; + int lri_slot; + struct ocfs2_dinode *lri_la_dinode; + struct ocfs2_dinode *lri_tl_dinode; + struct ocfs2_quota_recovery *lri_qrec; +}; + +/* Does the second half of the recovery process. By this point, the + * node is marked clean and can actually be considered recovered, + * hence it's no longer in the recovery map, but there's still some + * cleanup we can do which shouldn't happen within the recovery thread + * as locking in that context becomes very difficult if we are to take + * recovering nodes into account. + * + * NOTE: This function can and will sleep on recovery of other nodes + * during cluster locking, just like any other ocfs2 process. 
+ */ +void ocfs2_complete_recovery(struct work_struct *work) +{ + int ret = 0; + struct ocfs2_journal *journal = + container_of(work, struct ocfs2_journal, j_recovery_work); + struct ocfs2_super *osb = journal->j_osb; + struct ocfs2_dinode *la_dinode, *tl_dinode; + struct ocfs2_la_recovery_item *item, *n; + struct ocfs2_quota_recovery *qrec; + LIST_HEAD(tmp_la_list); + + trace_ocfs2_complete_recovery( + (unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno); + + spin_lock(&journal->j_lock); + list_splice_init(&journal->j_la_cleanups, &tmp_la_list); + spin_unlock(&journal->j_lock); + + list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) { + list_del_init(&item->lri_list); + + ocfs2_wait_on_quotas(osb); + + la_dinode = item->lri_la_dinode; + tl_dinode = item->lri_tl_dinode; + qrec = item->lri_qrec; + + trace_ocfs2_complete_recovery_slot(item->lri_slot, + la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, + tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0, + qrec); + + if (la_dinode) { + ret = ocfs2_complete_local_alloc_recovery(osb, + la_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(la_dinode); + } + + if (tl_dinode) { + ret = ocfs2_complete_truncate_log_recovery(osb, + tl_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(tl_dinode); + } + + ret = ocfs2_recover_orphans(osb, item->lri_slot); + if (ret < 0) + mlog_errno(ret); + + if (qrec) { + ret = ocfs2_finish_quota_recovery(osb, qrec, + item->lri_slot); + if (ret < 0) + mlog_errno(ret); + /* Recovery info is already freed now */ + } + + kfree(item); + } + + trace_ocfs2_complete_recovery_end(ret); +} + +/* NOTE: This function always eats your references to la_dinode and + * tl_dinode, either manually on error, or by passing them to + * ocfs2_complete_recovery */ +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec) +{ + struct ocfs2_la_recovery_item *item; + + item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS); + if (!item) { + /* Though we wish to avoid it, we are in fact safe in + * skipping local alloc cleanup as fsck.ocfs2 is more + * than capable of reclaiming unused space. */ + if (la_dinode) + kfree(la_dinode); + + if (tl_dinode) + kfree(tl_dinode); + + if (qrec) + ocfs2_free_quota_recovery(qrec); + + mlog_errno(-ENOMEM); + return; + } + + INIT_LIST_HEAD(&item->lri_list); + item->lri_la_dinode = la_dinode; + item->lri_slot = slot_num; + item->lri_tl_dinode = tl_dinode; + item->lri_qrec = qrec; + + spin_lock(&journal->j_lock); + list_add_tail(&item->lri_list, &journal->j_la_cleanups); + queue_work(ocfs2_wq, &journal->j_recovery_work); + spin_unlock(&journal->j_lock); +} + +/* Called by the mount code to queue recovery the last part of + * recovery for it's own and offline slot(s). 
*/ +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = osb->journal; + + if (ocfs2_is_hard_readonly(osb)) + return; + + /* No need to queue up our truncate_log as regular cleanup will catch + * that */ + ocfs2_queue_recovery_completion(journal, osb->slot_num, + osb->local_alloc_copy, NULL, NULL); + ocfs2_schedule_truncate_log_flush(osb, 0); + + osb->local_alloc_copy = NULL; + osb->dirty = 0; + + /* queue to recover orphan slots for all offline slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + ocfs2_queue_replay_slots(osb); + ocfs2_free_replay_slots(osb); +} + +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) +{ + if (osb->quota_rec) { + ocfs2_queue_recovery_completion(osb->journal, + osb->slot_num, + NULL, + NULL, + osb->quota_rec); + osb->quota_rec = NULL; + } +} + +static int __ocfs2_recovery_thread(void *arg) +{ + int status, node_num, slot_num; + struct ocfs2_super *osb = arg; + struct ocfs2_recovery_map *rm = osb->recovery_map; + int *rm_quota = NULL; + int rm_quota_used = 0, i; + struct ocfs2_quota_recovery *qrec; + + status = ocfs2_wait_on_mount(osb); + if (status < 0) { + goto bail; + } + + rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); + if (!rm_quota) { + status = -ENOMEM; + goto bail; + } +restart: + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_compute_replay_slots(osb); + if (status < 0) + mlog_errno(status); + + /* queue recovery for our own slot */ + ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, + NULL, NULL); + + spin_lock(&osb->osb_lock); + while (rm->rm_used) { + /* It's always safe to remove entry zero, as we won't + * clear it until ocfs2_recover_node() has succeeded. */ + node_num = rm->rm_entries[0]; + spin_unlock(&osb->osb_lock); + slot_num = ocfs2_node_num_to_slot(osb, node_num); + trace_ocfs2_recovery_thread_node(node_num, slot_num); + if (slot_num == -ENOENT) { + status = 0; + goto skip_recovery; + } + + /* It is a bit subtle with quota recovery. We cannot do it + * immediately because we have to obtain cluster locks from + * quota files and we also don't want to just skip it because + * then quota usage would be out of sync until some node takes + * the slot. So we remember which nodes need quota recovery + * and when everything else is done, we recover quotas. */ + for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); + if (i == rm_quota_used) + rm_quota[rm_quota_used++] = slot_num; + + status = ocfs2_recover_node(osb, node_num, slot_num); +skip_recovery: + if (!status) { + ocfs2_recovery_map_clear(osb, node_num); + } else { + mlog(ML_ERROR, + "Error %d recovering node %d on device (%u,%u)!\n", + status, node_num, + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + mlog(ML_ERROR, "Volume requires unmount.\n"); + } + + spin_lock(&osb->osb_lock); + } + spin_unlock(&osb->osb_lock); + trace_ocfs2_recovery_thread_end(status); + + /* Refresh all journal recovery generations from disk */ + status = ocfs2_check_journals_nolocks(osb); + status = (status == -EROFS) ? 0 : status; + if (status < 0) + mlog_errno(status); + + /* Now it is right time to recover quotas... 
We have to do this under + * superblock lock so that no one can start using the slot (and crash) + * before we recover it */ + for (i = 0; i < rm_quota_used; i++) { + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); + if (IS_ERR(qrec)) { + status = PTR_ERR(qrec); + mlog_errno(status); + continue; + } + ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], + NULL, NULL, qrec); + } + + ocfs2_super_unlock(osb, 1); + + /* queue recovery for offline slots */ + ocfs2_queue_replay_slots(osb); + +bail: + mutex_lock(&osb->recovery_lock); + if (!status && !ocfs2_recovery_completed(osb)) { + mutex_unlock(&osb->recovery_lock); + goto restart; + } + + ocfs2_free_replay_slots(osb); + osb->recovery_thread_task = NULL; + mb(); /* sync with ocfs2_recovery_thread_running */ + wake_up(&osb->recovery_event); + + mutex_unlock(&osb->recovery_lock); + + if (rm_quota) + kfree(rm_quota); + + /* no one is callint kthread_stop() for us so the kthread() api + * requires that we call do_exit(). And it isn't exported, but + * complete_and_exit() seems to be a minimal wrapper around it. */ + complete_and_exit(NULL, status); + return status; +} + +void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) +{ + mutex_lock(&osb->recovery_lock); + + trace_ocfs2_recovery_thread(node_num, osb->node_num, + osb->disable_recovery, osb->recovery_thread_task, + osb->disable_recovery ? + -1 : ocfs2_recovery_map_set(osb, node_num)); + + if (osb->disable_recovery) + goto out; + + if (osb->recovery_thread_task) + goto out; + + osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, + "ocfs2rec"); + if (IS_ERR(osb->recovery_thread_task)) { + mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); + osb->recovery_thread_task = NULL; + } + +out: + mutex_unlock(&osb->recovery_lock); + wake_up(&osb->recovery_event); +} + +static int ocfs2_read_journal_inode(struct ocfs2_super *osb, + int slot_num, + struct buffer_head **bh, + struct inode **ret_inode) +{ + int status = -EACCES; + struct inode *inode = NULL; + + BUG_ON(slot_num >= osb->max_slots); + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (!inode || is_bad_inode(inode)) { + mlog_errno(status); + goto bail; + } + SET_INODE_JOURNAL(inode); + + status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; + +bail: + if (inode) { + if (status || !ret_inode) + iput(inode); + else + *ret_inode = inode; + } + return status; +} + +/* Does the actual journal replay and marks the journal inode as + * clean. Will only replay if the journal inode is marked dirty. */ +static int ocfs2_replay_journal(struct ocfs2_super *osb, + int node_num, + int slot_num) +{ + int status; + int got_lock = 0; + unsigned int flags; + struct inode *inode = NULL; + struct ocfs2_dinode *fe; + journal_t *journal = NULL; + struct buffer_head *bh = NULL; + u32 slot_reco_gen; + + status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode); + if (status) { + mlog_errno(status); + goto done; + } + + fe = (struct ocfs2_dinode *)bh->b_data; + slot_reco_gen = ocfs2_get_recovery_generation(fe); + brelse(bh); + bh = NULL; + + /* + * As the fs recovery is asynchronous, there is a small chance that + * another node mounted (and recovered) the slot before the recovery + * thread could get the lock. To handle that, we dirty read the journal + * inode for that slot to get the recovery generation. If it is + * different than what we expected, the slot has been recovered. 
+ * If not, it needs recovery. + */ + if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) { + trace_ocfs2_replay_journal_recovered(slot_num, + osb->slot_recovery_generations[slot_num], slot_reco_gen); + osb->slot_recovery_generations[slot_num] = slot_reco_gen; + status = -EBUSY; + goto done; + } + + /* Continue with recovery as the journal has not yet been recovered */ + + status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + trace_ocfs2_replay_journal_lock_err(status); + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not lock journal!\n"); + goto done; + } + got_lock = 1; + + fe = (struct ocfs2_dinode *) bh->b_data; + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + slot_reco_gen = ocfs2_get_recovery_generation(fe); + + if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { + trace_ocfs2_replay_journal_skip(node_num); + /* Refresh recovery generation for the slot */ + osb->slot_recovery_generations[slot_num] = slot_reco_gen; + goto done; + } + + /* we need to run complete recovery for offline orphan slots */ + ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); + + mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", + node_num, slot_num, + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + + status = ocfs2_force_read_journal(inode); + if (status < 0) { + mlog_errno(status); + goto done; + } + + journal = jbd2_journal_init_inode(inode); + if (journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EIO; + goto done; + } + + status = jbd2_journal_load(journal); + if (status < 0) { + mlog_errno(status); + if (!igrab(inode)) + BUG(); + jbd2_journal_destroy(journal); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal, slot_num); + + /* wipe the journal */ + jbd2_journal_lock_updates(journal); + status = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); + if (status < 0) + mlog_errno(status); + + /* This will mark the node clean */ + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + /* Increment recovery generation to indicate successful recovery */ + ocfs2_bump_recovery_generation(fe); + osb->slot_recovery_generations[slot_num] = + ocfs2_get_recovery_generation(fe); + + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); + if (status < 0) + mlog_errno(status); + + if (!igrab(inode)) + BUG(); + + jbd2_journal_destroy(journal); + +done: + /* drop the lock on this nodes journal */ + if (got_lock) + ocfs2_inode_unlock(inode, 1); + + if (inode) + iput(inode); + + brelse(bh); + + return status; +} + +/* + * Do the most important parts of node recovery: + * - Replay it's journal + * - Stamp a clean local allocator file + * - Stamp a clean truncate log + * - Mark the node clean + * + * If this function completes without error, a node in OCFS2 can be + * said to have been safely recovered. As a result, failure during the + * second part of a nodes recovery process (local alloc recovery) is + * far less concerning. 
+ */ +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num, int slot_num) +{ + int status = 0; + struct ocfs2_dinode *la_copy = NULL; + struct ocfs2_dinode *tl_copy = NULL; + + trace_ocfs2_recover_node(node_num, slot_num, osb->node_num); + + /* Should not ever be called to recover ourselves -- in that + * case we should've called ocfs2_journal_load instead. */ + BUG_ON(osb->node_num == node_num); + + status = ocfs2_replay_journal(osb, node_num, slot_num); + if (status < 0) { + if (status == -EBUSY) { + trace_ocfs2_recover_node_skip(slot_num, node_num); + status = 0; + goto done; + } + mlog_errno(status); + goto done; + } + + /* Stamp a clean local alloc file AFTER recovering the journal... */ + status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* An error from begin_truncate_log_recovery is not + * serious enough to warrant halting the rest of + * recovery. */ + status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); + if (status < 0) + mlog_errno(status); + + /* Likewise, this would be a strange but ultimately not so + * harmful place to get an error... */ + status = ocfs2_clear_slot(osb, slot_num); + if (status < 0) + mlog_errno(status); + + /* This will kfree the memory pointed to by la_copy and tl_copy */ + ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, + tl_copy, NULL); + + status = 0; +done: + + return status; +} + +/* Test node liveness by trylocking his journal. If we get the lock, + * we drop it here. Return 0 if we got the lock, -EAGAIN if node is + * still alive (we couldn't get the lock) and < 0 on error. */ +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num) +{ + int status, flags; + struct inode *inode = NULL; + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (inode == NULL) { + mlog(ML_ERROR, "access error\n"); + status = -EACCES; + goto bail; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto bail; + } + SET_INODE_JOURNAL(inode); + + flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; + status = ocfs2_inode_lock_full(inode, NULL, 1, flags); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto bail; + } + + ocfs2_inode_unlock(inode, 1); +bail: + if (inode) + iput(inode); + + return status; +} + +/* Call this underneath ocfs2_super_lock. It also assumes that the + * slot info struct has been updated from disk. */ +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) +{ + unsigned int node_num; + int status, i; + u32 gen; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *di; + + /* This is called with the super block cluster lock, so we + * know that the slot map can't change underneath us. 
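+ *
+ * The per-slot decision in the loop below boils down to this rough
+ * sketch (an editor's summary of the code that follows, not new logic):
+ *
+ *	refresh slot_recovery_generations[i] from that slot's journal inode
+ *	skip our own slot, unoccupied slots, and nodes already in the
+ *	    recovery map
+ *	otherwise trylock the slot's journal; getting the lock means the
+ *	    owner is dead, so kick off ocfs2_recovery_thread() for it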
*/ + + for (i = 0; i < osb->max_slots; i++) { + /* Read journal inode to get the recovery generation */ + status = ocfs2_read_journal_inode(osb, i, &bh, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *)bh->b_data; + gen = ocfs2_get_recovery_generation(di); + brelse(bh); + bh = NULL; + + spin_lock(&osb->osb_lock); + osb->slot_recovery_generations[i] = gen; + + trace_ocfs2_mark_dead_nodes(i, + osb->slot_recovery_generations[i]); + + if (i == osb->slot_num) { + spin_unlock(&osb->osb_lock); + continue; + } + + status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); + if (status == -ENOENT) { + spin_unlock(&osb->osb_lock); + continue; + } + + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); + continue; + } + spin_unlock(&osb->osb_lock); + + /* Ok, we have a slot occupied by another node which + * is not in the recovery map. We trylock his journal + * file here to test if he's alive. */ + status = ocfs2_trylock_journal(osb, i); + if (!status) { + /* Since we're called from mount, we know that + * the recovery thread can't race us on + * setting / checking the recovery bits. */ + ocfs2_recovery_thread(osb, node_num); + } else if ((status < 0) && (status != -EAGAIN)) { + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + return status; +} + +/* + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some + * randomness to the timeout to minimize multple nodes firing the timer at the + * same time. + */ +static inline unsigned long ocfs2_orphan_scan_timeout(void) +{ + unsigned long time; + + get_random_bytes(&time, sizeof(time)); + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000); + return msecs_to_jiffies(time); +} + +/* + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for + * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This + * is done to catch any orphans that are left over in orphan directories. + * + * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT + * seconds. It gets an EX lock on os_lockres and checks sequence number + * stored in LVB. If the sequence number has changed, it means some other + * node has done the scan. This node skips the scan and tracks the + * sequence number. If the sequence number didn't change, it means a scan + * hasn't happened. The node queues a scan and increments the + * sequence number in the LVB. 
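+ *
+ * Roughly, the sequence number dance looks like this (an illustrative
+ * sketch of the function below, not verbatim code):
+ *
+ *	ocfs2_orphan_scan_lock(osb, &seqno);	EX lock, seqno read from LVB
+ *	if (seqno != os->os_seqno)		another node already scanned,
+ *		os->os_seqno = seqno;		so just remember the new value
+ *	else					nobody scanned this interval:
+ *		queue recovery completion for every slot, then seqno++
+ *	ocfs2_orphan_scan_unlock(osb, seqno);	new seqno written to the LVB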
+ */ +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + int status, i; + u32 seqno = 0; + + os = &osb->osb_orphan_scan; + + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) + goto out; + + trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno, + atomic_read(&os->os_state)); + + status = ocfs2_orphan_scan_lock(osb, &seqno); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto out; + } + + /* Do no queue the tasks if the volume is being umounted */ + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) + goto unlock; + + if (os->os_seqno != seqno) { + os->os_seqno = seqno; + goto unlock; + } + + for (i = 0; i < osb->max_slots; i++) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, + NULL); + /* + * We queued a recovery on orphan slots, increment the sequence + * number and update LVB so other node will skip the scan for a while + */ + seqno++; + os->os_count++; + os->os_scantime = CURRENT_TIME; +unlock: + ocfs2_orphan_scan_unlock(osb, seqno); +out: + trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno, + atomic_read(&os->os_state)); + return; +} + +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ +void ocfs2_orphan_scan_work(struct work_struct *work) +{ + struct ocfs2_orphan_scan *os; + struct ocfs2_super *osb; + + os = container_of(work, struct ocfs2_orphan_scan, + os_orphan_scan_work.work); + osb = os->os_osb; + + mutex_lock(&os->os_lock); + ocfs2_queue_orphan_scan(osb); + if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + mutex_unlock(&os->os_lock); +} + +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) { + atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); + mutex_lock(&os->os_lock); + cancel_delayed_work(&os->os_orphan_scan_work); + mutex_unlock(&os->os_lock); + } +} + +void ocfs2_orphan_scan_init(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_osb = osb; + os->os_count = 0; + os->os_seqno = 0; + mutex_init(&os->os_lock); + INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); +} + +void ocfs2_orphan_scan_start(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_scantime = CURRENT_TIME; + if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) + atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); + else { + atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + } +} + +struct ocfs2_orphan_filldir_priv { + struct inode *head; + struct ocfs2_super *osb; +}; + +static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len, + loff_t pos, u64 ino, unsigned type) +{ + struct ocfs2_orphan_filldir_priv *p = priv; + struct inode *iter; + + if (name_len == 1 && !strncmp(".", name, 1)) + return 0; + if (name_len == 2 && !strncmp("..", name, 2)) + return 0; + + /* Skip bad inodes so that recovery can continue */ + iter = ocfs2_iget(p->osb, ino, + OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); + if (IS_ERR(iter)) + return 0; + + trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); + /* No locking is required for the next_orphan queue as there + * is only ever a single process doing orphan recovery. 
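+ *
+ * The resulting queue is just a singly linked list threaded through the
+ * inodes via ip_next_orphan, newest first; e.g. after three entries
+ * (illustrative):
+ *
+ *	p->head -> inode_C -> inode_B -> inode_A -> NULL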
*/ + OCFS2_I(iter)->ip_next_orphan = p->head; + p->head = iter; + + return 0; +} + +static int ocfs2_queue_orphans(struct ocfs2_super *osb, + int slot, + struct inode **head) +{ + int status; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_orphan_filldir_priv priv; + loff_t pos = 0; + + priv.osb = osb; + priv.head = *head; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + slot); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + return status; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv, + ocfs2_orphan_filldir); + if (status) { + mlog_errno(status); + goto out_cluster; + } + + *head = priv.head; + +out_cluster: + ocfs2_inode_unlock(orphan_dir_inode, 0); +out: + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + return status; +} + +static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb, + int slot) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = !osb->osb_orphan_wipes[slot]; + spin_unlock(&osb->osb_lock); + return ret; +} + +static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + /* Mark ourselves such that new processes in delete_inode() + * know to quit early. */ + ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot); + while (osb->osb_orphan_wipes[slot]) { + /* If any processes are already in the middle of an + * orphan wipe on this dir, then we need to wait for + * them. */ + spin_unlock(&osb->osb_lock); + wait_event_interruptible(osb->osb_wipe_event, + ocfs2_orphan_recovery_can_continue(osb, slot)); + spin_lock(&osb->osb_lock); + } + spin_unlock(&osb->osb_lock); +} + +static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot); +} + +/* + * Orphan recovery. Each mounted node has it's own orphan dir which we + * must run during recovery. Our strategy here is to build a list of + * the inodes in the orphan dir and iget/iput them. The VFS does + * (most) of the rest of the work. + * + * Orphan recovery can happen at any time, not just mount so we have a + * couple of extra considerations. + * + * - We grab as many inodes as we can under the orphan dir lock - + * doing iget() outside the orphan dir risks getting a reference on + * an invalid inode. + * - We must be sure not to deadlock with other processes on the + * system wanting to run delete_inode(). This can happen when they go + * to lock the orphan dir and the orphan recovery process attempts to + * iget() inside the orphan dir lock. This can be avoided by + * advertising our state to ocfs2_delete_inode(). + */ +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot) +{ + int ret = 0; + struct inode *inode = NULL; + struct inode *iter; + struct ocfs2_inode_info *oi; + + trace_ocfs2_recover_orphans(slot); + + ocfs2_mark_recovering_orphan_dir(osb, slot); + ret = ocfs2_queue_orphans(osb, slot, &inode); + ocfs2_clear_recovering_orphan_dir(osb, slot); + + /* Error here should be noted, but we want to continue with as + * many queued inodes as we've got. 
*/ + if (ret) + mlog_errno(ret); + + while (inode) { + oi = OCFS2_I(inode); + trace_ocfs2_recover_orphans_iput( + (unsigned long long)oi->ip_blkno); + + iter = oi->ip_next_orphan; + + spin_lock(&oi->ip_lock); + /* The remote delete code may have set these on the + * assumption that the other node would wipe them + * successfully. If they are still in the node's + * orphan dir, we need to reset that state. */ + oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); + + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + + iput(inode); + + inode = iter; + } + + return ret; +} + +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota) +{ + /* This check is good because ocfs2 will wait on our recovery + * thread before changing it to something other than MOUNTED + * or DISABLED. */ + wait_event(osb->osb_mount_event, + (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) || + atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS || + atomic_read(&osb->vol_state) == VOLUME_DISABLED); + + /* If there's an error on mount, then we may never get to the + * MOUNTED flag, but this is set right before + * dismount_volume() so we can trust it. */ + if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { + trace_ocfs2_wait_on_mount(VOLUME_DISABLED); + mlog(0, "mount error, exiting!\n"); + return -EBUSY; + } + + return 0; +} + +static int ocfs2_commit_thread(void *arg) +{ + int status; + struct ocfs2_super *osb = arg; + struct ocfs2_journal *journal = osb->journal; + + /* we can trust j_num_trans here because _should_stop() is only set in + * shutdown and nobody other than ourselves should be able to start + * transactions. committing on shutdown might take a few iterations + * as final transactions put deleted inodes on the list */ + while (!(kthread_should_stop() && + atomic_read(&journal->j_num_trans) == 0)) { + + wait_event_interruptible(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop()); + + status = ocfs2_commit_cache(osb); + if (status < 0) + mlog_errno(status); + + if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ + mlog(ML_KTHREAD, + "commit_thread: %u transactions pending on " + "shutdown\n", + atomic_read(&journal->j_num_trans)); + } + } + + return 0; +} + +/* Reads all the journal inodes without taking any cluster locks. Used + * for hard readonly access to determine whether any journal requires + * recovery. Also used to refresh the recovery generation numbers after + * a journal has been recovered by another node. 
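+ *
+ * A caller would typically interpret the result like this (illustrative;
+ * see the actual call sites for the exact handling):
+ *
+ *	ret = ocfs2_check_journals_nolocks(osb);
+ *	if (ret == -EROFS)
+ *		at least one journal still has OCFS2_JOURNAL_DIRTY_FL set,
+ *		i.e. recovery is pending; the recovery thread simply ignores
+ *		this and only wants the refreshed generation numbers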
+ */ +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) +{ + int ret = 0; + unsigned int slot; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int journal_dirty = 0; + + for(slot = 0; slot < osb->max_slots; slot++) { + ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + + osb->slot_recovery_generations[slot] = + ocfs2_get_recovery_generation(di); + + if (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL) + journal_dirty = 1; + + brelse(di_bh); + di_bh = NULL; + } + +out: + if (journal_dirty) + ret = -EROFS; + return ret; +} diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h new file mode 100644 index 00000000..68cf2f6d --- /dev/null +++ b/fs/ocfs2/journal.h @@ -0,0 +1,619 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.h + * + * Defines journalling api and structures. + * + * Copyright (C) 2003, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_JOURNAL_H +#define OCFS2_JOURNAL_H + +#include +#include + +enum ocfs2_journal_state { + OCFS2_JOURNAL_FREE = 0, + OCFS2_JOURNAL_LOADED, + OCFS2_JOURNAL_IN_SHUTDOWN, +}; + +struct ocfs2_super; +struct ocfs2_dinode; + +/* + * The recovery_list is a simple linked list of node numbers to recover. + * It is protected by the recovery_lock. + */ + +struct ocfs2_recovery_map { + unsigned int rm_used; + unsigned int *rm_entries; +}; + + +struct ocfs2_journal { + enum ocfs2_journal_state j_state; /* Journals current state */ + + journal_t *j_journal; /* The kernels journal type */ + struct inode *j_inode; /* Kernel inode pointing to + * this journal */ + struct ocfs2_super *j_osb; /* pointer to the super + * block for the node + * we're currently + * running on -- not + * necessarily the super + * block from the node + * which we usually run + * from (recovery, + * etc) */ + struct buffer_head *j_bh; /* Journal disk inode block */ + atomic_t j_num_trans; /* Number of transactions + * currently in the system. */ + spinlock_t j_lock; + unsigned long j_trans_id; + struct rw_semaphore j_trans_barrier; + wait_queue_head_t j_checkpointed; + + /* both fields protected by j_lock*/ + struct list_head j_la_cleanups; + struct work_struct j_recovery_work; +}; + +extern spinlock_t trans_inc_lock; + +/* wrap j_trans_id so we never have it equal to zero. 
*/ +static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) +{ + unsigned long old_id; + spin_lock(&trans_inc_lock); + old_id = j->j_trans_id++; + if (unlikely(!j->j_trans_id)) + j->j_trans_id = 1; + spin_unlock(&trans_inc_lock); + return old_id; +} + +static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal, + struct ocfs2_caching_info *ci) +{ + spin_lock(&trans_inc_lock); + ci->ci_last_trans = journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +/* Used to figure out whether it's safe to drop a metadata lock on an + * cached object. Returns true if all the object's changes have been + * checkpointed to disk. You should be holding the spinlock on the + * metadata lock while calling this to be sure that nobody can take + * the lock and put it on another transaction. */ +static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci) +{ + int ret; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; + + spin_lock(&trans_inc_lock); + ret = time_after(journal->j_trans_id, ci->ci_last_trans); + spin_unlock(&trans_inc_lock); + return ret; +} + +/* convenience function to check if an object backed by struct + * ocfs2_caching_info is still new (has never hit disk) Will do you a + * favor and set created_trans = 0 when you've + * been checkpointed. returns '1' if the ci is still new. */ +static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci) +{ + int ret; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; + + spin_lock(&trans_inc_lock); + ret = !(time_after(journal->j_trans_id, ci->ci_created_trans)); + if (!ret) + ci->ci_created_trans = 0; + spin_unlock(&trans_inc_lock); + return ret; +} + +/* Wrapper for inodes so we can check system files */ +static inline int ocfs2_inode_is_new(struct inode *inode) +{ + /* System files are never "new" as they're written out by + * mkfs. This helps us early during mount, before we have the + * journal open and j_trans_id could be junk. */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + return 0; + + return ocfs2_ci_is_new(INODE_CACHE(inode)); +} + +static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, + struct ocfs2_caching_info *ci) +{ + spin_lock(&trans_inc_lock); + ci->ci_created_trans = osb->journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +/* Exported only for the journal struct init code in super.c. Do not call. */ +void ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_start(struct ocfs2_super *osb); +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); + +void ocfs2_complete_recovery(struct work_struct *work); +void ocfs2_wait_for_recovery(struct ocfs2_super *osb); + +int ocfs2_recovery_init(struct ocfs2_super *osb); +void ocfs2_recovery_exit(struct ocfs2_super *osb); + +int ocfs2_compute_replay_slots(struct ocfs2_super *osb); +/* + * Journal Control: + * Initialize, Load, Shutdown, Wipe a journal. + * + * ocfs2_journal_init - Initialize journal structures in the OSB. + * ocfs2_journal_load - Load the given journal off disk. Replay it if + * there's transactions still in there. + * ocfs2_journal_shutdown - Shutdown a journal, this will flush all + * uncommitted, uncheckpointed transactions. + * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally + * zero out each block. + * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb. 
+ * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat + * event on. + * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. + */ +void ocfs2_set_journal_params(struct ocfs2_super *osb); +int ocfs2_journal_init(struct ocfs2_journal *journal, + int *dirty); +void ocfs2_journal_shutdown(struct ocfs2_super *osb); +int ocfs2_journal_wipe(struct ocfs2_journal *journal, + int full); +int ocfs2_journal_load(struct ocfs2_journal *journal, int local, + int replayed); +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); +void ocfs2_recovery_thread(struct ocfs2_super *osb, + int node_num); +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); + +static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) +{ + atomic_set(&osb->needs_checkpoint, 1); + wake_up(&osb->checkpoint_event); +} + +static inline void ocfs2_checkpoint_inode(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_mount_local(osb)) + return; + + if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) { + /* WARNING: This only kicks off a single + * checkpoint. If someone races you and adds more + * metadata to the journal, you won't know, and will + * wind up waiting *a lot* longer than necessary. Right + * now we only use this in clear_inode so that's + * OK. */ + ocfs2_start_checkpoint(osb); + + wait_event(osb->journal->j_checkpointed, + ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))); + } +} + +/* + * Transaction Handling: + * Manage the lifetime of a transaction handle. + * + * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of + * the number of blocks that will be changed during + * this handle. + * ocfs2_commit_trans - Complete a handle. It might return -EIO if + * the journal was aborted. The majority of paths don't + * check the return value as an error there comes too + * late to do anything (and will be picked up in a + * later transaction). + * ocfs2_extend_trans - Extend a handle by nblocks credits. This may + * commit the handle to disk in the process, but will + * not release any locks taken during the transaction. + * ocfs2_journal_access* - Notify the handle that we want to journal this + * buffer. Will have to call ocfs2_journal_dirty once + * we've actually dirtied it. Type is one of . or . + * Always call the specific flavor of + * ocfs2_journal_access_*() unless you intend to + * manage the checksum by hand. + * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. + * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before + * the current handle commits. + */ + +/* You must always start_trans with a number of buffs > 0, but it's + * perfectly legal to go through an entire transaction without having + * dirtied any buffers. */ +handle_t *ocfs2_start_trans(struct ocfs2_super *osb, + int max_buffs); +int ocfs2_commit_trans(struct ocfs2_super *osb, + handle_t *handle); +int ocfs2_extend_trans(handle_t *handle, int nblocks); + +/* + * Create access is for when we get a newly created buffer and we're + * not gonna read it off disk, but rather fill it ourselves. Right + * now, we don't do anything special with this (it turns into a write + * request), but this is a good placeholder in case we do... + * + * Write access is for when we read a block off disk and are going to + * modify it. 
This way the journalling layer knows it may need to make + * a copy of that block (if it's part of another, uncommitted + * transaction) before we do so. + */ +#define OCFS2_JOURNAL_ACCESS_CREATE 0 +#define OCFS2_JOURNAL_ACCESS_WRITE 1 +#define OCFS2_JOURNAL_ACCESS_UNDO 2 + + +/* ocfs2_inode */ +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_extent_block */ +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_refcount_block */ +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_group_desc */ +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_xattr_block */ +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* quota blocks */ +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* dirblock */ +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_dx_root_block */ +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_dx_leaf */ +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* Anything that has no ecc */ +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); + +/* + * A word about the journal_access/journal_dirty "dance". It is + * entirely legal to journal_access a buffer more than once (as long + * as the access type is the same -- I'm not sure what will happen if + * access type is different but this should never happen anyway) It is + * also legal to journal_dirty a buffer more than once. In fact, you + * can even journal_access a buffer after you've done a + * journal_access/journal_dirty pair. The only thing you cannot do + * however, is journal_dirty a buffer which you haven't yet passed to + * journal_access at least once. + * + * That said, 99% of the time this doesn't matter and this is what the + * path looks like: + * + * + * ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE); + * + * ocfs2_journal_dirty(handle, bh); + */ +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); + +/* + * Credit Macros: + * Convenience macros to calculate number of credits needed. + * + * For convenience sake, I have a set of macros here which calculate + * the *maximum* number of sectors which will be changed for various + * metadata updates. + */ + +/* simple file updates like chmod, etc. */ +#define OCFS2_INODE_UPDATE_CREDITS 1 + +/* extended attribute block update */ +#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 + +/* Update of a single quota block */ +#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 + +/* global quotafile inode update, data block */ +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +/* + * The two writes below can accidentally see global info dirty due + * to set_info() quotactl so make them prepared for the writes. 
+ */ +/* quota data block, global info */ +/* Write to local quota file */ +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +/* global quota data block, local quota data block, global quota inode, + * global quota info */ +#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) + +static inline int ocfs2_quota_trans_credits(struct super_block *sb) +{ + int credits = 0; + + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + return credits; +} + +/* group extend. inode update and last group update. */ +#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* group add. inode update and the new group update. */ +#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* get one bit out of a suballocator: dinode + group descriptor + + * prev. group desc. if we relink. */ +#define OCFS2_SUBALLOC_ALLOC (3) + +static inline int ocfs2_inline_to_extents_credits(struct super_block *sb) +{ + return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} + +/* dinode + group descriptor update. We don't relink on free yet. */ +#define OCFS2_SUBALLOC_FREE (2) + +#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS +#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ + + OCFS2_TRUNCATE_LOG_UPDATE) + +static inline int ocfs2_remove_extent_credits(struct super_block *sb) +{ + return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} + +/* data block for new dir/symlink, allocation of directory block, dx_root + * update for free list */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1) + +static inline int ocfs2_add_dir_index_credits(struct super_block *sb) +{ + /* 1 block for index, 2 allocs (data, metadata), 1 clusters + * worth of blocks for initial extent. */ + return 1 + 2 * OCFS2_SUBALLOC_ALLOC + + ocfs2_clusters_to_blocks(sb, 1); +} + +/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode + * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr + * blocks + quota update */ +static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, + int xattr_credits) +{ + int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS; + + if (is_dir) + dir_credits += ocfs2_add_dir_index_credits(sb); + + return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits + + ocfs2_quota_trans_credits(sb); +} + +/* local alloc metadata change + main bitmap updates */ +#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE) + +/* used when we don't need an allocation change for a dir extend. One + * for the dinode, one for the new block. */ +#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) + +/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota + * update on dir + index leaf + dx root update for free list */ +static inline int ocfs2_link_credits(struct super_block *sb) +{ + return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + + ocfs2_quota_trans_credits(sb); +} + +/* inode + dir inode (if we unlink a dir), + dir entry block + orphan + * dir inode link + dir inode index leaf + dir index root */ +static inline int ocfs2_unlink_credits(struct super_block *sb) +{ + /* The quota update from ocfs2_link_credits is unused here... 
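+ *
+ * Worked example (editor's illustration, assuming both USRQUOTA and
+ * GRPQUOTA are enabled, so ocfs2_quota_trans_credits() comes to
+ * 2 * OCFS2_QWRITE_CREDITS = 6):
+ *
+ *	ocfs2_link_credits()   = 2*1 + 3 + 6  = 11
+ *	ocfs2_unlink_credits() = 2*1 + 3 + 11 = 16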
*/ + return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb); +} + +/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + + * inode alloc group descriptor + orphan dir index root + + * orphan dir index leaf */ +#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) + +/* dinode update, old dir dinode update, new dir dinode update, old + * dir dir entry, new dir dir entry, dir entry update for renaming + * directory + target unlink + 3 x dir index leaves */ +static inline int ocfs2_rename_credits(struct super_block *sb) +{ + return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb); +} + +/* global bitmap dinode, group desc., relinked group, + * suballocator dinode, group desc., relinked group, + * dinode, xattr block */ +#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \ + + OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) + +/* inode update, removal of dx root block from allocator */ +#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_SUBALLOC_FREE) + +static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) +{ + int credits = 1 + OCFS2_SUBALLOC_ALLOC; + + credits += ocfs2_clusters_to_blocks(sb, 1); + credits += ocfs2_quota_trans_credits(sb); + + return credits; +} + +/* inode update, new refcount block and its allocation credits. */ +#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \ + + OCFS2_SUBALLOC_ALLOC) + +/* inode and the refcount block update. */ +#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* + * inode and the refcount block update. + * It doesn't include the credits for sub alloc change. + * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added. + */ +#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* 2 metadata alloc, 2 new blocks and root refcount block */ +#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3) + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +static inline int ocfs2_calc_extend_credits(struct super_block *sb, + struct ocfs2_extent_list *root_el, + u32 bits_wanted) +{ + int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; + + /* bitmap dinode, group desc. + relinked group. */ + bitmap_blocks = OCFS2_SUBALLOC_ALLOC; + + /* we might need to shift tree depth so lets assume an + * absolute worst case of complete fragmentation. Even with + * that, we only need one update for the dinode, and then + * however many metadata chunks needed * a remaining suballoc + * alloc. */ + sysfile_bitmap_blocks = 1 + + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el); + + /* this does not include *new* metadata blocks, which are + * accounted for in sysfile_bitmap_blocks. root_el + + * prev. last_eb_blk + blocks along edge of tree. + * calc_symlink_credits passes because we just need 1 + * credit for the dinode there. */ + extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); + + return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + + ocfs2_quota_trans_credits(sb); +} + +static inline int ocfs2_calc_symlink_credits(struct super_block *sb) +{ + int blocks = ocfs2_mknod_credits(sb, 0, 0); + + /* links can be longer than one block so we may update many + * within our single allocated extent. 
*/ + blocks += ocfs2_clusters_to_blocks(sb, 1); + + return blocks + ocfs2_quota_trans_credits(sb); +} + +static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, + unsigned int cpg) +{ + int blocks; + int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1; + /* parent inode update + new block group header + bitmap inode update + + bitmap blocks affected */ + blocks = 1 + 1 + 1 + bitmap_blocks; + return blocks; +} + +/* + * Allocating a discontiguous block group requires the credits from + * ocfs2_calc_group_alloc_credits() as well as enough credits to fill + * the group descriptor's extent list. The caller already has started + * the transaction with ocfs2_calc_group_alloc_credits(). They extend + * it with these credits. + */ +static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) +{ + return ocfs2_extent_recs_per_gd(sb); +} + +static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, + unsigned int clusters_to_del, + struct ocfs2_dinode *fe, + struct ocfs2_extent_list *last_el) +{ + /* for dinode + all headers in this pass + update to next leaf */ + u16 next_free = le16_to_cpu(last_el->l_next_free_rec); + u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); + int credits = 1 + tree_depth + 1; + int i; + + i = next_free - 1; + BUG_ON(i < 0); + + /* We may be deleting metadata blocks, so metadata alloc dinode + + one desc. block for each possible delete. */ + if (tree_depth && next_free == 1 && + ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) + credits += 1 + tree_depth; + + /* update to the truncate log. */ + credits += OCFS2_TRUNCATE_LOG_UPDATE; + + credits += ocfs2_quota_trans_credits(sb); + + return credits; +} + +static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); +} + +static inline int ocfs2_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); +} + +#endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c new file mode 100644 index 00000000..210c3523 --- /dev/null +++ b/fs/ocfs2/localalloc.c @@ -0,0 +1,1307 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.c + * + * Node local data allocation + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 *numbits, + struct ocfs2_alloc_reservation *resv); + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); + +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh); + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh); + +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac); + +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode); + +/* + * ocfs2_la_default_mb() - determine a default size, in megabytes of + * the local alloc. + * + * Generally, we'd like to pick as large a local alloc as + * possible. Performance on large workloads tends to scale + * proportionally to la size. In addition to that, the reservations + * code functions more efficiently as it can reserve more windows for + * write. + * + * Some things work against us when trying to choose a large local alloc: + * + * - We need to ensure our sizing is picked to leave enough space in + * group descriptors for other allocations (such as block groups, + * etc). Picking default sizes which are a multiple of 4 could help + * - block groups are allocated in 2mb and 4mb chunks. + * + * - Likewise, we don't want to starve other nodes of bits on small + * file systems. This can easily be taken care of by limiting our + * default to a reasonable size (256M) on larger cluster sizes. + * + * - Some file systems can't support very large sizes - 4k and 8k in + * particular are limited to less than 128 and 256 megabytes respectively. + * + * The following reference table shows group descriptor and local + * alloc maximums at various cluster sizes (4k blocksize) + * + * csize: 4K group: 126M la: 121M + * csize: 8K group: 252M la: 243M + * csize: 16K group: 504M la: 486M + * csize: 32K group: 1008M la: 972M + * csize: 64K group: 2016M la: 1944M + * csize: 128K group: 4032M la: 3888M + * csize: 256K group: 8064M la: 7776M + * csize: 512K group: 16128M la: 15552M + * csize: 1024K group: 32256M la: 31104M + */ +#define OCFS2_LA_MAX_DEFAULT_MB 256 +#define OCFS2_LA_OLD_DEFAULT 8 +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) +{ + unsigned int la_mb; + unsigned int gd_mb; + unsigned int la_max_mb; + unsigned int megs_per_slot; + struct super_block *sb = osb->sb; + + gd_mb = ocfs2_clusters_to_megabytes(osb->sb, + 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat)); + + /* + * This takes care of files systems with very small group + * descriptors - 512 byte blocksize at cluster sizes lower + * than 16K and also 1k blocksize with 4k cluster size. 
+ */ + if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192) + || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096)) + return OCFS2_LA_OLD_DEFAULT; + + /* + * Leave enough room for some block groups and make the final + * value we work from a multiple of 4. + */ + gd_mb -= 16; + gd_mb &= 0xFFFFFFFB; + + la_mb = gd_mb; + + /* + * Keep window sizes down to a reasonable default + */ + if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) { + /* + * Some clustersize / blocksize combinations will have + * given us a larger than OCFS2_LA_MAX_DEFAULT_MB + * default size, but get poor distribution when + * limited to exactly 256 megabytes. + * + * As an example, 16K clustersize at 4K blocksize + * gives us a cluster group size of 504M. Paring the + * local alloc size down to 256 however, would give us + * only one window and around 200MB left in the + * cluster group. Instead, find the first size below + * 256 which would give us an even distribution. + * + * Larger cluster group sizes actually work out pretty + * well when pared to 256, so we don't have to do this + * for any group that fits more than two + * OCFS2_LA_MAX_DEFAULT_MB windows. + */ + if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB)) + la_mb = 256; + else { + unsigned int gd_mult = gd_mb; + + while (gd_mult > 256) + gd_mult = gd_mult >> 1; + + la_mb = gd_mult; + } + } + + megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots; + megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot); + /* Too many nodes, too few disk clusters. */ + if (megs_per_slot < la_mb) + la_mb = megs_per_slot; + + /* We can't store more bits than we can in a block. */ + la_max_mb = ocfs2_clusters_to_megabytes(osb->sb, + ocfs2_local_alloc_size(sb) * 8); + if (la_mb > la_max_mb) + la_mb = la_max_mb; + + return la_mb; +} + +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb) +{ + struct super_block *sb = osb->sb; + unsigned int la_default_mb = ocfs2_la_default_mb(osb); + unsigned int la_max_mb; + + la_max_mb = ocfs2_clusters_to_megabytes(sb, + ocfs2_local_alloc_size(sb) * 8); + + trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb); + + if (requested_mb == -1) { + /* No user request - use defaults */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_default_mb); + } else if (requested_mb > la_max_mb) { + /* Request is too big, we give the maximum available */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_max_mb); + } else { + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, requested_mb); + } + + osb->local_alloc_bits = osb->local_alloc_default_bits; +} + +static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) +{ + return (osb->local_alloc_state == OCFS2_LA_THROTTLED || + osb->local_alloc_state == OCFS2_LA_ENABLED); +} + +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters) +{ + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) + if (num_clusters >= osb->local_alloc_default_bits) { + cancel_delayed_work(&osb->la_enable_wq); + osb->local_alloc_state = OCFS2_LA_ENABLED; + } + spin_unlock(&osb->osb_lock); +} + +void ocfs2_la_enable_worker(struct work_struct *work) +{ + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + la_enable_wq.work); + spin_lock(&osb->osb_lock); + osb->local_alloc_state = OCFS2_LA_ENABLED; + spin_unlock(&osb->osb_lock); +} + +/* + * Tell us whether a given allocation should use the local alloc + * file. 
Otherwise, it has to go to the main bitmap. + * + * This function does semi-dirty reads of local alloc size and state! + * This is ok however, as the values are re-checked once under mutex. + */ +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) +{ + int ret = 0; + int la_bits; + + spin_lock(&osb->osb_lock); + la_bits = osb->local_alloc_bits; + + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + /* la_bits should be at least twice the size (in clusters) of + * a new block group. We want to be sure block group + * allocations go through the local alloc, so allow an + * allocation to take up to half the bitmap. */ + if (bits > (la_bits / 2)) + goto bail; + + ret = 1; +bail: + trace_ocfs2_alloc_should_use_local( + (unsigned long long)bits, osb->local_alloc_state, la_bits, ret); + spin_unlock(&osb->osb_lock); + return ret; +} + +int ocfs2_load_local_alloc(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_dinode *alloc = NULL; + struct buffer_head *alloc_bh = NULL; + u32 num_used; + struct inode *inode = NULL; + struct ocfs2_local_alloc *la; + + if (osb->local_alloc_bits == 0) + goto bail; + + if (osb->local_alloc_bits >= osb->bitmap_cpg) { + mlog(ML_NOTICE, "Requested local alloc window %d is larger " + "than max possible %u. Using defaults.\n", + osb->local_alloc_bits, (osb->bitmap_cpg - 1)); + osb->local_alloc_bits = + ocfs2_megabytes_to_clusters(osb->sb, + ocfs2_la_default_mb(osb)); + } + + /* read the alloc off disk */ + inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + if (!(le32_to_cpu(alloc->i_flags) & + (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) { + mlog(ML_ERROR, "Invalid local alloc inode, %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + status = -EINVAL; + goto bail; + } + + if ((la->la_size == 0) || + (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) { + mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n", + le16_to_cpu(la->la_size)); + status = -EINVAL; + goto bail; + } + + /* do a little verification. */ + num_used = ocfs2_local_alloc_count_bits(alloc); + + /* hopefully the local alloc has always been recovered before + * we load it. */ + if (num_used + || alloc->id1.bitmap1.i_used + || alloc->id1.bitmap1.i_total + || la->la_bm_off) + mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + "found = %u, set = %u, taken = %u, off = %u\n", + num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), + le32_to_cpu(alloc->id1.bitmap1.i_total), + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + + osb->local_alloc_bh = alloc_bh; + osb->local_alloc_state = OCFS2_LA_ENABLED; + +bail: + if (status < 0) + brelse(alloc_bh); + if (inode) + iput(inode); + + trace_ocfs2_load_local_alloc(osb->local_alloc_bits); + + if (status) + mlog_errno(status); + return status; +} + +/* + * return any unused bits to the bitmap and write out a clean + * local_alloc. + * + * local_alloc_bh is optional. If not passed, we will simply use the + * one off osb. 
If you do pass it however, be warned that it *will* be + * returned brelse'd and NULL'd out.*/ +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) +{ + int status; + handle_t *handle; + struct inode *local_alloc_inode = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_dinode *alloc = NULL; + + cancel_delayed_work(&osb->la_enable_wq); + flush_workqueue(ocfs2_wq); + + if (osb->local_alloc_state == OCFS2_LA_UNUSED) + goto out; + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto out; + } + + osb->local_alloc_state = OCFS2_LA_DISABLED; + + ocfs2_resmap_uninit(&osb->osb_la_resmap); + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_mutex; + } + + /* WINDOW_MOVE_CREDITS is a bit heavy... */ + handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + handle = NULL; + goto out_unlock; + } + + bh = osb->local_alloc_bh; + alloc = (struct ocfs2_dinode *) bh->b_data; + + alloc_copy = kmalloc(bh->b_size, GFP_NOFS); + if (!alloc_copy) { + status = -ENOMEM; + goto out_commit; + } + memcpy(alloc_copy, alloc, bh->b_size); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + ocfs2_clear_local_alloc(alloc); + ocfs2_journal_dirty(handle, bh); + + brelse(bh); + osb->local_alloc_bh = NULL; + osb->local_alloc_state = OCFS2_LA_UNUSED; + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock: + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); + +out: + if (local_alloc_inode) + iput(local_alloc_inode); + + if (alloc_copy) + kfree(alloc_copy); +} + +/* + * We want to free the bitmap bits outside of any recovery context as + * we'll need a cluster lock to do so, but we must clear the local + * alloc before giving up the recovered nodes journal. 
To solve this, + * we kmalloc a copy of the local alloc before it's change for the + * caller to process with ocfs2_complete_local_alloc_recovery + */ +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **alloc_copy) +{ + int status = 0; + struct buffer_head *alloc_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_dinode *alloc; + + trace_ocfs2_begin_local_alloc_recovery(slot_num); + + *alloc_copy = NULL; + + inode = ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mutex_lock(&inode->i_mutex); + + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL); + if (!(*alloc_copy)) { + status = -ENOMEM; + goto bail; + } + memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size); + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + ocfs2_clear_local_alloc(alloc); + + ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); + status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode)); + if (status < 0) + mlog_errno(status); + +bail: + if ((status < 0) && (*alloc_copy)) { + kfree(*alloc_copy); + *alloc_copy = NULL; + } + + brelse(alloc_bh); + + if (inode) { + mutex_unlock(&inode->i_mutex); + iput(inode); + } + + if (status) + mlog_errno(status); + return status; +} + +/* + * Step 2: By now, we've completed the journal recovery, we've stamped + * a clean local alloc on disk and dropped the node out of the + * recovery map. Dlm locks will no longer stall, so lets clear out the + * main bitmap. + */ +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc) +{ + int status; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto out_unlock; + } + + /* we want the bitmap change to be recorded on disk asap */ + handle->h_sync = 1; + + status = ocfs2_sync_local_to_main(osb, handle, alloc, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +out_unlock: + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + + brelse(main_bm_bh); + + iput(main_bm_inode); + +out: + if (!status) + ocfs2_init_steal_slots(osb); + if (status) + mlog_errno(status); + return status; +} + +/* + * make sure we've got at least bits_wanted contiguous bits in the + * local alloc. You lose them when you drop i_mutex. + * + * We will add ourselves to the transaction passed in, but may start + * our own in order to shift windows. 
+ */ +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context *ac) +{ + int status; + struct ocfs2_dinode *alloc; + struct inode *local_alloc_inode; + unsigned int free_bits; + + BUG_ON(!ac); + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + + mutex_lock(&local_alloc_inode->i_mutex); + + /* + * We must double check state and allocator bits because + * another process may have changed them while holding i_mutex. + */ + spin_lock(&osb->osb_lock); + if (!ocfs2_la_state_enabled(osb) || + (bits_wanted > osb->local_alloc_bits)) { + spin_unlock(&osb->osb_lock); + status = -ENOSPC; + goto bail; + } + spin_unlock(&osb->osb_lock); + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + +#ifdef CONFIG_OCFS2_DEBUG_FS + if (le32_to_cpu(alloc->id1.bitmap1.i_used) != + ocfs2_local_alloc_count_bits(alloc)) { + ocfs2_error(osb->sb, "local alloc inode %llu says it has " + "%u free bits, but a count shows %u", + (unsigned long long)le64_to_cpu(alloc->i_blkno), + le32_to_cpu(alloc->id1.bitmap1.i_used), + ocfs2_local_alloc_count_bits(alloc)); + status = -EIO; + goto bail; + } +#endif + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) { + /* uhoh, window change time. */ + status = + ocfs2_local_alloc_slide_window(osb, local_alloc_inode); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* + * Under certain conditions, the window slide code + * might have reduced the number of bits available or + * disabled the the local alloc entirely. Re-check + * here and return -ENOSPC if necessary. + */ + status = -ENOSPC; + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) + goto bail; + } + + ac->ac_inode = local_alloc_inode; + /* We should never use localalloc from another slot */ + ac->ac_alloc_slot = osb->slot_num; + ac->ac_which = OCFS2_AC_USE_LOCAL; + get_bh(osb->local_alloc_bh); + ac->ac_bh = osb->local_alloc_bh; + status = 0; +bail: + if (status < 0 && local_alloc_inode) { + mutex_unlock(&local_alloc_inode->i_mutex); + iput(local_alloc_inode); + } + + trace_ocfs2_reserve_local_alloc_bits( + (unsigned long long)ac->ac_max_block, + bits_wanted, osb->slot_num, status); + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 *bit_off, + u32 *num_bits) +{ + int status, start; + struct inode *local_alloc_inode; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted, + ac->ac_resv); + if (start == -1) { + /* TODO: Shouldn't we just BUG here? 
*/ + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + bitmap = la->la_bitmap; + *bit_off = le32_to_cpu(la->la_bm_off) + start; + *num_bits = bits_wanted; + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start, + bits_wanted); + + while(bits_wanted--) + ocfs2_set_bit(start++, bitmap); + + le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: + if (status) + mlog_errno(status); + return status; +} + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) +{ + int i; + u8 *buffer; + u32 count = 0; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + buffer = la->la_bitmap; + for (i = 0; i < le16_to_cpu(la->la_size); i++) + count += hweight8(buffer[i]); + + trace_ocfs2_local_alloc_count_bits(count); + return count; +} + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 *numbits, + struct ocfs2_alloc_reservation *resv) +{ + int numfound, bitoff, left, startoff, lastzero; + int local_resv = 0; + struct ocfs2_alloc_reservation r; + void *bitmap = NULL; + struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; + + if (!alloc->id1.bitmap1.i_total) { + bitoff = -1; + goto bail; + } + + if (!resv) { + local_resv = 1; + ocfs2_resv_init_once(&r); + ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP); + resv = &r; + } + + numfound = *numbits; + if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) { + if (numfound < *numbits) + *numbits = numfound; + goto bail; + } + + /* + * Code error. While reservations are enabled, local + * allocation should _always_ go through them. + */ + BUG_ON(osb->osb_resv_level != 0); + + /* + * Reservations are disabled. Handle this the old way. + */ + + bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; + + numfound = bitoff = startoff = 0; + lastzero = -1; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { + if (bitoff == left) { + /* mlog(0, "bitoff (%d) == left", bitoff); */ + break; + } + /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " + "numfound = %d\n", bitoff, startoff, numfound);*/ + + /* Ok, we found a zero bit... is it contig. or do we + * start over?*/ + if (bitoff == startoff) { + /* we found a zero */ + numfound++; + startoff++; + } else { + /* got a zero after some ones */ + numfound = 1; + startoff = bitoff+1; + } + /* we got everything we needed */ + if (numfound == *numbits) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound); + + if (numfound == *numbits) + bitoff = startoff - numfound; + else + bitoff = -1; + +bail: + if (local_resv) + ocfs2_resv_discard(resmap, resv); + + trace_ocfs2_local_alloc_find_clear_bits(*numbits, + le32_to_cpu(alloc->id1.bitmap1.i_total), + bitoff, numfound); + + return bitoff; +} + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) +{ + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + int i; + + alloc->id1.bitmap1.i_total = 0; + alloc->id1.bitmap1.i_used = 0; + la->la_bm_off = 0; + for(i = 0; i < le16_to_cpu(la->la_size); i++) + la->la_bitmap[i] = 0; +} + +#if 0 +/* turn this on and uncomment below to aid debugging window shifts. 
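The fallback scan in ocfs2_local_alloc_find_clear_bits() is a first-fit search for a run of clear bits. A compact userspace version of the same idea, using a plain byte array instead of ocfs2_find_next_zero_bit(), may make the startoff/numfound bookkeeping clearer:

#include <stdint.h>
#include <stdio.h>

/* Find the first run of 'want' clear bits in a bitmap of 'nbits' bits.
 * Returns the starting bit, or -1 if no such run exists. */
static int find_clear_run(const uint8_t *bitmap, unsigned int nbits, unsigned int want)
{
        unsigned int run = 0;

        for (unsigned int i = 0; i < nbits; i++) {
                if (bitmap[i / 8] & (1u << (i % 8))) {
                        run = 0;                  /* hit a used bit, restart the run */
                        continue;
                }
                if (++run == want)
                        return i - want + 1;      /* run ends at i, so it started here */
        }
        return -1;
}

int main(void)
{
        uint8_t map[2] = { 0x0f, 0x00 };          /* bits 0-3 used, 4-15 free */

        printf("%d\n", find_clear_run(map, 16, 6));   /* 4 */
        printf("%d\n", find_clear_run(map, 16, 13));  /* -1: only 12 free bits */
        return 0;
}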
*/ +static void ocfs2_verify_zero_bits(unsigned long *bitmap, + unsigned int start, + unsigned int count) +{ + unsigned int tmp = count; + while(tmp--) { + if (ocfs2_test_bit(start + tmp, bitmap)) { + printk("ocfs2_verify_zero_bits: start = %u, count = " + "%u\n", start, count); + printk("ocfs2_verify_zero_bits: bit %u is set!", + start + tmp); + BUG(); + } + } +} +#endif + +/* + * sync the local alloc to main bitmap. + * + * assumes you've already locked the main bitmap -- the bitmap inode + * passed is used for caching. + */ +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh) +{ + int status = 0; + int bit_off, left, count, start; + u64 la_start_blk; + u64 blkno; + void *bitmap; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + trace_ocfs2_sync_local_to_main( + le32_to_cpu(alloc->id1.bitmap1.i_total), + le32_to_cpu(alloc->id1.bitmap1.i_used)); + + if (!alloc->id1.bitmap1.i_total) { + goto bail; + } + + if (le32_to_cpu(alloc->id1.bitmap1.i_used) == + le32_to_cpu(alloc->id1.bitmap1.i_total)) { + goto bail; + } + + la_start_blk = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(la->la_bm_off)); + bitmap = la->la_bitmap; + start = count = bit_off = 0; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + + while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) + != -1) { + if ((bit_off < left) && (bit_off == start)) { + count++; + start++; + continue; + } + if (count) { + blkno = la_start_blk + + ocfs2_clusters_to_blocks(osb->sb, + start - count); + + trace_ocfs2_sync_local_to_main_free( + count, start - count, + (unsigned long long)la_start_blk, + (unsigned long long)blkno); + + status = ocfs2_release_clusters(handle, + main_bm_inode, + main_bm_bh, blkno, + count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + if (bit_off >= left) + break; + count = 1; + start = bit_off + 1; + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +enum ocfs2_la_event { + OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */ + OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has + * enough bits theoretically + * free, but a contiguous + * allocation could not be + * found. */ + OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have + * enough bits free to satisfy + * our request. */ +}; +#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ) +/* + * Given an event, calculate the size of our next local alloc window. + * + * This should always be called under i_mutex of the local alloc inode + * so that local alloc disabling doesn't race with processes trying to + * use the allocator. + * + * Returns the state which the local alloc was left in. This value can + * be ignored by some paths. + */ +static int ocfs2_recalc_la_window(struct ocfs2_super *osb, + enum ocfs2_la_event event) +{ + unsigned int bits; + int state; + + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED) { + WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED); + goto out_unlock; + } + + /* + * ENOSPC and fragmentation are treated similarly for now. + */ + if (event == OCFS2_LA_EVENT_ENOSPC || + event == OCFS2_LA_EVENT_FRAGMENTED) { + /* + * We ran out of contiguous space in the primary + * bitmap. Drastically reduce the number of bits used + * by local alloc until we have to disable it. 
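ocfs2_sync_local_to_main() above walks the window looking for runs of bits that were never handed out and releases each run back to the global bitmap. The run-length walk on its own, as a small sketch over made-up bitmap contents:

#include <stdint.h>
#include <stdio.h>

/* Report each maximal run of clear (never used) bits as a (start, count)
 * pair, the same walk that becomes ocfs2_release_clusters() calls above. */
static void for_each_unused_run(const uint8_t *bitmap, unsigned int nbits)
{
        unsigned int start = 0, count = 0;

        for (unsigned int i = 0; i <= nbits; i++) {
                int used = (i == nbits) ||          /* treat end of map as used to flush the last run */
                           (bitmap[i / 8] & (1u << (i % 8)));

                if (!used) {
                        if (!count)
                                start = i;
                        count++;
                } else if (count) {
                        printf("free run: start=%u count=%u\n", start, count);
                        count = 0;
                }
        }
}

int main(void)
{
        uint8_t map[2] = { 0x99, 0xe0 };   /* used bits: 0,3,4,7,13,14,15 */

        for_each_unused_run(map, 16);      /* prints runs 1-2, 5-6, 8-12 */
        return 0;
}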
+ */ + bits = osb->local_alloc_bits >> 1; + if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) { + /* + * By setting state to THROTTLED, we'll keep + * the number of local alloc bits used down + * until an event occurs which would give us + * reason to assume the bitmap situation might + * have changed. + */ + osb->local_alloc_state = OCFS2_LA_THROTTLED; + osb->local_alloc_bits = bits; + } else { + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + OCFS2_LA_ENABLE_INTERVAL); + goto out_unlock; + } + + /* + * Don't increase the size of the local alloc window until we + * know we might be able to fulfill the request. Otherwise, we + * risk bouncing around the global bitmap during periods of + * low space. + */ + if (osb->local_alloc_state != OCFS2_LA_THROTTLED) + osb->local_alloc_bits = osb->local_alloc_default_bits; + +out_unlock: + state = osb->local_alloc_state; + spin_unlock(&osb->osb_lock); + + return state; +} + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh) +{ + int status; + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + +retry_enospc: + (*ac)->ac_bits_wanted = osb->local_alloc_default_bits; + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status == -ENOSPC) { + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == + OCFS2_LA_DISABLED) + goto bail; + + ocfs2_free_ac_resource(*ac); + memset(*ac, 0, sizeof(struct ocfs2_alloc_context)); + goto retry_enospc; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *bitmap_inode = (*ac)->ac_inode; + igrab(*bitmap_inode); + *bitmap_bh = (*ac)->ac_bh; + get_bh(*bitmap_bh); + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +/* + * pass it the bitmap lock in lock_bh if you have it. + */ +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac) +{ + int status = 0; + u32 cluster_off, cluster_count; + struct ocfs2_dinode *alloc = NULL; + struct ocfs2_local_alloc *la; + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + trace_ocfs2_local_alloc_new_window( + le32_to_cpu(alloc->id1.bitmap1.i_total), + osb->local_alloc_bits); + + /* Instruct the allocation code to try the most recently used + * cluster group. We'll re-record the group used this pass + * below. */ + ac->ac_last_group = osb->la_last_gd; + + /* we used the generic suballoc reserve function, but we set + * everything up nicely, so there's no reason why we can't use + * the more specific cluster api to claim bits. */ + status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, + &cluster_off, &cluster_count); + if (status == -ENOSPC) { +retry_enospc: + /* + * Note: We could also try syncing the journal here to + * allow use of any free bits which the current + * transaction can't give us access to. 
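The throttling policy in ocfs2_recalc_la_window() is a small state machine: halve the window while the global bitmap looks tight, disable it once halving would drop to about 1 MB or below, and let the delayed worker re-enable it after the 30 second interval. A standalone sketch of the transitions, with an 8 MB window of 4 KB clusters assumed for the numbers:

#include <stdio.h>

enum la_state { LA_ENABLED, LA_THROTTLED, LA_DISABLED };
enum la_event { LA_EVENT_SLIDE, LA_EVENT_FRAGMENTED, LA_EVENT_ENOSPC };

struct la {
        enum la_state state;
        unsigned int bits;          /* current window, in clusters */
        unsigned int default_bits;  /* full-size window */
        unsigned int one_mb_bits;   /* 1 MB expressed in clusters */
};

/* Shrink (and eventually disable) the window when the global bitmap is
 * tight; restore the default only on a plain slide once throttling ends. */
static void recalc_window(struct la *la, enum la_event ev)
{
        if (la->state == LA_DISABLED)
                return;

        if (ev == LA_EVENT_ENOSPC || ev == LA_EVENT_FRAGMENTED) {
                unsigned int bits = la->bits >> 1;

                if (bits > la->one_mb_bits) {
                        la->state = LA_THROTTLED;
                        la->bits = bits;
                } else {
                        la->state = LA_DISABLED;
                }
                /* the kernel also arms the 30 s re-enable timer here */
                return;
        }

        if (la->state != LA_THROTTLED)
                la->bits = la->default_bits;
}

int main(void)
{
        struct la la = { LA_ENABLED, 2048, 2048, 256 };

        recalc_window(&la, LA_EVENT_ENOSPC);   /* THROTTLED, 1024 bits */
        recalc_window(&la, LA_EVENT_ENOSPC);   /* THROTTLED, 512 bits */
        recalc_window(&la, LA_EVENT_ENOSPC);   /* DISABLED: 256 is not > 256 */
        printf("state=%d bits=%u\n", la.state, la.bits);
        return 0;
}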
--Mark + */ + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) == + OCFS2_LA_DISABLED) + goto bail; + + ac->ac_bits_wanted = osb->local_alloc_default_bits; + status = ocfs2_claim_clusters(handle, ac, + osb->local_alloc_bits, + &cluster_off, + &cluster_count); + if (status == -ENOSPC) + goto retry_enospc; + /* + * We only shrunk the *minimum* number of in our + * request - it's entirely possible that the allocator + * might give us more than we asked for. + */ + if (status == 0) { + spin_lock(&osb->osb_lock); + osb->local_alloc_bits = cluster_count; + spin_unlock(&osb->osb_lock); + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + osb->la_last_gd = ac->ac_last_group; + + la->la_bm_off = cpu_to_le32(cluster_off); + alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); + /* just in case... In the future when we find space ourselves, + * we don't have to get all contiguous -- but we'll have to + * set all previously used bits in bitmap and update + * la_bits_set before setting the bits in the main bitmap. */ + alloc->id1.bitmap1.i_used = 0; + memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, + le16_to_cpu(la->la_size)); + + ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, + OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); + + trace_ocfs2_local_alloc_new_window_result( + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off, + le32_to_cpu(alloc->id1.bitmap1.i_total)); + +bail: + if (status) + mlog_errno(status); + return status; +} + +/* Note that we do *NOT* lock the local alloc inode here as + * it's been locked already for us. */ +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode) +{ + int status = 0; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + handle_t *handle = NULL; + struct ocfs2_dinode *alloc; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_alloc_context *ac = NULL; + + ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); + + /* This will lock the main bitmap for us. */ + status = ocfs2_local_alloc_reserve_for_window(osb, + &ac, + &main_bm_inode, + &main_bm_bh); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + + /* We want to clear the local alloc before doing anything + * else, so that if we error later during this operation, + * local alloc shutdown won't try to double free main bitmap + * bits. Make a copy so the sync function knows which bits to + * free. 
*/ + alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS); + if (!alloc_copy) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_clear_local_alloc(alloc); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_local_alloc_new_window(osb, handle, ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + atomic_inc(&osb->alloc_stats.moves); + +bail: + if (handle) + ocfs2_commit_trans(osb, handle); + + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + if (alloc_copy) + kfree(alloc_copy); + + if (ac) + ocfs2_free_alloc_context(ac); + + if (status) + mlog_errno(status); + return status; +} + diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h new file mode 100644 index 00000000..1be9b586 --- /dev/null +++ b/fs/ocfs2/localalloc.h @@ -0,0 +1,62 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_LOCALALLOC_H +#define OCFS2_LOCALALLOC_H + +int ocfs2_load_local_alloc(struct ocfs2_super *osb); + +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); + +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb); +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb); + +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int node_num, + struct ocfs2_dinode **alloc_copy); + +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc); + +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, + u64 bits); + +struct ocfs2_alloc_context; +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context *ac); + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 *bit_off, + u32 *num_bits); + +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters); +void ocfs2_la_enable_worker(struct work_struct *work); + +#endif /* OCFS2_LOCALALLOC_H */ diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c new file mode 100644 index 00000000..e57c8040 --- /dev/null +++ b/fs/ocfs2/locks.c @@ -0,0 +1,139 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * locks.c + * + * Userspace file locking support + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "locks.h" + +static int ocfs2_do_flock(struct file *file, struct inode *inode, + int cmd, struct file_lock *fl) +{ + int ret = 0, level = 0, trylock = 0; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + + if (fl->fl_type == F_WRLCK) + level = 1; + if (!IS_SETLKW(cmd)) + trylock = 1; + + mutex_lock(&fp->fp_mutex); + + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level > LKM_NLMODE) { + int old_level = 0; + + if (lockres->l_level == LKM_EXMODE) + old_level = 1; + + if (level == old_level) + goto out; + + /* + * Converting an existing lock is not guaranteed to be + * atomic, so we can get away with simply unlocking + * here and allowing the lock code to try at the new + * level. 
+ */ + + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + + ocfs2_file_unlock(file); + } + + ret = ocfs2_file_lock(file, level, trylock); + if (ret) { + if (ret == -EAGAIN && trylock) + ret = -EWOULDBLOCK; + else + mlog_errno(ret); + goto out; + } + + ret = flock_lock_file_wait(file, fl); + +out: + mutex_unlock(&fp->fp_mutex); + + return ret; +} + +static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl) +{ + int ret; + struct ocfs2_file_private *fp = file->private_data; + + mutex_lock(&fp->fp_mutex); + ocfs2_file_unlock(file); + ret = flock_lock_file_wait(file, fl); + mutex_unlock(&fp->fp_mutex); + + return ret; +} + +/* + * Overall flow of ocfs2_flock() was influenced by gfs2_flock(). + */ +int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (__mandatory_lock(inode)) + return -ENOLCK; + + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb)) + return flock_lock_file_wait(file, fl); + + if (fl->fl_type == F_UNLCK) + return ocfs2_do_funlock(file, cmd, fl); + else + return ocfs2_do_flock(file, inode, cmd, fl); +} + +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); +} diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h new file mode 100644 index 00000000..496d488b --- /dev/null +++ b/fs/ocfs2/locks.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * locks.h + * + * Function prototypes for Userspace file locking support + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_LOCKS_H +#define OCFS2_LOCKS_H + +int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl); + +#endif /* OCFS2_LOCKS_H */ diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c new file mode 100644 index 00000000..3e9393ca --- /dev/null +++ b/fs/ocfs2/mmap.c @@ -0,0 +1,197 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mmap.c + * + * Code to deal with the mess that is clustered mmap. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
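From userspace none of the ocfs2_flock() plumbing is visible: a plain flock(2) call on an ocfs2 file takes a cluster-wide lock unless the filesystem is mounted with localflocks or is a local mount, and a non-blocking request that would wait comes back as EWOULDBLOCK. A minimal example (the path is made up):

#include <sys/file.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/ocfs2/shared.dat", O_RDWR | O_CREAT, 0644);

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* LOCK_NB maps to the trylock path above; -EAGAIN becomes EWOULDBLOCK. */
        if (flock(fd, LOCK_EX | LOCK_NB) < 0) {
                perror("flock");
                close(fd);
                return 1;
        }

        /* ... exclusive across every node in the cluster ... */

        flock(fd, LOCK_UN);
        close(fd);
        return 0;
}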
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "aops.h" +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "mmap.h" +#include "super.h" +#include "ocfs2_trace.h" + + +static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) +{ + sigset_t oldset; + int ret; + + ocfs2_block_signals(&oldset); + ret = filemap_fault(area, vmf); + ocfs2_unblock_signals(&oldset); + + trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno, + area, vmf->page, vmf->pgoff); + return ret; +} + +static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, + struct page *page) +{ + int ret; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + loff_t pos = page_offset(page); + unsigned int len = PAGE_CACHE_SIZE; + pgoff_t last_index; + struct page *locked_page = NULL; + void *fsdata; + loff_t size = i_size_read(inode); + + /* + * Another node might have truncated while we were waiting on + * cluster locks. + * We don't check size == 0 before the shift. This is borrowed + * from do_generic_file_read. + */ + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!size || page->index > last_index)) { + ret = -EINVAL; + goto out; + } + + /* + * The i_size check above doesn't catch the case where nodes + * truncated and then re-extended the file. We'll re-check the + * page mapping after taking the page lock inside of + * ocfs2_write_begin_nolock(). + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + /* + * the page has been umapped in ocfs2_data_downconvert_worker. + * So return 0 here and let VFS retry. + */ + ret = 0; + goto out; + } + + /* + * Call ocfs2_write_begin() and ocfs2_write_end() to take + * advantage of the allocation code there. We pass a write + * length of the whole page (chopped to i_size) to make sure + * the whole thing is allocated. + * + * Since we know the page is up to date, we don't have to + * worry about ocfs2_write_begin() skipping some buffer reads + * because the "write" would invalidate their data. 
+ */ + if (page->index == last_index) + len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; + + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, + &fsdata, di_bh, page); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, + fsdata); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + BUG_ON(ret != len); + ret = 0; +out: + return ret; +} + +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct buffer_head *di_bh = NULL; + sigset_t oldset; + int ret; + + ocfs2_block_signals(&oldset); + + /* + * The cluster locks taken will block a truncate from another + * node. Taking the data lock will also ensure that we don't + * attempt page truncation as part of a downconvert. + */ + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* + * The alloc sem should be enough to serialize with + * ocfs2_truncate_file() changing i_size as well as any thread + * modifying the inode btree. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); + +out: + ocfs2_unblock_signals(&oldset); + if (ret) + ret = VM_FAULT_SIGBUS; + return ret; +} + +static const struct vm_operations_struct ocfs2_file_vm_ops = { + .fault = ocfs2_fault, + .page_mkwrite = ocfs2_page_mkwrite, +}; + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret = 0, lock_level = 0; + + ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode, + file->f_vfsmnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); +out: + vma->vm_ops = &ocfs2_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h new file mode 100644 index 00000000..1274ee0f --- /dev/null +++ b/fs/ocfs2/mmap.h @@ -0,0 +1,6 @@ +#ifndef OCFS2_MMAP_H +#define OCFS2_MMAP_H + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); + +#endif /* OCFS2_MMAP_H */ diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c new file mode 100644 index 00000000..cd942702 --- /dev/null +++ b/fs/ocfs2/move_extents.c @@ -0,0 +1,1152 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * move_extents.c + * + * Copyright (C) 2011 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
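The length chopping for the final page is the only arithmetic in __ocfs2_page_mkwrite() that is easy to misread. A tiny sketch of the same computation, assuming a 4 KB page size and a non-zero i_size (the zero case is rejected earlier in the function):

#include <stdint.h>
#include <stdio.h>

#define PAGE_CACHE_SIZE 4096u   /* assumed page size for the example */

/* Bytes of the last page that fall inside i_size; for a power-of-two page
 * size, ((size - 1) & ~PAGE_CACHE_MASK) + 1 reduces to this modulo. */
static unsigned int last_page_len(uint64_t size)
{
        return (unsigned int)((size - 1) % PAGE_CACHE_SIZE + 1);
}

int main(void)
{
        printf("%u\n", last_page_len(10000)); /* 10000-byte file: last page holds 1808 */
        printf("%u\n", last_page_len(8192));  /* aligned size: full 4096-byte page */
        return 0;
}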
+ */ +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "ocfs2_ioctl.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "suballoc.h" +#include "uptodate.h" +#include "super.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "sysfile.h" +#include "suballoc.h" +#include "refcounttree.h" +#include "move_extents.h" + +struct ocfs2_move_extents_context { + struct inode *inode; + struct file *file; + int auto_defrag; + int partial; + int credits; + u32 new_phys_cpos; + u32 clusters_moved; + u64 refcount_loc; + struct ocfs2_move_extents *range; + struct ocfs2_extent_tree et; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; +}; + +static int __ocfs2_move_extent(handle_t *handle, + struct ocfs2_move_extents_context *context, + u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, + int ext_flags) +{ + int ret = 0, index; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec *rec, replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); + u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); + + ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, + p_cpos, new_p_cpos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, + new_p_cpos)); + + path = ocfs2_new_path_from_et(&context->et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = -EROFS; + goto out; + } + + rec = &el->l_recs[index]; + + BUG_ON(ext_flags != rec->e_flags); + /* + * after moving/defraging to new location, the extent is not going + * to be refcounted anymore. + */ + replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + context->et.et_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_split_extent(handle, &context->et, path, index, + &replace_rec, context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, context->et.et_root_bh); + + context->new_phys_cpos = new_p_cpos; + + /* + * need I to append truncate log for old clusters? + */ + if (old_blkno) { + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + old_blkno), + len, context->meta_ac, + &context->dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + old_blkno, len); + } + +out: + return ret; +} + +/* + * lock allocators, and reserving appropriate number of bits for + * meta blocks and data clusters. + * + * in some cases, we don't need to reserve clusters, just let data_ac + * be NULL. 
+ */ +static int ocfs2_lock_allocators_move_extents(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_move, + u32 extents_to_split, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int extra_blocks, + int *credits) +{ + int ret, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, + clusters_to_move + 2); + + mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", + extra_blocks, clusters_to_move, *credits); +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +/* + * Using one journal handle to guarantee the data consistency in case + * crash happens anywhere. + * + * XXX: defrag can end up with finishing partial extent as requested, + * due to not enough contiguous clusters can be found in allocator. + */ +static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, + u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) +{ + int ret, credits = 0, extra_blocks = 0, partial = context->partial; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_refcount_tree *ref_tree = NULL; + u32 new_phys_cpos, new_len; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + + if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + BUG_ON(!context->refcount_loc); + + ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + context->refcount_loc, + phys_blkno, + *len, + &credits, + &extra_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, + &context->meta_ac, + &context->data_ac, + extra_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * should be using allocation reservation strategy there? 
+ * + * if (context->data_ac) + * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + */ + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock_mutex; + } + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock_mutex; + } + + ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, + &new_phys_cpos, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * allowing partial extent moving is kind of 'pros and cons', it makes + * whole defragmentation less likely to fail, on the contrary, the bad + * thing is it may make the fs even more fragmented after moving, let + * userspace make a good decision here. + */ + if (new_len != *len) { + mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); + if (!partial) { + context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; + ret = -ENOSPC; + goto out_commit; + } + } + + mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, + phys_cpos, new_phys_cpos); + + ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, + new_phys_cpos, ext_flags); + if (ret) + mlog_errno(ret); + + if (partial && (new_len != *len)) + *len = new_len; + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock_mutex: + mutex_unlock(&tl_inode->i_mutex); + + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + +out: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +/* + * find the victim alloc group, where #blkno fits. + */ +static int ocfs2_find_victim_alloc_group(struct inode *inode, + u64 vict_blkno, + int type, int slot, + int *vict_bit, + struct buffer_head **ret_bh) +{ + int ret, i, bits_per_unit = 0; + u64 blkno; + char namebuf[40]; + + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ac_bh = NULL, *gd_bh = NULL; + struct ocfs2_chain_list *cl; + struct ocfs2_chain_rec *rec; + struct ocfs2_dinode *ac_dinode; + struct ocfs2_group_desc *bg; + + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, + strlen(namebuf), &blkno); + if (ret) { + ret = -ENOENT; + goto out; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; + cl = &(ac_dinode->id2.i_chain); + rec = &(cl->cl_recs[0]); + + if (type == GLOBAL_BITMAP_SYSTEM_INODE) + bits_per_unit = osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits; + /* + * 'vict_blkno' was out of the valid range. 
+ */ + if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || + (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << + bits_per_unit))) { + ret = -EINVAL; + goto out; + } + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + + rec = &(cl->cl_recs[i]); + if (!rec) + continue; + + bg = NULL; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (gd_bh) { + brelse(gd_bh); + gd_bh = NULL; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + + le16_to_cpu(bg->bg_bits))) { + + *ret_bh = gd_bh; + *vict_bit = (vict_blkno - blkno) >> + bits_per_unit; + mlog(0, "find the victim group: #%llu, " + "total_bits: %u, vict_bit: %u\n", + blkno, le16_to_cpu(bg->bg_bits), + *vict_bit); + goto out; + } + + } while (le64_to_cpu(bg->bg_next_group)); + } + + ret = -EINVAL; +out: + brelse(ac_bh); + + /* + * caller has to release the gd_bh properly. + */ + return ret; +} + +/* + * XXX: helper to validate and adjust moving goal. + */ +static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, + struct ocfs2_move_extents *range) +{ + int ret, goal_bit = 0; + + struct buffer_head *gd_bh = NULL; + struct ocfs2_group_desc *bg = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int c_to_b = 1 << (osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits); + + /* + * make goal become cluster aligned. + */ + range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, + range->me_goal); + /* + * moving goal is not allowd to start with a group desc blok(#0 blk) + * let's compromise to the latter cluster. + */ + if (range->me_goal == le64_to_cpu(bg->bg_blkno)) + range->me_goal += c_to_b; + + /* + * validate goal sits within global_bitmap, and return the victim + * group desc + */ + ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, + &goal_bit, &gd_bh); + if (ret) + goto out; + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + /* + * movement is not gonna cross two groups. + */ + if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < + range->me_len) { + ret = -EINVAL; + goto out; + } + /* + * more exact validations/adjustments will be performed later during + * moving operation for each extent range. + */ + mlog(0, "extents get ready to be moved to #%llu block\n", + range->me_goal); + +out: + brelse(gd_bh); + + return ret; +} + +static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, + int *goal_bit, u32 move_len, u32 max_hop, + u32 *phys_cpos) +{ + int i, used, last_free_bits = 0, base_bit = *goal_bit; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(gd->bg_blkno)); + + for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { + + used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); + if (used) { + /* + * we even tried searching the free chunk by jumping + * a 'max_hop' distance, but still failed. 
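The victim-bit math in ocfs2_find_victim_alloc_group() just scales a block offset inside the group into cluster units, since each bit of a global-bitmap group covers one cluster. A worked example, assuming 4 KB blocks and 32 KB clusters:

#include <stdint.h>
#include <stdio.h>

/* Each bit covers 2^(clustersize_bits - blocksize_bits) blocks. */
static unsigned int victim_bit(uint64_t vict_blkno, uint64_t group_blkno,
                               unsigned int clustersize_bits,
                               unsigned int blocksize_bits)
{
        unsigned int bits_per_unit = clustersize_bits - blocksize_bits;

        return (unsigned int)((vict_blkno - group_blkno) >> bits_per_unit);
}

int main(void)
{
        /* 4 KB blocks (2^12), 32 KB clusters (2^15): 8 blocks per bit,
         * so a block 40 blocks past the group start lands in bit 5. */
        printf("%u\n", victim_bit(1000040, 1000000, 15, 12));
        return 0;
}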
+ */ + if ((i - base_bit) > max_hop) { + *phys_cpos = 0; + break; + } + + if (last_free_bits) + last_free_bits = 0; + + continue; + } else + last_free_bits++; + + if (last_free_bits == move_len) { + *goal_bit = i; + *phys_cpos = base_cpos + i; + break; + } + } + + mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); +} + +static int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + int ret; + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl = + (struct ocfs2_chain_list *) &di->id2.i_chain; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); + ocfs2_journal_dirty(handle, di_bh); + +out: + return ret; +} + +static inline int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + void *bitmap = bg->bg_bitmap; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + + /* All callers get the descriptor via + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); + + mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, + num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. 
num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } + while (num_bits--) + ocfs2_set_bit(bit_off++, bitmap); + + ocfs2_journal_dirty(handle, group_bh); + +bail: + return status; +} + +static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, + u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, + u32 len, int ext_flags) +{ + int ret, credits = 0, extra_blocks = 0, goal_bit = 0; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *gb_inode = NULL; + struct buffer_head *gb_bh = NULL; + struct buffer_head *gd_bh = NULL; + struct ocfs2_group_desc *gd; + struct ocfs2_refcount_tree *ref_tree = NULL; + u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, + context->range->me_threshold); + u64 phys_blkno, new_phys_blkno; + + phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + + if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + BUG_ON(!context->refcount_loc); + + ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + context->refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, + &context->meta_ac, + NULL, extra_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * need to count 2 extra credits for global_bitmap inode and + * group descriptor. + */ + credits += OCFS2_INODE_UPDATE_CREDITS + 1; + + /* + * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() + * logic, while we still need to lock the global_bitmap. + */ + gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!gb_inode) { + mlog(ML_ERROR, "unable to get global_bitmap inode\n"); + ret = -EIO; + goto out; + } + + mutex_lock(&gb_inode->i_mutex); + + ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_unlock_gb_mutex; + } + + mutex_lock(&tl_inode->i_mutex); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock_tl_inode; + } + + new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); + ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, + &goal_bit, &gd_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * probe the victim cluster group to find a proper + * region to fit wanted movement, it even will perfrom + * a best-effort attempt by compromising to a threshold + * around the goal. 
+ */ + ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, + new_phys_cpos); + if (!new_phys_cpos) { + ret = -ENOSPC; + goto out_commit; + } + + ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, + *new_phys_cpos, ext_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, + le16_to_cpu(gd->bg_chain)); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, + goal_bit, len); + if (ret) + mlog_errno(ret); + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + brelse(gd_bh); + +out_unlock_tl_inode: + mutex_unlock(&tl_inode->i_mutex); + + ocfs2_inode_unlock(gb_inode, 1); +out_unlock_gb_mutex: + mutex_unlock(&gb_inode->i_mutex); + brelse(gb_bh); + iput(gb_inode); + +out: + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +/* + * Helper to calculate the defraging length in one run according to threshold. + */ +static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, + u32 threshold, int *skip) +{ + if ((*alloc_size + *len_defraged) < threshold) { + /* + * proceed defragmentation until we meet the thresh + */ + *len_defraged += *alloc_size; + } else if (*len_defraged == 0) { + /* + * XXX: skip a large extent. + */ + *skip = 1; + } else { + /* + * split this extent to coalesce with former pieces as + * to reach the threshold. + * + * we're done here with one cycle of defragmentation + * in a size of 'thresh', resetting 'len_defraged' + * forces a new defragmentation. + */ + *alloc_size = threshold - *len_defraged; + *len_defraged = 0; + } +} + +static int __ocfs2_move_extents_range(struct buffer_head *di_bh, + struct ocfs2_move_extents_context *context) +{ + int ret = 0, flags, do_defrag, skip = 0; + u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; + u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; + + struct inode *inode = context->inode; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_move_extents *range = context->range; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if ((inode->i_size == 0) || (range->me_len == 0)) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + context->refcount_loc = le64_to_cpu(di->i_refcount_loc); + + ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&context->dealloc); + + /* + * TO-DO XXX: + * + * - xattr extents. + */ + + do_defrag = context->auto_defrag; + + /* + * extents moving happens in unit of clusters, for the sake + * of simplicity, we may ignore two clusters where 'byte_start' + * and 'byte_start + len' were within. 
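ocfs2_calc_extent_defrag_len() decides, per extent, whether to keep coalescing, skip, or split so that one defrag pass ends exactly at the threshold. A standalone walk over a made-up extent list shows all three branches, using a threshold of 256 clusters, i.e. 1 MB at a 4 KB cluster size:

#include <stdio.h>

/* Either keep coalescing, skip an already-large extent, or trim the current
 * one so the coalesced piece ends exactly at the threshold. */
static void calc_defrag_len(unsigned int *alloc_size, unsigned int *len_defraged,
                            unsigned int threshold, int *skip)
{
        if (*alloc_size + *len_defraged < threshold) {
                *len_defraged += *alloc_size;            /* keep going */
        } else if (*len_defraged == 0) {
                *skip = 1;                               /* lone extent is big enough already */
        } else {
                *alloc_size = threshold - *len_defraged; /* finish this chunk exactly */
                *len_defraged = 0;
        }
}

int main(void)
{
        unsigned int extents[] = { 300, 30, 50, 100, 300 };
        unsigned int len_defraged = 0;

        for (int i = 0; i < 5; i++) {
                unsigned int alloc_size = extents[i];
                int skip = 0;

                calc_defrag_len(&alloc_size, &len_defraged, 256, &skip);
                if (skip)
                        printf("extent %u: skipped (already >= threshold)\n", extents[i]);
                else
                        printf("extent %u: move %u clusters, coalesced so far %u\n",
                               extents[i], alloc_size, len_defraged);
        }
        return 0;
}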
+ */ + move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); + len_to_move = (range->me_start + range->me_len) >> + osb->s_clustersize_bits; + if (len_to_move >= move_start) + len_to_move -= move_start; + else + len_to_move = 0; + + if (do_defrag) { + defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; + if (defrag_thresh <= 1) + goto done; + } else + new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + range->me_goal); + + mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " + "thresh: %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)range->me_start, + (unsigned long long)range->me_len, + move_start, len_to_move, defrag_thresh); + + cpos = move_start; + while (len_to_move) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, + &flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > len_to_move) + alloc_size = len_to_move; + + /* + * XXX: how to deal with a hole: + * + * - skip the hole of course + * - force a new defragmentation + */ + if (!phys_cpos) { + if (do_defrag) + len_defraged = 0; + + goto next; + } + + if (do_defrag) { + ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, + defrag_thresh, &skip); + /* + * skip large extents + */ + if (skip) { + skip = 0; + goto next; + } + + mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " + "alloc_size: %u, len_defraged: %u\n", + cpos, phys_cpos, alloc_size, len_defraged); + + ret = ocfs2_defrag_extent(context, cpos, phys_cpos, + &alloc_size, flags); + } else { + ret = ocfs2_move_extent(context, cpos, phys_cpos, + &new_phys_cpos, alloc_size, + flags); + + new_phys_cpos += alloc_size; + } + + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + context->clusters_moved += alloc_size; +next: + cpos += alloc_size; + len_to_move -= alloc_size; + } + +done: + range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; + +out: + range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, + context->clusters_moved); + range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, + context->new_phys_cpos); + + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + + return ret; +} + +static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) +{ + int status; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_dinode *di; + struct buffer_head *di_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!inode) + return -ENOENT; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* + * This prevents concurrent writes from other nodes + */ + status = ocfs2_rw_lock(inode, 1); + if (status) { + mlog_errno(status); + goto out; + } + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status) { + mlog_errno(status); + goto out_rw_unlock; + } + + /* + * rememer ip_xattr_sem also needs to be held if necessary + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + status = __ocfs2_move_extents_range(di_bh, context); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (status) { + mlog_errno(status); + goto out_inode_unlock; + } + + /* + * We update ctime for these changes + */ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_inode_unlock; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status) { + mlog_errno(status); + goto out_commit; + } + + di = (struct 
ocfs2_dinode *)di_bh->b_data; + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_inode_unlock: + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); +out_rw_unlock: + ocfs2_rw_unlock(inode, 1); +out: + mutex_unlock(&inode->i_mutex); + + return status; +} + +int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) +{ + int status; + + struct inode *inode = filp->f_path.dentry->d_inode; + struct ocfs2_move_extents range; + struct ocfs2_move_extents_context *context = NULL; + + status = mnt_want_write(filp->f_path.mnt); + if (status) + return status; + + if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) + goto out; + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + status = -EPERM; + goto out; + } + + context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); + if (!context) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + + context->inode = inode; + context->file = filp; + + if (argp) { + if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, + sizeof(range))) { + status = -EFAULT; + goto out; + } + } else { + status = -EINVAL; + goto out; + } + + if (range.me_start > i_size_read(inode)) + goto out; + + if (range.me_start + range.me_len > i_size_read(inode)) + range.me_len = i_size_read(inode) - range.me_start; + + context->range = ⦥ + + if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { + context->auto_defrag = 1; + /* + * ok, the default theshold for the defragmentation + * is 1M, since our maximum clustersize was 1M also. + * any thought? + */ + if (!range.me_threshold) + range.me_threshold = 1024 * 1024; + + if (range.me_threshold > i_size_read(inode)) + range.me_threshold = i_size_read(inode); + + if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) + context->partial = 1; + } else { + /* + * first best-effort attempt to validate and adjust the goal + * (physical address in block), while it can't guarantee later + * operation can succeed all the time since global_bitmap may + * change a bit over time. + */ + + status = ocfs2_validate_and_adjust_move_goal(inode, &range); + if (status) + goto out; + } + + status = ocfs2_move_extents(context); + if (status) + mlog_errno(status); +out: + /* + * movement/defragmentation may end up being partially completed, + * that's the reason why we need to return userspace the finished + * length and new_offset even if failure happens somewhere. + */ + if (argp) { + if (copy_to_user((struct ocfs2_move_extents *)argp, &range, + sizeof(range))) + status = -EFAULT; + } + + kfree(context); + + mnt_drop_write(filp->f_path.mnt); + + return status; +} diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h new file mode 100644 index 00000000..4e143e81 --- /dev/null +++ b/fs/ocfs2/move_extents.h @@ -0,0 +1,22 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * move_extents.h + * + * Copyright (C) 2011 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#ifndef OCFS2_MOVE_EXTENTS_H +#define OCFS2_MOVE_EXTENTS_H + +int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); + +#endif /* OCFS2_MOVE_EXTENTS_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c new file mode 100644 index 00000000..e5d738cd --- /dev/null +++ b/fs/ocfs2/namei.c @@ -0,0 +1,2501 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.c + * + * Create and rename file, directory, symlinks + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linux Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "xattr.h" +#include "acl.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac); + +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup); + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + char *name, + struct ocfs2_dir_lookup_result *lookup, + struct inode *orphan_dir_inode); + +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + const char *symname); + +/* An orphan dir name is an 8 byte value, printed as a hex string */ +#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) + +static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int status; + u64 blkno; + struct inode *inode = NULL; + struct dentry *ret; + struct ocfs2_inode_info *oi; + + trace_ocfs2_lookup(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, 0); + + if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { + ret = ERR_PTR(-ENAMETOOLONG); + goto bail; + } + + status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ret = 
ERR_PTR(status); + goto bail; + } + + status = ocfs2_lookup_ino_from_name(dir, dentry->d_name.name, + dentry->d_name.len, &blkno); + if (status < 0) + goto bail_add; + + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0); + if (IS_ERR(inode)) { + ret = ERR_PTR(-EACCES); + goto bail_unlock; + } + + oi = OCFS2_I(inode); + /* Clear any orphaned state... If we were able to look up the + * inode from a directory, it certainly can't be orphaned. We + * might have the bad state from a node which intended to + * orphan this inode but crashed before it could commit the + * unlink. */ + spin_lock(&oi->ip_lock); + oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + +bail_add: + ret = d_splice_alias(inode, dentry); + + if (inode) { + /* + * If d_splice_alias() finds a DCACHE_DISCONNECTED + * dentry, it will d_move() it on top of ourse. The + * return value will indicate this however, so in + * those cases, we switch them around for the locking + * code. + * + * NOTE: This dentry already has ->d_op set from + * ocfs2_get_parent() and ocfs2_get_dentry() + */ + if (ret) + dentry = ret; + + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + ret = ERR_PTR(status); + goto bail_unlock; + } + } else + ocfs2_dentry_attach_gen(dentry); + +bail_unlock: + /* Don't drop the cluster lock until *after* the d_add -- + * unlink on another node will message us to remove that + * dentry under this lock so otherwise we can race this with + * the downconvert thread and have a stale dentry. */ + ocfs2_inode_unlock(dir, 0); + +bail: + + trace_ocfs2_lookup_ret(ret); + + return ret; +} + +static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) { + mlog(ML_ERROR, "new_inode failed!\n"); + return NULL; + } + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + if (S_ISDIR(mode)) + inode->i_nlink = 2; + else + inode->i_nlink = 1; + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + return inode; +} + +static int ocfs2_mknod(struct inode *dir, + struct dentry *dentry, + int mode, + dev_t dev) +{ + int status = 0; + struct buffer_head *parent_fe_bh = NULL; + handle_t *handle = NULL; + struct ocfs2_super *osb; + struct ocfs2_dinode *dirfe; + struct buffer_head *new_fe_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + int want_clusters = 0; + int want_meta = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; + + trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long)dev, mode); + + dquot_initialize(dir); + + /* get our super block */ + osb = OCFS2_SB(dir->i_sb); + + status = ocfs2_inode_lock(dir, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) { + status = -EMLINK; + goto leave; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!ocfs2_read_links_count(dirfe)) { + /* can't make a file in a deleted directory. 
*/ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. */ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* reserve an inode spot */ + status = ocfs2_reserve_new_inode(osb, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); + goto leave; + } + } + + /* calculate meta data/clusters for setting security and acl xattr */ + status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, + &si, &want_clusters, + &xattr_credits, &want_meta); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* Reserve a cluster if creating an extent based directory. */ + if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { + want_clusters += 1; + + /* Dir indexing requires extra space as well */ + if (ocfs2_supports_indexed_dirs(osb)) + want_meta++; + } + + status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, + S_ISDIR(mode), + xattr_credits)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + + status = dquot_alloc_inode(inode); + if (status) + goto leave; + did_quota_inode = 1; + + /* do the real work now. */ + status = ocfs2_mknod_locked(osb, dir, inode, dev, + &new_fe_bh, parent_fe_bh, handle, + inode_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(mode)) { + status = ocfs2_fill_new_dir(osb, handle, dir, inode, + new_fe_bh, data_ac, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(dir), + parent_fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + ocfs2_add_links_count(dirfe, 1); + ocfs2_journal_dirty(handle, parent_fe_bh); + inc_nlink(dir); + } + + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + meta_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + meta_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + /* + * Do this before adding the entry to the directory. We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. 
+ */ + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_fe_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + insert_inode_hash(inode); + d_instantiate(dentry, inode); + status = 0; +leave: + if (status < 0 && did_quota_inode) + dquot_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); + + brelse(new_fe_bh); + brelse(parent_fe_bh); + kfree(si.name); + kfree(si.value); + + ocfs2_free_dir_lookup_result(&lookup); + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + /* + * We should call iput after the i_mutex of the bitmap been + * unlocked in ocfs2_free_alloc_context, or the + * ocfs2_delete_inode will mutex_lock again. + */ + if ((status < 0) && inode) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; + clear_nlink(inode); + iput(inode); + } + + if (status) + mlog_errno(status); + + return status; +} + +static int __ocfs2_mknod_locked(struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac, + u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) +{ + int status = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *fe = NULL; + struct ocfs2_extent_list *fel; + u16 feat; + + *new_fe_bh = NULL; + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); + OCFS2_I(inode)->ip_blkno = fe_blkno; + spin_lock(&osb->osb_lock); + inode->i_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + *new_fe_bh = sb_getblk(osb->sb, fe_blkno); + if (!*new_fe_bh) { + status = -EIO; + mlog_errno(status); + goto leave; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + *new_fe_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data; + memset(fe, 0, osb->sb->s_blocksize); + + fe->i_generation = cpu_to_le32(inode->i_generation); + fe->i_fs_generation = cpu_to_le32(osb->fs_generation); + fe->i_blkno = cpu_to_le64(fe_blkno); + fe->i_suballoc_loc = cpu_to_le64(suballoc_loc); + fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); + fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); + fe->i_uid = cpu_to_le32(inode->i_uid); + fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_mode = cpu_to_le16(inode->i_mode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); + + ocfs2_set_links_count(fe, inode->i_nlink); + + fe->i_last_eb_blk = 0; + strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); + le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); + fe->i_atime = fe->i_ctime = fe->i_mtime = + cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = + cpu_to_le32(CURRENT_TIME.tv_nsec); + fe->i_dtime = 0; + + /* + * If supported, directories start with inline data. If inline + * isn't supported, but indexing is, we start them as indexed. 
+ */ + feat = le16_to_cpu(fe->i_dyn_features); + if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { + fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); + + fe->id2.i_data.id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(osb->sb, fe)); + } else { + fel = &fe->id2.i_list; + fel->l_tree_depth = 0; + fel->l_next_free_rec = 0; + fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); + } + + ocfs2_journal_dirty(handle, *new_fe_bh); + + ocfs2_populate_inode(inode, fe, 1); + ocfs2_ci_set_new(osb, INODE_CACHE(inode)); + if (!ocfs2_mount_local(osb)) { + status = ocfs2_create_new_inode_locks(inode); + if (status < 0) + mlog_errno(status); + } + + status = 0; /* error in ocfs2_create_new_inode_locks is not + * critical */ + +leave: + if (status < 0) { + if (*new_fe_bh) { + brelse(*new_fe_bh); + *new_fe_bh = NULL; + } + } + + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + u64 suballoc_loc, fe_blkno = 0; + u16 suballoc_bit; + + *new_fe_bh = NULL; + + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); + if (status < 0) { + mlog_errno(status); + return status; + } + + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + parent_fe_bh, handle, inode_ac, + fe_blkno, suballoc_loc, suballoc_bit); +} + +static int ocfs2_mkdir(struct inode *dir, + struct dentry *dentry, + int mode) +{ + int ret; + + trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, + OCFS2_I(dir)->ip_blkno, mode); + ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_create(struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int ret; + + trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); + ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + + trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno, + old_dentry->d_name.len, old_dentry->d_name.name, + dentry->d_name.len, dentry->d_name.name); + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + dquot_initialize(dir); + + err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + return err; + } + + if (!dir->i_nlink) { + err = -ENOENT; + goto out; + } + + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (err) + goto out; + + err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (err < 0) { + mlog_errno(err); + goto out; + } + + err = ocfs2_inode_lock(inode, &fe_bh, 1); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + goto out; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + if 
(ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) { + err = -EMLINK; + goto out_unlock_inode; + } + + handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + mlog_errno(err); + goto out_unlock_inode; + } + + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + + err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (err < 0) { + mlog_errno(err); + goto out_commit; + } + + inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; + ocfs2_set_links_count(fe, inode->i_nlink); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(handle, fe_bh); + + err = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, + parent_fe_bh, &lookup); + if (err) { + ocfs2_add_links_count(fe, -1); + drop_nlink(inode); + mlog_errno(err); + goto out_commit; + } + + err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (err) { + mlog_errno(err); + goto out_commit; + } + + ihold(inode); + d_instantiate(dentry, inode); + +out_commit: + ocfs2_commit_trans(osb, handle); + ocfs2_unblock_signals(&oldset); +out_unlock_inode: + ocfs2_inode_unlock(inode, 1); + +out: + ocfs2_inode_unlock(dir, 1); + + brelse(fe_bh); + brelse(parent_fe_bh); + + ocfs2_free_dir_lookup_result(&lookup); + + if (err) + mlog_errno(err); + + return err; +} + +/* + * Takes and drops an exclusive lock on the given dentry. This will + * force other nodes to drop it. + */ +static int ocfs2_remote_dentry_delete(struct dentry *dentry) +{ + int ret; + + ret = ocfs2_dentry_lock(dentry, 1); + if (ret) + mlog_errno(ret); + else + ocfs2_dentry_unlock(dentry, 1); + + return ret; +} + +static inline int inode_is_unlinkable(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) { + if (inode->i_nlink == 2) + return 1; + return 0; + } + + if (inode->i_nlink == 1) + return 1; + return 0; +} + +static int ocfs2_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + int child_locked = 0; + struct inode *inode = dentry->d_inode; + struct inode *orphan_dir = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + u64 blkno; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_node_bh = NULL; + handle_t *handle = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + + trace_ocfs2_unlink(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + dquot_initialize(dir); + + BUG_ON(dentry->d_parent->d_inode != dir); + + if (inode == osb->root_inode) + return -EPERM; + + status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1, + OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + status = ocfs2_find_files_on_disk(dentry->d_name.name, + dentry->d_name.len, &blkno, dir, + &lookup); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + if (OCFS2_I(inode)->ip_blkno != blkno) { + status = -ENOENT; + + trace_ocfs2_unlink_noent( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, + OCFS2_I(inode)->ip_flags); + goto leave; + } + + status = ocfs2_inode_lock(inode, &fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + 
mlog_errno(status); + goto leave; + } + child_locked = 1; + + if (S_ISDIR(inode->i_mode)) { + if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) { + status = -ENOTEMPTY; + goto leave; + } + } + + status = ocfs2_remote_dentry_delete(dentry); + if (status < 0) { + /* This remote delete should succeed under all normal + * circumstances. */ + mlog_errno(status); + goto leave; + } + + if (inode_is_unlinkable(inode)) { + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + OCFS2_I(inode)->ip_blkno, + orphan_name, &orphan_insert); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (inode_is_unlinkable(inode)) { + status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + /* delete the name from the parent dir */ + status = ocfs2_delete_entry(handle, dir, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(inode->i_mode)) + drop_nlink(inode); + drop_nlink(inode); + ocfs2_set_links_count(fe, inode->i_nlink); + ocfs2_journal_dirty(handle, fe_bh); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + if (S_ISDIR(inode->i_mode)) + drop_nlink(dir); + + status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh); + if (status < 0) { + mlog_errno(status); + if (S_ISDIR(inode->i_mode)) + inc_nlink(dir); + } + +leave: + if (handle) + ocfs2_commit_trans(osb, handle); + + if (child_locked) + ocfs2_inode_unlock(inode, 1); + + ocfs2_inode_unlock(dir, 1); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + brelse(fe_bh); + brelse(parent_node_bh); + + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&lookup); + + if (status) + mlog_errno(status); + + return status; +} + +/* + * The only place this should be used is rename! + * if they have the same id, then the 1st one is the only one locked. + */ +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2) +{ + int status; + struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); + struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); + struct buffer_head **tmpbh; + struct inode *tmpinode; + + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* we always want to lock the one with the lower lockid first. 
*/ + if (oi1->ip_blkno != oi2->ip_blkno) { + if (oi1->ip_blkno < oi2->ip_blkno) { + /* switch id1 and id2 around */ + tmpbh = bh2; + bh2 = bh1; + bh1 = tmpbh; + + tmpinode = inode2; + inode2 = inode1; + inode1 = tmpinode; + } + /* lock id2 */ + status = ocfs2_inode_lock_nested(inode2, bh2, 1, + OI_LS_RENAME1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + } + + /* lock id1 */ + status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2); + if (status < 0) { + /* + * An error return must mean that no cluster locks + * were held on function exit. + */ + if (oi1->ip_blkno != oi2->ip_blkno) { + ocfs2_inode_unlock(inode2, 1); + brelse(*bh2); + *bh2 = NULL; + } + + if (status != -ENOENT) + mlog_errno(status); + } + + trace_ocfs2_double_lock_end( + (unsigned long long)OCFS2_I(inode1)->ip_blkno, + (unsigned long long)OCFS2_I(inode2)->ip_blkno); + +bail: + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) +{ + ocfs2_inode_unlock(inode1, 1); + + if (inode1 != inode2) + ocfs2_inode_unlock(inode2, 1); +} + +static int ocfs2_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; + int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct inode *orphan_dir = NULL; + struct ocfs2_dinode *newfe = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *newfe_bh = NULL; + struct buffer_head *old_inode_bh = NULL; + struct ocfs2_super *osb = NULL; + u64 newfe_blkno, old_de_ino; + handle_t *handle = NULL; + struct buffer_head *old_dir_bh = NULL; + struct buffer_head *new_dir_bh = NULL; + nlink_t old_dir_nlink = old_dir->i_nlink; + struct ocfs2_dinode *old_di; + struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, }; + struct ocfs2_dir_lookup_result target_lookup_res = { NULL, }; + struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct ocfs2_dir_lookup_result target_insert = { NULL, }; + + /* At some point it might be nice to break this function up a + * bit. */ + + trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry, + old_dentry->d_name.len, old_dentry->d_name.name, + new_dentry->d_name.len, new_dentry->d_name.name); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + osb = OCFS2_SB(old_dir->i_sb); + + if (new_inode) { + if (!igrab(new_inode)) + BUG(); + } + + /* Assume a directory hierarchy thusly: + * a/b/c + * a/d + * a,b,c, and d are all directories. + * + * from cwd of 'a' on both nodes: + * node1: mv b/c d + * node2: mv d b/c + * + * And that's why, just like the VFS, we need a file system + * rename lock. */ + if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) { + status = ocfs2_rename_lock(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + rename_lock = 1; + } + + /* if old and new are the same, this'll just do one lock. 
*/ + status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, + &new_dir_bh, new_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + parents_locked = 1; + + /* make sure both dirs have bhs + * get an extra ref on old_dir_bh if old==new */ + if (!new_dir_bh) { + if (old_dir_bh) { + new_dir_bh = old_dir_bh; + get_bh(new_dir_bh); + } else { + mlog(ML_ERROR, "no old_dir_bh!\n"); + status = -EIO; + goto bail; + } + } + + /* + * Aside from allowing a meta data update, the locking here + * also ensures that the downconvert thread on other nodes + * won't have to concurrently downconvert the inode and the + * dentry locks. + */ + status = ocfs2_inode_lock_nested(old_inode, &old_inode_bh, 1, + OI_LS_PARENT); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + old_child_locked = 1; + + status = ocfs2_remote_dentry_delete(old_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(old_inode->i_mode)) { + u64 old_inode_parent; + + update_dot_dot = 1; + status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent, + old_inode, + &old_inode_dot_dot_res); + if (status) { + status = -EIO; + goto bail; + } + + if (old_inode_parent != OCFS2_I(old_dir)->ip_blkno) { + status = -EIO; + goto bail; + } + + if (!new_inode && new_dir != old_dir && + new_dir->i_nlink >= ocfs2_link_max(osb)) { + status = -EMLINK; + goto bail; + } + } + + status = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de_ino); + if (status) { + status = -ENOENT; + goto bail; + } + + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + if (old_de_ino != OCFS2_I(old_inode)->ip_blkno) { + status = -ENOENT; + goto bail; + } + + /* check if the target already exists (in which case we need + * to delete it */ + status = ocfs2_find_files_on_disk(new_dentry->d_name.name, + new_dentry->d_name.len, + &newfe_blkno, new_dir, + &target_lookup_res); + /* The only error we allow here is -ENOENT because the new + * file not existing is perfectly valid. */ + if ((status < 0) && (status != -ENOENT)) { + /* If we cannot find the file specified we should just */ + /* return the error... */ + mlog_errno(status); + goto bail; + } + if (status == 0) + target_exists = 1; + + if (!target_exists && new_inode) { + /* + * Target was unlinked by another node while we were + * waiting to get to ocfs2_rename(). There isn't + * anything we can do here to help the situation, so + * bubble up the appropriate error. + */ + status = -ENOENT; + goto bail; + } + + /* In case we need to overwrite an existing file, we blow it + * away first */ + if (target_exists) { + /* VFS didn't think there existed an inode here, but + * someone else in the cluster must have raced our + * rename to create one. Today we error cleanly, in + * the future we should consider calling iget to build + * a new struct inode for this entry. 
*/ + if (!new_inode) { + status = -EACCES; + + trace_ocfs2_rename_target_exists(new_dentry->d_name.len, + new_dentry->d_name.name); + goto bail; + } + + if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { + status = -EACCES; + + trace_ocfs2_rename_disagree( + (unsigned long long)OCFS2_I(new_inode)->ip_blkno, + (unsigned long long)newfe_blkno, + OCFS2_I(new_inode)->ip_flags); + goto bail; + } + + status = ocfs2_inode_lock(new_inode, &newfe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + new_child_locked = 1; + + status = ocfs2_remote_dentry_delete(new_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + newfe = (struct ocfs2_dinode *) newfe_bh->b_data; + + trace_ocfs2_rename_over_existing( + (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? + (unsigned long long)newfe_bh->b_blocknr : 0ULL); + + if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + OCFS2_I(new_inode)->ip_blkno, + orphan_name, &orphan_insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + } else { + BUG_ON(new_dentry->d_parent->d_inode != new_dir); + + status = ocfs2_check_dir_for_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, + new_dentry->d_name.name, + new_dentry->d_name.len, + &target_insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + if (target_exists) { + if (S_ISDIR(new_inode->i_mode)) { + if (new_inode->i_nlink != 2 || + !ocfs2_empty_dir(new_inode)) { + status = -ENOTEMPTY; + goto bail; + } + } + status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode), + newfe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode) || + (ocfs2_read_links_count(newfe) == 1)) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* change the dirent to point to the correct inode */ + status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, + old_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + new_dir->i_version++; + + if (S_ISDIR(new_inode->i_mode)) + ocfs2_set_links_count(newfe, 0); + else + ocfs2_add_links_count(newfe, -1); + ocfs2_journal_dirty(handle, newfe_bh); + } else { + /* if the name was not found in new_dir, add it now */ + status = ocfs2_add_entry(handle, new_dentry, old_inode, + OCFS2_I(old_inode)->ip_blkno, + new_dir_bh, &target_insert); + } + + old_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_inode); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), + old_inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status >= 0) { + old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; + + old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); + old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(handle, old_inode_bh); + } else + mlog_errno(status); + + /* + * Now that the name has been added to new_dir, remove the old name. + * + * We don't keep any directory entry context around until now + * because the insert might have changed the type of directory + * we're dealing with. 
+ */ + status = ocfs2_find_entry(old_dentry->d_name.name, + old_dentry->d_name.len, old_dir, + &old_entry_lookup); + if (status) + goto bail; + + status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (new_inode) { + new_inode->i_nlink--; + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + + if (update_dot_dot) { + status = ocfs2_update_entry(old_inode, handle, + &old_inode_dot_dot_res, new_dir); + old_dir->i_nlink--; + if (new_inode) { + new_inode->i_nlink--; + } else { + inc_nlink(new_dir); + mark_inode_dirty(new_dir); + } + } + mark_inode_dirty(old_dir); + ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh); + if (new_inode) { + mark_inode_dirty(new_inode); + ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh); + } + + if (old_dir != new_dir) { + /* Keep the same times on both directories.*/ + new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; + + /* + * This will also pick up the i_nlink change from the + * block above. + */ + ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh); + } + + if (old_dir_nlink != old_dir->i_nlink) { + if (!old_dir_bh) { + mlog(ML_ERROR, "need to change nlink for old dir " + "%llu from %d to %d but bh is NULL!\n", + (unsigned long long)OCFS2_I(old_dir)->ip_blkno, + (int)old_dir_nlink, old_dir->i_nlink); + } else { + struct ocfs2_dinode *fe; + status = ocfs2_journal_access_di(handle, + INODE_CACHE(old_dir), + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + fe = (struct ocfs2_dinode *) old_dir_bh->b_data; + ocfs2_set_links_count(fe, old_dir->i_nlink); + ocfs2_journal_dirty(handle, old_dir_bh); + } + } + ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); + status = 0; +bail: + if (rename_lock) + ocfs2_rename_unlock(osb); + + if (handle) + ocfs2_commit_trans(osb, handle); + + if (parents_locked) + ocfs2_double_unlock(old_dir, new_dir); + + if (old_child_locked) + ocfs2_inode_unlock(old_inode, 1); + + if (new_child_locked) + ocfs2_inode_unlock(new_inode, 1); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + if (new_inode) + sync_mapping_buffers(old_inode->i_mapping); + + if (new_inode) + iput(new_inode); + + ocfs2_free_dir_lookup_result(&target_lookup_res); + ocfs2_free_dir_lookup_result(&old_entry_lookup); + ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res); + ocfs2_free_dir_lookup_result(&orphan_insert); + ocfs2_free_dir_lookup_result(&target_insert); + + brelse(newfe_bh); + brelse(old_inode_bh); + brelse(old_dir_bh); + brelse(new_dir_bh); + + if (status) + mlog_errno(status); + + return status; +} + +/* + * we expect i_size = strlen(symname). Copy symname into the file + * data, including the null terminator. + */ +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + const char *symname) +{ + struct buffer_head **bhs = NULL; + const char *c; + struct super_block *sb = osb->sb; + u64 p_blkno, p_blocks; + int virtual, blocks, status, i, bytes_left; + + bytes_left = i_size_read(inode) + 1; + /* we can't trust i_blocks because we're actually going to + * write i_size + 1 bytes. */ + blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks, + i_size_read(inode), blocks); + + /* Sanity check -- make sure we're going to fit. 
*/ + if (bytes_left > + ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks, + NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* links can never be larger than one cluster so we know this + * is all going to be contiguous, but do a sanity check + * anyway. */ + if ((p_blocks << sb->s_blocksize_bits) < bytes_left) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + virtual = 0; + while(bytes_left > 0) { + c = &symname[virtual * sb->s_blocksize]; + + bhs[virtual] = sb_getblk(sb, p_blkno); + if (!bhs[virtual]) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), + bhs[virtual]); + + status = ocfs2_journal_access(handle, INODE_CACHE(inode), + bhs[virtual], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[virtual]->b_data, 0, sb->s_blocksize); + + memcpy(bhs[virtual]->b_data, c, + (bytes_left > sb->s_blocksize) ? sb->s_blocksize : + bytes_left); + + ocfs2_journal_dirty(handle, bhs[virtual]); + + virtual++; + p_blkno++; + bytes_left -= sb->s_blocksize; + } + + status = 0; +bail: + + if (bhs) { + for(i = 0; i < blocks; i++) + brelse(bhs[i]); + kfree(bhs); + } + + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + int status, l, credits; + u64 newsize; + struct ocfs2_super *osb = NULL; + struct inode *inode = NULL; + struct super_block *sb; + struct buffer_head *new_fe_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_dinode *dirfe; + handle_t *handle = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *xattr_ac = NULL; + int want_clusters = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota = 0, did_quota_inode = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; + + trace_ocfs2_symlink_begin(dir, dentry, symname, + dentry->d_name.len, dentry->d_name.name); + + dquot_initialize(dir); + + sb = dir->i_sb; + osb = OCFS2_SB(sb); + + l = strlen(symname) + 1; + + credits = ocfs2_calc_symlink_credits(sb); + + /* lock the parent directory */ + status = ocfs2_inode_lock(dir, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!ocfs2_read_links_count(dirfe)) { + /* can't make a file in a deleted directory. 
*/ + status = -ENOENT; + goto bail; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_reserve_new_inode(osb, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); + goto bail; + } + } + + /* calculate meta data/clusters for setting security xattr */ + if (si.enable) { + status = ocfs2_calc_security_init(dir, &si, &want_clusters, + &xattr_credits, &xattr_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* don't reserve bitmap space for fast symlinks. */ + if (l > ocfs2_fast_symlink_chars(sb)) + want_clusters += 1; + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, credits + xattr_credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + + status = dquot_alloc_inode(inode); + if (status) + goto bail; + did_quota_inode = 1; + + trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + inode->i_mode); + + status = ocfs2_mknod_locked(osb, dir, inode, + 0, &new_fe_bh, parent_fe_bh, handle, + inode_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) new_fe_bh->b_data; + inode->i_rdev = 0; + newsize = l - 1; + if (l > ocfs2_fast_symlink_chars(sb)) { + u32 offset = 0; + + inode->i_op = &ocfs2_symlink_inode_operations; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status) + goto bail; + did_quota = 1; + status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, + new_fe_bh, + handle, data_ac, NULL, + NULL); + if (status < 0) { + if (status != -ENOSPC && status != -EINTR) { + mlog(ML_ERROR, + "Failed to extend file to %llu\n", + (unsigned long long)newsize); + mlog_errno(status); + status = -ENOSPC; + } + goto bail; + } + i_size_write(inode, newsize); + inode->i_blocks = ocfs2_inode_sector_count(inode); + } else { + inode->i_op = &ocfs2_fast_symlink_inode_operations; + memcpy((char *) fe->id2.i_symlink, symname, l); + i_size_write(inode, newsize); + inode->i_blocks = 0; + } + + status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!ocfs2_inode_is_fast_symlink(inode)) { + status = ocfs2_create_symlink_data(osb, handle, inode, + symname); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + xattr_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* + * Do this before adding the entry to the directory. 
We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. + */ + status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_add_entry(handle, dentry, inode, + le64_to_cpu(fe->i_blkno), parent_fe_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + insert_inode_hash(inode); + d_instantiate(dentry, inode); +bail: + if (status < 0 && did_quota) + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status < 0 && did_quota_inode) + dquot_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); + + brelse(new_fe_bh); + brelse(parent_fe_bh); + kfree(si.name); + kfree(si.value); + ocfs2_free_dir_lookup_result(&lookup); + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (xattr_ac) + ocfs2_free_alloc_context(xattr_ac); + if ((status < 0) && inode) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; + clear_nlink(inode); + iput(inode); + } + + if (status) + mlog_errno(status); + + return status; +} + +static int ocfs2_blkno_stringify(u64 blkno, char *name) +{ + int status, namelen; + + namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx", + (long long)blkno); + if (namelen <= 0) { + if (namelen) + status = namelen; + else + status = -EINVAL; + mlog_errno(status); + goto bail; + } + if (namelen != OCFS2_ORPHAN_NAMELEN) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + trace_ocfs2_blkno_stringify(blkno, name, namelen); + + status = 0; +bail: + if (status < 0) + mlog_errno(status); + return status; +} + +static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + struct buffer_head **ret_orphan_dir_bh) +{ + struct inode *orphan_dir_inode; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + ret = -ENOENT; + mlog_errno(ret); + return ret; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + + ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (ret < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + mlog_errno(ret); + return ret; + } + + *ret_orphan_dir = orphan_dir_inode; + *ret_orphan_dir_bh = orphan_dir_bh; + + return 0; +} + +static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + + ret = ocfs2_blkno_stringify(blkno, name); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, lookup); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return 0; +} + +/** + * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for + * insertion of an orphan. + * @osb: ocfs2 file system + * @ret_orphan_dir: Orphan dir inode - returned locked! + * @blkno: Actual block number of the inode to be inserted into orphan dir. 
+ * @lookup: dir lookup result, to be passed back into functions like + * ocfs2_orphan_add + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode, + &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, + blkno, name, lookup); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + *ret_orphan_dir = orphan_dir_inode; + +out: + brelse(orphan_dir_bh); + + if (ret) { + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + } + + if (ret) + mlog_errno(ret); + return ret; +} + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + char *name, + struct ocfs2_dir_lookup_result *lookup, + struct inode *orphan_dir_inode) +{ + struct buffer_head *orphan_dir_bh = NULL; + int status = 0; + struct ocfs2_dinode *orphan_fe; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + + trace_ocfs2_orphan_add_begin( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* we're a cluster, and nlink can change on disk from + * underneath us... */ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, 1); + orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + ocfs2_journal_dirty(handle, orphan_dir_bh); + + status = __ocfs2_add_entry(handle, orphan_dir_inode, name, + OCFS2_ORPHAN_NAMELEN, inode, + OCFS2_I(inode)->ip_blkno, + orphan_dir_bh, lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* + * We're going to journal the change of i_flags and i_orphaned_slot. + * It's safe anyway, though some callers may duplicate the journaling. + * Journaling within the func just make the logic look more + * straightforward. + */ + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; + + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + + ocfs2_journal_dirty(handle, fe_bh); + + trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, + osb->slot_num); + +leave: + brelse(orphan_dir_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* unlike orphan_add, we expect the orphan dir to already be locked here. 
*/ +int ocfs2_orphan_del(struct ocfs2_super *osb, + handle_t *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh) +{ + char name[OCFS2_ORPHAN_NAMELEN + 1]; + struct ocfs2_dinode *orphan_fe; + int status = 0; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + trace_ocfs2_orphan_del( + (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, + name, OCFS2_ORPHAN_NAMELEN); + + /* find it's spot in the orphan directory */ + status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + &lookup); + if (status) { + mlog_errno(status); + goto leave; + } + + /* remove it from the orphan directory */ + status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* do the i_nlink dance! :) */ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, -1); + orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + ocfs2_journal_dirty(handle, orphan_dir_bh); + +leave: + ocfs2_free_dir_lookup_result(&lookup); + + if (status) + mlog_errno(status); + return status; +} + +/** + * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly + * allocated file. This is different from the typical 'add to orphan dir' + * operation in that the inode does not yet exist. This is a problem because + * the orphan dir stringifies the inode block number to come up with it's + * dirent. Obviously if the inode does not yet exist we have a chicken and egg + * problem. This function works around it by calling deeper into the orphan + * and suballoc code than other callers. Use this only by necessity. + * @dir: The directory which this inode will ultimately wind up under - not the + * orphan dir! + * @dir_bh: buffer_head the @dir inode block + * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled + * with the string to be used for orphan dirent. Pass back to the orphan dir + * code. + * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan + * dir code. + * @ret_di_blkno: block number where the new inode will be allocated. + * @orphan_insert: Dir insert context to be passed back into orphan dir code. + * @ret_inode_ac: Inode alloc context to be passed back to the allocator. + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. 
+ */ +static int ocfs2_prep_new_orphaned_file(struct inode *dir, + struct buffer_head *dir_bh, + char *orphan_name, + struct inode **ret_orphan_dir, + u64 *ret_di_blkno, + struct ocfs2_dir_lookup_result *orphan_insert, + struct ocfs2_alloc_context **ret_inode_ac) +{ + int ret; + u64 di_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct inode *orphan_dir = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* reserve an inode spot */ + ret = ocfs2_reserve_new_inode(osb, &inode_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac, + &di_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, + di_blkno, orphan_name, orphan_insert); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (ret == 0) { + *ret_orphan_dir = orphan_dir; + *ret_di_blkno = di_blkno; + *ret_inode_ac = inode_ac; + /* + * orphan_name and orphan_insert are already up to + * date via prepare_orphan_dir + */ + } else { + /* Unroll reserve_new_inode* */ + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + /* Unroll orphan dir locking */ + mutex_unlock(&orphan_dir->i_mutex); + ocfs2_inode_unlock(orphan_dir, 1); + iput(orphan_dir); + } + + brelse(orphan_dir_bh); + + return 0; +} + +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode) +{ + int status, did_quota_inode = 0; + struct inode *inode = NULL; + struct inode *orphan_dir = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *parent_di_bh = NULL; + struct buffer_head *new_di_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + u64 uninitialized_var(di_blkno), suballoc_loc; + u16 suballoc_bit; + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh, + orphan_name, &orphan_dir, + &di_blkno, &orphan_insert, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + status = dquot_alloc_inode(inode); + if (status) + goto leave; + did_quota_inode = 1; + + status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac, + &suballoc_loc, + &suballoc_bit, di_blkno); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + inode->i_nlink = 0; + /* do the real work now. 
*/ + status = __ocfs2_mknod_locked(dir, inode, + 0, &new_di_bh, parent_di_bh, handle, + inode_ac, di_blkno, suballoc_loc, + suballoc_bit); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + di = (struct ocfs2_dinode *)new_di_bh->b_data; + status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* get open lock so that only nodes can't remove it from orphan dir. */ + status = ocfs2_open_lock(inode); + if (status < 0) + mlog_errno(status); + + insert_inode_hash(inode); +leave: + if (status < 0 && did_quota_inode) + dquot_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + if ((status < 0) && inode) { + clear_nlink(inode); + iput(inode); + } + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + brelse(new_di_bh); + + if (!status) + *new_inode = inode; + + ocfs2_free_dir_lookup_result(&orphan_insert); + + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + return status; +} + +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + int status = 0; + struct buffer_head *parent_di_bh = NULL; + handle_t *handle = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *dir_di, *di; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry, + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + if (!dir_di->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. 
*/ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto leave; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mlog_errno(status); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + goto leave; + } + + status = ocfs2_read_inode_block(inode, &di_bh); + if (status < 0) { + mlog_errno(status); + goto orphan_unlock; + } + + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto orphan_unlock; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + di_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); + di->i_orphaned_slot = 0; + inode->i_nlink = 1; + ocfs2_set_links_count(di, inode->i_nlink); + ocfs2_journal_dirty(handle, di_bh); + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_di_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto out_commit; + } + + d_instantiate(dentry, inode); + status = 0; +out_commit: + ocfs2_commit_trans(osb, handle); +orphan_unlock: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); +leave: + + ocfs2_inode_unlock(dir, 1); + + brelse(di_bh); + brelse(parent_di_bh); + brelse(orphan_dir_bh); + + ocfs2_free_dir_lookup_result(&lookup); + + if (status) + mlog_errno(status); + + return status; +} + +const struct inode_operations ocfs2_dir_iops = { + .create = ocfs2_create, + .lookup = ocfs2_lookup, + .link = ocfs2_link, + .unlink = ocfs2_unlink, + .rmdir = ocfs2_unlink, + .symlink = ocfs2_symlink, + .mkdir = ocfs2_mkdir, + .mknod = ocfs2_mknod, + .rename = ocfs2_rename, + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, + .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, +}; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h new file mode 100644 index 00000000..e5d059d4 --- /dev/null +++ b/fs/ocfs2/namei.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_NAMEI_H +#define OCFS2_NAMEI_H + +extern const struct inode_operations ocfs2_dir_iops; + +struct dentry *ocfs2_get_parent(struct dentry *child); + +int ocfs2_orphan_del(struct ocfs2_super *osb, + handle_t *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh); +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode); +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *new_inode, + struct dentry *new_dentry); + +#endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h new file mode 100644 index 00000000..dfb313bd --- /dev/null +++ b/fs/ocfs2/ocfs1_fs_compat.h @@ -0,0 +1,109 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs1_fs_compat.h + * + * OCFS1 volume header definitions. OCFS2 creates valid but unmountable + * OCFS1 volume headers on the first two sectors of an OCFS2 volume. + * This allows an OCFS1 volume to see the partition and cleanly fail to + * mount it. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS1_FS_COMPAT_H +#define _OCFS1_FS_COMPAT_H + +#define OCFS1_MAX_VOL_SIGNATURE_LEN 128 +#define OCFS1_MAX_MOUNT_POINT_LEN 128 +#define OCFS1_MAX_VOL_ID_LENGTH 16 +#define OCFS1_MAX_VOL_LABEL_LEN 64 +#define OCFS1_MAX_CLUSTER_NAME_LEN 64 + +#define OCFS1_MAJOR_VERSION (2) +#define OCFS1_MINOR_VERSION (0) +#define OCFS1_VOLUME_SIGNATURE "OracleCFS" + +/* + * OCFS1 superblock. Lives at sector 0. + */ +struct ocfs1_vol_disk_hdr +{ +/*00*/ __u32 minor_version; + __u32 major_version; +/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN]; +/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN]; +/*108*/ __u64 serial_num; +/*110*/ __u64 device_size; + __u64 start_off; +/*120*/ __u64 bitmap_off; + __u64 publ_off; +/*130*/ __u64 vote_off; + __u64 root_bitmap_off; +/*140*/ __u64 data_start_off; + __u64 root_bitmap_size; +/*150*/ __u64 root_off; + __u64 root_size; +/*160*/ __u64 cluster_size; + __u64 num_nodes; +/*170*/ __u64 num_clusters; + __u64 dir_node_size; +/*180*/ __u64 file_node_size; + __u64 internal_off; +/*190*/ __u64 node_cfg_off; + __u64 node_cfg_size; +/*1A0*/ __u64 new_cfg_off; + __u32 prot_bits; + __s32 excl_mount; +/*1B0*/ +}; + + +struct ocfs1_disk_lock +{ +/*00*/ __u32 curr_master; + __u8 file_lock; + __u8 compat_pad[3]; /* Not in original definition. 
Used to + make the already existing alignment + explicit */ + __u64 last_write_time; +/*10*/ __u64 last_read_time; + __u32 writer_node_num; + __u32 reader_node_num; +/*20*/ __u64 oin_node_map; + __u64 dlock_seq_num; +/*30*/ +}; + +/* + * OCFS1 volume label. Lives at sector 1. + */ +struct ocfs1_vol_label +{ +/*00*/ struct ocfs1_disk_lock disk_lock; +/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN]; +/*70*/ __u16 label_len; +/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH]; +/*82*/ __u16 vol_id_len; +/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN]; +/*A4*/ __u16 cluster_name_len; +/*A6*/ +}; + + +#endif /* _OCFS1_FS_COMPAT_H */ + diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h new file mode 100644 index 00000000..40928585 --- /dev/null +++ b/fs/ocfs2/ocfs2.h @@ -0,0 +1,853 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2.h + * + * Defines macros and structures used in OCFS2 + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_H +#define OCFS2_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* For union ocfs2_dlm_lksb */ +#include "stackglue.h" + +#include "ocfs2_fs.h" +#include "ocfs2_lockid.h" +#include "ocfs2_ioctl.h" + +/* For struct ocfs2_blockcheck_stats */ +#include "blockcheck.h" + +#include "reservations.h" + +/* Caching of metadata buffers */ + +/* Most user visible OCFS2 inodes will have very few pieces of + * metadata, but larger files (including bitmaps, etc) must be taken + * into account when designing an access scheme. We allow a small + * amount of inlined blocks to be stored on an array and grow the + * structure into a rb tree when necessary. */ +#define OCFS2_CACHE_INFO_MAX_ARRAY 2 + +/* Flags for ocfs2_caching_info */ + +enum ocfs2_caching_info_flags { + /* Indicates that the metadata cache is using the inline array */ + OCFS2_CACHE_FL_INLINE = 1<<1, +}; + +struct ocfs2_caching_operations; +struct ocfs2_caching_info { + /* + * The parent structure provides the locks, but because the + * parent structure can differ, it provides locking operations + * to struct ocfs2_caching_info. + */ + const struct ocfs2_caching_operations *ci_ops; + + /* next two are protected by trans_inc_lock */ + /* which transaction were we created on? Zero if none. */ + unsigned long ci_created_trans; + /* last transaction we were a part of. */ + unsigned long ci_last_trans; + + /* Cache structures */ + unsigned int ci_flags; + unsigned int ci_num_cached; + union { + sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY]; + struct rb_root ci_tree; + } ci_cache; +}; +/* + * Need this prototype here instead of in uptodate.h because journal.h + * uses it. 
+ */ +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci); + +/* this limits us to 256 nodes + * if we need more, we can do a kmalloc for the map */ +#define OCFS2_NODE_MAP_MAX_NODES 256 +struct ocfs2_node_map { + u16 num_nodes; + unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; +}; + +enum ocfs2_ast_action { + OCFS2_AST_INVALID = 0, + OCFS2_AST_ATTACH, + OCFS2_AST_CONVERT, + OCFS2_AST_DOWNCONVERT, +}; + +/* actions for an unlockast function to take. */ +enum ocfs2_unlock_action { + OCFS2_UNLOCK_INVALID = 0, + OCFS2_UNLOCK_CANCEL_CONVERT, + OCFS2_UNLOCK_DROP_LOCK, +}; + +/* ocfs2_lock_res->l_flags flags. */ +#define OCFS2_LOCK_ATTACHED (0x00000001) /* we have initialized + * the lvb */ +#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ +#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) +#define OCFS2_LOCK_REFRESHING (0x00000020) +#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization + * for shutdown paths */ +#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track + * when to skip queueing + * a lock because it's + * about to be + * dropped. */ +#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ +#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ +#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a + call to dlm_lock. Only + exists with BUSY set. */ +#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread + * from downconverting + * before the upconvert + * has completed */ + +struct ocfs2_lock_res_ops; + +typedef void (*ocfs2_lock_callback)(int status, unsigned long data); + +#ifdef CONFIG_OCFS2_FS_STATS +struct ocfs2_lock_stats { + u64 ls_total; /* Total wait in NSEC */ + u32 ls_gets; /* Num acquires */ + u32 ls_fail; /* Num failed acquires */ + + /* Storing max wait in usecs saves 24 bytes per inode */ + u32 ls_max; /* Max wait in USEC */ +}; +#endif + +struct ocfs2_lock_res { + void *l_priv; + struct ocfs2_lock_res_ops *l_ops; + + + struct list_head l_blocked_list; + struct list_head l_mask_waiters; + + unsigned long l_flags; + char l_name[OCFS2_LOCK_ID_MAX_LEN]; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + signed char l_level; + signed char l_requested; + signed char l_blocking; + + /* Data packed - type enum ocfs2_lock_type */ + unsigned char l_type; + + /* used from AST/BAST funcs. 
*/ + /* Data packed - enum type ocfs2_ast_action */ + unsigned char l_action; + /* Data packed - enum type ocfs2_unlock_action */ + unsigned char l_unlock_action; + unsigned int l_pending_gen; + + spinlock_t l_lock; + + struct ocfs2_dlm_lksb l_lksb; + + wait_queue_head_t l_event; + + struct list_head l_debug_list; + +#ifdef CONFIG_OCFS2_FS_STATS + struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ + u32 l_lock_refresh; /* Disk refreshes */ + struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map l_lockdep_map; +#endif +}; + +enum ocfs2_orphan_scan_state { + ORPHAN_SCAN_ACTIVE, + ORPHAN_SCAN_INACTIVE +}; + +struct ocfs2_orphan_scan { + struct mutex os_lock; + struct ocfs2_super *os_osb; + struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ + struct delayed_work os_orphan_scan_work; + struct timespec os_scantime; /* time this node ran the scan */ + u32 os_count; /* tracks node specific scans */ + u32 os_seqno; /* tracks cluster wide scans */ + atomic_t os_state; /* ACTIVE or INACTIVE */ +}; + +struct ocfs2_dlm_debug { + struct kref d_refcnt; + struct dentry *d_locking_state; + struct list_head d_lockres_tracking; +}; + +enum ocfs2_vol_state +{ + VOLUME_INIT = 0, + VOLUME_MOUNTED, + VOLUME_MOUNTED_QUOTAS, + VOLUME_DISMOUNTED, + VOLUME_DISABLED +}; + +struct ocfs2_alloc_stats +{ + atomic_t moves; + atomic_t local_data; + atomic_t bitmap_data; + atomic_t bg_allocs; + atomic_t bg_extends; +}; + +enum ocfs2_local_alloc_state +{ + OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for + * this mountpoint. */ + OCFS2_LA_ENABLED, /* Local alloc is in use. */ + OCFS2_LA_THROTTLED, /* Local alloc is in use, but number + * of bits has been reduced. */ + OCFS2_LA_DISABLED /* Local alloc has temporarily been + * disabled. 
*/ +}; + +enum ocfs2_mount_options +{ + OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ + OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ + OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ + OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ + OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ + OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ + OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ + OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ + OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ + OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access + control lists */ + OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ + OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ + OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT + writes */ + OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ + OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ +}; + +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 + +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 + +struct ocfs2_journal; +struct ocfs2_slot_info; +struct ocfs2_recovery_map; +struct ocfs2_replay_map; +struct ocfs2_quota_recovery; +struct ocfs2_dentry_lock; +struct ocfs2_super +{ + struct task_struct *commit_task; + struct super_block *sb; + struct inode *root_inode; + struct inode *sys_root_inode; + struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; + struct inode **local_system_inodes; + + struct ocfs2_slot_info *slot_info; + + u32 *slot_recovery_generations; + + spinlock_t node_map_lock; + + u64 root_blkno; + u64 system_dir_blkno; + u64 bitmap_blkno; + u32 bitmap_cpg; + u8 *uuid; + char *uuid_str; + u32 uuid_hash; + u8 *vol_label; + u64 first_cluster_group_blkno; + u32 fs_generation; + + u32 s_feature_compat; + u32 s_feature_incompat; + u32 s_feature_ro_compat; + + /* Protects s_next_generation, osb_flags and s_inode_steal_slot. + * Could protect more on osb as it's very short lived. + */ + spinlock_t osb_lock; + u32 s_next_generation; + unsigned long osb_flags; + s16 s_inode_steal_slot; + s16 s_meta_steal_slot; + atomic_t s_num_inodes_stolen; + atomic_t s_num_meta_stolen; + + unsigned long s_mount_opt; + unsigned int s_atime_quantum; + + unsigned int max_slots; + unsigned int node_num; + int slot_num; + int preferred_slot; + int s_sectsize_bits; + int s_clustersize; + int s_clustersize_bits; + unsigned int s_xattr_inline_size; + + atomic_t vol_state; + struct mutex recovery_lock; + struct ocfs2_recovery_map *recovery_map; + struct ocfs2_replay_map *replay_map; + struct task_struct *recovery_thread_task; + int disable_recovery; + wait_queue_head_t checkpoint_event; + atomic_t needs_checkpoint; + struct ocfs2_journal *journal; + unsigned long osb_commit_interval; + + struct delayed_work la_enable_wq; + + /* + * Must hold local alloc i_mutex and osb->osb_lock to change + * local_alloc_bits. Reads can be done under either lock. + */ + unsigned int local_alloc_bits; + unsigned int local_alloc_default_bits; + /* osb_clusters_at_boot can become stale! Do not trust it to + * be up to date. 
*/ + unsigned int osb_clusters_at_boot; + + enum ocfs2_local_alloc_state local_alloc_state; /* protected + * by osb_lock */ + + struct buffer_head *local_alloc_bh; + + u64 la_last_gd; + + struct ocfs2_reservation_map osb_la_resmap; + + unsigned int osb_resv_level; + unsigned int osb_dir_resv_level; + + /* Next three fields are for local node slot recovery during + * mount. */ + int dirty; + struct ocfs2_dinode *local_alloc_copy; + struct ocfs2_quota_recovery *quota_rec; + + struct ocfs2_blockcheck_stats osb_ecc_stats; + struct ocfs2_alloc_stats alloc_stats; + char dev_str[20]; /* "major,minor" of the device */ + + u8 osb_stackflags; + + char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + struct ocfs2_cluster_connection *cconn; + struct ocfs2_lock_res osb_super_lockres; + struct ocfs2_lock_res osb_rename_lockres; + struct ocfs2_lock_res osb_nfs_sync_lockres; + struct ocfs2_dlm_debug *osb_dlm_debug; + + struct dentry *osb_debug_root; + struct dentry *osb_ctxt; + + wait_queue_head_t recovery_event; + + spinlock_t dc_task_lock; + struct task_struct *dc_task; + wait_queue_head_t dc_event; + unsigned long dc_wake_sequence; + unsigned long dc_work_sequence; + + /* + * Any thread can add locks to the list, but the downconvert + * thread is the only one allowed to remove locks. Any change + * to this rule requires updating + * ocfs2_downconvert_thread_do_work(). + */ + struct list_head blocked_lock_list; + unsigned long blocked_lock_count; + + /* List of dentry locks to release. Anyone can add locks to + * the list, ocfs2_wq processes the list */ + struct ocfs2_dentry_lock *dentry_lock_list; + struct work_struct dentry_lock_work; + + wait_queue_head_t osb_mount_event; + + /* Truncate log info */ + struct inode *osb_tl_inode; + struct buffer_head *osb_tl_bh; + struct delayed_work osb_truncate_log_wq; + /* + * How many clusters in our truncate log. + * It must be protected by osb_tl_inode->i_mutex. + */ + unsigned int truncated_clusters; + + struct ocfs2_node_map osb_recovering_orphan_dirs; + unsigned int *osb_orphan_wipes; + wait_queue_head_t osb_wipe_event; + + struct ocfs2_orphan_scan osb_orphan_scan; + + /* used to protect metaecc calculation check of xattr. */ + spinlock_t osb_xattr_lock; + + unsigned int osb_dx_mask; + u32 osb_dx_seed[4]; + + /* the group we used to allocate inodes. */ + u64 osb_inode_alloc_group; + + /* rb tree root for refcount lock. 
*/ + struct rb_root osb_rf_lock_tree; + struct ocfs2_refcount_tree *osb_ref_tree_lru; +}; + +#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) + +/* Useful typedef for passing around journal access functions */ +typedef int (*ocfs2_journal_access_func)(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); + +static inline int ocfs2_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) + return 0; + return 1; +} + +static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) + return 1; + return 0; +} + +static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) +{ + /* + * Support for sparse files is a pre-requisite + */ + if (!ocfs2_sparse_alloc(osb)) + return 0; + + if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN) + return 1; + return 0; +} + +static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) + return 1; + return 0; +} + +static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) + return 1; + return 0; +} + +static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) + return 1; + return 0; +} + +static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) + return 1; + return 0; +} + +static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) + return 1; + return 0; +} + +static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) +{ + if (ocfs2_supports_indexed_dirs(osb)) + return OCFS2_DX_LINK_MAX; + return OCFS2_LINK_MAX; +} + +static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) +{ + u32 nlink = le16_to_cpu(di->i_links_count); + u32 hi = le16_to_cpu(di->i_links_count_hi); + + if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + + return nlink; +} + +static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink) +{ + u16 lo, hi; + + lo = nlink; + hi = nlink >> OCFS2_LINKS_HI_SHIFT; + + di->i_links_count = cpu_to_le16(lo); + di->i_links_count_hi = cpu_to_le16(hi); +} + +static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) +{ + u32 links = ocfs2_read_links_count(di); + + links += n; + + ocfs2_set_links_count(di, links); +} + +static inline int ocfs2_refcount_tree(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) + return 1; + return 0; +} + +/* set / clear functions because cluster events can make these happen + * in parallel so we want the transitions to be atomic. this also + * means that any future flags osb_flags must be protected by spinlock + * too! 
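The ocfs2_read_links_count()/ocfs2_set_links_count() pair above splits a 32-bit link count across the 16-bit i_links_count (low word) and i_links_count_hi (high word) fields. A standalone sketch of the packing, assuming OCFS2_LINKS_HI_SHIFT is 16 and that the high word is honoured (it only is when OCFS2_INDEXED_DIR_FL is set):

#include <stdio.h>
#include <stdint.h>

#define LINKS_HI_SHIFT 16	/* assumed value of OCFS2_LINKS_HI_SHIFT */

int main(void)
{
	uint32_t nlink = 70000;			/* larger than a 16-bit count */
	uint16_t lo = (uint16_t)nlink;		/* goes to i_links_count */
	uint16_t hi = nlink >> LINKS_HI_SHIFT;	/* goes to i_links_count_hi */
	uint32_t back = lo | ((uint32_t)hi << LINKS_HI_SHIFT);

	printf("lo=%u hi=%u reassembled=%u\n", lo, hi, back);	/* 4464 1 70000 */
	return 0;
}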
*/ +static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags |= flag; + spin_unlock(&osb->osb_lock); +} + + +static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + unsigned long ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & flag; + spin_unlock(&osb->osb_lock); + return ret; +} + +static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, + int hard) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); + if (hard) + osb->osb_flags |= OCFS2_OSB_HARD_RO; + else + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); +} + +static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_HARD_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & + (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | + OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); +} + +static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) +{ + return ocfs2_o2cb_stack(osb) && + (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); +} + +static inline int ocfs2_mount_local(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); +} + +static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & + OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); +} + + +#define OCFS2_IS_VALID_DINODE(ptr) \ + (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) + +#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ + (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) + +#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ + (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) + + +#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ + (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) + +#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ + (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) + +#define OCFS2_IS_VALID_DX_ROOT(ptr) \ + (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) + +#define OCFS2_IS_VALID_DX_LEAF(ptr) \ + (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) + +#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ + (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) + +static inline unsigned long ino_from_blkno(struct super_block *sb, + u64 blkno) +{ + return (unsigned long)(blkno & (u64)ULONG_MAX); +} + +static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, + u32 clusters) +{ + int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u64)clusters << c_to_b_bits; +} + +static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, + u64 blocks) +{ + int b_to_c_bits 
= OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u32)(blocks >> b_to_c_bits); +} + +static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + bytes += OCFS2_SB(sb)->s_clustersize - 1; + /* OCFS2 just cannot have enough clusters to overflow this */ + clusters = (unsigned int)(bytes >> cl_bits); + + return clusters; +} + +static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, + u64 bytes) +{ + bytes += sb->s_blocksize - 1; + return bytes >> sb->s_blocksize_bits; +} + +static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, + u32 clusters) +{ + return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; +} + +static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb, + u64 blocks) +{ + int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits; + unsigned int clusters; + + clusters = ocfs2_blocks_to_clusters(sb, blocks); + return (u64)clusters << bits; +} + +static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = ocfs2_clusters_for_bytes(sb, bytes); + return (u64)clusters << cl_bits; +} + +static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, + u64 bytes) +{ + u64 blocks; + + blocks = ocfs2_blocks_for_bytes(sb, bytes); + return blocks << sb->s_blocksize_bits; +} + +static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) +{ + return (unsigned long)((bytes + 511) >> 9); +} + +static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, + unsigned long pg_index) +{ + u32 clusters = pg_index; + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + + if (unlikely(PAGE_CACHE_SHIFT > cbits)) + clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); + else if (PAGE_CACHE_SHIFT < cbits) + clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); + + return clusters; +} + +/* + * Find the 1st page index which covers the given clusters. 
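The cluster/block/byte conversion helpers above rely only on the block and cluster sizes being powers of two, so every conversion is a pure shift. A worked sketch with a hypothetical 4 KB block size and 32 KB cluster size (blocksize_bits = 12, clustersize_bits = 15):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int blocksize_bits = 12;		/* hypothetical 4 KB blocks */
	int clustersize_bits = 15;		/* hypothetical 32 KB clusters */
	int c_to_b_bits = clustersize_bits - blocksize_bits;	/* 3 */

	uint32_t clusters = 5;
	uint64_t blocks = (uint64_t)clusters << c_to_b_bits;	 /* 40 blocks */
	uint64_t bytes = (uint64_t)clusters << clustersize_bits; /* 163840 bytes */

	/* Rounding up, as ocfs2_clusters_for_bytes() does. */
	uint64_t b = 100000;
	uint32_t needed = (uint32_t)((b + (1ULL << clustersize_bits) - 1)
				     >> clustersize_bits);	/* 4 clusters */

	printf("%llu blocks, %llu bytes, %u clusters for %llu bytes\n",
	       (unsigned long long)blocks, (unsigned long long)bytes,
	       needed, (unsigned long long)b);
	return 0;
}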
+ */ +static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, + u32 clusters) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + pgoff_t index = clusters; + + if (PAGE_CACHE_SHIFT > cbits) { + index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits); + } else if (PAGE_CACHE_SHIFT < cbits) { + index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT); + } + + return index; +} + +static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int pages_per_cluster = 1; + + if (PAGE_CACHE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + + return pages_per_cluster; +} + +static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, + unsigned int megs) +{ + BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); + + return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + +static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, + unsigned int clusters) +{ + return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + +static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) +{ + __test_and_set_bit_le(bit, bitmap); +} +#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) + +static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) +{ + __test_and_clear_bit_le(bit, bitmap); +} +#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) + +#define ocfs2_test_bit test_bit_le +#define ocfs2_find_next_zero_bit find_next_zero_bit_le +#define ocfs2_find_next_bit find_next_bit_le +#endif /* OCFS2_H */ + diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h new file mode 100644 index 00000000..938387a1 --- /dev/null +++ b/fs/ocfs2/ocfs2_fs.h @@ -0,0 +1,1640 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_fs.h + * + * On-disk structures for OCFS2. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_FS_H +#define _OCFS2_FS_H + +/* Version */ +#define OCFS2_MAJOR_REV_LEVEL 0 +#define OCFS2_MINOR_REV_LEVEL 90 + +/* + * An OCFS2 volume starts this way: + * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. + * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. + * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. + * + * All other structures are found from the superblock information. + * + * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a + * blocksize of 2K, it is 4096 bytes into disk. + */ +#define OCFS2_SUPER_BLOCK_BLKNO 2 + +/* + * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could + * grow if needed. 
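With the same hypothetical geometry (4 KB pages, 32 KB clusters), the page and megabyte helpers defined just above also reduce to small shifts; a brief sketch:

#include <stdio.h>

int main(void)
{
	int page_shift = 12;	/* hypothetical PAGE_CACHE_SHIFT, 4 KB pages */
	int cbits = 15;		/* hypothetical clustersize_bits, 32 KB clusters */

	unsigned int pages_per_cluster = 1u << (cbits - page_shift);	      /* 8 */
	unsigned int cluster_of_page_20 = 20u >> (cbits - page_shift);	      /* 2 */
	unsigned long first_page_of_cluster_3 = 3ul << (cbits - page_shift); /* 24 */
	unsigned int clusters_per_megabyte = 1u << (20 - cbits);	      /* 32 */

	printf("%u pages/cluster, page 20 -> cluster %u, "
	       "cluster 3 -> page %lu, %u clusters/MB\n",
	       pages_per_cluster, cluster_of_page_20,
	       first_page_of_cluster_3, clusters_per_megabyte);
	return 0;
}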
+ */ +#define OCFS2_MIN_CLUSTERSIZE 4096 +#define OCFS2_MAX_CLUSTERSIZE 1048576 + +/* + * Blocks cannot be bigger than clusters, so the maximum blocksize is the + * minimum cluster size. + */ +#define OCFS2_MIN_BLOCKSIZE 512 +#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE + +/* Filesystem magic number */ +#define OCFS2_SUPER_MAGIC 0x7461636f + +/* Object signatures */ +#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" +#define OCFS2_INODE_SIGNATURE "INODE01" +#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" +#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" +#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" +#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" +#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" +#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" +#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1" + +/* Compatibility flags */ +#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_compat & (mask) ) +#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) +#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) +#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat |= (mask) +#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat |= (mask) +#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat |= (mask) +#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat &= ~(mask) +#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) +#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat &= ~(mask) + +#define OCFS2_FEATURE_COMPAT_SUPP (OCFS2_FEATURE_COMPAT_BACKUP_SB \ + | OCFS2_FEATURE_COMPAT_JBD2_SB) +#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ + | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ + | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ + | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ + | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ + | OCFS2_FEATURE_INCOMPAT_XATTR \ + | OCFS2_FEATURE_INCOMPAT_META_ECC \ + | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ + | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ + | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) +#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ + | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + +/* + * Heartbeat-only devices are missing journals and other files. The + * filesystem driver can't load them, but the library can. Never put + * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. + */ +#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 + +/* + * tunefs sets this incompat flag before starting the resize and clears it + * at the end. This flag protects users from inadvertently mounting the fs + * after an aborted run without fsck-ing. + */ +#define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG 0x0004 + +/* Used to denote a non-clustered volume */ +#define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 0x0008 + +/* Support for sparse allocation in b-trees */ +#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010 + +/* + * Tunefs sets this incompat flag before starting an operation which + * would require cleanup on abort. This is done to protect users from + * inadvertently mounting the fs after an aborted run without + * fsck-ing. + * + * s_tunefs_flags on the super block describes precisely which + * operations were in progress. 
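The feature macros and *_SUPP masks above support the usual compat/ro-compat/incompat convention: an unknown incompat bit makes the volume unmountable, while an unknown ro-compat bit only rules out read-write mounts. A minimal sketch of that policy; check_features() and the two masks below are illustrative placeholders, not part of this header:

#include <stdio.h>
#include <stdint.h>

#define SUPP_INCOMPAT	0x0010u		/* placeholder supported incompat bits */
#define SUPP_RO_COMPAT	0x0001u		/* placeholder supported ro-compat bits */

/* Returns 0 if mountable; *ro_only is set when only a read-only mount is safe. */
static int check_features(uint32_t incompat, uint32_t ro_compat, int *ro_only)
{
	if (incompat & ~SUPP_INCOMPAT)
		return -1;			/* unknown incompat bit: refuse */
	*ro_only = (ro_compat & ~SUPP_RO_COMPAT) ? 1 : 0;
	return 0;
}

int main(void)
{
	int ro;

	printf("%d\n", check_features(0x0010, 0x0004, &ro));	/* 0, ro == 1 */
	printf("%d\n", check_features(0x8000, 0x0000, &ro));	/* -1 */
	return 0;
}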
+ */ +#define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG 0x0020 + +/* Support for data packed into inode blocks */ +#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 + +/* + * Support for alternate, userspace cluster stacks. If set, the superblock + * field s_cluster_info contains a tag for the alternate stack in use as + * well as the name of the cluster being joined. + * mount.ocfs2 must pass in a matching stack name. + * + * If not set, the classic stack will be used. This is compatbile with + * all older versions. + */ +#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 + +/* Support for the extended slot map */ +#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 + +/* Support for extended attributes */ +#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 + +/* Support for indexed directores */ +#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 + +/* Metadata checksum and error correction */ +#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 + +/* Refcount tree support */ +#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 + +/* Discontigous block groups */ +#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 + +/* + * Incompat bit to indicate useable clusterinfo with stackflags for all + * cluster stacks (userspace adnd o2cb). If this bit is set, + * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. + */ +#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 + +/* + * backup superblock flag is used to indicate that this volume + * has backup superblocks. + */ +#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 + +/* + * The filesystem will correctly handle journal feature bits. + */ +#define OCFS2_FEATURE_COMPAT_JBD2_SB 0x0002 + +/* + * Unwritten extents support. + */ +#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 + +/* + * Maintain quota information for this filesystem + */ +#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 +#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 + +/* The byte offset of the first backup block will be 1G. + * The following will be 4G, 16G, 64G, 256G and 1T. + */ +#define OCFS2_BACKUP_SB_START 1 << 30 + +/* the max backup superblock nums */ +#define OCFS2_MAX_BACKUP_SUPERBLOCKS 6 + +/* + * Flags on ocfs2_super_block.s_tunefs_flags + */ +#define OCFS2_TUNEFS_INPROG_REMOVE_SLOT 0x0001 /* Removing slots */ + +/* + * Flags on ocfs2_dinode.i_flags + */ +#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ +#define OCFS2_UNUSED2_FL (0x00000002) +#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ +#define OCFS2_UNUSED3_FL (0x00000008) +/* System inode flags */ +#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ +#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ +#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ +#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ +#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ +#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ +#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ +#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ +#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ + +/* + * Flags on ocfs2_dinode.i_dyn_features + * + * These can change much more often than i_flags. When adding flags, + * keep in mind that i_dyn_features is only 16 bits wide. 
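The backup superblock offsets noted above (1G, then 4G, 16G, 64G, 256G and 1T) follow from multiplying OCFS2_BACKUP_SB_START by four for each successive copy, up to OCFS2_MAX_BACKUP_SUPERBLOCKS; a short sketch:

#include <stdio.h>
#include <stdint.h>

#define BACKUP_SB_START		(1ULL << 30)	/* 1 GB, as above */
#define MAX_BACKUP_SUPERBLOCKS	6

int main(void)
{
	uint64_t off = BACKUP_SB_START;
	int i;

	for (i = 0; i < MAX_BACKUP_SUPERBLOCKS; i++, off *= 4)
		printf("backup sb %d at %llu GB\n", i,
		       (unsigned long long)(off >> 30));  /* 1 4 16 64 256 1024 */
	return 0;
}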
+ */ +#define OCFS2_INLINE_DATA_FL (0x0001) /* Data stored in inode block */ +#define OCFS2_HAS_XATTR_FL (0x0002) +#define OCFS2_INLINE_XATTR_FL (0x0004) +#define OCFS2_INDEXED_DIR_FL (0x0008) +#define OCFS2_HAS_REFCOUNT_FL (0x0010) + +/* Inode attributes, keep in sync with EXT2 */ +#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */ +#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */ +#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */ +#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ +#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ +#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ +#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ +#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ +/* Reserved for compression usage... */ +#define OCFS2_DIRTY_FL FS_DIRTY_FL +#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ +#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ +#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ +/* End compression flags --- maybe not all used */ +#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */ +#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ +#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ +#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ +#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ +#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ +#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ +#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + +#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ +#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +/* + * Extent record flags (e_node.leaf.flags) + */ +#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but + * unwritten */ +#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference + * counted in an associated + * refcount tree */ + +/* + * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) + */ +#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ + +/* + * superblock s_state flags + */ +#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ + +/* Limit of space in ocfs2_dir_entry */ +#define OCFS2_MAX_FILENAME_LEN 255 + +/* Maximum slots on an ocfs2 file system */ +#define OCFS2_MAX_SLOTS 255 + +/* Slot map indicator for an empty slot */ +#define OCFS2_INVALID_SLOT -1 + +#define OCFS2_VOL_UUID_LEN 16 +#define OCFS2_MAX_VOL_LABEL_LEN 64 + +/* The cluster stack fields */ +#define OCFS2_STACK_LABEL_LEN 4 +#define OCFS2_CLUSTER_NAME_LEN 16 + +/* Classic (historically speaking) cluster stack */ +#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb" + +/* Journal limits (in bytes) */ +#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) + +/* + * Inline extended attribute size (in bytes) + * The value chosen should be aligned to 16 byte boundaries. 
+ */ +#define OCFS2_MIN_XATTR_INLINE_SIZE 256 + +/* + * Cluster info flags (ocfs2_cluster_info.ci_stackflags) + */ +#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01) + +struct ocfs2_system_inode_info { + char *si_name; + int si_iflags; + int si_mode; +}; + +/* System file index */ +enum { + BAD_BLOCK_SYSTEM_INODE = 0, + GLOBAL_INODE_ALLOC_SYSTEM_INODE, + SLOT_MAP_SYSTEM_INODE, +#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE + HEARTBEAT_SYSTEM_INODE, + GLOBAL_BITMAP_SYSTEM_INODE, + USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE +#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE + ORPHAN_DIR_SYSTEM_INODE, + EXTENT_ALLOC_SYSTEM_INODE, + INODE_ALLOC_SYSTEM_INODE, + JOURNAL_SYSTEM_INODE, + LOCAL_ALLOC_SYSTEM_INODE, + TRUNCATE_LOG_SYSTEM_INODE, + LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE + NUM_SYSTEM_INODES +}; +#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE +#define NUM_LOCAL_SYSTEM_INODES \ + (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) + +static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { + /* Global system inodes (single copy) */ + /* The first two are only used from userspace mfks/tunefs */ + [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, + [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + + /* These are used by the running filesystem */ + [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, + [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, + [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, + [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + + /* Slot-specific system inodes (one copy per slot) */ + [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, + [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, + [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, + [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }, + [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, +}; + +/* Parameter passed from mount.ocfs2 to module */ +#define OCFS2_HB_NONE "heartbeat=none" +#define OCFS2_HB_LOCAL "heartbeat=local" +#define OCFS2_HB_GLOBAL "heartbeat=global" + +/* + * OCFS2 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
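The slot-specific entries in ocfs2_system_inodes[] above carry a "%04d" conversion in their si_name, so a per-slot file name is obtained by formatting the slot number into it, roughly as in the following sketch (the snippet is illustrative only):

#include <stdio.h>

int main(void)
{
	const char *fmt = "orphan_dir:%04d";	/* si_name of ORPHAN_DIR_SYSTEM_INODE */
	char name[32];

	snprintf(name, sizeof(name), fmt, 3);	/* hypothetical slot number 3 */
	printf("%s\n", name);			/* prints orphan_dir:0003 */
	return 0;
}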
+ */ +#define OCFS2_FT_UNKNOWN 0 +#define OCFS2_FT_REG_FILE 1 +#define OCFS2_FT_DIR 2 +#define OCFS2_FT_CHRDEV 3 +#define OCFS2_FT_BLKDEV 4 +#define OCFS2_FT_FIFO 5 +#define OCFS2_FT_SOCK 6 +#define OCFS2_FT_SYMLINK 7 + +#define OCFS2_FT_MAX 8 + +/* + * OCFS2_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define OCFS2_DIR_PAD 4 +#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) +#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) +#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ + OCFS2_DIR_ROUND) & \ + ~OCFS2_DIR_ROUND) +#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1) + +#define OCFS2_LINK_MAX 32000 +#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U) +#define OCFS2_LINKS_HI_SHIFT 16 +#define OCFS2_DX_ENTRIES_MAX (0xffffffffU) + +#define S_SHIFT 12 +static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, + [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, + [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, +}; + + +/* + * Convenience casts + */ +#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) + +/* + * Block checking structure. This is used in metadata to validate the + * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all + * zeros. + */ +struct ocfs2_block_check { +/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ + __le16 bc_ecc; /* Single-error-correction parity vector. + This is a simple Hamming code dependent + on the blocksize. OCFS2's maximum + blocksize, 4K, requires 16 parity bits, + so we fit in __le16. */ + __le16 bc_reserved1; +/*08*/ +}; + +/* + * On disk extent record for OCFS2 + * It describes a range of clusters on disk. + * + * Length fields are divided into interior and leaf node versions. + * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. + */ +struct ocfs2_extent_rec { +/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ + union { + __le32 e_int_clusters; /* Clusters covered by all children */ + struct { + __le16 e_leaf_clusters; /* Clusters covered by this + extent */ + __u8 e_reserved1; + __u8 e_flags; /* Extent flags */ + }; + }; + __le64 e_blkno; /* Physical disk offset, in blocks */ +/*10*/ +}; + +struct ocfs2_chain_rec { + __le32 c_free; /* Number of free bits in this chain. */ + __le32 c_total; /* Number of total bits in this chain */ + __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ +}; + +struct ocfs2_truncate_rec { + __le32 t_start; /* 1st cluster in this log */ + __le32 t_clusters; /* Number of total clusters covered */ +}; + +/* + * On disk extent list for OCFS2 (node in the tree). Note that this + * is contained inside ocfs2_dinode or ocfs2_extent_block, so the + * offsets are relative to ocfs2_dinode.id2.i_list or + * ocfs2_extent_block.h_list, respectively. + */ +struct ocfs2_extent_list { +/*00*/ __le16 l_tree_depth; /* Extent tree depth from this + point. 0 means data extents + hang directly off this + header (a leaf) + NOTE: The high 8 bits cannot be + used - tree_depth is never that big. + */ + __le16 l_count; /* Number of extent records */ + __le16 l_next_free_rec; /* Next unused extent slot */ + __le16 l_reserved1; + __le64 l_reserved2; /* Pad to + sizeof(ocfs2_extent_rec) */ +/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ +}; + +/* + * On disk allocation chain list for OCFS2. 
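OCFS2_DIR_REC_LEN() above rounds each directory entry up to a 4-byte boundary. A worked sketch, assuming the fixed dirent header before the name (inode, rec_len, name_len, file_type) occupies 12 bytes, which is the value OCFS2_DIR_MEMBER_LEN takes for struct ocfs2_dir_entry defined elsewhere in this header:

#include <stdio.h>

#define DIR_PAD		4
#define DIR_ROUND	(DIR_PAD - 1)
#define DIR_MEMBER_LEN	12	/* assumed offsetof(struct ocfs2_dir_entry, name) */
#define DIR_REC_LEN(name_len) \
	(((name_len) + DIR_MEMBER_LEN + DIR_ROUND) & ~DIR_ROUND)

int main(void)
{
	/* "a" -> 16 bytes, "hello" -> 20 bytes, a 255-char name -> 268 bytes */
	printf("%d %d %d\n", DIR_REC_LEN(1), DIR_REC_LEN(5), DIR_REC_LEN(255));
	return 0;
}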
Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_chain. + */ +struct ocfs2_chain_list { +/*00*/ __le16 cl_cpg; /* Clusters per Block Group */ + __le16 cl_bpc; /* Bits per cluster */ + __le16 cl_count; /* Total chains in this list */ + __le16 cl_next_free_rec; /* Next unused chain slot */ + __le64 cl_reserved1; +/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ +}; + +/* + * On disk deallocation log for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_dealloc. + */ +struct ocfs2_truncate_log { +/*00*/ __le16 tl_count; /* Total records in this log */ + __le16 tl_used; /* Number of records in use */ + __le32 tl_reserved1; +/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ +}; + +/* + * On disk extent block (indirect block) for OCFS2 + */ +struct ocfs2_extent_block +{ +/*00*/ __u8 h_signature[8]; /* Signature for verification */ + struct ocfs2_block_check h_check; /* Error checking */ +/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this + extent_header belongs to */ + __le16 h_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 h_fs_generation; /* Must match super block */ + __le64 h_blkno; /* Offset on disk, in blocks */ +/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this + eb belongs to. Only valid + if allocated from a + discontiguous block group */ + __le64 h_next_leaf_blk; /* Offset on disk, in blocks, + of next leaf header pointing + to data */ +/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ +/* Actual on-disk size is one block */ +}; + +/* + * On disk slot map for OCFS2. This defines the contents of the "slot_map" + * system file. A slot is valid if it contains a node number >= 0. The + * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. + */ +struct ocfs2_slot_map { +/*00*/ __le16 sm_slots[0]; +/* + * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, + * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. + */ +}; + +struct ocfs2_extended_slot { +/*00*/ __u8 es_valid; + __u8 es_reserved1[3]; + __le32 es_node_num; +/*10*/ +}; + +/* + * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP + * is set. It separates out the valid marker from the node number, and + * has room to grow. Unlike the old slot map, this format is defined by + * i_size. + */ +struct ocfs2_slot_map_extended { +/*00*/ struct ocfs2_extended_slot se_slots[0]; +/* + * Actual size is i_size of the slot_map system file. It should + * match s_max_slots * sizeof(struct ocfs2_extended_slot) + */ +}; + +/* + * ci_stackflags is only valid if the incompat bit + * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set. + */ +struct ocfs2_cluster_info { +/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; + union { + __le32 ci_reserved; + struct { + __u8 ci_stackflags; + __u8 ci_reserved1; + __u8 ci_reserved2; + __u8 ci_reserved3; + }; + }; +/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; +/*18*/ +}; + +/* + * On disk superblock for OCFS2 + * Note that it is contained inside an ocfs2_dinode, so all offsets + * are relative to the start of ocfs2_dinode.id2. 
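As a worked example of the directory record rounding defined above (a sketch; the 12-byte fixed part follows from the ocfs2_dir_entry layout given later in this header):

/* OCFS2_DIR_MEMBER_LEN is 12 (8-byte inode, 2-byte rec_len, 1-byte
 * name_len, 1-byte file_type), so:
 *   OCFS2_DIR_REC_LEN(3)  = ( 3 + 12 + 3) & ~3 = 18 & ~3 = 16
 *   OCFS2_DIR_REC_LEN(11) = (11 + 12 + 3) & ~3 = 26 & ~3 = 24
 * i.e. every live entry is padded up to the next 4-byte boundary. */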
+ */ +struct ocfs2_super_block { +/*00*/ __le16 s_major_rev_level; + __le16 s_minor_rev_level; + __le16 s_mnt_count; + __le16 s_max_mnt_count; + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le32 s_checkinterval; /* Max time between checks */ +/*10*/ __le64 s_lastcheck; /* Time of last check */ + __le32 s_creator_os; /* OS */ + __le32 s_feature_compat; /* Compatible feature set */ +/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ + __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ + __le64 s_root_blkno; /* Offset, in blocks, of root directory + dinode */ +/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system + directory dinode */ + __le32 s_blocksize_bits; /* Blocksize for this fs */ + __le32 s_clustersize_bits; /* Clustersize for this fs */ +/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts + before tunefs required */ + __le16 s_tunefs_flag; + __le32 s_uuid_hash; /* hash value of uuid */ + __le64 s_first_cluster_group; /* Block offset of 1st cluster + * group header */ +/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ +/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ +/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either + userspace or clusterinfo + INCOMPAT flag set. */ +/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size + for this fs*/ + __le16 s_reserved0; + __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. + * s_uuid_hash serves as seed[3]. */ +/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ +/*140*/ + + /* + * NOTE: As stated above, all offsets are relative to + * ocfs2_dinode.id2, which is at 0xC0 in the inode. + * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within + * our smallest blocksize, which is 512 bytes. To ensure this, + * we reserve the space in s_reserved2. Anything past s_reserved2 + * will not be available on the smallest blocksize. + */ +}; + +/* + * Local allocation bitmap for OCFS2 slots + * Note that it exists inside an ocfs2_dinode, so all offsets are + * relative to the start of ocfs2_dinode.id2. + */ +struct ocfs2_local_alloc +{ +/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ + __le16 la_size; /* Size of included bitmap, in bytes */ + __le16 la_reserved1; + __le64 la_reserved2; +/*10*/ __u8 la_bitmap[0]; +}; + +/* + * Data-in-inode header. This is only used if i_dyn_features has + * OCFS2_INLINE_DATA_FL set. 
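For a sense of the inline-data capacity (a sketch assuming a 4096-byte block and the 0xC0 offset of id2 shown in the dinode layout below):

/* id_data starts 8 bytes into ocfs2_inline_data and id2 starts at byte
 * 0xC0 (192) of the dinode, so with a 4 KB block:
 *   inline data only          : 4096 - (192 + 8) = 3896 bytes
 *   with a 256-byte inline EA : 3896 - 256       = 3640 bytes
 * which is the arithmetic ocfs2_max_inline_data_with_xattr() performs
 * further down in this header. */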
+ */ +struct ocfs2_inline_data +{ +/*00*/ __le16 id_count; /* Number of bytes that can be used + * for data, starting at id_data */ + __le16 id_reserved0; + __le32 id_reserved1; + __u8 id_data[0]; /* Start of user data */ +}; + +/* + * On disk inode for OCFS2 + */ +struct ocfs2_dinode { +/*00*/ __u8 i_signature[8]; /* Signature for validation */ + __le32 i_generation; /* Generation number */ + __le16 i_suballoc_slot; /* Slot suballocator this inode + belongs to */ + __le16 i_suballoc_bit; /* Bit offset in suballocator + block group */ +/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */ + __le16 i_xattr_inline_size; + __le32 i_clusters; /* Cluster count */ + __le32 i_uid; /* Owner UID */ + __le32 i_gid; /* Owning GID */ +/*20*/ __le64 i_size; /* Size in bytes */ + __le16 i_mode; /* File mode */ + __le16 i_links_count; /* Links count */ + __le32 i_flags; /* File flags */ +/*30*/ __le64 i_atime; /* Access time */ + __le64 i_ctime; /* Creation time */ +/*40*/ __le64 i_mtime; /* Modification time */ + __le64 i_dtime; /* Deletion time */ +/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ + __le64 i_last_eb_blk; /* Pointer to last extent + block */ +/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ + __le32 i_atime_nsec; + __le32 i_ctime_nsec; + __le32 i_mtime_nsec; +/*70*/ __le32 i_attr; + __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL + was set in i_flags */ + __le16 i_dyn_features; + __le64 i_xattr_loc; +/*80*/ struct ocfs2_block_check i_check; /* Error checking */ +/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ +/*90*/ __le64 i_refcount_loc; + __le64 i_suballoc_loc; /* Suballocator block group this + inode belongs to. Only valid + if allocated from a + discontiguous block group */ +/*A0*/ __le64 i_reserved2[3]; +/*B8*/ union { + __le64 i_pad1; /* Generic way to refer to this + 64bit union */ + struct { + __le64 i_rdev; /* Device number */ + } dev1; + struct { /* Info for bitmap system + inodes */ + __le32 i_used; /* Bits (ie, clusters) used */ + __le32 i_total; /* Total bits (clusters) + available */ + } bitmap1; + struct { /* Info for journal system + inodes */ + __le32 ij_flags; /* Mounted, version, etc. */ + __le32 ij_recovery_generation; /* Incremented when the + journal is recovered + after an unclean + shutdown */ + } journal1; + } id1; /* Inode type dependent 1 */ +/*C0*/ union { + struct ocfs2_super_block i_super; + struct ocfs2_local_alloc i_lab; + struct ocfs2_chain_list i_chain; + struct ocfs2_extent_list i_list; + struct ocfs2_truncate_log i_dealloc; + struct ocfs2_inline_data i_data; + __u8 i_symlink[0]; + } id2; +/* Actual on-disk size is one block */ +}; + +/* + * On-disk directory entry structure for OCFS2 + * + * Packed as this structure could be accessed unaligned on 64-bit platforms + */ +struct ocfs2_dir_entry { +/*00*/ __le64 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; +/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ +/* Actual on-disk length specified by rec_len */ +} __attribute__ ((packed)); + +/* + * Per-block record for the unindexed directory btree. This is carefully + * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are + * mirrored. That way, the directory manipulation code needs a minimal amount + * of update. + * + * NOTE: Keep this structure aligned to a multiple of 4 bytes. + */ +struct ocfs2_dir_block_trailer { +/*00*/ __le64 db_compat_inode; /* Always zero. 
Was inode */ + + __le16 db_compat_rec_len; /* Backwards compatible with + * ocfs2_dir_entry. */ + __u8 db_compat_name_len; /* Always zero. Was name_len */ + __u8 db_reserved0; + __le16 db_reserved1; + __le16 db_free_rec_len; /* Size of largest empty hole + * in this block. (unused) */ +/*10*/ __u8 db_signature[8]; /* Signature for verification */ + __le64 db_reserved2; + __le64 db_free_next; /* Next block in list (unused) */ +/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ + __le64 db_parent_dinode; /* dinode which owns me, in + blocks */ +/*30*/ struct ocfs2_block_check db_check; /* Error checking */ +/*40*/ +}; + + /* + * A directory entry in the indexed tree. We don't store the full name here, + * but instead provide a pointer to the full dirent in the unindexed tree. + * + * We also store name_len here so as to reduce the number of leaf blocks we + * need to search in case of collisions. + */ +struct ocfs2_dx_entry { + __le32 dx_major_hash; /* Used to find logical + * cluster in index */ + __le32 dx_minor_hash; /* Lower bits used to find + * block in cluster */ + __le64 dx_dirent_blk; /* Physical block in unindexed + * tree holding this dirent. */ +}; + +struct ocfs2_dx_entry_list { + __le32 de_reserved; + __le16 de_count; /* Maximum number of entries + * possible in de_entries */ + __le16 de_num_used; /* Current number of + * de_entries entries */ + struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries + * in a packed array of + * length de_num_used */ +}; + +#define OCFS2_DX_FLAG_INLINE 0x01 + +/* + * A directory indexing block. Each indexed directory has one of these, + * pointed to by ocfs2_dinode. + * + * This block stores an indexed btree root, and a set of free space + * start-of-list pointers. + */ +struct ocfs2_dx_root_block { + __u8 dr_signature[8]; /* Signature for verification */ + struct ocfs2_block_check dr_check; /* Error checking */ + __le16 dr_suballoc_slot; /* Slot suballocator this + * block belongs to. */ + __le16 dr_suballoc_bit; /* Bit offset in suballocator + * block group */ + __le32 dr_fs_generation; /* Must match super block */ + __le64 dr_blkno; /* Offset on disk, in blocks */ + __le64 dr_last_eb_blk; /* Pointer to last + * extent block */ + __le32 dr_clusters; /* Clusters allocated + * to the indexed tree. */ + __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */ + __u8 dr_reserved0; + __le16 dr_reserved1; + __le64 dr_dir_blkno; /* Pointer to parent inode */ + __le32 dr_num_entries; /* Total number of + * names stored in + * this directory.*/ + __le32 dr_reserved2; + __le64 dr_free_blk; /* Pointer to head of free + * unindexed block list. */ + __le64 dr_suballoc_loc; /* Suballocator block group + this root belongs to. + Only valid if allocated + from a discontiguous + block group */ + __le64 dr_reserved3[14]; + union { + struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 + * bits for maximum space + * efficiency. */ + struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of + * entries. We grow out + * to extents if this + * gets too big. */ + }; +}; + +/* + * The header of a leaf block in the indexed tree. + */ +struct ocfs2_dx_leaf { + __u8 dl_signature[8];/* Signature for verification */ + struct ocfs2_block_check dl_check; /* Error checking */ + __le64 dl_blkno; /* Offset on disk, in blocks */ + __le32 dl_fs_generation;/* Must match super block */ + __le32 dl_reserved0; + __le64 dl_reserved1; + struct ocfs2_dx_entry_list dl_list; +}; + +/* + * Largest bitmap for a block (suballocator) group in bytes. 
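A quick capacity estimate for the indexed-tree leaf above (a sketch assuming a 4096-byte block; the 48-byte header size is derived from the field sizes shown):

/* dl_list.de_entries starts at byte 48 of the leaf (8 signature + 8 check
 * + 8 blkno + 4 generation + 4 reserved0 + 8 reserved1 + 8 list header)
 * and each ocfs2_dx_entry is 16 bytes, so one 4 KB leaf block holds
 *   (4096 - 48) / 16 = 253 indexed entries,
 * matching what ocfs2_dx_entries_per_leaf() computes later on. */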
This limit + * does not affect cluster groups (global allocator). Cluster group + * bitmaps run to the end of the block. + */ +#define OCFS2_MAX_BG_BITMAP_SIZE 256 + +/* + * On disk allocator group structure for OCFS2 + */ +struct ocfs2_group_desc +{ +/*00*/ __u8 bg_signature[8]; /* Signature for validation */ + __le16 bg_size; /* Size of included bitmap in + bytes. */ + __le16 bg_bits; /* Bits represented by this + group. */ + __le16 bg_free_bits_count; /* Free bits count */ + __le16 bg_chain; /* What chain I am in. */ +/*10*/ __le32 bg_generation; + __le32 bg_reserved1; + __le64 bg_next_group; /* Next group in my list, in + blocks */ +/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in + blocks */ + __le64 bg_blkno; /* Offset on disk, in blocks */ +/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ + __le64 bg_reserved2; +/*40*/ union { + __u8 bg_bitmap[0]; + struct { + /* + * Block groups may be discontiguous when + * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. + * The extents of a discontigous block group are + * stored in bg_list. It is a flat list. + * l_tree_depth must always be zero. A + * discontiguous group is signified by a non-zero + * bg_list->l_next_free_rec. Only block groups + * can be discontiguous; Cluster groups cannot. + * We've never made a block group with more than + * 2048 blocks (256 bytes of bg_bitmap). This + * codifies that limit so that we can fit bg_list. + * bg_size of a discontiguous block group will + * be 256 to match bg_bitmap_filler. + */ + __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE]; +/*140*/ struct ocfs2_extent_list bg_list; + }; + }; +/* Actual on-disk size is one block */ +}; + +struct ocfs2_refcount_rec { +/*00*/ __le64 r_cpos; /* Physical offset, in clusters */ + __le32 r_clusters; /* Clusters covered by this extent */ + __le32 r_refcount; /* Reference count of this extent */ +/*10*/ +}; +#define OCFS2_32BIT_POS_MASK (0xffffffffULL) + +#define OCFS2_REFCOUNT_LEAF_FL (0x00000001) +#define OCFS2_REFCOUNT_TREE_FL (0x00000002) + +struct ocfs2_refcount_list { +/*00*/ __le16 rl_count; /* Maximum number of entries possible + in rl_records */ + __le16 rl_used; /* Current number of used records */ + __le32 rl_reserved2; + __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ +/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */ +}; + + +struct ocfs2_refcount_block { +/*00*/ __u8 rf_signature[8]; /* Signature for verification */ + __le16 rf_suballoc_slot; /* Slot suballocator this block + belongs to */ + __le16 rf_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 rf_fs_generation; /* Must match superblock */ +/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */ + __le64 rf_parent; /* Parent block, only valid if + OCFS2_REFCOUNT_LEAF_FL is set in + rf_flags */ +/*20*/ struct ocfs2_block_check rf_check; /* Error checking */ + __le64 rf_last_eb_blk; /* Pointer to last extent block */ +/*30*/ __le32 rf_count; /* Number of inodes sharing this + refcount tree */ + __le32 rf_flags; /* See the flags above */ + __le32 rf_clusters; /* clusters covered by refcount tree. */ + __le32 rf_cpos; /* cluster offset in refcount tree.*/ +/*40*/ __le32 rf_generation; /* generation number. all be the same + * for the same refcount tree. */ + __le32 rf_reserved0; + __le64 rf_suballoc_loc; /* Suballocator block group this + refcount block belongs to. 
Only + valid if allocated from a + discontiguous block group */ +/*50*/ __le64 rf_reserved1[6]; +/*80*/ union { + struct ocfs2_refcount_list rf_records; /* List of refcount + records */ + struct ocfs2_extent_list rf_list; /* Extent record list, + only valid if + OCFS2_REFCOUNT_TREE_FL + is set in rf_flags */ + }; +/* Actual on-disk size is one block */ +}; + +/* + * On disk extended attribute structure for OCFS2. + */ + +/* + * ocfs2_xattr_entry describes one extended attribute. + * + * Note that it can be stored in an inode, an xattr block, or an xattr bucket. + */ +struct ocfs2_xattr_entry { + __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ + __le16 xe_name_offset; /* byte offset from the 1st entry in the + local xattr storage (inode, xattr block or + xattr bucket). */ + __u8 xe_name_len; /* xattr name len, doesn't include prefix. */ + __u8 xe_type; /* the low 7 bits indicate the name prefix + * type and the highest bit indicates whether + * the EA is stored in the local storage. */ + __le64 xe_value_size; /* real xattr value length. */ +}; + +/* + * On disk structure for xattr header. + * + * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records are in + * the local xattr storage. + */ +struct ocfs2_xattr_header { + __le16 xh_count; /* contains the count of how + many records are in the + local xattr storage. */ + __le16 xh_free_start; /* current offset for storing + xattr. */ + __le16 xh_name_value_len; /* total length of name/value + length in this bucket. */ + __le16 xh_num_buckets; /* Number of xattr buckets + in this extent record, + only valid in the first + bucket. */ + struct ocfs2_block_check xh_check; /* Error checking + (Note, this is only + used for xattr + buckets. A block uses + xb_check and sets + this field to zero.) */ + struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ +}; + +/* + * On disk structure for xattr value root. + * + * When an xattr's value is large enough, it is stored in an external + * b-tree like file data. The xattr value root points to this structure. + */ +struct ocfs2_xattr_value_root { +/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ + __le32 xr_reserved0; + __le64 xr_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ +}; + +/* + * On disk structure for xattr tree root. + * + * It is used when there are too many extended attributes for one file. These + * attributes will be organized and stored in an indexed-btree. + */ +struct ocfs2_xattr_tree_root { +/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ + __le32 xt_reserved0; + __le64 xt_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ +}; + +#define OCFS2_XATTR_INDEXED 0x1 +#define OCFS2_HASH_SHIFT 5 +#define OCFS2_XATTR_ROUND 3 +#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ + ~(OCFS2_XATTR_ROUND)) + +#define OCFS2_XATTR_BUCKET_SIZE 4096 +#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ + / OCFS2_MIN_BLOCKSIZE) + +/* + * On disk structure for xattr block. + */ +struct ocfs2_xattr_block { +/*00*/ __u8 xb_signature[8]; /* Signature for verification */ + __le16 xb_suballoc_slot; /* Slot suballocator this + block belongs to.
*/ + __le16 xb_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 xb_fs_generation; /* Must match super block */ +/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ + struct ocfs2_block_check xb_check; /* Error checking */ +/*20*/ __le16 xb_flags; /* Indicates whether this block contains + real xattr or a xattr tree. */ + __le16 xb_reserved0; + __le32 xb_reserved1; + __le64 xb_suballoc_loc; /* Suballocator block group this + xattr block belongs to. Only + valid if allocated from a + discontiguous block group */ +/*30*/ union { + struct ocfs2_xattr_header xb_header; /* xattr header if this + block contains xattr */ + struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this + block cotains xattr + tree. */ + } xb_attrs; +}; + +#define OCFS2_XATTR_ENTRY_LOCAL 0x80 +#define OCFS2_XATTR_TYPE_MASK 0x7F +static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, + int local) +{ + if (local) + xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; + else + xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) +{ + xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; +} + +static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_TYPE_MASK; +} + +/* + * On disk structures for global quota file + */ + +/* Magic numbers and known versions for global quota files */ +#define OCFS2_GLOBAL_QMAGICS {\ + 0x0cf52470, /* USRQUOTA */ \ + 0x0cf52471 /* GRPQUOTA */ \ +} + +#define OCFS2_GLOBAL_QVERSIONS {\ + 0, \ + 0, \ +} + + +/* Each block of each quota file has a certain fixed number of bytes reserved + * for OCFS2 internal use at its end. OCFS2 can use it for things like + * checksums, etc. */ +#define OCFS2_QBLK_RESERVED_SPACE 8 + +/* Generic header of all quota files */ +struct ocfs2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* Quota format version */ +}; + +#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of global quota file (immediately follows the generic + * header) */ +struct ocfs2_global_disk_dqinfo { +/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */ + __le32 dqi_igrace; /* Grace time for inode softlimit excess */ + __le32 dqi_syncms; /* Time after which we sync local changes to + * global quota file */ + __le32 dqi_blocks; /* Number of blocks in quota file */ +/*10*/ __le32 dqi_free_blk; /* First free block in quota file */ + __le32 dqi_free_entry; /* First block with free dquot entry in quota + * file */ +}; + +/* Structure with global user / group information. We reserve some space + * for future use. 
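A minimal sketch of how the helpers above pack xe_type (the entry is assumed to start zeroed, and prefix_index stands in for one of the name-prefix indices defined elsewhere):

static void demo_init_xattr_entry(struct ocfs2_xattr_entry *xe, int prefix_index)
{
	*xe = (struct ocfs2_xattr_entry){ 0 };	/* start from a zeroed entry */
	ocfs2_xattr_set_type(xe, prefix_index);	/* low 7 bits: name prefix index */
	ocfs2_xattr_set_local(xe, 1);		/* bit 7: value kept in the local
						   storage, not in a value tree */
	/* ocfs2_xattr_get_type(xe) now returns prefix_index and
	 * ocfs2_xattr_is_local(xe) is non-zero. */
}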
*/ +struct ocfs2_global_disk_dqblk { +/*00*/ __le32 dqb_id; /* ID the structure belongs to */ + __le32 dqb_use_count; /* Number of nodes having reference to this structure */ + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ +/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ +/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space */ +/*30*/ __le64 dqb_curspace; /* current space occupied */ + __le64 dqb_btime; /* time limit for excessive disk use */ +/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */ + __le64 dqb_pad1; +/*50*/ __le64 dqb_pad2; +}; + +/* + * On-disk structures for local quota file + */ + +/* Magic numbers and known versions for local quota files */ +#define OCFS2_LOCAL_QMAGICS {\ + 0x0cf524c0, /* USRQUOTA */ \ + 0x0cf524c1 /* GRPQUOTA */ \ +} + +#define OCFS2_LOCAL_QVERSIONS {\ + 0, \ + 0, \ +} + +/* Quota flags in dqinfo header */ +#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\ + * quota has been cleanly turned off) */ + +#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of local quota file (immediately follows the generic + * header) */ +struct ocfs2_local_disk_dqinfo { + __le32 dqi_flags; /* Flags for quota file */ + __le32 dqi_chunks; /* Number of chunks of quota structures + * with a bitmap */ + __le32 dqi_blocks; /* Number of blocks allocated for quota file */ +}; + +/* Header of one chunk of a quota file */ +struct ocfs2_local_disk_chunk { + __le32 dqc_free; /* Number of free entries in the bitmap */ + __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + * chunk of quota file */ +}; + +/* One entry in local quota file */ +struct ocfs2_local_disk_dqblk { +/*00*/ __le64 dqb_id; /* id this quota applies to */ + __le64 dqb_spacemod; /* Change in the amount of used space */ +/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */ +}; + + +/* + * The quota trailer lives at the end of each quota block. 
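To make the trailer placement concrete (a sketch; ocfs2_block_dqtrailer() below performs the same arithmetic):

/* With OCFS2_QBLK_RESERVED_SPACE == 8 the trailer occupies the last
 * eight bytes of every quota block:
 *   4096-byte block -> trailer at offset 4088
 *    512-byte block -> trailer at offset  504
 * and everything before it is available for dquot entries. */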
+ */ + +struct ocfs2_disk_dqtrailer { +/*00*/ struct ocfs2_block_check dq_check; /* Error checking */ +/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */ +}; + +static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize, + void *buf) +{ + char *ptr = buf; + ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE; + + return (struct ocfs2_disk_dqtrailer *)ptr; +} + +#ifdef __KERNEL__ +static inline int ocfs2_fast_symlink_chars(struct super_block *sb) +{ + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, + struct ocfs2_dinode *di) +{ + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + xattrsize; + else + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + +static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_extent_recs_per_inode_with_xattr( + struct super_block *sb, + struct ocfs2_dinode *di) +{ + int size; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - + xattrsize; + else + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_leaf, dl_list.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + +static inline int ocfs2_dx_entries_per_root(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries); + + return size / sizeof(struct ocfs2_dx_entry); +} + +static inline u16 ocfs2_local_alloc_size(struct super_block *sb) +{ + u16 size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(struct super_block *sb, + int suballocator, + u32 feature_incompat) +{ + int size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. 
Unfortunately, older + * code expects bg_size set to the maximum. Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} + +static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) +{ + u64 offset = OCFS2_BACKUP_SB_START; + + if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { + offset <<= (2 * index); + offset >>= sb->s_blocksize_bits; + return offset; + } + + return 0; + +} + +static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records.rl_recs); + + return size / sizeof(struct ocfs2_refcount_rec); +} + +static inline u32 +ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec) +{ + return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; +} +#else +static inline int ocfs2_fast_symlink_chars(int blocksize) +{ + return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_max_inline_data_with_xattr(int blocksize, + struct ocfs2_dinode *di) +{ + if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL)) + return blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + di->i_xattr_inline_size; + else + return blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + +static inline int ocfs2_extent_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline int ocfs2_extent_recs_per_eb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_extent_recs_per_gd(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_group_desc, bg_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_local_alloc_size(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(int blocksize, + int suballocator, + uint32_t feature_incompat) +{ + int size = blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older + * code expects bg_size set to the maximum.
Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} + +static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) +{ + uint64_t offset = OCFS2_BACKUP_SB_START; + + if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { + offset <<= (2 * index); + offset /= blocksize; + return offset; + } + + return 0; +} + +static inline int ocfs2_xattr_recs_per_xb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} +#endif /* __KERNEL__ */ + + +static inline int ocfs2_system_inode_is_global(int type) +{ + return ((type >= 0) && + (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); +} + +static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, + int type, int slot) +{ + int chars; + + /* + * Global system inodes can only have one copy. Everything + * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode + * list has a copy per slot. + */ + if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) + chars = snprintf(buf, len, "%s", + ocfs2_system_inodes[type].si_name); + else + chars = snprintf(buf, len, + ocfs2_system_inodes[type].si_name, + slot); + + return chars; +} + +static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, + umode_t mode) +{ + de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) +{ + if ((offsetof(struct ocfs2_group_desc, bg_bitmap) + + le16_to_cpu(gd->bg_size)) != + offsetof(struct ocfs2_group_desc, bg_list)) + return 0; + /* + * Only valid to check l_next_free_rec if + * bg_bitmap + bg_size == bg_list. + */ + if (!gd->bg_list.l_next_free_rec) + return 0; + return 1; +} +#endif /* _OCFS2_FS_H */ + diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h new file mode 100644 index 00000000..5b27ff1f --- /dev/null +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -0,0 +1,242 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_ioctl.h + * + * Defines OCFS2 ioctls. + * + * Copyright (C) 2010 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_IOCTL_H +#define OCFS2_IOCTL_H + +/* + * ioctl commands + */ +#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS +#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS +#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS + +/* + * Space reservation / allocation / free ioctls and argument structure + * are designed to be compatible with XFS. + * + * ALLOCSP* and FREESP* are not and will never be supported, but are + * included here for completeness. 
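A minimal userspace sketch of the attribute ioctls above (error handling trimmed; the descriptor is assumed to be an open file on an ocfs2 mount, and the argument is passed as an int in the usual ext2-attribute style):

#include <sys/ioctl.h>
#include <linux/fs.h>	/* FS_IOC_GETFLAGS, FS_IMMUTABLE_FL */

static int demo_set_immutable(int fd)
{
	int flags;

	if (ioctl(fd, OCFS2_IOC_GETFLAGS, &flags) < 0)
		return -1;
	flags |= FS_IMMUTABLE_FL;	/* example attribute bit */
	return ioctl(fd, OCFS2_IOC_SETFLAGS, &flags);
}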
+ */ +struct ocfs2_space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +}; + +#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv) +#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) + +/* Used to pass group descriptor data when online resize is done */ +struct ocfs2_new_group_input { + __u64 group; /* Group descriptor's blkno. */ + __u32 clusters; /* Total number of clusters in this group */ + __u32 frees; /* Total free clusters in this group */ + __u16 chain; /* Chain for this group */ + __u16 reserved1; + __u32 reserved2; +}; + +#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int) +#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) +#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) + +/* Used to pass 2 file names to reflink. */ +struct reflink_arguments { + __u64 old_path; + __u64 new_path; + __u64 preserve; +}; +#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) + +/* Following definitions dedicated for ocfs2_info_request ioctls. */ +#define OCFS2_INFO_MAX_REQUEST (50) +#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2) + +/* Magic number of all requests */ +#define OCFS2_INFO_MAGIC (0x4F32494E) + +/* + * Always try to separate info request into small pieces to + * guarantee the backward&forward compatibility. 
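A userspace sketch of the reflink ioctl above (the paths and descriptor are hypothetical; the path pointers are cast to __u64 as the structure requires, and preserve selects whether attributes of the original are kept):

#include <stdint.h>
#include <sys/ioctl.h>

static int demo_reflink(int fd)	/* fd: an open file on the same ocfs2 mount */
{
	struct reflink_arguments args = {
		.old_path = (__u64)(uintptr_t)"/mnt/ocfs2/golden.img",
		.new_path = (__u64)(uintptr_t)"/mnt/ocfs2/clone.img",
		.preserve = 1,
	};

	return ioctl(fd, OCFS2_IOC_REFLINK, &args);
}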
+ */ +struct ocfs2_info { + __u64 oi_requests; /* Array of __u64 pointers to requests */ + __u32 oi_count; /* Number of requests in info_requests */ + __u32 oi_pad; +}; + +struct ocfs2_info_request { +/*00*/ __u32 ir_magic; /* Magic number */ + __u32 ir_code; /* Info request code */ + __u32 ir_size; /* Size of request */ + __u32 ir_flags; /* Request flags */ +/*10*/ /* Request specific fields */ +}; + +struct ocfs2_info_clustersize { + struct ocfs2_info_request ic_req; + __u32 ic_clustersize; + __u32 ic_pad; +}; + +struct ocfs2_info_blocksize { + struct ocfs2_info_request ib_req; + __u32 ib_blocksize; + __u32 ib_pad; +}; + +struct ocfs2_info_maxslots { + struct ocfs2_info_request im_req; + __u32 im_max_slots; + __u32 im_pad; +}; + +struct ocfs2_info_label { + struct ocfs2_info_request il_req; + __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN]; +} __attribute__ ((packed)); + +struct ocfs2_info_uuid { + struct ocfs2_info_request iu_req; + __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1]; +} __attribute__ ((packed)); + +struct ocfs2_info_fs_features { + struct ocfs2_info_request if_req; + __u32 if_compat_features; + __u32 if_incompat_features; + __u32 if_ro_compat_features; + __u32 if_pad; +}; + +struct ocfs2_info_journal_size { + struct ocfs2_info_request ij_req; + __u64 ij_journal_size; +}; + +struct ocfs2_info_freeinode { + struct ocfs2_info_request ifi_req; + struct ocfs2_info_local_freeinode { + __u64 lfi_total; + __u64 lfi_free; + } ifi_stat[OCFS2_MAX_SLOTS]; + __u32 ifi_slotnum; /* out */ + __u32 ifi_pad; +}; + +#define OCFS2_INFO_MAX_HIST (32) + +struct ocfs2_info_freefrag { + struct ocfs2_info_request iff_req; + struct ocfs2_info_freefrag_stats { /* (out) */ + struct ocfs2_info_free_chunk_list { + __u32 fc_chunks[OCFS2_INFO_MAX_HIST]; + __u32 fc_clusters[OCFS2_INFO_MAX_HIST]; + } ffs_fc_hist; + __u32 ffs_clusters; + __u32 ffs_free_clusters; + __u32 ffs_free_chunks; + __u32 ffs_free_chunks_real; + __u32 ffs_min; /* Minimum free chunksize in clusters */ + __u32 ffs_max; + __u32 ffs_avg; + __u32 ffs_pad; + } iff_ffs; + __u32 iff_chunksize; /* chunksize in clusters(in) */ + __u32 iff_pad; +}; + +/* Codes for ocfs2_info_request */ +enum ocfs2_info_type { + OCFS2_INFO_CLUSTERSIZE = 1, + OCFS2_INFO_BLOCKSIZE, + OCFS2_INFO_MAXSLOTS, + OCFS2_INFO_LABEL, + OCFS2_INFO_UUID, + OCFS2_INFO_FS_FEATURES, + OCFS2_INFO_JOURNAL_SIZE, + OCFS2_INFO_FREEINODE, + OCFS2_INFO_FREEFRAG, + OCFS2_INFO_NUM_TYPES +}; + +/* Flags for struct ocfs2_info_request */ +/* Filled by the caller */ +#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not + required. This is a hint. + It is up to ocfs2 whether + the request can be fulfilled + without locking. */ +/* Filled by ocfs2 */ +#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood + this request and + filled in the answer */ + +#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during + request handling. */ + +#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) + +struct ocfs2_move_extents { +/* All values are in bytes */ + /* in */ + __u64 me_start; /* Virtual start in the file to move */ + __u64 me_len; /* Length of the extents to be moved */ + __u64 me_goal; /* Physical offset of the goal, + it's in block unit */ + __u64 me_threshold; /* Maximum distance from goal or threshold + for auto defragmentation */ + __u64 me_flags; /* Flags for the operation: + * - auto defragmentation. + * - refcount,xattr cases. 
+ */ + /* out */ + __u64 me_moved_len; /* Moved/defragmented length */ + __u64 me_new_offset; /* Resulting physical location */ + __u32 me_reserved[2]; /* Reserved for future use */ +}; + +#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel claims new + clusters itself and + uses them as the goal + for the extents being + moved */ +#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent + moving; this makes the + movement less likely to + fail, but may leave the + fs even more fragmented */ +#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation + completed in full. + */ + +#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents) + +#endif /* OCFS2_IOCTL_H */ diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h new file mode 100644 index 00000000..d277aabf --- /dev/null +++ b/fs/ocfs2/ocfs2_lockid.h @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_lockid.h + * + * Defines OCFS2 lockid bits. + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA.
+ */ + +#ifndef OCFS2_LOCKID_H +#define OCFS2_LOCKID_H + +/* lock ids are made up in the following manner: + * name[0] --> type + * name[1-6] --> 6 pad characters, reserved for now + * name[7-22] --> block number, expressed in hex as 16 chars + * name[23-30] --> i_generation, expressed in hex 8 chars + * name[31] --> '\0' */ +#define OCFS2_LOCK_ID_MAX_LEN 32 +#define OCFS2_LOCK_ID_PAD "000000" + +#define OCFS2_DENTRY_LOCK_INO_START 18 + +enum ocfs2_lock_type { + OCFS2_LOCK_TYPE_META = 0, + OCFS2_LOCK_TYPE_DATA, + OCFS2_LOCK_TYPE_SUPER, + OCFS2_LOCK_TYPE_RENAME, + OCFS2_LOCK_TYPE_RW, + OCFS2_LOCK_TYPE_DENTRY, + OCFS2_LOCK_TYPE_OPEN, + OCFS2_LOCK_TYPE_FLOCK, + OCFS2_LOCK_TYPE_QINFO, + OCFS2_LOCK_TYPE_NFS_SYNC, + OCFS2_LOCK_TYPE_ORPHAN_SCAN, + OCFS2_LOCK_TYPE_REFCOUNT, + OCFS2_NUM_LOCK_TYPES +}; + +static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) +{ + char c; + switch (type) { + case OCFS2_LOCK_TYPE_META: + c = 'M'; + break; + case OCFS2_LOCK_TYPE_DATA: + c = 'D'; + break; + case OCFS2_LOCK_TYPE_SUPER: + c = 'S'; + break; + case OCFS2_LOCK_TYPE_RENAME: + c = 'R'; + break; + case OCFS2_LOCK_TYPE_RW: + c = 'W'; + break; + case OCFS2_LOCK_TYPE_DENTRY: + c = 'N'; + break; + case OCFS2_LOCK_TYPE_OPEN: + c = 'O'; + break; + case OCFS2_LOCK_TYPE_FLOCK: + c = 'F'; + break; + case OCFS2_LOCK_TYPE_QINFO: + c = 'Q'; + break; + case OCFS2_LOCK_TYPE_NFS_SYNC: + c = 'Y'; + break; + case OCFS2_LOCK_TYPE_ORPHAN_SCAN: + c = 'P'; + break; + case OCFS2_LOCK_TYPE_REFCOUNT: + c = 'T'; + break; + default: + c = '\0'; + } + + return c; +} + +static char *ocfs2_lock_type_strings[] = { + [OCFS2_LOCK_TYPE_META] = "Meta", + [OCFS2_LOCK_TYPE_DATA] = "Data", + [OCFS2_LOCK_TYPE_SUPER] = "Super", + [OCFS2_LOCK_TYPE_RENAME] = "Rename", + /* Need to differntiate from [R]ename.. serializing writes is the + * important job it does, anyway. */ + [OCFS2_LOCK_TYPE_RW] = "Write/Read", + [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", + [OCFS2_LOCK_TYPE_OPEN] = "Open", + [OCFS2_LOCK_TYPE_FLOCK] = "Flock", + [OCFS2_LOCK_TYPE_QINFO] = "Quota", + [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", + [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", +}; + +static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) +{ +#ifdef __KERNEL__ + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); +#endif + return ocfs2_lock_type_strings[type]; +} + +#endif /* OCFS2_LOCKID_H */ diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h new file mode 100644 index 00000000..2e45c8d2 --- /dev/null +++ b/fs/ocfs2/ocfs2_lockingver.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_lockingver.h + * + * Defines OCFS2 Locking version values. + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_LOCKINGVER_H +#define OCFS2_LOCKINGVER_H + +/* + * The protocol version for ocfs2 cluster locking. See dlmglue.c for + * more details. + * + * 1.0 - Initial locking version from ocfs2 1.4. 
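Putting the lock-name layout documented above together, a name for a metadata lock could be built like this (a kernel-side sketch of what the dlmglue code is expected to do; blkno and generation are hypothetical inputs):

static void demo_build_lock_name(char *name, u64 blkno, u32 generation)
{
	/* 1 type char + 6 pad chars + 16 hex digits of block number +
	 * 8 hex digits of i_generation + '\0' == OCFS2_LOCK_ID_MAX_LEN. */
	snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
		 ocfs2_lock_type_char(OCFS2_LOCK_TYPE_META),
		 OCFS2_LOCK_ID_PAD,
		 (unsigned long long)blkno, generation);
}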
+ */ +#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 +#define OCFS2_LOCKING_PROTOCOL_MINOR 0 + +#endif /* OCFS2_LOCKINGVER_H */ diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h new file mode 100644 index 00000000..3b481f49 --- /dev/null +++ b/fs/ocfs2/ocfs2_trace.h @@ -0,0 +1,2764 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ocfs2 + +#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OCFS2_H + +#include + +DECLARE_EVENT_CLASS(ocfs2__int, + TP_PROTO(int num), + TP_ARGS(num), + TP_STRUCT__entry( + __field(int, num) + ), + TP_fast_assign( + __entry->num = num; + ), + TP_printk("%d", __entry->num) +); + +#define DEFINE_OCFS2_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__int, name, \ + TP_PROTO(int num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__uint, + TP_PROTO(unsigned int num), + TP_ARGS(num), + TP_STRUCT__entry( + __field( unsigned int, num ) + ), + TP_fast_assign( + __entry->num = num; + ), + TP_printk("%u", __entry->num) +); + +#define DEFINE_OCFS2_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint, name, \ + TP_PROTO(unsigned int num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__ull, + TP_PROTO(unsigned long long blkno), + TP_ARGS(blkno), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->blkno = blkno; + ), + TP_printk("%llu", __entry->blkno) +); + +#define DEFINE_OCFS2_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull, name, \ + TP_PROTO(unsigned long long num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__pointer, + TP_PROTO(void *pointer), + TP_ARGS(pointer), + TP_STRUCT__entry( + __field(void *, pointer) + ), + TP_fast_assign( + __entry->pointer = pointer; + ), + TP_printk("%p", __entry->pointer) +); + +#define DEFINE_OCFS2_POINTER_EVENT(name) \ +DEFINE_EVENT(ocfs2__pointer, name, \ + TP_PROTO(void *pointer), \ + TP_ARGS(pointer)) + +DECLARE_EVENT_CLASS(ocfs2__string, + TP_PROTO(const char *name), + TP_ARGS(name), + TP_STRUCT__entry( + __string(name,name) + ), + TP_fast_assign( + __assign_str(name, name); + ), + TP_printk("%s", __get_str(name)) +); + +#define DEFINE_OCFS2_STRING_EVENT(name) \ +DEFINE_EVENT(ocfs2__string, name, \ + TP_PROTO(const char *name), \ + TP_ARGS(name)) + +DECLARE_EVENT_CLASS(ocfs2__int_int, + TP_PROTO(int value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(int, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%d %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_INT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__int_int, name, \ + TP_PROTO(int val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__uint_int, + TP_PROTO(unsigned int value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned int, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%u %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_UINT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_int, name, \ + TP_PROTO(unsigned int val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__uint_uint, + TP_PROTO(unsigned int value1, unsigned int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned int, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%u %u", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_uint, 
name, \ + TP_PROTO(unsigned int val1, unsigned int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint, + TP_PROTO(unsigned long long value1, unsigned int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %u", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint, name, \ + TP_PROTO(unsigned long long val1, unsigned int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_int, + TP_PROTO(unsigned long long value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_int, name, \ + TP_PROTO(unsigned long long val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull, + TP_PROTO(unsigned long long value1, unsigned long long value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %llu", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull, name, \ + TP_PROTO(unsigned long long val1, unsigned long long val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint, + TP_PROTO(unsigned long long value1, + unsigned long long value2, unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %llu %u", + __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_uint, name, \ + TP_PROTO(unsigned long long val1, \ + unsigned long long val2, unsigned int val3), \ + TP_ARGS(val1, val2, val3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint, + TP_PROTO(unsigned long long value1, + unsigned int value2, unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned int, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %u %u", __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint_uint, name, \ + TP_PROTO(unsigned long long val1, \ + unsigned int val2, unsigned int val3), \ + TP_ARGS(val1, val2, val3)) + +DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint, + TP_PROTO(unsigned int value1, unsigned int value2, + unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field( unsigned int, value1 ) + __field( unsigned int, value2 ) + __field( unsigned int, value3 ) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3) +); + +#define 
DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_uint_uint, name, \ + TP_PROTO(unsigned int value1, unsigned int value2, \ + unsigned int value3), \ + TP_ARGS(value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull, + TP_PROTO(unsigned long long value1, + unsigned long long value2, unsigned long long value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned long long, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %llu %llu", + __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_ull, name, \ + TP_PROTO(unsigned long long value1, unsigned long long value2, \ + unsigned long long value3), \ + TP_ARGS(value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int, + TP_PROTO(unsigned long long ull, int value1, int value2, int value3), + TP_ARGS(ull, value1, value2, value3), + TP_STRUCT__entry( + __field( unsigned long long, ull ) + __field( int, value1 ) + __field( int, value2 ) + __field( int, value3 ) + ), + TP_fast_assign( + __entry->ull = ull; + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %d %d %d", + __entry->ull, __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_int_int_int, name, \ + TP_PROTO(unsigned long long ull, int value1, \ + int value2, int value3), \ + TP_ARGS(ull, value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint, + TP_PROTO(unsigned long long ull, unsigned int value1, + unsigned int value2, unsigned int value3), + TP_ARGS(ull, value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, ull) + __field(unsigned int, value1) + __field(unsigned int, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->ull = ull; + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %u %u %u", + __entry->ull, __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name, \ + TP_PROTO(unsigned long long ull, unsigned int value1, \ + unsigned int value2, unsigned int value3), \ + TP_ARGS(ull, value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint, + TP_PROTO(unsigned long long value1, unsigned long long value2, + unsigned int value3, unsigned int value4), + TP_ARGS(value1, value2, value3, value4), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned int, value3) + __field(unsigned int, value4) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + __entry->value4 = value4; + ), + TP_printk("%llu %llu %u %u", + __entry->value1, __entry->value2, + __entry->value3, __entry->value4) +); + +#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name, \ + TP_PROTO(unsigned long long ull, unsigned long long ull1, \ + unsigned int value2, unsigned int value3), \ + TP_ARGS(ull, ull1, value2, value3)) + +/* Trace events for fs/ocfs2/alloc.c. 
*/ +DECLARE_EVENT_CLASS(ocfs2__btree_ops, + TP_PROTO(unsigned long long owner,\ + unsigned int value1, unsigned int value2), + TP_ARGS(owner, value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %u %u", + __entry->owner, __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_BTREE_EVENT(name) \ +DEFINE_EVENT(ocfs2__btree_ops, name, \ + TP_PROTO(unsigned long long owner, \ + unsigned int value1, unsigned int value2), \ + TP_ARGS(owner, value1, value2)) + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree); + +DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents); + +DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert); + +TRACE_EVENT(ocfs2_grow_tree, + TP_PROTO(unsigned long long owner, int depth), + TP_ARGS(owner, depth), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(int, depth) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->depth = depth; + ), + TP_printk("%llu %d", __entry->owner, __entry->depth) +); + +TRACE_EVENT(ocfs2_rotate_subtree, + TP_PROTO(int subtree_root, unsigned long long blkno, + int depth), + TP_ARGS(subtree_root, blkno, depth), + TP_STRUCT__entry( + __field(int, subtree_root) + __field(unsigned long long, blkno) + __field(int, depth) + ), + TP_fast_assign( + __entry->subtree_root = subtree_root; + __entry->blkno = blkno; + __entry->depth = depth; + ), + TP_printk("%d %llu %d", __entry->subtree_root, + __entry->blkno, __entry->depth) +); + +TRACE_EVENT(ocfs2_insert_extent, + TP_PROTO(unsigned int ins_appending, unsigned int ins_contig, + int ins_contig_index, int free_records, int ins_tree_depth), + TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records, + ins_tree_depth), + TP_STRUCT__entry( + __field(unsigned int, ins_appending) + __field(unsigned int, ins_contig) + __field(int, ins_contig_index) + __field(int, free_records) + __field(int, ins_tree_depth) + ), + TP_fast_assign( + __entry->ins_appending = ins_appending; + __entry->ins_contig = ins_contig; + __entry->ins_contig_index = ins_contig_index; + __entry->free_records = free_records; + __entry->ins_tree_depth = ins_tree_depth; + ), + TP_printk("%u %u %d %d %d", + __entry->ins_appending, __entry->ins_contig, + __entry->ins_contig_index, __entry->free_records, + __entry->ins_tree_depth) +); + +TRACE_EVENT(ocfs2_split_extent, + TP_PROTO(int split_index, unsigned int c_contig_type, + unsigned int c_has_empty_extent, + unsigned int c_split_covers_rec), + TP_ARGS(split_index, c_contig_type, + c_has_empty_extent, c_split_covers_rec), + TP_STRUCT__entry( + __field(int, split_index) + __field(unsigned int, c_contig_type) + __field(unsigned int, c_has_empty_extent) + __field(unsigned int, c_split_covers_rec) + ), + TP_fast_assign( + __entry->split_index = split_index; + __entry->c_contig_type = c_contig_type; + __entry->c_has_empty_extent = c_has_empty_extent; + __entry->c_split_covers_rec = c_split_covers_rec; + ), + TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type, + __entry->c_has_empty_extent, __entry->c_split_covers_rec) +); + +TRACE_EVENT(ocfs2_remove_extent, + TP_PROTO(unsigned long long owner, unsigned int cpos, + 
unsigned int len, int index, + unsigned int e_cpos, unsigned int clusters), + TP_ARGS(owner, cpos, len, index, e_cpos, clusters), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(int, index) + __field(unsigned int, e_cpos) + __field(unsigned int, clusters) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->index = index; + __entry->e_cpos = e_cpos; + __entry->clusters = clusters; + ), + TP_printk("%llu %u %u %d %u %u", + __entry->owner, __entry->cpos, __entry->len, __entry->index, + __entry->e_cpos, __entry->clusters) +); + +TRACE_EVENT(ocfs2_commit_truncate, + TP_PROTO(unsigned long long ino, unsigned int new_cpos, + unsigned int clusters, unsigned int depth), + TP_ARGS(ino, new_cpos, clusters, depth), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, new_cpos) + __field(unsigned int, clusters) + __field(unsigned int, depth) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->new_cpos = new_cpos; + __entry->clusters = clusters; + __entry->depth = depth; + ), + TP_printk("%llu %u %u %u", + __entry->ino, __entry->new_cpos, + __entry->clusters, __entry->depth) +); + +TRACE_EVENT(ocfs2_validate_extent_block, + TP_PROTO(unsigned long long blkno), + TP_ARGS(blkno), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->blkno = blkno; + ), + TP_printk("%llu ", __entry->blkno) +); + +TRACE_EVENT(ocfs2_rotate_leaf, + TP_PROTO(unsigned int insert_cpos, int insert_index, + int has_empty, int next_free, + unsigned int l_count), + TP_ARGS(insert_cpos, insert_index, has_empty, + next_free, l_count), + TP_STRUCT__entry( + __field(unsigned int, insert_cpos) + __field(int, insert_index) + __field(int, has_empty) + __field(int, next_free) + __field(unsigned int, l_count) + ), + TP_fast_assign( + __entry->insert_cpos = insert_cpos; + __entry->insert_index = insert_index; + __entry->has_empty = has_empty; + __entry->next_free = next_free; + __entry->l_count = l_count; + ), + TP_printk("%u %d %d %d %u", __entry->insert_cpos, + __entry->insert_index, __entry->has_empty, + __entry->next_free, __entry->l_count) +); + +TRACE_EVENT(ocfs2_add_clusters_in_btree_ret, + TP_PROTO(int status, int reason, int err), + TP_ARGS(status, reason, err), + TP_STRUCT__entry( + __field(int, status) + __field(int, reason) + __field(int, err) + ), + TP_fast_assign( + __entry->status = status; + __entry->reason = reason; + __entry->err = err; + ), + TP_printk("%d %d %d", __entry->status, + __entry->reason, __entry->err) +); + +TRACE_EVENT(ocfs2_mark_extent_written, + TP_PROTO(unsigned long long owner, unsigned int cpos, + unsigned int len, unsigned int phys), + TP_ARGS(owner, cpos, len, phys), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(unsigned int, phys) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->phys = phys; + ), + TP_printk("%llu %u %u %u", + __entry->owner, __entry->cpos, + __entry->len, __entry->phys) +); + +DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops, + TP_PROTO(unsigned long long blkno, int index, + unsigned int start, unsigned int num), + TP_ARGS(blkno, index, start, num), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __field(int, index) + __field(unsigned int, start) + __field(unsigned int, num) + ), + TP_fast_assign( + __entry->blkno = blkno; + __entry->index = index; + 
__entry->start = start; + __entry->num = num; + ), + TP_printk("%llu %d %u %u", + __entry->blkno, __entry->index, + __entry->start, __entry->num) +); + +#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) \ +DEFINE_EVENT(ocfs2__truncate_log_ops, name, \ + TP_PROTO(unsigned long long blkno, int index, \ + unsigned int start, unsigned int num), \ + TP_ARGS(blkno, index, start, num)) + +DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append); + +DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log); + +DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery); + +DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs); + +TRACE_EVENT(ocfs2_cache_block_dealloc, + TP_PROTO(int type, int slot, unsigned long long suballoc, + unsigned long long blkno, unsigned int bit), + TP_ARGS(type, slot, suballoc, blkno, bit), + TP_STRUCT__entry( + __field(int, type) + __field(int, slot) + __field(unsigned long long, suballoc) + __field(unsigned long long, blkno) + __field(unsigned int, bit) + ), + TP_fast_assign( + __entry->type = type; + __entry->slot = slot; + __entry->suballoc = suballoc; + __entry->blkno = blkno; + __entry->bit = bit; + ), + TP_printk("%d %d %llu %llu %u", + __entry->type, __entry->slot, __entry->suballoc, + __entry->blkno, __entry->bit) +); + +TRACE_EVENT(ocfs2_trim_extent, + TP_PROTO(struct super_block *sb, unsigned long long blk, + unsigned long long count), + TP_ARGS(sb, blk, count), + TP_STRUCT__entry( + __field(int, dev_major) + __field(int, dev_minor) + __field(unsigned long long, blk) + __field(__u64, count) + ), + TP_fast_assign( + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); + __entry->blk = blk; + __entry->count = count; + ), + TP_printk("%d %d %llu %llu", + __entry->dev_major, __entry->dev_minor, + __entry->blk, __entry->count) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); + +/* End of trace events for fs/ocfs2/alloc.c. */ + +/* Trace events for fs/ocfs2/localalloc.c. 
*/ + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local); + +DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc); + +DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main); + +TRACE_EVENT(ocfs2_sync_local_to_main_free, + TP_PROTO(int count, int bit, unsigned long long start_blk, + unsigned long long blkno), + TP_ARGS(count, bit, start_blk, blkno), + TP_STRUCT__entry( + __field(int, count) + __field(int, bit) + __field(unsigned long long, start_blk) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->count = count; + __entry->bit = bit; + __entry->start_blk = start_blk; + __entry->blkno = blkno; + ), + TP_printk("%d %d %llu %llu", + __entry->count, __entry->bit, __entry->start_blk, + __entry->blkno) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result); + +/* End of trace events for fs/ocfs2/localalloc.c. */ + +/* Trace events for fs/ocfs2/resize.c. */ + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add); + +/* End of trace events for fs/ocfs2/resize.c. */ + +/* Trace events for fs/ocfs2/suballoc.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits); + +TRACE_EVENT(ocfs2_relink_block_group, + TP_PROTO(unsigned long long i_blkno, unsigned int chain, + unsigned long long bg_blkno, + unsigned long long prev_blkno), + TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno), + TP_STRUCT__entry( + __field(unsigned long long, i_blkno) + __field(unsigned int, chain) + __field(unsigned long long, bg_blkno) + __field(unsigned long long, prev_blkno) + ), + TP_fast_assign( + __entry->i_blkno = i_blkno; + __entry->chain = chain; + __entry->bg_blkno = bg_blkno; + __entry->prev_blkno = prev_blkno; + ), + TP_printk("%llu %u %llu %llu", + __entry->i_blkno, __entry->chain, __entry->bg_blkno, + __entry->prev_blkno) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits); + +TRACE_EVENT(ocfs2_free_suballoc_bits, + TP_PROTO(unsigned long long inode, unsigned long long group, + unsigned int start_bit, unsigned int count), + 
TP_ARGS(inode, group, start_bit, count), + TP_STRUCT__entry( + __field(unsigned long long, inode) + __field(unsigned long long, group) + __field(unsigned int, start_bit) + __field(unsigned int, count) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->group = group; + __entry->start_bit = start_bit; + __entry->count = count; + ), + TP_printk("%llu %llu %u %u", __entry->inode, __entry->group, + __entry->start_bit, __entry->count) +); + +TRACE_EVENT(ocfs2_free_clusters, + TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk, + unsigned int start_bit, unsigned int count), + TP_ARGS(bg_blkno, start_blk, start_bit, count), + TP_STRUCT__entry( + __field(unsigned long long, bg_blkno) + __field(unsigned long long, start_blk) + __field(unsigned int, start_bit) + __field(unsigned int, count) + ), + TP_fast_assign( + __entry->bg_blkno = bg_blkno; + __entry->start_blk = start_blk; + __entry->start_bit = start_bit; + __entry->count = count; + ), + TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk, + __entry->start_bit, __entry->count) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit); + +/* End of trace events for fs/ocfs2/suballoc.c. */ + +/* Trace events for fs/ocfs2/refcounttree.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block); + +DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops, + TP_PROTO(unsigned long long blkno, int index, + unsigned long long cpos, + unsigned int clusters, unsigned int refcount), + TP_ARGS(blkno, index, cpos, clusters, refcount), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __field(int, index) + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned int, refcount) + ), + TP_fast_assign( + __entry->blkno = blkno; + __entry->index = index; + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->refcount = refcount; + ), + TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index, + __entry->cpos, __entry->clusters, __entry->refcount) +); + +#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name) \ +DEFINE_EVENT(ocfs2__refcount_tree_ops, name, \ + TP_PROTO(unsigned long long blkno, int index, \ + unsigned long long cpos, \ + unsigned int count, unsigned int refcount), \ + TP_ARGS(blkno, index, cpos, count, refcount)) + +DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec); + +TRACE_EVENT(ocfs2_split_refcount_rec, + TP_PROTO(unsigned long long cpos, + unsigned int clusters, unsigned int refcount, + unsigned long long split_cpos, + unsigned int split_clusters, unsigned int split_refcount), + TP_ARGS(cpos, clusters, refcount, + split_cpos, split_clusters, split_refcount), + TP_STRUCT__entry( + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned int, refcount) + __field(unsigned long long, split_cpos) + __field(unsigned int, split_clusters) + __field(unsigned int, split_refcount) + ), + TP_fast_assign( + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->refcount = refcount; + 
__entry->split_cpos = split_cpos; + __entry->split_clusters = split_clusters; + __entry->split_refcount = split_refcount; + ), + TP_printk("%llu %u %u %llu %u %u", + __entry->cpos, __entry->clusters, __entry->refcount, + __entry->split_cpos, __entry->split_clusters, + __entry->split_refcount) +); + +DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec); + +TRACE_EVENT(ocfs2_decrease_refcount, + TP_PROTO(unsigned long long owner, + unsigned long long cpos, + unsigned int len, int delete), + TP_ARGS(owner, cpos, len, delete), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned long long, cpos) + __field(unsigned int, len) + __field(int, delete) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->delete = delete; + ), + TP_printk("%llu %llu %u %d", + __entry->owner, __entry->cpos, __entry->len, __entry->delete) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits); + +TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate, + TP_PROTO(int recs_add, unsigned long long cpos, + unsigned int clusters, unsigned long long r_cpos, + unsigned int r_clusters, unsigned int refcount, int index), + TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index), + TP_STRUCT__entry( + __field(int, recs_add) + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned long long, r_cpos) + __field(unsigned int, r_clusters) + __field(unsigned int, refcount) + __field(int, index) + ), + TP_fast_assign( + __entry->recs_add = recs_add; + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->r_cpos = r_cpos; + __entry->r_clusters = r_clusters; + __entry->refcount = refcount; + __entry->index = index; + ), + TP_printk("%d %llu %u %llu %u %u %d", + __entry->recs_add, __entry->cpos, __entry->clusters, + __entry->r_cpos, __entry->r_clusters, + __entry->refcount, __entry->index) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd); + +TRACE_EVENT(ocfs2_clear_ext_refcount, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int len, unsigned int p_cluster, + unsigned int ext_flags), + TP_ARGS(ino, cpos, len, p_cluster, ext_flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(unsigned int, p_cluster) + __field(unsigned int, ext_flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->len = len; + __entry->p_cluster = p_cluster; + __entry->ext_flags = ext_flags; + ), + TP_printk("%llu %u %u %u %u", + __entry->ino, __entry->cpos, __entry->len, + __entry->p_cluster, __entry->ext_flags) +); + +TRACE_EVENT(ocfs2_replace_clusters, + 
TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int old, unsigned int new, unsigned int len, + unsigned int ext_flags), + TP_ARGS(ino, cpos, old, new, len, ext_flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, old) + __field(unsigned int, new) + __field(unsigned int, len) + __field(unsigned int, ext_flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->old = old; + __entry->new = new; + __entry->len = len; + __entry->ext_flags = ext_flags; + ), + TP_printk("%llu %u %u %u %u %u", + __entry->ino, __entry->cpos, __entry->old, __entry->new, + __entry->len, __entry->ext_flags) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable); + +TRACE_EVENT(ocfs2_refcount_cow_hunk, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int write_len, unsigned int max_cpos, + unsigned int cow_start, unsigned int cow_len), + TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, write_len) + __field(unsigned int, max_cpos) + __field(unsigned int, cow_start) + __field(unsigned int, cow_len) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->write_len = write_len; + __entry->max_cpos = max_cpos; + __entry->cow_start = cow_start; + __entry->cow_len = cow_len; + ), + TP_printk("%llu %u %u %u %u %u", + __entry->ino, __entry->cpos, __entry->write_len, + __entry->max_cpos, __entry->cow_start, __entry->cow_len) +); + +/* End of trace events for fs/ocfs2/refcounttree.c. */ + +/* Trace events for fs/ocfs2/aops.c. */ + +DECLARE_EVENT_CLASS(ocfs2__get_block, + TP_PROTO(unsigned long long ino, unsigned long long iblock, + void *bh_result, int create), + TP_ARGS(ino, iblock, bh_result, create), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, iblock) + __field(void *, bh_result) + __field(int, create) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->iblock = iblock; + __entry->bh_result = bh_result; + __entry->create = create; + ), + TP_printk("%llu %llu %p %d", + __entry->ino, __entry->iblock, + __entry->bh_result, __entry->create) +); + +#define DEFINE_OCFS2_GET_BLOCK_EVENT(name) \ +DEFINE_EVENT(ocfs2__get_block, name, \ + TP_PROTO(unsigned long long ino, unsigned long long iblock, \ + void *bh_result, int create), \ + TP_ARGS(ino, iblock, bh_result, create)) + +DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block); + +DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap); + +TRACE_EVENT(ocfs2_try_to_write_inline_data, + TP_PROTO(unsigned long long ino, unsigned int len, + unsigned long long pos, unsigned int flags), + TP_ARGS(ino, len, pos, flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, len) + __field(unsigned long long, pos) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->len = len; + __entry->pos = pos; + __entry->flags = flags; + ), + TP_printk("%llu %u %llu 0x%x", + __entry->ino, __entry->len, __entry->pos, __entry->flags) +); + +TRACE_EVENT(ocfs2_write_begin_nolock, + TP_PROTO(unsigned long long ino, + long long i_size, unsigned int i_clusters, + unsigned long long pos, unsigned int len, + unsigned int flags, void *page, + unsigned 
int clusters, unsigned int extents_to_split), + TP_ARGS(ino, i_size, i_clusters, pos, len, flags, + page, clusters, extents_to_split), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(long long, i_size) + __field(unsigned int, i_clusters) + __field(unsigned long long, pos) + __field(unsigned int, len) + __field(unsigned int, flags) + __field(void *, page) + __field(unsigned int, clusters) + __field(unsigned int, extents_to_split) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->i_size = i_size; + __entry->i_clusters = i_clusters; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + __entry->page = page; + __entry->clusters = clusters; + __entry->extents_to_split = extents_to_split; + ), + TP_printk("%llu %lld %u %llu %u %u %p %u %u", + __entry->ino, __entry->i_size, __entry->i_clusters, + __entry->pos, __entry->len, + __entry->flags, __entry->page, __entry->clusters, + __entry->extents_to_split) +); + +TRACE_EVENT(ocfs2_write_end_inline, + TP_PROTO(unsigned long long ino, + unsigned long long pos, unsigned int copied, + unsigned int id_count, unsigned int features), + TP_ARGS(ino, pos, copied, id_count, features), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, pos) + __field(unsigned int, copied) + __field(unsigned int, id_count) + __field(unsigned int, features) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->pos = pos; + __entry->copied = copied; + __entry->id_count = id_count; + __entry->features = features; + ), + TP_printk("%llu %llu %u %u %u", + __entry->ino, __entry->pos, __entry->copied, + __entry->id_count, __entry->features) +); + +/* End of trace events for fs/ocfs2/aops.c. */ + +/* Trace events for fs/ocfs2/mmap.c. */ + +TRACE_EVENT(ocfs2_fault, + TP_PROTO(unsigned long long ino, + void *area, void *page, unsigned long pgoff), + TP_ARGS(ino, area, page, pgoff), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(void *, area) + __field(void *, page) + __field(unsigned long, pgoff) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->area = area; + __entry->page = page; + __entry->pgoff = pgoff; + ), + TP_printk("%llu %p %p %lu", + __entry->ino, __entry->area, __entry->page, __entry->pgoff) +); + +/* End of trace events for fs/ocfs2/mmap.c. */ + +/* Trace events for fs/ocfs2/file.c. 
*/ + +DECLARE_EVENT_CLASS(ocfs2__file_ops, + TP_PROTO(void *inode, void *file, void *dentry, + unsigned long long ino, + unsigned int d_len, const unsigned char *d_name, + unsigned long long para), + TP_ARGS(inode, file, dentry, ino, d_len, d_name, para), + TP_STRUCT__entry( + __field(void *, inode) + __field(void *, file) + __field(void *, dentry) + __field(unsigned long long, ino) + __field(unsigned int, d_len) + __string(d_name, d_name) + __field(unsigned long long, para) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->file = file; + __entry->dentry = dentry; + __entry->ino = ino; + __entry->d_len = d_len; + __assign_str(d_name, d_name); + __entry->para = para; + ), + TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file, + __entry->dentry, __entry->ino, __entry->para, + __entry->d_len, __get_str(d_name)) +); + +#define DEFINE_OCFS2_FILE_OPS(name) \ +DEFINE_EVENT(ocfs2__file_ops, name, \ +TP_PROTO(void *inode, void *file, void *dentry, \ + unsigned long long ino, \ + unsigned int d_len, const unsigned char *d_name, \ + unsigned long long mode), \ + TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode)) + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_open); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_release); + +DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error); + +TRACE_EVENT(ocfs2_extend_allocation, + TP_PROTO(unsigned long long ip_blkno, unsigned long long size, + unsigned int clusters, unsigned int clusters_to_add, + int why, int restart_func), + TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func), + TP_STRUCT__entry( + __field(unsigned long long, ip_blkno) + __field(unsigned long long, size) + __field(unsigned int, clusters) + __field(unsigned int, clusters_to_add) + __field(int, why) + __field(int, restart_func) + ), + TP_fast_assign( + __entry->ip_blkno = ip_blkno; + __entry->size = size; + __entry->clusters = clusters; + __entry->clusters_to_add = clusters_to_add; + __entry->why = why; + __entry->restart_func = restart_func; + ), + TP_printk("%llu %llu %u %u %d %d", + __entry->ip_blkno, __entry->size, __entry->clusters, + __entry->clusters_to_add, __entry->why, __entry->restart_func) +); + +TRACE_EVENT(ocfs2_extend_allocation_end, + TP_PROTO(unsigned long long ino, + unsigned int di_clusters, unsigned long long di_size, + unsigned int ip_clusters, unsigned long long i_size), + TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, di_clusters) + __field(unsigned long long, di_size) + __field(unsigned int, ip_clusters) + __field(unsigned long long, i_size) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->di_clusters = di_clusters; + __entry->di_size = di_size; + __entry->ip_clusters = ip_clusters; + __entry->i_size = i_size; + ), + TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters, + __entry->di_size, __entry->ip_clusters, __entry->i_size) +); + +TRACE_EVENT(ocfs2_write_zero_page, + TP_PROTO(unsigned long long ino, + unsigned long long abs_from, unsigned long long abs_to, + unsigned long index, unsigned int zero_from, + unsigned int zero_to), + TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to), + TP_STRUCT__entry( + __field(unsigned long long, ino) + 
__field(unsigned long long, abs_from) + __field(unsigned long long, abs_to) + __field(unsigned long, index) + __field(unsigned int, zero_from) + __field(unsigned int, zero_to) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->abs_from = abs_from; + __entry->abs_to = abs_to; + __entry->index = index; + __entry->zero_from = zero_from; + __entry->zero_to = zero_to; + ), + TP_printk("%llu %llu %llu %lu %u %u", __entry->ino, + __entry->abs_from, __entry->abs_to, + __entry->index, __entry->zero_from, __entry->zero_to) +); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend); + +TRACE_EVENT(ocfs2_setattr, + TP_PROTO(void *inode, void *dentry, + unsigned long long ino, + unsigned int d_len, const unsigned char *d_name, + unsigned int ia_valid, unsigned int ia_mode, + unsigned int ia_uid, unsigned int ia_gid), + TP_ARGS(inode, dentry, ino, d_len, d_name, + ia_valid, ia_mode, ia_uid, ia_gid), + TP_STRUCT__entry( + __field(void *, inode) + __field(void *, dentry) + __field(unsigned long long, ino) + __field(unsigned int, d_len) + __string(d_name, d_name) + __field(unsigned int, ia_valid) + __field(unsigned int, ia_mode) + __field(unsigned int, ia_uid) + __field(unsigned int, ia_gid) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->dentry = dentry; + __entry->ino = ino; + __entry->d_len = d_len; + __assign_str(d_name, d_name); + __entry->ia_valid = ia_valid; + __entry->ia_mode = ia_mode; + __entry->ia_uid = ia_uid; + __entry->ia_gid = ia_gid; + ), + TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode, + __entry->dentry, __entry->ino, __entry->d_len, + __get_str(d_name), __entry->ia_valid, __entry->ia_mode, + __entry->ia_uid, __entry->ia_gid) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); + +TRACE_EVENT(ocfs2_prepare_inode_for_write, + TP_PROTO(unsigned long long ino, unsigned long long saved_pos, + int appending, unsigned long count, + int *direct_io, int *has_refcount), + TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, saved_pos) + __field(int, appending) + __field(unsigned long, count) + __field(int, direct_io) + __field(int, has_refcount) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->saved_pos = saved_pos; + __entry->appending = appending; + __entry->count = count; + __entry->direct_io = direct_io ? *direct_io : -1; + __entry->has_refcount = has_refcount ? *has_refcount : -1; + ), + TP_printk("%llu %llu %d %lu %d %d", __entry->ino, + __entry->saved_pos, __entry->appending, __entry->count, + __entry->direct_io, __entry->has_refcount) +); + +DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); + +/* End of trace events for fs/ocfs2/file.c. */ + +/* Trace events for fs/ocfs2/inode.c. 
*/ + +TRACE_EVENT(ocfs2_iget_begin, + TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type), + TP_ARGS(ino, flags, sysfile_type), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, flags) + __field(int, sysfile_type) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->flags = flags; + __entry->sysfile_type = sysfile_type; + ), + TP_printk("%llu %u %d", __entry->ino, + __entry->flags, __entry->sysfile_type) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked); + +TRACE_EVENT(ocfs2_iget_end, + TP_PROTO(void *inode, unsigned long long ino), + TP_ARGS(inode, ino), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + ), + TP_printk("%p %llu", __entry->inode, __entry->ino) +); + +TRACE_EVENT(ocfs2_find_actor, + TP_PROTO(void *inode, unsigned long long ino, + void *args, unsigned long long fi_blkno), + TP_ARGS(inode, ino, args, fi_blkno), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + __field(void *, args) + __field(unsigned long long, fi_blkno) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->args = args; + __entry->fi_blkno = fi_blkno; + ), + TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino, + __entry->args, __entry->fi_blkno) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); + +TRACE_EVENT(ocfs2_inode_is_valid_to_delete, + TP_PROTO(void *task, void *dc_task, unsigned long long ino, + unsigned int flags), + TP_ARGS(task, dc_task, ino, flags), + TP_STRUCT__entry( + __field(void *, task) + __field(void *, dc_task) + __field(unsigned long long, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->task = task; + __entry->dc_task = dc_task; + __entry->ino = ino; + __entry->flags = flags; + ), + TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task, + __entry->ino, __entry->flags) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode); + +TRACE_EVENT(ocfs2_inode_revalidate, + TP_PROTO(void *inode, unsigned long long ino, + unsigned int flags), + TP_ARGS(inode, ino, flags), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->flags = flags; + ), + TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty); + +/* End of trace events for fs/ocfs2/inode.c. */ + +/* Trace events for fs/ocfs2/extent_map.c. 
*/ + +TRACE_EVENT(ocfs2_read_virt_blocks, + TP_PROTO(void *inode, unsigned long long vblock, int nr, + void *bhs, unsigned int flags, void *validate), + TP_ARGS(inode, vblock, nr, bhs, flags, validate), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, vblock) + __field(int, nr) + __field(void *, bhs) + __field(unsigned int, flags) + __field(void *, validate) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->vblock = vblock; + __entry->nr = nr; + __entry->bhs = bhs; + __entry->flags = flags; + __entry->validate = validate; + ), + TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock, + __entry->nr, __entry->bhs, __entry->flags, __entry->validate) +); + +/* End of trace events for fs/ocfs2/extent_map.c. */ + +/* Trace events for fs/ocfs2/slot_map.c. */ + +DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block); + +DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot); + +/* End of trace events for fs/ocfs2/slot_map.c. */ + +/* Trace events for fs/ocfs2/heartbeat.c. */ + +DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down); + +/* End of trace events for fs/ocfs2/heartbeat.c. */ + +/* Trace events for fs/ocfs2/super.c. */ + +TRACE_EVENT(ocfs2_remount, + TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags), + TP_ARGS(s_flags, osb_flags, flags), + TP_STRUCT__entry( + __field(unsigned long, s_flags) + __field(unsigned long, osb_flags) + __field(int, flags) + ), + TP_fast_assign( + __entry->s_flags = s_flags; + __entry->osb_flags = osb_flags; + __entry->flags = flags; + ), + TP_printk("%lu %lu %d", __entry->s_flags, + __entry->osb_flags, __entry->flags) +); + +TRACE_EVENT(ocfs2_fill_super, + TP_PROTO(void *sb, void *data, int silent), + TP_ARGS(sb, data, silent), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, data) + __field(int, silent) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->data = data; + __entry->silent = silent; + ), + TP_printk("%p %p %d", __entry->sb, + __entry->data, __entry->silent) +); + +TRACE_EVENT(ocfs2_parse_options, + TP_PROTO(int is_remount, char *options), + TP_ARGS(is_remount, options), + TP_STRUCT__entry( + __field(int, is_remount) + __string(options, options) + ), + TP_fast_assign( + __entry->is_remount = is_remount; + __assign_str(options, options); + ), + TP_printk("%d %s", __entry->is_remount, __get_str(options)) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super); + +TRACE_EVENT(ocfs2_statfs, + TP_PROTO(void *sb, void *buf), + TP_ARGS(sb, buf), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, buf) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->buf = buf; + ), + TP_printk("%p %p", __entry->sb, __entry->buf) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume); + +TRACE_EVENT(ocfs2_initialize_super, + TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir, + unsigned long long system_dir, int cluster_bits), + TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits), + TP_STRUCT__entry( + __string(label, label) + __string(uuid_str, uuid_str) + __field(unsigned long long, root_dir) + __field(unsigned long long, system_dir) + __field(int, cluster_bits) + ), + TP_fast_assign( + __assign_str(label, label); + __assign_str(uuid_str, uuid_str); + __entry->root_dir = root_dir; + __entry->system_dir = system_dir; + __entry->cluster_bits = cluster_bits; + ), + TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str), + __entry->root_dir, __entry->system_dir, 
__entry->cluster_bits) +); + +/* End of trace events for fs/ocfs2/super.c. */ + +/* Trace events for fs/ocfs2/xattr.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation); + +TRACE_EVENT(ocfs2_init_xattr_set_ctxt, + TP_PROTO(const char *name, int meta, int clusters, int credits), + TP_ARGS(name, meta, clusters, credits), + TP_STRUCT__entry( + __string(name, name) + __field(int, meta) + __field(int, clusters) + __field(int, credits) + ), + TP_fast_assign( + __assign_str(name, name); + __entry->meta = meta; + __entry->clusters = clusters; + __entry->credits = credits; + ), + TP_printk("%s %d %d %d", __get_str(name), __entry->meta, + __entry->clusters, __entry->credits) +); + +DECLARE_EVENT_CLASS(ocfs2__xattr_find, + TP_PROTO(unsigned long long ino, const char *name, int name_index, + unsigned int hash, unsigned long long location, + int xe_index), + TP_ARGS(ino, name, name_index, hash, location, xe_index), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __string(name, name) + __field(int, name_index) + __field(unsigned int, hash) + __field(unsigned long long, location) + __field(int, xe_index) + ), + TP_fast_assign( + __entry->ino = ino; + __assign_str(name, name); + __entry->name_index = name_index; + __entry->hash = hash; + __entry->location = location; + __entry->xe_index = xe_index; + ), + TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name), + __entry->name_index, __entry->hash, __entry->location, + __entry->xe_index) +); + +#define DEFINE_OCFS2_XATTR_FIND_EVENT(name) \ +DEFINE_EVENT(ocfs2__xattr_find, name, \ +TP_PROTO(unsigned long long ino, const char *name, int name_index, \ + unsigned int hash, unsigned long long bucket, \ + int xe_index), \ + TP_ARGS(ino, name, name_index, hash, bucket, xe_index)) + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find); + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find); + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block); + +DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket); + 
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec); + +/* End of trace events for fs/ocfs2/xattr.c. */ + +/* Trace events for fs/ocfs2/reservations.c. */ + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end); + +TRACE_EVENT(ocfs2_resv_find_window_begin, + TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal, + unsigned int wanted, int empty_root), + TP_ARGS(r_start, r_end, goal, wanted, empty_root), + TP_STRUCT__entry( + __field(unsigned int, r_start) + __field(unsigned int, r_end) + __field(unsigned int, goal) + __field(unsigned int, wanted) + __field(int, empty_root) + ), + TP_fast_assign( + __entry->r_start = r_start; + __entry->r_end = r_end; + __entry->goal = goal; + __entry->wanted = wanted; + __entry->empty_root = empty_root; + ), + TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end, + __entry->goal, __entry->wanted, __entry->empty_root) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin); + +TRACE_EVENT(ocfs2_cannibalize_resv_end, + TP_PROTO(unsigned int start, unsigned int end, unsigned int len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(start, end, len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, start) + __field(unsigned int, end) + __field(unsigned int, len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->len = len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u", __entry->start, __entry->end, + __entry->len, __entry->last_start, __entry->last_len) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits); + +TRACE_EVENT(ocfs2_resmap_claimed_bits_begin, + TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen, + unsigned int r_start, unsigned int r_end, unsigned int r_len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(cstart, cend, clen, r_start, r_end, + r_len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, cstart) + __field(unsigned int, cend) + __field(unsigned int, clen) + __field(unsigned int, r_start) + __field(unsigned int, r_end) + __field(unsigned int, r_len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->cstart = cstart; + __entry->cend = cend; + __entry->clen = clen; + __entry->r_start = r_start; + __entry->r_end = r_end; + __entry->r_len = r_len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u %u %u %u", + __entry->cstart, __entry->cend, __entry->clen, + __entry->r_start, __entry->r_end, __entry->r_len, + __entry->last_start, __entry->last_len) +); + +TRACE_EVENT(ocfs2_resmap_claimed_bits_end, + TP_PROTO(unsigned int start, unsigned int end, unsigned int len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(start, end, len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, start) + __field(unsigned int, end) + __field(unsigned int, len) + __field(unsigned int, last_start) + 
__field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->len = len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u", __entry->start, __entry->end, + __entry->len, __entry->last_start, __entry->last_len) +); + +/* End of trace events for fs/ocfs2/reservations.c. */ + +/* Trace events for fs/ocfs2/quota_local.c. */ + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file); + +DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot); + +/* End of trace events for fs/ocfs2/quota_local.c. */ + +/* Trace events for fs/ocfs2/quota_global.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block); + +TRACE_EVENT(ocfs2_sync_dquot, + TP_PROTO(unsigned int dq_id, long long dqb_curspace, + long long spacechange, long long curinodes, + long long inodechange), + TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange), + TP_STRUCT__entry( + __field(unsigned int, dq_id) + __field(long long, dqb_curspace) + __field(long long, spacechange) + __field(long long, curinodes) + __field(long long, inodechange) + ), + TP_fast_assign( + __entry->dq_id = dq_id; + __entry->dqb_curspace = dqb_curspace; + __entry->spacechange = spacechange; + __entry->curinodes = curinodes; + __entry->inodechange = inodechange; + ), + TP_printk("%u %lld %lld %lld %lld", __entry->dq_id, + __entry->dqb_curspace, __entry->spacechange, + __entry->curinodes, __entry->inodechange) +); + +TRACE_EVENT(ocfs2_sync_dquot_helper, + TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type, + const char *s_id), + TP_ARGS(dq_id, dq_type, type, s_id), + + TP_STRUCT__entry( + __field(unsigned int, dq_id) + __field(unsigned int, dq_type) + __field(unsigned long, type) + __string(s_id, s_id) + ), + TP_fast_assign( + __entry->dq_id = dq_id; + __entry->dq_type = dq_type; + __entry->type = type; + __assign_str(s_id, s_id); + ), + TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type, + __entry->type, __get_str(s_id)) +); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty); + +/* End of trace events for fs/ocfs2/quota_global.c. */ + +/* Trace events for fs/ocfs2/dir.c. 
*/ +DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el); + +TRACE_EVENT(ocfs2_dx_dir_search, + TP_PROTO(unsigned long long ino, int namelen, const char *name, + unsigned int major_hash, unsigned int minor_hash, + unsigned long long blkno), + TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(int, namelen) + __string(name, name) + __field(unsigned int, major_hash) + __field(unsigned int,minor_hash) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->namelen = namelen; + __assign_str(name, name); + __entry->major_hash = major_hash; + __entry->minor_hash = minor_hash; + __entry->blkno = blkno; + ), + TP_printk("%llu %.*s %u %u %llu", __entry->ino, + __entry->namelen, __get_str(name), + __entry->major_hash, __entry->minor_hash, __entry->blkno) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir); + +TRACE_EVENT(ocfs2_find_files_on_disk, + TP_PROTO(int namelen, const char *name, void *blkno, + unsigned long long dir), + TP_ARGS(namelen, name, blkno, dir), + TP_STRUCT__entry( + __field(int, namelen) + __string(name, name) + __field(void *, blkno) + __field(unsigned long long, dir) + ), + TP_fast_assign( + __entry->namelen = namelen; + __assign_str(name, name); + __entry->blkno = blkno; + __entry->dir = dir; + ), + TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name), + __entry->blkno, __entry->dir) +); + +TRACE_EVENT(ocfs2_check_dir_for_entry, + TP_PROTO(unsigned long long dir, int namelen, const char *name), + TP_ARGS(dir, namelen, name), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __field(int, namelen) + __string(name, name) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->namelen = namelen; + __assign_str(name, name); + ), + TP_printk("%llu %.*s", __entry->dir, + __entry->namelen, __get_str(name)) +); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster); + +TRACE_EVENT(ocfs2_dx_dir_index_root_block, + TP_PROTO(unsigned long long dir, + unsigned int major_hash, unsigned int minor_hash, + int namelen, const char *name, unsigned int num_used), + TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __field(unsigned int, major_hash) + __field(unsigned int, minor_hash) + __field(int, namelen) + __string(name, name) + __field(unsigned int, num_used) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->major_hash = major_hash; + __entry->minor_hash = minor_hash; + __entry->namelen = namelen; + __assign_str(name, name); + __entry->num_used = num_used; + ), + TP_printk("%llu %x %x %.*s %u", __entry->dir, + __entry->major_hash, __entry->minor_hash, + __entry->namelen, __get_str(name), __entry->num_used) +); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert); + +/* End of trace events for fs/ocfs2/dir.c. */ + +/* Trace events for fs/ocfs2/namei.c. 
*/ + +DECLARE_EVENT_CLASS(ocfs2__dentry_ops, + TP_PROTO(void *dir, void *dentry, int name_len, const char *name, + unsigned long long dir_blkno, unsigned long long extra), + TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(unsigned long long, dir_blkno) + __field(unsigned long long, extra) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->dir_blkno = dir_blkno; + __entry->extra = extra; + ), + TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry, + __entry->name_len, __get_str(name), + __entry->dir_blkno, __entry->extra) +); + +#define DEFINE_OCFS2_DENTRY_OPS(name) \ +DEFINE_EVENT(ocfs2__dentry_ops, name, \ +TP_PROTO(void *dir, void *dentry, int name_len, const char *name, \ + unsigned long long dir_blkno, unsigned long long extra), \ + TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra)) + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_create); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret); + +TRACE_EVENT(ocfs2_mknod, + TP_PROTO(void *dir, void *dentry, int name_len, const char *name, + unsigned long long dir_blkno, unsigned long dev, int mode), + TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(unsigned long long, dir_blkno) + __field(unsigned long, dev) + __field(int, mode) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->dir_blkno = dir_blkno; + __entry->dev = dev; + __entry->mode = mode; + ), + TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry, + __entry->name_len, __get_str(name), + __entry->dir_blkno, __entry->dev, __entry->mode) +); + +TRACE_EVENT(ocfs2_link, + TP_PROTO(unsigned long long ino, int old_len, const char *old_name, + int name_len, const char *name), + TP_ARGS(ino, old_len, old_name, name_len, name), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(int, old_len) + __string(old_name, old_name) + __field(int, name_len) + __string(name, name) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->old_len = old_len; + __assign_str(old_name, old_name); + __entry->name_len = name_len; + __assign_str(name, name); + ), + TP_printk("%llu %.*s %.*s", __entry->ino, + __entry->old_len, __get_str(old_name), + __entry->name_len, __get_str(name)) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end); + +TRACE_EVENT(ocfs2_rename, + TP_PROTO(void *old_dir, void *old_dentry, + void *new_dir, void *new_dentry, + int old_len, const char *old_name, + int new_len, const char *new_name), + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, + old_len, old_name, new_len, new_name), + TP_STRUCT__entry( + __field(void *, old_dir) + __field(void *, old_dentry) + __field(void *, new_dir) + __field(void *, new_dentry) + __field(int, old_len) + __string(old_name, old_name) + __field(int, new_len) + __string(new_name, new_name) + ), + TP_fast_assign( + __entry->old_dir = old_dir; 
+ __entry->old_dentry = old_dentry; + __entry->new_dir = new_dir; + __entry->new_dentry = new_dentry; + __entry->old_len = old_len; + __assign_str(old_name, old_name); + __entry->new_len = new_len; + __assign_str(new_name, new_name); + ), + TP_printk("%p %p %p %p %.*s %.*s", + __entry->old_dir, __entry->old_dentry, + __entry->new_dir, __entry->new_dentry, + __entry->old_len, __get_str(old_name), + __entry->new_len, __get_str(new_name)) +); + +TRACE_EVENT(ocfs2_rename_target_exists, + TP_PROTO(int new_len, const char *new_name), + TP_ARGS(new_len, new_name), + TP_STRUCT__entry( + __field(int, new_len) + __string(new_name, new_name) + ), + TP_fast_assign( + __entry->new_len = new_len; + __assign_str(new_name, new_name); + ), + TP_printk("%.*s", __entry->new_len, __get_str(new_name)) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree); + +TRACE_EVENT(ocfs2_rename_over_existing, + TP_PROTO(unsigned long long new_blkno, void *new_bh, + unsigned long long newdi_blkno), + TP_ARGS(new_blkno, new_bh, newdi_blkno), + TP_STRUCT__entry( + __field(unsigned long long, new_blkno) + __field(void *, new_bh) + __field(unsigned long long, newdi_blkno) + ), + TP_fast_assign( + __entry->new_blkno = new_blkno; + __entry->new_bh = new_bh; + __entry->newdi_blkno = newdi_blkno; + ), + TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh, + __entry->newdi_blkno) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data); + +TRACE_EVENT(ocfs2_symlink_begin, + TP_PROTO(void *dir, void *dentry, const char *symname, + int len, const char *name), + TP_ARGS(dir, dentry, symname, len, name), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(const char *, symname) + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->symname = symname; + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry, + __entry->symname, __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_blkno_stringify, + TP_PROTO(unsigned long long blkno, const char *name, int namelen), + TP_ARGS(blkno, name, namelen), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __string(name, name) + __field(int, namelen) + ), + TP_fast_assign( + __entry->blkno = blkno; + __assign_str(name, name); + __entry->namelen = namelen; + ), + TP_printk("%llu %s %d", __entry->blkno, __get_str(name), + __entry->namelen) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end); + +TRACE_EVENT(ocfs2_orphan_del, + TP_PROTO(unsigned long long dir, const char *name, int namelen), + TP_ARGS(dir, name, namelen), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __string(name, name) + __field(int, namelen) + ), + TP_fast_assign( + __entry->dir = dir; + __assign_str(name, name); + __entry->namelen = namelen; + ), + TP_printk("%llu %s %d", __entry->dir, __get_str(name), + __entry->namelen) +); + +/* End of trace events for fs/ocfs2/namei.c. */ + +/* Trace events for fs/ocfs2/dcache.c. 
*/ + +TRACE_EVENT(ocfs2_dentry_revalidate, + TP_PROTO(void *dentry, int len, const char *name), + TP_ARGS(dentry, len, name), + TP_STRUCT__entry( + __field(void *, dentry) + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->dentry = dentry; + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_dentry_revalidate_negative, + TP_PROTO(int len, const char *name, unsigned long pgen, + unsigned long gen), + TP_ARGS(len, name, pgen, gen), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + __field(unsigned long, pgen) + __field(unsigned long, gen) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + __entry->pgen = pgen; + __entry->gen = gen; + ), + TP_printk("%.*s %lu %lu", __entry->len, __get_str(name), + __entry->pgen, __entry->gen) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata); + +DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret); + +TRACE_EVENT(ocfs2_find_local_alias, + TP_PROTO(int len, const char *name), + TP_ARGS(len, name), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%.*s", __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_dentry_attach_lock, + TP_PROTO(int len, const char *name, + unsigned long long parent, void *fsdata), + TP_ARGS(len, name, parent, fsdata), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + __field(unsigned long long, parent) + __field(void *, fsdata) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + __entry->parent = parent; + __entry->fsdata = fsdata; + ), + TP_printk("%.*s %llu %p", __entry->len, __get_str(name), + __entry->parent, __entry->fsdata) +); + +TRACE_EVENT(ocfs2_dentry_attach_lock_found, + TP_PROTO(const char *name, unsigned long long parent, + unsigned long long ino), + TP_ARGS(name, parent, ino), + TP_STRUCT__entry( + __string(name, name) + __field(unsigned long long, parent) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __assign_str(name, name); + __entry->parent = parent; + __entry->ino = ino; + ), + TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino) +); +/* End of trace events for fs/ocfs2/dcache.c. */ + +/* Trace events for fs/ocfs2/export.c. 
*/ + +TRACE_EVENT(ocfs2_get_dentry_begin, + TP_PROTO(void *sb, void *handle, unsigned long long blkno), + TP_ARGS(sb, handle, blkno), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, handle) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->handle = handle; + __entry->blkno = blkno; + ), + TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end); + +TRACE_EVENT(ocfs2_get_parent, + TP_PROTO(void *child, int len, const char *name, + unsigned long long ino), + TP_ARGS(child, len, name, ino), + TP_STRUCT__entry( + __field(void *, child) + __field(int, len) + __string(name, name) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __entry->child = child; + __entry->len = len; + __assign_str(name, name); + __entry->ino = ino; + ), + TP_printk("%p %.*s %llu", __entry->child, __entry->len, + __get_str(name), __entry->ino) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end); + +TRACE_EVENT(ocfs2_encode_fh_begin, + TP_PROTO(void *dentry, int name_len, const char *name, + void *fh, int len, int connectable), + TP_ARGS(dentry, name_len, name, fh, len, connectable), + TP_STRUCT__entry( + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(void *, fh) + __field(int, len) + __field(int, connectable) + ), + TP_fast_assign( + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->fh = fh; + __entry->len = len; + __entry->connectable = connectable; + ), + TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len, + __get_str(name), __entry->fh, __entry->len, + __entry->connectable) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent); + +DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type); + +/* End of trace events for fs/ocfs2/export.c. */ + +/* Trace events for fs/ocfs2/journal.c. 
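+ *
+ * The DEFINE_OCFS2_*_EVENT() lines below only instantiate event classes
+ * declared earlier in this header, so each generated tracepoint takes the
+ * argument list of its class.  As a rough illustration (the real call
+ * sites live in journal.c, not in this header), a caller looks like:
+ *
+ *	trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);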
*/ + +DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); + +DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen); + +DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery); + +DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end); + +TRACE_EVENT(ocfs2_complete_recovery_slot, + TP_PROTO(int slot, unsigned long long la_ino, + unsigned long long tl_ino, void *qrec), + TP_ARGS(slot, la_ino, tl_ino, qrec), + TP_STRUCT__entry( + __field(int, slot) + __field(unsigned long long, la_ino) + __field(unsigned long long, tl_ino) + __field(void *, qrec) + ), + TP_fast_assign( + __entry->slot = slot; + __entry->la_ino = la_ino; + __entry->tl_ino = tl_ino; + __entry->qrec = qrec; + ), + TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino, + __entry->tl_ino, __entry->qrec) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node); + +DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end); + +TRACE_EVENT(ocfs2_recovery_thread, + TP_PROTO(int node_num, int osb_node_num, int disable, + void *recovery_thread, int map_set), + TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set), + TP_STRUCT__entry( + __field(int, node_num) + __field(int, osb_node_num) + __field(int,disable) + __field(void *, recovery_thread) + __field(int,map_set) + ), + TP_fast_assign( + __entry->node_num = node_num; + __entry->osb_node_num = osb_node_num; + __entry->disable = disable; + __entry->recovery_thread = recovery_thread; + __entry->map_set = map_set; + ), + TP_printk("%d %d %d %p %d", __entry->node_num, + __entry->osb_node_num, __entry->disable, + __entry->recovery_thread, __entry->map_set) +); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered); + +DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err); + +DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir); + +DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput); + +DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount); + +/* End of trace events for fs/ocfs2/journal.c. */ + +/* Trace events for fs/ocfs2/buffer_head_io.c. 
*/ + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end); + +TRACE_EVENT(ocfs2_write_block, + TP_PROTO(unsigned long long block, void *ci), + TP_ARGS(block, ci), + TP_STRUCT__entry( + __field(unsigned long long, block) + __field(void *, ci) + ), + TP_fast_assign( + __entry->block = block; + __entry->ci = ci; + ), + TP_printk("%llu %p", __entry->block, __entry->ci) +); + +TRACE_EVENT(ocfs2_read_blocks_begin, + TP_PROTO(void *ci, unsigned long long block, + unsigned int nr, int flags), + TP_ARGS(ci, block, nr, flags), + TP_STRUCT__entry( + __field(void *, ci) + __field(unsigned long long, block) + __field(unsigned int, nr) + __field(int, flags) + ), + TP_fast_assign( + __entry->ci = ci; + __entry->block = block; + __entry->nr = nr; + __entry->flags = flags; + ), + TP_printk("%p %llu %u %d", __entry->ci, __entry->block, + __entry->nr, __entry->flags) +); + +/* End of trace events for fs/ocfs2/buffer_head_io.c. */ + +/* Trace events for fs/ocfs2/uptodate.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin); + +TRACE_EVENT(ocfs2_buffer_cached_end, + TP_PROTO(int index, void *item), + TP_ARGS(index, item), + TP_STRUCT__entry( + __field(int, index) + __field(void *, item) + ), + TP_fast_assign( + __entry->index = index; + __entry->item = item; + ), + TP_printk("%d %p", __entry->index, __entry->item) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache); + +/* End of trace events for fs/ocfs2/uptodate.c. */ +#endif /* _TRACE_OCFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE ocfs2_trace +#include diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h new file mode 100644 index 00000000..d5ab56cb --- /dev/null +++ b/fs/ocfs2/quota.h @@ -0,0 +1,117 @@ +/* + * quota.h for OCFS2 + * + * On disk quota structures for local and global quota file, in-memory + * structures. 
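+ *
+ * The in-memory ocfs2_dquot defined below embeds the generic VFS dquot;
+ * given a struct dquot handed in by the quota core, OCFS2 code reaches its
+ * private fields through the OCFS2_DQUOT() container_of() helper, e.g.
+ * (illustrative snippet, mirroring quota_global.c):
+ *
+ *	s64 delta = dquot->dq_dqb.dqb_curspace -
+ *		    OCFS2_DQUOT(dquot)->dq_origspace;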
+ * + */ + +#ifndef _OCFS2_QUOTA_H +#define _OCFS2_QUOTA_H + +#include +#include +#include +#include +#include + +#include "ocfs2.h" + +/* + * In-memory structures + */ +struct ocfs2_dquot { + struct dquot dq_dquot; /* Generic VFS dquot */ + loff_t dq_local_off; /* Offset in the local quota file */ + u64 dq_local_phys_blk; /* Physical block carrying quota structure */ + struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ + unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ + s64 dq_origspace; /* Last globally synced space usage */ + s64 dq_originodes; /* Last globally synced inode usage */ +}; + +/* Description of one chunk to recover in memory */ +struct ocfs2_recovery_chunk { + struct list_head rc_list; /* List of chunks */ + int rc_chunk; /* Chunk number */ + unsigned long *rc_bitmap; /* Bitmap of entries to recover */ +}; + +struct ocfs2_quota_recovery { + struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ +}; + +/* In-memory structure with quota header information */ +struct ocfs2_mem_dqinfo { + unsigned int dqi_type; /* Quota type this structure describes */ + unsigned int dqi_chunks; /* Number of chunks in local quota file */ + unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ + unsigned int dqi_syncms; /* How often should we sync with other nodes */ + struct list_head dqi_chunk; /* List of chunks */ + struct inode *dqi_gqinode; /* Global quota file inode */ + struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ + struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ + int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ + u64 dqi_giblk; /* Number of block with global information header */ + struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ + struct buffer_head *dqi_libh; /* Buffer with local information header */ + struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ + struct delayed_work dqi_sync_work; /* Work for syncing dquots */ + struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery + * information, in case we + * enable quotas on file + * needing it */ +}; + +static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot) +{ + return container_of(dquot, struct ocfs2_dquot, dq_dquot); +} + +struct ocfs2_quota_chunk { + struct list_head qc_chunk; /* List of quotafile chunks */ + int qc_num; /* Number of quota chunk */ + struct buffer_head *qc_headerbh; /* Buffer head with chunk header */ +}; + +extern struct kmem_cache *ocfs2_dquot_cachep; +extern struct kmem_cache *ocfs2_qf_chunk_cachep; + +extern struct qtree_fmt_operations ocfs2_global_ops; + +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, int slot_num); +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num); +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec); +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); +int ocfs2_global_read_info(struct super_block *sb, int type); +int ocfs2_global_write_info(struct super_block *sb, int type); +int ocfs2_global_read_dquot(struct dquot *dquot); +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); +static inline int ocfs2_sync_dquot(struct dquot *dquot) +{ + return 
__ocfs2_sync_dquot(dquot, 0); +} +static inline int ocfs2_global_release_dquot(struct dquot *dquot) +{ + return __ocfs2_sync_dquot(dquot, 1); +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh); +int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, + struct buffer_head **bh); +int ocfs2_create_local_dquot(struct dquot *dquot); +int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); +int ocfs2_local_write_dquot(struct dquot *dquot); + +extern const struct dquot_operations ocfs2_quota_operations; +extern struct quota_format_type ocfs2_quota_format; + +#endif /* _OCFS2_QUOTA_H */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c new file mode 100644 index 00000000..92fcd575 --- /dev/null +++ b/fs/ocfs2/quota_global.c @@ -0,0 +1,922 @@ +/* + * Implementation of operations over global quota file + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "alloc.h" +#include "blockcheck.h" +#include "inode.h" +#include "journal.h" +#include "file.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "quota.h" +#include "ocfs2_trace.h" + +/* + * Locking of quotas with OCFS2 is rather complex. Here are rules that + * should be obeyed by all the functions: + * - any write of quota structure (either to local or global file) is protected + * by dqio_mutex or dquot->dq_lock. + * - any modification of global quota file holds inode cluster lock, i_mutex, + * and ip_alloc_sem of the global quota file (achieved by + * ocfs2_lock_global_qf). It also has to hold qinfo_lock. + * - an allocation of new blocks for local quota file is protected by + * its ip_alloc_sem + * + * A rough sketch of locking dependencies (lf = local file, gf = global file): + * Normal filesystem operation: + * start_trans -> dqio_mutex -> write to lf + * Syncing of local and global file: + * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * write to gf + * -> write to lf + * Acquire dquot for the first time: + * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf + * -> alloc space for gf + * -> start_trans -> qinfo_lock -> write to gf + * -> ip_alloc_sem of lf -> alloc space for lf + * -> write to lf + * Release last reference to dquot: + * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf + * -> write to lf + * Note that all the above operations also hold the inode cluster lock of lf. 
+ * Recovery: + * inode cluster lock of recovered lf + * -> read bitmaps -> ip_alloc_sem of lf + * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * write to gf + */ + +static void qsync_work_fn(struct work_struct *work); + +static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + /* Update from disk only entries not set by the admin */ + if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) { + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) { + m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit); + m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags)) + m->dqb_btime = le64_to_cpu(d->dqb_btime); + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags)) + m->dqb_itime = le64_to_cpu(d->dqb_itime); + OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count); +} + +static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + d->dqb_id = cpu_to_le32(dquot->dq_id); + d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count); + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); + d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_pad1 = d->dqb_pad2 = 0; +} + +static int ocfs2_global_is_id(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(&oinfo->dqi_gi, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +struct qtree_fmt_operations ocfs2_global_ops = { + .mem2disk_dqblk = ocfs2_global_mem2diskdqb, + .disk2mem_dqblk = ocfs2_global_disk2memdqb, + .is_id = ocfs2_global_is_id, +}; + +int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); + + trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); +} + +int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, + struct buffer_head **bhp) +{ + int rc; + + *bhp = NULL; + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0, + ocfs2_validate_quota_block); + if (rc) + mlog_errno(rc); + return rc; +} + +/* Read data from global quotafile - avoid pagecache and such because we cannot + * afford acquiring the locks... We use quota cluster lock to serialize + * operations. Caller is responsible for acquiring it. 
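+ *
+ * In outline (matching the loop below): clamp the request to i_size, map
+ * the logical block off >> s_blocksize_bits to a physical extent with
+ * ocfs2_extent_map_get_blocks(), then copy up to
+ * min(s_blocksize - offset, toread) bytes per block read via
+ * ocfs2_read_quota_phys_block(), reusing the extent mapping while pcount
+ * contiguous blocks remain.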
*/ +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + loff_t i_size = i_size_read(gqinode); + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + struct buffer_head *bh; + size_t toread, tocopy; + u64 pblock = 0, pcount = 0; + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); + if (!pcount) { + err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, + &pcount, NULL); + if (err) { + mlog_errno(err); + return err; + } + } else { + pcount--; + pblock++; + } + bh = NULL; + err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh); + if (err) { + mlog_errno(err); + return err; + } + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0, new = 0, ja_type; + struct buffer_head *bh = NULL; + handle_t *handle = journal_current_handle(); + u64 pblock, pcount; + + if (!handle) { + mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " + "because transaction was not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) { + WARN_ON(1); + len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; + } + + if (gqinode->i_size < off + len) { + loff_t rounded_end = + ocfs2_align_bytes_to_blocks(sb, off + len); + + /* Space is already allocated in ocfs2_acquire_dquot() */ + err = ocfs2_simple_size_update(gqinode, + oinfo->dqi_gqi_bh, + rounded_end); + if (err < 0) + goto out; + new = 1; + } + err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL); + if (err) { + mlog_errno(err); + goto out; + } + /* Not rewriting whole block? 
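+ * If only part of an already allocated block is rewritten, read it first
+ * so the untouched bytes (including the OCFS2_QBLK_RESERVED_SPACE trailer)
+ * survive, and journal it with OCFS2_JOURNAL_ACCESS_WRITE; a freshly
+ * allocated or fully rewritten block is obtained with sb_getblk() and
+ * journalled with OCFS2_JOURNAL_ACCESS_CREATE instead.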
*/ + if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && + !new) { + err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh); + ja_type = OCFS2_JOURNAL_ACCESS_WRITE; + } else { + bh = sb_getblk(sb, pblock); + if (!bh) + err = -ENOMEM; + ja_type = OCFS2_JOURNAL_ACCESS_CREATE; + } + if (err) { + mlog_errno(err); + goto out; + } + lock_buffer(bh); + if (new) + memset(bh->b_data, 0, sb->s_blocksize); + memcpy(bh->b_data + offset, data, len); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh); + err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh, + ja_type); + if (err < 0) { + brelse(bh); + goto out; + } + ocfs2_journal_dirty(handle, bh); + brelse(bh); +out: + if (err) { + mlog_errno(err); + return err; + } + gqinode->i_version++; + ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); + return len; +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + int status; + struct buffer_head *bh = NULL; + + status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex); + if (status < 0) + return status; + spin_lock(&dq_data_lock); + if (!oinfo->dqi_gqi_count++) + oinfo->dqi_gqi_bh = bh; + else + WARN_ON(bh != oinfo->dqi_gqi_bh); + spin_unlock(&dq_data_lock); + if (ex) { + mutex_lock(&oinfo->dqi_gqinode->i_mutex); + down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } else { + down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } + return 0; +} + +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + if (ex) { + up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + } else { + up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); + } + ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); + brelse(oinfo->dqi_gqi_bh); + spin_lock(&dq_data_lock); + if (!--oinfo->dqi_gqi_count) + oinfo->dqi_gqi_bh = NULL; + spin_unlock(&dq_data_lock); +} + +/* Read information header from global quota file */ +int ocfs2_global_read_info(struct super_block *sb, int type) +{ + struct inode *gqinode = NULL; + unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct ocfs2_global_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + u64 pcount; + int status; + + /* Read global header */ + gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!gqinode) { + mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", + type); + status = -EINVAL; + goto out_err; + } + oinfo->dqi_gi.dqi_sb = sb; + oinfo->dqi_gi.dqi_type = type; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); + oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); + oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; + oinfo->dqi_gqi_bh = NULL; + oinfo->dqi_gqi_count = 0; + oinfo->dqi_gqinode = gqinode; + status = ocfs2_lock_global_qf(oinfo, 0); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk, + &pcount, NULL); + if (status < 0) + goto out_unlock; + + status = ocfs2_qinfo_lock(oinfo, 0); + if (status < 0) + goto out_unlock; + status = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + ocfs2_qinfo_unlock(oinfo, 0); + ocfs2_unlock_global_qf(oinfo, 0); + if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot read global quota info 
(%d).\n", + status); + if (status >= 0) + status = -EIO; + mlog_errno(status); + goto out_err; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits; + oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize - + OCFS2_QBLK_RESERVED_SPACE; + oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); + INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); + +out_err: + if (status) + mlog_errno(status); + return status; +out_unlock: + ocfs2_unlock_global_qf(oinfo, 0); + mlog_errno(status); + goto out_err; +} + +/* Write information to global quota file. Expects exlusive lock on quota + * file inode and quota info */ +static int __ocfs2_global_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_global_disk_dqinfo dinfo; + ssize_t size; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; + dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); + dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); + spin_unlock(&dq_data_lock); + dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms); + dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry); + size = sb->s_op->quota_write(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + if (size != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot write global quota info structure\n"); + if (size >= 0) + size = -EIO; + return size; + } + return 0; +} + +int ocfs2_global_write_info(struct super_block *sb, int type) +{ + int err; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + + err = ocfs2_qinfo_lock(info, 1); + if (err < 0) + return err; + err = __ocfs2_global_write_info(sb, type); + ocfs2_qinfo_unlock(info, 1); + return err; +} + +static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + /* + * We may need to allocate tree blocks and a leaf block but not the + * root block + */ + return oinfo->dqi_gi.dqi_qtree_depth; +} + +static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) +{ + /* We modify all the allocated blocks, tree root, info block and + * the inode */ + return (ocfs2_global_qinit_alloc(sb, type) + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1; +} + +/* Sync local information about quota modifications with global quota file. 
+ * Caller must have started the transaction and obtained exclusive lock for + * global quota file inode */ +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) +{ + int err, err2; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_global_disk_dqblk dqblk; + s64 spacechange, inodechange; + time_t olditime, oldbtime; + + err = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct ocfs2_global_disk_dqblk), + dquot->dq_off); + if (err != sizeof(struct ocfs2_global_disk_dqblk)) { + if (err >= 0) { + mlog(ML_ERROR, "Short read from global quota file " + "(%u read)\n", err); + err = -EIO; + } + goto out; + } + + /* Update space and inode usage. Get also other information from + * global quota file so that we don't overwrite any changes there. + * We are */ + spin_lock(&dq_data_lock); + spacechange = dquot->dq_dqb.dqb_curspace - + OCFS2_DQUOT(dquot)->dq_origspace; + inodechange = dquot->dq_dqb.dqb_curinodes - + OCFS2_DQUOT(dquot)->dq_originodes; + olditime = dquot->dq_dqb.dqb_itime; + oldbtime = dquot->dq_dqb.dqb_btime; + ocfs2_global_disk2memdqb(dquot, &dqblk); + trace_ocfs2_sync_dquot(dquot->dq_id, dquot->dq_dqb.dqb_curspace, + (long long)spacechange, + dquot->dq_dqb.dqb_curinodes, + (long long)inodechange); + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curspace += spacechange; + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curinodes += inodechange; + /* Set properly space grace time... */ + if (dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) && + oldbtime > 0) { + if (dquot->dq_dqb.dqb_btime > 0) + dquot->dq_dqb.dqb_btime = + min(dquot->dq_dqb.dqb_btime, oldbtime); + else + dquot->dq_dqb.dqb_btime = oldbtime; + } + } else { + dquot->dq_dqb.dqb_btime = 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } + /* Set properly inode grace time... 
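+ * As with the block grace time above: unless user space explicitly set a
+ * new itime (QIF_ITIME bit), keep the earlier of the deadline already
+ * recorded on disk and the local one, which is what the min() below does.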
*/ + if (dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) && + olditime > 0) { + if (dquot->dq_dqb.dqb_itime > 0) + dquot->dq_dqb.dqb_itime = + min(dquot->dq_dqb.dqb_itime, olditime); + else + dquot->dq_dqb.dqb_itime = olditime; + } + } else { + dquot->dq_dqb.dqb_itime = 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } + /* All information is properly updated, clear the flags */ + __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + spin_unlock(&dq_data_lock); + err = ocfs2_qinfo_lock(info, freeing); + if (err < 0) { + mlog(ML_ERROR, "Failed to lock quota info, losing quota write" + " (type=%d, id=%u)\n", dquot->dq_type, + (unsigned)dquot->dq_id); + goto out; + } + if (freeing) + OCFS2_DQUOT(dquot)->dq_use_count--; + err = qtree_write_dquot(&info->dqi_gi, dquot); + if (err < 0) + goto out_qlock; + if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) { + err = qtree_release_dquot(&info->dqi_gi, dquot); + if (info_dirty(sb_dqinfo(sb, type))) { + err2 = __ocfs2_global_write_info(sb, type); + if (!err) + err = err2; + } + } +out_qlock: + ocfs2_qinfo_unlock(info, freeing); +out: + if (err < 0) + mlog_errno(err); + return err; +} + +/* + * Functions for periodic syncing of dquots with global file + */ +static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) +{ + handle_t *handle; + struct super_block *sb = dquot->dq_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + int status = 0; + + trace_ocfs2_sync_dquot_helper(dquot->dq_id, dquot->dq_type, + type, sb->s_id); + if (type != dquot->dq_type) + goto out; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + status = ocfs2_sync_dquot(dquot); + if (status < 0) + mlog_errno(status); + /* We have to write local structure as well... 
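+ * The local file records deltas (dqb_spacemod/dqb_inodemod) accumulated
+ * since the last sync with the global file, so after folding them into
+ * the global structure above, the local entry is rewritten as well so its
+ * recorded delta matches the new baseline.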
*/ + status = ocfs2_local_write_dquot(dquot); + if (status < 0) + mlog_errno(status); + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + return status; +} + +static void qsync_work_fn(struct work_struct *work) +{ + struct ocfs2_mem_dqinfo *oinfo = container_of(work, + struct ocfs2_mem_dqinfo, + dqi_sync_work.work); + struct super_block *sb = oinfo->dqi_gqinode->i_sb; + + dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); +} + +/* + * Wrappers for generic quota functions + */ + +static int ocfs2_write_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + trace_ocfs2_write_dquot(dquot->dq_id, dquot->dq_type); + + handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + status = ocfs2_local_write_dquot(dquot); + mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + ocfs2_commit_trans(osb, handle); +out: + return status; +} + +static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + /* + * We modify tree, leaf block, global info, local chunk header, + * global and local inode; OCFS2_QINFO_WRITE_CREDITS already + * accounts for inode update + */ + return (oinfo->dqi_gi.dqi_qtree_depth + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + OCFS2_QINFO_WRITE_CREDITS + + OCFS2_INODE_UPDATE_CREDITS; +} + +static int ocfs2_release_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + trace_ocfs2_release_dquot(dquot->dq_id, dquot->dq_type); + + mutex_lock(&dquot->dq_lock); + /* Check whether we are not racing with some other dqget() */ + if (atomic_read(&dquot->dq_count) > 1) + goto out; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, + ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + status = ocfs2_local_release_dquot(handle, dquot); + /* + * If we fail here, we cannot do much as global structure is + * already released. So just complain... + */ + if (status < 0) + mlog_errno(status); + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out_trans: + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mutex_unlock(&dquot->dq_lock); + if (status) + mlog_errno(status); + return status; +} + +/* + * Read global dquot structure from disk or create it if it does + * not exist. Also update use count of the global structure and + * create structure in node-local quota file. 
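+ *
+ * Roughly, the steps below are: take the global quota file lock
+ * exclusively, read the entry with qtree_read_dquot() on first use, bump
+ * dq_use_count, extend the global file with ocfs2_extend_no_holes() when
+ * the dquot has no on-disk entry yet (dq_off == 0), write the entry back
+ * under a transaction with qtree_write_dquot(), and finally create the
+ * matching entry in this node's local quota file.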
+ */ +static int ocfs2_acquire_dquot(struct dquot *dquot) +{ + int status = 0, err; + int ex = 0; + struct super_block *sb = dquot->dq_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct inode *gqinode = info->dqi_gqinode; + int need_alloc = ocfs2_global_qinit_alloc(sb, type); + handle_t *handle; + + trace_ocfs2_acquire_dquot(dquot->dq_id, type); + mutex_lock(&dquot->dq_lock); + /* + * We need an exclusive lock, because we're going to update use count + * and instantiate possibly new dquot structure + */ + status = ocfs2_lock_global_qf(info, 1); + if (status < 0) + goto out; + if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_dq; + status = qtree_read_dquot(&info->dqi_gi, dquot); + ocfs2_qinfo_unlock(info, 0); + if (status < 0) + goto out_dq; + } + set_bit(DQ_READ_B, &dquot->dq_flags); + + OCFS2_DQUOT(dquot)->dq_use_count++; + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + if (!dquot->dq_off) { /* No real quota entry? */ + ex = 1; + /* + * Add blocks to quota file before we start a transaction since + * locking allocators ranks above a transaction start + */ + WARN_ON(journal_current_handle()); + status = ocfs2_extend_no_holes(gqinode, NULL, + gqinode->i_size + (need_alloc << sb->s_blocksize_bits), + gqinode->i_size); + if (status < 0) + goto out_dq; + } + + handle = ocfs2_start_trans(osb, + ocfs2_calc_global_qinit_credits(sb, type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto out_dq; + } + status = ocfs2_qinfo_lock(info, ex); + if (status < 0) + goto out_trans; + status = qtree_write_dquot(&info->dqi_gi, dquot); + if (ex && info_dirty(sb_dqinfo(sb, type))) { + err = __ocfs2_global_write_info(sb, type); + if (!status) + status = err; + } + ocfs2_qinfo_unlock(info, ex); +out_trans: + ocfs2_commit_trans(osb, handle); +out_dq: + ocfs2_unlock_global_qf(info, 1); + if (status < 0) + goto out; + + status = ocfs2_create_local_dquot(dquot); + if (status < 0) + goto out; + set_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out: + mutex_unlock(&dquot->dq_lock); + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_mark_dquot_dirty(struct dquot *dquot) +{ + unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_INODES_B)) | + (1 << (DQ_LASTSET_B + QIF_SPACE_B)) | + (1 << (DQ_LASTSET_B + QIF_BTIME_B)) | + (1 << (DQ_LASTSET_B + QIF_ITIME_B)); + int sync = 0; + int status; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(sb); + + trace_ocfs2_mark_dquot_dirty(dquot->dq_id, type); + + /* In case user set some limits, sync dquot immediately to global + * quota file so that information propagates quicker */ + spin_lock(&dq_data_lock); + if (dquot->dq_flags & mask) + sync = 1; + spin_unlock(&dq_data_lock); + /* This is a slight hack but we can't afford getting global quota + * lock if we already have a transaction started. 
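+ * Per the lock ordering documented at the top of this file,
+ * ocfs2_lock_global_qf() ranks above a transaction start, so with a
+ * handle already running we only update the local file here and let the
+ * periodic qsync_work_fn() sync propagate the change to the global file.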
*/ + if (!sync || journal_current_handle()) { + status = ocfs2_write_dquot(dquot); + goto out; + } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + status = ocfs2_sync_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_dlock; + } + /* Now write updated local dquot structure */ + status = ocfs2_local_write_dquot(dquot); +out_dlock: + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + if (status) + mlog_errno(status); + return status; +} + +/* This should happen only after set_dqinfo(). */ +static int ocfs2_write_info(struct super_block *sb, int type) +{ + handle_t *handle; + int status = 0; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = dquot_commit_info(sb, type); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + if (status) + mlog_errno(status); + return status; +} + +static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type) +{ + struct ocfs2_dquot *dquot = + kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS); + + if (!dquot) + return NULL; + return &dquot->dq_dquot; +} + +static void ocfs2_destroy_dquot(struct dquot *dquot) +{ + kmem_cache_free(ocfs2_dquot_cachep, dquot); +} + +const struct dquot_operations ocfs2_quota_operations = { + /* We never make dquot dirty so .write_dquot is never called */ + .acquire_dquot = ocfs2_acquire_dquot, + .release_dquot = ocfs2_release_dquot, + .mark_dirty = ocfs2_mark_dquot_dirty, + .write_info = ocfs2_write_info, + .alloc_dquot = ocfs2_alloc_dquot, + .destroy_dquot = ocfs2_destroy_dquot, +}; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c new file mode 100644 index 00000000..dc8007fc --- /dev/null +++ b/fs/ocfs2/quota_local.c @@ -0,0 +1,1316 @@ +/* + * Implementation of operations over local quota file + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "file.h" +#include "buffer_head_io.h" +#include "journal.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "quota.h" +#include "uptodate.h" +#include "super.h" +#include "ocfs2_trace.h" + +/* Number of local quota structures per block */ +static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) +{ + return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / + sizeof(struct ocfs2_local_disk_dqblk)); +} + +/* Number of blocks with entries in one chunk */ +static inline unsigned int ol_chunk_blocks(struct super_block *sb) +{ + return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE) << 3) / + ol_quota_entries_per_block(sb); +} + +/* Number of entries in a chunk bitmap */ +static unsigned int ol_chunk_entries(struct super_block *sb) +{ + return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb); +} + +/* Offset of the chunk in quota file */ +static unsigned int ol_quota_chunk_block(struct super_block *sb, int c) +{ + /* 1 block for local quota file info, 1 block per chunk for chunk 
info */ + return 1 + (ol_chunk_blocks(sb) + 1) * c; +} + +static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ol_quota_chunk_block(sb, c) + 1 + off / epb; +} + +static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Offset of the dquot structure in the quota file */ +static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) +{ + return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) + + ol_dqblk_block_off(sb, c, off); +} + +/* Compute block number from given offset */ +static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off) +{ + return off >> sb->s_blocksize_bits; +} + +static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) +{ + return off & ((1 << sb->s_blocksize_bits) - 1); +} + +/* Compute offset in the chunk of a structure with the given offset */ +static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ((off >> sb->s_blocksize_bits) - + ol_quota_chunk_block(sb, c) - 1) * epb + + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) / + sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Write bufferhead into the fs */ +static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, + void (*modify)(struct buffer_head *, void *), void *private) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + int status; + + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + return status; + } + status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + ocfs2_commit_trans(OCFS2_SB(sb), handle); + return status; + } + lock_buffer(bh); + modify(bh, private); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); + + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + return status; + } + return 0; +} + +/* + * Read quota block from a given logical offset. + * + * This function acquires ip_alloc_sem and thus it must not be called with a + * transaction started. + */ +static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, + struct buffer_head **bh) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { + ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested " + "to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + return -EIO; + } + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, + ocfs2_validate_quota_block); + if (rc) + mlog_errno(rc); + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. 
*/ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +/* Check whether we understand format of quota files */ +static int ocfs2_local_check_quota_file(struct super_block *sb, int type) +{ + unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; + unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; + unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; + unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; + unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct buffer_head *bh = NULL; + struct inode *linode = sb_dqopt(sb)->files[type]; + struct inode *ginode = NULL; + struct ocfs2_disk_dqheader *dqhead; + int status, ret = 0; + + /* First check whether we understand local quota file */ + status = ocfs2_read_quota_block(linode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file header (type=%d)\n", + type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) { + mlog(ML_ERROR, "quota file magic does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_magic), + lmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) { + mlog(ML_ERROR, "quota file version does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_version), + lversions[type], type); + goto out_err; + } + brelse(bh); + bh = NULL; + + /* Next check whether we understand global quota file */ + ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!ginode) { + mlog(ML_ERROR, "cannot get global quota file inode " + "(type=%d)\n", type); + goto out_err; + } + /* Since the header is read only, we don't care about locking */ + status = ocfs2_read_quota_block(ginode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read global quota file header " + "(type=%d)\n", type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) { + mlog(ML_ERROR, "global quota file magic does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_magic), gmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) { + mlog(ML_ERROR, "global quota file version does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_version), gversions[type], + type); + goto out_err; + } + + ret = 1; +out_err: + brelse(bh); + iput(ginode); + return ret; +} + +/* Release given list of quota file chunks */ +static void ocfs2_release_local_quota_bitmaps(struct list_head *head) +{ + struct ocfs2_quota_chunk *pos, *next; + + list_for_each_entry_safe(pos, next, head, qc_chunk) { + list_del(&pos->qc_chunk); + brelse(pos->qc_headerbh); + kmem_cache_free(ocfs2_qf_chunk_cachep, pos); + } +} + +/* Load quota bitmaps into memory */ +static int ocfs2_load_local_quota_bitmaps(struct inode *inode, + struct ocfs2_local_disk_dqinfo *ldinfo, + struct list_head *head) +{ + struct ocfs2_quota_chunk *newchunk; + int i, status; + + INIT_LIST_HEAD(head); + for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) { + newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!newchunk) { + ocfs2_release_local_quota_bitmaps(head); + return -ENOMEM; + } + newchunk->qc_num = i; + newchunk->qc_headerbh = NULL; + status = ocfs2_read_quota_block(inode, + ol_quota_chunk_block(inode->i_sb, i), + &newchunk->qc_headerbh); + if (status) { + mlog_errno(status); + 
kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk); + ocfs2_release_local_quota_bitmaps(head); + return status; + } + list_add_tail(&newchunk->qc_chunk, head); + } + return 0; +} + +static void olq_update_info(struct buffer_head *bh, void *private) +{ + struct mem_dqinfo *info = private; + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_local_disk_dqinfo *ldinfo; + + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + spin_lock(&dq_data_lock); + ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); + ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); + spin_unlock(&dq_data_lock); +} + +static int ocfs2_add_recovery_chunk(struct super_block *sb, + struct ocfs2_local_disk_chunk *dchunk, + int chunk, + struct list_head *head) +{ + struct ocfs2_recovery_chunk *rc; + + rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS); + if (!rc) + return -ENOMEM; + rc->rc_chunk = chunk; + rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); + if (!rc->rc_bitmap) { + kfree(rc); + return -ENOMEM; + } + memcpy(rc->rc_bitmap, dchunk->dqc_bitmap, + (ol_chunk_entries(sb) + 7) >> 3); + list_add_tail(&rc->rc_list, head); + return 0; +} + +static void free_recovery_list(struct list_head *head) +{ + struct ocfs2_recovery_chunk *next; + struct ocfs2_recovery_chunk *rchunk; + + list_for_each_entry_safe(rchunk, next, head, rc_list) { + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + } +} + +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + free_recovery_list(&(rec->r_list[type])); + kfree(rec); +} + +/* Load entries in our quota file we have to recover*/ +static int ocfs2_recovery_load_quota(struct inode *lqinode, + struct ocfs2_local_disk_dqinfo *ldinfo, + int type, + struct list_head *head) +{ + struct super_block *sb = lqinode->i_sb; + struct buffer_head *hbh; + struct ocfs2_local_disk_chunk *dchunk; + int i, chunks = le32_to_cpu(ldinfo->dqi_chunks); + int status = 0; + + for (i = 0; i < chunks; i++) { + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, i), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb)) + status = ocfs2_add_recovery_chunk(sb, dchunk, i, head); + brelse(hbh); + if (status < 0) + break; + } + if (status < 0) + free_recovery_list(head); + return status; +} + +static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) +{ + int type; + struct ocfs2_quota_recovery *rec; + + rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); + if (!rec) + return NULL; + for (type = 0; type < MAXQUOTAS; type++) + INIT_LIST_HEAD(&(rec->r_list[type])); + return rec; +} + +/* Load information we need for quota recovery into memory */ +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, + int slot_num) +{ + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct inode *lqinode; + struct buffer_head *bh; + int type; + int status = 0; + struct ocfs2_quota_recovery *rec; + + mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); + rec = 
ocfs2_alloc_quota_recovery(); + if (!rec) + return ERR_PTR(-ENOMEM); + /* First init... */ + + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + /* At this point, journal of the slot is already replayed so + * we can trust metadata and data of the quota file */ + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + if (status < 0) { + ocfs2_free_quota_recovery(rec); + rec = ERR_PTR(status); + } + return rec; +} + +/* Sync changes in local quota file into global quota file and + * reinitialize local quota file. + * The function expects local quota file to be already locked and + * dqonoff_mutex locked. */ +static int ocfs2_recover_local_quota_file(struct inode *lqinode, + int type, + struct ocfs2_quota_recovery *rec) +{ + struct super_block *sb = lqinode->i_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_local_disk_chunk *dchunk; + struct ocfs2_local_disk_dqblk *dqblk; + struct dquot *dquot; + handle_t *handle; + struct buffer_head *hbh = NULL, *qbh = NULL; + int status = 0; + int bit, chunk; + struct ocfs2_recovery_chunk *rchunk, *next; + qsize_t spacechange, inodechange; + + trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type); + + list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { + chunk = rchunk->rc_chunk; + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, chunk), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { + qbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_dqblk_block(sb, chunk, bit), + &qbh); + if (status) { + mlog_errno(status); + break; + } + dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data + + ol_dqblk_block_off(sb, chunk, bit)); + dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type); + if (!dquot) { + status = -EIO; + mlog(ML_ERROR, "Failed to get quota structure " + "for id %u, type %d. Cannot finish quota " + "file recovery.\n", + (unsigned)le64_to_cpu(dqblk->dqb_id), + type); + goto out_put_bh; + } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) { + mlog_errno(status); + goto out_put_dquot; + } + + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_drop_lock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + spin_lock(&dq_data_lock); + /* Add usage from quota entry into quota changes + * of our node. 
Auxiliary variables are important + * due to signedness */ + spacechange = le64_to_cpu(dqblk->dqb_spacemod); + inodechange = le64_to_cpu(dqblk->dqb_inodemod); + dquot->dq_dqb.dqb_curspace += spacechange; + dquot->dq_dqb.dqb_curinodes += inodechange; + spin_unlock(&dq_data_lock); + /* We want to drop reference held by the crashed + * node. Since we have our own reference we know + * global structure actually won't be freed. */ + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + /* Release local quota file entry */ + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(lqinode), + qbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + lock_buffer(qbh); + WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); + ocfs2_clear_bit(bit, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(qbh); + ocfs2_journal_dirty(handle, qbh); +out_commit: + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_drop_lock: + ocfs2_unlock_global_qf(oinfo, 1); +out_put_dquot: + dqput(dquot); +out_put_bh: + brelse(qbh); + if (status < 0) + break; + } + brelse(hbh); + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + if (status < 0) + break; + } + if (status < 0) + free_recovery_list(&(rec->r_list[type])); + if (status) + mlog_errno(status); + return status; +} + +/* Recover local quota files for given node different from us */ +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num) +{ + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct buffer_head *bh; + handle_t *handle; + int type; + int status = 0; + struct inode *lqinode; + unsigned int flags; + + mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (type = 0; type < MAXQUOTAS; type++) { + if (list_empty(&(rec->r_list[type]))) + continue; + trace_ocfs2_finish_quota_recovery(slot_num); + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_NOQUEUE); + /* Someone else is holding the lock? Then he must be + * doing the recovery. Just skip the file... */ + if (status == -EAGAIN) { + mlog(ML_NOTICE, "skipping quota recovery for slot %d " + "because quota file is locked.\n", slot_num); + status = 0; + goto out_put; + } else if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + /* Is recovery still needed? */ + flags = le32_to_cpu(ldinfo->dqi_flags); + if (!(flags & OLQF_CLEAN)) + status = ocfs2_recover_local_quota_file(lqinode, + type, + rec); + /* We don't want to mark file as clean when it is actually + * active */ + if (slot_num == osb->slot_num) + goto out_bh; + /* Mark quota file as clean if we are recovering quota file of + * some other node. 
*/ + handle = ocfs2_start_trans(osb, + OCFS2_LOCAL_QINFO_WRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_bh; + } + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); +out_trans: + ocfs2_commit_trans(osb, handle); +out_bh: + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + kfree(rec); + return status; +} + +/* Read information header from quota file */ +static int ocfs2_local_read_info(struct super_block *sb, int type) +{ + struct ocfs2_local_disk_dqinfo *ldinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + int status; + struct buffer_head *bh = NULL; + struct ocfs2_quota_recovery *rec; + int locked = 0; + + /* We don't need the lock and we have to acquire quota file locks + * which will later depend on this lock */ + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + info->dqi_maxblimit = 0x7fffffffffffffffLL; + info->dqi_maxilimit = 0x7fffffffffffffffLL; + oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); + if (!oinfo) { + mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" + " info."); + goto out_err; + } + info->dqi_priv = oinfo; + oinfo->dqi_type = type; + INIT_LIST_HEAD(&oinfo->dqi_chunk); + oinfo->dqi_rec = NULL; + oinfo->dqi_lqi_bh = NULL; + oinfo->dqi_libh = NULL; + + status = ocfs2_global_read_info(sb, type); + if (status < 0) + goto out_err; + + status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + locked = 1; + + /* Now read local header */ + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(type=%d)\n", type); + goto out_err; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); + oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); + oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); + oinfo->dqi_libh = bh; + + /* We crashed when using local quota file? 
*/ + if (!(info->dqi_flags & OLQF_CLEAN)) { + rec = OCFS2_SB(sb)->quota_rec; + if (!rec) { + rec = ocfs2_alloc_quota_recovery(); + if (!rec) { + status = -ENOMEM; + mlog_errno(status); + goto out_err; + } + OCFS2_SB(sb)->quota_rec = rec; + } + + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + } + + status = ocfs2_load_local_quota_bitmaps(lqinode, + ldinfo, + &oinfo->dqi_chunk); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + /* Now mark quota file as used */ + info->dqi_flags &= ~OLQF_CLEAN; + status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + return 0; +out_err: + if (oinfo) { + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + brelse(oinfo->dqi_lqi_bh); + if (locked) + ocfs2_inode_unlock(lqinode, 1); + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + kfree(oinfo); + } + brelse(bh); + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + return -1; +} + +/* Write local info to quota file */ +static int ocfs2_local_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) + ->dqi_libh; + int status; + + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + return -1; + } + + return 0; +} + +/* Release info from memory */ +static int ocfs2_local_free_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int mark_clean = 1, len; + int status; + + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + (chunk->qc_headerbh->b_data); + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + /* Not all entries free? Bug! */ + if (le32_to_cpu(dchunk->dqc_free) != len) { + mlog(ML_ERROR, "releasing quota file with used " + "entries (type=%d)\n", type); + mark_clean = 0; + } + } + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + + /* dqonoff_mutex protects us against racing with recovery thread... 
*/ + if (oinfo->dqi_rec) { + ocfs2_free_quota_recovery(oinfo->dqi_rec); + mark_clean = 0; + } + + if (!mark_clean) + goto out; + + /* Mark local file as clean */ + info->dqi_flags |= OLQF_CLEAN; + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], + oinfo->dqi_libh, + olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + goto out; + } + +out: + ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); + brelse(oinfo->dqi_libh); + brelse(oinfo->dqi_lqi_bh); + kfree(oinfo); + return 0; +} + +static void olq_set_dquot(struct buffer_head *bh, void *private) +{ + struct ocfs2_dquot *od = private; + struct ocfs2_local_disk_dqblk *dqblk; + struct super_block *sb = od->dq_dquot.dq_sb; + + dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data + + ol_dqblk_block_offset(sb, od->dq_local_off)); + + dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id); + spin_lock(&dq_data_lock); + dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - + od->dq_origspace); + dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - + od->dq_originodes); + spin_unlock(&dq_data_lock); + trace_olq_set_dquot( + (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), + (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), + od->dq_dquot.dq_id); +} + +/* Write dquot to local quota file */ +int ocfs2_local_write_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct buffer_head *bh; + struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type]; + int status; + + status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk, + &bh); + if (status) { + mlog_errno(status); + goto out; + } + status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + brelse(bh); + return status; +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int found = 0, len; + + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + chunk->qc_headerbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) > 0) { + found = 1; + break; + } + } + if (!found) + return NULL; + + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + + found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); + /* We failed? 
*/ + if (found == len) { + mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" + " entries free (type=%d)\n", chunk->qc_num, + le32_to_cpu(dchunk->dqc_free), type); + return ERR_PTR(-EIO); + } + *offset = found; + return chunk; +} + +/* Add new chunk to the local quota file */ +static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk = NULL; + struct ocfs2_local_disk_chunk *dchunk; + int status; + handle_t *handle; + struct buffer_head *bh = NULL, *dbh = NULL; + u64 p_blkno; + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, NULL, + lqinode->i_size + 2 * sb->s_blocksize, + lqinode->i_size); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + lqinode->i_size + 2 * sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + + chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!chunk) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + /* Local quota info and two new blocks we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + /* Initialize chunk header */ + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb)); + memset(dchunk->dqc_bitmap, 0, + sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); + + /* Initialize new block with structures */ + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, + &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + dbh = sb_getblk(sb, p_blkno); + if (!dbh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(dbh); + memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(dbh); + ocfs2_journal_dirty(handle, dbh); + + /* Update local quotafile info */ + oinfo->dqi_blocks += 2; + oinfo->dqi_chunks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + + list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk); + chunk->qc_num = list_entry(chunk->qc_chunk.prev, + struct ocfs2_quota_chunk, + qc_chunk)->qc_num + 1; + chunk->qc_headerbh 
= bh; + *offset = 0; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + brelse(bh); + brelse(dbh); + kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); + return ERR_PTR(status); +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_local_disk_chunk *dchunk; + int epb = ol_quota_entries_per_block(sb); + unsigned int chunk_blocks; + struct buffer_head *bh; + u64 p_blkno; + int status; + handle_t *handle; + + if (list_empty(&oinfo->dqi_chunk)) + return ocfs2_local_quota_add_chunk(sb, type, offset); + /* Is the last chunk full? */ + chunk = list_entry(oinfo->dqi_chunk.prev, + struct ocfs2_quota_chunk, qc_chunk); + chunk_blocks = oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1; + if (ol_chunk_blocks(sb) == chunk_blocks) + return ocfs2_local_quota_add_chunk(sb, type, offset); + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, NULL, + lqinode->i_size + sb->s_blocksize, + lqinode->i_size); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + lqinode->i_size + sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + + /* Get buffer from the just added block */ + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); + + /* Local quota info, chunk header and the new block we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + /* Zero created block */ + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + unlock_buffer(bh); + ocfs2_journal_dirty(handle, bh); + + /* Update chunk header */ + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + chunk->qc_headerbh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data; + lock_buffer(chunk->qc_headerbh); + le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); + unlock_buffer(chunk->qc_headerbh); + ocfs2_journal_dirty(handle, chunk->qc_headerbh); + + /* Update file header */ + oinfo->dqi_blocks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + *offset = chunk_blocks * epb; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + return ERR_PTR(status); +} + +static void olq_alloc_dquot(struct buffer_head *bh, void *private) +{ + int *offset = private; + struct ocfs2_local_disk_chunk *dchunk; + + dchunk = (struct ocfs2_local_disk_chunk 
*)bh->b_data; + ocfs2_set_bit(*offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, -1); +} + +/* Create dquot in the local file for given id */ +int ocfs2_create_local_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + int offset; + int status; + u64 pcount; + + down_write(&OCFS2_I(lqinode)->ip_alloc_sem); + chunk = ocfs2_find_free_entry(sb, type, &offset); + if (!chunk) { + chunk = ocfs2_extend_local_quota_file(sb, type, &offset); + if (IS_ERR(chunk)) { + status = PTR_ERR(chunk); + goto out; + } + } else if (IS_ERR(chunk)) { + status = PTR_ERR(chunk); + goto out; + } + od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); + od->dq_chunk = chunk; + status = ocfs2_extent_map_get_blocks(lqinode, + ol_dqblk_block(sb, chunk->qc_num, offset), + &od->dq_local_phys_blk, + &pcount, + NULL); + + /* Initialize dquot structure on disk */ + status = ocfs2_local_write_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out; + } + + /* Mark structure as allocated */ + status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot, + &offset); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + up_write(&OCFS2_I(lqinode)->ip_alloc_sem); + return status; +} + +/* + * Release dquot structure from local quota file. ocfs2_release_dquot() has + * already started a transaction and written all changes to global quota file + */ +int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) +{ + int status; + int type = dquot->dq_type; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct super_block *sb = dquot->dq_sb; + struct ocfs2_local_disk_chunk *dchunk; + int offset; + + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(sb_dqopt(sb)->files[type]), + od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } + offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num, + od->dq_local_off); + dchunk = (struct ocfs2_local_disk_chunk *) + (od->dq_chunk->qc_headerbh->b_data); + /* Mark structure as freed */ + lock_buffer(od->dq_chunk->qc_headerbh); + ocfs2_clear_bit(offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(od->dq_chunk->qc_headerbh); + ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); + +out: + /* Clear the read bit so that next time someone uses this + * dquot he reads fresh info from disk and allocates local + * dquot structure */ + clear_bit(DQ_READ_B, &dquot->dq_flags); + return status; +} + +static const struct quota_format_ops ocfs2_format_ops = { + .check_quota_file = ocfs2_local_check_quota_file, + .read_file_info = ocfs2_local_read_info, + .write_file_info = ocfs2_global_write_info, + .free_file_info = ocfs2_local_free_info, +}; + +struct quota_format_type ocfs2_quota_format = { + .qf_fmt_id = QFMT_OCFS2, + .qf_ops = &ocfs2_format_ops, + .qf_owner = THIS_MODULE +}; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c new file mode 100644 index 00000000..15d29cce --- /dev/null +++ b/fs/ocfs2/refcounttree.c @@ -0,0 +1,4514 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.c + * + * Copyright (C) 2009 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "suballoc.h" +#include "journal.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "blockcheck.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "aops.h" +#include "xattr.h" +#include "namei.h" +#include "ocfs2_trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ocfs2_cow_context { + struct inode *inode; + struct file *file; + u32 cow_start; + u32 cow_len; + struct ocfs2_extent_tree data_et; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + void *cow_object; + struct ocfs2_post_refcount *post_refcount; + int extra_credits; + int (*get_clusters)(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags); + int (*cow_duplicate_clusters)(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +}; + +static inline struct ocfs2_refcount_tree * +cache_info_to_refcount(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_refcount_tree, rf_ci); +} + +static int ocfs2_validate_refcount_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)bh->b_data; + + trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + + if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { + ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + return -EINVAL; + } + + if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid " + "rf_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, + u64 rb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, rb_blkno, &tmp, + ocfs2_validate_refcount_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. 
 */
+        if (!rc && !*bh)
+                *bh = tmp;
+
+        return rc;
+}
+
+static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        return rf->rf_blkno;
+}
+
+static struct super_block *
+ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        return rf->rf_sb;
+}
+
+static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        spin_lock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        spin_unlock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        mutex_lock(&rf->rf_io_mutex);
+}
+
+static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+        mutex_unlock(&rf->rf_io_mutex);
+}
+
+static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
+        .co_owner = ocfs2_refcount_cache_owner,
+        .co_get_super = ocfs2_refcount_cache_get_super,
+        .co_cache_lock = ocfs2_refcount_cache_lock,
+        .co_cache_unlock = ocfs2_refcount_cache_unlock,
+        .co_io_lock = ocfs2_refcount_cache_io_lock,
+        .co_io_unlock = ocfs2_refcount_cache_io_unlock,
+};
+
+static struct ocfs2_refcount_tree *
+ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
+{
+        struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
+        struct ocfs2_refcount_tree *tree = NULL;
+
+        while (n) {
+                tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
+
+                if (blkno < tree->rf_blkno)
+                        n = n->rb_left;
+                else if (blkno > tree->rf_blkno)
+                        n = n->rb_right;
+                else
+                        return tree;
+        }
+
+        return NULL;
+}
+
+/* osb_lock is already locked. */
+static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
+                                       struct ocfs2_refcount_tree *new)
+{
+        u64 rf_blkno = new->rf_blkno;
+        struct rb_node *parent = NULL;
+        struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
+        struct ocfs2_refcount_tree *tmp;
+
+        while (*p) {
+                parent = *p;
+
+                tmp = rb_entry(parent, struct ocfs2_refcount_tree,
+                               rf_node);
+
+                if (rf_blkno < tmp->rf_blkno)
+                        p = &(*p)->rb_left;
+                else if (rf_blkno > tmp->rf_blkno)
+                        p = &(*p)->rb_right;
+                else {
+                        /* This should never happen!
*/ + mlog(ML_ERROR, "Duplicate refcount block %llu found!\n", + (unsigned long long)rf_blkno); + BUG(); + } + } + + rb_link_node(&new->rf_node, parent, p); + rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree); +} + +static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree) +{ + ocfs2_metadata_cache_exit(&tree->rf_ci); + ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres); + ocfs2_lock_res_free(&tree->rf_lockres); + kfree(tree); +} + +static inline void +ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree); + if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree) + osb->osb_ref_tree_lru = NULL; +} + +static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + spin_lock(&osb->osb_lock); + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + spin_unlock(&osb->osb_lock); +} + +static void ocfs2_kref_remove_refcount_tree(struct kref *kref) +{ + struct ocfs2_refcount_tree *tree = + container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); + + ocfs2_free_refcount_tree(tree); +} + +static inline void +ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree) +{ + kref_get(&tree->rf_getcnt); +} + +static inline void +ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree) +{ + kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree); +} + +static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new, + struct super_block *sb) +{ + ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops); + mutex_init(&new->rf_io_mutex); + new->rf_sb = sb; + spin_lock_init(&new->rf_lock); +} + +static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new, + u64 rf_blkno, u32 generation) +{ + init_rwsem(&new->rf_sem); + ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, + rf_blkno, generation); +} + +static struct ocfs2_refcount_tree* +ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno) +{ + struct ocfs2_refcount_tree *new; + + new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS); + if (!new) + return NULL; + + new->rf_blkno = rf_blkno; + kref_init(&new->rf_getcnt); + ocfs2_init_refcount_tree_ci(new, osb->sb); + + return new; +} + +static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, + struct ocfs2_refcount_tree **ret_tree) +{ + int ret = 0; + struct ocfs2_refcount_tree *tree, *new = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *ref_rb; + + spin_lock(&osb->osb_lock); + if (osb->osb_ref_tree_lru && + osb->osb_ref_tree_lru->rf_blkno == rf_blkno) + tree = osb->osb_ref_tree_lru; + else + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + spin_unlock(&osb->osb_lock); + + new = ocfs2_allocate_refcount_tree(osb, rf_blkno); + if (!new) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + /* + * We need the generation to create the refcount tree lock and since + * it isn't changed during the tree modification, we are safe here to + * read without protection. + * We also have to purge the cache after we create the lock since the + * refcount block may have the stale data. It can only be trusted when + * we hold the refcount lock. 
+ */ + ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_metadata_cache_exit(&new->rf_ci); + kfree(new); + return ret; + } + + ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + new->rf_generation = le32_to_cpu(ref_rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new, rf_blkno, + new->rf_generation); + ocfs2_metadata_cache_purge(&new->rf_ci); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + ocfs2_insert_refcount_tree(osb, new); + + tree = new; + new = NULL; + +out: + *ret_tree = tree; + + osb->osb_ref_tree_lru = tree; + + spin_unlock(&osb->osb_lock); + + if (new) + ocfs2_free_refcount_tree(new); + + brelse(ref_root_bh); + return ret; +} + +static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + di = (struct ocfs2_dinode *)di_bh->b_data; + *ref_blkno = le64_to_cpu(di->i_refcount_loc); + brelse(di_bh); +out: + return ret; +} + +static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + int ret; + + ret = ocfs2_refcount_lock(tree, rw); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rw) + down_write(&tree->rf_sem); + else + down_read(&tree->rf_sem); + +out: + return ret; +} + +/* + * Lock the refcount tree pointed by ref_blkno and return the tree. + * In most case, we lock the tree and read the refcount block. + * So read it here if the caller really needs it. + * + * If the tree has been re-created by other node, it will free the + * old one and re-create it. + */ +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret, delete_tree = 0; + struct ocfs2_refcount_tree *tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + +again: + ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_refcount_tree_get(tree); + + ret = __ocfs2_lock_refcount_tree(osb, tree, rw); + if (ret) { + mlog_errno(ret); + ocfs2_refcount_tree_put(tree); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_unlock_refcount_tree(osb, tree, rw); + ocfs2_refcount_tree_put(tree); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + /* + * If the refcount block has been freed and re-created, we may need + * to recreate the refcount tree also. + * + * Here we just remove the tree from the rb-tree, and the last + * kref holder will unlock and delete this refcount_tree. + * Then we goto "again" and ocfs2_get_refcount_tree will create + * the new refcount tree for us. + */ + if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) { + if (!tree->rf_removed) { + ocfs2_erase_refcount_tree_from_list(osb, tree); + tree->rf_removed = 1; + delete_tree = 1; + } + + ocfs2_unlock_refcount_tree(osb, tree, rw); + /* + * We get an extra reference when we create the refcount + * tree, so another put will destroy it. 
+ */ + if (delete_tree) + ocfs2_refcount_tree_put(tree); + brelse(ref_root_bh); + ref_root_bh = NULL; + goto again; + } + + *ret_tree = tree; + if (ref_bh) { + *ref_bh = ref_root_bh; + ref_root_bh = NULL; + } +out: + brelse(ref_root_bh); + return ret; +} + +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + if (rw) + up_write(&tree->rf_sem); + else + up_read(&tree->rf_sem); + + ocfs2_refcount_unlock(tree, rw); + ocfs2_refcount_tree_put(tree); +} + +void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) +{ + struct rb_node *node; + struct ocfs2_refcount_tree *tree; + struct rb_root *root = &osb->osb_rf_lock_tree; + + while ((node = rb_last(root)) != NULL) { + tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); + + trace_ocfs2_purge_refcount_trees( + (unsigned long long) tree->rf_blkno); + + rb_erase(&tree->rf_node, root); + ocfs2_free_refcount_tree(tree); + } +} + +/* + * Create a refcount tree for an inode. + * We take for granted that the inode is already locked. + */ +static int ocfs2_create_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, first_blkno; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + trace_ocfs2_create_refcount_tree( + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); + if (!new_tree) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_refcount_block. 
*/ + rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(rb, 0, inode->i_sb->s_blocksize); + strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); + rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); + rb->rf_blkno = cpu_to_le64(first_blkno); + rb->rf_count = cpu_to_le32(1); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); + spin_lock(&osb->osb_lock); + rb->rf_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + ocfs2_journal_dirty(handle, new_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(first_blkno); + spin_unlock(&oi->ip_lock); + + trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno); + + ocfs2_journal_dirty(handle, di_bh); + + /* + * We have to init the tree lock here since it will use + * the generation number to create it. + */ + new_tree->rf_generation = le32_to_cpu(rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, + new_tree->rf_generation); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, first_blkno); + + /* + * We've just created a new refcount tree in this block. If + * we found a refcount tree on the ocfs2_super, it must be + * one we just deleted. We free the old tree before + * inserting the new tree. + */ + BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); + if (tree) + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + ocfs2_insert_refcount_tree(osb, new_tree); + spin_unlock(&osb->osb_lock); + new_tree = NULL; + if (tree) + ocfs2_refcount_tree_put(tree); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (new_tree) { + ocfs2_metadata_cache_exit(&new_tree->rf_ci); + kfree(new_tree); + } + + brelse(new_bh); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_set_refcount_tree(struct inode *inode, + struct buffer_head *di_bh, + u64 refcount_loc) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *ref_tree; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + le32_add_cpu(&rb->rf_count, 1); + + ocfs2_journal_dirty(handle, ref_root_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(refcount_loc); + spin_unlock(&oi->ip_lock); + 
ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + return ret; +} + +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) +{ + int ret, delete_tree = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_block *rb; + struct inode *alloc_inode = NULL; + struct buffer_head *alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS; + u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); + u16 bit = 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + return 0; + + BUG_ON(!ref_blkno); + ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + rb = (struct ocfs2_refcount_block *)blk_bh->b_data; + + /* + * If we are the last user, we need to free the block. + * So lock the allocator ahead. + */ + if (le32_to_cpu(rb->rf_count) == 1) { + blk = le64_to_cpu(rb->rf_blkno); + bit = le16_to_cpu(rb->rf_suballoc_bit); + if (rb->rf_suballoc_loc) + bg_blkno = le64_to_cpu(rb->rf_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot)); + if (!alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + credits += OCFS2_SUBALLOC_FREE; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = 0; + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + + le32_add_cpu(&rb->rf_count , -1); + ocfs2_journal_dirty(handle, blk_bh); + + if (!rb->rf_count) { + delete_tree = 1; + ocfs2_erase_refcount_tree_from_list(osb, ref_tree); + ret = ocfs2_free_suballoc_bits(handle, alloc_inode, + alloc_bh, bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + if (alloc_inode) { + ocfs2_inode_unlock(alloc_inode, 1); + brelse(alloc_bh); + } +out_mutex: + if (alloc_inode) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + } +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + if (delete_tree) + ocfs2_refcount_tree_put(ref_tree); + brelse(blk_bh); + + return ret; +} + +static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index) +{ + int i = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = NULL; + + for (; i < 
le16_to_cpu(rb->rf_records.rl_used); i++) { + rec = &rb->rf_records.rl_recs[i]; + + if (le64_to_cpu(rec->r_cpos) + + le32_to_cpu(rec->r_clusters) <= cpos) + continue; + else if (le64_to_cpu(rec->r_cpos) > cpos) + break; + + /* ok, cpos fail in this rec. Just return. */ + if (ret_rec) + *ret_rec = *rec; + goto out; + } + + if (ret_rec) { + /* We meet with a hole here, so fake the rec. */ + ret_rec->r_cpos = cpu_to_le64(cpos); + ret_rec->r_refcount = 0; + if (i < le16_to_cpu(rb->rf_records.rl_used) && + le64_to_cpu(rec->r_cpos) < cpos + len) + ret_rec->r_clusters = + cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos); + else + ret_rec->r_clusters = cpu_to_le32(len); + } + +out: + *index = i; +} + +/* + * Try to remove refcount tree. The mechanism is: + * 1) Check whether i_clusters == 0, if no, exit. + * 2) check whether we have i_xattr_loc in dinode. if yes, exit. + * 3) Check whether we have inline xattr stored outside, if yes, exit. + * 4) Remove the tree. + */ +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&oi->ip_xattr_sem); + down_write(&oi->ip_alloc_sem); + + if (oi->ip_clusters) + goto out; + + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) + goto out; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && + ocfs2_has_inline_xattr_value_outside(inode, di)) + goto out; + + ret = ocfs2_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); +out: + up_write(&oi->ip_alloc_sem); + up_write(&oi->ip_xattr_sem); + return 0; +} + +/* + * Find the end range for a leaf refcount block indicated by + * el->l_recs[index].e_blkno. + */ +static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct ocfs2_extent_block *eb, + struct ocfs2_extent_list *el, + int index, u32 *cpos_end) +{ + int ret, i, subtree_root; + u32 cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_extent_list *tmp_el; + + if (index < le16_to_cpu(el->l_next_free_rec) - 1) { + /* + * We have a extent rec after index, so just use the e_cpos + * of the next extent rec. + */ + *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); + return 0; + } + + if (!eb || (eb && !eb->h_next_leaf_blk)) { + /* + * We are the last extent rec, so any high cpos should + * be stored in this leaf refcount block. + */ + *cpos_end = UINT_MAX; + return 0; + } + + /* + * If the extent block isn't the last one, we have to find + * the subtree root between this extent block and the next + * leaf extent block and get the corresponding e_cpos from + * the subroot. Otherwise we may corrupt the b-tree. 
+ */ + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + left_path = ocfs2_new_path_from_et(&et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); + ret = ocfs2_find_path(ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(&et, left_path, + right_path); + + tmp_el = left_path->p_node[subtree_root].el; + blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; + for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) { + if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { + *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); + break; + } + } + + BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec)); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + return ret; +} + +/* + * Given a cpos and len, try to find the refcount record which contains cpos. + * 1. If cpos can be found in one refcount record, return the record. + * 2. If cpos can't be found, return a fake record which start from cpos + * and end at a small value between cpos+len and start of the next record. + * This fake record has r_refcount = 0. + */ +static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index, + struct buffer_head **ret_bh) +{ + int ret = 0, i, found; + u32 low_cpos, uninitialized_var(cpos_end); + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_block *eb = NULL; + struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) { + ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_root_bh; + get_bh(ref_root_bh); + return 0; + } + + el = &rb->rf_list; + low_cpos = cpos & OCFS2_32BIT_POS_MASK; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(sb, + "refcount tree %llu has non zero tree " + "depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= low_cpos) { + found = 1; + break; + } + } + + if (found) { + ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, + eb, el, i, &cpos_end); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (cpos_end < low_cpos + len) + len = cpos_end - low_cpos; + } + + ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_leaf_bh; +out: + brelse(eb_bh); + return ret; +} + +enum 
ocfs2_ref_rec_contig { + REF_CONTIG_NONE = 0, + REF_CONTIG_LEFT, + REF_CONTIG_RIGHT, + REF_CONTIG_LEFTRIGHT, +}; + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb, + int index) +{ + if ((rb->rf_records.rl_recs[index].r_refcount == + rb->rf_records.rl_recs[index + 1].r_refcount) && + (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) + + le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) == + le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos))) + return REF_CONTIG_RIGHT; + + return REF_CONTIG_NONE; +} + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE; + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 1) + ret = ocfs2_refcount_rec_adjacent(rb, index); + + if (index > 0) { + enum ocfs2_ref_rec_contig tmp; + + tmp = ocfs2_refcount_rec_adjacent(rb, index - 1); + + if (tmp == REF_CONTIG_RIGHT) { + if (ret == REF_CONTIG_RIGHT) + ret = REF_CONTIG_LEFTRIGHT; + else + ret = REF_CONTIG_LEFT; + } + } + + return ret; +} + +static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb, + int index) +{ + BUG_ON(rb->rf_records.rl_recs[index].r_refcount != + rb->rf_records.rl_recs[index+1].r_refcount); + + le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters, + le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters)); + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 2) + memmove(&rb->rf_records.rl_recs[index + 1], + &rb->rf_records.rl_recs[index + 2], + sizeof(struct ocfs2_refcount_rec) * + (le16_to_cpu(rb->rf_records.rl_used) - index - 2)); + + memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + le16_add_cpu(&rb->rf_records.rl_used, -1); +} + +/* + * Merge the refcount rec if we are contiguous with the adjacent recs. + */ +static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig contig = + ocfs2_refcount_rec_contig(rb, index); + + if (contig == REF_CONTIG_NONE) + return; + + if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) { + BUG_ON(index == 0); + index--; + } + + ocfs2_rotate_refcount_rec_left(rb, index); + + if (contig == REF_CONTIG_LEFTRIGHT) + ocfs2_rotate_refcount_rec_left(rb, index); +} + +/* + * Change the refcount indexed by "index" in ref_bh. + * If refcount reaches 0, remove it. 
+ */ +static int ocfs2_change_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + int index, int merge, int change) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_change_refcount_rec( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + index, le32_to_cpu(rec->r_refcount), change); + le32_add_cpu(&rec->r_refcount, change); + + if (!rec->r_refcount) { + if (index != le16_to_cpu(rl->rl_used) - 1) { + memmove(rec, rec + 1, + (le16_to_cpu(rl->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + } + + le16_add_cpu(&rl->rl_used, -1); + } else if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ocfs2_journal_dirty(handle, ref_leaf_bh); +out: + return ret; +} + +static int ocfs2_expand_inline_ref_root(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head **ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Initialize ocfs2_refcount_block. + * It should contain the same information as the old root. + * so just memcpy it and change the corresponding field. + */ + memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); + + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_cpos = cpu_to_le32(0); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + ocfs2_journal_dirty(handle, new_bh); + + /* Now change the root. 
*/ + memset(&root_rb->rf_list, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list)); + root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb)); + root_rb->rf_clusters = cpu_to_le32(1); + root_rb->rf_list.l_next_free_rec = cpu_to_le16(1); + root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL); + + ocfs2_journal_dirty(handle, ref_root_bh); + + trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno, + le16_to_cpu(new_rb->rf_records.rl_used)); + + *ref_leaf_bh = new_bh; + new_bh = NULL; +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev, + struct ocfs2_refcount_rec *next) +{ + if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <= + ocfs2_get_ref_rec_low_cpos(next)) + return 1; + + return 0; +} + +static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l); + u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static int cmp_refcount_rec_by_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u64 l_cpos = le64_to_cpu(l->r_cpos); + u64 r_cpos = le64_to_cpu(r->r_cpos); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static void swap_refcount_rec(void *a, void *b, int size) +{ + struct ocfs2_refcount_rec *l = a, *r = b, tmp; + + tmp = *(struct ocfs2_refcount_rec *)l; + *(struct ocfs2_refcount_rec *)l = + *(struct ocfs2_refcount_rec *)r; + *(struct ocfs2_refcount_rec *)r = tmp; +} + +/* + * The refcount cpos are ordered by their 64bit cpos, + * But we will use the low 32 bit to be the e_cpos in the b-tree. + * So we need to make sure that this pos isn't intersected with others. + * + * Note: The refcount block is already sorted by their low 32 bit cpos, + * So just try the middle pos first, and we will exit when we find + * the good position. 
+ */ +static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, + u32 *split_pos, int *split_index) +{ + int num_used = le16_to_cpu(rl->rl_used); + int delta, middle = num_used / 2; + + for (delta = 0; delta < middle; delta++) { + /* Let's check delta earlier than middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle - delta - 1], + &rl->rl_recs[middle - delta])) { + *split_index = middle - delta; + break; + } + + /* For even counts, don't walk off the end */ + if ((middle + delta + 1) == num_used) + continue; + + /* Now try delta past middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle + delta], + &rl->rl_recs[middle + delta + 1])) { + *split_index = middle + delta + 1; + break; + } + } + + if (delta >= middle) + return -ENOSPC; + + *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); + return 0; +} + +static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, + struct buffer_head *new_bh, + u32 *split_cpos) +{ + int split_index = 0, num_moved, ret; + u32 cpos = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_block *new_rb = + (struct ocfs2_refcount_block *)new_bh->b_data; + struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; + + trace_ocfs2_divide_leaf_refcount_block( + (unsigned long long)ref_leaf_bh->b_blocknr, + le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used)); + + /* + * XXX: Improvement later. + * If we know all the high 32 bit cpos is the same, no need to sort. + * + * In order to make the whole process safe, we do: + * 1. sort the entries by their low 32 bit cpos first so that we can + * find the split cpos easily. + * 2. call ocfs2_insert_extent to insert the new refcount block. + * 3. move the refcount rec to the new block. + * 4. sort the entries by their 64 bit cpos. + * 5. dirty the new_rb and rb. + */ + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_low_cpos, swap_refcount_rec); + + ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); + if (ret) { + mlog_errno(ret); + return ret; + } + + new_rb->rf_cpos = cpu_to_le32(cpos); + + /* move refcount records starting from split_index to the new block. */ + num_moved = le16_to_cpu(rl->rl_used) - split_index; + memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /*ok, remove the entries we just moved over to the other block. */ + memset(&rl->rl_recs[split_index], 0, + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /* change old and new rl_used accordingly. 
*/ + le16_add_cpu(&rl->rl_used, -num_moved); + new_rl->rl_used = cpu_to_le16(num_moved); + + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + *split_cpos = cpos; + return 0; +} + +static int ocfs2_new_leaf_refcount_block(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got, new_cpos; + u64 suballoc_loc, blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_extent_tree ref_et; + + BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)); + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Initialize ocfs2_refcount_block. */ + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(new_rb, 0, sb->s_blocksize); + strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + new_rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + new_rb->rf_generation = root_rb->rf_generation; + + ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, ref_leaf_bh); + ocfs2_journal_dirty(handle, new_bh); + + ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); + + trace_ocfs2_new_leaf_refcount_block( + (unsigned long long)new_bh->b_blocknr, new_cpos); + + /* Insert the new leaf block with the specific offset cpos. */ + ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, + 1, 0, meta_ac); + if (ret) + mlog_errno(ret); + +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_expand_refcount_tree(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *expand_bh = NULL; + + if (ref_root_bh == ref_leaf_bh) { + /* + * the old root bh hasn't been expanded to a b-tree, + * so expand it first. 
+ */ + ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh, + &expand_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + expand_bh = ref_leaf_bh; + get_bh(expand_bh); + } + + + /* Now add a new refcount block into the tree.*/ + ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh, + expand_bh, meta_ac); + if (ret) + mlog_errno(ret); +out: + brelse(expand_bh); + return ret; +} + +/* + * Adjust the extent rec in b-tree representing ref_leaf_bh. + * + * Only called when we have inserted a new refcount rec at index 0 + * which means ocfs2_extent_rec.e_cpos may need some change. + */ +static int ocfs2_adjust_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec) +{ + int ret = 0, i; + u32 new_cpos, old_cpos; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct ocfs2_extent_list *el; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + old_cpos = le32_to_cpu(rb->rf_cpos); + new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; + if (old_cpos <= new_cpos) + goto out; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, path, old_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * 2 more credits, one for the leaf refcount block, one for + * the extent block contains the extent rec. + */ + ret = ocfs2_extend_trans(handle, 2); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* change the leaf extent block first. */ + el = path_leaf_el(path); + + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) + if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos) + break; + + BUG_ON(i == le16_to_cpu(el->l_next_free_rec)); + + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + + /* change the r_cpos in the leaf block. 
*/ + rb->rf_cpos = cpu_to_le32(new_cpos); + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); + ocfs2_journal_dirty(handle, ref_leaf_bh); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_insert_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + if (rf_list->rl_used == rf_list->rl_count) { + u64 cpos = le64_to_cpu(rec->r_cpos); + u32 len = le32_to_cpu(rec->r_clusters); + + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, NULL, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (index < le16_to_cpu(rf_list->rl_used)) + memmove(&rf_list->rl_recs[index + 1], + &rf_list->rl_recs[index], + (le16_to_cpu(rf_list->rl_used) - index) * + sizeof(struct ocfs2_refcount_rec)); + + trace_ocfs2_insert_refcount_rec( + (unsigned long long)ref_leaf_bh->b_blocknr, index, + (unsigned long long)le64_to_cpu(rec->r_cpos), + le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount)); + + rf_list->rl_recs[index] = *rec; + + le16_add_cpu(&rf_list->rl_used, 1); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ocfs2_journal_dirty(handle, ref_leaf_bh); + + if (index == 0) { + ret = ocfs2_adjust_refcount_rec(handle, ci, + ref_root_bh, + ref_leaf_bh, rec); + if (ret) + mlog_errno(ret); + } +out: + brelse(new_bh); + return ret; +} + +/* + * Split the refcount_rec indexed by "index" in ref_leaf_bh. + * This is much simple than our b-tree code. + * split_rec is the new refcount rec we want to insert. + * If split_rec->r_refcount > 0, we are changing the refcount(in case we + * increase refcount or decrease a refcount to non-zero). + * If split_rec->r_refcount == 0, we are punching a hole in current refcount + * rec( in case we decrease a refcount to zero). 
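+ *
+ * A worked example (hypothetical numbers): if orig_rec covers clusters
+ * [100, 110) with r_refcount 2 and split_rec covers [103, 105) with
+ * r_refcount 3, we end up with three records: [100, 103) refcount 2,
+ * [103, 105) refcount 3 and [105, 110) refcount 2, i.e. recs_need == 2.
+ * If split_rec touches the head or the tail of orig_rec only one extra
+ * record is needed, and none at all when we punch a hole at the head
+ * or the tail (split_rec->r_refcount == 0).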
+ */ +static int ocfs2_split_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *split_rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, recs_need; + u32 len; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index]; + struct ocfs2_refcount_rec *tail_rec = NULL; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos), + le32_to_cpu(orig_rec->r_clusters), + le32_to_cpu(orig_rec->r_refcount), + le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount)); + + /* + * If we just need to split the header or tail clusters, + * no more recs are needed, just split is OK. + * Otherwise we at least need one new recs. + */ + if (!split_rec->r_refcount && + (split_rec->r_cpos == orig_rec->r_cpos || + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) == + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need = 0; + else + recs_need = 1; + + /* + * We need one more rec if we split in the middle and the new rec have + * some refcount in it. + */ + if (split_rec->r_refcount && + (split_rec->r_cpos != orig_rec->r_cpos && + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) != + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need++; + + /* If the leaf block don't have enough record, expand it. */ + if (le16_to_cpu(rf_list->rl_used) + recs_need > + le16_to_cpu(rf_list->rl_count)) { + struct ocfs2_refcount_rec tmp_rec; + u64 cpos = le64_to_cpu(orig_rec->r_cpos); + len = le32_to_cpu(orig_rec->r_clusters); + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have to re-get it since now cpos may be moved to + * another leaf block. + */ + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &tmp_rec, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + orig_rec = &rf_list->rl_recs[index]; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have calculated out how many new records we need and store + * in recs_need, so spare enough space first by moving the records + * after "index" to the end. + */ + if (index != le16_to_cpu(rf_list->rl_used) - 1) + memmove(&rf_list->rl_recs[index + 1 + recs_need], + &rf_list->rl_recs[index + 1], + (le16_to_cpu(rf_list->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + + len = (le64_to_cpu(orig_rec->r_cpos) + + le32_to_cpu(orig_rec->r_clusters)) - + (le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters)); + + /* + * If we have "len", the we will split in the tail and move it + * to the end of the space we have just spared. 
+ */ + if (len) { + tail_rec = &rf_list->rl_recs[index + recs_need]; + + memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); + le64_add_cpu(&tail_rec->r_cpos, + le32_to_cpu(tail_rec->r_clusters) - len); + tail_rec->r_clusters = cpu_to_le32(len); + } + + /* + * If the split pos isn't the same as the original one, we need to + * split in the head. + * + * Note: We have the chance that split_rec.r_refcount = 0, + * recs_need = 0 and len > 0, which means we just cut the head from + * the orig_rec and in that case we have done some modification in + * orig_rec above, so the check for r_cpos is faked. + */ + if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) { + len = le64_to_cpu(split_rec->r_cpos) - + le64_to_cpu(orig_rec->r_cpos); + orig_rec->r_clusters = cpu_to_le32(len); + index++; + } + + le16_add_cpu(&rf_list->rl_used, recs_need); + + if (split_rec->r_refcount) { + rf_list->rl_recs[index] = *split_rec; + trace_ocfs2_split_refcount_rec_insert( + (unsigned long long)ref_leaf_bh->b_blocknr, index, + (unsigned long long)le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount)); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + } + + ocfs2_journal_dirty(handle, ref_leaf_bh); + +out: + brelse(new_bh); + return ret; +} + +static int __ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0, index; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_refcount_rec rec; + unsigned int set_len = 0; + + trace_ocfs2_increase_refcount_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + set_len = le32_to_cpu(rec.r_clusters); + + /* + * Here we may meet with 3 situations: + * + * 1. If we find an already existing record, and the length + * is the same, cool, we just need to increase the r_refcount + * and it is OK. + * 2. If we find a hole, just insert it with r_refcount = 1. + * 3. If we are in the middle of one extent record, split + * it. 
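+ *
+ * For example (hypothetical numbers): raising the refcount of clusters
+ * [200, 210) when a record already covers exactly [200, 210) just bumps
+ * that record's r_refcount (case 1); if no record covers the range we
+ * insert a new record with r_refcount = 1 (case 2); if an existing
+ * record covers [195, 215) we split out [200, 210) and bump only that
+ * piece (case 3).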
+ */ + if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && + set_len <= len) { + trace_ocfs2_increase_refcount_change( + (unsigned long long)cpos, set_len, + le32_to_cpu(rec.r_refcount)); + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, + merge, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (!rec.r_refcount) { + rec.r_refcount = cpu_to_le32(1); + + trace_ocfs2_increase_refcount_insert( + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len); + ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, + &rec, index, + merge, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + set_len = min((u64)(cpos + len), + le64_to_cpu(rec.r_cpos) + set_len) - cpos; + rec.r_cpos = cpu_to_le64(cpos); + rec.r_clusters = cpu_to_le32(set_len); + le32_add_cpu(&rec.r_refcount, 1); + + trace_ocfs2_increase_refcount_split( + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len, le32_to_cpu(rec.r_refcount)); + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &rec, index, merge, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += set_len; + len -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +static int ocfs2_remove_refcount_extent(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_extent_tree et; + + BUG_ON(rb->rf_records.rl_used); + + trace_ocfs2_remove_refcount_extent( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)ref_leaf_bh->b_blocknr, + le32_to_cpu(rb->rf_cpos)); + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), + 1, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(ci, ref_leaf_bh); + + /* + * add the freed block to the dealloc so that it will be freed + * when we run dealloc. + */ + ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot), + le64_to_cpu(rb->rf_suballoc_loc), + le64_to_cpu(rb->rf_blkno), + le16_to_cpu(rb->rf_suballoc_bit)); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + le32_add_cpu(&rb->rf_clusters, -1); + + /* + * check whether we need to restore the root refcount block if + * there is no leaf extent block at atll. 
+ */ + if (!rb->rf_list.l_next_free_rec) { + BUG_ON(rb->rf_clusters); + + trace_ocfs2_restore_refcount_block( + (unsigned long long)ref_root_bh->b_blocknr); + + rb->rf_flags = 0; + rb->rf_parent = 0; + rb->rf_cpos = 0; + memset(&rb->rf_records, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records)); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + } + + ocfs2_journal_dirty(handle, ref_root_bh); + +out: + return ret; +} + +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + return __ocfs2_increase_refcount(handle, ci, ref_root_bh, + cpos, len, 1, + meta_ac, dealloc); +} + +static int ocfs2_decrease_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + int index, u64 cpos, unsigned int len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index]; + + BUG_ON(cpos < le64_to_cpu(rec->r_cpos)); + BUG_ON(cpos + len > + le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); + + trace_ocfs2_decrease_refcount_rec( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len); + + if (cpos == le64_to_cpu(rec->r_cpos) && + len == le32_to_cpu(rec->r_clusters)) + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, 1, -1); + else { + struct ocfs2_refcount_rec split = *rec; + split.r_cpos = cpu_to_le64(cpos); + split.r_clusters = cpu_to_le32(len); + + le32_add_cpu(&split.r_refcount, -1); + + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &split, index, 1, + meta_ac, dealloc); + } + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Remove the leaf refcount block if it contains no refcount record. 
*/ + if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) { + ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac, + dealloc); + if (ret) + mlog_errno(ret); + } + +out: + return ret; +} + +static int __ocfs2_decrease_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) +{ + int ret = 0, index = 0; + struct ocfs2_refcount_rec rec; + unsigned int r_count = 0, r_len; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *ref_leaf_bh = NULL; + + trace_ocfs2_decrease_refcount( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len, delete); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + r_count = le32_to_cpu(rec.r_refcount); + BUG_ON(r_count == 0); + if (!delete) + BUG_ON(r_count > 1); + + r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + + ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, index, + cpos, r_len, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le32_to_cpu(rec.r_refcount) == 1 && delete) { + ret = ocfs2_cache_cluster_dealloc(dealloc, + ocfs2_clusters_to_blocks(sb, cpos), + r_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += r_len; + len -= r_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* Caller must hold refcount tree lock. */ +int ocfs2_decrease_refcount(struct inode *inode, + handle_t *handle, u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) +{ + int ret; + u64 ref_blkno; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_block(inode, &ref_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh, + cpos, len, meta_ac, dealloc, delete); + if (ret) + mlog_errno(ret); +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Mark the already-existing extent at cpos as refcounted for len clusters. + * This adds the refcount extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. 
+ */ +static int ocfs2_mark_extent_refcounted(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, + u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno, + cpos, len, phys); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + OCFS2_EXT_REFCOUNTED, 0); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Given some contiguous physical clusters, calculate what we need + * for modifying their refcount. + */ +static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 start_cpos, + u32 clusters, + int *meta_add, + int *credits) +{ + int ret = 0, index, ref_blocks = 0, recs_add = 0; + u64 cpos = start_cpos; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; + u32 len; + + while (clusters) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, clusters, &rec, + &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ref_leaf_bh != prev_bh) { + /* + * Now we encounter a new leaf block, so calculate + * whether we need to extend the old leaf. + */ + if (prev_bh) { + rb = (struct ocfs2_refcount_block *) + prev_bh->b_data; + + if (le16_to_cpu(rb->rf_records.rl_used) + + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + } + + recs_add = 0; + *credits += 1; + brelse(prev_bh); + prev_bh = ref_leaf_bh; + get_bh(prev_bh); + } + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + trace_ocfs2_calc_refcount_meta_credits_iterate( + recs_add, (unsigned long long)cpos, clusters, + (unsigned long long)le64_to_cpu(rec.r_cpos), + le32_to_cpu(rec.r_clusters), + le32_to_cpu(rec.r_refcount), index); + + len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + /* + * We record all the records which will be inserted to the + * same refcount block, so that we can tell exactly whether + * we need a new refcount block or not. + * + * If we will insert a new one, this is easy and only happens + * during adding refcounted flag to the extent, so we don't + * have a chance of spliting. We just need one record. + * + * If the refcount rec already exists, that would be a little + * complicated. we may have to: + * 1) split at the beginning if the start pos isn't aligned. + * we need 1 more record in this case. + * 2) split int the end if the end pos isn't aligned. + * we need 1 more record in this case. + * 3) split in the middle because of file system fragmentation. + * we need 2 more records in this case(we can't detect this + * beforehand, so always think of the worst case). + */ + if (rec.r_refcount) { + recs_add += 2; + /* Check whether we need a split at the beginning. */ + if (cpos == start_cpos && + cpos != le64_to_cpu(rec.r_cpos)) + recs_add++; + + /* Check whether we need a split in the end. 
*/ + if (cpos + clusters < le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) + recs_add++; + } else + recs_add++; + + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + clusters -= len; + cpos += len; + } + + if (prev_bh) { + rb = (struct ocfs2_refcount_block *)prev_bh->b_data; + + if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + } + + if (!ref_blocks) + goto out; + + *meta_add += ref_blocks; + *credits += ref_blocks; + + /* + * So we may need ref_blocks to insert into the tree. + * That also means we need to change the b-tree and add that number + * of records since we never merge them. + * We need one more block for expansion since the newly created leaf + * block is also full and needs to be split. + */ + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + *meta_add += ocfs2_extend_meta_needed(et.et_root_el); + *credits += ocfs2_calc_extend_credits(sb, + et.et_root_el, + ref_blocks); + } else { + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + *meta_add += 1; + } + +out: + + trace_ocfs2_calc_refcount_meta_credits( + (unsigned long long)start_cpos, clusters, + *meta_add, *credits); + brelse(ref_leaf_bh); + brelse(prev_bh); + return ret; +} + +/* + * For a refcount tree, we are going to decrease the refcount of some + * contiguous clusters, so just go through the records to see how many + * refcount blocks we are going to touch and whether we need to create + * new blocks. + * + * Normally the refcount blocks storing these refcounts should be + * contiguous as well, so we can get the number easily. + * We will at most add or split 2 refcount records and add 2 more + * refcount blocks, so just check it in a rough way. + * + * Caller must hold refcount tree lock.
+ */ +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + u64 refcount_loc, + u64 phys_blkno, + u32 clusters, + int *credits, + int *ref_blocks) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), + refcount_loc, &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + &tree->rf_ci, + ref_root_bh, + start_cpos, clusters, + ref_blocks, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits); + +out: + brelse(ref_root_bh); + return ret; +} + +#define MAX_CONTIG_BYTES 1048576 + +static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) +{ + return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); +} + +static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) +{ + return ~(ocfs2_cow_contig_clusters(sb) - 1); +} + +/* + * Given an extent that starts at 'start' and an I/O that starts at 'cpos', + * find an offset (start + (n * contig_clusters)) that is closest to cpos + * while still being less than or equal to it. + * + * The goal is to break the extent at a multiple of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, + unsigned int start, + unsigned int cpos) +{ + BUG_ON(start > cpos); + + return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); +} + +/* + * Given a cluster count of len, pad it out so that it is a multiple + * of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, + unsigned int len) +{ + unsigned int padded = + (len + (ocfs2_cow_contig_clusters(sb) - 1)) & + ocfs2_cow_contig_mask(sb); + + /* Did we wrap? */ + if (padded < len) + padded = UINT_MAX; + + return padded; +} + +/* + * Calculate the start and the number of virtual clusters we need to CoW. + * + * cpos is the virtual start cluster position at which we want to CoW in a + * file and write_len is the cluster length. + * max_cpos is the place where we want to stop CoW intentionally. + * + * Normally we will start CoW from the beginning of the extent record + * containing cpos. We try to break up extents on boundaries of + * MAX_CONTIG_BYTES so that we get good I/O from the resulting extent tree.
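+ *
+ * A worked example of the helpers above (assuming a hypothetical 4KB
+ * cluster size, so contig_clusters = 1MB / 4KB = 256):
+ * ocfs2_cow_align_start(sb, 1000, 1700) returns
+ * 1000 + ((1700 - 1000) & ~255) = 1512, the largest offset of the form
+ * start + n * 256 that is still <= cpos, while
+ * ocfs2_cow_align_length(sb, 300) rounds 300 up to 512.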
+ */ +static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, + struct ocfs2_extent_list *el, + u32 cpos, + u32 write_len, + u32 max_cpos, + u32 *cow_start, + u32 *cow_len) +{ + int ret = 0; + int tree_height = le16_to_cpu(el->l_tree_depth), i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb = NULL; + struct ocfs2_extent_rec *rec; + unsigned int want_clusters, rec_end = 0; + int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); + int leaf_clusters; + + BUG_ON(cpos + write_len > max_cpos); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + *cow_len = 0; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (ocfs2_is_empty_extent(rec)) { + mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " + "index %d\n", inode->i_ino, i); + continue; + } + + if (le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters) <= cpos) + continue; + + if (*cow_len == 0) { + /* + * We should find a refcounted record in the + * first pass. + */ + BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); + *cow_start = le32_to_cpu(rec->e_cpos); + } + + /* + * If we encounter a hole, a non-refcounted record or + * pass the max_cpos, stop the search. + */ + if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || + (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || + (max_cpos <= le32_to_cpu(rec->e_cpos))) + break; + + leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); + rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; + if (rec_end > max_cpos) { + rec_end = max_cpos; + leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); + } + + /* + * How many clusters do we actually need from + * this extent? First we see how many we actually + * need to complete the write. If that's smaller + * than contig_clusters, we try for contig_clusters. + */ + if (!*cow_len) + want_clusters = write_len; + else + want_clusters = (cpos + write_len) - + (*cow_start + *cow_len); + if (want_clusters < contig_clusters) + want_clusters = contig_clusters; + + /* + * If the write does not cover the whole extent, we + * need to calculate how we're going to split the extent. + * We try to do it on contig_clusters boundaries. + * + * Any extent smaller than contig_clusters will be + * CoWed in its entirety. + */ + if (leaf_clusters <= contig_clusters) + *cow_len += leaf_clusters; + else if (*cow_len || (*cow_start == cpos)) { + /* + * This extent needs to be CoW'd from its + * beginning, so all we have to do is compute + * how many clusters to grab. We align + * want_clusters to the edge of contig_clusters + * to get better I/O. + */ + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + + if (leaf_clusters < want_clusters) + *cow_len += leaf_clusters; + else + *cow_len += want_clusters; + } else if ((*cow_start + contig_clusters) >= + (cpos + write_len)) { + /* + * Breaking off contig_clusters at the front + * of the extent will cover our write. That's + * easy. + */ + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= contig_clusters) { + /* + * Breaking off contig_clusters at the tail of + * this extent will cover cpos. 
+ */ + *cow_start = rec_end - contig_clusters; + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= want_clusters) { + /* + * While we can't fit the entire write in this + * extent, we know that the write goes from cpos + * to the end of the extent. Break that off. + * We try to break it at some multiple of + * contig_clusters from the front of the extent. + * Failing that (ie, cpos is within + * contig_clusters of the front), we'll CoW the + * entire extent. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + *cow_len = rec_end - *cow_start; + } else { + /* + * Ok, the entire write lives in the middle of + * this extent. Let's try to slice the extent up + * nicely. Optimally, our CoW region starts at + * m*contig_clusters from the beginning of the + * extent and goes for n*contig_clusters, + * covering the entire write. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + + want_clusters = (cpos + write_len) - *cow_start; + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + if (*cow_start + want_clusters <= rec_end) + *cow_len = want_clusters; + else + *cow_len = rec_end - *cow_start; + } + + /* Have we covered our entire write yet? */ + if ((*cow_start + *cow_len) >= (cpos + write_len)) + break; + + /* + * If we reach the end of the extent block and don't get enough + * clusters, continue with the next extent block if possible. + */ + if (i + 1 == le16_to_cpu(el->l_next_free_rec) && + eb && eb->h_next_leaf_blk) { + brelse(eb_bh); + eb_bh = NULL; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), + le64_to_cpu(eb->h_next_leaf_blk), + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + i = -1; + } + } + +out: + brelse(eb_bh); + return ret; +} + +/* + * Prepare meta_ac, data_ac and calculate credits when we want to add some + * num_clusters in data_tree "et" and change the refcount for the old + * clusters(starting form p_cluster) in the refcount tree. + * + * Note: + * 1. since we may split the old tree, so we at most will need num_clusters + 2 + * more new leaf records. + * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so + * just give data_ac = NULL. 
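+ *
+ * 3. *credits is added to rather than overwritten, so any credits the
+ * caller has already accumulated are preserved.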
+ */ +static int ocfs2_lock_refcount_allocators(struct super_block *sb, + u32 p_cluster, u32 num_clusters, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int *credits) +{ + int ret = 0, meta_add = 0; + int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < num_clusters + 2) + meta_add = + ocfs2_extend_meta_needed(et->et_root_el); + + *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, + num_clusters + 2); + + ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, + p_cluster, num_clusters, + &meta_add, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_lock_refcount_allocators(meta_add, *credits); + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, + data_ac); + if (ret) + mlog_errno(ret); + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUG_ON(buffer_dirty(bh)); + + clear_buffer_mapped(bh); + + return 0; +} + +int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0, partial; + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_caching_info *ci = INODE_CACHE(inode); + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct page *page; + pgoff_t page_index; + unsigned int from, to, readahead_pages; + loff_t offset, end, map_end; + struct address_space *mapping = inode->i_mapping; + + trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, + new_cluster, new_len); + + readahead_pages = + (ocfs2_cow_contig_clusters(sb) << + OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); + /* + * We only duplicate pages until we reach the page contains i_size - 1. + * So trim 'end' to i_size. + */ + if (end > i_size_read(inode)) + end = i_size_read(inode); + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + /* from, to is the offset within the page. */ + from = offset & (PAGE_CACHE_SIZE - 1); + to = PAGE_CACHE_SIZE; + if (map_end & (PAGE_CACHE_SIZE - 1)) + to = map_end & (PAGE_CACHE_SIZE - 1); + + page = find_or_create_page(mapping, page_index, GFP_NOFS); + + /* + * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page + * can't be dirtied before we CoW it out. 
+ */ + if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) + BUG_ON(PageDirty(page)); + + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, + &file->f_ra, file, + page, page_index, + readahead_pages); + } + + if (!PageUptodate(page)) { + ret = block_read_full_page(page, ocfs2_get_block); + if (ret) { + mlog_errno(ret); + goto unlock; + } + lock_page(page); + } + + if (page_has_buffers(page)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_clear_cow_buffer); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + ocfs2_map_and_dirty_page(inode, handle, from, to, + page, 0, &new_block); + mark_page_accessed(page); +unlock: + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0; + struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ocfs2_caching_info *ci = INODE_CACHE(inode); + int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); + u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *old_bh = NULL; + struct buffer_head *new_bh = NULL; + + trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, + new_cluster, new_len); + + for (i = 0; i < blocks; i++, old_block++, new_block++) { + new_bh = sb_getblk(osb->sb, new_block); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + break; + } + + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_journal_access(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); + ocfs2_journal_dirty(handle, new_bh); + + brelse(new_bh); + brelse(old_bh); + new_bh = NULL; + old_bh = NULL; + } + + brelse(new_bh); + brelse(old_bh); + return ret; +} + +static int ocfs2_clear_ext_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 p_cluster, u32 len, + unsigned int ext_flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + struct ocfs2_extent_rec replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 ino = ocfs2_metadata_cache_owner(et->et_ci); + + trace_ocfs2_clear_ext_refcount((unsigned long long)ino, + cpos, len, p_cluster, ext_flags); + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, + p_cluster)); + replace_rec.e_flags = ext_flags; + replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = 
-EROFS; + goto out; + } + + ret = ocfs2_split_extent(handle, et, path, index, + &replace_rec, meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_replace_clusters(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old, + u32 new, u32 len, + unsigned int ext_flags) +{ + int ret; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + u64 ino = ocfs2_metadata_cache_owner(ci); + + trace_ocfs2_replace_clusters((unsigned long long)ino, + cpos, old, new, len, ext_flags); + + /*If the old clusters is unwritten, no need to duplicate. */ + if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = context->cow_duplicate_clusters(handle, context->file, + cpos, old, new, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_clear_ext_refcount(handle, &context->data_et, + cpos, new, len, ext_flags, + context->meta_ac, &context->dealloc); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +int ocfs2_cow_sync_writeback(struct super_block *sb, + struct inode *inode, + u32 cpos, u32 num_clusters) +{ + int ret = 0; + loff_t offset, end, map_end; + pgoff_t page_index; + struct page *page; + + if (ocfs2_should_order_data(inode)) + return 0; + + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); + + ret = filemap_fdatawrite_range(inode->i_mapping, + offset, end - 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + page = find_or_create_page(inode->i_mapping, + page_index, GFP_NOFS); + BUG_ON(!page); + + wait_on_page_writeback(page); + if (PageError(page)) { + ret = -EIO; + mlog_errno(ret); + } else + mark_page_accessed(page); + + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, + num_clusters, extent_flags); +} + +static int ocfs2_make_clusters_writable(struct super_block *sb, + struct ocfs2_cow_context *context, + u32 cpos, u32 p_cluster, + u32 num_clusters, unsigned int e_flags) +{ + int ret, delete, index, credits = 0; + u32 new_bit, new_len, orig_num_clusters; + unsigned int set_len; + struct ocfs2_super *osb = OCFS2_SB(sb); + handle_t *handle; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; + struct ocfs2_refcount_rec rec; + + trace_ocfs2_make_clusters_writable(cpos, p_cluster, + num_clusters, e_flags); + + ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, + &context->data_et, + ref_ci, + context->ref_root_bh, + &context->meta_ac, + &context->data_ac, &credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (context->post_refcount) + credits += context->post_refcount->credits; + + credits += context->extra_credits; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + orig_num_clusters = num_clusters; + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, + p_cluster, num_clusters, + &rec, &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; 
+ } + + BUG_ON(!rec.r_refcount); + set_len = min((u64)p_cluster + num_clusters, + le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - p_cluster; + + /* + * There are many different situation here. + * 1. If refcount == 1, remove the flag and don't COW. + * 2. If refcount > 1, allocate clusters. + * Here we may not allocate r_len once at a time, so continue + * until we reach num_clusters. + */ + if (le32_to_cpu(rec.r_refcount) == 1) { + delete = 0; + ret = ocfs2_clear_ext_refcount(handle, + &context->data_et, + cpos, p_cluster, + set_len, e_flags, + context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } else { + delete = 1; + + ret = __ocfs2_claim_clusters(handle, + context->data_ac, + 1, set_len, + &new_bit, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_replace_clusters(handle, context, + cpos, p_cluster, new_bit, + new_len, e_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + set_len = new_len; + } + + ret = __ocfs2_decrease_refcount(handle, ref_ci, + context->ref_root_bh, + p_cluster, set_len, + context->meta_ac, + &context->dealloc, delete); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + cpos += set_len; + p_cluster += set_len; + num_clusters -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + + /* handle any post_cow action. */ + if (context->post_refcount && context->post_refcount->func) { + ret = context->post_refcount->func(context->inode, handle, + context->post_refcount->para); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + if (context->get_clusters == ocfs2_di_get_clusters) { + ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos, + orig_num_clusters); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + brelse(ref_leaf_bh); + + return ret; +} + +static int ocfs2_replace_cow(struct ocfs2_cow_context *context) +{ + int ret = 0; + struct inode *inode = context->inode; + u32 cow_start = context->cow_start, cow_len = context->cow_len; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + return -EROFS; + } + + ocfs2_init_dealloc_ctxt(&context->dealloc); + + while (cow_len) { + ret = context->get_clusters(context, cow_start, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); + + if (cow_len < num_clusters) + num_clusters = cow_len; + + ret = ocfs2_make_clusters_writable(inode->i_sb, context, + cow_start, p_cluster, + num_clusters, ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + cow_len -= num_clusters; + cow_start += num_clusters; + } + + if (ocfs2_dealloc_has_cluster(&context->dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + } + + return ret; +} + +static void ocfs2_readahead_for_cow(struct inode *inode, + struct file *file, + u32 start, u32 len) +{ + struct address_space *mapping; + pgoff_t index; + unsigned 
long num_pages; + int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; + + if (!file) + return; + + mapping = file->f_mapping; + num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; + if (!num_pages) + num_pages = 1; + + index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; + page_cache_sync_readahead(mapping, &file->f_ra, file, + index, num_pages); +} + +/* + * Starting at cpos, try to CoW write_len clusters. Don't CoW + * past max_cpos. This will stop when it runs into a hole or an + * unrefcounted extent. + */ +static int ocfs2_refcount_cow_hunk(struct inode *inode, + struct file *file, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret; + u32 cow_start = 0, cow_len = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_cow_context *context = NULL; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, + cpos, write_len, max_cpos, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno, + cpos, write_len, max_cpos, + cow_start, cow_len); + + BUG_ON(cow_len == 0); + + ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh; + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; + context->get_clusters = ocfs2_di_get_clusters; + context->file = file; + + ocfs2_init_dinode_extent_tree(&context->data_et, + INODE_CACHE(inode), di_bh); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + + /* + * truncate the extent map here since no matter whether we meet with + * any error during the action, we shouldn't trust cached extent map + * any more. + */ + ocfs2_extent_map_trunc(inode, cow_start); + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + kfree(context); + return ret; +} + +/* + * CoW any and all clusters between cpos and cpos+write_len. + * Don't CoW past max_cpos. If this returns successfully, all + * clusters between cpos and cpos+write_len are safe to modify. 
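+ *
+ * Only the refcounted hunks inside the range are actually CoWed: the
+ * loop below skips extents without OCFS2_EXT_REFCOUNTED and hands each
+ * refcounted hunk to ocfs2_refcount_cow_hunk().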
+ */ +int ocfs2_refcount_cow(struct inode *inode, + struct file *file, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + while (write_len) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + if (write_len < num_clusters) + num_clusters = write_len; + + if (ext_flags & OCFS2_EXT_REFCOUNTED) { + ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, + num_clusters, max_cpos); + if (ret) { + mlog_errno(ret); + break; + } + } + + write_len -= num_clusters; + cpos += num_clusters; + } + + return ret; +} + +static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + struct inode *inode = context->inode; + struct ocfs2_xattr_value_root *xv = context->cow_object; + + return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster, + num_clusters, &xv->xr_list, + extent_flags); +} + +/* + * Given a xattr value root, calculate the most meta/credits we need for + * refcount tree change if we truncate it to 0. + */ +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits) +{ + int ret = 0, index, ref_blocks = 0; + u32 p_cluster, num_clusters; + u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters); + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL; + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh, + p_cluster, num_clusters, + &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!rec.r_refcount); + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + /* + * We really don't know whether the other clusters is in + * this refcount block or not, so just take the worst + * case that all the clusters are in this block and each + * one will split a refcount rec, so totally we need + * clusters * 2 new refcount rec. + */ + if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + + if (num_clusters <= le32_to_cpu(rec.r_clusters)) + break; + else + num_clusters -= le32_to_cpu(rec.r_clusters); + p_cluster += num_clusters; + } + } + + *meta_add += ref_blocks; + if (!ref_blocks) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + else { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); + *credits += ocfs2_calc_extend_credits(inode->i_sb, + et.et_root_el, + ref_blocks); + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* + * Do CoW for xattr. 
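+ *
+ * Unlike regular file data, which is duplicated through the page cache
+ * (ocfs2_duplicate_clusters_by_page), xattr value clusters are copied
+ * block by block through the journal, so cow_duplicate_clusters is set
+ * to ocfs2_duplicate_clusters_by_jbd and extra journal credits are
+ * reserved for every block that will be copied.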
+ */ +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post) +{ + int ret; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_cow_context *context = NULL; + u32 cow_start, cow_len; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, + cpos, write_len, UINT_MAX, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(cow_len == 0); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh; + context->cow_object = xv; + + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; + /* We need the extra credits for duplicate_clusters by jbd. */ + context->extra_credits = + ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len; + context->get_clusters = ocfs2_xattr_value_get_clusters; + context->post_refcount = post; + + ocfs2_init_xattr_value_extent_tree(&context->data_et, + INODE_CACHE(inode), vb); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + +out: + kfree(context); + return ret; +} + +/* + * Insert a new extent into refcount tree and mark a extent rec + * as refcounted in the dinode tree. + */ +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post) +{ + int ret; + handle_t *handle; + int credits = 1, ref_blocks = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + ref_ci, ref_root_bh, + p_cluster, num_clusters, + &ref_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_add_refcount_flag(ref_blocks, credits); + + if (ref_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ref_blocks, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (post) + credits += post->credits; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_mark_extent_refcounted(inode, data_et, handle, + cpos, num_clusters, p_cluster, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, 0, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (post && post->func) { + ret = post->func(inode, handle, post->para); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_change_ctime(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + 
mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_attach_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret, data_changed = 0; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_tree *ref_tree; + unsigned int ext_flags; + loff_t size; + u32 cpos, num_clusters, clusters, p_cluster; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree di_et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + ret = ocfs2_create_refcount_tree(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + BUG_ON(!di->i_refcount_loc); + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(di->i_refcount_loc), 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto attach_xattr; + + ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); + + size = i_size_read(inode); + clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(inode, &di_et, + &ref_tree->rf_ci, + ref_root_bh, cpos, + p_cluster, num_clusters, + &dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + data_changed = 1; + } + cpos += num_clusters; + } + +attach_xattr: + if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, + &ref_tree->rf_ci, + ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + if (data_changed) { + ret = ocfs2_change_ctime(inode, di_bh); + if (ret) + mlog_errno(ret); + } + +unlock: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } +out: + /* + * Empty the extent map so that we may get the right extent + * record from the disk. 
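+ * The cached extent records were read before the refcount flag was
+ * set on the on-disk records above, so they are stale at this point.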
+ */ + ocfs2_extent_map_trunc(inode, 0); + + return ret; +} + +static int ocfs2_add_refcounted_extent(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + unsigned int ext_flags, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + handle_t *handle; + int credits = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_lock_refcount_allocators(inode->i_sb, + p_cluster, num_clusters, + et, ref_ci, + ref_root_bh, &meta_ac, + NULL, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(handle, et, cpos, + ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), + num_clusters, ext_flags, meta_ac); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, + meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_duplicate_inline_data(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data; + + BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + t_di->id2.i_data.id_count = s_di->id2.i_data.id_count; + memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data, + le16_to_cpu(s_di->id2.i_data.id_count)); + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL; + t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_duplicate_extent_list(struct inode *s_inode, + struct inode *t_inode, + struct buffer_head *t_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + u32 p_cluster, num_clusters, clusters, cpos; + loff_t size; + unsigned int ext_flags; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); + + size = i_size_read(s_inode); + clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster) { + ret = ocfs2_add_refcounted_extent(t_inode, &et, + ref_ci, ref_root_bh, + cpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += num_clusters; + } + +out: + return ret; +} + +/* + * change the new file's attributes to the src. 
+ * + * reflink creates a snapshot of a file, that means the attributes + * must be identical except for three exceptions - nlink, ino, and ctime. + */ +static int ocfs2_complete_reflink(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data; + loff_t size = i_size_read(s_inode); + + handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; + OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr; + OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + i_size_write(t_inode, size); + t_inode->i_blocks = s_inode->i_blocks; + + di->i_xattr_inline_size = s_di->i_xattr_inline_size; + di->i_clusters = s_di->i_clusters; + di->i_size = s_di->i_size; + di->i_dyn_features = s_di->i_dyn_features; + di->i_attr = s_di->i_attr; + + if (preserve) { + t_inode->i_uid = s_inode->i_uid; + t_inode->i_gid = s_inode->i_gid; + t_inode->i_mode = s_inode->i_mode; + di->i_uid = s_di->i_uid; + di->i_gid = s_di->i_gid; + di->i_mode = s_di->i_mode; + + /* + * update time. + * we want mtime to appear identical to the source and + * update ctime. + */ + t_inode->i_ctime = CURRENT_TIME; + + di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + + t_inode->i_mtime = s_inode->i_mtime; + di->i_mtime = s_di->i_mtime; + di->i_mtime_nsec = s_di->i_mtime_nsec; + } + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle); + return ret; +} + +static int ocfs2_create_reflink_node(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_refcount_block *rb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_refcount_tree *ref_tree; + + ocfs2_init_dealloc_ctxt(&dealloc); + + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(di->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, + t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, + &ref_tree->rf_ci, ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +static int 
__ocfs2_reflink(struct dentry *old_dentry, + struct buffer_head *old_bh, + struct inode *new_inode, + bool preserve) +{ + int ret; + struct inode *inode = old_dentry->d_inode; + struct buffer_head *new_bh = NULL; + + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + ret = filemap_fdatawrite(inode->i_mapping); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_attach_refcount_tree(inode, old_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, + OI_LS_REFLINK_TARGET); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_create_reflink_node(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) { + mlog_errno(ret); + goto inode_unlock; + } + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_reflink_xattrs(inode, old_bh, + new_inode, new_bh, + preserve); + if (ret) { + mlog_errno(ret); + goto inode_unlock; + } + } + + ret = ocfs2_complete_reflink(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) + mlog_errno(ret); + +inode_unlock: + ocfs2_inode_unlock(new_inode, 1); + brelse(new_bh); +out_unlock: + mutex_unlock(&new_inode->i_mutex); +out: + if (!ret) { + ret = filemap_fdatawait(inode->i_mapping); + if (ret) + mlog_errno(ret); + } + return ret; +} + +static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + int error; + struct inode *inode = old_dentry->d_inode; + struct buffer_head *old_bh = NULL; + struct inode *new_orphan_inode = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + &new_orphan_inode); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_inode_lock(inode, &old_bh, 1); + if (error) { + mlog_errno(error); + goto out; + } + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + down_write(&OCFS2_I(inode)->ip_alloc_sem); + error = __ocfs2_reflink(old_dentry, old_bh, + new_orphan_inode, preserve); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + up_write(&OCFS2_I(inode)->ip_xattr_sem); + + ocfs2_inode_unlock(inode, 1); + brelse(old_bh); + + if (error) { + mlog_errno(error); + goto out; + } + + /* If the security isn't preserved, we need to re-initialize them. */ + if (!preserve) { + error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + &new_dentry->d_name); + if (error) + mlog_errno(error); + } +out: + if (!error) { + error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + new_dentry); + if (error) + mlog_errno(error); + } + + if (new_orphan_inode) { + /* + * We need to open_unlock the inode no matter whether we + * succeed or not, so that other nodes can delete it later. + */ + ocfs2_open_unlock(new_orphan_inode); + if (error) + iput(new_orphan_inode); + } + + return error; +} + +/* + * Below here are the bits used by OCFS2_IOC_REFLINK() to fake + * sys_reflink(). This will go away when vfs_reflink() exists in + * fs/namei.c. + */ + +/* copied from may_create in VFS. */ +static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* copied from user_path_parent. 
*/ +static int ocfs2_user_path_parent(const char __user *path, + struct nameidata *nd, char **name) +{ + char *s = getname(path); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = kern_path_parent(s, nd); + if (error) + putname(s); + else + *name = s; + + return error; +} + +/** + * ocfs2_vfs_reflink - Create a reference-counted link + * + * @old_dentry: source dentry + inode + * @dir: directory to create the target + * @new_dentry: target dentry + * @preserve: if true, preserve all file attributes + */ +static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + if (!inode) + return -ENOENT; + + error = ocfs2_may_create(dir, new_dentry); + if (error) + return error; + + if (dir->i_sb != inode->i_sb) + return -EXDEV; + + /* + * A reflink to an append-only or immutable file cannot be created. + */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + /* Only regular files can be reflinked. */ + if (!S_ISREG(inode->i_mode)) + return -EPERM; + + /* + * If the caller wants to preserve ownership, they require the + * rights to do so. + */ + if (preserve) { + if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN)) + return -EPERM; + if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) + return -EPERM; + } + + /* + * If the caller is modifying any aspect of the attributes, they + * are not creating a snapshot. They need read permission on the + * file. + */ + if (!preserve) { + error = inode_permission(inode, MAY_READ); + if (error) + return error; + } + + mutex_lock(&inode->i_mutex); + dquot_initialize(dir); + error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); + mutex_unlock(&inode->i_mutex); + if (!error) + fsnotify_create(dir, new_dentry); + return error; +} +/* + * Most codes are copied from sys_linkat. + */ +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve) +{ + struct dentry *new_dentry; + struct nameidata nd; + struct path old_path; + int error; + char *to = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = user_path_at(AT_FDCWD, oldname, 0, &old_path); + if (error) { + mlog_errno(error); + return error; + } + + error = ocfs2_user_path_parent(newname, &nd, &to); + if (error) { + mlog_errno(error); + goto out; + } + + error = -EXDEV; + if (old_path.mnt != nd.path.mnt) + goto out_release; + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) { + mlog_errno(error); + goto out_unlock; + } + + error = mnt_want_write(nd.path.mnt); + if (error) { + mlog_errno(error); + goto out_dput; + } + + error = ocfs2_vfs_reflink(old_path.dentry, + nd.path.dentry->d_inode, + new_dentry, preserve); + mnt_drop_write(nd.path.mnt); +out_dput: + dput(new_dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +out_release: + path_put(&nd.path); + putname(to); +out: + path_put(&old_path); + + return error; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h new file mode 100644 index 00000000..7754608c --- /dev/null +++ b/fs/ocfs2/refcounttree.h @@ -0,0 +1,118 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.h + * + * Copyright (C) 2009 Oracle. All rights reserved. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_REFCOUNTTREE_H
+#define OCFS2_REFCOUNTTREE_H
+
+struct ocfs2_refcount_tree {
+ struct rb_node rf_node;
+ u64 rf_blkno;
+ u32 rf_generation;
+ struct kref rf_getcnt;
+ struct rw_semaphore rf_sem;
+ struct ocfs2_lock_res rf_lockres;
+ int rf_removed;
+
+ /* the following 4 fields are used by caching_info. */
+ spinlock_t rf_lock;
+ struct ocfs2_caching_info rf_ci;
+ struct mutex rf_io_mutex;
+ struct super_block *rf_sb;
+};
+
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+ struct ocfs2_refcount_tree **tree,
+ struct buffer_head **ref_bh);
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree,
+ int rw);
+
+int ocfs2_decrease_refcount(struct inode *inode,
+ handle_t *handle, u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete);
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+ u64 refcount_loc,
+ u64 phys_blkno,
+ u32 clusters,
+ int *credits,
+ int *ref_blocks);
+int ocfs2_refcount_cow(struct inode *inode,
+ struct file *filep, struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos);
+
+typedef int (ocfs2_post_refcount_func)(struct inode *inode,
+ handle_t *handle,
+ void *para);
+/*
+ * Some refcount callers need to do more work after we modify the data b-tree
+ * during a refcount operation (including CoW and setting the refcount flag),
+ * and that work must be part of the same transaction. Such callers hand us
+ * this structure so that we can call back into them before the transaction
+ * is committed.
+ */
+struct ocfs2_post_refcount {
+ int credits; /* journal credits the callback needs. */
+ ocfs2_post_refcount_func *func; /* the callback to run.
*/ + void *para; +}; + +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits); +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post); +int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +int ocfs2_cow_sync_writeback(struct super_block *sb, + struct inode *inode, + u32 cpos, u32 num_clusters); +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post); +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh); +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve); +#endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c new file mode 100644 index 00000000..41ffd36c --- /dev/null +++ b/fs/ocfs2/reservations.c @@ -0,0 +1,839 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.c + * + * Allocation reservations implementation + * + * Some code borrowed from fs/ext3/balloc.c and is: + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * The rest is copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "ocfs2_trace.h" + +#ifdef CONFIG_OCFS2_DEBUG_FS +#define OCFS2_CHECK_RESERVATIONS +#endif + +DEFINE_SPINLOCK(resv_lock); + +#define OCFS2_MIN_RESV_WINDOW_BITS 8 +#define OCFS2_MAX_RESV_WINDOW_BITS 1024 + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) +{ + return (osb->osb_resv_level && osb->osb_dir_resv_level); +} + +static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + struct ocfs2_super *osb = resmap->m_osb; + unsigned int bits; + + if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) { + /* 8, 16, 32, 64, 128, 256, 512, 1024 */ + bits = 4 << osb->osb_resv_level; + } else { + bits = 4 << osb->osb_dir_resv_level; + } + return bits; +} + +static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_len) + return resv->r_start + resv->r_len - 1; + return resv->r_start; +} + +static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv) +{ + return !!(resv->r_len == 0); +} + +static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap) +{ + if (resmap->m_osb->osb_resv_level == 0) + return 1; + return 0; +} + +static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap) +{ + struct ocfs2_super *osb = resmap->m_osb; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + int i = 0; + + mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n", + osb->dev_str, resmap->m_bitmap_len); + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u" + "\tlast_len: %u\n", resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + node = rb_next(node); + i++; + } + + mlog(ML_NOTICE, "%d reservations found. 
LRU follows\n", i); + + i = 0; + list_for_each_entry(resv, &resmap->m_lru, r_lru) { + mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t" + "last_start: %u\tlast_len: %u\n", i, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + i++; + } +} + +#ifdef OCFS2_CHECK_RESERVATIONS +static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap, + int i, + struct ocfs2_alloc_reservation *resv) +{ + char *disk_bitmap = resmap->m_disk_bitmap; + unsigned int start = resv->r_start; + unsigned int end = ocfs2_resv_end(resv); + + while (start <= end) { + if (ocfs2_test_bit(start, disk_bitmap)) { + mlog(ML_ERROR, + "reservation %d covers an allocated area " + "starting at bit %u!\n", i, start); + return 1; + } + + start++; + } + return 0; +} + +static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + unsigned int off = 0; + int i = 0; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (i > 0 && resv->r_start <= off) { + mlog(ML_ERROR, "reservation %d has bad start off!\n", + i); + goto bad; + } + + if (resv->r_len == 0) { + mlog(ML_ERROR, "reservation %d has no length!\n", + i); + goto bad; + } + + if (resv->r_start > ocfs2_resv_end(resv)) { + mlog(ML_ERROR, "reservation %d has invalid range!\n", + i); + goto bad; + } + + if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) { + mlog(ML_ERROR, "reservation %d extends past bitmap!\n", + i); + goto bad; + } + + if (ocfs2_validate_resmap_bits(resmap, i, resv)) + goto bad; + + off = ocfs2_resv_end(resv); + node = rb_next(node); + + i++; + } + return; + +bad: + ocfs2_dump_resv(resmap); + BUG(); +} +#else +static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + +} +#endif + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv) +{ + memset(resv, 0, sizeof(*resv)); + INIT_LIST_HEAD(&resv->r_lru); +} + +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags) +{ + BUG_ON(flags & ~OCFS2_RESV_TYPES); + + resv->r_flags |= flags; +} + +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap) +{ + memset(resmap, 0, sizeof(*resmap)); + + resmap->m_osb = osb; + resmap->m_reservations = RB_ROOT; + /* m_bitmap_len is initialized to zero by the above memset. */ + INIT_LIST_HEAD(&resmap->m_lru); + + return 0; +} + +static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + if (!list_empty(&resv->r_lru)) + list_del_init(&resv->r_lru); + + list_add_tail(&resv->r_lru, &resmap->m_lru); +} + +static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv) +{ + resv->r_len = 0; + resv->r_start = 0; +} + +static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) { + list_del_init(&resv->r_lru); + rb_erase(&resv->r_node, &resmap->m_reservations); + resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE; + } +} + +static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + __ocfs2_resv_trunc(resv); + /* + * last_len and last_start no longer make sense if + * we're changing the range of our allocations. 
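+ * Clearing them also keeps ocfs2_resv_find_window() from seeding its
+ * search goal from a window that no longer applies.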
+ */ + resv->r_last_len = resv->r_last_start = 0; + + ocfs2_resv_remove(resmap, resv); +} + +/* does nothing if 'resv' is null */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv) { + spin_lock(&resv_lock); + __ocfs2_resv_discard(resmap, resv); + spin_unlock(&resv_lock); + } +} + +static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap) +{ + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + assert_spin_locked(&resv_lock); + + while ((node = rb_last(&resmap->m_reservations)) != NULL) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + __ocfs2_resv_discard(resmap, resv); + } +} + +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap) +{ + if (ocfs2_resmap_disabled(resmap)) + return; + + spin_lock(&resv_lock); + + ocfs2_resmap_clear_all_resv(resmap); + resmap->m_bitmap_len = clen; + resmap->m_disk_bitmap = disk_bitmap; + + spin_unlock(&resv_lock); +} + +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap) +{ + /* Does nothing for now. Keep this around for API symmetry */ +} + +static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *new) +{ + struct rb_root *root = &resmap->m_reservations; + struct rb_node *parent = NULL; + struct rb_node **p = &root->rb_node; + struct ocfs2_alloc_reservation *tmp; + + assert_spin_locked(&resv_lock); + + trace_ocfs2_resv_insert(new->r_start, new->r_len); + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node); + + if (new->r_start < tmp->r_start) { + p = &(*p)->rb_left; + + /* + * This is a good place to check for + * overlapping reservations. + */ + BUG_ON(ocfs2_resv_end(new) >= tmp->r_start); + } else if (new->r_start > ocfs2_resv_end(tmp)) { + p = &(*p)->rb_right; + } else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate reservation window!\n"); + BUG(); + } + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, root); + new->r_flags |= OCFS2_RESV_FLAG_INUSE; + + ocfs2_resv_mark_lru(resmap, new); + + ocfs2_check_resmap(resmap); +} + +/** + * ocfs2_find_resv_lhs() - find the window which contains goal + * @resmap: reservation map to search + * @goal: which bit to search for + * + * If a window containing that goal is not found, we return the window + * which comes before goal. Returns NULL on empty rbtree or no window + * before goal. + */ +static struct ocfs2_alloc_reservation * +ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal) +{ + struct ocfs2_alloc_reservation *resv = NULL; + struct ocfs2_alloc_reservation *prev_resv = NULL; + struct rb_node *node = resmap->m_reservations.rb_node; + + assert_spin_locked(&resv_lock); + + if (!node) + return NULL; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal) + break; + + /* Check if we overshot the reservation just before goal? */ + if (resv->r_start > goal) { + resv = prev_resv; + break; + } + + prev_resv = resv; + node = rb_next(node); + } + + return resv; +} + +/* + * We are given a range within the bitmap, which corresponds to a gap + * inside the reservations tree (search_start, search_len). The range + * can be anything from the whole bitmap, to a gap between + * reservations. + * + * The start value of *rstart is insignificant. 
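+ *
+ * (A purely hypothetical illustration: if the tree holds windows at
+ * bits 0..63 and 200..263 of a 1024 bit bitmap, the gaps a caller may
+ * pass in are 64..199 and 264..1023.)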
+ * + * This function searches the bitmap range starting at search_start + * with length search_len for a set of contiguous free bits. We try + * to find up to 'wanted' bits, but can sometimes return less. + * + * Returns the length of allocation, 0 if no free bits are found. + * + * *cstart and *clen will also be populated with the result. + */ +static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, + unsigned int wanted, + unsigned int search_start, + unsigned int search_len, + unsigned int *rstart, + unsigned int *rlen) +{ + void *bitmap = resmap->m_disk_bitmap; + unsigned int best_start, best_len = 0; + int offset, start, found; + + trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len, + wanted, resmap->m_bitmap_len); + + found = best_start = best_len = 0; + + start = search_start; + while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, + start)) != -1) { + /* Search reached end of the region */ + if (offset >= (search_start + search_len)) + break; + + if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_len) { + best_len = found; + best_start = start - found; + } + + if (found >= wanted) + break; + } + + if (best_len == 0) + return 0; + + if (best_len >= wanted) + best_len = wanted; + + *rlen = best_len; + *rstart = best_start; + + trace_ocfs2_resmap_find_free_bits_end(best_start, best_len); + + return *rlen; +} + +static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int goal, unsigned int wanted) +{ + struct rb_root *root = &resmap->m_reservations; + unsigned int gap_start, gap_end, gap_len; + struct ocfs2_alloc_reservation *prev_resv, *next_resv; + struct rb_node *prev, *next; + unsigned int cstart, clen; + unsigned int best_start = 0, best_len = 0; + + /* + * Nasty cases to consider: + * + * - rbtree is empty + * - our window should be first in all reservations + * - our window should be last in all reservations + * - need to make sure we don't go past end of bitmap + */ + trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv), + goal, wanted, RB_EMPTY_ROOT(root)); + + assert_spin_locked(&resv_lock); + + if (RB_EMPTY_ROOT(root)) { + /* + * Easiest case - empty tree. We can just take + * whatever window of free bits we want. + */ + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + resmap->m_bitmap_len - goal, + &cstart, &clen); + + /* + * This should never happen - the local alloc window + * will always have free bits when we're called. + */ + BUG_ON(goal == 0 && clen == 0); + + if (clen == 0) + return; + + resv->r_start = cstart; + resv->r_len = clen; + + ocfs2_resv_insert(resmap, resv); + return; + } + + prev_resv = ocfs2_find_resv_lhs(resmap, goal); + + if (prev_resv == NULL) { + /* + * A NULL here means that the search code couldn't + * find a window that starts before goal. + * + * However, we can take the first window after goal, + * which is also by definition, the leftmost window in + * the entire tree. If we can find free bits in the + * gap between goal and the LHS window, then the + * reservation can safely be placed there. + * + * Otherwise we fall back to a linear search, checking + * the gaps in between windows for a place to + * allocate. 
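+ * (The linear scan below starts from that leftmost window: prev_resv
+ * is pointed at it before the loop.)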
+ */ + + next = rb_first(root); + next_resv = rb_entry(next, struct ocfs2_alloc_reservation, + r_node); + + /* + * The search should never return such a window. (see + * comment above + */ + if (next_resv->r_start <= goal) { + mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n", + goal, next_resv->r_start, next_resv->r_len); + ocfs2_dump_resv(resmap); + BUG(); + } + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + next_resv->r_start - goal, + &cstart, &clen); + if (clen) { + best_len = clen; + best_start = cstart; + if (best_len == wanted) + goto out_insert; + } + + prev_resv = next_resv; + next_resv = NULL; + } + + trace_ocfs2_resv_find_window_prev(prev_resv->r_start, + ocfs2_resv_end(prev_resv)); + + prev = &prev_resv->r_node; + + /* Now we do a linear search for a window, starting at 'prev_rsv' */ + while (1) { + next = rb_next(prev); + if (next) { + next_resv = rb_entry(next, + struct ocfs2_alloc_reservation, + r_node); + + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_end = next_resv->r_start - 1; + gap_len = gap_end - gap_start + 1; + } else { + /* + * We're at the rightmost edge of the + * tree. See if a reservation between this + * window and the end of the bitmap will work. + */ + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_len = resmap->m_bitmap_len - gap_start; + gap_end = resmap->m_bitmap_len - 1; + } + + trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1, + next ? ocfs2_resv_end(next_resv) : -1); + /* + * No need to check this gap if we have already found + * a larger region of free bits. + */ + if (gap_len <= best_len) + goto next_resv; + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start, + gap_len, &cstart, &clen); + if (clen == wanted) { + best_len = clen; + best_start = cstart; + goto out_insert; + } else if (clen > best_len) { + best_len = clen; + best_start = cstart; + } + +next_resv: + if (!next) + break; + + prev = next; + prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation, + r_node); + } + +out_insert: + if (best_len) { + resv->r_start = best_start; + resv->r_len = best_len; + ocfs2_resv_insert(resmap, resv); + } +} + +static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + struct ocfs2_alloc_reservation *lru_resv; + int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP); + unsigned int min_bits; + + if (!tmpwindow) + min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1; + else + min_bits = wanted; /* We at know the temp window will use all + * of these bits */ + + /* + * Take the first reservation off the LRU as our 'target'. We + * don't try to be smart about it. There might be a case for + * searching based on size but I don't have enough data to be + * sure. --Mark (3/16/2010) + */ + lru_resv = list_first_entry(&resmap->m_lru, + struct ocfs2_alloc_reservation, r_lru); + + trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start, + lru_resv->r_len, + ocfs2_resv_end(lru_resv)); + + /* + * Cannibalize (some or all) of the target reservation and + * feed it to the current window. + */ + if (lru_resv->r_len <= min_bits) { + /* + * Discard completely if size is less than or equal to a + * reasonable threshold - 50% of window bits for non temporary + * windows. 
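+ * (With the default resv level of 2, for instance, a regular file
+ * window is 4 << 2 = 16 bits, so an LRU entry of 8 bits or fewer is
+ * consumed whole.)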
+ */ + resv->r_start = lru_resv->r_start; + resv->r_len = lru_resv->r_len; + + __ocfs2_resv_discard(resmap, lru_resv); + } else { + unsigned int shrink; + if (tmpwindow) + shrink = min_bits; + else + shrink = lru_resv->r_len / 2; + + lru_resv->r_len -= shrink; + + resv->r_start = ocfs2_resv_end(lru_resv) + 1; + resv->r_len = shrink; + } + + trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, + resv->r_last_len); + + ocfs2_resv_insert(resmap, resv); +} + +static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + unsigned int goal = 0; + + BUG_ON(!ocfs2_resv_empty(resv)); + + /* + * Begin by trying to get a window as close to the previous + * one as possible. Using the most recent allocation as a + * start goal makes sense. + */ + if (resv->r_last_len) { + goal = resv->r_last_start + resv->r_last_len; + if (goal >= resmap->m_bitmap_len) + goal = 0; + } + + __ocfs2_resv_find_window(resmap, resv, goal, wanted); + + /* Search from last alloc didn't work, try once more from beginning. */ + if (ocfs2_resv_empty(resv) && goal != 0) + __ocfs2_resv_find_window(resmap, resv, 0, wanted); + + if (ocfs2_resv_empty(resv)) { + /* + * Still empty? Pull oldest one off the LRU, remove it from + * tree, put this one in it's place. + */ + ocfs2_cannibalize_resv(resmap, resv, wanted); + } + + BUG_ON(ocfs2_resv_empty(resv)); +} + +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen) +{ + if (resv == NULL || ocfs2_resmap_disabled(resmap)) + return -ENOSPC; + + spin_lock(&resv_lock); + + if (ocfs2_resv_empty(resv)) { + /* + * We don't want to over-allocate for temporary + * windows. Otherwise, we run the risk of fragmenting the + * allocation space. + */ + unsigned int wanted = ocfs2_resv_window_bits(resmap, resv); + + if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) + wanted = *clen; + + /* + * Try to get a window here. If it works, we must fall + * through and test the bitmap . This avoids some + * ping-ponging of windows due to non-reserved space + * being allocation before we initialize a window for + * that inode. + */ + ocfs2_resv_find_window(resmap, resv, wanted); + trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len); + } + + BUG_ON(ocfs2_resv_empty(resv)); + + *cstart = resv->r_start; + *clen = resv->r_len; + + spin_unlock(&resv_lock); + return 0; +} + +static void + ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int start, unsigned int end) +{ + unsigned int rhs = 0; + unsigned int old_end = ocfs2_resv_end(resv); + + BUG_ON(start != resv->r_start || old_end < end); + + /* + * Completely used? We can remove it then. + */ + if (old_end == end) { + __ocfs2_resv_discard(resmap, resv); + return; + } + + rhs = old_end - end; + + /* + * This should have been trapped above. 
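+ * (The old_end == end case already returned through the discard path,
+ * so a zero rhs would mean the bounds checks above are broken.)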
+ */ + BUG_ON(rhs == 0); + + resv->r_start = end + 1; + resv->r_len = old_end - resv->r_start + 1; +} + +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen) +{ + unsigned int cend = cstart + clen - 1; + + if (resmap == NULL || ocfs2_resmap_disabled(resmap)) + return; + + if (resv == NULL) + return; + + BUG_ON(cstart != resv->r_start); + + spin_lock(&resv_lock); + + trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, + resv->r_last_len); + + BUG_ON(cstart < resv->r_start); + BUG_ON(cstart > ocfs2_resv_end(resv)); + BUG_ON(cend > ocfs2_resv_end(resv)); + + ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend); + resv->r_last_start = cstart; + resv->r_last_len = clen; + + /* + * May have been discarded above from + * ocfs2_adjust_resv_from_alloc(). + */ + if (!ocfs2_resv_empty(resv)) + ocfs2_resv_mark_lru(resmap, resv); + + trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, + resv->r_last_len); + + ocfs2_check_resmap(resmap); + + spin_unlock(&resv_lock); +} diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h new file mode 100644 index 00000000..42c2b804 --- /dev/null +++ b/fs/ocfs2/reservations.h @@ -0,0 +1,159 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.h + * + * Allocation reservations function prototypes and structures. + * + * Copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_RESERVATIONS_H +#define OCFS2_RESERVATIONS_H + +#include + +#define OCFS2_DEFAULT_RESV_LEVEL 2 +#define OCFS2_MAX_RESV_LEVEL 9 +#define OCFS2_MIN_RESV_LEVEL 0 + +struct ocfs2_alloc_reservation { + struct rb_node r_node; + + unsigned int r_start; /* Beginning of current window */ + unsigned int r_len; /* Length of the window */ + + unsigned int r_last_len; /* Length of most recent alloc */ + unsigned int r_last_start; /* Start of most recent alloc */ + struct list_head r_lru; /* LRU list head */ + + unsigned int r_flags; +}; + +#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ +#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be + * destroyed immedately after use */ +#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed + * directory btree */ + +struct ocfs2_reservation_map { + struct rb_root m_reservations; + char *m_disk_bitmap; + + struct ocfs2_super *m_osb; + + /* The following are not initialized to meaningful values until a disk + * bitmap is provided. */ + u32 m_bitmap_len; /* Number of valid + * bits available */ + + struct list_head m_lru; /* LRU of reservations + * structures. 
*/ + +}; + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv); + +#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR) +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags); + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb); + +/** + * ocfs2_resv_discard() - truncate a reservation + * @resmap: + * @resv: the reservation to truncate. + * + * After this function is called, the reservation will be empty, and + * unlinked from the rbtree. + */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv); + + +/** + * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @resmap: struct ocfs2_reservation_map to initialize + * @obj: unused for now + * @ops: unused for now + * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) + * + * Only possible return value other than '0' is -ENOMEM for failure to + * allocation mirror bitmap. + */ +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_restart() - "restart" a reservation bitmap + * @resmap: reservations bitmap + * @clen: Number of valid bits in the bitmap + * @disk_bitmap: the disk bitmap this resmap should refer to. + * + * Re-initialize the parameters of a reservation bitmap. This is + * useful for local alloc window slides. + * + * This function will call ocfs2_trunc_resv against all existing + * reservations. A future version will recalculate existing + * reservations based on the new bitmap. + */ +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap); + +/** + * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure + * @resmap: the struct ocfs2_reservation_map to uninitialize + */ +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_resv_bits() - Return still-valid reservation bits + * @resmap: reservations bitmap + * @resv: reservation to base search from + * @cstart: start of proposed allocation + * @clen: length (in clusters) of proposed allocation + * + * Using the reservation data from resv, this function will compare + * resmap and resmap->m_disk_bitmap to determine what part (if any) of + * the reservation window is still clear to use. If resv is empty, + * this function will try to allocate a window for it. + * + * On success, zero is returned and the valid allocation area is set in cstart + * and clen. + * + * Returns -ENOSPC if reservations are disabled. + */ +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen); + +/** + * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. + * @resmap: reservations bitmap + * @resv: optional reservation to recalulate based on new bitmap + * @cstart: start of allocation in clusters + * @clen: end of allocation in clusters. + * + * Tell the reservation code that bits were used to fulfill allocation in + * resmap. The bits don't have to have been part of any existing + * reservation. But we must always call this function when bits are claimed. + * Internally, the reservations code will use this information to mark the + * reservations bitmap. If resv is passed, it's next allocation window will be + * calculated. It also expects that 'cstart' is the same as we passed back + * from ocfs2_resmap_resv_bits(). 
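+ *
+ * A rough calling sequence (an illustrative sketch only, not copied
+ * from any in-tree caller; the local variable names are made up):
+ *
+ *   clen = wanted;
+ *   if (ocfs2_resmap_resv_bits(resmap, resv, &cstart, &clen) == 0) {
+ *           <set bits cstart .. cstart + clen - 1 in the disk bitmap>
+ *           ocfs2_resmap_claimed_bits(resmap, resv, cstart, clen);
+ *   }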
+ */ +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen); + +#endif /* OCFS2_RESERVATIONS_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c new file mode 100644 index 00000000..ec55add7 --- /dev/null +++ b/fs/ocfs2/resize.c @@ -0,0 +1,589 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * resize.c + * + * volume resize. + * Inspired by ext3/resize.c. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" +#include "suballoc.h" +#include "resize.h" + +/* + * Check whether there are new backup superblocks exist + * in the last group. If there are some, mark them or clear + * them in the bitmap. + * + * Return how many backups we find in the last group. + */ +static u16 ocfs2_calc_new_backup_super(struct inode *inode, + struct ocfs2_group_desc *gd, + int new_clusters, + u32 first_new_cluster, + u16 cl_cpg, + int set) +{ + int i; + u16 backups = 0; + u32 cluster; + u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno); + + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + blkno = ocfs2_backup_super_blkno(inode->i_sb, i); + cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + gd_blkno = ocfs2_which_cluster_group(inode, cluster); + if (gd_blkno < lgd_blkno) + continue; + else if (gd_blkno > lgd_blkno) + break; + + if (set) + ocfs2_set_bit(cluster % cl_cpg, + (unsigned long *)gd->bg_bitmap); + else + ocfs2_clear_bit(cluster % cl_cpg, + (unsigned long *)gd->bg_bitmap); + backups++; + } + + return backups; +} + +static int ocfs2_update_last_group_and_inode(handle_t *handle, + struct inode *bm_inode, + struct buffer_head *bm_bh, + struct buffer_head *group_bh, + u32 first_new_cluster, + int new_clusters) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb); + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct ocfs2_chain_rec *cr; + struct ocfs2_group_desc *group; + u16 chain, num_bits, backups = 0; + u16 cl_bpc = le16_to_cpu(cl->cl_bpc); + u16 cl_cpg = le16_to_cpu(cl->cl_cpg); + + trace_ocfs2_update_last_group_and_inode(new_clusters, + first_new_cluster); + + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + group = (struct ocfs2_group_desc *)group_bh->b_data; + + /* update the group first. 
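+ * Each new cluster contributes cl_bpc bits to this group's bitmap.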
*/ + num_bits = new_clusters * cl_bpc; + le16_add_cpu(&group->bg_bits, num_bits); + le16_add_cpu(&group->bg_free_bits_count, num_bits); + + /* + * check whether there are some new backup superblocks exist in + * this group and update the group bitmap accordingly. + */ + if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_COMPAT_BACKUP_SB)) { + backups = ocfs2_calc_new_backup_super(bm_inode, + group, + new_clusters, + first_new_cluster, + cl_cpg, 1); + le16_add_cpu(&group->bg_free_bits_count, -1 * backups); + } + + ocfs2_journal_dirty(handle, group_bh); + + /* update the inode accordingly. */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_rollback; + } + + chain = le16_to_cpu(group->bg_chain); + cr = (&cl->cl_recs[chain]); + le32_add_cpu(&cr->c_total, num_bits); + le32_add_cpu(&cr->c_free, num_bits); + le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits); + le32_add_cpu(&fe->i_clusters, new_clusters); + + if (backups) { + le32_add_cpu(&cr->c_free, -1 * backups); + le32_add_cpu(&fe->id1.bitmap1.i_used, backups); + } + + spin_lock(&OCFS2_I(bm_inode)->ip_lock); + OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); + spin_unlock(&OCFS2_I(bm_inode)->ip_lock); + i_size_write(bm_inode, le64_to_cpu(fe->i_size)); + + ocfs2_journal_dirty(handle, bm_bh); + +out_rollback: + if (ret < 0) { + ocfs2_calc_new_backup_super(bm_inode, + group, + new_clusters, + first_new_cluster, + cl_cpg, 0); + le16_add_cpu(&group->bg_free_bits_count, backups); + le16_add_cpu(&group->bg_bits, -1 * num_bits); + le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); + } +out: + if (ret) + mlog_errno(ret); + return ret; +} + +static int update_backups(struct inode * inode, u32 clusters, char *data) +{ + int i, ret = 0; + u32 cluster; + u64 blkno; + struct buffer_head *backup = NULL; + struct ocfs2_dinode *backup_di = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* calculate the real backups we need to update. */ + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + blkno = ocfs2_backup_super_blkno(inode->i_sb, i); + cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + if (cluster > clusters) + break; + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); + if (ret < 0) { + mlog_errno(ret); + break; + } + + memcpy(backup->b_data, data, inode->i_sb->s_blocksize); + + backup_di = (struct ocfs2_dinode *)backup->b_data; + backup_di->i_blkno = cpu_to_le64(blkno); + + ret = ocfs2_write_super_or_backup(osb, backup); + brelse(backup); + backup = NULL; + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static void ocfs2_update_super_and_backups(struct inode *inode, + int new_clusters) +{ + int ret; + u32 clusters = 0; + struct buffer_head *super_bh = NULL; + struct ocfs2_dinode *super_di = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* + * update the superblock last. + * It doesn't matter if the write failed. 
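+ * If it does fail we only warn; the printk below asks the
+ * administrator to run fsck.ocfs2, which can fix it up afterwards.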
+ */ + ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1, + &super_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + super_di = (struct ocfs2_dinode *)super_bh->b_data; + le32_add_cpu(&super_di->i_clusters, new_clusters); + clusters = le32_to_cpu(super_di->i_clusters); + + ret = ocfs2_write_super_or_backup(osb, super_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB)) + ret = update_backups(inode, clusters, super_bh->b_data); + +out: + brelse(super_bh); + if (ret) + printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s" + " during fs resize. This condition is not fatal," + " but fsck.ocfs2 should be run to fix it\n", + osb->dev_str); + return; +} + +/* + * Extend the filesystem to the new number of clusters specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. + */ +int ocfs2_group_extend(struct inode * inode, int new_clusters) +{ + int ret; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct buffer_head *group_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_group_desc *group = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 cl_bpc; + u32 first_new_cluster; + u64 lgd_blkno; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + if (new_clusters < 0) + return -EINVAL; + else if (new_clusters == 0) + return 0; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + + /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(), + * so any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { + mlog(ML_ERROR, "The disk is too old and small. " + "Force to do offline resize."); + ret = -EINVAL; + goto out_unlock; + } + + first_new_cluster = le32_to_cpu(fe->i_clusters); + lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, + first_new_cluster - 1); + + ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno, + &group_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + group = (struct ocfs2_group_desc *)group_bh->b_data; + + cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); + if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > + le16_to_cpu(fe->id2.i_chain.cl_cpg)) { + ret = -EINVAL; + goto out_unlock; + } + + + trace_ocfs2_group_extend( + (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters); + + handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + ret = -EINVAL; + goto out_unlock; + } + + /* update the last group descriptor and inode. 
*/ + ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode, + main_bm_bh, group_bh, + first_new_cluster, + new_clusters); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_update_super_and_backups(main_bm_inode, new_clusters); + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + brelse(group_bh); + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); + +out: + return ret; +} + +static int ocfs2_check_new_group(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_new_group_input *input, + struct buffer_head *group_bh) +{ + int ret; + struct ocfs2_group_desc *gd = + (struct ocfs2_group_desc *)group_bh->b_data; + u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); + + ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh); + if (ret) + goto out; + + ret = -EINVAL; + if (le16_to_cpu(gd->bg_chain) != input->chain) + mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " + "while input has %u set.\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_chain), input->chain); + else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc) + mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " + "input has %u clusters set\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), input->clusters); + else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc) + mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u " + "but it should have %u set\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + input->frees * cl_bpc); + else + ret = 0; + +out: + return ret; +} + +static int ocfs2_verify_group_and_input(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_new_group_input *input, + struct buffer_head *group_bh) +{ + u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count); + u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); + u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec); + u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group); + u32 total_clusters = le32_to_cpu(di->i_clusters); + int ret = -EINVAL; + + if (cluster < total_clusters) + mlog(ML_ERROR, "add a group which is in the current volume.\n"); + else if (input->chain >= cl_count) + mlog(ML_ERROR, "input chain exceeds the limit.\n"); + else if (next_free != cl_count && next_free != input->chain) + mlog(ML_ERROR, + "the add group should be in chain %u\n", next_free); + else if (total_clusters + input->clusters < total_clusters) + mlog(ML_ERROR, "add group's clusters overflow.\n"); + else if (input->clusters > cl_cpg) + mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n"); + else if (input->frees > input->clusters) + mlog(ML_ERROR, "the free cluster exceeds the total clusters\n"); + else if (total_clusters % cl_cpg != 0) + mlog(ML_ERROR, + "the last group isn't full. Use group extend first.\n"); + else if (input->group != ocfs2_which_cluster_group(inode, cluster)) + mlog(ML_ERROR, "group blkno is invalid\n"); + else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh))) + mlog(ML_ERROR, "group descriptor check failed.\n"); + else + ret = 0; + + return ret; +} + +/* Add a new group descriptor to global_bitmap. 
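+ * The descriptor block itself is expected to have been written to disk
+ * already (typically by the userspace resize tool); here we validate it
+ * and splice it into the requested chain.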
*/ +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) +{ + int ret; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group = NULL; + struct ocfs2_chain_list *cl; + struct ocfs2_chain_rec *cr; + u16 cl_bpc; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { + mlog(ML_ERROR, "The disk is too old and small." + " Force to do offline resize."); + ret = -EINVAL; + goto out_unlock; + } + + ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh); + if (ret < 0) { + mlog(ML_ERROR, "Can't read the group descriptor # %llu " + "from the device.", (unsigned long long)input->group); + goto out_unlock; + } + + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh); + + ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + trace_ocfs2_group_add((unsigned long long)input->group, + input->chain, input->clusters, input->frees); + + handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + ret = -EINVAL; + goto out_unlock; + } + + cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); + cl = &fe->id2.i_chain; + cr = &cl->cl_recs[input->chain]; + + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + group = (struct ocfs2_group_desc *)group_bh->b_data; + group->bg_next_group = cr->c_blkno; + ocfs2_journal_dirty(handle, group_bh); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), + main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) { + le16_add_cpu(&cl->cl_next_free_rec, 1); + memset(cr, 0, sizeof(struct ocfs2_chain_rec)); + } + + cr->c_blkno = cpu_to_le64(input->group); + le32_add_cpu(&cr->c_total, input->clusters * cl_bpc); + le32_add_cpu(&cr->c_free, input->frees * cl_bpc); + + le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc); + le32_add_cpu(&fe->id1.bitmap1.i_used, + (input->clusters - input->frees) * cl_bpc); + le32_add_cpu(&fe->i_clusters, input->clusters); + + ocfs2_journal_dirty(handle, main_bm_bh); + + spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); + OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); + spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); + i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); + + ocfs2_update_super_and_backups(main_bm_inode, input->clusters); + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + brelse(group_bh); + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + 
iput(main_bm_inode); + +out: + return ret; +} diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h new file mode 100644 index 00000000..f38841ab --- /dev/null +++ b/fs/ocfs2/resize.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * resize.h + * + * Function prototypes + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_RESIZE_H +#define OCFS2_RESIZE_H + +int ocfs2_group_extend(struct inode * inode, int new_clusters); +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input); + +#endif /* OCFS2_RESIZE_H */ diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c new file mode 100644 index 00000000..26fc0014 --- /dev/null +++ b/fs/ocfs2/slot_map.c @@ -0,0 +1,538 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slot_map.c + * + * + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + + +struct ocfs2_slot { + int sl_valid; + unsigned int sl_node_num; +}; + +struct ocfs2_slot_info { + int si_extended; + int si_slots_per_block; + struct inode *si_inode; + unsigned int si_blocks; + struct buffer_head **si_bh; + unsigned int si_num_slots; + struct ocfs2_slot *si_slots; +}; + + +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + unsigned int node_num); + +static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si, + int slot_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + si->si_slots[slot_num].sl_valid = 0; +} + +static void ocfs2_set_slot(struct ocfs2_slot_info *si, + int slot_num, unsigned int node_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + + si->si_slots[slot_num].sl_valid = 1; + si->si_slots[slot_num].sl_node_num = node_num; +} + +/* This version is for the extended slot map */ +static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si) +{ + int b, i, slotno; + struct ocfs2_slot_map_extended *se; + + slotno = 0; + for (b = 0; b < si->si_blocks; b++) { + se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data; + for (i = 0; + (i < si->si_slots_per_block) && + (slotno < si->si_num_slots); + i++, slotno++) { + if (se->se_slots[i].es_valid) + ocfs2_set_slot(si, slotno, + le32_to_cpu(se->se_slots[i].es_node_num)); + else + ocfs2_invalidate_slot(si, slotno); + } + } +} + +/* + * Post the slot information on disk into our slot_info struct. + * Must be protected by osb_lock. + */ +static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si) +{ + int i; + struct ocfs2_slot_map *sm; + + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; + + for (i = 0; i < si->si_num_slots; i++) { + if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT) + ocfs2_invalidate_slot(si, i); + else + ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i])); + } +} + +static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) +{ + /* + * The slot data will have been refreshed when ocfs2_super_lock + * was taken. + */ + if (si->si_extended) + ocfs2_update_slot_info_extended(si); + else + ocfs2_update_slot_info_old(si); +} + +int ocfs2_refresh_slot_info(struct ocfs2_super *osb) +{ + int ret; + struct ocfs2_slot_info *si = osb->slot_info; + + if (si == NULL) + return 0; + + BUG_ON(si->si_blocks == 0); + BUG_ON(si->si_bh == NULL); + + trace_ocfs2_refresh_slot_info(si->si_blocks); + + /* + * We pass -1 as blocknr because we expect all of si->si_bh to + * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If + * this is not true, the read of -1 (UINT64_MAX) will fail. + */ + ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks, + si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL); + if (ret == 0) { + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + spin_unlock(&osb->osb_lock); + } + + return ret; +} + +/* post the our slot info stuff into it's destination bh and write it + * out. 
*/ +static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si, + int slot_num, + struct buffer_head **bh) +{ + int blkind = slot_num / si->si_slots_per_block; + int slotno = slot_num % si->si_slots_per_block; + struct ocfs2_slot_map_extended *se; + + BUG_ON(blkind >= si->si_blocks); + + se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data; + se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid; + if (si->si_slots[slot_num].sl_valid) + se->se_slots[slotno].es_node_num = + cpu_to_le32(si->si_slots[slot_num].sl_node_num); + *bh = si->si_bh[blkind]; +} + +static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si, + int slot_num, + struct buffer_head **bh) +{ + int i; + struct ocfs2_slot_map *sm; + + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; + for (i = 0; i < si->si_num_slots; i++) { + if (si->si_slots[i].sl_valid) + sm->sm_slots[i] = + cpu_to_le16(si->si_slots[i].sl_node_num); + else + sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT); + } + *bh = si->si_bh[0]; +} + +static int ocfs2_update_disk_slot(struct ocfs2_super *osb, + struct ocfs2_slot_info *si, + int slot_num) +{ + int status; + struct buffer_head *bh; + + spin_lock(&osb->osb_lock); + if (si->si_extended) + ocfs2_update_disk_slot_extended(si, slot_num, &bh); + else + ocfs2_update_disk_slot_old(si, slot_num, &bh); + spin_unlock(&osb->osb_lock); + + status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode)); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * Calculate how many bytes are needed by the slot map. Returns + * an error if the slot map file is too small. + */ +static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, + struct inode *inode, + unsigned long long *bytes) +{ + unsigned long long bytes_needed; + + if (ocfs2_uses_extended_slot_map(osb)) { + bytes_needed = osb->max_slots * + sizeof(struct ocfs2_extended_slot); + } else { + bytes_needed = osb->max_slots * sizeof(__le16); + } + if (bytes_needed > i_size_read(inode)) { + mlog(ML_ERROR, + "Slot map file is too small! (size %llu, needed %llu)\n", + i_size_read(inode), bytes_needed); + return -ENOSPC; + } + + *bytes = bytes_needed; + return 0; +} + +/* try to find global node in the slot info. Returns -ENOENT + * if nothing is found. 
*/ +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + unsigned int node_num) +{ + int i, ret = -ENOENT; + + for(i = 0; i < si->si_num_slots; i++) { + if (si->si_slots[i].sl_valid && + (node_num == si->si_slots[i].sl_node_num)) { + ret = i; + break; + } + } + + return ret; +} + +static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, + int preferred) +{ + int i, ret = -ENOSPC; + + if ((preferred >= 0) && (preferred < si->si_num_slots)) { + if (!si->si_slots[preferred].sl_valid) { + ret = preferred; + goto out; + } + } + + for(i = 0; i < si->si_num_slots; i++) { + if (!si->si_slots[i].sl_valid) { + ret = i; + break; + } + } +out: + return ret; +} + +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num) +{ + int slot; + struct ocfs2_slot_info *si = osb->slot_info; + + spin_lock(&osb->osb_lock); + slot = __ocfs2_node_num_to_slot(si, node_num); + spin_unlock(&osb->osb_lock); + + return slot; +} + +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + assert_spin_locked(&osb->osb_lock); + + BUG_ON(slot_num < 0); + BUG_ON(slot_num > osb->max_slots); + + if (!si->si_slots[slot_num].sl_valid) + return -ENOENT; + + *node_num = si->si_slots[slot_num].sl_node_num; + return 0; +} + +static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) +{ + unsigned int i; + + if (si == NULL) + return; + + if (si->si_inode) + iput(si->si_inode); + if (si->si_bh) { + for (i = 0; i < si->si_blocks; i++) { + if (si->si_bh[i]) { + brelse(si->si_bh[i]); + si->si_bh[i] = NULL; + } + } + kfree(si->si_bh); + } + + kfree(si); +} + +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + if (si == NULL) + return 0; + + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, slot_num); + spin_unlock(&osb->osb_lock); + + return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num); +} + +static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, + struct ocfs2_slot_info *si) +{ + int status = 0; + u64 blkno; + unsigned long long blocks, bytes = 0; + unsigned int i; + struct buffer_head *bh; + + status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes); + if (status) + goto bail; + + blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes); + BUG_ON(blocks > UINT_MAX); + si->si_blocks = blocks; + if (!si->si_blocks) + goto bail; + + if (si->si_extended) + si->si_slots_per_block = + (osb->sb->s_blocksize / + sizeof(struct ocfs2_extended_slot)); + else + si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16); + + /* The size checks above should ensure this */ + BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); + + trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); + + si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + GFP_KERNEL); + if (!si->si_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + for (i = 0; i < si->si_blocks; i++) { + status = ocfs2_extent_map_get_blocks(si->si_inode, i, + &blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i); + + bh = NULL; /* Acquire a fresh bh */ + status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno, + 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + si->si_bh[i] = bh; + } + +bail: + return status; +} + +int ocfs2_init_slot_info(struct ocfs2_super *osb) +{ + int status; + struct 
inode *inode = NULL; + struct ocfs2_slot_info *si; + + si = kzalloc(sizeof(struct ocfs2_slot_info) + + (sizeof(struct ocfs2_slot) * osb->max_slots), + GFP_KERNEL); + if (!si) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + si->si_extended = ocfs2_uses_extended_slot_map(osb); + si->si_num_slots = osb->max_slots; + si->si_slots = (struct ocfs2_slot *)((char *)si + + sizeof(struct ocfs2_slot_info)); + + inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + si->si_inode = inode; + status = ocfs2_map_slot_buffers(osb, si); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + osb->slot_info = (struct ocfs2_slot_info *)si; +bail: + if (status < 0 && si) + __ocfs2_free_slot_info(si); + + return status; +} + +void ocfs2_free_slot_info(struct ocfs2_super *osb) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + osb->slot_info = NULL; + __ocfs2_free_slot_info(si); +} + +int ocfs2_find_slot(struct ocfs2_super *osb) +{ + int status; + int slot; + struct ocfs2_slot_info *si; + + si = osb->slot_info; + + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot < 0) { + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (slot < 0) { + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", + slot); + + ocfs2_set_slot(si, slot, osb->node_num); + osb->slot_num = slot; + spin_unlock(&osb->osb_lock); + + trace_ocfs2_find_slot(osb->slot_num); + + status = ocfs2_update_disk_slot(osb, si, osb->slot_num); + if (status < 0) + mlog_errno(status); + +bail: + return status; +} + +void ocfs2_put_slot(struct ocfs2_super *osb) +{ + int status, slot_num; + struct ocfs2_slot_info *si = osb->slot_info; + + if (!si) + return; + + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + + slot_num = osb->slot_num; + ocfs2_invalidate_slot(si, osb->slot_num); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + + status = ocfs2_update_disk_slot(osb, si, slot_num); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + ocfs2_free_slot_info(osb); +} + diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h new file mode 100644 index 00000000..601c95fd --- /dev/null +++ b/fs/ocfs2/slot_map.h @@ -0,0 +1,44 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slotmap.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef SLOTMAP_H +#define SLOTMAP_H + +int ocfs2_init_slot_info(struct ocfs2_super *osb); +void ocfs2_free_slot_info(struct ocfs2_super *osb); + +int ocfs2_find_slot(struct ocfs2_super *osb); +void ocfs2_put_slot(struct ocfs2_super *osb); + +int ocfs2_refresh_slot_info(struct ocfs2_super *osb); + +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num); +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num); + +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num); + +#endif diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c new file mode 100644 index 00000000..19965b00 --- /dev/null +++ b/fs/ocfs2/stack_o2cb.c @@ -0,0 +1,393 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stack_o2cb.c + * + * Code which interfaces ocfs2 with the o2cb stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include + +/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ +#include + +#include "cluster/masklog.h" +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" + +#include "stackglue.h" + +struct o2dlm_private { + struct dlm_eviction_cb op_eviction_cb; +}; + +static struct ocfs2_stack_plugin o2cb_stack; + +/* These should be identical */ +#if (DLM_LOCK_IV != LKM_IVMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_NL != LKM_NLMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_CR != LKM_CRMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_CW != LKM_CWMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_PR != LKM_PRMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_PW != LKM_PWMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_EX != LKM_EXMODE) +# error Lock modes do not match +#endif +static inline int mode_to_o2dlm(int mode) +{ + BUG_ON(mode > LKM_MAXMODE); + + return mode; +} + +#define map_flag(_generic, _o2dlm) \ + if (flags & (_generic)) { \ + flags &= ~(_generic); \ + o2dlm_flags |= (_o2dlm); \ + } +static int flags_to_o2dlm(u32 flags) +{ + int o2dlm_flags = 0; + + map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE); + map_flag(DLM_LKF_CANCEL, LKM_CANCEL); + map_flag(DLM_LKF_CONVERT, LKM_CONVERT); + map_flag(DLM_LKF_VALBLK, LKM_VALBLK); + map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK); + map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN); + map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE); + map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT); + map_flag(DLM_LKF_LOCAL, LKM_LOCAL); + + /* map_flag() should have cleared every flag passed in */ + BUG_ON(flags != 0); + + return o2dlm_flags; +} +#undef map_flag + +/* + * Map an o2dlm status to standard errno values. + * + * o2dlm only uses a handful of these, and returns even fewer to the + * caller. Still, we try to assign sane values to each error. 
+ * + * The following value pairs have special meanings to dlmglue, thus + * the right hand side needs to stay unique - never duplicate the + * mapping elsewhere in the table! + * + * DLM_NORMAL: 0 + * DLM_NOTQUEUED: -EAGAIN + * DLM_CANCELGRANT: -EBUSY + * DLM_CANCEL: -DLM_ECANCEL + */ +/* Keep in sync with dlmapi.h */ +static int status_map[] = { + [DLM_NORMAL] = 0, /* Success */ + [DLM_GRANTED] = -EINVAL, + [DLM_DENIED] = -EACCES, + [DLM_DENIED_NOLOCKS] = -EACCES, + [DLM_WORKING] = -EACCES, + [DLM_BLOCKED] = -EINVAL, + [DLM_BLOCKED_ORPHAN] = -EINVAL, + [DLM_DENIED_GRACE_PERIOD] = -EACCES, + [DLM_SYSERR] = -ENOMEM, /* It is what it is */ + [DLM_NOSUPPORT] = -EPROTO, + [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */ + [DLM_IVLOCKID] = -EINVAL, + [DLM_SYNC] = -EINVAL, + [DLM_BADTYPE] = -EINVAL, + [DLM_BADRESOURCE] = -EINVAL, + [DLM_MAXHANDLES] = -ENOMEM, + [DLM_NOCLINFO] = -EINVAL, + [DLM_NOLOCKMGR] = -EINVAL, + [DLM_NOPURGED] = -EINVAL, + [DLM_BADARGS] = -EINVAL, + [DLM_VOID] = -EINVAL, + [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */ + [DLM_IVBUFLEN] = -EINVAL, + [DLM_CVTUNGRANT] = -EPERM, + [DLM_BADPARAM] = -EINVAL, + [DLM_VALNOTVALID] = -EINVAL, + [DLM_REJECTED] = -EPERM, + [DLM_ABORT] = -EINVAL, + [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */ + [DLM_IVRESHANDLE] = -EINVAL, + [DLM_DEADLOCK] = -EDEADLK, + [DLM_DENIED_NOASTS] = -EINVAL, + [DLM_FORWARD] = -EINVAL, + [DLM_TIMEOUT] = -ETIMEDOUT, + [DLM_IVGROUPID] = -EINVAL, + [DLM_VERS_CONFLICT] = -EOPNOTSUPP, + [DLM_BAD_DEVICE_PATH] = -ENOENT, + [DLM_NO_DEVICE_PERMISSION] = -EPERM, + [DLM_NO_CONTROL_DEVICE] = -ENOENT, + [DLM_RECOVERING] = -ENOTCONN, + [DLM_MIGRATING] = -ERESTART, + [DLM_MAXSTATS] = -EINVAL, +}; + +static int dlm_status_to_errno(enum dlm_status status) +{ + BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); + + return status_map[status]; +} + +static void o2dlm_lock_ast_wrapper(void *astarg) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + + lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); +} + +static void o2dlm_blocking_ast_wrapper(void *astarg, int level) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + + lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); +} + +static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + int error = dlm_status_to_errno(status); + + /* + * In o2dlm, you can get both the lock_ast() for the lock being + * granted and the unlock_ast() for the CANCEL failing. A + * successful cancel sends DLM_NORMAL here. If the + * lock grant happened before the cancel arrived, you get + * DLM_CANCELGRANT. + * + * There's no need for the double-ast. If we see DLM_CANCELGRANT, + * we just ignore it. We expect the lock_ast() to handle the + * granted lock. 
+ */ + if (status == DLM_CANCELGRANT) + return; + + lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error); +} + +static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen) +{ + enum dlm_status status; + int o2dlm_mode = mode_to_o2dlm(mode); + int o2dlm_flags = flags_to_o2dlm(flags); + int ret; + + status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, + o2dlm_flags, name, namelen, + o2dlm_lock_ast_wrapper, lksb, + o2dlm_blocking_ast_wrapper); + ret = dlm_status_to_errno(status); + return ret; +} + +static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags) +{ + enum dlm_status status; + int o2dlm_flags = flags_to_o2dlm(flags); + int ret; + + status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, + o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb); + ret = dlm_status_to_errno(status); + return ret; +} + +static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) +{ + return dlm_status_to_errno(lksb->lksb_o2dlm.status); +} + +/* + * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB + * contents, it will zero out the LVB. Thus the caller can always trust + * the contents. + */ +static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) +{ + return 1; +} + +static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb) +{ + return (void *)(lksb->lksb_o2dlm.lvb); +} + +static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) +{ + dlm_print_one_lock(lksb->lksb_o2dlm.lockid); +} + +/* + * Called from the dlm when it's about to evict a node. This is how the + * classic stack signals node death. + */ +static void o2dlm_eviction_cb(int node_num, void *data) +{ + struct ocfs2_cluster_connection *conn = data; + + mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", + node_num, conn->cc_namelen, conn->cc_name); + + conn->cc_recovery_handler(node_num, conn->cc_recovery_data); +} + +static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) +{ + int rc = 0; + u32 dlm_key; + struct dlm_ctxt *dlm; + struct o2dlm_private *priv; + struct dlm_protocol_version fs_version; + + BUG_ON(conn == NULL); + BUG_ON(conn->cc_proto == NULL); + + /* for now we only have one cluster/node, make sure we see it + * in the heartbeat universe */ + if (!o2hb_check_local_node_heartbeating()) { + if (o2hb_global_heartbeat_active()) + mlog(ML_ERROR, "Global heartbeat not started\n"); + rc = -EINVAL; + goto out; + } + + priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL); + if (!priv) { + rc = -ENOMEM; + goto out_free; + } + + /* This just fills the structure in. It is safe to pass conn. */ + dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb, + conn); + + conn->cc_private = priv; + + /* used by the dlm code to make message headers unique, each + * node in this domain must agree on this. 
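+ * Deriving the key as a CRC32 of the domain name lets every node + * compute the same value independently.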
*/ + dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); + fs_version.pv_major = conn->cc_version.pv_major; + fs_version.pv_minor = conn->cc_version.pv_minor; + + dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version); + if (IS_ERR(dlm)) { + rc = PTR_ERR(dlm); + mlog_errno(rc); + goto out_free; + } + + conn->cc_version.pv_major = fs_version.pv_major; + conn->cc_version.pv_minor = fs_version.pv_minor; + conn->cc_lockspace = dlm; + + dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); + +out_free: + if (rc && conn->cc_private) + kfree(conn->cc_private); + +out: + return rc; +} + +static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + struct dlm_ctxt *dlm = conn->cc_lockspace; + struct o2dlm_private *priv = conn->cc_private; + + dlm_unregister_eviction_cb(&priv->op_eviction_cb); + conn->cc_private = NULL; + kfree(priv); + + dlm_unregister_domain(dlm); + conn->cc_lockspace = NULL; + + return 0; +} + +static int o2cb_cluster_this_node(unsigned int *node) +{ + int node_num; + + node_num = o2nm_this_node(); + if (node_num == O2NM_INVALID_NODE_NUM) + return -ENOENT; + + if (node_num >= O2NM_MAX_NODES) + return -EOVERFLOW; + + *node = node_num; + return 0; +} + +static struct ocfs2_stack_operations o2cb_stack_ops = { + .connect = o2cb_cluster_connect, + .disconnect = o2cb_cluster_disconnect, + .this_node = o2cb_cluster_this_node, + .dlm_lock = o2cb_dlm_lock, + .dlm_unlock = o2cb_dlm_unlock, + .lock_status = o2cb_dlm_lock_status, + .lvb_valid = o2cb_dlm_lvb_valid, + .lock_lvb = o2cb_dlm_lvb, + .dump_lksb = o2cb_dump_lksb, +}; + +static struct ocfs2_stack_plugin o2cb_stack = { + .sp_name = "o2cb", + .sp_ops = &o2cb_stack_ops, + .sp_owner = THIS_MODULE, +}; + +static int __init o2cb_stack_init(void) +{ + return ocfs2_stack_glue_register(&o2cb_stack); +} + +static void __exit o2cb_stack_exit(void) +{ + ocfs2_stack_glue_unregister(&o2cb_stack); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack"); +MODULE_LICENSE("GPL"); +module_init(o2cb_stack_init); +module_exit(o2cb_stack_exit); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c new file mode 100644 index 00000000..a5ebe421 --- /dev/null +++ b/fs/ocfs2/stack_user.c @@ -0,0 +1,908 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stack_user.c + * + * Code which interfaces ocfs2 with fs/dlm and a userspace stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "stackglue.h" + +#include + +/* + * The control protocol starts with a handshake. Until the handshake + * is complete, the control device will fail all write(2)s. + * + * The handshake is simple. First, the client reads until EOF. Each line + * of output is a supported protocol tag. All protocol tags are a single + * character followed by a two hex digit version number. Currently the + * only things supported is T01, for "Text-base version 0x01". Next, the + * client writes the version they would like to use, including the newline. 
+ * Thus, the protocol tag is 'T01\n'. If the version tag written is + * unknown, -EINVAL is returned. Once the negotiation is complete, the + * client can start sending messages. + * + * The T01 protocol has three messages. First is the "SETN" message. + * It has the following syntax: + * + * SETN<8-char-hex-nodenum> + * + * This is 14 characters. + * + * The "SETN" message must be the first message following the protocol. + * It tells ocfs2_control the local node number. + * + * Next comes the "SETV" message. It has the following syntax: + * + * SETV<2-char-hex-major><2-char-hex-minor> + * + * This is 11 characters. + * + * The "SETV" message sets the filesystem locking protocol version as + * negotiated by the client. The client negotiates based on the maximum + * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major + * number from the "SETV" message must match + * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number + * must be less than or equal to ...sp_max_version.pv_minor. + * + * Once this information has been set, mounts will be allowed. From this + * point on, the "DOWN" message can be sent for node down notification. + * It has the following syntax: + * + * DOWN<32-char-cap-hex-uuid><8-char-hex-nodenum> + * + * eg: + * + * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n + * + * This is 47 characters. + */ + +/* + * Whether or not the client has done the handshake. + * For now, we have just one protocol version. + */ +#define OCFS2_CONTROL_PROTO "T01\n" +#define OCFS2_CONTROL_PROTO_LEN 4 + +/* Handshake states */ +#define OCFS2_CONTROL_HANDSHAKE_INVALID (0) +#define OCFS2_CONTROL_HANDSHAKE_READ (1) +#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2) +#define OCFS2_CONTROL_HANDSHAKE_VALID (3) + +/* Messages */ +#define OCFS2_CONTROL_MESSAGE_OP_LEN 4 +#define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN" +#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14 +#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV" +#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11 +#define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN" +#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47 +#define OCFS2_TEXT_UUID_LEN 32 +#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 +#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 + +/* + * ocfs2_live_connection is refcounted because the filesystem and + * miscdevice sides can detach in different order. Let's just be safe. 
+ */ +struct ocfs2_live_connection { + struct list_head oc_list; + struct ocfs2_cluster_connection *oc_conn; +}; + +struct ocfs2_control_private { + struct list_head op_list; + int op_state; + int op_this_node; + struct ocfs2_protocol_version op_proto; +}; + +/* SETN<8-char-hex-nodenum> */ +struct ocfs2_control_message_setn { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space; + char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; + char newline; +}; + +/* SETV<2-char-hex-major><2-char-hex-minor> */ +struct ocfs2_control_message_setv { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space1; + char major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; + char space2; + char minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; + char newline; +}; + +/* DOWN<32-char-cap-hex-uuid><8-char-hex-nodenum> */ +struct ocfs2_control_message_down { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space1; + char uuid[OCFS2_TEXT_UUID_LEN]; + char space2; + char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; + char newline; +}; + +union ocfs2_control_message { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + struct ocfs2_control_message_setn u_setn; + struct ocfs2_control_message_setv u_setv; + struct ocfs2_control_message_down u_down; +}; + +static struct ocfs2_stack_plugin ocfs2_user_plugin; + +static atomic_t ocfs2_control_opened; +static int ocfs2_control_this_node = -1; +static struct ocfs2_protocol_version running_proto; + +static LIST_HEAD(ocfs2_live_connection_list); +static LIST_HEAD(ocfs2_control_private_list); +static DEFINE_MUTEX(ocfs2_control_lock); + +static inline void ocfs2_control_set_handshake_state(struct file *file, + int state) +{ + struct ocfs2_control_private *p = file->private_data; + p->op_state = state; +} + +static inline int ocfs2_control_get_handshake_state(struct file *file) +{ + struct ocfs2_control_private *p = file->private_data; + return p->op_state; +} + +static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) +{ + size_t len = strlen(name); + struct ocfs2_live_connection *c; + + BUG_ON(!mutex_is_locked(&ocfs2_control_lock)); + + list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) { + if ((c->oc_conn->cc_namelen == len) && + !strncmp(c->oc_conn->cc_name, name, len)) + return c; + } + + return NULL; +} + +/* + * ocfs2_live_connection structures are created underneath the ocfs2 + * mount path. Since the VFS prevents multiple calls to + * fill_super(), we can't get dupes here. + */ +static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection **c_ret) +{ + int rc = 0; + struct ocfs2_live_connection *c; + + c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!c) + return -ENOMEM; + + mutex_lock(&ocfs2_control_lock); + c->oc_conn = conn; + + if (atomic_read(&ocfs2_control_opened)) + list_add(&c->oc_list, &ocfs2_live_connection_list); + else { + printk(KERN_ERR + "ocfs2: Userspace control daemon is not present\n"); + rc = -ESRCH; + } + + mutex_unlock(&ocfs2_control_lock); + + if (!rc) + *c_ret = c; + else + kfree(c); + + return rc; +} + +/* + * This function disconnects the cluster connection from ocfs2_control. + * Afterwards, userspace can't affect the cluster connection. 
+ */ +static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c) +{ + mutex_lock(&ocfs2_control_lock); + list_del_init(&c->oc_list); + c->oc_conn = NULL; + mutex_unlock(&ocfs2_control_lock); + + kfree(c); +} + +static int ocfs2_control_cfu(void *target, size_t target_len, + const char __user *buf, size_t count) +{ + /* The T01 expects write(2) calls to have exactly one command */ + if ((count != target_len) || + (count > sizeof(union ocfs2_control_message))) + return -EINVAL; + + if (copy_from_user(target, buf, target_len)) + return -EFAULT; + + return 0; +} + +static ssize_t ocfs2_control_validate_protocol(struct file *file, + const char __user *buf, + size_t count) +{ + ssize_t ret; + char kbuf[OCFS2_CONTROL_PROTO_LEN]; + + ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN, + buf, count); + if (ret) + return ret; + + if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN)) + return -EINVAL; + + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_PROTOCOL); + + return count; +} + +static void ocfs2_control_send_down(const char *uuid, + int nodenum) +{ + struct ocfs2_live_connection *c; + + mutex_lock(&ocfs2_control_lock); + + c = ocfs2_connection_find(uuid); + if (c) { + BUG_ON(c->oc_conn == NULL); + c->oc_conn->cc_recovery_handler(nodenum, + c->oc_conn->cc_recovery_data); + } + + mutex_unlock(&ocfs2_control_lock); +} + +/* + * Called whenever configuration elements are sent to /dev/ocfs2_control. + * If all configuration elements are present, try to set the global + * values. If there is a problem, return an error. Skip any missing + * elements, and only bump ocfs2_control_opened when we have all elements + * and are successful. + */ +static int ocfs2_control_install_private(struct file *file) +{ + int rc = 0; + int set_p = 1; + struct ocfs2_control_private *p = file->private_data; + + BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL); + + mutex_lock(&ocfs2_control_lock); + + if (p->op_this_node < 0) { + set_p = 0; + } else if ((ocfs2_control_this_node >= 0) && + (ocfs2_control_this_node != p->op_this_node)) { + rc = -EINVAL; + goto out_unlock; + } + + if (!p->op_proto.pv_major) { + set_p = 0; + } else if (!list_empty(&ocfs2_live_connection_list) && + ((running_proto.pv_major != p->op_proto.pv_major) || + (running_proto.pv_minor != p->op_proto.pv_minor))) { + rc = -EINVAL; + goto out_unlock; + } + + if (set_p) { + ocfs2_control_this_node = p->op_this_node; + running_proto.pv_major = p->op_proto.pv_major; + running_proto.pv_minor = p->op_proto.pv_minor; + } + +out_unlock: + mutex_unlock(&ocfs2_control_lock); + + if (!rc && set_p) { + /* We set the global values successfully */ + atomic_inc(&ocfs2_control_opened); + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_VALID); + } + + return rc; +} + +static int ocfs2_control_get_this_node(void) +{ + int rc; + + mutex_lock(&ocfs2_control_lock); + if (ocfs2_control_this_node < 0) + rc = -EINVAL; + else + rc = ocfs2_control_this_node; + mutex_unlock(&ocfs2_control_lock); + + return rc; +} + +static int ocfs2_control_do_setnode_msg(struct file *file, + struct ocfs2_control_message_setn *msg) +{ + long nodenum; + char *ptr = NULL; + struct ocfs2_control_private *p = file->private_data; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_PROTOCOL) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space != ' ') || (msg->newline != '\n')) + return -EINVAL; + msg->space = 
msg->newline = '\0'; + + nodenum = simple_strtol(msg->nodestr, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + + if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || + (nodenum > INT_MAX) || (nodenum < 0)) + return -ERANGE; + p->op_this_node = nodenum; + + return ocfs2_control_install_private(file); +} + +static int ocfs2_control_do_setversion_msg(struct file *file, + struct ocfs2_control_message_setv *msg) + { + long major, minor; + char *ptr = NULL; + struct ocfs2_control_private *p = file->private_data; + struct ocfs2_protocol_version *max = + &ocfs2_user_plugin.sp_max_proto; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_PROTOCOL) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space1 != ' ') || (msg->space2 != ' ') || + (msg->newline != '\n')) + return -EINVAL; + msg->space1 = msg->space2 = msg->newline = '\0'; + + major = simple_strtol(msg->major, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + minor = simple_strtol(msg->minor, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + + /* + * The major must be between 1 and 255, inclusive. The minor + * must be between 0 and 255, inclusive. The version passed in + * must be within the maximum version supported by the filesystem. + */ + if ((major == LONG_MIN) || (major == LONG_MAX) || + (major > (u8)-1) || (major < 1)) + return -ERANGE; + if ((minor == LONG_MIN) || (minor == LONG_MAX) || + (minor > (u8)-1) || (minor < 0)) + return -ERANGE; + if ((major != max->pv_major) || + (minor > max->pv_minor)) + return -EINVAL; + + p->op_proto.pv_major = major; + p->op_proto.pv_minor = minor; + + return ocfs2_control_install_private(file); +} + +static int ocfs2_control_do_down_msg(struct file *file, + struct ocfs2_control_message_down *msg) +{ + long nodenum; + char *p = NULL; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_VALID) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space1 != ' ') || (msg->space2 != ' ') || + (msg->newline != '\n')) + return -EINVAL; + msg->space1 = msg->space2 = msg->newline = '\0'; + + nodenum = simple_strtol(msg->nodestr, &p, 16); + if (!p || *p) + return -EINVAL; + + if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || + (nodenum > INT_MAX) || (nodenum < 0)) + return -ERANGE; + + ocfs2_control_send_down(msg->uuid, nodenum); + + return 0; +} + +static ssize_t ocfs2_control_message(struct file *file, + const char __user *buf, + size_t count) +{ + ssize_t ret; + union ocfs2_control_message msg; + + /* Try to catch padding issues */ + WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) != + (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1))); + + memset(&msg, 0, sizeof(union ocfs2_control_message)); + ret = ocfs2_control_cfu(&msg, count, buf, count); + if (ret) + goto out; + + if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn); + else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv); + else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = 
ocfs2_control_do_down_msg(file, &msg.u_down); + else + ret = -EINVAL; + +out: + return ret ? ret : count; +} + +static ssize_t ocfs2_control_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + ssize_t ret; + + switch (ocfs2_control_get_handshake_state(file)) { + case OCFS2_CONTROL_HANDSHAKE_INVALID: + ret = -EINVAL; + break; + + case OCFS2_CONTROL_HANDSHAKE_READ: + ret = ocfs2_control_validate_protocol(file, buf, + count); + break; + + case OCFS2_CONTROL_HANDSHAKE_PROTOCOL: + case OCFS2_CONTROL_HANDSHAKE_VALID: + ret = ocfs2_control_message(file, buf, count); + break; + + default: + BUG(); + ret = -EIO; + break; + } + + return ret; +} + +/* + * This is a naive version. If we ever have a new protocol, we'll expand + * it. Probably using seq_file. + */ +static ssize_t ocfs2_control_read(struct file *file, + char __user *buf, + size_t count, + loff_t *ppos) +{ + ssize_t ret; + + ret = simple_read_from_buffer(buf, count, ppos, + OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN); + + /* Have we read the whole protocol list? */ + if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN) + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_READ); + + return ret; +} + +static int ocfs2_control_release(struct inode *inode, struct file *file) +{ + struct ocfs2_control_private *p = file->private_data; + + mutex_lock(&ocfs2_control_lock); + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_VALID) + goto out; + + if (atomic_dec_and_test(&ocfs2_control_opened)) { + if (!list_empty(&ocfs2_live_connection_list)) { + /* XXX: Do bad things! */ + printk(KERN_ERR + "ocfs2: Unexpected release of ocfs2_control!\n" + " Loss of cluster connection requires " + "an emergency restart!\n"); + emergency_restart(); + } + /* + * Last valid close clears the node number and resets + * the locking protocol version + */ + ocfs2_control_this_node = -1; + running_proto.pv_major = 0; + running_proto.pv_major = 0; + } + +out: + list_del_init(&p->op_list); + file->private_data = NULL; + + mutex_unlock(&ocfs2_control_lock); + + kfree(p); + + return 0; +} + +static int ocfs2_control_open(struct inode *inode, struct file *file) +{ + struct ocfs2_control_private *p; + + p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL); + if (!p) + return -ENOMEM; + p->op_this_node = -1; + + mutex_lock(&ocfs2_control_lock); + file->private_data = p; + list_add(&p->op_list, &ocfs2_control_private_list); + mutex_unlock(&ocfs2_control_lock); + + return 0; +} + +static const struct file_operations ocfs2_control_fops = { + .open = ocfs2_control_open, + .release = ocfs2_control_release, + .read = ocfs2_control_read, + .write = ocfs2_control_write, + .owner = THIS_MODULE, + .llseek = default_llseek, +}; + +static struct miscdevice ocfs2_control_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ocfs2_control", + .fops = &ocfs2_control_fops, +}; + +static int ocfs2_control_init(void) +{ + int rc; + + atomic_set(&ocfs2_control_opened, 0); + + rc = misc_register(&ocfs2_control_device); + if (rc) + printk(KERN_ERR + "ocfs2: Unable to register ocfs2_control device " + "(errno %d)\n", + -rc); + + return rc; +} + +static void ocfs2_control_exit(void) +{ + int rc; + + rc = misc_deregister(&ocfs2_control_device); + if (rc) + printk(KERN_ERR + "ocfs2: Unable to deregister ocfs2_control device " + "(errno %d)\n", + -rc); +} + +static void fsdlm_lock_ast_wrapper(void *astarg) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + int status = lksb->lksb_fsdlm.sb_status; + + /* + * For now we're punting 
on the issue of other non-standard errors + * where we can't tell if the unlock_ast or lock_ast should be called. + * The main "other error" that's possible is EINVAL which means the + * function was called with invalid args, which shouldn't be possible + * since the caller here is under our control. Other non-standard + * errors probably fall into the same category, or otherwise are fatal + * which means we can't carry on anyway. + */ + + if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) + lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0); + else + lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); +} + +static void fsdlm_blocking_ast_wrapper(void *astarg, int level) +{ + struct ocfs2_dlm_lksb *lksb = astarg; + + lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); +} + +static int user_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen) +{ + int ret; + + if (!lksb->lksb_fsdlm.sb_lvbptr) + lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + + sizeof(struct dlm_lksb); + + ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, lksb, + fsdlm_blocking_ast_wrapper); + return ret; +} + +static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags) +{ + int ret; + + ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, lksb); + return ret; +} + +static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) +{ + return lksb->lksb_fsdlm.sb_status; +} + +static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) +{ + int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; + + return !invalid; +} + +static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb) +{ + if (!lksb->lksb_fsdlm.sb_lvbptr) + lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + + sizeof(struct dlm_lksb); + return (void *)(lksb->lksb_fsdlm.sb_lvbptr); +} + +static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) +{ +} + +static int user_plock(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl) +{ + /* + * This more or less just demuxes the plock request into any + * one of three dlm calls. + * + * Internally, fs/dlm will pass these to a misc device, which + * a userspace daemon will read and write to. + * + * For now, cancel requests (which happen internally only), + * are turned into unlocks. Most of this function taken from + * gfs2_lock. + */ + + if (cmd == F_CANCELLK) { + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + + if (IS_GETLK(cmd)) + return dlm_posix_get(conn->cc_lockspace, ino, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); + else + return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); +} + +/* + * Compare a requested locking protocol version against the current one. + * + * If the major numbers are different, they are incompatible. + * If the current minor is greater than the request, they are incompatible. + * If the current minor is less than or equal to the request, they are + * compatible, and the requester should run at the current minor version. 
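+ * + * For example (illustrative numbers): an existing 1.4 against a requested + * 1.6 is compatible and the request is clamped down to 1.4, while a + * requested 1.2 against an existing 1.4 is rejected.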
+ */ +static int fs_protocol_compare(struct ocfs2_protocol_version *existing, + struct ocfs2_protocol_version *request) +{ + if (existing->pv_major != request->pv_major) + return 1; + + if (existing->pv_minor > request->pv_minor) + return 1; + + if (existing->pv_minor < request->pv_minor) + request->pv_minor = existing->pv_minor; + + return 0; +} + +static int user_cluster_connect(struct ocfs2_cluster_connection *conn) +{ + dlm_lockspace_t *fsdlm; + struct ocfs2_live_connection *uninitialized_var(control); + int rc = 0; + + BUG_ON(conn == NULL); + + rc = ocfs2_live_connection_new(conn, &control); + if (rc) + goto out; + + /* + * running_proto must have been set before we allowed any mounts + * to proceed. + */ + if (fs_protocol_compare(&running_proto, &conn->cc_version)) { + printk(KERN_ERR + "Unable to mount with fs locking protocol version " + "%u.%u because the userspace control daemon has " + "negotiated %u.%u\n", + conn->cc_version.pv_major, conn->cc_version.pv_minor, + running_proto.pv_major, running_proto.pv_minor); + rc = -EPROTO; + ocfs2_live_connection_drop(control); + goto out; + } + + rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name), + &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN); + if (rc) { + ocfs2_live_connection_drop(control); + goto out; + } + + conn->cc_private = control; + conn->cc_lockspace = fsdlm; +out: + return rc; +} + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + +static int user_cluster_this_node(unsigned int *this_node) +{ + int rc; + + rc = ocfs2_control_get_this_node(); + if (rc < 0) + return rc; + + *this_node = rc; + return 0; +} + +static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { + .connect = user_cluster_connect, + .disconnect = user_cluster_disconnect, + .this_node = user_cluster_this_node, + .dlm_lock = user_dlm_lock, + .dlm_unlock = user_dlm_unlock, + .lock_status = user_dlm_lock_status, + .lvb_valid = user_dlm_lvb_valid, + .lock_lvb = user_dlm_lvb, + .plock = user_plock, + .dump_lksb = user_dlm_dump_lksb, +}; + +static struct ocfs2_stack_plugin ocfs2_user_plugin = { + .sp_name = "user", + .sp_ops = &ocfs2_user_plugin_ops, + .sp_owner = THIS_MODULE, +}; + + +static int __init ocfs2_user_plugin_init(void) +{ + int rc; + + rc = ocfs2_control_init(); + if (!rc) { + rc = ocfs2_stack_glue_register(&ocfs2_user_plugin); + if (rc) + ocfs2_control_exit(); + } + + return rc; +} + +static void __exit ocfs2_user_plugin_exit(void) +{ + ocfs2_stack_glue_unregister(&ocfs2_user_plugin); + ocfs2_control_exit(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks"); +MODULE_LICENSE("GPL"); +module_init(ocfs2_user_plugin_init); +module_exit(ocfs2_user_plugin_exit); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c new file mode 100644 index 00000000..39abf896 --- /dev/null +++ b/fs/ocfs2/stackglue.c @@ -0,0 +1,726 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stackglue.c + * + * Code which implements an OCFS2 specific interface to underlying + * cluster stacks. + * + * Copyright (C) 2007, 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ocfs2_fs.h" + +#include "stackglue.h" + +#define OCFS2_STACK_PLUGIN_O2CB "o2cb" +#define OCFS2_STACK_PLUGIN_USER "user" +#define OCFS2_MAX_HB_CTL_PATH 256 + +static struct ocfs2_protocol_version locking_max_version; +static DEFINE_SPINLOCK(ocfs2_stack_lock); +static LIST_HEAD(ocfs2_stack_list); +static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; +static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; + +/* + * The stack currently in use. If not null, active_stack->sp_count > 0, + * the module is pinned, and the locking protocol cannot be changed. + */ +static struct ocfs2_stack_plugin *active_stack; + +static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) +{ + struct ocfs2_stack_plugin *p; + + assert_spin_locked(&ocfs2_stack_lock); + + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + if (!strcmp(p->sp_name, name)) + return p; + } + + return NULL; +} + +static int ocfs2_stack_driver_request(const char *stack_name, + const char *plugin_name) +{ + int rc; + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + + /* + * If the stack passed by the filesystem isn't the selected one, + * we can't continue. + */ + if (strcmp(stack_name, cluster_stack_name)) { + rc = -EBUSY; + goto out; + } + + if (active_stack) { + /* + * If the active stack isn't the one we want, it cannot + * be selected right now. + */ + if (!strcmp(active_stack->sp_name, plugin_name)) + rc = 0; + else + rc = -EBUSY; + goto out; + } + + p = ocfs2_stack_lookup(plugin_name); + if (!p || !try_module_get(p->sp_owner)) { + rc = -ENOENT; + goto out; + } + + active_stack = p; + rc = 0; + +out: + /* If we found it, pin it */ + if (!rc) + active_stack->sp_count++; + + spin_unlock(&ocfs2_stack_lock); + return rc; +} + +/* + * This function looks up the appropriate stack and makes it active. If + * there is no stack, it tries to load it. It will fail if the stack still + * cannot be found. It will also fail if a different stack is in use. + */ +static int ocfs2_stack_driver_get(const char *stack_name) +{ + int rc; + char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; + + /* + * Classic stack does not pass in a stack name. This is + * compatible with older tools as well. 
+ */ + if (!stack_name || !*stack_name) + stack_name = OCFS2_STACK_PLUGIN_O2CB; + + if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { + printk(KERN_ERR + "ocfs2 passed an invalid cluster stack label: \"%s\"\n", + stack_name); + return -EINVAL; + } + + /* Anything that isn't the classic stack is a user stack */ + if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) + plugin_name = OCFS2_STACK_PLUGIN_USER; + + rc = ocfs2_stack_driver_request(stack_name, plugin_name); + if (rc == -ENOENT) { + request_module("ocfs2_stack_%s", plugin_name); + rc = ocfs2_stack_driver_request(stack_name, plugin_name); + } + + if (rc == -ENOENT) { + printk(KERN_ERR + "ocfs2: Cluster stack driver \"%s\" cannot be found\n", + plugin_name); + } else if (rc == -EBUSY) { + printk(KERN_ERR + "ocfs2: A different cluster stack is in use\n"); + } + + return rc; +} + +static void ocfs2_stack_driver_put(void) +{ + spin_lock(&ocfs2_stack_lock); + BUG_ON(active_stack == NULL); + BUG_ON(active_stack->sp_count == 0); + + active_stack->sp_count--; + if (!active_stack->sp_count) { + module_put(active_stack->sp_owner); + active_stack = NULL; + } + spin_unlock(&ocfs2_stack_lock); +} + +int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) +{ + int rc; + + spin_lock(&ocfs2_stack_lock); + if (!ocfs2_stack_lookup(plugin->sp_name)) { + plugin->sp_count = 0; + plugin->sp_max_proto = locking_max_version; + list_add(&plugin->sp_list, &ocfs2_stack_list); + printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", + plugin->sp_name); + rc = 0; + } else { + printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", + plugin->sp_name); + rc = -EEXIST; + } + spin_unlock(&ocfs2_stack_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); + +void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) +{ + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + p = ocfs2_stack_lookup(plugin->sp_name); + if (p) { + BUG_ON(p != plugin); + BUG_ON(plugin == active_stack); + BUG_ON(plugin->sp_count != 0); + list_del_init(&plugin->sp_list); + printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", + plugin->sp_name); + } else { + printk(KERN_ERR "Stack \"%s\" is not registered\n", + plugin->sp_name); + } + spin_unlock(&ocfs2_stack_lock); +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); + +void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto) +{ + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + if (memcmp(max_proto, &locking_max_version, + sizeof(struct ocfs2_protocol_version))) { + BUG_ON(locking_max_version.pv_major != 0); + + locking_max_version = *max_proto; + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + p->sp_max_proto = locking_max_version; + } + } + spin_unlock(&ocfs2_stack_lock); +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version); + + +/* + * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument + * for the ast and bast functions. They will pass the lksb to the ast + * and bast. The caller can wrap the lksb with their own structure to + * get more information. 
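+ * + * A sketch, with illustrative names only: the caller embeds the lksb in + * its own lock structure and recovers it in the callbacks, e.g. + * + *	struct my_lock_res { + *		struct ocfs2_dlm_lksb	ml_lksb; + *		... + *	}; + * + *	static void my_lock_ast(struct ocfs2_dlm_lksb *lksb) + *	{ + *		struct my_lock_res *res = + *			container_of(lksb, struct my_lock_res, ml_lksb); + *		... + *	}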
+ */ +int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen) +{ + if (!lksb->lksb_conn) + lksb->lksb_conn = conn; + else + BUG_ON(lksb->lksb_conn != conn); + return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, + name, namelen); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); + +int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags) +{ + BUG_ON(lksb->lksb_conn == NULL); + + return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); + +int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lock_status(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); + +int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lvb_valid(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); + +void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lock_lvb(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); + +void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) +{ + active_stack->sp_ops->dump_lksb(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); + +int ocfs2_stack_supports_plocks(void) +{ + return active_stack && active_stack->sp_ops->plock; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); + +/* + * ocfs2_plock() can only be safely called if + * ocfs2_stack_supports_plocks() returned true + */ +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl) +{ + WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); + if (active_stack->sp_ops->plock) + return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(ocfs2_plock); + +int ocfs2_cluster_connect(const char *stack_name, + const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn) +{ + int rc = 0; + struct ocfs2_cluster_connection *new_conn; + + BUG_ON(group == NULL); + BUG_ON(conn == NULL); + BUG_ON(recovery_handler == NULL); + + if (grouplen > GROUP_NAME_MAX) { + rc = -EINVAL; + goto out; + } + + if (memcmp(&lproto->lp_max_version, &locking_max_version, + sizeof(struct ocfs2_protocol_version))) { + rc = -EINVAL; + goto out; + } + + new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), + GFP_KERNEL); + if (!new_conn) { + rc = -ENOMEM; + goto out; + } + + memcpy(new_conn->cc_name, group, grouplen); + new_conn->cc_namelen = grouplen; + new_conn->cc_recovery_handler = recovery_handler; + new_conn->cc_recovery_data = recovery_data; + + new_conn->cc_proto = lproto; + /* Start the new connection at our maximum compatibility level */ + new_conn->cc_version = lproto->lp_max_version; + + /* This will pin the stack driver if successful */ + rc = ocfs2_stack_driver_get(stack_name); + if (rc) + goto out_free; + + rc = active_stack->sp_ops->connect(new_conn); + if (rc) { + ocfs2_stack_driver_put(); + goto out_free; + } + + *conn = new_conn; + +out_free: + if (rc) + kfree(new_conn); + +out: + return rc; +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); + +/* The caller will ensure all nodes have the same cluster stack */ +int ocfs2_cluster_connect_agnostic(const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection 
**conn) +{ + char *stack_name = NULL; + + if (cluster_stack_name[0]) + stack_name = cluster_stack_name; + return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, + recovery_handler, recovery_data, conn); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); + +/* If hangup_pending is 0, the stack driver will be dropped */ +int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending) +{ + int ret; + + BUG_ON(conn == NULL); + + ret = active_stack->sp_ops->disconnect(conn); + + /* XXX Should we free it anyway? */ + if (!ret) { + kfree(conn); + if (!hangup_pending) + ocfs2_stack_driver_put(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); + +/* + * Leave the group for this filesystem. This is executed by a userspace + * program (stored in ocfs2_hb_ctl_path). + */ +static void ocfs2_leave_group(const char *group) +{ + int ret; + char *argv[5], *envp[3]; + + argv[0] = ocfs2_hb_ctl_path; + argv[1] = "-K"; + argv[2] = "-u"; + argv[3] = (char *)group; + argv[4] = NULL; + + /* minimal command environment taken from cpu_run_sbin_hotplug */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (ret < 0) { + printk(KERN_ERR + "ocfs2: Error %d running user helper " + "\"%s %s %s %s\"\n", + ret, argv[0], argv[1], argv[2], argv[3]); + } +} + +/* + * Hangup is a required post-umount. ocfs2-tools software expects the + * filesystem to call "ocfs2_hb_ctl" during unmount. This happens + * regardless of whether the DLM got started, so we can't do it + * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does + * the actual work. + */ +void ocfs2_cluster_hangup(const char *group, int grouplen) +{ + BUG_ON(group == NULL); + BUG_ON(group[grouplen] != '\0'); + + ocfs2_leave_group(group); + + /* cluster_disconnect() was called with hangup_pending==1 */ + ocfs2_stack_driver_put(); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); + +int ocfs2_cluster_this_node(unsigned int *node) +{ + return active_stack->sp_ops->this_node(node); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); + + +/* + * Sysfs bits + */ + +static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0; + + spin_lock(&ocfs2_stack_lock); + if (locking_max_version.pv_major) + ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", + locking_max_version.pv_major, + locking_max_version.pv_minor); + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static struct kobj_attribute ocfs2_attr_max_locking_protocol = + __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, + ocfs2_max_locking_protocol_show, NULL); + +static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0, total = 0, remain = PAGE_SIZE; + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + ret = snprintf(buf, remain, "%s\n", + p->sp_name); + if (ret < 0) { + total = ret; + break; + } + if (ret == remain) { + /* snprintf() didn't fit */ + total = -E2BIG; + break; + } + total += ret; + remain -= ret; + } + spin_unlock(&ocfs2_stack_lock); + + return total; +} + +static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = + __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, + ocfs2_loaded_cluster_plugins_show, NULL); + +static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, + struct kobj_attribute *attr, + 
char *buf) +{ + ssize_t ret = 0; + + spin_lock(&ocfs2_stack_lock); + if (active_stack) { + ret = snprintf(buf, PAGE_SIZE, "%s\n", + active_stack->sp_name); + if (ret == PAGE_SIZE) + ret = -E2BIG; + } + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static struct kobj_attribute ocfs2_attr_active_cluster_plugin = + __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, + ocfs2_active_cluster_plugin_show, NULL); + +static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret; + spin_lock(&ocfs2_stack_lock); + ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + size_t len = count; + ssize_t ret; + + if (len == 0) + return len; + + if (buf[len - 1] == '\n') + len--; + + if ((len != OCFS2_STACK_LABEL_LEN) || + (strnlen(buf, len) != len)) + return -EINVAL; + + spin_lock(&ocfs2_stack_lock); + if (active_stack) { + if (!strncmp(buf, cluster_stack_name, len)) + ret = count; + else + ret = -EBUSY; + } else { + memcpy(cluster_stack_name, buf, len); + ret = count; + } + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + + +static struct kobj_attribute ocfs2_attr_cluster_stack = + __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, + ocfs2_cluster_stack_show, + ocfs2_cluster_stack_store); + +static struct attribute *ocfs2_attrs[] = { + &ocfs2_attr_max_locking_protocol.attr, + &ocfs2_attr_loaded_cluster_plugins.attr, + &ocfs2_attr_active_cluster_plugin.attr, + &ocfs2_attr_cluster_stack.attr, + NULL, +}; + +static struct attribute_group ocfs2_attr_group = { + .attrs = ocfs2_attrs, +}; + +static struct kset *ocfs2_kset; + +static void ocfs2_sysfs_exit(void) +{ + kset_unregister(ocfs2_kset); +} + +static int ocfs2_sysfs_init(void) +{ + int ret; + + ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); + if (!ocfs2_kset) + return -ENOMEM; + + ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); + if (ret) + goto error; + + return 0; + +error: + kset_unregister(ocfs2_kset); + return ret; +} + +/* + * Sysctl bits + * + * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't + * make as much sense in a multiple cluster stack world, but it's safer + * and easier to preserve the name. + */ + +#define FS_OCFS2_NM 1 + +static ctl_table ocfs2_nm_table[] = { + { + .procname = "hb_ctl_path", + .data = ocfs2_hb_ctl_path, + .maxlen = OCFS2_MAX_HB_CTL_PATH, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { } +}; + +static ctl_table ocfs2_mod_table[] = { + { + .procname = "nm", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_nm_table + }, + { } +}; + +static ctl_table ocfs2_kern_table[] = { + { + .procname = "ocfs2", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_mod_table + }, + { } +}; + +static ctl_table ocfs2_root_table[] = { + { + .procname = "fs", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_kern_table + }, + { } +}; + +static struct ctl_table_header *ocfs2_table_header = NULL; + + +/* + * Initialization + */ + +static int __init ocfs2_stack_glue_init(void) +{ + strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); + + ocfs2_table_header = register_sysctl_table(ocfs2_root_table); + if (!ocfs2_table_header) { + printk(KERN_ERR + "ocfs2 stack glue: unable to register sysctl\n"); + return -ENOMEM; /* or something. 
*/ + } + + return ocfs2_sysfs_init(); +} + +static void __exit ocfs2_stack_glue_exit(void) +{ + memset(&locking_max_version, 0, + sizeof(struct ocfs2_protocol_version)); + locking_max_version.pv_major = 0; + locking_max_version.pv_minor = 0; + ocfs2_sysfs_exit(); + if (ocfs2_table_header) + unregister_sysctl_table(ocfs2_table_header); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); +MODULE_LICENSE("GPL"); +module_init(ocfs2_stack_glue_init); +module_exit(ocfs2_stack_glue_exit); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h new file mode 100644 index 00000000..1ec56fdb --- /dev/null +++ b/fs/ocfs2/stackglue.h @@ -0,0 +1,292 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stackglue.h + * + * Glue to the underlying cluster stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#ifndef STACKGLUE_H +#define STACKGLUE_H + +#include +#include +#include + +#include "dlm/dlmapi.h" +#include + +/* Needed for plock-related prototypes */ +struct file; +struct file_lock; + +/* + * dlmconstants.h does not have a LOCAL flag. We hope to remove it + * some day, but right now we need it. Let's fake it. This value is larger + * than any flag in dlmconstants.h. + */ +#define DLM_LKF_LOCAL 0x00100000 + +/* + * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably + * wants to be in a public header. + */ +#define GROUP_NAME_MAX 64 + + +/* + * ocfs2_protocol_version changes when ocfs2 does something different in + * its inter-node behavior. See dlmglue.c for more information. + */ +struct ocfs2_protocol_version { + u8 pv_major; + u8 pv_minor; +}; + +/* + * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only + * has a pointer to separately allocated lvb space. This struct exists only to + * include in the lksb union to make space for a combined dlm_lksb and lvb. + */ +struct fsdlm_lksb_plus_lvb { + struct dlm_lksb lksb; + char lvb[DLM_LVB_LEN]; +}; + +/* + * A union of all lock status structures. We define it here so that the + * size of the union is known. Lock status structures are embedded in + * ocfs2 inodes. + */ +struct ocfs2_cluster_connection; +struct ocfs2_dlm_lksb { + union { + struct dlm_lockstatus lksb_o2dlm; + struct dlm_lksb lksb_fsdlm; + struct fsdlm_lksb_plus_lvb padding; + }; + struct ocfs2_cluster_connection *lksb_conn; +}; + +/* + * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf. + */ +struct ocfs2_locking_protocol { + struct ocfs2_protocol_version lp_max_version; + void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb); + void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level); + void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error); +}; + + +/* + * A cluster connection. Mostly opaque to ocfs2, the connection holds + * state for the underlying stack. ocfs2 does use cc_version to determine + * locking compatibility.
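+ *
+ * cc_version starts out as the filesystem's lp_max_version (see
+ * ocfs2_cluster_connect(), which copies it from the locking protocol);
+ * a stack's ->connect() may negotiate it down if the rest of the
+ * cluster only speaks an older protocol.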
+ */ +struct ocfs2_cluster_connection { + char cc_name[GROUP_NAME_MAX]; + int cc_namelen; + struct ocfs2_protocol_version cc_version; + struct ocfs2_locking_protocol *cc_proto; + void (*cc_recovery_handler)(int node_num, void *recovery_data); + void *cc_recovery_data; + void *cc_lockspace; + void *cc_private; +}; + +/* + * Each cluster stack implements the stack operations structure. Not used + * in the ocfs2 code, the stackglue code translates generic cluster calls + * into stack operations. + */ +struct ocfs2_stack_operations { + /* + * The fs code calls ocfs2_cluster_connect() to attach a new + * filesystem to the cluster stack. The ->connect() op is passed + * an ocfs2_cluster_connection with the name and recovery field + * filled in. + * + * The stack must set up any notification mechanisms and create + * the filesystem lockspace in the DLM. The lockspace should be + * stored on cc_lockspace. Any other information can be stored on + * cc_private. + * + * ->connect() must not return until it is guaranteed that + * + * - Node down notifications for the filesystem will be received + * and passed to conn->cc_recovery_handler(). + * - Locking requests for the filesystem will be processed. + */ + int (*connect)(struct ocfs2_cluster_connection *conn); + + /* + * The fs code calls ocfs2_cluster_disconnect() when a filesystem + * no longer needs cluster services. All DLM locks have been + * dropped, and recovery notification is being ignored by the + * fs code. The stack must disengage from the DLM and discontinue + * recovery notification. + * + * Once ->disconnect() has returned, the connection structure will + * be freed. Thus, a stack must not return from ->disconnect() + * until it will no longer reference the conn pointer. + * + * Once this call returns, the stack glue will be dropping this + * connection's reference on the module. + */ + int (*disconnect)(struct ocfs2_cluster_connection *conn); + + /* + * ->this_node() returns the cluster's unique identifier for the + * local node. + */ + int (*this_node)(unsigned int *node); + + /* + * Call the underlying dlm lock function. The ->dlm_lock() + * callback should convert the flags and mode as appropriate. + * + * ast and bast functions are not part of the call because the + * stack will likely want to wrap ast and bast calls before passing + * them to stack->sp_proto. There is no astarg. The lksb will + * be passed back to the ast and bast functions. The caller can + * use this to find their object. + */ + int (*dlm_lock)(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen); + + /* + * Call the underlying dlm unlock function. The ->dlm_unlock() + * function should convert the flags as appropriate. + * + * The unlock ast is not passed, as the stack will want to wrap + * it before calling stack->sp_proto->lp_unlock_ast(). There is + * no astarg. The lksb will be passed back to the unlock ast + * function. The caller can use this to find their object. + */ + int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags); + + /* + * Return the status of the current lock status block. The fs + * code should never dereference the union. The ->lock_status() + * callback pulls out the stack-specific lksb, converts the status + * to a proper errno, and returns it. + */ + int (*lock_status)(struct ocfs2_dlm_lksb *lksb); + + /* + * Return non-zero if the LVB is valid. 
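+ *
+ * (For example, a stack backed by fs/dlm could implement this by
+ * returning zero when DLM_SBF_VALNOTVALID is set in the lksb's
+ * sb_flags.)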
+ */ + int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb); + + /* + * Pull the lvb pointer off of the stack-specific lksb. + */ + void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb); + + /* + * Cluster-aware posix locks + * + * This is NULL for stacks which do not support posix locks. + */ + int (*plock)(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl); + + /* + * This is an optoinal debugging hook. If provided, the + * stack can dump debugging information about this lock. + */ + void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb); +}; + +/* + * Each stack plugin must describe itself by registering a + * ocfs2_stack_plugin structure. This is only seen by stackglue and the + * stack driver. + */ +struct ocfs2_stack_plugin { + char *sp_name; + struct ocfs2_stack_operations *sp_ops; + struct module *sp_owner; + + /* These are managed by the stackglue code. */ + struct list_head sp_list; + unsigned int sp_count; + struct ocfs2_protocol_version sp_max_proto; +}; + + +/* Used by the filesystem */ +int ocfs2_cluster_connect(const char *stack_name, + const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn); +/* + * Used by callers that don't store their stack name. They must ensure + * all nodes have the same stack. + */ +int ocfs2_cluster_connect_agnostic(const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn); +int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending); +void ocfs2_cluster_hangup(const char *group, int grouplen); +int ocfs2_cluster_this_node(unsigned int *node); + +struct ocfs2_lock_res; +int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + struct ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen); +int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, + struct ocfs2_dlm_lksb *lksb, + u32 flags); + +int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb); +int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb); +void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb); +void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb); + +int ocfs2_stack_supports_plocks(void); +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl); + +void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto); + + +/* Used by stack plugins */ +int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); +void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); + +#endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c new file mode 100644 index 00000000..f169da46 --- /dev/null +++ b/fs/ocfs2/suballoc.c @@ -0,0 +1,2908 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.c + * + * metadata alloc and free + * Inspired by ext3 block groups. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +#include "buffer_head_io.h" + +#define NOT_ALLOC_NEW_GROUP 0 +#define ALLOC_NEW_GROUP 0x1 +#define ALLOC_GROUPS_FROM_GLOBAL 0x2 + +#define OCFS2_MAX_TO_STEAL 1024 + +struct ocfs2_suballoc_result { + u64 sr_bg_blkno; /* The bg we allocated from. Set + to 0 when a block group is + contiguous. */ + u64 sr_bg_stable_blkno; /* + * Doesn't change, always + * set to target block + * group descriptor + * block. + */ + u64 sr_blkno; /* The first allocated block */ + unsigned int sr_bit_offset; /* The bit in the bg */ + unsigned int sr_bits; /* How many bits we claimed */ +}; + +static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) +{ + if (res->sr_blkno == 0) + return 0; + + if (res->sr_bg_blkno) + return res->sr_bg_blkno; + + return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); +} + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); +static int ocfs2_block_group_fill(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + unsigned int group_clusters, + u16 my_chain, + struct ocfs2_chain_list *cl); +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh, + u64 max_block, + u64 *last_alloc_group, + int flags); + +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res); +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res); +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res); +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr); +static inline int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); +static int ocfs2_relink_block_group(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain); +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted); +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off); +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off); +static 
int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + int flags, + struct ocfs2_alloc_context **ac); + +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) +{ + struct inode *inode = ac->ac_inode; + + if (inode) { + if (ac->ac_which != OCFS2_AC_USE_LOCAL) + ocfs2_inode_unlock(inode, 1); + + mutex_unlock(&inode->i_mutex); + + iput(inode); + ac->ac_inode = NULL; + } + brelse(ac->ac_bh); + ac->ac_bh = NULL; + ac->ac_resv = NULL; + if (ac->ac_find_loc_priv) { + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; + } +} + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +{ + ocfs2_free_ac_resource(ac); + kfree(ac); +} + +static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) +{ + return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); +} + +#define do_error(fmt, ...) \ + do{ \ + if (resize) \ + mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ + else \ + ocfs2_error(sb, fmt, ##__VA_ARGS__); \ + } while (0) + +static int ocfs2_validate_gd_self(struct super_block *sb, + struct buffer_head *bh, + int resize) +{ + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { + do_error("Group descriptor #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + gd->bg_signature); + return -EINVAL; + } + + if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { + do_error("Group descriptor #%llu has an invalid bg_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { + do_error("Group descriptor #%llu has an invalid " + "fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(gd->bg_generation)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { + do_error("Group descriptor #%llu has bit count %u but " + "claims that %u are free", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + le16_to_cpu(gd->bg_free_bits_count)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { + do_error("Group descriptor #%llu has bit count %u but " + "max bitmap bits of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + 8 * le16_to_cpu(gd->bg_size)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_validate_gd_parent(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh, + int resize) +{ + unsigned int max_bits; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + if (di->i_blkno != gd->bg_parent_dinode) { + do_error("Group descriptor #%llu has bad parent " + "pointer (%llu, expected %llu)", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), + (unsigned long long)le64_to_cpu(di->i_blkno)); + return -EINVAL; + } + + max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); + if (le16_to_cpu(gd->bg_bits) > max_bits) { + do_error("Group descriptor #%llu has bit count of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits)); + return -EINVAL; + } + + /* In resize, we may meet the case bg_chain == cl_next_free_rec. 
*/ + if ((le16_to_cpu(gd->bg_chain) > + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || + ((le16_to_cpu(gd->bg_chain) == + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { + do_error("Group descriptor #%llu has bad chain %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_chain)); + return -EINVAL; + } + + return 0; +} + +#undef do_error + +/* + * This version only prints errors. It does not fail the filesystem, and + * exists only for resize. + */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) { + mlog(ML_ERROR, + "Checksum failed for group descriptor %llu\n", + (unsigned long long)bh->b_blocknr); + } else + rc = ocfs2_validate_gd_self(sb, bh, 1); + if (!rc) + rc = ocfs2_validate_gd_parent(sb, di, bh, 1); + + return rc; +} + +static int ocfs2_validate_group_descriptor(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + trace_ocfs2_validate_group_descriptor( + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) + return rc; + + /* + * Errors after here are fatal. + */ + + return ocfs2_validate_gd_self(sb, bh, 0); +} + +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, + ocfs2_validate_group_descriptor); + if (rc) + goto out; + + rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); + if (rc) { + brelse(tmp); + goto out; + } + + /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ + if (!*bh) + *bh = tmp; + +out: + return rc; +} + +static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, + struct ocfs2_group_desc *bg, + struct ocfs2_chain_list *cl, + u64 p_blkno, unsigned int clusters) +{ + struct ocfs2_extent_list *el = &bg->bg_list; + struct ocfs2_extent_rec *rec; + + BUG_ON(!ocfs2_supports_discontig_bg(osb)); + if (!el->l_next_free_rec) + el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb)); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)]; + rec->e_blkno = cpu_to_le64(p_blkno); + rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / + le16_to_cpu(cl->cl_bpc)); + rec->e_leaf_clusters = cpu_to_le16(clusters); + le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&bg->bg_free_bits_count, + clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&el->l_next_free_rec, 1); +} + +static int ocfs2_block_group_fill(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + unsigned int group_clusters, + u16 my_chain, + struct ocfs2_chain_list *cl) +{ + int status = 0; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct super_block * sb = alloc_inode->i_sb; + + if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { + ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " + "b_blocknr (%llu)", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bg, 0, sb->s_blocksize); + strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); + bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, + osb->s_feature_incompat)); + bg->bg_chain = cpu_to_le16(my_chain); + bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; + bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); + bg->bg_blkno = cpu_to_le64(group_blkno); + if (group_clusters == le16_to_cpu(cl->cl_cpg)) + bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + else + ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno, + group_clusters); + + /* set the 1st bit in the bitmap to account for the descriptor block */ + ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); + bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); + + ocfs2_journal_dirty(handle, bg_bh); + + /* There is no need to zero out or otherwise initialize the + * other blocks in a group - All valid FS metadata in a block + * group stores the superblock fs_generation value at + * allocation time. 
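+ * Readers validate that generation, so stale contents from a previous
+ * use of these blocks are never mistaken for live metadata.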
*/ + +bail: + if (status) + mlog_errno(status); + return status; +} + +static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_count)) { + if (le32_to_cpu(cl->cl_recs[best].c_total) > + le32_to_cpu(cl->cl_recs[curr].c_total)) + best = curr; + curr++; + } + return best; +} + +static struct buffer_head * +ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + struct buffer_head *bg_bh; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + + status = ocfs2_claim_clusters(handle, ac, + le16_to_cpu(cl->cl_cpg), &bit_off, + &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_block_group_alloc_contig( + (unsigned long long)bg_blkno, alloc_rec); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + brelse(bg_bh); + mlog_errno(status); + } + +bail: + return status ? ERR_PTR(status) : bg_bh; +} + +static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + unsigned int min_bits, + u32 *bit_off, u32 *num_bits) +{ + int status = 0; + + while (min_bits) { + status = ocfs2_claim_clusters(handle, ac, min_bits, + bit_off, num_bits); + if (status != -ENOSPC) + break; + + min_bits >>= 1; + } + + return status; +} + +static int ocfs2_block_group_grow_discontig(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl, + unsigned int min_bits) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_group_desc *bg = + (struct ocfs2_group_desc *)bg_bh->b_data; + unsigned int needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + u32 p_cpos, clusters; + u64 p_blkno; + struct ocfs2_extent_list *el = &bg->bg_list; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count))) { + if (min_bits > needed) + min_bits = needed; + status = ocfs2_block_group_claim_bits(osb, handle, ac, + min_bits, &p_cpos, + &clusters); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos); + ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno, + clusters); + + min_bits = clusters; + needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + } + + if (needed > 0) { + /* + * We have used up all the extent rec but can't fill up + * the cpg. So bail out. 
+ */ + status = -ENOSPC; + goto bail; + } + + ocfs2_journal_dirty(handle, bg_bh); + +bail: + return status; +} + +static void ocfs2_bg_alloc_cleanup(handle_t *handle, + struct ocfs2_alloc_context *cluster_ac, + struct inode *alloc_inode, + struct buffer_head *bg_bh) +{ + int i, ret; + struct ocfs2_group_desc *bg; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + if (!bg_bh) + return; + + bg = (struct ocfs2_group_desc *)bg_bh->b_data; + el = &bg->bg_list; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, + cluster_ac->ac_bh, + le64_to_cpu(rec->e_blkno), + le16_to_cpu(rec->e_leaf_clusters)); + if (ret) + mlog_errno(ret); + /* Try all the clusters to free */ + } + + ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh); + brelse(bg_bh); +} + +static struct buffer_head * +ocfs2_block_group_alloc_discontig(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1; + struct buffer_head *bg_bh = NULL; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + + if (!ocfs2_supports_discontig_bg(osb)) { + status = -ENOSPC; + goto bail; + } + + status = ocfs2_extend_trans(handle, + ocfs2_calc_bg_discontig_credits(osb->sb)); + if (status) { + mlog_errno(status); + goto bail; + } + + /* + * We're going to be grabbing from multiple cluster groups. + * We don't have enough credits to relink them all, and the + * cluster groups will be staying in cache for the duration of + * this operation. + */ + ac->ac_allow_chain_relink = 0; + + /* Claim the first region */ + status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, + &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + min_bits = num_bits; + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_block_group_alloc_discontig( + (unsigned long long)bg_blkno, alloc_rec); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_grow_discontig(handle, alloc_inode, + bg_bh, ac, cl, min_bits); + if (status) + mlog_errno(status); + +bail: + if (status) + ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh); + return status ? ERR_PTR(status) : bg_bh; +} + +/* + * We expect the block group allocator to already be locked. 
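+ * (That is, the caller already holds the allocator inode's i_mutex and
+ * cluster lock, as ocfs2_reserve_suballoc_bits() does before calling
+ * down here.)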
+ */ +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh, + u64 max_block, + u64 *last_alloc_group, + int flags) +{ + int status, credits; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + struct ocfs2_chain_list *cl; + struct ocfs2_alloc_context *ac = NULL; + handle_t *handle = NULL; + u16 alloc_rec; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + + BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); + + cl = &fe->id2.i_chain; + status = ocfs2_reserve_clusters_with_limit(osb, + le16_to_cpu(cl->cl_cpg), + max_block, flags, &ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + credits = ocfs2_calc_group_alloc_credits(osb->sb, + le16_to_cpu(cl->cl_cpg)); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + if (last_alloc_group && *last_alloc_group != 0) { + trace_ocfs2_block_group_alloc( + (unsigned long long)*last_alloc_group); + ac->ac_last_group = *last_alloc_group; + } + + bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, + ac, cl); + if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + bg_bh = ocfs2_block_group_alloc_discontig(handle, + alloc_inode, + ac, cl); + if (IS_ERR(bg_bh)) { + status = PTR_ERR(bg_bh); + bg_bh = NULL; + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + alloc_rec = le16_to_cpu(bg->bg_chain); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, + le16_to_cpu(bg->bg_bits)); + cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno; + if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) + le16_add_cpu(&cl->cl_next_free_rec, 1); + + le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); + le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); + + ocfs2_journal_dirty(handle, bh); + + spin_lock(&OCFS2_I(alloc_inode)->ip_lock); + OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, + le32_to_cpu(fe->i_clusters))); + spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); + i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); + alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + + status = 0; + + /* save the new last alloc group so that the caller can cache it. 
*/ + if (last_alloc_group) + *last_alloc_group = ac->ac_last_group; + +bail: + if (handle) + ocfs2_commit_trans(osb, handle); + + if (ac) + ocfs2_free_alloc_context(ac); + + brelse(bg_bh); + + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + int type, + u32 slot, + u64 *last_alloc_group, + int flags) +{ + int status; + u32 bits_wanted = ac->ac_bits_wanted; + struct inode *alloc_inode; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe; + u32 free_bits; + + alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); + if (!alloc_inode) { + mlog_errno(-EINVAL); + return -EINVAL; + } + + mutex_lock(&alloc_inode->i_mutex); + + status = ocfs2_inode_lock(alloc_inode, &bh, 1); + if (status < 0) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + + mlog_errno(status); + return status; + } + + ac->ac_inode = alloc_inode; + ac->ac_alloc_slot = slot; + + fe = (struct ocfs2_dinode *) bh->b_data; + + /* The bh was validated by the inode read inside + * ocfs2_inode_lock(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { + ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", + (unsigned long long)le64_to_cpu(fe->i_blkno)); + status = -EIO; + goto bail; + } + + free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - + le32_to_cpu(fe->id1.bitmap1.i_used); + + if (bits_wanted > free_bits) { + /* cluster bitmap never grows */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted, + free_bits); + status = -ENOSPC; + goto bail; + } + + if (!(flags & ALLOC_NEW_GROUP)) { + trace_ocfs2_reserve_suballoc_bits_no_new_group( + slot, bits_wanted, free_bits); + status = -ENOSPC; + goto bail; + } + + status = ocfs2_block_group_alloc(osb, alloc_inode, bh, + ac->ac_max_block, + last_alloc_group, flags); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_extends); + + /* You should never ask for this much metadata */ + BUG_ON(bits_wanted > + (le32_to_cpu(fe->id1.bitmap1.i_total) + - le32_to_cpu(fe->id1.bitmap1.i_used))); + } + + get_bh(bh); + ac->ac_bh = bh; +bail: + brelse(bh); + + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_inodes_stolen, 0); +} + +static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_meta_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_meta_stolen, 0); +} + +void ocfs2_init_steal_slots(struct ocfs2_super *osb) +{ + ocfs2_init_inode_steal_slot(osb); + ocfs2_init_meta_steal_slot(osb); +} + +static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) +{ + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + osb->s_inode_steal_slot = slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + osb->s_meta_steal_slot = slot; + spin_unlock(&osb->osb_lock); +} + +static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type) +{ + int slot = OCFS2_INVALID_SLOT; + + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + slot = osb->s_inode_steal_slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + slot = osb->s_meta_steal_slot; + 
spin_unlock(&osb->osb_lock); + + return slot; +} + +static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_resource(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + int type) +{ + int i, status = -ENOSPC; + int slot = __ocfs2_get_steal_slot(osb, type); + + /* Start to steal resource from the first slot after ours. */ + if (slot == OCFS2_INVALID_SLOT) + slot = osb->slot_num + 1; + + for (i = 0; i < osb->max_slots; i++, slot++) { + if (slot == osb->max_slots) + slot = 0; + + if (slot == osb->slot_num) + continue; + + status = ocfs2_reserve_suballoc_bits(osb, ac, + type, + (u32)slot, NULL, + NOT_ALLOC_NEW_GROUP); + if (status >= 0) { + __ocfs2_set_steal_slot(osb, slot, type); + break; + } + + ocfs2_free_ac_resource(ac); + } + + return status; +} + +static int ocfs2_steal_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_meta(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE); +} + +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac) +{ + int status; + int slot = ocfs2_get_meta_steal_slot(osb); + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = blocks; + (*ac)->ac_which = OCFS2_AC_USE_META; + (*ac)->ac_group_search = ocfs2_block_group_search; + + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL) + goto extent_steal; + + atomic_set(&osb->s_num_meta_stolen, 0); + status = ocfs2_reserve_suballoc_bits(osb, (*ac), + EXTENT_ALLOC_SYSTEM_INODE, + (u32)osb->slot_num, NULL, + ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP); + + + if (status >= 0) { + status = 0; + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_meta_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +extent_steal: + status = ocfs2_steal_meta(osb, *ac); + atomic_inc(&osb->s_num_meta_stolen); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_new_metadata_blocks(osb, + ocfs2_extend_meta_needed(root_el), + ac); +} + +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac) +{ + int status; + int slot = ocfs2_get_inode_steal_slot(osb); + u64 alloc_group; + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = 1; + (*ac)->ac_which = OCFS2_AC_USE_INODE; + + (*ac)->ac_group_search = ocfs2_block_group_search; + + /* + * stat(2) can't handle i_ino > 32bits, so we tell the + * lower levels not to allocate us a block group past that + * limit. The 'inode64' mount option avoids this behavior. 
+ */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) + (*ac)->ac_max_block = (u32)~0U; + + /* + * slot is set when we successfully steal inode from other nodes. + * It is reset in 3 places: + * 1. when we flush the truncate log + * 2. when we complete local alloc recovery. + * 3. when we successfully allocate from our own slot. + * After it is set, we will go on stealing inodes until we find the + * need to check our slots to see whether there is some space for us. + */ + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL) + goto inode_steal; + + atomic_set(&osb->s_num_inodes_stolen, 0); + alloc_group = osb->osb_inode_alloc_group; + status = ocfs2_reserve_suballoc_bits(osb, *ac, + INODE_ALLOC_SYSTEM_INODE, + (u32)osb->slot_num, + &alloc_group, + ALLOC_NEW_GROUP | + ALLOC_GROUPS_FROM_GLOBAL); + if (status >= 0) { + status = 0; + + spin_lock(&osb->osb_lock); + osb->osb_inode_alloc_group = alloc_group; + spin_unlock(&osb->osb_lock); + trace_ocfs2_reserve_new_inode_new_group( + (unsigned long long)alloc_group); + + /* + * Some inodes must be freed by us, so try to allocate + * from our own next time. + */ + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_inode_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +inode_steal: + status = ocfs2_steal_inode(osb, *ac); + atomic_inc(&osb->s_num_inodes_stolen); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +/* local alloc code has to do the same thing, so rather than do this + * twice.. */ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int status; + + ac->ac_which = OCFS2_AC_USE_MAIN; + ac->ac_group_search = ocfs2_cluster_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, ac, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, NULL, + ALLOC_NEW_GROUP); + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + +bail: + return status; +} + +/* Callers don't need to care which bitmap (local alloc or main) to + * use so we figure it out for them, but unfortunately this clutters + * things a bit. 
*/ +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + int flags, + struct ocfs2_alloc_context **ac) +{ + int status; + + *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_max_block = max_block; + + status = -ENOSPC; + if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && + ocfs2_alloc_should_use_local(osb, bits_wanted)) { + status = ocfs2_reserve_local_alloc_bits(osb, + bits_wanted, + *ac); + if ((status < 0) && (status != -ENOSPC)) { + mlog_errno(status); + goto bail; + } + } + + if (status == -ENOSPC) { + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, + ALLOC_NEW_GROUP, ac); +} + +/* + * More or less lifted from ext3. I'll leave their description below: + * + * "For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes." + * + * Note: OCFS2 already does this differently for metadata vs data + * allocations, as those bitmaps are separate and undo access is never + * called on a metadata group descriptor. + */ +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr) +{ + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + int ret; + + if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) + return 0; + + if (!buffer_jbd(bg_bh)) + return 1; + + jbd_lock_bh_state(bg_bh); + bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + jbd_unlock_bh_state(bg_bh); + + return ret; +} + +static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, + struct buffer_head *bg_bh, + unsigned int bits_wanted, + unsigned int total_bits, + struct ocfs2_suballoc_result *res) +{ + void *bitmap; + u16 best_offset, best_size; + int offset, start, found, status = 0; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + /* Callers got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
*/ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + + found = start = best_offset = best_size = 0; + bitmap = bg->bg_bitmap; + + while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { + if (offset == total_bits) + break; + + if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { + /* We found a zero, but we can't use it as it + * hasn't been put to disk yet! */ + found = 0; + start = offset + 1; + } else if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_size) { + best_size = found; + best_offset = start - found; + } + /* we got everything we needed */ + if (found == bits_wanted) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + if (best_size) { + res->sr_bit_offset = best_offset; + res->sr_bits = best_size; + } else { + status = -ENOSPC; + /* No error log here -- see the comment above + * ocfs2_test_bg_bit_allocatable */ + } + + return status; +} + +static inline int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + void *bitmap = bg->bg_bitmap; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + + /* All callers get the descriptor via + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); + + trace_ocfs2_block_group_set_bits(bit_off, num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } + while(num_bits--) + ocfs2_set_bit(bit_off++, bitmap); + + ocfs2_journal_dirty(handle, group_bh); + +bail: + if (status) + mlog_errno(status); + return status; +} + +/* find the one with the most empty bits */ +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + BUG_ON(!cl->cl_next_free_rec); + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_next_free_rec)) { + if (le32_to_cpu(cl->cl_recs[curr].c_free) > + le32_to_cpu(cl->cl_recs[best].c_free)) + best = curr; + curr++; + } + + BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); + return best; +} + +static int ocfs2_relink_block_group(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain) +{ + int status; + /* there is a really tiny chance the journal calls could fail, + * but we wouldn't want inconsistent blocks in *any* case. */ + u64 fe_ptr, bg_ptr, prev_bg_ptr; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; + + /* The caller got these descriptors from + * ocfs2_read_group_descriptor(). 
Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); + + trace_ocfs2_relink_block_group( + (unsigned long long)le64_to_cpu(fe->i_blkno), chain, + (unsigned long long)le64_to_cpu(bg->bg_blkno), + (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); + + fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); + bg_ptr = le64_to_cpu(bg->bg_next_group); + prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); + + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + prev_bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + prev_bg->bg_next_group = bg->bg_next_group; + ocfs2_journal_dirty(handle, prev_bg_bh); + + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; + ocfs2_journal_dirty(handle, bg_bh); + + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; + ocfs2_journal_dirty(handle, fe_bh); + +out_rollback: + if (status < 0) { + fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); + bg->bg_next_group = cpu_to_le64(bg_ptr); + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + } + + if (status) + mlog_errno(status); + return status; +} + +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted) +{ + return le16_to_cpu(bg->bg_free_bits_count) > wanted; +} + +/* return 0 on success, -ENOSPC to keep searching and any other < 0 + * value on error. */ +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res) +{ + int search = -ENOSPC; + int ret; + u64 blkoff; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int max_bits, gd_cluster_off; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (gd->bg_free_bits_count) { + max_bits = le16_to_cpu(gd->bg_bits); + + /* Tail groups in cluster bitmaps which aren't cpg + * aligned are prone to partial extension by a failed + * fs resize. If the file system resize never got to + * update the dinode cluster count, then we don't want + * to trust any clusters past it, regardless of what + * the group descriptor says. 
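+ * max_bits is clamped to the dinode's ip_clusters below for exactly
+ * that reason.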
*/ + gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(gd->bg_blkno)); + if ((gd_cluster_off + max_bits) > + OCFS2_I(inode)->ip_clusters) { + max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; + trace_ocfs2_cluster_group_search_wrong_max_bits( + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + OCFS2_I(inode)->ip_clusters, max_bits); + } + + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + max_bits, res); + if (ret) + return ret; + + if (max_block) { + blkoff = ocfs2_clusters_to_blocks(inode->i_sb, + gd_cluster_off + + res->sr_bit_offset + + res->sr_bits); + trace_ocfs2_cluster_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + return -ENOSPC; + } + + /* ocfs2_block_group_find_clear_bits() might + * return success, but we still want to return + * -ENOSPC unless it found the minimum number + * of bits. */ + if (min_bits <= res->sr_bits) + search = 0; /* success */ + else if (res->sr_bits) { + /* + * Don't show bits which we'll be returning + * for allocation to the local alloc bitmap. + */ + ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); + } + } + + return search; +} + +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u64 max_block, + struct ocfs2_suballoc_result *res) +{ + int ret = -ENOSPC; + u64 blkoff; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + + BUG_ON(min_bits != 1); + BUG_ON(ocfs2_is_cluster_bitmap(inode)); + + if (bg->bg_free_bits_count) { + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + le16_to_cpu(bg->bg_bits), + res); + if (!ret && max_block) { + blkoff = le64_to_cpu(bg->bg_blkno) + + res->sr_bit_offset + res->sr_bits; + trace_ocfs2_block_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + ret = -ENOSPC; + } + } + + return ret; +} + +static int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + int ret; + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); + ocfs2_journal_dirty(handle, di_bh); + +out: + return ret; +} + +static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, + struct ocfs2_extent_rec *rec, + struct ocfs2_chain_list *cl) +{ + unsigned int bpc = le16_to_cpu(cl->cl_bpc); + unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; + unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc; + + if (res->sr_bit_offset < bitoff) + return 0; + if (res->sr_bit_offset >= (bitoff + bitcount)) + return 0; + res->sr_blkno = le64_to_cpu(rec->e_blkno) + + (res->sr_bit_offset - bitoff); + if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount)) + res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset; + return 1; +} + +static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac, + struct ocfs2_group_desc *bg, + struct ocfs2_suballoc_result *res) +{ + int i; 
+ u64 bg_blkno = res->sr_bg_blkno; /* Save off */ + struct ocfs2_extent_rec *rec; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = &di->id2.i_chain; + + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) { + res->sr_blkno = 0; + return; + } + + res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset; + res->sr_bg_blkno = 0; /* Clear it for contig block groups */ + if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) || + !bg->bg_list.l_next_free_rec) + return; + + for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) { + rec = &bg->bg_list.l_recs[i]; + if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) { + res->sr_bg_blkno = bg_blkno; /* Restore */ + break; + } + } +} + +static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res, + u16 *bits_left) +{ + int ret; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *gd; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; + struct inode *alloc_inode = ac->ac_inode; + + ret = ocfs2_read_group_descriptor(alloc_inode, di, + res->sr_bg_blkno, &group_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + gd = (struct ocfs2_group_desc *) group_bh->b_data; + ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, + ac->ac_max_block, res); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + if (!ret) + ocfs2_bg_discontig_fix_result(ac, gd, res); + + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + if (ac->ac_find_loc_only) + goto out_loc_only; + + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, + res->sr_bit_offset, res->sr_bits); + if (ret < 0) + mlog_errno(ret); + +out_loc_only: + *bits_left = le16_to_cpu(gd->bg_free_bits_count); + +out: + brelse(group_bh); + + return ret; +} + +static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res, + u16 *bits_left) +{ + int status; + u16 chain; + u64 next_group; + struct inode *alloc_inode = ac->ac_inode; + struct buffer_head *group_bh = NULL; + struct buffer_head *prev_group_bh = NULL; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + struct ocfs2_group_desc *bg; + + chain = ac->ac_chain; + trace_ocfs2_search_chain_begin( + (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, + bits_wanted, chain); + + status = ocfs2_read_group_descriptor(alloc_inode, fe, + le64_to_cpu(cl->cl_recs[chain].c_blkno), + &group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + + status = -ENOSPC; + /* for now, the chain search is a bit simplistic. We just use + * the 1st group with any empty bits. 
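+ *
+ * The loop below walks the chain via bg_next_group, keeping the
+ * previous group's buffer head around so that a group which satisfies
+ * the search can later be relinked to the head of the chain.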
*/ + while ((status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, + ac->ac_max_block, + res)) == -ENOSPC) { + if (!bg->bg_next_group) + break; + + brelse(prev_group_bh); + prev_group_bh = NULL; + + next_group = le64_to_cpu(bg->bg_next_group); + prev_group_bh = group_bh; + group_bh = NULL; + status = ocfs2_read_group_descriptor(alloc_inode, fe, + next_group, &group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + trace_ocfs2_search_chain_succ( + (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits); + + res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); + + BUG_ON(res->sr_bits == 0); + if (!status) + ocfs2_bg_discontig_fix_result(ac, bg, res); + + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + /* + * Keep track of previous block descriptor read. When + * we find a target, if we have read more than X + * number of descriptors, and the target is reasonably + * empty, relink him to top of his chain. + * + * We've read 0 extra blocks and only send one more to + * the transaction, yet the next guy to search has a + * much easier time. + * + * Do this *after* figuring out how many bits we're taking out + * of our target group. + */ + if (ac->ac_allow_chain_relink && + (prev_group_bh) && + (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { + status = ocfs2_relink_block_group(handle, alloc_inode, + ac->ac_bh, group_bh, + prev_group_bh, chain); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (ac->ac_find_loc_only) + goto out_loc_only; + + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_set_bits(handle, + alloc_inode, + bg, + group_bh, + res->sr_bit_offset, + res->sr_bits); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_search_chain_end( + (unsigned long long)le64_to_cpu(fe->i_blkno), + res->sr_bits); + +out_loc_only: + *bits_left = le16_to_cpu(bg->bg_free_bits_count); +bail: + brelse(group_bh); + brelse(prev_group_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* will give out up to bits_wanted contiguous bits. */ +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, + handle_t *handle, + u32 bits_wanted, + u32 min_bits, + struct ocfs2_suballoc_result *res) +{ + int status; + u16 victim, i; + u16 bits_left = 0; + u64 hint = ac->ac_last_group; + struct ocfs2_chain_list *cl; + struct ocfs2_dinode *fe; + + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); + BUG_ON(!ac->ac_bh); + + fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* The bh was validated by the inode read during + * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. 
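+ *
+ * Search order: first retry the hinted group in ac_last_group, then
+ * the chain with the most free bits (relinking allowed), and finally
+ * every remaining chain with relinking disabled, since the journal
+ * was only credited for a single relink.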
*/ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + + if (le32_to_cpu(fe->id1.bitmap1.i_used) >= + le32_to_cpu(fe->id1.bitmap1.i_total)) { + ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used " + "bits but only %u total.", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); + status = -EIO; + goto bail; + } + + res->sr_bg_blkno = hint; + if (res->sr_bg_blkno) { + /* Attempt to short-circuit the usual search mechanism + * by jumping straight to the most recently used + * allocation group. This helps us maintain some + * contiguousness across allocations. */ + status = ocfs2_search_one_group(ac, handle, bits_wanted, + min_bits, res, &bits_left); + if (!status) + goto set_hint; + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } + + cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + + victim = ocfs2_find_victim_chain(cl); + ac->ac_chain = victim; + ac->ac_allow_chain_relink = 1; + + status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, + res, &bits_left); + if (!status) { + hint = ocfs2_group_from_res(res); + goto set_hint; + } + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + trace_ocfs2_claim_suballoc_bits(victim); + + /* If we didn't pick a good victim, then just default to + * searching each chain in order. Don't allow chain relinking + * because we only calculate enough journal credits for one + * relink per alloc. */ + ac->ac_allow_chain_relink = 0; + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { + if (i == victim) + continue; + if (!cl->cl_recs[i].c_free) + continue; + + ac->ac_chain = i; + status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, + res, &bits_left); + if (!status) { + hint = ocfs2_group_from_res(res); + break; + } + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } + +set_hint: + if (status != -ENOSPC) { + /* If the next search of this group is not likely to + * yield a suitable extent, then we reset the last + * group hint so as to not waste a disk read */ + if (bits_left < min_bits) + ac->ac_last_group = 0; + else + ac->ac_last_group = hint; + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_claim_metadata(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u64 *suballoc_loc, + u16 *suballoc_bit_start, + unsigned int *num_bits, + u64 *blkno_start) +{ + int status; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); + BUG_ON(ac->ac_which != OCFS2_AC_USE_META); + + status = ocfs2_claim_suballoc_bits(ac, + handle, + bits_wanted, + 1, + &res); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit_start = res.sr_bit_offset; + *blkno_start = res.sr_blkno; + ac->ac_bits_given += res.sr_bits; + *num_bits = res.sr_bits; + status = 0; +bail: + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_init_inode_ac_group(struct inode *dir, + struct buffer_head *parent_di_bh, + struct ocfs2_alloc_context *ac) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; + /* + * Try to allocate inodes from some specific group. + * + * If the parent dir has recorded the last group used in allocation, + * cool, use it. 
Otherwise if we try to allocate new inode from the + * same slot the parent dir belongs to, use the same chunk. + * + * We are very careful here to avoid the mistake of setting + * ac_last_group to a group descriptor from a different (unlocked) slot. + */ + if (OCFS2_I(dir)->ip_last_used_group && + OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) + ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; + else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { + if (di->i_suballoc_loc) + ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); + else + ac->ac_last_group = ocfs2_which_suballoc_group( + le64_to_cpu(di->i_blkno), + le16_to_cpu(di->i_suballoc_bit)); + } +} + +static inline void ocfs2_save_inode_ac_group(struct inode *dir, + struct ocfs2_alloc_context *ac) +{ + OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; + OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; +} + +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_suballoc_result *res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + res = kzalloc(sizeof(*res), GFP_NOFS); + if (res == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + /* + * The handle started here is for chain relink. Alternatively, + * we could just disable relink for these calls. + */ + handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + /* + * This will instruct ocfs2_claim_suballoc_bits and + * ocfs2_search_one_group to search but save actual allocation + * for later. + */ + ac->ac_find_loc_only = 1; + + ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ac->ac_find_loc_priv = res; + *fe_blkno = res->sr_blkno; + +out: + if (handle) + ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); + + if (ret) + kfree(res); + + return ret; +} + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno) +{ + int ret; + u16 chain; + struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* + * Since di_blkno is being passed back in, we check for any + * inconsistencies which may have happened between + * calls. 
These are code bugs as di_blkno is not expected to + * change once returned from ocfs2_find_new_inode_loc() + */ + BUG_ON(res->sr_blkno != di_blkno); + + ret = ocfs2_read_group_descriptor(ac->ac_inode, di, + res->sr_bg_stable_blkno, &bg_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + chain = le16_to_cpu(bg->bg_chain); + + ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, + ac->ac_inode, + bg, + bg_bh, + res->sr_bit_offset, + res->sr_bits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno, + res->sr_bits); + + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res->sr_bits != 1); + + *suballoc_loc = res->sr_bg_blkno; + *suballoc_bit = res->sr_bit_offset; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + +out: + brelse(bg_bh); + + return ret; +} + +int ocfs2_claim_new_inode(handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 *fe_blkno) +{ + int status; + struct ocfs2_suballoc_result res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + status = ocfs2_claim_suballoc_bits(ac, + handle, + 1, + 1, + &res); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res.sr_bits != 1); + + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit = res.sr_bit_offset; + *fe_blkno = res.sr_blkno; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + status = 0; +bail: + if (status) + mlog_errno(status); + return status; +} + +/* translate a group desc. blkno and it's bitmap offset into + * disk cluster offset. */ +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 cluster = 0; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (bg_blkno != osb->first_cluster_group_blkno) + cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); + cluster += (u32) bg_bit_off; + return cluster; +} + +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 group_no; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + group_no = cluster / osb->bitmap_cpg; + if (!group_no) + return osb->first_cluster_group_blkno; + return ocfs2_clusters_to_blocks(inode->i_sb, + group_no * osb->bitmap_cpg); +} + +/* given the block number of a cluster start, calculate which cluster + * group and descriptor bitmap offset that corresponds to. 
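+ * Roughly: data_cluster / bitmap_cpg picks the group, and the
+ * remainder (or the raw cluster number for the first group) becomes
+ * the bit offset.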
*/ +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + *bg_blkno = ocfs2_which_cluster_group(inode, + data_cluster); + + if (*bg_blkno == osb->first_cluster_group_blkno) + *bg_bit_off = (u16) data_cluster; + else + *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, + data_blkno - *bg_blkno); +} + +/* + * min_bits - minimum contiguous chunk from this total allocation we + * can handle. set to what we asked for originally for a full + * contig. allocation, set to '1' to indicate we can deal with extents + * of any size. + */ +int __ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 max_clusters, + u32 *cluster_start, + u32 *num_clusters) +{ + int status; + unsigned int bits_wanted = max_clusters; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; + struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); + + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL + && ac->ac_which != OCFS2_AC_USE_MAIN); + + if (ac->ac_which == OCFS2_AC_USE_LOCAL) { + WARN_ON(min_clusters > 1); + + status = ocfs2_claim_local_alloc_bits(osb, + handle, + ac, + bits_wanted, + cluster_start, + num_clusters); + if (!status) + atomic_inc(&osb->alloc_stats.local_data); + } else { + if (min_clusters > (osb->bitmap_cpg - 1)) { + /* The only paths asking for contiguousness + * should know about this already. */ + mlog(ML_ERROR, "minimum allocation requested %u exceeds " + "group bitmap size %u!\n", min_clusters, + osb->bitmap_cpg); + status = -ENOSPC; + goto bail; + } + /* clamp the current request down to a realistic size. */ + if (bits_wanted > (osb->bitmap_cpg - 1)) + bits_wanted = osb->bitmap_cpg - 1; + + status = ocfs2_claim_suballoc_bits(ac, + handle, + bits_wanted, + min_clusters, + &res); + if (!status) { + BUG_ON(res.sr_blkno); /* cluster alloc can't set */ + *cluster_start = + ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, + res.sr_bg_blkno, + res.sr_bit_offset); + atomic_inc(&osb->alloc_stats.bitmap_data); + *num_clusters = res.sr_bits; + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + ac->ac_bits_given += *num_clusters; + +bail: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters) +{ + unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; + + return __ocfs2_claim_clusters(handle, ac, min_clusters, + bits_wanted, cluster_start, num_clusters); +} + +static int ocfs2_block_group_clear_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits, + void (*undo_fn)(unsigned int bit, + unsigned long *bmap)) +{ + int status; + unsigned int tmp; + struct ocfs2_group_desc *undo_bg = NULL; + + /* The caller got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + + trace_ocfs2_block_group_clear_bits(bit_off, num_bits); + + BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + group_bh, + undo_fn ? 
+ OCFS2_JOURNAL_ACCESS_UNDO : + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (undo_fn) { + jbd_lock_bh_state(group_bh); + undo_bg = (struct ocfs2_group_desc *) + bh2jh(group_bh)->b_committed_data; + BUG_ON(!undo_bg); + } + + tmp = num_bits; + while(tmp--) { + ocfs2_clear_bit((bit_off + tmp), + (unsigned long *) bg->bg_bitmap); + if (undo_fn) + undo_fn(bit_off + tmp, + (unsigned long *) undo_bg->bg_bitmap); + } + le16_add_cpu(&bg->bg_free_bits_count, num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } + + if (undo_fn) + jbd_unlock_bh_state(group_bh); + + ocfs2_journal_dirty(handle, group_bh); +bail: + return status; +} + +/* + * expects the suballoc inode to already be locked. + */ +static int _ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) +{ + int status = 0; + u32 tmp_used; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group; + + /* The alloc_bh comes from ocfs2_free_dinode() or + * ocfs2_free_clusters(). The callers have all locked the + * allocator and gotten alloc_bh from the lock call. This + * validates the dinode buffer. Any corruption that has happened + * is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); + + trace_ocfs2_free_suballoc_bits( + (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, + (unsigned long long)bg_blkno, + start_bit, count); + + status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, + &group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + group = (struct ocfs2_group_desc *) group_bh->b_data; + + BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + + status = ocfs2_block_group_clear_bits(handle, alloc_inode, + group, group_bh, + start_bit, count, undo_fn); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, + count); + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); + ocfs2_journal_dirty(handle, alloc_bh); + +bail: + brelse(group_bh); + + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count) +{ + return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, + start_bit, bg_blkno, count, NULL); +} + +int ocfs2_free_dinode(handle_t *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di) +{ + u64 blk = le64_to_cpu(di->i_blkno); + u16 bit = le16_to_cpu(di->i_suballoc_bit); + u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + if (di->i_suballoc_loc) + bg_blkno = 
le64_to_cpu(di->i_suballoc_loc); + return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, + inode_alloc_bh, bit, bg_blkno, 1); +} + +static int _ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) +{ + int status; + u16 bg_start_bit; + u64 bg_blkno; + struct ocfs2_dinode *fe; + + /* You can't ever have a contiguous set of clusters + * bigger than a block group bitmap so we never have to worry + * about looping on them. + * This is expensive. We can safely remove once this stuff has + * gotten tested really well. */ + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + + fe = (struct ocfs2_dinode *) bitmap_bh->b_data; + + ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, + &bg_start_bit); + + trace_ocfs2_free_clusters((unsigned long long)bg_blkno, + (unsigned long long)start_blk, + bg_start_bit, num_clusters); + + status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, + bg_start_bit, bg_blkno, + num_clusters, undo_fn); + if (status < 0) { + mlog_errno(status); + goto out; + } + + ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), + num_clusters); + +out: + if (status) + mlog_errno(status); + return status; +} + +int ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_set_bit); +} + +/* + * Give never-used clusters back to the global bitmap. We don't need + * to protect these bits in the undo buffer. 
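+ * That is why _ocfs2_clear_bit is passed as the undo_fn here, whereas
+ * ocfs2_free_clusters() passes _ocfs2_set_bit so that freed bits stay
+ * allocated in the journal's committed copy until the transaction
+ * commits.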
+ */ +int ocfs2_release_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_clear_bit); +} + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) +{ + printk("Block Group:\n"); + printk("bg_signature: %s\n", bg->bg_signature); + printk("bg_size: %u\n", bg->bg_size); + printk("bg_bits: %u\n", bg->bg_bits); + printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); + printk("bg_chain: %u\n", bg->bg_chain); + printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); + printk("bg_next_group: %llu\n", + (unsigned long long)bg->bg_next_group); + printk("bg_parent_dinode: %llu\n", + (unsigned long long)bg->bg_parent_dinode); + printk("bg_blkno: %llu\n", + (unsigned long long)bg->bg_blkno); +} + +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) +{ + int i; + + printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); + printk("i_signature: %s\n", fe->i_signature); + printk("i_size: %llu\n", + (unsigned long long)fe->i_size); + printk("i_clusters: %u\n", fe->i_clusters); + printk("i_generation: %u\n", + le32_to_cpu(fe->i_generation)); + printk("id1.bitmap1.i_used: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_used)); + printk("id1.bitmap1.i_total: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_total)); + printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); + printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); + printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); + printk("id2.i_chain.cl_next_free_rec: %u\n", + fe->id2.i_chain.cl_next_free_rec); + for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { + printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_free); + printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_total); + printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, + (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); + } +} + +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Sparse file systems call this from ocfs2_write_begin_nolock() + * and ocfs2_allocate_unwritten_extents(). + * + * File systems which don't support holes call this from + * ocfs2_extend_allocation(). + */ +int ocfs2_lock_allocators(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + if (data_ac) + *data_ac = NULL; + + BUG_ON(clusters_to_add != 0 && data_ac == NULL); + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + * + * Always lock for any unwritten extents - we might want to + * add blocks during a split. 
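+ *
+ * Concretely, the check below reserves metadata whenever no free
+ * extent records remain, or (on a sparse filesystem) when fewer
+ * remain than the worst case of clusters_to_add plus two records per
+ * split.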
+ */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { + ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + if (clusters_to_add == 0) + goto out; + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} + +/* + * Read the inode specified by blkno to get suballoc_slot and + * suballoc_bit. + */ +static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, + u16 *suballoc_slot, u64 *group_blkno, + u16 *suballoc_bit) +{ + int status; + struct buffer_head *inode_bh = NULL; + struct ocfs2_dinode *inode_fe; + + trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno); + + /* dirty read disk */ + status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); + if (status < 0) { + mlog(ML_ERROR, "read block %llu failed %d\n", + (unsigned long long)blkno, status); + goto bail; + } + + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { + mlog(ML_ERROR, "invalid inode %llu requested\n", + (unsigned long long)blkno); + status = -EINVAL; + goto bail; + } + + if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { + mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", + (unsigned long long)blkno, + (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); + status = -EINVAL; + goto bail; + } + + if (suballoc_slot) + *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); + if (suballoc_bit) + *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + if (group_blkno) + *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); + +bail: + brelse(inode_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* + * test whether bit is SET in allocator bitmap or not. on success, 0 + * is returned and *res is 1 for SET; 0 otherwise. when fails, errno + * is returned and *res is meaningless. Call this after you have + * cluster locked against suballoc, or you may get a result based on + * non-up2date contents + */ +static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, + struct inode *suballoc, + struct buffer_head *alloc_bh, + u64 group_blkno, u64 blkno, + u16 bit, int *res) +{ + struct ocfs2_dinode *alloc_di; + struct ocfs2_group_desc *group; + struct buffer_head *group_bh = NULL; + u64 bg_blkno; + int status; + + trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, + (unsigned int)bit); + + alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; + if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { + mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", + (unsigned int)bit, + ocfs2_bits_per_group(&alloc_di->id2.i_chain)); + status = -EINVAL; + goto bail; + } + + bg_blkno = group_blkno ? 
group_blkno : + ocfs2_which_suballoc_group(blkno, bit); + status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, + &group_bh); + if (status < 0) { + mlog(ML_ERROR, "read group %llu failed %d\n", + (unsigned long long)bg_blkno, status); + goto bail; + } + + group = (struct ocfs2_group_desc *) group_bh->b_data; + *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); + +bail: + brelse(group_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* + * Test if the bit representing this inode (blkno) is set in the + * suballocator. + * + * On success, 0 is returned and *res is 1 for SET; 0 otherwise. + * + * In the event of failure, a negative value is returned and *res is + * meaningless. + * + * Callers must make sure to hold nfs_sync_lock to prevent + * ocfs2_delete_inode() on another node from accessing the same + * suballocator concurrently. + */ +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) +{ + int status; + u64 group_blkno = 0; + u16 suballoc_bit = 0, suballoc_slot = 0; + struct inode *inode_alloc_inode; + struct buffer_head *alloc_bh = NULL; + + trace_ocfs2_test_inode_bit((unsigned long long)blkno); + + status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, + &group_blkno, &suballoc_bit); + if (status < 0) { + mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); + goto bail; + } + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + suballoc_slot); + if (!inode_alloc_inode) { + /* the error code could be inaccurate, but we are not able to + * get the correct one. */ + status = -EINVAL; + mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", + (u32)suballoc_slot); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", + (u32)suballoc_slot, status); + goto bail; + } + + status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, + group_blkno, blkno, suballoc_bit, res); + if (status < 0) + mlog(ML_ERROR, "test suballoc bit failed %d\n", status); + + ocfs2_inode_unlock(inode_alloc_inode, 0); + mutex_unlock(&inode_alloc_inode->i_mutex); + + iput(inode_alloc_inode); + brelse(alloc_bh); +bail: + if (status) + mlog_errno(status); + return status; +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h new file mode 100644 index 00000000..b8afabfe --- /dev/null +++ b/fs/ocfs2/suballoc.h @@ -0,0 +1,221 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.h + * + * Defines sub allocator api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef _CHAINALLOC_H_ +#define _CHAINALLOC_H_ + +struct ocfs2_suballoc_result; +typedef int (group_search_t)(struct inode *, + struct buffer_head *, + u32, /* bits_wanted */ + u32, /* min_bits */ + u64, /* max_block */ + struct ocfs2_suballoc_result *); + /* found bits */ + +struct ocfs2_alloc_context { + struct inode *ac_inode; /* which bitmap are we allocating from? */ + struct buffer_head *ac_bh; /* file entry bh */ + u32 ac_alloc_slot; /* which slot are we allocating from? */ + u32 ac_bits_wanted; + u32 ac_bits_given; +#define OCFS2_AC_USE_LOCAL 1 +#define OCFS2_AC_USE_MAIN 2 +#define OCFS2_AC_USE_INODE 3 +#define OCFS2_AC_USE_META 4 + u32 ac_which; + + /* these are used by the chain search */ + u16 ac_chain; + int ac_allow_chain_relink; + group_search_t *ac_group_search; + + u64 ac_last_group; + u64 ac_max_block; /* Highest block number to allocate. 0 is + is the same as ~0 - unlimited */ + + int ac_find_loc_only; /* hack for reflink operation ordering */ + struct ocfs2_suballoc_result *ac_find_loc_priv; /* */ + + struct ocfs2_alloc_reservation *ac_resv; +}; + +void ocfs2_init_steal_slots(struct ocfs2_super *osb); +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); +static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) +{ + return ac->ac_bits_wanted - ac->ac_bits_given; +} + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac); + +int ocfs2_claim_metadata(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u64 *suballoc_loc, + u16 *suballoc_bit_start, + u32 *num_bits, + u64 *blkno_start); +int ocfs2_claim_new_inode(handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 *fe_blkno); +int ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters); +/* + * Use this variant of ocfs2_claim_clusters to specify a maxiumum + * number of clusters smaller than the allocation reserved. 
+ */ +int __ocfs2_claim_clusters(handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 max_clusters, + u32 *cluster_start, + u32 *num_clusters); + +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count); +int ocfs2_free_dinode(handle_t *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di); +int ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); +int ocfs2_release_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); + +static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) +{ + u64 group = block - (u64) bit; + + return group; +} + +static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, + u64 bg_blkno) +{ + /* This should work for all block group descriptors as only + * the 1st group descriptor of the cluster bitmap is + * different. */ + + if (bg_blkno == osb->first_cluster_group_blkno) + return 0; + + /* the rest of the block groups are located at the beginning + * of their 1st cluster, so a direct translation just + * works. */ + return ocfs2_blocks_to_clusters(osb->sb, bg_blkno); +} + +static inline int ocfs2_is_cluster_bitmap(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno; +} + +/* This is for local alloc ONLY. Others should use the task-specific + * apis above. */ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac); +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); + +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); + +/* + * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it + * finds a problem. A caller that wants to check a group descriptor + * without going readonly should read the block with ocfs2_read_block[s]() + * and then checking it with this function. This is only resize, really. + * Everyone else should be using ocfs2_read_group_descriptor(). + */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh); +/* + * Read a group descriptor block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The descriptor will be validated with + * ocfs2_validate_group_descriptor(). + */ +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh); + +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac); + +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); + + + +/* + * The following two interfaces are for ocfs2_create_inode_in_orphan(). 
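+ * ocfs2_find_new_inode_loc() only locates a suitable bit and stashes
+ * the result in ac->ac_find_loc_priv; the bit is actually claimed
+ * later by ocfs2_claim_new_inode_at_loc() inside the caller's
+ * transaction.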
+ */ +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno); + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno); + +#endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c new file mode 100644 index 00000000..56f61027 --- /dev/null +++ b/fs/ocfs2/super.c @@ -0,0 +1,2651 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.c + * + * load/unload driver, mount/dismount volumes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "ocfs2_trace.h" + +#include + +#include "ocfs2.h" + +/* this should be the only file to include a version 1 header */ +#include "ocfs1_fs_compat.h" + +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "export.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "namei.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ver.h" +#include "xattr.h" +#include "quota.h" +#include "refcounttree.h" +#include "suballoc.h" + +#include "buffer_head_io.h" + +static struct kmem_cache *ocfs2_inode_cachep = NULL; +struct kmem_cache *ocfs2_dquot_cachep; +struct kmem_cache *ocfs2_qf_chunk_cachep; + +/* OCFS2 needs to schedule several different types of work which + * require cluster locking, disk I/O, recovery waits, etc. Since these + * types of work tend to be heavy we avoid using the kernel events + * workqueue and schedule on our own. 
*/ +struct workqueue_struct *ocfs2_wq = NULL; + +static struct dentry *ocfs2_debugfs_root = NULL; + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +struct mount_options +{ + unsigned long commit_interval; + unsigned long mount_opt; + unsigned int atime_quantum; + signed short slot; + int localalloc_opt; + unsigned int resv_level; + int dir_resv_level; + char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; +}; + +static int ocfs2_parse_options(struct super_block *sb, char *options, + struct mount_options *mopt, + int is_remount); +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options); +static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); +static void ocfs2_put_super(struct super_block *sb); +static int ocfs2_mount_volume(struct super_block *sb); +static int ocfs2_remount(struct super_block *sb, int *flags, char *data); +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); +static int ocfs2_initialize_mem_caches(void); +static void ocfs2_free_mem_caches(void); +static void ocfs2_delete_osb(struct ocfs2_super *osb); + +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); + +static int ocfs2_sync_fs(struct super_block *sb, int wait); + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); +static void ocfs2_release_system_inodes(struct ocfs2_super *osb); +static int ocfs2_check_volume(struct ocfs2_super *osb); +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 sectsize, + struct ocfs2_blockcheck_stats *stats); +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size, + struct ocfs2_blockcheck_stats *stats); +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head **bh, + int block, + int sect_size); +static struct inode *ocfs2_alloc_inode(struct super_block *sb); +static void ocfs2_destroy_inode(struct inode *inode); +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); +static int ocfs2_enable_quotas(struct ocfs2_super *osb); +static void ocfs2_disable_quotas(struct ocfs2_super *osb); + +static const struct super_operations ocfs2_sops = { + .statfs = ocfs2_statfs, + .alloc_inode = ocfs2_alloc_inode, + .destroy_inode = ocfs2_destroy_inode, + .drop_inode = ocfs2_drop_inode, + .evict_inode = ocfs2_evict_inode, + .sync_fs = ocfs2_sync_fs, + .put_super = ocfs2_put_super, + .remount_fs = ocfs2_remount, + .show_options = ocfs2_show_options, + .quota_read = ocfs2_quota_read, + .quota_write = ocfs2_quota_write, +}; + +enum { + Opt_barrier, + Opt_err_panic, + Opt_err_ro, + Opt_intr, + Opt_nointr, + Opt_hb_none, + Opt_hb_local, + Opt_hb_global, + Opt_data_ordered, + Opt_data_writeback, + Opt_atime_quantum, + Opt_slot, + Opt_commit, + Opt_localalloc, + Opt_localflocks, + Opt_stack, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_inode64, + Opt_acl, + Opt_noacl, + Opt_usrquota, + Opt_grpquota, + Opt_coherency_buffered, + Opt_coherency_full, + Opt_resv_level, + Opt_dir_resv_level, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier=%u"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_intr, "intr"}, + {Opt_nointr, "nointr"}, + {Opt_hb_none, OCFS2_HB_NONE}, + {Opt_hb_local, OCFS2_HB_LOCAL}, + {Opt_hb_global, OCFS2_HB_GLOBAL}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_atime_quantum, "atime_quantum=%u"}, + {Opt_slot, 
"preferred_slot=%u"}, + {Opt_commit, "commit=%u"}, + {Opt_localalloc, "localalloc=%d"}, + {Opt_localflocks, "localflocks"}, + {Opt_stack, "cluster_stack=%s"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_inode64, "inode64"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_coherency_buffered, "coherency=buffered"}, + {Opt_coherency_full, "coherency=full"}, + {Opt_resv_level, "resv_level=%u"}, + {Opt_dir_resv_level, "dir_resv_level=%u"}, + {Opt_err, NULL} +}; + +#ifdef CONFIG_DEBUG_FS +static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) +{ + struct ocfs2_cluster_connection *cconn = osb->cconn; + struct ocfs2_recovery_map *rm = osb->recovery_map; + struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; + int i, out = 0; + + out += snprintf(buf + out, len - out, + "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", + "Device", osb->dev_str, osb->uuid_str, + osb->fs_generation, osb->vol_label); + + out += snprintf(buf + out, len - out, + "%10s => State: %d Flags: 0x%lX\n", "Volume", + atomic_read(&osb->vol_state), osb->osb_flags); + + out += snprintf(buf + out, len - out, + "%10s => Block: %lu Cluster: %d\n", "Sizes", + osb->sb->s_blocksize, osb->s_clustersize); + + out += snprintf(buf + out, len - out, + "%10s => Compat: 0x%X Incompat: 0x%X " + "ROcompat: 0x%X\n", + "Features", osb->s_feature_compat, + osb->s_feature_incompat, osb->s_feature_ro_compat); + + out += snprintf(buf + out, len - out, + "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", + osb->s_mount_opt, osb->s_atime_quantum); + + if (cconn) { + out += snprintf(buf + out, len - out, + "%10s => Stack: %s Name: %*s " + "Version: %d.%d\n", "Cluster", + (*osb->osb_cluster_stack == '\0' ? + "o2cb" : osb->osb_cluster_stack), + cconn->cc_namelen, cconn->cc_name, + cconn->cc_version.pv_major, + cconn->cc_version.pv_minor); + } + + spin_lock(&osb->dc_task_lock); + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Count: %lu WakeSeq: %lu " + "WorkSeq: %lu\n", "DownCnvt", + (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), + osb->blocked_lock_count, osb->dc_wake_sequence, + osb->dc_work_sequence); + spin_unlock(&osb->dc_task_lock); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", + "Recovery", + (osb->recovery_thread_task ? + task_pid_nr(osb->recovery_thread_task) : -1)); + if (rm->rm_used == 0) + out += snprintf(buf + out, len - out, " None\n"); + else { + for (i = 0; i < rm->rm_used; i++) + out += snprintf(buf + out, len - out, " %d", + rm->rm_entries[i]); + out += snprintf(buf + out, len - out, "\n"); + } + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, + "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", + (osb->commit_task ? 
task_pid_nr(osb->commit_task) : -1), + osb->osb_commit_interval, + atomic_read(&osb->needs_checkpoint)); + + out += snprintf(buf + out, len - out, + "%10s => State: %d TxnId: %lu NumTxns: %d\n", + "Journal", osb->journal->j_state, + osb->journal->j_trans_id, + atomic_read(&osb->journal->j_num_trans)); + + out += snprintf(buf + out, len - out, + "%10s => GlobalAllocs: %d LocalAllocs: %d " + "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", + "Stats", + atomic_read(&osb->alloc_stats.bitmap_data), + atomic_read(&osb->alloc_stats.local_data), + atomic_read(&osb->alloc_stats.bg_allocs), + atomic_read(&osb->alloc_stats.moves), + atomic_read(&osb->alloc_stats.bg_extends)); + + out += snprintf(buf + out, len - out, + "%10s => State: %u Descriptor: %llu Size: %u bits " + "Default: %u bits\n", + "LocalAlloc", osb->local_alloc_state, + (unsigned long long)osb->la_last_gd, + osb->local_alloc_bits, osb->local_alloc_default_bits); + + spin_lock(&osb->osb_lock); + out += snprintf(buf + out, len - out, + "%10s => InodeSlot: %d StolenInodes: %d, " + "MetaSlot: %d StolenMeta: %d\n", "Steal", + osb->s_inode_steal_slot, + atomic_read(&osb->s_num_inodes_stolen), + osb->s_meta_steal_slot, + atomic_read(&osb->s_num_meta_stolen)); + spin_unlock(&osb->osb_lock); + + out += snprintf(buf + out, len - out, "OrphanScan => "); + out += snprintf(buf + out, len - out, "Local: %u Global: %u ", + os->os_count, os->os_seqno); + out += snprintf(buf + out, len - out, " Last Scan: "); + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) + out += snprintf(buf + out, len - out, "Disabled\n"); + else + out += snprintf(buf + out, len - out, "%lu seconds ago\n", + (get_seconds() - os->os_scantime.tv_sec)); + + out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", + "Slots", "Num", "RecoGen"); + for (i = 0; i < osb->max_slots; ++i) { + out += snprintf(buf + out, len - out, + "%10s %c %3d %10d\n", + " ", + (i == osb->slot_num ? 
'*' : ' '), + i, osb->slot_recovery_generations[i]); + } + + return out; +} + +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + struct ocfs2_super *osb = inode->i_private; + char *buf = NULL; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto bail; + + i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); + + file->private_data = buf; + + return 0; +bail: + return -ENOMEM; +} + +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} +#else +static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) +{ + return 0; +} +static int ocfs2_debug_release(struct inode *inode, struct file *file) +{ + return 0; +} +static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static const struct file_operations ocfs2_osb_debug_fops = { + .open = ocfs2_osb_debug_open, + .release = ocfs2_debug_release, + .read = ocfs2_debug_read, + .llseek = generic_file_llseek, +}; + +static int ocfs2_sync_fs(struct super_block *sb, int wait) +{ + int status; + tid_t target; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (wait) { + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + } else { + ocfs2_schedule_truncate_log_flush(osb, 0); + } + + if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + &target)) { + if (wait) + jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); + } + return 0; +} + +static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) +{ + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) + && (ino == USER_QUOTA_SYSTEM_INODE + || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) + return 0; + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + && (ino == GROUP_QUOTA_SYSTEM_INODE + || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) + return 0; + return 1; +} + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->root_inode = new; + + new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->sys_root_inode = new; + + for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; + i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { + if (!ocfs2_need_system_inode(osb, i)) + continue; + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog_errno(status); + /* FIXME: Should ERROR_RO_FS */ + mlog(ML_ERROR, "Unable to load system inode %d, " + "possibly corrupt fs?", i); + goto bail; + } + // the array now has one ref, so drop this one + iput(new); + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; + i < NUM_SYSTEM_INODES; + i++) { + if 
(!ocfs2_need_system_inode(osb, i)) + continue; + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", + status, i, osb->slot_num); + goto bail; + } + /* the array now has one ref, so drop this one */ + iput(new); + } + +bail: + if (status) + mlog_errno(status); + return status; +} + +static void ocfs2_release_system_inodes(struct ocfs2_super *osb) +{ + int i; + struct inode *inode; + + for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { + inode = osb->global_system_inodes[i]; + if (inode) { + iput(inode); + osb->global_system_inodes[i] = NULL; + } + } + + inode = osb->sys_root_inode; + if (inode) { + iput(inode); + osb->sys_root_inode = NULL; + } + + inode = osb->root_inode; + if (inode) { + iput(inode); + osb->root_inode = NULL; + } + + if (!osb->local_system_inodes) + return; + + for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { + if (osb->local_system_inodes[i]) { + iput(osb->local_system_inodes[i]); + osb->local_system_inodes[i] = NULL; + } + } + + kfree(osb->local_system_inodes); + osb->local_system_inodes = NULL; +} + +/* We're allocating fs objects, use GFP_NOFS */ +static struct inode *ocfs2_alloc_inode(struct super_block *sb) +{ + struct ocfs2_inode_info *oi; + + oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); + if (!oi) + return NULL; + + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); + return &oi->vfs_inode; +} + +static void ocfs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); +} + +static void ocfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ocfs2_i_callback); +} + +static unsigned long long ocfs2_max_file_offset(unsigned int bbits, + unsigned int cbits) +{ + unsigned int bytes = 1 << cbits; + unsigned int trim = bytes; + unsigned int bitshift = 32; + + /* + * i_size and all block offsets in ocfs2 are always 64 bits + * wide. i_clusters is 32 bits, in cluster-sized units. So on + * 64 bit platforms, cluster size will be the limiting factor. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBDAF) + BUILD_BUG_ON(sizeof(sector_t) != 8); + /* + * We might be limited by page cache size. + */ + if (bytes > PAGE_CACHE_SIZE) { + bytes = PAGE_CACHE_SIZE; + trim = 1; + /* + * Shift by 31 here so that we don't get larger than + * MAX_LFS_FILESIZE + */ + bitshift = 31; + } +# else + /* + * We are limited by the size of sector_t. Use block size, as + * that's what we expose to the VFS. + */ + bytes = 1 << bbits; + trim = 1; + bitshift = 31; +# endif +#endif + + /* + * Trim by a whole cluster when we can actually approach the + * on-disk limits. Otherwise we can overflow i_clusters when + * an extent start is at the max offset. 
+ */ + return (((unsigned long long)bytes) << bitshift) - trim; +} + +static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +{ + int incompat_features; + int ret = 0; + struct mount_options parsed_options; + struct ocfs2_super *osb = OCFS2_SB(sb); + u32 tmp; + + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || + !ocfs2_check_set_options(sb, &parsed_options)) { + ret = -EINVAL; + goto out; + } + + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); + goto out; + } + + if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != + (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change data mode on remount\n"); + goto out; + } + + /* Probably don't want this on remount; it might + * mess with other nodes */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && + (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); + goto out; + } + + /* We're going to/from readonly mode. */ + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Disable quota accounting before remounting RO */ + if (*flags & MS_RDONLY) { + ret = ocfs2_susp_quotas(osb, 0); + if (ret < 0) + goto out; + } + /* Lock here so the check of HARD_RO and the potential + * setting of SOFT_RO is atomic. */ + spin_lock(&osb->osb_lock); + if (osb->osb_flags & OCFS2_OSB_HARD_RO) { + mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); + ret = -EROFS; + goto unlock_osb; + } + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + } else { + if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { + mlog(ML_ERROR, "Cannot remount RDWR " + "filesystem due to previous errors.\n"); + ret = -EROFS; + goto unlock_osb; + } + incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); + if (incompat_features) { + mlog(ML_ERROR, "Cannot remount RDWR because " + "of unsupported optional features " + "(%x).\n", incompat_features); + ret = -EINVAL; + goto unlock_osb; + } + sb->s_flags &= ~MS_RDONLY; + osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; + } + trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); +unlock_osb: + spin_unlock(&osb->osb_lock); + /* Enable quota accounting after remounting RW */ + if (!ret && !(*flags & MS_RDONLY)) { + if (sb_any_quota_suspended(sb)) + ret = ocfs2_susp_quotas(osb, 1); + else + ret = ocfs2_enable_quotas(osb); + if (ret < 0) { + /* Return back changes... */ + spin_lock(&osb->osb_lock); + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + goto out; + } + } + } + + if (!ret) { + /* Only save off the new mount options in case of a successful + * remount. */ + osb->s_mount_opt = parsed_options.mount_opt; + osb->s_atime_quantum = parsed_options.atime_quantum; + osb->preferred_slot = parsed_options.slot; + if (parsed_options.commit_interval) + osb->osb_commit_interval = parsed_options.commit_interval; + + if (!ocfs2_is_hard_readonly(osb)) + ocfs2_set_journal_params(osb); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? 
+ MS_POSIXACL : 0); + } +out: + return ret; +} + +static int ocfs2_sb_probe(struct super_block *sb, + struct buffer_head **bh, + int *sector_size, + struct ocfs2_blockcheck_stats *stats) +{ + int status, tmpstat; + struct ocfs1_vol_disk_hdr *hdr; + struct ocfs2_dinode *di; + int blksize; + + *bh = NULL; + + /* may be > 512 */ + *sector_size = bdev_logical_block_size(sb->s_bdev); + if (*sector_size > OCFS2_MAX_BLOCKSIZE) { + mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", + *sector_size, OCFS2_MAX_BLOCKSIZE); + status = -EINVAL; + goto bail; + } + + /* Can this really happen? */ + if (*sector_size < OCFS2_MIN_BLOCKSIZE) + *sector_size = OCFS2_MIN_BLOCKSIZE; + + /* check block zero for old format */ + status = ocfs2_get_sector(sb, bh, 0, *sector_size); + if (status < 0) { + mlog_errno(status); + goto bail; + } + hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; + if (hdr->major_version == OCFS1_MAJOR_VERSION) { + mlog(ML_ERROR, "incompatible version: %u.%u\n", + hdr->major_version, hdr->minor_version); + status = -EINVAL; + } + if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, + strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { + mlog(ML_ERROR, "incompatible volume signature: %8s\n", + hdr->signature); + status = -EINVAL; + } + brelse(*bh); + *bh = NULL; + if (status < 0) { + mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " + "upgraded before mounting with ocfs v2\n"); + goto bail; + } + + /* + * Now check at magic offset for 512, 1024, 2048, 4096 + * blocksizes. 4096 is the maximum blocksize because it is + * the minimum clustersize. + */ + status = -EINVAL; + for (blksize = *sector_size; + blksize <= OCFS2_MAX_BLOCKSIZE; + blksize <<= 1) { + tmpstat = ocfs2_get_sector(sb, bh, + OCFS2_SUPER_BLOCK_BLKNO, + blksize); + if (tmpstat < 0) { + status = tmpstat; + mlog_errno(status); + break; + } + di = (struct ocfs2_dinode *) (*bh)->b_data; + memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); + spin_lock_init(&stats->b_lock); + tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); + if (tmpstat < 0) { + brelse(*bh); + *bh = NULL; + } + if (tmpstat != -EAGAIN) { + status = tmpstat; + break; + } + } + +bail: + return status; +} + +static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) +{ + u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; + + if (osb->s_mount_opt & hb_enabled) { + if (ocfs2_mount_local(osb)) { + mlog(ML_ERROR, "Cannot heartbeat on a locally " + "mounted device.\n"); + return -EINVAL; + } + if (ocfs2_userspace_stack(osb)) { + mlog(ML_ERROR, "Userspace stack expected, but " + "o2cb heartbeat arguments passed to mount\n"); + return -EINVAL; + } + if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && + !ocfs2_cluster_o2cb_global_heartbeat(osb)) || + ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && + ocfs2_cluster_o2cb_global_heartbeat(osb))) { + mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); + return -EINVAL; + } + } + + if (!(osb->s_mount_opt & hb_enabled)) { + if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && + !ocfs2_userspace_stack(osb)) { + mlog(ML_ERROR, "Heartbeat has to be started to mount " + "a read-write clustered device.\n"); + return -EINVAL; + } + } + + return 0; +} + +/* + * If we're using a userspace stack, mount should have passed + * a name that matches the disk. If not, mount should not + * have passed a stack. 
+ */ +static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, + struct mount_options *mopt) +{ + if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { + mlog(ML_ERROR, + "cluster stack passed to mount, but this filesystem " + "does not support it\n"); + return -EINVAL; + } + + if (ocfs2_userspace_stack(osb) && + strncmp(osb->osb_cluster_stack, mopt->cluster_stack, + OCFS2_STACK_LABEL_LEN)) { + mlog(ML_ERROR, + "cluster stack passed to mount (\"%s\") does not " + "match the filesystem (\"%s\")\n", + mopt->cluster_stack, + osb->osb_cluster_stack); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) +{ + int type; + struct super_block *sb = osb->sb; + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + int status = 0; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + if (unsuspend) + status = dquot_resume(sb, type); + else { + struct ocfs2_mem_dqinfo *oinfo; + + /* Cancel periodic syncing before suspending */ + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + status = dquot_suspend(sb, type); + } + if (status < 0) + break; + } + if (status < 0) + mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " + "remount (error = %d).\n", status); + return status; +} + +static int ocfs2_enable_quotas(struct ocfs2_super *osb) +{ + struct inode *inode[MAXQUOTAS] = { NULL, NULL }; + struct super_block *sb = osb->sb; + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + int status; + int type; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + inode[type] = ocfs2_get_system_file_inode(osb, ino[type], + osb->slot_num); + if (!inode[type]) { + status = -ENOENT; + goto out_quota_off; + } + status = dquot_enable(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); + if (status < 0) + goto out_quota_off; + } + + for (type = 0; type < MAXQUOTAS; type++) + iput(inode[type]); + return 0; +out_quota_off: + ocfs2_disable_quotas(osb); + for (type = 0; type < MAXQUOTAS; type++) + iput(inode[type]); + mlog_errno(status); + return status; +} + +static void ocfs2_disable_quotas(struct ocfs2_super *osb) +{ + int type; + struct inode *inode; + struct super_block *sb = osb->sb; + struct ocfs2_mem_dqinfo *oinfo; + + /* We mostly ignore errors in this function because there's not much + * we can do when we see them */ + for (type = 0; type < MAXQUOTAS; type++) { + if (!sb_has_quota_loaded(sb, type)) + continue; + /* Cancel periodic syncing before we grab dqonoff_mutex */ + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + inode = igrab(sb->s_dquot.files[type]); + /* Turn off quotas. 
This will remove all dquot structures from + * memory and so they will be automatically synced to global + * quota files */ + dquot_disable(sb, type, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + if (!inode) + continue; + iput(inode); + } +} + +/* Handle quota on quotactl */ +static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) +{ + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + return -EINVAL; + + return dquot_enable(sb_dqopt(sb)->files[type], type, + format_id, DQUOT_LIMITS_ENABLED); +} + +/* Handle quota off quotactl */ +static int ocfs2_quota_off(struct super_block *sb, int type) +{ + return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); +} + +static const struct quotactl_ops ocfs2_quotactl_ops = { + .quota_on_meta = ocfs2_quota_on, + .quota_off = ocfs2_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; + +static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct dentry *root; + int status, sector_size; + struct mount_options parsed_options; + struct inode *inode = NULL; + struct ocfs2_super *osb = NULL; + struct buffer_head *bh = NULL; + char nodestr[8]; + struct ocfs2_blockcheck_stats stats; + + trace_ocfs2_fill_super(sb, data, silent); + + if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { + status = -EINVAL; + goto read_super_error; + } + + /* probe for superblock */ + status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); + if (status < 0) { + mlog(ML_ERROR, "superblock probe failed!\n"); + goto read_super_error; + } + + status = ocfs2_initialize_super(sb, bh, sector_size, &stats); + osb = OCFS2_SB(sb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + brelse(bh); + bh = NULL; + + if (!ocfs2_check_set_options(sb, &parsed_options)) { + status = -EINVAL; + goto read_super_error; + } + osb->s_mount_opt = parsed_options.mount_opt; + osb->s_atime_quantum = parsed_options.atime_quantum; + osb->preferred_slot = parsed_options.slot; + osb->osb_commit_interval = parsed_options.commit_interval; + + ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); + osb->osb_resv_level = parsed_options.resv_level; + osb->osb_dir_resv_level = parsed_options.resv_level; + if (parsed_options.dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options.resv_level; + else + osb->osb_dir_resv_level = parsed_options.dir_resv_level; + + status = ocfs2_verify_userspace_stack(osb, &parsed_options); + if (status) + goto read_super_error; + + sb->s_magic = OCFS2_SUPER_MAGIC; + + sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + * heartbeat=none */ + if (bdev_read_only(sb->s_bdev)) { + if (!(sb->s_flags & MS_RDONLY)) { + status = -EACCES; + mlog(ML_ERROR, "Readonly device detected but readonly " + "mount was not specified.\n"); + goto read_super_error; + } + + /* You should not be able to start a local heartbeat + * on a readonly device. 
*/ + if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + status = -EROFS; + mlog(ML_ERROR, "Local heartbeat specified on readonly " + "device.\n"); + goto read_super_error; + } + + status = ocfs2_check_journals_nolocks(osb); + if (status < 0) { + if (status == -EROFS) + mlog(ML_ERROR, "Recovery required on readonly " + "file system, but write access is " + "unavailable.\n"); + else + mlog_errno(status); + goto read_super_error; + } + + ocfs2_set_ro_flag(osb, 1); + + printk(KERN_NOTICE "Readonly device detected. No cluster " + "services will be utilized for this mount. Recovery " + "will be skipped.\n"); + } + + if (!ocfs2_is_hard_readonly(osb)) { + if (sb->s_flags & MS_RDONLY) + ocfs2_set_ro_flag(osb, 0); + } + + status = ocfs2_verify_heartbeat(osb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + + osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, + ocfs2_debugfs_root); + if (!osb->osb_debug_root) { + status = -EINVAL; + mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); + goto read_super_error; + } + + osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_osb_debug_fops); + if (!osb->osb_ctxt) { + status = -EINVAL; + mlog_errno(status); + goto read_super_error; + } + + if (ocfs2_meta_ecc(osb)) { + status = ocfs2_blockcheck_stats_debugfs_install( + &osb->osb_ecc_stats, + osb->osb_debug_root); + if (status) { + mlog(ML_ERROR, + "Unable to create blockcheck statistics " + "files\n"); + goto read_super_error; + } + } + + status = ocfs2_mount_volume(sb); + if (osb->root_inode) + inode = igrab(osb->root_inode); + + if (status < 0) + goto read_super_error; + + if (!inode) { + status = -EIO; + mlog_errno(status); + goto read_super_error; + } + + root = d_alloc_root(inode); + if (!root) { + status = -ENOMEM; + mlog_errno(status); + goto read_super_error; + } + + sb->s_root = root; + + ocfs2_complete_mount_recovery(osb); + + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); + + printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " + "with %s data mode.\n", + osb->dev_str, nodestr, osb->slot_num, + osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : + "ordered"); + + atomic_set(&osb->vol_state, VOLUME_MOUNTED); + wake_up(&osb->osb_mount_event); + + /* Now we can initialize quotas because we can afford to wait + * for cluster locks recovery now. 
That also means that truncation + * log recovery can happen but that waits for proper quota setup */ + if (!(sb->s_flags & MS_RDONLY)) { + status = ocfs2_enable_quotas(osb); + if (status < 0) { + /* We have to err-out specially here because + * s_root is already set */ + mlog_errno(status); + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + return status; + } + } + + ocfs2_complete_quota_recovery(osb); + + /* Now we wake up again for processes waiting for quotas */ + atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); + wake_up(&osb->osb_mount_event); + + /* Start this when the mount is almost sure of being successful */ + ocfs2_orphan_scan_start(osb); + + return status; + +read_super_error: + brelse(bh); + + if (inode) + iput(inode); + + if (osb) { + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + } + + if (status) + mlog_errno(status); + return status; +} + +static struct dentry *ocfs2_mount(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); +} + +static void ocfs2_kill_sb(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + /* Failed mount? */ + if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) + goto out; + + /* Prevent further queueing of inode drop events */ + spin_lock(&dentry_list_lock); + ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); + spin_unlock(&dentry_list_lock); + /* Wait for work to finish and/or remove it */ + cancel_work_sync(&osb->dentry_lock_work); +out: + kill_block_super(sb); +} + +static struct file_system_type ocfs2_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2", + .mount = ocfs2_mount, + .kill_sb = ocfs2_kill_sb, + + .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, + .next = NULL +}; + +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options) +{ + if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + mlog(ML_ERROR, "User quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + mlog(ML_ERROR, "Group quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && + !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { + mlog(ML_ERROR, "ACL support requested but extended attributes " + "feature is not enabled\n"); + return 0; + } + /* No ACL setting specified? Use XATTR feature... */ + if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | + OCFS2_MOUNT_NO_POSIX_ACL))) { + if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) + options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + else + options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + } + return 1; +} + +static int ocfs2_parse_options(struct super_block *sb, + char *options, + struct mount_options *mopt, + int is_remount) +{ + int status, user_stack = 0; + char *p; + u32 tmp; + + trace_ocfs2_parse_options(is_remount, options ? 
options : "(none)"); + + mopt->commit_interval = 0; + mopt->mount_opt = OCFS2_MOUNT_NOINTR; + mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + mopt->slot = OCFS2_INVALID_SLOT; + mopt->localalloc_opt = -1; + mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; + + if (!options) { + status = 1; + goto bail; + } + + while ((p = strsep(&options, ",")) != NULL) { + int token, option; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_hb_local: + mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; + break; + case Opt_hb_none: + mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; + break; + case Opt_hb_global: + mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; + break; + case Opt_barrier: + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + mopt->mount_opt |= OCFS2_MOUNT_BARRIER; + else + mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_nointr: + mopt->mount_opt |= OCFS2_MOUNT_NOINTR; + break; + case Opt_err_panic: + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_err_ro: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_data_ordered: + mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + break; + case Opt_data_writeback: + mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; + break; + case Opt_user_xattr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_nouser_xattr: + mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_atime_quantum: + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= 0) + mopt->atime_quantum = option; + break; + case Opt_slot: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + mopt->slot = (s16)option; + break; + case Opt_commit: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option < 0) + return 0; + if (option == 0) + option = JBD2_DEFAULT_MAX_COMMIT_AGE; + mopt->commit_interval = HZ * option; + break; + case Opt_localalloc: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= 0) + mopt->localalloc_opt = option; + break; + case Opt_localflocks: + /* + * Changing this during remount could race + * flock() requests, or "unbalance" existing + * ones (e.g., a lock is taken in one mode but + * dropped in the other). If users care enough + * to flip locking modes during remount, we + * could add a "local" flag to individual + * flock structures for proper tracking of + * state. + */ + if (!is_remount) + mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; + break; + case Opt_stack: + /* Check both that the option we were passed + * is of the right length and that it is a proper + * string of the right length. + */ + if (((args[0].to - args[0].from) != + OCFS2_STACK_LABEL_LEN) || + (strnlen(args[0].from, + OCFS2_STACK_LABEL_LEN) != + OCFS2_STACK_LABEL_LEN)) { + mlog(ML_ERROR, + "Invalid cluster_stack option\n"); + status = 0; + goto bail; + } + memcpy(mopt->cluster_stack, args[0].from, + OCFS2_STACK_LABEL_LEN); + mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have + * an osb to pass to + * ocfs2_userspace_stack(). 
+ */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + user_stack = 1; + break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; + case Opt_usrquota: + mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; + break; + case Opt_grpquota: + mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; + break; + case Opt_coherency_buffered: + mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; + break; + case Opt_coherency_full: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + break; + case Opt_acl: + mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; + break; + case Opt_noacl: + mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + break; + case Opt_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = option; + break; + case Opt_dir_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = option; + break; + default: + mlog(ML_ERROR, + "Unrecognized mount option \"%s\" " + "or missing value\n", p); + status = 0; + goto bail; + } + } + + if (user_stack == 0) { + /* Ensure only one heartbeat mode */ + tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + status = 0; + goto bail; + } + } + + status = 1; + +bail: + return status; +} + +static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); + unsigned long opts = osb->s_mount_opt; + unsigned int local_alloc_megs; + + if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { + seq_printf(s, ",_netdev"); + if (opts & OCFS2_MOUNT_HB_LOCAL) + seq_printf(s, ",%s", OCFS2_HB_LOCAL); + else + seq_printf(s, ",%s", OCFS2_HB_GLOBAL); + } else + seq_printf(s, ",%s", OCFS2_HB_NONE); + + if (opts & OCFS2_MOUNT_NOINTR) + seq_printf(s, ",nointr"); + + if (opts & OCFS2_MOUNT_DATA_WRITEBACK) + seq_printf(s, ",data=writeback"); + else + seq_printf(s, ",data=ordered"); + + if (opts & OCFS2_MOUNT_BARRIER) + seq_printf(s, ",barrier=1"); + + if (opts & OCFS2_MOUNT_ERRORS_PANIC) + seq_printf(s, ",errors=panic"); + else + seq_printf(s, ",errors=remount-ro"); + + if (osb->preferred_slot != OCFS2_INVALID_SLOT) + seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); + + if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME)) + seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); + + if (osb->osb_commit_interval) + seq_printf(s, ",commit=%u", + (unsigned) (osb->osb_commit_interval / HZ)); + + local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); + if (local_alloc_megs != ocfs2_la_default_mb(osb)) + seq_printf(s, ",localalloc=%d", local_alloc_megs); + + if (opts & OCFS2_MOUNT_LOCALFLOCKS) + seq_printf(s, ",localflocks,"); + + if (osb->osb_cluster_stack[0]) + seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, + osb->osb_cluster_stack); + if (opts & OCFS2_MOUNT_USRQUOTA) + seq_printf(s, ",usrquota"); + if (opts & OCFS2_MOUNT_GRPQUOTA) + seq_printf(s, ",grpquota"); + + if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) + seq_printf(s, ",coherency=buffered"); + else + seq_printf(s, ",coherency=full"); + + if (opts & 
OCFS2_MOUNT_NOUSERXATTR) + seq_printf(s, ",nouser_xattr"); + else + seq_printf(s, ",user_xattr"); + + if (opts & OCFS2_MOUNT_INODE64) + seq_printf(s, ",inode64"); + + if (opts & OCFS2_MOUNT_POSIX_ACL) + seq_printf(s, ",acl"); + else + seq_printf(s, ",noacl"); + + if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) + seq_printf(s, ",resv_level=%d", osb->osb_resv_level); + + if (osb->osb_dir_resv_level != osb->osb_resv_level) + seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + + return 0; +} + +static int __init ocfs2_init(void) +{ + int status; + + ocfs2_print_version(); + + status = init_ocfs2_uptodate_cache(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_initialize_mem_caches(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + if (!ocfs2_wq) { + status = -ENOMEM; + goto leave; + } + + ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); + if (!ocfs2_debugfs_root) { + status = -EFAULT; + mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); + } + + ocfs2_set_locking_protocol(); + + status = register_quota_format(&ocfs2_quota_format); +leave: + if (status < 0) { + ocfs2_free_mem_caches(); + exit_ocfs2_uptodate_cache(); + mlog_errno(status); + } + + if (status >= 0) { + return register_filesystem(&ocfs2_fs_type); + } else + return -1; +} + +static void __exit ocfs2_exit(void) +{ + if (ocfs2_wq) { + flush_workqueue(ocfs2_wq); + destroy_workqueue(ocfs2_wq); + } + + unregister_quota_format(&ocfs2_quota_format); + + debugfs_remove(ocfs2_debugfs_root); + + ocfs2_free_mem_caches(); + + unregister_filesystem(&ocfs2_fs_type); + + exit_ocfs2_uptodate_cache(); +} + +static void ocfs2_put_super(struct super_block *sb) +{ + trace_ocfs2_put_super(sb); + + ocfs2_sync_blockdev(sb); + ocfs2_dismount_volume(sb, 0); +} + +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ocfs2_super *osb; + u32 numbits, freebits; + int status; + struct ocfs2_dinode *bm_lock; + struct buffer_head *bh = NULL; + struct inode *inode = NULL; + + trace_ocfs2_statfs(dentry->d_sb, buf); + + osb = OCFS2_SB(dentry->d_sb); + + inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + mlog(ML_ERROR, "failed to get bitmap inode\n"); + status = -EIO; + goto bail; + } + + status = ocfs2_inode_lock(inode, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + bm_lock = (struct ocfs2_dinode *) bh->b_data; + + numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); + freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); + + buf->f_type = OCFS2_SUPER_MAGIC; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_namelen = OCFS2_MAX_FILENAME_LEN; + buf->f_blocks = ((sector_t) numbits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); + buf->f_bfree = ((sector_t) freebits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); + buf->f_bavail = buf->f_bfree; + buf->f_files = numbits; + buf->f_ffree = freebits; + buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) + & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, + OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; + + brelse(bh); + + ocfs2_inode_unlock(inode, 0); + status = 0; +bail: + if (inode) + iput(inode); + + if (status) + mlog_errno(status); + + return status; +} + +static void ocfs2_inode_init_once(void *data) +{ + struct ocfs2_inode_info *oi = data; + + oi->ip_flags = 0; + oi->ip_open_count = 0; + 
spin_lock_init(&oi->ip_lock); + ocfs2_extent_map_init(&oi->vfs_inode); + INIT_LIST_HEAD(&oi->ip_io_markers); + oi->ip_dir_start_lookup = 0; + + init_rwsem(&oi->ip_alloc_sem); + init_rwsem(&oi->ip_xattr_sem); + mutex_init(&oi->ip_io_mutex); + + oi->ip_blkno = 0ULL; + oi->ip_clusters = 0; + + ocfs2_resv_init_once(&oi->ip_la_data_resv); + + ocfs2_lock_res_init_once(&oi->ip_rw_lockres); + ocfs2_lock_res_init_once(&oi->ip_inode_lockres); + ocfs2_lock_res_init_once(&oi->ip_open_lockres); + + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), + &ocfs2_inode_caching_ops); + + inode_init_once(&oi->vfs_inode); +} + +static int ocfs2_initialize_mem_caches(void) +{ + ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", + sizeof(struct ocfs2_inode_info), + 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + ocfs2_inode_init_once); + ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", + sizeof(struct ocfs2_dquot), + 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + NULL); + ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", + sizeof(struct ocfs2_quota_chunk), + 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + NULL); + if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || + !ocfs2_qf_chunk_cachep) { + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); + return -ENOMEM; + } + + return 0; +} + +static void ocfs2_free_mem_caches(void) +{ + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + ocfs2_inode_cachep = NULL; + + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + ocfs2_dquot_cachep = NULL; + + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); + ocfs2_qf_chunk_cachep = NULL; +} + +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head **bh, + int block, + int sect_size) +{ + if (!sb_set_blocksize(sb, sect_size)) { + mlog(ML_ERROR, "unable to set blocksize\n"); + return -EIO; + } + + *bh = sb_getblk(sb, block); + if (!*bh) { + mlog_errno(-EIO); + return -EIO; + } + lock_buffer(*bh); + if (!buffer_dirty(*bh)) + clear_buffer_uptodate(*bh); + unlock_buffer(*bh); + ll_rw_block(READ, 1, bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + mlog_errno(-EIO); + brelse(*bh); + *bh = NULL; + return -EIO; + } + + return 0; +} + +static int ocfs2_mount_volume(struct super_block *sb) +{ + int status = 0; + int unlock_super = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (ocfs2_is_hard_readonly(osb)) + goto leave; + + status = ocfs2_dlm_init(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto leave; + } + unlock_super = 1; + + /* This will load up the node map and add ourselves to it. 
*/ + status = ocfs2_find_slot(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* load all node-local system inodes */ + status = ocfs2_init_local_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_check_volume(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_truncate_log_init(osb); + if (status < 0) + mlog_errno(status); + +leave: + if (unlock_super) + ocfs2_super_unlock(osb, 1); + + return status; +} + +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) +{ + int tmp, hangup_needed = 0; + struct ocfs2_super *osb = NULL; + char nodestr[8]; + + trace_ocfs2_dismount_volume(sb); + + BUG_ON(!sb); + osb = OCFS2_SB(sb); + BUG_ON(!osb); + + debugfs_remove(osb->osb_ctxt); + + /* + * Flush inode dropping work queue so that deletes are + * performed while the filesystem is still working + */ + ocfs2_drop_all_dl_inodes(osb); + + /* Orphan scan should be stopped as early as possible */ + ocfs2_orphan_scan_stop(osb); + + ocfs2_disable_quotas(osb); + + ocfs2_shutdown_local_alloc(osb); + + ocfs2_truncate_log_shutdown(osb); + + /* This will disable recovery and flush any recovery work. */ + ocfs2_recovery_exit(osb); + + ocfs2_journal_shutdown(osb); + + ocfs2_sync_blockdev(sb); + + ocfs2_purge_refcount_trees(osb); + + /* No cluster connection means we've failed during mount, so skip + * all the steps which depended on that to complete. */ + if (osb->cconn) { + tmp = ocfs2_super_lock(osb, 1); + if (tmp < 0) { + mlog_errno(tmp); + return; + } + } + + if (osb->slot_num != OCFS2_INVALID_SLOT) + ocfs2_put_slot(osb); + + if (osb->cconn) + ocfs2_super_unlock(osb, 1); + + ocfs2_release_system_inodes(osb); + + /* + * If we're dismounting due to mount error, mount.ocfs2 will clean + * up heartbeat. If we're a local mount, there is no heartbeat. + * If we failed before we got a uuid_str yet, we can't stop + * heartbeat. Otherwise, do it. + */ + if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) + hangup_needed = 1; + + if (osb->cconn) + ocfs2_dlm_shutdown(osb, hangup_needed); + + ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); + debugfs_remove(osb->osb_debug_root); + + if (hangup_needed) + ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); + + atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); + + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); + + printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", + osb->dev_str, nodestr); + + ocfs2_delete_osb(osb); + kfree(osb); + sb->s_dev = 0; + sb->s_fs_info = NULL; +} + +static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, + unsigned uuid_bytes) +{ + int i, ret; + char *ptr; + + BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); + + osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); + if (osb->uuid_str == NULL) + return -ENOMEM; + + for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { + /* print with null */ + ret = snprintf(ptr, 3, "%02X", uuid[i]); + if (ret != 2) /* drop super cleans up */ + return -EINVAL; + /* then only advance past the last char */ + ptr += 2; + } + + return 0; +} + +/* Make sure entire volume is addressable by our journal. Requires + osb_clusters_at_boot to be valid and for the journal to have been + initialized by ocfs2_journal_init(). 
*/ +static int ocfs2_journal_addressable(struct ocfs2_super *osb) +{ + int status = 0; + u64 max_block = + ocfs2_clusters_to_blocks(osb->sb, + osb->osb_clusters_at_boot) - 1; + + /* 32-bit block number is always OK. */ + if (max_block <= (u32)~0ULL) + goto out; + + /* Volume is "huge", so see if our journal is new enough to + support it. */ + if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_COMPAT_JBD2_SB) && + jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT))) { + mlog(ML_ERROR, "The journal cannot address the entire volume. " + "Enable the 'block64' journal option with tunefs.ocfs2"); + status = -EFBIG; + goto out; + } + + out: + return status; +} + +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size, + struct ocfs2_blockcheck_stats *stats) +{ + int status; + int i, cbits, bbits; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + struct inode *inode = NULL; + struct ocfs2_journal *journal; + __le32 uuid_net_key; + struct ocfs2_super *osb; + u64 total_blocks; + + osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); + if (!osb) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + sb->s_fs_info = osb; + sb->s_op = &ocfs2_sops; + sb->s_d_op = &ocfs2_dentry_ops; + sb->s_export_op = &ocfs2_export_ops; + sb->s_qcop = &ocfs2_quotactl_ops; + sb->dq_op = &ocfs2_quota_operations; + sb->s_xattr = ocfs2_xattr_handlers; + sb->s_time_gran = 1; + sb->s_flags |= MS_NOATIME; + /* this is needed to support O_LARGEFILE */ + cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); + bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); + + osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; + + for (i = 0; i < 3; i++) + osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); + osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); + + osb->sb = sb; + /* Save off for ocfs2_rw_direct */ + osb->s_sectsize_bits = blksize_bits(sector_size); + BUG_ON(!osb->s_sectsize_bits); + + spin_lock_init(&osb->dc_task_lock); + init_waitqueue_head(&osb->dc_event); + osb->dc_work_sequence = 0; + osb->dc_wake_sequence = 0; + INIT_LIST_HEAD(&osb->blocked_lock_list); + osb->blocked_lock_count = 0; + spin_lock_init(&osb->osb_lock); + spin_lock_init(&osb->osb_xattr_lock); + ocfs2_init_steal_slots(osb); + + atomic_set(&osb->alloc_stats.moves, 0); + atomic_set(&osb->alloc_stats.local_data, 0); + atomic_set(&osb->alloc_stats.bitmap_data, 0); + atomic_set(&osb->alloc_stats.bg_allocs, 0); + atomic_set(&osb->alloc_stats.bg_extends, 0); + + /* Copy the blockcheck stats from the superblock probe */ + osb->osb_ecc_stats = *stats; + + ocfs2_init_node_maps(osb); + + snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); + if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { + mlog(ML_ERROR, "Invalid number of node slots (%u)\n", + osb->max_slots); + status = -EINVAL; + goto bail; + } + + ocfs2_orphan_scan_init(osb); + + status = ocfs2_recovery_init(osb); + if (status) { + mlog(ML_ERROR, "Unable to initialize recovery state\n"); + mlog_errno(status); + goto bail; + } + + init_waitqueue_head(&osb->checkpoint_event); + atomic_set(&osb->needs_checkpoint, 0); + + osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; + + osb->slot_num = OCFS2_INVALID_SLOT; + + osb->s_xattr_inline_size = le16_to_cpu( + 
di->id2.i_super.s_xattr_inline_size); + + osb->local_alloc_state = OCFS2_LA_UNUSED; + osb->local_alloc_bh = NULL; + INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); + + init_waitqueue_head(&osb->osb_mount_event); + + status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); + if (status) { + mlog_errno(status); + goto bail; + } + + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); + if (!osb->vol_label) { + mlog(ML_ERROR, "unable to alloc vol label\n"); + status = -ENOMEM; + goto bail; + } + + osb->slot_recovery_generations = + kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), + GFP_KERNEL); + if (!osb->slot_recovery_generations) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + init_waitqueue_head(&osb->osb_wipe_event); + osb->osb_orphan_wipes = kcalloc(osb->max_slots, + sizeof(*osb->osb_orphan_wipes), + GFP_KERNEL); + if (!osb->osb_orphan_wipes) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + osb->osb_rf_lock_tree = RB_ROOT; + + osb->s_feature_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); + osb->s_feature_ro_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); + osb->s_feature_incompat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); + + if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount because of unsupported " + "optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + if (!(osb->sb->s_flags & MS_RDONLY) && + (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount RDWR because of " + "unsupported optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + + if (ocfs2_clusterinfo_valid(osb)) { + osb->osb_stackflags = + OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; + memcpy(osb->osb_cluster_stack, + OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, + OCFS2_STACK_LABEL_LEN); + osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { + mlog(ML_ERROR, + "couldn't mount because of an invalid " + "cluster stack label (%s) \n", + osb->osb_cluster_stack); + status = -EINVAL; + goto bail; + } + } else { + /* The empty string is identical with classic tools that + * don't know about s_cluster_info. */ + osb->osb_cluster_stack[0] = '\0'; + } + + get_random_bytes(&osb->s_next_generation, sizeof(u32)); + + /* FIXME + * This should be done in ocfs2_journal_init(), but unknown + * ordering issues will cause the filesystem to crash. + * If anyone wants to figure out what part of the code + * refers to osb->journal before ocfs2_journal_init() is run, + * be my guest. 
+ */ + /* initialize our journal structure */ + + journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto bail; + } + osb->journal = journal; + journal->j_osb = osb; + + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = (unsigned long) 1; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); + journal->j_state = OCFS2_JOURNAL_FREE; + + INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); + osb->dentry_lock_list = NULL; + + /* get some pseudo constants for clustersize bits */ + osb->s_clustersize_bits = + le32_to_cpu(di->id2.i_super.s_clustersize_bits); + osb->s_clustersize = 1 << osb->s_clustersize_bits; + + if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || + osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { + mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", + osb->s_clustersize); + status = -EINVAL; + goto bail; + } + + total_blocks = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(di->i_clusters)); + + status = generic_check_addressable(osb->sb->s_blocksize_bits, + total_blocks); + if (status) { + mlog(ML_ERROR, "Volume too large " + "to mount safely on this system"); + status = -EFBIG; + goto bail; + } + + if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid))) { + mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); + status = -ENOMEM; + goto bail; + } + + memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); + + strncpy(osb->vol_label, di->id2.i_super.s_label, 63); + osb->vol_label[63] = '\0'; + osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); + osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); + osb->first_cluster_group_blkno = + le64_to_cpu(di->id2.i_super.s_first_cluster_group); + osb->fs_generation = le32_to_cpu(di->i_fs_generation); + osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); + trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str, + (unsigned long long)osb->root_blkno, + (unsigned long long)osb->system_dir_blkno, + osb->s_clustersize_bits); + + osb->osb_dlm_debug = ocfs2_new_dlm_debug(); + if (!osb->osb_dlm_debug) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + atomic_set(&osb->vol_state, VOLUME_INIT); + + /* load root, system_dir, and all global system inodes */ + status = ocfs2_init_global_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* + * global bitmap + */ + inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; + osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; + iput(inode); + + osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, + osb->s_feature_incompat) * 8; + + status = ocfs2_init_slot_info(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + cleancache_init_shared_fs((char *)&uuid_net_key, sb); + +bail: + return status; +} + +/* + * will return: -EAGAIN if it is ok to keep searching for superblocks + * -EINVAL if there is a bad superblock + * 0 on success + */ +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 blksz, + struct ocfs2_blockcheck_stats *stats) +{ + int status = -EAGAIN; + + if 
(memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, + strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { + /* We have to do a raw check of the feature here */ + if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & + OCFS2_FEATURE_INCOMPAT_META_ECC) { + status = ocfs2_block_check_validate(bh->b_data, + bh->b_size, + &di->i_check, + stats); + if (status) + goto out; + } + status = -EINVAL; + if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), + blksz); + } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != + OCFS2_MAJOR_REV_LEVEL || + le16_to_cpu(di->id2.i_super.s_minor_rev_level) != + OCFS2_MINOR_REV_LEVEL) { + mlog(ML_ERROR, "found superblock with bad version: " + "found %u.%u, should be %u.%u\n", + le16_to_cpu(di->id2.i_super.s_major_rev_level), + le16_to_cpu(di->id2.i_super.s_minor_rev_level), + OCFS2_MAJOR_REV_LEVEL, + OCFS2_MINOR_REV_LEVEL); + } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { + mlog(ML_ERROR, "bad block number on superblock: " + "found %llu, should be %llu\n", + (unsigned long long)le64_to_cpu(di->i_blkno), + (unsigned long long)bh->b_blocknr); + } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || + le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { + mlog(ML_ERROR, "bad cluster size found: %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); + } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { + mlog(ML_ERROR, "bad root_blkno: 0\n"); + } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { + mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); + } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { + mlog(ML_ERROR, + "Superblock slots found greater than file system " + "maximum: found %u, max %u\n", + le16_to_cpu(di->id2.i_super.s_max_slots), + OCFS2_MAX_SLOTS); + } else { + /* found it! */ + status = 0; + } + } + +out: + if (status && status != -EAGAIN) + mlog_errno(status); + return status; +} + +static int ocfs2_check_volume(struct ocfs2_super *osb) +{ + int status; + int dirty; + int local; + struct ocfs2_dinode *local_alloc = NULL; /* only used if we + * recover + * ourselves. */ + + /* Init our journal object. */ + status = ocfs2_journal_init(osb->journal, &dirty); + if (status < 0) { + mlog(ML_ERROR, "Could not initialize journal!\n"); + goto finally; + } + + /* Now that journal has been initialized, check to make sure + entire volume is addressable. */ + status = ocfs2_journal_addressable(osb); + if (status) + goto finally; + + /* If the journal was unmounted cleanly then we don't want to + * recover anything. Otherwise, journal_load will do that + * dirty work for us :) */ + if (!dirty) { + status = ocfs2_journal_wipe(osb->journal, 0); + if (status < 0) { + mlog_errno(status); + goto finally; + } + } else { + mlog(ML_NOTICE, "File system was not unmounted cleanly, " + "recovering volume.\n"); + } + + local = ocfs2_mount_local(osb); + + /* will play back anything left in the journal. */ + status = ocfs2_journal_load(osb->journal, local, dirty); + if (status < 0) { + mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); + goto finally; + } + + if (dirty) { + /* recover my local alloc if we didn't unmount cleanly. 
*/ + status = ocfs2_begin_local_alloc_recovery(osb, + osb->slot_num, + &local_alloc); + if (status < 0) { + mlog_errno(status); + goto finally; + } + /* we complete the recovery process after we've marked + * ourselves as mounted. */ + } + + status = ocfs2_load_local_alloc(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + if (dirty) { + /* Recovery will be completed after we've mounted the + * rest of the volume. */ + osb->dirty = 1; + osb->local_alloc_copy = local_alloc; + local_alloc = NULL; + } + + /* go through each journal, trylock it and if you get the + * lock, and it's marked as dirty, set the bit in the recover + * map and launch a recovery thread for it. */ + status = ocfs2_mark_dead_nodes(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + status = ocfs2_compute_replay_slots(osb); + if (status < 0) + mlog_errno(status); + +finally: + if (local_alloc) + kfree(local_alloc); + + if (status) + mlog_errno(status); + return status; +} + +/* + * The routine gets called from dismount or close whenever a dismount on + * volume is requested and the osb open count becomes 1. + * It will remove the osb from the global list and also free up all the + * initialized resources and fileobject. + */ +static void ocfs2_delete_osb(struct ocfs2_super *osb) +{ + /* This function assumes that the caller has the main osb resource */ + + ocfs2_free_slot_info(osb); + + kfree(osb->osb_orphan_wipes); + kfree(osb->slot_recovery_generations); + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal at the start of ocfs2_initialize_osb(), + * we free it here. + */ + kfree(osb->journal); + if (osb->local_alloc_copy) + kfree(osb->local_alloc_copy); + kfree(osb->uuid_str); + ocfs2_put_dlm_debug(osb->osb_dlm_debug); + memset(osb, 0, sizeof(struct ocfs2_super)); +} + +/* Put OCFS2 into a readonly state, or (if the user specifies it), + * panic(). We do not support continue-on-error operation. */ +static void ocfs2_handle_error(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + + ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return; + + printk(KERN_CRIT "File system is now read-only due to the potential " + "of on-disk corruption. Please run fsck.ocfs2 once the file " + "system is unmounted.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); +} + +static char error_buf[1024]; + +void __ocfs2_error(struct super_block *sb, + const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + + /* Not using mlog here because we want to show the actual + * function the error came from. */ + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + ocfs2_handle_error(sb); +} + +/* Handle critical errors. This is intentionally more drastic than + * ocfs2_handle_error, so we only use for things like journal errors, + * etc. */ +void __ocfs2_abort(struct super_block* sb, + const char *function, + const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + /* We don't have the cluster support yet to go straight to + * hard readonly in here. Until then, we want to keep + * ocfs2_abort() so that we can at least mark critical + * errors. + * + * TODO: This should abort the journal and alert other nodes + * that our slot needs recovery. */ + + /* Force a panic(). This stinks, but it's better than letting + * things continue without having a proper hard readonly + * here. */ + if (!ocfs2_mount_local(OCFS2_SB(sb))) + OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + ocfs2_handle_error(sb); +} + +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. + */ +void ocfs2_block_signals(sigset_t *oldset) +{ + int rc; + sigset_t blocked; + + sigfillset(&blocked); + rc = sigprocmask(SIG_BLOCK, &blocked, oldset); + BUG_ON(rc); +} + +void ocfs2_unblock_signals(sigset_t *oldset) +{ + int rc = sigprocmask(SIG_SETMASK, oldset, NULL); + BUG_ON(rc); +} + +module_init(ocfs2_init); +module_exit(ocfs2_exit); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h new file mode 100644 index 00000000..40c7de08 --- /dev/null +++ b/fs/ocfs2/super.h @@ -0,0 +1,55 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SUPER_H +#define OCFS2_SUPER_H + +extern struct workqueue_struct *ocfs2_wq; + +int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, + int node_num); + +void __ocfs2_error(struct super_block *sb, + const char *function, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) + +void __ocfs2_abort(struct super_block *sb, + const char *function, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) + +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. 
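+ *
+ * A minimal usage sketch (illustrative only): callers pair the two
+ * helpers around a region that must not be interrupted by signal
+ * delivery, stashing the old mask on the stack:
+ *
+ *	sigset_t oldset;
+ *
+ *	ocfs2_block_signals(&oldset);
+ *	... code that must not be interrupted ...
+ *	ocfs2_unblock_signals(&oldset);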
+ */ +void ocfs2_block_signals(sigset_t *oldset); +void ocfs2_unblock_signals(sigset_t *oldset); + +#endif /* OCFS2_SUPER_H */ diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c new file mode 100644 index 00000000..5d22872e --- /dev/null +++ b/fs/ocfs2/symlink.c @@ -0,0 +1,173 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * linux/cluster/ssi/cfs/symlink.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE + * or NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net + * + * Copyright (C) 1992 Rick Sladkey + * + * Optimization changes Copyright (C) 1994 Florian La Roche + * + * Jun 7 1999, cache symlink lookups in the page cache. -DaveM + * + * Portions Copyright (C) 2001 Compaq Computer Corporation + * + * ocfs2 symlink handling code. + * + * Copyright (C) 2004, 2005 Oracle. + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "symlink.h" +#include "xattr.h" + +#include "buffer_head_io.h" + + +static char *ocfs2_fast_symlink_getlink(struct inode *inode, + struct buffer_head **bh) +{ + int status; + char *link = NULL; + struct ocfs2_dinode *fe; + + status = ocfs2_read_inode_block(inode, bh); + if (status < 0) { + mlog_errno(status); + link = ERR_PTR(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) (*bh)->b_data; + link = (char *) fe->id2.i_symlink; +bail: + + return link; +} + +static int ocfs2_readlink(struct dentry *dentry, + char __user *buffer, + int buflen) +{ + int ret; + char *link; + struct buffer_head *bh = NULL; + struct inode *inode = dentry->d_inode; + + link = ocfs2_fast_symlink_getlink(inode, &bh); + if (IS_ERR(link)) { + ret = PTR_ERR(link); + goto out; + } + + /* + * Without vfsmount we can't update atime now, + * but we will update atime here ultimately. + */ + ret = vfs_readlink(dentry, buffer, buflen, link); + + brelse(bh); +out: + if (ret < 0) + mlog_errno(ret); + return ret; +} + +static void *ocfs2_fast_follow_link(struct dentry *dentry, + struct nameidata *nd) +{ + int status = 0; + int len; + char *target, *link = ERR_PTR(-ENOMEM); + struct inode *inode = dentry->d_inode; + struct buffer_head *bh = NULL; + + BUG_ON(!ocfs2_inode_is_fast_symlink(inode)); + target = ocfs2_fast_symlink_getlink(inode, &bh); + if (IS_ERR(target)) { + status = PTR_ERR(target); + mlog_errno(status); + goto bail; + } + + /* Fast symlinks can't be large */ + len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb)); + link = kzalloc(len + 1, GFP_NOFS); + if (!link) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + memcpy(link, target, len); + +bail: + nd_set_link(nd, status ? 
ERR_PTR(status) : link); + brelse(bh); + + if (status) + mlog_errno(status); + return NULL; +} + +static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + char *link = nd_get_link(nd); + if (!IS_ERR(link)) + kfree(link); +} + +const struct inode_operations ocfs2_symlink_inode_operations = { + .readlink = page_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = ocfs2_getattr, + .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, +}; +const struct inode_operations ocfs2_fast_symlink_inode_operations = { + .readlink = ocfs2_readlink, + .follow_link = ocfs2_fast_follow_link, + .put_link = ocfs2_fast_put_link, + .getattr = ocfs2_getattr, + .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, +}; diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h new file mode 100644 index 00000000..65a6c9c6 --- /dev/null +++ b/fs/ocfs2/symlink.h @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SYMLINK_H +#define OCFS2_SYMLINK_H + +extern const struct inode_operations ocfs2_symlink_inode_operations; +extern const struct inode_operations ocfs2_fast_symlink_inode_operations; + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ocfs2_inode_is_fast_symlink(struct inode *inode) +{ + return (S_ISLNK(inode->i_mode) && + inode->i_blocks == 0); +} + + +#endif /* OCFS2_SYMLINK_H */ diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c new file mode 100644 index 00000000..3d635f4b --- /dev/null +++ b/fs/ocfs2/sysfile.c @@ -0,0 +1,180 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.c + * + * Initialize, read, write, etc. system files. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "inode.h" +#include "journal.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; +#endif + +static inline int is_global_system_inode(int type) +{ + return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && + type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; +} + +static struct inode **get_local_system_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + int index; + struct inode **local_system_inodes, **free = NULL; + + BUG_ON(slot == OCFS2_INVALID_SLOT); + BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE || + type > OCFS2_LAST_LOCAL_SYSTEM_INODE); + + spin_lock(&osb->osb_lock); + local_system_inodes = osb->local_system_inodes; + spin_unlock(&osb->osb_lock); + + if (unlikely(!local_system_inodes)) { + local_system_inodes = kzalloc(sizeof(struct inode *) * + NUM_LOCAL_SYSTEM_INODES * + osb->max_slots, + GFP_NOFS); + if (!local_system_inodes) { + mlog_errno(-ENOMEM); + /* + * return NULL here so that ocfs2_get_sytem_file_inodes + * will try to create an inode and use it. We will try + * to initialize local_system_inodes next time. + */ + return NULL; + } + + spin_lock(&osb->osb_lock); + if (osb->local_system_inodes) { + /* Someone has initialized it for us. */ + free = local_system_inodes; + local_system_inodes = osb->local_system_inodes; + } else + osb->local_system_inodes = local_system_inodes; + spin_unlock(&osb->osb_lock); + if (unlikely(free)) + kfree(free); + } + + index = (slot * NUM_LOCAL_SYSTEM_INODES) + + (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE); + + return &local_system_inodes[index]; +} + +struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + struct inode *inode = NULL; + struct inode **arr = NULL; + + /* avoid the lookup if cached in local system file array */ + if (is_global_system_inode(type)) { + arr = &(osb->global_system_inodes[type]); + } else + arr = get_local_system_inode(osb, type, slot); + + if (arr && ((inode = *arr) != NULL)) { + /* get a ref in addition to the array ref */ + inode = igrab(inode); + BUG_ON(!inode); + + return inode; + } + + /* this gets one ref thru iget */ + inode = _ocfs2_get_system_file_inode(osb, type, slot); + + /* add one more if putting into array for first time */ + if (arr && inode) { + *arr = igrab(inode); + BUG_ON(!*arr); + } + return inode; +} + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + char namebuf[40]; + struct inode *inode = NULL; + u64 blkno; + int status = 0; + + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); + + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, + strlen(namebuf), &blkno); + if (status < 0) { + goto bail; + } + + inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type); + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + inode = NULL; + goto bail; + } +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (type == LOCAL_USER_QUOTA_SYSTEM_INODE || + type == LOCAL_GROUP_QUOTA_SYSTEM_INODE || + type == JOURNAL_SYSTEM_INODE) { + /* Ignore inode 
lock on these inodes as the lock does not + * really belong to any process and lockdep cannot handle + * that */ + OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL; + } else { + lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres. + l_lockdep_map, + ocfs2_system_inodes[type].si_name, + &ocfs2_sysfile_cluster_lock_key[type], 0); + } +#endif +bail: + + return inode; +} + diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h new file mode 100644 index 00000000..cc9ea661 --- /dev/null +++ b/fs/ocfs2/sysfile.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SYSFILE_H +#define OCFS2_SYSFILE_H + +struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +#endif /* OCFS2_SYSFILE_H */ diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c new file mode 100644 index 00000000..52eaf33d --- /dev/null +++ b/fs/ocfs2/uptodate.c @@ -0,0 +1,638 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.c + * + * Tracking the up-to-date-ness of a local buffer_head with respect to + * the cluster. + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Standard buffer head caching flags (uptodate, etc) are insufficient + * in a clustered environment - a buffer may be marked up to date on + * our local node but could have been modified by another cluster + * member. As a result an additional (and performant) caching scheme + * is required. A further requirement is that we consume as little + * memory as possible - we never pin buffer_head structures in order + * to cache them. + * + * We track the existence of up to date buffers on the inodes which + * are associated with them. 
Because we don't want to pin + * buffer_heads, this is only a (strong) hint and several other checks + * are made in the I/O path to ensure that we don't use a stale or + * invalid buffer without going to disk: + * - buffer_jbd is used liberally - if a bh is in the journal on + * this node then it *must* be up to date. + * - the standard buffer_uptodate() macro is used to detect buffers + * which may be invalid (even if we have an up to date tracking + * item for them) + * + * For a full understanding of how this code works together, one + * should read the callers in dlmglue.c, the I/O functions in + * buffer_head_io.c and ocfs2_journal_access in journal.c + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "inode.h" +#include "uptodate.h" +#include "ocfs2_trace.h" + +struct ocfs2_meta_cache_item { + struct rb_node c_node; + sector_t c_block; +}; + +static struct kmem_cache *ocfs2_uptodate_cachep = NULL; + +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + return ci->ci_ops->co_owner(ci); +} + +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + return ci->ci_ops->co_get_super(ci); +} + +static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_lock(ci); +} + +static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_unlock(ci); +} + +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_lock(ci); +} + +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_unlock(ci); +} + + +static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci, + int clear) +{ + ci->ci_flags |= OCFS2_CACHE_FL_INLINE; + ci->ci_num_cached = 0; + + if (clear) { + ci->ci_created_trans = 0; + ci->ci_last_trans = 0; + } +} + +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops) +{ + BUG_ON(!ops); + + ci->ci_ops = ops; + ocfs2_metadata_cache_reset(ci, 1); +} + +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci) +{ + ocfs2_metadata_cache_purge(ci); + ocfs2_metadata_cache_reset(ci, 1); +} + + +/* No lock taken here as 'root' is not expected to be visible to other + * processes. */ +static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) +{ + unsigned int purged = 0; + struct rb_node *node; + struct ocfs2_meta_cache_item *item; + + while ((node = rb_last(root)) != NULL) { + item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); + + trace_ocfs2_purge_copied_metadata_tree( + (unsigned long long) item->c_block); + + rb_erase(&item->c_node, root); + kmem_cache_free(ocfs2_uptodate_cachep, item); + + purged++; + } + return purged; +} + +/* Called from locking and called from ocfs2_clear_inode. Dump the + * cache for a given inode. 
+ * + * This function is a few more lines longer than necessary due to some + * accounting done here, but I think it's worth tracking down those + * bugs sooner -- Mark */ +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci) +{ + unsigned int tree, to_purge, purged; + struct rb_root root = RB_ROOT; + + BUG_ON(!ci || !ci->ci_ops); + + ocfs2_metadata_cache_lock(ci); + tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE); + to_purge = ci->ci_num_cached; + + trace_ocfs2_metadata_cache_purge( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + to_purge, tree); + + /* If we're a tree, save off the root so that we can safely + * initialize the cache. We do the work to free tree members + * without the spinlock. */ + if (tree) + root = ci->ci_cache.ci_tree; + + ocfs2_metadata_cache_reset(ci, 0); + ocfs2_metadata_cache_unlock(ci); + + purged = ocfs2_purge_copied_metadata_tree(&root); + /* If possible, track the number wiped so that we can more + * easily detect counting errors. Unfortunately, this is only + * meaningful for trees. */ + if (tree && purged != to_purge) + mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + to_purge, purged); +} + +/* Returns the index in the cache array, -1 if not found. + * Requires ip_lock. */ +static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci, + sector_t item) +{ + int i; + + for (i = 0; i < ci->ci_num_cached; i++) { + if (item == ci->ci_cache.ci_array[i]) + return i; + } + + return -1; +} + +/* Returns the cache item if found, otherwise NULL. + * Requires ip_lock. */ +static struct ocfs2_meta_cache_item * +ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, + sector_t block) +{ + struct rb_node * n = ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *item = NULL; + + while (n) { + item = rb_entry(n, struct ocfs2_meta_cache_item, c_node); + + if (block < item->c_block) + n = n->rb_left; + else if (block > item->c_block) + n = n->rb_right; + else + return item; + } + + return NULL; +} + +static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + int index = -1; + struct ocfs2_meta_cache_item *item = NULL; + + ocfs2_metadata_cache_lock(ci); + + trace_ocfs2_buffer_cached_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long) bh->b_blocknr, + !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE)); + + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) + index = ocfs2_search_cache_array(ci, bh->b_blocknr); + else + item = ocfs2_search_cache_tree(ci, bh->b_blocknr); + + ocfs2_metadata_cache_unlock(ci); + + trace_ocfs2_buffer_cached_end(index, item); + + return (index != -1) || (item != NULL); +} + +/* Warning: even if it returns true, this does *not* guarantee that + * the block is stored in our inode metadata cache. + * + * This can be called under lock_buffer() + */ +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + /* Doesn't matter if the bh is in our cache or not -- if it's + * not marked uptodate then we know it can't have correct + * data. */ + if (!buffer_uptodate(bh)) + return 0; + + /* OCFS2 does not allow multiple nodes to be changing the same + * block at the same time. */ + if (buffer_jbd(bh)) + return 1; + + /* Ok, locally the buffer is marked as up to date, now search + * our cache to see if we can trust that. */ + return ocfs2_buffer_cached(ci, bh); +} + +/* + * Determine whether a buffer is currently out on a read-ahead request. 
+ * ci_io_sem should be held to serialize submitters with the logic here. + */ +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh); +} + +/* Requires ip_lock */ +static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, + sector_t block) +{ + BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY); + + trace_ocfs2_append_cache_array( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, ci->ci_num_cached); + + ci->ci_cache.ci_array[ci->ci_num_cached] = block; + ci->ci_num_cached++; +} + +/* By now the caller should have checked that the item does *not* + * exist in the tree. + * Requires ip_lock. */ +static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *new) +{ + sector_t block = new->c_block; + struct rb_node *parent = NULL; + struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *tmp; + + trace_ocfs2_insert_cache_tree( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, ci->ci_num_cached); + + while(*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node); + + if (block < tmp->c_block) + p = &(*p)->rb_left; + else if (block > tmp->c_block) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate block %llu cached!\n", + (unsigned long long) block); + BUG(); + } + } + + rb_link_node(&new->c_node, parent, p); + rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached++; +} + +/* co_cache_lock() must be held */ +static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci) +{ + return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) && + (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY); +} + +/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the + * pointers in tree after we use them - this allows caller to detect + * when to free in case of error. + * + * The co_cache_lock() must be held. */ +static void ocfs2_expand_cache(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item **tree) +{ + int i; + + mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY, + "Owner %llu, num cached = %u, should be %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY); + mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE), + "Owner %llu not marked as inline anymore!\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci)); + + /* Be careful to initialize the tree members *first* because + * once the ci_tree is used, the array is junk... */ + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) + tree[i]->c_block = ci->ci_cache.ci_array[i]; + + ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE; + ci->ci_cache.ci_tree = RB_ROOT; + /* this will be set again by __ocfs2_insert_cache_tree */ + ci->ci_num_cached = 0; + + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { + __ocfs2_insert_cache_tree(ci, tree[i]); + tree[i] = NULL; + } + + trace_ocfs2_expand_cache( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_flags, ci->ci_num_cached); +} + +/* Slow path function - memory allocation is necessary. See the + * comment above ocfs2_set_buffer_uptodate for more information. 
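+ *
+ * In short: the new tracking item (and, when the inline array must be
+ * promoted to a tree, one item per existing array slot) is allocated
+ * before re-taking the metadata cache lock; if other removals freed up
+ * array space in the meantime, we fall back to the fast array path and
+ * free the unused allocations.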
*/ +static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, + sector_t block, + int expand_tree) +{ + int i; + struct ocfs2_meta_cache_item *new = NULL; + struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] = + { NULL, }; + + trace_ocfs2_set_buffer_uptodate( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, expand_tree); + + new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); + if (!new) { + mlog_errno(-ENOMEM); + return; + } + new->c_block = block; + + if (expand_tree) { + /* Do *not* allocate an array here - the removal code + * has no way of tracking that. */ + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { + tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, + GFP_NOFS); + if (!tree[i]) { + mlog_errno(-ENOMEM); + goto out_free; + } + + /* These are initialized in ocfs2_expand_cache! */ + } + } + + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { + /* Ok, items were removed from the cache in between + * locks. Detect this and revert back to the fast path */ + ocfs2_append_cache_array(ci, block); + ocfs2_metadata_cache_unlock(ci); + goto out_free; + } + + if (expand_tree) + ocfs2_expand_cache(ci, tree); + + __ocfs2_insert_cache_tree(ci, new); + ocfs2_metadata_cache_unlock(ci); + + new = NULL; +out_free: + if (new) + kmem_cache_free(ocfs2_uptodate_cachep, new); + + /* If these were used, then ocfs2_expand_cache re-set them to + * NULL for us. */ + if (tree[0]) { + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) + if (tree[i]) + kmem_cache_free(ocfs2_uptodate_cachep, + tree[i]); + } +} + +/* Item insertion is guarded by co_io_lock(), so the insertion path takes + * advantage of this by not rechecking for a duplicate insert during + * the slow case. Additionally, if the cache needs to be bumped up to + * a tree, the code will not recheck after acquiring the lock -- + * multiple paths cannot be expanding to a tree at the same time. + * + * The slow path takes into account that items can be removed + * (including the whole tree wiped and reset) when this process it out + * allocating memory. In those cases, it reverts back to the fast + * path. + * + * Note that this function may actually fail to insert the block if + * memory cannot be allocated. This is not fatal however (but may + * result in a performance penalty) + * + * Readahead buffers can be passed in here before the I/O request is + * completed. + */ +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + int expand; + + /* The block may very well exist in our cache already, so avoid + * doing any more work in that case. */ + if (ocfs2_buffer_cached(ci, bh)) + return; + + trace_ocfs2_set_buffer_uptodate_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr); + + /* No need to recheck under spinlock - insertion is guarded by + * co_io_lock() */ + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { + /* Fast case - it's an array and there's a free + * spot. */ + ocfs2_append_cache_array(ci, bh->b_blocknr); + ocfs2_metadata_cache_unlock(ci); + return; + } + + expand = 0; + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { + /* We need to bump things up to a tree. */ + expand = 1; + } + ocfs2_metadata_cache_unlock(ci); + + __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand); +} + +/* Called against a newly allocated buffer. 
Most likely nobody should + * be able to read this sort of metadata while it's still being + * allocated, but this is careful to take co_io_lock() anyway. */ +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + /* This should definitely *not* exist in our cache */ + BUG_ON(ocfs2_buffer_cached(ci, bh)); + + set_buffer_uptodate(bh); + + ocfs2_metadata_cache_io_lock(ci); + ocfs2_set_buffer_uptodate(ci, bh); + ocfs2_metadata_cache_io_unlock(ci); +} + +/* Requires ip_lock. */ +static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, + int index) +{ + sector_t *array = ci->ci_cache.ci_array; + int bytes; + + BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY); + BUG_ON(index >= ci->ci_num_cached); + BUG_ON(!ci->ci_num_cached); + + trace_ocfs2_remove_metadata_array( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + index, ci->ci_num_cached); + + ci->ci_num_cached--; + + /* don't need to copy if the array is now empty, or if we + * removed at the tail */ + if (ci->ci_num_cached && index < ci->ci_num_cached) { + bytes = sizeof(sector_t) * (ci->ci_num_cached - index); + memmove(&array[index], &array[index + 1], bytes); + } +} + +/* Requires ip_lock. */ +static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *item) +{ + trace_ocfs2_remove_metadata_tree( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)item->c_block); + + rb_erase(&item->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached--; +} + +static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci, + sector_t block) +{ + int index; + struct ocfs2_meta_cache_item *item = NULL; + + ocfs2_metadata_cache_lock(ci); + trace_ocfs2_remove_block_from_cache( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long) block, ci->ci_num_cached, + ci->ci_flags); + + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { + index = ocfs2_search_cache_array(ci, block); + if (index != -1) + ocfs2_remove_metadata_array(ci, index); + } else { + item = ocfs2_search_cache_tree(ci, block); + if (item) + ocfs2_remove_metadata_tree(ci, item); + } + ocfs2_metadata_cache_unlock(ci); + + if (item) + kmem_cache_free(ocfs2_uptodate_cachep, item); +} + +/* + * Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. + */ +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + sector_t block = bh->b_blocknr; + + ocfs2_remove_block_from_cache(ci, block); +} + +/* Called when we remove xattr clusters from an inode. 
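+ *
+ * The range is given in clusters and converted to blocks below; for
+ * example, with 4 KB clusters and 512-byte blocks,
+ * ocfs2_clusters_to_blocks(sb, 1) is 8, so removing c_len clusters
+ * drops 8 * c_len consecutive block numbers, starting at 'block',
+ * from the cache.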
*/ +void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci, + sector_t block, + u32 c_len) +{ + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len; + + for (i = 0; i < b_len; i++, block++) + ocfs2_remove_block_from_cache(ci, block); +} + +int __init init_ocfs2_uptodate_cache(void) +{ + ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", + sizeof(struct ocfs2_meta_cache_item), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ocfs2_uptodate_cachep) + return -ENOMEM; + + return 0; +} + +void exit_ocfs2_uptodate_cache(void) +{ + if (ocfs2_uptodate_cachep) + kmem_cache_destroy(ocfs2_uptodate_cachep); +} diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h new file mode 100644 index 00000000..0d826fe2 --- /dev/null +++ b/fs/ocfs2/uptodate.h @@ -0,0 +1,84 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.h + * + * Cluster uptodate tracking + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_UPTODATE_H +#define OCFS2_UPTODATE_H + +/* + * The caching code relies on locking provided by the user of + * struct ocfs2_caching_info. These operations connect that up. + */ +struct ocfs2_caching_operations { + /* + * A u64 representing the owning structure. Usually this + * is the block number (i_blkno or whatnot). This is used so + * that caching log messages can identify the owning structure. + */ + u64 (*co_owner)(struct ocfs2_caching_info *ci); + + /* The superblock is needed during I/O. */ + struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci); + /* + * Lock and unlock the caching data. These will not sleep, and + * should probably be spinlocks. + */ + void (*co_cache_lock)(struct ocfs2_caching_info *ci); + void (*co_cache_unlock)(struct ocfs2_caching_info *ci); + + /* + * Lock and unlock for disk I/O. These will sleep, and should + * be mutexes. 
+ */ + void (*co_io_lock)(struct ocfs2_caching_info *ci); + void (*co_io_unlock)(struct ocfs2_caching_info *ci); +}; + +int __init init_ocfs2_uptodate_cache(void); +void exit_ocfs2_uptodate_cache(void); + +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops); +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci); + +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci); + +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, + struct buffer_head *bh); +void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci, + sector_t block, + u32 c_len); +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, + struct buffer_head *bh); + +#endif /* OCFS2_UPTODATE_H */ diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c new file mode 100644 index 00000000..e2488f41 --- /dev/null +++ b/fs/ocfs2/ver.c @@ -0,0 +1,43 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include "ver.h" + +#define OCFS2_BUILD_VERSION "1.5.0" + +#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION + +void ocfs2_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h new file mode 100644 index 00000000..d7395cb9 --- /dev/null +++ b/fs/ocfs2/ver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_VER_H +#define OCFS2_VER_H + +void ocfs2_print_version(void); + +#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c new file mode 100644 index 00000000..81ecf9c0 --- /dev/null +++ b/fs/ocfs2/xattr.c @@ -0,0 +1,7388 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.c + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is copy from linux/fs/ext3/xattr.c. + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ocfs2.h" +#include "alloc.h" +#include "blockcheck.h" +#include "dlmglue.h" +#include "file.h" +#include "symlink.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" +#include "suballoc.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "super.h" +#include "xattr.h" +#include "refcounttree.h" +#include "acl.h" +#include "ocfs2_trace.h" + +struct ocfs2_xattr_def_value_root { + struct ocfs2_xattr_value_root xv; + struct ocfs2_extent_rec er; +}; + +struct ocfs2_xattr_bucket { + /* The inode these xattrs are associated with */ + struct inode *bu_inode; + + /* The actual buffers that make up the bucket */ + struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; + + /* How many blocks make up one bucket for this filesystem */ + int bu_blocks; +}; + +struct ocfs2_xattr_set_ctxt { + handle_t *handle; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + int set_abort; +}; + +#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) +#define OCFS2_XATTR_INLINE_SIZE 80 +#define OCFS2_XATTR_HEADER_GAP 4 +#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ + - sizeof(struct ocfs2_xattr_header) \ + - OCFS2_XATTR_HEADER_GAP) +#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ + - sizeof(struct ocfs2_xattr_block) \ + - sizeof(struct ocfs2_xattr_header) \ + - OCFS2_XATTR_HEADER_GAP) + +static struct ocfs2_xattr_def_value_root def_xv = { + .xv.xr_list.l_count = cpu_to_le16(1), +}; + +const struct xattr_handler *ocfs2_xattr_handlers[] = { + &ocfs2_xattr_user_handler, + &ocfs2_xattr_acl_access_handler, + &ocfs2_xattr_acl_default_handler, + &ocfs2_xattr_trusted_handler, + &ocfs2_xattr_security_handler, + NULL +}; + +static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] + = &ocfs2_xattr_acl_access_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] + = &ocfs2_xattr_acl_default_handler, + [OCFS2_XATTR_INDEX_TRUSTED] = 
&ocfs2_xattr_trusted_handler, + [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, +}; + +struct ocfs2_xattr_info { + int xi_name_index; + const char *xi_name; + int xi_name_len; + const void *xi_value; + size_t xi_value_len; +}; + +struct ocfs2_xattr_search { + struct buffer_head *inode_bh; + /* + * xattr_bh point to the block buffer head which has extended attribute + * when extended attribute in inode, xattr_bh is equal to inode_bh. + */ + struct buffer_head *xattr_bh; + struct ocfs2_xattr_header *header; + struct ocfs2_xattr_bucket *bucket; + void *base; + void *end; + struct ocfs2_xattr_entry *here; + int not_found; +}; + +/* Operations on struct ocfs2_xa_entry */ +struct ocfs2_xa_loc; +struct ocfs2_xa_loc_operations { + /* + * Journal functions + */ + int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc, + int type); + void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc); + + /* + * Return a pointer to the appropriate buffer in loc->xl_storage + * at the given offset from loc->xl_header. + */ + void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset); + + /* Can we reuse the existing entry for the new value? */ + int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi); + + /* How much space is needed for the new value? */ + int (*xlo_check_space)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi); + + /* + * Return the offset of the first name+value pair. This is + * the start of our downward-filling free space. + */ + int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc); + + /* + * Remove the name+value at this location. Do whatever is + * appropriate with the remaining name+value pairs. + */ + void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc); + + /* Fill xl_entry with a new entry */ + void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash); + + /* Add name+value storage to an entry */ + void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size); + + /* + * Initialize the value buf's access and bh fields for this entry. + * ocfs2_xa_fill_value_buf() will handle the xv pointer. + */ + void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb); +}; + +/* + * Describes an xattr entry location. This is a memory structure + * tracking the on-disk structure. + */ +struct ocfs2_xa_loc { + /* This xattr belongs to this inode */ + struct inode *xl_inode; + + /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */ + struct ocfs2_xattr_header *xl_header; + + /* Bytes from xl_header to the end of the storage */ + int xl_size; + + /* + * The ocfs2_xattr_entry this location describes. If this is + * NULL, this location describes the on-disk structure where it + * would have been. 
+ */ + struct ocfs2_xattr_entry *xl_entry; + + /* + * Internal housekeeping + */ + + /* Buffer(s) containing this entry */ + void *xl_storage; + + /* Operations on the storage backing this location */ + const struct ocfs2_xa_loc_operations *xl_ops; +}; + +/* + * Convenience functions to calculate how much space is needed for a + * given name+value pair + */ +static int namevalue_size(int name_len, uint64_t value_len) +{ + if (value_len > OCFS2_XATTR_INLINE_SIZE) + return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + else + return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len); +} + +static int namevalue_size_xi(struct ocfs2_xattr_info *xi) +{ + return namevalue_size(xi->xi_name_len, xi->xi_value_len); +} + +static int namevalue_size_xe(struct ocfs2_xattr_entry *xe) +{ + u64 value_len = le64_to_cpu(xe->xe_value_size); + + BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) && + ocfs2_xattr_is_local(xe)); + return namevalue_size(xe->xe_name_len, value_len); +} + + +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset); + +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct buffer_head *blk_bh, + char *buffer, + size_t buffer_size); + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); + +typedef int (xattr_tree_rec_func)(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para); +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *root_bh, + xattr_tree_rec_func *rec_func, + void *para); +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para); + +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, + u32 *first_hash); +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_need, + int *credits); +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh); + +static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) +{ + return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE; +} + +static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) +{ + return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); +} + +#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) +#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) +#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) + +static struct ocfs2_xattr_bucket 
*ocfs2_xattr_bucket_new(struct inode *inode) +{ + struct ocfs2_xattr_bucket *bucket; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET); + + bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS); + if (bucket) { + bucket->bu_inode = inode; + bucket->bu_blocks = blks; + } + + return bucket; +} + +static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket) +{ + int i; + + for (i = 0; i < bucket->bu_blocks; i++) { + brelse(bucket->bu_bhs[i]); + bucket->bu_bhs[i] = NULL; + } +} + +static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) +{ + if (bucket) { + ocfs2_xattr_bucket_relse(bucket); + bucket->bu_inode = NULL; + kfree(bucket); + } +} + +/* + * A bucket that has never been written to disk doesn't need to be + * read. We just need the buffer_heads. Don't call this for + * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes + * them fully. + */ +static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb, + xb_blkno + i); + if (!bucket->bu_bhs[i]) { + rc = -EIO; + mlog_errno(rc); + break; + } + + if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i])) + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +/* Read the xattr bucket at xb_blkno */ +static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno) +{ + int rc; + + rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno, + bucket->bu_blocks, bucket->bu_bhs, 0, + NULL); + if (!rc) { + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, + bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + if (rc) + mlog_errno(rc); + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +static int ocfs2_xattr_bucket_journal_access(handle_t *handle, + struct ocfs2_xattr_bucket *bucket, + int type) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + rc = ocfs2_journal_access(handle, + INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i], type); + if (rc) { + mlog_errno(rc); + break; + } + } + + return rc; +} + +static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle, + struct ocfs2_xattr_bucket *bucket) +{ + int i; + + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); + + for (i = 0; i < bucket->bu_blocks; i++) + ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); +} + +static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest, + struct ocfs2_xattr_bucket *src) +{ + int i; + int blocksize = src->bu_inode->i_sb->s_blocksize; + + BUG_ON(dest->bu_blocks != src->bu_blocks); + BUG_ON(dest->bu_inode != src->bu_inode); + + for (i = 0; i < src->bu_blocks; i++) { + memcpy(bucket_block(dest, i), bucket_block(src, i), + blocksize); + } +} + +static int ocfs2_validate_xattr_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)bh->b_data; + + trace_ocfs2_validate_xattr_block((unsigned 
long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check); + if (rc) + return rc; + + /* + * Errors after here are fatal + */ + + if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { + ocfs2_error(sb, + "Extended attribute block #%llu has bad " + "signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); + return -EINVAL; + } + + if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extended attribute block #%llu has an " + "invalid xb_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extended attribute block #%llu has an invalid " + "xb_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp, + ocfs2_validate_xattr_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +static inline const char *ocfs2_xattr_prefix(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < OCFS2_XATTR_MAX) + handler = ocfs2_xattr_handler_map[name_index]; + + return handler ? handler->prefix : NULL; +} + +static u32 ocfs2_xattr_name_hash(struct inode *inode, + const char *name, + int name_len) +{ + /* Get hash value of uuid from super block */ + u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash; + int i; + + /* hash extended attribute name */ + for (i = 0; i < name_len; i++) { + hash = (hash << OCFS2_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^ + *name++; + } + + return hash; +} + +static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) +{ + return namevalue_size(name_len, value_len) + + sizeof(struct ocfs2_xattr_entry); +} + +static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi) +{ + return namevalue_size_xi(xi) + + sizeof(struct ocfs2_xattr_entry); +} + +static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe) +{ + return namevalue_size_xe(xe) + + sizeof(struct ocfs2_xattr_entry); +} + +int ocfs2_calc_security_init(struct inode *dir, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + struct ocfs2_alloc_context **xattr_ac) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * So reserve one metadata block for it is ok. 
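+ *
+ * (Breaking that figure down: 256 bytes is the allowance assumed for
+ * the padded name, 80 is OCFS2_XATTR_INLINE_SIZE for an inline value,
+ * and 16 is the size of one struct ocfs2_xattr_entry;
+ * 256 + 80 + 16 = 352.)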
+ */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + s_size > OCFS2_XATTR_FREE_IN_IBODY) { + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + /* reserve clusters for xattr value which will be set in B tree*/ + if (si->value_len > OCFS2_XATTR_INLINE_SIZE) { + int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + return ret; +} + +int ocfs2_calc_xattr_init(struct inode *dir, + struct buffer_head *dir_bh, + int mode, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + int *want_meta) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = 0, a_size = 0, acl_len = 0, new_clusters; + + if (si->enable) + s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + "", NULL, 0); + if (acl_len > 0) { + a_size = ocfs2_xattr_entry_real_size(0, acl_len); + if (S_ISDIR(mode)) + a_size <<= 1; + } else if (acl_len != 0 && acl_len != -ENODATA) { + mlog_errno(ret); + return ret; + } + } + + if (!(s_size + a_size)) + return ret; + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * The max space of acl xattr taken inline is + * 80(value) + 16(entry) * 2(if directory) = 192 bytes, + * when blocksize = 512, may reserve one more cluser for + * xattr bucket, otherwise reserve one metadata block + * for them is ok. + * If this is a new directory with inline data, + * we choose to reserve the entire inline area for + * directory contents and force an external xattr block. + */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || + (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { + *want_meta = *want_meta + 1; + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE && + (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) { + *want_clusters += 1; + *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb); + } + + /* + * reserve credits and clusters for xattrs which has large value + * and have to be set outside + */ + if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) { + new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL && + acl_len > OCFS2_XATTR_INLINE_SIZE) { + /* for directory, it has DEFAULT and ACCESS two types of acls */ + new_clusters = (S_ISDIR(mode) ? 
2 : 1) * + ocfs2_clusters_for_bytes(dir->i_sb, acl_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + + return ret; +} + +static int ocfs2_xattr_extend_allocation(struct inode *inode, + u32 clusters_to_add, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int status = 0, credits; + handle_t *handle = ctxt->handle; + enum ocfs2_alloc_restarted why; + u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + while (clusters_to_add) { + trace_ocfs2_xattr_extend_allocation(clusters_to_add); + + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + break; + } + + prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(handle, + &et, + &logical_start, + clusters_to_add, + 0, + ctxt->data_ac, + ctxt->meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + break; + } + + ocfs2_journal_dirty(handle, vb->vb_bh); + + clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - + prev_clusters; + + if (why != RESTART_NONE && clusters_to_add) { + /* + * We can only fail in case the alloc file doesn't give + * up enough clusters. + */ + BUG_ON(why == RESTART_META); + + credits = ocfs2_calc_extend_credits(inode->i_sb, + &vb->vb_xv->xr_list, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + status = -ENOMEM; + mlog_errno(status); + break; + } + } + } + + return status; +} + +static int __ocfs2_remove_xattr_range(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + u32 cpos, u32 phys_cpos, u32 len, + unsigned int ext_flags, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + handle_t *handle = ctxt->handle; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac, + &ctxt->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + le32_add_cpu(&vb->vb_xv->xr_clusters, -len); + ocfs2_journal_dirty(handle, vb->vb_bh); + + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(inode->i_sb, + phys_blkno), + len, ctxt->meta_ac, &ctxt->dealloc, 1); + else + ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, + phys_blkno, len); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_xattr_shrink_size(struct inode *inode, + u32 old_clusters, + u32 new_clusters, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret = 0; + unsigned int ext_flags; + u32 trunc_len, cpos, phys_cpos, alloc_size; + u64 block; + + if (old_clusters <= new_clusters) + return 0; + + cpos = new_clusters; + trunc_len = old_clusters - new_clusters; + while (trunc_len) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, + &vb->vb_xv->xr_list, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > trunc_len) + alloc_size = trunc_len; + + ret = __ocfs2_remove_xattr_range(inode, vb, cpos, + phys_cpos, alloc_size, + ext_flags, ctxt); + if (ret) { + 
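[Editorial sketch, not part of the patch] ocfs2_xattr_shrink_size() removes old_clusters - new_clusters clusters by repeatedly asking ocfs2_xattr_get_clusters() for the extent covering cpos, clamping the chunk to what is left to remove, and advancing. The sketch below keeps only that partitioning logic; the extent map is hypothetical demo data standing in for the value tree lookup.

#include <stdio.h>

/* Hypothetical stand-in for one allocated extent of an xattr value tree. */
struct demo_extent {
        unsigned int cpos;      /* first logical cluster covered */
        unsigned int clusters;  /* length of the extent */
};

/* Mirror of the loop in ocfs2_xattr_shrink_size(): walk logical clusters
 * from new_clusters upward, carving the truncation range into per-extent
 * chunks and clamping each chunk to what remains. */
static void demo_shrink(const struct demo_extent *map, int nr,
                        unsigned int old_clusters, unsigned int new_clusters)
{
        unsigned int cpos = new_clusters;
        unsigned int trunc_len = old_clusters - new_clusters;
        int i = 0;

        while (trunc_len && i < nr) {
                unsigned int alloc_size;

                /* Skip extents that end before the truncation point. */
                if (map[i].cpos + map[i].clusters <= cpos) {
                        i++;
                        continue;
                }

                alloc_size = map[i].clusters - (cpos - map[i].cpos);
                if (alloc_size > trunc_len)
                        alloc_size = trunc_len;

                printf("remove %u clusters at logical %u\n", alloc_size, cpos);
                cpos += alloc_size;
                trunc_len -= alloc_size;
                i++;
        }
}

int main(void)
{
        const struct demo_extent map[] = { { 0, 4 }, { 4, 8 }, { 12, 4 } };

        demo_shrink(map, 3, 16, 2);     /* shrink a 16-cluster value to 2 */
        return 0;
}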
mlog_errno(ret); + goto out; + } + + block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), + block, alloc_size); + cpos += alloc_size; + trunc_len -= alloc_size; + } + +out: + return ret; +} + +static int ocfs2_xattr_value_truncate(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); + u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + + if (new_clusters == old_clusters) + return 0; + + if (new_clusters > old_clusters) + ret = ocfs2_xattr_extend_allocation(inode, + new_clusters - old_clusters, + vb, ctxt); + else + ret = ocfs2_xattr_shrink_size(inode, + old_clusters, new_clusters, + vb, ctxt); + + return ret; +} + +static int ocfs2_xattr_list_entry(char *buffer, size_t size, + size_t *result, const char *prefix, + const char *name, int name_len) +{ + char *p = buffer + *result; + int prefix_len = strlen(prefix); + int total_len = prefix_len + name_len + 1; + + *result += total_len; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + return 0; + + if (*result > size) + return -ERANGE; + + memcpy(p, prefix, prefix_len); + memcpy(p + prefix_len, name, name_len); + p[prefix_len + name_len] = '\0'; + + return 0; +} + +static int ocfs2_xattr_list_entries(struct inode *inode, + struct ocfs2_xattr_header *header, + char *buffer, size_t buffer_size) +{ + size_t result = 0; + int i, type, ret; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); + + ret = ocfs2_xattr_list_entry(buffer, buffer_size, + &result, prefix, name, + entry->xe_name_len); + if (ret) + return ret; + } + } + + return result; +} + +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_xattr_header *xh; + int i; + + xh = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) + if (!ocfs2_xattr_is_local(&xh->xh_entries[i])) + return 1; + + return 0; +} + +static int ocfs2_xattr_ibody_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_xattr_header *header = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); + + return ret; +} + +static int ocfs2_xattr_block_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + ret = ocfs2_xattr_list_entries(inode, header, + buffer, buffer_size); + } 
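[Editorial sketch, not part of the patch] ocfs2_xattr_list_entry() above follows the usual listxattr contract: with a zero-sized buffer it only accumulates how many bytes would be needed, and with a real buffer it appends "prefix" + "name" + NUL, failing with -ERANGE if the result would overflow. A compact userspace rendition of that contract:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Same contract as ocfs2_xattr_list_entry(): *result always grows by the
 * space the entry needs; bytes are copied only when size is non-zero, and
 * -ERANGE is returned when the caller's buffer is too small. */
static int demo_list_entry(char *buffer, size_t size, size_t *result,
                           const char *prefix, const char *name)
{
        size_t prefix_len = strlen(prefix);
        size_t name_len = strlen(name);
        char *p = buffer + *result;

        *result += prefix_len + name_len + 1;

        if (!size)              /* size-probe pass: just count */
                return 0;
        if (*result > size)
                return -ERANGE;

        memcpy(p, prefix, prefix_len);
        memcpy(p + prefix_len, name, name_len);
        p[prefix_len + name_len] = '\0';
        return 0;
}

int main(void)
{
        char buf[64];
        size_t used = 0;

        demo_list_entry(buf, sizeof(buf), &used, "user.", "demo");
        printf("listed %zu bytes: %s\n", used, buf);
        return 0;
}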
else + ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh, + buffer, buffer_size); + + brelse(blk_bh); + + return ret; +} + +ssize_t ocfs2_listxattr(struct dentry *dentry, + char *buffer, + size_t size) +{ + int ret = 0, i_ret = 0, b_ret = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + + if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return ret; + + ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + if (i_ret < 0) + b_ret = 0; + else { + if (buffer) { + buffer += i_ret; + size -= i_ret; + } + b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + buffer, size); + if (b_ret < 0) + i_ret = 0; + } + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(dentry->d_inode, 0); + + brelse(di_bh); + + return i_ret + b_ret; +} + +static int ocfs2_xattr_find_entry(int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *entry; + size_t name_len; + int i, cmp = 1; + + if (name == NULL) + return -EINVAL; + + name_len = strlen(name); + entry = xs->here; + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + cmp = name_index - ocfs2_xattr_get_type(entry); + if (!cmp) + cmp = name_len - entry->xe_name_len; + if (!cmp) + cmp = memcmp(name, (xs->base + + le16_to_cpu(entry->xe_name_offset)), + name_len); + if (cmp == 0) + break; + entry += 1; + } + xs->here = entry; + + return cmp ? -ENODATA : 0; +} + +static int ocfs2_xattr_get_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + void *buffer, + size_t len) +{ + u32 cpos, p_cluster, num_clusters, bpc, clusters; + u64 blkno; + int i, ret = 0; + size_t cplen, blocksize; + struct buffer_head *bh = NULL; + struct ocfs2_extent_list *el; + + el = &xv->xr_list; + clusters = le32_to_cpu(xv->xr_clusters); + bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + blocksize = inode->i_sb->s_blocksize; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + /* Copy ocfs2_xattr_value */ + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + cplen = len >= blocksize ? 
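[Editorial sketch, not part of the patch] ocfs2_xattr_find_entry() scans the entry array comparing, in order, the namespace index, the name length, and finally the name bytes; the first field that differs short-circuits the rest. The sketch below isolates that comparison over a toy array; the struct layout is hypothetical (the on-disk entry stores a name offset, not a pointer) and only the comparison order is taken from the code.

#include <stdio.h>
#include <string.h>

/* Hypothetical flattened entry for the demo. */
struct demo_entry {
        int type;               /* namespace index */
        const char *name;
};

/* Same short-circuit order as ocfs2_xattr_find_entry(): namespace first,
 * then length, then the name bytes themselves. */
static int demo_find_entry(const struct demo_entry *entries, int count,
                           int type, const char *name)
{
        size_t name_len = strlen(name);
        int i;

        for (i = 0; i < count; i++) {
                int cmp = type - entries[i].type;

                if (!cmp)
                        cmp = (int)name_len - (int)strlen(entries[i].name);
                if (!cmp)
                        cmp = memcmp(name, entries[i].name, name_len);
                if (!cmp)
                        return i;
        }
        return -1;              /* the kernel code reports -ENODATA here */
}

int main(void)
{
        const struct demo_entry tbl[] = { { 1, "foo" }, { 1, "demo" } };

        printf("index of (1, demo) = %d\n", demo_find_entry(tbl, 2, 1, "demo"));
        return 0;
}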
blocksize : len; + memcpy(buffer, bh->b_data, cplen); + len -= cplen; + buffer += cplen; + + brelse(bh); + bh = NULL; + if (len == 0) + break; + } + cpos += num_clusters; + } +out: + return ret; +} + +static int ocfs2_xattr_ibody_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return -ENODATA; + + xs->end = (void *)di + inode->i_sb->s_blocksize; + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret) + return ret; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + if (size > buffer_size) + return -ERANGE; + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + le16_to_cpu(xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len), size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + le16_to_cpu( + xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len)); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return size; +} + +static int ocfs2_xattr_block_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = -ENODATA, name_offset, name_len, i; + int uninitialized_var(block_off); + + xs->bucket = ocfs2_xattr_bucket_new(inode); + if (!xs->bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto cleanup; + } + + ret = ocfs2_xattr_block_find(inode, name_index, name, xs); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + + if (xs->not_found) { + ret = -ENODATA; + goto cleanup; + } + + xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + ret = -ERANGE; + if (size > buffer_size) + goto cleanup; + + name_offset = le16_to_cpu(xs->here->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len); + i = xs->here - xs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xs->bucket), + i, + &block_off, + &name_offset); + xs->base = bucket_block(xs->bucket, block_off); + } + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + name_offset + name_len, size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + name_offset + name_len); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + goto cleanup; + } + } + } + ret = size; +cleanup: + ocfs2_xattr_bucket_free(xs->bucket); + + brelse(xs->xattr_bh); + xs->xattr_bh = NULL; + return ret; +} + +int ocfs2_xattr_get_nolock(struct inode *inode, + struct buffer_head *di_bh, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + 
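[Editorial usage note, not part of the patch] ocfs2_xattr_ibody_get() and ocfs2_xattr_block_get() return the value size when called without a buffer and -ERANGE when the supplied buffer is too small, which is exactly the calling convention user space relies on. A typical two-pass consumer of that convention, using the generic getxattr(2) interface rather than anything ocfs2-specific (the path and attribute name below are placeholders):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/xattr.h>

/* Standard two-pass getxattr pattern: probe the size with a NULL buffer,
 * allocate, then fetch the value for real. */
int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "/tmp/demo";
        const char *name = "user.demo";         /* hypothetical attribute */
        ssize_t len = getxattr(path, name, NULL, 0);
        char *value;

        if (len < 0) {
                perror("getxattr size probe");
                return 1;
        }

        value = malloc(len + 1);
        if (!value)
                return 1;

        len = getxattr(path, name, value, len);
        if (len < 0) {
                perror("getxattr");
                free(value);
                return 1;
        }
        value[len] = '\0';
        printf("%s = %s\n", name, value);
        free(value);
        return 0;
}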
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + ret = -ENODATA; + + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size, &xis); + if (ret == -ENODATA && di->i_xattr_loc) + ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, + buffer_size, &xbs); + + return ret; +} + +/* ocfs2_xattr_get() + * + * Copy an extended attribute into the buffer provided. + * Buffer is NULL to compute the size of buffer required. + */ +static int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct buffer_head *di_bh = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + down_read(&OCFS2_I(inode)->ip_xattr_sem); + ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + name, buffer, buffer_size); + up_read(&OCFS2_I(inode)->ip_xattr_sem); + + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return ret; +} + +static int __ocfs2_xattr_set_value_outside(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_value_buf *vb, + const void *value, + int value_len) +{ + int ret = 0, i, cp_len; + u16 blocksize = inode->i_sb->s_blocksize; + u32 p_cluster, num_clusters; + u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); + u64 blkno; + struct buffer_head *bh = NULL; + unsigned int ext_flags; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + + BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, + INODE_CACHE(inode), + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + cp_len = value_len > blocksize ? blocksize : value_len; + memcpy(bh->b_data, value, cp_len); + value_len -= cp_len; + value += cp_len; + if (cp_len < blocksize) + memset(bh->b_data + cp_len, 0, + blocksize - cp_len); + + ocfs2_journal_dirty(handle, bh); + brelse(bh); + bh = NULL; + + /* + * XXX: do we need to empty all the following + * blocks in this cluster? 
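[Editorial sketch, not part of the patch] __ocfs2_xattr_set_value_outside() writes the value one file-system block at a time: each block receives min(blocksize, remaining) payload bytes, and when the payload ends mid-block the tail of that block is zeroed before the buffer is marked dirty in the journal. The copy-and-pad arithmetic in isolation, without the journalling and block I/O:

#include <stdio.h>
#include <string.h>

/* Copy `value_len` bytes into consecutive fixed-size blocks, zero-padding
 * the tail of the last block touched. */
static void demo_fill_blocks(char *blocks, size_t blocksize, size_t nr_blocks,
                             const char *value, size_t value_len)
{
        size_t i;

        for (i = 0; i < nr_blocks && value_len; i++) {
                char *block = blocks + i * blocksize;
                size_t cp_len = value_len > blocksize ? blocksize : value_len;

                memcpy(block, value, cp_len);
                if (cp_len < blocksize)
                        memset(block + cp_len, 0, blocksize - cp_len);

                value += cp_len;
                value_len -= cp_len;
        }
}

int main(void)
{
        char blocks[2 * 8];                     /* two pretend 8-byte blocks */
        const char demo_value[] = "hello xattr";

        demo_fill_blocks(blocks, 8, 2, demo_value, sizeof(demo_value) - 1);
        printf("second block starts with: %.3s\n", blocks + 8);
        return 0;
}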
+ */ + if (!value_len) + break; + } + cpos += num_clusters; + } +out: + brelse(bh); + + return ret; +} + +static int ocfs2_xa_check_space_helper(int needed_space, int free_start, + int num_entries) +{ + int free_space; + + if (!needed_space) + return 0; + + free_space = free_start - + sizeof(struct ocfs2_xattr_header) - + (num_entries * sizeof(struct ocfs2_xattr_entry)) - + OCFS2_XATTR_HEADER_GAP; + if (free_space < 0) + return -EIO; + if (free_space < needed_space) + return -ENOSPC; + + return 0; +} + +static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc, + int type) +{ + return loc->xl_ops->xlo_journal_access(handle, loc, type); +} + +static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc) +{ + loc->xl_ops->xlo_journal_dirty(handle, loc); +} + +/* Give a pointer into the storage for the given offset */ +static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset) +{ + BUG_ON(offset >= loc->xl_size); + return loc->xl_ops->xlo_offset_pointer(loc, offset); +} + +/* + * Wipe the name+value pair and allow the storage to reclaim it. This + * must be followed by either removal of the entry or a call to + * ocfs2_xa_add_namevalue(). + */ +static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + loc->xl_ops->xlo_wipe_namevalue(loc); +} + +/* + * Find lowest offset to a name+value pair. This is the start of our + * downward-growing free space. + */ +static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc) +{ + return loc->xl_ops->xlo_get_free_start(loc); +} + +/* Can we reuse loc->xl_entry for xi? */ +static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return loc->xl_ops->xlo_can_reuse(loc, xi); +} + +/* How much free space is needed to set the new value */ +static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return loc->xl_ops->xlo_check_space(loc, xi); +} + +static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + loc->xl_ops->xlo_add_entry(loc, name_hash); + loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); + /* + * We can't leave the new entry's xe_name_offset at zero or + * add_namevalue() will go nuts. We set it to the size of our + * storage so that it can never be less than any other entry. 
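[Editorial sketch, not part of the patch] ocfs2_xa_check_space_helper() models the storage as a header plus an upward-growing entry table and a downward-growing name+value region: the usable gap is free_start minus the header, the entry table, and a small guard gap, and -ENOSPC is returned when the requested bytes do not fit. A standalone version with the structure sizes passed in, since the real header, entry, and gap sizes live in the ocfs2 headers:

#include <errno.h>
#include <stdio.h>

/* Same arithmetic as ocfs2_xa_check_space_helper(), with the header size,
 * per-entry size and guard gap supplied by the caller. */
static int demo_check_space(int needed_space, int free_start, int num_entries,
                            int header_size, int entry_size, int guard_gap)
{
        int free_space;

        if (!needed_space)
                return 0;

        free_space = free_start - header_size -
                     num_entries * entry_size - guard_gap;
        if (free_space < 0)
                return -EIO;            /* layout is corrupted */
        if (free_space < needed_space)
                return -ENOSPC;
        return 0;
}

int main(void)
{
        /* Illustrative numbers only: 64-byte header, 16-byte entries. */
        printf("fits: %d\n", demo_check_space(100, 512, 4, 64, 16, 4));
        printf("too big: %d\n", demo_check_space(400, 512, 4, 64, 16, 4));
        return 0;
}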
+ */ + loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); +} + +static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int size = namevalue_size_xi(xi); + int nameval_offset; + char *nameval_buf; + + loc->xl_ops->xlo_add_namevalue(loc, size); + loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); + loc->xl_entry->xe_name_len = xi->xi_name_len; + ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index); + ocfs2_xattr_set_local(loc->xl_entry, + xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE); + + nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); + memset(nameval_buf, 0, size); + memcpy(nameval_buf, xi->xi_name, xi->xi_name_len); +} + +static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len); + + /* Value bufs are for value trees */ + BUG_ON(ocfs2_xattr_is_local(loc->xl_entry)); + BUG_ON(namevalue_size_xe(loc->xl_entry) != + (name_size + OCFS2_XATTR_ROOT_SIZE)); + + loc->xl_ops->xlo_fill_value_buf(loc, vb); + vb->vb_xv = + (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc, + nameval_offset + + name_size); +} + +static int ocfs2_xa_block_journal_access(handle_t *handle, + struct ocfs2_xa_loc *loc, int type) +{ + struct buffer_head *bh = loc->xl_storage; + ocfs2_journal_access_func access; + + if (loc->xl_size == (bh->b_size - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header))) + access = ocfs2_journal_access_xb; + else + access = ocfs2_journal_access_di; + return access(handle, INODE_CACHE(loc->xl_inode), bh, type); +} + +static void ocfs2_xa_block_journal_dirty(handle_t *handle, + struct ocfs2_xa_loc *loc) +{ + struct buffer_head *bh = loc->xl_storage; + + ocfs2_journal_dirty(handle, bh); +} + +static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc, + int offset) +{ + return (char *)loc->xl_header + offset; +} + +static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + /* + * Block storage is strict. If the sizes aren't exact, we will + * remove the old one and reinsert the new. + */ + return namevalue_size_xe(loc->xl_entry) == + namevalue_size_xi(xi); +} + +static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_header *xh = loc->xl_header; + int i, count = le16_to_cpu(xh->xh_count); + int offset, free_start = loc->xl_size; + + for (i = 0; i < count; i++) { + offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); + if (offset < free_start) + free_start = offset; + } + + return free_start; +} + +static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int count = le16_to_cpu(loc->xl_header->xh_count); + int free_start = ocfs2_xa_get_free_start(loc); + int needed_space = ocfs2_xi_entry_usage(xi); + + /* + * Block storage will reclaim the original entry before inserting + * the new value, so we only need the difference. If the new + * entry is smaller than the old one, we don't need anything. + */ + if (loc->xl_entry) { + /* Don't need space if we're reusing! 
*/ + if (ocfs2_xa_can_reuse_entry(loc, xi)) + needed_space = 0; + else + needed_space -= ocfs2_xe_entry_usage(loc->xl_entry); + } + if (needed_space < 0) + needed_space = 0; + return ocfs2_xa_check_space_helper(needed_space, free_start, count); +} + +/* + * Block storage for xattrs keeps the name+value pairs compacted. When + * we remove one, we have to shift any that preceded it towards the end. + */ +static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + int i, offset; + int namevalue_offset, first_namevalue_offset, namevalue_size; + struct ocfs2_xattr_entry *entry = loc->xl_entry; + struct ocfs2_xattr_header *xh = loc->xl_header; + int count = le16_to_cpu(xh->xh_count); + + namevalue_offset = le16_to_cpu(entry->xe_name_offset); + namevalue_size = namevalue_size_xe(entry); + first_namevalue_offset = ocfs2_xa_get_free_start(loc); + + /* Shift the name+value pairs */ + memmove((char *)xh + first_namevalue_offset + namevalue_size, + (char *)xh + first_namevalue_offset, + namevalue_offset - first_namevalue_offset); + memset((char *)xh + first_namevalue_offset, 0, namevalue_size); + + /* Now tell xh->xh_entries about it */ + for (i = 0; i < count; i++) { + offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); + if (offset <= namevalue_offset) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, + namevalue_size); + } + + /* + * Note that we don't update xh_free_start or xh_name_value_len + * because they're not used in block-stored xattrs. + */ +} + +static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + int count = le16_to_cpu(loc->xl_header->xh_count); + loc->xl_entry = &(loc->xl_header->xh_entries[count]); + le16_add_cpu(&loc->xl_header->xh_count, 1); + memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry)); +} + +static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size) +{ + int free_start = ocfs2_xa_get_free_start(loc); + + loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size); +} + +static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + struct buffer_head *bh = loc->xl_storage; + + if (loc->xl_size == (bh->b_size - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header))) + vb->vb_access = ocfs2_journal_access_xb; + else + vb->vb_access = ocfs2_journal_access_di; + vb->vb_bh = bh; +} + +/* + * Operations for xattrs stored in blocks. This includes inline inode + * storage and unindexed ocfs2_xattr_blocks. 
+ */ +static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = { + .xlo_journal_access = ocfs2_xa_block_journal_access, + .xlo_journal_dirty = ocfs2_xa_block_journal_dirty, + .xlo_offset_pointer = ocfs2_xa_block_offset_pointer, + .xlo_check_space = ocfs2_xa_block_check_space, + .xlo_can_reuse = ocfs2_xa_block_can_reuse, + .xlo_get_free_start = ocfs2_xa_block_get_free_start, + .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue, + .xlo_add_entry = ocfs2_xa_block_add_entry, + .xlo_add_namevalue = ocfs2_xa_block_add_namevalue, + .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf, +}; + +static int ocfs2_xa_bucket_journal_access(handle_t *handle, + struct ocfs2_xa_loc *loc, int type) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + + return ocfs2_xattr_bucket_journal_access(handle, bucket, type); +} + +static void ocfs2_xa_bucket_journal_dirty(handle_t *handle, + struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); +} + +static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, + int offset) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + int block, block_offset; + + /* The header is at the front of the bucket */ + block = offset >> loc->xl_inode->i_sb->s_blocksize_bits; + block_offset = offset % loc->xl_inode->i_sb->s_blocksize; + + return bucket_block(bucket, block) + block_offset; +} + +static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return namevalue_size_xe(loc->xl_entry) >= + namevalue_size_xi(xi); +} + +static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + return le16_to_cpu(bucket_xh(bucket)->xh_free_start); +} + +static int ocfs2_bucket_align_free_start(struct super_block *sb, + int free_start, int size) +{ + /* + * We need to make sure that the name+value pair fits within + * one block. + */ + if (((free_start - size) >> sb->s_blocksize_bits) != + ((free_start - 1) >> sb->s_blocksize_bits)) + free_start -= free_start % sb->s_blocksize; + + return free_start; +} + +static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int rc; + int count = le16_to_cpu(loc->xl_header->xh_count); + int free_start = ocfs2_xa_get_free_start(loc); + int needed_space = ocfs2_xi_entry_usage(xi); + int size = namevalue_size_xi(xi); + struct super_block *sb = loc->xl_inode->i_sb; + + /* + * Bucket storage does not reclaim name+value pairs it cannot + * reuse. They live as holes until the bucket fills, and then + * the bucket is defragmented. However, the bucket can reclaim + * the ocfs2_xattr_entry. + */ + if (loc->xl_entry) { + /* Don't need space if we're reusing! */ + if (ocfs2_xa_can_reuse_entry(loc, xi)) + needed_space = 0; + else + needed_space -= sizeof(struct ocfs2_xattr_entry); + } + BUG_ON(needed_space < 0); + + if (free_start < size) { + if (needed_space) + return -ENOSPC; + } else { + /* + * First we check if it would fit in the first place. + * Below, we align the free start to a block. This may + * slide us below the minimum gap. By checking unaligned + * first, we avoid that error. 
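[Editorial sketch, not part of the patch] ocfs2_bucket_align_free_start() guarantees that a name+value pair never straddles a block boundary inside a bucket: if the bytes [free_start - size, free_start) would cross into the previous block, free_start is rounded down to the start of its block. The same rule in isolation, with plain division standing in for the kernel's s_blocksize_bits shifts:

#include <stdio.h>

/* If writing `size` bytes ending at `free_start` would cross a block
 * boundary, round free_start down so the pair stays within one block. */
static int demo_align_free_start(int free_start, int size, int blocksize)
{
        if ((free_start - size) / blocksize != (free_start - 1) / blocksize)
                free_start -= free_start % blocksize;

        return free_start;
}

int main(void)
{
        /* With 512-byte blocks, a 100-byte pair ending at 520 would span
         * two blocks, so the start is pulled back to 512. */
        printf("aligned: %d\n", demo_align_free_start(520, 100, 512));
        printf("already fine: %d\n", demo_align_free_start(512, 100, 512));
        return 0;
}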
+ */ + rc = ocfs2_xa_check_space_helper(needed_space, free_start, + count); + if (rc) + return rc; + free_start = ocfs2_bucket_align_free_start(sb, free_start, + size); + } + return ocfs2_xa_check_space_helper(needed_space, free_start, count); +} + +static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + le16_add_cpu(&loc->xl_header->xh_name_value_len, + -namevalue_size_xe(loc->xl_entry)); +} + +static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + struct ocfs2_xattr_header *xh = loc->xl_header; + int count = le16_to_cpu(xh->xh_count); + int low = 0, high = count - 1, tmp; + struct ocfs2_xattr_entry *tmp_xe; + + /* + * We keep buckets sorted by name_hash, so we need to find + * our insert place. + */ + while (low <= high && count) { + tmp = (low + high) / 2; + tmp_xe = &xh->xh_entries[tmp]; + + if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash)) + low = tmp + 1; + else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash)) + high = tmp - 1; + else { + low = tmp; + break; + } + } + + if (low != count) + memmove(&xh->xh_entries[low + 1], + &xh->xh_entries[low], + ((count - low) * sizeof(struct ocfs2_xattr_entry))); + + le16_add_cpu(&xh->xh_count, 1); + loc->xl_entry = &xh->xh_entries[low]; + memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry)); +} + +static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size) +{ + int free_start = ocfs2_xa_get_free_start(loc); + struct ocfs2_xattr_header *xh = loc->xl_header; + struct super_block *sb = loc->xl_inode->i_sb; + int nameval_offset; + + free_start = ocfs2_bucket_align_free_start(sb, free_start, size); + nameval_offset = free_start - size; + loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset); + xh->xh_free_start = cpu_to_le16(nameval_offset); + le16_add_cpu(&xh->xh_name_value_len, size); + +} + +static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + struct super_block *sb = loc->xl_inode->i_sb; + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int size = namevalue_size_xe(loc->xl_entry); + int block_offset = nameval_offset >> sb->s_blocksize_bits; + + /* Values are not allowed to straddle block boundaries */ + BUG_ON(block_offset != + ((nameval_offset + size - 1) >> sb->s_blocksize_bits)); + /* We expect the bucket to be filled in */ + BUG_ON(!bucket->bu_bhs[block_offset]); + + vb->vb_access = ocfs2_journal_access; + vb->vb_bh = bucket->bu_bhs[block_offset]; +} + +/* Operations for xattrs stored in buckets. 
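[Editorial sketch, not part of the patch] ocfs2_xa_bucket_add_entry() keeps the bucket's entry array sorted by xe_name_hash: a binary search finds the insertion slot and memmove shifts the tail up one slot before the new entry is initialized in place. The same insert-into-sorted-array step on a plain integer array, with hashes standing in for struct ocfs2_xattr_entry:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Binary-search the insertion position, then shift the tail up one slot. */
static int demo_insert_sorted(uint32_t *hashes, int count, uint32_t name_hash)
{
        int low = 0, high = count - 1;

        while (low <= high && count) {
                int mid = (low + high) / 2;

                if (name_hash > hashes[mid])
                        low = mid + 1;
                else if (name_hash < hashes[mid])
                        high = mid - 1;
                else {
                        low = mid;
                        break;
                }
        }

        if (low != count)
                memmove(&hashes[low + 1], &hashes[low],
                        (count - low) * sizeof(*hashes));
        hashes[low] = name_hash;
        return low;
}

int main(void)
{
        uint32_t hashes[8] = { 10, 20, 40, 50 };
        int pos = demo_insert_sorted(hashes, 4, 30);

        printf("inserted at %d -> %u %u %u %u %u\n", pos,
               hashes[0], hashes[1], hashes[2], hashes[3], hashes[4]);
        return 0;
}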
*/ +static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { + .xlo_journal_access = ocfs2_xa_bucket_journal_access, + .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty, + .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer, + .xlo_check_space = ocfs2_xa_bucket_check_space, + .xlo_can_reuse = ocfs2_xa_bucket_can_reuse, + .xlo_get_free_start = ocfs2_xa_bucket_get_free_start, + .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue, + .xlo_add_entry = ocfs2_xa_bucket_add_entry, + .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue, + .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf, +}; + +static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_value_buf vb; + + if (ocfs2_xattr_is_local(loc->xl_entry)) + return 0; + + ocfs2_xa_fill_value_buf(loc, &vb); + return le32_to_cpu(vb.vb_xv->xr_clusters); +} + +static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int trunc_rc, access_rc; + struct ocfs2_xattr_value_buf vb; + + ocfs2_xa_fill_value_buf(loc, &vb); + trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes, + ctxt); + + /* + * The caller of ocfs2_xa_value_truncate() has already called + * ocfs2_xa_journal_access on the loc. However, The truncate code + * calls ocfs2_extend_trans(). This may commit the previous + * transaction and open a new one. If this is a bucket, truncate + * could leave only vb->vb_bh set up for journaling. Meanwhile, + * the caller is expecting to dirty the entire bucket. So we must + * reset the journal work. We do this even if truncate has failed, + * as it could have failed after committing the extend. + */ + access_rc = ocfs2_xa_journal_access(ctxt->handle, loc, + OCFS2_JOURNAL_ACCESS_WRITE); + + /* Errors in truncate take precedence */ + return trunc_rc ? trunc_rc : access_rc; +} + +static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) +{ + int index, count; + struct ocfs2_xattr_header *xh = loc->xl_header; + struct ocfs2_xattr_entry *entry = loc->xl_entry; + + ocfs2_xa_wipe_namevalue(loc); + loc->xl_entry = NULL; + + le16_add_cpu(&xh->xh_count, -1); + count = le16_to_cpu(xh->xh_count); + + /* + * Only zero out the entry if there are more remaining. This is + * important for an empty bucket, as it keeps track of the + * bucket's hash value. It doesn't hurt empty block storage. + */ + if (count) { + index = ((char *)entry - (char *)&xh->xh_entries) / + sizeof(struct ocfs2_xattr_entry); + memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1], + (count - index) * sizeof(struct ocfs2_xattr_entry)); + memset(&xh->xh_entries[count], 0, + sizeof(struct ocfs2_xattr_entry)); + } +} + +/* + * If we have a problem adjusting the size of an external value during + * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr + * in an intermediate state. For example, the value may be partially + * truncated. + * + * If the value tree hasn't changed, the extend/truncate went nowhere. + * We have nothing to do. The caller can treat it as a straight error. + * + * If the value tree got partially truncated, we now have a corrupted + * extended attribute. We're going to wipe its entry and leak the + * clusters. Better to leak some storage than leave a corrupt entry. + * + * If the value tree grew, it obviously didn't grow enough for the + * new entry. We're not going to try and reclaim those clusters either. 
+ * If there was already an external value there (orig_clusters != 0), + * the new clusters are attached safely and we can just leave the old + * value in place. If there was no external value there, we remove + * the entry. + * + * This way, the xattr block we store in the journal will be consistent. + * If the size change broke because of the journal, no changes will hit + * disk anyway. + */ +static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc, + const char *what, + unsigned int orig_clusters) +{ + unsigned int new_clusters = ocfs2_xa_value_clusters(loc); + char *nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + + if (new_clusters < orig_clusters) { + mlog(ML_ERROR, + "Partial truncate while %s xattr %.*s. Leaking " + "%u clusters and removing the entry\n", + what, loc->xl_entry->xe_name_len, nameval_buf, + orig_clusters - new_clusters); + ocfs2_xa_remove_entry(loc); + } else if (!orig_clusters) { + mlog(ML_ERROR, + "Unable to allocate an external value for xattr " + "%.*s safely. Leaking %u clusters and removing the " + "entry\n", + loc->xl_entry->xe_name_len, nameval_buf, + new_clusters - orig_clusters); + ocfs2_xa_remove_entry(loc); + } else if (new_clusters > orig_clusters) + mlog(ML_ERROR, + "Unable to grow xattr %.*s safely. %u new clusters " + "have been added, but the value will not be " + "modified\n", + loc->xl_entry->xe_name_len, nameval_buf, + new_clusters - orig_clusters); +} + +static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + unsigned int orig_clusters; + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + /* + * Since this is remove, we can return 0 if + * ocfs2_xa_cleanup_value_truncate() is going to + * wipe the entry anyway. So we check the + * cluster count as well. + */ + if (orig_clusters != ocfs2_xa_value_clusters(loc)) + rc = 0; + ocfs2_xa_cleanup_value_truncate(loc, "removing", + orig_clusters); + if (rc) + goto out; + } + } + + ocfs2_xa_remove_entry(loc); + +out: + return rc; +} + +static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc) +{ + int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len); + char *nameval_buf; + + nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE); +} + +/* + * Take an existing entry and make it ready for the new value. This + * won't allocate space, but it may free space. It should be ready for + * ocfs2_xa_prepare_entry() to finish the work. 
+ */ +static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + unsigned int orig_clusters; + char *nameval_buf; + int xe_local = ocfs2_xattr_is_local(loc->xl_entry); + int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE; + + BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) != + name_size); + + nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + if (xe_local) { + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - name_size); + if (!xi_local) + ocfs2_xa_install_value_root(loc); + } else { + orig_clusters = ocfs2_xa_value_clusters(loc); + if (xi_local) { + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc < 0) + mlog_errno(rc); + else + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - + name_size); + } else if (le64_to_cpu(loc->xl_entry->xe_value_size) > + xi->xi_value_len) { + rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, + ctxt); + if (rc < 0) + mlog_errno(rc); + } + + if (rc) { + ocfs2_xa_cleanup_value_truncate(loc, "reusing", + orig_clusters); + goto out; + } + } + + loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); + ocfs2_xattr_set_local(loc->xl_entry, xi_local); + +out: + return rc; +} + +/* + * Prepares loc->xl_entry to receive the new xattr. This includes + * properly setting up the name+value pair region. If loc->xl_entry + * already exists, it will take care of modifying it appropriately. + * + * Note that this modifies the data. You did journal_access already, + * right? + */ +static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + u32 name_hash, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + unsigned int orig_clusters; + __le64 orig_value_size = 0; + + rc = ocfs2_xa_check_space(loc, xi); + if (rc) + goto out; + + if (loc->xl_entry) { + if (ocfs2_xa_can_reuse_entry(loc, xi)) { + orig_value_size = loc->xl_entry->xe_value_size; + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); + if (rc) + goto out; + goto alloc_value; + } + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + ocfs2_xa_cleanup_value_truncate(loc, + "overwriting", + orig_clusters); + goto out; + } + } + ocfs2_xa_wipe_namevalue(loc); + } else + ocfs2_xa_add_entry(loc, name_hash); + + /* + * If we get here, we have a blank entry. Fill it. We grow our + * name+value pair back from the end. + */ + ocfs2_xa_add_namevalue(loc, xi); + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) + ocfs2_xa_install_value_root(loc); + +alloc_value: + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); + if (rc < 0) { + ctxt->set_abort = 1; + ocfs2_xa_cleanup_value_truncate(loc, "growing", + orig_clusters); + /* + * If we were growing an existing value, + * ocfs2_xa_cleanup_value_truncate() won't remove + * the entry. We need to restore the original value + * size. + */ + if (loc->xl_entry) { + BUG_ON(!orig_value_size); + loc->xl_entry->xe_value_size = orig_value_size; + } + mlog_errno(rc); + } + } + +out: + return rc; +} + +/* + * Store the value portion of the name+value pair. This will skip + * values that are stored externally. Their tree roots were set up + * by ocfs2_xa_prepare_entry(). 
+ */ +static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + char *nameval_buf; + struct ocfs2_xattr_value_buf vb; + + nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + ocfs2_xa_fill_value_buf(loc, &vb); + rc = __ocfs2_xattr_set_value_outside(loc->xl_inode, + ctxt->handle, &vb, + xi->xi_value, + xi->xi_value_len); + } else + memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len); + + return rc; +} + +static int ocfs2_xa_set(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name, + xi->xi_name_len); + + ret = ocfs2_xa_journal_access(ctxt->handle, loc, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * From here on out, everything is going to modify the buffer a + * little. Errors are going to leave the xattr header in a + * sane state. Thus, even with errors we dirty the sucker. + */ + + /* Don't worry, we are never called with !xi_value and !xl_entry */ + if (!xi->xi_value) { + ret = ocfs2_xa_remove(loc, ctxt); + goto out_dirty; + } + + ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out_dirty; + } + + ret = ocfs2_xa_store_value(loc, xi, ctxt); + if (ret) + mlog_errno(ret); + +out_dirty: + ocfs2_xa_journal_dirty(ctxt->handle, loc); + +out: + return ret; +} + +static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_entry *entry) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL)); + + loc->xl_inode = inode; + loc->xl_ops = &ocfs2_xa_block_loc_ops; + loc->xl_storage = bh; + loc->xl_entry = entry; + loc->xl_size = le16_to_cpu(di->i_xattr_inline_size); + loc->xl_header = + (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size - + loc->xl_size); +} + +static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_entry *entry) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)bh->b_data; + + BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED); + + loc->xl_inode = inode; + loc->xl_ops = &ocfs2_xa_block_loc_ops; + loc->xl_storage = bh; + loc->xl_header = &(xb->xb_attrs.xb_header); + loc->xl_entry = entry; + loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header); +} + +static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_bucket *bucket, + struct ocfs2_xattr_entry *entry) +{ + loc->xl_inode = bucket->bu_inode; + loc->xl_ops = &ocfs2_xa_bucket_loc_ops; + loc->xl_storage = bucket; + loc->xl_header = bucket_xh(bucket); + loc->xl_entry = entry; + loc->xl_size = OCFS2_XATTR_BUCKET_SIZE; +} + +/* + * In xattr remove, if it is stored outside and refcounted, we may have + * the chance to split the refcount tree. So need the allocators. 
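[Editorial sketch, not part of the patch] The three ocfs2_init_*_xa_loc() helpers bind the same ocfs2_xa_loc to different storage (inline dinode area, unindexed xattr block, or bucket) simply by pointing xl_ops at ocfs2_xa_block_loc_ops or ocfs2_xa_bucket_loc_ops, so ocfs2_xa_set() never branches on where the bytes live. A generic, self-contained illustration of that ops-table pattern; the names below are invented for the demo and are not ocfs2 symbols.

#include <stdio.h>

/* Minimal ops-table dispatch in the style of ocfs2_xa_loc_operations. */
struct demo_loc;

struct demo_loc_ops {
        int (*get_free_start)(struct demo_loc *loc);
};

struct demo_loc {
        const struct demo_loc_ops *ops;
        int size;
};

static int demo_block_free_start(struct demo_loc *loc)
{
        return loc->size;               /* pretend the block is empty */
}

static int demo_bucket_free_start(struct demo_loc *loc)
{
        return loc->size / 2;           /* pretend the bucket is half full */
}

static const struct demo_loc_ops demo_block_ops = {
        .get_free_start = demo_block_free_start,
};
static const struct demo_loc_ops demo_bucket_ops = {
        .get_free_start = demo_bucket_free_start,
};

int main(void)
{
        struct demo_loc block = { &demo_block_ops, 4096 };
        struct demo_loc bucket = { &demo_bucket_ops, 4096 };

        /* Callers only ever go through ops, as ocfs2_xa_set() does. */
        printf("block free_start=%d bucket free_start=%d\n",
               block.ops->get_free_start(&block),
               bucket.ops->get_free_start(&bucket));
        return 0;
}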
+ */ +static int ocfs2_lock_xattr_remove_allocators(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + int *ref_credits) +{ + int ret, meta_add = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + *ref_credits = 0; + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci, + ref_root_bh, xv, + &meta_add, ref_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_remove_value_outside(struct inode*inode, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + int ret = 0, i, ref_credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + void *val; + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + + if (ocfs2_xattr_is_local(entry)) + continue; + + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + vb->vb_xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); + + ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv, + ref_ci, ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, ref_credits + + ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); + if (ret < 0) { + mlog_errno(ret); + break; + } + + ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; + } + } + + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); + return ret; +} + +static int ocfs2_xattr_ibody_remove(struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_xattr_header *header; + int ret; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = di_bh, + .vb_access = ocfs2_journal_access_di, + }; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); + + return ret; +} + +struct ocfs2_rm_xattr_bucket_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; +}; + +static int ocfs2_xattr_block_remove(struct inode *inode, + struct buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + struct ocfs2_xattr_block *xb; + int ret = 0; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + struct ocfs2_rm_xattr_bucket_para args = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + }; + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 
+ struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); + } else + ret = ocfs2_iterate_xattr_index_block(inode, + blk_bh, + ocfs2_rm_xattr_cluster, + &args); + + return ret; +} + +static int ocfs2_xattr_free_block(struct inode *inode, + u64 block, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) +{ + struct inode *xb_alloc_inode; + struct buffer_head *xb_alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + int ret = 0; + u64 blk, bg_blkno; + u16 bit; + + ret = ocfs2_read_xattr_block(inode, block, &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + blk = le64_to_cpu(xb->xb_blkno); + bit = le16_to_cpu(xb->xb_suballoc_bit); + if (xb->xb_suballoc_loc) + bg_blkno = le64_to_cpu(xb->xb_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + xb_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(xb->xb_suballoc_slot)); + if (!xb_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&xb_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh, + bit, bg_blkno, 1); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out_unlock: + ocfs2_inode_unlock(xb_alloc_inode, 1); + brelse(xb_alloc_bh); +out_mutex: + mutex_unlock(&xb_alloc_inode->i_mutex); + iput(xb_alloc_inode); +out: + brelse(blk_bh); + return ret; +} + +/* + * ocfs2_xattr_remove() + * + * Free extended attribute resources associated with this inode. 
+ */ +int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_refcount_tree *ref_tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_caching_info *ref_ci = NULL; + handle_t *handle; + int ret; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + ref_ci = &ref_tree->rf_ci; + + } + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_ibody_remove(inode, di_bh, + ref_ci, ref_root_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (di->i_xattr_loc) { + ret = ocfs2_xattr_free_block(inode, + le64_to_cpu(di->i_xattr_loc), + ref_ci, ref_root_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + di->i_xattr_loc = 0; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_journal_dirty(handle, di_bh); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + if (ref_tree) + ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1); + brelse(ref_root_bh); + return ret; +} + +static int ocfs2_xattr_has_space_inline(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size; + int free; + + if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size); + } else if (ocfs2_inode_is_fast_symlink(inode)) { + free = ocfs2_fast_symlink_chars(inode->i_sb) - + le64_to_cpu(di->i_size); + } else { + struct ocfs2_extent_list *el = &di->id2.i_list; + free = (le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec)) * + sizeof(struct ocfs2_extent_rec); + } + if (free >= xattrsize) + return 1; + + return 0; +} + +/* + * ocfs2_xattr_ibody_find() + * + * Find extended attribute in inode block and + * fill search info into struct ocfs2_xattr_search. 
+ */ +static int ocfs2_xattr_ibody_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + int has_space = 0; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + down_read(&oi->ip_alloc_sem); + has_space = ocfs2_xattr_has_space_inline(inode, di); + up_read(&oi->ip_alloc_sem); + if (!has_space) + return 0; + } + + xs->xattr_bh = xs->inode_bh; + xs->end = (void *)di + inode->i_sb->s_blocksize; + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + else + xs->header = (struct ocfs2_xattr_header *) + (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + /* Find the named attribute. */ + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret && ret != -ENODATA) + return ret; + xs->not_found = ret; + } + + return 0; +} + +static int ocfs2_xattr_ibody_init(struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int xattrsize = osb->s_xattr_inline_size; + + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Adjust extent record count or inline data size + * to reserve space for extended attribute. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + le16_add_cpu(&idata->id_count, -xattrsize); + } else if (!(ocfs2_inode_is_fast_symlink(inode))) { + struct ocfs2_extent_list *el = &di->id2.i_list; + le16_add_cpu(&el->l_count, -(xattrsize / + sizeof(struct ocfs2_extent_rec))); + } + di->i_xattr_inline_size = cpu_to_le16(xattrsize); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ocfs2_journal_dirty(ctxt->handle, di_bh); + +out: + return ret; +} + +/* + * ocfs2_xattr_ibody_set() + * + * Set, replace or remove an extended attribute into inode block. + * + */ +static int ocfs2_xattr_ibody_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_xa_loc loc; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return -ENOSPC; + + down_write(&oi->ip_alloc_sem); + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + } + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, + xs->not_found ? 
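[Editorial sketch, not part of the patch] ocfs2_xattr_ibody_init() reserves the inline xattr area by shrinking whatever already shares the inode block with it: id_count for inline data, or the extent list's l_count by xattrsize / sizeof(struct ocfs2_extent_rec) slots (fast symlinks are left untouched), before recording the reservation in i_xattr_inline_size. A tiny arithmetic sketch of that adjustment; the record size is a parameter because the real sizeof(struct ocfs2_extent_rec) comes from the ocfs2 headers.

#include <stdio.h>

/* How many extent-list slots must be surrendered to reserve `xattrsize`
 * bytes of inline xattr space. */
static int demo_extent_recs_to_drop(int xattrsize, int extent_rec_size)
{
        return xattrsize / extent_rec_size;
}

int main(void)
{
        /* Illustrative numbers: a 256-byte inline xattr area and 16-byte
         * extent records would cost 16 list slots. */
        printf("records dropped: %d\n", demo_extent_recs_to_drop(256, 16));
        return 0;
}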
NULL : xs->here); + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + xs->here = loc.xl_entry; + +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +/* + * ocfs2_xattr_block_find() + * + * Find extended attribute in external block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret && ret != -ENODATA) { + xs->xattr_bh = NULL; + goto cleanup; + } + xs->not_found = ret; + return 0; +cleanup: + brelse(blk_bh); + + return ret; +} + +static int ocfs2_create_xattr_block(struct inode *inode, + struct buffer_head *inode_bh, + struct ocfs2_xattr_set_ctxt *ctxt, + int indexed, + struct buffer_head **ret_bh) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 suballoc_loc, first_blkno; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; + struct buffer_head *new_bh = NULL; + struct ocfs2_xattr_block *xblk; + + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), + inode_bh, OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1, + &suballoc_loc, &suballoc_bit_start, + &num_got, &first_blkno); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode), + new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + /* Initialize ocfs2_xattr_block */ + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); + xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = + cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + if (indexed) { + struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16( + ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); + } + ocfs2_journal_dirty(ctxt->handle, new_bh); + + /* Add it to the inode */ + di->i_xattr_loc = cpu_to_le64(first_blkno); + + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL; + di->i_dyn_features = 
cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ocfs2_journal_dirty(ctxt->handle, inode_bh); + + *ret_bh = new_bh; + new_bh = NULL; + +end: + brelse(new_bh); + return ret; +} + +/* + * ocfs2_xattr_block_set() + * + * Set, replace or remove an extended attribute into external block. + * + */ +static int ocfs2_xattr_block_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + struct buffer_head *new_bh = NULL; + struct ocfs2_xattr_block *xblk = NULL; + int ret; + struct ocfs2_xa_loc loc; + + if (!xs->xattr_bh) { + ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt, + 0, &new_bh); + if (ret) { + mlog_errno(ret); + goto end; + } + + xs->xattr_bh = new_bh; + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + xs->header = &xblk->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)xblk + inode->i_sb->s_blocksize; + xs->here = xs->header->xh_entries; + } else + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + + if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { + ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh, + xs->not_found ? NULL : xs->here); + + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) + xs->here = loc.xl_entry; + else if ((ret != -ENOSPC) || ctxt->set_abort) + goto end; + else { + ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); + if (ret) + goto end; + } + } + + if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED) + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); + +end: + return ret; +} + +/* Check whether the new xattr can be inserted into the inode. */ +static int ocfs2_xattr_can_be_in_inode(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *last; + int free, i; + size_t min_offs = xs->end - xs->base; + + if (!xs->header) + return 0; + + last = xs->header->xh_entries; + + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t offs = le16_to_cpu(last->xe_name_offset); + if (offs < min_offs) + min_offs = offs; + last += 1; + } + + free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; + if (free < 0) + return 0; + + BUG_ON(!xs->not_found); + + if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi))) + return 1; + + return 0; +} + +static int ocfs2_calc_xattr_set_need(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + int *clusters_need, + int *meta_need, + int *credits_need) +{ + int ret = 0, old_in_xb = 0; + int clusters_add = 0, meta_add = 0, credits = 0; + struct buffer_head *bh = NULL; + struct ocfs2_xattr_block *xb = NULL; + struct ocfs2_xattr_entry *xe = NULL; + struct ocfs2_xattr_value_root *xv = NULL; + char *base = NULL; + int name_offset, name_len = 0; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + xi->xi_value_len); + u64 value_size; + + /* + * Calculate the clusters we need to write. + * No matter whether we replace an old one or add a new one, + * we need this for writing. 
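+ * (Only values larger than OCFS2_XATTR_INLINE_SIZE live outside the + * name/value area, so only those add the per-cluster write credits below.)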
+ */ + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) + credits += new_clusters * + ocfs2_clusters_to_blocks(inode->i_sb, 1); + + if (xis->not_found && xbs->not_found) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + clusters_add += new_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &def_xv.xv.xr_list, + new_clusters); + } + + goto meta_guess; + } + + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + credits += OCFS2_INODE_UPDATE_CREDITS; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; + old_in_xb = 1; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + base = bucket_block(xbs->bucket, block_off); + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + } else { + base = xbs->base; + credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS; + } + } + + /* + * delete a xattr doesn't need metadata and cluster allocation. + * so just calculate the credits and return. + * + * The credits for removing the value tree will be extended + * by ocfs2_remove_extent itself. + */ + if (!xi->xi_value) { + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_remove_extent_credits(inode->i_sb); + + goto out; + } + + /* do cluster allocation guess first. */ + value_size = le64_to_cpu(xe->xe_value_size); + + if (old_in_xb) { + /* + * In xattr set, we always try to set the xe in inode first, + * so if it can be inserted into inode successfully, the old + * one will be removed from the xattr block, and this xattr + * will be inserted into inode as a new xattr in inode. + */ + if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) { + clusters_add += new_clusters; + credits += ocfs2_remove_extent_credits(inode->i_sb) + + OCFS2_INODE_UPDATE_CREDITS; + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_calc_extend_credits( + inode->i_sb, + &def_xv.xv.xr_list, + new_clusters); + goto out; + } + } + + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + /* the new values will be stored outside. */ + u32 old_clusters = 0; + + if (!ocfs2_xattr_is_local(xe)) { + old_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + value_size); + xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + value_size = OCFS2_XATTR_ROOT_SIZE; + } else + xv = &def_xv.xv; + + if (old_clusters >= new_clusters) { + credits += ocfs2_remove_extent_credits(inode->i_sb); + goto out; + } else { + meta_add += ocfs2_extend_meta_needed(&xv->xr_list); + clusters_add += new_clusters - old_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &xv->xr_list, + new_clusters - + old_clusters); + if (value_size >= OCFS2_XATTR_ROOT_SIZE) + goto out; + } + } else { + /* + * Now the new value will be stored inside. So if the new + * value is smaller than the size of value root or the old + * value, we don't need any allocation, otherwise we have + * to guess metadata allocation. + */ + if ((ocfs2_xattr_is_local(xe) && + (value_size >= xi->xi_value_len)) || + (!ocfs2_xattr_is_local(xe) && + OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len)) + goto out; + } + +meta_guess: + /* calculate metadata allocation. 
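+ * Three cases follow: extend an existing xattr index tree, create the + * index tree for an un-indexed xattr block, or allocate a brand new + * xattr block.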
*/ + if (di->i_xattr_loc) { + if (!xbs->xattr_bh) { + ret = ocfs2_read_xattr_block(inode, + le64_to_cpu(di->i_xattr_loc), + &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)bh->b_data; + } else + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + + /* + * If there is already an xattr tree, good, we can calculate + * like other b-trees. Otherwise we may have the chance of + * create a tree, the credit calculation is borrowed from + * ocfs2_calc_extend_credits with root_el = NULL. And the + * new tree will be cluster based, so no meta is needed. + */ + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + struct ocfs2_extent_list *el = + &xb->xb_attrs.xb_root.xt_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el, 1); + } else + credits += OCFS2_SUBALLOC_ALLOC + 1; + + /* + * This cluster will be used either for new bucket or for + * new xattr block. + * If the cluster size is the same as the bucket size, one + * more is needed since we may need to extend the bucket + * also. + */ + clusters_add += 1; + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + if (OCFS2_XATTR_BUCKET_SIZE == + OCFS2_SB(inode->i_sb)->s_clustersize) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + clusters_add += 1; + } + } else { + meta_add += 1; + credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } +out: + if (clusters_need) + *clusters_need = clusters_add; + if (meta_need) + *meta_need = meta_add; + if (credits_need) + *credits_need = credits; + brelse(bh); + return ret; +} + +static int ocfs2_init_xattr_set_ctxt(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt, + int extra_meta, + int *credits) +{ + int clusters_add, meta_add, ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt)); + + ocfs2_init_dealloc_ctxt(&ctxt->dealloc); + + ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs, + &clusters_add, &meta_add, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + meta_add += extra_meta; + trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add, + clusters_add, *credits); + + if (meta_add) { + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, + &ctxt->meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (clusters_add) { + ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (ctxt->meta_ac) { + ocfs2_free_alloc_context(ctxt->meta_ac); + ctxt->meta_ac = NULL; + } + + /* + * We cannot have an error and a non null ctxt->data_ac. + */ + } + + return ret; +} + +static int __ocfs2_xattr_set_handle(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret = 0, credits, old_found; + + if (!xi->xi_value) { + /* Remove existing extended attribute */ + if (!xis->not_found) + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + else if (!xbs->not_found) + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else { + /* We always try to set extended attribute into inode first*/ + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + if (!ret && !xbs->not_found) { + /* + * If succeed and that extended attribute existing in + * external block, then we will remove it. 
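+ * (The removal below is simply another ocfs2_xattr_block_set() call with + * xi_value set to NULL, after extending the handle by the extra credits.)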
+ */ + xi->xi_value = NULL; + xi->xi_value_len = 0; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else if ((ret == -ENOSPC) && !ctxt->set_abort) { + if (di->i_xattr_loc && !xbs->xattr_bh) { + ret = ocfs2_xattr_block_find(inode, + xi->xi_name_index, + xi->xi_name, xbs); + if (ret) + goto out; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + } + /* + * If no space in inode, we will set extended attribute + * into external block. + */ + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + if (ret) + goto out; + if (!xis->not_found) { + /* + * If succeed and that extended attribute + * existing in inode, we will remove it. + */ + xi->xi_value = NULL; + xi->xi_value_len = 0; + xbs->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_ibody_set(inode, xi, + xis, ctxt); + } + } + } + + if (!ret) { + /* Update inode ctime. */ + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), + xis->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); + } +out: + return ret; +} + +/* + * This function only called duing creating inode + * for init security/acl xattrs of the new inode. + * All transanction credits have been reserved in mknod. + */ +int ocfs2_xattr_set_handle(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_dinode *di; + int ret; + + struct ocfs2_xattr_info xi = { + .xi_name_index = name_index, + .xi_name = name, + .xi_name_len = strlen(name), + .xi_value = value, + .xi_value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_set_ctxt ctxt = { + .handle = handle, + .meta_ac = meta_ac, + .data_ac = data_ac, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + /* + * In extreme situation, may need xattr bucket when + * block size is too small. And we have already reserved + * the credits for bucket in mknod. 
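+ * ("Too small" means s_blocksize == OCFS2_MIN_BLOCKSIZE; in that case the + * inode body stores no xattrs at all, see ocfs2_xattr_ibody_find() above.)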
+ */ + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) { + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + } + + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + +cleanup: + up_write(&OCFS2_I(inode)->ip_xattr_sem); + brelse(xbs.xattr_bh); + ocfs2_xattr_bucket_free(xbs.bucket); + + return ret; +} + +/* + * ocfs2_xattr_set() + * + * Set, replace or remove an extended attribute for this inode. + * value is NULL to remove an existing extended attribute, else either + * create or replace an extended attribute. + */ +int ocfs2_xattr_set(struct inode *inode, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags) +{ + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int ret, credits, ref_meta = 0, ref_credits = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + struct ocfs2_refcount_tree *ref_tree = NULL; + + struct ocfs2_xattr_info xi = { + .xi_name_index = name_index, + .xi_name = name, + .xi_name_len = strlen(name), + .xi_value = value, + .xi_value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + /* + * Only xbs will be used on indexed trees. xis doesn't need a + * bucket. + */ + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto cleanup_nolock; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + /* + * Scan inode and external block to find the same name + * extended attribute and collect search information. + */ + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + if (xis.not_found && xbs.not_found) { + ret = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + ret = 0; + if (!value) + goto cleanup; + } else { + ret = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + + /* Check whether the value is refcounted and do some preparation. 
*/ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + (!xis.not_found || !xbs.not_found)) { + ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, + &xis, &xbs, &ref_tree, + &ref_meta, &ref_credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mutex_unlock(&tl_inode->i_mutex); + mlog_errno(ret); + goto cleanup; + } + } + mutex_unlock(&tl_inode->i_mutex); + + ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, + &xbs, &ctxt, ref_meta, &credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + + /* we need to update inode's ctime field, so add credit for it. */ + credits += OCFS2_INODE_UPDATE_CREDITS; + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto cleanup; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); + + if (ctxt.data_ac) + ocfs2_free_alloc_context(ctxt.data_ac); + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); + +cleanup: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + up_write(&OCFS2_I(inode)->ip_xattr_sem); + if (!value && !ret) { + ret = ocfs2_try_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); + } + ocfs2_inode_unlock(inode, 1); +cleanup_nolock: + brelse(di_bh); + brelse(xbs.xattr_bh); + ocfs2_xattr_bucket_free(xbs.bucket); + + return ret; +} + +/* + * Find the xattr extent rec which may contains name_hash. + * e_cpos will be the first name hash of the xattr rec. + * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list. 
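+ * The leaf is scanned from its last record backwards; the first rec whose + * e_cpos <= name_hash is the one that may contain this hash.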
+ */ +static int ocfs2_xattr_get_rec(struct inode *inode, + u32 name_hash, + u64 *p_blkno, + u32 *e_cpos, + u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + u64 e_blkno = 0; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= name_hash) { + e_blkno = le64_to_cpu(rec->e_blkno); + break; + } + } + + if (!e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *p_blkno = le64_to_cpu(rec->e_blkno); + *num_clusters = le16_to_cpu(rec->e_leaf_clusters); + if (e_cpos) + *e_cpos = le32_to_cpu(rec->e_cpos); +out: + brelse(eb_bh); + return ret; +} + +typedef int (xattr_bucket_func)(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); + +static int ocfs2_find_xe_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int name_index, + const char *name, + u32 name_hash, + u16 *xe_index, + int *found) +{ + int i, ret = 0, cmp = 1, block_off, new_offset; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + size_t name_len = strlen(name); + struct ocfs2_xattr_entry *xe = NULL; + char *xe_name; + + /* + * We don't use binary search in the bucket because there + * may be multiple entries with the same name hash. + */ + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) + continue; + else if (name_hash < le32_to_cpu(xe->xe_name_hash)) + break; + + cmp = name_index - ocfs2_xattr_get_type(xe); + if (!cmp) + cmp = name_len - xe->xe_name_len; + if (cmp) + continue; + + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + xh, + i, + &block_off, + &new_offset); + if (ret) { + mlog_errno(ret); + break; + } + + + xe_name = bucket_block(bucket, block_off) + new_offset; + if (!memcmp(name, xe_name, name_len)) { + *xe_index = i; + *found = 1; + ret = 0; + break; + } + } + + return ret; +} + +/* + * Find the specified xattr entry in a series of buckets. + * This series start from p_blkno and last for num_clusters. + * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains + * the num of the valid buckets. + * + * Return the buffer_head this xattr should reside in. And if the xattr's + * hash is in the gap of 2 buckets, return the lower bucket. 
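+ * The lookup is a binary search over those buckets, comparing name_hash + * against the first and last xe_name_hash of each bucket probed.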
+ */ +static int ocfs2_xattr_bucket_find(struct inode *inode, + int name_index, + const char *name, + u32 name_hash, + u64 p_blkno, + u32 first_hash, + u32 num_clusters, + struct ocfs2_xattr_search *xs) +{ + int ret, found = 0; + struct ocfs2_xattr_header *xh = NULL; + struct ocfs2_xattr_entry *xe = NULL; + u16 index = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int low_bucket = 0, bucket, high_bucket; + struct ocfs2_xattr_bucket *search; + u32 last_hash; + u64 blkno, lower_blkno = 0; + + search = ocfs2_xattr_bucket_new(inode); + if (!search) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(search, p_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = bucket_xh(search); + high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; + while (low_bucket <= high_bucket) { + ocfs2_xattr_bucket_relse(search); + + bucket = (low_bucket + high_bucket) / 2; + blkno = p_blkno + bucket * blk_per_bucket; + ret = ocfs2_read_xattr_bucket(search, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = bucket_xh(search); + xe = &xh->xh_entries[0]; + if (name_hash < le32_to_cpu(xe->xe_name_hash)) { + high_bucket = bucket - 1; + continue; + } + + /* + * Check whether the hash of the last entry in our + * bucket is larger than the search one. for an empty + * bucket, the last one is also the first one. + */ + if (xh->xh_count) + xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1]; + + last_hash = le32_to_cpu(xe->xe_name_hash); + + /* record lower_blkno which may be the insert place. */ + lower_blkno = blkno; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) { + low_bucket = bucket + 1; + continue; + } + + /* the searched xattr should reside in this bucket if exists. */ + ret = ocfs2_find_xe_in_bucket(inode, search, + name_index, name, name_hash, + &index, &found); + if (ret) { + mlog_errno(ret); + goto out; + } + break; + } + + /* + * Record the bucket we have found. + * When the xattr's hash value is in the gap of 2 buckets, we will + * always set it to the previous bucket. 
+ */ + if (!lower_blkno) + lower_blkno = p_blkno; + + /* This should be in cache - we just read it during the search */ + ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (found) { + xs->here = &xs->header->xh_entries[index]; + trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno, + name, name_index, name_hash, + (unsigned long long)bucket_blkno(xs->bucket), + index); + } else + ret = -ENODATA; + +out: + ocfs2_xattr_bucket_free(search); + return ret; +} + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + int ret; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u64 p_blkno = 0; + u32 first_hash, num_clusters = 0; + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return -ENODATA; + + trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno, + name, name_index, name_hash, + (unsigned long long)root_bh->b_blocknr, + -1); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); + + trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno, + name, name_index, first_hash, + (unsigned long long)p_blkno, + num_clusters); + + ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, + p_blkno, first_hash, num_clusters, xs); + +out: + return ret; +} + +static int ocfs2_iterate_xattr_buckets(struct inode *inode, + u64 blkno, + u32 clusters, + xattr_bucket_func *func, + void *para) +{ + int i, ret = 0; + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + u32 num_buckets = clusters * bpc; + struct ocfs2_xattr_bucket *bucket; + + bucket = ocfs2_xattr_bucket_new(inode); + if (!bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + trace_ocfs2_iterate_xattr_buckets( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, clusters); + + for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) { + ret = ocfs2_read_xattr_bucket(bucket, blkno); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. 
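+ * So the loop bound num_buckets is refreshed from xh_num_buckets once + * the first bucket has been read.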
+ */ + if (i == 0) + num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets); + + trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno, + le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); + if (func) { + ret = func(inode, bucket, para); + if (ret && ret != -ERANGE) + mlog_errno(ret); + /* Fall through to bucket_relse() */ + } + + ocfs2_xattr_bucket_relse(bucket); + if (ret) + break; + } + + ocfs2_xattr_bucket_free(bucket); + return ret; +} + +struct ocfs2_xattr_tree_list { + char *buffer; + size_t buffer_size; + size_t result; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset) +{ + u16 name_offset; + + if (index < 0 || index >= le16_to_cpu(xh->xh_count)) + return -EINVAL; + + name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); + + *block_off = name_offset >> sb->s_blocksize_bits; + *new_offset = name_offset % sb->s_blocksize; + + return 0; +} + +static int ocfs2_list_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, type; + struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; + int i, block_off, new_offset; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(bucket), + i, + &block_off, + &new_offset); + if (ret) + break; + + name = (const char *)bucket_block(bucket, block_off) + + new_offset; + ret = ocfs2_xattr_list_entry(xl->buffer, + xl->buffer_size, + &xl->result, + prefix, name, + entry->xe_name_len); + if (ret) + break; + } + } + + return ret; +} + +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *blk_bh, + xattr_tree_rec_func *rec_func, + void *para) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; + u64 p_blkno = 0; + + if (!el->l_next_free_rec || !rec_func) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + break; + } + + ret = rec_func(inode, blk_bh, p_blkno, e_cpos, + num_clusters, para); + if (ret) { + if (ret != -ERANGE) + mlog_errno(ret); + break; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + + return ret; + +} + +static int ocfs2_list_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_list_xattr_bucket, para); +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct buffer_head *blk_bh, + char *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_xattr_tree_list xl = { + .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_list_xattr_tree_rec, &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = xl.result; +out: + return ret; +} + +static int cmp_xe(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_hash = le32_to_cpu(l->xe_name_hash); + u32 r_hash = 
le32_to_cpu(r->xe_name_hash); + + if (l_hash > r_hash) + return 1; + if (l_hash < r_hash) + return -1; + return 0; +} + +static void swap_xe(void *a, void *b, int size) +{ + struct ocfs2_xattr_entry *l = a, *r = b, tmp; + + tmp = *l; + memcpy(l, r, sizeof(struct ocfs2_xattr_entry)); + memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry)); +} + +/* + * When the ocfs2_xattr_block is filled up, new bucket will be created + * and all the xattr entries will be moved to the new bucket. + * The header goes at the start of the bucket, and the names+values are + * filled from the end. This is why *target starts as the last buffer. + * Note: we need to sort the entries since they are not saved in order + * in the ocfs2_xattr_block. + */ +static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct ocfs2_xattr_bucket *bucket) +{ + int i, blocksize = inode->i_sb->s_blocksize; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u16 offset, size, off_change; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + u16 count = le16_to_cpu(xb_xh->xh_count); + char *src = xb_bh->b_data; + char *target = bucket_block(bucket, blks - 1); + + trace_ocfs2_cp_xattr_block_to_bucket_begin( + (unsigned long long)xb_bh->b_blocknr, + (unsigned long long)bucket_blkno(bucket)); + + for (i = 0; i < blks; i++) + memset(bucket_block(bucket, i), 0, blocksize); + + /* + * Since the xe_name_offset is based on ocfs2_xattr_header, + * there is a offset change corresponding to the change of + * ocfs2_xattr_header's position. + */ + off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + xe = &xb_xh->xh_entries[count - 1]; + offset = le16_to_cpu(xe->xe_name_offset) + off_change; + size = blocksize - offset; + + /* copy all the names and values. */ + memcpy(target + offset, src + offset, size); + + /* Init new header now. */ + xh->xh_count = xb_xh->xh_count; + xh->xh_num_buckets = cpu_to_le16(1); + xh->xh_name_value_len = cpu_to_le16(size); + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); + + /* copy all the entries. */ + target = bucket_block(bucket, 0); + offset = offsetof(struct ocfs2_xattr_header, xh_entries); + size = count * sizeof(struct ocfs2_xattr_entry); + memcpy(target + offset, (char *)xb_xh + offset, size); + + /* Change the xe offset for all the xe because of the move. */ + off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize + + offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + for (i = 0; i < count; i++) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); + + trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change); + + sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); +} + +/* + * After we move xattr from block to index btree, we have to + * update ocfs2_xattr_search to the new xe and base. + * + * When the entry is in xattr block, xattr_bh indicates the storage place. + * While if the entry is in index b-tree, "bucket" indicates the + * real place of the xattr. 
+ */ +static void ocfs2_xattr_update_xattr_search(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct buffer_head *old_bh) +{ + char *buf = old_bh->b_data; + struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; + struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; + int i; + + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (xs->not_found) + return; + + i = xs->here - old_xh->xh_entries; + xs->here = &xs->header->xh_entries[i]; +} + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 bit_off, len; + u64 blkno; + handle_t *handle = ctxt->handle; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *xb_bh = xs->xattr_bh; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xr; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + + trace_ocfs2_xattr_create_index_block_begin( + (unsigned long long)xb_bh->b_blocknr); + + BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); + BUG_ON(!xs->bucket); + + /* + * XXX: + * We can use this lock for now, and maybe move to a dedicated mutex + * if performance becomes a problem later. + */ + down_write(&oi->ip_alloc_sem); + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, + 1, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * The bucket may spread in many blocks, and + * we will only touch the 1st block and the last block + * in the whole bucket(one for entry and one for data). + */ + blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); + + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket); + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); + + ocfs2_xattr_update_xattr_search(inode, xs, xb_bh); + + /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xr = &xb->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + + xr->xt_list.l_recs[0].e_cpos = 0; + xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + + xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); + + ocfs2_journal_dirty(handle, xb_bh); + +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +static int cmp_xe_offset(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_name_offset = le16_to_cpu(l->xe_name_offset); + u32 r_name_offset = le16_to_cpu(r->xe_name_offset); + + if (l_name_offset < r_name_offset) + return 1; + if (l_name_offset > r_name_offset) + return -1; + return 0; +} + +/* + * defrag a xattr bucket if we find that the bucket has some + * holes beteen name/value pairs. 
+ * We will move all the name/value pairs to the end of the bucket + * so that we can spare some space for insertion. + */ +static int ocfs2_defrag_xattr_bucket(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *bucket) +{ + int ret, i; + size_t end, offset, len; + struct ocfs2_xattr_header *xh; + char *entries, *buf, *bucket_buf = NULL; + u64 blkno = bucket_blkno(bucket); + u16 xh_free_start; + size_t blocksize = inode->i_sb->s_blocksize; + struct ocfs2_xattr_entry *xe; + + /* + * In order to make the operation more efficient and generic, + * we copy all the blocks into a contiguous memory and do the + * defragment there, so if anything is error, we will not touch + * the real block. + */ + bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS); + if (!bucket_buf) { + ret = -EIO; + goto out; + } + + buf = bucket_buf; + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(buf, bucket_block(bucket, i), blocksize); + + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bucket_buf; + entries = (char *)xh->xh_entries; + xh_free_start = le16_to_cpu(xh->xh_free_start); + + trace_ocfs2_defrag_xattr_bucket( + (unsigned long long)blkno, le16_to_cpu(xh->xh_count), + xh_free_start, le16_to_cpu(xh->xh_name_value_len)); + + /* + * sort all the entries by their offset. + * the largest will be the first, so that we can + * move them to the end one by one. + */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe_offset, swap_xe); + + /* Move all name/values to the end of the bucket. */ + xe = xh->xh_entries; + end = OCFS2_XATTR_BUCKET_SIZE; + for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { + offset = le16_to_cpu(xe->xe_name_offset); + len = namevalue_size_xe(xe); + + /* + * We must make sure that the name/value pair + * exist in the same block. So adjust end to + * the previous block end if needed. + */ + if (((end - len) / blocksize != + (end - 1) / blocksize)) + end = end - end % blocksize; + + if (end > offset + len) { + memmove(bucket_buf + end - len, + bucket_buf + offset, len); + xe->xe_name_offset = cpu_to_le16(end - len); + } + + mlog_bug_on_msg(end < offset + len, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + end -= len; + } + + mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + if (xh_free_start == end) + goto out; + + memset(bucket_buf + xh_free_start, 0, end - xh_free_start); + xh->xh_free_start = cpu_to_le16(end); + + /* sort the entries by their name_hash. */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); + + buf = bucket_buf; + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(bucket_block(bucket, i), buf, blocksize); + ocfs2_xattr_bucket_journal_dirty(handle, bucket); + +out: + kfree(bucket_buf); + return ret; +} + +/* + * prev_blkno points to the start of an existing extent. new_blkno + * points to a newly allocated extent. Because we know each of our + * clusters contains more than bucket, we can easily split one cluster + * at a bucket boundary. So we take the last cluster of the existing + * extent and split it down the middle. We move the last half of the + * buckets in the last cluster of the existing extent over to the new + * extent. 
+ * + * first_bh is the buffer at prev_blkno so we can update the existing + * extent's bucket count. header_bh is the bucket were we were hoping + * to insert our xattr. If the bucket move places the target in the new + * extent, we'll update first_bh and header_bh after modifying the old + * extent. + * + * first_hash will be set as the 1st xe's name_hash in the new extent. + */ +static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, + u64 new_blkno, + u32 num_clusters, + u32 *first_hash) +{ + int ret; + struct super_block *sb = inode->i_sb; + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb); + int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); + int to_move = num_buckets / 2; + u64 src_blkno; + u64 last_cluster_blkno = bucket_blkno(first) + + ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1)); + + BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets); + BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize); + + trace_ocfs2_mv_xattr_bucket_cross_cluster( + (unsigned long long)last_cluster_blkno, + (unsigned long long)new_blkno); + + ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first), + last_cluster_blkno, new_blkno, + to_move, first_hash); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This is the first bucket that got moved */ + src_blkno = last_cluster_blkno + (to_move * blks_per_bucket); + + /* + * If the target bucket was part of the moved buckets, we need to + * update first and target. + */ + if (bucket_blkno(target) >= src_blkno) { + /* Find the block for the new target bucket */ + src_blkno = new_blkno + + (bucket_blkno(target) - src_blkno); + + ocfs2_xattr_bucket_relse(first); + ocfs2_xattr_bucket_relse(target); + + /* + * These shouldn't fail - the buffers are in the + * journal from ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_read_xattr_bucket(first, new_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_read_xattr_bucket(target, src_blkno); + if (ret) + mlog_errno(ret); + + } + +out: + return ret; +} + +/* + * Find the suitable pos when we divide a bucket into 2. + * We have to make sure the xattrs with the same hash value exist + * in the same bucket. + * + * If this ocfs2_xattr_header covers more than one hash value, find a + * place where the hash value changes. Try to find the most even split. + * The most common case is that all entries have different hash values, + * and the first check we make will find a place to split. + */ +static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh) +{ + struct ocfs2_xattr_entry *entries = xh->xh_entries; + int count = le16_to_cpu(xh->xh_count); + int delta, middle = count / 2; + + /* + * We start at the middle. Each step gets farther away in both + * directions. We therefore hit the change in hash value + * nearest to the middle. Note that this loop does not execute for + * count < 2. 
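+ * For example, with sorted hashes {1, 1, 2, 2} (count = 4, middle = 2), the + * first pass (delta == 0) compares entries[1] and entries[2], sees the hash + * change, and returns 2, splitting the bucket evenly.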
+ */ + for (delta = 0; delta < middle; delta++) { + /* Let's check delta earlier than middle */ + if (cmp_xe(&entries[middle - delta - 1], + &entries[middle - delta])) + return middle - delta; + + /* For even counts, don't walk off the end */ + if ((middle + delta + 1) == count) + continue; + + /* Now try delta past middle */ + if (cmp_xe(&entries[middle + delta], + &entries[middle + delta + 1])) + return middle + delta + 1; + } + + /* Every entry had the same hash */ + return count; +} + +/* + * Move some xattrs in old bucket(blk) to new bucket(new_blk). + * first_hash will record the 1st hash of the new bucket. + * + * Normally half of the xattrs will be moved. But we have to make + * sure that the xattrs with the same hash value are stored in the + * same bucket. If all the xattrs in this bucket have the same hash + * value, the new bucket will be initialized as an empty one and the + * first_hash will be initialized as (hash_value+1). + */ +static int ocfs2_divide_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 blk, + u64 new_blk, + u32 *first_hash, + int new_bucket_head) +{ + int ret, i; + int count, start, len, name_value_len = 0, name_offset = 0; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + int blocksize = inode->i_sb->s_blocksize; + + trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk, + (unsigned long long)new_blk); + + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(s_bucket, blk); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Even if !new_bucket_head, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the + * same part of ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + new_bucket_head ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = bucket_xh(s_bucket); + count = le16_to_cpu(xh->xh_count); + start = ocfs2_xattr_find_divide_pos(xh); + + if (start == count) { + xe = &xh->xh_entries[start-1]; + + /* + * initialized a new empty bucket here. + * The hash value is set as one larger than + * that of the last entry in the previous bucket. + */ + for (i = 0; i < t_bucket->bu_blocks; i++) + memset(bucket_block(t_bucket, i), 0, blocksize); + + xh = bucket_xh(t_bucket); + xh->xh_free_start = cpu_to_le16(blocksize); + xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; + le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); + + goto set_num_buckets; + } + + /* copy the whole bucket to the new first. */ + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); + + /* update the new bucket. */ + xh = bucket_xh(t_bucket); + + /* + * Calculate the total name/value len and xh_free_start for + * the old bucket first. 
+ */ + name_offset = OCFS2_XATTR_BUCKET_SIZE; + name_value_len = 0; + for (i = 0; i < start; i++) { + xe = &xh->xh_entries[i]; + name_value_len += namevalue_size_xe(xe); + if (le16_to_cpu(xe->xe_name_offset) < name_offset) + name_offset = le16_to_cpu(xe->xe_name_offset); + } + + /* + * Now begin the modification to the new bucket. + * + * In the new bucket, We just move the xattr entry to the beginning + * and don't touch the name/value. So there will be some holes in the + * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is + * called. + */ + xe = &xh->xh_entries[start]; + len = sizeof(struct ocfs2_xattr_entry) * (count - start); + trace_ocfs2_divide_xattr_bucket_move(len, + (int)((char *)xe - (char *)xh), + (int)((char *)xh->xh_entries - (char *)xh)); + memmove((char *)xh->xh_entries, (char *)xe, len); + xe = &xh->xh_entries[count - start]; + len = sizeof(struct ocfs2_xattr_entry) * start; + memset((char *)xe, 0, len); + + le16_add_cpu(&xh->xh_count, -start); + le16_add_cpu(&xh->xh_name_value_len, -name_value_len); + + /* Calculate xh_free_start for the new bucket. */ + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (le16_to_cpu(xe->xe_name_offset) < + le16_to_cpu(xh->xh_free_start)) + xh->xh_free_start = xe->xe_name_offset; + } + +set_num_buckets: + /* set xh->xh_num_buckets for the new xh. */ + if (new_bucket_head) + xh->xh_num_buckets = cpu_to_le16(1); + else + xh->xh_num_buckets = 0; + + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); + + /* store the first_hash of the new bucket. */ + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + + /* + * Now only update the 1st block of the old bucket. If we + * just added a new empty bucket, there is no need to modify + * it. + */ + if (start == count) + goto out; + + xh = bucket_xh(s_bucket); + memset(&xh->xh_entries[start], 0, + sizeof(struct ocfs2_xattr_entry) * (count - start)); + xh->xh_count = cpu_to_le16(start); + xh->xh_free_start = cpu_to_le16(name_offset); + xh->xh_name_value_len = cpu_to_le16(name_value_len); + + ocfs2_xattr_bucket_journal_dirty(handle, s_bucket); + +out: + ocfs2_xattr_bucket_free(s_bucket); + ocfs2_xattr_bucket_free(t_bucket); + + return ret; +} + +/* + * Copy xattr from one bucket to another bucket. + * + * The caller must make sure that the journal transaction + * has enough space for journaling. + */ +static int ocfs2_cp_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 s_blkno, + u64 t_blkno, + int t_is_new) +{ + int ret; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; + + BUG_ON(s_blkno == t_blkno); + + trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno, + (unsigned long long)t_blkno, + t_is_new); + + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno); + if (ret) + goto out; + + /* + * Even if !t_is_new, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); + if (ret) + goto out; + + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new + * cluster to fill, we came here from + * ocfs2_mv_xattr_buckets(), and it is really new - + * ACCESS_CREATE is required. 
But we also might have moved data + * out of t_bucket before extending back into it. + * ocfs2_add_new_xattr_bucket() can do this - its call to + * ocfs2_add_new_xattr_cluster() may have created a new extent + * and copied out the end of the old extent. Then it re-extends + * the old extent back to create space for new xattrs. That's + * how we get here, and the bucket isn't really new. + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + t_is_new ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) + goto out; + + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); + +out: + ocfs2_xattr_bucket_free(t_bucket); + ocfs2_xattr_bucket_free(s_bucket); + + return ret; +} + +/* + * src_blk points to the start of an existing extent. last_blk points to + * last cluster in that extent. to_blk points to a newly allocated + * extent. We copy the buckets from the cluster at last_blk to the new + * extent. If start_bucket is non-zero, we skip that many buckets before + * we start copying. The new extent's xh_num_buckets gets set to the + * number of buckets we copied. The old extent's xh_num_buckets shrinks + * by the same amount. + */ +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + struct ocfs2_xattr_bucket *old_first, *new_first; + + trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk, + (unsigned long long)to_blk); + + BUG_ON(start_bucket >= num_buckets); + if (start_bucket) { + num_buckets -= start_bucket; + last_blk += (start_bucket * blks_per_bucket); + } + + /* The first bucket of the original extent */ + old_first = ocfs2_xattr_bucket_new(inode); + /* The first bucket of the new extent */ + new_first = ocfs2_xattr_bucket_new(inode); + if (!old_first || !new_first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(old_first, src_blk); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We need to update the first bucket of the old extent and all + * the buckets going to the new extent. + */ + credits = ((num_buckets + 1) * blks_per_bucket); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, old_first, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < num_buckets; i++) { + ret = ocfs2_cp_xattr_bucket(inode, handle, + last_blk + (i * blks_per_bucket), + to_blk + (i * blks_per_bucket), + 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Get the new bucket ready before we dirty anything + * (This actually shouldn't fail, because we already dirtied + * it once in ocfs2_cp_xattr_bucket()). 
+ */ + ret = ocfs2_read_xattr_bucket(new_first, to_blk); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_bucket_journal_access(handle, new_first, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Now update the headers */ + le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, old_first); + + bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, new_first); + + if (first_hash) + *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash); + +out: + ocfs2_xattr_bucket_free(new_first); + ocfs2_xattr_bucket_free(old_first); + return ret; +} + +/* + * Move some xattrs in this cluster to the new cluster. + * This function should only be called when bucket size == cluster size. + * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead. + */ +static int ocfs2_divide_xattr_cluster(struct inode *inode, + handle_t *handle, + u64 prev_blk, + u64 new_blk, + u32 *first_hash) +{ + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int ret, credits = 2 * blk_per_bucket; + + BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); + + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + return ocfs2_divide_xattr_bucket(inode, handle, prev_blk, + new_blk, first_hash, 1); +} + +/* + * Move some xattrs from the old cluster to the new one since they are not + * contiguous in ocfs2 xattr tree. + * + * new_blk starts a new separate cluster, and we will move some xattrs from + * prev_blk to it. v_start will be set as the first name hash value in this + * new cluster so that it can be used as e_cpos during tree insertion and + * don't collide with our original b-tree operations. first_bh and header_bh + * will also be updated since they will be used in ocfs2_extend_xattr_bucket + * to extend the insert bucket. + * + * The problem is how much xattr should we move to the new one and when should + * we update first_bh and header_bh? + * 1. If cluster size > bucket size, that means the previous cluster has more + * than 1 bucket, so just move half nums of bucket into the new cluster and + * update the first_bh and header_bh if the insert bucket has been moved + * to the new cluster. + * 2. If cluster_size == bucket_size: + * a) If the previous extent rec has more than one cluster and the insert + * place isn't in the last cluster, copy the entire last cluster to the + * new one. This time, we don't need to upate the first_bh and header_bh + * since they will not be moved into the new cluster. + * b) Otherwise, move the bottom half of the xattrs in the last cluster into + * the new one. And we set the extend flag to zero if the insert place is + * moved into the new allocated cluster since no extend is needed. 
+ */ +static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, + u64 new_blk, + u32 prev_clusters, + u32 *v_start, + int *extend) +{ + int ret; + + trace_ocfs2_adjust_xattr_cross_cluster( + (unsigned long long)bucket_blkno(first), + (unsigned long long)new_blk, prev_clusters); + + if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) { + ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, + handle, + first, target, + new_blk, + prev_clusters, + v_start); + if (ret) + mlog_errno(ret); + } else { + /* The start of the last cluster in the first extent */ + u64 last_blk = bucket_blkno(first) + + ((prev_clusters - 1) * + ocfs2_clusters_to_blocks(inode->i_sb, 1)); + + if (prev_clusters > 1 && bucket_blkno(target) != last_blk) { + ret = ocfs2_mv_xattr_buckets(inode, handle, + bucket_blkno(first), + last_blk, new_blk, 0, + v_start); + if (ret) + mlog_errno(ret); + } else { + ret = ocfs2_divide_xattr_cluster(inode, handle, + last_blk, new_blk, + v_start); + if (ret) + mlog_errno(ret); + + if ((bucket_blkno(target) == last_blk) && extend) + *extend = 0; + } + } + + return ret; +} + +/* + * Add a new cluster for xattr storage. + * + * If the new cluster is contiguous with the previous one, it will be + * appended to the same extent record, and num_clusters will be updated. + * If not, we will insert a new extent for it and move some xattrs in + * the last cluster into the new allocated one. + * We also need to limit the maximum size of a btree leaf, otherwise we'll + * lose the benefits of hashing because we'll have to search large leaves. + * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize, + * if it's bigger). + * + * first_bh is the first block of the previous extent rec and header_bh + * indicates the bucket we will insert the new xattrs. They will be updated + * when the header_bh is moved into the new cluster. 
+ */ +static int ocfs2_add_new_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, + u32 *num_clusters, + u32 prev_cpos, + int *extend, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 prev_clusters = *num_clusters; + u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; + u64 block; + handle_t *handle = ctxt->handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + + trace_ocfs2_add_new_xattr_cluster_begin( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)bucket_blkno(first), + prev_cpos, prev_clusters); + + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits); + + if (bucket_blkno(first) + (prev_clusters * bpc) == block && + (prev_clusters + num_bits) << osb->s_clustersize_bits <= + OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { + /* + * If this cluster is contiguous with the old one and + * adding this new cluster, we don't surpass the limit of + * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be + * initialized and used like other buckets in the previous + * cluster. + * So add it as a contiguous one. The caller will handle + * its init process. + */ + v_start = prev_cpos + prev_clusters; + *num_clusters = prev_clusters + num_bits; + } else { + ret = ocfs2_adjust_xattr_cross_cluster(inode, + handle, + first, + target, + block, + prev_clusters, + &v_start, + extend); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + + trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block, + v_start, num_bits); + ret = ocfs2_insert_extent(handle, &et, v_start, block, + num_bits, 0, ctxt->meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ocfs2_journal_dirty(handle, root_bh); + +leave: + return ret; +} + +/* + * We are given an extent. 'first' is the bucket at the very front of + * the extent. The extent has space for an additional bucket past + * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number + * of the target bucket. We wish to shift every bucket past the target + * down one, filling in that additional space. When we get back to the + * target, we split the target between itself and the now-empty bucket + * at target+1 (aka, target_blkno + blks_per_bucket). 
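The shifting is easy to picture with an ordinary array: walk backwards from the last existing slot to the target, copy each element one slot toward the end, then split the target into the slot that was freed. A minimal userspace sketch, with entry counts instead of real buckets and the split reduced to halving a number:

#include <stdio.h>

#define NBUCKETS 8

/* buckets[0..nr-1] are in use; make room at target + 1 and split into it. */
static void extend_bucket(int *buckets, int nr, int target)
{
        int end;

        /* Shift every bucket past the target one slot toward the end. */
        for (end = nr - 1; end != target; end--)
                buckets[end + 1] = buckets[end];

        /* Hypothetical split: half of the target's entries move to target + 1. */
        buckets[target + 1] = buckets[target] / 2;
        buckets[target]    -= buckets[target + 1];
}

int main(void)
{
        int entries[NBUCKETS] = { 10, 12, 9, 14, 0, 0, 0, 0 };
        int i;

        extend_bucket(entries, 4, 1);   /* extent had 4 buckets, target is bucket 1 */
        for (i = 0; i < 5; i++)
                printf("bucket %d: %d entries\n", i, entries[i]);
        return 0;
}

The backwards walk matters: copying from the end first means no bucket is overwritten before its contents have been moved, which is why the kernel loop below starts at end_blk and works toward target_blk.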
+ */ +static int ocfs2_extend_xattr_bucket(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + u64 target_blk, + u32 num_clusters) +{ + int ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u64 end_blk; + u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets); + + trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk, + (unsigned long long)bucket_blkno(first), + num_clusters, new_bucket); + + /* The extent must have room for an additional bucket */ + BUG_ON(new_bucket >= + (num_clusters * ocfs2_xattr_buckets_per_cluster(osb))); + + /* end_blk points to the last existing bucket */ + end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket); + + /* + * end_blk is the start of the last existing bucket. + * Thus, (end_blk - target_blk) covers the target bucket and + * every bucket after it up to, but not including, the last + * existing bucket. Then we add the last existing bucket, the + * new bucket, and the first bucket (3 * blk_per_bucket). + */ + credits = (end_blk - target_blk) + (3 * blk_per_bucket); + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, first, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + while (end_blk != target_blk) { + ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, + end_blk + blk_per_bucket, 0); + if (ret) + goto out; + end_blk -= blk_per_bucket; + } + + /* Move half of the xattr in target_blkno to the next bucket. */ + ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk, + target_blk + blk_per_bucket, NULL, 0); + + le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1); + ocfs2_xattr_bucket_journal_dirty(handle, first); + +out: + return ret; +} + +/* + * Add new xattr bucket in an extent record and adjust the buckets + * accordingly. xb_bh is the ocfs2_xattr_block, and target is the + * bucket we want to insert into. + * + * In the easy case, we will move all the buckets after target down by + * one. Half of target's xattrs will be moved to the next bucket. + * + * If current cluster is full, we'll allocate a new one. This may not + * be contiguous. The underlying calls will make sure that there is + * space for the insert, shifting buckets around if necessary. + * 'target' may be moved by those calls. 
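The "current cluster is full" test mentioned above boils down to comparing the bucket count recorded in the first bucket's header with the capacity of the extent. A tiny sketch with illustrative numbers:

#include <stdio.h>

static int need_new_cluster(unsigned buckets_in_use,
                            unsigned num_clusters,
                            unsigned buckets_per_cluster)
{
        return buckets_in_use == num_clusters * buckets_per_cluster;
}

int main(void)
{
        /* 2 clusters of 4 buckets each: full at 8 buckets, not at 7. */
        printf("%d\n", need_new_cluster(7, 2, 4));
        printf("%d\n", need_new_cluster(8, 2, 4));
        return 0;
}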
+ */ +static int ocfs2_add_new_xattr_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct ocfs2_xattr_bucket *target, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u32 name_hash = + le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret, num_buckets, extend = 1; + u64 p_blkno; + u32 e_cpos, num_clusters; + /* The bucket at the front of the extent */ + struct ocfs2_xattr_bucket *first; + + trace_ocfs2_add_new_xattr_bucket( + (unsigned long long)bucket_blkno(target)); + + /* The first bucket of the original extent */ + first = ocfs2_xattr_bucket_new(inode); + if (!first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(first, p_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; + if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) { + /* + * This can move first+target if the target bucket moves + * to the new extent. + */ + ret = ocfs2_add_new_xattr_cluster(inode, + xb_bh, + first, + target, + &num_clusters, + e_cpos, + &extend, + ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (extend) { + ret = ocfs2_extend_xattr_bucket(inode, + ctxt->handle, + first, + bucket_blkno(target), + num_clusters); + if (ret) + mlog_errno(ret); + } + +out: + ocfs2_xattr_bucket_free(first); + + return ret; +} + +static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int offs) +{ + int block_off = offs >> inode->i_sb->s_blocksize_bits; + + offs = offs % inode->i_sb->s_blocksize; + return bucket_block(bucket, block_off) + offs; +} + +/* + * Truncate the specified xe_off entry in xattr bucket. + * bucket is indicated by header_bh and len is the new length. + * Both the ocfs2_xattr_value_root and the entry will be updated here. + * + * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. + */ +static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int xe_off, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret, offset; + u64 value_blk; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + size_t blocksize = inode->i_sb->s_blocksize; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + + xe = &xh->xh_entries[xe_off]; + + BUG_ON(!xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + value_blk = offset / blocksize; + + /* We don't allow ocfs2_xattr_value to be stored in different block. */ + BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); + + vb.vb_bh = bucket->bu_bhs[value_blk]; + BUG_ON(!vb.vb_bh); + + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (vb.vb_bh->b_data + offset % blocksize); + + /* + * From here on out we have to dirty the bucket. The generic + * value calls only modify one of the bucket's bhs, but we need + * to send the bucket at once. So if they error, they *could* have + * modified something. We have to assume they did, and dirty + * the whole bucket. 
This leaves us in a consistent state. + */ + trace_ocfs2_xattr_bucket_value_truncate( + (unsigned long long)bucket_blkno(bucket), xe_off, len); + ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xe->xe_value_size = cpu_to_le64(len); + + ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); + +out: + return ret; +} + +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ret = ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_delete_xattr_in_bucket, para); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); + + ocfs2_init_dealloc_ctxt(&dealloc); + + trace_ocfs2_rm_xattr_cluster( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, cpos, len); + + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno, + len); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); + ocfs2_journal_dirty(handle, root_bh); + + ret = ocfs2_truncate_log_append(osb, handle, blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +/* + * check whether the xattr bucket is filled up with the same hash value. + * If we want to insert the xattr with the same hash, return -ENOSPC. + * If we want to insert a xattr with different hash value, go ahead + * and ocfs2_divide_xattr_bucket will handle this. 
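Because entries inside a bucket are kept sorted by name hash, the check only has to compare the first and the last entry: if they carry the same hash, every entry in between does too, and splitting the bucket cannot create room for another entry with that hash. A standalone sketch, with a plain array standing in for xh_entries:

#include <stdint.h>
#include <stdio.h>
#include <errno.h>

/* hashes[] is sorted, as the entries inside a bucket are. */
static int check_collision(const uint32_t *hashes, int count, uint32_t new_hash)
{
        if (new_hash != hashes[0])
                return 0;               /* different hash: a split can separate them */
        if (hashes[count - 1] == hashes[0])
                return -ENOSPC;         /* every entry already shares this hash */
        return 0;
}

int main(void)
{
        uint32_t full[]  = { 7, 7, 7, 7 };
        uint32_t mixed[] = { 7, 7, 9, 11 };

        printf("%d\n", check_collision(full, 4, 7));    /* -ENOSPC */
        printf("%d\n", check_collision(mixed, 4, 7));   /* 0: a divide still helps */
        return 0;
}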
+ */ +static int ocfs2_check_xattr_bucket_collision(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + const char *name) +{ + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) + return 0; + + if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash == + xh->xh_entries[0].xe_name_hash) { + mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " + "hash = %u\n", + (unsigned long long)bucket_blkno(bucket), + le32_to_cpu(xh->xh_entries[0].xe_name_hash)); + return -ENOSPC; + } + + return 0; +} + +/* + * Try to set the entry in the current bucket. If we fail, the caller + * will handle getting us another bucket. + */ +static int ocfs2_xattr_set_entry_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_xa_loc loc; + + trace_ocfs2_xattr_set_entry_bucket(xi->xi_name); + + ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, + xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) { + xs->here = loc.xl_entry; + goto out; + } + if (ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Ok, we need space. Let's try defragmenting the bucket. */ + ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle, + xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) { + xs->here = loc.xl_entry; + goto out; + } + if (ret != -ENOSPC) + mlog_errno(ret); + + +out: + return ret; +} + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + + trace_ocfs2_xattr_set_entry_index_block(xi->xi_name); + + ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); + if (!ret) + goto out; + if (ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } + + /* Ack, need more space. Let's try to get another bucket! */ + + /* + * We do not allow for overlapping ranges between buckets. And + * the maximum number of collisions we will allow for then is + * one bucket's worth, so check it here whether we need to + * add a new bucket for the insert. + */ + ret = ocfs2_check_xattr_bucket_collision(inode, + xs->bucket, + xi->xi_name); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_add_new_xattr_bucket(inode, + xs->xattr_bh, + xs->bucket, + ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_add_new_xattr_bucket() will have updated + * xs->bucket if it moved, but it will not have updated + * any of the other search fields. Thus, we drop it and + * re-search. Everything should be cached, so it'll be + * quick. 
+ */ + ocfs2_xattr_bucket_relse(xs->bucket); + ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, + xi->xi_name_index, + xi->xi_name, xs); + if (ret && ret != -ENODATA) + goto out; + xs->not_found = ret; + + /* Ok, we have a new bucket, let's try again */ + ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); + if (ret && (ret != -ENOSPC)) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, ref_credits; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + u16 i; + struct ocfs2_xattr_entry *xe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; + int credits = ocfs2_remove_extent_credits(osb->sb) + + ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct ocfs2_xattr_value_root *xv; + struct ocfs2_rm_xattr_bucket_para *args = + (struct ocfs2_rm_xattr_bucket_para *)para; + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, + i, &xv, NULL); + + ret = ocfs2_lock_xattr_remove_allocators(inode, xv, + args->ref_ci, + args->ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_bucket_value_truncate(inode, bucket, + i, 0, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; + } + if (ret) { + mlog_errno(ret); + break; + } + } + + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); + return ret; +} + +/* + * Whenever we modify a xattr value root in the bucket(e.g, CoW + * or change the extent record flag), we need to recalculate + * the metaecc for the whole bucket. So it is done here. + * + * Note: + * We have to give the extra credits for the caller. + */ +static int ocfs2_xattr_bucket_post_refcount(struct inode *inode, + handle_t *handle, + void *para) +{ + int ret; + struct ocfs2_xattr_bucket *bucket = + (struct ocfs2_xattr_bucket *)para; + + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); + + return 0; +} + +/* + * Special action we need if the xattr value is refcounted. + * + * 1. If the xattr is refcounted, lock the tree. + * 2. CoW the xattr if we are setting the new value and the value + * will be stored outside. + * 3. In other case, decrease_refcount will work for us, so just + * lock the refcount tree, calculate the meta and credits is OK. + * + * We have to do CoW before ocfs2_init_xattr_set_ctxt since + * currently CoW is a completed transaction, while this function + * will also lock the allocators and let us deadlock. So we will + * CoW the whole xattr value. 
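Reduced to its two branches, the policy reads as below; the helpers are hypothetical stubs, inline_size plays the role of OCFS2_XATTR_INLINE_SIZE, and the sketch leaves out all the locking and credit bookkeeping the real function performs.

#include <stddef.h>
#include <stdio.h>

static void reserve_delete_credits(void) { puts("reserve meta/credits for truncate"); }
static void cow_whole_value(void)        { puts("CoW every cluster of the value"); }

/*
 * Setting a refcounted xattr: if the value is going away or will fit
 * inline, the later truncate path deals with the refcount tree, so only
 * credits are reserved here; otherwise the whole value is CoWed up front,
 * because the allocators cannot be taken again inside the set path.
 */
static void prepare_refcounted_set(const void *new_value, size_t new_len,
                                   size_t inline_size)
{
        if (!new_value || new_len <= inline_size)
                reserve_delete_credits();
        else
                cow_whole_value();
}

int main(void)
{
        prepare_refcounted_set(NULL, 0, 64);            /* delete */
        prepare_refcounted_set("x", 4096, 64);          /* large new value */
        return 0;
}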
+ */ +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_add, + int *credits) +{ + int ret = 0; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_entry *xe; + char *base; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + int name_offset, name_len; + struct ocfs2_xattr_value_buf vb; + struct ocfs2_xattr_bucket *bucket = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_post_refcount refcount; + struct ocfs2_post_refcount *p = NULL; + struct buffer_head *ref_root_bh = NULL; + + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + vb.vb_bh = xis->inode_bh; + vb.vb_access = ocfs2_journal_access_di; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + base = bucket_block(xbs->bucket, block_off); + vb.vb_bh = xbs->bucket->bu_bhs[block_off]; + vb.vb_access = ocfs2_journal_access; + + if (ocfs2_meta_ecc(osb)) { + /*create parameters for ocfs2_post_refcount. */ + bucket = xbs->bucket; + refcount.credits = bucket->bu_blocks; + refcount.para = bucket; + refcount.func = + ocfs2_xattr_bucket_post_refcount; + p = &refcount; + } + } else { + base = xbs->base; + vb.vb_bh = xbs->xattr_bh; + vb.vb_access = ocfs2_journal_access_xb; + } + } + + if (ocfs2_xattr_is_local(xe)) + goto out; + + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, &vb.vb_xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We just need to check the 1st extent record, since we always + * CoW the whole xattr. So there shouldn't be a xattr with + * some REFCOUNT extent recs after the 1st one. + */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * If we are deleting the xattr or the new size will be stored inside, + * cool, leave it there, the xattr truncate process will remove them + * for us(it still needs the refcount tree lock and the meta, credits). + * And the worse case is that every cluster truncate will split the + * refcount tree, and make the original extent become 3. So we will need + * 2 * cluster more extent recs at most. + */ + if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) { + + ret = ocfs2_refcounted_xattr_delete_need(inode, + &(*ref_tree)->rf_ci, + ref_root_bh, vb.vb_xv, + meta_add, credits); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_refcount_cow_xattr(inode, di, &vb, + *ref_tree, ref_root_bh, 0, + le32_to_cpu(vb.vb_xv->xr_clusters), p); + if (ret) + mlog_errno(ret); + +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root. 
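The function below walks the value's extent list one mapped run at a time, skipping runs that are already refcounted; the same cpos loop reappears later in the reflink path. A self-contained sketch of the pattern, with an invented run table instead of ocfs2_xattr_get_clusters():

#include <stdint.h>
#include <stdio.h>

struct run { uint32_t p_cluster, len; unsigned refcounted; };

/* Hypothetical stand-in for ocfs2_xattr_get_clusters(): map cpos to its run.
 * The caller guarantees cpos is below the total cluster count. */
static struct run get_run(const struct run *map, uint32_t cpos)
{
        uint32_t off = 0;

        for (;; map++) {
                if (cpos < off + map->len)
                        return *map;
                off += map->len;
        }
}

static void attach_refcount(const struct run *map, uint32_t clusters)
{
        uint32_t cpos = 0;

        while (cpos < clusters) {
                struct run r = get_run(map, cpos);

                cpos += r.len;
                if (r.refcounted)       /* already shared: nothing to do */
                        continue;
                printf("refcount %u clusters at %u\n", r.len, r.p_cluster);
        }
}

int main(void)
{
        struct run map[] = { { 100, 2, 0 }, { 200, 3, 1 }, { 300, 1, 0 } };

        attach_refcount(map, 6);
        return 0;
}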
+ * The physical clusters will be added to refcount tree. + */ +static int ocfs2_xattr_value_attach_refcount(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_extent_tree *value_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *refcount) +{ + int ret = 0; + u32 clusters = le32_to_cpu(xv->xr_clusters); + u32 cpos, p_cluster, num_clusters; + struct ocfs2_extent_list *el = &xv->xr_list; + unsigned int ext_flags; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el, &ext_flags); + + cpos += num_clusters; + if ((ext_flags & OCFS2_EXT_REFCOUNTED)) + continue; + + BUG_ON(!p_cluster); + + ret = ocfs2_add_refcount_flag(inode, value_et, + ref_ci, ref_root_bh, + cpos - num_clusters, + p_cluster, num_clusters, + dealloc, refcount); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +/* + * Given a normal ocfs2_xattr_header, refcount all the entries which + * have value stored outside. + * Used for xattrs stored in inode and ocfs2_xattr_block. + */ +static int ocfs2_xattr_attach_refcount_normal(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_extent_tree et; + int i, ret = 0; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + xe = &header->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + xv = (struct ocfs2_xattr_value_root *)((void *)header + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + vb->vb_xv = xv; + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et, + ref_ci, ref_root_bh, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_xattr_inline_attach_refcount(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *) + (fe_bh->b_data + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + struct ocfs2_xattr_value_buf vb = { + .vb_bh = fe_bh, + .vb_access = ocfs2_journal_access_di, + }; + + return ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, dealloc); +} + +struct ocfs2_xattr_tree_value_refcount_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; +}; + +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh) +{ + int ret, block_off, name_offset; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + void *base; + + ret = ocfs2_xattr_bucket_get_name_value(sb, + bucket_xh(bucket), + offset, + &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + + base = bucket_block(bucket, block_off); + + *xv = (struct ocfs2_xattr_value_root *)(base + name_offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (bh) + *bh = 
bucket->bu_bhs[block_off]; +out: + return ret; +} + +/* + * For a given xattr bucket, refcount all the entries which + * have value stored outside. + */ +static int ocfs2_xattr_bucket_value_refcount(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int i, ret = 0; + struct ocfs2_extent_tree et; + struct ocfs2_xattr_tree_value_refcount_para *ref = + (struct ocfs2_xattr_tree_value_refcount_para *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + struct ocfs2_post_refcount refcount = { + .credits = bucket->bu_blocks, + .para = bucket, + .func = ocfs2_xattr_bucket_post_refcount, + }; + struct ocfs2_post_refcount *p = NULL; + + /* We only need post_refcount if we support metaecc. */ + if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb))) + p = &refcount; + + trace_ocfs2_xattr_bucket_value_refcount( + (unsigned long long)bucket_blkno(bucket), + le16_to_cpu(xh->xh_count)); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, + &vb.vb_xv, &vb.vb_bh); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_init_xattr_value_extent_tree(&et, + INODE_CACHE(inode), &vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv, + &et, ref->ref_ci, + ref->ref_root_bh, + ref->dealloc, p); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; + +} + +static int ocfs2_refcount_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_xattr_bucket_value_refcount, + para); +} + +static int ocfs2_xattr_block_attach_refcount(struct inode *inode, + struct buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, + dealloc); + } else { + struct ocfs2_xattr_tree_value_refcount_para para = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + .dealloc = dealloc, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_refcount_xattr_tree_rec, + ¶); + } + + return ret; +} + +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct buffer_head *blk_bh = NULL; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh, + ref_ci, ref_root_bh, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (!di->i_xattr_loc) + goto out; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci, + 
ref_root_bh, dealloc); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); +out: + + return ret; +} + +typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe); +/* + * Store the information we need in xattr reflink. + * old_bh and new_bh are inode bh for the old and new inode. + */ +struct ocfs2_xattr_reflink { + struct inode *old_inode; + struct inode *new_inode; + struct buffer_head *old_bh; + struct buffer_head *new_bh; + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; + should_xattr_reflinked *xattr_reflinked; +}; + +/* + * Given a xattr header and xe offset, + * return the proper xv and the corresponding bh. + * xattr in inode, block and xattr tree have different implementaions. + */ +typedef int (get_xattr_value_root)(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para); + +/* + * Calculate all the xattr value root metadata stored in this xattr header and + * credits we need if we create them from the scratch. + * We use get_xattr_value_root so that all types of xattr container can use it. + */ +static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int *metas, int *credits, + int *num_recs, + get_xattr_value_root *func, + void *para) +{ + int i, ret = 0; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + *metas += le16_to_cpu(xv->xr_list.l_tree_depth) * + le16_to_cpu(xv->xr_list.l_next_free_rec); + + *credits += ocfs2_calc_extend_credits(sb, + &def_xv.xv.xr_list, + le32_to_cpu(xv->xr_clusters)); + + /* + * If the value is a tree with depth > 1, We don't go deep + * to the extent block, so just calculate a maximum record num. + */ + if (!xv->xr_list.l_tree_depth) + *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec); + else + *num_recs += ocfs2_clusters_for_bytes(sb, + XATTR_SIZE_MAX); + } + + return ret; +} + +/* Used by xattr inode and block to return the right xv and buffer_head. */ +static int ocfs2_get_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + + *xv = (struct ocfs2_xattr_value_root *)((void *)xh + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (ret_bh) + *ret_bh = bh; + + return 0; +} + +/* + * Lock the meta_ac and caculate how much credits we need for reflink xattrs. + * It is only used for inline xattr and xattr block. 
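The worst-case estimate made below is easier to see as plain arithmetic. In the sketch, recs_per_block and expand_credits correspond to ocfs2_refcount_recs_per_rb() and OCFS2_EXPAND_REFCOUNT_TREE_CREDITS, but the values passed in main() are invented, and the rounding differs slightly between the two places this estimate appears in the file.

#include <stdio.h>

/*
 * Worst case, every batch of new records forces a refcount block split,
 * and a freshly split block is only half usable, so budget two new blocks
 * per block's worth of records.
 */
static void estimate_refcount_change(int num_recs, int recs_per_block,
                                     int expand_credits,
                                     int *meta_add, int *credits)
{
        int new_blocks = num_recs / recs_per_block * 2;

        *meta_add += new_blocks;
        *credits  += new_blocks + new_blocks * expand_credits;
}

int main(void)
{
        int meta = 0, credits = 0;

        estimate_refcount_change(300, 100, 3, &meta, &credits);
        printf("meta blocks %d, journal credits %d\n", meta, credits);
        return 0;
}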
+ */ +static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, + struct ocfs2_xattr_header *xh, + struct buffer_head *ref_root_bh, + int *credits, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, meta_add = 0, num_recs = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + *credits = 0; + + ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh, + &meta_add, credits, &num_recs, + ocfs2_get_xattr_value_root, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiple new blocks by 2. + */ + num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2; + meta_add += num_recs; + *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Given a xattr header, reflink all the xattrs in this container. + * It can be used for inode, block and bucket. + * + * NOTE: + * Before we call this function, the caller has memcpy the xattr in + * old_xh to the new_xh. + * + * If args.xattr_reflinked is set, call it to decide whether the xe should + * be reflinked or not. If not, remove it from the new xattr header. + */ +static int ocfs2_reflink_xattr_header(handle_t *handle, + struct ocfs2_xattr_reflink *args, + struct buffer_head *old_bh, + struct ocfs2_xattr_header *xh, + struct buffer_head *new_bh, + struct ocfs2_xattr_header *new_xh, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_alloc_context *meta_ac, + get_xattr_value_root *func, + void *para) +{ + int ret = 0, i, j; + struct super_block *sb = args->old_inode->i_sb; + struct buffer_head *value_bh; + struct ocfs2_xattr_entry *xe, *last; + struct ocfs2_xattr_value_root *xv, *new_xv; + struct ocfs2_extent_tree data_et; + u32 clusters, cpos, p_cluster, num_clusters; + unsigned int ext_flags = 0; + + trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr, + le16_to_cpu(xh->xh_count)); + + last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; + for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { + xe = &xh->xh_entries[i]; + + if (args->xattr_reflinked && !args->xattr_reflinked(xe)) { + xe = &new_xh->xh_entries[j]; + + le16_add_cpu(&new_xh->xh_count, -1); + if (new_xh->xh_count) { + memmove(xe, xe + 1, + (void *)last - (void *)xe); + memset(last, 0, + sizeof(struct ocfs2_xattr_entry)); + } + + /* + * We don't want j to increase in the next round since + * it is already moved ahead. + */ + j--; + continue; + } + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, old_bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * For the xattr which has l_tree_depth = 0, all the extent + * recs have already be copied to the new xh with the + * propriate OCFS2_EXT_REFCOUNTED flag we just need to + * increase the refount count int the refcount tree. 
+ * + * For the xattr which has l_tree_depth > 0, we need + * to initialize it to the empty default value root, + * and then insert the extents one by one. + */ + if (xv->xr_list.l_tree_depth) { + memcpy(new_xv, &def_xv, sizeof(def_xv)); + vb->vb_xv = new_xv; + vb->vb_bh = value_bh; + ocfs2_init_xattr_value_extent_tree(&data_et, + INODE_CACHE(args->new_inode), vb); + } + + clusters = le32_to_cpu(xv->xr_clusters); + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(args->old_inode, + cpos, + &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!p_cluster); + + if (xv->xr_list.l_tree_depth) { + ret = ocfs2_insert_extent(handle, + &data_et, cpos, + ocfs2_clusters_to_blocks( + args->old_inode->i_sb, + p_cluster), + num_clusters, ext_flags, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_increase_refcount(handle, args->ref_ci, + args->ref_root_bh, + p_cluster, num_clusters, + meta_ac, args->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + } + } + +out: + return ret; +} + +static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data; + int inline_size = le16_to_cpu(di->i_xattr_inline_size); + int header_off = osb->sb->s_blocksize - inline_size; + struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *) + (args->old_bh->b_data + header_off); + struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *) + (args->new_bh->b_data + header_off); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_inode_info *new_oi; + struct ocfs2_dinode *new_di; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = args->new_bh, + .vb_access = ocfs2_journal_access_di, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode), + args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(args->new_bh->b_data + header_off, + args->old_bh->b_data + header_off, inline_size); + + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + new_di->i_xattr_inline_size = cpu_to_le16(inline_size); + + ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh, + args->new_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_oi = OCFS2_I(args->new_inode); + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_create_empty_xattr_block(struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **ret_bh, + int indexed) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt; + + memset(&ctxt, 0, sizeof(ctxt)); + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, 
&ctxt.meta_ac); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto out; + } + + trace_ocfs2_create_empty_xattr_block( + (unsigned long long)fe_bh->b_blocknr, indexed); + ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed, + ret_bh); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, ctxt.handle); +out: + ocfs2_free_alloc_context(ctxt.meta_ac); + return ret; +} + +static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode); + struct ocfs2_dinode *new_di; + struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb); + int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_block *new_xb = + (struct ocfs2_xattr_block *)new_blk_bh->b_data; + struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = new_blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* One more credits in case we need to add xattr flags in new inode. */ + handle = ocfs2_start_trans(osb, credits + 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + ret = ocfs2_journal_access_di(handle, + INODE_CACHE(args->new_inode), + args->new_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode), + new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off, + osb->sb->s_blocksize - header_off); + + ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh, + new_blk_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_journal_dirty(handle, new_blk_bh); + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +struct ocfs2_reflink_xattr_tree_args { + struct ocfs2_xattr_reflink *reflink; + struct buffer_head *old_blk_bh; + struct buffer_head *new_blk_bh; + struct ocfs2_xattr_bucket *old_bucket; + struct ocfs2_xattr_bucket *new_bucket; +}; + +/* + * NOTE: + * We have to handle the case that both old bucket and new bucket + * will call this function to get the right ret_bh. + * So The caller must give us the right bh. 
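In practice that means the callback distinguishes source from destination purely by the buffer head it is handed, as in this hypothetical fragment:

struct bucket { void *first_block; /* first bh of the bucket */ };

/* Pick the bucket whose first block matches the bh the caller passed in. */
struct bucket *bucket_for_bh(void *bh, struct bucket *old_bucket,
                             struct bucket *new_bucket)
{
        return bh == old_bucket->first_block ? old_bucket : new_bucket;
}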
+ */ +static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_reflink_xattr_tree_args *args = + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_xattr_bucket *bucket; + + if (bh == args->old_bucket->bu_bhs[0]) + bucket = args->old_bucket; + else + bucket = args->new_bucket; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +struct ocfs2_value_tree_metas { + int num_metas; + int credits; + int num_recs; +}; + +static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_bucket *bucket = + (struct ocfs2_xattr_bucket *)para; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +static int ocfs2_calc_value_tree_metas(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + struct ocfs2_value_tree_metas *metas = + (struct ocfs2_value_tree_metas *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + + /* Add the credits for this bucket first. */ + metas->credits += bucket->bu_blocks; + return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0], + xh, &metas->num_metas, + &metas->credits, &metas->num_recs, + ocfs2_value_tree_metas_in_bucket, + bucket); +} + +/* + * Given a xattr extent rec starting from blkno and having len clusters, + * iterate all the buckets calculate how much metadata we need for reflinking + * all the ocfs2_xattr_value_root and lock the allocators accordingly. + */ +static int ocfs2_lock_reflink_xattr_rec_allocators( + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *xt_et, + u64 blkno, u32 len, int *credits, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac) +{ + int ret, num_free_extents; + struct ocfs2_value_tree_metas metas; + struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb); + struct ocfs2_refcount_block *rb; + + memset(&metas, 0, sizeof(metas)); + + ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len, + ocfs2_calc_value_tree_metas, &metas); + if (ret) { + mlog_errno(ret); + goto out; + } + + *credits = metas.credits; + + /* + * Calculate we need for refcount tree change. + * + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiple new blocks by 2. + * In the end, we have to add credits for modifying the already + * existed refcount block. + */ + rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data; + metas.num_recs = + (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) / + ocfs2_refcount_recs_per_rb(osb->sb) * 2; + metas.num_metas += metas.num_recs; + *credits += metas.num_recs + + metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + /* count in the xattr tree change. 
*/ + num_free_extents = ocfs2_num_free_extents(osb, xt_et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < len) + metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el); + + *credits += ocfs2_calc_extend_credits(osb->sb, + xt_et->et_root_el, len); + + if (metas.num_metas) { + ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (len) { + ret = ocfs2_reserve_clusters(osb, len, data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_reflink_xattr_bucket(handle_t *handle, + u64 blkno, u64 new_blkno, u32 clusters, + u32 *cpos, int num_buckets, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_reflink_xattr_tree_args *args) +{ + int i, j, ret = 0; + struct super_block *sb = args->reflink->old_inode->i_sb; + int bpb = args->old_bucket->bu_blocks; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + + for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) { + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + for (j = 0; j < bpb; j++) + memcpy(bucket_block(args->new_bucket, j), + bucket_block(args->old_bucket, j), + sb->s_blocksize); + + /* + * Record the start cpos so that we can use it to initialize + * our xattr tree we also set the xh_num_bucket for the new + * bucket. + */ + if (i == 0) { + *cpos = le32_to_cpu(bucket_xh(args->new_bucket)-> + xh_entries[0].xe_name_hash); + bucket_xh(args->new_bucket)->xh_num_buckets = + cpu_to_le16(num_buckets); + } + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + + ret = ocfs2_reflink_xattr_header(handle, args->reflink, + args->old_bucket->bu_bhs[0], + bucket_xh(args->old_bucket), + args->new_bucket->bu_bhs[0], + bucket_xh(args->new_bucket), + &vb, meta_ac, + ocfs2_get_reflink_xattr_value_root, + args); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * Re-access and dirty the bucket to calculate metaecc. + * Because we may extend the transaction in reflink_xattr_header + * which will let the already accessed block gone. 
+ */ + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + } + + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + return ret; +} + +static int ocfs2_reflink_xattr_buckets(handle_t *handle, + struct inode *inode, + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + u64 blkno, u32 cpos, u32 len) +{ + int ret, first_inserted = 0; + u32 p_cluster, num_clusters, reflink_cpos = 0; + u64 new_blkno; + unsigned int num_buckets, reflink_buckets; + unsigned int bpc = + ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets); + ocfs2_xattr_bucket_relse(args->old_bucket); + + while (len && num_buckets) { + ret = ocfs2_claim_clusters(handle, data_ac, + 1, &p_cluster, &num_clusters); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + reflink_buckets = min(num_buckets, bpc * num_clusters); + + ret = ocfs2_reflink_xattr_bucket(handle, blkno, + new_blkno, num_clusters, + &reflink_cpos, reflink_buckets, + meta_ac, data_ac, args); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * For the 1st allocated cluster, we make it use the same cpos + * so that the xattr tree looks the same as the original one + * in the most case. + */ + if (!first_inserted) { + reflink_cpos = cpos; + first_inserted = 1; + } + ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno, + num_clusters, 0, meta_ac); + if (ret) + mlog_errno(ret); + + trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno, + num_clusters, reflink_cpos); + + len -= num_clusters; + blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + num_buckets -= reflink_buckets; + } +out: + return ret; +} + +/* + * Create the same xattr extent record in the new inode's xattr tree. 
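ocfs2_reflink_xattr_rec below is one of several per-extent-record callbacks in this file (ocfs2_rm_xattr_cluster and ocfs2_refcount_xattr_tree_rec are the others); ocfs2_iterate_xattr_index_block drives them, handing each record's block number, cpos and length plus an opaque para pointer to the callback. A stripped-down sketch of that contract, with an in-memory record table instead of the on-disk tree:

#include <stdint.h>
#include <stdio.h>

struct xattr_rec { uint64_t blkno; uint32_t cpos, len; };

typedef int (xattr_rec_func)(uint64_t blkno, uint32_t cpos, uint32_t len,
                             void *para);

/* Hypothetical iterator: call func once per extent record, stop on error. */
static int iterate_recs(const struct xattr_rec *recs, int nr,
                        xattr_rec_func *func, void *para)
{
        int i, ret = 0;

        for (i = 0; i < nr; i++) {
                ret = func(recs[i].blkno, recs[i].cpos, recs[i].len, para);
                if (ret)
                        break;
        }
        return ret;
}

static int print_rec(uint64_t blkno, uint32_t cpos, uint32_t len, void *para)
{
        printf("%s: rec at cpos %u, %u clusters, block %llu\n",
               (const char *)para, cpos, len, (unsigned long long)blkno);
        return 0;
}

int main(void)
{
        struct xattr_rec recs[] = { { 4096, 0, 2 }, { 8192, 5, 1 } };

        return iterate_recs(recs, 2, print_rec, "reflink");
}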
+ */ +static int ocfs2_reflink_xattr_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para) +{ + int ret, credits = 0; + handle_t *handle; + struct ocfs2_reflink_xattr_tree_args *args = + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_extent_tree et; + + trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len); + + ocfs2_init_xattr_tree_extent_tree(&et, + INODE_CACHE(args->reflink->new_inode), + args->new_blk_bh); + + ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno, + len, &credits, + &meta_ac, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et, + meta_ac, data_ac, + blkno, cpos, len); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + return ret; +} + +/* + * Create reflinked xattr buckets. + * We will add bucket one by one, and refcount all the xattrs in the bucket + * if they are stored outside. + */ +static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret; + struct ocfs2_reflink_xattr_tree_args para; + + memset(¶, 0, sizeof(para)); + para.reflink = args; + para.old_blk_bh = blk_bh; + para.new_blk_bh = new_blk_bh; + + para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode); + if (!para.old_bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode); + if (!para.new_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh, + ocfs2_reflink_xattr_rec, + ¶); + if (ret) + mlog_errno(ret); + +out: + ocfs2_xattr_bucket_free(para.old_bucket); + ocfs2_xattr_bucket_free(para.new_bucket); + return ret; +} + +static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh) +{ + int ret, indexed = 0; + struct buffer_head *new_blk_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) + indexed = 1; + + ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh, + &new_blk_bh, indexed); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!indexed) + ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); + else + ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); + if (ret) + mlog_errno(ret); + +out: + brelse(new_blk_bh); + return ret; +} + +static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe) +{ + int type = ocfs2_xattr_get_type(xe); + + return type != OCFS2_XATTR_INDEX_SECURITY && + type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS && + type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; +} + +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool preserve_security) +{ + int ret; + struct ocfs2_xattr_reflink args; + struct ocfs2_inode_info *oi = OCFS2_I(old_inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct 
ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh = NULL; + + ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_dealloc_ctxt(&dealloc); + + args.old_inode = old_inode; + args.new_inode = new_inode; + args.old_bh = old_bh; + args.new_bh = new_bh; + args.ref_ci = &ref_tree->rf_ci; + args.ref_root_bh = ref_root_bh; + args.dealloc = &dealloc; + if (preserve_security) + args.xattr_reflinked = NULL; + else + args.xattr_reflinked = ocfs2_reflink_xattr_no_security; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_reflink_xattr_inline(&args); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + } + + if (!di->i_xattr_loc) + goto out_unlock; + + ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_reflink_xattr_in_block(&args, blk_bh); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); + +out_unlock: + ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb), + ref_tree, 1); + brelse(ref_root_bh); + + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1); + ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc); + } + +out: + return ret; +} + +/* + * Initialize security and acl for a already created inode. + * Used for reflink a non-preserve-security file. + * + * It uses common api like ocfs2_xattr_set, so the caller + * must not hold any lock expect i_mutex. + */ +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode, + const struct qstr *qstr) +{ + int ret = 0; + struct buffer_head *dir_bh = NULL; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + + ret = ocfs2_init_security_get(inode, dir, qstr, &si); + if (!ret) { + ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + si.name, si.value, si.value_len, + XATTR_CREATE); + if (ret) { + mlog_errno(ret); + goto leave; + } + } else if (ret != -EOPNOTSUPP) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_inode_lock(dir, &dir_bh, 0); + if (ret) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); + + ocfs2_inode_unlock(dir, 0); + brelse(dir_bh); +leave: + return ret; +} +/* + * 'security' attributes support + */ +static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +int ocfs2_init_security_get(struct inode *inode, + 
struct inode *dir, + const struct qstr *qstr, + struct ocfs2_security_xattr_info *si) +{ + /* check whether ocfs2 support feature xattr */ + if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) + return -EOPNOTSUPP; + return security_inode_init_security(inode, dir, qstr, &si->name, + &si->value, &si->value_len); +} + +int ocfs2_init_security_set(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_security_xattr_info *si, + struct ocfs2_alloc_context *xattr_ac, + struct ocfs2_alloc_context *data_ac) +{ + return ocfs2_xattr_set_handle(handle, inode, di_bh, + OCFS2_XATTR_INDEX_SECURITY, + si->name, si->value, si->value_len, 0, + xattr_ac, data_ac); +} + +const struct xattr_handler ocfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ocfs2_xattr_security_list, + .get = ocfs2_xattr_security_get, + .set = ocfs2_xattr_security_set, +}; + +/* + * 'trusted' attributes support + */ +static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, value, size, flags); +} + +const struct xattr_handler ocfs2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ocfs2_xattr_trusted_list, + .get = ocfs2_xattr_trusted_get, + .set = ocfs2_xattr_trusted_set, +}; + +/* + * 'user' attributes support + */ +static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name, + buffer, size); +} + +static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER, + name, value, size, flags); +} + +const 
struct xattr_handler ocfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ocfs2_xattr_user_list, + .get = ocfs2_xattr_user_get, + .set = ocfs2_xattr_user_set, +}; diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h new file mode 100644 index 00000000..d63cfb72 --- /dev/null +++ b/fs/ocfs2/xattr.h @@ -0,0 +1,100 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.h + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_XATTR_H +#define OCFS2_XATTR_H + +#include +#include + +enum ocfs2_xattr_type { + OCFS2_XATTR_INDEX_USER = 1, + OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + OCFS2_XATTR_INDEX_TRUSTED, + OCFS2_XATTR_INDEX_SECURITY, + OCFS2_XATTR_MAX +}; + +struct ocfs2_security_xattr_info { + int enable; + char *name; + void *value; + size_t value_len; +}; + +extern const struct xattr_handler ocfs2_xattr_user_handler; +extern const struct xattr_handler ocfs2_xattr_trusted_handler; +extern const struct xattr_handler ocfs2_xattr_security_handler; +extern const struct xattr_handler ocfs2_xattr_acl_access_handler; +extern const struct xattr_handler ocfs2_xattr_acl_default_handler; +extern const struct xattr_handler *ocfs2_xattr_handlers[]; + +ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, + const char *, void *, size_t); +int ocfs2_xattr_set(struct inode *, int, const char *, const void *, + size_t, int); +int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *, + int, const char *, const void *, size_t, int, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di); +int ocfs2_xattr_remove(struct inode *, struct buffer_head *); +int ocfs2_init_security_get(struct inode *, struct inode *, + const struct qstr *, + struct ocfs2_security_xattr_info *); +int ocfs2_init_security_set(handle_t *, struct inode *, + struct buffer_head *, + struct ocfs2_security_xattr_info *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); +int ocfs2_calc_security_init(struct inode *, + struct ocfs2_security_xattr_info *, + int *, int *, struct ocfs2_alloc_context **); +int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, + int, struct ocfs2_security_xattr_info *, + int *, int *, int *); + +/* + * xattrs can live inside an inode, as part of an external xattr block, + * or inside an xattr bucket, which is the leaf of a tree rooted in an + * xattr block. Some of the xattr calls, especially the value setting + * functions, want to treat each of these locations as equal. Let's wrap + * them in a structure that we can pass around instead of raw buffer_heads. 
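+ * vb_bh is whichever buffer actually holds the value root (inode, xattr
+ * block or bucket leaf), and vb_access is the matching
+ * ocfs2_journal_access_* variant used to dirty that buffer under a handle.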
+ */ +struct ocfs2_xattr_value_buf { + struct buffer_head *vb_bh; + ocfs2_journal_access_func vb_access; + struct ocfs2_xattr_value_root *vb_xv; +}; + +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool preserve_security); +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode, + const struct qstr *qstr); +#endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig new file mode 100644 index 00000000..b1b9a0ab --- /dev/null +++ b/fs/omfs/Kconfig @@ -0,0 +1,13 @@ +config OMFS_FS + tristate "SonicBlue Optimized MPEG File System support" + depends on BLOCK + select CRC_ITU_T + help + This is the proprietary file system used by the Rio Karma music + player and ReplayTV DVR. Despite the name, this filesystem is not + more efficient than a standard FS for MPEG files, in fact likely + the opposite is true. Say Y if you have either of these devices + and wish to mount its disk. + + To compile this file system support as a module, choose M here: the + module will be called omfs. If unsure, say N. diff --git a/fs/omfs/Makefile b/fs/omfs/Makefile new file mode 100644 index 00000000..8b82b63f --- /dev/null +++ b/fs/omfs/Makefile @@ -0,0 +1,4 @@ + +obj-$(CONFIG_OMFS_FS) += omfs.o + +omfs-y := bitmap.o dir.o file.o inode.o diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c new file mode 100644 index 00000000..08223458 --- /dev/null +++ b/fs/omfs/bitmap.c @@ -0,0 +1,193 @@ +#include +#include +#include +#include +#include "omfs.h" + +unsigned long omfs_count_free(struct super_block *sb) +{ + unsigned int i; + unsigned long sum = 0; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int nbits = sb->s_blocksize * 8; + + for (i = 0; i < sbi->s_imap_size; i++) + sum += nbits - bitmap_weight(sbi->s_imap[i], nbits); + + return sum; +} + +/* + * Counts the run of zero bits starting at bit up to max. + * It handles the case where a run might spill over a buffer. + * Called with bitmap lock. + */ +static int count_run(unsigned long **addr, int nbits, + int addrlen, int bit, int max) +{ + int count = 0; + int x; + + for (; addrlen > 0; addrlen--, addr++) { + x = find_next_bit(*addr, nbits, bit); + count += x - bit; + + if (x < nbits || count > max) + return min(count, max); + + bit = 0; + } + return min(count, max); +} + +/* + * Sets or clears the run of count bits starting with bit. + * Called with bitmap lock. + */ +static int set_run(struct super_block *sb, int map, + int nbits, int bit, int count, int set) +{ + int i; + int err; + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + + err = -ENOMEM; + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + for (i = 0; i < count; i++, bit++) { + if (bit >= nbits) { + bit = 0; + map++; + + mark_buffer_dirty(bh); + brelse(bh); + bh = sb_bread(sb, + clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + } + if (set) { + set_bit(bit, sbi->s_imap[map]); + set_bit(bit, (unsigned long *)bh->b_data); + } else { + clear_bit(bit, sbi->s_imap[map]); + clear_bit(bit, (unsigned long *)bh->b_data); + } + } + mark_buffer_dirty(bh); + brelse(bh); + err = 0; +out: + return err; +} + +/* + * Tries to allocate exactly one block. Returns true if successful. 
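+ * The block number is split into a bitmap-buffer index and a bit offset
+ * within that buffer (bits_per_entry = 8 * s_blocksize); for example, with
+ * a 2048-byte block size, block 20000 falls in buffer 20000 / 16384 = 1 at
+ * bit 20000 % 16384 = 3616.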
+ */ +int omfs_allocate_block(struct super_block *sb, u64 block) +{ + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + unsigned int map, bit; + int ret = 0; + u64 tmp; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + mutex_lock(&sbi->s_bitmap_lock); + if (map >= sbi->s_imap_size || test_and_set_bit(bit, sbi->s_imap[map])) + goto out; + + if (sbi->s_bitmap_ino > 0) { + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + set_bit(bit, (unsigned long *)bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + } + ret = 1; +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + + +/* + * Tries to allocate a set of blocks. The request size depends on the + * type: for inodes, we must allocate sbi->s_mirrors blocks, and for file + * blocks, we try to allocate sbi->s_clustersize, but can always get away + * with just one block. + */ +int omfs_allocate_range(struct super_block *sb, + int min_request, + int max_request, + u64 *return_block, + int *return_size) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + int ret = 0; + int i, run, bit; + + mutex_lock(&sbi->s_bitmap_lock); + for (i = 0; i < sbi->s_imap_size; i++) { + bit = 0; + while (bit < bits_per_entry) { + bit = find_next_zero_bit(sbi->s_imap[i], bits_per_entry, + bit); + + if (bit == bits_per_entry) + break; + + run = count_run(&sbi->s_imap[i], bits_per_entry, + sbi->s_imap_size-i, bit, max_request); + + if (run >= min_request) + goto found; + bit += run; + } + } + ret = -ENOSPC; + goto out; + +found: + *return_block = i * bits_per_entry + bit; + *return_size = run; + ret = set_run(sb, i, bits_per_entry, bit, run, 1); + +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + +/* + * Clears count bits starting at a given block. + */ +int omfs_clear_range(struct super_block *sb, u64 block, int count) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + u64 tmp; + unsigned int map, bit; + int ret; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + if (map >= sbi->s_imap_size) + return 0; + + mutex_lock(&sbi->s_bitmap_lock); + ret = set_run(sb, map, bits_per_entry, bit, count, 0); + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c new file mode 100644 index 00000000..3b8d3979 --- /dev/null +++ b/fs/omfs/dir.c @@ -0,0 +1,475 @@ +/* + * OMFS (as used by RIO Karma) directory operations. + * Copyright (C) 2005 Bob Copeland + * Released under GPL v2. + */ + +#include +#include +#include +#include "omfs.h" + +static int omfs_hash(const char *name, int namelen, int mod) +{ + int i, hash = 0; + for (i = 0; i < namelen; i++) + hash ^= tolower(name[i]) << (i % 24); + return hash % mod; +} + +/* + * Finds the bucket for a given name and reads the containing block; + * *ofs is set to the offset of the first list entry. 
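+ * A directory body is an array of (i_size - OMFS_DIR_START) / 8 hash
+ * buckets, each holding a big-endian 64-bit block number that heads a
+ * sibling chain, so a bucket's slot lives at OMFS_DIR_START + bucket * 8.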
+ */ +static struct buffer_head *omfs_get_bucket(struct inode *dir, + const char *name, int namelen, int *ofs) +{ + int nbuckets = (dir->i_size - OMFS_DIR_START)/8; + int bucket = omfs_hash(name, namelen, nbuckets); + + *ofs = OMFS_DIR_START + bucket * 8; + return omfs_bread(dir->i_sb, dir->i_ino); +} + +static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block, + const char *name, int namelen, + u64 *prev_block) +{ + struct buffer_head *bh; + struct omfs_inode *oi; + int err = -ENOENT; + *prev_block = ~0; + + while (block != ~0) { + bh = omfs_bread(dir->i_sb, block); + if (!bh) { + err = -EIO; + goto err; + } + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, block)) { + brelse(bh); + goto err; + } + + if (strncmp(oi->i_name, name, namelen) == 0) + return bh; + + *prev_block = block; + block = be64_to_cpu(oi->i_sibling); + brelse(bh); + } +err: + return ERR_PTR(err); +} + +static struct buffer_head *omfs_find_entry(struct inode *dir, + const char *name, int namelen) +{ + struct buffer_head *bh; + int ofs; + u64 block, dummy; + + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + return ERR_PTR(-EIO); + + block = be64_to_cpu(*((__be64 *) &bh->b_data[ofs])); + brelse(bh); + + return omfs_scan_list(dir, block, name, namelen, &dummy); +} + +int omfs_make_empty(struct inode *inode, struct super_block *sb) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct buffer_head *bh; + struct omfs_inode *oi; + + bh = omfs_bread(sb, inode->i_ino); + if (!bh) + return -ENOMEM; + + memset(bh->b_data, 0, sizeof(struct omfs_inode)); + + if (inode->i_mode & S_IFDIR) { + memset(&bh->b_data[OMFS_DIR_START], 0xff, + sbi->s_sys_blocksize - OMFS_DIR_START); + } else + omfs_make_empty_table(bh, OMFS_EXTENT_START); + + oi = (struct omfs_inode *) bh->b_data; + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + oi->i_sibling = ~cpu_to_be64(0ULL); + + mark_buffer_dirty(bh); + brelse(bh); + return 0; +} + +static int omfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct omfs_inode *oi; + struct buffer_head *bh; + u64 block; + __be64 *entry; + int ofs; + + /* just prepend to head of queue in proper bucket */ + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + *entry = cpu_to_be64(inode->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + /* now set the sibling and parent pointers on the new inode */ + bh = omfs_bread(dir->i_sb, inode->i_ino); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + memcpy(oi->i_name, name, namelen); + memset(oi->i_name + namelen, 0, OMFS_NAMELEN - namelen); + oi->i_sibling = cpu_to_be64(block); + oi->i_parent = cpu_to_be64(dir->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + dir->i_ctime = CURRENT_TIME_SEC; + + /* mark affected inodes dirty to rebuild checksums */ + mark_inode_dirty(dir); + mark_inode_dirty(inode); + return 0; +out: + return -ENOMEM; +} + +static int omfs_delete_entry(struct dentry *dentry) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct inode *dirty; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct omfs_inode *oi; + struct buffer_head *bh, *bh2; + __be64 *entry, next; + u64 block, prev; + int ofs; + int err = -ENOMEM; + + /* delete the proper node in the bucket's linked list */ + bh = omfs_get_bucket(dir, 
name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + + bh2 = omfs_scan_list(dir, block, name, namelen, &prev); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); + goto out_free_bh; + } + + oi = (struct omfs_inode *) bh2->b_data; + next = oi->i_sibling; + brelse(bh2); + + if (prev != ~0) { + /* found in middle of list, get list ptr */ + brelse(bh); + bh = omfs_bread(dir->i_sb, prev); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + entry = &oi->i_sibling; + } + + *entry = next; + mark_buffer_dirty(bh); + + if (prev != ~0) { + dirty = omfs_iget(dir->i_sb, prev); + if (!IS_ERR(dirty)) { + mark_inode_dirty(dirty); + iput(dirty); + } + } + + err = 0; +out_free_bh: + brelse(bh); +out: + return err; +} + +static int omfs_dir_is_empty(struct inode *inode) +{ + int nbuckets = (inode->i_size - OMFS_DIR_START) / 8; + struct buffer_head *bh; + u64 *ptr; + int i; + + bh = omfs_bread(inode->i_sb, inode->i_ino); + + if (!bh) + return 0; + + ptr = (u64 *) &bh->b_data[OMFS_DIR_START]; + + for (i = 0; i < nbuckets; i++, ptr++) + if (*ptr != ~0) + break; + + brelse(bh); + return *ptr != ~0; +} + +static int omfs_remove(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int ret; + + + if (S_ISDIR(inode->i_mode) && + !omfs_dir_is_empty(inode)) + return -ENOTEMPTY; + + ret = omfs_delete_entry(dentry); + if (ret) + return ret; + + clear_nlink(inode); + mark_inode_dirty(inode); + mark_inode_dirty(dir); + return 0; +} + +static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) +{ + int err; + struct inode *inode = omfs_new_inode(dir, mode); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + err = omfs_make_empty(inode, dir->i_sb); + if (err) + goto out_free_inode; + + err = omfs_add_link(dentry, inode); + if (err) + goto out_free_inode; + + d_instantiate(dentry, inode); + return 0; + +out_free_inode: + iput(inode); + return err; +} + +static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return omfs_add_node(dir, dentry, mode | S_IFDIR); +} + +static int omfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return omfs_add_node(dir, dentry, mode | S_IFREG); +} + +static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct buffer_head *bh; + struct inode *inode = NULL; + + if (dentry->d_name.len > OMFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = omfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); + if (!IS_ERR(bh)) { + struct omfs_inode *oi = (struct omfs_inode *)bh->b_data; + ino_t ino = be64_to_cpu(oi->i_head.h_self); + brelse(bh); + inode = omfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +/* sanity check block's self pointer */ +int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock) +{ + int is_bad; + u64 ino = be64_to_cpu(header->h_self); + is_bad = ((ino != fsblock) || (ino < sbi->s_root_ino) || + (ino > sbi->s_num_blocks)); + + if (is_bad) + printk(KERN_WARNING "omfs: bad hash chain detected\n"); + + return is_bad; +} + +static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, + u64 fsblock, int hindex) +{ + struct inode *dir = filp->f_dentry->d_inode; + struct buffer_head *bh; + struct omfs_inode *oi; + u64 self; + int res = 0; + unsigned char d_type; + + /* follow chain in this bucket */ + while (fsblock != ~0) { + bh 
= omfs_bread(dir->i_sb, fsblock); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { + brelse(bh); + goto out; + } + + self = fsblock; + fsblock = be64_to_cpu(oi->i_sibling); + + /* skip visited nodes */ + if (hindex) { + hindex--; + brelse(bh); + continue; + } + + d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; + + res = filldir(dirent, oi->i_name, strnlen(oi->i_name, + OMFS_NAMELEN), filp->f_pos, self, d_type); + brelse(bh); + if (res < 0) + break; + filp->f_pos++; + } +out: + return res; +} + +static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + int err; + + if (new_inode) { + /* overwriting existing file/dir */ + err = omfs_remove(new_dir, new_dentry); + if (err) + goto out; + } + + /* since omfs locates files by name, we need to unlink _before_ + * adding the new link or we won't find the old one */ + err = omfs_delete_entry(old_dentry); + if (err) + goto out; + + mark_inode_dirty(old_dir); + err = omfs_add_link(new_dentry, old_inode); + if (err) + goto out; + + old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); +out: + return err; +} + +static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *dir = filp->f_dentry->d_inode; + struct buffer_head *bh; + loff_t offset, res; + unsigned int hchain, hindex; + int nbuckets; + u64 fsblock; + int ret = -EINVAL; + + if (filp->f_pos >> 32) + goto success; + + switch ((unsigned long) filp->f_pos) { + case 0: + if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) + goto success; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, 1, + parent_ino(filp->f_dentry), DT_DIR) < 0) + goto success; + filp->f_pos = 1 << 20; + /* fall through */ + } + + nbuckets = (dir->i_size - OMFS_DIR_START) / 8; + + /* high 12 bits store bucket + 1 and low 20 bits store hash index */ + hchain = (filp->f_pos >> 20) - 1; + hindex = filp->f_pos & 0xfffff; + + bh = omfs_bread(dir->i_sb, dir->i_ino); + if (!bh) + goto out; + + offset = OMFS_DIR_START + hchain * 8; + + for (; hchain < nbuckets; hchain++, offset += 8) { + fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset])); + + res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex); + hindex = 0; + if (res < 0) + break; + + filp->f_pos = (hchain+2) << 20; + } + brelse(bh); +success: + ret = 0; +out: + return ret; +} + +const struct inode_operations omfs_dir_inops = { + .lookup = omfs_lookup, + .mkdir = omfs_mkdir, + .rename = omfs_rename, + .create = omfs_create, + .unlink = omfs_remove, + .rmdir = omfs_remove, +}; + +const struct file_operations omfs_dir_operations = { + .read = generic_read_dir, + .readdir = omfs_readdir, + .llseek = generic_file_llseek, +}; diff --git a/fs/omfs/file.c b/fs/omfs/file.c new file mode 100644 index 00000000..2c6d9525 --- /dev/null +++ b/fs/omfs/file.c @@ -0,0 +1,378 @@ +/* + * OMFS (as used by RIO Karma) file operations. + * Copyright (C) 2005 Bob Copeland + * Released under GPL v2. 
+ */ + +#include +#include +#include +#include +#include "omfs.h" + +static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) +{ + return (sbi->s_sys_blocksize - offset - + sizeof(struct omfs_extent)) / + sizeof(struct omfs_extent_entry) + 1; +} + +void omfs_make_empty_table(struct buffer_head *bh, int offset) +{ + struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset]; + + oe->e_next = ~cpu_to_be64(0ULL); + oe->e_extent_count = cpu_to_be32(1), + oe->e_fill = cpu_to_be32(0x22), + oe->e_entry.e_cluster = ~cpu_to_be64(0ULL); + oe->e_entry.e_blocks = ~cpu_to_be64(0ULL); +} + +int omfs_shrink_inode(struct inode *inode) +{ + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct buffer_head *bh; + u64 next, last; + u32 extent_count; + u32 max_extents; + int ret; + + /* traverse extent table, freeing each entry that is greater + * than inode->i_size; + */ + next = inode->i_ino; + + /* only support truncate -> 0 for now */ + ret = -EIO; + if (inode->i_size != 0) + goto out; + + bh = omfs_bread(inode->i_sb, next); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; + + extent_count = be32_to_cpu(oe->e_extent_count); + + if (extent_count > max_extents) + goto out_brelse; + + last = next; + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + /* ignore last entry as it is the terminator */ + for (; extent_count > 1; extent_count--) { + u64 start, count; + start = be64_to_cpu(entry->e_cluster); + count = be64_to_cpu(entry->e_blocks); + + omfs_clear_range(inode->i_sb, start, (int) count); + entry++; + } + omfs_make_empty_table(bh, (char *) oe - bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + + if (last != inode->i_ino) + omfs_clear_range(inode->i_sb, last, sbi->s_mirrors); + + if (next == ~0) + break; + + bh = omfs_bread(inode->i_sb, next); + if (!bh) + goto out; + oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); + } + ret = 0; +out: + return ret; +out_brelse: + brelse(bh); + return ret; +} + +static void omfs_truncate(struct inode *inode) +{ + omfs_shrink_inode(inode); + mark_inode_dirty(inode); +} + +/* + * Add new blocks to the current extent, or create new entries/continuations + * as necessary. + */ +static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, + u64 *ret_block) +{ + struct omfs_extent_entry *terminator; + struct omfs_extent_entry *entry = &oe->e_entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + u32 extent_count = be32_to_cpu(oe->e_extent_count); + u64 new_block = 0; + u32 max_count; + int new_count; + int ret = 0; + + /* reached the end of the extent table with no blocks mapped. + * there are three possibilities for adding: grow last extent, + * add a new extent to the current extent table, and add a + * continuation inode. 
in last two cases need an allocator for + * sbi->s_cluster_size + */ + + /* TODO: handle holes */ + + /* should always have a terminator */ + if (extent_count < 1) + return -EIO; + + /* trivially grow current extent, if next block is not taken */ + terminator = entry + extent_count - 1; + if (extent_count > 1) { + entry = terminator-1; + new_block = be64_to_cpu(entry->e_cluster) + + be64_to_cpu(entry->e_blocks); + + if (omfs_allocate_block(inode->i_sb, new_block)) { + entry->e_blocks = + cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1); + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + 1)); + goto out; + } + } + max_count = omfs_max_extents(sbi, OMFS_EXTENT_START); + + /* TODO: add a continuation block here */ + if (be32_to_cpu(oe->e_extent_count) > max_count-1) + return -EIO; + + /* try to allocate a new cluster */ + ret = omfs_allocate_range(inode->i_sb, 1, sbi->s_clustersize, + &new_block, &new_count); + if (ret) + goto out_fail; + + /* copy terminator down an entry */ + entry = terminator; + terminator++; + memcpy(terminator, entry, sizeof(struct omfs_extent_entry)); + + entry->e_cluster = cpu_to_be64(new_block); + entry->e_blocks = cpu_to_be64((u64) new_count); + + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + (u64) new_count)); + + /* write in new entry */ + oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count)); + +out: + *ret_block = new_block; +out_fail: + return ret; +} + +/* + * Scans across the directory table for a given file block number. + * If block not found, return 0. + */ +static sector_t find_block(struct inode *inode, struct omfs_extent_entry *ent, + sector_t block, int count, int *left) +{ + /* count > 1 because of terminator */ + sector_t searched = 0; + for (; count > 1; count--) { + int numblocks = clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_blocks)); + + if (block >= searched && + block < searched + numblocks) { + /* + * found it at cluster + (block - searched) + * numblocks - (block - searched) is remainder + */ + *left = numblocks - (block - searched); + return clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_cluster)) + + block - searched; + } + searched += numblocks; + ent++; + } + return 0; +} + +static int omfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct buffer_head *bh; + sector_t next, offset; + int ret; + u64 uninitialized_var(new_block); + u32 max_extents; + int extent_count; + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + int max_blocks = bh_result->b_size >> inode->i_blkbits; + int remain; + + ret = -EIO; + bh = omfs_bread(inode->i_sb, inode->i_ino); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); + next = inode->i_ino; + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; + + extent_count = be32_to_cpu(oe->e_extent_count); + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + if (extent_count > max_extents) + goto out_brelse; + + offset = find_block(inode, entry, block, extent_count, &remain); + if (offset > 0) { + ret = 0; + map_bh(bh_result, inode->i_sb, offset); + if (remain > max_blocks) + remain = max_blocks; + bh_result->b_size = (remain << inode->i_blkbits); + goto out_brelse; + } + if (next == ~0) + break; + + brelse(bh); + bh = omfs_bread(inode->i_sb, next); + if (!bh) + goto out; 
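+		/* a continuation block has no name or size fields, so its
+		 * extent table sits just past the header at OMFS_EXTENT_CONT
+		 * rather than at OMFS_EXTENT_START */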
+ oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); + } + if (create) { + ret = omfs_grow_extent(inode, oe, &new_block); + if (ret == 0) { + mark_buffer_dirty(bh); + mark_inode_dirty(inode); + map_bh(bh_result, inode->i_sb, + clus_to_blk(sbi, new_block)); + } + } +out_brelse: + brelse(bh); +out: + return ret; +} + +static int omfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, omfs_get_block); +} + +static int omfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); +} + +static int omfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, omfs_get_block, wbc); +} + +static int +omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, omfs_get_block); +} + +static int omfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + omfs_get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t omfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, omfs_get_block); +} + +const struct file_operations omfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int omfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations omfs_file_inops = { + .setattr = omfs_setattr, + .truncate = omfs_truncate +}; + +const struct address_space_operations omfs_aops = { + .readpage = omfs_readpage, + .readpages = omfs_readpages, + .writepage = omfs_writepage, + .writepages = omfs_writepages, + .write_begin = omfs_write_begin, + .write_end = generic_write_end, + .bmap = omfs_bmap, +}; + diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c new file mode 100644 index 00000000..e043c4cb --- /dev/null +++ b/fs/omfs/inode.c @@ -0,0 +1,585 @@ +/* + * Optimized MPEG FS - inode and super operations. + * Copyright (C) 2006 Bob Copeland + * Released under GPL v2. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "omfs.h" + +MODULE_AUTHOR("Bob Copeland "); +MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); +MODULE_LICENSE("GPL"); + +struct buffer_head *omfs_bread(struct super_block *sb, sector_t block) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + if (block >= sbi->s_num_blocks) + return NULL; + + return sb_bread(sb, clus_to_blk(sbi, block)); +} + +struct inode *omfs_new_inode(struct inode *dir, int mode) +{ + struct inode *inode; + u64 new_block; + int err; + int len; + struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb); + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors, + &new_block, &len); + if (err) + goto fail; + + inode->i_ino = new_block; + inode_init_owner(inode, NULL, mode); + inode->i_mapping->a_ops = &omfs_aops; + + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = sbi->s_sys_blocksize; + inc_nlink(inode); + break; + case S_IFREG: + inode->i_op = &omfs_file_inops; + inode->i_fop = &omfs_file_operations; + inode->i_size = 0; + break; + } + + insert_inode_hash(inode); + mark_inode_dirty(inode); + return inode; +fail: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +/* + * Update the header checksums for a dirty inode based on its contents. + * Caller is expected to hold the buffer head underlying oi and mark it + * dirty. + */ +static void omfs_update_checksums(struct omfs_inode *oi) +{ + int xor, i, ofs = 0, count; + u16 crc = 0; + unsigned char *ptr = (unsigned char *) oi; + + count = be32_to_cpu(oi->i_head.h_body_size); + ofs = sizeof(struct omfs_header); + + crc = crc_itu_t(crc, ptr + ofs, count); + oi->i_head.h_crc = cpu_to_be16(crc); + + xor = ptr[0]; + for (i = 1; i < OMFS_XOR_COUNT; i++) + xor ^= ptr[i]; + + oi->i_head.h_check_xor = xor; +} + +static int __omfs_write_inode(struct inode *inode, int wait) +{ + struct omfs_inode *oi; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct buffer_head *bh, *bh2; + u64 ctime; + int i; + int ret = -EIO; + int sync_failed = 0; + + /* get current inode since we may have written sibling ptrs etc. 
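+ * The header, type, size and ctime are rebuilt from the in-core inode,
+ * both checksums are recomputed, and the result is then copied out to
+ * the other s_mirrors - 1 mirror blocks.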
*/ + bh = omfs_bread(inode->i_sb, inode->i_ino); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + if (S_ISDIR(inode->i_mode)) + oi->i_type = OMFS_DIR; + else if (S_ISREG(inode->i_mode)) + oi->i_type = OMFS_FILE; + else { + printk(KERN_WARNING "omfs: unknown file type: %d\n", + inode->i_mode); + goto out_brelse; + } + + oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize - + sizeof(struct omfs_header)); + oi->i_head.h_version = 1; + oi->i_head.h_type = OMFS_INODE_NORMAL; + oi->i_head.h_magic = OMFS_IMAGIC; + oi->i_size = cpu_to_be64(inode->i_size); + + ctime = inode->i_ctime.tv_sec * 1000LL + + ((inode->i_ctime.tv_nsec + 999)/1000); + oi->i_ctime = cpu_to_be64(ctime); + + omfs_update_checksums(oi); + + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + sync_failed = 1; + } + + /* if mirroring writes, copy to next fsblock */ + for (i = 1; i < sbi->s_mirrors; i++) { + bh2 = omfs_bread(inode->i_sb, inode->i_ino + i); + if (!bh2) + goto out_brelse; + + memcpy(bh2->b_data, bh->b_data, bh->b_size); + mark_buffer_dirty(bh2); + if (wait) { + sync_dirty_buffer(bh2); + if (buffer_req(bh2) && !buffer_uptodate(bh2)) + sync_failed = 1; + } + brelse(bh2); + } + ret = (sync_failed) ? -EIO : 0; +out_brelse: + brelse(bh); +out: + return ret; +} + +static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int omfs_sync_inode(struct inode *inode) +{ + return __omfs_write_inode(inode, 1); +} + +/* + * called when an entry is deleted, need to clear the bits in the + * bitmaps. + */ +static void omfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + if (inode->i_nlink) + return; + + if (S_ISREG(inode->i_mode)) { + inode->i_size = 0; + omfs_shrink_inode(inode); + } + + omfs_clear_range(inode->i_sb, inode->i_ino, 2); +} + +struct inode *omfs_iget(struct super_block *sb, ino_t ino) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct omfs_inode *oi; + struct buffer_head *bh; + u64 ctime; + unsigned long nsecs; + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + bh = omfs_bread(inode->i_sb, ino); + if (!bh) + goto iget_failed; + + oi = (struct omfs_inode *)bh->b_data; + + /* check self */ + if (ino != be64_to_cpu(oi->i_head.h_self)) + goto fail_bh; + + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + + ctime = be64_to_cpu(oi->i_ctime); + nsecs = do_div(ctime, 1000) * 1000L; + + inode->i_atime.tv_sec = ctime; + inode->i_mtime.tv_sec = ctime; + inode->i_ctime.tv_sec = ctime; + inode->i_atime.tv_nsec = nsecs; + inode->i_mtime.tv_nsec = nsecs; + inode->i_ctime.tv_nsec = nsecs; + + inode->i_mapping->a_ops = &omfs_aops; + + switch (oi->i_type) { + case OMFS_DIR: + inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = sbi->s_sys_blocksize; + inc_nlink(inode); + break; + case OMFS_FILE: + inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask); + inode->i_fop = &omfs_file_operations; + inode->i_size = be64_to_cpu(oi->i_size); + break; + } + brelse(bh); + unlock_new_inode(inode); + return inode; +fail_bh: + brelse(bh); +iget_failed: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static void omfs_put_super(struct super_block *sb) +{ + struct 
omfs_sb_info *sbi = OMFS_SB(sb); + kfree(sbi->s_imap); + kfree(sbi); + sb->s_fs_info = NULL; +} + +static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct omfs_sb_info *sbi = OMFS_SB(s); + u64 id = huge_encode_dev(s->s_bdev->bd_dev); + + buf->f_type = OMFS_MAGIC; + buf->f_bsize = sbi->s_blocksize; + buf->f_blocks = sbi->s_num_blocks; + buf->f_files = sbi->s_num_blocks; + buf->f_namelen = OMFS_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + buf->f_bfree = buf->f_bavail = buf->f_ffree = + omfs_count_free(s); + + return 0; +} + +static const struct super_operations omfs_sops = { + .write_inode = omfs_write_inode, + .evict_inode = omfs_evict_inode, + .put_super = omfs_put_super, + .statfs = omfs_statfs, + .show_options = generic_show_options, +}; + +/* + * For Rio Karma, there is an on-disk free bitmap whose location is + * stored in the root block. For ReplayTV, there is no such free bitmap + * so we have to walk the tree. Both inodes and file data are allocated + * from the same map. This array can be big (300k) so we allocate + * in units of the blocksize. + */ +static int omfs_get_imap(struct super_block *sb) +{ + int bitmap_size; + int array_size; + int count; + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct buffer_head *bh; + unsigned long **ptr; + sector_t block; + + bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8); + array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize); + + if (sbi->s_bitmap_ino == ~0ULL) + goto out; + + sbi->s_imap_size = array_size; + sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); + if (!sbi->s_imap) + goto nomem; + + block = clus_to_blk(sbi, sbi->s_bitmap_ino); + if (block >= sbi->s_num_blocks) + goto nomem; + + ptr = sbi->s_imap; + for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { + bh = sb_bread(sb, block++); + if (!bh) + goto nomem_free; + *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL); + if (!*ptr) { + brelse(bh); + goto nomem_free; + } + memcpy(*ptr, bh->b_data, sb->s_blocksize); + if (count < sb->s_blocksize) + memset((void *)*ptr + count, 0xff, + sb->s_blocksize - count); + brelse(bh); + ptr++; + } +out: + return 0; + +nomem_free: + for (count = 0; count < array_size; count++) + kfree(sbi->s_imap[count]); + + kfree(sbi->s_imap); +nomem: + sbi->s_imap = NULL; + sbi->s_imap_size = 0; + return -ENOMEM; +} + +enum { + Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_dmask, "dmask=%o"}, + {Opt_fmask, "fmask=%o"}, +}; + +static int parse_options(char *options, struct omfs_sb_info *sbi) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_uid = option; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_gid = option; + break; + case Opt_umask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = sbi->s_dmask = option; + break; + case Opt_dmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_dmask = option; + break; + case Opt_fmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = option; + break; + default: + return 0; + } + } + return 1; +} + +static int 
omfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh, *bh2; + struct omfs_super_block *omfs_sb; + struct omfs_root_block *omfs_rb; + struct omfs_sb_info *sbi; + struct inode *root; + int ret = -EINVAL; + + save_mount_options(sb, (char *) data); + + sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + sbi->s_uid = current_uid(); + sbi->s_gid = current_gid(); + sbi->s_dmask = sbi->s_fmask = current_umask(); + + if (!parse_options((char *) data, sbi)) + goto end; + + sb->s_maxbytes = 0xffffffff; + + sb_set_blocksize(sb, 0x200); + + bh = sb_bread(sb, 0); + if (!bh) + goto end; + + omfs_sb = (struct omfs_super_block *)bh->b_data; + + if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) { + if (!silent) + printk(KERN_ERR "omfs: Invalid superblock (%x)\n", + omfs_sb->s_magic); + goto out_brelse_bh; + } + sb->s_magic = OMFS_MAGIC; + + sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks); + sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize); + sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors); + sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block); + sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); + mutex_init(&sbi->s_bitmap_lock); + + if (sbi->s_sys_blocksize > PAGE_SIZE) { + printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", + sbi->s_sys_blocksize); + goto out_brelse_bh; + } + + if (sbi->s_blocksize < sbi->s_sys_blocksize || + sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "omfs: block size (%d) is out of range\n", + sbi->s_blocksize); + goto out_brelse_bh; + } + + /* + * Use sys_blocksize as the fs block since it is smaller than a + * page while the fs blocksize can be larger. + */ + sb_set_blocksize(sb, sbi->s_sys_blocksize); + + /* + * ...and the difference goes into a shift. sys_blocksize is always + * a power of two factor of blocksize. 
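+ * For example, a blocksize of 8192 with a sys_blocksize of 2048 gives a
+ * shift of get_bitmask_order(8192) - get_bitmask_order(2048) = 14 - 12 = 2,
+ * so clus_to_blk() scales cluster numbers by 4 (= 8192 / 2048).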
+ */ + sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - + get_bitmask_order(sbi->s_sys_blocksize); + + bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block)); + if (!bh2) + goto out_brelse_bh; + + omfs_rb = (struct omfs_root_block *)bh2->b_data; + + sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap); + sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize); + + if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) { + printk(KERN_ERR "omfs: block count discrepancy between " + "super and root blocks (%llx, %llx)\n", + (unsigned long long)sbi->s_num_blocks, + (unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks)); + goto out_brelse_bh2; + } + + if (sbi->s_bitmap_ino != ~0ULL && + sbi->s_bitmap_ino > sbi->s_num_blocks) { + printk(KERN_ERR "omfs: free space bitmap location is corrupt " + "(%llx, total blocks %llx)\n", + (unsigned long long) sbi->s_bitmap_ino, + (unsigned long long) sbi->s_num_blocks); + goto out_brelse_bh2; + } + if (sbi->s_clustersize < 1 || + sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) { + printk(KERN_ERR "omfs: cluster size out of range (%d)", + sbi->s_clustersize); + goto out_brelse_bh2; + } + + ret = omfs_get_imap(sb); + if (ret) + goto out_brelse_bh2; + + sb->s_op = &omfs_sops; + + root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir)); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_brelse_bh2; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + goto out_brelse_bh2; + } + printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); + + ret = 0; +out_brelse_bh2: + brelse(bh2); +out_brelse_bh: + brelse(bh); +end: + if (ret) + kfree(sbi); + return ret; +} + +static struct dentry *omfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super); +} + +static struct file_system_type omfs_fs_type = { + .owner = THIS_MODULE, + .name = "omfs", + .mount = omfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_omfs_fs(void) +{ + return register_filesystem(&omfs_fs_type); +} + +static void __exit exit_omfs_fs(void) +{ + unregister_filesystem(&omfs_fs_type); +} + +module_init(init_omfs_fs); +module_exit(exit_omfs_fs); diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h new file mode 100644 index 00000000..7d414fef --- /dev/null +++ b/fs/omfs/omfs.h @@ -0,0 +1,68 @@ +#ifndef _OMFS_H +#define _OMFS_H + +#include +#include + +#include "omfs_fs.h" + +/* In-memory structures */ +struct omfs_sb_info { + u64 s_num_blocks; + u64 s_bitmap_ino; + u64 s_root_ino; + u32 s_blocksize; + u32 s_mirrors; + u32 s_sys_blocksize; + u32 s_clustersize; + int s_block_shift; + unsigned long **s_imap; + int s_imap_size; + struct mutex s_bitmap_lock; + int s_uid; + int s_gid; + int s_dmask; + int s_fmask; +}; + +/* convert a cluster number to a scaled block number */ +static inline sector_t clus_to_blk(struct omfs_sb_info *sbi, sector_t block) +{ + return block << sbi->s_block_shift; +} + +static inline struct omfs_sb_info *OMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* bitmap.c */ +extern unsigned long omfs_count_free(struct super_block *sb); +extern int omfs_allocate_block(struct super_block *sb, u64 block); +extern int omfs_allocate_range(struct super_block *sb, int min_request, + int max_request, u64 *return_block, int *return_size); +extern int omfs_clear_range(struct super_block *sb, u64 block, int count); + +/* dir.c */ +extern const struct file_operations omfs_dir_operations; 
+extern const struct inode_operations omfs_dir_inops; +extern int omfs_make_empty(struct inode *inode, struct super_block *sb); +extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock); + +/* file.c */ +extern const struct file_operations omfs_file_operations; +extern const struct inode_operations omfs_file_inops; +extern const struct address_space_operations omfs_aops; +extern void omfs_make_empty_table(struct buffer_head *bh, int offset); +extern int omfs_shrink_inode(struct inode *inode); + +/* inode.c */ +extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block); +extern struct inode *omfs_iget(struct super_block *sb, ino_t inode); +extern struct inode *omfs_new_inode(struct inode *dir, int mode); +extern int omfs_reserve_block(struct super_block *sb, sector_t block); +extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino); +extern int omfs_sync_inode(struct inode *inode); + +#endif diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h new file mode 100644 index 00000000..ee5e4327 --- /dev/null +++ b/fs/omfs/omfs_fs.h @@ -0,0 +1,81 @@ +#ifndef _OMFS_FS_H +#define _OMFS_FS_H + +/* OMFS On-disk structures */ + +#define OMFS_MAGIC 0xC2993D87 +#define OMFS_IMAGIC 0xD2 + +#define OMFS_DIR 'D' +#define OMFS_FILE 'F' +#define OMFS_INODE_NORMAL 'e' +#define OMFS_INODE_CONTINUATION 'c' +#define OMFS_INODE_SYSTEM 's' +#define OMFS_NAMELEN 256 +#define OMFS_DIR_START 0x1b8 +#define OMFS_EXTENT_START 0x1d0 +#define OMFS_EXTENT_CONT 0x40 +#define OMFS_XOR_COUNT 19 +#define OMFS_MAX_BLOCK_SIZE 8192 +#define OMFS_MAX_CLUSTER_SIZE 8 + +struct omfs_super_block { + char s_fill1[256]; + __be64 s_root_block; /* block number of omfs_root_block */ + __be64 s_num_blocks; /* total number of FS blocks */ + __be32 s_magic; /* OMFS_MAGIC */ + __be32 s_blocksize; /* size of a block */ + __be32 s_mirrors; /* # of mirrors of system blocks */ + __be32 s_sys_blocksize; /* size of non-data blocks */ +}; + +struct omfs_header { + __be64 h_self; /* FS block where this is located */ + __be32 h_body_size; /* size of useful data after header */ + __be16 h_crc; /* crc-ccitt of body_size bytes */ + char h_fill1[2]; + u8 h_version; /* version, always 1 */ + char h_type; /* OMFS_INODE_X */ + u8 h_magic; /* OMFS_IMAGIC */ + u8 h_check_xor; /* XOR of header bytes before this */ + __be32 h_fill2; +}; + +struct omfs_root_block { + struct omfs_header r_head; /* header */ + __be64 r_fill1; + __be64 r_num_blocks; /* total number of FS blocks */ + __be64 r_root_dir; /* block # of root directory */ + __be64 r_bitmap; /* block # of free space bitmap */ + __be32 r_blocksize; /* size of a block */ + __be32 r_clustersize; /* size allocated for data blocks */ + __be64 r_mirrors; /* # of mirrors of system blocks */ + char r_name[OMFS_NAMELEN]; /* partition label */ +}; + +struct omfs_inode { + struct omfs_header i_head; /* header */ + __be64 i_parent; /* parent containing this inode */ + __be64 i_sibling; /* next inode in hash bucket */ + __be64 i_ctime; /* ctime, in milliseconds */ + char i_fill1[35]; + char i_type; /* OMFS_[DIR,FILE] */ + __be32 i_fill2; + char i_fill3[64]; + char i_name[OMFS_NAMELEN]; /* filename */ + __be64 i_size; /* size of file, in bytes */ +}; + +struct omfs_extent_entry { + __be64 e_cluster; /* start location of a set of blocks */ + __be64 e_blocks; /* number of blocks after e_cluster */ +}; + +struct omfs_extent { + __be64 e_next; /* next extent table location */ + __be32 e_extent_count; /* total # extents in this table */ + __be32 e_fill; 
+ struct omfs_extent_entry e_entry; /* start of extent entries */ +}; + +#endif diff --git a/fs/open.c b/fs/open.c new file mode 100644 index 00000000..b52cf013 --- /dev/null +++ b/fs/open.c @@ -0,0 +1,1161 @@ +/* + * linux/fs/open.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) +{ + int ret; + struct iattr newattrs; + + /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ + if (length < 0) + return -EINVAL; + + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | time_attrs; + if (filp) { + newattrs.ia_file = filp; + newattrs.ia_valid |= ATTR_FILE; + } + + /* Remove suid/sgid on truncate too */ + ret = should_remove_suid(dentry); + if (ret) + newattrs.ia_valid |= ret | ATTR_FORCE; + + mutex_lock(&dentry->d_inode->i_mutex); + ret = notify_change(dentry, &newattrs); + mutex_unlock(&dentry->d_inode->i_mutex); + return ret; +} + +static long do_sys_truncate(const char __user *pathname, loff_t length) +{ + struct path path; + struct inode *inode; + int error; + + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + + error = user_path(pathname, &path); + if (error) + goto out; + inode = path.dentry->d_inode; + + /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ + error = -EISDIR; + if (S_ISDIR(inode->i_mode)) + goto dput_and_out; + + error = -EINVAL; + if (!S_ISREG(inode->i_mode)) + goto dput_and_out; + + error = mnt_want_write(path.mnt); + if (error) + goto dput_and_out; + + error = inode_permission(inode, MAY_WRITE); + if (error) + goto mnt_drop_write_and_out; + + error = -EPERM; + if (IS_APPEND(inode)) + goto mnt_drop_write_and_out; + + error = get_write_access(inode); + if (error) + goto mnt_drop_write_and_out; + + /* + * Make sure that there are no leases. get_write_access() protects + * against the truncate racing with a lease-granting setlease(). 
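+ * (The i_writecount reference taken above is held across the truncate,
+ * which is also what blocks a new read lease from being granted while
+ * the truncate is in flight.)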
+ */ + error = break_lease(inode, O_WRONLY); + if (error) + goto put_write_and_out; + + error = locks_verify_truncate(inode, NULL, length); + if (!error) + error = security_path_truncate(&path); + if (!error) + error = do_truncate(path.dentry, length, 0, NULL); + +put_write_and_out: + put_write_access(inode); +mnt_drop_write_and_out: + mnt_drop_write(path.mnt); +dput_and_out: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) +{ + return do_sys_truncate(path, length); +} + +static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + struct inode * inode; + struct dentry *dentry; + struct file * file; + int error; + + error = -EINVAL; + if (length < 0) + goto out; + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + /* explicitly opened as large or we are on 64-bit box */ + if (file->f_flags & O_LARGEFILE) + small = 0; + + dentry = file->f_path.dentry; + inode = dentry->d_inode; + error = -EINVAL; + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + goto out_putf; + + error = -EINVAL; + /* Cannot ftruncate over 2^31 bytes without large file support */ + if (small && length > MAX_NON_LFS) + goto out_putf; + + error = -EPERM; + if (IS_APPEND(inode)) + goto out_putf; + + error = locks_verify_truncate(inode, file, length); + if (!error) + error = security_path_truncate(&file->f_path); + if (!error) + error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); +out_putf: + fput(file); +out: + return error; +} + +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +{ + long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(2, ret, fd, length); + return ret; +} + +/* LFS versions of truncate are only needed on 32 bit machines */ +#if BITS_PER_LONG == 32 +SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) +{ + return do_sys_truncate(path, length); +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_truncate64(long path, loff_t length) +{ + return SYSC_truncate64((const char __user *) path, length); +} +SYSCALL_ALIAS(sys_truncate64, SyS_truncate64); +#endif + +SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length) +{ + long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(2, ret, fd, length); + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_ftruncate64(long fd, loff_t length) +{ + return SYSC_ftruncate64((unsigned int) fd, length); +} +SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64); +#endif +#endif /* BITS_PER_LONG == 32 */ + + +int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long ret; + + if (offset < 0 || len <= 0) + return -EINVAL; + + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + /* Punch hole must have keep size set */ + if ((mode & FALLOC_FL_PUNCH_HOLE) && + !(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + + /* It's not possible punch hole on append only file */ + if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) + return -EPERM; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + /* + * Revalidate the write permissions, in case security policy has + * changed since the files were opened. 
+ */ + ret = security_file_permission(file, MAY_WRITE); + if (ret) + return ret; + + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; + + /* + * Let individual file system decide if it supports preallocation + * for directories or not. + */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -ENODEV; + + /* Check for wrap through zero too */ + if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) + return -EFBIG; + + if (!file->f_op->fallocate) + return -EOPNOTSUPP; + + return file->f_op->fallocate(file, mode, offset, len); +} + +SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) +{ + struct file *file; + int error = -EBADF; + + file = fget(fd); + if (file) { + error = do_fallocate(file, mode, offset, len); + fput(file); + } + + return error; +} + +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) +{ + return SYSC_fallocate((int)fd, (int)mode, offset, len); +} +SYSCALL_ALIAS(sys_fallocate, SyS_fallocate); +#endif + +/* + * access() needs to use the real uid/gid, not the effective uid/gid. + * We do this by temporarily clearing all FS-related capabilities and + * switching the fsuid/fsgid around to the real ones. + */ +SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) +{ + const struct cred *old_cred; + struct cred *override_cred; + struct path path; + struct inode *inode; + int res; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + override_cred->fsuid = override_cred->uid; + override_cred->fsgid = override_cred->gid; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + /* Clear the capabilities if we switch to a non-root user */ + if (override_cred->uid) + cap_clear(override_cred->cap_effective); + else + override_cred->cap_effective = + override_cred->cap_permitted; + } + + old_cred = override_creds(override_cred); + + res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); + if (res) + goto out; + + inode = path.dentry->d_inode; + + if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { + /* + * MAY_EXEC on regular files is denied if the fs is mounted + * with the "noexec" flag. + */ + res = -EACCES; + if (path.mnt->mnt_flags & MNT_NOEXEC) + goto out_path_release; + } + + res = inode_permission(inode, mode | MAY_ACCESS); + /* SuS v2 requires we report a read only fs too */ + if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) + goto out_path_release; + /* + * This is a rare case where using __mnt_is_readonly() + * is OK without a mnt_want/drop_write() pair. Since + * no actual write to the fs is performed here, we do + * not need to telegraph to that to anyone. + * + * By doing this, we accept that this access is + * inherently racy and know that the fs may change + * state before we even see this result. 
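+ * access(2) is advisory anyway; callers must still cope with the
+ * subsequent open() or write() failing, so the unlocked check is enough.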
+ */ + if (__mnt_is_readonly(path.mnt)) + res = -EROFS; + +out_path_release: + path_put(&path); +out: + revert_creds(old_cred); + put_cred(override_cred); + return res; +} + +SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) +{ + return sys_faccessat(AT_FDCWD, filename, mode); +} + +SYSCALL_DEFINE1(chdir, const char __user *, filename) +{ + struct path path; + int error; + + error = user_path_dir(filename, &path); + if (error) + goto out; + + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + if (error) + goto dput_and_out; + + set_fs_pwd(current->fs, &path); + +dput_and_out: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE1(fchdir, unsigned int, fd) +{ + struct file *file; + struct inode *inode; + int error; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + inode = file->f_path.dentry->d_inode; + + error = -ENOTDIR; + if (!S_ISDIR(inode->i_mode)) + goto out_putf; + + error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); + if (!error) + set_fs_pwd(current->fs, &file->f_path); +out_putf: + fput(file); +out: + return error; +} + +SYSCALL_DEFINE1(chroot, const char __user *, filename) +{ + struct path path; + int error; + + error = user_path_dir(filename, &path); + if (error) + goto out; + + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + if (error) + goto dput_and_out; + + error = -EPERM; + if (!capable(CAP_SYS_CHROOT)) + goto dput_and_out; + error = security_path_chroot(&path); + if (error) + goto dput_and_out; + + set_fs_root(current->fs, &path); + error = 0; +dput_and_out: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +{ + struct inode * inode; + struct dentry * dentry; + struct file * file; + int err = -EBADF; + struct iattr newattrs; + + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_path.dentry; + inode = dentry->d_inode; + + audit_inode(NULL, dentry); + + err = mnt_want_write_file(file); + if (err) + goto out_putf; + mutex_lock(&inode->i_mutex); + err = security_path_chmod(dentry, file->f_vfsmnt, mode); + if (err) + goto out_unlock; + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + err = notify_change(dentry, &newattrs); +out_unlock: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(file->f_path.mnt); +out_putf: + fput(file); +out: + return err; +} + +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) +{ + struct path path; + struct inode *inode; + int error; + struct iattr newattrs; + + error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); + if (error) + goto out; + inode = path.dentry->d_inode; + + error = mnt_want_write(path.mnt); + if (error) + goto dput_and_out; + mutex_lock(&inode->i_mutex); + error = security_path_chmod(path.dentry, path.mnt, mode); + if (error) + goto out_unlock; + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + error = notify_change(path.dentry, &newattrs); +out_unlock: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(path.mnt); +dput_and_out: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode) +{ + return sys_fchmodat(AT_FDCWD, filename, mode); +} + +static int chown_common(struct path *path, uid_t user, gid_t group) +{ + struct inode *inode = 
path->dentry->d_inode; + int error; + struct iattr newattrs; + + newattrs.ia_valid = ATTR_CTIME; + if (user != (uid_t) -1) { + newattrs.ia_valid |= ATTR_UID; + newattrs.ia_uid = user; + } + if (group != (gid_t) -1) { + newattrs.ia_valid |= ATTR_GID; + newattrs.ia_gid = group; + } + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= + ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + mutex_lock(&inode->i_mutex); + error = security_path_chown(path, user, group); + if (!error) + error = notify_change(path->dentry, &newattrs); + mutex_unlock(&inode->i_mutex); + + return error; +} + +SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) +{ + struct path path; + int error; + + error = user_path(filename, &path); + if (error) + goto out; + error = mnt_want_write(path.mnt); + if (error) + goto out_release; + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); +out_release: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, + gid_t, group, int, flag) +{ + struct path path; + int error = -EINVAL; + int lookup_flags; + + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + goto out; + + lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + error = mnt_want_write(path.mnt); + if (error) + goto out_release; + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); +out_release: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) +{ + struct path path; + int error; + + error = user_lpath(filename, &path); + if (error) + goto out; + error = mnt_want_write(path.mnt); + if (error) + goto out_release; + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); +out_release: + path_put(&path); +out: + return error; +} + +SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) +{ + struct file * file; + int error = -EBADF; + struct dentry * dentry; + + file = fget(fd); + if (!file) + goto out; + + error = mnt_want_write_file(file); + if (error) + goto out_fput; + dentry = file->f_path.dentry; + audit_inode(NULL, dentry); + error = chown_common(&file->f_path, user, group); + mnt_drop_write(file->f_path.mnt); +out_fput: + fput(file); +out: + return error; +} + +/* + * You have to be very careful that these write + * counts get cleaned up in error cases and + * upon __fput(). This should probably never + * be called outside of __dentry_open(). + */ +static inline int __get_file_write_access(struct inode *inode, + struct vfsmount *mnt) +{ + int error; + error = get_write_access(inode); + if (error) + return error; + /* + * Do not take mount writer counts on + * special files since no writes to + * the mount itself will occur. 
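+ * (Writes to device nodes, FIFOs and sockets go to the driver or the
+ * pipe buffer, not to the filesystem backing the mount, which is why
+ * mnt_want_write() is skipped for them below.)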
+ */ + if (!special_file(inode->i_mode)) { + /* + * Balanced in __fput() + */ + error = mnt_want_write(mnt); + if (error) + put_write_access(inode); + } + return error; +} + +static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, + struct file *f, + int (*open)(struct inode *, struct file *), + const struct cred *cred) +{ + static const struct file_operations empty_fops = {}; + struct inode *inode; + int error; + + f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | + FMODE_PREAD | FMODE_PWRITE; + + if (unlikely(f->f_flags & O_PATH)) + f->f_mode = FMODE_PATH; + + inode = dentry->d_inode; + if (f->f_mode & FMODE_WRITE) { + error = __get_file_write_access(inode, mnt); + if (error) + goto cleanup_file; + if (!special_file(inode->i_mode)) + file_take_write(f); + } + + f->f_mapping = inode->i_mapping; + f->f_path.dentry = dentry; + f->f_path.mnt = mnt; + f->f_pos = 0; + file_sb_list_add(f, inode->i_sb); + + if (unlikely(f->f_mode & FMODE_PATH)) { + f->f_op = &empty_fops; + return f; + } + + f->f_op = fops_get(inode->i_fop); + + error = security_dentry_open(f, cred); + if (error) + goto cleanup_all; + + if (!open && f->f_op) + open = f->f_op->open; + if (open) { + error = open(inode, f); + if (error) + goto cleanup_all; + } + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(inode); + + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + + file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); + + /* NB: we're sure to have correct a_ops only after f_op->open */ + if (f->f_flags & O_DIRECT) { + if (!f->f_mapping->a_ops || + ((!f->f_mapping->a_ops->direct_IO) && + (!f->f_mapping->a_ops->get_xip_mem))) { + fput(f); + f = ERR_PTR(-EINVAL); + } + } + + return f; + +cleanup_all: + fops_put(f->f_op); + if (f->f_mode & FMODE_WRITE) { + put_write_access(inode); + if (!special_file(inode->i_mode)) { + /* + * We don't consider this a real + * mnt_want/drop_write() pair + * because it all happenend right + * here, so just reset the state. + */ + file_reset_write(f); + mnt_drop_write(mnt); + } + } + file_sb_list_del(f); + f->f_path.dentry = NULL; + f->f_path.mnt = NULL; +cleanup_file: + put_filp(f); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); +} + +/** + * lookup_instantiate_filp - instantiates the open intent filp + * @nd: pointer to nameidata + * @dentry: pointer to dentry + * @open: open callback + * + * Helper for filesystems that want to use lookup open intents and pass back + * a fully instantiated struct file to the caller. + * This function is meant to be called from within a filesystem's + * lookup method. + * Beware of calling it for non-regular files! Those ->open methods might block + * (e.g. in fifo_open), leaving you with parent locked (and in case of fifo, + * leading to a deadlock, as nobody can open that fifo anymore, because + * another process to open fifo will block on locked parent when doing lookup). + * Note that in case of error, nd->intent.open.file is destroyed, but the + * path information remains valid. + * If the open callback is set to NULL, then the standard f_op->open() + * filesystem callback is substituted. 
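+ *
+ * A minimal usage sketch from a hypothetical filesystem's ->lookup()
+ * (names are illustrative; the open-intent check and all error handling
+ * are elided):
+ *
+ *	static struct dentry *examplefs_lookup(struct inode *dir,
+ *			struct dentry *dentry, struct nameidata *nd)
+ *	{
+ *		struct inode *inode = examplefs_find_inode(dir, dentry);
+ *
+ *		d_add(dentry, inode);
+ *		lookup_instantiate_filp(nd, dentry, NULL);
+ *		return NULL;
+ *	}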
+ */ +struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, + int (*open)(struct inode *, struct file *)) +{ + const struct cred *cred = current_cred(); + + if (IS_ERR(nd->intent.open.file)) + goto out; + if (IS_ERR(dentry)) + goto out_err; + nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), + nd->intent.open.file, + open, cred); +out: + return nd->intent.open.file; +out_err: + release_open_intent(nd); + nd->intent.open.file = (struct file *)dentry; + goto out; +} +EXPORT_SYMBOL_GPL(lookup_instantiate_filp); + +/** + * nameidata_to_filp - convert a nameidata to an open filp. + * @nd: pointer to nameidata + * @flags: open flags + * + * Note that this function destroys the original nameidata + */ +struct file *nameidata_to_filp(struct nameidata *nd) +{ + const struct cred *cred = current_cred(); + struct file *filp; + + /* Pick up the filp from the open intent */ + filp = nd->intent.open.file; + nd->intent.open.file = NULL; + + /* Has the filesystem initialised the file for us? */ + if (filp->f_path.dentry == NULL) { + path_get(&nd->path); + filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, + NULL, cred); + } + return filp; +} + +/* + * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an + * error. + */ +struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, + const struct cred *cred) +{ + int error; + struct file *f; + + validate_creds(cred); + + /* We must always pass in a valid mount pointer. */ + BUG_ON(!mnt); + + error = -ENFILE; + f = get_empty_filp(); + if (f == NULL) { + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + + f->f_flags = flags; + return __dentry_open(dentry, mnt, f, NULL, cred); +} +EXPORT_SYMBOL(dentry_open); + +static void __put_unused_fd(struct files_struct *files, unsigned int fd) +{ + struct fdtable *fdt = files_fdtable(files); + __FD_CLR(fd, fdt->open_fds); + if (fd < files->next_fd) + files->next_fd = fd; +} + +void put_unused_fd(unsigned int fd) +{ + struct files_struct *files = current->files; + spin_lock(&files->file_lock); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); +} + +EXPORT_SYMBOL(put_unused_fd); + +/* + * Install a file pointer in the fd array. + * + * The VFS is full of places where we drop the files lock between + * setting the open_fds bitmap and installing the file in the file + * array. At any such point, we are vulnerable to a dup2() race + * installing a file in the array before us. We need to detect this and + * fput() the struct file we are about to overwrite in this case. + * + * It should never happen - if we allow dup2() do it, _really_ bad things + * will follow. + */ + +void fd_install(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); +} + +EXPORT_SYMBOL(fd_install); + +static inline int build_open_flags(int flags, int mode, struct open_flags *op) +{ + int lookup_flags = 0; + int acc_mode; + + if (!(flags & O_CREAT)) + mode = 0; + op->mode = mode; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. 
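+ * In short: after the fixup below a caller that asked for O_SYNC always
+ * has O_DSYNC set as well, so testing O_DSYNC alone is sufficient.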
+ */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ + if (flags & O_PATH) { + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + acc_mode = 0; + } else { + acc_mode = MAY_OPEN | ACC_MODE(flags); + } + + op->open_flag = flags; + + /* O_TRUNC implies we need access checks for write permissions */ + if (flags & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. */ + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + op->acc_mode = acc_mode; + + op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; + + if (flags & O_CREAT) { + op->intent |= LOOKUP_CREATE; + if (flags & O_EXCL) + op->intent |= LOOKUP_EXCL; + } + + if (flags & O_DIRECTORY) + lookup_flags |= LOOKUP_DIRECTORY; + if (!(flags & O_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + return lookup_flags; +} + +/** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, int mode) +{ + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); + return do_filp_open(AT_FDCWD, filename, &op, lookup); +} +EXPORT_SYMBOL(filp_open); + +struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *filename, int flags) +{ + struct open_flags op; + int lookup = build_open_flags(flags, 0, &op); + if (flags & O_CREAT) + return ERR_PTR(-EINVAL); + if (!filename && (flags & O_DIRECTORY)) + if (!dentry->d_inode->i_op->lookup) + return ERR_PTR(-ENOTDIR); + return do_file_open_root(dentry, mnt, filename, &op, lookup); +} +EXPORT_SYMBOL(file_open_root); + +long do_sys_open(int dfd, const char __user *filename, int flags, int mode) +{ + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); + char *tmp = getname(filename); + int fd = PTR_ERR(tmp); + + if (!IS_ERR(tmp)) { + fd = get_unused_fd_flags(flags); + if (fd >= 0) { + struct file *f = do_filp_open(dfd, tmp, &op, lookup); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else { + fsnotify_open(f); + fd_install(fd, f); + } + } + putname(tmp); + } + return fd; +} + +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(3, ret, filename, flags, mode); + return ret; +} + +SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, + int, mode) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(4, ret, dfd, filename, flags, mode); + return ret; +} + +#ifndef __alpha__ + +/* + * For backward compatibility? Maybe this should be moved + * into arch/i386 instead? + */ +SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode) +{ + return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); +} + +#endif + +/* + * "id" is the POSIX thread ID. We use the + * files pointer for this.. 
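+ * (sys_close() passes current->files as the owner id, which is what
+ * locks_remove_posix() uses below to drop this process's POSIX locks.)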
+ */ +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval = 0; + + if (!file_count(filp)) { + printk(KERN_ERR "VFS: Close: file count is 0\n"); + return 0; + } + + if (filp->f_op && filp->f_op->flush) + retval = filp->f_op->flush(filp, id); + + if (likely(!(filp->f_mode & FMODE_PATH))) { + dnotify_flush(filp, id); + locks_remove_posix(filp, id); + } + fput(filp); + return retval; +} + +EXPORT_SYMBOL(filp_close); + +/* + * Careful here! We test whether the file pointer is NULL before + * releasing the fd. This ensures that one clone task can't release + * an fd while another clone is opening it. + */ +SYSCALL_DEFINE1(close, unsigned int, fd) +{ + struct file * filp; + struct files_struct *files = current->files; + struct fdtable *fdt; + int retval; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (fd >= fdt->max_fds) + goto out_unlock; + filp = fdt->fd[fd]; + if (!filp) + goto out_unlock; + rcu_assign_pointer(fdt->fd[fd], NULL); + FD_CLR(fd, fdt->close_on_exec); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); + retval = filp_close(filp, files); + + /* can't restart close syscall because file table entry was cleared */ + if (unlikely(retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK)) + retval = -EINTR; + + return retval; + +out_unlock: + spin_unlock(&files->file_lock); + return -EBADF; +} +EXPORT_SYMBOL(sys_close); + +/* + * This routine simulates a hangup on the tty, to arrange that users + * are given clean terminals at login time. + */ +SYSCALL_DEFINE0(vhangup) +{ + if (capable(CAP_SYS_TTY_CONFIG)) { + tty_vhangup_self(); + return 0; + } + return -EPERM; +} + +/* + * Called when an inode is about to be open. + * We use this to disallow opening large files on 32bit systems if + * the caller didn't specify O_LARGEFILE. On 64bit systems we force + * on this flag in sys_open. + */ +int generic_file_open(struct inode * inode, struct file * filp) +{ + if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; + return 0; +} + +EXPORT_SYMBOL(generic_file_open); + +/* + * This is used by subsystems that don't want seekable + * file descriptors. The function is not supposed to ever fail, the only + * reason it returns an 'int' and not 'void' is so that it can be plugged + * directly into file_operations structure. + */ +int nonseekable_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + return 0; +} + +EXPORT_SYMBOL(nonseekable_open); diff --git a/fs/openpromfs/Makefile b/fs/openpromfs/Makefile new file mode 100644 index 00000000..4563199b --- /dev/null +++ b/fs/openpromfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Sun Openprom filesystem routines. +# + +obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs.o + +openpromfs-objs := inode.o diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c new file mode 100644 index 00000000..a2a5bff7 --- /dev/null +++ b/fs/openpromfs/inode.c @@ -0,0 +1,473 @@ +/* inode.c: /proc/openprom handling routines + * + * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. 
Dost (ecd@skynet.be) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static DEFINE_MUTEX(op_mutex); + +#define OPENPROM_ROOT_INO 0 + +enum op_inode_type { + op_inode_node, + op_inode_prop, +}; + +union op_inode_data { + struct device_node *node; + struct property *prop; +}; + +struct op_inode_info { + struct inode vfs_inode; + enum op_inode_type type; + union op_inode_data u; +}; + +static struct inode *openprom_iget(struct super_block *sb, ino_t ino); + +static inline struct op_inode_info *OP_I(struct inode *inode) +{ + return container_of(inode, struct op_inode_info, vfs_inode); +} + +static int is_string(unsigned char *p, int len) +{ + int i; + + for (i = 0; i < len; i++) { + unsigned char val = p[i]; + + if ((i && !val) || + (val >= ' ' && val <= '~')) + continue; + + return 0; + } + + return 1; +} + +static int property_show(struct seq_file *f, void *v) +{ + struct property *prop = f->private; + void *pval; + int len; + + len = prop->length; + pval = prop->value; + + if (is_string(pval, len)) { + while (len > 0) { + int n = strlen(pval); + + seq_printf(f, "%s", (char *) pval); + + /* Skip over the NULL byte too. */ + pval += n + 1; + len -= n + 1; + + if (len > 0) + seq_printf(f, " + "); + } + } else { + if (len & 3) { + while (len) { + len--; + if (len) + seq_printf(f, "%02x.", + *(unsigned char *) pval); + else + seq_printf(f, "%02x", + *(unsigned char *) pval); + pval++; + } + } else { + while (len >= 4) { + len -= 4; + + if (len) + seq_printf(f, "%08x.", + *(unsigned int *) pval); + else + seq_printf(f, "%08x", + *(unsigned int *) pval); + pval += 4; + } + } + } + seq_printf(f, "\n"); + + return 0; +} + +static void *property_start(struct seq_file *f, loff_t *pos) +{ + if (*pos == 0) + return pos; + return NULL; +} + +static void *property_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static void property_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations property_op = { + .start = property_start, + .next = property_next, + .stop = property_stop, + .show = property_show +}; + +static int property_open(struct inode *inode, struct file *file) +{ + struct op_inode_info *oi = OP_I(inode); + int ret; + + BUG_ON(oi->type != op_inode_prop); + + ret = seq_open(file, &property_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = oi->u.prop; + } + return ret; +} + +static const struct file_operations openpromfs_prop_ops = { + .open = property_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int openpromfs_readdir(struct file *, void *, filldir_t); + +static const struct file_operations openprom_operations = { + .read = generic_read_dir, + .readdir = openpromfs_readdir, + .llseek = generic_file_llseek, +}; + +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); + +static const struct inode_operations openprom_inode_operations = { + .lookup = openpromfs_lookup, +}; + +static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct op_inode_info *ent_oi, *oi = OP_I(dir); + struct device_node *dp, *child; + struct property *prop; + enum op_inode_type ent_type; + union op_inode_data ent_data; + const char *name; + struct inode *inode; + unsigned int ino; + int len; + + BUG_ON(oi->type != op_inode_node); + + dp = oi->u.node; + + name = dentry->d_name.name; + len = 
dentry->d_name.len; + + mutex_lock(&op_mutex); + + child = dp->child; + while (child) { + int n = strlen(child->path_component_name); + + if (len == n && + !strncmp(child->path_component_name, name, len)) { + ent_type = op_inode_node; + ent_data.node = child; + ino = child->unique_id; + goto found; + } + child = child->sibling; + } + + prop = dp->properties; + while (prop) { + int n = strlen(prop->name); + + if (len == n && !strncmp(prop->name, name, len)) { + ent_type = op_inode_prop; + ent_data.prop = prop; + ino = prop->unique_id; + goto found; + } + + prop = prop->next; + } + + mutex_unlock(&op_mutex); + return ERR_PTR(-ENOENT); + +found: + inode = openprom_iget(dir->i_sb, ino); + mutex_unlock(&op_mutex); + if (IS_ERR(inode)) + return ERR_CAST(inode); + ent_oi = OP_I(inode); + ent_oi->type = ent_type; + ent_oi->u = ent_data; + + switch (ent_type) { + case op_inode_node: + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_op = &openprom_inode_operations; + inode->i_fop = &openprom_operations; + inode->i_nlink = 2; + break; + case op_inode_prop: + if (!strcmp(dp->name, "options") && (len == 17) && + !strncmp (name, "security-password", 17)) + inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; + else + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &openpromfs_prop_ops; + inode->i_nlink = 1; + inode->i_size = ent_oi->u.prop->length; + break; + } + + d_add(dentry, inode); + return NULL; +} + +static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct op_inode_info *oi = OP_I(inode); + struct device_node *dp = oi->u.node; + struct device_node *child; + struct property *prop; + unsigned int ino; + int i; + + mutex_lock(&op_mutex); + + ino = inode->i_ino; + i = filp->f_pos; + switch (i) { + case 0: + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall thru */ + case 1: + if (filldir(dirent, "..", 2, i, + (dp->parent == NULL ? + OPENPROM_ROOT_INO : + dp->parent->unique_id), DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall thru */ + default: + i -= 2; + + /* First, the children nodes as directories. */ + child = dp->child; + while (i && child) { + child = child->sibling; + i--; + } + while (child) { + if (filldir(dirent, + child->path_component_name, + strlen(child->path_component_name), + filp->f_pos, child->unique_id, DT_DIR) < 0) + goto out; + + filp->f_pos++; + child = child->sibling; + } + + /* Next, the properties as files. 
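+ * Each property is exposed as a regular file whose size is the
+ * property length (set up in openpromfs_lookup above).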
*/ + prop = dp->properties; + while (i && prop) { + prop = prop->next; + i--; + } + while (prop) { + if (filldir(dirent, prop->name, strlen(prop->name), + filp->f_pos, prop->unique_id, DT_REG) < 0) + goto out; + + filp->f_pos++; + prop = prop->next; + } + } +out: + mutex_unlock(&op_mutex); + return 0; +} + +static struct kmem_cache *op_inode_cachep; + +static struct inode *openprom_alloc_inode(struct super_block *sb) +{ + struct op_inode_info *oi; + + oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL); + if (!oi) + return NULL; + + return &oi->vfs_inode; +} + +static void openprom_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(op_inode_cachep, OP_I(inode)); +} + +static void openprom_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, openprom_i_callback); +} + +static struct inode *openprom_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + if (inode->i_ino == OPENPROM_ROOT_INO) { + inode->i_op = &openprom_inode_operations; + inode->i_fop = &openprom_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + } + unlock_new_inode(inode); + } + return inode; +} + +static int openprom_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_NOATIME; + return 0; +} + +static const struct super_operations openprom_sops = { + .alloc_inode = openprom_alloc_inode, + .destroy_inode = openprom_destroy_inode, + .statfs = simple_statfs, + .remount_fs = openprom_remount, +}; + +static int openprom_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *root_inode; + struct op_inode_info *oi; + int ret; + + s->s_flags |= MS_NOATIME; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = OPENPROM_SUPER_MAGIC; + s->s_op = &openprom_sops; + s->s_time_gran = 1; + root_inode = openprom_iget(s, OPENPROM_ROOT_INO); + if (IS_ERR(root_inode)) { + ret = PTR_ERR(root_inode); + goto out_no_root; + } + + oi = OP_I(root_inode); + oi->type = op_inode_node; + oi->u.node = of_find_node_by_path("/"); + + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) + goto out_no_root_dentry; + return 0; + +out_no_root_dentry: + iput(root_inode); + ret = -ENOMEM; +out_no_root: + printk("openprom_fill_super: get root inode failed\n"); + return ret; +} + +static struct dentry *openprom_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, openprom_fill_super); +} + +static struct file_system_type openprom_fs_type = { + .owner = THIS_MODULE, + .name = "openpromfs", + .mount = openprom_mount, + .kill_sb = kill_anon_super, +}; + +static void op_inode_init_once(void *data) +{ + struct op_inode_info *oi = (struct op_inode_info *) data; + + inode_init_once(&oi->vfs_inode); +} + +static int __init init_openprom_fs(void) +{ + int err; + + op_inode_cachep = kmem_cache_create("op_inode_cache", + sizeof(struct op_inode_info), + 0, + (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + op_inode_init_once); + if (!op_inode_cachep) + return -ENOMEM; + + err = register_filesystem(&openprom_fs_type); + if (err) + kmem_cache_destroy(op_inode_cachep); + + return err; +} + +static void __exit exit_openprom_fs(void) +{ + unregister_filesystem(&openprom_fs_type); + kmem_cache_destroy(op_inode_cachep); +} + 
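+/*
+ * Note the ordering: the inode cache is created before the filesystem is
+ * registered, and on exit the filesystem is unregistered before the cache
+ * is destroyed.
+ */
+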
+module_init(init_openprom_fs) +module_exit(exit_openprom_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig new file mode 100644 index 00000000..cb5f0a3f --- /dev/null +++ b/fs/partitions/Kconfig @@ -0,0 +1,251 @@ +# +# Partition configuration +# +config PARTITION_ADVANCED + bool "Advanced partition selection" + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under an operating system running on a different + architecture than your Linux system. + + Note that the answer to this question won't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about foreign partitioning schemes. + + If unsure, say N. + +config ACORN_PARTITION + bool "Acorn partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + help + Support hard disks partitioned under Acorn operating systems. + +config ACORN_PARTITION_CUMANA + bool "Cumana partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using the Cumana interface on Acorn machines. + +config ACORN_PARTITION_EESOX + bool "EESOX partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + +config ACORN_PARTITION_ICS + bool "ICS partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using the ICS interface on Acorn machines. + +config ACORN_PARTITION_ADFS + bool "Native filecore partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + The Acorn Disc Filing System is the standard file system of the + RiscOS operating system which runs on Acorn's ARM-based Risc PC + systems and the Acorn Archimedes range of machines. If you say + `Y' here, Linux will support disk partitions created under ADFS. + +config ACORN_PARTITION_POWERTEC + bool "PowerTec partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Support reading partition tables created on Acorn machines using + the PowerTec SCSI drive. + +config ACORN_PARTITION_RISCIX + bool "RISCiX partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Once upon a time, there was a native Unix port for the Acorn series + of machines called RISCiX. If you say 'Y' here, Linux will be able + to read disks partitioned under RISCiX. + +config OSF_PARTITION + bool "Alpha OSF partition support" if PARTITION_ADVANCED + default y if ALPHA + help + Say Y here if you would like to use hard disks under Linux which + were partitioned on an Alpha machine. + +config AMIGA_PARTITION + bool "Amiga partition table support" if PARTITION_ADVANCED + default y if (AMIGA || AFFS_FS=y) + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under AmigaOS. + +config ATARI_PARTITION + bool "Atari partition table support" if PARTITION_ADVANCED + default y if ATARI + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under the Atari OS. + +config IBM_PARTITION + bool "IBM disk label and partition support" + depends on PARTITION_ADVANCED && S390 + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM DASD disks operating under CMS. 
+ Otherwise, say N. + +config MAC_PARTITION + bool "Macintosh partition map support" if PARTITION_ADVANCED + default y if (MAC || PPC_PMAC) + help + Say Y here if you would like to use hard disks under Linux which + were partitioned on a Macintosh. + +config MSDOS_PARTITION + bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED + default y + help + Say Y here. + +config BSD_DISKLABEL + bool "BSD disklabel (FreeBSD partition tables) support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + FreeBSD uses its own hard disk partition scheme on your PC. It + requires only one entry in the primary partition table of your disk + and manages it similarly to DOS extended partitions, putting in its + first sector a new partition table in BSD disklabel format. Saying Y + here allows you to read these disklabels and further mount FreeBSD + partitions from within Linux if you have also said Y to "UFS + file system support", above. If you don't know what all this is + about, say N. + +config MINIX_SUBPARTITION + bool "Minix subpartition support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + Minix 2.0.0/2.0.2 subpartition table support for Linux. + Say Y here if you want to mount and use Minix 2.0.0/2.0.2 + subpartitions. + +config SOLARIS_X86_PARTITION + bool "Solaris (x86) partition table support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + Like most systems, Solaris x86 uses its own hard disk partition + table format, incompatible with all others. Saying Y here allows you + to read these partition tables and further mount Solaris x86 + partitions from within Linux if you have also said Y to "UFS + file system support", above. + +config UNIXWARE_DISKLABEL + bool "Unixware slices support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + ---help--- + Like some systems, UnixWare uses its own slice table inside a + partition (VTOC - Virtual Table of Contents). Its format is + incompatible with all other OSes. Saying Y here allows you to read + VTOC and further mount UnixWare partitions read-only from within + Linux if you have also said Y to "UFS file system support" or + "System V and Coherent file system support", above. + + This is mainly used to carry data from a UnixWare box to your + Linux box via a removable medium like magneto-optical, ZIP or + removable IDE drives. Note, however, that a good portable way to + transport files and directories between unixes (and even other + operating systems) is given by the tar program ("man tar" or + preferably "info tar"). + + If you don't know what all this is about, say N. + +config LDM_PARTITION + bool "Windows Logical Disk Manager (Dynamic Disk) support" + depends on PARTITION_ADVANCED + ---help--- + Say Y here if you would like to use hard disks under Linux which + were partitioned using Windows 2000's/XP's or Vista's Logical Disk + Manager. They are also known as "Dynamic Disks". + + Note this driver only supports Dynamic Disks with a protective MBR + label, i.e. DOS partition table. It does not support GPT labelled + Dynamic Disks yet as can be created with Vista. + + Windows 2000 introduced the concept of Dynamic Disks to get around + the limitations of the PC's partitioning scheme. The Logical Disk + Manager allows the user to repartition a disk and create spanned, + mirrored, striped or RAID volumes, all without the need for + rebooting. + + Normal partitions are now called Basic Disks under Windows 2000, XP, + and Vista. + + For a fuller description read . + + If unsure, say N. 
+ +config LDM_DEBUG + bool "Windows LDM extra logging" + depends on LDM_PARTITION + help + Say Y here if you would like LDM to log verbosely. This could be + helpful if the driver doesn't work as expected and you'd like to + report a bug. + + If unsure, say N. + +config SGI_PARTITION + bool "SGI partition support" if PARTITION_ADVANCED + default y if DEFAULT_SGI_PARTITION + help + Say Y here if you would like to be able to read the hard disk + partition table format used by SGI machines. + +config ULTRIX_PARTITION + bool "Ultrix partition table support" if PARTITION_ADVANCED + default y if MACH_DECSTATION + help + Say Y here if you would like to be able to read the hard disk + partition table format used by DEC (now Compaq) Ultrix machines. + Otherwise, say N. + +config SUN_PARTITION + bool "Sun partition tables support" if PARTITION_ADVANCED + default y if (SPARC || SUN3 || SUN3X) + ---help--- + Like most systems, SunOS uses its own hard disk partition table + format, incompatible with all others. Saying Y here allows you to + read these partition tables and further mount SunOS partitions from + within Linux if you have also said Y to "UFS file system support", + above. This is mainly used to carry data from a SPARC under SunOS to + your Linux box via a removable medium like magneto-optical or ZIP + drives; note however that a good portable way to transport files and + directories between unixes (and even other operating systems) is + given by the tar program ("man tar" or preferably "info tar"). If + you don't know what all this is about, say N. + +config KARMA_PARTITION + bool "Karma Partition support" + depends on PARTITION_ADVANCED + help + Say Y here if you would like to mount the Rio Karma MP3 player, as it + uses a proprietary partition table. + +config EFI_PARTITION + bool "EFI GUID Partition support" + depends on PARTITION_ADVANCED + select CRC32 + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using EFI GPT. + +config SYSV68_PARTITION + bool "SYSV68 partition table support" if PARTITION_ADVANCED + default y if VME + help + Say Y here if you would like to be able to read the hard disk + partition table format used by Motorola Delta machines (using + sysv68). + Otherwise, say N. diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile new file mode 100644 index 00000000..03af8eac --- /dev/null +++ b/fs/partitions/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for the linux kernel. +# + +obj-$(CONFIG_BLOCK) := check.o + +obj-$(CONFIG_ACORN_PARTITION) += acorn.o +obj-$(CONFIG_AMIGA_PARTITION) += amiga.o +obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_MAC_PARTITION) += mac.o +obj-$(CONFIG_LDM_PARTITION) += ldm.o +obj-$(CONFIG_MSDOS_PARTITION) += msdos.o +obj-$(CONFIG_OSF_PARTITION) += osf.o +obj-$(CONFIG_SGI_PARTITION) += sgi.o +obj-$(CONFIG_SUN_PARTITION) += sun.o +obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o +obj-$(CONFIG_IBM_PARTITION) += ibm.o +obj-$(CONFIG_EFI_PARTITION) += efi.o +obj-$(CONFIG_KARMA_PARTITION) += karma.o +obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c new file mode 100644 index 00000000..fbeb6973 --- /dev/null +++ b/fs/partitions/acorn.c @@ -0,0 +1,556 @@ +/* + * linux/fs/partitions/acorn.c + * + * Copyright (c) 1996-2000 Russell King. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Scan ADFS partitions on hard disk drives. Unfortunately, there + * isn't a standard for partitioning drives on Acorn machines, so + * every single manufacturer of SCSI and IDE cards created their own + * method. + */ +#include +#include + +#include "check.h" +#include "acorn.h" + +/* + * Partition types. (Oh for reusability) + */ +#define PARTITION_RISCIX_MFM 1 +#define PARTITION_RISCIX_SCSI 2 +#define PARTITION_LINUX 9 + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static struct adfs_discrecord * +adfs_partition(struct parsed_partitions *state, char *name, char *data, + unsigned long first_sector, int slot) +{ + struct adfs_discrecord *dr; + unsigned int nr_sects; + + if (adfs_checkbblk(data)) + return NULL; + + dr = (struct adfs_discrecord *)(data + 0x1c0); + + if (dr->disc_size == 0 && dr->disc_size_high == 0) + return NULL; + + nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | + (le32_to_cpu(dr->disc_size) >> 9); + + if (name) { + strlcat(state->pp_buf, " [", PAGE_SIZE); + strlcat(state->pp_buf, name, PAGE_SIZE); + strlcat(state->pp_buf, "]", PAGE_SIZE); + } + put_partition(state, slot, first_sector, nr_sects); + return dr; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_RISCIX + +struct riscix_part { + __le32 start; + __le32 length; + __le32 one; + char name[16]; +}; + +struct riscix_record { + __le32 magic; +#define RISCIX_MAGIC cpu_to_le32(0x4a657320) + __le32 date; + struct riscix_part part[8]; +}; + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static int riscix_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) +{ + Sector sect; + struct riscix_record *rr; + + rr = read_part_sector(state, first_sect, §); + if (!rr) + return -1; + + strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); + + + if (rr->magic == RISCIX_MAGIC) { + unsigned long size = nr_sects > 2 ? 2 : nr_sects; + int part; + + strlcat(state->pp_buf, " <", PAGE_SIZE); + + put_partition(state, slot++, first_sect, size); + for (part = 0; part < 8; part++) { + if (rr->part[part].one && + memcmp(rr->part[part].name, "All\0", 4)) { + put_partition(state, slot++, + le32_to_cpu(rr->part[part].start), + le32_to_cpu(rr->part[part].length)); + strlcat(state->pp_buf, "(", PAGE_SIZE); + strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); + strlcat(state->pp_buf, ")", PAGE_SIZE); + } + } + + strlcat(state->pp_buf, " >\n", PAGE_SIZE); + } else { + put_partition(state, slot++, first_sect, nr_sects); + } + + put_dev_sector(sect); + return slot; +} +#endif +#endif + +#define LINUX_NATIVE_MAGIC 0xdeafa1de +#define LINUX_SWAP_MAGIC 0xdeafab1e + +struct linux_part { + __le32 magic; + __le32 start_sect; + __le32 nr_sects; +}; + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static int linux_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) +{ + Sector sect; + struct linux_part *linuxp; + unsigned long size = nr_sects > 2 ? 
2 : nr_sects; + + strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); + + put_partition(state, slot++, first_sect, size); + + linuxp = read_part_sector(state, first_sect, §); + if (!linuxp) + return -1; + + strlcat(state->pp_buf, " <", PAGE_SIZE); + while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || + linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { + if (slot == state->limit) + break; + put_partition(state, slot++, first_sect + + le32_to_cpu(linuxp->start_sect), + le32_to_cpu(linuxp->nr_sects)); + linuxp ++; + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + + put_dev_sector(sect); + return slot; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_CUMANA +int adfspart_check_CUMANA(struct parsed_partitions *state) +{ + unsigned long first_sector = 0; + unsigned int start_blk = 0; + Sector sect; + unsigned char *data; + char *name = "CUMANA/ADFS"; + int first = 1; + int slot = 1; + + /* + * Try Cumana style partitions - sector 6 contains ADFS boot block + * with pointer to next 'drive'. + * + * There are unknowns in this code - is the 'cylinder number' of the + * next partition relative to the start of this one - I'm assuming + * it is. + * + * Also, which ID did Cumana use? + * + * This is totally unfinished, and will require more work to get it + * going. Hence it is totally untested. + */ + do { + struct adfs_discrecord *dr; + unsigned int nr_sects; + + data = read_part_sector(state, start_blk * 2 + 6, §); + if (!data) + return -1; + + if (slot == state->limit) + break; + + dr = adfs_partition(state, name, data, first_sector, slot++); + if (!dr) + break; + + name = NULL; + + nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) * + (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) * + dr->secspertrack; + + if (!nr_sects) + break; + + first = 0; + first_sector += nr_sects; + start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9); + nr_sects = 0; /* hmm - should be partition size */ + + switch (data[0x1fc] & 15) { + case 0: /* No partition / ADFS? */ + break; + +#ifdef CONFIG_ACORN_PARTITION_RISCIX + case PARTITION_RISCIX_SCSI: + /* RISCiX - we don't know how to find the next one. */ + slot = riscix_partition(state, first_sector, slot, + nr_sects); + break; +#endif + + case PARTITION_LINUX: + slot = linux_partition(state, first_sector, slot, + nr_sects); + break; + } + put_dev_sector(sect); + if (slot == -1) + return -1; + } while (1); + put_dev_sector(sect); + return first ? 0 : 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_ADFS +/* + * Purpose: allocate ADFS partitions. + * + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * + * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok. + * + * Alloc : hda = whole drive + * hda1 = ADFS partition on first drive. + * hda2 = non-ADFS partition. + */ +int adfspart_check_ADFS(struct parsed_partitions *state) +{ + unsigned long start_sect, nr_sects, sectscyl, heads; + Sector sect; + unsigned char *data; + struct adfs_discrecord *dr; + unsigned char id; + int slot = 1; + + data = read_part_sector(state, 6, §); + if (!data) + return -1; + + dr = adfs_partition(state, "ADFS", data, 0, slot++); + if (!dr) { + put_dev_sector(sect); + return 0; + } + + heads = dr->heads + ((dr->lowsector >> 6) & 1); + sectscyl = dr->secspertrack * heads; + start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl; + id = data[0x1fc] & 15; + put_dev_sector(sect); + + /* + * Work out start of non-adfs partition. 
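+ * bd_inode->i_size is the device size in bytes; shifting right by 9 gives
+ * 512-byte sectors, and subtracting the start leaves the remaining length.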
+ */ + nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + + if (start_sect) { + switch (id) { +#ifdef CONFIG_ACORN_PARTITION_RISCIX + case PARTITION_RISCIX_SCSI: + case PARTITION_RISCIX_MFM: + slot = riscix_partition(state, start_sect, slot, + nr_sects); + break; +#endif + + case PARTITION_LINUX: + slot = linux_partition(state, start_sect, slot, + nr_sects); + break; + } + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_ICS + +struct ics_part { + __le32 start; + __le32 size; +}; + +static int adfspart_check_ICSLinux(struct parsed_partitions *state, + unsigned long block) +{ + Sector sect; + unsigned char *data = read_part_sector(state, block, §); + int result = 0; + + if (data) { + if (memcmp(data, "LinuxPart", 9) == 0) + result = 1; + put_dev_sector(sect); + } + + return result; +} + +/* + * Check for a valid ICS partition using the checksum. + */ +static inline int valid_ics_sector(const unsigned char *data) +{ + unsigned long sum; + int i; + + for (i = 0, sum = 0x50617274; i < 508; i++) + sum += data[i]; + + sum -= le32_to_cpu(*(__le32 *)(&data[508])); + + return sum == 0; +} + +/* + * Purpose: allocate ICS partitions. + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. + * Alloc : hda = whole drive + * hda1 = ADFS partition 0 on first drive. + * hda2 = ADFS partition 1 on first drive. + * ..etc.. + */ +int adfspart_check_ICS(struct parsed_partitions *state) +{ + const unsigned char *data; + const struct ics_part *p; + int slot; + Sector sect; + + /* + * Try ICS style partitions - sector 0 contains partition info. + */ + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + if (!valid_ics_sector(data)) { + put_dev_sector(sect); + return 0; + } + + strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); + + for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { + u32 start = le32_to_cpu(p->start); + s32 size = le32_to_cpu(p->size); /* yes, it's signed. */ + + if (slot == state->limit) + break; + + /* + * Negative sizes tell the RISC OS ICS driver to ignore + * this partition - in effect it says that this does not + * contain an ADFS filesystem. + */ + if (size < 0) { + size = -size; + + /* + * Our own extension - We use the first sector + * of the partition to identify what type this + * partition is. We must not make this visible + * to the filesystem. + */ + if (size > 1 && adfspart_check_ICSLinux(state, start)) { + start += 1; + size -= 1; + } + } + + if (size) + put_partition(state, slot++, start, size); + } + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_POWERTEC +struct ptec_part { + __le32 unused1; + __le32 unused2; + __le32 start; + __le32 size; + __le32 unused5; + char type[8]; +}; + +static inline int valid_ptec_sector(const unsigned char *data) +{ + unsigned char checksum = 0x2a; + int i; + + /* + * If it looks like a PC/BIOS partition, then it + * probably isn't PowerTec. + */ + if (data[510] == 0x55 && data[511] == 0xaa) + return 0; + + for (i = 0; i < 511; i++) + checksum += data[i]; + + return checksum == data[511]; +} + +/* + * Purpose: allocate ICS partitions. + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. 
+ * Alloc : hda = whole drive + * hda1 = ADFS partition 0 on first drive. + * hda2 = ADFS partition 1 on first drive. + * ..etc.. + */ +int adfspart_check_POWERTEC(struct parsed_partitions *state) +{ + Sector sect; + const unsigned char *data; + const struct ptec_part *p; + int slot = 1; + int i; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + if (!valid_ptec_sector(data)) { + put_dev_sector(sect); + return 0; + } + + strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); + + for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { + u32 start = le32_to_cpu(p->start); + u32 size = le32_to_cpu(p->size); + + if (size) + put_partition(state, slot++, start, size); + } + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_EESOX +struct eesox_part { + char magic[6]; + char name[10]; + __le32 start; + __le32 unused6; + __le32 unused7; + __le32 unused8; +}; + +/* + * Guess who created this format? + */ +static const char eesox_name[] = { + 'N', 'e', 'i', 'l', ' ', + 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' ' +}; + +/* + * EESOX SCSI partition format. + * + * This is a goddamned awful partition format. We don't seem to store + * the size of the partition in this table, only the start addresses. + * + * There are two possibilities where the size comes from: + * 1. The individual ADFS boot block entries that are placed on the disk. + * 2. The start address of the next entry. + */ +int adfspart_check_EESOX(struct parsed_partitions *state) +{ + Sector sect; + const unsigned char *data; + unsigned char buffer[256]; + struct eesox_part *p; + sector_t start = 0; + int i, slot = 1; + + data = read_part_sector(state, 7, §); + if (!data) + return -1; + + /* + * "Decrypt" the partition table. God knows why... + */ + for (i = 0; i < 256; i++) + buffer[i] = data[i] ^ eesox_name[i & 15]; + + put_dev_sector(sect); + + for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) { + sector_t next; + + if (memcmp(p->magic, "Eesox", 6)) + break; + + next = le32_to_cpu(p->start); + if (i) + put_partition(state, slot++, start, next - start); + start = next; + } + + if (i != 0) { + sector_t size; + + size = get_capacity(state->bdev->bd_disk); + put_partition(state, slot++, start, size - start); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + } + + return i ? 1 : 0; +} +#endif diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h new file mode 100644 index 00000000..ede82852 --- /dev/null +++ b/fs/partitions/acorn.h @@ -0,0 +1,14 @@ +/* + * linux/fs/partitions/acorn.h + * + * Copyright (C) 1996-2001 Russell King. + * + * I _hate_ this partitioning mess - why can't we have one defined + * format, and everyone stick to it? 
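+ *
+ * Each checker below returns 1 if it recognised and parsed its format,
+ * 0 if the format is not present, and -1 on a read error.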
+ */ + +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c new file mode 100644 index 00000000..70cbf44a --- /dev/null +++ b/fs/partitions/amiga.c @@ -0,0 +1,139 @@ +/* + * fs/partitions/amiga.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include +#include + +#include "check.h" +#include "amiga.h" + +static __inline__ u32 +checksum_block(__be32 *m, int size) +{ + u32 sum = 0; + + while (size--) + sum += be32_to_cpu(*m++); + return sum; +} + +int amiga_partition(struct parsed_partitions *state) +{ + Sector sect; + unsigned char *data; + struct RigidDiskBlock *rdb; + struct PartitionBlock *pb; + int start_sect, nr_sects, blk, part, res = 0; + int blksize = 1; /* Multiplier for disk block size */ + int slot = 1; + char b[BDEVNAME_SIZE]; + + for (blk = 0; ; blk++, put_dev_sector(sect)) { + if (blk == RDB_ALLOCATION_LIMIT) + goto rdb_done; + data = read_part_sector(state, blk, §); + if (!data) { + if (warn_no_part) + printk("Dev %s: unable to read RDB block %d\n", + bdevname(state->bdev, b), blk); + res = -1; + goto rdb_done; + } + if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) + continue; + + rdb = (struct RigidDiskBlock *)data; + if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) + break; + /* Try again with 0xdc..0xdf zeroed, Windows might have + * trashed it. + */ + *(__be32 *)(data+0xdc) = 0; + if (checksum_block((__be32 *)data, + be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { + printk("Warning: Trashed word at 0xd0 in block %d " + "ignored in checksum calculation\n",blk); + break; + } + + printk("Dev %s: RDB in block %d has bad checksum\n", + bdevname(state->bdev, b), blk); + } + + /* blksize is blocks per 512 byte standard block */ + blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; + + { + char tmp[7 + 10 + 1 + 1]; + + /* Be more informative */ + snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + blk = be32_to_cpu(rdb->rdb_PartitionList); + put_dev_sector(sect); + for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { + blk *= blksize; /* Read in terms partition table understands */ + data = read_part_sector(state, blk, §); + if (!data) { + if (warn_no_part) + printk("Dev %s: unable to read partition block %d\n", + bdevname(state->bdev, b), blk); + res = -1; + goto rdb_done; + } + pb = (struct PartitionBlock *)data; + blk = be32_to_cpu(pb->pb_Next); + if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) + continue; + if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) + continue; + + /* Tell Kernel about it */ + + nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 - + be32_to_cpu(pb->pb_Environment[9])) * + be32_to_cpu(pb->pb_Environment[3]) * + be32_to_cpu(pb->pb_Environment[5]) * + blksize; + if (!nr_sects) + continue; + start_sect = be32_to_cpu(pb->pb_Environment[9]) * + be32_to_cpu(pb->pb_Environment[3]) * + be32_to_cpu(pb->pb_Environment[5]) * + blksize; + put_partition(state,slot++,start_sect,nr_sects); + { + /* Be even more informative to aid mounting */ + char dostype[4]; + char tmp[42]; + + __be32 *dt = (__be32 *)dostype; + *dt = 
pb->pb_Environment[16]; + if (dostype[3] < ' ') + snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", + dostype[0], dostype[1], + dostype[2], dostype[3] + '@' ); + else + snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", + dostype[0], dostype[1], + dostype[2], dostype[3]); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + snprintf(tmp, sizeof(tmp), "(res %d spb %d)", + be32_to_cpu(pb->pb_Environment[6]), + be32_to_cpu(pb->pb_Environment[4])); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + res = 1; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + +rdb_done: + return res; +} diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h new file mode 100644 index 00000000..d094585c --- /dev/null +++ b/fs/partitions/amiga.h @@ -0,0 +1,6 @@ +/* + * fs/partitions/amiga.h + */ + +int amiga_partition(struct parsed_partitions *state); + diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c new file mode 100644 index 00000000..9875b05e --- /dev/null +++ b/fs/partitions/atari.c @@ -0,0 +1,149 @@ +/* + * fs/partitions/atari.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include +#include "check.h" +#include "atari.h" + +/* ++guenther: this should be settable by the user ("make config")?. + */ +#define ICD_PARTS + +/* check if a partition entry looks valid -- Atari format is assumed if at + least one of the primary entries is ok this way */ +#define VALID_PARTITION(pi,hdsiz) \ + (((pi)->flg & 1) && \ + isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ + be32_to_cpu((pi)->st) <= (hdsiz) && \ + be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) + +static inline int OK_id(char *s) +{ + return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || + memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || + memcmp (s, "RAW", 3) == 0 ; +} + +int atari_partition(struct parsed_partitions *state) +{ + Sector sect; + struct rootsector *rs; + struct partition_info *pi; + u32 extensect; + u32 hd_size; + int slot; +#ifdef ICD_PARTS + int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ +#endif + + rs = read_part_sector(state, 0, §); + if (!rs) + return -1; + + /* Verify this is an Atari rootsector: */ + hd_size = state->bdev->bd_inode->i_size >> 9; + if (!VALID_PARTITION(&rs->part[0], hd_size) && + !VALID_PARTITION(&rs->part[1], hd_size) && + !VALID_PARTITION(&rs->part[2], hd_size) && + !VALID_PARTITION(&rs->part[3], hd_size)) { + /* + * if there's no valid primary partition, assume that no Atari + * format partition table (there's no reliable magic or the like + * :-() + */ + put_dev_sector(sect); + return 0; + } + + pi = &rs->part[0]; + strlcat(state->pp_buf, " AHDI", PAGE_SIZE); + for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { + struct rootsector *xrs; + Sector sect2; + ulong partsect; + + if ( !(pi->flg & 1) ) + continue; + /* active partition */ + if (memcmp (pi->id, "XGM", 3) != 0) { + /* we don't care about other id's */ + put_partition (state, slot, be32_to_cpu(pi->st), + be32_to_cpu(pi->siz)); + continue; + } + /* extension partition */ +#ifdef ICD_PARTS + part_fmt = 1; +#endif + strlcat(state->pp_buf, " XGM<", PAGE_SIZE); + partsect = extensect = be32_to_cpu(pi->st); + while (1) { + xrs = read_part_sector(state, partsect, §2); + if (!xrs) { + printk (" block %ld read failed\n", partsect); + put_dev_sector(sect); + return -1; + } + + /* ++roman: sanity check: bit 0 of flg field must be set */ + if (!(xrs->part[0].flg & 1)) { + printk( "\nFirst sub-partition in 
extended partition is not valid!\n" ); + put_dev_sector(sect2); + break; + } + + put_partition(state, slot, + partsect + be32_to_cpu(xrs->part[0].st), + be32_to_cpu(xrs->part[0].siz)); + + if (!(xrs->part[1].flg & 1)) { + /* end of linked partition list */ + put_dev_sector(sect2); + break; + } + if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { + printk("\nID of extended partition is not XGM!\n"); + put_dev_sector(sect2); + break; + } + + partsect = be32_to_cpu(xrs->part[1].st) + extensect; + put_dev_sector(sect2); + if (++slot == state->limit) { + printk( "\nMaximum number of partitions reached!\n" ); + break; + } + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + } +#ifdef ICD_PARTS + if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ + pi = &rs->icdpart[0]; + /* sanity check: no ICD format if first partition invalid */ + if (OK_id(pi->id)) { + strlcat(state->pp_buf, " ICD<", PAGE_SIZE); + for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { + /* accept only GEM,BGM,RAW,LNX,SWP partitions */ + if (!((pi->flg & 1) && OK_id(pi->id))) + continue; + part_fmt = 2; + put_partition (state, slot, + be32_to_cpu(pi->st), + be32_to_cpu(pi->siz)); + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + } + } +#endif + put_dev_sector(sect); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h new file mode 100644 index 00000000..fe2d32a8 --- /dev/null +++ b/fs/partitions/atari.h @@ -0,0 +1,34 @@ +/* + * fs/partitions/atari.h + * Moved by Russell King from: + * + * linux/include/linux/atari_rootsec.h + * definitions for Atari Rootsector layout + * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de) + * + * modified for ICD/Supra partitioning scheme restricted to at most 12 + * partitions + * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de) + */ + +struct partition_info +{ + u8 flg; /* bit 0: active; bit 7: bootable */ + char id[3]; /* "GEM", "BGM", "XGM", or other */ + __be32 st; /* start of partition */ + __be32 siz; /* length of partition */ +}; + +struct rootsector +{ + char unused[0x156]; /* room for boot code */ + struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */ + char unused2[0xc]; + u32 hd_siz; /* size of disk in blocks */ + struct partition_info part[4]; + u32 bsl_st; /* start of bad sector list */ + u32 bsl_cnt; /* length of bad sector list */ + u16 checksum; /* checksum for bootable disks */ +} __attribute__((__packed__)); + +int atari_partition(struct parsed_partitions *state); diff --git a/fs/partitions/check.c b/fs/partitions/check.c new file mode 100644 index 00000000..811960a5 --- /dev/null +++ b/fs/partitions/check.c @@ -0,0 +1,730 @@ +/* + * fs/partitions/check.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + * + * We now have independent partition support from the + * block drivers, which allows all the partition code to + * be grouped in one location, and it to be mostly self + * contained. 
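The AHDI probe above accepts a root sector only when at least one of the four primary slots passes VALID_PARTITION(): the active flag bit is set, all three id bytes are alphanumeric, and both the start and start+size fit within the device. A minimal user-space sketch of that test, assuming the field layout of struct partition_info and using a plain big-endian helper rather than the kernel's:

#include <stdint.h>
#include <ctype.h>

struct pi {                       /* mirrors struct partition_info: flg, id[3], st, siz */
	uint8_t flg;
	char    id[3];
	uint8_t st[4];
	uint8_t siz[4];
};

static uint32_t be32(const uint8_t *p)
{
	return (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 | (uint32_t)p[2] << 8 | p[3];
}

/* Same conditions as the VALID_PARTITION() macro in atari.c. */
static int valid_partition(const struct pi *pi, uint32_t hd_sectors)
{
	return (pi->flg & 1) &&
	       isalnum((unsigned char)pi->id[0]) &&
	       isalnum((unsigned char)pi->id[1]) &&
	       isalnum((unsigned char)pi->id[2]) &&
	       be32(pi->st) <= hd_sectors &&
	       be32(pi->st) + be32(pi->siz) <= hd_sectors;
}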
+ * + * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl} + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "check.h" + +#include "acorn.h" +#include "amiga.h" +#include "atari.h" +#include "ldm.h" +#include "mac.h" +#include "msdos.h" +#include "osf.h" +#include "sgi.h" +#include "sun.h" +#include "ibm.h" +#include "ultrix.h" +#include "efi.h" +#include "karma.h" +#include "sysv68.h" + +#ifdef CONFIG_BLK_DEV_MD +extern void md_autodetect_dev(dev_t dev); +#endif + +int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ + +static int (*check_part[])(struct parsed_partitions *) = { + /* + * Probe partition formats with tables at disk address 0 + * that also have an ADFS boot block at 0xdc0. + */ +#ifdef CONFIG_ACORN_PARTITION_ICS + adfspart_check_ICS, +#endif +#ifdef CONFIG_ACORN_PARTITION_POWERTEC + adfspart_check_POWERTEC, +#endif +#ifdef CONFIG_ACORN_PARTITION_EESOX + adfspart_check_EESOX, +#endif + + /* + * Now move on to formats that only have partition info at + * disk address 0xdc0. Since these may also have stale + * PC/BIOS partition tables, they need to come before + * the msdos entry. + */ +#ifdef CONFIG_ACORN_PARTITION_CUMANA + adfspart_check_CUMANA, +#endif +#ifdef CONFIG_ACORN_PARTITION_ADFS + adfspart_check_ADFS, +#endif + +#ifdef CONFIG_EFI_PARTITION + efi_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_SGI_PARTITION + sgi_partition, +#endif +#ifdef CONFIG_LDM_PARTITION + ldm_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_MSDOS_PARTITION + msdos_partition, +#endif +#ifdef CONFIG_OSF_PARTITION + osf_partition, +#endif +#ifdef CONFIG_SUN_PARTITION + sun_partition, +#endif +#ifdef CONFIG_AMIGA_PARTITION + amiga_partition, +#endif +#ifdef CONFIG_ATARI_PARTITION + atari_partition, +#endif +#ifdef CONFIG_MAC_PARTITION + mac_partition, +#endif +#ifdef CONFIG_ULTRIX_PARTITION + ultrix_partition, +#endif +#ifdef CONFIG_IBM_PARTITION + ibm_partition, +#endif +#ifdef CONFIG_KARMA_PARTITION + karma_partition, +#endif +#ifdef CONFIG_SYSV68_PARTITION + sysv68_partition, +#endif + NULL +}; + +/* + * disk_name() is used by partition check code and the genhd driver. + * It formats the devicename of the indicated disk into + * the supplied buffer (of size at least 32), and returns + * a pointer to that same buffer (for convenience). + */ + +char *disk_name(struct gendisk *hd, int partno, char *buf) +{ + if (!partno) + snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); + else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); + else + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + + return buf; +} + +const char *bdevname(struct block_device *bdev, char *buf) +{ + return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); +} + +EXPORT_SYMBOL(bdevname); + +/* + * There's very little reason to use this, you should really + * have a struct block_device just about everywhere and use + * bdevname() instead. 
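The naming rule in disk_name() is easy to miss: partition nodes normally get the number appended directly ("sda" becomes "sda1"), but when the disk name already ends in a digit a 'p' separator is inserted ("mmcblk0" becomes "mmcblk0p1") so the result stays unambiguous. A small user-space sketch of the same rule, with illustrative device names only:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static void disk_name_sketch(const char *disk, int partno, char *buf, size_t len)
{
	if (!partno)
		snprintf(buf, len, "%s", disk);
	else if (isdigit((unsigned char)disk[strlen(disk) - 1]))
		snprintf(buf, len, "%sp%d", disk, partno);   /* "mmcblk0" -> "mmcblk0p2" */
	else
		snprintf(buf, len, "%s%d", disk, partno);    /* "sda"     -> "sda2"      */
}

int main(void)
{
	char buf[32];

	disk_name_sketch("mmcblk0", 2, buf, sizeof(buf));
	printf("%s\n", buf);          /* prints mmcblk0p2 */
	disk_name_sketch("sda", 2, buf, sizeof(buf));
	printf("%s\n", buf);          /* prints sda2 */
	return 0;
}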
+ */ +const char *__bdevname(dev_t dev, char *buffer) +{ + scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", + MAJOR(dev), MINOR(dev)); + return buffer; +} + +EXPORT_SYMBOL(__bdevname); + +static struct parsed_partitions * +check_partition(struct gendisk *hd, struct block_device *bdev) +{ + struct parsed_partitions *state; + int i, res, err; + + state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); + if (!state) + return NULL; + state->pp_buf = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf) { + kfree(state); + return NULL; + } + state->pp_buf[0] = '\0'; + + state->bdev = bdev; + disk_name(hd, 0, state->name); + snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); + if (isdigit(state->name[strlen(state->name)-1])) + sprintf(state->name, "p"); + + state->limit = disk_max_parts(hd); + i = res = err = 0; + while (!res && check_part[i]) { + memset(&state->parts, 0, sizeof(state->parts)); + res = check_part[i++](state); + if (res < 0) { + /* We have hit an I/O error which we don't report now. + * But record it, and let the others do their job. + */ + err = res; + res = 0; + } + + } + if (res > 0) { + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + return state; + } + if (state->access_beyond_eod) + err = -ENOSPC; + if (err) + /* The partition is unrecognized. So report I/O errors if there were any */ + res = err; + if (!res) + strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); + else if (warn_no_part) + strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); + + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + kfree(state); + return ERR_PTR(res); +} + +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + +static ssize_t part_start_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); +} + +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); +} + +ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%d\n", p->policy ? 
1 : 0); +} + +ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); +} + +ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%u\n", p->discard_alignment); +} + +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + int cpu; + + cpu = part_stat_lock(); + part_round_stats(cpu, p); + part_stat_unlock(); + return sprintf(buf, + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8u %8u %8u" + "\n", + part_stat_read(p, ios[READ]), + part_stat_read(p, merges[READ]), + (unsigned long long)part_stat_read(p, sectors[READ]), + jiffies_to_msecs(part_stat_read(p, ticks[READ])), + part_stat_read(p, ios[WRITE]), + part_stat_read(p, merges[WRITE]), + (unsigned long long)part_stat_read(p, sectors[WRITE]), + jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), + part_in_flight(p), + jiffies_to_msecs(part_stat_read(p, io_ticks)), + jiffies_to_msecs(part_stat_read(p, time_in_queue))); +} + +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]), + atomic_read(&p->in_flight[1])); +} + +#ifdef CONFIG_FAIL_MAKE_REQUEST +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->make_it_fail); +} + +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hd_struct *p = dev_to_part(dev); + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 
0 : 1; + + return count; +} +#endif + +static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); +static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); +static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, + NULL); +static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +#ifdef CONFIG_FAIL_MAKE_REQUEST +static struct device_attribute dev_attr_fail = + __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); +#endif + +static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, + &dev_attr_start.attr, + &dev_attr_size.attr, + &dev_attr_ro.attr, + &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, + &dev_attr_stat.attr, + &dev_attr_inflight.attr, +#ifdef CONFIG_FAIL_MAKE_REQUEST + &dev_attr_fail.attr, +#endif + NULL +}; + +static struct attribute_group part_attr_group = { + .attrs = part_attrs, +}; + +static const struct attribute_group *part_attr_groups[] = { + &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif + NULL +}; + +static void part_release(struct device *dev) +{ + struct hd_struct *p = dev_to_part(dev); + free_part_stats(p); + free_part_info(p); + kfree(p); +} + +static int part_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct hd_struct *part = dev_to_part(dev); + + add_uevent_var(env, "PARTN=%u", part->partno); + if (part->info && part->info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->info->volname); + return 0; +} + +struct device_type part_type = { + .name = "partition", + .groups = part_attr_groups, + .release = part_release, + .uevent = part_uevent, +}; + +static void delete_partition_rcu_cb(struct rcu_head *head) +{ + struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); + + part->start_sect = 0; + part->nr_sects = 0; + part_stat_set_all(part, 0); + put_device(part_to_dev(part)); +} + +void __delete_partition(struct hd_struct *part) +{ + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + +void delete_partition(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *ptbl = disk->part_tbl; + struct hd_struct *part; + + if (partno >= ptbl->len) + return; + + part = ptbl->part[partno]; + if (!part) + return; + + blk_free_devt(part_devt(part)); + rcu_assign_pointer(ptbl->part[partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); + kobject_put(part->holder_dir); + device_del(part_to_dev(part)); + + hd_struct_put(part); +} + +static ssize_t whole_disk_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} +static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, + whole_disk_show, NULL); + +struct hd_struct *add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags, + struct partition_meta_info *info) +{ + struct hd_struct *p; + dev_t devt = MKDEV(0, 0); + struct device *ddev = disk_to_dev(disk); + struct device *pdev; + struct disk_part_tbl *ptbl; + const char *dname; + int err; + + err = disk_expand_part_tbl(disk, partno); + if (err) + return ERR_PTR(err); + ptbl = disk->part_tbl; + + if (ptbl->part[partno]) + return ERR_PTR(-EBUSY); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return ERR_PTR(-EBUSY); + + if (!init_part_stats(p)) { + err = -ENOMEM; + goto out_free; + } + 
pdev = part_to_dev(p); + + p->start_sect = start; + p->alignment_offset = + queue_limit_alignment_offset(&disk->queue->limits, start); + p->discard_alignment = + queue_limit_discard_alignment(&disk->queue->limits, start); + p->nr_sects = len; + p->partno = partno; + p->policy = get_disk_ro(disk); + + if (info) { + struct partition_meta_info *pinfo = alloc_part_info(disk); + if (!pinfo) + goto out_free_stats; + memcpy(pinfo, info, sizeof(*info)); + p->info = pinfo; + } + + dname = dev_name(ddev); + if (isdigit(dname[strlen(dname) - 1])) + dev_set_name(pdev, "%sp%d", dname, partno); + else + dev_set_name(pdev, "%s%d", dname, partno); + + device_initialize(pdev); + pdev->class = &block_class; + pdev->type = &part_type; + pdev->parent = ddev; + + err = blk_alloc_devt(p, &devt); + if (err) + goto out_free_info; + pdev->devt = devt; + + /* delay uevent until 'holders' subdir is created */ + dev_set_uevent_suppress(pdev, 1); + err = device_add(pdev); + if (err) + goto out_put; + + err = -ENOMEM; + p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!p->holder_dir) + goto out_del; + + dev_set_uevent_suppress(pdev, 0); + if (flags & ADDPART_FLAG_WHOLEDISK) { + err = device_create_file(pdev, &dev_attr_whole_disk); + if (err) + goto out_del; + } + + /* everything is up and running, commence */ + rcu_assign_pointer(ptbl->part[partno], p); + + /* suppress uevent if the disk suppresses it */ + if (!dev_get_uevent_suppress(ddev)) + kobject_uevent(&pdev->kobj, KOBJ_ADD); + + hd_ref_init(p); + return p; + +out_free_info: + free_part_info(p); +out_free_stats: + free_part_stats(p); +out_free: + kfree(p); + return ERR_PTR(err); +out_del: + kobject_put(p->holder_dir); + device_del(pdev); +out_put: + put_device(pdev); + blk_free_devt(devt); + return ERR_PTR(err); +} + +static bool disk_unlock_native_capacity(struct gendisk *disk) +{ + const struct block_device_operations *bdops = disk->fops; + + if (bdops->unlock_native_capacity && + !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { + printk(KERN_CONT "enabling native capacity\n"); + bdops->unlock_native_capacity(disk); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + return true; + } else { + printk(KERN_CONT "truncated\n"); + return false; + } +} + +static int drop_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct disk_part_iter piter; + struct hd_struct *part; + int res; + + if (bdev->bd_part_count) + return -EBUSY; + res = invalidate_partition(disk, 0); + if (res) + return res; + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + delete_partition(disk, part->partno); + disk_part_iter_exit(&piter); + + return 0; +} + +int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct parsed_partitions *state = NULL; + struct hd_struct *part; + int p, highest, res; +rescan: + if (state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } + + res = drop_partitions(disk, bdev); + if (res) + return res; + + if (disk->fops->revalidate_disk) + disk->fops->revalidate_disk(disk); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) + return 0; + if (IS_ERR(state)) { + /* + * I/O error reading the partition table. If any + * partition code tried to read beyond EOD, retry + * after unlocking native capacity. 
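The probe loop in check_partition() earlier in this file has a deliberate shape: each entry of check_part[] is tried in order until one returns a positive result, and negative returns (I/O errors) are remembered but do not stop the scan, so a later format can still claim the disk. A stripped-down user-space sketch of that control flow, with toy probes standing in for the real detectors:

#include <stdio.h>

struct state { int dummy; };
typedef int (*probe_fn)(struct state *);

/* Toy probes: 0 = "not my format", negative = I/O error, positive = recognized. */
static int probe_a(struct state *s) { (void)s; return 0;  }
static int probe_b(struct state *s) { (void)s; return -1; }
static int probe_c(struct state *s) { (void)s; return 1;  }

static probe_fn probes[] = { probe_a, probe_b, probe_c, NULL };

int main(void)
{
	struct state st = { 0 };
	int i = 0, res = 0, err = 0;

	while (!res && probes[i]) {
		res = probes[i++](&st);
		if (res < 0) {            /* remember the error but let later probes run */
			err = res;
			res = 0;
		}
	}
	if (res > 0)
		printf("partition table recognized by probe %d\n", i - 1);
	else if (err)
		printf("unable to read partition table\n");
	else
		printf("unknown partition table\n");
	return 0;
}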
+ */ + if (PTR_ERR(state) == -ENOSPC) { + printk(KERN_WARNING "%s: partition table beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } + return -EIO; + } + /* + * If any partition code tried to read beyond EOD, try + * unlocking native capacity even if partition table is + * successfully read as we could be missing some partitions. + */ + if (state->access_beyond_eod) { + printk(KERN_WARNING + "%s: partition table partially beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } + + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + /* Detect the highest partition number and preallocate + * disk->part_tbl. This is an optimization and not strictly + * necessary. + */ + for (p = 1, highest = 0; p < state->limit; p++) + if (state->parts[p].size) + highest = p; + + disk_expand_part_tbl(disk, highest); + + /* add partitions */ + for (p = 1; p < state->limit; p++) { + sector_t size, from; + struct partition_meta_info *info = NULL; + + size = state->parts[p].size; + if (!size) + continue; + + from = state->parts[p].from; + if (from >= get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d start %llu is beyond EOD, ", + disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) + goto rescan; + continue; + } + + if (from + size > get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d size %llu extends beyond EOD, ", + disk->disk_name, p, (unsigned long long) size); + + if (disk_unlock_native_capacity(disk)) { + /* free state and restart */ + goto rescan; + } else { + /* + * we can not ignore partitions of broken tables + * created by for example camera firmware, but + * we limit them to the end of the disk to avoid + * creating invalid block devices + */ + size = get_capacity(disk) - from; + } + } + + if (state->parts[p].has_info) + info = &state->parts[p].info; + part = add_partition(disk, p, from, size, + state->parts[p].flags, + &state->parts[p].info); + if (IS_ERR(part)) { + printk(KERN_ERR " %s: p%d could not be added: %ld\n", + disk->disk_name, p, -PTR_ERR(part)); + continue; + } +#ifdef CONFIG_BLK_DEV_MD + if (state->parts[p].flags & ADDPART_FLAG_RAID) + md_autodetect_dev(part_to_dev(part)->devt); +#endif + } + kfree(state); + return 0; +} + +int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) +{ + int res; + + if (!bdev->bd_invalidated) + return 0; + + res = drop_partitions(disk, bdev); + if (res) + return res; + + set_capacity(disk, 0); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + return 0; +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + struct page *page; + + page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); + if (!IS_ERR(page)) { + if (PageError(page)) + goto fail; + p->v = page; + return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9); +fail: + page_cache_release(page); + } + p->v = NULL; + return NULL; +} + +EXPORT_SYMBOL(read_dev_sector); diff --git a/fs/partitions/check.h b/fs/partitions/check.h new file mode 100644 index 00000000..d68bf4dc --- /dev/null +++ b/fs/partitions/check.h @@ -0,0 +1,49 @@ +#include +#include +#include + +/* + * 
add_gd_partition adds a partitions details to the devices partition + * description. + */ +struct parsed_partitions { + struct block_device *bdev; + char name[BDEVNAME_SIZE]; + struct { + sector_t from; + sector_t size; + int flags; + bool has_info; + struct partition_meta_info info; + } parts[DISK_MAX_PARTS]; + int next; + int limit; + bool access_beyond_eod; + char *pp_buf; +}; + +static inline void *read_part_sector(struct parsed_partitions *state, + sector_t n, Sector *p) +{ + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; + } + return read_dev_sector(state->bdev, n, p); +} + +static inline void +put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) +{ + if (n < p->limit) { + char tmp[1 + BDEVNAME_SIZE + 10 + 1]; + + p->parts[n].from = from; + p->parts[n].size = size; + snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); + strlcat(p->pp_buf, tmp, PAGE_SIZE); + } +} + +extern int warn_no_part; + diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c new file mode 100644 index 00000000..6296b403 --- /dev/null +++ b/fs/partitions/efi.c @@ -0,0 +1,675 @@ +/************************************************************ + * EFI GUID Partition Table handling + * + * http://www.uefi.org/specs/ + * http://www.intel.com/technology/efi/ + * + * efi.[ch] by Matt Domsch + * Copyright 2000,2001,2002,2004 Dell Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * TODO: + * + * Changelog: + * Mon Nov 09 2004 Matt Domsch + * - test for valid PMBR and valid PGPT before ever reading + * AGPT, allow override with 'gpt' kernel command line option. + * - check for first/last_usable_lba outside of size of disk + * + * Tue Mar 26 2002 Matt Domsch + * - Ported to 2.5.7-pre1 and 2.5.7-dj2 + * - Applied patch to avoid fault in alternate header handling + * - cleaned up find_valid_gpt + * - On-disk structure and copy in memory is *always* LE now - + * swab fields as needed + * - remove print_gpt_header() + * - only use first max_p partition entries, to keep the kernel minor number + * and partition numbers tied. + * + * Mon Feb 04 2002 Matt Domsch + * - Removed __PRIPTR_PREFIX - not being used + * + * Mon Jan 14 2002 Matt Domsch + * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied + * + * Thu Dec 6 2001 Matt Domsch + * - Added compare_gpts(). + * - moved le_efi_guid_to_cpus() back into this file. GPT is the only + * thing that keeps EFI GUIDs on disk. + * - Changed gpt structure names and members to be simpler and more Linux-like. + * + * Wed Oct 17 2001 Matt Domsch + * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck + * + * Wed Oct 10 2001 Matt Domsch + * - Changed function comments to DocBook style per Andreas Dilger suggestion. + * + * Mon Oct 08 2001 Matt Domsch + * - Change read_lba() to use the page cache per Al Viro's work. 
+ * - print u64s properly on all architectures + * - fixed debug_printk(), now Dprintk() + * + * Mon Oct 01 2001 Matt Domsch + * - Style cleanups + * - made most functions static + * - Endianness addition + * - remove test for second alternate header, as it's not per spec, + * and is unnecessary. There's now a method to read/write the last + * sector of an odd-sized disk from user space. No tools have ever + * been released which used this code, so it's effectively dead. + * - Per Asit Mallick of Intel, added a test for a valid PMBR. + * - Added kernel command line option 'gpt' to override valid PMBR test. + * + * Wed Jun 6 2001 Martin Wilck + * - added devfs volume UUID support (/dev/volumes/uuids) for + * mounting file systems by the partition GUID. + * + * Tue Dec 5 2000 Matt Domsch + * - Moved crc32() to linux/lib, added efi_crc32(). + * + * Thu Nov 30 2000 Matt Domsch + * - Replaced Intel's CRC32 function with an equivalent + * non-license-restricted version. + * + * Wed Oct 25 2000 Matt Domsch + * - Fixed the last_lba() call to return the proper last block + * + * Thu Oct 12 2000 Matt Domsch + * - Thanks to Andries Brouwer for his debugging assistance. + * - Code works, detects all the partitions. + * + ************************************************************/ +#include +#include +#include +#include +#include "check.h" +#include "efi.h" + +/* This allows a kernel command line option 'gpt' to override + * the test for invalid PMBR. Not __initdata because reloading + * the partition tables happens after init too. + */ +static int force_gpt; +static int __init +force_gpt_fn(char *str) +{ + force_gpt = 1; + return 1; +} +__setup("gpt", force_gpt_fn); + + +/** + * efi_crc32() - EFI version of crc32 function + * @buf: buffer to calculate crc32 of + * @len - length of buf + * + * Description: Returns EFI-style CRC32 value for @buf + * + * This function uses the little endian Ethernet polynomial + * but seeds the function with ~0, and xor's with ~0 at the end. + * Note, the EFI Specification, v1.02, has a reference to + * Dr. Dobbs Journal, May 1994 (actually it's in May 1992). + */ +static inline u32 +efi_crc32(const void *buf, unsigned long len) +{ + return (crc32(~0L, buf, len) ^ ~0L); +} + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return div_u64(bdev->bd_inode->i_size, + bdev_logical_block_size(bdev)) - 1ULL; +} + +static inline int +pmbr_part_valid(struct partition *part) +{ + if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && + le32_to_cpu(part->start_sect) == 1UL) + return 1; + return 0; +} + +/** + * is_pmbr_valid(): test Protective MBR for validity + * @mbr: pointer to a legacy mbr structure + * + * Description: Returns 1 if PMBR is valid, 0 otherwise. 
+ * Validity depends on two things: + * 1) MSDOS signature is in the last two bytes of the MBR + * 2) One partition of type 0xEE is found + */ +static int +is_pmbr_valid(legacy_mbr *mbr) +{ + int i; + if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) + return 0; + for (i = 0; i < 4; i++) + if (pmbr_part_valid(&mbr->partition_record[i])) + return 1; + return 0; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @size_t + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, + u64 lba, u8 *buffer, size_t count) +{ + size_t totalreadcount = 0; + struct block_device *bdev = state->bdev; + sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + + if (!buffer || lba > last_lba(bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, n++, §); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount +=copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_read_gpt_entries(): reads partition entries from disk + * @state + * @gpt - GPT header + * + * Description: Returns ptes on success, NULL on error. + * Allocates space for PTEs based on information found in @gpt. + * Notes: remember to free pte when you're done! + */ +static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, + gpt_header *gpt) +{ + size_t count; + gpt_entry *pte; + + if (!gpt) + return NULL; + + count = le32_to_cpu(gpt->num_partition_entries) * + le32_to_cpu(gpt->sizeof_partition_entry); + if (!count) + return NULL; + pte = kzalloc(count, GFP_KERNEL); + if (!pte) + return NULL; + + if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), + (u8 *) pte, + count) < count) { + kfree(pte); + pte=NULL; + return NULL; + } + return pte; +} + +/** + * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk + * @state + * @lba is the Logical Block Address of the partition table + * + * Description: returns GPT header on success, NULL on error. Allocates + * and fills a GPT header starting at @ from @state->bdev. + * Note: remember to free gpt when finished with it. + */ +static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, + u64 lba) +{ + gpt_header *gpt; + unsigned ssz = bdev_logical_block_size(state->bdev); + + gpt = kzalloc(ssz, GFP_KERNEL); + if (!gpt) + return NULL; + + if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { + kfree(gpt); + gpt=NULL; + return NULL; + } + + return gpt; +} + +/** + * is_gpt_valid() - tests one GPT header and PTEs for validity + * @state + * @lba is the logical block address of the GPT header to test + * @gpt is a GPT header ptr, filled on return. + * @ptes is a PTEs ptr, filled on return. + * + * Description: returns 1 if valid, 0 on error. + * If valid, returns pointers to newly allocated GPT header and PTEs. 
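To restate the protective-MBR test above in one place: the sector must end with the 0xAA55 MSDOS signature and at least one of the four primary slots must have type 0xEE (EFI GPT) with a starting LBA of 1. A stand-alone sketch over a raw 512-byte sector, using the classic MBR offsets rather than the kernel's struct partition:

#include <stdint.h>

#define MBR_PART_OFS 446          /* offset of the four 16-byte partition records */
#define MBR_SIG_OFS  510          /* offset of the two-byte 0xAA55 signature      */

static uint32_t le32(const uint8_t *p)
{
	return p[0] | (uint32_t)p[1] << 8 | (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

/* Returns 1 if the sector looks like a valid protective MBR, 0 otherwise. */
static int pmbr_valid_sketch(const uint8_t sector[512])
{
	int i;

	if (sector[MBR_SIG_OFS] != 0x55 || sector[MBR_SIG_OFS + 1] != 0xAA)
		return 0;
	for (i = 0; i < 4; i++) {
		const uint8_t *e = sector + MBR_PART_OFS + i * 16;

		/* type byte at entry offset 4, starting LBA (le32) at offset 8 */
		if (e[4] == 0xEE && le32(e + 8) == 1)
			return 1;
	}
	return 0;
}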
+ */ +static int is_gpt_valid(struct parsed_partitions *state, u64 lba, + gpt_header **gpt, gpt_entry **ptes) +{ + u32 crc, origcrc; + u64 lastlba; + + if (!ptes) + return 0; + if (!(*gpt = alloc_read_gpt_header(state, lba))) + return 0; + + /* Check the GUID Partition Table signature */ + if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { + pr_debug("GUID Partition Table Header signature is wrong:" + "%lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->signature), + (unsigned long long)GPT_HEADER_SIGNATURE); + goto fail; + } + + /* Check the GUID Partition Table header size */ + if (le32_to_cpu((*gpt)->header_size) > + bdev_logical_block_size(state->bdev)) { + pr_debug("GUID Partition Table Header size is wrong: %u > %u\n", + le32_to_cpu((*gpt)->header_size), + bdev_logical_block_size(state->bdev)); + goto fail; + } + + /* Check the GUID Partition Table CRC */ + origcrc = le32_to_cpu((*gpt)->header_crc32); + (*gpt)->header_crc32 = 0; + crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); + + if (crc != origcrc) { + pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", + crc, origcrc); + goto fail; + } + (*gpt)->header_crc32 = cpu_to_le32(origcrc); + + /* Check that the my_lba entry points to the LBA that contains + * the GUID Partition Table */ + if (le64_to_cpu((*gpt)->my_lba) != lba) { + pr_debug("GPT my_lba incorrect: %lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->my_lba), + (unsigned long long)lba); + goto fail; + } + + /* Check the first_usable_lba and last_usable_lba are + * within the disk. + */ + lastlba = last_lba(state->bdev); + if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { + pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), + (unsigned long long)lastlba); + goto fail; + } + if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)lastlba); + goto fail; + } + + /* Check that sizeof_partition_entry has the correct value */ + if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { + pr_debug("GUID Partitition Entry Size check failed.\n"); + goto fail; + } + + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) + goto fail; + + /* Check the GUID Partition Entry Array CRC */ + crc = efi_crc32((const unsigned char *) (*ptes), + le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry)); + + if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { + pr_debug("GUID Partitition Entry Array CRC check failed.\n"); + goto fail_ptes; + } + + /* We're done, all's well */ + return 1; + + fail_ptes: + kfree(*ptes); + *ptes = NULL; + fail: + kfree(*gpt); + *gpt = NULL; + return 0; +} + +/** + * is_pte_valid() - tests one PTE for validity + * @pte is the pte to check + * @lastlba is last lba of the disk + * + * Description: returns 1 if valid, 0 on error. + */ +static inline int +is_pte_valid(const gpt_entry *pte, const u64 lastlba) +{ + if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || + le64_to_cpu(pte->starting_lba) > lastlba || + le64_to_cpu(pte->ending_lba) > lastlba) + return 0; + return 1; +} + +/** + * compare_gpts() - Search disk for valid GPT headers and PTEs + * @pgpt is the primary GPT header + * @agpt is the alternate GPT header + * @lastlba is the last LBA number + * Description: Returns nothing. 
Sanity checks pgpt and agpt fields + * and prints warnings on discrepancies. + * + */ +static void +compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) +{ + int error_found = 0; + if (!pgpt || !agpt) + return; + if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { + printk(KERN_WARNING + "GPT:Primary header LBA != Alt. header alternate_lba\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->my_lba), + (unsigned long long)le64_to_cpu(agpt->alternate_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { + printk(KERN_WARNING + "GPT:Primary header alternate_lba != Alt. header my_lba\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->alternate_lba), + (unsigned long long)le64_to_cpu(agpt->my_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->first_usable_lba) != + le64_to_cpu(agpt->first_usable_lba)) { + printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), + (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->last_usable_lba) != + le64_to_cpu(agpt->last_usable_lba)) { + printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), + (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); + error_found++; + } + if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { + printk(KERN_WARNING "GPT:disk_guids don't match.\n"); + error_found++; + } + if (le32_to_cpu(pgpt->num_partition_entries) != + le32_to_cpu(agpt->num_partition_entries)) { + printk(KERN_WARNING "GPT:num_partition_entries don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->num_partition_entries), + le32_to_cpu(agpt->num_partition_entries)); + error_found++; + } + if (le32_to_cpu(pgpt->sizeof_partition_entry) != + le32_to_cpu(agpt->sizeof_partition_entry)) { + printk(KERN_WARNING + "GPT:sizeof_partition_entry values don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->sizeof_partition_entry), + le32_to_cpu(agpt->sizeof_partition_entry)); + error_found++; + } + if (le32_to_cpu(pgpt->partition_entry_array_crc32) != + le32_to_cpu(agpt->partition_entry_array_crc32)) { + printk(KERN_WARNING + "GPT:partition_entry_array_crc32 values don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->partition_entry_array_crc32), + le32_to_cpu(agpt->partition_entry_array_crc32)); + error_found++; + } + if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { + printk(KERN_WARNING + "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->alternate_lba), + (unsigned long long)lastlba); + error_found++; + } + + if (le64_to_cpu(agpt->my_lba) != lastlba) { + printk(KERN_WARNING + "GPT:Alternate GPT header not at the end of the disk.\n"); + printk(KERN_WARNING "GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(agpt->my_lba), + (unsigned long long)lastlba); + error_found++; + } + + if (error_found) + printk(KERN_WARNING + "GPT: Use GNU Parted to correct GPT errors.\n"); + return; +} + +/** + * find_valid_gpt() - Search disk for valid GPT headers and PTEs + * @state + * @gpt is a GPT header ptr, filled on return. + * @ptes is a PTEs ptr, filled on return. + * Description: Returns 1 if valid, 0 on error. 
+ * If valid, returns pointers to newly allocated GPT header and PTEs. + * Validity depends on PMBR being valid (or being overridden by the + * 'gpt' kernel command line option) and finding either the Primary + * GPT header and PTEs valid, or the Alternate GPT header and PTEs + * valid. If the Primary GPT header is not valid, the Alternate GPT header + * is not checked unless the 'gpt' kernel command line option is passed. + * This protects against devices which misreport their size, and forces + * the user to decide to use the Alternate GPT. + */ +static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, + gpt_entry **ptes) +{ + int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; + gpt_header *pgpt = NULL, *agpt = NULL; + gpt_entry *pptes = NULL, *aptes = NULL; + legacy_mbr *legacymbr; + u64 lastlba; + + if (!ptes) + return 0; + + lastlba = last_lba(state->bdev); + if (!force_gpt) { + /* This will be added to the EFI Spec. per Intel after v1.02. */ + legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); + if (legacymbr) { + read_lba(state, 0, (u8 *) legacymbr, + sizeof (*legacymbr)); + good_pmbr = is_pmbr_valid(legacymbr); + kfree(legacymbr); + } + if (!good_pmbr) + goto fail; + } + + good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, + &pgpt, &pptes); + if (good_pgpt) + good_agpt = is_gpt_valid(state, + le64_to_cpu(pgpt->alternate_lba), + &agpt, &aptes); + if (!good_agpt && force_gpt) + good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + + /* The obviously unsuccessful case */ + if (!good_pgpt && !good_agpt) + goto fail; + + compare_gpts(pgpt, agpt, lastlba); + + /* The good cases */ + if (good_pgpt) { + *gpt = pgpt; + *ptes = pptes; + kfree(agpt); + kfree(aptes); + if (!good_agpt) { + printk(KERN_WARNING + "Alternate GPT is invalid, " + "using primary GPT.\n"); + } + return 1; + } + else if (good_agpt) { + *gpt = agpt; + *ptes = aptes; + kfree(pgpt); + kfree(pptes); + printk(KERN_WARNING + "Primary GPT is invalid, using alternate GPT.\n"); + return 1; + } + + fail: + kfree(pgpt); + kfree(agpt); + kfree(pptes); + kfree(aptes); + *gpt = NULL; + *ptes = NULL; + return 0; +} + +/** + * efi_partition(struct parsed_partitions *state) + * @state + * + * Description: called from check.c, if the disk contains GPT + * partitions, sets up partition entries in the kernel. + * + * If the first block on the disk is a legacy MBR, + * it will get handled by msdos_partition(). + * If it's a Protective MBR, we'll handle it here. + * + * We do not create a Linux partition for GPT, but + * only for the actual data partitions. + * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + * + */ +int efi_partition(struct parsed_partitions *state) +{ + gpt_header *gpt = NULL; + gpt_entry *ptes = NULL; + u32 i; + unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + u8 unparsed_guid[37]; + + if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { + kfree(gpt); + kfree(ptes); + return 0; + } + + pr_debug("GUID Partition Table is valid! 
Yea!\n"); + + for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + struct partition_meta_info *info; + unsigned label_count = 0; + unsigned label_max; + u64 start = le64_to_cpu(ptes[i].starting_lba); + u64 size = le64_to_cpu(ptes[i].ending_lba) - + le64_to_cpu(ptes[i].starting_lba) + 1ULL; + + if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + continue; + + put_partition(state, i+1, start * ssz, size * ssz); + + /* If this is a RAID volume, tell md */ + if (!efi_guidcmp(ptes[i].partition_type_guid, + PARTITION_LINUX_RAID_GUID)) + state->parts[i + 1].flags = ADDPART_FLAG_RAID; + + info = &state->parts[i + 1].info; + /* Instead of doing a manual swap to big endian, reuse the + * common ASCII hex format as the interim. + */ + efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); + part_pack_uuid(unparsed_guid, info->uuid); + + /* Naively convert UTF16-LE to 7 bits. */ + label_max = min(sizeof(info->volname) - 1, + sizeof(ptes[i].partition_name)); + info->volname[label_max] = 0; + while (label_count < label_max) { + u8 c = ptes[i].partition_name[label_count] & 0xff; + if (c && !isprint(c)) + c = '!'; + info->volname[label_count] = c; + label_count++; + } + state->parts[i + 1].has_info = true; + } + kfree(ptes); + kfree(gpt); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h new file mode 100644 index 00000000..b69ab729 --- /dev/null +++ b/fs/partitions/efi.h @@ -0,0 +1,134 @@ +/************************************************************ + * EFI GUID Partition Table + * Per Intel EFI Specification v1.02 + * http://developer.intel.com/technology/efi/efi.htm + * + * By Matt Domsch Fri Sep 22 22:15:56 CDT 2000 + * Copyright 2000,2001 Dell Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + ************************************************************/ + +#ifndef FS_PART_EFI_H_INCLUDED +#define FS_PART_EFI_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include + +#define MSDOS_MBR_SIGNATURE 0xaa55 +#define EFI_PMBR_OSTYPE_EFI 0xEF +#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE + +#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL +#define GPT_HEADER_REVISION_V1 0x00010000 +#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 + +#define PARTITION_SYSTEM_GUID \ + EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \ + 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B) +#define LEGACY_MBR_PARTITION_GUID \ + EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \ + 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F) +#define PARTITION_MSFT_RESERVED_GUID \ + EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \ + 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE) +#define PARTITION_BASIC_DATA_GUID \ + EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \ + 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7) +#define PARTITION_LINUX_RAID_GUID \ + EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \ + 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e) +#define PARTITION_LINUX_SWAP_GUID \ + EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \ + 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f) +#define PARTITION_LINUX_LVM_GUID \ + EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \ + 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28) + +typedef struct _gpt_header { + __le64 signature; + __le32 revision; + __le32 header_size; + __le32 header_crc32; + __le32 reserved1; + __le64 my_lba; + __le64 alternate_lba; + __le64 first_usable_lba; + __le64 last_usable_lba; + efi_guid_t disk_guid; + __le64 partition_entry_lba; + __le32 num_partition_entries; + __le32 sizeof_partition_entry; + __le32 partition_entry_array_crc32; + + /* The rest of the logical block is reserved by UEFI and must be zero. + * EFI standard handles this by: + * + * uint8_t reserved2[ BlockSize - 92 ]; + */ +} __attribute__ ((packed)) gpt_header; + +typedef struct _gpt_entry_attributes { + u64 required_to_function:1; + u64 reserved:47; + u64 type_guid_specific:16; +} __attribute__ ((packed)) gpt_entry_attributes; + +typedef struct _gpt_entry { + efi_guid_t partition_type_guid; + efi_guid_t unique_partition_guid; + __le64 starting_lba; + __le64 ending_lba; + gpt_entry_attributes attributes; + efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; +} __attribute__ ((packed)) gpt_entry; + +typedef struct _legacy_mbr { + u8 boot_code[440]; + __le32 unique_mbr_signature; + __le16 unknown; + struct partition partition_record[4]; + __le16 signature; +} __attribute__ ((packed)) legacy_mbr; + +/* Functions */ +extern int efi_partition(struct parsed_partitions *state); + +#endif + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
+ * -------------------------------------------------------------------------- + * Local variables: + * c-indent-level: 4 + * c-brace-imaginary-offset: 0 + * c-brace-offset: -4 + * c-argdecl-indent: 4 + * c-label-offset: -4 + * c-continued-statement-offset: 4 + * c-continued-brace-offset: 0 + * indent-tabs-mode: nil + * tab-width: 8 + * End: + */ diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c new file mode 100644 index 00000000..d513a07f --- /dev/null +++ b/fs/partitions/ibm.c @@ -0,0 +1,275 @@ +/* + * File...........: linux/fs/partitions/ibm.c + * Author(s)......: Holger Smolinski + * Volker Sameske + * Bugreports.to..: + * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "check.h" +#include "ibm.h" + +/* + * compute the block number from a + * cyl-cyl-head-head structure + */ +static sector_t +cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) { + + sector_t cyl; + __u16 head; + + /*decode cylinder and heads for large volumes */ + cyl = ptr->hh & 0xFFF0; + cyl <<= 12; + cyl |= ptr->cc; + head = ptr->hh & 0x000F; + return cyl * geo->heads * geo->sectors + + head * geo->sectors; +} + +/* + * compute the block number from a + * cyl-cyl-head-head-block structure + */ +static sector_t +cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { + + sector_t cyl; + __u16 head; + + /*decode cylinder and heads for large volumes */ + cyl = ptr->hh & 0xFFF0; + cyl <<= 12; + cyl |= ptr->cc; + head = ptr->hh & 0x000F; + return cyl * geo->heads * geo->sectors + + head * geo->sectors + + ptr->b; +} + +/* + */ +int ibm_partition(struct parsed_partitions *state) +{ + struct block_device *bdev = state->bdev; + int blocksize, res; + loff_t i_size, offset, size, fmt_size; + dasd_information2_t *info; + struct hd_geometry *geo; + char type[5] = {0,}; + char name[7] = {0,}; + union label_t { + struct vtoc_volume_label_cdl vol; + struct vtoc_volume_label_ldl lnx; + struct vtoc_cms_label cms; + } *label; + unsigned char *data; + Sector sect; + sector_t labelsect; + char tmp[64]; + + res = 0; + blocksize = bdev_logical_block_size(bdev); + if (blocksize <= 0) + goto out_exit; + i_size = i_size_read(bdev->bd_inode); + if (i_size == 0) + goto out_exit; + + info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); + if (info == NULL) + goto out_exit; + geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL); + if (geo == NULL) + goto out_nogeo; + label = kmalloc(sizeof(union label_t), GFP_KERNEL); + if (label == NULL) + goto out_nolab; + + if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0 || + ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) + goto out_freeall; + + /* + * Special case for FBA disks: label sector does not depend on + * blocksize. + */ + if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || + (info->cu_type == 0x3880 && info->dev_type == 0x3370)) + labelsect = info->label_block; + else + labelsect = info->label_block * (blocksize >> 9); + + /* + * Get volume label, extract name and type. 
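The two helpers above undo the large-volume encoding used by DASD: the cylinder number is spread over the cc field and the upper 12 bits of hh, so cylinder = ((hh & 0xFFF0) << 12) | cc and head = hh & 0x000F, and the linear block is cylinder * heads * sectors + head * sectors (plus the record number for the cchhb variant). A user-space sketch of the same arithmetic with plain integers instead of the vtoc structures; the geometry used in main() is hypothetical:

#include <stdint.h>
#include <stdio.h>

static uint64_t cchh2blk_sketch(uint16_t cc, uint16_t hh,
				unsigned heads, unsigned sectors)
{
	uint64_t cyl = ((uint64_t)(hh & 0xFFF0) << 12) | cc;   /* large-volume cylinder */
	unsigned head = hh & 0x000F;

	return cyl * heads * sectors + head * sectors;
}

int main(void)
{
	/* Hypothetical geometry: 15 heads, 12 blocks per track. */
	printf("%llu\n", (unsigned long long)cchh2blk_sketch(0x0010, 0x0003, 15, 12));
	return 0;
}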
+ */ + data = read_part_sector(state, labelsect, §); + if (data == NULL) + goto out_readerr; + + memcpy(label, data, sizeof(union label_t)); + put_dev_sector(sect); + + if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) { + strncpy(type, label->vol.vollbl, 4); + strncpy(name, label->vol.volid, 6); + } else { + strncpy(type, label->lnx.vollbl, 4); + strncpy(name, label->lnx.volid, 6); + } + EBCASC(type, 4); + EBCASC(name, 6); + + res = 1; + + /* + * Three different formats: LDL, CDL and unformated disk + * + * identified by info->format + * + * unformated disks we do not have to care about + */ + if (info->format == DASD_FORMAT_LDL) { + if (strncmp(type, "CMS1", 4) == 0) { + /* + * VM style CMS1 labeled disk + */ + blocksize = label->cms.block_size; + if (label->cms.disk_offset != 0) { + snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* disk is reserved minidisk */ + offset = label->cms.disk_offset; + size = (label->cms.block_count - 1) + * (blocksize >> 9); + } else { + snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + offset = (info->label_block + 1); + size = label->cms.block_count + * (blocksize >> 9); + } + put_partition(state, 1, offset*(blocksize >> 9), + size-offset*(blocksize >> 9)); + } else { + if (strncmp(type, "LNX1", 4) == 0) { + snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + if (label->lnx.ldl_version == 0xf2) { + fmt_size = label->lnx.formatted_blocks + * (blocksize >> 9); + } else if (!strcmp(info->type, "ECKD")) { + /* formated w/o large volume support */ + fmt_size = geo->cylinders * geo->heads + * geo->sectors * (blocksize >> 9); + } else { + /* old label and no usable disk geometry + * (e.g. 
DIAG) */ + fmt_size = i_size >> 9; + } + size = i_size >> 9; + if (fmt_size < size) + size = fmt_size; + offset = (info->label_block + 1); + } else { + /* unlabeled disk */ + strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); + size = i_size >> 9; + offset = (info->label_block + 1); + } + put_partition(state, 1, offset*(blocksize >> 9), + size-offset*(blocksize >> 9)); + } + } else if (info->format == DASD_FORMAT_CDL) { + /* + * New style CDL formatted disk + */ + sector_t blk; + int counter; + + /* + * check if VOL1 label is available + * if not, something is wrong, skipping partition detection + */ + if (strncmp(type, "VOL1", 4) == 0) { + snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* + * get block number and read then go through format1 + * labels + */ + blk = cchhb2blk(&label->vol.vtoc, geo) + 1; + counter = 0; + data = read_part_sector(state, blk * (blocksize/512), + §); + while (data != NULL) { + struct vtoc_format1_label f1; + + memcpy(&f1, data, + sizeof(struct vtoc_format1_label)); + put_dev_sector(sect); + + /* skip FMT4 / FMT5 / FMT7 labels */ + if (f1.DS1FMTID == _ascebc['4'] + || f1.DS1FMTID == _ascebc['5'] + || f1.DS1FMTID == _ascebc['7'] + || f1.DS1FMTID == _ascebc['9']) { + blk++; + data = read_part_sector(state, + blk * (blocksize/512), §); + continue; + } + + /* only FMT1 and 8 labels valid at this point */ + if (f1.DS1FMTID != _ascebc['1'] && + f1.DS1FMTID != _ascebc['8']) + break; + + /* OK, we got valid partition data */ + offset = cchh2blk(&f1.DS1EXT1.llimit, geo); + size = cchh2blk(&f1.DS1EXT1.ulimit, geo) - + offset + geo->sectors; + if (counter >= state->limit) + break; + put_partition(state, counter + 1, + offset * (blocksize >> 9), + size * (blocksize >> 9)); + counter++; + blk++; + data = read_part_sector(state, + blk * (blocksize/512), §); + } + + if (!data) + /* Are we not supposed to report this ? */ + goto out_readerr; + } else + printk(KERN_WARNING "Warning, expected Label VOL1 not " + "found, treating as CDL formated Disk"); + + } + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + goto out_freeall; + + +out_readerr: + res = -1; +out_freeall: + kfree(label); +out_nolab: + kfree(geo); +out_nogeo: + kfree(info); +out_exit: + return res; +} diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h new file mode 100644 index 00000000..08fb0804 --- /dev/null +++ b/fs/partitions/ibm.h @@ -0,0 +1 @@ +int ibm_partition(struct parsed_partitions *); diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c new file mode 100644 index 00000000..0ea19312 --- /dev/null +++ b/fs/partitions/karma.c @@ -0,0 +1,57 @@ +/* + * fs/partitions/karma.c + * Rio Karma partition info. 
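One detail of the CDL walk above is easy to overlook: each format-1 label gives an extent as lower and upper cylinder/head limits, the upper limit is inclusive, so the size adds one full track (geo->sectors) on top of the difference, and everything is then scaled from the DASD block size to 512-byte sectors with (blocksize >> 9). A sketch of that arithmetic with hypothetical values; the two limit values stand for results of the cchh2blk_sketch() helper shown earlier:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned sectors   = 12;      /* blocks per track (hypothetical geometry) */
	unsigned blocksize = 4096;    /* DASD block size in bytes                 */
	uint64_t llimit = 1800;       /* cchh2blk() of the extent's lower limit   */
	uint64_t ulimit = 3599;       /* cchh2blk() of the extent's upper limit   */
	uint64_t offset = llimit;
	uint64_t size   = ulimit - llimit + sectors;   /* upper limit is inclusive */

	printf("start %llu size %llu (512-byte sectors)\n",
	       (unsigned long long)(offset * (blocksize >> 9)),
	       (unsigned long long)(size * (blocksize >> 9)));
	return 0;
}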
+ * + * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com) + * based on osf.c + */ + +#include "check.h" +#include "karma.h" + +int karma_partition(struct parsed_partitions *state) +{ + int i; + int slot = 1; + Sector sect; + unsigned char *data; + struct disklabel { + u8 d_reserved[270]; + struct d_partition { + __le32 p_res; + u8 p_fstype; + u8 p_res2[3]; + __le32 p_offset; + __le32 p_size; + } d_partitions[2]; + u8 d_blank[208]; + __le16 d_magic; + } __attribute__((packed)) *label; + struct d_partition *p; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + label = (struct disklabel *)data; + if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) { + put_dev_sector(sect); + return 0; + } + + p = label->d_partitions; + for (i = 0 ; i < 2; i++, p++) { + if (slot == state->limit) + break; + + if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) { + put_partition(state, slot, le32_to_cpu(p->p_offset), + le32_to_cpu(p->p_size)); + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} + diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h new file mode 100644 index 00000000..c764b2e9 --- /dev/null +++ b/fs/partitions/karma.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/karma.h + */ + +#define KARMA_LABEL_MAGIC 0xAB56 + +int karma_partition(struct parsed_partitions *state); + diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c new file mode 100644 index 00000000..af9fdf04 --- /dev/null +++ b/fs/partitions/ldm.c @@ -0,0 +1,1568 @@ +/** + * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) + * + * Copyright (C) 2001,2002 Richard Russon + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (C) 2001,2002 Jakob Kemi + * + * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program (in the main directory of the source in the file COPYING); if + * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include "ldm.h" +#include "check.h" +#include "msdos.h" + +/** + * ldm_debug/info/error/crit - Output an error message + * @f: A printf format string containing the message + * @...: Variables to substitute into @f + * + * ldm_debug() writes a DEBUG level message to the syslog but only if the + * driver was compiled with debug enabled. Otherwise, the call turns into a NOP. + */ +#ifndef CONFIG_LDM_DEBUG +#define ldm_debug(...) do {} while (0) +#else +#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) +#endif + +#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) +#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) +#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) + +__attribute__ ((format (printf, 3, 4))) +static void _ldm_printk (const char *level, const char *function, + const char *fmt, ...) 
+{ + static char buf[128]; + va_list args; + + va_start (args, fmt); + vsnprintf (buf, sizeof (buf), fmt, args); + va_end (args); + + printk ("%s%s(): %s\n", level, function, buf); +} + +/** + * ldm_parse_hexbyte - Convert a ASCII hex number to a byte + * @src: Pointer to at least 2 characters to convert. + * + * Convert a two character ASCII hex string to a number. + * + * Return: 0-255 Success, the byte was parsed correctly + * -1 Error, an invalid character was supplied + */ +static int ldm_parse_hexbyte (const u8 *src) +{ + unsigned int x; /* For correct wrapping */ + int h; + + /* high part */ + x = h = hex_to_bin(src[0]); + if (h < 0) + return -1; + + /* low part */ + h = hex_to_bin(src[1]); + if (h < 0) + return -1; + + return (x << 4) + h; +} + +/** + * ldm_parse_guid - Convert GUID from ASCII to binary + * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba + * @dest: Memory block to hold binary GUID (16 bytes) + * + * N.B. The GUID need not be NULL terminated. + * + * Return: 'true' @dest contains binary GUID + * 'false' @dest contents are undefined + */ +static bool ldm_parse_guid (const u8 *src, u8 *dest) +{ + static const int size[] = { 4, 2, 2, 2, 6 }; + int i, j, v; + + if (src[8] != '-' || src[13] != '-' || + src[18] != '-' || src[23] != '-') + return false; + + for (j = 0; j < 5; j++, src++) + for (i = 0; i < size[j]; i++, src+=2, *dest++ = v) + if ((v = ldm_parse_hexbyte (src)) < 0) + return false; + + return true; +} + +/** + * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure + * @data: Raw database PRIVHEAD structure loaded from the device + * @ph: In-memory privhead structure in which to return parsed information + * + * This parses the LDM database PRIVHEAD structure supplied in @data and + * sets up the in-memory privhead structure @ph with the obtained information. + * + * Return: 'true' @ph contains the PRIVHEAD data + * 'false' @ph contents are undefined + */ +static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) +{ + bool is_vista = false; + + BUG_ON(!data || !ph); + if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { + ldm_error("Cannot find PRIVHEAD structure. LDM database is" + " corrupt. Aborting."); + return false; + } + ph->ver_major = get_unaligned_be16(data + 0x000C); + ph->ver_minor = get_unaligned_be16(data + 0x000E); + ph->logical_disk_start = get_unaligned_be64(data + 0x011B); + ph->logical_disk_size = get_unaligned_be64(data + 0x0123); + ph->config_start = get_unaligned_be64(data + 0x012B); + ph->config_size = get_unaligned_be64(data + 0x0133); + /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ + if (ph->ver_major == 2 && ph->ver_minor == 12) + is_vista = true; + if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) { + ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d." + " Aborting.", ph->ver_major, ph->ver_minor); + return false; + } + ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major, + ph->ver_minor, is_vista ? "Vista" : "2000/XP"); + if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */ + /* Warn the user and continue, carefully. 
*/ + ldm_info("Database is normally %u bytes, it claims to " + "be %llu bytes.", LDM_DB_SIZE, + (unsigned long long)ph->config_size); + } + if ((ph->logical_disk_size == 0) || (ph->logical_disk_start + + ph->logical_disk_size > ph->config_start)) { + ldm_error("PRIVHEAD disk size doesn't match real disk size"); + return false; + } + if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) { + ldm_error("PRIVHEAD contains an invalid GUID."); + return false; + } + ldm_debug("Parsed PRIVHEAD successfully."); + return true; +} + +/** + * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure + * @data: Raw database TOCBLOCK structure loaded from the device + * @toc: In-memory toc structure in which to return parsed information + * + * This parses the LDM Database TOCBLOCK (table of contents) structure supplied + * in @data and sets up the in-memory tocblock structure @toc with the obtained + * information. + * + * N.B. The *_start and *_size values returned in @toc are not range-checked. + * + * Return: 'true' @toc contains the TOCBLOCK data + * 'false' @toc contents are undefined + */ +static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) +{ + BUG_ON (!data || !toc); + + if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { + ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); + return false; + } + strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); + toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; + toc->bitmap1_start = get_unaligned_be64(data + 0x2E); + toc->bitmap1_size = get_unaligned_be64(data + 0x36); + + if (strncmp (toc->bitmap1_name, TOC_BITMAP1, + sizeof (toc->bitmap1_name)) != 0) { + ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.", + TOC_BITMAP1, toc->bitmap1_name); + return false; + } + strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); + toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; + toc->bitmap2_start = get_unaligned_be64(data + 0x50); + toc->bitmap2_size = get_unaligned_be64(data + 0x58); + if (strncmp (toc->bitmap2_name, TOC_BITMAP2, + sizeof (toc->bitmap2_name)) != 0) { + ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", + TOC_BITMAP2, toc->bitmap2_name); + return false; + } + ldm_debug ("Parsed TOCBLOCK successfully."); + return true; +} + +/** + * ldm_parse_vmdb - Read the LDM Database VMDB structure + * @data: Raw database VMDB structure loaded from the device + * @vm: In-memory vmdb structure in which to return parsed information + * + * This parses the LDM Database VMDB structure supplied in @data and sets up + * the in-memory vmdb structure @vm with the obtained information. + * + * N.B. The *_start, *_size and *_seq values will be range-checked later. + * + * Return: 'true' @vm contains VMDB info + * 'false' @vm contents are undefined + */ +static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) +{ + BUG_ON (!data || !vm); + + if (MAGIC_VMDB != get_unaligned_be32(data)) { + ldm_crit ("Cannot find the VMDB, database may be corrupt."); + return false; + } + + vm->ver_major = get_unaligned_be16(data + 0x12); + vm->ver_minor = get_unaligned_be16(data + 0x14); + if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { + ldm_error ("Expected VMDB version %d.%d, got %d.%d. 
" + "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); + return false; + } + + vm->vblk_size = get_unaligned_be32(data + 0x08); + if (vm->vblk_size == 0) { + ldm_error ("Illegal VBLK size"); + return false; + } + + vm->vblk_offset = get_unaligned_be32(data + 0x0C); + vm->last_vblk_seq = get_unaligned_be32(data + 0x04); + + ldm_debug ("Parsed VMDB successfully."); + return true; +} + +/** + * ldm_compare_privheads - Compare two privhead objects + * @ph1: First privhead + * @ph2: Second privhead + * + * This compares the two privhead structures @ph1 and @ph2. + * + * Return: 'true' Identical + * 'false' Different + */ +static bool ldm_compare_privheads (const struct privhead *ph1, + const struct privhead *ph2) +{ + BUG_ON (!ph1 || !ph2); + + return ((ph1->ver_major == ph2->ver_major) && + (ph1->ver_minor == ph2->ver_minor) && + (ph1->logical_disk_start == ph2->logical_disk_start) && + (ph1->logical_disk_size == ph2->logical_disk_size) && + (ph1->config_start == ph2->config_start) && + (ph1->config_size == ph2->config_size) && + !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE)); +} + +/** + * ldm_compare_tocblocks - Compare two tocblock objects + * @toc1: First toc + * @toc2: Second toc + * + * This compares the two tocblock structures @toc1 and @toc2. + * + * Return: 'true' Identical + * 'false' Different + */ +static bool ldm_compare_tocblocks (const struct tocblock *toc1, + const struct tocblock *toc2) +{ + BUG_ON (!toc1 || !toc2); + + return ((toc1->bitmap1_start == toc2->bitmap1_start) && + (toc1->bitmap1_size == toc2->bitmap1_size) && + (toc1->bitmap2_start == toc2->bitmap2_start) && + (toc1->bitmap2_size == toc2->bitmap2_size) && + !strncmp (toc1->bitmap1_name, toc2->bitmap1_name, + sizeof (toc1->bitmap1_name)) && + !strncmp (toc1->bitmap2_name, toc2->bitmap2_name, + sizeof (toc1->bitmap2_name))); +} + +/** + * ldm_validate_privheads - Compare the primary privhead with its backups + * @state: Partition check state including device holding the LDM Database + * @ph1: Memory struct to fill with ph contents + * + * Read and compare all three privheads from disk. + * + * The privheads on disk show the size and location of the main disk area and + * the configuration area (the database). The values are range-checked against + * @hd, which contains the real size of the disk. 
+ * + * Return: 'true' Success + * 'false' Error + */ +static bool ldm_validate_privheads(struct parsed_partitions *state, + struct privhead *ph1) +{ + static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; + struct privhead *ph[3] = { ph1 }; + Sector sect; + u8 *data; + bool result = false; + long num_sects; + int i; + + BUG_ON (!state || !ph1); + + ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); + ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); + if (!ph[1] || !ph[2]) { + ldm_crit ("Out of memory."); + goto out; + } + + /* off[1 & 2] are relative to ph[0]->config_start */ + ph[0]->config_start = 0; + + /* Read and parse privheads */ + for (i = 0; i < 3; i++) { + data = read_part_sector(state, ph[0]->config_start + off[i], + §); + if (!data) { + ldm_crit ("Disk read failed."); + goto out; + } + result = ldm_parse_privhead (data, ph[i]); + put_dev_sector (sect); + if (!result) { + ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */ + if (i < 2) + goto out; /* Already logged */ + else + break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */ + } + } + + num_sects = state->bdev->bd_inode->i_size >> 9; + + if ((ph[0]->config_start > num_sects) || + ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { + ldm_crit ("Database extends beyond the end of the disk."); + goto out; + } + + if ((ph[0]->logical_disk_start > ph[0]->config_start) || + ((ph[0]->logical_disk_start + ph[0]->logical_disk_size) + > ph[0]->config_start)) { + ldm_crit ("Disk and database overlap."); + goto out; + } + + if (!ldm_compare_privheads (ph[0], ph[1])) { + ldm_crit ("Primary and backup PRIVHEADs don't match."); + goto out; + } + /* FIXME ignore this for now + if (!ldm_compare_privheads (ph[0], ph[2])) { + ldm_crit ("Primary and backup PRIVHEADs don't match."); + goto out; + }*/ + ldm_debug ("Validated PRIVHEADs successfully."); + result = true; +out: + kfree (ph[1]); + kfree (ph[2]); + return result; +} + +/** + * ldm_validate_tocblocks - Validate the table of contents and its backups + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database + * @ldb: Cache of the database structures + * + * Find and compare the four tables of contents of the LDM Database stored on + * @state->bdev and return the parsed information into @toc1. + * + * The offsets and sizes of the configs are range-checked against a privhead. + * + * Return: 'true' @toc1 contains validated TOCBLOCK info + * 'false' @toc1 contents are undefined + */ +static bool ldm_validate_tocblocks(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) +{ + static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; + struct tocblock *tb[4]; + struct privhead *ph; + Sector sect; + u8 *data; + int i, nr_tbs; + bool result = false; + + BUG_ON(!state || !ldb); + ph = &ldb->ph; + tb[0] = &ldb->toc; + tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); + if (!tb[1]) { + ldm_crit("Out of memory."); + goto err; + } + tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1])); + tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2])); + /* + * Try to read and parse all four TOCBLOCKs. + * + * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so + * skip any that fail as long as we get at least one valid TOCBLOCK. 
+ */ + for (nr_tbs = i = 0; i < 4; i++) { + data = read_part_sector(state, base + off[i], &sect); + if (!data) { + ldm_error("Disk read failed for TOCBLOCK %d.", i); + continue; + } + if (ldm_parse_tocblock(data, tb[nr_tbs])) + nr_tbs++; + put_dev_sector(sect); + } + if (!nr_tbs) { + ldm_crit("Failed to find a valid TOCBLOCK."); + goto err; + } + /* Range check the TOCBLOCK against a privhead. */ + if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) || + ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > + ph->config_size)) { + ldm_crit("The bitmaps are out of range. Giving up."); + goto err; + } + /* Compare all loaded TOCBLOCKs. */ + for (i = 1; i < nr_tbs; i++) { + if (!ldm_compare_tocblocks(tb[0], tb[i])) { + ldm_crit("TOCBLOCKs 0 and %d do not match.", i); + goto err; + } + } + ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs); + result = true; +err: + kfree(tb[1]); + return result; +} + +/** + * ldm_validate_vmdb - Read the VMDB and validate it + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @bdev, of the database + * @ldb: Cache of the database structures + * + * Find the vmdb of the LDM Database stored on @bdev and return the parsed + * information in @ldb. + * + * Return: 'true' @ldb contains validated VMDB info + * 'false' @ldb contents are undefined + */ +static bool ldm_validate_vmdb(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) +{ + Sector sect; + u8 *data; + bool result = false; + struct vmdb *vm; + struct tocblock *toc; + + BUG_ON (!state || !ldb); + + vm = &ldb->vm; + toc = &ldb->toc; + + data = read_part_sector(state, base + OFF_VMDB, &sect); + if (!data) { + ldm_crit ("Disk read failed."); + return false; + } + + if (!ldm_parse_vmdb (data, vm)) + goto out; /* Already logged */ + + /* Are there uncommitted transactions? */ + if (get_unaligned_be16(data + 0x10) != 0x01) { + ldm_crit ("Database is not in a consistent state. Aborting."); + goto out; + } + + if (vm->vblk_offset != 512) + ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset); + + /* + * The last_vblk_seq can be before the end of the vmdb, just make sure + * it is not out of bounds. + */ + if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) { + ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. " + "Database is corrupt. Aborting."); + goto out; + } + + result = true; +out: + put_dev_sector (sect); + return result; +} + + +/** + * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk + * @state: Partition check state including device holding the LDM Database + * + * This function provides a weak test to decide whether the device is a dynamic + * disk or not. It looks for an MS-DOS-style partition table containing at + * least one partition of type 0x42 (formerly SFS, now used by Windows for + * dynamic disks). + * + * N.B. The only possible error can come from the read_part_sector and that is + * only likely to happen if the underlying device is strange. If that IS + * the case we should return zero to let someone else try.
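
The weak test described above only has to find an MS-DOS partition table whose four primary entries include one of type 0x42 (LDM_PARTITION). A stand-alone sketch of the same check against a 512-byte sector image; function and constant names here are made up, and the real code goes through struct partition and SYS_IND() instead of raw byte offsets:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LDM_PARTITION_TYPE 0x42         /* type byte used by Windows dynamic disks */

/* Return true if the MBR in 'sector' carries at least one type-0x42 entry. */
static bool looks_like_dynamic_disk(const uint8_t sector[512])
{
        /* Partition table signature: 0x55 at 0x1FE, 0xAA at 0x1FF. */
        if (sector[0x1FE] != 0x55 || sector[0x1FF] != 0xAA)
                return false;

        /* Four 16-byte entries start at 0x1BE; the type byte sits at offset 4. */
        for (int i = 0; i < 4; i++) {
                const uint8_t *entry = sector + 0x1BE + 16 * i;

                if (entry[4] == LDM_PARTITION_TYPE)
                        return true;
        }
        return false;
}

int main(void)
{
        uint8_t mbr[512] = { 0 };

        mbr[0x1FE] = 0x55;              /* valid signature */
        mbr[0x1FF] = 0xAA;
        mbr[0x1BE + 4] = 0x42;          /* first entry carries the dynamic-disk type */
        printf("%d\n", looks_like_dynamic_disk(mbr));   /* prints 1 */
        return 0;
}
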
+ * + * Return: 'true' @state->bdev is a dynamic disk + * 'false' @state->bdev is not a dynamic disk, or an error occurred + */ +static bool ldm_validate_partition_table(struct parsed_partitions *state) +{ + Sector sect; + u8 *data; + struct partition *p; + int i; + bool result = false; + + BUG_ON(!state); + + data = read_part_sector(state, 0, §); + if (!data) { + ldm_info ("Disk read failed."); + return false; + } + + if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) + goto out; + + p = (struct partition*)(data + 0x01BE); + for (i = 0; i < 4; i++, p++) + if (SYS_IND (p) == LDM_PARTITION) { + result = true; + break; + } + + if (result) + ldm_debug ("Found W2K dynamic disk partition type."); + +out: + put_dev_sector (sect); + return result; +} + +/** + * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id + * @ldb: Cache of the database structures + * + * The LDM Database contains a list of all partitions on all dynamic disks. + * The primary PRIVHEAD, at the beginning of the physical disk, tells us + * the GUID of this disk. This function searches for the GUID in a linked + * list of vblk's. + * + * Return: Pointer, A matching vblk was found + * NULL, No match, or an error + */ +static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb) +{ + struct list_head *item; + + BUG_ON (!ldb); + + list_for_each (item, &ldb->v_disk) { + struct vblk *v = list_entry (item, struct vblk, list); + if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE)) + return v; + } + + return NULL; +} + +/** + * ldm_create_data_partitions - Create data partitions for this device + * @pp: List of the partitions parsed so far + * @ldb: Cache of the database structures + * + * The database contains ALL the partitions for ALL disk groups, so we need to + * filter out this specific disk. Using the disk's object id, we can find all + * the partitions in the database that belong to this disk. + * + * Add each partition in our database, to the parsed_partitions structure. + * + * N.B. This function creates the partitions in the order it finds partition + * objects in the linked list. + * + * Return: 'true' Partition created + * 'false' Error, probably a range checking problem + */ +static bool ldm_create_data_partitions (struct parsed_partitions *pp, + const struct ldmdb *ldb) +{ + struct list_head *item; + struct vblk *vb; + struct vblk *disk; + struct vblk_part *part; + int part_num = 1; + + BUG_ON (!pp || !ldb); + + disk = ldm_get_disk_objid (ldb); + if (!disk) { + ldm_crit ("Can't find the ID of this disk in the database."); + return false; + } + + strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); + + /* Create the data partitions */ + list_for_each (item, &ldb->v_part) { + vb = list_entry (item, struct vblk, list); + part = &vb->vblk.part; + + if (part->disk_id != disk->obj_id) + continue; + + put_partition (pp, part_num, ldb->ph.logical_disk_start + + part->start, part->size); + part_num++; + } + + strlcat(pp->pp_buf, "\n", PAGE_SIZE); + return true; +} + + +/** + * ldm_relative - Calculate the next relative offset + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @base: Size of the previous fixed width fields + * @offset: Cumulative size of the previous variable-width fields + * + * Because many of the VBLK fields are variable-width, it's necessary + * to calculate each offset based on the previous one and the length + * of the field it pointed to. 
+ * + * Return: -1 Error, the calculated offset exceeded the size of the buffer + * n OK, a range-checked offset into buffer + */ +static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) +{ + + base += offset; + if (!buffer || offset < 0 || base > buflen) { + if (!buffer) + ldm_error("!buffer"); + if (offset < 0) + ldm_error("offset (%d) < 0", offset); + if (base > buflen) + ldm_error("base (%d) > buflen (%d)", base, buflen); + return -1; + } + if (base + buffer[base] >= buflen) { + ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, + buffer[base], buflen); + return -1; + } + return buffer[base] + offset + 1; +} + +/** + * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order + * @block: Pointer to the variable-width number to convert + * + * Large numbers in the LDM Database are often stored in a packed format. Each + * number is prefixed by a one byte width marker. All numbers in the database + * are stored in big-endian byte order. This function reads one of these + * numbers and returns the result + * + * N.B. This function DOES NOT perform any range checking, though the most + * it will read is eight bytes. + * + * Return: n A number + * 0 Zero, or an error occurred + */ +static u64 ldm_get_vnum (const u8 *block) +{ + u64 tmp = 0; + u8 length; + + BUG_ON (!block); + + length = *block++; + + if (length && length <= 8) + while (length--) + tmp = (tmp << 8) | *block++; + else + ldm_error ("Illegal length %d.", length); + + return tmp; +} + +/** + * ldm_get_vstr - Read a length-prefixed string into a buffer + * @block: Pointer to the length marker + * @buffer: Location to copy string to + * @buflen: Size of the output buffer + * + * Many of the strings in the LDM Database are not NULL terminated. Instead + * they are prefixed by a one byte length marker. This function copies one of + * these strings into a buffer. + * + * N.B. This function DOES NOT perform any range checking on the input. + * If the buffer is too small, the output will be truncated. + * + * Return: 0, Error and @buffer contents are undefined + * n, String length in characters (excluding NULL) + * buflen-1, String was truncated. + */ +static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen) +{ + int length; + + BUG_ON (!block || !buffer); + + length = block[0]; + if (length >= buflen) { + ldm_error ("Truncating string %d -> %d.", length, buflen); + length = buflen - 1; + } + memcpy (buffer, block + 1, length); + buffer[length] = 0; + return length; +} + + +/** + * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Component object (version 3) into a vblk structure. 
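
ldm_get_vnum() and ldm_get_vstr() above both decode the same on-disk convention: a one-byte length marker followed by that many bytes, with numbers stored big-endian and strings left unterminated. A small userspace sketch of the two decoders, using hypothetical names and standard C types rather than the kernel's u8/u64:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Decode a length-prefixed big-endian number (at most 8 value bytes). */
static uint64_t get_vnum(const uint8_t *block)
{
        uint64_t val = 0;
        uint8_t len = *block++;

        if (len == 0 || len > 8)
                return 0;               /* illegal length */
        while (len--)
                val = (val << 8) | *block++;
        return val;
}

/* Copy a length-prefixed string into buf, truncating and NUL-terminating it. */
static int get_vstr(const uint8_t *block, char *buf, int buflen)
{
        int len = block[0];

        if (len >= buflen)
                len = buflen - 1;       /* truncate to fit */
        memcpy(buf, block + 1, len);
        buf[len] = '\0';
        return len;
}

int main(void)
{
        /* 0x03 = three value bytes follow: 0x01 0x02 0x03 -> 0x010203 */
        const uint8_t num[] = { 0x03, 0x01, 0x02, 0x03 };
        const uint8_t str[] = { 0x05, 'h', 'e', 'l', 'l', 'o' };
        char name[16];

        get_vstr(str, name, sizeof(name));
        printf("%llx %s\n", (unsigned long long)get_vnum(num), name);
        return 0;
}

Run as-is it prints "10203 hello": the three value bytes assembled most-significant first, and the five string bytes copied out with a terminating NUL added.
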
+ * + * Return: 'true' @vb contains a Component VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len; + struct vblk_comp *comp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_vstate = ldm_relative (buffer, buflen, 0x18, r_name); + r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate); + r_parent = ldm_relative (buffer, buflen, 0x2D, r_child); + + if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) { + r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent); + r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe); + len = r_cols; + } else { + r_stripe = 0; + r_cols = 0; + len = r_parent; + } + if (len < 0) + return false; + + len += VBLK_SIZE_CMP3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + comp = &vb->vblk.comp; + ldm_get_vstr (buffer + 0x18 + r_name, comp->state, + sizeof (comp->state)); + comp->type = buffer[0x18 + r_vstate]; + comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate); + comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child); + comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0; + + return true; +} + +/** + * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk Group object (version 3) into a vblk structure. + * + * Return: 'true' @vb contains a Disk Group VBLK + * 'false' @vb contents are not defined + */ +static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_diskid, r_id1, r_id2, len; + struct vblk_dgrp *dgrp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); + + if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) { + r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); + r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); + len = r_id2; + } else { + r_id1 = 0; + r_id2 = 0; + len = r_diskid; + } + if (len < 0) + return false; + + len += VBLK_SIZE_DGR3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + dgrp = &vb->vblk.dgrp; + ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id, + sizeof (dgrp->disk_id)); + return true; +} + +/** + * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk Group object (version 4) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Disk Group VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) +{ + char buf[64]; + int r_objid, r_name, r_id1, r_id2, len; + struct vblk_dgrp *dgrp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + + if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) { + r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); + r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); + len = r_id2; + } else { + r_id1 = 0; + r_id2 = 0; + len = r_name; + } + if (len < 0) + return false; + + len += VBLK_SIZE_DGR4; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + dgrp = &vb->vblk.dgrp; + + ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); + return true; +} + +/** + * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk object (version 3) into a vblk structure. + * + * Return: 'true' @vb contains a Disk VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_diskid, r_altname, len; + struct vblk_disk *disk; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); + r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid); + len = r_altname; + if (len < 0) + return false; + + len += VBLK_SIZE_DSK3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + disk = &vb->vblk.disk; + ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, + sizeof (disk->alt_name)); + if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id)) + return false; + + return true; +} + +/** + * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk object (version 4) into a vblk structure. + * + * Return: 'true' @vb contains a Disk VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, len; + struct vblk_disk *disk; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + len = r_name; + if (len < 0) + return false; + + len += VBLK_SIZE_DSK4; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + disk = &vb->vblk.disk; + memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE); + return true; +} + +/** + * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Partition object (version 3) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Partition VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len; + struct vblk_part *part; + + BUG_ON(!buffer || !vb); + r_objid = ldm_relative(buffer, buflen, 0x18, 0); + if (r_objid < 0) { + ldm_error("r_objid %d < 0", r_objid); + return false; + } + r_name = ldm_relative(buffer, buflen, 0x18, r_objid); + if (r_name < 0) { + ldm_error("r_name %d < 0", r_name); + return false; + } + r_size = ldm_relative(buffer, buflen, 0x34, r_name); + if (r_size < 0) { + ldm_error("r_size %d < 0", r_size); + return false; + } + r_parent = ldm_relative(buffer, buflen, 0x34, r_size); + if (r_parent < 0) { + ldm_error("r_parent %d < 0", r_parent); + return false; + } + r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent); + if (r_diskid < 0) { + ldm_error("r_diskid %d < 0", r_diskid); + return false; + } + if (buffer[0x12] & VBLK_FLAG_PART_INDEX) { + r_index = ldm_relative(buffer, buflen, 0x34, r_diskid); + if (r_index < 0) { + ldm_error("r_index %d < 0", r_index); + return false; + } + len = r_index; + } else { + r_index = 0; + len = r_diskid; + } + if (len < 0) { + ldm_error("len %d < 0", len); + return false; + } + len += VBLK_SIZE_PRT3; + if (len > get_unaligned_be32(buffer + 0x14)) { + ldm_error("len %d > BE32(buffer + 0x14) %d", len, + get_unaligned_be32(buffer + 0x14)); + return false; + } + part = &vb->vblk.part; + part->start = get_unaligned_be64(buffer + 0x24 + r_name); + part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); + part->size = ldm_get_vnum(buffer + 0x34 + r_name); + part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); + part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); + if (vb->flags & VBLK_FLAG_PART_INDEX) + part->partnum = buffer[0x35 + r_diskid]; + else + part->partnum = 0; + return true; +} + +/** + * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Volume object (version 5) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Volume VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; + int r_id1, r_id2, r_size2, r_drive, len; + struct vblk_volu *volu; + + BUG_ON(!buffer || !vb); + r_objid = ldm_relative(buffer, buflen, 0x18, 0); + if (r_objid < 0) { + ldm_error("r_objid %d < 0", r_objid); + return false; + } + r_name = ldm_relative(buffer, buflen, 0x18, r_objid); + if (r_name < 0) { + ldm_error("r_name %d < 0", r_name); + return false; + } + r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); + if (r_vtype < 0) { + ldm_error("r_vtype %d < 0", r_vtype); + return false; + } + r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); + if (r_disable_drive_letter < 0) { + ldm_error("r_disable_drive_letter %d < 0", + r_disable_drive_letter); + return false; + } + r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); + if (r_child < 0) { + ldm_error("r_child %d < 0", r_child); + return false; + } + r_size = ldm_relative(buffer, buflen, 0x3D, r_child); + if (r_size < 0) { + ldm_error("r_size %d < 0", r_size); + return false; + } + if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { + r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); + if (r_id1 < 0) { + ldm_error("r_id1 %d < 0", r_id1); + return false; + } + } else + r_id1 = r_size; + if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { + r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); + if (r_id2 < 0) { + ldm_error("r_id2 %d < 0", r_id2); + return false; + } + } else + r_id2 = r_id1; + if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { + r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); + if (r_size2 < 0) { + ldm_error("r_size2 %d < 0", r_size2); + return false; + } + } else + r_size2 = r_id2; + if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { + r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); + if (r_drive < 0) { + ldm_error("r_drive %d < 0", r_drive); + return false; + } + } else + r_drive = r_size2; + len = r_drive; + if (len < 0) { + ldm_error("len %d < 0", len); + return false; + } + len += VBLK_SIZE_VOL5; + if (len > get_unaligned_be32(buffer + 0x14)) { + ldm_error("len %d > BE32(buffer + 0x14) %d", len, + get_unaligned_be32(buffer + 0x14)); + return false; + } + volu = &vb->vblk.volu; + ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, + sizeof(volu->volume_type)); + memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, + sizeof(volu->volume_state)); + volu->size = ldm_get_vnum(buffer + 0x3D + r_child); + volu->partition_type = buffer[0x41 + r_size]; + memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); + if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { + ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, + sizeof(volu->drive_hint)); + } + return true; +} + +/** + * ldm_parse_vblk - Read a raw VBLK object into a vblk structure + * @buf: Block of data being worked on + * @len: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK object into a vblk structure. This function just reads the + * information common to all VBLK types, then delegates the rest of the work to + * helper functions: ldm_parse_*. 
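
Every record handed to ldm_parse_vblk() starts from the same fixed header: the flag bits at offset 0x12, the type byte at 0x13, a 32-bit big-endian length at 0x14 and the object id as a packed number at 0x18. A sketch of lifting those common fields out of a raw record before dispatching on the type byte; the flat struct and helper names are hypothetical, and only one of the type constants from ldm.h is repeated here:

#include <stdint.h>
#include <stdio.h>

#define VBLK_PRT3 0x33          /* Partition, version 3 (see ldm.h for the others) */

struct vblk_header {            /* hypothetical flat view of the common fields */
        uint8_t  flags;         /* byte 0x12: which optional fields are present */
        uint8_t  type;          /* byte 0x13: record type, e.g. VBLK_PRT3 */
        uint32_t length;        /* big-endian 32-bit value at 0x14 */
};

static uint32_t be32(const uint8_t *p)
{
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

static struct vblk_header read_vblk_header(const uint8_t *buf)
{
        struct vblk_header h = {
                .flags  = buf[0x12],
                .type   = buf[0x13],
                .length = be32(buf + 0x14),
        };
        return h;
}

int main(void)
{
        uint8_t rec[0x20] = { 0 };

        rec[0x13] = VBLK_PRT3;          /* pretend this is a partition record */
        rec[0x17] = 0x2C;               /* big-endian length 44 in 0x14..0x17 */

        struct vblk_header h = read_vblk_header(rec);
        printf("type 0x%02x, len %u\n", h.type, (unsigned)h.length);
        return 0;
}

A caller would then switch on the type byte, exactly as ldm_parse_vblk() does, and hand the rest of the record to the matching ldm_parse_* helper.
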
+ * + * Return: 'true' @vb contains a VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb) +{ + bool result = false; + int r_objid; + + BUG_ON (!buf || !vb); + + r_objid = ldm_relative (buf, len, 0x18, 0); + if (r_objid < 0) { + ldm_error ("VBLK header is corrupt."); + return false; + } + + vb->flags = buf[0x12]; + vb->type = buf[0x13]; + vb->obj_id = ldm_get_vnum (buf + 0x18); + ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name)); + + switch (vb->type) { + case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break; + case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break; + case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break; + case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break; + case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break; + case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break; + case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break; + } + + if (result) + ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.", + (unsigned long long) vb->obj_id, vb->type); + else + ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).", + (unsigned long long) vb->obj_id, vb->type); + + return result; +} + + +/** + * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database + * @data: Raw VBLK to add to the database + * @len: Size of the raw VBLK + * @ldb: Cache of the database structures + * + * The VBLKs are sorted into categories. Partitions are also sorted by offset. + * + * N.B. This function does not check the validity of the VBLKs. + * + * Return: 'true' The VBLK was added + * 'false' An error occurred + */ +static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb) +{ + struct vblk *vb; + struct list_head *item; + + BUG_ON (!data || !ldb); + + vb = kmalloc (sizeof (*vb), GFP_KERNEL); + if (!vb) { + ldm_crit ("Out of memory."); + return false; + } + + if (!ldm_parse_vblk (data, len, vb)) { + kfree(vb); + return false; /* Already logged */ + } + + /* Put vblk into the correct list. */ + switch (vb->type) { + case VBLK_DGR3: + case VBLK_DGR4: + list_add (&vb->list, &ldb->v_dgrp); + break; + case VBLK_DSK3: + case VBLK_DSK4: + list_add (&vb->list, &ldb->v_disk); + break; + case VBLK_VOL5: + list_add (&vb->list, &ldb->v_volu); + break; + case VBLK_CMP3: + list_add (&vb->list, &ldb->v_comp); + break; + case VBLK_PRT3: + /* Sort by the partition's start sector. */ + list_for_each (item, &ldb->v_part) { + struct vblk *v = list_entry (item, struct vblk, list); + if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) && + (v->vblk.part.start > vb->vblk.part.start)) { + list_add_tail (&vb->list, &v->list); + return true; + } + } + list_add_tail (&vb->list, &ldb->v_part); + break; + } + return true; +} + +/** + * ldm_frag_add - Add a VBLK fragment to a list + * @data: Raw fragment to be added to the list + * @size: Size of the raw fragment + * @frags: Linked list of VBLK fragments + * + * Fragmented VBLKs may not be consecutive in the database, so they are placed + * in a list so they can be pieced together later. 
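
The bookkeeping behind this list is a per-group bitmap: it starts out as 0xFF shifted left by the number of expected parts (truncated to a byte), every record that arrives sets its own bit, and ldm_frag_commit() later treats anything other than 0xFF as an incomplete group. A small sketch of just that map handling, with hypothetical names; the real code additionally clears the top bit to poison a group when a duplicate part shows up:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct frag_map {               /* hypothetical; mirrors the 'map' byte in struct frag */
        uint8_t num;            /* expected number of parts (1..4) */
        uint8_t map;            /* bit n set means part n has arrived */
};

static void frag_map_init(struct frag_map *f, uint8_t num)
{
        f->num = num;
        f->map = (uint8_t)(0xFF << num);        /* pre-set the bits that never arrive */
}

/* Record part 'rec'; reject duplicates. */
static bool frag_map_add(struct frag_map *f, uint8_t rec)
{
        if (rec >= f->num || (f->map & (1u << rec)))
                return false;
        f->map |= (uint8_t)(1u << rec);
        return true;
}

static bool frag_map_complete(const struct frag_map *f)
{
        return f->map == 0xFF;
}

int main(void)
{
        struct frag_map f;

        frag_map_init(&f, 3);                   /* expect parts 0, 1 and 2 */
        frag_map_add(&f, 0);
        frag_map_add(&f, 2);
        printf("complete: %d\n", frag_map_complete(&f));        /* 0: part 1 missing */
        frag_map_add(&f, 1);
        printf("complete: %d\n", frag_map_complete(&f));        /* 1 */
        return 0;
}
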
+ * + * Return: 'true' Success, the VBLK was added to the list + * 'false' Error, a problem occurred + */ +static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) +{ + struct frag *f; + struct list_head *item; + int rec, num, group; + + BUG_ON (!data || !frags); + + if (size < 2 * VBLK_SIZE_HEAD) { + ldm_error("Value of size is too small."); + return false; + } + + group = get_unaligned_be32(data + 0x08); + rec = get_unaligned_be16(data + 0x0C); + num = get_unaligned_be16(data + 0x0E); + if ((num < 1) || (num > 4)) { + ldm_error ("A VBLK claims to have %d parts.", num); + return false; + } + if (rec >= num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); + return false; + } + + list_for_each (item, frags) { + f = list_entry (item, struct frag, list); + if (f->group == group) + goto found; + } + + f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL); + if (!f) { + ldm_crit ("Out of memory."); + return false; + } + + f->group = group; + f->num = num; + f->rec = rec; + f->map = 0xFF << num; + + list_add_tail (&f->list, frags); +found: + if (rec >= f->num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); + return false; + } + + if (f->map & (1 << rec)) { + ldm_error ("Duplicate VBLK, part %d.", rec); + f->map &= 0x7F; /* Mark the group as broken */ + return false; + } + + f->map |= (1 << rec); + + data += VBLK_SIZE_HEAD; + size -= VBLK_SIZE_HEAD; + + memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size); + + return true; +} + +/** + * ldm_frag_free - Free a linked list of VBLK fragments + * @list: Linked list of fragments + * + * Free a linked list of VBLK fragments + * + * Return: none + */ +static void ldm_frag_free (struct list_head *list) +{ + struct list_head *item, *tmp; + + BUG_ON (!list); + + list_for_each_safe (item, tmp, list) + kfree (list_entry (item, struct frag, list)); +} + +/** + * ldm_frag_commit - Validate fragmented VBLKs and add them to the database + * @frags: Linked list of VBLK fragments + * @ldb: Cache of the database structures + * + * Now that all the fragmented VBLKs have been collected, they must be added to + * the database for later use. + * + * Return: 'true' All the fragments were added successfully + * 'false' One or more of the fragments were invalid + */ +static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) +{ + struct frag *f; + struct list_head *item; + + BUG_ON (!frags || !ldb); + + list_for_each (item, frags) { + f = list_entry (item, struct frag, list); + + if (f->map != 0xFF) { + ldm_error ("VBLK group %d is incomplete (0x%02x).", + f->group, f->map); + return false; + } + + if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb)) + return false; /* Already logged */ + } + return true; +} + +/** + * ldm_get_vblks - Read the on-disk database of VBLKs into memory + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database + * @ldb: Cache of the database structures + * + * To use the information from the VBLKs, they need to be read from the disk, + * unpacked and validated. We cache them in @ldb according to their type.
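
The read loop that follows derives everything from three VMDB fields: how many VBLKs fit in one 512-byte sector (perbuf), which sector the first VBLK lives in (vblk_offset converted to sectors) and how many sectors cover last_vblk_seq records. A short worked example with made-up but plausible values, kept separate from the driver:

#include <stdio.h>

int main(void)
{
        /* Example values only: a VMDB advertising 128-byte VBLKs that start
         * 512 bytes into the database and run up to sequence number 2048. */
        unsigned vblk_size = 128, vblk_offset = 512, last_vblk_seq = 2048;

        unsigned perbuf = 512 / vblk_size;                      /* VBLKs per sector: 4 */
        unsigned skip   = vblk_offset >> 9;                     /* first sector: 1 */
        unsigned finish = (vblk_size * last_vblk_seq) >> 9;     /* one past the last: 512 */

        printf("%u per sector, sectors %u..%u\n", perbuf, skip, finish - 1);
        return 0;
}
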
+ * + * Return: 'true' All the VBLKs were read successfully + * 'false' An error occurred + */ +static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, + struct ldmdb *ldb) +{ + int size, perbuf, skip, finish, s, v, recs; + u8 *data = NULL; + Sector sect; + bool result = false; + LIST_HEAD (frags); + + BUG_ON(!state || !ldb); + + size = ldb->vm.vblk_size; + perbuf = 512 / size; + skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */ + finish = (size * ldb->vm.last_vblk_seq) >> 9; + + for (s = skip; s < finish; s++) { /* For each sector */ + data = read_part_sector(state, base + OFF_VMDB + s, §); + if (!data) { + ldm_crit ("Disk read failed."); + goto out; + } + + for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ + if (MAGIC_VBLK != get_unaligned_be32(data)) { + ldm_error ("Expected to find a VBLK."); + goto out; + } + + recs = get_unaligned_be16(data + 0x0E); /* Number of records */ + if (recs == 1) { + if (!ldm_ldmdb_add (data, size, ldb)) + goto out; /* Already logged */ + } else if (recs > 1) { + if (!ldm_frag_add (data, size, &frags)) + goto out; /* Already logged */ + } + /* else Record is not in use, ignore it. */ + } + put_dev_sector (sect); + data = NULL; + } + + result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */ +out: + if (data) + put_dev_sector (sect); + ldm_frag_free (&frags); + + return result; +} + +/** + * ldm_free_vblks - Free a linked list of vblk's + * @lh: Head of a linked list of struct vblk + * + * Free a list of vblk's and free the memory used to maintain the list. + * + * Return: none + */ +static void ldm_free_vblks (struct list_head *lh) +{ + struct list_head *item, *tmp; + + BUG_ON (!lh); + + list_for_each_safe (item, tmp, lh) + kfree (list_entry (item, struct vblk, list)); +} + + +/** + * ldm_partition - Find out whether a device is a dynamic disk and handle it + * @state: Partition check state including device holding the LDM Database + * + * This determines whether the device @bdev is a dynamic disk and if so creates + * the partitions necessary in the gendisk structure pointed to by @hd. + * + * We create a dummy device 1, which contains the LDM database, and then create + * each partition described by the LDM database in sequence as devices 2+. For + * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, + * and so on: the actual data containing partitions. + * + * Return: 1 Success, @state->bdev is a dynamic disk and we handled it + * 0 Success, @state->bdev is not a dynamic disk + * -1 An error occurred before enough information had been read + * Or @state->bdev is a dynamic disk, but it may be corrupted + */ +int ldm_partition(struct parsed_partitions *state) +{ + struct ldmdb *ldb; + unsigned long base; + int result = -1; + + BUG_ON(!state); + + /* Look for signs of a Dynamic Disk */ + if (!ldm_validate_partition_table(state)) + return 0; + + ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); + if (!ldb) { + ldm_crit ("Out of memory."); + goto out; + } + + /* Parse and check privheads. */ + if (!ldm_validate_privheads(state, &ldb->ph)) + goto out; /* Already logged */ + + /* All further references are relative to base (database start). */ + base = ldb->ph.config_start; + + /* Parse and check tocs and vmdb. 
*/ + if (!ldm_validate_tocblocks(state, base, ldb) || + !ldm_validate_vmdb(state, base, ldb)) + goto out; /* Already logged */ + + /* Initialize vblk lists in ldmdb struct */ + INIT_LIST_HEAD (&ldb->v_dgrp); + INIT_LIST_HEAD (&ldb->v_disk); + INIT_LIST_HEAD (&ldb->v_volu); + INIT_LIST_HEAD (&ldb->v_comp); + INIT_LIST_HEAD (&ldb->v_part); + + if (!ldm_get_vblks(state, base, ldb)) { + ldm_crit ("Failed to read the VBLKs from the database."); + goto cleanup; + } + + /* Finally, create the data partition devices. */ + if (ldm_create_data_partitions(state, ldb)) { + ldm_debug ("Parsed LDM database successfully."); + result = 1; + } + /* else Already logged */ + +cleanup: + ldm_free_vblks (&ldb->v_dgrp); + ldm_free_vblks (&ldb->v_disk); + ldm_free_vblks (&ldb->v_volu); + ldm_free_vblks (&ldb->v_comp); + ldm_free_vblks (&ldb->v_part); +out: + kfree (ldb); + return result; +} diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h new file mode 100644 index 00000000..374242c0 --- /dev/null +++ b/fs/partitions/ldm.h @@ -0,0 +1,215 @@ +/** + * ldm - Part of the Linux-NTFS project. + * + * Copyright (C) 2001,2002 Richard Russon + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (C) 2001,2002 Jakob Kemi + * + * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS source + * in the file COPYING); if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _FS_PT_LDM_H_ +#define _FS_PT_LDM_H_ + +#include +#include +#include +#include +#include +#include + +struct parsed_partitions; + +/* Magic numbers in CPU format. */ +#define MAGIC_VMDB 0x564D4442 /* VMDB */ +#define MAGIC_VBLK 0x56424C4B /* VBLK */ +#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */ +#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */ + +/* The defined vblk types. 
*/ +#define VBLK_VOL5 0x51 /* Volume, version 5 */ +#define VBLK_CMP3 0x32 /* Component, version 3 */ +#define VBLK_PRT3 0x33 /* Partition, version 3 */ +#define VBLK_DSK3 0x34 /* Disk, version 3 */ +#define VBLK_DSK4 0x44 /* Disk, version 4 */ +#define VBLK_DGR3 0x35 /* Disk Group, version 3 */ +#define VBLK_DGR4 0x45 /* Disk Group, version 4 */ + +/* vblk flags indicating extra information will be present */ +#define VBLK_FLAG_COMP_STRIPE 0x10 +#define VBLK_FLAG_PART_INDEX 0x08 +#define VBLK_FLAG_DGR3_IDS 0x08 +#define VBLK_FLAG_DGR4_IDS 0x08 +#define VBLK_FLAG_VOLU_ID1 0x08 +#define VBLK_FLAG_VOLU_ID2 0x20 +#define VBLK_FLAG_VOLU_SIZE 0x80 +#define VBLK_FLAG_VOLU_DRIVE 0x02 + +/* size of a vblk's static parts */ +#define VBLK_SIZE_HEAD 16 +#define VBLK_SIZE_CMP3 22 /* Name and version */ +#define VBLK_SIZE_DGR3 12 +#define VBLK_SIZE_DGR4 44 +#define VBLK_SIZE_DSK3 12 +#define VBLK_SIZE_DSK4 45 +#define VBLK_SIZE_PRT3 28 +#define VBLK_SIZE_VOL5 58 + +/* component types */ +#define COMP_STRIPE 0x01 /* Stripe-set */ +#define COMP_BASIC 0x02 /* Basic disk */ +#define COMP_RAID 0x03 /* Raid-set */ + +/* Other constants. */ +#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */ + +#define OFF_PRIV1 6 /* Offset of the first privhead + relative to the start of the + device in sectors */ + +/* Offsets to structures within the LDM Database in sectors. */ +#define OFF_PRIV2 1856 /* Backup private headers. */ +#define OFF_PRIV3 2047 + +#define OFF_TOCB1 1 /* Tables of contents. */ +#define OFF_TOCB2 2 +#define OFF_TOCB3 2045 +#define OFF_TOCB4 2046 + +#define OFF_VMDB 17 /* List of partitions. */ + +#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */ + +#define TOC_BITMAP1 "config" /* Names of the two defined */ +#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ + +/* Borrowed from msdos.c */ +#define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) + +struct frag { /* VBLK Fragment handling */ + struct list_head list; + u32 group; + u8 num; /* Total number of records */ + u8 rec; /* This is record number n */ + u8 map; /* Which portions are in use */ + u8 data[0]; +}; + +/* In memory LDM database structures. */ + +#define GUID_SIZE 16 + +struct privhead { /* Offsets and sizes are in sectors. */ + u16 ver_major; + u16 ver_minor; + u64 logical_disk_start; + u64 logical_disk_size; + u64 config_start; + u64 config_size; + u8 disk_id[GUID_SIZE]; +}; + +struct tocblock { /* We have exactly two bitmaps. 
*/ + u8 bitmap1_name[16]; + u64 bitmap1_start; + u64 bitmap1_size; + u8 bitmap2_name[16]; + u64 bitmap2_start; + u64 bitmap2_size; +}; + +struct vmdb { /* VMDB: The database header */ + u16 ver_major; + u16 ver_minor; + u32 vblk_size; + u32 vblk_offset; + u32 last_vblk_seq; +}; + +struct vblk_comp { /* VBLK Component */ + u8 state[16]; + u64 parent_id; + u8 type; + u8 children; + u16 chunksize; +}; + +struct vblk_dgrp { /* VBLK Disk Group */ + u8 disk_id[64]; +}; + +struct vblk_disk { /* VBLK Disk */ + u8 disk_id[GUID_SIZE]; + u8 alt_name[128]; +}; + +struct vblk_part { /* VBLK Partition */ + u64 start; + u64 size; /* start, size and vol_off in sectors */ + u64 volume_offset; + u64 parent_id; + u64 disk_id; + u8 partnum; +}; + +struct vblk_volu { /* VBLK Volume */ + u8 volume_type[16]; + u8 volume_state[16]; + u8 guid[16]; + u8 drive_hint[4]; + u64 size; + u8 partition_type; +}; + +struct vblk_head { /* VBLK standard header */ + u32 group; + u16 rec; + u16 nrec; +}; + +struct vblk { /* Generalised VBLK */ + u8 name[64]; + u64 obj_id; + u32 sequence; + u8 flags; + u8 type; + union { + struct vblk_comp comp; + struct vblk_dgrp dgrp; + struct vblk_disk disk; + struct vblk_part part; + struct vblk_volu volu; + } vblk; + struct list_head list; +}; + +struct ldmdb { /* Cache of the database */ + struct privhead ph; + struct tocblock toc; + struct vmdb vm; + struct list_head v_dgrp; + struct list_head v_disk; + struct list_head v_volu; + struct list_head v_comp; + struct list_head v_part; +}; + +int ldm_partition(struct parsed_partitions *state); + +#endif /* _FS_PT_LDM_H_ */ + diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c new file mode 100644 index 00000000..11f688bd --- /dev/null +++ b/fs/partitions/mac.c @@ -0,0 +1,134 @@ +/* + * fs/partitions/mac.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include +#include "check.h" +#include "mac.h" + +#ifdef CONFIG_PPC_PMAC +#include +extern void note_bootable_part(dev_t dev, int part, int goodness); +#endif + +/* + * Code to understand MacOS partition tables. + */ + +static inline void mac_fix_string(char *stg, int len) +{ + int i; + + for (i = len - 1; i >= 0 && stg[i] == ' '; i--) + stg[i] = 0; +} + +int mac_partition(struct parsed_partitions *state) +{ + Sector sect; + unsigned char *data; + int slot, blocks_in_map; + unsigned secsize; +#ifdef CONFIG_PPC_PMAC + int found_root = 0; + int found_root_goodness = 0; +#endif + struct mac_partition *part; + struct mac_driver_desc *md; + + /* Get 0th block and look at the first partition map entry. 
*/ + md = read_part_sector(state, 0, §); + if (!md) + return -1; + if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { + put_dev_sector(sect); + return 0; + } + secsize = be16_to_cpu(md->block_size); + put_dev_sector(sect); + data = read_part_sector(state, secsize/512, §); + if (!data) + return -1; + part = (struct mac_partition *) (data + secsize%512); + if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { + put_dev_sector(sect); + return 0; /* not a MacOS disk */ + } + blocks_in_map = be32_to_cpu(part->map_count); + if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { + put_dev_sector(sect); + return 0; + } + strlcat(state->pp_buf, " [mac]", PAGE_SIZE); + for (slot = 1; slot <= blocks_in_map; ++slot) { + int pos = slot * secsize; + put_dev_sector(sect); + data = read_part_sector(state, pos/512, §); + if (!data) + return -1; + part = (struct mac_partition *) (data + pos%512); + if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) + break; + put_partition(state, slot, + be32_to_cpu(part->start_block) * (secsize/512), + be32_to_cpu(part->block_count) * (secsize/512)); + + if (!strnicmp(part->type, "Linux_RAID", 10)) + state->parts[slot].flags = ADDPART_FLAG_RAID; +#ifdef CONFIG_PPC_PMAC + /* + * If this is the first bootable partition, tell the + * setup code, in case it wants to make this the root. + */ + if (machine_is(powermac)) { + int goodness = 0; + + mac_fix_string(part->processor, 16); + mac_fix_string(part->name, 32); + mac_fix_string(part->type, 32); + + if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) + && strcasecmp(part->processor, "powerpc") == 0) + goodness++; + + if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 + || (strnicmp(part->type, "Linux", 5) == 0 + && strcasecmp(part->type, "Linux_swap") != 0)) { + int i, l; + + goodness++; + l = strlen(part->name); + if (strcmp(part->name, "/") == 0) + goodness++; + for (i = 0; i <= l - 4; ++i) { + if (strnicmp(part->name + i, "root", + 4) == 0) { + goodness += 2; + break; + } + } + if (strnicmp(part->name, "swap", 4) == 0) + goodness--; + } + + if (goodness > found_root_goodness) { + found_root = slot; + found_root_goodness = goodness; + } + } +#endif /* CONFIG_PPC_PMAC */ + } +#ifdef CONFIG_PPC_PMAC + if (found_root_goodness) + note_bootable_part(state->bdev->bd_dev, found_root, + found_root_goodness); +#endif + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h new file mode 100644 index 00000000..3c7d9843 --- /dev/null +++ b/fs/partitions/mac.h @@ -0,0 +1,44 @@ +/* + * fs/partitions/mac.h + */ + +#define MAC_PARTITION_MAGIC 0x504d + +/* type field value for A/UX or other Unix partitions */ +#define APPLE_AUX_TYPE "Apple_UNIX_SVR2" + +struct mac_partition { + __be16 signature; /* expected to be MAC_PARTITION_MAGIC */ + __be16 res1; + __be32 map_count; /* # blocks in partition map */ + __be32 start_block; /* absolute starting block # of partition */ + __be32 block_count; /* number of blocks in partition */ + char name[32]; /* partition name */ + char type[32]; /* string type description */ + __be32 data_start; /* rel block # of first data block */ + __be32 data_count; /* number of data blocks */ + __be32 status; /* partition status bits */ + __be32 boot_start; + __be32 boot_size; + __be32 boot_load; + __be32 boot_load2; + __be32 boot_entry; + __be32 boot_entry2; + __be32 boot_cksum; + char processor[16]; /* identifies ISA of boot */ + /* there is more stuff after this that we don't need */ +}; + +#define 
MAC_STATUS_BOOTABLE 8 /* partition is bootable */ + +#define MAC_DRIVER_MAGIC 0x4552 + +/* Driver descriptor structure, in block 0 */ +struct mac_driver_desc { + __be16 signature; /* expected to be MAC_DRIVER_MAGIC */ + __be16 block_size; + __be32 block_count; + /* ... more stuff */ +}; + +int mac_partition(struct parsed_partitions *state); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c new file mode 100644 index 00000000..5f79a667 --- /dev/null +++ b/fs/partitions/msdos.c @@ -0,0 +1,552 @@ +/* + * fs/partitions/msdos.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * + * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug + * in the early extended-partition checks and added DM partitions + * + * Support for DiskManager v6.0x added by Mark Lord, + * with information provided by OnTrack. This now works for linux fdisk + * and LILO, as well as loadlin and bootln. Note that disks other than + * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). + * + * More flexible handling of extended partitions - aeb, 950831 + * + * Check partition table on IDE disks for common CHS translations + * + * Re-organised Feb 1998 Russell King + */ +#include + +#include "check.h" +#include "msdos.h" +#include "efi.h" + +/* + * Many architectures don't like unaligned accesses, while + * the nr_sects and start_sect partition table entries are + * at a 2 (mod 4) address. + */ +#include + +#define SYS_IND(p) get_unaligned(&p->sys_ind) + +static inline sector_t nr_sects(struct partition *p) +{ + return (sector_t)get_unaligned_le32(&p->nr_sects); +} + +static inline sector_t start_sect(struct partition *p) +{ + return (sector_t)get_unaligned_le32(&p->start_sect); +} + +static inline int is_extended_partition(struct partition *p) +{ + return (SYS_IND(p) == DOS_EXTENDED_PARTITION || + SYS_IND(p) == WIN98_EXTENDED_PARTITION || + SYS_IND(p) == LINUX_EXTENDED_PARTITION); +} + +#define MSDOS_LABEL_MAGIC1 0x55 +#define MSDOS_LABEL_MAGIC2 0xAA + +static inline int +msdos_magic_present(unsigned char *p) +{ + return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); +} + +/* Value is EBCDIC 'IBMA' */ +#define AIX_LABEL_MAGIC1 0xC9 +#define AIX_LABEL_MAGIC2 0xC2 +#define AIX_LABEL_MAGIC3 0xD4 +#define AIX_LABEL_MAGIC4 0xC1 +static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) +{ + struct partition *pt = (struct partition *) (p + 0x1be); + Sector sect; + unsigned char *d; + int slot, ret = 0; + + if (!(p[0] == AIX_LABEL_MAGIC1 && + p[1] == AIX_LABEL_MAGIC2 && + p[2] == AIX_LABEL_MAGIC3 && + p[3] == AIX_LABEL_MAGIC4)) + return 0; + /* Assume the partition table is valid if Linux partitions exists */ + for (slot = 1; slot <= 4; slot++, pt++) { + if (pt->sys_ind == LINUX_SWAP_PARTITION || + pt->sys_ind == LINUX_RAID_PARTITION || + pt->sys_ind == LINUX_DATA_PARTITION || + pt->sys_ind == LINUX_LVM_PARTITION || + is_extended_partition(pt)) + return 0; + } + d = read_part_sector(state, 7, §); + if (d) { + if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') + ret = 1; + put_dev_sector(sect); + }; + return ret; +} + +/* + * Create devices for each logical partition in an extended partition. + * The logical partitions form a linked list, with each entry being + * a partition table with two entries. The first entry + * is the real data partition (with a start relative to the partition + * table start). 
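+ * For example, a link sector at absolute sector 2048 whose first entry
+ * records a start of 63 sectors yields a data partition registered at
+ * absolute sector 2048 + 63 = 2111.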
The second is a pointer to the next logical partition + * (with a start relative to the entire extended partition). + * We do not create a Linux partition for the partition tables, but + * only for the actual data partitions. + */ + +static void parse_extended(struct parsed_partitions *state, + sector_t first_sector, sector_t first_size) +{ + struct partition *p; + Sector sect; + unsigned char *data; + sector_t this_sector, this_size; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + int loopct = 0; /* number of links followed + without finding a data partition */ + int i; + + this_sector = first_sector; + this_size = first_size; + + while (1) { + if (++loopct > 100) + return; + if (state->next == state->limit) + return; + data = read_part_sector(state, this_sector, §); + if (!data) + return; + + if (!msdos_magic_present(data + 510)) + goto done; + + p = (struct partition *) (data + 0x1be); + + /* + * Usually, the first entry is the real data partition, + * the 2nd entry is the next extended partition, or empty, + * and the 3rd and 4th entries are unused. + * However, DRDOS sometimes has the extended partition as + * the first entry (when the data partition is empty), + * and OS/2 seems to use all four entries. + */ + + /* + * First process the data partition(s) + */ + for (i=0; i<4; i++, p++) { + sector_t offs, size, next; + if (!nr_sects(p) || is_extended_partition(p)) + continue; + + /* Check the 3rd and 4th entries - + these sometimes contain random garbage */ + offs = start_sect(p)*sector_size; + size = nr_sects(p)*sector_size; + next = this_sector + offs; + if (i >= 2) { + if (offs + size > this_size) + continue; + if (next < first_sector) + continue; + if (next + size > first_sector + first_size) + continue; + } + + put_partition(state, state->next, next, size); + if (SYS_IND(p) == LINUX_RAID_PARTITION) + state->parts[state->next].flags = ADDPART_FLAG_RAID; + loopct = 0; + if (++state->next == state->limit) + goto done; + } + /* + * Next, process the (first) extended partition, if present. + * (So far, there seems to be no reason to make + * parse_extended() recursive and allow a tree + * of extended partitions.) + * It should be a link to the next logical partition. + */ + p -= 4; + for (i=0; i<4; i++, p++) + if (nr_sects(p) && is_extended_partition(p)) + break; + if (i == 4) + goto done; /* nothing left to do */ + + this_sector = first_sector + start_sect(p) * sector_size; + this_size = nr_sects(p) * sector_size; + put_dev_sector(sect); + } +done: + put_dev_sector(sect); +} + +/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also + indicates linux swap. Be careful before believing this is Solaris. 
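+ parse_solaris_x86() below therefore re-reads the sector at offset + 1 and
+ accepts the slice table only if v_sanity matches SOLARIS_X86_VTOC_SANE.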
*/ + +static void parse_solaris_x86(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_SOLARIS_X86_PARTITION + Sector sect; + struct solaris_x86_vtoc *v; + int i; + short max_nparts; + + v = read_part_sector(state, offset + 1, &sect); + if (!v) + return; + if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { + put_dev_sector(sect); + return; + } + { + char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + if (le32_to_cpu(v->v_version) != 1) { + char tmp[64]; + + snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", + le32_to_cpu(v->v_version)); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + return; + } + /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ + max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; + for (i=0; i<max_nparts && state->next<state->limit; i++) { + struct solaris_x86_slice *s = &v->v_slice[i]; + char tmp[3 + 10 + 1 + 1]; + + if (s->s_size == 0) + continue; + snprintf(tmp, sizeof(tmp), " [s%d]", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* solaris partitions are relative to current MS-DOS + * one; must add the offset of the current partition */ + put_partition(state, state->next++, + le32_to_cpu(s->s_start)+offset, + le32_to_cpu(s->s_size)); + } + put_dev_sector(sect); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +#endif +} + +#if defined(CONFIG_BSD_DISKLABEL) +/* + * Create devices for BSD partitions listed in a disklabel, under a + * dos-like partition. See parse_extended() for more information. + */ +static void parse_bsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin, char *flavour, + int max_partitions) +{ + Sector sect; + struct bsd_disklabel *l; + struct bsd_partition *p; + char tmp[64]; + + l = read_part_sector(state, offset + 1, &sect); + if (!l) + return; + if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { + put_dev_sector(sect); + return; + } + + snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + if (le16_to_cpu(l->d_npartitions) < max_partitions) + max_partitions = le16_to_cpu(l->d_npartitions); + for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { + sector_t bsd_start, bsd_size; + + if (state->next == state->limit) + break; + if (p->p_fstype == BSD_FS_UNUSED) + continue; + bsd_start = le32_to_cpu(p->p_offset); + bsd_size = le32_to_cpu(p->p_size); + if (offset == bsd_start && size == bsd_size) + /* full parent partition, we have it already */ + continue; + if (offset > bsd_start || offset+size < bsd_start+bsd_size) { + strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); + continue; + } + put_partition(state, state->next++, bsd_start, bsd_size); + } + put_dev_sector(sect); + if (le16_to_cpu(l->d_npartitions) > max_partitions) { + snprintf(tmp, sizeof(tmp), " (ignored %d more)", + le16_to_cpu(l->d_npartitions) - max_partitions); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +} +#endif + +static void parse_freebsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); +#endif +} + +static void parse_netbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
+#endif +} + +static void parse_openbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "openbsd", + OPENBSD_MAXPARTITIONS); +#endif +} + +/* + * Create devices for Unixware partitions listed in a disklabel, under a + * dos-like partition. See parse_extended() for more information. + */ +static void parse_unixware(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_UNIXWARE_DISKLABEL + Sector sect; + struct unixware_disklabel *l; + struct unixware_slice *p; + + l = read_part_sector(state, offset + 29, §); + if (!l) + return; + if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || + le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { + put_dev_sector(sect); + return; + } + { + char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + p = &l->vtoc.v_slice[1]; + /* I omit the 0th slice as it is the same as whole disk. */ + while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { + if (state->next == state->limit) + break; + + if (p->s_label != UNIXWARE_FS_UNUSED) + put_partition(state, state->next++, + le32_to_cpu(p->start_sect), + le32_to_cpu(p->nr_sects)); + p++; + } + put_dev_sector(sect); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +#endif +} + +/* + * Minix 2.0.0/2.0.2 subpartition support. + * Anand Krishnamurthy + * Rajeev V. Pillai + */ +static void parse_minix(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_MINIX_SUBPARTITION + Sector sect; + unsigned char *data; + struct partition *p; + int i; + + data = read_part_sector(state, offset, §); + if (!data) + return; + + p = (struct partition *)(data + 0x1be); + + /* The first sector of a Minix partition can have either + * a secondary MBR describing its subpartitions, or + * the normal boot sector. 
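+ * A subpartition table reuses the same layout at offset 0x1be, so the
+ * SYS_IND()/start_sect()/nr_sects() helpers above apply to it unchanged.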
*/ + if (msdos_magic_present (data + 510) && + SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ + char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { + if (state->next == state->limit) + break; + /* add each partition in use */ + if (SYS_IND(p) == MINIX_PARTITION) + put_partition(state, state->next++, + start_sect(p), nr_sects(p)); + } + strlcat(state->pp_buf, " >\n", PAGE_SIZE); + } + put_dev_sector(sect); +#endif /* CONFIG_MINIX_SUBPARTITION */ +} + +static struct { + unsigned char id; + void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); +} subtypes[] = { + {FREEBSD_PARTITION, parse_freebsd}, + {NETBSD_PARTITION, parse_netbsd}, + {OPENBSD_PARTITION, parse_openbsd}, + {MINIX_PARTITION, parse_minix}, + {UNIXWARE_PARTITION, parse_unixware}, + {SOLARIS_X86_PARTITION, parse_solaris_x86}, + {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, + {0, NULL}, +}; + +int msdos_partition(struct parsed_partitions *state) +{ + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + Sector sect; + unsigned char *data; + struct partition *p; + struct fat_boot_sector *fb; + int slot; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + if (!msdos_magic_present(data + 510)) { + put_dev_sector(sect); + return 0; + } + + if (aix_magic_present(state, data)) { + put_dev_sector(sect); + strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); + return 0; + } + + /* + * Now that the 55aa signature is present, this is probably + * either the boot sector of a FAT filesystem or a DOS-type + * partition table. Reject this in case the boot indicator + * is not 0 or 0x80. + */ + p = (struct partition *) (data + 0x1be); + for (slot = 1; slot <= 4; slot++, p++) { + if (p->boot_ind != 0 && p->boot_ind != 0x80) { + /* + * Even without a valid boot inidicator value + * its still possible this is valid FAT filesystem + * without a partition table. + */ + fb = (struct fat_boot_sector *) data; + if (slot == 1 && fb->reserved && fb->fats + && fat_valid_media(fb->media)) { + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; + } else { + put_dev_sector(sect); + return 0; + } + } + } + +#ifdef CONFIG_EFI_PARTITION + p = (struct partition *) (data + 0x1be); + for (slot = 1 ; slot <= 4 ; slot++, p++) { + /* If this is an EFI GPT disk, msdos should ignore it. */ + if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { + put_dev_sector(sect); + return 0; + } + } +#endif + p = (struct partition *) (data + 0x1be); + + /* + * Look for partitions in two passes: + * First find the primary and DOS-type extended partitions. + * On the second pass look inside *BSD, Unixware and Solaris partitions. + */ + + state->next = 5; + for (slot = 1 ; slot <= 4 ; slot++, p++) { + sector_t start = start_sect(p)*sector_size; + sector_t size = nr_sects(p)*sector_size; + if (!size) + continue; + if (is_extended_partition(p)) { + /* + * prevent someone doing mkfs or mkswap on an + * extended partition, but leave room for LILO + * FIXME: this uses one logical sector for > 512b + * sector, although it may not be enough/proper. 
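+ * (with 512-byte logical sectors the placeholder below is 2 sectors;
+ * with a 4096-byte logical sector size, sector_size is 8 and one full
+ * logical sector is reserved)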
+ */ + sector_t n = 2; + n = min(size, max(sector_size, n)); + put_partition(state, slot, start, n); + + strlcat(state->pp_buf, " <", PAGE_SIZE); + parse_extended(state, start, size); + strlcat(state->pp_buf, " >", PAGE_SIZE); + continue; + } + put_partition(state, slot, start, size); + if (SYS_IND(p) == LINUX_RAID_PARTITION) + state->parts[slot].flags = ADDPART_FLAG_RAID; + if (SYS_IND(p) == DM6_PARTITION) + strlcat(state->pp_buf, "[DM]", PAGE_SIZE); + if (SYS_IND(p) == EZD_PARTITION) + strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); + } + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + /* second pass - output for each on a separate line */ + p = (struct partition *) (0x1be + data); + for (slot = 1 ; slot <= 4 ; slot++, p++) { + unsigned char id = SYS_IND(p); + int n; + + if (!nr_sects(p)) + continue; + + for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) + ; + + if (!subtypes[n].parse) + continue; + subtypes[n].parse(state, start_sect(p) * sector_size, + nr_sects(p) * sector_size, slot); + } + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h new file mode 100644 index 00000000..38c781c4 --- /dev/null +++ b/fs/partitions/msdos.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/msdos.h + */ + +#define MSDOS_LABEL_MAGIC 0xAA55 + +int msdos_partition(struct parsed_partitions *state); + diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c new file mode 100644 index 00000000..764b86a0 --- /dev/null +++ b/fs/partitions/osf.c @@ -0,0 +1,86 @@ +/* + * fs/partitions/osf.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include "check.h" +#include "osf.h" + +#define MAX_OSF_PARTITIONS 18 + +int osf_partition(struct parsed_partitions *state) +{ + int i; + int slot = 1; + unsigned int npartitions; + Sector sect; + unsigned char *data; + struct disklabel { + __le32 d_magic; + __le16 d_type,d_subtype; + u8 d_typename[16]; + u8 d_packname[16]; + __le32 d_secsize; + __le32 d_nsectors; + __le32 d_ntracks; + __le32 d_ncylinders; + __le32 d_secpercyl; + __le32 d_secprtunit; + __le16 d_sparespertrack; + __le16 d_sparespercyl; + __le32 d_acylinders; + __le16 d_rpm, d_interleave, d_trackskew, d_cylskew; + __le32 d_headswitch, d_trkseek, d_flags; + __le32 d_drivedata[5]; + __le32 d_spare[5]; + __le32 d_magic2; + __le16 d_checksum; + __le16 d_npartitions; + __le32 d_bbsize, d_sbsize; + struct d_partition { + __le32 p_size; + __le32 p_offset; + __le32 p_fsize; + u8 p_fstype; + u8 p_frag; + __le16 p_cpg; + } d_partitions[MAX_OSF_PARTITIONS]; + } * label; + struct d_partition * partition; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + label = (struct disklabel *) (data+64); + partition = label->d_partitions; + if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) { + put_dev_sector(sect); + return 0; + } + if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) { + put_dev_sector(sect); + return 0; + } + npartitions = le16_to_cpu(label->d_npartitions); + if (npartitions > MAX_OSF_PARTITIONS) { + put_dev_sector(sect); + return 0; + } + for (i = 0 ; i < npartitions; i++, partition++) { + if (slot == state->limit) + break; + if (le32_to_cpu(partition->p_size)) + put_partition(state, slot, + le32_to_cpu(partition->p_offset), + le32_to_cpu(partition->p_size)); + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h new file mode 100644 index 00000000..20ed2315 --- 
/dev/null +++ b/fs/partitions/osf.h @@ -0,0 +1,7 @@ +/* + * fs/partitions/osf.h + */ + +#define DISKLABELMAGIC (0x82564557UL) + +int osf_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c new file mode 100644 index 00000000..ea8a86dc --- /dev/null +++ b/fs/partitions/sgi.c @@ -0,0 +1,82 @@ +/* + * fs/partitions/sgi.c + * + * Code extracted from drivers/block/genhd.c + */ + +#include "check.h" +#include "sgi.h" + +struct sgi_disklabel { + __be32 magic_mushroom; /* Big fat spliff... */ + __be16 root_part_num; /* Root partition number */ + __be16 swap_part_num; /* Swap partition number */ + s8 boot_file[16]; /* Name of boot file for ARCS */ + u8 _unused0[48]; /* Device parameter useless crapola.. */ + struct sgi_volume { + s8 name[8]; /* Name of volume */ + __be32 block_num; /* Logical block number */ + __be32 num_bytes; /* How big, in bytes */ + } volume[15]; + struct sgi_partition { + __be32 num_blocks; /* Size in logical blocks */ + __be32 first_block; /* First logical block */ + __be32 type; /* Type of this partition */ + } partitions[16]; + __be32 csum; /* Disk label checksum */ + __be32 _unused1; /* Padding */ +}; + +int sgi_partition(struct parsed_partitions *state) +{ + int i, csum; + __be32 magic; + int slot = 1; + unsigned int start, blocks; + __be32 *ui, cs; + Sector sect; + struct sgi_disklabel *label; + struct sgi_partition *p; + char b[BDEVNAME_SIZE]; + + label = read_part_sector(state, 0, §); + if (!label) + return -1; + p = &label->partitions[0]; + magic = label->magic_mushroom; + if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { + /*printk("Dev %s SGI disklabel: bad magic %08x\n", + bdevname(bdev, b), be32_to_cpu(magic));*/ + put_dev_sector(sect); + return 0; + } + ui = ((__be32 *) (label + 1)) - 1; + for(csum = 0; ui >= ((__be32 *) label);) { + cs = *ui--; + csum += be32_to_cpu(cs); + } + if(csum) { + printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", + bdevname(state->bdev, b)); + put_dev_sector(sect); + return 0; + } + /* All SGI disk labels have 16 partitions, disks under Linux only + * have 15 minor's. Luckily there are always a few zero length + * partitions which we don't care about so we never overflow the + * current_minor. 
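+ * (the loop below only calls put_partition() for entries with a non-zero
+ * num_blocks, so the empty slots never claim a minor)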
+ */ + for(i = 0; i < 16; i++, p++) { + blocks = be32_to_cpu(p->num_blocks); + start = be32_to_cpu(p->first_block); + if (blocks) { + put_partition(state, slot, start, blocks); + if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) + state->parts[slot].flags = ADDPART_FLAG_RAID; + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h new file mode 100644 index 00000000..b9553ebd --- /dev/null +++ b/fs/partitions/sgi.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/sgi.h + */ + +extern int sgi_partition(struct parsed_partitions *state); + +#define SGI_LABEL_MAGIC 0x0be5a941 + diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c new file mode 100644 index 00000000..b5b6fcfb --- /dev/null +++ b/fs/partitions/sun.c @@ -0,0 +1,122 @@ +/* + * fs/partitions/sun.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include "check.h" +#include "sun.h" + +int sun_partition(struct parsed_partitions *state) +{ + int i; + __be16 csum; + int slot = 1; + __be16 *ush; + Sector sect; + struct sun_disklabel { + unsigned char info[128]; /* Informative text string */ + struct sun_vtoc { + __be32 version; /* Layout version */ + char volume[8]; /* Volume name */ + __be16 nparts; /* Number of partitions */ + struct sun_info { /* Partition hdrs, sec 2 */ + __be16 id; + __be16 flags; + } infos[8]; + __be16 padding; /* Alignment padding */ + __be32 bootinfo[3]; /* Info needed by mboot */ + __be32 sanity; /* To verify vtoc sanity */ + __be32 reserved[10]; /* Free space */ + __be32 timestamp[8]; /* Partition timestamp */ + } vtoc; + __be32 write_reinstruct; /* sectors to skip, writes */ + __be32 read_reinstruct; /* sectors to skip, reads */ + unsigned char spare[148]; /* Padding */ + __be16 rspeed; /* Disk rotational speed */ + __be16 pcylcount; /* Physical cylinder count */ + __be16 sparecyl; /* extra sects per cylinder */ + __be16 obs1; /* gap1 */ + __be16 obs2; /* gap2 */ + __be16 ilfact; /* Interleave factor */ + __be16 ncyl; /* Data cylinder count */ + __be16 nacyl; /* Alt. cylinder count */ + __be16 ntrks; /* Tracks per cylinder */ + __be16 nsect; /* Sectors per track */ + __be16 obs3; /* bhead - Label head offset */ + __be16 obs4; /* ppart - Physical Partition */ + struct sun_partition { + __be32 start_cylinder; + __be32 num_sectors; + } partitions[8]; + __be16 magic; /* Magic number */ + __be16 csum; /* Label xor'd checksum */ + } * label; + struct sun_partition *p; + unsigned long spc; + char b[BDEVNAME_SIZE]; + int use_vtoc; + int nparts; + + label = read_part_sector(state, 0, §); + if (!label) + return -1; + + p = label->partitions; + if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { +/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", + bdevname(bdev, b), be16_to_cpu(label->magic)); */ + put_dev_sector(sect); + return 0; + } + /* Look at the checksum */ + ush = ((__be16 *) (label+1)) - 1; + for (csum = 0; ush >= ((__be16 *) label);) + csum ^= *ush--; + if (csum) { + printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", + bdevname(state->bdev, b)); + put_dev_sector(sect); + return 0; + } + + /* Check to see if we can use the VTOC table */ + use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) && + (be32_to_cpu(label->vtoc.version) == 1) && + (be16_to_cpu(label->vtoc.nparts) <= 8)); + + /* Use 8 partition entries if not specified in validated VTOC */ + nparts = (use_vtoc) ? 
be16_to_cpu(label->vtoc.nparts) : 8; + + /* + * So that old Linux-Sun partitions continue to work, + * alow the VTOC to be used under the additional condition ... + */ + use_vtoc = use_vtoc || !(label->vtoc.sanity || + label->vtoc.version || label->vtoc.nparts); + spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); + for (i = 0; i < nparts; i++, p++) { + unsigned long st_sector; + unsigned int num_sectors; + + st_sector = be32_to_cpu(p->start_cylinder) * spc; + num_sectors = be32_to_cpu(p->num_sectors); + if (num_sectors) { + put_partition(state, slot, st_sector, num_sectors); + state->parts[slot].flags = 0; + if (use_vtoc) { + if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION) + state->parts[slot].flags |= ADDPART_FLAG_RAID; + else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK) + state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK; + } + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h new file mode 100644 index 00000000..2424baa8 --- /dev/null +++ b/fs/partitions/sun.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/sun.h + */ + +#define SUN_LABEL_MAGIC 0xDABE +#define SUN_VTOC_SANITY 0x600DDEEE + +int sun_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c new file mode 100644 index 00000000..9627ccff --- /dev/null +++ b/fs/partitions/sysv68.c @@ -0,0 +1,95 @@ +/* + * fs/partitions/sysv68.c + * + * Copyright (C) 2007 Philippe De Muyter + */ + +#include "check.h" +#include "sysv68.h" + +/* + * Volume ID structure: on first 256-bytes sector of disk + */ + +struct volumeid { + u8 vid_unused[248]; + u8 vid_mac[8]; /* ASCII string "MOTOROLA" */ +}; + +/* + * config block: second 256-bytes sector on disk + */ + +struct dkconfig { + u8 ios_unused0[128]; + __be32 ios_slcblk; /* Slice table block number */ + __be16 ios_slccnt; /* Number of entries in slice table */ + u8 ios_unused1[122]; +}; + +/* + * combined volumeid and dkconfig block + */ + +struct dkblk0 { + struct volumeid dk_vid; + struct dkconfig dk_ios; +}; + +/* + * Slice Table Structure + */ + +struct slice { + __be32 nblocks; /* slice size (in blocks) */ + __be32 blkoff; /* block offset of slice */ +}; + + +int sysv68_partition(struct parsed_partitions *state) +{ + int i, slices; + int slot = 1; + Sector sect; + unsigned char *data; + struct dkblk0 *b; + struct slice *slice; + char tmp[64]; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + b = (struct dkblk0 *)data; + if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) { + put_dev_sector(sect); + return 0; + } + slices = be16_to_cpu(b->dk_ios.ios_slccnt); + i = be32_to_cpu(b->dk_ios.ios_slcblk); + put_dev_sector(sect); + + data = read_part_sector(state, i, §); + if (!data) + return -1; + + slices -= 1; /* last slice is the whole disk */ + snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + slice = (struct slice *)data; + for (i = 0; i < slices; i++, slice++) { + if (slot == state->limit) + break; + if (be32_to_cpu(slice->nblocks)) { + put_partition(state, slot, + be32_to_cpu(slice->blkoff), + be32_to_cpu(slice->nblocks)); + snprintf(tmp, sizeof(tmp), "(s%u)", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h new file mode 100644 index 
00000000..bf2f5ffa --- /dev/null +++ b/fs/partitions/sysv68.h @@ -0,0 +1 @@ +extern int sysv68_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c new file mode 100644 index 00000000..8dbaf9f7 --- /dev/null +++ b/fs/partitions/ultrix.c @@ -0,0 +1,48 @@ +/* + * fs/partitions/ultrix.c + * + * Code extracted from drivers/block/genhd.c + * + * Re-organised Jul 1999 Russell King + */ + +#include "check.h" +#include "ultrix.h" + +int ultrix_partition(struct parsed_partitions *state) +{ + int i; + Sector sect; + unsigned char *data; + struct ultrix_disklabel { + s32 pt_magic; /* magic no. indicating part. info exits */ + s32 pt_valid; /* set by driver if pt is current */ + struct pt_info { + s32 pi_nblocks; /* no. of sectors */ + u32 pi_blkoff; /* block offset for start */ + } pt_part[8]; + } *label; + +#define PT_MAGIC 0x032957 /* Partition magic number */ +#define PT_VALID 1 /* Indicates if struct is valid */ + + data = read_part_sector(state, (16384 - sizeof(*label))/512, §); + if (!data) + return -1; + + label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label)); + + if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) { + for (i=0; i<8; i++) + if (label->pt_part[i].pi_nblocks) + put_partition(state, i+1, + label->pt_part[i].pi_blkoff, + label->pt_part[i].pi_nblocks); + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; + } else { + put_dev_sector(sect); + return 0; + } +} diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h new file mode 100644 index 00000000..a3cc00b2 --- /dev/null +++ b/fs/partitions/ultrix.h @@ -0,0 +1,5 @@ +/* + * fs/partitions/ultrix.h + */ + +int ultrix_partition(struct parsed_partitions *state); diff --git a/fs/pipe.c b/fs/pipe.c new file mode 100644 index 00000000..0499a962 --- /dev/null +++ b/fs/pipe.c @@ -0,0 +1,1326 @@ +/* + * linux/fs/pipe.c + * + * Copyright (C) 1991, 1992, 1999 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * The max size that a non-root user is allowed to grow the pipe. Can + * be set by root in /proc/sys/fs/pipe-max-size + */ +unsigned int pipe_max_size = 1048576; + +/* + * Minimum pipe size, as required by POSIX + */ +unsigned int pipe_min_size = PAGE_SIZE; + +/* + * We use a start+len construction, which provides full use of the + * allocated memory. + * -- Florian Coosmann (FGC) + * + * Reads with count = 0 should always return 0. + * -- Julian Bradfield 1999-06-07. + * + * FIFOs and Pipes now generate SIGIO for both readers and writers. 
+ * -- Jeremy Elson 2001-08-16 + * + * pipe_read & write cleanup + * -- Manfred Spraul 2002-05-09 + */ + +static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +{ + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, subclass); +} + +void pipe_lock(struct pipe_inode_info *pipe) +{ + /* + * pipe_lock() nests non-pipe inode locks (for writing to a file) + */ + pipe_lock_nested(pipe, I_MUTEX_PARENT); +} +EXPORT_SYMBOL(pipe_lock); + +void pipe_unlock(struct pipe_inode_info *pipe) +{ + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); +} +EXPORT_SYMBOL(pipe_unlock); + +void pipe_double_lock(struct pipe_inode_info *pipe1, + struct pipe_inode_info *pipe2) +{ + BUG_ON(pipe1 == pipe2); + + if (pipe1 < pipe2) { + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + } else { + pipe_lock_nested(pipe2, I_MUTEX_PARENT); + pipe_lock_nested(pipe1, I_MUTEX_CHILD); + } +} + +/* Drop the inode semaphore and wait for a pipe event, atomically */ +void pipe_wait(struct pipe_inode_info *pipe) +{ + DEFINE_WAIT(wait); + + /* + * Pipes are system-local resources, so sleeping on them + * is considered a noninteractive wait: + */ + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); + pipe_unlock(pipe); + schedule(); + finish_wait(&pipe->wait, &wait); + pipe_lock(pipe); +} + +static int +pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, + int atomic) +{ + unsigned long copy; + + while (len > 0) { + while (!iov->iov_len) + iov++; + copy = min_t(unsigned long, len, iov->iov_len); + + if (atomic) { + if (__copy_from_user_inatomic(to, iov->iov_base, copy)) + return -EFAULT; + } else { + if (copy_from_user(to, iov->iov_base, copy)) + return -EFAULT; + } + to += copy; + len -= copy; + iov->iov_base += copy; + iov->iov_len -= copy; + } + return 0; +} + +static int +pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, + int atomic) +{ + unsigned long copy; + + while (len > 0) { + while (!iov->iov_len) + iov++; + copy = min_t(unsigned long, len, iov->iov_len); + + if (atomic) { + if (__copy_to_user_inatomic(iov->iov_base, from, copy)) + return -EFAULT; + } else { + if (copy_to_user(iov->iov_base, from, copy)) + return -EFAULT; + } + from += copy; + len -= copy; + iov->iov_base += copy; + iov->iov_len -= copy; + } + return 0; +} + +/* + * Attempt to pre-fault in the user memory, so we can use atomic copies. + * Returns the number of bytes not faulted in. + */ +static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + if (fault_in_pages_writeable(iov->iov_base, this_len)) + break; + + len -= this_len; + iov++; + } + + return len; +} + +/* + * Pre-fault in the user memory, so we can use atomic copies. + */ +static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + fault_in_pages_readable(iov->iov_base, this_len); + len -= this_len; + iov++; + } +} + +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * If nobody else uses this page, and we don't already have a + * temporary page, let's keep track of it as a one-deep + * allocation cache. 
(Otherwise just release our reference to it) + */ + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); +} + +/** + * generic_pipe_buf_map - virtually map a pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer that should be mapped + * @atomic: whether to use an atomic map + * + * Description: + * This function returns a kernel virtual address mapping for the + * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided + * and the caller has to be careful not to fault before calling + * the unmap function. + * + * Note that this function occupies KM_USER0 if @atomic != 0. + */ +void *generic_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) +{ + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); + } + + return kmap(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_map); + +/** + * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer that should be unmapped + * @map_data: the data that the mapping function returned + * + * Description: + * This function undoes the mapping that ->map() provided. + */ +void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) +{ + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_unmap); + +/** + * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to attempt to steal + * + * Description: + * This function attempts to steal the &struct page attached to + * @buf. If successful, this function returns 0 and returns with + * the page locked. The caller may then reuse the page for whatever + * he wishes; the typical use is insertion into a different file + * page cache. + */ +int generic_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * A reference of one is golden, that means that the owner of this + * page is the only one holding a reference to it. lock the page + * and return OK. + */ + if (page_count(page) == 1) { + lock_page(page); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(generic_pipe_buf_steal); + +/** + * generic_pipe_buf_get - get a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to get a reference to + * + * Description: + * This function grabs an extra reference to @buf. It's used in + * in the tee() system call, when we duplicate the buffers in one + * pipe into another. + */ +void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_get); + +/** + * generic_pipe_buf_confirm - verify contents of the pipe buffer + * @info: the pipe that the buffer belongs to + * @buf: the buffer to confirm + * + * Description: + * This function does nothing, because the generic pipe code uses + * pages that are always good when inserted into the pipe. 
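+ * Pipe buffer types whose pages may not yet be up to date are expected
+ * to supply their own ->confirm() that waits for the page contents
+ * before the data is copied out.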
+ */ +int generic_pipe_buf_confirm(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + return 0; +} +EXPORT_SYMBOL(generic_pipe_buf_confirm); + +/** + * generic_pipe_buf_release - put a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to put a reference to + * + * Description: + * This function releases a reference to @buf. + */ +void generic_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); +} +EXPORT_SYMBOL(generic_pipe_buf_release); + +static const struct pipe_buf_operations anon_pipe_buf_ops = { + .can_merge = 1, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static const struct pipe_buf_operations packet_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t +pipe_read(struct kiocb *iocb, const struct iovec *_iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe; + int do_wakeup; + ssize_t ret; + struct iovec *iov = (struct iovec *)_iov; + size_t total_len; + + total_len = iov_length(iov, nr_segs); + /* Null read succeeds. */ + if (unlikely(total_len == 0)) + return 0; + + do_wakeup = 0; + ret = 0; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + for (;;) { + int bufs = pipe->nrbufs; + if (bufs) { + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; + const struct pipe_buf_operations *ops = buf->ops; + void *addr; + size_t chars = buf->len; + int error, atomic; + + if (chars > total_len) + chars = total_len; + + error = ops->confirm(pipe, buf); + if (error) { + if (!ret) + ret = error; + break; + } + + atomic = !iov_fault_in_pages_write(iov, chars); +redo: + addr = ops->map(pipe, buf, atomic); + error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); + ops->unmap(pipe, buf, addr); + if (unlikely(error)) { + /* + * Just retry with the slow path if we failed. + */ + if (atomic) { + atomic = 0; + goto redo; + } + if (!ret) + ret = error; + break; + } + ret += chars; + buf->offset += chars; + buf->len -= chars; + + /* Was it a packet buffer? Clean up and exit */ + if (buf->flags & PIPE_BUF_FLAG_PACKET) { + total_len = chars; + buf->len = 0; + } + + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + curbuf = (curbuf + 1) & (pipe->buffers - 1); + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; + do_wakeup = 1; + } + total_len -= chars; + if (!total_len) + break; /* common path: read succeeded */ + } + if (bufs) /* More to do? */ + continue; + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + /* syscall merging: Usually we must not sleep + * if O_NONBLOCK is set, or if we got some data. + * But if a writer sleeps in kernel space, then + * we can wait for that data without violating POSIX. 
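+ * In short: with no writer blocked in pipe_write() we return what we
+ * already copied (or -EAGAIN for O_NONBLOCK); otherwise we keep
+ * waiting in pipe_wait() below for that writer's data.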
+ */ + if (ret) + break; + if (filp->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + pipe_wait(pipe); + } + mutex_unlock(&inode->i_mutex); + + /* Signal writers asynchronously that there is more room. */ + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + if (ret > 0) + file_accessed(filp); + return ret; +} + +static inline int is_packetized(struct file *file) +{ + return (file->f_flags & O_DIRECT) != 0; +} + +static ssize_t +pipe_write(struct kiocb *iocb, const struct iovec *_iov, + unsigned long nr_segs, loff_t ppos) +{ + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe; + ssize_t ret; + int do_wakeup; + struct iovec *iov = (struct iovec *)_iov; + size_t total_len; + ssize_t chars; + + total_len = iov_length(iov, nr_segs); + /* Null write succeeds. */ + if (unlikely(total_len == 0)) + return 0; + + do_wakeup = 0; + ret = 0; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + goto out; + } + + /* We try to merge small writes */ + chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; + const struct pipe_buf_operations *ops = buf->ops; + int offset = buf->offset + buf->len; + + if (ops->can_merge && offset + chars <= PAGE_SIZE) { + int error, atomic = 1; + void *addr; + + error = ops->confirm(pipe, buf); + if (error) + goto out; + + iov_fault_in_pages_read(iov, chars); +redo1: + addr = ops->map(pipe, buf, atomic); + error = pipe_iov_copy_from_user(offset + addr, iov, + chars, atomic); + ops->unmap(pipe, buf, addr); + ret = error; + do_wakeup = 1; + if (error) { + if (atomic) { + atomic = 0; + goto redo1; + } + goto out; + } + buf->len += chars; + total_len -= chars; + ret = chars; + if (!total_len) + goto out; + } + } + + for (;;) { + int bufs; + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + bufs = pipe->nrbufs; + if (bufs < pipe->buffers) { + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; + char *src; + int error, atomic = 1; + + if (!page) { + page = alloc_page(GFP_HIGHUSER); + if (unlikely(!page)) { + ret = ret ? : -ENOMEM; + break; + } + pipe->tmp_page = page; + } + /* Always wake up, even if the copy fails. Otherwise + * we lock up (O_NONBLOCK-)readers that sleep due to + * syscall merging. + * FIXME! Is this really true? 
+ */ + do_wakeup = 1; + chars = PAGE_SIZE; + if (chars > total_len) + chars = total_len; + + iov_fault_in_pages_read(iov, chars); +redo2: + if (atomic) + src = kmap_atomic(page, KM_USER0); + else + src = kmap(page); + + error = pipe_iov_copy_from_user(src, iov, chars, + atomic); + if (atomic) + kunmap_atomic(src, KM_USER0); + else + kunmap(page); + + if (unlikely(error)) { + if (atomic) { + atomic = 0; + goto redo2; + } + if (!ret) + ret = error; + break; + } + ret += chars; + + /* Insert it into the buffer array */ + buf->page = page; + buf->ops = &anon_pipe_buf_ops; + buf->offset = 0; + buf->len = chars; + buf->flags = 0; + if (is_packetized(filp)) { + buf->ops = &packet_pipe_buf_ops; + buf->flags = PIPE_BUF_FLAG_PACKET; + } + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; + + total_len -= chars; + if (!total_len) + break; + } + if (bufs < pipe->buffers) + continue; + if (filp->f_flags & O_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } +out: + mutex_unlock(&inode->i_mutex); + if (do_wakeup) { + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + if (ret > 0) + file_update_time(filp); + return ret; +} + +static ssize_t +bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +{ + return -EBADF; +} + +static ssize_t +bad_pipe_w(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) +{ + return -EBADF; +} + +static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe; + int count, buf, nrbufs; + + switch (cmd) { + case FIONREAD: + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + count = 0; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; + while (--nrbufs >= 0) { + count += pipe->bufs[buf].len; + buf = (buf+1) & (pipe->buffers - 1); + } + mutex_unlock(&inode->i_mutex); + + return put_user(count, (int __user *)arg); + default: + return -EINVAL; + } +} + +/* No kernel lock held - fine */ +static unsigned int +pipe_poll(struct file *filp, poll_table *wait) +{ + unsigned int mask; + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; + int nrbufs; + + poll_wait(filp, &pipe->wait, wait); + + /* Reading only -- no need for acquiring the semaphore. */ + nrbufs = pipe->nrbufs; + mask = 0; + if (filp->f_mode & FMODE_READ) { + mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; + if (!pipe->writers && filp->f_version != pipe->w_counter) + mask |= POLLHUP; + } + + if (filp->f_mode & FMODE_WRITE) { + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; + /* + * Most Unices do not set POLLERR for FIFOs but on Linux they + * behave exactly like pipes for poll(). 
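+ * (so once the last reader has gone, a polling writer sees POLLERR on a
+ * FIFO just as it would on an anonymous pipe)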
+ */ + if (!pipe->readers) + mask |= POLLERR; + } + + return mask; +} + +static int +pipe_release(struct inode *inode, int decr, int decw) +{ + struct pipe_inode_info *pipe; + + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + pipe->readers -= decr; + pipe->writers -= decw; + + if (!pipe->readers && !pipe->writers) { + free_pipe_info(inode); + } else { + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + mutex_unlock(&inode->i_mutex); + + return 0; +} + +static int +pipe_read_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + mutex_unlock(&inode->i_mutex); + + return retval; +} + + +static int +pipe_write_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + mutex_unlock(&inode->i_mutex); + + return retval; +} + + +static int +pipe_rdwr_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); + if (retval >= 0) { + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); + if (retval < 0) /* this can happen only if on == T */ + fasync_helper(-1, filp, 0, &pipe->fasync_readers); + } + mutex_unlock(&inode->i_mutex); + return retval; +} + + +static int +pipe_read_release(struct inode *inode, struct file *filp) +{ + return pipe_release(inode, 1, 0); +} + +static int +pipe_write_release(struct inode *inode, struct file *filp) +{ + return pipe_release(inode, 0, 1); +} + +static int +pipe_rdwr_release(struct inode *inode, struct file *filp) +{ + int decr, decw; + + decr = (filp->f_mode & FMODE_READ) != 0; + decw = (filp->f_mode & FMODE_WRITE) != 0; + return pipe_release(inode, decr, decw); +} + +static int +pipe_read_open(struct inode *inode, struct file *filp) +{ + int ret = -ENOENT; + + mutex_lock(&inode->i_mutex); + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->readers++; + } + + mutex_unlock(&inode->i_mutex); + + return ret; +} + +static int +pipe_write_open(struct inode *inode, struct file *filp) +{ + int ret = -ENOENT; + + mutex_lock(&inode->i_mutex); + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->writers++; + } + + mutex_unlock(&inode->i_mutex); + + return ret; +} + +static int +pipe_rdwr_open(struct inode *inode, struct file *filp) +{ + int ret = -ENOENT; + + mutex_lock(&inode->i_mutex); + + if (inode->i_pipe) { + ret = 0; + if (filp->f_mode & FMODE_READ) + inode->i_pipe->readers++; + if (filp->f_mode & FMODE_WRITE) + inode->i_pipe->writers++; + } + + mutex_unlock(&inode->i_mutex); + + return ret; +} + +/* + * The file_operations structs are not static because they + * are also used in linux/fs/fifo.c to do operations on FIFOs. + * + * Pipes reuse fifos' file_operations structs. 
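+ * read_pipefifo_fops, write_pipefifo_fops and rdwr_pipefifo_fops below
+ * cover O_RDONLY, O_WRONLY and O_RDWR opens; get_pipe_inode() installs
+ * the rdwr variant as the inode default, while create_read_pipe() and
+ * create_write_pipe() attach the single-direction variants to the two
+ * ends of an anonymous pipe.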
+ */ +const struct file_operations read_pipefifo_fops = { + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = pipe_read, + .write = bad_pipe_w, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .open = pipe_read_open, + .release = pipe_read_release, + .fasync = pipe_read_fasync, +}; + +const struct file_operations write_pipefifo_fops = { + .llseek = no_llseek, + .read = bad_pipe_r, + .write = do_sync_write, + .aio_write = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .open = pipe_write_open, + .release = pipe_write_release, + .fasync = pipe_write_fasync, +}; + +const struct file_operations rdwr_pipefifo_fops = { + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = pipe_read, + .write = do_sync_write, + .aio_write = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .open = pipe_rdwr_open, + .release = pipe_rdwr_release, + .fasync = pipe_rdwr_fasync, +}; + +struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +{ + struct pipe_inode_info *pipe; + + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + pipe->buffers = PIPE_DEF_BUFFERS; + return pipe; + } + kfree(pipe); + } + + return NULL; +} + +void __free_pipe_info(struct pipe_inode_info *pipe) +{ + int i; + + for (i = 0; i < pipe->buffers; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + if (buf->ops) + buf->ops->release(pipe, buf); + } + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe->bufs); + kfree(pipe); +} + +void free_pipe_info(struct inode *inode) +{ + __free_pipe_info(inode->i_pipe); + inode->i_pipe = NULL; +} + +static struct vfsmount *pipe_mnt __read_mostly; + +/* + * pipefs_dname() is called from d_path(). + */ +static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + dentry->d_inode->i_ino); +} + +static const struct dentry_operations pipefs_dentry_operations = { + .d_dname = pipefs_dname, +}; + +static struct inode * get_pipe_inode(void) +{ + struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; + + if (!inode) + goto fail_inode; + + inode->i_ino = get_next_ino(); + + pipe = alloc_pipe_info(inode); + if (!pipe) + goto fail_iput; + inode->i_pipe = pipe; + + pipe->readers = pipe->writers = 1; + inode->i_fop = &rdwr_pipefifo_fops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because "mark_inode_dirty()" will think + * that it already _is_ on the dirty list. 
+ */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + return inode; + +fail_iput: + iput(inode); + +fail_inode: + return NULL; +} + +struct file *create_write_pipe(int flags) +{ + int err; + struct inode *inode; + struct file *f; + struct path path; + struct qstr name = { .name = "" }; + + err = -ENFILE; + inode = get_pipe_inode(); + if (!inode) + goto err; + + err = -ENOMEM; + path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); + if (!path.dentry) + goto err_inode; + path.mnt = mntget(pipe_mnt); + + d_instantiate(path.dentry, inode); + + err = -ENFILE; + f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); + if (!f) + goto err_dentry; + f->f_mapping = inode->i_mapping; + + f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); + f->f_version = 0; + + return f; + + err_dentry: + free_pipe_info(inode); + path_put(&path); + return ERR_PTR(err); + + err_inode: + free_pipe_info(inode); + iput(inode); + err: + return ERR_PTR(err); +} + +void free_write_pipe(struct file *f) +{ + free_pipe_info(f->f_dentry->d_inode); + path_put(&f->f_path); + put_filp(f); +} + +struct file *create_read_pipe(struct file *wrf, int flags) +{ + /* Grab pipe from the writer */ + struct file *f = alloc_file(&wrf->f_path, FMODE_READ, + &read_pipefifo_fops); + if (!f) + return ERR_PTR(-ENFILE); + + path_get(&wrf->f_path); + f->f_flags = O_RDONLY | (flags & O_NONBLOCK); + + return f; +} + +int do_pipe_flags(int *fd, int flags) +{ + struct file *fw, *fr; + int error; + int fdw, fdr; + + if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) + return -EINVAL; + + fw = create_write_pipe(flags); + if (IS_ERR(fw)) + return PTR_ERR(fw); + fr = create_read_pipe(fw, flags); + error = PTR_ERR(fr); + if (IS_ERR(fr)) + goto err_write_pipe; + + error = get_unused_fd_flags(flags); + if (error < 0) + goto err_read_pipe; + fdr = error; + + error = get_unused_fd_flags(flags); + if (error < 0) + goto err_fdr; + fdw = error; + + audit_fd_pair(fdr, fdw); + fd_install(fdr, fr); + fd_install(fdw, fw); + fd[0] = fdr; + fd[1] = fdw; + + return 0; + + err_fdr: + put_unused_fd(fdr); + err_read_pipe: + path_put(&fr->f_path); + put_filp(fr); + err_write_pipe: + free_write_pipe(fw); + return error; +} + +/* + * sys_pipe() is the normal C calling standard for creating + * a pipe. It's not the way Unix traditionally does this, though. + */ +SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) +{ + int fd[2]; + int error; + + error = do_pipe_flags(fd, flags); + if (!error) { + if (copy_to_user(fildes, fd, sizeof(fd))) { + sys_close(fd[0]); + sys_close(fd[1]); + error = -EFAULT; + } + } + return error; +} + +SYSCALL_DEFINE1(pipe, int __user *, fildes) +{ + return sys_pipe2(fildes, 0); +} + +/* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) +{ + struct pipe_buffer *bufs; + + /* + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't + * expect a lot of shrink+grow operations, just free and allocate + * again like we would do for growing. If the pipe currently + * contains more buffers than arg, then return busy. 
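+ * The occupied buffers are copied to the front of the new array: e.g.
+ * with buffers == 16, curbuf == 14 and nrbufs == 4, slots 14-15 are
+ * copied first, then slots 0-1, and curbuf restarts at 0.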
+ */ + if (nr_pages < pipe->nrbufs) + return -EBUSY; + + bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); + if (unlikely(!bufs)) + return -ENOMEM; + + /* + * The pipe array wraps around, so just start the new one at zero + * and adjust the indexes. + */ + if (pipe->nrbufs) { + unsigned int tail; + unsigned int head; + + tail = pipe->curbuf + pipe->nrbufs; + if (tail < pipe->buffers) + tail = 0; + else + tail &= (pipe->buffers - 1); + + head = pipe->nrbufs - tail; + if (head) + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); + if (tail) + memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); + } + + pipe->curbuf = 0; + kfree(pipe->bufs); + pipe->bufs = bufs; + pipe->buffers = nr_pages; + return nr_pages * PAGE_SIZE; +} + +/* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * will return an error. + */ +int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + if (ret < 0 || !write) + return ret; + + pipe_max_size = round_pipe_size(pipe_max_size); + return ret; +} + +/* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +struct pipe_inode_info *get_pipe_info(struct file *file) +{ + struct inode *i = file->f_path.dentry->d_inode; + + return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; +} + +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe; + long ret; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + mutex_lock(&pipe->inode->i_mutex); + + switch (cmd) { + case F_SETPIPE_SZ: { + unsigned int size, nr_pages; + + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; + + ret = -EINVAL; + if (!nr_pages) + goto out; + + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { + ret = -EPERM; + goto out; + } + ret = pipe_set_size(pipe, nr_pages); + break; + } + case F_GETPIPE_SZ: + ret = pipe->buffers * PAGE_SIZE; + break; + default: + ret = -EINVAL; + break; + } + +out: + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +static const struct super_operations pipefs_ops = { + .destroy_inode = free_inode_nonrcu, +}; + +/* + * pipefs should _never_ be mounted by userland - too much of security hassle, + * no real gain from having the whole whorehouse mounted. So we don't need + * any operations on the root directory. However, we need a non-trivial + * d_name - pipe: will go nicely and kill the special-casing in procfs. 
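+ * (pipefs_dname() above formats these dentries as "pipe:[<inode>]", which
+ * is what readlink on /proc/<pid>/fd entries for pipes reports)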
+ */ +static struct dentry *pipefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "pipe:", &pipefs_ops, + &pipefs_dentry_operations, PIPEFS_MAGIC); +} + +static struct file_system_type pipe_fs_type = { + .name = "pipefs", + .mount = pipefs_mount, + .kill_sb = kill_anon_super, +}; + +static int __init init_pipe_fs(void) +{ + int err = register_filesystem(&pipe_fs_type); + + if (!err) { + pipe_mnt = kern_mount(&pipe_fs_type); + if (IS_ERR(pipe_mnt)) { + err = PTR_ERR(pipe_mnt); + unregister_filesystem(&pipe_fs_type); + } + } + return err; +} + +static void __exit exit_pipe_fs(void) +{ + unregister_filesystem(&pipe_fs_type); + mntput(pipe_mnt); +} + +fs_initcall(init_pipe_fs); +module_exit(exit_pipe_fs); diff --git a/fs/pnode.c b/fs/pnode.c new file mode 100644 index 00000000..d42514e3 --- /dev/null +++ b/fs/pnode.c @@ -0,0 +1,371 @@ +/* + * linux/fs/pnode.c + * + * (C) Copyright IBM Corporation 2005. + * Released under GPL v2. + * Author : Ram Pai (linuxram@us.ibm.com) + * + */ +#include +#include +#include +#include "internal.h" +#include "pnode.h" + +/* return the next shared peer mount of @p */ +static inline struct vfsmount *next_peer(struct vfsmount *p) +{ + return list_entry(p->mnt_share.next, struct vfsmount, mnt_share); +} + +static inline struct vfsmount *first_slave(struct vfsmount *p) +{ + return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave); +} + +static inline struct vfsmount *next_slave(struct vfsmount *p) +{ + return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave); +} + +/* + * Return true if path is reachable from root + * + * namespace_sem is held, and mnt is attached + */ +static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry, + const struct path *root) +{ + while (mnt != root->mnt && mnt->mnt_parent != mnt) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return mnt == root->mnt && is_subdir(dentry, root->dentry); +} + +static struct vfsmount *get_peer_under_root(struct vfsmount *mnt, + struct mnt_namespace *ns, + const struct path *root) +{ + struct vfsmount *m = mnt; + + do { + /* Check the namespace first for optimization */ + if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root)) + return m; + + m = next_peer(m); + } while (m != mnt); + + return NULL; +} + +/* + * Get ID of closest dominating peer group having a representative + * under the given root. + * + * Caller must hold namespace_sem + */ +int get_dominating_id(struct vfsmount *mnt, const struct path *root) +{ + struct vfsmount *m; + + for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { + struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root); + if (d) + return d->mnt_group_id; + } + + return 0; +} + +static int do_make_slave(struct vfsmount *mnt) +{ + struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master; + struct vfsmount *slave_mnt; + + /* + * slave 'mnt' to a peer mount that has the + * same root dentry. If none is available then + * slave it to anything that is available. 
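+ * This is what eventually runs for "mount --make-slave /mnt": + * change_mnt_propagation() below calls do_make_slave() for every type + * except MS_SHARED, and then drops the slave linkage again unless the + * requested type really was MS_SLAVE.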
+ */ + while ((peer_mnt = next_peer(peer_mnt)) != mnt && + peer_mnt->mnt_root != mnt->mnt_root) ; + + if (peer_mnt == mnt) { + peer_mnt = next_peer(mnt); + if (peer_mnt == mnt) + peer_mnt = NULL; + } + if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + mnt_release_group_id(mnt); + + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; + + if (peer_mnt) + master = peer_mnt; + + if (master) { + list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) + slave_mnt->mnt_master = master; + list_move(&mnt->mnt_slave, &master->mnt_slave_list); + list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); + INIT_LIST_HEAD(&mnt->mnt_slave_list); + } else { + struct list_head *p = &mnt->mnt_slave_list; + while (!list_empty(p)) { + slave_mnt = list_first_entry(p, + struct vfsmount, mnt_slave); + list_del_init(&slave_mnt->mnt_slave); + slave_mnt->mnt_master = NULL; + } + } + mnt->mnt_master = master; + CLEAR_MNT_SHARED(mnt); + return 0; +} + +/* + * vfsmount lock must be held for write + */ +void change_mnt_propagation(struct vfsmount *mnt, int type) +{ + if (type == MS_SHARED) { + set_mnt_shared(mnt); + return; + } + do_make_slave(mnt); + if (type != MS_SLAVE) { + list_del_init(&mnt->mnt_slave); + mnt->mnt_master = NULL; + if (type == MS_UNBINDABLE) + mnt->mnt_flags |= MNT_UNBINDABLE; + else + mnt->mnt_flags &= ~MNT_UNBINDABLE; + } +} + +/* + * get the next mount in the propagation tree. + * @m: the mount seen last + * @origin: the original mount from where the tree walk initiated + * + * Note that peer groups form contiguous segments of slave lists. + * We rely on that in get_source() to be able to find out if + * vfsmount found while iterating with propagation_next() is + * a peer of one we'd found earlier. + */ +static struct vfsmount *propagation_next(struct vfsmount *m, + struct vfsmount *origin) +{ + /* are there any slaves of this mount? */ + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + return first_slave(m); + + while (1) { + struct vfsmount *next; + struct vfsmount *master = m->mnt_master; + + if (master == origin->mnt_master) { + next = next_peer(m); + return ((next == origin) ? NULL : next); + } else if (m->mnt_slave.next != &master->mnt_slave_list) + return next_slave(m); + + /* back at master */ + m = master; + } +} + +/* + * return the source mount to be used for cloning + * + * @dest the current destination mount + * @last_dest the last seen destination mount + * @last_src the last seen source mount + * @type return CL_SLAVE if the new mount has to be + * cloned as a slave. + */ +static struct vfsmount *get_source(struct vfsmount *dest, + struct vfsmount *last_dest, + struct vfsmount *last_src, + int *type) +{ + struct vfsmount *p_last_src = NULL; + struct vfsmount *p_last_dest = NULL; + + while (last_dest != dest->mnt_master) { + p_last_dest = last_dest; + p_last_src = last_src; + last_dest = last_dest->mnt_master; + last_src = last_src->mnt_master; + } + + if (p_last_dest) { + do { + p_last_dest = next_peer(p_last_dest); + } while (IS_MNT_NEW(p_last_dest)); + /* is that a peer of the earlier? */ + if (dest == p_last_dest) { + *type = CL_MAKE_SHARED; + return p_last_src; + } + } + /* slave of the earlier, then */ + *type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(dest)) + *type |= CL_MAKE_SHARED; + return last_src; +} + +/* + * mount 'source_mnt' under the destination 'dest_mnt' at + * dentry 'dest_dentry'. And propagate that mount to + * all the peer and slave mounts of 'dest_mnt'. 
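+ * (For instance, when /mnt is shared and a filesystem is later mounted + * on /mnt/foo, a copy of that mount appears under every peer and slave + * of /mnt as well.)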
+ * Link all the new mounts into a propagation tree headed at + * source_mnt. Also link all the new mounts using ->mnt_list + * headed at source_mnt's ->mnt_list + * + * @dest_mnt: destination mount. + * @dest_dentry: destination dentry. + * @source_mnt: source mount. + * @tree_list : list of heads of trees to be attached. + */ +int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry, + struct vfsmount *source_mnt, struct list_head *tree_list) +{ + struct vfsmount *m, *child; + int ret = 0; + struct vfsmount *prev_dest_mnt = dest_mnt; + struct vfsmount *prev_src_mnt = source_mnt; + LIST_HEAD(tmp_list); + LIST_HEAD(umount_list); + + for (m = propagation_next(dest_mnt, dest_mnt); m; + m = propagation_next(m, dest_mnt)) { + int type; + struct vfsmount *source; + + if (IS_MNT_NEW(m)) + continue; + + source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); + + if (!(child = copy_tree(source, source->mnt_root, type))) { + ret = -ENOMEM; + list_splice(tree_list, tmp_list.prev); + goto out; + } + + if (is_subdir(dest_dentry, m->mnt_root)) { + mnt_set_mountpoint(m, dest_dentry, child); + list_add_tail(&child->mnt_hash, tree_list); + } else { + /* + * This can happen if the parent mount was bind mounted + * on some subdirectory of a shared/slave mount. + */ + list_add_tail(&child->mnt_hash, &tmp_list); + } + prev_dest_mnt = m; + prev_src_mnt = child; + } +out: + br_write_lock(vfsmount_lock); + while (!list_empty(&tmp_list)) { + child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); + umount_tree(child, 0, &umount_list); + } + br_write_unlock(vfsmount_lock); + release_mounts(&umount_list); + return ret; +} + +/* + * return true if the refcount is greater than count + */ +static inline int do_refcount_check(struct vfsmount *mnt, int count) +{ + int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts; + return (mycount > count); +} + +/* + * check if the mount 'mnt' can be unmounted successfully. + * @mnt: the mount to be checked for unmount + * NOTE: unmounting 'mnt' would naturally propagate to all + * other mounts its parent propagates to. + * Check if any of these mounts that **do not have submounts** + * have more references than 'refcnt'. If so return busy. + * + * vfsmount lock must be held for write + */ +int propagate_mount_busy(struct vfsmount *mnt, int refcnt) +{ + struct vfsmount *m, *child; + struct vfsmount *parent = mnt->mnt_parent; + int ret = 0; + + if (mnt == parent) + return do_refcount_check(mnt, refcnt); + + /* + * quickly check if the current mount can be unmounted. + * If not, we don't have to go checking for all other + * mounts + */ + if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) + return 1; + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + child = __lookup_mnt(m, mnt->mnt_mountpoint, 0); + if (child && list_empty(&child->mnt_mounts) && + (ret = do_refcount_check(child, 1))) + break; + } + return ret; +} + +/* + * NOTE: unmounting 'mnt' naturally propagates to all other mounts its + * parent propagates to. 
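+ * For example, umounting /mnt/foo under a shared /mnt also tears down + * the corresponding child mounts under every peer and slave of /mnt, + * provided those children carry no submounts of their own.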
+ */ +static void __propagate_umount(struct vfsmount *mnt) +{ + struct vfsmount *parent = mnt->mnt_parent; + struct vfsmount *m; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + + struct vfsmount *child = __lookup_mnt(m, + mnt->mnt_mountpoint, 0); + /* + * umount the child only if the child has no + * other children + */ + if (child && list_empty(&child->mnt_mounts)) + list_move_tail(&child->mnt_hash, &mnt->mnt_hash); + } +} + +/* + * collect all mounts that receive propagation from the mount in @list, + * and return these additional mounts in the same list. + * @list: the list of mounts to be unmounted. + * + * vfsmount lock must be held for write + */ +int propagate_umount(struct list_head *list) +{ + struct vfsmount *mnt; + + list_for_each_entry(mnt, list, mnt_hash) + __propagate_umount(mnt); + return 0; +} diff --git a/fs/pnode.h b/fs/pnode.h new file mode 100644 index 00000000..1ea4ae1e --- /dev/null +++ b/fs/pnode.h @@ -0,0 +1,39 @@ +/* + * linux/fs/pnode.h + * + * (C) Copyright IBM Corporation 2005. + * Released under GPL v2. + * + */ +#ifndef _LINUX_PNODE_H +#define _LINUX_PNODE_H + +#include +#include + +#define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED) +#define IS_MNT_SLAVE(mnt) (mnt->mnt_master) +#define IS_MNT_NEW(mnt) (!mnt->mnt_ns) +#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED) +#define IS_MNT_UNBINDABLE(mnt) (mnt->mnt_flags & MNT_UNBINDABLE) + +#define CL_EXPIRE 0x01 +#define CL_SLAVE 0x02 +#define CL_COPY_ALL 0x04 +#define CL_MAKE_SHARED 0x08 +#define CL_PRIVATE 0x10 + +static inline void set_mnt_shared(struct vfsmount *mnt) +{ + mnt->mnt_flags &= ~MNT_SHARED_MASK; + mnt->mnt_flags |= MNT_SHARED; +} + +void change_mnt_propagation(struct vfsmount *, int); +int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *, + struct list_head *); +int propagate_umount(struct list_head *); +int propagate_mount_busy(struct vfsmount *, int); +void mnt_release_group_id(struct vfsmount *); +int get_dominating_id(struct vfsmount *mnt, const struct path *root); +#endif /* _LINUX_PNODE_H */ diff --git a/fs/posix_acl.c b/fs/posix_acl.c new file mode 100644 index 00000000..b1cf6bf4 --- /dev/null +++ b/fs/posix_acl.c @@ -0,0 +1,388 @@ +/* + * linux/fs/posix_acl.c + * + * Copyright (C) 2002 by Andreas Gruenbacher + * + * Fixes from William Schumacher incorporated on 15 March 2001. + * (Reported by Charles Bertsch, ). + */ + +/* + * This file contains generic functions for manipulating + * POSIX 1003.1e draft standard 17 ACLs. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(posix_acl_init); +EXPORT_SYMBOL(posix_acl_alloc); +EXPORT_SYMBOL(posix_acl_clone); +EXPORT_SYMBOL(posix_acl_valid); +EXPORT_SYMBOL(posix_acl_equiv_mode); +EXPORT_SYMBOL(posix_acl_from_mode); +EXPORT_SYMBOL(posix_acl_create_masq); +EXPORT_SYMBOL(posix_acl_chmod_masq); +EXPORT_SYMBOL(posix_acl_permission); + +/* + * Init a fresh posix_acl + */ +void +posix_acl_init(struct posix_acl *acl, int count) +{ + atomic_set(&acl->a_refcount, 1); + acl->a_count = count; +} + +/* + * Allocate a new ACL with the specified number of entries. + */ +struct posix_acl * +posix_acl_alloc(int count, gfp_t flags) +{ + const size_t size = sizeof(struct posix_acl) + + count * sizeof(struct posix_acl_entry); + struct posix_acl *acl = kmalloc(size, flags); + if (acl) + posix_acl_init(acl, count); + return acl; +} + +/* + * Clone an ACL. 
+ */ +struct posix_acl * +posix_acl_clone(const struct posix_acl *acl, gfp_t flags) +{ + struct posix_acl *clone = NULL; + + if (acl) { + int size = sizeof(struct posix_acl) + acl->a_count * + sizeof(struct posix_acl_entry); + clone = kmemdup(acl, size, flags); + if (clone) + atomic_set(&clone->a_refcount, 1); + } + return clone; +} + +/* + * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. + */ +int +posix_acl_valid(const struct posix_acl *acl) +{ + const struct posix_acl_entry *pa, *pe; + int state = ACL_USER_OBJ; + unsigned int id = 0; /* keep gcc happy */ + int needs_mask = 0; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + if (pa->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) + return -EINVAL; + switch (pa->e_tag) { + case ACL_USER_OBJ: + if (state == ACL_USER_OBJ) { + id = 0; + state = ACL_USER; + break; + } + return -EINVAL; + + case ACL_USER: + if (state != ACL_USER) + return -EINVAL; + if (pa->e_id == ACL_UNDEFINED_ID || + pa->e_id < id) + return -EINVAL; + id = pa->e_id + 1; + needs_mask = 1; + break; + + case ACL_GROUP_OBJ: + if (state == ACL_USER) { + id = 0; + state = ACL_GROUP; + break; + } + return -EINVAL; + + case ACL_GROUP: + if (state != ACL_GROUP) + return -EINVAL; + if (pa->e_id == ACL_UNDEFINED_ID || + pa->e_id < id) + return -EINVAL; + id = pa->e_id + 1; + needs_mask = 1; + break; + + case ACL_MASK: + if (state != ACL_GROUP) + return -EINVAL; + state = ACL_OTHER; + break; + + case ACL_OTHER: + if (state == ACL_OTHER || + (state == ACL_GROUP && !needs_mask)) { + state = 0; + break; + } + return -EINVAL; + + default: + return -EINVAL; + } + } + if (state == 0) + return 0; + return -EINVAL; +} + +/* + * Returns 0 if the acl can be exactly represented in the traditional + * file mode permission bits, or else 1. Returns -E... on error. + */ +int +posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) +{ + const struct posix_acl_entry *pa, *pe; + mode_t mode = 0; + int not_equiv = 0; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + mode |= (pa->e_perm & S_IRWXO) << 6; + break; + case ACL_GROUP_OBJ: + mode |= (pa->e_perm & S_IRWXO) << 3; + break; + case ACL_OTHER: + mode |= pa->e_perm & S_IRWXO; + break; + case ACL_MASK: + mode = (mode & ~S_IRWXG) | + ((pa->e_perm & S_IRWXO) << 3); + not_equiv = 1; + break; + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + default: + return -EINVAL; + } + } + if (mode_p) + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} + +/* + * Create an ACL representing the file mode permission bits of an inode. + */ +struct posix_acl * +posix_acl_from_mode(mode_t mode, gfp_t flags) +{ + struct posix_acl *acl = posix_acl_alloc(3, flags); + if (!acl) + return ERR_PTR(-ENOMEM); + + acl->a_entries[0].e_tag = ACL_USER_OBJ; + acl->a_entries[0].e_id = ACL_UNDEFINED_ID; + acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6; + + acl->a_entries[1].e_tag = ACL_GROUP_OBJ; + acl->a_entries[1].e_id = ACL_UNDEFINED_ID; + acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3; + + acl->a_entries[2].e_tag = ACL_OTHER; + acl->a_entries[2].e_id = ACL_UNDEFINED_ID; + acl->a_entries[2].e_perm = (mode & S_IRWXO); + return acl; +} + +/* + * Return 0 if current is granted want access to the inode + * by the acl. Returns -E... otherwise. 
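+ * For example, with an ACL of "user::rw- user:joe:rw- group::r-- + * mask::r-- other::---", joe ends up with read access only: his named + * user entry matches, but its permissions are filtered through the + * ACL_MASK entry below.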
+ */ +int +posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) +{ + const struct posix_acl_entry *pa, *pe, *mask_obj; + int found = 0; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + /* (May have been checked already) */ + if (inode->i_uid == current_fsuid()) + goto check_perm; + break; + case ACL_USER: + if (pa->e_id == current_fsuid()) + goto mask; + break; + case ACL_GROUP_OBJ: + if (in_group_p(inode->i_gid)) { + found = 1; + if ((pa->e_perm & want) == want) + goto mask; + } + break; + case ACL_GROUP: + if (in_group_p(pa->e_id)) { + found = 1; + if ((pa->e_perm & want) == want) + goto mask; + } + break; + case ACL_MASK: + break; + case ACL_OTHER: + if (found) + return -EACCES; + else + goto check_perm; + default: + return -EIO; + } + } + return -EIO; + +mask: + for (mask_obj = pa+1; mask_obj != pe; mask_obj++) { + if (mask_obj->e_tag == ACL_MASK) { + if ((pa->e_perm & mask_obj->e_perm & want) == want) + return 0; + return -EACCES; + } + } + +check_perm: + if ((pa->e_perm & want) == want) + return 0; + return -EACCES; +} + +/* + * Modify acl when creating a new inode. The caller must ensure the acl is + * only referenced once. + * + * mode_p initially must contain the mode parameter to the open() / creat() + * system calls. All permissions that are not granted by the acl are removed. + * The permissions in the acl are changed to reflect the mode_p parameter. + */ +int +posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) +{ + struct posix_acl_entry *pa, *pe; + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + mode_t mode = *mode_p; + int not_equiv = 0; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm &= (mode >> 6) | ~S_IRWXO; + mode &= (pa->e_perm << 6) | ~S_IRWXU; + break; + + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm &= mode | ~S_IRWXO; + mode &= pa->e_perm | ~S_IRWXO; + break; + + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (group_obj->e_perm << 3) | ~S_IRWXG; + } + + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} + +/* + * Modify the ACL for the chmod syscall. 
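+ * For example, chmod 0750 rewrites the ACL_USER_OBJ entry to rwx, the + * ACL_MASK entry (or, if no mask is present, the ACL_GROUP_OBJ entry) to + * r-x, and the ACL_OTHER entry to ---; named user and group entries are + * left untouched.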
+ */ +int +posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) +{ + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + struct posix_acl_entry *pa, *pe; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm = (mode & S_IRWXU) >> 6; + break; + + case ACL_USER: + case ACL_GROUP: + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_MASK: + mask_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm = (mode & S_IRWXO); + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm = (mode & S_IRWXG) >> 3; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm = (mode & S_IRWXG) >> 3; + } + + return 0; +} diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig new file mode 100644 index 00000000..15af6222 --- /dev/null +++ b/fs/proc/Kconfig @@ -0,0 +1,69 @@ +config PROC_FS + bool "/proc file system support" if EXPERT + default y + help + This is a virtual file system providing information about the status + of the system. "Virtual" means that it doesn't take up any space on + your hard disk: the files are created on the fly by the kernel when + you try to access them. Also, you cannot read the files with older + version of the program less: you need to use more or cat. + + It's totally cool; for example, "cat /proc/interrupts" gives + information about what the different IRQs are used for at the moment + (there is a small number of Interrupt ReQuest lines in your computer + that are used by the attached devices to gain the CPU's attention -- + often a source of trouble if two devices are mistakenly configured + to use the same IRQ). The program procinfo to display some + information about your system gathered from the /proc file system. + + Before you can use the /proc file system, it has to be mounted, + meaning it has to be given a location in the directory hierarchy. + That location should be /proc. A command such as "mount -t proc proc + /proc" or the equivalent line in /etc/fstab does the job. + + The /proc file system is explained in the file + and on the proc(5) manpage + ("man 5 proc"). + + This option will enlarge your kernel by about 67 KB. Several + programs depend on this, so everyone should say Y here. + +config PROC_KCORE + bool "/proc/kcore support" if !ARM + depends on PROC_FS && MMU + +config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP + default y + help + Exports the dump image of crashed kernel in ELF format. + +config PROC_SYSCTL + bool "Sysctl support (/proc/sys)" if EXPERT + depends on PROC_FS + select SYSCTL + default y + ---help--- + The sysctl interface provides a means of dynamically changing + certain kernel parameters and variables on the fly without requiring + a recompile of the kernel or reboot of the system. The primary + interface is through /proc/sys. If you say Y here a tree of + modifiable sysctl entries will be generated beneath the + /proc/sys directory. They are explained in the files + in . Note that enabling this + option will enlarge the kernel by at least 8 KB. + + As it is generally a good thing, you should say Y here unless + building a kernel for install/rescue disks or your system is very + limited in memory. 
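+ + For example, "echo 1 > /proc/sys/net/ipv4/ip_forward" enables IP + forwarding at run time, without rebuilding the kernel or rebooting.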
+ +config PROC_PAGE_MONITOR + default y + depends on PROC_FS && MMU + bool "Enable /proc page monitoring" if EXPERT + help + Various /proc files exist to monitor process memory utilization: + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. diff --git a/fs/proc/Makefile b/fs/proc/Makefile new file mode 100644 index 00000000..c1c72933 --- /dev/null +++ b/fs/proc/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for the Linux proc filesystem routines. +# + +obj-y += proc.o + +proc-y := nommu.o task_nommu.o +proc-$(CONFIG_MMU) := mmu.o task_mmu.o + +proc-y += inode.o root.o base.o generic.o array.o \ + proc_tty.o +proc-y += cmdline.o +proc-y += consoles.o +proc-y += cpuinfo.o +proc-y += devices.o +proc-y += interrupts.o +proc-y += loadavg.o +proc-y += meminfo.o +proc-y += stat.o +proc-y += uptime.o +proc-y += version.o +proc-y += softirqs.o +proc-y += namespaces.o +proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o +proc-$(CONFIG_NET) += proc_net.o +proc-$(CONFIG_PROC_KCORE) += kcore.o +proc-$(CONFIG_PROC_VMCORE) += vmcore.o +proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o +proc-$(CONFIG_PRINTK) += kmsg.o +proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o diff --git a/fs/proc/array.c b/fs/proc/array.c new file mode 100644 index 00000000..9b45ee84 --- /dev/null +++ b/fs/proc/array.c @@ -0,0 +1,546 @@ +/* + * linux/fs/proc/array.c + * + * Copyright (C) 1992 by Linus Torvalds + * based on ideas by Darren Senn + * + * Fixes: + * Michael. K. Johnson: stat,statm extensions. + * + * + * Pauline Middelink : Made cmdline,envline only break at '\0's, to + * make sure SET_PROCTITLE works. Also removed + * bad '!' which forced address recalculation for + * EVERY character on the current page. + * + * + * Danny ter Haar : added cpuinfo + * + * + * Alessandro Rubini : profile extension. + * + * + * Jeff Tranter : added BogoMips field to cpuinfo + * + * + * Bruno Haible : remove 4K limit for the maps file + * + * + * Yves Arrouye : remove removal of trailing spaces in get_array. + * + * + * Jerome Forissier : added per-CPU time information to /proc/stat + * and /proc//cpu extension + * + * - Incorporation and non-SMP safe operation + * of forissier patch in 2.1.78 by + * Hans Marcus + * + * aeb@cwi.nl : /proc/partitions + * + * + * Alan Cox : security fixes. + * + * + * Al Viro : safe handling of mm_struct + * + * Gerhard Wichert : added BIGMEM support + * Siemens AG + * + * Al Viro & Jeff Garzik : moved most of the thing into base.c and + * : proc_misc.c. The rest may eventually go into + * : base.c too. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "internal.h" + +static inline void task_name(struct seq_file *m, struct task_struct *p) +{ + int i; + char *buf, *end; + char *name; + char tcomm[sizeof(p->comm)]; + + get_task_comm(tcomm, p); + + seq_puts(m, "Name:\t"); + end = m->buf + m->size; + buf = m->buf + m->count; + name = tcomm; + i = sizeof(tcomm); + while (i && (buf < end)) { + unsigned char c = *name; + name++; + i--; + *buf = c; + if (!c) + break; + if (c == '\\') { + buf++; + if (buf < end) + *buf++ = c; + continue; + } + if (c == '\n') { + *buf++ = '\\'; + if (buf < end) + *buf++ = 'n'; + continue; + } + buf++; + } + m->count = buf - m->buf; + seq_putc(m, '\n'); +} + +/* + * The task state array is a strange "bitmap" of + * reasons to sleep. Thus "running" is zero, and + * you can test for combinations of others with + * simple bit tests. + */ +static const char * const task_state_array[] = { + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "Z (zombie)", /* 16 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ + "K (wakekill)", /* 128 */ + "W (waking)", /* 256 */ +}; + +static inline const char *get_task_state(struct task_struct *tsk) +{ + unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; + const char * const *p = &task_state_array[0]; + + BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + + while (state) { + p++; + state >>= 1; + } + return *p; +} + +static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *p) +{ + struct group_info *group_info; + int g; + struct fdtable *fdt = NULL; + const struct cred *cred; + pid_t ppid, tpid; + + rcu_read_lock(); + ppid = pid_alive(p) ? + task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; + tpid = 0; + if (pid_alive(p)) { + struct task_struct *tracer = tracehook_tracer_task(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); + } + cred = get_task_cred(p); + seq_printf(m, + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), + task_tgid_nr_ns(p, ns), + pid_nr_ns(pid, ns), + ppid, tpid, + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); + + task_lock(p); + if (p->files) + fdt = files_fdtable(p->files); + seq_printf(m, + "FDSize:\t%d\n" + "Groups:\t", + fdt ? 
fdt->max_fds : 0); + rcu_read_unlock(); + + group_info = cred->group_info; + task_unlock(p); + + for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) + seq_printf(m, "%d ", GROUP_AT(group_info, g)); + put_cred(cred); + + seq_putc(m, '\n'); +} + +static void render_sigset_t(struct seq_file *m, const char *header, + sigset_t *set) +{ + int i; + + seq_puts(m, header); + + i = _NSIG; + do { + int x = 0; + + i -= 4; + if (sigismember(set, i+1)) x |= 1; + if (sigismember(set, i+2)) x |= 2; + if (sigismember(set, i+3)) x |= 4; + if (sigismember(set, i+4)) x |= 8; + seq_printf(m, "%x", x); + } while (i >= 4); + + seq_putc(m, '\n'); +} + +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, + sigset_t *catch) +{ + struct k_sigaction *k; + int i; + + k = p->sighand->action; + for (i = 1; i <= _NSIG; ++i, ++k) { + if (k->sa.sa_handler == SIG_IGN) + sigaddset(ign, i); + else if (k->sa.sa_handler != SIG_DFL) + sigaddset(catch, i); + } +} + +static inline void task_sig(struct seq_file *m, struct task_struct *p) +{ + unsigned long flags; + sigset_t pending, shpending, blocked, ignored, caught; + int num_threads = 0; + unsigned long qsize = 0; + unsigned long qlim = 0; + + sigemptyset(&pending); + sigemptyset(&shpending); + sigemptyset(&blocked); + sigemptyset(&ignored); + sigemptyset(&caught); + + if (lock_task_sighand(p, &flags)) { + pending = p->pending.signal; + shpending = p->signal->shared_pending.signal; + blocked = p->blocked; + collect_sigign_sigcatch(p, &ignored, &caught); + num_threads = get_nr_threads(p); + rcu_read_lock(); /* FIXME: is this correct? */ + qsize = atomic_read(&__task_cred(p)->user->sigpending); + rcu_read_unlock(); + qlim = task_rlimit(p, RLIMIT_SIGPENDING); + unlock_task_sighand(p, &flags); + } + + seq_printf(m, "Threads:\t%d\n", num_threads); + seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); + + /* render them all */ + render_sigset_t(m, "SigPnd:\t", &pending); + render_sigset_t(m, "ShdPnd:\t", &shpending); + render_sigset_t(m, "SigBlk:\t", &blocked); + render_sigset_t(m, "SigIgn:\t", &ignored); + render_sigset_t(m, "SigCgt:\t", &caught); +} + +static void render_cap_t(struct seq_file *m, const char *header, + kernel_cap_t *a) +{ + unsigned __capi; + + seq_puts(m, header); + CAP_FOR_EACH_U32(__capi) { + seq_printf(m, "%08x", + a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); + } + seq_putc(m, '\n'); +} + +static inline void task_cap(struct seq_file *m, struct task_struct *p) +{ + const struct cred *cred; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + + rcu_read_lock(); + cred = __task_cred(p); + cap_inheritable = cred->cap_inheritable; + cap_permitted = cred->cap_permitted; + cap_effective = cred->cap_effective; + cap_bset = cred->cap_bset; + rcu_read_unlock(); + + render_cap_t(m, "CapInh:\t", &cap_inheritable); + render_cap_t(m, "CapPrm:\t", &cap_permitted); + render_cap_t(m, "CapEff:\t", &cap_effective); + render_cap_t(m, "CapBnd:\t", &cap_bset); +} + +static inline void task_context_switch_counts(struct seq_file *m, + struct task_struct *p) +{ + seq_printf(m, "voluntary_ctxt_switches:\t%lu\n" + "nonvoluntary_ctxt_switches:\t%lu\n", + p->nvcsw, + p->nivcsw); +} + +static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) +{ + seq_puts(m, "Cpus_allowed:\t"); + seq_cpumask(m, &task->cpus_allowed); + seq_putc(m, '\n'); + seq_puts(m, "Cpus_allowed_list:\t"); + seq_cpumask_list(m, &task->cpus_allowed); + seq_putc(m, '\n'); +} + +int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid 
*pid, struct task_struct *task) +{ + struct mm_struct *mm = get_task_mm(task); + + task_name(m, task); + task_state(m, ns, pid, task); + + if (mm) { + task_mem(m, mm); + mmput(mm); + } + task_sig(m, task); + task_cap(m, task); + task_cpus_allowed(m, task); + cpuset_task_status_allowed(m, task); + task_context_switch_counts(m, task); + return 0; +} + +static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task, int whole) +{ + unsigned long vsize, eip, esp, wchan = ~0UL; + long priority, nice; + int tty_pgrp = -1, tty_nr = 0; + sigset_t sigign, sigcatch; + char state; + pid_t ppid = 0, pgid = -1, sid = -1; + int num_threads = 0; + int permitted; + struct mm_struct *mm; + unsigned long long start_time; + unsigned long cmin_flt = 0, cmaj_flt = 0; + unsigned long min_flt = 0, maj_flt = 0; + cputime_t cutime, cstime, utime, stime; + cputime_t cgtime, gtime; + unsigned long rsslim = 0; + char tcomm[sizeof(task->comm)]; + unsigned long flags; + + state = *get_task_state(task); + vsize = eip = esp = 0; + permitted = ptrace_may_access(task, PTRACE_MODE_READ); + mm = get_task_mm(task); + if (mm) { + vsize = task_vsize(mm); + if (permitted) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } + } + + get_task_comm(tcomm, task); + + sigemptyset(&sigign); + sigemptyset(&sigcatch); + cutime = cstime = utime = stime = cputime_zero; + cgtime = gtime = cputime_zero; + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + if (sig->tty) { + struct pid *pgrp = tty_get_pgrp(sig->tty); + tty_pgrp = pid_nr_ns(pgrp, ns); + put_pid(pgrp); + tty_nr = new_encode_dev(tty_devnum(sig->tty)); + } + + num_threads = get_nr_threads(task); + collect_sigign_sigcatch(task, &sigign, &sigcatch); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); + + /* add up live thread stats at the group level */ + if (whole) { + struct task_struct *t = task; + do { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime = cputime_add(gtime, t->gtime); + t = next_thread(t); + } while (t != task); + + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; + thread_group_times(task, &utime, &stime); + gtime = cputime_add(gtime, sig->gtime); + } + + sid = task_session_nr_ns(task, ns); + ppid = task_tgid_nr_ns(task->real_parent, ns); + pgid = task_pgrp_nr_ns(task, ns); + + unlock_task_sighand(task, &flags); + } + + if (permitted && (!whole || num_threads < 2)) + wchan = get_wchan(task); + if (!whole) { + min_flt = task->min_flt; + maj_flt = task->maj_flt; + task_times(task, &utime, &stime); + gtime = task->gtime; + } + + /* scale priority and nice values from timeslices to -20..20 */ + /* to make it look like a "normal" Unix priority/nice value */ + priority = task_prio(task); + nice = task_nice(task); + + /* Temporary variable needed for gcc-2.96 */ + /* convert timespec -> nsec*/ + start_time = + (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC + + task->real_start_time.tv_nsec; + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); + + seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ +%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", + pid_nr_ns(pid, ns), + tcomm, + state, + ppid, + pgid, + sid, + tty_nr, + tty_pgrp, + task->flags, + min_flt, + cmin_flt, + maj_flt, + cmaj_flt, + 
cputime_to_clock_t(utime), + cputime_to_clock_t(stime), + cputime_to_clock_t(cutime), + cputime_to_clock_t(cstime), + priority, + nice, + num_threads, + start_time, + vsize, + mm ? get_mm_rss(mm) : 0, + rsslim, + mm ? (permitted ? mm->start_code : 1) : 0, + mm ? (permitted ? mm->end_code : 1) : 0, + (permitted && mm) ? mm->start_stack : 0, + esp, + eip, + /* The signal information here is obsolete. + * It must be decimal for Linux 2.0 compatibility. + * Use /proc/#/status for real-time signals. + */ + task->pending.signal.sig[0] & 0x7fffffffUL, + task->blocked.sig[0] & 0x7fffffffUL, + sigign .sig[0] & 0x7fffffffUL, + sigcatch .sig[0] & 0x7fffffffUL, + wchan, + 0UL, + 0UL, + task->exit_signal, + task_cpu(task), + task->rt_priority, + task->policy, + (unsigned long long)delayacct_blkio_ticks(task), + cputime_to_clock_t(gtime), + cputime_to_clock_t(cgtime)); + if (mm) + mmput(mm); + return 0; +} + +int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 0); +} + +int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 1); +} + +int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; + struct mm_struct *mm = get_task_mm(task); + + if (mm) { + size = task_statm(mm, &shared, &text, &data, &resident); + mmput(mm); + } + seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + size, resident, shared, text, data); + + return 0; +} diff --git a/fs/proc/base.c b/fs/proc/base.c new file mode 100644 index 00000000..bfa13ac6 --- /dev/null +++ b/fs/proc/base.c @@ -0,0 +1,3429 @@ +/* + * linux/fs/proc/base.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc base directory handling functions + * + * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. + * Instead of using magical inumbers to determine the kind of object + * we allocate and fill in-core inodes upon lookup. They don't even + * go into icache. We cache the reference to task_struct upon lookup too. + * Eventually it should become a filesystem in its own. We don't use the + * rest of procfs anymore. + * + * + * Changelog: + * 17-Jan-2005 + * Allan Bezerra + * Bruna Moreira + * Edjard Mota + * Ilias Biris + * Mauricio Lin + * + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * + * A new process specific entry (smaps) included in /proc. It shows the + * size of rss for each memory area. The maps entry lacks information + * about physical memory size (rss) for each mapped file, i.e., + * rss information for executables and library files. + * This additional information is useful for any tools that need to know + * about physical memory consumption for a process specific library. + * + * Changelog: + * 21-Feb-2005 + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * Pud inclusion in the page table walking. + * + * ChangeLog: + * 10-Mar-2005 + * 10LE Instituto Nokia de Tecnologia - INdT: + * A better way to walks through the page table as suggested by Hugh Dickins. + * + * Simo Piiroinen : + * Smaps information related to shared, private, clean and dirty pages. + * + * Paul Mundt : + * Overall revision about smaps. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_HARDWALL +#include +#endif +#include "internal.h" + +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + +struct pid_entry { + char *name; + int len; + mode_t mode; + const struct inode_operations *iop; + const struct file_operations *fop; + union proc_op op; +}; + +#define NOD(NAME, MODE, IOP, FOP, OP) { \ + .name = (NAME), \ + .len = sizeof(NAME) - 1, \ + .mode = MODE, \ + .iop = IOP, \ + .fop = FOP, \ + .op = OP, \ +} + +#define DIR(NAME, MODE, iops, fops) \ + NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) +#define LNK(NAME, get_link) \ + NOD(NAME, (S_IFLNK|S_IRWXUGO), \ + &proc_pid_link_inode_operations, NULL, \ + { .proc_get_link = get_link } ) +#define REG(NAME, MODE, fops) \ + NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) +#define INF(NAME, MODE, read) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_info_file_operations, \ + { .proc_read = read } ) +#define ONE(NAME, MODE, show) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_single_file_operations, \ + { .proc_show = show } ) + +/* ANDROID is for special files in /proc. */ +#define ANDROID(NAME, MODE, OTYPE) \ + NOD(NAME, (S_IFREG|(MODE)), \ + &proc_##OTYPE##_inode_operations, \ + &proc_##OTYPE##_operations, {}) + +/* + * Count the number of hardlinks for the pid_entry table, excluding the . + * and .. links. 
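+ * The result is typically used to seed i_nlink for a /proc/<pid> (or + * /proc/<pid>/task/<tid>) directory as 2 plus the number of DIR() + * entries in its table.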
+ */ +static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, + unsigned int n) +{ + unsigned int i; + unsigned int count; + + count = 0; + for (i = 0; i < n; ++i) { + if (S_ISDIR(entries[i].mode)) + ++count; + } + + return count; +} + +static int get_task_root(struct task_struct *task, struct path *root) +{ + int result = -ENOENT; + + task_lock(task); + if (task->fs) { + get_fs_root(task->fs, root); + result = 0; + } + task_unlock(task); + return result; +} + +static int proc_cwd_link(struct inode *inode, struct path *path) +{ + struct task_struct *task = get_proc_task(inode); + int result = -ENOENT; + + if (task) { + task_lock(task); + if (task->fs) { + get_fs_pwd(task->fs, path); + result = 0; + } + task_unlock(task); + put_task_struct(task); + } + return result; +} + +static int proc_root_link(struct inode *inode, struct path *path) +{ + struct task_struct *task = get_proc_task(inode); + int result = -ENOENT; + + if (task) { + result = get_task_root(task, path); + put_task_struct(task); + } + return result; +} + +static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm; + int err; + + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, PTRACE_MODE_READ) && + !capable(CAP_SYS_RESOURCE)) { + mmput(mm); + mm = ERR_PTR(-EACCES); + } + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + +struct mm_struct *mm_for_maps(struct task_struct *task) +{ + return mm_access(task, PTRACE_MODE_READ); +} + +static int proc_pid_cmdline(struct task_struct *task, char * buffer) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! No looking before we're done */ + + len = mm->arg_end - mm->arg_start; + + if (len > PAGE_SIZE) + len = PAGE_SIZE; + + res = access_process_vm(task, mm->arg_start, buffer, len, 0); + + // If the nul at the end of args has been overwritten, then + // assume application is using setproctitle(3). + if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = mm->env_end - mm->env_start; + if (len > PAGE_SIZE - res) + len = PAGE_SIZE - res; + res += access_process_vm(task, mm->env_start, buffer+res, len, 0); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} + +static int proc_pid_auxv(struct task_struct *task, char *buffer) +{ + struct mm_struct *mm = mm_for_maps(task); + int res = PTR_ERR(mm); + if (mm && !IS_ERR(mm)) { + unsigned int nwords = 0; + do { + nwords += 2; + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ + res = nwords * sizeof(mm->saved_auxv[0]); + if (res > PAGE_SIZE) + res = PAGE_SIZE; + memcpy(buffer, mm->saved_auxv, res); + mmput(mm); + } + return res; +} + + +#ifdef CONFIG_KALLSYMS +/* + * Provides a wchan file via kallsyms in a proper one-value-per-file format. + * Returns the resolved symbol. If that fails, simply return the address. 
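+ * For example, /proc/<pid>/wchan for a task blocked in nanosleep might + * show a symbol such as hrtimer_nanosleep rather than a raw kernel + * address.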
+ */ +static int proc_pid_wchan(struct task_struct *task, char *buffer) +{ + unsigned long wchan; + char symname[KSYM_NAME_LEN]; + + wchan = get_wchan(task); + + if (lookup_symbol_name(wchan, symname) < 0) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + return 0; + else + return sprintf(buffer, "%lu", wchan); + else + return sprintf(buffer, "%s", symname); +} +#endif /* CONFIG_KALLSYMS */ + +static int lock_trace(struct task_struct *task) +{ + int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return err; + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + mutex_unlock(&task->signal->cred_guard_mutex); + return -EPERM; + } + return 0; +} + +static void unlock_trace(struct task_struct *task) +{ + mutex_unlock(&task->signal->cred_guard_mutex); +} + +#ifdef CONFIG_STACKTRACE + +#define MAX_STACK_TRACE_DEPTH 64 + +static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct stack_trace trace; + unsigned long *entries; + int err; + int i; + + entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + + trace.nr_entries = 0; + trace.max_entries = MAX_STACK_TRACE_DEPTH; + trace.entries = entries; + trace.skip = 0; + + err = lock_trace(task); + if (!err) { + save_stack_trace_tsk(task, &trace); + + for (i = 0; i < trace.nr_entries; i++) { + seq_printf(m, "[<%pK>] %pS\n", + (void *)entries[i], (void *)entries[i]); + } + unlock_trace(task); + } + kfree(entries); + + return err; +} +#endif + +#ifdef CONFIG_SCHEDSTATS +/* + * Provides /proc/PID/schedstat + */ +static int proc_pid_schedstat(struct task_struct *task, char *buffer) +{ + return sprintf(buffer, "%llu %llu %lu\n", + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); +} +#endif + +#ifdef CONFIG_LATENCYTOP +static int lstats_show_proc(struct seq_file *m, void *v) +{ + int i; + struct inode *inode = m->private; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + seq_puts(m, "Latency Top version : v0.1\n"); + for (i = 0; i < 32; i++) { + struct latency_record *lr = &task->latency_record[i]; + if (lr->backtrace[0]) { + int q; + seq_printf(m, "%i %li %li", + lr->count, lr->time, lr->max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long bt = lr->backtrace[q]; + if (!bt) + break; + if (bt == ULONG_MAX) + break; + seq_printf(m, " %ps", (void *)bt); + } + seq_putc(m, '\n'); + } + + } + put_task_struct(task); + return 0; +} + +static int lstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, lstats_show_proc, inode); +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + + if (!task) + return -ESRCH; + clear_all_latency_tracing(task); + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_lstats_operations = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +static int proc_oom_score(struct task_struct *task, char *buffer) +{ + unsigned long points = 0; + + read_lock(&tasklist_lock); + if (pid_alive(task)) + points = oom_badness(task, NULL, NULL, + totalram_pages + total_swap_pages); + read_unlock(&tasklist_lock); + return sprintf(buffer, "%lu\n", points); +} + +struct limit_names { + char *name; + char *unit; +}; + 
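+/* + * Human-readable labels for each rlimit as printed in /proc/<pid>/limits; + * e.g. RLIMIT_NOFILE is reported as "Max open files" with the unit + * "files". + */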
+static const struct limit_names lnames[RLIM_NLIMITS] = { + [RLIMIT_CPU] = {"Max cpu time", "seconds"}, + [RLIMIT_FSIZE] = {"Max file size", "bytes"}, + [RLIMIT_DATA] = {"Max data size", "bytes"}, + [RLIMIT_STACK] = {"Max stack size", "bytes"}, + [RLIMIT_CORE] = {"Max core file size", "bytes"}, + [RLIMIT_RSS] = {"Max resident set", "bytes"}, + [RLIMIT_NPROC] = {"Max processes", "processes"}, + [RLIMIT_NOFILE] = {"Max open files", "files"}, + [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, + [RLIMIT_AS] = {"Max address space", "bytes"}, + [RLIMIT_LOCKS] = {"Max file locks", "locks"}, + [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, + [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, + [RLIMIT_NICE] = {"Max nice priority", NULL}, + [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, + [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, +}; + +/* Display limits for a process */ +static int proc_pid_limits(struct task_struct *task, char *buffer) +{ + unsigned int i; + int count = 0; + unsigned long flags; + char *bufptr = buffer; + + struct rlimit rlim[RLIM_NLIMITS]; + + if (!lock_task_sighand(task, &flags)) + return 0; + memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); + unlock_task_sighand(task, &flags); + + /* + * print the file header + */ + count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", + "Limit", "Soft Limit", "Hard Limit", "Units"); + + for (i = 0; i < RLIM_NLIMITS; i++) { + if (rlim[i].rlim_cur == RLIM_INFINITY) + count += sprintf(&bufptr[count], "%-25s %-20s ", + lnames[i].name, "unlimited"); + else + count += sprintf(&bufptr[count], "%-25s %-20lu ", + lnames[i].name, rlim[i].rlim_cur); + + if (rlim[i].rlim_max == RLIM_INFINITY) + count += sprintf(&bufptr[count], "%-20s ", "unlimited"); + else + count += sprintf(&bufptr[count], "%-20lu ", + rlim[i].rlim_max); + + if (lnames[i].unit) + count += sprintf(&bufptr[count], "%-10s\n", + lnames[i].unit); + else + count += sprintf(&bufptr[count], "\n"); + } + + return count; +} + +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK +static int proc_pid_syscall(struct task_struct *task, char *buffer) +{ + long nr; + unsigned long args[6], sp, pc; + int res = lock_trace(task); + if (res) + return res; + + if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + res = sprintf(buffer, "running\n"); + else if (nr < 0) + res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else + res = sprintf(buffer, + "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", + nr, + args[0], args[1], args[2], args[3], args[4], args[5], + sp, pc); + unlock_trace(task); + return res; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ + +/************************************************************************/ +/* Here the fs part begins */ +/************************************************************************/ + +/* permission checks */ +static int proc_fd_access_allowed(struct inode *inode) +{ + struct task_struct *task; + int allowed = 0; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. 
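+ * In practice this means "ls -l /proc/<pid>/fd" works for your own + * processes, while inspecting another user's fds normally needs + * something like CAP_SYS_PTRACE.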
+ */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_access(task, PTRACE_MODE_READ); + put_task_struct(task); + } + return allowed; +} + +int proc_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = dentry->d_inode; + + if (attr->ia_valid & ATTR_MODE) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static const struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, +}; + +static int mounts_open_common(struct inode *inode, struct file *file, + const struct seq_operations *op) +{ + struct task_struct *task = get_proc_task(inode); + struct nsproxy *nsp; + struct mnt_namespace *ns = NULL; + struct path root; + struct proc_mounts *p; + int ret = -EINVAL; + + if (task) { + rcu_read_lock(); + nsp = task_nsproxy(task); + if (nsp) { + ns = nsp->mnt_ns; + if (ns) + get_mnt_ns(ns); + } + rcu_read_unlock(); + if (ns && get_task_root(task, &root) == 0) + ret = 0; + put_task_struct(task); + } + + if (!ns) + goto err; + if (ret) + goto err_put_ns; + + ret = -ENOMEM; + p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); + if (!p) + goto err_put_path; + + file->private_data = &p->m; + ret = seq_open(file, op); + if (ret) + goto err_free; + + p->m.private = p; + p->ns = ns; + p->root = root; + p->event = ns->event; + + return 0; + + err_free: + kfree(p); + err_put_path: + path_put(&root); + err_put_ns: + put_mnt_ns(ns); + err: + return ret; +} + +static int mounts_release(struct inode *inode, struct file *file) +{ + struct proc_mounts *p = file->private_data; + path_put(&p->root); + put_mnt_ns(p->ns); + return seq_release(inode, file); +} + +static unsigned mounts_poll(struct file *file, poll_table *wait) +{ + struct proc_mounts *p = file->private_data; + unsigned res = POLLIN | POLLRDNORM; + + poll_wait(file, &p->ns->poll, wait); + if (mnt_had_events(p)) + res |= POLLERR | POLLPRI; + + return res; +} + +static int mounts_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mounts_op); +} + +static const struct file_operations proc_mounts_operations = { + .open = mounts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +static int mountinfo_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mountinfo_op); +} + +static const struct file_operations proc_mountinfo_operations = { + .open = mountinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +static int mountstats_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mountstats_op); +} + +static const struct file_operations proc_mountstats_operations = { + .open = mountstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, +}; + +#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ + +static ssize_t proc_info_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + unsigned long page; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; + + if (count > 
PROC_BLOCK_SIZE) + count = PROC_BLOCK_SIZE; + + length = -ENOMEM; + if (!(page = __get_free_page(GFP_TEMPORARY))) + goto out; + + length = PROC_I(inode)->op.proc_read(task, (char*)page); + + if (length >= 0) + length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); + free_page(page); +out: + put_task_struct(task); +out_no_task: + return length; +} + +static const struct file_operations proc_info_file_operations = { + .read = proc_info_read, + .llseek = generic_file_llseek, +}; + +static int proc_single_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct pid_namespace *ns; + struct pid *pid; + struct task_struct *task; + int ret; + + ns = inode->i_sb->s_fs_info; + pid = proc_pid(inode); + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); + + put_task_struct(task); + return ret; +} + +static int proc_single_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, proc_single_show, inode); +} + +static const struct file_operations proc_single_file_operations = { + .open = proc_single_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int mem_open(struct inode* inode, struct file* file) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct mm_struct *mm; + + if (!task) + return -ESRCH; + + mm = mm_access(task, PTRACE_MODE_ATTACH); + put_task_struct(task); + + if (IS_ERR(mm)) + return PTR_ERR(mm); + + if (mm) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } + + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; + file->private_data = mm; + + return 0; +} + +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct mm_struct *mm = file->private_data; + unsigned long addr = *ppos; + ssize_t copied; + char *page; + + if (!mm) + return 0; + + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + + copied = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + + while (count > 0) { + int this_len = min_t(int, count, PAGE_SIZE); + + if (write && copy_from_user(page, buf, this_len)) { + copied = -EFAULT; + break; + } + + this_len = access_remote_vm(mm, addr, page, this_len, write); + if (!this_len) { + if (!copied) + copied = -EIO; + break; + } + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; + } + *ppos = addr; + + mmput(mm); +free: + free_page((unsigned long) page); + return copied; +} + +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +#define mem_write NULL + +#ifndef mem_write +/* This is a security hazard */ +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} +#endif + +loff_t mem_lseek(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case 0: + file->f_pos = offset; + break; + case 1: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + force_successful_syscall_return(); + return file->f_pos; +} + +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + if (mm) 
+ mmdrop(mm); + return 0; +} + +static const struct file_operations proc_mem_operations = { + .llseek = mem_lseek, + .read = mem_read, + .write = mem_write, + .open = mem_open, + .release = mem_release, +}; + +static ssize_t environ_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char *page; + unsigned long src = *ppos; + int ret = -ESRCH; + struct mm_struct *mm; + + if (!task) + goto out_no_task; + + ret = -ENOMEM; + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + goto out; + + + mm = mm_for_maps(task); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) + goto out_free; + + ret = 0; + while (count > 0) { + int this_len, retval, max_len; + + this_len = mm->env_end - (mm->env_start + src); + + if (this_len <= 0) + break; + + max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; + this_len = (this_len > max_len) ? max_len : this_len; + + retval = access_process_vm(task, (mm->env_start + src), + page, this_len, 0); + + if (retval <= 0) { + ret = retval; + break; + } + + if (copy_to_user(buf, page, retval)) { + ret = -EFAULT; + break; + } + + ret += retval; + src += retval; + buf += retval; + count -= retval; + } + *ppos = src; + + mmput(mm); +out_free: + free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: + return ret; +} + +static const struct file_operations proc_environ_operations = { + .read = environ_read, + .llseek = generic_file_llseek, +}; + +static ssize_t oom_adjust_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + size_t len; + int oom_adjust = OOM_DISABLE; + unsigned long flags; + + if (!task) + return -ESRCH; + + if (lock_task_sighand(task, &flags)) { + oom_adjust = task->signal->oom_adj; + unlock_task_sighand(task, &flags); + } + + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adjust_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adjust; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adjust); + if (err) + goto out; + if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && + oom_adjust != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + if (oom_adjust != task->signal->oom_adj) { + if (oom_adjust == OOM_DISABLE) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_adj == OOM_DISABLE) + atomic_dec(&task->mm->oom_disable_count); + } + + /* + * Warn that /proc/pid/oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. 
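As an aside, the legacy oom_adj value accepted here is rescaled onto the oom_score_adj range just below. A minimal userspace sketch of that arithmetic, assuming the conventional constant values from include/linux/oom.h of this era (OOM_DISABLE = -17, OOM_ADJUST_MAX = 15, OOM_SCORE_ADJ_MAX = 1000); this is an editorial illustration, not part of the patch:

#include <stdio.h>

#define OOM_DISABLE        (-17)
#define OOM_ADJUST_MAX       15
#define OOM_SCORE_ADJ_MAX  1000

/* Mirror of the scaling done by oom_adjust_write() below. */
static int adj_to_score_adj(int oom_adj)
{
        if (oom_adj == OOM_ADJUST_MAX)
                return OOM_SCORE_ADJ_MAX;  /* clamp so the maximum stays reachable */
        return oom_adj * OOM_SCORE_ADJ_MAX / -OOM_DISABLE;
}

int main(void)
{
        int samples[] = { OOM_DISABLE, -8, 0, 8, OOM_ADJUST_MAX };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("oom_adj %3d -> oom_score_adj %5d\n",
                       samples[i], adj_to_score_adj(samples[i]));
        return 0;       /* prints -1000, -470, 0, 470, 1000 */
}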
+ */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, " + "please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), + task_pid_nr(task), task_pid_nr(task)); + task->signal->oom_adj = oom_adjust; + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (task->signal->oom_adj == OOM_ADJUST_MAX) + task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; + else + task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / + -OOM_DISABLE; +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static int oom_adjust_permission(struct inode *inode, int mask, + unsigned int flags) +{ + uid_t uid; + struct task_struct *p; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + p = get_proc_task(inode); + if(p) { + uid = task_uid(p); + put_task_struct(p); + } + + /* + * System Server (uid == 1000) is granted access to oom_adj of all + * android applications (uid > 10000) as and services (uid >= 1000) + */ + if (p && (current_fsuid() == 1000) && (uid >= 1000)) { + if (inode->i_mode >> 6 & mask) { + return 0; + } + } + + /* Fall back to default. */ + return generic_permission(inode, mask, flags, NULL); +} + +static const struct inode_operations proc_oom_adjust_inode_operations = { + .permission = oom_adjust_permission, +}; + +static const struct file_operations proc_oom_adjust_operations = { + .read = oom_adjust_read, + .write = oom_adjust_write, + .llseek = generic_file_llseek, +}; + +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_score_adj = OOM_SCORE_ADJ_MIN; + unsigned long flags; + size_t len; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + oom_score_adj = task->signal->oom_score_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + unsigned long flags; + int oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); + if (err) + goto out; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_score_adj < task->signal->oom_score_adj_min && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + if (oom_score_adj != task->signal->oom_score_adj) { + if (oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&task->mm->oom_disable_count); + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&task->mm->oom_disable_count); + } + task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, 
CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; + /* + * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is + * always attainable. + */ + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + task->signal->oom_adj = OOM_DISABLE; + else + task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / + OOM_SCORE_ADJ_MAX; +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, + .llseek = default_llseek, +}; + +#ifdef CONFIG_AUDITSYSCALL +#define TMPBUFLEN 21 +static ssize_t proc_loginuid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + audit_get_loginuid(task)); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *page, *tmp; + ssize_t length; + uid_t loginuid; + + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + + rcu_read_lock(); + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + if (count >= PAGE_SIZE) + count = PAGE_SIZE - 1; + + if (*ppos != 0) { + /* No partial writes. */ + return -EINVAL; + } + page = (char*)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + length = -EFAULT; + if (copy_from_user(page, buf, count)) + goto out_free_page; + + page[count] = '\0'; + loginuid = simple_strtoul(page, &tmp, 10); + if (tmp == page) { + length = -EINVAL; + goto out_free_page; + + } + length = audit_set_loginuid(current, loginuid); + if (likely(length == 0)) + length = count; + +out_free_page: + free_page((unsigned long) page); + return length; +} + +static const struct file_operations proc_loginuid_operations = { + .read = proc_loginuid_read, + .write = proc_loginuid_write, + .llseek = generic_file_llseek, +}; + +static ssize_t proc_sessionid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + audit_get_sessionid(task)); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static const struct file_operations proc_sessionid_operations = { + .read = proc_sessionid_read, + .llseek = generic_file_llseek, +}; +#endif + +#ifdef CONFIG_FAULT_INJECTION +static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char buffer[PROC_NUMBUF]; + size_t len; + int make_it_fail; + + if (!task) + return -ESRCH; + make_it_fail = task->make_it_fail; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t proc_fault_inject_write(struct file * file, + 
const char __user * buf, size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; + int make_it_fail; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + make_it_fail = simple_strtol(strstrip(buffer), &end, 0); + if (*end) + return -EINVAL; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; + task->make_it_fail = make_it_fail; + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_fault_inject_operations = { + .read = proc_fault_inject_read, + .write = proc_fault_inject_write, + .llseek = generic_file_llseek, +}; +#endif + + +#ifdef CONFIG_SCHED_DEBUG +/* + * Print out various scheduling related per-task fields: + */ +static int sched_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_set_task(p); + + put_task_struct(p); + + return count; +} + +static int sched_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_show, inode); +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = nice; + err = proc_sched_autogroup_set_nice(p, &err); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + +static ssize_t comm_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + + memset(buffer, 0, 
sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (same_thread_group(current, p)) + set_task_comm(p, buffer); + else + count = -EINVAL; + + put_task_struct(p); + + return count; +} + +static int comm_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + task_lock(p); + seq_printf(m, "%s\n", p->comm); + task_unlock(p); + + put_task_struct(p); + + return 0; +} + +static int comm_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, comm_show, inode); +} + +static const struct file_operations proc_pid_set_comm_operations = { + .open = comm_open, + .read = seq_read, + .write = comm_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int proc_exe_link(struct inode *inode, struct path *exe_path) +{ + struct task_struct *task; + struct mm_struct *mm; + struct file *exe_file; + + task = get_proc_task(inode); + if (!task) + return -ENOENT; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ENOENT; + exe_file = get_mm_exe_file(mm); + mmput(mm); + if (exe_file) { + *exe_path = exe_file->f_path; + path_get(&exe_file->f_path); + fput(exe_file); + return 0; + } else + return -ENOENT; +} + +static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + int error = -EACCES; + + /* We don't need a base pointer in the /proc filesystem */ + path_put(&nd->path); + + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); +out: + return ERR_PTR(error); +} + +static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) +{ + char *tmp = (char*)__get_free_page(GFP_TEMPORARY); + char *pathname; + int len; + + if (!tmp) + return -ENOMEM; + + pathname = d_path(path, tmp, PAGE_SIZE); + len = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + len = tmp + PAGE_SIZE - 1 - pathname; + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, pathname, len)) + len = -EFAULT; + out: + free_page((unsigned long)tmp); + return len; +} + +static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) +{ + int error = -EACCES; + struct inode *inode = dentry->d_inode; + struct path path; + + /* Are we allowed to snoop on the tasks file descriptors? 
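The proc_exe_link()/do_proc_readlink() pair above is what backs readlink(2) on /proc/<pid>/exe (and, with the other link helpers, cwd and root). A minimal userspace sketch, purely illustrative:

#include <stdio.h>
#include <unistd.h>
#include <limits.h>

int main(void)
{
        char buf[PATH_MAX];
        /* Resolved in the kernel by proc_pid_readlink() -> do_proc_readlink(). */
        ssize_t len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);

        if (len < 0) {
                perror("readlink");
                return 1;
        }
        buf[len] = '\0';        /* readlink() does not NUL-terminate */
        printf("running binary: %s\n", buf);
        return 0;
}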
*/ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(inode, &path); + if (error) + goto out; + + error = do_proc_readlink(&path, buffer, buflen); + path_put(&path); +out: + return error; +} + +static const struct inode_operations proc_pid_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, +}; + + +/* building an inode */ + +static int task_dumpable(struct task_struct *task) +{ + int dumpable = 0; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) + dumpable = get_dumpable(mm); + task_unlock(task); + if(dumpable == 1) + return 1; + return 0; +} + +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +{ + struct inode * inode; + struct proc_inode *ei; + const struct cred *cred; + + /* We need a new inode */ + + inode = new_inode(sb); + if (!inode) + goto out; + + /* Common stuff */ + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &proc_def_inode_operations; + + /* + * grab the reference to task. + */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_unlock; + + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } + security_task_to_inode(task, inode); + +out: + return inode; + +out_unlock: + iput(inode); + return NULL; +} + +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + const struct cred *cred; + + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + cred = __task_cred(task); + stat->uid = cred->euid; + stat->gid = cred->egid; + } + } + rcu_read_unlock(); + return 0; +} + +/* dentry stuff */ + +/* + * Exceptional case: normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems + * due to the way we treat inodes. + * + * Rewrite the inode's ownerships here because the owning task may have + * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. + */ +int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct task_struct *task; + const struct cred *cred; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + d_drop(dentry); + return 0; +} + +static int pid_delete_dentry(const struct dentry * dentry) +{ + /* Is the task we represent dead? 
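One observable consequence of pid_revalidate() above: while a task remains dumpable, the ownership reported for its /proc/<pid> directory tracks the task's effective credentials. A hypothetical userspace check (illustration only; a non-dumpable task is reported as owned by root instead):

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        struct stat st;

        if (stat("/proc/self", &st) != 0) {
                perror("stat");
                return 1;
        }
        /* For a dumpable task these two pairs should match. */
        printf("euid=%u egid=%u  /proc/self uid=%u gid=%u\n",
               (unsigned)geteuid(), (unsigned)getegid(),
               (unsigned)st.st_uid, (unsigned)st.st_gid);
        return 0;
}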
+ * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + +const struct dentry_operations pid_dentry_operations = +{ + .d_revalidate = pid_revalidate, + .d_delete = pid_delete_dentry, +}; + +/* Lookups */ + +/* + * Fill a directory entry. + * + * If possible create the dcache entry and derive our inode number and + * file type from dcache entry. + * + * Since all of the proc inode numbers are dynamically generated, the inode + * numbers do not exist until the inode is cache. This means creating the + * the dcache entry in readdir is necessary to keep the inode numbers + * reported by readdir in sync with the inode numbers reported + * by stat. + */ +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr) +{ + struct dentry *child, *dir = filp->f_path.dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = name; + qname.len = len; + qname.hash = full_name_hash(name, len); + + child = d_lookup(dir, &qname); + if (!child) { + struct dentry *new; + new = d_alloc(dir, &qname); + if (new) { + child = instantiate(dir->d_inode, new, task, ptr); + if (child) + dput(new); + else + child = new; + } + } + if (!child || IS_ERR(child) || !child->d_inode) + goto end_instantiate; + inode = child->d_inode; + if (inode) { + ino = inode->i_ino; + type = inode->i_mode >> 12; + } + dput(child); +end_instantiate: + if (!ino) + ino = find_inode_number(dir, &qname); + if (!ino) + ino = 1; + return filldir(dirent, name, len, filp->f_pos, ino, type); +} + +static unsigned name_to_int(struct dentry *dentry) +{ + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} + +#define PROC_FDINFO_MAX 64 + +static int proc_fd_info(struct inode *inode, struct path *path, char *info) +{ + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; + struct file *file; + int fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); + } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + return -ENOENT; +} + +static int proc_fd_link(struct inode *inode, struct path *path) +{ + return proc_fd_info(inode, path, NULL); +} + +static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct task_struct *task; + int fd; + struct files_struct *files; + const struct cred *cred; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + if (fcheck_files(files, fd)) { + rcu_read_unlock(); + put_files_struct(files); + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + rcu_read_unlock(); + put_files_struct(files); + } + put_task_struct(task); + } + d_drop(dentry); + return 0; +} + +static const struct dentry_operations tid_fd_dentry_operations = +{ + .d_revalidate = tid_fd_revalidate, + .d_delete = pid_delete_dentry, +}; + +static struct dentry *proc_fd_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + unsigned fd = *(const unsigned *)ptr; + struct file *file; + struct files_struct *files; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + ei = PROC_I(inode); + ei->fd = fd; + files = get_files_struct(task); + if (!files) + goto out_iput; + inode->i_mode = S_IFLNK; + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
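The pos/flags pair formatted by proc_fd_info() above is exactly what a read of /proc/<pid>/fdinfo/<fd> returns (the consumer, proc_fdinfo_read(), appears a little further down). A small sketch that dumps the record for fd 0, assuming stdin is open:

#include <stdio.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/self/fdinfo/0", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        /* Expect two lines: the file position and the open flags in octal,
         * with O_CLOEXEC reflecting the descriptor's close-on-exec bit. */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}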
+ */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (!file) + goto out_unlock; + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR | S_IXUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR | S_IXUSR; + spin_unlock(&files->file_lock); + put_files_struct(files); + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + ei->op.proc_get_link = proc_fd_link; + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + error = NULL; + + out: + return error; +out_unlock: + spin_unlock(&files->file_lock); + put_files_struct(files); +out_iput: + iput(inode); + goto out; +} + +static struct dentry *proc_lookupfd_common(struct inode *dir, + struct dentry *dentry, + instantiate_t instantiate) +{ + struct task_struct *task = get_proc_task(dir); + unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + if (fd == ~0U) + goto out; + + result = instantiate(dir, dentry, task, &fd); +out: + put_task_struct(task); +out_no_task: + return result; +} + +static int proc_readfd_common(struct file * filp, void * dirent, + filldir_t filldir, instantiate_t instantiate) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + unsigned int fd, ino; + int retval; + struct files_struct * files; + + retval = -ENOENT; + if (!p) + goto out_no_task; + retval = 0; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + default: + files = get_files_struct(p); + if (!files) + goto out; + rcu_read_lock(); + for (fd = filp->f_pos-2; + fd < files_fdtable(files)->max_fds; + fd++, filp->f_pos++) { + char name[PROC_NUMBUF]; + int len; + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + len = snprintf(name, sizeof(name), "%d", fd); + if (proc_fill_cache(filp, dirent, filldir, + name, len, instantiate, + p, &fd) < 0) { + rcu_read_lock(); + break; + } + rcu_read_lock(); + } + rcu_read_unlock(); + put_files_struct(files); + } +out: + put_task_struct(p); +out_no_task: + return retval; +} + +static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); +} + +static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); +} + +static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[PROC_FDINFO_MAX]; + int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp); + if (!err) + err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp)); + return err; +} + +static const struct file_operations proc_fdinfo_file_operations = { + .open = nonseekable_open, + .read = proc_fdinfo_read, + .llseek = no_llseek, +}; + +static const struct file_operations proc_fd_operations = { + .read = generic_read_dir, + .readdir = proc_readfd, + .llseek = default_llseek, +}; + +/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). 
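Each entry under /proc/<pid>/fd is the symlink built by proc_fd_instantiate() above, with its read/write permission bits mirroring how the file was opened. A hypothetical userspace sketch that lists and resolves a process's own descriptors:

#include <stdio.h>
#include <dirent.h>
#include <unistd.h>
#include <limits.h>

int main(void)
{
        DIR *dir = opendir("/proc/self/fd");
        struct dirent *de;
        char link[PATH_MAX], target[PATH_MAX];
        ssize_t len;

        if (!dir) {
                perror("opendir");
                return 1;
        }
        while ((de = readdir(dir)) != NULL) {
                if (de->d_name[0] == '.')
                        continue;       /* skip "." and ".." */
                snprintf(link, sizeof(link), "/proc/self/fd/%s", de->d_name);
                len = readlink(link, target, sizeof(target) - 1);
                if (len < 0)
                        continue;       /* fd may have been closed meanwhile */
                target[len] = '\0';
                printf("fd %s -> %s\n", de->d_name, target);
        }
        closedir(dir);
        return 0;
}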
+ */ +static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) +{ + int rv = generic_permission(inode, mask, flags, NULL); + if (rv == 0) + return 0; + if (task_pid(current) == proc_pid(inode)) + rv = 0; + return rv; +} + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_fd_inode_operations = { + .lookup = proc_lookupfd, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static struct dentry *proc_fdinfo_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + unsigned fd = *(unsigned *)ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + ei = PROC_I(inode); + ei->fd = fd; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + error = NULL; + + out: + return error; +} + +static struct dentry *proc_lookupfdinfo(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); +} + +static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, + proc_fdinfo_instantiate); +} + +static const struct file_operations proc_fdinfo_operations = { + .read = generic_read_dir, + .readdir = proc_readfdinfo, + .llseek = default_llseek, +}; + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_fdinfo_inode_operations = { + .lookup = proc_lookupfdinfo, + .setattr = proc_setattr, +}; + + +static struct dentry *proc_pident_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 2; /* Use getattr to fix if necessary */ + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + d_set_d_op(dentry, &pid_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_pident_lookup(struct inode *dir, + struct dentry *dentry, + const struct pid_entry *ents, + unsigned int nents) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc// without very good reasons. 
+ */ + last = &ents[nents - 1]; + for (p = ents; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_pident_instantiate(dir, dentry, task, p); +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_pident_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, const struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_pident_instantiate, task, p); +} + +static int proc_pident_readdir(struct file *filp, + void *dirent, filldir_t filldir, + const struct pid_entry *ents, unsigned int nents) +{ + int i; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + const struct pid_entry *p, *last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = 0; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= nents) { + ret = 1; + goto out; + } + p = ents + i; + last = &ents[nents - 1]; + while (p <= last) { + if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0) + goto out; + filp->f_pos++; + p++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} + +#ifdef CONFIG_SECURITY +static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *p = NULL; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + + length = security_getprocattr(task, + (char*)file->f_path.dentry->d_name.name, + &p); + put_task_struct(task); + if (length > 0) + length = simple_read_from_buffer(buf, count, ppos, p, length); + kfree(p); + return length; +} + +static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *page; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + /* No partial writes. 
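proc_pid_attr_read() above simply hands the request to security_getprocattr(), so the files under /proc/<pid>/attr/ only carry data when an LSM such as SELinux or Smack is active; otherwise a read may fail or come back empty. A hedged userspace sketch reading the caller's own context:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char ctx[256];
        int fd = open("/proc/self/attr/current", O_RDONLY);
        ssize_t n;

        if (fd < 0) {
                perror("open");         /* attr dir exists only with CONFIG_SECURITY */
                return 1;
        }
        n = read(fd, ctx, sizeof(ctx) - 1);
        close(fd);
        if (n <= 0)
                return 1;
        ctx[n] = '\0';
        printf("security context: %s\n", ctx);
        return 0;
}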
*/ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; + page = (char*)__get_free_page(GFP_TEMPORARY); + if (!page) + goto out; + + length = -EFAULT; + if (copy_from_user(page, buf, count)) + goto out_free; + + /* Guard against adverse ptrace interaction */ + length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + if (length < 0) + goto out_free; + + length = security_setprocattr(task, + (char*)file->f_path.dentry->d_name.name, + (void*)page, count); + mutex_unlock(&task->signal->cred_guard_mutex); +out_free: + free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: + return length; +} + +static const struct file_operations proc_pid_attr_operations = { + .read = proc_pid_attr_read, + .write = proc_pid_attr_write, + .llseek = generic_file_llseek, +}; + +static const struct pid_entry attr_dir_stuff[] = { + REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("prev", S_IRUGO, proc_pid_attr_operations), + REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), +}; + +static int proc_attr_dir_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct file_operations proc_attr_dir_operations = { + .read = generic_read_dir, + .readdir = proc_attr_dir_readdir, + .llseek = default_llseek, +}; + +static struct dentry *proc_attr_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + return proc_pident_lookup(dir, dentry, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct inode_operations proc_attr_dir_inode_operations = { + .lookup = proc_attr_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +#endif + +#ifdef CONFIG_ELF_CORE +static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct mm_struct *mm; + char buffer[PROC_NUMBUF]; + size_t len; + int ret; + + if (!task) + return -ESRCH; + + ret = 0; + mm = get_task_mm(task); + if (mm) { + len = snprintf(buffer, sizeof(buffer), "%08lx\n", + ((mm->flags & MMF_DUMP_FILTER_MASK) >> + MMF_DUMP_FILTER_SHIFT)); + mmput(mm); + ret = simple_read_from_buffer(buf, count, ppos, buffer, len); + } + + put_task_struct(task); + + return ret; +} + +static ssize_t proc_coredump_filter_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + struct task_struct *task; + struct mm_struct *mm; + char buffer[PROC_NUMBUF], *end; + unsigned int val; + int ret; + int i; + unsigned long mask; + + ret = -EFAULT; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + goto out_no_task; + + ret = -EINVAL; + val = (unsigned int)simple_strtoul(buffer, &end, 0); + if (*end == '\n') + end++; + if (end - buffer == 0) + goto out_no_task; + + ret = -ESRCH; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + goto out_no_task; + + ret = end - buffer; + mm = get_task_mm(task); + if (!mm) + goto out_no_mm; + + for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { + if (val & mask) + set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + else + clear_bit(i + MMF_DUMP_FILTER_SHIFT, 
&mm->flags); + } + + mmput(mm); + out_no_mm: + put_task_struct(task); + out_no_task: + return ret; +} + +static const struct file_operations proc_coredump_filter_operations = { + .read = proc_coredump_filter_read, + .write = proc_coredump_filter_write, + .llseek = generic_file_llseek, +}; +#endif + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + name = __getname(); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, +}; + +/* + * proc base + * + * These are the directory entries in the root directory of /proc + * that properly belong to the /proc filesystem, as they describe + * describe something that is process related. + */ +static const struct pid_entry proc_base_stuff[] = { + NOD("self", S_IFLNK|S_IRWXUGO, + &proc_self_inode_operations, NULL, {}), +}; + +static struct dentry *proc_base_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error; + + /* Allocate the inode */ + error = ERR_PTR(-ENOMEM); + inode = new_inode(dir->i_sb); + if (!inode) + goto out; + + /* Initialize the inode */ + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + /* + * grab the reference to the task. 
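The coredump_filter handlers earlier in this hunk expose the MMF_DUMP_* filter bits as a hex mask; writing a new hex value back into the same file changes which mappings end up in a core dump. A read-only sketch, assuming CONFIG_ELF_CORE is enabled (bit meanings are defined by the kernel headers and not restated here):

#include <stdio.h>

int main(void)
{
        unsigned long mask;
        FILE *f = fopen("/proc/self/coredump_filter", "r");

        if (!f) {
                perror("fopen");        /* present only with CONFIG_ELF_CORE */
                return 1;
        }
        if (fscanf(f, "%lx", &mask) == 1)
                printf("coredump_filter mask: 0x%02lx\n", mask);
        fclose(f);
        return 0;
}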
+ */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_iput; + + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 2; + if (S_ISLNK(inode->i_mode)) + inode->i_size = 64; + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + d_add(dentry, inode); + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + /* Lookup the directory entry */ + last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; + for (p = proc_base_stuff; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_base_instantiate(dir, dentry, task, p); + +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_base_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, const struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_base_instantiate, task, p); +} + +#ifdef CONFIG_TASK_IO_ACCOUNTING +static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +{ + struct task_io_accounting acct = task->ioac; + unsigned long flags; + int result; + + result = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + result = -EACCES; + goto out_unlock; + } + + if (whole && lock_task_sighand(task, &flags)) { + struct task_struct *t = task; + + task_io_accounting_add(&acct, &task->signal->ioac); + while_each_thread(task, t) + task_io_accounting_add(&acct, &t->ioac); + + unlock_task_sighand(task, &flags); + } + result = sprintf(buffer, + "rchar: %llu\n" + "wchar: %llu\n" + "syscr: %llu\n" + "syscw: %llu\n" + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)acct.rchar, + (unsigned long long)acct.wchar, + (unsigned long long)acct.syscr, + (unsigned long long)acct.syscw, + (unsigned long long)acct.read_bytes, + (unsigned long long)acct.write_bytes, + (unsigned long long)acct.cancelled_write_bytes); +out_unlock: + mutex_unlock(&task->signal->cred_guard_mutex); + return result; +} + +static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 0); +} + +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + int err = lock_trace(task); + if (!err) { + seq_printf(m, "%08x\n", task->personality); + unlock_trace(task); + } + return err; +} + +/* + * Thread groups + */ +static const struct file_operations proc_task_operations; +static const struct inode_operations proc_task_inode_operations; + +static const struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, 
proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUGO, proc_pid_personality), + INF("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUGO, proc_pid_syscall), +#endif + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tgid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_maps_operations), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), + REG("mountstats", S_IRUSR, proc_mountstats_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_smaps_operations), + REG("pagemap", S_IRUGO, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUGO, proc_pid_stack), +#endif +#ifdef CONFIG_SCHEDSTATS + INF("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + REG("cpuset", S_IRUGO, proc_cpuset_operations), +#endif +#ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, proc_cgroup_operations), +#endif + INF("oom_score", S_IRUGO, proc_oom_score), + ANDROID("oom_adj",S_IRUGO|S_IWUSR, oom_adjust), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), +#endif +#ifdef CONFIG_ELF_CORE + REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUSR, proc_tgid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +}; + +static int proc_tgid_base_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct file_operations proc_tgid_base_operations = { + .read = generic_read_dir, + .readdir = proc_tgid_base_readdir, + .llseek = default_llseek, +}; + +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct inode_operations proc_tgid_base_inode_operations = { + .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, + .setattr = 
proc_setattr, +}; + +static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) +{ + struct dentry *dentry, *leader, *dir; + char buf[PROC_NUMBUF]; + struct qstr name; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", pid); + dentry = d_hash_and_lookup(mnt->mnt_root, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", tgid); + leader = d_hash_and_lookup(mnt->mnt_root, &name); + if (!leader) + goto out; + + name.name = "task"; + name.len = strlen(name.name); + dir = d_hash_and_lookup(leader, &name); + if (!dir) + goto out_put_leader; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", pid); + dentry = d_hash_and_lookup(dir, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } + + dput(dir); +out_put_leader: + dput(leader); +out: + return; +} + +/** + * proc_flush_task - Remove dcache entries for @task from the /proc dcache. + * @task: task that should be flushed. + * + * When flushing dentries from proc, one needs to flush them from global + * proc (proc_mnt) and from all the namespaces' procs this task was seen + * in. This call is supposed to do all of this job. + * + * Looks in the dcache for + * /proc/@pid + * /proc/@tgid/task/@pid + * if either directory is present flushes it and all of it'ts children + * from the dcache. + * + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is proved to flush those useless + * dcache entries at process exit time. + * + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist at process exit time it + * just makes it very unlikely that any will persist. 
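Because pid_revalidate() drops stale dentries and this flush runs at exit, a fresh lookup of /proc/<pid> fails once the process is gone. That makes stat() on the directory a rough liveness probe; the sketch below is purely illustrative and races with pid reuse (kill(pid, 0) is the usual robust check):

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        char path[64];
        struct stat st;
        long pid = argc > 1 ? strtol(argv[1], NULL, 10) : 1;

        snprintf(path, sizeof(path), "/proc/%ld", pid);
        /* Succeeds only while the pid still resolves in this /proc instance. */
        printf("%s %s\n", path,
               stat(path, &st) == 0 ? "exists" : "no such process");
        return 0;
}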
+ */ + +void proc_flush_task(struct task_struct *task) +{ + int i; + struct pid *pid, *tgid; + struct upid *upid; + + pid = task_pid(task); + tgid = task_tgid(task); + + for (i = 0; i <= pid->level; i++) { + upid = &pid->numbers[i]; + proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, + tgid->numbers[i].nr); + } + + upid = &pid->numbers[pid->level]; + if (upid->nr == 1) + pid_ns_release_proc(upid->ns); +} + +static struct dentry *proc_pid_instantiate(struct inode *dir, + struct dentry * dentry, + struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tgid_base_inode_operations; + inode->i_fop = &proc_tgid_base_operations; + inode->i_flags|=S_IMMUTABLE; + + inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff)); + + d_set_d_op(dentry, &pid_dentry_operations); + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *result; + struct task_struct *task; + unsigned tgid; + struct pid_namespace *ns; + + result = proc_base_lookup(dir, dentry); + if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) + goto out; + + tgid = name_to_int(dentry); + if (tgid == ~0U) + goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); + task = find_task_by_pid_ns(tgid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + + result = proc_pid_instantiate(dir, dentry, task, NULL); + put_task_struct(task); +out: + return result; +} + +/* + * Find the first task with tgid >= tgid + * + */ +struct tgid_iter { + unsigned int tgid; + struct task_struct *task; +}; +static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) +{ + struct pid *pid; + + if (iter.task) + put_task_struct(iter.task); + rcu_read_lock(); +retry: + iter.task = NULL; + pid = find_ge_pid(iter.tgid, ns); + if (pid) { + iter.tgid = pid_nr_ns(pid, ns); + iter.task = pid_task(pid, PIDTYPE_PID); + /* What we to know is if the pid we have find is the + * pid of a thread_group_leader. Testing for task + * being a thread_group_leader is the obvious thing + * todo but there is a window when it fails, due to + * the pid transfer logic in de_thread. + * + * So we perform the straight forward test of seeing + * if the pid we have found is the pid of a thread + * group leader, and don't worry if the task we have + * found doesn't happen to be a thread group leader. + * As we don't care in the case of readdir. 
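The tgid walk completed just below (next_tgid() feeding proc_pid_readdir()) is what produces the numeric entries a readdir of /proc returns, one per visible thread-group leader. A small userspace counterpart, for illustration:

#include <stdio.h>
#include <ctype.h>
#include <dirent.h>

static int all_digits(const char *s)
{
        if (!*s)
                return 0;
        for (; *s; s++)
                if (!isdigit((unsigned char)*s))
                        return 0;
        return 1;
}

int main(void)
{
        DIR *dir = opendir("/proc");
        struct dirent *de;
        unsigned count = 0;

        if (!dir)
                return 1;
        while ((de = readdir(dir)) != NULL)
                if (all_digits(de->d_name))
                        count++;        /* one entry per visible tgid */
        closedir(dir);
        printf("%u processes visible\n", count);
        return 0;
}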
+ */ + if (!iter.task || !has_group_leader_pid(iter.task)) { + iter.tgid += 1; + goto retry; + } + get_task_struct(iter.task); + } + rcu_read_unlock(); + return iter; +} + +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) + +static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct tgid_iter iter) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", iter.tgid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_pid_instantiate, iter.task, NULL); +} + +/* for the /proc/ directory itself, after non-process stuff has been done */ +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + unsigned int nr; + struct task_struct *reaper; + struct tgid_iter iter; + struct pid_namespace *ns; + + if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) + goto out_no_task; + nr = filp->f_pos - FIRST_PROCESS_ENTRY; + + reaper = get_proc_task(filp->f_path.dentry->d_inode); + if (!reaper) + goto out_no_task; + + for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { + const struct pid_entry *p = &proc_base_stuff[nr]; + if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) + goto out; + } + + ns = filp->f_dentry->d_sb->s_fs_info; + iter.task = NULL; + iter.tgid = filp->f_pos - TGID_OFFSET; + for (iter = next_tgid(ns, iter); + iter.task; + iter.tgid += 1, iter = next_tgid(ns, iter)) { + filp->f_pos = iter.tgid + TGID_OFFSET; + if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + put_task_struct(iter.task); + goto out; + } + } + filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; +out: + put_task_struct(reaper); +out_no_task: + return 0; +} + +/* + * Tasks + */ +static const struct pid_entry tid_base_stuff[] = { + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUGO, proc_pid_personality), + INF("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUGO, proc_pid_syscall), +#endif + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_maps_operations), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_smaps_operations), + REG("pagemap", S_IRUGO, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUGO, proc_pid_stack), +#endif +#ifdef CONFIG_SCHEDSTATS + INF("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + 
REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + REG("cpuset", S_IRUGO, proc_cpuset_operations), +#endif +#ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, proc_cgroup_operations), +#endif + INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUSR, proc_tid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +}; + +static int proc_tid_base_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); +} + +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); +} + +static const struct file_operations proc_tid_base_operations = { + .read = generic_read_dir, + .readdir = proc_tid_base_readdir, + .llseek = default_llseek, +}; + +static const struct inode_operations proc_tid_base_inode_operations = { + .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +static struct dentry *proc_task_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + struct inode *inode; + inode = proc_pid_make_inode(dir->i_sb, task); + + if (!inode) + goto out; + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tid_base_inode_operations; + inode->i_fop = &proc_tid_base_operations; + inode->i_flags|=S_IMMUTABLE; + + inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff)); + + d_set_d_op(dentry, &pid_dentry_operations); + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *result = ERR_PTR(-ENOENT); + struct task_struct *task; + struct task_struct *leader = get_proc_task(dir); + unsigned tid; + struct pid_namespace *ns; + + if (!leader) + goto out_no_task; + + tid = name_to_int(dentry); + if (tid == ~0U) + goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); + task = find_task_by_pid_ns(tid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + if (!same_thread_group(leader, task)) + goto out_drop_task; + + result = proc_task_instantiate(dir, dentry, task, NULL); +out_drop_task: + put_task_struct(task); +out: + put_task_struct(leader); +out_no_task: + return result; +} + +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the users + * buffer was too small or there was a seek into the middle of the + * directory we have more work todo. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. 
+ */ +static struct task_struct *first_tid(struct task_struct *leader, + int tid, int nr, struct pid_namespace *ns) +{ + struct task_struct *pos; + + rcu_read_lock(); + /* Attempt to start with the pid of a thread */ + if (tid && (nr > 0)) { + pos = find_task_by_pid_ns(tid, ns); + if (pos && (pos->group_leader == leader)) + goto found; + } + + /* If nr exceeds the number of threads there is nothing todo */ + pos = NULL; + if (nr && nr >= get_nr_threads(leader)) + goto out; + + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward. + */ + for (pos = leader; nr > 0; --nr) { + pos = next_thread(pos); + if (pos == leader) { + pos = NULL; + goto out; + } + } +found: + get_task_struct(pos); +out: + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; +} + +static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, int tid) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", tid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_task_instantiate, task, NULL); +} + +/* for the /proc/TGID/task/ directories */ +static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *leader = NULL; + struct task_struct *task; + int retval = -ENOENT; + ino_t ino; + int tid; + struct pid_namespace *ns; + + task = get_proc_task(inode); + if (!task) + goto out_no_task; + rcu_read_lock(); + if (pid_alive(task)) { + leader = task->group_leader; + get_task_struct(leader); + } + rcu_read_unlock(); + put_task_struct(task); + if (!leader) + goto out_no_task; + retval = 0; + + switch ((unsigned long)filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + /* fall through */ + } + + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. 
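The readdir paths above are exactly what a userspace process lister drives: proc_pid_readdir() serves the numeric entries of /proc, and proc_task_readdir() the per-thread entries under /proc/<pid>/task. A minimal userspace sketch of such a consumer follows; it is illustrative only, not part of this patch, and the demo file name is hypothetical.

/* demo_proclist.c -- hypothetical consumer of the readdir paths above:
 * numeric /proc entries are thread group leaders, /proc/self/task/ holds
 * one entry per thread of the caller. Build with: gcc -pthread demo_proclist.c */
#include <ctype.h>
#include <dirent.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void list_numeric(const char *path, const char *what)
{
        DIR *d = opendir(path);
        struct dirent *de;

        if (!d)
                return;
        while ((de = readdir(d)) != NULL)
                if (isdigit((unsigned char)de->d_name[0]))
                        printf("%s %s\n", what, de->d_name);
        closedir(d);
}

static void *idle_thread(void *arg)
{
        sleep(1);               /* keep the extra tid visible while we list it */
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, idle_thread, NULL);
        list_numeric("/proc", "tgid");          /* served by proc_pid_readdir() */
        list_numeric("/proc/self/task", "tid"); /* served by proc_task_readdir() */
        pthread_join(t, NULL);
        return 0;
}

A task can disappear between the readdir() and any later open() of its directory; that is the race the pid_revalidate() dentry hook above is there to close.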
+ */ + ns = filp->f_dentry->d_sb->s_fs_info; + tid = (int)filp->f_version; + filp->f_version = 0; + for (task = first_tid(leader, tid, filp->f_pos - 2, ns); + task; + task = next_tid(task), filp->f_pos++) { + tid = task_pid_nr_ns(task, ns); + if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { + /* returning this tgid failed, save it as the first + * pid for the next readir call */ + filp->f_version = (u64)tid; + put_task_struct(task); + break; + } + } +out: + put_task_struct(leader); +out_no_task: + return retval; +} + +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + generic_fillattr(inode, stat); + + if (p) { + stat->nlink += get_nr_threads(p); + put_task_struct(p); + } + + return 0; +} + +static const struct inode_operations proc_task_inode_operations = { + .lookup = proc_task_lookup, + .getattr = proc_task_getattr, + .setattr = proc_setattr, +}; + +static const struct file_operations proc_task_operations = { + .read = generic_read_dir, + .readdir = proc_task_readdir, + .llseek = default_llseek, +}; diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c new file mode 100644 index 00000000..82676e3f --- /dev/null +++ b/fs/proc/cmdline.c @@ -0,0 +1,29 @@ +#include +#include +#include +#include + +static int cmdline_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", saved_command_line); + return 0; +} + +static int cmdline_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cmdline_proc_show, NULL); +} + +static const struct file_operations cmdline_proc_fops = { + .open = cmdline_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_cmdline_init(void) +{ + proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + return 0; +} +module_init(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c new file mode 100644 index 00000000..b701eaa4 --- /dev/null +++ b/fs/proc/consoles.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2010 Werner Fink, Jiri Slaby + * + * Licensed under GPLv2 + */ + +#include +#include +#include +#include +#include + +/* + * This is handler for /proc/consoles + */ +static int show_console_dev(struct seq_file *m, void *v) +{ + static const struct { + short flag; + char name; + } con_flags[] = { + { CON_ENABLED, 'E' }, + { CON_CONSDEV, 'C' }, + { CON_BOOT, 'B' }, + { CON_PRINTBUFFER, 'p' }, + { CON_BRL, 'b' }, + { CON_ANYTIME, 'a' }, + }; + char flags[ARRAY_SIZE(con_flags) + 1]; + struct console *con = v; + unsigned int a; + int len; + dev_t dev = 0; + + if (con->device) { + const struct tty_driver *driver; + int index; + driver = con->device(con, &index); + if (driver) { + dev = MKDEV(driver->major, driver->minor_start); + dev += index; + } + } + + for (a = 0; a < ARRAY_SIZE(con_flags); a++) + flags[a] = (con->flags & con_flags[a].flag) ? + con_flags[a].name : ' '; + flags[a] = 0; + + seq_printf(m, "%s%d%n", con->name, con->index, &len); + len = 21 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-', + con->write ? 'W' : '-', con->unblank ? 
'U' : '-', + flags); + if (dev) + seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); + + seq_printf(m, "\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + struct console *con; + loff_t off = 0; + + console_lock(); + for_each_console(con) + if (off++ == *pos) + break; + + return con; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct console *con = v; + ++*pos; + return con->next; +} + +static void c_stop(struct seq_file *m, void *v) +{ + console_unlock(); +} + +static const struct seq_operations consoles_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_console_dev +}; + +static int consoles_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &consoles_op); +} + +static const struct file_operations proc_consoles_operations = { + .open = consoles_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_consoles_init(void) +{ + proc_create("consoles", 0, NULL, &proc_consoles_operations); + return 0; +} +module_init(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c new file mode 100644 index 00000000..5a1e539a --- /dev/null +++ b/fs/proc/cpuinfo.c @@ -0,0 +1,24 @@ +#include +#include +#include +#include + +extern const struct seq_operations cpuinfo_op; +static int cpuinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cpuinfo_op); +} + +static const struct file_operations proc_cpuinfo_operations = { + .open = cpuinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_cpuinfo_init(void) +{ + proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + return 0; +} +module_init(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c new file mode 100644 index 00000000..b1434716 --- /dev/null +++ b/fs/proc/devices.c @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +static int devinfo_show(struct seq_file *f, void *v) +{ + int i = *(loff_t *) v; + + if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i == 0) + seq_puts(f, "Character devices:\n"); + chrdev_show(f, i); + } +#ifdef CONFIG_BLOCK + else { + i -= CHRDEV_MAJOR_HASH_SIZE; + if (i == 0) + seq_puts(f, "\nBlock devices:\n"); + blkdev_show(f, i); + } +#endif + return 0; +} + +static void *devinfo_start(struct seq_file *f, loff_t *pos) +{ + if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return pos; + return NULL; +} + +static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return NULL; + return pos; +} + +static void devinfo_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations devinfo_ops = { + .start = devinfo_start, + .next = devinfo_next, + .stop = devinfo_stop, + .show = devinfo_show +}; + +static int devinfo_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &devinfo_ops); +} + +static const struct file_operations proc_devinfo_operations = { + .open = devinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_devices_init(void) +{ + proc_create("devices", 0, NULL, &proc_devinfo_operations); + return 0; +} +module_init(proc_devices_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c new file mode 100644 index 00000000..f1637f17 --- /dev/null +++ b/fs/proc/generic.c @@ -0,0 +1,853 @@ +/* + * proc/fs/generic.c --- generic routines for 
the proc-fs + * + * This file contains generic proc-fs routines for handling + * directories and files. + * + * Copyright (C) 1991, 1992 Linus Torvalds. + * Copyright (C) 1997 Theodore Ts'o + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +DEFINE_SPINLOCK(proc_subdir_lock); + +static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) +{ + if (de->namelen != len) + return 0; + return !memcmp(name, de->name, len); +} + +/* buffer size is one page but our output routines use some slack for overruns */ +#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) + +static ssize_t +__proc_file_read(struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *page; + ssize_t retval=0; + int eof=0; + ssize_t n, count; + char *start; + struct proc_dir_entry * dp; + unsigned long long pos; + + /* + * Gaah, please just use "seq_file" instead. The legacy /proc + * interfaces cut loff_t down to off_t for reads, and ignore + * the offset entirely for writes.. + */ + pos = *ppos; + if (pos > MAX_NON_LFS) + return 0; + if (nbytes > MAX_NON_LFS - pos) + nbytes = MAX_NON_LFS - pos; + + dp = PDE(inode); + if (!(page = (char*) __get_free_page(GFP_TEMPORARY))) + return -ENOMEM; + + while ((nbytes > 0) && !eof) { + count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); + + start = NULL; + if (dp->read_proc) { + /* + * How to be a proc read function + * ------------------------------ + * Prototype: + * int f(char *buffer, char **start, off_t offset, + * int count, int *peof, void *dat) + * + * Assume that the buffer is "count" bytes in size. + * + * If you know you have supplied all the data you + * have, set *peof. + * + * You have three ways to return data: + * 0) Leave *start = NULL. (This is the default.) + * Put the data of the requested offset at that + * offset within the buffer. Return the number (n) + * of bytes there are from the beginning of the + * buffer up to the last byte of data. If the + * number of supplied bytes (= n - offset) is + * greater than zero and you didn't signal eof + * and the reader is prepared to take more data + * you will be called again with the requested + * offset advanced by the number of bytes + * absorbed. This interface is useful for files + * no larger than the buffer. + * 1) Set *start = an unsigned long value less than + * the buffer address but greater than zero. + * Put the data of the requested offset at the + * beginning of the buffer. Return the number of + * bytes of data placed there. If this number is + * greater than zero and you didn't signal eof + * and the reader is prepared to take more data + * you will be called again with the requested + * offset advanced by *start. This interface is + * useful when you have a large file consisting + * of a series of blocks which you want to count + * and return as wholes. + * (Hack by Paul.Russell@rustcorp.com.au) + * 2) Set *start = an address within the buffer. + * Put the data of the requested offset at *start. + * Return the number of bytes of data placed there. + * If this number is greater than zero and you + * didn't signal eof and the reader is prepared to + * take more data you will be called again with the + * requested offset advanced by the number of bytes + * absorbed. 
+ */ + n = dp->read_proc(page, &start, *ppos, + count, &eof, dp->data); + } else + break; + + if (n == 0) /* end of file */ + break; + if (n < 0) { /* error */ + if (retval == 0) + retval = n; + break; + } + + if (start == NULL) { + if (n > PAGE_SIZE) { + printk(KERN_ERR + "proc_file_read: Apparent buffer overflow!\n"); + n = PAGE_SIZE; + } + n -= *ppos; + if (n <= 0) + break; + if (n > count) + n = count; + start = page + *ppos; + } else if (start < page) { + if (n > PAGE_SIZE) { + printk(KERN_ERR + "proc_file_read: Apparent buffer overflow!\n"); + n = PAGE_SIZE; + } + if (n > count) { + /* + * Don't reduce n because doing so might + * cut off part of a data block. + */ + printk(KERN_WARNING + "proc_file_read: Read count exceeded\n"); + } + } else /* start >= page */ { + unsigned long startoff = (unsigned long)(start - page); + if (n > (PAGE_SIZE - startoff)) { + printk(KERN_ERR + "proc_file_read: Apparent buffer overflow!\n"); + n = PAGE_SIZE - startoff; + } + if (n > count) + n = count; + } + + n -= copy_to_user(buf, start < page ? page : start, n); + if (n == 0) { + if (retval == 0) + retval = -EFAULT; + break; + } + + *ppos += start < page ? (unsigned long)start : n; + nbytes -= n; + buf += n; + retval += n; + } + free_page((unsigned long) page); + return retval; +} + +static ssize_t +proc_file_read(struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + spin_unlock(&pde->pde_unload_lock); + + rv = __proc_file_read(file, buf, nbytes, ppos); + + pde_users_dec(pde); + return rv; +} + +static ssize_t +proc_file_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + + if (pde->write_proc) { + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + spin_unlock(&pde->pde_unload_lock); + + /* FIXME: does this routine need ppos? probably... 
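The "How to be a proc read function" comment above documents the legacy ->read_proc contract that __proc_file_read() drives. A hedged out-of-tree sketch of "method 0" (data written from the start of the one-page buffer, *start left NULL, *peof raised once everything has been supplied) might look like the following; the demo_* names are invented here and the module is not part of this patch.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/string.h>

static char demo_payload[] = "hello from a legacy read_proc handler\n";

/* "Method 0" from the comment above: the whole output fits in one page,
 * so it is written from the beginning of the buffer, the total length is
 * returned and *peof is set; __proc_file_read() does the offsetting. */
static int demo_read_proc(char *page, char **start, off_t off,
                          int count, int *peof, void *data)
{
        int len = sprintf(page, "%s", (char *)data);

        *peof = 1;
        return len;
}

static int __init demo_legacy_init(void)
{
        struct proc_dir_entry *e;

        e = create_proc_entry("demo_legacy", 0444, NULL);
        if (!e)
                return -ENOMEM;
        e->read_proc = demo_read_proc;
        e->data = demo_payload;
        return 0;
}

static void __exit demo_legacy_exit(void)
{
        remove_proc_entry("demo_legacy", NULL);
}

module_init(demo_legacy_init);
module_exit(demo_legacy_exit);
MODULE_LICENSE("GPL");

Note that the comment itself steers new code toward seq_file instead; the sketches further below use that interface.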
*/ + rv = pde->write_proc(file, buffer, count, pde->data); + pde_users_dec(pde); + } + return rv; +} + + +static loff_t +proc_file_lseek(struct file *file, loff_t offset, int orig) +{ + loff_t retval = -EINVAL; + switch (orig) { + case 1: + offset += file->f_pos; + /* fallthrough */ + case 0: + if (offset < 0 || offset > MAX_NON_LFS) + break; + file->f_pos = retval = offset; + } + return retval; +} + +static const struct file_operations proc_file_operations = { + .llseek = proc_file_lseek, + .read = proc_file_read, + .write = proc_file_write, +}; + +static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct proc_dir_entry *de = PDE(inode); + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, iattr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + de->uid = inode->i_uid; + de->gid = inode->i_gid; + de->mode = inode->i_mode; + return 0; +} + +static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct proc_dir_entry *de = PROC_I(inode)->pde; + if (de && de->nlink) + inode->i_nlink = de->nlink; + + generic_fillattr(inode, stat); + return 0; +} + +static const struct inode_operations proc_file_inode_operations = { + .setattr = proc_notify_change, +}; + +/* + * This function parses a name such as "tty/driver/serial", and + * returns the struct proc_dir_entry for "/proc/tty/driver", and + * returns "serial" in residual. + */ +static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + const char *cp = name, *next; + struct proc_dir_entry *de; + unsigned int len; + + de = *ret; + if (!de) + de = &proc_root; + + while (1) { + next = strchr(cp, '/'); + if (!next) + break; + + len = next - cp; + for (de = de->subdir; de ; de = de->next) { + if (proc_match(len, cp, de)) + break; + } + if (!de) { + WARN(1, "name '%s'\n", name); + return -ENOENT; + } + cp += len + 1; + } + *residual = cp; + *ret = de; + return 0; +} + +static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + int rv; + + spin_lock(&proc_subdir_lock); + rv = __xlate_proc_name(name, ret, residual); + spin_unlock(&proc_subdir_lock); + return rv; +} + +static DEFINE_IDA(proc_inum_ida); +static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ + +#define PROC_DYNAMIC_FIRST 0xF0000000U + +/* + * Return an inode number between PROC_DYNAMIC_FIRST and + * 0xffffffff, or zero on failure. 
+ */ +static unsigned int get_inode_number(void) +{ + unsigned int i; + int error; + +retry: + if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) + return 0; + + spin_lock(&proc_inum_lock); + error = ida_get_new(&proc_inum_ida, &i); + spin_unlock(&proc_inum_lock); + if (error == -EAGAIN) + goto retry; + else if (error) + return 0; + + if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { + spin_lock(&proc_inum_lock); + ida_remove(&proc_inum_ida, i); + spin_unlock(&proc_inum_lock); + return 0; + } + return PROC_DYNAMIC_FIRST + i; +} + +static void release_inode_number(unsigned int inum) +{ + spin_lock(&proc_inum_lock); + ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); + spin_unlock(&proc_inum_lock); +} + +static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, PDE(dentry->d_inode)->data); + return NULL; +} + +static const struct inode_operations proc_link_inode_operations = { + .readlink = generic_readlink, + .follow_link = proc_follow_link, +}; + +/* + * As some entries in /proc are volatile, we want to + * get rid of unused dentries. This could be made + * smarter: we could keep a "volatile" flag in the + * inode to indicate which ones to keep. + */ +static int proc_delete_dentry(const struct dentry * dentry) +{ + return 1; +} + +static const struct dentry_operations proc_dentry_operations = +{ + .d_delete = proc_delete_dentry, +}; + +/* + * Don't create negative dentries here, return -ENOENT by hand + * instead. + */ +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = NULL; + int error = -ENOENT; + + spin_lock(&proc_subdir_lock); + for (de = de->subdir; de ; de = de->next) { + if (de->namelen != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + pde_get(de); + spin_unlock(&proc_subdir_lock); + error = -EINVAL; + inode = proc_get_inode(dir->i_sb, de); + goto out_unlock; + } + } + spin_unlock(&proc_subdir_lock); +out_unlock: + + if (inode) { + d_set_d_op(dentry, &proc_dentry_operations); + d_add(dentry, inode); + return NULL; + } + if (de) + pde_put(de); + return ERR_PTR(error); +} + +struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + return proc_lookup_de(PDE(dir), dir, dentry); +} + +/* + * This returns non-zero if at EOF, so that the /proc + * root directory can use this and check if it should + * continue with the entries.. + * + * Note that the VFS-layer doesn't care about the return + * value of the readdir() call, as long as it's non-negative + * for success.. 
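get_inode_number() above uses the ida_pre_get()/ida_get_new() retry idiom of this kernel generation. The same idiom, lifted out as a hedged standalone sketch for any driver that needs small unique IDs; the demo_* names are invented here and this is not part of the patch.

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>

static DEFINE_IDA(demo_ida);
static DEFINE_SPINLOCK(demo_ida_lock);  /* protects demo_ida, like proc_inum_lock */

static int demo_alloc_id(void)
{
        int id, err;

retry:
        /* preload outside the lock; may sleep */
        if (!ida_pre_get(&demo_ida, GFP_KERNEL))
                return -ENOMEM;
        spin_lock(&demo_ida_lock);
        err = ida_get_new(&demo_ida, &id);
        spin_unlock(&demo_ida_lock);
        if (err == -EAGAIN)     /* preloaded node taken by a racer: retry */
                goto retry;
        return err ? err : id;
}

static void demo_free_id(int id)
{
        spin_lock(&demo_ida_lock);
        ida_remove(&demo_ida, id);
        spin_unlock(&demo_ida_lock);
}

static int __init demo_ida_init(void)
{
        int id = demo_alloc_id();

        if (id < 0)
                return id;
        pr_info("demo_ida: got id %d\n", id);
        demo_free_id(id);
        return 0;
}

static void __exit demo_ida_exit(void)
{
}

module_init(demo_ida_init);
module_exit(demo_ida_exit);
MODULE_LICENSE("GPL");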
+ */ +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, + filldir_t filldir) +{ + unsigned int ino; + int i; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; + + ino = inode->i_ino; + i = filp->f_pos; + switch (i) { + case 0: + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, i, + parent_ino(filp->f_path.dentry), + DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + spin_lock(&proc_subdir_lock); + de = de->subdir; + i -= 2; + for (;;) { + if (!de) { + ret = 1; + spin_unlock(&proc_subdir_lock); + goto out; + } + if (!i) + break; + de = de->next; + i--; + } + + do { + struct proc_dir_entry *next; + + /* filldir passes info to user space */ + pde_get(de); + spin_unlock(&proc_subdir_lock); + if (filldir(dirent, de->name, de->namelen, filp->f_pos, + de->low_ino, de->mode >> 12) < 0) { + pde_put(de); + goto out; + } + spin_lock(&proc_subdir_lock); + filp->f_pos++; + next = de->next; + pde_put(de); + de = next; + } while (de); + spin_unlock(&proc_subdir_lock); + } + ret = 1; +out: + return ret; +} + +int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + + return proc_readdir_de(PDE(inode), filp, dirent, filldir); +} + +/* + * These are the generic /proc directory operations. They + * use the in-memory "struct proc_dir_entry" tree to parse + * the /proc directory. + */ +static const struct file_operations proc_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = proc_readdir, +}; + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_dir_inode_operations = { + .lookup = proc_lookup, + .getattr = proc_getattr, + .setattr = proc_notify_change, +}; + +static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +{ + unsigned int i; + struct proc_dir_entry *tmp; + + i = get_inode_number(); + if (i == 0) + return -EAGAIN; + dp->low_ino = i; + + if (S_ISDIR(dp->mode)) { + if (dp->proc_iops == NULL) { + dp->proc_fops = &proc_dir_operations; + dp->proc_iops = &proc_dir_inode_operations; + } + dir->nlink++; + } else if (S_ISLNK(dp->mode)) { + if (dp->proc_iops == NULL) + dp->proc_iops = &proc_link_inode_operations; + } else if (S_ISREG(dp->mode)) { + if (dp->proc_fops == NULL) + dp->proc_fops = &proc_file_operations; + if (dp->proc_iops == NULL) + dp->proc_iops = &proc_file_inode_operations; + } + + spin_lock(&proc_subdir_lock); + + for (tmp = dir->subdir; tmp; tmp = tmp->next) + if (strcmp(tmp->name, dp->name) == 0) { + WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); + break; + } + + dp->next = dir->subdir; + dp->parent = dir; + dir->subdir = dp; + spin_unlock(&proc_subdir_lock); + + return 0; +} + +static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, + const char *name, + mode_t mode, + nlink_t nlink) +{ + struct proc_dir_entry *ent = NULL; + const char *fn = name; + unsigned int len; + + /* make sure name is valid */ + if (!name || !strlen(name)) goto out; + + if (xlate_proc_name(name, parent, &fn) != 0) + goto out; + + /* At this point there must not be any '/' characters beyond *fn */ + if (strchr(fn, '/')) + goto out; + + len = strlen(fn); + + ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); + if (!ent) goto out; + + memset(ent, 0, sizeof(struct proc_dir_entry)); + 
memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); + ent->name = ((char *) ent) + sizeof(*ent); + ent->namelen = len; + ent->mode = mode; + ent->nlink = nlink; + atomic_set(&ent->count, 1); + ent->pde_users = 0; + spin_lock_init(&ent->pde_unload_lock); + ent->pde_unload_completion = NULL; + INIT_LIST_HEAD(&ent->pde_openers); + out: + return ent; +} + +struct proc_dir_entry *proc_symlink(const char *name, + struct proc_dir_entry *parent, const char *dest) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, + (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); + + if (ent) { + ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); + if (ent->data) { + strcpy((char*)ent->data,dest); + if (proc_register(parent, ent) < 0) { + kfree(ent->data); + kfree(ent); + ent = NULL; + } + } else { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(proc_symlink); + +struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, S_IFDIR | mode, 2); + if (ent) { + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(proc_mkdir_mode); + +struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2); + if (ent) { + ent->data = net; + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL_GPL(proc_net_mkdir); + +struct proc_dir_entry *proc_mkdir(const char *name, + struct proc_dir_entry *parent) +{ + return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); +} +EXPORT_SYMBOL(proc_mkdir); + +struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + nlink_t nlink; + + if (S_ISDIR(mode)) { + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO | S_IXUGO; + nlink = 2; + } else { + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + nlink = 1; + } + + ent = __proc_create(&parent, name, mode, nlink); + if (ent) { + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(create_proc_entry); + +struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops, + void *data) +{ + struct proc_dir_entry *pde; + nlink_t nlink; + + if (S_ISDIR(mode)) { + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO | S_IXUGO; + nlink = 2; + } else { + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + nlink = 1; + } + + pde = __proc_create(&parent, name, mode, nlink); + if (!pde) + goto out; + pde->proc_fops = proc_fops; + pde->data = data; + if (proc_register(parent, pde) < 0) + goto out_free; + return pde; +out_free: + kfree(pde); +out: + return NULL; +} +EXPORT_SYMBOL(proc_create_data); + +static void free_proc_entry(struct proc_dir_entry *de) +{ + release_inode_number(de->low_ino); + + if (S_ISLNK(de->mode)) + kfree(de->data); + kfree(de); +} + +void pde_put(struct proc_dir_entry *pde) +{ + if (atomic_dec_and_test(&pde->count)) + free_proc_entry(pde); +} + +/* + * Remove a /proc entry and free it if it's not currently in use. 
+ */ +void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry **p; + struct proc_dir_entry *de = NULL; + const char *fn = name; + unsigned int len; + + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return; + } + len = strlen(fn); + + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (proc_match(len, fn, *p)) { + de = *p; + *p = de->next; + de->next = NULL; + break; + } + } + spin_unlock(&proc_subdir_lock); + if (!de) { + WARN(1, "name '%s'\n", name); + return; + } + + spin_lock(&de->pde_unload_lock); + /* + * Stop accepting new callers into module. If you're + * dynamically allocating ->proc_fops, save a pointer somewhere. + */ + de->proc_fops = NULL; + /* Wait until all existing callers into module are done. */ + if (de->pde_users > 0) { + DECLARE_COMPLETION_ONSTACK(c); + + if (!de->pde_unload_completion) + de->pde_unload_completion = &c; + + spin_unlock(&de->pde_unload_lock); + + wait_for_completion(de->pde_unload_completion); + + spin_lock(&de->pde_unload_lock); + } + + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + list_del(&pdeo->lh); + spin_unlock(&de->pde_unload_lock); + pdeo->release(pdeo->inode, pdeo->file); + kfree(pdeo); + spin_lock(&de->pde_unload_lock); + } + spin_unlock(&de->pde_unload_lock); + + if (S_ISDIR(de->mode)) + parent->nlink--; + de->nlink = 0; + WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " + "'%s/%s', leaking at least '%s'\n", __func__, + de->parent->name, de->name, de->subdir->name); + pde_put(de); +} +EXPORT_SYMBOL(remove_proc_entry); diff --git a/fs/proc/inode.c b/fs/proc/inode.c new file mode 100644 index 00000000..74b48cfa --- /dev/null +++ b/fs/proc/inode.c @@ -0,0 +1,497 @@ +/* + * linux/fs/proc/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +static void proc_evict_inode(struct inode *inode) +{ + struct proc_dir_entry *de; + struct ctl_table_header *head; + const struct proc_ns_operations *ns_ops; + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + /* Stop tracking associated processes */ + put_pid(PROC_I(inode)->pid); + + /* Let go of any associated proc directory entry */ + de = PROC_I(inode)->pde; + if (de) + pde_put(de); + head = PROC_I(inode)->sysctl; + if (head) { + rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + sysctl_head_put(head); + } + /* Release any associated namespace */ + ns_ops = PROC_I(inode)->ns_ops; + if (ns_ops && ns_ops->put) + ns_ops->put(PROC_I(inode)->ns); +} + +static struct kmem_cache * proc_inode_cachep; + +static struct inode *proc_alloc_inode(struct super_block *sb) +{ + struct proc_inode *ei; + struct inode *inode; + + ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + ei->pid = NULL; + ei->fd = 0; + ei->op.proc_get_link = NULL; + ei->pde = NULL; + ei->sysctl = NULL; + ei->sysctl_entry = NULL; + ei->ns = NULL; + ei->ns_ops = NULL; + inode = &ei->vfs_inode; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + return inode; +} + +static void proc_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + 
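The generic.c API added above (proc_mkdir(), proc_create_data(), proc_symlink(), remove_proc_entry()) is typically consumed from a driver roughly as in the sketch below. This is a hedged out-of-tree example, not part of the patch: the demo_* names and the /proc/demo_dir path are hypothetical, and it leans on the seq_file single_open() helper rather than the legacy read_proc path shown earlier.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static struct proc_dir_entry *demo_dir;
static unsigned long demo_counter = 42;         /* illustrative payload */

static int demo_show(struct seq_file *m, void *v)
{
        unsigned long *val = m->private;        /* handed over by single_open() */

        seq_printf(m, "%lu\n", *val);
        return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
        /* PDE(inode)->data is whatever proc_create_data() stored */
        return single_open(file, demo_show, PDE(inode)->data);
}

static const struct file_operations demo_fops = {
        .owner   = THIS_MODULE,
        .open    = demo_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

static int __init demo_dir_init(void)
{
        demo_dir = proc_mkdir("demo_dir", NULL);
        if (!demo_dir)
                return -ENOMEM;
        if (!proc_create_data("counter", 0444, demo_dir, &demo_fops,
                              &demo_counter)) {
                remove_proc_entry("demo_dir", NULL);
                return -ENOMEM;
        }
        /* proc_symlink() stores its target string in ->data, as seen above;
         * return value ignored here for brevity */
        proc_symlink("counter_link", demo_dir, "counter");
        return 0;
}

static void __exit demo_dir_exit(void)
{
        /* children must go before the directory: remove_proc_entry() above
         * warns about (and leaks) non-empty directories */
        remove_proc_entry("counter_link", demo_dir);
        remove_proc_entry("counter", demo_dir);
        remove_proc_entry("demo_dir", NULL);
}

module_init(demo_dir_init);
module_exit(demo_dir_exit);
MODULE_LICENSE("GPL");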
kmem_cache_free(proc_inode_cachep, PROC_I(inode)); +} + +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + +static void init_once(void *foo) +{ + struct proc_inode *ei = (struct proc_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void __init proc_init_inodecache(void) +{ + proc_inode_cachep = kmem_cache_create("proc_inode_cache", + sizeof(struct proc_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + init_once); +} + +static const struct super_operations proc_sops = { + .alloc_inode = proc_alloc_inode, + .destroy_inode = proc_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = proc_evict_inode, + .statfs = simple_statfs, +}; + +static void __pde_users_dec(struct proc_dir_entry *pde) +{ + pde->pde_users--; + if (pde->pde_unload_completion && pde->pde_users == 0) + complete(pde->pde_unload_completion); +} + +void pde_users_dec(struct proc_dir_entry *pde) +{ + spin_lock(&pde->pde_unload_lock); + __pde_users_dec(pde); + spin_unlock(&pde->pde_unload_lock); +} + +static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + loff_t rv = -EINVAL; + loff_t (*llseek)(struct file *, loff_t, int); + + spin_lock(&pde->pde_unload_lock); + /* + * remove_proc_entry() is going to delete PDE (as part of module + * cleanup sequence). No new callers into module allowed. + */ + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + /* + * Bump refcount so that remove_proc_entry will wail for ->llseek to + * complete. + */ + pde->pde_users++; + /* + * Save function pointer under lock, to protect against ->proc_fops + * NULL'ifying right after ->pde_unload_lock is dropped. 
+ */ + llseek = pde->proc_fops->llseek; + spin_unlock(&pde->pde_unload_lock); + + if (!llseek) + llseek = default_llseek; + rv = llseek(file, offset, whence); + + pde_users_dec(pde); + return rv; +} + +static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + read = pde->proc_fops->read; + spin_unlock(&pde->pde_unload_lock); + + if (read) + rv = read(file, buf, count, ppos); + + pde_users_dec(pde); + return rv; +} + +static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + write = pde->proc_fops->write; + spin_unlock(&pde->pde_unload_lock); + + if (write) + rv = write(file, buf, count, ppos); + + pde_users_dec(pde); + return rv; +} + +static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + unsigned int rv = DEFAULT_POLLMASK; + unsigned int (*poll)(struct file *, struct poll_table_struct *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + poll = pde->proc_fops->poll; + spin_unlock(&pde->pde_unload_lock); + + if (poll) + rv = poll(file, pts); + + pde_users_dec(pde); + return rv; +} + +static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + long rv = -ENOTTY; + long (*ioctl)(struct file *, unsigned int, unsigned long); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + ioctl = pde->proc_fops->unlocked_ioctl; + spin_unlock(&pde->pde_unload_lock); + + if (ioctl) + rv = ioctl(file, cmd, arg); + + pde_users_dec(pde); + return rv; +} + +#ifdef CONFIG_COMPAT +static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + long rv = -ENOTTY; + long (*compat_ioctl)(struct file *, unsigned int, unsigned long); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + compat_ioctl = pde->proc_fops->compat_ioctl; + spin_unlock(&pde->pde_unload_lock); + + if (compat_ioctl) + rv = compat_ioctl(file, cmd, arg); + + pde_users_dec(pde); + return rv; +} +#endif + +static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + int rv = -EIO; + int (*mmap)(struct file *, struct vm_area_struct *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + mmap = pde->proc_fops->mmap; + spin_unlock(&pde->pde_unload_lock); + + if (mmap) + rv = mmap(file, vma); + + pde_users_dec(pde); + return rv; +} + +static int proc_reg_open(struct 
inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + int (*open)(struct inode *, struct file *); + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + /* + * What for, you ask? Well, we can have open, rmmod, remove_proc_entry + * sequence. ->release won't be called because ->proc_fops will be + * cleared. Depending on complexity of ->release, consequences vary. + * + * We can't wait for mercy when close will be done for real, it's + * deadlockable: rmmod foo release + * by hand in remove_proc_entry(). For this, save opener's credentials + * for later. + */ + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + if (!pdeo) + return -ENOMEM; + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + kfree(pdeo); + return -EINVAL; + } + pde->pde_users++; + open = pde->proc_fops->open; + release = pde->proc_fops->release; + spin_unlock(&pde->pde_unload_lock); + + if (open) + rv = open(inode, file); + + spin_lock(&pde->pde_unload_lock); + if (rv == 0 && release) { + /* To know what to release. */ + pdeo->inode = inode; + pdeo->file = file; + /* Strictly for "too late" ->release in proc_reg_release(). */ + pdeo->release = release; + list_add(&pdeo->lh, &pde->pde_openers); + } else + kfree(pdeo); + __pde_users_dec(pde); + spin_unlock(&pde->pde_unload_lock); + return rv; +} + +static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, + struct inode *inode, struct file *file) +{ + struct pde_opener *pdeo; + + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->inode == inode && pdeo->file == file) + return pdeo; + } + return NULL; +} + +static int proc_reg_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + spin_lock(&pde->pde_unload_lock); + pdeo = find_pde_opener(pde, inode, file); + if (!pde->proc_fops) { + /* + * Can't simply exit, __fput() will think that everything is OK, + * and move on to freeing struct file. remove_proc_entry() will + * find slacker in opener's list and will try to do non-trivial + * things with struct file. Therefore, remove opener from list. + * + * But if opener is removed from list, who will ->release it? 
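The comments above describe the unload problem the pde_users / pde_unload_completion machinery solves: stop new callers at the gate, then wait for the callers already inside to drain. The same protocol restated as a hedged userspace analogy, using pthreads in place of spinlocks and completions; every name here is invented and this is not part of the patch.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int gate_open = 1;       /* analogue of pde->proc_fops != NULL */
static int users;               /* analogue of pde->pde_users */

static int enter(void)
{
        int ok;

        pthread_mutex_lock(&lock);
        ok = gate_open;
        if (ok)
                users++;        /* caller is now "inside the module" */
        pthread_mutex_unlock(&lock);
        return ok;
}

static void leave(void)
{
        pthread_mutex_lock(&lock);
        if (--users == 0)
                pthread_cond_signal(&drained);  /* analogue of complete() */
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        if (enter()) {
                usleep(1000);   /* stand-in for ->read()/->write() work */
                leave();
        }
        return NULL;
}

int main(void)
{
        pthread_t t[8];
        int i;

        for (i = 0; i < 8; i++)
                pthread_create(&t[i], NULL, worker, NULL);

        /* analogue of remove_proc_entry(): close the gate, wait for users */
        pthread_mutex_lock(&lock);
        gate_open = 0;
        while (users > 0)
                pthread_cond_wait(&drained, &lock);
        pthread_mutex_unlock(&lock);
        puts("all callers drained, safe to tear down");

        for (i = 0; i < 8; i++)
                pthread_join(t[i], NULL);
        return 0;
}

The kernel version uses a per-entry use count and a completion rather than a condition variable, but the ordering is the same: flip the gate under the lock first, then sleep until the count drains to zero.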
+ */ + if (pdeo) { + list_del(&pdeo->lh); + spin_unlock(&pde->pde_unload_lock); + rv = pdeo->release(inode, file); + kfree(pdeo); + } else + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + release = pde->proc_fops->release; + if (pdeo) { + list_del(&pdeo->lh); + kfree(pdeo); + } + spin_unlock(&pde->pde_unload_lock); + + if (release) + rv = release(inode, file); + + pde_users_dec(pde); + return rv; +} + +static const struct file_operations proc_reg_file_ops = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = proc_reg_compat_ioctl, +#endif + .mmap = proc_reg_mmap, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +#ifdef CONFIG_COMPAT +static const struct file_operations proc_reg_file_ops_no_compat = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .mmap = proc_reg_mmap, + .open = proc_reg_open, + .release = proc_reg_release, +}; +#endif + +struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) +{ + struct inode * inode; + + inode = iget_locked(sb, de->low_ino); + if (!inode) + return NULL; + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + PROC_I(inode)->fd = 0; + PROC_I(inode)->pde = de; + + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } + if (de->size) + inode->i_size = de->size; + if (de->nlink) + inode->i_nlink = de->nlink; + if (de->proc_iops) + inode->i_op = de->proc_iops; + if (de->proc_fops) { + if (S_ISREG(inode->i_mode)) { +#ifdef CONFIG_COMPAT + if (!de->proc_fops->compat_ioctl) + inode->i_fop = + &proc_reg_file_ops_no_compat; + else +#endif + inode->i_fop = &proc_reg_file_ops; + } else { + inode->i_fop = de->proc_fops; + } + } + unlock_new_inode(inode); + } else + pde_put(de); + return inode; +} + +int proc_fill_super(struct super_block *s) +{ + struct inode * root_inode; + + s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + + pde_get(&proc_root); + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) + goto out_no_root; + root_inode->i_uid = 0; + root_inode->i_gid = 0; + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) + goto out_no_root; + return 0; + +out_no_root: + printk("proc_read_super: get root inode failed\n"); + iput(root_inode); + pde_put(&proc_root); + return -ENOMEM; +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h new file mode 100644 index 00000000..7838e5cf --- /dev/null +++ b/fs/proc/internal.h @@ -0,0 +1,147 @@ +/* internal.h: internal procfs definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include + +extern struct proc_dir_entry proc_root; +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +#else +static inline void proc_sys_init(void) { } +#endif +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +struct vmalloc_info { + unsigned long used; + unsigned long largest_chunk; +}; + +extern struct mm_struct *mm_for_maps(struct task_struct *); + +#ifdef CONFIG_MMU +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) +extern void get_vmalloc_info(struct vmalloc_info *vmi); +#else + +#define VMALLOC_TOTAL 0UL +#define get_vmalloc_info(vmi) \ +do { \ + (vmi)->used = 0; \ + (vmi)->largest_chunk = 0; \ +} while(0) +#endif + +extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); + +extern const struct file_operations proc_maps_operations; +extern const struct file_operations proc_numa_maps_operations; +extern const struct file_operations proc_smaps_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +}; + +void proc_init_inodecache(void); + +static inline struct pid *proc_pid(struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) +{ + return get_pid_task(proc_pid(inode), PIDTYPE_PID); +} + +static inline int proc_fd(struct inode *inode) +{ + return PROC_I(inode)->fd; +} + +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, + struct dentry *dentry); +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, + filldir_t filldir); + +struct pde_opener { + struct inode *inode; + struct file *file; + int (*release)(struct inode *, struct file *); + struct list_head lh; +}; +void pde_users_dec(struct proc_dir_entry *pde); + +extern spinlock_t proc_subdir_lock; + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); +unsigned long task_vsize(struct mm_struct *); +unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, unsigned long *, unsigned long *); +void task_mem(struct seq_file *, struct mm_struct *); + +static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) +{ + atomic_inc(&pde->count); + return pde; +} +void pde_put(struct proc_dir_entry *pde); + +int proc_fill_super(struct super_block *); +struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); + +/* + * These are generic /proc routines that use the internal + * "struct proc_dir_entry" tree to traverse the filesystem. + * + * The /proc root directory has extended versions to take care + * of the /proc/ subdirectories. 
+ */ +int proc_readdir(struct file *, void *, filldir_t); +struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); + + + +/* Lookups */ +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr); +int pid_revalidate(struct dentry *dentry, struct nameidata *nd); +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); +extern const struct dentry_operations pid_dentry_operations; +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int proc_setattr(struct dentry *dentry, struct iattr *attr); + +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c new file mode 100644 index 00000000..05029c0e --- /dev/null +++ b/fs/proc/interrupts.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +/* + * /proc/interrupts + */ +static void *int_seq_start(struct seq_file *f, loff_t *pos) +{ + return (*pos <= nr_irqs) ? pos : NULL; +} + +static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos > nr_irqs) + return NULL; + return pos; +} + +static void int_seq_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations int_seq_ops = { + .start = int_seq_start, + .next = int_seq_next, + .stop = int_seq_stop, + .show = show_interrupts +}; + +static int interrupts_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &int_seq_ops); +} + +static const struct file_operations proc_interrupts_operations = { + .open = interrupts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_interrupts_init(void) +{ + proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + return 0; +} +module_init(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c new file mode 100644 index 00000000..d245cb23 --- /dev/null +++ b/fs/proc/kcore.c @@ -0,0 +1,635 @@ +/* + * fs/proc/kcore.c kernel ELF core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge + * ELF version written by David Howells + * Modified and incorporated into 2.3.x by Tigran Aivazian + * Support to dump vmalloc'd areas (ELF only), Tigran Aivazian + * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CORE_STR "CORE" + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +static struct proc_dir_entry *proc_root_kcore; + + +#ifndef kc_vaddr_to_offset +#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET) +#endif +#ifndef kc_offset_to_vaddr +#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) +#endif + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static LIST_HEAD(kclist_head); +static DEFINE_RWLOCK(kclist_lock); +static int kcore_need_update = 1; + +void +kclist_add(struct kcore_list *new, void *addr, size_t size, int type) +{ + new->addr = (unsigned long)addr; + new->size = size; + new->type = type; + + 
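consoles.c, devices.c and interrupts.c above all follow the same pattern: a seq_operations iterator wired to a /proc entry through proc_create(). A hedged minimal out-of-tree sketch of that pattern, exposing a static table as /proc/demo_items; the demo_* names are invented here and this is not part of the patch.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static const char *demo_items[] = { "alpha", "beta", "gamma" };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
        return *pos < ARRAY_SIZE(demo_items) ? &demo_items[*pos] : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
        (*pos)++;
        return *pos < ARRAY_SIZE(demo_items) ? &demo_items[*pos] : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
        /* nothing to unlock in this sketch */
}

static int demo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%s\n", *(const char **)v);
        return 0;
}

static const struct seq_operations demo_seq_ops = {
        .start = demo_start,
        .next  = demo_next,
        .stop  = demo_stop,
        .show  = demo_show,
};

static int demo_proc_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &demo_seq_ops);
}

static const struct file_operations demo_proc_fops = {
        .owner   = THIS_MODULE,
        .open    = demo_proc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static int __init demo_items_init(void)
{
        if (!proc_create("demo_items", 0, NULL, &demo_proc_fops))
                return -ENOMEM;
        return 0;
}

static void __exit demo_items_exit(void)
{
        remove_proc_entry("demo_items", NULL);
}

module_init(demo_items_init);
module_exit(demo_items_exit);
MODULE_LICENSE("GPL");

The start/next callbacks return a cursor into the table and stop undoes whatever start locked (here nothing); /proc/consoles uses exactly this shape with console_lock()/console_unlock() in start/stop.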
write_lock(&kclist_lock); + list_add_tail(&new->list, &kclist_head); + write_unlock(&kclist_lock); +} + +static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) +{ + size_t try, size; + struct kcore_list *m; + + *nphdr = 1; /* PT_NOTE */ + size = 0; + + list_for_each_entry(m, &kclist_head, list) { + try = kc_vaddr_to_offset((size_t)m->addr + m->size); + if (try > size) + size = try; + *nphdr = *nphdr + 1; + } + *elf_buflen = sizeof(struct elfhdr) + + (*nphdr + 2)*sizeof(struct elf_phdr) + + 3 * ((sizeof(struct elf_note)) + + roundup(sizeof(CORE_STR), 4)) + + roundup(sizeof(struct elf_prstatus), 4) + + roundup(sizeof(struct elf_prpsinfo), 4) + + roundup(sizeof(struct task_struct), 4); + *elf_buflen = PAGE_ALIGN(*elf_buflen); + return size + *elf_buflen; +} + +static void free_kclist_ents(struct list_head *head) +{ + struct kcore_list *tmp, *pos; + + list_for_each_entry_safe(pos, tmp, head, list) { + list_del(&pos->list); + kfree(pos); + } +} +/* + * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list. + */ +static void __kcore_update_ram(struct list_head *list) +{ + int nphdr; + size_t size; + struct kcore_list *tmp, *pos; + LIST_HEAD(garbage); + + write_lock(&kclist_lock); + if (kcore_need_update) { + list_for_each_entry_safe(pos, tmp, &kclist_head, list) { + if (pos->type == KCORE_RAM + || pos->type == KCORE_VMEMMAP) + list_move(&pos->list, &garbage); + } + list_splice_tail(list, &kclist_head); + } else + list_splice(list, &garbage); + kcore_need_update = 0; + proc_root_kcore->size = get_kcore_size(&nphdr, &size); + write_unlock(&kclist_lock); + + free_kclist_ents(&garbage); +} + + +#ifdef CONFIG_HIGHMEM +/* + * If no highmem, we can assume [0...max_low_pfn) continuous range of memory + * because memory hole is not as big as !HIGHMEM case. + * (HIGHMEM is special because part of memory is _invisible_ from the kernel.) 
+ */ +static int kcore_update_ram(void) +{ + LIST_HEAD(head); + struct kcore_list *ent; + int ret = 0; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va(0); + ent->size = max_low_pfn << PAGE_SHIFT; + ent->type = KCORE_RAM; + list_add(&ent->list, &head); + __kcore_update_ram(&head); + return ret; +} + +#else /* !CONFIG_HIGHMEM */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* calculate vmemmap's address from given system ram pfn and register it */ +int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; + unsigned long nr_pages = ent->size >> PAGE_SHIFT; + unsigned long start, end; + struct kcore_list *vmm, *tmp; + + + start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; + end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; + end = ALIGN(end, PAGE_SIZE); + /* overlap check (because we have to align page */ + list_for_each_entry(tmp, head, list) { + if (tmp->type != KCORE_VMEMMAP) + continue; + if (start < tmp->addr + tmp->size) + if (end > tmp->addr) + end = tmp->addr; + } + if (start < end) { + vmm = kmalloc(sizeof(*vmm), GFP_KERNEL); + if (!vmm) + return 0; + vmm->addr = start; + vmm->size = end - start; + vmm->type = KCORE_VMEMMAP; + list_add_tail(&vmm->list, head); + } + return 1; + +} +#else +int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + return 1; +} + +#endif + +static int +kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) +{ + struct list_head *head = (struct list_head *)arg; + struct kcore_list *ent; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT)); + ent->size = nr_pages << PAGE_SHIFT; + + /* Sanity check: Can happen in 32bit arch...maybe */ + if (ent->addr < (unsigned long) __va(0)) + goto free_out; + + /* cut not-mapped area. ....from ppc-32 code. 
 */
+	if (ULONG_MAX - ent->addr < ent->size)
+		ent->size = ULONG_MAX - ent->addr;
+
+	/* cut when vmalloc() area is higher than direct-map area */
+	if (VMALLOC_START > (unsigned long)__va(0)) {
+		if (ent->addr > VMALLOC_START)
+			goto free_out;
+		if (VMALLOC_START - ent->addr < ent->size)
+			ent->size = VMALLOC_START - ent->addr;
+	}
+
+	ent->type = KCORE_RAM;
+	list_add_tail(&ent->list, head);
+
+	if (!get_sparsemem_vmemmap_info(ent, head)) {
+		list_del(&ent->list);
+		goto free_out;
+	}
+
+	return 0;
+free_out:
+	kfree(ent);
+	return 1;
+}
+
+static int kcore_update_ram(void)
+{
+	int nid, ret;
+	unsigned long end_pfn;
+	LIST_HEAD(head);
+
+	/* Not initialized....update now */
+	/* find out "max pfn" */
+	end_pfn = 0;
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		unsigned long node_end;
+		node_end = NODE_DATA(nid)->node_start_pfn +
+			NODE_DATA(nid)->node_spanned_pages;
+		if (end_pfn < node_end)
+			end_pfn = node_end;
+	}
+	/* scan 0 to max_pfn */
+	ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
+	if (ret) {
+		free_kclist_ents(&head);
+		return -ENOMEM;
+	}
+	__kcore_update_ram(&head);
+	return ret;
+}
+#endif /* CONFIG_HIGHMEM */
+
+/*****************************************************************************/
+/*
+ * determine size of ELF note
+ */
+static int notesize(struct memelfnote *en)
+{
+	int sz;
+
+	sz = sizeof(struct elf_note);
+	sz += roundup((strlen(en->name) + 1), 4);
+	sz += roundup(en->datasz, 4);
+
+	return sz;
+} /* end notesize() */
+
+/*****************************************************************************/
+/*
+ * store a note in the header buffer
+ */
+static char *storenote(struct memelfnote *men, char *bufp)
+{
+	struct elf_note en;
+
+#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
+
+	en.n_namesz = strlen(men->name) + 1;
+	en.n_descsz = men->datasz;
+	en.n_type = men->type;
+
+	DUMP_WRITE(&en, sizeof(en));
+	DUMP_WRITE(men->name, en.n_namesz);
+
+	/* XXX - cast from long long to long to avoid need for libgcc.a */
+	bufp = (char*) roundup((unsigned long)bufp,4);
+	DUMP_WRITE(men->data, men->datasz);
+	bufp = (char*) roundup((unsigned long)bufp,4);
+
+#undef DUMP_WRITE
+
+	return bufp;
+} /* end storenote() */
+
+/*
+ * store an ELF coredump header in the supplied buffer
+ * nphdr is the number of elf_phdr to insert
+ */
+static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
+{
+	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
+	struct elf_prpsinfo prpsinfo;	/* NT_PRPSINFO */
+	struct elf_phdr *nhdr, *phdr;
+	struct elfhdr *elf;
+	struct memelfnote notes[3];
+	off_t offset = 0;
+	struct kcore_list *m;
+
+	/* setup ELF header */
+	elf = (struct elfhdr *) bufp;
+	bufp += sizeof(struct elfhdr);
+	offset += sizeof(struct elfhdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS]	= ELF_CLASS;
+	elf->e_ident[EI_DATA]	= ELF_DATA;
+	elf->e_ident[EI_VERSION]= EV_CURRENT;
+	elf->e_ident[EI_OSABI]	= ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type	= ET_CORE;
+	elf->e_machine	= ELF_ARCH;
+	elf->e_version	= EV_CURRENT;
+	elf->e_entry	= 0;
+	elf->e_phoff	= sizeof(struct elfhdr);
+	elf->e_shoff	= 0;
+	elf->e_flags	= ELF_CORE_EFLAGS;
+	elf->e_ehsize	= sizeof(struct elfhdr);
+	elf->e_phentsize= sizeof(struct elf_phdr);
+	elf->e_phnum	= nphdr;
+	elf->e_shentsize= 0;
+	elf->e_shnum	= 0;
+	elf->e_shstrndx	= 0;
+
+	/* setup ELF PT_NOTE program header */
+	nhdr = (struct elf_phdr *) bufp;
+	bufp += sizeof(struct elf_phdr);
+	offset += sizeof(struct elf_phdr);
+	nhdr->p_type	= PT_NOTE;
+	nhdr->p_offset	= 0;
+	nhdr->p_vaddr	= 0;
+	nhdr->p_paddr	= 0;
+	nhdr->p_filesz	= 0;
+	nhdr->p_memsz	= 0;
+	nhdr->p_flags	= 0;
+	nhdr->p_align	= 0;
+
+	/* setup ELF PT_LOAD program header for every area */
+	list_for_each_entry(m, &kclist_head, list) {
+		phdr = (struct elf_phdr *) bufp;
+		bufp += sizeof(struct elf_phdr);
+		offset += sizeof(struct elf_phdr);
+
+		phdr->p_type	= PT_LOAD;
+		phdr->p_flags	= PF_R|PF_W|PF_X;
+		phdr->p_offset	= kc_vaddr_to_offset(m->addr) + dataoff;
+		phdr->p_vaddr	= (size_t)m->addr;
+		phdr->p_paddr	= 0;
+		phdr->p_filesz	= phdr->p_memsz	= m->size;
+		phdr->p_align	= PAGE_SIZE;
+	}
+
+	/*
+	 * Set up the notes in similar form to SVR4 core dumps made
+	 * with info from their /proc.
+	 */
+	nhdr->p_offset	= offset;
+
+	/* set up the process status */
+	notes[0].name = CORE_STR;
+	notes[0].type = NT_PRSTATUS;
+	notes[0].datasz = sizeof(struct elf_prstatus);
+	notes[0].data = &prstatus;
+
+	memset(&prstatus, 0, sizeof(struct elf_prstatus));
+
+	nhdr->p_filesz	= notesize(&notes[0]);
+	bufp = storenote(&notes[0], bufp);
+
+	/* set up the process info */
+	notes[1].name	= CORE_STR;
+	notes[1].type	= NT_PRPSINFO;
+	notes[1].datasz	= sizeof(struct elf_prpsinfo);
+	notes[1].data	= &prpsinfo;
+
+	memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo));
+	prpsinfo.pr_state	= 0;
+	prpsinfo.pr_sname	= 'R';
+	prpsinfo.pr_zomb	= 0;
+
+	strcpy(prpsinfo.pr_fname, "vmlinux");
+	strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ);
+
+	nhdr->p_filesz	+= notesize(&notes[1]);
+	bufp = storenote(&notes[1], bufp);
+
+	/* set up the task structure */
+	notes[2].name	= CORE_STR;
+	notes[2].type	= NT_TASKSTRUCT;
+	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].data	= current;
+
+	nhdr->p_filesz	+= notesize(&notes[2]);
+	bufp = storenote(&notes[2], bufp);
+
+} /* end elf_kcore_store_hdr() */
+
+/*****************************************************************************/
+/*
+ * read from the ELF header and then kernel memory
+ */
+static ssize_t
+read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+{
+	ssize_t acc = 0;
+	size_t size, tsz;
+	size_t elf_buflen;
+	int nphdr;
+	unsigned long start;
+
+	read_lock(&kclist_lock);
+	size = get_kcore_size(&nphdr, &elf_buflen);
+
+	if (buflen == 0 || *fpos >= size) {
+		read_unlock(&kclist_lock);
+		return 0;
+	}
+
+	/* trim buflen to not go beyond EOF */
+	if (buflen > size - *fpos)
+		buflen = size - *fpos;
+
+	/* construct an ELF core header if we'll need some of it */
+	if (*fpos < elf_buflen) {
+		char * elf_buf;
+
+		tsz = elf_buflen - *fpos;
+		if (buflen < tsz)
+			tsz = buflen;
+		elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
+		if (!elf_buf) {
+			read_unlock(&kclist_lock);
+			return -ENOMEM;
+		}
+		elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
+		read_unlock(&kclist_lock);
+		if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
+			kfree(elf_buf);
+			return -EFAULT;
+		}
+		kfree(elf_buf);
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+
+		/* leave now if filled buffer already */
+		if (buflen == 0)
+			return acc;
+	} else
+		read_unlock(&kclist_lock);
+
+	/*
+	 * Check to see if our file offset matches with any of
+	 * the addresses in the elf_phdr on our list.
+	 */
+	start = kc_offset_to_vaddr(*fpos - elf_buflen);
+	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
+		tsz = buflen;
+
+	while (buflen) {
+		struct kcore_list *m;
+
+		read_lock(&kclist_lock);
+		list_for_each_entry(m, &kclist_head, list) {
+			if (start >= m->addr && start < (m->addr+m->size))
+				break;
+		}
+		read_unlock(&kclist_lock);
+
+		if (&m->list == &kclist_head) {
+			if (clear_user(buffer, tsz))
+				return -EFAULT;
+		} else if (is_vmalloc_or_module_addr((void *)start)) {
+			char * elf_buf;
+
+			elf_buf = kzalloc(tsz, GFP_KERNEL);
+			if (!elf_buf)
+				return -ENOMEM;
+			vread(elf_buf, (char *)start, tsz);
+			/* we have to zero-fill user buffer even if no read */
+			if (copy_to_user(buffer, elf_buf, tsz)) {
+				kfree(elf_buf);
+				return -EFAULT;
+			}
+			kfree(elf_buf);
+		} else {
+			if (kern_addr_valid(start)) {
+				unsigned long n;
+
+				n = copy_to_user(buffer, (char *)start, tsz);
+				/*
+				 * We cannot distinguish between fault on source
+				 * and fault on destination. When this happens
+				 * we clear too and hope it will trigger the
+				 * EFAULT again.
+				 */
+				if (n) {
+					if (clear_user(buffer + tsz - n,
+								n))
+						return -EFAULT;
+				}
+			} else {
+				if (clear_user(buffer, tsz))
+					return -EFAULT;
+			}
+		}
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+		start += tsz;
+		tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
+	}
+
+	return acc;
+}
+
+
+static int open_kcore(struct inode *inode, struct file *filp)
+{
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+	if (kcore_need_update)
+		kcore_update_ram();
+	if (i_size_read(inode) != proc_root_kcore->size) {
+		mutex_lock(&inode->i_mutex);
+		i_size_write(inode, proc_root_kcore->size);
+		mutex_unlock(&inode->i_mutex);
+	}
+	return 0;
+}
+
+
+static const struct file_operations proc_kcore_operations = {
+	.read		= read_kcore,
+	.open		= open_kcore,
+	.llseek		= default_llseek,
+};
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* just remember that we have to update kcore */
+static int __meminit kcore_callback(struct notifier_block *self,
+				    unsigned long action, void *arg)
+{
+	switch (action) {
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+		write_lock(&kclist_lock);
+		kcore_need_update = 1;
+		write_unlock(&kclist_lock);
+	}
+	return NOTIFY_OK;
+}
+#endif
+
+
+static struct kcore_list kcore_vmalloc;
+
+#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
+static struct kcore_list kcore_text;
+/*
+ * If defined, special segment is used for mapping kernel text instead of
+ * direct-map area. We need to create special TEXT section.
+ */
+static void __init proc_kcore_text_init(void)
+{
+	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
+}
+#else
+static void __init proc_kcore_text_init(void)
+{
+}
+#endif
+
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+/*
+ * MODULES_VADDR has no intersection with VMALLOC_ADDR.
+ */
+struct kcore_list kcore_modules;
+static void __init add_modules_range(void)
+{
+	kclist_add(&kcore_modules, (void *)MODULES_VADDR,
+			MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
+}
+#else
+static void __init add_modules_range(void)
+{
+}
+#endif
+
+static int __init proc_kcore_init(void)
+{
+	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
+				      &proc_kcore_operations);
+	if (!proc_root_kcore) {
+		printk(KERN_ERR "couldn't create /proc/kcore\n");
+		return 0;	/* Always returns 0. */
+	}
+	/* Store text area if it's special */
+	proc_kcore_text_init();
+	/* Store vmalloc area */
+	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+		VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
+	add_modules_range();
+	/* Store direct-map area from physical memory map */
+	kcore_update_ram();
+	hotplug_memory_notifier(kcore_callback, 0);
+
+	return 0;
+}
+module_init(proc_kcore_init);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
new file mode 100644
index 00000000..bd4b5a74
--- /dev/null
+++ b/fs/proc/kmsg.c
@@ -0,0 +1,64 @@
+/*
+ *  linux/fs/proc/kmsg.c
+ *
+ *  Copyright (C) 1992  by Linus Torvalds
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+extern wait_queue_head_t log_wait;
+
+static int kmsg_open(struct inode * inode, struct file * file)
+{
+	return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
+}
+
+static int kmsg_release(struct inode * inode, struct file * file)
+{
+	(void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
+	return 0;
+}
+
+static ssize_t kmsg_read(struct file *file, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	if ((file->f_flags & O_NONBLOCK) &&
+	    !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+		return -EAGAIN;
+	return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
+}
+
+static unsigned int kmsg_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &log_wait, wait);
+	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+
+static const struct file_operations proc_kmsg_operations = {
+	.read		= kmsg_read,
+	.poll		= kmsg_poll,
+	.open		= kmsg_open,
+	.release	= kmsg_release,
+	.llseek		= generic_file_llseek,
+};
+
+static int __init proc_kmsg_init(void)
+{
+	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+	return 0;
+}
+module_init(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
new file mode 100644
index 00000000..1afa4dd4
--- /dev/null
+++ b/fs/proc/loadavg.c
@@ -0,0 +1,45 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static int loadavg_proc_show(struct seq_file *m, void *v)
+{
+	unsigned long avnrun[3];
+
+	get_avenrun(avnrun, FIXED_1/200, 0);
+
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
+		nr_running(), nr_threads,
+		task_active_pid_ns(current)->last_pid);
+	return 0;
+}
+
+static int loadavg_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, loadavg_proc_show, NULL);
+}
+
+static const struct file_operations loadavg_proc_fops = {
+	.open		= loadavg_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_loadavg_init(void)
+{
+	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+	return 0;
+}
+module_init(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
new file mode 100644
index 00000000..a9628278
--- /dev/null
+++ b/fs/proc/meminfo.c
@@ -0,0 +1,194 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "internal.h"
+
+void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
+{
+}
+
+static int meminfo_proc_show(struct
seq_file *m, void *v) +{ + struct sysinfo i; + unsigned long committed; + unsigned long allowed; + struct vmalloc_info vmi; + long cached; + unsigned long pages[NR_LRU_LISTS]; + int lru; + +/* + * display in kilobytes. + */ +#define K(x) ((x) << (PAGE_SHIFT - 10)) + si_meminfo(&i); + si_swapinfo(&i); + committed = percpu_counter_read_positive(&vm_committed_as); + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100) + total_swap_pages; + + cached = global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - i.bufferram; + if (cached < 0) + cached = 0; + + get_vmalloc_info(&vmi); + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + /* + * Tagged format, for easy grepping and expansion. + */ + seq_printf(m, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" + "Unevictable: %8lu kB\n" + "Mlocked: %8lu kB\n" +#ifdef CONFIG_HIGHMEM + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" +#endif +#ifndef CONFIG_MMU + "MmapCopy: %8lu kB\n" +#endif + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Shmem: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "KernelStack: %8lu kB\n" + "PageTables: %8lu kB\n" +#ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" +#endif + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n" +#ifdef CONFIG_MEMORY_FAILURE + "HardwareCorrupted: %5lu kB\n" +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" +#endif + , + K(i.totalram), + K(i.freeram), + K(i.bufferram), + K(cached), + K(total_swapcache_pages), + K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), + K(pages[LRU_ACTIVE_ANON]), + K(pages[LRU_INACTIVE_ANON]), + K(pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_FILE]), + K(pages[LRU_UNEVICTABLE]), + K(global_page_state(NR_MLOCK)), +#ifdef CONFIG_HIGHMEM + K(i.totalhigh), + K(i.freehigh), + K(i.totalram-i.totalhigh), + K(i.freeram-i.freehigh), +#endif +#ifndef CONFIG_MMU + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), +#endif + K(i.totalswap), + K(i.freeswap), + K(global_page_state(NR_FILE_DIRTY)), + K(global_page_state(NR_WRITEBACK)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(global_page_state(NR_ANON_PAGES) + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR), +#else + K(global_page_state(NR_ANON_PAGES)), +#endif + K(global_page_state(NR_FILE_MAPPED)), + K(global_page_state(NR_SHMEM)), + K(global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE)), + K(global_page_state(NR_SLAB_RECLAIMABLE)), + K(global_page_state(NR_SLAB_UNRECLAIMABLE)), + global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, + K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), +#endif + K(global_page_state(NR_UNSTABLE_NFS)), + K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), + K(allowed), + K(committed), + (unsigned long)VMALLOC_TOTAL >> 10, + vmi.used >> 
10, + vmi.largest_chunk >> 10 +#ifdef CONFIG_MEMORY_FAILURE + ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif + ); + + hugetlb_report_meminfo(m); + + arch_report_meminfo(m); + + return 0; +#undef K +} + +static int meminfo_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, meminfo_proc_show, NULL); +} + +static const struct file_operations meminfo_proc_fops = { + .open = meminfo_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_meminfo_init(void) +{ + proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + return 0; +} +module_init(proc_meminfo_init); diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c new file mode 100644 index 00000000..8ae221df --- /dev/null +++ b/fs/proc/mmu.c @@ -0,0 +1,60 @@ +/* mmu.c: mmu memory info files + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include "internal.h" + +void get_vmalloc_info(struct vmalloc_info *vmi) +{ + struct vm_struct *vma; + unsigned long free_area_size; + unsigned long prev_end; + + vmi->used = 0; + + if (!vmlist) { + vmi->largest_chunk = VMALLOC_TOTAL; + } + else { + vmi->largest_chunk = 0; + + prev_end = VMALLOC_START; + + read_lock(&vmlist_lock); + + for (vma = vmlist; vma; vma = vma->next) { + unsigned long addr = (unsigned long) vma->addr; + + /* + * Some archs keep another range for modules in vmlist + */ + if (addr < VMALLOC_START) + continue; + if (addr >= VMALLOC_END) + break; + + vmi->used += vma->size; + + free_area_size = addr - prev_end; + if (vmi->largest_chunk < free_area_size) + vmi->largest_chunk = free_area_size; + + prev_end = vma->size + addr; + } + + if (VMALLOC_END - prev_end > vmi->largest_chunk) + vmi->largest_chunk = VMALLOC_END - prev_end; + + read_unlock(&vmlist_lock); + } +} diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c new file mode 100644 index 00000000..d6c078ea --- /dev/null +++ b/fs/proc/namespaces.c @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + + +static const struct proc_ns_operations *ns_entries[] = { +#ifdef CONFIG_NET_NS + &netns_operations, +#endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif +#ifdef CONFIG_IPC_NS + &ipcns_operations, +#endif +}; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static struct dentry *proc_ns_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct proc_ns_operations *ns_ops = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + void *ns; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ns = ns_ops->get(task); + if (!ns) + goto out_iput; + + ei = PROC_I(inode); + inode->i_mode = S_IFREG|S_IRUSR; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + + d_set_d_op(dentry, &pid_dentry_operations); + d_add(dentry, inode); + /* Close the race of the 
process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static int proc_ns_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, + const struct proc_ns_operations *ops) +{ + return proc_fill_cache(filp, dirent, filldir, + ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops); +} + +static int proc_ns_dir_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + int i; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + const struct proc_ns_operations **entry, **last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = -EPERM; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + ret = 0; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= ARRAY_SIZE(ns_entries)) { + ret = 1; + goto out; + } + entry = ns_entries + i; + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + if (proc_ns_fill_cache(filp, dirent, filldir, + task, *entry) < 0) + goto out; + filp->f_pos++; + entry++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} + +const struct file_operations proc_ns_dir_operations = { + .read = generic_read_dir, + .readdir = proc_ns_dir_readdir, +}; + +static struct dentry *proc_ns_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct proc_ns_operations **entry, **last; + unsigned int len = dentry->d_name.len; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + error = ERR_PTR(-EPERM); + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + for (entry = ns_entries; entry <= last; entry++) { + if (strlen((*entry)->name) != len) + continue; + if (!memcmp(dentry->d_name.name, (*entry)->name, len)) + break; + } + error = ERR_PTR(-ENOENT); + if (entry > last) + goto out; + + error = proc_ns_instantiate(dir, dentry, task, *entry); +out: + put_task_struct(task); +out_no_task: + return error; +} + +const struct inode_operations proc_ns_dir_inode_operations = { + .lookup = proc_ns_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c new file mode 100644 index 00000000..b1822dde --- /dev/null +++ b/fs/proc/nommu.c @@ -0,0 +1,136 @@ +/* nommu.c: mmu-less memory info files + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * display a single region to a sequenced file + */ +static int nommu_region_show(struct seq_file *m, struct vm_region *region) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + + flags = region->vm_flags; + file = region->vm_file; + + if (file) { + struct inode *inode = region->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + region->vm_start, + region->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + len = 25 + sizeof(void *) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); + seq_path(m, &file->f_path, ""); + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display a list of all the REGIONs the kernel knows about + * - nommu kernels have a single flat list + */ +static int nommu_region_list_show(struct seq_file *m, void *_p) +{ + struct rb_node *p = _p; + + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); +} + +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) +{ + struct rb_node *p; + loff_t pos = *_pos; + + down_read(&nommu_region_sem); + + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; +} + +static void nommu_region_list_stop(struct seq_file *m, void *v) +{ + up_read(&nommu_region_sem); +} + +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return rb_next((struct rb_node *) v); +} + +static const struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show +}; + +static int proc_nommu_region_list_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &proc_nommu_region_list_seqop); +} + +static const struct file_operations proc_nommu_region_list_operations = { + .open = proc_nommu_region_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_nommu_init(void) +{ + proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); + return 0; +} + +module_init(proc_nommu_init); diff --git a/fs/proc/page.c b/fs/proc/page.c new file mode 100644 index 00000000..6d8e6a9e --- /dev/null +++ b/fs/proc/page.c @@ -0,0 +1,212 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define KPMSIZE sizeof(u64) +#define KPMMASK (KPMSIZE - 1) + +/* /proc/kpagecount - an array exposing page counts + * + * Each entry is a u64 representing the corresponding + * physical page count. 
+ */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 pcount; + + pfn = src / KPMSIZE; + count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + if (!ppage || PageSlab(ppage)) + pcount = 0; + else + pcount = page_mapcount(ppage); + + if (put_user(pcount, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecount_operations = { + .llseek = mem_lseek, + .read = kpagecount_read, +}; + +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ + +static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) +{ + return ((kflags >> kbit) & 1) << ubit; +} + +u64 stable_page_flags(struct page *page) +{ + u64 k; + u64 u; + + /* + * pseudo flag: KPF_NOPAGE + * it differentiates a memory hole from a page with no flags + */ + if (!page) + return 1 << KPF_NOPAGE; + + k = page->flags; + u = 0; + + /* + * pseudo flags for the well known (anonymous) memory mapped pages + * + * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the + * simple test in page_mapped() is not enough. + */ + if (!PageSlab(page) && page_mapped(page)) + u |= 1 << KPF_MMAP; + if (PageAnon(page)) + u |= 1 << KPF_ANON; + if (PageKsm(page)) + u |= 1 << KPF_KSM; + + /* + * compound pages: export both head/tail info + * they together define a compound page's start/end pos and order + */ + if (PageHead(page)) + u |= 1 << KPF_COMPOUND_HEAD; + if (PageTail(page)) + u |= 1 << KPF_COMPOUND_TAIL; + if (PageHuge(page)) + u |= 1 << KPF_HUGE; + + /* + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. 
+ */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + + u |= kpf_copy_bit(k, KPF_ERROR, PG_error); + u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); + u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); + u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); + + u |= kpf_copy_bit(k, KPF_LRU, PG_lru); + u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced); + u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); + u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); + + u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); + u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); + + u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); + u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); + +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); +#endif + + u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); + u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); + u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); + u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); + u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); + + return u; +}; + +static ssize_t kpageflags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (put_user(stable_page_flags(ppage), out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpageflags_operations = { + .llseek = mem_lseek, + .read = kpageflags_read, +}; + +static int __init proc_page_init(void) +{ + proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); + proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + return 0; +} +module_init(proc_page_init); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c new file mode 100644 index 00000000..927cbd11 --- /dev/null +++ b/fs/proc/proc_devtree.c @@ -0,0 +1,241 @@ +/* + * proc_devtree.c - handles /proc/device-tree + * + * Copyright 1997 Paul Mackerras + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static inline void set_node_proc_entry(struct device_node *np, + struct proc_dir_entry *de) +{ +#ifdef HAVE_ARCH_DEVTREE_FIXUPS + np->pde = de; +#endif +} + +static struct proc_dir_entry *proc_device_tree; + +/* + * Supply data on a read from /proc/device-tree/node/property. 
+ */ +static int property_proc_show(struct seq_file *m, void *v) +{ + struct property *pp = m->private; + + seq_write(m, pp->value, pp->length); + return 0; +} + +static int property_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, property_proc_show, PDE(inode)->data); +} + +static const struct file_operations property_proc_fops = { + .owner = THIS_MODULE, + .open = property_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * For a node with a name like "gc@10", we make symlinks called "gc" + * and "@10" to it. + */ + +/* + * Add a property to a node + */ +static struct proc_dir_entry * +__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, + const char *name) +{ + struct proc_dir_entry *ent; + + /* + * Unfortunately proc_register puts each new entry + * at the beginning of the list. So we rearrange them. + */ + ent = proc_create_data(name, + strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR, + de, &property_proc_fops, pp); + if (ent == NULL) + return NULL; + + if (!strncmp(name, "security-", 9)) + ent->size = 0; /* don't leak number of password chars */ + else + ent->size = pp->length; + + return ent; +} + + +void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop) +{ + __proc_device_tree_add_prop(pde, prop, prop->name); +} + +void proc_device_tree_remove_prop(struct proc_dir_entry *pde, + struct property *prop) +{ + remove_proc_entry(prop->name, pde); +} + +void proc_device_tree_update_prop(struct proc_dir_entry *pde, + struct property *newprop, + struct property *oldprop) +{ + struct proc_dir_entry *ent; + + for (ent = pde->subdir; ent != NULL; ent = ent->next) + if (ent->data == oldprop) + break; + if (ent == NULL) { + printk(KERN_WARNING "device-tree: property \"%s\" " + " does not exist\n", oldprop->name); + } else { + ent->data = newprop; + ent->size = newprop->length; + } +} + +/* + * Various dodgy firmware might give us nodes and/or properties with + * conflicting names. That's generally ok, except for exporting via /proc, + * so munge names here to ensure they're unique. + */ + +static int duplicate_name(struct proc_dir_entry *de, const char *name) +{ + struct proc_dir_entry *ent; + int found = 0; + + spin_lock(&proc_subdir_lock); + + for (ent = de->subdir; ent != NULL; ent = ent->next) { + if (strcmp(ent->name, name) == 0) { + found = 1; + break; + } + } + + spin_unlock(&proc_subdir_lock); + + return found; +} + +static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de, + const char *name) +{ + char *fixed_name; + int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */ + int i = 1, size; + +realloc: + fixed_name = kmalloc(fixup_len, GFP_KERNEL); + if (fixed_name == NULL) { + printk(KERN_ERR "device-tree: Out of memory trying to fixup " + "name \"%s\"\n", name); + return name; + } + +retry: + size = snprintf(fixed_name, fixup_len, "%s#%d", name, i); + size++; /* account for NULL */ + + if (size > fixup_len) { + /* We ran out of space, free and reallocate. */ + kfree(fixed_name); + fixup_len = size; + goto realloc; + } + + if (duplicate_name(de, fixed_name)) { + /* Multiple duplicates. Retry with a different offset. */ + i++; + goto retry; + } + + printk(KERN_WARNING "device-tree: Duplicate name in %s, " + "renamed to \"%s\"\n", np->full_name, fixed_name); + + return fixed_name; +} + +/* + * Process a node, adding entries for its children and its properties. 
+ */ +void proc_device_tree_add_node(struct device_node *np, + struct proc_dir_entry *de) +{ + struct property *pp; + struct proc_dir_entry *ent; + struct device_node *child; + const char *p; + + set_node_proc_entry(np, de); + for (child = NULL; (child = of_get_next_child(np, child));) { + /* Use everything after the last slash, or the full name */ + p = strrchr(child->full_name, '/'); + if (!p) + p = child->full_name; + else + ++p; + + if (duplicate_name(de, p)) + p = fixup_name(np, de, p); + + ent = proc_mkdir(p, de); + if (ent == NULL) + break; + proc_device_tree_add_node(child, ent); + } + of_node_put(child); + + for (pp = np->properties; pp != NULL; pp = pp->next) { + p = pp->name; + + if (strchr(p, '/')) + continue; + + if (duplicate_name(de, p)) + p = fixup_name(np, de, p); + + ent = __proc_device_tree_add_prop(de, pp, p); + if (ent == NULL) + break; + } +} + +/* + * Called on initialization to set up the /proc/device-tree subtree + */ +void __init proc_device_tree_init(void) +{ + struct device_node *root; + + proc_device_tree = proc_mkdir("device-tree", NULL); + if (proc_device_tree == NULL) + return; + root = of_find_node_by_path("/"); + if (root == NULL) { + pr_debug("/proc/device-tree: can't find root\n"); + return; + } + proc_device_tree_add_node(root, proc_device_tree); + of_node_put(root); +} diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c new file mode 100644 index 00000000..9020ac15 --- /dev/null +++ b/fs/proc/proc_net.c @@ -0,0 +1,241 @@ +/* + * linux/fs/proc/net.c + * + * Copyright (C) 2007 + * + * Author: Eric Biederman + * + * proc net directory handling functions + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + + +static struct net *get_proc_net(const struct inode *inode) +{ + return maybe_get_net(PDE_NET(PDE(inode))); +} + +int seq_open_net(struct inode *ino, struct file *f, + const struct seq_operations *ops, int size) +{ + struct net *net; + struct seq_net_private *p; + + BUG_ON(size < sizeof(*p)); + + net = get_proc_net(ino); + if (net == NULL) + return -ENXIO; + + p = __seq_open_private(f, ops, size); + if (p == NULL) { + put_net(net); + return -ENOMEM; + } +#ifdef CONFIG_NET_NS + p->net = net; +#endif + return 0; +} +EXPORT_SYMBOL_GPL(seq_open_net); + +int single_open_net(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + int err; + struct net *net; + + err = -ENXIO; + net = get_proc_net(inode); + if (net == NULL) + goto err_net; + + err = single_open(file, show, net); + if (err < 0) + goto err_open; + + return 0; + +err_open: + put_net(net); +err_net: + return err; +} +EXPORT_SYMBOL_GPL(single_open_net); + +int seq_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq; + + seq = f->private_data; + + put_net(seq_file_net(seq)); + seq_release_private(ino, f); + return 0; +} +EXPORT_SYMBOL_GPL(seq_release_net); + +int single_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + put_net(seq->private); + return single_release(ino, f); +} +EXPORT_SYMBOL_GPL(single_release_net); + +static struct net *get_proc_task_net(struct inode *dir) +{ + struct task_struct *task; + struct nsproxy *ns; + struct net *net = NULL; + + rcu_read_lock(); + task = pid_task(proc_pid(dir), PIDTYPE_PID); + if (task != NULL) { + ns = task_nsproxy(task); + if (ns != NULL) + net = get_net(ns->net_ns); + } + rcu_read_unlock(); + + return net; +} + +static struct dentry 
*proc_tgid_net_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *de; + struct net *net; + + de = ERR_PTR(-ENOENT); + net = get_proc_task_net(dir); + if (net != NULL) { + de = proc_lookup_de(net->proc_net, dir, dentry); + put_net(net); + } + return de; +} + +static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct net *net; + + net = get_proc_task_net(inode); + + generic_fillattr(inode, stat); + + if (net != NULL) { + stat->nlink = net->proc_net->nlink; + put_net(net); + } + + return 0; +} + +const struct inode_operations proc_net_inode_operations = { + .lookup = proc_tgid_net_lookup, + .getattr = proc_tgid_net_getattr, +}; + +static int proc_tgid_net_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + int ret; + struct net *net; + + ret = -EINVAL; + net = get_proc_task_net(filp->f_path.dentry->d_inode); + if (net != NULL) { + ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); + put_net(net); + } + return ret; +} + +const struct file_operations proc_net_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = proc_tgid_net_readdir, +}; + + +struct proc_dir_entry *proc_net_fops_create(struct net *net, + const char *name, mode_t mode, const struct file_operations *fops) +{ + return proc_create(name, mode, net->proc_net, fops); +} +EXPORT_SYMBOL_GPL(proc_net_fops_create); + +void proc_net_remove(struct net *net, const char *name) +{ + remove_proc_entry(name, net->proc_net); +} +EXPORT_SYMBOL_GPL(proc_net_remove); + +static __net_init int proc_net_ns_init(struct net *net) +{ + struct proc_dir_entry *netd, *net_statd; + int err; + + err = -ENOMEM; + netd = kzalloc(sizeof(*netd), GFP_KERNEL); + if (!netd) + goto out; + + netd->data = net; + netd->nlink = 2; + netd->name = "net"; + netd->namelen = 3; + netd->parent = &proc_root; + + err = -EEXIST; + net_statd = proc_net_mkdir(net, "stat", netd); + if (!net_statd) + goto free_net; + + net->proc_net = netd; + net->proc_net_stat = net_statd; + return 0; + +free_net: + kfree(netd); +out: + return err; +} + +static __net_exit void proc_net_ns_exit(struct net *net) +{ + remove_proc_entry("stat", net->proc_net); + kfree(net->proc_net); +} + +static struct pernet_operations __net_initdata proc_net_ns_ops = { + .init = proc_net_ns_init, + .exit = proc_net_ns_exit, +}; + +int __init proc_net_init(void) +{ + proc_symlink("net", NULL, "self/net"); + + return register_pernet_subsys(&proc_net_ns_ops); +} diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c new file mode 100644 index 00000000..d167de36 --- /dev/null +++ b/fs/proc/proc_sysctl.c @@ -0,0 +1,436 @@ +/* + * /proc/sys support + */ +#include +#include +#include +#include +#include +#include "internal.h" + +static const struct dentry_operations proc_sys_dentry_operations; +static const struct file_operations proc_sys_file_operations; +static const struct inode_operations proc_sys_inode_operations; +static const struct file_operations proc_sys_dir_file_operations; +static const struct inode_operations proc_sys_dir_operations; + +static struct inode *proc_sys_make_inode(struct super_block *sb, + struct ctl_table_header *head, struct ctl_table *table) +{ + struct inode *inode; + struct proc_inode *ei; + + inode = new_inode(sb); + if (!inode) + goto out; + + inode->i_ino = get_next_ino(); + + sysctl_head_get(head); + ei = PROC_I(inode); + ei->sysctl = head; + ei->sysctl_entry = table; + + inode->i_mtime = 
inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = table->mode; + if (!table->child) { + inode->i_mode |= S_IFREG; + inode->i_op = &proc_sys_inode_operations; + inode->i_fop = &proc_sys_file_operations; + } else { + inode->i_mode |= S_IFDIR; + inode->i_nlink = 0; + inode->i_op = &proc_sys_dir_operations; + inode->i_fop = &proc_sys_dir_file_operations; + } +out: + return inode; +} + +static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) +{ + int len; + for ( ; p->procname; p++) { + + if (!p->procname) + continue; + + len = strlen(p->procname); + if (len != name->len) + continue; + + if (memcmp(p->procname, name->name, len) != 0) + continue; + + /* I have a match */ + return p; + } + return NULL; +} + +static struct ctl_table_header *grab_header(struct inode *inode) +{ + if (PROC_I(inode)->sysctl) + return sysctl_head_grab(PROC_I(inode)->sysctl); + else + return sysctl_head_next(NULL); +} + +static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ctl_table_header *head = grab_header(dir); + struct ctl_table *table = PROC_I(dir)->sysctl_entry; + struct ctl_table_header *h = NULL; + struct qstr *name = &dentry->d_name; + struct ctl_table *p; + struct inode *inode; + struct dentry *err = ERR_PTR(-ENOENT); + + if (IS_ERR(head)) + return ERR_CAST(head); + + if (table && !table->child) { + WARN_ON(1); + goto out; + } + + table = table ? table->child : head->ctl_table; + + p = find_in_table(table, name); + if (!p) { + for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { + if (h->attached_to != table) + continue; + p = find_in_table(h->attached_by, name); + if (p) + break; + } + } + + if (!p) + goto out; + + err = ERR_PTR(-ENOMEM); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); + if (h) + sysctl_head_finish(h); + + if (!inode) + goto out; + + err = NULL; + d_set_d_op(dentry, &proc_sys_dentry_operations); + d_add(dentry, inode); + +out: + sysctl_head_finish(head); + return err; +} + +static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + ssize_t error; + size_t res; + + if (IS_ERR(head)) + return PTR_ERR(head); + + /* + * At this point we know that the sysctl was not unregistered + * and won't be until we finish. + */ + error = -EPERM; + if (sysctl_perm(head->root, table, write ? 
MAY_WRITE : MAY_READ)) + goto out; + + /* if that can happen at all, it should be -EINVAL, not -EISDIR */ + error = -EINVAL; + if (!table->proc_handler) + goto out; + + /* careful: calling conventions are nasty here */ + res = count; + error = table->proc_handler(table, write, buf, &res, ppos); + if (!error) + error = res; +out: + sysctl_head_finish(head); + + return error; +} + +static ssize_t proc_sys_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); +} + +static ssize_t proc_sys_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); +} + + +static int proc_sys_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, + struct ctl_table_header *head, + struct ctl_table *table) +{ + struct dentry *child, *dir = filp->f_path.dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = table->procname; + qname.len = strlen(table->procname); + qname.hash = full_name_hash(qname.name, qname.len); + + child = d_lookup(dir, &qname); + if (!child) { + child = d_alloc(dir, &qname); + if (child) { + inode = proc_sys_make_inode(dir->d_sb, head, table); + if (!inode) { + dput(child); + return -ENOMEM; + } else { + d_set_d_op(child, &proc_sys_dentry_operations); + d_add(child, inode); + } + } else { + return -ENOMEM; + } + } + inode = child->d_inode; + ino = inode->i_ino; + type = inode->i_mode >> 12; + dput(child); + return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); +} + +static int scan(struct ctl_table_header *head, ctl_table *table, + unsigned long *pos, struct file *file, + void *dirent, filldir_t filldir) +{ + + for (; table->procname; table++, (*pos)++) { + int res; + + /* Can't do anything without a proc name */ + if (!table->procname) + continue; + + if (*pos < file->f_pos) + continue; + + res = proc_sys_fill_cache(file, dirent, filldir, head, table); + if (res) + return res; + + file->f_pos = *pos + 1; + } + return 0; +} + +static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + struct ctl_table_header *h = NULL; + unsigned long pos; + int ret = -EINVAL; + + if (IS_ERR(head)) + return PTR_ERR(head); + + if (table && !table->child) { + WARN_ON(1); + goto out; + } + + table = table ? 
table->child : head->ctl_table; + + ret = 0; + /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, + inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + } + if (filp->f_pos == 1) { + if (filldir(dirent, "..", 2, filp->f_pos, + parent_ino(dentry), DT_DIR) < 0) + goto out; + filp->f_pos++; + } + pos = 2; + + ret = scan(head, table, &pos, filp, dirent, filldir); + if (ret) + goto out; + + for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { + if (h->attached_to != table) + continue; + ret = scan(h, h->attached_by, &pos, filp, dirent, filldir); + if (ret) { + sysctl_head_finish(h); + break; + } + } + ret = 1; +out: + sysctl_head_finish(head); + return ret; +} + +static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) +{ + /* + * sysctl entries that are not writeable, + * are _NOT_ writeable, capabilities or not. + */ + struct ctl_table_header *head; + struct ctl_table *table; + int error; + + /* Executable files are not allowed under /proc/sys/ */ + if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) + return -EACCES; + + head = grab_header(inode); + if (IS_ERR(head)) + return PTR_ERR(head); + + table = PROC_I(inode)->sysctl_entry; + if (!table) /* global root - r-xr-xr-x */ + error = mask & MAY_WRITE ? -EACCES : 0; + else /* Use the permissions on the sysctl table entry */ + error = sysctl_perm(head->root, table, mask); + + sysctl_head_finish(head); + return error; +} + +static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (IS_ERR(head)) + return PTR_ERR(head); + + generic_fillattr(inode, stat); + if (table) + stat->mode = (stat->mode & S_IFMT) | table->mode; + + sysctl_head_finish(head); + return 0; +} + +static const struct file_operations proc_sys_file_operations = { + .read = proc_sys_read, + .write = proc_sys_write, + .llseek = default_llseek, +}; + +static const struct file_operations proc_sys_dir_file_operations = { + .readdir = proc_sys_readdir, + .llseek = generic_file_llseek, +}; + +static const struct inode_operations proc_sys_inode_operations = { + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static const struct inode_operations proc_sys_dir_operations = { + .lookup = proc_sys_lookup, + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + return !PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int proc_sys_delete(const struct dentry *dentry) +{ + return !!PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int proc_sys_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry 
*dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct ctl_table_header *head; + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + /* AV: can it, indeed? */ + if (!inode) + return 1; + if (name->len != len) + return 1; + if (memcmp(name->name, str, len)) + return 1; + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); +} + +static const struct dentry_operations proc_sys_dentry_operations = { + .d_revalidate = proc_sys_revalidate, + .d_delete = proc_sys_delete, + .d_compare = proc_sys_compare, +}; + +int __init proc_sys_init(void) +{ + struct proc_dir_entry *proc_sys_root; + + proc_sys_root = proc_mkdir("sys", NULL); + proc_sys_root->proc_iops = &proc_sys_dir_operations; + proc_sys_root->proc_fops = &proc_sys_dir_file_operations; + proc_sys_root->nlink = 0; + return 0; +} diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c new file mode 100644 index 00000000..cb761f01 --- /dev/null +++ b/fs/proc/proc_tty.c @@ -0,0 +1,189 @@ +/* + * proc_tty.c -- handles /proc/tty + * + * Copyright 1997, Theodore Ts'o + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The /proc/tty directory inodes... + */ +static struct proc_dir_entry *proc_tty_ldisc, *proc_tty_driver; + +/* + * This is the handler for /proc/tty/drivers + */ +static void show_tty_range(struct seq_file *m, struct tty_driver *p, + dev_t from, int num) +{ + seq_printf(m, "%-20s ", p->driver_name ? p->driver_name : "unknown"); + seq_printf(m, "/dev/%-8s ", p->name); + if (p->num > 1) { + seq_printf(m, "%3d %d-%d ", MAJOR(from), MINOR(from), + MINOR(from) + num - 1); + } else { + seq_printf(m, "%3d %7d ", MAJOR(from), MINOR(from)); + } + switch (p->type) { + case TTY_DRIVER_TYPE_SYSTEM: + seq_puts(m, "system"); + if (p->subtype == SYSTEM_TYPE_TTY) + seq_puts(m, ":/dev/tty"); + else if (p->subtype == SYSTEM_TYPE_SYSCONS) + seq_puts(m, ":console"); + else if (p->subtype == SYSTEM_TYPE_CONSOLE) + seq_puts(m, ":vtmaster"); + break; + case TTY_DRIVER_TYPE_CONSOLE: + seq_puts(m, "console"); + break; + case TTY_DRIVER_TYPE_SERIAL: + seq_puts(m, "serial"); + break; + case TTY_DRIVER_TYPE_PTY: + if (p->subtype == PTY_TYPE_MASTER) + seq_puts(m, "pty:master"); + else if (p->subtype == PTY_TYPE_SLAVE) + seq_puts(m, "pty:slave"); + else + seq_puts(m, "pty"); + break; + default: + seq_printf(m, "type:%d.%d", p->type, p->subtype); + } + seq_putc(m, '\n'); +} + +static int show_tty_driver(struct seq_file *m, void *v) +{ + struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers); + dev_t from = MKDEV(p->major, p->minor_start); + dev_t to = from + p->num; + + if (&p->tty_drivers == tty_drivers.next) { + /* pseudo-drivers first */ + seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); + seq_puts(m, "system:/dev/tty\n"); + seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); + seq_puts(m, "system:console\n"); +#ifdef CONFIG_UNIX98_PTYS + seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); + seq_puts(m, "system\n"); +#endif +#ifdef CONFIG_VT + seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); + seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); + seq_puts(m, "system:vtmaster\n"); +#endif + } + + while (MAJOR(from) < MAJOR(to)) { + dev_t next = MKDEV(MAJOR(from)+1, 0); + show_tty_range(m, p, from, next - 
from); + from = next; + } + if (from != to) + show_tty_range(m, p, from, to - from); + return 0; +} + +/* iterator */ +static void *t_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&tty_mutex); + return seq_list_start(&tty_drivers, *pos); +} + +static void *t_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &tty_drivers, pos); +} + +static void t_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tty_mutex); +} + +static const struct seq_operations tty_drivers_op = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = show_tty_driver +}; + +static int tty_drivers_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &tty_drivers_op); +} + +static const struct file_operations proc_tty_drivers_operations = { + .open = tty_drivers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * This function is called by tty_register_driver() to handle + * registering the driver's /proc handler into /proc/tty/driver/ + */ +void proc_tty_register_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + if (!driver->driver_name || driver->proc_entry || + !driver->ops->proc_fops) + return; + + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); + driver->proc_entry = ent; +} + +/* + * This function is called by tty_unregister_driver() + */ +void proc_tty_unregister_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + ent = driver->proc_entry; + if (!ent) + return; + + remove_proc_entry(driver->driver_name, proc_tty_driver); + + driver->proc_entry = NULL; +} + +/* + * Called by proc_root_init() to initialize the /proc/tty subtree + */ +void __init proc_tty_init(void) +{ + if (!proc_mkdir("tty", NULL)) + return; + proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); + /* + * /proc/tty/driver/serial reveals the exact character counts for + * serial links which is just too easy to abuse for inferring + * password lengths and inter-keystroke timings during password + * entry. 
+ */ + proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); + proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); + proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); +} diff --git a/fs/proc/root.c b/fs/proc/root.c new file mode 100644 index 00000000..d6c3b416 --- /dev/null +++ b/fs/proc/root.c @@ -0,0 +1,213 @@ +/* + * linux/fs/proc/root.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc root directory handling functions + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static int proc_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int proc_set_super(struct super_block *sb, void *data) +{ + int err = set_anon_super(sb, NULL); + if (!err) { + struct pid_namespace *ns = (struct pid_namespace *)data; + sb->s_fs_info = get_pid_ns(ns); + } + return err; +} + +static struct dentry *proc_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int err; + struct super_block *sb; + struct pid_namespace *ns; + struct proc_inode *ei; + + if (flags & MS_KERNMOUNT) + ns = (struct pid_namespace *)data; + else + ns = current->nsproxy->pid_ns; + + sb = sget(fs_type, proc_test_super, proc_set_super, ns); + if (IS_ERR(sb)) + return ERR_CAST(sb); + + if (!sb->s_root) { + sb->s_flags = flags; + err = proc_fill_super(sb); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + + sb->s_flags |= MS_ACTIVE; + } + + ei = PROC_I(sb->s_root->d_inode); + if (!ei->pid) { + rcu_read_lock(); + ei->pid = get_pid(find_pid_ns(1, ns)); + rcu_read_unlock(); + } + + return dget(sb->s_root); +} + +static void proc_kill_sb(struct super_block *sb) +{ + struct pid_namespace *ns; + + ns = (struct pid_namespace *)sb->s_fs_info; + kill_anon_super(sb); + put_pid_ns(ns); +} + +static struct file_system_type proc_fs_type = { + .name = "proc", + .mount = proc_mount, + .kill_sb = proc_kill_sb, +}; + +void __init proc_root_init(void) +{ + struct vfsmount *mnt; + int err; + + proc_init_inodecache(); + err = register_filesystem(&proc_fs_type); + if (err) + return; + mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); + if (IS_ERR(mnt)) { + unregister_filesystem(&proc_fs_type); + return; + } + + init_pid_ns.proc_mnt = mnt; + proc_symlink("mounts", NULL, "self/mounts"); + + proc_net_init(); + +#ifdef CONFIG_SYSVIPC + proc_mkdir("sysvipc", NULL); +#endif + proc_mkdir("fs", NULL); + proc_mkdir("driver", NULL); + proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ +#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) + /* just give it a mountpoint */ + proc_mkdir("openprom", NULL); +#endif + proc_tty_init(); +#ifdef CONFIG_PROC_DEVICETREE + proc_device_tree_init(); +#endif + proc_mkdir("bus", NULL); + proc_sys_init(); +} + +static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat +) +{ + generic_fillattr(dentry->d_inode, stat); + stat->nlink = proc_root.nlink + nr_processes(); + return 0; +} + +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ + if (!proc_lookup(dir, dentry, nd)) { + return NULL; + } + + return proc_pid_lookup(dir, dentry, nd); +} + +static int proc_root_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + unsigned int nr = filp->f_pos; + int ret; + + if (nr < FIRST_PROCESS_ENTRY) { + int error = proc_readdir(filp, 
dirent, filldir); + if (error <= 0) + return error; + filp->f_pos = FIRST_PROCESS_ENTRY; + } + + ret = proc_pid_readdir(filp, dirent, filldir); + return ret; +} + +/* + * The root /proc directory is special, as it has the + * directories. Thus we don't use the generic + * directory handling functions for that.. + */ +static const struct file_operations proc_root_operations = { + .read = generic_read_dir, + .readdir = proc_root_readdir, + .llseek = default_llseek, +}; + +/* + * proc root can do almost nothing.. + */ +static const struct inode_operations proc_root_inode_operations = { + .lookup = proc_root_lookup, + .getattr = proc_root_getattr, +}; + +/* + * This is the root "inode" in the /proc tree.. + */ +struct proc_dir_entry proc_root = { + .low_ino = PROC_ROOT_INO, + .namelen = 5, + .name = "/proc", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, + .count = ATOMIC_INIT(1), + .proc_iops = &proc_root_inode_operations, + .proc_fops = &proc_root_operations, + .parent = &proc_root, +}; + +int pid_ns_prepare_proc(struct pid_namespace *ns) +{ + struct vfsmount *mnt; + + mnt = kern_mount_data(&proc_fs_type, ns); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + ns->proc_mnt = mnt; + return 0; +} + +void pid_ns_release_proc(struct pid_namespace *ns) +{ + mntput(ns->proc_mnt); +} diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c new file mode 100644 index 00000000..62604be9 --- /dev/null +++ b/fs/proc/softirqs.c @@ -0,0 +1,44 @@ +#include +#include +#include +#include + +/* + * /proc/softirqs ... display the number of softirqs + */ +static int show_softirqs(struct seq_file *p, void *v) +{ + int i, j; + + seq_puts(p, " "); + for_each_possible_cpu(i) + seq_printf(p, "CPU%-8d", i); + seq_putc(p, '\n'); + + for (i = 0; i < NR_SOFTIRQS; i++) { + seq_printf(p, "%12s:", softirq_to_name[i]); + for_each_possible_cpu(j) + seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_putc(p, '\n'); + } + return 0; +} + +static int softirqs_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_softirqs, NULL); +} + +static const struct file_operations proc_softirqs_operations = { + .open = softirqs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_softirqs_init(void) +{ + proc_create("softirqs", 0, NULL, &proc_softirqs_operations); + return 0; +} +module_init(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c new file mode 100644 index 00000000..9758b654 --- /dev/null +++ b/fs/proc/stat.c @@ -0,0 +1,170 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif +#ifndef arch_idle_time +#define arch_idle_time(cpu) 0 +#endif + +static int show_stat(struct seq_file *p, void *v) +{ + int i, j; + unsigned long jif; + cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t guest, guest_nice; + u64 sum = 0; + u64 sum_softirq = 0; + unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; + struct timespec boottime; + + user = nice = system = idle = iowait = + irq = softirq = steal = cputime64_zero; + guest = guest_nice = cputime64_zero; + getboottime(&boottime); + jif = boottime.tv_sec; + + for_each_possible_cpu(i) { + user = cputime64_add(user, kstat_cpu(i).cpustat.user); + nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); + system = cputime64_add(system, kstat_cpu(i).cpustat.system); + idle = 
cputime64_add(idle, kstat_cpu(i).cpustat.idle); + idle = cputime64_add(idle, arch_idle_time(i)); + iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); + irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); + softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); + steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + guest_nice = cputime64_add(guest_nice, + kstat_cpu(i).cpustat.guest_nice); + sum += kstat_cpu_irqs_sum(i); + sum += arch_irq_stat_cpu(i); + + for (j = 0; j < NR_SOFTIRQS; j++) { + unsigned int softirq_stat = kstat_softirqs_cpu(j, i); + + per_softirq_sums[j] += softirq_stat; + sum_softirq += softirq_stat; + } + } + sum += arch_irq_stat(); + + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cputime64_to_clock_t(idle), + (unsigned long long)cputime64_to_clock_t(iowait), + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); + for_each_online_cpu(i) { + + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ + user = kstat_cpu(i).cpustat.user; + nice = kstat_cpu(i).cpustat.nice; + system = kstat_cpu(i).cpustat.system; + idle = kstat_cpu(i).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(i)); + iowait = kstat_cpu(i).cpustat.iowait; + irq = kstat_cpu(i).cpustat.irq; + softirq = kstat_cpu(i).cpustat.softirq; + steal = kstat_cpu(i).cpustat.steal; + guest = kstat_cpu(i).cpustat.guest; + guest_nice = kstat_cpu(i).cpustat.guest_nice; + seq_printf(p, + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", + i, + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cputime64_to_clock_t(idle), + (unsigned long long)cputime64_to_clock_t(iowait), + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); + } + seq_printf(p, "intr %llu", (unsigned long long)sum); + + /* sum again ? it could be updated? 
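+ * The per-IRQ counts printed next are re-read here, after the "intr"
+ * total above was computed, so the two may not agree exactly if
+ * interrupts keep arriving in between.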
*/ + for_each_irq_nr(j) + seq_printf(p, " %u", kstat_irqs(j)); + + seq_printf(p, + "\nctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + nr_context_switches(), + (unsigned long)jif, + total_forks, + nr_running(), + nr_iowait()); + + seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); + + for (i = 0; i < NR_SOFTIRQS; i++) + seq_printf(p, " %u", per_softirq_sums[i]); + seq_putc(p, '\n'); + + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + unsigned size = 4096 * (1 + num_possible_cpus() / 32); + char *buf; + struct seq_file *m; + int res; + + /* don't ask for more than the kmalloc() max size */ + if (size > KMALLOC_MAX_SIZE) + size = KMALLOC_MAX_SIZE; + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = single_open(file, show_stat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +static const struct file_operations proc_stat_operations = { + .open = stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_stat_init(void) +{ + proc_create("stat", 0, NULL, &proc_stat_operations); + return 0; +} +module_init(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c new file mode 100644 index 00000000..55a1f494 --- /dev/null +++ b/fs/proc/task_mmu.c @@ -0,0 +1,1125 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + unsigned long data, text, lib, swap; + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + + /* + * Note: to minimize their overhead, mm maintains hiwater_vm and + * hiwater_rss only when about to *lower* total_vm or rss. Any + * collector of these hiwater stats must therefore get total_vm + * and rss too, which will usually be the higher. Barriers? not + * worth the effort, such snapshots can always be inconsistent. 
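+ * hiwater_vm is reported as VmPeak: and hiwater_rss as VmHWM: below,
+ * with all figures converted to kB via the PAGE_SHIFT-10 shifts.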
+ */ + hiwater_vm = total_vm = mm->total_vm; + if (hiwater_vm < mm->hiwater_vm) + hiwater_vm = mm->hiwater_vm; + hiwater_rss = total_rss = get_mm_rss(mm); + if (hiwater_rss < mm->hiwater_rss) + hiwater_rss = mm->hiwater_rss; + + data = mm->total_vm - mm->shared_vm - mm->stack_vm; + text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; + lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); + seq_printf(m, + "VmPeak:\t%8lu kB\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmHWM:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB\n" + "VmPTE:\t%8lu kB\n" + "VmSwap:\t%8lu kB\n", + hiwater_vm << (PAGE_SHIFT-10), + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + mm->locked_vm << (PAGE_SHIFT-10), + hiwater_rss << (PAGE_SHIFT-10), + total_rss << (PAGE_SHIFT-10), + data << (PAGE_SHIFT-10), + mm->stack_vm << (PAGE_SHIFT-10), text, lib, + (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + swap << (PAGE_SHIFT-10)); +} + +unsigned long task_vsize(struct mm_struct *mm) +{ + return PAGE_SIZE * mm->total_vm; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + *shared = get_mm_counter(mm, MM_FILEPAGES); + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = mm->total_vm - mm->shared_vm; + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + return mm->total_vm; +} + +static void pad_len_spaces(struct seq_file *m, int len) +{ + len = 25 + sizeof(void*) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); +} + +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) +{ + if (vma && vma != priv->tail_vma) { + struct mm_struct *mm = vma->vm_mm; + up_read(&mm->mmap_sem); + mmput(mm); + } +} + +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + unsigned long last_addr = m->version; + struct mm_struct *mm; + struct vm_area_struct *vma, *tail_vma = NULL; + loff_t l = *pos; + + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + + /* + * We remember last_addr rather than next_addr to hit with + * mmap_cache most of the time. We have zero last_addr at + * the beginning and also after lseek. We will have -1 last_addr + * after the end of the vmas. + */ + + if (last_addr == -1UL) + return NULL; + + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = mm_for_maps(priv->task); + if (!mm || IS_ERR(mm)) + return mm; + down_read(&mm->mmap_sem); + + tail_vma = get_gate_vma(priv->task->mm); + priv->tail_vma = tail_vma; + + /* Start with last addr hint */ + vma = find_vma(mm, last_addr); + if (last_addr && vma) { + vma = vma->vm_next; + goto out; + } + + /* + * Check the vma index is within the range and do + * sequential scan until m_index. + */ + vma = NULL; + if ((unsigned long)l < mm->map_count) { + vma = mm->mmap; + while (l-- && vma) + vma = vma->vm_next; + goto out; + } + + if (l != mm->map_count) + tail_vma = NULL; /* After gate vma */ + +out: + if (vma) + return vma; + + /* End of vmas has been reached */ + m->version = (tail_vma != NULL)? 
0: -1UL; + up_read(&mm->mmap_sem); + mmput(mm); + return tail_vma; +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct vm_area_struct *tail_vma = priv->tail_vma; + + (*pos)++; + if (vma && (vma != tail_vma) && vma->vm_next) + return vma->vm_next; + vma_stop(priv, vma); + return (vma != tail_vma)? tail_vma: NULL; +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + if (!IS_ERR(vma)) + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static int do_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + struct file *file = vma->vm_file; + vm_flags_t flags = vma->vm_flags; + unsigned long ino = 0; + unsigned long long pgoff = 0; + unsigned long start, end; + dev_t dev = 0; + int len; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; + } + + /* We don't show the stack guard page in /proc/maps */ + start = vma->vm_start; + if (stack_guard_page_start(vma, start)) + start += PAGE_SIZE; + end = vma->vm_end; + if (stack_guard_page_end(vma, end)) + end -= PAGE_SIZE; + + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + start, + end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? 's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino, &len); + + /* + * Print the dentry name for named mappings, and a + * special [heap] marker for the heap: + */ + if (file) { + pad_len_spaces(m, len); + seq_path(m, &file->f_path, "\n"); + } else { + const char *name = arch_vma_name(vma); + if (!name) { + if (mm) { + if (vma->vm_start <= mm->brk && + vma->vm_end >= mm->start_brk) { + name = "[heap]"; + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + name = "[stack]"; + } + } else { + name = "[vdso]"; + } + } + if (name) { + pad_len_spaces(m, len); + seq_puts(m, name); + } + } + seq_putc(m, '\n'); +} + +static int show_map(struct seq_file *m, void *v) +{ + struct vm_area_struct *vma = v; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; + + show_map_vma(m, vma); + + if (m->count < m->size) /* vma is copied successfully */ + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; + return 0; +} + +static const struct seq_operations proc_pid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +const struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * Proportional Set Size(PSS): my share of RSS. 
+ * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. + */ +#define PSS_SHIFT 12 + +#ifdef CONFIG_PROC_PAGE_MONITOR +struct mem_size_stats { + struct vm_area_struct *vma; + unsigned long resident; + unsigned long shared_clean; + unsigned long shared_dirty; + unsigned long private_clean; + unsigned long private_dirty; + unsigned long referenced; + unsigned long anonymous; + unsigned long anonymous_thp; + unsigned long swap; + u64 pss; +}; + + +static void smaps_pte_entry(pte_t ptent, unsigned long addr, + unsigned long ptent_size, struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + struct page *page; + int mapcount; + + if (is_swap_pte(ptent)) { + mss->swap += ptent_size; + return; + } + + if (!pte_present(ptent)) + return; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + return; + + if (PageAnon(page)) + mss->anonymous += ptent_size; + + mss->resident += ptent_size; + /* Accumulate the size in pages that have been accessed. */ + if (pte_young(ptent) || PageReferenced(page)) + mss->referenced += ptent_size; + mapcount = page_mapcount(page); + if (mapcount >= 2) { + if (pte_dirty(ptent) || PageDirty(page)) + mss->shared_dirty += ptent_size; + else + mss->shared_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT) / mapcount; + } else { + if (pte_dirty(ptent) || PageDirty(page)) + mss->private_dirty += ptent_size; + else + mss->private_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT); + } +} + +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + pte_t *pte; + spinlock_t *ptl; + + spin_lock(&walk->mm->page_table_lock); + if (pmd_trans_huge(*pmd)) { + if (pmd_trans_splitting(*pmd)) { + spin_unlock(&walk->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + smaps_pte_entry(*(pte_t *)pmd, addr, + HPAGE_PMD_SIZE, walk); + spin_unlock(&walk->mm->page_table_lock); + mss->anonymous_thp += HPAGE_PMD_SIZE; + return 0; + } + } else { + spin_unlock(&walk->mm->page_table_lock); + } + + if (pmd_trans_unstable(pmd)) + return 0; + /* + * The mmap_sem held all the way back in m_start() is what + * keeps khugepaged out of here and from collapsing things + * in here. 
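+ * The transparent-huge-pmd case above is handled under
+ * mm->page_table_lock before we fall back to walking individual ptes.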
+ */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +static int show_smap(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; + struct vm_area_struct *vma = v; + struct mem_size_stats mss; + struct mm_walk smaps_walk = { + .pmd_entry = smaps_pte_range, + .mm = vma->vm_mm, + .private = &mss, + }; + + memset(&mss, 0, sizeof mss); + mss.vma = vma; + /* mmap_sem is held in m_start */ + if (vma->vm_mm && !is_vm_hugetlb_page(vma)) + walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); + + show_map_vma(m, vma); + + seq_printf(m, + "Size: %8lu kB\n" + "Rss: %8lu kB\n" + "Pss: %8lu kB\n" + "Shared_Clean: %8lu kB\n" + "Shared_Dirty: %8lu kB\n" + "Private_Clean: %8lu kB\n" + "Private_Dirty: %8lu kB\n" + "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" + "AnonHugePages: %8lu kB\n" + "Swap: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", + (vma->vm_end - vma->vm_start) >> 10, + mss.resident >> 10, + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), + mss.shared_clean >> 10, + mss.shared_dirty >> 10, + mss.private_clean >> 10, + mss.private_dirty >> 10, + mss.referenced >> 10, + mss.anonymous >> 10, + mss.anonymous_thp >> 10, + mss.swap >> 10, + vma_kernel_pagesize(vma) >> 10, + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); + + if (m->count < m->size) /* vma is copied successfully */ + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; + return 0; +} + +static const struct seq_operations proc_pid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_smap +}; + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +const struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->private; + pte_t *pte, ptent; + spinlock_t *ptl; + struct page *page; + + split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + if (PageReserved(page)) + continue; + + /* Clear accessed and referenced bits. 
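+ * A subsequent read of smaps then reports only pages referenced
+ * after this write to /proc/pid/clear_refs.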
*/ + ptep_test_and_clear_young(vma, addr, pte); + ClearPageReferenced(page); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +#define CLEAR_REFS_ALL 1 +#define CLEAR_REFS_ANON 2 +#define CLEAR_REFS_MAPPED 3 + +static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + struct mm_struct *mm; + struct vm_area_struct *vma; + int type; + int rv; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + rv = kstrtoint(strstrip(buffer), 10, &type); + if (rv < 0) + return rv; + if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + return -EINVAL; + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + mm = get_task_mm(task); + if (mm) { + struct mm_walk clear_refs_walk = { + .pmd_entry = clear_refs_pte_range, + .mm = mm, + }; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + clear_refs_walk.private = vma; + if (is_vm_hugetlb_page(vma)) + continue; + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * + * Writing 2 to /proc/pid/clear_refs only affects + * Anonymous pages. + * + * Writing 3 to /proc/pid/clear_refs only affects file + * mapped pages. + */ + if (type == CLEAR_REFS_ANON && vma->vm_file) + continue; + if (type == CLEAR_REFS_MAPPED && !vma->vm_file) + continue; + walk_page_range(vma->vm_start, vma->vm_end, + &clear_refs_walk); + } + flush_tlb_mm(mm); + up_read(&mm->mmap_sem); + mmput(mm); + } + put_task_struct(task); + + return count; +} + +const struct file_operations proc_clear_refs_operations = { + .write = clear_refs_write, + .llseek = noop_llseek, +}; + +struct pagemapread { + int pos, len; + u64 *buffer; +}; + +#define PM_ENTRY_BYTES sizeof(u64) +#define PM_STATUS_BITS 3 +#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) +#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) +#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) +#define PM_PSHIFT_BITS 6 +#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) +#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) +#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) +#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) + +#define PM_PRESENT PM_STATUS(4LL) +#define PM_SWAP PM_STATUS(2LL) +#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_END_OF_BUFFER 1 + +static int add_to_pagemap(unsigned long addr, u64 pfn, + struct pagemapread *pm) +{ + pm->buffer[pm->pos++] = pfn; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + +static int pagemap_pte_hole(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + unsigned long addr; + int err = 0; + for (addr = start; addr < end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); + if (err) + break; + } + return err; +} + +static u64 swap_pte_to_pagemap_entry(pte_t pte) +{ + swp_entry_t e = pte_to_swp_entry(pte); + return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); +} + +static u64 pte_to_pagemap_entry(pte_t pte) +{ + u64 pme = 0; + if (is_swap_pte(pte)) + pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; + else if (pte_present(pte)) + pme = PM_PFRAME(pte_pfn(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; 
+} + +static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + struct pagemapread *pm = walk->private; + pte_t *pte; + int err = 0; + + split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + + /* find the first VMA at or above 'addr' */ + vma = find_vma(walk->mm, addr); + for (; addr != end; addr += PAGE_SIZE) { + u64 pfn = PM_NOT_PRESENT; + + /* check to see if we've left 'vma' behind + * and need a new, higher one */ + if (vma && (addr >= vma->vm_end)) + vma = find_vma(walk->mm, addr); + + /* check that 'vma' actually covers this address, + * and that it isn't a huge page vma */ + if (vma && (vma->vm_start <= addr) && + !is_vm_hugetlb_page(vma)) { + pte = pte_offset_map(pmd, addr); + pfn = pte_to_pagemap_entry(*pte); + /* unmap before userspace copy */ + pte_unmap(pte); + } + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} + +#ifdef CONFIG_HUGETLB_PAGE +static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) +{ + u64 pme = 0; + if (pte_present(pte)) + pme = PM_PFRAME(pte_pfn(pte) + offset) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; +} + +/* This function walks within one hugetlb entry in the single call */ +static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + int err = 0; + u64 pfn; + + for (; addr != end; addr += PAGE_SIZE) { + int offset = (addr & ~hmask) >> PAGE_SHIFT; + pfn = huge_pte_to_pagemap_entry(*pte, offset); + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} +#endif /* HUGETLB_PAGE */ + +/* + * /proc/pid/pagemap - an array mapping virtual pages to pfns + * + * For each page in the address space, this file contains one 64-bit entry + * consisting of the following: + * + * Bits 0-55 page frame number (PFN) if present + * Bits 0-4 swap type if swapped + * Bits 5-55 swap offset if swapped + * Bits 55-60 page shift (page size = 1<f_path.dentry->d_inode); + struct mm_struct *mm; + struct pagemapread pm; + int ret = -ESRCH; + struct mm_walk pagemap_walk = {}; + unsigned long src; + unsigned long svpfn; + unsigned long start_vaddr; + unsigned long end_vaddr; + int copied = 0; + + if (!task) + goto out; + + ret = -EINVAL; + /* file position must be aligned */ + if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) + goto out_task; + + ret = 0; + if (!count) + goto out_task; + + pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); + ret = -ENOMEM; + if (!pm.buffer) + goto out_task; + + mm = mm_for_maps(task); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) + goto out_free; + + pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pte_hole = pagemap_pte_hole; +#ifdef CONFIG_HUGETLB_PAGE + pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; +#endif + pagemap_walk.mm = mm; + pagemap_walk.private = ± + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; + start_vaddr = svpfn << PAGE_SHIFT; + end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. 
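+ * Each pass of the loop below refills pm.buffer (at most pm.len bytes
+ * of 64-bit entries) and copies it to userspace before walking the
+ * next chunk of the address range.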
+ */ + ret = 0; + while (count && (start_vaddr < end_vaddr)) { + int len; + unsigned long end; + + pm.pos = 0; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + /* overflow ? */ + if (end < start_vaddr || end > end_vaddr) + end = end_vaddr; + down_read(&mm->mmap_sem); + ret = walk_page_range(start_vaddr, end, &pagemap_walk); + up_read(&mm->mmap_sem); + start_vaddr = end; + + len = min(count, PM_ENTRY_BYTES * pm.pos); + if (copy_to_user(buf, pm.buffer, len)) { + ret = -EFAULT; + goto out_mm; + } + copied += len; + buf += len; + count -= len; + } + *ppos += copied; + if (!ret || ret == PM_END_OF_BUFFER) + ret = copied; + +out_mm: + mmput(mm); +out_free: + kfree(pm.buffer); +out_task: + put_task_struct(task); +out: + return ret; +} + +const struct file_operations proc_pagemap_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_read, +}; +#endif /* CONFIG_PROC_PAGE_MONITOR */ + +#ifdef CONFIG_NUMA + +struct numa_maps { + struct vm_area_struct *vma; + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { + struct proc_maps_private proc_maps; + struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, + unsigned long nr_pages) +{ + int count = page_mapcount(page); + + md->pages += nr_pages; + if (pte_dirty || PageDirty(page)) + md->dirty += nr_pages; + + if (PageSwapCache(page)) + md->swapcache += nr_pages; + + if (PageActive(page) || PageUnevictable(page)) + md->active += nr_pages; + + if (PageWriteback(page)) + md->writeback += nr_pages; + + if (PageAnon(page)) + md->anon += nr_pages; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)] += nr_pages; +} + +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pte_present(pte)) + return NULL; + + page = vm_normal_page(vma, addr, pte); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + return NULL; + + return page; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + spinlock_t *ptl; + pte_t *orig_pte; + pte_t *pte; + + md = walk->private; + spin_lock(&walk->mm->page_table_lock); + if (pmd_trans_huge(*pmd)) { + if (pmd_trans_splitting(*pmd)) { + spin_unlock(&walk->mm->page_table_lock); + wait_split_huge_page(md->vma->anon_vma, pmd); + } else { + pte_t huge_pte = *(pte_t *)pmd; + struct page *page; + + page = can_gather_numa_stats(huge_pte, md->vma, addr); + if (page) + gather_stats(page, md, pte_dirty(huge_pte), + HPAGE_PMD_SIZE/PAGE_SIZE); + spin_unlock(&walk->mm->page_table_lock); + return 0; + } + } else { + spin_unlock(&walk->mm->page_table_lock); + } + + if (pmd_trans_unstable(pmd)) + return 0; + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page = can_gather_numa_stats(*pte, md->vma, addr); + if (!page) + continue; + gather_stats(page, md, pte_dirty(*pte), 1); + + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk 
*walk) +{ + struct numa_maps *md; + struct page *page; + + if (pte_none(*pte)) + return 0; + + page = pte_page(*pte); + if (!page) + return 0; + + md = walk->private; + gather_stats(page, md, pte_dirty(*pte), 1); + return 0; +} + +#else +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v) +{ + struct numa_maps_private *numa_priv = m->private; + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; + struct vm_area_struct *vma = v; + struct numa_maps *md = &numa_priv->md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mm_walk walk = {}; + struct mempolicy *pol; + int n; + char buffer[50]; + + if (!mm) + return 0; + + /* Ensure we start with an empty set of numa_maps statistics. */ + memset(md, 0, sizeof(*md)); + + md->vma = vma; + + walk.hugetlb_entry = gather_hugetbl_stats; + walk.pmd_entry = gather_pte_stats; + walk.private = md; + walk.mm = mm; + + pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_cond_put(pol); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } + + if (is_vm_hugetlb_page(vma)) + seq_printf(m, " huge"); + + walk_page_range(vma->vm_start, vma->vm_end, &walk); + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m, " anon=%lu", md->anon); + + if (md->dirty) + seq_printf(m, " dirty=%lu", md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m, " swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m, " active=%lu", md->active); + + if (md->writeback) + seq_printf(m, " writeback=%lu", md->writeback); + + for_each_node_state(n, N_HIGH_MEMORY) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); + + if (m->count < m->size) + m->version = (vma != proc_priv->tail_vma) ? 
vma->vm_start : 0; + return 0; +} + +static const struct seq_operations proc_pid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_numa_map, +}; + +static int numa_maps_open(struct inode *inode, struct file *file) +{ + struct numa_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->proc_maps.pid = proc_pid(inode); + ret = seq_open(file, &proc_pid_numa_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +const struct file_operations proc_numa_maps_operations = { + .open = numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c new file mode 100644 index 00000000..980de547 --- /dev/null +++ b/fs/proc/task_nommu.c @@ -0,0 +1,271 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Logic: we've got two memory sums for each process, "shared", and + * "non-shared". Shared memory may get counted more than once, for + * each process that owns it. Non-shared memory is counted + * accurately. + */ +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long bytes = 0, sbytes = 0, slack = 0, size; + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + + bytes += kobjsize(vma); + + region = vma->vm_region; + if (region) { + size = kobjsize(region); + size += region->vm_end - region->vm_start; + } else { + size = vma->vm_end - vma->vm_start; + } + + if (atomic_read(&mm->mm_count) > 1 || + vma->vm_flags & VM_MAYSHARE) { + sbytes += size; + } else { + bytes += size; + if (region) + slack = region->vm_end - vma->vm_end; + } + } + + if (atomic_read(&mm->mm_count) > 1) + sbytes += kobjsize(mm); + else + bytes += kobjsize(mm); + + if (current->fs && current->fs->users > 1) + sbytes += kobjsize(current->fs); + else + bytes += kobjsize(current->fs); + + if (current->files && atomic_read(¤t->files->count) > 1) + sbytes += kobjsize(current->files); + else + bytes += kobjsize(current->files); + + if (current->sighand && atomic_read(¤t->sighand->count) > 1) + sbytes += kobjsize(current->sighand); + else + bytes += kobjsize(current->sighand); + + bytes += kobjsize(current); /* includes kernel stack */ + + seq_printf(m, + "Mem:\t%8lu bytes\n" + "Slack:\t%8lu bytes\n" + "Shared:\t%8lu bytes\n", + bytes, slack, sbytes); + + up_read(&mm->mmap_sem); +} + +unsigned long task_vsize(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct rb_node *p; + unsigned long vsize = 0; + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + vsize += vma->vm_end - vma->vm_start; + } + up_read(&mm->mmap_sem); + return vsize; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long size = kobjsize(mm); + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + size += kobjsize(vma); + region = vma->vm_region; + if (region) { + size += 
kobjsize(region); + size += region->vm_end - region->vm_start; + } + } + + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK)) + >> PAGE_SHIFT; + up_read(&mm->mmap_sem); + size >>= PAGE_SHIFT; + size += *text + *data; + *resident = size; + return size; +} + +static void pad_len_spaces(struct seq_file *m, int len) +{ + len = 25 + sizeof(void*) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); +} + +/* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + unsigned long long pgoff = 0; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + pad_len_spaces(m, len); + seq_path(m, &file->f_path, ""); + } else if (mm) { + if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + pad_len_spaces(m, len); + seq_puts(m, "[stack]"); + } + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display mapping lines for a particular process's /proc/pid/maps + */ +static int show_map(struct seq_file *m, void *_p) +{ + struct rb_node *p = _p; + + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); +} + +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct mm_struct *mm; + struct rb_node *p; + loff_t n = *pos; + + /* pin the task and mm whilst we play with them */ + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = mm_for_maps(priv->task); + if (!mm || IS_ERR(mm)) { + put_task_struct(priv->task); + priv->task = NULL; + return mm; + } + down_read(&mm->mmap_sem); + + /* start from the Nth VMA */ + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) + if (n-- == 0) + return p; + return NULL; +} + +static void m_stop(struct seq_file *m, void *_vml) +{ + struct proc_maps_private *priv = m->private; + + if (priv->task) { + struct mm_struct *mm = priv->task->mm; + up_read(&mm->mmap_sem); + mmput(mm); + put_task_struct(priv->task); + } +} + +static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +{ + struct rb_node *p = _p; + + (*pos)++; + return p ? 
rb_next(p) : NULL; +} + +static const struct seq_operations proc_pid_maps_ops = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int maps_open(struct inode *inode, struct file *file) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, &proc_pid_maps_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +const struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c new file mode 100644 index 00000000..29166ecd --- /dev/null +++ b/fs/proc/uptime.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static int uptime_proc_show(struct seq_file *m, void *v) +{ + struct timespec uptime; + struct timespec idle; + cputime64_t idletime; + u64 nsec; + u32 rem; + int i; + + idletime = 0; + for_each_possible_cpu(i) + idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); + + do_posix_clock_monotonic_gettime(&uptime); + monotonic_to_bootbased(&uptime); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; + seq_printf(m, "%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), + (unsigned long) idle.tv_sec, + (idle.tv_nsec / (NSEC_PER_SEC / 100))); + return 0; +} + +static int uptime_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, uptime_proc_show, NULL); +} + +static const struct file_operations uptime_proc_fops = { + .open = uptime_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_uptime_init(void) +{ + proc_create("uptime", 0, NULL, &uptime_proc_fops); + return 0; +} +module_init(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c new file mode 100644 index 00000000..76817a60 --- /dev/null +++ b/fs/proc/version.c @@ -0,0 +1,34 @@ +#include +#include +#include +#include +#include +#include + +static int version_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, linux_proc_banner, + utsname()->sysname, + utsname()->release, + utsname()->version); + return 0; +} + +static int version_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, version_proc_show, NULL); +} + +static const struct file_operations version_proc_fops = { + .open = version_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_version_init(void) +{ + proc_create("version", 0, NULL, &version_proc_fops); + return 0; +} +module_init(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c new file mode 100644 index 00000000..cd99bf55 --- /dev/null +++ b/fs/proc/vmcore.c @@ -0,0 +1,701 @@ +/* + * fs/proc/vmcore.c Interface for accessing the crash + * dump from the system's previous life. + * Heavily borrowed from fs/proc/kcore.c + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. 
All rights reserved + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* List representing chunks of contiguous memory areas and their offsets in + * vmcore file. + */ +static LIST_HEAD(vmcore_list); + +/* Stores the pointer to the buffer containing kernel elf core headers. */ +static char *elfcorebuf; +static size_t elfcorebuf_sz; + +/* Total size of vmcore file. */ +static u64 vmcore_size; + +static struct proc_dir_entry *proc_vmcore = NULL; + +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * The called function has to take care of module refcounting. + */ +static int (*oldmem_pfn_is_ram)(unsigned long pfn); + +int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (oldmem_pfn_is_ram) + return -EBUSY; + oldmem_pfn_is_ram = fn; + return 0; +} +EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); + +void unregister_oldmem_pfn_is_ram(void) +{ + oldmem_pfn_is_ram = NULL; + wmb(); +} +EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); + +static int pfn_is_ram(unsigned long pfn) +{ + int (*fn)(unsigned long pfn); + /* pfn is ram unless fn() checks pagetype */ + int ret = 1; + + /* + * Ask hypervisor if the pfn is really ram. + * A ballooned page contains no data and reading from such a page + * will cause high load in the hypervisor. + */ + fn = oldmem_pfn_is_ram; + if (fn) + ret = fn(pfn); + + return ret; +} + +/* Reads a page from the oldmem device from given offset. */ +static ssize_t read_from_oldmem(char *buf, size_t count, + u64 *ppos, int userbuf) +{ + unsigned long pfn, offset; + size_t nr_bytes; + ssize_t read = 0, tmp; + + if (!count) + return 0; + + offset = (unsigned long)(*ppos % PAGE_SIZE); + pfn = (unsigned long)(*ppos / PAGE_SIZE); + + do { + if (count > (PAGE_SIZE - offset)) + nr_bytes = PAGE_SIZE - offset; + else + nr_bytes = count; + + /* If pfn is not ram, return zeros for sparse dump files */ + if (pfn_is_ram(pfn) == 0) + memset(buf, 0, nr_bytes); + else { + tmp = copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + if (tmp < 0) + return tmp; + } + *ppos += nr_bytes; + count -= nr_bytes; + buf += nr_bytes; + read += nr_bytes; + ++pfn; + offset = 0; + } while (count); + + return read; +} + +/* Maps vmcore file offset to respective physical address in memroy. */ +static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list, + struct vmcore **m_ptr) +{ + struct vmcore *m; + u64 paddr; + + list_for_each_entry(m, vc_list, list) { + u64 start, end; + start = m->offset; + end = m->offset + m->size - 1; + if (offset >= start && offset <= end) { + paddr = m->paddr + offset - start; + *m_ptr = m; + return paddr; + } + } + *m_ptr = NULL; + return 0; +} + +/* Read from the ELF header and then the crash dump. On error, negative value is + * returned otherwise number of bytes read are returned. 
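+ * Reads are satisfied first from the kernel's cached copy of the ELF
+ * headers (elfcorebuf) and then from old memory via read_from_oldmem().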
+ */ +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + ssize_t acc = 0, tmp; + size_t tsz; + u64 start, nr_bytes; + struct vmcore *curr_m = NULL; + + if (buflen == 0 || *fpos >= vmcore_size) + return 0; + + /* trim buflen to not go beyond EOF */ + if (buflen > vmcore_size - *fpos) + buflen = vmcore_size - *fpos; + + /* Read ELF core header */ + if (*fpos < elfcorebuf_sz) { + tsz = elfcorebuf_sz - *fpos; + if (buflen < tsz) + tsz = buflen; + if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) + return -EFAULT; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); + if (!curr_m) + return -EINVAL; + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + + while (buflen) { + tmp = read_from_oldmem(buffer, tsz, &start, 1); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + if (start >= (curr_m->paddr + curr_m->size)) { + if (curr_m->list.next == &vmcore_list) + return acc; /*EOF*/ + curr_m = list_entry(curr_m->list.next, + struct vmcore, list); + start = curr_m->paddr; + } + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + } + return acc; +} + +static const struct file_operations proc_vmcore_operations = { + .read = read_vmcore, + .llseek = default_llseek, +}; + +static struct vmcore* __init get_new_element(void) +{ + return kzalloc(sizeof(struct vmcore), GFP_KERNEL); +} + +static u64 __init get_vmcore_size_elf64(char *elfptr) +{ + int i; + u64 size; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +static u64 __init get_vmcore_size_elf32(char *elfptr) +{ + int i; + u64 size; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +/* Merges all the PT_NOTE headers into one. 
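+ * Each PT_NOTE segment is read from old memory, its notes are sized,
+ * and the chunk is queued on vc_list; a single merged PT_NOTE program
+ * header then replaces the originals.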
*/ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr, *phdr_ptr; + Elf64_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf64_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf64_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Merges all the PT_NOTE headers into one. 
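+ * This is the Elf32 counterpart of merge_note_headers_elf64() above.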
*/ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr, *phdr_ptr; + Elf32_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf32_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf32_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Add memory chunks represented by program headers to vmcore list. Also update + * the new offset fields of exported program headers. */ +static int __init process_ptload_program_headers_elf64(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset. 
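+ * so the exported header points at this chunk's location in the
+ * synthesized /proc/vmcore file instead of the offset recorded by
+ * the crashed kernel.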
*/ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +static int __init process_ptload_program_headers_elf32(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset */ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +/* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets_elf64(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf64_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + /* Skip Elf header and program headers. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +/* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets_elf32(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf32_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + /* Skip Elf header and program headers. */ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +static int __init parse_crash_elf64_headers(void) +{ + int rc=0; + Elf64_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !vmcore_elf64_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS64 || + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf64_Ehdr) || + ehdr.e_phentsize != sizeof(Elf64_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(elfcorebuf); + return rc; + } + + /* Merge all PT_NOTE headers into one. 
*/ + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, + &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list); + return 0; +} + +static int __init parse_crash_elf32_headers(void) +{ + int rc=0; + Elf32_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !elf_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS32|| + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf32_Ehdr) || + ehdr.e_phentsize != sizeof(Elf32_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(elfcorebuf); + return rc; + } + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, + &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list); + return 0; +} + +static int __init parse_crash_elf_headers(void) +{ + unsigned char e_ident[EI_NIDENT]; + u64 addr; + int rc=0; + + addr = elfcorehdr_addr; + rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); + if (rc < 0) + return rc; + if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { + printk(KERN_WARNING "Warning: Core image elf header" + " not found\n"); + return -EINVAL; + } + + if (e_ident[EI_CLASS] == ELFCLASS64) { + rc = parse_crash_elf64_headers(); + if (rc) + return rc; + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size_elf64(elfcorebuf); + } else if (e_ident[EI_CLASS] == ELFCLASS32) { + rc = parse_crash_elf32_headers(); + if (rc) + return rc; + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size_elf32(elfcorebuf); + } else { + printk(KERN_WARNING "Warning: Core image elf header is not" + " sane\n"); + return -EINVAL; + } + return 0; +} + +/* Init function for vmcore module. */ +static int __init vmcore_init(void) +{ + int rc = 0; + + /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ + if (!(is_vmcore_usable())) + return rc; + rc = parse_crash_elf_headers(); + if (rc) { + printk(KERN_WARNING "Kdump: vmcore not initialized\n"); + return rc; + } + + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); + if (proc_vmcore) + proc_vmcore->size = vmcore_size; + return 0; +} +module_init(vmcore_init) diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig new file mode 100644 index 00000000..8007ae7c --- /dev/null +++ b/fs/pstore/Kconfig @@ -0,0 +1,13 @@ +config PSTORE + bool "Persistent store support" + default n + help + This option enables generic access to platform level + persistent storage via "pstore" filesystem that can + be mounted as /dev/pstore. 
Only useful if you have + a platform level driver that registers with pstore to + provide the data, so you probably should just go say "Y" + (or "M") to a platform specific persistent store driver + (e.g. ACPI_APEI on X86) which will select this for you. + If you don't have a platform persistent store driver, + say N. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile new file mode 100644 index 00000000..760f4bce --- /dev/null +++ b/fs/pstore/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux pstorefs routines. +# + +obj-y += pstore.o + +pstore-objs += inode.o platform.o diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c new file mode 100644 index 00000000..977ed272 --- /dev/null +++ b/fs/pstore/inode.c @@ -0,0 +1,311 @@ +/* + * Persistent Storage - ramfs parts. + * + * Copyright (C) 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define PSTORE_NAMELEN 64 + +struct pstore_private { + u64 id; + int (*erase)(u64); + ssize_t size; + char data[]; +}; + +static int pstore_file_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static ssize_t pstore_file_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct pstore_private *ps = file->private_data; + + return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); +} + +static const struct file_operations pstore_file_operations = { + .open = pstore_file_open, + .read = pstore_file_read, + .llseek = default_llseek, +}; + +/* + * When a file is unlinked from our file system we call the + * platform driver to erase the record from persistent store. 
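+ * Deleting, say, a "dmesg-..." entry from the mounted filesystem therefore
+ * also drops that record from the backing store via the record's erase hook.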
+ */ +static int pstore_unlink(struct inode *dir, struct dentry *dentry) +{ + struct pstore_private *p = dentry->d_inode->i_private; + + p->erase(p->id); + + return simple_unlink(dir, dentry); +} + +static void pstore_evict_inode(struct inode *inode) +{ + end_writeback(inode); + kfree(inode->i_private); +} + +static const struct inode_operations pstore_dir_inode_operations = { + .lookup = simple_lookup, + .unlink = pstore_unlink, +}; + +static struct inode *pstore_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_uid = inode->i_gid = 0; + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &pstore_file_operations; + break; + case S_IFDIR: + inode->i_op = &pstore_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + break; + } + } + return inode; +} + +enum { + Opt_kmsg_bytes, Opt_err +}; + +static const match_table_t tokens = { + {Opt_kmsg_bytes, "kmsg_bytes=%u"}, + {Opt_err, NULL} +}; + +static void parse_options(char *options) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_kmsg_bytes: + if (!match_int(&args[0], &option)) + pstore_set_kmsg_bytes(option); + break; + } + } +} + +static int pstore_remount(struct super_block *sb, int *flags, char *data) +{ + parse_options(data); + + return 0; +} + +static const struct super_operations pstore_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = pstore_evict_inode, + .remount_fs = pstore_remount, + .show_options = generic_show_options, +}; + +static struct super_block *pstore_sb; + +int pstore_is_mounted(void) +{ + return pstore_sb != NULL; +} + +/* + * Make a regular file in the root directory of our file system. + * Load it up with "size" bytes of data from "buf". + * Set the mtime & ctime to the date that this record was originally stored. 
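+ * The file name encodes the record type, the backend name and the record id
+ * (see the switch on 'type' below); for example, a PSTORE_TYPE_DMESG record
+ * with id 1 from a backend named, say, "erst" appears as "dmesg-erst-1".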
+ */ +int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, + char *data, size_t size, + struct timespec time, int (*erase)(u64)) +{ + struct dentry *root = pstore_sb->s_root; + struct dentry *dentry; + struct inode *inode; + int rc; + char name[PSTORE_NAMELEN]; + struct pstore_private *private; + + rc = -ENOMEM; + inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); + if (!inode) + goto fail; + private = kmalloc(sizeof *private + size, GFP_KERNEL); + if (!private) + goto fail_alloc; + private->id = id; + private->erase = erase; + + switch (type) { + case PSTORE_TYPE_DMESG: + sprintf(name, "dmesg-%s-%lld", psname, id); + break; + case PSTORE_TYPE_MCE: + sprintf(name, "mce-%s-%lld", psname, id); + break; + case PSTORE_TYPE_UNKNOWN: + sprintf(name, "unknown-%s-%lld", psname, id); + break; + default: + sprintf(name, "type%d-%s-%lld", type, psname, id); + break; + } + + mutex_lock(&root->d_inode->i_mutex); + + rc = -ENOSPC; + dentry = d_alloc_name(root, name); + if (IS_ERR(dentry)) + goto fail_lockedalloc; + + memcpy(private->data, data, size); + inode->i_size = private->size = size; + + inode->i_private = private; + + if (time.tv_sec) + inode->i_mtime = inode->i_ctime = time; + + d_add(dentry, inode); + + mutex_unlock(&root->d_inode->i_mutex); + + return 0; + +fail_lockedalloc: + mutex_unlock(&root->d_inode->i_mutex); + kfree(private); +fail_alloc: + iput(inode); + +fail: + return rc; +} + +int pstore_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode = NULL; + struct dentry *root; + int err; + + save_mount_options(sb, data); + + pstore_sb = sb; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = PSTOREFS_MAGIC; + sb->s_op = &pstore_ops; + sb->s_time_gran = 1; + + parse_options(data); + + inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0); + if (!inode) { + err = -ENOMEM; + goto fail; + } + /* override ramfs "dir" options so we catch unlink(2) */ + inode->i_op = &pstore_dir_inode_operations; + + root = d_alloc_root(inode); + sb->s_root = root; + if (!root) { + err = -ENOMEM; + goto fail; + } + + pstore_get_records(); + + return 0; +fail: + iput(inode); + return err; +} + +static struct dentry *pstore_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, pstore_fill_super); +} + +static void pstore_kill_sb(struct super_block *sb) +{ + kill_litter_super(sb); + pstore_sb = NULL; +} + +static struct file_system_type pstore_fs_type = { + .name = "pstore", + .mount = pstore_mount, + .kill_sb = pstore_kill_sb, +}; + +static int __init init_pstore_fs(void) +{ + return register_filesystem(&pstore_fs_type); +} +module_init(init_pstore_fs) + +MODULE_AUTHOR("Tony Luck "); +MODULE_LICENSE("GPL"); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h new file mode 100644 index 00000000..8c9f23eb --- /dev/null +++ b/fs/pstore/internal.h @@ -0,0 +1,6 @@ +extern void pstore_set_kmsg_bytes(int); +extern void pstore_get_records(void); +extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, + char *data, size_t size, + struct timespec time, int (*erase)(u64)); +extern int pstore_is_mounted(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c new file mode 100644 index 00000000..f2c3ff20 --- /dev/null +++ b/fs/pstore/platform.c @@ -0,0 +1,207 @@ +/* + * Persistent Storage - platform driver interface parts. 
+ * + * Copyright (C) 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * pstore_lock just protects "psinfo" during + * calls to pstore_register() + */ +static DEFINE_SPINLOCK(pstore_lock); +static struct pstore_info *psinfo; + +/* How much of the console log to snapshot */ +static unsigned long kmsg_bytes = 10240; + +void pstore_set_kmsg_bytes(int bytes) +{ + kmsg_bytes = bytes; +} + +/* Tag each group of saved records with a sequence number */ +static int oopscount; + +static char *reason_str[] = { + "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency" +}; + +/* + * callback from kmsg_dump. (s2,l2) has the most recently + * written bytes, older bytes are in (s1,l1). Save as much + * as we can from the end of the buffer. + */ +static void pstore_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + unsigned long s1_start, s2_start; + unsigned long l1_cpy, l2_cpy; + unsigned long size, total = 0; + char *dst, *why; + u64 id; + int hsize, part = 1; + + if (reason < ARRAY_SIZE(reason_str)) + why = reason_str[reason]; + else + why = "Unknown"; + + mutex_lock(&psinfo->buf_mutex); + oopscount++; + while (total < kmsg_bytes) { + dst = psinfo->buf; + hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); + size = psinfo->bufsize - hsize; + dst += hsize; + + l2_cpy = min(l2, size); + l1_cpy = min(l1, size - l2_cpy); + + if (l1_cpy + l2_cpy == 0) + break; + + s2_start = l2 - l2_cpy; + s1_start = l1 - l1_cpy; + + memcpy(dst, s1 + s1_start, l1_cpy); + memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); + + id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); + if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, + psinfo->buf, hsize + l1_cpy + l2_cpy, + CURRENT_TIME, psinfo->erase); + l1 -= l1_cpy; + l2 -= l2_cpy; + total += l1_cpy + l2_cpy; + } + mutex_unlock(&psinfo->buf_mutex); +} + +static struct kmsg_dumper pstore_dumper = { + .dump = pstore_dump, +}; + +/* + * platform specific persistent storage driver registers with + * us here. If pstore is already mounted, call the platform + * read function right away to populate the file system. If not + * then the pstore mount code will call us later to fill out + * the file system. + * + * Register with kmsg_dump to save last part of console log on panic. 
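+ * For illustration only, a minimal sketch of what a backend supplies (all
+ * names here are made up; the structure itself is declared in the pstore
+ * header and its callbacks are used by pstore_dump() and
+ * pstore_get_records() in this file):
+ *
+ *	static struct pstore_info my_psi = {
+ *		.owner = THIS_MODULE,
+ *		.name  = "mybackend",
+ *		.open  = my_open,
+ *		.close = my_close,
+ *		.read  = my_read,   (fills id/type/time, returns record size)
+ *		.write = my_write,  (writes psi->buf, returns record id)
+ *		.erase = my_erase,
+ *	};
+ *	my_psi.buf = my_static_buf;
+ *	my_psi.bufsize = sizeof(my_static_buf);
+ *	rc = pstore_register(&my_psi);
+ *
+ * The buf, bufsize and buf_mutex fields are used by pstore_dump() and
+ * pstore_get_records(), so the backend sets them up before registering.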
+ */ +int pstore_register(struct pstore_info *psi) +{ + struct module *owner = psi->owner; + + spin_lock(&pstore_lock); + if (psinfo) { + spin_unlock(&pstore_lock); + return -EBUSY; + } + psinfo = psi; + spin_unlock(&pstore_lock); + + if (owner && !try_module_get(owner)) { + psinfo = NULL; + return -EINVAL; + } + + if (pstore_is_mounted()) + pstore_get_records(); + + kmsg_dump_register(&pstore_dumper); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_register); + +/* + * Read all the records from the persistent store. Create and + * file files in our filesystem. + */ +void pstore_get_records(void) +{ + struct pstore_info *psi = psinfo; + ssize_t size; + u64 id; + enum pstore_type_id type; + struct timespec time; + int failed = 0, rc; + + if (!psi) + return; + + mutex_lock(&psinfo->buf_mutex); + rc = psi->open(psi); + if (rc) + goto out; + + while ((size = psi->read(&id, &type, &time)) > 0) { + if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, + time, psi->erase)) + failed++; + } + psi->close(psi); +out: + mutex_unlock(&psinfo->buf_mutex); + + if (failed) + printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", + failed, psi->name); +} + +/* + * Call platform driver to write a record to the + * persistent store. + */ +int pstore_write(enum pstore_type_id type, char *buf, size_t size) +{ + u64 id; + + if (!psinfo) + return -ENODEV; + + if (size > psinfo->bufsize) + return -EFBIG; + + mutex_lock(&psinfo->buf_mutex); + memcpy(psinfo->buf, buf, size); + id = psinfo->write(type, size); + if (pstore_is_mounted()) + pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, + size, CURRENT_TIME, psinfo->erase); + mutex_unlock(&psinfo->buf_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_write); diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig new file mode 100644 index 00000000..5f608999 --- /dev/null +++ b/fs/qnx4/Kconfig @@ -0,0 +1,14 @@ +config QNX4FS_FS + tristate "QNX4 file system support (read only)" + depends on BLOCK + help + This is the file system used by the real-time operating systems + QNX 4 and QNX 6 (the latter is also called QNX RTP). + Further information is available at . + Say Y if you intend to mount QNX hard disks or floppies. + + To compile this file system support as a module, choose M here: the + module will be called qnx4. + + If you don't know whether you need it, then you don't need it: + answer N. diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile new file mode 100644 index 00000000..4a283b3f --- /dev/null +++ b/fs/qnx4/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux qnx4-filesystem routines. +# + +obj-$(CONFIG_QNX4FS_FS) += qnx4.o + +qnx4-objs := inode.o dir.o namei.o bitmap.o diff --git a/fs/qnx4/README b/fs/qnx4/README new file mode 100644 index 00000000..1f1e320d --- /dev/null +++ b/fs/qnx4/README @@ -0,0 +1,9 @@ + + This is a snapshot of the QNX4 filesystem for Linux. + Please send diffs and remarks to . + +Credits : + +Richard "Scuba" A. Frowijn +Frank "Jedi/Sector One" Denis +Anders Larsen (Maintainer) diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c new file mode 100644 index 00000000..22e0d60e --- /dev/null +++ b/fs/qnx4/bitmap.c @@ -0,0 +1,58 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 28-05-1998 by Richard Frowijn : first release. + * 20-06-1998 by Frank Denis : basic optimisations. + * 25-06-1998 by Frank Denis : qnx4_is_free, qnx4_set_bitmap, qnx4_bmap . + * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . 
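+ * Note: qnx4_count_free_blocks() below walks the bitmap extent one block at
+ * a time; count_bits() counts the zero bits in each byte (8 - hweight8(b)),
+ * i.e. a set bit marks a block that is in use.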
+ */ + +#include +#include +#include "qnx4.h" + +static void count_bits(register const char *bmPart, register int size, + int *const tf) +{ + char b; + int tot = *tf; + + if (size > QNX4_BLOCK_SIZE) { + size = QNX4_BLOCK_SIZE; + } + do { + b = *bmPart++; + tot += 8 - hweight8(b); + size--; + } while (size != 0); + *tf = tot; +} + +unsigned long qnx4_count_free_blocks(struct super_block *sb) +{ + int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1; + int total = 0; + int total_free = 0; + int offset = 0; + int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size); + struct buffer_head *bh; + + while (total < size) { + if ((bh = sb_bread(sb, start + offset)) == NULL) { + printk(KERN_ERR "qnx4: I/O error in counting free blocks\n"); + break; + } + count_bits(bh->b_data, size - total, &total_free); + brelse(bh); + total += QNX4_BLOCK_SIZE; + offset++; + } + + return total_free; +} diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c new file mode 100644 index 00000000..7b032946 --- /dev/null +++ b/fs/qnx4/dir.c @@ -0,0 +1,85 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 28-05-1998 by Richard Frowijn : first release. + * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. + */ + +#include +#include "qnx4.h" + +static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int offset; + struct buffer_head *bh; + struct qnx4_inode_entry *de; + struct qnx4_link_info *le; + unsigned long blknum; + int ix, ino; + int size; + + QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); + QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); + + while (filp->f_pos < inode->i_size) { + blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); + bh = sb_bread(inode->i_sb, blknum); + if(bh==NULL) { + printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); + break; + } + ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; + while (ix < QNX4_INODES_PER_BLOCK) { + offset = ix * QNX4_DIR_ENTRY_SIZE; + de = (struct qnx4_inode_entry *) (bh->b_data + offset); + size = strlen(de->di_fname); + if (size) { + if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX ) + size = QNX4_SHORT_NAME_MAX; + else if ( size > QNX4_NAME_MAX ) + size = QNX4_NAME_MAX; + + if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { + QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); + if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) + ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; + else { + le = (struct qnx4_link_info*)de; + ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * + QNX4_INODES_PER_BLOCK + + le->dl_inode_ndx; + } + if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) { + brelse(bh); + goto out; + } + } + } + ix++; + filp->f_pos += QNX4_DIR_ENTRY_SIZE; + } + brelse(bh); + } +out: + return 0; +} + +const struct file_operations qnx4_dir_operations = +{ + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = qnx4_readdir, + .fsync = generic_file_fsync, +}; + +const struct inode_operations qnx4_dir_inode_operations = +{ + .lookup = qnx4_lookup, +}; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c new file mode 100644 index 00000000..2b064661 --- /dev/null +++ b/fs/qnx4/inode.c @@ -0,0 +1,504 @@ +/* + * QNX4 file system, Linux implementation. 
+ * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 01-06-1998 by Richard Frowijn : first release. + * 20-06-1998 by Frank Denis : Linux 2.1.99+ support, boot signature, misc. + * 30-06-1998 by Frank Denis : first step to write inodes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "qnx4.h" + +#define QNX4_VERSION 4 +#define QNX4_BMNAME ".bitmap" + +static const struct super_operations qnx4_sops; + +static void qnx4_put_super(struct super_block *sb); +static struct inode *qnx4_alloc_inode(struct super_block *sb); +static void qnx4_destroy_inode(struct inode *inode); +static int qnx4_remount(struct super_block *sb, int *flags, char *data); +static int qnx4_statfs(struct dentry *, struct kstatfs *); + +static const struct super_operations qnx4_sops = +{ + .alloc_inode = qnx4_alloc_inode, + .destroy_inode = qnx4_destroy_inode, + .put_super = qnx4_put_super, + .statfs = qnx4_statfs, + .remount_fs = qnx4_remount, +}; + +static int qnx4_remount(struct super_block *sb, int *flags, char *data) +{ + struct qnx4_sb_info *qs; + + qs = qnx4_sb(sb); + qs->Version = QNX4_VERSION; + *flags |= MS_RDONLY; + return 0; +} + +static struct buffer_head *qnx4_getblk(struct inode *inode, int nr, + int create) +{ + struct buffer_head *result = NULL; + + if ( nr >= 0 ) + nr = qnx4_block_map( inode, nr ); + if (nr) { + result = sb_getblk(inode->i_sb, nr); + return result; + } + return NULL; +} + +struct buffer_head *qnx4_bread(struct inode *inode, int block, int create) +{ + struct buffer_head *bh; + + bh = qnx4_getblk(inode, block, create); + if (!bh || buffer_uptodate(bh)) { + return bh; + } + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) { + return bh; + } + brelse(bh); + + return NULL; +} + +static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create ) +{ + unsigned long phys; + + QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock)); + + phys = qnx4_block_map( inode, iblock ); + if ( phys ) { + // logical block is before EOF + map_bh(bh, inode->i_sb, phys); + } + return 0; +} + +unsigned long qnx4_block_map( struct inode *inode, long iblock ) +{ + int ix; + long offset, i_xblk; + unsigned long block = 0; + struct buffer_head *bh = NULL; + struct qnx4_xblk *xblk = NULL; + struct qnx4_inode_entry *qnx4_inode = qnx4_raw_inode(inode); + u16 nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts); + + if ( iblock < le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size) ) { + // iblock is in the first extent. This is easy. + block = le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_blk) + iblock - 1; + } else { + // iblock is beyond first extent. We have to follow the extent chain. + i_xblk = le32_to_cpu(qnx4_inode->di_xblk); + offset = iblock - le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size); + ix = 0; + while ( --nxtnt > 0 ) { + if ( ix == 0 ) { + // read next xtnt block. + bh = sb_bread(inode->i_sb, i_xblk - 1); + if ( !bh ) { + QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1)); + return -EIO; + } + xblk = (struct qnx4_xblk*)bh->b_data; + if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) { + QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk)); + return -EIO; + } + } + if ( offset < le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_size) ) { + // got it! 
+ block = le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_blk) + offset - 1; + break; + } + offset -= le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_size); + if ( ++ix >= xblk->xblk_num_xtnts ) { + i_xblk = le32_to_cpu(xblk->xblk_next_xblk); + ix = 0; + brelse( bh ); + bh = NULL; + } + } + if ( bh ) + brelse( bh ); + } + + QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block)); + return block; +} + +static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8; + buf->f_bfree = qnx4_count_free_blocks(sb); + buf->f_bavail = buf->f_bfree; + buf->f_namelen = QNX4_NAME_MAX; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +/* + * Check the root directory of the filesystem to make sure + * it really _is_ a qnx4 filesystem, and to check the size + * of the directory entry. + */ +static const char *qnx4_checkroot(struct super_block *sb) +{ + struct buffer_head *bh; + struct qnx4_inode_entry *rootdir; + int rd, rl; + int i, j; + int found = 0; + + if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') { + return "no qnx4 filesystem (no root dir)."; + } else { + QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); + rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; + rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); + for (j = 0; j < rl; j++) { + bh = sb_bread(sb, rd + j); /* root dir, first block */ + if (bh == NULL) { + return "unable to read root entry."; + } + for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) { + rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); + if (rootdir->di_fname != NULL) { + QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); + if (!strcmp(rootdir->di_fname, + QNX4_BMNAME)) { + found = 1; + qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); + if (!qnx4_sb(sb)->BitMap) { + brelse (bh); + return "not enough memory for bitmap inode"; + } + memcpy( qnx4_sb(sb)->BitMap, rootdir, sizeof( struct qnx4_inode_entry ) ); /* keep bitmap inode known */ + break; + } + } + } + brelse(bh); + if (found != 0) { + break; + } + } + if (found == 0) { + return "bitmap file not found."; + } + } + return NULL; +} + +static int qnx4_fill_super(struct super_block *s, void *data, int silent) +{ + struct buffer_head *bh; + struct inode *root; + const char *errmsg; + struct qnx4_sb_info *qs; + int ret = -EINVAL; + + qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); + if (!qs) + return -ENOMEM; + s->s_fs_info = qs; + + sb_set_blocksize(s, QNX4_BLOCK_SIZE); + + /* Check the superblock signature. Since the qnx4 code is + dangerous, we should leave as quickly as possible + if we don't belong here... */ + bh = sb_bread(s, 1); + if (!bh) { + printk(KERN_ERR "qnx4: unable to read the superblock\n"); + goto outnobh; + } + if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { + if (!silent) + printk(KERN_ERR "qnx4: wrong fsid in superblock.\n"); + goto out; + } + s->s_op = &qnx4_sops; + s->s_magic = QNX4_SUPER_MAGIC; + s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + qnx4_sb(s)->sb_buf = bh; + qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data; + + + /* check before allocating dentries, inodes, .. 
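+ * qnx4_checkroot() verifies that the root directory name recorded in the
+ * superblock starts with '/' and that a ".bitmap" entry exists; a copy of
+ * that entry is cached in qnx4_sb(s)->BitMap for qnx4_statfs() and
+ * qnx4_count_free_blocks().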
*/ + errmsg = qnx4_checkroot(s); + if (errmsg != NULL) { + if (!silent) + printk(KERN_ERR "qnx4: %s\n", errmsg); + goto out; + } + + /* does root not have inode number QNX4_ROOT_INO ?? */ + root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); + if (IS_ERR(root)) { + printk(KERN_ERR "qnx4: get inode failed\n"); + ret = PTR_ERR(root); + goto out; + } + + ret = -ENOMEM; + s->s_root = d_alloc_root(root); + if (s->s_root == NULL) + goto outi; + + brelse(bh); + return 0; + + outi: + iput(root); + out: + brelse(bh); + outnobh: + kfree(qs); + s->s_fs_info = NULL; + return ret; +} + +static void qnx4_put_super(struct super_block *sb) +{ + struct qnx4_sb_info *qs = qnx4_sb(sb); + kfree( qs->BitMap ); + kfree( qs ); + sb->s_fs_info = NULL; + return; +} + +static int qnx4_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,qnx4_get_block, wbc); +} + +static int qnx4_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,qnx4_get_block); +} + +static int qnx4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host); + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + qnx4_get_block, + &qnx4_inode->mmu_private); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} +static sector_t qnx4_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,qnx4_get_block); +} +static const struct address_space_operations qnx4_aops = { + .readpage = qnx4_readpage, + .writepage = qnx4_writepage, + .write_begin = qnx4_write_begin, + .write_end = generic_write_end, + .bmap = qnx4_bmap +}; + +struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) +{ + struct buffer_head *bh; + struct qnx4_inode_entry *raw_inode; + int block; + struct qnx4_inode_entry *qnx4_inode; + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + qnx4_inode = qnx4_raw_inode(inode); + inode->i_mode = 0; + + QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino)); + if (!ino) { + printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is " + "out of range\n", + sb->s_id, ino); + iget_failed(inode); + return ERR_PTR(-EIO); + } + block = ino / QNX4_INODES_PER_BLOCK; + + if (!(bh = sb_bread(sb, block))) { + printk(KERN_ERR "qnx4: major problem: unable to read inode from dev " + "%s\n", sb->s_id); + iget_failed(inode); + return ERR_PTR(-EIO); + } + raw_inode = ((struct qnx4_inode_entry *) bh->b_data) + + (ino % QNX4_INODES_PER_BLOCK); + + inode->i_mode = le16_to_cpu(raw_inode->di_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); + inode->i_nlink = le16_to_cpu(raw_inode->di_nlink); + inode->i_size = le32_to_cpu(raw_inode->di_size); + inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime); + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); + + memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); + if (S_ISREG(inode->i_mode)) { + inode->i_fop = 
&generic_ro_fops; + inode->i_mapping->a_ops = &qnx4_aops; + qnx4_i(inode)->mmu_private = inode->i_size; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &qnx4_dir_inode_operations; + inode->i_fop = &qnx4_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &qnx4_aops; + qnx4_i(inode)->mmu_private = inode->i_size; + } else { + printk(KERN_ERR "qnx4: bad inode %lu on dev %s\n", + ino, sb->s_id); + iget_failed(inode); + brelse(bh); + return ERR_PTR(-EIO); + } + brelse(bh); + unlock_new_inode(inode); + return inode; +} + +static struct kmem_cache *qnx4_inode_cachep; + +static struct inode *qnx4_alloc_inode(struct super_block *sb) +{ + struct qnx4_inode_info *ei; + ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void qnx4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); +} + +static void qnx4_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, qnx4_i_callback); +} + +static void init_once(void *foo) +{ + struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", + sizeof(struct qnx4_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (qnx4_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(qnx4_inode_cachep); +} + +static struct dentry *qnx4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); +} + +static struct file_system_type qnx4_fs_type = { + .owner = THIS_MODULE, + .name = "qnx4", + .mount = qnx4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_qnx4_fs(void) +{ + int err; + + err = init_inodecache(); + if (err) + return err; + + err = register_filesystem(&qnx4_fs_type); + if (err) { + destroy_inodecache(); + return err; + } + + printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n"); + return 0; +} + +static void __exit exit_qnx4_fs(void) +{ + unregister_filesystem(&qnx4_fs_type); + destroy_inodecache(); +} + +module_init(init_qnx4_fs) +module_exit(exit_qnx4_fs) +MODULE_LICENSE("GPL"); + diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c new file mode 100644 index 00000000..275327b5 --- /dev/null +++ b/fs/qnx4/namei.c @@ -0,0 +1,132 @@ +/* + * QNX4 file system, Linux implementation. + * + * Version : 0.2.1 + * + * Using parts of the xiafs filesystem. + * + * History : + * + * 01-06-1998 by Richard Frowijn : first release. + * 21-06-1998 by Frank Denis : dcache support, fixed error codes. + * 04-07-1998 by Frank Denis : first step for rmdir/unlink. + */ + +#include +#include "qnx4.h" + + +/* + * check if the filename is correct. For some obscure reason, qnx writes a + * new file twice in the directory entry, first with all possible options at 0 + * and for a second time the way it is, they want us not to access the qnx + * filesystem when whe are using linux. 
+ */ +static int qnx4_match(int len, const char *name, + struct buffer_head *bh, unsigned long *offset) +{ + struct qnx4_inode_entry *de; + int namelen, thislen; + + if (bh == NULL) { + printk(KERN_WARNING "qnx4: matching unassigned buffer !\n"); + return 0; + } + de = (struct qnx4_inode_entry *) (bh->b_data + *offset); + *offset += QNX4_DIR_ENTRY_SIZE; + if ((de->di_status & QNX4_FILE_LINK) != 0) { + namelen = QNX4_NAME_MAX; + } else { + namelen = QNX4_SHORT_NAME_MAX; + } + /* "" means "." ---> so paths like "/usr/lib//libc.a" work */ + if (!len && (de->di_fname[0] == '.') && (de->di_fname[1] == '\0')) { + return 1; + } + thislen = strlen( de->di_fname ); + if ( thislen > namelen ) + thislen = namelen; + if (len != thislen) { + return 0; + } + if (strncmp(name, de->di_fname, len) == 0) { + if ((de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)) != 0) { + return 1; + } + } + return 0; +} + +static struct buffer_head *qnx4_find_entry(int len, struct inode *dir, + const char *name, struct qnx4_inode_entry **res_dir, int *ino) +{ + unsigned long block, offset, blkofs; + struct buffer_head *bh; + + *res_dir = NULL; + if (!dir->i_sb) { + printk(KERN_WARNING "qnx4: no superblock on dir.\n"); + return NULL; + } + bh = NULL; + block = offset = blkofs = 0; + while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) { + if (!bh) { + bh = qnx4_bread(dir, blkofs, 0); + if (!bh) { + blkofs++; + continue; + } + } + *res_dir = (struct qnx4_inode_entry *) (bh->b_data + offset); + if (qnx4_match(len, name, bh, &offset)) { + block = qnx4_block_map( dir, blkofs ); + *ino = block * QNX4_INODES_PER_BLOCK + + (offset / QNX4_DIR_ENTRY_SIZE) - 1; + return bh; + } + if (offset < bh->b_size) { + continue; + } + brelse(bh); + bh = NULL; + offset = 0; + blkofs++; + } + brelse(bh); + *res_dir = NULL; + return NULL; +} + +struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + int ino; + struct qnx4_inode_entry *de; + struct qnx4_link_info *lnk; + struct buffer_head *bh; + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + struct inode *foundinode = NULL; + + if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino))) + goto out; + /* The entry is linked, let's get the real info */ + if ((de->di_status & QNX4_FILE_LINK) == QNX4_FILE_LINK) { + lnk = (struct qnx4_link_info *) de; + ino = (le32_to_cpu(lnk->dl_inode_blk) - 1) * + QNX4_INODES_PER_BLOCK + + lnk->dl_inode_ndx; + } + brelse(bh); + + foundinode = qnx4_iget(dir->i_sb, ino); + if (IS_ERR(foundinode)) { + QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n", + PTR_ERR(foundinode))); + return ERR_CAST(foundinode); + } +out: + d_add(dentry, foundinode); + + return NULL; +} diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h new file mode 100644 index 00000000..33a60858 --- /dev/null +++ b/fs/qnx4/qnx4.h @@ -0,0 +1,49 @@ +#include +#include + +#define QNX4_DEBUG 0 + +#if QNX4_DEBUG +#define QNX4DEBUG(X) printk X +#else +#define QNX4DEBUG(X) (void) 0 +#endif + +struct qnx4_sb_info { + struct buffer_head *sb_buf; /* superblock buffer */ + struct qnx4_super_block *sb; /* our superblock */ + unsigned int Version; /* may be useful */ + struct qnx4_inode_entry *BitMap; /* useful */ +}; + +struct qnx4_inode_info { + struct qnx4_inode_entry raw; + loff_t mmu_private; + struct inode vfs_inode; +}; + +extern struct inode *qnx4_iget(struct super_block *, unsigned long); +extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); +extern unsigned long qnx4_count_free_blocks(struct 
super_block *sb); +extern unsigned long qnx4_block_map(struct inode *inode, long iblock); + +extern struct buffer_head *qnx4_bread(struct inode *, int, int); + +extern const struct inode_operations qnx4_dir_inode_operations; +extern const struct file_operations qnx4_dir_operations; +extern int qnx4_is_free(struct super_block *sb, long block); + +static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) +{ + return container_of(inode, struct qnx4_inode_info, vfs_inode); +} + +static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) +{ + return &qnx4_i(inode)->raw; +} diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig new file mode 100644 index 00000000..880fd988 --- /dev/null +++ b/fs/quota/Kconfig @@ -0,0 +1,74 @@ +# +# Quota configuration +# + +config QUOTA + bool "Quota support" + select QUOTACTL + help + If you say Y here, you will be able to set per user limits for disk + usage (also called disk quotas). Currently, it works for the + ext2, ext3, and reiserfs file system. ext3 also supports journalled + quotas for which you don't need to run quotacheck(8) after an unclean + shutdown. + For further details, read the Quota mini-HOWTO, available from + , or the documentation provided + with the quota tools. Probably the quota support is only useful for + multi user systems. If unsure, say N. + +config QUOTA_NETLINK_INTERFACE + bool "Report quota messages through netlink interface" + depends on QUOTACTL && NET + help + If you say Y here, quota warnings (about exceeding softlimit, reaching + hardlimit, etc.) will be reported through netlink interface. If unsure, + say Y. + +config PRINT_QUOTA_WARNING + bool "Print quota warnings to console (OBSOLETE)" + depends on QUOTA + default y + help + If you say Y here, quota warnings (about exceeding softlimit, reaching + hardlimit, etc.) will be printed to the process' controlling terminal. + Note that this behavior is currently deprecated and may go away in + future. Please use notification via netlink socket instead. + +config QUOTA_DEBUG + bool "Additional quota sanity checks" + depends on QUOTA + default n + help + If you say Y here, quota subsystem will perform some additional + sanity checks of quota internal structures. If unsure, say N. + +# Generic support for tree structured quota files. Selected when needed. +config QUOTA_TREE + tristate + +config QFMT_V1 + tristate "Old quota format support" + depends on QUOTA + help + This quota format was (is) used by kernels earlier than 2.4.22. If + you have quota working and you don't want to convert to new quota + format say Y here. + +config QFMT_V2 + tristate "Quota format vfsv0 and vfsv1 support" + depends on QUOTA + select QUOTA_TREE + help + This config option enables kernel support for vfsv0 and vfsv1 quota + formats. Both these formats support 32-bit UIDs/GIDs and vfsv1 format + also supports 64-bit inode and block quota limits. If you need this + functionality say Y here. 
+ +config QUOTACTL + bool + default n + +config QUOTACTL_COMPAT + bool + depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT + default y diff --git a/fs/quota/Makefile b/fs/quota/Makefile new file mode 100644 index 00000000..5f9e9e27 --- /dev/null +++ b/fs/quota/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_QUOTA) += dquot.o +obj-$(CONFIG_QFMT_V1) += quota_v1.o +obj-$(CONFIG_QFMT_V2) += quota_v2.o +obj-$(CONFIG_QUOTA_TREE) += quota_tree.o +obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o +obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c new file mode 100644 index 00000000..fb1892fe --- /dev/null +++ b/fs/quota/compat.c @@ -0,0 +1,118 @@ + +#include +#include +#include + +/* + * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) + * and is necessary due to alignment problems. + */ +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +/* XFS structures */ +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; + +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) +{ + unsigned int cmds; + struct if_dqblk __user *dqblk; + struct compat_if_dqblk __user *compat_dqblk; + struct fs_quota_stat __user *fsqstat; + struct compat_fs_quota_stat __user *compat_fsqstat; + compat_uint_t data; + u16 xdata; + long ret; + + cmds = cmd >> SUBCMDSHIFT; + + switch (cmds) { + case Q_GETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = sys_quotactl(cmd, special, id, dqblk); + if (ret) + break; + if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || + get_user(data, &dqblk->dqb_valid) || + put_user(data, &compat_dqblk->dqb_valid)) + ret = -EFAULT; + break; + case Q_SETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = -EFAULT; + if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(data, &compat_dqblk->dqb_valid) || + put_user(data, &dqblk->dqb_valid)) + break; + ret = sys_quotactl(cmd, special, id, dqblk); + break; + case Q_XGETQSTAT: + fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); + compat_fsqstat = addr; + ret = sys_quotactl(cmd, special, id, fsqstat); + if (ret) + break; + ret = -EFAULT; + /* Copying qs_version, qs_flags, qs_pad */ + if (copy_in_user(compat_fsqstat, fsqstat, + offsetof(struct compat_fs_quota_stat, qs_uquota))) + break; + /* Copying qs_uquota */ + if (copy_in_user(&compat_fsqstat->qs_uquota, + &fsqstat->qs_uquota, + sizeof(compat_fsqstat->qs_uquota)) || + get_user(data, &fsqstat->qs_uquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) + break; + /* Copying qs_gquota */ + if (copy_in_user(&compat_fsqstat->qs_gquota, + &fsqstat->qs_gquota, + sizeof(compat_fsqstat->qs_gquota)) || + get_user(data, &fsqstat->qs_gquota.qfs_nextents) || + put_user(data, 
&compat_fsqstat->qs_gquota.qfs_nextents)) + break; + /* Copying the rest */ + if (copy_in_user(&compat_fsqstat->qs_incoredqs, + &fsqstat->qs_incoredqs, + sizeof(struct compat_fs_quota_stat) - + offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || + get_user(xdata, &fsqstat->qs_iwarnlimit) || + put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) + break; + ret = 0; + break; + default: + ret = sys_quotactl(cmd, special, id, addr); + } + return ret; +} diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c new file mode 100644 index 00000000..5b572c89 --- /dev/null +++ b/fs/quota/dquot.c @@ -0,0 +1,2661 @@ +/* + * Implementation of the diskquota system for the LINUX operating system. QUOTA + * is implemented using the BSD system call interface as the means of + * communication with the user level. This file contains the generic routines + * called by the different filesystems on allocation of an inode or block. + * These routines take care of the administration needed to have a consistent + * diskquota tracking system. The ideas of both user and group quotas are based + * on the Melbourne quota system as used on BSD derived systems. The internal + * implementation is based on one of the several variants of the LINUX + * inode-subsystem with added complexity of the diskquota system. + * + * Author: Marco van Wieringen + * + * Fixes: Dmitry Gorodchanin , 11 Feb 96 + * + * Revised list management to avoid races + * -- Bill Hawes, , 9/98 + * + * Fixed races in dquot_transfer(), dqget() and dquot_alloc_...(). + * As the consequence the locking was moved from dquot_decr_...(), + * dquot_incr_...() to calling functions. + * invalidate_dquots() now writes modified dquots. + * Serialized quota_off() and quota_on() for mount point. + * Fixed a few bugs in grow_dquots(). + * Fixed deadlock in write_dquot() - we no longer account quotas on + * quota files + * remove_dquot_ref() moved to inode.c - it now traverses through inodes + * add_dquot_ref() restarts after blocking + * Added check for bogus uid and fixed check for group in quotactl. + * Jan Kara, , sponsored by SuSE CR, 10-11/99 + * + * Used struct list_head instead of own list struct + * Invalidation of referenced dquots is no longer possible + * Improved free_dquots list management + * Quota and i_blocks are now updated in one place to avoid races + * Warnings are now delayed so we won't block in critical section + * Write updated not to require dquot lock + * Jan Kara, , 9/2000 + * + * Added dynamic quota structure allocation + * Jan Kara 12/2000 + * + * Rewritten quota interface. Implemented new quota format and + * formats registering. + * Jan Kara, , 2001,2002 + * + * New SMP locking. + * Jan Kara, , 10/2002 + * + * Added journalled quota support, fix lock inversion problems + * Jan Kara, , 2003,2004 + * + * (C) Copyright 1994 - 1997 Marco van Wieringen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../internal.h" /* ugh */ + +#include + +/* + * There are three quota SMP locks. dq_list_lock protects all lists with quotas + * and quota formats. + * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and + * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. + * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly + * in inode_add_bytes() and inode_sub_bytes(). 
dq_state_lock protects + * modifications of quota state (on quotaon and quotaoff) and readers who care + * about latest values take it as well. + * + * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock, + * dq_list_lock > dq_state_lock + * + * Note that some things (eg. sb pointer, type, id) doesn't change during + * the life of the dquot structure and so needn't to be protected by a lock + * + * Any operation working on dquots via inode pointers must hold dqptr_sem. If + * operation is just reading pointers from inode (or not using them at all) the + * read lock is enough. If pointers are altered function must hold write lock. + * Special care needs to be taken about S_NOQUOTA inode flag (marking that + * inode is a quota file). Functions adding pointers from inode to dquots have + * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping dqptr_sem. This makes + * sure they cannot race with quotaon which first sets S_NOQUOTA flag and + * then drops all pointers to dquots from an inode. + * + * Each dquot has its dq_lock mutex. Locked dquots might not be referenced + * from inodes (dquot_alloc_space() and such don't check the dq_lock). + * Currently dquot is locked only when it is being read to memory (or space for + * it is being allocated) on the first dqget() and when it is being released on + * the last dqput(). The allocation and release oparations are serialized by + * the dq_lock and by checking the use count in dquot_release(). Write + * operations on dquots don't hold dq_lock as they copy data under dq_data_lock + * spinlock to internal buffers before writing. + * + * Lock ordering (including related VFS locks) is the following: + * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > + * dqio_mutex + * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > + * dqptr_sem. But filesystem has to count with the fact that functions such as + * dquot_alloc_space() acquire dqptr_sem and they usually have to be called + * from inside a transaction to keep filesystem consistency after a crash. Also + * filesystems usually want to do some IO on dquot from ->mark_dirty which is + * called with dqptr_sem held. + * i_mutex on quota files is special (it's below dqio_mutex) + */ + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); +EXPORT_SYMBOL(dq_data_lock); + +void __quota_error(struct super_block *sb, const char *func, + const char *fmt, ...) 
+{ + if (printk_ratelimit()) { + va_list args; + struct va_format vaf; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "Quota error (device %s): %s: %pV\n", + sb->s_id, func, &vaf); + + va_end(args); + } +} +EXPORT_SYMBOL(__quota_error); + +#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) +static char *quotatypes[] = INITQFNAMES; +#endif +static struct quota_format_type *quota_formats; /* List of registered formats */ +static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; + +/* SLAB cache for dquot structures */ +static struct kmem_cache *dquot_cachep; + +int register_quota_format(struct quota_format_type *fmt) +{ + spin_lock(&dq_list_lock); + fmt->qf_next = quota_formats; + quota_formats = fmt; + spin_unlock(&dq_list_lock); + return 0; +} +EXPORT_SYMBOL(register_quota_format); + +void unregister_quota_format(struct quota_format_type *fmt) +{ + struct quota_format_type **actqf; + + spin_lock(&dq_list_lock); + for (actqf = "a_formats; *actqf && *actqf != fmt; + actqf = &(*actqf)->qf_next) + ; + if (*actqf) + *actqf = (*actqf)->qf_next; + spin_unlock(&dq_list_lock); +} +EXPORT_SYMBOL(unregister_quota_format); + +static struct quota_format_type *find_quota_format(int id) +{ + struct quota_format_type *actqf; + + spin_lock(&dq_list_lock); + for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; + actqf = actqf->qf_next) + ; + if (!actqf || !try_module_get(actqf->qf_owner)) { + int qm; + + spin_unlock(&dq_list_lock); + + for (qm = 0; module_names[qm].qm_fmt_id && + module_names[qm].qm_fmt_id != id; qm++) + ; + if (!module_names[qm].qm_fmt_id || + request_module(module_names[qm].qm_mod_name)) + return NULL; + + spin_lock(&dq_list_lock); + for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; + actqf = actqf->qf_next) + ; + if (actqf && !try_module_get(actqf->qf_owner)) + actqf = NULL; + } + spin_unlock(&dq_list_lock); + return actqf; +} + +static void put_quota_format(struct quota_format_type *fmt) +{ + module_put(fmt->qf_owner); +} + +/* + * Dquot List Management: + * The quota code uses three lists for dquot management: the inuse_list, + * free_dquots, and dquot_hash[] array. A single dquot structure may be + * on all three lists, depending on its current state. + * + * All dquots are placed to the end of inuse_list when first created, and this + * list is used for invalidate operation, which must look at every dquot. + * + * Unused dquots (dq_count == 0) are added to the free_dquots list when freed, + * and this list is searched whenever we need an available dquot. Dquots are + * removed from the list as soon as they are used again, and + * dqstats.free_dquots gives the number of dquots on the list. When + * dquot is invalidated it's completely released from memory. + * + * Dquots with a specific identity (device, type and id) are placed on + * one of the dquot_hash[] hash chains. The provides an efficient search + * mechanism to locate a specific dquot. 
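+ * A lookup therefore computes hashfn(sb, id, type) and has find_dquot()
+ * walk that dquot_hash[] chain comparing dq_sb, dq_id and dq_type, all
+ * under dq_list_lock (see the helpers below).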
+ */ + +static LIST_HEAD(inuse_list); +static LIST_HEAD(free_dquots); +static unsigned int dq_hash_bits, dq_hash_mask; +static struct hlist_head *dquot_hash; + +struct dqstats dqstats; +EXPORT_SYMBOL(dqstats); + +static qsize_t inode_get_rsv_space(struct inode *inode); +static void __dquot_initialize(struct inode *inode, int type); + +static inline unsigned int +hashfn(const struct super_block *sb, unsigned int id, int type) +{ + unsigned long tmp; + + tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type); + return (tmp + (tmp >> dq_hash_bits)) & dq_hash_mask; +} + +/* + * Following list functions expect dq_list_lock to be held + */ +static inline void insert_dquot_hash(struct dquot *dquot) +{ + struct hlist_head *head; + head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type); + hlist_add_head(&dquot->dq_hash, head); +} + +static inline void remove_dquot_hash(struct dquot *dquot) +{ + hlist_del_init(&dquot->dq_hash); +} + +static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, + unsigned int id, int type) +{ + struct hlist_node *node; + struct dquot *dquot; + + hlist_for_each (node, dquot_hash+hashent) { + dquot = hlist_entry(node, struct dquot, dq_hash); + if (dquot->dq_sb == sb && dquot->dq_id == id && + dquot->dq_type == type) + return dquot; + } + return NULL; +} + +/* Add a dquot to the tail of the free list */ +static inline void put_dquot_last(struct dquot *dquot) +{ + list_add_tail(&dquot->dq_free, &free_dquots); + dqstats_inc(DQST_FREE_DQUOTS); +} + +static inline void remove_free_dquot(struct dquot *dquot) +{ + if (list_empty(&dquot->dq_free)) + return; + list_del_init(&dquot->dq_free); + dqstats_dec(DQST_FREE_DQUOTS); +} + +static inline void put_inuse(struct dquot *dquot) +{ + /* We add to the back of inuse list so we don't have to restart + * when traversing this list and we block */ + list_add_tail(&dquot->dq_inuse, &inuse_list); + dqstats_inc(DQST_ALLOC_DQUOTS); +} + +static inline void remove_inuse(struct dquot *dquot) +{ + dqstats_dec(DQST_ALLOC_DQUOTS); + list_del(&dquot->dq_inuse); +} +/* + * End of list functions needing dq_list_lock + */ + +static void wait_on_dquot(struct dquot *dquot) +{ + mutex_lock(&dquot->dq_lock); + mutex_unlock(&dquot->dq_lock); +} + +static inline int dquot_dirty(struct dquot *dquot) +{ + return test_bit(DQ_MOD_B, &dquot->dq_flags); +} + +static inline int mark_dquot_dirty(struct dquot *dquot) +{ + return dquot->dq_sb->dq_op->mark_dirty(dquot); +} + +/* Mark dquot dirty in atomic manner, and return it's old dirty flag state */ +int dquot_mark_dquot_dirty(struct dquot *dquot) +{ + int ret = 1; + + /* If quota is dirty already, we don't have to acquire dq_list_lock */ + if (test_bit(DQ_MOD_B, &dquot->dq_flags)) + return 1; + + spin_lock(&dq_list_lock); + if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) { + list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> + info[dquot->dq_type].dqi_dirty_list); + ret = 0; + } + spin_unlock(&dq_list_lock); + return ret; +} +EXPORT_SYMBOL(dquot_mark_dquot_dirty); + +/* Dirtify all the dquots - this can block when journalling */ +static inline int mark_all_dquot_dirty(struct dquot * const *dquot) +{ + int ret, err, cnt; + + ret = err = 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquot[cnt]) + /* Even in case of error we have to continue */ + ret = mark_dquot_dirty(dquot[cnt]); + if (!err) + err = ret; + } + return err; +} + +static inline void dqput_all(struct dquot **dquot) +{ + unsigned int cnt; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + 
dqput(dquot[cnt]); +} + +/* This function needs dq_list_lock */ +static inline int clear_dquot_dirty(struct dquot *dquot) +{ + if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) + return 0; + list_del_init(&dquot->dq_dirty); + return 1; +} + +void mark_info_dirty(struct super_block *sb, int type) +{ + set_bit(DQF_INFO_DIRTY_B, &sb_dqopt(sb)->info[type].dqi_flags); +} +EXPORT_SYMBOL(mark_info_dirty); + +/* + * Read dquot from disk and alloc space for it + */ + +int dquot_acquire(struct dquot *dquot) +{ + int ret = 0, ret2 = 0; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + + mutex_lock(&dquot->dq_lock); + mutex_lock(&dqopt->dqio_mutex); + if (!test_bit(DQ_READ_B, &dquot->dq_flags)) + ret = dqopt->ops[dquot->dq_type]->read_dqblk(dquot); + if (ret < 0) + goto out_iolock; + set_bit(DQ_READ_B, &dquot->dq_flags); + /* Instantiate dquot if needed */ + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { + ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); + /* Write the info if needed */ + if (info_dirty(&dqopt->info[dquot->dq_type])) { + ret2 = dqopt->ops[dquot->dq_type]->write_file_info( + dquot->dq_sb, dquot->dq_type); + } + if (ret < 0) + goto out_iolock; + if (ret2 < 0) { + ret = ret2; + goto out_iolock; + } + } + set_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out_iolock: + mutex_unlock(&dqopt->dqio_mutex); + mutex_unlock(&dquot->dq_lock); + return ret; +} +EXPORT_SYMBOL(dquot_acquire); + +/* + * Write dquot to disk + */ +int dquot_commit(struct dquot *dquot) +{ + int ret = 0; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + + mutex_lock(&dqopt->dqio_mutex); + spin_lock(&dq_list_lock); + if (!clear_dquot_dirty(dquot)) { + spin_unlock(&dq_list_lock); + goto out_sem; + } + spin_unlock(&dq_list_lock); + /* Inactive dquot can be only if there was error during read/init + * => we have better not writing it */ + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); + else + ret = -EIO; +out_sem: + mutex_unlock(&dqopt->dqio_mutex); + return ret; +} +EXPORT_SYMBOL(dquot_commit); + +/* + * Release dquot + */ +int dquot_release(struct dquot *dquot) +{ + int ret = 0, ret2 = 0; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + + mutex_lock(&dquot->dq_lock); + /* Check whether we are not racing with some other dqget() */ + if (atomic_read(&dquot->dq_count) > 1) + goto out_dqlock; + mutex_lock(&dqopt->dqio_mutex); + if (dqopt->ops[dquot->dq_type]->release_dqblk) { + ret = dqopt->ops[dquot->dq_type]->release_dqblk(dquot); + /* Write the info */ + if (info_dirty(&dqopt->info[dquot->dq_type])) { + ret2 = dqopt->ops[dquot->dq_type]->write_file_info( + dquot->dq_sb, dquot->dq_type); + } + if (ret >= 0) + ret = ret2; + } + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); + mutex_unlock(&dqopt->dqio_mutex); +out_dqlock: + mutex_unlock(&dquot->dq_lock); + return ret; +} +EXPORT_SYMBOL(dquot_release); + +void dquot_destroy(struct dquot *dquot) +{ + kmem_cache_free(dquot_cachep, dquot); +} +EXPORT_SYMBOL(dquot_destroy); + +static inline void do_destroy_dquot(struct dquot *dquot) +{ + dquot->dq_sb->dq_op->destroy_dquot(dquot); +} + +/* Invalidate all dquots on the list. Note that this function is called after + * quota is disabled and pointers from inodes removed so there cannot be new + * quota users. There can still be some users of quotas due to inodes being + * just deleted or pruned by prune_icache() (those are not attached to any + * list) or parallel quotactl call. We have to wait for such users. 
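+ * The wait takes an extra reference and sleeps on dq_wait_unused; the last
+ * dqput() from such a user wakes us up, and the scan restarts from the top
+ * of inuse_list so the now-unused dquot can finally be freed.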
+ */ +static void invalidate_dquots(struct super_block *sb, int type) +{ + struct dquot *dquot, *tmp; + +restart: + spin_lock(&dq_list_lock); + list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { + if (dquot->dq_sb != sb) + continue; + if (dquot->dq_type != type) + continue; + /* Wait for dquot users */ + if (atomic_read(&dquot->dq_count)) { + DEFINE_WAIT(wait); + + atomic_inc(&dquot->dq_count); + prepare_to_wait(&dquot->dq_wait_unused, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&dq_list_lock); + /* Once dqput() wakes us up, we know it's time to free + * the dquot. + * IMPORTANT: we rely on the fact that there is always + * at most one process waiting for dquot to free. + * Otherwise dq_count would be > 1 and we would never + * wake up. + */ + if (atomic_read(&dquot->dq_count) > 1) + schedule(); + finish_wait(&dquot->dq_wait_unused, &wait); + dqput(dquot); + /* At this moment dquot() need not exist (it could be + * reclaimed by prune_dqcache(). Hence we must + * restart. */ + goto restart; + } + /* + * Quota now has no users and it has been written on last + * dqput() + */ + remove_dquot_hash(dquot); + remove_free_dquot(dquot); + remove_inuse(dquot); + do_destroy_dquot(dquot); + } + spin_unlock(&dq_list_lock); +} + +/* Call callback for every active dquot on given filesystem */ +int dquot_scan_active(struct super_block *sb, + int (*fn)(struct dquot *dquot, unsigned long priv), + unsigned long priv) +{ + struct dquot *dquot, *old_dquot = NULL; + int ret = 0; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + spin_lock(&dq_list_lock); + list_for_each_entry(dquot, &inuse_list, dq_inuse) { + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + continue; + if (dquot->dq_sb != sb) + continue; + /* Now we have active dquot so we can just increase use count */ + atomic_inc(&dquot->dq_count); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); + dqput(old_dquot); + old_dquot = dquot; + ret = fn(dquot, priv); + if (ret < 0) + goto out; + spin_lock(&dq_list_lock); + /* We are safe to continue now because our dquot could not + * be moved out of the inuse list while we hold the reference */ + } + spin_unlock(&dq_list_lock); +out: + dqput(old_dquot); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return ret; +} +EXPORT_SYMBOL(dquot_scan_active); + +int dquot_quota_sync(struct super_block *sb, int type, int wait) +{ + struct list_head *dirty; + struct dquot *dquot; + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + + mutex_lock(&dqopt->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + spin_lock(&dq_list_lock); + dirty = &dqopt->info[cnt].dqi_dirty_list; + while (!list_empty(dirty)) { + dquot = list_first_entry(dirty, struct dquot, + dq_dirty); + /* Dirty and inactive can be only bad dquot... 
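+ * (its read or activation failed), so just clear the dirty bit and
+ * skip it.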
*/ + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + clear_dquot_dirty(dquot); + continue; + } + /* Now we have active dquot from which someone is + * holding reference so we can safely just increase + * use count */ + atomic_inc(&dquot->dq_count); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); + sb->dq_op->write_dquot(dquot); + dqput(dquot); + spin_lock(&dq_list_lock); + } + spin_unlock(&dq_list_lock); + } + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) + && info_dirty(&dqopt->info[cnt])) + sb->dq_op->write_info(sb, cnt); + dqstats_inc(DQST_SYNCS); + mutex_unlock(&dqopt->dqonoff_mutex); + + if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) + return 0; + + /* This is not very clever (and fast) but currently I don't know about + * any other simple way of getting quota data to disk and we must get + * them there for userspace to be visible... */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, + I_MUTEX_QUOTA); + truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); + mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); + } + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + + return 0; +} +EXPORT_SYMBOL(dquot_quota_sync); + +/* Free unused dquots from cache */ +static void prune_dqcache(int count) +{ + struct list_head *head; + struct dquot *dquot; + + head = free_dquots.prev; + while (head != &free_dquots && count) { + dquot = list_entry(head, struct dquot, dq_free); + remove_dquot_hash(dquot); + remove_free_dquot(dquot); + remove_inuse(dquot); + do_destroy_dquot(dquot); + count--; + head = free_dquots.prev; + } +} + +/* + * This is called from kswapd when we think we need some + * more memory + */ +static int shrink_dqcache_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr = sc->nr_to_scan; + + if (nr) { + spin_lock(&dq_list_lock); + prune_dqcache(nr); + spin_unlock(&dq_list_lock); + } + return ((unsigned) + percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]) + /100) * sysctl_vfs_cache_pressure; +} + +static struct shrinker dqcache_shrinker = { + .shrink = shrink_dqcache_memory, + .seeks = DEFAULT_SEEKS, +}; + +/* + * Put reference to dquot + * NOTE: If you change this function please check whether dqput_blocks() works right... + */ +void dqput(struct dquot *dquot) +{ + int ret; + + if (!dquot) + return; +#ifdef CONFIG_QUOTA_DEBUG + if (!atomic_read(&dquot->dq_count)) { + quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", + quotatypes[dquot->dq_type], dquot->dq_id); + BUG(); + } +#endif + dqstats_inc(DQST_DROPS); +we_slept: + spin_lock(&dq_list_lock); + if (atomic_read(&dquot->dq_count) > 1) { + /* We have more than one user... nothing to do */ + atomic_dec(&dquot->dq_count); + /* Releasing dquot during quotaoff phase? */ + if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) && + atomic_read(&dquot->dq_count) == 1) + wake_up(&dquot->dq_wait_unused); + spin_unlock(&dq_list_lock); + return; + } + /* Need to release dquot? 
*/ + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { + spin_unlock(&dq_list_lock); + /* Commit dquot before releasing */ + ret = dquot->dq_sb->dq_op->write_dquot(dquot); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't write quota structure" + " (error %d). Quota may get out of sync!", + ret); + /* + * We clear dirty bit anyway, so that we avoid + * infinite loop here + */ + spin_lock(&dq_list_lock); + clear_dquot_dirty(dquot); + spin_unlock(&dq_list_lock); + } + goto we_slept; + } + /* Clear flag in case dquot was inactive (something bad happened) */ + clear_dquot_dirty(dquot); + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); + dquot->dq_sb->dq_op->release_dquot(dquot); + goto we_slept; + } + atomic_dec(&dquot->dq_count); +#ifdef CONFIG_QUOTA_DEBUG + /* sanity check */ + BUG_ON(!list_empty(&dquot->dq_free)); +#endif + put_dquot_last(dquot); + spin_unlock(&dq_list_lock); +} +EXPORT_SYMBOL(dqput); + +struct dquot *dquot_alloc(struct super_block *sb, int type) +{ + return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); +} +EXPORT_SYMBOL(dquot_alloc); + +static struct dquot *get_empty_dquot(struct super_block *sb, int type) +{ + struct dquot *dquot; + + dquot = sb->dq_op->alloc_dquot(sb, type); + if(!dquot) + return NULL; + + mutex_init(&dquot->dq_lock); + INIT_LIST_HEAD(&dquot->dq_free); + INIT_LIST_HEAD(&dquot->dq_inuse); + INIT_HLIST_NODE(&dquot->dq_hash); + INIT_LIST_HEAD(&dquot->dq_dirty); + init_waitqueue_head(&dquot->dq_wait_unused); + dquot->dq_sb = sb; + dquot->dq_type = type; + atomic_set(&dquot->dq_count, 1); + + return dquot; +} + +/* + * Get reference to dquot + * + * Locking is slightly tricky here. We are guarded from parallel quotaoff() + * destroying our dquot by: + * a) checking for quota flags under dq_list_lock and + * b) getting a reference to dquot before we release dq_list_lock + */ +struct dquot *dqget(struct super_block *sb, unsigned int id, int type) +{ + unsigned int hashent = hashfn(sb, id, type); + struct dquot *dquot = NULL, *empty = NULL; + + if (!sb_has_quota_active(sb, type)) + return NULL; +we_slept: + spin_lock(&dq_list_lock); + spin_lock(&dq_state_lock); + if (!sb_has_quota_active(sb, type)) { + spin_unlock(&dq_state_lock); + spin_unlock(&dq_list_lock); + goto out; + } + spin_unlock(&dq_state_lock); + + dquot = find_dquot(hashent, sb, id, type); + if (!dquot) { + if (!empty) { + spin_unlock(&dq_list_lock); + empty = get_empty_dquot(sb, type); + if (!empty) + schedule(); /* Try to wait for a moment... */ + goto we_slept; + } + dquot = empty; + empty = NULL; + dquot->dq_id = id; + /* all dquots go on the inuse_list */ + put_inuse(dquot); + /* hash it first so it can be found */ + insert_dquot_hash(dquot); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_LOOKUPS); + } else { + if (!atomic_read(&dquot->dq_count)) + remove_free_dquot(dquot); + atomic_inc(&dquot->dq_count); + spin_unlock(&dq_list_lock); + dqstats_inc(DQST_CACHE_HITS); + dqstats_inc(DQST_LOOKUPS); + } + /* Wait for dq_lock - after this we know that either dquot_release() is + * already finished or it will be canceled due to dq_count > 1 test */ + wait_on_dquot(dquot); + /* Read the dquot / allocate space in quota file */ + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && + sb->dq_op->acquire_dquot(dquot) < 0) { + dqput(dquot); + dquot = NULL; + goto out; + } +#ifdef CONFIG_QUOTA_DEBUG + BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? 
*/ +#endif +out: + if (empty) + do_destroy_dquot(empty); + + return dquot; +} +EXPORT_SYMBOL(dqget); + +static int dqinit_needed(struct inode *inode, int type) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + return 0; + if (type != -1) + return !inode->i_dquot[type]; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (!inode->i_dquot[cnt]) + return 1; + return 0; +} + +/* This routine is guarded by dqonoff_mutex mutex */ +static void add_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode, *old_inode = NULL; +#ifdef CONFIG_QUOTA_DEBUG + int reserved = 0; +#endif + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + !atomic_read(&inode->i_writecount) || + !dqinit_needed(inode, type)) { + spin_unlock(&inode->i_lock); + continue; + } +#ifdef CONFIG_QUOTA_DEBUG + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; +#endif + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + + iput(old_inode); + __dquot_initialize(inode, type); + + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock We cannot iput the inode now as we can be + * holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ + old_inode = inode; + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(old_inode); + +#ifdef CONFIG_QUOTA_DEBUG + if (reserved) { + quota_error(sb, "Writes happened before quota was turned on " + "thus quota information is probably inconsistent. " + "Please run quotacheck(8)"); + } +#endif +} + +/* + * Return 0 if dqput() won't block. + * (note that 1 doesn't necessarily mean blocking) + */ +static inline int dqput_blocks(struct dquot *dquot) +{ + if (atomic_read(&dquot->dq_count) <= 1) + return 1; + return 0; +} + +/* + * Remove references to dquots from inode and add dquot to list for freeing + * if we have the last reference to dquot + * We can't race with anybody because we hold dqptr_sem for writing... + */ +static int remove_inode_dquot_ref(struct inode *inode, int type, + struct list_head *tofree_head) +{ + struct dquot *dquot = inode->i_dquot[type]; + + inode->i_dquot[type] = NULL; + if (dquot) { + if (dqput_blocks(dquot)) { +#ifdef CONFIG_QUOTA_DEBUG + if (atomic_read(&dquot->dq_count) != 1) + quota_error(inode->i_sb, "Adding dquot with " + "dq_count %d to dispose list", + atomic_read(&dquot->dq_count)); +#endif + spin_lock(&dq_list_lock); + /* As dquot must have currently users it can't be on + * the free list... */ + list_add(&dquot->dq_free, tofree_head); + spin_unlock(&dq_list_lock); + return 1; + } + else + dqput(dquot); /* We have guaranteed we won't block */ + } + return 0; +} + +/* + * Free list of dquots + * Dquots are removed from inodes and no new references can be got so we are + * the only ones holding reference + */ +static void put_dquot_list(struct list_head *tofree_head) +{ + struct list_head *act_head; + struct dquot *dquot; + + act_head = tofree_head->next; + while (act_head != tofree_head) { + dquot = list_entry(act_head, struct dquot, dq_free); + act_head = act_head->next; + /* Remove dquot from the list so we won't have problems... 
*/ + list_del_init(&dquot->dq_free); + dqput(dquot); + } +} + +static void remove_dquot_ref(struct super_block *sb, int type, + struct list_head *tofree_head) +{ + struct inode *inode; + int reserved = 0; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + /* + * We have to scan also I_NEW inodes because they can already + * have quota pointer initialized. Luckily, we need to touch + * only quota pointers and these have separate locking + * (dqptr_sem). + */ + if (!IS_NOQUOTA(inode)) { + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; + remove_inode_dquot_ref(inode, type, tofree_head); + } + } + spin_unlock(&inode_sb_list_lock); +#ifdef CONFIG_QUOTA_DEBUG + if (reserved) { + printk(KERN_WARNING "VFS (%s): Writes happened after quota" + " was disabled thus quota information is probably " + "inconsistent. Please run quotacheck(8).\n", sb->s_id); + } +#endif +} + +/* Gather all references from inodes and drop them */ +static void drop_dquot_ref(struct super_block *sb, int type) +{ + LIST_HEAD(tofree_head); + + if (sb->dq_op) { + down_write(&sb_dqopt(sb)->dqptr_sem); + remove_dquot_ref(sb, type, &tofree_head); + up_write(&sb_dqopt(sb)->dqptr_sem); + put_dquot_list(&tofree_head); + } +} + +static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_curinodes += number; +} + +static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_curspace += number; +} + +static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) +{ + dquot->dq_dqb.dqb_rsvspace += number; +} + +/* + * Claim reserved quota space + */ +static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) +{ + if (dquot->dq_dqb.dqb_rsvspace < number) { + WARN_ON_ONCE(1); + number = dquot->dq_dqb.dqb_rsvspace; + } + dquot->dq_dqb.dqb_curspace += number; + dquot->dq_dqb.dqb_rsvspace -= number; +} + +static inline +void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) +{ + if (dquot->dq_dqb.dqb_rsvspace >= number) + dquot->dq_dqb.dqb_rsvspace -= number; + else { + WARN_ON_ONCE(1); + dquot->dq_dqb.dqb_rsvspace = 0; + } +} + +static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) +{ + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curinodes >= number) + dquot->dq_dqb.dqb_curinodes -= number; + else + dquot->dq_dqb.dqb_curinodes = 0; + if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) + dquot->dq_dqb.dqb_itime = (time_t) 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); +} + +static void dquot_decr_space(struct dquot *dquot, qsize_t number) +{ + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curspace >= number) + dquot->dq_dqb.dqb_curspace -= number; + else + dquot->dq_dqb.dqb_curspace = 0; + if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + dquot->dq_dqb.dqb_btime = (time_t) 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); +} + +static int warning_issued(struct dquot *dquot, const int warntype) +{ + int flag = (warntype == QUOTA_NL_BHARDWARN || + warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B : + ((warntype == QUOTA_NL_IHARDWARN || + warntype == QUOTA_NL_ISOFTLONGWARN) ? 
DQ_INODES_B : 0); + + if (!flag) + return 0; + return test_and_set_bit(flag, &dquot->dq_flags); +} + +#ifdef CONFIG_PRINT_QUOTA_WARNING +static int flag_print_warnings = 1; + +static int need_print_warning(struct dquot *dquot) +{ + if (!flag_print_warnings) + return 0; + + switch (dquot->dq_type) { + case USRQUOTA: + return current_fsuid() == dquot->dq_id; + case GRPQUOTA: + return in_group_p(dquot->dq_id); + } + return 0; +} + +/* Print warning to user which exceeded quota */ +static void print_warning(struct dquot *dquot, const int warntype) +{ + char *msg = NULL; + struct tty_struct *tty; + + if (warntype == QUOTA_NL_IHARDBELOW || + warntype == QUOTA_NL_ISOFTBELOW || + warntype == QUOTA_NL_BHARDBELOW || + warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) + return; + + tty = get_current_tty(); + if (!tty) + return; + tty_write_message(tty, dquot->dq_sb->s_id); + if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) + tty_write_message(tty, ": warning, "); + else + tty_write_message(tty, ": write failed, "); + tty_write_message(tty, quotatypes[dquot->dq_type]); + switch (warntype) { + case QUOTA_NL_IHARDWARN: + msg = " file limit reached.\r\n"; + break; + case QUOTA_NL_ISOFTLONGWARN: + msg = " file quota exceeded too long.\r\n"; + break; + case QUOTA_NL_ISOFTWARN: + msg = " file quota exceeded.\r\n"; + break; + case QUOTA_NL_BHARDWARN: + msg = " block limit reached.\r\n"; + break; + case QUOTA_NL_BSOFTLONGWARN: + msg = " block quota exceeded too long.\r\n"; + break; + case QUOTA_NL_BSOFTWARN: + msg = " block quota exceeded.\r\n"; + break; + } + tty_write_message(tty, msg); + tty_kref_put(tty); +} +#endif + +/* + * Write warnings to the console and send warning messages over netlink. + * + * Note that this function can sleep. 
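+ *
+ * Typical call pattern, as used by the allocation paths below
+ * (illustrative only):
+ *
+ *	char warntype[MAXQUOTAS];
+ *	... fill warntype[] via check_idq()/check_bdq() ...
+ *	flush_warnings(inode->i_dquot, warntype);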
+ */ +static void flush_warnings(struct dquot *const *dquots, char *warntype) +{ + struct dquot *dq; + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + dq = dquots[i]; + if (dq && warntype[i] != QUOTA_NL_NOWARN && + !warning_issued(dq, warntype[i])) { +#ifdef CONFIG_PRINT_QUOTA_WARNING + print_warning(dq, warntype[i]); +#endif + quota_send_warning(dq->dq_type, dq->dq_id, + dq->dq_sb->s_dev, warntype[i]); + } + } +} + +static int ignore_hardlimit(struct dquot *dquot) +{ + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; + + return capable(CAP_SYS_RESOURCE) && + (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || + !(info->dqi_flags & V1_DQF_RSQUASH)); +} + +/* needs dq_data_lock */ +static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) +{ + qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; + + *warntype = QUOTA_NL_NOWARN; + if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) + return 0; + + if (dquot->dq_dqb.dqb_ihardlimit && + newinodes > dquot->dq_dqb.dqb_ihardlimit && + !ignore_hardlimit(dquot)) { + *warntype = QUOTA_NL_IHARDWARN; + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_isoftlimit && + newinodes > dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_itime && + get_seconds() >= dquot->dq_dqb.dqb_itime && + !ignore_hardlimit(dquot)) { + *warntype = QUOTA_NL_ISOFTLONGWARN; + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_isoftlimit && + newinodes > dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_itime == 0) { + *warntype = QUOTA_NL_ISOFTWARN; + dquot->dq_dqb.dqb_itime = get_seconds() + + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; + } + + return 0; +} + +/* needs dq_data_lock */ +static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) +{ + qsize_t tspace; + struct super_block *sb = dquot->dq_sb; + + *warntype = QUOTA_NL_NOWARN; + if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) + return 0; + + tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + + space; + + if (dquot->dq_dqb.dqb_bhardlimit && + tspace > dquot->dq_dqb.dqb_bhardlimit && + !ignore_hardlimit(dquot)) { + if (!prealloc) + *warntype = QUOTA_NL_BHARDWARN; + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_bsoftlimit && + tspace > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_btime && + get_seconds() >= dquot->dq_dqb.dqb_btime && + !ignore_hardlimit(dquot)) { + if (!prealloc) + *warntype = QUOTA_NL_BSOFTLONGWARN; + return -EDQUOT; + } + + if (dquot->dq_dqb.dqb_bsoftlimit && + tspace > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_btime == 0) { + if (!prealloc) { + *warntype = QUOTA_NL_BSOFTWARN; + dquot->dq_dqb.dqb_btime = get_seconds() + + sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace; + } + else + /* + * We don't allow preallocation to exceed softlimit so exceeding will + * be always printed + */ + return -EDQUOT; + } + + return 0; +} + +static int info_idq_free(struct dquot *dquot, qsize_t inodes) +{ + qsize_t newinodes; + + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || + !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type)) + return QUOTA_NL_NOWARN; + + newinodes = dquot->dq_dqb.dqb_curinodes - inodes; + if (newinodes <= dquot->dq_dqb.dqb_isoftlimit) + return QUOTA_NL_ISOFTBELOW; + if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && + newinodes < dquot->dq_dqb.dqb_ihardlimit) + return QUOTA_NL_IHARDBELOW; + return 
QUOTA_NL_NOWARN; +} + +static int info_bdq_free(struct dquot *dquot, qsize_t space) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_NOWARN; + + if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_BSOFTBELOW; + if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit && + dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit) + return QUOTA_NL_BHARDBELOW; + return QUOTA_NL_NOWARN; +} + +static int dquot_active(const struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (IS_NOQUOTA(inode)) + return 0; + return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); +} + +/* + * Initialize quota pointers in inode + * + * We do things in a bit complicated way but by that we avoid calling + * dqget() and thus filesystem callbacks under dqptr_sem. + * + * It is better to call this function outside of any transaction as it + * might need a lot of space in journal for dquot structure allocation. + */ +static void __dquot_initialize(struct inode *inode, int type) +{ + unsigned int id = 0; + int cnt; + struct dquot *got[MAXQUOTAS]; + struct super_block *sb = inode->i_sb; + qsize_t rsv; + + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) + return; + + /* First get references to structures we might need. */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + got[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + switch (cnt) { + case USRQUOTA: + id = inode->i_uid; + break; + case GRPQUOTA: + id = inode->i_gid; + break; + } + got[cnt] = dqget(sb, id, cnt); + } + + down_write(&sb_dqopt(sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) + goto out_err; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + /* Avoid races with quotaoff() */ + if (!sb_has_quota_active(sb, cnt)) + continue; + /* We could race with quotaon or dqget() could have failed */ + if (!got[cnt]) + continue; + if (!inode->i_dquot[cnt]) { + inode->i_dquot[cnt] = got[cnt]; + got[cnt] = NULL; + /* + * Make quota reservation system happy if someone + * did a write before quota was turned on + */ + rsv = inode_get_rsv_space(inode); + if (unlikely(rsv)) + dquot_resv_space(inode->i_dquot[cnt], rsv); + } + } +out_err: + up_write(&sb_dqopt(sb)->dqptr_sem); + /* Drop unused references */ + dqput_all(got); +} + +void dquot_initialize(struct inode *inode) +{ + __dquot_initialize(inode, -1); +} +EXPORT_SYMBOL(dquot_initialize); + +/* + * Release all quotas referenced by inode + */ +static void __dquot_drop(struct inode *inode) +{ + int cnt; + struct dquot *put[MAXQUOTAS]; + + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + put[cnt] = inode->i_dquot[cnt]; + inode->i_dquot[cnt] = NULL; + } + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + dqput_all(put); +} + +void dquot_drop(struct inode *inode) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + return; + + /* + * Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway. 
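+ *
+ * A filesystem typically calls dquot_drop() from its inode eviction
+ * path, once the inode's space and inode usage have been returned via
+ * __dquot_free_space() and dquot_free_inode().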
+ */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + break; + } + + if (cnt < MAXQUOTAS) + __dquot_drop(inode); +} +EXPORT_SYMBOL(dquot_drop); + +/* + * inode_reserved_space is managed internally by quota, and protected by + * i_lock similar to i_blocks+i_bytes. + */ +static qsize_t *inode_reserved_space(struct inode * inode) +{ + /* Filesystem must explicitly define it's own method in order to use + * quota reservation interface */ + BUG_ON(!inode->i_sb->dq_op->get_reserved_space); + return inode->i_sb->dq_op->get_reserved_space(inode); +} + +void inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_add_rsv_space); + +void inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_claim_rsv_space); + +void inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_sub_rsv_space); + +static qsize_t inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); + spin_unlock(&inode->i_lock); + return ret; +} + +static void inode_incr_space(struct inode *inode, qsize_t number, + int reserve) +{ + if (reserve) + inode_add_rsv_space(inode, number); + else + inode_add_bytes(inode, number); +} + +static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) +{ + if (reserve) + inode_sub_rsv_space(inode, number); + else + inode_sub_bytes(inode, number); +} + +/* + * This functions updates i_blocks+i_bytes fields and quota information + * (together with appropriate checks). + * + * NOTE: We absolutely rely on the fact that caller dirties the inode + * (usually helpers in quotaops.h care about this) and holds a handle for + * the current transaction so that dquot write and inode write go into the + * same transaction. 
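+ *
+ * Most filesystems do not call __dquot_alloc_space() directly but use
+ * the inline wrappers in <linux/quotaops.h> (dquot_alloc_space() and
+ * friends), which pass the appropriate DQUOT_SPACE_* flags.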
+ */ + +/* + * This operation can block, but only after everything is updated + */ +int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) +{ + int cnt, ret = 0; + char warntype[MAXQUOTAS]; + int warn = flags & DQUOT_SPACE_WARN; + int reserve = flags & DQUOT_SPACE_RESERVE; + int nofail = flags & DQUOT_SPACE_NOFAIL; + + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex + */ + if (!dquot_active(inode)) { + inode_incr_space(inode, number, reserve); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + warntype[cnt] = QUOTA_NL_NOWARN; + + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + ret = check_bdq(inode->i_dquot[cnt], number, !warn, + warntype+cnt); + if (ret && !nofail) { + spin_unlock(&dq_data_lock); + goto out_flush_warn; + } + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + if (reserve) + dquot_resv_space(inode->i_dquot[cnt], number); + else + dquot_incr_space(inode->i_dquot[cnt], number); + } + inode_incr_space(inode, number, reserve); + spin_unlock(&dq_data_lock); + + if (reserve) + goto out_flush_warn; + mark_all_dquot_dirty(inode->i_dquot); +out_flush_warn: + flush_warnings(inode->i_dquot, warntype); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: + return ret; +} +EXPORT_SYMBOL(__dquot_alloc_space); + +/* + * This operation can block, but only after everything is updated + */ +int dquot_alloc_inode(const struct inode *inode) +{ + int cnt, ret = 0; + char warntype[MAXQUOTAS]; + + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) + return 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + warntype[cnt] = QUOTA_NL_NOWARN; + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); + if (ret) + goto warn_put_all; + } + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + dquot_incr_inodes(inode->i_dquot[cnt], 1); + } + +warn_put_all: + spin_unlock(&dq_data_lock); + if (ret == 0) + mark_all_dquot_dirty(inode->i_dquot); + flush_warnings(inode->i_dquot, warntype); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + return ret; +} +EXPORT_SYMBOL(dquot_alloc_inode); + +/* + * Convert in-memory reserved quotas to real consumed quotas + */ +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +{ + int cnt; + + if (!dquot_active(inode)) { + inode_claim_rsv_space(inode, number); + return 0; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + /* Claim reserved quotas to allocated quotas */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + dquot_claim_reserved_space(inode->i_dquot[cnt], + number); + } + /* Update inode bytes */ + inode_claim_rsv_space(inode, number); + spin_unlock(&dq_data_lock); + mark_all_dquot_dirty(inode->i_dquot); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + return 0; +} +EXPORT_SYMBOL(dquot_claim_space_nodirty); + +/* + * This operation can block, but only after everything is updated + */ +void __dquot_free_space(struct inode *inode, qsize_t number, int flags) +{ + unsigned int cnt; + char warntype[MAXQUOTAS]; + int reserve = flags & DQUOT_SPACE_RESERVE; + + /* First test 
before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) { + inode_decr_space(inode, number, reserve); + return; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); + if (reserve) + dquot_free_reserved_space(inode->i_dquot[cnt], number); + else + dquot_decr_space(inode->i_dquot[cnt], number); + } + inode_decr_space(inode, number, reserve); + spin_unlock(&dq_data_lock); + + if (reserve) + goto out_unlock; + mark_all_dquot_dirty(inode->i_dquot); +out_unlock: + flush_warnings(inode->i_dquot, warntype); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +} +EXPORT_SYMBOL(__dquot_free_space); + +/* + * This operation can block, but only after everything is updated + */ +void dquot_free_inode(const struct inode *inode) +{ + unsigned int cnt; + char warntype[MAXQUOTAS]; + + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) + return; + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!inode->i_dquot[cnt]) + continue; + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); + dquot_decr_inodes(inode->i_dquot[cnt], 1); + } + spin_unlock(&dq_data_lock); + mark_all_dquot_dirty(inode->i_dquot); + flush_warnings(inode->i_dquot, warntype); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +} +EXPORT_SYMBOL(dquot_free_inode); + +/* + * Transfer the number of inode and blocks from one diskquota to an other. + * On success, dquot references in transfer_to are consumed and references + * to original dquots that need to be released are placed there. On failure, + * references are kept untouched. + * + * This operation can block, but only after everything is updated + * A transaction must be started when entering this function. + * + */ +int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) +{ + qsize_t space, cur_space; + qsize_t rsv_space = 0; + struct dquot *transfer_from[MAXQUOTAS] = {}; + int cnt, ret = 0; + char is_valid[MAXQUOTAS] = {}; + char warntype_to[MAXQUOTAS]; + char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; + + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (IS_NOQUOTA(inode)) + return 0; + /* Initialize the arrays */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + warntype_to[cnt] = QUOTA_NL_NOWARN; + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + return 0; + } + spin_lock(&dq_data_lock); + cur_space = inode_get_bytes(inode); + rsv_space = inode_get_rsv_space(inode); + space = cur_space + rsv_space; + /* Build the transfer_from list and check the limits */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + /* + * Skip changes for same uid or gid or for turned off quota-type. 
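+ * Such entries arrive here as NULL pointers: dquot_transfer() below
+ * only fills transfer_to[] for an id that actually changes.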
+ */ + if (!transfer_to[cnt]) + continue; + /* Avoid races with quotaoff() */ + if (!sb_has_quota_active(inode->i_sb, cnt)) + continue; + is_valid[cnt] = 1; + transfer_from[cnt] = inode->i_dquot[cnt]; + ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); + if (ret) + goto over_quota; + ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); + if (ret) + goto over_quota; + } + + /* + * Finally perform the needed transfer from transfer_from to transfer_to + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!is_valid[cnt]) + continue; + /* Due to IO error we might not have transfer_from[] structure */ + if (transfer_from[cnt]) { + warntype_from_inodes[cnt] = + info_idq_free(transfer_from[cnt], 1); + warntype_from_space[cnt] = + info_bdq_free(transfer_from[cnt], space); + dquot_decr_inodes(transfer_from[cnt], 1); + dquot_decr_space(transfer_from[cnt], cur_space); + dquot_free_reserved_space(transfer_from[cnt], + rsv_space); + } + + dquot_incr_inodes(transfer_to[cnt], 1); + dquot_incr_space(transfer_to[cnt], cur_space); + dquot_resv_space(transfer_to[cnt], rsv_space); + + inode->i_dquot[cnt] = transfer_to[cnt]; + } + spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + + mark_all_dquot_dirty(transfer_from); + mark_all_dquot_dirty(transfer_to); + flush_warnings(transfer_to, warntype_to); + flush_warnings(transfer_from, warntype_from_inodes); + flush_warnings(transfer_from, warntype_from_space); + /* Pass back references to put */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (is_valid[cnt]) + transfer_to[cnt] = transfer_from[cnt]; + return 0; +over_quota: + spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(transfer_to, warntype_to); + return ret; +} +EXPORT_SYMBOL(__dquot_transfer); + +/* Wrapper for transferring ownership of an inode for uid/gid only + * Called from FSXXX_setattr() + */ +int dquot_transfer(struct inode *inode, struct iattr *iattr) +{ + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct super_block *sb = inode->i_sb; + int ret; + + if (!dquot_active(inode)) + return 0; + + if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) + transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA); + if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) + transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA); + + ret = __dquot_transfer(inode, transfer_to); + dqput_all(transfer_to); + return ret; +} +EXPORT_SYMBOL(dquot_transfer); + +/* + * Write info of quota file to disk + */ +int dquot_commit_info(struct super_block *sb, int type) +{ + int ret; + struct quota_info *dqopt = sb_dqopt(sb); + + mutex_lock(&dqopt->dqio_mutex); + ret = dqopt->ops[type]->write_file_info(sb, type); + mutex_unlock(&dqopt->dqio_mutex); + return ret; +} +EXPORT_SYMBOL(dquot_commit_info); + +/* + * Definitions of diskquota operations. + */ +const struct dquot_operations dquot_operations = { + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; +EXPORT_SYMBOL(dquot_operations); + +/* + * Generic helper for ->open on filesystems supporting disk quotas. 
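+ *
+ * A filesystem normally wires this up in its file_operations, for
+ * example (sketch only, "foo" is a placeholder filesystem):
+ *
+ *	const struct file_operations foo_file_operations = {
+ *		.open	= dquot_file_open,
+ *		...
+ *	};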
+ */ +int dquot_file_open(struct inode *inode, struct file *file) +{ + int error; + + error = generic_file_open(inode, file); + if (!error && (file->f_mode & FMODE_WRITE)) + dquot_initialize(inode); + return error; +} +EXPORT_SYMBOL(dquot_file_open); + +/* + * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) + */ +int dquot_disable(struct super_block *sb, int type, unsigned int flags) +{ + int cnt, ret = 0; + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *toputinode[MAXQUOTAS]; + + /* Cannot turn off usage accounting without turning off limits, or + * suspend quotas and simultaneously turn quotas off. */ + if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) + || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | + DQUOT_USAGE_ENABLED))) + return -EINVAL; + + /* We need to serialize quota_off() for device */ + mutex_lock(&dqopt->dqonoff_mutex); + + /* + * Skip everything if there's nothing to do. We have to do this because + * sometimes we are called when fill_super() failed and calling + * sync_fs() in such cases does no good. + */ + if (!sb_any_quota_loaded(sb)) { + mutex_unlock(&dqopt->dqonoff_mutex); + return 0; + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + toputinode[cnt] = NULL; + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_loaded(sb, cnt)) + continue; + + if (flags & DQUOT_SUSPENDED) { + spin_lock(&dq_state_lock); + dqopt->flags |= + dquot_state_flag(DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); + } else { + spin_lock(&dq_state_lock); + dqopt->flags &= ~dquot_state_flag(flags, cnt); + /* Turning off suspended quotas? */ + if (!sb_has_quota_loaded(sb, cnt) && + sb_has_quota_suspended(sb, cnt)) { + dqopt->flags &= ~dquot_state_flag( + DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); + iput(dqopt->files[cnt]); + dqopt->files[cnt] = NULL; + continue; + } + spin_unlock(&dq_state_lock); + } + + /* We still have to keep quota loaded? */ + if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) + continue; + + /* Note: these are blocking operations */ + drop_dquot_ref(sb, cnt); + invalidate_dquots(sb, cnt); + /* + * Now all dquots should be invalidated, all writes done so we + * should be only users of the info. No locks needed. + */ + if (info_dirty(&dqopt->info[cnt])) + sb->dq_op->write_info(sb, cnt); + if (dqopt->ops[cnt]->free_file_info) + dqopt->ops[cnt]->free_file_info(sb, cnt); + put_quota_format(dqopt->info[cnt].dqi_format); + + toputinode[cnt] = dqopt->files[cnt]; + if (!sb_has_quota_loaded(sb, cnt)) + dqopt->files[cnt] = NULL; + dqopt->info[cnt].dqi_flags = 0; + dqopt->info[cnt].dqi_igrace = 0; + dqopt->info[cnt].dqi_bgrace = 0; + dqopt->ops[cnt] = NULL; + } + mutex_unlock(&dqopt->dqonoff_mutex); + + /* Skip syncing and setting flags if quota files are hidden */ + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + goto put_inodes; + + /* Sync the superblock so that buffers with quota data are written to + * disk (and so userspace sees correct data afterwards). */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + /* Now the quota files are just ordinary files and we can set the + * inode flags back. Moreover we discard the pagecache so that + * userspace sees the writes we did bypassing the pagecache. 
We + * must also discard the blockdev buffers so that we see the + * changes done by userspace on the next quotaon() */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (toputinode[cnt]) { + mutex_lock(&dqopt->dqonoff_mutex); + /* If quota was reenabled in the meantime, we have + * nothing to do */ + if (!sb_has_quota_loaded(sb, cnt)) { + mutex_lock_nested(&toputinode[cnt]->i_mutex, + I_MUTEX_QUOTA); + toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | + S_NOATIME | S_NOQUOTA); + truncate_inode_pages(&toputinode[cnt]->i_data, + 0); + mutex_unlock(&toputinode[cnt]->i_mutex); + mark_inode_dirty_sync(toputinode[cnt]); + } + mutex_unlock(&dqopt->dqonoff_mutex); + } + if (sb->s_bdev) + invalidate_bdev(sb->s_bdev); +put_inodes: + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (toputinode[cnt]) { + /* On remount RO, we keep the inode pointer so that we + * can reenable quota on the subsequent remount RW. We + * have to check 'flags' variable and not use sb_has_ + * function because another quotaon / quotaoff could + * change global state before we got here. We refuse + * to suspend quotas when there is pending delete on + * the quota file... */ + if (!(flags & DQUOT_SUSPENDED)) + iput(toputinode[cnt]); + else if (!toputinode[cnt]->i_nlink) + ret = -EBUSY; + } + return ret; +} +EXPORT_SYMBOL(dquot_disable); + +int dquot_quota_off(struct super_block *sb, int type) +{ + return dquot_disable(sb, type, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); +} +EXPORT_SYMBOL(dquot_quota_off); + +/* + * Turn quotas on on a device + */ + +/* + * Helper function to turn quotas on when we already have the inode of + * quota file and no quota information is loaded. + */ +static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + struct quota_format_type *fmt = find_quota_format(format_id); + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + int error; + int oldflags = -1; + + if (!fmt) + return -ESRCH; + if (!S_ISREG(inode->i_mode)) { + error = -EACCES; + goto out_fmt; + } + if (IS_RDONLY(inode)) { + error = -EROFS; + goto out_fmt; + } + if (!sb->s_op->quota_write || !sb->s_op->quota_read) { + error = -EINVAL; + goto out_fmt; + } + /* Usage always has to be set... */ + if (!(flags & DQUOT_USAGE_ENABLED)) { + error = -EINVAL; + goto out_fmt; + } + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* As we bypass the pagecache we must now flush all the + * dirty data and invalidate caches so that kernel sees + * changes from userspace. It is not enough to just flush + * the quota file since if blocksize < pagesize, invalidation + * of the cache could fail because of other unrelated dirty + * data */ + sync_filesystem(sb); + invalidate_bdev(sb->s_bdev); + } + mutex_lock(&dqopt->dqonoff_mutex); + if (sb_has_quota_loaded(sb, type)) { + error = -EBUSY; + goto out_lock; + } + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* We don't want quota and atime on quota files (deadlocks + * possible) Also nobody should write to the file - we use + * special IO operations which ignore the immutable bit. 
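+ * Setting S_NOQUOTA below is what makes IS_NOQUOTA() true for the
+ * quota file's own inode, so the quota file is never quota-accounted
+ * itself.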
*/ + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | + S_NOQUOTA); + inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + mutex_unlock(&inode->i_mutex); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ + __dquot_drop(inode); + } + + error = -EIO; + dqopt->files[type] = igrab(inode); + if (!dqopt->files[type]) + goto out_lock; + error = -EINVAL; + if (!fmt->qf_ops->check_quota_file(sb, type)) + goto out_file_init; + + dqopt->ops[type] = fmt->qf_ops; + dqopt->info[type].dqi_format = fmt; + dqopt->info[type].dqi_fmt_id = format_id; + INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); + mutex_lock(&dqopt->dqio_mutex); + error = dqopt->ops[type]->read_file_info(sb, type); + if (error < 0) { + mutex_unlock(&dqopt->dqio_mutex); + goto out_file_init; + } + mutex_unlock(&dqopt->dqio_mutex); + spin_lock(&dq_state_lock); + dqopt->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); + + add_dquot_ref(sb, type); + mutex_unlock(&dqopt->dqonoff_mutex); + + return 0; + +out_file_init: + dqopt->files[type] = NULL; + iput(inode); +out_lock: + if (oldflags != -1) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + /* Set the flags back (in the case of accidental quotaon() + * on a wrong file we don't want to mess up the flags) */ + inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); + inode->i_flags |= oldflags; + mutex_unlock(&inode->i_mutex); + } + mutex_unlock(&dqopt->dqonoff_mutex); +out_fmt: + put_quota_format(fmt); + + return error; +} + +/* Reenable quotas on remount RW */ +int dquot_resume(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *inode; + int ret = 0, cnt; + unsigned int flags; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + + mutex_lock(&dqopt->dqonoff_mutex); + if (!sb_has_quota_suspended(sb, cnt)) { + mutex_unlock(&dqopt->dqonoff_mutex); + continue; + } + inode = dqopt->files[cnt]; + dqopt->files[cnt] = NULL; + spin_lock(&dq_state_lock); + flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, + cnt); + dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); + spin_unlock(&dq_state_lock); + mutex_unlock(&dqopt->dqonoff_mutex); + + flags = dquot_generic_flag(flags, cnt); + ret = vfs_load_quota_inode(inode, cnt, + dqopt->info[cnt].dqi_fmt_id, flags); + iput(inode); + } + + return ret; +} +EXPORT_SYMBOL(dquot_resume); + +int dquot_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int error = security_quota_on(path->dentry); + if (error) + return error; + /* Quota file not on the same filesystem? */ + if (path->mnt->mnt_sb != sb) + error = -EXDEV; + else + error = vfs_load_quota_inode(path->dentry->d_inode, type, + format_id, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + return error; +} +EXPORT_SYMBOL(dquot_quota_on); + +/* + * More powerful function for turning on quotas allowing setting + * of individual quota flags + */ +int dquot_enable(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + int ret = 0; + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + + /* Just unsuspend quotas? */ + BUG_ON(flags & DQUOT_SUSPENDED); + + if (!flags) + return 0; + /* Just updating flags needed? */ + if (sb_has_quota_loaded(sb, type)) { + mutex_lock(&dqopt->dqonoff_mutex); + /* Now do a reliable test... 
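+ * - the sb_has_quota_loaded() check above ran without dqonoff_mutex
+ * and may have raced with a concurrent quotaoff.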
*/ + if (!sb_has_quota_loaded(sb, type)) { + mutex_unlock(&dqopt->dqonoff_mutex); + goto load_quota; + } + if (flags & DQUOT_USAGE_ENABLED && + sb_has_quota_usage_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + if (flags & DQUOT_LIMITS_ENABLED && + sb_has_quota_limits_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + spin_lock(&dq_state_lock); + sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); +out_lock: + mutex_unlock(&dqopt->dqonoff_mutex); + return ret; + } + +load_quota: + return vfs_load_quota_inode(inode, type, format_id, flags); +} +EXPORT_SYMBOL(dquot_enable); + +/* + * This function is used when filesystem needs to initialize quotas + * during mount time. + */ +int dquot_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type) +{ + struct dentry *dentry; + int error; + + mutex_lock(&sb->s_root->d_inode->i_mutex); + dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); + mutex_unlock(&sb->s_root->d_inode->i_mutex); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (!dentry->d_inode) { + error = -ENOENT; + goto out; + } + + error = security_quota_on(dentry); + if (!error) + error = vfs_load_quota_inode(dentry->d_inode, type, format_id, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + +out: + dput(dentry); + return error; +} +EXPORT_SYMBOL(dquot_quota_on_mount); + +static inline qsize_t qbtos(qsize_t blocks) +{ + return blocks << QIF_DQBLKSIZE_BITS; +} + +static inline qsize_t stoqb(qsize_t space) +{ + return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; +} + +/* Generic routine for getting common part of quota structure */ +static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +{ + struct mem_dqblk *dm = &dquot->dq_dqb; + + memset(di, 0, sizeof(*di)); + di->d_version = FS_DQUOT_VERSION; + di->d_flags = dquot->dq_type == USRQUOTA ? 
+ FS_USER_QUOTA : FS_GROUP_QUOTA; + di->d_id = dquot->dq_id; + + spin_lock(&dq_data_lock); + di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); + di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit); + di->d_ino_hardlimit = dm->dqb_ihardlimit; + di->d_ino_softlimit = dm->dqb_isoftlimit; + di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace; + di->d_icount = dm->dqb_curinodes; + di->d_btimer = dm->dqb_btime; + di->d_itimer = dm->dqb_itime; + spin_unlock(&dq_data_lock); +} + +int dquot_get_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *di) +{ + struct dquot *dquot; + + dquot = dqget(sb, id, type); + if (!dquot) + return -ESRCH; + do_get_dqblk(dquot, di); + dqput(dquot); + + return 0; +} +EXPORT_SYMBOL(dquot_get_dqblk); + +#define VFS_FS_DQ_MASK \ + (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ + FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \ + FS_DQ_BTIMER | FS_DQ_ITIMER) + +/* Generic routine for setting common part of quota structure */ +static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +{ + struct mem_dqblk *dm = &dquot->dq_dqb; + int check_blim = 0, check_ilim = 0; + struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; + + if (di->d_fieldmask & ~VFS_FS_DQ_MASK) + return -EINVAL; + + if (((di->d_fieldmask & FS_DQ_BSOFT) && + (di->d_blk_softlimit > dqi->dqi_maxblimit)) || + ((di->d_fieldmask & FS_DQ_BHARD) && + (di->d_blk_hardlimit > dqi->dqi_maxblimit)) || + ((di->d_fieldmask & FS_DQ_ISOFT) && + (di->d_ino_softlimit > dqi->dqi_maxilimit)) || + ((di->d_fieldmask & FS_DQ_IHARD) && + (di->d_ino_hardlimit > dqi->dqi_maxilimit))) + return -ERANGE; + + spin_lock(&dq_data_lock); + if (di->d_fieldmask & FS_DQ_BCOUNT) { + dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace; + check_blim = 1; + set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_BSOFT) + dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit); + if (di->d_fieldmask & FS_DQ_BHARD) + dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit); + if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) { + check_blim = 1; + set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_ICOUNT) { + dm->dqb_curinodes = di->d_icount; + check_ilim = 1; + set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_ISOFT) + dm->dqb_isoftlimit = di->d_ino_softlimit; + if (di->d_fieldmask & FS_DQ_IHARD) + dm->dqb_ihardlimit = di->d_ino_hardlimit; + if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) { + check_ilim = 1; + set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_BTIMER) { + dm->dqb_btime = di->d_btimer; + check_blim = 1; + set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + } + + if (di->d_fieldmask & FS_DQ_ITIMER) { + dm->dqb_itime = di->d_itimer; + check_ilim = 1; + set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + } + + if (check_blim) { + if (!dm->dqb_bsoftlimit || + dm->dqb_curspace < dm->dqb_bsoftlimit) { + dm->dqb_btime = 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } else if (!(di->d_fieldmask & FS_DQ_BTIMER)) + /* Set grace only if user hasn't provided his own... */ + dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; + } + if (check_ilim) { + if (!dm->dqb_isoftlimit || + dm->dqb_curinodes < dm->dqb_isoftlimit) { + dm->dqb_itime = 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } else if (!(di->d_fieldmask & FS_DQ_ITIMER)) + /* Set grace only if user hasn't provided his own... 
*/ + dm->dqb_itime = get_seconds() + dqi->dqi_igrace; + } + if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || + dm->dqb_isoftlimit) + clear_bit(DQ_FAKE_B, &dquot->dq_flags); + else + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + mark_dquot_dirty(dquot); + + return 0; +} + +int dquot_set_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *di) +{ + struct dquot *dquot; + int rc; + + dquot = dqget(sb, id, type); + if (!dquot) { + rc = -ESRCH; + goto out; + } + rc = do_set_dqblk(dquot, di); + dqput(dquot); +out: + return rc; +} +EXPORT_SYMBOL(dquot_set_dqblk); + +/* Generic routine for getting common part of quota file information */ +int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +{ + struct mem_dqinfo *mi; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + if (!sb_has_quota_active(sb, type)) { + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return -ESRCH; + } + mi = sb_dqopt(sb)->info + type; + spin_lock(&dq_data_lock); + ii->dqi_bgrace = mi->dqi_bgrace; + ii->dqi_igrace = mi->dqi_igrace; + ii->dqi_flags = mi->dqi_flags & DQF_MASK; + ii->dqi_valid = IIF_ALL; + spin_unlock(&dq_data_lock); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return 0; +} +EXPORT_SYMBOL(dquot_get_dqinfo); + +/* Generic routine for setting common part of quota file information */ +int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +{ + struct mem_dqinfo *mi; + int err = 0; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + if (!sb_has_quota_active(sb, type)) { + err = -ESRCH; + goto out; + } + mi = sb_dqopt(sb)->info + type; + spin_lock(&dq_data_lock); + if (ii->dqi_valid & IIF_BGRACE) + mi->dqi_bgrace = ii->dqi_bgrace; + if (ii->dqi_valid & IIF_IGRACE) + mi->dqi_igrace = ii->dqi_igrace; + if (ii->dqi_valid & IIF_FLAGS) + mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | + (ii->dqi_flags & DQF_MASK); + spin_unlock(&dq_data_lock); + mark_info_dirty(sb, type); + /* Force write to disk */ + sb->dq_op->write_info(sb, type); +out: + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return err; +} +EXPORT_SYMBOL(dquot_set_dqinfo); + +const struct quotactl_ops dquot_quotactl_ops = { + .quota_on = dquot_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +EXPORT_SYMBOL(dquot_quotactl_ops); + +static int do_proc_dqstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int type = (int *)table->data - dqstats.stat; + + /* Update global table */ + dqstats.stat[type] = + percpu_counter_sum_positive(&dqstats.counter[type]); + return proc_dointvec(table, write, buffer, lenp, ppos); +} + +static ctl_table fs_dqstats_table[] = { + { + .procname = "lookups", + .data = &dqstats.stat[DQST_LOOKUPS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "drops", + .data = &dqstats.stat[DQST_DROPS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "reads", + .data = &dqstats.stat[DQST_READS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "writes", + .data = &dqstats.stat[DQST_WRITES], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "cache_hits", + .data = &dqstats.stat[DQST_CACHE_HITS], + .maxlen = sizeof(int), + .mode = 0444, + 
.proc_handler = do_proc_dqstats, + }, + { + .procname = "allocated_dquots", + .data = &dqstats.stat[DQST_ALLOC_DQUOTS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "free_dquots", + .data = &dqstats.stat[DQST_FREE_DQUOTS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, + { + .procname = "syncs", + .data = &dqstats.stat[DQST_SYNCS], + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = do_proc_dqstats, + }, +#ifdef CONFIG_PRINT_QUOTA_WARNING + { + .procname = "warnings", + .data = &flag_print_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { }, +}; + +static ctl_table fs_table[] = { + { + .procname = "quota", + .mode = 0555, + .child = fs_dqstats_table, + }, + { }, +}; + +static ctl_table sys_table[] = { + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { }, +}; + +static int __init dquot_init(void) +{ + int i, ret; + unsigned long nr_hash, order; + + printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); + + register_sysctl_table(sys_table); + + dquot_cachep = kmem_cache_create("dquot", + sizeof(struct dquot), sizeof(unsigned long) * 4, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + NULL); + + order = 0; + dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order); + if (!dquot_hash) + panic("Cannot create dquot hash table"); + + for (i = 0; i < _DQST_DQSTAT_LAST; i++) { + ret = percpu_counter_init(&dqstats.counter[i], 0); + if (ret) + panic("Cannot create dquot stat counters"); + } + + /* Find power-of-two hlist_heads which can fit into allocation */ + nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); + dq_hash_bits = 0; + do { + dq_hash_bits++; + } while (nr_hash >> dq_hash_bits); + dq_hash_bits--; + + nr_hash = 1UL << dq_hash_bits; + dq_hash_mask = nr_hash - 1; + for (i = 0; i < nr_hash; i++) + INIT_HLIST_HEAD(dquot_hash + i); + + printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + register_shrinker(&dqcache_shrinker); + + return 0; +} +module_init(dquot_init); diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c new file mode 100644 index 00000000..d67908b4 --- /dev/null +++ b/fs/quota/netlink.c @@ -0,0 +1,96 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Netlink family structure for quota */ +static struct genl_family quota_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "VFS_DQUOT", + .version = 1, + .maxattr = QUOTA_NL_A_MAX, +}; + +/** + * quota_send_warning - Send warning to userspace about exceeded quota + * @type: The quota type: USRQQUOTA, GRPQUOTA,... + * @id: The user or group id of the quota that was exceeded + * @dev: The device on which the fs is mounted (sb->s_dev) + * @warntype: The type of the warning: QUOTA_NL_... + * + * This can be used by filesystems (including those which don't use + * dquot) to send a message to userspace relating to quota limits. + * + */ + +void quota_send_warning(short type, unsigned int id, dev_t dev, + const char warntype) +{ + static atomic_t seq; + struct sk_buff *skb; + void *msg_head; + int ret; + int msg_size = 4 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u64)); + + /* We have to allocate using GFP_NOFS as we are called from a + * filesystem performing write and thus further recursion into + * the fs to free some data could cause deadlocks. 
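+ *
+ * msg_size above covers the four u32 attributes (quota type, warning
+ * type, device major and minor) and the two u64 attributes (the id
+ * that exceeded the limit and the id that caused the allocation) put
+ * into the message below.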
*/ + skb = genlmsg_new(msg_size, GFP_NOFS); + if (!skb) { + printk(KERN_ERR + "VFS: Not enough memory to send quota warning.\n"); + return; + } + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), + "a_genl_family, 0, QUOTA_NL_C_WARNING); + if (!msg_head) { + printk(KERN_ERR + "VFS: Cannot store netlink header in quota warning.\n"); + goto err_out; + } + ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); + if (ret) + goto attr_err_out; + genlmsg_end(skb, msg_head); + + genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); + return; +attr_err_out: + printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); +err_out: + kfree_skb(skb); +} +EXPORT_SYMBOL(quota_send_warning); + +static int __init quota_init(void) +{ + if (genl_register_family("a_genl_family) != 0) + printk(KERN_ERR + "VFS: Failed to create quota netlink interface.\n"); + return 0; +}; + +module_init(quota_init); diff --git a/fs/quota/quota.c b/fs/quota/quota.c new file mode 100644 index 00000000..10b6be3c --- /dev/null +++ b/fs/quota/quota.c @@ -0,0 +1,375 @@ +/* + * Quota code necessary even when VFS quota support is not compiled + * into the kernel. The interesting stuff is over in dquot.c, here + * we have symbols for initial quotactl(2) handling, the sysctl(2) + * variables, etc - things needed even when quota support disabled. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int check_quotactl_permission(struct super_block *sb, int type, int cmd, + qid_t id) +{ + switch (cmd) { + /* these commands do not require any special privilegues */ + case Q_GETFMT: + case Q_SYNC: + case Q_GETINFO: + case Q_XGETQSTAT: + case Q_XQUOTASYNC: + break; + /* allow to query information for dquots we "own" */ + case Q_GETQUOTA: + case Q_XGETQUOTA: + if ((type == USRQUOTA && current_euid() == id) || + (type == GRPQUOTA && in_egroup_p(id))) + break; + /*FALLTHROUGH*/ + default: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return security_quotactl(cmd, type, id, sb); +} + +static void quota_sync_one(struct super_block *sb, void *arg) +{ + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, *(int *)arg, 1); +} + +static int quota_sync_all(int type) +{ + int ret; + + if (type >= MAXQUOTAS) + return -EINVAL; + ret = security_quotactl(Q_SYNC, type, 0, NULL); + if (!ret) + iterate_supers(quota_sync_one, &type); + return ret; +} + +static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, + struct path *path) +{ + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) + return -ENOSYS; + if (sb->s_qcop->quota_on_meta) + return sb->s_qcop->quota_on_meta(sb, type, id); + if (IS_ERR(path)) + return PTR_ERR(path); + return sb->s_qcop->quota_on(sb, type, id, path); +} + +static int quota_getfmt(struct super_block *sb, int type, void __user *addr) +{ + __u32 fmt; + + down_read(&sb_dqopt(sb)->dqptr_sem); + if (!sb_has_quota_active(sb, type)) { + up_read(&sb_dqopt(sb)->dqptr_sem); + return -ESRCH; + } + fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; + up_read(&sb_dqopt(sb)->dqptr_sem); + if (copy_to_user(addr, &fmt, sizeof(fmt))) + return -EFAULT; + return 0; +} + +static int quota_getinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + int ret; + + if (!sb->s_qcop->get_info) + return -ENOSYS; + ret = sb->s_qcop->get_info(sb, type, &info); + if (!ret && copy_to_user(addr, &info, sizeof(info))) + return -EFAULT; + return ret; +} + +static int quota_setinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + + if (copy_from_user(&info, addr, sizeof(info))) + return -EFAULT; + if (!sb->s_qcop->set_info) + return -ENOSYS; + return sb->s_qcop->set_info(sb, type, &info); +} + +static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) +{ + dst->dqb_bhardlimit = src->d_blk_hardlimit; + dst->dqb_bsoftlimit = src->d_blk_softlimit; + dst->dqb_curspace = src->d_bcount; + dst->dqb_ihardlimit = src->d_ino_hardlimit; + dst->dqb_isoftlimit = src->d_ino_softlimit; + dst->dqb_curinodes = src->d_icount; + dst->dqb_btime = src->d_btimer; + dst->dqb_itime = src->d_itimer; + dst->dqb_valid = QIF_ALL; +} + +static int quota_getquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + struct if_dqblk idq; + int ret; + + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); + if (ret) + return ret; + copy_to_if_dqblk(&idq, &fdq); + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + return 0; +} + +static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src) +{ + dst->d_blk_hardlimit = src->dqb_bhardlimit; + dst->d_blk_softlimit = src->dqb_bsoftlimit; + dst->d_bcount = src->dqb_curspace; + dst->d_ino_hardlimit = 
src->dqb_ihardlimit; + dst->d_ino_softlimit = src->dqb_isoftlimit; + dst->d_icount = src->dqb_curinodes; + dst->d_btimer = src->dqb_btime; + dst->d_itimer = src->dqb_itime; + + dst->d_fieldmask = 0; + if (src->dqb_valid & QIF_BLIMITS) + dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD; + if (src->dqb_valid & QIF_SPACE) + dst->d_fieldmask |= FS_DQ_BCOUNT; + if (src->dqb_valid & QIF_ILIMITS) + dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD; + if (src->dqb_valid & QIF_INODES) + dst->d_fieldmask |= FS_DQ_ICOUNT; + if (src->dqb_valid & QIF_BTIME) + dst->d_fieldmask |= FS_DQ_BTIMER; + if (src->dqb_valid & QIF_ITIME) + dst->d_fieldmask |= FS_DQ_ITIMER; +} + +static int quota_setquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + struct if_dqblk idq; + + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + copy_from_if_dqblk(&fdq, &idq); + return sb->s_qcop->set_dqblk(sb, type, id, &fdq); +} + +static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->set_xstate) + return -ENOSYS; + return sb->s_qcop->set_xstate(sb, flags, cmd); +} + +static int quota_getxstate(struct super_block *sb, void __user *addr) +{ + struct fs_quota_stat fqs; + int ret; + + if (!sb->s_qcop->get_xstate) + return -ENOSYS; + ret = sb->s_qcop->get_xstate(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} + +static int quota_setxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + + if (copy_from_user(&fdq, addr, sizeof(fdq))) + return -EFAULT; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + return sb->s_qcop->set_dqblk(sb, type, id, &fdq); +} + +static int quota_getxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + int ret; + + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq); + if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return ret; +} + +/* Copy parameters and call proper function */ +static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, + void __user *addr, struct path *path) +{ + int ret; + + if (type >= (XQM_COMMAND(cmd) ? 
XQM_MAXQUOTAS : MAXQUOTAS)) + return -EINVAL; + if (!sb->s_qcop) + return -ENOSYS; + + ret = check_quotactl_permission(sb, type, cmd, id); + if (ret < 0) + return ret; + + switch (cmd) { + case Q_QUOTAON: + return quota_quotaon(sb, type, cmd, id, path); + case Q_QUOTAOFF: + if (!sb->s_qcop->quota_off) + return -ENOSYS; + return sb->s_qcop->quota_off(sb, type); + case Q_GETFMT: + return quota_getfmt(sb, type, addr); + case Q_GETINFO: + return quota_getinfo(sb, type, addr); + case Q_SETINFO: + return quota_setinfo(sb, type, addr); + case Q_GETQUOTA: + return quota_getquota(sb, type, id, addr); + case Q_SETQUOTA: + return quota_setquota(sb, type, id, addr); + case Q_SYNC: + if (!sb->s_qcop->quota_sync) + return -ENOSYS; + return sb->s_qcop->quota_sync(sb, type, 1); + case Q_XQUOTAON: + case Q_XQUOTAOFF: + case Q_XQUOTARM: + return quota_setxstate(sb, cmd, addr); + case Q_XGETQSTAT: + return quota_getxstate(sb, addr); + case Q_XSETQLIM: + return quota_setxquota(sb, type, id, addr); + case Q_XGETQUOTA: + return quota_getxquota(sb, type, id, addr); + case Q_XQUOTASYNC: + /* caller already holds s_umount */ + if (sb->s_flags & MS_RDONLY) + return -EROFS; + writeback_inodes_sb(sb); + return 0; + default: + return -EINVAL; + } +} + +/* + * look up a superblock on which quota ops will be performed + * - use the name of a block device to find the superblock thereon + */ +static struct super_block *quotactl_block(const char __user *special) +{ +#ifdef CONFIG_BLOCK + struct block_device *bdev; + struct super_block *sb; + char *tmp = getname(special); + + if (IS_ERR(tmp)) + return ERR_CAST(tmp); + bdev = lookup_bdev(tmp); + putname(tmp); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + sb = get_super(bdev); + bdput(bdev); + if (!sb) + return ERR_PTR(-ENODEV); + + return sb; +#else + return ERR_PTR(-ENODEV); +#endif +} + +/* + * This is the system call interface. This communicates with + * the user-level programs. Currently this only supports diskquota + * calls. Maybe we need to add the process quotas etc. in the future, + * but we probably should use rlimits for that. + */ +SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, + qid_t, id, void __user *, addr) +{ + uint cmds, type; + struct super_block *sb = NULL; + struct path path, *pathp = NULL; + int ret; + + cmds = cmd >> SUBCMDSHIFT; + type = cmd & SUBCMDMASK; + + /* + * As a special case Q_SYNC can be called without a specific device. + * It will iterate all superblocks that have quota enabled and call + * the sync action on each of them. + */ + if (!special) { + if (cmds == Q_SYNC) + return quota_sync_all(type); + return -ENODEV; + } + + /* + * Path for quotaon has to be resolved before grabbing superblock + * because that gets s_umount sem which is also possibly needed by path + * resolution (think about autofs) and thus deadlocks could arise. 
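For context, this syscall is what the userspace quota tools drive. A minimal sketch of querying a user's limits (not part of this patch; it assumes a glibc <sys/quota.h> exposing the generic Q_GETQUOTA command and a struct dqblk matching the if_dqblk layout converted in quota_getquota() above; the device path and uid are illustrative):

#include <stdio.h>
#include <sys/types.h>
#include <sys/quota.h>	/* quotactl(), QCMD(), Q_GETQUOTA, USRQUOTA */

int main(void)
{
	struct dqblk dq;

	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/sda1", 1000,
		     (caddr_t)&dq) != 0) {
		perror("quotactl");
		return 1;
	}
	printf("space used: %llu bytes, block soft limit: %llu\n",
	       (unsigned long long)dq.dqb_curspace,
	       (unsigned long long)dq.dqb_bsoftlimit);
	return 0;
}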
+ */ + if (cmds == Q_QUOTAON) { + ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + if (ret) + pathp = ERR_PTR(ret); + else + pathp = &path; + } + + sb = quotactl_block(special); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + ret = do_quotactl(sb, type, cmds, id, addr, pathp); + + drop_super(sb); + if (pathp && !IS_ERR(pathp)) + path_put(pathp); + return ret; +} diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c new file mode 100644 index 00000000..e41c1bec --- /dev/null +++ b/fs/quota/quota_tree.c @@ -0,0 +1,659 @@ +/* + * vfsv0 quota IO operations on file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "quota_tree.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Quota trie support"); +MODULE_LICENSE("GPL"); + +#define __QUOTA_QT_PARANOIA + +static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) +{ + unsigned int epb = info->dqi_usable_bs >> 2; + + depth = info->dqi_qtree_depth - depth - 1; + while (depth--) + id /= epb; + return id % epb; +} + +/* Number of entries in one blocks */ +static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) +{ + return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) + / info->dqi_entry_size; +} + +static char *getdqbuf(size_t size) +{ + char *buf = kmalloc(size, GFP_NOFS); + if (!buf) + printk(KERN_WARNING + "VFS: Not enough memory for quota buffers.\n"); + return buf; +} + +static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) +{ + struct super_block *sb = info->dqi_sb; + + memset(buf, 0, info->dqi_usable_bs); + return sb->s_op->quota_read(sb, info->dqi_type, buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); +} + +static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) +{ + struct super_block *sb = info->dqi_sb; + ssize_t ret; + + ret = sb->s_op->quota_write(sb, info->dqi_type, buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + if (ret != info->dqi_usable_bs) { + quota_error(sb, "dquota write failed"); + if (ret >= 0) + ret = -EIO; + } + return ret; +} + +/* Remove empty block from list and return it */ +static int get_free_dqblk(struct qtree_mem_dqinfo *info) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int ret, blk; + + if (!buf) + return -ENOMEM; + if (info->dqi_free_blk) { + blk = info->dqi_free_blk; + ret = read_blk(info, blk, buf); + if (ret < 0) + goto out_buf; + info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); + } + else { + memset(buf, 0, info->dqi_usable_bs); + /* Assure block allocation... 
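To make the tree addressing in get_index() above concrete: with the v2 defaults of 1024-byte blocks (256 four-byte references per block) and a four-level tree, the id is simply decomposed into base-256 digits, most significant digit at depth 0. A standalone sketch of that walk (not part of this patch):

#include <stdio.h>

/* Userspace mirror of get_index() for the v2 defaults. */
static unsigned int example_index(unsigned int id, int depth, int tree_depth)
{
	unsigned int epb = 1024 >> 2;	/* references per tree block */
	int d = tree_depth - depth - 1;

	while (d--)
		id /= epb;
	return id % epb;
}

int main(void)
{
	int depth;

	/* id 70000 walks the tree through slots 0 / 1 / 17 / 112 */
	for (depth = 0; depth < 4; depth++)
		printf("depth %d -> slot %u\n",
		       depth, example_index(70000, depth, 4));
	return 0;
}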
*/ + ret = write_blk(info, info->dqi_blocks, buf); + if (ret < 0) + goto out_buf; + blk = info->dqi_blocks++; + } + mark_info_dirty(info->dqi_sb, info->dqi_type); + ret = blk; +out_buf: + kfree(buf); + return ret; +} + +/* Insert empty block to the list */ +static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk) +{ + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk); + dh->dqdh_prev_free = cpu_to_le32(0); + dh->dqdh_entries = cpu_to_le16(0); + err = write_blk(info, blk, buf); + if (err < 0) + return err; + info->dqi_free_blk = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +} + +/* Remove given block from the list of blocks with free entries */ +static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, + uint blk) +{ + char *tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + uint nextblk = le32_to_cpu(dh->dqdh_next_free); + uint prevblk = le32_to_cpu(dh->dqdh_prev_free); + int err; + + if (!tmpbuf) + return -ENOMEM; + if (nextblk) { + err = read_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + dh->dqdh_prev_free; + err = write_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + } + if (prevblk) { + err = read_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free = + dh->dqdh_next_free; + err = write_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + } else { + info->dqi_free_entry = nextblk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + } + kfree(tmpbuf); + dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); + /* No matter whether write succeeds block is out of list */ + if (write_blk(info, blk, buf) < 0) + quota_error(info->dqi_sb, "Can't write block (%u) " + "with free entries", blk); + return 0; +out_buf: + kfree(tmpbuf); + return err; +} + +/* Insert given block to the beginning of list with free entries */ +static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, + uint blk) +{ + char *tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + if (!tmpbuf) + return -ENOMEM; + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry); + dh->dqdh_prev_free = cpu_to_le32(0); + err = write_blk(info, blk, buf); + if (err < 0) + goto out_buf; + if (info->dqi_free_entry) { + err = read_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + cpu_to_le32(blk); + err = write_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + } + kfree(tmpbuf); + info->dqi_free_entry = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +out_buf: + kfree(tmpbuf); + return err; +} + +/* Is the entry in the block free? 
*/ +int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk) +{ + int i; + + for (i = 0; i < info->dqi_entry_size; i++) + if (disk[i]) + return 0; + return 1; +} +EXPORT_SYMBOL(qtree_entry_unused); + +/* Find space for dquot */ +static uint find_free_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, int *err) +{ + uint blk, i; + struct qt_disk_dqdbheader *dh; + char *buf = getdqbuf(info->dqi_usable_bs); + char *ddquot; + + *err = 0; + if (!buf) { + *err = -ENOMEM; + return 0; + } + dh = (struct qt_disk_dqdbheader *)buf; + if (info->dqi_free_entry) { + blk = info->dqi_free_entry; + *err = read_blk(info, blk, buf); + if (*err < 0) + goto out_buf; + } else { + blk = get_free_dqblk(info); + if ((int)blk < 0) { + *err = blk; + kfree(buf); + return 0; + } + memset(buf, 0, info->dqi_usable_bs); + /* This is enough as the block is already zeroed and the entry + * list is empty... */ + info->dqi_free_entry = blk; + mark_info_dirty(dquot->dq_sb, dquot->dq_type); + } + /* Block will be full? */ + if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { + *err = remove_free_dqentry(info, buf, blk); + if (*err < 0) { + quota_error(dquot->dq_sb, "Can't remove block (%u) " + "from entry free list", blk); + goto out_buf; + } + } + le16_add_cpu(&dh->dqdh_entries, 1); + /* Find free structure in block */ + ddquot = buf + sizeof(struct qt_disk_dqdbheader); + for (i = 0; i < qtree_dqstr_in_blk(info); i++) { + if (qtree_entry_unused(info, ddquot)) + break; + ddquot += info->dqi_entry_size; + } +#ifdef __QUOTA_QT_PARANOIA + if (i == qtree_dqstr_in_blk(info)) { + quota_error(dquot->dq_sb, "Data block full but it shouldn't"); + *err = -EIO; + goto out_buf; + } +#endif + *err = write_blk(info, blk, buf); + if (*err < 0) { + quota_error(dquot->dq_sb, "Can't write quota data block %u", + blk); + goto out_buf; + } + dquot->dq_off = (blk << info->dqi_blocksize_bits) + + sizeof(struct qt_disk_dqdbheader) + + i * info->dqi_entry_size; + kfree(buf); + return blk; +out_buf: + kfree(buf); + return 0; +} + +/* Insert reference to structure into the trie */ +static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *treeblk, int depth) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + int ret = 0, newson = 0, newact = 0; + __le32 *ref; + uint newblk; + + if (!buf) + return -ENOMEM; + if (!*treeblk) { + ret = get_free_dqblk(info); + if (ret < 0) + goto out_buf; + *treeblk = ret; + memset(buf, 0, info->dqi_usable_bs); + newact = 1; + } else { + ret = read_blk(info, *treeblk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read tree quota " + "block %u", *treeblk); + goto out_buf; + } + } + ref = (__le32 *)buf; + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!newblk) + newson = 1; + if (depth == info->dqi_qtree_depth - 1) { +#ifdef __QUOTA_QT_PARANOIA + if (newblk) { + quota_error(dquot->dq_sb, "Inserting already present " + "quota entry (block %u)", + le32_to_cpu(ref[get_index(info, + dquot->dq_id, depth)])); + ret = -EIO; + goto out_buf; + } +#endif + newblk = find_free_dqentry(info, dquot, &ret); + } else { + ret = do_insert_tree(info, dquot, &newblk, depth+1); + } + if (newson && ret >= 0) { + ref[get_index(info, dquot->dq_id, depth)] = + cpu_to_le32(newblk); + ret = write_blk(info, *treeblk, buf); + } else if (newact && ret < 0) { + put_free_dqblk(info, buf, *treeblk); + } +out_buf: + kfree(buf); + return ret; +} + +/* Wrapper for inserting quota structure into tree */ +static inline int dq_insert_tree(struct qtree_mem_dqinfo 
*info, + struct dquot *dquot) +{ + int tmp = QT_TREEOFF; + return do_insert_tree(info, dquot, &tmp, 0); +} + +/* + * We don't have to be afraid of deadlocks as we never have quotas on quota + * files... + */ +int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_type; + struct super_block *sb = dquot->dq_sb; + ssize_t ret; + char *ddquot = getdqbuf(info->dqi_entry_size); + + if (!ddquot) + return -ENOMEM; + + /* dq_off is guarded by dqio_mutex */ + if (!dquot->dq_off) { + ret = dq_insert_tree(info, dquot); + if (ret < 0) { + quota_error(sb, "Error %zd occurred while creating " + "quota", ret); + kfree(ddquot); + return ret; + } + } + spin_lock(&dq_data_lock); + info->dqi_ops->mem2disk_dqblk(ddquot, dquot); + spin_unlock(&dq_data_lock); + ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, + dquot->dq_off); + if (ret != info->dqi_entry_size) { + quota_error(sb, "dquota write failed"); + if (ret >= 0) + ret = -ENOSPC; + } else { + ret = 0; + } + dqstats_inc(DQST_WRITES); + kfree(ddquot); + + return ret; +} +EXPORT_SYMBOL(qtree_write_dquot); + +/* Free dquot entry in data block */ +static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint blk) +{ + struct qt_disk_dqdbheader *dh; + char *buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + + if (!buf) + return -ENOMEM; + if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { + quota_error(dquot->dq_sb, "Quota structure has offset to " + "other block (%u) than it should (%u)", blk, + (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + goto out_buf; + } + ret = read_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota data block %u", + blk); + goto out_buf; + } + dh = (struct qt_disk_dqdbheader *)buf; + le16_add_cpu(&dh->dqdh_entries, -1); + if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? 
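The dq_off arithmetic set up in find_free_dqentry() and checked again here in free_dqentry() is easy to verify by hand. A back-of-the-envelope sketch for the v2r0 on-disk entry (not part of this patch):

#include <stdio.h>

/* 1024-byte data blocks hold a 16-byte qt_disk_dqdbheader followed by
 * 48-byte v2r0 entries, which is where the "exactly 21 quota-entries in
 * a block" figure in quota_tree.h comes from. */
int main(void)
{
	unsigned int usable_bs = 1024;
	unsigned int header = 16;	/* sizeof(struct qt_disk_dqdbheader) */
	unsigned int entry = 48;	/* sizeof(struct v2r0_disk_dqblk) */
	unsigned int blk = 5, i = 3;
	unsigned long long off;

	printf("entries per data block: %u\n", (usable_bs - header) / entry);
	off = ((unsigned long long)blk << 10) + header + i * entry;
	printf("entry %u of block %u sits at byte offset %llu\n", i, blk, off);
	return 0;
}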
*/ + ret = remove_free_dqentry(info, buf, blk); + if (ret >= 0) + ret = put_free_dqblk(info, buf, blk); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't move quota data block " + "(%u) to free list", blk); + goto out_buf; + } + } else { + memset(buf + + (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)), + 0, info->dqi_entry_size); + if (le16_to_cpu(dh->dqdh_entries) == + qtree_dqstr_in_blk(info) - 1) { + /* Insert will write block itself */ + ret = insert_free_dqentry(info, buf, blk); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't insert quota " + "data block (%u) to free entry list", blk); + goto out_buf; + } + } else { + ret = write_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't write quota " + "data block %u", blk); + goto out_buf; + } + } + } + dquot->dq_off = 0; /* Quota is now unattached */ +out_buf: + kfree(buf); + return ret; +} + +/* Remove reference to dquot from tree */ +static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *blk, int depth) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + uint newblk; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, *blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota data block %u", + *blk); + goto out_buf; + } + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (depth == info->dqi_qtree_depth - 1) { + ret = free_dqentry(info, dquot, newblk); + newblk = 0; + } else { + ret = remove_tree(info, dquot, &newblk, depth+1); + } + if (ret >= 0 && !newblk) { + int i; + ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); + /* Block got empty? */ + for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++) + ; + /* Don't put the root block into the free block list */ + if (i == (info->dqi_usable_bs >> 2) + && *blk != QT_TREEOFF) { + put_free_dqblk(info, buf, *blk); + *blk = 0; + } else { + ret = write_blk(info, *blk, buf); + if (ret < 0) + quota_error(dquot->dq_sb, + "Can't write quota tree block %u", + *blk); + } + } +out_buf: + kfree(buf); + return ret; +} + +/* Delete dquot from tree */ +int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + uint tmp = QT_TREEOFF; + + if (!dquot->dq_off) /* Even not allocated? 
*/ + return 0; + return remove_tree(info, dquot, &tmp, 0); +} +EXPORT_SYMBOL(qtree_delete_dquot); + +/* Find entry in block */ +static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + int i; + char *ddquot; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota tree " + "block %u", blk); + goto out_buf; + } + ddquot = buf + sizeof(struct qt_disk_dqdbheader); + for (i = 0; i < qtree_dqstr_in_blk(info); i++) { + if (info->dqi_ops->is_id(ddquot, dquot)) + break; + ddquot += info->dqi_entry_size; + } + if (i == qtree_dqstr_in_blk(info)) { + quota_error(dquot->dq_sb, "Quota for id %u referenced " + "but not present", dquot->dq_id); + ret = -EIO; + goto out_buf; + } else { + ret = (blk << info->dqi_blocksize_bits) + sizeof(struct + qt_disk_dqdbheader) + i * info->dqi_entry_size; + } +out_buf: + kfree(buf); + return ret; +} + +/* Find entry for given id in the tree */ +static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk, int depth) +{ + char *buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't read quota tree block %u", + blk); + goto out_buf; + } + ret = 0; + blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!blk) /* No reference? */ + goto out_buf; + if (depth < info->dqi_qtree_depth - 1) + ret = find_tree_dqentry(info, dquot, blk, depth+1); + else + ret = find_block_dqentry(info, dquot, blk); +out_buf: + kfree(buf); + return ret; +} + +/* Find entry for given id in the tree - wrapper function */ +static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot) +{ + return find_tree_dqentry(info, dquot, QT_TREEOFF, 0); +} + +int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_type; + struct super_block *sb = dquot->dq_sb; + loff_t offset; + char *ddquot; + int ret = 0; + +#ifdef __QUOTA_QT_PARANOIA + /* Invalidated quota? */ + if (!sb_dqopt(dquot->dq_sb)->files[type]) { + quota_error(sb, "Quota invalidated while reading!"); + return -EIO; + } +#endif + /* Do we know offset of the dquot entry in the quota file? */ + if (!dquot->dq_off) { + offset = find_dqentry(info, dquot); + if (offset <= 0) { /* Entry not present? 
*/ + if (offset < 0) + quota_error(sb, "Can't read quota structure " + "for id %u", dquot->dq_id); + dquot->dq_off = 0; + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + ret = offset; + goto out; + } + dquot->dq_off = offset; + } + ddquot = getdqbuf(info->dqi_entry_size); + if (!ddquot) + return -ENOMEM; + ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size, + dquot->dq_off); + if (ret != info->dqi_entry_size) { + if (ret >= 0) + ret = -EIO; + quota_error(sb, "Error while reading quota structure for id %u", + dquot->dq_id); + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + kfree(ddquot); + goto out; + } + spin_lock(&dq_data_lock); + info->dqi_ops->disk2mem_dqblk(dquot, ddquot); + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && + !dquot->dq_dqb.dqb_isoftlimit) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + kfree(ddquot); +out: + dqstats_inc(DQST_READS); + return ret; +} +EXPORT_SYMBOL(qtree_read_dquot); + +/* Check whether dquot should not be deleted. We know we are + * the only one operating on dquot (thanks to dq_lock) */ +int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && + !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) + return qtree_delete_dquot(info, dquot); + return 0; +} +EXPORT_SYMBOL(qtree_release_dquot); diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h new file mode 100644 index 00000000..a1ab8db8 --- /dev/null +++ b/fs/quota/quota_tree.h @@ -0,0 +1,25 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTA_TREE_H +#define _LINUX_QUOTA_TREE_H + +#include +#include + +/* + * Structure of header of block with quota structures. 
It is padded to 16 bytes so + * there will be space for exactly 21 quota-entries in a block + */ +struct qt_disk_dqdbheader { + __le32 dqdh_next_free; /* Number of next block with free entry */ + __le32 dqdh_prev_free; /* Number of previous block with free entry */ + __le16 dqdh_entries; /* Number of valid entries in block */ + __le16 dqdh_pad1; + __le32 dqdh_pad2; +}; + +#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ + +#endif /* _LINUX_QUOTAIO_TREE_H */ diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c new file mode 100644 index 00000000..34b37a67 --- /dev/null +++ b/fs/quota/quota_v1.c @@ -0,0 +1,233 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "quotaio_v1.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Old quota format support"); +MODULE_LICENSE("GPL"); + +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) + +static inline qsize_t v1_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v1_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} + +static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d) +{ + m->dqb_ihardlimit = d->dqb_ihardlimit; + m->dqb_isoftlimit = d->dqb_isoftlimit; + m->dqb_curinodes = d->dqb_curinodes; + m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit); + m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit); + m->dqb_curspace = v1_qbtos(d->dqb_curblocks); + m->dqb_itime = d->dqb_itime; + m->dqb_btime = d->dqb_btime; +} + +static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m) +{ + d->dqb_ihardlimit = m->dqb_ihardlimit; + d->dqb_isoftlimit = m->dqb_isoftlimit; + d->dqb_curinodes = m->dqb_curinodes; + d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit); + d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit); + d->dqb_curblocks = v1_stoqb(m->dqb_curspace); + d->dqb_itime = m->dqb_itime; + d->dqb_btime = m->dqb_btime; +} + +static int v1_read_dqblk(struct dquot *dquot) +{ + int type = dquot->dq_type; + struct v1_disk_dqblk dqblk; + + if (!sb_dqopt(dquot->dq_sb)->files[type]) + return -EINVAL; + + /* Set structure to 0s in case read fails/is after end of file */ + memset(&dqblk, 0, sizeof(struct v1_disk_dqblk)); + dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); + + v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk); + if (dquot->dq_dqb.dqb_bhardlimit == 0 && + dquot->dq_dqb.dqb_bsoftlimit == 0 && + dquot->dq_dqb.dqb_ihardlimit == 0 && + dquot->dq_dqb.dqb_isoftlimit == 0) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + dqstats_inc(DQST_READS); + + return 0; +} + +static int v1_commit_dqblk(struct dquot *dquot) +{ + short type = dquot->dq_type; + ssize_t ret; + struct v1_disk_dqblk dqblk; + + v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb); + if (dquot->dq_id == 0) { + dqblk.dqb_btime = + sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace; + dqblk.dqb_itime = + sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace; + } + ret = 0; + if (sb_dqopt(dquot->dq_sb)->files[type]) + ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, + (char *)&dqblk, sizeof(struct v1_disk_dqblk), + v1_dqoff(dquot->dq_id)); + if (ret != sizeof(struct v1_disk_dqblk)) { + quota_error(dquot->dq_sb, "dquota write failed"); + if (ret >= 0) + ret = -EIO; + goto out; + } + ret = 0; + +out: + dqstats_inc(DQST_WRITES); + + return ret; +} + +/* Magics of new quota format */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927 /* GRPQUOTA */\ +} 
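The old-format code above keeps one fixed-size record per id and counts space in 1 KiB quota blocks. A small sketch of that arithmetic (not part of this patch; note that sizeof(struct v1_disk_dqblk), and therefore v1_dqoff(), depends on the width of time_t):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* Local stand-in for struct v1_disk_dqblk from quotaio_v1.h, for
 * illustration only: six 32-bit counters plus two time_t grace stamps. */
struct example_v1_dqblk {
	uint32_t bhard, bsoft, curblocks, ihard, isoft, curinodes;
	time_t btime, itime;
};

int main(void)
{
	unsigned int id = 1000;

	/* v1_dqoff(): the old format is a flat array indexed by uid/gid */
	printf("record for id %u starts at byte %zu\n",
	       id, id * sizeof(struct example_v1_dqblk));

	/* v1_qbtos() / v1_stoqb(): limits are kept in 1 KiB quota blocks */
	printf("17 quota blocks = %u bytes\n", 17u << 10);
	printf("5000 bytes round up to %u quota blocks\n", (5000u + 1023) >> 10);
	return 0;
}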
+ +/* Header of new quota format */ +struct v2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* File version */ +}; + +static int v1_check_quota_file(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ulong blocks; + size_t off; + struct v2_disk_dqheader dqhead; + ssize_t size; + loff_t isize; + static const uint quota_magics[] = V2_INITQMAGICS; + + isize = i_size_read(inode); + if (!isize) + return 0; + blocks = isize >> BLOCK_SIZE_BITS; + off = isize & (BLOCK_SIZE - 1); + if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) % + sizeof(struct v1_disk_dqblk)) + return 0; + /* Doublecheck whether we didn't get file with new format - with old + * quotactl() this could happen */ + size = sb->s_op->quota_read(sb, type, (char *)&dqhead, + sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) + return 1; /* Probably not new format */ + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type]) + return 1; /* Definitely not new format */ + printk(KERN_INFO + "VFS: %s: Refusing to turn on old quota format on given file." + " It probably contains newer quota format.\n", sb->s_id); + return 0; /* Seems like a new format file -> refuse it */ +} + +static int v1_read_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct v1_disk_dqblk dqblk; + int ret; + + ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + ret = 0; + /* limits are stored as unsigned 32-bit data */ + dqopt->info[type].dqi_maxblimit = 0xffffffff; + dqopt->info[type].dqi_maxilimit = 0xffffffff; + dqopt->info[type].dqi_igrace = + dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; + dqopt->info[type].dqi_bgrace = + dqblk.dqb_btime ? 
dqblk.dqb_btime : MAX_DQ_TIME; +out: + return ret; +} + +static int v1_write_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct v1_disk_dqblk dqblk; + int ret; + + dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; + ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + dqblk.dqb_itime = dqopt->info[type].dqi_igrace; + dqblk.dqb_btime = dqopt->info[type].dqi_bgrace; + ret = sb->s_op->quota_write(sb, type, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), v1_dqoff(0)); + if (ret == sizeof(struct v1_disk_dqblk)) + ret = 0; + else if (ret > 0) + ret = -EIO; +out: + return ret; +} + +static const struct quota_format_ops v1_format_ops = { + .check_quota_file = v1_check_quota_file, + .read_file_info = v1_read_file_info, + .write_file_info = v1_write_file_info, + .free_file_info = NULL, + .read_dqblk = v1_read_dqblk, + .commit_dqblk = v1_commit_dqblk, +}; + +static struct quota_format_type v1_quota_format = { + .qf_fmt_id = QFMT_VFS_OLD, + .qf_ops = &v1_format_ops, + .qf_owner = THIS_MODULE +}; + +static int __init init_v1_quota_format(void) +{ + return register_quota_format(&v1_quota_format); +} + +static void __exit exit_v1_quota_format(void) +{ + unregister_quota_format(&v1_quota_format); +} + +module_init(init_v1_quota_format); +module_exit(exit_v1_quota_format); + diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c new file mode 100644 index 00000000..f1ab3604 --- /dev/null +++ b/fs/quota/quota_v2.c @@ -0,0 +1,336 @@ +/* + * vfsv0 quota IO operations on file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "quota_tree.h" +#include "quotaio_v2.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Quota format v2 support"); +MODULE_LICENSE("GPL"); + +#define __QUOTA_V2_PARANOIA + +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r0_is_id(void *dp, struct dquot *dquot); +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r1_is_id(void *dp, struct dquot *dquot); + +static struct qtree_fmt_operations v2r0_qtree_ops = { + .mem2disk_dqblk = v2r0_mem2diskdqb, + .disk2mem_dqblk = v2r0_disk2memdqb, + .is_id = v2r0_is_id, +}; + +static struct qtree_fmt_operations v2r1_qtree_ops = { + .mem2disk_dqblk = v2r1_mem2diskdqb, + .disk2mem_dqblk = v2r1_disk2memdqb, + .is_id = v2r1_is_id, +}; + +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) + +static inline qsize_t v2_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v2_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} + +static int v2_read_header(struct super_block *sb, int type, + struct v2_disk_dqheader *dqhead) +{ + ssize_t size; + + size = sb->s_op->quota_read(sb, type, (char *)dqhead, + sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) { + quota_error(sb, "Failed header read: expected=%zd got=%zd", + sizeof(struct v2_disk_dqheader), size); + return 0; + } + return 1; +} + +/* Check whether given file is really vfsv0 quotafile */ +static int v2_check_quota_file(struct super_block *sb, int type) +{ + struct v2_disk_dqheader dqhead; + static const uint quota_magics[] = V2_INITQMAGICS; + static const 
uint quota_versions[] = V2_INITQVERSIONS; + + if (!v2_read_header(sb, type, &dqhead)) + return 0; + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || + le32_to_cpu(dqhead.dqh_version) > quota_versions[type]) + return 0; + return 1; +} + +/* Read information header from quota file */ +static int v2_read_file_info(struct super_block *sb, int type) +{ + struct v2_disk_dqinfo dinfo; + struct v2_disk_dqheader dqhead; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo; + ssize_t size; + unsigned int version; + + if (!v2_read_header(sb, type, &dqhead)) + return -1; + version = le32_to_cpu(dqhead.dqh_version); + if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) || + (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) + return -1; + + size = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + quota_error(sb, "Can't read info structure"); + return -1; + } + info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); + if (!info->dqi_priv) { + printk(KERN_WARNING + "Not enough memory for quota information structure.\n"); + return -ENOMEM; + } + qinfo = info->dqi_priv; + if (version == 0) { + /* limits are stored as unsigned 32-bit data */ + info->dqi_maxblimit = 0xffffffff; + info->dqi_maxilimit = 0xffffffff; + } else { + /* used space is stored as unsigned 64-bit value */ + info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ + info->dqi_maxilimit = 0xffffffffffffffffULL; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); + qinfo->dqi_sb = sb; + qinfo->dqi_type = type; + qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; + qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; + qinfo->dqi_qtree_depth = qtree_depth(qinfo); + if (version == 0) { + qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk); + qinfo->dqi_ops = &v2r0_qtree_ops; + } else { + qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk); + qinfo->dqi_ops = &v2r1_qtree_ops; + } + return 0; +} + +/* Write information header to quota file */ +static int v2_write_file_info(struct super_block *sb, int type) +{ + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo = info->dqi_priv; + ssize_t size; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; + dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); + dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); + dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + spin_unlock(&dq_data_lock); + dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry); + size = sb->s_op->quota_write(sb, type, (char *)&dinfo, + sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { + quota_error(sb, "Can't write info structure"); + return -1; + } + return 0; +} + +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct v2r0_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + + m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); + m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); + m->dqb_itime = 
le64_to_cpu(d->dqb_itime); + m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit)); + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2r0_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk))) + m->dqb_itime = 0; +} + +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct v2r0_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit)); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_id = cpu_to_le32(dquot->dq_id); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); +} + +static int v2r0_is_id(void *dp, struct dquot *dquot) +{ + struct v2r0_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(info, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct v2r1_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + m->dqb_itime = le64_to_cpu(d->dqb_itime); + m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit)); + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2r1_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk))) + m->dqb_itime = 0; +} + +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct v2r1_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit)); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_id = cpu_to_le32(dquot->dq_id); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); +} + +static int v2r1_is_id(void *dp, struct dquot *dquot) +{ + struct v2r1_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(info, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +static int v2_read_dquot(struct dquot *dquot) +{ + return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); +} + +static int v2_write_dquot(struct dquot 
*dquot) +{ + return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); +} + +static int v2_release_dquot(struct dquot *dquot) +{ + return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); +} + +static int v2_free_file_info(struct super_block *sb, int type) +{ + kfree(sb_dqinfo(sb, type)->dqi_priv); + return 0; +} + +static const struct quota_format_ops v2_format_ops = { + .check_quota_file = v2_check_quota_file, + .read_file_info = v2_read_file_info, + .write_file_info = v2_write_file_info, + .free_file_info = v2_free_file_info, + .read_dqblk = v2_read_dquot, + .commit_dqblk = v2_write_dquot, + .release_dqblk = v2_release_dquot, +}; + +static struct quota_format_type v2r0_quota_format = { + .qf_fmt_id = QFMT_VFS_V0, + .qf_ops = &v2_format_ops, + .qf_owner = THIS_MODULE +}; + +static struct quota_format_type v2r1_quota_format = { + .qf_fmt_id = QFMT_VFS_V1, + .qf_ops = &v2_format_ops, + .qf_owner = THIS_MODULE +}; + +static int __init init_v2_quota_format(void) +{ + int ret; + + ret = register_quota_format(&v2r0_quota_format); + if (ret) + return ret; + return register_quota_format(&v2r1_quota_format); +} + +static void __exit exit_v2_quota_format(void) +{ + unregister_quota_format(&v2r0_quota_format); + unregister_quota_format(&v2r1_quota_format); +} + +module_init(init_v2_quota_format); +module_exit(exit_v2_quota_format); diff --git a/fs/quota/quotaio_v1.h b/fs/quota/quotaio_v1.h new file mode 100644 index 00000000..746654b5 --- /dev/null +++ b/fs/quota/quotaio_v1.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_QUOTAIO_V1_H +#define _LINUX_QUOTAIO_V1_H + +#include + +/* + * The following constants define the amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). The timer is started when the user crosses + * their soft limit, it is reset when they go below their soft limit. + */ +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. 
+ */ +struct v1_disk_dqblk { + __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ + __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ + __u32 dqb_curblocks; /* current block count */ + __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __u32 dqb_isoftlimit; /* preferred inode limit */ + __u32 dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive inode use */ +}; + +#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) + +#endif /* _LINUX_QUOTAIO_V1_H */ diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h new file mode 100644 index 00000000..f1966b42 --- /dev/null +++ b/fs/quota/quotaio_v2.h @@ -0,0 +1,73 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTAIO_V2_H +#define _LINUX_QUOTAIO_V2_H + +#include +#include + +/* + * Definitions of magics and versions of current quota files + */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927 /* GRPQUOTA */\ +} + +#define V2_INITQVERSIONS {\ + 1, /* USRQUOTA */\ + 1 /* GRPQUOTA */\ +} + +/* First generic header */ +struct v2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* File version */ +}; + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is a radix tree whose leaves point + * to blocks of these structures. + */ +struct v2r0_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le32 dqb_isoftlimit; /* preferred inode limit */ + __le32 dqb_curinodes; /* current # allocated inodes */ + __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + +struct v2r1_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_pad; + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ + __le64 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + +/* Header with type and version specific information */ +struct v2_disk_dqinfo { + __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ + __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */ + __le32 dqi_flags; /* Flags for quotafile (DQF_*) */ + __le32 dqi_blocks; /* Number of blocks in file */ + __le32 dqi_free_blk; /* Number of first free block in the list */ + __le32 dqi_free_entry; /* Number of block with at least one free entry */ +}; + +#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */ +#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */ + +#endif /* _LINUX_QUOTAIO_V2_H */ diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile new file mode 100644 index 00000000..c71e65dc --- /dev/null +++ b/fs/ramfs/Makefile @@ -0,0 +1,9 @@ 
+# +# Makefile for the linux ramfs routines. +# + +obj-y += ramfs.o + +file-mmu-y := file-nommu.o +file-mmu-$(CONFIG_MMU) := file-mmu.o +ramfs-objs += inode.o $(file-mmu-y) diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c new file mode 100644 index 00000000..4884ac5a --- /dev/null +++ b/fs/ramfs/file-mmu.c @@ -0,0 +1,55 @@ +/* file-mmu.c: ramfs MMU-based file operations + * + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * + * Usage limits added by David Gibson, Linuxcare Australia. + * This file is released under the GPL. + */ + +/* + * NOTE! This filesystem is probably most useful + * not as a real filesystem, but as an example of + * how virtual filesystems can be written. + * + * It doesn't get much simpler than this. Consider + * that this file implements the full semantics of + * a POSIX-compliant read-write filesystem. + * + * Note in particular how the filesystem does not + * need to implement any data structures of its own + * to keep track of the virtual data: using the VFS + * caches is sufficient. + */ + +#include +#include +#include + +#include "internal.h" + +const struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; + +const struct file_operations ramfs_file_operations = { + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = noop_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations ramfs_file_inode_operations = { + .setattr = simple_setattr, + .getattr = simple_getattr, +}; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c new file mode 100644 index 00000000..fbb0b478 --- /dev/null +++ b/fs/ramfs/file-nommu.c @@ -0,0 +1,266 @@ +/* file-nommu.c: no-MMU version of ramfs + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +static int ramfs_nommu_setattr(struct dentry *, struct iattr *); + +const struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; + +const struct file_operations ramfs_file_operations = { + .mmap = ramfs_nommu_mmap, + .get_unmapped_area = ramfs_nommu_get_unmapped_area, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .fsync = noop_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations ramfs_file_inode_operations = { + .setattr = ramfs_nommu_setattr, + .getattr = simple_getattr, +}; + +/*****************************************************************************/ +/* + * add a contiguous set of pages into a ramfs inode when it's truncated from + * size 0 on the assumption that it's going to be used for an mmap of shared + * memory + */ +int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +{ + unsigned long npages, xpages, loop; + struct page *pages; + unsigned order; + void *data; + int ret; + + /* make various checks */ + order = get_order(newsize); + if (unlikely(order >= MAX_ORDER)) + return -EFBIG; + + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; + + i_size_write(inode, newsize); + + /* allocate enough contiguous pages to be able to satisfy the + * request */ + pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order); + if (!pages) + return -ENOMEM; + + /* split the high-order page into an array of single pages */ + xpages = 1UL << order; + npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + + split_page(pages, order); + + /* trim off any pages we don't actually require */ + for (loop = npages; loop < xpages; loop++) + __free_page(pages + loop); + + /* clear the memory we allocated */ + newsize = PAGE_SIZE * npages; + data = page_address(pages); + memset(data, 0, newsize); + + /* attach all the pages to the inode's address space */ + for (loop = 0; loop < npages; loop++) { + struct page *page = pages + loop; + + ret = add_to_page_cache_lru(page, inode->i_mapping, loop, + GFP_KERNEL); + if (ret < 0) + goto add_error; + + /* prevent the page from being discarded on memory pressure */ + SetPageDirty(page); + + unlock_page(page); + put_page(page); + } + + return 0; + +add_error: + while (loop < npages) + __free_page(pages + loop++); + return ret; +} + +/*****************************************************************************/ +/* + * + */ +static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) +{ + int ret; + + /* assume a truncate from zero size is going to be for the purposes of + * shared mmap */ + if (size == 0) { + if (unlikely(newsize >> 32)) + return -EFBIG; + + return ramfs_nommu_expand_for_mapping(inode, newsize); + } + + /* check that a decrease in size doesn't cut off any shared mappings */ + if (newsize < size) { + ret = nommu_shrink_inode_mappings(inode, size, newsize); + if (ret < 0) + return ret; + } + + truncate_setsize(inode, newsize); + return 0; +} + +/*****************************************************************************/ +/* + * handle a change of attributes + * - we're specifically 
interested in a change of size + */ +static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) +{ + struct inode *inode = dentry->d_inode; + unsigned int old_ia_valid = ia->ia_valid; + int ret = 0; + + /* POSIX UID/GID verification for setting inode attributes */ + ret = inode_change_ok(inode, ia); + if (ret) + return ret; + + /* pick out size-changing events */ + if (ia->ia_valid & ATTR_SIZE) { + loff_t size = inode->i_size; + + if (ia->ia_size != size) { + ret = ramfs_nommu_resize(inode, ia->ia_size, size); + if (ret < 0 || ia->ia_valid == ATTR_SIZE) + goto out; + } else { + /* we skipped the truncate but must still update + * timestamps + */ + ia->ia_valid |= ATTR_MTIME|ATTR_CTIME; + } + } + + setattr_copy(inode, ia); + out: + ia->ia_valid = old_ia_valid; + return ret; +} + +/*****************************************************************************/ +/* + * try to determine where a shared mapping can be made + * - we require that: + * - the pages to be mapped must exist + * - the pages be physically contiguous in sequence + */ +unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long maxpages, lpages, nr, loop, ret; + struct inode *inode = file->f_path.dentry->d_inode; + struct page **pages = NULL, **ptr, *page; + loff_t isize; + + if (!(flags & MAP_SHARED)) + return addr; + + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + isize = i_size_read(inode); + + ret = -EINVAL; + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= maxpages) + goto out; + + if (maxpages - pgoff < lpages) + goto out; + + /* gang-find the pages */ + ret = -ENOMEM; + pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_free; + + nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); + if (nr != lpages) + goto out_free_pages; /* leave if some pages were missing */ + + /* check the pages for physical adjacency */ + ptr = pages; + page = *ptr++; + page++; + for (loop = lpages; loop > 1; loop--) + if (*ptr++ != page++) + goto out_free_pages; + + /* okay - all conditions fulfilled */ + ret = (unsigned long) page_address(pages[0]); + +out_free_pages: + ptr = pages; + for (loop = nr; loop > 0; loop--) + put_page(*ptr++); +out_free: + kfree(pages); +out: + return ret; +} + +/*****************************************************************************/ +/* + * set up a mapping for shared memory segments + */ +int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!(vma->vm_flags & VM_SHARED)) + return -ENOSYS; + + file_accessed(file); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c new file mode 100644 index 00000000..eacb166f --- /dev/null +++ b/fs/ramfs/inode.c @@ -0,0 +1,315 @@ +/* + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * + * Usage limits added by David Gibson, Linuxcare Australia. + * This file is released under the GPL. + */ + +/* + * NOTE! This filesystem is probably most useful + * not as a real filesystem, but as an example of + * how virtual filesystems can be written. + * + * It doesn't get much simpler than this. Consider + * that this file implements the full semantics of + * a POSIX-compliant read-write filesystem. 
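The contiguous-allocation trick in ramfs_nommu_expand_for_mapping() above rounds the requested size up to a power-of-two number of pages, then splits the high-order page and frees the tail it does not need. A quick sketch of that bookkeeping (not part of this patch, assuming 4 KiB pages):

#include <stdio.h>

/* The allocation is rounded up to a power-of-two order, as get_order()
 * does in the kernel, and the surplus tail pages are freed again. */
int main(void)
{
	unsigned long page_size = 4096;
	unsigned long newsize = 5 * page_size;	/* requested file size */
	unsigned int order = 0;
	unsigned long xpages, npages;

	while ((page_size << order) < newsize)
		order++;			/* order 3 covers 20480 bytes */
	xpages = 1UL << order;			/* pages allocated: 8 */
	npages = (newsize + page_size - 1) / page_size;	/* pages kept: 5 */

	printf("order %u: allocate %lu pages, keep %lu, free %lu\n",
	       order, xpages, npages, xpages - npages);
	return 0;
}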
+ * + * Note in particular how the filesystem does not + * need to implement any data structures of its own + * to keep track of the virtual data: using the VFS + * caches is sufficient. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define RAMFS_DEFAULT_MODE 0755 + +static const struct super_operations ramfs_ops; +static const struct inode_operations ramfs_dir_inode_operations; + +static struct backing_dev_info ramfs_backing_dev_info = { + .name = "ramfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | + BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, +}; + +struct inode *ramfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode * inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode_init_owner(inode, dir, mode); + inode->i_mapping->a_ops = &ramfs_aops; + inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &ramfs_file_inode_operations; + inode->i_fop = &ramfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &ramfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + break; + } + } + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. 
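+ * (ramfs_create() and ramfs_mkdir() below are thin wrappers around this:
+ * they simply OR S_IFREG / S_IFDIR into the mode and call ramfs_mknod().)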
+ */ +/* SMP-safe */ +static int +ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); + int error = -ENOSPC; + + if (inode) { + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + error = 0; + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + } + return error; +} + +static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0); + if (!retval) + inc_nlink(dir); + return retval; +} + +static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname) +{ + struct inode *inode; + int error = -ENOSPC; + + inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); + if (inode) { + int l = strlen(symname)+1; + error = page_symlink(inode, symname, l); + if (!error) { + d_instantiate(dentry, inode); + dget(dentry); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + } else + iput(inode); + } + return error; +} + +static const struct inode_operations ramfs_dir_inode_operations = { + .create = ramfs_create, + .lookup = simple_lookup, + .link = simple_link, + .unlink = simple_unlink, + .symlink = ramfs_symlink, + .mkdir = ramfs_mkdir, + .rmdir = simple_rmdir, + .mknod = ramfs_mknod, + .rename = simple_rename, +}; + +static const struct super_operations ramfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .show_options = generic_show_options, +}; + +struct ramfs_mount_opts { + umode_t mode; +}; + +enum { + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct ramfs_fs_info { + struct ramfs_mount_opts mount_opts; +}; + +static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + char *p; + + opts->mode = RAMFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* + * We might like to report bad mount options here; + * but traditionally ramfs has ignored all mount options, + * and as it is used as a !CONFIG_SHMEM simple substitute + * for tmpfs, better continue to ignore other mount options. 
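+ * For example, mounting with "-o mode=0770,someopt=1" still succeeds:
+ * mode= is honoured (masked with S_IALLUGO) and the unknown option is
+ * silently dropped.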
+ */ + } + } + + return 0; +} + +int ramfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ramfs_fs_info *fsi; + struct inode *inode = NULL; + struct dentry *root; + int err; + + save_mount_options(sb, data); + + fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; + if (!fsi) { + err = -ENOMEM; + goto fail; + } + + err = ramfs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RAMFS_MAGIC; + sb->s_op = &ramfs_ops; + sb->s_time_gran = 1; + + inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); + if (!inode) { + err = -ENOMEM; + goto fail; + } + + root = d_alloc_root(inode); + sb->s_root = root; + if (!root) { + err = -ENOMEM; + goto fail; + } + + return 0; +fail: + kfree(fsi); + sb->s_fs_info = NULL; + iput(inode); + return err; +} + +struct dentry *ramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, ramfs_fill_super); +} + +static struct dentry *rootfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super); +} + +static void ramfs_kill_sb(struct super_block *sb) +{ + kfree(sb->s_fs_info); + kill_litter_super(sb); +} + +static struct file_system_type ramfs_fs_type = { + .name = "ramfs", + .mount = ramfs_mount, + .kill_sb = ramfs_kill_sb, +}; +static struct file_system_type rootfs_fs_type = { + .name = "rootfs", + .mount = rootfs_mount, + .kill_sb = kill_litter_super, +}; + +static int __init init_ramfs_fs(void) +{ + return register_filesystem(&ramfs_fs_type); +} + +static void __exit exit_ramfs_fs(void) +{ + unregister_filesystem(&ramfs_fs_type); +} + +module_init(init_ramfs_fs) +module_exit(exit_ramfs_fs) + +int __init init_rootfs(void) +{ + int err; + + err = bdi_init(&ramfs_backing_dev_info); + if (err) + return err; + + err = register_filesystem(&rootfs_fs_type); + if (err) + bdi_destroy(&ramfs_backing_dev_info); + + return err; +} + +MODULE_LICENSE("GPL"); diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h new file mode 100644 index 00000000..6b330639 --- /dev/null +++ b/fs/ramfs/internal.h @@ -0,0 +1,14 @@ +/* internal.h: ramfs internal definitions + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +extern const struct address_space_operations ramfs_aops; +extern const struct inode_operations ramfs_file_inode_operations; diff --git a/fs/read_write.c b/fs/read_write.c new file mode 100644 index 00000000..5520f8ad --- /dev/null +++ b/fs/read_write.c @@ -0,0 +1,948 @@ +/* + * linux/fs/read_write.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "read_write.h" + +#include +#include + +const struct file_operations generic_ro_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .mmap = generic_file_readonly_mmap, + .splice_read = generic_file_splice_read, +}; + +EXPORT_SYMBOL(generic_ro_fops); + +static inline int unsigned_offsets(struct file *file) +{ + return file->f_mode & FMODE_UNSIGNED_OFFSET; +} + +/** + * generic_file_llseek_unlocked - lockless generic llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * Updates the file offset to the value specified by @offset and @origin. + * Locking must be provided by the caller. + */ +loff_t +generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + + switch (origin) { + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) + return file->f_pos; + offset += file->f_pos; + break; + } + + if (offset < 0 && !unsigned_offsets(file)) + return -EINVAL; + if (offset > inode->i_sb->s_maxbytes) + return -EINVAL; + + /* Special lock needed here? */ + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + + return offset; +} +EXPORT_SYMBOL(generic_file_llseek_unlocked); + +/** + * generic_file_llseek - generic llseek implementation for regular files + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * This is a generic implemenation of ->llseek useable for all normal local + * filesystems. It just updates the file offset to the value specified by + * @offset and @origin under i_mutex. + */ +loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t rval; + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + rval = generic_file_llseek_unlocked(file, offset, origin); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + + return rval; +} +EXPORT_SYMBOL(generic_file_llseek); + +/** + * noop_llseek - No Operation Performed llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * This is an implementation of ->llseek useable for the rare special case when + * userspace expects the seek to succeed but the (device) file is actually not + * able to perform the seek. In this case you use noop_llseek() instead of + * falling back to the default implementation of ->llseek. 
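+ * For example, a driver for a device with no meaningful file position can
+ * set .llseek = noop_llseek in its file_operations; lseek() then appears to
+ * succeed while f_pos is left untouched.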
+ */ +loff_t noop_llseek(struct file *file, loff_t offset, int origin) +{ + return file->f_pos; +} +EXPORT_SYMBOL(noop_llseek); + +loff_t no_llseek(struct file *file, loff_t offset, int origin) +{ + return -ESPIPE; +} +EXPORT_SYMBOL(no_llseek); + +loff_t default_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t retval; + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += i_size_read(file->f_path.dentry->d_inode); + break; + case SEEK_CUR: + if (offset == 0) { + retval = file->f_pos; + goto out; + } + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 || unsigned_offsets(file)) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + retval = offset; + } +out: + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return retval; +} +EXPORT_SYMBOL(default_llseek); + +loff_t vfs_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t (*fn)(struct file *, loff_t, int); + + fn = no_llseek; + if (file->f_mode & FMODE_LSEEK) { + if (file->f_op && file->f_op->llseek) + fn = file->f_op->llseek; + } + return fn(file, offset, origin); +} +EXPORT_SYMBOL(vfs_llseek); + +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) +{ + off_t retval; + struct file * file; + int fput_needed; + + retval = -EBADF; + file = fget_light(fd, &fput_needed); + if (!file) + goto bad; + + retval = -EINVAL; + if (origin <= SEEK_MAX) { + loff_t res = vfs_llseek(file, offset, origin); + retval = res; + if (res != (loff_t)retval) + retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ + } + fput_light(file, fput_needed); +bad: + return retval; +} + +#ifdef __ARCH_WANT_SYS_LLSEEK +SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, + unsigned long, offset_low, loff_t __user *, result, + unsigned int, origin) +{ + int retval; + struct file * file; + loff_t offset; + int fput_needed; + + retval = -EBADF; + file = fget_light(fd, &fput_needed); + if (!file) + goto bad; + + retval = -EINVAL; + if (origin > SEEK_MAX) + goto out_putf; + + offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low, + origin); + + retval = (int)offset; + if (offset >= 0) { + retval = -EFAULT; + if (!copy_to_user(result, &offset, sizeof(offset))) + retval = 0; + } +out_putf: + fput_light(file, fput_needed); +bad: + return retval; +} +#endif + + +/* + * rw_verify_area doesn't like huge counts. We limit + * them to something that fits in "int" so that others + * won't have to do range checks all the time. + */ +int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) +{ + struct inode *inode; + loff_t pos; + int retval = -EINVAL; + + inode = file->f_path.dentry->d_inode; + if (unlikely((ssize_t) count < 0)) + return retval; + pos = *ppos; + if (unlikely(pos < 0)) { + if (!unsigned_offsets(file)) + return retval; + if (count >= -pos) /* both values are in 0..LLONG_MAX */ + return -EOVERFLOW; + } else if (unlikely((loff_t) (pos + count) < 0)) { + if (!unsigned_offsets(file)) + return retval; + } + + if (unlikely(inode->i_flock && mandatory_lock(inode))) { + retval = locks_mandatory_area( + read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, + inode, file, pos, count); + if (retval < 0) + return retval; + } + retval = security_file_permission(file, + read_write == READ ? MAY_READ : MAY_WRITE); + if (retval) + return retval; + return count > MAX_RW_COUNT ? 
MAX_RW_COUNT : count; +} + +static void wait_on_retry_sync_kiocb(struct kiocb *iocb) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + if (!kiocbIsKicked(iocb)) + schedule(); + else + kiocbClearKicked(iocb); + __set_current_state(TASK_RUNNING); +} + +ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = len }; + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_left = len; + kiocb.ki_nbytes = len; + + for (;;) { + ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); + if (ret != -EIOCBRETRY) + break; + wait_on_retry_sync_kiocb(&kiocb); + } + + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(do_sync_read); + +ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) +{ + ssize_t ret; + + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) + return -EINVAL; + if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) + return -EFAULT; + + ret = rw_verify_area(READ, file, pos, count); + if (ret >= 0) { + count = ret; + if (file->f_op->read) + ret = file->f_op->read(file, buf, count, pos); + else + ret = do_sync_read(file, buf, count, pos); + if (ret > 0) { + fsnotify_access(file); + add_rchar(current, ret); + } + inc_syscr(current); + } + + return ret; +} + +EXPORT_SYMBOL(vfs_read); + +ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_left = len; + kiocb.ki_nbytes = len; + + for (;;) { + ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); + if (ret != -EIOCBRETRY) + break; + wait_on_retry_sync_kiocb(&kiocb); + } + + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(do_sync_write); + +ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) +{ + ssize_t ret; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return -EINVAL; + if (unlikely(!access_ok(VERIFY_READ, buf, count))) + return -EFAULT; + + ret = rw_verify_area(WRITE, file, pos, count); + if (ret >= 0) { + count = ret; + if (file->f_op->write) + ret = file->f_op->write(file, buf, count, pos); + else + ret = do_sync_write(file, buf, count, pos); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + } + + return ret; +} + +EXPORT_SYMBOL(vfs_write); + +static inline loff_t file_pos_read(struct file *file) +{ + return file->f_pos; +} + +static inline void file_pos_write(struct file *file, loff_t pos) +{ + file->f_pos = pos; +} + +SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = file_pos_read(file); + ret = vfs_read(file, buf, count, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + + return ret; +} + +SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, + size_t, count) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = 
file_pos_read(file); + ret = vfs_write(file, buf, count, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + + return ret; +} + +SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, + size_t count, loff_t pos) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = vfs_read(file, buf, count, &pos); + fput_light(file, fput_needed); + } + + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos) +{ + return SYSC_pread64((unsigned int) fd, (char __user *) buf, + (size_t) count, pos); +} +SYSCALL_ALIAS(sys_pread64, SyS_pread64); +#endif + +SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, + size_t count, loff_t pos) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = vfs_write(file, buf, count, &pos); + fput_light(file, fput_needed); + } + + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos) +{ + return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf, + (size_t) count, pos); +} +SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64); +#endif + +/* + * Reduce an iovec's length in-place. Return the resulting number of segments + */ +unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) +{ + unsigned long seg = 0; + size_t len = 0; + + while (seg < nr_segs) { + seg++; + if (len + iov->iov_len >= to) { + iov->iov_len = to - len; + break; + } + len += iov->iov_len; + iov++; + } + return seg; +} +EXPORT_SYMBOL(iov_shorten); + +ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_left = len; + kiocb.ki_nbytes = len; + + for (;;) { + ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); + if (ret != -EIOCBRETRY) + break; + wait_on_retry_sync_kiocb(&kiocb); + } + + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +/* Do it by hand, with file-ops */ +ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, + unsigned long nr_segs, loff_t *ppos, io_fn_t fn) +{ + struct iovec *vector = iov; + ssize_t ret = 0; + + while (nr_segs > 0) { + void __user *base; + size_t len; + ssize_t nr; + + base = vector->iov_base; + len = vector->iov_len; + vector++; + nr_segs--; + + nr = fn(filp, base, len, ppos); + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != len) + break; + } + + return ret; +} + +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) + +ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, + unsigned long nr_segs, unsigned long fast_segs, + struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + unsigned long seg; + ssize_t ret; + struct iovec *iov = fast_pointer; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... 
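+ * readv(fd, iov, 0) therefore returns 0 here, without the iovec array even
+ * being copied in from user space.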
+ */ + if (nr_segs == 0) { + ret = 0; + goto out; + } + + /* + * First get the "struct iovec" from user memory and + * verify all the pointers + */ + if (nr_segs > UIO_MAXIOV) { + ret = -EINVAL; + goto out; + } + if (nr_segs > fast_segs) { + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) { + ret = -ENOMEM; + goto out; + } + } + if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { + ret = -EFAULT; + goto out; + } + + /* + * According to the Single Unix Specification we should return EINVAL + * if an element length is < 0 when cast to ssize_t or if the + * total length would overflow the ssize_t return value of the + * system call. + * + * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the + * overflow case. + */ + ret = 0; + for (seg = 0; seg < nr_segs; seg++) { + void __user *buf = iov[seg].iov_base; + ssize_t len = (ssize_t)iov[seg].iov_len; + + /* see if we we're about to use an invalid len or if + * it's about to overflow ssize_t */ + if (len < 0) { + ret = -EINVAL; + goto out; + } + if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { + ret = -EFAULT; + goto out; + } + if (len > MAX_RW_COUNT - ret) { + len = MAX_RW_COUNT - ret; + iov[seg].iov_len = len; + } + ret += len; + } +out: + *ret_pointer = iov; + return ret; +} + +static ssize_t do_readv_writev(int type, struct file *file, + const struct iovec __user * uvector, + unsigned long nr_segs, loff_t *pos) +{ + size_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + ssize_t ret; + io_fn_t fn; + iov_fn_t fnv; + + if (!file->f_op) { + ret = -EINVAL; + goto out; + } + + ret = rw_copy_check_uvector(type, uvector, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (ret <= 0) + goto out; + + tot_len = ret; + ret = rw_verify_area(type, file, pos, tot_len); + if (ret < 0) + goto out; + + fnv = NULL; + if (type == READ) { + fn = file->f_op->read; + fnv = file->f_op->aio_read; + } else { + fn = (io_fn_t)file->f_op->write; + fnv = file->f_op->aio_write; + } + + if (fnv) + ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, + pos, fnv); + else + ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + +out: + if (iov != iovstack) + kfree(iov); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file); + else + fsnotify_modify(file); + } + return ret; +} + +ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) + return -EINVAL; + + return do_readv_writev(READ, file, vec, vlen, pos); +} + +EXPORT_SYMBOL(vfs_readv); + +ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) + return -EINVAL; + + return do_readv_writev(WRITE, file, vec, vlen, pos); +} + +EXPORT_SYMBOL(vfs_writev); + +SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = file_pos_read(file); + ret = vfs_readv(file, vec, vlen, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, 
vec, + unsigned long, vlen) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = file_pos_read(file); + ret = vfs_writev(file, vec, vlen, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) +{ +#define HALF_LONG_BITS (BITS_PER_LONG / 2) + return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; +} + +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = vfs_readv(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + + if (pos < 0) + return -EINVAL; + + file = fget_light(fd, &fput_needed); + if (file) { + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = vfs_writev(file, vec, vlen, &pos); + fput_light(file, fput_needed); + } + + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, + size_t count, loff_t max) +{ + struct file * in_file, * out_file; + struct inode * in_inode, * out_inode; + loff_t pos; + ssize_t retval; + int fput_needed_in, fput_needed_out, fl; + + /* + * Get input file, and verify that it is ok.. + */ + retval = -EBADF; + in_file = fget_light(in_fd, &fput_needed_in); + if (!in_file) + goto out; + if (!(in_file->f_mode & FMODE_READ)) + goto fput_in; + retval = -ESPIPE; + if (!ppos) + ppos = &in_file->f_pos; + else + if (!(in_file->f_mode & FMODE_PREAD)) + goto fput_in; + retval = rw_verify_area(READ, in_file, ppos, count); + if (retval < 0) + goto fput_in; + count = retval; + + /* + * Get output file, and verify that it is ok.. + */ + retval = -EBADF; + out_file = fget_light(out_fd, &fput_needed_out); + if (!out_file) + goto fput_in; + if (!(out_file->f_mode & FMODE_WRITE)) + goto fput_out; + retval = -EINVAL; + in_inode = in_file->f_path.dentry->d_inode; + out_inode = out_file->f_path.dentry->d_inode; + retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); + if (retval < 0) + goto fput_out; + count = retval; + + if (!max) + max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); + + pos = *ppos; + if (unlikely(pos + count > max)) { + retval = -EOVERFLOW; + if (pos >= max) + goto fput_out; + count = max - pos; + } + + fl = 0; +#if 0 + /* + * We need to debate whether we can enable this or not. The + * man page documents EAGAIN return for the output at least, + * and the application is arguably buggy if it doesn't expect + * EAGAIN on a non-blocking file descriptor. 
+ */ + if (in_file->f_flags & O_NONBLOCK) + fl = SPLICE_F_NONBLOCK; +#endif + retval = do_splice_direct(in_file, ppos, out_file, count, fl); + + if (retval > 0) { + add_rchar(current, retval); + add_wchar(current, retval); + } + + inc_syscr(current); + inc_syscw(current); + if (*ppos > max) + retval = -EOVERFLOW; + +fput_out: + fput_light(out_file, fput_needed_out); +fput_in: + fput_light(in_file, fput_needed_in); +out: + return retval; +} + +SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) +{ + loff_t pos; + off_t off; + ssize_t ret; + + if (offset) { + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) +{ + loff_t pos; + ssize_t ret; + + if (offset) { + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) + return -EFAULT; + ret = do_sendfile(out_fd, in_fd, &pos, count, 0); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} diff --git a/fs/read_write.h b/fs/read_write.h new file mode 100644 index 00000000..d07b954c --- /dev/null +++ b/fs/read_write.h @@ -0,0 +1,14 @@ +/* + * This file is only for sharing some helpers from read_write.c with compat.c. + * Don't use anywhere else. + */ + + +typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); +typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + +ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn); +ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, + unsigned long nr_segs, loff_t *ppos, io_fn_t fn); diff --git a/fs/readdir.c b/fs/readdir.c new file mode 100644 index 00000000..356f7152 --- /dev/null +++ b/fs/readdir.c @@ -0,0 +1,311 @@ +/* + * linux/fs/readdir.c + * + * Copyright (C) 1995 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +int vfs_readdir(struct file *file, filldir_t filler, void *buf) +{ + struct inode *inode = file->f_path.dentry->d_inode; + int res = -ENOTDIR; + if (!file->f_op || !file->f_op->readdir) + goto out; + + res = security_file_permission(file, MAY_READ); + if (res) + goto out; + + res = mutex_lock_killable(&inode->i_mutex); + if (res) + goto out; + + res = -ENOENT; + if (!IS_DEADDIR(inode)) { + res = file->f_op->readdir(file, buf, filler); + file_accessed(file); + } + mutex_unlock(&inode->i_mutex); +out: + return res; +} + +EXPORT_SYMBOL(vfs_readdir); + +/* + * Traditional linux readdir() handling.. + * + * "count=1" is a special case, meaning that the buffer is one + * dirent-structure in size and that the code can't handle more + * anyway. Thus the special "fillonedir()" function for that + * case (the low-level handlers don't need to care about this). 
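+ * The effect is that each old_readdir() call returns at most one entry:
+ * fillonedir() bumps buf->result after filling it and rejects any further
+ * entries with -EINVAL.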
+ */ + +#ifdef __ARCH_WANT_OLD_READDIR + +struct old_linux_dirent { + unsigned long d_ino; + unsigned long d_offset; + unsigned short d_namlen; + char d_name[1]; +}; + +struct readdir_callback { + struct old_linux_dirent __user * dirent; + int result; +}; + +static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct readdir_callback * buf = (struct readdir_callback *) __buf; + struct old_linux_dirent __user * dirent; + unsigned long d_ino; + + if (buf->result) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, + (unsigned long)(dirent->d_name + namlen + 1) - + (unsigned long)dirent)) + goto efault; + if ( __put_user(d_ino, &dirent->d_ino) || + __put_user(offset, &dirent->d_offset) || + __put_user(namlen, &dirent->d_namlen) || + __copy_to_user(dirent->d_name, name, namlen) || + __put_user(0, dirent->d_name + namlen)) + goto efault; + return 0; +efault: + buf->result = -EFAULT; + return -EFAULT; +} + +SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct old_linux_dirent __user *, dirent, unsigned int, count) +{ + int error; + struct file * file; + struct readdir_callback buf; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.result = 0; + buf.dirent = dirent; + + error = vfs_readdir(file, fillonedir, &buf); + if (buf.result) + error = buf.result; + + fput(file); +out: + return error; +} + +#endif /* __ARCH_WANT_OLD_READDIR */ + +/* + * New, all-improved, singing, dancing, iBCS2-compliant getdents() + * interface. + */ +struct linux_dirent { + unsigned long d_ino; + unsigned long d_off; + unsigned short d_reclen; + char d_name[1]; +}; + +struct getdents_callback { + struct linux_dirent __user * current_dir; + struct linux_dirent __user * previous; + int count; + int error; +}; + +static int filldir(void * __buf, const char * name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct linux_dirent __user * dirent; + struct getdents_callback * buf = (struct getdents_callback *) __buf; + unsigned long d_ino; + int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, + sizeof(long)); + + buf->error = -EINVAL; /* only used if we fail.. 
*/ + if (reclen > buf->count) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(d_ino, &dirent->d_ino)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +SYSCALL_DEFINE3(getdents, unsigned int, fd, + struct linux_dirent __user *, dirent, unsigned int, count) +{ + struct file * file; + struct linux_dirent __user * lastdirent; + struct getdents_callback buf; + int error; + + error = -EFAULT; + if (!access_ok(VERIFY_WRITE, dirent, count)) + goto out; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.current_dir = dirent; + buf.previous = NULL; + buf.count = count; + buf.error = 0; + + error = vfs_readdir(file, filldir, &buf); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + if (put_user(file->f_pos, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fput(file); +out: + return error; +} + +struct getdents_callback64 { + struct linux_dirent64 __user * current_dir; + struct linux_dirent64 __user * previous; + int count; + int error; +}; + +static int filldir64(void * __buf, const char * name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct linux_dirent64 __user *dirent; + struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; + int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, + sizeof(u64)); + + buf->error = -EINVAL; /* only used if we fail.. 
*/ + if (reclen > buf->count) + return -EINVAL; + dirent = buf->previous; + if (dirent) { + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(ino, &dirent->d_ino)) + goto efault; + if (__put_user(0, &dirent->d_off)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (__put_user(d_type, &dirent->d_type)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) +{ + struct file * file; + struct linux_dirent64 __user * lastdirent; + struct getdents_callback64 buf; + int error; + + error = -EFAULT; + if (!access_ok(VERIFY_WRITE, dirent, count)) + goto out; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + buf.current_dir = dirent; + buf.previous = NULL; + buf.count = count; + buf.error = 0; + + error = vfs_readdir(file, filldir64, &buf); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + typeof(lastdirent->d_off) d_off = file->f_pos; + if (__put_user(d_off, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fput(file); +out: + return error; +} diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig new file mode 100644 index 00000000..7cd46666 --- /dev/null +++ b/fs/reiserfs/Kconfig @@ -0,0 +1,88 @@ +config REISERFS_FS + tristate "Reiserfs support" + select CRC32 + help + Stores not just filenames but the files themselves in a balanced + tree. Uses journalling. + + Balanced trees are more efficient than traditional file system + architectural foundations. + + In general, ReiserFS is as fast as ext2, but is very efficient with + large directories and small files. Additional patches are needed + for NFS and quotas, please see + for links. + + It is more easily extended to have features currently found in + database and keyword search systems than block allocation based file + systems are. The next version will be so extended, and will support + plugins consistent with our motto ``It takes more than a license to + make source code open.'' + + Read + to learn more about reiserfs. + + Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. + + If you like it, you can pay us to add new features to it that you + need, buy a support contract, or pay us to port it to another OS. + +config REISERFS_CHECK + bool "Enable reiserfs debug mode" + depends on REISERFS_FS + help + If you set this to Y, then ReiserFS will perform every check it can + possibly imagine of its internal consistency throughout its + operation. It will also go substantially slower. More than once we + have forgotten that this was on, and then gone despondent over the + latest benchmarks.:-) Use of this option allows our team to go all + out in checking for consistency when debugging without fear of its + effect on end users. If you are on the verge of sending in a bug + report, say Y and you might get a useful error message. Almost + everyone should say N. 
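+# A hypothetical .config fragment (assuming the usual .config syntax) for a
+# debug build that enables the checks described above:
+#   CONFIG_REISERFS_FS=y
+#   CONFIG_REISERFS_CHECK=y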
+ +config REISERFS_PROC_INFO + bool "Stats in /proc/fs/reiserfs" + depends on REISERFS_FS && PROC_FS + help + Create under /proc/fs/reiserfs a hierarchy of files, displaying + various ReiserFS statistics and internal data at the expense of + making your kernel or module slightly larger (+8 KB). This also + increases the amount of kernel memory required for each mount. + Almost everyone but ReiserFS developers and people fine-tuning + reiserfs or tracing problems should say N. + +config REISERFS_FS_XATTR + bool "ReiserFS extended attributes" + depends on REISERFS_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config REISERFS_FS_POSIX_ACL + bool "ReiserFS POSIX Access Control Lists" + depends on REISERFS_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config REISERFS_FS_SECURITY + bool "ReiserFS Security Labels" + depends on REISERFS_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ReiserFS filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile new file mode 100644 index 00000000..3c3b0016 --- /dev/null +++ b/fs/reiserfs/Makefile @@ -0,0 +1,38 @@ +# +# Makefile for the linux reiser-filesystem routines. +# + +obj-$(CONFIG_REISERFS_FS) += reiserfs.o + +reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ + super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ + hashes.o tail_conversion.o journal.o resize.o \ + item_ops.o ioctl.o xattr.o lock.o + +ifeq ($(CONFIG_REISERFS_PROC_INFO),y) +reiserfs-objs += procfs.o +endif + +ifeq ($(CONFIG_REISERFS_FS_XATTR),y) +reiserfs-objs += xattr_user.o xattr_trusted.o +endif + +ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) +reiserfs-objs += xattr_security.o +endif + +ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) +reiserfs-objs += xattr_acl.o +endif + +# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline +# functions are used. This causes the compiler to advance the stack +# pointer out of the available stack space, corrupting kernel space, +# and causing a panic. Since this behavior only affects ppc32, this ifeq +# will work around it. If any other architecture displays this behavior, +# add it here. +ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1) + +TAGS: + etags *.c + diff --git a/fs/reiserfs/README b/fs/reiserfs/README new file mode 100644 index 00000000..e2f7a264 --- /dev/null +++ b/fs/reiserfs/README @@ -0,0 +1,161 @@ +[LICENSING] + +ReiserFS is hereby licensed under the GNU General +Public License version 2. + +Source code files that contain the phrase "licensing governed by +reiserfs/README" are "governed files" throughout this file. Governed +files are licensed under the GPL. The portions of them owned by Hans +Reiser, or authorized to be licensed by him, have been in the past, +and likely will be in the future, licensed to other parties under +other licenses. 
If you add your code to governed files, and don't +want it to be owned by Hans Reiser, put your copyright label on that +code so the poor blight and his customers can keep things straight. +All portions of governed files not labeled otherwise are owned by Hans +Reiser, and by adding your code to it, widely distributing it to +others or sending us a patch, and leaving the sentence in stating that +licensing is governed by the statement in this file, you accept this. +It will be a kindness if you identify whether Hans Reiser is allowed +to license code labeled as owned by you on your behalf other than +under the GPL, because he wants to know if it is okay to do so and put +a check in the mail to you (for non-trivial improvements) when he +makes his next sale. He makes no guarantees as to the amount if any, +though he feels motivated to motivate contributors, and you can surely +discuss this with him before or after contributing. You have the +right to decline to allow him to license your code contribution other +than under the GPL. + +Further licensing options are available for commercial and/or other +interests directly from Hans Reiser: hans@reiser.to. If you interpret +the GPL as not allowing those additional licensing options, you read +it wrongly, and Richard Stallman agrees with me, when carefully read +you can see that those restrictions on additional terms do not apply +to the owner of the copyright, and my interpretation of this shall +govern for this license. + +Finally, nothing in this license shall be interpreted to allow you to +fail to fairly credit me, or to remove my credits, without my +permission, unless you are an end user not redistributing to others. +If you have doubts about how to properly do that, or about what is +fair, ask. (Last I spoke with him Richard was contemplating how best +to address the fair crediting issue in the next GPL version.) + +[END LICENSING] + +Reiserfs is a file system based on balanced tree algorithms, which is +described at https://reiser4.wiki.kernel.org/index.php/Main_Page + +Stop reading here. Go there, then return. + +Send bug reports to yura@namesys.botik.ru. + +mkreiserfs and other utilities are in reiserfs/utils, or wherever your +Linux provider put them. There is some disagreement about how useful +it is for users to get their fsck and mkreiserfs out of sync with the +version of reiserfs that is in their kernel, with many important +distributors wanting them out of sync.:-) Please try to remember to +recompile and reinstall fsck and mkreiserfs with every update of +reiserfs, this is a common source of confusion. Note that some of the +utilities cannot be compiled without accessing the balancing code +which is in the kernel code, and relocating the utilities may require +you to specify where that code can be found. + +Yes, if you update your reiserfs kernel module you do have to +recompile your kernel, most of the time. The errors you get will be +quite cryptic if your forget to do so. + +Real users, as opposed to folks who want to hack and then understand +what went wrong, will want REISERFS_CHECK off. + +Hideous Commercial Pitch: Spread your development costs across other OS +vendors. Select from the best in the world, not the best in your +building, by buying from third party OS component suppliers. Leverage +the software component development power of the internet. 
Be the most +aggressive in taking advantage of the commercial possibilities of +decentralized internet development, and add value through your branded +integration that you sell as an operating system. Let your competitors +be the ones to compete against the entire internet by themselves. Be +hip, get with the new economic trend, before your competitors do. Send +email to hans@reiser.to. + +To understand the code, after reading the website, start reading the +code by reading reiserfs_fs.h first. + +Hans Reiser was the project initiator, primary architect, source of all +funding for the first 5.5 years, and one of the programmers. He owns +the copyright. + +Vladimir Saveljev was one of the programmers, and he worked long hours +writing the cleanest code. He always made the effort to be the best he +could be, and to make his code the best that it could be. What resulted +was quite remarkable. I don't think that money can ever motivate someone +to work the way he did, he is one of the most selfless men I know. + +Yura helps with benchmarking, coding hashes, and block pre-allocation +code. + +Anatoly Pinchuk is a former member of our team who worked closely with +Vladimir throughout the project's development. He wrote a quite +substantial portion of the total code. He realized that there was a +space problem with packing tails of files for files larger than a node +that start on a node aligned boundary (there are reasons to want to node +align files), and he invented and implemented indirect items and +unformatted nodes as the solution. + +Konstantin Shvachko, with the help of the Russian version of a VC, +tried to put me in a position where I was forced into giving control +of the project to him. (Fortunately, as the person paying the money +for all salaries from my dayjob I owned all copyrights, and you can't +really force takeovers of sole proprietorships.) This was something +curious, because he never really understood the value of our project, +why we should do what we do, or why innovation was possible in +general, but he was sure that he ought to be controlling it. Every +innovation had to be forced past him while he was with us. He added +two years to the time required to complete reiserfs, and was a net +loss for me. Mikhail Gilula was a brilliant innovator who also left +in a destructive way that erased the value of his contributions, and +that he was shown much generosity just makes it more painful. + +Grigory Zaigralin was an extremely effective system administrator for +our group. + +Igor Krasheninnikov was wonderful at hardware procurement, repair, and +network installation. + +Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a +textbook he got the algorithm from in the code. Note that his analysis +of how we could use the hashing code in making 32 bit NFS cookies work +was probably more important than the actual algorithm. Colin Plumb also +contributed to it. + +Chris Mason dived right into our code, and in just a few months produced +the journaling code that dramatically increased the value of ReiserFS. +He is just an amazing programmer. + +Igor Zagorovsky is writing much of the new item handler and extent code +for our next major release. + +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the +resizer, and is hard at work on implementing allocate on flush. SGI +implemented allocate on flush before us for XFS, and generously took +the time to convince me we should do it also. They are great people, +and a great company. 
+ +Yuri Shevchuk and Nikita Danilov are doing squid cache optimization. + +Vitaly Fertman is doing fsck. + +Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably +the endian safe patches which allow ReiserFS to run on any platform +supported by the Linux kernel. + +SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the +Alpha PC Company made it possible for me to not have a day job +anymore, and to dramatically increase our staffing. Ecila funded +hypertext feature development, MP3.com funded journaling, SuSE funded +core development, IntegratedLinux.com funded squid web cache +appliances, bigstorage.com funded HSM, and the alpha PC company funded +the alpha port. Many of these tasks were helped by sponsors other +than the ones just named. SuSE has helped in much more than just +funding.... + diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c new file mode 100644 index 00000000..483442e6 --- /dev/null +++ b/fs/reiserfs/bitmap.c @@ -0,0 +1,1300 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ +/* Reiserfs block (de)allocator, bitmap-based. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PREALLOCATION_SIZE 9 + +/* different reiserfs block allocator options */ + +#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits) + +#define _ALLOC_concentrating_formatted_nodes 0 +#define _ALLOC_displacing_large_files 1 +#define _ALLOC_displacing_new_packing_localities 2 +#define _ALLOC_old_hashed_relocation 3 +#define _ALLOC_new_hashed_relocation 4 +#define _ALLOC_skip_busy 5 +#define _ALLOC_displace_based_on_dirid 6 +#define _ALLOC_hashed_formatted_nodes 7 +#define _ALLOC_old_way 8 +#define _ALLOC_hundredth_slices 9 +#define _ALLOC_dirid_groups 10 +#define _ALLOC_oid_groups 11 +#define _ALLOC_packing_groups 12 + +#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s)) +#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s)) +#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s)) + +#define SET_OPTION(optname) \ + do { \ + reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \ + set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \ + } while(0) +#define TEST_OPTION(optname, s) \ + test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) + +static inline void get_bit_address(struct super_block *s, + b_blocknr_t block, + unsigned int *bmap_nr, + unsigned int *offset) +{ + /* It is in the bitmap block number equal to the block + * number divided by the number of bits in a block. */ + *bmap_nr = block >> (s->s_blocksize_bits + 3); + /* Within that bitmap block it is located at bit offset *offset. */ + *offset = block & ((s->s_blocksize << 3) - 1); +} + +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) +{ + unsigned int bmap, offset; + unsigned int bmap_count = reiserfs_bmap_count(s); + + if (block == 0 || block >= SB_BLOCK_COUNT(s)) { + reiserfs_error(s, "vs-4010", + "block number is out of range %lu (%u)", + block, SB_BLOCK_COUNT(s)); + return 0; + } + + get_bit_address(s, block, &bmap, &offset); + + /* Old format filesystem? Unlikely, but the bitmaps are all up front so + * we need to account for it. 
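+ * (In the old format the bitmap blocks sit immediately after the super
+ * block, i.e. from s_sbh->b_blocknr + 1 onwards; the range check below
+ * refuses to free or reuse any of them.)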
*/ + if (unlikely(test_bit(REISERFS_OLD_FORMAT, + &(REISERFS_SB(s)->s_properties)))) { + b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; + if (block >= bmap1 && + block <= bmap1 + bmap_count) { + reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) " + "can't be freed or reused", + block, bmap_count); + return 0; + } + } else { + if (offset == 0) { + reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) " + "can't be freed or reused", + block, bmap_count); + return 0; + } + } + + if (bmap >= bmap_count) { + reiserfs_error(s, "vs-4030", "bitmap for requested block " + "is out of range: block=%lu, bitmap_nr=%u", + block, bmap); + return 0; + } + + if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) { + reiserfs_error(s, "vs-4050", "this is root block (%u), " + "it must be busy", SB_ROOT_BLOCK(s)); + return 0; + } + + return 1; +} + +/* searches in journal structures for a given block number (bmap, off). If block + is found in reiserfs journal it suggests next free block candidate to test. */ +static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, + int off, int *next) +{ + b_blocknr_t tmp; + + if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) { + if (tmp) { /* hint supplied */ + *next = tmp; + PROC_INFO_INC(s, scan_bitmap.in_journal_hint); + } else { + (*next) = off + 1; /* inc offset to avoid looping. */ + PROC_INFO_INC(s, scan_bitmap.in_journal_nohint); + } + PROC_INFO_INC(s, scan_bitmap.retry); + return 1; + } + return 0; +} + +/* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap + * block; */ +static int scan_bitmap_block(struct reiserfs_transaction_handle *th, + unsigned int bmap_n, int *beg, int boundary, + int min, int max, int unfm) +{ + struct super_block *s = th->t_super; + struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n]; + struct buffer_head *bh; + int end, next; + int org = *beg; + + BUG_ON(!th->t_trans_id); + + RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " + "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); + PROC_INFO_INC(s, scan_bitmap.bmap); +/* this is unclear and lacks comments, explain how journal bitmaps + work here for the reader. Convey a sense of the design here. What + is a window? */ +/* - I mean `a window of zero bits' as in description of this function - Zam. 
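+ * (i.e. a run of consecutive clear bits in this bitmap block, at least
+ * `min' and at most `max' bits long, located between *beg and `boundary';
+ * the loop below finds such a run and marks it allocated.)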
*/ + + if (!bi) { + reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer " + "for bitmap %d", bmap_n); + return 0; + } + + bh = reiserfs_read_bitmap_block(s, bmap_n); + if (bh == NULL) + return 0; + + while (1) { + cont: + if (bi->free_count < min) { + brelse(bh); + return 0; // No free blocks in this bitmap + } + + /* search for a first zero bit -- beginning of a window */ + *beg = reiserfs_find_next_zero_le_bit + ((unsigned long *)(bh->b_data), boundary, *beg); + + if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block + * cannot contain a zero window of minimum size */ + brelse(bh); + return 0; + } + + if (unfm && is_block_in_journal(s, bmap_n, *beg, beg)) + continue; + /* first zero bit found; we check next bits */ + for (end = *beg + 1;; end++) { + if (end >= *beg + max || end >= boundary + || reiserfs_test_le_bit(end, bh->b_data)) { + next = end; + break; + } + /* finding the other end of zero bit window requires looking into journal structures (in + * case of searching for free blocks for unformatted nodes) */ + if (unfm && is_block_in_journal(s, bmap_n, end, &next)) + break; + } + + /* now (*beg) points to beginning of zero bits window, + * (end) points to one bit after the window end */ + if (end - *beg >= min) { /* it seems we have found window of proper size */ + int i; + reiserfs_prepare_for_journal(s, bh, 1); + /* try to set all blocks used checking are they still free */ + for (i = *beg; i < end; i++) { + /* It seems that we should not check in journal again. */ + if (reiserfs_test_and_set_le_bit + (i, bh->b_data)) { + /* bit was set by another process + * while we slept in prepare_for_journal() */ + PROC_INFO_INC(s, scan_bitmap.stolen); + if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks, + * if length of this set is more or equal to `min' */ + end = i; + break; + } + /* otherwise we clear all bit were set ... */ + while (--i >= *beg) + reiserfs_test_and_clear_le_bit + (i, bh->b_data); + reiserfs_restore_prepared_buffer(s, bh); + *beg = org; + /* ... and search again in current block from beginning */ + goto cont; + } + } + bi->free_count -= (end - *beg); + journal_mark_dirty(th, s, bh); + brelse(bh); + + /* free block count calculation */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + + return end - (*beg); + } else { + *beg = next; + } + } +} + +static int bmap_hash_id(struct super_block *s, u32 id) +{ + char *hash_in = NULL; + unsigned long hash; + unsigned bm; + + if (id <= 2) { + bm = 1; + } else { + hash_in = (char *)(&id); + hash = keyed_hash(hash_in, 4); + bm = hash % reiserfs_bmap_count(s); + if (!bm) + bm = 1; + } + /* this can only be true when SB_BMAP_NR = 1 */ + if (bm >= reiserfs_bmap_count(s)) + bm = 0; + return bm; +} + +/* + * hashes the id and then returns > 0 if the block group for the + * corresponding hash is full + */ +static inline int block_group_used(struct super_block *s, u32 id) +{ + int bm = bmap_hash_id(s, id); + struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm]; + + /* If we don't have cached information on this bitmap block, we're + * going to have to load it later anyway. Loading it here allows us + * to make a better decision. This favors long-term performance gain + * with a better on-disk layout vs. a short term gain of skipping the + * read and potentially having a bad placement. 
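+ * (As a worked example, with 4 KiB blocks the test further down reports the
+ * group as full unless more than 4096 * 8 * 60 / 100 = 19660 of its 32768
+ * blocks are still free.)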
*/ + if (info->free_count == UINT_MAX) { + struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm); + brelse(bh); + } + + if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) { + return 0; + } + return 1; +} + +/* + * the packing is returned in disk byte order + */ +__le32 reiserfs_choose_packing(struct inode * dir) +{ + __le32 packing; + if (TEST_OPTION(packing_groups, dir->i_sb)) { + u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id); + /* + * some versions of reiserfsck expect packing locality 1 to be + * special + */ + if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir)) + packing = INODE_PKEY(dir)->k_objectid; + else + packing = INODE_PKEY(dir)->k_dir_id; + } else + packing = INODE_PKEY(dir)->k_objectid; + return packing; +} + +/* Tries to find contiguous zero bit window (given size) in given region of + * bitmap and place new blocks there. Returns number of allocated blocks. */ +static int scan_bitmap(struct reiserfs_transaction_handle *th, + b_blocknr_t * start, b_blocknr_t finish, + int min, int max, int unfm, sector_t file_block) +{ + int nr_allocated = 0; + struct super_block *s = th->t_super; + /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr + * - Hans, it is not a block number - Zam. */ + + unsigned int bm, off; + unsigned int end_bm, end_off; + unsigned int off_max = s->s_blocksize << 3; + + BUG_ON(!th->t_trans_id); + + PROC_INFO_INC(s, scan_bitmap.call); + if (SB_FREE_BLOCKS(s) <= 0) + return 0; // No point in looking for more free blocks + + get_bit_address(s, *start, &bm, &off); + get_bit_address(s, finish, &end_bm, &end_off); + if (bm > reiserfs_bmap_count(s)) + return 0; + if (end_bm > reiserfs_bmap_count(s)) + end_bm = reiserfs_bmap_count(s); + + /* When the bitmap is more than 10% free, anyone can allocate. + * When it's less than 10% free, only files that already use the + * bitmap are allowed. Once we pass 80% full, this restriction + * is lifted. + * + * We do this so that files that grow later still have space close to + * their original allocation. This improves locality, and presumably + * performance as a result. + * + * This is only an allocation policy and does not make up for getting a + * bad hint. Decent hinting must be implemented for this to work well. 
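+ * + * (Concretely, in the code below: the skip_busy policy is applied only while SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20, and a bitmap block is open to any allocation once its free_count exceeds (s->s_blocksize << 3) / 10, i.e. ten percent of the bits it covers; if that restricted scan finds nothing, the unrestricted scan further below is still tried.)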
+ */ + if (TEST_OPTION(skip_busy, s) + && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) { + for (; bm < end_bm; bm++, off = 0) { + if ((off && (!unfm || (file_block != 0))) + || SB_AP_BITMAP(s)[bm].free_count > + (s->s_blocksize << 3) / 10) + nr_allocated = + scan_bitmap_block(th, bm, &off, off_max, + min, max, unfm); + if (nr_allocated) + goto ret; + } + /* we know from above that start is a reasonable number */ + get_bit_address(s, *start, &bm, &off); + } + + for (; bm < end_bm; bm++, off = 0) { + nr_allocated = + scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); + if (nr_allocated) + goto ret; + } + + nr_allocated = + scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); + + ret: + *start = bm * off_max + off; + return nr_allocated; + +} + +static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs; + struct buffer_head *sbh, *bmbh; + struct reiserfs_bitmap_info *apbi; + unsigned int nr, offset; + + BUG_ON(!th->t_trans_id); + + PROC_INFO_INC(s, free_block); + + rs = SB_DISK_SUPER_BLOCK(s); + sbh = SB_BUFFER_WITH_SB(s); + apbi = SB_AP_BITMAP(s); + + get_bit_address(s, block, &nr, &offset); + + if (nr >= reiserfs_bmap_count(s)) { + reiserfs_error(s, "vs-4075", "block %lu is out of range", + block); + return; + } + + bmbh = reiserfs_read_bitmap_block(s, nr); + if (!bmbh) + return; + + reiserfs_prepare_for_journal(s, bmbh, 1); + + /* clear bit for the given block in bit map */ + if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) { + reiserfs_error(s, "vs-4080", + "block %lu: bit already cleared", block); + } + apbi[nr].free_count++; + journal_mark_dirty(th, s, bmbh); + brelse(bmbh); + + reiserfs_prepare_for_journal(s, sbh, 1); + /* update super block */ + set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); + + journal_mark_dirty(th, s, sbh); + if (for_unformatted) + dquot_free_block_nodirty(inode, 1); +} + +void reiserfs_free_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) +{ + struct super_block *s = th->t_super; + BUG_ON(!th->t_trans_id); + + RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); + if (!is_reusable(s, block, 1)) + return; + + if (block > sb_block_count(REISERFS_SB(s)->s_rs)) { + reiserfs_error(th->t_super, "bitmap-4072", + "Trying to free block outside file system " + "boundaries (%lu > %lu)", + block, sb_block_count(REISERFS_SB(s)->s_rs)); + return; + } + /* mark it before we clear it, just in case */ + journal_mark_freed(th, s, block); + _reiserfs_free_block(th, inode, block, for_unformatted); +} + +/* preallocated blocks don't need to be run through journal_mark_freed */ +static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block) +{ + BUG_ON(!th->t_trans_id); + RFALSE(!th->t_super, + "vs-4060: trying to free block on nonexistent device"); + if (!is_reusable(th->t_super, block, 1)) + return; + _reiserfs_free_block(th, inode, block, 1); +} + +static void __discard_prealloc(struct reiserfs_transaction_handle *th, + struct reiserfs_inode_info *ei) +{ + unsigned long save = ei->i_prealloc_block; + int dirty = 0; + struct inode *inode = &ei->vfs_inode; + BUG_ON(!th->t_trans_id); +#ifdef CONFIG_REISERFS_CHECK + if (ei->i_prealloc_count < 0) + reiserfs_error(th->t_super, "zam-4001", + "inode has negative prealloc blocks count."); +#endif + while (ei->i_prealloc_count > 
0) { + reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); + ei->i_prealloc_block++; + ei->i_prealloc_count--; + dirty = 1; + } + if (dirty) + reiserfs_update_sd(th, inode); + ei->i_prealloc_block = save; + list_del_init(&(ei->i_prealloc_list)); +} + +/* FIXME: It should be inline function */ +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + struct reiserfs_inode_info *ei = REISERFS_I(inode); + BUG_ON(!th->t_trans_id); + if (ei->i_prealloc_count) + __discard_prealloc(th, ei); +} + +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) +{ + struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; + + BUG_ON(!th->t_trans_id); + + while (!list_empty(plist)) { + struct reiserfs_inode_info *ei; + ei = list_entry(plist->next, struct reiserfs_inode_info, + i_prealloc_list); +#ifdef CONFIG_REISERFS_CHECK + if (!ei->i_prealloc_count) { + reiserfs_error(th->t_super, "zam-4001", + "inode is in prealloc list but has " + "no preallocated blocks."); + } +#endif + __discard_prealloc(th, ei); + } +} + +void reiserfs_init_alloc_options(struct super_block *s) +{ + set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); +} + +/* block allocator related options are parsed here */ +int reiserfs_parse_alloc_options(struct super_block *s, char *options) +{ + char *this_char, *value; + + REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */ + + while ((this_char = strsep(&options, ":")) != NULL) { + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, "concentrating_formatted_nodes")) { + int temp; + SET_OPTION(concentrating_formatted_nodes); + temp = (value + && *value) ? simple_strtoul(value, &value, + 0) : 10; + if (temp <= 0 || temp > 100) { + REISERFS_SB(s)->s_alloc_options.border = 10; + } else { + REISERFS_SB(s)->s_alloc_options.border = + 100 / temp; + } + continue; + } + if (!strcmp(this_char, "displacing_large_files")) { + SET_OPTION(displacing_large_files); + REISERFS_SB(s)->s_alloc_options.large_file_size = + (value + && *value) ? simple_strtoul(value, &value, 0) : 16; + continue; + } + if (!strcmp(this_char, "displacing_new_packing_localities")) { + SET_OPTION(displacing_new_packing_localities); + continue; + }; + + if (!strcmp(this_char, "old_hashed_relocation")) { + SET_OPTION(old_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "new_hashed_relocation")) { + SET_OPTION(new_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "dirid_groups")) { + SET_OPTION(dirid_groups); + continue; + } + if (!strcmp(this_char, "oid_groups")) { + SET_OPTION(oid_groups); + continue; + } + if (!strcmp(this_char, "packing_groups")) { + SET_OPTION(packing_groups); + continue; + } + if (!strcmp(this_char, "hashed_formatted_nodes")) { + SET_OPTION(hashed_formatted_nodes); + continue; + } + + if (!strcmp(this_char, "skip_busy")) { + SET_OPTION(skip_busy); + continue; + } + + if (!strcmp(this_char, "hundredth_slices")) { + SET_OPTION(hundredth_slices); + continue; + } + + if (!strcmp(this_char, "old_way")) { + SET_OPTION(old_way); + continue; + } + + if (!strcmp(this_char, "displace_based_on_dirid")) { + SET_OPTION(displace_based_on_dirid); + continue; + } + + if (!strcmp(this_char, "preallocmin")) { + REISERFS_SB(s)->s_alloc_options.preallocmin = + (value + && *value) ? 
simple_strtoul(value, &value, 0) : 4; + continue; + } + + if (!strcmp(this_char, "preallocsize")) { + REISERFS_SB(s)->s_alloc_options.preallocsize = + (value + && *value) ? simple_strtoul(value, &value, + 0) : + PREALLOCATION_SIZE; + continue; + } + + reiserfs_warning(s, "zam-4001", "unknown option - %s", + this_char); + return 1; + } + + reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); + return 0; +} + +static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) +{ + char *hash_in; + if (hint->formatted_node) { + hash_in = (char *)&hint->key.k_dir_id; + } else { + if (!hint->inode) { + //hint->search_start = hint->beg; + hash_in = (char *)&hint->key.k_dir_id; + } else + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = + (char *)(&INODE_PKEY(hint->inode)->k_objectid); + } + + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +/* + * Relocation based on dirid, hashing them into a given bitmap block + * files. Formatted nodes are unaffected, a separate policy covers them + */ +static void dirid_groups(reiserfs_blocknr_hint_t * hint) +{ + unsigned long hash; + __u32 dirid = 0; + int bm = 0; + struct super_block *sb = hint->th->t_super; + if (hint->inode) + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + else if (hint->formatted_node) + dirid = hint->key.k_dir_id; + + if (dirid) { + bm = bmap_hash_id(sb, dirid); + hash = bm * (sb->s_blocksize << 3); + /* give a portion of the block group to metadata */ + if (hint->inode) + hash += sb->s_blocksize / 2; + hint->search_start = hash; + } +} + +/* + * Relocation based on oid, hashing them into a given bitmap block + * files. Formatted nodes are unaffected, a separate policy covers them + */ +static void oid_groups(reiserfs_blocknr_hint_t * hint) +{ + if (hint->inode) { + unsigned long hash; + __u32 oid; + __u32 dirid; + int bm; + + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + + /* keep the root dir and it's first set of subdirs close to + * the start of the disk + */ + if (dirid <= 2) + hash = (hint->inode->i_sb->s_blocksize << 3); + else { + oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); + bm = bmap_hash_id(hint->inode->i_sb, oid); + hash = bm * (hint->inode->i_sb->s_blocksize << 3); + } + hint->search_start = hash; + } +} + +/* returns 1 if it finds an indirect item and gets valid hint info + * from it, otherwise 0 + */ +static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) +{ + struct treepath *path; + struct buffer_head *bh; + struct item_head *ih; + int pos_in_item; + __le32 *item; + int ret = 0; + + if (!hint->path) /* reiserfs code can call this function w/o pointer to path + * structure supplied; then we rely on supplied search_start */ + return 0; + + path = hint->path; + bh = get_last_bh(path); + RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor"); + ih = get_ih(path); + pos_in_item = path->pos_in_item; + item = get_item(path); + + hint->search_start = bh->b_blocknr; + + if (!hint->formatted_node && is_indirect_le_ih(ih)) { + /* for indirect item: go to left and look for the first non-hole entry + in the indirect item */ + if (pos_in_item == I_UNFM_NUM(ih)) + pos_in_item--; +// pos_in_item = I_UNFM_NUM (ih) - 1; + while (pos_in_item >= 0) { + int t = get_block_num(item, pos_in_item); + if (t) { + hint->search_start = t; + ret = 1; + break; + } + pos_in_item--; + } + } + + /* does result value fit into specified region? 
*/ + return ret; +} + +/* should be, if formatted node, then try to put on first part of the device + specified as number of percent with mount option device, else try to put + on last of device. This is not to say it is good code to do so, + but the effect should be measured. */ +static inline void set_border_in_hint(struct super_block *s, + reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border = + SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; + + if (hint->formatted_node) + hint->end = border - 1; + else + hint->beg = border; +} + +static inline void displace_large_file(reiserfs_blocknr_hint_t * hint) +{ + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), + 4) % (hint->end - hint->beg); + else + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), + 4) % (hint->end - hint->beg); +} + +static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint) +{ + char *hash_in; + + if (!hint->inode) + hash_in = (char *)&hint->key.k_dir_id; + else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); + + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +static inline int +this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t * + hint) +{ + return hint->block == + REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; +} + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES +static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint) +{ + struct in_core_key *key = &hint->key; + + hint->th->displace_new_blocks = 0; + hint->search_start = + hint->beg + keyed_hash((char *)(&key->k_objectid), + 4) % (hint->end - hint->beg); +} +#endif + +static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + u32 hash_in; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); + border = + hint->beg + (u32) keyed_hash(((char *)(&hash_in)), + 4) % (hint->end - hint->beg - 1); + if (border > hint->search_start) + hint->search_start = border; + + return 1; +} + +static inline int old_way(reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + border = + hint->beg + + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - + hint->beg); + if (border > hint->search_start) + hint->search_start = border; + + return 1; +} + +static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint) +{ + struct in_core_key *key = &hint->key; + b_blocknr_t slice_start; + + slice_start = + (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100); + if (slice_start > hint->search_start + || slice_start + (hint->end / 100) <= hint->search_start) { + hint->search_start = slice_start; + } +} + +static void determine_search_start(reiserfs_blocknr_hint_t * hint, + int amount_needed) +{ + struct super_block *s = hint->th->t_super; + int unfm_hint; + + hint->beg = 0; + hint->end = SB_BLOCK_COUNT(s) - 1; + + /* This is former border algorithm. Now with tunable border offset */ + if (concentrating_formatted_nodes(s)) + set_border_in_hint(s, hint); + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* whenever we create a new directory, we displace it. 
At first we will + hash for location, later we might look for a moderately empty place for + it */ + if (displacing_new_packing_localities(s) + && hint->th->displace_new_blocks) { + displace_new_packing_locality(hint); + + /* we do not continue determine_search_start, + * if new packing locality is being displaced */ + return; + } +#endif + + /* all persons should feel encouraged to add more special cases here and + * test them */ + + if (displacing_large_files(s) && !hint->formatted_node + && this_blocknr_allocation_would_make_it_a_large_file(hint)) { + displace_large_file(hint); + return; + } + + /* if none of our special cases is relevant, use the left neighbor in the + tree order of the new node we are allocating for */ + if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) { + hash_formatted_node(hint); + return; + } + + unfm_hint = get_left_neighbor(hint); + + /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation, + new blocks are displaced based on directory ID. Also, if suggested search_start + is less than last preallocated block, we start searching from it, assuming that + HDD dataflow is faster in forward direction */ + if (TEST_OPTION(old_way, s)) { + if (!hint->formatted_node) { + if (!reiserfs_hashed_relocation(s)) + old_way(hint); + else if (!reiserfs_no_unhashed_relocation(s)) + old_hashed_relocation(hint); + + if (hint->inode + && hint->search_start < + REISERFS_I(hint->inode)->i_prealloc_block) + hint->search_start = + REISERFS_I(hint->inode)->i_prealloc_block; + } + return; + } + + /* This is an approach proposed by Hans */ + if (TEST_OPTION(hundredth_slices, s) + && !(displacing_large_files(s) && !hint->formatted_node)) { + hundredth_slices(hint); + return; + } + + /* old_hashed_relocation only works on unformatted */ + if (!unfm_hint && !hint->formatted_node && + TEST_OPTION(old_hashed_relocation, s)) { + old_hashed_relocation(hint); + } + /* new_hashed_relocation works with both formatted/unformatted nodes */ + if ((!unfm_hint || hint->formatted_node) && + TEST_OPTION(new_hashed_relocation, s)) { + new_hashed_relocation(hint); + } + /* dirid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } +#endif + + /* oid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) { + oid_groups(hint); + } + return; +} + +static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) +{ + /* make minimum size a mount option and benchmark both ways */ + /* we preallocate blocks only for regular files, specific size */ + /* benchmark preallocating always and see what happens */ + + hint->prealloc_size = 0; + + if (!hint->formatted_node && hint->preallocate) { + if (S_ISREG(hint->inode->i_mode) + && hint->inode->i_size >= + REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocmin * hint->inode->i_sb->s_blocksize) + hint->prealloc_size = + REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocsize - 1; + } + return CARRY_ON; +} + +/* XXX I know it could be merged with upper-level function; + but may be result function would be too complex. 
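+ (For reference: the function repeatedly scans the bitmap between start and finish, filling new_blocknrs[] with up to amount_needed consecutive block numbers per scan; if the final scan yields more blocks than are still needed, the surplus is recorded as the inode's preallocation window instead of being returned.)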
*/ +static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + b_blocknr_t start, + b_blocknr_t finish, int min, + int amount_needed, + int prealloc_size) +{ + int rest = amount_needed; + int nr_allocated; + + while (rest > 0 && start <= finish) { + nr_allocated = scan_bitmap(hint->th, &start, finish, min, + rest + prealloc_size, + !hint->formatted_node, hint->block); + + if (nr_allocated == 0) /* no new blocks allocated, return */ + break; + + /* fill free_blocknrs array first */ + while (rest > 0 && nr_allocated > 0) { + *new_blocknrs++ = start++; + rest--; + nr_allocated--; + } + + /* do we have something to fill prealloc. array also ? */ + if (nr_allocated > 0) { + /* it means prealloc_size was greater that 0 and we do preallocation */ + list_add(&REISERFS_I(hint->inode)->i_prealloc_list, + &SB_JOURNAL(hint->th->t_super)-> + j_prealloc_list); + REISERFS_I(hint->inode)->i_prealloc_block = start; + REISERFS_I(hint->inode)->i_prealloc_count = + nr_allocated; + break; + } + } + + return (amount_needed - rest); +} + +static inline int blocknrs_and_prealloc_arrays_from_search_start + (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, + int amount_needed) { + struct super_block *s = hint->th->t_super; + b_blocknr_t start = hint->search_start; + b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; + int passno = 0; + int nr_allocated = 0; + + determine_prealloc_size(hint); + if (!hint->formatted_node) { + int quota_ret; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating %d blocks id=%u", + amount_needed, hint->inode->i_uid); +#endif + quota_ret = + dquot_alloc_block_nodirty(hint->inode, amount_needed); + if (quota_ret) /* Quota exceeded? */ + return QUOTA_EXCEEDED; + if (hint->preallocate && hint->prealloc_size) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating (prealloc) %d blocks id=%u", + hint->prealloc_size, hint->inode->i_uid); +#endif + quota_ret = dquot_prealloc_block_nodirty(hint->inode, + hint->prealloc_size); + if (quota_ret) + hint->preallocate = hint->prealloc_size = 0; + } + /* for unformatted nodes, force large allocations */ + } + + do { + switch (passno++) { + case 0: /* Search from hint->search_start to end of disk */ + start = hint->search_start; + finish = SB_BLOCK_COUNT(s) - 1; + break; + case 1: /* Search from hint->beg to hint->search_start */ + start = hint->beg; + finish = hint->search_start; + break; + case 2: /* Last chance: Search from 0 to hint->beg */ + start = 0; + finish = hint->beg; + break; + default: /* We've tried searching everywhere, not enough space */ + /* Free the blocks */ + if (!hint->formatted_node) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (nospace) %d blocks id=%u", + amount_needed + + hint->prealloc_size - + nr_allocated, + hint->inode->i_uid); +#endif + /* Free not allocated blocks */ + dquot_free_block_nodirty(hint->inode, + amount_needed + hint->prealloc_size - + nr_allocated); + } + while (nr_allocated--) + reiserfs_free_block(hint->th, hint->inode, + new_blocknrs[nr_allocated], + !hint->formatted_node); + + return NO_DISK_SPACE; + } + } while ((nr_allocated += allocate_without_wrapping_disk(hint, + new_blocknrs + + nr_allocated, + start, finish, + 1, + amount_needed - + nr_allocated, + hint-> + prealloc_size)) + < amount_needed); + if (!hint->formatted_node && + amount_needed + hint->prealloc_size > + nr_allocated + 
REISERFS_I(hint->inode)->i_prealloc_count) { + /* Some of preallocation blocks were not allocated */ +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (failed prealloc) %d blocks id=%u", + amount_needed + hint->prealloc_size - + nr_allocated - + REISERFS_I(hint->inode)->i_prealloc_count, + hint->inode->i_uid); +#endif + dquot_free_block_nodirty(hint->inode, amount_needed + + hint->prealloc_size - nr_allocated - + REISERFS_I(hint->inode)-> + i_prealloc_count); + } + + return CARRY_ON; +} + +/* grab new blocknrs from preallocated list */ +/* return amount still needed after using them */ +static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + int amount_needed) +{ + struct inode *inode = hint->inode; + + if (REISERFS_I(inode)->i_prealloc_count > 0) { + while (amount_needed) { + + *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++; + REISERFS_I(inode)->i_prealloc_count--; + + amount_needed--; + + if (REISERFS_I(inode)->i_prealloc_count <= 0) { + list_del(&REISERFS_I(inode)->i_prealloc_list); + break; + } + } + } + /* return amount still needed after using preallocated blocks */ + return amount_needed; +} + +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed, int reserved_by_us /* Amount of blocks we have + already reserved */ ) +{ + int initial_amount_needed = amount_needed; + int ret; + struct super_block *s = hint->th->t_super; + + /* Check if there is enough space, taking into account reserved space */ + if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < + amount_needed - reserved_by_us) + return NO_DISK_SPACE; + /* should this be if !hint->inode && hint->preallocate? */ + /* do you mean hint->formatted_node can be removed ? - Zam */ + /* hint->formatted_node cannot be removed because we try to access + inode information here, and there is often no inode assotiated with + metadata allocations - green */ + + if (!hint->formatted_node && hint->preallocate) { + amount_needed = use_preallocated_list_if_available + (hint, new_blocknrs, amount_needed); + if (amount_needed == 0) /* all blocknrs we need we got from + prealloc. list */ + return CARRY_ON; + new_blocknrs += (initial_amount_needed - amount_needed); + } + + /* find search start and save it in hint structure */ + determine_search_start(hint, amount_needed); + if (hint->search_start >= SB_BLOCK_COUNT(s)) + hint->search_start = SB_BLOCK_COUNT(s) - 1; + + /* allocation itself; fill new_blocknrs and preallocation arrays */ + ret = blocknrs_and_prealloc_arrays_from_search_start + (hint, new_blocknrs, amount_needed); + + /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we + * need to return blocks back to prealloc. list or just free them. 
-- Zam (I chose second + * variant) */ + + if (ret != CARRY_ON) { + while (amount_needed++ < initial_amount_needed) { + reiserfs_free_block(hint->th, hint->inode, + *(--new_blocknrs), 1); + } + } + return ret; +} + +void reiserfs_cache_bitmap_metadata(struct super_block *sb, + struct buffer_head *bh, + struct reiserfs_bitmap_info *info) +{ + unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size); + + /* The first bit must ALWAYS be 1 */ + if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data)) + reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is " + "corrupted: first bit must be 1", bh->b_blocknr); + + info->free_count = 0; + + while (--cur >= (unsigned long *)bh->b_data) { + int i; + + /* 0 and ~0 are special, we can optimize for them */ + if (*cur == 0) + info->free_count += BITS_PER_LONG; + else if (*cur != ~0L) /* A mix, investigate */ + for (i = BITS_PER_LONG - 1; i >= 0; i--) + if (!reiserfs_test_le_bit(i, cur)) + info->free_count++; + } +} + +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, + unsigned int bitmap) +{ + b_blocknr_t block = (sb->s_blocksize << 3) * bitmap; + struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap; + struct buffer_head *bh; + + /* Way old format filesystems had the bitmaps packed up front. + * I doubt there are any of these left, but just in case... */ + if (unlikely(test_bit(REISERFS_OLD_FORMAT, + &(REISERFS_SB(sb)->s_properties)))) + block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap; + else if (bitmap == 0) + block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; + + reiserfs_write_unlock(sb); + bh = sb_bread(sb, block); + reiserfs_write_lock(sb); + if (bh == NULL) + reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " + "reading failed", __func__, block); + else { + if (buffer_locked(bh)) { + PROC_INFO_INC(sb, scan_bitmap.wait); + reiserfs_write_unlock(sb); + __wait_on_buffer(bh); + reiserfs_write_lock(sb); + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(atomic_read(&bh->b_count) == 0); + + if (info->free_count == UINT_MAX) + reiserfs_cache_bitmap_metadata(sb, bh, info); + } + + return bh; +} + +int reiserfs_init_bitmap_cache(struct super_block *sb) +{ + struct reiserfs_bitmap_info *bitmap; + unsigned int bmap_nr = reiserfs_bmap_count(sb); + + /* Avoid lock recursion in fault case */ + reiserfs_write_unlock(sb); + bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); + reiserfs_write_lock(sb); + if (bitmap == NULL) + return -ENOMEM; + + memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr); + + SB_AP_BITMAP(sb) = bitmap; + + return 0; +} + +void reiserfs_free_bitmap_cache(struct super_block *sb) +{ + if (SB_AP_BITMAP(sb)) { + vfree(SB_AP_BITMAP(sb)); + SB_AP_BITMAP(sb) = NULL; + } +} diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c new file mode 100644 index 00000000..198dabf1 --- /dev/null +++ b/fs/reiserfs/dir.c @@ -0,0 +1,310 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +extern const struct reiserfs_key MIN_KEY; + +static int reiserfs_readdir(struct file *, void *, filldir_t); +static int reiserfs_dir_fsync(struct file *filp, int datasync); + +const struct file_operations reiserfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = reiserfs_readdir, + .fsync = reiserfs_dir_fsync, + .unlocked_ioctl = reiserfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = reiserfs_compat_ioctl, +#endif +}; + +static int reiserfs_dir_fsync(struct file 
*filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + if (err < 0) + return err; + return 0; +} + +#define store_ih(where,what) copy_item_head (where, what) + +static inline bool is_privroot_deh(struct dentry *dir, + struct reiserfs_de_head *deh) +{ + struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; + return (dir == dir->d_parent && privroot->d_inode && + deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); +} + +int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, + filldir_t filldir, loff_t *pos) +{ + struct inode *inode = dentry->d_inode; + struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + INITIALIZE_PATH(path_to_entry); + struct buffer_head *bh; + int item_num, entry_num; + const struct reiserfs_key *rkey; + struct item_head *ih, tmp_ih; + int search_res; + char *local_buf; + loff_t next_pos; + char small_buf[32]; /* avoid kmalloc if we can */ + struct reiserfs_dir_entry de; + int ret = 0; + + reiserfs_write_lock(inode->i_sb); + + reiserfs_check_lock_depth(inode->i_sb, "readdir"); + + /* form key for search the next directory entry using f_pos field of + file structure */ + make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); + next_pos = cpu_key_k_offset(&pos_key); + + path_to_entry.reada = PATH_READA; + while (1) { + research: + /* search the directory item, containing entry with specified key */ + search_res = + search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, + &de); + if (search_res == IO_ERROR) { + // FIXME: we could just skip part of directory which could + // not be read + ret = -EIO; + goto out; + } + entry_num = de.de_entry_num; + bh = de.de_bh; + item_num = de.de_item_num; + ih = de.de_ih; + store_ih(&tmp_ih, ih); + + /* we must have found item, that is item of this directory, */ + RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); + RFALSE(item_num > B_NR_ITEMS(bh) - 1, + "vs-9005 item_num == %d, item amount == %d", + item_num, B_NR_ITEMS(bh)); + + /* and entry must be not more than number of entries in the item */ + RFALSE(I_ENTRY_COUNT(ih) < entry_num, + "vs-9010: entry number is too big %d (%d)", + entry_num, I_ENTRY_COUNT(ih)); + + if (search_res == POSITION_FOUND + || entry_num < I_ENTRY_COUNT(ih)) { + /* go through all entries in the directory item beginning from the entry, that has been found */ + struct reiserfs_de_head *deh = + B_I_DEH(bh, ih) + entry_num; + + for (; entry_num < I_ENTRY_COUNT(ih); + entry_num++, deh++) { + int d_reclen; + char *d_name; + off_t d_off; + ino_t d_ino; + + if (!de_visible(deh)) + /* it is hidden entry */ + continue; + d_reclen = entry_length(bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); + + if (d_reclen <= 0 || + d_name + d_reclen > bh->b_data + bh->b_size) { + /* There is corrupted data in entry, + * We'd better stop here */ + pathrelse(&path_to_entry); + ret = -EIO; + goto out; + } + + if (!d_name[d_reclen - 1]) + d_reclen = strlen(d_name); + + if (d_reclen > + REISERFS_MAX_NAME(inode->i_sb-> + s_blocksize)) { + /* too big to send back to VFS */ + continue; + } + + /* Ignore the .reiserfs_priv entry */ + if (is_privroot_deh(dentry, deh)) + continue; + + d_off = deh_offset(deh); + *pos = d_off; + d_ino = deh_objectid(deh); + if (d_reclen <= 32) { + local_buf = small_buf; + } else { + 
local_buf = kmalloc(d_reclen, + GFP_NOFS); + if (!local_buf) { + pathrelse(&path_to_entry); + ret = -ENOMEM; + goto out; + } + if (item_moved(&tmp_ih, &path_to_entry)) { + kfree(local_buf); + goto research; + } + } + // Note, that we copy name to user space via temporary + // buffer (local_buf) because filldir will block if + // user space buffer is swapped out. At that time + // entry can move to somewhere else + memcpy(local_buf, d_name, d_reclen); + + /* + * Since filldir might sleep, we can release + * the write lock here for other waiters + */ + reiserfs_write_unlock(inode->i_sb); + if (filldir + (dirent, local_buf, d_reclen, d_off, d_ino, + DT_UNKNOWN) < 0) { + reiserfs_write_lock(inode->i_sb); + if (local_buf != small_buf) { + kfree(local_buf); + } + goto end; + } + reiserfs_write_lock(inode->i_sb); + if (local_buf != small_buf) { + kfree(local_buf); + } + // next entry should be looked for with such offset + next_pos = deh_offset(deh) + 1; + + if (item_moved(&tmp_ih, &path_to_entry)) { + goto research; + } + } /* for */ + } + + if (item_num != B_NR_ITEMS(bh) - 1) + // end of directory has been reached + goto end; + + /* item we went through is last item of node. Using right + delimiting key check is it directory end */ + rkey = get_rkey(&path_to_entry, inode->i_sb); + if (!comp_le_keys(rkey, &MIN_KEY)) { + /* set pos_key to key, that is the smallest and greater + that key of the last entry in the item */ + set_cpu_key_k_offset(&pos_key, next_pos); + continue; + } + + if (COMP_SHORT_KEYS(rkey, &pos_key)) { + // end of directory has been reached + goto end; + } + + /* directory continues in the right neighboring block */ + set_cpu_key_k_offset(&pos_key, + le_key_k_offset(KEY_FORMAT_3_5, rkey)); + + } /* while */ + +end: + *pos = next_pos; + pathrelse(&path_to_entry); + reiserfs_check_path(&path_to_entry); +out: + reiserfs_write_unlock(inode->i_sb); + return ret; +} + +static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = file->f_path.dentry; + return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos); +} + +/* compose directory item containing "." and ".." entries (entries are + not aligned to 4 byte boundary) */ +/* the last four params are LE */ +void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) +{ + struct reiserfs_de_head *deh; + + memset(body, 0, EMPTY_DIR_SIZE_V1); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + put_deh_offset(&(deh[0]), DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + deh[0].deh_dir_id = dirid; + deh[0].deh_objectid = objid; + deh[0].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[0]), EMPTY_DIR_SIZE_V1 - strlen(".")); + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + deh[1].deh_dir_id = par_dirid; + deh[1].deh_objectid = par_objid; + deh[1].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[1]), deh_location(&(deh[0])) - strlen("..")); + mark_de_visible(&(deh[1])); + + /* copy ".." and "." */ + memcpy(body + deh_location(&(deh[0])), ".", 1); + memcpy(body + deh_location(&(deh[1])), "..", 2); +} + +/* compose directory item containing "." and ".." 
entries */ +void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) +{ + struct reiserfs_de_head *deh; + + memset(body, 0, EMPTY_DIR_SIZE); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + put_deh_offset(&(deh[0]), DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + deh[0].deh_dir_id = dirid; + deh[0].deh_objectid = objid; + deh[0].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[0]), EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + deh[1].deh_dir_id = par_dirid; + deh[1].deh_objectid = par_objid; + deh[1].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[1]), + deh_location(&(deh[0])) - ROUND_UP(strlen(".."))); + mark_de_visible(&(deh[1])); + + /* copy ".." and "." */ + memcpy(body + deh_location(&(deh[0])), ".", 1); + memcpy(body + deh_location(&(deh[1])), "..", 2); +} diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c new file mode 100644 index 00000000..60c08044 --- /dev/null +++ b/fs/reiserfs/do_balan.c @@ -0,0 +1,2074 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* Now we have all buffers that must be used in balancing of the tree */ +/* Further calculations can not cause schedule(), and thus the buffer */ +/* tree will be stable until the balancing will be finished */ +/* balance the tree according to the analysis made before, */ +/* and using buffers obtained after all above. */ + +/** + ** balance_leaf_when_delete + ** balance_leaf + ** do_balance + ** + **/ + +#include +#include +#include +#include +#include + +static inline void buffer_info_init_left(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = tb->L[0]; + bi->bi_parent = tb->FL[0]; + bi->bi_position = get_left_neighbor_position(tb, 0); +} + +static inline void buffer_info_init_right(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = tb->R[0]; + bi->bi_parent = tb->FR[0]; + bi->bi_position = get_right_neighbor_position(tb, 0); +} + +static inline void buffer_info_init_tbS0(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + bi->bi_position = PATH_H_POSITION(tb->tb_path, 1); +} + +static inline void buffer_info_init_bh(struct tree_balance *tb, + struct buffer_info *bi, + struct buffer_head *bh) +{ + bi->tb = tb; + bi->bi_bh = bh; + bi->bi_parent = NULL; + bi->bi_position = 0; +} + +inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, + struct buffer_head *bh, int flag) +{ + journal_mark_dirty(tb->transaction_handle, + tb->transaction_handle->t_super, bh); +} + +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + +/* summary: + if deleting something ( tb->insert_size[0] < 0 ) + return(balance_leaf_when_delete()); (flag d handled here) + else + if lnum is larger than 0 we put items into the left node + if rnum is larger than 0 we put items into the right node + if snum1 is larger than 0 we put items into the new node s1 + if snum2 is larger than 0 we put items into the new node s2 +Note that all *num* count new items being created. 
+ +It would be easier to read balance_leaf() if each of these summary +lines was a separate procedure rather than being inlined. I think +that there are many passages here and in balance_leaf_when_delete() in +which two calls to one procedure can replace two passages, and it +might save cache space and improve software maintenance costs to do so. + +Vladimir made the perceptive comment that we should offload most of +the decision making in this function into fix_nodes/check_balance, and +then create some sort of structure in tb that says what actions should +be performed by do_balance. + +-Hans */ + +/* Balance leaf node in case of delete or cut: insert_size[0] < 0 + * + * lnum, rnum can have values >= -1 + * -1 means that the neighbor must be joined with S + * 0 means that nothing should be done with the neighbor + * >0 means to shift entirely or partly the specified number of items to the neighbor + */ +static int balance_leaf_when_delete(struct tree_balance *tb, int flag) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + int pos_in_item = tb->tb_path->pos_in_item; + struct buffer_info bi; + int n; + struct item_head *ih; + + RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, + "vs- 12000: level: wrong FR %z", tb->FR[0]); + RFALSE(tb->blknum[0] > 1, + "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]); + RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), + "PAP-12010: tree can not be empty"); + + ih = B_N_PITEM_HEAD(tbS0, item_pos); + buffer_info_init_tbS0(tb, &bi); + + /* Delete or truncate the item */ + + switch (flag) { + case M_DELETE: /* delete item in S[0] */ + + RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], + "vs-12013: mode Delete, insert size %d, ih to be deleted %h", + -tb->insert_size[0], ih); + + leaf_delete_items(&bi, 0, item_pos, 1, -1); + + if (!item_pos && tb->CFL[0]) { + if (B_NR_ITEMS(tbS0)) { + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, + 0); + } else { + if (!PATH_H_POSITION(tb->tb_path, 1)) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, + 0), 0); + } + } + + RFALSE(!item_pos && !tb->CFL[0], + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], + tb->L[0]); + + break; + + case M_CUT:{ /* cut item in S[0] */ + if (is_direntry_le_ih(ih)) { + + /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ + /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */ + tb->insert_size[0] = -1; + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], + "PAP-12030: can not change delimiting key. CFL[0]=%p", + tb->CFL[0]); + + if (!item_pos && !pos_in_item && tb->CFL[0]) { + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); + } + } else { + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!ih_item_len(ih), + "PAP-12035: cut must leave non-zero dynamic length of item"); + } + break; + } + + default: + print_cur_tb("12040"); + reiserfs_panic(tb->tb_sb, "PAP-12040", + "unexpected mode: %s(%d)", + (flag == + M_PASTE) ? "PASTE" : ((flag == + M_INSERT) ? 
"INSERT" : + "UNKNOWN"), flag); + } + + /* the rule is that no shifting occurs unless by shifting a node can be freed */ + n = B_NR_ITEMS(tbS0); + if (tb->lnum[0]) { /* L[0] takes part in balancing */ + if (tb->lnum[0] == -1) { /* L[0] must be joined with S[0] */ + if (tb->rnum[0] == -1) { /* R[0] must be also joined with S[0] */ + if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { + /* all contents of all the 3 buffers will be in L[0] */ + if (PATH_H_POSITION(tb->tb_path, 1) == 0 + && 1 < B_NR_ITEMS(tb->FR[0])) + replace_key(tb, tb->CFL[0], + tb->lkey[0], + tb->FR[0], 1); + + leaf_move_items(LEAF_FROM_S_TO_L, tb, n, + -1, NULL); + leaf_move_items(LEAF_FROM_R_TO_L, tb, + B_NR_ITEMS(tb->R[0]), + -1, NULL); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, + tb->R[0]); + + return 0; + } + /* all contents of all the 3 buffers will be in R[0] */ + leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, + NULL); + leaf_move_items(LEAF_FROM_L_TO_R, tb, + B_NR_ITEMS(tb->L[0]), -1, NULL); + + /* right_delimiting_key is correct in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], + tb->R[0], 0); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->L[0]); + + return -1; + } + + RFALSE(tb->rnum[0] != 0, + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); + /* all contents of L[0] and S[0] will be in L[0] */ + leaf_shift_left(tb, n, -1); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; + } + /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */ + + RFALSE((tb->lnum[0] + tb->rnum[0] < n) || + (tb->lnum[0] + tb->rnum[0] > n + 1), + "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent", + tb->rnum[0], tb->lnum[0], n); + RFALSE((tb->lnum[0] + tb->rnum[0] == n) && + (tb->lbytes != -1 || tb->rbytes != -1), + "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split", + tb->rbytes, tb->lbytes); + RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && + (tb->lbytes < 1 || tb->rbytes != -1), + "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split", + tb->rbytes, tb->lbytes); + + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; + } + + if (tb->rnum[0] == -1) { + /* all contents of R[0] and S[0] will be in R[0] */ + leaf_shift_right(tb, n, -1); + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } + + RFALSE(tb->rnum[0], + "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); + return 0; +} + +static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item header of inserted item (this is on little endian) */ + const char *body, /* body of inserted item or bytes to paste */ + int flag, /* i - insert, d - delete, c - cut, p - paste + (see comment to do_balance) */ + struct item_head *insert_key, /* in our processing of one level we sometimes determine what + must be inserted into the next higher level. 
This insertion + consists of a key or two keys and their corresponding + pointers */ + struct buffer_head **insert_ptr /* inserted node-ptrs for the next level */ + ) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0] + of the affected item */ + struct buffer_info bi; + struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ + int snum[2]; /* number of items that will be placed + into S_new (includes partially shifted + items) */ + int sbytes[2]; /* if an item is partially shifted into S_new then + if it is a directory item + it is the number of entries from the item that are shifted into S_new + else + it is the number of bytes from the item that are shifted into S_new + */ + int n, i; + int ret_val; + int pos_in_item; + int zeros_num; + + PROC_INFO_INC(tb->tb_sb, balance_at[0]); + + /* Make balance in case insert_size[0] < 0 */ + if (tb->insert_size[0] < 0) + return balance_leaf_when_delete(tb, flag); + + zeros_num = 0; + if (flag == M_INSERT && !body) + zeros_num = ih_item_len(ih); + + pos_in_item = tb->tb_path->pos_in_item; + /* for indirect item pos_in_item is measured in unformatted node + pointers. Recalculate to bytes */ + if (flag != M_INSERT + && is_indirect_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) + pos_in_item *= UNFM_P_SIZE; + + if (tb->lnum[0] > 0) { + /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ + if (item_pos < tb->lnum[0]) { + /* new item or it part falls to L[0], shift it too */ + n = B_NR_ITEMS(tb->L[0]); + + switch (flag) { + case M_INSERT: /* insert item into L[0] */ + + if (item_pos == tb->lnum[0] - 1 + && tb->lbytes != -1) { + /* part of new item falls into L[0] */ + int new_item_len; + int version; + + ret_val = + leaf_shift_left(tb, tb->lnum[0] - 1, + -1); + + /* Calculate item length to insert to S[0] */ + new_item_len = + ih_item_len(ih) - tb->lbytes; + /* Calculate and check item length to insert to L[0] */ + put_ih_item_len(ih, + ih_item_len(ih) - + new_item_len); + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d", + ih_item_len(ih)); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, + n + item_pos - + ret_val, ih, body, + zeros_num > + ih_item_len(ih) ? + ih_item_len(ih) : + zeros_num); + + version = ih_version(ih); + + /* Calculate key component, item length and body to insert into S[0] */ + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + (tb-> + lbytes << + (is_indirect_le_ih + (ih) ? 
tb->tb_sb-> + s_blocksize_bits - + UNFM_P_SHIFT : + 0))); + + put_ih_item_len(ih, new_item_len); + if (tb->lbytes > zeros_num) { + body += + (tb->lbytes - zeros_num); + zeros_num = 0; + } else + zeros_num -= tb->lbytes; + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d", + ih_item_len(ih)); + } else { + /* new item in whole falls into L[0] */ + /* Shift lnum[0]-1 items to L[0] */ + ret_val = + leaf_shift_left(tb, tb->lnum[0] - 1, + tb->lbytes); + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, + n + item_pos - + ret_val, ih, body, + zeros_num); + tb->insert_size[0] = 0; + zeros_num = 0; + } + break; + + case M_PASTE: /* append item in L[0] */ + + if (item_pos == tb->lnum[0] - 1 + && tb->lbytes != -1) { + /* we must shift the part of the appended item */ + if (is_direntry_le_ih + (B_N_PITEM_HEAD(tbS0, item_pos))) { + + RFALSE(zeros_num, + "PAP-12090: invalid parameter in case of a directory"); + /* directory item */ + if (tb->lbytes > pos_in_item) { + /* new directory entry falls into L[0] */ + struct item_head + *pasted; + int l_pos_in_item = + pos_in_item; + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */ + ret_val = + leaf_shift_left(tb, + tb-> + lnum + [0], + tb-> + lbytes + - + 1); + if (ret_val + && !item_pos) { + pasted = + B_N_PITEM_HEAD + (tb->L[0], + B_NR_ITEMS + (tb-> + L[0]) - + 1); + l_pos_in_item += + I_ENTRY_COUNT + (pasted) - + (tb-> + lbytes - + 1); + } + + /* Append given directory entry to directory item */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer + (&bi, + n + item_pos - + ret_val, + l_pos_in_item, + tb->insert_size[0], + body, zeros_num); + + /* previous string prepared space for pasting new entry, following string pastes this entry */ + + /* when we have merge directory item, pos_in_item has been changed too */ + + /* paste new directory entry. 1 is entry number */ + leaf_paste_entries(&bi, + n + + item_pos + - + ret_val, + l_pos_in_item, + 1, + (struct + reiserfs_de_head + *) + body, + body + + + DEH_SIZE, + tb-> + insert_size + [0] + ); + tb->insert_size[0] = 0; + } else { + /* new directory item doesn't fall into L[0] */ + /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */ + leaf_shift_left(tb, + tb-> + lnum[0], + tb-> + lbytes); + } + /* Calculate new position to append in item body */ + pos_in_item -= tb->lbytes; + } else { + /* regular object */ + RFALSE(tb->lbytes <= 0, + "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", + tb->lbytes); + RFALSE(pos_in_item != + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)), + "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d", + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)), + pos_in_item); + + if (tb->lbytes >= pos_in_item) { + /* appended item will be in L[0] in whole */ + int l_n; + + /* this bytes number must be appended to the last item of L[h] */ + l_n = + tb->lbytes - + pos_in_item; + + /* Calculate new insert_size[0] */ + tb->insert_size[0] -= + l_n; + + RFALSE(tb-> + insert_size[0] <= + 0, + "PAP-12105: there is nothing to paste into L[0]. 
insert_size=%d", + tb-> + insert_size[0]); + ret_val = + leaf_shift_left(tb, + tb-> + lnum + [0], + ih_item_len + (B_N_PITEM_HEAD + (tbS0, + item_pos))); + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer + (&bi, + n + item_pos - + ret_val, + ih_item_len + (B_N_PITEM_HEAD + (tb->L[0], + n + item_pos - + ret_val)), l_n, + body, + zeros_num > + l_n ? l_n : + zeros_num); + /* 0-th item in S0 can be only of DIRECT type when l_n != 0 */ + { + int version; + int temp_l = + l_n; + + RFALSE + (ih_item_len + (B_N_PITEM_HEAD + (tbS0, + 0)), + "PAP-12106: item length must be 0"); + RFALSE + (comp_short_le_keys + (B_N_PKEY + (tbS0, 0), + B_N_PKEY + (tb->L[0], + n + + item_pos + - + ret_val)), + "PAP-12107: items must be of the same file"); + if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) { + temp_l = + l_n + << + (tb-> + tb_sb-> + s_blocksize_bits + - + UNFM_P_SHIFT); + } + /* update key of first item in S0 */ + version = + ih_version + (B_N_PITEM_HEAD + (tbS0, 0)); + set_le_key_k_offset + (version, + B_N_PKEY + (tbS0, 0), + le_key_k_offset + (version, + B_N_PKEY + (tbS0, + 0)) + + temp_l); + /* update left delimiting key */ + set_le_key_k_offset + (version, + B_N_PDELIM_KEY + (tb-> + CFL[0], + tb-> + lkey[0]), + le_key_k_offset + (version, + B_N_PDELIM_KEY + (tb-> + CFL[0], + tb-> + lkey[0])) + + temp_l); + } + + /* Calculate new body, position in item and insert_size[0] */ + if (l_n > zeros_num) { + body += + (l_n - + zeros_num); + zeros_num = 0; + } else + zeros_num -= + l_n; + pos_in_item = 0; + + RFALSE + (comp_short_le_keys + (B_N_PKEY(tbS0, 0), + B_N_PKEY(tb->L[0], + B_NR_ITEMS + (tb-> + L[0]) - + 1)) + || + !op_is_left_mergeable + (B_N_PKEY(tbS0, 0), + tbS0->b_size) + || + !op_is_left_mergeable + (B_N_PDELIM_KEY + (tb->CFL[0], + tb->lkey[0]), + tbS0->b_size), + "PAP-12120: item must be merge-able with left neighboring item"); + } else { /* only part of the appended item will be in L[0] */ + + /* Calculate position in item for append in S[0] */ + pos_in_item -= + tb->lbytes; + + RFALSE(pos_in_item <= 0, + "PAP-12125: no place for paste. pos_in_item=%d", + pos_in_item); + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ + leaf_shift_left(tb, + tb-> + lnum[0], + tb-> + lbytes); + } + } + } else { /* appended item will be in L[0] in whole */ + + struct item_head *pasted; + + if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) { /* if we paste into first item of S[0] and it is left mergable */ + /* then increment pos_in_item by the size of the last item in L[0] */ + pasted = + B_N_PITEM_HEAD(tb->L[0], + n - 1); + if (is_direntry_le_ih(pasted)) + pos_in_item += + ih_entry_count + (pasted); + else + pos_in_item += + ih_item_len(pasted); + } + + /* Shift lnum[0] - 1 items in whole. 
Shift lbytes - 1 byte from item number lnum[0] */ + ret_val = + leaf_shift_left(tb, tb->lnum[0], + tb->lbytes); + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, + n + item_pos - + ret_val, + pos_in_item, + tb->insert_size[0], + body, zeros_num); + + /* if appended item is directory, paste entry */ + pasted = + B_N_PITEM_HEAD(tb->L[0], + n + item_pos - + ret_val); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, + n + + item_pos - + ret_val, + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + /* if appended item is indirect item, put unformatted node into un list */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + tb->insert_size[0] = 0; + zeros_num = 0; + } + break; + default: /* cases d and t */ + reiserfs_panic(tb->tb_sb, "PAP-12130", + "lnum > 0: unexpected mode: " + " %s(%d)", + (flag == + M_DELETE) ? "DELETE" : ((flag == + M_CUT) + ? "CUT" + : + "UNKNOWN"), + flag); + } + } else { + /* new item doesn't fall into L[0] */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } + } + + /* tb->lnum[0] > 0 */ + /* Calculate new item position */ + item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0)); + + if (tb->rnum[0] > 0) { + /* shift rnum[0] items from S[0] to the right neighbor R[0] */ + n = B_NR_ITEMS(tbS0); + switch (flag) { + + case M_INSERT: /* insert item */ + if (n - tb->rnum[0] < item_pos) { /* new item or its part falls to R[0] */ + if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { /* part of new item falls into R[0] */ + loff_t old_key_comp, old_len, + r_zeros_number; + const char *r_body; + int version; + loff_t offset; + + leaf_shift_right(tb, tb->rnum[0] - 1, + -1); + + version = ih_version(ih); + /* Remember key component and item length */ + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* Calculate key component and item length to insert into R[0] */ + offset = + le_ih_k_offset(ih) + + ((old_len - + tb-> + rbytes) << (is_indirect_le_ih(ih) + ? 
tb->tb_sb-> + s_blocksize_bits - + UNFM_P_SHIFT : 0)); + set_le_ih_k_offset(ih, offset); + put_ih_item_len(ih, tb->rbytes); + /* Insert part of the item into R[0] */ + buffer_info_init_right(tb, &bi); + if ((old_len - tb->rbytes) > zeros_num) { + r_zeros_number = 0; + r_body = + body + (old_len - + tb->rbytes) - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - (old_len - + tb->rbytes); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, + r_zeros_number); + + /* Replace right delimiting key by first key in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], + tb->R[0], 0); + + /* Calculate key component and item length to insert into S[0] */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, + old_len - tb->rbytes); + + tb->insert_size[0] -= tb->rbytes; + + } else { /* whole new item falls into R[0] */ + + /* Shift rnum[0]-1 items to R[0] */ + ret_val = + leaf_shift_right(tb, + tb->rnum[0] - 1, + tb->rbytes); + /* Insert new item into R[0] */ + buffer_info_init_right(tb, &bi); + leaf_insert_into_buf(&bi, + item_pos - n + + tb->rnum[0] - 1, + ih, body, + zeros_num); + + if (item_pos - n + tb->rnum[0] - 1 == 0) { + replace_key(tb, tb->CFR[0], + tb->rkey[0], + tb->R[0], 0); + + } + zeros_num = tb->insert_size[0] = 0; + } + } else { /* new item or part of it doesn't fall into R[0] */ + + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + } + break; + + case M_PASTE: /* append item */ + + if (n - tb->rnum[0] <= item_pos) { /* pasted item or part of it falls to R[0] */ + if (item_pos == n - tb->rnum[0] && tb->rbytes != -1) { /* we must shift the part of the appended item */ + if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) { /* we append to directory item */ + int entry_count; + + RFALSE(zeros_num, + "PAP-12145: invalid parameter in case of a directory"); + entry_count = + I_ENTRY_COUNT(B_N_PITEM_HEAD + (tbS0, + item_pos)); + if (entry_count - tb->rbytes < + pos_in_item) + /* new directory entry falls into R[0] */ + { + int paste_entry_position; + + RFALSE(tb->rbytes - 1 >= + entry_count + || !tb-> + insert_size[0], + "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d", + tb->rbytes, + entry_count); + /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */ + leaf_shift_right(tb, + tb-> + rnum + [0], + tb-> + rbytes + - 1); + /* Paste given directory entry to directory item */ + paste_entry_position = + pos_in_item - + entry_count + + tb->rbytes - 1; + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer + (&bi, 0, + paste_entry_position, + tb->insert_size[0], + body, zeros_num); + /* paste entry */ + leaf_paste_entries(&bi, + 0, + paste_entry_position, + 1, + (struct + reiserfs_de_head + *) + body, + body + + + DEH_SIZE, + tb-> + insert_size + [0] + ); + + if (paste_entry_position + == 0) { + /* change delimiting keys */ + replace_key(tb, + tb-> + CFR + [0], + tb-> + rkey + [0], + tb-> + R + [0], + 0); + } + + tb->insert_size[0] = 0; + pos_in_item++; + } else { /* new directory entry doesn't fall into R[0] */ + + leaf_shift_right(tb, + tb-> + rnum + [0], + tb-> + rbytes); + } + } else { /* regular object */ + + int n_shift, n_rem, + r_zeros_number; + const char *r_body; + + /* Calculate number of bytes which must be shifted from appended item */ + if ((n_shift = + tb->rbytes - + tb->insert_size[0]) < 0) + n_shift = 0; + + RFALSE(pos_in_item != + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)), + "PAP-12155: invalid position to paste. 
ih_item_len=%d, pos_in_item=%d", + pos_in_item, + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos))); + + leaf_shift_right(tb, + tb->rnum[0], + n_shift); + /* Calculate number of bytes which must remain in body after appending to R[0] */ + if ((n_rem = + tb->insert_size[0] - + tb->rbytes) < 0) + n_rem = 0; + + { + int version; + unsigned long temp_rem = + n_rem; + + version = + ih_version + (B_N_PITEM_HEAD + (tb->R[0], 0)); + if (is_indirect_le_key + (version, + B_N_PKEY(tb->R[0], + 0))) { + temp_rem = + n_rem << + (tb->tb_sb-> + s_blocksize_bits + - + UNFM_P_SHIFT); + } + set_le_key_k_offset + (version, + B_N_PKEY(tb->R[0], + 0), + le_key_k_offset + (version, + B_N_PKEY(tb->R[0], + 0)) + + temp_rem); + set_le_key_k_offset + (version, + B_N_PDELIM_KEY(tb-> + CFR + [0], + tb-> + rkey + [0]), + le_key_k_offset + (version, + B_N_PDELIM_KEY + (tb->CFR[0], + tb->rkey[0])) + + temp_rem); + } +/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; + k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ + do_balance_mark_internal_dirty + (tb, tb->CFR[0], 0); + + /* Append part of body into R[0] */ + buffer_info_init_right(tb, &bi); + if (n_rem > zeros_num) { + r_zeros_number = 0; + r_body = + body + n_rem - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - n_rem; + zeros_num -= + r_zeros_number; + } + + leaf_paste_in_buffer(&bi, 0, + n_shift, + tb-> + insert_size + [0] - + n_rem, + r_body, + r_zeros_number); + + if (is_indirect_le_ih + (B_N_PITEM_HEAD + (tb->R[0], 0))) { +#if 0 + RFALSE(n_rem, + "PAP-12160: paste more than one unformatted node pointer"); +#endif + set_ih_free_space + (B_N_PITEM_HEAD + (tb->R[0], 0), 0); + } + tb->insert_size[0] = n_rem; + if (!n_rem) + pos_in_item++; + } + } else { /* pasted item in whole falls into R[0] */ + + struct item_head *pasted; + + ret_val = + leaf_shift_right(tb, tb->rnum[0], + tb->rbytes); + /* append item in R[0] */ + if (pos_in_item >= 0) { + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, + item_pos - + n + + tb-> + rnum[0], + pos_in_item, + tb-> + insert_size + [0], body, + zeros_num); + } + + /* paste new entry, if item is directory item */ + pasted = + B_N_PITEM_HEAD(tb->R[0], + item_pos - n + + tb->rnum[0]); + if (is_direntry_le_ih(pasted) + && pos_in_item >= 0) { + leaf_paste_entries(&bi, + item_pos - + n + + tb->rnum[0], + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + if (!pos_in_item) { + + RFALSE(item_pos - n + + tb->rnum[0], + "PAP-12165: directory item must be first item of node when pasting is in 0th position"); + + /* update delimiting keys */ + replace_key(tb, + tb->CFR[0], + tb->rkey[0], + tb->R[0], + 0); + } + } + + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + zeros_num = tb->insert_size[0] = 0; + } + } else { /* new item doesn't fall into R[0] */ + + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + } + break; + default: /* cases d and t */ + reiserfs_panic(tb->tb_sb, "PAP-12175", + "rnum > 0: unexpected mode: %s(%d)", + (flag == + M_DELETE) ? "DELETE" : ((flag == + M_CUT) ? "CUT" + : "UNKNOWN"), + flag); + } + + } + + /* tb->rnum[0] > 0 */ + RFALSE(tb->blknum[0] > 3, + "PAP-12180: blknum can not be %d. It must be <= 3", + tb->blknum[0]); + RFALSE(tb->blknum[0] < 0, + "PAP-12185: blknum can not be %d. 
It must be >= 0", + tb->blknum[0]); + + /* if while adding to a node we discover that it is possible to split + it in two, and merge the left part into the left neighbor and the + right part into the right neighbor, eliminating the node */ + if (tb->blknum[0] == 0) { /* node S[0] is empty now */ + + RFALSE(!tb->lnum[0] || !tb->rnum[0], + "PAP-12190: lnum and rnum must not be zero"); + /* if insertion was done before 0-th position in R[0], right + delimiting key of the tb->L[0]'s and left delimiting key are + not set correctly */ + if (tb->CFL[0]) { + if (!tb->CFR[0]) + reiserfs_panic(tb->tb_sb, "vs-12195", + "CFR not initialized"); + copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), + B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])); + do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); + } + + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } + + /* Fill new nodes that appear in place of S[0] */ + + /* I am told that this copying is because we need an array to enable + the looping code. -Hans */ + snum[0] = tb->s1num, snum[1] = tb->s2num; + sbytes[0] = tb->s1bytes; + sbytes[1] = tb->s2bytes; + for (i = tb->blknum[0] - 2; i >= 0; i--) { + + RFALSE(!snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, + snum[i]); + + /* here we shift from S to S_new nodes */ + + S_new[i] = get_FEB(tb); + + /* initialized block type and tree level */ + set_blkh_level(B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL); + + n = B_NR_ITEMS(tbS0); + + switch (flag) { + case M_INSERT: /* insert item */ + + if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */ + if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */ + int old_key_comp, old_len, + r_zeros_number; + const char *r_body; + int version; + + /* Move snum[i]-1 items from S[0] to S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i] - 1, -1, + S_new[i]); + /* Remember key component and item length */ + version = ih_version(ih); + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* Calculate key component and item length to insert into S_new[i] */ + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + ((old_len - + sbytes[i]) << + (is_indirect_le_ih + (ih) ? 
tb->tb_sb-> + s_blocksize_bits - + UNFM_P_SHIFT : + 0))); + + put_ih_item_len(ih, sbytes[i]); + + /* Insert part of the item into S_new[i] before 0-th item */ + buffer_info_init_bh(tb, &bi, S_new[i]); + + if ((old_len - sbytes[i]) > zeros_num) { + r_zeros_number = 0; + r_body = + body + (old_len - + sbytes[i]) - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - (old_len - + sbytes[i]); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, + r_zeros_number); + + /* Calculate key component and item length to insert into S[i] */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, + old_len - sbytes[i]); + tb->insert_size[0] -= sbytes[i]; + } else { /* whole new item falls into S_new[i] */ + + /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i] - 1, sbytes[i], + S_new[i]); + + /* Insert new item into S_new[i] */ + buffer_info_init_bh(tb, &bi, S_new[i]); + leaf_insert_into_buf(&bi, + item_pos - n + + snum[i] - 1, ih, + body, zeros_num); + + zeros_num = tb->insert_size[0] = 0; + } + } + + else { /* new item or it part don't falls into S_new[i] */ + + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i], sbytes[i], S_new[i]); + } + break; + + case M_PASTE: /* append item */ + + if (n - snum[i] <= item_pos) { /* pasted item or part if it falls to S_new[i] */ + if (item_pos == n - snum[i] && sbytes[i] != -1) { /* we must shift part of the appended item */ + struct item_head *aux_ih; + + RFALSE(ih, "PAP-12210: ih must be 0"); + + aux_ih = B_N_PITEM_HEAD(tbS0, item_pos); + if (is_direntry_le_ih(aux_ih)) { + /* we append to directory item */ + + int entry_count; + + entry_count = + ih_entry_count(aux_ih); + + if (entry_count - sbytes[i] < + pos_in_item + && pos_in_item <= + entry_count) { + /* new directory entry falls into S_new[i] */ + + RFALSE(!tb-> + insert_size[0], + "PAP-12215: insert_size is already 0"); + RFALSE(sbytes[i] - 1 >= + entry_count, + "PAP-12220: there are no so much entries (%d), only %d", + sbytes[i] - 1, + entry_count); + + /* Shift snum[i]-1 items in whole. 
Shift sbytes[i] directory entries from directory item number snum[i] */ + leaf_move_items + (LEAF_FROM_S_TO_SNEW, + tb, snum[i], + sbytes[i] - 1, + S_new[i]); + /* Paste given directory entry to directory item */ + buffer_info_init_bh(tb, &bi, S_new[i]); + leaf_paste_in_buffer + (&bi, 0, + pos_in_item - + entry_count + + sbytes[i] - 1, + tb->insert_size[0], + body, zeros_num); + /* paste new directory entry */ + leaf_paste_entries(&bi, + 0, + pos_in_item + - + entry_count + + + sbytes + [i] - + 1, 1, + (struct + reiserfs_de_head + *) + body, + body + + + DEH_SIZE, + tb-> + insert_size + [0] + ); + tb->insert_size[0] = 0; + pos_in_item++; + } else { /* new directory entry doesn't fall into S_new[i] */ + leaf_move_items + (LEAF_FROM_S_TO_SNEW, + tb, snum[i], + sbytes[i], + S_new[i]); + } + } else { /* regular object */ + + int n_shift, n_rem, + r_zeros_number; + const char *r_body; + + RFALSE(pos_in_item != + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)) + || tb->insert_size[0] <= + 0, + "PAP-12225: item too short or insert_size <= 0"); + + /* Calculate number of bytes which must be shifted from appended item */ + n_shift = + sbytes[i] - + tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + leaf_move_items + (LEAF_FROM_S_TO_SNEW, tb, + snum[i], n_shift, + S_new[i]); + + /* Calculate number of bytes which must remain in body after append to S_new[i] */ + n_rem = + tb->insert_size[0] - + sbytes[i]; + if (n_rem < 0) + n_rem = 0; + /* Append part of body into S_new[0] */ + buffer_info_init_bh(tb, &bi, S_new[i]); + if (n_rem > zeros_num) { + r_zeros_number = 0; + r_body = + body + n_rem - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - n_rem; + zeros_num -= + r_zeros_number; + } + + leaf_paste_in_buffer(&bi, 0, + n_shift, + tb-> + insert_size + [0] - + n_rem, + r_body, + r_zeros_number); + { + struct item_head *tmp; + + tmp = + B_N_PITEM_HEAD(S_new + [i], + 0); + if (is_indirect_le_ih + (tmp)) { + set_ih_free_space + (tmp, 0); + set_le_ih_k_offset + (tmp, + le_ih_k_offset + (tmp) + + (n_rem << + (tb-> + tb_sb-> + s_blocksize_bits + - + UNFM_P_SHIFT))); + } else { + set_le_ih_k_offset + (tmp, + le_ih_k_offset + (tmp) + + n_rem); + } + } + + tb->insert_size[0] = n_rem; + if (!n_rem) + pos_in_item++; + } + } else + /* item falls wholly into S_new[i] */ + { + int leaf_mi; + struct item_head *pasted; + +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih_check = + B_N_PITEM_HEAD(tbS0, item_pos); + + if (!is_direntry_le_ih(ih_check) + && (pos_in_item != ih_item_len(ih_check) + || tb->insert_size[0] <= 0)) + reiserfs_panic(tb->tb_sb, + "PAP-12235", + "pos_in_item " + "must be equal " + "to ih_item_len"); +#endif /* CONFIG_REISERFS_CHECK */ + + leaf_mi = + leaf_move_items(LEAF_FROM_S_TO_SNEW, + tb, snum[i], + sbytes[i], + S_new[i]); + + RFALSE(leaf_mi, + "PAP-12240: unexpected value returned by leaf_move_items (%d)", + leaf_mi); + + /* paste into item */ + buffer_info_init_bh(tb, &bi, S_new[i]); + leaf_paste_in_buffer(&bi, + item_pos - n + + snum[i], + pos_in_item, + tb->insert_size[0], + body, zeros_num); + + pasted = + B_N_PITEM_HEAD(S_new[i], + item_pos - n + + snum[i]); + if (is_direntry_le_ih(pasted)) { + leaf_paste_entries(&bi, + item_pos - + n + snum[i], + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + } + + /* if we paste to indirect item update ih_free_space */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + zeros_num = tb->insert_size[0] = 0; + } + } + + else { /* pasted item 
doesn't fall into S_new[i] */ + + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i], sbytes[i], S_new[i]); + } + break; + default: /* cases d and t */ + reiserfs_panic(tb->tb_sb, "PAP-12245", + "blknum > 2: unexpected mode: %s(%d)", + (flag == + M_DELETE) ? "DELETE" : ((flag == + M_CUT) ? "CUT" + : "UNKNOWN"), + flag); + } + + memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE); + insert_ptr[i] = S_new[i]; + + RFALSE(!buffer_journaled(S_new[i]) + || buffer_journal_dirty(S_new[i]) + || buffer_dirty(S_new[i]), "PAP-12247: S_new[%d] : (%b)", + i, S_new[i]); + } + + /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the + affected item which remains in S */ + if (0 <= item_pos && item_pos < tb->s0num) { /* if we must insert or append into buffer S[0] */ + + switch (flag) { + case M_INSERT: /* insert item into S[0] */ + buffer_info_init_tbS0(tb, &bi); + leaf_insert_into_buf(&bi, item_pos, ih, body, + zeros_num); + + /* If we insert the first key change the delimiting key */ + if (item_pos == 0) { + if (tb->CFL[0]) /* can be 0 in reiserfsck */ + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); + + } + break; + + case M_PASTE:{ /* append item in S[0] */ + struct item_head *pasted; + + pasted = B_N_PITEM_HEAD(tbS0, item_pos); + /* when directory, may be new entry already pasted */ + if (is_direntry_le_ih(pasted)) { + if (pos_in_item >= 0 && + pos_in_item <= + ih_entry_count(pasted)) { + + RFALSE(!tb->insert_size[0], + "PAP-12260: insert_size is 0 already"); + + /* prepare space */ + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, + item_pos, + pos_in_item, + tb-> + insert_size + [0], body, + zeros_num); + + /* paste entry */ + leaf_paste_entries(&bi, + item_pos, + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + if (!item_pos && !pos_in_item) { + RFALSE(!tb->CFL[0] + || !tb->L[0], + "PAP-12270: CFL[0]/L[0] must be specified"); + if (tb->CFL[0]) { + replace_key(tb, + tb-> + CFL + [0], + tb-> + lkey + [0], + tbS0, + 0); + + } + } + tb->insert_size[0] = 0; + } + } else { /* regular object */ + if (pos_in_item == ih_item_len(pasted)) { + + RFALSE(tb->insert_size[0] <= 0, + "PAP-12275: insert size must not be %d", + tb->insert_size[0]); + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, + item_pos, + pos_in_item, + tb-> + insert_size + [0], body, + zeros_num); + + if (is_indirect_le_ih(pasted)) { +#if 0 + RFALSE(tb-> + insert_size[0] != + UNFM_P_SIZE, + "PAP-12280: insert_size for indirect item must be %d, not %d", + UNFM_P_SIZE, + tb-> + insert_size[0]); +#endif + set_ih_free_space + (pasted, 0); + } + tb->insert_size[0] = 0; + } +#ifdef CONFIG_REISERFS_CHECK + else { + if (tb->insert_size[0]) { + print_cur_tb("12285"); + reiserfs_panic(tb-> + tb_sb, + "PAP-12285", + "insert_size " + "must be 0 " + "(%d)", + tb->insert_size[0]); + } + } +#endif /* CONFIG_REISERFS_CHECK */ + + } + } /* case M_PASTE: */ + } + } +#ifdef CONFIG_REISERFS_CHECK + if (flag == M_PASTE && tb->insert_size[0]) { + print_cur_tb("12290"); + reiserfs_panic(tb->tb_sb, + "PAP-12290", "insert_size is still not 0 (%d)", + tb->insert_size[0]); + } +#endif /* CONFIG_REISERFS_CHECK */ + return 0; +} /* Leaf level of the tree is balanced (end of balance_leaf) */ + +/* Make empty node */ +void make_empty_node(struct buffer_info *bi) +{ + struct block_head *blkh; + + RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL"); + + blkh = B_BLK_HEAD(bi->bi_bh); + 
set_blkh_nr_item(blkh, 0); + set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh)); + + if (bi->bi_parent) + B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */ +} + +/* Get first empty buffer */ +struct buffer_head *get_FEB(struct tree_balance *tb) +{ + int i; + struct buffer_info bi; + + for (i = 0; i < MAX_FEB_SIZE; i++) + if (tb->FEB[i] != NULL) + break; + + if (i == MAX_FEB_SIZE) + reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty"); + + buffer_info_init_bh(tb, &bi, tb->FEB[i]); + make_empty_node(&bi); + set_buffer_uptodate(tb->FEB[i]); + tb->used[i] = tb->FEB[i]; + tb->FEB[i] = NULL; + + return tb->used[i]; +} + +/* This is now used because reiserfs_free_block has to be able to +** schedule. +*/ +static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) +{ + int i; + + if (buffer_dirty(bh)) + reiserfs_warning(tb->tb_sb, "reiserfs-12320", + "called with dirty buffer"); + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) + if (!tb->thrown[i]) { + tb->thrown[i] = bh; + get_bh(bh); /* free_thrown puts this */ + return; + } + reiserfs_warning(tb->tb_sb, "reiserfs-12321", + "too many thrown buffers"); +} + +static void free_thrown(struct tree_balance *tb) +{ + int i; + b_blocknr_t blocknr; + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) { + if (tb->thrown[i]) { + blocknr = tb->thrown[i]->b_blocknr; + if (buffer_dirty(tb->thrown[i])) + reiserfs_warning(tb->tb_sb, "reiserfs-12322", + "called with dirty buffer %d", + blocknr); + brelse(tb->thrown[i]); /* incremented in store_thrown */ + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } + } +} + +void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh) +{ + struct block_head *blkh; + blkh = B_BLK_HEAD(bh); + set_blkh_level(blkh, FREE_LEVEL); + set_blkh_nr_item(blkh, 0); + + clear_buffer_dirty(bh); + store_thrown(tb, bh); +} + +/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/ +void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest, + struct buffer_head *src, int n_src) +{ + + RFALSE(dest == NULL || src == NULL, + "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)", + src, dest); + RFALSE(!B_IS_KEYS_LEVEL(dest), + "vs-12310: invalid level (%z) for destination buffer. 
dest must be leaf", + dest); + RFALSE(n_dest < 0 || n_src < 0, + "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest); + RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src), + "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big", + n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest)); + + if (B_IS_ITEMS_LEVEL(src)) + /* source buffer contains leaf node */ + memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PITEM_HEAD(src, n_src), + KEY_SIZE); + else + memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PDELIM_KEY(src, n_src), + KEY_SIZE); + + do_balance_mark_internal_dirty(tb, dest, 0); +} + +int get_left_neighbor_position(struct tree_balance *tb, int h) +{ + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); + + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL, + "vs-12325: FL[%d](%p) or F[%d](%p) does not exist", + h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h)); + + if (Sh_position == 0) + return B_NR_ITEMS(tb->FL[h]); + else + return Sh_position - 1; +} + +int get_right_neighbor_position(struct tree_balance *tb, int h) +{ + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); + + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL, + "vs-12330: F[%d](%p) or FR[%d](%p) does not exist", + h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]); + + if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h))) + return 0; + else + return Sh_position + 1; +} + +#ifdef CONFIG_REISERFS_CHECK + +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); +static void check_internal_node(struct super_block *s, struct buffer_head *bh, + char *mes) +{ + struct disk_child *dc; + int i; + + RFALSE(!bh, "PAP-12336: bh == 0"); + + if (!bh || !B_IS_IN_TREE(bh)) + return; + + RFALSE(!buffer_dirty(bh) && + !(buffer_journaled(bh) || buffer_journal_dirty(bh)), + "PAP-12337: buffer (%b) must be dirty", bh); + dc = B_N_CHILD(bh, 0); + + for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) { + if (!is_reusable(s, dc_block_number(dc), 1)) { + print_cur_tb(mes); + reiserfs_panic(s, "PAP-12338", + "invalid child pointer %y in %b", + dc, bh); + } + } +} + +static int locked_or_not_in_tree(struct tree_balance *tb, + struct buffer_head *bh, char *which) +{ + if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) || + !B_IS_IN_TREE(bh)) { + reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh); + return 1; + } + return 0; +} + +static int check_before_balancing(struct tree_balance *tb) +{ + int retval = 0; + + if (REISERFS_SB(tb->tb_sb)->cur_tb) { + reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " + "occurred based on cur_tb not being null at " + "this point in code. do_balance cannot properly " + "handle concurrent tree accesses on a same " + "mount point."); + } + + /* double check that buffers that we will modify are unlocked. (fix_nodes should already have + prepped all of these for us). 
*/ + if (tb->lnum[0]) { + retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]"); + retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]"); + check_leaf(tb->L[0]); + } + if (tb->rnum[0]) { + retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]"); + retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]"); + check_leaf(tb->R[0]); + } + retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path), + "S[0]"); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); + + return retval; +} + +static void check_after_balance_leaf(struct tree_balance *tb) +{ + if (tb->lnum[0]) { + if (B_FREE_SPACE(tb->L[0]) != + MAX_CHILD_SIZE(tb->L[0]) - + dc_size(B_N_CHILD + (tb->FL[0], get_left_neighbor_position(tb, 0)))) { + print_cur_tb("12221"); + reiserfs_panic(tb->tb_sb, "PAP-12355", + "shift to left was incorrect"); + } + } + if (tb->rnum[0]) { + if (B_FREE_SPACE(tb->R[0]) != + MAX_CHILD_SIZE(tb->R[0]) - + dc_size(B_N_CHILD + (tb->FR[0], get_right_neighbor_position(tb, 0)))) { + print_cur_tb("12222"); + reiserfs_panic(tb->tb_sb, "PAP-12360", + "shift to right was incorrect"); + } + } + if (PATH_H_PBUFFER(tb->tb_path, 1) && + (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) != + (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1)))))) { + int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)); + int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, + 1)))); + print_cur_tb("12223"); + reiserfs_warning(tb->tb_sb, "reiserfs-12363", + "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " + "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", + left, + MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)), + PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1), + dc_size(B_N_CHILD + (PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1))), + right); + reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect"); + } +} + +static void check_leaf_level(struct tree_balance *tb) +{ + check_leaf(tb->L[0]); + check_leaf(tb->R[0]); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); +} + +static void check_internal_levels(struct tree_balance *tb) +{ + int h; + + /* check all internal nodes */ + for (h = 1; tb->insert_size[h]; h++) { + check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h), + "BAD BUFFER ON PATH"); + if (tb->lnum[h]) + check_internal_node(tb->tb_sb, tb->L[h], "BAD L"); + if (tb->rnum[h]) + check_internal_node(tb->tb_sb, tb->R[h], "BAD R"); + } + +} + +#endif + +/* Now we have all of the buffers that must be used in balancing of + the tree. We rely on the assumption that schedule() will not occur + while do_balance works. ( Only interrupt handlers are acceptable.) + We balance the tree according to the analysis made before this, + using buffers already obtained. For SMP support it will someday be + necessary to add ordered locking of tb. */ + +/* Some interesting rules of balancing: + + we delete a maximum of two nodes per level per balancing: we never + delete R, when we delete two of three nodes L, S, R then we move + them into R. 
+ + we only delete L if we are deleting two nodes, if we delete only + one node we delete S + + if we shift leaves then we shift as much as we can: this is a + deliberate policy of extremism in node packing which results in + higher average utilization after repeated random balance operations + at the cost of more memory copies and more balancing as a result of + small insertions to full nodes. + + if we shift internal nodes we try to evenly balance the node + utilization, with consequent less balancing at the cost of lower + utilization. + + one could argue that the policy for directories in leaves should be + that of internal nodes, but we will wait until another day to + evaluate this.... It would be nice to someday measure and prove + these assumptions as to what is optimal.... + +*/ + +static inline void do_balance_starts(struct tree_balance *tb) +{ + /* use print_cur_tb() to see initial state of struct + tree_balance */ + + /* store_print_tb (tb); */ + + /* do not delete, just comment it out */ +/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, + "check");*/ + RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); +#ifdef CONFIG_REISERFS_CHECK + REISERFS_SB(tb->tb_sb)->cur_tb = tb; +#endif +} + +static inline void do_balance_completed(struct tree_balance *tb) +{ + +#ifdef CONFIG_REISERFS_CHECK + check_leaf_level(tb); + check_internal_levels(tb); + REISERFS_SB(tb->tb_sb)->cur_tb = NULL; +#endif + + /* reiserfs_free_block is no longer schedule safe. So, we need to + ** put the buffers we want freed on the thrown list during do_balance, + ** and then free them now + */ + + REISERFS_SB(tb->tb_sb)->s_do_balance++; + + /* release all nodes hold to perform the balancing */ + unfix_nodes(tb); + + free_thrown(tb); +} + +void do_balance(struct tree_balance *tb, /* tree_balance structure */ + struct item_head *ih, /* item header of inserted item */ + const char *body, /* body of inserted item or bytes to paste */ + int flag) +{ /* i - insert, d - delete + c - cut, p - paste + + Cut means delete part of an item + (includes removing an entry from a + directory). + + Delete means delete whole item. + + Insert means add a new item into the + tree. + + Paste means to append to the end of an + existing file or to insert a directory + entry. */ + int child_pos, /* position of a child node in its parent */ + h; /* level of the tree being processed */ + struct item_head insert_key[2]; /* in our processing of one level + we sometimes determine what + must be inserted into the next + higher level. This insertion + consists of a key or two keys + and their corresponding + pointers */ + struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next + level */ + + tb->tb_mode = flag; + tb->need_balance_dirty = 0; + + if (FILESYSTEM_CHANGED_TB(tb)) { + reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has " + "changed"); + } + /* if we have no real work to do */ + if (!tb->insert_size[0]) { + reiserfs_warning(tb->tb_sb, "PAP-12350", + "insert_size == 0, mode == %c", flag); + unfix_nodes(tb); + return; + } + + atomic_inc(&(fs_generation(tb->tb_sb))); + do_balance_starts(tb); + + /* balance leaf returns 0 except if combining L R and S into + one node. see balance_internal() for explanation of this + line of code. */ + child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) + + balance_leaf(tb, ih, body, flag, insert_key, insert_ptr); + +#ifdef CONFIG_REISERFS_CHECK + check_after_balance_leaf(tb); +#endif + + /* Balance internal level of the tree. 
*/ + for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++) + child_pos = + balance_internal(tb, h, child_pos, insert_key, insert_ptr); + + do_balance_completed(tb); + +} diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c new file mode 100644 index 00000000..91f080cc --- /dev/null +++ b/fs/reiserfs/file.c @@ -0,0 +1,315 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* +** We pack the tails of files on file close, not at the time they are written. +** This implies an unnecessary copy of the tail and an unnecessary indirect item +** insertion/balancing, for files that are written in one write. +** It avoids unnecessary tail packings (balances) for files that are written in +** multiple writes and are small enough to have tails. +** +** file_release is called by the VFS layer when the file is closed. If +** this is the last open file descriptor, and the file +** small enough to have a tail, and the tail is currently in an +** unformatted node, the tail is converted back into a direct item. +** +** We use reiserfs_truncate_file to pack the tail, since it already has +** all the conditions coded. +*/ +static int reiserfs_file_release(struct inode *inode, struct file *filp) +{ + + struct reiserfs_transaction_handle th; + int err; + int jbegin_failure = 0; + + BUG_ON(!S_ISREG(inode->i_mode)); + + if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1)) + return 0; + + mutex_lock(&(REISERFS_I(inode)->tailpack)); + + if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) { + mutex_unlock(&(REISERFS_I(inode)->tailpack)); + return 0; + } + + /* fast out for when nothing needs to be done */ + if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || + !tail_has_to_be_packed(inode)) && + REISERFS_I(inode)->i_prealloc_count <= 0) { + mutex_unlock(&(REISERFS_I(inode)->tailpack)); + return 0; + } + + reiserfs_write_lock(inode->i_sb); + /* freeing preallocation only involves relogging blocks that + * are already in the current transaction. preallocation gets + * freed at the end of each transaction, so it is impossible for + * us to log any additional blocks (including quota blocks) + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) { + /* uh oh, we can't allow the inode to go away while there + * is still preallocation blocks pending. Try to join the + * aborted transaction + */ + jbegin_failure = err; + err = journal_join_abort(&th, inode->i_sb, 1); + + if (err) { + /* hmpf, our choices here aren't good. We can pin the inode + * which will disallow unmount from every happening, we can + * do nothing, which will corrupt random memory on unmount, + * or we can forcibly remove the file from the preallocation + * list, which will leak blocks on disk. Lets pin the inode + * and let the admin know what is going on. 
+ */ + igrab(inode); + reiserfs_warning(inode->i_sb, "clm-9001", + "pinning inode %lu because the " + "preallocation can't be freed", + inode->i_ino); + goto out; + } + } + reiserfs_update_inode_transaction(inode); + +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_prealloc(&th, inode); +#endif + err = journal_end(&th, inode->i_sb, 1); + + /* copy back the error code from journal_begin */ + if (!err) + err = jbegin_failure; + + if (!err && + (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && + tail_has_to_be_packed(inode)) { + + /* if regular file is released by last holder and it has been + appended (we append by unformatted node only) or its direct + item(s) had to be converted, then it may have to be + indirect2direct converted */ + err = reiserfs_truncate_file(inode, 0); + } + out: + reiserfs_write_unlock(inode->i_sb); + mutex_unlock(&(REISERFS_I(inode)->tailpack)); + return err; +} + +static int reiserfs_file_open(struct inode *inode, struct file *file) +{ + int err = dquot_file_open(inode, file); + if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) { + /* somebody might be tailpacking on final close; wait for it */ + mutex_lock(&(REISERFS_I(inode)->tailpack)); + atomic_inc(&REISERFS_I(inode)->openers); + mutex_unlock(&(REISERFS_I(inode)->tailpack)); + } + return err; +} + +static void reiserfs_vfs_truncate_file(struct inode *inode) +{ + mutex_lock(&(REISERFS_I(inode)->tailpack)); + reiserfs_truncate_file(inode, 1); + mutex_unlock(&(REISERFS_I(inode)->tailpack)); +} + +/* Sync a reiserfs file. */ + +/* + * FIXME: sync_mapping_buffers() never has anything to sync. Can + * be removed... + */ + +static int reiserfs_sync_file(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int err; + int barrier_done; + + BUG_ON(!S_ISREG(inode->i_mode)); + err = sync_mapping_buffers(inode->i_mapping); + reiserfs_write_lock(inode->i_sb); + barrier_done = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (barrier_done < 0) + return barrier_done; + return (err < 0) ? -EIO : 0; +} + +/* taken fs/buffer.c:__block_commit_write */ +int reiserfs_commit_page(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; + int new; + int logit = reiserfs_file_data_log(inode); + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + struct reiserfs_transaction_handle th; + int ret = 0; + + th.t_trans_id = 0; + blocksize = 1 << inode->i_blkbits; + + if (logit) { + reiserfs_write_lock(s); + ret = journal_begin(&th, s, bh_per_page + 1); + if (ret) + goto drop_write_lock; + reiserfs_update_inode_transaction(inode); + } + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + new = buffer_new(bh); + clear_buffer_new(bh); + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + if (logit) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, s, bh); + } else if (!buffer_dirty(bh)) { + mark_buffer_dirty(bh); + /* do data=ordered on any page past the end + * of file and any buffer marked BH_New. 
+ */ + if (reiserfs_data_ordered(inode->i_sb) && + (new || page->index >= i_size_index)) { + reiserfs_add_ordered_list(inode, bh); + } + } + } + } + if (logit) { + ret = journal_end(&th, s, bh_per_page + 1); + drop_write_lock: + reiserfs_write_unlock(s); + } + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return ret; +} + +/* Write @count bytes at position @ppos in a file indicated by @file + from the buffer @buf. + + generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want + something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was + written for (ext2/3). This is for several reasons: + + * It has no understanding of any filesystem specific optimizations. + + * It enters the filesystem repeatedly for each page that is written. + + * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key + * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time + * to reiserfs which allows for fewer tree traversals. + + * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks. + + * Asking the block allocation code for blocks one at a time is slightly less efficient. + + All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to + use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. This new code should make + things right finally. + + Future Features: providing search_by_key with hints. + +*/ +static ssize_t reiserfs_file_write(struct file *file, /* the file we are going to write into */ + const char __user * buf, /* pointer to user supplied data + (in userspace) */ + size_t count, /* amount of bytes to write */ + loff_t * ppos /* pointer to position in file that we start writing at. Should be updated to + * new current position before returning. */ + ) +{ + struct inode *inode = file->f_path.dentry->d_inode; // Inode of the file that we are writing to. + /* To simplify coding at this time, we store + locked pages in array for now */ + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; + + /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items + * lying around (most of the disk, in fact). Despite the filesystem + * now being a v3.6 format, the old items still can't support large + * file sizes. Catch this case here, as the rest of the VFS layer is + * oblivious to the different limitations between old and new items. + * reiserfs_setattr catches this for truncates. This chunk is lifted + * from generic_write_checks. 
*/ + if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && + *ppos + count > MAX_NON_LFS) { + if (*ppos >= MAX_NON_LFS) { + return -EFBIG; + } + if (count > MAX_NON_LFS - (unsigned long)*ppos) + count = MAX_NON_LFS - (unsigned long)*ppos; + } + + return do_sync_write(file, buf, count, ppos); +} + +const struct file_operations reiserfs_file_operations = { + .read = do_sync_read, + .write = reiserfs_file_write, + .unlocked_ioctl = reiserfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = reiserfs_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = reiserfs_file_open, + .release = reiserfs_file_release, + .fsync = reiserfs_sync_file, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations reiserfs_file_inode_operations = { + .truncate = reiserfs_vfs_truncate_file, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, +}; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c new file mode 100644 index 00000000..1e4250bc --- /dev/null +++ b/fs/reiserfs/fix_node.c @@ -0,0 +1,2593 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/** + ** old_item_num + ** old_entry_num + ** set_entry_sizes + ** create_virtual_node + ** check_left + ** check_right + ** directory_part_size + ** get_num_ver + ** set_parameters + ** is_leaf_removable + ** are_leaves_removable + ** get_empty_nodes + ** get_lfree + ** get_rfree + ** is_left_neighbor_in_cache + ** decrement_key + ** get_far_parent + ** get_parents + ** can_node_be_removed + ** ip_check_balance + ** dc_check_balance_internal + ** dc_check_balance_leaf + ** dc_check_balance + ** check_balance + ** get_direct_parent + ** get_neighbors + ** fix_nodes + ** + ** + **/ + +#include +#include +#include +#include +#include + +/* To make any changes in the tree we find a node, that contains item + to be changed/deleted or position in the node we insert a new item + to. We call this node S. To do balancing we need to decide what we + will shift to left/right neighbor, or to a new node, where new item + will be etc. To make this analysis simpler we build virtual + node. Virtual node is an array of items, that will replace items of + node S. (For instance if we are going to delete an item, virtual + node does not contain it). Virtual node keeps information about + item sizes and types, mergeability of first and last items, sizes + of all entries in directory item. We use this array of items when + calculating what we can shift to neighbors and how many nodes we + have to have if we do not any shiftings, if we shift to left/right + neighbor or to both. 
*/ + +/* taking item number in virtual node, returns number of item, that it has in source buffer */ +static inline int old_item_num(int new_num, int affected_item_num, int mode) +{ + if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) + return new_num; + + if (mode == M_INSERT) { + + RFALSE(new_num == 0, + "vs-8005: for INSERT mode and item number of inserted item"); + + return new_num - 1; + } + + RFALSE(mode != M_DELETE, + "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", + mode); + /* delete mode */ + return new_num + 1; +} + +static void create_virtual_node(struct tree_balance *tb, int h) +{ + struct item_head *ih; + struct virtual_node *vn = tb->tb_vn; + int new_num; + struct buffer_head *Sh; /* this comes from tb->S[h] */ + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + + /* size of changed node */ + vn->vn_size = + MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h]; + + /* for internal nodes array if virtual items is not created */ + if (h) { + vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); + return; + } + + /* number of items in virtual node */ + vn->vn_nr_item = + B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) - + ((vn->vn_mode == M_DELETE) ? 1 : 0); + + /* first virtual item */ + vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); + memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item)); + vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item); + + /* first item in the node */ + ih = B_N_PITEM_HEAD(Sh, 0); + + /* define the mergeability for 0-th item (if it is not being deleted) */ + if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size) + && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) + vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; + + /* go through all items those remain in the virtual node (except for the new (inserted) one) */ + for (new_num = 0; new_num < vn->vn_nr_item; new_num++) { + int j; + struct virtual_item *vi = vn->vn_vi + new_num; + int is_affected = + ((new_num != vn->vn_affected_item_num) ? 
0 : 1); + + if (is_affected && vn->vn_mode == M_INSERT) + continue; + + /* get item number in source node */ + j = old_item_num(new_num, vn->vn_affected_item_num, + vn->vn_mode); + + vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; + vi->vi_ih = ih + j; + vi->vi_item = B_I_PITEM(Sh, ih + j); + vi->vi_uarea = vn->vn_free_ptr; + + // FIXME: there is no check, that item operation did not + // consume too much memory + vn->vn_free_ptr += + op_create_vi(vn, vi, is_affected, tb->insert_size[0]); + if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) + reiserfs_panic(tb->tb_sb, "vs-8030", + "virtual node space consumed"); + + if (!is_affected) + /* this is not being changed */ + continue; + + if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { + vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; + vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted + } + } + + /* virtual inserted item is not defined yet */ + if (vn->vn_mode == M_INSERT) { + struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num; + + RFALSE(vn->vn_ins_ih == NULL, + "vs-8040: item header of inserted item is not specified"); + vi->vi_item_len = tb->insert_size[0]; + vi->vi_ih = vn->vn_ins_ih; + vi->vi_item = vn->vn_data; + vi->vi_uarea = vn->vn_free_ptr; + + op_create_vi(vn, vi, 0 /*not pasted or cut */ , + tb->insert_size[0]); + } + + /* set right merge flag we take right delimiting key and check whether it is a mergeable item */ + if (tb->CFR[0]) { + struct reiserfs_key *key; + + key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]); + if (op_is_left_mergeable(key, Sh->b_size) + && (vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) + vn->vn_vi[vn->vn_nr_item - 1].vi_type |= + VI_TYPE_RIGHT_MERGEABLE; + +#ifdef CONFIG_REISERFS_CHECK + if (op_is_left_mergeable(key, Sh->b_size) && + !(vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) { + /* we delete last item and it could be merged with right neighbor's first item */ + if (! + (B_NR_ITEMS(Sh) == 1 + && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0)) + && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) { + /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ + print_block(Sh, 0, -1, -1); + reiserfs_panic(tb->tb_sb, "vs-8045", + "rdkey %k, affected item==%d " + "(mode==%c) Must be %c", + key, vn->vn_affected_item_num, + vn->vn_mode, M_DELETE); + } + } +#endif + + } +} + +/* using virtual node check, how many items can be shifted to left + neighbor */ +static void check_left(struct tree_balance *tb, int h, int cur_free) +{ + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; + + RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); + + /* internal level */ + if (h > 0) { + tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space or nothing to move */ + tb->lnum[h] = 0; + tb->lbytes = -1; + return; + } + + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8055: parent does not exist or invalid"); + + vi = vn->vn_vi; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? 
IH_SIZE : 0))) { + /* all contents of S[0] fits into L[0] */ + + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8055: invalid mode or balance condition failed"); + + tb->lnum[0] = vn->vn_nr_item; + tb->lbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* first item may be merge with last item in left neighbor */ + if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) + d_size = -((int)IH_SIZE), ih_size = 0; + + tb->lnum[0] = 0; + for (i = 0; i < vn->vn_nr_item; + i++, ih_size = IH_SIZE, d_size = 0, vi++) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->lnum[0]++; + continue; + } + + /* the item cannot be shifted entirely, try to split it */ + /* check whether L[0] can hold ih and at least one byte of the item body */ + if (cur_free <= ih_size) { + /* cannot shift even a part of the current item */ + tb->lbytes = -1; + return; + } + cur_free -= ih_size; + + tb->lbytes = op_check_left(vi, cur_free, 0, 0); + if (tb->lbytes != -1) + /* count partially shifted item */ + tb->lnum[0]++; + + break; + } + + return; +} + +/* using virtual node check, how many items can be shifted to right + neighbor */ +static void check_right(struct tree_balance *tb, int h, int cur_free) +{ + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; + + RFALSE(cur_free < 0, "vs-8070: cur_free < 0"); + + /* internal level */ + if (h > 0) { + tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space */ + tb->rnum[h] = 0; + tb->rbytes = -1; + return; + } + + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8075: parent does not exist or invalid"); + + vi = vn->vn_vi + vn->vn_nr_item - 1; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? 
IH_SIZE : 0))) { + /* all contents of S[0] fits into R[0] */ + + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8080: invalid mode or balance condition failed"); + + tb->rnum[h] = vn->vn_nr_item; + tb->rbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* last item may be merge with first item in right neighbor */ + if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) + d_size = -(int)IH_SIZE, ih_size = 0; + + tb->rnum[0] = 0; + for (i = vn->vn_nr_item - 1; i >= 0; + i--, d_size = 0, ih_size = IH_SIZE, vi--) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->rnum[0]++; + continue; + } + + /* check whether R[0] can hold ih and at least one byte of the item body */ + if (cur_free <= ih_size) { /* cannot shift even a part of the current item */ + tb->rbytes = -1; + return; + } + + /* R[0] can hold the header of the item and at least one byte of its body */ + cur_free -= ih_size; /* cur_free is still > 0 */ + + tb->rbytes = op_check_right(vi, cur_free); + if (tb->rbytes != -1) + /* count partially shifted item */ + tb->rnum[0]++; + + break; + } + + return; +} + +/* + * from - number of items, which are shifted to left neighbor entirely + * to - number of item, which are shifted to right neighbor entirely + * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor + * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */ +static int get_num_ver(int mode, struct tree_balance *tb, int h, + int from, int from_bytes, + int to, int to_bytes, short *snum012, int flow) +{ + int i; + int cur_free; + // int bytes; + int units; + struct virtual_node *vn = tb->tb_vn; + // struct virtual_item * vi; + + int total_node_size, max_node_size, current_item_size; + int needed_nodes; + int start_item, /* position of item we start filling node from */ + end_item, /* position of item we finish filling node by */ + start_bytes, /* number of first bytes (entries for directory) of start_item-th item + we do not include into node that is being filled */ + end_bytes; /* number of last bytes (entries for directory) of end_item-th item + we do node include into node that is being filled */ + int split_item_positions[2]; /* these are positions in virtual item of + items, that are split between S[0] and + S1new and S1new and S2new */ + + split_item_positions[0] = -1; + split_item_positions[1] = -1; + + /* We only create additional nodes if we are in insert or paste mode + or we are in replace mode at the internal level. If h is 0 and + the mode is M_REPLACE then in fix_nodes we change the mode to + paste or insert before we get here in the code. */ + RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), + "vs-8100: insert_size < 0 in overflow"); + + max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h)); + + /* snum012 [0-2] - number of items, that lay + to S[0], first new node and second new node */ + snum012[3] = -1; /* s1bytes */ + snum012[4] = -1; /* s2bytes */ + + /* internal level */ + if (h > 0) { + i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); + if (i == max_node_size) + return 1; + return (i / max_node_size + 1); + } + + /* leaf level */ + needed_nodes = 1; + total_node_size = 0; + cur_free = max_node_size; + + // start from 'from'-th item + start_item = from; + // skip its first 'start_bytes' units + start_bytes = ((from_bytes != -1) ? 
from_bytes : 0); + + // last included item is the 'end_item'-th one + end_item = vn->vn_nr_item - to - 1; + // do not count last 'end_bytes' units of 'end_item'-th item + end_bytes = (to_bytes != -1) ? to_bytes : 0; + + /* go through all item beginning from the start_item-th item and ending by + the end_item-th item. Do not count first 'start_bytes' units of + 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */ + + for (i = start_item; i <= end_item; i++) { + struct virtual_item *vi = vn->vn_vi + i; + int skip_from_end = ((i == end_item) ? end_bytes : 0); + + RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed"); + + /* get size of current item */ + current_item_size = vi->vi_item_len; + + /* do not take in calculation head part (from_bytes) of from-th item */ + current_item_size -= + op_part_size(vi, 0 /*from start */ , start_bytes); + + /* do not take in calculation tail part of last item */ + current_item_size -= + op_part_size(vi, 1 /*from end */ , skip_from_end); + + /* if item fits into current node entierly */ + if (total_node_size + current_item_size <= max_node_size) { + snum012[needed_nodes - 1]++; + total_node_size += current_item_size; + start_bytes = 0; + continue; + } + + if (current_item_size > max_node_size) { + /* virtual item length is longer, than max size of item in + a node. It is impossible for direct item */ + RFALSE(is_direct_le_ih(vi->vi_ih), + "vs-8110: " + "direct item length is %d. It can not be longer than %d", + current_item_size, max_node_size); + /* we will try to split it */ + flow = 1; + } + + if (!flow) { + /* as we do not split items, take new node and continue */ + needed_nodes++; + i--; + total_node_size = 0; + continue; + } + // calculate number of item units which fit into node being + // filled + { + int free_space; + + free_space = max_node_size - total_node_size - IH_SIZE; + units = + op_check_left(vi, free_space, start_bytes, + skip_from_end); + if (units == -1) { + /* nothing fits into current node, take new node and continue */ + needed_nodes++, i--, total_node_size = 0; + continue; + } + } + + /* something fits into the current node */ + //if (snum012[3] != -1 || needed_nodes != 1) + // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required"); + //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units; + start_bytes += units; + snum012[needed_nodes - 1 + 3] = units; + + if (needed_nodes > 2) + reiserfs_warning(tb->tb_sb, "vs-8111", + "split_item_position is out of range"); + snum012[needed_nodes - 1]++; + split_item_positions[needed_nodes - 1] = i; + needed_nodes++; + /* continue from the same item with start_bytes != -1 */ + start_item = i; + i--; + total_node_size = 0; + } + + // sum012[4] (if it is not -1) contains number of units of which + // are to be in S1new, snum012[3] - to be in S0. They are supposed + // to be S1bytes and S2bytes correspondingly, so recalculate + if (snum012[4] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S1new; + + split_item_num = split_item_positions[1]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S1new = + ((split_item_positions[0] == + split_item_positions[1]) ? 
snum012[3] : 0); + + // s2bytes + snum012[4] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] - + bytes_to_r - bytes_to_l - bytes_to_S1new; + + if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && + vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) + reiserfs_warning(tb->tb_sb, "vs-8115", + "not directory or indirect item"); + } + + /* now we know S2bytes, calculate S1bytes */ + if (snum012[3] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S2new; + + split_item_num = split_item_positions[0]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S2new = + ((split_item_positions[0] == split_item_positions[1] + && snum012[4] != -1) ? snum012[4] : 0); + + // s1bytes + snum012[3] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] - + bytes_to_r - bytes_to_l - bytes_to_S2new; + } + + return needed_nodes; +} + + +/* Set parameters for balancing. + * Performs write of results of analysis of balancing into structure tb, + * where it will later be used by the functions that actually do the balancing. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * lnum number of items from S[h] that must be shifted to L[h]; + * rnum number of items from S[h] that must be shifted to R[h]; + * blk_num number of blocks that S[h] will be splitted into; + * s012 number of items that fall into splitted nodes. + * lbytes number of bytes which flow to the left neighbor from the item that is not + * not shifted entirely + * rbytes number of bytes which flow to the right neighbor from the item that is not + * not shifted entirely + * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array) + */ + +static void set_parameters(struct tree_balance *tb, int h, int lnum, + int rnum, int blk_num, short *s012, int lb, int rb) +{ + + tb->lnum[h] = lnum; + tb->rnum[h] = rnum; + tb->blknum[h] = blk_num; + + if (h == 0) { /* only for leaf level */ + if (s012 != NULL) { + tb->s0num = *s012++, + tb->s1num = *s012++, tb->s2num = *s012++; + tb->s1bytes = *s012++; + tb->s2bytes = *s012; + } + tb->lbytes = lb; + tb->rbytes = rb; + } + PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum); + PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum); + + PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb); + PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb); +} + +/* check, does node disappear if we shift tb->lnum[0] items to left + neighbor and tb->rnum[0] to the right one. */ +static int is_leaf_removable(struct tree_balance *tb) +{ + struct virtual_node *vn = tb->tb_vn; + int to_left, to_right; + int size; + int remain_items; + + /* number of items, that will be shifted to left (right) neighbor + entirely */ + to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); + to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 
1 : 0); + remain_items = vn->vn_nr_item; + + /* how many items remain in S[0] after shiftings to neighbors */ + remain_items -= (to_left + to_right); + + if (remain_items < 1) { + /* all content of node can be shifted to neighbors */ + set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0, + NULL, -1, -1); + return 1; + } + + if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) + /* S[0] is not removable */ + return 0; + + /* check, whether we can divide 1 remaining item between neighbors */ + + /* get size of remaining item (in item units) */ + size = op_unit_num(&(vn->vn_vi[to_left])); + + if (tb->lbytes + tb->rbytes >= size) { + set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL, + tb->lbytes, -1); + return 1; + } + + return 0; +} + +/* check whether L, S, R can be joined in one node */ +static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree) +{ + struct virtual_node *vn = tb->tb_vn; + int ih_size; + struct buffer_head *S0; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + + ih_size = 0; + if (vn->vn_nr_item) { + if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) + ih_size += IH_SIZE; + + if (vn->vn_vi[vn->vn_nr_item - 1]. + vi_type & VI_TYPE_RIGHT_MERGEABLE) + ih_size += IH_SIZE; + } else { + /* there was only one item and it will be deleted */ + struct item_head *ih; + + RFALSE(B_NR_ITEMS(S0) != 1, + "vs-8125: item number must be 1: it is %d", + B_NR_ITEMS(S0)); + + ih = B_N_PITEM_HEAD(S0, 0); + if (tb->CFR[0] + && !comp_short_le_keys(&(ih->ih_key), + B_N_PDELIM_KEY(tb->CFR[0], + tb->rkey[0]))) + if (is_direntry_le_ih(ih)) { + /* Directory must be in correct state here: that is + somewhere at the left side should exist first directory + item. But the item being deleted can not be that first + one because its right neighbor is item of the same + directory. (But first item always gets deleted in last + turn). 
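The is_leaf_removable() test above boils down to a few integer comparisons. A minimal standalone sketch of that decision, with plain integers instead of the tree_balance structure (names are illustrative, this is not the kernel function): lnum/rnum are the shift counts computed by check_left()/check_right(), lbytes/rbytes are -1 when no item has to be split, and boundary_units is op_unit_num() of the one item that would remain.

static int leaf_can_disappear(int nr_item, int lnum, int lbytes,
                              int rnum, int rbytes, int boundary_units)
{
        /* items that move to a neighbour in one piece */
        int to_left  = lnum - (lbytes != -1 ? 1 : 0);
        int to_right = rnum - (rbytes != -1 ? 1 : 0);
        int remain   = nr_item - to_left - to_right;

        if (remain < 1)
                return 1;       /* every item shifts out whole */
        if (remain > 1 || lbytes == -1 || rbytes == -1)
                return 0;       /* more than one item left, or no partial flow */
        /* exactly one item remains: it disappears only if the two partial
         * flows together cover all of its units */
        return lbytes + rbytes >= boundary_units;
}

For example, a leaf of 7 items with lnum = 4, lbytes = 120, rnum = 4, rbytes = 200 keeps 7 - 3 - 3 = 1 item, and that item vanishes only if it has at most 320 units.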
So, neighbors of deleted item can be merged, so + we can save ih_size */ + ih_size = IH_SIZE; + + /* we might check that left neighbor exists and is of the + same directory */ + RFALSE(le_ih_k_offset(ih) == DOT_OFFSET, + "vs-8130: first directory item can not be removed until directory is not empty"); + } + + } + + if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) { + set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1); + PROC_INFO_INC(tb->tb_sb, leaves_removable); + return 1; + } + return 0; + +} + +/* when we do not split item, lnum and rnum are numbers of entire items */ +#define SET_PAR_SHIFT_LEFT \ +if (h)\ +{\ + int to_l;\ + \ + to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\ + (MAX_NR_KEY(Sh) + 1 - lpar);\ + \ + set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (lset==LEFT_SHIFT_FLOW)\ + set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\ + tb->lbytes, -1);\ + else\ + set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\ + -1, -1);\ +} + +#define SET_PAR_SHIFT_RIGHT \ +if (h)\ +{\ + int to_r;\ + \ + to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\ + \ + set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (rset==RIGHT_SHIFT_FLOW)\ + set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\ + -1, tb->rbytes);\ + else\ + set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\ + -1, -1);\ +} + +static void free_buffers_in_tb(struct tree_balance *tb) +{ + int i; + + pathrelse(tb->tb_path); + + for (i = 0; i < MAX_HEIGHT; i++) { + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + + tb->L[i] = NULL; + tb->R[i] = NULL; + tb->FL[i] = NULL; + tb->FR[i] = NULL; + tb->CFL[i] = NULL; + tb->CFR[i] = NULL; + } +} + +/* Get new buffers for storing new nodes that are created while balancing. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + * NO_DISK_SPACE - no disk space. + */ +/* The function is NOT SCHEDULE-SAFE! */ +static int get_empty_nodes(struct tree_balance *tb, int h) +{ + struct buffer_head *new_bh, + *Sh = PATH_H_PBUFFER(tb->tb_path, h); + b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; + int counter, number_of_freeblk, amount_needed, /* number of needed empty blocks */ + retval = CARRY_ON; + struct super_block *sb = tb->tb_sb; + + /* number_of_freeblk is the number of empty blocks which have been + acquired for use by the balancing algorithm minus the number of + empty blocks used in the previous levels of the analysis, + number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs + after empty blocks are acquired, and the balancing analysis is + then restarted, amount_needed is the number needed by this level + (h) of the balancing analysis. + + Note that for systems with many processes writing, it would be + more layout optimal to calculate the total number needed by all + levels and then to run reiserfs_new_blocks to get all of them at once. */ + + /* Initiate number_of_freeblk to the amount acquired prior to the restart of + the analysis or 0 if not restarted, then subtract the amount needed + by all of the levels of the tree below h. */ + /* blknum includes S[h], so we subtract 1 in this calculation */ + for (counter = 0, number_of_freeblk = tb->cur_blknum; + counter < h; counter++) + number_of_freeblk -= + (tb->blknum[counter]) ? 
(tb->blknum[counter] - + 1) : 0; + + /* Allocate missing empty blocks. */ + /* if Sh == 0 then we are getting a new root */ + amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1; + /* Amount_needed = the amount that we need more than the amount that we have. */ + if (amount_needed > number_of_freeblk) + amount_needed -= number_of_freeblk; + else /* If we have enough already then there is nothing to do. */ + return CARRY_ON; + + /* No need to check quota - is not allocated for blocks used for formatted nodes */ + if (reiserfs_new_form_blocknrs(tb, blocknrs, + amount_needed) == NO_DISK_SPACE) + return NO_DISK_SPACE; + + /* for each blocknumber we just got, get a buffer and stick it on FEB */ + for (blocknr = blocknrs, counter = 0; + counter < amount_needed; blocknr++, counter++) { + + RFALSE(!*blocknr, + "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); + + new_bh = sb_getblk(sb, *blocknr); + RFALSE(buffer_dirty(new_bh) || + buffer_journaled(new_bh) || + buffer_journal_dirty(new_bh), + "PAP-8140: journaled or dirty buffer %b for the new block", + new_bh); + + /* Put empty buffers into the array. */ + RFALSE(tb->FEB[tb->cur_blknum], + "PAP-8141: busy slot for new buffer"); + + set_buffer_journal_new(new_bh); + tb->FEB[tb->cur_blknum++] = new_bh; + } + + if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb)) + retval = REPEAT_SEARCH; + + return retval; +} + +/* Get free space of the left neighbor, which is stored in the parent + * node of the left neighbor. */ +static int get_lfree(struct tree_balance *tb, int h) +{ + struct buffer_head *l, *f; + int order; + + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || + (l = tb->FL[h]) == NULL) + return 0; + + if (f == l) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1; + else { + order = B_NR_ITEMS(l); + f = l; + } + + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); +} + +/* Get free space of the right neighbor, + * which is stored in the parent node of the right neighbor. + */ +static int get_rfree(struct tree_balance *tb, int h) +{ + struct buffer_head *r, *f; + int order; + + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || + (r = tb->FR[h]) == NULL) + return 0; + + if (f == r) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1; + else { + order = 0; + f = r; + } + + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); + +} + +/* Check whether left neighbor is in memory. */ +static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) +{ + struct buffer_head *father, *left; + struct super_block *sb = tb->tb_sb; + b_blocknr_t left_neighbor_blocknr; + int left_neighbor_position; + + /* Father of the left neighbor does not exist. */ + if (!tb->FL[h]) + return 0; + + /* Calculate father of the node to be balanced. */ + father = PATH_H_PBUFFER(tb->tb_path, h + 1); + + RFALSE(!father || + !B_IS_IN_TREE(father) || + !B_IS_IN_TREE(tb->FL[h]) || + !buffer_uptodate(father) || + !buffer_uptodate(tb->FL[h]), + "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", + father, tb->FL[h]); + + /* Get position of the pointer to the left neighbor into the left father. */ + left_neighbor_position = (father == tb->FL[h]) ? + tb->lkey[h] : B_NR_ITEMS(tb->FL[h]); + /* Get left neighbor block number. */ + left_neighbor_blocknr = + B_N_CHILD_NUM(tb->FL[h], left_neighbor_position); + /* Look for the left neighbor in the cache. 
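Note that get_lfree() and get_rfree() above never touch the neighbour's own block: the disk_child entry in the neighbour's parent already records how many bytes of the child are in use, so the free space is the maximal child payload minus dc_size. A sketch on plain integers (block_head_size stands for BLKH_SIZE; the 24 bytes used in the example below is quoted as a typical value, not taken from this file):

static int neighbour_free_space(int block_size, int block_head_size,
                                int dc_size_of_child)
{
        /* MAX_CHILD_SIZE(): payload a node can hold = block minus header */
        int max_child_size = block_size - block_head_size;

        return max_child_size - dc_size_of_child;
}

With 4096-byte blocks and a 24-byte block head, a neighbour whose disk_child reports 3000 bytes in use has 4096 - 24 - 3000 = 1072 bytes free.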
*/ + if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) { + + RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left), + "vs-8170: left neighbor (%b %z) is not in the tree", + left, left); + put_bh(left); + return 1; + } + + return 0; +} + +#define LEFT_PARENTS 'l' +#define RIGHT_PARENTS 'r' + +static void decrement_key(struct cpu_key *key) +{ + // call item specific function for this key + item_ops[cpu_key_k_type(key)]->decrement_key(key); +} + +/* Calculate far left/right parent of the left/right neighbor of the current node, that + * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h]. + * Calculate left/right common parent of the current node and L[h]/R[h]. + * Calculate left/right delimiting key position. + * Returns: PATH_INCORRECT - path in the tree is not correct; + SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_far_parent(struct tree_balance *tb, + int h, + struct buffer_head **pfather, + struct buffer_head **pcom_father, char c_lr_par) +{ + struct buffer_head *parent; + INITIALIZE_PATH(s_path_to_neighbor_father); + struct treepath *path = tb->tb_path; + struct cpu_key s_lr_father_key; + int counter, + position = INT_MAX, + first_last_position = 0, + path_offset = PATH_H_PATH_OFFSET(path, h); + + /* Starting from F[h] go upwards in the tree, and look for the common + ancestor of F[h], and its neighbor l/r, that should be obtained. */ + + counter = path_offset; + + RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET, + "PAP-8180: invalid path length"); + + for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) { + /* Check whether parent of the current buffer in the path is really parent in the tree. */ + if (!B_IS_IN_TREE + (parent = PATH_OFFSET_PBUFFER(path, counter - 1))) + return REPEAT_SEARCH; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(path, + counter - 1)) > + B_NR_ITEMS(parent)) + return REPEAT_SEARCH; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(path, counter)->b_blocknr) + return REPEAT_SEARCH; + /* Return delimiting key if position in the parent is not equal to first/last one. */ + if (c_lr_par == RIGHT_PARENTS) + first_last_position = B_NR_ITEMS(parent); + if (position != first_last_position) { + *pcom_father = parent; + get_bh(*pcom_father); + /*(*pcom_father = parent)->b_count++; */ + break; + } + } + + /* if we are in the root of the tree, then there is no common father */ + if (counter == FIRST_PATH_ELEMENT_OFFSET) { + /* Check whether first buffer in the path is the root of the tree. */ + if (PATH_OFFSET_PBUFFER + (tb->tb_path, + FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK(tb->tb_sb)) { + *pfather = *pcom_father = NULL; + return CARRY_ON; + } + return REPEAT_SEARCH; + } + + RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL, + "PAP-8185: (%b %z) level too small", + *pcom_father, *pcom_father); + + /* Check whether the common parent is locked. */ + + if (buffer_locked(*pcom_father)) { + + /* Release the write lock while the buffer is busy */ + reiserfs_write_unlock(tb->tb_sb); + __wait_on_buffer(*pcom_father); + reiserfs_write_lock(tb->tb_sb); + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(*pcom_father); + return REPEAT_SEARCH; + } + } + + /* So, we got common parent of the current node and its left/right neighbor. + Now we are geting the parent of the left/right neighbor. 
*/ + + /* Form key to get parent of the left/right neighbor. */ + le_key2cpu_key(&s_lr_father_key, + B_N_PDELIM_KEY(*pcom_father, + (c_lr_par == + LEFT_PARENTS) ? (tb->lkey[h - 1] = + position - + 1) : (tb->rkey[h - + 1] = + position))); + + if (c_lr_par == LEFT_PARENTS) + decrement_key(&s_lr_father_key); + + if (search_by_key + (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, + h + 1) == IO_ERROR) + // path is released + return IO_ERROR; + + if (FILESYSTEM_CHANGED_TB(tb)) { + pathrelse(&s_path_to_neighbor_father); + brelse(*pcom_father); + return REPEAT_SEARCH; + } + + *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); + + RFALSE(B_LEVEL(*pfather) != h + 1, + "PAP-8190: (%b %z) level too small", *pfather, *pfather); + RFALSE(s_path_to_neighbor_father.path_length < + FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small"); + + s_path_to_neighbor_father.path_length--; + pathrelse(&s_path_to_neighbor_father); + return CARRY_ON; +} + +/* Get parents of neighbors of node in the path(S[path_offset]) and common parents of + * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset], + * FR[path_offset], CFL[path_offset], CFR[path_offset]. + * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset]. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_parents(struct tree_balance *tb, int h) +{ + struct treepath *path = tb->tb_path; + int position, + ret, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); + struct buffer_head *curf, *curcf; + + /* Current node is the root of the tree or will be root of the tree */ + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { + /* The root can not have parents. + Release nodes which previously were obtained as parents of the current node neighbors. */ + brelse(tb->FL[h]); + brelse(tb->CFL[h]); + brelse(tb->FR[h]); + brelse(tb->CFR[h]); + tb->FL[h] = NULL; + tb->CFL[h] = NULL; + tb->FR[h] = NULL; + tb->CFR[h] = NULL; + return CARRY_ON; + } + + /* Get parent FL[path_offset] of L[path_offset]. */ + position = PATH_OFFSET_POSITION(path, path_offset - 1); + if (position) { + /* Current node is not the first child of its parent. */ + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + get_bh(curf); + get_bh(curf); + tb->lkey[h] = position - 1; + } else { + /* Calculate current parent of L[path_offset], which is the left neighbor of the current node. + Calculate current common parent of L[path_offset] and the current node. Note that + CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset]. + Calculate lkey[path_offset]. */ + if ((ret = get_far_parent(tb, h + 1, &curf, + &curcf, + LEFT_PARENTS)) != CARRY_ON) + return ret; + } + + brelse(tb->FL[h]); + tb->FL[h] = curf; /* New initialization of FL[h]. */ + brelse(tb->CFL[h]); + tb->CFL[h] = curcf; /* New initialization of CFL[h]. */ + + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf); + +/* Get parent FR[h] of R[h]. */ + +/* Current node is the last child of F[h]. FR[h] != F[h]. */ + if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) { +/* Calculate current parent of R[h], which is the right neighbor of F[h]. + Calculate current common parent of R[h] and current node. Note that CFR[h] + not equal FR[path_offset] and CFR[h] not equal F[h]. 
*/ + if ((ret = + get_far_parent(tb, h + 1, &curf, &curcf, + RIGHT_PARENTS)) != CARRY_ON) + return ret; + } else { +/* Current node is not the last child of its parent F[h]. */ + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + get_bh(curf); + get_bh(curf); + tb->rkey[h] = position; + } + + brelse(tb->FR[h]); + /* New initialization of FR[path_offset]. */ + tb->FR[h] = curf; + + brelse(tb->CFR[h]); + /* New initialization of CFR[path_offset]. */ + tb->CFR[h] = curcf; + + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf); + + return CARRY_ON; +} + +/* it is possible to remove node as result of shiftings to + neighbors even when we insert or paste item. */ +static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, + struct tree_balance *tb, int h) +{ + struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h); + int levbytes = tb->insert_size[h]; + struct item_head *ih; + struct reiserfs_key *r_key = NULL; + + ih = B_N_PITEM_HEAD(Sh, 0); + if (tb->CFR[h]) + r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]); + + if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes + /* shifting may merge items which might save space */ + - + ((!h + && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0) + - + ((!h && r_key + && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0) + + ((h) ? KEY_SIZE : 0)) { + /* node can not be removed */ + if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if (!h) + tb->s0num = + B_NR_ITEMS(Sh) + + ((mode == M_INSERT) ? 1 : 0); + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + } + PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]); + return !NO_BALANCING_NEEDED; +} + +/* Check whether current node S[h] is balanced when increasing its size by + * Inserting or Pasting. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +/* ip means Inserting or Pasting */ +static int ip_check_balance(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + int levbytes, /* Number of bytes that must be inserted into (value + is negative if bytes are deleted) buffer which + contains node being balanced. The mnemonic is + that the attempted change in node space used level + is levbytes bytes. */ + ret; + + int lfree, sfree, rfree /* free space in L, S and R */ ; + + /* nver is short for number of vertixes, and lnver is the number if + we shift to the left, rnver is the number if we shift to the + right, and lrnver is the number if we shift in both directions. + The goal is to minimize first the number of vertixes, and second, + the number of vertixes whose contents are changed by shifting, + and third the number of uncached vertixes whose contents are + changed by shifting and must be read from disk. */ + int nver, lnver, rnver, lrnver; + + /* used at leaf level only, S0 = S[0] is the node being balanced, + sInum [ I = 0,1,2 ] is the number of items that will + remain in node SI after balancing. S1 and S2 are new + nodes that might be created. */ + + /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. 
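The 40-short scratch array declared just below is really eight independent five-short records, one per shifting variant; the *_SHIFT_*FLOW offsets defined further down (0, 5, ..., 35) select the record, and set_parameters() later consumes it as s0num, s1num, s2num, s1bytes, s2bytes. A sketch of that layout with illustrative names only (not a structure used by the kernel code):

/* one record of the snum012 scratch array */
struct split_plan {
        short s0num;    /* items that stay in S[0] */
        short s1num;    /* items that go to the first new node */
        short s2num;    /* items that go to the second new node */
        short s1bytes;  /* units of the item split off into S1 */
        short s2bytes;  /* units of the item split off into S2 */
};

static void read_plan(const short *snum012, int variant_offset,
                      struct split_plan *out)
{
        const short *p = snum012 + variant_offset;      /* 0, 5, 10, ..., 35 */

        out->s0num   = p[0];
        out->s1num   = p[1];
        out->s2num   = p[2];
        out->s1bytes = p[3];
        out->s2bytes = p[4];
}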
+ where 4th parameter is s1bytes and 5th - s2bytes + */ + short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases + 0,1 - do not shift and do not shift but bottle + 2 - shift only whole item to left + 3 - shift to left and bottle as much as possible + 4,5 - shift to right (whole items and as much as possible + 6,7 - shift to both directions (whole items and as much as possible) + */ + + /* Sh is the node whose balance is currently being checked */ + struct buffer_head *Sh; + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + levbytes = tb->insert_size[h]; + + /* Calculate balance parameters for creating new root. */ + if (!Sh) { + if (!h) + reiserfs_panic(tb->tb_sb, "vs-8210", + "S[0] can not be 0"); + switch (ret = get_empty_nodes(tb, h)) { + case CARRY_ON: + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + + case NO_DISK_SPACE: + case REPEAT_SEARCH: + return ret; + default: + reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect " + "return value of get_empty_nodes"); + } + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) /* get parents of S[h] neighbors. */ + return ret; + + sfree = B_FREE_SPACE(Sh); + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) == + NO_BALANCING_NEEDED) + /* and new item fits into node S[h] without any shifting */ + return NO_BALANCING_NEEDED; + + create_virtual_node(tb, h); + + /* + determine maximal number of items we can shift to the left neighbor (in tb structure) + and the maximal number of bytes that can flow to the left neighbor + from the left most liquid item that cannot be shifted from S[0] entirely (returned value) + */ + check_left(tb, h, lfree); + + /* + determine maximal number of items we can shift to the right neighbor (in tb structure) + and the maximal number of bytes that can flow to the right neighbor + from the right most liquid item that cannot be shifted from S[0] entirely (returned value) + */ + check_right(tb, h, rfree); + + /* all contents of internal node S[h] can be moved into its + neighbors, S[h] will be removed after balancing */ + if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { + int to_r; + + /* Since we are working on internal nodes, and our internal + nodes have fixed size entries, then we can balance by the + number of items rather than the space they consume. In this + routine we set the left node equal to the right node, + allowing a difference of less than or equal to 1 child + pointer. */ + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - + tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, + -1, -1); + return CARRY_ON; + } + + /* this checks balance condition, that any two neighboring nodes can not fit in one node */ + RFALSE(h && + (tb->lnum[h] >= vn->vn_nr_item + 1 || + tb->rnum[h] >= vn->vn_nr_item + 1), + "vs-8220: tree is not balanced on internal level"); + RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || + (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))), + "vs-8225: tree is not balanced on leaf level"); + + /* all contents of S[0] can be moved into its neighbors + S[0] will be removed after balancing. */ + if (!h && is_leaf_removable(tb)) + return CARRY_ON; + + /* why do we perform this check here rather than earlier?? + Answer: we can win 1 node in some cases above. 
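The to_r expression used above for internal nodes is easier to follow as plain arithmetic: L currently holds MAX_NR_KEY(Sh) + 1 - lnum child pointers, R holds MAX_NR_KEY(Sh) + 1 - rnum, S[h] contributes nr_item + 1 more, and to_r is simply how many of the grand total R must take to end up with half. A minimal sketch, not the kernel code, with the macro replaced by a parameter:

static int pointers_to_right(int max_nr_key, int lnum, int rnum, int nr_item)
{
        int left_now  = max_nr_key + 1 - lnum;  /* pointers already in L */
        int right_now = max_nr_key + 1 - rnum;  /* pointers already in R */
        int total     = left_now + right_now + nr_item + 1;

        /* R gets half of the total; L gets the remaining nr_item + 1 - to_r,
         * so the two ends differ by at most one child pointer */
        return total / 2 - right_now;
}

With hypothetical numbers max_nr_key = 10, lnum = 4, rnum = 3, nr_item = 5: left_now = 7, right_now = 8, total = 21, so to_r = 10 - 8 = 2 and the two nodes end up with 11 and 10 pointers.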
Moreover we + checked it above, when we checked, that S[0] is not removable + in principle */ + if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if (!h) + tb->s0num = vn->vn_nr_item; + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + { + int lpar, rpar, nset, lset, rset, lrset; + /* + * regular overflowing of the node + */ + + /* get_num_ver works in 2 modes (FLOW & NO_FLOW) + lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) + nset, lset, rset, lrset - shows, whether flowing items give better packing + */ +#define FLOW 1 +#define NO_FLOW 0 /* do not any splitting */ + + /* we choose one the following */ +#define NOTHING_SHIFT_NO_FLOW 0 +#define NOTHING_SHIFT_FLOW 5 +#define LEFT_SHIFT_NO_FLOW 10 +#define LEFT_SHIFT_FLOW 15 +#define RIGHT_SHIFT_NO_FLOW 20 +#define RIGHT_SHIFT_FLOW 25 +#define LR_SHIFT_NO_FLOW 30 +#define LR_SHIFT_FLOW 35 + + lpar = tb->lnum[h]; + rpar = tb->rnum[h]; + + /* calculate number of blocks S[h] must be split into when + nothing is shifted to the neighbors, + as well as number of items in each part of the split node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any */ + nset = NOTHING_SHIFT_NO_FLOW; + nver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, h ? vn->vn_nr_item : 0, -1, + snum012, NO_FLOW); + + if (!h) { + int nver1; + + /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */ + nver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, 0, -1, + snum012 + NOTHING_SHIFT_FLOW, FLOW); + if (nver > nver1) + nset = NOTHING_SHIFT_FLOW, nver = nver1; + } + + /* calculate number of blocks S[h] must be split into when + l_shift_num first items and l_shift_bytes of the right most + liquid item to be shifted are shifted to the left neighbor, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + lset = LEFT_SHIFT_NO_FLOW; + lnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, h ? vn->vn_nr_item : 0, -1, + snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lnver1; + + lnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, 0, -1, + snum012 + LEFT_SHIFT_FLOW, FLOW); + if (lnver > lnver1) + lset = LEFT_SHIFT_FLOW, lnver = lnver1; + } + + /* calculate number of blocks S[h] must be split into when + r_shift_num first items and r_shift_bytes of the left most + liquid item to be shifted are shifted to the right neighbor, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + rset = RIGHT_SHIFT_NO_FLOW; + rnver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 1 : + 0)), -1, + snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int rnver1; + + rnver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + (rpar - + ((tb->rbytes != -1) ? 
1 : 0)), + tb->rbytes, + snum012 + RIGHT_SHIFT_FLOW, FLOW); + + if (rnver > rnver1) + rset = RIGHT_SHIFT_FLOW, rnver = rnver1; + } + + /* calculate number of blocks S[h] must be split into when + items are shifted in both directions, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + lrset = LR_SHIFT_NO_FLOW; + lrnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 1 : + 0)), -1, + snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lrnver1; + + lrnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, + (rpar - + ((tb->rbytes != -1) ? 1 : 0)), + tb->rbytes, + snum012 + LR_SHIFT_FLOW, FLOW); + if (lrnver > lrnver1) + lrset = LR_SHIFT_FLOW, lrnver = lrnver1; + } + + /* Our general shifting strategy is: + 1) to minimized number of new nodes; + 2) to minimized number of neighbors involved in shifting; + 3) to minimized number of disk reads; */ + + /* we can win TWO or ONE nodes by shifting in both directions */ + if (lrnver < lnver && lrnver < rnver) { + RFALSE(h && + (tb->lnum[h] != 1 || + tb->rnum[h] != 1 || + lrnver != 1 || rnver != 2 || lnver != 2 + || h != 1), "vs-8230: bad h"); + if (lrset == LR_SHIFT_FLOW) + set_parameters(tb, h, tb->lnum[h], tb->rnum[h], + lrnver, snum012 + lrset, + tb->lbytes, tb->rbytes); + else + set_parameters(tb, h, + tb->lnum[h] - + ((tb->lbytes == -1) ? 0 : 1), + tb->rnum[h] - + ((tb->rbytes == -1) ? 0 : 1), + lrnver, snum012 + lrset, -1, -1); + + return CARRY_ON; + } + + /* if shifting doesn't lead to better packing then don't shift */ + if (nver == lrnver) { + set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1, + -1); + return CARRY_ON; + } + + /* now we know that for better packing shifting in only one + direction either to the left or to the right is required */ + + /* if shifting to the left is better than shifting to the right */ + if (lnver < rnver) { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* if shifting to the right is better than shifting to the left */ + if (lnver > rnver) { + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } + + /* now shifting in either direction gives the same number + of nodes and we can make use of the cached neighbors */ + if (is_left_neighbor_in_cache(tb, h)) { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* shift to the right independently on whether the right neighbor in cache or not */ + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } +} + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting for INTERNAL node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + * + * Note: Items of internal nodes have fixed size, so the balance condition for + * the internal part of S+tree is as for the B-trees. + */ +static int dc_check_balance_internal(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + + /* Sh is the node whose balance is currently being checked, + and Fh is its father. 
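The chain of comparisons at the end of ip_check_balance() above implements the three-step preference stated in the comment: fewest new nodes first, then fewest neighbours disturbed, then fewest disk reads. Condensed into one standalone function (names are illustrative and the set_parameters() calls are left out), the decision reads:

enum shift_variant { SHIFT_NONE, SHIFT_LEFT, SHIFT_RIGHT, SHIFT_BOTH };

static enum shift_variant choose_variant(int nver, int lnver, int rnver,
                                         int lrnver, int left_in_cache)
{
        if (lrnver < lnver && lrnver < rnver)
                return SHIFT_BOTH;      /* two-sided shift wins a node or two */
        if (nver == lrnver)
                return SHIFT_NONE;      /* shifting does not pack any better */
        if (lnver < rnver)
                return SHIFT_LEFT;      /* one-sided shift, left is cheaper */
        if (lnver > rnver)
                return SHIFT_RIGHT;
        /* same node count either way: prefer the neighbour already cached */
        return left_in_cache ? SHIFT_LEFT : SHIFT_RIGHT;
}

nver, lnver, rnver and lrnver are the node counts returned by the four pairs of get_num_ver() calls above.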
*/ + struct buffer_head *Sh, *Fh; + int maxsize, ret; + int lfree, rfree /* free space in L and R */ ; + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + Fh = PATH_H_PPARENT(tb->tb_path, h); + + maxsize = MAX_CHILD_SIZE(Sh); + +/* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */ +/* new_nr_item = number of items node would have if operation is */ +/* performed without balancing (new_nr_item); */ + create_virtual_node(tb, h); + + if (!Fh) { /* S[h] is the root. */ + if (vn->vn_nr_item > 0) { + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + } + /* new_nr_item == 0. + * Current root will be deleted resulting in + * decrementing the tree height. */ + set_parameters(tb, h, 0, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) + return ret; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + /* determine maximal number of items we can fit into neighbors */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid. + * In this case we balance only if it leads to better packing. */ + if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors, + * which is impossible with greater values of new_nr_item. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) { + /* All contents of S[h] can be moved to L[h]. */ + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, + -1); + return CARRY_ON; + } + + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + /* All contents of S[h] can be moved to R[h]. */ + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + B_NR_ITEMS(Fh)) ? 0 : n + 1; + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, + -1); + return CARRY_ON; + } + } + + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ + int to_r; + + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - + tb->rnum[h] + vn->vn_nr_item + 1) / 2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, + 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Balancing does not lead to better packing. */ + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + /* Current node contain insufficient number of items. Balancing is required. */ + /* Check whether we can merge S[h] with left neighbor. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) + if (is_left_neighbor_in_cache(tb, h) + || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) { + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Check whether we can merge S[h] with right neighbor. */ + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == B_NR_ITEMS(Fh)) ? 
0 : (n + 1); + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { + int to_r; + + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - + tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, + -1, -1); + return CARRY_ON; + } + + /* For internal nodes try to borrow item from a neighbor */ + RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root"); + + /* Borrow one or two items from caching neighbor */ + if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) { + int from_l; + + from_l = + (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + + 1) / 2 - (vn->vn_nr_item + 1); + set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1); + return CARRY_ON; + } + + set_parameters(tb, h, 0, + -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item + + 1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1); + return CARRY_ON; +} + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Truncating for LEAF node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int dc_check_balance_leaf(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + + /* Number of bytes that must be deleted from + (value is negative if bytes are deleted) buffer which + contains node being balanced. The mnemonic is that the + attempted change in node space used level is levbytes bytes. */ + int levbytes; + /* the maximal item size */ + int maxsize, ret; + /* S0 is the node whose balance is currently being checked, + and F0 is its father. */ + struct buffer_head *S0, *F0; + int lfree, rfree /* free space in L and R */ ; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + F0 = PATH_H_PPARENT(tb->tb_path, 0); + + levbytes = tb->insert_size[h]; + + maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ + + if (!F0) { /* S[0] is the root now. */ + + RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0), + "vs-8240: attempt to create empty buffer tree"); + + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) + return ret; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + create_virtual_node(tb, h); + + /* if 3 leaves can be merge to one, set parameters and return */ + if (are_leaves_removable(tb, lfree, rfree)) + return CARRY_ON; + + /* determine maximal number of items we can shift to the left/right neighbor + and the maximal number of bytes that can flow to the left/right neighbor + from the left/right most liquid item that cannot be shifted from S[0] entirely + */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + /* check whether we can merge S with left neighbor. */ + if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) + if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 
0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ + !tb->FR[h]) { + + RFALSE(!tb->FL[h], + "vs-8245: dc_check_balance_leaf: FL[h] must exist"); + + /* set parameter to merge S[0] with its left neighbor */ + set_parameters(tb, h, -1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* check whether we can merge S[0] with right neighbor. */ + if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { + set_parameters(tb, h, 0, -1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */ + if (is_leaf_removable(tb)) + return CARRY_ON; + + /* Balancing is not required. */ + tb->s0num = vn->vn_nr_item; + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; +} + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode d - delete, c - cut. + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int dc_check_balance(struct tree_balance *tb, int h) +{ + RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)), + "vs-8250: S is not initialized"); + + if (h) + return dc_check_balance_internal(tb, h); + else + return dc_check_balance_leaf(tb, h); +} + +/* Check whether current node S[h] is balanced. + * Calculate parameters for balancing for current level h. + * Parameters: + * + * tb tree_balance structure: + * + * tb is a large structure that must be read about in the header file + * at the same time as this procedure if the reader is to successfully + * understand this procedure + * + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste, d - delete, c - cut. + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int check_balance(int mode, + struct tree_balance *tb, + int h, + int inum, + int pos_in_item, + struct item_head *ins_ih, const void *data) +{ + struct virtual_node *vn; + + vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf); + vn->vn_free_ptr = (char *)(tb->tb_vn + 1); + vn->vn_mode = mode; + vn->vn_affected_item_num = inum; + vn->vn_pos_in_item = pos_in_item; + vn->vn_ins_ih = ins_ih; + vn->vn_data = data; + + RFALSE(mode == M_INSERT && !vn->vn_ins_ih, + "vs-8255: ins_ih can not be 0 in insert mode"); + + if (tb->insert_size[h] > 0) + /* Calculate balance parameters when size of node is increasing. */ + return ip_check_balance(tb, h); + + /* Calculate balance parameters when size of node is decreasing. */ + return dc_check_balance(tb, h); +} + +/* Check whether parent at the path is the really parent of the current node.*/ +static int get_direct_parent(struct tree_balance *tb, int h) +{ + struct buffer_head *bh; + struct treepath *path = tb->tb_path; + int position, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); + + /* We are in the root or in the new root. */ + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, + "PAP-8260: invalid offset in the path"); + + if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) { + /* Root is not changed. 
*/ + PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL; + PATH_OFFSET_POSITION(path, path_offset - 1) = 0; + return CARRY_ON; + } + return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ + } + + if (!B_IS_IN_TREE + (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1))) + return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ + + if ((position = + PATH_OFFSET_POSITION(path, + path_offset - 1)) > B_NR_ITEMS(bh)) + return REPEAT_SEARCH; + + if (B_N_CHILD_NUM(bh, position) != + PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr) + /* Parent in the path is not parent of the current node in the tree. */ + return REPEAT_SEARCH; + + if (buffer_locked(bh)) { + reiserfs_write_unlock(tb->tb_sb); + __wait_on_buffer(bh); + reiserfs_write_lock(tb->tb_sb); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } + + return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */ +} + +/* Using lnum[h] and rnum[h] we should determine what neighbors + * of S[h] we + * need in order to balance S[h], and get them if necessary. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_neighbors(struct tree_balance *tb, int h) +{ + int child_position, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1); + unsigned long son_number; + struct super_block *sb = tb->tb_sb; + struct buffer_head *bh; + + PROC_INFO_INC(sb, get_neighbors[h]); + + if (tb->lnum[h]) { + /* We need left neighbor to balance S[h]. */ + PROC_INFO_INC(sb, need_l_neighbor[h]); + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); + + RFALSE(bh == tb->FL[h] && + !PATH_OFFSET_POSITION(tb->tb_path, path_offset), + "PAP-8270: invalid position in the parent"); + + child_position = + (bh == + tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> + FL[h]); + son_number = B_N_CHILD_NUM(tb->FL[h], child_position); + reiserfs_write_unlock(sb); + bh = sb_bread(sb, son_number); + reiserfs_write_lock(sb); + if (!bh) + return IO_ERROR; + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(bh); + PROC_INFO_INC(sb, get_neighbors_restart[h]); + return REPEAT_SEARCH; + } + + RFALSE(!B_IS_IN_TREE(tb->FL[h]) || + child_position > B_NR_ITEMS(tb->FL[h]) || + B_N_CHILD_NUM(tb->FL[h], child_position) != + bh->b_blocknr, "PAP-8275: invalid parent"); + RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child"); + RFALSE(!h && + B_FREE_SPACE(bh) != + MAX_CHILD_SIZE(bh) - + dc_size(B_N_CHILD(tb->FL[0], child_position)), + "PAP-8290: invalid child size of left neighbor"); + + brelse(tb->L[h]); + tb->L[h] = bh; + } + + /* We need right neighbor to balance S[path_offset]. */ + if (tb->rnum[h]) { /* We need right neighbor to balance S[path_offset]. */ + PROC_INFO_INC(sb, need_r_neighbor[h]); + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); + + RFALSE(bh == tb->FR[h] && + PATH_OFFSET_POSITION(tb->tb_path, + path_offset) >= + B_NR_ITEMS(bh), + "PAP-8295: invalid position in the parent"); + + child_position = + (bh == tb->FR[h]) ? 
tb->rkey[h] + 1 : 0; + son_number = B_N_CHILD_NUM(tb->FR[h], child_position); + reiserfs_write_unlock(sb); + bh = sb_bread(sb, son_number); + reiserfs_write_lock(sb); + if (!bh) + return IO_ERROR; + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(bh); + PROC_INFO_INC(sb, get_neighbors_restart[h]); + return REPEAT_SEARCH; + } + brelse(tb->R[h]); + tb->R[h] = bh; + + RFALSE(!h + && B_FREE_SPACE(bh) != + MAX_CHILD_SIZE(bh) - + dc_size(B_N_CHILD(tb->FR[0], child_position)), + "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", + B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh), + dc_size(B_N_CHILD(tb->FR[0], child_position))); + + } + return CARRY_ON; +} + +static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) +{ + int max_num_of_items; + int max_num_of_entries; + unsigned long blocksize = sb->s_blocksize; + +#define MIN_NAME_LEN 1 + + max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN); + max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / + (DEH_SIZE + MIN_NAME_LEN); + + return sizeof(struct virtual_node) + + max(max_num_of_items * sizeof(struct virtual_item), + sizeof(struct virtual_item) + sizeof(struct direntry_uarea) + + (max_num_of_entries - 1) * sizeof(__u16)); +} + +/* maybe we should fail balancing we are going to perform when kmalloc + fails several times. But now it will loop until kmalloc gets + required memory */ +static int get_mem_for_virtual_node(struct tree_balance *tb) +{ + int check_fs = 0; + int size; + char *buf; + + size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path)); + + if (size > tb->vn_buf_size) { + /* we have to allocate more memory for virtual node */ + if (tb->vn_buf) { + /* free memory allocated before */ + kfree(tb->vn_buf); + /* this is not needed if kfree is atomic */ + check_fs = 1; + } + + /* virtual node requires now more memory */ + tb->vn_buf_size = size; + + /* get memory for virtual item */ + buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!buf) { + /* getting memory with GFP_KERNEL priority may involve + balancing now (due to indirect_to_direct conversion on + dcache shrinking). 
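The kmalloc() above is deliberately the non-sleeping variant: GFP_ATOMIC | __GFP_NOWARN cannot schedule, so the path and neighbour buffers still held stay valid. Only when it fails does the code below release everything and retry with a blocking GFP_NOFS allocation, after which the whole search has to be repeated. The same pattern in isolation, as a sketch with a hypothetical helper name (not a function in this file):

#include <linux/slab.h>
#include <linux/types.h>

static void *grow_scratch_buf(size_t size, bool *must_repeat)
{
        /* first try: may not sleep, so held resources stay consistent */
        void *buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);

        if (!buf) {
                /* fix_node.c calls free_buffers_in_tb() at this point,
                 * dropping the path and collected buffers before sleeping */
                buf = kmalloc(size, GFP_NOFS);
                *must_repeat = true;    /* the blocking attempt may have slept */
        }
        return buf;
}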
So, release path and collected + resources here */ + free_buffers_in_tb(tb); + buf = kmalloc(size, GFP_NOFS); + if (!buf) { + tb->vn_buf_size = 0; + } + tb->vn_buf = buf; + schedule(); + return REPEAT_SEARCH; + } + + tb->vn_buf = buf; + } + + if (check_fs && FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + + return CARRY_ON; +} + +#ifdef CONFIG_REISERFS_CHECK +static void tb_buffer_sanity_check(struct super_block *sb, + struct buffer_head *bh, + const char *descr, int level) +{ + if (bh) { + if (atomic_read(&(bh->b_count)) <= 0) + + reiserfs_panic(sb, "jmacd-1", "negative or zero " + "reference counter for buffer %s[%d] " + "(%b)", descr, level, bh); + + if (!buffer_uptodate(bh)) + reiserfs_panic(sb, "jmacd-2", "buffer is not up " + "to date %s[%d] (%b)", + descr, level, bh); + + if (!B_IS_IN_TREE(bh)) + reiserfs_panic(sb, "jmacd-3", "buffer is not " + "in tree %s[%d] (%b)", + descr, level, bh); + + if (bh->b_bdev != sb->s_bdev) + reiserfs_panic(sb, "jmacd-4", "buffer has wrong " + "device %s[%d] (%b)", + descr, level, bh); + + if (bh->b_size != sb->s_blocksize) + reiserfs_panic(sb, "jmacd-5", "buffer has wrong " + "blocksize %s[%d] (%b)", + descr, level, bh); + + if (bh->b_blocknr > SB_BLOCK_COUNT(sb)) + reiserfs_panic(sb, "jmacd-6", "buffer block " + "number too high %s[%d] (%b)", + descr, level, bh); + } +} +#else +static void tb_buffer_sanity_check(struct super_block *sb, + struct buffer_head *bh, + const char *descr, int level) +{; +} +#endif + +static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) +{ + return reiserfs_prepare_for_journal(s, bh, 0); +} + +static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) +{ + struct buffer_head *locked; +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + int i; + + do { + + locked = NULL; + + for (i = tb->tb_path->path_length; + !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) { + if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) { + /* if I understand correctly, we can only be sure the last buffer + ** in the path is in the tree --clm + */ +#ifdef CONFIG_REISERFS_CHECK + if (PATH_PLAST_BUFFER(tb->tb_path) == + PATH_OFFSET_PBUFFER(tb->tb_path, i)) + tb_buffer_sanity_check(tb->tb_sb, + PATH_OFFSET_PBUFFER + (tb->tb_path, + i), "S", + tb->tb_path-> + path_length - i); +#endif + if (!clear_all_dirty_bits(tb->tb_sb, + PATH_OFFSET_PBUFFER + (tb->tb_path, + i))) { + locked = + PATH_OFFSET_PBUFFER(tb->tb_path, + i); + } + } + } + + for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i]; + i++) { + + if (tb->lnum[i]) { + + if (tb->L[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->L[i], + "L", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->L[i])) + locked = tb->L[i]; + } + + if (!locked && tb->FL[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->FL[i], + "FL", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FL[i])) + locked = tb->FL[i]; + } + + if (!locked && tb->CFL[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->CFL[i], + "CFL", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->CFL[i])) + locked = tb->CFL[i]; + } + + } + + if (!locked && (tb->rnum[i])) { + + if (tb->R[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->R[i], + "R", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->R[i])) + locked = tb->R[i]; + } + + if (!locked && tb->FR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->FR[i], + "FR", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FR[i])) + locked = tb->FR[i]; + } + + if (!locked && tb->CFR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->CFR[i], + "CFR", i); + if (!clear_all_dirty_bits 
+ (tb->tb_sb, tb->CFR[i])) + locked = tb->CFR[i]; + } + } + } + /* as far as I can tell, this is not required. The FEB list seems + ** to be full of newly allocated nodes, which will never be locked, + ** dirty, or anything else. + ** To be safe, I'm putting in the checks and waits in. For the moment, + ** they are needed to keep the code in journal.c from complaining + ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well. + ** --clm + */ + for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) { + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FEB[i])) + locked = tb->FEB[i]; + } + } + + if (locked) { +#ifdef CONFIG_REISERFS_CHECK + repeat_counter++; + if ((repeat_counter % 10000) == 0) { + reiserfs_warning(tb->tb_sb, "reiserfs-8200", + "too many iterations waiting " + "for buffer to unlock " + "(%b)", locked); + + /* Don't loop forever. Try to recover from possible error. */ + + return (FILESYSTEM_CHANGED_TB(tb)) ? + REPEAT_SEARCH : CARRY_ON; + } +#endif + reiserfs_write_unlock(tb->tb_sb); + __wait_on_buffer(locked); + reiserfs_write_lock(tb->tb_sb); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } + + } while (locked); + + return CARRY_ON; +} + +/* Prepare for balancing, that is + * get all necessary parents, and neighbors; + * analyze what and where should be moved; + * get sufficient number of new nodes; + * Balancing will start only after all resources will be collected at a time. + * + * When ported to SMP kernels, only at the last moment after all needed nodes + * are collected in cache, will the resources be locked using the usual + * textbook ordered lock acquisition algorithms. Note that ensuring that + * this code neither write locks what it does not need to write lock nor locks out of order + * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans + * + * fix is meant in the sense of render unchanging + * + * Latency might be improved by first gathering a list of what buffers are needed + * and then getting as many of them in parallel as possible? -Hans + * + * Parameters: + * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) + * tb tree_balance structure; + * inum item number in S[h]; + * pos_in_item - comment this if you can + * ins_ih item head of item being inserted + * data inserted item or data to be pasted + * Returns: 1 - schedule occurred while the function worked; + * 0 - schedule didn't occur while the function worked; + * -1 - if no_disk_space + */ + +int fix_nodes(int op_mode, struct tree_balance *tb, + struct item_head *ins_ih, const void *data) +{ + int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path); + int pos_in_item; + + /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared + ** during wait_tb_buffers_run + */ + int wait_tb_buffers_run = 0; + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + + ++REISERFS_SB(tb->tb_sb)->s_fix_nodes; + + pos_in_item = tb->tb_path->pos_in_item; + + tb->fs_gen = get_generation(tb->tb_sb); + + /* we prepare and log the super here so it will already be in the + ** transaction when do_balance needs to change it. 
+ ** This way do_balance won't have to schedule when trying to prepare + ** the super for logging + */ + reiserfs_prepare_for_journal(tb->tb_sb, + SB_BUFFER_WITH_SB(tb->tb_sb), 1); + journal_mark_dirty(tb->transaction_handle, tb->tb_sb, + SB_BUFFER_WITH_SB(tb->tb_sb)); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + + /* if it possible in indirect_to_direct conversion */ + if (buffer_locked(tbS0)) { + reiserfs_write_unlock(tb->tb_sb); + __wait_on_buffer(tbS0); + reiserfs_write_lock(tb->tb_sb); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } +#ifdef CONFIG_REISERFS_CHECK + if (REISERFS_SB(tb->tb_sb)->cur_tb) { + print_cur_tb("fix_nodes"); + reiserfs_panic(tb->tb_sb, "PAP-8305", + "there is pending do_balance"); + } + + if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is " + "not uptodate at the beginning of fix_nodes " + "or not in tree (mode %c)", + tbS0, tbS0, op_mode); + + /* Check parameters. */ + switch (op_mode) { + case M_INSERT: + if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect " + "item number %d (in S0 - %d) in case " + "of insert", item_num, + B_NR_ITEMS(tbS0)); + break; + case M_PASTE: + case M_DELETE: + case M_CUT: + if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) { + print_block(tbS0, 0, -1, -1); + reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect " + "item number(%d); mode = %c " + "insert_size = %d", + item_num, op_mode, + tb->insert_size[0]); + } + break; + default: + reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode " + "of operation"); + } +#endif + + if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH) + // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat + return REPEAT_SEARCH; + + /* Starting from the leaf level; for all levels h of the tree. */ + for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) { + ret = get_direct_parent(tb, h); + if (ret != CARRY_ON) + goto repeat; + + ret = check_balance(op_mode, tb, h, item_num, + pos_in_item, ins_ih, data); + if (ret != CARRY_ON) { + if (ret == NO_BALANCING_NEEDED) { + /* No balancing for higher levels needed. */ + ret = get_neighbors(tb, h); + if (ret != CARRY_ON) + goto repeat; + if (h != MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + /* ok, analysis and resource gathering are complete */ + break; + } + goto repeat; + } + + ret = get_neighbors(tb, h); + if (ret != CARRY_ON) + goto repeat; + + /* No disk space, or schedule occurred and analysis may be + * invalid and needs to be redone. */ + ret = get_empty_nodes(tb, h); + if (ret != CARRY_ON) + goto repeat; + + if (!PATH_H_PBUFFER(tb->tb_path, h)) { + /* We have a positive insert size but no nodes exist on this + level, this means that we are creating a new root. */ + + RFALSE(tb->blknum[h] != 1, + "PAP-8350: creating new empty root"); + + if (h < MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) { + if (tb->blknum[h] > 1) { + /* The tree needs to be grown, so this node S[h] + which is the root node is split into two nodes, + and a new node (S[h+1]) will be created to + become the root node. 
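The insert_size[h + 1] bookkeeping computed a few lines below follows directly from the internal-node layout: every extra block that S[h] splits into costs the level above one key plus one disk_child pointer, and a brand-new root additionally needs the pointer to S[h] itself. A sketch of that arithmetic; KEY_SIZE = 16 and DC_SIZE = 8 in the worked example are quoted as the usual on-disk sizes, not taken from this hunk:

/* bytes the level above must absorb when S[h] ends up as 'blknum' blocks;
 * new_root is non-zero when S[h + 1] does not exist yet */
static int bytes_needed_above(int blknum, int new_root,
                              int key_size, int dc_size)
{
        int extra = (key_size + dc_size) * (blknum - 1);

        if (new_root)
                extra += dc_size;       /* pointer to S[h] in the new root */
        return extra;
}

Splitting the old root into two blocks, for instance, asks the new root for (16 + 8) * 1 + 8 = 32 bytes.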
*/ + + RFALSE(h == MAX_HEIGHT - 1, + "PAP-8355: attempt to create too high of a tree"); + + tb->insert_size[h + 1] = + (DC_SIZE + + KEY_SIZE) * (tb->blknum[h] - 1) + + DC_SIZE; + } else if (h < MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + } else + tb->insert_size[h + 1] = + (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1); + } + + ret = wait_tb_buffers_until_unlocked(tb); + if (ret == CARRY_ON) { + if (FILESYSTEM_CHANGED_TB(tb)) { + wait_tb_buffers_run = 1; + ret = REPEAT_SEARCH; + goto repeat; + } else { + return CARRY_ON; + } + } else { + wait_tb_buffers_run = 1; + goto repeat; + } + + repeat: + // fix_nodes was unable to perform its calculation due to + // filesystem got changed under us, lack of free disk space or i/o + // failure. If the first is the case - the search will be + // repeated. For now - free all resources acquired so far except + // for the new allocated nodes + { + int i; + + /* Release path buffers. */ + if (wait_tb_buffers_run) { + pathrelse_and_restore(tb->tb_sb, tb->tb_path); + } else { + pathrelse(tb->tb_path); + } + /* brelse all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + if (wait_tb_buffers_run) { + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> + CFL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> + CFR[i]); + } + + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + + tb->L[i] = NULL; + tb->R[i] = NULL; + tb->FL[i] = NULL; + tb->FR[i] = NULL; + tb->CFL[i] = NULL; + tb->CFR[i] = NULL; + } + + if (wait_tb_buffers_run) { + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) + reiserfs_restore_prepared_buffer + (tb->tb_sb, tb->FEB[i]); + } + } + return ret; + } + +} + +/* Anatoly will probably forgive me renaming tb to tb. I just + wanted to make lines shorter */ +void unfix_nodes(struct tree_balance *tb) +{ + int i; + + /* Release path buffers. 
*/ + pathrelse_and_restore(tb->tb_sb, tb->tb_path); + + /* brelse all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]); + + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + } + + /* deal with list of allocated (used and unused) nodes */ + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) { + b_blocknr_t blocknr = tb->FEB[i]->b_blocknr; + /* de-allocated block which was not used by balancing and + bforget about buffer for it */ + brelse(tb->FEB[i]); + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } + if (tb->used[i]) { + /* release used as new nodes including a new root */ + brelse(tb->used[i]); + } + } + + kfree(tb->vn_buf); + +} diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c new file mode 100644 index 00000000..6471c670 --- /dev/null +++ b/fs/reiserfs/hashes.c @@ -0,0 +1,182 @@ + +/* + * Keyed 32-bit hash function using TEA in a Davis-Meyer function + * H0 = Key + * Hi = E Mi(Hi-1) + Hi-1 + * + * (see Applied Cryptography, 2nd edition, p448). + * + * Jeremy Fitzhardinge 1998 + * + * Jeremy has agreed to the contents of reiserfs/README. -Hans + * Yura's function is added (04/07/2000) + */ + +// +// keyed_hash +// yura_hash +// r5_hash +// + +#include +#include +#include + +#define DELTA 0x9E3779B9 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ +#define PARTROUNDS 6 /* 6 gets complete mixing */ + +/* a, b, c, d - data; h0, h1 - accumulated hash */ +#define TEACORE(rounds) \ + do { \ + u32 sum = 0; \ + int n = rounds; \ + u32 b0, b1; \ + \ + b0 = h0; \ + b1 = h1; \ + \ + do \ + { \ + sum += DELTA; \ + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ + } while(--n); \ + \ + h0 += b0; \ + h1 += b1; \ + } while(0) + +u32 keyed_hash(const signed char *msg, int len) +{ + u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 }; + + u32 h0 = k[0], h1 = k[1]; + u32 a, b, c, d; + u32 pad; + int i; + + // assert(len >= 0 && len < 256); + + pad = (u32) len | ((u32) len << 8); + pad |= pad << 16; + + while (len >= 16) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; + d = (u32) msg[12] | + (u32) msg[13] << 8 | + (u32) msg[14] << 16 | (u32) msg[15] << 24; + + TEACORE(PARTROUNDS); + + len -= 16; + msg += 16; + } + + if (len >= 12) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; + + d = pad; + for (i = 12; i < len; i++) { + d <<= 8; + d |= msg[i]; + } + } else if (len >= 8) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + + c = d = pad; + for (i = 8; i < len; i++) { + c <<= 8; + c |= msg[i]; + } + } else if (len >= 4) { + a 
= (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + + b = c = d = pad; + for (i = 4; i < len; i++) { + b <<= 8; + b |= msg[i]; + } + } else { + a = b = c = d = pad; + for (i = 0; i < len; i++) { + a <<= 8; + a |= msg[i]; + } + } + + TEACORE(FULLROUNDS); + +/* return 0;*/ + return h0 ^ h1; +} + +/* What follows in this file is copyright 2000 by Hans Reiser, and the + * licensing of what follows is governed by reiserfs/README */ + +u32 yura_hash(const signed char *msg, int len) +{ + int j, pow; + u32 a, c; + int i; + + for (pow = 1, i = 1; i < len; i++) + pow = pow * 10; + + if (len == 1) + a = msg[0] - 48; + else + a = (msg[0] - 48) * pow; + + for (i = 1; i < len; i++) { + c = msg[i] - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 40; i++) { + c = '0' - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 256; i++) { + c = i; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + a = a << 7; + return a; +} + +u32 r5_hash(const signed char *msg, int len) +{ + u32 a = 0; + while (*msg) { + a += *msg << 4; + a += *msg >> 4; + a *= 11; + msg++; + } + return a; +} diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c new file mode 100644 index 00000000..2074fd95 --- /dev/null +++ b/fs/reiserfs/ibalance.c @@ -0,0 +1,1089 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include + +/* this is one and only function that is used outside (do_balance.c) */ +int balance_internal(struct tree_balance *, + int, int, struct item_head *, struct buffer_head **); + +/* modes of internal_shift_left, internal_shift_right and internal_insert_childs */ +#define INTERNAL_SHIFT_FROM_S_TO_L 0 +#define INTERNAL_SHIFT_FROM_R_TO_S 1 +#define INTERNAL_SHIFT_FROM_L_TO_S 2 +#define INTERNAL_SHIFT_FROM_S_TO_R 3 +#define INTERNAL_INSERT_TO_S 4 +#define INTERNAL_INSERT_TO_L 5 +#define INTERNAL_INSERT_TO_R 6 + +static void internal_define_dest_src_infos(int shift_mode, + struct tree_balance *tb, + int h, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *d_key, struct buffer_head **cf) +{ + memset(dest_bi, 0, sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + case INTERNAL_SHIFT_FROM_L_TO_S: + src_bi->tb = tb; + src_bi->bi_bh = tb->L[h]; + src_bi->bi_parent = tb->FL[h]; + src_bi->bi_position = get_left_neighbor_position(tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */ + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + + case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[h]; + src_bi->bi_parent = tb->FR[h]; + src_bi->bi_position = get_right_neighbor_position(tb, 
h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_SHIFT_FROM_S_TO_R: + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_INSERT_TO_L: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + break; + + case INTERNAL_INSERT_TO_S: + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + break; + + case INTERNAL_INSERT_TO_R: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + break; + + default: + reiserfs_panic(tb->tb_sb, "ibalance-1", + "shift type is unknown (%d)", + shift_mode); + } +} + +/* Insert count node pointers into buffer cur before position to + 1. + * Insert count items into buffer cur before position to. + * Items and node pointers are specified by inserted and bh respectively. + */ +static void internal_insert_childs(struct buffer_info *cur_bi, + int to, int count, + struct item_head *inserted, + struct buffer_head **bh) +{ + struct buffer_head *cur = cur_bi->bi_bh; + struct block_head *blkh; + int nr; + struct reiserfs_key *ih; + struct disk_child new_dc[2]; + struct disk_child *dc; + int i; + + if (count <= 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + RFALSE(count > 2, "too many children (%d) are to be inserted", count); + RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE), + "no enough free space (%d), needed %d bytes", + B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE)); + + /* prepare space for count disk_child */ + dc = B_N_CHILD(cur, to + 1); + + memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE); + + /* copy to_be_insert disk children */ + for (i = 0; i < count; i++) { + put_dc_size(&(new_dc[i]), + MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); + put_dc_block_number(&(new_dc[i]), bh[i]->b_blocknr); + } + memcpy(dc, new_dc, DC_SIZE * count); + + /* prepare space for count items */ + ih = B_N_PDELIM_KEY(cur, ((to == -1) ? 
0 : to)); + + memmove(ih + count, ih, + (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); + + /* copy item headers (keys) */ + memcpy(ih, inserted, KEY_SIZE); + if (count > 1) + memcpy(ih + 1, inserted + 1, KEY_SIZE); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - count * (DC_SIZE + + KEY_SIZE)); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE))); + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } + +} + +/* Delete del_num items and node pointers from buffer cur starting from * + * the first_i'th item and first_p'th pointers respectively. */ +static void internal_delete_pointers_items(struct buffer_info *cur_bi, + int first_p, + int first_i, int del_num) +{ + struct buffer_head *cur = cur_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + RFALSE(cur == NULL, "buffer is 0"); + RFALSE(del_num < 0, + "negative number of items (%d) can not be deleted", del_num); + RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1 + || first_i < 0, + "first pointer order (%d) < 0 or " + "no so many pointers (%d), only (%d) or " + "first key order %d < 0", first_p, first_p + del_num, + B_NR_ITEMS(cur) + 1, first_i); + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + if (first_p == 0 && del_num == nr + 1) { + RFALSE(first_i != 0, + "1st deleted key must have order 0, not %d", first_i); + make_empty_node(cur_bi); + return; + } + + RFALSE(first_i + del_num > B_NR_ITEMS(cur), + "first_i = %d del_num = %d " + "no so many keys (%d) in the node (%b)(%z)", + first_i, del_num, first_i + del_num, cur, cur); + + /* deleting */ + dc = B_N_CHILD(cur, first_p); + + memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); + key = B_N_PDELIM_KEY(cur, first_i); + memmove(key, key + del_num, + (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - + del_num) * DC_SIZE); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + + (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } +} + +/* delete n node pointers and items starting from given position */ +static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n) +{ + int i_from; + + i_from = (from == 0) ? 
from : from - 1; + + /* delete n pointers starting from `from' position in CUR; + delete n keys starting from 'i_from' position in CUR; + */ + internal_delete_pointers_items(cur_bi, from, i_from, n); +} + +/* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest +* last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest + */ +static void internal_copy_pointers_items(struct buffer_info *dest_bi, + struct buffer_head *src, + int last_first, int cpy_num) +{ + /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST * + * as delimiting key have already inserted to buffer dest.*/ + struct buffer_head *dest = dest_bi->bi_bh; + int nr_dest, nr_src; + int dest_order, src_order; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + nr_src = B_NR_ITEMS(src); + + RFALSE(dest == NULL || src == NULL, + "src (%p) or dest (%p) buffer is 0", src, dest); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "invalid last_first parameter (%d)", last_first); + RFALSE(nr_src < cpy_num - 1, + "no so many items (%d) in src (%d)", cpy_num, nr_src); + RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num); + RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest), + "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)", + cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest)); + + if (cpy_num == 0) + return; + + /* coping */ + blkh = B_BLK_HEAD(dest); + nr_dest = blkh_nr_item(blkh); + + /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */ + /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0; */ + (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = + nr_src - cpy_num + 1) : (dest_order = + nr_dest, + src_order = + 0); + + /* prepare space for cpy_num pointers */ + dc = B_N_CHILD(dest, dest_order); + + memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE); + + /* insert pointers */ + memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num); + + /* prepare space for cpy_num - 1 item headers */ + key = B_N_PDELIM_KEY(dest, dest_order); + memmove(key + cpy_num - 1, key, + KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + + cpy_num)); + + /* insert headers */ + memcpy(key, B_N_PDELIM_KEY(src, src_order), KEY_SIZE * (cpy_num - 1)); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1)); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } + +} + +/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest. + * Delete cpy_num - del_par items and node pointers from buffer src. + * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. + * last_first == LAST_TO_FIRST means, that we copy/delete last items from src. 
+ */ +static void internal_move_pointers_items(struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int last_first, int cpy_num, + int del_par) +{ + int first_pointer; + int first_item; + + internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first, + cpy_num); + + if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ + first_pointer = 0; + first_item = 0; + /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, + for key - with first_item */ + internal_delete_pointers_items(src_bi, first_pointer, + first_item, cpy_num - del_par); + } else { /* shift_right occurs */ + int i, j; + + i = (cpy_num - del_par == + (j = + B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num + + del_par; + + internal_delete_pointers_items(src_bi, + j + 1 - cpy_num + del_par, i, + cpy_num - del_par); + } +} + +/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */ +static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_before, /* insert key before key with n_dest number */ + struct buffer_head *src, int src_position) +{ + struct buffer_head *dest = dest_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + + RFALSE(dest == NULL || src == NULL, + "source(%p) or dest(%p) buffer is 0", src, dest); + RFALSE(dest_position_before < 0 || src_position < 0, + "source(%d) or dest(%d) key number less than 0", + src_position, dest_position_before); + RFALSE(dest_position_before > B_NR_ITEMS(dest) || + src_position >= B_NR_ITEMS(src), + "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))", + dest_position_before, B_NR_ITEMS(dest), + src_position, B_NR_ITEMS(src)); + RFALSE(B_FREE_SPACE(dest) < KEY_SIZE, + "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest)); + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item(blkh); + + /* prepare space for inserting key */ + key = B_N_PDELIM_KEY(dest, dest_position_before); + memmove(key + 1, key, + (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); + + /* insert key */ + memcpy(key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE); + + /* Change dirt, free space, item number fields. */ + + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + } +} + +/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. + * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest. + * Replace d_key'th key in buffer cfl. + * Delete pointer_amount items and node pointers from buffer src. 
+ */ +/* this can be invoked both to shift from S to L and from R to S */ +static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */ + struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + /*printk("pointer_amount = %d\n",pointer_amount); */ + + if (pointer_amount) { + /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */ + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + + if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) { + if (src_bi.bi_position /*src->b_item_order */ == 0) + replace_key(tb, cf, d_key_position, + src_bi. + bi_parent /*src->b_parent */ , 0); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + pointer_amount - 1); + } + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 0); + +} + +/* Insert delimiting key to L[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + */ +/* it always shifts from S[h] to L[h] */ +static void internal_shift1_left(struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); + + if (pointer_amount > 0) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]); */ + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 1); + /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */ +} + +/* Insert d_key'th (delimiting) key from buffer cfr to head of dest. + * Copy n node pointers and n - 1 items from buffer src to buffer dest. + * Replace d_key'th key in buffer cfr. + * Delete n items and node pointers from buffer src. + */ +static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */ + struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + int nr; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + nr = B_NR_ITEMS(src_bi.bi_bh); + + if (pointer_amount > 0) { + /* insert delimiting key from common father of dest and src to dest node into position 0 */ + internal_insert_key(&dest_bi, 0, cf, d_key_position); + if (nr == pointer_amount - 1) { + RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ || + dest_bi.bi_bh != tb->R[h], + "src (%p) must be == tb->S[h](%p) when it disappears", + src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h)); + /* when S[h] disappers replace left delemiting key as well */ + if (tb->CFL[h]) + replace_key(tb, cf, d_key_position, tb->CFL[h], + tb->lkey[h]); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + nr - pointer_amount); + } + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 0); +} + +/* Insert delimiting key to R[h]. 
+ * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + */ +/* it always shift from S[h] to R[h] */ +static void internal_shift1_right(struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); + + if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */ + internal_insert_key(&dest_bi, 0, cf, d_key_position); + /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]); */ + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 1); + /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1); */ +} + +/* Delete insert_num node pointers together with their left items + * and balance current node.*/ +static void balance_internal_when_delete(struct tree_balance *tb, + int h, int child_pos) +{ + int insert_num; + int n; + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + + insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); + + /* delete child-node-pointer(s) together with their left item(s) */ + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + internal_delete_childs(&bi, child_pos, -insert_num); + + RFALSE(tb->blknum[h] > 1, + "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]); + + n = B_NR_ITEMS(tbSh); + + if (tb->lnum[h] == 0 && tb->rnum[h] == 0) { + if (tb->blknum[h] == 0) { + /* node S[h] (root of the tree) is empty now */ + struct buffer_head *new_root; + + RFALSE(n + || B_FREE_SPACE(tbSh) != + MAX_CHILD_SIZE(tbSh) - DC_SIZE, + "buffer must have only 0 keys (%d)", n); + RFALSE(bi.bi_parent, "root has parent (%p)", + bi.bi_parent); + + /* choose a new root */ + if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1])) + new_root = tb->R[h - 1]; + else + new_root = tb->L[h - 1]; + /* switch super block's tree root block number to the new value */ + PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr); + //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; + PUT_SB_TREE_HEIGHT(tb->tb_sb, + SB_TREE_HEIGHT(tb->tb_sb) - 1); + + do_balance_mark_sb_dirty(tb, + REISERFS_SB(tb->tb_sb)->s_sbh, + 1); + /*&&&&&&&&&&&&&&&&&&&&&& */ + if (h > 1) + /* use check_internal if new root is an internal node */ + check_internal(new_root); + /*&&&&&&&&&&&&&&&&&&&&&& */ + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + return; + } + + if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { /* join S[h] with L[h] */ + + RFALSE(tb->rnum[h] != 0, + "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", + h, tb->rnum[h]); + + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + + if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { /* join S[h] with R[h] */ + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", + h, tb->lnum[h]); + + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); + + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + + if (tb->lnum[h] < 0) { /* borrow from left neighbor L[h] */ + RFALSE(tb->rnum[h] != 0, + "wrong tb->rnum[%d]==%d when borrow from L[h]", h, + tb->rnum[h]); + /*internal_shift_right 
(tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h, + -tb->lnum[h]); + return; + } + + if (tb->rnum[h] < 0) { /* borrow from right neighbor R[h] */ + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when borrow from R[h]", + h, tb->lnum[h]); + internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]); /*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */ + return; + } + + if (tb->lnum[h] > 0) { /* split S[h] into two parts and put them into neighbors */ + RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, + "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", + h, tb->lnum[h], h, tb->rnum[h], n); + + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); /*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + reiserfs_panic(tb->tb_sb, "ibalance-2", + "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", + h, tb->lnum[h], h, tb->rnum[h]); +} + +/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/ +static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key) +{ + RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL, + "L[h](%p) and CFL[h](%p) must exist in replace_lkey", + tb->L[h], tb->CFL[h]); + + if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) + return; + + memcpy(B_N_PDELIM_KEY(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty(tb, tb->CFL[h], 0); +} + +/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/ +static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key) +{ + RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL, + "R[h](%p) and CFR[h](%p) must exist in replace_rkey", + tb->R[h], tb->CFR[h]); + RFALSE(B_NR_ITEMS(tb->R[h]) == 0, + "R[h] can not be empty if it exists (item number=%d)", + B_NR_ITEMS(tb->R[h])); + + memcpy(B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty(tb, tb->CFR[h], 0); +} + +int balance_internal(struct tree_balance *tb, /* tree_balance structure */ + int h, /* level of the tree */ + int child_pos, struct item_head *insert_key, /* key for insertion on higher level */ + struct buffer_head **insert_ptr /* node for insertion on higher level */ + ) + /* if inserting/pasting + { + child_pos is the position of the node-pointer in S[h] that * + pointed to S[h-1] before balancing of the h-1 level; * + this means that new pointers and items must be inserted AFTER * + child_pos + } + else + { + it is the position of the leftmost pointer that must be deleted (together with + its corresponding key to the left of the pointer) + as a result of the previous level's balancing. + } + */ +{ + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */ + int insert_num, n, k; + struct buffer_head *S_new; + struct item_head new_insert_key; + struct buffer_head *new_insert_ptr = NULL; + struct item_head *new_insert_key_addr = insert_key; + + RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h); + + PROC_INFO_INC(tb->tb_sb, balance_at[h]); + + order = + (tbSh) ? 
PATH_H_POSITION(tb->tb_path, + h + 1) /*tb->S[h]->b_item_order */ : 0; + + /* Using insert_size[h] calculate the number insert_num of items + that must be inserted to or deleted from S[h]. */ + insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE)); + + /* Check whether insert_num is proper * */ + RFALSE(insert_num < -2 || insert_num > 2, + "incorrect number of items inserted to the internal node (%d)", + insert_num); + RFALSE(h > 1 && (insert_num > 1 || insert_num < -1), + "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", + insert_num, h); + + /* Make balance in case insert_num < 0 */ + if (insert_num < 0) { + balance_internal_when_delete(tb, h, child_pos); + return order; + } + + k = 0; + if (tb->lnum[h] > 0) { + /* shift lnum[h] items from S[h] to the left neighbor L[h]. + check how many of new items fall into L[h] or CFL[h] after + shifting */ + n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */ + if (tb->lnum[h] <= child_pos) { + /* new items don't fall into L[h] or CFL[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h]); + /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]); */ + child_pos -= tb->lnum[h]; + } else if (tb->lnum[h] > child_pos + insert_num) { + /* all new items fall into L[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h] - insert_num); + /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh, + tb->lnum[h]-insert_num); + */ + /* insert insert_num keys and node-pointers into L[h] */ + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next */ + n + child_pos + 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* some items fall into L[h] or CFL[h], but some don't fall */ + internal_shift1_left(tb, h, child_pos + 1); + /* calculate number of new items that fall into L[h] */ + k = tb->lnum[h] - child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next, */ + n + child_pos + 1, k, + insert_key, insert_ptr); + + replace_lkey(tb, h, insert_key + k); + + /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */ + dc = B_N_CHILD(tbSh, 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr[k]) - + B_FREE_SPACE(insert_ptr[k])); + put_dc_block_number(dc, insert_ptr[k]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tbSh, 0); + + k++; + insert_key += k; + insert_ptr += k; + insert_num -= k; + child_pos = 0; + } + } + /* tb->lnum[h] > 0 */ + if (tb->rnum[h] > 0) { + /*shift rnum[h] items from S[h] to the right neighbor R[h] */ + /* check how many of new items fall into R or CFR after shifting */ + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + if (n - tb->rnum[h] >= child_pos) + /* new items fall into S[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + else if (n + insert_num - tb->rnum[h] < child_pos) { + /* all new items fall into R[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h], + tb->rnum[h] - insert_num); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h] - insert_num); + + /* insert insert_num keys and node-pointers into R[h] */ + 
bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h],tb->S[h-1]->b_next */ + child_pos - n - insert_num + + tb->rnum[h] - 1, + insert_num, insert_key, + insert_ptr); + insert_num = 0; + } else { + struct disk_child *dc; + + /* one of the items falls into CFR[h] */ + internal_shift1_right(tb, h, n - child_pos + 1); + /* calculate number of new items that fall into R[h] */ + k = tb->rnum[h] - n + child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h], tb->R[h]->b_child, */ + 0, k, insert_key + 1, + insert_ptr + 1); + + replace_rkey(tb, h, insert_key + insert_num - k - 1); + + /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1] */ + dc = B_N_CHILD(tb->R[h], 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr + [insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1])); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tb->R[h], 0); + + insert_num -= (k + 1); + } + } + + /** Fill new node that appears instead of S[h] **/ + RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); + RFALSE(tb->blknum[h] < 0, "blknum can not be < 0"); + + if (!tb->blknum[h]) { /* node S[h] is empty now */ + RFALSE(!tbSh, "S[h] is equal NULL"); + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return order; + } + + if (!tbSh) { + /* create new root */ + struct disk_child *dc; + struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1); + struct block_head *blkh; + + if (tb->blknum[h] != 1) + reiserfs_panic(NULL, "ibalance-3", "One new node " + "required for creating the new root"); + /* S[h] = empty buffer from the list FEB. */ + tbSh = get_FEB(tb); + blkh = B_BLK_HEAD(tbSh); + set_blkh_level(blkh, h + 1); + + /* Put the unique node-pointer to S[h] that points to S[h-1]. */ + + dc = B_N_CHILD(tbSh, 0); + put_dc_block_number(dc, tbSh_1->b_blocknr); + put_dc_size(dc, + (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1))); + + tb->insert_size[h] -= DC_SIZE; + set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE); + + do_balance_mark_internal_dirty(tb, tbSh, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(tbSh); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + /* put new root into path structure */ + PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = + tbSh; + + /* Change root in structure super block. 
*/ + PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr); + PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1); + do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); + } + + if (tb->blknum[h] == 2) { + int snum; + struct buffer_info dest_bi, src_bi; + + /* S_new = free buffer from list FEB */ + S_new = get_FEB(tb); + + set_blkh_level(B_BLK_HEAD(S_new), h + 1); + + dest_bi.tb = tb; + dest_bi.bi_bh = S_new; + dest_bi.bi_parent = NULL; + dest_bi.bi_position = 0; + src_bi.tb = tb; + src_bi.bi_bh = tbSh; + src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + snum = (insert_num + n + 1) / 2; + if (n - snum >= child_pos) { + /* new items don't fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n - snum)'th key in S[h] */ + memcpy(&new_insert_key, B_N_PDELIM_KEY(tbSh, n - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, snum, 0); + /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0); */ + } else if (n + insert_num - snum < child_pos) { + /* all new items fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n + insert_item - snum)'th key in S[h] */ + memcpy(&new_insert_key, + B_N_PDELIM_KEY(tbSh, n + insert_num - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + snum - insert_num, 0); + /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0); */ + + /* insert insert_num keys and node-pointers into S_new */ + internal_insert_childs(&dest_bi, + /*S_new,tb->S[h-1]->b_next, */ + child_pos - n - insert_num + + snum - 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* some items fall into S_new, but some don't fall */ + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + n - child_pos + 1, 1); + /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1); */ + /* calculate number of new items that fall into S_new */ + k = snum - n + child_pos - 1; + + internal_insert_childs(&dest_bi, /*S_new, */ 0, k, + insert_key + 1, insert_ptr + 1); + + /* new_insert_key = insert_key[insert_num - k - 1] */ + memcpy(&new_insert_key, insert_key + insert_num - k - 1, + KEY_SIZE); + /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */ + + dc = B_N_CHILD(S_new, 0); + put_dc_size(dc, + (MAX_CHILD_SIZE + (insert_ptr[insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1]))); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, S_new, 0); + + insert_num -= (k + 1); + } + /* new_insert_ptr = node_pointer to S_new */ + new_insert_ptr = S_new; + + RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new) + || buffer_dirty(S_new), "cm-00001: bad S_new (%b)", + S_new); + + // S_new is released in unfix_nodes + } + + n = B_NR_ITEMS(tbSh); /*number of items in S[h] */ + + if (0 <= child_pos && child_pos <= n && insert_num > 0) { + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + internal_insert_childs(&bi, /*tbSh, */ + /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? 
tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */ + child_pos, insert_num, insert_key, + insert_ptr); + } + + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); + insert_ptr[0] = new_insert_ptr; + + return order; +} diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c new file mode 100644 index 00000000..4fd5bb33 --- /dev/null +++ b/fs/reiserfs/inode.c @@ -0,0 +1,3225 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); + +void reiserfs_evict_inode(struct inode *inode) +{ + /* We need blocks for transaction + (user+group) quota update (possibly delete) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; + int depth; + int err; + + if (!inode->i_nlink && !is_bad_inode(inode)) + dquot_initialize(inode); + + truncate_inode_pages(&inode->i_data, 0); + if (inode->i_nlink) + goto no_delete; + + depth = reiserfs_write_lock_once(inode->i_sb); + + /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + reiserfs_delete_xattrs(inode); + + if (journal_begin(&th, inode->i_sb, jbegin_count)) + goto out; + reiserfs_update_inode_transaction(inode); + + reiserfs_discard_prealloc(&th, inode); + + err = reiserfs_delete_object(&th, inode); + + /* Do quota update inside a transaction for journaled quotas. We must do that + * after delete_object so that quota updates go into the same transaction as + * stat data deletion */ + if (!err) + dquot_free_inode(inode); + + if (journal_end(&th, inode->i_sb, jbegin_count)) + goto out; + + /* check return value from reiserfs_delete_object after + * ending the transaction + */ + if (err) + goto out; + + /* all items of file are deleted, so we can remove "save" link */ + remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything + * about an error here */ + } else { + /* no object items are in the tree */ + ; + } + out: + end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */ + dquot_drop(inode); + inode->i_blocks = 0; + reiserfs_write_unlock_once(inode->i_sb, depth); + return; + +no_delete: + end_writeback(inode); + dquot_drop(inode); +} + +static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, + __u32 objectid, loff_t offset, int type, int length) +{ + key->version = version; + + key->on_disk_key.k_dir_id = dirid; + key->on_disk_key.k_objectid = objectid; + set_cpu_key_k_offset(key, offset); + set_cpu_key_k_type(key, type); + key->key_length = length; +} + +/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set + offset and type of key */ +void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, + int type, int length) +{ + _make_cpu_key(key, get_inode_item_key_version(inode), + le32_to_cpu(INODE_PKEY(inode)->k_dir_id), + le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, + length); +} + +// +// when key is 0, do not set version and short key +// +inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, + int version, + loff_t offset, int type, int length, + int entry_count /*or 
ih_free_space */ ) +{ + if (key) { + ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); + ih->ih_key.k_objectid = + cpu_to_le32(key->on_disk_key.k_objectid); + } + put_ih_version(ih, version); + set_le_ih_k_offset(ih, offset); + set_le_ih_k_type(ih, type); + put_ih_item_len(ih, length); + /* set_ih_free_space (ih, 0); */ + // for directory items it is entry count, for directs and stat + // datas - 0xffff, for indirects - 0 + put_ih_entry_count(ih, entry_count); +} + +// +// FIXME: we might cache recently accessed indirect item + +// Ugh. Not too eager for that.... +// I cut the code until such time as I see a convincing argument (benchmark). +// I don't want a bloated inode struct..., and I don't like code complexity.... + +/* cutting the code is fine, since it really isn't in use yet and is easy +** to add back in. But, Vladimir has a really good idea here. Think +** about what happens for reading a file. For each page, +** The VFS layer calls reiserfs_readpage, who searches the tree to find +** an indirect item. This indirect item has X number of pointers, where +** X is a big number if we've done the block allocation right. But, +** we only use one or two of these pointers during each call to readpage, +** needlessly researching again later on. +** +** The size of the cache could be dynamic based on the size of the file. +** +** I'd also like to see us cache the location the stat data item, since +** we are needlessly researching for that frequently. +** +** --chris +*/ + +/* If this page has a file tail in it, and +** it was read in by get_block_create_0, the page data is valid, +** but tail is still sitting in a direct item, and we can't write to +** it. So, look through this page, and check all the mapped buffers +** to make sure they have valid block numbers. Any that don't need +** to be unmapped, so that __block_write_begin will correctly call +** reiserfs_get_block to convert the tail into an unformatted node +*/ +static inline void fix_tail_page_for_writing(struct page *page) +{ + struct buffer_head *head, *next, *bh; + + if (page && page_has_buffers(page)) { + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + if (buffer_mapped(bh) && bh->b_blocknr == 0) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + } +} + +/* reiserfs_get_block does not need to allocate a block only if it has been + done already or non-hole position has been found in the indirect item */ +static inline int allocation_needed(int retval, b_blocknr_t allocated, + struct item_head *ih, + __le32 * item, int pos_in_item) +{ + if (allocated) + return 0; + if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && + get_block_num(item, pos_in_item)) + return 0; + return 1; +} + +static inline int indirect_item_found(int retval, struct item_head *ih) +{ + return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); +} + +static inline void set_block_dev_mapped(struct buffer_head *bh, + b_blocknr_t block, struct inode *inode) +{ + map_bh(bh, inode->i_sb, block); +} + +// +// files which were created in the earlier version can not be longer, +// than 2 gb +// +static int file_capable(struct inode *inode, sector_t block) +{ + if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file. 
+ block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb + return 1; + + return 0; +} + +static int restart_transaction(struct reiserfs_transaction_handle *th, + struct inode *inode, struct treepath *path) +{ + struct super_block *s = th->t_super; + int len = th->t_blocks_allocated; + int err; + + BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_refcount); + + pathrelse(path); + + /* we cannot restart while nested */ + if (th->t_refcount > 1) { + return 0; + } + reiserfs_update_sd(th, inode); + err = journal_end(th, s, len); + if (!err) { + err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); + if (!err) + reiserfs_update_inode_transaction(inode); + } + return err; +} + +// it is called by get_block when create == 0. Returns block number +// for 'block'-th logical block of file. When it hits direct item it +// returns 0 (being called from bmap) or read direct item into piece +// of page (bh_result) + +// Please improve the english/clarity in the comment above, as it is +// hard to understand. + +static int _get_block_create_0(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int args) +{ + INITIALIZE_PATH(path); + struct cpu_key key; + struct buffer_head *bh; + struct item_head *ih, tmp_ih; + b_blocknr_t blocknr; + char *p = NULL; + int chars; + int ret; + int result; + int done = 0; + unsigned long offset; + + // prepare the key to look for the 'block'-th block of file + make_cpu_key(&key, inode, + (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, + 3); + + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) { + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + if (result == IO_ERROR) + return -EIO; + // We do not return -ENOENT if there is a hole but page is uptodate, because it means + // That there is some MMAPED data associated with it that is yet to be written to disk. + if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + return -ENOENT; + } + return 0; + } + // + bh = get_last_bh(&path); + ih = get_ih(&path); + if (is_indirect_le_ih(ih)) { + __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih); + + /* FIXME: here we could cache indirect item or part of it in + the inode to avoid search_by_key in case of subsequent + access to file */ + blocknr = get_block_num(ind_item, path.pos_in_item); + ret = 0; + if (blocknr) { + map_bh(bh_result, inode->i_sb, blocknr); + if (path.pos_in_item == + ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { + set_buffer_boundary(bh_result); + } + } else + // We do not return -ENOENT if there is a hole but page is uptodate, because it means + // That there is some MMAPED data associated with it that is yet to be written to disk. + if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + ret = -ENOENT; + } + + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return ret; + } + // requested data are in direct item(s) + if (!(args & GET_BLOCK_READ_DIRECT)) { + // we are called by bmap. FIXME: we can not map block of file + // when it is stored in direct item(s) + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return -ENOENT; + } + + /* if we've got a direct item, and the buffer or page was uptodate, + ** we don't want to pull data off disk again. skip to the + ** end, where we map the buffer and return + */ + if (buffer_uptodate(bh_result)) { + goto finished; + } else + /* + ** grab_tail_page can trigger calls to reiserfs_get_block on up to date + ** pages without any buffers. 
If the page is up to date, we don't want + ** read old data off disk. Set the up to date bit on the buffer instead + ** and jump to the end + */ + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { + set_buffer_uptodate(bh_result); + goto finished; + } + // read file tail into part of page + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); + copy_item_head(&tmp_ih, ih); + + /* we only want to kmap if we are reading the tail into the page. + ** this is not the common case, so we don't kmap until we are + ** sure we need to. But, this means the item might move if + ** kmap schedules + */ + if (!p) + p = (char *)kmap(bh_result->b_page); + + p += offset; + memset(p, 0, inode->i_sb->s_blocksize); + do { + if (!is_direct_le_ih(ih)) { + BUG(); + } + /* make sure we don't read more bytes than actually exist in + ** the file. This can happen in odd cases where i_size isn't + ** correct, and when direct item padding results in a few + ** extra bytes at the end of the direct item + */ + if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) + break; + if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { + chars = + inode->i_size - (le_ih_k_offset(ih) - 1) - + path.pos_in_item; + done = 1; + } else { + chars = ih_item_len(ih) - path.pos_in_item; + } + memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars); + + if (done) + break; + + p += chars; + + if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) + // we done, if read direct item is not the last item of + // node FIXME: we could try to check right delimiting key + // to see whether direct item continues in the right + // neighbor or rely on i_size + break; + + // update key to look for the next piece + set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) + // i/o error most likely + break; + bh = get_last_bh(&path); + ih = get_ih(&path); + } while (1); + + flush_dcache_page(bh_result->b_page); + kunmap(bh_result->b_page); + + finished: + pathrelse(&path); + + if (result == IO_ERROR) + return -EIO; + + /* this buffer has valid data, but isn't valid for io. mapping it to + * block #0 tells the rest of reiserfs it just has a tail in it + */ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_uptodate(bh_result); + return 0; +} + +// this is called to create file map. So, _get_block_create_0 will not +// read direct item +static int reiserfs_bmap(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + if (!file_capable(inode, block)) + return -EFBIG; + + reiserfs_write_lock(inode->i_sb); + /* do not read the direct item */ + _get_block_create_0(inode, block, bh_result, 0); + reiserfs_write_unlock(inode->i_sb); + return 0; +} + +/* special version of get_block that is only used by grab_tail_page right +** now. It is sent to __block_write_begin, and when you try to get a +** block past the end of the file (or a block from a hole) it returns +** -ENOENT instead of a valid buffer. __block_write_begin expects to +** be able to do i/o on the buffers returned, unless an error value +** is also returned. +** +** So, this allows __block_write_begin to be used for reading a single block +** in a page. Where it does not produce a valid page for holes, or past the +** end of the file. This turns out to be exactly what we need for reading +** tails for conversion. 
+** +** The point of the wrapper is forcing a certain value for create, even +** though the VFS layer is calling this function with create==1. If you +** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, +** don't use this function. +*/ +static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, + struct buffer_head *bh_result, + int create) +{ + return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); +} + +/* This is special helper for reiserfs_get_block in case we are executing + direct_IO request. */ +static int reiserfs_get_blocks_direct_io(struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + int ret; + + bh_result->b_page = NULL; + + /* We set the b_size before reiserfs_get_block call since it is + referenced in convert_tail_for_hole() that may be called from + reiserfs_get_block() */ + bh_result->b_size = (1 << inode->i_blkbits); + + ret = reiserfs_get_block(inode, iblock, bh_result, + create | GET_BLOCK_NO_DANGLE); + if (ret) + goto out; + + /* don't allow direct io onto tail pages */ + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* make sure future calls to the direct io funcs for this offset + ** in the file fail by unmapping the buffer + */ + clear_buffer_mapped(bh_result); + ret = -EINVAL; + } + /* Possible unpacked tail. Flush the data before pages have + disappeared */ + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { + int err; + + reiserfs_write_lock(inode->i_sb); + + err = reiserfs_commit_for_inode(inode); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + reiserfs_write_unlock(inode->i_sb); + + if (err < 0) + ret = err; + } + out: + return ret; +} + +/* +** helper function for when reiserfs_get_block is called for a hole +** but the file tail is still in a direct item +** bh_result is the buffer head for the hole +** tail_offset is the offset of the start of the tail in the file +** +** This calls prepare_write, which will start a new transaction +** you should not be in a transaction, or have any paths held when you +** call this. +*/ +static int convert_tail_for_hole(struct inode *inode, + struct buffer_head *bh_result, + loff_t tail_offset) +{ + unsigned long index; + unsigned long tail_end; + unsigned long tail_start; + struct page *tail_page; + struct page *hole_page = bh_result->b_page; + int retval = 0; + + if ((tail_offset & (bh_result->b_size - 1)) != 1) + return -EIO; + + /* always try to read until the end of the block */ + tail_start = tail_offset & (PAGE_CACHE_SIZE - 1); + tail_end = (tail_start | (bh_result->b_size - 1)) + 1; + + index = tail_offset >> PAGE_CACHE_SHIFT; + /* hole_page can be zero in case of direct_io, we are sure + that we cannot get here if we write with O_DIRECT into + tail page */ + if (!hole_page || index != hole_page->index) { + tail_page = grab_cache_page(inode->i_mapping, index); + retval = -ENOMEM; + if (!tail_page) { + goto out; + } + } else { + tail_page = hole_page; + } + + /* we don't have to make sure the conversion did not happen while + ** we were locking the page because anyone that could convert + ** must first take i_mutex. + ** + ** We must fix the tail page for writing because it might have buffers + ** that are mapped, but have a block number of 0. This indicates tail + ** data that has been read directly into the page, and + ** __block_write_begin won't trigger a get_block in this case. 
+ */ + fix_tail_page_for_writing(tail_page); + retval = __reiserfs_write_begin(tail_page, tail_start, + tail_end - tail_start); + if (retval) + goto unlock; + + /* tail conversion might change the data in the page */ + flush_dcache_page(tail_page); + + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); + + unlock: + if (tail_page != hole_page) { + unlock_page(tail_page); + page_cache_release(tail_page); + } + out: + return retval; +} + +static inline int _allocate_block(struct reiserfs_transaction_handle *th, + sector_t block, + struct inode *inode, + b_blocknr_t * allocated_block_nr, + struct treepath *path, int flags) +{ + BUG_ON(!th->t_trans_id); + +#ifdef REISERFS_PREALLOCATE + if (!(flags & GET_BLOCK_NO_IMUX)) { + return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, + path, block); + } +#endif + return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, + block); +} + +int reiserfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int repeat, retval = 0; + b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int + INITIALIZE_PATH(path); + int pos_in_item; + struct cpu_key key; + struct buffer_head *bh, *unbh = NULL; + struct item_head *ih, tmp_ih; + __le32 *item; + int done; + int fs_gen; + int lock_depth; + struct reiserfs_transaction_handle *th = NULL; + /* space reserved in transaction batch: + . 3 balancings in direct->indirect conversion + . 1 block involved into reiserfs_update_sd() + XXX in practically impossible worst case direct2indirect() + can incur (much) more than 3 balancings. + quota update for user, group */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 1 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + int version; + int dangle = 1; + loff_t new_offset = + (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; + + lock_depth = reiserfs_write_lock_once(inode->i_sb); + version = get_inode_item_key_version(inode); + + if (!file_capable(inode, block)) { + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + return -EFBIG; + } + + /* if !create, we aren't changing the FS, so we don't need to + ** log anything, so we don't need to start a transaction + */ + if (!(create & GET_BLOCK_CREATE)) { + int ret; + /* find number of block-th logical block of the file */ + ret = _get_block_create_0(inode, block, bh_result, + create | GET_BLOCK_READ_DIRECT); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + return ret; + } + /* + * if we're already in a transaction, make sure to close + * any new transactions we start in this func + */ + if ((create & GET_BLOCK_NO_DANGLE) || + reiserfs_transaction_running(inode->i_sb)) + dangle = 0; + + /* If file is of such a size, that it might have a tail and tails are enabled + ** we should mark it as possibly needing tail packing on close + */ + if ((have_large_tails(inode->i_sb) + && inode->i_size < i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size < i_block_size(inode))) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; + + /* set the key of the first byte in the 'block'-th block of file */ + make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); + if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { + start_trans: + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); + if (!th) { + retval = -ENOMEM; + goto failure; + } + reiserfs_update_inode_transaction(inode); + } + research: + + retval = search_for_position_by_key(inode->i_sb, &key, &path); 
+ if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + pos_in_item = path.pos_in_item; + + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (allocation_needed + (retval, allocated_block_nr, ih, item, pos_in_item)) { + /* we have to allocate block for the unformatted node */ + if (!th) { + pathrelse(&path); + goto start_trans; + } + + repeat = + _allocate_block(th, block, inode, &allocated_block_nr, + &path, create); + + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { + /* restart the transaction to give the journal a chance to free + ** some blocks. releases the path, so we have to go back to + ** research if we succeed on the second try + */ + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + repeat = + _allocate_block(th, block, inode, + &allocated_block_nr, NULL, create); + + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { + goto research; + } + if (repeat == QUOTA_EXCEEDED) + retval = -EDQUOT; + else + retval = -ENOSPC; + goto failure; + } + + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + goto research; + } + } + + if (indirect_item_found(retval, ih)) { + b_blocknr_t unfm_ptr; + /* 'block'-th block is in the file already (there is + corresponding cell in some indirect item). But it may be + zero unformatted node pointer (hole) */ + unfm_ptr = get_block_num(item, pos_in_item); + if (unfm_ptr == 0) { + /* use allocated block to plug the hole */ + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + set_buffer_new(bh_result); + if (buffer_dirty(bh_result) + && reiserfs_data_ordered(inode->i_sb)) + reiserfs_add_ordered_list(inode, bh_result); + put_block_num(item, pos_in_item, allocated_block_nr); + unfm_ptr = allocated_block_nr; + journal_mark_dirty(th, inode->i_sb, bh); + reiserfs_update_sd(th, inode); + } + set_block_dev_mapped(bh_result, unfm_ptr, inode); + pathrelse(&path); + retval = 0; + if (!dangle && th) + retval = reiserfs_end_persistent_transaction(th); + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + + /* the item was found, so new blocks were not added to the file + ** there is no need to make sure the inode is updated with this + ** transaction + */ + return retval; + } + + if (!th) { + pathrelse(&path); + goto start_trans; + } + + /* desired position is not found or is in the direct item. We have + to append file with holes up to 'block'-th block converting + direct items to indirect one if necessary */ + done = 0; + do { + if (is_statdata_le_ih(ih)) { + __le32 unp = 0; + struct cpu_key tmp_key; + + /* indirect item has to be inserted */ + make_le_item_head(&tmp_ih, &key, version, 1, + TYPE_INDIRECT, UNFM_P_SIZE, + 0 /* free_space */ ); + + if (cpu_key_k_offset(&key) == 1) { + /* we are going to add 'block'-th block to the file. 
Use + allocated block for that */ + unp = cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } + tmp_key = key; // ;) + set_cpu_key_k_offset(&tmp_key, 1); + PATH_LAST_POSITION(&path)++; + + retval = + reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, + inode, (char *)&unp); + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST + } + //mark_tail_converted (inode); + } else if (is_direct_le_ih(ih)) { + /* direct item has to be converted */ + loff_t tail_offset; + + tail_offset = + ((le_ih_k_offset(ih) - + 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; + if (tail_offset == cpu_key_k_offset(&key)) { + /* direct item we just found fits into block we have + to map. Convert it into unformatted node: use + bh_result for the conversion */ + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + unbh = bh_result; + done = 1; + } else { + /* we have to padd file tail stored in direct item(s) + up to block size and convert it to unformatted + node. FIXME: this should also get into page cache */ + + pathrelse(&path); + /* + * ugly, but we can only end the transaction if + * we aren't nested + */ + BUG_ON(!th->t_refcount); + if (th->t_refcount == 1) { + retval = + reiserfs_end_persistent_transaction + (th); + th = NULL; + if (retval) + goto failure; + } + + retval = + convert_tail_for_hole(inode, bh_result, + tail_offset); + if (retval) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, + "clm-6004", + "convert tail failed " + "inode %lu, error %d", + inode->i_ino, + retval); + if (allocated_block_nr) { + /* the bitmap, the super, and the stat data == 3 */ + if (!th) + th = reiserfs_persistent_transaction(inode->i_sb, 3); + if (th) + reiserfs_free_block(th, + inode, + allocated_block_nr, + 1); + } + goto failure; + } + goto research; + } + retval = + direct2indirect(th, inode, &path, unbh, + tail_offset); + if (retval) { + reiserfs_unmap_buffer(unbh); + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + /* it is important the set_buffer_uptodate is done after + ** the direct2indirect. The buffer might contain valid + ** data newer than the data on disk (read by readpage, changed, + ** and then sent here by writepage). direct2indirect needs + ** to know if unbh was already up to date, so it can decide + ** if the data in unbh needs to be replaced with data from + ** the disk + */ + set_buffer_uptodate(unbh); + + /* unbh->b_page == NULL in case of DIRECT_IO request, this means + buffer will disappear shortly, so it should not be added to + */ + if (unbh->b_page) { + /* we've converted the tail, so we must + ** flush unbh before the transaction commits + */ + reiserfs_add_tail_list(inode, unbh); + + /* mark it dirty now to prevent commit_write from adding + ** this buffer to the inode's dirty buffer list + */ + /* + * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 
+ * It's still atomic, but it sets the page dirty too, + * which makes it eligible for writeback at any time by the + * VM (which was also the case with __mark_buffer_dirty()) + */ + mark_buffer_dirty(unbh); + } + } else { + /* append indirect item with holes if needed, when appending + pointer to 'block'-th block use block, which is already + allocated */ + struct cpu_key tmp_key; + unp_t unf_single = 0; // We use this in case we need to allocate only + // one block which is a fastpath + unp_t *un; + __u64 max_to_insert = + MAX_ITEM_LEN(inode->i_sb->s_blocksize) / + UNFM_P_SIZE; + __u64 blocks_needed; + + RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, + "vs-804: invalid position for append"); + /* indirect item has to be appended, set up key of that position */ + make_cpu_key(&tmp_key, inode, + le_key_k_offset(version, + &(ih->ih_key)) + + op_bytes_number(ih, + inode->i_sb->s_blocksize), + //pos_in_item * inode->i_sb->s_blocksize, + TYPE_INDIRECT, 3); // key type is unimportant + + RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), + "green-805: invalid offset"); + blocks_needed = + 1 + + ((cpu_key_k_offset(&key) - + cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> + s_blocksize_bits); + + if (blocks_needed == 1) { + un = &unf_single; + } else { + un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS); + if (!un) { + un = &unf_single; + blocks_needed = 1; + max_to_insert = 0; + } + } + if (blocks_needed <= max_to_insert) { + /* we are going to add target block to the file. Use allocated + block for that */ + un[blocks_needed - 1] = + cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } else { + /* paste hole to the indirect item */ + /* If kmalloc failed, max_to_insert becomes zero and it means we + only have space for one block */ + blocks_needed = + max_to_insert ? max_to_insert : 1; + } + retval = + reiserfs_paste_into_item(th, &path, &tmp_key, inode, + (char *)un, + UNFM_P_SIZE * + blocks_needed); + + if (blocks_needed != 1) + kfree(un); + + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + if (!done) { + /* We need to mark new file size in case this function will be + interrupted/aborted later on. And we may do this only for + holes. */ + inode->i_size += + inode->i_sb->s_blocksize * blocks_needed; + } + } + + if (done == 1) + break; + + /* this loop could log more blocks than we had originally asked + ** for. So, we have to allow the transaction to end if it is + ** too big or too full. Update the inode so things are + ** consistent if we crash before the function returns + ** + ** release the path so that anybody waiting on the path before + ** ending their transaction will be able to continue. + */ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + } + /* + * inserting indirect pointers for a hole can take a + * long time. reschedule if needed and also release the write + * lock for others. 
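+ * The tree is searched again right below, so dropping the write
+ * lock here cannot leave a stale path in use.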
+ */ + if (need_resched()) { + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + schedule(); + lock_depth = reiserfs_write_lock_once(inode->i_sb); + } + + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, "vs-825", + "%K should not be found", &key); + retval = -EEXIST; + if (allocated_block_nr) + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + pathrelse(&path); + goto failure; + } + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + pos_in_item = path.pos_in_item; + } while (1); + + retval = 0; + + failure: + if (th && (!dangle || (retval && !th->t_trans_id))) { + int err; + if (th->t_trans_id) + reiserfs_update_sd(th, inode); + err = reiserfs_end_persistent_transaction(th); + if (err) + retval = err; + } + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_check_path(&path); + return retval; +} + +static int +reiserfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); +} + +/* Compute real number of used bytes by file + * Following three functions can go away when we'll have enough space in stat item + */ +static int real_space_diff(struct inode *inode, int sd_size) +{ + int bytes; + loff_t blocksize = inode->i_sb->s_blocksize; + + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) + return sd_size; + + /* End of file is also in full block with indirect reference, so round + ** up to the next block. + ** + ** there is just no way to know if the tail is actually packed + ** on the file, so we have to assume it isn't. When we pack the + ** tail, we add 4 bytes to pretend there really is an unformatted + ** node pointer + */ + bytes = + ((inode->i_size + + (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + + sd_size; + return bytes; +} + +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, + int sd_size) +{ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + return inode->i_size + + (loff_t) (real_space_diff(inode, sd_size)); + } + return ((loff_t) real_space_diff(inode, sd_size)) + + (((loff_t) blocks) << 9); +} + +/* Compute number of blocks used by file in ReiserFS counting */ +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) +{ + loff_t bytes = inode_get_bytes(inode); + loff_t real_space = real_space_diff(inode, sd_size); + + /* keeps fsck and non-quota versions of reiserfs happy */ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + bytes += (loff_t) 511; + } + + /* files from before the quota patch might i_blocks such that + ** bytes < real_space. Deal with that here to prevent it from + ** going negative. + */ + if (bytes < real_space) + return 0; + return (bytes - real_space) >> 9; +} + +// +// BAD: new directories have stat data of new type and all other items +// of old type. 
Version stored in the inode says about body items, so +// in update_stat_data we can not rely on inode, but have to check +// item version directly +// + +// called by read_locked_inode +static void init_inode(struct inode *inode, struct treepath *path) +{ + struct buffer_head *bh; + struct item_head *ih; + __u32 rdev; + //int version = ITEM_VERSION_1; + + bh = PATH_PLAST_BUFFER(path); + ih = PATH_PITEM_HEAD(path); + + copy_key(INODE_PKEY(inode), &(ih->ih_key)); + + INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + reiserfs_init_xattr_rwsem(inode); + + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = + (struct stat_data_v1 *)B_I_PITEM(bh, ih); + unsigned long blocks; + + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + set_inode_sd_version(inode, STAT_DATA_V1); + inode->i_mode = sd_v1_mode(sd); + inode->i_nlink = sd_v1_nlink(sd); + inode->i_uid = sd_v1_uid(sd); + inode->i_gid = sd_v1_gid(sd); + inode->i_size = sd_v1_size(sd); + inode->i_atime.tv_sec = sd_v1_atime(sd); + inode->i_mtime.tv_sec = sd_v1_mtime(sd); + inode->i_ctime.tv_sec = sd_v1_ctime(sd); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + + inode->i_blocks = sd_v1_blocks(sd); + inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + blocks = (inode->i_size + 511) >> 9; + blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); + if (inode->i_blocks > blocks) { + // there was a bug in <=3.5.23 when i_blocks could take negative + // values. Starting from 3.5.17 this value could even be stored in + // stat data. For such files we set i_blocks based on file + // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be + // only updated if file's inode will ever change + inode->i_blocks = blocks; + } + + rdev = sd_v1_rdev(sd); + REISERFS_I(inode)->i_first_direct_byte = + sd_v1_first_direct_byte(sd); + /* an early bug in the quota code can give us an odd number for the + ** block count. This is incorrect, fix it here. + */ + if (inode->i_blocks & 1) { + inode->i_blocks++; + } + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V1_SIZE)); + /* nopack is initially zero for v1 objects. 
For v2 objects, + nopack is initialised from sd_attrs */ + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } else { + // new stat data found, but object may have old items + // (directories and symlinks) + struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); + + inode->i_mode = sd_v2_mode(sd); + inode->i_nlink = sd_v2_nlink(sd); + inode->i_uid = sd_v2_uid(sd); + inode->i_size = sd_v2_size(sd); + inode->i_gid = sd_v2_gid(sd); + inode->i_mtime.tv_sec = sd_v2_mtime(sd); + inode->i_atime.tv_sec = sd_v2_atime(sd); + inode->i_ctime.tv_sec = sd_v2_ctime(sd); + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_blocks = sd_v2_blocks(sd); + rdev = sd_v2_rdev(sd); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + inode->i_generation = + le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + else + inode->i_generation = sd_v2_generation(sd); + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + REISERFS_I(inode)->i_first_direct_byte = 0; + set_inode_sd_version(inode, STAT_DATA_V2); + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V2_SIZE)); + /* read persistent inode attributes from sd and initialise + generic inode flags from them */ + REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); + sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); + } + + pathrelse(path); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else { + inode->i_blocks = 0; + inode->i_op = &reiserfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + } +} + +// update new stat data with inode fields +static void inode2sd(void *sd, struct inode *inode, loff_t size) +{ + struct stat_data *sd_v2 = (struct stat_data *)sd; + __u16 flags; + + set_sd_v2_mode(sd_v2, inode->i_mode); + set_sd_v2_nlink(sd_v2, inode->i_nlink); + set_sd_v2_uid(sd_v2, inode->i_uid); + set_sd_v2_size(sd_v2, size); + set_sd_v2_gid(sd_v2, inode->i_gid); + set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); + set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); + set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); + else + set_sd_v2_generation(sd_v2, inode->i_generation); + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, &flags); + set_sd_v2_attrs(sd_v2, flags); +} + +// used to copy inode's fields to old stat data +static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) +{ + struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; + + set_sd_v1_mode(sd_v1, inode->i_mode); + set_sd_v1_uid(sd_v1, inode->i_uid); + set_sd_v1_gid(sd_v1, inode->i_gid); + set_sd_v1_nlink(sd_v1, inode->i_nlink); + set_sd_v1_size(sd_v1, size); + set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); + set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); + set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + 
set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); + else + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); + + // Sigh. i_first_direct_byte is back + set_sd_v1_first_direct_byte(sd_v1, + REISERFS_I(inode)->i_first_direct_byte); +} + +/* NOTE, you must prepare the buffer head before sending it here, +** and then log it after the call +*/ +static void update_stat_data(struct treepath *path, struct inode *inode, + loff_t size) +{ + struct buffer_head *bh; + struct item_head *ih; + + bh = PATH_PLAST_BUFFER(path); + ih = PATH_PITEM_HEAD(path); + + if (!is_statdata_le_ih(ih)) + reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", + INODE_PKEY(inode), ih); + + if (stat_data_v1(ih)) { + // path points to old stat data + inode2sd_v1(B_I_PITEM(bh, ih), inode, size); + } else { + inode2sd(B_I_PITEM(bh, ih), inode, size); + } + + return; +} + +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, + struct inode *inode, loff_t size) +{ + struct cpu_key key; + INITIALIZE_PATH(path); + struct buffer_head *bh; + int fs_gen; + struct item_head *ih, tmp_ih; + int retval; + + BUG_ON(!th->t_trans_id); + + make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); //key type is unimportant + + for (;;) { + int pos; + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-13050", + "i/o failure occurred trying to " + "update %K stat data", &key); + return; + } + if (retval == ITEM_NOT_FOUND) { + pos = PATH_LAST_POSITION(&path); + pathrelse(&path); + if (inode->i_nlink == 0) { + /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */ + return; + } + reiserfs_warning(inode->i_sb, "vs-13060", + "stat data of object %k (nlink == %d) " + "not found (pos %d)", + INODE_PKEY(inode), inode->i_nlink, + pos); + reiserfs_check_path(&path); + return; + } + + /* sigh, prepare_for_journal might schedule. When it schedules the + ** FS might change. We have to detect that, and loop back to the + ** search if the stat data item has moved + */ + bh = get_last_bh(&path); + ih = get_ih(&path); + copy_item_head(&tmp_ih, ih); + fs_gen = get_generation(inode->i_sb); + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + continue; /* Stat_data item has been moved after scheduling. */ + } + break; + } + update_stat_data(&path, inode, size); + journal_mark_dirty(th, th->t_super, bh); + pathrelse(&path); + return; +} + +/* reiserfs_read_locked_inode is called to read the inode off disk, and it +** does a make_bad_inode when things go wrong. But, we need to make sure +** and clear the key in the private portion of the inode, otherwise a +** corresponding iput might try to delete whatever object the inode last +** represented. 
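+** Zeroing the key also means a later lookup or delete keyed off this
+** bad inode should not match any real object in the tree.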
+*/ +static void reiserfs_make_bad_inode(struct inode *inode) +{ + memset(INODE_PKEY(inode), 0, KEY_SIZE); + make_bad_inode(inode); +} + +// +// initially this function was derived from minix or ext2's analog and +// evolved as the prototype did +// + +int reiserfs_init_locked_inode(struct inode *inode, void *p) +{ + struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; + inode->i_ino = args->objectid; + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); + return 0; +} + +/* looks for stat data in the tree, and fills up the fields of in-core + inode stat data fields */ +void reiserfs_read_locked_inode(struct inode *inode, + struct reiserfs_iget_args *args) +{ + INITIALIZE_PATH(path_to_sd); + struct cpu_key key; + unsigned long dirino; + int retval; + + dirino = args->dirid; + + /* set version 1, version 2 could be used too, because stat data + key is the same in both versions */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = dirino; + key.on_disk_key.k_objectid = inode->i_ino; + key.on_disk_key.k_offset = 0; + key.on_disk_key.k_type = 0; + + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path_to_sd); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-13070", + "i/o failure occurred trying to find " + "stat data of %K", &key); + reiserfs_make_bad_inode(inode); + return; + } + if (retval != ITEM_FOUND) { + /* a stale NFS handle can trigger this without it being an error */ + pathrelse(&path_to_sd); + reiserfs_make_bad_inode(inode); + inode->i_nlink = 0; + return; + } + + init_inode(inode, &path_to_sd); + + /* It is possible that knfsd is trying to access inode of a file + that is being removed from the disk by some other thread. As we + update sd on unlink all that is required is to check for nlink + here. This bug was first found by Sizif when debugging + SquidNG/Butterfly, forgotten, and found again after Philippe + Gramoulle reproduced it. + + More logical fix would require changes in fs/inode.c:iput() to + remove inode from hash-table _after_ fs cleaned disk stuff up and + in iget() to return NULL if I_FREEING inode is found in + hash-table. */ + /* Currently there is one place where it's ok to meet inode with + nlink==0: processing of open-unlinked and half-truncated files + during mount (fs/reiserfs/super.c:finish_unfinished()). */ + if ((inode->i_nlink == 0) && + !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { + reiserfs_warning(inode->i_sb, "vs-13075", + "dead inode read from disk %K. " + "This is likely to be race with knfsd. Ignore", + &key); + reiserfs_make_bad_inode(inode); + } + + reiserfs_check_path(&path_to_sd); /* init inode should be relsing */ + +} + +/** + * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). + * + * @inode: inode from hash table to check + * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. + * + * This function is called by iget5_locked() to distinguish reiserfs inodes + * having the same inode numbers. Such inodes can only exist due to some + * error condition. One of them should be bad. Inodes with identical + * inode numbers (objectids) are distinguished by parent directory ids. 
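+ * The check below therefore compares both i_ino (the objectid) and the
+ * parent directory id stored in the in-core copy of the on-disk key.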
+ * + */ +int reiserfs_find_actor(struct inode *inode, void *opaque) +{ + struct reiserfs_iget_args *args; + + args = opaque; + /* args is already in CPU order */ + return (inode->i_ino == args->objectid) && + (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); +} + +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) +{ + struct inode *inode; + struct reiserfs_iget_args args; + + args.objectid = key->on_disk_key.k_objectid; + args.dirid = key->on_disk_key.k_dir_id; + reiserfs_write_unlock(s); + inode = iget5_locked(s, key->on_disk_key.k_objectid, + reiserfs_find_actor, reiserfs_init_locked_inode, + (void *)(&args)); + reiserfs_write_lock(s); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + reiserfs_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + + if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { + /* either due to i/o error or a stale NFS handle */ + iput(inode); + inode = NULL; + } + return inode; +} + +static struct dentry *reiserfs_get_dentry(struct super_block *sb, + u32 objectid, u32 dir_id, u32 generation) + +{ + struct cpu_key key; + struct inode *inode; + + key.on_disk_key.k_objectid = objectid; + key.on_disk_key.k_dir_id = dir_id; + reiserfs_write_lock(sb); + inode = reiserfs_iget(sb, &key); + if (inode && !IS_ERR(inode) && generation != 0 && + generation != inode->i_generation) { + iput(inode); + inode = NULL; + } + reiserfs_write_unlock(sb); + + return d_obtain_alias(inode); +} + +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + /* fhtype happens to reflect the number of u32s encoded. + * due to a bug in earlier code, fhtype might indicate there + * are more u32s then actually fitted. + * so if fhtype seems to be more than len, reduce fhtype. + * Valid types are: + * 2 - objectid + dir_id - legacy support + * 3 - objectid + dir_id + generation + * 4 - objectid + dir_id + objectid and dirid of parent - legacy + * 5 - objectid + dir_id + generation + objectid and dirid of parent + * 6 - as above plus generation of directory + * 6 does not fit in NFSv2 handles + */ + if (fh_type > fh_len) { + if (fh_type != 6 || fh_len != 5) + reiserfs_warning(sb, "reiserfs-13077", + "nfsd/reiserfs, fhtype=%d, len=%d - odd", + fh_type, fh_len); + fh_type = 5; + } + + return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], + (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); +} + +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + if (fh_type < 4) + return NULL; + + return reiserfs_get_dentry(sb, + (fh_type >= 5) ? fid->raw[3] : fid->raw[2], + (fh_type >= 5) ? fid->raw[4] : fid->raw[3], + (fh_type == 6) ? fid->raw[5] : 0); +} + +int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, + int need_parent) +{ + struct inode *inode = dentry->d_inode; + int maxlen = *lenp; + + if (need_parent && (maxlen < 5)) { + *lenp = 5; + return 255; + } else if (maxlen < 3) { + *lenp = 3; + return 255; + } + + data[0] = inode->i_ino; + data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + data[2] = inode->i_generation; + *lenp = 3; + /* no room for directory info? 
return what we've stored so far */ + if (maxlen < 5 || !need_parent) + return 3; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode; + data[3] = inode->i_ino; + data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + *lenp = 5; + if (maxlen >= 6) { + data[5] = inode->i_generation; + *lenp = 6; + } + spin_unlock(&dentry->d_lock); + return *lenp; +} + +/* looks for stat data, then copies fields to it, marks the buffer + containing stat data as dirty */ +/* reiserfs inodes are never really dirty, since the dirty inode call +** always logs them. This call allows the VFS inode marking routines +** to properly mark inodes for datasync and such, but only actually +** does something when called for a synchronous update. +*/ +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct reiserfs_transaction_handle th; + int jbegin_count = 1; + + if (inode->i_sb->s_flags & MS_RDONLY) + return -EROFS; + /* memory pressure can sometimes initiate write_inode calls with sync == 1, + ** these cases are just when the system needs ram, not when the + ** inode needs to reach disk for safety, and they can safely be + ** ignored because the altered inode has already been logged. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { + reiserfs_write_lock(inode->i_sb); + if (!journal_begin(&th, inode->i_sb, jbegin_count)) { + reiserfs_update_sd(&th, inode); + journal_end_sync(&th, inode->i_sb, jbegin_count); + } + reiserfs_write_unlock(inode->i_sb); + } + return 0; +} + +/* stat data of new object is inserted already, this inserts the item + containing "." and ".." entries */ +static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct item_head *ih, struct treepath *path, + struct inode *dir) +{ + struct super_block *sb = th->t_super; + char empty_dir[EMPTY_DIR_SIZE]; + char *body = empty_dir; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, + TYPE_DIRENTRY, 3 /*key length */ ); + + /* compose item head for new item. Directories consist of items of + old type (ITEM_VERSION_1). 
Do not set key (second arg is 0), it + is done by reiserfs_new_inode */ + if (old_format_only(sb)) { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); + + make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } else { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); + + make_empty_dir_item(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_error(sb, "vs-13080", + "i/o failure occurred creating new directory"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse(path); + reiserfs_warning(sb, "vs-13070", + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is empty directory item */ + return reiserfs_insert_item(th, path, &key, ih, inode, body); +} + +/* stat data of object has been inserted, this inserts the item + containing the body of symlink */ +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ + struct item_head *ih, + struct treepath *path, const char *symname, + int item_len) +{ + struct super_block *sb = th->t_super; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, + le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), + 1, TYPE_DIRECT, 3 /*key length */ ); + + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, + 0 /*free_space */ ); + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_error(sb, "vs-13080", + "i/o failure occurred creating new symlink"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse(path); + reiserfs_warning(sb, "vs-13080", + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is body of symlink */ + return reiserfs_insert_item(th, path, &key, ih, inode, symname); +} + +/* inserts the stat data into the tree, and then calls + reiserfs_new_directory (to insert ".", ".." item if new object is + directory) or reiserfs_new_symlink (to insert symlink body if new + object is symlink) or nothing (if new object is regular file) + + NOTE! uid and gid must already be set in the inode. If we return + non-zero due to an error, we have to drop the quota previously allocated + for the fresh inode. This can only be done outside a transaction, so + if we return non-zero, we also end the transaction. 
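+   In the error paths below dquot_free_inode is therefore issued while the
+   handle is still open, and dquot_drop only after journal_end.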
*/ +int reiserfs_new_inode(struct reiserfs_transaction_handle *th, + struct inode *dir, int mode, const char *symname, + /* 0 for regular, EMTRY_DIR_SIZE for dirs, + strlen (symname) for symlinks) */ + loff_t i_size, struct dentry *dentry, + struct inode *inode, + struct reiserfs_security_handle *security) +{ + struct super_block *sb; + struct reiserfs_iget_args args; + INITIALIZE_PATH(path_to_key); + struct cpu_key key; + struct item_head ih; + struct stat_data sd; + int retval; + int err; + + BUG_ON(!th->t_trans_id); + + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto out_end_trans; + if (!dir->i_nlink) { + err = -EPERM; + goto out_bad_inode; + } + + sb = dir->i_sb; + + /* item head of new item */ + ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); + ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); + if (!ih.ih_key.k_objectid) { + err = -ENOMEM; + goto out_bad_inode; + } + args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); + if (old_format_only(sb)) + make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, + TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); + else + make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, + TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); + memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); + args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); + if (insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args) < 0) { + err = -EINVAL; + goto out_bad_inode; + } + if (old_format_only(sb)) + /* not a perfect generation count, as object ids can be reused, but + ** this is as good as reiserfs can do right now. + ** note that the private part of inode isn't filled in yet, we have + ** to use the directory. + */ + inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); + else +#if defined( USE_INODE_GENERATION_COUNTER ) + inode->i_generation = + le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); +#else + inode->i_generation = ++event; +#endif + + /* fill stat data */ + inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); + + /* uid and gid must already be set by the caller for quota init */ + + /* symlink cannot be immutable or append only, right? */ + if (S_ISLNK(inode->i_mode)) + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); + + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_size = i_size; + inode->i_blocks = 0; + inode->i_bytes = 0; + REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 
1 : + U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; + + INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_attrs = + REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; + sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); + reiserfs_init_xattr_rwsem(inode); + + /* key to search for correct place for new stat data */ + _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), + le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, + TYPE_STAT_DATA, 3 /*key length */ ); + + /* find proper place for inserting of stat data */ + retval = search_item(sb, &key, &path_to_key); + if (retval == IO_ERROR) { + err = -EIO; + goto out_bad_inode; + } + if (retval == ITEM_FOUND) { + pathrelse(&path_to_key); + err = -EEXIST; + goto out_bad_inode; + } + if (old_format_only(sb)) { + if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { + pathrelse(&path_to_key); + /* i_uid or i_gid is too big to be stored in stat data v3.5 */ + err = -EINVAL; + goto out_bad_inode; + } + inode2sd_v1(&sd, inode, inode->i_size); + } else { + inode2sd(&sd, inode, inode->i_size); + } + // store in in-core inode the key of stat data and version all + // object items will have (directory items will have old offset + // format, other new objects will consist of new items) + if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + if (old_format_only(sb)) + set_inode_sd_version(inode, STAT_DATA_V1); + else + set_inode_sd_version(inode, STAT_DATA_V2); + + /* insert the stat data into the tree */ +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (REISERFS_I(dir)->new_packing_locality) + th->displace_new_blocks = 1; +#endif + retval = + reiserfs_insert_item(th, &path_to_key, &key, &ih, inode, + (char *)(&sd)); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + goto out_bad_inode; + } +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (!th->displace_new_blocks) + REISERFS_I(dir)->new_packing_locality = 0; +#endif + if (S_ISDIR(mode)) { + /* insert item with "." and ".." 
*/ + retval = + reiserfs_new_directory(th, inode, &ih, &path_to_key, dir); + } + + if (S_ISLNK(mode)) { + /* insert body of symlink */ + if (!old_format_only(sb)) + i_size = ROUND_UP(i_size); + retval = + reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname, + i_size); + } + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + + if (reiserfs_posixacl(inode->i_sb)) { + retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + } else if (inode->i_sb->s_flags & MS_POSIXACL) { + reiserfs_warning(inode->i_sb, "jdm-13090", + "ACLs aren't enabled in the fs, " + "but vfs thinks they are!"); + } else if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; + + if (security->name) { + retval = reiserfs_security_write(th, inode, security); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + retval = journal_end(th, th->t_super, + th->t_blocks_allocated); + if (retval) + err = retval; + goto out_inserted_sd; + } + } + + reiserfs_update_sd(th, inode); + reiserfs_check_path(&path_to_key); + + return 0; + +/* it looks like you can easily compress these two goto targets into + * one. Keeping it like this doesn't actually hurt anything, and they + * are place holders for what the quota code actually needs. + */ + out_bad_inode: + /* Invalidate the object, nothing was inserted yet */ + INODE_PKEY(inode)->k_objectid = 0; + + /* Quota change must be inside a transaction for journaling */ + dquot_free_inode(inode); + + out_end_trans: + journal_end(th, th->t_super, th->t_blocks_allocated); + /* Drop can be outside and it needs more credits so it's better to have it outside */ + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + make_bad_inode(inode); + + out_inserted_sd: + inode->i_nlink = 0; + th->t_trans_id = 0; /* so the caller can't use this handle later */ + unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ + iput(inode); + return err; +} + +/* +** finds the tail page in the page cache, +** reads the last block in. +** +** On success, page_result is set to a locked, pinned page, and bh_result +** is set to an up to date buffer for the last block in the file. returns 0. +** +** tail conversion is not done, so bh_result might not be valid for writing +** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before +** trying to write the block. +** +** on failure, nonzero is returned, page_result and bh_result are untouched. +*/ +static int grab_tail_page(struct inode *inode, + struct page **page_result, + struct buffer_head **bh_result) +{ + + /* we want the page with the last byte in the file, + ** not the page that will hold the next byte for appending + */ + unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long pos = 0; + unsigned long start = 0; + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1); + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + int error; + + /* we know that we are only called with inode->i_size > 0. + ** we also know that a file tail can never be as big as a block + ** If i_size % blocksize == 0, our file is currently block aligned + ** and it won't need converting or zeroing after a truncate. 
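+ ** When the size is block aligned we return -ENOENT below, and the
+ ** caller treats that as 'no tail page to zero'.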
+ */ + if ((offset & (blocksize - 1)) == 0) { + return -ENOENT; + } + page = grab_cache_page(inode->i_mapping, index); + error = -ENOMEM; + if (!page) { + goto out; + } + /* start within the page of the last block in the file */ + start = (offset / blocksize) * blocksize; + + error = __block_write_begin(page, start, offset - start, + reiserfs_get_block_create_0); + if (error) + goto unlock; + + head = page_buffers(page); + bh = head; + do { + if (pos >= start) { + break; + } + bh = bh->b_this_page; + pos += blocksize; + } while (bh != head); + + if (!buffer_uptodate(bh)) { + /* note, this should never happen, prepare_write should + ** be taking care of this for us. If the buffer isn't up to date, + ** I've screwed up the code to find the buffer, or the code to + ** call prepare_write + */ + reiserfs_error(inode->i_sb, "clm-6000", + "error reading block %lu", bh->b_blocknr); + error = -EIO; + goto unlock; + } + *bh_result = bh; + *page_result = page; + + out: + return error; + + unlock: + unlock_page(page); + page_cache_release(page); + return error; +} + +/* +** vfs version of truncate file. Must NOT be called with +** a transaction already started. +** +** some code taken from block_truncate_page +*/ +int reiserfs_truncate_file(struct inode *inode, int update_timestamps) +{ + struct reiserfs_transaction_handle th; + /* we want the offset for the first byte after the end of the file */ + unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned length; + struct page *page = NULL; + int error; + struct buffer_head *bh = NULL; + int err2; + int lock_depth; + + lock_depth = reiserfs_write_lock_once(inode->i_sb); + + if (inode->i_size > 0) { + error = grab_tail_page(inode, &page, &bh); + if (error) { + // -ENOENT means we truncated past the end of the file, + // and get_block_create_0 could not find a block to read in, + // which is ok. + if (error != -ENOENT) + reiserfs_error(inode->i_sb, "clm-6001", + "grab_tail_page failed %d", + error); + page = NULL; + bh = NULL; + } + } + + /* so, if page != NULL, we have a buffer head for the offset at + ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + ** then we have an unformatted node. Otherwise, we have a direct item, + ** and no zeroing is required on disk. We zero after the truncate, + ** because the truncate might pack the item anyway + ** (it will unmap bh if it packs). + */ + /* it is enough to reserve space in transaction for 2 balancings: + one for "save" link adding and another for the first + cut_from_item. 
1 is for update_sd */ + error = journal_begin(&th, inode->i_sb, + JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + reiserfs_update_inode_transaction(inode); + if (update_timestamps) + /* we are doing real truncate: if the system crashes before the last + transaction of truncating gets committed - on reboot the file + either appears truncated properly or not truncated at all */ + add_save_link(&th, inode, 1); + err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps); + error = + journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + + /* check reiserfs_do_truncate after ending the transaction */ + if (err2) { + error = err2; + goto out; + } + + if (update_timestamps) { + error = remove_save_link(inode, 1 /* truncate */); + if (error) + goto out; + } + + if (page) { + length = offset & (blocksize - 1); + /* if we are not on a block boundary */ + if (length) { + length = blocksize - length; + zero_user(page, offset, length); + if (buffer_mapped(bh) && bh->b_blocknr != 0) { + mark_buffer_dirty(bh); + } + } + unlock_page(page); + page_cache_release(page); + } + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + + return 0; + out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + + return error; +} + +static int map_block_for_writepage(struct inode *inode, + struct buffer_head *bh_result, + unsigned long block) +{ + struct reiserfs_transaction_handle th; + int fs_gen; + struct item_head tmp_ih; + struct item_head *ih; + struct buffer_head *bh; + __le32 *item; + struct cpu_key key; + INITIALIZE_PATH(path); + int pos_in_item; + int jbegin_count = JOURNAL_PER_BALANCE_CNT; + loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1; + int retval; + int use_get_block = 0; + int bytes_copied = 0; + int copy_size; + int trans_running = 0; + + /* catch places below that try to log something without starting a trans */ + th.t_trans_id = 0; + + if (!buffer_uptodate(bh_result)) { + return -EIO; + } + + kmap(bh_result->b_page); + start_over: + reiserfs_write_lock(inode->i_sb); + make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); + + research: + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval != POSITION_FOUND) { + use_get_block = 1; + goto out; + } + + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + pos_in_item = path.pos_in_item; + + /* we've found an unformatted node */ + if (indirect_item_found(retval, ih)) { + if (bytes_copied > 0) { + reiserfs_warning(inode->i_sb, "clm-6002", + "bytes_copied %d", bytes_copied); + } + if (!get_block_num(item, pos_in_item)) { + /* crap, we are writing to a hole */ + use_get_block = 1; + goto out; + } + set_block_dev_mapped(bh_result, + get_block_num(item, pos_in_item), inode); + } else if (is_direct_le_ih(ih)) { + char *p; + p = page_address(bh_result->b_page); + p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1); + copy_size = ih_item_len(ih) - pos_in_item; + + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (!trans_running) { + /* vs-3050 is gone, no need to drop the path */ + retval = journal_begin(&th, inode->i_sb, jbegin_count); + if (retval) + goto out; + reiserfs_update_inode_transaction(inode); + trans_running = 1; + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + } + + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + + if 
(fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + goto research; + } + + memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, + copy_size); + + journal_mark_dirty(&th, inode->i_sb, bh); + bytes_copied += copy_size; + set_block_dev_mapped(bh_result, 0, inode); + + /* are there still bytes left? */ + if (bytes_copied < bh_result->b_size && + (byte_offset + bytes_copied) < inode->i_size) { + set_cpu_key_k_offset(&key, + cpu_key_k_offset(&key) + + copy_size); + goto research; + } + } else { + reiserfs_warning(inode->i_sb, "clm-6003", + "bad item inode %lu", inode->i_ino); + retval = -EIO; + goto out; + } + retval = 0; + + out: + pathrelse(&path); + if (trans_running) { + int err = journal_end(&th, inode->i_sb, jbegin_count); + if (err) + retval = err; + trans_running = 0; + } + reiserfs_write_unlock(inode->i_sb); + + /* this is where we fill in holes in the file. */ + if (use_get_block) { + retval = reiserfs_get_block(inode, block, bh_result, + GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX + | GET_BLOCK_NO_DANGLE); + if (!retval) { + if (!buffer_mapped(bh_result) + || bh_result->b_blocknr == 0) { + /* get_block failed to find a mapped unformatted node. */ + use_get_block = 0; + goto start_over; + } + } + } + kunmap(bh_result->b_page); + + if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* we've copied data from the page into the direct item, so the + * buffer in the page is now clean, mark it to reflect that. + */ + lock_buffer(bh_result); + clear_buffer_dirty(bh_result); + unlock_buffer(bh_result); + } + return retval; +} + +/* + * mason@suse.com: updated in 2.5.54 to follow the same general io + * start/recovery path as __block_write_full_page, along with special + * code to handle reiserfs tails. + */ +static int reiserfs_write_full_page(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + int error = 0; + unsigned long block; + sector_t last_block; + struct buffer_head *head, *bh; + int partial = 0; + int nr = 0; + int checked = PageChecked(page); + struct reiserfs_transaction_handle th; + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + th.t_trans_id = 0; + + /* no logging allowed when nonblocking or from PF_MEMALLOC */ + if (checked && (current->flags & PF_MEMALLOC)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + + /* The page dirty bit is cleared before writepage is called, which + * means we have to tell create_empty_buffers to make dirty buffers + * The page really should be up to date at this point, so tossing + * in the BH_Uptodate is just a sanity check. 
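+ * That is why create_empty_buffers below is passed
+ * (1 << BH_Dirty) | (1 << BH_Uptodate).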
+ */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, s->s_blocksize, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + } + head = page_buffers(page); + + /* last page in the file, zero out any contents past the + ** last byte in the file + */ + if (page->index >= end_index) { + unsigned last_offset; + + last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* no file contents in this page */ + if (page->index >= end_index + 1 || !last_offset) { + unlock_page(page); + return 0; + } + zero_user_segment(page, last_offset, PAGE_CACHE_SIZE); + } + bh = head; + block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; + /* first map all the buffers, logging any direct items we find */ + do { + if (block > last_block) { + /* + * This can happen when the block size is less than + * the page size. The corresponding bytes in the page + * were zero filled above + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if ((checked || buffer_dirty(bh)) && + (!buffer_mapped(bh) || (buffer_mapped(bh) + && bh->b_blocknr == + 0))) { + /* not mapped yet, or it points to a direct item, search + * the btree for the mapping info, and log any direct + * items found + */ + if ((error = map_block_for_writepage(inode, bh, block))) { + goto fail; + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + /* + * we start the transaction after map_block_for_writepage, + * because it can create holes in the file (an unbounded operation). + * starting it here, we can make a reliable estimate for how many + * blocks we're going to log + */ + if (checked) { + ClearPageChecked(page); + reiserfs_write_lock(s); + error = journal_begin(&th, s, bh_per_page + 1); + if (error) { + reiserfs_write_unlock(s); + goto fail; + } + reiserfs_update_inode_transaction(inode); + } + /* now go through and lock any dirty buffers on the page */ + do { + get_bh(bh); + if (!buffer_mapped(bh)) + continue; + if (buffer_mapped(bh) && bh->b_blocknr == 0) + continue; + + if (checked) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, s, bh); + continue; + } + /* from this point on, we know the buffer is mapped to a + * real block and not a direct item + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else { + if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (checked) { + error = journal_end(&th, s, bh_per_page + 1); + reiserfs_write_unlock(s); + if (error) + goto fail; + } + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + + /* + * since any buffer might be the only dirty buffer on the page, + * the first submit_bh can bring the page out of writeback. + * be careful with the buffers. + */ + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + + error = 0; + done: + if (nr == 0) { + /* + * if this page only had a direct item, it is very possible for + * no io to be required without there being an error. 
Or, + * someone else could have locked them and sent them down the + * pipe without locking the page + */ + bh = head; + do { + if (!buffer_uptodate(bh)) { + partial = 1; + break; + } + bh = bh->b_this_page; + } while (bh != head); + if (!partial) + SetPageUptodate(page); + end_page_writeback(page); + } + return error; + + fail: + /* catches various errors, we need to make sure any valid dirty blocks + * get to the media. The page is currently locked and not marked for + * writeback + */ + ClearPageUptodate(page); + bh = head; + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { + /* + * clear any dirty bits that might have come from getting + * attached to a dirty page + */ + clear_buffer_dirty(bh); + } + bh = bh->b_this_page; + } while (bh != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + goto done; +} + +static int reiserfs_readpage(struct file *f, struct page *page) +{ + return block_read_full_page(page, reiserfs_get_block); +} + +static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + reiserfs_wait_on_write_block(inode->i_sb); + return reiserfs_write_full_page(page, wbc); +} + +static void reiserfs_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + reiserfs_truncate_file(inode, 0); +} + +static int reiserfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode; + struct page *page; + pgoff_t index; + int ret; + int old_ref = 0; + + inode = mapping->host; + *fsdata = 0; + if (flags & AOP_FLAG_CONT_EXPAND && + (pos & (inode->i_sb->s_blocksize - 1)) == 0) { + pos ++; + *fsdata = (void *)(unsigned long)flags; + } + + index = pos >> PAGE_CACHE_SHIFT; + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + reiserfs_wait_on_write_block(inode->i_sb); + fix_tail_page_for_writing(page); + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current-> + journal_info; + BUG_ON(!th->t_refcount); + BUG_ON(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; + } + ret = __block_write_begin(page, pos, len, reiserfs_get_block); + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close it, + * and we've got to free handle if it was a persistent transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested above. 
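+ * Either way, t_refcount > old_ref below means there is still a
+ * reference to undo: our own nested one (old_ref != 0) or the
+ * persistent handle reiserfs_get_block started (old_ref == 0).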
+ */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + if (ret) { + unlock_page(page); + page_cache_release(page); + /* Truncate allocated blocks */ + reiserfs_truncate_failed_write(inode); + } + return ret; +} + +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) +{ + struct inode *inode = page->mapping->host; + int ret; + int old_ref = 0; + + reiserfs_write_unlock(inode->i_sb); + reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock(inode->i_sb); + + fix_tail_page_for_writing(page); + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current-> + journal_info; + BUG_ON(!th->t_refcount); + BUG_ON(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; + } + + ret = __block_write_begin(page, from, len, reiserfs_get_block); + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close it, + * and we've got to free handle if it was a persistent transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested above. + */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + return ret; + +} + +static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) +{ + return generic_block_bmap(as, block, reiserfs_bmap); +} + +static int reiserfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th; + unsigned start; + int lock_depth = 0; + bool locked = false; + + if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) + pos ++; + + reiserfs_wait_on_write_block(inode->i_sb); + if (reiserfs_transaction_running(inode->i_sb)) + th = current->journal_info; + else + th = NULL; + + start = pos & (PAGE_CACHE_SIZE - 1); + if (unlikely(copied < len)) { + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start + copied, start + len); + } + flush_dcache_page(page); + + reiserfs_commit_page(inode, page, start, start + copied); + + /* generic_commit_write does this for us, but does not update the + ** transaction tracking stuff when the size changes. So, we have + ** to do the i_size updates here. 
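+ ** The journal_begin/journal_end pair below nests into any running
+ ** transaction and logs the new i_size via reiserfs_update_sd.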
+ */ + if (pos + copied > inode->i_size) { + struct reiserfs_transaction_handle myth; + lock_depth = reiserfs_write_lock_once(inode->i_sb); + locked = true; + /* If the file have grown beyond the border where it + can have a tail, unmark it as needing a tail + packing */ + if ((have_large_tails(inode->i_sb) + && inode->i_size > i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + ret = journal_begin(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + + reiserfs_update_inode_transaction(inode); + inode->i_size = pos + copied; + /* + * this will just nest into our transaction. It's important + * to use mark_inode_dirty so the inode gets pushed around on the + * dirty lists, and so that O_SYNC works as expected + */ + mark_inode_dirty(inode); + reiserfs_update_sd(&myth, inode); + update_sd = 1; + ret = journal_end(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + } + if (th) { + if (!locked) { + lock_depth = reiserfs_write_lock_once(inode->i_sb); + locked = true; + } + if (!update_sd) + mark_inode_dirty(inode); + ret = reiserfs_end_persistent_transaction(th); + if (ret) + goto out; + } + + out: + if (locked) + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + reiserfs_truncate_failed_write(inode); + + return ret == 0 ? copied : ret; + + journal_error: + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + locked = false; + if (th) { + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + } + goto out; +} + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th = NULL; + + reiserfs_write_unlock(inode->i_sb); + reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock(inode->i_sb); + + if (reiserfs_transaction_running(inode->i_sb)) { + th = current->journal_info; + } + reiserfs_commit_page(inode, page, from, to); + + /* generic_commit_write does this for us, but does not update the + ** transaction tracking stuff when the size changes. So, we have + ** to do the i_size updates here. + */ + if (pos > inode->i_size) { + struct reiserfs_transaction_handle myth; + /* If the file have grown beyond the border where it + can have a tail, unmark it as needing a tail + packing */ + if ((have_large_tails(inode->i_sb) + && inode->i_size > i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + ret = journal_begin(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + + reiserfs_update_inode_transaction(inode); + inode->i_size = pos; + /* + * this will just nest into our transaction. 
It's important + * to use mark_inode_dirty so the inode gets pushed around on the + * dirty lists, and so that O_SYNC works as expected + */ + mark_inode_dirty(inode); + reiserfs_update_sd(&myth, inode); + update_sd = 1; + ret = journal_end(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + } + if (th) { + if (!update_sd) + mark_inode_dirty(inode); + ret = reiserfs_end_persistent_transaction(th); + if (ret) + goto out; + } + + out: + return ret; + + journal_error: + if (th) { + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + } + + return ret; +} + +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) +{ + if (reiserfs_attrs(inode->i_sb)) { + if (sd_attrs & REISERFS_SYNC_FL) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (sd_attrs & REISERFS_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (sd_attrs & REISERFS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; + if (sd_attrs & REISERFS_NOTAIL_FL) + REISERFS_I(inode)->i_flags |= i_nopack_mask; + else + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } +} + +void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) +{ + if (reiserfs_attrs(inode->i_sb)) { + if (inode->i_flags & S_IMMUTABLE) + *sd_attrs |= REISERFS_IMMUTABLE_FL; + else + *sd_attrs &= ~REISERFS_IMMUTABLE_FL; + if (inode->i_flags & S_SYNC) + *sd_attrs |= REISERFS_SYNC_FL; + else + *sd_attrs &= ~REISERFS_SYNC_FL; + if (inode->i_flags & S_NOATIME) + *sd_attrs |= REISERFS_NOATIME_FL; + else + *sd_attrs &= ~REISERFS_NOATIME_FL; + if (REISERFS_I(inode)->i_flags & i_nopack_mask) + *sd_attrs |= REISERFS_NOTAIL_FL; + else + *sd_attrs &= ~REISERFS_NOTAIL_FL; + } +} + +/* decide if this buffer needs to stay around for data logging or ordered +** write purposes +*/ +static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) +{ + int ret = 1; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + + lock_buffer(bh); + spin_lock(&j->j_dirty_buffers_lock); + if (!buffer_mapped(bh)) { + goto free_jh; + } + /* the page is locked, and the only places that log a data buffer + * also lock the page. + */ + if (reiserfs_file_data_log(inode)) { + /* + * very conservative, leave the buffer pinned if + * anyone might need it. + */ + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + ret = 0; + } + } else if (buffer_dirty(bh)) { + struct reiserfs_journal_list *jl; + struct reiserfs_jh *jh = bh->b_private; + + /* why is this safe? + * reiserfs_setattr updates i_size in the on disk + * stat data before allowing vmtruncate to be called. + * + * If buffer was put onto the ordered list for this + * transaction, we know for sure either this transaction + * or an older one already has updated i_size on disk, + * and this ordered data won't be referenced in the file + * if we crash. 
+ * + * if the buffer was put onto the ordered list for an older + * transaction, we need to leave it around + */ + if (jh && (jl = jh->jl) + && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) + ret = 0; + } + free_jh: + if (ret && bh->b_private) { + reiserfs_free_jh(bh); + } + spin_unlock(&j->j_dirty_buffers_lock); + unlock_buffer(bh); + return ret; +} + +/* clm -- taken from fs/buffer.c:block_invalidate_page */ +static void reiserfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + struct inode *inode = page->mapping->host; + unsigned int curr_off = 0; + int ret = 1; + + BUG_ON(!PageLocked(page)); + + if (offset == 0) + ClearPageChecked(page); + + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) { + if (invalidatepage_can_drop(inode, bh)) + reiserfs_unmap_buffer(bh); + else + ret = 0; + } + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (!offset && ret) { + ret = try_to_release_page(page, 0); + /* maybe should BUG_ON(!ret); - neilb */ + } + out: + return; +} + +static int reiserfs_set_page_dirty(struct page *page) +{ + struct inode *inode = page->mapping->host; + if (reiserfs_file_data_log(inode)) { + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); + } + return __set_page_dirty_buffers(page); +} + +/* + * Returns 1 if the page's buffers were dropped. The page is locked. + * + * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads + * in the buffers at page_buffers(page). + * + * even in -o notail mode, we can't be sure an old mount without -o notail + * didn't create files with tails. + */ +static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + struct inode *inode = page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + struct buffer_head *head; + struct buffer_head *bh; + int ret = 1; + + WARN_ON(PageChecked(page)); + spin_lock(&j->j_dirty_buffers_lock); + head = page_buffers(page); + bh = head; + do { + if (bh->b_private) { + if (!buffer_dirty(bh) && !buffer_locked(bh)) { + reiserfs_free_jh(bh); + } else { + ret = 0; + break; + } + } + bh = bh->b_this_page; + } while (bh != head); + if (ret) + ret = try_to_free_buffers(page); + spin_unlock(&j->j_dirty_buffers_lock); + return ret; +} + +/* We thank Mingming Cao for helping us understand in great detail what + to do in this section of the code. */ +static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, + reiserfs_get_blocks_direct_io, NULL); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. 
+ */ + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + vmtruncate(inode, isize); + } + + return ret; +} + +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + unsigned int ia_valid; + int depth; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + /* must be turned off for recursive notify_change calls */ + ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); + + depth = reiserfs_write_lock_once(inode->i_sb); + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + + if (attr->ia_valid & ATTR_SIZE) { + /* version 2 items will be caught by the s_maxbytes check + ** done for us in vmtruncate + */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && + attr->ia_size > MAX_NON_LFS) { + error = -EFBIG; + goto out; + } + /* fill in hole pointers in the expanding truncate case. */ + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand_simple(inode, attr->ia_size); + if (REISERFS_I(inode)->i_prealloc_count > 0) { + int err; + struct reiserfs_transaction_handle th; + /* we're changing at most 2 bitmaps, inode + super */ + err = journal_begin(&th, inode->i_sb, 4); + if (!err) { + reiserfs_discard_prealloc(&th, inode); + err = journal_end(&th, inode->i_sb, 4); + } + if (err) + error = err; + } + if (error) + goto out; + /* + * file size is changed, ctime and mtime are + * to be updated + */ + attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME); + } + } + + if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || + ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && + (get_inode_sd_version(inode) == STAT_DATA_V1)) { + /* stat data of format v3.5 has 16 bit uid and gid */ + error = -EINVAL; + goto out; + } + + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + struct reiserfs_transaction_handle th; + int jbegin_count = + 2 * + (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + + REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + + 2; + + error = reiserfs_chown_xattrs(inode, attr); + + if (error) + return error; + + /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ + error = journal_begin(&th, inode->i_sb, jbegin_count); + if (error) + goto out; + error = dquot_transfer(inode, attr); + if (error) { + journal_end(&th, inode->i_sb, jbegin_count); + goto out; + } + + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + mark_inode_dirty(inode); + error = journal_end(&th, inode->i_sb, jbegin_count); + if (error) + goto out; + } + + /* + * Relax the lock here, as it might truncate the + * inode pages and wait for inode pages locks. 
+ * To release such page lock, the owner needs the + * reiserfs lock + */ + reiserfs_write_unlock_once(inode->i_sb, depth); + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) + error = vmtruncate(inode, attr->ia_size); + + if (!error) { + setattr_copy(inode, attr); + mark_inode_dirty(inode); + } + depth = reiserfs_write_lock_once(inode->i_sb); + + if (!error && reiserfs_posixacl(inode->i_sb)) { + if (attr->ia_valid & ATTR_MODE) + error = reiserfs_acl_chmod(inode); + } + + out: + reiserfs_write_unlock_once(inode->i_sb, depth); + + return error; +} + +const struct address_space_operations reiserfs_address_space_operations = { + .writepage = reiserfs_writepage, + .readpage = reiserfs_readpage, + .readpages = reiserfs_readpages, + .releasepage = reiserfs_releasepage, + .invalidatepage = reiserfs_invalidatepage, + .write_begin = reiserfs_write_begin, + .write_end = reiserfs_write_end, + .bmap = reiserfs_aop_bmap, + .direct_IO = reiserfs_direct_IO, + .set_page_dirty = reiserfs_set_page_dirty, +}; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c new file mode 100644 index 00000000..4e153051 --- /dev/null +++ b/fs/reiserfs/ioctl.c @@ -0,0 +1,226 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * reiserfs_ioctl - handler for ioctl for inode + * supported commands: + * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect + * and prevent packing file (argument arg has to be non-zero) + * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION + * 3) That's all for a while ... + */ +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + unsigned int flags; + int err = 0; + + reiserfs_write_lock(inode->i_sb); + + switch (cmd) { + case REISERFS_IOC_UNPACK: + if (S_ISREG(inode->i_mode)) { + if (arg) + err = reiserfs_unpack(inode, filp); + } else + err = -ENOTTY; + break; + /* + * following two cases are taken from fs/ext2/ioctl.c by Remy + * Card (card@masi.ibp.fr) + */ + case REISERFS_IOC_GETFLAGS: + if (!reiserfs_attrs(inode->i_sb)) { + err = -ENOTTY; + break; + } + + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, (__u16 *) & flags); + err = put_user(flags, (int __user *)arg); + break; + case REISERFS_IOC_SETFLAGS:{ + if (!reiserfs_attrs(inode->i_sb)) { + err = -ENOTTY; + break; + } + + err = mnt_want_write(filp->f_path.mnt); + if (err) + break; + + if (!inode_owner_or_capable(inode)) { + err = -EPERM; + goto setflags_out; + } + if (get_user(flags, (int __user *)arg)) { + err = -EFAULT; + goto setflags_out; + } + /* + * Is it quota file? 
Do not allow user to mess with it + */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } + if (((flags ^ REISERFS_I(inode)-> + i_attrs) & (REISERFS_IMMUTABLE_FL | + REISERFS_APPEND_FL)) + && !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto setflags_out; + } + if ((flags & REISERFS_NOTAIL_FL) && + S_ISREG(inode->i_mode)) { + int result; + + result = reiserfs_unpack(inode, filp); + if (result) { + err = result; + goto setflags_out; + } + } + sd_attrs_to_i_attrs(flags, inode); + REISERFS_I(inode)->i_attrs = flags; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write(filp->f_path.mnt); + break; + } + case REISERFS_IOC_GETVERSION: + err = put_user(inode->i_generation, (int __user *)arg); + break; + case REISERFS_IOC_SETVERSION: + if (!inode_owner_or_capable(inode)) { + err = -EPERM; + break; + } + err = mnt_want_write(filp->f_path.mnt); + if (err) + break; + if (get_user(inode->i_generation, (int __user *)arg)) { + err = -EFAULT; + goto setversion_out; + } + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setversion_out: + mnt_drop_write(filp->f_path.mnt); + break; + default: + err = -ENOTTY; + } + + reiserfs_write_unlock(inode->i_sb); + + return err; +} + +#ifdef CONFIG_COMPAT +long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case REISERFS_IOC32_UNPACK: + cmd = REISERFS_IOC_UNPACK; + break; + case REISERFS_IOC32_GETFLAGS: + cmd = REISERFS_IOC_GETFLAGS; + break; + case REISERFS_IOC32_SETFLAGS: + cmd = REISERFS_IOC_SETFLAGS; + break; + case REISERFS_IOC32_GETVERSION: + cmd = REISERFS_IOC_GETVERSION; + break; + case REISERFS_IOC32_SETVERSION: + cmd = REISERFS_IOC_SETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); +/* +** reiserfs_unpack +** Function try to convert tail from direct item into indirect. +** It set up nopack attribute in the REISERFS_I(inode)->nopack +*/ +int reiserfs_unpack(struct inode *inode, struct file *filp) +{ + int retval = 0; + int depth; + int index; + struct page *page; + struct address_space *mapping; + unsigned long write_from; + unsigned long blocksize = inode->i_sb->s_blocksize; + + if (inode->i_size == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + return 0; + } + /* ioctl already done */ + if (REISERFS_I(inode)->i_flags & i_nopack_mask) { + return 0; + } + + depth = reiserfs_write_lock_once(inode->i_sb); + + /* we need to make sure nobody is changing the file size beneath us */ + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + + write_from = inode->i_size & (blocksize - 1); + /* if we are on a block boundary, we are already unpacked. */ + if (write_from == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + goto out; + } + + /* we unpack by finding the page with the tail, and calling + ** __reiserfs_write_begin on that page. This will force a + ** reiserfs_get_block to unpack the tail for us. 
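The page and offset come from plain arithmetic on i_size: the tail's page index is i_size >> PAGE_CACHE_SHIFT and its offset within the block is i_size & (blocksize - 1), where an offset of zero means the file already ends on a block boundary and there is nothing to unpack. A stand-alone check of that arithmetic, assuming 4 KB pages and a 4 KB block size purely for illustration:

#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12		/* 4096-byte pages, an assumption for the demo */
#define EXAMPLE_BLOCKSIZE  4096UL	/* example block size */

int main(void)
{
	unsigned long long sizes[] = { 0, 1000, 4096, 9300 };

	for (int i = 0; i < 4; i++) {
		unsigned long long isize = sizes[i];
		unsigned long write_from = isize & (EXAMPLE_BLOCKSIZE - 1);
		unsigned long long index = isize >> EXAMPLE_PAGE_SHIFT;

		if (write_from == 0)
			printf("i_size=%llu: on a block boundary, already unpacked\n", isize);
		else
			printf("i_size=%llu: tail lives in page %llu at offset %lu\n",
			       isize, index, write_from);
	}
	return 0;
}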
+ */ + index = inode->i_size >> PAGE_CACHE_SHIFT; + mapping = inode->i_mapping; + page = grab_cache_page(mapping, index); + retval = -ENOMEM; + if (!page) { + goto out; + } + retval = __reiserfs_write_begin(page, write_from, 0); + if (retval) + goto out_unlock; + + /* conversion can change page contents, must flush */ + flush_dcache_page(page); + retval = reiserfs_commit_write(NULL, page, write_from, write_from); + REISERFS_I(inode)->i_flags |= i_nopack_mask; + + out_unlock: + unlock_page(page); + page_cache_release(page); + + out: + mutex_unlock(&inode->i_mutex); + reiserfs_write_unlock_once(inode->i_sb, depth); + return retval; +} diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c new file mode 100644 index 00000000..72cb1cc5 --- /dev/null +++ b/fs/reiserfs/item_ops.c @@ -0,0 +1,756 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include + +// this contains item handlers for old item types: sd, direct, +// indirect, directory + +/* and where are the comments? how about saying where we can find an + explanation of each item handler method? -Hans */ + +////////////////////////////////////////////////////////////////////////////// +// stat data functions +// +static int sd_bytes_number(struct item_head *ih, int block_size) +{ + return 0; +} + +static void sd_decrement_key(struct cpu_key *key) +{ + key->on_disk_key.k_objectid--; + set_cpu_key_k_type(key, TYPE_ANY); + set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1)); +} + +static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) +{ + return 0; +} + +static char *print_time(time_t t) +{ + static char timebuf[256]; + + sprintf(timebuf, "%ld", t); + return timebuf; +} + +static void sd_print_item(struct item_head *ih, char *item) +{ + printk("\tmode | size | nlinks | first direct | mtime\n"); + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = (struct stat_data_v1 *)item; + + printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd), + sd_v1_size(sd), sd_v1_nlink(sd), + sd_v1_first_direct_byte(sd), + print_time(sd_v1_mtime(sd))); + } else { + struct stat_data *sd = (struct stat_data *)item; + + printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd), + (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), + sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); + } +} + +static void sd_check_item(struct item_head *ih, char *item) +{ + // FIXME: type something here! +} + +static int sd_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_STAT_DATA; + //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed? 
+ return 0; +} + +static int sd_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + BUG_ON(start_skip || end_skip); + return -1; +} + +static int sd_check_right(struct virtual_item *vi, int free) +{ + return -1; +} + +static int sd_part_size(struct virtual_item *vi, int first, int count) +{ + BUG_ON(count); + return 0; +} + +static int sd_unit_num(struct virtual_item *vi) +{ + return vi->vi_item_len - IH_SIZE; +} + +static void sd_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16100", + "STATDATA, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations stat_data_ops = { + .bytes_number = sd_bytes_number, + .decrement_key = sd_decrement_key, + .is_left_mergeable = sd_is_left_mergeable, + .print_item = sd_print_item, + .check_item = sd_check_item, + + .create_vi = sd_create_vi, + .check_left = sd_check_left, + .check_right = sd_check_right, + .part_size = sd_part_size, + .unit_num = sd_unit_num, + .print_vi = sd_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// direct item functions +// +static int direct_bytes_number(struct item_head *ih, int block_size) +{ + return ih_item_len(ih); +} + +// FIXME: this should probably switch to indirect as well +static void direct_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +static int direct_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + int version = le_key_version(key); + return ((le_key_k_offset(version, key) & (bsize - 1)) != 1); +} + +static void direct_print_item(struct item_head *ih, char *item) +{ + int j = 0; + +// return; + printk("\""); + while (j < ih_item_len(ih)) + printk("%c", item[j++]); + printk("\"\n"); +} + +static void direct_check_item(struct item_head *ih, char *item) +{ + // FIXME: type something here! +} + +static int direct_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_DIRECT; + //vi->vi_type |= VI_TYPE_DIRECT; + return 0; +} + +static int direct_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % 8; + return bytes ? 
: -1; +} + +static int direct_check_right(struct virtual_item *vi, int free) +{ + return direct_check_left(vi, free, 0, 0); +} + +static int direct_part_size(struct virtual_item *vi, int first, int count) +{ + return count; +} + +static int direct_unit_num(struct virtual_item *vi) +{ + return vi->vi_item_len - IH_SIZE; +} + +static void direct_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16101", + "DIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations direct_ops = { + .bytes_number = direct_bytes_number, + .decrement_key = direct_decrement_key, + .is_left_mergeable = direct_is_left_mergeable, + .print_item = direct_print_item, + .check_item = direct_check_item, + + .create_vi = direct_create_vi, + .check_left = direct_check_left, + .check_right = direct_check_right, + .part_size = direct_part_size, + .unit_num = direct_unit_num, + .print_vi = direct_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// indirect item functions +// + +static int indirect_bytes_number(struct item_head *ih, int block_size) +{ + return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih); +} + +// decrease offset, if it becomes 0, change type to stat data +static void indirect_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +// if it is not first item of the body, then it is mergeable +static int indirect_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + int version = le_key_version(key); + return (le_key_k_offset(version, key) != 1); +} + +// printing of indirect item +static void start_new_sequence(__u32 * start, int *len, __u32 new) +{ + *start = new; + *len = 1; +} + +static int sequence_finished(__u32 start, int *len, __u32 new) +{ + if (start == INT_MAX) + return 1; + + if (start == 0 && new == 0) { + (*len)++; + return 0; + } + if (start != 0 && (start + *len) == new) { + (*len)++; + return 0; + } + return 1; +} + +static void print_sequence(__u32 start, int len) +{ + if (start == INT_MAX) + return; + + if (len == 1) + printk(" %d", start); + else + printk(" %d(%d)", start, len); +} + +static void indirect_print_item(struct item_head *ih, char *item) +{ + int j; + __le32 *unp; + __u32 prev = INT_MAX; + int num = 0; + + unp = (__le32 *) item; + + if (ih_item_len(ih) % UNFM_P_SIZE) + reiserfs_warning(NULL, "reiserfs-16102", "invalid item len"); + + printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih)); + for (j = 0; j < I_UNFM_NUM(ih); j++) { + if (sequence_finished(prev, &num, get_block_num(unp, j))) { + print_sequence(prev, num); + start_new_sequence(&prev, &num, get_block_num(unp, j)); + } + } + print_sequence(prev, num); + printk("]\n"); +} + +static void indirect_check_item(struct item_head *ih, char *item) +{ + // FIXME: type something here! +} + +static int indirect_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_INDIRECT; + //vi->vi_type |= VI_TYPE_INDIRECT; + return 0; +} + +static int indirect_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % UNFM_P_SIZE; + return bytes ? : -1; +} + +static int indirect_check_right(struct virtual_item *vi, int free) +{ + return indirect_check_left(vi, free, 0, 0); +} + +// return size in bytes of 'units' units. 
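indirect_print_item above compresses the list of unformatted block numbers into runs: contiguous blocks print as "start(len)", isolated blocks print alone, and zeroes (holes) coalesce too. The same run detection lifted into a small stand-alone program, with made-up block numbers and UINT_MAX standing in for the kernel's INT_MAX sentinel:

#include <limits.h>
#include <stdio.h>

static void start_new_sequence(unsigned *start, int *len, unsigned new_blk)
{
	*start = new_blk;
	*len = 1;
}

static int sequence_finished(unsigned start, int *len, unsigned new_blk)
{
	if (start == UINT_MAX)
		return 1;			/* nothing accumulated yet */
	if (start == 0 && new_blk == 0) {	/* holes coalesce as well */
		(*len)++;
		return 0;
	}
	if (start != 0 && start + *len == new_blk) {	/* contiguous run continues */
		(*len)++;
		return 0;
	}
	return 1;
}

static void print_sequence(unsigned start, int len)
{
	if (start == UINT_MAX)
		return;
	if (len == 1)
		printf(" %u", start);
	else
		printf(" %u(%d)", start, len);
}

int main(void)
{
	/* made-up block numbers: two runs, a two-block hole, one singleton */
	unsigned blocks[] = { 100, 101, 102, 0, 0, 200, 300, 301 };
	unsigned prev = UINT_MAX;
	int num = 0;

	printf("[");
	for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		if (sequence_finished(prev, &num, blocks[i])) {
			print_sequence(prev, num);
			start_new_sequence(&prev, &num, blocks[i]);
		}
	}
	print_sequence(prev, num);
	printf(" ]\n");			/* prints: [ 100(3) 0(2) 200 300(2) ] */
	return 0;
}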
If first == 0 - calculate from the head (left), otherwise - from tail (right) +static int indirect_part_size(struct virtual_item *vi, int first, int units) +{ + // unit of indirect item is byte (yet) + return units; +} + +static int indirect_unit_num(struct virtual_item *vi) +{ + // unit of indirect item is byte (yet) + return vi->vi_item_len - IH_SIZE; +} + +static void indirect_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16103", + "INDIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations indirect_ops = { + .bytes_number = indirect_bytes_number, + .decrement_key = indirect_decrement_key, + .is_left_mergeable = indirect_is_left_mergeable, + .print_item = indirect_print_item, + .check_item = indirect_check_item, + + .create_vi = indirect_create_vi, + .check_left = indirect_check_left, + .check_right = indirect_check_right, + .part_size = indirect_part_size, + .unit_num = indirect_unit_num, + .print_vi = indirect_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// direntry functions +// + +static int direntry_bytes_number(struct item_head *ih, int block_size) +{ + reiserfs_warning(NULL, "vs-16090", + "bytes number is asked for direntry"); + return 0; +} + +static void direntry_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +static int direntry_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET) + return 0; + return 1; + +} + +static void direntry_print_item(struct item_head *ih, char *item) +{ + int i; + int namelen; + struct reiserfs_de_head *deh; + char *name; + static char namebuf[80]; + + printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", + "Key of pointed object", "Hash", "Gen number", "Status"); + + deh = (struct reiserfs_de_head *)item; + + for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + namelen = + (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - + deh_location(deh); + name = item + deh_location(deh); + if (name[namelen - 1] == 0) + namelen = strlen(name); + namebuf[0] = '"'; + if (namelen > sizeof(namebuf) - 3) { + strncpy(namebuf + 1, name, sizeof(namebuf) - 3); + namebuf[sizeof(namebuf) - 2] = '"'; + namebuf[sizeof(namebuf) - 1] = 0; + } else { + memcpy(namebuf + 1, name, namelen); + namebuf[namelen + 1] = '"'; + namebuf[namelen + 2] = 0; + } + + printk("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", + i, namebuf, + deh_dir_id(deh), deh_objectid(deh), + GET_HASH_VALUE(deh_offset(deh)), + GET_GENERATION_NUMBER((deh_offset(deh))), + (de_hidden(deh)) ? "HIDDEN" : "VISIBLE"); + } +} + +static void direntry_check_item(struct item_head *ih, char *item) +{ + int i; + struct reiserfs_de_head *deh; + + // FIXME: type something here! 
+ deh = (struct reiserfs_de_head *)item; + for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + ; + } +} + +#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 + +/* + * function returns old entry number in directory item in real node + * using new entry number in virtual item in virtual node */ +static inline int old_entry_num(int is_affected, int virtual_entry_num, + int pos_in_item, int mode) +{ + if (mode == M_INSERT || mode == M_DELETE) + return virtual_entry_num; + + if (!is_affected) + /* cut or paste is applied to another item */ + return virtual_entry_num; + + if (virtual_entry_num < pos_in_item) + return virtual_entry_num; + + if (mode == M_CUT) + return virtual_entry_num + 1; + + RFALSE(mode != M_PASTE || virtual_entry_num == 0, + "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", + mode); + + return virtual_entry_num - 1; +} + +/* Create an array of sizes of directory entries for virtual + item. Return space used by an item. FIXME: no control over + consuming of space used by this item handler */ +static int direntry_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + int i, j; + int size = sizeof(struct direntry_uarea); + struct reiserfs_de_head *deh; + + vi->vi_index = TYPE_DIRENTRY; + + BUG_ON(!(vi->vi_ih) || !vi->vi_item); + + dir_u->flags = 0; + if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET) + dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; + + deh = (struct reiserfs_de_head *)(vi->vi_item); + + /* virtual directory item have this amount of entry after */ + dir_u->entry_count = ih_entry_count(vi->vi_ih) + + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : + (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); + + for (i = 0; i < dir_u->entry_count; i++) { + j = old_entry_num(is_affected, i, vn->vn_pos_in_item, + vn->vn_mode); + dir_u->entry_sizes[i] = + (j ? deh_location(&(deh[j - 1])) : ih_item_len(vi->vi_ih)) - + deh_location(&(deh[j])) + DEH_SIZE; + } + + size += (dir_u->entry_count * sizeof(short)); + + /* set size of pasted entry */ + if (is_affected && vn->vn_mode == M_PASTE) + dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; + +#ifdef CONFIG_REISERFS_CHECK + /* compare total size of entries with item length */ + { + int k, l; + + l = 0; + for (k = 0; k < dir_u->entry_count; k++) + l += dir_u->entry_sizes[k]; + + if (l + IH_SIZE != vi->vi_item_len + + ((is_affected + && (vn->vn_mode == M_PASTE + || vn->vn_mode == M_CUT)) ? insert_size : 0)) { + reiserfs_panic(NULL, "vs-8025", "(mode==%c, " + "insert_size==%d), invalid length of " + "directory item", + vn->vn_mode, insert_size); + } + } +#endif + + return size; + +} + +// +// return number of entries which may fit into specified amount of +// free space, or -1 if free space is not enough even for 1 entry +// +static int direntry_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; + + for (i = start_skip; i < dir_u->entry_count - end_skip; i++) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; + + free -= dir_u->entry_sizes[i]; + entries++; + } + + if (entries == dir_u->entry_count) { + reiserfs_panic(NULL, "item_ops-1", + "free space %d, entry_count %d", free, + dir_u->entry_count); + } + + /* "." and ".." can not be separated from each other */ + if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries < 2) + entries = 0; + + return entries ? 
: -1; +} + +static int direntry_check_right(struct virtual_item *vi, int free) +{ + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; + + for (i = dir_u->entry_count - 1; i >= 0; i--) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; + + free -= dir_u->entry_sizes[i]; + entries++; + } + BUG_ON(entries == dir_u->entry_count); + + /* "." and ".." can not be separated from each other */ + if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries > dir_u->entry_count - 2) + entries = dir_u->entry_count - 2; + + return entries ? : -1; +} + +/* sum of entry sizes between from-th and to-th entries including both edges */ +static int direntry_part_size(struct virtual_item *vi, int first, int count) +{ + int i, retval; + int from, to; + struct direntry_uarea *dir_u = vi->vi_uarea; + + retval = 0; + if (first == 0) + from = 0; + else + from = dir_u->entry_count - count; + to = from + count - 1; + + for (i = from; i <= to; i++) + retval += dir_u->entry_sizes[i]; + + return retval; +} + +static int direntry_unit_num(struct virtual_item *vi) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + + return dir_u->entry_count; +} + +static void direntry_print_vi(struct virtual_item *vi) +{ + int i; + struct direntry_uarea *dir_u = vi->vi_uarea; + + reiserfs_warning(NULL, "reiserfs-16104", + "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", + vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); + printk("%d entries: ", dir_u->entry_count); + for (i = 0; i < dir_u->entry_count; i++) + printk("%d ", dir_u->entry_sizes[i]); + printk("\n"); +} + +static struct item_operations direntry_ops = { + .bytes_number = direntry_bytes_number, + .decrement_key = direntry_decrement_key, + .is_left_mergeable = direntry_is_left_mergeable, + .print_item = direntry_print_item, + .check_item = direntry_check_item, + + .create_vi = direntry_create_vi, + .check_left = direntry_check_left, + .check_right = direntry_check_right, + .part_size = direntry_part_size, + .unit_num = direntry_unit_num, + .print_vi = direntry_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// Error catching functions to catch errors caused by incorrect item types. +// +static int errcatch_bytes_number(struct item_head *ih, int block_size) +{ + reiserfs_warning(NULL, "green-16001", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_decrement_key(struct cpu_key *key) +{ + reiserfs_warning(NULL, "green-16002", + "Invalid item type observed, run fsck ASAP"); +} + +static int errcatch_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + reiserfs_warning(NULL, "green-16003", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_print_item(struct item_head *ih, char *item) +{ + reiserfs_warning(NULL, "green-16004", + "Invalid item type observed, run fsck ASAP"); +} + +static void errcatch_check_item(struct item_head *ih, char *item) +{ + reiserfs_warning(NULL, "green-16005", + "Invalid item type observed, run fsck ASAP"); +} + +static int errcatch_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + reiserfs_warning(NULL, "green-16006", + "Invalid item type observed, run fsck ASAP"); + return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where + // this operation is called from is of return type void. 
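Each item type supplies the same set of handlers through a struct of function pointers, with the errcatch_* handlers defined here trapping items whose type field is garbage, and dispatch happens uniformly through a table indexed by item type. A reduced sketch of that function-pointer-table pattern; the handlers, slot layout, and field set below are toy illustrations, not the kernel's actual item_operations:

#include <stdio.h>

struct item_operations {
	int (*bytes_number)(int item_len, int block_size);
	const char *name;
};

static int direct_bytes(int item_len, int block_size)   { return item_len; }
static int indirect_bytes(int item_len, int block_size) { return (item_len / 4) * block_size; }
static int errcatch_bytes(int item_len, int block_size)
{
	fprintf(stderr, "invalid item type observed, run fsck ASAP\n");
	return 0;
}

static const struct item_operations direct_ops   = { direct_bytes,   "direct"   };
static const struct item_operations indirect_ops = { indirect_bytes, "indirect" };
static const struct item_operations errcatch_ops = { errcatch_bytes, "errcatch" };

/* index plays the role of the on-disk item type in this toy */
static const struct item_operations *item_ops[4] = {
	&direct_ops, &indirect_ops, &errcatch_ops, &errcatch_ops,
};

int main(void)
{
	int type = 1;		/* pretend we read an indirect item of 16 bytes */
	printf("%s item spans %d bytes\n", item_ops[type]->name,
	       item_ops[type]->bytes_number(16, 4096));
	return 0;
}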
+} + +static int errcatch_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + reiserfs_warning(NULL, "green-16007", + "Invalid item type observed, run fsck ASAP"); + return -1; +} + +static int errcatch_check_right(struct virtual_item *vi, int free) +{ + reiserfs_warning(NULL, "green-16008", + "Invalid item type observed, run fsck ASAP"); + return -1; +} + +static int errcatch_part_size(struct virtual_item *vi, int first, int count) +{ + reiserfs_warning(NULL, "green-16009", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static int errcatch_unit_num(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "green-16010", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "green-16011", + "Invalid item type observed, run fsck ASAP"); +} + +static struct item_operations errcatch_ops = { + errcatch_bytes_number, + errcatch_decrement_key, + errcatch_is_left_mergeable, + errcatch_print_item, + errcatch_check_item, + + errcatch_create_vi, + errcatch_check_left, + errcatch_check_right, + errcatch_part_size, + errcatch_unit_num, + errcatch_print_vi +}; + +////////////////////////////////////////////////////////////////////////////// +// +// +#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) +#error Item types must use disk-format assigned values. +#endif + +struct item_operations *item_ops[TYPE_ANY + 1] = { + &stat_data_ops, + &indirect_ops, + &direct_ops, + &direntry_ops, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ +}; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c new file mode 100644 index 00000000..c5e82ece --- /dev/null +++ b/fs/reiserfs/journal.c @@ -0,0 +1,4319 @@ +/* +** Write ahead logging implementation copyright Chris Mason 2000 +** +** The background commits make this code very interrelated, and +** overly complex. I need to rethink things a bit....The major players: +** +** journal_begin -- call with the number of blocks you expect to log. +** If the current transaction is too +** old, it will block until the current transaction is +** finished, and then start a new one. +** Usually, your transaction will get joined in with +** previous ones for speed. +** +** journal_join -- same as journal_begin, but won't block on the current +** transaction regardless of age. Don't ever call +** this. Ever. There are only two places it should be +** called from, and they are both inside this file. +** +** journal_mark_dirty -- adds blocks into this transaction. clears any flags +** that might make them get sent to disk +** and then marks them BH_JDirty. Puts the buffer head +** into the current transaction hash. +** +** journal_end -- if the current transaction is batchable, it does nothing +** otherwise, it could do an async/synchronous commit, or +** a full flush of all log and real blocks in the +** transaction. +** +** flush_old_commits -- if the current transaction is too old, it is ended and +** commit blocks are sent to disk. Forces commit blocks +** to disk for all backgrounded commits that have been +** around too long. 
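The calling convention described above is the one already used by the write_end path earlier in this patch: reserve a block count with journal_begin, make the change while marking the inode dirty inside the running handle, then journal_end with the same count, checking errors at both ends. A condensed sketch of that shape only; the helper name and the one-block estimate are illustrative, while the journal calls themselves are the entry points named in this comment and used earlier in the patch:

/* Sketch: a short metadata update under the journal, following the same
 * calling convention as reiserfs_write_end() above. Not standalone code.
 */
static int example_update_inode_size(struct inode *inode, loff_t new_size)
{
	struct reiserfs_transaction_handle th;
	int jblocks = 1;	/* blocks we expect to log (illustrative estimate) */
	int err;

	err = journal_begin(&th, inode->i_sb, jblocks);
	if (err)
		return err;

	inode->i_size = new_size;
	reiserfs_update_inode_transaction(inode);
	mark_inode_dirty(inode);	/* nests into the running transaction */
	reiserfs_update_sd(&th, inode);

	return journal_end(&th, inode->i_sb, jblocks);
}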
+** -- Note, if you call this as an immediate flush from +** from within kupdate, it will ignore the immediate flag +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* gets a struct reiserfs_journal_list * from a list head */ +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_list)) +#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_working_list)) + +/* the number of mounted filesystems. This is used to decide when to +** start and kill the commit workqueue +*/ +static int reiserfs_mounted_fs_count; + +static struct workqueue_struct *commit_wq; + +#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit + structs at 4k */ +#define BUFNR 64 /*read ahead */ + +/* cnode stat bits. Move these into reiserfs_fs.h */ + +#define BLOCK_FREED 2 /* this block was freed, and can't be written. */ +#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ + +#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ +#define BLOCK_DIRTIED 5 + +/* journal list state bits */ +#define LIST_TOUCHED 1 +#define LIST_DIRTY 2 +#define LIST_COMMIT_PENDING 4 /* someone will commit this list */ + +/* flags for do_journal_end */ +#define FLUSH_ALL 1 /* flush commit and real blocks */ +#define COMMIT_NOW 2 /* end and commit this transaction */ +#define WAIT 4 /* wait for the log blocks to hit the disk */ + +static int do_journal_end(struct reiserfs_transaction_handle *, + struct super_block *, unsigned long nblocks, + int flags); +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int flush_commit_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int can_dirty(struct reiserfs_journal_cnode *cn); +static int journal_join(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks); +static int release_journal_dev(struct super_block *super, + struct reiserfs_journal *journal); +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl); +static void flush_async_commits(struct work_struct *work); +static void queue_log_writer(struct super_block *s); + +/* values for join in do_journal_begin_r */ +enum { + JBEGIN_REG = 0, /* regular journal begin */ + JBEGIN_JOIN = 1, /* join the running transaction if at all possible */ + JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */ +}; + +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, + struct super_block *sb, + unsigned long nblocks, int join); + +static void init_journal_hash(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + memset(journal->j_hash_table, 0, + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); +} + +/* +** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to +** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for +** more details. 
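The JOURNAL_LIST_ENTRY and JOURNAL_WORK_ENTRY macros above are thin wrappers around list_entry(), the container_of idiom: given a pointer to the embedded list_head, subtract its offset within the containing struct to get back to the journal list. A stand-alone illustration of that pointer arithmetic with a toy struct:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* same idea as the kernel's list_entry()/container_of() */
#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct journal_list_toy {
	unsigned trans_id;
	struct list_head j_list;	/* embedded linkage */
};

int main(void)
{
	struct journal_list_toy jl = { .trans_id = 42 };
	struct list_head *node = &jl.j_list;	/* what a list walk hands us */

	struct journal_list_toy *back =
		list_entry(node, struct journal_list_toy, j_list);
	printf("recovered trans_id = %u\n", back->trans_id);	/* prints 42 */
	return 0;
}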
+*/ +static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) +{ + if (bh) { + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + } + return 0; +} + +static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block + *sb) +{ + struct reiserfs_bitmap_node *bn; + static int id; + + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); + if (!bn) { + return NULL; + } + bn->data = kzalloc(sb->s_blocksize, GFP_NOFS); + if (!bn->data) { + kfree(bn); + return NULL; + } + bn->id = id++; + INIT_LIST_HEAD(&bn->list); + return bn; +} + +static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_bitmap_node *bn = NULL; + struct list_head *entry = journal->j_bitmap_nodes.next; + + journal->j_used_bitmap_nodes++; + repeat: + + if (entry != &journal->j_bitmap_nodes) { + bn = list_entry(entry, struct reiserfs_bitmap_node, list); + list_del(entry); + memset(bn->data, 0, sb->s_blocksize); + journal->j_free_bitmap_nodes--; + return bn; + } + bn = allocate_bitmap_node(sb); + if (!bn) { + yield(); + goto repeat; + } + return bn; +} +static inline void free_bitmap_node(struct super_block *sb, + struct reiserfs_bitmap_node *bn) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + journal->j_used_bitmap_nodes--; + if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { + kfree(bn->data); + kfree(bn); + } else { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } +} + +static void allocate_bitmap_nodes(struct super_block *sb) +{ + int i; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_bitmap_node *bn = NULL; + for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { + bn = allocate_bitmap_node(sb); + if (bn) { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } else { + break; /* this is ok, we'll try again when more are needed */ + } + } +} + +static int set_bit_in_list_bitmap(struct super_block *sb, + b_blocknr_t block, + struct reiserfs_list_bitmap *jb) +{ + unsigned int bmap_nr = block / (sb->s_blocksize << 3); + unsigned int bit_nr = block % (sb->s_blocksize << 3); + + if (!jb->bitmaps[bmap_nr]) { + jb->bitmaps[bmap_nr] = get_bitmap_node(sb); + } + set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); + return 0; +} + +static void cleanup_bitmap_list(struct super_block *sb, + struct reiserfs_list_bitmap *jb) +{ + int i; + if (jb->bitmaps == NULL) + return; + + for (i = 0; i < reiserfs_bmap_count(sb); i++) { + if (jb->bitmaps[i]) { + free_bitmap_node(sb, jb->bitmaps[i]); + jb->bitmaps[i] = NULL; + } + } +} + +/* +** only call this on FS unmount. +*/ +static int free_list_bitmaps(struct super_block *sb, + struct reiserfs_list_bitmap *jb_array) +{ + int i; + struct reiserfs_list_bitmap *jb; + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + jb = jb_array + i; + jb->journal_list = NULL; + cleanup_bitmap_list(sb, jb); + vfree(jb->bitmaps); + jb->bitmaps = NULL; + } + return 0; +} + +static int free_bitmap_nodes(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct list_head *next = journal->j_bitmap_nodes.next; + struct reiserfs_bitmap_node *bn; + + while (next != &journal->j_bitmap_nodes) { + bn = list_entry(next, struct reiserfs_bitmap_node, list); + list_del(next); + kfree(bn->data); + kfree(bn); + next = journal->j_bitmap_nodes.next; + journal->j_free_bitmap_nodes--; + } + + return 0; +} + +/* +** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. 
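set_bit_in_list_bitmap above turns a block number into a (bitmap node, bit) pair: each node covers blocksize * 8 blocks, so bmap_nr is the block divided by that span and bit_nr is the remainder. A quick stand-alone check of that arithmetic, assuming a 4 KB block size (so 32768 blocks per bitmap node) for the example:

#include <stdio.h>

int main(void)
{
	unsigned long blocksize = 4096;			/* bytes per block (example value) */
	unsigned long bits_per_node = blocksize << 3;	/* 32768 blocks per bitmap node */
	unsigned long blocks[] = { 0, 32767, 32768, 100000 };

	for (int i = 0; i < 4; i++) {
		unsigned long block = blocks[i];
		unsigned int bmap_nr = block / bits_per_node;
		unsigned int bit_nr  = block % bits_per_node;
		printf("block %6lu maps to bitmap node %u, bit %u\n",
		       block, bmap_nr, bit_nr);
	}
	return 0;
}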
+** jb_array is the array to be filled in. +*/ +int reiserfs_allocate_list_bitmaps(struct super_block *sb, + struct reiserfs_list_bitmap *jb_array, + unsigned int bmap_nr) +{ + int i; + int failed = 0; + struct reiserfs_list_bitmap *jb; + int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *); + + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + jb = jb_array + i; + jb->journal_list = NULL; + jb->bitmaps = vmalloc(mem); + if (!jb->bitmaps) { + reiserfs_warning(sb, "clm-2000", "unable to " + "allocate bitmaps for journal lists"); + failed = 1; + break; + } + memset(jb->bitmaps, 0, mem); + } + if (failed) { + free_list_bitmaps(sb, jb_array); + return -1; + } + return 0; +} + +/* +** find an available list bitmap. If you can't find one, flush a commit list +** and try again +*/ +static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, + struct reiserfs_journal_list + *jl) +{ + int i, j; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_list_bitmap *jb = NULL; + + for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) { + i = journal->j_list_bitmap_index; + journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS; + jb = journal->j_list_bitmap + i; + if (journal->j_list_bitmap[i].journal_list) { + flush_commit_list(sb, + journal->j_list_bitmap[i]. + journal_list, 1); + if (!journal->j_list_bitmap[i].journal_list) { + break; + } + } else { + break; + } + } + if (jb->journal_list) { /* double check to make sure if flushed correctly */ + return NULL; + } + jb->journal_list = jl; + return jb; +} + +/* +** allocates a new chunk of X nodes, and links them all together as a list. +** Uses the cnode->next and cnode->prev pointers +** returns NULL on failure +*/ +static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) +{ + struct reiserfs_journal_cnode *head; + int i; + if (num_cnodes <= 0) { + return NULL; + } + head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); + if (!head) { + return NULL; + } + memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)); + head[0].prev = NULL; + head[0].next = head + 1; + for (i = 1; i < num_cnodes; i++) { + head[i].prev = head + (i - 1); + head[i].next = head + (i + 1); /* if last one, overwrite it after the if */ + } + head[num_cnodes - 1].next = NULL; + return head; +} + +/* +** pulls a cnode off the free list, or returns NULL on failure +*/ +static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + reiserfs_check_lock_depth(sb, "get_cnode"); + + if (journal->j_cnode_free <= 0) { + return NULL; + } + journal->j_cnode_used++; + journal->j_cnode_free--; + cn = journal->j_cnode_free_list; + if (!cn) { + return cn; + } + if (cn->next) { + cn->next->prev = NULL; + } + journal->j_cnode_free_list = cn->next; + memset(cn, 0, sizeof(struct reiserfs_journal_cnode)); + return cn; +} + +/* +** returns a cnode to the free list +*/ +static void free_cnode(struct super_block *sb, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + reiserfs_check_lock_depth(sb, "free_cnode"); + + journal->j_cnode_used--; + journal->j_cnode_free++; + /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ + cn->next = journal->j_cnode_free_list; + if (journal->j_cnode_free_list) { + journal->j_cnode_free_list->prev = cn; + } + cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */ + journal->j_cnode_free_list = cn; 
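allocate_cnodes carves one allocated array into a doubly linked free list, and get_cnode/free_cnode then simply pop from and push to its head. The same free-list discipline in miniature, with malloc standing in for vmalloc and no per-superblock journal:

#include <stdlib.h>
#include <stdio.h>

struct cnode { struct cnode *prev, *next; int payload; };

struct pool { struct cnode *free_list; struct cnode *mem; };

static int pool_init(struct pool *p, int n)
{
	p->mem = calloc(n, sizeof(*p->mem));
	if (!p->mem)
		return -1;
	for (int i = 0; i < n; i++) {		/* link the array into one list */
		p->mem[i].prev = i ? &p->mem[i - 1] : NULL;
		p->mem[i].next = (i + 1 < n) ? &p->mem[i + 1] : NULL;
	}
	p->free_list = &p->mem[0];
	return 0;
}

static struct cnode *get_cnode(struct pool *p)		/* pop the head */
{
	struct cnode *cn = p->free_list;
	if (!cn)
		return NULL;
	if (cn->next)
		cn->next->prev = NULL;
	p->free_list = cn->next;
	cn->prev = cn->next = NULL;
	return cn;
}

static void free_cnode(struct pool *p, struct cnode *cn)	/* push onto the head */
{
	cn->next = p->free_list;
	if (p->free_list)
		p->free_list->prev = cn;
	cn->prev = NULL;
	p->free_list = cn;
}

int main(void)
{
	struct pool p;
	if (pool_init(&p, 4))
		return 1;
	struct cnode *a = get_cnode(&p), *b = get_cnode(&p);
	free_cnode(&p, a);
	printf("free-list head is %s the node we just returned\n",
	       p.free_list == a ? "again" : "not");
	free_cnode(&p, b);
	free(p.mem);
	return 0;
}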
+} + +static void clear_prepared_bits(struct buffer_head *bh) +{ + clear_buffer_journal_prepared(bh); + clear_buffer_journal_restore_dirty(bh); +} + +/* return a cnode with same dev, block number and size in table, or null if not found */ +static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct + super_block + *sb, + struct + reiserfs_journal_cnode + **table, + long bl) +{ + struct reiserfs_journal_cnode *cn; + cn = journal_hash(table, sb, bl); + while (cn) { + if (cn->blocknr == bl && cn->sb == sb) + return cn; + cn = cn->hnext; + } + return (struct reiserfs_journal_cnode *)0; +} + +/* +** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated +** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever +** being overwritten by a replay after crashing. +** +** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting +** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make +** sure you never write the block without logging it. +** +** next_zero_bit is a suggestion about the next block to try for find_forward. +** when bl is rejected because it is set in a journal list bitmap, we search +** for the next zero bit in the bitmap that rejected bl. Then, we return that +** through next_zero_bit for find_forward to try. +** +** Just because we return something in next_zero_bit does not mean we won't +** reject it on the next call to reiserfs_in_journal +** +*/ +int reiserfs_in_journal(struct super_block *sb, + unsigned int bmap_nr, int bit_nr, int search_all, + b_blocknr_t * next_zero_bit) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn; + struct reiserfs_list_bitmap *jb; + int i; + unsigned long bl; + + *next_zero_bit = 0; /* always start this at zero. */ + + PROC_INFO_INC(sb, journal.in_journal); + /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. + ** if we crash before the transaction that freed it commits, this transaction won't + ** have committed either, and the block will never be written + */ + if (search_all) { + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + PROC_INFO_INC(sb, journal.in_journal_bitmap); + jb = journal->j_list_bitmap + i; + if (jb->journal_list && jb->bitmaps[bmap_nr] && + test_bit(bit_nr, + (unsigned long *)jb->bitmaps[bmap_nr]-> + data)) { + *next_zero_bit = + find_next_zero_bit((unsigned long *) + (jb->bitmaps[bmap_nr]-> + data), + sb->s_blocksize << 3, + bit_nr + 1); + return 1; + } + } + } + + bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr; + /* is it in any old transactions? */ + if (search_all + && (cn = + get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) { + return 1; + } + + /* is it in the current transaction. 
This should never happen */ + if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) { + BUG(); + return 1; + } + + PROC_INFO_INC(sb, journal.in_journal_reusable); + /* safe for reuse */ + return 0; +} + +/* insert cn into table +*/ +static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal_cnode *cn_orig; + + cn_orig = journal_hash(table, cn->sb, cn->blocknr); + cn->hnext = cn_orig; + cn->hprev = NULL; + if (cn_orig) { + cn_orig->hprev = cn; + } + journal_hash(table, cn->sb, cn->blocknr) = cn; +} + +/* lock the current transaction */ +static inline void lock_journal(struct super_block *sb) +{ + PROC_INFO_INC(sb, journal.lock_journal); + + reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb); +} + +/* unlock the current transaction */ +static inline void unlock_journal(struct super_block *sb) +{ + mutex_unlock(&SB_JOURNAL(sb)->j_mutex); +} + +static inline void get_journal_list(struct reiserfs_journal_list *jl) +{ + jl->j_refcount++; +} + +static inline void put_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + if (jl->j_refcount < 1) { + reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d", + jl->j_trans_id, jl->j_refcount); + } + if (--jl->j_refcount == 0) + kfree(jl); +} + +/* +** this used to be much more involved, and I'm keeping it just in case things get ugly again. +** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a +** transaction. +*/ +static void cleanup_freed_for_journal_list(struct super_block *sb, + struct reiserfs_journal_list *jl) +{ + + struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; + if (jb) { + cleanup_bitmap_list(sb, jb); + } + jl->j_list_bitmap->journal_list = NULL; + jl->j_list_bitmap = NULL; +} + +static int journal_list_still_alive(struct super_block *s, + unsigned int trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct list_head *entry = &journal->j_journal_list; + struct reiserfs_journal_list *jl; + + if (!list_empty(entry)) { + jl = JOURNAL_LIST_ENTRY(entry->next); + if (jl->j_trans_id <= trans_id) { + return 1; + } + } + return 0; +} + +/* + * If page->mapping was null, we failed to truncate this page for + * some reason. Most likely because it was truncated after being + * logged via data=journal. + * + * This does a check to see if the buffer belongs to one of these + * lost pages before doing the final put_bh. If page->mapping was + * null, it tries to free buffers on the page, which should make the + * final page_cache_release drop the page from the lru. 
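get_journal_hash_dev and insert_journal_hash above maintain an open hash table keyed by block number: each bucket is a doubly linked chain threaded through hnext/hprev, new cnodes are pushed at the chain head, and lookups walk the chain comparing blocknr (and the superblock, which the toy below drops). A reduced version of that bucket handling with a tiny fixed-size table:

#include <stdio.h>

#define HASH_SIZE 8			/* tiny table, just for the example */

struct cnode { unsigned long blocknr; struct cnode *hnext, *hprev; };

static struct cnode *table[HASH_SIZE];

static unsigned bucket(unsigned long blocknr) { return blocknr % HASH_SIZE; }

static void insert_hash(struct cnode *cn)
{
	struct cnode *head = table[bucket(cn->blocknr)];
	cn->hnext = head;
	cn->hprev = NULL;
	if (head)
		head->hprev = cn;
	table[bucket(cn->blocknr)] = cn;
}

static struct cnode *lookup_hash(unsigned long blocknr)
{
	for (struct cnode *cn = table[bucket(blocknr)]; cn; cn = cn->hnext)
		if (cn->blocknr == blocknr)
			return cn;
	return NULL;
}

int main(void)
{
	struct cnode a = { .blocknr = 17 }, b = { .blocknr = 25 };	/* collide in bucket 1 */
	insert_hash(&a);
	insert_hash(&b);
	printf("block 17 %sfound, block 99 %sfound\n",
	       lookup_hash(17) ? "" : "not ", lookup_hash(99) ? "" : "not ");
	return 0;
}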
+ */ +static void release_buffer_page(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + if (!page->mapping && trylock_page(page)) { + page_cache_get(page); + put_bh(bh); + if (!page->mapping) + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + } else { + put_bh(bh); + } +} + +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (buffer_journaled(bh)) { + reiserfs_warning(NULL, "clm-2084", + "pinned buffer %lu:%s sent to disk", + bh->b_blocknr, bdevname(bh->b_bdev, b)); + } + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + + unlock_buffer(bh); + release_buffer_page(bh); +} + +static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) +{ + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); + put_bh(bh); +} + +static void submit_logged_buffer(struct buffer_head *bh) +{ + get_bh(bh); + bh->b_end_io = reiserfs_end_buffer_io_sync; + clear_buffer_journal_new(bh); + clear_buffer_dirty(bh); + if (!test_clear_buffer_journal_test(bh)) + BUG(); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh); +} + +static void submit_ordered_buffer(struct buffer_head *bh) +{ + get_bh(bh); + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh); +} + +#define CHUNK_SIZE 32 +struct buffer_chunk { + struct buffer_head *bh[CHUNK_SIZE]; + int nr; +}; + +static void write_chunk(struct buffer_chunk *chunk) +{ + int i; + get_fs_excl(); + for (i = 0; i < chunk->nr; i++) { + submit_logged_buffer(chunk->bh[i]); + } + chunk->nr = 0; + put_fs_excl(); +} + +static void write_ordered_chunk(struct buffer_chunk *chunk) +{ + int i; + get_fs_excl(); + for (i = 0; i < chunk->nr; i++) { + submit_ordered_buffer(chunk->bh[i]); + } + chunk->nr = 0; + put_fs_excl(); +} + +static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, + spinlock_t * lock, void (fn) (struct buffer_chunk *)) +{ + int ret = 0; + BUG_ON(chunk->nr >= CHUNK_SIZE); + chunk->bh[chunk->nr++] = bh; + if (chunk->nr >= CHUNK_SIZE) { + ret = 1; + if (lock) + spin_unlock(lock); + fn(chunk); + if (lock) + spin_lock(lock); + } + return ret; +} + +static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); +static struct reiserfs_jh *alloc_jh(void) +{ + struct reiserfs_jh *jh; + while (1) { + jh = kmalloc(sizeof(*jh), GFP_NOFS); + if (jh) { + atomic_inc(&nr_reiserfs_jh); + return jh; + } + yield(); + } +} + +/* + * we want to free the jh when the buffer has been written + * and waited on + */ +void reiserfs_free_jh(struct buffer_head *bh) +{ + struct reiserfs_jh *jh; + + jh = bh->b_private; + if (jh) { + bh->b_private = NULL; + jh->bh = NULL; + list_del_init(&jh->list); + kfree(jh); + if (atomic_read(&nr_reiserfs_jh) <= 0) + BUG(); + atomic_dec(&nr_reiserfs_jh); + put_bh(bh); + } +} + +static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, + int tail) +{ + struct reiserfs_jh *jh; + + if (bh->b_private) { + spin_lock(&j->j_dirty_buffers_lock); + if (!bh->b_private) { + spin_unlock(&j->j_dirty_buffers_lock); + goto no_jh; + } + jh = bh->b_private; + list_del_init(&jh->list); + } else { + no_jh: + get_bh(bh); + jh = alloc_jh(); + spin_lock(&j->j_dirty_buffers_lock); + /* buffer must be locked for __add_jh, should be able to have + * two adds at the same time + */ + BUG_ON(bh->b_private); + jh->bh = bh; + bh->b_private = jh; + } + jh->jl = j->j_current_jl; + if (tail) + 
list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); + else { + list_add_tail(&jh->list, &jh->jl->j_bh_list); + } + spin_unlock(&j->j_dirty_buffers_lock); + return 0; +} + +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) +{ + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); +} +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) +{ + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); +} + +#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) +static int write_ordered_buffers(spinlock_t * lock, + struct reiserfs_journal *j, + struct reiserfs_journal_list *jl, + struct list_head *list) +{ + struct buffer_head *bh; + struct reiserfs_jh *jh; + int ret = j->j_errno; + struct buffer_chunk chunk; + struct list_head tmp; + INIT_LIST_HEAD(&tmp); + + chunk.nr = 0; + spin_lock(lock); + while (!list_empty(list)) { + jh = JH_ENTRY(list->next); + bh = jh->bh; + get_bh(bh); + if (!trylock_buffer(bh)) { + if (!buffer_dirty(bh)) { + list_move(&jh->list, &tmp); + goto loop_next; + } + spin_unlock(lock); + if (chunk.nr) + write_ordered_chunk(&chunk); + wait_on_buffer(bh); + cond_resched(); + spin_lock(lock); + goto loop_next; + } + /* in theory, dirty non-uptodate buffers should never get here, + * but the upper layer io error paths still have a few quirks. + * Handle them here as gracefully as we can + */ + if (!buffer_uptodate(bh) && buffer_dirty(bh)) { + clear_buffer_dirty(bh); + ret = -EIO; + } + if (buffer_dirty(bh)) { + list_move(&jh->list, &tmp); + add_to_chunk(&chunk, bh, lock, write_ordered_chunk); + } else { + reiserfs_free_jh(bh); + unlock_buffer(bh); + } + loop_next: + put_bh(bh); + cond_resched_lock(lock); + } + if (chunk.nr) { + spin_unlock(lock); + write_ordered_chunk(&chunk); + spin_lock(lock); + } + while (!list_empty(&tmp)) { + jh = JH_ENTRY(tmp.prev); + bh = jh->bh; + get_bh(bh); + reiserfs_free_jh(bh); + + if (buffer_locked(bh)) { + spin_unlock(lock); + wait_on_buffer(bh); + spin_lock(lock); + } + if (!buffer_uptodate(bh)) { + ret = -EIO; + } + /* ugly interaction with invalidatepage here. + * reiserfs_invalidate_page will pin any buffer that has a valid + * journal head from an older transaction. If someone else sets + * our buffer dirty after we write it in the first loop, and + * then someone truncates the page away, nobody will ever write + * the buffer. We're safe if we write the page one last time + * after freeing the journal header. 
+		 */
+		if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
+			spin_unlock(lock);
+			ll_rw_block(WRITE, 1, &bh);
+			spin_lock(lock);
+		}
+		put_bh(bh);
+		cond_resched_lock(lock);
+	}
+	spin_unlock(lock);
+	return ret;
+}
+
+static int flush_older_commits(struct super_block *s,
+			       struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	struct reiserfs_journal_list *other_jl;
+	struct reiserfs_journal_list *first_jl;
+	struct list_head *entry;
+	unsigned int trans_id = jl->j_trans_id;
+	unsigned int other_trans_id;
+	unsigned int first_trans_id;
+
+      find_first:
+	/*
+	 * first we walk backwards to find the oldest uncommitted transaction
+	 */
+	first_jl = jl;
+	entry = jl->j_list.prev;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		if (entry == &journal->j_journal_list ||
+		    atomic_read(&other_jl->j_older_commits_done))
+			break;
+
+		first_jl = other_jl;
+		entry = other_jl->j_list.prev;
+	}
+
+	/* if we didn't find any older uncommitted transactions, return now */
+	if (first_jl == jl) {
+		return 0;
+	}
+
+	first_trans_id = first_jl->j_trans_id;
+
+	entry = &first_jl->j_list;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		other_trans_id = other_jl->j_trans_id;
+
+		if (other_trans_id < trans_id) {
+			if (atomic_read(&other_jl->j_commit_left) != 0) {
+				flush_commit_list(s, other_jl, 0);
+
+				/* list we were called with is gone, return */
+				if (!journal_list_still_alive(s, trans_id))
+					return 1;
+
+				/* the one we just flushed is gone, this means all
+				 * older lists are also gone, so first_jl is no longer
+				 * valid either. Go back to the beginning.
+				 */
+				if (!journal_list_still_alive
+				    (s, other_trans_id)) {
+					goto find_first;
+				}
+			}
+			entry = entry->next;
+			if (entry == &journal->j_journal_list)
+				return 0;
+		} else {
+			return 0;
+		}
+	}
+	return 0;
+}
+
+static int reiserfs_async_progress_wait(struct super_block *s)
+{
+	struct reiserfs_journal *j = SB_JOURNAL(s);
+
+	if (atomic_read(&j->j_async_throttle)) {
+		reiserfs_write_unlock(s);
+		congestion_wait(BLK_RW_ASYNC, HZ / 10);
+		reiserfs_write_lock(s);
+	}
+
+	return 0;
+}
+
+/*
+** if this journal list still has commit blocks unflushed, send them to disk.
+**
+** log areas must be flushed in order (transaction 2 can't commit before transaction 1)
+** Before the commit block can be written, every other log block must be safely on disk
+**
+*/
+static int flush_commit_list(struct super_block *s,
+			     struct reiserfs_journal_list *jl, int flushall)
+{
+	int i;
+	b_blocknr_t bn;
+	struct buffer_head *tbh = NULL;
+	unsigned int trans_id = jl->j_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	int retval = 0;
+	int write_len;
+
+	reiserfs_check_lock_depth(s, "flush_commit_list");
+
+	if (atomic_read(&jl->j_older_commits_done)) {
+		return 0;
+	}
+
+	get_fs_excl();
+
+	/* before we can put our commit blocks on disk, we have to make sure everyone older than
+	 ** us is on disk too
+	 */
+	BUG_ON(jl->j_len <= 0);
+	BUG_ON(trans_id == journal->j_trans_id);
+
+	get_journal_list(jl);
+	if (flushall) {
+		if (flush_older_commits(s, jl) == 1) {
+			/* list disappeared during flush_older_commits.
return */ + goto put_jl; + } + } + + /* make sure nobody is trying to flush this one at the same time */ + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s); + + if (!journal_list_still_alive(s, trans_id)) { + mutex_unlock(&jl->j_commit_mutex); + goto put_jl; + } + BUG_ON(jl->j_trans_id == 0); + + /* this commit is done, exit */ + if (atomic_read(&(jl->j_commit_left)) <= 0) { + if (flushall) { + atomic_set(&(jl->j_older_commits_done), 1); + } + mutex_unlock(&jl->j_commit_mutex); + goto put_jl; + } + + if (!list_empty(&jl->j_bh_list)) { + int ret; + + /* + * We might sleep in numerous places inside + * write_ordered_buffers. Relax the write lock. + */ + reiserfs_write_unlock(s); + ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_bh_list); + if (ret < 0 && retval == 0) + retval = ret; + reiserfs_write_lock(s); + } + BUG_ON(!list_empty(&jl->j_bh_list)); + /* + * for the description block and all the log blocks, submit any buffers + * that haven't already reached the disk. Try to write at least 256 + * log blocks. later on, we will only wait on blocks that correspond + * to this transaction, but while we're unplugging we might as well + * get a chunk of data on there. + */ + atomic_inc(&journal->j_async_throttle); + write_len = jl->j_len + 1; + if (write_len < 256) + write_len = 256; + for (i = 0 ; i < write_len ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % + SB_ONDISK_JOURNAL_SIZE(s); + tbh = journal_find_get_block(s, bn); + if (tbh) { + if (buffer_dirty(tbh)) { + reiserfs_write_unlock(s); + ll_rw_block(WRITE, 1, &tbh); + reiserfs_write_lock(s); + } + put_bh(tbh) ; + } + } + atomic_dec(&journal->j_async_throttle); + + for (i = 0; i < (jl->j_len + 1); i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); + tbh = journal_find_get_block(s, bn); + + reiserfs_write_unlock(s); + wait_on_buffer(tbh); + reiserfs_write_lock(s); + // since we're using ll_rw_blk above, it might have skipped over + // a locked buffer. Double check here + // + /* redundant, sync_dirty_buffer() checks */ + if (buffer_dirty(tbh)) { + reiserfs_write_unlock(s); + sync_dirty_buffer(tbh); + reiserfs_write_lock(s); + } + if (unlikely(!buffer_uptodate(tbh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-601", + "buffer write failed"); +#endif + retval = -EIO; + } + put_bh(tbh); /* once for journal_find_get_block */ + put_bh(tbh); /* once due to original getblk in do_journal_end */ + atomic_dec(&(jl->j_commit_left)); + } + + BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); + + /* If there was a write error in the journal - we can't commit + * this transaction - it will be invalid and, if successful, + * will just end up propagating the write error out to + * the file system. */ + if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh) ; + reiserfs_write_unlock(s); + if (reiserfs_barrier_flush(s)) + __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); + else + sync_dirty_buffer(jl->j_commit_bh); + reiserfs_write_lock(s); + } + + /* If there was a write error in the journal - we can't commit this + * transaction - it will be invalid and, if successful, will just end + * up propagating the write error out to the filesystem. 
*/ + if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-615", "buffer write failed"); +#endif + retval = -EIO; + } + bforget(jl->j_commit_bh); + if (journal->j_last_commit_id != 0 && + (jl->j_trans_id - journal->j_last_commit_id) != 1) { + reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu", + journal->j_last_commit_id, jl->j_trans_id); + } + journal->j_last_commit_id = jl->j_trans_id; + + /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ + cleanup_freed_for_journal_list(s, jl); + + retval = retval ? retval : journal->j_errno; + + /* mark the metadata dirty */ + if (!retval) + dirty_one_transaction(s, jl); + atomic_dec(&(jl->j_commit_left)); + + if (flushall) { + atomic_set(&(jl->j_older_commits_done), 1); + } + mutex_unlock(&jl->j_commit_mutex); + put_jl: + put_journal_list(s, jl); + + if (retval) + reiserfs_abort(s, retval, "Journal write error in %s", + __func__); + put_fs_excl(); + return retval; +} + +/* +** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or +** returns NULL if it can't find anything +*/ +static struct reiserfs_journal_list *find_newer_jl_for_cn(struct + reiserfs_journal_cnode + *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + + cn = cn->hprev; + while (cn) { + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { + return cn->jlist; + } + cn = cn->hprev; + } + return NULL; +} + +static int newer_jl_done(struct reiserfs_journal_cnode *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + + cn = cn->hprev; + while (cn) { + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist && + atomic_read(&cn->jlist->j_commit_left) != 0) + return 0; + cn = cn->hprev; + } + return 1; +} + +static void remove_journal_hash(struct super_block *, + struct reiserfs_journal_cnode **, + struct reiserfs_journal_list *, unsigned long, + int); + +/* +** once all the real blocks have been flushed, it is safe to remove them from the +** journal list for this transaction. Aside from freeing the cnode, this also allows the +** block to be reallocated for data blocks if it had been deleted. +*/ +static void remove_all_from_journal_list(struct super_block *sb, + struct reiserfs_journal_list *jl, + int debug) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn, *last; + cn = jl->j_realblock; + + /* which is better, to lock once around the whole loop, or + ** to lock for each call to remove_journal_hash? + */ + while (cn) { + if (cn->blocknr != 0) { + if (debug) { + reiserfs_warning(sb, "reiserfs-2201", + "block %u, bh is %d, state %ld", + cn->blocknr, cn->bh ? 1 : 0, + cn->state); + } + cn->state = 0; + remove_journal_hash(sb, journal->j_list_hash_table, + jl, cn->blocknr, 1); + } + last = cn; + cn = cn->next; + free_cnode(sb, last); + } + jl->j_realblock = NULL; +} + +/* +** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block. +** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start +** releasing blocks in this transaction for reuse as data blocks. 
+** called by flush_journal_list, before it calls remove_all_from_journal_list +** +*/ +static int _update_journal_header_block(struct super_block *sb, + unsigned long offset, + unsigned int trans_id) +{ + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + if (reiserfs_is_journal_aborted(journal)) + return -EIO; + + if (trans_id >= journal->j_last_flush_trans_id) { + if (buffer_locked((journal->j_header_bh))) { + reiserfs_write_unlock(sb); + wait_on_buffer((journal->j_header_bh)); + reiserfs_write_lock(sb); + if (unlikely(!buffer_uptodate(journal->j_header_bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(sb, "journal-699", + "buffer write failed"); +#endif + return -EIO; + } + } + journal->j_last_flush_trans_id = trans_id; + journal->j_first_unflushed_offset = offset; + jh = (struct reiserfs_journal_header *)(journal->j_header_bh-> + b_data); + jh->j_last_flush_trans_id = cpu_to_le32(trans_id); + jh->j_first_unflushed_offset = cpu_to_le32(offset); + jh->j_mount_id = cpu_to_le32(journal->j_mount_id); + + set_buffer_dirty(journal->j_header_bh); + reiserfs_write_unlock(sb); + + if (reiserfs_barrier_flush(sb)) + __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + else + sync_dirty_buffer(journal->j_header_bh); + + reiserfs_write_lock(sb); + if (!buffer_uptodate(journal->j_header_bh)) { + reiserfs_warning(sb, "journal-837", + "IO error during journal replay"); + return -EIO; + } + } + return 0; +} + +static int update_journal_header_block(struct super_block *sb, + unsigned long offset, + unsigned int trans_id) +{ + return _update_journal_header_block(sb, offset, trans_id); +} + +/* +** flush any and all journal lists older than you are +** can only be called from flush_journal_list +*/ +static int flush_older_journal_lists(struct super_block *sb, + struct reiserfs_journal_list *jl) +{ + struct list_head *entry; + struct reiserfs_journal_list *other_jl; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + unsigned int trans_id = jl->j_trans_id; + + /* we know we are the only ones flushing things, no extra race + * protection is required. + */ + restart: + entry = journal->j_journal_list.next; + /* Did we wrap? */ + if (entry == &journal->j_journal_list) + return 0; + other_jl = JOURNAL_LIST_ENTRY(entry); + if (other_jl->j_trans_id < trans_id) { + BUG_ON(other_jl->j_refcount <= 0); + /* do not flush all */ + flush_journal_list(sb, other_jl, 0); + + /* other_jl is now deleted from the list */ + goto restart; + } + return 0; +} + +static void del_from_work_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + if (!list_empty(&jl->j_working_list)) { + list_del_init(&jl->j_working_list); + journal->j_num_work_lists--; + } +} + +/* flush a journal list, both commit and real blocks +** +** always set flushall to 1, unless you are calling from inside +** flush_journal_list +** +** IMPORTANT. This can only be called while there are no journal writers, +** and the journal is locked. 
That means it can only be called from +** do_journal_end, or by journal_release +*/ +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall) +{ + struct reiserfs_journal_list *pjl; + struct reiserfs_journal_cnode *cn, *last; + int count; + int was_jwait = 0; + int was_dirty = 0; + struct buffer_head *saved_bh; + unsigned long j_len_saved = jl->j_len; + struct reiserfs_journal *journal = SB_JOURNAL(s); + int err = 0; + + BUG_ON(j_len_saved <= 0); + + if (atomic_read(&journal->j_wcount) != 0) { + reiserfs_warning(s, "clm-2048", "called with wcount %d", + atomic_read(&journal->j_wcount)); + } + BUG_ON(jl->j_trans_id == 0); + + /* if flushall == 0, the lock is already held */ + if (flushall) { + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); + } else if (mutex_trylock(&journal->j_flush_mutex)) { + BUG(); + } + + count = 0; + if (j_len_saved > journal->j_trans_max) { + reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu", + j_len_saved, jl->j_trans_id); + return 0; + } + + get_fs_excl(); + + /* if all the work is already done, get out of here */ + if (atomic_read(&(jl->j_nonzerolen)) <= 0 && + atomic_read(&(jl->j_commit_left)) <= 0) { + goto flush_older_and_return; + } + + /* start by putting the commit list on disk. This will also flush + ** the commit lists of any olders transactions + */ + flush_commit_list(s, jl, 1); + + if (!(jl->j_state & LIST_DIRTY) + && !reiserfs_is_journal_aborted(journal)) + BUG(); + + /* are we done now? */ + if (atomic_read(&(jl->j_nonzerolen)) <= 0 && + atomic_read(&(jl->j_commit_left)) <= 0) { + goto flush_older_and_return; + } + + /* loop through each cnode, see if we need to write it, + ** or wait on a more recent transaction, or just ignore it + */ + if (atomic_read(&(journal->j_wcount)) != 0) { + reiserfs_panic(s, "journal-844", "journal list is flushing, " + "wcount is not 0"); + } + cn = jl->j_realblock; + while (cn) { + was_jwait = 0; + was_dirty = 0; + saved_bh = NULL; + /* blocknr of 0 is no longer in the hash, ignore it */ + if (cn->blocknr == 0) { + goto free_cnode; + } + + /* This transaction failed commit. Don't write out to the disk */ + if (!(jl->j_state & LIST_DIRTY)) + goto free_cnode; + + pjl = find_newer_jl_for_cn(cn); + /* the order is important here. We check pjl to make sure we + ** don't clear BH_JDirty_wait if we aren't the one writing this + ** block to disk + */ + if (!pjl && cn->bh) { + saved_bh = cn->bh; + + /* we do this to make sure nobody releases the buffer while + ** we are working with it + */ + get_bh(saved_bh); + + if (buffer_journal_dirty(saved_bh)) { + BUG_ON(!can_dirty(cn)); + was_jwait = 1; + was_dirty = 1; + } else if (can_dirty(cn)) { + /* everything with !pjl && jwait should be writable */ + BUG(); + } + } + + /* if someone has this block in a newer transaction, just make + ** sure they are committed, and don't try writing it to disk + */ + if (pjl) { + if (atomic_read(&pjl->j_commit_left)) + flush_commit_list(s, pjl, 1); + goto free_cnode; + } + + /* bh == NULL when the block got to disk on its own, OR, + ** the block got freed in a future transaction + */ + if (saved_bh == NULL) { + goto free_cnode; + } + + /* this should never happen. kupdate_one_transaction has this list + ** locked while it works, so we should never see a buffer here that + ** is not marked JDirty_wait + */ + if ((!was_jwait) && !buffer_locked(saved_bh)) { + reiserfs_warning(s, "journal-813", + "BAD! 
buffer %llu %cdirty %cjwait, " + "not in a newer tranasction", + (unsigned long long)saved_bh-> + b_blocknr, was_dirty ? ' ' : '!', + was_jwait ? ' ' : '!'); + } + if (was_dirty) { + /* we inc again because saved_bh gets decremented at free_cnode */ + get_bh(saved_bh); + set_bit(BLOCK_NEEDS_FLUSH, &cn->state); + lock_buffer(saved_bh); + BUG_ON(cn->blocknr != saved_bh->b_blocknr); + if (buffer_dirty(saved_bh)) + submit_logged_buffer(saved_bh); + else + unlock_buffer(saved_bh); + count++; + } else { + reiserfs_warning(s, "clm-2082", + "Unable to flush buffer %llu in %s", + (unsigned long long)saved_bh-> + b_blocknr, __func__); + } + free_cnode: + last = cn; + cn = cn->next; + if (saved_bh) { + /* we incremented this to keep others from taking the buffer head away */ + put_bh(saved_bh); + if (atomic_read(&(saved_bh->b_count)) < 0) { + reiserfs_warning(s, "journal-945", + "saved_bh->b_count < 0"); + } + } + } + if (count > 0) { + cn = jl->j_realblock; + while (cn) { + if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { + if (!cn->bh) { + reiserfs_panic(s, "journal-1011", + "cn->bh is NULL"); + } + + reiserfs_write_unlock(s); + wait_on_buffer(cn->bh); + reiserfs_write_lock(s); + + if (!cn->bh) { + reiserfs_panic(s, "journal-1012", + "cn->bh is NULL"); + } + if (unlikely(!buffer_uptodate(cn->bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-949", + "buffer write failed"); +#endif + err = -EIO; + } + /* note, we must clear the JDirty_wait bit after the up to date + ** check, otherwise we race against our flushpage routine + */ + BUG_ON(!test_clear_buffer_journal_dirty + (cn->bh)); + + /* drop one ref for us */ + put_bh(cn->bh); + /* drop one ref for journal_mark_dirty */ + release_buffer_page(cn->bh); + } + cn = cn->next; + } + } + + if (err) + reiserfs_abort(s, -EIO, + "Write error while pushing transaction to disk in %s", + __func__); + flush_older_and_return: + + /* before we can update the journal header block, we _must_ flush all + ** real blocks from all older transactions to disk. This is because + ** once the header block is updated, this transaction will not be + ** replayed after a crash + */ + if (flushall) { + flush_older_journal_lists(s, jl); + } + + err = journal->j_errno; + /* before we can remove everything from the hash tables for this + ** transaction, we must make sure it can never be replayed + ** + ** since we are only called from do_journal_end, we know for sure there + ** are no allocations going on while we are flushing journal lists. 
So, + ** we only need to update the journal header block for the last list + ** being flushed + */ + if (!err && flushall) { + err = + update_journal_header_block(s, + (jl->j_start + jl->j_len + + 2) % SB_ONDISK_JOURNAL_SIZE(s), + jl->j_trans_id); + if (err) + reiserfs_abort(s, -EIO, + "Write error while updating journal header in %s", + __func__); + } + remove_all_from_journal_list(s, jl, 0); + list_del_init(&jl->j_list); + journal->j_num_lists--; + del_from_work_list(s, jl); + + if (journal->j_last_flush_id != 0 && + (jl->j_trans_id - journal->j_last_flush_id) != 1) { + reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu", + journal->j_last_flush_id, jl->j_trans_id); + } + journal->j_last_flush_id = jl->j_trans_id; + + /* not strictly required since we are freeing the list, but it should + * help find code using dead lists later on + */ + jl->j_len = 0; + atomic_set(&(jl->j_nonzerolen), 0); + jl->j_start = 0; + jl->j_realblock = NULL; + jl->j_commit_bh = NULL; + jl->j_trans_id = 0; + jl->j_state = 0; + put_journal_list(s, jl); + if (flushall) + mutex_unlock(&journal->j_flush_mutex); + put_fs_excl(); + return err; +} + +static int test_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) + return 1; + + cn = jl->j_realblock; + while (cn) { + /* if the blocknr == 0, this has been cleared from the hash, + ** skip it + */ + if (cn->blocknr == 0) { + goto next; + } + if (cn->bh && !newer_jl_done(cn)) + return 0; + next: + cn = cn->next; + cond_resched(); + } + return 0; +} + +static int write_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl, + struct buffer_chunk *chunk) +{ + struct reiserfs_journal_cnode *cn; + int ret = 0; + + jl->j_state |= LIST_TOUCHED; + del_from_work_list(s, jl); + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { + return 0; + } + + cn = jl->j_realblock; + while (cn) { + /* if the blocknr == 0, this has been cleared from the hash, + ** skip it + */ + if (cn->blocknr == 0) { + goto next; + } + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { + struct buffer_head *tmp_bh; + /* we can race against journal_mark_freed when we try + * to lock_buffer(cn->bh), so we have to inc the buffer + * count, and recheck things after locking + */ + tmp_bh = cn->bh; + get_bh(tmp_bh); + lock_buffer(tmp_bh); + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { + if (!buffer_journal_dirty(tmp_bh) || + buffer_journal_prepared(tmp_bh)) + BUG(); + add_to_chunk(chunk, tmp_bh, NULL, write_chunk); + ret++; + } else { + /* note, cn->bh might be null now */ + unlock_buffer(tmp_bh); + } + put_bh(tmp_bh); + } + next: + cn = cn->next; + cond_resched(); + } + return ret; +} + +/* used by flush_commit_list */ +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal_list *pjl; + int ret = 0; + + jl->j_state |= LIST_DIRTY; + cn = jl->j_realblock; + while (cn) { + /* look for a more recent transaction that logged this + ** buffer. Only the most recent transaction with a buffer in + ** it is allowed to send that buffer to disk + */ + pjl = find_newer_jl_for_cn(cn); + if (!pjl && cn->blocknr && cn->bh + && buffer_journal_dirty(cn->bh)) { + BUG_ON(!can_dirty(cn)); + /* if the buffer is prepared, it will either be logged + * or restored. 
If restored, we need to make sure
+			 * it actually gets marked dirty
+			 */
+			clear_buffer_journal_new(cn->bh);
+			if (buffer_journal_prepared(cn->bh)) {
+				set_buffer_journal_restore_dirty(cn->bh);
+			} else {
+				set_buffer_journal_test(cn->bh);
+				mark_buffer_dirty(cn->bh);
+			}
+		}
+		cn = cn->next;
+	}
+	return ret;
+}
+
+static int kupdate_transactions(struct super_block *s,
+				struct reiserfs_journal_list *jl,
+				struct reiserfs_journal_list **next_jl,
+				unsigned int *next_trans_id,
+				int num_blocks, int num_trans)
+{
+	int ret = 0;
+	int written = 0;
+	int transactions_flushed = 0;
+	unsigned int orig_trans_id = jl->j_trans_id;
+	struct buffer_chunk chunk;
+	struct list_head *entry;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	chunk.nr = 0;
+
+	reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
+	if (!journal_list_still_alive(s, orig_trans_id)) {
+		goto done;
+	}
+
+	/* we've got j_flush_mutex held, nobody is going to delete any
+	 * of these lists out from underneath us
+	 */
+	while ((num_trans && transactions_flushed < num_trans) ||
+	       (!num_trans && written < num_blocks)) {
+
+		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
+		    atomic_read(&jl->j_commit_left)
+		    || !(jl->j_state & LIST_DIRTY)) {
+			del_from_work_list(s, jl);
+			break;
+		}
+		ret = write_one_transaction(s, jl, &chunk);
+
+		if (ret < 0)
+			goto done;
+		transactions_flushed++;
+		written += ret;
+		entry = jl->j_list.next;
+
+		/* did we wrap? */
+		if (entry == &journal->j_journal_list) {
+			break;
+		}
+		jl = JOURNAL_LIST_ENTRY(entry);
+
+		/* don't bother with older transactions */
+		if (jl->j_trans_id <= orig_trans_id)
+			break;
+	}
+	if (chunk.nr) {
+		write_chunk(&chunk);
+	}
+
+      done:
+	mutex_unlock(&journal->j_flush_mutex);
+	return ret;
+}
+
+/* for o_sync and fsync heavy applications, they tend to use
+** all the journal list slots with tiny transactions. These
+** trigger lots and lots of calls to update the header block, which
+** adds seeks and slows things down.
+**
+** This function tries to clear out a large chunk of the journal lists
+** at once, which makes everything faster since only the newest journal
+** list updates the header block
+*/
+static int flush_used_journal_lists(struct super_block *s,
+				    struct reiserfs_journal_list *jl)
+{
+	unsigned long len = 0;
+	unsigned long cur_len;
+	int ret;
+	int i;
+	int limit = 256;
+	struct reiserfs_journal_list *tjl;
+	struct reiserfs_journal_list *flush_jl;
+	unsigned int trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+
+	flush_jl = tjl = jl;
+
+	/* in data logging mode, try harder to flush a lot of blocks */
+	if (reiserfs_data_log(s))
+		limit = 1024;
+	/* flush for 256 transactions or limit blocks, whichever comes first */
+	for (i = 0; i < 256 && len < limit; i++) {
+		if (atomic_read(&tjl->j_commit_left) ||
+		    tjl->j_trans_id < jl->j_trans_id) {
+			break;
+		}
+		cur_len = atomic_read(&tjl->j_nonzerolen);
+		if (cur_len > 0) {
+			tjl->j_state &= ~LIST_TOUCHED;
+		}
+		len += cur_len;
+		flush_jl = tjl;
+		if (tjl->j_list.next == &journal->j_journal_list)
+			break;
+		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
+	}
+	/* try to find a group of blocks we can flush across all the
+	 ** transactions, but only bother if we've actually spanned
+	 ** across multiple lists
+	 */
+	if (flush_jl != jl) {
+		ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+	}
+	flush_journal_list(s, flush_jl, 1);
+	return 0;
+}
+
+/*
+** removes any nodes in table that match the given block and dev.
+** only touches the hnext and hprev pointers.
+*/ +void remove_journal_hash(struct super_block *sb, + struct reiserfs_journal_cnode **table, + struct reiserfs_journal_list *jl, + unsigned long block, int remove_freed) +{ + struct reiserfs_journal_cnode *cur; + struct reiserfs_journal_cnode **head; + + head = &(journal_hash(table, sb, block)); + if (!head) { + return; + } + cur = *head; + while (cur) { + if (cur->blocknr == block && cur->sb == sb + && (jl == NULL || jl == cur->jlist) + && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { + if (cur->hnext) { + cur->hnext->hprev = cur->hprev; + } + if (cur->hprev) { + cur->hprev->hnext = cur->hnext; + } else { + *head = cur->hnext; + } + cur->blocknr = 0; + cur->sb = NULL; + cur->state = 0; + if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ + atomic_dec(&(cur->jlist->j_nonzerolen)); + cur->bh = NULL; + cur->jlist = NULL; + } + cur = cur->hnext; + } +} + +static void free_journal_ram(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + kfree(journal->j_current_jl); + journal->j_num_lists--; + + vfree(journal->j_cnode_free_orig); + free_list_bitmaps(sb, journal->j_list_bitmap); + free_bitmap_nodes(sb); /* must be after free_list_bitmaps */ + if (journal->j_header_bh) { + brelse(journal->j_header_bh); + } + /* j_header_bh is on the journal dev, make sure not to release the journal + * dev until we brelse j_header_bh + */ + release_journal_dev(sb, journal); + vfree(journal); +} + +/* +** call on unmount. Only set error to 1 if you haven't made your way out +** of read_super() yet. Any other caller must keep error at 0. +*/ +static int do_journal_release(struct reiserfs_transaction_handle *th, + struct super_block *sb, int error) +{ + struct reiserfs_transaction_handle myth; + int flushed = 0; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + /* we only want to flush out transactions if we were called with error == 0 + */ + if (!error && !(sb->s_flags & MS_RDONLY)) { + /* end the current trans */ + BUG_ON(!th->t_trans_id); + do_journal_end(th, sb, 10, FLUSH_ALL); + + /* make sure something gets logged to force our way into the flush code */ + if (!journal_join(&myth, sb, 1)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&myth, sb, + SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, sb, 1, FLUSH_ALL); + flushed = 1; + } + } + + /* this also catches errors during the do_journal_end above */ + if (!error && reiserfs_is_journal_aborted(journal)) { + memset(&myth, 0, sizeof(myth)); + if (!journal_join_abort(&myth, sb, 1)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&myth, sb, + SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, sb, 1, FLUSH_ALL); + } + } + + reiserfs_mounted_fs_count--; + /* wait for all commits to finish */ + cancel_delayed_work(&SB_JOURNAL(sb)->j_work); + + /* + * We must release the write lock here because + * the workqueue job (flush_async_commit) needs this lock + */ + reiserfs_write_unlock(sb); + flush_workqueue(commit_wq); + + if (!reiserfs_mounted_fs_count) { + destroy_workqueue(commit_wq); + commit_wq = NULL; + } + + free_journal_ram(sb); + + reiserfs_write_lock(sb); + + return 0; +} + +/* +** call on unmount. flush all journal trans, release all alloc'd ram +*/ +int journal_release(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + return do_journal_release(th, sb, 0); +} + +/* +** only call from an error condition inside reiserfs_read_super! 
+*/ +int journal_release_error(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + return do_journal_release(th, sb, 1); +} + +/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ +static int journal_compare_desc_commit(struct super_block *sb, + struct reiserfs_journal_desc *desc, + struct reiserfs_journal_commit *commit) +{ + if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || + get_commit_trans_len(commit) != get_desc_trans_len(desc) || + get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max || + get_commit_trans_len(commit) <= 0) { + return 1; + } + return 0; +} + +/* returns 0 if it did not find a description block +** returns -1 if it found a corrupt commit block +** returns 1 if both desc and commit were valid +*/ +static int journal_transaction_is_valid(struct super_block *sb, + struct buffer_head *d_bh, + unsigned int *oldest_invalid_trans_id, + unsigned long *newest_mount_id) +{ + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + struct buffer_head *c_bh; + unsigned long offset; + + if (!d_bh) + return 0; + + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + if (get_desc_trans_len(desc) > 0 + && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { + if (oldest_invalid_trans_id && *oldest_invalid_trans_id + && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-986: transaction " + "is valid returning because trans_id %d is greater than " + "oldest_invalid %lu", + get_desc_trans_id(desc), + *oldest_invalid_trans_id); + return 0; + } + if (newest_mount_id + && *newest_mount_id > get_desc_mount_id(desc)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1087: transaction " + "is valid returning because mount_id %d is less than " + "newest_mount_id %lu", + get_desc_mount_id(desc), + *newest_mount_id); + return -1; + } + if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) { + reiserfs_warning(sb, "journal-2018", + "Bad transaction length %d " + "encountered, ignoring transaction", + get_desc_trans_len(desc)); + return -1; + } + offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + + /* ok, we have a journal description block, lets see if the transaction was valid */ + c_bh = + journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((offset + get_desc_trans_len(desc) + + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); + if (!c_bh) + return 0; + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + if (journal_compare_desc_commit(sb, desc, commit)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal_transaction_is_valid, commit offset %ld had bad " + "time %d or length %d", + c_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_commit_trans_id(commit), + get_commit_trans_len(commit)); + brelse(c_bh); + if (oldest_invalid_trans_id) { + *oldest_invalid_trans_id = + get_desc_trans_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1004: " + "transaction_is_valid setting oldest invalid trans_id " + "to %d", + get_desc_trans_id(desc)); + } + return -1; + } + brelse(c_bh); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1006: found valid " + "transaction start offset %llu, len %d id %d", + d_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_desc_trans_len(desc), + get_desc_trans_id(desc)); + return 1; + } else { + return 0; + } +} + +static void brelse_array(struct buffer_head **heads, int num) +{ + int i; + for (i = 0; i < num; i++) { + brelse(heads[i]); + } +} + +/* 
+** given the start, and values for the oldest acceptable transactions,
+** this either reads in and replays a transaction, or returns because the transaction
+** is invalid, or too old.
+*/
+static int journal_read_transaction(struct super_block *sb,
+				    unsigned long cur_dblock,
+				    unsigned long oldest_start,
+				    unsigned int oldest_trans_id,
+				    unsigned long newest_mount_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_desc *desc;
+	struct reiserfs_journal_commit *commit;
+	unsigned int trans_id = 0;
+	struct buffer_head *c_bh;
+	struct buffer_head *d_bh;
+	struct buffer_head **log_blocks = NULL;
+	struct buffer_head **real_blocks = NULL;
+	unsigned int trans_offset;
+	int i;
+	int trans_half;
+
+	d_bh = journal_bread(sb, cur_dblock);
+	if (!d_bh)
+		return 1;
+	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
+	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
+		       "journal_read_transaction, offset %llu, len %d mount_id %d",
+		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+		       get_desc_trans_len(desc), get_desc_mount_id(desc));
+	if (get_desc_trans_id(desc) < oldest_trans_id) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
+			       "journal_read_trans skipping because %lu is too old",
+			       cur_dblock -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb));
+		brelse(d_bh);
+		return 1;
+	}
+	if (get_desc_mount_id(desc) != newest_mount_id) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
+			       "journal_read_trans skipping because %d is != "
+			       "newest_mount_id %lu", get_desc_mount_id(desc),
+			       newest_mount_id);
+		brelse(d_bh);
+		return 1;
+	}
+	c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			     ((trans_offset + get_desc_trans_len(desc) + 1) %
+			      SB_ONDISK_JOURNAL_SIZE(sb)));
+	if (!c_bh) {
+		brelse(d_bh);
+		return 1;
+	}
+	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
+	if (journal_compare_desc_commit(sb, desc, commit)) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal_read_transaction, "
+			       "commit offset %llu had bad time %d or length %d",
+			       c_bh->b_blocknr -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+			       get_commit_trans_id(commit),
+			       get_commit_trans_len(commit));
+		brelse(c_bh);
+		brelse(d_bh);
+		return 1;
+	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
+	trans_id = get_desc_trans_id(desc);
+	/* now we know we've got a good transaction, and it was inside the valid time ranges */
+	log_blocks = kmalloc(get_desc_trans_len(desc) *
+			     sizeof(struct buffer_head *), GFP_NOFS);
+	real_blocks = kmalloc(get_desc_trans_len(desc) *
+			      sizeof(struct buffer_head *), GFP_NOFS);
+	if (!log_blocks || !real_blocks) {
+		brelse(c_bh);
+		brelse(d_bh);
+		kfree(log_blocks);
+		kfree(real_blocks);
+		reiserfs_warning(sb, "journal-1169",
+				 "kmalloc failed, unable to mount FS");
+		return -1;
+	}
+	/* get all the buffer heads */
+	trans_half = journal_trans_half(sb->s_blocksize);
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+		log_blocks[i] =
+		    journal_getblk(sb,
+				   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+				   (trans_offset + 1 +
+				    i) % SB_ONDISK_JOURNAL_SIZE(sb));
+		if (i < trans_half) {
+			real_blocks[i] =
+			    sb_getblk(sb,
+				      le32_to_cpu(desc->j_realblock[i]));
+		} else {
+			real_blocks[i] =
+			    sb_getblk(sb,
+				      le32_to_cpu(commit->
+						  j_realblock[i - trans_half]));
+		}
+		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
+			reiserfs_warning(sb, "journal-1207",
+					 "REPLAY FAILURE fsck
required! " + "Block to replay is outside of " + "filesystem"); + goto abort_replay; + } + /* make sure we don't try to replay onto log or reserved area */ + if (is_block_in_log_or_reserved_area + (sb, real_blocks[i]->b_blocknr)) { + reiserfs_warning(sb, "journal-1204", + "REPLAY FAILURE fsck required! " + "Trying to replay onto a log block"); + abort_replay: + brelse_array(log_blocks, i); + brelse_array(real_blocks, i); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + } + /* read in the log blocks, memcpy to the corresponding real block */ + ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); + for (i = 0; i < get_desc_trans_len(desc); i++) { + + reiserfs_write_unlock(sb); + wait_on_buffer(log_blocks[i]); + reiserfs_write_lock(sb); + + if (!buffer_uptodate(log_blocks[i])) { + reiserfs_warning(sb, "journal-1212", + "REPLAY FAILURE fsck required! " + "buffer write failed"); + brelse_array(log_blocks + i, + get_desc_trans_len(desc) - i); + brelse_array(real_blocks, get_desc_trans_len(desc)); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, + real_blocks[i]->b_size); + set_buffer_uptodate(real_blocks[i]); + brelse(log_blocks[i]); + } + /* flush out the real blocks */ + for (i = 0; i < get_desc_trans_len(desc); i++) { + set_buffer_dirty(real_blocks[i]); + write_dirty_buffer(real_blocks[i], WRITE); + } + for (i = 0; i < get_desc_trans_len(desc); i++) { + wait_on_buffer(real_blocks[i]); + if (!buffer_uptodate(real_blocks[i])) { + reiserfs_warning(sb, "journal-1226", + "REPLAY FAILURE, fsck required! " + "buffer write failed"); + brelse_array(real_blocks + i, + get_desc_trans_len(desc) - i); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + brelse(real_blocks[i]); + } + cur_dblock = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((trans_offset + get_desc_trans_len(desc) + + 2) % SB_ONDISK_JOURNAL_SIZE(sb)); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1095: setting journal " "start to offset %ld", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); + + /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ + journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + journal->j_last_flush_trans_id = trans_id; + journal->j_trans_id = trans_id + 1; + /* check for trans_id overflow */ + if (journal->j_trans_id == 0) + journal->j_trans_id = 10; + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return 0; +} + +/* This function reads blocks starting from block and to max_block of bufsize + size (but no more than BUFNR blocks at a time). This proved to improve + mounting speed on self-rebuilding raid5 arrays at least. + Right now it is only used from journal code. But later we might use it + from other places. + Note: Do not use journal_getblk/sb_getblk functions here! 
*/ +static struct buffer_head *reiserfs_breada(struct block_device *dev, + b_blocknr_t block, int bufsize, + b_blocknr_t max_block) +{ + struct buffer_head *bhlist[BUFNR]; + unsigned int blocks = BUFNR; + struct buffer_head *bh; + int i, j; + + bh = __getblk(dev, block, bufsize); + if (buffer_uptodate(bh)) + return (bh); + + if (block + BUFNR > max_block) { + blocks = max_block - block; + } + bhlist[0] = bh; + j = 1; + for (i = 1; i < blocks; i++) { + bh = __getblk(dev, block + i, bufsize); + if (buffer_uptodate(bh)) { + brelse(bh); + break; + } else + bhlist[j++] = bh; + } + ll_rw_block(READ, j, bhlist); + for (i = 1; i < j; i++) + brelse(bhlist[i]); + bh = bhlist[0]; + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + brelse(bh); + return NULL; +} + +/* +** read and replay the log +** on a clean unmount, the journal header's next unflushed pointer will be to an invalid +** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast. +** +** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. +** +** On exit, it sets things up so the first transaction will work correctly. +*/ +static int journal_read(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_desc *desc; + unsigned int oldest_trans_id = 0; + unsigned int oldest_invalid_trans_id = 0; + time_t start; + unsigned long oldest_start = 0; + unsigned long cur_dblock = 0; + unsigned long newest_mount_id = 9; + struct buffer_head *d_bh; + struct reiserfs_journal_header *jh; + int valid_journal_header = 0; + int replay_count = 0; + int continue_replay = 1; + int ret; + char b[BDEVNAME_SIZE]; + + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); + reiserfs_info(sb, "checking transaction log (%s)\n", + bdevname(journal->j_dev_bd, b)); + start = get_seconds(); + + /* step 1, read in the journal header block. Check the transaction it says + ** is the first unflushed, and if that transaction is not valid, + ** replay is done + */ + journal->j_header_bh = journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); + if (!journal->j_header_bh) { + return 1; + } + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); + if (le32_to_cpu(jh->j_first_unflushed_offset) < + SB_ONDISK_JOURNAL_SIZE(sb) + && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { + oldest_start = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + le32_to_cpu(jh->j_first_unflushed_offset); + oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; + newest_mount_id = le32_to_cpu(jh->j_mount_id); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1153: found in " + "header: first_unflushed_offset %d, last_flushed_trans_id " + "%lu", le32_to_cpu(jh->j_first_unflushed_offset), + le32_to_cpu(jh->j_last_flush_trans_id)); + valid_journal_header = 1; + + /* now, we try to read the first unflushed offset. If it is not valid, + ** there is nothing more we can do, and it makes no sense to read + ** through the whole log. + */ + d_bh = + journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + le32_to_cpu(jh->j_first_unflushed_offset)); + ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL); + if (!ret) { + continue_replay = 0; + } + brelse(d_bh); + goto start_log_replay; + } + + /* ok, there are transactions that need to be replayed. start with the first log block, find + ** all the valid transactions, and pick out the oldest. 
+ */ + while (continue_replay + && cur_dblock < + (SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb))) { + /* Note that it is required for blocksize of primary fs device and journal + device to be the same */ + d_bh = + reiserfs_breada(journal->j_dev_bd, cur_dblock, + sb->s_blocksize, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); + ret = + journal_transaction_is_valid(sb, d_bh, + &oldest_invalid_trans_id, + &newest_mount_id); + if (ret == 1) { + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + if (oldest_start == 0) { /* init all oldest_ values */ + oldest_trans_id = get_desc_trans_id(desc); + oldest_start = d_bh->b_blocknr; + newest_mount_id = get_desc_mount_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1179: Setting " + "oldest_start to offset %llu, trans_id %lu", + oldest_start - + SB_ONDISK_JOURNAL_1st_BLOCK + (sb), oldest_trans_id); + } else if (oldest_trans_id > get_desc_trans_id(desc)) { + /* one we just read was older */ + oldest_trans_id = get_desc_trans_id(desc); + oldest_start = d_bh->b_blocknr; + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1180: Resetting " + "oldest_start to offset %lu, trans_id %lu", + oldest_start - + SB_ONDISK_JOURNAL_1st_BLOCK + (sb), oldest_trans_id); + } + if (newest_mount_id < get_desc_mount_id(desc)) { + newest_mount_id = get_desc_mount_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1299: Setting " + "newest_mount_id to %d", + get_desc_mount_id(desc)); + } + cur_dblock += get_desc_trans_len(desc) + 2; + } else { + cur_dblock++; + } + brelse(d_bh); + } + + start_log_replay: + cur_dblock = oldest_start; + if (oldest_trans_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1206: Starting replay " + "from offset %llu, trans_id %lu", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb), + oldest_trans_id); + + } + replay_count = 0; + while (continue_replay && oldest_trans_id > 0) { + ret = + journal_read_transaction(sb, cur_dblock, oldest_start, + oldest_trans_id, newest_mount_id); + if (ret < 0) { + return ret; + } else if (ret != 0) { + break; + } + cur_dblock = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start; + replay_count++; + if (cur_dblock == oldest_start) + break; + } + + if (oldest_trans_id == 0) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1225: No valid " "transactions found"); + } + /* j_start does not get set correctly if we don't replay any transactions. 
+ ** if we had a valid journal_header, set j_start to the first unflushed transaction value, + ** copy the trans_id from the header + */ + if (valid_journal_header && replay_count == 0) { + journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); + journal->j_trans_id = + le32_to_cpu(jh->j_last_flush_trans_id) + 1; + /* check for trans_id overflow */ + if (journal->j_trans_id == 0) + journal->j_trans_id = 10; + journal->j_last_flush_trans_id = + le32_to_cpu(jh->j_last_flush_trans_id); + journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; + } else { + journal->j_mount_id = newest_mount_id + 1; + } + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " + "newest_mount_id to %lu", journal->j_mount_id); + journal->j_first_unflushed_offset = journal->j_start; + if (replay_count > 0) { + reiserfs_info(sb, + "replayed %d transactions in %lu seconds\n", + replay_count, get_seconds() - start); + } + if (!bdev_read_only(sb->s_bdev) && + _update_journal_header_block(sb, journal->j_start, + journal->j_last_flush_trans_id)) { + /* replay failed, caller must call free_journal_ram and abort + ** the mount + */ + return -1; + } + return 0; +} + +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) +{ + struct reiserfs_journal_list *jl; + jl = kzalloc(sizeof(struct reiserfs_journal_list), + GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&jl->j_list); + INIT_LIST_HEAD(&jl->j_working_list); + INIT_LIST_HEAD(&jl->j_tail_bh_list); + INIT_LIST_HEAD(&jl->j_bh_list); + mutex_init(&jl->j_commit_mutex); + SB_JOURNAL(s)->j_num_lists++; + get_journal_list(jl); + return jl; +} + +static void journal_list_init(struct super_block *sb) +{ + SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); +} + +static int release_journal_dev(struct super_block *super, + struct reiserfs_journal *journal) +{ + int result; + + result = 0; + + if (journal->j_dev_bd != NULL) { + result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); + journal->j_dev_bd = NULL; + } + + if (result != 0) { + reiserfs_warning(super, "sh-457", + "Cannot release journal device: %i", result); + } + return result; +} + +static int journal_init_dev(struct super_block *super, + struct reiserfs_journal *journal, + const char *jdev_name) +{ + int result; + dev_t jdev; + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; + char b[BDEVNAME_SIZE]; + + result = 0; + + journal->j_dev_bd = NULL; + jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? 
+ new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; + + if (bdev_read_only(super->s_bdev)) + blkdev_mode = FMODE_READ; + + /* there is no "jdev" option and journal is on separate device */ + if ((!jdev_name || !jdev_name[0])) { + if (jdev == super->s_dev) + blkdev_mode &= ~FMODE_EXCL; + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, + journal); + journal->j_dev_mode = blkdev_mode; + if (IS_ERR(journal->j_dev_bd)) { + result = PTR_ERR(journal->j_dev_bd); + journal->j_dev_bd = NULL; + reiserfs_warning(super, "sh-458", + "cannot init journal device '%s': %i", + __bdevname(jdev, b), result); + return result; + } else if (jdev != super->s_dev) + set_blocksize(journal->j_dev_bd, super->s_blocksize); + + return 0; + } + + journal->j_dev_mode = blkdev_mode; + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); + if (IS_ERR(journal->j_dev_bd)) { + result = PTR_ERR(journal->j_dev_bd); + journal->j_dev_bd = NULL; + reiserfs_warning(super, + "journal_init_dev: Cannot open '%s': %i", + jdev_name, result); + return result; + } + + set_blocksize(journal->j_dev_bd, super->s_blocksize); + reiserfs_info(super, + "journal_init_dev: journal device: %s\n", + bdevname(journal->j_dev_bd, b)); + return 0; +} + +/** + * When creating/tuning a file system user can assign some + * journal params within boundaries which depend on the ratio + * blocksize/standard_blocksize. + * + * For blocks >= standard_blocksize transaction size should + * be not less then JOURNAL_TRANS_MIN_DEFAULT, and not more + * then JOURNAL_TRANS_MAX_DEFAULT. + * + * For blocks < standard_blocksize these boundaries should be + * decreased proportionally. + */ +#define REISERFS_STANDARD_BLKSIZE (4096) + +static int check_advise_trans_params(struct super_block *sb, + struct reiserfs_journal *journal) +{ + if (journal->j_trans_max) { + /* Non-default journal params. + Do sanity check for them. */ + int ratio = 1; + if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) + ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize; + + if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio || + journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio || + SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max < + JOURNAL_MIN_RATIO) { + reiserfs_warning(sb, "sh-462", + "bad transaction max size (%u). " + "FSCK?", journal->j_trans_max); + return 1; + } + if (journal->j_max_batch != (journal->j_trans_max) * + JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) { + reiserfs_warning(sb, "sh-463", + "bad transaction max batch (%u). " + "FSCK?", journal->j_max_batch); + return 1; + } + } else { + /* Default journal params. + The file system was created by old version + of mkreiserfs, so some fields contain zeros, + and we need to advise proper values for them */ + if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { + reiserfs_warning(sb, "sh-464", "bad blocksize (%u)", + sb->s_blocksize); + return 1; + } + journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; + journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; + journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; + } + return 0; +} + +/* +** must be called once on fs mount. 
calls journal_read for you +*/ +int journal_init(struct super_block *sb, const char *j_dev_name, + int old_format, unsigned int commit_max_age) +{ + int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2; + struct buffer_head *bhjh; + struct reiserfs_super_block *rs; + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal; + struct reiserfs_journal_list *jl; + char b[BDEVNAME_SIZE]; + int ret; + + /* + * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS + * dependency inversion warnings. + */ + reiserfs_write_unlock(sb); + journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); + if (!journal) { + reiserfs_warning(sb, "journal-1256", + "unable to get memory for journal structure"); + reiserfs_write_lock(sb); + return 1; + } + memset(journal, 0, sizeof(struct reiserfs_journal)); + INIT_LIST_HEAD(&journal->j_bitmap_nodes); + INIT_LIST_HEAD(&journal->j_prealloc_list); + INIT_LIST_HEAD(&journal->j_working_list); + INIT_LIST_HEAD(&journal->j_journal_list); + journal->j_persistent_trans = 0; + ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb)); + reiserfs_write_lock(sb); + if (ret) + goto free_and_return; + + allocate_bitmap_nodes(sb); + + /* reserved for journal area support */ + SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ? + REISERFS_OLD_DISK_OFFSET_IN_BYTES + / sb->s_blocksize + + reiserfs_bmap_count(sb) + + 1 : + REISERFS_DISK_OFFSET_IN_BYTES / + sb->s_blocksize + 2); + + /* Sanity check to see is the standard journal fitting within first bitmap + (actual for small blocksizes) */ + if (!SB_ONDISK_JOURNAL_DEVICE(sb) && + (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) { + reiserfs_warning(sb, "journal-1393", + "journal does not fit for area addressed " + "by first of bitmap blocks. It starts at " + "%u and its size is %u. Block size %ld", + SB_JOURNAL_1st_RESERVED_BLOCK(sb), + SB_ONDISK_JOURNAL_SIZE(sb), + sb->s_blocksize); + goto free_and_return; + } + + /* + * We need to unlock here to avoid creating the following + * dependency: + * reiserfs_lock -> sysfs_mutex + * Because the reiserfs mmap path creates the following dependency: + * mm->mmap -> reiserfs_lock, hence we have + * mm->mmap -> reiserfs_lock ->sysfs_mutex + * This would ends up in a circular dependency with sysfs readdir path + * which does sysfs_mutex -> mm->mmap_sem + * This is fine because the reiserfs lock is useless in mount path, + * at least until we call journal_begin. We keep it for paranoid + * reasons. 
+ */ + reiserfs_write_unlock(sb); + if (journal_init_dev(sb, journal, j_dev_name) != 0) { + reiserfs_write_lock(sb); + reiserfs_warning(sb, "sh-462", + "unable to initialize jornal device"); + goto free_and_return; + } + reiserfs_write_lock(sb); + + rs = SB_DISK_SUPER_BLOCK(sb); + + /* read journal header */ + bhjh = journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + SB_ONDISK_JOURNAL_SIZE(sb)); + if (!bhjh) { + reiserfs_warning(sb, "sh-459", + "unable to read journal header"); + goto free_and_return; + } + jh = (struct reiserfs_journal_header *)(bhjh->b_data); + + /* make sure that journal matches to the super block */ + if (is_reiserfs_jr(rs) + && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != + sb_jp_journal_magic(rs))) { + reiserfs_warning(sb, "sh-460", + "journal header magic %x (device %s) does " + "not match to magic found in super block %x", + jh->jh_journal.jp_journal_magic, + bdevname(journal->j_dev_bd, b), + sb_jp_journal_magic(rs)); + brelse(bhjh); + goto free_and_return; + } + + journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max); + journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch); + journal->j_max_commit_age = + le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + + if (check_advise_trans_params(sb, journal) != 0) + goto free_and_return; + journal->j_default_max_commit_age = journal->j_max_commit_age; + + if (commit_max_age != 0) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } + + reiserfs_info(sb, "journal params: device %s, size %u, " + "journal first block %u, max trans len %u, max batch %u, " + "max commit age %u, max trans age %u\n", + bdevname(journal->j_dev_bd, b), + SB_ONDISK_JOURNAL_SIZE(sb), + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + journal->j_trans_max, + journal->j_max_batch, + journal->j_max_commit_age, journal->j_max_trans_age); + + brelse(bhjh); + + journal->j_list_bitmap_index = 0; + journal_list_init(sb); + + memset(journal->j_list_hash_table, 0, + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); + + INIT_LIST_HEAD(&journal->j_dirty_buffers); + spin_lock_init(&journal->j_dirty_buffers_lock); + + journal->j_start = 0; + journal->j_len = 0; + journal->j_len_alloc = 0; + atomic_set(&(journal->j_wcount), 0); + atomic_set(&(journal->j_async_throttle), 0); + journal->j_bcount = 0; + journal->j_trans_start_time = 0; + journal->j_last = NULL; + journal->j_first = NULL; + init_waitqueue_head(&(journal->j_join_wait)); + mutex_init(&journal->j_mutex); + mutex_init(&journal->j_flush_mutex); + + journal->j_trans_id = 10; + journal->j_mount_id = 10; + journal->j_state = 0; + atomic_set(&(journal->j_jlock), 0); + reiserfs_write_unlock(sb); + journal->j_cnode_free_list = allocate_cnodes(num_cnodes); + reiserfs_write_lock(sb); + journal->j_cnode_free_orig = journal->j_cnode_free_list; + journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; + journal->j_cnode_used = 0; + journal->j_must_wait = 0; + + if (journal->j_cnode_free == 0) { + reiserfs_warning(sb, "journal-2004", "Journal cnode memory " + "allocation failed (%ld bytes). Journal is " + "too large for available memory. 
Usually " + "this is due to a journal that is too large.", + sizeof (struct reiserfs_journal_cnode) * num_cnodes); + goto free_and_return; + } + + init_journal_hash(sb); + jl = journal->j_current_jl; + jl->j_list_bitmap = get_list_bitmap(sb, jl); + if (!jl->j_list_bitmap) { + reiserfs_warning(sb, "journal-2005", + "get_list_bitmap failed for journal list 0"); + goto free_and_return; + } + if (journal_read(sb) < 0) { + reiserfs_warning(sb, "reiserfs-2006", + "Replay Failure, unable to mount"); + goto free_and_return; + } + + reiserfs_mounted_fs_count++; + if (reiserfs_mounted_fs_count <= 1) { + reiserfs_write_unlock(sb); + commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); + reiserfs_write_lock(sb); + } + + INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); + journal->j_work_sb = sb; + return 0; + free_and_return: + free_journal_ram(sb); + return 1; +} + +/* +** test for a polite end of the current transaction. Used by file_write, and should +** be used by delete to make sure they don't write more than can fit inside a single +** transaction +*/ +int journal_transaction_should_end(struct reiserfs_transaction_handle *th, + int new_alloc) +{ + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); + time_t now = get_seconds(); + /* cannot restart while nested */ + BUG_ON(!th->t_trans_id); + if (th->t_refcount > 1) + return 0; + if (journal->j_must_wait > 0 || + (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || + atomic_read(&(journal->j_jlock)) || + (now - journal->j_trans_start_time) > journal->j_max_trans_age || + journal->j_cnode_free < (journal->j_trans_max * 3)) { + return 1; + } + /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; + th->t_blocks_allocated += new_alloc ; + return 0; +} + +/* this must be called inside a transaction, and requires the +** kernel_lock to be held +*/ +void reiserfs_block_writes(struct reiserfs_transaction_handle *th) +{ + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); + BUG_ON(!th->t_trans_id); + journal->j_must_wait = 1; + set_bit(J_WRITERS_BLOCKED, &journal->j_state); + return; +} + +/* this must be called without a transaction started, and does not +** require BKL +*/ +void reiserfs_allow_writes(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + clear_bit(J_WRITERS_BLOCKED, &journal->j_state); + wake_up(&journal->j_join_wait); +} + +/* this must be called without a transaction started, and does not +** require BKL +*/ +void reiserfs_wait_on_write_block(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + wait_event(journal->j_join_wait, + !test_bit(J_WRITERS_BLOCKED, &journal->j_state)); +} + +static void queue_log_writer(struct super_block *s) +{ + wait_queue_t wait; + struct reiserfs_journal *journal = SB_JOURNAL(s); + set_bit(J_WRITERS_QUEUED, &journal->j_state); + + /* + * we don't want to use wait_event here because + * we only want to wait once. 
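+	 * Instead we add ourselves to j_join_wait, re-check J_WRITERS_QUEUED
+	 * and schedule() at most one time; wait_event() would keep sleeping
+	 * until the bit was actually cleared.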
+ */ + init_waitqueue_entry(&wait, current); + add_wait_queue(&journal->j_join_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { + reiserfs_write_unlock(s); + schedule(); + reiserfs_write_lock(s); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(&journal->j_join_wait, &wait); +} + +static void wake_queued_writers(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) + wake_up(&journal->j_join_wait); +} + +static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + unsigned long bcount = journal->j_bcount; + while (1) { + reiserfs_write_unlock(sb); + schedule_timeout_uninterruptible(1); + reiserfs_write_lock(sb); + journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; + while ((atomic_read(&journal->j_wcount) > 0 || + atomic_read(&journal->j_jlock)) && + journal->j_trans_id == trans_id) { + queue_log_writer(sb); + } + if (journal->j_trans_id != trans_id) + break; + if (bcount == journal->j_bcount) + break; + bcount = journal->j_bcount; + } +} + +/* join == true if you must join an existing transaction. +** join == false if you can deal with waiting for others to finish +** +** this will block until the transaction is joinable. send the number of blocks you +** expect to use in nblocks. +*/ +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks, + int join) +{ + time_t now = get_seconds(); + unsigned int old_trans_id; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_transaction_handle myth; + int sched_count = 0; + int retval; + + reiserfs_check_lock_depth(sb, "journal_begin"); + BUG_ON(nblocks > journal->j_trans_max); + + PROC_INFO_INC(sb, journal.journal_being); + /* set here for journal_join */ + th->t_refcount = 1; + th->t_super = sb; + + relock: + lock_journal(sb); + if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { + unlock_journal(sb); + retval = journal->j_errno; + goto out_fail; + } + journal->j_bcount++; + + if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { + unlock_journal(sb); + reiserfs_write_unlock(sb); + reiserfs_wait_on_write_block(sb); + reiserfs_write_lock(sb); + PROC_INFO_INC(sb, journal.journal_relock_writers); + goto relock; + } + now = get_seconds(); + + /* if there is no room in the journal OR + ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning + ** we don't sleep if there aren't other writers + */ + + if ((!join && journal->j_must_wait > 0) || + (!join + && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) + || (!join && atomic_read(&journal->j_wcount) > 0 + && journal->j_trans_start_time > 0 + && (now - journal->j_trans_start_time) > + journal->j_max_trans_age) || (!join + && atomic_read(&journal->j_jlock)) + || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { + + old_trans_id = journal->j_trans_id; + unlock_journal(sb); /* allow others to finish this transaction */ + + if (!join && (journal->j_len_alloc + nblocks + 2) >= + journal->j_max_batch && + ((journal->j_len + nblocks + 2) * 100) < + (journal->j_len_alloc * 75)) { + if (atomic_read(&journal->j_wcount) > 10) { + sched_count++; + queue_log_writer(sb); + goto relock; + } + } + /* don't mess with joining the transaction if all we have to do is + * wait for someone else to do a commit + */ 
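+		/* j_jlock set means the current transaction is already being
+		 * closed for commit; sleep until j_trans_id moves on and then
+		 * retry from relock rather than trying to join it.
+		 */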
+ if (atomic_read(&journal->j_jlock)) { + while (journal->j_trans_id == old_trans_id && + atomic_read(&journal->j_jlock)) { + queue_log_writer(sb); + } + goto relock; + } + retval = journal_join(&myth, sb, 1); + if (retval) + goto out_fail; + + /* someone might have ended the transaction while we joined */ + if (old_trans_id != journal->j_trans_id) { + retval = do_journal_end(&myth, sb, 1, 0); + } else { + retval = do_journal_end(&myth, sb, 1, COMMIT_NOW); + } + + if (retval) + goto out_fail; + + PROC_INFO_INC(sb, journal.journal_relock_wcount); + goto relock; + } + /* we are the first writer, set trans_id */ + if (journal->j_trans_start_time == 0) { + journal->j_trans_start_time = get_seconds(); + } + atomic_inc(&(journal->j_wcount)); + journal->j_len_alloc += nblocks; + th->t_blocks_logged = 0; + th->t_blocks_allocated = nblocks; + th->t_trans_id = journal->j_trans_id; + unlock_journal(sb); + INIT_LIST_HEAD(&th->t_list); + get_fs_excl(); + return 0; + + out_fail: + memset(th, 0, sizeof(*th)); + /* Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later */ + th->t_super = sb; + return retval; +} + +struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct + super_block + *s, + int nblocks) +{ + int ret; + struct reiserfs_transaction_handle *th; + + /* if we're nesting into an existing transaction. It will be + ** persistent on its own + */ + if (reiserfs_transaction_running(s)) { + th = current->journal_info; + th->t_refcount++; + BUG_ON(th->t_refcount < 2); + + return th; + } + th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); + if (!th) + return NULL; + ret = journal_begin(th, s, nblocks); + if (ret) { + kfree(th); + return NULL; + } + + SB_JOURNAL(s)->j_persistent_trans++; + return th; +} + +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) +{ + struct super_block *s = th->t_super; + int ret = 0; + if (th->t_trans_id) + ret = journal_end(th, th->t_super, th->t_blocks_allocated); + else + ret = -EIO; + if (th->t_refcount == 0) { + SB_JOURNAL(s)->j_persistent_trans--; + kfree(th); + } + return ret; +} + +static int journal_join(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* this keeps do_journal_end from NULLing out the current->journal_info + ** pointer + */ + th->t_handle_save = cur_th; + BUG_ON(cur_th && cur_th->t_refcount > 1); + return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN); +} + +int journal_join_abort(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* this keeps do_journal_end from NULLing out the current->journal_info + ** pointer + */ + th->t_handle_save = cur_th; + BUG_ON(cur_th && cur_th->t_refcount > 1); + return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT); +} + +int journal_begin(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + int ret; + + th->t_handle_save = NULL; + if (cur_th) { + /* we are nesting into the current transaction */ + if (cur_th->t_super == sb) { + BUG_ON(!cur_th->t_refcount); + cur_th->t_refcount++; + memcpy(th, cur_th, sizeof(*th)); + if (th->t_refcount <= 1) + reiserfs_warning(sb, 
"reiserfs-2005", + "BAD: refcount <= 1, but " + "journal_info != 0"); + return 0; + } else { + /* we've ended up with a handle from a different filesystem. + ** save it and restore on journal_end. This should never + ** really happen... + */ + reiserfs_warning(sb, "clm-2100", + "nesting info a different FS"); + th->t_handle_save = current->journal_info; + current->journal_info = th; + } + } else { + current->journal_info = th; + } + ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG); + BUG_ON(current->journal_info != th); + + /* I guess this boils down to being the reciprocal of clm-2100 above. + * If do_journal_begin_r fails, we need to put it back, since journal_end + * won't be called to do it. */ + if (ret) + current->journal_info = th->t_handle_save; + else + BUG_ON(!th->t_refcount); + + return ret; +} + +/* +** puts bh into the current transaction. If it was already there, reorders removes the +** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order). +** +** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the +** transaction is committed. +** +** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. +*/ +int journal_mark_dirty(struct reiserfs_transaction_handle *th, + struct super_block *sb, struct buffer_head *bh) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn = NULL; + int count_already_incd = 0; + int prepared = 0; + BUG_ON(!th->t_trans_id); + + PROC_INFO_INC(sb, journal.mark_dirty); + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577", + "handle trans id %ld != current trans id %ld", + th->t_trans_id, journal->j_trans_id); + } + + sb->s_dirt = 1; + + prepared = test_clear_buffer_journal_prepared(bh); + clear_buffer_journal_restore_dirty(bh); + /* already in this transaction, we are done */ + if (buffer_journaled(bh)) { + PROC_INFO_INC(sb, journal.mark_dirty_already); + return 0; + } + + /* this must be turned into a panic instead of a warning. We can't allow + ** a dirty or journal_dirty or locked buffer to be logged, as some changes + ** could get to disk too early. NOT GOOD. + */ + if (!prepared || buffer_dirty(bh)) { + reiserfs_warning(sb, "journal-1777", + "buffer %llu bad state " + "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", + (unsigned long long)bh->b_blocknr, + prepared ? ' ' : '!', + buffer_locked(bh) ? ' ' : '!', + buffer_dirty(bh) ? ' ' : '!', + buffer_journal_dirty(bh) ? ' ' : '!'); + } + + if (atomic_read(&(journal->j_wcount)) <= 0) { + reiserfs_warning(sb, "journal-1409", + "returning because j_wcount was %d", + atomic_read(&(journal->j_wcount))); + return 1; + } + /* this error means I've screwed up, and we've overflowed the transaction. + ** Nothing can be done here, except make the FS readonly or panic. 
+ */ + if (journal->j_len >= journal->j_trans_max) { + reiserfs_panic(th->t_super, "journal-1413", + "j_len (%lu) is too big", + journal->j_len); + } + + if (buffer_journal_dirty(bh)) { + count_already_incd = 1; + PROC_INFO_INC(sb, journal.mark_dirty_notjournal); + clear_buffer_journal_dirty(bh); + } + + if (journal->j_len > journal->j_len_alloc) { + journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT; + } + + set_buffer_journaled(bh); + + /* now put this guy on the end */ + if (!cn) { + cn = get_cnode(sb); + if (!cn) { + reiserfs_panic(sb, "journal-4", "get_cnode failed!"); + } + + if (th->t_blocks_logged == th->t_blocks_allocated) { + th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT; + journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT; + } + th->t_blocks_logged++; + journal->j_len++; + + cn->bh = bh; + cn->blocknr = bh->b_blocknr; + cn->sb = sb; + cn->jlist = NULL; + insert_journal_hash(journal->j_hash_table, cn); + if (!count_already_incd) { + get_bh(bh); + } + } + cn->next = NULL; + cn->prev = journal->j_last; + cn->bh = bh; + if (journal->j_last) { + journal->j_last->next = cn; + journal->j_last = cn; + } else { + journal->j_first = cn; + journal->j_last = cn; + } + return 0; +} + +int journal_end(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + if (!current->journal_info && th->t_refcount > 1) + reiserfs_warning(sb, "REISER-NESTING", + "th NULL, refcount %d", th->t_refcount); + + if (!th->t_trans_id) { + WARN_ON(1); + return -EIO; + } + + th->t_refcount--; + if (th->t_refcount > 0) { + struct reiserfs_transaction_handle *cur_th = + current->journal_info; + + /* we aren't allowed to close a nested transaction on a different + ** filesystem from the one in the task struct + */ + BUG_ON(cur_th->t_super != th->t_super); + + if (th != cur_th) { + memcpy(current->journal_info, th, sizeof(*th)); + th->t_trans_id = 0; + } + return 0; + } else { + return do_journal_end(th, sb, nblocks, 0); + } +} + +/* removes from the current transaction, relsing and descrementing any counters. +** also files the removed buffer directly onto the clean list +** +** called by journal_mark_freed when a block has been deleted +** +** returns 1 if it cleaned and relsed the buffer. 0 otherwise +*/ +static int remove_from_transaction(struct super_block *sb, + b_blocknr_t blocknr, int already_cleaned) +{ + struct buffer_head *bh; + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + int ret = 0; + + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); + if (!cn || !cn->bh) { + return ret; + } + bh = cn->bh; + if (cn->prev) { + cn->prev->next = cn->next; + } + if (cn->next) { + cn->next->prev = cn->prev; + } + if (cn == journal->j_first) { + journal->j_first = cn->next; + } + if (cn == journal->j_last) { + journal->j_last = cn->prev; + } + if (bh) + remove_journal_hash(sb, journal->j_hash_table, NULL, + bh->b_blocknr, 0); + clear_buffer_journaled(bh); /* don't log this one */ + + if (!already_cleaned) { + clear_buffer_journal_dirty(bh); + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + put_bh(bh); + if (atomic_read(&(bh->b_count)) < 0) { + reiserfs_warning(sb, "journal-1752", + "b_count < 0"); + } + ret = 1; + } + journal->j_len--; + journal->j_len_alloc--; + free_cnode(sb, cn); + return ret; +} + +/* +** for any cnode in a journal list, it can only be dirtied of all the +** transactions that include it are committed to disk. 
+** this checks through each transaction, and returns 1 if you are allowed to dirty, +** and 0 if you aren't +** +** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log +** blocks for a given transaction on disk +** +*/ +static int can_dirty(struct reiserfs_journal_cnode *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + struct reiserfs_journal_cnode *cur = cn->hprev; + int can_dirty = 1; + + /* first test hprev. These are all newer than cn, so any node here + ** with the same block number and dev means this node can't be sent + ** to disk right now. + */ + while (cur && can_dirty) { + if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && + cur->blocknr == blocknr) { + can_dirty = 0; + } + cur = cur->hprev; + } + /* then test hnext. These are all older than cn. As long as they + ** are committed to the log, it is safe to write cn to disk + */ + cur = cn->hnext; + while (cur && can_dirty) { + if (cur->jlist && cur->jlist->j_len > 0 && + atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && + cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { + can_dirty = 0; + } + cur = cur->hnext; + } + return can_dirty; +} + +/* syncs the commit blocks, but does not force the real buffers to disk +** will wait until the current transaction is done/committed before returning +*/ +int journal_end_sync(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + BUG_ON(!th->t_trans_id); + /* you can sync while nested, very, very bad */ + BUG_ON(th->t_refcount > 1); + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); + } + return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT); +} + +/* +** writeback the pending async commits to disk +*/ +static void flush_async_commits(struct work_struct *work) +{ + struct reiserfs_journal *journal = + container_of(work, struct reiserfs_journal, j_work.work); + struct super_block *sb = journal->j_work_sb; + struct reiserfs_journal_list *jl; + struct list_head *entry; + + reiserfs_write_lock(sb); + if (!list_empty(&journal->j_journal_list)) { + /* last entry is the youngest, commit it and you get everything */ + entry = journal->j_journal_list.prev; + jl = JOURNAL_LIST_ENTRY(entry); + flush_commit_list(sb, jl, 1); + } + reiserfs_write_unlock(sb); +} + +/* +** flushes any old transactions to disk +** ends the current transaction if it is too old +*/ +int reiserfs_flush_old_commits(struct super_block *sb) +{ + time_t now; + struct reiserfs_transaction_handle th; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + now = get_seconds(); + /* safety check so we don't flush while we are replaying the log during + * mount + */ + if (list_empty(&journal->j_journal_list)) { + return 0; + } + + /* check the current transaction. 
If there are no writers, and it is + * too old, finish it, and force the commit blocks to disk + */ + if (atomic_read(&journal->j_wcount) <= 0 && + journal->j_trans_start_time > 0 && + journal->j_len > 0 && + (now - journal->j_trans_start_time) > journal->j_max_trans_age) { + if (!journal_join(&th, sb, 1)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&th, sb, + SB_BUFFER_WITH_SB(sb)); + + /* we're only being called from kreiserfsd, it makes no sense to do + ** an async commit so that kreiserfsd can do it later + */ + do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT); + } + } + return sb->s_dirt; +} + +/* +** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit +** +** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all +** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just +** flushes the commit list and returns 0. +** +** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait. +** +** Note, we can't allow the journal_end to proceed while there are still writers in the log. +*/ +static int check_journal_end(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks, + int flags) +{ + + time_t now; + int flush = flags & FLUSH_ALL; + int commit_now = flags & COMMIT_NOW; + int wait_on_commit = flags & WAIT; + struct reiserfs_journal_list *jl; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + BUG_ON(!th->t_trans_id); + + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577", + "handle trans id %ld != current trans id %ld", + th->t_trans_id, journal->j_trans_id); + } + + journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged); + if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */ + atomic_dec(&(journal->j_wcount)); + } + + /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released + ** will be dealt with by next transaction that actually writes something, but should be taken + ** care of in this trans + */ + BUG_ON(journal->j_len == 0); + + /* if wcount > 0, and we are called to with flush or commit_now, + ** we wait on j_join_wait. We will wake up when the last writer has + ** finished the transaction, and started it on its way to the disk. + ** Then, we flush the commit or journal list, and just return 0 + ** because the rest of journal end was already done for this transaction. 
+ */ + if (atomic_read(&(journal->j_wcount)) > 0) { + if (flush || commit_now) { + unsigned trans_id; + + jl = journal->j_current_jl; + trans_id = jl->j_trans_id; + if (wait_on_commit) + jl->j_state |= LIST_COMMIT_PENDING; + atomic_set(&(journal->j_jlock), 1); + if (flush) { + journal->j_next_full_flush = 1; + } + unlock_journal(sb); + + /* sleep while the current transaction is still j_jlocked */ + while (journal->j_trans_id == trans_id) { + if (atomic_read(&journal->j_jlock)) { + queue_log_writer(sb); + } else { + lock_journal(sb); + if (journal->j_trans_id == trans_id) { + atomic_set(&(journal->j_jlock), + 1); + } + unlock_journal(sb); + } + } + BUG_ON(journal->j_trans_id == trans_id); + + if (commit_now + && journal_list_still_alive(sb, trans_id) + && wait_on_commit) { + flush_commit_list(sb, jl, 1); + } + return 0; + } + unlock_journal(sb); + return 0; + } + + /* deal with old transactions where we are the last writers */ + now = get_seconds(); + if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { + commit_now = 1; + journal->j_next_async_flush = 1; + } + /* don't batch when someone is waiting on j_join_wait */ + /* don't batch when syncing the commit or flushing the whole trans */ + if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock))) + && !flush && !commit_now && (journal->j_len < journal->j_max_batch) + && journal->j_len_alloc < journal->j_max_batch + && journal->j_cnode_free > (journal->j_trans_max * 3)) { + journal->j_bcount++; + unlock_journal(sb); + return 0; + } + + if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) { + reiserfs_panic(sb, "journal-003", + "j_start (%ld) is too high", + journal->j_start); + } + return 1; +} + +/* +** Does all the work that makes deleting blocks safe. +** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on. +** +** otherwise: +** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes +** before this transaction has finished. +** +** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with +** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash, +** the block can't be reallocated yet. +** +** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. +*/ +int journal_mark_freed(struct reiserfs_transaction_handle *th, + struct super_block *sb, b_blocknr_t blocknr) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn = NULL; + struct buffer_head *bh = NULL; + struct reiserfs_list_bitmap *jb = NULL; + int cleaned = 0; + BUG_ON(!th->t_trans_id); + + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); + if (cn && cn->bh) { + bh = cn->bh; + get_bh(bh); + } + /* if it is journal new, we just remove it from this transaction */ + if (bh && buffer_journal_new(bh)) { + clear_buffer_journal_new(bh); + clear_prepared_bits(bh); + reiserfs_clean_and_file_buffer(bh); + cleaned = remove_from_transaction(sb, blocknr, cleaned); + } else { + /* set the bit for this block in the journal bitmap for this transaction */ + jb = journal->j_current_jl->j_list_bitmap; + if (!jb) { + reiserfs_panic(sb, "journal-1702", + "journal_list_bitmap is NULL"); + } + set_bit_in_list_bitmap(sb, blocknr, jb); + + /* Note, the entire while loop is not allowed to schedule. 
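+		 * (it walks and updates the j_list_hash_table chain for this
+		 * block; sleeping could let other writers or the flush code
+		 * race with us)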
*/ + + if (bh) { + clear_prepared_bits(bh); + reiserfs_clean_and_file_buffer(bh); + } + cleaned = remove_from_transaction(sb, blocknr, cleaned); + + /* find all older transactions with this block, make sure they don't try to write it out */ + cn = get_journal_hash_dev(sb, journal->j_list_hash_table, + blocknr); + while (cn) { + if (sb == cn->sb && blocknr == cn->blocknr) { + set_bit(BLOCK_FREED, &cn->state); + if (cn->bh) { + if (!cleaned) { + /* remove_from_transaction will brelse the buffer if it was + ** in the current trans + */ + clear_buffer_journal_dirty(cn-> + bh); + clear_buffer_dirty(cn->bh); + clear_buffer_journal_test(cn-> + bh); + cleaned = 1; + put_bh(cn->bh); + if (atomic_read + (&(cn->bh->b_count)) < 0) { + reiserfs_warning(sb, + "journal-2138", + "cn->bh->b_count < 0"); + } + } + if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ + atomic_dec(& + (cn->jlist-> + j_nonzerolen)); + } + cn->bh = NULL; + } + } + cn = cn->hnext; + } + } + + if (bh) + release_buffer_page(bh); /* get_hash grabs the buffer */ + return 0; +} + +void reiserfs_update_inode_transaction(struct inode *inode) +{ + struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb); + REISERFS_I(inode)->i_jl = journal->j_current_jl; + REISERFS_I(inode)->i_trans_id = journal->j_trans_id; +} + +/* + * returns -1 on error, 0 if no commits/barriers were done and 1 + * if a transaction was actually committed and the barrier was done + */ +static int __commit_trans_jl(struct inode *inode, unsigned long id, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_transaction_handle th; + struct super_block *sb = inode->i_sb; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + int ret = 0; + + /* is it from the current transaction, or from an unknown transaction? */ + if (id == journal->j_trans_id) { + jl = journal->j_current_jl; + /* try to let other writers come in and grow this transaction */ + let_transaction_grow(sb, id); + if (journal->j_trans_id != id) { + goto flush_commit_only; + } + + ret = journal_begin(&th, sb, 1); + if (ret) + return ret; + + /* someone might have ended this transaction while we joined */ + if (journal->j_trans_id != id) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)); + ret = journal_end(&th, sb, 1); + goto flush_commit_only; + } + + ret = journal_end_sync(&th, sb, 1); + if (!ret) + ret = 1; + + } else { + /* this gets tricky, we have to make sure the journal list in + * the inode still exists. We know the list is still around + * if we've got a larger transaction id than the oldest list + */ + flush_commit_only: + if (journal_list_still_alive(inode->i_sb, id)) { + /* + * we only set ret to 1 when we know for sure + * the barrier hasn't been started yet on the commit + * block. + */ + if (atomic_read(&jl->j_commit_left) > 1) + ret = 1; + flush_commit_list(sb, jl, 1); + if (journal->j_errno) + ret = journal->j_errno; + } + } + /* otherwise the list is gone, and long since committed */ + return ret; +} + +int reiserfs_commit_for_inode(struct inode *inode) +{ + unsigned int id = REISERFS_I(inode)->i_trans_id; + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; + + /* for the whole inode, assume unset id means it was + * changed in the current transaction. 
More conservative + */ + if (!id || !jl) { + reiserfs_update_inode_transaction(inode); + id = REISERFS_I(inode)->i_trans_id; + /* jl will be updated in __commit_trans_jl */ + } + + return __commit_trans_jl(inode, id, jl); +} + +void reiserfs_restore_prepared_buffer(struct super_block *sb, + struct buffer_head *bh) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + PROC_INFO_INC(sb, journal.restore_prepared); + if (!bh) { + return; + } + if (test_clear_buffer_journal_restore_dirty(bh) && + buffer_journal_dirty(bh)) { + struct reiserfs_journal_cnode *cn; + cn = get_journal_hash_dev(sb, + journal->j_list_hash_table, + bh->b_blocknr); + if (cn && can_dirty(cn)) { + set_buffer_journal_test(bh); + mark_buffer_dirty(bh); + } + } + clear_buffer_journal_prepared(bh); +} + +extern struct tree_balance *cur_tb; +/* +** before we can change a metadata block, we have to make sure it won't +** be written to disk while we are altering it. So, we must: +** clean it +** wait on it. +** +*/ +int reiserfs_prepare_for_journal(struct super_block *sb, + struct buffer_head *bh, int wait) +{ + PROC_INFO_INC(sb, journal.prepare); + + if (!trylock_buffer(bh)) { + if (!wait) + return 0; + lock_buffer(bh); + } + set_buffer_journal_prepared(bh); + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { + clear_buffer_journal_test(bh); + set_buffer_journal_restore_dirty(bh); + } + unlock_buffer(bh); + return 1; +} + +static void flush_old_journal_lists(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct reiserfs_journal_list *jl; + struct list_head *entry; + time_t now = get_seconds(); + + while (!list_empty(&journal->j_journal_list)) { + entry = journal->j_journal_list.next; + jl = JOURNAL_LIST_ENTRY(entry); + /* this check should always be run, to send old lists to disk */ + if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4)) && + atomic_read(&jl->j_commit_left) == 0 && + test_transaction(s, jl)) { + flush_used_journal_lists(s, jl); + } else { + break; + } + } +} + +/* +** long and ugly. If flush, will not return until all commit +** blocks and all real buffers in the trans are on disk. +** If no_async, won't return until all commit blocks are on disk. +** +** keep reading, there are comments as you go along +** +** If the journal is aborted, we just clean up. Things like flushing +** journal lists, etc just won't happen. +*/ +static int do_journal_end(struct reiserfs_transaction_handle *th, + struct super_block *sb, unsigned long nblocks, + int flags) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn, *next, *jl_cn; + struct reiserfs_journal_cnode *last_cn = NULL; + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + struct buffer_head *c_bh; /* commit bh */ + struct buffer_head *d_bh; /* desc bh */ + int cur_write_start = 0; /* start index of current log write */ + int old_start; + int i; + int flush; + int wait_on_commit; + struct reiserfs_journal_list *jl, *temp_jl; + struct list_head *entry, *safe; + unsigned long jindex; + unsigned int commit_trans_id; + int trans_half; + + BUG_ON(th->t_refcount > 1); + BUG_ON(!th->t_trans_id); + + /* protect flush_older_commits from doing mistakes if the + transaction ID counter gets overflowed. 
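+	   When the counter is about to wrap (th->t_trans_id == ~0U) we force
+	   a full synchronous flush below, so nothing older is left in the log
+	   by the time j_trans_id restarts from 10 at the end of this function.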
*/ + if (th->t_trans_id == ~0U) + flags |= FLUSH_ALL | COMMIT_NOW | WAIT; + flush = flags & FLUSH_ALL; + wait_on_commit = flags & WAIT; + + put_fs_excl(); + current->journal_info = th->t_handle_save; + reiserfs_check_lock_depth(sb, "journal end"); + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); + } + + lock_journal(sb); + if (journal->j_next_full_flush) { + flags |= FLUSH_ALL; + flush = 1; + } + if (journal->j_next_async_flush) { + flags |= COMMIT_NOW | WAIT; + wait_on_commit = 1; + } + + /* check_journal_end locks the journal, and unlocks if it does not return 1 + ** it tells us if we should continue with the journal_end, or just return + */ + if (!check_journal_end(th, sb, nblocks, flags)) { + sb->s_dirt = 1; + wake_queued_writers(sb); + reiserfs_async_progress_wait(sb); + goto out; + } + + /* check_journal_end might set these, check again */ + if (journal->j_next_full_flush) { + flush = 1; + } + + /* + ** j must wait means we have to flush the log blocks, and the real blocks for + ** this transaction + */ + if (journal->j_must_wait > 0) { + flush = 1; + } +#ifdef REISERFS_PREALLOCATE + /* quota ops might need to nest, setup the journal_info pointer for them + * and raise the refcount so that it is > 0. */ + current->journal_info = th; + th->t_refcount++; + reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into + * the transaction */ + th->t_refcount--; + current->journal_info = th->t_handle_save; +#endif + + /* setup description block */ + d_bh = + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + journal->j_start); + set_buffer_uptodate(d_bh); + desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; + memset(d_bh->b_data, 0, d_bh->b_size); + memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); + set_desc_trans_id(desc, journal->j_trans_id); + + /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ + c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((journal->j_start + journal->j_len + + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + memset(c_bh->b_data, 0, c_bh->b_size); + set_commit_trans_id(commit, journal->j_trans_id); + set_buffer_uptodate(c_bh); + + /* init this journal list */ + jl = journal->j_current_jl; + + /* we lock the commit before doing anything because + * we want to make sure nobody tries to run flush_commit_list until + * the new transaction is fully setup, and we've already flushed the + * ordered bh list + */ + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb); + + /* save the transaction id in case we need to commit it later */ + commit_trans_id = jl->j_trans_id; + + atomic_set(&jl->j_older_commits_done, 0); + jl->j_trans_id = journal->j_trans_id; + jl->j_timestamp = journal->j_trans_start_time; + jl->j_commit_bh = c_bh; + jl->j_start = journal->j_start; + jl->j_len = journal->j_len; + atomic_set(&jl->j_nonzerolen, journal->j_len); + atomic_set(&jl->j_commit_left, journal->j_len + 2); + jl->j_realblock = NULL; + + /* The ENTIRE FOR LOOP MUST not cause schedule to occur. 
+ ** for each real block, add it to the journal list hash, + ** copy into real block index array in the commit or desc block + */ + trans_half = journal_trans_half(sb->s_blocksize); + for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { + if (buffer_journaled(cn->bh)) { + jl_cn = get_cnode(sb); + if (!jl_cn) { + reiserfs_panic(sb, "journal-1676", + "get_cnode returned NULL"); + } + if (i == 0) { + jl->j_realblock = jl_cn; + } + jl_cn->prev = last_cn; + jl_cn->next = NULL; + if (last_cn) { + last_cn->next = jl_cn; + } + last_cn = jl_cn; + /* make sure the block we are trying to log is not a block + of journal or reserved area */ + + if (is_block_in_log_or_reserved_area + (sb, cn->bh->b_blocknr)) { + reiserfs_panic(sb, "journal-2332", + "Trying to log block %lu, " + "which is a log block", + cn->bh->b_blocknr); + } + jl_cn->blocknr = cn->bh->b_blocknr; + jl_cn->state = 0; + jl_cn->sb = sb; + jl_cn->bh = cn->bh; + jl_cn->jlist = jl; + insert_journal_hash(journal->j_list_hash_table, jl_cn); + if (i < trans_half) { + desc->j_realblock[i] = + cpu_to_le32(cn->bh->b_blocknr); + } else { + commit->j_realblock[i - trans_half] = + cpu_to_le32(cn->bh->b_blocknr); + } + } else { + i--; + } + } + set_desc_trans_len(desc, journal->j_len); + set_desc_mount_id(desc, journal->j_mount_id); + set_desc_trans_id(desc, journal->j_trans_id); + set_commit_trans_len(commit, journal->j_len); + + /* special check in case all buffers in the journal were marked for not logging */ + BUG_ON(journal->j_len == 0); + + /* we're about to dirty all the log blocks, mark the description block + * dirty now too. Don't mark the commit block dirty until all the + * others are on disk + */ + mark_buffer_dirty(d_bh); + + /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ + cur_write_start = journal->j_start; + cn = journal->j_first; + jindex = 1; /* start at one so we don't get the desc again */ + while (cn) { + clear_buffer_journal_new(cn->bh); + /* copy all the real blocks into log area. dirty log blocks */ + if (buffer_journaled(cn->bh)) { + struct buffer_head *tmp_bh; + char *addr; + struct page *page; + tmp_bh = + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((cur_write_start + + jindex) % + SB_ONDISK_JOURNAL_SIZE(sb))); + set_buffer_uptodate(tmp_bh); + page = cn->bh->b_page; + addr = kmap(page); + memcpy(tmp_bh->b_data, + addr + offset_in_page(cn->bh->b_data), + cn->bh->b_size); + kunmap(page); + mark_buffer_dirty(tmp_bh); + jindex++; + set_buffer_journal_dirty(cn->bh); + clear_buffer_journaled(cn->bh); + } else { + /* JDirty cleared sometime during transaction. don't log this one */ + reiserfs_warning(sb, "journal-2048", + "BAD, buffer in journal hash, " + "but not JDirty!"); + brelse(cn->bh); + } + next = cn->next; + free_cnode(sb, cn); + cn = next; + reiserfs_write_unlock(sb); + cond_resched(); + reiserfs_write_lock(sb); + } + + /* we are done with both the c_bh and d_bh, but + ** c_bh must be written after all other commit blocks, + ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
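+	 ** (the commit block is what replay uses to decide that a transaction
+	 ** is complete, so it must only reach disk after every log block has.)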
+ */ + + journal->j_current_jl = alloc_journal_list(sb); + + /* now it is safe to insert this transaction on the main list */ + list_add_tail(&jl->j_list, &journal->j_journal_list); + list_add_tail(&jl->j_working_list, &journal->j_working_list); + journal->j_num_work_lists++; + + /* reset journal values for the next transaction */ + old_start = journal->j_start; + journal->j_start = + (journal->j_start + journal->j_len + + 2) % SB_ONDISK_JOURNAL_SIZE(sb); + atomic_set(&(journal->j_wcount), 0); + journal->j_bcount = 0; + journal->j_last = NULL; + journal->j_first = NULL; + journal->j_len = 0; + journal->j_trans_start_time = 0; + /* check for trans_id overflow */ + if (++journal->j_trans_id == 0) + journal->j_trans_id = 10; + journal->j_current_jl->j_trans_id = journal->j_trans_id; + journal->j_must_wait = 0; + journal->j_len_alloc = 0; + journal->j_next_full_flush = 0; + journal->j_next_async_flush = 0; + init_journal_hash(sb); + + // make sure reiserfs_add_jh sees the new current_jl before we + // write out the tails + smp_mb(); + + /* tail conversion targets have to hit the disk before we end the + * transaction. Otherwise a later transaction might repack the tail + * before this transaction commits, leaving the data block unflushed and + * clean, if we crash before the later transaction commits, the data block + * is lost. + */ + if (!list_empty(&jl->j_tail_bh_list)) { + reiserfs_write_unlock(sb); + write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_tail_bh_list); + reiserfs_write_lock(sb); + } + BUG_ON(!list_empty(&jl->j_tail_bh_list)); + mutex_unlock(&jl->j_commit_mutex); + + /* honor the flush wishes from the caller, simple commits can + ** be done outside the journal lock, they are done below + ** + ** if we don't flush the commit list right now, we put it into + ** the work queue so the people waiting on the async progress work + ** queue don't wait for this proc to flush journal lists and such. + */ + if (flush) { + flush_commit_list(sb, jl, 1); + flush_journal_list(sb, jl, 1); + } else if (!(jl->j_state & LIST_COMMIT_PENDING)) + queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); + + /* if the next transaction has any chance of wrapping, flush + ** transactions that might get overwritten. If any journal lists are very + ** old flush them as well. + */ + first_jl: + list_for_each_safe(entry, safe, &journal->j_journal_list) { + temp_jl = JOURNAL_LIST_ENTRY(entry); + if (journal->j_start <= temp_jl->j_start) { + if ((journal->j_start + journal->j_trans_max + 1) >= + temp_jl->j_start) { + flush_used_journal_lists(sb, temp_jl); + goto first_jl; + } else if ((journal->j_start + + journal->j_trans_max + 1) < + SB_ONDISK_JOURNAL_SIZE(sb)) { + /* if we don't cross into the next transaction and we don't + * wrap, there is no way we can overlap any later transactions + * break now + */ + break; + } + } else if ((journal->j_start + + journal->j_trans_max + 1) > + SB_ONDISK_JOURNAL_SIZE(sb)) { + if (((journal->j_start + journal->j_trans_max + 1) % + SB_ONDISK_JOURNAL_SIZE(sb)) >= + temp_jl->j_start) { + flush_used_journal_lists(sb, temp_jl); + goto first_jl; + } else { + /* we don't overlap anything from out start to the end of the + * log, and our wrapped portion doesn't overlap anything at + * the start of the log. 
We can break + */ + break; + } + } + } + flush_old_journal_lists(sb); + + journal->j_current_jl->j_list_bitmap = + get_list_bitmap(sb, journal->j_current_jl); + + if (!(journal->j_current_jl->j_list_bitmap)) { + reiserfs_panic(sb, "journal-1996", + "could not get a list bitmap"); + } + + atomic_set(&(journal->j_jlock), 0); + unlock_journal(sb); + /* wake up any body waiting to join. */ + clear_bit(J_WRITERS_QUEUED, &journal->j_state); + wake_up(&(journal->j_join_wait)); + + if (!flush && wait_on_commit && + journal_list_still_alive(sb, commit_trans_id)) { + flush_commit_list(sb, jl, 1); + } + out: + reiserfs_check_lock_depth(sb, "journal end2"); + + memset(th, 0, sizeof(*th)); + /* Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later */ + th->t_super = sb; + + return journal->j_errno; +} + +/* Send the file system read only and refuse new transactions */ +void reiserfs_abort_journal(struct super_block *sb, int errno) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + if (test_bit(J_ABORTED, &journal->j_state)) + return; + + if (!journal->j_errno) + journal->j_errno = errno; + + sb->s_flags |= MS_RDONLY; + set_bit(J_ABORTED, &journal->j_state); + +#ifdef CONFIG_REISERFS_CHECK + dump_stack(); +#endif +} + diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c new file mode 100644 index 00000000..03d85cbf --- /dev/null +++ b/fs/reiserfs/lbalance.c @@ -0,0 +1,1311 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include + +/* these are used in do_balance.c */ + +/* leaf_move_items + leaf_shift_left + leaf_shift_right + leaf_delete_items + leaf_insert_into_buf + leaf_paste_in_buffer + leaf_cut_from_buffer + leaf_paste_entries + */ + +/* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */ +static void leaf_copy_dir_entries(struct buffer_info *dest_bi, + struct buffer_head *source, int last_first, + int item_num, int from, int copy_count) +{ + struct buffer_head *dest = dest_bi->bi_bh; + int item_num_in_dest; /* either the number of target item, + or if we must create a new item, + the number of the item we will + create it next to */ + struct item_head *ih; + struct reiserfs_de_head *deh; + int copy_records_len; /* length of all records in item to be copied */ + char *records; + + ih = B_N_PITEM_HEAD(source, item_num); + + RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item"); + + /* length of all record to be copied and first byte of the last of them */ + deh = B_I_DEH(source, ih); + if (copy_count) { + copy_records_len = (from ? deh_location(&(deh[from - 1])) : + ih_item_len(ih)) - + deh_location(&(deh[from + copy_count - 1])); + records = + source->b_data + ih_location(ih) + + deh_location(&(deh[from + copy_count - 1])); + } else { + copy_records_len = 0; + records = NULL; + } + + /* when copy last to first, dest buffer can contain 0 items */ + item_num_in_dest = + (last_first == + LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 
0 : -1) : (B_NR_ITEMS(dest) + - 1); + + /* if there are no items in dest or the first/last item in dest is not item of the same directory */ + if ((item_num_in_dest == -1) || + (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) || + (last_first == LAST_TO_FIRST + && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key, + B_N_PKEY(dest, + item_num_in_dest)))) + { + /* create new item in dest */ + struct item_head new_ih; + + /* form item header */ + memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE); + put_ih_version(&new_ih, KEY_FORMAT_3_5); + /* calculate item len */ + put_ih_item_len(&new_ih, + DEH_SIZE * copy_count + copy_records_len); + put_ih_entry_count(&new_ih, 0); + + if (last_first == LAST_TO_FIRST) { + /* form key by the following way */ + if (from < I_ENTRY_COUNT(ih)) { + set_le_ih_k_offset(&new_ih, + deh_offset(&(deh[from]))); + /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE); */ + } else { + /* no entries will be copied to this item in this function */ + set_le_ih_k_offset(&new_ih, U32_MAX); + /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */ + } + set_le_key_k_type(KEY_FORMAT_3_5, &(new_ih.ih_key), + TYPE_DIRENTRY); + } + + /* insert item into dest buffer */ + leaf_insert_into_buf(dest_bi, + (last_first == + LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), + &new_ih, NULL, 0); + } else { + /* prepare space for entries */ + leaf_paste_in_buffer(dest_bi, + (last_first == + FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - + 1) : 0, MAX_US_INT, + DEH_SIZE * copy_count + copy_records_len, + records, 0); + } + + item_num_in_dest = + (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0; + + leaf_paste_entries(dest_bi, item_num_in_dest, + (last_first == + FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest, + item_num_in_dest)) + : 0, copy_count, deh + from, records, + DEH_SIZE * copy_count + copy_records_len); +} + +/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or + part of it or nothing (see the return 0 below) from SOURCE to the end + (if last_first) or beginning (!last_first) of the DEST */ +/* returns 1 if anything was copied, else 0 */ +static int leaf_copy_boundary_item(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int bytes_or_entries) +{ + struct buffer_head *dest = dest_bi->bi_bh; + int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */ + struct item_head *ih; + struct item_head *dih; + + dest_nr_item = B_NR_ITEMS(dest); + + if (last_first == FIRST_TO_LAST) { + /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects + or of different types ) then there is no need to treat this item differently from the other items + that we copy, so we return */ + ih = B_N_PITEM_HEAD(src, 0); + dih = B_N_PITEM_HEAD(dest, dest_nr_item - 1); + if (!dest_nr_item + || (!op_is_left_mergeable(&(ih->ih_key), src->b_size))) + /* there is nothing to merge */ + return 0; + + RFALSE(!ih_item_len(ih), + "vs-10010: item can not have empty length"); + + if (is_direntry_le_ih(ih)) { + if (bytes_or_entries == -1) + /* copy all entries to dest */ + bytes_or_entries = ih_entry_count(ih); + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0, + bytes_or_entries); + return 1; + } + + /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST + part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item 
header + */ + if (bytes_or_entries == -1) + bytes_or_entries = ih_item_len(ih); + +#ifdef CONFIG_REISERFS_CHECK + else { + if (bytes_or_entries == ih_item_len(ih) + && is_indirect_le_ih(ih)) + if (get_ih_free_space(ih)) + reiserfs_panic(sb_from_bi(dest_bi), + "vs-10020", + "last unformatted node " + "must be filled " + "entirely (%h)", ih); + } +#endif + + /* merge first item (or its part) of src buffer with the last + item of dest buffer. Both are of the same file */ + leaf_paste_in_buffer(dest_bi, + dest_nr_item - 1, ih_item_len(dih), + bytes_or_entries, B_I_PITEM(src, ih), 0); + + if (is_indirect_le_ih(dih)) { + RFALSE(get_ih_free_space(dih), + "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space", + ih); + if (bytes_or_entries == ih_item_len(ih)) + set_ih_free_space(dih, get_ih_free_space(ih)); + } + + return 1; + } + + /* copy boundary item to right (last_first == LAST_TO_FIRST) */ + + /* ( DEST is empty or last item of SOURCE and first item of DEST + are the items of different object or of different types ) + */ + src_nr_item = B_NR_ITEMS(src); + ih = B_N_PITEM_HEAD(src, src_nr_item - 1); + dih = B_N_PITEM_HEAD(dest, 0); + + if (!dest_nr_item || !op_is_left_mergeable(&(dih->ih_key), src->b_size)) + return 0; + + if (is_direntry_le_ih(ih)) { + if (bytes_or_entries == -1) + /* bytes_or_entries = entries number in last item body of SOURCE */ + bytes_or_entries = ih_entry_count(ih); + + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + src_nr_item - 1, + ih_entry_count(ih) - bytes_or_entries, + bytes_or_entries); + return 1; + } + + /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST; + part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST; + don't create new item header + */ + + RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih), + "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", + ih); + + if (bytes_or_entries == -1) { + /* bytes_or_entries = length of last item body of SOURCE */ + bytes_or_entries = ih_item_len(ih); + + RFALSE(le_ih_k_offset(dih) != + le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size), + "vs-10050: items %h and %h do not match", ih, dih); + + /* change first item key of the DEST */ + set_le_ih_k_offset(dih, le_ih_k_offset(ih)); + + /* item becomes non-mergeable */ + /* or mergeable if left item was */ + set_le_ih_k_type(dih, le_ih_k_type(ih)); + } else { + /* merge to right only part of item */ + RFALSE(ih_item_len(ih) <= bytes_or_entries, + "vs-10060: no so much bytes %lu (needed %lu)", + (unsigned long)ih_item_len(ih), + (unsigned long)bytes_or_entries); + + /* change first item key of the DEST */ + if (is_direct_le_ih(dih)) { + RFALSE(le_ih_k_offset(dih) <= + (unsigned long)bytes_or_entries, + "vs-10070: dih %h, bytes_or_entries(%d)", dih, + bytes_or_entries); + set_le_ih_k_offset(dih, + le_ih_k_offset(dih) - + bytes_or_entries); + } else { + RFALSE(le_ih_k_offset(dih) <= + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size, + "vs-10080: dih %h, bytes_or_entries(%d)", + dih, + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size); + set_le_ih_k_offset(dih, + le_ih_k_offset(dih) - + ((bytes_or_entries / UNFM_P_SIZE) * + dest->b_size)); + } + } + + leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries, + B_I_PITEM(src, + ih) + ih_item_len(ih) - bytes_or_entries, + 0); + return 1; +} + +/* copy cpy_mun items from buffer src to buffer dest + * 
last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest + */ +static void leaf_copy_items_entirely(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int first, int cpy_num) +{ + struct buffer_head *dest; + int nr, free_space; + int dest_before; + int last_loc, last_inserted_loc, location; + int i, j; + struct block_head *blkh; + struct item_head *ih; + + RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST, + "vs-10090: bad last_first parameter %d", last_first); + RFALSE(B_NR_ITEMS(src) - first < cpy_num, + "vs-10100: too few items in source %d, required %d from %d", + B_NR_ITEMS(src), cpy_num, first); + RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items"); + RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items"); + + dest = dest_bi->bi_bh; + + RFALSE(!dest, "vs-10130: can not copy negative amount of items"); + + if (cpy_num == 0) + return; + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */ + dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; + + /* location of head of first new item */ + ih = B_N_PITEM_HEAD(dest, dest_before); + + RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE, + "vs-10140: not enough free space for headers %d (needed %d)", + B_FREE_SPACE(dest), cpy_num * IH_SIZE); + + /* prepare space for headers */ + memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE); + + /* copy item headers */ + memcpy(ih, B_N_PITEM_HEAD(src, first), cpy_num * IH_SIZE); + + free_space -= (IH_SIZE * cpy_num); + set_blkh_free_space(blkh, free_space); + + /* location of unmovable item */ + j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1); + for (i = dest_before; i < nr + cpy_num; i++) { + location -= ih_item_len(ih + i - dest_before); + put_ih_location(ih + i - dest_before, location); + } + + /* prepare space for items */ + last_loc = ih_location(&(ih[nr + cpy_num - 1 - dest_before])); + last_inserted_loc = ih_location(&(ih[cpy_num - 1])); + + /* check free space */ + RFALSE(free_space < j - last_inserted_loc, + "vs-10150: not enough free space for items %d (needed %d)", + free_space, j - last_inserted_loc); + + memmove(dest->b_data + last_loc, + dest->b_data + last_loc + j - last_inserted_loc, + last_inserted_loc - last_loc); + + /* copy items */ + memcpy(dest->b_data + last_inserted_loc, + B_N_PITEM(src, (first + cpy_num - 1)), j - last_inserted_loc); + + /* sizes, item number */ + set_blkh_nr_item(blkh, nr + cpy_num); + set_blkh_free_space(blkh, free_space - (j - last_inserted_loc)); + + do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + RFALSE(dc_block_number(t_dc) != dest->b_blocknr, + "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu", + (long unsigned)dest->b_blocknr, + (long unsigned)dc_block_number(t_dc)); + put_dc_size(t_dc, + dc_size(t_dc) + (j - last_inserted_loc + + IH_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + } +} + +/* This function splits the (liquid) item into two items (useful when + shifting part of an item into another node.) 
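+   cpy_bytes is a byte count for direct and indirect items and an entry
+   count for directory items; passing -1 is not allowed here (see the
+   RFALSE below), whole items are handled by leaf_copy_items_entirely.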
*/ +static void leaf_item_bottle(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int item_num, int cpy_bytes) +{ + struct buffer_head *dest = dest_bi->bi_bh; + struct item_head *ih; + + RFALSE(cpy_bytes == -1, + "vs-10170: bytes == - 1 means: do not split item"); + + if (last_first == FIRST_TO_LAST) { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + ih = B_N_PITEM_HEAD(src, item_num); + if (is_direntry_le_ih(ih)) + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, + item_num, 0, cpy_bytes); + else { + struct item_head n_ih; + + /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST + part defined by 'cpy_bytes'; create new item header; change old item_header (????); + n_ih = new item_header; + */ + memcpy(&n_ih, ih, IH_SIZE); + put_ih_item_len(&n_ih, cpy_bytes); + if (is_indirect_le_ih(ih)) { + RFALSE(cpy_bytes == ih_item_len(ih) + && get_ih_free_space(ih), + "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", + (long unsigned)get_ih_free_space(ih)); + set_ih_free_space(&n_ih, 0); + } + + RFALSE(op_is_left_mergeable(&(ih->ih_key), src->b_size), + "vs-10190: bad mergeability of item %h", ih); + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih, + B_N_PITEM(src, item_num), 0); + } + } else { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + ih = B_N_PITEM_HEAD(src, item_num); + if (is_direntry_le_ih(ih)) + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + item_num, + I_ENTRY_COUNT(ih) - cpy_bytes, + cpy_bytes); + else { + struct item_head n_ih; + + /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST + part defined by 'cpy_bytes'; create new item header; + n_ih = new item_header; + */ + memcpy(&n_ih, ih, SHORT_KEY_SIZE); + + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + + if (is_direct_le_ih(ih)) { + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + ih_item_len(ih) - cpy_bytes); + set_le_ih_k_type(&n_ih, TYPE_DIRECT); + set_ih_free_space(&n_ih, MAX_US_INT); + } else { + /* indirect item */ + RFALSE(!cpy_bytes && get_ih_free_space(ih), + "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + (ih_item_len(ih) - + cpy_bytes) / UNFM_P_SIZE * + dest->b_size); + set_le_ih_k_type(&n_ih, TYPE_INDIRECT); + set_ih_free_space(&n_ih, get_ih_free_space(ih)); + } + + /* set item length */ + put_ih_item_len(&n_ih, cpy_bytes); + + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + + leaf_insert_into_buf(dest_bi, 0, &n_ih, + B_N_PITEM(src, + item_num) + + ih_item_len(ih) - cpy_bytes, 0); + } + } +} + +/* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST. + If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST. + From last item copy cpy_num bytes for regular item and cpy_num directory entries for + directory item. 
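+   e.g. cpy_num == 3, cpy_bytes == 200: two items are copied whole and
+   only 200 bytes (or, for a directory item, 200 entries) of the
+   remaining item are moved.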
*/ +static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, + int last_first, int cpy_num, int cpy_bytes) +{ + struct buffer_head *dest; + int pos, i, src_nr_item, bytes; + + dest = dest_bi->bi_bh; + RFALSE(!dest || !src, "vs-10210: !dest || !src"); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); + RFALSE(B_NR_ITEMS(src) < cpy_num, + "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src), + cpy_num); + RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num); + + if (cpy_num == 0) + return 0; + + if (last_first == FIRST_TO_LAST) { + /* copy items to left */ + pos = 0; + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */ + i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes); + cpy_num -= i; + if (cpy_num == 0) + return i; + pos += i; + if (cpy_bytes == -1) + /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num); + else { + /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num - 1); + + /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */ + leaf_item_bottle(dest_bi, src, FIRST_TO_LAST, + cpy_num + pos - 1, cpy_bytes); + } + } else { + /* copy items to right */ + src_nr_item = B_NR_ITEMS(src); + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */ + i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes); + + cpy_num -= i; + if (cpy_num == 0) + return i; + + pos = src_nr_item - cpy_num - i; + if (cpy_bytes == -1) { + /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos, cpy_num); + } else { + /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos + 1, cpy_num - 1); + + /* copy part of the item which number is pos to the begin of the DEST */ + leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos, + cpy_bytes); + } + } + return i; +} + +/* there are types of coping: from S[0] to L[0], from S[0] to R[0], + from R[0] to L[0]. 
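The helper that follows turns each shift mode into a concrete (source, destination, direction) triple. A hypothetical lookup-table rendering of the same information, condensed from the switch below (the real code fills in buffer_info structures rather than strings):

#include <stdio.h>

/* S[0] is PATH_PLAST_BUFFER(tb->tb_path), L[0]/R[0] the left/right
 * neighbors, Snew a freshly allocated leaf. */
static const struct {
	const char *mode, *src, *dest, *direction;
} toy_shift_modes[] = {
	{ "LEAF_FROM_S_TO_L",    "S[0]", "L[0]", "FIRST_TO_LAST" },
	{ "LEAF_FROM_S_TO_R",    "S[0]", "R[0]", "LAST_TO_FIRST" },
	{ "LEAF_FROM_R_TO_L",    "R[0]", "L[0]", "FIRST_TO_LAST" },
	{ "LEAF_FROM_L_TO_R",    "L[0]", "R[0]", "LAST_TO_FIRST" },
	{ "LEAF_FROM_S_TO_SNEW", "S[0]", "Snew", "LAST_TO_FIRST" },
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(toy_shift_modes) / sizeof(toy_shift_modes[0]); i++)
		printf("%-22s %s -> %s (%s)\n", toy_shift_modes[i].mode,
		       toy_shift_modes[i].src, toy_shift_modes[i].dest,
		       toy_shift_modes[i].direction);
	return 0;
}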
for each of these we have to define parent and + positions of destination and source buffers */ +static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *first_last, + struct buffer_head *Snew) +{ + memset(dest_bi, 0, sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); /* src->b_item_order */ + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[0]; + src_bi->bi_parent = tb->FR[0]; + src_bi->bi_position = get_right_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->L[0]; + src_bi->bi_parent = tb->FL[0]; + src_bi->bi_position = get_left_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_S_TO_SNEW: + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = Snew; + dest_bi->bi_parent = NULL; + dest_bi->bi_position = 0; + *first_last = LAST_TO_FIRST; + break; + + default: + reiserfs_panic(sb_from_bi(src_bi), "vs-10250", + "shift type is unknown (%d)", shift_mode); + } + RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh, + "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", + shift_mode, src_bi->bi_bh, dest_bi->bi_bh); +} + +/* copy mov_num items and mov_bytes of the (mov_num-1)th item to + neighbor. Delete them from source */ +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, + int mov_bytes, struct buffer_head *Snew) +{ + int ret_value; + struct buffer_info dest_bi, src_bi; + int first_last; + + leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi, + &first_last, Snew); + + ret_value = + leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num, + mov_bytes); + + leaf_delete_items(&src_bi, first_last, + (first_last == + FIRST_TO_LAST) ? 
0 : (B_NR_ITEMS(src_bi.bi_bh) - + mov_num), mov_num, mov_bytes); + + return ret_value; +} + +/* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1) + from S[0] to L[0] and replace the delimiting key */ +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) +{ + struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path); + int i; + + /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */ + i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); + + if (shift_num) { + if (B_NR_ITEMS(S0) == 0) { /* number of items in S[0] == 0 */ + + RFALSE(shift_bytes != -1, + "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", + shift_bytes); +#ifdef CONFIG_REISERFS_CHECK + if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { + print_cur_tb("vs-10275"); + reiserfs_panic(tb->tb_sb, "vs-10275", + "balance condition corrupted " + "(%c)", tb->tb_mode); + } +#endif + + if (PATH_H_POSITION(tb->tb_path, 1) == 0) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, 0), 0); + + } else { + /* replace lkey in CFL[0] by 0-th key from S[0]; */ + replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0); + + RFALSE((shift_bytes != -1 && + !(is_direntry_le_ih(B_N_PITEM_HEAD(S0, 0)) + && !I_ENTRY_COUNT(B_N_PITEM_HEAD(S0, 0)))) && + (!op_is_left_mergeable + (B_N_PKEY(S0, 0), S0->b_size)), + "vs-10280: item must be mergeable"); + } + } + + return i; +} + +/* CLEANING STOPPED HERE */ + +/* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */ +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) +{ + // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); + int ret_value; + + /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */ + ret_value = + leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); + + /* replace rkey in CFR[0] by the 0-th key from R[0] */ + if (shift_num) { + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + } + + return ret_value; +} + +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num); +/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. + If not. + If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of + the first item. Part defined by del_bytes. Don't delete first item header + If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of + the last item . Part defined by del_bytes. Don't delete last item header. +*/ +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, + int first, int del_num, int del_bytes) +{ + struct buffer_head *bh; + int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh); + + RFALSE(!bh, "10155: bh is not defined"); + RFALSE(del_num < 0, "10160: del_num can not be < 0. 
del_num==%d", + del_num); + RFALSE(first < 0 + || first + del_num > item_amount, + "10165: invalid number of first item to be deleted (%d) or " + "no so much items (%d) to delete (only %d)", first, + first + del_num, item_amount); + + if (del_num == 0) + return; + + if (first == 0 && del_num == item_amount && del_bytes == -1) { + make_empty_node(cur_bi); + do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0); + return; + } + + if (del_bytes == -1) + /* delete del_num items beginning from item in position first */ + leaf_delete_items_entirely(cur_bi, first, del_num); + else { + if (last_first == FIRST_TO_LAST) { + /* delete del_num-1 items beginning from item in position first */ + leaf_delete_items_entirely(cur_bi, first, del_num - 1); + + /* delete the part of the first item of the bh + do not delete item header + */ + leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes); + } else { + struct item_head *ih; + int len; + + /* delete del_num-1 items beginning from item in position first+1 */ + leaf_delete_items_entirely(cur_bi, first + 1, + del_num - 1); + + ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1); + if (is_direntry_le_ih(ih)) + /* the last item is directory */ + /* len = numbers of directory entries in this item */ + len = ih_entry_count(ih); + else + /* len = body len of item */ + len = ih_item_len(ih); + + /* delete the part of the last item of the bh + do not delete item header + */ + leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, + len - del_bytes, del_bytes); + } + } +} + +/* insert item into the leaf node in position before */ +void leaf_insert_into_buf(struct buffer_info *bi, int before, + struct item_head *inserted_item_ih, + const char *inserted_item_body, int zeros_number) +{ + struct buffer_head *bh = bi->bi_bh; + int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + char *to; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE, + "vs-10170: not enough free space in block %z, new item %h", + bh, inserted_item_ih); + RFALSE(zeros_number > ih_item_len(inserted_item_ih), + "vs-10172: zero number == %d, item length == %d", + zeros_number, ih_item_len(inserted_item_ih)); + + /* get item new item must be inserted before */ + ih = B_N_PITEM_HEAD(bh, before); + + /* prepare space for the body of new item */ + last_loc = nr ? ih_location(&(ih[nr - before - 1])) : bh->b_size; + unmoved_loc = before ? 
ih_location(ih - 1) : bh->b_size; + + memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih), + bh->b_data + last_loc, unmoved_loc - last_loc); + + to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); + memset(to, 0, zeros_number); + to += zeros_number; + + /* copy body to prepared space */ + if (inserted_item_body) + memmove(to, inserted_item_body, + ih_item_len(inserted_item_ih) - zeros_number); + else + memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); + + /* insert item header */ + memmove(ih + 1, ih, IH_SIZE * (nr - before)); + memmove(ih, inserted_item_ih, IH_SIZE); + + /* change locations */ + for (i = before; i < nr + 1; i++) { + unmoved_loc -= ih_item_len(&(ih[i - before])); + put_ih_location(&(ih[i - before]), unmoved_loc); + } + + /* sizes, free space, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, + free_space - (IH_SIZE + + ih_item_len(inserted_item_ih))); + do_balance_mark_leaf_dirty(bi->tb, bh, 1); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (IH_SIZE + + ih_item_len(inserted_item_ih))); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* paste paste_size bytes to affected_item_num-th item. + When item is a directory, this only prepare space for new entries */ +void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, + int pos_in_item, int paste_size, + const char *body, int zeros_number) +{ + struct buffer_head *bh = bi->bi_bh; + int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < paste_size, + "vs-10175: not enough free space: needed %d, available %d", + paste_size, free_space); + +#ifdef CONFIG_REISERFS_CHECK + if (zeros_number > paste_size) { + struct super_block *sb = NULL; + if (bi && bi->tb) + sb = bi->tb->tb_sb; + print_cur_tb("10177"); + reiserfs_panic(sb, "vs-10177", + "zeros_number == %d, paste_size == %d", + zeros_number, paste_size); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* item to be appended */ + ih = B_N_PITEM_HEAD(bh, affected_item_num); + + last_loc = ih_location(&(ih[nr - affected_item_num - 1])); + unmoved_loc = affected_item_num ? 
ih_location(ih - 1) : bh->b_size; + + /* prepare space */ + memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc, + unmoved_loc - last_loc); + + /* change locations */ + for (i = affected_item_num; i < nr; i++) + put_ih_location(&(ih[i - affected_item_num]), + ih_location(&(ih[i - affected_item_num])) - + paste_size); + + if (body) { + if (!is_direntry_le_ih(ih)) { + if (!pos_in_item) { + /* shift data to right */ + memmove(bh->b_data + ih_location(ih) + + paste_size, + bh->b_data + ih_location(ih), + ih_item_len(ih)); + /* paste data in the head of item */ + memset(bh->b_data + ih_location(ih), 0, + zeros_number); + memcpy(bh->b_data + ih_location(ih) + + zeros_number, body, + paste_size - zeros_number); + } else { + memset(bh->b_data + unmoved_loc - paste_size, 0, + zeros_number); + memcpy(bh->b_data + unmoved_loc - paste_size + + zeros_number, body, + paste_size - zeros_number); + } + } + } else + memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); + + put_ih_item_len(ih, ih_item_len(ih) + paste_size); + + /* change free space */ + set_blkh_free_space(blkh, free_space - paste_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + paste_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item + does not have free space, so it moves DEHs and remaining records as + necessary. Return value is size of removed part of directory item + in bytes. */ +static int leaf_cut_entries(struct buffer_head *bh, + struct item_head *ih, int from, int del_count) +{ + char *item; + struct reiserfs_de_head *deh; + int prev_record_offset; /* offset of record, that is (from-1)th */ + char *prev_record; /* */ + int cut_records_len; /* length of all removed records */ + int i; + + /* make sure, that item is directory and there are enough entries to + remove */ + RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); + RFALSE(I_ENTRY_COUNT(ih) < from + del_count, + "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d", + I_ENTRY_COUNT(ih), from, del_count); + + if (del_count == 0) + return 0; + + /* first byte of item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* first byte of remaining entries, those are BEFORE cut entries + (prev_record) and length of all removed records (cut_records_len) */ + prev_record_offset = + (from ? 
deh_location(&(deh[from - 1])) : ih_item_len(ih)); + cut_records_len = prev_record_offset /*from_record */ - + deh_location(&(deh[from + del_count - 1])); + prev_record = item + prev_record_offset; + + /* adjust locations of remaining entries */ + for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i--) + put_deh_location(&(deh[i]), + deh_location(&deh[i]) - + (DEH_SIZE * del_count)); + + for (i = 0; i < from; i++) + put_deh_location(&(deh[i]), + deh_location(&deh[i]) - (DEH_SIZE * del_count + + cut_records_len)); + + put_ih_entry_count(ih, ih_entry_count(ih) - del_count); + + /* shift entry head array and entries those are AFTER removed entries */ + memmove((char *)(deh + from), + deh + from + del_count, + prev_record - cut_records_len - (char *)(deh + from + + del_count)); + + /* shift records, those are BEFORE removed entries */ + memmove(prev_record - cut_records_len - DEH_SIZE * del_count, + prev_record, item + ih_item_len(ih) - prev_record); + + return DEH_SIZE * del_count + cut_records_len; +} + +/* when cut item is part of regular file + pos_in_item - first byte that must be cut + cut_size - number of bytes to be cut beginning from pos_in_item + + when cut item is part of directory + pos_in_item - number of first deleted entry + cut_size - count of deleted entries + */ +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, + int pos_in_item, int cut_size) +{ + int nr; + struct buffer_head *bh = bi->bi_bh; + struct block_head *blkh; + struct item_head *ih; + int last_loc, unmoved_loc; + int i; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + /* item head of truncated item */ + ih = B_N_PITEM_HEAD(bh, cut_item_num); + + if (is_direntry_le_ih(ih)) { + /* first cut entry () */ + cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size); + if (pos_in_item == 0) { + /* change key */ + RFALSE(cut_item_num, + "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", + cut_item_num); + /* change item key by key of first entry in the item */ + set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih))); + /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE); */ + } + } else { + /* item is direct or indirect */ + RFALSE(is_statdata_le_ih(ih), "10195: item is stat data"); + RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih), + "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", + (long unsigned)pos_in_item, (long unsigned)cut_size, + (long unsigned)ih_item_len(ih)); + + /* shift item body to left if cut is from the head of item */ + if (pos_in_item == 0) { + memmove(bh->b_data + ih_location(ih), + bh->b_data + ih_location(ih) + cut_size, + ih_item_len(ih) - cut_size); + + /* change key of item */ + if (is_direct_le_ih(ih)) + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + cut_size); + else { + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + (cut_size / UNFM_P_SIZE) * + bh->b_size); + RFALSE(ih_item_len(ih) == cut_size + && get_ih_free_space(ih), + "10205: invalid ih_free_space (%h)", ih); + } + } + } + + /* location of the last item */ + last_loc = ih_location(&(ih[nr - cut_item_num - 1])); + + /* location of the item, which is remaining at the same place */ + unmoved_loc = cut_item_num ? 
ih_location(ih - 1) : bh->b_size; + + /* shift */ + memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc, + unmoved_loc - last_loc - cut_size); + + /* change item length */ + put_ih_item_len(ih, ih_item_len(ih) - cut_size); + + if (is_indirect_le_ih(ih)) { + if (pos_in_item) + set_ih_free_space(ih, 0); + } + + /* change locations */ + for (i = cut_item_num; i < nr; i++) + put_ih_location(&(ih[i - cut_item_num]), + ih_location(&ih[i - cut_item_num]) + cut_size); + + /* size, free space */ + set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) - cut_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* delete del_num items from buffer starting from the first'th item */ +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num) +{ + struct buffer_head *bh = bi->bi_bh; + int nr; + int i, j; + int last_loc, last_removed_loc; + struct block_head *blkh; + struct item_head *ih; + + RFALSE(bh == NULL, "10210: buffer is 0"); + RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num); + + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + RFALSE(first < 0 || first + del_num > nr, + "10220: first=%d, number=%d, there is %d items", first, del_num, + nr); + + if (first == 0 && del_num == nr) { + /* this does not work */ + make_empty_node(bi); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + return; + } + + ih = B_N_PITEM_HEAD(bh, first); + + /* location of unmovable item */ + j = (first == 0) ? bh->b_size : ih_location(ih - 1); + + /* delete items */ + last_loc = ih_location(&(ih[nr - 1 - first])); + last_removed_loc = ih_location(&(ih[del_num - 1])); + + memmove(bh->b_data + last_loc + j - last_removed_loc, + bh->b_data + last_loc, last_removed_loc - last_loc); + + /* delete item headers */ + memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE); + + /* change item location */ + for (i = first; i < nr - del_num; i++) + put_ih_location(&(ih[i - first]), + ih_location(&(ih[i - first])) + (j - + last_removed_loc)); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + (j - last_removed_loc + + IH_SIZE * del_num)); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (j - last_removed_loc + + IH_SIZE * del_num)); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ +void leaf_paste_entries(struct buffer_info *bi, + int item_num, + int before, + int new_entry_count, + struct reiserfs_de_head *new_dehs, + const char *records, int paste_size) +{ + struct item_head *ih; + char *item; + struct reiserfs_de_head *deh; + char *insert_point; + int i, old_entry_num; + struct buffer_head *bh = bi->bi_bh; + + if (new_entry_count == 0) + return; + + ih = B_N_PITEM_HEAD(bh, item_num); + + /* make sure, that item is directory, and there are enough records in it */ + RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item"); + RFALSE(I_ENTRY_COUNT(ih) < before, + "10230: there are no entry we paste entries before. 
entry_count = %d, before = %d", + I_ENTRY_COUNT(ih), before); + + /* first byte of dest item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* new records will be pasted at this point */ + insert_point = + item + + (before ? deh_location(&(deh[before - 1])) + : (ih_item_len(ih) - paste_size)); + + /* adjust locations of records that will be AFTER new records */ + for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i--) + put_deh_location(&(deh[i]), + deh_location(&(deh[i])) + + (DEH_SIZE * new_entry_count)); + + /* adjust locations of records that will be BEFORE new records */ + for (i = 0; i < before; i++) + put_deh_location(&(deh[i]), + deh_location(&(deh[i])) + paste_size); + + old_entry_num = I_ENTRY_COUNT(ih); + put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); + + /* prepare space for pasted records */ + memmove(insert_point + paste_size, insert_point, + item + (ih_item_len(ih) - paste_size) - insert_point); + + /* copy new records */ + memcpy(insert_point + DEH_SIZE * new_entry_count, records, + paste_size - DEH_SIZE * new_entry_count); + + /* prepare space for new entry heads */ + deh += before; + memmove((char *)(deh + new_entry_count), deh, + insert_point - (char *)deh); + + /* copy new entry heads */ + deh = (struct reiserfs_de_head *)((char *)deh); + memcpy(deh, new_dehs, DEH_SIZE * new_entry_count); + + /* set locations of new records */ + for (i = 0; i < new_entry_count; i++) { + put_deh_location(&(deh[i]), + deh_location(&(deh[i])) + + (-deh_location + (&(new_dehs[new_entry_count - 1])) + + insert_point + DEH_SIZE * new_entry_count - + item)); + } + + /* change item key if necessary (when we paste before 0-th entry */ + if (!before) { + set_le_ih_k_offset(ih, deh_offset(new_dehs)); +/* memcpy (&ih->ih_key.k_offset, + &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ + } +#ifdef CONFIG_REISERFS_CHECK + { + int prev, next; + /* check record locations */ + deh = B_I_DEH(bh, ih); + for (i = 0; i < I_ENTRY_COUNT(ih); i++) { + next = + (i < + I_ENTRY_COUNT(ih) - + 1) ? deh_location(&(deh[i + 1])) : 0; + prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0; + + if (prev && prev <= deh_location(&(deh[i]))) + reiserfs_error(sb_from_bi(bi), "vs-10240", + "directory item (%h) " + "corrupted (prev %a, " + "cur(%d) %a)", + ih, deh + i - 1, i, deh + i); + if (next && next >= deh_location(&(deh[i]))) + reiserfs_error(sb_from_bi(bi), "vs-10250", + "directory item (%h) " + "corrupted (cur(%d) %a, " + "next %a)", + ih, i, deh + i, deh + i + 1); + } + } +#endif + +} diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c new file mode 100644 index 00000000..7df1ce48 --- /dev/null +++ b/fs/reiserfs/lock.c @@ -0,0 +1,97 @@ +#include +#include + +/* + * The previous reiserfs locking scheme was heavily based on + * the tricky properties of the Bkl: + * + * - it was acquired recursively by a same task + * - the performances relied on the release-while-schedule() property + * + * Now that we replace it by a mutex, we still want to keep the same + * recursive property to avoid big changes in the code structure. + * We use our own lock_owner here because the owner field on a mutex + * is only available in SMP or mutex debugging, also we only need this field + * for this mutex, no need for a system wide mutex facility. + * + * Also this lock is often released before a call that could block because + * reiserfs performances were partially based on the release while schedule() + * property of the Bkl. 
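The pattern described here is an owner-tracked, depth-counted mutex: only a thread that does not already own the lock takes the underlying mutex, and only the release that brings the depth back to its starting value gives it up. A user-space analogue of that idea, using pthreads and a counter that starts at zero instead of the kernel's -1 convention; the toy_* names are illustrative, not the reiserfs ones:

#include <pthread.h>
#include <stdio.h>

struct toy_lock {
	pthread_mutex_t mutex;
	pthread_t owner;
	int owner_valid;
	int depth;
};

static void toy_lock_acquire(struct toy_lock *l)
{
	/* A stale read of owner can only name some other thread (the current
	 * thread is the only writer of its own identity), so a mismatch just
	 * falls through to a normal blocking lock. */
	if (!l->owner_valid || !pthread_equal(l->owner, pthread_self())) {
		pthread_mutex_lock(&l->mutex);
		l->owner = pthread_self();
		l->owner_valid = 1;
	}
	/* only the owning thread touches depth, so no extra protection needed */
	l->depth++;
}

static void toy_lock_release(struct toy_lock *l)
{
	if (--l->depth == 0) {
		l->owner_valid = 0;
		pthread_mutex_unlock(&l->mutex);
	}
}

int main(void)
{
	struct toy_lock l = { .mutex = PTHREAD_MUTEX_INITIALIZER };

	toy_lock_acquire(&l);
	toy_lock_acquire(&l);   /* recursive acquisition: depth == 2 */
	printf("depth after two acquisitions: %d\n", l.depth);
	toy_lock_release(&l);
	toy_lock_release(&l);   /* depth back to 0, mutex released   */
	return 0;
}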
+ */ +void reiserfs_write_lock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + } + + /* No need to protect it, only the current task touches it */ + sb_i->lock_depth++; +} + +void reiserfs_write_unlock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + /* + * Are we unlocking without even holding the lock? + * Such a situation must raise a BUG() if we don't want + * to corrupt the data. + */ + BUG_ON(sb_i->lock_owner != current); + + if (--sb_i->lock_depth == -1) { + sb_i->lock_owner = NULL; + mutex_unlock(&sb_i->lock); + } +} + +/* + * If we already own the lock, just exit and don't increase the depth. + * Useful when we don't want to lock more than once. + * + * We always return the lock_depth we had before calling + * this function. + */ +int reiserfs_write_lock_once(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + return sb_i->lock_depth++; + } + + return sb_i->lock_depth; +} + +void reiserfs_write_unlock_once(struct super_block *s, int lock_depth) +{ + if (lock_depth == -1) + reiserfs_write_unlock(s); +} + +/* + * Utility function to force a BUG if it is called without the superblock + * write lock held. caller is the string printed just before calling BUG() + */ +void reiserfs_check_lock_depth(struct super_block *sb, char *caller) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + if (sb_i->lock_depth < 0) + reiserfs_panic(sb, "%s called without kernel lock held %d", + caller); +} + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *sb) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n"); +} +#endif diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c new file mode 100644 index 00000000..11866269 --- /dev/null +++ b/fs/reiserfs/namei.c @@ -0,0 +1,1562 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility + * + * Trivial Changes: + * Rights granted to Hans Reiser to redistribute under other terms providing + * he accepts all liability including but not limited to patent, fitness + * for purpose, and direct or indirect claims arising from failure to perform. + * + * NO WARRANTY + */ + +#include +#include +#include +#include +#include +#include +#include + +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } +#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); + +// directory item contains array of entry headers. This performs +// binary search through that array +static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) +{ + struct item_head *ih = de->de_ih; + struct reiserfs_de_head *deh = de->de_deh; + int rbound, lbound, j; + + lbound = 0; + rbound = I_ENTRY_COUNT(ih) - 1; + + for (j = (rbound + lbound) / 2; lbound <= rbound; + j = (rbound + lbound) / 2) { + if (off < deh_offset(deh + j)) { + rbound = j - 1; + continue; + } + if (off > deh_offset(deh + j)) { + lbound = j + 1; + continue; + } + // this is not name found, but matched third key component + de->de_entry_num = j; + return NAME_FOUND; + } + + de->de_entry_num = lbound; + return NAME_NOT_FOUND; +} + +// comment? 
maybe something like set de to point to what the path points to? +static inline void set_de_item_location(struct reiserfs_dir_entry *de, + struct treepath *path) +{ + de->de_bh = get_last_bh(path); + de->de_ih = get_ih(path); + de->de_deh = B_I_DEH(de->de_bh, de->de_ih); + de->de_item_num = PATH_LAST_POSITION(path); +} + +// de_bh, de_ih, de_deh (points to first element of array), de_item_num is set +inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) +{ + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; + + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + + de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num); + de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0); + de->de_name = B_I_PITEM(de->de_bh, de->de_ih) + deh_location(deh); + if (de->de_name[de->de_namelen - 1] == 0) + de->de_namelen = strlen(de->de_name); +} + +// what entry points to +static inline void set_de_object_key(struct reiserfs_dir_entry *de) +{ + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num])); + de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num])); +} + +static inline void store_de_entry_key(struct reiserfs_dir_entry *de) +{ + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; + + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + + /* store key of the found entry */ + de->de_entry_key.version = KEY_FORMAT_3_5; + de->de_entry_key.on_disk_key.k_dir_id = + le32_to_cpu(de->de_ih->ih_key.k_dir_id); + de->de_entry_key.on_disk_key.k_objectid = + le32_to_cpu(de->de_ih->ih_key.k_objectid); + set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh)); + set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY); +} + +/* We assign a key to each directory item, and place multiple entries +in a single directory item. A directory item has a key equal to the +key of the first directory entry in it. + +This function first calls search_by_key, then, if item whose first +entry matches is not found it looks for the entry inside directory +item found by search_by_key. Fills the path to the entry, and to the +entry position in the item + +*/ + +/* The function is NOT SCHEDULE-SAFE! */ +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, + struct treepath *path, struct reiserfs_dir_entry *de) +{ + int retval; + + retval = search_item(sb, key, path); + switch (retval) { + case ITEM_NOT_FOUND: + if (!PATH_LAST_POSITION(path)) { + reiserfs_error(sb, "vs-7000", "search_by_key " + "returned item position == 0"); + pathrelse(path); + return IO_ERROR; + } + PATH_LAST_POSITION(path)--; + + case ITEM_FOUND: + break; + + case IO_ERROR: + return retval; + + default: + pathrelse(path); + reiserfs_error(sb, "vs-7002", "no path to here"); + return IO_ERROR; + } + + set_de_item_location(de, path); + +#ifdef CONFIG_REISERFS_CHECK + if (!is_direntry_le_ih(de->de_ih) || + COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) { + print_block(de->de_bh, 0, -1, -1); + reiserfs_panic(sb, "vs-7005", "found item %h is not directory " + "item or does not belong to the same directory " + "as key %K", de->de_ih, key); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* binary search in directory item by third componen t of the + key. 
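The call that follows resolves the third key component with a conventional binary search over the sorted array of entry offsets; on a miss it still reports the position where the offset would have to be inserted, which is what the caller stores in de->de_entry_num. A standalone sketch of that contract (toy_* names are illustrative):

#include <stdio.h>

#define TOY_FOUND     1
#define TOY_NOT_FOUND 0

/* Binary search over a sorted array of entry offsets.  On a miss, *pos is
 * the index where the offset would be inserted. */
static int toy_search_offsets(const unsigned off[], int count,
			      unsigned wanted, int *pos)
{
	int lo = 0, hi = count - 1;

	while (lo <= hi) {
		int mid = (lo + hi) / 2;

		if (wanted < off[mid])
			hi = mid - 1;
		else if (wanted > off[mid])
			lo = mid + 1;
		else {
			*pos = mid;
			return TOY_FOUND;
		}
	}
	*pos = lo;
	return TOY_NOT_FOUND;
}

int main(void)
{
	unsigned off[] = { 1, 128, 300, 4096 };
	int pos;

	printf("300  -> %d at %d\n", toy_search_offsets(off, 4, 300, &pos), pos);
	printf("2000 -> %d at %d\n", toy_search_offsets(off, 4, 2000, &pos), pos);
	return 0;
}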
sets de->de_entry_num of de */ + retval = bin_search_in_dir_item(de, cpu_key_k_offset(key)); + path->pos_in_item = de->de_entry_num; + if (retval != NAME_NOT_FOUND) { + // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set + set_de_name_and_namelen(de); + set_de_object_key(de); + } + return retval; +} + +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ + +/* The third component is hashed, and you can choose from more than + one hash function. Per directory hashes are not yet implemented + but are thought about. This function should be moved to hashes.c + Jedi, please do so. -Hans */ + +static __u32 get_third_component(struct super_block *s, + const char *name, int len) +{ + __u32 res; + + if (!len || (len == 1 && name[0] == '.')) + return DOT_OFFSET; + if (len == 2 && name[0] == '.' && name[1] == '.') + return DOT_DOT_OFFSET; + + res = REISERFS_SB(s)->s_hash_function(name, len); + + // take bits from 7-th to 30-th including both bounds + res = GET_HASH_VALUE(res); + if (res == 0) + // needed to have no names before "." and ".." those have hash + // value == 0 and generation conters 1 and 2 accordingly + res = 128; + return res + MAX_GENERATION_NUMBER; +} + +static int reiserfs_match(struct reiserfs_dir_entry *de, + const char *name, int namelen) +{ + int retval = NAME_NOT_FOUND; + + if ((namelen == de->de_namelen) && + !memcmp(de->de_name, name, de->de_namelen)) + retval = + (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND : + NAME_FOUND_INVISIBLE); + + return retval; +} + +/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ + + /* used when hash collisions exist */ + +static int linear_search_in_dir_item(struct cpu_key *key, + struct reiserfs_dir_entry *de, + const char *name, int namelen) +{ + struct reiserfs_de_head *deh = de->de_deh; + int retval; + int i; + + i = de->de_entry_num; + + if (i == I_ENTRY_COUNT(de->de_ih) || + GET_HASH_VALUE(deh_offset(deh + i)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + i--; + } + + RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih), + "vs-7010: array of entry headers not found"); + + deh += i; + + for (; i >= 0; i--, deh--) { + if (GET_HASH_VALUE(deh_offset(deh)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + // hash value does not match, no need to check whole name + return NAME_NOT_FOUND; + } + + /* mark, that this generation number is used */ + if (de->de_gen_number_bit_string) + set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), + de->de_gen_number_bit_string); + + // calculate pointer to name and namelen + de->de_entry_num = i; + set_de_name_and_namelen(de); + + if ((retval = + reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) { + // de's de_name, de_namelen, de_recordlen are set. Fill the rest: + + // key of pointed object + set_de_object_key(de); + + store_de_entry_key(de); + + // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE + return retval; + } + } + + if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0) + /* we have reached left most entry in the node. In common we + have to go to the left neighbor, but if generation counter + is 0 already, we know for sure, that there is no name with + the same hash value */ + // FIXME: this work correctly only because hash value can not + // be 0. 
Btw, in case of Yura's hash it is probably possible, + // so, this is a bug + return NAME_NOT_FOUND; + + RFALSE(de->de_item_num, + "vs-7015: two diritems of the same directory in one node?"); + + return GOTO_PREVIOUS_ITEM; +} + +// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND +// FIXME: should add something like IOERROR +static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, + struct treepath *path_to_entry, + struct reiserfs_dir_entry *de) +{ + struct cpu_key key_to_search; + int retval; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return NAME_NOT_FOUND; + + /* we will search for this key in the tree */ + make_cpu_key(&key_to_search, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + while (1) { + retval = + search_by_entry_key(dir->i_sb, &key_to_search, + path_to_entry, de); + if (retval == IO_ERROR) { + reiserfs_error(dir->i_sb, "zam-7001", "io error"); + return IO_ERROR; + } + + /* compare names for all entries having given hash value */ + retval = + linear_search_in_dir_item(&key_to_search, de, name, + namelen); + if (retval != GOTO_PREVIOUS_ITEM) { + /* there is no need to scan directory anymore. Given entry found or does not exist */ + path_to_entry->pos_in_item = de->de_entry_num; + return retval; + } + + /* there is left neighboring item of this directory and given entry can be there */ + set_cpu_key_k_offset(&key_to_search, + le_ih_k_offset(de->de_ih) - 1); + pathrelse(path_to_entry); + + } /* while (1) */ +} + +static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int retval; + int lock_depth; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + + if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) + return ERR_PTR(-ENAMETOOLONG); + + /* + * Might be called with or without the write lock, must be careful + * to not recursively hold it in case we want to release the lock + * before rescheduling. + */ + lock_depth = reiserfs_write_lock_once(dir->i_sb); + + de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval == NAME_FOUND) { + inode = reiserfs_iget(dir->i_sb, + (struct cpu_key *)&(de.de_dir_id)); + if (!inode || IS_ERR(inode)) { + reiserfs_write_unlock_once(dir->i_sb, lock_depth); + return ERR_PTR(-EACCES); + } + + /* Propagate the private flag so we know we're + * in the priv tree */ + if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; + } + reiserfs_write_unlock_once(dir->i_sb, lock_depth); + if (retval == IO_ERROR) { + return ERR_PTR(-EIO); + } + + return d_splice_alias(inode, dentry); +} + +/* +** looks up the dentry of the parent directory for child. 
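The lookups above work because a directory entry offset is a packed value: the upper bits carry the name hash (bits 7 through 30, per the comment in get_third_component) and the low bits a per-hash generation counter that disambiguates collisions, which is why linear_search_in_dir_item has to compare every entry sharing one hash value. A hypothetical sketch of that packing; the masks are assumptions derived from the comment, not the kernel macros:

#include <stdio.h>

/* Assumed layout: bits 7..30 hold the hash, bits 0..6 the generation
 * number, so up to 128 names sharing one hash value can coexist. */
#define TOY_GEN_BITS   7u
#define TOY_GEN_MASK   ((1u << TOY_GEN_BITS) - 1)      /* 0x7f       */
#define TOY_HASH_MASK  (0x7fffffffu & ~TOY_GEN_MASK)   /* bits 7..30 */

static unsigned toy_make_offset(unsigned hash, unsigned generation)
{
	return (hash & TOY_HASH_MASK) | (generation & TOY_GEN_MASK);
}

int main(void)
{
	unsigned a = toy_make_offset(0x12345680u, 0);  /* first name with this hash */
	unsigned b = toy_make_offset(0x12345680u, 1);  /* colliding name, next slot */

	printf("offset a = %#x (hash %#x, gen %u)\n",
	       a, a & TOY_HASH_MASK, a & TOY_GEN_MASK);
	printf("offset b = %#x (hash %#x, gen %u)\n",
	       b, b & TOY_HASH_MASK, b & TOY_GEN_MASK);
	return 0;
}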
+** taken from ext2_get_parent +*/ +struct dentry *reiserfs_get_parent(struct dentry *child) +{ + int retval; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + struct inode *dir = child->d_inode; + + if (dir->i_nlink == 0) { + return ERR_PTR(-ENOENT); + } + de.de_gen_number_bit_string = NULL; + + reiserfs_write_lock(dir->i_sb); + retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-ENOENT); + } + inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + reiserfs_write_unlock(dir->i_sb); + + return d_obtain_alias(inode); +} + +/* add entry to the directory (entry can be hidden). + +insert definition of when hidden directories are used here -Hans + + Does not mark dir inode dirty, do it after successesfull call to it */ + +static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, + struct inode *dir, const char *name, int namelen, + struct inode *inode, int visible) +{ + struct cpu_key entry_key; + struct reiserfs_de_head *deh; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); + int gen_number; + char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc + if we create file with short name */ + char *buffer; + int buflen, paste_size; + int retval; + + BUG_ON(!th->t_trans_id); + + /* cannot allow items to be added into a busy deleted directory */ + if (!namelen) + return -EINVAL; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return -ENAMETOOLONG; + + /* each entry has unique key. compose it */ + make_cpu_key(&entry_key, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + /* get memory for composing the entry */ + buflen = DEH_SIZE + ROUND_UP(namelen); + if (buflen > sizeof(small_buf)) { + buffer = kmalloc(buflen, GFP_NOFS); + if (!buffer) + return -ENOMEM; + } else + buffer = small_buf; + + paste_size = + (get_inode_sd_version(dir) == + STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen; + + /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */ + deh = (struct reiserfs_de_head *)buffer; + deh->deh_location = 0; /* JDM Endian safe if 0 */ + put_deh_offset(deh, cpu_key_k_offset(&entry_key)); + deh->deh_state = 0; /* JDM Endian safe if 0 */ + /* put key (ino analog) to de */ + deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; /* safe: k_dir_id is le */ + deh->deh_objectid = INODE_PKEY(inode)->k_objectid; /* safe: k_objectid is le */ + + /* copy name */ + memcpy((char *)(deh + 1), name, namelen); + /* padd by 0s to the 4 byte boundary */ + padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen); + + /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */ + mark_de_without_sd(deh); + visible ? 
mark_de_visible(deh) : mark_de_hidden(deh); + + /* find the proper place for the new entry */ + memset(bit_string, 0, sizeof(bit_string)); + de.de_gen_number_bit_string = bit_string; + retval = reiserfs_find_entry(dir, name, namelen, &path, &de); + if (retval != NAME_NOT_FOUND) { + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + + if (retval == IO_ERROR) { + return -EIO; + } + + if (retval != NAME_FOUND) { + reiserfs_error(dir->i_sb, "zam-7002", + "reiserfs_find_entry() returned " + "unexpected value (%d)", retval); + } + + return -EEXIST; + } + + gen_number = + find_first_zero_bit(bit_string, + MAX_GENERATION_NUMBER + 1); + if (gen_number > MAX_GENERATION_NUMBER) { + /* there is no free generation number */ + reiserfs_warning(dir->i_sb, "reiserfs-7010", + "Congratulations! we have got hash function " + "screwed up"); + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + return -EBUSY; + } + /* adjust offset of directory enrty */ + put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number)); + set_cpu_key_k_offset(&entry_key, deh_offset(deh)); + + /* update max-hash-collisions counter in reiserfs_sb_info */ + PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number); + + if (gen_number != 0) { /* we need to re-search for the insertion point */ + if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != + NAME_NOT_FOUND) { + reiserfs_warning(dir->i_sb, "vs-7032", + "entry with this key (%K) already " + "exists", &entry_key); + + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + return -EBUSY; + } + } + + /* perform the insertion of the entry that we have prepared */ + retval = + reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, + paste_size); + if (buffer != small_buf) + kfree(buffer); + if (retval) { + reiserfs_check_path(&path); + return retval; + } + + dir->i_size += paste_size; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (!S_ISDIR(inode->i_mode) && visible) + // reiserfs_mkdir or reiserfs_rename will do that by itself + reiserfs_update_sd(th, dir); + + reiserfs_check_path(&path); + return 0; +} + +/* quota utility function, call if you've had to abort after calling +** new_inode_init, and have not called reiserfs_new_inode yet. +** This should only be called on inodes that do not have stat data +** inserted into the tree yet. +*/ +static int drop_new_inode(struct inode *inode) +{ + dquot_drop(inode); + make_bad_inode(inode); + inode->i_flags |= S_NOQUOTA; + iput(inode); + return 0; +} + +/* utility function that does setup for reiserfs_new_inode. +** dquot_initialize needs lots of credits so it's better to have it +** outside of a transaction, so we had to pull some bits of +** reiserfs_new_inode out into this func. 
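The generation-number allocation in reiserfs_add_entry() above is a small bitmap exercise: while the name search scans the colliding entries it marks every generation number already in use, and the first clear position afterwards (the kernel uses find_first_zero_bit over a DECLARE_BITMAP) becomes the slot for the new name. A user-space sketch of that allocation step, with a plain byte array standing in for the bitmap; toy_* names are illustrative:

#include <stdio.h>
#include <string.h>

#define TOY_MAX_GEN 127   /* assumed maximum generation number */

/* Return the first free generation number, or -1 if all 128 are taken. */
static int toy_pick_generation(const unsigned char used[TOY_MAX_GEN + 1])
{
	for (int gen = 0; gen <= TOY_MAX_GEN; gen++)
		if (!used[gen])
			return gen;
	return -1;   /* the "hash function screwed up" case */
}

int main(void)
{
	unsigned char used[TOY_MAX_GEN + 1];

	memset(used, 0, sizeof(used));
	used[0] = used[1] = used[3] = 1;   /* generations seen during the name search */

	printf("free generation: %d\n", toy_pick_generation(used));
	return 0;
}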
+*/ +static int new_inode_init(struct inode *inode, struct inode *dir, int mode) +{ + /* Make inode invalid - just in case we are going to drop it before + * the initialization happens */ + INODE_PKEY(inode)->k_objectid = 0; + /* the quota init calls have to know who to charge the quota to, so + ** we have to set uid and gid here + */ + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + return 0; +} + +static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int retval; + struct inode *inode; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + + dquot_initialize(dir); + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + reiserfs_write_lock(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, + inode, &security); + if (retval) + goto out_failed; + + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd(&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); + + out_failed: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + int retval; + struct inode *inode; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + dquot_initialize(dir); + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + reiserfs_write_lock(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, + inode, &security); + if (retval) { + goto out_failed; + } + + inode->i_op = &reiserfs_special_inode_operations; + 
init_special_inode(inode, inode->i_mode, rdev); + + //FIXME: needed for block and char devices only + reiserfs_update_sd(&th, inode); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd(&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); + + out_failed: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int retval; + struct inode *inode; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + int lock_depth; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + + dquot_initialize(dir); + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ + REISERFS_I(dir)->new_packing_locality = 1; +#endif + mode = S_IFDIR | mode; + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + jbegin_count += reiserfs_cache_default_acl(dir); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + lock_depth = reiserfs_write_lock_once(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + /* inc the link count now, so another writer doesn't overflow it while + ** we sleep later on. + */ + INC_DIR_INODE_NLINK(dir) + + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , + old_format_only(dir->i_sb) ? + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode, &security); + if (retval) { + DEC_DIR_INODE_NLINK(dir) + goto out_failed; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + + // note, _this_ add_entry will not update dir's stat data + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink = 0; + DEC_DIR_INODE_NLINK(dir); + reiserfs_update_sd(&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + // the above add_entry did not update dir's stat data + reiserfs_update_sd(&th, dir); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); +out_failed: + reiserfs_write_unlock_once(dir->i_sb, lock_depth); + return retval; +} + +static inline int reiserfs_empty_dir(struct inode *inode) +{ + /* we can cheat because an old format dir cannot have + ** EMPTY_DIR_SIZE, and a new format dir cannot have + ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, + ** regardless of disk format version, the directory is empty. 
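reiserfs_create(), reiserfs_mknod() and reiserfs_mkdir() above all follow one skeleton: reserve journal credits, start a transaction, create the inode, wire up its operations, add the directory entry, and on any failure undo the link and still end the transaction before returning. A schematic user-space sketch of that control flow; every toy_* function is a hypothetical stand-in and error codes are plain ints:

#include <stdio.h>

static int toy_journal_begin(void) { return 0; }
static int toy_new_inode(void)     { return 0; }
static int toy_add_entry(void)     { return -1; }  /* simulate a failure */
static int toy_journal_end(void)   { return 0; }
static void toy_drop_inode(void)   { printf("dropping half-created inode\n"); }

static int toy_create_like(void)
{
	int retval, err;

	retval = toy_journal_begin();
	if (retval)
		return retval;

	retval = toy_new_inode();
	if (retval)
		goto out;                   /* inode never existed: just end the transaction */

	retval = toy_add_entry();
	if (retval) {
		toy_drop_inode();           /* undo the link, release the inode      */
		err = toy_journal_end();    /* the transaction must still be ended   */
		return err ? err : retval;
	}
out:
	err = toy_journal_end();
	return err ? err : retval;
}

int main(void)
{
	printf("create-like path returned %d\n", toy_create_like());
	return 0;
}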
+ */ + if (inode->i_size != EMPTY_DIR_SIZE && + inode->i_size != EMPTY_DIR_SIZE_V1) { + return 0; + } + return 1; +} + +static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval, err; + struct inode *inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + + /* we will be doing 2 balancings and update 2 stat data, we change quotas + * of the owner of the directory and of the owner of the parent directory. + * The quota structure is possibly deleted only on last iput => outside + * of this transaction */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + dquot_initialize(dir); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_rmdir; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_rmdir; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_rmdir; + } + + inode = dentry->d_inode; + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_rmdir; + } + if (!reiserfs_empty_dir(inode)) { + retval = -ENOTEMPTY; + goto end_rmdir; + } + + /* cut entry from dir directory */ + retval = reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, /* page */ + 0 /*new file size - not used here */ ); + if (retval < 0) + goto end_rmdir; + + if (inode->i_nlink != 2 && inode->i_nlink != 1) + reiserfs_error(inode->i_sb, "reiserfs-7040", + "empty directory has nlink != 2 (%d)", + inode->i_nlink); + + clear_nlink(inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + DEC_DIR_INODE_NLINK(dir) + dir->i_size -= (DEH_SIZE + de.de_entrylen); + reiserfs_update_sd(&th, dir); + + /* prevent empty directory from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_check_path(&path); + out_rmdir: + reiserfs_write_unlock(dir->i_sb); + return retval; + + end_rmdir: + /* we must release path, because we did not call + reiserfs_cut_from_item, or reiserfs_cut_from_item does not + release path if operation was not complete */ + pathrelse(&path); + err = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; +} + +static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval, err; + struct inode *inode; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path); + struct reiserfs_transaction_handle th; + int jbegin_count; + unsigned long savelink; + int depth; + + dquot_initialize(dir); + + inode = dentry->d_inode; + + /* in this transaction we can be doing at max two balancings and update + * two stat datas, we change quotas of the owner of the directory and of + * the owner of the parent directory. 
The quota structure is possibly + * deleted only on iput => outside of this transaction */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + depth = reiserfs_write_lock_once(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_unlink; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_unlink; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_unlink; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_unlink; + } + + if (!inode->i_nlink) { + reiserfs_warning(inode->i_sb, "reiserfs-7042", + "deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + + drop_nlink(inode); + + /* + * we schedule before doing the add_save_link call, save the link + * count so we don't race + */ + savelink = inode->i_nlink; + + retval = + reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, + 0); + if (retval < 0) { + inc_nlink(inode); + goto end_unlink; + } + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + dir->i_size -= (de.de_entrylen + DEH_SIZE); + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, dir); + + if (!savelink) + /* prevent file from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_check_path(&path); + reiserfs_write_unlock_once(dir->i_sb, depth); + return retval; + + end_unlink: + pathrelse(&path); + err = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_check_path(&path); + if (err) + retval = err; + out_unlink: + reiserfs_write_unlock_once(dir->i_sb, depth); + return retval; +} + +static int reiserfs_symlink(struct inode *parent_dir, + struct dentry *dentry, const char *symname) +{ + int retval; + struct inode *inode; + char *name; + int item_len; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + int mode = S_IFLNK | S_IRWXUGO; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); + + dquot_initialize(parent_dir); + + if (!(inode = new_inode(parent_dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, parent_dir, mode); + + retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, + &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + + reiserfs_write_lock(parent_dir->i_sb); + item_len = ROUND_UP(strlen(symname)); + if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) { + retval = -ENAMETOOLONG; + drop_new_inode(inode); + goto out_failed; + } + + name = kmalloc(item_len, GFP_NOFS); + if (!name) { + drop_new_inode(inode); + retval = -ENOMEM; + goto out_failed; + } + memcpy(name, symname, strlen(symname)); + padd_item(name, item_len, strlen(symname)); + + retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + kfree(name); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, parent_dir, mode, name, 
strlen(symname), + dentry, inode, &security); + kfree(name); + if (retval) { /* reiserfs_new_inode iputs for us */ + goto out_failed; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(parent_dir); + + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + // must be sure this inode is written with this transaction + // + //reiserfs_update_sd (&th, inode, READ_BLOCKS); + + retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd(&th, inode); + err = journal_end(&th, parent_dir->i_sb, jbegin_count); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + + d_instantiate(dentry, inode); + unlock_new_inode(inode); + retval = journal_end(&th, parent_dir->i_sb, jbegin_count); + out_failed: + reiserfs_write_unlock(parent_dir->i_sb); + return retval; +} + +static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + struct inode *inode = old_dentry->d_inode; + struct reiserfs_transaction_handle th; + /* We need blocks for transaction + update of quotas for the owners of the directory */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + dquot_initialize(dir); + + reiserfs_write_lock(dir->i_sb); + if (inode->i_nlink >= REISERFS_LINK_MAX) { + //FIXME: sd_nlink is 32 bit for new files + reiserfs_write_unlock(dir->i_sb); + return -EMLINK; + } + + /* inc before scheduling so reiserfs_unlink knows we are here */ + inc_nlink(inode); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + inode->i_nlink--; + reiserfs_write_unlock(dir->i_sb); + return retval; + } + + /* create new entry */ + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (retval) { + int err; + inode->i_nlink--; + err = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; + } + + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + ihold(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +/* de contains information pointing to an entry which */ +static int de_still_valid(const char *name, int len, + struct reiserfs_dir_entry *de) +{ + struct reiserfs_dir_entry tmp = *de; + + // recalculate pointer to name and name length + set_de_name_and_namelen(&tmp); + // FIXME: could check more + if (tmp.de_namelen != len || memcmp(name, de->de_name, len)) + return 0; + return 1; +} + +static int entry_points_to_object(const char *name, int len, + struct reiserfs_dir_entry *de, + struct inode *inode) +{ + if (!de_still_valid(name, len, de)) + return 0; + + if (inode) { + if (!de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(inode->i_sb, "vs-7042", + "entry must be visible"); + return (de->de_objectid == inode->i_ino) ? 
1 : 0; + } + + /* this must be added hidden entry */ + if (de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(NULL, "vs-7043", "entry must be visible"); + + return 1; +} + +/* sets key of objectid the entry has to point to */ +static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, + struct reiserfs_key *key) +{ + /* JDM These operations are endian safe - both are le */ + de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; + de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; +} + +/* + * process, that is going to call fix_nodes/do_balance must hold only + * one path. If it holds 2 or more, it can get into endless waiting in + * get_empty_nodes or its clones + */ +static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int retval; + INITIALIZE_PATH(old_entry_path); + INITIALIZE_PATH(new_entry_path); + INITIALIZE_PATH(dot_dot_entry_path); + struct item_head new_entry_ih, old_entry_ih, dot_dot_ih; + struct reiserfs_dir_entry old_de, new_de, dot_dot_de; + struct inode *old_inode, *new_dentry_inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + umode_t old_inode_mode; + unsigned long savelink = 1; + struct timespec ctime; + + /* three balancings: (1) old name removal, (2) new name insertion + and (3) maybe "save" link insertion + stat data updates: (1) old directory, + (2) new directory and (3) maybe old object stat data (when it is + directory) and (4) maybe stat data of object to which new entry + pointed initially and (5) maybe block containing ".." of + renamed directory + quota updates: two parent directories */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 5 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_inode = old_dentry->d_inode; + new_dentry_inode = new_dentry->d_inode; + + // make sure, that oldname still exists and points to an object we + // are going to rename + old_de.de_gen_number_bit_string = NULL; + reiserfs_write_lock(old_dir->i_sb); + retval = + reiserfs_find_entry(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_entry_path, + &old_de); + pathrelse(&old_entry_path); + if (retval == IO_ERROR) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOENT; + } + + old_inode_mode = old_inode->i_mode; + if (S_ISDIR(old_inode_mode)) { + // make sure, that directory being renamed has correct ".." + // and that its new parent directory has not too many links + // already + + if (new_dentry_inode) { + if (!reiserfs_empty_dir(new_dentry_inode)) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOTEMPTY; + } + } + + /* directory is renamed, its parent directory will be changed, + ** so find ".." entry + */ + dot_dot_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path, + &dot_dot_de); + pathrelse(&dot_dot_entry_path); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + /* inode number of .. 
must equal old_dir->i_ino */ + if (dot_dot_de.de_objectid != old_dir->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + } + + retval = journal_begin(&th, old_dir->i_sb, jbegin_count); + if (retval) { + reiserfs_write_unlock(old_dir->i_sb); + return retval; + } + + /* add new entry (or find the existing one) */ + retval = + reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, old_inode, 0); + if (retval == -EEXIST) { + if (!new_dentry_inode) { + reiserfs_panic(old_dir->i_sb, "vs-7050", + "new entry is found, new inode == 0"); + } + } else if (retval) { + int err = journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return err ? err : retval; + } + + reiserfs_update_inode_transaction(old_dir); + reiserfs_update_inode_transaction(new_dir); + + /* this makes it so an fsync on an open fd for the old name will + ** commit the rename operation + */ + reiserfs_update_inode_transaction(old_inode); + + if (new_dentry_inode) + reiserfs_update_inode_transaction(new_dentry_inode); + + while (1) { + // look for old name using corresponding entry key (found by reiserfs_find_entry) + if ((retval = + search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key, + &old_entry_path, + &old_de)) != NAME_FOUND) { + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&old_entry_ih, get_ih(&old_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1); + + // look for new name by reiserfs_find_entry + new_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, &new_entry_path, + &new_de); + // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from + // reiserfs_add_entry above, and we'll catch any i/o errors before we get here. + if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&new_entry_ih, get_ih(&new_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); + + if (S_ISDIR(old_inode->i_mode)) { + if ((retval = + search_by_entry_key(new_dir->i_sb, + &dot_dot_de.de_entry_key, + &dot_dot_entry_path, + &dot_dot_de)) != NAME_FOUND) { + pathrelse(&dot_dot_entry_path); + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + copy_item_head(&dot_dot_ih, + get_ih(&dot_dot_entry_path)); + // node containing ".." gets into transaction + reiserfs_prepare_for_journal(old_inode->i_sb, + dot_dot_de.de_bh, 1); + } + /* we should check seals here, not do + this stuff, yes? Then, having + gathered everything into RAM we + should lock the buffers, yes? -Hans */ + /* probably. our rename needs to hold more + ** than one path at once. The seals would + ** have to be written to deal with multi-path + ** issues -chris + */ + /* sanity checking before doing the rename - avoid races many + ** of the above checks could have scheduled. We have to be + ** sure our items haven't been shifted by another process. 
+ */ + if (item_moved(&new_entry_ih, &new_entry_path) || + !entry_points_to_object(new_dentry->d_name.name, + new_dentry->d_name.len, + &new_de, new_dentry_inode) || + item_moved(&old_entry_ih, &old_entry_path) || + !entry_points_to_object(old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de, old_inode)) { + reiserfs_restore_prepared_buffer(old_inode->i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode->i_sb, + old_de.de_bh); + if (S_ISDIR(old_inode_mode)) + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. + de_bh); + continue; + } + if (S_ISDIR(old_inode_mode)) { + if (item_moved(&dot_dot_ih, &dot_dot_entry_path) || + !entry_points_to_object("..", 2, &dot_dot_de, + old_dir)) { + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + old_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. + de_bh); + continue; + } + } + + RFALSE(S_ISDIR(old_inode_mode) && + !buffer_journal_prepared(dot_dot_de.de_bh), ""); + + break; + } + + /* ok, all the changes can be done in one fell swoop when we + have claimed all the buffers needed. */ + + mark_de_visible(new_de.de_deh + new_de.de_entry_num); + set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode)); + journal_mark_dirty(&th, old_dir->i_sb, new_de.de_bh); + + mark_de_hidden(old_de.de_deh + old_de.de_entry_num); + journal_mark_dirty(&th, old_dir->i_sb, old_de.de_bh); + ctime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + /* thanks to Alex Adriaanse for patch which adds ctime update of + renamed object */ + old_inode->i_ctime = ctime; + + if (new_dentry_inode) { + // adjust link number of the victim + if (S_ISDIR(new_dentry_inode->i_mode)) { + clear_nlink(new_dentry_inode); + } else { + drop_nlink(new_dentry_inode); + } + new_dentry_inode->i_ctime = ctime; + savelink = new_dentry_inode->i_nlink; + } + + if (S_ISDIR(old_inode_mode)) { + /* adjust ".." of renamed directory */ + set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); + journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh); + + if (!new_dentry_inode) + /* there (in new_dir) was no directory, so it got new link + (".." of renamed directory) */ + INC_DIR_INODE_NLINK(new_dir); + + /* old directory lost one link - ".. " of renamed directory */ + DEC_DIR_INODE_NLINK(old_dir); + } + // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse + pathrelse(&new_entry_path); + pathrelse(&dot_dot_entry_path); + + // FIXME: this reiserfs_cut_from_item's return value may screw up + // anybody, but it will panic if will not be able to find the + // entry. This needs one more clean up + if (reiserfs_cut_from_item + (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, + 0) < 0) + reiserfs_error(old_dir->i_sb, "vs-7060", + "couldn't not cut old name. Fsck later?"); + + old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; + + reiserfs_update_sd(&th, old_dir); + reiserfs_update_sd(&th, new_dir); + reiserfs_update_sd(&th, old_inode); + + if (new_dentry_inode) { + if (savelink == 0) + add_save_link(&th, new_dentry_inode, + 0 /* not truncate */ ); + reiserfs_update_sd(&th, new_dentry_inode); + } + + retval = journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return retval; +} + +/* + * directories can handle most operations... 
+ */ +const struct inode_operations reiserfs_dir_inode_operations = { + //&reiserfs_dir_operations, /* default_file_ops */ + .create = reiserfs_create, + .lookup = reiserfs_lookup, + .link = reiserfs_link, + .unlink = reiserfs_unlink, + .symlink = reiserfs_symlink, + .mkdir = reiserfs_mkdir, + .rmdir = reiserfs_rmdir, + .mknod = reiserfs_mknod, + .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, +}; + +/* + * symlink operations.. same as page_symlink_inode_operations, with xattr + * stuff added + */ +const struct inode_operations reiserfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + +}; + +/* + * special file operations.. just xattr/acl stuff + */ +const struct inode_operations reiserfs_special_inode_operations = { + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + +}; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c new file mode 100644 index 00000000..3a6de810 --- /dev/null +++ b/fs/reiserfs/objectid.c @@ -0,0 +1,203 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include + +// find where objectid map starts +#define objectid_map(s,rs) (old_format_only (s) ? \ + (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ + (__le32 *)((rs) + 1)) + +#ifdef CONFIG_REISERFS_CHECK + +static void check_objectid_map(struct super_block *s, __le32 * map) +{ + if (le32_to_cpu(map[0]) != 1) + reiserfs_panic(s, "vs-15010", "map corrupted: %lx", + (long unsigned int)le32_to_cpu(map[0])); + + // FIXME: add something else here +} + +#else +static void check_objectid_map(struct super_block *s, __le32 * map) +{; +} +#endif + +/* When we allocate objectids we allocate the first unused objectid. + Each sequence of objectids in use (the odd sequences) is followed + by a sequence of objectids not in use (the even sequences). We + only need to record the last objectid in each of these sequences + (both the odd and even sequences) in order to fully define the + boundaries of the sequences. A consequence of allocating the first + objectid not in use is that under most conditions this scheme is + extremely compact. The exception is immediately after a sequence + of operations which deletes a large number of objects of + non-sequential objectids, and even then it will become compact + again as soon as more objects are created. Note that many + interesting optimizations of layout could result from complicating + objectid assignment, but we have deferred making them for now. 
*/ + +/* get unique object identifier */ +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + __u32 unused_objectid; + + BUG_ON(!th->t_trans_id); + + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + /* comment needed -Hans */ + unused_objectid = le32_to_cpu(map[1]); + if (unused_objectid == U32_MAX) { + reiserfs_warning(s, "reiserfs-15100", "no more object ids"); + reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)); + return 0; + } + + /* This incrementation allocates the first unused objectid. That + is to say, the first entry on the objectid map is the first + unused objectid, and by incrementing it we use it. See below + where we check to see if we eliminated a sequence of unused + objectids.... */ + map[1] = cpu_to_le32(unused_objectid + 1); + + /* Now we check to see if we eliminated the last remaining member of + the first even sequence (and can eliminate the sequence by + eliminating its last objectid from oids), and can collapse the + first two odd sequences into one sequence. If so, then the net + result is to eliminate a pair of objectids from oids. We do this + by shifting the entire map to the left. */ + if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { + memmove(map + 1, map + 3, + (sb_oid_cursize(rs) - 3) * sizeof(__u32)); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + } + + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + return unused_objectid; +} + +/* makes object identifier unused */ +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, + __u32 objectid_to_release) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + int i = 0; + + BUG_ON(!th->t_trans_id); + //return; + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + + /* start at the beginning of the objectid map (i = 0) and go to + the end of it (i = disk_sb->s_oid_cursize). Linear search is + what we use, though it is possible that binary search would be + more efficient after performing lots of deletions (which is + when oids is large.) We only check even i's. */ + while (i < sb_oid_cursize(rs)) { + if (objectid_to_release == le32_to_cpu(map[i])) { + /* This incrementation unallocates the objectid. */ + //map[i]++; + le32_add_cpu(&map[i], 1); + + /* Did we unallocate the last member of an odd sequence, and can shrink oids? 
*/ + if (map[i] == map[i + 1]) { + /* shrink objectid map */ + memmove(map + i, map + i + 2, + (sb_oid_cursize(rs) - i - + 2) * sizeof(__u32)); + //disk_sb->s_oid_cursize -= 2; + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + + RFALSE(sb_oid_cursize(rs) < 2 || + sb_oid_cursize(rs) > sb_oid_maxsize(rs), + "vs-15005: objectid map corrupted cur_size == %d (max == %d)", + sb_oid_cursize(rs), sb_oid_maxsize(rs)); + } + return; + } + + if (objectid_to_release > le32_to_cpu(map[i]) && + objectid_to_release < le32_to_cpu(map[i + 1])) { + /* size of objectid map is not changed */ + if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) { + //objectid_map[i+1]--; + le32_add_cpu(&map[i + 1], -1); + return; + } + + /* JDM comparing two little-endian values for equality -- safe */ + if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { + /* objectid map must be expanded, but there is no space */ + PROC_INFO_INC(s, leaked_oid); + return; + } + + /* expand the objectid map */ + memmove(map + i + 3, map + i + 1, + (sb_oid_cursize(rs) - i - 1) * sizeof(__u32)); + map[i + 1] = cpu_to_le32(objectid_to_release); + map[i + 2] = cpu_to_le32(objectid_to_release + 1); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2); + return; + } + i += 2; + } + + reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)", + (long unsigned)objectid_to_release); +} + +int reiserfs_convert_objectid_map_v1(struct super_block *s) +{ + struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s); + int cur_size = sb_oid_cursize(disk_sb); + int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; + int old_max = sb_oid_maxsize(disk_sb); + struct reiserfs_super_block_v1 *disk_sb_v1; + __le32 *objectid_map, *new_objectid_map; + int i; + + disk_sb_v1 = + (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); + objectid_map = (__le32 *) (disk_sb_v1 + 1); + new_objectid_map = (__le32 *) (disk_sb + 1); + + if (cur_size > new_size) { + /* mark everyone used that was listed as free at the end of the objectid + ** map + */ + objectid_map[new_size - 1] = objectid_map[cur_size - 1]; + set_sb_oid_cursize(disk_sb, new_size); + } + /* move the smaller objectid map past the end of the new super */ + for (i = new_size - 1; i >= 0; i--) { + objectid_map[i + (old_max - new_size)] = objectid_map[i]; + } + + /* set the max size so we don't overflow later */ + set_sb_oid_maxsize(disk_sb, new_size); + + /* Zero out label and generate random UUID */ + memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)); + generate_random_uuid(disk_sb->s_uuid); + + /* finally, zero out the unused chunk of the new super */ + memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)); + return 0; +} diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c new file mode 100644 index 00000000..45de98b5 --- /dev/null +++ b/fs/reiserfs/prints.c @@ -0,0 +1,768 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include +#include + +#include + +static char error_buf[1024]; +static char fmt_buf[1024]; +static char off_buf[80]; + +static char *reiserfs_cpu_offset(struct cpu_key *key) +{ + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + sprintf(off_buf, "%Lu(%Lu)", + (unsigned long long) + GET_HASH_VALUE(cpu_key_k_offset(key)), + (unsigned long long) + GET_GENERATION_NUMBER(cpu_key_k_offset(key))); + else + sprintf(off_buf, "0x%Lx", + (unsigned long long)cpu_key_k_offset(key)); + return off_buf; +} + +static char *le_offset(struct reiserfs_key *key) +{ + int version; + + version = 
le_key_version(key); + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + sprintf(off_buf, "%Lu(%Lu)", + (unsigned long long) + GET_HASH_VALUE(le_key_k_offset(version, key)), + (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset(version, key))); + else + sprintf(off_buf, "0x%Lx", + (unsigned long long)le_key_k_offset(version, key)); + return off_buf; +} + +static char *cpu_type(struct cpu_key *key) +{ + if (cpu_key_k_type(key) == TYPE_STAT_DATA) + return "SD"; + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + return "DIR"; + if (cpu_key_k_type(key) == TYPE_DIRECT) + return "DIRECT"; + if (cpu_key_k_type(key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + +static char *le_type(struct reiserfs_key *key) +{ + int version; + + version = le_key_version(key); + + if (le_key_k_type(version, key) == TYPE_STAT_DATA) + return "SD"; + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + return "DIR"; + if (le_key_k_type(version, key) == TYPE_DIRECT) + return "DIRECT"; + if (le_key_k_type(version, key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + +/* %k */ +static void sprintf_le_key(char *buf, struct reiserfs_key *key) +{ + if (key) + sprintf(buf, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id), + le32_to_cpu(key->k_objectid), le_offset(key), + le_type(key)); + else + sprintf(buf, "[NULL]"); +} + +/* %K */ +static void sprintf_cpu_key(char *buf, struct cpu_key *key) +{ + if (key) + sprintf(buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, + key->on_disk_key.k_objectid, reiserfs_cpu_offset(key), + cpu_type(key)); + else + sprintf(buf, "[NULL]"); +} + +static void sprintf_de_head(char *buf, struct reiserfs_de_head *deh) +{ + if (deh) + sprintf(buf, + "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", + deh_offset(deh), deh_dir_id(deh), deh_objectid(deh), + deh_location(deh), deh_state(deh)); + else + sprintf(buf, "[NULL]"); + +} + +static void sprintf_item_head(char *buf, struct item_head *ih) +{ + if (ih) { + strcpy(buf, + (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); + sprintf_le_key(buf + strlen(buf), &(ih->ih_key)); + sprintf(buf + strlen(buf), ", item_len %d, item_location %d, " + "free_space(entry_count) %d", + ih_item_len(ih), ih_location(ih), ih_free_space(ih)); + } else + sprintf(buf, "[NULL]"); +} + +static void sprintf_direntry(char *buf, struct reiserfs_dir_entry *de) +{ + char name[20]; + + memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); + name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0; + sprintf(buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); +} + +static void sprintf_block_head(char *buf, struct buffer_head *bh) +{ + sprintf(buf, "level=%d, nr_items=%d, free_space=%d rdkey ", + B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); +} + +static void sprintf_buffer_head(char *buf, struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + sprintf(buf, + "dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bdevname(bh->b_bdev, b), bh->b_size, + (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), + bh->b_state, bh->b_page, + buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", + buffer_dirty(bh) ? "DIRTY" : "CLEAN", + buffer_locked(bh) ? 
"LOCKED" : "UNLOCKED"); +} + +static void sprintf_disk_child(char *buf, struct disk_child *dc) +{ + sprintf(buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), + dc_size(dc)); +} + +static char *is_there_reiserfs_struct(char *fmt, int *what) +{ + char *k = fmt; + + while ((k = strchr(k, '%')) != NULL) { + if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || + k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') { + *what = k[1]; + break; + } + k++; + } + return k; +} + +/* debugging reiserfs we used to print out a lot of different + variables, like keys, item headers, buffer heads etc. Values of + most fields matter. So it took a long time just to write + appropriative printk. With this reiserfs_warning you can use format + specification for complex structures like you used to do with + printfs for integers, doubles and pointers. For instance, to print + out key structure you have to write just: + reiserfs_warning ("bad key %k", key); + instead of + printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, + key->k_offset, key->k_uniqueness); +*/ +static DEFINE_SPINLOCK(error_lock); +static void prepare_error_buf(const char *fmt, va_list args) +{ + char *fmt1 = fmt_buf; + char *k; + char *p = error_buf; + int what; + + spin_lock(&error_lock); + + strcpy(fmt1, fmt); + + while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) { + *k = 0; + + p += vsprintf(p, fmt1, args); + + switch (what) { + case 'k': + sprintf_le_key(p, va_arg(args, struct reiserfs_key *)); + break; + case 'K': + sprintf_cpu_key(p, va_arg(args, struct cpu_key *)); + break; + case 'h': + sprintf_item_head(p, va_arg(args, struct item_head *)); + break; + case 't': + sprintf_direntry(p, + va_arg(args, + struct reiserfs_dir_entry *)); + break; + case 'y': + sprintf_disk_child(p, + va_arg(args, struct disk_child *)); + break; + case 'z': + sprintf_block_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'b': + sprintf_buffer_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'a': + sprintf_de_head(p, + va_arg(args, + struct reiserfs_de_head *)); + break; + } + + p += strlen(p); + fmt1 = k + 2; + } + vsprintf(p, fmt1, args); + spin_unlock(&error_lock); + +} + +/* in addition to usual conversion specifiers this accepts reiserfs + specific conversion specifiers: + %k to print little endian key, + %K to print cpu key, + %h to print item_head, + %t to print directory entry + %z to print block head (arg must be struct buffer_head * + %b to print buffer_head +*/ + +#define do_reiserfs_warning(fmt)\ +{\ + va_list args;\ + va_start( args, fmt );\ + prepare_error_buf( fmt, args );\ + va_end( args );\ +} + +void __reiserfs_warning(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: " + "%s\n", sb->s_id, id ? id : "", id ? " " : "", + function, error_buf); + else + printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); +} + +/* No newline.. reiserfs_info calls can be followed by printk's */ +void reiserfs_info(struct super_block *sb, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_NOTICE "REISERFS (device %s): %s", + sb->s_id, error_buf); + else + printk(KERN_NOTICE "REISERFS %s:", error_buf); +} + +/* No newline.. reiserfs_printk calls can be followed by printk's */ +static void reiserfs_printk(const char *fmt, ...) 
+{ + do_reiserfs_warning(fmt); + printk(error_buf); +} + +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) +{ +#ifdef CONFIG_REISERFS_CHECK + do_reiserfs_warning(fmt); + if (s) + printk(KERN_DEBUG "REISERFS debug (device %s): %s\n", + s->s_id, error_buf); + else + printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf); +#endif +} + +/* The format: + + maintainer-errorid: [function-name:] message + + where errorid is unique to the maintainer and function-name is + optional, is recommended, so that anyone can easily find the bug + with a simple grep for the short to type string + maintainer-errorid. Don't bother with reusing errorids, there are + lots of numbers out there. + + Example: + + reiserfs_panic( + p_sb, "reiser-29: reiserfs_new_blocknrs: " + "one of search_start or rn(%d) is equal to MAX_B_NUM," + "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", + rn, bh + ); + + Regular panic()s sometimes clear the screen before the message can + be read, thus the need for the while loop. + + Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it + pointless complexity): + + panics in reiserfs_fs.h have numbers from 1000 to 1999 + super.c 2000 to 2999 + preserve.c (unused) 3000 to 3999 + bitmap.c 4000 to 4999 + stree.c 5000 to 5999 + prints.c 6000 to 6999 + namei.c 7000 to 7999 + fix_nodes.c 8000 to 8999 + dir.c 9000 to 9999 + lbalance.c 10000 to 10999 + ibalance.c 11000 to 11999 not ready + do_balan.c 12000 to 12999 + inode.c 13000 to 13999 + file.c 14000 to 14999 + objectid.c 15000 - 15999 + buffer.c 16000 - 16999 + symlink.c 17000 - 17999 + + . */ + +void __reiserfs_panic(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + +#ifdef CONFIG_REISERFS_CHECK + dump_stack(); +#endif + if (sb) + panic(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", + sb->s_id, id ? id : "", id ? " " : "", + function, error_buf); + else + panic(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); +} + +void __reiserfs_error(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + + BUG_ON(sb == NULL); + + if (reiserfs_error_panic(sb)) + __reiserfs_panic(sb, id, function, error_buf); + + if (id && id[0]) + printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n", + sb->s_id, id, function, error_buf); + else + printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + if (sb->s_flags & MS_RDONLY) + return; + + reiserfs_info(sb, "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + reiserfs_abort_journal(sb, -EIO); +} + +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) 
+{ + do_reiserfs_warning(fmt); + + if (reiserfs_error_panic(sb)) { + panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id, + error_buf); + } + + if (reiserfs_is_journal_aborted(SB_JOURNAL(sb))) + return; + + printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, + error_buf); + + sb->s_flags |= MS_RDONLY; + reiserfs_abort_journal(sb, errno); +} + +/* this prints internal nodes (4 keys/items in line) (dc_number, + dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, + dc_size)...*/ +static int print_internal(struct buffer_head *bh, int first, int last) +{ + struct reiserfs_key *key; + struct disk_child *dc; + int i; + int from, to; + + if (!B_IS_KEYS_LEVEL(bh)) + return 1; + + check_internal(bh); + + if (first == -1) { + from = 0; + to = B_NR_ITEMS(bh); + } else { + from = first; + to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh); + } + + reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + dc = B_N_CHILD(bh, from); + reiserfs_printk("PTR %d: %y ", from, dc); + + for (i = from, key = B_N_PDELIM_KEY(bh, from), dc++; i < to; + i++, key++, dc++) { + reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); + if (i && i % 4 == 0) + printk("\n"); + } + printk("\n"); + return 0; +} + +static int print_leaf(struct buffer_head *bh, int print_mode, int first, + int last) +{ + struct block_head *blkh; + struct item_head *ih; + int i, nr; + int from, to; + + if (!B_IS_ITEMS_LEVEL(bh)) + return 1; + + check_leaf(bh); + + blkh = B_BLK_HEAD(bh); + ih = B_N_PITEM_HEAD(bh, 0); + nr = blkh_nr_item(blkh); + + printk + ("\n===================================================================\n"); + reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + if (!(print_mode & PRINT_LEAF_ITEMS)) { + reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", + &(ih->ih_key), &((ih + nr - 1)->ih_key)); + return 0; + } + + if (first < 0 || first > nr - 1) + from = 0; + else + from = first; + + if (last < 0 || last > nr) + to = nr; + else + to = last; + + ih += from; + printk + ("-------------------------------------------------------------------------------\n"); + printk + ("|##| type | key | ilen | free_space | version | loc |\n"); + for (i = from; i < to; i++, ih++) { + printk + ("-------------------------------------------------------------------------------\n"); + reiserfs_printk("|%2d| %h |\n", i, ih); + if (print_mode & PRINT_LEAF_ITEMS) + op_print_item(ih, B_I_PITEM(bh, ih)); + } + + printk + ("===================================================================\n"); + + return 0; +} + +char *reiserfs_hashname(int code) +{ + if (code == YURA_HASH) + return "rupasov"; + if (code == TEA_HASH) + return "tea"; + if (code == R5_HASH) + return "r5"; + + return "unknown"; +} + +/* return 1 if this is not super block */ +static int print_super_block(struct buffer_head *bh) +{ + struct reiserfs_super_block *rs = + (struct reiserfs_super_block *)(bh->b_data); + int skipped, data_blocks; + char *version; + char b[BDEVNAME_SIZE]; + + if (is_reiserfs_3_5(rs)) { + version = "3.5"; + } else if (is_reiserfs_3_6(rs)) { + version = "3.6"; + } else if (is_reiserfs_jr(rs)) { + version = ((sb_version(rs) == REISERFS_VERSION_2) ? 
+ "3.6" : "3.5"); + } else { + return 1; + } + + printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); + printk("Reiserfs version %s\n", version); + printk("Block count %u\n", sb_block_count(rs)); + printk("Blocksize %d\n", sb_blocksize(rs)); + printk("Free blocks %u\n", sb_free_blocks(rs)); + // FIXME: this would be confusing if + // someone stores reiserfs super block in some data block ;) +// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); + skipped = bh->b_blocknr; + data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - + (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + + 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs); + printk + ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" + "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs), + (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) : + sb_reserved_for_journal(rs)), data_blocks); + printk("Root block %u\n", sb_root_block(rs)); + printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); + printk("Journal dev %d\n", sb_jp_journal_dev(rs)); + printk("Journal orig size %d\n", sb_jp_journal_size(rs)); + printk("FS state %d\n", sb_fs_state(rs)); + printk("Hash function \"%s\"\n", + reiserfs_hashname(sb_hash_function_code(rs))); + + printk("Tree height %d\n", sb_tree_height(rs)); + return 0; +} + +static int print_desc_block(struct buffer_head *bh) +{ + struct reiserfs_journal_desc *desc; + + if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8)) + return 1; + + desc = (struct reiserfs_journal_desc *)(bh->b_data); + printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", + (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc), + get_desc_mount_id(desc), get_desc_trans_len(desc)); + + return 0; +} + +void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int last) +{ + va_list args; + int mode, first, last; + + if (!bh) { + printk("print_block: buffer is NULL\n"); + return; + } + + va_start(args, bh); + + mode = va_arg(args, int); + first = va_arg(args, int); + last = va_arg(args, int); + if (print_leaf(bh, mode, first, last)) + if (print_internal(bh, first, last)) + if (print_super_block(bh)) + if (print_desc_block(bh)) + printk + ("Block %llu contains unformatted data\n", + (unsigned long long)bh->b_blocknr); + + va_end(args); +} + +static char print_tb_buf[2048]; + +/* this stores initial state of tree balance in the print_tb_buf */ +void store_print_tb(struct tree_balance *tb) +{ + int h = 0; + int i; + struct buffer_head *tbSh, *tbFh; + + if (!tb) + return; + + sprintf(print_tb_buf, "\n" + "BALANCING %d\n" + "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" + "=====================================================================\n" + "* h * S * L * R * F * FL * FR * CFL * CFR *\n", + REISERFS_SB(tb->tb_sb)->s_do_balance, + tb->tb_mode, PATH_LAST_POSITION(tb->tb_path), + tb->tb_path->pos_in_item); + + for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) { + if (PATH_H_PATH_OFFSET(tb->tb_path, h) <= + tb->tb_path->path_length + && PATH_H_PATH_OFFSET(tb->tb_path, + h) > ILLEGAL_PATH_ELEMENT_OFFSET) { + tbSh = PATH_H_PBUFFER(tb->tb_path, h); + tbFh = PATH_H_PPARENT(tb->tb_path, h); + } else { + tbSh = NULL; + tbFh = NULL; + } + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", + h, + (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL), + (tbSh) ? atomic_read(&(tbSh->b_count)) : -1, + (tb->L[h]) ? 
(long long)(tb->L[h]->b_blocknr) : (-1LL), + (tb->L[h]) ? atomic_read(&(tb->L[h]->b_count)) : -1, + (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL), + (tb->R[h]) ? atomic_read(&(tb->R[h]->b_count)) : -1, + (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL), + (tb->FL[h]) ? (long long)(tb->FL[h]-> + b_blocknr) : (-1LL), + (tb->FR[h]) ? (long long)(tb->FR[h]-> + b_blocknr) : (-1LL), + (tb->CFL[h]) ? (long long)(tb->CFL[h]-> + b_blocknr) : (-1LL), + (tb->CFR[h]) ? (long long)(tb->CFR[h]-> + b_blocknr) : (-1LL)); + } + + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" + "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", + tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0], + tb->rbytes, tb->blknum[0], tb->s0num, tb->s1num, tb->s1bytes, + tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], + tb->rkey[0]); + + /* this prints balance parameters for non-leaf levels */ + h = 0; + do { + h++; + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %4d * %2d * * %2d * * %2d *\n", + h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], + tb->blknum[h]); + } while (tb->insert_size[h]); + + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "FEB list: "); + + /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ + h = 0; + for (i = 0; i < ARRAY_SIZE(tb->FEB); i++) + sprintf(print_tb_buf + strlen(print_tb_buf), + "%p (%llu %d)%s", tb->FEB[i], + tb->FEB[i] ? (unsigned long long)tb->FEB[i]-> + b_blocknr : 0ULL, + tb->FEB[i] ? atomic_read(&(tb->FEB[i]->b_count)) : 0, + (i == ARRAY_SIZE(tb->FEB) - 1) ? 
"\n" : ", "); + + sprintf(print_tb_buf + strlen(print_tb_buf), + "======================== the end ====================================\n"); +} + +void print_cur_tb(char *mes) +{ + printk("%s\n%s", mes, print_tb_buf); +} + +static void check_leaf_block_head(struct buffer_head *bh) +{ + struct block_head *blkh; + int nr; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, "vs-6010", "invalid item number %z", + bh); + if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr) + reiserfs_panic(NULL, "vs-6020", "invalid free space %z", + bh); + +} + +static void check_internal_block_head(struct buffer_head *bh) +{ + struct block_head *blkh; + + blkh = B_BLK_HEAD(bh); + if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) + reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); + + if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh); + + if (B_FREE_SPACE(bh) != + bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) - + DC_SIZE * (B_NR_ITEMS(bh) + 1)) + reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh); + +} + +void check_leaf(struct buffer_head *bh) +{ + int i; + struct item_head *ih; + + if (!bh) + return; + check_leaf_block_head(bh); + for (i = 0, ih = B_N_PITEM_HEAD(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) + op_check_item(ih, B_I_PITEM(bh, ih)); +} + +void check_internal(struct buffer_head *bh) +{ + if (!bh) + return; + check_internal_block_head(bh); +} + +void print_statistics(struct super_block *s) +{ + + /* + printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \ + bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", + REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes, + REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search, + REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct); + */ + +} diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c new file mode 100644 index 00000000..7a998119 --- /dev/null +++ b/fs/reiserfs/procfs.c @@ -0,0 +1,576 @@ +/* -*- linux-c -*- */ + +/* fs/reiserfs/procfs.c */ + +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* proc info support a la one created by Sizif@Botik.RU for PGC */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * LOCKING: + * + * We rely on new Alexander Viro's super-block locking. + * + */ + +static int show_version(struct seq_file *m, struct super_block *sb) +{ + char *format; + + if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { + format = "3.6"; + } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) { + format = "3.5"; + } else { + format = "unknown"; + } + + seq_printf(m, "%s format\twith checks %s\n", format, +#if defined( CONFIG_REISERFS_CHECK ) + "on" +#else + "off" +#endif + ); + return 0; +} + +#define SF( x ) ( r -> x ) +#define SFP( x ) SF( s_proc_info_data.x ) +#define SFPL( x ) SFP( x[ level ] ) +#define SFPF( x ) SFP( scan_bitmap.x ) +#define SFPJ( x ) SFP( journal.x ) + +#define D2C( x ) le16_to_cpu( x ) +#define D4C( x ) le32_to_cpu( x ) +#define DF( x ) D2C( rs -> s_v1.x ) +#define DFL( x ) D4C( rs -> s_v1.x ) + +#define objectid_map( s, rs ) (old_format_only (s) ? 
\ + (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \ + (__le32 *)(rs + 1)) +#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) + +#define DJF( x ) le32_to_cpu( rs -> x ) +#define DJV( x ) le32_to_cpu( s_v1 -> x ) +#define DJP( x ) le32_to_cpu( jp -> x ) +#define JF( x ) ( r -> s_journal -> x ) + +static int show_super(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + + seq_printf(m, "state: \t%s\n" + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" + "gen. counter: \t%i\n" + "s_disk_reads: \t%i\n" + "s_disk_writes: \t%i\n" + "s_fix_nodes: \t%i\n" + "s_do_balance: \t%i\n" + "s_unneeded_left_neighbor: \t%i\n" + "s_good_search_by_key_reada: \t%i\n" + "s_bmaps: \t%i\n" + "s_bmaps_without_search: \t%i\n" + "s_direct2indirect: \t%i\n" + "s_indirect2direct: \t%i\n" + "\n" + "max_hash_collisions: \t%i\n" + "breads: \t%lu\n" + "bread_misses: \t%lu\n" + "search_by_key: \t%lu\n" + "search_by_key_fs_changed: \t%lu\n" + "search_by_key_restarted: \t%lu\n" + "insert_item_restarted: \t%lu\n" + "paste_into_item_restarted: \t%lu\n" + "cut_from_item_restarted: \t%lu\n" + "delete_solid_item_restarted: \t%lu\n" + "delete_item_restarted: \t%lu\n" + "leaked_oid: \t%lu\n" + "leaves_removable: \t%lu\n", + SF(s_mount_state) == REISERFS_VALID_FS ? + "REISERFS_VALID_FS" : "REISERFS_ERROR_FS", + reiserfs_r5_hash(sb) ? "FORCE_R5 " : "", + reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "", + reiserfs_tea_hash(sb) ? "FORCE_TEA " : "", + reiserfs_hash_detect(sb) ? "DETECT_HASH " : "", + reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ", + reiserfs_no_unhashed_relocation(sb) ? + "NO_UNHASHED_RELOCATION " : "", + reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "", + reiserfs_test4(sb) ? "TEST4 " : "", + have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ? + "SMALL_TAILS " : "NO_TAILS ", + replay_only(sb) ? "REPLAY_ONLY " : "", + convert_reiserfs(sb) ? 
"CONV " : "", + atomic_read(&r->s_generation_counter), + SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), + SF(s_do_balance), SF(s_unneeded_left_neighbor), + SF(s_good_search_by_key_reada), SF(s_bmaps), + SF(s_bmaps_without_search), SF(s_direct2indirect), + SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads), + SFP(bread_miss), SFP(search_by_key), + SFP(search_by_key_fs_changed), SFP(search_by_key_restarted), + SFP(insert_item_restarted), SFP(paste_into_item_restarted), + SFP(cut_from_item_restarted), + SFP(delete_solid_item_restarted), SFP(delete_item_restarted), + SFP(leaked_oid), SFP(leaves_removable)); + + return 0; +} + +static int show_per_level(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + int level; + + seq_printf(m, "level\t" + " balances" + " [sbk: reads" + " fs_changed" + " restarted]" + " free space" + " items" + " can_remove" + " lnum" + " rnum" + " lbytes" + " rbytes" + " get_neig" + " get_neig_res" " need_l_neig" " need_r_neig" "\n"); + + for (level = 0; level < MAX_HEIGHT; ++level) { + seq_printf(m, "%i\t" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12li" + " %12li" + " %12li" + " %12li" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + "\n", + level, + SFPL(balance_at), + SFPL(sbk_read_at), + SFPL(sbk_fs_changed), + SFPL(sbk_restarted), + SFPL(free_at), + SFPL(items_at), + SFPL(can_node_be_removed), + SFPL(lnum), + SFPL(rnum), + SFPL(lbytes), + SFPL(rbytes), + SFPL(get_neighbors), + SFPL(get_neighbors_restart), + SFPL(need_l_neighbor), SFPL(need_r_neighbor) + ); + } + return 0; +} + +static int show_bitmap(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + + seq_printf(m, "free_block: %lu\n" + " scan_bitmap:" + " wait" + " bmap" + " retry" + " stolen" + " journal_hint" + "journal_nohint" + "\n" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + "\n", + SFP(free_block), + SFPF(call), + SFPF(wait), + SFPF(bmap), + SFPF(retry), + SFPF(stolen), + SFPF(in_journal_hint), SFPF(in_journal_nohint)); + + return 0; +} + +static int show_on_disk_super(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); + struct reiserfs_super_block *rs = sb_info->s_rs; + int hash_code = DFL(s_hash_function_code); + __u32 flags = DJF(s_flags); + + seq_printf(m, "block_count: \t%i\n" + "free_blocks: \t%i\n" + "root_block: \t%i\n" + "blocksize: \t%i\n" + "oid_maxsize: \t%i\n" + "oid_cursize: \t%i\n" + "umount_state: \t%i\n" + "magic: \t%10.10s\n" + "fs_state: \t%i\n" + "hash: \t%s\n" + "tree_height: \t%i\n" + "bmap_nr: \t%i\n" + "version: \t%i\n" + "flags: \t%x[%s]\n" + "reserved_for_journal: \t%i\n", + DFL(s_block_count), + DFL(s_free_blocks), + DFL(s_root_block), + DF(s_blocksize), + DF(s_oid_maxsize), + DF(s_oid_cursize), + DF(s_umount_state), + rs->s_v1.s_magic, + DF(s_fs_state), + hash_code == TEA_HASH ? "tea" : + (hash_code == YURA_HASH) ? "rupasov" : + (hash_code == R5_HASH) ? "r5" : + (hash_code == UNSET_HASH) ? "unset" : "unknown", + DF(s_tree_height), + DF(s_bmap_nr), + DF(s_version), flags, (flags & reiserfs_attrs_cleared) + ? 
"attrs_cleared" : "", DF(s_reserved_for_journal)); + + return 0; +} + +static int show_oidmap(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); + struct reiserfs_super_block *rs = sb_info->s_rs; + unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); + unsigned long total_used = 0; + int i; + + for (i = 0; i < mapsize; ++i) { + __u32 right; + + right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1); + seq_printf(m, "%s: [ %x .. %x )\n", + (i & 1) ? "free" : "used", MAP(i), right); + if (!(i & 1)) { + total_used += right - MAP(i); + } + } +#if defined( REISERFS_USE_OIDMAPF ) + if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { + loff_t size = sb_info->oidmap.mapf->f_path.dentry->d_inode->i_size; + total_used += size / sizeof(reiserfs_oidinterval_d_t); + } +#endif + seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", + mapsize, + mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used); + return 0; +} + +static int show_journal(struct seq_file *m, struct super_block *sb) +{ + struct reiserfs_sb_info *r = REISERFS_SB(sb); + struct reiserfs_super_block *rs = r->s_rs; + struct journal_params *jp = &rs->s_v1.s_journal; + char b[BDEVNAME_SIZE]; + + seq_printf(m, /* on-disk fields */ + "jp_journal_1st_block: \t%i\n" + "jp_journal_dev: \t%s[%x]\n" + "jp_journal_size: \t%i\n" + "jp_journal_trans_max: \t%i\n" + "jp_journal_magic: \t%i\n" + "jp_journal_max_batch: \t%i\n" + "jp_journal_max_commit_age: \t%i\n" + "jp_journal_max_trans_age: \t%i\n" + /* incore fields */ + "j_1st_reserved_block: \t%i\n" + "j_state: \t%li\n" + "j_trans_id: \t%u\n" + "j_mount_id: \t%lu\n" + "j_start: \t%lu\n" + "j_len: \t%lu\n" + "j_len_alloc: \t%lu\n" + "j_wcount: \t%i\n" + "j_bcount: \t%lu\n" + "j_first_unflushed_offset: \t%lu\n" + "j_last_flush_trans_id: \t%u\n" + "j_trans_start_time: \t%li\n" + "j_list_bitmap_index: \t%i\n" + "j_must_wait: \t%i\n" + "j_next_full_flush: \t%i\n" + "j_next_async_flush: \t%i\n" + "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n" + /* reiserfs_proc_info_data_t.journal fields */ + "in_journal: \t%12lu\n" + "in_journal_bitmap: \t%12lu\n" + "in_journal_reusable: \t%12lu\n" + "lock_journal: \t%12lu\n" + "lock_journal_wait: \t%12lu\n" + "journal_begin: \t%12lu\n" + "journal_relock_writers: \t%12lu\n" + "journal_relock_wcount: \t%12lu\n" + "mark_dirty: \t%12lu\n" + "mark_dirty_already: \t%12lu\n" + "mark_dirty_notjournal: \t%12lu\n" + "restore_prepared: \t%12lu\n" + "prepare: \t%12lu\n" + "prepare_retry: \t%12lu\n", + DJP(jp_journal_1st_block), + bdevname(SB_JOURNAL(sb)->j_dev_bd, b), + DJP(jp_journal_dev), + DJP(jp_journal_size), + DJP(jp_journal_trans_max), + DJP(jp_journal_magic), + DJP(jp_journal_max_batch), + SB_JOURNAL(sb)->j_max_commit_age, + DJP(jp_journal_max_trans_age), + JF(j_1st_reserved_block), + JF(j_state), + JF(j_trans_id), + JF(j_mount_id), + JF(j_start), + JF(j_len), + JF(j_len_alloc), + atomic_read(&r->s_journal->j_wcount), + JF(j_bcount), + JF(j_first_unflushed_offset), + JF(j_last_flush_trans_id), + JF(j_trans_start_time), + JF(j_list_bitmap_index), + JF(j_must_wait), + JF(j_next_full_flush), + JF(j_next_async_flush), + JF(j_cnode_used), + JF(j_cnode_free), + SFPJ(in_journal), + SFPJ(in_journal_bitmap), + SFPJ(in_journal_reusable), + SFPJ(lock_journal), + SFPJ(lock_journal_wait), + SFPJ(journal_being), + SFPJ(journal_relock_writers), + SFPJ(journal_relock_wcount), + SFPJ(mark_dirty), + SFPJ(mark_dirty_already), + SFPJ(mark_dirty_notjournal), + SFPJ(restore_prepared), SFPJ(prepare), 
SFPJ(prepare_retry) + ); + return 0; +} + +/* iterator */ +static int test_sb(struct super_block *sb, void *data) +{ + return data == sb; +} + +static int set_sb(struct super_block *sb, void *data) +{ + return -ENOENT; +} + +static void *r_start(struct seq_file *m, loff_t * pos) +{ + struct proc_dir_entry *de = m->private; + struct super_block *s = de->parent->data; + loff_t l = *pos; + + if (l) + return NULL; + + if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, s))) + return NULL; + + up_write(&s->s_umount); + return s; +} + +static void *r_next(struct seq_file *m, void *v, loff_t * pos) +{ + ++*pos; + if (v) + deactivate_super(v); + return NULL; +} + +static void r_stop(struct seq_file *m, void *v) +{ + if (v) + deactivate_super(v); +} + +static int r_show(struct seq_file *m, void *v) +{ + struct proc_dir_entry *de = m->private; + int (*show) (struct seq_file *, struct super_block *) = de->data; + return show(m, v); +} + +static const struct seq_operations r_ops = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = r_show, +}; + +static int r_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &r_ops); + + if (!ret) { + struct seq_file *m = file->private_data; + m->private = PDE(inode); + } + return ret; +} + +static const struct file_operations r_file_operations = { + .open = r_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static struct proc_dir_entry *proc_info_root = NULL; +static const char proc_info_root_name[] = "fs/reiserfs"; + +static void add_file(struct super_block *sb, char *name, + int (*func) (struct seq_file *, struct super_block *)) +{ + proc_create_data(name, 0, REISERFS_SB(sb)->procdir, + &r_file_operations, func); +} + +int reiserfs_proc_info_init(struct super_block *sb) +{ + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + + spin_lock_init(&__PINFO(sb).lock); + REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); + if (REISERFS_SB(sb)->procdir) { + REISERFS_SB(sb)->procdir->data = sb; + add_file(sb, "version", show_version); + add_file(sb, "super", show_super); + add_file(sb, "per-level", show_per_level); + add_file(sb, "bitmap", show_bitmap); + add_file(sb, "on-disk-super", show_on_disk_super); + add_file(sb, "oidmap", show_oidmap); + add_file(sb, "journal", show_journal); + return 0; + } + reiserfs_warning(sb, "cannot create /proc/%s/%s", + proc_info_root_name, b); + return 1; +} + +int reiserfs_proc_info_done(struct super_block *sb) +{ + struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; + char b[BDEVNAME_SIZE]; + char *s; + + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; + + if (de) { + remove_proc_entry("journal", de); + remove_proc_entry("oidmap", de); + remove_proc_entry("on-disk-super", de); + remove_proc_entry("bitmap", de); + remove_proc_entry("per-level", de); + remove_proc_entry("super", de); + remove_proc_entry("version", de); + } + spin_lock(&__PINFO(sb).lock); + __PINFO(sb).exiting = 1; + spin_unlock(&__PINFO(sb).lock); + if (proc_info_root) { + remove_proc_entry(b, proc_info_root); + REISERFS_SB(sb)->procdir = NULL; + } + return 0; +} + +int reiserfs_proc_info_global_init(void) +{ + if (proc_info_root == NULL) { + proc_info_root = proc_mkdir(proc_info_root_name, NULL); + if (!proc_info_root) { + reiserfs_warning(NULL, "cannot create /proc/%s", + 
proc_info_root_name); + return 1; + } + } + return 0; +} + +int reiserfs_proc_info_global_done(void) +{ + if (proc_info_root != NULL) { + proc_info_root = NULL; + remove_proc_entry(proc_info_root_name, NULL); + } + return 0; +} +/* + * Revision 1.1.8.2 2001/07/15 17:08:42 god + * . use get_super() in procfs.c + * . remove remove_save_link() from reiserfs_do_truncate() + * + * I accept terms and conditions stated in the Legal Agreement + * (available at http://www.namesys.com/legalese.html) + * + * Revision 1.1.8.1 2001/07/11 16:48:50 god + * proc info support + * + * I accept terms and conditions stated in the Legal Agreement + * (available at http://www.namesys.com/legalese.html) + * + */ + +/* + * Make Linus happy. + * Local variables: + * c-indentation-style: "K&R" + * mode-name: "LC" + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c new file mode 100644 index 00000000..b3a94d20 --- /dev/null +++ b/fs/reiserfs/resize.c @@ -0,0 +1,212 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Alexander Zarochentcev. + * + * The kernel part of the (on-line) reiserfs resizer. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int reiserfs_resize(struct super_block *s, unsigned long block_count_new) +{ + int err = 0; + struct reiserfs_super_block *sb; + struct reiserfs_bitmap_info *bitmap; + struct reiserfs_bitmap_info *info; + struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s); + struct buffer_head *bh; + struct reiserfs_transaction_handle th; + unsigned int bmap_nr_new, bmap_nr; + unsigned int block_r_new, block_r; + + struct reiserfs_list_bitmap *jb; + struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS]; + + unsigned long int block_count, free_blocks; + int i; + int copy_size; + + sb = SB_DISK_SUPER_BLOCK(s); + + if (SB_BLOCK_COUNT(s) >= block_count_new) { + printk("can\'t shrink filesystem on-line\n"); + return -EINVAL; + } + + /* check the device size */ + bh = sb_bread(s, block_count_new - 1); + if (!bh) { + printk("reiserfs_resize: can\'t read last block\n"); + return -EINVAL; + } + bforget(bh); + + /* old disk layout detection; those partitions can be mounted, but + * cannot be resized */ + if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size + != REISERFS_DISK_OFFSET_IN_BYTES) { + printk + ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); + return -ENOTSUPP; + } + + /* count used bits in last bitmap block */ + block_r = SB_BLOCK_COUNT(s) - + (reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8; + + /* count bitmap blocks in new fs */ + bmap_nr_new = block_count_new / (s->s_blocksize * 8); + block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8; + if (block_r_new) + bmap_nr_new++; + else + block_r_new = s->s_blocksize * 8; + + /* save old values */ + block_count = SB_BLOCK_COUNT(s); + bmap_nr = reiserfs_bmap_count(s); + + /* resizing of reiserfs bitmaps (journal and real), if needed */ + if (bmap_nr_new > bmap_nr) { + /* reallocate journal bitmaps */ + if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { + printk + ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); + return -ENOMEM; + } + /* the new journal bitmaps are zero filled, now we copy in the bitmap + ** node pointers from the old journal bitmap structs, and then + ** transfer the new data structures into the journal struct. 
+ ** + ** using the copy_size var below allows this code to work for + ** both shrinking and expanding the FS. + */ + copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr; + copy_size = + copy_size * sizeof(struct reiserfs_list_bitmap_node *); + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + struct reiserfs_bitmap_node **node_tmp; + jb = SB_JOURNAL(s)->j_list_bitmap + i; + memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); + + /* just in case vfree schedules on us, copy the new + ** pointer into the journal struct before freeing the + ** old one + */ + node_tmp = jb->bitmaps; + jb->bitmaps = jbitmap[i].bitmaps; + vfree(node_tmp); + } + + /* allocate additional bitmap blocks, reallocate array of bitmap + * block pointers */ + bitmap = + vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + if (!bitmap) { + /* Journal bitmaps are still supersized, but the memory isn't + * leaked, so I guess it's ok */ + printk("reiserfs_resize: unable to allocate memory.\n"); + return -ENOMEM; + } + memset(bitmap, 0, + sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + for (i = 0; i < bmap_nr; i++) + bitmap[i] = old_bitmap[i]; + + /* This doesn't go through the journal, but it doesn't have to. + * The changes are still atomic: We're synced up when the journal + * transaction begins, and the new bitmaps don't matter if the + * transaction fails. */ + for (i = bmap_nr; i < bmap_nr_new; i++) { + /* don't use read_bitmap_block since it will cache + * the uninitialized bitmap */ + bh = sb_bread(s, i * s->s_blocksize * 8); + if (!bh) { + vfree(bitmap); + return -EIO; + } + memset(bh->b_data, 0, sb_blocksize(sb)); + reiserfs_test_and_set_le_bit(0, bh->b_data); + reiserfs_cache_bitmap_metadata(s, bh, bitmap + i); + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + reiserfs_write_unlock(s); + sync_dirty_buffer(bh); + reiserfs_write_lock(s); + // update bitmap_info stuff + bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; + brelse(bh); + } + /* free old bitmap blocks array */ + SB_AP_BITMAP(s) = bitmap; + vfree(old_bitmap); + } + + /* begin transaction, if there was an error, it's fine. Yes, we have + * incorrect bitmaps now, but none of it is ever going to touch the + * disk anyway. 
*/ + err = journal_begin(&th, s, 10); + if (err) + return err; + + /* Extend old last bitmap block - new blocks have been made available */ + info = SB_AP_BITMAP(s) + bmap_nr - 1; + bh = reiserfs_read_bitmap_block(s, bmap_nr - 1); + if (!bh) { + int jerr = journal_end(&th, s, 10); + if (jerr) + return jerr; + return -EIO; + } + + reiserfs_prepare_for_journal(s, bh, 1); + for (i = block_r; i < s->s_blocksize * 8; i++) + reiserfs_test_and_clear_le_bit(i, bh->b_data); + info->free_count += s->s_blocksize * 8 - block_r; + + journal_mark_dirty(&th, s, bh); + brelse(bh); + + /* Correct new last bitmap block - It may not be full */ + info = SB_AP_BITMAP(s) + bmap_nr_new - 1; + bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1); + if (!bh) { + int jerr = journal_end(&th, s, 10); + if (jerr) + return jerr; + return -EIO; + } + + reiserfs_prepare_for_journal(s, bh, 1); + for (i = block_r_new; i < s->s_blocksize * 8; i++) + reiserfs_test_and_set_le_bit(i, bh->b_data); + journal_mark_dirty(&th, s, bh); + brelse(bh); + + info->free_count -= s->s_blocksize * 8 - block_r_new; + /* update super */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + free_blocks = SB_FREE_BLOCKS(s); + PUT_SB_FREE_BLOCKS(s, + free_blocks + (block_count_new - block_count - + (bmap_nr_new - bmap_nr))); + PUT_SB_BLOCK_COUNT(s, block_count_new); + PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); + s->s_dirt = 1; + + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + + SB_JOURNAL(s)->j_must_wait = 1; + return journal_end(&th, s, 10); +} diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c new file mode 100644 index 00000000..313d39d6 --- /dev/null +++ b/fs/reiserfs/stree.c @@ -0,0 +1,2120 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Anatoly P. Pinchuk pap@namesys.botik.ru + * Programm System Institute + * Pereslavl-Zalessky Russia + */ + +/* + * This file contains functions dealing with S+tree + * + * B_IS_IN_TREE + * copy_item_head + * comp_short_keys + * comp_keys + * comp_short_le_keys + * le_key2cpu_key + * comp_le_keys + * bin_search + * get_lkey + * get_rkey + * key_in_buffer + * decrement_bcount + * reiserfs_check_path + * pathrelse_and_restore + * pathrelse + * search_by_key_reada + * search_by_key + * search_for_position_by_key + * comp_items + * prepare_for_direct_item + * prepare_for_direntry_item + * prepare_for_delete_or_cut + * calc_deleted_bytes_number + * init_tb_struct + * padd_item + * reiserfs_delete_item + * reiserfs_delete_solid_item + * reiserfs_delete_object + * maybe_indirect_to_direct + * indirect_to_direct_roll_back + * reiserfs_cut_from_item + * truncate_directory + * reiserfs_do_truncate + * reiserfs_paste_into_item + * reiserfs_insert_item + */ + +#include +#include +#include +#include +#include +#include + +/* Does the buffer contain a disk block which is in the tree. */ +inline int B_IS_IN_TREE(const struct buffer_head *bh) +{ + + RFALSE(B_LEVEL(bh) > MAX_HEIGHT, + "PAP-1010: block (%b) has too big level (%z)", bh, bh); + + return (B_LEVEL(bh) != FREE_LEVEL); +} + +// +// to gets item head in le form +// +inline void copy_item_head(struct item_head *to, + const struct item_head *from) +{ + memcpy(to, from, IH_SIZE); +} + +/* k1 is pointer to on-disk structure which is stored in little-endian + form. k2 is pointer to cpu variable. For key of items of the same + object this returns 0. 
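   For illustration, with made-up key values:
       le_key  = { k_dir_id 5, k_objectid 17, offset 4097, type TYPE_DIRECT    }
       cpu_key = { k_dir_id 5, k_objectid 17, offset 1,    type TYPE_STAT_DATA }
   comp_short_keys() looks only at k_dir_id and k_objectid and so returns 0
   (same object); comp_keys() below would go on to compare the offsets and,
   for 4-component cpu keys, the types as well.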
+ Returns: -1 if key1 < key2 + 0 if key1 == key2 + 1 if key1 > key2 */ +inline int comp_short_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) +{ + __u32 n; + n = le32_to_cpu(le_key->k_dir_id); + if (n < cpu_key->on_disk_key.k_dir_id) + return -1; + if (n > cpu_key->on_disk_key.k_dir_id) + return 1; + n = le32_to_cpu(le_key->k_objectid); + if (n < cpu_key->on_disk_key.k_objectid) + return -1; + if (n > cpu_key->on_disk_key.k_objectid) + return 1; + return 0; +} + +/* k1 is pointer to on-disk structure which is stored in little-endian + form. k2 is pointer to cpu variable. + Compare keys using all 4 key fields. + Returns: -1 if key1 < key2 0 + if key1 = key2 1 if key1 > key2 */ +static inline int comp_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) +{ + int retval; + + retval = comp_short_keys(le_key, cpu_key); + if (retval) + return retval; + if (le_key_k_offset(le_key_version(le_key), le_key) < + cpu_key_k_offset(cpu_key)) + return -1; + if (le_key_k_offset(le_key_version(le_key), le_key) > + cpu_key_k_offset(cpu_key)) + return 1; + + if (cpu_key->key_length == 3) + return 0; + + /* this part is needed only when tail conversion is in progress */ + if (le_key_k_type(le_key_version(le_key), le_key) < + cpu_key_k_type(cpu_key)) + return -1; + + if (le_key_k_type(le_key_version(le_key), le_key) > + cpu_key_k_type(cpu_key)) + return 1; + + return 0; +} + +inline int comp_short_le_keys(const struct reiserfs_key *key1, + const struct reiserfs_key *key2) +{ + __u32 *k1_u32, *k2_u32; + int key_length = REISERFS_SHORT_KEY_LEN; + + k1_u32 = (__u32 *) key1; + k2_u32 = (__u32 *) key2; + for (; key_length--; ++k1_u32, ++k2_u32) { + if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32)) + return -1; + if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32)) + return 1; + } + return 0; +} + +inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from) +{ + int version; + to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id); + to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid); + + // find out version of the key + version = le_key_version(from); + to->version = version; + to->on_disk_key.k_offset = le_key_k_offset(version, from); + to->on_disk_key.k_type = le_key_k_type(version, from); +} + +// this does not say which one is bigger, it only returns 1 if keys +// are not equal, 0 otherwise +inline int comp_le_keys(const struct reiserfs_key *k1, + const struct reiserfs_key *k2) +{ + return memcmp(k1, k2, sizeof(struct reiserfs_key)); +} + +/************************************************************************** + * Binary search toolkit function * + * Search for an item in the array by the item key * + * Returns: 1 if found, 0 if not found; * + * *pos = number of the searched element if found, else the * + * number of the first element that is larger than key. * + **************************************************************************/ +/* For those not familiar with binary search: lbound is the leftmost item that it + could be, rbound the rightmost item that it could be. We examine the item + halfway between lbound and rbound, and that tells us either that we can increase + lbound, or decrease rbound, or that we have found it, or if lbound <= rbound that + there are no possible items, and we have not found it. With each examination we + cut the number of possible items it could be by one more than half rounded down, + or we find it. */ +static inline int bin_search(const void *key, /* Key to search for. 
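   (Illustrative example of the convention described above, with made-up
   keys: searching for key 25 in an array of item keys { 10, 20, 30, 40 }
   returns ITEM_NOT_FOUND with *pos == 2 -- the slot of the first key
   larger than the one searched for, which is also where an insert would
   keep the array sorted -- while searching for 30 returns ITEM_FOUND with
   *pos == 2.)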
*/ + const void *base, /* First item in the array. */ + int num, /* Number of items in the array. */ + int width, /* Item size in the array. + searched. Lest the reader be + confused, note that this is crafted + as a general function, and when it + is applied specifically to the array + of item headers in a node, width + is actually the item header size not + the item size. */ + int *pos /* Number of the searched for element. */ + ) +{ + int rbound, lbound, j; + + for (j = ((rbound = num - 1) + (lbound = 0)) / 2; + lbound <= rbound; j = (rbound + lbound) / 2) + switch (comp_keys + ((struct reiserfs_key *)((char *)base + j * width), + (struct cpu_key *)key)) { + case -1: + lbound = j + 1; + continue; + case 1: + rbound = j - 1; + continue; + case 0: + *pos = j; + return ITEM_FOUND; /* Key found in the array. */ + } + + /* bin_search did not find given key, it returns position of key, + that is minimal and greater than the given one. */ + *pos = lbound; + return ITEM_NOT_FOUND; +} + + +/* Minimal possible key. It is never in the tree. */ +const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; + +/* Maximal possible key. It is never in the tree. */ +static const struct reiserfs_key MAX_KEY = { + __constant_cpu_to_le32(0xffffffff), + __constant_cpu_to_le32(0xffffffff), + {{__constant_cpu_to_le32(0xffffffff), + __constant_cpu_to_le32(0xffffffff)},} +}; + +/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom + of the path, and going upwards. We must check the path's validity at each step. If the key is not in + the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this + case we return a special key, either MIN_KEY or MAX_KEY. */ +static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path, + const struct super_block *sb) +{ + int position, path_offset = chk_path->path_length; + struct buffer_head *parent; + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5010: invalid offset in the path"); + + /* While not higher in path than first element. */ + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(chk_path, path_offset)), + "PAP-5020: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if (!B_IS_IN_TREE + (parent = + PATH_OFFSET_PBUFFER(chk_path, path_offset))) + return &MAX_KEY; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(chk_path, + path_offset)) > + B_NR_ITEMS(parent)) + return &MAX_KEY; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(chk_path, + path_offset + 1)->b_blocknr) + return &MAX_KEY; + /* Return delimiting key if position in the parent is not equal to zero. */ + if (position) + return B_N_PDELIM_KEY(parent, position - 1); + } + /* Return MIN_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(sb)) + return &MIN_KEY; + return &MAX_KEY; +} + +/* Get delimiting key of the buffer at the path and its right neighbor. 
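   For illustration (a hypothetical parent node): if a parent holds keys
   [ K1, K2 ] and child pointers [ C0, C1, C2 ], then for child C1 the left
   delimiting key is K1 and the right delimiting key is K2; for C0 there is
   no left delimiting key in that parent, so get_lkey() keeps walking up
   the path and falls back to MIN_KEY at the root, just as get_rkey() below
   falls back to MAX_KEY for the rightmost child.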
*/ +inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, + const struct super_block *sb) +{ + int position, path_offset = chk_path->path_length; + struct buffer_head *parent; + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5030: invalid offset in the path"); + + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(chk_path, path_offset)), + "PAP-5040: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if (!B_IS_IN_TREE + (parent = + PATH_OFFSET_PBUFFER(chk_path, path_offset))) + return &MIN_KEY; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(chk_path, + path_offset)) > + B_NR_ITEMS(parent)) + return &MIN_KEY; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(chk_path, + path_offset + 1)->b_blocknr) + return &MIN_KEY; + /* Return delimiting key if position in the parent is not the last one. */ + if (position != B_NR_ITEMS(parent)) + return B_N_PDELIM_KEY(parent, position); + } + /* Return MAX_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(sb)) + return &MAX_KEY; + return &MIN_KEY; +} + +/* Check whether a key is contained in the tree rooted from a buffer at a path. */ +/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in + the path. These delimiting keys are stored at least one level above that buffer in the tree. If the + buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in + this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ +static inline int key_in_buffer(struct treepath *chk_path, /* Path which should be checked. */ + const struct cpu_key *key, /* Key which should be checked. */ + struct super_block *sb + ) +{ + + RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET + || chk_path->path_length > MAX_HEIGHT, + "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", + key, chk_path->path_length); + RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev, + "PAP-5060: device must not be NODEV"); + + if (comp_keys(get_lkey(chk_path, sb), key) == 1) + /* left delimiting key is bigger, that the key we look for */ + return 0; + /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */ + if (comp_keys(get_rkey(chk_path, sb), key) != 1) + /* key must be less than right delimitiing key */ + return 0; + return 1; +} + +int reiserfs_check_path(struct treepath *p) +{ + RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, + "path not properly relsed"); + return 0; +} + +/* Drop the reference to each buffer in a path and restore + * dirty bits clean when preparing the buffer for the log. 
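   (For context, the usual lifetime of a path -- the same pattern used by
   reiserfs_delete_solid_item() further down -- looks roughly like:

        INITIALIZE_PATH(path);
        retval = search_item(sb, &cpu_key, &path);
        ... use PATH_PITEM_HEAD(&path), PATH_PLAST_BUFFER(&path) ...
        pathrelse(&path);

   every buffer referenced while descending the tree is released again by
   pathrelse().)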
+ * This version should only be called from fix_nodes() */ +void pathrelse_and_restore(struct super_block *sb, + struct treepath *search_path) +{ + int path_offset = search_path->path_length; + + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "clm-4000: invalid path offset"); + + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) { + struct buffer_head *bh; + bh = PATH_OFFSET_PBUFFER(search_path, path_offset--); + reiserfs_restore_prepared_buffer(sb, bh); + brelse(bh); + } + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +/* Drop the reference to each buffer in a path */ +void pathrelse(struct treepath *search_path) +{ + int path_offset = search_path->path_length; + + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "PAP-5090: invalid path offset"); + + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) + brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--)); + + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) +{ + struct block_head *blkh; + struct item_head *ih; + int used_space; + int prev_location; + int i; + int nr; + + blkh = (struct block_head *)buf; + if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { + reiserfs_warning(NULL, "reiserfs-5080", + "this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { + /* item number is too big or too small */ + reiserfs_warning(NULL, "reiserfs-5081", + "nr_item seems wrong: %z", bh); + return 0; + } + ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; + used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); + if (used_space != blocksize - blkh_free_space(blkh)) { + /* free space does not match to calculated amount of use space */ + reiserfs_warning(NULL, "reiserfs-5082", + "free space seems wrong: %z", bh); + return 0; + } + // FIXME: it is_leaf will hit performance too much - we may have + // return 1 here + + /* check tables of item heads */ + ih = (struct item_head *)(buf + BLKH_SIZE); + prev_location = blocksize; + for (i = 0; i < nr; i++, ih++) { + if (le_ih_k_type(ih) == TYPE_ANY) { + reiserfs_warning(NULL, "reiserfs-5083", + "wrong item type for item %h", + ih); + return 0; + } + if (ih_location(ih) >= blocksize + || ih_location(ih) < IH_SIZE * nr) { + reiserfs_warning(NULL, "reiserfs-5084", + "item location seems wrong: %h", + ih); + return 0; + } + if (ih_item_len(ih) < 1 + || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) { + reiserfs_warning(NULL, "reiserfs-5085", + "item length seems wrong: %h", + ih); + return 0; + } + if (prev_location - ih_location(ih) != ih_item_len(ih)) { + reiserfs_warning(NULL, "reiserfs-5086", + "item location seems wrong " + "(second one): %h", ih); + return 0; + } + prev_location = ih_location(ih); + } + + // one may imagine much more checks + return 1; +} + +/* returns 1 if buf looks like an internal node, 0 otherwise */ +static int is_internal(char *buf, int blocksize, struct buffer_head *bh) +{ + struct block_head *blkh; + int nr; + int used_space; + + blkh = (struct block_head *)buf; + nr = blkh_level(blkh); + if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { + /* this level is not possible for internal nodes */ + reiserfs_warning(NULL, "reiserfs-5087", + "this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { + /* for internal which is not root we might check min number of keys */ + 
reiserfs_warning(NULL, "reiserfs-5088", + "number of key seems wrong: %z", bh); + return 0; + } + + used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); + if (used_space != blocksize - blkh_free_space(blkh)) { + reiserfs_warning(NULL, "reiserfs-5089", + "free space seems wrong: %z", bh); + return 0; + } + // one may imagine much more checks + return 1; +} + +// make sure that bh contains formatted node of reiserfs tree of +// 'level'-th level +static int is_tree_node(struct buffer_head *bh, int level) +{ + if (B_LEVEL(bh) != level) { + reiserfs_warning(NULL, "reiserfs-5090", "node level %d does " + "not match to the expected one %d", + B_LEVEL(bh), level); + return 0; + } + if (level == DISK_LEAF_NODE_LEVEL) + return is_leaf(bh->b_data, bh->b_size, bh); + + return is_internal(bh->b_data, bh->b_size, bh); +} + +#define SEARCH_BY_KEY_READA 16 + +/* + * The function is NOT SCHEDULE-SAFE! + * It might unlock the write lock if we needed to wait for a block + * to be read. Note that in this case it won't recover the lock to avoid + * high contention resulting from too much lock requests, especially + * the caller (search_by_key) will perform other schedule-unsafe + * operations just after calling this function. + * + * @return true if we have unlocked + */ +static bool search_by_key_reada(struct super_block *s, + struct buffer_head **bh, + b_blocknr_t *b, int num) +{ + int i, j; + bool unlocked = false; + + for (i = 0; i < num; i++) { + bh[i] = sb_getblk(s, b[i]); + } + /* + * We are going to read some blocks on which we + * have a reference. It's safe, though we might be + * reading blocks concurrently changed if we release + * the lock. But it's still fine because we check later + * if the tree changed + */ + for (j = 0; j < i; j++) { + /* + * note, this needs attention if we are getting rid of the BKL + * you have to make sure the prepared bit isn't set on this buffer + */ + if (!buffer_uptodate(bh[j])) { + if (!unlocked) { + reiserfs_write_unlock(s); + unlocked = true; + } + ll_rw_block(READA, 1, bh + j); + } + brelse(bh[j]); + } + return unlocked; +} + +/************************************************************************** + * Algorithm SearchByKey * + * look for item in the Disk S+Tree by its key * + * Input: sb - super block * + * key - pointer to the key to search * + * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * + * search_path - path from the root to the needed leaf * + **************************************************************************/ + +/* This function fills up the path from the root to the leaf as it + descends the tree looking for the key. It uses reiserfs_bread to + try to find buffers in the cache given their block number. If it + does not find them in the cache it reads them from disk. For each + node search_by_key finds using reiserfs_bread it then uses + bin_search to look through that node. bin_search will find the + position of the block_number of the next node if it is looking + through an internal node. If it is looking through a leaf node + bin_search will find the position of the item which has key either + equal to given key, or which is the maximal key less than the given + key. search_by_key returns a path that must be checked for the + correctness of the top of the path but need not be checked for the + correctness of the bottom of the path */ +/* The function is NOT SCHEDULE-SAFE! */ +int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to search. 
*/ + struct treepath *search_path,/* This structure was + allocated and initialized + by the calling + function. It is filled up + by this function. */ + int stop_level /* How far down the tree to search. To + stop at leaf level - set to + DISK_LEAF_NODE_LEVEL */ + ) +{ + b_blocknr_t block_number; + int expected_level; + struct buffer_head *bh; + struct path_element *last_element; + int node_level, retval; + int right_neighbor_of_leaf_node; + int fs_gen; + struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; + b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; + int reada_count = 0; + +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + + PROC_INFO_INC(sb, search_by_key); + + /* As we add each node to a path we increase its count. This means that + we must be careful to release all nodes in a path before we either + discard the path struct or re-use the path struct, as we do here. */ + + pathrelse(search_path); + + right_neighbor_of_leaf_node = 0; + + /* With each iteration of this loop we search through the items in the + current node, and calculate the next current node(next path element) + for the next iteration of this loop.. */ + block_number = SB_ROOT_BLOCK(sb); + expected_level = -1; + while (1) { + +#ifdef CONFIG_REISERFS_CHECK + if (!(++repeat_counter % 50000)) + reiserfs_warning(sb, "PAP-5100", + "%s: there were %d iterations of " + "while loop looking for key %K", + current->comm, repeat_counter, + key); +#endif + + /* prep path to have another element added to it. */ + last_element = + PATH_OFFSET_PELEMENT(search_path, + ++search_path->path_length); + fs_gen = get_generation(sb); + + /* Read the next tree node, and set the last element in the path to + have a pointer to it. */ + if ((bh = last_element->pe_buffer = + sb_getblk(sb, block_number))) { + bool unlocked = false; + + if (!buffer_uptodate(bh) && reada_count > 1) + /* may unlock the write lock */ + unlocked = search_by_key_reada(sb, reada_bh, + reada_blocks, reada_count); + /* + * If we haven't already unlocked the write lock, + * then we need to do that here before reading + * the current block + */ + if (!buffer_uptodate(bh) && !unlocked) { + reiserfs_write_unlock(sb); + unlocked = true; + } + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + + if (unlocked) + reiserfs_write_lock(sb); + if (!buffer_uptodate(bh)) + goto io_error; + } else { + io_error: + search_path->path_length--; + pathrelse(search_path); + return IO_ERROR; + } + reada_count = 0; + if (expected_level == -1) + expected_level = SB_TREE_HEIGHT(sb); + expected_level--; + + /* It is possible that schedule occurred. We must check whether the key + to search is still in the tree rooted from the current buffer. If + not then repeat search from the root. */ + if (fs_changed(fs_gen, sb) && + (!B_IS_IN_TREE(bh) || + B_LEVEL(bh) != expected_level || + !key_in_buffer(search_path, key, sb))) { + PROC_INFO_INC(sb, search_by_key_fs_changed); + PROC_INFO_INC(sb, search_by_key_restarted); + PROC_INFO_INC(sb, + sbk_restarted[expected_level - 1]); + pathrelse(search_path); + + /* Get the root block number so that we can repeat the search + starting from the root. */ + block_number = SB_ROOT_BLOCK(sb); + expected_level = -1; + right_neighbor_of_leaf_node = 0; + + /* repeat search from the root */ + continue; + } + + /* only check that the key is in the buffer if key is not + equal to the MAX_KEY. Latter case is only possible in + "finish_unfinished()" processing during mount. 
*/ + RFALSE(comp_keys(&MAX_KEY, key) && + !key_in_buffer(search_path, key, sb), + "PAP-5130: key is not in the buffer"); +#ifdef CONFIG_REISERFS_CHECK + if (REISERFS_SB(sb)->cur_tb) { + print_cur_tb("5140"); + reiserfs_panic(sb, "PAP-5140", + "schedule occurred in do_balance!"); + } +#endif + + // make sure, that the node contents look like a node of + // certain level + if (!is_tree_node(bh, expected_level)) { + reiserfs_error(sb, "vs-5150", + "invalid format found in block %ld. " + "Fsck?", bh->b_blocknr); + pathrelse(search_path); + return IO_ERROR; + } + + /* ok, we have acquired next formatted node in the tree */ + node_level = B_LEVEL(bh); + + PROC_INFO_BH_STAT(sb, bh, node_level - 1); + + RFALSE(node_level < stop_level, + "vs-5152: tree level (%d) is less than stop level (%d)", + node_level, stop_level); + + retval = bin_search(key, B_N_PITEM_HEAD(bh, 0), + B_NR_ITEMS(bh), + (node_level == + DISK_LEAF_NODE_LEVEL) ? IH_SIZE : + KEY_SIZE, + &(last_element->pe_position)); + if (node_level == stop_level) { + return retval; + } + + /* we are not in the stop level */ + if (retval == ITEM_FOUND) + /* item has been found, so we choose the pointer which is to the right of the found one */ + last_element->pe_position++; + + /* if item was not found we choose the position which is to + the left of the found item. This requires no code, + bin_search did it already. */ + + /* So we have chosen a position in the current node which is + an internal node. Now we calculate child block number by + position in the node. */ + block_number = + B_N_CHILD_NUM(bh, last_element->pe_position); + + /* if we are going to read leaf nodes, try for read ahead as well */ + if ((search_path->reada & PATH_READA) && + node_level == DISK_LEAF_NODE_LEVEL + 1) { + int pos = last_element->pe_position; + int limit = B_NR_ITEMS(bh); + struct reiserfs_key *le_key; + + if (search_path->reada & PATH_READA_BACK) + limit = 0; + while (reada_count < SEARCH_BY_KEY_READA) { + if (pos == limit) + break; + reada_blocks[reada_count++] = + B_N_CHILD_NUM(bh, pos); + if (search_path->reada & PATH_READA_BACK) + pos--; + else + pos++; + + /* + * check to make sure we're in the same object + */ + le_key = B_N_PDELIM_KEY(bh, pos); + if (le32_to_cpu(le_key->k_objectid) != + key->on_disk_key.k_objectid) { + break; + } + } + } + } +} + +/* Form the path to an item and position in this item which contains + file byte defined by key. If there is no such item + corresponding to the key, we point the path to the item with + maximal key less than key, and *pos_in_item is set to one + past the last entry/byte in the item. If searching for entry in a + directory item, and it is not found, *pos_in_item is set to one + entry more than the entry with maximal key which is less than the + sought key. + + Note that if there is no entry in this same node which is one more, + then we point to an imaginary entry. for direct items, the + position is in units of bytes, for indirect items the position is + in units of blocknr entries, for directory items the position is in + units of directory entries. */ + +/* The function is NOT SCHEDULE-SAFE! */ +int search_for_position_by_key(struct super_block *sb, /* Pointer to the super block. */ + const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */ + struct treepath *search_path /* Filled up by this function. 
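   (Illustrative example, offsets assumed: with a 4096-byte block size, an
   indirect item whose key offset is 4097 and which holds, say, four
   unformatted-node pointers covers file bytes 4097..20480, one pointer per
   4096 bytes.  Looking up key offset 12289 inside that item yields
   POSITION_FOUND with
        pos_in_item = (12289 - 4097) / 4096 = 2,
   i.e. the third pointer; for a direct item the division is skipped and
   pos_in_item stays in bytes.)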
*/ + ) +{ + struct item_head *p_le_ih; /* pointer to on-disk structure */ + int blk_size; + loff_t item_offset, offset; + struct reiserfs_dir_entry de; + int retval; + + /* If searching for directory entry. */ + if (is_direntry_cpu_key(p_cpu_key)) + return search_by_entry_key(sb, p_cpu_key, search_path, + &de); + + /* If not searching for directory entry. */ + + /* If item is found. */ + retval = search_item(sb, p_cpu_key, search_path); + if (retval == IO_ERROR) + return retval; + if (retval == ITEM_FOUND) { + + RFALSE(!ih_item_len + (B_N_PITEM_HEAD + (PATH_PLAST_BUFFER(search_path), + PATH_LAST_POSITION(search_path))), + "PAP-5165: item length equals zero"); + + pos_in_item(search_path) = 0; + return POSITION_FOUND; + } + + RFALSE(!PATH_LAST_POSITION(search_path), + "PAP-5170: position equals zero"); + + /* Item is not found. Set path to the previous item. */ + p_le_ih = + B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path), + --PATH_LAST_POSITION(search_path)); + blk_size = sb->s_blocksize; + + if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) { + return FILE_NOT_FOUND; + } + // FIXME: quite ugly this far + + item_offset = le_ih_k_offset(p_le_ih); + offset = cpu_key_k_offset(p_cpu_key); + + /* Needed byte is contained in the item pointed to by the path. */ + if (item_offset <= offset && + item_offset + op_bytes_number(p_le_ih, blk_size) > offset) { + pos_in_item(search_path) = offset - item_offset; + if (is_indirect_le_ih(p_le_ih)) { + pos_in_item(search_path) /= blk_size; + } + return POSITION_FOUND; + } + + /* Needed byte is not contained in the item pointed to by the + path. Set pos_in_item out of the item. */ + if (is_indirect_le_ih(p_le_ih)) + pos_in_item(search_path) = + ih_item_len(p_le_ih) / UNFM_P_SIZE; + else + pos_in_item(search_path) = ih_item_len(p_le_ih); + + return POSITION_NOT_FOUND; +} + +/* Compare given item and item pointed to by the path. */ +int comp_items(const struct item_head *stored_ih, const struct treepath *path) +{ + struct buffer_head *bh = PATH_PLAST_BUFFER(path); + struct item_head *ih; + + /* Last buffer at the path is not in the tree. */ + if (!B_IS_IN_TREE(bh)) + return 1; + + /* Last path position is invalid. */ + if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh)) + return 1; + + /* we need only to know, whether it is the same item */ + ih = get_ih(path); + return memcmp(stored_ih, ih, IH_SIZE); +} + +/* unformatted nodes are not logged anymore, ever. This is safe +** now +*/ +#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) + +// block can not be forgotten as it is in I/O or held by someone +#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) + +// prepare for delete or cut of direct item +static inline int prepare_for_direct_item(struct treepath *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, int *cut_size) +{ + loff_t round_len; + + if (new_file_length == max_reiserfs_offset(inode)) { + /* item has to be deleted */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + // new file gets truncated + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { + // + round_len = ROUND_UP(new_file_length); + /* this was new_file_length < le_ih ... */ + if (round_len < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. 
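   (Worked example with assumed numbers; this also assumes ROUND_UP() pads
   the new length up to the 8-byte granularity used for v3.6 direct items:
   a direct item with key offset 4097 and ih_item_len 200 holds file bytes
   4097..4296.  Truncating to new_file_length 4196 gives round_len 4200,
   so pos_in_item becomes 4200 - 4096 = 104 and cut_size becomes
   -(200 - 104) = -96, i.e. the last 96 bytes are cut and the first 104
   are kept.)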
*/ + pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1); + *cut_size = -(ih_item_len(le_ih) - pos_in_item(path)); + + return M_CUT; /* Cut from this item. */ + } + + // old file: items may have any length + + if (new_file_length < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. */ + *cut_size = -(ih_item_len(le_ih) - + (pos_in_item(path) = + new_file_length + 1 - le_ih_k_offset(le_ih))); + return M_CUT; /* Cut from this item. */ +} + +static inline int prepare_for_direntry_item(struct treepath *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, + int *cut_size) +{ + if (le_ih_k_offset(le_ih) == DOT_OFFSET && + new_file_length == max_reiserfs_offset(inode)) { + RFALSE(ih_entry_count(le_ih) != 2, + "PAP-5220: incorrect empty directory item (%h)", le_ih); + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete the directory item containing "." and ".." entry. */ + } + + if (ih_entry_count(le_ih) == 1) { + /* Delete the directory item such as there is one record only + in this item */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + + /* Cut one record from the directory item. */ + *cut_size = + -(DEH_SIZE + + entry_length(get_last_bh(path), le_ih, pos_in_item(path))); + return M_CUT; +} + +#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1) + +/* If the path points to a directory or direct item, calculate mode and the size cut, for balance. + If the path points to an indirect item, remove some number of its unformatted nodes. + In case of file truncate calculate whether this item must be deleted/truncated or last + unformatted node of this item will be converted to a direct item. + This function returns a determination of what balance mode the calling function should employ. */ +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed, /* Number of unformatted nodes which were removed + from end of the file. */ + int *cut_size, unsigned long long new_file_length /* MAX_KEY_OFFSET in case of delete. */ + ) +{ + struct super_block *sb = inode->i_sb; + struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + struct buffer_head *bh = PATH_PLAST_BUFFER(path); + + BUG_ON(!th->t_trans_id); + + /* Stat_data item. */ + if (is_statdata_le_ih(p_le_ih)) { + + RFALSE(new_file_length != max_reiserfs_offset(inode), + "PAP-5210: mode must be M_DELETE"); + + *cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); + return M_DELETE; + } + + /* Directory item. */ + if (is_direntry_le_ih(p_le_ih)) + return prepare_for_direntry_item(path, p_le_ih, inode, + new_file_length, + cut_size); + + /* Direct item. */ + if (is_direct_le_ih(p_le_ih)) + return prepare_for_direct_item(path, p_le_ih, inode, + new_file_length, cut_size); + + /* Case of an indirect item. 
*/ + { + int blk_size = sb->s_blocksize; + struct item_head s_ih; + int need_re_search; + int delete = 0; + int result = M_CUT; + int pos = 0; + + if ( new_file_length == max_reiserfs_offset (inode) ) { + /* prepare_for_delete_or_cut() is called by + * reiserfs_delete_item() */ + new_file_length = 0; + delete = 1; + } + + do { + need_re_search = 0; + *cut_size = 0; + bh = PATH_PLAST_BUFFER(path); + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + pos = I_UNFM_NUM(&s_ih); + + while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) { + __le32 *unfm; + __u32 block; + + /* Each unformatted block deletion may involve one additional + * bitmap block into the transaction, thereby the initial + * journal space reservation might not be enough. */ + if (!delete && (*cut_size) != 0 && + reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) + break; + + unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1; + block = get_block_num(unfm, 0); + + if (block != 0) { + reiserfs_prepare_for_journal(sb, bh, 1); + put_block_num(unfm, 0, 0); + journal_mark_dirty(th, sb, bh); + reiserfs_free_block(th, inode, block, 1); + } + + reiserfs_write_unlock(sb); + cond_resched(); + reiserfs_write_lock(sb); + + if (item_moved (&s_ih, path)) { + need_re_search = 1; + break; + } + + pos --; + (*removed)++; + (*cut_size) -= UNFM_P_SIZE; + + if (pos == 0) { + (*cut_size) -= IH_SIZE; + result = M_DELETE; + break; + } + } + /* a trick. If the buffer has been logged, this will do nothing. If + ** we've broken the loop without logging it, it will restore the + ** buffer */ + reiserfs_restore_prepared_buffer(sb, bh); + } while (need_re_search && + search_for_position_by_key(sb, item_key, path) == POSITION_FOUND); + pos_in_item(path) = pos * UNFM_P_SIZE; + + if (*cut_size == 0) { + /* Nothing were cut. maybe convert last unformatted node to the + * direct item? */ + result = M_CONVERT; + } + return result; + } +} + +/* Calculate number of bytes which will be deleted or cut during balance */ +static int calc_deleted_bytes_number(struct tree_balance *tb, char mode) +{ + int del_size; + struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path); + + if (is_statdata_le_ih(p_le_ih)) + return 0; + + del_size = + (mode == + M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0]; + if (is_direntry_le_ih(p_le_ih)) { + /* return EMPTY_DIR_SIZE; We delete emty directoris only. + * we can't use EMPTY_DIR_SIZE, as old format dirs have a different + * empty size. ick. FIXME, is this right? 
*/ + return del_size; + } + + if (is_indirect_le_ih(p_le_ih)) + del_size = (del_size / UNFM_P_SIZE) * + (PATH_PLAST_BUFFER(tb->tb_path)->b_size); + return del_size; +} + +static void init_tb_struct(struct reiserfs_transaction_handle *th, + struct tree_balance *tb, + struct super_block *sb, + struct treepath *path, int size) +{ + + BUG_ON(!th->t_trans_id); + + memset(tb, '\0', sizeof(struct tree_balance)); + tb->transaction_handle = th; + tb->tb_sb = sb; + tb->tb_path = path; + PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; + PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; + tb->insert_size[0] = size; +} + +void padd_item(char *item, int total_length, int length) +{ + int i; + + for (i = total_length; i > length;) + item[--i] = 0; +} + +#ifdef REISERQUOTA_DEBUG +char key2type(struct reiserfs_key *ih) +{ + if (is_direntry_le_key(2, ih)) + return 'd'; + if (is_direct_le_key(2, ih)) + return 'D'; + if (is_indirect_le_key(2, ih)) + return 'i'; + if (is_statdata_le_key(2, ih)) + return 's'; + return 'u'; +} + +char head2type(struct item_head *ih) +{ + if (is_direntry_le_ih(ih)) + return 'd'; + if (is_direct_le_ih(ih)) + return 'D'; + if (is_indirect_le_ih(ih)) + return 'i'; + if (is_statdata_le_ih(ih)) + return 's'; + return 'u'; +} +#endif + +/* Delete object item. + * th - active transaction handle + * path - path to the deleted item + * item_key - key to search for the deleted item + * indode - used for updating i_blocks and quotas + * un_bh - NULL or unformatted node pointer + */ +int reiserfs_delete_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *item_key, + struct inode *inode, struct buffer_head *un_bh) +{ + struct super_block *sb = inode->i_sb; + struct tree_balance s_del_balance; + struct item_head s_ih; + struct item_head *q_ih; + int quota_cut_bytes; + int ret_value, del_size, removed; + +#ifdef CONFIG_REISERFS_CHECK + char mode; + int iter = 0; +#endif + + BUG_ON(!th->t_trans_id); + + init_tb_struct(th, &s_del_balance, sb, path, + 0 /*size is unknown */ ); + + while (1) { + removed = 0; + +#ifdef CONFIG_REISERFS_CHECK + iter++; + mode = +#endif + prepare_for_delete_or_cut(th, inode, path, + item_key, &removed, + &del_size, + max_reiserfs_offset(inode)); + + RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); + + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + s_del_balance.insert_size[0] = del_size; + + ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); + if (ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(sb, delete_item_restarted); + + // file system changed, repeat search + ret_value = + search_for_position_by_key(sb, item_key, path); + if (ret_value == IO_ERROR) + break; + if (ret_value == FILE_NOT_FOUND) { + reiserfs_warning(sb, "vs-5340", + "no items of the file %K found", + item_key); + break; + } + } /* while (1) */ + + if (ret_value != CARRY_ON) { + unfix_nodes(&s_del_balance); + return 0; + } + // reiserfs_delete_item returns item length when success + ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); + q_ih = get_ih(path); + quota_cut_bytes = ih_item_len(q_ih); + + /* hack so the quota code doesn't have to guess if the file + ** has a tail. On tail insert, we allocate quota for 1 unformatted node. 
+ ** We test the offset because the tail might have been + ** split into multiple items, and we only want to decrement for + ** the unfm node once + */ + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) { + if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) { + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } + } + + if (un_bh) { + int off; + char *data; + + /* We are in direct2indirect conversion, so move tail contents + to the unformatted node */ + /* note, we do the copy before preparing the buffer because we + ** don't care about the contents of the unformatted node yet. + ** the only thing we really care about is the direct item's data + ** is in the unformatted node. + ** + ** Otherwise, we would have to call reiserfs_prepare_for_journal on + ** the unformatted node, which might schedule, meaning we'd have to + ** loop all the way back up to the start of the while loop. + ** + ** The unformatted node must be dirtied later on. We can't be + ** sure here if the entire tail has been deleted yet. + ** + ** un_bh is from the page cache (all unformatted nodes are + ** from the page cache) and might be a highmem page. So, we + ** can't use un_bh->b_data. + ** -clm + */ + + data = kmap_atomic(un_bh->b_page, KM_USER0); + off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); + memcpy(data + off, + B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih), + ret_value); + kunmap_atomic(data, KM_USER0); + } + /* Perform balancing after all resources have been collected at once. */ + do_balance(&s_del_balance, NULL, NULL, M_DELETE); + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "reiserquota delete_item(): freeing %u, id=%u type=%c", + quota_cut_bytes, inode->i_uid, head2type(&s_ih)); +#endif + dquot_free_space_nodirty(inode, quota_cut_bytes); + + /* Return deleted body length */ + return ret_value; +} + +/* Summary Of Mechanisms For Handling Collisions Between Processes: + + deletion of the body of the object is performed by iput(), with the + result that if multiple processes are operating on a file, the + deletion of the body of the file is deferred until the last process + that has an open inode performs its iput(). + + writes and truncates are protected from collisions by use of + semaphores. + + creates, linking, and mknod are protected from collisions with other + processes by making the reiserfs_add_entry() the last step in the + creation, and then rolling back all changes if there was a collision. + - Hans +*/ + +/* this deletes item which never gets split */ +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, + struct inode *inode, struct reiserfs_key *key) +{ + struct tree_balance tb; + INITIALIZE_PATH(path); + int item_len = 0; + int tb_init = 0; + struct cpu_key cpu_key; + int retval; + int quota_cut_bytes = 0; + + BUG_ON(!th->t_trans_id); + + le_key2cpu_key(&cpu_key, key); + + while (1) { + retval = search_item(th->t_super, &cpu_key, &path); + if (retval == IO_ERROR) { + reiserfs_error(th->t_super, "vs-5350", + "i/o failure occurred trying " + "to delete %K", &cpu_key); + break; + } + if (retval != ITEM_FOUND) { + pathrelse(&path); + // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir + if (! 
+ ((unsigned long long) + GET_HASH_VALUE(le_key_k_offset + (le_key_version(key), key)) == 0 + && (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset + (le_key_version(key), + key)) == 1)) + reiserfs_warning(th->t_super, "vs-5355", + "%k not found", key); + break; + } + if (!tb_init) { + tb_init = 1; + item_len = ih_item_len(PATH_PITEM_HEAD(&path)); + init_tb_struct(th, &tb, th->t_super, &path, + -(IH_SIZE + item_len)); + } + quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)); + + retval = fix_nodes(M_DELETE, &tb, NULL, NULL); + if (retval == REPEAT_SEARCH) { + PROC_INFO_INC(th->t_super, delete_solid_item_restarted); + continue; + } + + if (retval == CARRY_ON) { + do_balance(&tb, NULL, NULL, M_DELETE); + if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota delete_solid_item(): freeing %u id=%u type=%c", + quota_cut_bytes, inode->i_uid, + key2type(key)); +#endif + dquot_free_space_nodirty(inode, + quota_cut_bytes); + } + break; + } + // IO_ERROR, NO_DISK_SPACE, etc + reiserfs_warning(th->t_super, "vs-5360", + "could not delete %K due to fix_nodes failure", + &cpu_key); + unfix_nodes(&tb); + break; + } + + reiserfs_check_path(&path); +} + +int reiserfs_delete_object(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + int err; + inode->i_size = 0; + BUG_ON(!th->t_trans_id); + + /* for directory this deletes item containing "." and ".." */ + err = + reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ ); + if (err) + return err; + +#if defined( USE_INODE_GENERATION_COUNTER ) + if (!old_format_only(th->t_super)) { + __le32 *inode_generation; + + inode_generation = + &REISERFS_SB(th->t_super)->s_rs->s_inode_generation; + le32_add_cpu(inode_generation, 1); + } +/* USE_INODE_GENERATION_COUNTER */ +#endif + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); + + return err; +} + +static void unmap_buffers(struct page *page, loff_t pos) +{ + struct buffer_head *bh; + struct buffer_head *head; + struct buffer_head *next; + unsigned long tail_index; + unsigned long cur_index; + + if (page) { + if (page_has_buffers(page)) { + tail_index = pos & (PAGE_CACHE_SIZE - 1); + cur_index = 0; + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + + /* we want to unmap the buffers that contain the tail, and + ** all the buffers after it (since the tail must be at the + ** end of the file). We don't want to unmap file data + ** before the tail, since it might be dirty and waiting to + ** reach disk + */ + cur_index += bh->b_size; + if (cur_index > tail_index) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + } + } +} + +static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct page *page, + struct treepath *path, + const struct cpu_key *item_key, + loff_t new_file_size, char *mode) +{ + struct super_block *sb = inode->i_sb; + int block_size = sb->s_blocksize; + int cut_bytes; + BUG_ON(!th->t_trans_id); + BUG_ON(new_file_size != inode->i_size); + + /* the page being sent in could be NULL if there was an i/o error + ** reading in the last block. 
The user will hit problems trying to + ** read the file, but for now we just skip the indirect2direct + */ + if (atomic_read(&inode->i_count) > 1 || + !tail_has_to_be_packed(inode) || + !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) { + /* leave tail in an unformatted node */ + *mode = M_SKIP_BALANCING; + cut_bytes = + block_size - (new_file_size & (block_size - 1)); + pathrelse(path); + return cut_bytes; + } + /* Perform the conversion to a direct_item. */ + /* return indirect_to_direct(inode, path, item_key, + new_file_size, mode); */ + return indirect2direct(th, inode, page, path, item_key, + new_file_size, mode); +} + +/* we did indirect_to_direct conversion. And we have inserted direct + item successesfully, but there were no disk space to cut unfm + pointer being converted. Therefore we have to delete inserted + direct item(s) */ +static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, + struct inode *inode, struct treepath *path) +{ + struct cpu_key tail_key; + int tail_len; + int removed; + BUG_ON(!th->t_trans_id); + + make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); // !!!! + tail_key.key_length = 4; + + tail_len = + (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; + while (tail_len) { + /* look for the last byte of the tail */ + if (search_for_position_by_key(inode->i_sb, &tail_key, path) == + POSITION_NOT_FOUND) + reiserfs_panic(inode->i_sb, "vs-5615", + "found invalid item"); + RFALSE(path->pos_in_item != + ih_item_len(PATH_PITEM_HEAD(path)) - 1, + "vs-5616: appended bytes found"); + PATH_LAST_POSITION(path)--; + + removed = + reiserfs_delete_item(th, path, &tail_key, inode, + NULL /*unbh not needed */ ); + RFALSE(removed <= 0 + || removed > tail_len, + "vs-5617: there was tail %d bytes, removed item length %d bytes", + tail_len, removed); + tail_len -= removed; + set_cpu_key_k_offset(&tail_key, + cpu_key_k_offset(&tail_key) - removed); + } + reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct " + "conversion has been rolled back due to " + "lack of disk space"); + //mark_file_without_tail (inode); + mark_inode_dirty(inode); +} + +/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + struct cpu_key *item_key, + struct inode *inode, + struct page *page, loff_t new_file_size) +{ + struct super_block *sb = inode->i_sb; + /* Every function which is going to call do_balance must first + create a tree_balance structure. Then it must fill up this + structure by using the init_tb_struct and fix_nodes functions. + After that we can make tree balancing. */ + struct tree_balance s_cut_balance; + struct item_head *p_le_ih; + int cut_size = 0, /* Amount to be cut. */ + ret_value = CARRY_ON, removed = 0, /* Number of the removed unformatted nodes. */ + is_inode_locked = 0; + char mode; /* Mode of the balance. */ + int retval2 = -1; + int quota_cut_bytes; + loff_t tail_pos = 0; + + BUG_ON(!th->t_trans_id); + + init_tb_struct(th, &s_cut_balance, inode->i_sb, path, + cut_size); + + /* Repeat this loop until we either cut the item without needing + to balance, or we fix_nodes without schedule occurring */ + while (1) { + /* Determine the balance mode, position of the first byte to + be cut, and size to be cut. In case of the indirect item + free unformatted nodes which are pointed to by the cut + pointers. 
*/ + + mode = + prepare_for_delete_or_cut(th, inode, path, + item_key, &removed, + &cut_size, new_file_size); + if (mode == M_CONVERT) { + /* convert last unformatted node to direct item or leave + tail in the unformatted node */ + RFALSE(ret_value != CARRY_ON, + "PAP-5570: can not convert twice"); + + ret_value = + maybe_indirect_to_direct(th, inode, page, + path, item_key, + new_file_size, &mode); + if (mode == M_SKIP_BALANCING) + /* tail has been left in the unformatted node */ + return ret_value; + + is_inode_locked = 1; + + /* removing of last unformatted node will change value we + have to return to truncate. Save it */ + retval2 = ret_value; + /*retval2 = sb->s_blocksize - (new_file_size & (sb->s_blocksize - 1)); */ + + /* So, we have performed the first part of the conversion: + inserting the new direct item. Now we are removing the + last unformatted node pointer. Set key to search for + it. */ + set_cpu_key_k_type(item_key, TYPE_INDIRECT); + item_key->key_length = 4; + new_file_size -= + (new_file_size & (sb->s_blocksize - 1)); + tail_pos = new_file_size; + set_cpu_key_k_offset(item_key, new_file_size + 1); + if (search_for_position_by_key + (sb, item_key, + path) == POSITION_NOT_FOUND) { + print_block(PATH_PLAST_BUFFER(path), 3, + PATH_LAST_POSITION(path) - 1, + PATH_LAST_POSITION(path) + 1); + reiserfs_panic(sb, "PAP-5580", "item to " + "convert does not exist (%K)", + item_key); + } + continue; + } + if (cut_size == 0) { + pathrelse(path); + return 0; + } + + s_cut_balance.insert_size[0] = cut_size; + + ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL); + if (ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(sb, cut_from_item_restarted); + + ret_value = + search_for_position_by_key(sb, item_key, path); + if (ret_value == POSITION_FOUND) + continue; + + reiserfs_warning(sb, "PAP-5610", "item %K not found", + item_key); + unfix_nodes(&s_cut_balance); + return (ret_value == IO_ERROR) ? -EIO : -ENOENT; + } /* while */ + + // check fix_nodes results (IO_ERROR or NO_DISK_SPACE) + if (ret_value != CARRY_ON) { + if (is_inode_locked) { + // FIXME: this seems to be not needed: we are always able + // to cut item + indirect_to_direct_roll_back(th, inode, path); + } + if (ret_value == NO_DISK_SPACE) + reiserfs_warning(sb, "reiserfs-5092", + "NO_DISK_SPACE"); + unfix_nodes(&s_cut_balance); + return -EIO; + } + + /* go ahead and perform balancing */ + + RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode"); + + /* Calculate number of bytes that need to be cut from the item. */ + quota_cut_bytes = + (mode == + M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance. + insert_size[0]; + if (retval2 == -1) + ret_value = calc_deleted_bytes_number(&s_cut_balance, mode); + else + ret_value = retval2; + + /* For direct items, we only change the quota when deleting the last + ** item. + */ + p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path); + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) { + if (mode == M_DELETE && + (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) == + 1) { + // FIXME: this is to keep 3.5 happy + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } + } +#ifdef CONFIG_REISERFS_CHECK + if (is_inode_locked) { + struct item_head *le_ih = + PATH_PITEM_HEAD(s_cut_balance.tb_path); + /* we are going to complete indirect2direct conversion. 
Make + sure, that we exactly remove last unformatted node pointer + of the item */ + if (!is_indirect_le_ih(le_ih)) + reiserfs_panic(sb, "vs-5652", + "item must be indirect %h", le_ih); + + if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) + reiserfs_panic(sb, "vs-5653", "completing " + "indirect2direct conversion indirect " + "item %h being deleted must be of " + "4 byte long", le_ih); + + if (mode == M_CUT + && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { + reiserfs_panic(sb, "vs-5654", "can not complete " + "indirect2direct conversion of %h " + "(CUT, insert_size==%d)", + le_ih, s_cut_balance.insert_size[0]); + } + /* it would be useful to make sure, that right neighboring + item is direct item of this file */ + } +#endif + + do_balance(&s_cut_balance, NULL, NULL, mode); + if (is_inode_locked) { + /* we've done an indirect->direct conversion. when the data block + ** was freed, it was removed from the list of blocks that must + ** be flushed before the transaction commits, make sure to + ** unmap and invalidate it + */ + unmap_buffers(page, tail_pos); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + } +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota cut_from_item(): freeing %u id=%u type=%c", + quota_cut_bytes, inode->i_uid, '?'); +#endif + dquot_free_space_nodirty(inode, quota_cut_bytes); + return ret_value; +} + +static void truncate_directory(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + BUG_ON(!th->t_trans_id); + if (inode->i_nlink) + reiserfs_error(inode->i_sb, "vs-5655", "link count != 0"); + + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY); + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); + reiserfs_update_sd(th, inode); + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA); +} + +/* Truncate file to the new size. Note, this must be called with a transaction + already started */ +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, + struct inode *inode, /* ->i_size contains new size */ + struct page *page, /* up to date for last block */ + int update_timestamps /* when it is called by + file_release to convert + the tail - no timestamps + should be updated */ + ) +{ + INITIALIZE_PATH(s_search_path); /* Path to the current object item. */ + struct item_head *p_le_ih; /* Pointer to an item header. */ + struct cpu_key s_item_key; /* Key to search for a previous file item. */ + loff_t file_size, /* Old file size. */ + new_file_size; /* New file size. */ + int deleted; /* Number of deleted or truncated bytes. */ + int retval; + int err = 0; + + BUG_ON(!th->t_trans_id); + if (! + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + return 0; + + if (S_ISDIR(inode->i_mode)) { + // deletion of directory - no need to update timestamps + truncate_directory(th, inode); + return 0; + } + + /* Get new file size. 
*/ + new_file_size = inode->i_size; + + // FIXME: note, that key type is unimportant here + make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode), + TYPE_DIRECT, 3); + + retval = + search_for_position_by_key(inode->i_sb, &s_item_key, + &s_search_path); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-5657", + "i/o failure occurred trying to truncate %K", + &s_item_key); + err = -EIO; + goto out; + } + if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { + reiserfs_error(inode->i_sb, "PAP-5660", + "wrong result %d of search for %K", retval, + &s_item_key); + + err = -EIO; + goto out; + } + + s_search_path.pos_in_item--; + + /* Get real file size (total length of all file items) */ + p_le_ih = PATH_PITEM_HEAD(&s_search_path); + if (is_statdata_le_ih(p_le_ih)) + file_size = 0; + else { + loff_t offset = le_ih_k_offset(p_le_ih); + int bytes = + op_bytes_number(p_le_ih, inode->i_sb->s_blocksize); + + /* this may mismatch with real file size: if last direct item + had no padding zeros and last unformatted node had no free + space, this file would have this file size */ + file_size = offset + bytes - 1; + } + /* + * are we doing a full truncate or delete, if so + * kick in the reada code + */ + if (new_file_size == 0) + s_search_path.reada = PATH_READA | PATH_READA_BACK; + + if (file_size == 0 || file_size < new_file_size) { + goto update_and_out; + } + + /* Update key to search for the last file item. */ + set_cpu_key_k_offset(&s_item_key, file_size); + + do { + /* Cut or delete file item. */ + deleted = + reiserfs_cut_from_item(th, &s_search_path, &s_item_key, + inode, page, new_file_size); + if (deleted < 0) { + reiserfs_warning(inode->i_sb, "vs-5665", + "reiserfs_cut_from_item failed"); + reiserfs_check_path(&s_search_path); + return 0; + } + + RFALSE(deleted > file_size, + "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", + deleted, file_size, &s_item_key); + + /* Change key to search the last file item. */ + file_size -= deleted; + + set_cpu_key_k_offset(&s_item_key, file_size); + + /* While there are bytes to truncate and previous file item is presented in the tree. */ + + /* + ** This loop could take a really long time, and could log + ** many more blocks than a transaction can hold. 
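The loop below keeps cutting items until the new size is reached, and the surrounding commentary explains why that work is split across several transactions. As a rough illustration only, here is a stand-alone, user-space sketch of that chunked-commit idea; the helpers, the step limit and the 4 KiB "item" size are invented and merely stand in for the real journal_end()/journal_begin() calls:

#include <stdio.h>
#include <stdint.h>

#define TOY_STEP_LIMIT 8		/* max cuts per "transaction" in this toy model */

struct toy_txn {
	int ops;			/* work done in the current transaction */
};

/* stand-in for journal_end() + journal_begin(): record a size the file is
   consistent at, then start counting a fresh transaction */
static void toy_commit(struct toy_txn *txn, uint64_t consistent_size)
{
	printf("commit after %d cuts, size on disk now %llu\n",
	       txn->ops, (unsigned long long)consistent_size);
	txn->ops = 0;
}

/* shrink cur_size toward new_size one 4 KiB "item" at a time */
static void toy_truncate(uint64_t cur_size, uint64_t new_size)
{
	struct toy_txn txn = { 0 };

	while (cur_size > new_size) {
		cur_size -= 4096;		/* cut one item's worth */
		txn.ops++;

		if (txn.ops >= TOY_STEP_LIMIT)	/* polite restart point */
			toy_commit(&txn, cur_size);
	}
	if (txn.ops)
		toy_commit(&txn, cur_size);	/* final commit */
}

int main(void)
{
	toy_truncate(1024 * 1024, 64 * 1024);	/* 1 MiB down to 64 KiB */
	return 0;
}
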
So, we do a polite + ** journal end here, and if the transaction needs ending, we make + ** sure the file is consistent before ending the current trans + ** and starting a new one + */ + if (journal_transaction_should_end(th, 0) || + reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { + int orig_len_alloc = th->t_blocks_allocated; + pathrelse(&s_search_path); + + if (update_timestamps) { + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; + } + reiserfs_update_sd(th, inode); + + err = journal_end(th, inode->i_sb, orig_len_alloc); + if (err) + goto out; + err = journal_begin(th, inode->i_sb, + JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ; + if (err) + goto out; + reiserfs_update_inode_transaction(inode); + } + } while (file_size > ROUND_UP(new_file_size) && + search_for_position_by_key(inode->i_sb, &s_item_key, + &s_search_path) == POSITION_FOUND); + + RFALSE(file_size > ROUND_UP(new_file_size), + "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", + new_file_size, file_size, s_item_key.on_disk_key.k_objectid); + + update_and_out: + if (update_timestamps) { + // this is truncate, not file closing + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; + } + reiserfs_update_sd(th, inode); + + out: + pathrelse(&s_search_path); + return err; +} + +#ifdef CONFIG_REISERFS_CHECK +// this makes sure, that we __append__, not overwrite or add holes +static void check_research_for_paste(struct treepath *path, + const struct cpu_key *key) +{ + struct item_head *found_ih = get_ih(path); + + if (is_direct_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(key) + || op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + pos_in_item(path)) + reiserfs_panic(NULL, "PAP-5720", "found direct item " + "%h or position (%d) does not match " + "to key %K", found_ih, + pos_in_item(path), key); + } + if (is_indirect_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(key) + || I_UNFM_NUM(found_ih) != pos_in_item(path) + || get_ih_free_space(found_ih) != 0) + reiserfs_panic(NULL, "PAP-5730", "found indirect " + "item (%h) or position (%d) does not " + "match to key (%K)", + found_ih, pos_in_item(path), key); + } +} +#endif /* config reiserfs check */ + +/* Paste bytes to the existing item. Returns bytes number pasted into the item. */ +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path, /* Path to the pasted item. */ + const struct cpu_key *key, /* Key to search for the needed item. */ + struct inode *inode, /* Inode item belongs to */ + const char *body, /* Pointer to the bytes to paste. */ + int pasted_size) +{ /* Size of pasted bytes. 
*/ + struct tree_balance s_paste_balance; + int retval; + int fs_gen; + + BUG_ON(!th->t_trans_id); + + fs_gen = get_generation(inode->i_sb); + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): allocating %u id=%u type=%c", + pasted_size, inode->i_uid, + key2type(&(key->on_disk_key))); +#endif + + retval = dquot_alloc_space_nodirty(inode, pasted_size); + if (retval) { + pathrelse(search_path); + return retval; + } + init_tb_struct(th, &s_paste_balance, th->t_super, search_path, + pasted_size); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_paste_balance.key = key->on_disk_key; +#endif + + /* DQUOT_* can schedule, must check before the fix_nodes */ + if (fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = + fix_nodes(M_PASTE, &s_paste_balance, NULL, + body)) == REPEAT_SEARCH) { + search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, paste_into_item_restarted); + retval = + search_for_position_by_key(th->t_super, key, + search_path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, "PAP-5710", + "entry or pasted byte (%K) exists", + key); + retval = -EEXIST; + goto error_out; + } +#ifdef CONFIG_REISERFS_CHECK + check_research_for_paste(search_path, key); +#endif + } + + /* Perform balancing after all resources are collected by fix_nodes, and + accessing them will not risk triggering schedule. */ + if (retval == CARRY_ON) { + do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE); + return 0; + } + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; + error_out: + /* this also releases the path */ + unfix_nodes(&s_paste_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): freeing %u id=%u type=%c", + pasted_size, inode->i_uid, + key2type(&(key->on_disk_key))); +#endif + dquot_free_space_nodirty(inode, pasted_size); + return retval; +} + +/* Insert new item into the buffer at the path. + * th - active transaction handle + * path - path to the inserted item + * ih - pointer to the item header to insert + * body - pointer to the bytes to insert + */ +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *key, + struct item_head *ih, struct inode *inode, + const char *body) +{ + struct tree_balance s_ins_balance; + int retval; + int fs_gen = 0; + int quota_bytes = 0; + + BUG_ON(!th->t_trans_id); + + if (inode) { /* Do we count quotas for item? */ + fs_gen = get_generation(inode->i_sb); + quota_bytes = ih_item_len(ih); + + /* hack so the quota code doesn't have to guess if the file has + ** a tail, links are always tails, so there's no guessing needed + */ + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih)) + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): allocating %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(ih)); +#endif + /* We can't dirty inode here. It would be immediately written but + * appropriate stat item isn't inserted yet... 
*/ + retval = dquot_alloc_space_nodirty(inode, quota_bytes); + if (retval) { + pathrelse(path); + return retval; + } + } + init_tb_struct(th, &s_ins_balance, th->t_super, path, + IH_SIZE + ih_item_len(ih)); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_ins_balance.key = key->on_disk_key; +#endif + /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ + if (inode && fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = + fix_nodes(M_INSERT, &s_ins_balance, ih, + body)) == REPEAT_SEARCH) { + search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, insert_item_restarted); + retval = search_item(th->t_super, key, path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == ITEM_FOUND) { + reiserfs_warning(th->t_super, "PAP-5760", + "key %K already exists in the tree", + key); + retval = -EEXIST; + goto error_out; + } + } + + /* make balancing after all resources will be collected at a time */ + if (retval == CARRY_ON) { + do_balance(&s_ins_balance, ih, body, M_INSERT); + return 0; + } + + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; + error_out: + /* also releases the path */ + unfix_nodes(&s_ins_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): freeing %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(ih)); +#endif + if (inode) + dquot_free_space_nodirty(inode, quota_bytes); + return retval; +} diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c new file mode 100644 index 00000000..f19dfbf6 --- /dev/null +++ b/fs/reiserfs/super.c @@ -0,0 +1,2271 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * Trivial changes by Alan Cox to add the LFS fixes + * + * Trivial Changes: + * Rights granted to Hans Reiser to redistribute under other terms providing + * he accepts all liability including but not limited to patent, fitness + * for purpose, and direct or indirect claims arising from failure to perform. 
+ * + * NO WARRANTY + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct file_system_type reiserfs_fs_type; + +static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING; +static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING; +static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING; + +int is_reiserfs_3_5(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string, + strlen(reiserfs_3_5_magic_string)); +} + +int is_reiserfs_3_6(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string, + strlen(reiserfs_3_6_magic_string)); +} + +int is_reiserfs_jr(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string, + strlen(reiserfs_jr_magic_string)); +} + +static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) +{ + return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) || + is_reiserfs_jr(rs)); +} + +static int reiserfs_remount(struct super_block *s, int *flags, char *data); +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); + +static int reiserfs_sync_fs(struct super_block *s, int wait) +{ + struct reiserfs_transaction_handle th; + + reiserfs_write_lock(s); + if (!journal_begin(&th, s, 1)) + if (!journal_end_sync(&th, s, 1)) + reiserfs_flush_old_commits(s); + s->s_dirt = 0; /* Even if it's not true. + * We'll loop forever in sync_supers otherwise */ + reiserfs_write_unlock(s); + return 0; +} + +static void reiserfs_write_super(struct super_block *s) +{ + reiserfs_sync_fs(s, 1); +} + +static int reiserfs_freeze(struct super_block *s) +{ + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + if (!(s->s_flags & MS_RDONLY)) { + int err = journal_begin(&th, s, 1); + if (err) { + reiserfs_block_writes(&th); + } else { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + reiserfs_block_writes(&th); + journal_end_sync(&th, s, 1); + } + } + s->s_dirt = 0; + reiserfs_write_unlock(s); + return 0; +} + +static int reiserfs_unfreeze(struct super_block *s) +{ + reiserfs_allow_writes(s); + return 0; +} + +extern const struct in_core_key MAX_IN_CORE_KEY; + +/* this is used to delete "save link" when there are no items of a + file it points to. 
It can either happen if unlink is completed but + "save unlink" removal, or if file has both unlink and truncate + pending and as unlink completes first (because key of "save link" + protecting unlink is bigger that a key lf "save link" which + protects truncate), so there left no items to make truncate + completion on */ +static int remove_save_link_only(struct super_block *s, + struct reiserfs_key *key, int oid_free) +{ + struct reiserfs_transaction_handle th; + int err; + + /* we are going to do one balancing */ + err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + reiserfs_delete_solid_item(&th, NULL, key); + if (oid_free) + /* removals are protected by direct items */ + reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid)); + + return journal_end(&th, s, JOURNAL_PER_BALANCE_CNT); +} + +#ifdef CONFIG_QUOTA +static int reiserfs_quota_on_mount(struct super_block *, int); +#endif + +/* look for uncompleted unlinks and truncates and complete them */ +static int finish_unfinished(struct super_block *s) +{ + INITIALIZE_PATH(path); + struct cpu_key max_cpu_key, obj_key; + struct reiserfs_key save_link_key, last_inode_key; + int retval = 0; + struct item_head *ih; + struct buffer_head *bh; + int item_pos; + char *item; + int done; + struct inode *inode; + int truncate; +#ifdef CONFIG_QUOTA + int i; + int ms_active_set; + int quota_enabled[MAXQUOTAS]; +#endif + + /* compose key to look for "save" links */ + max_cpu_key.version = KEY_FORMAT_3_5; + max_cpu_key.on_disk_key.k_dir_id = ~0U; + max_cpu_key.on_disk_key.k_objectid = ~0U; + set_cpu_key_k_offset(&max_cpu_key, ~0U); + max_cpu_key.key_length = 3; + + memset(&last_inode_key, 0, sizeof(last_inode_key)); + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + if (s->s_flags & MS_ACTIVE) { + ms_active_set = 0; + } else { + ms_active_set = 1; + s->s_flags |= MS_ACTIVE; + } + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + quota_enabled[i] = 1; + if (REISERFS_SB(s)->s_qf_names[i]) { + int ret; + + if (sb_has_quota_active(s, i)) { + quota_enabled[i] = 0; + continue; + } + ret = reiserfs_quota_on_mount(s, i); + if (ret < 0) + reiserfs_warning(s, "reiserfs-2500", + "cannot turn on journaled " + "quota: error %d", ret); + } + } +#endif + + done = 0; + REISERFS_SB(s)->s_is_unlinked_ok = 1; + while (!retval) { + retval = search_item(s, &max_cpu_key, &path); + if (retval != ITEM_NOT_FOUND) { + reiserfs_error(s, "vs-2140", + "search_by_key returned %d", retval); + break; + } + + bh = get_last_bh(&path); + item_pos = get_item_pos(&path); + if (item_pos != B_NR_ITEMS(bh)) { + reiserfs_warning(s, "vs-2060", + "wrong position found"); + break; + } + item_pos--; + ih = B_N_PITEM_HEAD(bh, item_pos); + + if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) + /* there are no "save" links anymore */ + break; + + save_link_key = ih->ih_key; + if (is_indirect_le_ih(ih)) + truncate = 1; + else + truncate = 0; + + /* reiserfs_iget needs k_dirid and k_objectid only */ + item = B_I_PITEM(bh, ih); + obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item); + obj_key.on_disk_key.k_objectid = + le32_to_cpu(ih->ih_key.k_objectid); + obj_key.on_disk_key.k_offset = 0; + obj_key.on_disk_key.k_type = 0; + + pathrelse(&path); + + inode = reiserfs_iget(s, &obj_key); + if (!inode) { + /* the unlink almost completed, it just did not manage to remove + "save" link and release objectid */ + reiserfs_warning(s, "vs-2180", "iget failed for %K", + &obj_key); + 
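The scan above recognizes a pending operation purely from the shape of the save link: its key lives under the reserved MAX_KEY_OBJECTID directory id, and an indirect item means an interrupted truncate while a direct item means an interrupted unlink (add_save_link further down builds the links the same way). A minimal user-space model of that convention follows; the struct, constants and sizes are illustrative only, not the real on-disk key layout:

#include <stdio.h>
#include <stdint.h>

#define SKETCH_MAX_KEY_OBJECTID 0xffffffffU	/* stand-in for MAX_KEY_OBJECTID */

enum sketch_item_type { SKETCH_DIRECT, SKETCH_INDIRECT };

struct sketch_key {
	uint32_t dir_id;	/* MAX_KEY_OBJECTID marks a save link */
	uint32_t objectid;	/* object id of the file being protected */
	uint64_t offset;	/* 1 for truncate links, blocksize + 1 for unlink links */
	enum sketch_item_type type;
};

/* key of a save link protecting an unfinished unlink */
static struct sketch_key unlink_save_link(uint32_t objectid, unsigned int blocksize)
{
	struct sketch_key k = {
		.dir_id = SKETCH_MAX_KEY_OBJECTID,
		.objectid = objectid,
		.offset = 1 + blocksize,
		.type = SKETCH_DIRECT,
	};
	return k;
}

/* key of a save link protecting an unfinished truncate */
static struct sketch_key truncate_save_link(uint32_t objectid)
{
	struct sketch_key k = {
		.dir_id = SKETCH_MAX_KEY_OBJECTID,
		.objectid = objectid,
		.offset = 1,
		.type = SKETCH_INDIRECT,
	};
	return k;
}

/* mirror of the decision the mount-time scan makes: indirect item -> finish
   a truncate, direct item -> finish an unlink */
static const char *pending_work(const struct sketch_key *k)
{
	if (k->dir_id != SKETCH_MAX_KEY_OBJECTID)
		return "not a save link";
	return k->type == SKETCH_INDIRECT ? "complete truncate" : "complete unlink";
}

int main(void)
{
	struct sketch_key u = unlink_save_link(1234, 4096);
	struct sketch_key t = truncate_save_link(1234);

	printf("unlink link:   offset %llu -> %s\n",
	       (unsigned long long)u.offset, pending_work(&u));
	printf("truncate link: offset %llu -> %s\n",
	       (unsigned long long)t.offset, pending_work(&t));
	return 0;
}
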
retval = remove_save_link_only(s, &save_link_key, 1); + continue; + } + + if (!truncate && inode->i_nlink) { + /* file is not unlinked */ + reiserfs_warning(s, "vs-2185", + "file %K is not unlinked", + &obj_key); + retval = remove_save_link_only(s, &save_link_key, 0); + continue; + } + dquot_initialize(inode); + + if (truncate && S_ISDIR(inode->i_mode)) { + /* We got a truncate request for a dir which is impossible. + The only imaginable way is to execute unfinished truncate request + then boot into old kernel, remove the file and create dir with + the same key. */ + reiserfs_warning(s, "green-2101", + "impossible truncate on a " + "directory %k. Please report", + INODE_PKEY(inode)); + retval = remove_save_link_only(s, &save_link_key, 0); + truncate = 0; + iput(inode); + continue; + } + + if (truncate) { + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + /* not completed truncate found. New size was committed together + with "save" link */ + reiserfs_info(s, "Truncating %k to %Ld ..", + INODE_PKEY(inode), inode->i_size); + reiserfs_truncate_file(inode, + 0 + /*don't update modification time */ + ); + retval = remove_save_link(inode, truncate); + } else { + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + /* not completed unlink (rmdir) found */ + reiserfs_info(s, "Removing %k..", INODE_PKEY(inode)); + if (memcmp(&last_inode_key, INODE_PKEY(inode), + sizeof(last_inode_key))){ + last_inode_key = *INODE_PKEY(inode); + /* removal gets completed in iput */ + retval = 0; + } else { + reiserfs_warning(s, "super-2189", "Dead loop " + "in finish_unfinished " + "detected, just remove " + "save link\n"); + retval = remove_save_link_only(s, + &save_link_key, 0); + } + } + + iput(inode); + printk("done\n"); + done++; + } + REISERFS_SB(s)->s_is_unlinked_ok = 0; + +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(s)->files[i] && quota_enabled[i]) + dquot_quota_off(s, i); + } + if (ms_active_set) + /* Restore the flag back */ + s->s_flags &= ~MS_ACTIVE; +#endif + pathrelse(&path); + if (done) + reiserfs_info(s, "There were %d uncompleted unlinks/truncates. " + "Completed\n", done); + return retval; +} + +/* to protect file being unlinked from getting lost we "safe" link files + being unlinked. This link will be deleted in the same transaction with last + item of file. 
mounting the filesystem we scan all these links and remove + files which almost got lost */ +void add_save_link(struct reiserfs_transaction_handle *th, + struct inode *inode, int truncate) +{ + INITIALIZE_PATH(path); + int retval; + struct cpu_key key; + struct item_head ih; + __le32 link; + + BUG_ON(!th->t_trans_id); + + /* file can only get one "save link" of each kind */ + RFALSE(truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask), + "saved link already exists for truncated inode %lx", + (long)inode->i_ino); + RFALSE(!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask), + "saved link already exists for unlinked inode %lx", + (long)inode->i_ino); + + /* setup key of "save" link */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID; + key.on_disk_key.k_objectid = inode->i_ino; + if (!truncate) { + /* unlink, rmdir, rename */ + set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize); + set_cpu_key_k_type(&key, TYPE_DIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, + 1 + inode->i_sb->s_blocksize, TYPE_DIRECT, + 4 /*length */ , 0xffff /*free space */ ); + } else { + /* truncate */ + if (S_ISDIR(inode->i_mode)) + reiserfs_warning(inode->i_sb, "green-2102", + "Adding a truncate savelink for " + "a directory %k! Please report", + INODE_PKEY(inode)); + set_cpu_key_k_offset(&key, 1); + set_cpu_key_k_type(&key, TYPE_INDIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT, + 4 /*length */ , 0 /*free space */ ); + } + key.key_length = 3; + + /* look for its place in the tree */ + retval = search_item(inode->i_sb, &key, &path); + if (retval != ITEM_NOT_FOUND) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, "vs-2100", + "search_by_key (%K) returned %d", &key, + retval); + pathrelse(&path); + return; + } + + /* body of "save" link */ + link = INODE_PKEY(inode)->k_dir_id; + + /* put "save" link into tree, don't charge quota to anyone */ + retval = + reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); + if (retval) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, "vs-2120", + "insert_item returned %d", retval); + } else { + if (truncate) + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + else + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + } +} + +/* this opens transaction unlike add_save_link */ +int remove_save_link(struct inode *inode, int truncate) +{ + struct reiserfs_transaction_handle th; + struct reiserfs_key key; + int err; + + /* we are going to do one balancing only */ + err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + /* setup key of "save" link */ + key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID); + key.k_objectid = INODE_PKEY(inode)->k_objectid; + if (!truncate) { + /* unlink, rmdir, rename */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, + 1 + inode->i_sb->s_blocksize); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT); + } else { + /* truncate */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT); + } + + if ((truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) || + (!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask))) + /* don't take quota bytes from anywhere */ + reiserfs_delete_solid_item(&th, NULL, &key); + if (!truncate) { + reiserfs_release_objectid(&th, inode->i_ino); + REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask; + } else + 
REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask; + + return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); +} + +static void reiserfs_kill_sb(struct super_block *s) +{ + if (REISERFS_SB(s)) { + /* + * Force any pending inode evictions to occur now. Any + * inodes to be removed that have extended attributes + * associated with them need to clean them up before + * we can release the extended attribute root dentries. + * shrink_dcache_for_umount will BUG if we don't release + * those before it's called so ->put_super is too late. + */ + shrink_dcache_sb(s); + + dput(REISERFS_SB(s)->xattr_root); + REISERFS_SB(s)->xattr_root = NULL; + dput(REISERFS_SB(s)->priv_root); + REISERFS_SB(s)->priv_root = NULL; + } + + kill_block_super(s); +} + +static void reiserfs_put_super(struct super_block *s) +{ + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; + + dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + reiserfs_write_lock(s); + + if (s->s_dirt) + reiserfs_write_super(s); + + /* change file system state to current state if it was mounted with read-write permissions */ + if (!(s->s_flags & MS_RDONLY)) { + if (!journal_begin(&th, s, 10)) { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + set_sb_umount_state(SB_DISK_SUPER_BLOCK(s), + REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + } + } + + /* note, journal_release checks for readonly mount, and can decide not + ** to do a journal_end + */ + journal_release(&th, s); + + reiserfs_free_bitmap_cache(s); + + brelse(SB_BUFFER_WITH_SB(s)); + + print_statistics(s); + + if (REISERFS_SB(s)->reserved_blocks != 0) { + reiserfs_warning(s, "green-2005", "reserved blocks left %d", + REISERFS_SB(s)->reserved_blocks); + } + + reiserfs_proc_info_done(s); + + reiserfs_write_unlock(s); + mutex_destroy(&REISERFS_SB(s)->lock); + kfree(s->s_fs_info); + s->s_fs_info = NULL; +} + +static struct kmem_cache *reiserfs_inode_cachep; + +static struct inode *reiserfs_alloc_inode(struct super_block *sb) +{ + struct reiserfs_inode_info *ei; + ei = (struct reiserfs_inode_info *) + kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + atomic_set(&ei->openers, 0); + mutex_init(&ei->tailpack); + return &ei->vfs_inode; +} + +static void reiserfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); +} + +static void reiserfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, reiserfs_i_callback); +} + +static void init_once(void *foo) +{ + struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; + + INIT_LIST_HEAD(&ei->i_prealloc_list); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", + sizeof(struct + reiserfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (reiserfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(reiserfs_inode_cachep); +} + +/* we don't mark inodes dirty, we just log them */ +static void reiserfs_dirty_inode(struct inode *inode, int flags) +{ + struct reiserfs_transaction_handle th; + + int err = 0; + int lock_depth; + + if (inode->i_sb->s_flags & MS_RDONLY) { + reiserfs_warning(inode->i_sb, "clm-6006", + "writing inode %lu on readonly FS", + inode->i_ino); + return; + } + lock_depth 
= reiserfs_write_lock_once(inode->i_sb); + + /* this is really only used for atime updates, so they don't have + ** to be included in O_SYNC or fsync + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) + goto out; + + reiserfs_update_sd(&th, inode); + journal_end(&th, inode->i_sb, 1); + +out: + reiserfs_write_unlock_once(inode->i_sb, lock_depth); +} + +#ifdef CONFIG_QUOTA +static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, + size_t, loff_t); +static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, + loff_t); +#endif + +static const struct super_operations reiserfs_sops = { + .alloc_inode = reiserfs_alloc_inode, + .destroy_inode = reiserfs_destroy_inode, + .write_inode = reiserfs_write_inode, + .dirty_inode = reiserfs_dirty_inode, + .evict_inode = reiserfs_evict_inode, + .put_super = reiserfs_put_super, + .write_super = reiserfs_write_super, + .sync_fs = reiserfs_sync_fs, + .freeze_fs = reiserfs_freeze, + .unfreeze_fs = reiserfs_unfreeze, + .statfs = reiserfs_statfs, + .remount_fs = reiserfs_remount, + .show_options = generic_show_options, +#ifdef CONFIG_QUOTA + .quota_read = reiserfs_quota_read, + .quota_write = reiserfs_quota_write, +#endif +}; + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") + +static int reiserfs_write_dquot(struct dquot *); +static int reiserfs_acquire_dquot(struct dquot *); +static int reiserfs_release_dquot(struct dquot *); +static int reiserfs_mark_dquot_dirty(struct dquot *); +static int reiserfs_write_info(struct super_block *, int); +static int reiserfs_quota_on(struct super_block *, int, int, struct path *); + +static const struct dquot_operations reiserfs_quota_operations = { + .write_dquot = reiserfs_write_dquot, + .acquire_dquot = reiserfs_acquire_dquot, + .release_dquot = reiserfs_release_dquot, + .mark_dirty = reiserfs_mark_dquot_dirty, + .write_info = reiserfs_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops reiserfs_qctl_operations = { + .quota_on = reiserfs_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#endif + +static const struct export_operations reiserfs_export_ops = { + .encode_fh = reiserfs_encode_fh, + .fh_to_dentry = reiserfs_fh_to_dentry, + .fh_to_parent = reiserfs_fh_to_parent, + .get_parent = reiserfs_get_parent, +}; + +/* this struct is used in reiserfs_getopt () for containing the value for those + mount options that have values rather than being toggles. */ +typedef struct { + char *value; + int setmask; /* bitmask which is to set on mount_options bitmask when this + value is found, 0 is no bits are to be changed. */ + int clrmask; /* bitmask which is to clear on mount_options bitmask when this + value is found, 0 is no bits are to be changed. This is + applied BEFORE setmask */ +} arg_desc_t; + +/* Set this bit in arg_required to allow empty arguments */ +#define REISERFS_OPT_ALLOWEMPTY 31 + +/* this struct is used in reiserfs_getopt() for describing the set of reiserfs + mount options */ +typedef struct { + char *option_name; + int arg_required; /* 0 if argument is not required, not 0 otherwise */ + const arg_desc_t *values; /* list of values accepted by an option */ + int setmask; /* bitmask which is to set on mount_options bitmask when this + value is found, 0 is no bits are to be changed. 
*/ + int clrmask; /* bitmask which is to clear on mount_options bitmask when this + value is found, 0 is no bits are to be changed. This is + applied BEFORE setmask */ +} opt_desc_t; + +/* possible values for -o data= */ +static const arg_desc_t logging_mode[] = { + {"ordered", 1 << REISERFS_DATA_ORDERED, + (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)}, + {"journal", 1 << REISERFS_DATA_LOG, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)}, + {"writeback", 1 << REISERFS_DATA_WRITEBACK, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)}, + {.value = NULL} +}; + +/* possible values for -o barrier= */ +static const arg_desc_t barrier_mode[] = { + {"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH}, + {"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE}, + {.value = NULL} +}; + +/* possible values for "-o block-allocator=" and bits which are to be set in + s_mount_opt of reiserfs specific part of in-core super block */ +static const arg_desc_t balloc[] = { + {"noborder", 1 << REISERFS_NO_BORDER, 0}, + {"border", 0, 1 << REISERFS_NO_BORDER}, + {"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0}, + {"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0}, + {"test4", 1 << REISERFS_TEST4, 0}, + {"notest4", 0, 1 << REISERFS_TEST4}, + {NULL, 0, 0} +}; + +static const arg_desc_t tails[] = { + {"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL}, + {"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL}, + {NULL, 0, 0} +}; + +static const arg_desc_t error_actions[] = { + {"panic", 1 << REISERFS_ERROR_PANIC, + (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)}, + {"ro-remount", 1 << REISERFS_ERROR_RO, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)}, +#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG + {"continue", 1 << REISERFS_ERROR_CONTINUE, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)}, +#endif + {NULL, 0, 0}, +}; + +/* proceed only one option from a list *cur - string containing of mount options + opts - array of options which are accepted + opt_arg - if option is found and requires an argument and if it is specifed + in the input - pointer to the argument is stored here + bit_flags - if option requires to set a certain bit - it is set here + return -1 if unknown option is found, opt->arg_required otherwise */ +static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, + char **opt_arg, unsigned long *bit_flags) +{ + char *p; + /* foo=bar, + ^ ^ ^ + | | +-- option_end + | +-- arg_start + +-- option_start + */ + const opt_desc_t *opt; + const arg_desc_t *arg; + + p = *cur; + + /* assume argument cannot contain commas */ + *cur = strchr(p, ','); + if (*cur) { + *(*cur) = '\0'; + (*cur)++; + } + + if (!strncmp(p, "alloc=", 6)) { + /* Ugly special case, probably we should redo options parser so that + it can understand several arguments for some options, also so that + it can fill several bitfields with option values. 
*/ + if (reiserfs_parse_alloc_options(s, p + 6)) { + return -1; + } else { + return 0; + } + } + + /* for every option in the list */ + for (opt = opts; opt->option_name; opt++) { + if (!strncmp(p, opt->option_name, strlen(opt->option_name))) { + if (bit_flags) { + if (opt->clrmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "super-6500", + "%s not supported.\n", + p); + else + *bit_flags &= ~opt->clrmask; + if (opt->setmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "super-6501", + "%s not supported.\n", + p); + else + *bit_flags |= opt->setmask; + } + break; + } + } + if (!opt->option_name) { + reiserfs_warning(s, "super-6502", + "unknown mount option \"%s\"", p); + return -1; + } + + p += strlen(opt->option_name); + switch (*p) { + case '=': + if (!opt->arg_required) { + reiserfs_warning(s, "super-6503", + "the option \"%s\" does not " + "require an argument\n", + opt->option_name); + return -1; + } + break; + + case 0: + if (opt->arg_required) { + reiserfs_warning(s, "super-6504", + "the option \"%s\" requires an " + "argument\n", opt->option_name); + return -1; + } + break; + default: + reiserfs_warning(s, "super-6505", + "head of option \"%s\" is only correct\n", + opt->option_name); + return -1; + } + + /* move to the argument, or to next option if argument is not required */ + p++; + + if (opt->arg_required + && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY)) + && !strlen(p)) { + /* this catches "option=," if not allowed */ + reiserfs_warning(s, "super-6506", + "empty argument for \"%s\"\n", + opt->option_name); + return -1; + } + + if (!opt->values) { + /* *=NULLopt_arg contains pointer to argument */ + *opt_arg = p; + return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY); + } + + /* values possible for this option are listed in opt->values */ + for (arg = opt->values; arg->value; arg++) { + if (!strcmp(p, arg->value)) { + if (bit_flags) { + *bit_flags &= ~arg->clrmask; + *bit_flags |= arg->setmask; + } + return opt->arg_required; + } + } + + reiserfs_warning(s, "super-6506", + "bad value \"%s\" for option \"%s\"\n", p, + opt->option_name); + return -1; +} + +/* returns 0 if something is wrong in option string, 1 - otherwise */ +static int reiserfs_parse_options(struct super_block *s, char *options, /* string given via mount's -o */ + unsigned long *mount_options, + /* after the parsing phase, contains the + collection of bitflags defining what + mount options were selected. 
*/ + unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ + char **jdev_name, + unsigned int *commit_max_age, + char **qf_names, + unsigned int *qfmt) +{ + int c; + char *arg = NULL; + char *pos; + opt_desc_t opts[] = { + /* Compatibility stuff, so that -o notail for old setups still work */ + {"tails",.arg_required = 't',.values = tails}, + {"notail",.clrmask = + (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"conv",.setmask = 1 << REISERFS_CONVERT}, + {"attrs",.setmask = 1 << REISERFS_ATTRS}, + {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, + {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, +#ifdef CONFIG_REISERFS_FS_XATTR + {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, + {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, +#else + {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + {"acl",.setmask = 1 << REISERFS_POSIXACL}, + {"noacl",.clrmask = 1 << REISERFS_POSIXACL}, +#else + {"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, +#endif + {.option_name = "nolog"}, + {"replayonly",.setmask = 1 << REPLAYONLY}, + {"block-allocator",.arg_required = 'a',.values = balloc}, + {"data",.arg_required = 'd',.values = logging_mode}, + {"barrier",.arg_required = 'b',.values = barrier_mode}, + {"resize",.arg_required = 'r',.values = NULL}, + {"jdev",.arg_required = 'j',.values = NULL}, + {"nolargeio",.arg_required = 'w',.values = NULL}, + {"commit",.arg_required = 'c',.values = NULL}, + {"usrquota",.setmask = 1 << REISERFS_QUOTA}, + {"grpquota",.setmask = 1 << REISERFS_QUOTA}, + {"noquota",.clrmask = 1 << REISERFS_QUOTA}, + {"errors",.arg_required = 'e',.values = error_actions}, + {"usrjquota",.arg_required = + 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"grpjquota",.arg_required = + 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"jqfmt",.arg_required = 'f',.values = NULL}, + {.option_name = NULL} + }; + + *blocks = 0; + if (!options || !*options) + /* use default configuration: create tails, journaling on, no + conversion to newest format */ + return 1; + + for (pos = options; pos;) { + c = reiserfs_getopt(s, &pos, opts, &arg, mount_options); + if (c == -1) + /* wrong option is given */ + return 0; + + if (c == 'r') { + char *p; + + p = NULL; + /* "resize=NNN" or "resize=auto" */ + + if (!strcmp(arg, "auto")) { + /* From JFS code, to auto-get the size. */ + *blocks = + s->s_bdev->bd_inode->i_size >> s-> + s_blocksize_bits; + } else { + *blocks = simple_strtoul(arg, &p, 0); + if (*p != '\0') { + /* NNN does not look like a number */ + reiserfs_warning(s, "super-6507", + "bad value %s for " + "-oresize\n", arg); + return 0; + } + } + } + + if (c == 'c') { + char *p = NULL; + unsigned long val = simple_strtoul(arg, &p, 0); + /* commit=NNN (time in seconds) */ + if (*p != '\0' || val >= (unsigned int)-1) { + reiserfs_warning(s, "super-6508", + "bad value %s for -ocommit\n", + arg); + return 0; + } + *commit_max_age = (unsigned int)val; + } + + if (c == 'w') { + reiserfs_warning(s, "super-6509", "nolargeio option " + "is no longer supported"); + return 0; + } + + if (c == 'j') { + if (arg && *arg && jdev_name) { + if (*jdev_name) { //Hm, already assigned? 
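For orientation, reiserfs_getopt() above works token by token: it splits the option string at the next comma, looks the token up in the option table, and folds the entry's clear mask and then its set mask into the mount bitmask. The following stand-alone sketch shows just that core loop with invented option names and bit values, and deliberately leaves out the "=argument" handling the real parser performs:

#include <stdio.h>
#include <string.h>

struct sketch_opt {
	const char *name;
	unsigned long setmask;
	unsigned long clrmask;
};

/* illustrative table; names and bits are made up */
static const struct sketch_opt sketch_opts[] = {
	{ "tails_on",  0x1, 0x2 },
	{ "tails_off", 0x0, 0x3 },
	{ "acl",       0x4, 0x0 },
	{ "noacl",     0x0, 0x4 },
	{ NULL, 0, 0 }
};

/* parse "opt1,opt2,..." into a bitmask; returns 0 on success, -1 on an
   unknown option, mirroring the -1 convention of the real parser */
static int sketch_parse(char *options, unsigned long *bits)
{
	char *cur = options;

	while (cur && *cur) {
		char *next = strchr(cur, ',');
		const struct sketch_opt *o;

		if (next)
			*next++ = '\0';		/* terminate this token, advance */

		for (o = sketch_opts; o->name; o++)
			if (strcmp(cur, o->name) == 0)
				break;
		if (!o->name) {
			fprintf(stderr, "unknown mount option \"%s\"\n", cur);
			return -1;
		}
		*bits &= ~o->clrmask;		/* clear mask first ... */
		*bits |= o->setmask;		/* ... then set mask */
		cur = next;
	}
	return 0;
}

int main(void)
{
	char opts[] = "acl,tails_off,noacl";
	unsigned long bits = 0;

	if (sketch_parse(opts, &bits) == 0)
		printf("resulting mount bits: 0x%lx\n", bits);
	return 0;
}
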
+ reiserfs_warning(s, "super-6510", + "journal device was " + "already specified to " + "be %s", *jdev_name); + return 0; + } + *jdev_name = arg; + } + } +#ifdef CONFIG_QUOTA + if (c == 'u' || c == 'g') { + int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; + + if (sb_any_quota_loaded(s) && + (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { + reiserfs_warning(s, "super-6511", + "cannot change journaled " + "quota options when quota " + "turned on."); + return 0; + } + if (*arg) { /* Some filename specified? */ + if (REISERFS_SB(s)->s_qf_names[qtype] + && strcmp(REISERFS_SB(s)->s_qf_names[qtype], + arg)) { + reiserfs_warning(s, "super-6512", + "%s quota file " + "already specified.", + QTYPE2NAME(qtype)); + return 0; + } + if (strchr(arg, '/')) { + reiserfs_warning(s, "super-6513", + "quotafile must be " + "on filesystem root."); + return 0; + } + qf_names[qtype] = + kmalloc(strlen(arg) + 1, GFP_KERNEL); + if (!qf_names[qtype]) { + reiserfs_warning(s, "reiserfs-2502", + "not enough memory " + "for storing " + "quotafile name."); + return 0; + } + strcpy(qf_names[qtype], arg); + *mount_options |= 1 << REISERFS_QUOTA; + } else { + if (qf_names[qtype] != + REISERFS_SB(s)->s_qf_names[qtype]) + kfree(qf_names[qtype]); + qf_names[qtype] = NULL; + } + } + if (c == 'f') { + if (!strcmp(arg, "vfsold")) + *qfmt = QFMT_VFS_OLD; + else if (!strcmp(arg, "vfsv0")) + *qfmt = QFMT_VFS_V0; + else { + reiserfs_warning(s, "super-6514", + "unknown quota format " + "specified."); + return 0; + } + if (sb_any_quota_loaded(s) && + *qfmt != REISERFS_SB(s)->s_jquota_fmt) { + reiserfs_warning(s, "super-6515", + "cannot change journaled " + "quota options when quota " + "turned on."); + return 0; + } + } +#else + if (c == 'u' || c == 'g' || c == 'f') { + reiserfs_warning(s, "reiserfs-2503", "journaled " + "quota options not supported."); + return 0; + } +#endif + } + +#ifdef CONFIG_QUOTA + if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt + && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { + reiserfs_warning(s, "super-6515", + "journaled quota format not specified."); + return 0; + } + /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ + if (!(*mount_options & (1 << REISERFS_QUOTA)) + && sb_any_quota_loaded(s)) { + reiserfs_warning(s, "super-6516", "quota options must " + "be present when quota is turned on."); + return 0; + } +#endif + + return 1; +} + +static void switch_data_mode(struct super_block *s, unsigned long mode) +{ + REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | + (1 << REISERFS_DATA_ORDERED) | + (1 << REISERFS_DATA_WRITEBACK)); + REISERFS_SB(s)->s_mount_opt |= (1 << mode); +} + +static void handle_data_mode(struct super_block *s, unsigned long mount_options) +{ + if (mount_options & (1 << REISERFS_DATA_LOG)) { + if (!reiserfs_data_log(s)) { + switch_data_mode(s, REISERFS_DATA_LOG); + reiserfs_info(s, "switching to journaled data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { + if (!reiserfs_data_ordered(s)) { + switch_data_mode(s, REISERFS_DATA_ORDERED); + reiserfs_info(s, "switching to ordered data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { + if (!reiserfs_data_writeback(s)) { + switch_data_mode(s, REISERFS_DATA_WRITEBACK); + reiserfs_info(s, "switching to writeback data mode\n"); + } + } +} + +static void handle_barrier_mode(struct super_block *s, unsigned long bits) +{ + int flush = (1 << REISERFS_BARRIER_FLUSH); + int none = (1 << REISERFS_BARRIER_NONE); + int all_barrier = flush | none; + + if 
(bits & all_barrier) { + REISERFS_SB(s)->s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= none; + printk("reiserfs: write barriers turned off\n"); + } + } +} + +static void handle_attrs(struct super_block *s) +{ + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + + if (reiserfs_attrs(s)) { + if (old_format_only(s)) { + reiserfs_warning(s, "super-6517", "cannot support " + "attributes on 3.5.x disk format"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); + return; + } + if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) { + reiserfs_warning(s, "super-6518", "cannot support " + "attributes until flag is set in " + "super-block"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); + } + } +} + +#ifdef CONFIG_QUOTA +static void handle_quota_files(struct super_block *s, char **qf_names, + unsigned int *qfmt) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(REISERFS_SB(s)->s_qf_names[i]); + REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; + } + if (*qfmt) + REISERFS_SB(s)->s_jquota_fmt = *qfmt; +} +#endif + +static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) +{ + struct reiserfs_super_block *rs; + struct reiserfs_transaction_handle th; + unsigned long blocks; + unsigned long mount_options = REISERFS_SB(s)->s_mount_opt; + unsigned long safe_mask = 0; + unsigned int commit_max_age = (unsigned int)-1; + struct reiserfs_journal *journal = SB_JOURNAL(s); + char *new_opts = kstrdup(arg, GFP_KERNEL); + int err; + char *qf_names[MAXQUOTAS]; + unsigned int qfmt = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + + reiserfs_write_lock(s); + +#ifdef CONFIG_QUOTA + memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); +#endif + + rs = SB_DISK_SUPER_BLOCK(s); + + if (!reiserfs_parse_options + (s, arg, &mount_options, &blocks, NULL, &commit_max_age, + qf_names, &qfmt)) { +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(qf_names[i]); +#endif + err = -EINVAL; + goto out_err; + } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif + + handle_attrs(s); + + /* Add options that are safe here */ + safe_mask |= 1 << REISERFS_SMALLTAIL; + safe_mask |= 1 << REISERFS_LARGETAIL; + safe_mask |= 1 << REISERFS_NO_BORDER; + safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION; + safe_mask |= 1 << REISERFS_HASHED_RELOCATION; + safe_mask |= 1 << REISERFS_TEST4; + safe_mask |= 1 << REISERFS_ATTRS; + safe_mask |= 1 << REISERFS_XATTRS_USER; + safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; + safe_mask |= 1 << REISERFS_ERROR_RO; + safe_mask |= 1 << REISERFS_ERROR_CONTINUE; + safe_mask |= 1 << REISERFS_ERROR_PANIC; + safe_mask |= 1 << REISERFS_QUOTA; + + /* Update the bitmask, taking care to keep + * the bits we're not allowed to change here */ + REISERFS_SB(s)->s_mount_opt = + (REISERFS_SB(s)-> + s_mount_opt & ~safe_mask) | (mount_options & safe_mask); + + if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } else if (commit_max_age == 0) { + /* 0 means restore defaults. 
*/ + journal->j_max_commit_age = journal->j_default_max_commit_age; + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + } + + if (blocks) { + err = reiserfs_resize(s, blocks); + if (err != 0) + goto out_err; + } + + if (*mount_flags & MS_RDONLY) { + reiserfs_xattr_init(s, *mount_flags); + /* remount read-only */ + if (s->s_flags & MS_RDONLY) + /* it is read-only already */ + goto out_ok; + + err = dquot_suspend(s, -1); + if (err < 0) + goto out_err; + + /* try to remount file system with read-only permissions */ + if (sb_umount_state(rs) == REISERFS_VALID_FS + || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { + goto out_ok; + } + + err = journal_begin(&th, s, 10); + if (err) + goto out_err; + + /* Mounting a rw partition read-only. */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + } else { + /* remount read-write */ + if (!(s->s_flags & MS_RDONLY)) { + reiserfs_xattr_init(s, *mount_flags); + goto out_ok; /* We are read-write already */ + } + + if (reiserfs_is_journal_aborted(journal)) { + err = journal->j_errno; + goto out_err; + } + + handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ + err = journal_begin(&th, s, 10); + if (err) + goto out_err; + + /* Mount a partition which is read-only, read-write */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + s->s_flags &= ~MS_RDONLY; + set_sb_umount_state(rs, REISERFS_ERROR_FS); + if (!old_format_only(s)) + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); + /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; + } + /* this will force a full flush of all journal lists */ + SB_JOURNAL(s)->j_must_wait = 1; + err = journal_end(&th, s, 10); + if (err) + goto out_err; + s->s_dirt = 0; + + if (!(*mount_flags & MS_RDONLY)) { + dquot_resume(s, -1); + finish_unfinished(s); + reiserfs_xattr_init(s, *mount_flags); + } + +out_ok: + replace_mount_options(s, new_opts); + reiserfs_write_unlock(s); + return 0; + +out_err: + kfree(new_opts); + reiserfs_write_unlock(s); + return err; +} + +static int read_super_block(struct super_block *s, int offset) +{ + struct buffer_head *bh; + struct reiserfs_super_block *rs; + int fs_blocksize; + + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2006", + "bread failed (dev %s, block %lu, size %lu)", + reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (!is_any_reiserfs_magic_string(rs)) { + brelse(bh); + return 1; + } + // + // ok, reiserfs signature (old or new) found in at the given offset + // + fs_blocksize = sb_blocksize(rs); + brelse(bh); + sb_set_blocksize(s, fs_blocksize); + + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2007", + "bread failed (dev %s, block %lu, size %lu)", + reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (sb_blocksize(rs) != s->s_blocksize) { + reiserfs_warning(s, "sh-2011", "can't find a reiserfs " + "filesystem on (dev %s, block %Lu, size %lu)", + reiserfs_bdevname(s), + (unsigned long long)bh->b_blocknr, 
+ s->s_blocksize); + brelse(bh); + return 1; + } + + if (rs->s_v1.s_root_block == cpu_to_le32(-1)) { + brelse(bh); + reiserfs_warning(s, "super-6519", "Unfinished reiserfsck " + "--rebuild-tree run detected. Please run\n" + "reiserfsck --rebuild-tree and wait for a " + "completion. If that fails\n" + "get newer reiserfsprogs package"); + return 1; + } + + SB_BUFFER_WITH_SB(s) = bh; + SB_DISK_SUPER_BLOCK(s) = rs; + + if (is_reiserfs_jr(rs)) { + /* magic is of non-standard journal filesystem, look at s_version to + find which format is in use */ + if (sb_version(rs) == REISERFS_VERSION_2) + reiserfs_info(s, "found reiserfs format \"3.6\"" + " with non-standard journal\n"); + else if (sb_version(rs) == REISERFS_VERSION_1) + reiserfs_info(s, "found reiserfs format \"3.5\"" + " with non-standard journal\n"); + else { + reiserfs_warning(s, "sh-2012", "found unknown " + "format \"%u\" of reiserfs with " + "non-standard magic", sb_version(rs)); + return 1; + } + } else + /* s_version of standard format may contain incorrect information, + so we just look at the magic string */ + reiserfs_info(s, + "found reiserfs format \"%s\" with standard journal\n", + is_reiserfs_3_5(rs) ? "3.5" : "3.6"); + + s->s_op = &reiserfs_sops; + s->s_export_op = &reiserfs_export_ops; +#ifdef CONFIG_QUOTA + s->s_qcop = &reiserfs_qctl_operations; + s->dq_op = &reiserfs_quota_operations; +#endif + + /* new format is limited by the 32 bit wide i_blocks field, want to + ** be one full block below that. + */ + s->s_maxbytes = (512LL << 32) - s->s_blocksize; + return 0; +} + +/* after journal replay, reread all bitmap and super blocks */ +static int reread_meta_blocks(struct super_block *s) +{ + ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); + reiserfs_write_unlock(s); + wait_on_buffer(SB_BUFFER_WITH_SB(s)); + reiserfs_write_lock(s); + if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + reiserfs_warning(s, "reiserfs-2504", "error reading the super"); + return 1; + } + + return 0; +} + +///////////////////////////////////////////////////// +// hash detection stuff + +// if root directory is empty - we set default - Yura's - hash and +// warn about it +// FIXME: we look for only one name in a directory. If tea and yura +// bith have the same value - we ask user to send report to the +// mailing list +static __u32 find_hash_out(struct super_block *s) +{ + int retval; + struct inode *inode; + struct cpu_key key; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + __u32 hash = DEFAULT_HASH; + + inode = s->s_root->d_inode; + + do { // Some serious "goto"-hater was there ;) + u32 teahash, r5hash, yurahash; + + make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); + retval = search_by_entry_key(s, &key, &path, &de); + if (retval == IO_ERROR) { + pathrelse(&path); + return UNSET_HASH; + } + if (retval == NAME_NOT_FOUND) + de.de_entry_num--; + set_de_name_and_namelen(&de); + if (deh_offset(&(de.de_deh[de.de_entry_num])) == DOT_DOT_OFFSET) { + /* allow override in this case */ + if (reiserfs_rupasov_hash(s)) { + hash = YURA_HASH; + } + reiserfs_info(s, "FS seems to be empty, autodetect " + "is using the default hash\n"); + break; + } + r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); + teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); + yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); + if (((teahash == r5hash) + && + (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) + == r5hash)) || ((teahash == yurahash) + && (yurahash == + GET_HASH_VALUE(deh_offset + (& + (de. + de_deh[de. 
+ de_entry_num]))))) + || ((r5hash == yurahash) + && (yurahash == + GET_HASH_VALUE(deh_offset + (&(de.de_deh[de.de_entry_num])))))) { + reiserfs_warning(s, "reiserfs-2506", "Unable to " + "automatically detect hash function. " + "Please mount with -o " + "hash={tea,rupasov,r5}"); + hash = UNSET_HASH; + break; + } + if (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) == + yurahash) + hash = YURA_HASH; + else if (GET_HASH_VALUE + (deh_offset(&(de.de_deh[de.de_entry_num]))) == teahash) + hash = TEA_HASH; + else if (GET_HASH_VALUE + (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) + hash = R5_HASH; + else { + reiserfs_warning(s, "reiserfs-2506", + "Unrecognised hash function"); + hash = UNSET_HASH; + } + } while (0); + + pathrelse(&path); + return hash; +} + +// finds out which hash names are sorted with +static int what_hash(struct super_block *s) +{ + __u32 code; + + code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); + + /* reiserfs_hash_detect() == true if any of the hash mount options + ** were used. We must check them to make sure the user isn't + ** using a bad hash value + */ + if (code == UNSET_HASH || reiserfs_hash_detect(s)) + code = find_hash_out(s); + + if (code != UNSET_HASH && reiserfs_hash_detect(s)) { + /* detection has found the hash, and we must check against the + ** mount options + */ + if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { + reiserfs_warning(s, "reiserfs-2507", + "Error, %s hash detected, " + "unable to force rupasov hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { + reiserfs_warning(s, "reiserfs-2508", + "Error, %s hash detected, " + "unable to force tea hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_r5_hash(s) && code != R5_HASH) { + reiserfs_warning(s, "reiserfs-2509", + "Error, %s hash detected, " + "unable to force r5 hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } + } else { + /* find_hash_out was not called or could not determine the hash */ + if (reiserfs_rupasov_hash(s)) { + code = YURA_HASH; + } else if (reiserfs_tea_hash(s)) { + code = TEA_HASH; + } else if (reiserfs_r5_hash(s)) { + code = R5_HASH; + } + } + + /* if we are mounted RW, and we have a new valid hash code, update + ** the super + */ + if (code != UNSET_HASH && + !(s->s_flags & MS_RDONLY) && + code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { + set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); + } + return code; +} + +// return pointer to appropriate function +static hashf_t hash_function(struct super_block *s) +{ + switch (what_hash(s)) { + case TEA_HASH: + reiserfs_info(s, "Using tea hash to sort names\n"); + return keyed_hash; + case YURA_HASH: + reiserfs_info(s, "Using rupasov hash to sort names\n"); + return yura_hash; + case R5_HASH: + reiserfs_info(s, "Using r5 hash to sort names\n"); + return r5_hash; + } + return NULL; +} + +// this is used to set up correct value for old partitions +static int function2code(hashf_t func) +{ + if (func == keyed_hash) + return TEA_HASH; + if (func == yura_hash) + return YURA_HASH; + if (func == r5_hash) + return R5_HASH; + + BUG(); // should never happen + + return 0; +} + +#define SWARN(silent, s, id, ...) 
\ + if (!(silent)) \ + reiserfs_warning(s, id, __VA_ARGS__) + +static int reiserfs_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *root_inode; + struct reiserfs_transaction_handle th; + int old_format = 0; + unsigned long blocks; + unsigned int commit_max_age = 0; + int jinit_done = 0; + struct reiserfs_iget_args args; + struct reiserfs_super_block *rs; + char *jdev_name; + struct reiserfs_sb_info *sbi; + int errval = -EINVAL; + char *qf_names[MAXQUOTAS] = {}; + unsigned int qfmt = 0; + + save_mount_options(s, data); + + sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + /* Set default values for options: non-aggressive tails, RO on errors */ + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO); + /* no preallocation minimum, be smart in + reiserfs_file_write instead */ + REISERFS_SB(s)->s_alloc_options.preallocmin = 0; + /* Preallocate by 16 blocks (17-1) at once */ + REISERFS_SB(s)->s_alloc_options.preallocsize = 17; + /* setup default block allocator options */ + reiserfs_init_alloc_options(s); + + mutex_init(&REISERFS_SB(s)->lock); + REISERFS_SB(s)->lock_depth = -1; + + /* + * This function is called with the bkl, which also was the old + * locking used here. + * do_journal_begin() will soon check if we hold the lock (ie: was the + * bkl). This is likely because do_journal_begin() has several another + * callers because at this time, it doesn't seem to be necessary to + * protect against anything. + * Anyway, let's be conservative and lock for now. + */ + reiserfs_write_lock(s); + + jdev_name = NULL; + if (reiserfs_parse_options + (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, + &commit_max_age, qf_names, &qfmt) == 0) { + goto error; + } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif + + if (blocks) { + SWARN(silent, s, "jmacd-7", "resize option for remount only"); + goto error; + } + + /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ + if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) + old_format = 1; + /* try new format (64-th 1k block), which can contain reiserfs super block */ + else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { + SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", + reiserfs_bdevname(s)); + goto error; + } + + rs = SB_DISK_SUPER_BLOCK(s); + /* Let's do basic sanity check to verify that underlying device is not + smaller than the filesystem. If the check fails then abort and scream, + because bad stuff will happen otherwise. 
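The check that follows compares the size the superblock claims (block count times block size) against the size of the backing device. A tiny stand-alone sketch of the same comparison, with made-up numbers and the arithmetic done explicitly in 64 bits so a large block count cannot overflow:

#include <stdio.h>
#include <stdint.h>

/* does the device have room for everything the superblock describes? */
static int device_big_enough(uint64_t device_bytes,
			     uint32_t fs_blocks, uint32_t block_size)
{
	uint64_t fs_bytes = (uint64_t)fs_blocks * block_size;

	return device_bytes >= fs_bytes;
}

int main(void)
{
	/* e.g. a 4 GiB filesystem (1M blocks of 4 KiB) on a 2 GiB device */
	if (!device_big_enough(2ULL << 30, 1 << 20, 4096))
		fprintf(stderr, "filesystem is bigger than the device, refusing\n");
	return 0;
}
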
*/ + if (s->s_bdev && s->s_bdev->bd_inode + && i_size_read(s->s_bdev->bd_inode) < + sb_block_count(rs) * sb_blocksize(rs)) { + SWARN(silent, s, "", "Filesystem cannot be " + "mounted because it is bigger than the device"); + SWARN(silent, s, "", "You may need to run fsck " + "or increase size of your LVM partition"); + SWARN(silent, s, "", "Or may be you forgot to " + "reboot after fdisk when it told you to"); + goto error; + } + + sbi->s_mount_state = SB_REISERFS_STATE(s); + sbi->s_mount_state = REISERFS_VALID_FS; + + if ((errval = reiserfs_init_bitmap_cache(s))) { + SWARN(silent, s, "jmacd-8", "unable to read bitmap"); + goto error; + } + errval = -EINVAL; +#ifdef CONFIG_REISERFS_CHECK + SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); + SWARN(silent, s, "", "- it is slow mode for debugging."); +#endif + + /* make data=ordered the default */ + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && + !reiserfs_data_writeback(s)) { + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); + } + + if (reiserfs_data_log(s)) { + reiserfs_info(s, "using journaled data mode\n"); + } else if (reiserfs_data_ordered(s)) { + reiserfs_info(s, "using ordered data mode\n"); + } else { + reiserfs_info(s, "using writeback data mode\n"); + } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } + // set_device_ro(s->s_dev, 1) ; + if (journal_init(s, jdev_name, old_format, commit_max_age)) { + SWARN(silent, s, "sh-2022", + "unable to initialize journal space"); + goto error; + } else { + jinit_done = 1; /* once this is set, journal_release must be called + ** if we error out of the mount + */ + } + if (reread_meta_blocks(s)) { + SWARN(silent, s, "jmacd-9", + "unable to reread meta blocks after journal init"); + goto error; + } + + if (replay_only(s)) + goto error; + + if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { + SWARN(silent, s, "clm-7000", + "Detected readonly device, marking FS readonly"); + s->s_flags |= MS_RDONLY; + } + args.objectid = REISERFS_ROOT_OBJECTID; + args.dirid = REISERFS_ROOT_PARENT_OBJECTID; + root_inode = + iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, + reiserfs_init_locked_inode, (void *)(&args)); + if (!root_inode) { + SWARN(silent, s, "jmacd-10", "get root inode failed"); + goto error; + } + + if (root_inode->i_state & I_NEW) { + reiserfs_read_locked_inode(root_inode, &args); + unlock_new_inode(root_inode); + } + + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) { + iput(root_inode); + goto error; + } + // define and initialize hash function + sbi->s_hash_function = hash_function(s); + if (sbi->s_hash_function == NULL) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + if (is_reiserfs_3_5(rs) + || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1)) + set_bit(REISERFS_3_5, &(sbi->s_properties)); + else if (old_format) + set_bit(REISERFS_OLD_FORMAT, &(sbi->s_properties)); + else + set_bit(REISERFS_3_6, &(sbi->s_properties)); + + if (!(s->s_flags & MS_RDONLY)) { + + errval = journal_begin(&th, s, 1); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + + set_sb_umount_state(rs, REISERFS_ERROR_FS); + set_sb_fs_state(rs, 0); + + /* Clear out s_bmap_nr if it would wrap. We can handle this + * case, but older revisions can't. This will cause the + * file system to fail mount on those older implementations, + * avoiding corruption. 
-jeffm */ + if (bmap_would_wrap(reiserfs_bmap_count(s)) && + sb_bmap_nr(rs) != 0) { + reiserfs_warning(s, "super-2030", "This file system " + "claims to use %u bitmap blocks in " + "its super block, but requires %u. " + "Clearing to zero.", sb_bmap_nr(rs), + reiserfs_bmap_count(s)); + + set_sb_bmap_nr(rs, 0); + } + + if (old_format_only(s)) { + /* filesystem of format 3.5 either with standard or non-standard + journal */ + if (convert_reiserfs(s)) { + /* and -o conv is given */ + if (!silent) + reiserfs_info(s, + "converting 3.5 filesystem to the 3.6 format"); + + if (is_reiserfs_3_5(rs)) + /* put magic string of 3.6 format. 2.2 will not be able to + mount this filesystem anymore */ + memcpy(rs->s_v1.s_magic, + reiserfs_3_6_magic_string, + sizeof + (reiserfs_3_6_magic_string)); + + set_sb_version(rs, REISERFS_VERSION_2); + reiserfs_convert_objectid_map_v1(s); + set_bit(REISERFS_3_6, &(sbi->s_properties)); + clear_bit(REISERFS_3_5, &(sbi->s_properties)); + } else if (!silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + } else + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); + + + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + errval = journal_end(&th, s, 1); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + /* look for files which were to be removed in previous session */ + finish_unfinished(s); + } else { + if (old_format_only(s) && !silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + } + // mark hash in super block: it could be unset. overwrite should be ok + set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); + + handle_attrs(s); + + reiserfs_proc_info_init(s); + + init_waitqueue_head(&(sbi->s_wait)); + spin_lock_init(&sbi->bitmap_lock); + + reiserfs_write_unlock(s); + + return (0); + +error: + if (jinit_done) { /* kill the commit thread, free journal ram */ + journal_release_error(NULL, s); + } + + reiserfs_write_unlock(s); + + reiserfs_free_bitmap_cache(s); + if (SB_BUFFER_WITH_SB(s)) + brelse(SB_BUFFER_WITH_SB(s)); +#ifdef CONFIG_QUOTA + { + int j; + for (j = 0; j < MAXQUOTAS; j++) + kfree(qf_names[j]); + } +#endif + kfree(sbi); + + s->s_fs_info = NULL; + return errval; +} + +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb); + + buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize)); + buf->f_bfree = sb_free_blocks(rs); + buf->f_bavail = buf->f_bfree; + buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; + buf->f_bsize = dentry->d_sb->s_blocksize; + /* changed to accommodate gcc folks. 
*/ + buf->f_type = REISERFS_SUPER_MAGIC; + buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2, + sizeof(rs->s_uuid)/2); + + return 0; +} + +#ifdef CONFIG_QUOTA +static int reiserfs_write_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + ret = dquot_commit(dquot); + err = + journal_end(&th, dquot->dq_sb, + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_acquire_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + ret = dquot_acquire(dquot); + err = + journal_end(&th, dquot->dq_sb, + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_release_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (ret) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + goto out; + } + ret = dquot_release(dquot); + err = + journal_end(&th, dquot->dq_sb, + REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? */ + if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return reiserfs_write_dquot(dquot); + } else + return dquot_mark_dquot_dirty(dquot); +} + +static int reiserfs_write_info(struct super_block *sb, int type) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + /* Data block + inode block */ + reiserfs_write_lock(sb); + ret = journal_begin(&th, sb, 2); + if (ret) + goto out; + ret = dquot_commit_info(sb, type); + err = journal_end(&th, sb, 2); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(sb); + return ret; +} + +/* + * Turn on quotas during mount time - we need to find the quota file and such... + */ +static int reiserfs_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], + REISERFS_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + struct inode *inode; + struct reiserfs_transaction_handle th; + + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->mnt->mnt_sb != sb) { + err = -EXDEV; + goto out; + } + inode = path->dentry->d_inode; + /* We must not pack tails for quota files on reiserfs for quota IO to work */ + if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { + err = reiserfs_unpack(inode, NULL); + if (err) { + reiserfs_warning(sb, "super-6520", + "Unpacking tail of quota file failed" + " (%d). 
Cannot turn on quotas.", err); + err = -EINVAL; + goto out; + } + mark_inode_dirty(inode); + } + /* Journaling quota? */ + if (REISERFS_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (path->dentry->d_parent != sb->s_root) + reiserfs_warning(sb, "super-6521", + "Quota file not on filesystem root. " + "Journalled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (reiserfs_file_data_log(inode)) { + /* Just start temporary transaction and finish it */ + err = journal_begin(&th, sb, 1); + if (err) + goto out; + err = journal_end_sync(&th, sb, 1); + if (err) + goto out; + } + err = dquot_quota_on(sb, type, format_id, path); +out: + return err; +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + size_t toread; + struct buffer_head tmp_bh, *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = + sb->s_blocksize - offset < + toread ? sb->s_blocksize - offset : toread; + tmp_bh.b_state = 0; + /* Quota files are without tails so we can safely use this function */ + reiserfs_write_lock(sb); + err = reiserfs_get_block(inode, blk, &tmp_bh, 0); + reiserfs_write_unlock(sb); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t reiserfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL; + size_t towrite = len; + struct buffer_head tmp_bh, *bh; + + if (!current->journal_info) { + printk(KERN_WARNING "reiserfs: Quota write (off=%Lu, len=%Lu)" + " cancelled because transaction is not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? 
+ sb->s_blocksize - offset : towrite; + tmp_bh.b_state = 0; + err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data + offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + reiserfs_prepare_for_journal(sb, bh, 1); + journal_mark_dirty(current->journal_info, sb, bh); + if (!journal_quota) + reiserfs_add_ordered_list(inode, bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off + len - towrite) + i_size_write(inode, off + len - towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + +static struct dentry *get_super_block(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); +} + +static int __init init_reiserfs_fs(void) +{ + int ret; + + if ((ret = init_inodecache())) { + return ret; + } + + reiserfs_proc_info_global_init(); + + ret = register_filesystem(&reiserfs_fs_type); + + if (ret == 0) { + return 0; + } + + reiserfs_proc_info_global_done(); + destroy_inodecache(); + + return ret; +} + +static void __exit exit_reiserfs_fs(void) +{ + reiserfs_proc_info_global_done(); + unregister_filesystem(&reiserfs_fs_type); + destroy_inodecache(); +} + +struct file_system_type reiserfs_fs_type = { + .owner = THIS_MODULE, + .name = "reiserfs", + .mount = get_super_block, + .kill_sb = reiserfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +MODULE_DESCRIPTION("ReiserFS journaled filesystem"); +MODULE_AUTHOR("Hans Reiser "); +MODULE_LICENSE("GPL"); + +module_init(init_reiserfs_fs); +module_exit(exit_reiserfs_fs); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c new file mode 100644 index 00000000..d7f6e51b --- /dev/null +++ b/fs/reiserfs/tail_conversion.c @@ -0,0 +1,280 @@ +/* + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details + */ + +#include +#include +#include +#include + +/* access to tail : when one is going to read tail it must make sure, that is not running. + direct2indirect and indirect2direct can not run concurrently */ + +/* Converts direct items to an unformatted node. Panics if file has no + tail. -ENOSPC if no disk space for conversion */ +/* path points to first direct item of the file regarless of how many of + them are there */ +int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, + struct treepath *path, struct buffer_head *unbh, + loff_t tail_offset) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *up_to_date_bh; + struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + unsigned long total_tail = 0; + struct cpu_key end_key; /* Key to search for the last byte of the + converted item. */ + struct item_head ind_ih; /* new indirect item to be inserted or + key of unfm pointer to be pasted */ + int blk_size, retval; /* returned value for reiserfs_insert_item and clones */ + unp_t unfm_ptr; /* Handle on an unformatted node + that will be inserted in the + tree. 
*/ + + BUG_ON(!th->t_trans_id); + + REISERFS_SB(sb)->s_direct2indirect++; + + blk_size = sb->s_blocksize; + + /* and key to search for append or insert pointer to the new + unformatted node. */ + copy_item_head(&ind_ih, p_le_ih); + set_le_ih_k_offset(&ind_ih, tail_offset); + set_le_ih_k_type(&ind_ih, TYPE_INDIRECT); + + /* Set the key to search for the place for new unfm pointer */ + make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4); + + /* FIXME: we could avoid this */ + if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { + reiserfs_error(sb, "PAP-14030", + "pasted or inserted byte exists in " + "the tree %K. Use fsck to repair.", &end_key); + pathrelse(path); + return -EIO; + } + + p_le_ih = PATH_PITEM_HEAD(path); + + unfm_ptr = cpu_to_le32(unbh->b_blocknr); + + if (is_statdata_le_ih(p_le_ih)) { + /* Insert new indirect item. */ + set_ih_free_space(&ind_ih, 0); /* delete at nearest future */ + put_ih_item_len(&ind_ih, UNFM_P_SIZE); + PATH_LAST_POSITION(path)++; + retval = + reiserfs_insert_item(th, path, &end_key, &ind_ih, inode, + (char *)&unfm_ptr); + } else { + /* Paste into last indirect item of an object. */ + retval = reiserfs_paste_into_item(th, path, &end_key, inode, + (char *)&unfm_ptr, + UNFM_P_SIZE); + } + if (retval) { + return retval; + } + // note: from here there are two keys which have matching first + // three key components. They only differ by the fourth one. + + /* Set the key to search for the direct items of the file */ + make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, + 4); + + /* Move bytes from the direct items to the new unformatted node + and delete them. */ + while (1) { + int tail_size; + + /* end_key.k_offset is set so, that we will always have found + last item of the file */ + if (search_for_position_by_key(sb, &end_key, path) == + POSITION_FOUND) + reiserfs_panic(sb, "PAP-14050", + "direct item (%K) not found", &end_key); + p_le_ih = PATH_PITEM_HEAD(path); + RFALSE(!is_direct_le_ih(p_le_ih), + "vs-14055: direct item expected(%K), found %h", + &end_key, p_le_ih); + tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1)) + + ih_item_len(p_le_ih) - 1; + + /* we only send the unbh pointer if the buffer is not up to date. + ** this avoids overwriting good data from writepage() with old data + ** from the disk or buffer cache + ** Special case: unbh->b_page will be NULL if we are coming through + ** DIRECT_IO handler here. + */ + if (!unbh->b_page || buffer_uptodate(unbh) + || PageUptodate(unbh->b_page)) { + up_to_date_bh = NULL; + } else { + up_to_date_bh = unbh; + } + retval = reiserfs_delete_item(th, path, &end_key, inode, + up_to_date_bh); + + total_tail += retval; + if (tail_size == retval) + // done: file does not have direct items anymore + break; + + } + /* if we've copied bytes from disk into the page, we need to zero + ** out the unused part of the block (it was not up to date before) + */ + if (up_to_date_bh) { + unsigned pgoff = + (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); + char *kaddr = kmap_atomic(up_to_date_bh->b_page, KM_USER0); + memset(kaddr + pgoff, 0, blk_size - total_tail); + kunmap_atomic(kaddr, KM_USER0); + } + + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + + return 0; +} + +/* stolen from fs/buffer.c */ +void reiserfs_unmap_buffer(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + BUG(); + } + clear_buffer_dirty(bh); + /* Remove the buffer from whatever list it belongs to. 
We are mostly + interested in removing it from per-sb j_dirty_buffers list, to avoid + BUG() on attempt to write not mapped buffer */ + if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { + struct inode *inode = bh->b_page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + spin_lock(&j->j_dirty_buffers_lock); + list_del_init(&bh->b_assoc_buffers); + reiserfs_free_jh(bh); + spin_unlock(&j->j_dirty_buffers_lock); + } + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + bh->b_bdev = NULL; + unlock_buffer(bh); +} + +/* this first locks inode (neither reads nor sync are permitted), + reads tail through page cache, insert direct item. When direct item + inserted successfully inode is left locked. Return value is always + what we expect from it (number of cut bytes). But when tail remains + in the unformatted node, we set mode to SKIP_BALANCING and unlock + inode */ +int indirect2direct(struct reiserfs_transaction_handle *th, + struct inode *inode, struct page *page, + struct treepath *path, /* path to the indirect item. */ + const struct cpu_key *item_key, /* Key to look for + * unformatted node + * pointer to be cut. */ + loff_t n_new_file_size, /* New file size. */ + char *mode) +{ + struct super_block *sb = inode->i_sb; + struct item_head s_ih; + unsigned long block_size = sb->s_blocksize; + char *tail; + int tail_len, round_tail_len; + loff_t pos, pos1; /* position of first byte of the tail */ + struct cpu_key key; + + BUG_ON(!th->t_trans_id); + + REISERFS_SB(sb)->s_indirect2direct++; + + *mode = M_SKIP_BALANCING; + + /* store item head path points to. */ + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + + tail_len = (n_new_file_size & (block_size - 1)); + if (get_inode_sd_version(inode) == STAT_DATA_V2) + round_tail_len = ROUND_UP(tail_len); + else + round_tail_len = tail_len; + + pos = + le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * sb->s_blocksize; + pos1 = pos; + + // we are protected by i_mutex. The tail can not disapper, not + // append can be done either + // we are in truncate or packing tail in file_release + + tail = (char *)kmap(page); /* this can schedule */ + + if (path_changed(&s_ih, path)) { + /* re-search indirect item */ + if (search_for_position_by_key(sb, item_key, path) + == POSITION_NOT_FOUND) + reiserfs_panic(sb, "PAP-5520", + "item to be converted %K does not exist", + item_key); + copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); +#ifdef CONFIG_REISERFS_CHECK + pos = le_ih_k_offset(&s_ih) - 1 + + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * sb->s_blocksize; + if (pos != pos1) + reiserfs_panic(sb, "vs-5530", "tail position " + "changed while we were reading it"); +#endif + } + + /* Set direct item header to insert. */ + make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode), + pos1 + 1, TYPE_DIRECT, round_tail_len, + 0xffff /*ih_free_space */ ); + + /* we want a pointer to the first byte of the tail in the page. + ** the page was locked and this part of the page was up to date when + ** indirect2direct was called, so we know the bytes are still valid + */ + tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); + + PATH_LAST_POSITION(path)++; + + key = *item_key; + set_cpu_key_k_type(&key, TYPE_DIRECT); + key.key_length = 4; + /* Insert tail as new direct item in the tree */ + if (reiserfs_insert_item(th, path, &key, &s_ih, inode, + tail ? tail : NULL) < 0) { + /* No disk memory. So we can not convert last unformatted node + to the direct item. 
In this case we used to adjust + indirect items's ih_free_space. Now ih_free_space is not + used, it would be ideal to write zeros to corresponding + unformatted node. For now i_size is considered as guard for + going out of file size */ + kunmap(page); + return block_size - round_tail_len; + } + kunmap(page); + + /* make sure to get the i_blocks changes from reiserfs_insert_item */ + reiserfs_update_sd(th, inode); + + // note: we have now the same as in above direct2indirect + // conversion: there are two keys which have matching first three + // key components. They only differ by the fouhth one. + + /* We have inserted new direct item and must remove last + unformatted node. */ + *mode = M_CUT; + + /* we store position of first direct item in the in-core inode */ + /* mark_file_with_tail (inode, pos1 + 1); */ + REISERFS_I(inode)->i_first_direct_byte = pos1 + 1; + + return block_size - round_tail_len; +} diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c new file mode 100644 index 00000000..d7808969 --- /dev/null +++ b/fs/reiserfs/xattr.c @@ -0,0 +1,1051 @@ +/* + * linux/fs/reiserfs/xattr.c + * + * Copyright (c) 2002 by Jeff Mahoney, + * + */ + +/* + * In order to implement EA/ACLs in a clean, backwards compatible manner, + * they are implemented as files in a "private" directory. + * Each EA is in it's own file, with the directory layout like so (/ is assumed + * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory, + * directories named using the capital-hex form of the objectid and + * generation number are used. Inside each directory are individual files + * named with the name of the extended attribute. + * + * So, for objectid 12648430, we could have: + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default + * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type + * .. or similar. + * + * The file contents are the text of the EA. The size is known based on the + * stat data describing the file. + * + * In the case of system.posix_acl_access and system.posix_acl_default, since + * these are special cases for filesystem ACLs, they are interpreted by the + * kernel, in addition, they are negatively and positively cached and attached + * to the inode so that unnecessary lookups are avoided. + * + * Locking works like so: + * Directory components (xattr root, xattr dir) are protectd by their i_mutex. + * The xattrs themselves are protected by the xattr_sem. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PRIVROOT_NAME ".reiserfs_priv" +#define XAROOT_NAME "xattrs" + + +/* Helpers for inode ops. We do this so that we don't have all the VFS + * overhead and also for proper i_mutex annotation. + * dir->i_mutex must be held for all of them. */ +#ifdef CONFIG_REISERFS_FS_XATTR +static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) +{ + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + return dir->i_op->create(dir, dentry, mode, NULL); +} +#endif + +static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + return dir->i_op->mkdir(dir, dentry, mode); +} + +/* We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr + * mutation ops aren't called during rename or splace, which are the + * only other users of I_MUTEX_CHILD. 
It violates the ordering, but that's + * better than allocating another subclass just for this code. */ +static int xattr_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); + error = dir->i_op->unlink(dir, dentry); + mutex_unlock(&dentry->d_inode->i_mutex); + + if (!error) + d_delete(dentry); + return error; +} + +static int xattr_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); + error = dir->i_op->rmdir(dir, dentry); + if (!error) + dentry->d_inode->i_flags |= S_DEAD; + mutex_unlock(&dentry->d_inode->i_mutex); + if (!error) + d_delete(dentry); + + return error; +} + +#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) + +static struct dentry *open_xa_root(struct super_block *sb, int flags) +{ + struct dentry *privroot = REISERFS_SB(sb)->priv_root; + struct dentry *xaroot; + if (!privroot->d_inode) + return ERR_PTR(-ENODATA); + + mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR); + + xaroot = dget(REISERFS_SB(sb)->xattr_root); + if (!xaroot) + xaroot = ERR_PTR(-ENODATA); + else if (!xaroot->d_inode) { + int err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_mkdir(privroot->d_inode, xaroot, 0700); + if (err) { + dput(xaroot); + xaroot = ERR_PTR(err); + } + } + + mutex_unlock(&privroot->d_inode->i_mutex); + return xaroot; +} + +static struct dentry *open_xa_dir(const struct inode *inode, int flags) +{ + struct dentry *xaroot, *xadir; + char namebuf[17]; + + xaroot = open_xa_root(inode->i_sb, flags); + if (IS_ERR(xaroot)) + return xaroot; + + snprintf(namebuf, sizeof(namebuf), "%X.%X", + le32_to_cpu(INODE_PKEY(inode)->k_objectid), + inode->i_generation); + + mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR); + + xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); + if (!IS_ERR(xadir) && !xadir->d_inode) { + int err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_mkdir(xaroot->d_inode, xadir, 0700); + if (err) { + dput(xadir); + xadir = ERR_PTR(err); + } + } + + mutex_unlock(&xaroot->d_inode->i_mutex); + dput(xaroot); + return xadir; +} + +/* The following are side effects of other operations that aren't explicitly + * modifying extended attributes. This includes operations such as permissions + * or ownership changes, object deletions, etc. */ +struct reiserfs_dentry_buf { + struct dentry *xadir; + int count; + struct dentry *dentries[8]; +}; + +static int +fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct reiserfs_dentry_buf *dbuf = buf; + struct dentry *dentry; + WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); + + if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) + return -ENOSPC; + + if (name[0] == '.' && (name[1] == '\0' || + (name[1] == '.' && name[2] == '\0'))) + return 0; + + dentry = lookup_one_len(name, dbuf->xadir, namelen); + if (IS_ERR(dentry)) { + return PTR_ERR(dentry); + } else if (!dentry->d_inode) { + /* A directory entry exists, but no file? 
*/ + reiserfs_error(dentry->d_sb, "xattr-20003", + "Corrupted directory: xattr %s listed but " + "not found for file %s.\n", + dentry->d_name.name, dbuf->xadir->d_name.name); + dput(dentry); + return -EIO; + } + + dbuf->dentries[dbuf->count++] = dentry; + return 0; +} + +static void +cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) +{ + int i; + for (i = 0; i < buf->count; i++) + if (buf->dentries[i]) + dput(buf->dentries[i]); +} + +static int reiserfs_for_each_xattr(struct inode *inode, + int (*action)(struct dentry *, void *), + void *data) +{ + struct dentry *dir; + int i, err = 0; + loff_t pos = 0; + struct reiserfs_dentry_buf buf = { + .count = 0, + }; + + /* Skip out, an xattr has no xattrs associated with it */ + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) + return 0; + + reiserfs_write_unlock(inode->i_sb); + dir = open_xa_dir(inode, XATTR_REPLACE); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + reiserfs_write_lock(inode->i_sb); + goto out; + } else if (!dir->d_inode) { + err = 0; + reiserfs_write_lock(inode->i_sb); + goto out_dir; + } + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + + reiserfs_write_lock(inode->i_sb); + + buf.xadir = dir; + err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); + while ((err == 0 || err == -ENOSPC) && buf.count) { + err = 0; + + for (i = 0; i < buf.count && buf.dentries[i]; i++) { + int lerr = 0; + struct dentry *dentry = buf.dentries[i]; + + if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode)) + lerr = action(dentry, data); + + dput(dentry); + buf.dentries[i] = NULL; + err = lerr ?: err; + } + buf.count = 0; + if (!err) + err = reiserfs_readdir_dentry(dir, &buf, + fill_with_dentries, &pos); + } + mutex_unlock(&dir->d_inode->i_mutex); + + /* Clean up after a failed readdir */ + cleanup_dentry_buf(&buf); + + if (!err) { + /* We start a transaction here to avoid a ABBA situation + * between the xattr root's i_mutex and the journal lock. + * This doesn't incur much additional overhead since the + * new transaction will just nest inside the + * outer transaction. */ + int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; + err = journal_begin(&th, inode->i_sb, blocks); + if (!err) { + int jerror; + reiserfs_mutex_lock_nested_safe( + &dir->d_parent->d_inode->i_mutex, + I_MUTEX_XATTR, inode->i_sb); + err = action(dir, data); + jerror = journal_end(&th, inode->i_sb, blocks); + mutex_unlock(&dir->d_parent->d_inode->i_mutex); + err = jerror ?: err; + } + } +out_dir: + dput(dir); +out: + /* -ENODATA isn't an error */ + if (err == -ENODATA) + err = 0; + return err; +} + +static int delete_one_xattr(struct dentry *dentry, void *data) +{ + struct inode *dir = dentry->d_parent->d_inode; + + /* This is the xattr dir, handle specially. */ + if (S_ISDIR(dentry->d_inode->i_mode)) + return xattr_rmdir(dir, dentry); + + return xattr_unlink(dir, dentry); +} + +static int chown_one_xattr(struct dentry *dentry, void *data) +{ + struct iattr *attrs = data; + return reiserfs_setattr(dentry, attrs); +} + +/* No i_mutex, but the inode is unconnected. 
*/ +int reiserfs_delete_xattrs(struct inode *inode) +{ + int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); + if (err) + reiserfs_warning(inode->i_sb, "jdm-20004", + "Couldn't delete all xattrs (%d)\n", err); + return err; +} + +/* inode->i_mutex: down */ +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) +{ + int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); + if (err) + reiserfs_warning(inode->i_sb, "jdm-20007", + "Couldn't chown all xattrs (%d)\n", err); + return err; +} + +#ifdef CONFIG_REISERFS_FS_XATTR +/* Returns a dentry corresponding to a specific extended attribute file + * for the inode. If flags allow, the file is created. Otherwise, a + * valid or negative dentry, or an error is returned. */ +static struct dentry *xattr_lookup(struct inode *inode, const char *name, + int flags) +{ + struct dentry *xadir, *xafile; + int err = 0; + + xadir = open_xa_dir(inode, flags); + if (IS_ERR(xadir)) + return ERR_CAST(xadir); + + mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + xafile = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(xafile)) { + err = PTR_ERR(xafile); + goto out; + } + + if (xafile->d_inode && (flags & XATTR_CREATE)) + err = -EEXIST; + + if (!xafile->d_inode) { + err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_create(xadir->d_inode, xafile, + 0700|S_IFREG); + } + + if (err) + dput(xafile); +out: + mutex_unlock(&xadir->d_inode->i_mutex); + dput(xadir); + if (err) + return ERR_PTR(err); + return xafile; +} + +/* Internal operations on file data */ +static inline void reiserfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static struct page *reiserfs_get_page(struct inode *dir, size_t n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page; + /* We can deadlock if we try to free dentries, + and an unlink/rmdir has just occurred - GFP_NOFS avoids this */ + mapping_set_gfp_mask(mapping, GFP_NOFS); + page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (PageError(page)) + goto fail; + } + return page; + + fail: + reiserfs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline __u32 xattr_hash(const char *msg, int len) +{ + return csum_partial(msg, len, 0); +} + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); + +static void update_ctime(struct inode *inode) +{ + struct timespec now = current_fs_time(inode->i_sb); + if (inode_unhashed(inode) || !inode->i_nlink || + timespec_equal(&inode->i_ctime, &now)) + return; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +static int lookup_and_delete_xattr(struct inode *inode, const char *name) +{ + int err = 0; + struct dentry *dentry, *xadir; + + xadir = open_xa_dir(inode, XATTR_REPLACE); + if (IS_ERR(xadir)) + return PTR_ERR(xadir); + + mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + dentry = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_dput; + } + + if (dentry->d_inode) { + reiserfs_write_lock(inode->i_sb); + err = xattr_unlink(xadir->d_inode, dentry); + reiserfs_write_unlock(inode->i_sb); + update_ctime(inode); + } + + dput(dentry); +out_dput: + mutex_unlock(&xadir->d_inode->i_mutex); + dput(xadir); + return err; +} + + +/* Generic extended attribute operations that can be used by xa plugins */ + +/* + * inode->i_mutex: down + */ +int +reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, + 
struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) +{ + int err = 0; + struct dentry *dentry; + struct page *page; + char *data; + size_t file_pos = 0; + size_t buffer_pos = 0; + size_t new_size; + __u32 xahash = 0; + + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_write_unlock(inode->i_sb); + + if (!buffer) { + err = lookup_and_delete_xattr(inode, name); + reiserfs_write_lock(inode->i_sb); + return err; + } + + dentry = xattr_lookup(inode, name, flags); + if (IS_ERR(dentry)) { + reiserfs_write_lock(inode->i_sb); + return PTR_ERR(dentry); + } + + down_write(&REISERFS_I(inode)->i_xattr_sem); + + reiserfs_write_lock(inode->i_sb); + + xahash = xattr_hash(buffer, buffer_size); + while (buffer_pos < buffer_size || buffer_pos == 0) { + size_t chunk; + size_t skip = 0; + size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = buffer_size - buffer_pos; + + page = reiserfs_get_page(dentry->d_inode, file_pos); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + lock_page(page); + data = page_address(page); + + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh; + skip = file_pos = sizeof(struct reiserfs_xattr_header); + if (chunk + skip > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE - skip; + rxh = (struct reiserfs_xattr_header *)data; + rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC); + rxh->h_hash = cpu_to_le32(xahash); + } + + err = __reiserfs_write_begin(page, page_offset, chunk + skip); + if (!err) { + if (buffer) + memcpy(data + skip, buffer + buffer_pos, chunk); + err = reiserfs_commit_write(NULL, page, page_offset, + page_offset + chunk + + skip); + } + unlock_page(page); + reiserfs_put_page(page); + buffer_pos += chunk; + file_pos += chunk; + skip = 0; + if (err || buffer_size == 0 || !buffer) + break; + } + + new_size = buffer_size + sizeof(struct reiserfs_xattr_header); + if (!err && new_size < i_size_read(dentry->d_inode)) { + struct iattr newattrs = { + .ia_ctime = current_fs_time(inode->i_sb), + .ia_size = new_size, + .ia_valid = ATTR_SIZE | ATTR_CTIME, + }; + + reiserfs_write_unlock(inode->i_sb); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); + down_write(&dentry->d_inode->i_alloc_sem); + reiserfs_write_lock(inode->i_sb); + + err = reiserfs_setattr(dentry, &newattrs); + up_write(&dentry->d_inode->i_alloc_sem); + mutex_unlock(&dentry->d_inode->i_mutex); + } else + update_ctime(inode); +out_unlock: + up_write(&REISERFS_I(inode)->i_xattr_sem); + dput(dentry); + return err; +} + +/* We need to start a transaction to maintain lock ordering */ +int reiserfs_xattr_set(struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) +{ + + struct reiserfs_transaction_handle th; + int error, error2; + size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size); + + if (!(flags & XATTR_REPLACE)) + jbegin_count += reiserfs_xattr_jcreate_nblocks(inode); + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jbegin_count); + if (error) { + reiserfs_write_unlock(inode->i_sb); + return error; + } + + error = reiserfs_xattr_set_handle(&th, inode, name, + buffer, buffer_size, flags); + + error2 = journal_end(&th, inode->i_sb, jbegin_count); + if (error == 0) + error = error2; + reiserfs_write_unlock(inode->i_sb); + + return error; +} + +/* + * inode->i_mutex: down + */ +int +reiserfs_xattr_get(struct inode *inode, const char *name, 
void *buffer, + size_t buffer_size) +{ + ssize_t err = 0; + struct dentry *dentry; + size_t isize; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct page *page; + __u32 hash = 0; + + if (name == NULL) + return -EINVAL; + + /* We can't have xattrs attached to v1 items since they don't have + * generation numbers */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + dentry = xattr_lookup(inode, name, XATTR_REPLACE); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + + down_read(&REISERFS_I(inode)->i_xattr_sem); + + isize = i_size_read(dentry->d_inode); + + /* Just return the size needed */ + if (buffer == NULL) { + err = isize - sizeof(struct reiserfs_xattr_header); + goto out_unlock; + } + + if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) { + err = -ERANGE; + goto out_unlock; + } + + while (file_pos < isize) { + size_t chunk; + char *data; + size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = isize - file_pos; + + page = reiserfs_get_page(dentry->d_inode, file_pos); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + lock_page(page); + data = page_address(page); + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh = + (struct reiserfs_xattr_header *)data; + skip = file_pos = sizeof(struct reiserfs_xattr_header); + chunk -= skip; + /* Magic doesn't match up.. */ + if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { + unlock_page(page); + reiserfs_put_page(page); + reiserfs_warning(inode->i_sb, "jdm-20001", + "Invalid magic for xattr (%s) " + "associated with %k", name, + INODE_PKEY(inode)); + err = -EIO; + goto out_unlock; + } + hash = le32_to_cpu(rxh->h_hash); + } + memcpy(buffer + buffer_pos, data + skip, chunk); + unlock_page(page); + reiserfs_put_page(page); + file_pos += chunk; + buffer_pos += chunk; + skip = 0; + } + err = isize - sizeof(struct reiserfs_xattr_header); + + if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != + hash) { + reiserfs_warning(inode->i_sb, "jdm-20002", + "Invalid hash for xattr (%s) associated " + "with %k", name, INODE_PKEY(inode)); + err = -EIO; + } + +out_unlock: + up_read(&REISERFS_I(inode)->i_xattr_sem); + dput(dentry); + +out: + return err; +} + +/* + * In order to implement different sets of xattr operations for each xattr + * prefix with the generic xattr API, a filesystem should create a + * null-terminated array of struct xattr_handler (one for each prefix) and + * hang a pointer to it off of the s_xattr field of the superblock. + * + * The generic_fooxattr() functions will use this list to dispatch xattr + * operations to the correct xattr_handler. 
+ */ +#define for_each_xattr_handler(handlers, handler) \ + for ((handler) = *(handlers)++; \ + (handler) != NULL; \ + (handler) = *(handlers)++) + +/* This is the implementation for the xattr plugin infrastructure */ +static inline const struct xattr_handler * +find_xattr_handler_prefix(const struct xattr_handler **handlers, + const char *name) +{ + const struct xattr_handler *xah; + + if (!handlers) + return NULL; + + for_each_xattr_handler(handlers, xah) { + if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0) + break; + } + + return xah; +} + + +/* + * Inode operation getxattr() + */ +ssize_t +reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, + size_t size) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->get(dentry, name, buffer, size, handler->flags); +} + +/* + * Inode operation setxattr() + * + * dentry->d_inode->i_mutex down + */ +int +reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->set(dentry, name, value, size, flags, handler->flags); +} + +/* + * Inode operation removexattr() + * + * dentry->d_inode->i_mutex down + */ +int reiserfs_removexattr(struct dentry *dentry, const char *name) +{ + const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); +} + +struct listxattr_buf { + size_t size; + size_t pos; + char *buf; + struct dentry *dentry; +}; + +static int listxattr_filler(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct listxattr_buf *b = (struct listxattr_buf *)buf; + size_t size; + if (name[0] != '.' || + (namelen != 1 && (name[1] != '.' || namelen != 2))) { + const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, + name); + if (!handler) /* Unsupported xattr name */ + return 0; + if (b->buf) { + size = handler->list(b->dentry, b->buf + b->pos, + b->size, name, namelen, + handler->flags); + if (size > b->size) + return -ERANGE; + } else { + size = handler->list(b->dentry, NULL, 0, name, + namelen, handler->flags); + } + + b->pos += size; + } + return 0; +} + +/* + * Inode operation listxattr() + * + * We totally ignore the generic listxattr here because it would be stupid + * not to. Since the xattrs are organized in a directory, we can just + * readdir to find them. + */ +ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) +{ + struct dentry *dir; + int err = 0; + loff_t pos = 0; + struct listxattr_buf buf = { + .dentry = dentry, + .buf = buffer, + .size = buffer ? 
size : 0, + }; + + if (!dentry->d_inode) + return -EINVAL; + + if (!dentry->d_sb->s_xattr || + get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + if (err == -ENODATA) + err = 0; /* Not an error if there aren't any xattrs */ + goto out; + } + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos); + mutex_unlock(&dir->d_inode->i_mutex); + + if (!err) + err = buf.pos; + + dput(dir); +out: + return err; +} + +static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct posix_acl *acl; + int error = -EAGAIN; /* do regular unix permission checks by default */ + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + + if (acl) { + if (!IS_ERR(acl)) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } else if (PTR_ERR(acl) != -ENODATA) + error = PTR_ERR(acl); + } + + return error; +} + +static int create_privroot(struct dentry *dentry) +{ + int err; + struct inode *inode = dentry->d_parent->d_inode; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + + err = xattr_mkdir(inode, dentry, 0700); + if (err || !dentry->d_inode) { + reiserfs_warning(dentry->d_sb, "jdm-20006", + "xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. " + "Failing mount."); + return -EOPNOTSUPP; + } + + dentry->d_inode->i_flags |= S_PRIVATE; + reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " + "storage.\n", PRIVROOT_NAME); + + return 0; +} + +#else +int __init reiserfs_xattr_register_handlers(void) { return 0; } +void reiserfs_xattr_unregister_handlers(void) {} +static int create_privroot(struct dentry *dentry) { return 0; } +#endif + +/* Actual operations that are exported to VFS-land */ +const struct xattr_handler *reiserfs_xattr_handlers[] = { +#ifdef CONFIG_REISERFS_FS_XATTR + &reiserfs_xattr_user_handler, + &reiserfs_xattr_trusted_handler, +#endif +#ifdef CONFIG_REISERFS_FS_SECURITY + &reiserfs_xattr_security_handler, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + &reiserfs_posix_acl_access_handler, + &reiserfs_posix_acl_default_handler, +#endif + NULL +}; + +static int xattr_mount_check(struct super_block *s) +{ + /* We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. */ + if (old_format_only(s)) { + if (reiserfs_xattrs_optional(s)) { + /* Old format filesystem, but optional xattrs have + * been enabled. Error out. */ + reiserfs_warning(s, "jdm-2005", + "xattrs/ACLs not supported " + "on pre-v3.6 format filesystems. " + "Failing mount."); + return -EOPNOTSUPP; + } + } + + return 0; +} + +int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + /* + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. + */ + if (IS_PRIVATE(inode)) + return 0; + +#ifdef CONFIG_REISERFS_FS_XATTR + /* + * Stat data v1 doesn't support ACLs. 
+ */ + if (get_inode_sd_version(inode) != STAT_DATA_V1) + return generic_permission(inode, mask, flags, + reiserfs_check_acl); +#endif + return generic_permission(inode, mask, flags, NULL); +} + +static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + return -EPERM; +} + +static const struct dentry_operations xattr_lookup_poison_ops = { + .d_revalidate = xattr_hide_revalidate, +}; + +int reiserfs_lookup_privroot(struct super_block *s) +{ + struct dentry *dentry; + int err = 0; + + /* If we don't have the privroot located yet - go find it */ + reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); + dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, + strlen(PRIVROOT_NAME)); + if (!IS_ERR(dentry)) { + REISERFS_SB(s)->priv_root = dentry; + d_set_d_op(dentry, &xattr_lookup_poison_ops); + if (dentry->d_inode) + dentry->d_inode->i_flags |= S_PRIVATE; + } else + err = PTR_ERR(dentry); + mutex_unlock(&s->s_root->d_inode->i_mutex); + + return err; +} + +/* We need to take a copy of the mount flags since things like + * MS_RDONLY don't get set until *after* we're called. + * mount_flags != mount_options */ +int reiserfs_xattr_init(struct super_block *s, int mount_flags) +{ + int err = 0; + struct dentry *privroot = REISERFS_SB(s)->priv_root; + + err = xattr_mount_check(s); + if (err) + goto error; + + if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { + reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); + err = create_privroot(REISERFS_SB(s)->priv_root); + mutex_unlock(&s->s_root->d_inode->i_mutex); + } + + if (privroot->d_inode) { + s->s_xattr = reiserfs_xattr_handlers; + reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s); + if (!REISERFS_SB(s)->xattr_root) { + struct dentry *dentry; + dentry = lookup_one_len(XAROOT_NAME, privroot, + strlen(XAROOT_NAME)); + if (!IS_ERR(dentry)) + REISERFS_SB(s)->xattr_root = dentry; + else + err = PTR_ERR(dentry); + } + mutex_unlock(&privroot->d_inode->i_mutex); + } + +error: + if (err) { + clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ + if (reiserfs_posixacl(s)) + s->s_flags |= MS_POSIXACL; + else + s->s_flags &= ~MS_POSIXACL; + + return err; +} diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c new file mode 100644 index 00000000..3dc38f12 --- /dev/null +++ b/fs/reiserfs/xattr_acl.c @@ -0,0 +1,531 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, + struct inode *inode, int type, + struct posix_acl *acl); + +static int +posix_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int error, error2; + struct reiserfs_transaction_handle th; + size_t jcreate_blocks; + if (!reiserfs_posixacl(inode->i_sb)) + return -EOPNOTSUPP; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) { + return PTR_ERR(acl); + } else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + + /* Pessimism: We can't assume that anything from the xattr root up + * has been created. 
*/ + + jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, size) * 2; + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jcreate_blocks); + if (error == 0) { + error = reiserfs_set_acl(&th, inode, type, acl); + error2 = journal_end(&th, inode->i_sb, jcreate_blocks); + if (error2) + error = error2; + } + reiserfs_write_unlock(inode->i_sb); + + release_and_out: + posix_acl_release(acl); + return error; +} + +static int +posix_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (!reiserfs_posixacl(dentry->d_sb)) + return -EOPNOTSUPP; + + acl = reiserfs_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(reiserfs_acl_header)) + return ERR_PTR(-EINVAL); + if (((reiserfs_acl_header *) value)->a_version != + cpu_to_le32(REISERFS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(reiserfs_acl_header); + count = reiserfs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value; + if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(reiserfs_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + + fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
+ */ +static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) +{ + reiserfs_acl_header *ext_acl; + char *e; + int n; + + *size = reiserfs_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(reiserfs_acl_header) + + acl->a_count * + sizeof(reiserfs_acl_entry), + GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); + e = (char *)ext_acl + sizeof(reiserfs_acl_header); + for (n = 0; n < acl->a_count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(reiserfs_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(reiserfs_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + + fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_mutex: down + * BKL held [before 2.5.x] + */ +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) +{ + char *name, *value; + struct posix_acl *acl; + int size; + int retval; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = reiserfs_xattr_get(inode, name, NULL, 0); + if (size < 0) { + if (size == -ENODATA || size == -ENOSYS) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return ERR_PTR(size); + } + + value = kmalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + + retval = reiserfs_xattr_get(inode, name, value, size); + if (retval == -ENODATA || retval == -ENOSYS) { + /* This shouldn't actually happen as it should have + been caught above.. but just in case */ + acl = NULL; + } else if (retval < 0) { + acl = ERR_PTR(retval); + } else { + acl = posix_acl_from_disk(value, retval); + } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + kfree(value); + return acl; +} + +/* + * Inode operation set_posix_acl(). + * + * inode->i_mutex: down + * BKL held [before 2.5.x] + */ +static int +reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, + int type, struct posix_acl *acl) +{ + char *name; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + if (error == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = posix_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0); + + /* + * Ensure that the inode gets dirtied if we're only using + * the mode bits and an old ACL didn't exist. We don't need + * to check if the inode is hashed here since we won't get + * called by reiserfs_inherit_default_acl(). 
+ */ + if (error == -ENODATA) { + error = 0; + if (type == ACL_TYPE_ACCESS) { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + } + + kfree(value); + + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +/* dir->i_mutex: locked, + * inode is new and not released into the wild yet */ +int +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, + struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct posix_acl *acl; + int err = 0; + + /* ACLs only get applied to files and directories */ + if (S_ISLNK(inode->i_mode)) + return 0; + + /* ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from */ + if (get_inode_sd_version(dir) == STAT_DATA_V1) + goto apply_umask; + + /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This + * would be useless since permissions are ignored, and a pain because + * it introduces locking cycles */ + if (IS_PRIVATE(dir)) { + inode->i_flags |= S_PRIVATE; + goto apply_umask; + } + + acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + struct posix_acl *acl_copy; + mode_t mode = inode->i_mode; + int need_acl; + + /* Copy the default ACL to the default ACL of a new directory */ + if (S_ISDIR(inode->i_mode)) { + err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, + acl); + if (err) + goto cleanup; + } + + /* Now we reconcile the new ACL and the mode, + potentially modifying both */ + acl_copy = posix_acl_clone(acl, GFP_NOFS); + if (!acl_copy) { + err = -ENOMEM; + goto cleanup; + } + + need_acl = posix_acl_create_masq(acl_copy, &mode); + if (need_acl >= 0) { + if (mode != inode->i_mode) { + inode->i_mode = mode; + } + + /* If we need an ACL.. */ + if (need_acl > 0) { + err = reiserfs_set_acl(th, inode, + ACL_TYPE_ACCESS, + acl_copy); + if (err) + goto cleanup_copy; + } + } + cleanup_copy: + posix_acl_release(acl_copy); + cleanup: + posix_acl_release(acl); + } else { + apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current_umask(); + } + + return err; +} + +/* This is used to cache the default acl before a new object is created. + * The biggest reason for this is to get an idea of how many blocks will + * actually be required for the create operation if we must inherit an ACL. + * An ACL write can add up to 3 object creations and an additional file write + * so we'd prefer not to reserve that many blocks in the journal if we can. + * It also has the advantage of not loading the ACL with a transaction open, + * this may seem silly, but if the owner of the directory is doing the + * creation, the ACL may not be loaded since the permissions wouldn't require + * it. + * We return the number of blocks required for the transaction. + */ +int reiserfs_cache_default_acl(struct inode *inode) +{ + struct posix_acl *acl; + int nblocks = 0; + + if (IS_PRIVATE(inode)) + return 0; + + acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); + + if (acl && !IS_ERR(acl)) { + int size = reiserfs_acl_size(acl->a_count); + + /* Other xattrs can be created during inode creation. We don't + * want to claim too many blocks, so we check to see if we + * we need to create the tree to the xattrs, and then we + * just want two files. 
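Seen from user space, the inheritance implemented above is the usual POSIX default-ACL behaviour: a default ACL set on a directory becomes the access ACL of files created inside it, masked against the creation mode. A small sketch using the libacl API, which is an assumption here (sys/acl.h, link with -lacl) rather than anything this patch provides, makes that easy to observe:

#include <stdio.h>
#include <sys/acl.h>

/* Print one ACL of the given type as text, or the error if it can't be read. */
static void dump(const char *path, acl_type_t type, const char *label)
{
    acl_t acl = acl_get_file(path, type);
    char *text;

    if (!acl) {
        perror(label);
        return;
    }
    text = acl_to_text(acl, NULL);
    printf("%s:\n%s\n", label, text ? text : "(none)");
    acl_free(text);
    acl_free(acl);
}

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <dir> <file-created-in-dir>\n", argv[0]);
        return 1;
    }
    dump(argv[1], ACL_TYPE_DEFAULT, "directory default ACL");
    dump(argv[2], ACL_TYPE_ACCESS, "new file access ACL");
    return 0;
}

On a filesystem mounted with ACL support, the second dump should reflect the first, which is exactly the work reiserfs_inherit_default_acl() does at create time.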
*/ + nblocks = reiserfs_xattr_jcreate_nblocks(inode); + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* We need to account for writes + bitmaps for two files */ + nblocks += reiserfs_xattr_nblocks(inode, size) * 4; + posix_acl_release(acl); + } + + return nblocks; +} + +int reiserfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (get_inode_sd_version(inode) == STAT_DATA_V1 || + !reiserfs_posixacl(inode->i_sb)) { + return 0; + } + + reiserfs_write_unlock(inode->i_sb); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_write_lock(inode->i_sb); + if (!acl) + return 0; + if (IS_ERR(acl)) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_NOFS); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + struct reiserfs_transaction_handle th; + size_t size = reiserfs_xattr_nblocks(inode, + reiserfs_acl_size(clone->a_count)); + int depth; + + depth = reiserfs_write_lock_once(inode->i_sb); + error = journal_begin(&th, inode->i_sb, size * 2); + if (!error) { + int error2; + error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, + clone); + error2 = journal_end(&th, inode->i_sb, size * 2); + if (error2) + error = error2; + } + reiserfs_write_unlock_once(inode->i_sb, depth); + } + posix_acl_release(clone); + return error; +} + +static size_t posix_acl_access_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + if (!reiserfs_posixacl(dentry->d_sb)) + return 0; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +const struct xattr_handler reiserfs_posix_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = posix_acl_get, + .set = posix_acl_set, + .list = posix_acl_access_list, +}; + +static size_t posix_acl_default_list(struct dentry *dentry, char *list, + size_t list_size, const char *name, + size_t name_len, int type) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + if (!reiserfs_posixacl(dentry->d_sb)) + return 0; + if (list && size <= list_size) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +const struct xattr_handler reiserfs_posix_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = posix_acl_get, + .set = posix_acl_set, + .list = posix_acl_default_list, +}; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c new file mode 100644 index 00000000..ef66c18a --- /dev/null +++ b/fs/reiserfs/xattr_security.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (IS_PRIVATE(dentry->d_inode)) + return -EPERM; + + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); +} + +static int +security_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (IS_PRIVATE(dentry->d_inode)) + return -EPERM; + + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); +} + +static 
size_t security_list(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t namelen, int handler_flags) +{ + const size_t len = namelen + 1; + + if (IS_PRIVATE(dentry->d_inode)) + return 0; + + if (list && len <= list_len) { + memcpy(list, name, namelen); + list[namelen] = '\0'; + } + + return len; +} + +/* Initializes the security context for a new inode and returns the number + * of blocks needed for the transaction. If successful, reiserfs_security + * must be released using reiserfs_security_free when the caller is done. */ +int reiserfs_security_init(struct inode *dir, struct inode *inode, + const struct qstr *qstr, + struct reiserfs_security_handle *sec) +{ + int blocks = 0; + int error; + + sec->name = NULL; + + /* Don't add selinux attributes on xattrs - they'll never get used */ + if (IS_PRIVATE(dir)) + return 0; + + error = security_inode_init_security(inode, dir, qstr, &sec->name, + &sec->value, &sec->length); + if (error) { + if (error == -EOPNOTSUPP) + error = 0; + + sec->name = NULL; + sec->value = NULL; + sec->length = 0; + return error; + } + + if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) { + blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, sec->length); + /* We don't want to count the directories twice if we have + * a default ACL. */ + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + } + return blocks; +} + +int reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec) +{ + int error; + if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value, + sec->length, XATTR_CREATE); + if (error == -ENODATA || error == -EOPNOTSUPP) + error = 0; + + return error; +} + +void reiserfs_security_free(struct reiserfs_security_handle *sec) +{ + kfree(sec->name); + kfree(sec->value); + sec->name = NULL; + sec->value = NULL; +} + +const struct xattr_handler reiserfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = security_get, + .set = security_set, + .list = security_list, +}; diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c new file mode 100644 index 00000000..9883736c --- /dev/null +++ b/fs/reiserfs/xattr_trusted.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static int +trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + return -EPERM; + + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); +} + +static int +trusted_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + return -EPERM; + + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); +} + +static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) +{ + const size_t len = name_len + 1; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + return 0; + + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } + return len; +} + +const struct xattr_handler reiserfs_xattr_trusted_handler 
= { + .prefix = XATTR_TRUSTED_PREFIX, + .get = trusted_get, + .set = trusted_set, + .list = trusted_list, +}; diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c new file mode 100644 index 00000000..45ae1a00 --- /dev/null +++ b/fs/reiserfs/xattr_user.c @@ -0,0 +1,52 @@ +#include +#include +#include +#include +#include +#include +#include + +static int +user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + if (!reiserfs_xattrs_user(dentry->d_sb)) + return -EOPNOTSUPP; + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); +} + +static int +user_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user(dentry->d_sb)) + return -EOPNOTSUPP; + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); +} + +static size_t user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) +{ + const size_t len = name_len + 1; + + if (!reiserfs_xattrs_user(dentry->d_sb)) + return 0; + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } + return len; +} + +const struct xattr_handler reiserfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = user_get, + .set = user_set, + .list = user_list, +}; diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig new file mode 100644 index 00000000..ce2d6bcc --- /dev/null +++ b/fs/romfs/Kconfig @@ -0,0 +1,62 @@ +config ROMFS_FS + tristate "ROM file system support" + depends on BLOCK || MTD + ---help--- + This is a very small read-only file system mainly intended for + initial ram disks of installation disks, but it could be used for + other read-only media as well. Read + for details. + + To compile this file system support as a module, choose M here: the + module will be called romfs. Note that the file system of your + root partition (the one containing the directory /) cannot be a + module. + + If you don't know whether you need it, then you don't need it: + answer N. + +# +# Select the backing stores to be supported +# +choice + prompt "RomFS backing stores" + depends on ROMFS_FS + default ROMFS_BACKED_BY_BLOCK + help + Select the backing stores to be supported. + +config ROMFS_BACKED_BY_BLOCK + bool "Block device-backed ROM file system support" + depends on BLOCK + help + This permits ROMFS to use block devices buffered through the page + cache as the medium from which to retrieve data. It does not allow + direct mapping of the medium. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_MTD + bool "MTD-backed ROM file system support" + depends on MTD=y || (ROMFS_FS=m && MTD) + help + This permits ROMFS to use MTD based devices directly, without the + intercession of the block layer (which may have been disabled). It + also allows direct mapping of MTD devices through romfs files under + NOMMU conditions if the underlying device is directly addressable by + the CPU. + + If unsure, answer Y. 
+ +config ROMFS_BACKED_BY_BOTH + bool "Both the above" + depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD)) +endchoice + + +config ROMFS_ON_BLOCK + bool + default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + +config ROMFS_ON_MTD + bool + default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile new file mode 100644 index 00000000..420beb7d --- /dev/null +++ b/fs/romfs/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the linux RomFS filesystem routines. +# + +obj-$(CONFIG_ROMFS_FS) += romfs.o + +romfs-y := storage.o super.o + +ifneq ($(CONFIG_MMU),y) +romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o +endif + diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h new file mode 100644 index 00000000..95217b83 --- /dev/null +++ b/fs/romfs/internal.h @@ -0,0 +1,47 @@ +/* RomFS internal definitions + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +struct romfs_inode_info { + struct inode vfs_inode; + unsigned long i_metasize; /* size of non-data area */ + unsigned long i_dataoffset; /* from the start of fs */ +}; + +static inline size_t romfs_maxsize(struct super_block *sb) +{ + return (size_t) (unsigned long) sb->s_fs_info; +} + +static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) +{ + return container_of(inode, struct romfs_inode_info, vfs_inode); +} + +/* + * mmap-nommu.c + */ +#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD) +extern const struct file_operations romfs_ro_fops; +#else +#define romfs_ro_fops generic_ro_fops +#endif + +/* + * storage.c + */ +extern int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen); +extern ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen); +extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c new file mode 100644 index 00000000..eed99428 --- /dev/null +++ b/fs/romfs/mmap-nommu.c @@ -0,0 +1,79 @@ +/* NOMMU mmap support for RomFS on MTD devices + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include "internal.h" + +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at the moment (MMU can't doesn't copy private + * mappings) + * - attempts to map through to the underlying MTD device + */ +static unsigned long romfs_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct inode *inode = file->f_mapping->host; + struct mtd_info *mtd = inode->i_sb->s_mtd; + unsigned long isize, offset, maxpages, lpages; + + if (!mtd) + goto cant_map_directly; + + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + isize = i_size_read(inode); + offset = pgoff << PAGE_SHIFT; + + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) + return (unsigned long) -EINVAL; + + /* we need to call down to the MTD layer to do the actual mapping */ + if (mtd->get_unmapped_area) { + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset += ROMFS_I(inode)->i_dataoffset; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; + + return mtd->get_unmapped_area(mtd, len, offset, flags); + } + +cant_map_directly: + return (unsigned long) -ENOSYS; +} + +/* + * permit a R/O mapping to be made directly through onto an MTD device if + * possible + */ +static int romfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; +} + +const struct file_operations romfs_ro_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .mmap = romfs_mmap, + .get_unmapped_area = romfs_get_unmapped_area, +}; diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c new file mode 100644 index 00000000..71e2b4d5 --- /dev/null +++ b/fs/romfs/storage.c @@ -0,0 +1,293 @@ +/* RomFS storage access routines + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK) +#error no ROMFS backing store interface configured +#endif + +#ifdef CONFIG_ROMFS_ON_MTD +#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__)) + +/* + * read data from an romfs image on an MTD device + */ +static int romfs_mtd_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t rlen; + int ret; + + ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf); + return (ret < 0 || rlen != buflen) ? 
-EIO : 0; +} + +/* + * determine the length of a string in a romfs image on an MTD device + */ +static ssize_t romfs_mtd_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + ssize_t n = 0; + size_t segment; + u_char buf[16], *p; + size_t len; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (maxlen > 0) { + segment = min_t(size_t, maxlen, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + p = memchr(buf, 0, len); + if (p) + return n + (p - buf); + maxlen -= len; + pos += len; + n += len; + } + + return n; +} + +/* + * compare a string to one in a romfs image on MTD + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + u_char buf[17]; + size_t len, segment; + int ret; + + /* scan the string up to 16 bytes at a time, and attempt to grab the + * trailing NUL whilst we're at it */ + buf[0] = 0xff; + + while (size > 0) { + segment = min_t(size_t, size + 1, 17); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + len--; + if (memcmp(buf, str, len) != 0) + return 0; + buf[0] = buf[len]; + size -= len; + pos += len; + str += len; + } + + /* check the trailing NUL was */ + if (buf[0]) + return 0; + + return 1; +} +#endif /* CONFIG_ROMFS_ON_MTD */ + +#ifdef CONFIG_ROMFS_ON_BLOCK +/* + * read data from an romfs image on a block device + */ +static int romfs_blk_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + + /* copy the string up to blocksize bytes at a time */ + while (buflen > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, buflen, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + memcpy(buf, bh->b_data + offset, segment); + brelse(bh); + buf += segment; + buflen -= segment; + pos += segment; + } + + return 0; +} + +/* + * determine the length of a string in romfs on a block device + */ +static ssize_t romfs_blk_strnlen(struct super_block *sb, + unsigned long pos, size_t limit) +{ + struct buffer_head *bh; + unsigned long offset; + ssize_t n = 0; + size_t segment; + u_char *buf, *p; + + /* scan the string up to blocksize bytes at a time */ + while (limit > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, limit, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + buf = bh->b_data + offset; + p = memchr(buf, 0, segment); + brelse(bh); + if (p) + return n + (p - buf); + limit -= segment; + pos += segment; + n += segment; + } + + return n; +} + +/* + * compare a string to one in a romfs image on a block device + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + bool matched, terminated = false; + + /* compare string up to a block at a time */ + while (size > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, size, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = (memcmp(bh->b_data + offset, str, segment) == 0); + + size -= segment; + pos += segment; + str += segment; + if (matched && size == 0 && offset + segment < ROMBSIZE) { + if (!bh->b_data[offset + segment]) + terminated = true; + else + matched = false; + } + brelse(bh); + if 
(!matched) + return 0; + } + + if (!terminated) { + /* the terminating NUL must be on the first byte of the next + * block */ + BUG_ON((pos & (ROMBSIZE - 1)) != 0); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = !bh->b_data[0]; + brelse(bh); + if (!matched) + return 0; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_BLOCK */ + +/* + * read data from the romfs image + */ +int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (buflen > limit - pos) + buflen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_read(sb, pos, buf, buflen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_read(sb, pos, buf, buflen); +#endif + return -EIO; +} + +/* + * determine the length of a string in romfs + */ +ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (maxlen > limit - pos) + maxlen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strnlen(sb, pos, maxlen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strnlen(sb, pos, maxlen); +#endif + return -EIO; +} + +/* + * compare a string to one in romfs + * - the string to be compared to, str, may not be NUL-terminated; instead the + * string is of the specified size + * - return 1 if matched, 0 if differ, -ve if error + */ +int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (size > ROMFS_MAXFN) + return -ENAMETOOLONG; + if (size + 1 > limit - pos) + return -EIO; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strcmp(sb, pos, str, size); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strcmp(sb, pos, str, size); +#endif + return -EIO; +} diff --git a/fs/romfs/super.c b/fs/romfs/super.c new file mode 100644 index 00000000..2305e312 --- /dev/null +++ b/fs/romfs/super.c @@ -0,0 +1,662 @@ +/* Block- or MTD-based romfs + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * Derived from: ROMFS file system, Linux implementation + * + * Copyright © 1997-1999 Janos Farkas + * + * Using parts of the minix filesystem + * Copyright © 1991, 1992 Linus Torvalds + * + * and parts of the affs filesystem additionally + * Copyright © 1993 Ray Burr + * Copyright © 1996 Hans-Joachim Widmaier + * + * Changes + * Changed for 2.1.19 modules + * Jan 1997 Initial release + * Jun 1997 2.1.43+ changes + * Proper page locking in readpage + * Changed to work with 2.1.45+ fs + * Jul 1997 Fixed follow_link + * 2.1.47 + * lookup shouldn't return -ENOENT + * from Horst von Brand: + * fail on wrong checksum + * double unlock_super was possible + * correct namelen for statfs + * spotted by Bill Hawes: + * readlink shouldn't iput() + * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() + * exposed a problem in readdir + * 2.1.107 code-freeze spellchecker run + * Aug 1998 2.1.118+ VFS changes + * Sep 1998 2.1.122 another VFS change (follow_link) + * Apr 1999 2.2.7 no more EBADF checking in + * lookup/readdir, use ERR_PTR + * Jun 1999 2.3.6 d_alloc_root use changed + * 2.3.9 clean up usage of ENOENT/negative + * dentries in lookup + * clean up page flags setting + * (error, uptodate, locking) in + * in readpage + * use init_special_inode for + * fifos/sockets (and streamline) in + * read_inode, fix _ops table order + * Aug 1999 2.3.16 __initfunc() => __init change + * Oct 1999 2.3.24 page->owner hack obsoleted + * Nov 1999 2.3.27 2.3.25+ page->offset => index change + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct kmem_cache *romfs_inode_cachep; + +static const umode_t romfs_modemap[8] = { + 0, /* hard link */ + S_IFDIR | 0644, /* directory */ + S_IFREG | 0644, /* regular file */ + S_IFLNK | 0777, /* symlink */ + S_IFBLK | 0600, /* blockdev */ + S_IFCHR | 0600, /* chardev */ + S_IFSOCK | 0644, /* socket */ + S_IFIFO | 0644 /* FIFO */ +}; + +static const unsigned char romfs_dtype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO +}; + +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); + +/* + * read a page worth of data from the image + */ +static int romfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t offset, size; + unsigned long fillsize, pos; + void *buf; + int ret; + + buf = kmap(page); + if (!buf) + return -ENOMEM; + + /* 32 bit warning -- but not for us :) */ + offset = page_offset(page); + size = i_size_read(inode); + fillsize = 0; + ret = 0; + if (offset < size) { + size -= offset; + fillsize = size > PAGE_SIZE ? 
PAGE_SIZE : size; + + pos = ROMFS_I(inode)->i_dataoffset + offset; + + ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); + if (ret < 0) { + SetPageError(page); + fillsize = 0; + ret = -EIO; + } + } + + if (fillsize < PAGE_SIZE) + memset(buf + fillsize, 0, PAGE_SIZE - fillsize); + if (ret == 0) + SetPageUptodate(page); + + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + return ret; +} + +static const struct address_space_operations romfs_aops = { + .readpage = romfs_readpage +}; + +/* + * read the entries from a directory + */ +static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *i = filp->f_dentry->d_inode; + struct romfs_inode ri; + unsigned long offset, maxoff; + int j, ino, nextfh; + int stored = 0; + char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ + int ret; + + maxoff = romfs_maxsize(i->i_sb); + + offset = filp->f_pos; + if (!offset) { + offset = i->i_ino & ROMFH_MASK; + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* Not really failsafe, but we are read-only... */ + for (;;) { + if (!offset || offset >= maxoff) { + offset = maxoff; + filp->f_pos = offset; + goto out; + } + filp->f_pos = offset; + + /* Fetch inode info */ + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + + j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, + sizeof(fsname) - 1); + if (j < 0) + goto out; + + ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); + if (ret < 0) + goto out; + fsname[j] = '\0'; + + ino = offset; + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) + ino = be32_to_cpu(ri.spec); + if (filldir(dirent, fsname, j, offset, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) + goto out; + + stored++; + offset = nextfh & ROMFH_MASK; + } + +out: + return stored; +} + +/* + * look up an entry in a directory + */ +static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + unsigned long offset, maxoff; + struct inode *inode; + struct romfs_inode ri; + const char *name; /* got from dentry */ + int len, ret; + + offset = dir->i_ino & ROMFH_MASK; + ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto error; + + /* search all the file entries in the list starting from the one + * pointed to by the directory's special data */ + maxoff = romfs_maxsize(dir->i_sb); + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + name = dentry->d_name.name; + len = dentry->d_name.len; + + for (;;) { + if (!offset || offset >= maxoff) + goto out0; + + ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* try to match the first 16 bytes of name */ + ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name, + len); + if (ret < 0) + goto error; + if (ret == 1) + break; + + /* next entry */ + offset = be32_to_cpu(ri.next) & ROMFH_MASK; + } + + /* Hard link handling */ + if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + inode = romfs_iget(dir->i_sb, offset); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error; + } + goto outi; + + /* + * it's a bit funky, _lookup needs to return an error code + * (negative) or a NULL, both as a dentry. ENOENT should not + * be returned, instead we need to create a negative dentry by + * d_add(dentry, NULL); and return 0 as no error. + * (Although as I see, it only matters on writable file + * systems). 
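Both readdir and lookup above walk the same on-image structure: 16-byte big-endian file headers (next, spec, size, checksum) followed by a NUL-padded name, chained through the upper bits of next, with the low three bits of next giving the type. A stand-alone sketch that lists the root directory of a romfs image file the same way (the helper names are invented, error handling is minimal, and the volume label is assumed to fit the buffer):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t be32(const unsigned char *p)
{
    return (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
}

static int read_at(FILE *f, long pos, void *buf, size_t n)
{
    return fseek(f, pos, SEEK_SET) == 0 && fread(buf, 1, n, f) == n;
}

int main(int argc, char **argv)
{
    unsigned char hdr[16], name[64];
    FILE *f;
    long pos;

    if (argc != 2 || !(f = fopen(argv[1], "rb")))
        return 1;
    if (!read_at(f, 0, hdr, 16) || memcmp(hdr, "-rom1fs-", 8) != 0)
        return 1;                     /* not a romfs image */
    if (!read_at(f, 16, name, sizeof(name)))
        return 1;                     /* volume label, NUL-padded to 16 bytes */
    /* first file header: 16-byte superblock plus the padded label */
    pos = (16 + (long)strlen((char *)name) + 1 + 15) & ~15L;
    if (!read_at(f, pos, hdr, 16))    /* root directory's own header */
        return 1;
    pos = be32(hdr + 4) & ~15L;       /* spec points at its first entry */
    while (pos) {
        if (!read_at(f, pos, hdr, 16) || !read_at(f, pos + 16, name, sizeof(name)))
            break;
        printf("%-16s type %lu size %lu\n", (char *)name,
               (unsigned long)(be32(hdr) & 7), (unsigned long)be32(hdr + 8));
        pos = be32(hdr) & ~15L;       /* next entry; 0 ends the chain */
    }
    fclose(f);
    return 0;
}

Type 1 is a directory and type 2 a regular file, matching romfs_modemap above; "." and ".." appear as hard links (type 0) whose spec points back at the real header.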
+ */ +out0: + inode = NULL; +outi: + d_add(dentry, inode); + ret = 0; +error: + return ERR_PTR(ret); +} + +static const struct file_operations romfs_dir_operations = { + .read = generic_read_dir, + .readdir = romfs_readdir, + .llseek = default_llseek, +}; + +static const struct inode_operations romfs_dir_inode_operations = { + .lookup = romfs_lookup, +}; + +/* + * get a romfs inode based on its position in the image (which doubles as the + * inode number) + */ +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) +{ + struct romfs_inode_info *inode; + struct romfs_inode ri; + struct inode *i; + unsigned long nlen; + unsigned nextfh; + int ret; + umode_t mode; + + /* we might have to traverse a chain of "hard link" file entries to get + * to the actual file */ + for (;;) { + ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* XXX: do romfs_checksum here too (with name) */ + + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) + break; + + pos = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* determine the length of the filename */ + nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); + if (IS_ERR_VALUE(nlen)) + goto eio; + + /* get an inode for this image position */ + i = iget_locked(sb, pos); + if (!i) + return ERR_PTR(-ENOMEM); + + if (!(i->i_state & I_NEW)) + return i; + + /* precalculate the data offset */ + inode = ROMFS_I(i); + inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; + inode->i_dataoffset = pos + inode->i_metasize; + + i->i_nlink = 1; /* Hard to decide.. */ + i->i_size = be32_to_cpu(ri.size); + i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; + i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + + /* set up mode and ops */ + mode = romfs_modemap[nextfh & ROMFH_TYPE]; + + switch (nextfh & ROMFH_TYPE) { + case ROMFH_DIR: + i->i_size = ROMFS_I(i)->i_metasize; + i->i_op = &romfs_dir_inode_operations; + i->i_fop = &romfs_dir_operations; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_REG: + i->i_fop = &romfs_ro_fops; + i->i_data.a_ops = &romfs_aops; + if (i->i_sb->s_mtd) + i->i_data.backing_dev_info = + i->i_sb->s_mtd->backing_dev_info; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_SYM: + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &romfs_aops; + mode |= S_IRWXUGO; + break; + default: + /* depending on MBZ for sock/fifos */ + nextfh = be32_to_cpu(ri.spec); + init_special_inode(i, mode, MKDEV(nextfh >> 16, + nextfh & 0xffff)); + break; + } + + i->i_mode = mode; + + unlock_new_inode(i); + return i; + +eio: + ret = -EIO; +error: + printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); + return ERR_PTR(ret); +} + +/* + * allocate a new inode + */ +static struct inode *romfs_alloc_inode(struct super_block *sb) +{ + struct romfs_inode_info *inode; + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); + return inode ? 
&inode->vfs_inode : NULL; +} + +/* + * return a spent inode to the slab cache + */ +static void romfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); +} + +static void romfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, romfs_i_callback); +} + +/* + * get filesystem statistics + */ +static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ROMFS_MAGIC; + buf->f_namelen = ROMFS_MAXFN; + buf->f_bsize = ROMBSIZE; + buf->f_bfree = buf->f_bavail = buf->f_ffree; + buf->f_blocks = + (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +/* + * remounting must involve read-only + */ +static int romfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations romfs_super_ops = { + .alloc_inode = romfs_alloc_inode, + .destroy_inode = romfs_destroy_inode, + .statfs = romfs_statfs, + .remount_fs = romfs_remount, +}; + +/* + * checksum check on part of a romfs filesystem + */ +static __u32 romfs_checksum(const void *data, int size) +{ + const __be32 *ptr = data; + __u32 sum; + + sum = 0; + size >>= 2; + while (size > 0) { + sum += be32_to_cpu(*ptr++); + size--; + } + return sum; +} + +/* + * fill in the superblock + */ +static int romfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct romfs_super_block *rsb; + struct inode *root; + unsigned long pos, img_size; + const char *storage; + size_t len; + int ret; + +#ifdef CONFIG_BLOCK + if (!sb->s_mtd) { + sb_set_blocksize(sb, ROMBSIZE); + } else { + sb->s_blocksize = ROMBSIZE; + sb->s_blocksize_bits = blksize_bits(ROMBSIZE); + } +#endif + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_magic = ROMFS_MAGIC; + sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_op = &romfs_super_ops; + + /* read the image superblock and check it */ + rsb = kmalloc(512, GFP_KERNEL); + if (!rsb) + return -ENOMEM; + + sb->s_fs_info = (void *) 512; + ret = romfs_dev_read(sb, 0, rsb, 512); + if (ret < 0) + goto error_rsb; + + img_size = be32_to_cpu(rsb->size); + + if (sb->s_mtd && img_size > sb->s_mtd->size) + goto error_rsb_inval; + + sb->s_fs_info = (void *) img_size; + + if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || + img_size < ROMFH_SIZE) { + if (!silent) + printk(KERN_WARNING "VFS:" + " Can't find a romfs filesystem on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { + printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + storage = sb->s_mtd ? 
"MTD" : "the block layer"; + + len = strnlen(rsb->name, ROMFS_MAXFN); + if (!silent) + printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); + + kfree(rsb); + rsb = NULL; + + /* find the root directory */ + pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; + + root = romfs_iget(sb, pos); + if (IS_ERR(root)) + goto error; + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto error_i; + + return 0; + +error_i: + iput(root); +error: + return -EINVAL; +error_rsb_inval: + ret = -EINVAL; +error_rsb: + kfree(rsb); + return ret; +} + +/* + * get a superblock for mounting + */ +static struct dentry *romfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + struct dentry *ret = ERR_PTR(-EINVAL); + +#ifdef CONFIG_ROMFS_ON_MTD + ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (ret == ERR_PTR(-EINVAL)) + ret = mount_bdev(fs_type, flags, dev_name, data, + romfs_fill_super); +#endif + return ret; +} + +/* + * destroy a romfs superblock in the appropriate manner + */ +static void romfs_kill_sb(struct super_block *sb) +{ +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) { + kill_mtd_super(sb); + return; + } +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) { + kill_block_super(sb); + return; + } +#endif +} + +static struct file_system_type romfs_fs_type = { + .owner = THIS_MODULE, + .name = "romfs", + .mount = romfs_mount, + .kill_sb = romfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * inode storage initialiser + */ +static void romfs_i_init_once(void *_inode) +{ + struct romfs_inode_info *inode = _inode; + + inode_init_once(&inode->vfs_inode); +} + +/* + * romfs module initialisation + */ +static int __init init_romfs_fs(void) +{ + int ret; + + printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + + romfs_inode_cachep = + kmem_cache_create("romfs_i", + sizeof(struct romfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + romfs_i_init_once); + + if (!romfs_inode_cachep) { + printk(KERN_ERR + "ROMFS error: Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = register_filesystem(&romfs_fs_type); + if (ret) { + printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); + goto error_register; + } + return 0; + +error_register: + kmem_cache_destroy(romfs_inode_cachep); + return ret; +} + +/* + * romfs module removal + */ +static void __exit exit_romfs_fs(void) +{ + unregister_filesystem(&romfs_fs_type); + kmem_cache_destroy(romfs_inode_cachep); +} + +module_init(init_romfs_fs); +module_exit(exit_romfs_fs); + +MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ diff --git a/fs/select.c b/fs/select.c new file mode 100644 index 00000000..d33418fd --- /dev/null +++ b/fs/select.c @@ -0,0 +1,999 @@ +/* + * This file contains the procedures for the handling of select and poll + * + * Created for Linux based loosely upon Mathius Lattner's minix + * patches by Peter MacDonald. Heavily edited by Linus. + * + * 4 February 1994 + * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS + * flag set in its personality we do *not* modify the given timeout + * parameter to reflect time remaining. + * + * 24 January 2000 + * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation + * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). 
+ */ + +#include +#include +#include +#include +#include +#include +#include /* for STICKY_TIMEOUTS */ +#include +#include +#include +#include +#include + +#include + + +/* + * Estimate expected accuracy in ns from a timeval. + * + * After quite a bit of churning around, we've settled on + * a simple thing of taking 0.1% of the timeout as the + * slack, with a cap of 100 msec. + * "nice" tasks get a 0.5% slack instead. + * + * Consider this comment an open invitation to come up with even + * better solutions.. + */ + +#define MAX_SLACK (100 * NSEC_PER_MSEC) + +static long __estimate_accuracy(struct timespec *tv) +{ + long slack; + int divfactor = 1000; + + if (tv->tv_sec < 0) + return 0; + + if (task_nice(current) > 0) + divfactor = divfactor / 5; + + if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) + return MAX_SLACK; + + slack = tv->tv_nsec / divfactor; + slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); + + if (slack > MAX_SLACK) + return MAX_SLACK; + + return slack; +} + +long select_estimate_accuracy(struct timespec *tv) +{ + unsigned long ret; + struct timespec now; + + /* + * Realtime tasks get a slack of 0 for obvious reasons. + */ + + if (rt_task(current)) + return 0; + + ktime_get_ts(&now); + now = timespec_sub(*tv, now); + ret = __estimate_accuracy(&now); + if (ret < current->timer_slack_ns) + return current->timer_slack_ns; + return ret; +} + + + +struct poll_table_page { + struct poll_table_page * next; + struct poll_table_entry * entry; + struct poll_table_entry entries[0]; +}; + +#define POLL_TABLE_FULL(table) \ + ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) + +/* + * Ok, Peter made a complicated, but straightforward multiple_wait() function. + * I have rewritten this, taking some shortcuts: This code may not be easy to + * follow, but it should be free of race-conditions, and it's practical. If you + * understand what I'm doing here, then you understand how the linux + * sleep/wakeup mechanism works. + * + * Two very simple procedures, poll_wait() and poll_freewait() make all the + * work. poll_wait() is an inline-function defined in , + * as all select/poll functions have to call it to add an entry to the + * poll table. 
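To put numbers on the slack policy from select_estimate_accuracy() above: with divfactor 1000 (200 for positively niced tasks) a 2 s timeout earns 2 ms of slack, and anything beyond 100 s hits the 100 ms cap. A stand-alone rendering of the same arithmetic, with a plain niced flag standing in for the task_nice() check:

#include <stdio.h>

#define NSEC_PER_SEC  1000000000L
#define NSEC_PER_MSEC 1000000L
#define MAX_SLACK     (100 * NSEC_PER_MSEC)

/* Mirrors __estimate_accuracy(): 0.1% of the timeout (0.5% when niced),
 * capped at 100 ms, zero for negative timeouts. */
static long estimate_slack(long sec, long nsec, int niced)
{
    long divfactor = niced ? 200 : 1000;
    long slack;

    if (sec < 0)
        return 0;
    if (sec > MAX_SLACK / (NSEC_PER_SEC / divfactor))
        return MAX_SLACK;
    slack = nsec / divfactor + sec * (NSEC_PER_SEC / divfactor);
    return slack > MAX_SLACK ? MAX_SLACK : slack;
}

int main(void)
{
    printf("10 ms -> %ld ns\n", estimate_slack(0, 10 * NSEC_PER_MSEC, 0)); /* 10000 ns */
    printf("2 s   -> %ld ns\n", estimate_slack(2, 0, 0));                  /* 2 ms */
    printf("500 s -> %ld ns\n", estimate_slack(500, 0, 0));                /* capped at 100 ms */
    return 0;
}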
+ */ +static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, + poll_table *p); + +void poll_initwait(struct poll_wqueues *pwq) +{ + init_poll_funcptr(&pwq->pt, __pollwait); + pwq->polling_task = current; + pwq->triggered = 0; + pwq->error = 0; + pwq->table = NULL; + pwq->inline_index = 0; +} +EXPORT_SYMBOL(poll_initwait); + +static void free_poll_entry(struct poll_table_entry *entry) +{ + remove_wait_queue(entry->wait_address, &entry->wait); + fput(entry->filp); +} + +void poll_freewait(struct poll_wqueues *pwq) +{ + struct poll_table_page * p = pwq->table; + int i; + for (i = 0; i < pwq->inline_index; i++) + free_poll_entry(pwq->inline_entries + i); + while (p) { + struct poll_table_entry * entry; + struct poll_table_page *old; + + entry = p->entry; + do { + entry--; + free_poll_entry(entry); + } while (entry > p->entries); + old = p; + p = p->next; + free_page((unsigned long) old); + } +} +EXPORT_SYMBOL(poll_freewait); + +static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) +{ + struct poll_table_page *table = p->table; + + if (p->inline_index < N_INLINE_POLL_ENTRIES) + return p->inline_entries + p->inline_index++; + + if (!table || POLL_TABLE_FULL(table)) { + struct poll_table_page *new_table; + + new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); + if (!new_table) { + p->error = -ENOMEM; + return NULL; + } + new_table->entry = new_table->entries; + new_table->next = table; + p->table = new_table; + table = new_table; + } + + return table->entry++; +} + +static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_wqueues *pwq = wait->private; + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); + + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expect write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in poll_schedule_timeout. + */ + smp_wmb(); + pwq->triggered = 1; + + /* + * Perform the default wake up operation using a dummy + * waitqueue. + * + * TODO: This is hacky but there currently is no interface to + * pass in @sync. @sync is scheduled to be removed and once + * that happens, wake_up_process() can be used directly. + */ + return default_wake_function(&dummy_wait, mode, sync, key); +} + +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_table_entry *entry; + + entry = container_of(wait, struct poll_table_entry, wait); + if (key && !((unsigned long)key & entry->key)) + return 0; + return __pollwake(wait, mode, sync, key); +} + +/* Add a new entry */ +static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, + poll_table *p) +{ + struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); + struct poll_table_entry *entry = poll_get_entry(pwq); + if (!entry) + return; + get_file(filp); + entry->filp = filp; + entry->wait_address = wait_address; + entry->key = p->key; + init_waitqueue_func_entry(&entry->wait, pollwake); + entry->wait.private = pwq; + add_wait_queue(wait_address, &entry->wait); +} + +int poll_schedule_timeout(struct poll_wqueues *pwq, int state, + ktime_t *expires, unsigned long slack) +{ + int rc = -EINTR; + + set_current_state(state); + if (!pwq->triggered) + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + __set_current_state(TASK_RUNNING); + + /* + * Prepare for the next iteration. 
+ * + * The following set_mb() serves two purposes. First, it's + * the counterpart rmb of the wmb in pollwake() such that data + * written before wake up is always visible after wake up. + * Second, the full barrier guarantees that triggered clearing + * doesn't pass event check of the next iteration. Note that + * this problem doesn't exist for the first iteration as + * add_wait_queue() has full barrier semantics. + */ + set_mb(pwq->triggered, 0); + + return rc; +} +EXPORT_SYMBOL(poll_schedule_timeout); + +/** + * poll_select_set_timeout - helper function to setup the timeout value + * @to: pointer to timespec variable for the final timeout + * @sec: seconds (from user space) + * @nsec: nanoseconds (from user space) + * + * Note, we do not use a timespec for the user space value here, That + * way we can use the function for timeval and compat interfaces as well. + * + * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. + */ +int poll_select_set_timeout(struct timespec *to, long sec, long nsec) +{ + struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + + if (!timespec_valid(&ts)) + return -EINVAL; + + /* Optimize for the zero timeout value here */ + if (!sec && !nsec) { + to->tv_sec = to->tv_nsec = 0; + } else { + ktime_get_ts(to); + *to = timespec_add_safe(*to, ts); + } + return 0; +} + +static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec rts; + struct timeval rtv; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&rts); + rts = timespec_sub(*end_time, rts); + if (rts.tv_sec < 0) + rts.tv_sec = rts.tv_nsec = 0; + + if (timeval) { + if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) + memset(&rtv, 0, sizeof(rtv)); + rtv.tv_sec = rts.tv_sec; + rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + + } else if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. 
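The write-back handled here is directly observable from user space: unless the calling personality has STICKY_TIMEOUTS set, Linux select() rewrites the caller's timeval with whatever time was left when it returned. A quick check, nothing beyond a standard libc assumed:

#include <stdio.h>
#include <sys/select.h>

int main(void)
{
    struct timeval tv = { .tv_sec = 1, .tv_usec = 500000 };
    fd_set rfds;

    FD_ZERO(&rfds);
    FD_SET(0, &rfds);                 /* wait for stdin, or for the timeout */
    select(1, &rfds, NULL, NULL, &tv);
    /* If stdin stayed quiet, tv is now {0, 0}; if input arrived early,
     * tv holds roughly the unused part of the 1.5 s timeout. */
    printf("remaining: %ld.%06ld s\n", (long)tv.tv_sec, (long)tv.tv_usec);
    return 0;
}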
+ */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +#define FDS_IN(fds, n) (fds->in + n) +#define FDS_OUT(fds, n) (fds->out + n) +#define FDS_EX(fds, n) (fds->ex + n) + +#define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) + +static int max_select_fd(unsigned long n, fd_set_bits *fds) +{ + unsigned long *open_fds; + unsigned long set; + int max; + struct fdtable *fdt; + + /* handle last in-complete long-word first */ + set = ~(~0UL << (n & (__NFDBITS-1))); + n /= __NFDBITS; + fdt = files_fdtable(current->files); + open_fds = fdt->open_fds->fds_bits+n; + max = 0; + if (set) { + set &= BITS(fds, n); + if (set) { + if (!(set & ~*open_fds)) + goto get_max; + return -EBADF; + } + } + while (n) { + open_fds--; + n--; + set = BITS(fds, n); + if (!set) + continue; + if (set & ~*open_fds) + return -EBADF; + if (max) + continue; +get_max: + do { + max++; + set >>= 1; + } while (set); + max += n * __NFDBITS; + } + + return max; +} + +#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR) +#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) +#define POLLEX_SET (POLLPRI) + +static inline void wait_key_set(poll_table *wait, unsigned long in, + unsigned long out, unsigned long bit) +{ + if (wait) { + wait->key = POLLEX_SET; + if (in & bit) + wait->key |= POLLIN_SET; + if (out & bit) + wait->key |= POLLOUT_SET; + } +} + +int do_select(int n, fd_set_bits *fds, struct timespec *end_time) +{ + ktime_t expire, *to = NULL; + struct poll_wqueues table; + poll_table *wait; + int retval, i, timed_out = 0; + unsigned long slack = 0; + + rcu_read_lock(); + retval = max_select_fd(n, fds); + rcu_read_unlock(); + + if (retval < 0) + return retval; + n = retval; + + poll_initwait(&table); + wait = &table.pt; + if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { + wait = NULL; + timed_out = 1; + } + + if (end_time && !timed_out) + slack = select_estimate_accuracy(end_time); + + retval = 0; + for (;;) { + unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; + + inp = fds->in; outp = fds->out; exp = fds->ex; + rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; + + for (i = 0; i < n; ++rinp, ++routp, ++rexp) { + unsigned long in, out, ex, all_bits, bit = 1, mask, j; + unsigned long res_in = 0, res_out = 0, res_ex = 0; + const struct file_operations *f_op = NULL; + struct file *file = NULL; + + in = *inp++; out = *outp++; ex = *exp++; + all_bits = in | out | ex; + if (all_bits == 0) { + i += __NFDBITS; + continue; + } + + for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) { + int fput_needed; + if (i >= n) + break; + if (!(bit & all_bits)) + continue; + file = fget_light(i, &fput_needed); + if (file) { + f_op = file->f_op; + mask = DEFAULT_POLLMASK; + if (f_op && f_op->poll) { + wait_key_set(wait, in, out, bit); + mask = (*f_op->poll)(file, wait); + } + fput_light(file, fput_needed); + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + wait = NULL; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + wait = NULL; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + wait = NULL; + } + } + } + if (res_in) + *rinp = res_in; + if (res_out) + *routp = res_out; + if (res_ex) + *rexp = res_ex; + cond_resched(); + } + wait = NULL; + if (retval || timed_out || signal_pending(current)) + break; + if (table.error) { + retval = table.error; + break; + } + + /* + * If this is the first loop and we have a timeout + * given, then we convert to ktime_t and set the to + 
* pointer to the expiry value. + */ + if (end_time && !to) { + expire = timespec_to_ktime(*end_time); + to = &expire; + } + + if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, + to, slack)) + timed_out = 1; + } + + poll_freewait(&table); + + return retval; +} + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int ret, max_fds; + unsigned int size; + struct fdtable *fdt; + /* Allocate small arguments on the stack to save memory and be faster */ + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + ret = -EINVAL; + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; + bits = kmalloc(6 * size, GFP_KERNEL); + if (!bits) + goto out_nofds; + } + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; + + if ((ret = get_fd_set(n, inp, fds.in)) || + (ret = get_fd_set(n, outp, fds.out)) || + (ret = get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (set_fd_set(n, inp, fds.res_in) || + set_fd_set(n, outp, fds.res_out) || + set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; + +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timeval __user *, tvp) +{ + struct timespec end_time, *to = NULL; + struct timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +#ifdef HAVE_SET_RESTORE_SIGMASK +static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct timespec __user *tsp, + const sigset_t __user *sigmask, size_t sigsetsize) +{ + sigset_t ksigmask, sigsaved; + struct timespec ts, end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + /* XXX: Don't preclude handling different sized sigset_t's. 
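What the sigmask handling in do_pselect() buys user space is the race-free wait pselect() is known for: the temporary mask is installed, the wait happens, and the original mask only comes back once any signal that arrived has been delivered. A typical caller keeps SIGINT blocked and opens it only for the duration of the wait; a sketch using standard POSIX calls only (the handler name is made up):

#include <signal.h>
#include <stdio.h>
#include <sys/select.h>

static void on_sigint(int sig)
{
    (void)sig;                        /* a handler exists so SIGINT interrupts, not kills */
}

int main(void)
{
    struct sigaction sa = { 0 };
    sigset_t block_int, waitmask;
    fd_set rfds;

    sa.sa_handler = on_sigint;
    sigaction(SIGINT, &sa, NULL);

    sigemptyset(&block_int);
    sigaddset(&block_int, SIGINT);
    sigprocmask(SIG_BLOCK, &block_int, &waitmask);   /* waitmask = previous mask */
    sigdelset(&waitmask, SIGINT);                    /* ...with SIGINT allowed */

    FD_ZERO(&rfds);
    FD_SET(0, &rfds);
    if (pselect(1, &rfds, NULL, NULL, NULL, &waitmask) < 0)
        perror("pselect");            /* EINTR: SIGINT arrived during the wait */
    return 0;
}

The libc wrapper hides the six-argument pselect6 marshalling; the caller never sees the {sigset_t *, size_t} pair described below.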
*/ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) + return -EFAULT; + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +/* + * Most architectures can't handle 7-argument syscalls. So we provide a + * 6-argument version where the sixth argument is a pointer to a structure + * which has a pointer to the sigset_t itself followed by a size_t containing + * the sigset size. + */ +SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timespec __user *, tsp, + void __user *, sig) +{ + size_t sigsetsize = 0; + sigset_t __user *up = NULL; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) + || __get_user(up, (sigset_t __user * __user *)sig) + || __get_user(sigsetsize, + (size_t __user *)(sig+sizeof(void *)))) + return -EFAULT; + } + + return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize); +} +#endif /* HAVE_SET_RESTORE_SIGMASK */ + +#ifdef __ARCH_WANT_SYS_OLD_SELECT +struct sel_arg_struct { + unsigned long n; + fd_set __user *inp, *outp, *exp; + struct timeval __user *tvp; +}; + +SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) +{ + struct sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); +} +#endif + +struct poll_list { + struct poll_list *next; + int len; + struct pollfd entries[0]; +}; + +#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) + +/* + * Fish for pollable events on the pollfd->fd file descriptor. We're only + * interested in events matching the pollfd->events mask, and the result + * matching that mask is both recorded in pollfd->revents and returned. The + * pwait poll_table will be used by the fd-provided poll handler for waiting, + * if non-NULL. + */ +static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) +{ + unsigned int mask; + int fd; + + mask = 0; + fd = pollfd->fd; + if (fd >= 0) { + int fput_needed; + struct file * file; + + file = fget_light(fd, &fput_needed); + mask = POLLNVAL; + if (file != NULL) { + mask = DEFAULT_POLLMASK; + if (file->f_op && file->f_op->poll) { + if (pwait) + pwait->key = pollfd->events | + POLLERR | POLLHUP; + mask = file->f_op->poll(file, pwait); + } + /* Mask out unneeded events. 
*/ + mask &= pollfd->events | POLLERR | POLLHUP; + fput_light(file, fput_needed); + } + } + pollfd->revents = mask; + + return mask; +} + +static int do_poll(unsigned int nfds, struct poll_list *list, + struct poll_wqueues *wait, struct timespec *end_time) +{ + poll_table* pt = &wait->pt; + ktime_t expire, *to = NULL; + int timed_out = 0, count = 0; + unsigned long slack = 0; + + /* Optimise the no-wait case */ + if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { + pt = NULL; + timed_out = 1; + } + + if (end_time && !timed_out) + slack = select_estimate_accuracy(end_time); + + for (;;) { + struct poll_list *walk; + + for (walk = list; walk != NULL; walk = walk->next) { + struct pollfd * pfd, * pfd_end; + + pfd = walk->entries; + pfd_end = pfd + walk->len; + for (; pfd != pfd_end; pfd++) { + /* + * Fish for events. If we found one, record it + * and kill the poll_table, so we don't + * needlessly register any other waiters after + * this. They'll get immediately deregistered + * when we break out and return. + */ + if (do_pollfd(pfd, pt)) { + count++; + pt = NULL; + } + } + } + /* + * All waiters have already been registered, so don't provide + * a poll_table to them on the next loop iteration. + */ + pt = NULL; + if (!count) { + count = wait->error; + if (signal_pending(current)) + count = -EINTR; + } + if (count || timed_out) + break; + + /* + * If this is the first loop and we have a timeout + * given, then we convert to ktime_t and set the to + * pointer to the expiry value. + */ + if (end_time && !to) { + expire = timespec_to_ktime(*end_time); + to = &expire; + } + + if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) + timed_out = 1; + } + return count; +} + +#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ + sizeof(struct pollfd)) + +int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, + struct timespec *end_time) +{ + struct poll_wqueues table; + int err = -EFAULT, fdcount, len, size; + /* Allocate small arguments on the stack to save memory and be + faster - use long to make sure the buffer is aligned properly + on 64 bit archs to avoid unaligned access */ + long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; + struct poll_list *const head = (struct poll_list *)stack_pps; + struct poll_list *walk = head; + unsigned long todo = nfds; + + if (nfds > rlimit(RLIMIT_NOFILE)) + return -EINVAL; + + len = min_t(unsigned int, nfds, N_STACK_PPS); + for (;;) { + walk->next = NULL; + walk->len = len; + if (!len) + break; + + if (copy_from_user(walk->entries, ufds + nfds-todo, + sizeof(struct pollfd) * walk->len)) + goto out_fds; + + todo -= walk->len; + if (!todo) + break; + + len = min(todo, POLLFD_PER_PAGE); + size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; + walk = walk->next = kmalloc(size, GFP_KERNEL); + if (!walk) { + err = -ENOMEM; + goto out_fds; + } + } + + poll_initwait(&table); + fdcount = do_poll(nfds, head, &table, end_time); + poll_freewait(&table); + + for (walk = head; walk; walk = walk->next) { + struct pollfd *fds = walk->entries; + int j; + + for (j = 0; j < walk->len; j++, ufds++) + if (__put_user(fds[j].revents, &ufds->revents)) + goto out_fds; + } + + err = fdcount; +out_fds: + walk = head->next; + while (walk) { + struct poll_list *pos = walk; + walk = walk->next; + kfree(pos); + } + + return err; +} + +static long do_restart_poll(struct restart_block *restart_block) +{ + struct pollfd __user *ufds = restart_block->poll.ufds; + int nfds = restart_block->poll.nfds; + struct timespec *to = NULL, end_time; + 
int ret; + + if (restart_block->poll.has_timeout) { + end_time.tv_sec = restart_block->poll.tv_sec; + end_time.tv_nsec = restart_block->poll.tv_nsec; + to = &end_time; + } + + ret = do_sys_poll(ufds, nfds, to); + + if (ret == -EINTR) { + restart_block->fn = do_restart_poll; + ret = -ERESTART_RESTARTBLOCK; + } + return ret; +} + +SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, + long, timeout_msecs) +{ + struct timespec end_time, *to = NULL; + int ret; + + if (timeout_msecs >= 0) { + to = &end_time; + poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, + NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); + } + + ret = do_sys_poll(ufds, nfds, to); + + if (ret == -EINTR) { + struct restart_block *restart_block; + + restart_block = ¤t_thread_info()->restart_block; + restart_block->fn = do_restart_poll; + restart_block->poll.ufds = ufds; + restart_block->poll.nfds = nfds; + + if (timeout_msecs >= 0) { + restart_block->poll.tv_sec = end_time.tv_sec; + restart_block->poll.tv_nsec = end_time.tv_nsec; + restart_block->poll.has_timeout = 1; + } else + restart_block->poll.has_timeout = 0; + + ret = -ERESTART_RESTARTBLOCK; + } + return ret; +} + +#ifdef HAVE_SET_RESTORE_SIGMASK +SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, + struct timespec __user *, tsp, const sigset_t __user *, sigmask, + size_t, sigsetsize) +{ + sigset_t ksigmask, sigsaved; + struct timespec ts, end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) + return -EFAULT; + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} +#endif /* HAVE_SET_RESTORE_SIGMASK */ diff --git a/fs/seq_file.c b/fs/seq_file.c new file mode 100644 index 00000000..dba43c3e --- /dev/null +++ b/fs/seq_file.c @@ -0,0 +1,821 @@ +/* + * linux/fs/seq_file.c + * + * helper functions for making synthetic files from sequences of records. + * initial implementation -- AV, Oct 2001. + */ + +#include +#include +#include +#include + +#include +#include + +/** + * seq_open - initialize sequential file + * @file: file we initialize + * @op: method table describing the sequence + * + * seq_open() sets @file, associating it with a sequence described + * by @op. @op->start() sets the iterator up and returns the first + * element of sequence. @op->stop() shuts it down. @op->next() + * returns the next element of sequence. @op->show() prints element + * into the buffer. In case of error ->start() and ->next() return + * ERR_PTR(error). In the end of sequence they return %NULL. ->show() + * returns 0 in case of success and negative number in case of error. 
+ * Returning SEQ_SKIP means "discard this element and move on". + */ +int seq_open(struct file *file, const struct seq_operations *op) +{ + struct seq_file *p = file->private_data; + + if (!p) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + file->private_data = p; + } + memset(p, 0, sizeof(*p)); + mutex_init(&p->lock); + p->op = op; + + /* + * Wrappers around seq_open(e.g. swaps_open) need to be + * aware of this. If they set f_version themselves, they + * should call seq_open first and then set f_version. + */ + file->f_version = 0; + + /* + * seq_files support lseek() and pread(). They do not implement + * write() at all, but we clear FMODE_PWRITE here for historical + * reasons. + * + * If a client of seq_files a) implements file.write() and b) wishes to + * support pwrite() then that client will need to implement its own + * file.open() which calls seq_open() and then sets FMODE_PWRITE. + */ + file->f_mode &= ~FMODE_PWRITE; + return 0; +} +EXPORT_SYMBOL(seq_open); + +static int traverse(struct seq_file *m, loff_t offset) +{ + loff_t pos = 0, index; + int error = 0; + void *p; + + m->version = 0; + index = 0; + m->count = m->from = 0; + if (!offset) { + m->index = index; + return 0; + } + if (!m->buf) { + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + if (!m->buf) + return -ENOMEM; + } + p = m->op->start(m, &index); + while (p) { + error = PTR_ERR(p); + if (IS_ERR(p)) + break; + error = m->op->show(m, p); + if (error < 0) + break; + if (unlikely(error)) { + error = 0; + m->count = 0; + } + if (m->count == m->size) + goto Eoverflow; + if (pos + m->count > offset) { + m->from = offset - pos; + m->count -= m->from; + m->index = index; + break; + } + pos += m->count; + m->count = 0; + if (pos == offset) { + index++; + m->index = index; + break; + } + p = m->op->next(m, p, &index); + } + m->op->stop(m, p); + m->index = index; + return error; + +Eoverflow: + m->op->stop(m, p); + kfree(m->buf); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + return !m->buf ? -ENOMEM : -EAGAIN; +} + +/** + * seq_read - ->read() method for sequential files. + * @file: the file to read from + * @buf: the buffer to read to + * @size: the maximum number of bytes to read + * @ppos: the current position in the file + * + * Ready-made ->f_op->read() + */ +ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + size_t copied = 0; + loff_t pos; + size_t n; + void *p; + int err = 0; + + mutex_lock(&m->lock); + + /* Don't assume *ppos is where we left it */ + if (unlikely(*ppos != m->read_pos)) { + m->read_pos = *ppos; + while ((err = traverse(m, *ppos)) == -EAGAIN) + ; + if (err) { + /* With prejudice... */ + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + goto Done; + } + } + + /* + * seq_file->op->..m_start/m_stop/m_next may do special actions + * or optimisations based on the file->f_version, so we want to + * pass the file->f_version to those methods. + * + * seq_file->version is just copy of f_version, and seq_file + * methods can treat it simply as file version. + * It is copied in first and copied out after all operations. + * It is convenient to have it as part of structure to avoid the + * need of passing another argument to all the seq_file methods. 
+ */ + m->version = file->f_version; + /* grab buffer if we didn't have one */ + if (!m->buf) { + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + if (!m->buf) + goto Enomem; + } + /* if not empty - flush it first */ + if (m->count) { + n = min(m->count, size); + err = copy_to_user(buf, m->buf + m->from, n); + if (err) + goto Efault; + m->count -= n; + m->from += n; + size -= n; + buf += n; + copied += n; + if (!m->count) + m->index++; + if (!size) + goto Done; + } + /* we need at least one record in buffer */ + pos = m->index; + p = m->op->start(m, &pos); + while (1) { + err = PTR_ERR(p); + if (!p || IS_ERR(p)) + break; + err = m->op->show(m, p); + if (err < 0) + break; + if (unlikely(err)) + m->count = 0; + if (unlikely(!m->count)) { + p = m->op->next(m, p, &pos); + m->index = pos; + continue; + } + if (m->count < m->size) + goto Fill; + m->op->stop(m, p); + kfree(m->buf); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + if (!m->buf) + goto Enomem; + m->count = 0; + m->version = 0; + pos = m->index; + p = m->op->start(m, &pos); + } + m->op->stop(m, p); + m->count = 0; + goto Done; +Fill: + /* they want more? let's try to get some more */ + while (m->count < size) { + size_t offs = m->count; + loff_t next = pos; + p = m->op->next(m, p, &next); + if (!p || IS_ERR(p)) { + err = PTR_ERR(p); + break; + } + err = m->op->show(m, p); + if (m->count == m->size || err) { + m->count = offs; + if (likely(err <= 0)) + break; + } + pos = next; + } + m->op->stop(m, p); + n = min(m->count, size); + err = copy_to_user(buf, m->buf, n); + if (err) + goto Efault; + copied += n; + m->count -= n; + if (m->count) + m->from = n; + else + pos++; + m->index = pos; +Done: + if (!copied) + copied = err; + else { + *ppos += copied; + m->read_pos += copied; + } + file->f_version = m->version; + mutex_unlock(&m->lock); + return copied; +Enomem: + err = -ENOMEM; + goto Done; +Efault: + err = -EFAULT; + goto Done; +} +EXPORT_SYMBOL(seq_read); + +/** + * seq_lseek - ->llseek() method for sequential files. + * @file: the file in question + * @offset: new position + * @origin: 0 for absolute, 1 for relative position + * + * Ready-made ->f_op->llseek() + */ +loff_t seq_lseek(struct file *file, loff_t offset, int origin) +{ + struct seq_file *m = file->private_data; + loff_t retval = -EINVAL; + + mutex_lock(&m->lock); + m->version = file->f_version; + switch (origin) { + case 1: + offset += file->f_pos; + case 0: + if (offset < 0) + break; + retval = offset; + if (offset != m->read_pos) { + while ((retval=traverse(m, offset)) == -EAGAIN) + ; + if (retval) { + /* with extreme prejudice... */ + file->f_pos = 0; + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + } else { + m->read_pos = offset; + retval = file->f_pos = offset; + } + } + } + file->f_version = m->version; + mutex_unlock(&m->lock); + return retval; +} +EXPORT_SYMBOL(seq_lseek); + +/** + * seq_release - free the structures associated with sequential file. + * @file: file in question + * @inode: file->f_path.dentry->d_inode + * + * Frees the structures associated with sequential file; can be used + * as ->f_op->release() if you don't have private data to destroy. 
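+ *
+ * As an illustration only (not taken from this file): a typical read-only
+ * seq_file user combines seq_open(), seq_read(), seq_lseek() and
+ * seq_release() roughly as sketched below; my_ops, my_start, my_next,
+ * my_stop, my_show, my_open and my_fops are hypothetical names.
+ *
+ *	static const struct seq_operations my_ops = {
+ *		.start = my_start,
+ *		.next  = my_next,
+ *		.stop  = my_stop,
+ *		.show  = my_show,
+ *	};
+ *
+ *	static int my_open(struct inode *inode, struct file *file)
+ *	{
+ *		return seq_open(file, &my_ops);
+ *	}
+ *
+ *	static const struct file_operations my_fops = {
+ *		.open    = my_open,
+ *		.read    = seq_read,
+ *		.llseek  = seq_lseek,
+ *		.release = seq_release,
+ *	};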
+ */ +int seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + kfree(m->buf); + kfree(m); + return 0; +} +EXPORT_SYMBOL(seq_release); + +/** + * seq_escape - print string into buffer, escaping some characters + * @m: target buffer + * @s: string + * @esc: set of characters that need escaping + * + * Puts string into buffer, replacing each occurrence of character from + * @esc with usual octal escape. Returns 0 in case of success, -1 - in + * case of overflow. + */ +int seq_escape(struct seq_file *m, const char *s, const char *esc) +{ + char *end = m->buf + m->size; + char *p; + char c; + + for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { + if (!strchr(esc, c)) { + *p++ = c; + continue; + } + if (p + 3 < end) { + *p++ = '\\'; + *p++ = '0' + ((c & 0300) >> 6); + *p++ = '0' + ((c & 070) >> 3); + *p++ = '0' + (c & 07); + continue; + } + m->count = m->size; + return -1; + } + m->count = p - m->buf; + return 0; +} +EXPORT_SYMBOL(seq_escape); + +int seq_printf(struct seq_file *m, const char *f, ...) +{ + va_list args; + int len; + + if (m->count < m->size) { + va_start(args, f); + len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); + va_end(args); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_printf); + +/** + * mangle_path - mangle and copy path to buffer beginning + * @s: buffer start + * @p: beginning of path in above buffer + * @esc: set of characters that need escaping + * + * Copy the path from @p to @s, replacing each occurrence of character from + * @esc with usual octal escape. + * Returns pointer past last written character in @s, or NULL in case of + * failure. + */ +char *mangle_path(char *s, char *p, char *esc) +{ + while (s <= p) { + char c = *p++; + if (!c) { + return s; + } else if (!strchr(esc, c)) { + *s++ = c; + } else if (s + 4 > p) { + break; + } else { + *s++ = '\\'; + *s++ = '0' + ((c & 0300) >> 6); + *s++ = '0' + ((c & 070) >> 3); + *s++ = '0' + (c & 07); + } + } + return NULL; +} +EXPORT_SYMBOL(mangle_path); + +/** + * seq_path - seq_file interface to print a pathname + * @m: the seq_file handle + * @path: the struct path to print + * @esc: set of characters to escape in the output + * + * return the absolute path of 'path', as represented by the + * dentry / mnt pair in the path parameter. + */ +int seq_path(struct seq_file *m, struct path *path, char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = d_path(path, buf, size); + if (!IS_ERR(p)) { + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + } + } + seq_commit(m, res); + + return res; +} +EXPORT_SYMBOL(seq_path); + +/* + * Same as seq_path, but relative to supplied root. + */ +int seq_path_root(struct seq_file *m, struct path *path, struct path *root, + char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -ENAMETOOLONG; + + if (size) { + char *p; + + p = __d_path(path, root, buf, size); + if (!p) + return SEQ_SKIP; + res = PTR_ERR(p); + if (!IS_ERR(p)) { + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + else + res = -ENAMETOOLONG; + } + } + seq_commit(m, res); + + return res < 0 && res != -ENAMETOOLONG ? res : 0; +} + +/* + * returns the path of the 'dentry' from the root of its filesystem. 
+ */ +int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = dentry_path(dentry, buf, size); + if (!IS_ERR(p)) { + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + } + } + seq_commit(m, res); + + return res; +} + +int seq_bitmap(struct seq_file *m, const unsigned long *bits, + unsigned int nr_bits) +{ + if (m->count < m->size) { + int len = bitmap_scnprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_bitmap); + +int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, + unsigned int nr_bits) +{ + if (m->count < m->size) { + int len = bitmap_scnlistprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_bitmap_list); + +static void *single_start(struct seq_file *p, loff_t *pos) +{ + return NULL + (*pos == 0); +} + +static void *single_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void single_stop(struct seq_file *p, void *v) +{ +} + +int single_open(struct file *file, int (*show)(struct seq_file *, void *), + void *data) +{ + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + int res = -ENOMEM; + + if (op) { + op->start = single_start; + op->next = single_next; + op->stop = single_stop; + op->show = show; + res = seq_open(file, op); + if (!res) + ((struct seq_file *)file->private_data)->private = data; + else + kfree(op); + } + return res; +} +EXPORT_SYMBOL(single_open); + +int single_release(struct inode *inode, struct file *file) +{ + const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; + int res = seq_release(inode, file); + kfree(op); + return res; +} +EXPORT_SYMBOL(single_release); + +int seq_release_private(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + kfree(seq->private); + seq->private = NULL; + return seq_release(inode, file); +} +EXPORT_SYMBOL(seq_release_private); + +void *__seq_open_private(struct file *f, const struct seq_operations *ops, + int psize) +{ + int rc; + void *private; + struct seq_file *seq; + + private = kzalloc(psize, GFP_KERNEL); + if (private == NULL) + goto out; + + rc = seq_open(f, ops); + if (rc < 0) + goto out_free; + + seq = f->private_data; + seq->private = private; + return private; + +out_free: + kfree(private); +out: + return NULL; +} +EXPORT_SYMBOL(__seq_open_private); + +int seq_open_private(struct file *filp, const struct seq_operations *ops, + int psize) +{ + return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(seq_open_private); + +int seq_putc(struct seq_file *m, char c) +{ + if (m->count < m->size) { + m->buf[m->count++] = c; + return 0; + } + return -1; +} +EXPORT_SYMBOL(seq_putc); + +int seq_puts(struct seq_file *m, const char *s) +{ + int len = strlen(s); + if (m->count + len < m->size) { + memcpy(m->buf + m->count, s, len); + m->count += len; + return 0; + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_puts); + +/** + * seq_write - write arbitrary data to buffer + * @seq: seq_file identifying the buffer to which data should be written + * @data: data address + * @len: number of bytes + * + * Return 0 on success, non-zero otherwise. 
+ */ +int seq_write(struct seq_file *seq, const void *data, size_t len) +{ + if (seq->count + len < seq->size) { + memcpy(seq->buf + seq->count, data, len); + seq->count += len; + return 0; + } + seq->count = seq->size; + return -1; +} +EXPORT_SYMBOL(seq_write); + +struct list_head *seq_list_start(struct list_head *head, loff_t pos) +{ + struct list_head *lh; + + list_for_each(lh, head) + if (pos-- == 0) + return lh; + + return NULL; +} +EXPORT_SYMBOL(seq_list_start); + +struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) +{ + if (!pos) + return head; + + return seq_list_start(head, pos - 1); +} +EXPORT_SYMBOL(seq_list_start_head); + +struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) +{ + struct list_head *lh; + + lh = ((struct list_head *)v)->next; + ++*ppos; + return lh == head ? NULL : lh; +} +EXPORT_SYMBOL(seq_list_next); + +/** + * seq_hlist_start - start an iteration of a hlist + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). + */ +struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) +{ + struct hlist_node *node; + + hlist_for_each(node, head) + if (pos-- == 0) + return node; + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start); + +/** + * seq_hlist_start_head - start an iteration of a hlist + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). Call this function if you want to + * print a header at the top of the output. + */ +struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) +{ + if (!pos) + return SEQ_START_TOKEN; + + return seq_hlist_start(head, pos - 1); +} +EXPORT_SYMBOL(seq_hlist_start_head); + +/** + * seq_hlist_next - move to the next position of the hlist + * @v: the current iterator + * @head: the head of the hlist + * @ppos: the current position + * + * Called at seq_file->op->next(). + */ +struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, + loff_t *ppos) +{ + struct hlist_node *node = v; + + ++*ppos; + if (v == SEQ_START_TOKEN) + return head->first; + else + return node->next; +} +EXPORT_SYMBOL(seq_hlist_next); + +/** + * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, + loff_t pos) +{ + struct hlist_node *node; + + __hlist_for_each_rcu(node, head) + if (pos-- == 0) + return node; + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start_rcu); + +/** + * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). Call this function if you want to + * print a header at the top of the output. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). 
+ */
+struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
+					     loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head_rcu);
+
+/**
+ * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
+ * @v: the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_next_rcu(void *v,
+				      struct hlist_head *head,
+				      loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return rcu_dereference(head->first);
+	else
+		return rcu_dereference(node->next);
+}
+EXPORT_SYMBOL(seq_hlist_next_rcu);
diff --git a/fs/signalfd.c b/fs/signalfd.c
new file mode 100644
index 00000000..7ae2a574
--- /dev/null
+++ b/fs/signalfd.c
@@ -0,0 +1,295 @@
+/*
+ * fs/signalfd.c
+ *
+ * Copyright (C) 2003 Linus Torvalds
+ *
+ * Mon Mar 5, 2007: Davide Libenzi
+ * Changed ->read() to return a siginfo structure instead of signal number.
+ * Fixed locking in ->poll().
+ * Added sighand-detach notification.
+ * Added fd re-use in sys_signalfd() syscall.
+ * Now using anonymous inode source.
+ * Thanks to Oleg Nesterov for useful code review and suggestions.
+ * More comments and suggestions from Arnd Bergmann.
+ * Sat May 19, 2007: Davi E. M. Arnaut
+ * Retrieve multiple signals with one read() call
+ * Sun Jul 15, 2007: Davide Libenzi
+ * Attach to the sighand only during read() and poll().
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+void signalfd_cleanup(struct sighand_struct *sighand)
+{
+	wait_queue_head_t *wqh = &sighand->signalfd_wqh;
+	/*
+	 * The lockless check can race with remove_wait_queue() in progress,
+	 * but in this case its caller should run under rcu_read_lock() and
+	 * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
+	 */
+	if (likely(!waitqueue_active(wqh)))
+		return;
+
+	/* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
+	wake_up_poll(wqh, POLLHUP | POLLFREE);
+}
+
+struct signalfd_ctx {
+	sigset_t sigmask;
+};
+
+static int signalfd_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static unsigned int signalfd_poll(struct file *file, poll_table *wait)
+{
+	struct signalfd_ctx *ctx = file->private_data;
+	unsigned int events = 0;
+
+	poll_wait(file, &current->sighand->signalfd_wqh, wait);
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (next_signal(&current->pending, &ctx->sigmask) ||
+	    next_signal(&current->signal->shared_pending,
+			&ctx->sigmask))
+		events |= POLLIN;
+	spin_unlock_irq(&current->sighand->siglock);
+
+	return events;
+}
+
+/*
+ * Copied from copy_siginfo_to_user() in kernel/signal.c
+ */
+static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
+			     siginfo_t const *kinfo)
+{
+	long err;
+
+	BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);
+
+	/*
+	 * Unused members should be zero ...
+	 */
+	err = __clear_user(uinfo, sizeof(*uinfo));
+
+	/*
+	 * If you change siginfo_t structure, please be sure
+	 * this code is fixed accordingly.
+ */
+	err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo);
+	err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno);
+	err |= __put_user((short) kinfo->si_code, &uinfo->ssi_code);
+	switch (kinfo->si_code & __SI_MASK) {
+	case __SI_KILL:
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		break;
+	case __SI_TIMER:
+		err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
+		err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
+		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		break;
+	case __SI_POLL:
+		err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
+		err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd);
+		break;
+	case __SI_FAULT:
+		err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
+#ifdef __ARCH_SI_TRAPNO
+		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
+#endif
+#ifdef BUS_MCEERR_AO
+		/*
+		 * Other callers might not initialize the si_lsb field,
+		 * so check explicitly for the right codes here.
+		 */
+		if (kinfo->si_code == BUS_MCEERR_AR ||
+		    kinfo->si_code == BUS_MCEERR_AO)
+			err |= __put_user((short) kinfo->si_addr_lsb,
+					  &uinfo->ssi_addr_lsb);
+#endif
+		break;
+	case __SI_CHLD:
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		err |= __put_user(kinfo->si_status, &uinfo->ssi_status);
+		err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime);
+		err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime);
+		break;
+	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		break;
+	default:
+		/*
+		 * This case catches also the signals queued by sigqueue().
+		 */
+		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+		break;
+	}
+
+	return err ? -EFAULT: sizeof(*uinfo);
+}
+
+static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
+				int nonblock)
+{
+	ssize_t ret;
+	DECLARE_WAITQUEUE(wait, current);
+
+	spin_lock_irq(&current->sighand->siglock);
+	ret = dequeue_signal(current, &ctx->sigmask, info);
+	switch (ret) {
+	case 0:
+		if (!nonblock)
+			break;
+		ret = -EAGAIN;
+	default:
+		spin_unlock_irq(&current->sighand->siglock);
+		return ret;
+	}
+
+	add_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		ret = dequeue_signal(current, &ctx->sigmask, info);
+		if (ret != 0)
+			break;
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		spin_unlock_irq(&current->sighand->siglock);
+		schedule();
+		spin_lock_irq(&current->sighand->siglock);
+	}
+	spin_unlock_irq(&current->sighand->siglock);
+
+	remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	__set_current_state(TASK_RUNNING);
+
+	return ret;
+}
+
+/*
+ * Returns a multiple of the size of a "struct signalfd_siginfo", or a negative
+ * error code. The "count" parameter must be at least the size of a
+ * "struct signalfd_siginfo".
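+ *
+ * Purely as an illustration (a hedged sketch, not part of this file): a
+ * userspace consumer normally blocks the signals it cares about with
+ * sigprocmask() and then reads fixed-size records from the descriptor
+ * returned by signalfd(); error handling is omitted here.
+ *
+ *	sigset_t mask;
+ *	struct signalfd_siginfo ssi;
+ *	int sfd;
+ *
+ *	sigemptyset(&mask);
+ *	sigaddset(&mask, SIGINT);
+ *	sigprocmask(SIG_BLOCK, &mask, NULL);
+ *	sfd = signalfd(-1, &mask, 0);
+ *	if (sfd >= 0)
+ *		read(sfd, &ssi, sizeof(ssi));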
+ */
+static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
+			     loff_t *ppos)
+{
+	struct signalfd_ctx *ctx = file->private_data;
+	struct signalfd_siginfo __user *siginfo;
+	int nonblock = file->f_flags & O_NONBLOCK;
+	ssize_t ret, total = 0;
+	siginfo_t info;
+
+	count /= sizeof(struct signalfd_siginfo);
+	if (!count)
+		return -EINVAL;
+
+	siginfo = (struct signalfd_siginfo __user *) buf;
+	do {
+		ret = signalfd_dequeue(ctx, &info, nonblock);
+		if (unlikely(ret <= 0))
+			break;
+		ret = signalfd_copyinfo(siginfo, &info);
+		if (ret < 0)
+			break;
+		siginfo++;
+		total += ret;
+		nonblock = 1;
+	} while (--count);
+
+	return total ? total: ret;
+}
+
+static const struct file_operations signalfd_fops = {
+	.release	= signalfd_release,
+	.poll		= signalfd_poll,
+	.read		= signalfd_read,
+	.llseek		= noop_llseek,
+};
+
+SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask, int, flags)
+{
+	sigset_t sigmask;
+	struct signalfd_ctx *ctx;
+
+	/* Check the SFD_* constants for consistency. */
+	BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
+		return -EINVAL;
+
+	if (sizemask != sizeof(sigset_t) ||
+	    copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
+		return -EINVAL;
+	sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+	signotset(&sigmask);
+
+	if (ufd == -1) {
+		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+		if (!ctx)
+			return -ENOMEM;
+
+		ctx->sigmask = sigmask;
+
+		/*
+		 * When we call this, the initialization must be complete, since
+		 * anon_inode_getfd() will install the fd.
+		 */
+		ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
+				       O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK)));
+		if (ufd < 0)
+			kfree(ctx);
+	} else {
+		struct file *file = fget(ufd);
+		if (!file)
+			return -EBADF;
+		ctx = file->private_data;
+		if (file->f_op != &signalfd_fops) {
+			fput(file);
+			return -EINVAL;
+		}
+		spin_lock_irq(&current->sighand->siglock);
+		ctx->sigmask = sigmask;
+		spin_unlock_irq(&current->sighand->siglock);
+
+		wake_up(&current->sighand->signalfd_wqh);
+		fput(file);
+	}
+
+	return ufd;
+}
+
+SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask)
+{
+	return sys_signalfd4(ufd, user_mask, sizemask, 0);
+}
diff --git a/fs/splice.c b/fs/splice.c
new file mode 100644
index 00000000..9d890085
--- /dev/null
+++ b/fs/splice.c
@@ -0,0 +1,2047 @@
+/*
+ * "splice": joining two ropes together by interweaving their strands.
+ *
+ * This is the "extended pipe" functionality, where a pipe is used as
+ * an arbitrary in-memory buffer. Think of a pipe as a small kernel
+ * buffer that you can use to transfer data from one end to the other.
+ *
+ * The traditional unix read/write is extended with a "splice()" operation
+ * that transfers data buffers to or from a pipe buffer.
+ *
+ * Named by Larry McVoy, original implementation from Linus, extended by
+ * Jens to support splicing to files, network, direct splicing, etc and
+ * fixing lots of bugs.
+ *
+ * Copyright (C) 2005-2006 Jens Axboe
+ * Copyright (C) 2005-2006 Linus Torvalds
+ * Copyright (C) 2006 Ingo Molnar
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Attempt to steal a page from a pipe buffer. This should perhaps go into
+ * a vm helper function, it's already simplified quite a bit by the
+ * addition of remove_mapping().
If success is returned, the caller may + * attempt to reuse this page for another destination. + */ +static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + struct address_space *mapping; + + lock_page(page); + + mapping = page_mapping(page); + if (mapping) { + WARN_ON(!PageUptodate(page)); + + /* + * At least for ext2 with nobh option, we need to wait on + * writeback completing on this page, since we'll remove it + * from the pagecache. Otherwise truncate wont wait on the + * page, allowing the disk blocks to be reused by someone else + * before we actually wrote our data to them. fs corruption + * ensues. + */ + wait_on_page_writeback(page); + + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + goto out_unlock; + + /* + * If we succeeded in removing the mapping, set LRU flag + * and return good. + */ + if (remove_mapping(mapping, page)) { + buf->flags |= PIPE_BUF_FLAG_LRU; + return 0; + } + } + + /* + * Raced with truncate or failed to remove page from current + * address space, unlock and return failure. + */ +out_unlock: + unlock_page(page); + return 1; +} + +static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); + buf->flags &= ~PIPE_BUF_FLAG_LRU; +} + +/* + * Check whether the contents of buf is OK to access. Since the content + * is a page cache page, IO may be in flight. + */ +static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + int err; + + if (!PageUptodate(page)) { + lock_page(page); + + /* + * Page got truncated/unhashed. This will cause a 0-byte + * splice, if this is the first page. + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * Uh oh, read-error from disk. + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } + + /* + * Page is ok afterall, we are done. + */ + unlock_page(page); + } + + return 0; +error: + unlock_page(page); + return err; +} + +static const struct pipe_buf_operations page_cache_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = page_cache_pipe_buf_confirm, + .release = page_cache_pipe_buf_release, + .steal = page_cache_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) + return 1; + + buf->flags |= PIPE_BUF_FLAG_LRU; + return generic_pipe_buf_steal(pipe, buf); +} + +static const struct pipe_buf_operations user_page_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = page_cache_pipe_buf_release, + .steal = user_page_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static void wakeup_pipe_readers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +} + +/** + * splice_to_pipe - fill passed data into a pipe + * @pipe: pipe to fill + * @spd: data to fill + * + * Description: + * @spd contains a map of pages and len/offset tuples, along with + * the struct pipe_buf_operations associated with these pages. This + * function will link that data to the pipe. 
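+ *
+ * As a rough sketch of the calling convention (mirroring what
+ * __generic_file_splice_read() below sets up; "flags" stands in for the
+ * caller's splice flags), a caller prepares the descriptor like this:
+ *
+ *	struct page *pages[PIPE_DEF_BUFFERS];
+ *	struct partial_page partial[PIPE_DEF_BUFFERS];
+ *	struct splice_pipe_desc spd = {
+ *		.pages		= pages,
+ *		.partial	= partial,
+ *		.flags		= flags,
+ *		.ops		= &page_cache_pipe_buf_ops,
+ *		.spd_release	= spd_release_page,
+ *	};
+ *
+ * then fills spd.pages[], spd.partial[] and spd.nr_pages and calls
+ * splice_to_pipe(pipe, &spd).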
+ * + */ +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + unsigned int spd_pages = spd->nr_pages; + int ret, do_wakeup, page_nr; + + ret = 0; + do_wakeup = 0; + page_nr = 0; + + pipe_lock(pipe); + + for (;;) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; + buf->ops = spd->ops; + if (spd->flags & SPLICE_F_GIFT) + buf->flags |= PIPE_BUF_FLAG_GIFT; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->inode) + do_wakeup = 1; + + if (!--spd->nr_pages) + break; + if (pipe->nrbufs < pipe->buffers) + continue; + + break; + } + + if (spd->flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + pipe_unlock(pipe); + + if (do_wakeup) + wakeup_pipe_readers(pipe); + + while (page_nr < spd_pages) + spd->spd_release(spd, page_nr++); + + return ret; +} + +static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +{ + page_cache_release(spd->pages[i]); +} + +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + +static int +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct address_space *mapping = in->f_mapping; + unsigned int loff, nr_pages, req_pages; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *page; + pgoff_t index, end_index; + loff_t isize; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, pipe->buffers); + + /* + * Lookup the (hopefully) full range of pages we need. + */ + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); + index += spd.nr_pages; + + /* + * If find_get_pages_contig() returned fewer pages than we needed, + * readahead/allocate the rest and fill in the holes. 
+ */ + if (spd.nr_pages < nr_pages) + page_cache_sync_readahead(mapping, &in->f_ra, in, + index, req_pages - spd.nr_pages); + + error = 0; + while (spd.nr_pages < nr_pages) { + /* + * Page could be there, find_get_pages_contig() breaks on + * the first hole. + */ + page = find_get_page(mapping, index); + if (!page) { + /* + * page didn't exist, allocate one. + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL); + if (unlikely(error)) { + page_cache_release(page); + if (error == -EEXIST) + continue; + break; + } + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); + } + + spd.pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. + */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = spd.pages[page_nr]; + + if (PageReadahead(page)) + page_cache_async_readahead(mapping, &in->f_ra, in, + page, index, req_pages - page_nr); + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. + */ + if (!page->mapping) { + unlock_page(page); + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); + if (unlikely(error)) { + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ + if (error == AOP_TRUNCATED_PAGE) + error = 0; + + break; + } + } +fill_it: + /* + * i_size must be checked after PageUptodate. + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + unsigned int plen; + + /* + * max good bytes in this page + */ + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) + break; + + /* + * force quit after adding this page + */ + this_len = min(this_len, plen - loff); + len = this_len; + } + + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; + len -= this_len; + loff = 0; + spd.nr_pages++; + index++; + } + + /* + * Release any pages at the end, if we quit early. 'page_nr' is how far + * we got, 'nr_pages' is how many pages are in the map. 
+ */ + while (page_nr < nr_pages) + page_cache_release(spd.pages[page_nr++]); + in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(pipe, &spd); + return error; +} + +/** + * generic_file_splice_read - splice data from file to a pipe + * @in: file to splice from + * @ppos: position in @in + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given file and fill them into a pipe. Can be + * used as long as the address_space operations for the source implements + * a readpage() hook. + * + */ +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + loff_t isize, left; + int ret; + + isize = i_size_read(in->f_mapping->host); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); + if (ret > 0) { + *ppos += ret; + file_accessed(in); + } + + return ret; +} +EXPORT_SYMBOL(generic_file_splice_read); + +static const struct pipe_buf_operations default_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) +{ + mm_segment_t old_fs; + loff_t pos = offset; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); + + return res; +} + +static ssize_t kernel_write(struct file *file, const char *buf, size_t count, + loff_t pos) +{ + mm_segment_t old_fs; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_write(file, (const char __user *)buf, count, &pos); + set_fs(old_fs); + + return res; +} + +ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int nr_pages; + unsigned int nr_freed; + size_t offset; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; + ssize_t res; + size_t this_len; + int error; + int i; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + res = -ENOMEM; + vec = __vec; + if (pipe->buffers > PIPE_DEF_BUFFERS) { + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } + + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { + struct page *page; + + page = alloc_page(GFP_USER); + error = -ENOMEM; + if (!page) + goto err; + + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + vec[i].iov_base = (void __user *) page_address(page); + vec[i].iov_len = this_len; + spd.pages[i] = page; + spd.nr_pages++; + len -= this_len; + offset = 0; + } + + res = kernel_readv(in, vec, spd.nr_pages, 
*ppos); + if (res < 0) { + error = res; + goto err; + } + + error = 0; + if (!res) + goto err; + + nr_freed = 0; + for (i = 0; i < spd.nr_pages; i++) { + this_len = min_t(size_t, vec[i].iov_len, res); + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; + if (!this_len) { + __free_page(spd.pages[i]); + spd.pages[i] = NULL; + nr_freed++; + } + res -= this_len; + } + spd.nr_pages -= nr_freed; + + res = splice_to_pipe(pipe, &spd); + if (res > 0) + *ppos += res; + +shrink_ret: + if (vec != __vec) + kfree(vec); + splice_shrink_spd(pipe, &spd); + return res; + +err: + for (i = 0; i < spd.nr_pages; i++) + __free_page(spd.pages[i]); + + res = error; + goto shrink_ret; +} +EXPORT_SYMBOL(default_file_splice_read); + +/* + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' + * using sendpage(). Return the number of bytes sent. + */ +static int pipe_to_sendpage(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, struct splice_desc *sd) +{ + struct file *file = sd->u.file; + loff_t pos = sd->pos; + int more; + + if (!likely(file->f_op && file->f_op->sendpage)) + return -EINVAL; + + more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; + if (sd->len < sd->total_len) + more |= MSG_SENDPAGE_NOTLAST; + return file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); +} + +/* + * This is a little more tricky than the file -> pipe splicing. There are + * basically three cases: + * + * - Destination page already exists in the address space and there + * are users of it. For that case we have no other option that + * copying the data. Tough luck. + * - Destination page already exists in the address space, but there + * are no users of it. Make sure it's uptodate, then drop it. Fall + * through to last case. + * - Destination page does not exist, we can add the pipe page to + * the page cache and avoid the copy. + * + * If asked to move pages to the output file (SPLICE_F_MOVE is set in + * sd->flags), we attempt to migrate pages from the pipe to the output + * file address space page cache. This is possible if no one else has + * the pipe page referenced outside of the pipe and page cache. If + * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create + * a new page in the output file page cache and fill/dirty that. + */ +int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; + struct address_space *mapping = file->f_mapping; + unsigned int offset, this_len; + struct page *page; + void *fsdata; + int ret; + + offset = sd->pos & ~PAGE_CACHE_MASK; + + this_len = sd->len; + if (this_len + offset > PAGE_CACHE_SIZE) + this_len = PAGE_CACHE_SIZE - offset; + + ret = pagecache_write_begin(file, mapping, sd->pos, this_len, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (unlikely(ret)) + goto out; + + if (buf->page != page) { + /* + * Careful, ->map() uses KM_USER0! 
+ */ + char *src = buf->ops->map(pipe, buf, 1); + char *dst = kmap_atomic(page, KM_USER1); + + memcpy(dst + offset, src + buf->offset, this_len); + flush_dcache_page(page); + kunmap_atomic(dst, KM_USER1); + buf->ops->unmap(pipe, buf, src); + } + ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, + page, fsdata); +out: + return ret; +} +EXPORT_SYMBOL(pipe_to_file); + +static void wakeup_pipe_writers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +} + +/** + * splice_from_pipe_feed - feed available data from a pipe to a file + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function loops over the pipe and calls @actor to do the + * actual moving of a single struct pipe_buffer to the desired + * destination. It returns when there's no more buffers left in + * the pipe or if the requested number of bytes (@sd->total_len) + * have been copied. It returns a positive number (one) if the + * pipe needs to be filled with more data, zero if the required + * number of bytes have been copied and -errno on error. + * + * This, together with splice_from_pipe_{begin,end,next}, may be + * used to implement the functionality of __splice_from_pipe() when + * locking is required around copying the pipe buffers to the + * destination. + */ +int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + while (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + const struct pipe_buf_operations *ops = buf->ops; + + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + return ret; + } + + ret = actor(pipe, buf, sd); + if (ret <= 0) + return ret; + + buf->offset += ret; + buf->len -= ret; + + sd->num_spliced += ret; + sd->len -= ret; + sd->pos += ret; + sd->total_len -= ret; + + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + if (pipe->inode) + sd->need_wakeup = true; + } + + if (!sd->total_len) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_feed); + +/** + * splice_from_pipe_next - wait for some data to splice from + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wait for some data and return a positive + * value (one) if pipe buffers are available. It will return zero + * or -errno if no more data needs to be spliced. + */ +int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + while (!pipe->nrbufs) { + if (!pipe->writers) + return 0; + + if (!pipe->waiting_writers && sd->num_spliced) + return 0; + + if (sd->flags & SPLICE_F_NONBLOCK) + return -EAGAIN; + + if (signal_pending(current)) + return -ERESTARTSYS; + + if (sd->need_wakeup) { + wakeup_pipe_writers(pipe); + sd->need_wakeup = false; + } + + pipe_wait(pipe); + } + + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_next); + +/** + * splice_from_pipe_begin - start splicing from pipe + * @sd: information about the splice operation + * + * Description: + * This function should be called before a loop containing + * splice_from_pipe_next() and splice_from_pipe_feed() to + * initialize the necessary fields of @sd. 
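+ *
+ * The canonical shape of that loop (this is what __splice_from_pipe()
+ * below does) is:
+ *
+ *	splice_from_pipe_begin(sd);
+ *	do {
+ *		ret = splice_from_pipe_next(pipe, sd);
+ *		if (ret > 0)
+ *			ret = splice_from_pipe_feed(pipe, sd, actor);
+ *	} while (ret > 0);
+ *	splice_from_pipe_end(pipe, sd);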
+ */ +void splice_from_pipe_begin(struct splice_desc *sd) +{ + sd->num_spliced = 0; + sd->need_wakeup = false; +} +EXPORT_SYMBOL(splice_from_pipe_begin); + +/** + * splice_from_pipe_end - finish splicing from pipe + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wake up pipe writers if necessary. It should + * be called after a loop containing splice_from_pipe_next() and + * splice_from_pipe_feed(). + */ +void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + if (sd->need_wakeup) + wakeup_pipe_writers(pipe); +} +EXPORT_SYMBOL(splice_from_pipe_end); + +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * + */ +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + splice_from_pipe_begin(sd); + do { + ret = splice_from_pipe_next(pipe, sd); + if (ret > 0) + ret = splice_from_pipe_feed(pipe, sd, actor); + } while (ret > 0); + splice_from_pipe_end(pipe, sd); + + return sd->num_spliced ? sd->num_spliced : ret; +} +EXPORT_SYMBOL(__splice_from_pipe); + +/** + * splice_from_pipe - splice data from a pipe to a file + * @pipe: pipe to splice from + * @out: file to splice to + * @ppos: position in @out + * @len: how many bytes to splice + * @flags: splice modifier flags + * @actor: handler that splices the data + * + * Description: + * See __splice_from_pipe. This function locks the pipe inode, + * otherwise it's identical to __splice_from_pipe(). + * + */ +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) +{ + ssize_t ret; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, actor); + pipe_unlock(pipe); + + return ret; +} + +/** + * generic_file_splice_write - splice data from a pipe to a file + * @pipe: pipe info + * @out: file to write to + * @ppos: position in @out + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. 
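+ *
+ * When a filesystem uses this as its ->splice_write() hook, it is what
+ * ends up servicing the pipe-to-file half of a userspace splice(2) call;
+ * a hedged sketch (src_fd and dst_fd are hypothetical descriptors, error
+ * handling omitted):
+ *
+ *	int pfd[2];
+ *	ssize_t n;
+ *
+ *	pipe(pfd);
+ *	n = splice(src_fd, NULL, pfd[1], NULL, 65536, SPLICE_F_MOVE);
+ *	if (n > 0)
+ *		n = splice(pfd[0], NULL, dst_fd, NULL, n, SPLICE_F_MOVE);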
+ * + */ +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + ssize_t ret; + + pipe_lock(pipe); + + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = file_remove_suid(out); + if (!ret) { + file_update_time(out); + ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); + + pipe_unlock(pipe); + + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + unsigned long nr_pages; + int err; + + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + } + + return ret; +} + +EXPORT_SYMBOL(generic_file_splice_write); + +static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + int ret; + void *data; + + data = buf->ops->map(pipe, buf, 0); + ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + buf->ops->unmap(pipe, buf, data); + + return ret; +} + +static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t ret; + + ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); + if (ret > 0) + *ppos += ret; + + return ret; +} + +/** + * generic_splice_sendpage - splice data from a pipe to a socket + * @pipe: pipe to splice from + * @out: socket to write to + * @ppos: position in @out + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. + * + */ +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); +} + +EXPORT_SYMBOL(generic_splice_sendpage); + +/* + * Attempt to initiate a splice from pipe to file. + */ +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); + int ret; + + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = rw_verify_area(WRITE, out, ppos, len); + if (unlikely(ret < 0)) + return ret; + + if (out->f_op && out->f_op->splice_write) + splice_write = out->f_op->splice_write; + else + splice_write = default_file_splice_write; + + return splice_write(pipe, out, ppos, len, flags); +} + +/* + * Attempt to initiate a splice from a file to a pipe. 
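+ *
+ * Reached, for instance, when userspace splices from a regular file into
+ * a pipe (the output offset must be NULL for a pipe):
+ *
+ *	splice(file_fd, &off, pipe_fd, NULL, len, 0);
+ *
+ * The real work is delegated to the file's ->splice_read() method, with
+ * default_file_splice_read() as the fallback.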
+ */ +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); + int ret; + + if (unlikely(!(in->f_mode & FMODE_READ))) + return -EBADF; + + ret = rw_verify_area(READ, in, ppos, len); + if (unlikely(ret < 0)) + return ret; + + if (in->f_op && in->f_op->splice_read) + splice_read = in->f_op->splice_read; + else + splice_read = default_file_splice_read; + + return splice_read(in, ppos, pipe, len, flags); +} + +/** + * splice_direct_to_actor - splices data directly between two non-pipes + * @in: file to splice from + * @sd: actor information on where to splice to + * @actor: handles the data splicing + * + * Description: + * This is a special case helper to splice directly between two + * points, without requiring an explicit pipe. Internally an allocated + * pipe is cached in the process, and reused during the lifetime of + * that process. + * + */ +ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, + splice_direct_actor *actor) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + umode_t i_mode; + size_t len; + int i, flags; + + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! + */ + i_mode = in->f_path.dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (unlikely(!pipe)) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the splice_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * Do the splice. + */ + ret = 0; + bytes = 0; + len = sd->total_len; + flags = sd->flags; + + /* + * Don't block on output, we have to drain the direct pipe. + */ + sd->flags &= ~SPLICE_F_NONBLOCK; + + while (len) { + size_t read_len; + loff_t pos = sd->pos, prev_pos = pos; + + ret = do_splice_to(in, &pos, pipe, len, flags); + if (unlikely(ret <= 0)) + goto out_release; + + read_len = ret; + sd->total_len = read_len; + + /* + * NOTE: nonblocking mode only applies to the input. 
We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = actor(pipe, sd); + if (unlikely(ret <= 0)) { + sd->pos = prev_pos; + goto out_release; + } + + bytes += ret; + len -= ret; + sd->pos = pos; + + if (ret < read_len) { + sd->pos = prev_pos + ret; + goto out_release; + } + } + +done: + pipe->nrbufs = pipe->curbuf = 0; + file_accessed(in); + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < pipe->buffers; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + + if (!bytes) + bytes = ret; + + goto done; +} +EXPORT_SYMBOL(splice_direct_to_actor); + +static int direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; + + return do_splice_from(pipe, file, &file->f_pos, sd->total_len, + sd->flags); +} + +/** + * do_splice_direct - splices data directly between two files + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * For use by do_sendfile(). splice can easily emulate sendfile, but + * doing it in the application would incur an extra system call + * (splice in + splice out, as compared to just sendfile()). So this helper + * can splice directly through a process-private pipe. + * + */ +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) +{ + struct splice_desc sd = { + .len = len, + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + long ret; + + ret = splice_direct_to_actor(in, &sd, direct_splice_actor); + if (ret > 0) + *ppos = sd.pos; + + return ret; +} + +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); + +/* + * Determine where to splice to/from. + */ +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) +{ + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; + loff_t offset, *off; + long ret; + + ipipe = get_pipe_info(in); + opipe = get_pipe_info(out); + + if (ipipe && opipe) { + if (off_in || off_out) + return -ESPIPE; + + if (!(in->f_mode & FMODE_READ)) + return -EBADF; + + if (!(out->f_mode & FMODE_WRITE)) + return -EBADF; + + /* Splicing to self would be fun, but... 
*/ + if (ipipe == opipe) + return -EINVAL; + + return splice_pipe_to_pipe(ipipe, opipe, len, flags); + } + + if (ipipe) { + if (off_in) + return -ESPIPE; + if (off_out) { + if (!(out->f_mode & FMODE_PWRITE)) + return -EINVAL; + if (copy_from_user(&offset, off_out, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &out->f_pos; + + ret = do_splice_from(ipipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; + } + + if (opipe) { + if (off_out) + return -ESPIPE; + if (off_in) { + if (!(in->f_mode & FMODE_PREAD)) + return -EINVAL; + if (copy_from_user(&offset, off_in, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &in->f_pos; + + ret = do_splice_to(in, off, opipe, len, flags); + + if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; + } + + return -EINVAL; +} + +/* + * Map an iov into an array of pages and offset/length tupples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our ones pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial, int aligned, + unsigned int pipe_buffers) +{ + int buffers = 0, error = 0; + + while (nr_vecs) { + unsigned long off, npages; + struct iovec entry; + void __user *base; + size_t len; + int i; + + error = -EFAULT; + if (copy_from_user(&entry, iov, sizeof(entry))) + break; + + base = entry.iov_base; + len = entry.iov_len; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + error = 0; + if (unlikely(!len)) + break; + error = -EFAULT; + if (!access_ok(VERIFY_READ, base, len)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + + /* + * If asked for alignment, the offset must be zero and the + * length a multiple of the PAGE_SIZE. + */ + error = -EINVAL; + if (aligned && (off || len & ~PAGE_MASK)) + break; + + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; + + error = get_user_pages_fast((unsigned long)base, npages, + 0, &pages[buffers]); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map. + */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE - off); + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. 
+ */ + if (error < npages || buffers == pipe_buffers) + break; + + nr_vecs--; + iov++; + } + + if (buffers) + return buffers; + + return error; +} + +static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + char *src; + int ret; + + /* + * See if we can use the atomic maps, by prefaulting in the + * pages and doing an atomic copy + */ + if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { + src = buf->ops->map(pipe, buf, 1); + ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, + sd->len); + buf->ops->unmap(pipe, buf, src); + if (!ret) { + ret = sd->len; + goto out; + } + } + + /* + * No dice, use slow non-atomic map and copy + */ + src = buf->ops->map(pipe, buf, 0); + + ret = sd->len; + if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) + ret = -EFAULT; + + buf->ops->unmap(pipe, buf, src); +out: + if (ret > 0) + sd->u.userptr += ret; + return ret; +} + +/* + * For lack of a better implementation, implement vmsplice() to userspace + * as a simple copy of the pipes pages to the user iov. + */ +static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct splice_desc sd; + ssize_t size; + int error; + long ret; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + pipe_lock(pipe); + + error = ret = 0; + while (nr_segs) { + void __user *base; + size_t len; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + if (unlikely(!base)) { + error = -EFAULT; + break; + } + + if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { + error = -EFAULT; + break; + } + + sd.len = 0; + sd.total_len = len; + sd.flags = flags; + sd.u.userptr = base; + sd.pos = 0; + + size = __splice_from_pipe(pipe, &sd, pipe_to_user); + if (size < 0) { + if (!ret) + ret = size; + + break; + } + + ret += size; + + if (size < len) + break; + + nr_segs--; + iov++; + } + + pipe_unlock(pipe); + + if (!ret) + ret = error; + + return ret; +} + +/* + * vmsplice splices a user address range into a pipe. It can be thought of + * as splice-from-memory, where the regular splice is splice-from-file (or + * to file). In both cases the output is a pipe, naturally. + */ +static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + .spd_release = spd_release_page, + }; + long ret; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, flags & SPLICE_F_GIFT, + pipe->buffers); + if (spd.nr_pages <= 0) + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(pipe, &spd); + return ret; +} + +/* + * Note that vmsplice only really supports true splicing _from_ user memory + * to a pipe, not the other way around. 
Splicing from user memory is a simple + * operation that can be supported without any funky alignment restrictions + * or nasty vm tricks. We simply map in the user memory and fill them into + * a pipe. The reverse isn't quite as easy, though. There are two possible + * solutions for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Currently we punt and implement it as a normal copy, see pipe_to_user(). + * + */ +SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, + unsigned long, nr_segs, unsigned int, flags) +{ + struct file *file; + long error; + int fput; + + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + + error = -EBADF; + file = fget_light(fd, &fput); + if (file) { + if (file->f_mode & FMODE_WRITE) + error = vmsplice_to_pipe(file, iov, nr_segs, flags); + else if (file->f_mode & FMODE_READ) + error = vmsplice_to_user(file, iov, nr_segs, flags); + + fput_light(file, fput); + } + + return error; +} + +SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + size_t, len, unsigned int, flags) +{ + long error; + struct file *in, *out; + int fput_in, fput_out; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fd_in, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + out = fget_light(fd_out, &fput_out); + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_splice(in, off_in, + out, off_out, + len, flags); + fput_light(out, fput_out); + } + } + + fput_light(in, fput_in); + } + + return error; +} + +/* + * Make sure there's data to read. Wait for input if we can, otherwise + * return an appropriate error. + */ +static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs) + return 0; + + ret = 0; + pipe_lock(pipe); + + while (!pipe->nrbufs) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + pipe_wait(pipe); + } + + pipe_unlock(pipe); + return ret; +} + +/* + * Make sure there's writeable room. Wait for room if we can, otherwise + * return an appropriate error. + */ +static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs < pipe->buffers) + return 0; + + ret = 0; + pipe_lock(pipe); + + while (pipe->nrbufs >= pipe->buffers) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + break; + } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + pipe_unlock(pipe); + return ret; +} + +/* + * Splice contents of ipipe to opipe. 
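+ *
+ * Reached when both file descriptors passed to splice(2) refer to pipes,
+ * in which case the offset arguments must be NULL, e.g.:
+ *
+ *	splice(pipe_a[0], NULL, pipe_b[1], NULL, len, 0);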
+ */ +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, nbuf; + bool input_wakeup = false; + + +retry: + ret = ipipe_prep(ipipe, flags); + if (ret) + return ret; + + ret = opipe_prep(opipe, flags); + if (ret) + return ret; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (!ipipe->nrbufs && !ipipe->writers) + break; + + /* + * Cannot make any progress, because either the input + * pipe is empty or the output pipe is full. + */ + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { + /* Already processed some buffers, break */ + if (ret) + break; + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + /* + * We raced with another reader/writer and haven't + * managed to process any buffers. A zero return + * value means EOF, so retry instead. + */ + pipe_unlock(ipipe); + pipe_unlock(opipe); + goto retry; + } + + ibuf = ipipe->bufs + ipipe->curbuf; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); + obuf = opipe->bufs + nbuf; + + if (len >= ibuf->len) { + /* + * Simply move the whole buffer from ipipe to opipe + */ + *obuf = *ibuf; + ibuf->ops = NULL; + opipe->nrbufs++; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); + ipipe->nrbufs--; + input_wakeup = true; + } else { + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + obuf->len = len; + opipe->nrbufs++; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + ret += obuf->len; + len -= obuf->len; + } while (len); + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) + wakeup_pipe_readers(opipe); + + if (input_wakeup) + wakeup_pipe_writers(ipipe); + + return ret; +} + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, i = 0, nbuf; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + /* + * If we have iterated all input buffers or ran out of + * output room, break. + */ + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) + break; + + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. 
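+ * (A gifted buffer may have its underlying page stolen by the consumer;
+ * now that the same page backs two pipe buffers, it must not be stolen
+ * through both of them.)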
+ */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + ret += obuf->len; + len -= obuf->len; + i++; + } while (len); + + /* + * return EAGAIN if we have the potential of some data in the + * future, otherwise just return 0 + */ + if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) + wakeup_pipe_readers(opipe); + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = get_pipe_info(in); + struct pipe_inode_info *opipe = get_pipe_info(out); + int ret = -EINVAL; + + /* + * Duplicate the contents of ipipe to opipe without actually + * copying the data. + */ + if (ipipe && opipe && ipipe != opipe) { + /* + * Keep going, unless we encounter an error. The ipipe/opipe + * ordering doesn't really matter. + */ + ret = ipipe_prep(ipipe, flags); + if (!ret) { + ret = opipe_prep(opipe, flags); + if (!ret) + ret = link_pipe(ipipe, opipe, len, flags); + } + } + + return ret; +} + +SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig new file mode 100644 index 00000000..7797218d --- /dev/null +++ b/fs/squashfs/Kconfig @@ -0,0 +1,89 @@ +config SQUASHFS + tristate "SquashFS 4.0 - Squashed file system support" + depends on BLOCK + select ZLIB_INFLATE + help + Saying Y here includes support for SquashFS 4.0 (a Compressed + Read-Only File System). Squashfs is a highly compressed read-only + filesystem for Linux. It uses zlib, lzo or xz compression to + compress both files, inodes and directories. Inodes in the system + are very small and all blocks are packed to minimise data overhead. + Block sizes greater than 4K are supported up to a maximum of 1 Mbytes + (default block size 128K). SquashFS 4.0 supports 64 bit filesystems + and files (larger than 4GB), full uid/gid information, hard links and + timestamps. + + Squashfs is intended for general read-only filesystem use, for + archival use (i.e. in cases where a .tar.gz file may be used), and in + embedded systems where low overhead is needed. Further information + and tools are available from http://squashfs.sourceforge.net. + + If you want to compile this as a module ( = code which can be + inserted in and removed from the running kernel whenever you want), + say M here and read . The module + will be called squashfs. Note that the root file system (the one + containing the directory /) cannot be compiled as a module. + + If unsure, say N. + +config SQUASHFS_XATTR + bool "Squashfs XATTR support" + depends on SQUASHFS + help + Saying Y here includes support for extended attributes (xattrs). 
+ Xattrs are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page). + + If unsure, say N. + +config SQUASHFS_LZO + bool "Include support for LZO compressed file systems" + depends on SQUASHFS + select LZO_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with LZO compression. LZO compression is mainly + aimed at embedded systems with slower CPUs where the overheads + of zlib are too high. + + LZO is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config SQUASHFS_XZ + bool "Include support for XZ compressed file systems" + depends on SQUASHFS + select XZ_DEC + help + Saying Y here includes support for reading Squashfs file systems + compressed with XZ compression. XZ gives better compression than + the default zlib compression, at the expense of greater CPU and + memory overhead. + + XZ is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config SQUASHFS_EMBEDDED + bool "Additional option for memory-constrained systems" + depends on SQUASHFS + help + Saying Y here allows you to specify cache size. + + If unsure, say N. + +config SQUASHFS_FRAGMENT_CACHE_SIZE + int "Number of fragments cached" if SQUASHFS_EMBEDDED + depends on SQUASHFS + default "3" + help + By default SquashFS caches the last 3 fragments read from + the filesystem. Increasing this amount may mean SquashFS + has to re-read fragments less often from disk, at the expense + of extra system memory. Decreasing this amount will mean + SquashFS uses less memory at the expense of extra reads from disk. + + Note there must be at least one cached fragment. Anything + much more than three will probably not make much difference. diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile new file mode 100644 index 00000000..cecf2bea --- /dev/null +++ b/fs/squashfs/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the linux squashfs routines. +# + +obj-$(CONFIG_SQUASHFS) += squashfs.o +squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o +squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o +squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o +squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o +squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c new file mode 100644 index 00000000..ed0eb2a9 --- /dev/null +++ b/fs/squashfs/block.c @@ -0,0 +1,210 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * block.c + */ + +/* + * This file implements the low-level routines to read and decompress + * datablocks and metadata blocks. + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" + +/* + * Read the metadata block length, this is stored in the first two + * bytes of the metadata block. + */ +static struct buffer_head *get_block_length(struct super_block *sb, + u64 *cur_index, int *offset, int *length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head *bh; + + bh = sb_bread(sb, *cur_index); + if (bh == NULL) + return NULL; + + if (msblk->devblksize - *offset == 1) { + *length = (unsigned char) bh->b_data[*offset]; + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *length |= (unsigned char) bh->b_data[0] << 8; + *offset = 1; + } else { + *length = (unsigned char) bh->b_data[*offset] | + (unsigned char) bh->b_data[*offset + 1] << 8; + *offset += 2; + + if (*offset == msblk->devblksize) { + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *offset = 0; + } + } + + return bh; +} + + +/* + * Read and decompress a metadata block or datablock. Length is non-zero + * if a datablock is being read (the size is stored elsewhere in the + * filesystem), otherwise the length is obtained from the first two bytes of + * the metadata block. A bit in the length field indicates if the block + * is stored uncompressed in the filesystem (usually because compression + * generated a larger block - this does occasionally happen with zlib). + */ +int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, + int length, u64 *next_index, int srclength, int pages) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head **bh; + int offset = index & ((1 << msblk->devblksize_log2) - 1); + u64 cur_index = index >> msblk->devblksize_log2; + int bytes, compressed, b = 0, k = 0, page = 0, avail; + + bh = kcalloc(((srclength + msblk->devblksize - 1) + >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); + if (bh == NULL) + return -ENOMEM; + + if (length) { + /* + * Datablock. + */ + bytes = -offset; + compressed = SQUASHFS_COMPRESSED_BLOCK(length); + length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); + if (next_index) + *next_index = index + length; + + TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", + index, compressed ? "" : "un", length, srclength); + + if (length < 0 || length > srclength || + (index + length) > msblk->bytes_used) + goto read_failure; + + for (b = 0; bytes < length; b++, cur_index++) { + bh[b] = sb_getblk(sb, cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b, bh); + } else { + /* + * Metadata block. + */ + if ((index + 2) > msblk->bytes_used) + goto read_failure; + + bh[0] = get_block_length(sb, &cur_index, &offset, &length); + if (bh[0] == NULL) + goto read_failure; + b = 1; + + bytes = msblk->devblksize - offset; + compressed = SQUASHFS_COMPRESSED(length); + length = SQUASHFS_COMPRESSED_SIZE(length); + if (next_index) + *next_index = index + length + 2; + + TRACE("Block @ 0x%llx, %scompressed size %d\n", index, + compressed ? 
"" : "un", length); + + if (length < 0 || length > srclength || + (index + length) > msblk->bytes_used) + goto block_release; + + for (; bytes < length; b++) { + bh[b] = sb_getblk(sb, ++cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b - 1, bh + 1); + } + + if (compressed) { + length = squashfs_decompress(msblk, buffer, bh, b, offset, + length, srclength, pages); + if (length < 0) + goto read_failure; + } else { + /* + * Block is uncompressed. + */ + int i, in, pg_offset = 0; + + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + } + + for (bytes = length; k < b; k++) { + in = min(bytes, msblk->devblksize - offset); + bytes -= in; + while (in) { + if (pg_offset == PAGE_CACHE_SIZE) { + page++; + pg_offset = 0; + } + avail = min_t(int, in, PAGE_CACHE_SIZE - + pg_offset); + memcpy(buffer[page] + pg_offset, + bh[k]->b_data + offset, avail); + in -= avail; + pg_offset += avail; + offset += avail; + } + offset = 0; + put_bh(bh[k]); + } + } + + kfree(bh); + return length; + +block_release: + for (; k < b; k++) + put_bh(bh[k]); + +read_failure: + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long) index); + kfree(bh); + return -EIO; +} diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c new file mode 100644 index 00000000..f744be98 --- /dev/null +++ b/fs/squashfs/cache.c @@ -0,0 +1,428 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * cache.c + */ + +/* + * Blocks in Squashfs are compressed. To avoid repeatedly decompressing + * recently accessed data Squashfs uses two small metadata and fragment caches. + * + * This file implements a generic cache implementation used for both caches, + * plus functions layered ontop of the generic cache implementation to + * access the metadata and fragment caches. + * + * To avoid out of memory and fragmentation issues with vmalloc the cache + * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. + * + * It should be noted that the cache is not used for file datablocks, these + * are decompressed and cached in the page-cache in the normal way. The + * cache is only used to temporarily cache fragment and metadata blocks + * which have been read as as a result of a metadata (i.e. inode or + * directory) or fragment access. Because metadata and fragments are packed + * together into blocks (to gain greater compression) the read of a particular + * piece of metadata or fragment will retrieve other metadata/fragments which + * have been packed with it, these because of locality-of-reference may be read + * in the near future. 
Temporarily caching them ensures they are available for + * near future access without requiring an additional read and decompress. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" + +/* + * Look-up block in cache, and increment usage count. If not in cache, read + * and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, + struct squashfs_cache *cache, u64 block, int length) +{ + int i, n; + struct squashfs_cache_entry *entry; + + spin_lock(&cache->lock); + + while (1) { + for (i = 0; i < cache->entries; i++) + if (cache->entry[i].block == block) + break; + + if (i == cache->entries) { + /* + * Block not in cache, if all cache entries are used + * go to sleep waiting for one to become available. + */ + if (cache->unused == 0) { + cache->num_waiters++; + spin_unlock(&cache->lock); + wait_event(cache->wait_queue, cache->unused); + spin_lock(&cache->lock); + cache->num_waiters--; + continue; + } + + /* + * At least one unused cache entry. A simple + * round-robin strategy is used to choose the entry to + * be evicted from the cache. + */ + i = cache->next_blk; + for (n = 0; n < cache->entries; n++) { + if (cache->entry[i].refcount == 0) + break; + i = (i + 1) % cache->entries; + } + + cache->next_blk = (i + 1) % cache->entries; + entry = &cache->entry[i]; + + /* + * Initialise chosen cache entry, and fill it in from + * disk. + */ + cache->unused--; + entry->block = block; + entry->refcount = 1; + entry->pending = 1; + entry->num_waiters = 0; + entry->error = 0; + spin_unlock(&cache->lock); + + entry->length = squashfs_read_data(sb, entry->data, + block, length, &entry->next_index, + cache->block_size, cache->pages); + + spin_lock(&cache->lock); + + if (entry->length < 0) + entry->error = entry->length; + + entry->pending = 0; + + /* + * While filling this entry one or more other processes + * have looked it up in the cache, and have slept + * waiting for it to become available. + */ + if (entry->num_waiters) { + spin_unlock(&cache->lock); + wake_up_all(&entry->wait_queue); + } else + spin_unlock(&cache->lock); + + goto out; + } + + /* + * Block already in cache. Increment refcount so it doesn't + * get reused until we're finished with it, if it was + * previously unused there's one less cache entry available + * for reuse. + */ + entry = &cache->entry[i]; + if (entry->refcount == 0) + cache->unused--; + entry->refcount++; + + /* + * If the entry is currently being filled in by another process + * go to sleep waiting for it to become available. + */ + if (entry->pending) { + entry->num_waiters++; + spin_unlock(&cache->lock); + wait_event(entry->wait_queue, !entry->pending); + } else + spin_unlock(&cache->lock); + + goto out; + } + +out: + TRACE("Got %s %d, start block %lld, refcount %d, error %d\n", + cache->name, i, entry->block, entry->refcount, entry->error); + + if (entry->error) + ERROR("Unable to read %s cache entry [%llx]\n", cache->name, + block); + return entry; +} + + +/* + * Release cache entry, once usage count is zero it can be reused. + */ +void squashfs_cache_put(struct squashfs_cache_entry *entry) +{ + struct squashfs_cache *cache = entry->cache; + + spin_lock(&cache->lock); + entry->refcount--; + if (entry->refcount == 0) { + cache->unused++; + /* + * If there's any processes waiting for a block to become + * available, wake one up. 
+ */ + if (cache->num_waiters) { + spin_unlock(&cache->lock); + wake_up(&cache->wait_queue); + return; + } + } + spin_unlock(&cache->lock); +} + +/* + * Delete cache reclaiming all kmalloced buffers. + */ +void squashfs_cache_delete(struct squashfs_cache *cache) +{ + int i, j; + + if (cache == NULL) + return; + + for (i = 0; i < cache->entries; i++) { + if (cache->entry[i].data) { + for (j = 0; j < cache->pages; j++) + kfree(cache->entry[i].data[j]); + kfree(cache->entry[i].data); + } + } + + kfree(cache->entry); + kfree(cache); +} + + +/* + * Initialise cache allocating the specified number of entries, each of + * size block_size. To avoid vmalloc fragmentation issues each entry + * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers. + */ +struct squashfs_cache *squashfs_cache_init(char *name, int entries, + int block_size) +{ + int i, j; + struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); + + if (cache == NULL) { + ERROR("Failed to allocate %s cache\n", name); + return NULL; + } + + cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL); + if (cache->entry == NULL) { + ERROR("Failed to allocate %s cache\n", name); + goto cleanup; + } + + cache->next_blk = 0; + cache->unused = entries; + cache->entries = entries; + cache->block_size = block_size; + cache->pages = block_size >> PAGE_CACHE_SHIFT; + cache->pages = cache->pages ? cache->pages : 1; + cache->name = name; + cache->num_waiters = 0; + spin_lock_init(&cache->lock); + init_waitqueue_head(&cache->wait_queue); + + for (i = 0; i < entries; i++) { + struct squashfs_cache_entry *entry = &cache->entry[i]; + + init_waitqueue_head(&cache->entry[i].wait_queue); + entry->cache = cache; + entry->block = SQUASHFS_INVALID_BLK; + entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); + if (entry->data == NULL) { + ERROR("Failed to allocate %s cache entry\n", name); + goto cleanup; + } + + for (j = 0; j < cache->pages; j++) { + entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (entry->data[j] == NULL) { + ERROR("Failed to allocate %s buffer\n", name); + goto cleanup; + } + } + } + + return cache; + +cleanup: + squashfs_cache_delete(cache); + return NULL; +} + + +/* + * Copy up to length bytes from cache entry to buffer starting at offset bytes + * into the cache entry. If there's not length bytes then copy the number of + * bytes available. In all cases return the number of bytes copied. + */ +int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, + int offset, int length) +{ + int remaining = length; + + if (length == 0) + return 0; + else if (buffer == NULL) + return min(length, entry->length - offset); + + while (offset < entry->length) { + void *buff = entry->data[offset / PAGE_CACHE_SIZE] + + (offset % PAGE_CACHE_SIZE); + int bytes = min_t(int, entry->length - offset, + PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); + + if (bytes >= remaining) { + memcpy(buffer, buff, remaining); + remaining = 0; + break; + } + + memcpy(buffer, buff, bytes); + buffer += bytes; + remaining -= bytes; + offset += bytes; + } + + return length - remaining; +} + + +/* + * Read length bytes from metadata position (block is the + * start of the compressed block on disk, and offset is the offset into + * the block once decompressed). Data is packed into consecutive blocks, + * and length bytes may require reading more than one block. 
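+ *
+ * Typical usage (illustrative): read a directory header at the current
+ * metadata cursor and advance the cursor past it:
+ *
+ *	err = squashfs_read_metadata(sb, &dirh, &block, &offset,
+ *				     sizeof(dirh));
+ *
+ * On return *block/*offset point just past the bytes read, which may lie
+ * in the following compressed metadata block.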
+ */ +int squashfs_read_metadata(struct super_block *sb, void *buffer, + u64 *block, int *offset, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int bytes, copied = length; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); + + while (length) { + entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); + if (entry->error) + return entry->error; + else if (*offset >= entry->length) + return -EIO; + + bytes = squashfs_copy_data(buffer, entry, *offset, length); + if (buffer) + buffer += bytes; + length -= bytes; + *offset += bytes; + + if (*offset == entry->length) { + *block = entry->next_index; + *offset = 0; + } + + squashfs_cache_put(entry); + } + + return copied; +} + + +/* + * Look-up in the fragmment cache the fragment located at in the + * filesystem. If necessary read and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->fragment_cache, start_block, + length); +} + + +/* + * Read and decompress the datablock located at in the + * filesystem. The cache is used here to avoid duplicating locking and + * read/decompress code. + */ +struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->read_page, start_block, length); +} + + +/* + * Read a filesystem table (uncompressed sequence of bytes) from disk + */ +void *squashfs_read_table(struct super_block *sb, u64 block, int length) +{ + int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int i, res; + void *table, *buffer, **data; + + table = buffer = kmalloc(length, GFP_KERNEL); + if (table == NULL) + return ERR_PTR(-ENOMEM); + + data = kcalloc(pages, sizeof(void *), GFP_KERNEL); + if (data == NULL) { + res = -ENOMEM; + goto failed; + } + + for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) + data[i] = buffer; + + res = squashfs_read_data(sb, data, block, length | + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); + + kfree(data); + + if (res < 0) + goto failed; + + return table; + +failed: + kfree(table); + return ERR_PTR(res); +} diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c new file mode 100644 index 00000000..9f1b0bb9 --- /dev/null +++ b/fs/squashfs/decompressor.c @@ -0,0 +1,110 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * decompressor.c + */ + +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file (and decompressor.h) implements a decompressor framework for + * Squashfs, allowing multiple decompressors to be easily supported + */ + +static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { + NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 +}; + +#ifndef CONFIG_SQUASHFS_LZO +static const struct squashfs_decompressor squashfs_lzo_comp_ops = { + NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 +}; +#endif + +#ifndef CONFIG_SQUASHFS_XZ +static const struct squashfs_decompressor squashfs_xz_comp_ops = { + NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 +}; +#endif + +static const struct squashfs_decompressor squashfs_unknown_comp_ops = { + NULL, NULL, NULL, 0, "unknown", 0 +}; + +static const struct squashfs_decompressor *decompressor[] = { + &squashfs_zlib_comp_ops, + &squashfs_lzo_comp_ops, + &squashfs_xz_comp_ops, + &squashfs_lzma_unsupported_comp_ops, + &squashfs_unknown_comp_ops +}; + + +const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) +{ + int i; + + for (i = 0; decompressor[i]->id; i++) + if (id == decompressor[i]->id) + break; + + return decompressor[i]; +} + + +void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + void *strm, *buffer = NULL; + int length = 0; + + /* + * Read decompressor specific options from file system if present + */ + if (SQUASHFS_COMP_OPTS(flags)) { + buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (buffer == NULL) + return ERR_PTR(-ENOMEM); + + length = squashfs_read_data(sb, &buffer, + sizeof(struct squashfs_super_block), 0, NULL, + PAGE_CACHE_SIZE, 1); + + if (length < 0) { + strm = ERR_PTR(length); + goto finished; + } + } + + strm = msblk->decompressor->init(msblk, buffer, length); + +finished: + kfree(buffer); + + return strm; +} diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h new file mode 100644 index 00000000..8ba70cff --- /dev/null +++ b/fs/squashfs/decompressor.h @@ -0,0 +1,59 @@ +#ifndef DECOMPRESSOR_H +#define DECOMPRESSOR_H +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * decompressor.h + */ + +struct squashfs_decompressor { + void *(*init)(struct squashfs_sb_info *, void *, int); + void (*free)(void *); + int (*decompress)(struct squashfs_sb_info *, void **, + struct buffer_head **, int, int, int, int, int); + int id; + char *name; + int supported; +}; + +static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, + void *s) +{ + if (msblk->decompressor) + msblk->decompressor->free(s); +} + +static inline int squashfs_decompress(struct squashfs_sb_info *msblk, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) +{ + return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, + length, srclength, pages); +} + +#ifdef CONFIG_SQUASHFS_XZ +extern const struct squashfs_decompressor squashfs_xz_comp_ops; +#endif + +#ifdef CONFIG_SQUASHFS_LZO +extern const struct squashfs_decompressor squashfs_lzo_comp_ops; +#endif + +#endif diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c new file mode 100644 index 00000000..9dfe2ce0 --- /dev/null +++ b/fs/squashfs/dir.c @@ -0,0 +1,244 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * dir.c + */ + +/* + * This file implements code to read directories from disk. + * + * See namei.c for a description of directory organisation on disk. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const unsigned char squashfs_filetype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK +}; + +/* + * Lookup offset (f_pos) in the directory index, returning the + * metadata block containing it. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_offset(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, int index_offset, + int i_count, u64 f_pos) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int err, i, index, length = 0; + struct squashfs_dir_index dir_index; + + TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n", + i_count, f_pos); + + /* + * Translate from external f_pos to the internal f_pos. This + * is offset by 3 because we invent "." and ".." entries which are + * not actually stored in the directory. + */ + if (f_pos < 3) + return f_pos; + f_pos -= 3; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, &dir_index, &index_start, + &index_offset, sizeof(dir_index)); + if (err < 0) + break; + + index = le32_to_cpu(dir_index.index); + if (index > f_pos) + /* + * Found the index we're looking for. 
+ */ + break; + + err = squashfs_read_metadata(sb, NULL, &index_start, + &index_offset, le32_to_cpu(dir_index.size) + 1); + if (err < 0) + break; + + length = index; + *next_block = le32_to_cpu(dir_index.start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + + /* + * Translate back from internal f_pos to external f_pos. + */ + return length + 3; +} + + +static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + u64 block = squashfs_i(inode)->start + msblk->directory_table; + int offset = squashfs_i(inode)->offset, length = 0, dir_count, size, + type, err; + unsigned int inode_number; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + + TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + goto finish; + } + + /* + * Return "." and ".." entries as the first two filenames in the + * directory. To maximise compression these two entries are not + * stored in the directory, and so we invent them here. + * + * It also means that the external f_pos is offset by 3 from the + * on-disk directory f_pos. + */ + while (file->f_pos < 3) { + char *name; + int i_ino; + + if (file->f_pos == 0) { + name = "."; + size = 1; + i_ino = inode->i_ino; + } else { + name = ".."; + size = 2; + i_ino = squashfs_i(inode)->parent; + } + + TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n", + dirent, name, size, file->f_pos, i_ino, + squashfs_filetype_table[1]); + + if (filldir(dirent, name, size, file->f_pos, i_ino, + squashfs_filetype_table[1]) < 0) { + TRACE("Filldir returned less than 0\n"); + goto finish; + } + + file->f_pos += size; + } + + length = get_dir_index_using_offset(inode->i_sb, &block, &offset, + squashfs_i(inode)->dir_idx_start, + squashfs_i(inode)->dir_idx_offset, + squashfs_i(inode)->dir_idx_cnt, + file->f_pos); + + while (length < i_size_read(inode)) { + /* + * Read directory header + */ + err = squashfs_read_metadata(inode->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto failed_read; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + + /* dir_count should never be larger than 256 */ + if (dir_count > 256) + goto failed_read; + + while (dir_count--) { + /* + * Read directory entry. 
+ */ + err = squashfs_read_metadata(inode->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto failed_read; + + size = le16_to_cpu(dire->size) + 1; + + /* size should never be larger than SQUASHFS_NAME_LEN */ + if (size > SQUASHFS_NAME_LEN) + goto failed_read; + + err = squashfs_read_metadata(inode->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto failed_read; + + length += sizeof(*dire) + size; + + if (file->f_pos >= length) + continue; + + dire->name[size] = '\0'; + inode_number = le32_to_cpu(dirh.inode_number) + + ((short) le16_to_cpu(dire->inode_number)); + type = le16_to_cpu(dire->type); + + TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)" + "\n", dirent, dire->name, size, + file->f_pos, + le32_to_cpu(dirh.start_block), + le16_to_cpu(dire->offset), + inode_number, + squashfs_filetype_table[type]); + + if (filldir(dirent, dire->name, size, file->f_pos, + inode_number, + squashfs_filetype_table[type]) < 0) { + TRACE("Filldir returned less than 0\n"); + goto finish; + } + + file->f_pos = length; + } + } + +finish: + kfree(dire); + return 0; + +failed_read: + ERROR("Unable to read directory block [%llx:%x]\n", block, offset); + kfree(dire); + return 0; +} + + +const struct file_operations squashfs_dir_ops = { + .read = generic_read_dir, + .readdir = squashfs_readdir, + .llseek = default_llseek, +}; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c new file mode 100644 index 00000000..5e1101ff --- /dev/null +++ b/fs/squashfs/export.c @@ -0,0 +1,163 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * export.c + */ + +/* + * This file implements code to make Squashfs filesystems exportable (NFS etc.) + * + * The export code uses an inode lookup table to map inode numbers passed in + * filehandles to an inode location on disk. This table is stored compressed + * into metadata blocks. A second index table is used to locate these. This + * second index table for speed of access (and because it is small) is read at + * mount time and cached in memory. + * + * The inode lookup table is used only by the export code, inode disk + * locations are directly encoded in directories, enabling direct access + * without an intermediate lookup for all operations except the export ops. + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Look-up inode number (ino) in table, returning the inode location. 
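+ *
+ * Each lookup table entry is a 64-bit inode location, so with the usual
+ * 8 KiB metadata blocks inode number N is found, roughly, at:
+ *
+ *	blk    = ((N - 1) * 8) / SQUASHFS_METADATA_SIZE;
+ *	offset = ((N - 1) * 8) % SQUASHFS_METADATA_SIZE;
+ *
+ * with msblk->inode_lookup_table[blk] giving the on-disk start of the
+ * metadata block holding the entry.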
+ */ +static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); + int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); + u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]); + __le64 ino; + int err; + + TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); + + err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); + if (err < 0) + return err; + + TRACE("squashfs_inode_lookup, inode = 0x%llx\n", + (u64) le64_to_cpu(ino)); + + return le64_to_cpu(ino); +} + + +static struct dentry *squashfs_export_iget(struct super_block *sb, + unsigned int ino_num) +{ + long long ino; + struct dentry *dentry = ERR_PTR(-ENOENT); + + TRACE("Entered squashfs_export_iget\n"); + + ino = squashfs_inode_lookup(sb, ino_num); + if (ino >= 0) + dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); + + return dentry; +} + + +static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) + || fh_len < 2) + return NULL; + + return squashfs_export_iget(sb, fid->i32.ino); +} + + +static struct dentry *squashfs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) + return NULL; + + return squashfs_export_iget(sb, fid->i32.parent_ino); +} + + +static struct dentry *squashfs_get_parent(struct dentry *child) +{ + struct inode *inode = child->d_inode; + unsigned int parent_ino = squashfs_i(inode)->parent; + + return squashfs_export_iget(inode->i_sb, parent_ino); +} + + +/* + * Read uncompressed inode lookup table indexes off disk into memory + */ +__le64 *squashfs_read_inode_lookup_table(struct super_block *sb, + u64 lookup_table_start, u64 next_table, unsigned int inodes) +{ + unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); + __le64 *table; + + TRACE("In read_inode_lookup_table, length %d\n", length); + + /* Sanity check values */ + + /* there should always be at least one inode */ + if (inodes == 0) + return ERR_PTR(-EINVAL); + + /* length bytes should not extend into the next table - this check + * also traps instances where lookup_table_start is incorrectly larger + * than the next table start + */ + if (lookup_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, lookup_table_start, length); + + /* + * table[0] points to the first inode lookup table metadata block, + * this should be less than lookup_table_start + */ + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; +} + + +const struct export_operations squashfs_export_ops = { + .fh_to_dentry = squashfs_fh_to_dentry, + .fh_to_parent = squashfs_fh_to_parent, + .get_parent = squashfs_get_parent +}; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c new file mode 100644 index 00000000..38bb1c64 --- /dev/null +++ b/fs/squashfs/file.c @@ -0,0 +1,501 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. 
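/*
 * Illustration of how squashfs_inode_lookup() above indexes the inode lookup
 * table: each entry is an 8-byte disk location and metadata blocks are 8 KiB,
 * so entry (ino_num - 1) lives in lookup-table block (ino_num - 1) * 8 / 8192
 * at offset (ino_num - 1) * 8 % 8192 (cf. the SQUASHFS_LOOKUP_* macros in
 * squashfs_fs.h).  Stand-alone user-space sketch with an example value.
 */
#include <stdio.h>

int main(void)
{
        unsigned int ino_num = 1500;            /* hypothetical NFS handle inode */
        unsigned long bytes = (ino_num - 1) * 8UL;

        printf("inode %u -> lookup block %lu, offset %lu\n",
               ino_num, bytes / 8192, bytes % 8192);
        return 0;
}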
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * file.c + */ + +/* + * This file contains code for handling regular files. A regular file + * consists of a sequence of contiguous compressed blocks, and/or a + * compressed fragment block (tail-end packed block). The compressed size + * of each datablock is stored in a block list contained within the + * file inode (itself stored in one or more compressed metadata blocks). + * + * To speed up access to datablocks when reading 'large' files (256 Mbytes or + * larger), the code implements an index cache that caches the mapping from + * block index to datablock location on disk. + * + * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while + * retaining a simple and space-efficient block list on disk. The cache + * is split into slots, caching up to eight 224 GiB files (128 KiB blocks). + * Larger files use multiple slots, with 1.75 TiB files using all 8 slots. + * The index cache is designed to be memory efficient, and by default uses + * 16 KiB. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Locate cache slot in range [offset, index] for specified inode. If + * there's more than one return the slot closest to index. + */ +static struct meta_index *locate_meta_index(struct inode *inode, int offset, + int index) +{ + struct meta_index *meta = NULL; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("locate_meta_index: index %d, offset %d\n", index, offset); + + if (msblk->meta_index == NULL) + goto not_allocated; + + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + if (msblk->meta_index[i].inode_number == inode->i_ino && + msblk->meta_index[i].offset >= offset && + msblk->meta_index[i].offset <= index && + msblk->meta_index[i].locked == 0) { + TRACE("locate_meta_index: entry %d, offset %d\n", i, + msblk->meta_index[i].offset); + meta = &msblk->meta_index[i]; + offset = meta->offset; + } + } + + if (meta) + meta->locked = 1; + +not_allocated: + mutex_unlock(&msblk->meta_index_mutex); + + return meta; +} + + +/* + * Find and initialise an empty cache slot for index offset. + */ +static struct meta_index *empty_meta_index(struct inode *inode, int offset, + int skip) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + struct meta_index *meta = NULL; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip); + + if (msblk->meta_index == NULL) { + /* + * First time cache index has been used, allocate and + * initialise. The cache index could be allocated at + * mount time but doing it here means it is allocated only + * if a 'large' file is read. 
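/*
 * Worked example backing the "by default uses 16 KiB" figure in the file.c
 * comment above, using the structures from squashfs_fs.h: struct meta_entry
 * is 16 bytes and each of the SQUASHFS_META_SLOTS (8) slots holds a 16-byte
 * header plus SQUASHFS_META_ENTRIES (127) entries.  Sizes assume a typical
 * 64-bit build with no unusual padding; stand-alone sketch only.
 */
#include <stdio.h>

int main(void)
{
        unsigned int entry = 16;                /* sizeof(struct meta_entry) */
        unsigned int slot  = 16 + 127 * entry;  /* per-slot footprint        */

        printf("per slot %u bytes, total %u bytes\n", slot, 8 * slot);
        return 0;
}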
+ */ + msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS, + sizeof(*(msblk->meta_index)), GFP_KERNEL); + if (msblk->meta_index == NULL) { + ERROR("Failed to allocate meta_index\n"); + goto failed; + } + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + msblk->meta_index[i].inode_number = 0; + msblk->meta_index[i].locked = 0; + } + msblk->next_meta_index = 0; + } + + for (i = SQUASHFS_META_SLOTS; i && + msblk->meta_index[msblk->next_meta_index].locked; i--) + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + if (i == 0) { + TRACE("empty_meta_index: failed!\n"); + goto failed; + } + + TRACE("empty_meta_index: returned meta entry %d, %p\n", + msblk->next_meta_index, + &msblk->meta_index[msblk->next_meta_index]); + + meta = &msblk->meta_index[msblk->next_meta_index]; + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + meta->inode_number = inode->i_ino; + meta->offset = offset; + meta->skip = skip; + meta->entries = 0; + meta->locked = 1; + +failed: + mutex_unlock(&msblk->meta_index_mutex); + return meta; +} + + +static void release_meta_index(struct inode *inode, struct meta_index *meta) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + mutex_lock(&msblk->meta_index_mutex); + meta->locked = 0; + mutex_unlock(&msblk->meta_index_mutex); +} + + +/* + * Read the next n blocks from the block list, starting from + * metadata block . + */ +static long long read_indexes(struct super_block *sb, int n, + u64 *start_block, int *offset) +{ + int err, i; + long long block = 0; + __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + + if (blist == NULL) { + ERROR("read_indexes: Failed to allocate block_list\n"); + return -ENOMEM; + } + + while (n) { + int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2); + + err = squashfs_read_metadata(sb, blist, start_block, + offset, blocks << 2); + if (err < 0) { + ERROR("read_indexes: reading block [%llx:%x]\n", + *start_block, *offset); + goto failure; + } + + for (i = 0; i < blocks; i++) { + int size = le32_to_cpu(blist[i]); + block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size); + } + n -= blocks; + } + + kfree(blist); + return block; + +failure: + kfree(blist); + return err; +} + + +/* + * Each cache index slot has SQUASHFS_META_ENTRIES, each of which + * can cache one index -> datablock/blocklist-block mapping. We wish + * to distribute these over the length of the file, entry[0] maps index x, + * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on. + * The larger the file, the greater the skip factor. The skip factor is + * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure + * the number of metadata blocks that need to be read fits into the cache. + * If the skip factor is limited in this way then the file will use multiple + * slots. + */ +static inline int calculate_skip(int blocks) +{ + int skip = blocks / ((SQUASHFS_META_ENTRIES + 1) + * SQUASHFS_META_INDEXES); + return min(SQUASHFS_CACHED_BLKS - 1, skip + 1); +} + + +/* + * Search and grow the index cache for the specified inode, returning the + * on-disk locations of the datablock and block list metadata block + * for index (scaled to nearest cache index). 
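/*
 * Worked example of calculate_skip() above with the constants from
 * squashfs_fs.h (SQUASHFS_META_ENTRIES = 127, SQUASHFS_META_INDEXES =
 * 8192 / 4 = 2048, SQUASHFS_CACHED_BLKS = 8): a 256 MiB file of 128 KiB
 * blocks gets skip 1, while a 1.75 TiB file is capped at skip 7.
 * Stand-alone user-space sketch.
 */
#include <stdio.h>

static int calculate_skip(int blocks)
{
        int skip = blocks / ((127 + 1) * 2048);

        return skip + 1 < 7 ? skip + 1 : 7;     /* min(SQUASHFS_CACHED_BLKS - 1, skip + 1) */
}

int main(void)
{
        printf("256 MiB file (2048 blocks): skip %d\n", calculate_skip(2048));
        printf("1.75 TiB file (~14.7M blocks): skip %d\n", calculate_skip(14680064));
        return 0;
}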
+ */ +static int fill_meta_index(struct inode *inode, int index, + u64 *index_block, int *index_offset, u64 *data_block) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int skip = calculate_skip(i_size_read(inode) >> msblk->block_log); + int offset = 0; + struct meta_index *meta; + struct meta_entry *meta_entry; + u64 cur_index_block = squashfs_i(inode)->block_list_start; + int cur_offset = squashfs_i(inode)->offset; + u64 cur_data_block = squashfs_i(inode)->start; + int err, i; + + /* + * Scale index to cache index (cache slot entry) + */ + index /= SQUASHFS_META_INDEXES * skip; + + while (offset < index) { + meta = locate_meta_index(inode, offset + 1, index); + + if (meta == NULL) { + meta = empty_meta_index(inode, offset + 1, skip); + if (meta == NULL) + goto all_done; + } else { + offset = index < meta->offset + meta->entries ? index : + meta->offset + meta->entries - 1; + meta_entry = &meta->meta_entry[offset - meta->offset]; + cur_index_block = meta_entry->index_block + + msblk->inode_table; + cur_offset = meta_entry->offset; + cur_data_block = meta_entry->data_block; + TRACE("get_meta_index: offset %d, meta->offset %d, " + "meta->entries %d\n", offset, meta->offset, + meta->entries); + TRACE("get_meta_index: index_block 0x%llx, offset 0x%x" + " data_block 0x%llx\n", cur_index_block, + cur_offset, cur_data_block); + } + + /* + * If necessary grow cache slot by reading block list. Cache + * slot is extended up to index or to the end of the slot, in + * which case further slots will be used. + */ + for (i = meta->offset + meta->entries; i <= index && + i < meta->offset + SQUASHFS_META_ENTRIES; i++) { + int blocks = skip * SQUASHFS_META_INDEXES; + long long res = read_indexes(inode->i_sb, blocks, + &cur_index_block, &cur_offset); + + if (res < 0) { + if (meta->entries == 0) + /* + * Don't leave an empty slot on read + * error allocated to this inode... + */ + meta->inode_number = 0; + err = res; + goto failed; + } + + cur_data_block += res; + meta_entry = &meta->meta_entry[i - meta->offset]; + meta_entry->index_block = cur_index_block - + msblk->inode_table; + meta_entry->offset = cur_offset; + meta_entry->data_block = cur_data_block; + meta->entries++; + offset++; + } + + TRACE("get_meta_index: meta->offset %d, meta->entries %d\n", + meta->offset, meta->entries); + + release_meta_index(inode, meta); + } + +all_done: + *index_block = cur_index_block; + *index_offset = cur_offset; + *data_block = cur_data_block; + + /* + * Scale cache index (cache slot entry) to index + */ + return offset * SQUASHFS_META_INDEXES * skip; + +failed: + release_meta_index(inode, meta); + return err; +} + + +/* + * Get the on-disk location and compressed size of the datablock + * specified by index. Fill_meta_index() does most of the work. + */ +static int read_blocklist(struct inode *inode, int index, u64 *block) +{ + u64 start; + long long blks; + int offset; + __le32 size; + int res = fill_meta_index(inode, index, &start, &offset, block); + + TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset" + " 0x%x, block 0x%llx\n", res, index, start, offset, + *block); + + if (res < 0) + return res; + + /* + * res contains the index of the mapping returned by fill_meta_index(), + * this will likely be less than the desired index (because the + * meta_index cache works at a higher granularity). Read any + * extra block indexes needed. 
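/*
 * Sketch of how a 32-bit block-list entry summed by read_indexes() above is
 * interpreted (cf. SQUASHFS_COMPRESSED_BIT_BLOCK in squashfs_fs.h): bit 24 is
 * set when the datablock is stored uncompressed, and the remaining bits give
 * its on-disk length.  Stand-alone example with a made-up entry value.
 */
#include <stdio.h>

int main(void)
{
        unsigned int entry = (1u << 24) | 131072;       /* uncompressed 128 KiB block */

        printf("on-disk length %u bytes, %s\n",
               entry & ~(1u << 24),
               (entry & (1u << 24)) ? "uncompressed" : "compressed");
        return 0;
}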
+ */ + if (res < index) { + blks = read_indexes(inode->i_sb, index - res, &start, &offset); + if (blks < 0) + return (int) blks; + *block += blks; + } + + /* + * Read length of block specified by index. + */ + res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset, + sizeof(size)); + if (res < 0) + return res; + return le32_to_cpu(size); +} + + +static int squashfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int bytes, i, offset = 0, sparse = 0; + struct squashfs_cache_entry *buffer = NULL; + void *pageaddr; + + int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); + int start_index = page->index & ~mask; + int end_index = start_index | mask; + int file_end = i_size_read(inode) >> msblk->block_log; + + TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", + page->index, squashfs_i(inode)->start); + + if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + goto out; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + /* + * Reading a datablock from disk. Need to read block list + * to get location and block size. + */ + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + if (bsize < 0) + goto error_out; + + if (bsize == 0) { /* hole */ + bytes = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + sparse = 1; + } else { + /* + * Read and decompress datablock. + */ + buffer = squashfs_get_datablock(inode->i_sb, + block, bsize); + if (buffer->error) { + ERROR("Unable to read page, block %llx, size %x" + "\n", block, bsize); + squashfs_cache_put(buffer); + goto error_out; + } + bytes = buffer->length; + } + } else { + /* + * Datablock is stored inside a fragment (tail-end packed + * block). + */ + buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + + if (buffer->error) { + ERROR("Unable to read page, block %llx, size %x\n", + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + squashfs_cache_put(buffer); + goto error_out; + } + bytes = i_size_read(inode) & (msblk->block_size - 1); + offset = squashfs_i(inode)->fragment_offset; + } + + /* + * Loop copying datablock into pages. As the datablock likely covers + * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly + * grab the pages from the page cache, except for the page that we've + * been called to fill. + */ + for (i = start_index; i <= end_index && bytes > 0; i++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + struct page *push_page; + int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE); + + TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); + + push_page = (i == page->index) ? 
page : + grab_cache_page_nowait(page->mapping, i); + + if (!push_page) + continue; + + if (PageUptodate(push_page)) + goto skip_page; + + pageaddr = kmap_atomic(push_page, KM_USER0); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr, KM_USER0); + flush_dcache_page(push_page); + SetPageUptodate(push_page); +skip_page: + unlock_page(push_page); + if (i != page->index) + page_cache_release(push_page); + } + + if (!sparse) + squashfs_cache_put(buffer); + + return 0; + +error_out: + SetPageError(page); +out: + pageaddr = kmap_atomic(page, KM_USER0); + memset(pageaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(pageaddr, KM_USER0); + flush_dcache_page(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + + +const struct address_space_operations squashfs_aops = { + .readpage = squashfs_readpage +}; diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c new file mode 100644 index 00000000..0ed6edbc --- /dev/null +++ b/fs/squashfs/fragment.c @@ -0,0 +1,99 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * fragment.c + */ + +/* + * This file implements code to handle compressed fragments (tail-end packed + * datablocks). + * + * Regular files contain a fragment index which is mapped to a fragment + * location on disk and compressed size using a fragment lookup table. + * Like everything in Squashfs this fragment lookup table is itself stored + * compressed into metadata blocks. A second index table is used to locate + * these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" + +/* + * Look-up fragment using the fragment index table. 
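/*
 * Illustration of the page/datablock arithmetic used by squashfs_readpage()
 * above, assuming the default 128 KiB block size (block_log = 17) and 4 KiB
 * pages: a single datablock covers 32 pages, all of which are filled in one
 * go.  Stand-alone user-space sketch with an example page index.
 */
#include <stdio.h>

int main(void)
{
        int block_log = 17, page_shift = 12;
        unsigned long page_index = 100;

        int mask = (1 << (block_log - page_shift)) - 1;           /* 31          */
        int block = page_index >> (block_log - page_shift);       /* datablock 3 */
        unsigned long start = page_index & ~(unsigned long)mask;  /* page 96     */
        unsigned long end = start | mask;                         /* page 127    */

        printf("page %lu -> datablock %d, filling pages %lu-%lu\n",
               page_index, block, start, end);
        return 0;
}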
Return the on disk + * location of the fragment and its compressed size + */ +int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, + u64 *fragment_block) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_FRAGMENT_INDEX(fragment); + int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment); + u64 start_block = le64_to_cpu(msblk->fragment_index[block]); + struct squashfs_fragment_entry fragment_entry; + int size; + + size = squashfs_read_metadata(sb, &fragment_entry, &start_block, + &offset, sizeof(fragment_entry)); + if (size < 0) + return size; + + *fragment_block = le64_to_cpu(fragment_entry.start_block); + size = le32_to_cpu(fragment_entry.size); + + return size; +} + + +/* + * Read the uncompressed fragment lookup table indexes off disk into memory + */ +__le64 *squashfs_read_fragment_index_table(struct super_block *sb, + u64 fragment_table_start, u64 next_table, unsigned int fragments) +{ + unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments); + __le64 *table; + + /* + * Sanity check, length bytes should not extend into the next table - + * this check also traps instances where fragment_table_start is + * incorrectly larger than the next table start + */ + if (fragment_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, fragment_table_start, length); + + /* + * table[0] points to the first fragment table metadata block, this + * should be less than fragment_table_start + */ + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; +} diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c new file mode 100644 index 00000000..d38ea3da --- /dev/null +++ b/fs/squashfs/id.c @@ -0,0 +1,102 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * id.c + */ + +/* + * This file implements code to handle uids and gids. + * + * For space efficiency regular files store uid and gid indexes, which are + * converted to 32-bit uids/gids using an id look up table. This table is + * stored compressed into metadata blocks. A second index table is used to + * locate these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. 
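/*
 * Illustration of the fragment-table indexing used by squashfs_frag_lookup()
 * above: each squashfs_fragment_entry is 16 bytes, so one 8 KiB metadata
 * block describes 512 fragments (cf. SQUASHFS_FRAGMENT_INDEX* in
 * squashfs_fs.h).  Stand-alone sketch with an example fragment number.
 */
#include <stdio.h>

int main(void)
{
        unsigned int fragment = 1000;           /* hypothetical fragment index */
        unsigned long bytes = fragment * 16UL;

        printf("fragment %u -> index block %lu, offset %lu\n",
               fragment, bytes / 8192, bytes % 8192);
        return 0;
}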
+ */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" + +/* + * Map uid/gid index into real 32-bit uid/gid using the id look up table + */ +int squashfs_get_id(struct super_block *sb, unsigned int index, + unsigned int *id) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_ID_BLOCK(index); + int offset = SQUASHFS_ID_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->id_table[block]); + __le32 disk_id; + int err; + + err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset, + sizeof(disk_id)); + if (err < 0) + return err; + + *id = le32_to_cpu(disk_id); + return 0; +} + + +/* + * Read uncompressed id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_id_index_table(struct super_block *sb, + u64 id_table_start, u64 next_table, unsigned short no_ids) +{ + unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); + __le64 *table; + + TRACE("In read_id_index_table, length %d\n", length); + + /* Sanity check values */ + + /* there should always be at least one id */ + if (no_ids == 0) + return ERR_PTR(-EINVAL); + + /* + * length bytes should not extend into the next table - this check + * also traps instances where id_table_start is incorrectly larger + * than the next table start + */ + if (id_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, id_table_start, length); + + /* + * table[0] points to the first id lookup table metadata block, this + * should be less than id_table_start + */ + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); + } + + return table; +} diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c new file mode 100644 index 00000000..04bebcaa --- /dev/null +++ b/fs/squashfs/inode.c @@ -0,0 +1,425 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * inode.c + */ + +/* + * This file implements code to create and read inodes from disk. + * + * Inodes in Squashfs are identified by a 48-bit inode which encodes the + * location of the compressed metadata block containing the inode, and the byte + * offset into that block where the inode is placed (). + * + * To maximise compression there are different inodes for each file type + * (regular file, directory, device, etc.), the inode contents and length + * varying with the type. + * + * To further maximise compression, two types of regular file inode and + * directory inode are defined: inodes optimised for frequently occurring + * regular files and directories, and extended types where extra + * information has to be stored. 
+ */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "xattr.h" + +/* + * Initialise VFS inode with the base inode information common to all + * Squashfs inode types. Sqsh_ino contains the unswapped base inode + * off disk. + */ +static int squashfs_new_inode(struct super_block *sb, struct inode *inode, + struct squashfs_base_inode *sqsh_ino) +{ + int err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid); + if (err) + return err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid); + if (err) + return err; + + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); + inode->i_atime.tv_sec = inode->i_mtime.tv_sec; + inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; + inode->i_mode = le16_to_cpu(sqsh_ino->mode); + inode->i_size = 0; + + return err; +} + + +struct inode *squashfs_iget(struct super_block *sb, long long ino, + unsigned int ino_number) +{ + struct inode *inode = iget_locked(sb, ino_number); + int err; + + TRACE("Entered squashfs_iget\n"); + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = squashfs_read_inode(inode, ino); + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + + unlock_new_inode(inode); + return inode; +} + + +/* + * Initialise VFS inode by reading inode from inode table (compressed + * metadata). The format and amount of data read depends on type. + */ +int squashfs_read_inode(struct inode *inode, long long ino) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + int err, type, offset = SQUASHFS_INODE_OFFSET(ino); + union squashfs_inode squashfs_ino; + struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; + int xattr_id = SQUASHFS_INVALID_XATTR; + + TRACE("Entered squashfs_read_inode\n"); + + /* + * Read inode base common to all inode types. 
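/*
 * Illustration of the 48-bit inode reference described above and consumed by
 * squashfs_read_inode(): the value packs the metadata-block start (upper
 * bits) and the byte offset within the uncompressed block (lower 16 bits),
 * mirroring SQUASHFS_MKINODE()/SQUASHFS_INODE_BLK()/SQUASHFS_INODE_OFFSET()
 * from squashfs_fs.h.  Stand-alone sketch with made-up values.
 */
#include <stdio.h>

int main(void)
{
        long long ino = ((long long)0x1a2b3c << 16) + 0x0123;   /* example reference */

        printf("metadata block 0x%x, offset 0x%x\n",
               (unsigned int)(ino >> 16), (unsigned int)(ino & 0xffff));
        return 0;
}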
+ */ + err = squashfs_read_metadata(sb, sqshb_ino, &block, + &offset, sizeof(*sqshb_ino)); + if (err < 0) + goto failed_read; + + err = squashfs_new_inode(sb, inode, sqshb_ino); + if (err) + goto failed_read; + + block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + offset = SQUASHFS_INODE_OFFSET(ino); + + type = le16_to_cpu(sqshb_ino->inode_type); + switch (type) { + case SQUASHFS_REG_TYPE: { + unsigned int frag_offset, frag; + int frag_size; + u64 frag_blk; + struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + inode->i_nlink = 1; + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_LREG_TYPE: { + unsigned int frag_offset, frag; + int frag_size; + u64 frag_blk; + struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le64_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_inode_ops; + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = ((inode->i_size - + le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; + + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_DIR_TYPE: { + struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le16_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + 
inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_cnt = 0; + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", + SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_LDIR_TYPE: { + struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_start = block; + squashfs_i(inode)->dir_idx_offset = offset; + squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Long directory inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_SYMLINK_TYPE: + case SQUASHFS_LSYMLINK_TYPE: { + struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); + inode->i_op = &squashfs_symlink_inode_ops; + inode->i_data.a_ops = &squashfs_symlink_aops; + inode->i_mode |= S_IFLNK; + squashfs_i(inode)->start = block; + squashfs_i(inode)->offset = offset; + + if (type == SQUASHFS_LSYMLINK_TYPE) { + __le32 xattr; + + err = squashfs_read_metadata(sb, NULL, &block, + &offset, inode->i_size); + if (err < 0) + goto failed_read; + err = squashfs_read_metadata(sb, &xattr, &block, + &offset, sizeof(xattr)); + if (err < 0) + goto failed_read; + xattr_id = le32_to_cpu(xattr); + } + + TRACE("Symbolic link inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + block, offset); + break; + } + case SQUASHFS_BLKDEV_TYPE: + case SQUASHFS_CHRDEV_TYPE: { + struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_CHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } + case SQUASHFS_LBLKDEV_TYPE: + case SQUASHFS_LCHRDEV_TYPE: { + struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LCHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + 
rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } + case SQUASHFS_FIFO_TYPE: + case SQUASHFS_SOCKET_TYPE: { + struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_FIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + init_special_inode(inode, inode->i_mode, 0); + break; + } + case SQUASHFS_LFIFO_TYPE: + case SQUASHFS_LSOCKET_TYPE: { + struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LFIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + init_special_inode(inode, inode->i_mode, 0); + break; + } + default: + ERROR("Unknown inode type %d in squashfs_iget!\n", type); + return -EINVAL; + } + + if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) { + err = squashfs_xattr_lookup(sb, xattr_id, + &squashfs_i(inode)->xattr_count, + &squashfs_i(inode)->xattr_size, + &squashfs_i(inode)->xattr); + if (err < 0) + goto failed_read; + inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9) + + 1; + } else + squashfs_i(inode)->xattr_count = 0; + + return 0; + +failed_read: + ERROR("Unable to read inode 0x%llx\n", ino); + return err; +} + + +const struct inode_operations squashfs_inode_ops = { + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c new file mode 100644 index 00000000..00f4dfc5 --- /dev/null +++ b/fs/squashfs/lzo_wrapper.c @@ -0,0 +1,135 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 LG Electronics + * Chan Jeong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
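/*
 * Small illustration of the i_blocks arithmetic used in the inode cases
 * above: sizes are reported in 512-byte units, rounded up via
 * ((size - 1) >> 9) + 1.  Stand-alone sketch.
 */
#include <stdio.h>

int main(void)
{
        long long sizes[] = { 1, 512, 513, 131072 };
        int i;

        for (i = 0; i < 4; i++)
                printf("%lld bytes -> %lld blocks\n",
                       sizes[i], ((sizes[i] - 1) >> 9) + 1);
        return 0;
}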
+ * + * lzo_wrapper.c + */ + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" + +struct squashfs_lzo { + void *input; + void *output; +}; + +static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + + struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->input = vmalloc(block_size); + if (stream->input == NULL) + goto failed; + stream->output = vmalloc(block_size); + if (stream->output == NULL) + goto failed2; + + return stream; + +failed2: + vfree(stream->input); +failed: + ERROR("Failed to allocate lzo workspace\n"); + kfree(stream); + return ERR_PTR(-ENOMEM); +} + + +static void lzo_free(void *strm) +{ + struct squashfs_lzo *stream = strm; + + if (stream) { + vfree(stream->input); + vfree(stream->output); + } + kfree(stream); +} + + +static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + struct squashfs_lzo *stream = msblk->stream; + void *buff = stream->input; + int avail, i, bytes = length, res; + size_t out_len = srclength; + + mutex_lock(&msblk->read_data_mutex); + + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + + avail = min(bytes, msblk->devblksize - offset); + memcpy(buff, bh[i]->b_data + offset, avail); + buff += avail; + bytes -= avail; + offset = 0; + put_bh(bh[i]); + } + + res = lzo1x_decompress_safe(stream->input, (size_t)length, + stream->output, &out_len); + if (res != LZO_E_OK) + goto failed; + + res = bytes = (int)out_len; + for (i = 0, buff = stream->output; bytes && i < pages; i++) { + avail = min_t(int, bytes, PAGE_CACHE_SIZE); + memcpy(buffer[i], buff, avail); + buff += avail; + bytes -= avail; + } + + mutex_unlock(&msblk->read_data_mutex); + return res; + +block_release: + for (; i < b; i++) + put_bh(bh[i]); + +failed: + mutex_unlock(&msblk->read_data_mutex); + + ERROR("lzo decompression failed, data probably corrupt\n"); + return -EIO; +} + +const struct squashfs_decompressor squashfs_lzo_comp_ops = { + .init = lzo_init, + .free = lzo_free, + .decompress = lzo_uncompress, + .id = LZO_COMPRESSION, + .name = "lzo", + .supported = 1 +}; diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c new file mode 100644 index 00000000..4bc63ac6 --- /dev/null +++ b/fs/squashfs/namei.c @@ -0,0 +1,257 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * namei.c + */ + +/* + * This file implements code to do filename lookup in directories. 
+ * + * Like inodes, directories are packed into compressed metadata blocks, stored + * in a directory table. Directories are accessed using the start address of + * the metablock containing the directory and the offset into the + * decompressed block (). + * + * Directories are organised in a slightly complex way, and are not simply + * a list of file names. The organisation takes advantage of the + * fact that (in most cases) the inodes of the files will be in the same + * compressed metadata block, and therefore, can share the start block. + * Directories are therefore organised in a two level list, a directory + * header containing the shared start block value, and a sequence of directory + * entries, each of which share the shared start block. A new directory header + * is written once/if the inode start block changes. The directory + * header/directory entry list is repeated as many times as necessary. + * + * Directories are sorted, and can contain a directory index to speed up + * file lookup. Directory indexes store one entry per metablock, each entry + * storing the index/filename mapping to the first directory header + * in each metadata block. Directories are sorted in alphabetical order, + * and at lookup the index is scanned linearly looking for the first filename + * alphabetically larger than the filename being looked up. At this point the + * location of the metadata block the filename is in has been found. + * The general idea of the index is ensure only one metadata block needs to be + * decompressed to do a lookup irrespective of the length of the directory. + * This scheme has the advantage that it doesn't require extra memory overhead + * and doesn't require much extra storage on disk. + */ + +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "xattr.h" + +/* + * Lookup name in the directory index, returning the location of the metadata + * block containing it, and the directory index this represents. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_name(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, + int index_offset, int i_count, const char *name, + int len) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int i, size, length = 0, err; + struct squashfs_dir_index *index; + char *str; + + TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count); + + index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL); + if (index == NULL) { + ERROR("Failed to allocate squashfs_dir_index\n"); + goto out; + } + + str = &index->name[SQUASHFS_NAME_LEN + 1]; + strncpy(str, name, len); + str[len] = '\0'; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, index, &index_start, + &index_offset, sizeof(*index)); + if (err < 0) + break; + + + size = le32_to_cpu(index->size) + 1; + + err = squashfs_read_metadata(sb, index->name, &index_start, + &index_offset, size); + if (err < 0) + break; + + index->name[size] = '\0'; + + if (strcmp(index->name, str) > 0) + break; + + length = le32_to_cpu(index->index); + *next_block = le32_to_cpu(index->start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + kfree(index); + +out: + /* + * Return index (f_pos) of the looked up metadata block. 
Translate + * from internal f_pos to external f_pos which is offset by 3 because + * we invent "." and ".." entries which are not actually stored in the + * directory. + */ + return length + 3; +} + + +static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + const unsigned char *name = dentry->d_name.name; + int len = dentry->d_name.len; + struct inode *inode = NULL; + struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + u64 block = squashfs_i(dir)->start + msblk->directory_table; + int offset = squashfs_i(dir)->offset; + int err, length = 0, dir_count, size; + + TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + return ERR_PTR(-ENOMEM); + } + + if (len > SQUASHFS_NAME_LEN) { + err = -ENAMETOOLONG; + goto failed; + } + + length = get_dir_index_using_name(dir->i_sb, &block, &offset, + squashfs_i(dir)->dir_idx_start, + squashfs_i(dir)->dir_idx_offset, + squashfs_i(dir)->dir_idx_cnt, name, len); + + while (length < i_size_read(dir)) { + /* + * Read directory header. + */ + err = squashfs_read_metadata(dir->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto read_failure; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + + /* dir_count should never be larger than 256 */ + if (dir_count > 256) + goto data_error; + + while (dir_count--) { + /* + * Read directory entry. + */ + err = squashfs_read_metadata(dir->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto read_failure; + + size = le16_to_cpu(dire->size) + 1; + + /* size should never be larger than SQUASHFS_NAME_LEN */ + if (size > SQUASHFS_NAME_LEN) + goto data_error; + + err = squashfs_read_metadata(dir->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto read_failure; + + length += sizeof(*dire) + size; + + if (name[0] < dire->name[0]) + goto exit_lookup; + + if (len == size && !strncmp(name, dire->name, len)) { + unsigned int blk, off, ino_num; + long long ino; + blk = le32_to_cpu(dirh.start_block); + off = le16_to_cpu(dire->offset); + ino_num = le32_to_cpu(dirh.inode_number) + + (short) le16_to_cpu(dire->inode_number); + ino = SQUASHFS_MKINODE(blk, off); + + TRACE("calling squashfs_iget for directory " + "entry %s, inode %x:%x, %d\n", name, + blk, off, ino_num); + + inode = squashfs_iget(dir->i_sb, ino, ino_num); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto failed; + } + + goto exit_lookup; + } + } + } + +exit_lookup: + kfree(dire); + if (inode) + return d_splice_alias(inode, dentry); + d_add(dentry, inode); + return ERR_PTR(0); + +data_error: + err = -EIO; + +read_failure: + ERROR("Unable to read directory block [%llx:%x]\n", + squashfs_i(dir)->start + msblk->directory_table, + squashfs_i(dir)->offset); +failed: + kfree(dire); + return ERR_PTR(err); +} + + +const struct inode_operations squashfs_dir_inode_ops = { + .lookup = squashfs_lookup, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h new file mode 100644 index 00000000..e3be6a71 --- /dev/null +++ b/fs/squashfs/squashfs.h @@ -0,0 +1,102 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute 
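/*
 * Illustration of the inode-number encoding seen in squashfs_lookup() above:
 * a directory header carries a 32-bit base inode number and each directory
 * entry stores a signed 16-bit delta from it, so runs of nearby inode
 * numbers compress well.  Stand-alone sketch with made-up values.
 */
#include <stdio.h>

int main(void)
{
        unsigned int base = 1000;       /* dirh.inode_number            */
        unsigned short raw = 0xffff;    /* dire->inode_number, delta -1 */

        printf("entry inode number = %u\n", base + (short)raw);
        return 0;
}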
it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs.h + */ + +#define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args) + +#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) + +#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) + +/* block.c */ +extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, + int, int); + +/* cache.c */ +extern struct squashfs_cache *squashfs_cache_init(char *, int, int); +extern void squashfs_cache_delete(struct squashfs_cache *); +extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *, + struct squashfs_cache *, u64, int); +extern void squashfs_cache_put(struct squashfs_cache_entry *); +extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int); +extern int squashfs_read_metadata(struct super_block *, void *, u64 *, + int *, int); +extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *, + u64, int); +extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, + u64, int); +extern void *squashfs_read_table(struct super_block *, u64, int); + +/* decompressor.c */ +extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); +extern void *squashfs_decompressor_init(struct super_block *, unsigned short); + +/* export.c */ +extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, + unsigned int); + +/* fragment.c */ +extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); +extern __le64 *squashfs_read_fragment_index_table(struct super_block *, + u64, u64, unsigned int); + +/* id.c */ +extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); +extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64, + unsigned short); + +/* inode.c */ +extern struct inode *squashfs_iget(struct super_block *, long long, + unsigned int); +extern int squashfs_read_inode(struct inode *, long long); + +/* xattr.c */ +extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t); + +/* + * Inodes, files, decompressor and xattr operations + */ + +/* dir.c */ +extern const struct file_operations squashfs_dir_ops; + +/* export.c */ +extern const struct export_operations squashfs_export_ops; + +/* file.c */ +extern const struct address_space_operations squashfs_aops; + +/* inode.c */ +extern const struct inode_operations squashfs_inode_ops; + +/* namei.c */ +extern const struct inode_operations squashfs_dir_inode_ops; + +/* symlink.c */ +extern const struct address_space_operations squashfs_symlink_aops; +extern const struct inode_operations squashfs_symlink_inode_ops; + +/* xattr.c */ +extern const struct xattr_handler *squashfs_xattr_handlers[]; + +/* zlib_wrapper.c */ +extern const struct squashfs_decompressor squashfs_zlib_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h new file mode 100644 index 00000000..b4a4e539 --- 
/dev/null +++ b/fs/squashfs/squashfs_fs.h @@ -0,0 +1,459 @@ +#ifndef SQUASHFS_FS +#define SQUASHFS_FS +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs.h + */ + +#define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE +#define SQUASHFS_MAJOR 4 +#define SQUASHFS_MINOR 0 +#define SQUASHFS_START 0 + +/* size of metadata (inode and directory) blocks */ +#define SQUASHFS_METADATA_SIZE 8192 +#define SQUASHFS_METADATA_LOG 13 + +/* default size of data blocks */ +#define SQUASHFS_FILE_SIZE 131072 +#define SQUASHFS_FILE_LOG 17 + +#define SQUASHFS_FILE_MAX_SIZE 1048576 +#define SQUASHFS_FILE_MAX_LOG 20 + +/* Max number of uids and gids */ +#define SQUASHFS_IDS 65536 + +/* Max length of filename (not 255) */ +#define SQUASHFS_NAME_LEN 256 + +#define SQUASHFS_INVALID_FRAG (0xffffffffU) +#define SQUASHFS_INVALID_XATTR (0xffffffffU) +#define SQUASHFS_INVALID_BLK (-1LL) + +/* Filesystem flags */ +#define SQUASHFS_NOI 0 +#define SQUASHFS_NOD 1 +#define SQUASHFS_NOF 3 +#define SQUASHFS_NO_FRAG 4 +#define SQUASHFS_ALWAYS_FRAG 5 +#define SQUASHFS_DUPLICATE 6 +#define SQUASHFS_EXPORT 7 +#define SQUASHFS_COMP_OPT 10 + +#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) + +#define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOI) + +#define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOD) + +#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOF) + +#define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NO_FRAG) + +#define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_ALWAYS_FRAG) + +#define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_DUPLICATE) + +#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_EXPORT) + +#define SQUASHFS_COMP_OPTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_COMP_OPT) + +/* Max number of types and file types */ +#define SQUASHFS_DIR_TYPE 1 +#define SQUASHFS_REG_TYPE 2 +#define SQUASHFS_SYMLINK_TYPE 3 +#define SQUASHFS_BLKDEV_TYPE 4 +#define SQUASHFS_CHRDEV_TYPE 5 +#define SQUASHFS_FIFO_TYPE 6 +#define SQUASHFS_SOCKET_TYPE 7 +#define SQUASHFS_LDIR_TYPE 8 +#define SQUASHFS_LREG_TYPE 9 +#define SQUASHFS_LSYMLINK_TYPE 10 +#define SQUASHFS_LBLKDEV_TYPE 11 +#define SQUASHFS_LCHRDEV_TYPE 12 +#define SQUASHFS_LFIFO_TYPE 13 +#define SQUASHFS_LSOCKET_TYPE 14 + +/* Xattr types */ +#define SQUASHFS_XATTR_USER 0 +#define SQUASHFS_XATTR_TRUSTED 1 +#define SQUASHFS_XATTR_SECURITY 2 +#define SQUASHFS_XATTR_VALUE_OOL 256 +#define SQUASHFS_XATTR_PREFIX_MASK 0xff + +/* Flag whether block is compressed or uncompressed, bit is set if block is + * uncompressed */ +#define SQUASHFS_COMPRESSED_BIT (1 << 15) + +#define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? 
\ + (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT) + +#define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT)) + +#define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24) + +#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \ + ~SQUASHFS_COMPRESSED_BIT_BLOCK) + +#define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK)) + +/* + * Inode number ops. Inodes consist of a compressed block number, and an + * uncompressed offset within that block + */ +#define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff)) + +#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\ + << 16) + (B))) + +/* Translate between VFS mode and squashfs mode */ +#define SQUASHFS_MODE(A) ((A) & 0xfff) + +/* fragment and fragment table defines */ +#define SQUASHFS_FRAGMENT_BYTES(A) \ + ((A) * sizeof(struct squashfs_fragment_entry)) + +#define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\ + sizeof(u64)) + +/* inode lookup table defines */ +#define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64)) + +#define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\ + sizeof(u64)) + +/* uid/gid lookup table defines */ +#define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int)) + +#define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ + sizeof(u64)) +/* xattr id lookup table defines */ +#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) + +#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\ + sizeof(u64)) +#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff)) + +/* cached data constants for filesystem */ +#define SQUASHFS_CACHED_BLKS 8 + +#define SQUASHFS_MAX_FILE_SIZE_LOG 64 + +#define SQUASHFS_MAX_FILE_SIZE (1LL << \ + (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) + +/* meta index cache */ +#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) +#define SQUASHFS_META_ENTRIES 127 +#define SQUASHFS_META_SLOTS 8 + +struct meta_entry { + u64 data_block; + unsigned int index_block; + unsigned short offset; + unsigned short pad; +}; + +struct meta_index { + unsigned int inode_number; + unsigned int offset; + unsigned short entries; + unsigned short 
skip; + unsigned short locked; + unsigned short pad; + struct meta_entry meta_entry[SQUASHFS_META_ENTRIES]; +}; + + +/* + * definitions for structures on disk + */ +#define ZLIB_COMPRESSION 1 +#define LZMA_COMPRESSION 2 +#define LZO_COMPRESSION 3 +#define XZ_COMPRESSION 4 + +struct squashfs_super_block { + __le32 s_magic; + __le32 inodes; + __le32 mkfs_time; + __le32 block_size; + __le32 fragments; + __le16 compression; + __le16 block_log; + __le16 flags; + __le16 no_ids; + __le16 s_major; + __le16 s_minor; + __le64 root_inode; + __le64 bytes_used; + __le64 id_table_start; + __le64 xattr_id_table_start; + __le64 inode_table_start; + __le64 directory_table_start; + __le64 fragment_table_start; + __le64 lookup_table_start; +}; + +struct squashfs_dir_index { + __le32 index; + __le32 start_block; + __le32 size; + unsigned char name[0]; +}; + +struct squashfs_base_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; +}; + +struct squashfs_ipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; +}; + +struct squashfs_lipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 xattr; +}; + +struct squashfs_dev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; +}; + +struct squashfs_ldev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; + __le32 xattr; +}; + +struct squashfs_symlink_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 symlink_size; + char symlink[0]; +}; + +struct squashfs_reg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 fragment; + __le32 offset; + __le32 file_size; + __le16 block_list[0]; +}; + +struct squashfs_lreg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le64 start_block; + __le64 file_size; + __le64 sparse; + __le32 nlink; + __le32 fragment; + __le32 offset; + __le32 xattr; + __le16 block_list[0]; +}; + +struct squashfs_dir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 nlink; + __le16 file_size; + __le16 offset; + __le32 parent_inode; +}; + +struct squashfs_ldir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 file_size; + __le32 start_block; + __le32 parent_inode; + __le16 i_count; + __le16 offset; + __le32 xattr; + struct squashfs_dir_index index[0]; +}; + +union squashfs_inode { + struct squashfs_base_inode base; + struct squashfs_dev_inode dev; + struct squashfs_ldev_inode ldev; + struct squashfs_symlink_inode symlink; + struct squashfs_reg_inode reg; + struct squashfs_lreg_inode lreg; + struct squashfs_dir_inode dir; + struct squashfs_ldir_inode ldir; + struct squashfs_ipc_inode ipc; + struct squashfs_lipc_inode lipc; +}; + +struct squashfs_dir_entry { + __le16 offset; + __le16 inode_number; + __le16 type; + __le16 size; + char name[0]; +}; + +struct squashfs_dir_header { + __le32 count; + __le32 start_block; + __le32 inode_number; +}; + 
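/*
 * A minimal worked example of the reference packing described in the
 * "Inode number ops" comment earlier in this header: a metadata
 * reference stores the start of the compressed metadata block in the
 * high bits and the uncompressed offset within that block in the low
 * 16 bits.  Using the macros defined above (the numeric values here
 * are purely illustrative):
 *
 *	long long ref = SQUASHFS_MKINODE(0x1234, 0x56);
 *	unsigned int blk = SQUASHFS_INODE_BLK(ref);	 // 0x1234
 *	unsigned int off = SQUASHFS_INODE_OFFSET(ref);	 // 0x56
 *
 * The xattr reference macros SQUASHFS_XATTR_BLK()/SQUASHFS_XATTR_OFFSET()
 * split a 64-bit xattr reference in the same way.
 */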
+struct squashfs_fragment_entry { + __le64 start_block; + __le32 size; + unsigned int unused; +}; + +struct squashfs_xattr_entry { + __le16 type; + __le16 size; + char data[0]; +}; + +struct squashfs_xattr_val { + __le32 vsize; + char value[0]; +}; + +struct squashfs_xattr_id { + __le64 xattr; + __le32 count; + __le32 size; +}; + +struct squashfs_xattr_id_table { + __le64 xattr_table_start; + __le32 xattr_ids; + __le32 unused; +}; + +#endif diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h new file mode 100644 index 00000000..73588e77 --- /dev/null +++ b/fs/squashfs/squashfs_fs_i.h @@ -0,0 +1,54 @@ +#ifndef SQUASHFS_FS_I +#define SQUASHFS_FS_I +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs_i.h + */ + +struct squashfs_inode_info { + u64 start; + int offset; + u64 xattr; + unsigned int xattr_size; + int xattr_count; + union { + struct { + u64 fragment_block; + int fragment_size; + int fragment_offset; + u64 block_list_start; + }; + struct { + u64 dir_idx_start; + int dir_idx_offset; + int dir_idx_cnt; + int parent; + }; + }; + struct inode vfs_inode; +}; + + +static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) +{ + return list_entry(inode, struct squashfs_inode_info, vfs_inode); +} +#endif diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h new file mode 100644 index 00000000..651f0b31 --- /dev/null +++ b/fs/squashfs/squashfs_fs_sb.h @@ -0,0 +1,79 @@ +#ifndef SQUASHFS_FS_SB +#define SQUASHFS_FS_SB +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * squashfs_fs_sb.h + */ + +#include "squashfs_fs.h" + +struct squashfs_cache { + char *name; + int entries; + int next_blk; + int num_waiters; + int unused; + int block_size; + int pages; + spinlock_t lock; + wait_queue_head_t wait_queue; + struct squashfs_cache_entry *entry; +}; + +struct squashfs_cache_entry { + u64 block; + int length; + int refcount; + u64 next_index; + int pending; + int error; + int num_waiters; + wait_queue_head_t wait_queue; + struct squashfs_cache *cache; + void **data; +}; + +struct squashfs_sb_info { + const struct squashfs_decompressor *decompressor; + int devblksize; + int devblksize_log2; + struct squashfs_cache *block_cache; + struct squashfs_cache *fragment_cache; + struct squashfs_cache *read_page; + int next_meta_index; + __le64 *id_table; + __le64 *fragment_index; + __le64 *xattr_id_table; + struct mutex read_data_mutex; + struct mutex meta_index_mutex; + struct meta_index *meta_index; + void *stream; + __le64 *inode_lookup_table; + u64 inode_table; + u64 directory_table; + u64 xattr_table; + unsigned int block_size; + unsigned short block_log; + long long bytes_used; + unsigned int inodes; + int xattr_ids; +}; +#endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c new file mode 100644 index 00000000..7438850c --- /dev/null +++ b/fs/squashfs/super.c @@ -0,0 +1,497 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * super.c + */ + +/* + * This file implements code to read the superblock, read and initialise + * in-memory structures at mount time, and all the VFS glue code to register + * the filesystem. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "decompressor.h" +#include "xattr.h" + +static struct file_system_type squashfs_fs_type; +static const struct super_operations squashfs_super_ops; + +static const struct squashfs_decompressor *supported_squashfs_filesystem(short + major, short minor, short id) +{ + const struct squashfs_decompressor *decompressor; + + if (major < SQUASHFS_MAJOR) { + ERROR("Major/Minor mismatch, older Squashfs %d.%d " + "filesystems are unsupported\n", major, minor); + return NULL; + } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { + ERROR("Major/Minor mismatch, trying to mount newer " + "%d.%d filesystem\n", major, minor); + ERROR("Please update your kernel\n"); + return NULL; + } + + decompressor = squashfs_lookup_decompressor(id); + if (!decompressor->supported) { + ERROR("Filesystem uses \"%s\" compression. 
This is not " + "supported\n", decompressor->name); + return NULL; + } + + return decompressor; +} + + +static int squashfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct squashfs_sb_info *msblk; + struct squashfs_super_block *sblk = NULL; + char b[BDEVNAME_SIZE]; + struct inode *root; + long long root_inode; + unsigned short flags; + unsigned int fragments; + u64 lookup_table_start, xattr_id_table_start, next_table; + int err; + + TRACE("Entered squashfs_fill_superblock\n"); + + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); + if (sb->s_fs_info == NULL) { + ERROR("Failed to allocate squashfs_sb_info\n"); + return -ENOMEM; + } + msblk = sb->s_fs_info; + + msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); + msblk->devblksize_log2 = ffz(~msblk->devblksize); + + mutex_init(&msblk->read_data_mutex); + mutex_init(&msblk->meta_index_mutex); + + /* + * msblk->bytes_used is checked in squashfs_read_table to ensure reads + * are not beyond filesystem end. But as we're using + * squashfs_read_table here to read the superblock (including the value + * of bytes_used) we need to set it to an initial sensible dummy value + */ + msblk->bytes_used = sizeof(*sblk); + sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk)); + + if (IS_ERR(sblk)) { + ERROR("unable to read squashfs_super_block\n"); + err = PTR_ERR(sblk); + sblk = NULL; + goto failed_mount; + } + + err = -EINVAL; + + /* Check it is a SQUASHFS superblock */ + sb->s_magic = le32_to_cpu(sblk->s_magic); + if (sb->s_magic != SQUASHFS_MAGIC) { + if (!silent) + ERROR("Can't find a SQUASHFS superblock on %s\n", + bdevname(sb->s_bdev, b)); + goto failed_mount; + } + + /* Check the MAJOR & MINOR versions and lookup compression type */ + msblk->decompressor = supported_squashfs_filesystem( + le16_to_cpu(sblk->s_major), + le16_to_cpu(sblk->s_minor), + le16_to_cpu(sblk->compression)); + if (msblk->decompressor == NULL) + goto failed_mount; + + /* Check the filesystem does not extend beyond the end of the + block device */ + msblk->bytes_used = le64_to_cpu(sblk->bytes_used); + if (msblk->bytes_used < 0 || msblk->bytes_used > + i_size_read(sb->s_bdev->bd_inode)) + goto failed_mount; + + /* Check block size for sanity */ + msblk->block_size = le32_to_cpu(sblk->block_size); + if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) + goto failed_mount; + + /* + * Check the system page size is not larger than the filesystem + * block size (by default 128K). This is currently not supported. + */ + if (PAGE_CACHE_SIZE > msblk->block_size) { + ERROR("Page size > filesystem block size (%d). This is " + "currently not supported!\n", msblk->block_size); + goto failed_mount; + } + + msblk->block_log = le16_to_cpu(sblk->block_log); + if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) + goto failed_mount; + + /* Check the root inode for sanity */ + root_inode = le64_to_cpu(sblk->root_inode); + if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) + goto failed_mount; + + msblk->inode_table = le64_to_cpu(sblk->inode_table_start); + msblk->directory_table = le64_to_cpu(sblk->directory_table_start); + msblk->inodes = le32_to_cpu(sblk->inodes); + flags = le16_to_cpu(sblk->flags); + + TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b)); + TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) + ? "un" : ""); + TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) + ? 
"un" : ""); + TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); + TRACE("Block size %d\n", msblk->block_size); + TRACE("Number of inodes %d\n", msblk->inodes); + TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments)); + TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids)); + TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); + TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); + TRACE("sblk->fragment_table_start %llx\n", + (u64) le64_to_cpu(sblk->fragment_table_start)); + TRACE("sblk->id_table_start %llx\n", + (u64) le64_to_cpu(sblk->id_table_start)); + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_flags |= MS_RDONLY; + sb->s_op = &squashfs_super_ops; + + err = -ENOMEM; + + msblk->block_cache = squashfs_cache_init("metadata", + SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); + if (msblk->block_cache == NULL) + goto failed_mount; + + /* Allocate read_page block */ + msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size); + if (msblk->read_page == NULL) { + ERROR("Failed to allocate read_page block\n"); + goto failed_mount; + } + + msblk->stream = squashfs_decompressor_init(sb, flags); + if (IS_ERR(msblk->stream)) { + err = PTR_ERR(msblk->stream); + msblk->stream = NULL; + goto failed_mount; + } + + /* Handle xattrs */ + sb->s_xattr = squashfs_xattr_handlers; + xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); + if (xattr_id_table_start == SQUASHFS_INVALID_BLK) { + next_table = msblk->bytes_used; + goto allocate_id_index_table; + } + + /* Allocate and read xattr id lookup table */ + msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, + xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); + if (IS_ERR(msblk->xattr_id_table)) { + ERROR("unable to read xattr id index table\n"); + err = PTR_ERR(msblk->xattr_id_table); + msblk->xattr_id_table = NULL; + if (err != -ENOTSUPP) + goto failed_mount; + } + next_table = msblk->xattr_table; + +allocate_id_index_table: + /* Allocate and read id index table */ + msblk->id_table = squashfs_read_id_index_table(sb, + le64_to_cpu(sblk->id_table_start), next_table, + le16_to_cpu(sblk->no_ids)); + if (IS_ERR(msblk->id_table)) { + ERROR("unable to read id index table\n"); + err = PTR_ERR(msblk->id_table); + msblk->id_table = NULL; + goto failed_mount; + } + next_table = le64_to_cpu(msblk->id_table[0]); + + /* Handle inode lookup table */ + lookup_table_start = le64_to_cpu(sblk->lookup_table_start); + if (lookup_table_start == SQUASHFS_INVALID_BLK) + goto handle_fragments; + + /* Allocate and read inode lookup table */ + msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, + lookup_table_start, next_table, msblk->inodes); + if (IS_ERR(msblk->inode_lookup_table)) { + ERROR("unable to read inode lookup table\n"); + err = PTR_ERR(msblk->inode_lookup_table); + msblk->inode_lookup_table = NULL; + goto failed_mount; + } + next_table = le64_to_cpu(msblk->inode_lookup_table[0]); + + sb->s_export_op = &squashfs_export_ops; + +handle_fragments: + fragments = le32_to_cpu(sblk->fragments); + if (fragments == 0) + goto check_directory_table; + + msblk->fragment_cache = squashfs_cache_init("fragment", + SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); + if (msblk->fragment_cache == NULL) { + err = -ENOMEM; + goto failed_mount; + } + + /* Allocate and read fragment index table */ + msblk->fragment_index = squashfs_read_fragment_index_table(sb, + le64_to_cpu(sblk->fragment_table_start), next_table, fragments); + if (IS_ERR(msblk->fragment_index)) { + ERROR("unable to read fragment 
index table\n"); + err = PTR_ERR(msblk->fragment_index); + msblk->fragment_index = NULL; + goto failed_mount; + } + next_table = le64_to_cpu(msblk->fragment_index[0]); + +check_directory_table: + /* Sanity check directory_table */ + if (msblk->directory_table >= next_table) { + err = -EINVAL; + goto failed_mount; + } + + /* Sanity check inode_table */ + if (msblk->inode_table >= msblk->directory_table) { + err = -EINVAL; + goto failed_mount; + } + + /* allocate root */ + root = new_inode(sb); + if (!root) { + err = -ENOMEM; + goto failed_mount; + } + + err = squashfs_read_inode(root, root_inode); + if (err) { + make_bad_inode(root); + iput(root); + goto failed_mount; + } + insert_inode_hash(root); + + sb->s_root = d_alloc_root(root); + if (sb->s_root == NULL) { + ERROR("Root inode create failed\n"); + err = -ENOMEM; + iput(root); + goto failed_mount; + } + + TRACE("Leaving squashfs_fill_super\n"); + kfree(sblk); + return 0; + +failed_mount: + squashfs_cache_delete(msblk->block_cache); + squashfs_cache_delete(msblk->fragment_cache); + squashfs_cache_delete(msblk->read_page); + squashfs_decompressor_free(msblk, msblk->stream); + kfree(msblk->inode_lookup_table); + kfree(msblk->fragment_index); + kfree(msblk->id_table); + kfree(msblk->xattr_id_table); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + kfree(sblk); + return err; +} + + +static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; + u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); + + TRACE("Entered squashfs_statfs\n"); + + buf->f_type = SQUASHFS_MAGIC; + buf->f_bsize = msblk->block_size; + buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1; + buf->f_bfree = buf->f_bavail = 0; + buf->f_files = msblk->inodes; + buf->f_ffree = 0; + buf->f_namelen = SQUASHFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + + +static int squashfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + + +static void squashfs_put_super(struct super_block *sb) +{ + if (sb->s_fs_info) { + struct squashfs_sb_info *sbi = sb->s_fs_info; + squashfs_cache_delete(sbi->block_cache); + squashfs_cache_delete(sbi->fragment_cache); + squashfs_cache_delete(sbi->read_page); + squashfs_decompressor_free(sbi, sbi->stream); + kfree(sbi->id_table); + kfree(sbi->fragment_index); + kfree(sbi->meta_index); + kfree(sbi->inode_lookup_table); + kfree(sbi->xattr_id_table); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + } +} + + +static struct dentry *squashfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super); +} + + +static struct kmem_cache *squashfs_inode_cachep; + + +static void init_once(void *foo) +{ + struct squashfs_inode_info *ei = foo; + + inode_init_once(&ei->vfs_inode); +} + + +static int __init init_inodecache(void) +{ + squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", + sizeof(struct squashfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + + return squashfs_inode_cachep ? 
0 : -ENOMEM; +} + + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(squashfs_inode_cachep); +} + + +static int __init init_squashfs_fs(void) +{ + int err = init_inodecache(); + + if (err) + return err; + + err = register_filesystem(&squashfs_fs_type); + if (err) { + destroy_inodecache(); + return err; + } + + printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " + "Phillip Lougher\n"); + + return 0; +} + + +static void __exit exit_squashfs_fs(void) +{ + unregister_filesystem(&squashfs_fs_type); + destroy_inodecache(); +} + + +static struct inode *squashfs_alloc_inode(struct super_block *sb) +{ + struct squashfs_inode_info *ei = + kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL); + + return ei ? &ei->vfs_inode : NULL; +} + + +static void squashfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); +} + +static void squashfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, squashfs_i_callback); +} + + +static struct file_system_type squashfs_fs_type = { + .owner = THIS_MODULE, + .name = "squashfs", + .mount = squashfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV +}; + +static const struct super_operations squashfs_super_ops = { + .alloc_inode = squashfs_alloc_inode, + .destroy_inode = squashfs_destroy_inode, + .statfs = squashfs_statfs, + .put_super = squashfs_put_super, + .remount_fs = squashfs_remount +}; + +module_init(init_squashfs_fs); +module_exit(exit_squashfs_fs); +MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); +MODULE_AUTHOR("Phillip Lougher "); +MODULE_LICENSE("GPL"); diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c new file mode 100644 index 00000000..11918172 --- /dev/null +++ b/fs/squashfs/symlink.c @@ -0,0 +1,127 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * symlink.c + */ + +/* + * This file implements code to handle symbolic links. + * + * The data contents of symbolic links are stored inside the symbolic + * link inode within the inode table. This allows the normally small symbolic + * link to be compressed as part of the inode table, achieving much greater + * compression than if the symbolic link was compressed individually. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "xattr.h" + +static int squashfs_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + int index = page->index << PAGE_CACHE_SHIFT; + u64 block = squashfs_i(inode)->start; + int offset = squashfs_i(inode)->offset; + int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE); + int bytes, copied; + void *pageaddr; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_symlink_readpage, page index %ld, start block " + "%llx, offset %x\n", page->index, block, offset); + + /* + * Skip index bytes into symlink metadata. + */ + if (index) { + bytes = squashfs_read_metadata(sb, NULL, &block, &offset, + index); + if (bytes < 0) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + goto error_out; + } + } + + /* + * Read length bytes from symlink metadata. Squashfs_read_metadata + * is not used here because it can sleep and we want to use + * kmap_atomic to map the page. Instead call the underlying + * squashfs_cache_get routine. As length bytes may overlap metadata + * blocks, we may need to call squashfs_cache_get multiple times. + */ + for (bytes = 0; bytes < length; offset = 0, bytes += copied) { + entry = squashfs_cache_get(sb, msblk->block_cache, block, 0); + if (entry->error) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + squashfs_cache_put(entry); + goto error_out; + } + + pageaddr = kmap_atomic(page, KM_USER0); + copied = squashfs_copy_data(pageaddr + bytes, entry, offset, + length - bytes); + if (copied == length - bytes) + memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length); + else + block = entry->next_index; + kunmap_atomic(pageaddr, KM_USER0); + squashfs_cache_put(entry); + } + + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + return 0; + +error_out: + SetPageError(page); + unlock_page(page); + return 0; +} + + +const struct address_space_operations squashfs_symlink_aops = { + .readpage = squashfs_symlink_readpage +}; + +const struct inode_operations squashfs_symlink_inode_ops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c new file mode 100644 index 00000000..92fcde7b --- /dev/null +++ b/fs/squashfs/xattr.c @@ -0,0 +1,324 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr.c + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const struct xattr_handler *squashfs_xattr_handler(int); + +ssize_t squashfs_listxattr(struct dentry *d, char *buffer, + size_t buffer_size) +{ + struct inode *inode = d->d_inode; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + size_t rest = buffer_size; + int err; + + /* check that the file system has xattrs */ + if (msblk->xattr_id_table == NULL) + return -EOPNOTSUPP; + + /* loop reading each xattr name */ + while (count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + const struct xattr_handler *handler; + int name_size, prefix_size = 0; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + handler = squashfs_xattr_handler(le16_to_cpu(entry.type)); + if (handler) + prefix_size = handler->list(d, buffer, rest, NULL, + name_size, handler->flags); + if (prefix_size) { + if (buffer) { + if (prefix_size + name_size + 1 > rest) { + err = -ERANGE; + goto failed; + } + buffer += prefix_size; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, name_size); + if (err < 0) + goto failed; + if (buffer) { + buffer[name_size] = '\0'; + buffer += name_size + 1; + } + rest -= prefix_size + name_size + 1; + } else { + /* no handler or insuffficient privileges, so skip */ + err = squashfs_read_metadata(sb, NULL, &start, + &offset, name_size); + if (err < 0) + goto failed; + } + + + /* skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = buffer_size - rest; + +failed: + return err; +} + + +static int squashfs_xattr_get(struct inode *inode, int name_index, + const char *name, void *buffer, size_t buffer_size) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + int name_len = strlen(name); + int err, vsize; + char *target = kmalloc(name_len, GFP_KERNEL); + + if (target == NULL) + return -ENOMEM; + + /* loop reading each xattr name */ + for (; count; count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + int type, prefix, name_size; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + type = le16_to_cpu(entry.type); + prefix = type & SQUASHFS_XATTR_PREFIX_MASK; + + if (prefix == name_index && name_size == name_len) + err = squashfs_read_metadata(sb, target, &start, + &offset, name_size); + else + err = squashfs_read_metadata(sb, NULL, &start, + &offset, 
name_size); + if (err < 0) + goto failed; + + if (prefix == name_index && name_size == name_len && + strncmp(target, name, name_size) == 0) { + /* found xattr */ + if (type & SQUASHFS_XATTR_VALUE_OOL) { + __le64 xattr_val; + u64 xattr; + /* val is a reference to the real location */ + err = squashfs_read_metadata(sb, &val, &start, + &offset, sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, &xattr_val, + &start, &offset, sizeof(xattr_val)); + if (err < 0) + goto failed; + xattr = le64_to_cpu(xattr_val); + start = SQUASHFS_XATTR_BLK(xattr) + + msblk->xattr_table; + offset = SQUASHFS_XATTR_OFFSET(xattr); + } + /* read xattr value */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + vsize = le32_to_cpu(val.vsize); + if (buffer) { + if (vsize > buffer_size) { + err = -ERANGE; + goto failed; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, vsize); + if (err < 0) + goto failed; + } + break; + } + + /* no match, skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = count ? vsize : -ENODATA; + +failed: + kfree(target); + return err; +} + + +/* + * User namespace support + */ +static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + if (list && XATTR_USER_PREFIX_LEN <= list_size) + memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + return XATTR_USER_PREFIX_LEN; +} + +static int squashfs_user_get(struct dentry *d, const char *name, void *buffer, + size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = squashfs_user_list, + .get = squashfs_user_get +}; + +/* + * Trusted namespace support + */ +static size_t squashfs_trusted_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size) + memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + return XATTR_TRUSTED_PREFIX_LEN; +} + +static int squashfs_trusted_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = squashfs_trusted_list, + .get = squashfs_trusted_get +}; + +/* + * Security namespace support + */ +static size_t squashfs_security_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (list && XATTR_SECURITY_PREFIX_LEN <= list_size) + memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); + return XATTR_SECURITY_PREFIX_LEN; +} + +static int squashfs_security_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_security_handler = { + .prefix = 
XATTR_SECURITY_PREFIX, + .list = squashfs_security_list, + .get = squashfs_security_get +}; + +static const struct xattr_handler *squashfs_xattr_handler(int type) +{ + if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL)) + /* ignore unrecognised type */ + return NULL; + + switch (type & SQUASHFS_XATTR_PREFIX_MASK) { + case SQUASHFS_XATTR_USER: + return &squashfs_xattr_user_handler; + case SQUASHFS_XATTR_TRUSTED: + return &squashfs_xattr_trusted_handler; + case SQUASHFS_XATTR_SECURITY: + return &squashfs_xattr_security_handler; + default: + /* ignore unrecognised type */ + return NULL; + } +} + +const struct xattr_handler *squashfs_xattr_handlers[] = { + &squashfs_xattr_user_handler, + &squashfs_xattr_trusted_handler, + &squashfs_xattr_security_handler, + NULL +}; + diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h new file mode 100644 index 00000000..c83f5d9e --- /dev/null +++ b/fs/squashfs/xattr.h @@ -0,0 +1,47 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr.h + */ + +#ifdef CONFIG_SQUASHFS_XATTR +extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, + u64 *, int *); +extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, + unsigned int *, unsigned long long *); +#else +static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, + u64 start, u64 *xattr_table_start, int *xattr_ids) +{ + ERROR("Xattrs in filesystem, these will be ignored\n"); + *xattr_table_start = start; + return ERR_PTR(-ENOTSUPP); +} + +static inline int squashfs_xattr_lookup(struct super_block *sb, + unsigned int index, int *count, unsigned int *size, + unsigned long long *xattr) +{ + return 0; +} +#define squashfs_listxattr NULL +#define generic_getxattr NULL +#define squashfs_xattr_handlers NULL +#endif diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c new file mode 100644 index 00000000..c89607d6 --- /dev/null +++ b/fs/squashfs/xattr_id.c @@ -0,0 +1,95 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr_id.c + */ + +/* + * This file implements code to map the 32-bit xattr id stored in the inode + * into the on disk location of the xattr data. + */ + +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "xattr.h" + +/* + * Map xattr id using the xattr id look up table + */ +int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, + int *count, unsigned int *size, unsigned long long *xattr) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_XATTR_BLOCK(index); + int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]); + struct squashfs_xattr_id id; + int err; + + err = squashfs_read_metadata(sb, &id, &start_block, &offset, + sizeof(id)); + if (err < 0) + return err; + + *xattr = le64_to_cpu(id.xattr); + *size = le32_to_cpu(id.size); + *count = le32_to_cpu(id.count); + return 0; +} + + +/* + * Read uncompressed xattr id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, + u64 *xattr_table_start, int *xattr_ids) +{ + unsigned int len; + struct squashfs_xattr_id_table *id_table; + + id_table = squashfs_read_table(sb, start, sizeof(*id_table)); + if (IS_ERR(id_table)) + return (__le64 *) id_table; + + *xattr_table_start = le64_to_cpu(id_table->xattr_table_start); + *xattr_ids = le32_to_cpu(id_table->xattr_ids); + kfree(id_table); + + /* Sanity check values */ + + /* there is always at least one xattr id */ + if (*xattr_ids == 0) + return ERR_PTR(-EINVAL); + + /* xattr_table should be less than start */ + if (*xattr_table_start >= start) + return ERR_PTR(-EINVAL); + + len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); + + TRACE("In read_xattr_index_table, length %d\n", len); + + return squashfs_read_table(sb, start + sizeof(*id_table), len); +} diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c new file mode 100644 index 00000000..1760b7d1 --- /dev/null +++ b/fs/squashfs/xz_wrapper.c @@ -0,0 +1,180 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * xz_wrapper.c + */ + + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" + +struct squashfs_xz { + struct xz_dec *state; + struct xz_buf buf; +}; + +struct comp_opts { + __le32 dictionary_size; + __le32 flags; +}; + +static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff, + int len) +{ + struct comp_opts *comp_opts = buff; + struct squashfs_xz *stream; + int dict_size = msblk->block_size; + int err, n; + + if (comp_opts) { + /* check compressor options are the expected length */ + if (len < sizeof(*comp_opts)) { + err = -EIO; + goto failed; + } + + dict_size = le32_to_cpu(comp_opts->dictionary_size); + + /* the dictionary size should be 2^n or 2^n+2^(n+1) */ + n = ffs(dict_size) - 1; + if (dict_size != (1 << n) && dict_size != (1 << n) + + (1 << (n + 1))) { + err = -EIO; + goto failed; + } + } + + dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE); + + stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) { + err = -ENOMEM; + goto failed; + } + + stream->state = xz_dec_init(XZ_PREALLOC, dict_size); + if (stream->state == NULL) { + kfree(stream); + err = -ENOMEM; + goto failed; + } + + return stream; + +failed: + ERROR("Failed to initialise xz decompressor\n"); + return ERR_PTR(err); +} + + +static void squashfs_xz_free(void *strm) +{ + struct squashfs_xz *stream = strm; + + if (stream) { + xz_dec_end(stream->state); + kfree(stream); + } +} + + +static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + enum xz_ret xz_err; + int avail, total = 0, k = 0, page = 0; + struct squashfs_xz *stream = msblk->stream; + + mutex_lock(&msblk->read_data_mutex); + + xz_dec_reset(stream->state); + stream->buf.in_pos = 0; + stream->buf.in_size = 0; + stream->buf.out_pos = 0; + stream->buf.out_size = PAGE_CACHE_SIZE; + stream->buf.out = buffer[page++]; + + do { + if (stream->buf.in_pos == stream->buf.in_size && k < b) { + avail = min(length, msblk->devblksize - offset); + length -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + stream->buf.in = bh[k]->b_data + offset; + stream->buf.in_size = avail; + stream->buf.in_pos = 0; + offset = 0; + } + + if (stream->buf.out_pos == stream->buf.out_size + && page < pages) { + stream->buf.out = buffer[page++]; + stream->buf.out_pos = 0; + total += PAGE_CACHE_SIZE; + } + + xz_err = xz_dec_run(stream->state, &stream->buf); + + if (stream->buf.in_pos == stream->buf.in_size && k < b) + put_bh(bh[k++]); + } while (xz_err == XZ_OK); + + if (xz_err != XZ_STREAM_END) { + ERROR("xz_dec_run error, data probably corrupt\n"); + goto release_mutex; + } + + if (k < b) { + ERROR("xz_uncompress error, input remaining\n"); + goto release_mutex; + } + + total += stream->buf.out_pos; + mutex_unlock(&msblk->read_data_mutex); + return total; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_xz_comp_ops = { + .init = squashfs_xz_init, + .free = squashfs_xz_free, + .decompress = squashfs_xz_uncompress, + .id = XZ_COMPRESSION, + .name = "xz", + .supported = 1 +}; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c new file mode 100644 index 00000000..55d918fd --- /dev/null +++ b/fs/squashfs/zlib_wrapper.c @@ -0,0 +1,149 @@ +/* + * Squashfs - a compressed read 
only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * zlib_wrapper.c + */ + + +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" + +static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len) +{ + z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->workspace = vmalloc(zlib_inflate_workspacesize()); + if (stream->workspace == NULL) + goto failed; + + return stream; + +failed: + ERROR("Failed to allocate zlib workspace\n"); + kfree(stream); + return ERR_PTR(-ENOMEM); +} + + +static void zlib_free(void *strm) +{ + z_stream *stream = strm; + + if (stream) + vfree(stream->workspace); + kfree(stream); +} + + +static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + int zlib_err, zlib_init = 0; + int k = 0, page = 0; + z_stream *stream = msblk->stream; + + mutex_lock(&msblk->read_data_mutex); + + stream->avail_out = 0; + stream->avail_in = 0; + + do { + if (stream->avail_in == 0 && k < b) { + int avail = min(length, msblk->devblksize - offset); + length -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + stream->next_in = bh[k]->b_data + offset; + stream->avail_in = avail; + offset = 0; + } + + if (stream->avail_out == 0 && page < pages) { + stream->next_out = buffer[page++]; + stream->avail_out = PAGE_CACHE_SIZE; + } + + if (!zlib_init) { + zlib_err = zlib_inflateInit(stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflateInit returned unexpected " + "result 0x%x, srclength %d\n", + zlib_err, srclength); + goto release_mutex; + } + zlib_init = 1; + } + + zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); + + if (stream->avail_in == 0 && k < b) + put_bh(bh[k++]); + } while (zlib_err == Z_OK); + + if (zlib_err != Z_STREAM_END) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + zlib_err = zlib_inflateEnd(stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + if (k < b) { + ERROR("zlib_uncompress error, data remaining\n"); + goto release_mutex; + } + + length = stream->total_out; + mutex_unlock(&msblk->read_data_mutex); + return length; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_zlib_comp_ops = { + .init = zlib_init, + .free = zlib_free, + .decompress = zlib_uncompress, + .id = ZLIB_COMPRESSION, + .name = "zlib", + .supported = 1 +}; + diff --git a/fs/stack.c b/fs/stack.c new file mode 100644 
index 00000000..4a6f7f44 --- /dev/null +++ b/fs/stack.c @@ -0,0 +1,79 @@ +#include +#include +#include + +/* does _NOT_ require i_mutex to be held. + * + * This function cannot be inlined since i_size_{read,write} is rather + * heavy-weight on 32-bit systems + */ +void fsstack_copy_inode_size(struct inode *dst, struct inode *src) +{ + loff_t i_size; + blkcnt_t i_blocks; + + /* + * i_size_read() includes its own seqlocking and protection from + * preemption (see include/linux/fs.h): we need nothing extra for + * that here, and prefer to avoid nesting locks than attempt to keep + * i_size and i_blocks in sync together. + */ + i_size = i_size_read(src); + + /* + * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to + * keep the two halves of i_blocks in sync despite SMP or PREEMPT - + * though stat's generic_fillattr() doesn't bother, and we won't be + * applying quotas (where i_blocks does become important) at the + * upper level. + * + * We don't actually know what locking is used at the lower level; + * but if it's a filesystem that supports quotas, it will be using + * i_lock as in inode_add_bytes(). tmpfs uses other locking, and + * its 32-bit is (just) able to exceed 2TB i_size with the aid of + * holes; but its i_blocks cannot carry into the upper long without + * almost 2TB swap - let's ignore that case. + */ + if (sizeof(i_blocks) > sizeof(long)) + spin_lock(&src->i_lock); + i_blocks = src->i_blocks; + if (sizeof(i_blocks) > sizeof(long)) + spin_unlock(&src->i_lock); + + /* + * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for + * fsstack_copy_inode_size() to hold some lock around + * i_size_write(), otherwise i_size_read() may spin forever (see + * include/linux/fs.h). We don't necessarily hold i_mutex when this + * is called, so take i_lock for that case. + * + * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the + * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock + * for that case too, and do both at once by combining the tests. + * + * There is none of this locking overhead in the 64-bit case. 
+ */ + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_lock(&dst->i_lock); + i_size_write(dst, i_size); + dst->i_blocks = i_blocks; + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_unlock(&dst->i_lock); +} +EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); + +/* copy all attributes */ +void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) +{ + dest->i_mode = src->i_mode; + dest->i_uid = src->i_uid; + dest->i_gid = src->i_gid; + dest->i_rdev = src->i_rdev; + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; + dest->i_blkbits = src->i_blkbits; + dest->i_flags = src->i_flags; + dest->i_nlink = src->i_nlink; +} +EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/fs/stat.c b/fs/stat.c new file mode 100644 index 00000000..02a60614 --- /dev/null +++ b/fs/stat.c @@ -0,0 +1,473 @@ +/* + * linux/fs/stat.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +void generic_fillattr(struct inode *inode, struct kstat *stat) +{ + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->size = i_size_read(inode); + stat->blocks = inode->i_blocks; + stat->blksize = (1 << inode->i_blkbits); +} + +EXPORT_SYMBOL(generic_fillattr); + +int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + int retval; + + retval = security_inode_getattr(mnt, dentry); + if (retval) + return retval; + + if (inode->i_op->getattr) + return inode->i_op->getattr(mnt, dentry, stat); + + generic_fillattr(inode, stat); + return 0; +} + +EXPORT_SYMBOL(vfs_getattr); + +int vfs_fstat(unsigned int fd, struct kstat *stat) +{ + struct file *f = fget(fd); + int error = -EBADF; + + if (f) { + error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); + fput(f); + } + return error; +} +EXPORT_SYMBOL(vfs_fstat); + +int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, + int flag) +{ + struct path path; + int error = -EINVAL; + int lookup_flags = 0; + + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH)) != 0) + goto out; + + if (!(flag & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + if (flag & AT_NO_AUTOMOUNT) + lookup_flags |= LOOKUP_NO_AUTOMOUNT; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); +out: + return error; +} +EXPORT_SYMBOL(vfs_fstatat); + +int vfs_stat(const char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, 0); +} +EXPORT_SYMBOL(vfs_stat); + +int vfs_lstat(const char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); +} +EXPORT_SYMBOL(vfs_lstat); + + +#ifdef __ARCH_WANT_OLD_STAT + +/* + * For backward compatibility? Maybe this should be moved + * into arch/i386 instead? 
+ */ +static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf) +{ + static int warncount = 5; + struct __old_kernel_stat tmp; + + if (warncount > 0) { + warncount--; + printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n", + current->comm); + } else if (warncount < 0) { + /* it's laughable, but... */ + warncount = 0; + } + + memset(&tmp, 0, sizeof(struct __old_kernel_stat)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, stat->uid); + SET_GID(tmp.st_gid, stat->gid); + tmp.st_rdev = old_encode_dev(stat->rdev); +#if BITS_PER_LONG == 32 + if (stat->size > MAX_NON_LFS) + return -EOVERFLOW; +#endif + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_ctime = stat->ctime.tv_sec; + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; +} + +SYSCALL_DEFINE2(stat, const char __user *, filename, + struct __old_kernel_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_stat(filename, &stat); + if (error) + return error; + + return cp_old_stat(&stat, statbuf); +} + +SYSCALL_DEFINE2(lstat, const char __user *, filename, + struct __old_kernel_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + + return cp_old_stat(&stat, statbuf); +} + +SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_old_stat(&stat, statbuf); + + return error; +} + +#endif /* __ARCH_WANT_OLD_STAT */ + +static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) +{ + struct stat tmp; + +#if BITS_PER_LONG == 32 + if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + return -EOVERFLOW; +#else + if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) + return -EOVERFLOW; +#endif + + memset(&tmp, 0, sizeof(tmp)); +#if BITS_PER_LONG == 32 + tmp.st_dev = old_encode_dev(stat->dev); +#else + tmp.st_dev = new_encode_dev(stat->dev); +#endif + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, stat->uid); + SET_GID(tmp.st_gid, stat->gid); +#if BITS_PER_LONG == 32 + tmp.st_rdev = old_encode_dev(stat->rdev); +#else + tmp.st_rdev = new_encode_dev(stat->rdev); +#endif +#if BITS_PER_LONG == 32 + if (stat->size > MAX_NON_LFS) + return -EOVERFLOW; +#endif + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_ctime = stat->ctime.tv_sec; +#ifdef STAT_HAVE_NSEC + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; +#endif + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? 
-EFAULT : 0; +} + +SYSCALL_DEFINE2(newstat, const char __user *, filename, + struct stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_stat(filename, &stat); + + if (error) + return error; + return cp_new_stat(&stat, statbuf); +} + +SYSCALL_DEFINE2(newlstat, const char __user *, filename, + struct stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + + return cp_new_stat(&stat, statbuf); +} + +#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) +SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, + struct stat __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat(&stat, statbuf); +} +#endif + +SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_new_stat(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, + char __user *, buf, int, bufsiz) +{ + struct path path; + int error; + int empty = 0; + + if (bufsiz <= 0) + return -EINVAL; + + error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); + if (!error) { + struct inode *inode = path.dentry->d_inode; + + error = empty ? -ENOENT : -EINVAL; + if (inode->i_op->readlink) { + error = security_inode_readlink(path.dentry); + if (!error) { + touch_atime(path.mnt, path.dentry); + error = inode->i_op->readlink(path.dentry, + buf, bufsiz); + } + } + path_put(&path); + } + return error; +} + +SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, + int, bufsiz) +{ + return sys_readlinkat(AT_FDCWD, path, buf, bufsiz); +} + + +/* ---------- LFS-64 ----------- */ +#ifdef __ARCH_WANT_STAT64 + +static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) +{ + struct stat64 tmp; + + memset(&tmp, 0, sizeof(struct stat64)); +#ifdef CONFIG_MIPS + /* mips has weird padding, so we don't get 64 bits there */ + if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) + return -EOVERFLOW; + tmp.st_dev = new_encode_dev(stat->dev); + tmp.st_rdev = new_encode_dev(stat->rdev); +#else + tmp.st_dev = huge_encode_dev(stat->dev); + tmp.st_rdev = huge_encode_dev(stat->rdev); +#endif + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; +#ifdef STAT64_HAS_BROKEN_ST_INO + tmp.__st_ino = stat->ino; +#endif + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + tmp.st_uid = stat->uid; + tmp.st_gid = stat->gid; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_size = stat->size; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? 
-EFAULT : 0; +} + +SYSCALL_DEFINE2(stat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int error = vfs_stat(filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE2(lstat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int error = vfs_lstat(filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + + return error; +} + +SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, + struct stat64 __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat64(&stat, statbuf); +} +#endif /* __ARCH_WANT_STAT64 */ + +/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ +void __inode_add_bytes(struct inode *inode, loff_t bytes) +{ + inode->i_blocks += bytes >> 9; + bytes &= 511; + inode->i_bytes += bytes; + if (inode->i_bytes >= 512) { + inode->i_blocks++; + inode->i_bytes -= 512; + } +} + +void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, bytes); + spin_unlock(&inode->i_lock); +} + +EXPORT_SYMBOL(inode_add_bytes); + +void inode_sub_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + inode->i_blocks -= bytes >> 9; + bytes &= 511; + if (inode->i_bytes < bytes) { + inode->i_blocks--; + inode->i_bytes += 512; + } + inode->i_bytes -= bytes; + spin_unlock(&inode->i_lock); +} + +EXPORT_SYMBOL(inode_sub_bytes); + +loff_t inode_get_bytes(struct inode *inode) +{ + loff_t ret; + + spin_lock(&inode->i_lock); + ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; + spin_unlock(&inode->i_lock); + return ret; +} + +EXPORT_SYMBOL(inode_get_bytes); + +void inode_set_bytes(struct inode *inode, loff_t bytes) +{ + /* Caller is here responsible for sufficient locking + * (ie. 
inode->i_lock) */ + inode->i_blocks = bytes >> 9; + inode->i_bytes = bytes & 511; +} + +EXPORT_SYMBOL(inode_set_bytes); diff --git a/fs/statfs.c b/fs/statfs.c new file mode 100644 index 00000000..9cf04a11 --- /dev/null +++ b/fs/statfs.c @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int flags_by_mnt(int mnt_flags) +{ + int flags = 0; + + if (mnt_flags & MNT_READONLY) + flags |= ST_RDONLY; + if (mnt_flags & MNT_NOSUID) + flags |= ST_NOSUID; + if (mnt_flags & MNT_NODEV) + flags |= ST_NODEV; + if (mnt_flags & MNT_NOEXEC) + flags |= ST_NOEXEC; + if (mnt_flags & MNT_NOATIME) + flags |= ST_NOATIME; + if (mnt_flags & MNT_NODIRATIME) + flags |= ST_NODIRATIME; + if (mnt_flags & MNT_RELATIME) + flags |= ST_RELATIME; + return flags; +} + +static int flags_by_sb(int s_flags) +{ + int flags = 0; + if (s_flags & MS_SYNCHRONOUS) + flags |= ST_SYNCHRONOUS; + if (s_flags & MS_MANDLOCK) + flags |= ST_MANDLOCK; + return flags; +} + +static int calculate_f_flags(struct vfsmount *mnt) +{ + return ST_VALID | flags_by_mnt(mnt->mnt_flags) | + flags_by_sb(mnt->mnt_sb->s_flags); +} + +int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) +{ + int retval; + + if (!dentry->d_sb->s_op->statfs) + return -ENOSYS; + + memset(buf, 0, sizeof(*buf)); + retval = security_sb_statfs(dentry); + if (retval) + return retval; + retval = dentry->d_sb->s_op->statfs(dentry, buf); + if (retval == 0 && buf->f_frsize == 0) + buf->f_frsize = buf->f_bsize; + return retval; +} + +int vfs_statfs(struct path *path, struct kstatfs *buf) +{ + int error; + + error = statfs_by_dentry(path->dentry, buf); + if (!error) + buf->f_flags = calculate_f_flags(path->mnt); + return error; +} +EXPORT_SYMBOL(vfs_statfs); + +int user_statfs(const char __user *pathname, struct kstatfs *st) +{ + struct path path; + int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + if (!error) { + error = vfs_statfs(&path, st); + path_put(&path); + } + return error; +} + +int fd_statfs(int fd, struct kstatfs *st) +{ + struct file *file = fget(fd); + int error = -EBADF; + if (file) { + error = vfs_statfs(&file->f_path, st); + fput(file); + } + return error; +} + +static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) +{ + struct statfs buf; + + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); + else { + if (sizeof buf.f_blocks == 4) { + if ((st->f_blocks | st->f_bfree | st->f_bavail | + st->f_bsize | st->f_frsize) & + 0xffffffff00000000ULL) + return -EOVERFLOW; + /* + * f_files and f_ffree may be -1; it's okay to stuff + * that into 32 bits + */ + if (st->f_files != -1 && + (st->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (st->f_ffree != -1 && + (st->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); + } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; + return 0; +} + +static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) +{ + struct statfs64 buf; + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); + else { + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + 
buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); + } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) +{ + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_native(&st, buf); + return error; +} + +SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) +{ + struct kstatfs st; + int error; + if (sz != sizeof(*buf)) + return -EINVAL; + error = user_statfs(pathname, &st); + if (!error) + error = do_statfs64(&st, buf); + return error; +} + +SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) +{ + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_native(&st, buf); + return error; +} + +SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) +{ + struct kstatfs st; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &st); + if (!error) + error = do_statfs64(&st, buf); + return error; +} + +SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +{ + struct super_block *s; + struct ustat tmp; + struct kstatfs sbuf; + int err; + + s = user_get_super(new_decode_dev(dev)); + if (!s) + return -EINVAL; + + err = statfs_by_dentry(s->s_root, &sbuf); + drop_super(s); + if (err) + return err; + + memset(&tmp,0,sizeof(struct ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + + return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0; +} diff --git a/fs/super.c b/fs/super.c new file mode 100644 index 00000000..caf4dfa2 --- /dev/null +++ b/fs/super.c @@ -0,0 +1,1061 @@ +/* + * linux/fs/super.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * super.c contains code to handle: - mount structures + * - super-block tables + * - filesystem drivers list + * - mount system call + * - umount system call + * - ustat system call + * + * GK 2/5/95 - Changed to support mounting the root fs via NFS + * + * Added kerneld support: Jacques Gelinas and Bjorn Ekwall + * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 + * Added options to /proc/mounts: + * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. + * Added devfs support: Richard Gooch , 13-JAN-1998 + * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 + */ + +#include +#include +#include +#include +#include +#include +#include /* for the emergency remount stuff */ +#include +#include +#include +#include +#include +#include "internal.h" + + +LIST_HEAD(super_blocks); +DEFINE_SPINLOCK(sb_lock); + +/** + * alloc_super - create new superblock + * @type: filesystem type superblock should belong to + * + * Allocates and initializes a new &struct super_block. alloc_super() + * returns a pointer new superblock or %NULL if allocation had failed. 
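+ *
+ * The new superblock is returned with s_umount held for write and a
+ * single active reference; sget() below, its only caller here, either
+ * publishes it on the super_blocks list or frees it again with
+ * destroy_super().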
+ */
+static struct super_block *alloc_super(struct file_system_type *type)
+{
+ struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
+ static const struct super_operations default_op;
+
+ if (s) {
+ if (security_sb_alloc(s)) {
+ kfree(s);
+ s = NULL;
+ goto out;
+ }
+#ifdef CONFIG_SMP
+ s->s_files = alloc_percpu(struct list_head);
+ if (!s->s_files) {
+ security_sb_free(s);
+ kfree(s);
+ s = NULL;
+ goto out;
+ } else {
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+ }
+#else
+ INIT_LIST_HEAD(&s->s_files);
+#endif
+ s->s_bdi = &default_backing_dev_info;
+ INIT_LIST_HEAD(&s->s_instances);
+ INIT_HLIST_BL_HEAD(&s->s_anon);
+ INIT_LIST_HEAD(&s->s_inodes);
+ INIT_LIST_HEAD(&s->s_dentry_lru);
+ init_rwsem(&s->s_umount);
+ mutex_init(&s->s_lock);
+ lockdep_set_class(&s->s_umount, &type->s_umount_key);
+ /*
+ * The locking rules for s_lock are up to the
+ * filesystem. For example ext3fs has different
+ * lock ordering than usbfs:
+ */
+ lockdep_set_class(&s->s_lock, &type->s_lock_key);
+ /*
+ * sget() can have s_umount recursion.
+ *
+ * When it cannot find a suitable sb, it allocates a new
+ * one (this one), and tries again to find a suitable old
+ * one.
+ *
+ * In case that succeeds, it will acquire the s_umount
+ * lock of the old one. Since these are clearly distinct
+ * locks, and this object isn't exposed yet, there's no
+ * risk of deadlocks.
+ *
+ * Annotate this by putting this lock in a different
+ * subclass.
+ */
+ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
+ s->s_count = 1;
+ atomic_set(&s->s_active, 1);
+ mutex_init(&s->s_vfs_rename_mutex);
+ lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
+ mutex_init(&s->s_dquot.dqio_mutex);
+ mutex_init(&s->s_dquot.dqonoff_mutex);
+ init_rwsem(&s->s_dquot.dqptr_sem);
+ init_waitqueue_head(&s->s_wait_unfrozen);
+ s->s_maxbytes = MAX_NON_LFS;
+ s->s_op = &default_op;
+ s->s_time_gran = 1000000000;
+ s->cleancache_poolid = -1;
+ }
+out:
+ return s;
+}
+
+/**
+ * destroy_super - frees a superblock
+ * @s: superblock to free
+ *
+ * Frees a superblock.
+ */
+static inline void destroy_super(struct super_block *s)
+{
+#ifdef CONFIG_SMP
+ free_percpu(s->s_files);
+#endif
+ security_sb_free(s);
+ kfree(s->s_subtype);
+ kfree(s->s_options);
+ kfree(s);
+}
+
+/* Superblock refcounting */
+
+/*
+ * Drop a superblock's refcount. The caller must hold sb_lock.
+ */
+void __put_super(struct super_block *sb)
+{
+ if (!--sb->s_count) {
+ list_del_init(&sb->s_list);
+ destroy_super(sb);
+ }
+}
+
+/**
+ * put_super - drop a temporary reference to superblock
+ * @sb: superblock in question
+ *
+ * Drops a temporary reference, frees superblock if there are no
+ * references left.
+ */
+void put_super(struct super_block *sb)
+{
+ spin_lock(&sb_lock);
+ __put_super(sb);
+ spin_unlock(&sb_lock);
+}
+
+
+/**
+ * deactivate_locked_super - drop an active reference to superblock
+ * @s: superblock to deactivate
+ *
+ * Drops an active reference to superblock, converting it into a temporary
+ * one if there are no other active references left. In that case we
+ * tell fs driver to shut it down and drop the temporary reference we
+ * had just acquired.
+ *
+ * Caller holds exclusive lock on superblock; that lock is released.
+ */ +void deactivate_locked_super(struct super_block *s) +{ + struct file_system_type *fs = s->s_type; + if (atomic_dec_and_test(&s->s_active)) { + cleancache_flush_fs(s); + fs->kill_sb(s); + /* + * We need to call rcu_barrier so all the delayed rcu free + * inodes are flushed before we release the fs module. + */ + rcu_barrier(); + put_filesystem(fs); + put_super(s); + } else { + up_write(&s->s_umount); + } +} + +EXPORT_SYMBOL(deactivate_locked_super); + +/** + * deactivate_super - drop an active reference to superblock + * @s: superblock to deactivate + * + * Variant of deactivate_locked_super(), except that superblock is *not* + * locked by caller. If we are going to drop the final active reference, + * lock will be acquired prior to that. + */ +void deactivate_super(struct super_block *s) +{ + if (!atomic_add_unless(&s->s_active, -1, 1)) { + down_write(&s->s_umount); + deactivate_locked_super(s); + } +} + +EXPORT_SYMBOL(deactivate_super); + +/** + * grab_super - acquire an active reference + * @s: reference we are trying to make active + * + * Tries to acquire an active reference. grab_super() is used when we + * had just found a superblock in super_blocks or fs_type->fs_supers + * and want to turn it into a full-blown active reference. grab_super() + * is called with sb_lock held and drops it. Returns 1 in case of + * success, 0 if we had failed (superblock contents was already dead or + * dying when grab_super() had been called). + */ +static int grab_super(struct super_block *s) __releases(sb_lock) +{ + if (atomic_inc_not_zero(&s->s_active)) { + spin_unlock(&sb_lock); + return 1; + } + /* it's going away */ + s->s_count++; + spin_unlock(&sb_lock); + /* wait for it to die */ + down_write(&s->s_umount); + up_write(&s->s_umount); + put_super(s); + return 0; +} + +/* + * Superblock locking. We really ought to get rid of these two. + */ +void lock_super(struct super_block * sb) +{ + get_fs_excl(); + mutex_lock(&sb->s_lock); +} + +void unlock_super(struct super_block * sb) +{ + put_fs_excl(); + mutex_unlock(&sb->s_lock); +} + +EXPORT_SYMBOL(lock_super); +EXPORT_SYMBOL(unlock_super); + +/** + * generic_shutdown_super - common helper for ->kill_sb() + * @sb: superblock to kill + * + * generic_shutdown_super() does all fs-independent work on superblock + * shutdown. Typical ->kill_sb() should pick all fs-specific objects + * that need destruction out of superblock, call generic_shutdown_super() + * and release aforementioned objects. Note: dentries and inodes _are_ + * taken care of and do not need specific handling. + * + * Upon calling this function, the filesystem may no longer alter or + * rearrange the set of dentries belonging to this super_block, nor may it + * change the attachments of dentries to inodes. + */ +void generic_shutdown_super(struct super_block *sb) +{ + const struct super_operations *sop = sb->s_op; + + + if (sb->s_root) { + shrink_dcache_for_umount(sb); + sync_filesystem(sb); + get_fs_excl(); + sb->s_flags &= ~MS_ACTIVE; + + fsnotify_unmount_inodes(&sb->s_inodes); + + evict_inodes(sb); + + if (sop->put_super) + sop->put_super(sb); + + if (!list_empty(&sb->s_inodes)) { + printk("VFS: Busy inodes after unmount of %s. " + "Self-destruct in 5 seconds. 
Have a nice day...\n", + sb->s_id); + } + put_fs_excl(); + } + spin_lock(&sb_lock); + /* should be initialized for __put_super_and_need_restart() */ + list_del_init(&sb->s_instances); + spin_unlock(&sb_lock); + up_write(&sb->s_umount); +} + +EXPORT_SYMBOL(generic_shutdown_super); + +/** + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @data: argument to each of them + */ +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + void *data) +{ + struct super_block *s = NULL; + struct super_block *old; + int err; + +retry: + spin_lock(&sb_lock); + if (test) { + list_for_each_entry(old, &type->fs_supers, s_instances) { + if (!test(old, data)) + continue; + if (!grab_super(old)) + goto retry; + if (s) { + up_write(&s->s_umount); + destroy_super(s); + s = NULL; + } + down_write(&old->s_umount); + if (unlikely(!(old->s_flags & MS_BORN))) { + deactivate_locked_super(old); + goto retry; + } + return old; + } + } + if (!s) { + spin_unlock(&sb_lock); + s = alloc_super(type); + if (!s) + return ERR_PTR(-ENOMEM); + goto retry; + } + + err = set(s, data); + if (err) { + spin_unlock(&sb_lock); + up_write(&s->s_umount); + destroy_super(s); + return ERR_PTR(err); + } + s->s_type = type; + strlcpy(s->s_id, type->name, sizeof(s->s_id)); + list_add_tail(&s->s_list, &super_blocks); + list_add(&s->s_instances, &type->fs_supers); + spin_unlock(&sb_lock); + get_filesystem(type); + return s; +} + +EXPORT_SYMBOL(sget); + +void drop_super(struct super_block *sb) +{ + up_read(&sb->s_umount); + put_super(sb); +} + +EXPORT_SYMBOL(drop_super); + +/** + * sync_supers - helper for periodic superblock writeback + * + * Call the write_super method if present on all dirty superblocks in + * the system. This is for the periodic writeback used by most older + * filesystems. For data integrity superblock writeback use + * sync_filesystems() instead. + * + * Note: check the dirty flag before waiting, so we don't + * hold up the sync while mounting a device. (The newly + * mounted device won't need syncing.) + */ +void sync_supers(void) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_op->write_super && sb->s_dirt) { + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root && sb->s_dirt) + sb->s_op->write_super(sb); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +/** + * iterate_supers - call function for all active superblocks + * @f: function to call + * @arg: argument to pass to it + * + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. 
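+ *
+ * For example, sync_filesystems() in fs/sync.c below uses this to write
+ * back every writable superblock:
+ *
+ *	iterate_supers(sync_one_sb, &wait);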
+ */ +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root) + f(sb, arg); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +/** + * get_super - get the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. %NULL is returned if no match is found. + */ + +struct super_block *get_super(struct block_device *bdev) +{ + struct super_block *sb; + + if (!bdev) + return NULL; + + spin_lock(&sb_lock); +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_bdev == bdev) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + /* still alive? */ + if (sb->s_root) + return sb; + up_read(&sb->s_umount); + /* nope, got unmounted */ + spin_lock(&sb_lock); + __put_super(sb); + goto rescan; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +EXPORT_SYMBOL(get_super); + +/** + * get_active_super - get an active reference to the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. Returns the superblock with an active + * reference or %NULL if none was found. + */ +struct super_block *get_active_super(struct block_device *bdev) +{ + struct super_block *sb; + + if (!bdev) + return NULL; + +restart: + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_bdev == bdev) { + if (grab_super(sb)) /* drops sb_lock */ + return sb; + else + goto restart; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +struct super_block *user_get_super(dev_t dev) +{ + struct super_block *sb; + + spin_lock(&sb_lock); +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + if (sb->s_dev == dev) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + /* still alive? */ + if (sb->s_root) + return sb; + up_read(&sb->s_umount); + /* nope, got unmounted */ + spin_lock(&sb_lock); + __put_super(sb); + goto rescan; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +/** + * do_remount_sb - asks filesystem to change mount options. + * @sb: superblock in question + * @flags: numeric part of options + * @data: the rest of options + * @force: whether or not to force the change + * + * Alters the mount options of a mounted file system. 
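+ *
+ * For example, the emergency remount path below forces each writable,
+ * block-backed filesystem read-only with:
+ *
+ *	do_remount_sb(sb, MS_RDONLY, NULL, 1);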
+ */ +int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +{ + int retval; + int remount_ro; + + if (sb->s_frozen != SB_UNFROZEN) + return -EBUSY; + +#ifdef CONFIG_BLOCK + if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) + return -EACCES; +#endif + + if (flags & MS_RDONLY) + acct_auto_close(sb); + shrink_dcache_sb(sb); + sync_filesystem(sb); + + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + + /* If we are remounting RDONLY and current sb is read/write, + make sure there are no rw files opened */ + if (remount_ro) { + if (force) + mark_files_ro(sb); + else if (!fs_may_remount_ro(sb)) + return -EBUSY; + } + + if (sb->s_op->remount_fs) { + retval = sb->s_op->remount_fs(sb, &flags, data); + if (retval) + return retval; + } + sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + + /* + * Some filesystems modify their metadata via some other path than the + * bdev buffer cache (eg. use a private mapping, or directories in + * pagecache, etc). Also file data modifications go via their own + * mappings. So If we try to mount readonly then copy the filesystem + * from bdev, we could get stale data, so invalidate it to give a best + * effort at coherency. + */ + if (remount_ro && sb->s_bdev) + invalidate_bdev(sb->s_bdev); + return 0; +} + +static void do_emergency_remount(struct work_struct *work) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + down_write(&sb->s_umount); + if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { + /* + * What lock protects sb->s_flags?? + */ + do_remount_sb(sb, MS_RDONLY, NULL, 1); + } + up_write(&sb->s_umount); + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); + kfree(work); + printk("Emergency Remount complete\n"); +} + +void emergency_remount(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_emergency_remount); + schedule_work(work); + } +} + +/* + * Unnamed block devices are dummy devices used by virtual + * filesystems which don't use real block-devices. -- jrs + */ + +static DEFINE_IDA(unnamed_dev_ida); +static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ +static int unnamed_dev_start = 0; /* don't bother trying below it */ + +int set_anon_super(struct super_block *s, void *data) +{ + int dev; + int error; + + retry: + if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0) + return -ENOMEM; + spin_lock(&unnamed_dev_lock); + error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev); + if (!error) + unnamed_dev_start = dev + 1; + spin_unlock(&unnamed_dev_lock); + if (error == -EAGAIN) + /* We raced and lost with another CPU. 
*/ + goto retry; + else if (error) + return -EAGAIN; + + if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + spin_lock(&unnamed_dev_lock); + ida_remove(&unnamed_dev_ida, dev); + if (unnamed_dev_start > dev) + unnamed_dev_start = dev; + spin_unlock(&unnamed_dev_lock); + return -EMFILE; + } + s->s_dev = MKDEV(0, dev & MINORMASK); + s->s_bdi = &noop_backing_dev_info; + return 0; +} + +EXPORT_SYMBOL(set_anon_super); + +void kill_anon_super(struct super_block *sb) +{ + int slot = MINOR(sb->s_dev); + + generic_shutdown_super(sb); + spin_lock(&unnamed_dev_lock); + ida_remove(&unnamed_dev_ida, slot); + if (slot < unnamed_dev_start) + unnamed_dev_start = slot; + spin_unlock(&unnamed_dev_lock); +} + +EXPORT_SYMBOL(kill_anon_super); + +void kill_litter_super(struct super_block *sb) +{ + if (sb->s_root) + d_genocide(sb->s_root); + kill_anon_super(sb); +} + +EXPORT_SYMBOL(kill_litter_super); + +static int ns_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int ns_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +struct dentry *mount_ns(struct file_system_type *fs_type, int flags, + void *data, int (*fill_super)(struct super_block *, void *, int)) +{ + struct super_block *sb; + + sb = sget(fs_type, ns_test_super, ns_set_super, data); + if (IS_ERR(sb)) + return ERR_CAST(sb); + + if (!sb->s_root) { + int err; + sb->s_flags = flags; + err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +EXPORT_SYMBOL(mount_ns); + +#ifdef CONFIG_BLOCK +static int set_bdev_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + return 0; +} + +static int test_bdev_super(struct super_block *s, void *data) +{ + return (void *)s->s_bdev == data; +} + +struct dentry *mount_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } + s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + error = -EBUSY; + goto error_bdev; + } + + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. 
+ */ + up_write(&s->s_umount); + blkdev_put(bdev, mode); + down_write(&s->s_umount); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags | MS_NOSEC; + s->s_mode = mode; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + goto error; + } + + s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; + } + + return dget(s->s_root); + +error_s: + error = PTR_ERR(s); +error_bdev: + blkdev_put(bdev, mode); +error: + return ERR_PTR(error); +} +EXPORT_SYMBOL(mount_bdev); + +void kill_block_super(struct super_block *sb) +{ + struct block_device *bdev = sb->s_bdev; + fmode_t mode = sb->s_mode; + + bdev->bd_super = NULL; + generic_shutdown_super(sb); + sync_blockdev(bdev); + WARN_ON_ONCE(!(mode & FMODE_EXCL)); + blkdev_put(bdev, mode | FMODE_EXCL); +} + +EXPORT_SYMBOL(kill_block_super); +#endif + +struct dentry *mount_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + int error; + struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + + if (IS_ERR(s)) + return ERR_CAST(s); + + s->s_flags = flags; + + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + s->s_flags |= MS_ACTIVE; + return dget(s->s_root); +} +EXPORT_SYMBOL(mount_nodev); + +static int compare_single(struct super_block *s, void *p) +{ + return 1; +} + +struct dentry *mount_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + struct super_block *s; + int error; + + s = sget(fs_type, compare_single, set_anon_super, NULL); + if (IS_ERR(s)) + return ERR_CAST(s); + if (!s->s_root) { + s->s_flags = flags; + error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(s); + return ERR_PTR(error); + } + s->s_flags |= MS_ACTIVE; + } else { + do_remount_sb(s, flags, data, 0); + } + return dget(s->s_root); +} +EXPORT_SYMBOL(mount_single); + +struct dentry * +mount_fs(struct file_system_type *type, int flags, const char *name, void *data) +{ + struct dentry *root; + struct super_block *sb; + char *secdata = NULL; + int error = -ENOMEM; + + if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { + secdata = alloc_secdata(); + if (!secdata) + goto out; + + error = security_sb_copy_data(data, secdata); + if (error) + goto out_free_secdata; + } + + root = type->mount(type, flags, name, data); + if (IS_ERR(root)) { + error = PTR_ERR(root); + goto out_free_secdata; + } + sb = root->d_sb; + BUG_ON(!sb); + WARN_ON(!sb->s_bdi); + WARN_ON(sb->s_bdi == &default_backing_dev_info); + sb->s_flags |= MS_BORN; + + error = security_sb_kern_mount(sb, flags, secdata); + if (error) + goto out_sb; + + /* + * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE + * but s_maxbytes was an unsigned long long for many releases. Throw + * this warning for a little while to try and catch filesystems that + * violate this rule. 
+ */ + WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " + "negative value (%lld)\n", type->name, sb->s_maxbytes); + + up_write(&sb->s_umount); + free_secdata(secdata); + return root; +out_sb: + dput(root); + deactivate_locked_super(sb); +out_free_secdata: + free_secdata(secdata); +out: + return ERR_PTR(error); +} + +/** + * freeze_super - lock the filesystem and force it into a consistent state + * @sb: the super to lock + * + * Syncs the super to make sure the filesystem is consistent and calls the fs's + * freeze_fs. Subsequent calls to this without first thawing the fs will return + * -EBUSY. + */ +int freeze_super(struct super_block *sb) +{ + int ret; + + atomic_inc(&sb->s_active); + down_write(&sb->s_umount); + if (sb->s_frozen) { + deactivate_locked_super(sb); + return -EBUSY; + } + + if (sb->s_flags & MS_RDONLY) { + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + up_write(&sb->s_umount); + return 0; + } + + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + sync_filesystem(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + if (sb->s_op->freeze_fs) { + ret = sb->s_op->freeze_fs(sb); + if (ret) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + deactivate_locked_super(sb); + return ret; + } + } + up_write(&sb->s_umount); + return 0; +} +EXPORT_SYMBOL(freeze_super); + +/** + * thaw_super -- unlock filesystem + * @sb: the super to thaw + * + * Unlocks the filesystem and marks it writeable again after freeze_super(). + */ +int thaw_super(struct super_block *sb) +{ + int error; + + down_write(&sb->s_umount); + if (sb->s_frozen == SB_UNFROZEN) { + up_write(&sb->s_umount); + return -EINVAL; + } + + if (sb->s_flags & MS_RDONLY) + goto out; + + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + up_write(&sb->s_umount); + return error; + } + } + +out: + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + deactivate_locked_super(sb); + + return 0; +} +EXPORT_SYMBOL(thaw_super); diff --git a/fs/sync.c b/fs/sync.c new file mode 100644 index 00000000..c38ec163 --- /dev/null +++ b/fs/sync.c @@ -0,0 +1,402 @@ +/* + * High-level sync()-related operations + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ + SYNC_FILE_RANGE_WAIT_AFTER) + +/* + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to + * submit IO for these buffers via __sync_blockdev(). This also speeds up the + * wait == 1 case since in that case write_inode() functions do + * sync_dirty_buffer() and thus effectively write one block at a time. 
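+ *
+ * sync_filesystem() below therefore makes two passes over the same
+ * superblock:
+ *
+ *	__sync_filesystem(sb, 0);	(start writeback, don't wait)
+ *	__sync_filesystem(sb, 1);	(wait for everything to hit disk)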
+ */ +static int __sync_filesystem(struct super_block *sb, int wait) +{ + /* + * This should be safe, as we require bdi backing to actually + * write out data in the first place + */ + if (sb->s_bdi == &noop_backing_dev_info) + return 0; + + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, -1, wait); + + if (wait) + sync_inodes_sb(sb); + else + writeback_inodes_sb(sb); + + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, wait); + return __sync_blockdev(sb->s_bdev, wait); +} + +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int sync_filesystem(struct super_block *sb) +{ + int ret; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + /* + * No point in syncing out anything if the filesystem is read-only. + */ + if (sb->s_flags & MS_RDONLY) + return 0; + + ret = __sync_filesystem(sb, 0); + if (ret < 0) + return ret; + return __sync_filesystem(sb, 1); +} +EXPORT_SYMBOL_GPL(sync_filesystem); + +static void sync_one_sb(struct super_block *sb, void *arg) +{ + if (!(sb->s_flags & MS_RDONLY)) + __sync_filesystem(sb, *(int *)arg); +} +/* + * Sync all the data for all the filesystems (called by sys_sync() and + * emergency sync) + */ +static void sync_filesystems(int wait) +{ + iterate_supers(sync_one_sb, &wait); +} + +/* + * sync everything. Start out by waking pdflush, because that writes back + * all queues in parallel. + */ +SYSCALL_DEFINE0(sync) +{ + wakeup_flusher_threads(0); + sync_filesystems(0); + sync_filesystems(1); + if (unlikely(laptop_mode)) + laptop_sync_completion(); + return 0; +} + +static void do_sync_work(struct work_struct *work) +{ + /* + * Sync twice to reduce the possibility we skipped some inodes / pages + * because they were temporarily locked + */ + sync_filesystems(0); + sync_filesystems(0); + printk("Emergency Sync complete\n"); + kfree(work); +} + +void emergency_sync(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_sync_work); + schedule_work(work); + } +} + +/* + * sync a single super + */ +SYSCALL_DEFINE1(syncfs, int, fd) +{ + struct file *file; + struct super_block *sb; + int ret; + int fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + sb = file->f_dentry->d_sb; + + down_read(&sb->s_umount); + ret = sync_filesystem(sb); + up_read(&sb->s_umount); + + fput_light(file, fput_needed); + return ret; +} + +/** + * vfs_fsync_range - helper to sync a range of data & metadata to disk + * @file: file to sync + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) + * @datasync: perform only datasync + * + * Write back data in range @start..@end and metadata for @file to disk. If + * @datasync is set only metadata needed to access modified file data is + * written. + */ +int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct address_space *mapping = file->f_mapping; + int err, ret; + + if (!file->f_op || !file->f_op->fsync) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(mapping, start, end); + + /* + * We need to protect against concurrent writers, which could cause + * livelocks in fsync_buffers_list(). 
+ */ + mutex_lock(&mapping->host->i_mutex); + err = file->f_op->fsync(file, datasync); + if (!ret) + ret = err; + mutex_unlock(&mapping->host->i_mutex); + +out: + return ret; +} +EXPORT_SYMBOL(vfs_fsync_range); + +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @datasync: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + */ +int vfs_fsync(struct file *file, int datasync) +{ + return vfs_fsync_range(file, 0, LLONG_MAX, datasync); +} +EXPORT_SYMBOL(vfs_fsync); + +static int do_fsync(unsigned int fd, int datasync) +{ + struct file *file; + int ret = -EBADF; + + file = fget(fd); + if (file) { + ret = vfs_fsync(file, datasync); + fput(file); + } + return ret; +} + +SYSCALL_DEFINE1(fsync, unsigned int, fd) +{ + return do_fsync(fd, 0); +} + +SYSCALL_DEFINE1(fdatasync, unsigned int, fd) +{ + return do_fsync(fd, 1); +} + +/** + * generic_write_sync - perform syncing after a write if file / inode is sync + * @file: file to which the write happened + * @pos: offset where the write started + * @count: length of the write + * + * This is just a simple wrapper about our general syncing function. + */ +int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, pos, pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); +} +EXPORT_SYMBOL(generic_write_sync); + +/* + * sys_sync_file_range() permits finely controlled syncing over a segment of + * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is + * zero then sys_sync_file_range() will operate from offset out to EOF. + * + * The flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range + * before performing the write. + * + * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the + * range which are not presently under writeback. Note that this may block for + * significant periods due to exhaustion of disk request structures. + * + * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range + * after performing the write. + * + * Useful combinations of the flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages + * in the range which were dirty on entry to sys_sync_file_range() are placed + * under writeout. This is a start-write-for-data-integrity operation. + * + * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which + * are not presently under writeout. This is an asynchronous flush-to-disk + * operation. Not suitable for data integrity operations. + * + * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for + * completion of writeout of all pages in the range. This will be used after an + * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait + * for that operation to complete and to return the result. + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: + * a traditional sync() operation. This is a write-for-data-integrity operation + * which will ensure that all pages in the range which were dirty on entry to + * sys_sync_file_range() are committed to disk. 
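+ *
+ * A typical data-integrity call from userspace might therefore look like
+ * (illustrative only):
+ *
+ *	sync_file_range(fd, offset, nbytes,
+ *			SYNC_FILE_RANGE_WAIT_BEFORE |
+ *			SYNC_FILE_RANGE_WRITE |
+ *			SYNC_FILE_RANGE_WAIT_AFTER);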
+ * + * + * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any + * I/O errors or ENOSPC conditions and will return those to the caller, after + * clearing the EIO and ENOSPC flags in the address_space. + * + * It should be noted that none of these operations write out the file's + * metadata. So unless the application is strictly performing overwrites of + * already-instantiated disk blocks, there are no guarantees here that the data + * will be available after a crash. + */ +SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, + unsigned int flags) +{ + int ret; + struct file *file; + struct address_space *mapping; + loff_t endbyte; /* inclusive */ + int fput_needed; + umode_t i_mode; + + ret = -EINVAL; + if (flags & ~VALID_FLAGS) + goto out; + + endbyte = offset + nbytes; + + if ((s64)offset < 0) + goto out; + if ((s64)endbyte < 0) + goto out; + if (endbyte < offset) + goto out; + + if (sizeof(pgoff_t) == 4) { + if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + /* + * The range starts outside a 32 bit machine's + * pagecache addressing capabilities. Let it "succeed" + */ + ret = 0; + goto out; + } + if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + /* + * Out to EOF + */ + nbytes = 0; + } + } + + if (nbytes == 0) + endbyte = LLONG_MAX; + else + endbyte--; /* inclusive */ + + ret = -EBADF; + file = fget_light(fd, &fput_needed); + if (!file) + goto out; + + i_mode = file->f_path.dentry->d_inode->i_mode; + ret = -ESPIPE; + if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && + !S_ISLNK(i_mode)) + goto out_put; + + mapping = file->f_mapping; + if (!mapping) { + ret = -EINVAL; + goto out_put; + } + + ret = 0; + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { + ret = filemap_fdatawait_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = filemap_fdatawrite_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WAIT_AFTER) + ret = filemap_fdatawait_range(mapping, offset, endbyte); + +out_put: + fput_light(file, fput_needed); +out: + return ret; +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes, + long flags) +{ + return SYSC_sync_file_range((int) fd, offset, nbytes, + (unsigned int) flags); +} +SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range); +#endif + +/* It would be nice if people remember that not all the world's an i386 + when they introduce new system calls */ +SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags, + loff_t offset, loff_t nbytes) +{ + return sys_sync_file_range(fd, offset, nbytes, flags); +} +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_sync_file_range2(long fd, long flags, + loff_t offset, loff_t nbytes) +{ + return SYSC_sync_file_range2((int) fd, (unsigned int) flags, + offset, nbytes); +} +SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); +#endif diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig new file mode 100644 index 00000000..8c41feac --- /dev/null +++ b/fs/sysfs/Kconfig @@ -0,0 +1,23 @@ +config SYSFS + bool "sysfs file system support" if EXPERT + default y + help + The sysfs filesystem is a virtual filesystem that the kernel uses to + export internal kernel objects, their attributes, and their + relationships to one another. + + Users can use sysfs to ascertain useful information about the running + kernel, such as the devices the kernel has discovered on each bus and + which driver each is bound to. 
sysfs can also be used to tune devices + and other kernel subsystems. + + Some system agents rely on the information in sysfs to operate. + /sbin/hotplug uses device and object attributes in sysfs to assist in + delegating policy decisions, like persistently naming devices. + + sysfs is currently used by the block subsystem to mount the root + partition. If sysfs is disabled you must specify the boot device on + the kernel boot command line via its major and minor numbers. For + example, "root=03:01" for /dev/hda1. + + Designers of embedded systems may wish to say N here to conserve space. diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile new file mode 100644 index 00000000..7a1ceb94 --- /dev/null +++ b/fs/sysfs/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the sysfs virtual filesystem +# + +obj-y := inode.o file.o dir.o symlink.o mount.o bin.o \ + group.o diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c new file mode 100644 index 00000000..a4759833 --- /dev/null +++ b/fs/sysfs/bin.c @@ -0,0 +1,506 @@ +/* + * fs/sysfs/bin.c - sysfs binary file implementation + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Matthew Wilcox + * Copyright (c) 2004 Silicon Graphics, Inc. + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sysfs.h" + +/* + * There's one bin_buffer for each open file. + * + * filp->private_data points to bin_buffer and + * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s + * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock + */ +static DEFINE_MUTEX(sysfs_bin_lock); + +struct bin_buffer { + struct mutex mutex; + void *buffer; + int mmapped; + const struct vm_operations_struct *vm_ops; + struct file *file; + struct hlist_node list; +}; + +static int +fill_read(struct file *file, char *buffer, loff_t off, size_t count) +{ + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + int rc; + + /* need attr_sd for attr, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + rc = -EIO; + if (attr->read) + rc = attr->read(file, kobj, attr, buffer, off, count); + + sysfs_put_active(attr_sd); + + return rc; +} + +static ssize_t +read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) +{ + struct bin_buffer *bb = file->private_data; + int size = file->f_path.dentry->d_inode->i_size; + loff_t offs = *off; + int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; + + if (!bytes) + return 0; + + if (size) { + if (offs > size) + return 0; + if (offs + count > size) + count = size - offs; + } + + temp = kmalloc(count, GFP_KERNEL); + if (!temp) + return -ENOMEM; + + mutex_lock(&bb->mutex); + + count = fill_read(file, bb->buffer, offs, count); + if (count < 0) { + mutex_unlock(&bb->mutex); + goto out_free; + } + + memcpy(temp, bb->buffer, count); + + mutex_unlock(&bb->mutex); + + if (copy_to_user(userbuf, temp, count)) { + count = -EFAULT; + goto out_free; + } + + pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); + + *off = offs + count; + + out_free: + kfree(temp); + return count; +} + +static int +flush_write(struct file *file, char *buffer, loff_t offset, size_t count) +{ + struct sysfs_dirent 
*attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + int rc; + + /* need attr_sd for attr, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + rc = -EIO; + if (attr->write) + rc = attr->write(file, kobj, attr, buffer, offset, count); + + sysfs_put_active(attr_sd); + + return rc; +} + +static ssize_t write(struct file *file, const char __user *userbuf, + size_t bytes, loff_t *off) +{ + struct bin_buffer *bb = file->private_data; + int size = file->f_path.dentry->d_inode->i_size; + loff_t offs = *off; + int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; + + if (!bytes) + return 0; + + if (size) { + if (offs > size) + return 0; + if (offs + count > size) + count = size - offs; + } + + temp = memdup_user(userbuf, count); + if (IS_ERR(temp)) + return PTR_ERR(temp); + + mutex_lock(&bb->mutex); + + memcpy(bb->buffer, temp, count); + + count = flush_write(file, bb->buffer, offs, count); + mutex_unlock(&bb->mutex); + + if (count > 0) + *off = offs + count; + + kfree(temp); + return count; +} + +static void bin_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + + if (!bb->vm_ops) + return; + + if (!sysfs_get_active(attr_sd)) + return; + + if (bb->vm_ops->open) + bb->vm_ops->open(vma); + + sysfs_put_active(attr_sd); +} + +static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return VM_FAULT_SIGBUS; + + if (!sysfs_get_active(attr_sd)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (bb->vm_ops->fault) + ret = bb->vm_ops->fault(vma, vmf); + + sysfs_put_active(attr_sd); + return ret; +} + +static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return VM_FAULT_SIGBUS; + + if (!sysfs_get_active(attr_sd)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (bb->vm_ops->page_mkwrite) + ret = bb->vm_ops->page_mkwrite(vma, vmf); + + sysfs_put_active(attr_sd); + return ret; +} + +static int bin_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return -EINVAL; + + if (!sysfs_get_active(attr_sd)) + return -EINVAL; + + ret = -EINVAL; + if (bb->vm_ops->access) + ret = bb->vm_ops->access(vma, addr, buf, len, write); + + sysfs_put_active(attr_sd); + return ret; +} + +#ifdef CONFIG_NUMA +static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return 0; + + if (!sysfs_get_active(attr_sd)) + return -EINVAL; + + ret = 0; + if (bb->vm_ops->set_policy) + ret = bb->vm_ops->set_policy(vma, new); + + sysfs_put_active(attr_sd); + return ret; +} + +static struct mempolicy *bin_get_policy(struct vm_area_struct *vma, + unsigned long 
addr) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct mempolicy *pol; + + if (!bb->vm_ops) + return vma->vm_policy; + + if (!sysfs_get_active(attr_sd)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (bb->vm_ops->get_policy) + pol = bb->vm_ops->get_policy(vma, addr); + + sysfs_put_active(attr_sd); + return pol; +} + +static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, + const nodemask_t *to, unsigned long flags) +{ + struct file *file = vma->vm_file; + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + int ret; + + if (!bb->vm_ops) + return 0; + + if (!sysfs_get_active(attr_sd)) + return 0; + + ret = 0; + if (bb->vm_ops->migrate) + ret = bb->vm_ops->migrate(vma, from, to, flags); + + sysfs_put_active(attr_sd); + return ret; +} +#endif + +static const struct vm_operations_struct bin_vm_ops = { + .open = bin_vma_open, + .fault = bin_fault, + .page_mkwrite = bin_page_mkwrite, + .access = bin_access, +#ifdef CONFIG_NUMA + .set_policy = bin_set_policy, + .get_policy = bin_get_policy, + .migrate = bin_migrate, +#endif +}; + +static int mmap(struct file *file, struct vm_area_struct *vma) +{ + struct bin_buffer *bb = file->private_data; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + int rc; + + mutex_lock(&bb->mutex); + + /* need attr_sd for attr, its parent for kobj */ + rc = -ENODEV; + if (!sysfs_get_active(attr_sd)) + goto out_unlock; + + rc = -EINVAL; + if (!attr->mmap) + goto out_put; + + rc = attr->mmap(file, kobj, attr, vma); + if (rc) + goto out_put; + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (bb->mmapped && bb->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. 
+ */ + rc = -EINVAL; + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + bb->mmapped = 1; + bb->vm_ops = vma->vm_ops; + vma->vm_ops = &bin_vm_ops; +out_put: + sysfs_put_active(attr_sd); +out_unlock: + mutex_unlock(&bb->mutex); + + return rc; +} + +static int open(struct inode * inode, struct file * file) +{ + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; + struct bin_buffer *bb = NULL; + int error; + + /* binary file operations requires both @sd and its parent */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + error = -EACCES; + if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap)) + goto err_out; + if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap)) + goto err_out; + + error = -ENOMEM; + bb = kzalloc(sizeof(*bb), GFP_KERNEL); + if (!bb) + goto err_out; + + bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->buffer) + goto err_out; + + mutex_init(&bb->mutex); + bb->file = file; + file->private_data = bb; + + mutex_lock(&sysfs_bin_lock); + hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers); + mutex_unlock(&sysfs_bin_lock); + + /* open succeeded, put active references */ + sysfs_put_active(attr_sd); + return 0; + + err_out: + sysfs_put_active(attr_sd); + kfree(bb); + return error; +} + +static int release(struct inode * inode, struct file * file) +{ + struct bin_buffer *bb = file->private_data; + + mutex_lock(&sysfs_bin_lock); + hlist_del(&bb->list); + mutex_unlock(&sysfs_bin_lock); + + kfree(bb->buffer); + kfree(bb); + return 0; +} + +const struct file_operations bin_fops = { + .read = read, + .write = write, + .mmap = mmap, + .llseek = generic_file_llseek, + .open = open, + .release = release, +}; + + +void unmap_bin_file(struct sysfs_dirent *attr_sd) +{ + struct bin_buffer *bb; + struct hlist_node *tmp; + + if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR) + return; + + mutex_lock(&sysfs_bin_lock); + + hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) { + struct inode *inode = bb->file->f_path.dentry->d_inode; + + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + + mutex_unlock(&sysfs_bin_lock); +} + +/** + * sysfs_create_bin_file - create binary file for object. + * @kobj: object. + * @attr: attribute descriptor. + */ + +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) +{ + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); +} + + +/** + * sysfs_remove_bin_file - remove binary file for object. + * @kobj: object. + * @attr: attribute descriptor. + */ + +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) +{ + sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name); +} + +EXPORT_SYMBOL_GPL(sysfs_create_bin_file); +EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c new file mode 100644 index 00000000..ea9120a8 --- /dev/null +++ b/fs/sysfs/dir.c @@ -0,0 +1,964 @@ +/* + * fs/sysfs/dir.c - sysfs core and dir operation implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. 
+ */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sysfs.h" + +DEFINE_MUTEX(sysfs_mutex); +DEFINE_SPINLOCK(sysfs_assoc_lock); + +static DEFINE_SPINLOCK(sysfs_ino_lock); +static DEFINE_IDA(sysfs_ino_ida); + +/** + * sysfs_link_sibling - link sysfs_dirent into sibling list + * @sd: sysfs_dirent of interest + * + * Link @sd into its sibling list which starts from + * sd->s_parent->s_dir.children. + * + * Locking: + * mutex_lock(sysfs_mutex) + */ +static void sysfs_link_sibling(struct sysfs_dirent *sd) +{ + struct sysfs_dirent *parent_sd = sd->s_parent; + struct sysfs_dirent **pos; + + BUG_ON(sd->s_sibling); + + /* Store directory entries in order by ino. This allows + * readdir to properly restart without having to add a + * cursor into the s_dir.children list. + */ + for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { + if (sd->s_ino < (*pos)->s_ino) + break; + } + sd->s_sibling = *pos; + *pos = sd; +} + +/** + * sysfs_unlink_sibling - unlink sysfs_dirent from sibling list + * @sd: sysfs_dirent of interest + * + * Unlink @sd from its sibling list which starts from + * sd->s_parent->s_dir.children. + * + * Locking: + * mutex_lock(sysfs_mutex) + */ +static void sysfs_unlink_sibling(struct sysfs_dirent *sd) +{ + struct sysfs_dirent **pos; + + for (pos = &sd->s_parent->s_dir.children; *pos; + pos = &(*pos)->s_sibling) { + if (*pos == sd) { + *pos = sd->s_sibling; + sd->s_sibling = NULL; + break; + } + } +} + +/** + * sysfs_get_active - get an active reference to sysfs_dirent + * @sd: sysfs_dirent to get an active reference to + * + * Get an active reference of @sd. This function is noop if @sd + * is NULL. + * + * RETURNS: + * Pointer to @sd on success, NULL on failure. + */ +struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) +{ + if (unlikely(!sd)) + return NULL; + + while (1) { + int v, t; + + v = atomic_read(&sd->s_active); + if (unlikely(v < 0)) + return NULL; + + t = atomic_cmpxchg(&sd->s_active, v, v + 1); + if (likely(t == v)) { + rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); + return sd; + } + if (t < 0) + return NULL; + + cpu_relax(); + } +} + +/** + * sysfs_put_active - put an active reference to sysfs_dirent + * @sd: sysfs_dirent to put an active reference to + * + * Put an active reference to @sd. This function is noop if @sd + * is NULL. + */ +void sysfs_put_active(struct sysfs_dirent *sd) +{ + struct completion *cmpl; + int v; + + if (unlikely(!sd)) + return; + + rwsem_release(&sd->dep_map, 1, _RET_IP_); + v = atomic_dec_return(&sd->s_active); + if (likely(v != SD_DEACTIVATED_BIAS)) + return; + + /* atomic_dec_return() is a mb(), we'll always see the updated + * sd->s_sibling. + */ + cmpl = (void *)sd->s_sibling; + complete(cmpl); +} + +/** + * sysfs_deactivate - deactivate sysfs_dirent + * @sd: sysfs_dirent to deactivate + * + * Deny new active references and drain existing ones. + */ +static void sysfs_deactivate(struct sysfs_dirent *sd) +{ + DECLARE_COMPLETION_ONSTACK(wait); + int v; + + BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); + + if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) + return; + + sd->s_sibling = (void *)&wait; + + rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); + /* atomic_add_return() is a mb(), put_active() will always see + * the updated sd->s_sibling. 
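+ *
+ * If the returned value differs from SD_DEACTIVATED_BIAS, somebody
+ * still holds an active reference, so wait on the completion that
+ * sysfs_put_active() completes when the last active reference is
+ * dropped.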
+ */ + v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); + + if (v != SD_DEACTIVATED_BIAS) { + lock_contended(&sd->dep_map, _RET_IP_); + wait_for_completion(&wait); + } + + sd->s_sibling = NULL; + + lock_acquired(&sd->dep_map, _RET_IP_); + rwsem_release(&sd->dep_map, 1, _RET_IP_); +} + +static int sysfs_alloc_ino(ino_t *pino) +{ + int ino, rc; + + retry: + spin_lock(&sysfs_ino_lock); + rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); + spin_unlock(&sysfs_ino_lock); + + if (rc == -EAGAIN) { + if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) + goto retry; + rc = -ENOMEM; + } + + *pino = ino; + return rc; +} + +static void sysfs_free_ino(ino_t ino) +{ + spin_lock(&sysfs_ino_lock); + ida_remove(&sysfs_ino_ida, ino); + spin_unlock(&sysfs_ino_lock); +} + +void release_sysfs_dirent(struct sysfs_dirent * sd) +{ + struct sysfs_dirent *parent_sd; + + repeat: + /* Moving/renaming is always done while holding reference. + * sd->s_parent won't change beneath us. + */ + parent_sd = sd->s_parent; + + if (sysfs_type(sd) == SYSFS_KOBJ_LINK) + sysfs_put(sd->s_symlink.target_sd); + if (sysfs_type(sd) & SYSFS_COPY_NAME) + kfree(sd->s_name); + if (sd->s_iattr && sd->s_iattr->ia_secdata) + security_release_secctx(sd->s_iattr->ia_secdata, + sd->s_iattr->ia_secdata_len); + kfree(sd->s_iattr); + sysfs_free_ino(sd->s_ino); + kmem_cache_free(sysfs_dir_cachep, sd); + + sd = parent_sd; + if (sd && atomic_dec_and_test(&sd->s_count)) + goto repeat; +} + +static int sysfs_dentry_delete(const struct dentry *dentry) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + return !!(sd->s_flags & SYSFS_FLAG_REMOVED); +} + +static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct sysfs_dirent *sd; + int is_dir; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + sd = dentry->d_fsdata; + mutex_lock(&sysfs_mutex); + + /* The sysfs dirent has been deleted */ + if (sd->s_flags & SYSFS_FLAG_REMOVED) + goto out_bad; + + /* The sysfs dirent has been moved? */ + if (dentry->d_parent->d_fsdata != sd->s_parent) + goto out_bad; + + /* The sysfs dirent has been renamed */ + if (strcmp(dentry->d_name.name, sd->s_name) != 0) + goto out_bad; + + mutex_unlock(&sysfs_mutex); +out_valid: + return 1; +out_bad: + /* Remove the dentry from the dcache hashes. + * If this is a deleted dentry we use d_drop instead of d_delete + * so sysfs doesn't need to cope with negative dentries. + * + * If this is a dentry that has simply been renamed we + * use d_drop to remove it from the dcache lookup on its + * old parent. If this dentry persists later when a lookup + * is performed at its new name the dentry will be readded + * to the dcache hashes. + */ + is_dir = (sysfs_type(sd) == SYSFS_DIR); + mutex_unlock(&sysfs_mutex); + if (is_dir) { + /* If we have submounts we must allow the vfs caches + * to lie about the state of the filesystem to prevent + * leaks and other nasty things. 
+ */ + if (have_submounts(dentry)) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + return 0; +} + +static void sysfs_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + struct sysfs_dirent * sd = dentry->d_fsdata; + + sysfs_put(sd); + iput(inode); +} + +static const struct dentry_operations sysfs_dentry_ops = { + .d_revalidate = sysfs_dentry_revalidate, + .d_delete = sysfs_dentry_delete, + .d_iput = sysfs_dentry_iput, +}; + +struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) +{ + char *dup_name = NULL; + struct sysfs_dirent *sd; + + if (type & SYSFS_COPY_NAME) { + name = dup_name = kstrdup(name, GFP_KERNEL); + if (!name) + return NULL; + } + + sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); + if (!sd) + goto err_out1; + + if (sysfs_alloc_ino(&sd->s_ino)) + goto err_out2; + + atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_active, 0); + + sd->s_name = name; + sd->s_mode = mode; + sd->s_flags = type; + + return sd; + + err_out2: + kmem_cache_free(sysfs_dir_cachep, sd); + err_out1: + kfree(dup_name); + return NULL; +} + +/** + * sysfs_addrm_start - prepare for sysfs_dirent add/remove + * @acxt: pointer to sysfs_addrm_cxt to be used + * @parent_sd: parent sysfs_dirent + * + * This function is called when the caller is about to add or + * remove sysfs_dirent under @parent_sd. This function acquires + * sysfs_mutex. @acxt is used to keep and pass context to + * other addrm functions. + * + * LOCKING: + * Kernel thread context (may sleep). sysfs_mutex is locked on + * return. + */ +void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *parent_sd) +{ + memset(acxt, 0, sizeof(*acxt)); + acxt->parent_sd = parent_sd; + + mutex_lock(&sysfs_mutex); +} + +/** + * __sysfs_add_one - add sysfs_dirent to parent without warning + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * + * Get @acxt->parent_sd and set sd->s_parent to it and increment + * nlink of parent inode if @sd is a directory and link into the + * children list of the parent. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *ps_iattr; + + if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) + return -EEXIST; + + sd->s_parent = sysfs_get(acxt->parent_sd); + + sysfs_link_sibling(sd); + + /* Update timestamps on the parent */ + ps_iattr = acxt->parent_sd->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + return 0; +} + +/** + * sysfs_pathname - return full path to sysfs dirent + * @sd: sysfs_dirent whose path we want + * @path: caller allocated buffer + * + * Gives the name "/" to the sysfs_root entry; any path returned + * is relative to wherever sysfs is mounted. 
+ * + * XXX: does no error checking on @path size + */ +static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) +{ + if (sd->s_parent) { + sysfs_pathname(sd->s_parent, path); + strcat(path, "/"); + } + strcat(path, sd->s_name); + return path; +} + +/** + * sysfs_add_one - add sysfs_dirent to parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * + * Get @acxt->parent_sd and set sd->s_parent to it and increment + * nlink of parent inode if @sd is a directory and link into the + * children list of the parent. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + int ret; + + ret = __sysfs_add_one(acxt, sd); + if (ret == -EEXIST) { + char *path = kzalloc(PATH_MAX, GFP_KERNEL); + WARN(1, KERN_WARNING + "sysfs: cannot create duplicate filename '%s'\n", + (path == NULL) ? sd->s_name : + strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"), + sd->s_name)); + kfree(path); + } + + return ret; +} + +/** + * sysfs_remove_one - remove sysfs_dirent from parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be removed + * + * Mark @sd removed and drop nlink of parent inode if @sd is a + * directory. @sd is unlinked from the children list. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + */ +void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *ps_iattr; + + BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED); + + sysfs_unlink_sibling(sd); + + /* Update timestamps on the parent */ + ps_iattr = acxt->parent_sd->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + sd->s_flags |= SYSFS_FLAG_REMOVED; + sd->s_sibling = acxt->removed; + acxt->removed = sd; +} + +/** + * sysfs_addrm_finish - finish up sysfs_dirent add/remove + * @acxt: addrm context to finish up + * + * Finish up sysfs_dirent add/remove. Resources acquired by + * sysfs_addrm_start() are released and removed sysfs_dirents are + * cleaned up. + * + * LOCKING: + * sysfs_mutex is released. + */ +void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) +{ + /* release resources acquired by sysfs_addrm_start() */ + mutex_unlock(&sysfs_mutex); + + /* kill removed sysfs_dirents */ + while (acxt->removed) { + struct sysfs_dirent *sd = acxt->removed; + + acxt->removed = sd->s_sibling; + sd->s_sibling = NULL; + + sysfs_deactivate(sd); + unmap_bin_file(sd); + sysfs_put(sd); + } +} + +/** + * sysfs_find_dirent - find sysfs_dirent with the given name + * @parent_sd: sysfs_dirent to search under + * @name: name to look for + * + * Look for sysfs_dirent with name @name under @parent_sd. + * + * LOCKING: + * mutex_lock(sysfs_mutex) + * + * RETURNS: + * Pointer to sysfs_dirent if found, NULL if not. 
+ */ +struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const void *ns, + const unsigned char *name) +{ + struct sysfs_dirent *sd; + + for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { + if (ns && sd->s_ns && (sd->s_ns != ns)) + continue; + if (!strcmp(sd->s_name, name)) + return sd; + } + return NULL; +} + +/** + * sysfs_get_dirent - find and get sysfs_dirent with the given name + * @parent_sd: sysfs_dirent to search under + * @name: name to look for + * + * Look for sysfs_dirent with name @name under @parent_sd and get + * it if found. + * + * LOCKING: + * Kernel thread context (may sleep). Grabs sysfs_mutex. + * + * RETURNS: + * Pointer to sysfs_dirent if found, NULL if not. + */ +struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const void *ns, + const unsigned char *name) +{ + struct sysfs_dirent *sd; + + mutex_lock(&sysfs_mutex); + sd = sysfs_find_dirent(parent_sd, ns, name); + sysfs_get(sd); + mutex_unlock(&sysfs_mutex); + + return sd; +} +EXPORT_SYMBOL_GPL(sysfs_get_dirent); + +static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, + enum kobj_ns_type type, const void *ns, const char *name, + struct sysfs_dirent **p_sd) +{ + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + int rc; + + /* allocate */ + sd = sysfs_new_dirent(name, mode, SYSFS_DIR); + if (!sd) + return -ENOMEM; + + sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); + sd->s_ns = ns; + sd->s_dir.kobj = kobj; + + /* link in */ + sysfs_addrm_start(&acxt, parent_sd); + rc = sysfs_add_one(&acxt, sd); + sysfs_addrm_finish(&acxt); + + if (rc == 0) + *p_sd = sd; + else + sysfs_put(sd); + + return rc; +} + +int sysfs_create_subdir(struct kobject *kobj, const char *name, + struct sysfs_dirent **p_sd) +{ + return create_dir(kobj, kobj->sd, + KOBJ_NS_TYPE_NONE, NULL, name, p_sd); +} + +/** + * sysfs_read_ns_type: return associated ns_type + * @kobj: the kobject being queried + * + * Each kobject can be tagged with exactly one namespace type + * (i.e. network or user). Return the ns_type associated with + * this object if any + */ +static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) +{ + const struct kobj_ns_type_operations *ops; + enum kobj_ns_type type; + + ops = kobj_child_ns_ops(kobj); + if (!ops) + return KOBJ_NS_TYPE_NONE; + + type = ops->type; + BUG_ON(type <= KOBJ_NS_TYPE_NONE); + BUG_ON(type >= KOBJ_NS_TYPES); + BUG_ON(!kobj_ns_type_registered(type)); + + return type; +} + +/** + * sysfs_create_dir - create a directory for an object. + * @kobj: object we're creating directory for. 
+ */ +int sysfs_create_dir(struct kobject * kobj) +{ + enum kobj_ns_type type; + struct sysfs_dirent *parent_sd, *sd; + const void *ns = NULL; + int error = 0; + + BUG_ON(!kobj); + + if (kobj->parent) + parent_sd = kobj->parent->sd; + else + parent_sd = &sysfs_root; + + if (sysfs_ns_type(parent_sd)) + ns = kobj->ktype->namespace(kobj); + type = sysfs_read_ns_type(kobj); + + error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd); + if (!error) + kobj->sd = sd; + return error; +} + +static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct dentry *ret = NULL; + struct dentry *parent = dentry->d_parent; + struct sysfs_dirent *parent_sd = parent->d_fsdata; + struct sysfs_dirent *sd; + struct inode *inode; + enum kobj_ns_type type; + const void *ns; + + mutex_lock(&sysfs_mutex); + + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dir->i_sb)->ns[type]; + + sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name); + + /* no such entry */ + if (!sd) { + ret = ERR_PTR(-ENOENT); + goto out_unlock; + } + + /* attach dentry and inode */ + inode = sysfs_get_inode(dir->i_sb, sd); + if (!inode) { + ret = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + /* instantiate and hash dentry */ + ret = d_find_alias(inode); + if (!ret) { + d_set_d_op(dentry, &sysfs_dentry_ops); + dentry->d_fsdata = sysfs_get(sd); + d_add(dentry, inode); + } else { + d_move(ret, dentry); + iput(inode); + } + + out_unlock: + mutex_unlock(&sysfs_mutex); + return ret; +} + +const struct inode_operations sysfs_dir_inode_operations = { + .lookup = sysfs_lookup, + .permission = sysfs_permission, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .setxattr = sysfs_setxattr, +}; + +static void remove_dir(struct sysfs_dirent *sd) +{ + struct sysfs_addrm_cxt acxt; + + sysfs_addrm_start(&acxt, sd->s_parent); + sysfs_remove_one(&acxt, sd); + sysfs_addrm_finish(&acxt); +} + +void sysfs_remove_subdir(struct sysfs_dirent *sd) +{ + remove_dir(sd); +} + + +static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) +{ + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent **pos; + + if (!dir_sd) + return; + + pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); + sysfs_addrm_start(&acxt, dir_sd); + pos = &dir_sd->s_dir.children; + while (*pos) { + struct sysfs_dirent *sd = *pos; + + if (sysfs_type(sd) != SYSFS_DIR) + sysfs_remove_one(&acxt, sd); + else + pos = &(*pos)->s_sibling; + } + sysfs_addrm_finish(&acxt); + + remove_dir(dir_sd); +} + +/** + * sysfs_remove_dir - remove an object's directory. + * @kobj: object. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be sysfs_rmdir() below, instead of calling separately. 
+ */ + +void sysfs_remove_dir(struct kobject * kobj) +{ + struct sysfs_dirent *sd = kobj->sd; + + spin_lock(&sysfs_assoc_lock); + kobj->sd = NULL; + spin_unlock(&sysfs_assoc_lock); + + __sysfs_remove_dir(sd); +} + +int sysfs_rename(struct sysfs_dirent *sd, + struct sysfs_dirent *new_parent_sd, const void *new_ns, + const char *new_name) +{ + const char *dup_name = NULL; + int error; + + mutex_lock(&sysfs_mutex); + + error = 0; + if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) && + (strcmp(sd->s_name, new_name) == 0)) + goto out; /* nothing to rename */ + + error = -EEXIST; + if (sysfs_find_dirent(new_parent_sd, new_ns, new_name)) + goto out; + + /* rename sysfs_dirent */ + if (strcmp(sd->s_name, new_name) != 0) { + error = -ENOMEM; + new_name = dup_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out; + + dup_name = sd->s_name; + sd->s_name = new_name; + } + + /* Remove from old parent's list and insert into new parent's list. */ + if (sd->s_parent != new_parent_sd) { + sysfs_unlink_sibling(sd); + sysfs_get(new_parent_sd); + sysfs_put(sd->s_parent); + sd->s_parent = new_parent_sd; + sysfs_link_sibling(sd); + } + sd->s_ns = new_ns; + + error = 0; + out: + mutex_unlock(&sysfs_mutex); + kfree(dup_name); + return error; +} + +int sysfs_rename_dir(struct kobject *kobj, const char *new_name) +{ + struct sysfs_dirent *parent_sd = kobj->sd->s_parent; + const void *new_ns = NULL; + + if (sysfs_ns_type(parent_sd)) + new_ns = kobj->ktype->namespace(kobj); + + return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name); +} + +int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) +{ + struct sysfs_dirent *sd = kobj->sd; + struct sysfs_dirent *new_parent_sd; + const void *new_ns = NULL; + + BUG_ON(!sd->s_parent); + if (sysfs_ns_type(sd->s_parent)) + new_ns = kobj->ktype->namespace(kobj); + new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 
+ new_parent_kobj->sd : &sysfs_root; + + return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name); +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct sysfs_dirent *sd) +{ + return (sd->s_mode >> 12) & 15; +} + +static int sysfs_dir_release(struct inode *inode, struct file *filp) +{ + sysfs_put(filp->private_data); + return 0; +} + +static struct sysfs_dirent *sysfs_dir_pos(const void *ns, + struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) +{ + if (pos) { + int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && + pos->s_parent == parent_sd && + ino == pos->s_ino; + sysfs_put(pos); + if (!valid) + pos = NULL; + } + if (!pos && (ino > 1) && (ino < INT_MAX)) { + pos = parent_sd->s_dir.children; + while (pos && (ino > pos->s_ino)) + pos = pos->s_sibling; + } + while (pos && pos->s_ns && pos->s_ns != ns) + pos = pos->s_sibling; + return pos; +} + +static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, + struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) +{ + pos = sysfs_dir_pos(ns, parent_sd, ino, pos); + if (pos) + pos = pos->s_sibling; + while (pos && pos->s_ns && pos->s_ns != ns) + pos = pos->s_sibling; + return pos; +} + +static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct sysfs_dirent * parent_sd = dentry->d_fsdata; + struct sysfs_dirent *pos = filp->private_data; + enum kobj_ns_type type; + const void *ns; + ino_t ino; + + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dentry->d_sb)->ns[type]; + + if (filp->f_pos == 0) { + ino = parent_sd->s_ino; + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) + filp->f_pos++; + } + if (filp->f_pos == 1) { + if (parent_sd->s_parent) + ino = parent_sd->s_parent->s_ino; + else + ino = parent_sd->s_ino; + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) + filp->f_pos++; + } + mutex_lock(&sysfs_mutex); + for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); + pos; + pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { + const char * name; + unsigned int type; + int len, ret; + + name = pos->s_name; + len = strlen(name); + ino = pos->s_ino; + type = dt_type(pos); + filp->f_pos = ino; + filp->private_data = sysfs_get(pos); + + mutex_unlock(&sysfs_mutex); + ret = filldir(dirent, name, len, filp->f_pos, ino, type); + mutex_lock(&sysfs_mutex); + if (ret < 0) + break; + } + mutex_unlock(&sysfs_mutex); + if ((filp->f_pos > 1) && !pos) { /* EOF */ + filp->f_pos = INT_MAX; + filp->private_data = NULL; + } + return 0; +} + + +const struct file_operations sysfs_dir_operations = { + .read = generic_read_dir, + .readdir = sysfs_readdir, + .release = sysfs_dir_release, + .llseek = generic_file_llseek, +}; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c new file mode 100644 index 00000000..1ad8c93c --- /dev/null +++ b/fs/sysfs/file.c @@ -0,0 +1,747 @@ +/* + * fs/sysfs/file.c - sysfs regular (text) file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sysfs.h" + +/* + * There's one sysfs_buffer for each open file and one + * sysfs_open_dirent for each sysfs_dirent with one or more open + * files. 
+ * + * filp->private_data points to sysfs_buffer and + * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open + * is protected by sysfs_open_dirent_lock. + */ +static DEFINE_SPINLOCK(sysfs_open_dirent_lock); + +struct sysfs_open_dirent { + atomic_t refcnt; + atomic_t event; + wait_queue_head_t poll; + struct list_head buffers; /* goes through sysfs_buffer.list */ +}; + +struct sysfs_buffer { + size_t count; + loff_t pos; + char * page; + const struct sysfs_ops * ops; + struct mutex mutex; + int needs_read_fill; + int event; + struct list_head list; +}; + +/** + * fill_read_buffer - allocate and fill buffer from object. + * @dentry: dentry pointer. + * @buffer: data buffer for file. + * + * Allocate @buffer->page, if it hasn't been already, then call the + * kobject's show() method to fill the buffer with this attribute's + * data. + * This is called only once, on the file's first read unless an error + * is returned. + */ +static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) +{ + struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + const struct sysfs_ops * ops = buffer->ops; + int ret = 0; + ssize_t count; + + if (!buffer->page) + buffer->page = (char *) get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + /* need attr_sd for attr and ops, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + buffer->event = atomic_read(&attr_sd->s_attr.open->event); + count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page); + + sysfs_put_active(attr_sd); + + /* + * The code works fine with PAGE_SIZE return but it's likely to + * indicate truncated result or overflow in normal use cases. + */ + if (count >= (ssize_t)PAGE_SIZE) { + print_symbol("fill_read_buffer: %s returned bad count\n", + (unsigned long)ops->show); + /* Try to struggle along */ + count = PAGE_SIZE - 1; + } + if (count >= 0) { + buffer->needs_read_fill = 0; + buffer->count = count; + } else { + ret = count; + } + return ret; +} + +/** + * sysfs_read_file - read an attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. + * + * Userspace wants to read an attribute file. The attribute descriptor + * is in the file's ->d_fsdata. The target object is in the directory's + * ->d_fsdata. + * + * We call fill_read_buffer() to allocate and fill the buffer from the + * object's show() method exactly once (if the read is happening from + * the beginning of the file). That should fill the entire buffer with + * all the data the object has to offer for that attribute. + * We then call flush_read_buffer() to copy the buffer to userspace + * in the increments specified. + */ + +static ssize_t +sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct sysfs_buffer * buffer = file->private_data; + ssize_t retval = 0; + + mutex_lock(&buffer->mutex); + if (buffer->needs_read_fill || *ppos == 0) { + retval = fill_read_buffer(file->f_path.dentry,buffer); + if (retval) + goto out; + } + pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", + __func__, count, *ppos, buffer->page); + retval = simple_read_from_buffer(buf, count, ppos, buffer->page, + buffer->count); +out: + mutex_unlock(&buffer->mutex); + return retval; +} + +/** + * fill_write_buffer - copy buffer from userspace. + * @buffer: data buffer for file. + * @buf: data from user. + * @count: number of bytes in @userbuf. 
+ * + * Allocate @buffer->page if it hasn't been already, then + * copy the user-supplied buffer into it. + */ + +static int +fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t count) +{ + int error; + + if (!buffer->page) + buffer->page = (char *)get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + if (count >= PAGE_SIZE) + count = PAGE_SIZE - 1; + error = copy_from_user(buffer->page,buf,count); + buffer->needs_read_fill = 1; + /* if buf is assumed to contain a string, terminate it by \0, + so e.g. sscanf() can scan the string easily */ + buffer->page[count] = 0; + return error ? -EFAULT : count; +} + + +/** + * flush_write_buffer - push buffer to kobject. + * @dentry: dentry to the attribute + * @buffer: data buffer for file. + * @count: number of bytes + * + * Get the correct pointers for the kobject and the attribute we're + * dealing with, then call the store() method for the attribute, + * passing the buffer that we acquired in fill_write_buffer(). + */ + +static int +flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count) +{ + struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + const struct sysfs_ops * ops = buffer->ops; + int rc; + + /* need attr_sd for attr and ops, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count); + + sysfs_put_active(attr_sd); + + return rc; +} + + +/** + * sysfs_write_file - write an attribute. + * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Similar to sysfs_read_file(), though working in the opposite direction. + * We allocate and fill the data from the user in fill_write_buffer(), + * then push it to the kobject in flush_write_buffer(). + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come + * on the first write. + * Hint: if you're writing a value, first read the file, modify only the + * value you're changing, then write the entire buffer back. + */ + +static ssize_t +sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct sysfs_buffer * buffer = file->private_data; + ssize_t len; + + mutex_lock(&buffer->mutex); + len = fill_write_buffer(buffer, buf, count); + if (len > 0) + len = flush_write_buffer(file->f_path.dentry, buffer, len); + if (len > 0) + *ppos += len; + mutex_unlock(&buffer->mutex); + return len; +} + +/** + * sysfs_get_open_dirent - get or create sysfs_open_dirent + * @sd: target sysfs_dirent + * @buffer: sysfs_buffer for this instance of open + * + * If @sd->s_attr.open exists, increment its reference count; + * otherwise, create one. @buffer is chained to the buffers + * list. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * 0 on success, -errno on failure.
+ */ +static int sysfs_get_open_dirent(struct sysfs_dirent *sd, + struct sysfs_buffer *buffer) +{ + struct sysfs_open_dirent *od, *new_od = NULL; + + retry: + spin_lock_irq(&sysfs_open_dirent_lock); + + if (!sd->s_attr.open && new_od) { + sd->s_attr.open = new_od; + new_od = NULL; + } + + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->refcnt); + list_add_tail(&buffer->list, &od->buffers); + } + + spin_unlock_irq(&sysfs_open_dirent_lock); + + if (od) { + kfree(new_od); + return 0; + } + + /* not there, initialize a new one and retry */ + new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); + if (!new_od) + return -ENOMEM; + + atomic_set(&new_od->refcnt, 0); + atomic_set(&new_od->event, 1); + init_waitqueue_head(&new_od->poll); + INIT_LIST_HEAD(&new_od->buffers); + goto retry; +} + +/** + * sysfs_put_open_dirent - put sysfs_open_dirent + * @sd: target sysfs_dirent + * @buffer: associated sysfs_buffer + * + * Put @sd->s_attr.open and unlink @buffer from the buffers list. + * If reference count reaches zero, disassociate and free it. + * + * LOCKING: + * None. + */ +static void sysfs_put_open_dirent(struct sysfs_dirent *sd, + struct sysfs_buffer *buffer) +{ + struct sysfs_open_dirent *od = sd->s_attr.open; + unsigned long flags; + + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); + + list_del(&buffer->list); + if (atomic_dec_and_test(&od->refcnt)) + sd->s_attr.open = NULL; + else + od = NULL; + + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); + + kfree(od); +} + +static int sysfs_open_file(struct inode *inode, struct file *file) +{ + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; + struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; + struct sysfs_buffer *buffer; + const struct sysfs_ops *ops; + int error = -EACCES; + + /* need attr_sd for attr and ops, its parent for kobj */ + if (!sysfs_get_active(attr_sd)) + return -ENODEV; + + /* every kobject with an attribute needs a ktype assigned */ + if (kobj->ktype && kobj->ktype->sysfs_ops) + ops = kobj->ktype->sysfs_ops; + else { + WARN(1, KERN_ERR "missing sysfs attribute operations for " + "kobject: %s\n", kobject_name(kobj)); + goto err_out; + } + + /* File needs write support. + * The inode's perms must say it's ok, + * and we must have a store method. + */ + if (file->f_mode & FMODE_WRITE) { + if (!(inode->i_mode & S_IWUGO) || !ops->store) + goto err_out; + } + + /* File needs read support. + * The inode's perms must say it's ok, and there + * must be a show method for it. + */ + if (file->f_mode & FMODE_READ) { + if (!(inode->i_mode & S_IRUGO) || !ops->show) + goto err_out; + } + + /* No error? Great, allocate a buffer for the file, and store it + * in file->private_data for easy access.
+ */ + error = -ENOMEM; + buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); + if (!buffer) + goto err_out; + + mutex_init(&buffer->mutex); + buffer->needs_read_fill = 1; + buffer->ops = ops; + file->private_data = buffer; + + /* make sure we have open dirent struct */ + error = sysfs_get_open_dirent(attr_sd, buffer); + if (error) + goto err_free; + + /* open succeeded, put active references */ + sysfs_put_active(attr_sd); + return 0; + + err_free: + kfree(buffer); + err_out: + sysfs_put_active(attr_sd); + return error; +} + +static int sysfs_release(struct inode *inode, struct file *filp) +{ + struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; + struct sysfs_buffer *buffer = filp->private_data; + + sysfs_put_open_dirent(sd, buffer); + + if (buffer->page) + free_page((unsigned long)buffer->page); + kfree(buffer); + + return 0; +} + +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, or seek to 0 and read again. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_buffer * buffer = filp->private_data; + struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; + struct sysfs_open_dirent *od = attr_sd->s_attr.open; + + /* need parent for the kobj, grab both */ + if (!sysfs_get_active(attr_sd)) + goto trigger; + + poll_wait(filp, &od->poll, wait); + + sysfs_put_active(attr_sd); + + if (buffer->event != atomic_read(&od->event)) + goto trigger; + + return DEFAULT_POLLMASK; + + trigger: + buffer->needs_read_fill = 1; + return DEFAULT_POLLMASK|POLLERR|POLLPRI; +} + +void sysfs_notify_dirent(struct sysfs_dirent *sd) +{ + struct sysfs_open_dirent *od; + unsigned long flags; + + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); + + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->event); + wake_up_interruptible(&od->poll); + } + + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); +} +EXPORT_SYMBOL_GPL(sysfs_notify_dirent); + +void sysfs_notify(struct kobject *k, const char *dir, const char *attr) +{ + struct sysfs_dirent *sd = k->sd; + + mutex_lock(&sysfs_mutex); + + if (sd && dir) + /* Only directories are tagged, so no need to pass + * a tag explicitly. 
+ */ + sd = sysfs_find_dirent(sd, NULL, dir); + if (sd && attr) + sd = sysfs_find_dirent(sd, NULL, attr); + if (sd) + sysfs_notify_dirent(sd); + + mutex_unlock(&sysfs_mutex); +} +EXPORT_SYMBOL_GPL(sysfs_notify); + +const struct file_operations sysfs_file_operations = { + .read = sysfs_read_file, + .write = sysfs_write_file, + .llseek = generic_file_llseek, + .open = sysfs_open_file, + .release = sysfs_release, + .poll = sysfs_poll, +}; + +int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type, mode_t amode) +{ + umode_t mode = (amode & S_IALLUGO) | S_IFREG; + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + int rc; + + sd = sysfs_new_dirent(attr->name, mode, type); + if (!sd) + return -ENOMEM; + sd->s_attr.attr = (void *)attr; + sysfs_dirent_init_lockdep(sd); + + sysfs_addrm_start(&acxt, dir_sd); + rc = sysfs_add_one(&acxt, sd); + sysfs_addrm_finish(&acxt); + + if (rc) + sysfs_put(sd); + + return rc; +} + + +int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, + int type) +{ + return sysfs_add_file_mode(dir_sd, attr, type, attr->mode); +} + + +/** + * sysfs_create_file - create an attribute file for an object. + * @kobj: object we're creating for. + * @attr: attribute descriptor. + */ + +int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) +{ + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); + +} + +int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) +{ + int err = 0; + int i; + + for (i = 0; ptr[i] && !err; i++) + err = sysfs_create_file(kobj, ptr[i]); + if (err) + while (--i >= 0) + sysfs_remove_file(kobj, ptr[i]); + return err; +} + +/** + * sysfs_add_file_to_group - add an attribute file to a pre-existing group. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * @group: group name. + */ +int sysfs_add_file_to_group(struct kobject *kobj, + const struct attribute *attr, const char *group) +{ + struct sysfs_dirent *dir_sd; + int error; + + if (group) + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); + else + dir_sd = sysfs_get(kobj->sd); + + if (!dir_sd) + return -ENOENT; + + error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); + sysfs_put(dir_sd); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); + +/** + * sysfs_chmod_file - update the modified mode value on an object attribute. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * @mode: file permissions. + * + */ +int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, + mode_t mode) +{ + struct sysfs_dirent *sd; + struct iattr newattrs; + int rc; + + mutex_lock(&sysfs_mutex); + + rc = -ENOENT; + sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); + if (!sd) + goto out; + + newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE; + rc = sysfs_sd_setattr(sd, &newattrs); + + out: + mutex_unlock(&sysfs_mutex); + return rc; +} +EXPORT_SYMBOL_GPL(sysfs_chmod_file); + + +/** + * sysfs_remove_file - remove an object attribute. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * + * Hash the attribute name and kill the victim. 
+ */ + +void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) +{ + sysfs_hash_and_remove(kobj->sd, NULL, attr->name); +} + +void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) +{ + int i; + for (i = 0; ptr[i]; i++) + sysfs_remove_file(kobj, ptr[i]); +} + +/** + * sysfs_remove_file_from_group - remove an attribute file from a group. + * @kobj: object we're acting for. + * @attr: attribute descriptor. + * @group: group name. + */ +void sysfs_remove_file_from_group(struct kobject *kobj, + const struct attribute *attr, const char *group) +{ + struct sysfs_dirent *dir_sd; + + if (group) + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); + else + dir_sd = sysfs_get(kobj->sd); + if (dir_sd) { + sysfs_hash_and_remove(dir_sd, NULL, attr->name); + sysfs_put(dir_sd); + } +} +EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); + +struct sysfs_schedule_callback_struct { + struct list_head workq_list; + struct kobject *kobj; + void (*func)(void *); + void *data; + struct module *owner; + struct work_struct work; +}; + +static struct workqueue_struct *sysfs_workqueue; +static DEFINE_MUTEX(sysfs_workq_mutex); +static LIST_HEAD(sysfs_workq); +static void sysfs_schedule_callback_work(struct work_struct *work) +{ + struct sysfs_schedule_callback_struct *ss = container_of(work, + struct sysfs_schedule_callback_struct, work); + + (ss->func)(ss->data); + kobject_put(ss->kobj); + module_put(ss->owner); + mutex_lock(&sysfs_workq_mutex); + list_del(&ss->workq_list); + mutex_unlock(&sysfs_workq_mutex); + kfree(ss); +} + +/** + * sysfs_schedule_callback - helper to schedule a callback for a kobject + * @kobj: object we're acting for. + * @func: callback function to invoke later. + * @data: argument to pass to @func. + * @owner: module owning the callback code + * + * sysfs attribute methods must not unregister themselves or their parent + * kobject (which would amount to the same thing). Attempts to do so will + * deadlock, since unregistration is mutually exclusive with driver + * callbacks. + * + * Instead methods can call this routine, which will attempt to allocate + * and schedule a workqueue request to call back @func with @data as its + * argument in the workqueue's process context. @kobj will be pinned + * until @func returns. + * + * Returns 0 if the request was submitted, -ENOMEM if storage could not + * be allocated, -ENODEV if a reference to @owner isn't available, + * -EAGAIN if a callback has already been scheduled for @kobj. 
+ */ +int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), + void *data, struct module *owner) +{ + struct sysfs_schedule_callback_struct *ss, *tmp; + + if (!try_module_get(owner)) + return -ENODEV; + + mutex_lock(&sysfs_workq_mutex); + list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) + if (ss->kobj == kobj) { + module_put(owner); + mutex_unlock(&sysfs_workq_mutex); + return -EAGAIN; + } + mutex_unlock(&sysfs_workq_mutex); + + if (sysfs_workqueue == NULL) { + sysfs_workqueue = create_singlethread_workqueue("sysfsd"); + if (sysfs_workqueue == NULL) { + module_put(owner); + return -ENOMEM; + } + } + + ss = kmalloc(sizeof(*ss), GFP_KERNEL); + if (!ss) { + module_put(owner); + return -ENOMEM; + } + kobject_get(kobj); + ss->kobj = kobj; + ss->func = func; + ss->data = data; + ss->owner = owner; + INIT_WORK(&ss->work, sysfs_schedule_callback_work); + INIT_LIST_HEAD(&ss->workq_list); + mutex_lock(&sysfs_workq_mutex); + list_add_tail(&ss->workq_list, &sysfs_workq); + mutex_unlock(&sysfs_workq_mutex); + queue_work(sysfs_workqueue, &ss->work); + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_schedule_callback); + + +EXPORT_SYMBOL_GPL(sysfs_create_file); +EXPORT_SYMBOL_GPL(sysfs_remove_file); +EXPORT_SYMBOL_GPL(sysfs_remove_files); +EXPORT_SYMBOL_GPL(sysfs_create_files); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c new file mode 100644 index 00000000..194414f8 --- /dev/null +++ b/fs/sysfs/group.c @@ -0,0 +1,207 @@ +/* + * fs/sysfs/group.c - Operations for adding/removing multiple files at once. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * + * This file is released under the GPL v2. + * + */ + +#include +#include +#include +#include +#include +#include "sysfs.h" + + +static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, + const struct attribute_group *grp) +{ + struct attribute *const* attr; + int i; + + for (i = 0, attr = grp->attrs; *attr; i++, attr++) + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); +} + +static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, + const struct attribute_group *grp, int update) +{ + struct attribute *const* attr; + int error = 0, i; + + for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { + mode_t mode = 0; + + /* in update mode, we're changing the permissions or + * visibility.
Do this by first removing then + * re-adding (if required) the file */ + if (update) + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); + if (grp->is_visible) { + mode = grp->is_visible(kobj, *attr, i); + if (!mode) + continue; + } + error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR, + (*attr)->mode | mode); + if (unlikely(error)) + break; + } + if (error) + remove_files(dir_sd, kobj, grp); + return error; +} + + +static int internal_create_group(struct kobject *kobj, int update, + const struct attribute_group *grp) +{ + struct sysfs_dirent *sd; + int error; + + BUG_ON(!kobj || (!update && !kobj->sd)); + + /* Updates may happen before the object has been instantiated */ + if (unlikely(update && !kobj->sd)) + return -EINVAL; + + if (grp->name) { + error = sysfs_create_subdir(kobj, grp->name, &sd); + if (error) + return error; + } else + sd = kobj->sd; + sysfs_get(sd); + error = create_files(sd, kobj, grp, update); + if (error) { + if (grp->name) + sysfs_remove_subdir(sd); + } + sysfs_put(sd); + return error; +} + +/** + * sysfs_create_group - given a directory kobject, create an attribute group + * @kobj: The kobject to create the group on + * @grp: The attribute group to create + * + * This function creates a group for the first time. It will explicitly + * warn and error if any of the attribute files being created already exist. + * + * Returns 0 on success or error. + */ +int sysfs_create_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + return internal_create_group(kobj, 0, grp); +} + +/** + * sysfs_update_group - given a directory kobject, update an attribute group + * @kobj: The kobject to update the group on + * @grp: The attribute group to update + * + * This function updates an attribute group. Unlike + * sysfs_create_group(), it will explicitly not warn or error if any + * of the attribute files being created already exist. Furthermore, + * if the visibility of the files has changed through the is_visible() + * callback, it will update the permissions and add or remove the + * relevant files. + * + * The primary use for this function is to call it after making a change + * that affects group visibility. + * + * Returns 0 on success or error. + */ +int sysfs_update_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + return internal_create_group(kobj, 1, grp); +} + + + +void sysfs_remove_group(struct kobject * kobj, + const struct attribute_group * grp) +{ + struct sysfs_dirent *dir_sd = kobj->sd; + struct sysfs_dirent *sd; + + if (grp->name) { + sd = sysfs_get_dirent(dir_sd, NULL, grp->name); + if (!sd) { + WARN(!sd, KERN_WARNING "sysfs group %p not found for " + "kobject '%s'\n", grp, kobject_name(kobj)); + return; + } + } else + sd = sysfs_get(dir_sd); + + remove_files(sd, kobj, grp); + if (grp->name) + sysfs_remove_subdir(sd); + + sysfs_put(sd); +} + +/** + * sysfs_merge_group - merge files into a pre-existing attribute group. + * @kobj: The kobject containing the group. + * @grp: The files to create and the attribute group they belong to. + * + * This function returns an error if the group doesn't exist or any of the + * files already exist in that group, in which case none of the new files + * are created. 
+ */ +int sysfs_merge_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + struct sysfs_dirent *dir_sd; + int error = 0; + struct attribute *const *attr; + int i; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); + if (!dir_sd) + return -ENOENT; + + for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) + error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); + if (error) { + while (--i >= 0) + sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name); + } + sysfs_put(dir_sd); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_merge_group); + +/** + * sysfs_unmerge_group - remove files from a pre-existing attribute group. + * @kobj: The kobject containing the group. + * @grp: The files to remove and the attribute group they belong to. + */ +void sysfs_unmerge_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + struct sysfs_dirent *dir_sd; + struct attribute *const *attr; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); + if (dir_sd) { + for (attr = grp->attrs; *attr; ++attr) + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); + sysfs_put(dir_sd); + } +} +EXPORT_SYMBOL_GPL(sysfs_unmerge_group); + + +EXPORT_SYMBOL_GPL(sysfs_create_group); +EXPORT_SYMBOL_GPL(sysfs_update_group); +EXPORT_SYMBOL_GPL(sysfs_remove_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c new file mode 100644 index 00000000..a494413e --- /dev/null +++ b/fs/sysfs/inode.c @@ -0,0 +1,367 @@ +/* + * fs/sysfs/inode.c - basic sysfs inode and dentry operations + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sysfs.h" + +extern struct super_block * sysfs_sb; + +static const struct address_space_operations sysfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static struct backing_dev_info sysfs_backing_dev_info = { + .name = "sysfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static const struct inode_operations sysfs_inode_operations ={ + .permission = sysfs_permission, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .setxattr = sysfs_setxattr, +}; + +int __init sysfs_inode_init(void) +{ + return bdi_init(&sysfs_backing_dev_info); +} + +static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *attrs; + struct iattr *iattrs; + + attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!attrs) + return NULL; + iattrs = &attrs->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = sd->s_mode; + iattrs->ia_uid = 0; + iattrs->ia_gid = 0; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + return attrs; +} + +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr) +{ + struct sysfs_inode_attrs *sd_attrs; + struct iattr *iattrs; + unsigned int ia_valid = iattr->ia_valid; + + sd_attrs = sd->s_iattr; + + if (!sd_attrs) { + /* setting attributes for the first time, allocate now */ + sd_attrs = sysfs_init_inode_attrs(sd); + if (!sd_attrs) + return -ENOMEM; + sd->s_iattr = sd_attrs; + } + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + 
iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = sd->s_mode = mode; + } + return 0; +} + +int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct sysfs_dirent *sd = dentry->d_fsdata; + int error; + + if (!sd) + return -EINVAL; + + mutex_lock(&sysfs_mutex); + error = inode_change_ok(inode, iattr); + if (error) + goto out; + + error = sysfs_sd_setattr(sd, iattr); + if (error) + goto out; + + /* this ignores size changes */ + setattr_copy(inode, iattr); + +out: + mutex_unlock(&sysfs_mutex); + return error; +} + +static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len) +{ + struct sysfs_inode_attrs *iattrs; + void *old_secdata; + size_t old_secdata_len; + + if (!sd->s_iattr) { + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + } + + iattrs = sd->s_iattr; + old_secdata = iattrs->ia_secdata; + old_secdata_len = iattrs->ia_secdata_len; + + iattrs->ia_secdata = *secdata; + iattrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + void *secdata; + int error; + u32 secdata_len = 0; + + if (!sd) + return -EINVAL; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + goto out; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + goto out; + + mutex_lock(&sysfs_mutex); + error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); + mutex_unlock(&sysfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); + } else + return -EINVAL; +out: + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +static int sysfs_count_nlink(struct sysfs_dirent *sd) +{ + struct sysfs_dirent *child; + int nr = 0; + + for (child = sd->s_dir.children; child; child = child->s_sibling) + if (sysfs_type(child) == SYSFS_DIR) + nr++; + + return nr + 2; +} + +static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) +{ + struct sysfs_inode_attrs *iattrs = sd->s_iattr; + + inode->i_mode = sd->s_mode; + if (iattrs) { + /* sysfs_dirent has non-default attributes + * get them from persistent copy in sysfs_dirent + */ + set_inode_attr(inode, &iattrs->ia_iattr); + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); + } + + if (sysfs_type(sd) == SYSFS_DIR) + inode->i_nlink = sysfs_count_nlink(sd); +} + +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct 
sysfs_dirent *sd = dentry->d_fsdata; + struct inode *inode = dentry->d_inode; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + +static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) +{ + struct bin_attribute *bin_attr; + + inode->i_private = sysfs_get(sd); + inode->i_mapping->a_ops = &sysfs_aops; + inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; + inode->i_op = &sysfs_inode_operations; + + set_default_inode_attr(inode, sd->s_mode); + sysfs_refresh_inode(sd, inode); + + /* initialize inode according to type */ + switch (sysfs_type(sd)) { + case SYSFS_DIR: + inode->i_op = &sysfs_dir_inode_operations; + inode->i_fop = &sysfs_dir_operations; + break; + case SYSFS_KOBJ_ATTR: + inode->i_size = PAGE_SIZE; + inode->i_fop = &sysfs_file_operations; + break; + case SYSFS_KOBJ_BIN_ATTR: + bin_attr = sd->s_bin_attr.bin_attr; + inode->i_size = bin_attr->size; + inode->i_fop = &bin_fops; + break; + case SYSFS_KOBJ_LINK: + inode->i_op = &sysfs_symlink_inode_operations; + break; + default: + BUG(); + } + + unlock_new_inode(inode); +} + +/** + * sysfs_get_inode - get inode for sysfs_dirent + * @sb: super block + * @sd: sysfs_dirent to allocate inode for + * + * Get inode for @sd. If such inode doesn't exist, a new inode + * is allocated and basics are initialized. New inode is + * returned locked. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * Pointer to allocated inode on success, NULL on failure. + */ +struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) +{ + struct inode *inode; + + inode = iget_locked(sb, sd->s_ino); + if (inode && (inode->i_state & I_NEW)) + sysfs_init_inode(sd, inode); + + return inode; +} + +/* + * The sysfs_dirent serves as both an inode and a directory entry for sysfs. + * To prevent the sysfs inode numbers from being freed prematurely we take a + * reference to sysfs_dirent from the sysfs inode. A + * super_operations.evict_inode() implementation is needed to drop that + * reference upon inode destruction. + */ +void sysfs_evict_inode(struct inode *inode) +{ + struct sysfs_dirent *sd = inode->i_private; + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + sysfs_put(sd); +} + +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name) +{ + struct sysfs_addrm_cxt acxt; + struct sysfs_dirent *sd; + + if (!dir_sd) + return -ENOENT; + + sysfs_addrm_start(&acxt, dir_sd); + + sd = sysfs_find_dirent(dir_sd, ns, name); + if (sd && (sd->s_ns != ns)) + sd = NULL; + if (sd) + sysfs_remove_one(&acxt, sd); + + sysfs_addrm_finish(&acxt); + + if (sd) + return 0; + else + return -ENOENT; +} + +int sysfs_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct sysfs_dirent *sd; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + sd = inode->i_private; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + return generic_permission(inode, mask, flags, NULL); +} diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c new file mode 100644 index 00000000..e34f0d99 --- /dev/null +++ b/fs/sysfs/mount.c @@ -0,0 +1,201 @@ +/* + * fs/sysfs/symlink.c - operations for initializing and mounting sysfs + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. 
+ * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#define DEBUG + +#include +#include +#include +#include +#include +#include +#include + +#include "sysfs.h" + + +static struct vfsmount *sysfs_mnt; +struct kmem_cache *sysfs_dir_cachep; + +static const struct super_operations sysfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = sysfs_evict_inode, +}; + +struct sysfs_dirent sysfs_root = { + .s_name = "", + .s_count = ATOMIC_INIT(1), + .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), + .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + .s_ino = 1, +}; + +static int sysfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = SYSFS_MAGIC; + sb->s_op = &sysfs_ops; + sb->s_time_gran = 1; + + /* get root inode, initialize and unlock it */ + mutex_lock(&sysfs_mutex); + inode = sysfs_get_inode(sb, &sysfs_root); + mutex_unlock(&sysfs_mutex); + if (!inode) { + pr_debug("sysfs: could not get root inode\n"); + return -ENOMEM; + } + + /* instantiate and link root dentry */ + root = d_alloc_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n",__func__); + iput(inode); + return -ENOMEM; + } + root->d_fsdata = &sysfs_root; + sb->s_root = root; + return 0; +} + +static int sysfs_test_super(struct super_block *sb, void *data) +{ + struct sysfs_super_info *sb_info = sysfs_info(sb); + struct sysfs_super_info *info = data; + enum kobj_ns_type type; + int found = 1; + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { + if (sb_info->ns[type] != info->ns[type]) + found = 0; + } + return found; +} + +static int sysfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + +static void free_sysfs_super_info(struct sysfs_super_info *info) +{ + int type; + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + kobj_ns_drop(type, info->ns[type]); + kfree(info); +} + +static struct dentry *sysfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct sysfs_super_info *info; + enum kobj_ns_type type; + struct super_block *sb; + int error; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + info->ns[type] = kobj_ns_grab_current(type); + + sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + free_sysfs_super_info(info); + if (IS_ERR(sb)) + return ERR_CAST(sb); + if (!sb->s_root) { + sb->s_flags = flags; + error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(sb); + return ERR_PTR(error); + } + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +static void sysfs_kill_sb(struct super_block *sb) +{ + struct sysfs_super_info *info = sysfs_info(sb); + /* Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing sysfs_super_info. 
+ */ + kill_anon_super(sb); + free_sysfs_super_info(info); +} + +static struct file_system_type sysfs_fs_type = { + .name = "sysfs", + .mount = sysfs_mount, + .kill_sb = sysfs_kill_sb, +}; + +int __init sysfs_init(void) +{ + int err = -ENOMEM; + + sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", + sizeof(struct sysfs_dirent), + 0, 0, NULL); + if (!sysfs_dir_cachep) + goto out; + + err = sysfs_inode_init(); + if (err) + goto out_err; + + err = register_filesystem(&sysfs_fs_type); + if (!err) { + sysfs_mnt = kern_mount(&sysfs_fs_type); + if (IS_ERR(sysfs_mnt)) { + printk(KERN_ERR "sysfs: could not mount!\n"); + err = PTR_ERR(sysfs_mnt); + sysfs_mnt = NULL; + unregister_filesystem(&sysfs_fs_type); + goto out_err; + } + } else + goto out_err; +out: + return err; +out_err: + kmem_cache_destroy(sysfs_dir_cachep); + sysfs_dir_cachep = NULL; + goto out; +} + +#undef sysfs_get +struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) +{ + return __sysfs_get(sd); +} +EXPORT_SYMBOL_GPL(sysfs_get); + +#undef sysfs_put +void sysfs_put(struct sysfs_dirent *sd) +{ + __sysfs_put(sd); +} +EXPORT_SYMBOL_GPL(sysfs_put); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c new file mode 100644 index 00000000..a7ac78f8 --- /dev/null +++ b/fs/sysfs/symlink.c @@ -0,0 +1,307 @@ +/* + * fs/sysfs/symlink.c - sysfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. + * + * Please see Documentation/filesystems/sysfs.txt for more information. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sysfs.h" + +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) +{ + struct sysfs_dirent *parent_sd = NULL; + struct sysfs_dirent *target_sd = NULL; + struct sysfs_dirent *sd = NULL; + struct sysfs_addrm_cxt acxt; + enum kobj_ns_type ns_type; + int error; + + BUG_ON(!name); + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + error = -EFAULT; + if (!parent_sd) + goto out_put; + + /* target->sd can go away beneath us but is protected with + * sysfs_assoc_lock. Fetch target_sd from it. + */ + spin_lock(&sysfs_assoc_lock); + if (target->sd) + target_sd = sysfs_get(target->sd); + spin_unlock(&sysfs_assoc_lock); + + error = -ENOENT; + if (!target_sd) + goto out_put; + + error = -ENOMEM; + sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); + if (!sd) + goto out_put; + + ns_type = sysfs_ns_type(parent_sd); + if (ns_type) + sd->s_ns = target->ktype->namespace(target); + sd->s_symlink.target_sd = target_sd; + target_sd = NULL; /* reference is now owned by the symlink */ + + sysfs_addrm_start(&acxt, parent_sd); + /* Symlinks must be between directories with the same ns_type */ + if (!ns_type || + (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); + } else { + error = -EINVAL; + WARN(1, KERN_WARNING + "sysfs: symlink across ns_types %s/%s -> %s/%s\n", + parent_sd->s_name, + sd->s_name, + sd->s_symlink.target_sd->s_parent->s_name, + sd->s_symlink.target_sd->s_name); + } + sysfs_addrm_finish(&acxt); + + if (error) + goto out_put; + + return 0; + + out_put: + sysfs_put(target_sd); + sysfs_put(sd); + return error; +} + +/** + * sysfs_create_link - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. 
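/*
 * Illustrative aside (not part of the original patch): sysfs_init() above
 * builds its state in stages (inode cache, filesystem registration, internal
 * mount) and on failure unwinds only the stages that already succeeded.
 * A minimal userspace sketch of that staged-setup/teardown shape, with
 * hypothetical resources standing in for the real ones, is:
 */
#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
	void *cache, *registration;
	int err = -1;

	cache = malloc(64);		/* stage 1: the kmem_cache analogue */
	if (!cache)
		goto out;

	registration = malloc(64);	/* stage 2: register_filesystem analogue */
	if (!registration)
		goto out_cache;

	printf("all stages up\n");	/* both resources kept (leaked here for brevity) */
	return 0;

out_cache:
	free(cache);			/* undo stage 1 only if stage 2 failed */
out:
	return err;
}

int main(void)
{
	return setup() ? 1 : 0;
}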
+ * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 1); +} + +/** + * sysfs_create_link_nowarn - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + * + * This function does the same as sysf_create_link(), but it + * doesn't warn if the link already exists. + */ +int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 0); +} + +/** + * sysfs_delete_link - remove symlink in object's directory. + * @kobj: object we're acting for. + * @targ: object we're pointing to. + * @name: name of the symlink to remove. + * + * Unlike sysfs_remove_link sysfs_delete_link has enough information + * to successfully delete symlinks in tagged directories. + */ +void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, + const char *name) +{ + const void *ns = NULL; + spin_lock(&sysfs_assoc_lock); + if (targ->sd && sysfs_ns_type(kobj->sd)) + ns = targ->sd->s_ns; + spin_unlock(&sysfs_assoc_lock); + sysfs_hash_and_remove(kobj->sd, ns, name); +} + +/** + * sysfs_remove_link - remove symlink in object's directory. + * @kobj: object we're acting for. + * @name: name of the symlink to remove. + */ + +void sysfs_remove_link(struct kobject * kobj, const char * name) +{ + struct sysfs_dirent *parent_sd = NULL; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + sysfs_hash_and_remove(parent_sd, NULL, name); +} + +/** + * sysfs_rename_link - rename symlink in object's directory. + * @kobj: object we're acting for. + * @targ: object we're pointing to. + * @old: previous name of the symlink. + * @new: new name of the symlink. + * + * A helper function for the common rename symlink idiom. 
+ */ +int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, + const char *old, const char *new) +{ + struct sysfs_dirent *parent_sd, *sd = NULL; + const void *old_ns = NULL, *new_ns = NULL; + int result; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + if (targ->sd) + old_ns = targ->sd->s_ns; + + result = -ENOENT; + sd = sysfs_get_dirent(parent_sd, old_ns, old); + if (!sd) + goto out; + + result = -EINVAL; + if (sysfs_type(sd) != SYSFS_KOBJ_LINK) + goto out; + if (sd->s_symlink.target_sd->s_dir.kobj != targ) + goto out; + + if (sysfs_ns_type(parent_sd)) + new_ns = targ->ktype->namespace(targ); + + result = sysfs_rename(sd, parent_sd, new_ns, new); + +out: + sysfs_put(sd); + return result; +} + +static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, + struct sysfs_dirent *target_sd, char *path) +{ + struct sysfs_dirent *base, *sd; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent_sd; + while (base->s_parent) { + sd = target_sd->s_parent; + while (sd->s_parent && base != sd) + sd = sd->s_parent; + + if (base == sd) + break; + + strcpy(s, "../"); + s += 3; + base = base->s_parent; + } + + /* determine end of target string for reverse fillup */ + sd = target_sd; + while (sd->s_parent && sd != base) { + len += strlen(sd->s_name) + 1; + sd = sd->s_parent; + } + + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len > PATH_MAX) + return -ENAMETOOLONG; + + /* reverse fillup of target string from target to base */ + sd = target_sd; + while (sd->s_parent && sd != base) { + int slen = strlen(sd->s_name); + + len -= slen; + strncpy(s + len, sd->s_name, slen); + if (len) + s[--len] = '/'; + + sd = sd->s_parent; + } + + return 0; +} + +static int sysfs_getlink(struct dentry *dentry, char * path) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_dirent *parent_sd = sd->s_parent; + struct sysfs_dirent *target_sd = sd->s_symlink.target_sd; + int error; + + mutex_lock(&sysfs_mutex); + error = sysfs_get_target_path(parent_sd, target_sd, path); + mutex_unlock(&sysfs_mutex); + + return error; +} + +static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + if (page) { + error = sysfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } + nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); + return NULL; +} + +static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + char *page = nd_get_link(nd); + if (!IS_ERR(page)) + free_page((unsigned long)page); +} + +const struct inode_operations sysfs_symlink_inode_operations = { + .setxattr = sysfs_setxattr, + .readlink = generic_readlink, + .follow_link = sysfs_follow_link, + .put_link = sysfs_put_link, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .permission = sysfs_permission, +}; + + +EXPORT_SYMBOL_GPL(sysfs_create_link); +EXPORT_SYMBOL_GPL(sysfs_remove_link); +EXPORT_SYMBOL_GPL(sysfs_rename_link); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h new file mode 100644 index 00000000..2ed2404f --- /dev/null +++ b/fs/sysfs/sysfs.h @@ -0,0 +1,231 @@ +/* + * fs/sysfs/sysfs.h - sysfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * This file is released under the GPLv2. 
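/*
 * Illustrative aside (not part of the original patch): sysfs_get_target_path()
 * above builds the link body by walking up from the link's parent to the
 * closest common ancestor (emitting one "../" per level) and then filling in
 * the target's component names from that ancestor downwards. A simplified
 * userspace sketch of the same idea on plain path strings (hypothetical
 * helper, no PATH_MAX handling, not necessarily the shortest result) is:
 */
#include <stdio.h>
#include <string.h>

/* number of path components in a '/'-separated string like "a/b/c" */
static int depth(const char *p)
{
	int d = *p ? 1 : 0;

	for (; *p; p++)
		if (*p == '/')
			d++;
	return d;
}

static void make_relative(const char *parent, const char *target, char *out)
{
	size_t common = 0;

	/* longest common prefix that ends on a component boundary */
	while (parent[common] && parent[common] == target[common])
		common++;
	while (common && parent[common - 1] != '/')
		common--;

	out[0] = '\0';
	for (int i = 0; i < depth(parent + common); i++)
		strcat(out, "../");		/* climb to the common ancestor */
	strcat(out, target + common);		/* descend to the target */
}

int main(void)
{
	char buf[256];

	make_relative("devices/pci0000:00", "bus/pci/drivers", buf);
	printf("%s\n", buf);	/* ../../bus/pci/drivers */
	return 0;
}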
+ */ + +#include +#include +#include + +struct sysfs_open_dirent; + +/* type-specific structures for sysfs_dirent->s_* union members */ +struct sysfs_elem_dir { + struct kobject *kobj; + /* children list starts here and goes through sd->s_sibling */ + struct sysfs_dirent *children; +}; + +struct sysfs_elem_symlink { + struct sysfs_dirent *target_sd; +}; + +struct sysfs_elem_attr { + struct attribute *attr; + struct sysfs_open_dirent *open; +}; + +struct sysfs_elem_bin_attr { + struct bin_attribute *bin_attr; + struct hlist_head buffers; +}; + +struct sysfs_inode_attrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; +}; + +/* + * sysfs_dirent - the building block of sysfs hierarchy. Each and + * every sysfs node is represented by single sysfs_dirent. + * + * As long as s_count reference is held, the sysfs_dirent itself is + * accessible. Dereferencing s_elem or any other outer entity + * requires s_active reference. + */ +struct sysfs_dirent { + atomic_t s_count; + atomic_t s_active; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif + struct sysfs_dirent *s_parent; + struct sysfs_dirent *s_sibling; + const char *s_name; + + const void *s_ns; /* namespace tag */ + union { + struct sysfs_elem_dir s_dir; + struct sysfs_elem_symlink s_symlink; + struct sysfs_elem_attr s_attr; + struct sysfs_elem_bin_attr s_bin_attr; + }; + + unsigned int s_flags; + unsigned short s_mode; + ino_t s_ino; + struct sysfs_inode_attrs *s_iattr; +}; + +#define SD_DEACTIVATED_BIAS INT_MIN + +#define SYSFS_TYPE_MASK 0x00ff +#define SYSFS_DIR 0x0001 +#define SYSFS_KOBJ_ATTR 0x0002 +#define SYSFS_KOBJ_BIN_ATTR 0x0004 +#define SYSFS_KOBJ_LINK 0x0008 +#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) +#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) + +/* identify any namespace tag on sysfs_dirents */ +#define SYSFS_NS_TYPE_MASK 0xff00 +#define SYSFS_NS_TYPE_SHIFT 8 + +#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) +#define SYSFS_FLAG_REMOVED 0x020000 + +static inline unsigned int sysfs_type(struct sysfs_dirent *sd) +{ + return sd->s_flags & SYSFS_TYPE_MASK; +} + +/* + * Return any namespace tags on this dirent. + * enum kobj_ns_type is defined in linux/kobject.h + */ +static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) +{ + return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define sysfs_dirent_init_lockdep(sd) \ +do { \ + struct attribute *attr = sd->s_attr.attr; \ + struct lock_class_key *key = attr->key; \ + if (!key) \ + key = &attr->skey; \ + \ + lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ +} while(0) +#else +#define sysfs_dirent_init_lockdep(sd) do {} while(0) +#endif + +/* + * Context structure to be used while adding/removing nodes. + */ +struct sysfs_addrm_cxt { + struct sysfs_dirent *parent_sd; + struct sysfs_dirent *removed; +}; + +/* + * mount.c + */ + +/* + * Each sb is associated with a set of namespace tags (i.e. + * the network namespace of the task which mounted this sysfs + * instance). 
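/*
 * Illustrative aside (not part of the original patch): in the header above a
 * single s_flags word carries both the dirent type (low byte, SYSFS_TYPE_MASK)
 * and the namespace type (next byte, shifted by SYSFS_NS_TYPE_SHIFT). A
 * self-contained sketch of that mask-and-shift packing, with toy values, is:
 */
#include <stdio.h>

#define TOY_TYPE_MASK     0x00ff
#define TOY_NS_TYPE_MASK  0xff00
#define TOY_NS_TYPE_SHIFT 8

enum toy_type   { TOY_DIR = 0x01, TOY_ATTR = 0x02, TOY_LINK = 0x08 };
enum toy_nstype { TOY_NS_NONE = 0, TOY_NS_NET = 1 };

static unsigned int pack(enum toy_type t, enum toy_nstype ns)
{
	return (unsigned int)t | ((unsigned int)ns << TOY_NS_TYPE_SHIFT);
}

int main(void)
{
	unsigned int flags = pack(TOY_DIR, TOY_NS_NET);

	printf("type 0x%02x, ns type %u\n",
	       flags & TOY_TYPE_MASK,
	       (flags & TOY_NS_TYPE_MASK) >> TOY_NS_TYPE_SHIFT);
	return 0;
}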
+ */ +struct sysfs_super_info { + void *ns[KOBJ_NS_TYPES]; +}; +#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) +extern struct sysfs_dirent sysfs_root; +extern struct kmem_cache *sysfs_dir_cachep; + +/* + * dir.c + */ +extern struct mutex sysfs_mutex; +extern spinlock_t sysfs_assoc_lock; + +extern const struct file_operations sysfs_dir_operations; +extern const struct inode_operations sysfs_dir_inode_operations; + +struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); +struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); +void sysfs_put_active(struct sysfs_dirent *sd); +void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, + struct sysfs_dirent *parent_sd); +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); +void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); +void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); + +struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const void *ns, + const unsigned char *name); +struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const void *ns, + const unsigned char *name); +struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); + +void release_sysfs_dirent(struct sysfs_dirent *sd); + +int sysfs_create_subdir(struct kobject *kobj, const char *name, + struct sysfs_dirent **p_sd); +void sysfs_remove_subdir(struct sysfs_dirent *sd); + +int sysfs_rename(struct sysfs_dirent *sd, + struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name); + +static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) +{ + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } + return sd; +} +#define sysfs_get(sd) __sysfs_get(sd) + +static inline void __sysfs_put(struct sysfs_dirent *sd) +{ + if (sd && atomic_dec_and_test(&sd->s_count)) + release_sysfs_dirent(sd); +} +#define sysfs_put(sd) __sysfs_put(sd) + +/* + * inode.c + */ +struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); +void sysfs_evict_inode(struct inode *inode); +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); +int sysfs_permission(struct inode *inode, int mask, unsigned int flags); +int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name); +int sysfs_inode_init(void); + +/* + * file.c + */ +extern const struct file_operations sysfs_file_operations; + +int sysfs_add_file(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type); + +int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type, mode_t amode); +/* + * bin.c + */ +extern const struct file_operations bin_fops; +void unmap_bin_file(struct sysfs_dirent *attr_sd); + +/* + * symlink.c + */ +extern const struct inode_operations sysfs_symlink_inode_operations; diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig new file mode 100644 index 00000000..33aeb4b7 --- /dev/null +++ b/fs/sysv/Kconfig @@ -0,0 +1,36 @@ +config SYSV_FS + tristate "System V/Xenix/V7/Coherent file system support" + depends on BLOCK + help + SCO, Xenix and Coherent are commercial Unix systems for Intel + machines, and Version 7 was used on 
the DEC PDP-11. Saying Y + here would allow you to read from their floppies and hard disk + partitions. + + If you have floppies or hard disk partitions like that, it is likely + that they contain binaries from those other Unix systems; in order + to run these binaries, you will want to install linux-abi which is + a set of kernel modules that lets you run SCO, Xenix, Wyse, + UnixWare, Dell Unix and System V programs under Linux. It is + available via FTP (user: ftp) from + ). + NOTE: that will work only for binaries from Intel-based systems; + PDP ones will have to wait until somebody ports Linux to -11 ;-) + + If you only intend to mount files from some other Unix over the + network using NFS, you don't need the System V file system support + (but you need NFS file system support obviously). + + Note that this option is generally not needed for floppies, since a + good portable way to transport files and directories between unixes + (and even other operating systems) is given by the tar program ("man + tar" or preferably "info tar"). Note also that this option has + nothing whatsoever to do with the option "System V IPC". Read about + the System V file system in + . + Saying Y here will enlarge your kernel by about 27 KB. + + To compile this as a module, choose M here: the module will be called + sysv. + + If you haven't heard about all of this before, it's safe to say N. diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile new file mode 100644 index 00000000..3591f9d7 --- /dev/null +++ b/fs/sysv/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux SystemV/Coherent filesystem routines. +# + +obj-$(CONFIG_SYSV_FS) += sysv.o + +sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \ + namei.o super.o symlink.o diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c new file mode 100644 index 00000000..9a6ad96a --- /dev/null +++ b/fs/sysv/balloc.c @@ -0,0 +1,239 @@ +/* + * linux/fs/sysv/balloc.c + * + * minix/bitmap.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext/freelists.c + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * + * xenix/alloc.c + * Copyright (C) 1992 Doug Evans + * + * coh/alloc.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/balloc.c + * Copyright (C) 1993 Bruno Haible + * + * This file contains code for allocating/freeing blocks. + */ + +#include +#include +#include "sysv.h" + +/* We don't trust the value of + sb->sv_sbd2->s_tfree = *sb->sv_free_blocks + but we nevertheless keep it up to date. */ + +static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh) +{ + char *bh_data = bh->b_data; + + if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4) + return (sysv_zone_t*)(bh_data+4); + else + return (sysv_zone_t*)(bh_data+2); +} + +/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */ + +void sysv_free_block(struct super_block * sb, sysv_zone_t nr) +{ + struct sysv_sb_info * sbi = SYSV_SB(sb); + struct buffer_head * bh; + sysv_zone_t *blocks = sbi->s_bcache; + unsigned count; + unsigned block = fs32_to_cpu(sbi, nr); + + /* + * This code does not work at all for AFS (it has a bitmap + * free list). As AFS is supposed to be read-only no one + * should call this for an AFS filesystem anyway... 
+ */ + if (sbi->s_type == FSTYPE_AFS) + return; + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { + printk("sysv_free_block: trying to free block not in datazone\n"); + return; + } + + lock_super(sb); + count = fs16_to_cpu(sbi, *sbi->s_bcache_count); + + if (count > sbi->s_flc_size) { + printk("sysv_free_block: flc_count > flc_size\n"); + unlock_super(sb); + return; + } + /* If the free list head in super-block is full, it is copied + * into this block being freed, ditto if it's completely empty + * (applies only on Coherent). + */ + if (count == sbi->s_flc_size || count == 0) { + block += sbi->s_block_base; + bh = sb_getblk(sb, block); + if (!bh) { + printk("sysv_free_block: getblk() failed\n"); + unlock_super(sb); + return; + } + memset(bh->b_data, 0, sb->s_blocksize); + *(__fs16*)bh->b_data = cpu_to_fs16(sbi, count); + memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t)); + mark_buffer_dirty(bh); + set_buffer_uptodate(bh); + brelse(bh); + count = 0; + } + sbi->s_bcache[count++] = nr; + + *sbi->s_bcache_count = cpu_to_fs16(sbi, count); + fs32_add(sbi, sbi->s_free_blocks, 1); + dirty_sb(sb); + unlock_super(sb); +} + +sysv_zone_t sysv_new_block(struct super_block * sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + unsigned int block; + sysv_zone_t nr; + struct buffer_head * bh; + unsigned count; + + lock_super(sb); + count = fs16_to_cpu(sbi, *sbi->s_bcache_count); + + if (count == 0) /* Applies only to Coherent FS */ + goto Enospc; + nr = sbi->s_bcache[--count]; + if (nr == 0) /* Applies only to Xenix FS, SystemV FS */ + goto Enospc; + + block = fs32_to_cpu(sbi, nr); + + *sbi->s_bcache_count = cpu_to_fs16(sbi, count); + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { + printk("sysv_new_block: new block %d is not in data zone\n", + block); + goto Enospc; + } + + if (count == 0) { /* the last block continues the free list */ + unsigned count; + + block += sbi->s_block_base; + if (!(bh = sb_bread(sb, block))) { + printk("sysv_new_block: cannot read free-list block\n"); + /* retry this same block next time */ + *sbi->s_bcache_count = cpu_to_fs16(sbi, 1); + goto Enospc; + } + count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); + if (count > sbi->s_flc_size) { + printk("sysv_new_block: free-list block with >flc_size entries\n"); + brelse(bh); + goto Enospc; + } + *sbi->s_bcache_count = cpu_to_fs16(sbi, count); + memcpy(sbi->s_bcache, get_chunk(sb, bh), + count * sizeof(sysv_zone_t)); + brelse(bh); + } + /* Now the free list head in the superblock is valid again. */ + fs32_add(sbi, sbi->s_free_blocks, -1); + dirty_sb(sb); + unlock_super(sb); + return nr; + +Enospc: + unlock_super(sb); + return 0; +} + +unsigned long sysv_count_free_blocks(struct super_block * sb) +{ + struct sysv_sb_info * sbi = SYSV_SB(sb); + int sb_count; + int count; + struct buffer_head * bh = NULL; + sysv_zone_t *blocks; + unsigned block; + int n; + + /* + * This code does not work at all for AFS (it has a bitmap + * free list). As AFS is supposed to be read-only we just + * lie and say it has no free block at all. + */ + if (sbi->s_type == FSTYPE_AFS) + return 0; + + lock_super(sb); + sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks); + + if (0) + goto trust_sb; + + /* this causes a lot of disk traffic ... 
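/*
 * Illustrative aside (not part of the original patch): sysv_free_block() and
 * sysv_new_block() above keep a small in-core cache of free block numbers
 * and, when the cache fills up, spill it into the block being freed, which
 * then becomes the next link of an on-disk chain. A compact userspace model
 * of that chained free list (toy "disk", no endianness or locking) is:
 */
#include <stdio.h>

#define NBLOCKS   32
#define FLC_SIZE  4

static unsigned disk[NBLOCKS][FLC_SIZE + 1];	/* [0] = count, rest = list */
static unsigned cache[FLC_SIZE], cache_count;

static void toy_free_block(unsigned nr)
{
	if (cache_count == FLC_SIZE) {		/* spill the cache into block nr */
		disk[nr][0] = cache_count;
		for (unsigned i = 0; i < cache_count; i++)
			disk[nr][i + 1] = cache[i];
		cache_count = 0;
	}
	cache[cache_count++] = nr;
}

static unsigned toy_new_block(void)
{
	unsigned nr;

	if (!cache_count)
		return 0;			/* out of space */
	nr = cache[--cache_count];
	if (!cache_count && disk[nr][0]) {	/* reload the chain stored in nr */
		cache_count = disk[nr][0];
		for (unsigned i = 0; i < cache_count; i++)
			cache[i] = disk[nr][i + 1];
	}
	return nr;
}

int main(void)
{
	for (unsigned nr = 10; nr < 20; nr++)	/* free ten blocks */
		toy_free_block(nr);
	for (int i = 0; i < 12; i++)		/* allocate until exhausted */
		printf("%u ", toy_new_block());
	printf("\n");
	return 0;
}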
*/ + count = 0; + n = fs16_to_cpu(sbi, *sbi->s_bcache_count); + blocks = sbi->s_bcache; + while (1) { + sysv_zone_t zone; + if (n > sbi->s_flc_size) + goto E2big; + zone = 0; + while (n && (zone = blocks[--n]) != 0) + count++; + if (zone == 0) + break; + + block = fs32_to_cpu(sbi, zone); + if (bh) + brelse(bh); + + if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) + goto Einval; + block += sbi->s_block_base; + bh = sb_bread(sb, block); + if (!bh) + goto Eio; + n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); + blocks = get_chunk(sb, bh); + } + if (bh) + brelse(bh); + if (count != sb_count) + goto Ecount; +done: + unlock_super(sb); + return count; + +Einval: + printk("sysv_count_free_blocks: new block %d is not in data zone\n", + block); + goto trust_sb; +Eio: + printk("sysv_count_free_blocks: cannot read free-list block\n"); + goto trust_sb; +E2big: + printk("sysv_count_free_blocks: >flc_size entries in free-list block\n"); + if (bh) + brelse(bh); +trust_sb: + count = sb_count; + goto done; +Ecount: + printk("sysv_count_free_blocks: free block count was %d, " + "correcting to %d\n", sb_count, count); + if (!(sb->s_flags & MS_RDONLY)) { + *sbi->s_free_blocks = cpu_to_fs32(sbi, count); + dirty_sb(sb); + } + goto done; +} diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c new file mode 100644 index 00000000..a77c4215 --- /dev/null +++ b/fs/sysv/dir.c @@ -0,0 +1,377 @@ +/* + * linux/fs/sysv/dir.c + * + * minix/dir.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * coh/dir.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/dir.c + * Copyright (C) 1993 Bruno Haible + * + * SystemV/Coherent directory handling functions + */ + +#include +#include +#include +#include "sysv.h" + +static int sysv_readdir(struct file *, void *, filldir_t); + +const struct file_operations sysv_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = sysv_readdir, + .fsync = generic_file_fsync, +}; + +static inline void dir_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} + +static struct page * dir_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) + kmap(page); + return page; +} + +static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + unsigned long pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); + + pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); + if (pos >= inode->i_size) + goto done; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct sysv_dir_entry *de; + struct page *page = dir_get_page(inode, n); + + if (IS_ERR(page)) + continue; + kaddr = (char *)page_address(page); + de = (struct sysv_dir_entry 
*)(kaddr+offset); + limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE; + for ( ;(char*)de <= limit; de++) { + char *name = de->name; + int over; + + if (!de->inode) + continue; + + offset = (char *)de - kaddr; + + over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN), + ((loff_t)n<<PAGE_CACHE_SHIFT) | offset, + fs16_to_cpu(SYSV_SB(sb), de->inode), + DT_UNKNOWN); + if (over) { + dir_put_page(page); + goto done; + } + } + dir_put_page(page); + } + +done: + filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; + return 0; +} + +/* compare strings: name[0..len-1] (not zero-terminated) and + * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1]) + */ +static inline int namecompare(int len, int maxlen, + const char * name, const char * buffer) +{ + if (len < maxlen && buffer[len]) + return 0; + return !memcmp(name, buffer, len); +} + +/* + * sysv_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + */ +struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page) +{ + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct inode * dir = dentry->d_parent->d_inode; + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct sysv_dir_entry *de; + + *res_page = NULL; + + start = SYSV_I(dir)->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + + do { + char *kaddr; + page = dir_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = (char*)page_address(page); + de = (struct sysv_dir_entry *) kaddr; + kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE; + for ( ; (char *) de <= kaddr ; de++) { + if (!de->inode) + continue; + if (namecompare(namelen, SYSV_NAMELEN, + name, de->name)) + goto found; + } + dir_put_page(page); + } + + if (++n >= npages) + n = 0; + } while (n != start); + + return NULL; + +found: + SYSV_I(dir)->i_dir_start_lookup = n; + *res_page = page; + return de; +} + +int sysv_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char * name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct page *page = NULL; + struct sysv_dir_entry * de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + /* We take care of directory expansion in the same loop */ + for (n = 0; n <= npages; n++) { + page = dir_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + kaddr = (char*)page_address(page); + de = (struct sysv_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE; + while ((char *)de <= kaddr) { + if (!de->inode) + goto got_it; + err = -EEXIST; + if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) + goto out_page; + de++; + } + dir_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char*)de - (char*)page_address(page); + lock_page(page); + err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + if (err) + goto out_unlock; + memcpy (de->name, name, namelen); + memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); + err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +out_page: + dir_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_page; +} + +int
sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr = (char*)page_address(page); + loff_t pos = page_offset(page) + (char *)de - kaddr; + int err; + + lock_page(page); + err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + BUG_ON(err); + de->inode = 0; + err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_put_page(page); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return err; +} + +int sysv_make_empty(struct inode *inode, struct inode *dir) +{ + struct page *page = grab_cache_page(inode->i_mapping, 0); + struct sysv_dir_entry * de; + char *base; + int err; + + if (!page) + return -ENOMEM; + err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE); + if (err) { + unlock_page(page); + goto fail; + } + kmap(page); + + base = (char*)page_address(page); + memset(base, 0, PAGE_CACHE_SIZE); + + de = (struct sysv_dir_entry *) base; + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); + strcpy(de->name,"."); + de++; + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); + strcpy(de->name,".."); + + kunmap(page); + err = dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int sysv_empty_dir(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct sysv_dir_entry * de; + page = dir_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = (char *)page_address(page); + de = (struct sysv_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE-SYSV_DIRSIZE; + + for ( ;(char *)de <= kaddr; de++) { + if (!de->inode) + continue; + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (!de->name[1]) { + if (de->inode == cpu_to_fs16(SYSV_SB(sb), + inode->i_ino)) + continue; + goto not_empty; + } + if (de->name[1] != '.' 
|| de->name[2]) + goto not_empty; + } + dir_put_page(page); + } + return 1; + +not_empty: + dir_put_page(page); + return 0; +} + +/* Releases the page */ +void sysv_set_link(struct sysv_dir_entry *de, struct page *page, + struct inode *inode) +{ + struct inode *dir = page->mapping->host; + loff_t pos = page_offset(page) + + (char *)de-(char*)page_address(page); + int err; + + lock_page(page); + err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + BUG_ON(err); + de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); + err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} + +struct sysv_dir_entry * sysv_dotdot (struct inode *dir, struct page **p) +{ + struct page *page = dir_get_page(dir, 0); + struct sysv_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = (struct sysv_dir_entry*) page_address(page) + 1; + *p = page; + } + return de; +} + +ino_t sysv_inode_by_name(struct dentry *dentry) +{ + struct page *page; + struct sysv_dir_entry *de = sysv_find_entry (dentry, &page); + ino_t res = 0; + + if (de) { + res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); + dir_put_page(page); + } + return res; +} diff --git a/fs/sysv/file.c b/fs/sysv/file.c new file mode 100644 index 00000000..0a659395 --- /dev/null +++ b/fs/sysv/file.c @@ -0,0 +1,58 @@ +/* + * linux/fs/sysv/file.c + * + * minix/file.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * coh/file.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/file.c + * Copyright (C) 1993 Bruno Haible + * + * SystemV/Coherent regular file handling primitives + */ + +#include "sysv.h" + +/* + * We have mostly NULLs here: the current defaults are OK for + * the coh filesystem. + */ +const struct file_operations sysv_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +static int sysv_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations sysv_file_inode_operations = { + .truncate = sysv_truncate, + .setattr = sysv_setattr, + .getattr = sysv_getattr, +}; diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c new file mode 100644 index 00000000..0c96c98b --- /dev/null +++ b/fs/sysv/ialloc.c @@ -0,0 +1,234 @@ +/* + * linux/fs/sysv/ialloc.c + * + * minix/bitmap.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext/freelists.c + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * + * xenix/alloc.c + * Copyright (C) 1992 Doug Evans + * + * coh/alloc.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/ialloc.c + * Copyright (C) 1993 Bruno Haible + * + * This file contains code for allocating/freeing inodes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "sysv.h" + +/* We don't trust the value of + sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes + but we nevertheless keep it up to date. */ + +/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. 
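/*
 * Illustrative aside (not part of the original patch): a System V directory,
 * as handled by sysv_find_entry() and sysv_empty_dir() above, is just an
 * array of fixed-size slots (a 16-bit inode number followed by a short name),
 * and a directory counts as empty when every live slot is "." or "..". A toy
 * in-memory version of that scan, with hypothetical sizes, is:
 */
#include <stdio.h>
#include <string.h>

struct toy_dirent {
	unsigned short inode;	/* 0 means the slot is unused */
	char name[14];
};

static int toy_empty_dir(const struct toy_dirent *de, int nslots)
{
	for (int i = 0; i < nslots; i++) {
		if (!de[i].inode)
			continue;
		if (strcmp(de[i].name, ".") && strcmp(de[i].name, ".."))
			return 0;	/* found a real entry */
	}
	return 1;
}

int main(void)
{
	struct toy_dirent dir[4] = {
		{ 2, "." }, { 1, ".." }, { 0, "" }, { 5, "notes" },
	};

	printf("empty? %d\n", toy_empty_dir(dir, 4));	/* 0: "notes" exists */
	dir[3].inode = 0;				/* delete it */
	printf("empty? %d\n", toy_empty_dir(dir, 4));	/* 1 */
	return 0;
}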
*/ + +/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */ +static inline sysv_ino_t * +sv_sb_fic_inode(struct super_block * sb, unsigned int i) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + if (sbi->s_bh1 == sbi->s_bh2) + return &sbi->s_sb_fic_inodes[i]; + else { + /* 512 byte Xenix FS */ + unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]); + if (offset < 512) + return (sysv_ino_t*)(sbi->s_sbd1 + offset); + else + return (sysv_ino_t*)(sbi->s_sbd2 + offset); + } +} + +struct sysv_inode * +sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct sysv_inode *res; + int block = sbi->s_firstinodezone + sbi->s_block_base; + + block += (ino-1) >> sbi->s_inodes_per_block_bits; + *bh = sb_bread(sb, block); + if (!*bh) + return NULL; + res = (struct sysv_inode *)(*bh)->b_data; + return res + ((ino-1) & sbi->s_inodes_per_block_1); +} + +static int refill_free_cache(struct super_block *sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + int i = 0, ino; + + ino = SYSV_ROOT_INO+1; + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto out; + while (ino <= sbi->s_ninodes) { + if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) { + *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino); + if (i == sbi->s_fic_size) + break; + } + if ((ino++ & sbi->s_inodes_per_block_1) == 0) { + brelse(bh); + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto out; + } else + raw_inode++; + } + brelse(bh); +out: + return i; +} + +void sysv_free_inode(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + unsigned int ino; + struct buffer_head * bh; + struct sysv_inode * raw_inode; + unsigned count; + + sb = inode->i_sb; + ino = inode->i_ino; + if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) { + printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n"); + return; + } + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) { + printk("sysv_free_inode: unable to read inode block on device " + "%s\n", inode->i_sb->s_id); + return; + } + lock_super(sb); + count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); + if (count < sbi->s_fic_size) { + *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); + *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); + } + fs16_add(sbi, sbi->s_sb_total_free_inodes, 1); + dirty_sb(sb); + memset(raw_inode, 0, sizeof(struct sysv_inode)); + mark_buffer_dirty(bh); + unlock_super(sb); + brelse(bh); +} + +struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct inode *inode; + sysv_ino_t ino; + unsigned count; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE + }; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + lock_super(sb); + count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); + if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { + count = refill_free_cache(sb); + if (count == 0) { + iput(inode); + unlock_super(sb); + return ERR_PTR(-ENOSPC); + } + } + /* Now count > 0. 
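/*
 * Illustrative aside (not part of the original patch): refill_free_cache()
 * above walks the raw on-disk inode table and remembers up to s_fic_size
 * inode numbers whose i_mode and i_nlink are both zero. A stripped-down
 * userspace model of that refill scan (toy inode table, hypothetical sizes)
 * is:
 */
#include <stdio.h>

#define NINODES  16
#define FIC_SIZE 4

struct toy_raw_inode {
	unsigned short mode;
	unsigned short nlink;
};

static struct toy_raw_inode itable[NINODES + 1];	/* 1-based, like ino */
static unsigned short fic[FIC_SIZE];

static int toy_refill(void)
{
	int count = 0;

	/* start past the reserved inodes, as refill_free_cache() does with SYSV_ROOT_INO+1 */
	for (int ino = 3; ino <= NINODES && count < FIC_SIZE; ino++)
		if (itable[ino].mode == 0 && itable[ino].nlink == 0)
			fic[count++] = (unsigned short)ino;
	return count;
}

int main(void)
{
	for (int ino = 1; ino <= NINODES; ino++)	/* mark everything in use */
		itable[ino] = (struct toy_raw_inode){ 0100644, 1 };
	itable[5] = itable[9] = (struct toy_raw_inode){ 0, 0 };	/* two free slots */

	int n = toy_refill();
	for (int i = 0; i < n; i++)
		printf("free ino %u\n", fic[i]);
	return 0;
}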
*/ + ino = *sv_sb_fic_inode(sb,--count); + *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); + fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); + dirty_sb(sb); + inode_init_owner(inode, dir, mode); + inode->i_ino = fs16_to_cpu(sbi, ino); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_blocks = 0; + memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); + SYSV_I(inode)->i_dir_start_lookup = 0; + insert_inode_hash(inode); + mark_inode_dirty(inode); + + sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ + mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ + /* That's it. */ + unlock_super(sb); + return inode; +} + +unsigned long sysv_count_free_inodes(struct super_block * sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + int ino, count, sb_count; + + lock_super(sb); + + sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); + + if (0) + goto trust_sb; + + /* this causes a lot of disk traffic ... */ + count = 0; + ino = SYSV_ROOT_INO+1; + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto Eio; + while (ino <= sbi->s_ninodes) { + if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) + count++; + if ((ino++ & sbi->s_inodes_per_block_1) == 0) { + brelse(bh); + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) + goto Eio; + } else + raw_inode++; + } + brelse(bh); + if (count != sb_count) + goto Einval; +out: + unlock_super(sb); + return count; + +Einval: + printk("sysv_count_free_inodes: " + "free inode count was %d, correcting to %d\n", + sb_count, count); + if (!(sb->s_flags & MS_RDONLY)) { + *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count); + dirty_sb(sb); + } + goto out; + +Eio: + printk("sysv_count_free_inodes: unable to read inode table\n"); +trust_sb: + count = sb_count; + goto out; +} diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c new file mode 100644 index 00000000..0630eb96 --- /dev/null +++ b/fs/sysv/inode.c @@ -0,0 +1,381 @@ +/* + * linux/fs/sysv/inode.c + * + * minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * xenix/inode.c + * Copyright (C) 1992 Doug Evans + * + * coh/inode.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/inode.c + * Copyright (C) 1993 Paul B. Monday + * + * sysv/inode.c + * Copyright (C) 1993 Bruno Haible + * Copyright (C) 1997, 1998 Krzysztof G. Baranowski + * + * This file contains code for allocating/freeing inodes and for read/writing + * the superblock. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sysv.h" + +static int sysv_sync_fs(struct super_block *sb, int wait) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + unsigned long time = get_seconds(), old_time; + + lock_super(sb); + + /* + * If we are going to write out the super block, + * then attach current time stamp. + * But if the filesystem was marked clean, keep it clean. 
+ */ + sb->s_dirt = 0; + old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); + if (sbi->s_type == FSTYPE_SYSV4) { + if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) + *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38 - time); + *sbi->s_sb_time = cpu_to_fs32(sbi, time); + mark_buffer_dirty(sbi->s_bh2); + } + + unlock_super(sb); + + return 0; +} + +static void sysv_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + sysv_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static int sysv_remount(struct super_block *sb, int *flags, char *data) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + lock_super(sb); + if (sbi->s_forced_ro) + *flags |= MS_RDONLY; + if (*flags & MS_RDONLY) + sysv_write_super(sb); + unlock_super(sb); + return 0; +} + +static void sysv_put_super(struct super_block *sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + if (sb->s_dirt) + sysv_write_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + /* XXX ext2 also updates the state here */ + mark_buffer_dirty(sbi->s_bh1); + if (sbi->s_bh1 != sbi->s_bh2) + mark_buffer_dirty(sbi->s_bh2); + } + + brelse(sbi->s_bh1); + if (sbi->s_bh1 != sbi->s_bh2) + brelse(sbi->s_bh2); + + kfree(sbi); +} + +static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_ndatazones; + buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb); + buf->f_files = sbi->s_ninodes; + buf->f_ffree = sysv_count_free_inodes(sb); + buf->f_namelen = SYSV_NAMELEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +/* + * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32 + */ +static inline void read3byte(struct sysv_sb_info *sbi, + unsigned char * from, unsigned char * to) +{ + if (sbi->s_bytesex == BYTESEX_PDP) { + to[0] = from[0]; + to[1] = 0; + to[2] = from[1]; + to[3] = from[2]; + } else if (sbi->s_bytesex == BYTESEX_LE) { + to[0] = from[0]; + to[1] = from[1]; + to[2] = from[2]; + to[3] = 0; + } else { + to[0] = 0; + to[1] = from[0]; + to[2] = from[1]; + to[3] = from[2]; + } +} + +static inline void write3byte(struct sysv_sb_info *sbi, + unsigned char * from, unsigned char * to) +{ + if (sbi->s_bytesex == BYTESEX_PDP) { + to[0] = from[0]; + to[1] = from[2]; + to[2] = from[3]; + } else if (sbi->s_bytesex == BYTESEX_LE) { + to[0] = from[0]; + to[1] = from[1]; + to[2] = from[2]; + } else { + to[0] = from[1]; + to[1] = from[2]; + to[2] = from[3]; + } +} + +static const struct inode_operations sysv_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = sysv_getattr, +}; + +void sysv_set_inode(struct inode *inode, dev_t rdev) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &sysv_file_inode_operations; + inode->i_fop = &sysv_file_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &sysv_dir_inode_operations; + inode->i_fop = &sysv_dir_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (inode->i_blocks) { + inode->i_op = &sysv_symlink_inode_operations; + inode->i_mapping->a_ops = &sysv_aops; + } else { + inode->i_op = &sysv_fast_symlink_inode_operations; + nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size, + sizeof(SYSV_I(inode)->i_data) - 1); + } + } else + 
init_special_inode(inode, inode->i_mode, rdev); +} + +struct inode *sysv_iget(struct super_block *sb, unsigned int ino) +{ + struct sysv_sb_info * sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + struct sysv_inode_info * si; + struct inode *inode; + unsigned int block; + + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %d is out of range\n", + sb->s_id, ino); + return ERR_PTR(-EIO); + } + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) { + printk("Major problem: unable to read inode from dev %s\n", + inode->i_sb->s_id); + goto bad_inode; + } + /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */ + inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); + inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); + inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); + inode->i_nlink = fs16_to_cpu(sbi, raw_inode->i_nlink); + inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); + inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); + inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); + inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_ctime); + inode->i_ctime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_blocks = 0; + + si = SYSV_I(inode); + for (block = 0; block < 10+1+1+1; block++) + read3byte(sbi, &raw_inode->i_data[3*block], + (u8 *)&si->i_data[block]); + brelse(bh); + si->i_dir_start_lookup = 0; + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + sysv_set_inode(inode, + old_decode_dev(fs32_to_cpu(sbi, si->i_data[0]))); + else + sysv_set_inode(inode, 0); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static int __sysv_write_inode(struct inode *inode, int wait) +{ + struct super_block * sb = inode->i_sb; + struct sysv_sb_info * sbi = SYSV_SB(sb); + struct buffer_head * bh; + struct sysv_inode * raw_inode; + struct sysv_inode_info * si; + unsigned int ino, block; + int err = 0; + + ino = inode->i_ino; + if (!ino || ino > sbi->s_ninodes) { + printk("Bad inode number on dev %s: %d is out of range\n", + inode->i_sb->s_id, ino); + return -EIO; + } + raw_inode = sysv_raw_inode(sb, ino, &bh); + if (!raw_inode) { + printk("unable to read i-node block\n"); + return -EIO; + } + + raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); + raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); + raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); + raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); + raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec); + raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec); + raw_inode->i_ctime = cpu_to_fs32(sbi, inode->i_ctime.tv_sec); + + si = SYSV_I(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev)); + for (block = 0; block < 10+1+1+1; block++) + write3byte(sbi, (u8 *)&si->i_data[block], + &raw_inode->i_data[3*block]); + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk ("IO error syncing sysv inode [%s:%08x]\n", + sb->s_id, ino); + err = -EIO; + } + } + brelse(bh); + return 0; +} + +int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return 
__sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int sysv_sync_inode(struct inode *inode) +{ + return __sysv_write_inode(inode, 1); +} + +static void sysv_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + if (!inode->i_nlink) { + inode->i_size = 0; + sysv_truncate(inode); + } + invalidate_inode_buffers(inode); + end_writeback(inode); + if (!inode->i_nlink) + sysv_free_inode(inode); +} + +static struct kmem_cache *sysv_inode_cachep; + +static struct inode *sysv_alloc_inode(struct super_block *sb) +{ + struct sysv_inode_info *si; + + si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL); + if (!si) + return NULL; + return &si->vfs_inode; +} + +static void sysv_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); +} + +static void sysv_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, sysv_i_callback); +} + +static void init_once(void *p) +{ + struct sysv_inode_info *si = (struct sysv_inode_info *)p; + + inode_init_once(&si->vfs_inode); +} + +const struct super_operations sysv_sops = { + .alloc_inode = sysv_alloc_inode, + .destroy_inode = sysv_destroy_inode, + .write_inode = sysv_write_inode, + .evict_inode = sysv_evict_inode, + .put_super = sysv_put_super, + .write_super = sysv_write_super, + .sync_fs = sysv_sync_fs, + .remount_fs = sysv_remount, + .statfs = sysv_statfs, +}; + +int __init sysv_init_icache(void) +{ + sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", + sizeof(struct sysv_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + init_once); + if (!sysv_inode_cachep) + return -ENOMEM; + return 0; +} + +void sysv_destroy_icache(void) +{ + kmem_cache_destroy(sysv_inode_cachep); +} diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c new file mode 100644 index 00000000..fa8d43c9 --- /dev/null +++ b/fs/sysv/itree.c @@ -0,0 +1,494 @@ +/* + * linux/fs/sysv/itree.c + * + * Handling of indirect blocks' trees. 
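/*
 * Illustrative aside (not part of the original patch): the read3byte() and
 * write3byte() helpers in sysv/inode.c above widen the 3-byte on-disk block
 * pointers into 4-byte in-core values (and back), padding the missing byte
 * according to the byte order of the variant. A minimal sketch of just the
 * little-endian case is:
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t read3byte_le(const unsigned char *from)
{
	/* bytes 0..2 carry the value, the high byte is implicitly zero */
	return (uint32_t)from[0] | ((uint32_t)from[1] << 8) | ((uint32_t)from[2] << 16);
}

static void write3byte_le(uint32_t val, unsigned char *to)
{
	to[0] = val & 0xff;
	to[1] = (val >> 8) & 0xff;
	to[2] = (val >> 16) & 0xff;	/* anything at or above 2^24 is lost */
}

int main(void)
{
	unsigned char disk[3];

	write3byte_le(0x012345, disk);
	printf("round trip: 0x%06x\n", read3byte_le(disk));
	return 0;
}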
+ * AV, Sep--Dec 2000 + */ + +#include +#include +#include +#include "sysv.h" + +enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */ + +static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode) +{ + mark_buffer_dirty_inode(bh, inode); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); +} + +static int block_to_path(struct inode *inode, long block, int offsets[DEPTH]) +{ + struct super_block *sb = inode->i_sb; + struct sysv_sb_info *sbi = SYSV_SB(sb); + int ptrs_bits = sbi->s_ind_per_block_bits; + unsigned long indirect_blocks = sbi->s_ind_per_block, + double_blocks = sbi->s_ind_per_block_2; + int n = 0; + + if (block < 0) { + printk("sysv_block_map: block < 0\n"); + } else if (block < DIRECT) { + offsets[n++] = block; + } else if ( (block -= DIRECT) < indirect_blocks) { + offsets[n++] = DIRECT; + offsets[n++] = block; + } else if ((block -= indirect_blocks) < double_blocks) { + offsets[n++] = DIRECT+1; + offsets[n++] = block >> ptrs_bits; + offsets[n++] = block & (indirect_blocks - 1); + } else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) { + offsets[n++] = DIRECT+2; + offsets[n++] = block >> (ptrs_bits * 2); + offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1); + offsets[n++] = block & (indirect_blocks - 1); + } else { + /* nothing */; + } + return n; +} + +static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr) +{ + return sbi->s_block_base + fs32_to_cpu(sbi, nr); +} + +typedef struct { + sysv_zone_t *p; + sysv_zone_t key; + struct buffer_head *bh; +} Indirect; + +static DEFINE_RWLOCK(pointers_lock); + +static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +static inline sysv_zone_t *block_end(struct buffer_head *bh) +{ + return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); +} + +/* + * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock) + */ +static Indirect *get_branch(struct inode *inode, + int depth, + int offsets[], + Indirect chain[], + int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + int block = block_to_cpu(SYSV_SB(sb), p->key); + bh = sb_bread(sb, block); + if (!bh) + goto failure; + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); + if (!p->key) + goto no_block; + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +static int alloc_branch(struct inode *inode, + int num, + int *offsets, + Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int n = 0; + int i; + + branch[0].key = sysv_new_block(inode->i_sb); + if (branch[0].key) for (n = 1; n < num; n++) { + struct buffer_head *bh; + int parent; + /* Allocate the next block */ + branch[n].key = sysv_new_block(inode->i_sb); + if (!branch[n].key) + break; + /* + * Get buffer_head for parent block, zero it out and set + * the pointer to new one, then send parent to disk. 
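/*
 * Illustrative aside (not part of the original patch): block_to_path() above
 * turns a logical block number into a path of array offsets (direct slot,
 * then single, double or triple indirect) based on how many pointers fit in
 * one block. A standalone version with assumed geometry (10 direct slots,
 * 256 pointers per indirect block) is:
 */
#include <stdio.h>

#define TOY_DIRECT    10
#define TOY_PTRS_BITS 8			/* 256 pointers per indirect block */
#define TOY_PTRS      (1L << TOY_PTRS_BITS)

static int toy_block_to_path(long block, int offsets[4])
{
	int n = 0;

	if (block < 0) {
		return 0;
	} else if (block < TOY_DIRECT) {
		offsets[n++] = (int)block;
	} else if ((block -= TOY_DIRECT) < TOY_PTRS) {
		offsets[n++] = TOY_DIRECT;
		offsets[n++] = (int)block;
	} else if ((block -= TOY_PTRS) < TOY_PTRS * TOY_PTRS) {
		offsets[n++] = TOY_DIRECT + 1;
		offsets[n++] = (int)(block >> TOY_PTRS_BITS);
		offsets[n++] = (int)(block & (TOY_PTRS - 1));
	} else if (((block -= TOY_PTRS * TOY_PTRS) >> (2 * TOY_PTRS_BITS)) < TOY_PTRS) {
		offsets[n++] = TOY_DIRECT + 2;
		offsets[n++] = (int)(block >> (2 * TOY_PTRS_BITS));
		offsets[n++] = (int)((block >> TOY_PTRS_BITS) & (TOY_PTRS - 1));
		offsets[n++] = (int)(block & (TOY_PTRS - 1));
	}
	return n;			/* 0 means out of range */
}

int main(void)
{
	long samples[] = { 3, 10, 300, 70000 };

	for (int i = 0; i < 4; i++) {
		int offsets[4], n = toy_block_to_path(samples[i], offsets);

		printf("block %6ld ->", samples[i]);
		for (int j = 0; j < n; j++)
			printf(" %d", offsets[j]);
		printf("\n");
	}
	return 0;
}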
+ */ + parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key); + bh = sb_getblk(inode->i_sb, parent); + lock_buffer(bh); + memset(bh->b_data, 0, blocksize); + branch[n].bh = bh; + branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + set_buffer_uptodate(bh); + unlock_buffer(bh); + dirty_indirect(bh, inode); + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < n; i++) + sysv_free_block(inode->i_sb, branch[i].key); + return -ENOSPC; +} + +static inline int splice_branch(struct inode *inode, + Indirect chain[], + Indirect *where, + int num) +{ + int i; + + /* Verify that place we are splicing to is still there and vacant */ + write_lock(&pointers_lock); + if (!verify_chain(chain, where-1) || *where->p) + goto changed; + *where->p = where->key; + write_unlock(&pointers_lock); + + inode->i_ctime = CURRENT_TIME_SEC; + + /* had we spliced it onto indirect block? */ + if (where->bh) + dirty_indirect(where->bh, inode); + + if (IS_SYNC(inode)) + sysv_sync_inode(inode); + else + mark_inode_dirty(inode); + return 0; + +changed: + write_unlock(&pointers_lock); + for (i = 1; i < num; i++) + bforget(where[i].bh); + for (i = 0; i < num; i++) + sysv_free_block(inode->i_sb, where[i].key); + return -EAGAIN; +} + +static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int offsets[DEPTH]; + Indirect chain[DEPTH]; + struct super_block *sb = inode->i_sb; + Indirect *partial; + int left; + int depth = block_to_path(inode, iblock, offsets); + + if (depth == 0) + goto out; + +reread: + read_lock(&pointers_lock); + partial = get_branch(inode, depth, offsets, chain, &err); + read_unlock(&pointers_lock); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { +got_it: + map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb), + chain[depth-1].key)); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. + */ + if (err == -EAGAIN) + goto changed; + + left = (chain + depth) - partial; + err = alloc_branch(inode, left, offsets+(partial-chain), partial); + if (err) + goto cleanup; + + if (splice_branch(inode, chain, partial, left) < 0) + goto changed; + + set_buffer_new(bh_result); + goto got_it; + +changed: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + goto reread; +} + +static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +static Indirect *find_shared(struct inode *inode, + int depth, + int offsets[], + Indirect chain[], + sysv_zone_t *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + + write_lock(&pointers_lock); + partial = get_branch(inode, k, offsets, chain, &err); + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. 
+ */ + if (!partial->key && *partial->p) { + write_unlock(&pointers_lock); + goto no_top; + } + for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + *p->p = 0; + } + write_unlock(&pointers_lock); + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q) +{ + for ( ; p < q ; p++) { + sysv_zone_t nr = *p; + if (nr) { + *p = 0; + sysv_free_block(inode->i_sb, nr); + mark_inode_dirty(inode); + } + } +} + +static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth) +{ + struct buffer_head * bh; + struct super_block *sb = inode->i_sb; + + if (depth--) { + for ( ; p < q ; p++) { + int block; + sysv_zone_t nr = *p; + if (!nr) + continue; + *p = 0; + block = block_to_cpu(SYSV_SB(sb), nr); + bh = sb_bread(sb, block); + if (!bh) + continue; + free_branches(inode, (sysv_zone_t*)bh->b_data, + block_end(bh), depth); + bforget(bh); + sysv_free_block(sb, nr); + mark_inode_dirty(inode); + } + } else + free_data(inode, p, q); +} + +void sysv_truncate (struct inode * inode) +{ + sysv_zone_t *i_data = SYSV_I(inode)->i_data; + int offsets[DEPTH]; + Indirect chain[DEPTH]; + Indirect *partial; + sysv_zone_t nr = 0; + int n; + long iblock; + unsigned blocksize; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + + blocksize = inode->i_sb->s_blocksize; + iblock = (inode->i_size + blocksize-1) + >> inode->i_sb->s_blocksize_bits; + + block_truncate_page(inode->i_mapping, inode->i_size, get_block); + + n = block_to_path(inode, iblock, offsets); + if (n == 0) + return; + + if (n == 1) { + free_data(inode, i_data+offsets[0], i_data + DIRECT); + goto do_indirects; + } + + partial = find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (already detached) */ + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + dirty_indirect(partial->bh, inode); + free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + free_branches(inode, partial->p + 1, block_end(partial->bh), + (chain+n-1) - partial); + dirty_indirect(partial->bh, inode); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees (== subtrees deeper than...) 
*/ + while (n < DEPTH) { + nr = i_data[DIRECT + n - 1]; + if (nr) { + i_data[DIRECT + n - 1] = 0; + mark_inode_dirty(inode); + free_branches(inode, &nr, &nr+1, n); + } + n++; + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) + sysv_sync_inode (inode); + else + mark_inode_dirty(inode); +} + +static unsigned sysv_nblocks(struct super_block *s, loff_t size) +{ + struct sysv_sb_info *sbi = SYSV_SB(s); + int ptrs_bits = sbi->s_ind_per_block_bits; + unsigned blocks, res, direct = DIRECT, i = DEPTH; + blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits; + res = blocks; + while (--i && blocks > direct) { + blocks = ((blocks - direct - 1) >> ptrs_bits) + 1; + res += blocks; + direct = 1; + } + return blocks; +} + +int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct super_block *s = mnt->mnt_sb; + generic_fillattr(dentry->d_inode, stat); + stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); + stat->blksize = s->s_blocksize; + return 0; +} + +static int sysv_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,get_block,wbc); +} + +static int sysv_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,get_block); +} + +int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, get_block); +} + +static int sysv_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, get_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t sysv_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,get_block); +} + +const struct address_space_operations sysv_aops = { + .readpage = sysv_readpage, + .writepage = sysv_writepage, + .write_begin = sysv_write_begin, + .write_end = generic_write_end, + .bmap = sysv_bmap +}; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c new file mode 100644 index 00000000..e474fbcf --- /dev/null +++ b/fs/sysv/namei.c @@ -0,0 +1,301 @@ +/* + * linux/fs/sysv/namei.c + * + * minix/namei.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * coh/namei.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/namei.c + * Copyright (C) 1993 Bruno Haible + * Copyright (C) 1997, 1998 Krzysztof G. Baranowski + */ + +#include +#include "sysv.h" + +static int add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = sysv_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static int sysv_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) +{ + /* Truncate the name in place, avoids having to define a compare + function. 
*/ + if (qstr->len > SYSV_NAMELEN) { + qstr->len = SYSV_NAMELEN; + qstr->hash = full_name_hash(qstr->name, qstr->len); + } + return 0; +} + +const struct dentry_operations sysv_dentry_operations = { + .d_hash = sysv_hash, +}; + +static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ + struct inode * inode = NULL; + ino_t ino; + + if (dentry->d_name.len > SYSV_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + ino = sysv_inode_by_name(dentry); + + if (ino) { + inode = sysv_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +static int sysv_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_t rdev) +{ + struct inode * inode; + int err; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + inode = sysv_new_inode(dir, mode); + err = PTR_ERR(inode); + + if (!IS_ERR(inode)) { + sysv_set_inode(inode, rdev); + mark_inode_dirty(inode); + err = add_nondir(dentry, inode); + } + return err; +} + +static int sysv_create(struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) +{ + return sysv_mknod(dir, dentry, mode, 0); +} + +static int sysv_symlink(struct inode * dir, struct dentry * dentry, + const char * symname) +{ + int err = -ENAMETOOLONG; + int l = strlen(symname)+1; + struct inode * inode; + + if (l > dir->i_sb->s_blocksize) + goto out; + + inode = sysv_new_inode(dir, S_IFLNK|0777); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + sysv_set_inode(inode, 0); + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + mark_inode_dirty(inode); + err = add_nondir(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int sysv_link(struct dentry * old_dentry, struct inode * dir, + struct dentry * dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= SYSV_SB(inode->i_sb)->s_link_max) + return -EMLINK; + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + + return add_nondir(dentry, inode); +} + +static int sysv_mkdir(struct inode * dir, struct dentry *dentry, int mode) +{ + struct inode * inode; + int err = -EMLINK; + + if (dir->i_nlink >= SYSV_SB(dir->i_sb)->s_link_max) + goto out; + inode_inc_link_count(dir); + + inode = sysv_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + sysv_set_inode(inode, 0); + + inode_inc_link_count(inode); + + err = sysv_make_empty(inode, dir); + if (err) + goto out_fail; + + err = sysv_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int sysv_unlink(struct inode * dir, struct dentry * dentry) +{ + struct inode * inode = dentry->d_inode; + struct page * page; + struct sysv_dir_entry * de; + int err = -ENOENT; + + de = sysv_find_entry(dentry, &page); + if (!de) + goto out; + + err = sysv_delete_entry (de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); +out: + return err; +} + +static int sysv_rmdir(struct inode * dir, struct dentry * dentry) +{ + struct inode *inode = dentry->d_inode; + int err = -ENOTEMPTY; + + if (sysv_empty_dir(inode)) { + err = sysv_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + return err; 
+} + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, + struct inode * new_dir, struct dentry * new_dentry) +{ + struct inode * old_inode = old_dentry->d_inode; + struct inode * new_inode = new_dentry->d_inode; + struct page * dir_page = NULL; + struct sysv_dir_entry * dir_de = NULL; + struct page * old_page; + struct sysv_dir_entry * old_de; + int err = -ENOENT; + + old_de = sysv_find_entry(old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = sysv_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page * new_page; + struct sysv_dir_entry * new_de; + + err = -ENOTEMPTY; + if (dir_de && !sysv_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = sysv_find_entry(new_dentry, &new_page); + if (!new_de) + goto out_dir; + sysv_set_link(new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) + goto out_dir; + } + err = sysv_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + sysv_delete_entry(old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + sysv_set_link(dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + return 0; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations sysv_dir_inode_operations = { + .create = sysv_create, + .lookup = sysv_lookup, + .link = sysv_link, + .unlink = sysv_unlink, + .symlink = sysv_symlink, + .mkdir = sysv_mkdir, + .rmdir = sysv_rmdir, + .mknod = sysv_mknod, + .rename = sysv_rename, + .getattr = sysv_getattr, +}; diff --git a/fs/sysv/super.c b/fs/sysv/super.c new file mode 100644 index 00000000..f60c1969 --- /dev/null +++ b/fs/sysv/super.c @@ -0,0 +1,590 @@ +/* + * linux/fs/sysv/inode.c + * + * minix/inode.c + * Copyright (C) 1991, 1992 Linus Torvalds + * + * xenix/inode.c + * Copyright (C) 1992 Doug Evans + * + * coh/inode.c + * Copyright (C) 1993 Pascal Haible, Bruno Haible + * + * sysv/inode.c + * Copyright (C) 1993 Paul B. Monday + * + * sysv/inode.c + * Copyright (C) 1993 Bruno Haible + * Copyright (C) 1997, 1998 Krzysztof G. Baranowski + * + * This file contains code for read/parsing the superblock. + */ + +#include +#include +#include +#include +#include "sysv.h" + +/* + * The following functions try to recognize specific filesystems. + * + * We recognize: + * - Xenix FS by its magic number. + * - SystemV FS by its magic number. + * - Coherent FS by its funny fname/fpack field. + * - SCO AFS by s_nfree == 0xffff + * - V7 FS has no distinguishing features. + * + * We discriminate among SystemV4 and SystemV2 FS by the assumption that + * the time stamp is not < 01-01-1980. 
+ */ + +enum { + JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60 +}; + +static void detected_xenix(struct sysv_sb_info *sbi) +{ + struct buffer_head *bh1 = sbi->s_bh1; + struct buffer_head *bh2 = sbi->s_bh2; + struct xenix_super_block * sbd1; + struct xenix_super_block * sbd2; + + if (bh1 != bh2) + sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data; + else { + /* block size = 512, so bh1 != bh2 */ + sbd1 = (struct xenix_super_block *) bh1->b_data; + sbd2 = (struct xenix_super_block *) (bh2->b_data - 512); + } + + sbi->s_link_max = XENIX_LINK_MAX; + sbi->s_fic_size = XENIX_NICINOD; + sbi->s_flc_size = XENIX_NICFREE; + sbi->s_sbd1 = (char *)sbd1; + sbi->s_sbd2 = (char *)sbd2; + sbi->s_sb_fic_count = &sbd1->s_ninode; + sbi->s_sb_fic_inodes = &sbd1->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd2->s_tinode; + sbi->s_bcache_count = &sbd1->s_nfree; + sbi->s_bcache = &sbd1->s_free[0]; + sbi->s_free_blocks = &sbd2->s_tfree; + sbi->s_sb_time = &sbd2->s_time; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize); +} + +static void detected_sysv4(struct sysv_sb_info *sbi) +{ + struct sysv4_super_block * sbd; + struct buffer_head *bh1 = sbi->s_bh1; + struct buffer_head *bh2 = sbi->s_bh2; + + if (bh1 == bh2) + sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2); + else + sbd = (struct sysv4_super_block *) bh2->b_data; + + sbi->s_link_max = SYSV_LINK_MAX; + sbi->s_fic_size = SYSV_NICINOD; + sbi->s_flc_size = SYSV_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_sb_state = &sbd->s_state; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static void detected_sysv2(struct sysv_sb_info *sbi) +{ + struct sysv2_super_block *sbd; + struct buffer_head *bh1 = sbi->s_bh1; + struct buffer_head *bh2 = sbi->s_bh2; + + if (bh1 == bh2) + sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2); + else + sbd = (struct sysv2_super_block *) bh2->b_data; + + sbi->s_link_max = SYSV_LINK_MAX; + sbi->s_fic_size = SYSV_NICINOD; + sbi->s_flc_size = SYSV_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_sb_state = &sbd->s_state; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static void detected_coherent(struct sysv_sb_info *sbi) +{ + struct coh_super_block * sbd; + struct buffer_head *bh1 = sbi->s_bh1; + + sbd = (struct coh_super_block *) bh1->b_data; + + sbi->s_link_max = COH_LINK_MAX; + sbi->s_fic_size = COH_NICINOD; + sbi->s_flc_size = COH_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + 
sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static void detected_v7(struct sysv_sb_info *sbi) +{ + struct buffer_head *bh2 = sbi->s_bh2; + struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data; + + sbi->s_link_max = V7_LINK_MAX; + sbi->s_fic_size = V7_NICINOD; + sbi->s_flc_size = V7_NICFREE; + sbi->s_sbd1 = (char *)sbd; + sbi->s_sbd2 = (char *)sbd; + sbi->s_sb_fic_count = &sbd->s_ninode; + sbi->s_sb_fic_inodes = &sbd->s_inode[0]; + sbi->s_sb_total_free_inodes = &sbd->s_tinode; + sbi->s_bcache_count = &sbd->s_nfree; + sbi->s_bcache = &sbd->s_free[0]; + sbi->s_free_blocks = &sbd->s_tfree; + sbi->s_sb_time = &sbd->s_time; + sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); + sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); +} + +static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data; + if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544)) + sbi->s_bytesex = BYTESEX_LE; + else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544)) + sbi->s_bytesex = BYTESEX_BE; + else + return 0; + switch (fs32_to_cpu(sbi, sbd->s_type)) { + case 1: + sbi->s_type = FSTYPE_XENIX; + return 1; + case 2: + sbi->s_type = FSTYPE_XENIX; + return 2; + default: + return 0; + } +} + +static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + struct super_block *sb = sbi->s_sb; + /* All relevant fields are at the same offsets in R2 and R4 */ + struct sysv4_super_block * sbd; + u32 type; + + sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2); + if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20)) + sbi->s_bytesex = BYTESEX_LE; + else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20)) + sbi->s_bytesex = BYTESEX_BE; + else + return 0; + + type = fs32_to_cpu(sbi, sbd->s_type); + + if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) { + sbi->s_type = FSTYPE_AFS; + sbi->s_forced_ro = 1; + if (!(sb->s_flags & MS_RDONLY)) { + printk("SysV FS: SCO EAFS on %s detected, " + "forcing read-only mode.\n", + sb->s_id); + } + return type; + } + + if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) { + /* this is likely to happen on SystemV2 FS */ + if (type > 3 || type < 1) + return 0; + sbi->s_type = FSTYPE_SYSV2; + return type; + } + if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10)) + return 0; + + /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10, + 0x20 or 0x30 indicates that symbolic links and the 14-character + filename limit is gone. Due to lack of information about this + feature read-only mode seems to be a reasonable approach... -KGB */ + + if (type >= 0x10) { + printk("SysV FS: can't handle long file names on %s, " + "forcing read-only mode.\n", sb->s_id); + sbi->s_forced_ro = 1; + } + + sbi->s_type = FSTYPE_SYSV4; + return type >= 0x10 ? type >> 4 : type; +} + +static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + struct coh_super_block * sbd; + + sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2); + if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6)) + || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6))) + return 0; + sbi->s_bytesex = BYTESEX_PDP; + sbi->s_type = FSTYPE_COH; + return 1; +} + +static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh) +{ + int size = detect_sysv(sbi, bh); + + return size>2 ? 
0 : size; +} + +static struct { + int block; + int (*test)(struct sysv_sb_info *, struct buffer_head *); +} flavours[] = { + {1, detect_xenix}, + {0, detect_sysv}, + {0, detect_coherent}, + {9, detect_sysv_odd}, + {15,detect_sysv_odd}, + {18,detect_sysv}, +}; + +static char *flavour_names[] = { + [FSTYPE_XENIX] = "Xenix", + [FSTYPE_SYSV4] = "SystemV", + [FSTYPE_SYSV2] = "SystemV Release 2", + [FSTYPE_COH] = "Coherent", + [FSTYPE_V7] = "V7", + [FSTYPE_AFS] = "AFS", +}; + +static void (*flavour_setup[])(struct sysv_sb_info *) = { + [FSTYPE_XENIX] = detected_xenix, + [FSTYPE_SYSV4] = detected_sysv4, + [FSTYPE_SYSV2] = detected_sysv2, + [FSTYPE_COH] = detected_coherent, + [FSTYPE_V7] = detected_v7, + [FSTYPE_AFS] = detected_sysv4, +}; + +static int complete_read_super(struct super_block *sb, int silent, int size) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + struct inode *root_inode; + char *found = flavour_names[sbi->s_type]; + u_char n_bits = size+8; + int bsize = 1 << n_bits; + int bsize_4 = bsize >> 2; + + sbi->s_firstinodezone = 2; + + flavour_setup[sbi->s_type](sbi); + + sbi->s_truncate = 1; + sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; + sbi->s_inodes_per_block = bsize >> 6; + sbi->s_inodes_per_block_1 = (bsize >> 6)-1; + sbi->s_inodes_per_block_bits = n_bits-6; + sbi->s_ind_per_block = bsize_4; + sbi->s_ind_per_block_2 = bsize_4*bsize_4; + sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4)); + sbi->s_ind_per_block_bits = n_bits-2; + + sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone) + << sbi->s_inodes_per_block_bits; + + if (!silent) + printk("VFS: Found a %s FS (block size = %ld) on device %s\n", + found, sb->s_blocksize, sb->s_id); + + sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; + /* set up enough so that it can read an inode */ + sb->s_op = &sysv_sops; + if (sbi->s_forced_ro) + sb->s_flags |= MS_RDONLY; + if (sbi->s_truncate) + sb->s_d_op = &sysv_dentry_operations; + root_inode = sysv_iget(sb, SYSV_ROOT_INO); + if (IS_ERR(root_inode)) { + printk("SysV FS: get root inode failed\n"); + return 0; + } + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) { + iput(root_inode); + printk("SysV FS: get root dentry failed\n"); + return 0; + } + return 1; +} + +static int sysv_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh1, *bh = NULL; + struct sysv_sb_info *sbi; + unsigned long blocknr; + int size = 0, i; + + BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block)); + BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block)); + BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block)); + BUILD_BUG_ON(500 != sizeof (struct coh_super_block)); + BUILD_BUG_ON(64 != sizeof (struct sysv_inode)); + + sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_sb = sb; + sbi->s_block_base = 0; + sb->s_fs_info = sbi; + + sb_set_blocksize(sb, BLOCK_SIZE); + + for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) { + brelse(bh); + bh = sb_bread(sb, flavours[i].block); + if (!bh) + continue; + size = flavours[i].test(SYSV_SB(sb), bh); + } + + if (!size) + goto Eunknown; + + switch (size) { + case 1: + blocknr = bh->b_blocknr << 1; + brelse(bh); + sb_set_blocksize(sb, 512); + bh1 = sb_bread(sb, blocknr); + bh = sb_bread(sb, blocknr + 1); + break; + case 2: + bh1 = bh; + break; + case 3: + blocknr = bh->b_blocknr >> 1; + brelse(bh); + sb_set_blocksize(sb, 2048); + bh1 = bh = sb_bread(sb, blocknr); + break; + default: + goto Ebadsize; + } + + if (bh && bh1) { + sbi->s_bh1 = bh1; + 
sbi->s_bh2 = bh; + if (complete_read_super(sb, silent, size)) + return 0; + } + + brelse(bh1); + brelse(bh); + sb_set_blocksize(sb, BLOCK_SIZE); + printk("oldfs: cannot read superblock\n"); +failed: + kfree(sbi); + return -EINVAL; + +Eunknown: + brelse(bh); + if (!silent) + printk("VFS: unable to find oldfs superblock on device %s\n", + sb->s_id); + goto failed; +Ebadsize: + brelse(bh); + if (!silent) + printk("VFS: oldfs: unsupported block size (%dKb)\n", + 1<<(size-2)); + goto failed; +} + +static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh) +{ + struct v7_super_block *v7sb; + struct sysv_inode *v7i; + struct buffer_head *bh2; + struct sysv_sb_info *sbi; + + sbi = sb->s_fs_info; + + /* plausibility check on superblock */ + v7sb = (struct v7_super_block *) bh->b_data; + if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || + fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || + fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE) + return 0; + + /* plausibility check on root inode: it is a directory, + with a nonzero size that is a multiple of 16 */ + bh2 = sb_bread(sb, 2); + if (bh2 == NULL) + return 0; + + v7i = (struct sysv_inode *)(bh2->b_data + 64); + if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR || + (fs32_to_cpu(sbi, v7i->i_size) == 0) || + (fs32_to_cpu(sbi, v7i->i_size) & 017) || + (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES * + sizeof(struct sysv_dir_entry))) { + brelse(bh2); + return 0; + } + + brelse(bh2); + return 1; +} + +static int v7_fill_super(struct super_block *sb, void *data, int silent) +{ + struct sysv_sb_info *sbi; + struct buffer_head *bh; + + if (440 != sizeof (struct v7_super_block)) + panic("V7 FS: bad super-block size"); + if (64 != sizeof (struct sysv_inode)) + panic("sysv fs: bad i-node size"); + + sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_sb = sb; + sbi->s_block_base = 0; + sbi->s_type = FSTYPE_V7; + sb->s_fs_info = sbi; + + sb_set_blocksize(sb, 512); + + if ((bh = sb_bread(sb, 1)) == NULL) { + if (!silent) + printk("VFS: unable to read V7 FS superblock on " + "device %s.\n", sb->s_id); + goto failed; + } + + /* Try PDP-11 UNIX */ + sbi->s_bytesex = BYTESEX_PDP; + if (v7_sanity_check(sb, bh)) + goto detected; + + /* Try PC/IX, v7/x86 */ + sbi->s_bytesex = BYTESEX_LE; + if (v7_sanity_check(sb, bh)) + goto detected; + + goto failed; + +detected: + sbi->s_bh1 = bh; + sbi->s_bh2 = bh; + if (complete_read_super(sb, silent, 1)) + return 0; + +failed: + printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n", + sb->s_id); + brelse(bh); + kfree(sbi); + return -EINVAL; +} + +/* Every kernel module contains stuff like this. 
*/ + +static struct dentry *sysv_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); +} + +static struct dentry *v7_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); +} + +static struct file_system_type sysv_fs_type = { + .owner = THIS_MODULE, + .name = "sysv", + .mount = sysv_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static struct file_system_type v7_fs_type = { + .owner = THIS_MODULE, + .name = "v7", + .mount = v7_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_sysv_fs(void) +{ + int error; + + error = sysv_init_icache(); + if (error) + goto out; + error = register_filesystem(&sysv_fs_type); + if (error) + goto destroy_icache; + error = register_filesystem(&v7_fs_type); + if (error) + goto unregister; + return 0; + +unregister: + unregister_filesystem(&sysv_fs_type); +destroy_icache: + sysv_destroy_icache(); +out: + return error; +} + +static void __exit exit_sysv_fs(void) +{ + unregister_filesystem(&sysv_fs_type); + unregister_filesystem(&v7_fs_type); + sysv_destroy_icache(); +} + +module_init(init_sysv_fs) +module_exit(exit_sysv_fs) +MODULE_ALIAS("v7"); +MODULE_LICENSE("GPL"); diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c new file mode 100644 index 00000000..00d2f8a4 --- /dev/null +++ b/fs/sysv/symlink.c @@ -0,0 +1,20 @@ +/* + * linux/fs/sysv/symlink.c + * + * Handling of System V filesystem fast symlinks extensions. + * Aug 2001, Christoph Hellwig (hch@infradead.org) + */ + +#include "sysv.h" +#include + +static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, (char *)SYSV_I(dentry->d_inode)->i_data); + return NULL; +} + +const struct inode_operations sysv_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = sysv_follow_link, +}; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h new file mode 100644 index 00000000..bb55cdb3 --- /dev/null +++ b/fs/sysv/sysv.h @@ -0,0 +1,248 @@ +#ifndef _SYSV_H +#define _SYSV_H + +#include + +typedef __u16 __bitwise __fs16; +typedef __u32 __bitwise __fs32; + +#include + +/* + * SystemV/V7/Coherent super-block data in memory + * + * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified + * while the system is running). This is in contrast to the Minix and Berkeley + * filesystems (where the superblock is never modified). This affects the + * sync() operation: we must keep the superblock in a disk buffer and use this + * one as our "working copy". 
+ */ + +struct sysv_sb_info { + struct super_block *s_sb; /* VFS superblock */ + int s_type; /* file system type: FSTYPE_{XENIX|SYSV|COH} */ + char s_bytesex; /* bytesex (le/be/pdp) */ + char s_truncate; /* if 1: names > SYSV_NAMELEN chars are truncated */ + /* if 0: they are disallowed (ENAMETOOLONG) */ + nlink_t s_link_max; /* max number of hard links to a file */ + unsigned int s_inodes_per_block; /* number of inodes per block */ + unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */ + unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */ + unsigned int s_ind_per_block; /* number of indirections per block */ + unsigned int s_ind_per_block_bits; /* log2(ind_per_block) */ + unsigned int s_ind_per_block_2; /* ind_per_block ^ 2 */ + unsigned int s_toobig_block; /* 10 + ipb + ipb^2 + ipb^3 */ + unsigned int s_block_base; /* physical block number of block 0 */ + unsigned short s_fic_size; /* free inode cache size, NICINOD */ + unsigned short s_flc_size; /* free block list chunk size, NICFREE */ + /* The superblock is kept in one or two disk buffers: */ + struct buffer_head *s_bh1; + struct buffer_head *s_bh2; + /* These are pointers into the disk buffer, to compensate for + different superblock layout. */ + char * s_sbd1; /* entire superblock data, for part 1 */ + char * s_sbd2; /* entire superblock data, for part 2 */ + __fs16 *s_sb_fic_count; /* pointer to s_sbd->s_ninode */ + sysv_ino_t *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */ + __fs16 *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */ + __fs16 *s_bcache_count; /* pointer to s_sbd->s_nfree */ + sysv_zone_t *s_bcache; /* pointer to s_sbd->s_free */ + __fs32 *s_free_blocks; /* pointer to s_sbd->s_tfree */ + __fs32 *s_sb_time; /* pointer to s_sbd->s_time */ + __fs32 *s_sb_state; /* pointer to s_sbd->s_state, only FSTYPE_SYSV */ + /* We keep those superblock entities that don't change here; + this saves us an indirection and perhaps a conversion. */ + u32 s_firstinodezone; /* index of first inode zone */ + u32 s_firstdatazone; /* same as s_sbd->s_isize */ + u32 s_ninodes; /* total number of inodes */ + u32 s_ndatazones; /* total number of data zones */ + u32 s_nzones; /* same as s_sbd->s_fsize */ + u16 s_namelen; /* max length of dir entry */ + int s_forced_ro; +}; + +/* + * SystemV/V7/Coherent FS inode data in memory + */ +struct sysv_inode_info { + __fs32 i_data[13]; + u32 i_dir_start_lookup; + struct inode vfs_inode; +}; + + +static inline struct sysv_inode_info *SYSV_I(struct inode *inode) +{ + return list_entry(inode, struct sysv_inode_info, vfs_inode); +} + +static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + + +/* identify the FS in memory */ +enum { + FSTYPE_NONE = 0, + FSTYPE_XENIX, + FSTYPE_SYSV4, + FSTYPE_SYSV2, + FSTYPE_COH, + FSTYPE_V7, + FSTYPE_AFS, + FSTYPE_END, +}; + +#define SYSV_MAGIC_BASE 0x012FF7B3 + +#define XENIX_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_XENIX) +#define SYSV4_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV4) +#define SYSV2_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV2) +#define COH_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_COH) + + +/* Admissible values for i_nlink: 0.._LINK_MAX */ +enum { + XENIX_LINK_MAX = 126, /* ?? */ + SYSV_LINK_MAX = 126, /* 127? 251? */ + V7_LINK_MAX = 126, /* ?? 
*/ + COH_LINK_MAX = 10000, +}; + + +static inline void dirty_sb(struct super_block *sb) +{ + struct sysv_sb_info *sbi = SYSV_SB(sb); + + mark_buffer_dirty(sbi->s_bh1); + if (sbi->s_bh1 != sbi->s_bh2) + mark_buffer_dirty(sbi->s_bh2); + sb->s_dirt = 1; +} + + +/* ialloc.c */ +extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned, + struct buffer_head **); +extern struct inode * sysv_new_inode(const struct inode *, mode_t); +extern void sysv_free_inode(struct inode *); +extern unsigned long sysv_count_free_inodes(struct super_block *); + +/* balloc.c */ +extern sysv_zone_t sysv_new_block(struct super_block *); +extern void sysv_free_block(struct super_block *, sysv_zone_t); +extern unsigned long sysv_count_free_blocks(struct super_block *); + +/* itree.c */ +extern void sysv_truncate(struct inode *); +extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len); + +/* inode.c */ +extern struct inode *sysv_iget(struct super_block *, unsigned int); +extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); +extern int sysv_sync_inode(struct inode *); +extern void sysv_set_inode(struct inode *, dev_t); +extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int sysv_init_icache(void); +extern void sysv_destroy_icache(void); + + +/* dir.c */ +extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **); +extern int sysv_add_link(struct dentry *, struct inode *); +extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *); +extern int sysv_make_empty(struct inode *, struct inode *); +extern int sysv_empty_dir(struct inode *); +extern void sysv_set_link(struct sysv_dir_entry *, struct page *, + struct inode *); +extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); +extern ino_t sysv_inode_by_name(struct dentry *); + + +extern const struct inode_operations sysv_file_inode_operations; +extern const struct inode_operations sysv_dir_inode_operations; +extern const struct inode_operations sysv_fast_symlink_inode_operations; +extern const struct file_operations sysv_file_operations; +extern const struct file_operations sysv_dir_operations; +extern const struct address_space_operations sysv_aops; +extern const struct super_operations sysv_sops; +extern const struct dentry_operations sysv_dentry_operations; + + +enum { + BYTESEX_LE, + BYTESEX_PDP, + BYTESEX_BE, +}; + +static inline u32 PDP_swab(u32 x) +{ +#ifdef __LITTLE_ENDIAN + return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16); +#else +#ifdef __BIG_ENDIAN + return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8); +#else +#error BYTESEX +#endif +#endif +} + +static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n) +{ + if (sbi->s_bytesex == BYTESEX_PDP) + return PDP_swab((__force __u32)n); + else if (sbi->s_bytesex == BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n) +{ + if (sbi->s_bytesex == BYTESEX_PDP) + return (__force __fs32)PDP_swab(n); + else if (sbi->s_bytesex == BYTESEX_LE) + return (__force __fs32)cpu_to_le32(n); + else + return (__force __fs32)cpu_to_be32(n); +} + +static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d) +{ + if (sbi->s_bytesex == BYTESEX_PDP) + *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d); + else if (sbi->s_bytesex == BYTESEX_LE) + le32_add_cpu((__le32 *)n, d); + else + be32_add_cpu((__be32 *)n, d); + return *n; +} + +static inline __u16 
fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n) +{ + if (sbi->s_bytesex != BYTESEX_BE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n) +{ + if (sbi->s_bytesex != BYTESEX_BE) + return (__force __fs16)cpu_to_le16(n); + else + return (__force __fs16)cpu_to_be16(n); +} + +static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d) +{ + if (sbi->s_bytesex != BYTESEX_BE) + le16_add_cpu((__le16 *)n, d); + else + be16_add_cpu((__be16 *)n, d); + return *n; +} + +#endif /* _SYSV_H */ diff --git a/fs/timerfd.c b/fs/timerfd.c new file mode 100644 index 00000000..dffeb379 --- /dev/null +++ b/fs/timerfd.c @@ -0,0 +1,369 @@ +/* + * fs/timerfd.c + * + * Copyright (C) 2007 Davide Libenzi + * + * + * Thanks to Thomas Gleixner for code reviews and useful comments. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct timerfd_ctx { + struct hrtimer tmr; + ktime_t tintv; + ktime_t moffs; + wait_queue_head_t wqh; + u64 ticks; + int expired; + int clockid; + struct rcu_head rcu; + struct list_head clist; + bool might_cancel; +}; + +static LIST_HEAD(cancel_list); +static DEFINE_SPINLOCK(cancel_lock); + +/* + * This gets called when the timer event triggers. We set the "expired" + * flag, but we do not re-arm the timer (in case it's necessary, + * tintv.tv64 != 0) until the timer is accessed. + */ +static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) +{ + struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr); + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + ctx->expired = 1; + ctx->ticks++; + wake_up_locked(&ctx->wqh); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return HRTIMER_NORESTART; +} + +/* + * Called when the clock was set to cancel the timers in the cancel + * list. This will wake up processes waiting on these timers. The + * wake-up requires ctx->ticks to be non zero, therefore we increment + * it before calling wake_up_locked(). 
+ */ +void timerfd_clock_was_set(void) +{ + ktime_t moffs = ktime_get_monotonic_offset(); + struct timerfd_ctx *ctx; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &cancel_list, clist) { + if (!ctx->might_cancel) + continue; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->moffs.tv64 != moffs.tv64) { + ctx->moffs.tv64 = KTIME_MAX; + ctx->ticks++; + wake_up_locked(&ctx->wqh); + } + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + } + rcu_read_unlock(); +} + +static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +{ + if (ctx->might_cancel) { + ctx->might_cancel = false; + spin_lock(&cancel_lock); + list_del_rcu(&ctx->clist); + spin_unlock(&cancel_lock); + } +} + +static bool timerfd_canceled(struct timerfd_ctx *ctx) +{ + if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) + return false; + ctx->moffs = ktime_get_monotonic_offset(); + return true; +} + +static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) +{ + if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) && + (flags & TFD_TIMER_CANCEL_ON_SET)) { + if (!ctx->might_cancel) { + ctx->might_cancel = true; + spin_lock(&cancel_lock); + list_add_rcu(&ctx->clist, &cancel_list); + spin_unlock(&cancel_lock); + } + } else if (ctx->might_cancel) { + timerfd_remove_cancel(ctx); + } +} + +static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) +{ + ktime_t remaining; + + remaining = hrtimer_expires_remaining(&ctx->tmr); + return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; +} + +static int timerfd_setup(struct timerfd_ctx *ctx, int flags, + const struct itimerspec *ktmr) +{ + enum hrtimer_mode htmode; + ktime_t texp; + int clockid = ctx->clockid; + + htmode = (flags & TFD_TIMER_ABSTIME) ? + HRTIMER_MODE_ABS: HRTIMER_MODE_REL; + + texp = timespec_to_ktime(ktmr->it_value); + ctx->expired = 0; + ctx->ticks = 0; + ctx->tintv = timespec_to_ktime(ktmr->it_interval); + hrtimer_init(&ctx->tmr, clockid, htmode); + hrtimer_set_expires(&ctx->tmr, texp); + ctx->tmr.function = timerfd_tmrproc; + if (texp.tv64 != 0) { + hrtimer_start(&ctx->tmr, texp, htmode); + if (timerfd_canceled(ctx)) + return -ECANCELED; + } + return 0; +} + +static int timerfd_release(struct inode *inode, struct file *file) +{ + struct timerfd_ctx *ctx = file->private_data; + + timerfd_remove_cancel(ctx); + hrtimer_cancel(&ctx->tmr); + kfree_rcu(ctx, rcu); + return 0; +} + +static unsigned int timerfd_poll(struct file *file, poll_table *wait) +{ + struct timerfd_ctx *ctx = file->private_data; + unsigned int events = 0; + unsigned long flags; + + poll_wait(file, &ctx->wqh, wait); + + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->ticks) + events |= POLLIN; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return events; +} + +static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct timerfd_ctx *ctx = file->private_data; + ssize_t res; + u64 ticks = 0; + + if (count < sizeof(ticks)) + return -EINVAL; + spin_lock_irq(&ctx->wqh.lock); + if (file->f_flags & O_NONBLOCK) + res = -EAGAIN; + else + res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); + + /* + * If clock has changed, we do not care about the + * ticks and we do not rearm the timer. Userspace must + * reevaluate anyway. + */ + if (timerfd_canceled(ctx)) { + ctx->ticks = 0; + ctx->expired = 0; + res = -ECANCELED; + } + + if (ctx->ticks) { + ticks = ctx->ticks; + + if (ctx->expired && ctx->tintv.tv64) { + /* + * If tintv.tv64 != 0, this is a periodic timer that + * needs to be re-armed. 
We avoid doing it in the timer + * callback to avoid DoS attacks specifying a very + * short timer period. + */ + ticks += hrtimer_forward_now(&ctx->tmr, + ctx->tintv) - 1; + hrtimer_restart(&ctx->tmr); + } + ctx->expired = 0; + ctx->ticks = 0; + } + spin_unlock_irq(&ctx->wqh.lock); + if (ticks) + res = put_user(ticks, (u64 __user *) buf) ? -EFAULT: sizeof(ticks); + return res; +} + +static const struct file_operations timerfd_fops = { + .release = timerfd_release, + .poll = timerfd_poll, + .read = timerfd_read, + .llseek = noop_llseek, +}; + +static struct file *timerfd_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + if (file->f_op != &timerfd_fops) { + fput(file); + return ERR_PTR(-EINVAL); + } + + return file; +} + +SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) +{ + int ufd; + struct timerfd_ctx *ctx; + + /* Check the TFD_* constants for consistency. */ + BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); + + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME)) + return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + init_waitqueue_head(&ctx->wqh); + ctx->clockid = clockid; + hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + ctx->moffs = ktime_get_monotonic_offset(); + + ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); + if (ufd < 0) + kfree(ctx); + + return ufd; +} + +SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, + const struct itimerspec __user *, utmr, + struct itimerspec __user *, otmr) +{ + struct file *file; + struct timerfd_ctx *ctx; + struct itimerspec ktmr, kotmr; + int ret; + + if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) + return -EFAULT; + + if ((flags & ~TFD_SETTIME_FLAGS) || + !timespec_valid(&ktmr.it_value) || + !timespec_valid(&ktmr.it_interval)) + return -EINVAL; + + file = timerfd_fget(ufd); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + timerfd_setup_cancel(ctx, flags); + + /* + * We need to stop the existing timer before reprogramming + * it to the new values. + */ + for (;;) { + spin_lock_irq(&ctx->wqh.lock); + if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) + break; + spin_unlock_irq(&ctx->wqh.lock); + cpu_relax(); + } + + /* + * If the timer is expired and it's periodic, we need to advance it + * because the caller may want to know the previous expiration time. + * We do not update "ticks" and "expired" since the timer will be + * re-programmed again in the following timerfd_setup() call. + */ + if (ctx->expired && ctx->tintv.tv64) + hrtimer_forward_now(&ctx->tmr, ctx->tintv); + + kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + kotmr.it_interval = ktime_to_timespec(ctx->tintv); + + /* + * Re-program the timer to the new value ... 
+ */ + ret = timerfd_setup(ctx, flags, &ktmr); + + spin_unlock_irq(&ctx->wqh.lock); + fput(file); + if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) + return -EFAULT; + + return ret; +} + +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +{ + struct file *file; + struct timerfd_ctx *ctx; + struct itimerspec kotmr; + + file = timerfd_fget(ufd); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + spin_lock_irq(&ctx->wqh.lock); + if (ctx->expired && ctx->tintv.tv64) { + ctx->expired = 0; + ctx->ticks += + hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; + hrtimer_restart(&ctx->tmr); + } + kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + kotmr.it_interval = ktime_to_timespec(ctx->tintv); + spin_unlock_irq(&ctx->wqh.lock); + fput(file); + + return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; +} + diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig new file mode 100644 index 00000000..f8b0160d --- /dev/null +++ b/fs/ubifs/Kconfig @@ -0,0 +1,60 @@ +config UBIFS_FS + tristate "UBIFS file system support" + select CRC16 + select CRC32 + select CRYPTO if UBIFS_FS_ADVANCED_COMPR + select CRYPTO if UBIFS_FS_LZO + select CRYPTO if UBIFS_FS_ZLIB + select CRYPTO_LZO if UBIFS_FS_LZO + select CRYPTO_DEFLATE if UBIFS_FS_ZLIB + depends on MTD_UBI + help + UBIFS is a file system for flash devices which works on top of UBI. + +config UBIFS_FS_XATTR + bool "Extended attributes support" + depends on UBIFS_FS + help + This option enables support of extended attributes. + +config UBIFS_FS_ADVANCED_COMPR + bool "Advanced compression options" + depends on UBIFS_FS + help + This option allows to explicitly choose which compressions, if any, + are enabled in UBIFS. Removing compressors means inability to read + existing file systems. + + If unsure, say 'N'. + +config UBIFS_FS_LZO + bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + LZO compressor is generally faster than zlib but compresses worse. + Say 'Y' if unsure. + +config UBIFS_FS_ZLIB + bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. + +# Debugging-related stuff +config UBIFS_FS_DEBUG + bool "Enable debugging support" + depends on UBIFS_FS + select DEBUG_FS + select KALLSYMS + help + This option enables UBIFS debugging support. It makes sure various + assertions, self-checks, debugging messages and test modes are compiled + in (this all is compiled out otherwise). Assertions are light-weight + and this option also enables them. Self-checks, debugging messages and + test modes are switched off by default. Thus, it is safe and actually + recommended to have debugging support enabled, and it should not slow + down UBIFS. You can then further enable / disable individual debugging + features using UBIFS module parameters and the corresponding sysfs + interfaces. 
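
Not part of the patch itself, but for context: a minimal userspace consumer of the timerfd interface added in fs/timerfd.c above. This is a sketch that assumes the standard glibc wrappers from <sys/timerfd.h>; it illustrates the semantics implemented by timerfd_read() in the patch, namely that a read() on the descriptor returns a single u64 count of timer expirations since the previous read.

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/timerfd.h>

int main(void)
{
	/* Arm a periodic timer: first expiry after 1s, then every 1s. */
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1 },
		.it_interval = { .tv_sec = 1 },
	};
	uint64_t ticks;
	int fd = timerfd_create(CLOCK_MONOTONIC, 0);

	if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0) {
		perror("timerfd");
		return 1;
	}
	for (int i = 0; i < 3; i++) {
		/* Blocking read(): the kernel hands back the number of
		 * expirations accumulated since the last read, as one u64. */
		if (read(fd, &ticks, sizeof(ticks)) == sizeof(ticks))
			printf("timer expired %llu time(s)\n",
			       (unsigned long long)ticks);
	}
	close(fd);
	return 0;
}

If several intervals elapse between reads, one read() returns the accumulated count rather than queuing an event per expiration; that matches the hrtimer_forward_now() arithmetic in timerfd_read() above.
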
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile new file mode 100644 index 00000000..80e93c35 --- /dev/null +++ b/fs/ubifs/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_UBIFS_FS) += ubifs.o + +ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o +ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o +ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o +ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o + +ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o +ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c new file mode 100644 index 00000000..315de66e --- /dev/null +++ b/fs/ubifs/budget.c @@ -0,0 +1,732 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the budgeting sub-system which is responsible for UBIFS + * space management. + * + * Factors such as compression, wasted space at the ends of LEBs, space in other + * journal heads, the effect of updates on the index, and so on, make it + * impossible to accurately predict the amount of space needed. Consequently + * approximations are used. + */ + +#include "ubifs.h" +#include +#include + +/* + * When pessimistic budget calculations say that there is no enough space, + * UBIFS starts writing back dirty inodes and pages, doing garbage collection, + * or committing. The below constant defines maximum number of times UBIFS + * repeats the operations. + */ +#define MAX_MKSPC_RETRIES 3 + +/* + * The below constant defines amount of dirty pages which should be written + * back at when trying to shrink the liability. + */ +#define NR_TO_WRITE 16 + +/** + * shrink_liability - write-back some dirty pages/inodes. + * @c: UBIFS file-system description object + * @nr_to_write: how many dirty pages to write-back + * + * This function shrinks UBIFS liability by means of writing back some amount + * of dirty inodes and their pages. + * + * Note, this function synchronizes even VFS inodes which are locked + * (@i_mutex) by the caller of the budgeting function, because write-back does + * not touch @i_mutex. + */ +static void shrink_liability(struct ubifs_info *c, int nr_to_write) +{ + down_read(&c->vfs_sb->s_umount); + writeback_inodes_sb(c->vfs_sb); + up_read(&c->vfs_sb->s_umount); +} + +/** + * run_gc - run garbage collector. + * @c: UBIFS file-system description object + * + * This function runs garbage collector to make some more free space. Returns + * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a + * negative error code in case of failure. 
+ */ +static int run_gc(struct ubifs_info *c) +{ + int err, lnum; + + /* Make some free space by garbage-collecting dirty space */ + down_read(&c->commit_sem); + lnum = ubifs_garbage_collect(c, 1); + up_read(&c->commit_sem); + if (lnum < 0) + return lnum; + + /* GC freed one LEB, return it to lprops */ + dbg_budg("GC freed LEB %d", lnum); + err = ubifs_return_leb(c, lnum); + if (err) + return err; + return 0; +} + +/** + * get_liability - calculate current liability. + * @c: UBIFS file-system description object + * + * This function calculates and returns current UBIFS liability, i.e. the + * amount of bytes UBIFS has "promised" to write to the media. + */ +static long long get_liability(struct ubifs_info *c) +{ + long long liab; + + spin_lock(&c->space_lock); + liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth; + spin_unlock(&c->space_lock); + return liab; +} + +/** + * make_free_space - make more free space on the file-system. + * @c: UBIFS file-system description object + * + * This function is called when an operation cannot be budgeted because there + * is supposedly no free space. But in most cases there is some free space: + * o budgeting is pessimistic, so it always budgets more than it is actually + * needed, so shrinking the liability is one way to make free space - the + * cached data will take less space then it was budgeted for; + * o GC may turn some dark space into free space (budgeting treats dark space + * as not available); + * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs. + * + * So this function tries to do the above. Returns %-EAGAIN if some free space + * was presumably made and the caller has to re-try budgeting the operation. + * Returns %-ENOSPC if it couldn't do more free space, and other negative error + * codes on failures. + */ +static int make_free_space(struct ubifs_info *c) +{ + int err, retries = 0; + long long liab1, liab2; + + do { + liab1 = get_liability(c); + /* + * We probably have some dirty pages or inodes (liability), try + * to write them back. + */ + dbg_budg("liability %lld, run write-back", liab1); + shrink_liability(c, NR_TO_WRITE); + + liab2 = get_liability(c); + if (liab2 < liab1) + return -EAGAIN; + + dbg_budg("new liability %lld (not shrunk)", liab2); + + /* Liability did not shrink again, try GC */ + dbg_budg("Run GC"); + err = run_gc(c); + if (!err) + return -EAGAIN; + + if (err != -EAGAIN && err != -ENOSPC) + /* Some real error happened */ + return err; + + dbg_budg("Run commit (retries %d)", retries); + err = ubifs_run_commit(c); + if (err) + return err; + } while (retries++ < MAX_MKSPC_RETRIES); + + return -ENOSPC; +} + +/** + * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index. + * @c: UBIFS file-system description object + * + * This function calculates and returns the number of LEBs which should be kept + * for index usage. + */ +int ubifs_calc_min_idx_lebs(struct ubifs_info *c) +{ + int idx_lebs; + long long idx_size; + + idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx; + /* And make sure we have thrice the index size of space reserved */ + idx_size += idx_size << 1; + /* + * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' + * pair, nor similarly the two variables for the new index size, so we + * have to do this costly 64-bit division on fast-path. + */ + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); + /* + * The index head is not available for the in-the-gaps method, so add an + * extra LEB to compensate. 
+ */ + idx_lebs += 1; + if (idx_lebs < MIN_INDEX_LEBS) + idx_lebs = MIN_INDEX_LEBS; + return idx_lebs; +} + +/** + * ubifs_calc_available - calculate available FS space. + * @c: UBIFS file-system description object + * @min_idx_lebs: minimum number of LEBs reserved for the index + * + * This function calculates and returns amount of FS space available for use. + */ +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) +{ + int subtract_lebs; + long long available; + + available = c->main_bytes - c->lst.total_used; + + /* + * Now 'available' contains theoretically available flash space + * assuming there is no index, so we have to subtract the space which + * is reserved for the index. + */ + subtract_lebs = min_idx_lebs; + + /* Take into account that GC reserves one LEB for its own needs */ + subtract_lebs += 1; + + /* + * The GC journal head LEB is not really accessible. And since + * different write types go to different heads, we may count only on + * one head's space. + */ + subtract_lebs += c->jhead_cnt - 1; + + /* We also reserve one LEB for deletions, which bypass budgeting */ + subtract_lebs += 1; + + available -= (long long)subtract_lebs * c->leb_size; + + /* Subtract the dead space which is not available for use */ + available -= c->lst.total_dead; + + /* + * Subtract dark space, which might or might not be usable - it depends + * on the data which we have on the media and which will be written. If + * this is a lot of uncompressed or not-compressible data, the dark + * space cannot be used. + */ + available -= c->lst.total_dark; + + /* + * However, there is more dark space. The index may be bigger than + * @min_idx_lebs. Those extra LEBs are assumed to be available, but + * their dark space is not included in total_dark, so it is subtracted + * here. + */ + if (c->lst.idx_lebs > min_idx_lebs) { + subtract_lebs = c->lst.idx_lebs - min_idx_lebs; + available -= subtract_lebs * c->dark_wm; + } + + /* The calculations are rough and may end up with a negative number */ + return available > 0 ? available : 0; +} + +/** + * can_use_rp - check whether the user is allowed to use reserved pool. + * @c: UBIFS file-system description object + * + * UBIFS has so-called "reserved pool" which is flash space reserved + * for the superuser and for uses whose UID/GID is recorded in UBIFS superblock. + * This function checks whether current user is allowed to use reserved pool. + * Returns %1 current user is allowed to use reserved pool and %0 otherwise. + */ +static int can_use_rp(struct ubifs_info *c) +{ + if (current_fsuid() == c->rp_uid || capable(CAP_SYS_RESOURCE) || + (c->rp_gid != 0 && in_group_p(c->rp_gid))) + return 1; + return 0; +} + +/** + * do_budget_space - reserve flash space for index and data growth. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free LEBs for index growth and + * data. + * + * When budgeting index space, UBIFS reserves thrice as many LEBs as the index + * would take if it was consolidated and written to the flash. This guarantees + * that the "in-the-gaps" commit method always succeeds and UBIFS will always + * be able to commit dirty index. So this function basically adds amount of + * budgeted index space to the size of the current index, multiplies this by 3, + * and makes sure this does not exceed the amount of free LEBs. + * + * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables: + * o @c->lst.idx_lebs is the number of LEBs the index currently uses. 
It might + * be large, because UBIFS does not do any index consolidation as long as + * there is free space. IOW, the index may take a lot of LEBs, but the LEBs + * will contain a lot of dirt. + * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs. + * + * This function returns zero in case of success, and %-ENOSPC in case of + * failure. + */ +static int do_budget_space(struct ubifs_info *c) +{ + long long outstanding, available; + int lebs, rsvd_idx_lebs, min_idx_lebs; + + /* First budget index space */ + min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + /* Now 'min_idx_lebs' contains number of LEBs to reserve */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + + /* + * The number of LEBs that are available to be used by the index is: + * + * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - + * @c->lst.taken_empty_lebs + * + * @c->lst.empty_lebs are available because they are empty. + * @c->freeable_cnt are available because they contain only free and + * dirty space, @c->idx_gc_cnt are available because they are index + * LEBs that have been garbage collected and are awaiting the commit + * before they can be used. And the in-the-gaps method will grab these + * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have + * already been allocated for some purpose. + * + * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because + * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they + * are taken until after the commit). + * + * Note, @c->lst.taken_empty_lebs may temporarily be higher by one + * because of the way we serialize LEB allocations and budgeting. See a + * comment in 'ubifs_find_free_space()'. + */ + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (unlikely(rsvd_idx_lebs > lebs)) { + dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " + "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs, + rsvd_idx_lebs); + return -ENOSPC; + } + + available = ubifs_calc_available(c, min_idx_lebs); + outstanding = c->bi.data_growth + c->bi.dd_growth; + + if (unlikely(available < outstanding)) { + dbg_budg("out of data space: available %lld, outstanding %lld", + available, outstanding); + return -ENOSPC; + } + + if (available - outstanding <= c->rp_size && !can_use_rp(c)) + return -ENOSPC; + + c->bi.min_idx_lebs = min_idx_lebs; + return 0; +} + +/** + * calc_idx_growth - calculate approximate index growth from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + * + * For now we assume each new node adds one znode. But this is rather poor + * approximation, though. + */ +static int calc_idx_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int znodes; + + znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) + + req->new_dent; + return znodes * c->max_idx_node_sz; +} + +/** + * calc_data_growth - calculate approximate amount of new data from budgeting + * request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_data_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int data_growth; + + data_growth = req->new_ino ? 
c->bi.inode_budget : 0; + if (req->new_page) + data_growth += c->bi.page_budget; + if (req->new_dent) + data_growth += c->bi.dent_budget; + data_growth += req->new_ino_d; + return data_growth; +} + +/** + * calc_dd_growth - calculate approximate amount of data which makes other data + * dirty from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_dd_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int dd_growth; + + dd_growth = req->dirtied_page ? c->bi.page_budget : 0; + + if (req->dirtied_ino) + dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1); + if (req->mod_dent) + dd_growth += c->bi.dent_budget; + dd_growth += req->dirtied_ino_d; + return dd_growth; +} + +/** + * ubifs_budget_space - ensure there is enough space to complete an operation. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function allocates budget for an operation. It uses pessimistic + * approximation of how much flash space the operation needs. The goal of this + * function is to make sure UBIFS always has flash space to flush all dirty + * pages, dirty inodes, and dirty znodes (liability). This function may force + * commit, garbage-collection or write-back. Returns zero in case of success, + * %-ENOSPC if there is no free space and other negative error codes in case of + * failures. + */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); + int err, idx_growth, data_growth, dd_growth, retried = 0; + + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + + data_growth = calc_data_growth(c, req); + dd_growth = calc_dd_growth(c, req); + if (!data_growth && !dd_growth) + return 0; + idx_growth = calc_idx_growth(c, req); + +again: + spin_lock(&c->space_lock); + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); + + if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) { + dbg_budg("no space"); + spin_unlock(&c->space_lock); + return -ENOSPC; + } + + c->bi.idx_growth += idx_growth; + c->bi.data_growth += data_growth; + c->bi.dd_growth += dd_growth; + + err = do_budget_space(c); + if (likely(!err)) { + req->idx_growth = idx_growth; + req->data_growth = data_growth; + req->dd_growth = dd_growth; + spin_unlock(&c->space_lock); + return 0; + } + + /* Restore the old values */ + c->bi.idx_growth -= idx_growth; + c->bi.data_growth -= data_growth; + c->bi.dd_growth -= dd_growth; + spin_unlock(&c->space_lock); + + if (req->fast) { + dbg_budg("no space for fast budgeting"); + return err; + } + + err = make_free_space(c); + cond_resched(); + if (err == -EAGAIN) { + dbg_budg("try again"); + goto again; + } else if (err == -ENOSPC) { + if (!retried) { + retried = 1; + dbg_budg("-ENOSPC, but anyway try once again"); + goto again; + } + dbg_budg("FS is full, -ENOSPC"); + c->bi.nospace = 1; + if (can_use_rp(c) || c->rp_size == 0) + c->bi.nospace_rp = 1; + smp_wmb(); + } else + ubifs_err("cannot budget space, error %d", err); + return err; +} + +/** + * ubifs_release_budget 
- release budgeted free space. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function releases the space budgeted by 'ubifs_budget_space()'. Note, + * since the index changes (which were budgeted for in @req->idx_growth) will + * only be written to the media on commit, this function moves the index budget + * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed + * by the commit operation. + */ +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + if (!req->recalculate) { + ubifs_assert(req->idx_growth >= 0); + ubifs_assert(req->data_growth >= 0); + ubifs_assert(req->dd_growth >= 0); + } + + if (req->recalculate) { + req->data_growth = calc_data_growth(c, req); + req->dd_growth = calc_dd_growth(c, req); + req->idx_growth = calc_idx_growth(c, req); + } + + if (!req->data_growth && !req->dd_growth) + return; + + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + + spin_lock(&c->space_lock); + c->bi.idx_growth -= req->idx_growth; + c->bi.uncommitted_idx += req->idx_growth; + c->bi.data_growth -= req->data_growth; + c->bi.dd_growth -= req->dd_growth; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); + ubifs_assert(c->bi.min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->bi.idx_growth & 7)); + ubifs_assert(!(c->bi.data_growth & 7)); + ubifs_assert(!(c->bi.dd_growth & 7)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_convert_page_budget - convert budget of a new page. + * @c: UBIFS file-system description object + * + * This function converts budget which was allocated for a new page of data to + * the budget of changing an existing page of data. The latter is smaller than + * the former, so this function only does simple re-calculation and does not + * involve any write-back. + */ +void ubifs_convert_page_budget(struct ubifs_info *c) +{ + spin_lock(&c->space_lock); + /* Release the index growth reservation */ + c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; + /* Release the data growth reservation */ + c->bi.data_growth -= c->bi.page_budget; + /* Increase the dirty data growth reservation instead */ + c->bi.dd_growth += c->bi.page_budget; + /* And re-calculate the indexing space reservation */ + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_release_dirty_inode_budget - release dirty inode budget. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to release the budget for + * + * This function releases budget corresponding to a dirty inode. It is usually + * called when after the inode has been written to the media and marked as + * clean. It also causes the "no space" flags to be cleared. 
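+ *
+ * (Illustrative note, inferred from the code below: the request built here
+ * mirrors what 'calc_dd_growth()' charges for a single dirtied inode, i.e.
+ * the inode budget plus the inode data length aligned to 8 bytes, so
+ * releasing it exactly undoes that charge.)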
+ */ +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui) +{ + struct ubifs_budget_req req; + + memset(&req, 0, sizeof(struct ubifs_budget_req)); + /* The "no space" flags will be cleared because dd_growth is > 0 */ + req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8); + ubifs_release_budget(c, &req); +} + +/** + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexing nodes as well. This introduces additional + * overhead, and UBIFS has to report slightly less free space to meet the above + * expectations. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, long long free) +{ + int divisor, factor, f; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is twice + * as less than maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reserves thrice as more space + * for the index. + */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / (f - 1); + free *= factor; + return div_u64(free, divisor); +} + +/** + * ubifs_get_free_space_nolock - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alignment, wastage at the end of LEBs, etc), it cannot report real amount of + * free flash space it has (well, because not all dirty space is reclaimable, + * UBIFS does not actually know the real amount). If UBIFS did so, it would + * bread user expectations about what free space is. Users seem to accustomed + * to assume that if the file-system reports N bytes of free space, they would + * be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. 
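+ *
+ * As a rough, illustrative sketch (hypothetical geometry, not measured on
+ * any particular flash): with 4096-byte blocks, a data node header of a few
+ * dozen bytes and roughly 150 bytes of tripled indexing overhead per data
+ * node, 'ubifs_reported_space()' scales free space by about 4096 / 4300,
+ * i.e. the user is shown roughly 95% of the raw figure.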
+ */ +long long ubifs_get_free_space_nolock(struct ubifs_info *c) +{ + int rsvd_idx_lebs, lebs; + long long available, outstanding, free; + + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + outstanding = c->bi.data_growth + c->bi.dd_growth; + available = ubifs_calc_available(c, c->bi.min_idx_lebs); + + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. + */ + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); + + if (available > outstanding) + free = ubifs_reported_space(c, available - outstanding); + else + free = 0; + return free; +} + +/** + * ubifs_get_free_space - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates and returns amount of free space to report to + * user-space. + */ +long long ubifs_get_free_space(struct ubifs_info *c) +{ + long long free; + + spin_lock(&c->space_lock); + free = ubifs_get_free_space_nolock(c); + spin_unlock(&c->space_lock); + + return free; +} diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c new file mode 100644 index 00000000..87cd0ead --- /dev/null +++ b/fs/ubifs/commit.c @@ -0,0 +1,738 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions that manage the running of the commit process. + * Each affected module has its own functions to accomplish their part in the + * commit and those functions are called here. + * + * The commit is the process whereby all updates to the index and LEB properties + * are written out together and the journal becomes empty. This keeps the + * file system consistent - at all times the state can be recreated by reading + * the index and LEB properties and then replaying the journal. + * + * The commit is split into two parts named "commit start" and "commit end". + * During commit start, the commit process has exclusive access to the journal + * by holding the commit semaphore down for writing. As few I/O operations as + * possible are performed during commit start, instead the nodes that are to be + * written are merely identified. 
During commit end, the commit semaphore is no + * longer held and the journal is again in operation, allowing users to continue + * to use the file system while the bulk of the commit I/O is performed. The + * purpose of this two-step approach is to prevent the commit from causing any + * latency blips. Note that in any case, the commit does not prevent lookups + * (as permitted by the TNC mutex), or access to VFS data structures e.g. page + * cache. + */ + +#include +#include +#include +#include "ubifs.h" + +/* + * nothing_to_commit - check if there is nothing to commit. + * @c: UBIFS file-system description object + * + * This is a helper function which checks if there is anything to commit. It is + * used as an optimization to avoid starting the commit if it is not really + * necessary. Indeed, the commit operation always assumes flash I/O (e.g., + * writing the commit start node to the log), and it is better to avoid doing + * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is + * nothing to commit, it is more optimal to avoid any flash I/O. + * + * This function has to be called with @c->commit_sem locked for writing - + * this function does not take LPT/TNC locks because the @c->commit_sem + * guarantees that we have exclusive access to the TNC and LPT data structures. + * + * This function returns %1 if there is nothing to commit and %0 otherwise. + */ +static int nothing_to_commit(struct ubifs_info *c) +{ + /* + * During mounting or remounting from R/O mode to R/W mode we may + * commit for various recovery-related reasons. + */ + if (c->mounting || c->remounting_rw) + return 0; + + /* + * If the root TNC node is dirty, we definitely have something to + * commit. + */ + if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags)) + return 0; + + /* + * Even though the TNC is clean, the LPT tree may have dirty nodes. For + * example, this may happen if the budgeting subsystem invoked GC to + * make some free space, and the GC found an LEB with only dirty and + * free space. In this case GC would just change the lprops of this + * LEB (by turning all space into free space) and unmap it. + */ + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + ubifs_assert(c->dirty_pn_cnt == 0); + ubifs_assert(c->dirty_nn_cnt == 0); + + return 1; +} + +/** + * do_commit - commit the journal. + * @c: UBIFS file-system description object + * + * This function implements UBIFS commit. It has to be called with commit lock + * locked. Returns zero in case of success and a negative error code in case of + * failure. 
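+ *
+ * In outline (a descriptive summary of the body below): the write-buffers
+ * are synchronized and the various *_start_commit() calls run with
+ * @c->commit_sem held for writing, doing as little I/O as possible; the
+ * semaphore is then dropped and the bulk of the I/O is performed by the
+ * *_end_commit() calls; finally the master node is updated to point at the
+ * new index root, log tail and LPT position, and written out.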
+ */ +static int do_commit(struct ubifs_info *c) +{ + int err, new_ltail_lnum, old_ltail_lnum, i; + struct ubifs_zbranch zroot; + struct ubifs_lp_stats lst; + + dbg_cmt("start"); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (c->ro_error) { + err = -EROFS; + goto out_up; + } + + if (nothing_to_commit(c)) { + up_write(&c->commit_sem); + err = 0; + goto out_cancel; + } + + /* Sync all write buffers (necessary for recovery) */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + goto out_up; + } + + c->cmt_no += 1; + err = ubifs_gc_start_commit(c); + if (err) + goto out_up; + err = dbg_check_lprops(c); + if (err) + goto out_up; + err = ubifs_log_start_commit(c, &new_ltail_lnum); + if (err) + goto out_up; + err = ubifs_tnc_start_commit(c, &zroot); + if (err) + goto out_up; + err = ubifs_lpt_start_commit(c); + if (err) + goto out_up; + err = ubifs_orphan_start_commit(c); + if (err) + goto out_up; + + ubifs_get_lp_stats(c, &lst); + + up_write(&c->commit_sem); + + err = ubifs_tnc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_end_commit(c); + if (err) + goto out; + err = ubifs_orphan_end_commit(c); + if (err) + goto out; + old_ltail_lnum = c->ltail_lnum; + err = ubifs_log_end_commit(c, new_ltail_lnum); + if (err) + goto out; + err = dbg_check_old_index(c, &zroot); + if (err) + goto out; + + mutex_lock(&c->mst_mutex); + c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); + c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); + c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); + c->mst_node->root_offs = cpu_to_le32(zroot.offs); + c->mst_node->root_len = cpu_to_le32(zroot.len); + c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); + c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); + c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz); + c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); + c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); + c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); + c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs); + c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum); + c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs); + c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum); + c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs); + c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum); + c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs); + c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs); + c->mst_node->total_free = cpu_to_le64(lst.total_free); + c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty); + c->mst_node->total_used = cpu_to_le64(lst.total_used); + c->mst_node->total_dead = cpu_to_le64(lst.total_dead); + c->mst_node->total_dark = cpu_to_le64(lst.total_dark); + if (c->no_orphs) + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + else + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); + err = ubifs_write_master(c); + mutex_unlock(&c->mst_mutex); + if (err) + goto out; + + err = ubifs_log_post_commit(c, old_ltail_lnum); + if (err) + goto out; + err = ubifs_gc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_post_commit(c); + if (err) + goto out; + +out_cancel: + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_RESTING; + wake_up(&c->cmt_wq); + dbg_cmt("commit end"); + spin_unlock(&c->cs_lock); + return 0; + +out_up: + up_write(&c->commit_sem); +out: + ubifs_err("commit failed, error %d", err); + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_BROKEN; + wake_up(&c->cmt_wq); + spin_unlock(&c->cs_lock); + ubifs_ro_mode(c, err); + return err; +} + +/** + * run_bg_commit - 
run background commit if it is needed. + * @c: UBIFS file-system description object + * + * This function runs background commit if it is needed. Returns zero in case + * of success and a negative error code in case of failure. + */ +static int run_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + /* + * Run background commit only if background commit was requested or if + * commit is required. + */ + if (c->cmt_state != COMMIT_BACKGROUND && + c->cmt_state != COMMIT_REQUIRED) + goto out; + spin_unlock(&c->cs_lock); + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_REQUIRED) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + else if (c->cmt_state == COMMIT_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_BACKGROUND; + else + goto out_cmt_unlock; + spin_unlock(&c->cs_lock); + + return do_commit(c); + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return 0; +} + +/** + * ubifs_bg_thread - UBIFS background thread function. + * @info: points to the file-system description object + * + * This function implements various file-system background activities: + * o when a write-buffer timer expires it synchronizes the appropriate + * write-buffer; + * o when the journal is about to be full, it starts in-advance commit. + * + * Note, other stuff like background garbage collection may be added here in + * future. + */ +int ubifs_bg_thread(void *info) +{ + int err; + struct ubifs_info *c = info; + + dbg_msg("background thread \"%s\" started, PID %d", + c->bgt_name, current->pid); + set_freezable(); + + while (1) { + if (kthread_should_stop()) + break; + + if (try_to_freeze()) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + /* Check if there is something to do */ + if (!c->need_bgt) { + /* + * Nothing prevents us from going sleep now and + * be never woken up and block the task which + * could wait in 'kthread_stop()' forever. + */ + if (kthread_should_stop()) + break; + schedule(); + continue; + } else + __set_current_state(TASK_RUNNING); + + c->need_bgt = 0; + err = ubifs_bg_wbufs_sync(c); + if (err) + ubifs_ro_mode(c, err); + + run_bg_commit(c); + cond_resched(); + } + + dbg_msg("background thread \"%s\" stops", c->bgt_name); + return 0; +} + +/** + * ubifs_commit_required - set commit state to "required". + * @c: UBIFS file-system description object + * + * This function is called if a commit is required but cannot be done from the + * calling function, so it is just flagged instead. + */ +void ubifs_commit_required(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + switch (c->cmt_state) { + case COMMIT_RESTING: + case COMMIT_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_REQUIRED)); + c->cmt_state = COMMIT_REQUIRED; + break; + case COMMIT_RUNNING_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_RUNNING_REQUIRED)); + c->cmt_state = COMMIT_RUNNING_REQUIRED; + break; + case COMMIT_REQUIRED: + case COMMIT_RUNNING_REQUIRED: + case COMMIT_BROKEN: + break; + } + spin_unlock(&c->cs_lock); +} + +/** + * ubifs_request_bg_commit - notify the background thread to do a commit. + * @c: UBIFS file-system description object + * + * This function is called if the journal is full enough to make a commit + * worthwhile, so background thread is kicked to start it. 
+ */ +void ubifs_request_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_RESTING) { + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_BACKGROUND)); + c->cmt_state = COMMIT_BACKGROUND; + spin_unlock(&c->cs_lock); + ubifs_wake_up_bgt(c); + } else + spin_unlock(&c->cs_lock); +} + +/** + * wait_for_commit - wait for commit. + * @c: UBIFS file-system description object + * + * This function sleeps until the commit operation is no longer running. + */ +static int wait_for_commit(struct ubifs_info *c) +{ + dbg_cmt("pid %d goes sleep", current->pid); + + /* + * The following sleeps if the condition is false, and will be woken + * when the commit ends. It is possible, although very unlikely, that we + * will wake up and see the subsequent commit running, rather than the + * one we were waiting for, and go back to sleep. However, we will be + * woken again, so there is no danger of sleeping forever. + */ + wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND && + c->cmt_state != COMMIT_RUNNING_REQUIRED); + dbg_cmt("commit finished, pid %d woke up", current->pid); + return 0; +} + +/** + * ubifs_run_commit - run or wait for commit. + * @c: UBIFS file-system description object + * + * This function runs commit and returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_run_commit(struct ubifs_info *c) +{ + int err = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + /* + * We set the commit state to 'running required' to indicate + * that we want it to complete as quickly as possible. + */ + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + spin_unlock(&c->cs_lock); + + /* Ok, the commit is indeed needed */ + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + /* + * Since we unlocked 'c->cs_lock', the state may have changed, so + * re-check it. + */ + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out_cmt_unlock; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + up_write(&c->commit_sem); + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + c->cmt_state = COMMIT_RUNNING_REQUIRED; + spin_unlock(&c->cs_lock); + + err = do_commit(c); + return err; + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return err; +} + +/** + * ubifs_gc_should_commit - determine if it is time for GC to run commit. + * @c: UBIFS file-system description object + * + * This function is called by garbage collection to determine if commit should + * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal + * is full enough to start commit, this function returns true. It is not + * absolutely necessary to commit yet, but it feels like this should be better + * then to keep doing GC. This function returns %1 if GC has to initiate commit + * and %0 if not. 
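+ *
+ * (Descriptive note on the code below: when the state is %COMMIT_BACKGROUND
+ * it is promoted to %COMMIT_REQUIRED before returning %1, so the commit
+ * which GC then triggers is recorded as mandatory rather than merely
+ * requested in the background.)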
+ */ +int ubifs_gc_should_commit(struct ubifs_info *c) +{ + int ret = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BACKGROUND) { + dbg_cmt("commit required now"); + c->cmt_state = COMMIT_REQUIRED; + } else + dbg_cmt("commit not requested"); + if (c->cmt_state == COMMIT_REQUIRED) + ret = 1; + spin_unlock(&c->cs_lock); + return ret; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * struct idx_node - hold index nodes during index tree traversal. + * @list: list + * @iip: index in parent (slot number of this indexing node in the parent + * indexing node) + * @upper_key: all keys in this indexing node have to be less or equivalent to + * this key + * @idx: index node (8-byte aligned because all node structures must be 8-byte + * aligned) + */ +struct idx_node { + struct list_head list; + int iip; + union ubifs_key upper_key; + struct ubifs_idx_node idx __attribute__((aligned(8))); +}; + +/** + * dbg_old_index_check_init - get information for the next old index check. + * @c: UBIFS file-system description object + * @zroot: root of the index + * + * This function records information about the index that will be needed for the + * next old index check i.e. 'dbg_check_old_index()'. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + struct ubifs_idx_node *idx; + int lnum, offs, len, err = 0; + struct ubifs_debug_info *d = c->dbg; + + d->old_zroot = *zroot; + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out; + + d->old_zroot_level = le16_to_cpu(idx->level); + d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); +out: + kfree(idx); + return err; +} + +/** + * dbg_check_old_index - check the old copy of the index. + * @c: UBIFS file-system description object + * @zroot: root of the new index + * + * In order to be able to recover from an unclean unmount, a complete copy of + * the index must exist on flash. This is the "old" index. The commit process + * must write the "new" index to flash without overwriting or destroying any + * part of the old index. This function is run at commit end in order to check + * that the old index does indeed exist completely intact. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; + int first = 1, iip; + struct ubifs_debug_info *d = c->dbg; + union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key; + unsigned long long uninitialized_var(last_sqnum); + struct ubifs_idx_node *idx; + struct list_head list; + struct idx_node *i; + size_t sz; + + if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) + return 0; + + INIT_LIST_HEAD(&list); + + sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) - + UBIFS_IDX_NODE_SZ; + + /* Start at the old zroot */ + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; + iip = 0; + + /* + * Traverse the index tree preorder depth-first i.e. do a node and then + * its subtrees from left to right. 
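+ *
+ * In outline (a description of the loop below, not extra behaviour): the
+ * index nodes on the path from the root to the current node are kept on
+ * 'list'; once a level-zero node has been checked, entries are popped off
+ * the tail of the list until one with an unvisited sibling to the right is
+ * found, and traversal continues from that sibling.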
+ */ + while (1) { + struct ubifs_branch *br; + + /* Get the next index node */ + i = kmalloc(sz, GFP_NOFS); + if (!i) { + err = -ENOMEM; + goto out_free; + } + i->iip = iip; + /* Keep the index nodes on our path in a linked list */ + list_add_tail(&i->list, &list); + /* Read the index node */ + idx = &i->idx; + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out_free; + /* Validate index node */ + child_cnt = le16_to_cpu(idx->child_cnt); + if (child_cnt < 1 || child_cnt > c->fanout) { + err = 1; + goto out_dump; + } + if (first) { + first = 0; + /* Check root level and sqnum */ + if (le16_to_cpu(idx->level) != d->old_zroot_level) { + err = 2; + goto out_dump; + } + if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) { + err = 3; + goto out_dump; + } + /* Set last values as though root had a parent */ + last_level = le16_to_cpu(idx->level) + 1; + last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1; + key_read(c, ubifs_idx_key(c, idx), &lower_key); + highest_ino_key(c, &upper_key, INUM_WATERMARK); + } + key_copy(c, &upper_key, &i->upper_key); + if (le16_to_cpu(idx->level) != last_level - 1) { + err = 3; + goto out_dump; + } + /* + * The index is always written bottom up hence a child's sqnum + * is always less than the parents. + */ + if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) { + err = 4; + goto out_dump; + } + /* Check key range */ + key_read(c, ubifs_idx_key(c, idx), &l_key); + br = ubifs_idx_branch(c, idx, child_cnt - 1); + key_read(c, &br->key, &u_key); + if (keys_cmp(c, &lower_key, &l_key) > 0) { + err = 5; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) < 0) { + err = 6; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) == 0) + if (!is_hash_key(c, &u_key)) { + err = 7; + goto out_dump; + } + /* Go to next index node */ + if (le16_to_cpu(idx->level) == 0) { + /* At the bottom, so go up until can go right */ + while (1) { + /* Drop the bottom of the list */ + list_del(&i->list); + kfree(i); + /* No more list means we are done */ + if (list_empty(&list)) + goto out; + /* Look at the new bottom */ + i = list_entry(list.prev, struct idx_node, + list); + idx = &i->idx; + /* Can we go right */ + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + iip = iip + 1; + break; + } else + /* Nope, so go up again */ + iip = i->iip; + } + } else + /* Go down left */ + iip = 0; + /* + * We have the parent in 'idx' and now we set up for reading the + * child pointed to by slot 'iip'. 
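+ *
+ * The child's expected key range is recorded here as well: its lower bound
+ * is the key in slot 'iip' and its upper bound is the key in slot 'iip' + 1,
+ * or the parent's own upper bound when 'iip' is the last slot. The range is
+ * then verified at the top of the loop on the next iteration.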
+ */ + last_level = le16_to_cpu(idx->level); + last_sqnum = le64_to_cpu(idx->ch.sqnum); + br = ubifs_idx_branch(c, idx, iip); + lnum = le32_to_cpu(br->lnum); + offs = le32_to_cpu(br->offs); + len = le32_to_cpu(br->len); + key_read(c, &br->key, &lower_key); + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + br = ubifs_idx_branch(c, idx, iip + 1); + key_read(c, &br->key, &upper_key); + } else + key_copy(c, &i->upper_key, &upper_key); + } +out: + err = dbg_old_index_check_init(c, zroot); + if (err) + goto out_free; + + return 0; + +out_dump: + dbg_err("dumping index node (iip=%d)", i->iip); + dbg_dump_node(c, idx); + list_del(&i->list); + kfree(i); + if (!list_empty(&list)) { + i = list_entry(list.prev, struct idx_node, list); + dbg_err("dumping parent index node"); + dbg_dump_node(c, &i->idx); + } +out_free: + while (!list_empty(&list)) { + i = list_entry(list.next, struct idx_node, list); + list_del(&i->list); + kfree(i); + } + ubifs_err("failed, error %d", err); + if (err > 0) + err = -EINVAL; + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c new file mode 100644 index 00000000..11e4132f --- /dev/null +++ b/fs/ubifs/compress.c @@ -0,0 +1,251 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + * Zoltan Sogor + */ + +/* + * This file provides a single place to access to compression and + * decompression. + */ + +#include +#include "ubifs.h" + +/* Fake description object for the "none" compressor */ +static struct ubifs_compressor none_compr = { + .compr_type = UBIFS_COMPR_NONE, + .name = "none", + .capi_name = "", +}; + +#ifdef CONFIG_UBIFS_FS_LZO +static DEFINE_MUTEX(lzo_mutex); + +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .comp_mutex = &lzo_mutex, + .name = "lzo", + .capi_name = "lzo", +}; +#else +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .name = "lzo", +}; +#endif + +#ifdef CONFIG_UBIFS_FS_ZLIB +static DEFINE_MUTEX(deflate_mutex); +static DEFINE_MUTEX(inflate_mutex); + +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .comp_mutex = &deflate_mutex, + .decomp_mutex = &inflate_mutex, + .name = "zlib", + .capi_name = "deflate", +}; +#else +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .name = "zlib", +}; +#endif + +/* All UBIFS compressors */ +struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/** + * ubifs_compress - compress data. 
+ * @in_buf: data to compress + * @in_len: length of the data to compress + * @out_buf: output buffer where compressed data should be stored + * @out_len: output buffer length is returned here + * @compr_type: type of compression to use on enter, actually used compression + * type on exit + * + * This function compresses input buffer @in_buf of length @in_len and stores + * the result in the output buffer @out_buf and the resulting length in + * @out_len. If the input buffer does not compress, it is just copied to the + * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if + * compression error occurred. + * + * Note, if the input buffer was not compressed, it is copied to the output + * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. + */ +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type) +{ + int err; + struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; + + if (*compr_type == UBIFS_COMPR_NONE) + goto no_compr; + + /* If the input data is small, do not even try to compress it */ + if (in_len < UBIFS_MIN_COMPR_LEN) + goto no_compr; + + if (compr->comp_mutex) + mutex_lock(compr->comp_mutex); + err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, + (unsigned int *)out_len); + if (compr->comp_mutex) + mutex_unlock(compr->comp_mutex); + if (unlikely(err)) { + ubifs_warn("cannot compress %d bytes, compressor %s, " + "error %d, leave data uncompressed", + in_len, compr->name, err); + goto no_compr; + } + + /* + * If the data compressed only slightly, it is better to leave it + * uncompressed to improve read speed. + */ + if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF) + goto no_compr; + + return; + +no_compr: + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + *compr_type = UBIFS_COMPR_NONE; +} + +/** + * ubifs_decompress - decompress data. + * @in_buf: data to decompress + * @in_len: length of the data to decompress + * @out_buf: output buffer where decompressed data should + * @out_len: output length is returned here + * @compr_type: type of compression + * + * This function decompresses data from buffer @in_buf into buffer @out_buf. + * The length of the uncompressed data is returned in @out_len. This functions + * returns %0 on success or a negative error code on failure. + */ +int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, + int *out_len, int compr_type) +{ + int err; + struct ubifs_compressor *compr; + + if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { + ubifs_err("invalid compression type %d", compr_type); + return -EINVAL; + } + + compr = ubifs_compressors[compr_type]; + + if (unlikely(!compr->capi_name)) { + ubifs_err("%s compression is not compiled in", compr->name); + return -EINVAL; + } + + if (compr_type == UBIFS_COMPR_NONE) { + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + return 0; + } + + if (compr->decomp_mutex) + mutex_lock(compr->decomp_mutex); + err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, + (unsigned int *)out_len); + if (compr->decomp_mutex) + mutex_unlock(compr->decomp_mutex); + if (err) + ubifs_err("cannot decompress %d bytes, compressor %s, " + "error %d", in_len, compr->name, err); + + return err; +} + +/** + * compr_init - initialize a compressor. + * @compr: compressor description object + * + * This function initializes the requested compressor and returns zero in case + * of success or a negative error code in case of failure. 
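+ *
+ * (Note: the compressor is looked up through the kernel crypto API by its
+ * 'capi_name'. Compressors which were not compiled in leave 'capi_name'
+ * unset, which is how 'ubifs_decompress()' detects them, while the "none"
+ * compressor is registered directly in 'ubifs_compressors_init()' without
+ * allocating a crypto transform.)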
+ */ +static int __init compr_init(struct ubifs_compressor *compr) +{ + if (compr->capi_name) { + compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); + if (IS_ERR(compr->cc)) { + ubifs_err("cannot initialize compressor %s, error %ld", + compr->name, PTR_ERR(compr->cc)); + return PTR_ERR(compr->cc); + } + } + + ubifs_compressors[compr->compr_type] = compr; + return 0; +} + +/** + * compr_exit - de-initialize a compressor. + * @compr: compressor description object + */ +static void compr_exit(struct ubifs_compressor *compr) +{ + if (compr->capi_name) + crypto_free_comp(compr->cc); + return; +} + +/** + * ubifs_compressors_init - initialize UBIFS compressors. + * + * This function initializes the compressor which were compiled in. Returns + * zero in case of success and a negative error code in case of failure. + */ +int __init ubifs_compressors_init(void) +{ + int err; + + err = compr_init(&lzo_compr); + if (err) + return err; + + err = compr_init(&zlib_compr); + if (err) + goto out_lzo; + + ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr; + return 0; + +out_lzo: + compr_exit(&lzo_compr); + return err; +} + +/** + * ubifs_compressors_exit - de-initialize UBIFS compressors. + */ +void ubifs_compressors_exit(void) +{ + compr_exit(&lzo_compr); + compr_exit(&zlib_compr); +} diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c new file mode 100644 index 00000000..0bb2bcef --- /dev/null +++ b/fs/ubifs/debug.c @@ -0,0 +1,2933 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements most of the debugging stuff which is compiled in only + * when it is enabled. But some debugging check functions are implemented in + * corresponding subsystem, just because they are closely related and utilize + * various local functions of those subsystems. 
+ */ + +#define UBIFS_DBG_PRESERVE_UBI + +#include "ubifs.h" +#include +#include +#include +#include + +#ifdef CONFIG_UBIFS_FS_DEBUG + +DEFINE_SPINLOCK(dbg_lock); + +static char dbg_key_buf0[128]; +static char dbg_key_buf1[128]; + +unsigned int ubifs_chk_flags; +unsigned int ubifs_tst_flags; + +module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); +module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); + +MODULE_PARM_DESC(debug_chks, "Debug check flags"); +MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); + +static const char *get_key_fmt(int fmt) +{ + switch (fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return "simple"; + default: + return "unknown/invalid format"; + } +} + +static const char *get_key_hash(int hash) +{ + switch (hash) { + case UBIFS_KEY_HASH_R5: + return "R5"; + case UBIFS_KEY_HASH_TEST: + return "test"; + default: + return "unknown/invalid name hash"; + } +} + +static const char *get_key_type(int type) +{ + switch (type) { + case UBIFS_INO_KEY: + return "inode"; + case UBIFS_DENT_KEY: + return "direntry"; + case UBIFS_XENT_KEY: + return "xentry"; + case UBIFS_DATA_KEY: + return "data"; + case UBIFS_TRUN_KEY: + return "truncate"; + default: + return "unknown/invalid key"; + } +} + +static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, + char *buffer) +{ + char *p = buffer; + int type = key_type(c, key); + + if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { + switch (type) { + case UBIFS_INO_KEY: + sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key), + get_key_type(type)); + break; + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + sprintf(p, "(%lu, %s, %#08x)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_hash(c, key)); + break; + case UBIFS_DATA_KEY: + sprintf(p, "(%lu, %s, %u)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_block(c, key)); + break; + case UBIFS_TRUN_KEY: + sprintf(p, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); + break; + default: + sprintf(p, "(bad key type: %#08x, %#08x)", + key->u32[0], key->u32[1]); + } + } else + sprintf(p, "bad key format %d", c->key_fmt); +} + +const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf0); + return dbg_key_buf0; +} + +const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf1); + return dbg_key_buf1; +} + +const char *dbg_ntype(int type) +{ + switch (type) { + case UBIFS_PAD_NODE: + return "padding node"; + case UBIFS_SB_NODE: + return "superblock node"; + case UBIFS_MST_NODE: + return "master node"; + case UBIFS_REF_NODE: + return "reference node"; + case UBIFS_INO_NODE: + return "inode node"; + case UBIFS_DENT_NODE: + return "direntry node"; + case UBIFS_XENT_NODE: + return "xentry node"; + case UBIFS_DATA_NODE: + return "data node"; + case UBIFS_TRUN_NODE: + return "truncate node"; + case UBIFS_IDX_NODE: + return "indexing node"; + case UBIFS_CS_NODE: + return "commit start node"; + case UBIFS_ORPH_NODE: + return "orphan node"; + default: + return "unknown node"; + } +} + +static const char *dbg_gtype(int type) +{ + switch (type) { + case UBIFS_NO_NODE_GROUP: + return "no node group"; + case UBIFS_IN_NODE_GROUP: + return "in node group"; + case UBIFS_LAST_OF_NODE_GROUP: + return "last of node group"; + default: + return "unknown"; + } +} + +const char *dbg_cstate(int cmt_state) +{ + switch (cmt_state) { + case 
COMMIT_RESTING: + return "commit resting"; + case COMMIT_BACKGROUND: + return "background commit requested"; + case COMMIT_REQUIRED: + return "commit required"; + case COMMIT_RUNNING_BACKGROUND: + return "BACKGROUND commit running"; + case COMMIT_RUNNING_REQUIRED: + return "commit running and required"; + case COMMIT_BROKEN: + return "broken commit"; + default: + return "unknown commit state"; + } +} + +const char *dbg_jhead(int jhead) +{ + switch (jhead) { + case GCHD: + return "0 (GC)"; + case BASEHD: + return "1 (base)"; + case DATAHD: + return "2 (data)"; + default: + return "unknown journal head"; + } +} + +static void dump_ch(const struct ubifs_ch *ch) +{ + printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); + printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc)); + printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type, + dbg_ntype(ch->node_type)); + printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type, + dbg_gtype(ch->group_type)); + printk(KERN_DEBUG "\tsqnum %llu\n", + (unsigned long long)le64_to_cpu(ch->sqnum)); + printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); +} + +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) +{ + const struct ubifs_inode *ui = ubifs_inode(inode); + + printk(KERN_DEBUG "Dump in-memory inode:"); + printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); + printk(KERN_DEBUG "\tsize %llu\n", + (unsigned long long)i_size_read(inode)); + printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink); + printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid); + printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid); + printk(KERN_DEBUG "\tatime %u.%u\n", + (unsigned int)inode->i_atime.tv_sec, + (unsigned int)inode->i_atime.tv_nsec); + printk(KERN_DEBUG "\tmtime %u.%u\n", + (unsigned int)inode->i_mtime.tv_sec, + (unsigned int)inode->i_mtime.tv_nsec); + printk(KERN_DEBUG "\tctime %u.%u\n", + (unsigned int)inode->i_ctime.tv_sec, + (unsigned int)inode->i_ctime.tv_nsec); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum); + printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size); + printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt); + printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names); + printk(KERN_DEBUG "\tdirty %u\n", ui->dirty); + printk(KERN_DEBUG "\txattr %u\n", ui->xattr); + printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr); + printk(KERN_DEBUG "\tsynced_i_size %llu\n", + (unsigned long long)ui->synced_i_size); + printk(KERN_DEBUG "\tui_size %llu\n", + (unsigned long long)ui->ui_size); + printk(KERN_DEBUG "\tflags %d\n", ui->flags); + printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type); + printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); + printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); + printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); +} + +void dbg_dump_node(const struct ubifs_info *c, const void *node) +{ + int i, n; + union ubifs_key key; + const struct ubifs_ch *ch = node; + + if (dbg_failure_mode) + return; + + /* If the magic is incorrect, just hexdump the first bytes */ + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { + printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, + (void *)node, UBIFS_CH_SZ, 1); + return; + } + + spin_lock(&dbg_lock); + dump_ch(node); + + switch (ch->node_type) { + case UBIFS_PAD_NODE: + { + const struct ubifs_pad_node *pad = node; + + printk(KERN_DEBUG "\tpad_len %u\n", + le32_to_cpu(pad->pad_len)); + break; + } + case UBIFS_SB_NODE: + { + const struct 
ubifs_sb_node *sup = node; + unsigned int sup_flags = le32_to_cpu(sup->flags); + + printk(KERN_DEBUG "\tkey_hash %d (%s)\n", + (int)sup->key_hash, get_key_hash(sup->key_hash)); + printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", + (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); + printk(KERN_DEBUG "\tflags %#x\n", sup_flags); + printk(KERN_DEBUG "\t big_lpt %u\n", + !!(sup_flags & UBIFS_FLG_BIGLPT)); + printk(KERN_DEBUG "\t space_fixup %u\n", + !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); + printk(KERN_DEBUG "\tmin_io_size %u\n", + le32_to_cpu(sup->min_io_size)); + printk(KERN_DEBUG "\tleb_size %u\n", + le32_to_cpu(sup->leb_size)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(sup->leb_cnt)); + printk(KERN_DEBUG "\tmax_leb_cnt %u\n", + le32_to_cpu(sup->max_leb_cnt)); + printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", + (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); + printk(KERN_DEBUG "\tlog_lebs %u\n", + le32_to_cpu(sup->log_lebs)); + printk(KERN_DEBUG "\tlpt_lebs %u\n", + le32_to_cpu(sup->lpt_lebs)); + printk(KERN_DEBUG "\torph_lebs %u\n", + le32_to_cpu(sup->orph_lebs)); + printk(KERN_DEBUG "\tjhead_cnt %u\n", + le32_to_cpu(sup->jhead_cnt)); + printk(KERN_DEBUG "\tfanout %u\n", + le32_to_cpu(sup->fanout)); + printk(KERN_DEBUG "\tlsave_cnt %u\n", + le32_to_cpu(sup->lsave_cnt)); + printk(KERN_DEBUG "\tdefault_compr %u\n", + (int)le16_to_cpu(sup->default_compr)); + printk(KERN_DEBUG "\trp_size %llu\n", + (unsigned long long)le64_to_cpu(sup->rp_size)); + printk(KERN_DEBUG "\trp_uid %u\n", + le32_to_cpu(sup->rp_uid)); + printk(KERN_DEBUG "\trp_gid %u\n", + le32_to_cpu(sup->rp_gid)); + printk(KERN_DEBUG "\tfmt_version %u\n", + le32_to_cpu(sup->fmt_version)); + printk(KERN_DEBUG "\ttime_gran %u\n", + le32_to_cpu(sup->time_gran)); + printk(KERN_DEBUG "\tUUID %pUB\n", + sup->uuid); + break; + } + case UBIFS_MST_NODE: + { + const struct ubifs_mst_node *mst = node; + + printk(KERN_DEBUG "\thighest_inum %llu\n", + (unsigned long long)le64_to_cpu(mst->highest_inum)); + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long)le64_to_cpu(mst->cmt_no)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(mst->flags)); + printk(KERN_DEBUG "\tlog_lnum %u\n", + le32_to_cpu(mst->log_lnum)); + printk(KERN_DEBUG "\troot_lnum %u\n", + le32_to_cpu(mst->root_lnum)); + printk(KERN_DEBUG "\troot_offs %u\n", + le32_to_cpu(mst->root_offs)); + printk(KERN_DEBUG "\troot_len %u\n", + le32_to_cpu(mst->root_len)); + printk(KERN_DEBUG "\tgc_lnum %u\n", + le32_to_cpu(mst->gc_lnum)); + printk(KERN_DEBUG "\tihead_lnum %u\n", + le32_to_cpu(mst->ihead_lnum)); + printk(KERN_DEBUG "\tihead_offs %u\n", + le32_to_cpu(mst->ihead_offs)); + printk(KERN_DEBUG "\tindex_size %llu\n", + (unsigned long long)le64_to_cpu(mst->index_size)); + printk(KERN_DEBUG "\tlpt_lnum %u\n", + le32_to_cpu(mst->lpt_lnum)); + printk(KERN_DEBUG "\tlpt_offs %u\n", + le32_to_cpu(mst->lpt_offs)); + printk(KERN_DEBUG "\tnhead_lnum %u\n", + le32_to_cpu(mst->nhead_lnum)); + printk(KERN_DEBUG "\tnhead_offs %u\n", + le32_to_cpu(mst->nhead_offs)); + printk(KERN_DEBUG "\tltab_lnum %u\n", + le32_to_cpu(mst->ltab_lnum)); + printk(KERN_DEBUG "\tltab_offs %u\n", + le32_to_cpu(mst->ltab_offs)); + printk(KERN_DEBUG "\tlsave_lnum %u\n", + le32_to_cpu(mst->lsave_lnum)); + printk(KERN_DEBUG "\tlsave_offs %u\n", + le32_to_cpu(mst->lsave_offs)); + printk(KERN_DEBUG "\tlscan_lnum %u\n", + le32_to_cpu(mst->lscan_lnum)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(mst->leb_cnt)); + printk(KERN_DEBUG "\tempty_lebs %u\n", + le32_to_cpu(mst->empty_lebs)); + 
printk(KERN_DEBUG "\tidx_lebs %u\n", + le32_to_cpu(mst->idx_lebs)); + printk(KERN_DEBUG "\ttotal_free %llu\n", + (unsigned long long)le64_to_cpu(mst->total_free)); + printk(KERN_DEBUG "\ttotal_dirty %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dirty)); + printk(KERN_DEBUG "\ttotal_used %llu\n", + (unsigned long long)le64_to_cpu(mst->total_used)); + printk(KERN_DEBUG "\ttotal_dead %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dead)); + printk(KERN_DEBUG "\ttotal_dark %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dark)); + break; + } + case UBIFS_REF_NODE: + { + const struct ubifs_ref_node *ref = node; + + printk(KERN_DEBUG "\tlnum %u\n", + le32_to_cpu(ref->lnum)); + printk(KERN_DEBUG "\toffs %u\n", + le32_to_cpu(ref->offs)); + printk(KERN_DEBUG "\tjhead %u\n", + le32_to_cpu(ref->jhead)); + break; + } + case UBIFS_INO_NODE: + { + const struct ubifs_ino_node *ino = node; + + key_read(c, &ino->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", + (unsigned long long)le64_to_cpu(ino->creat_sqnum)); + printk(KERN_DEBUG "\tsize %llu\n", + (unsigned long long)le64_to_cpu(ino->size)); + printk(KERN_DEBUG "\tnlink %u\n", + le32_to_cpu(ino->nlink)); + printk(KERN_DEBUG "\tatime %lld.%u\n", + (long long)le64_to_cpu(ino->atime_sec), + le32_to_cpu(ino->atime_nsec)); + printk(KERN_DEBUG "\tmtime %lld.%u\n", + (long long)le64_to_cpu(ino->mtime_sec), + le32_to_cpu(ino->mtime_nsec)); + printk(KERN_DEBUG "\tctime %lld.%u\n", + (long long)le64_to_cpu(ino->ctime_sec), + le32_to_cpu(ino->ctime_nsec)); + printk(KERN_DEBUG "\tuid %u\n", + le32_to_cpu(ino->uid)); + printk(KERN_DEBUG "\tgid %u\n", + le32_to_cpu(ino->gid)); + printk(KERN_DEBUG "\tmode %u\n", + le32_to_cpu(ino->mode)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(ino->flags)); + printk(KERN_DEBUG "\txattr_cnt %u\n", + le32_to_cpu(ino->xattr_cnt)); + printk(KERN_DEBUG "\txattr_size %u\n", + le32_to_cpu(ino->xattr_size)); + printk(KERN_DEBUG "\txattr_names %u\n", + le32_to_cpu(ino->xattr_names)); + printk(KERN_DEBUG "\tcompr_type %#x\n", + (int)le16_to_cpu(ino->compr_type)); + printk(KERN_DEBUG "\tdata len %u\n", + le32_to_cpu(ino->data_len)); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + const struct ubifs_dent_node *dent = node; + int nlen = le16_to_cpu(dent->nlen); + + key_read(c, &dent->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tinum %llu\n", + (unsigned long long)le64_to_cpu(dent->inum)); + printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); + printk(KERN_DEBUG "\tnlen %d\n", nlen); + printk(KERN_DEBUG "\tname "); + + if (nlen > UBIFS_MAX_NLEN) + printk(KERN_DEBUG "(bad name length, not printing, " + "bad or corrupted node)"); + else { + for (i = 0; i < nlen && dent->name[i]; i++) + printk(KERN_CONT "%c", dent->name[i]); + } + printk(KERN_CONT "\n"); + + break; + } + case UBIFS_DATA_NODE: + { + const struct ubifs_data_node *dn = node; + int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; + + key_read(c, &dn->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tsize %u\n", + le32_to_cpu(dn->size)); + printk(KERN_DEBUG "\tcompr_typ %d\n", + (int)le16_to_cpu(dn->compr_type)); + printk(KERN_DEBUG "\tdata size %d\n", + dlen); + printk(KERN_DEBUG "\tdata:\n"); + print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, + (void *)&dn->data, dlen, 0); + break; + } + case UBIFS_TRUN_NODE: + { + const struct ubifs_trun_node *trun = node; + + printk(KERN_DEBUG "\tinum %u\n", + 
le32_to_cpu(trun->inum)); + printk(KERN_DEBUG "\told_size %llu\n", + (unsigned long long)le64_to_cpu(trun->old_size)); + printk(KERN_DEBUG "\tnew_size %llu\n", + (unsigned long long)le64_to_cpu(trun->new_size)); + break; + } + case UBIFS_IDX_NODE: + { + const struct ubifs_idx_node *idx = node; + + n = le16_to_cpu(idx->child_cnt); + printk(KERN_DEBUG "\tchild_cnt %d\n", n); + printk(KERN_DEBUG "\tlevel %d\n", + (int)le16_to_cpu(idx->level)); + printk(KERN_DEBUG "\tBranches:\n"); + + for (i = 0; i < n && i < c->fanout - 1; i++) { + const struct ubifs_branch *br; + + br = ubifs_idx_branch(c, idx, i); + key_read(c, &br->key, &key); + printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", + i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), + le32_to_cpu(br->len), DBGKEY(&key)); + } + break; + } + case UBIFS_CS_NODE: + break; + case UBIFS_ORPH_NODE: + { + const struct ubifs_orph_node *orph = node; + + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long) + le64_to_cpu(orph->cmt_no) & LLONG_MAX); + printk(KERN_DEBUG "\tlast node flag %llu\n", + (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); + n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; + printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); + for (i = 0; i < n; i++) + printk(KERN_DEBUG "\t ino %llu\n", + (unsigned long long)le64_to_cpu(orph->inos[i])); + break; + } + default: + printk(KERN_DEBUG "node type %d was not recognized\n", + (int)ch->node_type); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_budget_req(const struct ubifs_budget_req *req) +{ + spin_lock(&dbg_lock); + printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", + req->new_ino, req->dirtied_ino); + printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n", + req->new_ino_d, req->dirtied_ino_d); + printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n", + req->new_page, req->dirtied_page); + printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n", + req->new_dent, req->mod_dent); + printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth); + printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n", + req->data_growth, req->dd_growth); + spin_unlock(&dbg_lock); +} + +void dbg_dump_lstats(const struct ubifs_lp_stats *lst) +{ + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " + "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); + printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " + "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, + lst->total_dirty); + printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " + "total_dead %lld\n", lst->total_used, lst->total_dark, + lst->total_dead); + spin_unlock(&dbg_lock); +} + +void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) +{ + int i; + struct rb_node *rb; + struct ubifs_bud *bud; + struct ubifs_gced_idx_leb *idx_gc; + long long available, outstanding, free; + + spin_lock(&c->space_lock); + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, " + "total budget sum %lld\n", current->pid, + bi->data_growth + bi->dd_growth, + bi->data_growth + bi->dd_growth + bi->idx_growth); + printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, " + "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, + bi->idx_growth); + printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, " + "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, + bi->uncommitted_idx); + printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n", + bi->page_budget, bi->inode_budget, 
bi->dent_budget); + printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n", + bi->nospace, bi->nospace_rp); + printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + c->dark_wm, c->dead_wm, c->max_idx_node_sz); + + if (bi != &c->bi) + /* + * If we are dumping saved budgeting data, do not print + * additional information which is about the current state, not + * the old one which corresponded to the saved budgeting data. + */ + goto out_unlock; + + printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", + c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); + printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " + "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), + atomic_long_read(&c->dirty_zn_cnt), + atomic_long_read(&c->clean_zn_cnt)); + printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", + c->gc_lnum, c->ihead_lnum); + + /* If we are in R/O mode, journal heads do not exist */ + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) + printk(KERN_DEBUG "\tjhead %s\t LEB %d\n", + dbg_jhead(c->jheads[i].wbuf.jhead), + c->jheads[i].wbuf.lnum); + for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct ubifs_bud, rb); + printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); + } + list_for_each_entry(bud, &c->old_buds, list) + printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); + list_for_each_entry(idx_gc, &c->idx_gc, list) + printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", + idx_gc->lnum, idx_gc->unmap); + printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); + + /* Print budgeting predictions */ + available = ubifs_calc_available(c, c->bi.min_idx_lebs); + outstanding = c->bi.data_growth + c->bi.dd_growth; + free = ubifs_get_free_space_nolock(c); + printk(KERN_DEBUG "Budgeting predictions:\n"); + printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", + available, outstanding, free); +out_unlock: + spin_unlock(&dbg_lock); + spin_unlock(&c->space_lock); +} + +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) +{ + int i, spc, dark = 0, dead = 0; + struct rb_node *rb; + struct ubifs_bud *bud; + + spc = lp->free + lp->dirty; + if (spc < c->dead_wm) + dead = spc; + else + dark = ubifs_calc_dark(c, spc); + + if (lp->flags & LPROPS_INDEX) + printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " + "free + dirty %-8d flags %#x (", lp->lnum, lp->free, + lp->dirty, c->leb_size - spc, spc, lp->flags); + else + printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " + "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d " + "flags %#-4x (", lp->lnum, lp->free, lp->dirty, + c->leb_size - spc, spc, dark, dead, + (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags); + + if (lp->flags & LPROPS_TAKEN) { + if (lp->flags & LPROPS_INDEX) + printk(KERN_CONT "index, taken"); + else + printk(KERN_CONT "taken"); + } else { + const char *s; + + if (lp->flags & LPROPS_INDEX) { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_DIRTY_IDX: + s = "dirty index"; + break; + case LPROPS_FRDI_IDX: + s = "freeable index"; + break; + default: + s = "index"; + } + } else { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_UNCAT: + s = "not categorized"; + break; + case LPROPS_DIRTY: + s = "dirty"; + break; + case LPROPS_FREE: + s = "free"; + break; + case LPROPS_EMPTY: + s = "empty"; + break; + case LPROPS_FREEABLE: + s = "freeable"; + break; + default: + s = NULL; + break; + } + } + printk(KERN_CONT "%s", s); + } + + for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct 
ubifs_bud, rb); + if (bud->lnum == lp->lnum) { + int head = 0; + for (i = 0; i < c->jhead_cnt; i++) { + /* + * Note, if we are in R/O mode or in the middle + * of mounting/re-mounting, the write-buffers do + * not exist. + */ + if (c->jheads && + lp->lnum == c->jheads[i].wbuf.lnum) { + printk(KERN_CONT ", jhead %s", + dbg_jhead(i)); + head = 1; + } + } + if (!head) + printk(KERN_CONT ", bud of jhead %s", + dbg_jhead(bud->jhead)); + } + } + if (lp->lnum == c->gc_lnum) + printk(KERN_CONT ", GC LEB"); + printk(KERN_CONT ")\n"); +} + +void dbg_dump_lprops(struct ubifs_info *c) +{ + int lnum, err; + struct ubifs_lprops lp; + struct ubifs_lp_stats lst; + + printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n", + current->pid); + ubifs_get_lp_stats(c, &lst); + dbg_dump_lstats(&lst); + + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + ubifs_err("cannot read lprops for LEB %d", lnum); + + dbg_dump_lprop(c, &lp); + } + printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n", + current->pid); +} + +void dbg_dump_lpt_info(struct ubifs_info *c) +{ + int i; + + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid); + printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); + printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); + printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); + printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz); + printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz); + printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt); + printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght); + printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt); + printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt); + printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt); + printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits); + printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits); + printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits); + printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + printk(KERN_DEBUG "\tLPT head is at %d:%d\n", + c->nhead_lnum, c->nhead_offs); + printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", + c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", + c->lsave_lnum, c->lsave_offs); + for (i = 0; i < c->lpt_lebs; i++) + printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " + "cmt %d\n", i + c->lpt_first, c->ltab[i].free, + c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); + spin_unlock(&dbg_lock); +} + +void dbg_dump_leb(const struct ubifs_info *c, int lnum) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + void *buf; + + if (dbg_failure_mode) + return; + + printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", + current->pid, lnum); + + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for dumping LEB %d", lnum); + return; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + ubifs_err("scan error %d", (int)PTR_ERR(sleb)); + goto out; + } + + printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, + sleb->nodes_cnt, sleb->endpt); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node 
at LEB %d:%d len %d\n", lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } + + printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", + current->pid, lnum); + ubifs_scan_destroy(sleb); + +out: + vfree(buf); + return; +} + +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) +{ + int n; + const struct ubifs_zbranch *zbr; + + spin_lock(&dbg_lock); + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + + printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" + " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, + zbr->len, znode->parent, znode->iip, znode->level, + znode->child_cnt, znode->flags); + + if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { + spin_unlock(&dbg_lock); + return; + } + + printk(KERN_DEBUG "zbranches:\n"); + for (n = 0; n < znode->child_cnt; n++) { + zbr = &znode->zbranch[n]; + if (znode->level > 0) + printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + else + printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) +{ + int i; + + printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n", + current->pid, cat, heap->cnt); + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + + printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d " + "flags %d\n", i, lprops->lnum, lprops->hpos, + lprops->free, lprops->dirty, lprops->flags); + } + printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid); +} + +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid); + printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", + (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); + printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", + pnode->flags, iip, pnode->level, pnode->num); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp = &pnode->lprops[i]; + + printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", + i, lp->free, lp->dirty, lp->flags, lp->lnum); + } +} + +void dbg_dump_tnc(struct ubifs_info *c) +{ + struct ubifs_znode *znode; + int level; + + printk(KERN_DEBUG "\n"); + printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + while (znode) { + if (level != znode->level) { + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + } + dbg_dump_znode(c, znode); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + } + printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid); +} + +static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, + void *priv) +{ + dbg_dump_znode(c, znode); + return 0; +} + +/** + * dbg_dump_index - dump the on-flash index. + * @c: UBIFS file-system description object + * + * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()' + * which dumps only in-memory znodes and does not read znodes which from flash. 
+ */ +void dbg_dump_index(struct ubifs_info *c) +{ + dbg_walk_index(c, NULL, dump_znode, NULL); +} + +/** + * dbg_save_space_info - save information about flash space. + * @c: UBIFS file-system description object + * + * This function saves information about UBIFS free space, dirty space, etc, in + * order to check it later. + */ +void dbg_save_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + int freeable_cnt; + + spin_lock(&c->space_lock); + memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); + memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info)); + d->saved_idx_gc_cnt = c->idx_gc_cnt; + + /* + * We use a dirty hack here and zero out @c->freeable_cnt, because it + * affects the free space calculations, and UBIFS might not know about + * all freeable eraseblocks. Indeed, we know about freeable eraseblocks + * only when we read their lprops, and we do this only lazily, upon the + * need. So at any given point of time @c->freeable_cnt might be not + * exactly accurate. + * + * Just one example about the issue we hit when we did not zero + * @c->freeable_cnt. + * 1. The file-system is mounted R/O, c->freeable_cnt is %0. We save the + * amount of free space in @d->saved_free + * 2. We re-mount R/W, which makes UBIFS to read the "lsave" + * information from flash, where we cache LEBs from various + * categories ('ubifs_remount_fs()' -> 'ubifs_lpt_init()' + * -> 'lpt_init_wr()' -> 'read_lsave()' -> 'ubifs_lpt_lookup()' + * -> 'ubifs_get_pnode()' -> 'update_cats()' + * -> 'ubifs_add_to_cat()'). + * 3. Lsave contains a freeable eraseblock, and @c->freeable_cnt + * becomes %1. + * 4. We calculate the amount of free space when the re-mount is + * finished in 'dbg_check_space_info()' and it does not match + * @d->saved_free. + */ + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; + d->saved_free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; + spin_unlock(&c->space_lock); +} + +/** + * dbg_check_space_info - check flash space information. + * @c: UBIFS file-system description object + * + * This function compares current flash space information with the information + * which was saved when the 'dbg_save_space_info()' function was called. + * Returns zero if the information has not changed, and %-EINVAL it it has + * changed. + */ +int dbg_check_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + struct ubifs_lp_stats lst; + long long free; + int freeable_cnt; + + spin_lock(&c->space_lock); + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; + free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; + spin_unlock(&c->space_lock); + + if (free != d->saved_free) { + ubifs_err("free space changed from %lld to %lld", + d->saved_free, free); + goto out; + } + + return 0; + +out: + ubifs_msg("saved lprops statistics dump"); + dbg_dump_lstats(&d->saved_lst); + ubifs_msg("saved budgeting info dump"); + dbg_dump_budg(c, &d->saved_bi); + ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt); + ubifs_msg("current lprops statistics dump"); + ubifs_get_lp_stats(c, &lst); + dbg_dump_lstats(&lst); + ubifs_msg("current budgeting info dump"); + dbg_dump_budg(c, &c->bi); + dump_stack(); + return -EINVAL; +} + +/** + * dbg_check_synced_i_size - check synchronized inode size. + * @inode: inode to check + * + * If inode is clean, synchronized inode size has to be equivalent to current + * inode size. This function has to be called only for locked inodes (@i_mutex + * has to be locked). 
Returns %0 if synchronized inode size is correct, and
+ * %-EINVAL if not.
+ */
+int dbg_check_synced_i_size(struct inode *inode)
+{
+	int err = 0;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+
+	mutex_lock(&ui->ui_mutex);
+	spin_lock(&ui->ui_lock);
+	if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
+		ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode "
+			  "is clean", ui->ui_size, ui->synced_i_size);
+		ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
+			  inode->i_mode, i_size_read(inode));
+		dbg_dump_stack();
+		err = -EINVAL;
+	}
+	spin_unlock(&ui->ui_lock);
+	mutex_unlock(&ui->ui_mutex);
+	return err;
+}
+
+/**
+ * dbg_check_dir_size - check directory inode size and link count.
+ * @c: UBIFS file-system description object
+ * @dir: the directory to calculate size for
+ *
+ * This function makes sure that directory size and link count are correct.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * Note, it is a good idea to make sure the @dir->i_mutex is locked before
+ * calling this function.
+ */
+int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
+{
+	unsigned int nlink = 2;
+	union ubifs_key key;
+	struct ubifs_dent_node *dent, *pdent = NULL;
+	struct qstr nm = { .name = NULL };
+	loff_t size = UBIFS_INO_NODE_SZ;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	if (!S_ISDIR(dir->i_mode))
+		return 0;
+
+	lowest_dent_key(c, &key, dir->i_ino);
+	while (1) {
+		int err;
+
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			err = PTR_ERR(dent);
+			if (err == -ENOENT)
+				break;
+			return err;
+		}
+
+		nm.name = dent->name;
+		nm.len = le16_to_cpu(dent->nlen);
+		size += CALC_DENT_SIZE(nm.len);
+		if (dent->type == UBIFS_ITYPE_DIR)
+			nlink += 1;
+		kfree(pdent);
+		pdent = dent;
+		key_read(c, &dent->key, &key);
+	}
+	kfree(pdent);
+
+	if (i_size_read(dir) != size) {
+		ubifs_err("directory inode %lu has size %llu, "
+			  "but calculated size is %llu", dir->i_ino,
+			  (unsigned long long)i_size_read(dir),
+			  (unsigned long long)size);
+		dump_stack();
+		return -EINVAL;
+	}
+	if (dir->i_nlink != nlink) {
+		ubifs_err("directory inode %lu has nlink %u, but calculated "
+			  "nlink is %u", dir->i_ino, dir->i_nlink, nlink);
+		dump_stack();
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * dbg_check_key_order - make sure that colliding keys are properly ordered.
+ * @c: UBIFS file-system description object
+ * @zbr1: first zbranch
+ * @zbr2: following zbranch
+ *
+ * In UBIFS indexing B-tree colliding keys have to be sorted in binary order of
+ * names of the direntries/xentries which are referred by the keys. This
+ * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes
+ * sure the name of direntry/xentry referred by @zbr1 is less than
+ * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not,
+ * and a negative error code in case of failure.
+ */
+static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
+			       struct ubifs_zbranch *zbr2)
+{
+	int err, nlen1, nlen2, cmp;
+	struct ubifs_dent_node *dent1, *dent2;
+	union ubifs_key key;
+
+	ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
+	dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent1)
+		return -ENOMEM;
+	dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent2) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	err = ubifs_tnc_read_node(c, zbr1, dent1);
+	if (err)
+		goto out_free;
+	err = ubifs_validate_entry(c, dent1);
+	if (err)
+		goto out_free;
+
+	err = ubifs_tnc_read_node(c, zbr2, dent2);
+	if (err)
+		goto out_free;
+	err = ubifs_validate_entry(c, dent2);
+	if (err)
+		goto out_free;
+
+	/* Make sure node keys are the same as in zbranch */
+	err = 1;
+	key_read(c, &dent1->key, &key);
+	if (keys_cmp(c, &zbr1->key, &key)) {
+		dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
+			zbr1->offs, DBGKEY(&key));
+		dbg_err("but it should have key %s according to tnc",
+			DBGKEY(&zbr1->key));
+		dbg_dump_node(c, dent1);
+		goto out_free;
+	}
+
+	key_read(c, &dent2->key, &key);
+	if (keys_cmp(c, &zbr2->key, &key)) {
+		dbg_err("2nd entry at %d:%d has key %s", zbr2->lnum,
+			zbr2->offs, DBGKEY(&key));
+		dbg_err("but it should have key %s according to tnc",
+			DBGKEY(&zbr2->key));
+		dbg_dump_node(c, dent2);
+		goto out_free;
+	}
+
+	nlen1 = le16_to_cpu(dent1->nlen);
+	nlen2 = le16_to_cpu(dent2->nlen);
+
+	cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2));
+	if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) {
+		err = 0;
+		goto out_free;
+	}
+	if (cmp == 0 && nlen1 == nlen2)
+		dbg_err("2 xent/dent nodes with the same name");
+	else
+		dbg_err("bad order of colliding key %s",
+			DBGKEY(&key));
+
+	ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+	dbg_dump_node(c, dent1);
+	ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+	dbg_dump_node(c, dent2);
+
+out_free:
+	kfree(dent2);
+	kfree(dent1);
+	return err;
+}
+
+/**
+ * dbg_check_znode - check if znode is all right.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch which points to this znode
+ *
+ * This function makes sure that znode referred to by @zbr is all right.
+ * Returns zero if it is, and %-EINVAL if it is not.
+ */
+static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
+{
+	struct ubifs_znode *znode = zbr->znode;
+	struct ubifs_znode *zp = znode->parent;
+	int n, err, cmp;
+
+	if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
+		err = 1;
+		goto out;
+	}
+	if (znode->level < 0) {
+		err = 2;
+		goto out;
+	}
+	if (znode->iip < 0 || znode->iip >= c->fanout) {
+		err = 3;
+		goto out;
+	}
+
+	if (zbr->len == 0)
+		/* Only dirty zbranch may have no on-flash nodes */
+		if (!ubifs_zn_dirty(znode)) {
+			err = 4;
+			goto out;
+		}
+
+	if (ubifs_zn_dirty(znode)) {
+		/*
+		 * If znode is dirty, its parent has to be dirty as well. The
+		 * order of the operation is important, so we have to have
+		 * memory barriers.
+		 */
+		smp_mb();
+		if (zp && !ubifs_zn_dirty(zp)) {
+			/*
+			 * The dirty flag is atomic and is cleared outside the
+			 * TNC mutex, so znode's dirty flag may now have
+			 * been cleared. The child is always cleared before the
+			 * parent, so we just need to check again.
+ */ + smp_mb(); + if (ubifs_zn_dirty(znode)) { + err = 5; + goto out; + } + } + } + + if (zp) { + const union ubifs_key *min, *max; + + if (znode->level != zp->level - 1) { + err = 6; + goto out; + } + + /* Make sure the 'parent' pointer in our znode is correct */ + err = ubifs_search_zbranch(c, zp, &zbr->key, &n); + if (!err) { + /* This zbranch does not exist in the parent */ + err = 7; + goto out; + } + + if (znode->iip >= zp->child_cnt) { + err = 8; + goto out; + } + + if (znode->iip != n) { + /* This may happen only in case of collisions */ + if (keys_cmp(c, &zp->zbranch[n].key, + &zp->zbranch[znode->iip].key)) { + err = 9; + goto out; + } + n = znode->iip; + } + + /* + * Make sure that the first key in our znode is greater than or + * equal to the key in the pointing zbranch. + */ + min = &zbr->key; + cmp = keys_cmp(c, min, &znode->zbranch[0].key); + if (cmp == 1) { + err = 10; + goto out; + } + + if (n + 1 < zp->child_cnt) { + max = &zp->zbranch[n + 1].key; + + /* + * Make sure the last key in our znode is less or + * equivalent than the key in the zbranch which goes + * after our pointing zbranch. + */ + cmp = keys_cmp(c, max, + &znode->zbranch[znode->child_cnt - 1].key); + if (cmp == -1) { + err = 11; + goto out; + } + } + } else { + /* This may only be root znode */ + if (zbr != &c->zroot) { + err = 12; + goto out; + } + } + + /* + * Make sure that next key is greater or equivalent then the previous + * one. + */ + for (n = 1; n < znode->child_cnt; n++) { + cmp = keys_cmp(c, &znode->zbranch[n - 1].key, + &znode->zbranch[n].key); + if (cmp > 0) { + err = 13; + goto out; + } + if (cmp == 0) { + /* This can only be keys with colliding hash */ + if (!is_hash_key(c, &znode->zbranch[n].key)) { + err = 14; + goto out; + } + + if (znode->level != 0 || c->replaying) + continue; + + /* + * Colliding keys should follow binary order of + * corresponding xentry/dentry names. + */ + err = dbg_check_key_order(c, &znode->zbranch[n - 1], + &znode->zbranch[n]); + if (err < 0) + return err; + if (err) { + err = 15; + goto out; + } + } + } + + for (n = 0; n < znode->child_cnt; n++) { + if (!znode->zbranch[n].znode && + (znode->zbranch[n].lnum == 0 || + znode->zbranch[n].len == 0)) { + err = 16; + goto out; + } + + if (znode->zbranch[n].lnum != 0 && + znode->zbranch[n].len == 0) { + err = 17; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].len != 0) { + err = 18; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].offs != 0) { + err = 19; + goto out; + } + + if (znode->level != 0 && znode->zbranch[n].znode) + if (znode->zbranch[n].znode->parent != znode) { + err = 20; + goto out; + } + } + + return 0; + +out: + ubifs_err("failed, error %d", err); + ubifs_msg("dump of the znode"); + dbg_dump_znode(c, znode); + if (zp) { + ubifs_msg("dump of the parent znode"); + dbg_dump_znode(c, zp); + } + dump_stack(); + return -EINVAL; +} + +/** + * dbg_check_tnc - check TNC tree. + * @c: UBIFS file-system description object + * @extra: do extra checks that are possible at start commit + * + * This function traverses whole TNC tree and checks every znode. Returns zero + * if everything is all right and %-EINVAL if something is wrong with TNC. 
+ */ +int dbg_check_tnc(struct ubifs_info *c, int extra) +{ + struct ubifs_znode *znode; + long clean_cnt = 0, dirty_cnt = 0; + int err, last; + + if (!(ubifs_chk_flags & UBIFS_CHK_TNC)) + return 0; + + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + if (!c->zroot.znode) + return 0; + + znode = ubifs_tnc_postorder_first(c->zroot.znode); + while (1) { + struct ubifs_znode *prev; + struct ubifs_zbranch *zbr; + + if (!znode->parent) + zbr = &c->zroot; + else + zbr = &znode->parent->zbranch[znode->iip]; + + err = dbg_check_znode(c, zbr); + if (err) + return err; + + if (extra) { + if (ubifs_zn_dirty(znode)) + dirty_cnt += 1; + else + clean_cnt += 1; + } + + prev = znode; + znode = ubifs_tnc_postorder_next(znode); + if (!znode) + break; + + /* + * If the last key of this znode is equivalent to the first key + * of the next znode (collision), then check order of the keys. + */ + last = prev->child_cnt - 1; + if (prev->level == 0 && znode->level == 0 && !c->replaying && + !keys_cmp(c, &prev->zbranch[last].key, + &znode->zbranch[0].key)) { + err = dbg_check_key_order(c, &prev->zbranch[last], + &znode->zbranch[0]); + if (err < 0) + return err; + if (err) { + ubifs_msg("first znode"); + dbg_dump_znode(c, prev); + ubifs_msg("second znode"); + dbg_dump_znode(c, znode); + return -EINVAL; + } + } + } + + if (extra) { + if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) { + ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->clean_zn_cnt), + clean_cnt); + return -EINVAL; + } + if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) { + ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->dirty_zn_cnt), + dirty_cnt); + return -EINVAL; + } + } + + return 0; +} + +/** + * dbg_walk_index - walk the on-flash index. + * @c: UBIFS file-system description object + * @leaf_cb: called for each leaf node + * @znode_cb: called for each indexing node + * @priv: private data which is passed to callbacks + * + * This function walks the UBIFS index and calls the @leaf_cb for each leaf + * node and @znode_cb for each indexing node. Returns zero in case of success + * and a negative error code in case of failure. + * + * It would be better if this function removed every znode it pulled to into + * the TNC, so that the behavior more closely matched the non-debugging + * behavior. + */ +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv) +{ + int err; + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *child; + + mutex_lock(&c->tnc_mutex); + /* If the root indexing node is not in TNC - pull it */ + if (!c->zroot.znode) { + c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(c->zroot.znode)) { + err = PTR_ERR(c->zroot.znode); + c->zroot.znode = NULL; + goto out_unlock; + } + } + + /* + * We are going to traverse the indexing tree in the postorder manner. + * Go down and find the leftmost indexing node where we are going to + * start from. 
+ */ + znode = c->zroot.znode; + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + + znode = child; + } + + /* Iterate over all indexing nodes */ + while (1) { + int idx; + + cond_resched(); + + if (znode_cb) { + err = znode_cb(c, znode, priv); + if (err) { + ubifs_err("znode checking function returned " + "error %d", err); + dbg_dump_znode(c, znode); + goto out_dump; + } + } + if (leaf_cb && znode->level == 0) { + for (idx = 0; idx < znode->child_cnt; idx++) { + zbr = &znode->zbranch[idx]; + err = leaf_cb(c, zbr, priv); + if (err) { + ubifs_err("leaf checking function " + "returned error %d, for leaf " + "at LEB %d:%d", + err, zbr->lnum, zbr->offs); + goto out_dump; + } + } + } + + if (!znode->parent) + break; + + idx = znode->iip + 1; + znode = znode->parent; + if (idx < znode->child_cnt) { + /* Switch to the next index in the parent */ + zbr = &znode->zbranch[idx]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, idx); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } else + /* + * This is the last child, switch to the parent and + * continue. + */ + continue; + + /* Go to the lowest leftmost znode in the new sub-tree */ + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } + } + + mutex_unlock(&c->tnc_mutex); + return 0; + +out_dump: + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_znode(c, znode); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * add_size - add znode size to partially calculated index size. + * @c: UBIFS file-system description object + * @znode: znode to add size for + * @priv: partially calculated index size + * + * This is a helper function for 'dbg_check_idx_size()' which is called for + * every indexing node and adds its size to the 'long long' variable pointed to + * by @priv. + */ +static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv) +{ + long long *idx_size = priv; + int add; + + add = ubifs_idx_node_sz(c, znode->child_cnt); + add = ALIGN(add, 8); + *idx_size += add; + return 0; +} + +/** + * dbg_check_idx_size - check index size. + * @c: UBIFS file-system description object + * @idx_size: size to check + * + * This function walks the UBIFS index, calculates its size and checks that the + * size is equivalent to @idx_size. Returns zero in case of success and a + * negative error code in case of failure. + */ +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) +{ + int err; + long long calc = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ)) + return 0; + + err = dbg_walk_index(c, NULL, add_size, &calc); + if (err) { + ubifs_err("error %d while walking the index", err); + return err; + } + + if (calc != idx_size) { + ubifs_err("index size check failed: calculated size is %lld, " + "should be %lld", calc, idx_size); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * struct fsck_inode - information about an inode used when checking the file-system. 
+ * @rb: link in the RB-tree of inodes
+ * @inum: inode number
+ * @mode: inode type, permissions, etc
+ * @nlink: inode link count
+ * @xattr_cnt: count of extended attributes
+ * @references: how many directory/xattr entries refer this inode (calculated
+ *              while walking the index)
+ * @calc_cnt: for directory inode count of child directories
+ * @size: inode size (read from on-flash inode)
+ * @xattr_sz: summary size of all extended attributes (read from on-flash
+ *            inode)
+ * @calc_sz: for directories calculated directory size
+ * @calc_xcnt: calculated count of extended attributes
+ * @calc_xsz: calculated summary size of all extended attributes
+ * @xattr_nms: sum of lengths of all extended attribute names belonging to this
+ *             inode (read from on-flash inode)
+ * @calc_xnms: calculated sum of lengths of all extended attribute names
+ */
+struct fsck_inode {
+	struct rb_node rb;
+	ino_t inum;
+	umode_t mode;
+	unsigned int nlink;
+	unsigned int xattr_cnt;
+	int references;
+	int calc_cnt;
+	long long size;
+	unsigned int xattr_sz;
+	long long calc_sz;
+	long long calc_xcnt;
+	long long calc_xsz;
+	unsigned int xattr_nms;
+	long long calc_xnms;
+};
+
+/**
+ * struct fsck_data - private FS checking information.
+ * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects)
+ */
+struct fsck_data {
+	struct rb_root inodes;
+};
+
+/**
+ * add_inode - add inode information to RB-tree of inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ * @ino: raw UBIFS inode to add
+ *
+ * This is a helper function for 'check_leaf()' which adds information about
+ * inode @ino to the RB-tree of inodes. Returns inode information pointer in
+ * case of success and a negative error code in case of failure.
+ */
+static struct fsck_inode *add_inode(struct ubifs_info *c,
+				    struct fsck_data *fsckd,
+				    struct ubifs_ino_node *ino)
+{
+	struct rb_node **p, *parent = NULL;
+	struct fsck_inode *fscki;
+	ino_t inum = key_inum_flash(c, &ino->key);
+	struct inode *inode;
+	struct ubifs_inode *ui;
+
+	p = &fsckd->inodes.rb_node;
+	while (*p) {
+		parent = *p;
+		fscki = rb_entry(parent, struct fsck_inode, rb);
+		if (inum < fscki->inum)
+			p = &(*p)->rb_left;
+		else if (inum > fscki->inum)
+			p = &(*p)->rb_right;
+		else
+			return fscki;
+	}
+
+	if (inum > c->highest_inum) {
+		ubifs_err("too high inode number, max. is %lu",
+			  (unsigned long)c->highest_inum);
+		return ERR_PTR(-EINVAL);
+	}
+
+	fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS);
+	if (!fscki)
+		return ERR_PTR(-ENOMEM);
+
+	inode = ilookup(c->vfs_sb, inum);
+
+	fscki->inum = inum;
+	/*
+	 * If the inode is present in the VFS inode cache, use it instead of
+	 * the on-flash inode which might be out-of-date. E.g., the size might
+	 * be out-of-date. If we do not do this, the following may happen, for
+	 * example:
+	 * 1. A power cut happens
+	 * 2. We mount the file-system R/O, the replay process fixes up the
+	 *    inode size in the VFS cache, but not on-flash.
+	 * 3. 'check_leaf()' fails because it hits a data node beyond inode
+	 *    size.
+ */ + if (!inode) { + fscki->nlink = le32_to_cpu(ino->nlink); + fscki->size = le64_to_cpu(ino->size); + fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + fscki->xattr_sz = le32_to_cpu(ino->xattr_size); + fscki->xattr_nms = le32_to_cpu(ino->xattr_names); + fscki->mode = le32_to_cpu(ino->mode); + } else { + ui = ubifs_inode(inode); + fscki->nlink = inode->i_nlink; + fscki->size = inode->i_size; + fscki->xattr_cnt = ui->xattr_cnt; + fscki->xattr_sz = ui->xattr_size; + fscki->xattr_nms = ui->xattr_names; + fscki->mode = inode->i_mode; + iput(inode); + } + + if (S_ISDIR(fscki->mode)) { + fscki->calc_sz = UBIFS_INO_NODE_SZ; + fscki->calc_cnt = 2; + } + + rb_link_node(&fscki->rb, parent, p); + rb_insert_color(&fscki->rb, &fsckd->inodes); + + return fscki; +} + +/** + * search_inode - search inode in the RB-tree of inodes. + * @fsckd: FS checking information + * @inum: inode number to search + * + * This is a helper function for 'check_leaf()' which searches inode @inum in + * the RB-tree of inodes and returns an inode information pointer or %NULL if + * the inode was not found. + */ +static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum) +{ + struct rb_node *p; + struct fsck_inode *fscki; + + p = fsckd->inodes.rb_node; + while (p) { + fscki = rb_entry(p, struct fsck_inode, rb); + if (inum < fscki->inum) + p = p->rb_left; + else if (inum > fscki->inum) + p = p->rb_right; + else + return fscki; + } + return NULL; +} + +/** + * read_add_inode - read inode node and add it to RB-tree of inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * @inum: inode number to read + * + * This is a helper function for 'check_leaf()' which finds inode node @inum in + * the index, reads it, and adds it to the RB-tree of inodes. Returns inode + * information pointer in case of success and a negative error code in case of + * failure. + */ +static struct fsck_inode *read_add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, ino_t inum) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + + fscki = search_inode(fsckd, inum); + if (fscki) + return fscki; + + ino_key_init(c, &key, inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", (unsigned long)inum); + return ERR_PTR(-ENOENT); + } else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", + err, (unsigned long)inum); + return ERR_PTR(err); + } + + zbr = &znode->zbranch[n]; + if (zbr->len < UBIFS_INO_NODE_SZ) { + ubifs_err("bad node %lu node length %d", + (unsigned long)inum, zbr->len); + return ERR_PTR(-EINVAL); + } + + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return ERR_PTR(-ENOMEM); + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return ERR_PTR(err); + } + + fscki = add_inode(c, fsckd, ino); + kfree(ino); + if (IS_ERR(fscki)) { + ubifs_err("error %ld while adding inode %lu node", + PTR_ERR(fscki), (unsigned long)inum); + return fscki; + } + + return fscki; +} + +/** + * check_leaf - check leaf node. + * @c: UBIFS file-system description object + * @zbr: zbranch of the leaf node to check + * @priv: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which is called for + * every single leaf node while walking the indexing tree. 
It checks that the + * leaf node referred from the indexing tree exists, has correct CRC, and does + * some other basic validation. This function is also responsible for building + * an RB-tree of inodes - it adds all inodes into the RB-tree. It also + * calculates reference count, size, etc for each inode in order to later + * compare them to the information stored inside the inodes and detect possible + * inconsistencies. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + ino_t inum; + void *node; + struct ubifs_ch *ch; + int err, type = key_type(c, &zbr->key); + struct fsck_inode *fscki; + + if (zbr->len < UBIFS_CH_SZ) { + ubifs_err("bad leaf length %d (LEB %d:%d)", + zbr->len, zbr->lnum, zbr->offs); + return -EINVAL; + } + + node = kmalloc(zbr->len, GFP_NOFS); + if (!node) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) { + ubifs_err("cannot read leaf node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + goto out_free; + } + + /* If this is an inode node, add it to RB-tree of inodes */ + if (type == UBIFS_INO_KEY) { + fscki = add_inode(c, priv, node); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while adding inode node", err); + goto out_dump; + } + goto out; + } + + if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY && + type != UBIFS_DATA_KEY) { + ubifs_err("unexpected node type %d at LEB %d:%d", + type, zbr->lnum, zbr->offs); + err = -EINVAL; + goto out_free; + } + + ch = node; + if (le64_to_cpu(ch->sqnum) > c->max_sqnum) { + ubifs_err("too high sequence number, max. is %llu", + c->max_sqnum); + err = -EINVAL; + goto out_dump; + } + + if (type == UBIFS_DATA_KEY) { + long long blk_offs; + struct ubifs_data_node *dn = node; + + /* + * Search the inode node this data node belongs to and insert + * it to the RB-tree of inodes. + */ + inum = key_inum_flash(c, &dn->key); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while processing data node and " + "trying to find inode node %lu", + err, (unsigned long)inum); + goto out_dump; + } + + /* Make sure the data node is within inode size */ + blk_offs = key_block_flash(c, &dn->key); + blk_offs <<= UBIFS_BLOCK_SHIFT; + blk_offs += le32_to_cpu(dn->size); + if (blk_offs > fscki->size) { + ubifs_err("data node at LEB %d:%d is not within inode " + "size %lld", zbr->lnum, zbr->offs, + fscki->size); + err = -EINVAL; + goto out_dump; + } + } else { + int nlen; + struct ubifs_dent_node *dent = node; + struct fsck_inode *fscki1; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + /* + * Search the inode node this entry refers to and the parent + * inode node and insert them to the RB-tree of inodes. 
+ */ + inum = le64_to_cpu(dent->inum); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while processing entry node and " + "trying to find inode node %lu", + err, (unsigned long)inum); + goto out_dump; + } + + /* Count how many direntries or xentries refers this inode */ + fscki->references += 1; + + inum = key_inum_flash(c, &dent->key); + fscki1 = read_add_inode(c, priv, inum); + if (IS_ERR(fscki1)) { + err = PTR_ERR(fscki1); + ubifs_err("error %d while processing entry node and " + "trying to find parent inode node %lu", + err, (unsigned long)inum); + goto out_dump; + } + + nlen = le16_to_cpu(dent->nlen); + if (type == UBIFS_XENT_KEY) { + fscki1->calc_xcnt += 1; + fscki1->calc_xsz += CALC_DENT_SIZE(nlen); + fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size); + fscki1->calc_xnms += nlen; + } else { + fscki1->calc_sz += CALC_DENT_SIZE(nlen); + if (dent->type == UBIFS_ITYPE_DIR) + fscki1->calc_cnt += 1; + } + } + +out: + kfree(node); + return 0; + +out_dump: + ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, node); +out_free: + kfree(node); + return err; +} + +/** + * free_inodes - free RB-tree of inodes. + * @fsckd: FS checking information + */ +static void free_inodes(struct fsck_data *fsckd) +{ + struct rb_node *this = fsckd->inodes.rb_node; + struct fsck_inode *fscki; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &fscki->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(fscki); + } + } +} + +/** + * check_inodes - checks all inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which walks the + * RB-tree of inodes after the index scan has been finished, and checks that + * inode nlink, size, etc are correct. Returns zero if inodes are fine, + * %-EINVAL if not, and a negative error code in case of failure. + */ +static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + struct rb_node *this = rb_first(&fsckd->inodes); + + while (this) { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_next(this); + + if (S_ISDIR(fscki->mode)) { + /* + * Directories have to have exactly one reference (they + * cannot have hardlinks), although root inode is an + * exception. 
+ */ + if (fscki->inum != UBIFS_ROOT_INO && + fscki->references != 1) { + ubifs_err("directory inode %lu has %d " + "direntries which refer it, but " + "should be 1", + (unsigned long)fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->inum == UBIFS_ROOT_INO && + fscki->references != 0) { + ubifs_err("root inode %lu has non-zero (%d) " + "direntries which refer it", + (unsigned long)fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->calc_sz != fscki->size) { + ubifs_err("directory inode %lu size is %lld, " + "but calculated size is %lld", + (unsigned long)fscki->inum, + fscki->size, fscki->calc_sz); + goto out_dump; + } + if (fscki->calc_cnt != fscki->nlink) { + ubifs_err("directory inode %lu nlink is %d, " + "but calculated nlink is %d", + (unsigned long)fscki->inum, + fscki->nlink, fscki->calc_cnt); + goto out_dump; + } + } else { + if (fscki->references != fscki->nlink) { + ubifs_err("inode %lu nlink is %d, but " + "calculated nlink is %d", + (unsigned long)fscki->inum, + fscki->nlink, fscki->references); + goto out_dump; + } + } + if (fscki->xattr_sz != fscki->calc_xsz) { + ubifs_err("inode %lu has xattr size %u, but " + "calculated size is %lld", + (unsigned long)fscki->inum, fscki->xattr_sz, + fscki->calc_xsz); + goto out_dump; + } + if (fscki->xattr_cnt != fscki->calc_xcnt) { + ubifs_err("inode %lu has %u xattrs, but " + "calculated count is %lld", + (unsigned long)fscki->inum, + fscki->xattr_cnt, fscki->calc_xcnt); + goto out_dump; + } + if (fscki->xattr_nms != fscki->calc_xnms) { + ubifs_err("inode %lu has xattr names' size %u, but " + "calculated names' size is %lld", + (unsigned long)fscki->inum, fscki->xattr_nms, + fscki->calc_xnms); + goto out_dump; + } + } + + return 0; + +out_dump: + /* Read the bad inode and dump it */ + ino_key_init(c, &key, fscki->inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", + (unsigned long)fscki->inum); + return -ENOENT; + } else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", + err, (unsigned long)fscki->inum); + return err; + } + + zbr = &znode->zbranch[n]; + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return err; + } + + ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", + (unsigned long)fscki->inum, zbr->lnum, zbr->offs); + dbg_dump_node(c, ino); + kfree(ino); + return -EINVAL; +} + +/** + * dbg_check_filesystem - check the file-system. + * @c: UBIFS file-system description object + * + * This function checks the file system, namely: + * o makes sure that all leaf nodes exist and their CRCs are correct; + * o makes sure inode nlink, size, xattr size/count are correct (for all + * inodes). + * + * The function reads whole indexing tree and all nodes, so it is pretty + * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if + * not, and a negative error code in case of failure. 
+ */
+int dbg_check_filesystem(struct ubifs_info *c)
+{
+	int err;
+	struct fsck_data fsckd;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_FS))
+		return 0;
+
+	fsckd.inodes = RB_ROOT;
+	err = dbg_walk_index(c, check_leaf, NULL, &fsckd);
+	if (err)
+		goto out_free;
+
+	err = check_inodes(c, &fsckd);
+	if (err)
+		goto out_free;
+
+	free_inodes(&fsckd);
+	return 0;
+
+out_free:
+	ubifs_err("file-system check failed with error %d", err);
+	dump_stack();
+	free_inodes(&fsckd);
+	return err;
+}
+
+/**
+ * dbg_check_data_nodes_order - check that list of data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t blka, blkb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_DATA_NODE) {
+			ubifs_err("bad node type %d", sa->type);
+			dbg_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sb->type != UBIFS_DATA_NODE) {
+			ubifs_err("bad node type %d", sb->type);
+			dbg_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err("larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		blka = key_block(c, &sa->key);
+		blkb = key_block(c, &sb->key);
+
+		if (blka > blkb) {
+			ubifs_err("larger block %u goes before %u", blka, blkb);
+			goto error_dump;
+		}
+		if (blka == blkb) {
+			ubifs_err("two data nodes for the same block");
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	dbg_dump_node(c, sa->node);
+	dbg_dump_node(c, sb->node);
+	return -EINVAL;
+}
+
+/**
+ * dbg_check_nondata_nodes_order - check that list of non-data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of non-data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t hasha, hashb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
+		    sa->type != UBIFS_XENT_NODE) {
+			ubifs_err("bad node type %d", sa->type);
+			dbg_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE &&
+		    sb->type != UBIFS_XENT_NODE) {
+			ubifs_err("bad node type %d", sb->type);
+			dbg_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			ubifs_err("non-inode node goes before inode node");
+			goto error_dump;
+		}
+
+		if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
+			continue;
+
+		if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			/* Inode nodes are sorted in descending size order */
+			if (sa->len < sb->len) {
+				ubifs_err("smaller inode node goes first");
+				goto error_dump;
+			}
+			continue;
+		}
+
+		/*
+		 * This is either a dentry or xentry, which should be sorted in
+		 * ascending (parent ino, hash) order.
+		 */
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err("larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		hasha = key_block(c, &sa->key);
+		hashb = key_block(c, &sb->key);
+
+		if (hasha > hashb) {
+			ubifs_err("larger hash %u goes before %u",
+				  hasha, hashb);
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	ubifs_msg("dumping first node");
+	dbg_dump_node(c, sa->node);
+	ubifs_msg("dumping second node");
+	dbg_dump_node(c, sb->node);
+	return -EINVAL;
+}
+
+int dbg_force_in_the_gaps(void)
+{
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	return !(random32() & 7);
+}
+
+/* Failure mode for recovery testing */
+
+#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d))
+
+struct failure_mode_info {
+	struct list_head list;
+	struct ubifs_info *c;
+};
+
+static LIST_HEAD(fmi_list);
+static DEFINE_SPINLOCK(fmi_lock);
+
+static unsigned int next;
+
+static int simple_rand(void)
+{
+	if (next == 0)
+		next = current->pid;
+	next = next * 1103515245 + 12345;
+	return (next >> 16) & 32767;
+}
+
+static void failure_mode_init(struct ubifs_info *c)
+{
+	struct failure_mode_info *fmi;
+
+	fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
+	if (!fmi) {
+		ubifs_err("Failed to register failure mode - no memory");
+		return;
+	}
+	fmi->c = c;
+	spin_lock(&fmi_lock);
+	list_add_tail(&fmi->list, &fmi_list);
+	spin_unlock(&fmi_lock);
+}
+
+static void failure_mode_exit(struct ubifs_info *c)
+{
+	struct failure_mode_info *fmi, *tmp;
+
+	spin_lock(&fmi_lock);
+	list_for_each_entry_safe(fmi, tmp, &fmi_list, list)
+		if (fmi->c == c) {
+			list_del(&fmi->list);
+			kfree(fmi);
+		}
+	spin_unlock(&fmi_lock);
+}
+
+static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc)
+{
+	struct failure_mode_info *fmi;
+
+	spin_lock(&fmi_lock);
+	list_for_each_entry(fmi, &fmi_list, list)
+		if (fmi->c->ubi == desc) {
+			struct ubifs_info *c = fmi->c;
+
+			spin_unlock(&fmi_lock);
+			return c;
+		}
+	spin_unlock(&fmi_lock);
+	return NULL;
+}
+
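+/*
+ * The helpers below emulate media failures pseudo-randomly for recovery
+ * testing and are only active when 'dbg_failure_mode' is set. 'simple_rand()'
+ * above is a small linear congruential generator seeded with the current pid
+ * and returning values in the [0, 32767] range, so 'chance(n, d)' evaluates
+ * to true with a probability of roughly n/d. 'do_fail()' uses it to decide
+ * when to start failing: on the first call it may pick a time-based or a
+ * call-count-based delay, and after that delay each I/O is failed with a
+ * per-LEB-type probability. Once a failure has been injected, the
+ * file-system stays in failure mode and further I/O returns %-EROFS.
+ */
+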
+static int in_failure_mode(struct ubi_volume_desc *desc) +{ + struct ubifs_info *c = dbg_find_info(desc); + + if (c && dbg_failure_mode) + return c->dbg->failure_mode; + return 0; +} + +static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) +{ + struct ubifs_info *c = dbg_find_info(desc); + struct ubifs_debug_info *d; + + if (!c || !dbg_failure_mode) + return 0; + d = c->dbg; + if (d->failure_mode) + return 1; + if (!d->fail_cnt) { + /* First call - decide delay to failure */ + if (chance(1, 2)) { + unsigned int delay = 1 << (simple_rand() >> 11); + + if (chance(1, 2)) { + d->fail_delay = 1; + d->fail_timeout = jiffies + + msecs_to_jiffies(delay); + dbg_rcvry("failing after %ums", delay); + } else { + d->fail_delay = 2; + d->fail_cnt_max = delay; + dbg_rcvry("failing after %u calls", delay); + } + } + d->fail_cnt += 1; + } + /* Determine if failure delay has expired */ + if (d->fail_delay == 1) { + if (time_before(jiffies, d->fail_timeout)) + return 0; + } else if (d->fail_delay == 2) + if (d->fail_cnt++ < d->fail_cnt_max) + return 0; + if (lnum == UBIFS_SB_LNUM) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in super block LEB %d", lnum); + } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in master LEB %d", lnum); + } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { + if (write) { + if (chance(99, 100)) + return 0; + } else if (chance(399, 400)) + return 0; + dbg_rcvry("failing in log LEB %d", lnum); + } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { + if (write) { + if (chance(7, 8)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in LPT LEB %d", lnum); + } else if (lnum >= c->orph_first && lnum <= c->orph_last) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(9, 10)) + return 0; + dbg_rcvry("failing in orphan LEB %d", lnum); + } else if (lnum == c->ihead_lnum) { + if (chance(99, 100)) + return 0; + dbg_rcvry("failing in index head LEB %d", lnum); + } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { + if (chance(9, 10)) + return 0; + dbg_rcvry("failing in GC head LEB %d", lnum); + } else if (write && !RB_EMPTY_ROOT(&c->buds) && + !ubifs_search_bud(c, lnum)) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in non-bud LEB %d", lnum); + } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || + c->cmt_state == COMMIT_RUNNING_REQUIRED) { + if (chance(999, 1000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit running", lnum); + } else { + if (chance(9999, 10000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit not running", lnum); + } + ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); + d->failure_mode = 1; + dump_stack(); + return 1; +} + +static void cut_data(const void *buf, int len) +{ + int flen, i; + unsigned char *p = (void *)buf; + + flen = (len * (long long)simple_rand()) >> 15; + for (i = flen; i < len; i++) + p[i] = 0xff; +} + +int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check) +{ + if (in_failure_mode(desc)) + return -EROFS; + return ubi_leb_read(desc, lnum, buf, offset, len, check); +} + +int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype) +{ + int err, failing; + + if (in_failure_mode(desc)) + return -EROFS; + failing = do_fail(desc, lnum, 1); + if (failing) + cut_data(buf, len); + err = ubi_leb_write(desc, lnum, buf, 
offset, len, dtype); + if (err) + return err; + if (failing) + return -EROFS; + return 0; +} + +int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 1)) + return -EROFS; + err = ubi_leb_change(desc, lnum, buf, len, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 1)) + return -EROFS; + return 0; +} + +int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EROFS; + err = ubi_leb_erase(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EROFS; + return 0; +} + +int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EROFS; + err = ubi_leb_unmap(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EROFS; + return 0; +} + +int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) +{ + if (in_failure_mode(desc)) + return -EROFS; + return ubi_is_mapped(desc, lnum); +} + +int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EROFS; + err = ubi_leb_map(desc, lnum, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EROFS; + return 0; +} + +/** + * ubifs_debugging_init - initialize UBIFS debugging. + * @c: UBIFS file-system description object + * + * This function initializes debugging-related data for the file system. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_debugging_init(struct ubifs_info *c) +{ + c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL); + if (!c->dbg) + return -ENOMEM; + + failure_mode_init(c); + return 0; +} + +/** + * ubifs_debugging_exit - free debugging data. + * @c: UBIFS file-system description object + */ +void ubifs_debugging_exit(struct ubifs_info *c) +{ + failure_mode_exit(c); + kfree(c->dbg); +} + +/* + * Root directory for UBIFS stuff in debugfs. Contains sub-directories which + * contain the stuff specific to particular file-system mounts. + */ +static struct dentry *dfs_rootdir; + +/** + * dbg_debugfs_init - initialize debugfs file-system. + * + * UBIFS uses debugfs file-system to expose various debugging knobs to + * user-space. This function creates "ubifs" directory in the debugfs + * file-system. Returns zero in case of success and a negative error code in + * case of failure. + */ +int dbg_debugfs_init(void) +{ + dfs_rootdir = debugfs_create_dir("ubifs", NULL); + if (IS_ERR(dfs_rootdir)) { + int err = PTR_ERR(dfs_rootdir); + ubifs_err("cannot create \"ubifs\" debugfs directory, " + "error %d\n", err); + return err; + } + + return 0; +} + +/** + * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system. 
+ */ +void dbg_debugfs_exit(void) +{ + debugfs_remove(dfs_rootdir); +} + +static int open_debugfs_file(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return nonseekable_open(inode, file); +} + +static ssize_t write_debugfs_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + + if (file->f_path.dentry == d->dfs_dump_lprops) + dbg_dump_lprops(c); + else if (file->f_path.dentry == d->dfs_dump_budg) + dbg_dump_budg(c, &c->bi); + else if (file->f_path.dentry == d->dfs_dump_tnc) { + mutex_lock(&c->tnc_mutex); + dbg_dump_tnc(c); + mutex_unlock(&c->tnc_mutex); + } else + return -EINVAL; + + return count; +} + +static const struct file_operations dfs_fops = { + .open = open_debugfs_file, + .write = write_debugfs_file, + .owner = THIS_MODULE, + .llseek = no_llseek, +}; + +/** + * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance. + * @c: UBIFS file-system description object + * + * This function creates all debugfs files for this instance of UBIFS. Returns + * zero in case of success and a negative error code in case of failure. + * + * Note, the only reason we have not merged this function with the + * 'ubifs_debugging_init()' function is because it is better to initialize + * debugfs interfaces at the very end of the mount process, and remove them at + * the very beginning of the un-mount process. + */ +int dbg_debugfs_init_fs(struct ubifs_info *c) +{ + int err; + const char *fname; + struct dentry *dent; + struct ubifs_debug_info *d = c->dbg; + + sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + fname = d->dfs_dir_name; + dent = debugfs_create_dir(fname, dfs_rootdir); + if (IS_ERR_OR_NULL(dent)) + goto out; + d->dfs_dir = dent; + + fname = "dump_lprops"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_lprops = dent; + + fname = "dump_budg"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_budg = dent; + + fname = "dump_tnc"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_tnc = dent; + + return 0; + +out_remove: + debugfs_remove_recursive(d->dfs_dir); +out: + err = dent ? PTR_ERR(dent) : -ENODEV; + ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", + fname, err); + return err; +} + +/** + * dbg_debugfs_exit_fs - remove all debugfs files. + * @c: UBIFS file-system description object + */ +void dbg_debugfs_exit_fs(struct ubifs_info *c) +{ + debugfs_remove_recursive(c->dbg->dfs_dir); +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h new file mode 100644 index 00000000..fd75b635 --- /dev/null +++ b/fs/ubifs/debug.h @@ -0,0 +1,445 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details.
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_DEBUG_H__ +#define __UBIFS_DEBUG_H__ + +/* Checking helper functions */ +typedef int (*dbg_leaf_callback)(struct ubifs_info *c, + struct ubifs_zbranch *zbr, void *priv); +typedef int (*dbg_znode_callback)(struct ubifs_info *c, + struct ubifs_znode *znode, void *priv); + +#ifdef CONFIG_UBIFS_FS_DEBUG + +#include + +/** + * ubifs_debug_info - per-FS debugging information. + * @old_zroot: old index root - used by 'dbg_check_old_index()' + * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' + * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' + * @failure_mode: failure mode for recovery testing + * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls + * @fail_timeout: time in jiffies when delay of failure mode expires + * @fail_cnt: current number of calls to failure mode I/O functions + * @fail_cnt_max: number of calls by which to delay failure mode + * @chk_lpt_sz: used by LPT tree size checker + * @chk_lpt_sz2: used by LPT tree size checker + * @chk_lpt_wastage: used by LPT tree size checker + * @chk_lpt_lebs: used by LPT tree size checker + * @new_nhead_offs: used by LPT tree size checker + * @new_ihead_lnum: used by debugging to check @c->ihead_lnum + * @new_ihead_offs: used by debugging to check @c->ihead_offs + * + * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') + * @saved_bi: saved budgeting information + * @saved_free: saved amount of free space + * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt + * + * @dfs_dir_name: name of debugfs directory containing this file-system's files + * @dfs_dir: direntry object of the file-system debugfs directory + * @dfs_dump_lprops: "dump lprops" debugfs knob + * @dfs_dump_budg: "dump budgeting information" debugfs knob + * @dfs_dump_tnc: "dump TNC" debugfs knob + */ +struct ubifs_debug_info { + struct ubifs_zbranch old_zroot; + int old_zroot_level; + unsigned long long old_zroot_sqnum; + int failure_mode; + int fail_delay; + unsigned long fail_timeout; + unsigned int fail_cnt; + unsigned int fail_cnt_max; + long long chk_lpt_sz; + long long chk_lpt_sz2; + long long chk_lpt_wastage; + int chk_lpt_lebs; + int new_nhead_offs; + int new_ihead_lnum; + int new_ihead_offs; + + struct ubifs_lp_stats saved_lst; + struct ubifs_budg_info saved_bi; + long long saved_free; + int saved_idx_gc_cnt; + + char dfs_dir_name[100]; + struct dentry *dfs_dir; + struct dentry *dfs_dump_lprops; + struct dentry *dfs_dump_budg; + struct dentry *dfs_dump_tnc; +}; + +#define ubifs_assert(expr) do { \ + if (unlikely(!(expr))) { \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ + dbg_dump_stack(); \ + } \ +} while (0) + +#define ubifs_assert_cmt_locked(c) do { \ + if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ + up_write(&(c)->commit_sem); \ + printk(KERN_CRIT "commit lock is not locked!\n"); \ + ubifs_assert(0); \ + } \ +} while (0) + +#define dbg_dump_stack() dump_stack() + +#define dbg_err(fmt, ...) 
do { \ + spin_lock(&dbg_lock); \ + ubifs_err(fmt, ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) + +const char *dbg_key_str0(const struct ubifs_info *c, + const union ubifs_key *key); +const char *dbg_key_str1(const struct ubifs_info *c, + const union ubifs_key *key); + +/* + * TODO: these macros are now broken because there is no locking around them + * and we use a global buffer for the key string. This means that in case of + * concurrent execution we will end up with incorrect and messy key strings. + */ +#define DBGKEY(key) dbg_key_str0(c, (key)) +#define DBGKEY1(key) dbg_key_str1(c, (key)) + +#define ubifs_dbg_msg(type, fmt, ...) \ + pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) + +/* Just debugging messages not related to any specific UBIFS subsystem */ +#define dbg_msg(fmt, ...) \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) + +/* General messages */ +#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) +/* Additional journal messages */ +#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) +/* Additional TNC messages */ +#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) +/* Additional lprops messages */ +#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) +/* Additional LEB find messages */ +#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) +/* Additional mount messages */ +#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) +/* Additional I/O messages */ +#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) +/* Additional commit messages */ +#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__) +/* Additional budgeting messages */ +#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__) +/* Additional log messages */ +#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__) +/* Additional gc messages */ +#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__) +/* Additional scan messages */ +#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__) +/* Additional recovery messages */ +#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) + +/* + * Debugging check flags. + * + * UBIFS_CHK_GEN: general checks + * UBIFS_CHK_TNC: check TNC + * UBIFS_CHK_IDX_SZ: check index size + * UBIFS_CHK_ORPH: check orphans + * UBIFS_CHK_OLD_IDX: check the old index + * UBIFS_CHK_LPROPS: check lprops + * UBIFS_CHK_FS: check the file-system + */ +enum { + UBIFS_CHK_GEN = 0x1, + UBIFS_CHK_TNC = 0x2, + UBIFS_CHK_IDX_SZ = 0x4, + UBIFS_CHK_ORPH = 0x8, + UBIFS_CHK_OLD_IDX = 0x10, + UBIFS_CHK_LPROPS = 0x20, + UBIFS_CHK_FS = 0x40, +}; + +/* + * Special testing flags.
+ * + * UBIFS_TST_RCVRY: failure mode for recovery testing + */ +enum { + UBIFS_TST_RCVRY = 0x4, +}; + +extern spinlock_t dbg_lock; + +extern unsigned int ubifs_msg_flags; +extern unsigned int ubifs_chk_flags; +extern unsigned int ubifs_tst_flags; + +int ubifs_debugging_init(struct ubifs_info *c); +void ubifs_debugging_exit(struct ubifs_info *c); + +/* Dump functions */ +const char *dbg_ntype(int type); +const char *dbg_cstate(int cmt_state); +const char *dbg_jhead(int jhead); +const char *dbg_get_key_dump(const struct ubifs_info *c, + const union ubifs_key *key); +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); +void dbg_dump_node(const struct ubifs_info *c, const void *node); +void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, + int offs); +void dbg_dump_budget_req(const struct ubifs_budget_req *req); +void dbg_dump_lstats(const struct ubifs_lp_stats *lst); +void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); +void dbg_dump_lprops(struct ubifs_info *c); +void dbg_dump_lpt_info(struct ubifs_info *c); +void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode); +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip); +void dbg_dump_tnc(struct ubifs_info *c); +void dbg_dump_index(struct ubifs_info *c); +void dbg_dump_lpt_lebs(const struct ubifs_info *c); + +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv); + +/* Checking functions */ +void dbg_save_space_info(struct ubifs_info *c); +int dbg_check_space_info(struct ubifs_info *c); +int dbg_check_lprops(struct ubifs_info *c); +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_cats(struct ubifs_info *c); +int dbg_check_ltab(struct ubifs_info *c); +int dbg_chk_lpt_free_spc(struct ubifs_info *c); +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); +int dbg_check_synced_i_size(struct inode *inode); +int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); +int dbg_check_tnc(struct ubifs_info *c, int extra); +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); +int dbg_check_filesystem(struct ubifs_info *c); +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos); +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col); +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size); +int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); +int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); + +/* Force the use of in-the-gaps method for testing */ +static inline int dbg_force_in_the_gaps_enabled(void) +{ + return ubifs_chk_flags & UBIFS_CHK_GEN; +} +int dbg_force_in_the_gaps(void); + +/* Failure mode for recovery testing */ +#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) + +#ifndef UBIFS_DBG_PRESERVE_UBI +#define ubi_leb_read dbg_leb_read +#define ubi_leb_write dbg_leb_write +#define ubi_leb_change dbg_leb_change +#define ubi_leb_erase dbg_leb_erase +#define ubi_leb_unmap dbg_leb_unmap +#define 
ubi_is_mapped dbg_is_mapped +#define ubi_leb_map dbg_leb_map +#endif + +int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check); +int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype); +int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype); +int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum); +int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum); +int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum); +int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype); + +static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf, + int offset, int len) +{ + return dbg_leb_read(desc, lnum, buf, offset, len, 0); +} + +static inline int dbg_write(struct ubi_volume_desc *desc, int lnum, + const void *buf, int offset, int len) +{ + return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); +} + +static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, + const void *buf, int len) +{ + return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); +} + +/* Debugfs-related stuff */ +int dbg_debugfs_init(void); +void dbg_debugfs_exit(void); +int dbg_debugfs_init_fs(struct ubifs_info *c); +void dbg_debugfs_exit_fs(struct ubifs_info *c); + +#else /* !CONFIG_UBIFS_FS_DEBUG */ + +/* Use "if (0)" to make compiler check arguments even if debugging is off */ +#define ubifs_assert(expr) do { \ + if (0 && (expr)) \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ +} while (0) + +#define dbg_err(fmt, ...) do { \ + if (0) \ + ubifs_err(fmt, ##__VA_ARGS__); \ +} while (0) + +#define ubifs_dbg_msg(fmt, ...) do { \ + if (0) \ + pr_debug(fmt "\n", ##__VA_ARGS__); \ +} while (0) + +#define dbg_dump_stack() +#define ubifs_assert_cmt_locked(c) + +#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) 
ubifs_dbg_msg(fmt, ##__VA_ARGS__) + +#define DBGKEY(key) ((char *)(key)) +#define DBGKEY1(key) ((char *)(key)) + +static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } +static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } +static inline const char *dbg_ntype(int type) { return ""; } +static inline const char *dbg_cstate(int cmt_state) { return ""; } +static inline const char *dbg_jhead(int jhead) { return ""; } +static inline const char * +dbg_get_key_dump(const struct ubifs_info *c, + const union ubifs_key *key) { return ""; } +static inline void dbg_dump_inode(const struct ubifs_info *c, + const struct inode *inode) { return; } +static inline void dbg_dump_node(const struct ubifs_info *c, + const void *node) { return; } +static inline void dbg_dump_lpt_node(const struct ubifs_info *c, + void *node, int lnum, + int offs) { return; } +static inline void +dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } +static inline void +dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } +static inline void +dbg_dump_budg(struct ubifs_info *c, + const struct ubifs_budg_info *bi) { return; } +static inline void dbg_dump_lprop(const struct ubifs_info *c, + const struct ubifs_lprops *lp) { return; } +static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } +static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } +static inline void dbg_dump_leb(const struct ubifs_info *c, + int lnum) { return; } +static inline void +dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) { return; } +static inline void dbg_dump_heap(struct ubifs_info *c, + struct ubifs_lpt_heap *heap, + int cat) { return; } +static inline void dbg_dump_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, + int iip) { return; } +static inline void dbg_dump_tnc(struct ubifs_info *c) { return; } +static inline void dbg_dump_index(struct ubifs_info *c) { return; } +static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c) { return; } + +static inline int dbg_walk_index(struct ubifs_info *c, + dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, + void *priv) { return 0; } +static inline void dbg_save_space_info(struct ubifs_info *c) { return; } +static inline int dbg_check_space_info(struct ubifs_info *c) { return 0; } +static inline int dbg_check_lprops(struct ubifs_info *c) { return 0; } +static inline int +dbg_old_index_check_init(struct ubifs_info *c, + struct ubifs_zbranch *zroot) { return 0; } +static inline int +dbg_check_old_index(struct ubifs_info *c, + struct ubifs_zbranch *zroot) { return 0; } +static inline int dbg_check_cats(struct ubifs_info *c) { return 0; } +static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; } +static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; } +static inline int dbg_chk_lpt_sz(struct ubifs_info *c, + int action, int len) { return 0; } +static inline int dbg_check_synced_i_size(struct inode *inode) { return 0; } +static inline int dbg_check_dir_size(struct ubifs_info *c, + const struct inode *dir) { return 0; } +static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; } +static inline int dbg_check_idx_size(struct ubifs_info *c, + long long idx_size) { return 0; } +static inline int dbg_check_filesystem(struct ubifs_info *c) { return 0; } +static inline void dbg_check_heap(struct ubifs_info *c, + struct ubifs_lpt_heap *heap, + int cat, int add_pos) { return; } +static inline int 
dbg_check_lpt_nodes(struct ubifs_info *c, + struct ubifs_cnode *cnode, int row, int col) { return 0; } +static inline int dbg_check_inode_size(struct ubifs_info *c, + const struct inode *inode, + loff_t size) { return 0; } +static inline int +dbg_check_data_nodes_order(struct ubifs_info *c, + struct list_head *head) { return 0; } +static inline int +dbg_check_nondata_nodes_order(struct ubifs_info *c, + struct list_head *head) { return 0; } + +static inline int dbg_force_in_the_gaps(void) { return 0; } +#define dbg_force_in_the_gaps_enabled() 0 +#define dbg_failure_mode 0 + +static inline int dbg_debugfs_init(void) { return 0; } +static inline void dbg_debugfs_exit(void) { return; } +static inline int dbg_debugfs_init_fs(struct ubifs_info *c) { return 0; } +static inline int dbg_debugfs_exit_fs(struct ubifs_info *c) { return 0; } + +#endif /* !CONFIG_UBIFS_FS_DEBUG */ +#endif /* !__UBIFS_DEBUG_H__ */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c new file mode 100644 index 00000000..ef5abd38 --- /dev/null +++ b/fs/ubifs/dir.c @@ -0,0 +1,1206 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + * Zoltan Sogor + */ + +/* + * This file implements directory operations. + * + * All FS operations in this file allocate budget before writing anything to the + * media. If they fail to allocate it, the error is returned. The only + * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even + * if they are unable to allocate the budget, because deletion %-ENOSPC failure is + * not what users are usually ready to get. UBIFS budgeting subsystem has some + * space reserved for these purposes. + * + * All operations in this file write all inodes which they change straight + * away, instead of marking them dirty. For example, 'ubifs_link()' changes + * @i_size of the parent inode and writes the parent inode together with the + * target inode. This was done to simplify file-system recovery which would + * otherwise be very difficult to do. The only exception is rename, which marks + * the re-named inode dirty (because its @i_ctime is updated) but does not + * write it straight away. + */ + +#include "ubifs.h" + +/** + * inherit_flags - inherit flags of the parent inode. + * @dir: parent inode + * @mode: new inode mode flags + * + * This is a helper function for 'ubifs_new_inode()' which inherits flags of the + * parent directory inode @dir. UBIFS inodes inherit the following flags: + * o %UBIFS_COMPR_FL, which is useful to switch compression on/off on a + * sub-directory basis; + * o %UBIFS_SYNC_FL - useful for the same reasons; + * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories. + * + * This function returns the inherited flags.
+ */ +static int inherit_flags(const struct inode *dir, int mode) +{ + int flags; + const struct ubifs_inode *ui = ubifs_inode(dir); + + if (!S_ISDIR(dir->i_mode)) + /* + * The parent is not a directory, which means that an extended + * attribute inode is being created. No flags. + */ + return 0; + + flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL); + if (!S_ISDIR(mode)) + /* The "DIRSYNC" flag only applies to directories */ + flags &= ~UBIFS_DIRSYNC_FL; + return flags; +} + +/** + * ubifs_new_inode - allocate new UBIFS inode object. + * @c: UBIFS file-system description object + * @dir: parent directory inode + * @mode: inode mode flags + * + * This function finds an unused inode number, allocates new inode and + * initializes it. Returns new inode in case of success and an error code in + * case of failure. + */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + int mode) +{ + struct inode *inode; + struct ubifs_inode *ui; + + inode = new_inode(c->vfs_sb); + ui = ubifs_inode(inode); + if (!inode) + return ERR_PTR(-ENOMEM); + + /* + * Set 'S_NOCMTIME' to prevent VFS from updating [mc]time of inodes and + * marking them dirty in the file write path (see 'file_update_time()'). + * UBIFS has to fully control "clean <-> dirty" transitions of inodes + * to make budgeting work. + */ + inode->i_flags |= (S_NOCMTIME); + + inode_init_owner(inode, dir, mode); + inode->i_mtime = inode->i_atime = inode->i_ctime = + ubifs_current_time(inode); + inode->i_mapping->nrpages = 0; + /* Disable readahead */ + inode->i_mapping->backing_dev_info = &c->bdi; + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ; + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + break; + case S_IFSOCK: + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + inode->i_op = &ubifs_file_inode_operations; + break; + default: + BUG(); + } + + ui->flags = inherit_flags(dir, mode); + ubifs_set_inode_flags(inode); + if (S_ISREG(mode)) + ui->compr_type = c->default_compr; + else + ui->compr_type = UBIFS_COMPR_NONE; + ui->synced_i_size = 0; + + spin_lock(&c->cnt_lock); + /* Inode number overflow is currently not supported */ + if (c->highest_inum >= INUM_WARN_WATERMARK) { + if (c->highest_inum >= INUM_WATERMARK) { + spin_unlock(&c->cnt_lock); + ubifs_err("out of inode numbers"); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } + ubifs_warn("running out of inode numbers (current %lu, max %d)", + (unsigned long)c->highest_inum, INUM_WATERMARK); + } + + inode->i_ino = ++c->highest_inum; + /* + * The creation sequence number remains with this inode for its + * lifetime. All nodes for this inode have a greater sequence number, + * and so it is possible to distinguish obsolete nodes belonging to a + * previous incarnation of the same inode number - for example, for the + * purpose of rebuilding the index.
+ */ + ui->creat_sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + return inode; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) +{ + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (le16_to_cpu(dent->nlen) != nm->len) + return -EINVAL; + if (memcmp(dent->name, nm->name, nm->len)) + return -EINVAL; + return 0; +} + +#else + +#define dbg_check_name(dent, nm) 0 + +#endif + +static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int err; + union ubifs_key key; + struct inode *inode = NULL; + struct ubifs_dent_node *dent; + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("'%.*s' in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, dir->i_ino); + + if (dentry->d_name.len > UBIFS_MAX_NLEN) + return ERR_PTR(-ENAMETOOLONG); + + dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent) + return ERR_PTR(-ENOMEM); + + dent_key_init(c, &key, dir->i_ino, &dentry->d_name); + + err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); + if (err) { + if (err == -ENOENT) { + dbg_gen("not found"); + goto done; + } + goto out; + } + + if (dbg_check_name(dent, &dentry->d_name)) { + err = -EINVAL; + goto out; + } + + inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); + if (IS_ERR(inode)) { + /* + * This should not happen. Probably the file-system needs + * checking. + */ + err = PTR_ERR(inode); + ubifs_err("dead directory entry '%.*s', error %d", + dentry->d_name.len, dentry->d_name.name, err); + ubifs_ro_mode(c, err); + goto out; + } + +done: + kfree(dent); + /* + * Note, d_splice_alias() would be required instead if we supported + * NFS. + */ + d_add(dentry, inode); + return NULL; + +out: + kfree(dent); + return ERR_PTR(err); +} + +static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1 }; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + + /* + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + */ + + dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + ubifs_err("cannot create regular file, error %d", err); + return err; +} + +/** + * vfs_dent_type - get VFS directory entry type. + * @type: UBIFS directory entry type + * + * This function converts UBIFS directory entry type into VFS directory entry + * type. 
+ */ +static unsigned int vfs_dent_type(uint8_t type) +{ + switch (type) { + case UBIFS_ITYPE_REG: + return DT_REG; + case UBIFS_ITYPE_DIR: + return DT_DIR; + case UBIFS_ITYPE_LNK: + return DT_LNK; + case UBIFS_ITYPE_BLK: + return DT_BLK; + case UBIFS_ITYPE_CHR: + return DT_CHR; + case UBIFS_ITYPE_FIFO: + return DT_FIFO; + case UBIFS_ITYPE_SOCK: + return DT_SOCK; + default: + BUG(); + } + return 0; +} + +/* + * The classical Unix view for directory is that it is a linear array of + * (name, inode number) entries. Linux/VFS assumes this model as well. + * Particularly, 'readdir()' call wants us to return a directory entry offset + * which later may be used to continue 'readdir()'ing the directory or to + * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this + * model because directory entries are identified by keys, which may collide. + * + * UBIFS uses directory entry hash value for directory offsets, so + * 'seekdir()'/'telldir()' may not always work because of possible key + * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work + * properly by means of saving full directory entry name in the private field + * of the file description object. + * + * This means that UBIFS cannot support NFS which requires full + * 'seekdir()'/'telldir()' support. + */ +static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int err, over = 0; + struct qstr nm; + union ubifs_key key; + struct ubifs_dent_node *dent; + struct inode *dir = file->f_path.dentry->d_inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); + + if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) + /* + * The directory was seek'ed to a senseless position or there + * are no more entries. + */ + return 0; + + /* File positions 0 and 1 correspond to "." and ".." */ + if (file->f_pos == 0) { + ubifs_assert(!file->private_data); + over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); + if (over) + return 0; + file->f_pos = 1; + } + + if (file->f_pos == 1) { + ubifs_assert(!file->private_data); + over = filldir(dirent, "..", 2, 1, + parent_ino(file->f_path.dentry), DT_DIR); + if (over) + return 0; + + /* Find the first entry in TNC and save it */ + lowest_dent_key(c, &key, dir->i_ino); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + dent = file->private_data; + if (!dent) { + /* + * The directory was seek'ed to and is now readdir'ed. + * Find the entry corresponding to @file->f_pos or the + * closest one. 
+ */ + dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + while (1) { + dbg_gen("feed '%s', ino %llu, new f_pos %#x", + dent->name, (unsigned long long)le64_to_cpu(dent->inum), + key_hash_flash(c, &dent->key)); + ubifs_assert(le64_to_cpu(dent->ch.sqnum) > + ubifs_inode(dir)->creat_sqnum); + + nm.len = le16_to_cpu(dent->nlen); + over = filldir(dirent, dent->name, nm.len, file->f_pos, + le64_to_cpu(dent->inum), + vfs_dent_type(dent->type)); + if (over) + return 0; + + /* Switch to the next entry */ + key_read(c, &dent->key, &key); + nm.name = dent->name; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + kfree(file->private_data); + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + cond_resched(); + } + +out: + if (err != -ENOENT) { + ubifs_err("cannot find next direntry, error %d", err); + return err; + } + + kfree(file->private_data); + file->private_data = NULL; + file->f_pos = 2; + return 0; +} + +/* If a directory is seeked, we have to free saved readdir() state */ +static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + kfree(file->private_data); + file->private_data = NULL; + return generic_file_llseek(file, offset, origin); +} + +/* Free saved readdir() state when the directory is closed */ +static int ubifs_dir_release(struct inode *dir, struct file *file) +{ + kfree(file->private_data); + file->private_data = NULL; + return 0; +} + +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = old_dentry->d_inode; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + /* + * Budget request settings: new direntry, changing the target inode, + * changing the parent inode. 
+ */ + + dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, inode->i_ino, + inode->i_nlink, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + lock_2_inodes(dir, inode); + inc_nlink(inode); + ihold(inode); + inode->i_ctime = ubifs_current_time(inode); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(inode); + unlock_2_inodes(dir, inode); + ubifs_release_budget(c, &req); + iput(inode); + return err; +} + +static int ubifs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + + /* + * Budget request settings: deletion direntry, deletion inode (+1 for + * @dirtied_ino), changing the parent directory inode. If budgeting + * fails, go ahead anyway because we have extra space reserved for + * deletions. + */ + + dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, inode->i_ino, + inode->i_nlink, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + drop_nlink(inode); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + inc_nlink(inode); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_dir_empty - check if a directory is empty or not. + * @c: UBIFS file-system description object + * @dir: VFS inode object of the directory to check + * + * This function checks if directory @dir is empty. Returns zero if the + * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes + * in case of of errors. 
+ */ +static int check_dir_empty(struct ubifs_info *c, struct inode *dir) +{ + struct qstr nm = { .name = NULL }; + struct ubifs_dent_node *dent; + union ubifs_key key; + int err; + + lowest_dent_key(c, &key, dir->i_ino); + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -ENOENT) + err = 0; + } else { + kfree(dent); + err = -ENOTEMPTY; + } + return err; +} + +static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + + /* + * Budget request settings: deletion direntry, deletion inode and + * changing the parent inode. If budgeting fails, go ahead anyway + * because we have extra space reserved for deletions. + */ + + dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, + dentry->d_name.name, inode->i_ino, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + err = check_dir_empty(c, dentry->d_inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + clear_nlink(inode); + drop_nlink(dir); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + inc_nlink(dir); + inc_nlink(inode); + inc_nlink(inode); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. 
+ */ + + dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + mutex_lock(&dir_ui->ui_mutex); + insert_inode_hash(inode); + inc_nlink(inode); + inc_nlink(dir); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) { + ubifs_err("cannot create directory, error %d", err); + goto out_cancel; + } + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(dir); + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + union ubifs_dev_desc *dev = NULL; + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, devlen = 0; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(devlen, 8), + .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%.*s' in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, dir->i_ino); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + if (S_ISBLK(mode) || S_ISCHR(mode)) { + dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!dev) + return -ENOMEM; + devlen = ubifs_encode_dev(dev, rdev); + } + + err = ubifs_budget_space(c, &req); + if (err) { + kfree(dev); + return err; + } + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + kfree(dev); + err = PTR_ERR(inode); + goto out_budg; + } + + init_special_inode(inode, inode->i_mode, rdev); + inode->i_size = ubifs_inode(inode)->ui_size = devlen; + ui = ubifs_inode(inode); + ui->data = dev; + ui->data_len = devlen; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, len = strlen(symname); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = ALIGN(len, 8), + .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. 
+ */ + + dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len, + dentry->d_name.name, symname, dir->i_ino); + + if (len > UBIFS_MAX_INO_DATA) + return -ENAMETOOLONG; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + ui = ubifs_inode(inode); + ui->data = kmalloc(len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_inode; + } + + memcpy(ui->data, symname, len); + ((char *)ui->data)[len] = '\0'; + /* + * The terminating zero byte is not written to the flash media and it + * is put just to make later in-memory string processing simpler. Thus, + * data length is @len, not @len + %1. + */ + ui->data_len = len; + inode->i_size = ubifs_inode(inode)->ui_size = len; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); +out_inode: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * lock_3_inodes - a wrapper for locking three UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + * + * This function is used for 'ubifs_rename()' and @inode1 may be the same as + * @inode2 whereas @inode3 may be %NULL. + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + if (inode2 != inode1) + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); + if (inode3) + mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3); +} + +/** + * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename. 
+ * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + */ +static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + if (inode3) + mutex_unlock(&ubifs_inode(inode3)->ui_mutex); + if (inode1 != inode2) + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ubifs_info *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); + int err, release, sync = 0, move = (new_dir != old_dir); + int is_dir = S_ISDIR(old_inode->i_mode); + int unlink = !!new_inode; + int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); + int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, + .dirtied_ino = 3 }; + struct ubifs_budget_req ino_req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct timespec time; + + /* + * Budget request settings: deletion direntry, new direntry, removing + * the old inode, and changing old and new parent directory inodes. + * + * However, this operation also marks the target inode as dirty and + * does not write it, so we allocate budget for the target inode + * separately. + */ + + dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in " + "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, + old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, + new_dentry->d_name.name, new_dir->i_ino); + ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); + ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + if (unlink) + ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + + + if (unlink && is_dir) { + err = check_dir_empty(c, new_inode); + if (err) + return err; + } + + err = ubifs_budget_space(c, &req); + if (err) + return err; + err = ubifs_budget_space(c, &ino_req); + if (err) { + ubifs_release_budget(c, &req); + return err; + } + + lock_3_inodes(old_dir, new_dir, new_inode); + + /* + * Like most other Unix systems, set the @i_ctime for inodes on a + * rename. + */ + time = ubifs_current_time(old_dir); + old_inode->i_ctime = time; + + /* We must adjust parent link count when renaming directories */ + if (is_dir) { + if (move) { + /* + * @old_dir loses a link because we are moving + * @old_inode to a different directory. + */ + drop_nlink(old_dir); + /* + * @new_dir only gains a link if we are not also + * overwriting an existing directory. + */ + if (!unlink) + inc_nlink(new_dir); + } else { + /* + * @old_inode is not moving to a different directory, + * but @old_dir still loses a link if we are + * overwriting an existing directory. + */ + if (unlink) + drop_nlink(old_dir); + } + } + + old_dir->i_size -= old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + old_dir->i_mtime = old_dir->i_ctime = time; + new_dir->i_mtime = new_dir->i_ctime = time; + + /* + * And finally, if we unlinked a direntry which happened to have the + * same name as the moved direntry, we have to decrement @i_nlink of + * the unlinked inode and change its ctime. + */ + if (unlink) { + /* + * Directories cannot have hard-links, so if this is a + * directory, decrement its @i_nlink twice because an empty + * directory has @i_nlink 2. 
+ */ + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = time; + drop_nlink(new_inode); + } else { + new_dir->i_size += new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + + /* + * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode + * is dirty, because this will be done later on at the end of + * 'ubifs_rename()'. + */ + if (IS_SYNC(old_inode)) { + sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); + if (unlink && IS_SYNC(new_inode)) + sync = 1; + } + err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, + sync); + if (err) + goto out_cancel; + + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &req); + + mutex_lock(&old_inode_ui->ui_mutex); + release = old_inode_ui->dirty; + mark_inode_dirty_sync(old_inode); + mutex_unlock(&old_inode_ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &ino_req); + if (IS_SYNC(old_inode)) + err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + return err; + +out_cancel: + if (unlink) { + if (is_dir) + inc_nlink(new_inode); + inc_nlink(new_inode); + } else { + new_dir->i_size -= new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + old_dir->i_size += old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + if (is_dir) { + if (move) { + inc_nlink(old_dir); + if (!unlink) + drop_nlink(new_dir); + } else { + if (unlink) + inc_nlink(old_dir); + } + } + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &ino_req); + ubifs_release_budget(c, &req); + return err; +} + +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + loff_t size; + struct inode *inode = dentry->d_inode; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = UBIFS_BLOCK_SIZE; + stat->size = ui->ui_size; + + /* + * Unfortunately, the 'stat()' system call was designed for block + * device based file systems, and it is not appropriate for UBIFS, + * because UBIFS does not have a notion of a "block". For example, it is + * difficult to tell how many blocks a directory takes - it actually + * takes less than 300 bytes, but we have to round it to the block size, + * which introduces a large error. This makes utilities like 'du' + * report completely senseless numbers. This is the reason why UBIFS + * goes the same way as JFFS2 - it reports zero blocks for everything + * but regular files, which makes more sense than reporting completely + * wrong sizes. + */ + if (S_ISREG(inode->i_mode)) { + size = ui->xattr_size; + size += stat->size; + size = ALIGN(size, UBIFS_BLOCK_SIZE); + /* + * Note, user-space expects a 512-byte block count irrespective + * of what was reported in @stat->size.
+ */ + stat->blocks = size >> 9; + } else + stat->blocks = 0; + mutex_unlock(&ui->ui_mutex); + return 0; +} + +const struct inode_operations ubifs_dir_inode_operations = { + .lookup = ubifs_lookup, + .create = ubifs_create, + .link = ubifs_link, + .symlink = ubifs_symlink, + .unlink = ubifs_unlink, + .mkdir = ubifs_mkdir, + .rmdir = ubifs_rmdir, + .mknod = ubifs_mknod, + .rename = ubifs_rename, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +const struct file_operations ubifs_dir_operations = { + .llseek = ubifs_dir_llseek, + .release = ubifs_dir_release, + .read = generic_read_dir, + .readdir = ubifs_readdir, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c new file mode 100644 index 00000000..5e7fccfc --- /dev/null +++ b/fs/ubifs/file.c @@ -0,0 +1,1593 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements VFS file and inode operations for regular files, device + * nodes and symlinks as well as address space operations. + * + * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if + * the page is dirty and is used for optimization purposes - dirty pages are + * not budgeted so the flag shows that 'ubifs_write_end()' should not release + * the budget for this page. The @PG_checked flag is set if full budgeting is + * required for the page e.g., when it corresponds to a file hole or it is + * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because + * it is OK to fail in this function, and the budget is released in + * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry + * information about how the page was budgeted, to make it possible to release + * the budget properly. + * + * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we + * implement. However, this is not true for 'ubifs_writepage()', which may be + * called with @i_mutex unlocked. For example, when pdflush is doing background + * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal" + * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the + * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()' + * we are only guaranteed that the page is locked. + * + * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the + * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> + * ondemand_readahead -> readpage"). 
In case of readahead, @I_SYNC flag is not + * set as well. However, UBIFS disables readahead. + */ + +#include "ubifs.h" +#include +#include +#include + +static int read_block(struct inode *inode, void *addr, unsigned int block, + struct ubifs_data_node *dn) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err, len, out_len; + union ubifs_key key; + unsigned int dlen; + + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_tnc_lookup(c, &key, dn); + if (err) { + if (err == -ENOENT) + /* Not found, so it must be a hole */ + memset(addr, 0, UBIFS_BLOCK_SIZE); + return err; + } + + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto dump; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto dump; + + /* + * Data length can be less than a full block, even for blocks that are + * not the last in the file (e.g., as a result of making a hole and + * appending data). Ensure that the remainder is zeroed out. + */ + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + return 0; + +dump: + ubifs_err("bad data node (block %u, inode %lu)", + block, inode->i_ino); + dbg_dump_node(c, dn); + return -EINVAL; +} + +static int do_readpage(struct page *page) +{ + void *addr; + int err = 0, i; + unsigned int block, beyond; + struct ubifs_data_node *dn; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + ubifs_assert(!PageChecked(page)); + ubifs_assert(!PagePrivate(page)); + + addr = kmap(page); + + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + if (block >= beyond) { + /* Reading beyond inode */ + SetPageChecked(page); + memset(addr, 0, PAGE_CACHE_SIZE); + goto out; + } + + dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); + if (!dn) { + err = -ENOMEM; + goto error; + } + + i = 0; + while (1) { + int ret; + + if (block >= beyond) { + /* Reading beyond inode */ + err = -ENOENT; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else { + ret = read_block(inode, addr, block, dn); + if (ret) { + err = ret; + if (err != -ENOENT) + break; + } else if (block + 1 == beyond) { + int dlen = le32_to_cpu(dn->size); + int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); + + if (ilen && ilen < dlen) + memset(addr + ilen, 0, dlen - ilen); + } + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += UBIFS_BLOCK_SIZE; + } + if (err) { + if (err == -ENOENT) { + /* Not found, so it must be a hole */ + SetPageChecked(page); + dbg_gen("hole"); + goto out_free; + } + ubifs_err("cannot read page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + goto error; + } + +out_free: + kfree(dn); +out: + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + return 0; + +error: + kfree(dn); + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + return err; +} + +/** + * release_new_page_budget - release budget of a new page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of one new page of data. 
+ */ +static void release_new_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 }; + + ubifs_release_budget(c, &req); +} + +/** + * release_existing_page_budget - release budget of an existing page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of changing one one page of data which already exists on the flash media. + */ +static void release_existing_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget}; + + ubifs_release_budget(c, &req); +} + +static int write_begin_slow(struct address_space *mapping, + loff_t pos, unsigned len, struct page **pagep, + unsigned flags) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct ubifs_budget_req req = { .new_page = 1 }; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + struct page *page; + + dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", + inode->i_ino, pos, len, inode->i_size); + + /* + * At the slow path we have to budget before locking the page, because + * budgeting may force write-back, which would wait on locked pages and + * deadlock if we had the page locked. At this point we do not know + * anything about the page, so assume that this is a new page which is + * written to a hole. This corresponds to largest budget. Later the + * budget will be amended if this is not true. + */ + if (appending) + /* We are appending data, budget for inode change */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) + return err; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (unlikely(!page)) { + ubifs_release_budget(c, &req); + return -ENOMEM; + } + + if (!PageUptodate(page)) { + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + SetPageChecked(page); + else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + if (PagePrivate(page)) + /* + * The page is dirty, which means it was budgeted twice: + * o first time the budget was allocated by the task which + * made the page dirty and set the PG_private flag; + * o and then we budgeted for it for the second time at the + * very beginning of this function. + * + * So what we have to do is to release the page budget we + * allocated. + */ + release_new_page_budget(c); + else if (!PageChecked(page)) + /* + * We are changing a page which already exists on the media. + * This means that changing the page does not make the amount + * of indexing information larger, and this part of the budget + * which we have already acquired may be released. + */ + ubifs_convert_page_budget(c); + + if (appending) { + struct ubifs_inode *ui = ubifs_inode(inode); + + /* + * 'ubifs_write_end()' is optimized from the fast-path part of + * 'ubifs_write_begin()' and expects the @ui_mutex to be locked + * if data is appended. + */ + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The inode is dirty already, so we may free the + * budget we allocated. + */ + ubifs_release_dirty_inode_budget(c, ui); + } + + *pagep = page; + return 0; +} + +/** + * allocate_budget - allocate budget for 'ubifs_write_begin()'. 
+ * @c: UBIFS file-system description object + * @page: page to allocate budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for 'ubifs_write_begin()' which allocates budget + * for the operation. The budget is allocated differently depending on whether + * this is appending, whether the page is dirty or not, and so on. This + * function leaves the @ui->ui_mutex locked in case of appending. Returns zero + * in case of success and %-ENOSPC in case of failure. + */ +static int allocate_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + struct ubifs_budget_req req = { .fast = 1 }; + + if (PagePrivate(page)) { + if (!appending) + /* + * The page is dirty and we are not appending, which + * means no budget is needed at all. + */ + return 0; + + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The page is dirty and we are appending, so the inode + * has to be marked as dirty. However, it is already + * dirty, so we do not need any budget. We may return, + * but @ui->ui_mutex hast to be left locked because we + * should prevent write-back from flushing the inode + * and freeing the budget. The lock will be released in + * 'ubifs_write_end()'. + */ + return 0; + + /* + * The page is dirty, we are appending, the inode is clean, so + * we need to budget the inode change. + */ + req.dirtied_ino = 1; + } else { + if (PageChecked(page)) + /* + * The page corresponds to a hole and does not + * exist on the media. So changing it makes + * make the amount of indexing information + * larger, and we have to budget for a new + * page. + */ + req.new_page = 1; + else + /* + * Not a hole, the change will not add any new + * indexing information, budget for page + * change. + */ + req.dirtied_page = 1; + + if (appending) { + mutex_lock(&ui->ui_mutex); + if (!ui->dirty) + /* + * The inode is clean but we will have to mark + * it as dirty because we are appending. This + * needs a budget. + */ + req.dirtied_ino = 1; + } + } + + return ubifs_budget_space(c, &req); +} + +/* + * This function is called when a page of data is going to be written. Since + * the page of data will not necessarily go to the flash straight away, UBIFS + * has to reserve space on the media for it, which is done by means of + * budgeting. + * + * This is the hot-path of the file-system and we are trying to optimize it as + * much as possible. For this reasons it is split on 2 parts - slow and fast. + * + * There many budgeting cases: + * o a new page is appended - we have to budget for a new page and for + * changing the inode; however, if the inode is already dirty, there is + * no need to budget for it; + * o an existing clean page is changed - we have budget for it; if the page + * does not exist on the media (a hole), we have to budget for a new + * page; otherwise, we may budget for changing an existing page; the + * difference between these cases is that changing an existing page does + * not introduce anything new to the FS indexing information, so it does + * not grow, and smaller budget is acquired in this case; + * o an existing dirty page is changed - no need to budget at all, because + * the page budget has been acquired by earlier, when the page has been + * marked dirty. + * + * UBIFS budgeting sub-system may force write-back if it thinks there is no + * space to reserve. 
This imposes some locking restrictions and makes it + * impossible to take into account the above cases, and makes it impossible to + * optimize budgeting. + * + * The solution for this is that the fast path of 'ubifs_write_begin()' assumes + * there is a plenty of flash space and the budget will be acquired quickly, + * without forcing write-back. The slow path does not make this assumption. + */ +static int ubifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int skipped_read = 0; + struct page *page; + + ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (unlikely(c->ro_error)) + return -EROFS; + + /* Try out the fast-path part first */ + page = grab_cache_page_write_begin(mapping, index, flags); + if (unlikely(!page)) + return -ENOMEM; + + if (!PageUptodate(page)) { + /* The page is not loaded from the flash */ + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { + /* + * We change whole page so no need to load it. But we + * do not know whether this page exists on the media or + * not, so we assume the latter because it requires + * larger budget. The assumption is that it is better + * to budget a bit more than to read the page from the + * media. Thus, we are setting the @PG_checked flag + * here. + */ + SetPageChecked(page); + skipped_read = 1; + } else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + err = allocate_budget(c, page, ui, appending); + if (unlikely(err)) { + ubifs_assert(err == -ENOSPC); + /* + * If we skipped reading the page because we were going to + * write all of it, then it is not up to date. + */ + if (skipped_read) { + ClearPageChecked(page); + ClearPageUptodate(page); + } + /* + * Budgeting failed which means it would have to force + * write-back but didn't, because we set the @fast flag in the + * request. Write-back cannot be done now, while we have the + * page locked, because it would deadlock. Unlock and free + * everything and fall-back to slow-path. + */ + if (appending) { + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + unlock_page(page); + page_cache_release(page); + + return write_begin_slow(mapping, pos, len, pagep, flags); + } + + /* + * Whee, we acquired budgeting quickly - without involving + * garbage-collection, committing or forcing write-back. We return + * with @ui->ui_mutex locked if we are appending pages, and unlocked + * otherwise. This is an optimization (slightly hacky though). + */ + *pagep = page; + return 0; + +} + +/** + * cancel_budget - cancel budget. + * @c: UBIFS file-system description object + * @page: page to cancel budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for a page write operation. It unlocks the + * @ui->ui_mutex in case of appending. 
+ */ +static void cancel_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + if (appending) { + if (!ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + mutex_unlock(&ui->ui_mutex); + } + if (!PagePrivate(page)) { + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + } +} + +static int ubifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + loff_t end_pos = pos + len; + int appending = !!(end_pos > inode->i_size); + + dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", + inode->i_ino, pos, page->index, len, copied, inode->i_size); + + if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) { + /* + * VFS copied less data to the page that it intended and + * declared in its '->write_begin()' call via the @len + * argument. If the page was not up-to-date, and @len was + * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did + * not load it from the media (for optimization reasons). This + * means that part of the page contains garbage. So read the + * page now. + */ + dbg_gen("copied %d instead of %d, read page and repeat", + copied, len); + cancel_budget(c, page, ui, appending); + ClearPageChecked(page); + + /* + * Return 0 to force VFS to repeat the whole operation, or the + * error code if 'do_readpage()' fails. + */ + copied = do_readpage(page); + goto out; + } + + if (!PagePrivate(page)) { + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (appending) { + i_size_write(inode, end_pos); + ui->ui_size = end_pos; + /* + * Note, we do not set @I_DIRTY_PAGES (which means that the + * inode has dirty pages), this has been done in + * '__set_page_dirty_nobuffers()'. + */ + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + +out: + unlock_page(page); + page_cache_release(page); + return copied; +} + +/** + * populate_page - copy data nodes into a page for bulk-read. + * @c: UBIFS file-system description object + * @page: page + * @bu: bulk-read information + * @n: next zbranch slot + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int populate_page(struct ubifs_info *c, struct page *page, + struct bu_info *bu, int *n) +{ + int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + unsigned int page_block; + void *addr, *zaddr; + pgoff_t end_index; + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + + addr = zaddr = kmap(page); + + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (!i_size || page->index > end_index) { + hole = 1; + memset(addr, 0, PAGE_CACHE_SIZE); + goto out_hole; + } + + page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + while (1) { + int err, len, out_len, dlen; + + if (nn >= bu->cnt) { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { + struct ubifs_data_node *dn; + + dn = bu->buf + (bu->zbranch[nn].offs - offs); + + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto out_err; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto out_err; + + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + nn += 1; + read = (i << UBIFS_BLOCK_SHIFT) + len; + } else if (key_block(c, &bu->zbranch[nn].key) < page_block) { + nn += 1; + continue; + } else { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + addr += UBIFS_BLOCK_SIZE; + page_block += 1; + } + + if (end_index == page->index) { + int len = i_size & (PAGE_CACHE_SIZE - 1); + + if (len && len < read) + memset(zaddr + len, 0, read - len); + } + +out_hole: + if (hole) { + SetPageChecked(page); + dbg_gen("hole"); + } + + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + *n = nn; + return 0; + +out_err: + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + ubifs_err("bad data node (block %u, inode %lu)", + page_block, inode->i_ino); + return -EINVAL; +} + +/** + * ubifs_do_bulk_read - do bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read information + * @page1: first page to read + * + * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + */ +static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, + struct page *page1) +{ + pgoff_t offset = page1->index, end_index; + struct address_space *mapping = page1->mapping; + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + int err, page_idx, page_cnt, ret = 0, n = 0; + int allocate = bu->buf ? 0 : 1; + loff_t isize; + + err = ubifs_tnc_get_bu_keys(c, bu); + if (err) + goto out_warn; + + if (bu->eof) { + /* Turn off bulk-read at the end of the file */ + ui->read_in_a_row = 1; + ui->bulk_read = 0; + } + + page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT; + if (!page_cnt) { + /* + * This happens when there are multiple blocks per page and the + * blocks for the first page we are looking for, are not + * together. If all the pages were like this, bulk-read would + * reduce performance, so we turn it off for a while. 
+ */ + goto out_bu_off; + } + + if (bu->cnt) { + if (allocate) { + /* + * Allocate bulk-read buffer depending on how many data + * nodes we are going to read. + */ + bu->buf_len = bu->zbranch[bu->cnt - 1].offs + + bu->zbranch[bu->cnt - 1].len - + bu->zbranch[0].offs; + ubifs_assert(bu->buf_len > 0); + ubifs_assert(bu->buf_len <= c->leb_size); + bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN); + if (!bu->buf) + goto out_bu_off; + } + + err = ubifs_tnc_bulk_read(c, bu); + if (err) + goto out_warn; + } + + err = populate_page(c, page1, bu, &n); + if (err) + goto out_warn; + + unlock_page(page1); + ret = 1; + + isize = i_size_read(inode); + if (isize == 0) + goto out_free; + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + for (page_idx = 1; page_idx < page_cnt; page_idx++) { + pgoff_t page_offset = offset + page_idx; + struct page *page; + + if (page_offset > end_index) + break; + page = find_or_create_page(mapping, page_offset, + GFP_NOFS | __GFP_COLD); + if (!page) + break; + if (!PageUptodate(page)) + err = populate_page(c, page, bu, &n); + unlock_page(page); + page_cache_release(page); + if (err) + break; + } + + ui->last_page_read = offset + page_idx - 1; + +out_free: + if (allocate) + kfree(bu->buf); + return ret; + +out_warn: + ubifs_warn("ignoring error %d and skipping bulk-read", err); + goto out_free; + +out_bu_off: + ui->read_in_a_row = ui->bulk_read = 0; + goto out_free; +} + +/** + * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. + * @page: page from which to start bulk-read. + * + * Some flash media are capable of reading sequentially at faster rates. UBIFS + * bulk-read facility is designed to take advantage of that, by reading in one + * go consecutive data nodes that are also located consecutively in the same + * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. + */ +static int ubifs_bulk_read(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = page->index, last_page_read = ui->last_page_read; + struct bu_info *bu; + int err = 0, allocated = 0; + + ui->last_page_read = index; + if (!c->bulk_read) + return 0; + + /* + * Bulk-read is protected by @ui->ui_mutex, but it is an optimization, + * so don't bother if we cannot lock the mutex. + */ + if (!mutex_trylock(&ui->ui_mutex)) + return 0; + + if (index != last_page_read + 1) { + /* Turn off bulk-read if we stop reading sequentially */ + ui->read_in_a_row = 1; + if (ui->bulk_read) + ui->bulk_read = 0; + goto out_unlock; + } + + if (!ui->bulk_read) { + ui->read_in_a_row += 1; + if (ui->read_in_a_row < 3) + goto out_unlock; + /* Three reads in a row, so switch on bulk-read */ + ui->bulk_read = 1; + } + + /* + * If possible, try to use pre-allocated bulk-read information, which + * is protected by @c->bu_mutex. 
+ */ + if (mutex_trylock(&c->bu_mutex)) + bu = &c->bu; + else { + bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN); + if (!bu) + goto out_unlock; + + bu->buf = NULL; + allocated = 1; + } + + bu->buf_len = c->max_bu_buf_len; + data_key_init(c, &bu->key, inode->i_ino, + page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); + err = ubifs_do_bulk_read(c, bu, page); + + if (!allocated) + mutex_unlock(&c->bu_mutex); + else + kfree(bu); + +out_unlock: + mutex_unlock(&ui->ui_mutex); + return err; +} + +static int ubifs_readpage(struct file *file, struct page *page) +{ + if (ubifs_bulk_read(page)) + return 0; + do_readpage(page); + unlock_page(page); + return 0; +} + +static int do_writepage(struct page *page, int len) +{ + int err = 0, i, blen; + unsigned int block; + void *addr; + union ubifs_key key; + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + +#ifdef UBIFS_DEBUG + spin_lock(&ui->ui_lock); + ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); + spin_unlock(&ui->ui_lock); +#endif + + /* Update radix tree tags */ + set_page_writeback(page); + + addr = kmap(page); + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + i = 0; + while (len) { + blen = min_t(int, len, UBIFS_BLOCK_SIZE); + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_jnl_write_data(c, inode, &key, addr, blen); + if (err) + break; + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += blen; + len -= blen; + } + if (err) { + SetPageError(page); + ubifs_err("cannot write page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + ubifs_ro_mode(c, err); + } + + ubifs_assert(PagePrivate(page)); + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); + + kunmap(page); + unlock_page(page); + end_page_writeback(page); + return err; +} + +/* + * When writing-back dirty inodes, VFS first writes-back pages belonging to the + * inode, then the inode itself. For UBIFS this may cause a problem. Consider a + * situation when a we have an inode with size 0, then a megabyte of data is + * appended to the inode, then write-back starts and flushes some amount of the + * dirty pages, the journal becomes full, commit happens and finishes, and then + * an unclean reboot happens. When the file system is mounted next time, the + * inode size would still be 0, but there would be many pages which are beyond + * the inode size, they would be indexed and consume flash space. Because the + * journal has been committed, the replay would not be able to detect this + * situation and correct the inode size. This means UBIFS would have to scan + * whole index and correct all inode sizes, which is long an unacceptable. + * + * To prevent situations like this, UBIFS writes pages back only if they are + * within the last synchronized inode size, i.e. the size which has been + * written to the flash media last time. Otherwise, UBIFS forces inode + * write-back, thus making sure the on-flash inode contains current inode size, + * and then keeps writing pages back. + * + * Some locking issues explanation. 'ubifs_writepage()' first is called with + * the page locked, and it locks @ui_mutex. However, write-back does take inode + * @i_mutex, which means other VFS operations may be run on this inode at the + * same time. 
And the problematic one is truncation to smaller size, from where + * we have to call 'truncate_setsize()', which first changes @inode->i_size, + * then drops the truncated pages. And while dropping the pages, it takes the + * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' + * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. + * This means that @inode->i_size is changed while @ui_mutex is unlocked. + * + * XXX(truncate): with the new truncate sequence this is not true anymore, + * and the calls to truncate_setsize can be move around freely. They should + * be moved to the very end of the truncate sequence. + * + * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond + * inode size. How do we do this if @inode->i_size may became smaller while we + * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the + * @ui->ui_isize "shadow" field which UBIFS uses instead of @inode->i_size + * internally and updates it under @ui_mutex. + * + * Q: why we do not worry that if we race with truncation, we may end up with a + * situation when the inode is truncated while we are in the middle of + * 'do_writepage()', so we do write beyond inode size? + * A: If we are in the middle of 'do_writepage()', truncation would be locked + * on the page lock and it would not write the truncated inode node to the + * journal before we have finished. + */ +static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + loff_t i_size = i_size_read(inode), synced_i_size; + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + int err, len = i_size & (PAGE_CACHE_SIZE - 1); + void *kaddr; + + dbg_gen("ino %lu, pg %lu, pg flags %#lx", + inode->i_ino, page->index, page->flags); + ubifs_assert(PagePrivate(page)); + + /* Is the page fully outside @i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !len)) { + err = 0; + goto out_unlock; + } + + spin_lock(&ui->ui_lock); + synced_i_size = ui->synced_i_size; + spin_unlock(&ui->ui_lock); + + /* Is the page fully inside @i_size? */ + if (page->index < end_index) { + if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { + err = inode->i_sb->s_op->write_inode(inode, NULL); + if (err) + goto out_unlock; + /* + * The inode has been written, but the write-buffer has + * not been synchronized, so in case of an unclean + * reboot we may end up with some pages beyond inode + * size, but they would be in the journal (because + * commit flushes write buffers) and recovery would deal + * with this. + */ + } + return do_writepage(page, PAGE_CACHE_SIZE); + } + + /* + * The page straddles @i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + len, 0, PAGE_CACHE_SIZE - len); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (i_size > synced_i_size) { + err = inode->i_sb->s_op->write_inode(inode, NULL); + if (err) + goto out_unlock; + } + + return do_writepage(page, len); + +out_unlock: + unlock_page(page); + return err; +} + +/** + * do_attr_changes - change inode attributes. 
+ * @inode: inode to change attributes for + * @attr: describes attributes to change + */ +static void do_attr_changes(struct inode *inode, const struct iattr *attr) +{ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (attr->ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +} + +/** + * do_truncation - truncate an inode. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call when the inode is truncated + * to a smaller size. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int do_truncation(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err; + struct ubifs_budget_req req; + loff_t old_size = inode->i_size, new_size = attr->ia_size; + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); + memset(&req, 0, sizeof(struct ubifs_budget_req)); + + /* + * If this is truncation to a smaller size, and we do not truncate on a + * block boundary, budget for changing one data block, because the last + * block will be re-written. + */ + if (new_size & (UBIFS_BLOCK_SIZE - 1)) + req.dirtied_page = 1; + + req.dirtied_ino = 1; + /* A funny way to budget for truncation node */ + req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; + err = ubifs_budget_space(c, &req); + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'. + */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } + + truncate_setsize(inode, new_size); + + if (offset) { + pgoff_t index = new_size >> PAGE_CACHE_SHIFT; + struct page *page; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + if (PageDirty(page)) { + /* + * 'ubifs_jnl_truncate()' will try to truncate + * the last data node, but it contains + * out-of-date data because the page is dirty. + * Write the page now, so that + * 'ubifs_jnl_truncate()' will see an already + * truncated (and up to date) data node. + */ + ubifs_assert(PagePrivate(page)); + + clear_page_dirty_for_io(page); + if (UBIFS_BLOCKS_PER_PAGE_SHIFT) + offset = new_size & + (PAGE_CACHE_SIZE - 1); + err = do_writepage(page, offset); + page_cache_release(page); + if (err) + goto out_budg; + /* + * We could now tell 'ubifs_jnl_truncate()' not + * to read the last block. + */ + } else { + /* + * We could 'kmap()' the page and pass the data + * to 'ubifs_jnl_truncate()' to save it from + * having to read it. 
+ */ + unlock_page(page); + page_cache_release(page); + } + } + } + + mutex_lock(&ui->ui_mutex); + ui->ui_size = inode->i_size; + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* Other attributes may be changed at the same time as well */ + do_attr_changes(inode, attr); + err = ubifs_jnl_truncate(c, inode, old_size, new_size); + mutex_unlock(&ui->ui_mutex); + +out_budg: + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } + return err; +} + +/** + * do_setattr - change inode attributes. + * @c: UBIFS file-system description object + * @inode: inode to change attributes for + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call for all cases except + * truncations to smaller size. Returns zero in case of success and a negative + * error code in case of failure. + */ +static int do_setattr(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err, release; + loff_t new_size = attr->ia_size; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + dbg_gen("size %lld -> %lld", inode->i_size, new_size); + truncate_setsize(inode, new_size); + } + + mutex_lock(&ui->ui_mutex); + if (attr->ia_valid & ATTR_SIZE) { + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* 'truncate_setsize()' changed @i_size, update @ui_size */ + ui->ui_size = inode->i_size; + } + + do_attr_changes(inode, attr); + + release = ui->dirty; + if (attr->ia_valid & ATTR_SIZE) + /* + * Inode length changed, so we have to make sure + * @I_DIRTY_DATASYNC is set. 
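+ * Otherwise a subsequent 'fdatasync()' would not write the inode back, because
+ * 'ubifs_fsync()' writes the inode on a datasync call only when
+ * @I_DIRTY_DATASYNC is set, and the updated length might then not reach the
+ * media.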
+ */ + __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + else + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = inode->i_sb->s_op->write_inode(inode, NULL); + return err; +} + +int ubifs_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct inode *inode = dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + dbg_gen("ino %lu, mode %#x, ia_valid %#x", + inode->i_ino, inode->i_mode, attr->ia_valid); + err = inode_change_ok(inode, attr); + if (err) + return err; + + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size) + /* Truncation to a smaller size */ + err = do_truncation(c, inode, attr); + else + err = do_setattr(c, inode, attr); + + return err; +} + +static void ubifs_invalidatepage(struct page *page, unsigned long offset) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + ubifs_assert(PagePrivate(page)); + if (offset) + /* Partial page remains dirty */ + return; + + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); +} + +static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); + + nd_set_link(nd, ui->data); + return NULL; +} + +int ubifs_fsync(struct file *file, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err; + + dbg_gen("syncing inode %lu", inode->i_ino); + + if (c->ro_mount) + /* + * For some really strange reasons VFS does not filter out + * 'fsync()' for R/O mounted file-systems as per 2.6.39. + */ + return 0; + + /* + * VFS has already synchronized dirty pages for this inode. Synchronize + * the inode unless this is a 'datasync()' call. + */ + if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { + err = inode->i_sb->s_op->write_inode(inode, NULL); + if (err) + return err; + } + + /* + * Nodes related to this inode may still sit in a write-buffer. Flush + * them. + */ + err = ubifs_sync_wbufs_by_inode(c, inode); + if (err) + return err; + + return 0; +} + +/** + * mctime_update_needed - check if mtime or ctime update is needed. + * @inode: the inode to do the check for + * @now: current time + * + * This helper function checks if the inode mtime/ctime should be updated or + * not. If current values of the time-stamps are within the UBIFS inode time + * granularity, they are not updated. This is an optimization. + */ +static inline int mctime_update_needed(const struct inode *inode, + const struct timespec *now) +{ + if (!timespec_equal(&inode->i_mtime, now) || + !timespec_equal(&inode->i_ctime, now)) + return 1; + return 0; +} + +/** + * update_ctime - update mtime and ctime of an inode. + * @c: UBIFS file-system description object + * @inode: inode to update + * + * This function updates mtime and ctime of the inode if it is not equivalent to + * current time. Returns zero in case of success and a negative error code in + * case of failure. 
+ */ +static int update_mctime(struct ubifs_info *c, struct inode *inode) +{ + struct timespec now = ubifs_current_time(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + + if (mctime_update_needed(inode, &now)) { + int err, release; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_budget(c, &req); + } + + return 0; +} + +static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + int err; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + err = update_mctime(c, inode); + if (err) + return err; + + return generic_file_aio_write(iocb, iov, nr_segs, pos); +} + +static int ubifs_set_page_dirty(struct page *page) +{ + int ret; + + ret = __set_page_dirty_nobuffers(page); + /* + * An attempt to dirty a page without budgeting for it - should not + * happen. + */ + ubifs_assert(ret == 0); + return ret; +} + +static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + /* + * An attempt to release a dirty page without budgeting for it - should + * not happen. + */ + if (PageWriteback(page)) + return 0; + ubifs_assert(PagePrivate(page)); + ubifs_assert(0); + ClearPagePrivate(page); + ClearPageChecked(page); + return 1; +} + +/* + * mmap()d file has taken write protection fault and is being made writable. + * UBIFS must ensure page is budgeted for. + */ +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct timespec now = ubifs_current_time(inode); + struct ubifs_budget_req req = { .new_page = 1 }; + int err, update_time; + + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + i_size_read(inode)); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (unlikely(c->ro_error)) + return VM_FAULT_SIGBUS; /* -EROFS */ + + /* + * We have not locked @page so far so we may budget for changing the + * page. Note, we cannot do this after we locked the page, because + * budgeting may cause write-back which would cause deadlock. + * + * At the moment we do not know whether the page is dirty or not, so we + * assume that it is not and budget for a new page. We could look at + * the @PG_private flag and figure this out, but we may race with write + * back and the page state may change by the time we lock it, so this + * would need additional care. We do not bother with this at the + * moment, although it might be good idea to do. Instead, we allocate + * budget for a new page and amend it later on if the page was in fact + * dirty. + * + * The budgeting-related logic of this function is similar to what we + * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there + * for more comments. + */ + update_time = mctime_update_needed(inode, &now); + if (update_time) + /* + * We have to change inode time stamp which requires extra + * budgeting. 
+ */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) { + if (err == -ENOSPC) + ubifs_warn("out of space for mmapped file " + "(inode number %lu)", inode->i_ino); + return VM_FAULT_SIGBUS; + } + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode))) { + /* Page got truncated out from underneath us */ + err = -EINVAL; + goto out_unlock; + } + + if (PagePrivate(page)) + release_new_page_budget(c); + else { + if (!PageChecked(page)) + ubifs_convert_page_budget(c); + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (update_time) { + int release; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_dirty_inode_budget(c, ui); + } + + unlock_page(page); + return 0; + +out_unlock: + unlock_page(page); + ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; + return err; +} + +static const struct vm_operations_struct ubifs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ubifs_vm_page_mkwrite, +}; + +static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int err; + + err = generic_file_mmap(file, vma); + if (err) + return err; + vma->vm_ops = &ubifs_file_vm_ops; + return 0; +} + +const struct address_space_operations ubifs_file_address_operations = { + .readpage = ubifs_readpage, + .writepage = ubifs_writepage, + .write_begin = ubifs_write_begin, + .write_end = ubifs_write_end, + .invalidatepage = ubifs_invalidatepage, + .set_page_dirty = ubifs_set_page_dirty, + .releasepage = ubifs_releasepage, +}; + +const struct inode_operations ubifs_file_inode_operations = { + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +const struct inode_operations ubifs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ubifs_follow_link, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +}; + +const struct file_operations ubifs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ubifs_aio_write, + .mmap = ubifs_file_mmap, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c new file mode 100644 index 00000000..2559d174 --- /dev/null +++ b/fs/ubifs/find.c @@ -0,0 +1,977 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains functions for finding LEBs for various purposes e.g. + * garbage collection. In general, lprops category heaps and lists are used + * for fast access, falling back on scanning the LPT as a last resort. + */ + +#include +#include "ubifs.h" + +/** + * struct scan_data - data provided to scan callback functions + * @min_space: minimum number of bytes for which to scan + * @pick_free: whether it is OK to scan for empty LEBs + * @lnum: LEB number found is returned here + * @exclude_index: whether to exclude index LEBs + */ +struct scan_data { + int min_space; + int pick_free; + int lnum; + int exclude_index; +}; + +/** + * valuable - determine whether LEB properties are valuable. + * @c: the UBIFS file-system description object + * @lprops: LEB properties + * + * This function return %1 if the LEB properties should be added to the LEB + * properties tree in memory. Otherwise %0 is returned. + */ +static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) +{ + int n, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (heap->cnt < heap->max_cnt) + return 1; + if (lprops->free + lprops->dirty >= c->dark_wm) + return 1; + return 0; + case LPROPS_EMPTY: + n = c->lst.empty_lebs + c->freeable_cnt - + c->lst.taken_empty_lebs; + if (n < c->lsave_cnt) + return 1; + return 0; + case LPROPS_FREEABLE: + return 1; + case LPROPS_FRDI_IDX: + return 1; + } + return 0; +} + +/** + * scan_for_dirty_cb - dirty space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_dirty_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < data->min_space) + return ret; + /* If specified, exclude index LEBs */ + if (data->exclude_index && lprops->flags & LPROPS_INDEX) + return ret; + /* If specified, exclude empty or freeable LEBs */ + if (lprops->free + lprops->dirty == c->leb_size) { + if (!data->pick_free) + return ret; + /* Exclude LEBs with too little dirty space (unless it is empty) */ + } else if (lprops->dirty < c->dead_wm) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_dirty - find a data LEB with free space. 
+ * @c: the UBIFS file-system description object + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: if it is OK to return a free or freeable LEB + * @exclude_index: whether to exclude index LEBs + * + * This function returns a pointer to the LEB properties found or a negative + * error code. + */ +static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, + int min_space, int pick_free, + int exclude_index) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + /* There may be an LEB with enough dirty space on the free heap */ + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free + lprops->dirty < min_space) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the dirty heap, and ended + * up as uncategorized even though it has enough dirty space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. + */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->free + lprops->dirty < min_space) + continue; + if (exclude_index && (lprops->flags & LPROPS_INDEX)) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + data.exclude_index = exclude_index; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_dirty_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= min_space); + ubifs_assert(lprops->dirty >= c->dead_wm || + (pick_free && + lprops->free + lprops->dirty == c->leb_size)); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector. + * @c: the UBIFS file-system description object + * @ret_lp: LEB properties are returned here on exit + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: controls whether it is OK to pick empty or index LEBs + * + * This function tries to find a dirty logical eraseblock which has at least + * @min_space free and dirty space. It prefers to take an LEB from the dirty or + * dirty index heap, and it falls-back to LPT scanning if the heaps are empty + * or do not have an LEB which satisfies the @min_space criteria. + * + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. + * + * The additional @pick_free argument controls if this function has to return a + * free or freeable LEB if one is present. 
For example, GC must to set it to %1, + * when called from the journal space reservation function, because the + * appearance of free space may coincide with the loss of enough dirty space + * for GC to succeed anyway. + * + * In contrast, if the Garbage Collector is called from budgeting, it should + * just make free space, not return LEBs which are already free or freeable. + * + * In addition @pick_free is set to %2 by the recovery process in order to + * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". + */ +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free) +{ + int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0; + const struct ubifs_lprops *lp = NULL, *idx_lp = NULL; + struct ubifs_lpt_heap *heap, *idx_heap; + + ubifs_get_lprops(c); + + if (pick_free) { + int lebs, rsvd_idx_lebs = 0; + + spin_lock(&c->space_lock); + lebs = c->lst.empty_lebs + c->idx_gc_cnt; + lebs += c->freeable_cnt - c->lst.taken_empty_lebs; + + /* + * Note, the index may consume more LEBs than have been reserved + * for it. It is OK because it might be consolidated by GC. + * But if the index takes fewer LEBs than it is reserved for it, + * this function must avoid picking those reserved LEBs. + */ + if (c->bi.min_idx_lebs >= c->lst.idx_lebs) { + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; + exclude_index = 1; + } + spin_unlock(&c->space_lock); + + /* Check if there are enough free LEBs for the index */ + if (rsvd_idx_lebs < lebs) { + /* OK, try to find an empty LEB */ + lp = ubifs_fast_find_empty(c); + if (lp) + goto found; + + /* Or a freeable LEB */ + lp = ubifs_fast_find_freeable(c); + if (lp) + goto found; + } else + /* + * We cannot pick free/freeable LEBs in the below code. + */ + pick_free = 0; + } else { + spin_lock(&c->space_lock); + exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs); + spin_unlock(&c->space_lock); + } + + /* Look on the dirty and dirty index heaps */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + + if (idx_heap->cnt && !exclude_index) { + idx_lp = idx_heap->arr[0]; + sum = idx_lp->free + idx_lp->dirty; + /* + * Since we reserve thrice as much space for the index than it + * actually takes, it does not make sense to pick indexing LEBs + * with less than, say, half LEB of dirty space. May be half is + * not the optimal boundary - this should be tested and + * checked. This boundary should determine how much we use + * in-the-gaps to consolidate the index comparing to how much + * we use garbage collector to consolidate it. The "half" + * criteria just feels to be fine. 
+ */ + if (sum < min_space || sum < c->half_leb_size) + idx_lp = NULL; + } + + if (heap->cnt) { + lp = heap->arr[0]; + if (lp->dirty + lp->free < min_space) + lp = NULL; + } + + /* Pick the LEB with most space */ + if (idx_lp && lp) { + if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty) + lp = idx_lp; + } else if (idx_lp && !lp) + lp = idx_lp; + + if (lp) { + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); + goto found; + } + + /* Did not find a dirty LEB on the dirty heaps, have to scan */ + dbg_find("scanning LPT for a dirty LEB"); + lp = scan_for_dirty(c, min_space, pick_free, exclude_index); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(lp->dirty >= c->dead_wm || + (pick_free && lp->free + lp->dirty == c->leb_size)); + +found: + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lp->lnum, lp->free, lp->dirty, lp->flags); + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + memcpy(ret_lp, lp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_free_cb - free space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_free_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBs */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free < data->min_space) + return ret; + /* If specified, exclude empty LEBs */ + if (!data->pick_free && lprops->free == c->leb_size) + return ret; + /* + * LEBs that have only free and dirty space must not be allocated + * because they may have been unmapped already or they may have data + * that is obsolete only because of nodes that are still sitting in a + * wbuf. + */ + if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * do_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of free space required + * @pick_free: whether it is OK to scan for empty LEBs + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function returns a pointer to the LEB properties found or a negative + * error code. 
+ */ +static +const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, + int min_space, int pick_free, + int squeeze) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + if (squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + if (pick_free) { + lprops = ubifs_fast_find_empty(c); + if (lprops) + return lprops; + } + if (!squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + /* There may be an LEB with enough free space on the dirty heap */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free >= min_space) + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the free heap, and ended + * up as uncategorized even though it has enough free space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. + */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->flags & LPROPS_INDEX) + continue; + if (lprops->free >= min_space) + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_free_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free >= min_space); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of required free space + * @offs: contains offset of where free space starts on exit + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function looks for an LEB with at least @min_space bytes of free space. + * It tries to find an empty LEB if possible. If no empty LEBs are available, + * this function searches for a non-empty data LEB. The returned LEB is marked + * as "taken". + * + * This function returns found LEB number in case of success, %-ENOSPC if it + * failed to find a LEB with @min_space bytes of free space and other a negative + * error codes in case of failure. 
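/*
 * Editor's note: a standalone sketch of the search order used by
 * 'do_find_free_space()' above, pulled out so the precedence between
 * @squeeze and @pick_free is easy to see.  The toy_fast_find_*() stubs stand
 * in for the in-memory fast-find helpers; all names here are invented for
 * the example and are not part of the patch.
 */
#include <stddef.h>

struct toy_free_lprops { int lnum; int free; };

/* Stubs standing in for ubifs_fast_find_free()/ubifs_fast_find_empty() */
static const struct toy_free_lprops *toy_fast_find_free(void)  { return NULL; }
static const struct toy_free_lprops *toy_fast_find_empty(void) { return NULL; }

static const struct toy_free_lprops *
toy_find_free(int min_space, int pick_free, int squeeze)
{
	const struct toy_free_lprops *lp;

	if (squeeze) {		/* try to fill a partly used LEB first */
		lp = toy_fast_find_free();
		if (lp && lp->free >= min_space)
			return lp;
	}
	if (pick_free) {	/* an empty LEB is acceptable */
		lp = toy_fast_find_empty();
		if (lp)
			return lp;
	}
	if (!squeeze) {
		lp = toy_fast_find_free();
		if (lp && lp->free >= min_space)
			return lp;
	}
	/*
	 * The real code then checks the dirty heap, the uncategorized list,
	 * and finally falls back to scanning the LPT on flash.
	 */
	return NULL;
}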
+ */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, + int squeeze) +{ + const struct ubifs_lprops *lprops; + int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags; + + dbg_find("min_space %d", min_space); + ubifs_get_lprops(c); + + /* Check if there are enough empty LEBs for commit */ + spin_lock(&c->space_lock); + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (rsvd_idx_lebs < lebs) + /* + * OK to allocate an empty LEB, but we still don't want to go + * looking for one if there aren't any. + */ + if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + pick_free = 1; + /* + * Because we release the space lock, we must account + * for this allocation here. After the LEB properties + * flags have been updated, we subtract one. Note, the + * result of this is that lprops also decreases + * @taken_empty_lebs in 'ubifs_change_lp()', so it is + * off by one for a short period of time which may + * introduce a small disturbance to budgeting + * calculations, but this is harmless because at the + * worst case this would make the budgeting subsystem + * be more pessimistic than needed. + * + * Fundamentally, this is about serialization of the + * budgeting and lprops subsystems. We could make the + * @space_lock a mutex and avoid dropping it before + * calling 'ubifs_change_lp()', but mutex is more + * heavy-weight, and we want budgeting to be as fast as + * possible. + */ + c->lst.taken_empty_lebs += 1; + } + spin_unlock(&c->space_lock); + + lprops = do_find_free_space(c, min_space, pick_free, squeeze); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + lnum = lprops->lnum; + flags = lprops->flags | LPROPS_TAKEN; + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + + *offs = c->leb_size - lprops->free; + ubifs_release_lprops(c); + + if (*offs == 0) { + /* + * Ensure that empty LEBs have been unmapped. They may not have + * been, for example, because of an unclean unmount. Also + * LEBs that were freeable LEBs (free + dirty == leb_size) will + * not have been unmapped. + */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + + dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs); + ubifs_assert(*offs <= c->leb_size - min_space); + return lnum; + +out: + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_idx_cb - callback used by the scan for a free LEB for the index. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). 
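/*
 * Editor's note: a minimal sketch of how an LPT scan callback composes its
 * return value from the LPT_SCAN_* codes, following the pattern of
 * 'scan_for_free_cb()' above (CONTINUE, optionally OR'ed with ADD, or
 * ADD | STOP on a match).  The flag values and the toy structure are
 * placeholders chosen for the example; only the pattern mirrors the patch.
 */
enum {
	TOY_SCAN_CONTINUE = 0x1,	/* keep scanning */
	TOY_SCAN_ADD	  = 0x2,	/* cache these properties in memory */
	TOY_SCAN_STOP	  = 0x4,	/* stop - a match was found */
};

struct toy_cb_lprops { int lnum; int free; int taken; };

static int toy_scan_cb(const struct toy_cb_lprops *lp, int in_tree,
		       int min_space, int *found_lnum)
{
	int ret = TOY_SCAN_CONTINUE;

	if (lp->taken)
		return TOY_SCAN_CONTINUE;	/* LEB is in use, skip it */
	if (!in_tree)
		ret |= TOY_SCAN_ADD;		/* worth keeping in memory */
	if (lp->free < min_space)
		return ret;			/* not a match, keep going */
	*found_lnum = lp->lnum;
	return TOY_SCAN_ADD | TOY_SCAN_STOP;
}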
+ */ +static int scan_for_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBS */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs that cannot be made empty */ + if (lprops->free + lprops->dirty != c->leb_size) + return ret; + /* + * We are allocating for the index so it is safe to allocate LEBs with + * only free and dirty space, because write buffers are sync'd at commit + * start. + */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_leb_for_idx - scan for a free LEB for the index. + * @c: the UBIFS file-system description object + */ +static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct scan_data data; + int err; + + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_idx_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_leb_for_idx - find a free LEB for the index. + * @c: the UBIFS file-system description object + * + * This function looks for a free LEB and returns that LEB number. The returned + * LEB is marked as "taken", "index". + * + * Only empty LEBs are allocated. This is for two reasons. First, the commit + * calculates the number of LEBs to allocate based on the assumption that they + * will be empty. Secondly, free space at the end of an index LEB is not + * guaranteed to be empty because it may have been used by the in-the-gaps + * method prior to an unclean unmount. + * + * If no LEB is found %-ENOSPC is returned. For other failures another negative + * error code is returned. + */ +int ubifs_find_free_leb_for_idx(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + int lnum = -1, err, flags; + + ubifs_get_lprops(c); + + lprops = ubifs_fast_find_empty(c); + if (!lprops) { + lprops = ubifs_fast_find_freeable(c); + if (!lprops) { + ubifs_assert(c->freeable_cnt == 0); + if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + lprops = scan_for_leb_for_idx(c); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + } + } + } + + if (!lprops) { + err = -ENOSPC; + goto out; + } + + lnum = lprops->lnum; + + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lnum, lprops->free, lprops->dirty, lprops->flags); + + flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX; + lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + ubifs_release_lprops(c); + + /* + * Ensure that empty LEBs have been unmapped. They may not have been, + * for example, because of an unclean unmount. Also LEBs that were + * freeable LEBs (free + dirty == leb_size) will not have been unmapped. 
+ */ + err = ubifs_leb_unmap(c, lnum); + if (err) { + ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN | LPROPS_INDEX, 0); + return err; + } + + return lnum; + +out: + ubifs_release_lprops(c); + return err; +} + +static int cmp_dirty_idx(const struct ubifs_lprops **a, + const struct ubifs_lprops **b) +{ + const struct ubifs_lprops *lpa = *a; + const struct ubifs_lprops *lpb = *b; + + return lpa->dirty + lpa->free - lpb->dirty - lpb->free; +} + +static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b, + int size) +{ + struct ubifs_lprops *t = *a; + + *a = *b; + *b = t; +} + +/** + * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos. + * @c: the UBIFS file-system description object + * + * This function is called each commit to create an array of LEB numbers of + * dirty index LEBs sorted in order of dirty and free space. This is used by + * the in-the-gaps method of TNC commit. + */ +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) +{ + int i; + + ubifs_get_lprops(c); + /* Copy the LPROPS_DIRTY_IDX heap */ + c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt; + memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr, + sizeof(void *) * c->dirty_idx.cnt); + /* Sort it so that the dirtiest is now at the end */ + sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *), + (int (*)(const void *, const void *))cmp_dirty_idx, + (void (*)(void *, void *, int))swap_dirty_idx); + dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt); + if (c->dirty_idx.cnt) + dbg_find("dirtiest index LEB is %d with dirty %d and free %d", + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free); + /* Replace the lprops pointers with LEB numbers */ + for (i = 0; i < c->dirty_idx.cnt; i++) + c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum; + ubifs_release_lprops(c); + return 0; +} + +/** + * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_dirty_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude non-index LEBs */ + if (!(lprops->flags & LPROPS_INDEX)) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < c->min_idx_node_sz) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * find_dirty_idx_leb - find a dirty index LEB. + * @c: the UBIFS file-system description object + * + * This function returns LEB number upon success and a negative error code upon + * failure. In particular, -ENOSPC is returned if a dirty index LEB is not + * found. 
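/*
 * Editor's note: an illustrative sketch of how the "dirtiest index LEB"
 * array built by 'ubifs_save_dirty_idx_lnums()' above is meant to be
 * consumed by 'find_dirtiest_idx_leb()' further below - the array is sorted
 * so the dirtiest entry is last, and candidates are popped from the end,
 * skipping any that are no longer usable.  Types and the toy_is_usable()
 * check are placeholders for the example.
 */
struct toy_dirty_idx {
	int cnt;		/* number of saved LEB numbers */
	int lnums[64];		/* LEB numbers, dirtiest last */
};

static int toy_is_usable(int lnum)
{
	(void)lnum;		/* real code checks LPROPS_TAKEN/LPROPS_INDEX */
	return 1;
}

/* Pop the dirtiest remaining candidate, or -1 if the array is exhausted */
static int toy_pop_dirtiest(struct toy_dirty_idx *d)
{
	while (d->cnt > 0) {
		int lnum = d->lnums[--d->cnt];

		if (toy_is_usable(lnum))
			return lnum;
	}
	return -1;		/* the real code returns -ENOSPC here */
}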
+ * + * Note that this function scans the entire LPT but it is called very rarely. + */ +static int find_dirty_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i, ret; + + /* Check all structures in memory first */ + data.lnum = -1; + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->uncat_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return -ENOSPC; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_dirty_idx_cb, + &data); + if (err) + return err; +found: + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + + dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x", + lprops->lnum, lprops->free, lprops->dirty, lprops->flags); + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, + lprops->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + + return lprops->lnum; +} + +/** + * get_idx_gc_leb - try to get a LEB number from trivial GC. + * @c: the UBIFS file-system description object + */ +static int get_idx_gc_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, lnum; + + err = ubifs_get_idx_gc_leb(c); + if (err < 0) + return err; + lnum = err; + /* + * The LEB was due to be unmapped after the commit but + * it is needed now for this commit. + */ + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) + return PTR_ERR(lp); + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_INDEX, -1); + if (IS_ERR(lp)) + return PTR_ERR(lp); + dbg_find("LEB %d, dirty %d and free %d flags %#x", + lp->lnum, lp->dirty, lp->free, lp->flags); + return lnum; +} + +/** + * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array. + * @c: the UBIFS file-system description object + */ +static int find_dirtiest_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int lnum; + + while (1) { + if (!c->dirty_idx.cnt) + return -ENOSPC; + /* The lprops pointers were replaced by LEB numbers */ + lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt]; + lp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lp)) + return PTR_ERR(lp); + if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX)) + continue; + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) + return PTR_ERR(lp); + break; + } + dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, + lp->free, lp->flags); + ubifs_assert(lp->flags | LPROPS_TAKEN); + ubifs_assert(lp->flags | LPROPS_INDEX); + return lnum; +} + +/** + * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit. 
+ * @c: the UBIFS file-system description object + * + * This function attempts to find an untaken index LEB with the most free and + * dirty space that can be used without overwriting index nodes that were in the + * last index committed. + */ +int ubifs_find_dirty_idx_leb(struct ubifs_info *c) +{ + int err; + + ubifs_get_lprops(c); + + /* + * We made an array of the dirtiest index LEB numbers as at the start of + * last commit. Try that array first. + */ + err = find_dirtiest_idx_leb(c); + + /* Next try scanning the entire LPT */ + if (err == -ENOSPC) + err = find_dirty_idx_leb(c); + + /* Finally take any index LEBs awaiting trivial GC */ + if (err == -ENOSPC) + err = get_idx_gc_leb(c); + + ubifs_release_lprops(c); + return err; +} diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c new file mode 100644 index 00000000..ded29f62 --- /dev/null +++ b/fs/ubifs/gc.c @@ -0,0 +1,985 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements garbage collection. The procedure for garbage collection + * is different depending on whether a LEB as an index LEB (contains index + * nodes) or not. For non-index LEBs, garbage collection finds a LEB which + * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete + * nodes to the journal, at which point the garbage-collected LEB is free to be + * reused. For index LEBs, garbage collection marks the non-obsolete index nodes + * dirty in the TNC, and after the next commit, the garbage-collected LEB is + * to be reused. Garbage collection will cause the number of dirty index nodes + * to grow, however sufficient space is reserved for the index to ensure the + * commit will never run out of space. + * + * Notes about dead watermark. At current UBIFS implementation we assume that + * LEBs which have less than @c->dead_wm bytes of free + dirty space are full + * and not worth garbage-collecting. The dead watermark is one min. I/O unit + * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS + * Garbage Collector has to synchronize the GC head's write buffer before + * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can + * actually reclaim even very small pieces of dirty space by garbage collecting + * enough dirty LEBs, but we do not bother doing this at this implementation. + * + * Notes about dark watermark. The results of GC work depends on how big are + * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed, + * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would + * have to waste large pieces of free space at the end of LEB B, because nodes + * from LEB A would not fit. And the worst situation is when all nodes are of + * maximum size. 
So the dark watermark is the amount of free + dirty space in an LEB
+ * which is guaranteed to be reclaimable. If an LEB has less space, the GC
+ * might be unable to reclaim it. So, LEBs with free + dirty greater than the
+ * dark watermark are "good" LEBs from GC's point of view. The other LEBs are
+ * not so good, and GC takes extra care when moving them.
+ */
+
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/list_sort.h>
+#include "ubifs.h"
+
+/*
+ * GC may need to move more than one LEB to make progress. The below constants
+ * define "soft" and "hard" limits on the number of LEBs the garbage collector
+ * may move.
+ */
+#define SOFT_LEBS_LIMIT 4
+#define HARD_LEBS_LIMIT 32
+
+/**
+ * switch_gc_head - switch the garbage collection journal head.
+ * @c: UBIFS file-system description object
+ *
+ * This function switches the GC head to the next LEB which is reserved in
+ * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
+ * and other negative error codes in case of failure.
+ */
+static int switch_gc_head(struct ubifs_info *c)
+{
+	int err, gc_lnum = c->gc_lnum;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+	ubifs_assert(gc_lnum != -1);
+	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
+	       wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
+	       c->leb_size - wbuf->offs - wbuf->used);
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		return err;
+
+	/*
+	 * The GC write-buffer was synchronized, we may safely unmap
+	 * 'c->gc_lnum'.
+	 */
+	err = ubifs_leb_unmap(c, gc_lnum);
+	if (err)
+		return err;
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		return err;
+
+	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
+	if (err)
+		return err;
+
+	c->gc_lnum = -1;
+	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
+	return err;
+}
+
+/**
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @b: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	if (a == b)
+		return 0;
+
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+	ubifs_assert(sa->type == UBIFS_DATA_NODE);
+	ubifs_assert(sb->type == UBIFS_DATA_NODE);
+
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		unsigned int blka = key_block(c, &sa->key);
+		unsigned int blkb = key_block(c, &sb->key);
+
+		if (blka <= blkb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @b: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and are sorted in descending length order. Directory entry nodes go
+ * after inode nodes and are sorted in ascending hash value order.
+ */ +static int nondata_nodes_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + if (a == b) + return 0; + + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + + ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY && + key_type(c, &sb->key) != UBIFS_DATA_KEY); + ubifs_assert(sa->type != UBIFS_DATA_NODE && + sb->type != UBIFS_DATA_NODE); + + /* Inodes go before directory entries */ + if (sa->type == UBIFS_INO_NODE) { + if (sb->type == UBIFS_INO_NODE) + return sb->len - sa->len; + return -1; + } + if (sb->type == UBIFS_INO_NODE) + return 1; + + ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY || + key_type(c, &sa->key) == UBIFS_XENT_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY || + key_type(c, &sb->key) == UBIFS_XENT_KEY); + ubifs_assert(sa->type == UBIFS_DENT_NODE || + sa->type == UBIFS_XENT_NODE); + ubifs_assert(sb->type == UBIFS_DENT_NODE || + sb->type == UBIFS_XENT_NODE); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + uint32_t hasha = key_hash(c, &sa->key); + uint32_t hashb = key_hash(c, &sb->key); + + if (hasha <= hashb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/** + * sort_nodes - sort nodes for GC. + * @c: UBIFS file-system description object + * @sleb: describes nodes to sort and contains the result on exit + * @nondata: contains non-data nodes on exit + * @min: minimum node size is returned here + * + * This function sorts the list of inodes to garbage collect. First of all, it + * kills obsolete nodes and separates data and non-data nodes to the + * @sleb->nodes and @nondata lists correspondingly. + * + * Data nodes are then sorted in block number order - this is important for + * bulk-read; data nodes with lower inode number go before data nodes with + * higher inode number, and data nodes with lower block number go before data + * nodes with higher block number; + * + * Non-data nodes are sorted as follows. + * o First go inode nodes - they are sorted in descending length order. + * o Then go directory entry nodes - they are sorted in hash order, which + * should supposedly optimize 'readdir()'. Direntry nodes with lower parent + * inode number go before direntry nodes with higher parent inode number, + * and direntry nodes with lower name hash values go before direntry nodes + * with higher name hash values. + * + * This function returns zero in case of success and a negative error code in + * case of failure. 
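/*
 * Editor's note: a small userspace model of the GC node ordering established
 * by the comparators above and by 'sort_nodes()' below - data nodes sorted by
 * (inode, block), non-data nodes with inode nodes first (longest first) and
 * directory entries after them in (parent inode, name hash) order.  qsort()
 * is used instead of the kernel's list_sort(); the struct and field names are
 * invented for the example.
 */
#include <stdlib.h>
#include <stdint.h>

struct toy_node {
	int is_inode;		/* 1: inode node, 0: directory entry node */
	uint64_t inum;		/* inode number (parent inode for dents) */
	uint32_t block;		/* data block number (data nodes only) */
	uint32_t hash;		/* name hash (dent nodes only) */
	int len;		/* node length */
};

static int toy_data_cmp(const void *pa, const void *pb)
{
	const struct toy_node *a = pa, *b = pb;

	if (a->inum != b->inum)
		return a->inum < b->inum ? -1 : 1;
	if (a->block != b->block)
		return a->block < b->block ? -1 : 1;
	return 0;
}

static int toy_nondata_cmp(const void *pa, const void *pb)
{
	const struct toy_node *a = pa, *b = pb;

	/* Inode nodes go first, sorted by descending length */
	if (a->is_inode && b->is_inode)
		return b->len - a->len;
	if (a->is_inode != b->is_inode)
		return a->is_inode ? -1 : 1;
	/* Then directory entries, by parent inode and ascending name hash */
	if (a->inum != b->inum)
		return a->inum < b->inum ? -1 : 1;
	if (a->hash != b->hash)
		return a->hash < b->hash ? -1 : 1;
	return 0;
}

/* Usage: qsort(nodes, n, sizeof(struct toy_node), toy_data_cmp); */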
+ */ +static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct list_head *nondata, int *min) +{ + int err; + struct ubifs_scan_node *snod, *tmp; + + *min = INT_MAX; + + /* Separate data nodes and non-data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + ubifs_assert(snod->type == UBIFS_INO_NODE || + snod->type == UBIFS_DATA_NODE || + snod->type == UBIFS_DENT_NODE || + snod->type == UBIFS_XENT_NODE || + snod->type == UBIFS_TRUN_NODE); + + if (snod->type != UBIFS_INO_NODE && + snod->type != UBIFS_DATA_NODE && + snod->type != UBIFS_DENT_NODE && + snod->type != UBIFS_XENT_NODE) { + /* Probably truncation node, zap it */ + list_del(&snod->list); + kfree(snod); + continue; + } + + ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY || + key_type(c, &snod->key) == UBIFS_INO_KEY || + key_type(c, &snod->key) == UBIFS_DENT_KEY || + key_type(c, &snod->key) == UBIFS_XENT_KEY); + + err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, + snod->offs, 0); + if (err < 0) + return err; + + if (!err) { + /* The node is obsolete, remove it from the list */ + list_del(&snod->list); + kfree(snod); + continue; + } + + if (snod->len < *min) + *min = snod->len; + + if (key_type(c, &snod->key) != UBIFS_DATA_KEY) + list_move_tail(&snod->list, nondata); + } + + /* Sort data and non-data nodes */ + list_sort(c, &sleb->nodes, &data_nodes_cmp); + list_sort(c, nondata, &nondata_nodes_cmp); + + err = dbg_check_data_nodes_order(c, &sleb->nodes); + if (err) + return err; + err = dbg_check_nondata_nodes_order(c, nondata); + if (err) + return err; + return 0; +} + +/** + * move_node - move a node. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * @snod: the mode to move + * @wbuf: write-buffer to move node to + * + * This function moves node @snod to @wbuf, changes TNC correspondingly, and + * destroys @snod. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) +{ + int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; + + cond_resched(); + err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); + if (err) + return err; + + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + list_del(&snod->list); + kfree(snod); + return err; +} + +/** + * move_nodes - move nodes. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head. This function returns zero in case of success, %-EAGAIN if + * commit is required, and other negative error codes in case of other + * failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + int err, min; + LIST_HEAD(nondata); + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + if (wbuf->lnum == -1) { + /* + * The GC journal head is not set, because it is the first GC + * invocation since mount. + */ + err = switch_gc_head(c); + if (err) + return err; + } + + err = sort_nodes(c, sleb, &nondata, &min); + if (err) + goto out; + + /* Write nodes to their new location. 
Use the first-fit strategy */ + while (1) { + int avail; + struct ubifs_scan_node *snod, *tmp; + + /* Move data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (snod->len > avail) + /* + * Do not skip data nodes in order to optimize + * bulk-read. + */ + break; + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + + /* Move non-data nodes */ + list_for_each_entry_safe(snod, tmp, &nondata, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (avail < min) + break; + + if (snod->len > avail) { + /* + * Keep going only if this is an inode with + * some data. Otherwise stop and switch the GC + * head. IOW, we assume that data-less inode + * nodes and direntry nodes are roughly of the + * same size. + */ + if (key_type(c, &snod->key) == UBIFS_DENT_KEY || + snod->len == UBIFS_INO_NODE_SZ) + break; + continue; + } + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + + if (list_empty(&sleb->nodes) && list_empty(&nondata)) + break; + + /* + * Waste the rest of the space in the LEB and switch to the + * next LEB. + */ + err = switch_gc_head(c); + if (err) + goto out; + } + + return 0; + +out: + list_splice_tail(&nondata, &sleb->nodes); + return err; +} + +/** + * gc_sync_wbufs - sync write-buffers for GC. + * @c: UBIFS file-system description object + * + * We must guarantee that obsoleting nodes are on flash. Unfortunately they may + * be in a write-buffer instead. That is, a node could be written to a + * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is + * erased before the write-buffer is sync'd and then there is an unclean + * unmount, then an existing node is lost. To avoid this, we sync all + * write-buffers. + * + * This function returns %0 on success or a negative error code on failure. + */ +static int gc_sync_wbufs(struct ubifs_info *c) +{ + int err, i; + + for (i = 0; i < c->jhead_cnt; i++) { + if (i == GCHD) + continue; + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + return 0; +} + +/** + * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock. + * @c: UBIFS file-system description object + * @lp: describes the LEB to garbage collect + * + * This function garbage-collects an LEB and returns one of the @LEB_FREED, + * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is + * required, and other negative error codes in case of failures. + */ +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + int err = 0, lnum = lp->lnum; + + ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 || + c->need_recovery); + ubifs_assert(c->gc_lnum != lnum); + ubifs_assert(wbuf->lnum != lnum); + + if (lp->free + lp->dirty == c->leb_size) { + /* Special case - a free LEB */ + dbg_gc("LEB %d is free, return it", lp->lnum); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + + if (lp->free != c->leb_size) { + /* + * Write buffers must be sync'd before unmapping + * freeable LEBs, because one of them may contain data + * which obsoletes something in 'lp->pnum'. 
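/*
 * Editor's note: a simplified model of one pass of the node-moving policy in
 * 'move_nodes()' above.  Data nodes are moved strictly in order and the pass
 * stops at the first one that does not fit (to keep the bulk-read friendly
 * ordering); non-data nodes may be skipped individually when they do not fit,
 * as long as the smallest remaining node (@min) could still fit.  The real
 * code applies a finer skip-or-stop rule based on the node type; types and
 * the move() callback here are placeholders for the example.
 */
struct toy_gc_node { int len; struct toy_gc_node *next; };

static int toy_move_pass(struct toy_gc_node *data, struct toy_gc_node *nondata,
			 int avail, int min, int (*move)(struct toy_gc_node *))
{
	struct toy_gc_node *n;

	/* Data nodes: move in order, stop at the first that does not fit */
	for (n = data; n; n = n->next) {
		if (n->len > avail)
			break;
		if (move(n))
			return -1;
		avail -= n->len;
	}
	/* Non-data nodes: individual nodes may be skipped if they do not fit */
	for (n = nondata; n; n = n->next) {
		if (avail < min)
			break;		/* nothing left can fit */
		if (n->len > avail)
			continue;	/* try a smaller node */
		if (move(n))
			return -1;
		avail -= n->len;
	}
	return avail;	/* the real code then switches the GC head and repeats */
}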
+ */ + err = gc_sync_wbufs(c); + if (err) + return err; + err = ubifs_change_one_lp(c, lp->lnum, c->leb_size, + 0, 0, 0, 0); + if (err) + return err; + } + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + return err; + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + return LEB_RETAINED; + } + + return LEB_FREED; + } + + /* + * We scan the entire LEB even though we only really need to scan up to + * (c->leb_size - lp->free). + */ + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + ubifs_assert(!list_empty(&sleb->nodes)); + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_gced_idx_leb *idx_gc; + + dbg_gc("indexing LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx = snod->node; + int level = le16_to_cpu(idx->level); + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + key_read(c, ubifs_idx_key(c, idx), &snod->key); + err = ubifs_dirty_idx_node(c, &snod->key, level, lnum, + snod->offs); + if (err) + goto out; + } + + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + + idx_gc->lnum = lnum; + idx_gc->unmap = 0; + list_add(&idx_gc->list, &c->idx_gc); + + /* + * Don't release the LEB until after the next commit, because + * it may contain data which is needed for recovery. So + * although we freed this LEB, it will become usable only after + * the commit. + */ + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, + LPROPS_INDEX, 1); + if (err) + goto out; + err = LEB_FREED_IDX; + } else { + dbg_gc("data LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + + err = move_nodes(c, sleb); + if (err) + goto out_inc_seq; + + err = gc_sync_wbufs(c); + if (err) + goto out_inc_seq; + + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); + if (err) + goto out_inc_seq; + + /* Allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + err = LEB_RETAINED; + } else { + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out; + + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + + err = LEB_FREED; + } + } + +out: + ubifs_scan_destroy(sleb); + return err; + +out_inc_seq: + /* We may have moved at least some nodes so allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + goto out; +} + +/** + * ubifs_garbage_collect - UBIFS garbage collector. + * @c: UBIFS file-system description object + * @anyway: do GC even if there are free LEBs + * + * This function does out-of-place garbage collection. The return codes are: + * o positive LEB number if the LEB has been freed and may be used; + * o %-EAGAIN if the caller has to run commit; + * o %-ENOSPC if GC failed to make any progress; + * o other negative error codes in case of other errors. + * + * Garbage collector writes data to the journal when GC'ing data LEBs, and just + * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point + * commit may be required. But commit cannot be run from inside GC, because the + * caller might be holding the commit lock, so %-EAGAIN is returned instead; + * And this error code means that the caller has to run commit, and re-run GC + * if there is still no free space. 
+ * + * There are many reasons why this function may return %-EAGAIN: + * o the log is full and there is no space to write an LEB reference for + * @c->gc_lnum; + * o the journal is too large and exceeds size limitations; + * o GC moved indexing LEBs, but they can be used only after the commit; + * o the shrinker fails to find clean znodes to free and requests the commit; + * o etc. + * + * Note, if the file-system is close to be full, this function may return + * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of + * the function. E.g., this happens if the limits on the journal size are too + * tough and GC writes too much to the journal before an LEB is freed. This + * might also mean that the journal is too large, and the TNC becomes to big, + * so that the shrinker is constantly called, finds not clean znodes to free, + * and requests commit. Well, this may also happen if the journal is all right, + * but another kernel process consumes too much memory. Anyway, infinite + * %-EAGAIN may happen, but in some extreme/misconfiguration cases. + */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway) +{ + int i, err, ret, min_space = c->dead_wm; + struct ubifs_lprops lp; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + ubifs_assert_cmt_locked(c); + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (ubifs_gc_should_commit(c)) + return -EAGAIN; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_error) { + ret = -EROFS; + goto out_unlock; + } + + /* We expect the write-buffer to be empty on entry */ + ubifs_assert(!wbuf->used); + + for (i = 0; ; i++) { + int space_before = c->leb_size - wbuf->offs - wbuf->used; + int space_after; + + cond_resched(); + + /* Give the commit an opportunity to run */ + if (ubifs_gc_should_commit(c)) { + ret = -EAGAIN; + break; + } + + if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) { + /* + * We've done enough iterations. Indexing LEBs were + * moved and will be available after the commit. + */ + dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + break; + } + + if (i > HARD_LEBS_LIMIT) { + /* + * We've moved too many LEBs and have not made + * progress, give up. + */ + dbg_gc("hard limit, -ENOSPC"); + ret = -ENOSPC; + break; + } + + /* + * Empty and freeable LEBs can turn up while we waited for + * the wbuf lock, or while we have been running GC. In that + * case, we should just return one of those instead of + * continuing to GC dirty LEBs. Hence we request + * 'ubifs_find_dirty_leb()' to return an empty LEB if it can. + */ + ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1); + if (ret) { + if (ret == -ENOSPC) + dbg_gc("no more dirty LEBs"); + break; + } + + dbg_gc("found LEB %d: free %d, dirty %d, sum %d " + "(min. space %d)", lp.lnum, lp.free, lp.dirty, + lp.free + lp.dirty, min_space); + + space_before = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum == -1) + space_before = 0; + + ret = ubifs_garbage_collect_leb(c, &lp); + if (ret < 0) { + if (ret == -EAGAIN) { + /* + * This is not error, so we have to return the + * LEB to lprops. But if 'ubifs_return_leb()' + * fails, its failure code is propagated to the + * caller instead of the original '-EAGAIN'. 
+ */ + err = ubifs_return_leb(c, lp.lnum); + if (err) + ret = err; + break; + } + goto out; + } + + if (ret == LEB_FREED) { + /* An LEB has been freed and is ready for use */ + dbg_gc("LEB %d freed, return", lp.lnum); + ret = lp.lnum; + break; + } + + if (ret == LEB_FREED_IDX) { + /* + * This was an indexing LEB and it cannot be + * immediately used. And instead of requesting the + * commit straight away, we try to garbage collect some + * more. + */ + dbg_gc("indexing LEB %d freed, continue", lp.lnum); + continue; + } + + ubifs_assert(ret == LEB_RETAINED); + space_after = c->leb_size - wbuf->offs - wbuf->used; + dbg_gc("LEB %d retained, freed %d bytes", lp.lnum, + space_after - space_before); + + if (space_after > space_before) { + /* GC makes progress, keep working */ + min_space >>= 1; + if (min_space < c->dead_wm) + min_space = c->dead_wm; + continue; + } + + dbg_gc("did not make progress"); + + /* + * GC moved an LEB bud have not done any progress. This means + * that the previous GC head LEB contained too few free space + * and the LEB which was GC'ed contained only large nodes which + * did not fit that space. + * + * We can do 2 things: + * 1. pick another LEB in a hope it'll contain a small node + * which will fit the space we have at the end of current GC + * head LEB, but there is no guarantee, so we try this out + * unless we have already been working for too long; + * 2. request an LEB with more dirty space, which will force + * 'ubifs_find_dirty_leb()' to start scanning the lprops + * table, instead of just picking one from the heap + * (previously it already picked the dirtiest LEB). + */ + if (i < SOFT_LEBS_LIMIT) { + dbg_gc("try again"); + continue; + } + + min_space <<= 1; + if (min_space > c->dark_wm) + min_space = c->dark_wm; + dbg_gc("set min. space to %d", min_space); + } + + if (ret == -ENOSPC && !list_empty(&c->idx_gc)) { + dbg_gc("no space, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + if (!err) + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) { + ret = err; + goto out; + } +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return ret; + +out: + ubifs_assert(ret < 0); + ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); + ubifs_wbuf_sync_nolock(wbuf); + ubifs_ro_mode(c, ret); + mutex_unlock(&wbuf->io_mutex); + ubifs_return_leb(c, lp.lnum); + return ret; +} + +/** + * ubifs_gc_start_commit - garbage collection at start of commit. + * @c: UBIFS file-system description object + * + * If a LEB has only dirty and free space, then we may safely unmap it and make + * it free. Note, we cannot do this with indexing LEBs because dirty space may + * correspond index nodes that are required for recovery. In that case, the + * LEB cannot be unmapped until after the next commit. + * + * This function returns %0 upon success and a negative error code upon failure. + */ +int ubifs_gc_start_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + const struct ubifs_lprops *lp; + int err = 0, flags; + + ubifs_get_lprops(c); + + /* + * Unmap (non-index) freeable LEBs. Note that recovery requires that all + * wbufs are sync'd before this, which is done in 'do_commit()'. 
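/*
 * Editor's note: a standalone sketch of the retry policy of
 * 'ubifs_garbage_collect()' above - when a GC pass frees space, the required
 * dirty-space threshold is halved (but kept at or above the dead watermark);
 * when it makes no progress, the threshold is doubled (but capped at the dark
 * watermark), which eventually forces the finder to look for dirtier LEBs.
 * The function and variable names are invented, and the numbers in main() are
 * arbitrary; they are not taken from the patch.
 */
#include <stdio.h>

static int adjust_min_space(int min_space, int made_progress,
			    int dead_wm, int dark_wm)
{
	if (made_progress) {
		min_space >>= 1;
		if (min_space < dead_wm)
			min_space = dead_wm;
	} else {
		min_space <<= 1;
		if (min_space > dark_wm)
			min_space = dark_wm;
	}
	return min_space;
}

int main(void)
{
	int min_space = 2048;	/* e.g. start at the dead watermark */

	min_space = adjust_min_space(min_space, 0, 2048, 8192);
	printf("after a pass with no progress: min_space = %d\n", min_space);
	return 0;
}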
+ */ + while (1) { + lp = ubifs_fast_find_freeable(c); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + goto out; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + } + + /* Mark GC'd index LEBs OK to unmap after this commit finishes */ + list_for_each_entry(idx_gc, &c->idx_gc, list) + idx_gc->unmap = 1; + + /* Record index freeable LEBs for unmapping after commit */ + while (1) { + lp = ubifs_fast_find_frdi_idx(c); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(lp->flags & LPROPS_INDEX); + /* Don't release the LEB until after the next commit */ + flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + kfree(idx_gc); + goto out; + } + ubifs_assert(lp->flags & LPROPS_TAKEN); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + idx_gc->lnum = lp->lnum; + idx_gc->unmap = 1; + list_add(&idx_gc->list, &c->idx_gc); + } +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_gc_end_commit - garbage collection at end of commit. + * @c: UBIFS file-system description object + * + * This function completes out-of-place garbage collection of index LEBs. + */ +int ubifs_gc_end_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc, *tmp; + struct ubifs_wbuf *wbuf; + int err = 0; + + wbuf = &c->jheads[GCHD].wbuf; + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list) + if (idx_gc->unmap) { + dbg_gc("LEB %d", idx_gc->lnum); + err = ubifs_leb_unmap(c, idx_gc->lnum); + if (err) + goto out; + err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC, + LPROPS_NC, 0, LPROPS_TAKEN, -1); + if (err) + goto out; + list_del(&idx_gc->list); + kfree(idx_gc); + } +out: + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_destroy_idx_gc - destroy idx_gc list. + * @c: UBIFS file-system description object + * + * This function destroys the @c->idx_gc list. It is called when unmounting + * so locks are not needed. Returns zero in case of success and a negative + * error code in case of failure. + */ +void ubifs_destroy_idx_gc(struct ubifs_info *c) +{ + while (!list_empty(&c->idx_gc)) { + struct ubifs_gced_idx_leb *idx_gc; + + idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, + list); + c->idx_gc_cnt -= 1; + list_del(&idx_gc->list); + kfree(idx_gc); + } +} + +/** + * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list. + * @c: UBIFS file-system description object + * + * Called during start commit so locks are not needed. 
+ */ +int ubifs_get_idx_gc_leb(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + int lnum; + + if (list_empty(&c->idx_gc)) + return -ENOSPC; + idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list); + lnum = idx_gc->lnum; + /* c->idx_gc_cnt is updated by the caller when lprops are updated */ + list_del(&idx_gc->list); + kfree(idx_gc); + return lnum; +} diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c new file mode 100644 index 00000000..3be645e0 --- /dev/null +++ b/fs/ubifs/io.c @@ -0,0 +1,1054 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + * Zoltan Sogor + */ + +/* + * This file implements UBIFS I/O subsystem which provides various I/O-related + * helper functions (reading/writing/checking/validating nodes) and implements + * write-buffering support. Write buffers help to save space which otherwise + * would have been wasted for padding to the nearest minimal I/O unit boundary. + * Instead, data first goes to the write-buffer and is flushed when the + * buffer is full or when it is not used for some time (by timer). This is + * similar to the mechanism is used by JFFS2. + * + * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum + * write size (@c->max_write_size). The latter is the maximum amount of bytes + * the underlying flash is able to program at a time, and writing in + * @c->max_write_size units should presumably be faster. Obviously, + * @c->min_io_size <= @c->max_write_size. Write-buffers are of + * @c->max_write_size bytes in size for maximum performance. However, when a + * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size + * boundary) which contains data is written, not the whole write-buffer, + * because this is more space-efficient. + * + * This optimization adds few complications to the code. Indeed, on the one + * hand, we want to write in optimal @c->max_write_size bytes chunks, which + * also means aligning writes at the @c->max_write_size bytes offsets. On the + * other hand, we do not want to waste space when synchronizing the write + * buffer, so during synchronization we writes in smaller chunks. And this makes + * the next write offset to be not aligned to @c->max_write_size bytes. So the + * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned + * to @c->max_write_size bytes again. We do this by temporarily shrinking + * write-buffer size (@wbuf->size). + * + * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by + * mutexes defined inside these objects. Since sometimes upper-level code + * has to lock the write-buffer (e.g. 
journal space reservation code), many
+ * functions related to write-buffers have a "nolock" suffix, which means that
+ * the caller has to lock the write-buffer before calling such a function.
+ *
+ * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not
+ * aligned, UBIFS starts the next node from the aligned address, and the padded
+ * bytes may contain any rubbish. In other words, UBIFS does not put padding
+ * bytes in those small gaps. Common headers of nodes store real node lengths,
+ * not aligned lengths. Indexing nodes also store real lengths in branches.
+ *
+ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
+ * uses padding nodes or padding bytes, if the padding node does not fit.
+ *
+ * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when
+ * they are read from the flash media.
+ */
+
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include "ubifs.h"
+
+/**
+ * ubifs_ro_mode - switch UBIFS to read-only mode.
+ * @c: UBIFS file-system description object
+ * @err: error code which is the reason of switching to R/O mode
+ */
+void ubifs_ro_mode(struct ubifs_info *c, int err)
+{
+	if (!c->ro_error) {
+		c->ro_error = 1;
+		c->no_chk_data_crc = 0;
+		c->vfs_sb->s_flags |= MS_RDONLY;
+		ubifs_warn("switched to read-only mode, error %d", err);
+		dbg_dump_stack();
+	}
+}
+
+/**
+ * ubifs_check_node - check node.
+ * @c: UBIFS file-system description object
+ * @buf: node to check
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ * @must_chk_crc: indicates whether to always check the CRC
+ *
+ * This function checks node magic number and CRC checksum. This function also
+ * validates node length to prevent UBIFS from becoming confused when an
+ * attacker feeds it a file-system image with incorrect nodes. For example, a
+ * too large node length in the common header could cause UBIFS to read memory
+ * outside of the allocated buffer when checking the CRC checksum.
+ *
+ * This function may skip data node CRC checking if @c->no_chk_data_crc is
+ * true, which is controlled by the corresponding UBIFS mount option. However,
+ * if @must_chk_crc is true, then @c->no_chk_data_crc is ignored and the CRC is
+ * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are
+ * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and the
+ * CRC is checked. This is because during mounting or re-mounting from R/O mode
+ * to R/W mode we may read journal nodes (when replaying the journal or doing
+ * recovery) and the journal nodes may potentially be corrupted, so checking is
+ * required.
+ *
+ * This function returns zero in case of success and %-EUCLEAN in case of bad
+ * CRC or magic.
+ */ +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet, int must_chk_crc) +{ + int err = -EINVAL, type, node_len; + uint32_t crc, node_crc, magic; + const struct ubifs_ch *ch = buf; + + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + + magic = le32_to_cpu(ch->magic); + if (magic != UBIFS_NODE_MAGIC) { + if (!quiet) + ubifs_err("bad magic %#08x, expected %#08x", + magic, UBIFS_NODE_MAGIC); + err = -EUCLEAN; + goto out; + } + + type = ch->node_type; + if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { + if (!quiet) + ubifs_err("bad node type %d", type); + goto out; + } + + node_len = le32_to_cpu(ch->len); + if (node_len + offs > c->leb_size) + goto out_len; + + if (c->ranges[type].max_len == 0) { + if (node_len != c->ranges[type].len) + goto out_len; + } else if (node_len < c->ranges[type].min_len || + node_len > c->ranges[type].max_len) + goto out_len; + + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting && + !c->remounting_rw && c->no_chk_data_crc) + return 0; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) { + if (!quiet) + ubifs_err("bad CRC: calculated %#08x, read %#08x", + crc, node_crc); + err = -EUCLEAN; + goto out; + } + + return 0; + +out_len: + if (!quiet) + ubifs_err("bad node length %d", node_len); +out: + if (!quiet) { + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + return err; +} + +/** + * ubifs_pad - pad flash space. + * @c: UBIFS file-system description object + * @buf: buffer to put padding to + * @pad: how many bytes to pad + * + * The flash media obliges us to write only in chunks of %c->min_io_size and + * when we have to write less data we add padding node to the write-buffer and + * pad it to the next minimal I/O unit's boundary. Padding nodes help when the + * media is being scanned. If the amount of wasted space is not enough to fit a + * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes + * pattern (%UBIFS_PADDING_BYTE). + * + * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is + * used. + */ +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad) +{ + uint32_t crc; + + ubifs_assert(pad >= 0 && !(pad & 7)); + + if (pad >= UBIFS_PAD_NODE_SZ) { + struct ubifs_ch *ch = buf; + struct ubifs_pad_node *pad_node = buf; + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->node_type = UBIFS_PAD_NODE; + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->padding[0] = ch->padding[1] = 0; + ch->sqnum = 0; + ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ); + pad -= UBIFS_PAD_NODE_SZ; + pad_node->pad_len = cpu_to_le32(pad); + crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8); + ch->crc = cpu_to_le32(crc); + memset(buf + UBIFS_PAD_NODE_SZ, 0, pad); + } else if (pad > 0) + /* Too little space, padding node won't fit */ + memset(buf, UBIFS_PADDING_BYTE, pad); +} + +/** + * next_sqnum - get next sequence number. 
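/*
 * Editor's note: a self-contained sketch of the integrity check performed by
 * 'ubifs_check_node()' above - verify the magic number in the common header
 * and a CRC computed over the node starting at byte 8 (i.e. skipping the
 * magic and CRC fields themselves), never trusting the length field blindly.
 * The CRC-32 below is a generic bitwise implementation for illustration and
 * is not guaranteed to be bit-for-bit identical to the kernel's crc32()
 * helper; the header is reduced to the fields the check needs, and endian
 * conversion (the on-flash header is little-endian) is omitted.
 */
#include <stdint.h>
#include <string.h>

#define TOY_NODE_MAGIC	0x06101831u	/* UBIFS node magic */
#define TOY_CRC32_INIT	0xFFFFFFFFu

struct toy_ch {			/* reduced common header, native-endian */
	uint32_t magic;
	uint32_t crc;		/* CRC of bytes [8, len) */
	uint32_t len;		/* full node length, including the header */
};

static uint32_t toy_crc32(uint32_t crc, const uint8_t *p, size_t n)
{
	while (n--) {
		int i;

		crc ^= *p++;
		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1));
	}
	return crc;
}

/* Return 0 if the node looks valid, -1 on bad length, magic or CRC */
static int toy_check_node(const void *buf, uint32_t buf_len)
{
	struct toy_ch ch;

	if (buf_len < sizeof(ch))
		return -1;
	memcpy(&ch, buf, sizeof(ch));
	if (ch.magic != TOY_NODE_MAGIC)
		return -1;
	if (ch.len < sizeof(ch) || ch.len > buf_len)
		return -1;	/* reject lengths pointing outside the buffer */
	if (toy_crc32(TOY_CRC32_INIT, (const uint8_t *)buf + 8, ch.len - 8) !=
	    ch.crc)
		return -1;
	return 0;
}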
+ * @c: UBIFS file-system description object + */ +static unsigned long long next_sqnum(struct ubifs_info *c) +{ + unsigned long long sqnum; + + spin_lock(&c->cnt_lock); + sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + + if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) { + if (sqnum >= SQNUM_WATERMARK) { + ubifs_err("sequence number overflow %llu, end of life", + sqnum); + ubifs_ro_mode(c, -EINVAL); + } + ubifs_warn("running out of sequence numbers, end of life soon"); + } + + return sqnum; +} + +/** + * ubifs_prepare_node - prepare node to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @pad: if the buffer has to be padded + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC, fills the common header, and adds proper padding up to + * the next minimum I/O unit if @pad is not zero. + */ +void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); + + if (pad) { + len = ALIGN(len, 8); + pad = ALIGN(len, c->min_io_size) - len; + ubifs_pad(c, node + len, pad); + } +} + +/** + * ubifs_prep_grp_node - prepare node of a group to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @last: indicates the last node of the group + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC and fills the common header. + */ +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + if (last) + ch->group_type = UBIFS_LAST_OF_NODE_GROUP; + else + ch->group_type = UBIFS_IN_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); +} + +/** + * wbuf_timer_callback - write-buffer timer callback function. + * @data: timer data (write-buffer descriptor) + * + * This function is called when the write-buffer timer expires. + */ +static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) +{ + struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); + + dbg_io("jhead %s", dbg_jhead(wbuf->jhead)); + wbuf->need_sync = 1; + wbuf->c->need_wbuf_sync = 1; + ubifs_wake_up_bgt(wbuf->c); + return HRTIMER_NORESTART; +} + +/** + * new_wbuf_timer - start new write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + ubifs_assert(!hrtimer_active(&wbuf->timer)); + + if (wbuf->no_timer) + return; + dbg_io("set timer for jhead %s, %llu-%llu millisecs", + dbg_jhead(wbuf->jhead), + div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), + div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, + USEC_PER_SEC)); + hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, + HRTIMER_MODE_REL); +} + +/** + * cancel_wbuf_timer - cancel write-buffer timer. 
+ * @wbuf: write-buffer descriptor + */ +static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + if (wbuf->no_timer) + return; + wbuf->need_sync = 0; + hrtimer_cancel(&wbuf->timer); +} + +/** + * ubifs_wbuf_sync_nolock - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This function synchronizes write-buffer @buf and returns zero in case of + * success or a negative error code in case of failure. + * + * Note, although write-buffers are of @c->max_write_size, this function does + * not necessarily writes all @c->max_write_size bytes to the flash. Instead, + * if the write-buffer is only partially filled with data, only the used part + * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized. + * This way we waste less space. + */ +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) +{ + struct ubifs_info *c = wbuf->c; + int err, dirt, sync_len; + + cancel_wbuf_timer_nolock(wbuf); + if (!wbuf->used || wbuf->lnum == -1) + /* Write-buffer is empty or not seeked */ + return 0; + + dbg_io("LEB %d:%d, %d bytes, jhead %s", + wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); + ubifs_assert(!(wbuf->avail & 7)); + ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); + + if (c->ro_error) + return -EROFS; + + /* + * Do not write whole write buffer but write only the minimum necessary + * amount of min. I/O units. + */ + sync_len = ALIGN(wbuf->used, c->min_io_size); + dirt = sync_len - wbuf->used; + if (dirt) + ubifs_pad(c, wbuf->buf + wbuf->used, dirt); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + sync_len, wbuf->dtype); + if (err) { + ubifs_err("cannot write %d bytes to LEB %d:%d", + sync_len, wbuf->lnum, wbuf->offs); + dbg_dump_stack(); + return err; + } + + spin_lock(&wbuf->lock); + wbuf->offs += sync_len; + /* + * Now @wbuf->offs is not necessarily aligned to @c->max_write_size. + * But our goal is to optimize writes and make sure we write in + * @c->max_write_size chunks and to @c->max_write_size-aligned offset. + * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make + * sure that @wbuf->offs + @wbuf->size is aligned to + * @c->max_write_size. This way we make sure that after next + * write-buffer flush we are again at the optimal offset (aligned to + * @c->max_write_size). + */ + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + + if (wbuf->sync_callback) + err = wbuf->sync_callback(c, wbuf->lnum, + c->leb_size - wbuf->offs, dirt); + return err; +} + +/** + * ubifs_wbuf_seek_nolock - seek write-buffer. + * @wbuf: write-buffer + * @lnum: logical eraseblock number to seek to + * @offs: logical eraseblock offset to seek to + * @dtype: data type + * + * This function targets the write-buffer to logical eraseblock @lnum:@offs. + * The write-buffer has to be empty. Returns zero in case of success and a + * negative error code in case of failure. 
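The sync path above writes only ALIGN(wbuf->used, c->min_io_size) bytes and pads the difference. A tiny user-space sketch with made-up sizes (not part of the patch) shows how the synchronized length and the padded "dirt" relate:

/* Sketch of the partial-sync length math in ubifs_wbuf_sync_nolock(). */
#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	int min_io_size = 512;			 /* example minimal I/O unit */
	int used = 700;				 /* bytes queued in the write-buffer */
	int sync_len = ALIGN(used, min_io_size); /* 1024: amount actually written */
	int dirt = sync_len - used;		 /* 324: filled by ubifs_pad() */

	printf("sync_len=%d dirt=%d\n", sync_len, dirt);
	return 0;
}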
+ */ +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, + int dtype) +{ + const struct ubifs_info *c = wbuf->c; + + dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead)); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); + ubifs_assert(offs >= 0 && offs <= c->leb_size); + ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); + ubifs_assert(lnum != wbuf->lnum); + ubifs_assert(wbuf->used == 0); + + spin_lock(&wbuf->lock); + wbuf->lnum = lnum; + wbuf->offs = offs; + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; + wbuf->used = 0; + spin_unlock(&wbuf->lock); + wbuf->dtype = dtype; + + return 0; +} + +/** + * ubifs_bg_wbufs_sync - synchronize write-buffers. + * @c: UBIFS file-system description object + * + * This function is called by background thread to synchronize write-buffers. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_bg_wbufs_sync(struct ubifs_info *c) +{ + int err, i; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (!c->need_wbuf_sync) + return 0; + c->need_wbuf_sync = 0; + + if (c->ro_error) { + err = -EROFS; + goto out_timers; + } + + dbg_io("synchronize"); + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + cond_resched(); + + /* + * If the mutex is locked then wbuf is being changed, so + * synchronization is not necessary. + */ + if (mutex_is_locked(&wbuf->io_mutex)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (!wbuf->need_sync) { + mutex_unlock(&wbuf->io_mutex); + continue; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + if (err) { + ubifs_err("cannot sync write-buffer, error %d", err); + ubifs_ro_mode(c, err); + goto out_timers; + } + } + + return 0; + +out_timers: + /* Cancel all timers to prevent repeated errors */ + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + cancel_wbuf_timer_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + } + return err; +} + +/** + * ubifs_wbuf_write_nolock - write data to flash via write-buffer. + * @wbuf: write-buffer + * @buf: node to write + * @len: node length + * + * This function writes data to flash via write-buffer @wbuf. This means that + * the last piece of the node won't reach the flash media immediately if it + * does not take whole max. write unit (@c->max_write_size). Instead, the node + * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or + * because more data are appended to the write-buffer). + * + * This function returns zero in case of success and a negative error code in + * case of failure. If the node cannot be written because there is no more + * space in this logical eraseblock, %-ENOSPC is returned. 
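Both the sync and the seek paths above pick the next write-buffer size with the same three-way rule, whose goal is to keep flash writes aligned to @c->max_write_size. A compact restatement of that rule (illustrative helper, not in the patch):

/* Sketch of the write-buffer size rule shared by ubifs_wbuf_sync_nolock()
 * and ubifs_wbuf_seek_nolock(); max_write_size is a power of two.
 */
static int pick_wbuf_size(int leb_size, int offs, int max_write_size)
{
	if (leb_size - offs < max_write_size)
		return leb_size - offs;		/* LEB tail: buffer only what is left */
	if (offs & (max_write_size - 1))
		/* Unaligned offset: stop at the next max. write unit boundary. */
		return ((offs + max_write_size - 1) & ~(max_write_size - 1)) - offs;
	return max_write_size;			/* aligned: use a full max. write unit */
}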
+ */ +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) +{ + struct ubifs_info *c = wbuf->c; + int err, written, n, aligned_len = ALIGN(len, 8); + + dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, + dbg_ntype(((struct ubifs_ch *)buf)->node_type), + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used); + ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); + ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); + ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); + ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); + ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); + + if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { + err = -ENOSPC; + goto out; + } + + cancel_wbuf_timer_nolock(wbuf); + + if (c->ro_error) + return -EROFS; + + if (aligned_len <= wbuf->avail) { + /* + * The node is not very large and fits entirely within + * write-buffer. + */ + memcpy(wbuf->buf + wbuf->used, buf, len); + + if (aligned_len == wbuf->avail) { + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, + wbuf->offs, wbuf->size, + wbuf->dtype); + if (err) + goto out; + + spin_lock(&wbuf->lock); + wbuf->offs += wbuf->size; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + } else { + spin_lock(&wbuf->lock); + wbuf->avail -= aligned_len; + wbuf->used += aligned_len; + spin_unlock(&wbuf->lock); + } + + goto exit; + } + + written = 0; + + if (wbuf->used) { + /* + * The node is large enough and does not fit entirely within + * current available space. We have to fill and flush + * write-buffer and switch to the next max. write unit. + */ + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + wbuf->offs += wbuf->size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written += wbuf->avail; + } else if (wbuf->offs & (c->max_write_size - 1)) { + /* + * The write-buffer offset is not aligned to + * @c->max_write_size and @wbuf->size is less than + * @c->max_write_size. Write @wbuf->size bytes to make sure the + * following writes are done in optimal @c->max_write_size + * chunks. + */ + dbg_io("write %d bytes to LEB %d:%d", + wbuf->size, wbuf->lnum, wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + wbuf->offs += wbuf->size; + len -= wbuf->size; + aligned_len -= wbuf->size; + written += wbuf->size; + } + + /* + * The remaining data may take more whole max. write units, so write the + * remains multiple to max. write unit size directly to the flash media. + * We align node length to 8-byte boundary because we anyway flash wbuf + * if the remaining space is less than 8 bytes. 
+ */ + n = aligned_len >> c->max_write_shift; + if (n) { + n <<= c->max_write_shift; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, + wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, + wbuf->offs, n, wbuf->dtype); + if (err) + goto out; + wbuf->offs += n; + aligned_len -= n; + len -= n; + written += n; + } + + spin_lock(&wbuf->lock); + if (aligned_len) + /* + * And now we have what's left and what does not take whole + * max. write unit, so write it to the write-buffer and we are + * done. + */ + memcpy(wbuf->buf, buf + written, len); + + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size - aligned_len; + wbuf->used = aligned_len; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + +exit: + if (wbuf->sync_callback) { + int free = c->leb_size - wbuf->offs - wbuf->used; + + err = wbuf->sync_callback(c, wbuf->lnum, free, 0); + if (err) + goto out; + } + + if (wbuf->used) + new_wbuf_timer_nolock(wbuf); + + return 0; + +out: + ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + len, wbuf->lnum, wbuf->offs, err); + dbg_dump_node(c, buf); + dbg_dump_stack(); + dbg_dump_leb(c, wbuf->lnum); + return err; +} + +/** + * ubifs_write_node - write node to the media. + * @c: UBIFS file-system description object + * @buf: the node to write + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) + * + * This function automatically fills node magic number, assigns sequence + * number, and calculates node CRC checksum. The length of the @buf buffer has + * to be aligned to the minimal I/O unit size. This function automatically + * appends padding node and padding bytes if needed. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int dtype) +{ + int err, buf_len = ALIGN(len, c->min_io_size); + + dbg_io("LEB %d:%d, %s, length %d (aligned %d)", + lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len, + buf_len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); + ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); + + if (c->ro_error) + return -EROFS; + + ubifs_prepare_node(c, buf, len, 1); + err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype); + if (err) { + ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + buf_len, lnum, offs, err); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + + return err; +} + +/** + * ubifs_read_node_wbuf - read node from the media or write-buffer. + * @wbuf: wbuf to check for un-written data + * @buf: buffer to read to + * @type: node type + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and length, checks it and stores + * in @buf. If the node partially or fully sits in the write-buffer, this + * function takes data from the buffer, otherwise it reads the flash media. + * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative + * error code in case of failure. 
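When a read overlaps the write-buffer, the request is split into a flash part and an in-memory part. The helper below mirrors the rlen computation in ubifs_read_node_wbuf() (illustrative only, not in the patch):

/* Sketch of the flash/write-buffer split done by ubifs_read_node_wbuf().
 * Bytes [offs, offs + rlen) come from flash, the rest from wbuf->buf.
 */
static void split_read(int offs, int len, int wbuf_offs, int *rlen, int *wlen)
{
	*rlen = wbuf_offs - offs;	/* portion lying below the write-buffer */
	if (*rlen < 0)
		*rlen = 0;		/* node starts inside the write-buffer */
	*wlen = len - *rlen;		/* copied from the in-memory buffer */
}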
+ */ +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs) +{ + const struct ubifs_info *c = wbuf->c; + int err, rlen, overlap; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs, + dbg_ntype(type), len, dbg_jhead(wbuf->jhead)); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubifs_read_node(c, buf, type, len, lnum, offs); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) { + /* Read everything that goes before write-buffer */ + err = ubi_read(c->ubi, lnum, buf, offs, rlen); + if (err && err != -EBADMSG) { + ubifs_err("failed to read node %d from LEB %d:%d, " + "error %d", type, lnum, offs, err); + dbg_dump_stack(); + return err; + } + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + rlen = le32_to_cpu(ch->len); + if (rlen != len) { + ubifs_err("bad node length %d, expected %d", rlen, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_read_node - read node. + * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and and length, checks it and + * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched + * and a negative error code in case of failure. + */ +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs) +{ + int err, l; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) { + ubifs_err("cannot read node %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + l = le32_to_cpu(ch->len); + if (l != len) { + ubifs_err("bad node length %d, expected %d", l, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs, + ubi_is_mapped(c->ubi, lnum)); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_wbuf_init - initialize write-buffer. 
+ * @c: UBIFS file-system description object + * @wbuf: write-buffer to initialize + * + * This function initializes write-buffer. Returns zero in case of success + * %-ENOMEM in case of failure. + */ +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) +{ + size_t size; + + wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL); + if (!wbuf->buf) + return -ENOMEM; + + size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + wbuf->inodes = kmalloc(size, GFP_KERNEL); + if (!wbuf->inodes) { + kfree(wbuf->buf); + wbuf->buf = NULL; + return -ENOMEM; + } + + wbuf->used = 0; + wbuf->lnum = wbuf->offs = -1; + /* + * If the LEB starts at the max. write size aligned address, then + * write-buffer size has to be set to @c->max_write_size. Otherwise, + * set it to something smaller so that it ends at the closest max. + * write size boundary. + */ + size = c->max_write_size - (c->leb_start % c->max_write_size); + wbuf->avail = wbuf->size = size; + wbuf->dtype = UBI_UNKNOWN; + wbuf->sync_callback = NULL; + mutex_init(&wbuf->io_mutex); + spin_lock_init(&wbuf->lock); + wbuf->c = c; + wbuf->next_ino = 0; + + hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + wbuf->timer.function = wbuf_timer_callback_nolock; + wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0); + wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT; + wbuf->delta *= 1000000000ULL; + ubifs_assert(wbuf->delta <= ULONG_MAX); + return 0; +} + +/** + * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. + * @wbuf: the write-buffer where to add + * @inum: the inode number + * + * This function adds an inode number to the inode array of the write-buffer. + */ +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum) +{ + if (!wbuf->buf) + /* NOR flash or something similar */ + return; + + spin_lock(&wbuf->lock); + if (wbuf->used) + wbuf->inodes[wbuf->next_ino++] = inum; + spin_unlock(&wbuf->lock); +} + +/** + * wbuf_has_ino - returns if the wbuf contains data from the inode. + * @wbuf: the write-buffer + * @inum: the inode number + * + * This function returns with %1 if the write-buffer contains some data from the + * given inode otherwise it returns with %0. + */ +static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum) +{ + int i, ret = 0; + + spin_lock(&wbuf->lock); + for (i = 0; i < wbuf->next_ino; i++) + if (inum == wbuf->inodes[i]) { + ret = 1; + break; + } + spin_unlock(&wbuf->lock); + + return ret; +} + +/** + * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode. + * @c: UBIFS file-system description object + * @inode: inode to synchronize + * + * This function synchronizes write-buffers which contain nodes belonging to + * @inode. Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode) +{ + int i, err = 0; + + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + if (i == GCHD) + /* + * GC head is special, do not look at it. Even if the + * head contains something related to this inode, it is + * a _copy_ of corresponding on-flash node which sits + * somewhere else. 
+ */ + continue; + + if (!wbuf_has_ino(wbuf, inode->i_ino)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (wbuf_has_ino(wbuf, inode->i_ino)) + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + + if (err) { + ubifs_ro_mode(c, err); + return err; + } + } + return 0; +} diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c new file mode 100644 index 00000000..548acf49 --- /dev/null +++ b/fs/ubifs/ioctl.c @@ -0,0 +1,205 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Zoltan Sogor + * Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements EXT2-compatible extended attribute ioctl() calls */ + +#include +#include +#include "ubifs.h" + +/** + * ubifs_set_inode_flags - set VFS inode flags. + * @inode: VFS inode to set flags for + * + * This function propagates flags from UBIFS inode object to VFS inode object. + */ +void ubifs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = ubifs_inode(inode)->flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); + if (flags & UBIFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & UBIFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & UBIFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & UBIFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* + * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. + * @ioctl_flags: flags to convert + * + * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags + * (@UBIFS_COMPR_FL, etc). + */ +static int ioctl2ubifs(int ioctl_flags) +{ + int ubifs_flags = 0; + + if (ioctl_flags & FS_COMPR_FL) + ubifs_flags |= UBIFS_COMPR_FL; + if (ioctl_flags & FS_SYNC_FL) + ubifs_flags |= UBIFS_SYNC_FL; + if (ioctl_flags & FS_APPEND_FL) + ubifs_flags |= UBIFS_APPEND_FL; + if (ioctl_flags & FS_IMMUTABLE_FL) + ubifs_flags |= UBIFS_IMMUTABLE_FL; + if (ioctl_flags & FS_DIRSYNC_FL) + ubifs_flags |= UBIFS_DIRSYNC_FL; + + return ubifs_flags; +} + +/* + * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. + * @ubifs_flags: flags to convert + * + * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags + * (@FS_COMPR_FL, etc). 
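For context, the flag conversion helpers here back the standard FS_IOC_GETFLAGS/FS_IOC_SETFLAGS interface. A minimal user-space usage sketch (the file path is hypothetical, not part of the patch):

/* User-space view of the flag ioctls implemented in this file. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_*_FL */

int make_append_only(const char *path)
{
	int flags, err = -1;
	int fd = open(path, O_RDONLY);	/* e.g. a file on a UBIFS mount */

	if (fd < 0)
		return -1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
		flags |= FS_APPEND_FL;	/* needs CAP_LINUX_IMMUTABLE, see setflags() */
		err = ioctl(fd, FS_IOC_SETFLAGS, &flags);
	}
	close(fd);
	return err;
}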
+ */ +static int ubifs2ioctl(int ubifs_flags) +{ + int ioctl_flags = 0; + + if (ubifs_flags & UBIFS_COMPR_FL) + ioctl_flags |= FS_COMPR_FL; + if (ubifs_flags & UBIFS_SYNC_FL) + ioctl_flags |= FS_SYNC_FL; + if (ubifs_flags & UBIFS_APPEND_FL) + ioctl_flags |= FS_APPEND_FL; + if (ubifs_flags & UBIFS_IMMUTABLE_FL) + ioctl_flags |= FS_IMMUTABLE_FL; + if (ubifs_flags & UBIFS_DIRSYNC_FL) + ioctl_flags |= FS_DIRSYNC_FL; + + return ioctl_flags; +} + +static int setflags(struct inode *inode, int flags) +{ + int oldflags, err, release; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ui->data_len }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + mutex_lock(&ui->ui_mutex); + oldflags = ubifs2ioctl(ui->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock; + } + } + + ui->flags = ioctl2ubifs(flags); + ubifs_set_inode_flags(inode); + inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = write_inode_now(inode, 1); + return err; + +out_unlock: + ubifs_err("can't modify inode %lu attributes", inode->i_ino); + mutex_unlock(&ui->ui_mutex); + ubifs_release_budget(c, &req); + return err; +} + +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int flags, err; + struct inode *inode = file->f_path.dentry->d_inode; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = ubifs2ioctl(ubifs_inode(inode)->flags); + + dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags); + return put_user(flags, (int __user *) arg); + + case FS_IOC_SETFLAGS: { + if (IS_RDONLY(inode)) + return -EROFS; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~FS_DIRSYNC_FL; + + /* + * Make sure the file-system is read-write and make sure it + * will not become read-only while we are changing the flags. + */ + err = mnt_want_write(file->f_path.mnt); + if (err) + return err; + dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); + err = setflags(inode, flags); + mnt_drop_write(file->f_path.mnt); + return err; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c new file mode 100644 index 00000000..cef0460f --- /dev/null +++ b/fs/ubifs/journal.c @@ -0,0 +1,1466 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS journal. + * + * The journal consists of 2 parts - the log and bud LEBs. The log has fixed + * length and position, while a bud logical eraseblock is any LEB in the main + * area. Buds contain file system data - data nodes, inode nodes, etc. The log + * contains only references to buds and some other stuff like commit + * start node. The idea is that when we commit the journal, we do + * not copy the data, the buds just become indexed. Since after the commit the + * nodes in bud eraseblocks become leaf nodes of the file system index tree, we + * use term "bud". Analogy is obvious, bud eraseblocks contain nodes which will + * become leafs in the future. + * + * The journal is multi-headed because we want to write data to the journal as + * optimally as possible. It is nice to have nodes belonging to the same inode + * in one LEB, so we may write data owned by different inodes to different + * journal heads, although at present only one data head is used. + * + * For recovery reasons, the base head contains all inode nodes, all directory + * entry nodes and all truncate nodes. This means that the other heads contain + * only data nodes. + * + * Bud LEBs may be half-indexed. For example, if the bud was not full at the + * time of commit, the bud is retained to continue to be used in the journal, + * even though the "front" of the LEB is now indexed. In that case, the log + * reference contains the offset where the bud starts for the purposes of the + * journal. + * + * The journal size has to be limited, because the larger is the journal, the + * longer it takes to mount UBIFS (scanning the journal) and the more memory it + * takes (indexing in the TNC). + * + * All the journal write operations like 'ubifs_jnl_update()' here, which write + * multiple UBIFS nodes to the journal at one go, are atomic with respect to + * unclean reboots. Should the unclean reboot happen, the recovery code drops + * all the nodes. + */ + +#include "ubifs.h" + +/** + * zero_ino_node_unused - zero out unused fields of an on-flash inode node. + * @ino: the inode to zero out + */ +static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) +{ + memset(ino->padding1, 0, 4); + memset(ino->padding2, 0, 26); +} + +/** + * zero_dent_node_unused - zero out unused fields of an on-flash directory + * entry node. + * @dent: the directory entry to zero out + */ +static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) +{ + dent->padding1 = 0; + memset(dent->padding2, 0, 4); +} + +/** + * zero_data_node_unused - zero out unused fields of an on-flash data node. + * @data: the data node to zero out + */ +static inline void zero_data_node_unused(struct ubifs_data_node *data) +{ + memset(data->padding, 0, 2); +} + +/** + * zero_trun_node_unused - zero out unused fields of an on-flash truncation + * node. 
+ * @trun: the truncation node to zero out + */ +static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) +{ + memset(trun->padding, 0, 12); +} + +/** + * reserve_space - reserve space in the journal. + * @c: UBIFS file-system description object + * @jhead: journal head number + * @len: node length + * + * This function reserves space in journal head @head. If the reservation + * succeeded, the journal head stays locked and later has to be unlocked using + * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock + * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and + * other negative error codes in case of other failures. + */ +static int reserve_space(struct ubifs_info *c, int jhead, int len) +{ + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + /* + * Typically, the base head has smaller nodes written to it, so it is + * better to try to allocate space at the ends of eraseblocks. This is + * what the squeeze parameter does. + */ + ubifs_assert(!c->ro_media && !c->ro_mount); + squeeze = (jhead == BASEHD); +again: + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_error) { + err = -EROFS; + goto out_unlock; + } + + avail = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum != -1 && avail >= len) + return 0; + + /* + * Write buffer wasn't seek'ed or there is no enough space - look for an + * LEB with some empty space. + */ + lnum = ubifs_find_free_space(c, len, &offs, squeeze); + if (lnum >= 0) + goto out; + + err = lnum; + if (err != -ENOSPC) + goto out_unlock; + + /* + * No free space, we have to run garbage collector to make + * some. But the write-buffer mutex has to be unlocked because + * GC also takes it. + */ + dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead)); + mutex_unlock(&wbuf->io_mutex); + + lnum = ubifs_garbage_collect(c, 0); + if (lnum < 0) { + err = lnum; + if (err != -ENOSPC) + return err; + + /* + * GC could not make a free LEB. But someone else may + * have allocated new bud for this journal head, + * because we dropped @wbuf->io_mutex, so try once + * again. + */ + dbg_jnl("GC couldn't make a free LEB for jhead %s", + dbg_jhead(jhead)); + if (retries++ < 2) { + dbg_jnl("retry (%d)", retries); + goto again; + } + + dbg_jnl("return -ENOSPC"); + return err; + } + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead)); + avail = c->leb_size - wbuf->offs - wbuf->used; + + if (wbuf->lnum != -1 && avail >= len) { + /* + * Someone else has switched the journal head and we have + * enough space now. This happens when more than one process is + * trying to write to the same journal head at the same time. + */ + dbg_jnl("return LEB %d back, already have LEB %d:%d", + lnum, wbuf->lnum, wbuf->offs + wbuf->used); + err = ubifs_return_leb(c, lnum); + if (err) + goto out_unlock; + return 0; + } + + offs = 0; + +out: + /* + * Make sure we synchronize the write-buffer before we add the new bud + * to the log. Otherwise we may have a power cut after the log + * reference node for the last bud (@lnum) is written but before the + * write-buffer data are written to the next-to-last bud + * (@wbuf->lnum). And the effect would be that the recovery would see + * that there is corruption in the next-to-last bud. 
+ */ + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out_return; + err = ubifs_add_bud_to_log(c, jhead, lnum, offs); + if (err) + goto out_return; + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); + if (err) + goto out_unlock; + + return 0; + +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return err; + +out_return: + /* An error occurred and the LEB has to be returned to lprops */ + ubifs_assert(err < 0); + err1 = ubifs_return_leb(c, lnum); + if (err1 && err == -EAGAIN) + /* + * Return original error code only if it is not %-EAGAIN, + * which is not really an error. Otherwise, return the error + * code of 'ubifs_return_leb()'. + */ + err = err1; + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * write_node - write node to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @node: node to write + * @len: node length + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * + * This function writes a node to reserved space of journal head @jhead. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int write_node(struct ubifs_info *c, int jhead, void *node, int len, + int *lnum, int *offs) +{ + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); + ubifs_prepare_node(c, node, len, 0); + + return ubifs_wbuf_write_nolock(wbuf, node, len); +} + +/** + * write_head - write data to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @buf: buffer to write + * @len: length to write + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * @sync: non-zero if the write-buffer has to by synchronized + * + * This function is the same as 'write_node()' but it does not assume the + * buffer it is writing is a node, so it does not prepare it (which means + * initializing common header and calculating CRC). + */ +static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, + int *lnum, int *offs, int sync) +{ + int err; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); + + err = ubifs_wbuf_write_nolock(wbuf, buf, len); + if (err) + return err; + if (sync) + err = ubifs_wbuf_sync_nolock(wbuf); + return err; +} + +/** + * make_reservation - reserve journal space. + * @c: UBIFS file-system description object + * @jhead: journal head + * @len: how many bytes to reserve + * + * This function makes space reservation in journal head @jhead. The function + * takes the commit lock and locks the journal head, and the caller has to + * unlock the head and finish the reservation with 'finish_reservation()'. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * Note, the journal head may be unlocked as soon as the data is written, while + * the commit lock has to be released after the data has been added to the + * TNC. 
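All the ubifs_jnl_*() writers below follow the locking discipline that the comment above describes. A condensed sketch of the pattern (hypothetical node buffer, error handling trimmed, not part of the patch):

/* Sketch of the reservation pattern shared by the ubifs_jnl_*() functions:
 * make_reservation() takes c->commit_sem (read) plus the journal head's
 * io_mutex, release_head() drops the io_mutex right after the write, and
 * finish_reservation() drops commit_sem only after the TNC was updated.
 */
static int jnl_write_pattern(struct ubifs_info *c, void *node, int len)
{
	int err, lnum, offs;

	err = make_reservation(c, BASEHD, len);
	if (err)
		return err;

	err = write_head(c, BASEHD, node, len, &lnum, &offs, 0);
	release_head(c, BASEHD);		/* journal head unlocked early */
	if (err)
		goto out;

	/* ubifs_tnc_add() and friends happen here, still under commit_sem */
out:
	finish_reservation(c);
	return err;
}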
+ */ +static int make_reservation(struct ubifs_info *c, int jhead, int len) +{ + int err, cmt_retries = 0, nospc_retries = 0; + +again: + down_read(&c->commit_sem); + err = reserve_space(c, jhead, len); + if (!err) + return 0; + up_read(&c->commit_sem); + + if (err == -ENOSPC) { + /* + * GC could not make any progress. We should try to commit + * once because it could make some dirty space and GC would + * make progress, so make the error -EAGAIN so that the below + * will commit and re-try. + */ + if (nospc_retries++ < 2) { + dbg_jnl("no space, retry"); + err = -EAGAIN; + } + + /* + * This means that the budgeting is incorrect. We always have + * to be able to write to the media, because all operations are + * budgeted. Deletions are not budgeted, though, but we reserve + * an extra LEB for them. + */ + } + + if (err != -EAGAIN) + goto out; + + /* + * -EAGAIN means that the journal is full or too large, or the above + * code wants to do one commit. Do this and re-try. + */ + if (cmt_retries > 128) { + /* + * This should not happen unless the journal size limitations + * are too tough. + */ + ubifs_err("stuck in space allocation"); + err = -ENOSPC; + goto out; + } else if (cmt_retries > 32) + ubifs_warn("too many space allocation re-tries (%d)", + cmt_retries); + + dbg_jnl("-EAGAIN, commit and retry (retried %d times)", + cmt_retries); + cmt_retries += 1; + + err = ubifs_run_commit(c); + if (err) + return err; + goto again; + +out: + ubifs_err("cannot reserve %d bytes in jhead %d, error %d", + len, jhead, err); + if (err == -ENOSPC) { + /* This are some budgeting problems, print useful information */ + down_write(&c->commit_sem); + dbg_dump_stack(); + dbg_dump_budg(c, &c->bi); + dbg_dump_lprops(c); + cmt_retries = dbg_check_lprops(c); + up_write(&c->commit_sem); + } + return err; +} + +/** + * release_head - release a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * + * This function releases journal head @jhead which was locked by + * the 'make_reservation()' function. It has to be called after each successful + * 'make_reservation()' invocation. + */ +static inline void release_head(struct ubifs_info *c, int jhead) +{ + mutex_unlock(&c->jheads[jhead].wbuf.io_mutex); +} + +/** + * finish_reservation - finish a reservation. + * @c: UBIFS file-system description object + * + * This function finishes journal space reservation. It must be called after + * 'make_reservation()'. + */ +static void finish_reservation(struct ubifs_info *c) +{ + up_read(&c->commit_sem); +} + +/** + * get_dent_type - translate VFS inode mode to UBIFS directory entry type. + * @mode: inode mode + */ +static int get_dent_type(int mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + return UBIFS_ITYPE_REG; + case S_IFDIR: + return UBIFS_ITYPE_DIR; + case S_IFLNK: + return UBIFS_ITYPE_LNK; + case S_IFBLK: + return UBIFS_ITYPE_BLK; + case S_IFCHR: + return UBIFS_ITYPE_CHR; + case S_IFIFO: + return UBIFS_ITYPE_FIFO; + case S_IFSOCK: + return UBIFS_ITYPE_SOCK; + default: + BUG(); + } + return 0; +} + +/** + * pack_inode - pack an inode node. 
+ * @c: UBIFS file-system description object + * @ino: buffer in which to pack inode node + * @inode: inode to pack + * @last: indicates the last node of the group + */ +static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, + const struct inode *inode, int last) +{ + int data_len = 0, last_reference = !inode->i_nlink; + struct ubifs_inode *ui = ubifs_inode(inode); + + ino->ch.node_type = UBIFS_INO_NODE; + ino_key_init_flash(c, &ino->key, inode->i_ino); + ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); + ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); + ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec); + ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); + ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ino->uid = cpu_to_le32(inode->i_uid); + ino->gid = cpu_to_le32(inode->i_gid); + ino->mode = cpu_to_le32(inode->i_mode); + ino->flags = cpu_to_le32(ui->flags); + ino->size = cpu_to_le64(ui->ui_size); + ino->nlink = cpu_to_le32(inode->i_nlink); + ino->compr_type = cpu_to_le16(ui->compr_type); + ino->data_len = cpu_to_le32(ui->data_len); + ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt); + ino->xattr_size = cpu_to_le32(ui->xattr_size); + ino->xattr_names = cpu_to_le32(ui->xattr_names); + zero_ino_node_unused(ino); + + /* + * Drop the attached data if this is a deletion inode, the data is not + * needed anymore. + */ + if (!last_reference) { + memcpy(ino->data, ui->data, ui->data_len); + data_len = ui->data_len; + } + + ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last); +} + +/** + * mark_inode_clean - mark UBIFS inode as clean. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to mark as clean + * + * This helper function marks UBIFS inode @ui as clean by cleaning the + * @ui->dirty flag and releasing its budget. Note, VFS may still treat the + * inode as dirty and try to write it back, but 'ubifs_write_inode()' would + * just do nothing. + */ +static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) +{ + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + ui->dirty = 0; +} + +/** + * ubifs_jnl_update - update inode. + * @c: UBIFS file-system description object + * @dir: parent inode or host inode in case of extended attributes + * @nm: directory entry name + * @inode: inode to update + * @deletion: indicates a directory entry deletion i.e unlink or rmdir + * @xent: non-zero if the directory entry is an extended attribute entry + * + * This function updates an inode by writing a directory entry (or extended + * attribute entry), the inode itself, and the parent directory inode (or the + * host inode) to the journal. + * + * The function writes the host inode @dir last, which is important in case of + * extended attributes. Indeed, then we guarantee that if the host inode gets + * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed, + * the extended attribute inode gets flushed too. And this is exactly what the + * user expects - synchronizing the host inode synchronizes its extended + * attributes. Similarly, this guarantees that if @dir is synchronized, its + * directory entry corresponding to @nm gets synchronized too. + * + * If the inode (@inode) or the parent directory (@dir) are synchronous, this + * function synchronizes the write-buffer. + * + * This function marks the @dir and @inode inodes as clean and returns zero on + * success. 
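ubifs_jnl_update() packs the directory entry, the target inode, and the parent (or host) inode into one grouped write; the TNC offsets used afterwards come straight from 8-byte alignment. A small restatement of that layout (illustrative helper, not in the patch; plen plays the role of UBIFS_INO_NODE_SZ since the parent inode carries no data):

/* Sketch of the buffer layout produced by ubifs_jnl_update():
 *
 *   | dent node |pad| inode node |pad| parent/host inode node |
 *   0           aligned_dlen     aligned_dlen + aligned_ilen
 */
#define ALIGN8(x)	(((x) + 7) & ~7)

static int update_layout(int dlen, int ilen, int plen,
			 int *ino_offs, int *host_offs)
{
	int aligned_dlen = ALIGN8(dlen);
	int aligned_ilen = ALIGN8(ilen);

	*ino_offs = aligned_dlen;			/* child inode node */
	*host_offs = aligned_dlen + aligned_ilen;	/* parent/host inode node */
	return aligned_dlen + aligned_ilen + plen;	/* total reservation length */
}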
In case of failure, a negative error code is returned. + */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent) +{ + int err, dlen, ilen, len, lnum, ino_offs, dent_offs; + int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); + int last_reference = !!(deletion && inode->i_nlink == 0); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_dent_node *dent; + struct ubifs_ino_node *ino; + union ubifs_key dent_key, ino_key; + + dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", + inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); + ubifs_assert(dir_ui->data_len == 0); + ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex)); + + dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + ilen = UBIFS_INO_NODE_SZ; + + /* + * If the last reference to the inode is being deleted, then there is + * no need to attach and write inode data, it is being deleted anyway. + * And if the inode is being deleted, no need to synchronize + * write-buffer even if the inode is synchronous. + */ + if (!last_reference) { + ilen += ui->data_len; + sync |= IS_SYNC(inode); + } + + aligned_dlen = ALIGN(dlen, 8); + aligned_ilen = ALIGN(ilen, 8); + len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + if (!xent) { + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init(c, &dent_key, dir->i_ino, nm); + } else { + dent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &dent_key, dir->i_ino, nm); + } + + key_write(c, &dent_key, dent->key); + dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino); + dent->type = get_dent_type(inode->i_mode); + dent->nlen = cpu_to_le16(nm->len); + memcpy(dent->name, nm->name, nm->len); + dent->name[nm->len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen, 0); + + ino = (void *)dent + aligned_dlen; + pack_inode(c, ino, inode, 0); + ino = (void *)ino + aligned_ilen; + pack_inode(c, ino, dir, 1); + + if (last_reference) { + err = ubifs_add_orphan(c, inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + ui->del_cmtno = c->cmt_no; + } + + err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino); + } + release_head(c, BASEHD); + kfree(dent); + + if (deletion) { + err = ubifs_tnc_remove_nm(c, &dent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, dlen); + } else + err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm); + if (err) + goto out_ro; + + /* + * Note, we do not remove the inode from TNC even if the last reference + * to it has just been deleted, because the inode may still be opened. + * Instead, the inode has been added to orphan lists and the orphan + * subsystem will take further care about it. 
+ */ + ino_key_init(c, &ino_key, inode->i_ino); + ino_offs = dent_offs + aligned_dlen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen); + if (err) + goto out_ro; + + ino_key_init(c, &ino_key, dir->i_ino); + ino_offs += aligned_ilen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + mark_inode_clean(c, dir_ui); + return 0; + +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; + +out_release: + release_head(c, BASEHD); + kfree(dent); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, inode->i_ino); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_write_data - write a data node to the journal. + * @c: UBIFS file-system description object + * @inode: inode the data node belongs to + * @key: node key + * @buf: buffer to write + * @len: data length (must not exceed %UBIFS_BLOCK_SIZE) + * + * This function writes a data node to the journal. Returns %0 if the data node + * was successfully written, and a negative error code in case of failure. + */ +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len) +{ + struct ubifs_data_node *data; + int err, lnum, offs, compr_type, out_len; + int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_jnl("ino %lu, blk %u, len %d, key %s", + (unsigned long)key_inum(c, key), key_block(c, key), len, + DBGKEY(key)); + ubifs_assert(len <= UBIFS_BLOCK_SIZE); + + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); + if (!data) { + /* + * Fall-back to the write reserve buffer. Note, we might be + * currently on the memory reclaim path, when the kernel is + * trying to free some memory by writing out dirty pages. The + * write reserve buffer helps us to guarantee that we are + * always able to write the data. + */ + allocated = 0; + mutex_lock(&c->write_reserve_mutex); + data = c->write_reserve_buf; + } + + data->ch.node_type = UBIFS_DATA_NODE; + key_write(c, key, &data->key); + data->size = cpu_to_le32(len); + zero_data_node_unused(data); + + if (!(ui->flags & UBIFS_COMPR_FL)) + /* Compression is disabled for this inode */ + compr_type = UBIFS_COMPR_NONE; + else + compr_type = ui->compr_type; + + out_len = dlen - UBIFS_DATA_NODE_SZ; + ubifs_compress(buf, len, &data->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + + dlen = UBIFS_DATA_NODE_SZ + out_len; + data->compr_type = cpu_to_le16(compr_type); + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, DATAHD, dlen); + if (err) + goto out_free; + + err = write_node(c, DATAHD, data, dlen, &lnum, &offs); + if (err) + goto out_release; + ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key)); + release_head(c, DATAHD); + + err = ubifs_tnc_add(c, key, lnum, offs, dlen); + if (err) + goto out_ro; + + finish_reservation(c); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); + return 0; + +out_release: + release_head(c, DATAHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); + return err; +} + +/** + * ubifs_jnl_write_inode - flush inode to the journal. 
+ * @c: UBIFS file-system description object + * @inode: inode to flush + * + * This function writes inode @inode to the journal. If the inode is + * synchronous, it also synchronizes the write-buffer. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err, lnum, offs; + struct ubifs_ino_node *ino; + struct ubifs_inode *ui = ubifs_inode(inode); + int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; + + dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); + + /* + * If the inode is being deleted, do not write the attached data. No + * need to synchronize the write-buffer either. + */ + if (!last_reference) { + len += ui->data_len; + sync = IS_SYNC(inode); + } + ino = kmalloc(len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 1); + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + inode->i_ino); + release_head(c, BASEHD); + + if (last_reference) { + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + goto out_ro; + ubifs_delete_orphan(c, inode->i_ino); + err = ubifs_add_dirt(c, lnum, len); + } else { + union ubifs_key key; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len); + } + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +/** + * ubifs_jnl_delete_inode - delete an inode. + * @c: UBIFS file-system description object + * @inode: inode to delete + * + * This function deletes inode @inode which includes removing it from orphans, + * deleting it from TNC and, in some cases, writing a deletion inode to the + * journal. + * + * When regular file inodes are unlinked or a directory inode is removed, the + * 'ubifs_jnl_update()' function writes a corresponding deletion inode and + * direntry to the media, and adds the inode to orphans. After this, when the + * last reference to this inode has been dropped, this function is called. In + * general, it has to write one more deletion inode to the media, because if + * a commit happened between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal + * anymore, and in fact it might not be on the flash anymore, because it might + * have been garbage-collected already. And for optimization reasons UBIFS does + * not read the orphan area if it has been unmounted cleanly, so it would have + * no indication in the journal that there is a deleted inode which has to be + * removed from TNC. + * + * However, if there was no commit between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion + * inode to the media for the second time. And this is quite a typical case. + * + * This function returns zero in case of success and a negative error code in + * case of failure. 
+ */ +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(inode->i_nlink == 0); + + if (ui->del_cmtno != c->cmt_no) + /* A commit happened for sure */ + return ubifs_jnl_write_inode(c, inode); + + down_read(&c->commit_sem); + /* + * Check commit number again, because the first test has been done + * without @c->commit_sem, so a commit might have happened. + */ + if (ui->del_cmtno != c->cmt_no) { + up_read(&c->commit_sem); + return ubifs_jnl_write_inode(c, inode); + } + + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + ubifs_ro_mode(c, err); + else + ubifs_delete_orphan(c, inode->i_ino); + up_read(&c->commit_sem); + return err; +} + +/** + * ubifs_jnl_rename - rename a directory entry. + * @c: UBIFS file-system description object + * @old_dir: parent inode of directory entry to rename + * @old_dentry: directory entry to rename + * @new_dir: parent inode of directory entry to rename + * @new_dentry: new directory entry (or directory entry to replace) + * @sync: non-zero if the write-buffer has to be synchronized + * + * This function implements the re-name operation which may involve writing up + * to 3 inodes and 2 directory entries. It marks the written inodes as clean + * and returns zero on success. In case of failure, a negative error code is + * returned. + */ +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync) +{ + void *p; + union ubifs_key key; + struct ubifs_dent_node *dent, *dent2; + int err, dlen1, dlen2, ilen, lnum, offs, len; + const struct inode *old_inode = old_dentry->d_inode; + const struct inode *new_inode = new_dentry->d_inode; + int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; + int last_reference = !!(new_inode && new_inode->i_nlink == 0); + int move = (old_dir != new_dir); + struct ubifs_inode *uninitialized_var(new_ui); + + dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu", + old_dentry->d_name.len, old_dentry->d_name.name, + old_dir->i_ino, new_dentry->d_name.len, + new_dentry->d_name.name, new_dir->i_ino); + ubifs_assert(ubifs_inode(old_dir)->data_len == 0); + ubifs_assert(ubifs_inode(new_dir)->data_len == 0); + ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex)); + ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex)); + + dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1; + dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1; + if (new_inode) { + new_ui = ubifs_inode(new_inode); + ubifs_assert(mutex_is_locked(&new_ui->ui_mutex)); + ilen = UBIFS_INO_NODE_SZ; + if (!last_reference) + ilen += new_ui->data_len; + } else + ilen = 0; + + aligned_dlen1 = ALIGN(dlen1, 8); + aligned_dlen2 = ALIGN(dlen2, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + if (old_dir != new_dir) + len += plen; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + /* Make new dent */ + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name); + dent->inum = cpu_to_le64(old_inode->i_ino); + dent->type = get_dent_type(old_inode->i_mode); + dent->nlen = cpu_to_le16(new_dentry->d_name.len); + memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len); + 
dent->name[new_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen1, 0); + + /* Make deletion dent */ + dent2 = (void *)dent + aligned_dlen1; + dent2->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent2->key, old_dir->i_ino, + &old_dentry->d_name); + dent2->inum = 0; + dent2->type = DT_UNKNOWN; + dent2->nlen = cpu_to_le16(old_dentry->d_name.len); + memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len); + dent2->name[old_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent2); + ubifs_prep_grp_node(c, dent2, dlen2, 0); + + p = (void *)dent2 + aligned_dlen2; + if (new_inode) { + pack_inode(c, p, new_inode, 0); + p += ALIGN(ilen, 8); + } + + if (!move) + pack_inode(c, p, old_dir, 1); + else { + pack_inode(c, p, old_dir, 0); + p += ALIGN(plen, 8); + pack_inode(c, p, new_dir, 1); + } + + if (last_reference) { + err = ubifs_add_orphan(c, new_inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + new_ui->del_cmtno = c->cmt_no; + } + + err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino); + if (new_inode) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + new_inode->i_ino); + } + release_head(c, BASEHD); + + dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, dlen2); + if (err) + goto out_ro; + + dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); + err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); + if (err) + goto out_ro; + + offs += aligned_dlen1 + aligned_dlen2; + if (new_inode) { + ino_key_init(c, &key, new_inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, ilen); + if (err) + goto out_ro; + offs += ALIGN(ilen, 8); + } + + ino_key_init(c, &key, old_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + + if (old_dir != new_dir) { + offs += ALIGN(plen, 8); + ino_key_init(c, &key, new_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + } + + finish_reservation(c); + if (new_inode) { + mark_inode_clean(c, new_ui); + spin_lock(&new_ui->ui_lock); + new_ui->synced_i_size = new_ui->ui_size; + spin_unlock(&new_ui->ui_lock); + } + mark_inode_clean(c, ubifs_inode(old_dir)); + if (move) + mark_inode_clean(c, ubifs_inode(new_dir)); + kfree(dent); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, new_inode->i_ino); +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; +} + +/** + * recomp_data_node - re-compress a truncated data node. + * @dn: data node to re-compress + * @new_len: new length + * + * This function is used when an inode is truncated and the last data node of + * the inode has to be re-compressed and re-written. 
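Truncation rewrites the last data block only when the new size is not block-aligned. The block/offset arithmetic used by ubifs_jnl_truncate() can be followed with concrete numbers in this user-space sketch (values are hypothetical; UBIFS_BLOCK_SIZE is 4096):

/* Sketch of the last-block math in ubifs_jnl_truncate(). */
#include <stdio.h>

#define UBIFS_BLOCK_SIZE	4096
#define UBIFS_BLOCK_SHIFT	12

int main(void)
{
	long long new_size = 10000;			  /* example new file size */
	int dlen = new_size & (UBIFS_BLOCK_SIZE - 1);	  /* 1808 bytes stay in the last block */
	unsigned int blk = new_size >> UBIFS_BLOCK_SHIFT; /* block 2 is the one rewritten */
	unsigned int first_removed = blk + (dlen ? 1 : 0); /* TNC range removal starts at block 3 */

	printf("dlen=%d blk=%u first_removed=%u\n", dlen, blk, first_removed);
	return 0;
}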
+ */ +static int recomp_data_node(struct ubifs_data_node *dn, int *new_len) +{ + void *buf; + int err, len, compr_type, out_len; + + out_len = le32_to_cpu(dn->size); + buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); + if (!buf) + return -ENOMEM; + + len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + compr_type = le16_to_cpu(dn->compr_type); + err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type); + if (err) + goto out; + + ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + dn->compr_type = cpu_to_le16(compr_type); + dn->size = cpu_to_le32(*new_len); + *new_len = UBIFS_DATA_NODE_SZ + out_len; +out: + kfree(buf); + return err; +} + +/** + * ubifs_jnl_truncate - update the journal for a truncation. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @old_size: old size + * @new_size: new size + * + * When the size of a file decreases due to truncation, a truncation node is + * written, the journal tree is updated, and the last data block is re-written + * if it has been affected. The inode is also updated in order to synchronize + * the new inode size. + * + * This function marks the inode as clean and returns zero on success. In case + * of failure, a negative error code is returned. + */ +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size) +{ + union ubifs_key key, to_key; + struct ubifs_ino_node *ino; + struct ubifs_trun_node *trun; + struct ubifs_data_node *uninitialized_var(dn); + int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + ino_t inum = inode->i_ino; + unsigned int blk; + + dbg_jnl("ino %lu, size %lld -> %lld", + (unsigned long)inum, old_size, new_size); + ubifs_assert(!ui->data_len); + ubifs_assert(S_ISREG(inode->i_mode)); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + ino = kmalloc(sz, GFP_NOFS); + if (!ino) + return -ENOMEM; + + trun = (void *)ino + UBIFS_INO_NODE_SZ; + trun->ch.node_type = UBIFS_TRUN_NODE; + trun->inum = cpu_to_le32(inum); + trun->old_size = cpu_to_le64(old_size); + trun->new_size = cpu_to_le64(new_size); + zero_trun_node_unused(trun); + + dlen = new_size & (UBIFS_BLOCK_SIZE - 1); + if (dlen) { + /* Get last data block so it can be truncated */ + dn = (void *)trun + UBIFS_TRUN_NODE_SZ; + blk = new_size >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &key, inum, blk); + dbg_jnl("last block key %s", DBGKEY(&key)); + err = ubifs_tnc_lookup(c, &key, dn); + if (err == -ENOENT) + dlen = 0; /* Not found (so it is a hole) */ + else if (err) + goto out_free; + else { + if (le32_to_cpu(dn->size) <= dlen) + dlen = 0; /* Nothing to do */ + else { + int compr_type = le16_to_cpu(dn->compr_type); + + if (compr_type != UBIFS_COMPR_NONE) { + err = recomp_data_node(dn, &dlen); + if (err) + goto out_free; + } else { + dn->size = cpu_to_le32(dlen); + dlen += UBIFS_DATA_NODE_SZ; + } + zero_data_node_unused(dn); + } + } + } + + /* Must make reservation before allocating sequence numbers */ + len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ; + if (dlen) + len += dlen; + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 0); + ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 
0 : 1); + if (dlen) + ubifs_prep_grp_node(c, dn, dlen, 1); + + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum); + release_head(c, BASEHD); + + if (dlen) { + sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ; + err = ubifs_tnc_add(c, &key, lnum, sz, dlen); + if (err) + goto out_ro; + } + + ino_key_init(c, &key, inum); + err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ); + if (err) + goto out_ro; + + bit = new_size & (UBIFS_BLOCK_SIZE - 1); + blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0); + data_key_init(c, &key, inum, blk); + + bit = old_size & (UBIFS_BLOCK_SIZE - 1); + blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1); + data_key_init(c, &to_key, inum, blk); + + err = ubifs_tnc_remove_range(c, &key, &to_key); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +#ifdef CONFIG_UBIFS_FS_XATTR + +/** + * ubifs_jnl_delete_xattr - delete an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @nm: extended attribute entry name + * + * This function delete an extended attribute which is very similar to + * un-linking regular files - it writes a deletion xentry, a deletion inode and + * updates the target inode. Returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm) +{ + int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen; + struct ubifs_dent_node *xent; + struct ubifs_ino_node *ino; + union ubifs_key xent_key, key1, key2; + int sync = IS_DIRSYNC(host); + struct ubifs_inode *host_ui = ubifs_inode(host); + + dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", + host->i_ino, inode->i_ino, nm->name, + ubifs_inode(inode)->data_len); + ubifs_assert(inode->i_nlink == 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + /* + * Since we are deleting the inode, we do not bother to attach any data + * to it and assume its length is %UBIFS_INO_NODE_SZ. 
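For orientation, the reservation length computed just below is plain 8-byte-alignment arithmetic over three nodes packed into one journal group: the deletion xentry, the bare deletion inode, and the updated host inode. A minimal user-space sketch; the node sizes here are placeholders for illustration, not taken from ubifs-media.h:

#include <stdio.h>

/* 8-byte alignment, the same formula the kernel's ALIGN(x, 8) expands to. */
#define ALIGN8(x) (((x) + 7) & ~7)

int main(void)
{
	/* Placeholder node sizes, for illustration only. */
	int dent_sz = 56, ino_sz = 160;

	int name_len = 8;                       /* e.g. the xattr name "user.foo"       */
	int host_data_len = 32;                 /* xattr names cached in the host inode */

	int xlen = dent_sz + name_len + 1;      /* deletion xentry incl. trailing '\0'  */
	int hlen = host_data_len + ino_sz;      /* updated host inode                   */
	int len = ALIGN8(xlen) + ino_sz + ALIGN8(hlen);    /* whole journal group       */

	printf("xlen %d (aligned %d), hlen %d, group length %d\n",
	       xlen, ALIGN8(xlen), hlen, len);
	return 0;
}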
+ */ + xlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + aligned_xlen = ALIGN(xlen, 8); + hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; + len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); + + xent = kmalloc(len, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) { + kfree(xent); + return err; + } + + xent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &xent_key, host->i_ino, nm); + key_write(c, &xent_key, xent->key); + xent->inum = 0; + xent->type = get_dent_type(inode->i_mode); + xent->nlen = cpu_to_le16(nm->len); + memcpy(xent->name, nm->name, nm->len); + xent->name[nm->len] = '\0'; + zero_dent_node_unused(xent); + ubifs_prep_grp_node(c, xent, xlen, 0); + + ino = (void *)xent + aligned_xlen; + pack_inode(c, ino, inode, 0); + ino = (void *)ino + UBIFS_INO_NODE_SZ; + pack_inode(c, ino, host, 1); + + err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); + if (!sync && !err) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino); + release_head(c, BASEHD); + kfree(xent); + if (err) + goto out_ro; + + /* Remove the extended attribute entry from TNC */ + err = ubifs_tnc_remove_nm(c, &xent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, xlen); + if (err) + goto out_ro; + + /* + * Remove all nodes belonging to the extended attribute inode from TNC. + * Well, there actually must be only one node - the inode itself. + */ + lowest_ino_key(c, &key1, inode->i_ino); + highest_ino_key(c, &key2, inode->i_ino); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + /* And update TNC with the new host inode position */ + ino_key_init(c, &key1, host->i_ino); + err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_change_xattr - change an extended attribute. + * @c: UBIFS file-system description object + * @inode: extended attribute inode + * @host: host inode + * + * This function writes the updated version of an extended attribute inode and + * the host inode to the journal (to the base head). The host inode is written + * after the extended attribute inode in order to guarantee that the extended + * attribute will be flushed when the inode is synchronized by 'fsync()' and + * consequently, the write-buffer is synchronized. This function returns zero + * in case of success and a negative error code in case of failure. 
+ */ +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, + const struct inode *host) +{ + int err, len1, len2, aligned_len, aligned_len1, lnum, offs; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_ino_node *ino; + union ubifs_key key; + int sync = IS_DIRSYNC(host); + + dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); + ubifs_assert(host->i_nlink > 0); + ubifs_assert(inode->i_nlink > 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + len1 = UBIFS_INO_NODE_SZ + host_ui->data_len; + len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len; + aligned_len1 = ALIGN(len1, 8); + aligned_len = aligned_len1 + ALIGN(len2, 8); + + ino = kmalloc(aligned_len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, aligned_len); + if (err) + goto out_free; + + pack_inode(c, ino, host, 0); + pack_inode(c, (void *)ino + aligned_len1, inode, 1); + + err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); + if (!sync && !err) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + } + release_head(c, BASEHD); + if (err) + goto out_ro; + + ino_key_init(c, &key, host->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len1); + if (err) + goto out_ro; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + kfree(ino); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +#endif /* CONFIG_UBIFS_FS_XATTR */ diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h new file mode 100644 index 00000000..92a8491a --- /dev/null +++ b/fs/ubifs/key.h @@ -0,0 +1,548 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This header contains various key-related definitions and helper function. + * UBIFS allows several key schemes, so we access key fields only via these + * helpers. At the moment only one key scheme is supported. + * + * Simple key scheme + * ~~~~~~~~~~~~~~~~~ + * + * Keys are 64-bits long. First 32-bits are inode number (parent inode number + * in case of direntry key). Next 3 bits are node type. The last 29 bits are + * 4KiB offset in case of inode node, and direntry hash in case of a direntry + * node. We use "r5" hash borrowed from reiserfs. + */ + +#ifndef __UBIFS_KEY_H__ +#define __UBIFS_KEY_H__ + +/** + * key_mask_hash - mask a valid hash value. 
+ * @val: value to be masked + * + * We use hash values as offset in directories, so values %0 and %1 are + * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This + * function makes sure the reserved values are not used. + */ +static inline uint32_t key_mask_hash(uint32_t hash) +{ + hash &= UBIFS_S_KEY_HASH_MASK; + if (unlikely(hash <= 2)) + hash += 3; + return hash; +} + +/** + * key_r5_hash - R5 hash function (borrowed from reiserfs). + * @s: direntry name + * @len: name length + */ +static inline uint32_t key_r5_hash(const char *s, int len) +{ + uint32_t a = 0; + const signed char *str = (const signed char *)s; + + while (*str) { + a += *str << 4; + a += *str >> 4; + a *= 11; + str++; + } + + return key_mask_hash(a); +} + +/** + * key_test_hash - testing hash function. + * @str: direntry name + * @len: name length + */ +static inline uint32_t key_test_hash(const char *str, int len) +{ + uint32_t a = 0; + + len = min_t(uint32_t, len, 4); + memcpy(&a, str, len); + return key_mask_hash(a); +} + +/** + * ino_key_init - initialize inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void ino_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * ino_key_init_flash - initialize on-flash inode key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: inode number + */ +static inline void ino_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum) +{ + union ubifs_key *key = k; + + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_ino_key - get the lowest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void lowest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0; +} + +/** + * highest_ino_key - get the highest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void highest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0xffffffff; +} + +/** + * dent_key_init - initialize directory entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_hash - initialize directory entry key without re-calculating + * hash function. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @hash: direntry name hash + */ +static inline void dent_key_init_hash(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + uint32_t hash) +{ + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_flash - initialize on-flash directory entry key. 
+ * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_dent_key - get the lowest possible directory entry key. + * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: parent inode number + */ +static inline void lowest_dent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * xent_key_init - initialize extended attribute entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * xent_key_init_flash - initialize on-flash extended attribute entry key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_xent_key - get the lowest possible extended attribute entry key. + * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: host inode number + */ +static inline void lowest_xent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * data_key_init - initialize data key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * @block: block number + */ +static inline void data_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + unsigned int block) +{ + ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); + key->u32[0] = inum; + key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS); +} + +/** + * highest_data_key - get the highest possible data key for an inode. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void highest_data_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK); +} + +/** + * trun_key_init - initialize truncation node key. 
+ * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * + * Note, UBIFS does not have truncation keys on the media and this function is + * only used for purposes of replay. + */ +static inline void trun_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * invalid_key_init - initialize invalid node key. + * @c: UBIFS file-system description object + * @key: key to initialize + * + * This is a helper function which marks a @key object as invalid. + */ +static inline void invalid_key_init(const struct ubifs_info *c, + union ubifs_key *key) +{ + key->u32[0] = 0xDEADBEAF; + key->u32[1] = UBIFS_INVALID_KEY; +} + +/** + * key_type - get key type. + * @c: UBIFS file-system description object + * @key: key to get type of + */ +static inline int key_type(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_type_flash - get type of a on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to get type of + */ +static inline int key_type_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_inum - fetch inode number from key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return key->u32[0]; +} + +/** + * key_inum_flash - fetch inode number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[0]); +} + +/** + * key_hash - get directory entry hash. + * @c: UBIFS file-system description object + * @key: the key to get hash from + */ +static inline uint32_t key_hash(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_hash_flash - get directory entry hash from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: the key to get hash from + */ +static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_block - get data block number. + * @c: UBIFS file-system description object + * @key: the key to get the block number from + */ +static inline unsigned int key_block(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_block_flash - get data block number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: the key to get the block number from + */ +static inline unsigned int key_block_flash(const struct ubifs_info *c, + const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_read - transform a key to in-memory format. 
+ * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_read(const struct ubifs_info *c, const void *from, + union ubifs_key *to) +{ + const union ubifs_key *f = from; + + to->u32[0] = le32_to_cpu(f->j32[0]); + to->u32[1] = le32_to_cpu(f->j32[1]); +} + +/** + * key_write - transform a key from in-memory format. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); + memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * key_write_idx - transform a key from in-memory format for the index. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write_idx(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); +} + +/** + * key_copy - copy a key. + * @c: UBIFS file-system description object + * @from: the key to copy from + * @to: the key to copy to + */ +static inline void key_copy(const struct ubifs_info *c, + const union ubifs_key *from, union ubifs_key *to) +{ + to->u64[0] = from->u64[0]; +} + +/** + * keys_cmp - compare keys. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %-1 if @key1 is less than + * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2. + */ +static inline int keys_cmp(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] < key2->u32[0]) + return -1; + if (key1->u32[0] > key2->u32[0]) + return 1; + if (key1->u32[1] < key2->u32[1]) + return -1; + if (key1->u32[1] > key2->u32[1]) + return 1; + + return 0; +} + +/** + * keys_eq - determine if keys are equivalent. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and + * %0 if not. + */ +static inline int keys_eq(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] != key2->u32[0]) + return 0; + if (key1->u32[1] != key2->u32[1]) + return 0; + return 1; +} + +/** + * is_hash_key - is a key vulnerable to hash collisions. + * @c: UBIFS file-system description object + * @key: key + * + * This function returns %1 if @key is a hashed key or %0 otherwise. + */ +static inline int is_hash_key(const struct ubifs_info *c, + const union ubifs_key *key) +{ + int type = key_type(c, key); + + return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY; +} + +/** + * key_max_inode_size - get maximum file size allowed by current key format. 
+ * @c: UBIFS file-system description object + */ +static inline unsigned long long key_max_inode_size(const struct ubifs_info *c) +{ + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE; + default: + return 0; + } +} + +#endif /* !__UBIFS_KEY_H__ */ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c new file mode 100644 index 00000000..affea949 --- /dev/null +++ b/fs/ubifs/log.c @@ -0,0 +1,773 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file is a part of UBIFS journal implementation and contains various + * functions which manipulate the log. The log is a fixed area on the flash + * which does not contain any data but refers to buds. The log is a part of the + * journal. + */ + +#include "ubifs.h" + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_check_bud_bytes(struct ubifs_info *c); +#else +#define dbg_check_bud_bytes(c) 0 +#endif + +/** + * ubifs_search_bud - search bud LEB. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This function searches bud LEB @lnum. Returns bud description object in case + * of success and %NULL if there is no bud with this LEB number. + */ +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + spin_unlock(&c->buds_lock); + return bud; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This functions returns the wbuf for @lnum or %NULL if there is not one. + */ +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + int jhead; + + if (!c->jheads) + return NULL; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + jhead = bud->jhead; + spin_unlock(&c->buds_lock); + return &c->jheads[jhead].wbuf; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * empty_log_bytes - calculate amount of empty space in the log. 
+ * @c: UBIFS file-system description object + */ +static inline long long empty_log_bytes(const struct ubifs_info *c) +{ + long long h, t; + + h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs; + t = (long long)c->ltail_lnum * c->leb_size; + + if (h >= t) + return c->log_bytes - h + t; + else + return t - h; +} + +/** + * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list. + * @c: UBIFS file-system description object + * @bud: the bud to add + */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct rb_node **p, *parent = NULL; + struct ubifs_bud *b; + struct ubifs_jhead *jhead; + + spin_lock(&c->buds_lock); + p = &c->buds.rb_node; + while (*p) { + parent = *p; + b = rb_entry(parent, struct ubifs_bud, rb); + ubifs_assert(bud->lnum != b->lnum); + if (bud->lnum < b->lnum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&bud->rb, parent, p); + rb_insert_color(&bud->rb, &c->buds); + if (c->jheads) { + jhead = &c->jheads[bud->jhead]; + list_add_tail(&bud->list, &jhead->buds_list); + } else + ubifs_assert(c->replaying && c->ro_mount); + + /* + * Note, although this is a new bud, we anyway account this space now, + * before any data has been written to it, because this is about to + * guarantee fixed mount time, and this bud will anyway be read and + * scanned. + */ + c->bud_bytes += c->leb_size - bud->start; + + dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum, + bud->start, dbg_jhead(bud->jhead), c->bud_bytes); + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_add_bud_to_log - add a new bud to the log. + * @c: UBIFS file-system description object + * @jhead: journal head the bud belongs to + * @lnum: LEB number of the bud + * @offs: starting offset of the bud + * + * This function writes reference node for the new bud LEB @lnum it to the log, + * and adds it to the buds tress. It also makes sure that log size does not + * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success, + * %-EAGAIN if commit is required, and a negative error codes in case of + * failure. + */ +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) +{ + int err; + struct ubifs_bud *bud; + struct ubifs_ref_node *ref; + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS); + if (!bud) + return -ENOMEM; + ref = kzalloc(c->ref_node_alsz, GFP_NOFS); + if (!ref) { + kfree(bud); + return -ENOMEM; + } + + mutex_lock(&c->log_mutex); + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) { + err = -EROFS; + goto out_unlock; + } + + /* Make sure we have enough space in the log */ + if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) { + dbg_log("not enough log space - %lld, required %d", + empty_log_bytes(c), c->min_log_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * Make sure the amount of space in buds will not exceed the + * 'c->max_bud_bytes' limit, because we want to guarantee mount time + * limits. + * + * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes + * because we are holding @c->log_mutex. All @c->bud_bytes take place + * when both @c->log_mutex and @c->bud_bytes are locked. + */ + if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) { + dbg_log("bud bytes %lld (%lld max), require commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * If the journal is full enough - start background commit. 
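For orientation, the space check above leans on empty_log_bytes(), which treats the log as a circular buffer of LEBs between the log tail and the log head. A minimal user-space sketch of that computation, with an illustrative log geometry (the real geometry comes from the superblock) and a stand-in threshold for the c->min_log_bytes check:

#include <stdio.h>

/* Illustrative log geometry: six 128KiB LEBs, numbered 0..5 for simplicity. */
#define LEB_SIZE  (128 * 1024)
#define LOG_LEBS  6
#define LOG_BYTES ((long long)LOG_LEBS * LEB_SIZE)

/* Same idea as empty_log_bytes(): head and tail positions on a circular log. */
static long long log_space_left(int lhead_lnum, int lhead_offs, int ltail_lnum)
{
	long long h = (long long)lhead_lnum * LEB_SIZE + lhead_offs;
	long long t = (long long)ltail_lnum * LEB_SIZE;

	return h >= t ? LOG_BYTES - h + t : t - h;
}

int main(void)
{
	/* Head in LEB 4 at offset 512, tail at LEB 1: free space wraps over LEBs 5 and 0. */
	long long left = log_space_left(4, 512, 1);

	printf("%lld bytes of log left\n", left);
	if (left < 1024)                 /* stand-in for the c->min_log_bytes check */
		printf("commit required before a new bud can be referenced\n");
	return 0;
}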
Note, it is + * OK to read 'c->cmt_state' without spinlock because integer reads + * are atomic in the kernel. + */ + if (c->bud_bytes >= c->bg_bud_bytes && + c->cmt_state == COMMIT_RESTING) { + dbg_log("bud bytes %lld (%lld max), initiate BG commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_request_bg_commit(c); + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(bud->lnum); + ref->offs = cpu_to_le32(bud->start); + ref->jhead = cpu_to_le32(jhead); + + if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + if (c->lhead_offs == 0) { + /* Must ensure next log LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out_unlock; + } + + if (bud->start == 0) { + /* + * Before writing the LEB reference which refers an empty LEB + * to the log, we have to make sure it is mapped, because + * otherwise we'd risk to refer an LEB with garbage in case of + * an unclean reboot, because the target LEB might have been + * unmapped, but not yet physically erased. + */ + err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM); + if (err) + goto out_unlock; + } + + dbg_log("write ref LEB %d:%d", + c->lhead_lnum, c->lhead_offs); + err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum, + c->lhead_offs, UBI_SHORTTERM); + if (err) + goto out_unlock; + + c->lhead_offs += c->ref_node_alsz; + + ubifs_add_bud(c, bud); + + mutex_unlock(&c->log_mutex); + kfree(ref); + return 0; + +out_unlock: + if (err != -EAGAIN) + ubifs_ro_mode(c, err); + mutex_unlock(&c->log_mutex); + kfree(ref); + kfree(bud); + return err; +} + +/** + * remove_buds - remove used buds. + * @c: UBIFS file-system description object + * + * This function removes use buds from the buds tree. It does not remove the + * buds which are pointed to by journal heads. + */ +static void remove_buds(struct ubifs_info *c) +{ + struct rb_node *p; + + ubifs_assert(list_empty(&c->old_buds)); + c->cmt_bud_bytes = 0; + spin_lock(&c->buds_lock); + p = rb_first(&c->buds); + while (p) { + struct rb_node *p1 = p; + struct ubifs_bud *bud; + struct ubifs_wbuf *wbuf; + + p = rb_next(p); + bud = rb_entry(p1, struct ubifs_bud, rb); + wbuf = &c->jheads[bud->jhead].wbuf; + + if (wbuf->lnum == bud->lnum) { + /* + * Do not remove buds which are pointed to by journal + * heads (non-closed buds). + */ + c->cmt_bud_bytes += wbuf->offs - bud->start; + dbg_log("preserve %d:%d, jhead %s, bud bytes %d, " + "cmt_bud_bytes %lld", bud->lnum, bud->start, + dbg_jhead(bud->jhead), wbuf->offs - bud->start, + c->cmt_bud_bytes); + bud->start = wbuf->offs; + } else { + c->cmt_bud_bytes += c->leb_size - bud->start; + dbg_log("remove %d:%d, jhead %s, bud bytes %d, " + "cmt_bud_bytes %lld", bud->lnum, bud->start, + dbg_jhead(bud->jhead), c->leb_size - bud->start, + c->cmt_bud_bytes); + rb_erase(p1, &c->buds); + /* + * If the commit does not finish, the recovery will need + * to replay the journal, in which case the old buds + * must be unchanged. Do not release them until post + * commit i.e. do not allow them to be garbage + * collected. + */ + list_move(&bud->list, &c->old_buds); + } + } + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_log_start_commit - start commit. 
+ * @c: UBIFS file-system description object + * @ltail_lnum: return new log tail LEB number + * + * The commit operation starts with writing "commit start" node to the log and + * reference nodes for all journal heads which will define new journal after + * the commit has been finished. The commit start and reference nodes are + * written in one go to the nearest empty log LEB (hence, when commit is + * finished UBIFS may safely unmap all the previous log LEBs). This function + * returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) +{ + void *buf; + struct ubifs_cs_node *cs; + struct ubifs_ref_node *ref; + int err, i, max_len, len; + + err = dbg_check_bud_bytes(c); + if (err) + return err; + + max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ; + max_len = ALIGN(max_len, c->min_io_size); + buf = cs = kmalloc(max_len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + cs->cmt_no = cpu_to_le64(c->cmt_no); + ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); + + /* + * Note, we do not lock 'c->log_mutex' because this is the commit start + * phase and we are exclusively using the log. And we do not lock + * write-buffer because nobody can write to the file-system at this + * phase. + */ + + len = UBIFS_CS_NODE_SZ; + for (i = 0; i < c->jhead_cnt; i++) { + int lnum = c->jheads[i].wbuf.lnum; + int offs = c->jheads[i].wbuf.offs; + + if (lnum == -1 || offs == c->leb_size) + continue; + + dbg_log("add ref to LEB %d:%d for jhead %s", + lnum, offs, dbg_jhead(i)); + ref = buf + len; + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(lnum); + ref->offs = cpu_to_le32(offs); + ref->jhead = cpu_to_le32(i); + + ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0); + len += UBIFS_REF_NODE_SZ; + } + + ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len); + + /* Switch to the next log LEB */ + if (c->lhead_offs) { + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + if (c->lhead_offs == 0) { + /* Must ensure next LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out; + } + + len = ALIGN(len, c->min_io_size); + dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); + err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM); + if (err) + goto out; + + *ltail_lnum = c->lhead_lnum; + + c->lhead_offs += len; + if (c->lhead_offs == c->leb_size) { + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + remove_buds(c); + + /* + * We have started the commit and now users may use the rest of the log + * for new writes. + */ + c->min_log_bytes = 0; + +out: + kfree(buf); + return err; +} + +/** + * ubifs_log_end_commit - end commit. + * @c: UBIFS file-system description object + * @ltail_lnum: new log tail LEB number + * + * This function is called on when the commit operation was finished. It + * moves log tail to new position and unmaps LEBs which contain obsolete data. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum) +{ + int err; + + /* + * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS + * writes during commit. Its only short "commit" start phase when + * writers are blocked. 
+ */ + mutex_lock(&c->log_mutex); + + dbg_log("old tail was LEB %d:0, new tail is LEB %d:0", + c->ltail_lnum, ltail_lnum); + + c->ltail_lnum = ltail_lnum; + /* + * The commit is finished and from now on it must be guaranteed that + * there is always enough space for the next commit. + */ + c->min_log_bytes = c->leb_size; + + spin_lock(&c->buds_lock); + c->bud_bytes -= c->cmt_bud_bytes; + spin_unlock(&c->buds_lock); + + err = dbg_check_bud_bytes(c); + + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * ubifs_log_post_commit - things to do after commit is completed. + * @c: UBIFS file-system description object + * @old_ltail_lnum: old log tail LEB number + * + * Release buds only after commit is completed, because they must be unchanged + * if recovery is needed. + * + * Unmap log LEBs only after commit is completed, because they may be needed for + * recovery. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) +{ + int lnum, err = 0; + + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + err = ubifs_return_leb(c, bud->lnum); + if (err) + return err; + list_del(&bud->list); + kfree(bud); + } + mutex_lock(&c->log_mutex); + for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; + lnum = ubifs_next_log_lnum(c, lnum)) { + dbg_log("unmap log LEB %d", lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + } +out: + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * struct done_ref - references that have been done. + * @rb: rb-tree node + * @lnum: LEB number + */ +struct done_ref { + struct rb_node rb; + int lnum; +}; + +/** + * done_already - determine if a reference has been done already. + * @done_tree: rb-tree to store references that have been done + * @lnum: LEB number of reference + * + * This function returns %1 if the reference has been done, %0 if not, otherwise + * a negative error code is returned. + */ +static int done_already(struct rb_root *done_tree, int lnum) +{ + struct rb_node **p = &done_tree->rb_node, *parent = NULL; + struct done_ref *dr; + + while (*p) { + parent = *p; + dr = rb_entry(parent, struct done_ref, rb); + if (lnum < dr->lnum) + p = &(*p)->rb_left; + else if (lnum > dr->lnum) + p = &(*p)->rb_right; + else + return 1; + } + + dr = kzalloc(sizeof(struct done_ref), GFP_NOFS); + if (!dr) + return -ENOMEM; + + dr->lnum = lnum; + + rb_link_node(&dr->rb, parent, p); + rb_insert_color(&dr->rb, done_tree); + + return 0; +} + +/** + * destroy_done_tree - destroy the done tree. + * @done_tree: done tree to destroy + */ +static void destroy_done_tree(struct rb_root *done_tree) +{ + struct rb_node *this = done_tree->rb_node; + struct done_ref *dr; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + dr = rb_entry(this, struct done_ref, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &dr->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(dr); + } +} + +/** + * add_node - add a node to the consolidated log. + * @c: UBIFS file-system description object + * @buf: buffer to which to add + * @lnum: LEB number to which to write is passed and returned here + * @offs: offset to where to write is passed and returned here + * @node: node to add + * + * This function returns %0 on success and a negative error code on failure. 
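A simplified user-space rendering of the same flow may help: records are appended 8-byte aligned, and when one does not fit into the remaining space of the current LEB, the buffer is padded up to the minimal I/O unit, written out, and the next LEB is started. The names and geometry below are illustrative, not the kernel's:

#include <stdio.h>
#include <string.h>

/* Illustrative geometry; the real values come from UBI (c->leb_size, c->min_io_size). */
#define LEB_SIZE    (128 * 1024)
#define MIN_IO_SIZE 2048

#define ALIGN_TO(x, a) (((x) + (a) - 1) & ~((a) - 1))

static char buf[LEB_SIZE];
static int cur_lnum = 10, cur_offs;

/* Stand-in for ubifs_leb_change(): just report what would be written. */
static void leb_change(int lnum, int bytes)
{
	printf("write %d bytes to LEB %d\n", bytes, lnum);
}

/* Same flow as add_node(): flush and move on when the record does not fit. */
static void add_record(const void *node, int len)
{
	if (len > LEB_SIZE - cur_offs) {
		int sz = ALIGN_TO(cur_offs, MIN_IO_SIZE);

		memset(buf + cur_offs, 0, sz - cur_offs); /* the kernel writes proper padding here     */
		leb_change(cur_lnum, sz);
		cur_lnum++;                               /* the kernel wraps via ubifs_next_log_lnum() */
		cur_offs = 0;
	}
	memcpy(buf + cur_offs, node, len);
	cur_offs += ALIGN_TO(len, 8);                     /* records stay 8-byte aligned */
}

int main(void)
{
	char node[8000] = { 0 };
	int i;

	for (i = 0; i < 40; i++)                          /* 40 x 8000 bytes does not fit in one LEB */
		add_record(node, sizeof(node));
	printf("consolidated head would be at LEB %d:%d\n", cur_lnum, cur_offs);
	return 0;
}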
+ */ +static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, + void *node) +{ + struct ubifs_ch *ch = node; + int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs; + + if (len > remains) { + int sz = ALIGN(*offs, c->min_io_size), err; + + ubifs_pad(c, buf + *offs, sz - *offs); + err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); + if (err) + return err; + *lnum = ubifs_next_log_lnum(c, *lnum); + *offs = 0; + } + memcpy(buf + *offs, node, len); + *offs += ALIGN(len, 8); + return 0; +} + +/** + * ubifs_consolidate_log - consolidate the log. + * @c: UBIFS file-system description object + * + * Repeated failed commits could cause the log to be full, but at least 1 LEB is + * needed for commit. This function rewrites the reference nodes in the log + * omitting duplicates, and failed CS nodes, and leaving no gaps. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_consolidate_log(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct rb_root done_tree = RB_ROOT; + int lnum, err, first = 1, write_lnum, offs = 0; + void *buf; + + dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum, + c->lhead_lnum); + buf = vmalloc(c->leb_size); + if (!buf) + return -ENOMEM; + lnum = c->ltail_lnum; + write_lnum = lnum; + while (1) { + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + goto out_free; + } + list_for_each_entry(snod, &sleb->nodes, list) { + switch (snod->type) { + case UBIFS_REF_NODE: { + struct ubifs_ref_node *ref = snod->node; + int ref_lnum = le32_to_cpu(ref->lnum); + + err = done_already(&done_tree, ref_lnum); + if (err < 0) + goto out_scan; + if (err != 1) { + err = add_node(c, buf, &write_lnum, + &offs, snod->node); + if (err) + goto out_scan; + } + break; + } + case UBIFS_CS_NODE: + if (!first) + break; + err = add_node(c, buf, &write_lnum, &offs, + snod->node); + if (err) + goto out_scan; + first = 0; + break; + } + } + ubifs_scan_destroy(sleb); + if (lnum == c->lhead_lnum) + break; + lnum = ubifs_next_log_lnum(c, lnum); + } + if (offs) { + int sz = ALIGN(offs, c->min_io_size); + + ubifs_pad(c, buf + offs, sz - offs); + err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM); + if (err) + goto out_free; + offs = ALIGN(offs, c->min_io_size); + } + destroy_done_tree(&done_tree); + vfree(buf); + if (write_lnum == c->lhead_lnum) { + ubifs_err("log is too full"); + return -EINVAL; + } + /* Unmap remaining LEBs */ + lnum = write_lnum; + do { + lnum = ubifs_next_log_lnum(c, lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } while (lnum != c->lhead_lnum); + c->lhead_lnum = write_lnum; + c->lhead_offs = offs; + dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs); + return 0; + +out_scan: + ubifs_scan_destroy(sleb); +out_free: + destroy_done_tree(&done_tree); + vfree(buf); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_bud_bytes - make sure bud bytes calculation are all right. + * @c: UBIFS file-system description object + * + * This function makes sure the amount of flash space used by closed buds + * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in + * case of failure. 
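For orientation, the accounting being verified is simply the sum of the still-writable space of every open bud, i.e. leb_size - bud->start per bud LEB. A minimal user-space sketch with an illustrative LEB size:

#include <stdio.h>

#define LEB_SIZE (128 * 1024)   /* illustrative LEB size */

struct bud { int lnum; int start; };

int main(void)
{
	/* Three open buds; 'start' is where journal data begins inside each bud LEB. */
	struct bud buds[] = { { 20, 0 }, { 21, 4096 }, { 22, 0 } };
	long long bud_bytes = 0;
	unsigned int i;

	for (i = 0; i < sizeof(buds) / sizeof(buds[0]); i++)
		bud_bytes += LEB_SIZE - buds[i].start;

	printf("c->bud_bytes must equal %lld here\n", bud_bytes);
	return 0;
}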
+ */ +static int dbg_check_bud_bytes(struct ubifs_info *c) +{ + int i, err = 0; + struct ubifs_bud *bud; + long long bud_bytes = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + spin_lock(&c->buds_lock); + for (i = 0; i < c->jhead_cnt; i++) + list_for_each_entry(bud, &c->jheads[i].buds_list, list) + bud_bytes += c->leb_size - bud->start; + + if (c->bud_bytes != bud_bytes) { + ubifs_err("bad bud_bytes %lld, calculated %lld", + c->bud_bytes, bud_bytes); + err = -EINVAL; + } + spin_unlock(&c->buds_lock); + + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c new file mode 100644 index 00000000..667884f4 --- /dev/null +++ b/fs/ubifs/lprops.c @@ -0,0 +1,1319 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the functions that access LEB properties and their + * categories. LEBs are categorized based on the needs of UBIFS, and the + * categories are stored as either heaps or lists to provide a fast way of + * finding a LEB in a particular category. For example, UBIFS may need to find + * an empty LEB for the journal, or a very dirty LEB for garbage collection. + */ + +#include "ubifs.h" + +/** + * get_heap_comp_val - get the LEB properties value for heap comparisons. + * @lprops: LEB properties + * @cat: LEB category + */ +static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_FREE: + return lprops->free; + case LPROPS_DIRTY_IDX: + return lprops->free + lprops->dirty; + default: + return lprops->dirty; + } +} + +/** + * move_up_lpt_heap - move a new heap entry up as far as possible. + * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @cat: LEB category + * + * New entries to a heap are added at the bottom and then moved up until the + * parent's value is greater. In the case of LPT's category heaps, the value + * is either the amount of free space or the amount of dirty space, depending + * on the category. + */ +static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int cat) +{ + int val1, val2, hpos; + + hpos = lprops->hpos; + if (!hpos) + return; /* Already top of the heap */ + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater, move up the heap */ + do { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val2 >= val1) + return; + /* Greater than parent so move up */ + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + } while (hpos); +} + +/** + * adjust_lpt_heap - move a changed heap entry up or down the heap. 
+ * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @hpos: heap position of @lprops + * @cat: LEB category + * + * Changed entries in a heap are moved up or down until the parent's value is + * greater. In the case of LPT's category heaps, the value is either the amount + * of free space or the amount of dirty space, depending on the category. + */ +static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int hpos, int cat) +{ + int val1, val2, val3, cpos; + + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater than parent, move up the heap */ + if (hpos) { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 > val2) { + /* Greater than parent so move up */ + while (1) { + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + if (!hpos) + return; + ppos = (hpos - 1) / 2; + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 <= val2) + return; + /* Still greater than parent so keep going */ + } + } + } + + /* Not greater than parent, so compare to children */ + while (1) { + /* Compare to left child */ + cpos = hpos * 2 + 1; + if (cpos >= heap->cnt) + return; + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val2) { + /* Less than left child, so promote biggest child */ + if (cpos + 1 < heap->cnt) { + val3 = get_heap_comp_val(heap->arr[cpos + 1], + cat); + if (val3 > val2) + cpos += 1; /* Right child is bigger */ + } + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + /* Compare to right child */ + cpos += 1; + if (cpos >= heap->cnt) + return; + val3 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val3) { + /* Less than right child, so promote right child */ + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + return; + } +} + +/** + * add_to_lpt_heap - add LEB properties to a LEB category heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category + * + * This function returns %1 if @lprops is added to the heap for LEB category + * @cat, otherwise %0 is returned because the heap is full. 
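The heap code above is an ordinary array-backed binary heap keyed on free or dirty space, where the parent of slot hpos is (hpos - 1) / 2. A minimal user-space sketch of the "move up" step, using toy values rather than UBIFS structures, which also shows why keeping these categories as heaps makes "find the best LEB" a single lookup at slot 0:

#include <stdio.h>

/* A toy max-heap of "free space" values using the same index arithmetic:
 * the parent of slot hpos is (hpos - 1) / 2. */
static int heap[16];
static int cnt;

static void sift_up(int hpos)
{
	int ppos, tmp;

	while (hpos) {
		ppos = (hpos - 1) / 2;
		if (heap[ppos] >= heap[hpos])
			break;                  /* parent is bigger: heap property holds    */
		tmp = heap[ppos];               /* otherwise swap with the parent and go up */
		heap[ppos] = heap[hpos];
		heap[hpos] = tmp;
		hpos = ppos;
	}
}

int main(void)
{
	int vals[] = { 4096, 65536, 2048, 126976 };
	int i;

	for (i = 0; i < 4; i++) {
		heap[cnt] = vals[i];            /* new entries start at the bottom ...          */
		sift_up(cnt++);                 /* ... and move up while bigger than the parent */
	}
	/* Slot 0 now holds the entry with the most free space, which is why the
	 * heap categories allow a cheap "find best candidate" operation. */
	printf("best candidate: %d bytes free\n", heap[0]);
	return 0;
}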
+ */ +static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if (heap->cnt >= heap->max_cnt) { + const int b = LPT_HEAP_SZ / 2 - 1; + int cpos, val1, val2; + + /* Compare to some other LEB on the bottom of heap */ + /* Pick a position kind of randomly */ + cpos = (((size_t)lprops >> 4) & b) + b; + ubifs_assert(cpos >= b); + ubifs_assert(cpos < LPT_HEAP_SZ); + ubifs_assert(cpos < heap->cnt); + + val1 = get_heap_comp_val(lprops, cat); + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 > val2) { + struct ubifs_lprops *lp; + + lp = heap->arr[cpos]; + lp->flags &= ~LPROPS_CAT_MASK; + lp->flags |= LPROPS_UNCAT; + list_add(&lp->list, &c->uncat_list); + lprops->hpos = cpos; + heap->arr[cpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } + dbg_check_heap(c, heap, cat, -1); + return 0; /* Not added to heap */ + } else { + lprops->hpos = heap->cnt++; + heap->arr[lprops->hpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } +} + +/** + * remove_from_lpt_heap - remove LEB properties from a LEB category heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category + */ +static void remove_from_lpt_heap(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + ubifs_assert(hpos >= 0 && hpos < heap->cnt); + ubifs_assert(heap->arr[hpos] == lprops); + heap->cnt -= 1; + if (hpos < heap->cnt) { + heap->arr[hpos] = heap->arr[heap->cnt]; + heap->arr[hpos]->hpos = hpos; + adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat); + } + dbg_check_heap(c, heap, cat, -1); +} + +/** + * lpt_heap_replace - replace lprops in a category heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * @cat: LEB category + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * the category heaps to those lprops must be updated to point to the new + * lprops. This function does that. + */ +static void lpt_heap_replace(struct ubifs_info *c, + struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = new_lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + heap->arr[hpos] = new_lprops; +} + +/** + * ubifs_add_to_cat - add LEB properties to a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category to which to add + * + * LEB properties are categorized to enable fast find operations. 
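For orientation, here is a cut-down user-space sketch of the categorization rules that ubifs_categorize_lprops() implements further down. It is restricted to non-index LEBs, uses illustrative geometry and watermarks, and omits the LPROPS_TAKEN and FRDI_IDX/DIRTY_IDX cases:

#include <stdio.h>

/* Illustrative geometry and watermark; the real values are computed at mount time. */
#define LEB_SIZE 131072
#define DEAD_WM  2048

/* Cut-down rules for non-index LEBs only. */
static const char *categorize(int free, int dirty)
{
	if (free == LEB_SIZE)
		return "LPROPS_EMPTY";
	if (free + dirty == LEB_SIZE)
		return "LPROPS_FREEABLE";
	if (dirty >= DEAD_WM && dirty > free)
		return "LPROPS_DIRTY";
	if (free > 0)
		return "LPROPS_FREE";
	return "LPROPS_UNCAT";
}

int main(void)
{
	printf("%s\n", categorize(131072, 0));      /* freshly unmapped LEB                  */
	printf("%s\n", categorize(1024, 130048));   /* nothing valid left, can be made empty */
	printf("%s\n", categorize(512, 100000));    /* mostly garbage: good GC candidate     */
	printf("%s\n", categorize(60000, 1024));    /* plenty of room for new journal data   */
	return 0;
}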
+ */ +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + if (add_to_lpt_heap(c, lprops, cat)) + break; + /* No more room on heap so make it un-categorized */ + cat = LPROPS_UNCAT; + /* Fall through */ + case LPROPS_UNCAT: + list_add(&lprops->list, &c->uncat_list); + break; + case LPROPS_EMPTY: + list_add(&lprops->list, &c->empty_list); + break; + case LPROPS_FREEABLE: + list_add(&lprops->list, &c->freeable_list); + c->freeable_cnt += 1; + break; + case LPROPS_FRDI_IDX: + list_add(&lprops->list, &c->frdi_idx_list); + break; + default: + ubifs_assert(0); + } + lprops->flags &= ~LPROPS_CAT_MASK; + lprops->flags |= cat; +} + +/** + * ubifs_remove_from_cat - remove LEB properties from a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category from which to remove + * + * LEB properties are categorized to enable fast find operations. + */ +static void ubifs_remove_from_cat(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + remove_from_lpt_heap(c, lprops, cat); + break; + case LPROPS_FREEABLE: + c->freeable_cnt -= 1; + ubifs_assert(c->freeable_cnt >= 0); + /* Fall through */ + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FRDI_IDX: + ubifs_assert(!list_empty(&lprops->list)); + list_del(&lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_replace_cat - replace lprops in a category list or heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops) +{ + int cat; + + cat = new_lprops->flags & LPROPS_CAT_MASK; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + lpt_heap_replace(c, old_lprops, new_lprops, cat); + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_replace(&old_lprops->list, &new_lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_ensure_cat - ensure LEB properties are categorized. + * @c: UBIFS file-system description object + * @lprops: LEB properties + * + * A LEB may have fallen off of the bottom of a heap, and ended up as + * un-categorized even though it has enough space for us now. If that is the + * case this function will put the LEB back onto a heap. + */ +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int cat = lprops->flags & LPROPS_CAT_MASK; + + if (cat != LPROPS_UNCAT) + return; + cat = ubifs_categorize_lprops(c, lprops); + if (cat == LPROPS_UNCAT) + return; + ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT); + ubifs_add_to_cat(c, lprops, cat); +} + +/** + * ubifs_categorize_lprops - categorize LEB properties. + * @c: UBIFS file-system description object + * @lprops: LEB properties to categorize + * + * LEB properties are categorized to enable fast find operations. This function + * returns the LEB category to which the LEB properties belong. 
Note however + * that if the LEB category is stored as a heap and the heap is full, the + * LEB properties may have their category changed to %LPROPS_UNCAT. + */ +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops) +{ + if (lprops->flags & LPROPS_TAKEN) + return LPROPS_UNCAT; + + if (lprops->free == c->leb_size) { + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return LPROPS_EMPTY; + } + + if (lprops->free + lprops->dirty == c->leb_size) { + if (lprops->flags & LPROPS_INDEX) + return LPROPS_FRDI_IDX; + else + return LPROPS_FREEABLE; + } + + if (lprops->flags & LPROPS_INDEX) { + if (lprops->dirty + lprops->free >= c->min_idx_node_sz) + return LPROPS_DIRTY_IDX; + } else { + if (lprops->dirty >= c->dead_wm && + lprops->dirty > lprops->free) + return LPROPS_DIRTY; + if (lprops->free > 0) + return LPROPS_FREE; + } + + return LPROPS_UNCAT; +} + +/** + * change_category - change LEB properties category. + * @c: UBIFS file-system description object + * @lprops: LEB properties to re-categorize + * + * LEB properties are categorized to enable fast find operations. When the LEB + * properties change they must be re-categorized. + */ +static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int old_cat = lprops->flags & LPROPS_CAT_MASK; + int new_cat = ubifs_categorize_lprops(c, lprops); + + if (old_cat == new_cat) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1]; + + /* lprops on a heap now must be moved up or down */ + if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT) + return; /* Not on a heap */ + heap = &c->lpt_heap[new_cat - 1]; + adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat); + } else { + ubifs_remove_from_cat(c, lprops, old_cat); + ubifs_add_to_cat(c, lprops, new_cat); + } +} + +/** + * ubifs_calc_dark - calculate LEB dark space size. + * @c: the UBIFS file-system description object + * @spc: amount of free and dirty space in the LEB + * + * This function calculates and returns amount of dark space in an LEB which + * has @spc bytes of free and dirty space. + * + * UBIFS is trying to account the space which might not be usable, and this + * space is called "dark space". For example, if an LEB has only %512 free + * bytes, it is dark space, because it cannot fit a large data node. + */ +int ubifs_calc_dark(const struct ubifs_info *c, int spc) +{ + ubifs_assert(!(spc & 7)); + + if (spc < c->dark_wm) + return spc; + + /* + * If we have slightly more space then the dark space watermark, we can + * anyway safely assume it we'll be able to write a node of the + * smallest size there. + */ + if (spc - c->dark_wm < MIN_WRITE_SZ) + return spc - MIN_WRITE_SZ; + + return c->dark_wm; +} + +/** + * is_lprops_dirty - determine if LEB properties are dirty. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to test + */ +static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + struct ubifs_pnode *pnode; + int pos; + + pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1); + pnode = (struct ubifs_pnode *)container_of(lprops - pos, + struct ubifs_pnode, + lprops[0]); + return !test_bit(COW_ZNODE, &pnode->flags) && + test_bit(DIRTY_CNODE, &pnode->flags); +} + +/** + * ubifs_change_lp - change LEB properties. 
+ * @c: the UBIFS file-system description object + * @lp: LEB properties to change + * @free: new free space amount + * @dirty: new dirty space amount + * @flags: new flags + * @idx_gc_cnt: change to the count of @idx_gc list + * + * This function changes LEB properties (@free, @dirty or @flag). However, the + * property which has the %LPROPS_NC value is not changed. Returns a pointer to + * the updated LEB properties on success and a negative error code on failure. + * + * Note, the LEB properties may have had to be copied (due to COW) and + * consequently the pointer returned may not be the same as the pointer + * passed. + */ +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt) +{ + /* + * This is the only function that is allowed to change lprops, so we + * discard the "const" qualifier. + */ + struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; + + dbg_lp("LEB %d, free %d, dirty %d, flags %d", + lprops->lnum, free, dirty, flags); + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + ubifs_assert(c->freeable_cnt >= 0); + ubifs_assert(c->freeable_cnt <= c->main_lebs); + ubifs_assert(c->lst.taken_empty_lebs >= 0); + ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs); + ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7)); + ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7)); + ubifs_assert(!(c->lst.total_used & 7)); + ubifs_assert(free == LPROPS_NC || free >= 0); + ubifs_assert(dirty == LPROPS_NC || dirty >= 0); + + if (!is_lprops_dirty(c, lprops)) { + lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum); + if (IS_ERR(lprops)) + return lprops; + } else + ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum)); + + ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); + + spin_lock(&c->space_lock); + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs -= 1; + + if (!(lprops->flags & LPROPS_INDEX)) { + int old_spc; + + old_spc = lprops->free + lprops->dirty; + if (old_spc < c->dead_wm) + c->lst.total_dead -= old_spc; + else + c->lst.total_dark -= ubifs_calc_dark(c, old_spc); + + c->lst.total_used -= c->leb_size - old_spc; + } + + if (free != LPROPS_NC) { + free = ALIGN(free, 8); + c->lst.total_free += free - lprops->free; + + /* Increase or decrease empty LEBs counter if needed */ + if (free == c->leb_size) { + if (lprops->free != c->leb_size) + c->lst.empty_lebs += 1; + } else if (lprops->free == c->leb_size) + c->lst.empty_lebs -= 1; + lprops->free = free; + } + + if (dirty != LPROPS_NC) { + dirty = ALIGN(dirty, 8); + c->lst.total_dirty += dirty - lprops->dirty; + lprops->dirty = dirty; + } + + if (flags != LPROPS_NC) { + /* Take care about indexing LEBs counter if needed */ + if ((lprops->flags & LPROPS_INDEX)) { + if (!(flags & LPROPS_INDEX)) + c->lst.idx_lebs -= 1; + } else if (flags & LPROPS_INDEX) + c->lst.idx_lebs += 1; + lprops->flags = flags; + } + + if (!(lprops->flags & LPROPS_INDEX)) { + int new_spc; + + new_spc = lprops->free + lprops->dirty; + if (new_spc < c->dead_wm) + c->lst.total_dead += new_spc; + else + c->lst.total_dark += ubifs_calc_dark(c, new_spc); + + c->lst.total_used += c->leb_size - new_spc; + } + + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs += 1; + + change_category(c, lprops); + c->idx_gc_cnt += idx_gc_cnt; + spin_unlock(&c->space_lock); + return 
lprops; +} + +/** + * ubifs_get_lp_stats - get lprops statistics. + * @c: UBIFS file-system description object + * @st: return statistics + */ +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst) +{ + spin_lock(&c->space_lock); + memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_change_one_lp - change LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space + * @flags_set: flags to set + * @flags_clean: flags to clean + * @idx_gc_cnt: change to the count of idx_gc list + * + * This function changes properties of LEB @lnum. It is a helper wrapper over + * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the + * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and + * a negative error code in case of failure. + */ +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + if (err) + ubifs_err("cannot change properties of LEB %d, error %d", + lnum, err); + return err; +} + +/** + * ubifs_update_one_lp - update LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space to add + * @flags_set: flags to set + * @flags_clean: flags to clean + * + * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to + * current dirty space, not substitutes it. + */ +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + if (err) + ubifs_err("cannot update properties of LEB %d, error %d", + lnum, err); + return err; +} + +/** + * ubifs_read_one_lp - read LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to read properties for + * @lp: where to store read properties + * + * This helper function reads properties of a LEB @lnum and stores them in @lp. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) +{ + int err = 0; + const struct ubifs_lprops *lpp; + + ubifs_get_lprops(c); + + lpp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lpp)) { + err = PTR_ERR(lpp); + ubifs_err("cannot read properties of LEB %d, error %d", + lnum, err); + goto out; + } + + memcpy(lp, lpp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fast_find_free - try to find a LEB with free space quickly. 
+ * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a LEB with free space or %NULL if + * the function is unable to find a LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + heap = &c->lpt_heap[LPROPS_FREE - 1]; + if (heap->cnt == 0) + return NULL; + + lprops = heap->arr[0]; + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_fast_find_empty - try to find an empty LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for an empty LEB or %NULL if the + * function is unable to find an empty LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->empty_list)) + return NULL; + + lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free == c->leb_size); + return lprops; +} + +/** + * ubifs_fast_find_freeable - try to find a freeable LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable LEB or %NULL if the + * function is unable to find a freeable LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->freeable_list)) + return NULL; + + lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(c->freeable_cnt > 0); + return lprops; +} + +/** + * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable index LEB or %NULL if the + * function is unable to find a freeable index LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->frdi_idx_list)) + return NULL; + + lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + return lprops; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_cats - check category heaps and lists. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
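+ *
+ * For instance, it verifies that every LEB on c->empty_list really is empty
+ * (free == leb_size), that c->freeable_cnt matches the length of
+ * c->freeable_list, and that every LPT heap entry records its own position in
+ * 'hpos'.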
+ */ +int dbg_check_cats(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct list_head *pos; + int i, cat; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return 0; + + list_for_each_entry(lprops, &c->empty_list, list) { + if (lprops->free != c->leb_size) { + ubifs_err("non-empty LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + i = 0; + list_for_each_entry(lprops, &c->freeable_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + i += 1; + } + if (i != c->freeable_cnt) { + ubifs_err("freeable list count %d expected %d", i, + c->freeable_cnt); + return -EINVAL; + } + + i = 0; + list_for_each(pos, &c->idx_gc) + i += 1; + if (i != c->idx_gc_cnt) { + ubifs_err("idx_gc list count %d expected %d", i, + c->idx_gc_cnt); + return -EINVAL; + } + + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (!(lprops->flags & LPROPS_INDEX)) { + ubifs_err("non-index LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (!lprops) { + ubifs_err("null ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->hpos != i) { + ubifs_err("bad ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB in LPT heap cat %d", cat); + return -EINVAL; + } + } + } + + return 0; +} + +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos) +{ + int i = 0, j, err = 0; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return; + + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + struct ubifs_lprops *lp; + + if (i != add_pos) + if ((lprops->flags & LPROPS_CAT_MASK) != cat) { + err = 1; + goto out; + } + if (lprops->hpos != i) { + err = 2; + goto out; + } + lp = ubifs_lpt_lookup(c, lprops->lnum); + if (IS_ERR(lp)) { + err = 3; + goto out; + } + if (lprops != lp) { + dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", + (size_t)lprops, (size_t)lp, lprops->lnum, + lp->lnum); + err = 4; + goto out; + } + for (j = 0; j < i; j++) { + lp = heap->arr[j]; + if (lp == lprops) { + err = 5; + goto out; + } + if (lp->lnum == lprops->lnum) { + err = 6; + goto out; + } + } + } +out: + if (err) { + dbg_msg("failed 
cat %d hpos %d err %d", cat, i, err); + dbg_dump_stack(); + dbg_dump_heap(c, heap, cat); + } +} + +/** + * scan_check_cb - scan callback. + * @c: the UBIFS file-system description object + * @lp: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @lst: lprops statistics to update + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_check_cb(struct ubifs_info *c, + const struct ubifs_lprops *lp, int in_tree, + struct ubifs_lp_stats *lst) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; + void *buf = NULL; + + cat = lp->flags & LPROPS_CAT_MASK; + if (cat != LPROPS_UNCAT) { + cat = ubifs_categorize_lprops(c, lp); + if (cat != (lp->flags & LPROPS_CAT_MASK)) { + ubifs_err("bad LEB category %d expected %d", + (lp->flags & LPROPS_CAT_MASK), cat); + return -EINVAL; + } + } + + /* Check lp is on its category list (if it has one) */ + if (in_tree) { + struct list_head *list = NULL; + + switch (cat) { + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + } + if (list) { + struct ubifs_lprops *lprops; + int found = 0; + + list_for_each_entry(lprops, list, list) { + if (lprops == lp) { + found = 1; + break; + } + } + if (!found) { + ubifs_err("bad LPT list (category %d)", cat); + return -EINVAL; + } + } + } + + /* Check lp is on its category heap (if it has one) */ + if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || + lp != heap->arr[lp->hpos]) { + ubifs_err("bad LPT heap (category %d)", cat); + return -EINVAL; + } + } + + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * After an unclean unmount, empty and freeable LEBs + * may contain garbage - do not scan them. + */ + if (lp->free == c->leb_size) { + lst->empty_lebs += 1; + lst->total_free += c->leb_size; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + if (lp->free + lp->dirty == c->leb_size && + !(lp->flags & LPROPS_INDEX)) { + lst->total_free += lp->free; + lst->total_dirty += lp->dirty; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + ret = PTR_ERR(sleb); + if (ret == -EUCLEAN) { + dbg_dump_lprops(c); + dbg_dump_budg(c, &c->bi); + } + goto out; + } + + is_idx = -1; + list_for_each_entry(snod, &sleb->nodes, list) { + int found, level = 0; + + cond_resched(); + + if (is_idx == -1) + is_idx = (snod->type == UBIFS_IDX_NODE) ? 
1 : 0; + + if (is_idx && snod->type != UBIFS_IDX_NODE) { + ubifs_err("indexing node in data LEB %d:%d", + lnum, snod->offs); + goto out_destroy; + } + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_idx_node *idx = snod->node; + + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + } + + found = ubifs_tnc_has_node(c, &snod->key, level, lnum, + snod->offs, is_idx); + if (found) { + if (found < 0) + goto out_destroy; + used += ALIGN(snod->len, 8); + } + } + + free = c->leb_size - sleb->endpt; + dirty = sleb->endpt - used; + + if (free > c->leb_size || free < 0 || dirty > c->leb_size || + dirty < 0) { + ubifs_err("bad calculated accounting for LEB %d: " + "free %d, dirty %d", lnum, free, dirty); + goto out_destroy; + } + + if (lp->free + lp->dirty == c->leb_size && + free + dirty == c->leb_size) + if ((is_idx && !(lp->flags & LPROPS_INDEX)) || + (!is_idx && free == c->leb_size) || + lp->free == c->leb_size) { + /* + * Empty or freeable LEBs could contain index + * nodes from an uncompleted commit due to an + * unclean unmount. Or they could be empty for + * the same reason. Or it may simply not have been + * unmapped. + */ + free = lp->free; + dirty = lp->dirty; + is_idx = 0; + } + + if (is_idx && lp->free + lp->dirty == free + dirty && + lnum != c->ihead_lnum) { + /* + * After an unclean unmount, an index LEB could have a different + * amount of free space than the value recorded by lprops. That + * is because the in-the-gaps method may use free space or + * create free space (as a side-effect of using ubi_leb_change + * and not writing the whole LEB). The incorrect free space + * value is not a problem because the index is only ever + * allocated empty LEBs, so there will never be an attempt to + * write to the free space at the end of an index LEB - except + * by the in-the-gaps method for which it is not a problem. + */ + free = lp->free; + dirty = lp->dirty; + } + + if (lp->free != free || lp->dirty != dirty) + goto out_print; + + if (is_idx && !(lp->flags & LPROPS_INDEX)) { + if (free == c->leb_size) + /* Free but not unmapped LEB, it's fine */ + is_idx = 0; + else { + ubifs_err("indexing node without indexing " + "flag"); + goto out_print; + } + } + + if (!is_idx && (lp->flags & LPROPS_INDEX)) { + ubifs_err("data node with indexing flag"); + goto out_print; + } + + if (free == c->leb_size) + lst->empty_lebs += 1; + + if (is_idx) + lst->idx_lebs += 1; + + if (!(lp->flags & LPROPS_INDEX)) + lst->total_used += c->leb_size - free - dirty; + lst->total_free += free; + lst->total_dirty += dirty; + + if (!(lp->flags & LPROPS_INDEX)) { + int spc = free + dirty; + + if (spc < c->dead_wm) + lst->total_dead += spc; + else + lst->total_dark += ubifs_calc_dark(c, spc); + } + + ubifs_scan_destroy(sleb); + vfree(buf); + return LPT_SCAN_CONTINUE; + +out_print: + ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " + "should be free %d, dirty %d", + lnum, lp->free, lp->dirty, lp->flags, free, dirty); + dbg_dump_leb(c, lnum); +out_destroy: + ubifs_scan_destroy(sleb); + ret = -EINVAL; +out: + vfree(buf); + return ret; +} + +/** + * dbg_check_lprops - check all LEB properties. + * @c: UBIFS file-system description object + * + * This function checks all LEB properties and makes sure they are all correct. + * It returns zero if everything is fine, %-EINVAL if there is an inconsistency + * and other negative error codes in case of other errors. 
This function is + * called while the file system is locked (because of commit start), so no + * additional locking is required. Note that locking the LPT mutex would cause + * a circular lock dependency with the TNC mutex. + */ +int dbg_check_lprops(struct ubifs_info *c) +{ + int i, err; + struct ubifs_lp_stats lst; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + /* + * As we are going to scan the media, the write buffers have to be + * synchronized. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + memset(&lst, 0, sizeof(struct ubifs_lp_stats)); + err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, + (ubifs_lpt_scan_callback)scan_check_cb, + &lst); + if (err && err != -ENOSPC) + goto out; + + if (lst.empty_lebs != c->lst.empty_lebs || + lst.idx_lebs != c->lst.idx_lebs || + lst.total_free != c->lst.total_free || + lst.total_dirty != c->lst.total_dirty || + lst.total_used != c->lst.total_used) { + ubifs_err("bad overall accounting"); + ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " + "total_free %lld, total_dirty %lld, total_used %lld", + lst.empty_lebs, lst.idx_lebs, lst.total_free, + lst.total_dirty, lst.total_used); + ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " + "total_free %lld, total_dirty %lld, total_used %lld", + c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, + c->lst.total_dirty, c->lst.total_used); + err = -EINVAL; + goto out; + } + + if (lst.total_dead != c->lst.total_dead || + lst.total_dark != c->lst.total_dark) { + ubifs_err("bad dead/dark space accounting"); + ubifs_err("calculated: total_dead %lld, total_dark %lld", + lst.total_dead, lst.total_dark); + ubifs_err("read from lprops: total_dead %lld, total_dark %lld", + c->lst.total_dead, c->lst.total_dark); + err = -EINVAL; + goto out; + } + + err = dbg_check_cats(c); +out: + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c new file mode 100644 index 00000000..ef5155e1 --- /dev/null +++ b/fs/ubifs/lpt.c @@ -0,0 +1,2277 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the LEB properties tree (LPT) area. The LPT area + * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and + * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits + * between the log and the orphan area. + * + * The LPT area is like a miniature self-contained file system. It is required + * that it never runs out of space, is fast to access and update, and scales + * logarithmically. The LEB properties tree is implemented as a wandering tree + * much like the TNC, and the LPT area has its own garbage collection. 
+ * + * The LPT has two slightly different forms called the "small model" and the + * "big model". The small model is used when the entire LEB properties table + * can be written into a single eraseblock. In that case, garbage collection + * consists of just writing the whole table, which therefore makes all other + * eraseblocks reusable. In the case of the big model, dirty eraseblocks are + * selected for garbage collection, which consists of marking the clean nodes in + * that LEB as dirty, and then only the dirty nodes are written out. Also, in + * the case of the big model, a table of LEB numbers is saved so that the entire + * LPT does not have to be scanned looking for empty eraseblocks when UBIFS is first + * mounted. + */ + +#include "ubifs.h" +#include <linux/crc16.h> +#include <linux/math64.h> +#include <linux/slab.h> + +/** + * do_calc_lpt_geom - calculate sizes for the LPT area. + * @c: the UBIFS file-system description object + * + * Calculate the sizes of LPT bit fields, nodes, and tree, based on the + * properties of the flash and whether LPT is "big" (c->big_lpt). + */ +static void do_calc_lpt_geom(struct ubifs_info *c) +{ + int i, n, bits, per_leb_wastage, max_pnode_cnt; + long long sz, tot_wastage; + + n = c->main_lebs + c->max_leb_cnt - c->leb_cnt; + max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT); + + c->lpt_hght = 1; + n = UBIFS_LPT_FANOUT; + while (n < max_pnode_cnt) { + c->lpt_hght += 1; + n <<= UBIFS_LPT_FANOUT_SHIFT; + } + + c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + + n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT); + c->nnode_cnt = n; + for (i = 1; i < c->lpt_hght; i++) { + n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT); + c->nnode_cnt += n; + } + + c->space_bits = fls(c->leb_size) - 3; + c->lpt_lnum_bits = fls(c->lpt_lebs); + c->lpt_offs_bits = fls(c->leb_size - 1); + c->lpt_spc_bits = fls(c->leb_size); + + n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT); + c->pcnt_bits = fls(n - 1); + + c->lnum_bits = fls(c->max_leb_cnt - 1); + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + (c->big_lpt ? c->pcnt_bits : 0) + + (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT; + c->pnode_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + (c->big_lpt ? c->pcnt_bits : 0) + + (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT; + c->nnode_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + c->lpt_lebs * c->lpt_spc_bits * 2; + c->ltab_sz = (bits + 7) / 8; + + bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS + + c->lnum_bits * c->lsave_cnt; + c->lsave_sz = (bits + 7) / 8; + + /* Calculate the minimum LPT size */ + c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + c->lpt_sz += c->ltab_sz; + if (c->big_lpt) + c->lpt_sz += c->lsave_sz; + + /* Add wastage */ + sz = c->lpt_sz; + per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz); + sz += per_leb_wastage; + tot_wastage = per_leb_wastage; + while (sz > c->leb_size) { + sz += per_leb_wastage; + sz -= c->leb_size; + tot_wastage += per_leb_wastage; + } + tot_wastage += ALIGN(sz, c->min_io_size) - sz; + c->lpt_sz += tot_wastage; +} + +/** + * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure.
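+ *
+ * As a worked example of the field sizing done by 'do_calc_lpt_geom()' above:
+ * with a 128KiB LEB, space_bits = fls(131072) - 3 = 15. That is sufficient
+ * because free and dirty space are stored on the media in 8-byte multiples,
+ * so the largest packed value is 131072 / 8 = 16384, which fits in 15 bits.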
+ */ +int ubifs_calc_lpt_geom(struct ubifs_info *c) +{ + int lebs_needed; + long long sz; + + do_calc_lpt_geom(c); + + /* Verify that lpt_lebs is big enough */ + sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); + if (lebs_needed > c->lpt_lebs) { + ubifs_err("too few LPT LEBs"); + return -EINVAL; + } + + /* Verify that ltab fits in a single LEB (since ltab is a single node */ + if (c->ltab_sz > c->leb_size) { + ubifs_err("LPT ltab too big"); + return -EINVAL; + } + + c->check_lpt_free = c->big_lpt; + return 0; +} + +/** + * calc_dflt_lpt_geom - calculate default LPT geometry. + * @c: the UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @big_lpt: whether the LPT area is "big" is returned here + * + * The size of the LPT area depends on parameters that themselves are dependent + * on the size of the LPT area. This function, successively recalculates the LPT + * area geometry until the parameters and resultant geometry are consistent. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, + int *big_lpt) +{ + int i, lebs_needed; + long long sz; + + /* Start by assuming the minimum number of LPT LEBs */ + c->lpt_lebs = UBIFS_MIN_LPT_LEBS; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + + /* And assume we will use the small LPT model */ + c->big_lpt = 0; + + /* + * Calculate the geometry based on assumptions above and then see if it + * makes sense + */ + do_calc_lpt_geom(c); + + /* Small LPT model must have lpt_sz < leb_size */ + if (c->lpt_sz > c->leb_size) { + /* Nope, so try again using big LPT model */ + c->big_lpt = 1; + do_calc_lpt_geom(c); + } + + /* Now check there are enough LPT LEBs */ + for (i = 0; i < 64 ; i++) { + sz = c->lpt_sz * 4; /* Allow 4 times the size */ + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); + if (lebs_needed > c->lpt_lebs) { + /* Not enough LPT LEBs so try again with more */ + c->lpt_lebs = lebs_needed; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + do_calc_lpt_geom(c); + continue; + } + if (c->ltab_sz > c->leb_size) { + ubifs_err("LPT ltab too big"); + return -EINVAL; + } + *main_lebs = c->main_lebs; + *big_lpt = c->big_lpt; + return 0; + } + return -EINVAL; +} + +/** + * pack_bits - pack bit fields end-to-end. 
+ * @addr: address at which to pack (passed and next address returned) + * @pos: bit position at which to pack (passed and next position returned) + * @val: value to pack + * @nrbits: number of bits of value to pack (1-32) + */ +static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits) +{ + uint8_t *p = *addr; + int b = *pos; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + ubifs_assert((val >> nrbits) == 0 || nrbits == 32); + if (b) { + *p |= ((uint8_t)val) << b; + nrbits += b; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= (8 - b)); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 32) + *++p = (uint8_t)(val >>= 8); + } + } + } + } else { + *p = (uint8_t)val; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) + *++p = (uint8_t)(val >>= 8); + } + } + } + b = nrbits & 7; + if (b == 0) + p++; + *addr = p; + *pos = b; +} + +/** + * ubifs_unpack_bits - unpack bit fields. + * @addr: address at which to unpack (passed and next address returned) + * @pos: bit position at which to unpack (passed and next position returned) + * @nrbits: number of bits of value to unpack (1-32) + * + * This functions returns the value unpacked. + */ +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits) +{ + const int k = 32 - nrbits; + uint8_t *p = *addr; + int b = *pos; + uint32_t uninitialized_var(val); + const int bytes = (nrbits + b + 7) >> 3; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + if (b) { + switch (bytes) { + case 2: + val = p[1]; + break; + case 3: + val = p[1] | ((uint32_t)p[2] << 8); + break; + case 4: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16); + break; + case 5: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16) | + ((uint32_t)p[4] << 24); + } + val <<= (8 - b); + val |= *p >> b; + nrbits += b; + } else { + switch (bytes) { + case 1: + val = p[0]; + break; + case 2: + val = p[0] | ((uint32_t)p[1] << 8); + break; + case 3: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16); + break; + case 4: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); + break; + } + } + val <<= k; + val >>= k; + b = nrbits & 7; + p += nrbits >> 3; + *addr = p; + *pos = b; + ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32); + return val; +} + +/** + * ubifs_pack_pnode - pack all the bit fields of a pnode. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @pnode: pnode to pack + */ +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, pnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + pack_bits(&addr, &pos, pnode->lprops[i].free >> 3, + c->space_bits); + pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3, + c->space_bits); + if (pnode->lprops[i].flags & LPROPS_INDEX) + pack_bits(&addr, &pos, 1, 1); + else + pack_bits(&addr, &pos, 0, 1); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->pnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_nnode - pack all the bit fields of a nnode. 
+ * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @nnode: nnode to pack + */ +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, nnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + + if (lnum == 0) + lnum = c->lpt_last + 1; + pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits); + pack_bits(&addr, &pos, nnode->nbranch[i].offs, + c->lpt_offs_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->nnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_ltab - pack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @ltab: LPT's own lprops table to pack + */ +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + struct ubifs_lpt_lprops *ltab) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lpt_lebs; i++) { + pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits); + pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->ltab_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_lsave - pack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @lsave: LPT's save table to pack + */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lsave_cnt; i++) + pack_bits(&addr, &pos, lsave[i], c->lnum_bits); + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->lsave_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number to which to add dirty space + * @dirty: amount of dirty space to add + */ +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + if (!dirty || !lnum) + return; + dbg_lp("LEB %d add %d to %d", + lnum, dirty, c->ltab[lnum - c->lpt_first].dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * set_ltab - set LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space + */ +static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d %d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty = dirty; +} + +/** + * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties. 
+ * @c: UBIFS file-system description object + * @nnode: nnode for which to add dirt + */ +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *np = nnode->parent; + + if (np) + ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum, + c->nnode_sz); + else { + ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz); + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + } +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * calc_nnode_num - calculate nnode number. + * @row: the row in the tree (root is zero) + * @col: the column in the row (leftmost is zero) + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number for the nnode at @row + * and @col. + */ +static int calc_nnode_num(int row, int col) +{ + int num, bits; + + num = 1; + while (row--) { + bits = (col & (UBIFS_LPT_FANOUT - 1)); + col >>= UBIFS_LPT_FANOUT_SHIFT; + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= bits; + } + return num; +} + +/** + * calc_nnode_num_from_parent - calculate nnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_nnode_num_from_parent(const struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int num, shft; + + if (!parent) + return 1; + shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT; + num = parent->num ^ (1 << shft); + num |= (UBIFS_LPT_FANOUT + iip) << shft; + return num; +} + +/** + * calc_pnode_num_from_parent - calculate pnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The pnode number is a number that uniquely identifies a pnode and can be used + * easily to traverse the tree from the root to that pnode. + * + * This function calculates and returns the pnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_pnode_num_from_parent(const struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; + + for (i = 0; i < n; i++) { + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= pnum & (UBIFS_LPT_FANOUT - 1); + pnum >>= UBIFS_LPT_FANOUT_SHIFT; + } + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= iip; + return num; +} + +/** + * ubifs_create_dflt_lpt - create default LPT. + * @c: UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @lpt_first: LEB number of first LPT LEB + * @lpt_lebs: number of LEBs for LPT is passed and returned here + * @big_lpt: use big LPT model is passed and returned here + * + * This function returns %0 on success and a negative error code on failure. 
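+ *
+ * The default LPT is written out LEB by LEB: first all the pnodes, then the
+ * nnodes one level at a time up to the root nnode, then (for the big model)
+ * the lsave table, and finally the LPT's own lprops table (ltab).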
+ */ +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt) +{ + int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row; + int blnum, boffs, bsz, bcnt; + struct ubifs_pnode *pnode = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = NULL, *p; + struct ubifs_lpt_lprops *ltab = NULL; + int *lsave = NULL; + + err = calc_dflt_lpt_geom(c, main_lebs, big_lpt); + if (err) + return err; + *lpt_lebs = c->lpt_lebs; + + /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */ + c->lpt_first = lpt_first; + /* Needed by 'set_ltab()' */ + c->lpt_last = lpt_first + c->lpt_lebs - 1; + /* Needed by 'ubifs_pack_lsave()' */ + c->main_first = c->leb_cnt - *main_lebs; + + lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL); + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL); + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL); + buf = vmalloc(c->leb_size); + ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!pnode || !nnode || !buf || !ltab || !lsave) { + err = -ENOMEM; + goto out; + } + + ubifs_assert(!c->ltab); + c->ltab = ltab; /* Needed by set_ltab */ + + /* Initialize LPT's own lprops */ + for (i = 0; i < c->lpt_lebs; i++) { + ltab[i].free = c->leb_size; + ltab[i].dirty = 0; + ltab[i].tgc = 0; + ltab[i].cmt = 0; + } + + lnum = lpt_first; + p = buf; + /* Number of leaf nodes (pnodes) */ + cnt = c->pnode_cnt; + + /* + * The first pnode contains the LEB properties for the LEBs that contain + * the root inode node and the root index node of the index tree. + */ + node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8); + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[0].free = c->leb_size - iopos; + pnode->lprops[0].dirty = iopos - node_sz; + pnode->lprops[0].flags = LPROPS_INDEX; + + node_sz = UBIFS_INO_NODE_SZ; + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[1].free = c->leb_size - iopos; + pnode->lprops[1].dirty = iopos - node_sz; + + for (i = 2; i < UBIFS_LPT_FANOUT; i++) + pnode->lprops[i].free = c->leb_size; + + /* Add first pnode */ + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len = c->pnode_sz; + pnode->num += 1; + + /* Reset pnode values for remaining pnodes */ + pnode->lprops[0].free = c->leb_size; + pnode->lprops[0].dirty = 0; + pnode->lprops[0].flags = 0; + + pnode->lprops[1].free = c->leb_size; + pnode->lprops[1].dirty = 0; + + /* + * To calculate the internal node branches, we keep information about + * the level below. + */ + blnum = lnum; /* LEB number of level below */ + boffs = 0; /* Offset of level below */ + bcnt = cnt; /* Number of nodes in level below */ + bsz = c->pnode_sz; /* Size of nodes in level below */ + + /* Add all remaining pnodes */ + for (i = 1; i < cnt; i++) { + if (len + c->pnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len += c->pnode_sz; + /* + * pnodes are simply numbered left to right starting at zero, + * which means the pnode number can be used easily to traverse + * down the tree to the corresponding pnode. 
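+ * For example, pnode number N describes the UBIFS_LPT_FANOUT consecutive
+ * main area LEBs starting at c->main_first + N * UBIFS_LPT_FANOUT (see
+ * 'set_pnode_lnum()').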
+ */ + pnode->num += 1; + } + + row = 0; + for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT) + row += 1; + /* Add all nnodes, one level at a time */ + while (1) { + /* Number of internal nodes (nnodes) at next level */ + cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + if (len + c->nnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, + alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) { + c->lpt_lnum = lnum; + c->lpt_offs = len; + } + /* Set branches to the level below */ + for (j = 0; j < UBIFS_LPT_FANOUT; j++) { + if (bcnt) { + if (boffs + bsz > c->leb_size) { + blnum += 1; + boffs = 0; + } + nnode->nbranch[j].lnum = blnum; + nnode->nbranch[j].offs = boffs; + boffs += bsz; + bcnt--; + } else { + nnode->nbranch[j].lnum = 0; + nnode->nbranch[j].offs = 0; + } + } + nnode->num = calc_nnode_num(row, i); + ubifs_pack_nnode(c, p, nnode); + p += c->nnode_sz; + len += c->nnode_sz; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) + break; + /* Update the information about the level below */ + bcnt = cnt; + bsz = c->nnode_sz; + row -= 1; + } + + if (*big_lpt) { + /* Need to add LPT's save table */ + if (len + c->lsave_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + + c->lsave_lnum = lnum; + c->lsave_offs = len; + + for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++) + lsave[i] = c->main_first + i; + for (; i < c->lsave_cnt; i++) + lsave[i] = c->main_first; + + ubifs_pack_lsave(c, p, lsave); + p += c->lsave_sz; + len += c->lsave_sz; + } + + /* Need to add LPT's own LEB properties table */ + if (len + c->ltab_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + + c->ltab_lnum = lnum; + c->ltab_offs = len; + + /* Update ltab before packing it */ + len += c->ltab_sz; + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + + ubifs_pack_ltab(c, p, ltab); + p += c->ltab_sz; + + /* Write remaining buffer */ + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM); + if (err) + goto out; + + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(len, c->min_io_size); + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, 
c->lsave_offs); +out: + c->ltab = NULL; + kfree(lsave); + vfree(ltab); + vfree(buf); + kfree(nnode); + kfree(pnode); + return err; +} + +/** + * update_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * When a pnode is loaded into memory, the LEB properties it contains are added, + * by this function, to the LEB category lists and heaps. + */ +static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK; + int lnum = pnode->lprops[i].lnum; + + if (!lnum) + return; + ubifs_add_to_cat(c, &pnode->lprops[i], cat); + } +} + +/** + * replace_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @old_pnode: pnode copied + * @new_pnode: pnode copy + * + * During commit it is sometimes necessary to copy a pnode + * (see dirty_cow_pnode). When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode, + struct ubifs_pnode *new_pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (!new_pnode->lprops[i].lnum) + return; + ubifs_replace_cat(c, &old_pnode->lprops[i], + &new_pnode->lprops[i]); + } +} + +/** + * check_lpt_crc - check LPT node crc is correct. + * @c: UBIFS file-system description object + * @buf: buffer containing node + * @len: length of node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_crc(void *buf, int len) +{ + int pos = 0; + uint8_t *addr = buf; + uint16_t crc, calc_crc; + + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) { + ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc, + calc_crc); + dbg_dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * check_lpt_type - check LPT node type is correct. + * @c: UBIFS file-system description object + * @addr: address of type bit field is passed and returned updated here + * @pos: position of type bit field is passed and returned updated here + * @type: expected type + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_type(uint8_t **addr, int *pos, int type) +{ + int node_type; + + node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS); + if (node_type != type) { + ubifs_err("invalid type (%d) in LPT node type %d", node_type, + type); + dbg_dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * unpack_pnode - unpack a pnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed pnode to unpack + * @pnode: pnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. 
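+ *
+ * Note that free and dirty space are stored in the packed pnode in 8-byte
+ * units, which is why the unpacked values are shifted left by 3 below; a
+ * packed value of 16384 therefore unpacks to 131072 bytes.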
+ */ +static int unpack_pnode(const struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE); + if (err) + return err; + if (c->big_lpt) + pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->free <<= 3; + lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->dirty <<= 3; + + if (ubifs_unpack_bits(&addr, &pos, 1)) + lprops->flags = LPROPS_INDEX; + else + lprops->flags = 0; + lprops->flags |= ubifs_categorize_lprops(c, lprops); + } + err = check_lpt_crc(buf, c->pnode_sz); + return err; +} + +/** + * ubifs_unpack_nnode - unpack a nnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed nnode to unpack + * @nnode: nnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE); + if (err) + return err; + if (c->big_lpt) + nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum; + + lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) + + c->lpt_first; + if (lnum == c->lpt_last + 1) + lnum = 0; + nnode->nbranch[i].lnum = lnum; + nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos, + c->lpt_offs_bits); + } + err = check_lpt_crc(buf, c->nnode_sz); + return err; +} + +/** + * unpack_ltab - unpack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_ltab(const struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB); + if (err) + return err; + for (i = 0; i < c->lpt_lebs; i++) { + int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + + if (free < 0 || free > c->leb_size || dirty < 0 || + dirty > c->leb_size || free + dirty > c->leb_size) + return -EINVAL; + + c->ltab[i].free = free; + c->ltab[i].dirty = dirty; + c->ltab[i].tgc = 0; + c->ltab[i].cmt = 0; + } + err = check_lpt_crc(buf, c->ltab_sz); + return err; +} + +/** + * unpack_lsave - unpack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_lsave(const struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE); + if (err) + return err; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits); + + if (lnum < c->main_first || lnum >= c->leb_cnt) + return -EINVAL; + c->lsave[i] = lnum; + } + err = check_lpt_crc(buf, c->lsave_sz); + return err; +} + +/** + * validate_nnode - validate a nnode. 
+ * @c: UBIFS file-system description object + * @nnode: nnode to validate + * @parent: parent nnode (or NULL for the root nnode) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode, + struct ubifs_nnode *parent, int iip) +{ + int i, lvl, max_offs; + + if (c->big_lpt) { + int num = calc_nnode_num_from_parent(c, parent, iip); + + if (nnode->num != num) + return -EINVAL; + } + lvl = parent ? parent->level - 1 : c->lpt_hght; + if (lvl < 1) + return -EINVAL; + if (lvl == 1) + max_offs = c->leb_size - c->pnode_sz; + else + max_offs = c->leb_size - c->nnode_sz; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + int offs = nnode->nbranch[i].offs; + + if (lnum == 0) { + if (offs != 0) + return -EINVAL; + continue; + } + if (lnum < c->lpt_first || lnum > c->lpt_last) + return -EINVAL; + if (offs < 0 || offs > max_offs) + return -EINVAL; + } + return 0; +} + +/** + * validate_pnode - validate a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to validate + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + if (c->big_lpt) { + int num = calc_pnode_num_from_parent(c, parent, iip); + + if (pnode->num != num) + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int free = pnode->lprops[i].free; + int dirty = pnode->lprops[i].dirty; + + if (free < 0 || free > c->leb_size || free % c->min_io_size || + (free & 7)) + return -EINVAL; + if (dirty < 0 || dirty > c->leb_size || (dirty & 7)) + return -EINVAL; + if (dirty + free > c->leb_size) + return -EINVAL; + } + return 0; +} + +/** + * set_pnode_lnum - set LEB numbers on a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to update + * + * This function calculates the LEB numbers for the LEB properties it contains + * based on the pnode number. + */ +static void set_pnode_lnum(const struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + int i, lnum; + + lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (lnum >= c->leb_cnt) + return; + pnode->lprops[i].lnum = lnum++; + } +} + +/** + * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + if (parent) { + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + } else { + lnum = c->lpt_lnum; + offs = c->lpt_offs; + } + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + if (lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. 
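+ * The nnode was allocated with kzalloc(), so all of its branch LEB
+ * numbers are already zero, which is exactly how unwritten children
+ * are represented.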
+ */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); + if (err) + goto out; + err = ubifs_unpack_nnode(c, buf, nnode); + if (err) + goto out; + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + if (parent) { + branch->nnode = nnode; + nnode->level = parent->level - 1; + } else { + c->nroot = nnode; + nnode->level = c->lpt_hght; + } + nnode->parent = parent; + nnode->iip = iip; + return 0; + +out: + ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); + kfree(nnode); + return err; +} + +/** + * read_pnode - read a pnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (!pnode) + return -ENOMEM; + + if (lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz); + if (err) + goto out; + err = unpack_pnode(c, buf, pnode); + if (err) + goto out; + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + branch->pnode = pnode; + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + c->pnodes_have += 1; + return 0; + +out: + ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); + dbg_dump_pnode(c, pnode, parent, iip); + dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); + kfree(pnode); + return err; +} + +/** + * read_ltab - read LPT's own lprops table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_ltab(struct ubifs_info *c) +{ + int err; + void *buf; + + buf = vmalloc(c->ltab_sz); + if (!buf) + return -ENOMEM; + err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz); + if (err) + goto out; + err = unpack_ltab(c, buf); +out: + vfree(buf); + return err; +} + +/** + * read_lsave - read LPT's save table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int read_lsave(struct ubifs_info *c) +{ + int err, i; + void *buf; + + buf = vmalloc(c->lsave_sz); + if (!buf) + return -ENOMEM; + err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz); + if (err) + goto out; + err = unpack_lsave(c, buf); + if (err) + goto out; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = c->lsave[i]; + struct ubifs_lprops *lprops; + + /* + * Due to automatic resizing, the values in the lsave table + * could be beyond the volume size - just ignore them. + */ + if (lnum >= c->leb_cnt) + continue; + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + } +out: + vfree(buf); + return err; +} + +/** + * ubifs_get_nnode - get a nnode. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) + return nnode; + err = ubifs_read_nnode(c, parent, iip); + if (err) + return ERR_PTR(err); + return branch->nnode; +} + +/** + * ubifs_get_pnode - get a pnode. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) + return pnode; + err = read_pnode(c, parent, iip); + if (err) + return ERR_PTR(err); + update_cats(c, branch->pnode); + return branch->pnode; +} + +/** + * ubifs_lpt_lookup - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. + */ +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_CAST(pnode); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + return &pnode->lprops[iip]; +} + +/** + * dirty_cow_nnode - ensure a nnode is not being committed. + * @c: UBIFS file-system description object + * @nnode: nnode to check + * + * Returns dirtied nnode on success or negative error code on failure. 
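/*
 * Illustrative sketch, not part of the patch: the descent in
 * ubifs_lpt_lookup() treats (lnum - main_first) as a base-fanout number and
 * peels one digit per tree level, most significant digit first.  The index
 * arithmetic alone, without the I/O or caching; iips must have room for
 * lpt_hght entries:
 */
static int lpt_path_indices(int lnum, int main_first, int lpt_hght,
			    int fanout_shift, int *iips)
{
	int fanout = 1 << fanout_shift;
	int i = lnum - main_first;
	int shft = lpt_hght * fanout_shift;
	int h, n = 0;

	/* one index per nnode level, most significant digit first */
	for (h = 1; h < lpt_hght; h++) {
		iips[n++] = (i >> shft) & (fanout - 1);
		shft -= fanout_shift;
	}
	/* index of the pnode branch in the lowest nnode */
	iips[n++] = (i >> shft) & (fanout - 1);
	/* returned value: the lprops slot inside that pnode */
	return i & (fanout - 1);
}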
+ */ +static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *n; + int i; + + if (!test_bit(COW_CNODE, &nnode->flags)) { + /* nnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + } + return nnode; + } + + /* nnode is being committed, so copy it */ + n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (unlikely(!n)) + return ERR_PTR(-ENOMEM); + + memcpy(n, nnode, sizeof(struct ubifs_nnode)); + n->cnext = NULL; + __set_bit(DIRTY_CNODE, &n->flags); + __clear_bit(COW_CNODE, &n->flags); + + /* The children now have new parent */ + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_nbranch *branch = &n->nbranch[i]; + + if (branch->cnode) + branch->cnode->parent = n; + } + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags)); + __set_bit(OBSOLETE_CNODE, &nnode->flags); + + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + if (nnode->parent) + nnode->parent->nbranch[n->iip].nnode = n; + else + c->nroot = n; + return n; +} + +/** + * dirty_cow_pnode - ensure a pnode is not being committed. + * @c: UBIFS file-system description object + * @pnode: pnode to check + * + * Returns dirtied pnode on success or negative error code on failure. + */ +static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_pnode *p; + + if (!test_bit(COW_CNODE, &pnode->flags)) { + /* pnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + } + return pnode; + } + + /* pnode is being committed, so copy it */ + p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (unlikely(!p)) + return ERR_PTR(-ENOMEM); + + memcpy(p, pnode, sizeof(struct ubifs_pnode)); + p->cnext = NULL; + __set_bit(DIRTY_CNODE, &p->flags); + __clear_bit(COW_CNODE, &p->flags); + replace_cats(c, pnode, p); + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags)); + __set_bit(OBSOLETE_CNODE, &pnode->flags); + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + pnode->parent->nbranch[p->iip].pnode = p; + return p; +} + +/** + * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. 
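/*
 * Illustrative sketch, not part of the patch: the copy-on-write rule that
 * dirty_cow_nnode() and dirty_cow_pnode() implement, reduced to a toy node
 * type.  "Being committed" corresponds to the COW_CNODE bit; such nodes must
 * not be modified, so a private copy is dirtied instead and the original is
 * left for the commit and marked obsolete.  Names below are hypothetical.
 */
#include <stdlib.h>
#include <string.h>

struct toy_node {
	int cow;       /* node is frozen for the running commit */
	int dirty;     /* node differs from what is on the media */
	int obsolete;  /* a newer copy exists; free after the commit */
};

static struct toy_node *toy_dirty_cow(struct toy_node *node)
{
	struct toy_node *copy;

	if (!node->cow) {
		node->dirty = 1;        /* safe to dirty in place */
		return node;
	}
	copy = malloc(sizeof(*copy));   /* the real code uses GFP_NOFS kmalloc */
	if (!copy)
		return NULL;
	memcpy(copy, node, sizeof(*copy));
	copy->cow = 0;
	copy->dirty = 1;
	node->obsolete = 1;             /* reclaimed by free_obsolete_cnodes() */
	return copy;                    /* caller re-points the parent here */
}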
+ */ +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_CAST(pnode); + pnode = dirty_cow_pnode(c, pnode); + if (IS_ERR(pnode)) + return ERR_CAST(pnode); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags)); + return &pnode->lprops[iip]; +} + +/** + * lpt_init_rd - initialize the LPT for reading. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_init_rd(struct ubifs_info *c) +{ + int err, i; + + c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab) + return -ENOMEM; + + i = max_t(int, c->nnode_sz, c->pnode_sz); + c->lpt_nod_buf = kmalloc(i, GFP_KERNEL); + if (!c->lpt_nod_buf) + return -ENOMEM; + + for (i = 0; i < LPROPS_HEAP_CNT; i++) { + c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, + GFP_KERNEL); + if (!c->lpt_heap[i].arr) + return -ENOMEM; + c->lpt_heap[i].cnt = 0; + c->lpt_heap[i].max_cnt = LPT_HEAP_SZ; + } + + c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL); + if (!c->dirty_idx.arr) + return -ENOMEM; + c->dirty_idx.cnt = 0; + c->dirty_idx.max_cnt = LPT_HEAP_SZ; + + err = read_ltab(c); + if (err) + return err; + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + + return 0; +} + +/** + * lpt_init_wr - initialize the LPT for writing. + * @c: UBIFS file-system description object + * + * 'lpt_init_rd()' must have been called already. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int lpt_init_wr(struct ubifs_info *c) +{ + int err, i; + + c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab_cmt) + return -ENOMEM; + + c->lpt_buf = vmalloc(c->leb_size); + if (!c->lpt_buf) + return -ENOMEM; + + if (c->big_lpt) { + c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS); + if (!c->lsave) + return -ENOMEM; + err = read_lsave(c); + if (err) + return err; + } + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].free == c->leb_size) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + } + + return 0; +} + +/** + * ubifs_lpt_init - initialize the LPT. + * @c: UBIFS file-system description object + * @rd: whether to initialize lpt for reading + * @wr: whether to initialize lpt for writing + * + * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true + * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is + * true. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr) +{ + int err; + + if (rd) { + err = lpt_init_rd(c); + if (err) + return err; + } + + if (wr) { + err = lpt_init_wr(c); + if (err) + return err; + } + + return 0; +} + +/** + * struct lpt_scan_node - somewhere to put nodes while we scan LPT. + * @nnode: where to keep a nnode + * @pnode: where to keep a pnode + * @cnode: where to keep a cnode + * @in_tree: is the node in the tree in memory + * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in + * the tree + * @ptr.pnode: ditto for pnode + * @ptr.cnode: ditto for cnode + */ +struct lpt_scan_node { + union { + struct ubifs_nnode nnode; + struct ubifs_pnode pnode; + struct ubifs_cnode cnode; + }; + int in_tree; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + } ptr; +}; + +/** + * scan_get_nnode - for the scan, get a nnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the nnode + * @parent: parent of the nnode + * @iip: index in parent of the nnode + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) { + path->in_tree = 1; + path->ptr.nnode = nnode; + return nnode; + } + nnode = &path->nnode; + path->in_tree = 0; + path->ptr.nnode = nnode; + memset(nnode, 0, sizeof(struct ubifs_nnode)); + if (branch->lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. 
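/*
 * Illustrative sketch, not part of the patch: how the rd/wr arguments of
 * ubifs_lpt_init() map onto the three mount transitions described in its
 * kernel-doc above.  Assumes the declarations from this file and ubifs.h;
 * the surrounding error handling is omitted.
 */
static int example_mount_rw(struct ubifs_info *c)
{
	return ubifs_lpt_init(c, 1, 1);   /* fresh read-write mount */
}

static int example_mount_ro(struct ubifs_info *c)
{
	return ubifs_lpt_init(c, 1, 0);   /* read-only mount: no write state */
}

static int example_remount_rw(struct ubifs_info *c)
{
	return ubifs_lpt_init(c, 0, 1);   /* ro -> rw: read state already set up */
}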
+ */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, + c->nnode_sz); + if (err) + return ERR_PTR(err); + err = ubifs_unpack_nnode(c, buf, nnode); + if (err) + return ERR_PTR(err); + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + nnode->level = parent->level - 1; + nnode->parent = parent; + nnode->iip = iip; + return nnode; +} + +/** + * scan_get_pnode - for the scan, get a pnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the pnode + * @parent: parent of the pnode + * @iip: index in parent of the pnode + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) { + path->in_tree = 1; + path->ptr.pnode = pnode; + return pnode; + } + pnode = &path->pnode; + path->in_tree = 0; + path->ptr.pnode = pnode; + memset(pnode, 0, sizeof(struct ubifs_pnode)); + if (branch->lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + ubifs_assert(branch->lnum >= c->lpt_first && + branch->lnum <= c->lpt_last); + ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size); + err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, + c->pnode_sz); + if (err) + return ERR_PTR(err); + err = unpack_pnode(c, buf, pnode); + if (err) + return ERR_PTR(err); + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + return pnode; +} + +/** + * ubifs_lpt_scan_nolock - scan the LPT. + * @c: the UBIFS file-system description object + * @start_lnum: LEB number from which to start scanning + * @end_lnum: LEB number at which to stop scanning + * @scan_cb: callback function called for each lprops + * @data: data to be passed to the callback function + * + * This function returns %0 on success and a negative error code on failure. 
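/*
 * Illustrative sketch, not part of the patch: a hypothetical callback for
 * ubifs_lpt_scan_nolock().  The signature is inferred from the call site in
 * the scan loop (c, lprops, in_tree, data); the return value is either a
 * negative error or a combination of the LPT_SCAN_ADD / LPT_SCAN_STOP flags
 * that the loop tests, with no flags meaning "keep scanning".
 */
struct scan_want {
	int min_free;      /* how much free space the caller wants */
	int found_lnum;    /* filled in when a suitable LEB is found */
};

static int want_free_space_cb(struct ubifs_info *c,
			      struct ubifs_lprops *lprops,
			      int in_tree, void *data)
{
	struct scan_want *want = data;

	if (lprops->free < want->min_free)
		return 0;                        /* no flags: keep scanning */
	want->found_lnum = lprops->lnum;
	/* keep this pnode in memory and stop the scan */
	return LPT_SCAN_ADD | LPT_SCAN_STOP;
}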
+ */ +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data) +{ + int err = 0, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct lpt_scan_node *path; + + if (start_lnum == -1) { + start_lnum = end_lnum + 1; + if (start_lnum >= c->leb_cnt) + start_lnum = c->main_first; + } + + ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt); + ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt); + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return err; + } + + path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1), + GFP_NOFS); + if (!path) + return -ENOMEM; + + path[0].ptr.nnode = c->nroot; + path[0].in_tree = 1; +again: + /* Descend to the pnode containing start_lnum */ + nnode = c->nroot; + i = start_lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = (i & (UBIFS_LPT_FANOUT - 1)); + + /* Loop for each lprops */ + while (1) { + struct ubifs_lprops *lprops = &pnode->lprops[iip]; + int ret, lnum = lprops->lnum; + + ret = scan_cb(c, lprops, path[h].in_tree, data); + if (ret < 0) { + err = ret; + goto out; + } + if (ret & LPT_SCAN_ADD) { + /* Add all the nodes in path to the tree in memory */ + for (h = 1; h < c->lpt_hght; h++) { + const size_t sz = sizeof(struct ubifs_nnode); + struct ubifs_nnode *parent; + + if (path[h].in_tree) + continue; + nnode = kmalloc(sz, GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + memcpy(nnode, &path[h].nnode, sz); + parent = nnode->parent; + parent->nbranch[nnode->iip].nnode = nnode; + path[h].ptr.nnode = nnode; + path[h].in_tree = 1; + path[h + 1].cnode.parent = nnode; + } + if (path[h].in_tree) + ubifs_ensure_cat(c, lprops); + else { + const size_t sz = sizeof(struct ubifs_pnode); + struct ubifs_nnode *parent; + + pnode = kmalloc(sz, GFP_NOFS); + if (!pnode) { + err = -ENOMEM; + goto out; + } + memcpy(pnode, &path[h].pnode, sz); + parent = pnode->parent; + parent->nbranch[pnode->iip].pnode = pnode; + path[h].ptr.pnode = pnode; + path[h].in_tree = 1; + update_cats(c, pnode); + c->pnodes_have += 1; + } + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *) + c->nroot, 0, 0); + if (err) + goto out; + err = dbg_check_cats(c); + if (err) + goto out; + } + if (ret & LPT_SCAN_STOP) { + err = 0; + break; + } + /* Get the next lprops */ + if (lnum == end_lnum) { + /* + * We got to the end without finding what we were + * looking for + */ + err = -ENOSPC; + goto out; + } + if (lnum + 1 >= c->leb_cnt) { + /* Wrap-around to the beginning */ + start_lnum = c->main_first; + goto again; + } + if (iip + 1 < UBIFS_LPT_FANOUT) { + /* Next lprops is in the same pnode */ + iip += 1; + continue; + } + /* We need to get the next pnode. 
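/*
 * Illustrative sketch, not part of the patch: the circular iteration over
 * main-area LEBs performed by ubifs_lpt_scan_nolock().  Scanning starts at
 * start_lnum (or just after end_lnum when the caller passes -1), walks
 * upward, wraps back to the first main LEB at the end of the volume, and
 * fails if it comes full circle to end_lnum without a hit.
 */
static int toy_next_scan_lnum(int lnum, int end_lnum, int main_first,
			      int leb_cnt)
{
	if (lnum == end_lnum)
		return -1;                /* full circle: real code returns -ENOSPC */
	if (lnum + 1 >= leb_cnt)
		return main_first;        /* wrap around to the start of the main area */
	return lnum + 1;
}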
Go up until we can go right */ + iip = pnode->iip; + while (1) { + h -= 1; + ubifs_assert(h >= 0); + nnode = path[h].ptr.nnode; + if (iip + 1 < UBIFS_LPT_FANOUT) + break; + iip = nnode->iip; + } + /* Go right */ + iip += 1; + /* Descend to the pnode */ + h += 1; + for (; h < c->lpt_hght; h++) { + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + iip = 0; + } + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = 0; + } +out: + kfree(path); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_chk_pnode - check a pnode. + * @c: the UBIFS file-system description object + * @pnode: pnode to check + * @col: pnode column + * + * This function returns %0 on success and a negative error code on failure. + */ +static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + int col) +{ + int i; + + if (pnode->num != col) { + dbg_err("pnode num %d expected %d parent num %d iip %d", + pnode->num, col, pnode->parent->num, pnode->iip); + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp, *lprops = &pnode->lprops[i]; + int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i + + c->main_first; + int found, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + struct list_head *list = NULL; + + if (lnum >= c->leb_cnt) + continue; + if (lprops->lnum != lnum) { + dbg_err("bad LEB number %d expected %d", + lprops->lnum, lnum); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + if (cat != LPROPS_UNCAT) { + dbg_err("LEB %d taken but not uncat %d", + lprops->lnum, cat); + return -EINVAL; + } + continue; + } + if (lprops->flags & LPROPS_INDEX) { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY_IDX: + case LPROPS_FRDI_IDX: + break; + default: + dbg_err("LEB %d index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } else { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY: + case LPROPS_FREE: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + break; + default: + dbg_err("LEB %d not index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } + switch (cat) { + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + } + found = 0; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (lprops->hpos < heap->cnt && + heap->arr[lprops->hpos] == lprops) + found = 1; + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_for_each_entry(lp, list, list) + if (lprops == lp) { + found = 1; + break; + } + break; + } + if (!found) { + dbg_err("LEB %d cat %d not found in cat heap/list", + lprops->lnum, cat); + return -EINVAL; + } + switch (cat) { + case LPROPS_EMPTY: + if (lprops->free != c->leb_size) { + dbg_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + if (lprops->free + lprops->dirty != c->leb_size) { + dbg_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + } + } + return 0; +} + +/** + * dbg_check_lpt_nodes - check nnodes and pnodes. 
+ * @c: the UBIFS file-system description object + * @cnode: next cnode (nnode or pnode) to check + * @row: row of cnode (root is zero) + * @col: column of cnode (leftmost is zero) + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col) +{ + struct ubifs_nnode *nnode, *nn; + struct ubifs_cnode *cn; + int num, iip = 0, err; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + while (cnode) { + ubifs_assert(row >= 0); + nnode = cnode->parent; + if (cnode->level) { + /* cnode is a nnode */ + num = calc_nnode_num(row, col); + if (cnode->num != num) { + dbg_err("nnode num %d expected %d " + "parent num %d iip %d", cnode->num, num, + (nnode ? nnode->num : 0), cnode->iip); + return -EINVAL; + } + nn = (struct ubifs_nnode *)cnode; + while (iip < UBIFS_LPT_FANOUT) { + cn = nn->nbranch[iip].cnode; + if (cn) { + /* Go down */ + row += 1; + col <<= UBIFS_LPT_FANOUT_SHIFT; + col += iip; + iip = 0; + cnode = cn; + break; + } + /* Go right */ + iip += 1; + } + if (iip < UBIFS_LPT_FANOUT) + continue; + } else { + struct ubifs_pnode *pnode; + + /* cnode is a pnode */ + pnode = (struct ubifs_pnode *)cnode; + err = dbg_chk_pnode(c, pnode, col); + if (err) + return err; + } + /* Go up and to the right */ + row -= 1; + col >>= UBIFS_LPT_FANOUT_SHIFT; + iip = cnode->iip + 1; + cnode = (struct ubifs_cnode *)nnode; + } + return 0; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c new file mode 100644 index 00000000..dfcb5748 --- /dev/null +++ b/fs/ubifs/lpt_commit.c @@ -0,0 +1,2050 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements commit-related functionality of the LEB properties + * subsystem. + */ + +#include +#include +#include "ubifs.h" + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_populate_lsave(struct ubifs_info *c); +#else +#define dbg_populate_lsave(c) 0 +#endif + +/** + * first_dirty_cnode - find first dirty cnode. + * @c: UBIFS file-system description object + * @nnode: nnode at which to start + * + * This function returns the first dirty cnode or %NULL if there is not one. + */ +static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode) +{ + ubifs_assert(nnode); + while (1) { + int i, cont = 0; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_cnode *cnode; + + cnode = nnode->nbranch[i].cnode; + if (cnode && + test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; + nnode = (struct ubifs_nnode *)cnode; + cont = 1; + break; + } + } + if (!cont) + return (struct ubifs_cnode *)nnode; + } +} + +/** + * next_dirty_cnode - find next dirty cnode. 
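/*
 * Illustrative sketch, not part of the patch: the descent rule of
 * first_dirty_cnode() on a toy tree.  Starting from the root nnode it keeps
 * stepping into the first dirty child; the walk bottoms out either at a
 * dirty pnode (level 0) or at an nnode none of whose children are dirty,
 * which then becomes the first node to commit.  A fanout of 4 is assumed
 * here purely for illustration.
 */
struct toy_tnode {
	int level;                      /* 0 = pnode, >0 = nnode */
	int dirty;
	struct toy_tnode *child[4];     /* fanout assumed to be 4 in this sketch */
};

static struct toy_tnode *toy_first_dirty(struct toy_tnode *node)
{
	int i;

	while (node->level > 0) {
		struct toy_tnode *next = NULL;

		for (i = 0; i < 4; i++) {
			struct toy_tnode *ch = node->child[i];

			if (ch && ch->dirty) {
				next = ch;
				break;
			}
		}
		if (!next)
			return node;    /* no dirty child: commit this nnode first */
		node = next;
	}
	return node;                    /* reached a dirty pnode */
}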
+ * @cnode: cnode from which to begin searching + * + * This function returns the next dirty cnode or %NULL if there is not one. + */ +static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode) +{ + struct ubifs_nnode *nnode; + int i; + + ubifs_assert(cnode); + nnode = cnode->parent; + if (!nnode) + return NULL; + for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) { + cnode = nnode->nbranch[i].cnode; + if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; /* cnode is a pnode */ + /* cnode is a nnode */ + return first_dirty_cnode((struct ubifs_nnode *)cnode); + } + } + return (struct ubifs_cnode *)nnode; +} + +/** + * get_cnodes_to_commit - create list of dirty cnodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of cnodes to commit. + */ +static int get_cnodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + int cnt = 0; + + if (!c->nroot) + return 0; + + if (!test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + c->lpt_cnext = first_dirty_cnode(c->nroot); + cnode = c->lpt_cnext; + if (!cnode) + return 0; + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags)); + __set_bit(COW_ZNODE, &cnode->flags); + cnext = next_dirty_cnode(cnode); + if (!cnext) { + cnode->cnext = c->lpt_cnext; + break; + } + cnode->cnext = cnext; + cnode = cnext; + cnt += 1; + } + dbg_cmt("committing %d cnodes", cnt); + dbg_lp("committing %d cnodes", cnt); + ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt); + return cnt; +} + +/** + * upd_ltab - update LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space to add + */ +static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d +%d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * alloc_lpt_leb - allocate an LPT LEB that is empty. + * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function finds the next empty LEB in the ltab starting from @lnum. If a + * an empty LEB is found it is returned in @lnum and the function returns %0. + * Otherwise the function returns -ENOSPC. Note however, that LPT is designed + * never to run out of space. + */ +static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + + for (i = 0; i < n; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + return -ENOSPC; +} + +/** + * layout_cnodes - layout cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
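/*
 * Illustrative sketch, not part of the patch: the wrap-around search used by
 * alloc_lpt_leb() (and mirrored later by realloc_lpt_leb()), reduced to an
 * array of "usable" flags.  The search starts just after the previously
 * returned slot so successive calls hand out distinct LEBs.
 */
static int wraparound_find(const int *usable, int nslots, int prev)
{
	int i, start = prev + 1;

	for (i = start; i < nslots; i++)
		if (usable[i])
			return i;
	for (i = 0; i < start; i++)
		if (usable[i])
			return i;
	return -1;      /* the real code returns -ENOSPC here */
}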
+ */ +static int layout_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, alen, done_lsave, done_ltab, err; + struct ubifs_cnode *cnode; + + err = dbg_chk_lpt_sz(c, 0, 0); + if (err) + return err; + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + do { + if (cnode->level) { + len = c->nnode_sz; + c->dirty_nn_cnt -= 1; + } else { + len = c->pnode_sz; + c->dirty_pn_cnt -= 1; + } + while (offs + len > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + continue; + } + if (!done_ltab) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + continue; + } + break; + } + if (cnode->parent) { + cnode->parent->nbranch[cnode->iip].lnum = lnum; + cnode->parent->nbranch[cnode->iip].offs = offs; + } else { + c->lpt_lnum = lnum; + c->lpt_offs = offs; + } + offs += len; + dbg_chk_lpt_sz(c, 1, len); + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 4, alen - offs); + err = dbg_chk_lpt_sz(c, 3, alen); + if (err) + return err; + return 0; + +no_space: + ubifs_err("LPT out of space"); + dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " + "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + return err; +} + +/** + * realloc_lpt_leb - allocate an LPT LEB that is empty. 
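/*
 * Illustrative sketch, not part of the patch: the space accounting done when
 * layout_cnodes() abandons the tail of a LEB.  The write head can only stop
 * on a min_io_size boundary, so the bytes between the last node and that
 * boundary become padding (dirty space) and only the rest of the LEB stays
 * free.  Assumes min_io_size is a power of two, as the kernel ALIGN() does.
 */
#define SKETCH_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

static void account_abandoned_tail(int offs, int min_io_size, int leb_size,
				   int *free, int *dirty)
{
	int alen = SKETCH_ALIGN(offs, min_io_size);

	*dirty = alen - offs;        /* padding up to the I/O boundary */
	*free = leb_size - alen;     /* what upd_ltab() records as free */
}
/* Example: offs = 1000, min_io_size = 512 -> alen = 1024, dirty = 24. */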
+ * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function duplicates exactly the results of the function alloc_lpt_leb. + * It is used during end commit to reallocate the same LEB numbers that were + * allocated by alloc_lpt_leb during start commit. + * + * This function finds the next LEB that was allocated by the alloc_lpt_leb + * function starting from @lnum. If a LEB is found it is returned in @lnum and + * the function returns %0. Otherwise the function returns -ENOSPC. + * Note however, that LPT is designed never to run out of space. + */ +static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + + for (i = 0; i < n; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + return -ENOSPC; +} + +/** + * write_cnodes - write cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave; + struct ubifs_cnode *cnode; + void *buf = c->lpt_buf; + + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + from = offs; + /* Ensure empty LEB is unmapped */ + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + /* Loop for each cnode */ + do { + if (cnode->level) + len = c->nnode_sz; + else + len = c->pnode_sz; + while (offs + len > c->leb_size) { + wlen = offs - from; + if (wlen) { + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, + alen, UBI_SHORTTERM); + if (err) + return err; + } + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = realloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + continue; + } + if (!done_ltab) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + continue; + } + break; + } + if (cnode->level) + ubifs_pack_nnode(c, buf + offs, + (struct ubifs_nnode *)cnode); + else + ubifs_pack_pnode(c, buf + offs, + (struct ubifs_pnode *)cnode); + /* + * The reason for the barriers is the same as in case of TNC. + * See comment in 'write_index()'. 'dirty_cow_nnode()' and + * 'dirty_cow_pnode()' are the functions for which this is + * important. 
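/*
 * Illustrative sketch, not part of the patch: the flush-and-pad step that
 * write_cnodes() performs whenever the LEB-sized buffer can take no more
 * nodes.  Only the bytes accumulated since the last flush are written, and
 * the tail is padded with 0xff (the erased-flash value) up to a min_io_size
 * multiple.  The I/O callback is a stand-in for ubifs_leb_write().
 */
#include <string.h>

static int flush_lpt_buf(char *buf, int from, int offs, int min_io_size,
			 int (*leb_write)(char *data, int offs, int len))
{
	int wlen = offs - from;                                 /* new bytes to write */
	int alen = (wlen + min_io_size - 1) / min_io_size * min_io_size;

	if (!wlen)
		return 0;                                       /* nothing buffered */
	memset(buf + offs, 0xff, alen - wlen);                  /* pad with erased value */
	return leb_write(buf + from, from, alen);
}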
+ */ + clear_bit(DIRTY_CNODE, &cnode->flags); + smp_mb__before_clear_bit(); + clear_bit(COW_ZNODE, &cnode->flags); + smp_mb__after_clear_bit(); + offs += len; + dbg_chk_lpt_sz(c, 1, len); + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, + UBI_SHORTTERM); + if (err) + return err; + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = realloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, + UBI_SHORTTERM); + if (err) + return err; + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); + err = realloc_lpt_leb(c, &lnum); + if (err) + goto no_space; + offs = from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); + } + + /* Write remaining data in buffer */ + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); + if (err) + return err; + + dbg_chk_lpt_sz(c, 4, alen - wlen); + err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size)); + if (err) + return err; + + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(offs, c->min_io_size); + + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + + return 0; + +no_space: + ubifs_err("LPT out of space mismatch"); + dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " + "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + return err; +} + +/** + * next_pnode_to_dirty - find next pnode to dirty. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * This function returns the next pnode to dirty or %NULL if there are no more + * pnodes. Note that pnodes that have never been written (lnum == 0) are + * skipped. 
+ */ +static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_nnode *nnode; + int iip; + + /* Try to go right */ + nnode = pnode->parent; + for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + return ubifs_get_pnode(c, nnode, iip); + } + + /* Go up while can't go right */ + do { + iip = nnode->iip + 1; + nnode = nnode->parent; + if (!nnode) + return NULL; + for (; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + } while (iip >= UBIFS_LPT_FANOUT); + + /* Go right */ + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return (void *)nnode; + + /* Go down to level 1 */ + while (nnode->level > 1) { + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + if (iip >= UBIFS_LPT_FANOUT) { + /* + * Should not happen, but we need to keep going + * if it does. + */ + iip = 0; + } + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return (void *)nnode; + } + + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) + if (nnode->nbranch[iip].lnum) + break; + if (iip >= UBIFS_LPT_FANOUT) + /* Should not happen, but we need to keep going if it does */ + iip = 0; + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * pnode_lookup - lookup a pnode in the LPT. + * @c: UBIFS file-system description object + * @i: pnode number (0 to main_lebs - 1) + * + * This function returns a pointer to the pnode on success or a negative + * error code on failure. + */ +static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) +{ + int err, h, iip, shft; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + i <<= UBIFS_LPT_FANOUT_SHIFT; + nnode = c->nroot; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_CAST(nnode); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * do_make_pnode_dirty - mark a pnode dirty. + * @c: UBIFS file-system description object + * @pnode: pnode to mark dirty + */ +static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + /* Assumes cnext list is empty i.e. not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + struct ubifs_nnode *nnode; + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + /* Mark parent and ancestors dirty too */ + nnode = pnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } +} + +/** + * make_tree_dirty - mark the entire LEB properties tree dirty. + * @c: UBIFS file-system description object + * + * This function is used by the "small" LPT model to cause the entire LEB + * properties tree to be written. The "small" LPT model does not use LPT + * garbage collection because it is more efficient to write the entire tree + * (because it is small). 
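/*
 * Illustrative sketch, not part of the patch: the "dirty the whole ancestor
 * chain" rule used by do_make_pnode_dirty() (and by make_nnode_dirty()
 * further down).  The walk stops as soon as it meets an ancestor that is
 * already dirty, because that ancestor's own ancestors must already be
 * dirty too.  Toy types only; the real code also adds dirty-space accounting
 * at each step.
 */
struct toy_cnode {
	struct toy_cnode *parent;
	int dirty;
};

static void toy_mark_dirty_upwards(struct toy_cnode *node)
{
	while (node && !node->dirty) {
		node->dirty = 1;
		node = node->parent;
	}
}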
+ * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_tree_dirty(struct ubifs_info *c) +{ + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, 0); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + + while (pnode) { + do_make_pnode_dirty(c, pnode); + pnode = next_pnode_to_dirty(c, pnode); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + } + return 0; +} + +/** + * need_write_all - determine if the LPT area is running out of free space. + * @c: UBIFS file-system description object + * + * This function returns %1 if the LPT area is running out of free space and %0 + * if it is not. + */ +static int need_write_all(struct ubifs_info *c) +{ + long long free = 0; + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + free += c->leb_size; + } + /* Less than twice the size left */ + if (free <= c->lpt_sz * 2) + return 1; + return 0; +} + +/** + * lpt_tgc_start - start trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called during start commit to mark LPT LEBs for trivial GC. + */ +static void lpt_tgc_start(struct ubifs_info *c) +{ + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + continue; + if (c->ltab[i].dirty > 0 && + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) { + c->ltab[i].tgc = 1; + c->ltab[i].free = c->leb_size; + c->ltab[i].dirty = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + } +} + +/** + * lpt_tgc_end - end trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called after the commit is completed (master node has been + * written) and un-maps LPT LEBs that were marked for trivial GC. + */ +static int lpt_tgc_end(struct ubifs_info *c) +{ + int i, err; + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].tgc) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + c->ltab[i].tgc = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + return 0; +} + +/** + * populate_lsave - fill the lsave array with important LEB numbers. + * @c: the UBIFS file-system description object + * + * This function is only called for the "big" model. It records a small number + * of LEB numbers of important LEBs. Important LEBs are ones that are (from + * most important to least important): empty, freeable, freeable index, dirty + * index, dirty or free. Upon mount, we read this list of LEB numbers and bring + * their pnodes into memory. That will stop us from having to scan the LPT + * straight away. For the "small" model we assume that scanning the LPT is no + * big deal. 
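/*
 * Illustrative sketch, not part of the patch: the free-space estimate behind
 * need_write_all().  A LEB counts towards the total if it is the current
 * write head, completely empty, or contains nothing but free and dirty
 * space; if less than two full copies of the LPT would fit into that total,
 * the whole tree is rewritten so old LEBs can be reclaimed.
 */
static int toy_need_write_all(const int *free, const int *dirty, int nlebs,
			      int head, int head_offs, int leb_size,
			      long long lpt_sz)
{
	long long total = 0;
	int i;

	for (i = 0; i < nlebs; i++) {
		if (i == head)
			total += leb_size - head_offs;
		else if (free[i] == leb_size ||
			 free[i] + dirty[i] == leb_size)
			total += leb_size;
	}
	return total <= lpt_sz * 2;     /* less than twice the LPT size left */
}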
+ */ +static void populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i, cnt = 0; + + ubifs_assert(c->big_lpt); + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + + if (dbg_populate_lsave(c)) + return; + + list_for_each_entry(lprops, &c->empty_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->freeable_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + /* Fill it up completely */ + while (cnt < c->lsave_cnt) + c->lsave[cnt++] = c->main_first; +} + +/** + * nnode_lookup - lookup a nnode in the LPT. + * @c: UBIFS file-system description object + * @i: nnode number + * + * This function returns a pointer to the nnode on success or a negative + * error code on failure. + */ +static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i) +{ + int err, iip; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + while (1) { + iip = i & (UBIFS_LPT_FANOUT - 1); + i >>= UBIFS_LPT_FANOUT_SHIFT; + if (!i) + break; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return nnode; + } + return nnode; +} + +/** + * make_nnode_dirty - find a nnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: nnode number of nnode to make dirty + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_nnode *nnode; + + nnode = nnode_lookup(c, node_num); + if (IS_ERR(nnode)) + return PTR_ERR(nnode); + if (nnode->parent) { + struct ubifs_nbranch *branch; + + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; /* nnode is obsolete */ + } else if (c->lpt_lnum != lnum || c->lpt_offs != offs) + return 0; /* nnode is obsolete */ + /* Assumes cnext list is empty i.e. 
not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + /* Mark parent and ancestors dirty too */ + nnode = nnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } + return 0; +} + +/** + * make_pnode_dirty - find a pnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: pnode number of pnode to make dirty + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + pnode = pnode_lookup(c, node_num); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; + do_make_pnode_dirty(c, pnode); + return 0; +} + +/** + * make_ltab_dirty - make ltab node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where ltab was written + * @offs: offset where ltab was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 0; /* This ltab node is obsolete */ + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + return 0; +} + +/** + * make_lsave_dirty - make lsave node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where lsave was written + * @offs: offset where lsave was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 0; /* This lsave node is obsolete */ + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + return 0; +} + +/** + * make_node_dirty - make node dirty. 
+ * @c: UBIFS file-system description object + * @node_type: LPT node type + * @node_num: node number + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num, + int lnum, int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return make_nnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_PNODE: + return make_pnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_LTAB: + return make_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return make_lsave_dirty(c, lnum, offs); + } + return -EINVAL; +} + +/** + * get_lpt_node_len - return the length of a node based on its type. + * @c: UBIFS file-system description object + * @node_type: LPT node type + */ +static int get_lpt_node_len(const struct ubifs_info *c, int node_type) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return c->nnode_sz; + case UBIFS_LPT_PNODE: + return c->pnode_sz; + case UBIFS_LPT_LTAB: + return c->ltab_sz; + case UBIFS_LPT_LSAVE: + return c->lsave_sz; + } + return 0; +} + +/** + * get_pad_len - return the length of padding in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + */ +static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len) +{ + int offs, pad_len; + + if (c->min_io_size == 1) + return 0; + offs = c->leb_size - len; + pad_len = ALIGN(offs, c->min_io_size) - offs; + return pad_len; +} + +/** + * get_lpt_node_type - return type (and node number) of a node in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @node_num: node number is returned here + */ +static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf, + int *node_num) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type; + + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + return node_type; +} + +/** + * is_a_node - determine if a buffer contains a node. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + * + * This function returns %1 if the buffer contains a node or %0 if it does not. + */ +static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type, node_len; + uint16_t crc, calc_crc; + + if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8) + return 0; + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + if (node_type == UBIFS_LPT_NOT_A_NODE) + return 0; + node_len = get_lpt_node_len(c, node_type); + if (!node_len || node_len > len) + return 0; + pos = 0; + addr = buf; + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + node_len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) + return 0; + return 1; +} + +/** + * lpt_gc_lnum - garbage collect a LPT LEB. 
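/*
 * Illustrative sketch, not part of the patch: how get_pad_len() recovers the
 * amount of padding in front of the next node while scanning a LEB.  "len"
 * is how many bytes are still unscanned, so leb_size - len is the current
 * offset; if that offset is not on a min_io_size boundary, the bytes up to
 * the boundary were written as padding and must be skipped.
 */
static int toy_pad_len(int leb_size, int len, int min_io_size)
{
	int offs = leb_size - len;      /* current offset within the LEB */
	int aligned;

	if (min_io_size == 1)
		return 0;               /* no padding on such media */
	aligned = (offs + min_io_size - 1) / min_io_size * min_io_size;
	return aligned - offs;          /* bytes of padding to skip */
}
/* Example: leb_size 131072, len 130500, min_io_size 2048 -> offs 572, pad 1476. */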
+ * @c: UBIFS file-system description object + * @lnum: LEB number to garbage collect + * + * LPT garbage collection is used only for the "big" LPT model + * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes + * in the LEB being garbage-collected as dirty. The dirty nodes are written + * next commit, after which the LEB is free to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_gc_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf = c->lpt_buf; + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + ubifs_err("cannot read LEB %d, error %d", lnum, err); + return err; + } + while (1) { + if (!is_a_node(c, buf, len)) { + int pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + buf += pad_len; + len -= pad_len; + continue; + } + return 0; + } + node_type = get_lpt_node_type(c, buf, &node_num); + node_len = get_lpt_node_len(c, node_type); + offs = c->leb_size - len; + ubifs_assert(node_len != 0); + mutex_lock(&c->lp_mutex); + err = make_node_dirty(c, node_type, node_num, lnum, offs); + mutex_unlock(&c->lp_mutex); + if (err) + return err; + buf += node_len; + len -= node_len; + } + return 0; +} + +/** + * lpt_gc - LPT garbage collection. + * @c: UBIFS file-system description object + * + * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'. + * Returns %0 on success and a negative error code on failure. + */ +static int lpt_gc(struct ubifs_info *c) +{ + int i, lnum = -1, dirty = 0; + + mutex_lock(&c->lp_mutex); + for (i = 0; i < c->lpt_lebs; i++) { + ubifs_assert(!c->ltab[i].tgc); + if (i + c->lpt_first == c->nhead_lnum || + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + continue; + if (c->ltab[i].dirty > dirty) { + dirty = c->ltab[i].dirty; + lnum = i + c->lpt_first; + } + } + mutex_unlock(&c->lp_mutex); + if (lnum == -1) + return -ENOSPC; + return lpt_gc_lnum(c, lnum); +} + +/** + * ubifs_lpt_start_commit - UBIFS commit starts. + * @c: the UBIFS file-system description object + * + * This function has to be called when UBIFS starts the commit operation. + * This function "freezes" all currently dirty LEB properties and does not + * change them anymore. Further changes are saved and tracked separately + * because they are not part of this commit. This function returns zero in case + * of success and a negative error code in case of failure. + */ +int ubifs_lpt_start_commit(struct ubifs_info *c) +{ + int err, cnt; + + dbg_lp(""); + + mutex_lock(&c->lp_mutex); + err = dbg_chk_lpt_free_spc(c); + if (err) + goto out; + err = dbg_check_ltab(c); + if (err) + goto out; + + if (c->check_lpt_free) { + /* + * We ensure there is enough free space in + * ubifs_lpt_post_commit() by marking nodes dirty. That + * information is lost when we unmount, so we also need + * to check free space once after mounting also. 
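+	 * Note that 'lpt_gc()' locks c->lp_mutex itself, which is why the
+	 * mutex is dropped and re-taken around each call in the loop below.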
+ */ + c->check_lpt_free = 0; + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } + } + + lpt_tgc_start(c); + + if (!c->dirty_pn_cnt) { + dbg_cmt("no cnodes to commit"); + err = 0; + goto out; + } + + if (!c->big_lpt && need_write_all(c)) { + /* If needed, write everything */ + err = make_tree_dirty(c); + if (err) + goto out; + lpt_tgc_start(c); + } + + if (c->big_lpt) + populate_lsave(c); + + cnt = get_cnodes_to_commit(c); + ubifs_assert(cnt != 0); + + err = layout_cnodes(c); + if (err) + goto out; + + /* Copy the LPT's own lprops for end commit to write */ + memcpy(c->ltab_cmt, c->ltab, + sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY); + +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * free_obsolete_cnodes - free obsolete cnodes for commit end. + * @c: UBIFS file-system description object + */ +static void free_obsolete_cnodes(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + + cnext = c->lpt_cnext; + if (!cnext) + return; + do { + cnode = cnext; + cnext = cnode->cnext; + if (test_bit(OBSOLETE_CNODE, &cnode->flags)) + kfree(cnode); + else + cnode->cnext = NULL; + } while (cnext != c->lpt_cnext); + c->lpt_cnext = NULL; +} + +/** + * ubifs_lpt_end_commit - finish the commit operation. + * @c: the UBIFS file-system description object + * + * This function has to be called when the commit operation finishes. It + * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to + * the media. Returns zero in case of success and a negative error code in case + * of failure. + */ +int ubifs_lpt_end_commit(struct ubifs_info *c) +{ + int err; + + dbg_lp(""); + + if (!c->lpt_cnext) + return 0; + + err = write_cnodes(c); + if (err) + return err; + + mutex_lock(&c->lp_mutex); + free_obsolete_cnodes(c); + mutex_unlock(&c->lp_mutex); + + return 0; +} + +/** + * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC. + * @c: UBIFS file-system description object + * + * LPT trivial GC is completed after a commit. Also LPT GC is done after a + * commit for the "big" LPT model. + */ +int ubifs_lpt_post_commit(struct ubifs_info *c) +{ + int err; + + mutex_lock(&c->lp_mutex); + err = lpt_tgc_end(c); + if (err) + goto out; + if (c->big_lpt) + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * first_nnode - find the first nnode in memory. + * @c: UBIFS file-system description object + * @hght: height of tree where nnode found is returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght) +{ + struct ubifs_nnode *nnode; + int h, i, found; + + nnode = c->nroot; + *hght = 0; + if (!nnode) + return NULL; + for (h = 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * next_nnode - find the next nnode in memory. + * @c: UBIFS file-system description object + * @nnode: nnode from which to start. 
+ * @hght: height of tree where nnode is, is passed and returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *next_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode, int *hght) +{ + struct ubifs_nnode *parent; + int iip, h, i, found; + + parent = nnode->parent; + if (!parent) + return NULL; + if (nnode->iip == UBIFS_LPT_FANOUT - 1) { + *hght -= 1; + return parent; + } + for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { + nnode = parent->nbranch[iip].nnode; + if (nnode) + break; + } + if (!nnode) { + *hght -= 1; + return parent; + } + for (h = *hght + 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * ubifs_lpt_free - free resources owned by the LPT. + * @c: UBIFS file-system description object + * @wr_only: free only resources used for writing + */ +void ubifs_lpt_free(struct ubifs_info *c, int wr_only) +{ + struct ubifs_nnode *nnode; + int i, hght; + + /* Free write-only things first */ + + free_obsolete_cnodes(c); /* Leftover from a failed commit */ + + vfree(c->ltab_cmt); + c->ltab_cmt = NULL; + vfree(c->lpt_buf); + c->lpt_buf = NULL; + kfree(c->lsave); + c->lsave = NULL; + + if (wr_only) + return; + + /* Now free the rest */ + + nnode = first_nnode(c, &hght); + while (nnode) { + for (i = 0; i < UBIFS_LPT_FANOUT; i++) + kfree(nnode->nbranch[i].nnode); + nnode = next_nnode(c, nnode, &hght); + } + for (i = 0; i < LPROPS_HEAP_CNT; i++) + kfree(c->lpt_heap[i].arr); + kfree(c->dirty_idx.arr); + kfree(c->nroot); + vfree(c->ltab); + kfree(c->lpt_nod_buf); +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes. + * @buf: buffer + * @len: buffer length + */ +static int dbg_is_all_ff(uint8_t *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (buf[i] != 0xff) + return 0; + return 1; +} + +/** + * dbg_is_nnode_dirty - determine if a nnode is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + */ +static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_nnode *nnode; + int hght; + + /* Entire tree is in memory so first_nnode / next_nnode are OK */ + nnode = first_nnode(c, &hght); + for (; nnode; nnode = next_nnode(c, nnode, &hght)) { + struct ubifs_nbranch *branch; + + cond_resched(); + if (nnode->parent) { + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } else { + if (c->lpt_lnum != lnum || c->lpt_offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } + } + return 1; +} + +/** + * dbg_is_pnode_dirty - determine if a pnode is dirty. 
+ * @c: the UBIFS file-system description object + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + */ +static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + int i, cnt; + + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + cond_resched(); + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &pnode->flags)) + return 1; + return 0; + } + return 1; +} + +/** + * dbg_is_ltab_dirty - determine if a ltab node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where ltab node was written + * @offs: offset where ltab node was written + */ +static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 1; + return (c->lpt_drty_flgs & LTAB_DIRTY) != 0; +} + +/** + * dbg_is_lsave_dirty - determine if a lsave node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where lsave node was written + * @offs: offset where lsave node was written + */ +static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 1; + return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0; +} + +/** + * dbg_is_node_dirty - determine if a node is dirty. + * @c: the UBIFS file-system description object + * @node_type: node type + * @lnum: LEB number where node was written + * @offs: offset where node was written + */ +static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum, + int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return dbg_is_nnode_dirty(c, lnum, offs); + case UBIFS_LPT_PNODE: + return dbg_is_pnode_dirty(c, lnum, offs); + case UBIFS_LPT_LTAB: + return dbg_is_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return dbg_is_lsave_dirty(c, lnum, offs); + } + return 1; +} + +/** + * dbg_check_ltab_lnum - check the ltab for a LPT LEB number. + * @c: the UBIFS file-system description object + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function returns %0 on success and a negative error code on failure. 
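+ *
+ * The whole LEB is read and parsed node by node: nodes which are dirty
+ * in memory and padding count towards dirty space, the trailing empty
+ * space counts as free, and both totals are compared with the ltab
+ * entry of this LEB.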
+ */ +static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; + int ret; + void *buf, *p; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for ltab checking"); + return 0; + } + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); + goto out; + } + while (1) { + if (!is_a_node(c, p, len)) { + int i, pad_len; + + pad_len = get_pad_len(c, p, len); + if (pad_len) { + p += pad_len; + len -= pad_len; + dirty += pad_len; + continue; + } + if (!dbg_is_all_ff(p, len)) { + dbg_msg("invalid empty space in LEB %d at %d", + lnum, c->leb_size - len); + err = -EINVAL; + } + i = lnum - c->lpt_first; + if (len != c->ltab[i].free) { + dbg_msg("invalid free space in LEB %d " + "(free %d, expected %d)", + lnum, len, c->ltab[i].free); + err = -EINVAL; + } + if (dirty != c->ltab[i].dirty) { + dbg_msg("invalid dirty space in LEB %d " + "(dirty %d, expected %d)", + lnum, dirty, c->ltab[i].dirty); + err = -EINVAL; + } + goto out; + } + node_type = get_lpt_node_type(c, p, &node_num); + node_len = get_lpt_node_len(c, node_type); + ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); + if (ret == 1) + dirty += node_len; + p += node_len; + len -= node_len; + } + + err = 0; +out: + vfree(buf); + return err; +} + +/** + * dbg_check_ltab - check the free and dirty space in the ltab. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_ltab(struct ubifs_info *c) +{ + int lnum, err, i, cnt; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + /* Bring the entire tree into memory */ + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + cond_resched(); + } + + /* Check nodes */ + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0); + if (err) + return err; + + /* Check each LEB */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + err = dbg_check_ltab_lnum(c, lnum); + if (err) { + dbg_err("failed at LEB %d", lnum); + return err; + } + } + + dbg_lp("succeeded"); + return 0; +} + +/** + * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_chk_lpt_free_spc(struct ubifs_info *c) +{ + long long free = 0; + int i; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + for (i = 0; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + } + if (free < c->lpt_sz) { + dbg_err("LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * dbg_chk_lpt_sz - check LPT does not write more than LPT size. + * @c: the UBIFS file-system description object + * @action: what to do + * @len: length written + * + * This function returns %0 on success and a negative error code on failure. 
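+ *
+ * The check accumulates the number of bytes written and wasted during an
+ * LPT commit and, when the commit ends, verifies that the total does not
+ * exceed the maximum LPT size.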
+ * The @action argument may be one of: + * o %0 - LPT debugging checking starts, initialize debugging variables; + * o %1 - wrote an LPT node, increase LPT size by @len bytes; + * o %2 - switched to a different LEB and wasted @len bytes; + * o %3 - check that we've written the right number of bytes. + * o %4 - wasted @len bytes; + */ +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) +{ + struct ubifs_debug_info *d = c->dbg; + long long chk_lpt_sz, lpt_sz; + int err = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + switch (action) { + case 0: + d->chk_lpt_sz = 0; + d->chk_lpt_sz2 = 0; + d->chk_lpt_lebs = 0; + d->chk_lpt_wastage = 0; + if (c->dirty_pn_cnt > c->pnode_cnt) { + dbg_err("dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); + err = -EINVAL; + } + if (c->dirty_nn_cnt > c->nnode_cnt) { + dbg_err("dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); + err = -EINVAL; + } + return err; + case 1: + d->chk_lpt_sz += len; + return 0; + case 2: + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + d->chk_lpt_lebs += 1; + return 0; + case 3: + chk_lpt_sz = c->leb_size; + chk_lpt_sz *= d->chk_lpt_lebs; + chk_lpt_sz += len - c->nhead_offs; + if (d->chk_lpt_sz != chk_lpt_sz) { + dbg_err("LPT wrote %lld but space used was %lld", + d->chk_lpt_sz, chk_lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz > c->lpt_sz) { + dbg_err("LPT wrote %lld but lpt_sz is %lld", + d->chk_lpt_sz, c->lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) { + dbg_err("LPT layout size %lld but wrote %lld", + d->chk_lpt_sz, d->chk_lpt_sz2); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->new_nhead_offs != len) { + dbg_err("LPT new nhead offs: expected %d was %d", + d->new_nhead_offs, len); + err = -EINVAL; + } + lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + lpt_sz += c->ltab_sz; + if (c->big_lpt) + lpt_sz += c->lsave_sz; + if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) { + dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); + err = -EINVAL; + } + if (err) { + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + } + d->chk_lpt_sz2 = d->chk_lpt_sz; + d->chk_lpt_sz = 0; + d->chk_lpt_wastage = 0; + d->chk_lpt_lebs = 0; + d->new_nhead_offs = len; + return err; + case 4: + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + return 0; + default: + return -EINVAL; + } +} + +/** + * dbg_dump_lpt_leb - dump an LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to dump + * + * This function dumps an LEB from LPT area. Nodes in this area are very + * different to nodes in the main area (e.g., they do not have common headers, + * they do not have 8-byte alignments, etc), so we have a separate function to + * dump LPT area LEBs. Note, LPT has to be locked by the caller. 
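+ *
+ * Each node found is printed with its LEB number, offset and type; for
+ * nnodes the lnum:offs pairs of all the branches are printed as well.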
+ */ +static void dump_lpt_leb(const struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf, *p; + + printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", + current->pid, lnum); + buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to dump LPT"); + return; + } + + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + ubifs_err("cannot read LEB %d, error %d", lnum, err); + goto out; + } + while (1) { + offs = c->leb_size - len; + if (!is_a_node(c, p, len)) { + int pad_len; + + pad_len = get_pad_len(c, p, len); + if (pad_len) { + printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", + lnum, offs, pad_len); + p += pad_len; + len -= pad_len; + continue; + } + if (len) + printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n", + lnum, offs, len); + break; + } + + node_type = get_lpt_node_type(c, p, &node_num); + switch (node_type) { + case UBIFS_LPT_PNODE: + { + node_len = c->pnode_sz; + if (c->big_lpt) + printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n", + lnum, offs, node_num); + else + printk(KERN_DEBUG "LEB %d:%d, pnode\n", + lnum, offs); + break; + } + case UBIFS_LPT_NNODE: + { + int i; + struct ubifs_nnode nnode; + + node_len = c->nnode_sz; + if (c->big_lpt) + printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ", + lnum, offs, node_num); + else + printk(KERN_DEBUG "LEB %d:%d, nnode, ", + lnum, offs); + err = ubifs_unpack_nnode(c, p, &nnode); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, + nnode.nbranch[i].offs); + if (i != UBIFS_LPT_FANOUT - 1) + printk(KERN_CONT ", "); + } + printk(KERN_CONT "\n"); + break; + } + case UBIFS_LPT_LTAB: + node_len = c->ltab_sz; + printk(KERN_DEBUG "LEB %d:%d, ltab\n", + lnum, offs); + break; + case UBIFS_LPT_LSAVE: + node_len = c->lsave_sz; + printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs); + break; + default: + ubifs_err("LPT node type %d not recognized", node_type); + goto out; + } + + p += node_len; + len -= node_len; + } + + printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", + current->pid, lnum); +out: + vfree(buf); + return; +} + +/** + * dbg_dump_lpt_lebs - dump LPT lebs. + * @c: UBIFS file-system description object + * + * This function dumps all LPT LEBs. The caller has to make sure the LPT is + * locked. + */ +void dbg_dump_lpt_lebs(const struct ubifs_info *c) +{ + int i; + + printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n", + current->pid); + for (i = 0; i < c->lpt_lebs; i++) + dump_lpt_leb(c, i + c->lpt_first); + printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n", + current->pid); +} + +/** + * dbg_populate_lsave - debugging version of 'populate_lsave()' + * @c: UBIFS file-system description object + * + * This is a debugging version for 'populate_lsave()' which populates lsave + * with random LEBs instead of useful LEBs, which is good for test coverage. + * Returns zero if lsave has not been populated (this debugging feature is + * disabled) an non-zero if lsave has been populated. 
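+ *
+ * When enabled, one call in four (on average) fills lsave with LEBs
+ * picked at random from the empty, freeable and frdi_idx lists and from
+ * the dirty index, dirty and free heaps.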
+ */ +static int dbg_populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (random32() & 3) + return 0; + + for (i = 0; i < c->lsave_cnt; i++) + c->lsave[i] = c->main_first; + + list_for_each_entry(lprops, &c->empty_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->freeable_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->frdi_idx_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + + return 1; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c new file mode 100644 index 00000000..278c2382 --- /dev/null +++ b/fs/ubifs/master.c @@ -0,0 +1,396 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements reading and writing the master node */ + +#include "ubifs.h" + +/** + * scan_for_master - search the valid master node. + * @c: UBIFS file-system description object + * + * This function scans the master node LEBs and search for the latest master + * node. Returns zero in case of success, %-EUCLEAN if there master area is + * corrupted and requires recovery, and a negative error code in case of + * failure. 
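+ *
+ * The master node is duplicated in two consecutive LEBs; the last copy
+ * found in the first LEB must match the last copy found in the second
+ * one, otherwise the area is considered corrupted and %-EUCLEAN is
+ * returned.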
+ */ +static int scan_for_master(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, offs = 0, nodes_cnt; + + lnum = UBIFS_MST_LNUM; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + nodes_cnt = sleb->nodes_cnt; + if (nodes_cnt > 0) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + if (snod->type != UBIFS_MST_NODE) + goto out_dump; + memcpy(c->mst_node, snod->node, snod->len); + offs = snod->offs; + } + ubifs_scan_destroy(sleb); + + lnum += 1; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + if (sleb->nodes_cnt != nodes_cnt) + goto out; + if (!sleb->nodes_cnt) + goto out; + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); + if (snod->type != UBIFS_MST_NODE) + goto out_dump; + if (snod->offs != offs) + goto out; + if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, + (void *)snod->node + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out; + c->mst_offs = offs; + ubifs_scan_destroy(sleb); + return 0; + +out: + ubifs_scan_destroy(sleb); + return -EUCLEAN; + +out_dump: + ubifs_err("unexpected node type %d master LEB %d:%d", + snod->type, lnum, snod->offs); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * validate_master - validate master node. + * @c: UBIFS file-system description object + * + * This function validates data which was read from master node. Returns zero + * if the data is all right and %-EINVAL if not. + */ +static int validate_master(const struct ubifs_info *c) +{ + long long main_sz; + int err; + + if (c->max_sqnum >= SQNUM_WATERMARK) { + err = 1; + goto out; + } + + if (c->cmt_no >= c->max_sqnum) { + err = 2; + goto out; + } + + if (c->highest_inum >= INUM_WATERMARK) { + err = 3; + goto out; + } + + if (c->lhead_lnum < UBIFS_LOG_LNUM || + c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs || + c->lhead_offs < 0 || c->lhead_offs >= c->leb_size || + c->lhead_offs & (c->min_io_size - 1)) { + err = 4; + goto out; + } + + if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first || + c->zroot.offs >= c->leb_size || c->zroot.offs & 7) { + err = 5; + goto out; + } + + if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len || + c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) { + err = 6; + goto out; + } + + if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) { + err = 7; + goto out; + } + + if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first || + c->ihead_offs % c->min_io_size || c->ihead_offs < 0 || + c->ihead_offs > c->leb_size || c->ihead_offs & 7) { + err = 8; + goto out; + } + + main_sz = (long long)c->main_lebs * c->leb_size; + if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) { + err = 9; + goto out; + } + + if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last || + c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) { + err = 10; + goto out; + } + + if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last || + c->nhead_offs < 0 || c->nhead_offs % c->min_io_size || + c->nhead_offs > c->leb_size) { + err = 11; + goto out; + } + + if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last || + c->ltab_offs < 0 || + c->ltab_offs + c->ltab_sz > c->leb_size) { + err = 12; + goto out; + } + + if (c->big_lpt && (c->lsave_lnum < c->lpt_first || + c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 || + c->lsave_offs + c->lsave_sz > c->leb_size)) { + err = 13; + goto out; + } + + if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) { 
+ err = 14; + goto out; + } + + if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) { + err = 15; + goto out; + } + + if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) { + err = 16; + goto out; + } + + if (c->lst.total_free < 0 || c->lst.total_free > main_sz || + c->lst.total_free & 7) { + err = 17; + goto out; + } + + if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) { + err = 18; + goto out; + } + + if (c->lst.total_used < 0 || (c->lst.total_used & 7)) { + err = 19; + goto out; + } + + if (c->lst.total_free + c->lst.total_dirty + + c->lst.total_used > main_sz) { + err = 20; + goto out; + } + + if (c->lst.total_dead + c->lst.total_dark + + c->lst.total_used + c->bi.old_idx_sz > main_sz) { + err = 21; + goto out; + } + + if (c->lst.total_dead < 0 || + c->lst.total_dead > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dead & 7) { + err = 22; + goto out; + } + + if (c->lst.total_dark < 0 || + c->lst.total_dark > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dark & 7) { + err = 23; + goto out; + } + + return 0; + +out: + ubifs_err("bad master node at offset %d error %d", c->mst_offs, err); + dbg_dump_node(c, c->mst_node); + return -EINVAL; +} + +/** + * ubifs_read_master - read master node. + * @c: UBIFS file-system description object + * + * This function finds and reads the master node during file-system mount. If + * the flash is empty, it creates default master node as well. Returns zero in + * case of success and a negative error code in case of failure. + */ +int ubifs_read_master(struct ubifs_info *c) +{ + int err, old_leb_cnt; + + c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!c->mst_node) + return -ENOMEM; + + err = scan_for_master(c); + if (err) { + if (err == -EUCLEAN) + err = ubifs_recover_master_node(c); + if (err) + /* + * Note, we do not free 'c->mst_node' here because the + * unmount routine will take care of this. 
+ */ + return err; + } + + /* Make sure that the recovery flag is clear */ + c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY); + + c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum); + c->highest_inum = le64_to_cpu(c->mst_node->highest_inum); + c->cmt_no = le64_to_cpu(c->mst_node->cmt_no); + c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum); + c->zroot.offs = le32_to_cpu(c->mst_node->root_offs); + c->zroot.len = le32_to_cpu(c->mst_node->root_len); + c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum); + c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); + c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); + c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); + c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size); + c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); + c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); + c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); + c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs); + c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum); + c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs); + c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum); + c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs); + c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum); + c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs); + c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs); + old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt); + c->lst.total_free = le64_to_cpu(c->mst_node->total_free); + c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty); + c->lst.total_used = le64_to_cpu(c->mst_node->total_used); + c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); + c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); + + c->calc_idx_sz = c->bi.old_idx_sz; + + if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) + c->no_orphs = 1; + + if (old_leb_cnt != c->leb_cnt) { + /* The file system has been resized */ + int growth = c->leb_cnt - old_leb_cnt; + + if (c->leb_cnt < old_leb_cnt || + c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err("bad leb_cnt on master node"); + dbg_dump_node(c, c->mst_node); + return -EINVAL; + } + + dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs", + old_leb_cnt, c->leb_cnt); + c->lst.empty_lebs += growth; + c->lst.total_free += growth * (long long)c->leb_size; + c->lst.total_dark += growth * (long long)c->dark_wm; + + /* + * Reflect changes back onto the master node. N.B. the master + * node gets written immediately whenever mounting (or + * remounting) in read-write mode, so we do not need to write it + * here. + */ + c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt); + c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs); + c->mst_node->total_free = cpu_to_le64(c->lst.total_free); + c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark); + } + + err = validate_master(c); + if (err) + return err; + + err = dbg_old_index_check_init(c, &c->zroot); + + return err; +} + +/** + * ubifs_write_master - write master node. + * @c: UBIFS file-system description object + * + * This function writes the master node. The caller has to take the + * @c->mst_mutex lock before calling this function. Returns zero in case of + * success and a negative error code in case of failure. The master node is + * written twice to enable recovery. 
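+ *
+ * Each new master node is written @c->mst_node_alsz bytes further into
+ * both LEBs; when the LEBs are full they are unmapped and writing
+ * restarts at offset zero.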
+ */ +int ubifs_write_master(struct ubifs_info *c) +{ + int err, lnum, offs, len; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + + lnum = UBIFS_MST_LNUM; + offs = c->mst_offs + c->mst_node_alsz; + len = UBIFS_MST_NODE_SZ; + + if (offs + UBIFS_MST_NODE_SZ > c->leb_size) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + offs = 0; + } + + c->mst_offs = offs; + c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); + + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + if (err) + return err; + + lnum += 1; + + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + + return err; +} diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h new file mode 100644 index 00000000..0b5296a9 --- /dev/null +++ b/fs/ubifs/misc.h @@ -0,0 +1,360 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains miscellaneous helper functions. + */ + +#ifndef __UBIFS_MISC_H__ +#define __UBIFS_MISC_H__ + +/** + * ubifs_zn_dirty - check if znode is dirty. + * @znode: znode to check + * + * This helper function returns %1 if @znode is dirty and %0 otherwise. + */ +static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) +{ + return !!test_bit(DIRTY_ZNODE, &znode->flags); +} + +/** + * ubifs_wake_up_bgt - wake up background thread. + * @c: UBIFS file-system description object + */ +static inline void ubifs_wake_up_bgt(struct ubifs_info *c) +{ + if (c->bgt && !c->need_bgt) { + c->need_bgt = 1; + wake_up_process(c->bgt); + } +} + +/** + * ubifs_tnc_find_child - find next child in znode. + * @znode: znode to search at + * @start: the zbranch index to start at + * + * This helper function looks for znode child starting at index @start. Returns + * the child or %NULL if no children were found. + */ +static inline struct ubifs_znode * +ubifs_tnc_find_child(struct ubifs_znode *znode, int start) +{ + while (start < znode->child_cnt) { + if (znode->zbranch[start].znode) + return znode->zbranch[start].znode; + start += 1; + } + + return NULL; +} + +/** + * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object. + * @inode: the VFS 'struct inode' pointer + */ +static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) +{ + return container_of(inode, struct ubifs_inode, vfs_inode); +} + +/** + * ubifs_compr_present - check if compressor was compiled in. + * @compr_type: compressor type to check + * + * This function returns %1 of compressor of type @compr_type is present, and + * %0 if not. 
+ */ +static inline int ubifs_compr_present(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return !!ubifs_compressors[compr_type]->capi_name; +} + +/** + * ubifs_compr_name - get compressor name string by its type. + * @compr_type: compressor type + * + * This function returns compressor type string. + */ +static inline const char *ubifs_compr_name(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return ubifs_compressors[compr_type]->name; +} + +/** + * ubifs_wbuf_sync - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume + * that the write-buffer is already locked. + */ +static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) +{ + int err; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_leb_unmap - unmap an LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to unmap + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + err = ubi_leb_unmap(c->ubi, lnum); + if (err) { + ubifs_err("unmap LEB %d failed, error %d", lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_write - write to a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @offs: offset within LEB to write to + * @len: length to write + * @dtype: data type + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum, + const void *buf, int offs, int len, int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + if (err) { + ubifs_err("writing %d bytes at %d:%d, error %d", + len, lnum, offs, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_change - atomic LEB change. + * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @len: length to write + * @dtype: data type + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum, + const void *buf, int len, int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + if (err) { + ubifs_err("changing %d bytes in LEB %d, error %d", + len, lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_encode_dev - encode device node IDs. + * @dev: UBIFS device node information + * @rdev: device IDs to encode + * + * This is a helper function which encodes major/minor numbers of a device node + * into UBIFS device node description. We use standard Linux "new" and "huge" + * encodings. + */ +static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev) +{ + if (new_valid_dev(rdev)) { + dev->new = cpu_to_le32(new_encode_dev(rdev)); + return sizeof(dev->new); + } else { + dev->huge = cpu_to_le64(huge_encode_dev(rdev)); + return sizeof(dev->huge); + } +} + +/** + * ubifs_add_dirt - add dirty space to LEB properties. 
+ * @c: the UBIFS file-system description object + * @lnum: LEB to add dirty space for + * @dirty: dirty space to add + * + * This is a helper function which increased amount of dirty LEB space. Returns + * zero in case of success and a negative error code in case of failure. + */ +static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0); +} + +/** + * ubifs_return_leb - return LEB to lprops. + * @c: the UBIFS file-system description object + * @lnum: LEB to return + * + * This helper function cleans the "taken" flag of a logical eraseblock in the + * lprops. Returns zero in case of success and a negative error code in case of + * failure. + */ +static inline int ubifs_return_leb(struct ubifs_info *c, int lnum) +{ + return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); +} + +/** + * ubifs_idx_node_sz - return index node size. + * @c: the UBIFS file-system description object + * @child_cnt: number of children of this index node + */ +static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt) +{ + return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt; +} + +/** + * ubifs_idx_branch - return pointer to an index branch. + * @c: the UBIFS file-system description object + * @idx: index node + * @bnum: branch number + */ +static inline +struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c, + const struct ubifs_idx_node *idx, + int bnum) +{ + return (struct ubifs_branch *)((void *)idx->branches + + (UBIFS_BRANCH_SZ + c->key_len) * bnum); +} + +/** + * ubifs_idx_key - return pointer to an index key. + * @c: the UBIFS file-system description object + * @idx: index node + */ +static inline void *ubifs_idx_key(const struct ubifs_info *c, + const struct ubifs_idx_node *idx) +{ + return (void *)((struct ubifs_branch *)idx->branches)->key; +} + +/** + * ubifs_current_time - round current time to time granularity. + * @inode: inode + */ +static inline struct timespec ubifs_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. + */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) +{ + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} + +/** + * ubifs_get_lprops - get reference to LEB properties. + * @c: the UBIFS file-system description object + * + * This function locks lprops. Lprops have to be unlocked by + * 'ubifs_release_lprops()'. + */ +static inline void ubifs_get_lprops(struct ubifs_info *c) +{ + mutex_lock(&c->lp_mutex); +} + +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. 
+ */ +static inline void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + mutex_unlock(&c->lp_mutex); +} + +/** + * ubifs_next_log_lnum - switch to the next log LEB. + * @c: UBIFS file-system description object + * @lnum: current log LEB + * + * This helper function returns the log LEB number which goes next after LEB + * 'lnum'. + */ +static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum) +{ + lnum += 1; + if (lnum > c->log_last) + lnum = UBIFS_LOG_LNUM; + + return lnum; +} + +#endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c new file mode 100644 index 00000000..a5422fff --- /dev/null +++ b/fs/ubifs/orphan.c @@ -0,0 +1,972 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Author: Adrian Hunter + */ + +#include "ubifs.h" + +/* + * An orphan is an inode number whose inode node has been committed to the index + * with a link count of zero. That happens when an open file is deleted + * (unlinked) and then a commit is run. In the normal course of events the inode + * would be deleted when the file is closed. However in the case of an unclean + * unmount, orphans need to be accounted for. After an unclean unmount, the + * orphans' inodes must be deleted which means either scanning the entire index + * looking for them, or keeping a list on flash somewhere. This unit implements + * the latter approach. + * + * The orphan area is a fixed number of LEBs situated between the LPT area and + * the main area. The number of orphan area LEBs is specified when the file + * system is created. The minimum number is 1. The size of the orphan area + * should be so that it can hold the maximum number of orphans that are expected + * to ever exist at one time. + * + * The number of orphans that can fit in a LEB is: + * + * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) + * + * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough. + * + * Orphans are accumulated in a rb-tree. When an inode's link count drops to + * zero, the inode number is added to the rb-tree. It is removed from the tree + * when the inode is deleted. Any new orphans that are in the orphan tree when + * the commit is run, are written to the orphan area in 1 or more orphan nodes. + * If the orphan area is full, it is consolidated to make space. There is + * always enough space because validation prevents the user from creating more + * than the maximum number of orphans allowed. + */ + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_check_orphans(struct ubifs_info *c); +#else +#define dbg_check_orphans(c) 0 +#endif + +/** + * ubifs_add_orphan - add an orphan. 
+ * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Add an orphan. This function is called when an inodes link count drops to + * zero. + */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + orphan->new = 1; + + spin_lock(&c->orphan_lock); + if (c->tot_orphans >= c->max_orphans) { + spin_unlock(&c->orphan_lock); + kfree(orphan); + return -ENFILE; + } + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + dbg_err("orphaned twice"); + spin_unlock(&c->orphan_lock); + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + c->new_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + list_add_tail(&orphan->new_list, &c->orph_new); + spin_unlock(&c->orphan_lock); + dbg_gen("ino %lu", (unsigned long)inum); + return 0; +} + +/** + * ubifs_delete_orphan - delete an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Delete an orphan. This function is called when an inode is deleted. + */ +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + if (o->dnext) { + spin_unlock(&c->orphan_lock); + dbg_gen("deleted twice ino %lu", + (unsigned long)inum); + return; + } + if (o->cnext) { + o->dnext = c->orph_dnext; + c->orph_dnext = o; + spin_unlock(&c->orphan_lock); + dbg_gen("delete later ino %lu", + (unsigned long)inum); + return; + } + rb_erase(p, &c->orph_tree); + list_del(&o->list); + c->tot_orphans -= 1; + if (o->new) { + list_del(&o->new_list); + c->new_orphans -= 1; + } + spin_unlock(&c->orphan_lock); + kfree(o); + dbg_gen("inum %lu", (unsigned long)inum); + return; + } + } + spin_unlock(&c->orphan_lock); + dbg_err("missing orphan ino %lu", (unsigned long)inum); + dbg_dump_stack(); +} + +/** + * ubifs_orphan_start_commit - start commit of orphans. + * @c: UBIFS file-system description object + * + * Start commit of orphans. + */ +int ubifs_orphan_start_commit(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, **last; + + spin_lock(&c->orphan_lock); + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_new, new_list) { + ubifs_assert(orphan->new); + orphan->new = 0; + *last = orphan; + last = &orphan->cnext; + } + *last = orphan->cnext; + c->cmt_orphans = c->new_orphans; + c->new_orphans = 0; + dbg_cmt("%d orphans to commit", c->cmt_orphans); + INIT_LIST_HEAD(&c->orph_new); + if (c->tot_orphans == 0) + c->no_orphs = 1; + else + c->no_orphs = 0; + spin_unlock(&c->orphan_lock); + return 0; +} + +/** + * avail_orphs - calculate available space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in the + * available space. 
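+ *
+ * Each unused orphan LEB beyond the head holds
+ * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) inode numbers;
+ * the gap left in the head LEB is counted too if at least one more
+ * orphan node fits in it.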
+ */ +static int avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail, gap; + + avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + gap = c->leb_size - c->ohead_offs; + if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64)) + avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + return avail; +} + +/** + * tot_avail_orphs - calculate total space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in half + * the total space. That leaves half the space for adding new orphans. + */ +static int tot_avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail; + + avail_lebs = c->orph_lebs; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + return avail / 2; +} + +/** + * do_write_orph_node - write a node to the orphan head. + * @c: UBIFS file-system description object + * @len: length of node + * @atomic: write atomically + * + * This function writes a node to the orphan head from the orphan buffer. If + * %atomic is not zero, then the write is done atomically. On success, %0 is + * returned, otherwise a negative error code is returned. + */ +static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) +{ + int err = 0; + + if (atomic) { + ubifs_assert(c->ohead_offs == 0); + ubifs_prepare_node(c, c->orph_buf, len, 1); + len = ALIGN(len, c->min_io_size); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, + UBI_SHORTTERM); + } else { + if (c->ohead_offs == 0) { + /* Ensure LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->ohead_lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, + c->ohead_offs, UBI_SHORTTERM); + } + return err; +} + +/** + * write_orph_node - write an orphan node. + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function builds an orphan node from the cnext list and writes it to the + * orphan head. On success, %0 is returned, otherwise a negative error code + * is returned. + */ +static int write_orph_node(struct ubifs_info *c, int atomic) +{ + struct ubifs_orphan *orphan, *cnext; + struct ubifs_orph_node *orph; + int gap, err, len, cnt, i; + + ubifs_assert(c->cmt_orphans > 0); + gap = c->leb_size - c->ohead_offs; + if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) { + c->ohead_lnum += 1; + c->ohead_offs = 0; + gap = c->leb_size; + if (c->ohead_lnum > c->orph_last) { + /* + * We limit the number of orphans so that this should + * never happen. 
+ */ + ubifs_err("out of space in orphan area"); + return -EINVAL; + } + } + cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + if (cnt > c->cmt_orphans) + cnt = c->cmt_orphans; + len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64); + ubifs_assert(c->orph_buf); + orph = c->orph_buf; + orph->ch.node_type = UBIFS_ORPH_NODE; + spin_lock(&c->orphan_lock); + cnext = c->orph_cnext; + for (i = 0; i < cnt; i++) { + orphan = cnext; + orph->inos[i] = cpu_to_le64(orphan->inum); + cnext = orphan->cnext; + orphan->cnext = NULL; + } + c->orph_cnext = cnext; + c->cmt_orphans -= cnt; + spin_unlock(&c->orphan_lock); + if (c->cmt_orphans) + orph->cmt_no = cpu_to_le64(c->cmt_no); + else + /* Mark the last node of the commit */ + orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63)); + ubifs_assert(c->ohead_offs + len <= c->leb_size); + ubifs_assert(c->ohead_lnum >= c->orph_first); + ubifs_assert(c->ohead_lnum <= c->orph_last); + err = do_write_orph_node(c, len, atomic); + c->ohead_offs += ALIGN(len, c->min_io_size); + c->ohead_offs = ALIGN(c->ohead_offs, 8); + return err; +} + +/** + * write_orph_nodes - write orphan nodes until there are no more to commit. + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function writes orphan nodes for all the orphans to commit. On success, + * %0 is returned, otherwise a negative error code is returned. + */ +static int write_orph_nodes(struct ubifs_info *c, int atomic) +{ + int err; + + while (c->cmt_orphans > 0) { + err = write_orph_node(c, atomic); + if (err) + return err; + } + if (atomic) { + int lnum; + + /* Unmap any unused LEBs after consolidation */ + lnum = c->ohead_lnum + 1; + for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + } + return 0; +} + +/** + * consolidate - consolidate the orphan area. + * @c: UBIFS file-system description object + * + * This function enables consolidation by putting all the orphans into the list + * to commit. The list is in the order that the orphans were added, and the + * LEBs are written atomically in order, so at no time can orphans be lost by + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int consolidate(struct ubifs_info *c) +{ + int tot_avail = tot_avail_orphs(c), err = 0; + + spin_lock(&c->orphan_lock); + dbg_cmt("there is space for %d orphans and there are %d", + tot_avail, c->tot_orphans); + if (c->tot_orphans - c->new_orphans <= tot_avail) { + struct ubifs_orphan *orphan, **last; + int cnt = 0; + + /* Change the cnext list to include all non-new orphans */ + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_list, list) { + if (orphan->new) + continue; + *last = orphan; + last = &orphan->cnext; + cnt += 1; + } + *last = orphan->cnext; + ubifs_assert(cnt == c->tot_orphans - c->new_orphans); + c->cmt_orphans = cnt; + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + } else { + /* + * We limit the number of orphans so that this should + * never happen. + */ + ubifs_err("out of space in orphan area"); + err = -EINVAL; + } + spin_unlock(&c->orphan_lock); + return err; +} + +/** + * commit_orphans - commit orphans. + * @c: UBIFS file-system description object + * + * This function commits orphans to flash. On success, %0 is returned, + * otherwise a negative error code is returned. 
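+ *
+ * If the orphans awaiting commit do not fit in the space left after the
+ * orphan head, the area is consolidated first and the orphan nodes are
+ * then written using atomic LEB changes.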
+ */ +static int commit_orphans(struct ubifs_info *c) +{ + int avail, atomic = 0, err; + + ubifs_assert(c->cmt_orphans > 0); + avail = avail_orphs(c); + if (avail < c->cmt_orphans) { + /* Not enough space to write new orphans, so consolidate */ + err = consolidate(c); + if (err) + return err; + atomic = 1; + } + err = write_orph_nodes(c, atomic); + return err; +} + +/** + * erase_deleted - erase the orphans marked for deletion. + * @c: UBIFS file-system description object + * + * During commit, the orphans being committed cannot be deleted, so they are + * marked for deletion and deleted by this function. Also, the recovery + * adds killed orphans to the deletion list, and therefore they are deleted + * here too. + */ +static void erase_deleted(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, *dnext; + + spin_lock(&c->orphan_lock); + dnext = c->orph_dnext; + while (dnext) { + orphan = dnext; + dnext = orphan->dnext; + ubifs_assert(!orphan->new); + rb_erase(&orphan->rb, &c->orph_tree); + list_del(&orphan->list); + c->tot_orphans -= 1; + dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum); + kfree(orphan); + } + c->orph_dnext = NULL; + spin_unlock(&c->orphan_lock); +} + +/** + * ubifs_orphan_end_commit - end commit of orphans. + * @c: UBIFS file-system description object + * + * End commit of orphans. + */ +int ubifs_orphan_end_commit(struct ubifs_info *c) +{ + int err; + + if (c->cmt_orphans != 0) { + err = commit_orphans(c); + if (err) + return err; + } + erase_deleted(c); + err = dbg_check_orphans(c); + return err; +} + +/** + * ubifs_clear_orphans - erase all LEBs used for orphans. + * @c: UBIFS file-system description object + * + * If recovery is not required, then the orphans from the previous session + * are not needed. This function locates the LEBs used to record + * orphans, and un-maps them. + */ +int ubifs_clear_orphans(struct ubifs_info *c) +{ + int lnum, err; + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + return 0; +} + +/** + * insert_dead_orphan - insert an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * This function is a helper to the 'do_kill_orphans()' function. The orphan + * must be kept until the next commit, so it is added to the rb-tree and the + * deletion list. + */ +static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + /* Already added - no problem */ + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + orphan->dnext = c->orph_dnext; + c->orph_dnext = orphan; + dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, + c->new_orphans, c->tot_orphans); + return 0; +} + +/** + * do_kill_orphans - remove orphan inodes from the index. 
+ * @c: UBIFS file-system description object + * @sleb: scanned LEB + * @last_cmt_no: cmt_no of last orphan node read is passed and returned here + * @outofdate: whether the LEB is out of date is returned here + * @last_flagged: whether the end orphan node is encountered + * + * This function is a helper to the 'kill_orphans()' function. It goes through + * every orphan node in a LEB and for every inode number recorded, removes + * all keys for that inode from the TNC. + */ +static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + unsigned long long *last_cmt_no, int *outofdate, + int *last_flagged) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + unsigned long long cmt_no; + ino_t inum; + int i, n, err, first = 1; + + list_for_each_entry(snod, &sleb->nodes, list) { + if (snod->type != UBIFS_ORPH_NODE) { + ubifs_err("invalid node type %d in orphan area at " + "%d:%d", snod->type, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + + orph = snod->node; + + /* Check commit number */ + cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX; + /* + * The commit number on the master node may be less, because + * of a failed commit. If there are several failed commits in a + * row, the commit number written on orphan nodes will continue + * to increase (because the commit number is adjusted here) even + * though the commit number on the master node stays the same + * because the master node has not been re-written. + */ + if (cmt_no > c->cmt_no) + c->cmt_no = cmt_no; + if (cmt_no < *last_cmt_no && *last_flagged) { + /* + * The last orphan node had a higher commit number and + * was flagged as the last written for that commit + * number. That makes this orphan node, out of date. + */ + if (!first) { + ubifs_err("out of order commit number %llu in " + "orphan node at %d:%d", + cmt_no, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + dbg_rcvry("out of date LEB %d", sleb->lnum); + *outofdate = 1; + return 0; + } + + if (first) + first = 0; + + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + dbg_rcvry("deleting orphaned inode %lu", + (unsigned long)inum); + err = ubifs_tnc_remove_ino(c, inum); + if (err) + return err; + err = insert_dead_orphan(c, inum); + if (err) + return err; + } + + *last_cmt_no = cmt_no; + if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) { + dbg_rcvry("last orph node for commit %llu at %d:%d", + cmt_no, sleb->lnum, snod->offs); + *last_flagged = 1; + } else + *last_flagged = 0; + } + + return 0; +} + +/** + * kill_orphans - remove all orphan inodes from the index. + * @c: UBIFS file-system description object + * + * If recovery is required, then orphan inodes recorded during the previous + * session (which ended with an unclean unmount) must be deleted from the index. + * This is done by updating the TNC, but since the index is not updated until + * the next commit, the LEBs where the orphan information is recorded are not + * erased until the next commit. + */ +static int kill_orphans(struct ubifs_info *c) +{ + unsigned long long last_cmt_no = 0; + int lnum, err = 0, outofdate = 0, last_flagged = 0; + + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) { + dbg_rcvry("no orphans"); + return 0; + } + /* + * Orph nodes always start at c->orph_first and are written to each + * successive LEB in turn. 
Generally unused LEBs will have been unmapped + * but may contain out of date orphan nodes if the unmap didn't go + * through. In addition, the last orphan node written for each commit is + * marked (top bit of orph->cmt_no is set to 1). It is possible that + * there are orphan nodes from the next commit (i.e. the commit did not + * complete successfully). In that case, no orphans will have been lost + * due to the way that orphans are written, and any orphans added will + * be valid orphans anyway and so can be deleted. + */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + dbg_rcvry("LEB %d", lnum); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); + if (IS_ERR(sleb)) { + if (PTR_ERR(sleb) == -EUCLEAN) + sleb = ubifs_recover_leb(c, lnum, 0, + c->sbuf, -1); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + } + err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate, + &last_flagged); + if (err || outofdate) { + ubifs_scan_destroy(sleb); + break; + } + if (sleb->endpt) { + c->ohead_lnum = lnum; + c->ohead_offs = sleb->endpt; + } + ubifs_scan_destroy(sleb); + } + return err; +} + +/** + * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them. + * @c: UBIFS file-system description object + * @unclean: indicates recovery from unclean unmount + * @read_only: indicates read only mount + * + * This function is called when mounting to erase orphans from the previous + * session. If UBIFS was not unmounted cleanly, then the inodes recorded as + * orphans are deleted. + */ +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) +{ + int err = 0; + + c->max_orphans = tot_avail_orphs(c); + + if (!read_only) { + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) + return -ENOMEM; + } + + if (unclean) + err = kill_orphans(c); + else if (!read_only) + err = ubifs_clear_orphans(c); + + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +struct check_orphan { + struct rb_node rb; + ino_t inum; +}; + +struct check_info { + unsigned long last_ino; + unsigned long tot_inos; + unsigned long missing; + unsigned long long leaf_cnt; + struct ubifs_ino_node *node; + struct rb_root root; +}; + +static int dbg_find_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + spin_unlock(&c->orphan_lock); + return 1; + } + } + spin_unlock(&c->orphan_lock); + return 0; +} + +static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &root->rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct check_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + kfree(orphan); + return 0; + } + } + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, root); + return 0; +} + +static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *o; + struct rb_node *p; + + p = root->rb_node; + while (p) { + o = rb_entry(p, struct check_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else + return 1; + } + return 0; +} + 
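The debug helpers above ('dbg_ins_check_orphan()' and 'dbg_find_check_orphan()') follow the usual pattern for keyed tree insertion: walk down the tree comparing inode numbers, treat an already-present key as success, and link the new node at the point where the walk fell off the tree. The self-contained sketch below shows the same pattern with a plain, unbalanced binary tree standing in for the kernel rb-tree; it is an illustration only, not the kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct node {
        uint64_t inum;
        struct node *left, *right;
};

/* Insert @inum; a duplicate is silently accepted, as in the debug code. */
static int insert(struct node **root, uint64_t inum)
{
        struct node **p = root, *n;

        while (*p) {
                if (inum < (*p)->inum)
                        p = &(*p)->left;
                else if (inum > (*p)->inum)
                        p = &(*p)->right;
                else
                        return 0;
        }
        n = calloc(1, sizeof(*n));
        if (!n)
                return -1;
        n->inum = inum;
        *p = n;                 /* link where the walk stopped */
        return 0;
}

static int find(const struct node *root, uint64_t inum)
{
        while (root) {
                if (inum < root->inum)
                        root = root->left;
                else if (inum > root->inum)
                        root = root->right;
                else
                        return 1;
        }
        return 0;
}

int main(void)
{
        struct node *root = NULL;   /* leaked at exit to keep the sketch short */

        insert(&root, 42);
        insert(&root, 7);
        insert(&root, 42);          /* duplicate - silently accepted */
        printf("7 found: %d, 13 found: %d\n", find(root, 7), find(root, 13));
        return 0;
}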
+static void dbg_free_check_tree(struct rb_root *root) +{ + struct rb_node *this = root->rb_node; + struct check_orphan *o; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + o = rb_entry(this, struct check_orphan, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &o->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(o); + } +} + +static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + struct check_info *ci = priv; + ino_t inum; + int err; + + inum = key_inum(c, &zbr->key); + if (inum != ci->last_ino) { + /* Lowest node type is the inode node, so it comes first */ + if (key_type(c, &zbr->key) != UBIFS_INO_KEY) + ubifs_err("found orphan node ino %lu, type %d", + (unsigned long)inum, key_type(c, &zbr->key)); + ci->last_ino = inum; + ci->tot_inos += 1; + err = ubifs_tnc_read_node(c, zbr, ci->node); + if (err) { + ubifs_err("node read failed, error %d", err); + return err; + } + if (ci->node->nlink == 0) + /* Must be recorded as an orphan */ + if (!dbg_find_check_orphan(&ci->root, inum) && + !dbg_find_orphan(c, inum)) { + ubifs_err("missing orphan, ino %lu", + (unsigned long)inum); + ci->missing += 1; + } + } + ci->leaf_cnt += 1; + return 0; +} + +static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + ino_t inum; + int i, n, err; + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + if (snod->type != UBIFS_ORPH_NODE) + continue; + orph = snod->node; + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + err = dbg_ins_check_orphan(&ci->root, inum); + if (err) + return err; + } + } + return 0; +} + +static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) +{ + int lnum, err = 0; + void *buf; + + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) + return 0; + + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to check orphans"); + return 0; + } + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + + err = dbg_read_orphans(ci, sleb); + ubifs_scan_destroy(sleb); + if (err) + break; + } + + vfree(buf); + return err; +} + +static int dbg_check_orphans(struct ubifs_info *c) +{ + struct check_info ci; + int err; + + if (!(ubifs_chk_flags & UBIFS_CHK_ORPH)) + return 0; + + ci.last_ino = 0; + ci.tot_inos = 0; + ci.missing = 0; + ci.leaf_cnt = 0; + ci.root = RB_ROOT; + ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ci.node) { + ubifs_err("out of memory"); + return -ENOMEM; + } + + err = dbg_scan_orphans(c, &ci); + if (err) + goto out; + + err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci); + if (err) { + ubifs_err("cannot scan TNC, error %d", err); + goto out; + } + + if (ci.missing) { + ubifs_err("%lu missing orphan(s)", ci.missing); + err = -EINVAL; + goto out; + } + + dbg_cmt("last inode number is %lu", ci.last_ino); + dbg_cmt("total number of inodes is %lu", ci.tot_inos); + dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt); + +out: + dbg_free_check_tree(&ci.root); + kfree(ci.node); + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/recovery.c 
b/fs/ubifs/recovery.c new file mode 100644 index 00000000..783d8e0b --- /dev/null +++ b/fs/ubifs/recovery.c @@ -0,0 +1,1567 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions needed to recover from unclean un-mounts. + * When UBIFS is mounted, it checks a flag on the master node to determine if + * an un-mount was completed successfully. If not, the process of mounting + * incorporates additional checking and fixing of on-flash data structures. + * UBIFS always cleans away all remnants of an unclean un-mount, so that + * errors do not accumulate. However UBIFS defers recovery if it is mounted + * read-only, and the flash is not modified in that case. + * + * The general UBIFS approach to the recovery is that it recovers from + * corruptions which could be caused by power cuts, but it refuses to recover + * from corruption caused by other reasons. And UBIFS tries to distinguish + * between these 2 reasons of corruptions and silently recover in the former + * case and loudly complain in the latter case. + * + * UBIFS writes only to erased LEBs, so it writes only to the flash space + * containing only 0xFFs. UBIFS also always writes strictly from the beginning + * of the LEB to the end. And UBIFS assumes that the underlying flash media + * writes in @c->max_write_size bytes at a time. + * + * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min. + * I/O unit corresponding to offset X to contain corrupted data, all the + * following min. I/O units have to contain empty space (all 0xFFs). If this is + * not true, the corruption cannot be the result of a power cut, and UBIFS + * refuses to mount. + */ + +#include +#include +#include "ubifs.h" + +/** + * is_empty - determine whether a buffer is empty (contains all 0xff). + * @buf: buffer to clean + * @len: length of buffer + * + * This function returns %1 if the buffer is empty (contains all 0xff) otherwise + * %0 is returned. + */ +static int is_empty(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return 0; + return 1; +} + +/** + * first_non_ff - find offset of the first non-0xff byte. + * @buf: buffer to search in + * @len: length of buffer + * + * This function returns offset of the first non-0xff byte in @buf or %-1 if + * the buffer contains only 0xff bytes. + */ +static int first_non_ff(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return i; + return -1; +} + +/** + * get_master_node - get the last valid master node allowing for corruption. 
+ * @c: UBIFS file-system description object + * @lnum: LEB number + * @pbuf: buffer containing the LEB read, is returned here + * @mst: master node, if found, is returned here + * @cor: corruption, if found, is returned here + * + * This function allocates a buffer, reads the LEB into it, and finds and + * returns the last valid master node allowing for one area of corruption. + * The corrupt area, if there is one, must be consistent with the assumption + * that it is the result of an unclean unmount while the master node was being + * written. Under those circumstances, it is valid to use the previously written + * master node. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, + struct ubifs_mst_node **mst, void **cor) +{ + const int sz = c->mst_node_alsz; + int err, offs, len; + void *sbuf, *buf; + + sbuf = vmalloc(c->leb_size); + if (!sbuf) + return -ENOMEM; + + err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size); + if (err && err != -EBADMSG) + goto out_free; + + /* Find the first position that is definitely not a node */ + offs = 0; + buf = sbuf; + len = c->leb_size; + while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) { + struct ubifs_ch *ch = buf; + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + break; + offs += sz; + buf += sz; + len -= sz; + } + /* See if there was a valid master node before that */ + if (offs) { + int ret; + + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE && offs) { + /* Could have been corruption so check one place back */ + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE) + /* + * We accept only one area of corruption because + * we are assuming that it was caused while + * trying to write a master node. + */ + goto out_err; + } + if (ret == SCANNED_A_NODE) { + struct ubifs_ch *ch = buf; + + if (ch->node_type != UBIFS_MST_NODE) + goto out_err; + dbg_rcvry("found a master node at %d:%d", lnum, offs); + *mst = buf; + offs += sz; + buf += sz; + len -= sz; + } + } + /* Check for corruption */ + if (offs < c->leb_size) { + if (!is_empty(buf, min_t(int, len, sz))) { + *cor = buf; + dbg_rcvry("found corruption at %d:%d", lnum, offs); + } + offs += sz; + buf += sz; + len -= sz; + } + /* Check remaining empty space */ + if (offs < c->leb_size) + if (!is_empty(buf, len)) + goto out_err; + *pbuf = sbuf; + return 0; + +out_err: + err = -EINVAL; +out_free: + vfree(sbuf); + *mst = NULL; + *cor = NULL; + return err; +} + +/** + * write_rcvrd_mst_node - write recovered master node. + * @c: UBIFS file-system description object + * @mst: master node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_rcvrd_mst_node(struct ubifs_info *c, + struct ubifs_mst_node *mst) +{ + int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz; + __le32 save_flags; + + dbg_rcvry("recovery"); + + save_flags = mst->flags; + mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); + + ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); + err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM); + if (err) + goto out; + err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM); + if (err) + goto out; +out: + mst->flags = save_flags; + return err; +} + +/** + * ubifs_recover_master_node - recover the master node. 
+ * @c: UBIFS file-system description object + * + * This function recovers the master node from corruption that may occur due to + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_master_node(struct ubifs_info *c) +{ + void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL; + struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst; + const int sz = c->mst_node_alsz; + int err, offs1, offs2; + + dbg_rcvry("recovery"); + + err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1); + if (err) + goto out_free; + + err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2); + if (err) + goto out_free; + + if (mst1) { + offs1 = (void *)mst1 - buf1; + if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) && + (offs1 == 0 && !cor1)) { + /* + * mst1 was written by recovery at offset 0 with no + * corruption. + */ + dbg_rcvry("recovery recovery"); + mst = mst1; + } else if (mst2) { + offs2 = (void *)mst2 - buf2; + if (offs1 == offs2) { + /* Same offset, so must be the same */ + if (memcmp((void *)mst1 + UBIFS_CH_SZ, + (void *)mst2 + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out_err; + mst = mst1; + } else if (offs2 + sz == offs1) { + /* 1st LEB was written, 2nd was not */ + if (cor1) + goto out_err; + mst = mst1; + } else if (offs1 == 0 && offs2 + sz >= c->leb_size) { + /* 1st LEB was unmapped and written, 2nd not */ + if (cor1) + goto out_err; + mst = mst1; + } else + goto out_err; + } else { + /* + * 2nd LEB was unmapped and about to be written, so + * there must be only one master node in the first LEB + * and no corruption. + */ + if (offs1 != 0 || cor1) + goto out_err; + mst = mst1; + } + } else { + if (!mst2) + goto out_err; + /* + * 1st LEB was unmapped and about to be written, so there must + * be no room left in 2nd LEB. + */ + offs2 = (void *)mst2 - buf2; + if (offs2 + sz + sz <= c->leb_size) + goto out_err; + mst = mst2; + } + + ubifs_msg("recovered master node from LEB %d", + (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); + + memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); + + if (c->ro_mount) { + /* Read-only mode. Keep a copy for switching to rw mode */ + c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); + if (!c->rcvrd_mst_node) { + err = -ENOMEM; + goto out_free; + } + memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ); + + /* + * We had to recover the master node, which means there was an + * unclean reboot. However, it is possible that the master node + * is clean at this point, i.e., %UBIFS_MST_DIRTY is not set. + * E.g., consider the following chain of events: + * + * 1. UBIFS was cleanly unmounted, so the master node is clean + * 2. UBIFS is being mounted R/W and starts changing the master + * node in the first (%UBIFS_MST_LNUM). A power cut happens, + * so this LEB ends up with some amount of garbage at the + * end. + * 3. UBIFS is being mounted R/O. We reach this place and + * recover the master node from the second LEB + * (%UBIFS_MST_LNUM + 1). But we cannot update the media + * because we are being mounted R/O. We have to defer the + * operation. + * 4. However, this master node (@c->mst_node) is marked as + * clean (since the step 1). And if we just return, the + * mount code will be confused and won't recover the master + * node when it is re-mounter R/W later. + * + * Thus, to force the recovery by marking the master node as + * dirty. 
+ */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + } else { + /* Write the recovered master node */ + c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1; + err = write_rcvrd_mst_node(c, c->mst_node); + if (err) + goto out_free; + } + + vfree(buf2); + vfree(buf1); + + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to recover master node"); + if (mst1) { + dbg_err("dumping first master node"); + dbg_dump_node(c, mst1); + } + if (mst2) { + dbg_err("dumping second master node"); + dbg_dump_node(c, mst2); + } + vfree(buf2); + vfree(buf1); + return err; +} + +/** + * ubifs_write_rcvrd_mst_node - write the recovered master node. + * @c: UBIFS file-system description object + * + * This function writes the master node that was recovered during mounting in + * read-only mode and must now be written because we are remounting rw. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) +{ + int err; + + if (!c->rcvrd_mst_node) + return 0; + c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = write_rcvrd_mst_node(c, c->rcvrd_mst_node); + if (err) + return err; + kfree(c->rcvrd_mst_node); + c->rcvrd_mst_node = NULL; + return 0; +} + +/** + * is_last_write - determine if an offset was in the last write to a LEB. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @offs: offset to check + * + * This function returns %1 if @offs was in the last write to the LEB whose data + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next @c->max_write_size + * boundary. + */ +static int is_last_write(const struct ubifs_info *c, void *buf, int offs) +{ + int empty_offs, check_len; + uint8_t *p; + + /* + * Round up to the next @c->max_write_size boundary i.e. @offs is in + * the last wbuf written. After that should be empty space. + */ + empty_offs = ALIGN(offs + 1, c->max_write_size); + check_len = c->leb_size - empty_offs; + p = buf + empty_offs - offs; + return is_empty(p, check_len); +} + +/** + * clean_buf - clean the data from an LEB sitting in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to clean + * @lnum: LEB number to clean + * @offs: offset from which to clean + * @len: length of buffer + * + * This function pads up to the next min_io_size boundary (if there is one) and + * sets empty space to all 0xff. @buf, @offs and @len are updated to the next + * @c->min_io_size boundary. + */ +static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, + int *offs, int *len) +{ + int empty_offs, pad_len; + + lnum = lnum; + dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); + + ubifs_assert(!(*offs & 7)); + empty_offs = ALIGN(*offs, c->min_io_size); + pad_len = empty_offs - *offs; + ubifs_pad(c, *buf, pad_len); + *offs += pad_len; + *buf += pad_len; + *len -= pad_len; + memset(*buf, 0xff, c->leb_size - empty_offs); +} + +/** + * no_more_nodes - determine if there are no more nodes in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @len: length of buffer + * @lnum: LEB number of the LEB from which @buf was read + * @offs: offset from which @buf was read + * + * This function ensures that the corrupted node at @offs is the last thing + * written to a LEB. This function returns %1 if more data is not found and + * %0 if more data is found. 
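The check performed by 'is_last_write()' above can be reproduced with a few lines of ordinary C: round the suspect offset up to the next write-unit boundary and require that everything after it is still erased (0xff). The sketch below uses a made-up 8-byte write unit and a 64-byte "LEB"; the real sizes come from @c->max_write_size and @c->leb_size.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* a must be a power of 2 */

/* Return 1 if every byte after the write unit containing @offs is 0xff. */
static int last_write(const uint8_t *leb, int leb_size, int offs, int wr_unit)
{
        int i, empty_offs = ALIGN_UP(offs + 1, wr_unit);

        for (i = empty_offs; i < leb_size; i++)
                if (leb[i] != 0xff)
                        return 0;
        return 1;
}

int main(void)
{
        uint8_t leb[64];

        memset(leb, 0xff, sizeof(leb));
        memset(leb, 0xab, 20);  /* pretend 20 bytes were written before a power cut */

        /* Offset 18 is inside the last write unit, offset 4 is not */
        printf("offs 18: %d, offs 4: %d\n",
               last_write(leb, sizeof(leb), 18, 8),
               last_write(leb, sizeof(leb), 4, 8));
        return 0;
}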
+ */ +static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, + int lnum, int offs) +{ + struct ubifs_ch *ch = buf; + int skip, dlen = le32_to_cpu(ch->len); + + /* Check for empty space after the corrupt node's common header */ + skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs; + if (is_empty(buf + skip, len - skip)) + return 1; + /* + * The area after the common header size is not empty, so the common + * header must be intact. Check it. + */ + if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); + return 0; + } + /* Now we know the corrupt node's length we can skip over it */ + skip = ALIGN(offs + dlen, c->max_write_size) - offs; + /* After which there should be empty space */ + if (is_empty(buf + skip, len - skip)) + return 1; + dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip); + return 0; +} + +/** + * fix_unclean_leb - fix an unclean LEB. + * @c: UBIFS file-system description object + * @sleb: scanned LEB information + * @start: offset where scan started + */ +static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int start) +{ + int lnum = sleb->lnum, endpt = start; + + /* Get the end offset of the last node we are keeping */ + if (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + + snod = list_entry(sleb->nodes.prev, + struct ubifs_scan_node, list); + endpt = snod->offs + snod->len; + } + + if (c->ro_mount && !c->remounting_rw) { + /* Add to recovery list */ + struct ubifs_unclean_leb *ucleb; + + dbg_rcvry("need to fix LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS); + if (!ucleb) + return -ENOMEM; + ucleb->lnum = lnum; + ucleb->endpt = endpt; + list_add_tail(&ucleb->list, &c->unclean_leb_list); + } else { + /* Write the fixed LEB back to flash */ + int err; + + dbg_rcvry("fixing LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + if (endpt == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } else { + int len = ALIGN(endpt, c->min_io_size); + + if (start) { + err = ubi_read(c->ubi, lnum, sleb->buf, 0, + start); + if (err) + return err; + } + /* Pad to min_io_size */ + if (len > endpt) { + int pad_len = len - ALIGN(endpt, 8); + + if (pad_len > 0) { + void *buf = sleb->buf + len - pad_len; + + ubifs_pad(c, buf, pad_len); + } + } + err = ubi_leb_change(c->ubi, lnum, sleb->buf, len, + UBI_UNKNOWN); + if (err) + return err; + } + } + return 0; +} + +/** + * drop_last_group - drop the last group of nodes. + * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * group of nodes of the scanned LEB. + */ +static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs) +{ + while (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + struct ubifs_ch *ch; + + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + ch = snod->node; + if (ch->group_type != UBIFS_IN_NODE_GROUP) + break; + + dbg_rcvry("dropping grouped node at %d:%d", + sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + } +} + +/** + * drop_last_node - drop the last node. 
+ * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * @grouped: non-zero if whole group of nodes have to be dropped + * + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * node of the scanned LEB. + */ +static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs) +{ + struct ubifs_scan_node *snod; + + if (!list_empty(&sleb->nodes)) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + + dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + } +} + +/** + * ubifs_recover_leb - scan and recover a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not + * belong to any journal head) + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by the unclean unmount from which we are attempting to recover. + * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is + * found, and a negative error code in case of failure. + */ +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int jhead) +{ + int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit; + int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped; + struct ubifs_scan_leb *sleb; + void *buf = sbuf + offs; + + dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped); + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + ubifs_assert(len >= 8); + while (len >= 8) { + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + /* + * Scan quietly until there is an error from which we cannot + * recover + */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + } else if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + } else if (ret == SCANNED_EMPTY_SPACE || + ret == SCANNED_GARBAGE || + ret == SCANNED_A_BAD_PAD_NODE || + ret == SCANNED_A_CORRUPT_NODE) { + dbg_rcvry("found corruption - %d", ret); + break; + } else { + dbg_err("unexpected return value %d", ret); + err = -EINVAL; + goto error; + } + } + + if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) { + if (!is_last_write(c, buf, offs)) + goto corrupted_rescan; + } else if (ret == SCANNED_A_CORRUPT_NODE) { + if (!no_more_nodes(c, buf, len, lnum, offs)) + goto corrupted_rescan; + } else if (!is_empty(buf, len)) { + if (!is_last_write(c, buf, offs)) { + int corruption = first_non_ff(buf, len); + + /* + * See header comment for this file for more + * explanations about the reasons we have this check. + */ + ubifs_err("corrupt empty space LEB %d:%d, corruption " + "starts at %d", lnum, offs, corruption); + /* Make sure we dump interesting non-0xFF data */ + offs += corruption; + buf += corruption; + goto corrupted; + } + } + + min_io_unit = round_down(offs, c->min_io_size); + if (grouped) + /* + * If nodes are grouped, always drop the incomplete group at + * the end. 
+ */ + drop_last_group(sleb, &offs); + + if (jhead == GCHD) { + /* + * If this LEB belongs to the GC head then while we are in the + * middle of the same min. I/O unit keep dropping nodes. So + * basically, what we want is to make sure that the last min. + * I/O unit where we saw the corruption is dropped completely + * with all the uncorrupted nodes which may possibly sit there. + * + * In other words, let's name the min. I/O unit where the + * corruption starts B, and the previous min. I/O unit A. The + * below code tries to deal with a situation when half of B + * contains valid nodes or the end of a valid node, and the + * second half of B contains corrupted data or garbage. This + * means that UBIFS had been writing to B just before the power + * cut happened. I do not know how realistic is this scenario + * that half of the min. I/O unit had been written successfully + * and the other half not, but this is possible in our 'failure + * mode emulation' infrastructure at least. + * + * So what is the problem, why we need to drop those nodes? Why + * can't we just clean-up the second half of B by putting a + * padding node there? We can, and this works fine with one + * exception which was reproduced with power cut emulation + * testing and happens extremely rarely. + * + * Imagine the file-system is full, we run GC which starts + * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is + * the current GC head LEB). The @c->gc_lnum is -1, which means + * that GC will retain LEB X and will try to continue. Imagine + * that LEB X is currently the dirtiest LEB, and the amount of + * used space in LEB Y is exactly the same as amount of free + * space in LEB X. + * + * And a power cut happens when nodes are moved from LEB X to + * LEB Y. We are here trying to recover LEB Y which is the GC + * head LEB. We find the min. I/O unit B as described above. + * Then we clean-up LEB Y by padding min. I/O unit. And later + * 'ubifs_rcvry_gc_commit()' function fails, because it cannot + * find a dirty LEB which could be GC'd into LEB Y! Even LEB X + * does not match because the amount of valid nodes there does + * not fit the free space in LEB Y any more! And this is + * because of the padding node which we added to LEB Y. The + * user-visible effect of this which I once observed and + * analysed is that we cannot mount the file-system with + * -ENOSPC error. + * + * So obviously, to make sure that situation does not happen we + * should free min. I/O unit B in LEB Y completely and the last + * used min. I/O unit in LEB Y should be A. This is basically + * what the below code tries to do. + */ + while (offs > min_io_unit) + drop_last_node(sleb, &offs); + } + + buf = sbuf + offs; + len = c->leb_size - offs; + + clean_buf(c, &buf, lnum, &offs, &len); + ubifs_end_scan(c, sleb, lnum, offs); + + err = fix_unclean_leb(c, sleb, start); + if (err) + goto error; + + return sleb; + +corrupted_rescan: + /* Re-scan the corrupted data with verbose messages */ + dbg_err("corruptio %d", ret); + ubifs_scan_a_node(c, buf, len, lnum, offs, 1); +corrupted: + ubifs_scanned_corruption(c, lnum, offs, buf); + err = -EUCLEAN; +error: + ubifs_err("LEB %d scanning failed", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * get_cs_sqnum - get commit start sequence number. 
+ * @c: UBIFS file-system description object + * @lnum: LEB number of commit start node + * @offs: offset of commit start node + * @cs_sqnum: commit start sequence number is returned here + * + * This function returns %0 on success and a negative error code on failure. + */ +static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, + unsigned long long *cs_sqnum) +{ + struct ubifs_cs_node *cs_node = NULL; + int err, ret; + + dbg_rcvry("at %d:%d", lnum, offs); + cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL); + if (!cs_node) + return -ENOMEM; + if (c->leb_size - offs < UBIFS_CS_NODE_SZ) + goto out_err; + err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ); + if (err && err != -EBADMSG) + goto out_free; + ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); + if (ret != SCANNED_A_NODE) { + dbg_err("Not a valid node"); + goto out_err; + } + if (cs_node->ch.node_type != UBIFS_CS_NODE) { + dbg_err("Node a CS node, type is %d", cs_node->ch.node_type); + goto out_err; + } + if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { + dbg_err("CS node cmt_no %llu != current cmt_no %llu", + (unsigned long long)le64_to_cpu(cs_node->cmt_no), + c->cmt_no); + goto out_err; + } + *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); + dbg_rcvry("commit start sqnum %llu", *cs_sqnum); + kfree(cs_node); + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to get CS sqnum"); + kfree(cs_node); + return err; +} + +/** + * ubifs_recover_log_leb - scan and recover a log LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by unclean reboots from which we are attempting to recover + * (assume that only the last log LEB can be corrupted by an unclean reboot). + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int next_lnum; + + dbg_rcvry("LEB %d", lnum); + next_lnum = lnum + 1; + if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs) + next_lnum = UBIFS_LOG_LNUM; + if (next_lnum != c->ltail_lnum) { + /* + * We can only recover at the end of the log, so check that the + * next log LEB is empty or out of date. + */ + sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0); + if (IS_ERR(sleb)) + return sleb; + if (sleb->nodes_cnt) { + struct ubifs_scan_node *snod; + unsigned long long cs_sqnum = c->cs_sqnum; + + snod = list_entry(sleb->nodes.next, + struct ubifs_scan_node, list); + if (cs_sqnum == 0) { + int err; + + err = get_cs_sqnum(c, lnum, offs, &cs_sqnum); + if (err) { + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + } + } + if (snod->sqnum > cs_sqnum) { + ubifs_err("unrecoverable log corruption " + "in LEB %d", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(-EUCLEAN); + } + } + ubifs_scan_destroy(sleb); + } + return ubifs_recover_leb(c, lnum, offs, sbuf, -1); +} + +/** + * recover_head - recover a head. + * @c: UBIFS file-system description object + * @lnum: LEB number of head to recover + * @offs: offset of head to recover + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at a head location. + * + * This function returns %0 on success and a negative error code on failure. 
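As the doc comment for 'ubifs_recover_log_leb()' above notes, recovery is only attempted at the end of the log, so the function first examines the LEB that follows the head. The log occupies a fixed range of LEBs used as a ring, so "the next LEB" wraps back to the first log LEB; the sketch below shows only that index arithmetic, with arbitrary example numbers rather than UBIFS defaults.

#include <stdio.h>

/* LEB that follows @lnum in a log occupying @log_lebs LEBs from @log_first. */
static int next_log_leb(int lnum, int log_first, int log_lebs)
{
        int next = lnum + 1;

        if (next >= log_first + log_lebs)
                next = log_first;
        return next;
}

int main(void)
{
        int log_first = 3, log_lebs = 4;        /* example: log occupies LEBs 3..6 */
        int lnum;

        for (lnum = log_first; lnum < log_first + log_lebs; lnum++)
                printf("after LEB %d comes LEB %d\n",
                       lnum, next_log_leb(lnum, log_first, log_lebs));
        return 0;
}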
+ */ +static int recover_head(const struct ubifs_info *c, int lnum, int offs, + void *sbuf) +{ + int len = c->max_write_size, err; + + if (offs + len > c->leb_size) + len = c->leb_size - offs; + + if (!len) + return 0; + + /* Read at the head location and check it is empty flash */ + err = ubi_read(c->ubi, lnum, sbuf, offs, len); + if (err || !is_empty(sbuf, len)) { + dbg_rcvry("cleaning head at %d:%d", lnum, offs); + if (offs == 0) + return ubifs_leb_unmap(c, lnum); + err = ubi_read(c->ubi, lnum, sbuf, 0, offs); + if (err) + return err; + return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN); + } + + return 0; +} + +/** + * ubifs_recover_inl_heads - recover index and LPT heads. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at the index and + * LPT head locations. + * + * This deals with the recovery of a half-completed journal commit. UBIFS is + * careful never to overwrite the last version of the index or the LPT. Because + * the index and LPT are wandering trees, data from a half-completed commit will + * not be referenced anywhere in UBIFS. The data will be either in LEBs that are + * assumed to be empty and will be unmapped anyway before use, or in the index + * and LPT heads. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) +{ + int err; + + ubifs_assert(!c->ro_mount || c->remounting_rw); + + dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); + err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); + if (err) + return err; + + dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs); + err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf); + if (err) + return err; + + return 0; +} + +/** + * clean_an_unclean_leb - read and write a LEB to remove corruption. + * @c: UBIFS file-system description object + * @ucleb: unclean LEB information + * @sbuf: LEB-sized buffer to use + * + * This function reads a LEB up to a point pre-determined by the mount recovery, + * checks the nodes, and writes the result back to the flash, thereby cleaning + * off any following corruption, or non-fatal ECC errors. + * + * This function returns %0 on success and a negative error code on failure. 
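Both 'fix_unclean_leb()' earlier and 'clean_an_unclean_leb()' described above finish by padding the data they keep up to the next min_io_size boundary before atomically re-writing the LEB. The write length is the kept end point rounded up to min_io_size, and the padding starts at the end point rounded up to 8 bytes (node lengths are 8-byte aligned). A sketch of only that arithmetic; 16 is an example I/O unit, not a UBIFS constant.

#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* a must be a power of 2 */

int main(void)
{
        int min_io_size = 16;
        int endpt = 37;                         /* end of the last node being kept */
        int len = ALIGN_UP(endpt, min_io_size); /* bytes written back: 48 */
        int pad_len = len - ALIGN_UP(endpt, 8); /* padding bytes: 48 - 40 = 8 */

        printf("write %d bytes, %d of them padding starting at offset %d\n",
               len, pad_len, ALIGN_UP(endpt, 8));
        return 0;
}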
+ */ +static int clean_an_unclean_leb(const struct ubifs_info *c, + struct ubifs_unclean_leb *ucleb, void *sbuf) +{ + int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1; + void *buf = sbuf; + + dbg_rcvry("LEB %d len %d", lnum, len); + + if (len == 0) { + /* Nothing to read, just unmap it */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + return 0; + } + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) + return err; + + while (len >= 8) { + int ret; + + cond_resched(); + + /* Scan quietly until there is an error */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + continue; + } + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) { + ubifs_err("unexpected empty space at %d:%d", + lnum, offs); + return -EUCLEAN; + } + + if (quiet) { + /* Redo the last scan but noisily */ + quiet = 0; + continue; + } + + ubifs_scanned_corruption(c, lnum, offs, buf); + return -EUCLEAN; + } + + /* Pad to min_io_size */ + len = ALIGN(ucleb->endpt, c->min_io_size); + if (len > ucleb->endpt) { + int pad_len = len - ALIGN(ucleb->endpt, 8); + + if (pad_len > 0) { + buf = c->sbuf + len - pad_len; + ubifs_pad(c, buf, pad_len); + } + } + + /* Write back the LEB atomically */ + err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN); + if (err) + return err; + + dbg_rcvry("cleaned LEB %d", lnum); + + return 0; +} + +/** + * ubifs_clean_lebs - clean LEBs recovered during read-only mount. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function cleans a LEB identified during recovery that needs to be + * written but was not because UBIFS was mounted read-only. This happens when + * remounting to read-write mode. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) +{ + dbg_rcvry("recovery"); + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + int err; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + err = clean_an_unclean_leb(c, ucleb, sbuf); + if (err) + return err; + list_del(&ucleb->list); + kfree(ucleb); + } + return 0; +} + +/** + * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit. + * @c: UBIFS file-system description object + * + * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty + * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns + * zero in case of success and a negative error code in case of failure. + */ +static int grab_empty_leb(struct ubifs_info *c) +{ + int lnum, err; + + /* + * Note, it is very important to first search for an empty LEB and then + * run the commit, not vice-versa. The reason is that there might be + * only one empty LEB at the moment, the one which has been the + * @c->gc_lnum just before the power cut happened. During the regular + * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no + * one but GC can grab it. But at this moment this single empty LEB is + * not marked as taken, so if we run commit - what happens? Right, the + * commit will grab it and write the index there. 
Remember that the + * index always expands as long as there is free space, and it only + * starts consolidating when we run out of space. + * + * IOW, if we run commit now, we might not be able to find a free LEB + * after this. + */ + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) { + dbg_err("could not find an empty LEB"); + dbg_dump_lprops(c); + dbg_dump_budg(c, &c->bi); + return lnum; + } + + /* Reset the index flag */ + err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX, 0); + if (err) + return err; + + c->gc_lnum = lnum; + dbg_rcvry("found empty LEB %d, run commit", lnum); + + return ubifs_run_commit(c); +} + +/** + * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. + * @c: UBIFS file-system description object + * + * Out-of-place garbage collection requires always one empty LEB with which to + * start garbage collection. The LEB number is recorded in c->gc_lnum and is + * written to the master node on unmounting. In the case of an unclean unmount + * the value of gc_lnum recorded in the master node is out of date and cannot + * be used. Instead, recovery must allocate an empty LEB for this purpose. + * However, there may not be enough empty space, in which case it must be + * possible to GC the dirtiest LEB into the GC head LEB. + * + * This function also runs the commit which causes the TNC updates from + * size-recovery and orphans to be written to the flash. That is important to + * ensure correct replay order for subsequent mounts. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_rcvry_gc_commit(struct ubifs_info *c) +{ + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + struct ubifs_lprops lp; + int err; + + dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs); + + c->gc_lnum = -1; + if (wbuf->lnum == -1 || wbuf->offs == c->leb_size) + return grab_empty_leb(c); + + err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); + if (err) { + if (err != -ENOSPC) + return err; + + dbg_rcvry("could not find a dirty LEB"); + return grab_empty_leb(c); + } + + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + ubifs_assert(lp.free + lp.dirty >= wbuf->offs); + + /* + * We run the commit before garbage collection otherwise subsequent + * mounts will see the GC and orphan deletion in a different order. + */ + dbg_rcvry("committing"); + err = ubifs_run_commit(c); + if (err) + return err; + + dbg_rcvry("GC'ing LEB %d", lp.lnum); + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_garbage_collect_leb(c, &lp); + if (err >= 0) { + int err2 = ubifs_wbuf_sync_nolock(wbuf); + + if (err2) + err = err2; + } + mutex_unlock(&wbuf->io_mutex); + if (err < 0) { + dbg_err("GC failed, error %d", err); + if (err == -EAGAIN) + err = -EINVAL; + return err; + } + + ubifs_assert(err == LEB_RETAINED); + if (err != LEB_RETAINED) + return -EINVAL; + + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + return err; + + dbg_rcvry("allocated LEB %d for GC", lp.lnum); + return 0; +} + +/** + * struct size_entry - inode size information for recovery. + * @rb: link in the RB-tree of sizes + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + * @inode: inode if pinned in memory awaiting rw mode to fix it + */ +struct size_entry { + struct rb_node rb; + ino_t inum; + loff_t i_size; + loff_t d_size; + int exists; + struct inode *inode; +}; + +/** + * add_ino - add an entry to the size tree. 
+ * @c: UBIFS file-system description object + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + */ +static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size, + loff_t d_size, int exists) +{ + struct rb_node **p = &c->size_tree.rb_node, *parent = NULL; + struct size_entry *e; + + while (*p) { + parent = *p; + e = rb_entry(parent, struct size_entry, rb); + if (inum < e->inum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + e = kzalloc(sizeof(struct size_entry), GFP_KERNEL); + if (!e) + return -ENOMEM; + + e->inum = inum; + e->i_size = i_size; + e->d_size = d_size; + e->exists = exists; + + rb_link_node(&e->rb, parent, p); + rb_insert_color(&e->rb, &c->size_tree); + + return 0; +} + +/** + * find_ino - find an entry on the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum) +{ + struct rb_node *p = c->size_tree.rb_node; + struct size_entry *e; + + while (p) { + e = rb_entry(p, struct size_entry, rb); + if (inum < e->inum) + p = p->rb_left; + else if (inum > e->inum) + p = p->rb_right; + else + return e; + } + return NULL; +} + +/** + * remove_ino - remove an entry from the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static void remove_ino(struct ubifs_info *c, ino_t inum) +{ + struct size_entry *e = find_ino(c, inum); + + if (!e) + return; + rb_erase(&e->rb, &c->size_tree); + kfree(e); +} + +/** + * ubifs_destroy_size_tree - free resources related to the size tree. + * @c: UBIFS file-system description object + */ +void ubifs_destroy_size_tree(struct ubifs_info *c) +{ + struct rb_node *this = c->size_tree.rb_node; + struct size_entry *e; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + e = rb_entry(this, struct size_entry, rb); + if (e->inode) + iput(e->inode); + this = rb_parent(this); + if (this) { + if (this->rb_left == &e->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(e); + } + c->size_tree = RB_ROOT; +} + +/** + * ubifs_recover_size_accum - accumulate inode sizes for recovery. + * @c: UBIFS file-system description object + * @key: node key + * @deletion: node is for a deletion + * @new_size: inode size + * + * This function has two purposes: + * 1) to ensure there are no data nodes that fall outside the inode size + * 2) to ensure there are no data nodes for inodes that do not exist + * To accomplish those purposes, a rb-tree is constructed containing an entry + * for each inode number in the journal that has not been deleted, and recording + * the size from the inode node, the maximum size of any data node (also altered + * by truncations) and a flag indicating a inode number for which no inode node + * was present in the journal. + * + * Note that there is still the possibility that there are data nodes that have + * been committed that are beyond the inode size, however the only way to find + * them would be to scan the entire index. Alternatively, some provision could + * be made to record the size of inodes at the start of commit, which would seem + * very cumbersome for a scenario that is quite unlikely and the only negative + * consequence of which is wasted space. + * + * This functions returns %0 on success and a negative error code on failure. 
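The accumulation rules described above boil down to one update per key type: an inode node fixes the on-inode size and marks the inode as existing, a data node can only raise the maximum data size, and a truncation node sets it directly (deletions, which drop the entry altogether, are left out here). The struct and the key-type enum in this sketch are invented, simplified stand-ins for the kernel ones.

#include <stdio.h>

enum key_type { INO_KEY, DATA_KEY, TRUN_KEY };

struct size_entry {
        long long i_size;       /* size recorded on the inode node */
        long long d_size;       /* largest size implied by data nodes */
        int exists;             /* an inode node was seen in the journal */
};

static void accum(struct size_entry *e, enum key_type type, long long new_size)
{
        switch (type) {
        case INO_KEY:
                e->i_size = new_size;
                e->exists = 1;
                break;
        case DATA_KEY:
                if (new_size > e->d_size)
                        e->d_size = new_size;
                break;
        case TRUN_KEY:
                e->d_size = new_size;
                break;
        }
}

int main(void)
{
        struct size_entry e = { 0, 0, 0 };

        accum(&e, INO_KEY, 4096);       /* inode node claims 4096 bytes */
        accum(&e, DATA_KEY, 8192);      /* a data node ends at 8192 */
        accum(&e, DATA_KEY, 2048);      /* a smaller one changes nothing */

        printf("exists %d, i_size %lld, d_size %lld%s\n",
               e.exists, e.i_size, e.d_size,
               e.d_size > e.i_size ? " (size needs recovery)" : "");
        return 0;
}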
+ */ +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size) +{ + ino_t inum = key_inum(c, key); + struct size_entry *e; + int err; + + switch (key_type(c, key)) { + case UBIFS_INO_KEY: + if (deletion) + remove_ino(c, inum); + else { + e = find_ino(c, inum); + if (e) { + e->i_size = new_size; + e->exists = 1; + } else { + err = add_ino(c, inum, new_size, 0, 1); + if (err) + return err; + } + } + break; + case UBIFS_DATA_KEY: + e = find_ino(c, inum); + if (e) { + if (new_size > e->d_size) + e->d_size = new_size; + } else { + err = add_ino(c, inum, 0, new_size, 0); + if (err) + return err; + } + break; + case UBIFS_TRUN_KEY: + e = find_ino(c, inum); + if (e) + e->d_size = new_size; + break; + } + return 0; +} + +/** + * fix_size_in_place - fix inode size in place on flash. + * @c: UBIFS file-system description object + * @e: inode size information for recovery + */ +static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) +{ + struct ubifs_ino_node *ino = c->sbuf; + unsigned char *p; + union ubifs_key key; + int err, lnum, offs, len; + loff_t i_size; + uint32_t crc; + + /* Locate the inode node LEB number and offset */ + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs); + if (err) + goto out; + /* + * If the size recorded on the inode node is greater than the size that + * was calculated from nodes in the journal then don't change the inode. + */ + i_size = le64_to_cpu(ino->size); + if (i_size >= e->d_size) + return 0; + /* Read the LEB */ + err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size); + if (err) + goto out; + /* Change the size field and recalculate the CRC */ + ino = c->sbuf + offs; + ino->size = cpu_to_le64(e->d_size); + len = le32_to_cpu(ino->ch.len); + crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8); + ino->ch.crc = cpu_to_le32(crc); + /* Work out where data in the LEB ends and free space begins */ + p = c->sbuf; + len = c->leb_size - 1; + while (p[len] == 0xff) + len -= 1; + len = ALIGN(len + 1, c->min_io_size); + /* Atomically write the fixed LEB back again */ + err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + if (err) + goto out; + dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", + (unsigned long)e->inum, lnum, offs, i_size, e->d_size); + return 0; + +out: + ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d", + (unsigned long)e->inum, e->i_size, e->d_size, err); + return err; +} + +/** + * ubifs_recover_size - recover inode size. + * @c: UBIFS file-system description object + * + * This function attempts to fix inode size discrepancies identified by the + * 'ubifs_recover_size_accum()' function. + * + * This functions returns %0 on success and a negative error code on failure. 
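'fix_size_in_place()' above re-writes the whole LEB, so it first works out how much of the buffer actually holds data: it scans backwards over trailing 0xff bytes and rounds the result up to the min_io_size boundary. The same scan in isolation (buffer contents and sizes are made up, and an all-0xff guard is added that the kernel code does not need, since it knows a node is present):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* a must be a power of 2 */

/* Length of the used part of @buf, rounded up to @min_io_size. */
static int used_len(const uint8_t *buf, int size, int min_io_size)
{
        int len = size - 1;

        while (len >= 0 && buf[len] == 0xff)
                len -= 1;
        return ALIGN_UP(len + 1, min_io_size);
}

int main(void)
{
        uint8_t leb[128];

        memset(leb, 0xff, sizeof(leb));
        memset(leb, 0x5a, 70);          /* pretend 70 bytes of nodes were written */

        printf("write back %d bytes\n", used_len(leb, sizeof(leb), 16));
        return 0;
}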
+ */ +int ubifs_recover_size(struct ubifs_info *c) +{ + struct rb_node *this = rb_first(&c->size_tree); + + while (this) { + struct size_entry *e; + int err; + + e = rb_entry(this, struct size_entry, rb); + if (!e->exists) { + union ubifs_key key; + + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_lookup(c, &key, c->sbuf); + if (err && err != -ENOENT) + return err; + if (err == -ENOENT) { + /* Remove data nodes that have no inode */ + dbg_rcvry("removing ino %lu", + (unsigned long)e->inum); + err = ubifs_tnc_remove_ino(c, e->inum); + if (err) + return err; + } else { + struct ubifs_ino_node *ino = c->sbuf; + + e->exists = 1; + e->i_size = le64_to_cpu(ino->size); + } + } + + if (e->exists && e->i_size < e->d_size) { + if (c->ro_mount) { + /* Fix the inode size and pin it in memory */ + struct inode *inode; + struct ubifs_inode *ui; + + ubifs_assert(!e->inode); + + inode = ubifs_iget(c->vfs_sb, e->inum); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ui = ubifs_inode(inode); + if (inode->i_size < e->d_size) { + dbg_rcvry("ino %lu size %lld -> %lld", + (unsigned long)e->inum, + inode->i_size, e->d_size); + inode->i_size = e->d_size; + ui->ui_size = e->d_size; + ui->synced_i_size = e->d_size; + e->inode = inode; + this = rb_next(this); + continue; + } + iput(inode); + } else { + /* Fix the size in place */ + err = fix_size_in_place(c, e); + if (err) + return err; + if (e->inode) + iput(e->inode); + } + } + + this = rb_next(this); + rb_erase(&e->rb, &c->size_tree); + kfree(e); + } + + return 0; +} diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c new file mode 100644 index 00000000..5e97161c --- /dev/null +++ b/fs/ubifs/replay.c @@ -0,0 +1,1080 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains journal replay code. It runs when the file-system is being + * mounted and requires no locking. + * + * The larger is the journal, the longer it takes to scan it, so the longer it + * takes to mount UBIFS. This is why the journal has limited size which may be + * changed depending on the system requirements. But a larger journal gives + * faster I/O speed because it writes the index less frequently. So this is a + * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the + * larger is the journal, the more memory its index may consume. + */ + +#include "ubifs.h" +#include + +/** + * struct replay_entry - replay list entry. 
+ * @lnum: logical eraseblock number of the node + * @offs: node offset + * @len: node length + * @deletion: non-zero if this entry corresponds to a node deletion + * @sqnum: node sequence number + * @list: links the replay list + * @key: node key + * @nm: directory entry name + * @old_size: truncation old size + * @new_size: truncation new size + * + * The replay process first scans all buds and builds the replay list, then + * sorts the replay list in nodes sequence number order, and then inserts all + * the replay entries to the TNC. + */ +struct replay_entry { + int lnum; + int offs; + int len; + unsigned int deletion:1; + unsigned long long sqnum; + struct list_head list; + union ubifs_key key; + union { + struct qstr nm; + struct { + loff_t old_size; + loff_t new_size; + }; + }; +}; + +/** + * struct bud_entry - entry in the list of buds to replay. + * @list: next bud in the list + * @bud: bud description object + * @sqnum: reference node sequence number + * @free: free bytes in the bud + * @dirty: dirty bytes in the bud + */ +struct bud_entry { + struct list_head list; + struct ubifs_bud *bud; + unsigned long long sqnum; + int free; + int dirty; +}; + +/** + * set_bud_lprops - set free and dirty space used by a bud. + * @c: UBIFS file-system description object + * @b: bud entry which describes the bud + * + * This function makes sure the LEB properties of bud @b are set correctly + * after the replay. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) +{ + const struct ubifs_lprops *lp; + int err = 0, dirty; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + dirty = lp->dirty; + if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { + /* + * The LEB was added to the journal with a starting offset of + * zero which means the LEB must have been empty. The LEB + * property values should be @lp->free == @c->leb_size and + * @lp->dirty == 0, but that is not the case. The reason is that + * the LEB had been garbage collected before it became the bud, + * and there was not commit inbetween. The garbage collector + * resets the free and dirty space without recording it + * anywhere except lprops, so if there was no commit then + * lprops does not have that information. + * + * We do not need to adjust free space because the scan has told + * us the exact value which is recorded in the replay entry as + * @b->free. + * + * However we do need to subtract from the dirty space the + * amount of space that the garbage collector reclaimed, which + * is the whole LEB minus the amount of space that was free. + */ + dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, + lp->free, lp->dirty); + dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, + lp->free, lp->dirty); + dirty -= c->leb_size - lp->free; + /* + * If the replay order was perfect the dirty space would now be + * zero. The order is not perfect because the journal heads + * race with each other. This is not a problem but is does mean + * that the dirty space may temporarily exceed c->leb_size + * during the replay. 
+ */ + if (dirty != 0) + dbg_msg("LEB %d lp: %d free %d dirty " + "replay: %d free %d dirty", b->bud->lnum, + lp->free, lp->dirty, b->free, b->dirty); + } + lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + /* Make sure the journal head points to the latest bud */ + err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf, + b->bud->lnum, c->leb_size - b->free, + UBI_SHORTTERM); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * set_buds_lprops - set free and dirty space for all replayed buds. + * @c: UBIFS file-system description object + * + * This function sets LEB properties for all replayed buds. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int set_buds_lprops(struct ubifs_info *c) +{ + struct bud_entry *b; + int err; + + list_for_each_entry(b, &c->replay_buds, list) { + err = set_bud_lprops(c, b); + if (err) + return err; + } + + return 0; +} + +/** + * trun_remove_range - apply a replay entry for a truncation to the TNC. + * @c: UBIFS file-system description object + * @r: replay entry of truncation + */ +static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) +{ + unsigned min_blk, max_blk; + union ubifs_key min_key, max_key; + ino_t ino; + + min_blk = r->new_size / UBIFS_BLOCK_SIZE; + if (r->new_size & (UBIFS_BLOCK_SIZE - 1)) + min_blk += 1; + + max_blk = r->old_size / UBIFS_BLOCK_SIZE; + if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0) + max_blk -= 1; + + ino = key_inum(c, &r->key); + + data_key_init(c, &min_key, ino, min_blk); + data_key_init(c, &max_key, ino, max_blk); + + return ubifs_tnc_remove_range(c, &min_key, &max_key); +} + +/** + * apply_replay_entry - apply a replay entry to the TNC. + * @c: UBIFS file-system description object + * @r: replay entry to apply + * + * Apply a replay entry to the TNC. + */ +static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) +{ + int err; + + dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, + r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); + + /* Set c->replay_sqnum to help deal with dangling branches. */ + c->replay_sqnum = r->sqnum; + + if (is_hash_key(c, &r->key)) { + if (r->deletion) + err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); + else + err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, + r->len, &r->nm); + } else { + if (r->deletion) + switch (key_type(c, &r->key)) { + case UBIFS_INO_KEY: + { + ino_t inum = key_inum(c, &r->key); + + err = ubifs_tnc_remove_ino(c, inum); + break; + } + case UBIFS_TRUN_KEY: + err = trun_remove_range(c, r); + break; + default: + err = ubifs_tnc_remove(c, &r->key); + break; + } + else + err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs, + r->len); + if (err) + return err; + + if (c->need_recovery) + err = ubifs_recover_size_accum(c, &r->key, r->deletion, + r->new_size); + } + + return err; +} + +/** + * replay_entries_cmp - compare 2 replay entries. + * @priv: UBIFS file-system description object + * @a: first replay entry + * @b: second replay entry + * + * This is a comparison function for 'list_sort()' which compares 2 replay + * entries @a and @b by comparing their sequence numbers. Returns %1 if @a has + * greater sequence number and %-1 otherwise.
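trun_remove_range() above turns a truncation into a removal of the now-obsolete data-block keys, and the block arithmetic is easy to get wrong at the boundaries. Here is a minimal userspace sketch of just that calculation, assuming the usual 4096-byte UBIFS block size and invented file sizes: truncating from 20000 down to 5000 bytes removes blocks 2..4, while block 1, which still holds the new end of file, is kept.

#include <stdio.h>

#define BLOCK_SIZE 4096	/* UBIFS_BLOCK_SIZE */

/*
 * Inclusive range of data-block numbers whose nodes become obsolete when an
 * inode is truncated from old_size down to new_size, following the
 * arithmetic of trun_remove_range().
 */
static void trun_block_range(long long old_size, long long new_size,
			     unsigned *min_blk, unsigned *max_blk)
{
	*min_blk = new_size / BLOCK_SIZE;
	if (new_size & (BLOCK_SIZE - 1))
		*min_blk += 1;	/* block holding the new EOF is kept */

	*max_blk = old_size / BLOCK_SIZE;
	if ((old_size & (BLOCK_SIZE - 1)) == 0)
		*max_blk -= 1;	/* old EOF sat exactly on a block boundary */
}

int main(void)
{
	unsigned lo, hi;

	trun_block_range(20000, 5000, &lo, &hi);
	printf("remove data blocks %u..%u\n", lo, hi);	/* prints 2..4 */
	return 0;
}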
+ */ +static int replay_entries_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct replay_entry *ra, *rb; + + cond_resched(); + if (a == b) + return 0; + + ra = list_entry(a, struct replay_entry, list); + rb = list_entry(b, struct replay_entry, list); + ubifs_assert(ra->sqnum != rb->sqnum); + if (ra->sqnum > rb->sqnum) + return 1; + return -1; +} + +/** + * apply_replay_list - apply the replay list to the TNC. + * @c: UBIFS file-system description object + * + * Apply all entries in the replay list to the TNC. Returns zero in case of + * success and a negative error code in case of failure. + */ +static int apply_replay_list(struct ubifs_info *c) +{ + struct replay_entry *r; + int err; + + list_sort(c, &c->replay_list, &replay_entries_cmp); + + list_for_each_entry(r, &c->replay_list, list) { + cond_resched(); + + err = apply_replay_entry(c, r); + if (err) + return err; + } + + return 0; +} + +/** + * destroy_replay_list - destroy the replay. + * @c: UBIFS file-system description object + * + * Destroy the replay list. + */ +static void destroy_replay_list(struct ubifs_info *c) +{ + struct replay_entry *r, *tmp; + + list_for_each_entry_safe(r, tmp, &c->replay_list, list) { + if (is_hash_key(c, &r->key)) + kfree(r->nm.name); + list_del(&r->list); + kfree(r); + } +} + +/** + * insert_node - insert a node to the replay list + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * @old_size: truncation old size + * @new_size: truncation new size + * + * This function inserts a scanned non-direntry node to the replay list. The + * replay list contains @struct replay_entry elements, and we sort this list in + * sequence number order before applying it. The replay list is applied at the + * very end of the replay process. Since the list is sorted in sequence number + * order, the older modifications are applied first. This function returns zero + * in case of success and a negative error code in case of failure. + */ +static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, unsigned long long sqnum, + int deletion, int *used, loff_t old_size, + loff_t new_size) +{ + struct replay_entry *r; + + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->deletion = !!deletion; + r->sqnum = sqnum; + key_copy(c, key, &r->key); + r->old_size = old_size; + r->new_size = new_size; + + list_add_tail(&r->list, &c->replay_list); + return 0; +} + +/** + * insert_dent - insert a directory entry node into the replay list. + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @name: directory entry name + * @nlen: directory entry name length + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * + * This function inserts a scanned directory entry node or an extended + * attribute entry to the replay list. Returns zero in case of success and a + * negative error code in case of failure. 
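As the insert_node() comment above notes, entries are queued in scan order and only sorted by sequence number immediately before being applied, so that older modifications reach the TNC first. The following compact model shows that ordering step; it uses qsort() over an array purely as a stand-in for the kernel's list_sort() over the linked replay list, and the entry contents are invented.

#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for struct replay_entry: only what ordering needs. */
struct entry {
	unsigned long long sqnum;
	const char *what;
};

/* Older nodes (smaller sqnum) must be applied to the index first. */
static int cmp_sqnum(const void *a, const void *b)
{
	const struct entry *ra = a, *rb = b;

	return (ra->sqnum > rb->sqnum) - (ra->sqnum < rb->sqnum);
}

int main(void)
{
	struct entry list[] = {
		{ 17, "dent \"foo\" -> ino 65" },
		{ 12, "ino 65 created"        },
		{ 23, "dent \"foo\" deleted"  },
	};
	int i, n = sizeof(list) / sizeof(list[0]);

	/* The kernel sorts a linked list with list_sort(); qsort models it. */
	qsort(list, n, sizeof(list[0]), cmp_sqnum);
	for (i = 0; i < n; i++)
		printf("apply sqnum %llu: %s\n", list[i].sqnum, list[i].what);
	return 0;
}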
+ */ +static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, const char *name, int nlen, + unsigned long long sqnum, int deletion, int *used) +{ + struct replay_entry *r; + char *nbuf; + + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + nbuf = kmalloc(nlen + 1, GFP_KERNEL); + if (!nbuf) { + kfree(r); + return -ENOMEM; + } + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->deletion = !!deletion; + r->sqnum = sqnum; + key_copy(c, key, &r->key); + r->nm.len = nlen; + memcpy(nbuf, name, nlen); + nbuf[nlen] = '\0'; + r->nm.name = nbuf; + + list_add_tail(&r->list, &c->replay_list); + return 0; +} + +/** + * ubifs_validate_entry - validate directory or extended attribute entry node. + * @c: UBIFS file-system description object + * @dent: the node to validate + * + * This function validates directory or extended attribute entry node @dent. + * Returns zero if the node is all right and a %-EINVAL if not. + */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent) +{ + int key_type = key_type_flash(c, dent->key); + int nlen = le16_to_cpu(dent->nlen); + + if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || + dent->type >= UBIFS_ITYPES_CNT || + nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || + strnlen(dent->name, nlen) != nlen || + le64_to_cpu(dent->inum) > MAX_INUM) { + ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ? + "directory entry" : "extended attribute entry"); + return -EINVAL; + } + + if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) { + ubifs_err("bad key type %d", key_type); + return -EINVAL; + } + + return 0; +} + +/** + * is_last_bud - check if the bud is the last in the journal head. + * @c: UBIFS file-system description object + * @bud: bud description object + * + * This function checks if bud @bud is the last bud in its journal head. This + * information is then used by 'replay_bud()' to decide whether the bud can + * have corruptions or not. Indeed, only last buds can be corrupted by power + * cuts. Returns %1 if this is the last bud, and %0 if not. + */ +static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct ubifs_jhead *jh = &c->jheads[bud->jhead]; + struct ubifs_bud *next; + uint32_t data; + int err; + + if (list_is_last(&bud->list, &jh->buds_list)) + return 1; + + /* + * The following is a quirk to make sure we work correctly with UBIFS + * images used with older UBIFS. + * + * Normally, the last bud will be the last in the journal head's list + * of bud. However, there is one exception if the UBIFS image belongs + * to older UBIFS. This is fairly unlikely: one would need to use old + * UBIFS, then have a power cut exactly at the right point, and then + * try to mount this image with new UBIFS. + * + * The exception is: it is possible to have 2 buds A and B, A goes + * before B, and B is the last, bud B is contains no data, and bud A is + * corrupted at the end. The reason is that in older versions when the + * journal code switched the next bud (from A to B), it first added a + * log reference node for the new bud (B), and only after this it + * synchronized the write-buffer of current bud (A). 
But later this was + * changed and UBIFS started to always synchronize the write-buffer of + * the bud (A) before writing the log reference for the new bud (B). + * + * But because older UBIFS always synchronized A's write-buffer before + * writing to B, we can recognize this exceptional situation but + * checking the contents of bud B - if it is empty, then A can be + * treated as the last and we can recover it. + * + * TODO: remove this piece of code in a couple of years (today it is + * 16.05.2011). + */ + next = list_entry(bud->list.next, struct ubifs_bud, list); + if (!list_is_last(&next->list, &jh->buds_list)) + return 0; + + err = ubi_read(c->ubi, next->lnum, (char *)&data, + next->start, 4); + if (err) + return 0; + + return data == 0xFFFFFFFF; +} + +/** + * replay_bud - replay a bud logical eraseblock. + * @c: UBIFS file-system description object + * @b: bud entry which describes the bud + * + * This function replays bud @bud, recovers it if needed, and adds all nodes + * from this bud to the replay list. Returns zero in case of success and a + * negative error code in case of failure. + */ +static int replay_bud(struct ubifs_info *c, struct bud_entry *b) +{ + int is_last = is_last_bud(c, b->bud); + int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + + dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d", + lnum, b->bud->jhead, offs, is_last); + + if (c->need_recovery && is_last) + /* + * Recover only last LEBs in the journal heads, because power + * cuts may cause corruptions only in these LEBs, because only + * these LEBs could possibly be written to at the power cut + * time. + */ + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead); + else + sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + /* + * The bud does not have to start from offset zero - the beginning of + * the 'lnum' LEB may contain previously committed data. One of the + * things we have to do in replay is to correctly update lprops with + * newer information about this LEB. + * + * At this point lprops thinks that this LEB has 'c->leb_size - offs' + * bytes of free space because it only contain information about + * committed data. + * + * But we know that real amount of free space is 'c->leb_size - + * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and + * 'sleb->endpt' is used by bud data. We have to correctly calculate + * how much of these data are dirty and update lprops with this + * information. + * + * The dirt in that LEB region is comprised of padding nodes, deletion + * nodes, truncation nodes and nodes which are obsoleted by subsequent + * nodes in this LEB. So instead of calculating clean space, we + * calculate used space ('used' variable). 
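The comment above describes how lprops are corrected for a bud: within the replayed region, everything that is not occupied by live (non-deletion) nodes is dirt, and everything past the scan end point is free. The loop that follows accumulates the 'used' counter; the toy program below repeats the same bookkeeping with made-up node lengths and sizes so the three quantities can be seen side by side.

#include <stdio.h>

#define ALIGN8(x) (((x) + 7) & ~7)

/* Stand-in for a scanned node: only length and deletion flag matter here. */
struct node { int len; int deletion; };

int main(void)
{
	struct node nodes[] = { { 160, 0 }, { 56, 1 }, { 4096, 0 } };
	int leb_size = 131072, offs = 0, endpt = 8192;	/* invented values */
	int i, used = 0;

	/* Deletion nodes are immediately obsolete, so they count as dirt. */
	for (i = 0; i < 3; i++)
		if (!nodes[i].deletion)
			used += ALIGN8(nodes[i].len);

	printf("used %d, dirty %d, free %d\n",
	       used, endpt - offs - used, leb_size - endpt);
	return 0;
}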
+ */ + + list_for_each_entry(snod, &sleb->nodes, list) { + int deletion = 0; + + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_INO_NODE: + { + struct ubifs_ino_node *ino = snod->node; + loff_t new_size = le64_to_cpu(ino->size); + + if (le32_to_cpu(ino->nlink) == 0) + deletion = 1; + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DATA_NODE: + { + struct ubifs_data_node *dn = snod->node; + loff_t new_size = le32_to_cpu(dn->size) + + key_block(c, &snod->key) * + UBIFS_BLOCK_SIZE; + + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + struct ubifs_dent_node *dent = snod->node; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + err = insert_dent(c, lnum, snod->offs, snod->len, + &snod->key, dent->name, + le16_to_cpu(dent->nlen), snod->sqnum, + !le64_to_cpu(dent->inum), &used); + break; + } + case UBIFS_TRUN_NODE: + { + struct ubifs_trun_node *trun = snod->node; + loff_t old_size = le64_to_cpu(trun->old_size); + loff_t new_size = le64_to_cpu(trun->new_size); + union ubifs_key key; + + /* Validate truncation node */ + if (old_size < 0 || old_size > c->max_inode_sz || + new_size < 0 || new_size > c->max_inode_sz || + old_size <= new_size) { + ubifs_err("bad truncation node"); + goto out_dump; + } + + /* + * Create a fake truncation key just to use the same + * functions which expect nodes to have keys. + */ + trun_key_init(c, &key, le32_to_cpu(trun->inum)); + err = insert_node(c, lnum, snod->offs, snod->len, + &key, snod->sqnum, 1, &used, + old_size, new_size); + break; + } + default: + ubifs_err("unexpected node type %d in bud LEB %d:%d", + snod->type, lnum, snod->offs); + err = -EINVAL; + goto out_dump; + } + if (err) + goto out; + } + + ubifs_assert(ubifs_search_bud(c, lnum)); + ubifs_assert(sleb->endpt - offs >= used); + ubifs_assert(sleb->endpt % c->min_io_size == 0); + + b->dirty = sleb->endpt - offs - used; + b->free = c->leb_size - sleb->endpt; + dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free); + +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * replay_buds - replay all buds. + * @c: UBIFS file-system description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int replay_buds(struct ubifs_info *c) +{ + struct bud_entry *b; + int err; + unsigned long long prev_sqnum = 0; + + list_for_each_entry(b, &c->replay_buds, list) { + err = replay_bud(c, b); + if (err) + return err; + + ubifs_assert(b->sqnum > prev_sqnum); + prev_sqnum = b->sqnum; + } + + return 0; +} + +/** + * destroy_bud_list - destroy the list of buds to replay. + * @c: UBIFS file-system description object + */ +static void destroy_bud_list(struct ubifs_info *c) +{ + struct bud_entry *b; + + while (!list_empty(&c->replay_buds)) { + b = list_entry(c->replay_buds.next, struct bud_entry, list); + list_del(&b->list); + kfree(b); + } +} + +/** + * add_replay_bud - add a bud to the list of buds to replay. 
+ * @c: UBIFS file-system description object + * @lnum: bud logical eraseblock number to replay + * @offs: bud start offset + * @jhead: journal head to which this bud belongs + * @sqnum: reference node sequence number + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, + unsigned long long sqnum) +{ + struct ubifs_bud *bud; + struct bud_entry *b; + + dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead); + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL); + if (!bud) + return -ENOMEM; + + b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL); + if (!b) { + kfree(bud); + return -ENOMEM; + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + ubifs_add_bud(c, bud); + + b->bud = bud; + b->sqnum = sqnum; + list_add_tail(&b->list, &c->replay_buds); + + return 0; +} + +/** + * validate_ref - validate a reference node. + * @c: UBIFS file-system description object + * @ref: the reference node to validate + * @ref_lnum: LEB number of the reference node + * @ref_offs: reference node offset + * + * This function returns %1 if a bud reference already exists for the LEB. %0 is + * returned if the reference node is new, otherwise %-EINVAL is returned if + * validation failed. + */ +static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref) +{ + struct ubifs_bud *bud; + int lnum = le32_to_cpu(ref->lnum); + unsigned int offs = le32_to_cpu(ref->offs); + unsigned int jhead = le32_to_cpu(ref->jhead); + + /* + * ref->offs may point to the end of LEB when the journal head points + * to the end of LEB and we write reference node for it during commit. + * So this is why we require 'offs > c->leb_size'. + */ + if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt || + lnum < c->main_first || offs > c->leb_size || + offs & (c->min_io_size - 1)) + return -EINVAL; + + /* Make sure we have not already looked at this bud */ + bud = ubifs_search_bud(c, lnum); + if (bud) { + if (bud->jhead == jhead && bud->start <= offs) + return 1; + ubifs_err("bud at LEB %d:%d was already referred", lnum, offs); + return -EINVAL; + } + + return 0; +} + +/** + * replay_log_leb - replay a log logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: log logical eraseblock to replay + * @offs: offset to start replaying from + * @sbuf: scan buffer + * + * This function replays a log LEB and returns zero in case of success, %1 if + * this is the last LEB in the log, and a negative error code in case of + * failure. + */ +static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) +{ + int err; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + const struct ubifs_cs_node *node; + + dbg_mnt("replay log LEB %d:%d", lnum, offs); + sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery); + if (IS_ERR(sleb)) { + if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) + return PTR_ERR(sleb); + /* + * Note, the below function will recover this log LEB only if + * it is the last, because unclean reboots can possibly corrupt + * only the tail of the log. 
+ */ + sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + } + + if (sleb->nodes_cnt == 0) { + err = 1; + goto out; + } + + node = sleb->buf; + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + if (c->cs_sqnum == 0) { + /* + * This is the first log LEB we are looking at, make sure that + * the first node is a commit start node. Also record its + * sequence number so that UBIFS can determine where the log + * ends, because all nodes which were have higher sequence + * numbers. + */ + if (snod->type != UBIFS_CS_NODE) { + dbg_err("first log node at LEB %d:%d is not CS node", + lnum, offs); + goto out_dump; + } + if (le64_to_cpu(node->cmt_no) != c->cmt_no) { + dbg_err("first CS node at LEB %d:%d has wrong " + "commit number %llu expected %llu", + lnum, offs, + (unsigned long long)le64_to_cpu(node->cmt_no), + c->cmt_no); + goto out_dump; + } + + c->cs_sqnum = le64_to_cpu(node->ch.sqnum); + dbg_mnt("commit start sqnum %llu", c->cs_sqnum); + } + + if (snod->sqnum < c->cs_sqnum) { + /* + * This means that we reached end of log and now + * look to the older log data, which was already + * committed but the eraseblock was not erased (UBIFS + * only un-maps it). So this basically means we have to + * exit with "end of log" code. + */ + err = 1; + goto out; + } + + /* Make sure the first node sits at offset zero of the LEB */ + if (snod->offs != 0) { + dbg_err("first node is not at zero offset"); + goto out_dump; + } + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum < c->cs_sqnum) { + dbg_err("bad sqnum %llu, commit sqnum %llu", + snod->sqnum, c->cs_sqnum); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_REF_NODE: { + const struct ubifs_ref_node *ref = snod->node; + + err = validate_ref(c, ref); + if (err == 1) + break; /* Already have this bud */ + if (err) + goto out_dump; + + err = add_replay_bud(c, le32_to_cpu(ref->lnum), + le32_to_cpu(ref->offs), + le32_to_cpu(ref->jhead), + snod->sqnum); + if (err) + goto out; + + break; + } + case UBIFS_CS_NODE: + /* Make sure it sits at the beginning of LEB */ + if (snod->offs != 0) { + ubifs_err("unexpected node in log"); + goto out_dump; + } + break; + default: + ubifs_err("unexpected node in log"); + goto out_dump; + } + } + + if (sleb->endpt || c->lhead_offs >= c->leb_size) { + c->lhead_lnum = lnum; + c->lhead_offs = sleb->endpt; + } + + err = !sleb->endpt; +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("log error detected while replaying the log at LEB %d:%d", + lnum, offs + snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * take_ihead - update the status of the index head in lprops to 'taken'. + * @c: UBIFS file-system description object + * + * This function returns the amount of free space in the index head LEB or a + * negative error code. 
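Two conditions end the log walk in replay_log_leb() above: the very first node of the first log LEB must be a commit start node whose commit number matches, and any node whose sequence number falls below the recorded commit-start sequence number belongs to an older, already-committed log. A minimal sketch of the second condition, with invented sequence numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long cs_sqnum = 1000;	/* from the first CS node */
	unsigned long long ref_sqnums[] = { 1001, 1002, 640 };
	int i;

	for (i = 0; i < 3; i++) {
		if (ref_sqnums[i] < cs_sqnum) {
			/* Older, already-committed log data: stop here. */
			printf("sqnum %llu: end of log\n", ref_sqnums[i]);
			break;
		}
		printf("sqnum %llu: add bud from this reference node\n",
		       ref_sqnums[i]);
	}
	return 0;
}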
+ */ +static int take_ihead(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, free; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + free = lp->free; + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + err = free; +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_replay_journal - replay journal. + * @c: UBIFS file-system description object + * + * This function scans the journal, replays and cleans it up. It makes sure all + * memory data structures related to uncommitted journal are built (dirty TNC + * tree, tree of buds, modified lprops, etc). + */ +int ubifs_replay_journal(struct ubifs_info *c) +{ + int err, i, lnum, offs, free; + + BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); + + /* Update the status of the index head in lprops to 'taken' */ + free = take_ihead(c); + if (free < 0) + return free; /* Error code */ + + if (c->ihead_offs != c->leb_size - free) { + ubifs_err("bad index head LEB %d:%d", c->ihead_lnum, + c->ihead_offs); + return -EINVAL; + } + + dbg_mnt("start replaying the journal"); + c->replaying = 1; + lnum = c->ltail_lnum = c->lhead_lnum; + offs = c->lhead_offs; + + for (i = 0; i < c->log_lebs; i++, lnum++) { + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) { + /* + * The log is logically circular, we reached the last + * LEB, switch to the first one. + */ + lnum = UBIFS_LOG_LNUM; + offs = 0; + } + err = replay_log_leb(c, lnum, offs, c->sbuf); + if (err == 1) + /* We hit the end of the log */ + break; + if (err) + goto out; + offs = 0; + } + + err = replay_buds(c); + if (err) + goto out; + + err = apply_replay_list(c); + if (err) + goto out; + + err = set_buds_lprops(c); + if (err) + goto out; + + /* + * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable + * to roughly estimate index growth. Things like @c->bi.min_idx_lebs + * depend on it. This means we have to initialize it to make sure + * budgeting works properly. + */ + c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); + c->bi.uncommitted_idx *= c->max_idx_node_sz; + + ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); + dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " + "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, + (unsigned long)c->highest_inum); +out: + destroy_replay_list(c); + destroy_bud_list(c); + c->replaying = 0; + return err; +} diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c new file mode 100644 index 00000000..c606f010 --- /dev/null +++ b/fs/ubifs/sb.c @@ -0,0 +1,803 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS superblock. 
The superblock is stored at the first + * LEB of the volume and is never changed by UBIFS. Only user-space tools may + * change it. The superblock node mostly contains geometry information. + */ + +#include "ubifs.h" +#include +#include +#include + +/* + * Default journal size in logical eraseblocks as a percent of total + * flash size. + */ +#define DEFAULT_JNL_PERCENT 5 + +/* Default maximum journal size in bytes */ +#define DEFAULT_MAX_JNL (32*1024*1024) + +/* Default indexing tree fanout */ +#define DEFAULT_FANOUT 8 + +/* Default number of data journal heads */ +#define DEFAULT_JHEADS_CNT 1 + +/* Default positions of different LEBs in the main area */ +#define DEFAULT_IDX_LEB 0 +#define DEFAULT_DATA_LEB 1 +#define DEFAULT_GC_LEB 2 + +/* Default number of LEB numbers in LPT's save table */ +#define DEFAULT_LSAVE_CNT 256 + +/* Default reserved pool size as a percent of maximum free space */ +#define DEFAULT_RP_PERCENT 5 + +/* The default maximum size of reserved pool in bytes */ +#define DEFAULT_MAX_RP_SIZE (5*1024*1024) + +/* Default time granularity in nanoseconds */ +#define DEFAULT_TIME_GRAN 1000000000 + +/** + * create_default_filesystem - format empty UBI volume. + * @c: UBIFS file-system description object + * + * This function creates default empty file-system. Returns zero in case of + * success and a negative error code in case of failure. + */ +static int create_default_filesystem(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + struct ubifs_mst_node *mst; + struct ubifs_idx_node *idx; + struct ubifs_branch *br; + struct ubifs_ino_node *ino; + struct ubifs_cs_node *cs; + union ubifs_key key; + int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; + int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; + int min_leb_cnt = UBIFS_MIN_LEB_CNT; + long long tmp64, main_bytes; + __le64 tmp_le64; + + /* Some functions called from here depend on the @c->key_len filed */ + c->key_len = UBIFS_SK_LEN; + + /* + * First of all, we have to calculate default file-system geometry - + * log size, journal size, etc. + */ + if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT) + /* We can first multiply then divide and have no overflow */ + jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100; + else + jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT; + + if (jnl_lebs < UBIFS_MIN_JNL_LEBS) + jnl_lebs = UBIFS_MIN_JNL_LEBS; + if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL) + jnl_lebs = DEFAULT_MAX_JNL / c->leb_size; + + /* + * The log should be large enough to fit reference nodes for all bud + * LEBs. Because buds do not have to start from the beginning of LEBs + * (half of the LEB may contain committed data), the log should + * generally be larger, make it twice as large. + */ + tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1; + log_lebs = tmp / c->leb_size; + /* Plus one LEB reserved for commit */ + log_lebs += 1; + if (c->leb_cnt - min_leb_cnt > 8) { + /* And some extra space to allow writes while committing */ + log_lebs += 1; + min_leb_cnt += 1; + } + + max_buds = jnl_lebs - log_lebs; + if (max_buds < UBIFS_MIN_BUD_LEBS) + max_buds = UBIFS_MIN_BUD_LEBS; + + /* + * Orphan nodes are stored in a separate area. One node can store a lot + * of orphan inode numbers, but when new orphan comes we just add a new + * orphan node. At some point the nodes are consolidated into one + * orphan node. 
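create_default_filesystem() starts from a handful of rules of thumb: the journal gets a fixed percentage of the volume, bounded below by a minimum LEB count and above by a byte cap, and the multiply and divide are ordered so that a very large LEB count cannot overflow a 32-bit int. The sketch below repeats that sizing calculation in userspace; the minimum of 32 journal LEBs is a stand-in, not necessarily the value of UBIFS_MIN_JNL_LEBS.

#include <stdio.h>

#define JNL_PERCENT	5		/* DEFAULT_JNL_PERCENT */
#define MIN_JNL_LEBS	32		/* stand-in for UBIFS_MIN_JNL_LEBS */
#define MAX_JNL_BYTES	(32 << 20)	/* DEFAULT_MAX_JNL */

static int default_jnl_lebs(int leb_cnt, int leb_size)
{
	int jnl_lebs;

	/* Multiply first only when it provably cannot overflow. */
	if (leb_cnt < 0x7FFFFFFF / JNL_PERCENT)
		jnl_lebs = leb_cnt * JNL_PERCENT / 100;
	else
		jnl_lebs = (leb_cnt / 100) * JNL_PERCENT;

	if (jnl_lebs < MIN_JNL_LEBS)
		jnl_lebs = MIN_JNL_LEBS;
	if ((long long)jnl_lebs * leb_size > MAX_JNL_BYTES)
		jnl_lebs = MAX_JNL_BYTES / leb_size;
	return jnl_lebs;
}

int main(void)
{
	/* 2048 LEBs of 128 KiB: 5% gives 102 LEBs, under the 32 MiB cap */
	printf("journal LEBs: %d\n", default_jnl_lebs(2048, 128 * 1024));
	return 0;
}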
+ */ + orph_lebs = UBIFS_MIN_ORPH_LEBS; +#ifdef CONFIG_UBIFS_FS_DEBUG + if (c->leb_cnt - min_leb_cnt > 1) + /* + * For debugging purposes it is better to have at least 2 + * orphan LEBs, because the orphan subsystem would need to do + * consolidations and would be stressed more. + */ + orph_lebs += 1; +#endif + + main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; + main_lebs -= orph_lebs; + + lpt_first = UBIFS_LOG_LNUM + log_lebs; + c->lsave_cnt = DEFAULT_LSAVE_CNT; + c->max_leb_cnt = c->leb_cnt; + err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs, + &big_lpt); + if (err) + return err; + + dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first, + lpt_first + lpt_lebs - 1); + + main_first = c->leb_cnt - main_lebs; + + /* Create default superblock */ + tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + sup = kzalloc(tmp, GFP_KERNEL); + if (!sup) + return -ENOMEM; + + tmp64 = (long long)max_buds * c->leb_size; + if (big_lpt) + sup_flags |= UBIFS_FLG_BIGLPT; + + sup->ch.node_type = UBIFS_SB_NODE; + sup->key_hash = UBIFS_KEY_HASH_R5; + sup->flags = cpu_to_le32(sup_flags); + sup->min_io_size = cpu_to_le32(c->min_io_size); + sup->leb_size = cpu_to_le32(c->leb_size); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt); + sup->max_bud_bytes = cpu_to_le64(tmp64); + sup->log_lebs = cpu_to_le32(log_lebs); + sup->lpt_lebs = cpu_to_le32(lpt_lebs); + sup->orph_lebs = cpu_to_le32(orph_lebs); + sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT); + sup->fanout = cpu_to_le32(DEFAULT_FANOUT); + sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); + sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); + sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); + if (c->mount_opts.override_compr) + sup->default_compr = cpu_to_le16(c->mount_opts.compr_type); + else + sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); + + generate_random_uuid(sup->uuid); + + main_bytes = (long long)main_lebs * c->leb_size; + tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100); + if (tmp64 > DEFAULT_MAX_RP_SIZE) + tmp64 = DEFAULT_MAX_RP_SIZE; + sup->rp_size = cpu_to_le64(tmp64); + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); + + err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); + kfree(sup); + if (err) + return err; + + dbg_gen("default superblock created at LEB 0:0"); + + /* Create default master node */ + mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!mst) + return -ENOMEM; + + mst->ch.node_type = UBIFS_MST_NODE; + mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM); + mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO); + mst->cmt_no = 0; + mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->root_offs = 0; + tmp = ubifs_idx_node_sz(c, 1); + mst->root_len = cpu_to_le32(tmp); + mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB); + mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size)); + mst->index_size = cpu_to_le64(ALIGN(tmp, 8)); + mst->lpt_lnum = cpu_to_le32(c->lpt_lnum); + mst->lpt_offs = cpu_to_le32(c->lpt_offs); + mst->nhead_lnum = cpu_to_le32(c->nhead_lnum); + mst->nhead_offs = cpu_to_le32(c->nhead_offs); + mst->ltab_lnum = cpu_to_le32(c->ltab_lnum); + mst->ltab_offs = cpu_to_le32(c->ltab_offs); + mst->lsave_lnum = cpu_to_le32(c->lsave_lnum); + mst->lsave_offs = cpu_to_le32(c->lsave_offs); + mst->lscan_lnum = cpu_to_le32(main_first); + mst->empty_lebs = cpu_to_le32(main_lebs - 2); + mst->idx_lebs = cpu_to_le32(1); + mst->leb_cnt = 
cpu_to_le32(c->leb_cnt); + + /* Calculate lprops statistics */ + tmp64 = main_bytes; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + mst->total_free = cpu_to_le64(tmp64); + + tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) - + UBIFS_INO_NODE_SZ; + tmp64 += ino_waste; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8); + mst->total_dirty = cpu_to_le64(tmp64); + + /* The indexing LEB does not contribute to dark space */ + tmp64 = (c->main_lebs - 1) * c->dark_wm; + mst->total_dark = cpu_to_le64(tmp64); + + mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); + + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0, + UBI_UNKNOWN); + if (err) { + kfree(mst); + return err; + } + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0, + UBI_UNKNOWN); + kfree(mst); + if (err) + return err; + + dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM); + + /* Create the root indexing node */ + tmp = ubifs_idx_node_sz(c, 1); + idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + if (!idx) + return -ENOMEM; + + c->key_fmt = UBIFS_SIMPLE_KEY_FMT; + c->key_hash = key_r5_hash; + + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(1); + ino_key_init(c, &key, UBIFS_ROOT_INO); + br = ubifs_idx_branch(c, idx, 0); + key_write_idx(c, &key, &br->key); + br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); + br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); + err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0, + UBI_UNKNOWN); + kfree(idx); + if (err) + return err; + + dbg_gen("default root indexing node created LEB %d:0", + main_first + DEFAULT_IDX_LEB); + + /* Create default root inode */ + tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + ino = kzalloc(tmp, GFP_KERNEL); + if (!ino) + return -ENOMEM; + + ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO); + ino->ch.node_type = UBIFS_INO_NODE; + ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); + ino->nlink = cpu_to_le32(2); + tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + ino->atime_sec = tmp_le64; + ino->ctime_sec = tmp_le64; + ino->mtime_sec = tmp_le64; + ino->atime_nsec = 0; + ino->ctime_nsec = 0; + ino->mtime_nsec = 0; + ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); + + /* Set compression enabled by default */ + ino->flags = cpu_to_le32(UBIFS_COMPR_FL); + + err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, + main_first + DEFAULT_DATA_LEB, 0, + UBI_UNKNOWN); + kfree(ino); + if (err) + return err; + + dbg_gen("root inode created at LEB %d:0", + main_first + DEFAULT_DATA_LEB); + + /* + * The first node in the log has to be the commit start node. This is + * always the case during normal file-system operation. Write a fake + * commit start node to the log. + */ + tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size); + cs = kzalloc(tmp, GFP_KERNEL); + if (!cs) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, + 0, UBI_UNKNOWN); + kfree(cs); + + ubifs_msg("default file-system created"); + return 0; +} + +/** + * validate_sb - validate superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node + * + * This function validates superblock node @sup. Since most of data was read + * from the superblock and stored in @c, the function validates fields in @c + * instead. 
Returns zero in case of success and %-EINVAL in case of validation + * failure. + */ +static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + long long max_bytes; + int err = 1, min_leb_cnt; + + if (!c->key_hash) { + err = 2; + goto failed; + } + + if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) { + err = 3; + goto failed; + } + + if (le32_to_cpu(sup->min_io_size) != c->min_io_size) { + ubifs_err("min. I/O unit mismatch: %d in superblock, %d real", + le32_to_cpu(sup->min_io_size), c->min_io_size); + goto failed; + } + + if (le32_to_cpu(sup->leb_size) != c->leb_size) { + ubifs_err("LEB size mismatch: %d in superblock, %d real", + le32_to_cpu(sup->leb_size), c->leb_size); + goto failed; + } + + if (c->log_lebs < UBIFS_MIN_LOG_LEBS || + c->lpt_lebs < UBIFS_MIN_LPT_LEBS || + c->orph_lebs < UBIFS_MIN_ORPH_LEBS || + c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 4; + goto failed; + } + + /* + * Calculate minimum allowed amount of main area LEBs. This is very + * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we + * have just read from the superblock. + */ + min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs; + min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; + + if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { + ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, " + "%d minimum required", c->leb_cnt, c->vi.size, + min_leb_cnt); + goto failed; + } + + if (c->max_leb_cnt < c->leb_cnt) { + ubifs_err("max. LEB count %d less than LEB count %d", + c->max_leb_cnt, c->leb_cnt); + goto failed; + } + + if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 7; + goto failed; + } + + if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || + c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { + err = 8; + goto failed; + } + + if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 || + c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) { + err = 9; + goto failed; + } + + if (c->fanout < UBIFS_MIN_FANOUT || + ubifs_idx_node_sz(c, c->fanout) > c->leb_size) { + err = 10; + goto failed; + } + + if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT && + c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - + c->log_lebs - c->lpt_lebs - c->orph_lebs)) { + err = 11; + goto failed; + } + + if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs + + c->orph_lebs + c->main_lebs != c->leb_cnt) { + err = 12; + goto failed; + } + + if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) { + err = 13; + goto failed; + } + + max_bytes = c->main_lebs * (long long)c->leb_size; + if (c->rp_size < 0 || max_bytes < c->rp_size) { + err = 14; + goto failed; + } + + if (le32_to_cpu(sup->time_gran) > 1000000000 || + le32_to_cpu(sup->time_gran) < 1) { + err = 15; + goto failed; + } + + return 0; + +failed: + ubifs_err("bad superblock, error %d", err); + dbg_dump_node(c, sup); + return -EINVAL; +} + +/** + * ubifs_read_sb_node - read superblock node. + * @c: UBIFS file-system description object + * + * This function returns a pointer to the superblock node or a negative error + * code. Note, the user of this function is responsible of kfree()'ing the + * returned superblock buffer. 
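Many of the validate_sb() checks above are simple consistency identities over values already derived into @c. One of them, pulled out into standalone form for illustration, is that the superblock, master, log, LPT, orphan and main areas together must account for every LEB of the volume. The sketch assumes the usual one superblock LEB and two master LEBs; the layout numbers are hypothetical.

#include <stdio.h>

#define SB_LEBS		1	/* assumed UBIFS_SB_LEBS */
#define MST_LEBS	2	/* assumed UBIFS_MST_LEBS */

/* Every LEB of the volume must belong to exactly one region. */
static int regions_cover_volume(int log_lebs, int lpt_lebs, int orph_lebs,
				int main_lebs, int leb_cnt)
{
	return SB_LEBS + MST_LEBS + log_lebs + lpt_lebs + orph_lebs +
	       main_lebs == leb_cnt;
}

int main(void)
{
	/* Hypothetical 2048-LEB layout */
	int log = 6, lpt = 5, orph = 2, main_lebs = 2032, total = 2048;

	printf("layout %s\n",
	       regions_cover_volume(log, lpt, orph, main_lebs, total) ?
	       "consistent" : "inconsistent");
	return 0;
}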
+ */ +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + int err; + + sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS); + if (!sup) + return ERR_PTR(-ENOMEM); + + err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ, + UBIFS_SB_LNUM, 0); + if (err) { + kfree(sup); + return ERR_PTR(err); + } + + return sup; +} + +/** + * ubifs_write_sb_node - write superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node read with 'ubifs_read_sb_node()' + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + + ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); + return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM); +} + +/** + * ubifs_read_superblock - read superblock. + * @c: UBIFS file-system description object + * + * This function finds, reads and checks the superblock. If an empty UBI volume + * is being mounted, this function creates default superblock. Returns zero in + * case of success, and a negative error code in case of failure. + */ +int ubifs_read_superblock(struct ubifs_info *c) +{ + int err, sup_flags; + struct ubifs_sb_node *sup; + + if (c->empty) { + err = create_default_filesystem(c); + if (err) + return err; + } + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + c->fmt_version = le32_to_cpu(sup->fmt_version); + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); + + /* + * The software supports all previous versions but not future versions, + * due to the unavailability of time-travelling equipment. + */ + if (c->fmt_version > UBIFS_FORMAT_VERSION) { + ubifs_assert(!c->ro_media || c->ro_mount); + if (!c->ro_mount || + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { + ubifs_err("on-flash format version is w%d/r%d, but " + "software only supports up to version " + "w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { + ubifs_msg("only R/O mounting is possible"); + err = -EROFS; + } else + err = -EINVAL; + goto out; + } + + /* + * The FS is mounted R/O, and the media format is + * R/O-compatible with the UBIFS implementation, so we can + * mount. 
+ */ + c->rw_incompat = 1; + } + + if (c->fmt_version < 3) { + ubifs_err("on-flash format version %d is not supported", + c->fmt_version); + err = -EINVAL; + goto out; + } + + switch (sup->key_hash) { + case UBIFS_KEY_HASH_R5: + c->key_hash = key_r5_hash; + c->key_hash_type = UBIFS_KEY_HASH_R5; + break; + + case UBIFS_KEY_HASH_TEST: + c->key_hash = key_test_hash; + c->key_hash_type = UBIFS_KEY_HASH_TEST; + break; + }; + + c->key_fmt = sup->key_fmt; + + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + c->key_len = UBIFS_SK_LEN; + break; + default: + ubifs_err("unsupported key format"); + err = -EINVAL; + goto out; + } + + c->leb_cnt = le32_to_cpu(sup->leb_cnt); + c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt); + c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes); + c->log_lebs = le32_to_cpu(sup->log_lebs); + c->lpt_lebs = le32_to_cpu(sup->lpt_lebs); + c->orph_lebs = le32_to_cpu(sup->orph_lebs); + c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; + c->fanout = le32_to_cpu(sup->fanout); + c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); + c->rp_size = le64_to_cpu(sup->rp_size); + c->rp_uid = le32_to_cpu(sup->rp_uid); + c->rp_gid = le32_to_cpu(sup->rp_gid); + sup_flags = le32_to_cpu(sup->flags); + if (!c->mount_opts.override_compr) + c->default_compr = le16_to_cpu(sup->default_compr); + + c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); + memcpy(&c->uuid, &sup->uuid, 16); + c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); + c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); + + /* Automatically increase file system size to the maximum size */ + c->old_leb_cnt = c->leb_cnt; + if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { + c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); + if (c->ro_mount) + dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + else { + dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + c->old_leb_cnt = c->leb_cnt; + } + } + + c->log_bytes = (long long)c->log_lebs * c->leb_size; + c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1; + c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs; + c->lpt_last = c->lpt_first + c->lpt_lebs - 1; + c->orph_first = c->lpt_last + 1; + c->orph_last = c->orph_first + c->orph_lebs - 1; + c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; + c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; + c->main_first = c->leb_cnt - c->main_lebs; + + err = validate_sb(c, sup); +out: + kfree(sup); + return err; +} + +/** + * fixup_leb - fixup/unmap an LEB containing free space. + * @c: UBIFS file-system description object + * @lnum: the LEB number to fix up + * @len: number of used bytes in LEB (starting at offset 0) + * + * This function reads the contents of the given LEB number @lnum, then fixes + * it up, so that empty min. I/O units in the end of LEB are actually erased on + * flash (rather than being just all-0xff real data). If the LEB is completely + * empty, it is simply unmapped. 
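Once the region sizes are read from the superblock, ubifs_read_superblock() lays the areas out back to back, which is nothing more than the running-sum arithmetic below. The sketch uses invented counts and assumes the log begins at LEB 3, right after the superblock LEB and the two master copies.

#include <stdio.h>

#define LOG_LNUM 3	/* assumed first log LEB (after SB and two MST LEBs) */

int main(void)
{
	int leb_cnt = 2048, log_lebs = 6, lpt_lebs = 5, orph_lebs = 2;
	int log_last, lpt_first, lpt_last, orph_first, orph_last;
	int main_lebs, main_first;

	/* Each area simply follows the previous one. */
	log_last   = LOG_LNUM + log_lebs - 1;
	lpt_first  = LOG_LNUM + log_lebs;
	lpt_last   = lpt_first + lpt_lebs - 1;
	orph_first = lpt_last + 1;
	orph_last  = orph_first + orph_lebs - 1;
	main_lebs  = leb_cnt - 1 - 2 - log_lebs - lpt_lebs - orph_lebs;
	main_first = leb_cnt - main_lebs;

	printf("log 3..%d  lpt %d..%d  orph %d..%d  main %d..%d\n",
	       log_last, lpt_first, lpt_last, orph_first, orph_last,
	       main_first, leb_cnt - 1);
	return 0;
}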
+ */ +static int fixup_leb(struct ubifs_info *c, int lnum, int len) +{ + int err; + + ubifs_assert(len >= 0); + ubifs_assert(len % c->min_io_size == 0); + ubifs_assert(len < c->leb_size); + + if (len == 0) { + dbg_mnt("unmap empty LEB %d", lnum); + return ubi_leb_unmap(c->ubi, lnum); + } + + dbg_mnt("fixup LEB %d, data len %d", lnum, len); + err = ubi_read(c->ubi, lnum, c->sbuf, 0, len); + if (err) + return err; + + return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); +} + +/** + * fixup_free_space - find & remap all LEBs containing free space. + * @c: UBIFS file-system description object + * + * This function walks through all LEBs in the filesystem and fiexes up those + * containing free/empty space. + */ +static int fixup_free_space(struct ubifs_info *c) +{ + int lnum, err = 0; + struct ubifs_lprops *lprops; + + ubifs_get_lprops(c); + + /* Fixup LEBs in the master area */ + for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) { + err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz); + if (err) + goto out; + } + + /* Unmap unused log LEBs */ + lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + while (lnum != c->ltail_lnum) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + lnum = ubifs_next_log_lnum(c, lnum); + } + + /* Fixup the current log head */ + err = fixup_leb(c, c->lhead_lnum, c->lhead_offs); + if (err) + goto out; + + /* Fixup LEBs in the LPT area */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + int free = c->ltab[lnum - c->lpt_first].free; + + if (free > 0) { + err = fixup_leb(c, lnum, c->leb_size - free); + if (err) + goto out; + } + } + + /* Unmap LEBs in the orphans area */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + } + + /* Fixup LEBs in the main area */ + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (lprops->free > 0) { + err = fixup_leb(c, lnum, c->leb_size - lprops->free); + if (err) + goto out; + } + } + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fixup_free_space - find & fix all LEBs with free space. + * @c: UBIFS file-system description object + * + * This function fixes up LEBs containing free space on first mount, if the + * appropriate flag was set when the FS was created. Each LEB with one or more + * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure + * the free space is actually erased. E.g., this is necessary for some NAND + * chips, since the free space may have been programmed like real "0xff" data + * (generating a non-0xff ECC), causing future writes to the not-really-erased + * NAND pages to behave badly. After the space is fixed up, the superblock flag + * is cleared, so that this is skipped for all future mounts. 
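fixup_leb() makes only a binary decision per LEB: one that holds nothing but free space is unmapped outright, while a partially used one is read back and atomically re-written so that only the used prefix is programmed and the remainder becomes genuinely erased (the NAND/ECC rationale is in the ubifs_fixup_free_space() comment above). A trivial model of that decision, with hypothetical sizes:

#include <stdio.h>

/* Mirror of the unmap-or-rewrite choice made by fixup_leb(). */
static void fixup(int lnum, int leb_size, int free)
{
	int used = leb_size - free;

	if (used == 0)
		printf("LEB %d: unmap (completely empty)\n", lnum);
	else
		printf("LEB %d: re-write first %d bytes, erase the rest\n",
		       lnum, used);
}

int main(void)
{
	int leb_size = 131072;	/* hypothetical LEB size */

	fixup(20, leb_size, leb_size);	/* empty main-area LEB */
	fixup(21, leb_size, 90112);	/* partially used LEB */
	return 0;
}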
+ */ +int ubifs_fixup_free_space(struct ubifs_info *c) +{ + int err; + struct ubifs_sb_node *sup; + + ubifs_assert(c->space_fixup); + ubifs_assert(!c->ro_mount); + + ubifs_msg("start fixing up free space"); + + err = fixup_free_space(c); + if (err) + return err; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + /* Free-space fixup is no longer required */ + c->space_fixup = 0; + sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP); + + err = ubifs_write_sb_node(c, sup); + kfree(sup); + if (err) + return err; + + ubifs_msg("free space fixup complete"); + return err; +} diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c new file mode 100644 index 00000000..36216b46 --- /dev/null +++ b/fs/ubifs/scan.c @@ -0,0 +1,380 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the scan which is a general-purpose function for + * determining what nodes are in an eraseblock. The scan is used to replay the + * journal, to do garbage collection. for the TNC in-the-gaps method, and by + * debugging functions. + */ + +#include "ubifs.h" + +/** + * scan_padding_bytes - scan for padding bytes. + * @buf: buffer to scan + * @len: length of buffer + * + * This function returns the number of padding bytes on success and + * %SCANNED_GARBAGE on failure. + */ +static int scan_padding_bytes(void *buf, int len) +{ + int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len); + uint8_t *p = buf; + + dbg_scan("not a node"); + + while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE) + pad_len += 1; + + if (!pad_len || (pad_len & 7)) + return SCANNED_GARBAGE; + + dbg_scan("%d padding bytes", pad_len); + + return pad_len; +} + +/** + * ubifs_scan_a_node - scan for a node or padding. + * @c: UBIFS file-system description object + * @buf: buffer to scan + * @len: length of buffer + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @quiet: print no messages + * + * This function returns a scanning code to indicate what was scanned. 
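The first thing ubifs_scan_a_node() looks at is the little-endian magic word at the scan position, which separates empty space (all 0xff) from real nodes and from padding or garbage. A self-contained sketch of that classification follows; the node magic value is taken from the UBIFS on-flash format, while the padding byte used here is an assumption standing in for UBIFS_PADDING_BYTE.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NODE_MAGIC	0x06101831u	/* UBIFS on-flash node magic */
#define PADDING_BYTE	0xCE		/* assumed UBIFS_PADDING_BYTE */

/* Classify the bytes at the current scan position by their magic word. */
static const char *classify(const uint8_t *p, int len)
{
	uint32_t magic;

	if (len < 4)
		return "garbage (too short)";
	magic = p[0] | p[1] << 8 | p[2] << 16 | (uint32_t)p[3] << 24;
	if (magic == 0xFFFFFFFFu)
		return "empty space";
	if (magic == NODE_MAGIC)
		return "a node (check its CRC next)";
	return "padding bytes or garbage";
}

int main(void)
{
	uint8_t empty[8], node[8] = { 0x31, 0x18, 0x10, 0x06 }, pad[8];

	memset(empty, 0xff, sizeof(empty));
	memset(pad, PADDING_BYTE, sizeof(pad));
	printf("%s / %s / %s\n", classify(empty, 8), classify(node, 8),
	       classify(pad, 8));
	return 0;
}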
+ */ +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet) +{ + struct ubifs_ch *ch = buf; + uint32_t magic; + + magic = le32_to_cpu(ch->magic); + + if (magic == 0xFFFFFFFF) { + dbg_scan("hit empty space"); + return SCANNED_EMPTY_SPACE; + } + + if (magic != UBIFS_NODE_MAGIC) + return scan_padding_bytes(buf, len); + + if (len < UBIFS_CH_SZ) + return SCANNED_GARBAGE; + + dbg_scan("scanning %s", dbg_ntype(ch->node_type)); + + if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) + return SCANNED_A_CORRUPT_NODE; + + if (ch->node_type == UBIFS_PAD_NODE) { + struct ubifs_pad_node *pad = buf; + int pad_len = le32_to_cpu(pad->pad_len); + int node_len = le32_to_cpu(ch->len); + + /* Validate the padding node */ + if (pad_len < 0 || + offs + node_len + pad_len > c->leb_size) { + if (!quiet) { + ubifs_err("bad pad node at LEB %d:%d", + lnum, offs); + dbg_dump_node(c, pad); + } + return SCANNED_A_BAD_PAD_NODE; + } + + /* Make the node pads to 8-byte boundary */ + if ((node_len + pad_len) & 7) { + if (!quiet) + dbg_err("bad padding length %d - %d", + offs, offs + node_len + pad_len); + return SCANNED_A_BAD_PAD_NODE; + } + + dbg_scan("%d bytes padded, offset now %d", + pad_len, ALIGN(offs + node_len + pad_len, 8)); + + return node_len + pad_len; + } + + return SCANNED_A_NODE; +} + +/** + * ubifs_start_scan - create LEB scanning information at start of scan. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be c->leb_size) + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int err; + + dbg_scan("scan LEB %d:%d", lnum, offs); + + sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS); + if (!sleb) + return ERR_PTR(-ENOMEM); + + sleb->lnum = lnum; + INIT_LIST_HEAD(&sleb->nodes); + sleb->buf = sbuf; + + err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs); + if (err && err != -EBADMSG) { + ubifs_err("cannot read %d bytes from LEB %d:%d," + " error %d", c->leb_size - offs, lnum, offs, err); + kfree(sleb); + return ERR_PTR(err); + } + + if (err == -EBADMSG) + sleb->ecc = 1; + + return sleb; +} + +/** + * ubifs_end_scan - update LEB scanning information at end of scan. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * + * This function returns %0 on success and a negative error code on failure. + */ +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs) +{ + lnum = lnum; + dbg_scan("stop scanning LEB %d at offset %d", lnum, offs); + ubifs_assert(offs % c->min_io_size == 0); + + sleb->endpt = ALIGN(offs, c->min_io_size); +} + +/** + * ubifs_add_snod - add a scanned node to LEB scanning information. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @buf: buffer containing node + * @offs: offset of node on flash + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs) +{ + struct ubifs_ch *ch = buf; + struct ubifs_ino_node *ino = buf; + struct ubifs_scan_node *snod; + + snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); + if (!snod) + return -ENOMEM; + + snod->sqnum = le64_to_cpu(ch->sqnum); + snod->type = ch->node_type; + snod->offs = offs; + snod->len = le32_to_cpu(ch->len); + snod->node = buf; + + switch (ch->node_type) { + case UBIFS_INO_NODE: + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + case UBIFS_DATA_NODE: + /* + * The key is in the same place in all keyed + * nodes. + */ + key_read(c, &ino->key, &snod->key); + break; + default: + invalid_key_init(c, &snod->key); + break; + } + list_add_tail(&snod->list, &sleb->nodes); + sleb->nodes_cnt += 1; + return 0; +} + +/** + * ubifs_scanned_corruption - print information after UBIFS scanned corruption. + * @c: UBIFS file-system description object + * @lnum: LEB number of corruption + * @offs: offset of corruption + * @buf: buffer containing corruption + */ +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf) +{ + int len; + + ubifs_err("corruption at LEB %d:%d", lnum, offs); + if (dbg_failure_mode) + return; + len = c->leb_size - offs; + if (len > 8192) + len = 8192; + dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); +} + +/** + * ubifs_scan - scan a logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be of @c->leb_size bytes in size) + * @quiet: print no messages + * + * This function scans LEB number @lnum and returns complete information about + * its contents. Returns the scaned information in case of success and, + * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case + * of failure. + * + * If @quiet is non-zero, this function does not print large and scary + * error messages and flash dumps in case of errors. 
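The scan loop described above (and implemented just below) repeatedly classifies the chunk at the current offset and advances by the 8-byte-aligned length it consumed, stopping when empty space is reached. A toy user-space skeleton of that walk, with a hypothetical 'classify()' standing in for 'ubifs_scan_a_node()':

#include <stdio.h>

#define ALIGN8(x)	(((x) + 7) & ~7)

/* Hypothetical classifier: returns bytes consumed, or 0 at empty space */
static int classify(const unsigned char *buf, int len)
{
	if (len < 8 || buf[0] == 0xFF)
		return 0;
	return ALIGN8(buf[0]);	/* pretend byte 0 encodes the node length */
}

int main(void)
{
	unsigned char leb[64] = { 24, [24] = 16, [40] = 0xFF };
	int offs = 0, len = (int)sizeof(leb);

	while (len >= 8) {
		int used = classify(leb + offs, len);

		if (!used)
			break;	/* empty space - UBIFS verifies it separately */
		printf("node at %d, %d bytes\n", offs, used);
		offs += used;
		len -= used;
	}
	printf("scan stopped at offset %d\n", offs);
	return 0;
}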
+ */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf, int quiet) +{ + void *buf = sbuf + offs; + int err, len = c->leb_size - offs; + struct ubifs_scan_leb *sleb; + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + while (len >= 8) { + struct ubifs_ch *ch = buf; + int node_len, ret; + + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) + /* Empty space is checked later */ + break; + + switch (ret) { + case SCANNED_GARBAGE: + dbg_err("garbage"); + goto corrupted; + case SCANNED_A_NODE: + break; + case SCANNED_A_CORRUPT_NODE: + case SCANNED_A_BAD_PAD_NODE: + dbg_err("bad node"); + goto corrupted; + default: + dbg_err("unknown"); + err = -EINVAL; + goto error; + } + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + } + + if (offs % c->min_io_size) { + if (!quiet) + ubifs_err("empty space starts at non-aligned offset %d", + offs); + goto corrupted; + } + + ubifs_end_scan(c, sleb, lnum, offs); + + for (; len > 4; offs += 4, buf = buf + 4, len -= 4) + if (*(uint32_t *)buf != 0xffffffff) + break; + for (; len; offs++, buf++, len--) + if (*(uint8_t *)buf != 0xff) { + if (!quiet) + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); + goto corrupted; + } + + return sleb; + +corrupted: + if (!quiet) { + ubifs_scanned_corruption(c, lnum, offs, buf); + ubifs_err("LEB %d scanning failed", lnum); + } + err = -EUCLEAN; + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + +error: + ubifs_err("LEB %d scanning failed, error %d", lnum, err); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * ubifs_scan_destroy - destroy LEB scanning information. + * @sleb: scanning information to free + */ +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *node; + struct list_head *head; + + head = &sleb->nodes; + while (!list_empty(head)) { + node = list_entry(head->next, struct ubifs_scan_node, list); + list_del(&node->list); + kfree(node); + } + kfree(sleb); +} diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c new file mode 100644 index 00000000..9e1d0566 --- /dev/null +++ b/fs/ubifs/shrinker.c @@ -0,0 +1,325 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS shrinker which evicts clean znodes from the TNC + * tree when Linux VM needs more RAM. 
+ * + * We do not implement any LRU lists to find oldest znodes to free because it + * would add additional overhead to the file system fast paths. So the shrinker + * just walks the TNC tree when searching for znodes to free. + * + * If the root of a TNC sub-tree is clean and old enough, then the children are + * also clean and old enough. So the shrinker walks the TNC in level order and + * dumps entire sub-trees. + * + * The age of znodes is just the time-stamp when they were last looked at. + * The current shrinker first tries to evict old znodes, then young ones. + * + * Since the shrinker is global, it has to protect against races with FS + * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. + */ + +#include "ubifs.h" + +/* List of all UBIFS file-system instances */ +LIST_HEAD(ubifs_infos); + +/* + * We number each shrinker run and record the number on the ubifs_info structure + * so that we can easily work out which ubifs_info structures have already been + * done by the current run. + */ +static unsigned int shrinker_run_no; + +/* Protects 'ubifs_infos' list */ +DEFINE_SPINLOCK(ubifs_infos_lock); + +/* Global clean znode counter (for all mounted UBIFS instances) */ +atomic_long_t ubifs_clean_zn_cnt; + +/** + * shrink_tnc - shrink TNC tree. + * @c: UBIFS file-system description object + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function traverses TNC tree and frees clean znodes. It does not free + * clean znodes which younger then @age. Returns number of freed znodes. + */ +static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) +{ + int total_freed = 0; + struct ubifs_znode *znode, *zprev; + int time = get_seconds(); + + ubifs_assert(mutex_is_locked(&c->umount_mutex)); + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + + if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) + return 0; + + /* + * Traverse the TNC tree in levelorder manner, so that it is possible + * to destroy large sub-trees. Indeed, if a znode is old, then all its + * children are older or of the same age. + * + * Note, we are holding 'c->tnc_mutex', so we do not have to lock the + * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is + * changed only when the 'c->tnc_mutex' is held. + */ + zprev = NULL; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + while (znode && total_freed < nr && + atomic_long_read(&c->clean_zn_cnt) > 0) { + int freed; + + /* + * If the znode is clean, but it is in the 'c->cnext' list, this + * means that this znode has just been written to flash as a + * part of commit and was marked clean. They will be removed + * from the list at end commit. We cannot change the list, + * because it is not protected by any mutex (design decision to + * make commit really independent and parallel to main I/O). So + * we just skip these znodes. + * + * Note, the 'clean_zn_cnt' counters are not updated until + * after the commit, so the UBIFS shrinker does not report + * the znodes which are in the 'c->cnext' list as freeable. + * + * Also note, if the root of a sub-tree is not in 'c->cnext', + * then the whole sub-tree is not in 'c->cnext' as well, so it + * is safe to dump whole sub-tree. + */ + + if (znode->cnext) { + /* + * Very soon these znodes will be removed from the list + * and become freeable. 
+ */ + *contention = 1; + } else if (!ubifs_zn_dirty(znode) && + abs(time - znode->time) >= age) { + if (znode->parent) + znode->parent->zbranch[znode->iip].znode = NULL; + else + c->zroot.znode = NULL; + + freed = ubifs_destroy_tnc_subtree(znode); + atomic_long_sub(freed, &ubifs_clean_zn_cnt); + atomic_long_sub(freed, &c->clean_zn_cnt); + ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); + total_freed += freed; + znode = zprev; + } + + if (unlikely(!c->zroot.znode)) + break; + + zprev = znode; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + cond_resched(); + } + + return total_freed; +} + +/** + * shrink_tnc_trees - shrink UBIFS TNC trees. + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function walks the list of mounted UBIFS file-systems and frees clean + * znodes which are older than @age, until at least @nr znodes are freed. + * Returns the number of freed znodes. + */ +static int shrink_tnc_trees(int nr, int age, int *contention) +{ + struct ubifs_info *c; + struct list_head *p; + unsigned int run_no; + int freed = 0; + + spin_lock(&ubifs_infos_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + /* Iterate over all mounted UBIFS file-systems and try to shrink them */ + p = ubifs_infos.next; + while (p != &ubifs_infos) { + c = list_entry(p, struct ubifs_info, infos_list); + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (c->shrinker_run_no == run_no) + break; + if (!mutex_trylock(&c->umount_mutex)) { + /* Some un-mount is in progress, try next FS */ + *contention = 1; + p = p->next; + continue; + } + /* + * We're holding 'c->umount_mutex', so the file-system won't go + * away. + */ + if (!mutex_trylock(&c->tnc_mutex)) { + mutex_unlock(&c->umount_mutex); + *contention = 1; + p = p->next; + continue; + } + spin_unlock(&ubifs_infos_lock); + /* + * OK, now we have TNC locked, the file-system cannot go away - + * it is safe to reap the cache. + */ + c->shrinker_run_no = run_no; + freed += shrink_tnc(c, nr, age, contention); + mutex_unlock(&c->tnc_mutex); + spin_lock(&ubifs_infos_lock); + /* Get the next list element before we move this one */ + p = p->next; + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&c->infos_list, &ubifs_infos); + mutex_unlock(&c->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&ubifs_infos_lock); + return freed; +} + +/** + * kick_a_thread - kick a background thread to start commit. + * + * This function kicks a background thread to start background commit. Returns + * %-1 if a thread was kicked or there is another reason to assume the memory + * will soon be freed or become freeable. If there are no dirty znodes, returns + * %0. + */ +static int kick_a_thread(void) +{ + int i; + struct ubifs_info *c; + + /* + * Iterate over all mounted UBIFS file-systems and find out if there is + * already an ongoing commit operation there. If no, then iterate for + * the second time and initiate background commit. + */ + spin_lock(&ubifs_infos_lock); + for (i = 0; i < 2; i++) { + list_for_each_entry(c, &ubifs_infos, infos_list) { + long dirty_zn_cnt; + + if (!mutex_trylock(&c->umount_mutex)) { + /* + * Some un-mount is in progress, it will + * certainly free memory, so just return. 
+ */ + spin_unlock(&ubifs_infos_lock); + return -1; + } + + dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); + + if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || + c->ro_mount || c->ro_error) { + mutex_unlock(&c->umount_mutex); + continue; + } + + if (c->cmt_state != COMMIT_RESTING) { + spin_unlock(&ubifs_infos_lock); + mutex_unlock(&c->umount_mutex); + return -1; + } + + if (i == 1) { + list_move_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + ubifs_request_bg_commit(c); + mutex_unlock(&c->umount_mutex); + return -1; + } + mutex_unlock(&c->umount_mutex); + } + } + spin_unlock(&ubifs_infos_lock); + + return 0; +} + +int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc) +{ + int nr = sc->nr_to_scan; + int freed, contention = 0; + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); + + if (nr == 0) + /* + * Due to the way UBIFS updates the clean znode counter it may + * temporarily be negative. + */ + return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; + + if (!clean_zn_cnt) { + /* + * No clean znodes, nothing to reap. All we can do in this case + * is to kick background threads to start commit, which will + * probably make clean znodes which, in turn, will be freeable. + * And we return -1 which means will make VM call us again + * later. + */ + dbg_tnc("no clean znodes, kick a thread"); + return kick_a_thread(); + } + + freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough old znodes, try to free young ones"); + freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough young znodes, free all"); + freed += shrink_tnc_trees(nr - freed, 0, &contention); + + if (!freed && contention) { + dbg_tnc("freed nothing, but contention"); + return -1; + } + +out: + dbg_tnc("%d znodes were freed, requested %d", freed, nr); + return freed; +} diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c new file mode 100644 index 00000000..529be058 --- /dev/null +++ b/fs/ubifs/super.c @@ -0,0 +1,2321 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS initialization and VFS superblock operations. Some + * initialization stuff which is rather large and complex is placed at + * corresponding subsystems, but most of it is here. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs.h" + +/* + * Maximum amount of memory we may 'kmalloc()' without worrying that we are + * allocating too much. 
+ */ +#define UBIFS_KMALLOC_OK (128*1024) + +/* Slab cache for UBIFS inodes */ +struct kmem_cache *ubifs_inode_slab; + +/* UBIFS TNC shrinker description */ +static struct shrinker ubifs_shrinker_info = { + .shrink = ubifs_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +/** + * validate_inode - validate inode. + * @c: UBIFS file-system description object + * @inode: the inode to validate + * + * This is a helper function for 'ubifs_iget()' which validates various fields + * of a newly built inode to make sure they contain sane values and prevent + * possible vulnerabilities. Returns zero if the inode is all right and + * a non-zero error code if not. + */ +static int validate_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + const struct ubifs_inode *ui = ubifs_inode(inode); + + if (inode->i_size > c->max_inode_sz) { + ubifs_err("inode is too large (%lld)", + (long long)inode->i_size); + return 1; + } + + if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { + ubifs_err("unknown compression type %d", ui->compr_type); + return 2; + } + + if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) + return 3; + + if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) + return 4; + + if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG) + return 5; + + if (!ubifs_compr_present(ui->compr_type)) { + ubifs_warn("inode %lu uses '%s' compression, but it was not " + "compiled in", inode->i_ino, + ubifs_compr_name(ui->compr_type)); + } + + err = dbg_check_dir_size(c, inode); + return err; +} + +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) +{ + int err; + union ubifs_key key; + struct ubifs_ino_node *ino; + struct ubifs_info *c = sb->s_fs_info; + struct inode *inode; + struct ubifs_inode *ui; + + dbg_gen("inode %lu", inum); + + inode = iget_locked(sb, inum); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + ui = ubifs_inode(inode); + + ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ino) { + err = -ENOMEM; + goto out; + } + + ino_key_init(c, &key, inode->i_ino); + + err = ubifs_tnc_lookup(c, &key, ino); + if (err) + goto out_ino; + + inode->i_flags |= (S_NOCMTIME | S_NOATIME); + inode->i_nlink = le32_to_cpu(ino->nlink); + inode->i_uid = le32_to_cpu(ino->uid); + inode->i_gid = le32_to_cpu(ino->gid); + inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); + inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); + inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); + inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); + inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); + inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); + inode->i_mode = le32_to_cpu(ino->mode); + inode->i_size = le64_to_cpu(ino->size); + + ui->data_len = le32_to_cpu(ino->data_len); + ui->flags = le32_to_cpu(ino->flags); + ui->compr_type = le16_to_cpu(ino->compr_type); + ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); + ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + ui->xattr_size = le32_to_cpu(ino->xattr_size); + ui->xattr_names = le32_to_cpu(ino->xattr_names); + ui->synced_i_size = ui->ui_size = inode->i_size; + + ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 
1 : 0; + + err = validate_inode(c, inode); + if (err) + goto out_invalid; + + /* Disable read-ahead */ + inode->i_mapping->backing_dev_info = &c->bdi; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + if (ui->xattr) { + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + } else if (ui->data_len != 0) { + err = 10; + goto out_invalid; + } + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + if (ui->data_len != 0) { + err = 11; + goto out_invalid; + } + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { + err = 12; + goto out_invalid; + } + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + break; + case S_IFBLK: + case S_IFCHR: + { + dev_t rdev; + union ubifs_dev_desc *dev; + + ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + + dev = (union ubifs_dev_desc *)ino->data; + if (ui->data_len == sizeof(dev->new)) + rdev = new_decode_dev(le32_to_cpu(dev->new)); + else if (ui->data_len == sizeof(dev->huge)) + rdev = huge_decode_dev(le64_to_cpu(dev->huge)); + else { + err = 13; + goto out_invalid; + } + memcpy(ui->data, ino->data, ui->data_len); + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + case S_IFSOCK: + case S_IFIFO: + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, 0); + if (ui->data_len != 0) { + err = 14; + goto out_invalid; + } + break; + default: + err = 15; + goto out_invalid; + } + + kfree(ino); + ubifs_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +out_invalid: + ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); + dbg_dump_node(c, ino); + dbg_dump_inode(c, inode); + err = -EINVAL; +out_ino: + kfree(ino); +out: + ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err); + iget_failed(inode); + return ERR_PTR(err); +} + +static struct inode *ubifs_alloc_inode(struct super_block *sb) +{ + struct ubifs_inode *ui; + + ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS); + if (!ui) + return NULL; + + memset((void *)ui + sizeof(struct inode), 0, + sizeof(struct ubifs_inode) - sizeof(struct inode)); + mutex_init(&ui->ui_mutex); + spin_lock_init(&ui->ui_lock); + return &ui->vfs_inode; +}; + +static void ubifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ubifs_inode *ui = ubifs_inode(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ubifs_inode_slab, ui); +} + +static void ubifs_destroy_inode(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + kfree(ui->data); + call_rcu(&inode->i_rcu, ubifs_i_callback); +} + +/* + * Note, Linux write-back code calls this without 'i_mutex'. 
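'ubifs_alloc_inode()' above embeds the VFS inode inside 'struct ubifs_inode' and hands the VFS a pointer to the embedded member; 'ubifs_inode()' later recovers the containing structure via 'container_of()'. A small user-space sketch of that embedding pattern, with illustrative structure names:

#include <stddef.h>
#include <stdio.h>

/* Simplified version of the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { unsigned long ino; };

struct my_inode {
	int private_data;
	struct vfs_inode vfs_inode;	/* member handed out to the "VFS" */
};

static struct my_inode *my_inode(struct vfs_inode *inode)
{
	return container_of(inode, struct my_inode, vfs_inode);
}

int main(void)
{
	struct my_inode mi = { .private_data = 42, .vfs_inode = { .ino = 1 } };
	struct vfs_inode *inode = &mi.vfs_inode;

	printf("%d\n", my_inode(inode)->private_data);	/* prints 42 */
	return 0;
}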
+ */ +static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err = 0; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(!ui->xattr); + if (is_bad_inode(inode)) + return 0; + + mutex_lock(&ui->ui_mutex); + /* + * Due to races between write-back forced by budgeting + * (see 'sync_some_inodes()') and pdflush write-back, the inode may + * have already been synchronized, do not do this again. This might + * also happen if it was synchronized in an VFS operation, e.g. + * 'ubifs_link()'. + */ + if (!ui->dirty) { + mutex_unlock(&ui->ui_mutex); + return 0; + } + + /* + * As an optimization, do not write orphan inodes to the media just + * because this is not needed. + */ + dbg_gen("inode %lu, mode %#x, nlink %u", + inode->i_ino, (int)inode->i_mode, inode->i_nlink); + if (inode->i_nlink) { + err = ubifs_jnl_write_inode(c, inode); + if (err) + ubifs_err("can't write inode %lu, error %d", + inode->i_ino, err); + else + err = dbg_check_inode_size(c, inode, ui->ui_size); + } + + ui->dirty = 0; + mutex_unlock(&ui->ui_mutex); + ubifs_release_dirty_inode_budget(c, ui); + return err; +} + +static void ubifs_evict_inode(struct inode *inode) +{ + int err; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + if (ui->xattr) + /* + * Extended attribute inode deletions are fully handled in + * 'ubifs_removexattr()'. These inodes are special and have + * limited usage, so there is nothing to do here. + */ + goto out; + + dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); + ubifs_assert(!atomic_read(&inode->i_count)); + + truncate_inode_pages(&inode->i_data, 0); + + if (inode->i_nlink) + goto done; + + if (is_bad_inode(inode)) + goto out; + + ui->ui_size = inode->i_size = 0; + err = ubifs_jnl_delete_inode(c, inode); + if (err) + /* + * Worst case we have a lost orphan inode wasting space, so a + * simple error message is OK here. 
+ */ + ubifs_err("can't delete inode %lu, error %d", + inode->i_ino, err); + +out: + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } +done: + end_writeback(inode); +} + +static void ubifs_dirty_inode(struct inode *inode, int flags) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + if (!ui->dirty) { + ui->dirty = 1; + dbg_gen("inode %lu", inode->i_ino); + } +} + +static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ubifs_info *c = dentry->d_sb->s_fs_info; + unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; + + free = ubifs_get_free_space(c); + dbg_gen("free space %lld bytes (%lld blocks)", + free, free >> UBIFS_BLOCK_SHIFT); + + buf->f_type = UBIFS_SUPER_MAGIC; + buf->f_bsize = UBIFS_BLOCK_SIZE; + buf->f_blocks = c->block_cnt; + buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; + if (free > c->report_rp_size) + buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; + else + buf->f_bavail = 0; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_namelen = UBIFS_MAX_NLEN; + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); + ubifs_assert(buf->f_bfree <= c->block_cnt); + return 0; +} + +static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct ubifs_info *c = mnt->mnt_sb->s_fs_info; + + if (c->mount_opts.unmount_mode == 2) + seq_printf(s, ",fast_unmount"); + else if (c->mount_opts.unmount_mode == 1) + seq_printf(s, ",norm_unmount"); + + if (c->mount_opts.bulk_read == 2) + seq_printf(s, ",bulk_read"); + else if (c->mount_opts.bulk_read == 1) + seq_printf(s, ",no_bulk_read"); + + if (c->mount_opts.chk_data_crc == 2) + seq_printf(s, ",chk_data_crc"); + else if (c->mount_opts.chk_data_crc == 1) + seq_printf(s, ",no_chk_data_crc"); + + if (c->mount_opts.override_compr) { + seq_printf(s, ",compr=%s", + ubifs_compr_name(c->mount_opts.compr_type)); + } + + return 0; +} + +static int ubifs_sync_fs(struct super_block *sb, int wait) +{ + int i, err; + struct ubifs_info *c = sb->s_fs_info; + + /* + * Zero @wait is just an advisory thing to help the file system shove + * lots of data into the queues, and there will be the second + * '->sync_fs()' call, with non-zero @wait. + */ + if (!wait) + return 0; + + /* + * Synchronize write buffers, because 'ubifs_run_commit()' does not + * do this if it waits for an already running commit. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + /* + * Strictly speaking, it is not necessary to commit the journal here, + * synchronizing write-buffers would be enough. But committing makes + * UBIFS free space predictions much more accurate, so we want to let + * the user be able to get more accurate results of 'statfs()' after + * they synchronize the file system. + */ + err = ubifs_run_commit(c); + if (err) + return err; + + return ubi_sync(c->vi.ubi_num); +} + +/** + * init_constants_early - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This function initialize UBIFS constants which do not need the superblock to + * be read. It also checks that the UBI volume satisfies basic UBIFS + * requirements. Returns zero in case of success and a negative error code in + * case of failure. 
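The geometry set-up in 'init_constants_early()' just below derives the I/O shift from a power-of-two minimum I/O size and clamps anything smaller than 8 bytes to 8 so node alignment still holds. A quick sketch of that rule, assuming a GCC-style '__builtin_clz()' for the 'fls()' helper:

#include <stdio.h>

/* Like the kernel's fls(): 1-based position of the most significant bit */
static int fls_(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int sizes[] = { 1, 4, 512, 2048 };

	for (int i = 0; i < 4; i++) {
		unsigned int io = sizes[i];
		int shift = fls_(io) - 1;

		if (io < 8) {		/* same clamping as the function below */
			io = 8;
			shift = 3;
		}
		printf("min_io_size %u -> %u, shift %d\n", sizes[i], io, shift);
	}
	return 0;
}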
+ */ +static int init_constants_early(struct ubifs_info *c) +{ + if (c->vi.corrupted) { + ubifs_warn("UBI volume is corrupted - read-only mode"); + c->ro_media = 1; + } + + if (c->di.ro_mode) { + ubifs_msg("read-only UBI device"); + c->ro_media = 1; + } + + if (c->vi.vol_type == UBI_STATIC_VOLUME) { + ubifs_msg("static UBI volume - read-only mode"); + c->ro_media = 1; + } + + c->leb_cnt = c->vi.size; + c->leb_size = c->vi.usable_leb_size; + c->leb_start = c->di.leb_start; + c->half_leb_size = c->leb_size / 2; + c->min_io_size = c->di.min_io_size; + c->min_io_shift = fls(c->min_io_size) - 1; + c->max_write_size = c->di.max_write_size; + c->max_write_shift = fls(c->max_write_size) - 1; + + if (c->leb_size < UBIFS_MIN_LEB_SZ) { + ubifs_err("too small LEBs (%d bytes), min. is %d bytes", + c->leb_size, UBIFS_MIN_LEB_SZ); + return -EINVAL; + } + + if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err("too few LEBs (%d), min. is %d", + c->leb_cnt, UBIFS_MIN_LEB_CNT); + return -EINVAL; + } + + if (!is_power_of_2(c->min_io_size)) { + ubifs_err("bad min. I/O size %d", c->min_io_size); + return -EINVAL; + } + + /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (c->max_write_size < c->min_io_size || + c->max_write_size % c->min_io_size || + !is_power_of_2(c->max_write_size)) { + ubifs_err("bad write buffer size %d for %d min. I/O unit", + c->max_write_size, c->min_io_size); + return -EINVAL; + } + + /* + * UBIFS aligns all node to 8-byte boundary, so to make function in + * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is + * less than 8. + */ + if (c->min_io_size < 8) { + c->min_io_size = 8; + c->min_io_shift = 3; + if (c->max_write_size < c->min_io_size) { + c->max_write_size = c->min_io_size; + c->max_write_shift = c->min_io_shift; + } + } + + c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); + c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size); + + /* + * Initialize node length ranges which are mostly needed for node + * length validation. + */ + c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ; + c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ; + c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ; + c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ; + c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ; + c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ; + + c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; + c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ; + c->ranges[UBIFS_ORPH_NODE].min_len = + UBIFS_ORPH_NODE_SZ + sizeof(__le64); + c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size; + c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ; + c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ; + /* + * Minimum indexing node size is amended later when superblock is + * read and the key length is known. + */ + c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ; + /* + * Maximum indexing node size is amended later when superblock is + * read and the fanout is known. + */ + c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; + + /* + * Initialize dead and dark LEB space watermarks. See gc.c for comments + * about these values. 
+ */ + c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); + c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space. + */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; + + /* Buffer size for bulk-reads */ + c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; + if (c->max_bu_buf_len > c->leb_size) + c->max_bu_buf_len = c->leb_size; + return 0; +} + +/** + * bud_wbuf_callback - bud LEB write-buffer synchronization call-back. + * @c: UBIFS file-system description object + * @lnum: LEB the write-buffer was synchronized to + * @free: how many free bytes left in this LEB + * @pad: how many bytes were padded + * + * This is a callback function which is called by the I/O unit when the + * write-buffer is synchronized. We need this to correctly maintain space + * accounting in bud logical eraseblocks. This function returns zero in case of + * success and a negative error code in case of failure. + * + * This function actually belongs to the journal, but we keep it here because + * we want to keep it static. + */ +static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) +{ + return ubifs_update_one_lp(c, lnum, free, pad, 0, 0); +} + +/* + * init_constants_sb - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the superblock has been read. It also checks various UBIFS parameters and + * makes sure they are all right. Returns zero in case of success and a + * negative error code in case of failure. + */ +static int init_constants_sb(struct ubifs_info *c) +{ + int tmp, err; + long long tmp64; + + c->main_bytes = (long long)c->main_lebs * c->leb_size; + c->max_znode_sz = sizeof(struct ubifs_znode) + + c->fanout * sizeof(struct ubifs_zbranch); + + tmp = ubifs_idx_node_sz(c, 1); + c->ranges[UBIFS_IDX_NODE].min_len = tmp; + c->min_idx_node_sz = ALIGN(tmp, 8); + + tmp = ubifs_idx_node_sz(c, c->fanout); + c->ranges[UBIFS_IDX_NODE].max_len = tmp; + c->max_idx_node_sz = ALIGN(tmp, 8); + + /* Make sure LEB size is large enough to fit full commit */ + tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; + tmp = ALIGN(tmp, c->min_io_size); + if (tmp > c->leb_size) { + dbg_err("too small LEB size %d, at least %d needed", + c->leb_size, tmp); + return -EINVAL; + } + + /* + * Make sure that the log is large enough to fit reference nodes for + * all buds plus one reserved LEB. + */ + tmp64 = c->max_bud_bytes + c->leb_size - 1; + c->max_bud_cnt = div_u64(tmp64, c->leb_size); + tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); + tmp /= c->leb_size; + tmp += 1; + if (c->log_lebs < tmp) { + dbg_err("too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); + return -EINVAL; + } + + /* + * When budgeting we assume worst-case scenarios when the pages are not + * be compressed and direntries are of the maximum size. + * + * Note, data, which may be stored in inodes is budgeted separately, so + * it is not included into 'c->bi.inode_budget'. + */ + c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; + c->bi.inode_budget = UBIFS_INO_NODE_SZ; + c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ; + + /* + * When the amount of flash space used by buds becomes + * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit. 
+ * The writers are unblocked when the commit is finished. To avoid + * writers to be blocked UBIFS initiates background commit in advance, + * when number of bud bytes becomes above the limit defined below. + */ + c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4; + + /* + * Ensure minimum journal size. All the bytes in the journal heads are + * considered to be used, when calculating the current journal usage. + * Consequently, if the journal is too small, UBIFS will treat it as + * always full. + */ + tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1; + if (c->bg_bud_bytes < tmp64) + c->bg_bud_bytes = tmp64; + if (c->max_bud_bytes < tmp64 + c->leb_size) + c->max_bud_bytes = tmp64 + c->leb_size; + + err = ubifs_calc_lpt_geom(c); + if (err) + return err; + + /* Initialize effective LEB size used in budgeting calculations */ + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; + return 0; +} + +/* + * init_constants_master - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the master node has been read. It also checks various UBIFS parameters and + * makes sure they are all right. + */ +static void init_constants_master(struct ubifs_info *c) +{ + long long tmp64; + + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->report_rp_size = ubifs_reported_space(c, c->rp_size); + + /* + * Calculate total amount of FS blocks. This number is not used + * internally because it does not make much sense for UBIFS, but it is + * necessary to report something for the 'statfs()' call. + * + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, minimum LEBs for the index, and assume only one journal + * head is available. + */ + tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1; + tmp64 *= (long long)c->leb_size - c->leb_overhead; + tmp64 = ubifs_reported_space(c, tmp64); + c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; +} + +/** + * take_gc_lnum - reserve GC LEB. + * @c: UBIFS file-system description object + * + * This function ensures that the LEB reserved for garbage collection is marked + * as "taken" in lprops. We also have to set free space to LEB size and dirty + * space to zero, because lprops may contain out-of-date information if the + * file-system was un-mounted before it has been committed. This function + * returns zero in case of success and a negative error code in case of + * failure. + */ +static int take_gc_lnum(struct ubifs_info *c) +{ + int err; + + if (c->gc_lnum == -1) { + ubifs_err("no LEB for GC"); + return -EINVAL; + } + + /* And we have to tell lprops that this LEB is taken */ + err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, + LPROPS_TAKEN, 0, 0); + return err; +} + +/** + * alloc_wbufs - allocate write-buffers. + * @c: UBIFS file-system description object + * + * This helper function allocates and initializes UBIFS write-buffers. Returns + * zero in case of success and %-ENOMEM in case of failure. 
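The background-commit threshold computed above, 'bg_bud_bytes = (max_bud_bytes * 13) >> 4', is 13/16 (about 81%) of the hard bud-bytes limit, so the commit starts before writers would have to be blocked. A worked example with a hypothetical 8 MiB limit:

#include <stdio.h>

int main(void)
{
	long long max_bud_bytes = 8LL * 1024 * 1024;	/* hypothetical value */
	long long bg_bud_bytes = (max_bud_bytes * 13) >> 4;

	/* Prints 6815744, i.e. 6.5 MiB: background commit kicks in well
	 * before the 8 MiB point at which writers would be blocked */
	printf("%lld\n", bg_bud_bytes);
	return 0;
}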
+ */ +static int alloc_wbufs(struct ubifs_info *c) +{ + int i, err; + + c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead), + GFP_KERNEL); + if (!c->jheads) + return -ENOMEM; + + /* Initialize journal heads */ + for (i = 0; i < c->jhead_cnt; i++) { + INIT_LIST_HEAD(&c->jheads[i].buds_list); + err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); + if (err) + return err; + + c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; + c->jheads[i].wbuf.jhead = i; + c->jheads[i].grouped = 1; + } + + c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; + /* + * Garbage Collector head likely contains long-term data and + * does not need to be synchronized by timer. Also GC head nodes are + * not grouped. + */ + c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; + c->jheads[GCHD].wbuf.no_timer = 1; + c->jheads[GCHD].grouped = 0; + + return 0; +} + +/** + * free_wbufs - free write-buffers. + * @c: UBIFS file-system description object + */ +static void free_wbufs(struct ubifs_info *c) +{ + int i; + + if (c->jheads) { + for (i = 0; i < c->jhead_cnt; i++) { + kfree(c->jheads[i].wbuf.buf); + kfree(c->jheads[i].wbuf.inodes); + } + kfree(c->jheads); + c->jheads = NULL; + } +} + +/** + * free_orphans - free orphans. + * @c: UBIFS file-system description object + */ +static void free_orphans(struct ubifs_info *c) +{ + struct ubifs_orphan *orph; + + while (c->orph_dnext) { + orph = c->orph_dnext; + c->orph_dnext = orph->dnext; + list_del(&orph->list); + kfree(orph); + } + + while (!list_empty(&c->orph_list)) { + orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); + list_del(&orph->list); + kfree(orph); + dbg_err("orphan list not empty at unmount"); + } + + vfree(c->orph_buf); + c->orph_buf = NULL; +} + +/** + * free_buds - free per-bud objects. + * @c: UBIFS file-system description object + */ +static void free_buds(struct ubifs_info *c) +{ + struct rb_node *this = c->buds.rb_node; + struct ubifs_bud *bud; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + bud = rb_entry(this, struct ubifs_bud, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &bud->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(bud); + } + } +} + +/** + * check_volume_empty - check if the UBI volume is empty. + * @c: UBIFS file-system description object + * + * This function checks if the UBIFS volume is empty by looking if its LEBs are + * mapped or not. The result of checking is stored in the @c->empty variable. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int check_volume_empty(struct ubifs_info *c) +{ + int lnum, err; + + c->empty = 1; + for (lnum = 0; lnum < c->leb_cnt; lnum++) { + err = ubi_is_mapped(c->ubi, lnum); + if (unlikely(err < 0)) + return err; + if (err == 1) { + c->empty = 0; + break; + } + + cond_resched(); + } + + return 0; +} + +/* + * UBIFS mount options. 
+ * + * Opt_fast_unmount: do not run a journal commit before un-mounting + * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_bulk_read: enable bulk-reads + * Opt_no_bulk_read: disable bulk-reads + * Opt_chk_data_crc: check CRCs when reading data nodes + * Opt_no_chk_data_crc: do not check CRCs when reading data nodes + * Opt_override_compr: override default compressor + * Opt_err: just end of array marker + */ +enum { + Opt_fast_unmount, + Opt_norm_unmount, + Opt_bulk_read, + Opt_no_bulk_read, + Opt_chk_data_crc, + Opt_no_chk_data_crc, + Opt_override_compr, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_fast_unmount, "fast_unmount"}, + {Opt_norm_unmount, "norm_unmount"}, + {Opt_bulk_read, "bulk_read"}, + {Opt_no_bulk_read, "no_bulk_read"}, + {Opt_chk_data_crc, "chk_data_crc"}, + {Opt_no_chk_data_crc, "no_chk_data_crc"}, + {Opt_override_compr, "compr=%s"}, + {Opt_err, NULL}, +}; + +/** + * parse_standard_option - parse a standard mount option. + * @option: the option to parse + * + * Normally, standard mount options like "sync" are passed to file-systems as + * flags. However, when a "rootflags=" kernel boot parameter is used, they may + * be present in the options string. This function tries to deal with this + * situation and parse standard options. Returns 0 if the option was not + * recognized, and the corresponding integer flag if it was. + * + * UBIFS is only interested in the "sync" option, so do not check for anything + * else. + */ +static int parse_standard_option(const char *option) +{ + ubifs_msg("parse %s", option); + if (!strcmp(option, "sync")) + return MS_SYNCHRONOUS; + return 0; +} + +/** + * ubifs_parse_options - parse mount parameters. + * @c: UBIFS file-system description object + * @options: parameters to parse + * @is_remount: non-zero if this is FS re-mount + * + * This function parses UBIFS mount options and returns zero in case success + * and a negative error code in case of failure. + */ +static int ubifs_parse_options(struct ubifs_info *c, char *options, + int is_remount) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 0; + + while ((p = strsep(&options, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + /* + * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. + * We accept them in order to be backward-compatible. But this + * should be removed at some point. 
+ */ + case Opt_fast_unmount: + c->mount_opts.unmount_mode = 2; + break; + case Opt_norm_unmount: + c->mount_opts.unmount_mode = 1; + break; + case Opt_bulk_read: + c->mount_opts.bulk_read = 2; + c->bulk_read = 1; + break; + case Opt_no_bulk_read: + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + break; + case Opt_chk_data_crc: + c->mount_opts.chk_data_crc = 2; + c->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + c->mount_opts.chk_data_crc = 1; + c->no_chk_data_crc = 1; + break; + case Opt_override_compr: + { + char *name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr_type = UBIFS_COMPR_NONE; + else if (!strcmp(name, "lzo")) + c->mount_opts.compr_type = UBIFS_COMPR_LZO; + else if (!strcmp(name, "zlib")) + c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; + else { + ubifs_err("unknown compressor \"%s\"", name); + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = 1; + c->default_compr = c->mount_opts.compr_type; + break; + } + default: + { + unsigned long flag; + struct super_block *sb = c->vfs_sb; + + flag = parse_standard_option(p); + if (!flag) { + ubifs_err("unrecognized mount option \"%s\" " + "or missing value", p); + return -EINVAL; + } + sb->s_flags |= flag; + break; + } + } + } + + return 0; +} + +/** + * destroy_journal - destroy journal data structures. + * @c: UBIFS file-system description object + * + * This function destroys journal data structures including those that may have + * been created by recovery functions. + */ +static void destroy_journal(struct ubifs_info *c) +{ + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + list_del(&ucleb->list); + kfree(ucleb); + } + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + list_del(&bud->list); + kfree(bud); + } + ubifs_destroy_idx_gc(c); + ubifs_destroy_size_tree(c); + ubifs_tnc_close(c); + free_buds(c); +} + +/** + * bu_init - initialize bulk-read information. + * @c: UBIFS file-system description object + */ +static void bu_init(struct ubifs_info *c) +{ + ubifs_assert(c->bulk_read == 1); + + if (c->bu.buf) + return; /* Already initialized */ + +again: + c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN); + if (!c->bu.buf) { + if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) { + c->max_bu_buf_len = UBIFS_KMALLOC_OK; + goto again; + } + + /* Just disable bulk-read */ + ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, " + "disabling it", c->max_bu_buf_len); + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + return; + } +} + +/** + * check_free_space - check if there is enough free space to mount. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free space to be mounted in + * read/write mode. UBIFS must always have some free space to allow deletions. + */ +static int check_free_space(struct ubifs_info *c) +{ + ubifs_assert(c->dark_wm > 0); + if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { + ubifs_err("insufficient free space to mount in R/W mode"); + dbg_dump_budg(c, &c->bi); + dbg_dump_lprops(c); + return -ENOSPC; + } + return 0; +} + +/** + * mount_ubifs - mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * This function mounts UBIFS file system. Returns zero in case of success and + * a negative error code in case of failure. 
+ * + * Note, the function does not de-allocate resources it it fails half way + * through, and the caller has to do this instead. + */ +static int mount_ubifs(struct ubifs_info *c) +{ + int err; + long long x; + size_t sz; + + c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); + err = init_constants_early(c); + if (err) + return err; + + err = ubifs_debugging_init(c); + if (err) + return err; + + err = check_volume_empty(c); + if (err) + goto out_free; + + if (c->empty && (c->ro_mount || c->ro_media)) { + /* + * This UBI volume is empty, and read-only, or the file system + * is mounted read-only - we cannot format it. + */ + ubifs_err("can't format empty UBI volume: read-only %s", + c->ro_media ? "UBI volume" : "mount"); + err = -EROFS; + goto out_free; + } + + if (c->ro_media && !c->ro_mount) { + ubifs_err("cannot mount read-write - read-only media"); + err = -EROFS; + goto out_free; + } + + /* + * The requirement for the buffer is that it should fit indexing B-tree + * height amount of integers. We assume the height if the TNC tree will + * never exceed 64. + */ + err = -ENOMEM; + c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL); + if (!c->bottom_up_buf) + goto out_free; + + c->sbuf = vmalloc(c->leb_size); + if (!c->sbuf) + goto out_free; + + if (!c->ro_mount) { + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) + goto out_free; + } + + if (c->bulk_read == 1) + bu_init(c); + + if (!c->ro_mount) { + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + GFP_KERNEL); + if (!c->write_reserve_buf) + goto out_free; + } + + c->mounting = 1; + + err = ubifs_read_superblock(c); + if (err) + goto out_free; + + /* + * Make sure the compressor which is set as default in the superblock + * or overridden by mount options is actually compiled in. + */ + if (!ubifs_compr_present(c->default_compr)) { + ubifs_err("'compressor \"%s\" is not compiled in", + ubifs_compr_name(c->default_compr)); + err = -ENOTSUPP; + goto out_free; + } + + err = init_constants_sb(c); + if (err) + goto out_free; + + sz = ALIGN(c->max_idx_node_sz, c->min_io_size); + sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); + c->cbuf = kmalloc(sz, GFP_NOFS); + if (!c->cbuf) { + err = -ENOMEM; + goto out_free; + } + + err = alloc_wbufs(c); + if (err) + goto out_cbuf; + + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); + if (!c->ro_mount) { + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out_wbufs; + } + wake_up_process(c->bgt); + } + + err = ubifs_read_master(c); + if (err) + goto out_master; + + init_constants_master(c); + + if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { + ubifs_msg("recovery needed"); + c->need_recovery = 1; + } + + if (c->need_recovery && !c->ro_mount) { + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out_master; + } + + err = ubifs_lpt_init(c, 1, !c->ro_mount); + if (err) + goto out_master; + + if (!c->ro_mount && c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out_master; + } + + if (!c->ro_mount) { + /* + * Set the "dirty" flag so that if we reboot uncleanly we + * will notice this immediately on the next mount. 
+ */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out_lpt; + } + + err = dbg_check_idx_size(c, c->bi.old_idx_sz); + if (err) + goto out_lpt; + + err = ubifs_replay_journal(c); + if (err) + goto out_journal; + + /* Calculate 'min_idx_lebs' after journal replay */ + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); + if (err) + goto out_orphans; + + if (!c->ro_mount) { + int lnum; + + err = check_free_space(c); + if (err) + goto out_orphans; + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out_orphans; + } + + if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + err = ubifs_rcvry_gc_commit(c); + if (err) + goto out_orphans; + } else { + err = take_gc_lnum(c); + if (err) + goto out_orphans; + + /* + * GC LEB may contain garbage if there was an unclean + * reboot, and it should be un-mapped. + */ + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + goto out_orphans; + } + + err = dbg_check_lprops(c); + if (err) + goto out_orphans; + } else if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + } else { + /* + * Even if we mount read-only, we have to set space in GC LEB + * to proper value because this affects UBIFS free space + * reporting. We do not want to have a situation when + * re-mounting from R/O to R/W changes amount of free space. + */ + err = take_gc_lnum(c); + if (err) + goto out_orphans; + } + + spin_lock(&ubifs_infos_lock); + list_add_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + if (c->need_recovery) { + if (c->ro_mount) + ubifs_msg("recovery deferred"); + else { + c->need_recovery = 0; + ubifs_msg("recovery completed"); + /* + * GC LEB has to be empty and taken at this point. But + * the journal head LEBs may also be accounted as + * "empty taken" if they are empty. + */ + ubifs_assert(c->lst.taken_empty_lebs > 0); + } + } else + ubifs_assert(c->lst.taken_empty_lebs > 0); + + err = dbg_check_filesystem(c); + if (err) + goto out_infos; + + err = dbg_debugfs_init_fs(c); + if (err) + goto out_infos; + + c->mounting = 0; + + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", + c->vi.ubi_num, c->vi.vol_id, c->vi.name); + if (c->ro_mount) + ubifs_msg("mounted read-only"); + x = (long long)c->main_lebs * c->leb_size; + ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->main_lebs); + x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; + ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); + ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); + ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); + + dbg_msg("compiled on: " __DATE__ " at " __TIME__); + dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); + dbg_msg("max. 
write size: %d bytes", c->max_write_size); + dbg_msg("LEB size: %d bytes (%d KiB)", + c->leb_size, c->leb_size >> 10); + dbg_msg("data journal heads: %d", + c->jhead_cnt - NONDATA_JHEADS_CNT); + dbg_msg("UUID: %pUB", c->uuid); + dbg_msg("big_lpt %d", c->big_lpt); + dbg_msg("log LEBs: %d (%d - %d)", + c->log_lebs, UBIFS_LOG_LNUM, c->log_last); + dbg_msg("LPT area LEBs: %d (%d - %d)", + c->lpt_lebs, c->lpt_first, c->lpt_last); + dbg_msg("orphan area LEBs: %d (%d - %d)", + c->orph_lebs, c->orph_first, c->orph_last); + dbg_msg("main area LEBs: %d (%d - %d)", + c->main_lebs, c->main_first, c->leb_cnt - 1); + dbg_msg("index LEBs: %d", c->lst.idx_lebs); + dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", + c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, + c->bi.old_idx_sz >> 20); + dbg_msg("key hash type: %d", c->key_hash_type); + dbg_msg("tree fanout: %d", c->fanout); + dbg_msg("reserved GC LEB: %d", c->gc_lnum); + dbg_msg("first main LEB: %d", c->main_first); + dbg_msg("max. znode size %d", c->max_znode_sz); + dbg_msg("max. index node size %d", c->max_idx_node_sz); + dbg_msg("node sizes: data %zu, inode %zu, dentry %zu", + UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); + dbg_msg("node sizes: trun %zu, sb %zu, master %zu", + UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); + dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", + UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); + dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", + UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, + UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); + dbg_msg("dead watermark: %d", c->dead_wm); + dbg_msg("dark watermark: %d", c->dark_wm); + dbg_msg("LEB overhead: %d", c->leb_overhead); + x = (long long)c->main_lebs * c->dark_wm; + dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", + x, x >> 10, x >> 20); + dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", + c->max_bud_bytes, c->max_bud_bytes >> 10, + c->max_bud_bytes >> 20); + dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", + c->bg_bud_bytes, c->bg_bud_bytes >> 10, + c->bg_bud_bytes >> 20); + dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", + c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); + dbg_msg("max. seq. number: %llu", c->max_sqnum); + dbg_msg("commit number: %llu", c->cmt_no); + + return 0; + +out_infos: + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); +out_orphans: + free_orphans(c); +out_journal: + destroy_journal(c); +out_lpt: + ubifs_lpt_free(c, 0); +out_master: + kfree(c->mst_node); + kfree(c->rcvrd_mst_node); + if (c->bgt) + kthread_stop(c->bgt); +out_wbufs: + free_wbufs(c); +out_cbuf: + kfree(c->cbuf); +out_free: + kfree(c->write_reserve_buf); + kfree(c->bu.buf); + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + ubifs_debugging_exit(c); + return err; +} + +/** + * ubifs_umount - un-mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * Note, this function is called to free allocated resourced when un-mounting, + * as well as free resources when an error occurred while we were half way + * through mounting (error path cleanup function). So it has to make sure the + * resource was actually allocated before freeing it. 
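As the comment above notes, 'ubifs_umount()' doubles as the error-path cleanup, so it must cope with resources that were never allocated; like the 'out_*' labels in 'mount_ubifs()', it leans largely on explicit checks plus the fact that 'kfree()'/'vfree()' (and user-space 'free()') accept NULL. A small sketch of that defensive-cleanup pattern:

#include <stdlib.h>

struct ctx {
	void *buf_a;
	void *buf_b;
};

static void ctx_cleanup(struct ctx *c)
{
	/* free() (like kfree()/vfree()) is a no-op on NULL, so this is safe
	 * even for buffers whose allocation never happened */
	free(c->buf_a);
	free(c->buf_b);
	c->buf_a = c->buf_b = NULL;
}

int main(void)
{
	struct ctx c = { .buf_a = malloc(16), .buf_b = NULL };

	ctx_cleanup(&c);	/* works whether or not buf_b was allocated */
	return 0;
}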
+ */ +static void ubifs_umount(struct ubifs_info *c) +{ + dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + + dbg_debugfs_exit_fs(c); + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); + + if (c->bgt) + kthread_stop(c->bgt); + + destroy_journal(c); + free_wbufs(c); + free_orphans(c); + ubifs_lpt_free(c, 0); + + kfree(c->cbuf); + kfree(c->rcvrd_mst_node); + kfree(c->mst_node); + kfree(c->write_reserve_buf); + kfree(c->bu.buf); + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + ubifs_debugging_exit(c); +} + +/** + * ubifs_remount_rw - re-mount in read-write mode. + * @c: UBIFS file-system description object + * + * UBIFS avoids allocating many unnecessary resources when mounted in read-only + * mode. This function allocates the needed resources and re-mounts UBIFS in + * read-write mode. + */ +static int ubifs_remount_rw(struct ubifs_info *c) +{ + int err, lnum; + + if (c->rw_incompat) { + ubifs_err("the file-system is not R/W-compatible"); + ubifs_msg("on-flash format version is w%d/r%d, but software " + "only supports up to version w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + return -EROFS; + } + + mutex_lock(&c->umount_mutex); + dbg_save_space_info(c); + c->remounting_rw = 1; + c->ro_mount = 0; + + err = check_free_space(c); + if (err) + goto out; + + if (c->old_leb_cnt != c->leb_cnt) { + struct ubifs_sb_node *sup; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) { + err = PTR_ERR(sup); + goto out; + } + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + kfree(sup); + if (err) + goto out; + } + + if (c->need_recovery) { + ubifs_msg("completing deferred recovery"); + err = ubifs_write_rcvrd_mst_node(c); + if (err) + goto out; + err = ubifs_recover_size(c); + if (err) + goto out; + err = ubifs_clean_lebs(c, c->sbuf); + if (err) + goto out; + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out; + } else { + /* A readonly mount is not allowed to have orphans */ + ubifs_assert(c->tot_orphans == 0); + err = ubifs_clear_orphans(c); + if (err) + goto out; + } + + if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out; + } + + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) { + err = -ENOMEM; + goto out; + } + + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + if (!c->write_reserve_buf) + goto out; + + err = ubifs_lpt_init(c, 0, 1); + if (err) + goto out; + + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out; + } + wake_up_process(c->bgt); + + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) { + err = -ENOMEM; + goto out; + } + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out; + } + + if (c->need_recovery) + err = ubifs_rcvry_gc_commit(c); + else + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + goto out; + + dbg_gen("re-mounted read-write"); + c->remounting_rw = 0; + + if (c->need_recovery) { + c->need_recovery = 0; + ubifs_msg("deferred recovery completed"); + } else { + /* + * Do not run the 
debugging space check if the were doing + * recovery, because when we saved the information we had the + * file-system in a state where the TNC and lprops has been + * modified in memory, but all the I/O operations (including a + * commit) were deferred. So the file-system was in + * "non-committed" state. Now the file-system is in committed + * state, and of course the amount of free space will change + * because, for example, the old index size was imprecise. + */ + err = dbg_check_space_info(c); + } + + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out; + } + + mutex_unlock(&c->umount_mutex); + return err; + +out: + c->ro_mount = 1; + vfree(c->orph_buf); + c->orph_buf = NULL; + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + free_wbufs(c); + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + c->remounting_rw = 0; + mutex_unlock(&c->umount_mutex); + return err; +} + +/** + * ubifs_remount_ro - re-mount in read-only mode. + * @c: UBIFS file-system description object + * + * We assume VFS has stopped writing. Possibly the background thread could be + * running a commit, however kthread_stop will wait in that case. + */ +static void ubifs_remount_ro(struct ubifs_info *c) +{ + int i, err; + + ubifs_assert(!c->need_recovery); + ubifs_assert(!c->ro_mount); + + mutex_lock(&c->umount_mutex); + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + dbg_save_space_info(c); + + for (i = 0; i < c->jhead_cnt; i++) + ubifs_wbuf_sync(&c->jheads[i].wbuf); + + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + ubifs_ro_mode(c, err); + + vfree(c->orph_buf); + c->orph_buf = NULL; + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + c->ro_mount = 1; + err = dbg_check_space_info(c); + if (err) + ubifs_ro_mode(c, err); + mutex_unlock(&c->umount_mutex); +} + +static void ubifs_put_super(struct super_block *sb) +{ + int i; + struct ubifs_info *c = sb->s_fs_info; + + ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + + /* + * The following asserts are only valid if there has not been a failure + * of the media. For example, there will be dirty inodes if we failed + * to write them back because of I/O errors. + */ + if (!c->ro_error) { + ubifs_assert(c->bi.idx_growth == 0); + ubifs_assert(c->bi.dd_growth == 0); + ubifs_assert(c->bi.data_growth == 0); + } + + /* + * The 'c->umount_lock' prevents races between UBIFS memory shrinker + * and file system un-mount. Namely, it prevents the shrinker from + * picking this superblock for shrinking - it will be just skipped if + * the mutex is locked. + */ + mutex_lock(&c->umount_mutex); + if (!c->ro_mount) { + /* + * First of all kill the background thread to make sure it does + * not interfere with un-mounting and freeing resources. + */ + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + /* + * On fatal errors c->ro_error is set to 1, in which case we do + * not write the master node. + */ + if (!c->ro_error) { + int err; + + /* Synchronize write-buffers */ + for (i = 0; i < c->jhead_cnt; i++) + ubifs_wbuf_sync(&c->jheads[i].wbuf); + + /* + * We are being cleanly unmounted which means the + * orphans were killed - indicate this in the master + * node. 
Also save the reserved GC LEB number. + */ + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + /* + * Recovery will attempt to fix the master area + * next mount, so we just print a message and + * continue to unmount normally. + */ + ubifs_err("failed to write master node, " + "error %d", err); + } else { + for (i = 0; i < c->jhead_cnt; i++) + /* Make sure write-buffer timers are canceled */ + hrtimer_cancel(&c->jheads[i].wbuf.timer); + } + } + + ubifs_umount(c); + bdi_destroy(&c->bdi); + ubi_close_volume(c->ubi); + mutex_unlock(&c->umount_mutex); +} + +static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) +{ + int err; + struct ubifs_info *c = sb->s_fs_info; + + dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); + + err = ubifs_parse_options(c, data, 1); + if (err) { + ubifs_err("invalid or unknown remount parameter"); + return err; + } + + if (c->ro_mount && !(*flags & MS_RDONLY)) { + if (c->ro_error) { + ubifs_msg("cannot re-mount R/W due to prior errors"); + return -EROFS; + } + if (c->ro_media) { + ubifs_msg("cannot re-mount R/W - UBI volume is R/O"); + return -EROFS; + } + err = ubifs_remount_rw(c); + if (err) + return err; + } else if (!c->ro_mount && (*flags & MS_RDONLY)) { + if (c->ro_error) { + ubifs_msg("cannot re-mount R/O due to prior errors"); + return -EROFS; + } + ubifs_remount_ro(c); + } + + if (c->bulk_read == 1) + bu_init(c); + else { + dbg_gen("disable bulk-read"); + kfree(c->bu.buf); + c->bu.buf = NULL; + } + + ubifs_assert(c->lst.taken_empty_lebs > 0); + return 0; +} + +const struct super_operations ubifs_super_operations = { + .alloc_inode = ubifs_alloc_inode, + .destroy_inode = ubifs_destroy_inode, + .put_super = ubifs_put_super, + .write_inode = ubifs_write_inode, + .evict_inode = ubifs_evict_inode, + .statfs = ubifs_statfs, + .dirty_inode = ubifs_dirty_inode, + .remount_fs = ubifs_remount_fs, + .show_options = ubifs_show_options, + .sync_fs = ubifs_sync_fs, +}; + +/** + * open_ubi - parse UBI device name string and open the UBI device. + * @name: UBI volume name + * @mode: UBI volume open mode + * + * The primary method of mounting UBIFS is by specifying the UBI volume + * character device node path. However, UBIFS may also be mounted withoug any + * character device node using one of the following methods: + * + * o ubiX_Y - mount UBI device number X, volume Y; + * o ubiY - mount UBI device number 0, volume Y; + * o ubiX:NAME - mount UBI device X, volume with name NAME; + * o ubi:NAME - mount UBI device 0, volume with name NAME. + * + * Alternative '!' separator may be used instead of ':' (because some shells + * like busybox may interpret ':' as an NFS host name separator). This function + * returns UBI volume description object in case of success and a negative + * error code in case of failure. 
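A minimal user-space sketch that classifies the name forms listed above (device node paths aside); it mirrors the documented syntax only and is not the kernel parser that follows:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void classify(const char *name)
{
	char *end;
	long dev, vol;

	if (strncmp(name, "ubi", 3) != 0) {
		printf("%-12s -> device node path (or invalid)\n", name);
		return;
	}
	/* ubi:NAME or ubi!NAME: device 0, volume by name */
	if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') {
		printf("%-12s -> device 0, volume \"%s\"\n", name, name + 4);
		return;
	}
	if (!isdigit((unsigned char)name[3])) {
		printf("%-12s -> invalid\n", name);
		return;
	}
	dev = strtol(name + 3, &end, 10);
	/* ubiY: device 0, volume Y */
	if (*end == '\0') {
		printf("%-12s -> device 0, volume %ld\n", name, dev);
		return;
	}
	/* ubiX_Y: device X, volume Y */
	if (*end == '_' && isdigit((unsigned char)end[1])) {
		vol = strtol(end + 1, &end, 10);
		if (*end == '\0') {
			printf("%-12s -> device %ld, volume %ld\n", name, dev, vol);
			return;
		}
	}
	/* ubiX:NAME or ubiX!NAME: device X, volume by name */
	if ((*end == ':' || *end == '!') && end[1] != '\0') {
		printf("%-12s -> device %ld, volume \"%s\"\n", name, dev, end + 1);
		return;
	}
	printf("%-12s -> invalid\n", name);
}

int main(void)
{
	classify("ubi0_1");
	classify("ubi1");
	classify("ubi:rootfs");
	classify("ubi2!data");
	return 0;
}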
+ */ +static struct ubi_volume_desc *open_ubi(const char *name, int mode) +{ + struct ubi_volume_desc *ubi; + int dev, vol; + char *endptr; + + /* First, try to open using the device node path method */ + ubi = ubi_open_volume_path(name, mode); + if (!IS_ERR(ubi)) + return ubi; + + /* Try the "nodev" method */ + if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') + return ERR_PTR(-EINVAL); + + /* ubi:NAME method */ + if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') + return ubi_open_volume_nm(0, name + 4, mode); + + if (!isdigit(name[3])) + return ERR_PTR(-EINVAL); + + dev = simple_strtoul(name + 3, &endptr, 0); + + /* ubiY method */ + if (*endptr == '\0') + return ubi_open_volume(0, dev, mode); + + /* ubiX_Y method */ + if (*endptr == '_' && isdigit(endptr[1])) { + vol = simple_strtoul(endptr + 1, &endptr, 0); + if (*endptr != '\0') + return ERR_PTR(-EINVAL); + return ubi_open_volume(dev, vol, mode); + } + + /* ubiX:NAME method */ + if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') + return ubi_open_volume_nm(dev, ++endptr, mode); + + return ERR_PTR(-EINVAL); +} + +static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) +{ + struct ubifs_info *c; + + c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + if (c) { + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + } + return c; +} + +static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ubifs_info *c = sb->s_fs_info; + struct inode *root; + int err; + + c->vfs_sb = sb; + /* Re-open the UBI device in read-write mode */ + c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); + if (IS_ERR(c->ubi)) { + err = PTR_ERR(c->ubi); + goto out; + } + + /* + * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For + * UBIFS, I/O is not deferred, it is done immediately in readpage, + * which means the user would have to wait not just for their own I/O + * but the read-ahead I/O as well i.e. completely pointless. + * + * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 
+ */ + c->bdi.name = "ubifs", + c->bdi.capabilities = BDI_CAP_MAP_COPY; + err = bdi_init(&c->bdi); + if (err) + goto out_close; + err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d", + c->vi.ubi_num, c->vi.vol_id); + if (err) + goto out_bdi; + + err = ubifs_parse_options(c, data, 0); + if (err) + goto out_bdi; + + sb->s_bdi = &c->bdi; + sb->s_fs_info = c; + sb->s_magic = UBIFS_SUPER_MAGIC; + sb->s_blocksize = UBIFS_BLOCK_SIZE; + sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; + sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); + if (c->max_inode_sz > MAX_LFS_FILESIZE) + sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; + sb->s_op = &ubifs_super_operations; + + mutex_lock(&c->umount_mutex); + err = mount_ubifs(c); + if (err) { + ubifs_assert(err < 0); + goto out_unlock; + } + + /* Read the root inode */ + root = ubifs_iget(sb, UBIFS_ROOT_INO); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out_umount; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto out_iput; + + mutex_unlock(&c->umount_mutex); + return 0; + +out_iput: + iput(root); +out_umount: + ubifs_umount(c); +out_unlock: + mutex_unlock(&c->umount_mutex); +out_bdi: + bdi_destroy(&c->bdi); +out_close: + ubi_close_volume(c->ubi); +out: + return err; +} + +static int sb_test(struct super_block *sb, void *data) +{ + struct ubifs_info *c1 = data; + struct ubifs_info *c = sb->s_fs_info; + + return c->vi.cdev == c1->vi.cdev; +} + +static int sb_set(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, + const char *name, void *data) +{ + struct ubi_volume_desc *ubi; + struct ubifs_info *c; + struct super_block *sb; + int err; + + dbg_gen("name %s, flags %#x", name, flags); + + /* + * Get UBI device number and volume ID. Mount it read-only so far + * because this might be a new mount point, and UBI allows only one + * read-write user at a time. + */ + ubi = open_ubi(name, UBI_READONLY); + if (IS_ERR(ubi)) { + dbg_err("cannot open \"%s\", error %d", + name, (int)PTR_ERR(ubi)); + return ERR_CAST(ubi); + } + + c = alloc_ubifs_info(ubi); + if (!c) { + err = -ENOMEM; + goto out_close; + } + + dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + + sb = sget(fs_type, sb_test, sb_set, c); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + kfree(c); + goto out_close; + } + + if (sb->s_root) { + struct ubifs_info *c1 = sb->s_fs_info; + kfree(c); + /* A new mount point for already mounted UBIFS */ + dbg_gen("this ubi volume is already mounted"); + if (!!(flags & MS_RDONLY) != c1->ro_mount) { + err = -EBUSY; + goto out_deact; + } + } else { + sb->s_flags = flags; + err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) + goto out_deact; + /* We do not support atime */ + sb->s_flags |= MS_ACTIVE | MS_NOATIME; + } + + /* 'fill_super()' opens ubi again so we must close it here */ + ubi_close_volume(ubi); + + return dget(sb->s_root); + +out_deact: + deactivate_locked_super(sb); +out_close: + ubi_close_volume(ubi); + return ERR_PTR(err); +} + +static void kill_ubifs_super(struct super_block *s) +{ + struct ubifs_info *c = s->s_fs_info; + kill_anon_super(s); + kfree(c); +} + +static struct file_system_type ubifs_fs_type = { + .name = "ubifs", + .owner = THIS_MODULE, + .mount = ubifs_mount, + .kill_sb = kill_ubifs_super, +}; + +/* + * Inode slab cache constructor. 
+ */ +static void inode_slab_ctor(void *obj) +{ + struct ubifs_inode *ui = obj; + inode_init_once(&ui->vfs_inode); +} + +static int __init ubifs_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); + + /* Make sure node sizes are 8-byte aligned */ + BUILD_BUG_ON(UBIFS_CH_SZ & 7); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7); + BUILD_BUG_ON(MIN_WRITE_SZ & 7); + + /* Check min. node size */ + BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ); + + /* Defined node sizes */ + BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); + + /* + * We use 2 bit wide bit-fields to store compression type, which should + * be amended if more compressors are added. The bit-fields are: + * @compr_type in 'struct ubifs_inode', @default_compr in + * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'. + */ + BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4); + + /* + * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to + * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. 
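The BUILD_BUG_ON() checks in ubifs_init() above freeze the on-media node sizes and their 8-byte alignment at compile time. A standalone sketch of the same idea using C11 _Static_assert with a made-up header layout (the real sizes are the ones checked above):

#include <stdint.h>

/* Hypothetical common header, for illustration only. */
struct demo_ch {
	uint32_t magic;
	uint32_t crc;
	uint64_t sqnum;
	uint32_t len;
	uint8_t  node_type;
	uint8_t  group_type;
	uint8_t  padding[2];
};

/* Same invariants as the BUILD_BUG_ON() calls: fixed size, 8-byte aligned. */
_Static_assert(sizeof(struct demo_ch) == 24, "common header must be 24 bytes");
_Static_assert((sizeof(struct demo_ch) & 7) == 0, "node sizes must be 8-byte multiples");

int main(void)
{
	return 0;
}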
+ */ + if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { + ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" + " at least 4096 bytes", + (unsigned int)PAGE_CACHE_SIZE); + return -EINVAL; + } + + err = register_filesystem(&ubifs_fs_type); + if (err) { + ubifs_err("cannot register file system, error %d", err); + return err; + } + + err = -ENOMEM; + ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", + sizeof(struct ubifs_inode), 0, + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, + &inode_slab_ctor); + if (!ubifs_inode_slab) + goto out_reg; + + register_shrinker(&ubifs_shrinker_info); + + err = ubifs_compressors_init(); + if (err) + goto out_shrinker; + + err = dbg_debugfs_init(); + if (err) + goto out_compr; + + return 0; + +out_compr: + ubifs_compressors_exit(); +out_shrinker: + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); +out_reg: + unregister_filesystem(&ubifs_fs_type); + return err; +} +/* late_initcall to let compressors initialize first */ +late_initcall(ubifs_init); + +static void __exit ubifs_exit(void) +{ + ubifs_assert(list_empty(&ubifs_infos)); + ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + + dbg_debugfs_exit(); + ubifs_compressors_exit(); + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); + unregister_filesystem(&ubifs_fs_type); +} +module_exit(ubifs_exit); + +MODULE_LICENSE("GPL"); +MODULE_VERSION(__stringify(UBIFS_VERSION)); +MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter"); +MODULE_DESCRIPTION("UBIFS - UBI File System"); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c new file mode 100644 index 00000000..91b4213d --- /dev/null +++ b/fs/ubifs/tnc.c @@ -0,0 +1,3349 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements TNC (Tree Node Cache) which caches indexing nodes of + * the UBIFS B-tree. + * + * At the moment the locking rules of the TNC tree are quite simple and + * straightforward. We just have a mutex and lock it when we traverse the + * tree. If a znode is not in memory, we read it from flash while still having + * the mutex locked. + */ + +#include +#include +#include "ubifs.h" + +/* + * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions. + * @NAME_LESS: name corresponding to the first argument is less than second + * @NAME_MATCHES: names match + * @NAME_GREATER: name corresponding to the second argument is greater than + * first + * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media + * + * These constants were introduce to improve readability. + */ +enum { + NAME_LESS = 0, + NAME_MATCHES = 1, + NAME_GREATER = 2, + NOT_ON_MEDIA = 3, +}; + +/** + * insert_old_idx - record an index node obsoleted since the last commit start. 
+ * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + * + * For recovery, there must always be a complete intact version of the index on + * flash at all times. That is called the "old index". It is the index as at the + * time of the last successful commit. Many of the index nodes in the old index + * may be dirty, but they must not be erased until the next successful commit + * (at which point that index becomes the old index). + * + * That means that the garbage collection and the in-the-gaps method of + * committing must be able to determine if an index node is in the old index. + * Most of the old index nodes can be found by looking up the TNC using the + * 'lookup_znode()' function. However, some of the old index nodes may have + * been deleted from the current index or may have been changed so much that + * they cannot be easily found. In those cases, an entry is added to an RB-tree. + * That is what this function does. The RB-tree is ordered by LEB number and + * offset because they uniquely identify the old index node. + */ +static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *old_idx, *o; + struct rb_node **p, *parent = NULL; + + old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); + if (unlikely(!old_idx)) + return -ENOMEM; + old_idx->lnum = lnum; + old_idx->offs = offs; + + p = &c->old_idx.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = &(*p)->rb_left; + else if (lnum > o->lnum) + p = &(*p)->rb_right; + else if (offs < o->offs) + p = &(*p)->rb_left; + else if (offs > o->offs) + p = &(*p)->rb_right; + else { + ubifs_err("old idx added twice!"); + kfree(old_idx); + return 0; + } + } + rb_link_node(&old_idx->rb, parent, p); + rb_insert_color(&old_idx->rb, &c->old_idx); + return 0; +} + +/** + * insert_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode) +{ + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) + return insert_old_idx(c, zbr->lnum, zbr->offs); + } else + if (c->zroot.len) + return insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + return 0; +} + +/** + * ins_clr_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +static int ins_clr_old_idx_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int err; + + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (err) + return err; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + } + } else + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs); + if (err) + return err; + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + } + return 0; +} + +/** + * destroy_old_idx - destroy the old_idx RB-tree. 
+ * @c: UBIFS file-system description object + * + * During start commit, the old_idx RB-tree is used to avoid overwriting index + * nodes that were in the index last commit but have since been deleted. This + * is necessary for recovery i.e. the old index must be kept intact until the + * new index is successfully written. The old-idx RB-tree is used for the + * in-the-gaps method of writing index nodes and is destroyed every commit. + */ +void destroy_old_idx(struct ubifs_info *c) +{ + struct rb_node *this = c->old_idx.rb_node; + struct ubifs_old_idx *old_idx; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + old_idx = rb_entry(this, struct ubifs_old_idx, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &old_idx->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(old_idx); + } + c->old_idx = RB_ROOT; +} + +/** + * copy_znode - copy a dirty znode. + * @c: UBIFS file-system description object + * @znode: znode to copy + * + * A dirty znode being committed may not be changed, so it is copied. + */ +static struct ubifs_znode *copy_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + zn = kmalloc(c->max_znode_sz, GFP_NOFS); + if (unlikely(!zn)) + return ERR_PTR(-ENOMEM); + + memcpy(zn, znode, c->max_znode_sz); + zn->cnext = NULL; + __set_bit(DIRTY_ZNODE, &zn->flags); + __clear_bit(COW_ZNODE, &zn->flags); + + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + __set_bit(OBSOLETE_ZNODE, &znode->flags); + + if (znode->level != 0) { + int i; + const int n = zn->child_cnt; + + /* The children now have new parent */ + for (i = 0; i < n; i++) { + struct ubifs_zbranch *zbr = &zn->zbranch[i]; + + if (zbr->znode) + zbr->znode->parent = zn; + } + } + + atomic_long_inc(&c->dirty_zn_cnt); + return zn; +} + +/** + * add_idx_dirt - add dirt due to a dirty znode. + * @c: UBIFS file-system description object + * @lnum: LEB number of index node + * @dirt: size of index node + * + * This function updates lprops dirty space and the new size of the index. + */ +static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt) +{ + c->calc_idx_sz -= ALIGN(dirt, 8); + return ubifs_add_dirt(c, lnum, dirt); +} + +/** + * dirty_cow_znode - ensure a znode is not being committed. + * @c: UBIFS file-system description object + * @zbr: branch of znode to check + * + * Returns dirtied znode on success or negative error code on failure. + */ +static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr) +{ + struct ubifs_znode *znode = zbr->znode; + struct ubifs_znode *zn; + int err; + + if (!test_bit(COW_ZNODE, &znode->flags)) { + /* znode is not being committed */ + if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) { + atomic_long_inc(&c->dirty_zn_cnt); + atomic_long_dec(&c->clean_zn_cnt); + atomic_long_dec(&ubifs_clean_zn_cnt); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + if (unlikely(err)) + return ERR_PTR(err); + } + return znode; + } + + zn = copy_znode(c, znode); + if (IS_ERR(zn)) + return zn; + + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (unlikely(err)) + return ERR_PTR(err); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + } else + err = 0; + + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + + if (unlikely(err)) + return ERR_PTR(err); + return zn; +} + +/** + * lnc_add - add a leaf node to the leaf node cache. 
+ * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * Leaf nodes are non-index nodes directory entry nodes or data nodes. The + * purpose of the leaf node cache is to save re-reading the same leaf node over + * and over again. Most things are cached by VFS, however the file system must + * cache directory entries for readdir and for resolving hash collisions. The + * present implementation of the leaf node cache is extremely simple, and + * allows for error returns that are not used but that may be needed if a more + * complex implementation is created. + * + * Note, this function does not add the @node object to LNC directly, but + * allocates a copy of the object and adds the copy to LNC. The reason for this + * is that @node has been allocated outside of the TNC subsystem and will be + * used with @c->tnc_mutex unlock upon return from the TNC subsystem. But LNC + * may be changed at any time, e.g. freed by the shrinker. + */ +static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const void *node) +{ + int err; + void *lnc_node; + const struct ubifs_dent_node *dent = node; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + ubifs_assert(is_hash_key(c, &zbr->key)); + + err = ubifs_validate_entry(c, dent); + if (err) { + dbg_dump_stack(); + dbg_dump_node(c, dent); + return err; + } + + lnc_node = kmalloc(zbr->len, GFP_NOFS); + if (!lnc_node) + /* We don't have to have the cache, so no error */ + return 0; + + memcpy(lnc_node, node, zbr->len); + zbr->leaf = lnc_node; + return 0; +} + + /** + * lnc_add_directly - add a leaf node to the leaf-node-cache. + * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * This function is similar to 'lnc_add()', but it does not create a copy of + * @node but inserts @node to TNC directly. + */ +static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + + err = ubifs_validate_entry(c, node); + if (err) { + dbg_dump_stack(); + dbg_dump_node(c, node); + return err; + } + + zbr->leaf = node; + return 0; +} + +/** + * lnc_free - remove a leaf node from the leaf node cache. + * @zbr: zbranch of leaf node + * @node: leaf node + */ +static void lnc_free(struct ubifs_zbranch *zbr) +{ + if (!zbr->leaf) + return; + kfree(zbr->leaf); + zbr->leaf = NULL; +} + +/** + * tnc_read_node_nm - read a "hashed" leaf node. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a "hashed" node defined by @zbr from the leaf node cache + * (in it is there) or from the hash media, in which case the node is also + * added to LNC. Returns zero in case of success or a negative negative error + * code in case of failure. + */ +static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(is_hash_key(c, &zbr->key)); + + if (zbr->leaf) { + /* Read from the leaf node cache */ + ubifs_assert(zbr->len != 0); + memcpy(node, zbr->leaf, zbr->len); + return 0; + } + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) + return err; + + /* Add the node to the leaf node cache */ + err = lnc_add(c, zbr, node); + return err; +} + +/** + * try_read_node - read a node if it is a node. 
+ * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: LEB number of node to read + * @offs: offset of node to read + * + * This function tries to read a node of known type and length, checks it and + * stores it in @buf. This function returns %1 if a node is present and %0 if + * a node is not present. A negative error code is returned for I/O errors. + * This function performs that same function as ubifs_read_node except that + * it does not require that there is actually a node present and instead + * the return code indicates if a node was read. + * + * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc + * is true (it is controlled by corresponding mount option). However, if + * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to + * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is + * because during mounting or re-mounting from R/O mode to R/W mode we may read + * journal nodes (when replying the journal or doing the recovery) and the + * journal nodes may potentially be corrupted, so checking is required. + */ +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs) +{ + int err, node_len; + struct ubifs_ch *ch = buf; + uint32_t crc, node_crc; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err) { + ubifs_err("cannot read node type %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + return 0; + + if (ch->node_type != type) + return 0; + + node_len = le32_to_cpu(ch->len); + if (node_len != len) + return 0; + + if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting && + !c->remounting_rw) + return 1; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) + return 0; + + return 1; +} + +/** + * fallible_read_node - try to read a leaf node. + * @c: UBIFS file-system description object + * @key: key of node to read + * @zbr: position of node + * @node: node returned + * + * This function tries to read a node and returns %1 if the node is read, %0 + * if the node is not present, and a negative error code in the case of error. + */ +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node) +{ + int ret; + + dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); + + ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, + zbr->offs); + if (ret == 1) { + union ubifs_key node_key; + struct ubifs_dent_node *dent = node; + + /* All nodes have key in the same place */ + key_read(c, &dent->key, &node_key); + if (keys_cmp(c, key, &node_key) != 0) + ret = 0; + } + if (ret == 0 && c->replaying) + dbg_mnt("dangling branch LEB %d:%d len %d, key %s", + zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); + return ret; +} + +/** + * matches_name - determine if a direntry or xattr entry matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by + * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case + * of failure, a negative error code is returned. 
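matches_name() below orders a stored directory-entry name against the name being looked up: the common prefix is compared first and, when it is equal, the shorter name sorts first. A standalone sketch of that ordering rule (the constant values follow the enum near the top of this file):

#include <stdio.h>
#include <string.h>

enum { NAME_LESS, NAME_MATCHES, NAME_GREATER };

/* Compare the common prefix, then break ties by length ("abc" < "abcd"). */
static int order(const char *stored, size_t stored_len,
		 const char *wanted, size_t wanted_len)
{
	size_t min = stored_len < wanted_len ? stored_len : wanted_len;
	int cmp = memcmp(stored, wanted, min);

	if (cmp < 0)
		return NAME_LESS;
	if (cmp > 0)
		return NAME_GREATER;
	if (stored_len == wanted_len)
		return NAME_MATCHES;
	return stored_len < wanted_len ? NAME_LESS : NAME_GREATER;
}

int main(void)
{
	printf("%d\n", order("abc", 3, "abd", 3));  /* 0: NAME_LESS */
	printf("%d\n", order("abc", 3, "abc", 3));  /* 1: NAME_MATCHES */
	printf("%d\n", order("abcd", 4, "abc", 3)); /* 2: NAME_GREATER */
	return 0;
}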
+ */ +static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, dent); + if (err) + goto out_free; + + /* Add the node to the leaf node cache */ + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * get_znode - get a TNC znode that may not be loaded yet. + * @c: UBIFS file-system description object + * @znode: parent znode + * @n: znode branch slot number + * + * This function returns the znode or a negative error code. + */ +static struct ubifs_znode *get_znode(struct ubifs_info *c, + struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + + zbr = &znode->zbranch[n]; + if (zbr->znode) + znode = zbr->znode; + else + znode = ubifs_load_znode(c, zbr, znode, n); + return znode; +} + +/** + * tnc_next - find next TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is passed and returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the next TNC entry is found, %-ENOENT if there is + * no next entry, or a negative error code otherwise. + */ +static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + nn += 1; + if (nn < znode->child_cnt) { + *n = nn; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip + 1; + znode = zp; + if (nn < znode->child_cnt) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = 0; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * tnc_prev - find previous TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the previous TNC entry is found, %-ENOENT if + * there is no next entry, or a negative error code otherwise. + */ +static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + if (nn > 0) { + *n = nn - 1; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip - 1; + znode = zp; + if (nn >= 0) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + nn = znode->child_cnt - 1; + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = znode->child_cnt - 1; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * resolve_collision - resolve a collision. 
+ * @c: UBIFS file-system description object + * @key: key of a directory or extended attribute entry + * @zn: znode is returned here + * @n: zbranch number is passed and returned here + * @nm: name of the entry + * + * This function is called for "hashed" keys to make sure that the found key + * really corresponds to the looked up node (directory or extended attribute + * entry). It returns %1 and sets @zn and @n if the collision is resolved. + * %0 is returned if @nm is not found and @zn and @n are set to the previous + * entry, i.e. to the entry after which @nm could follow if it were in TNC. + * This means that @n may be set to %-1 if the leftmost key in @zn is the + * previous one. A negative error code is returned on failures. + */ +static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm) +{ + int err; + + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (unlikely(err < 0)) + return err; + if (err == NAME_MATCHES) + return 1; + + if (err == NAME_GREATER) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + return 0; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* + * We have found the branch after which we would + * like to insert, but inserting in this znode + * may still be wrong. Consider the following 3 + * znodes, in the case where we are resolving a + * collision with Key2. + * + * znode zp + * ---------------------- + * level 1 | Key0 | Key1 | + * ----------------------- + * | | + * znode za | | znode zb + * ------------ ------------ + * level 0 | Key0 | | Key2 | + * ------------ ------------ + * + * The lookup finds Key2 in znode zb. Lets say + * there is no match and the name is greater so + * we look left. When we find Key0, we end up + * here. If we return now, we will insert into + * znode za at slot n = 1. But that is invalid + * according to the parent's keys. Key2 must + * be inserted into znode zb. + * + * Note, this problem is not relevant for the + * case when we go right, because + * 'tnc_insert()' would correct the parent key. + */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + return 0; + } + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_LESS) + return 0; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_GREATER); + } + } else { + int nn = *n; + struct ubifs_znode *znode = *zn; + + /* Look right */ + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + err = matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + return 0; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_LESS); + } + } +} + +/** + * fallible_matches_name - determine if a dent matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This is a "fallible" version of 'matches_name()' function which does not + * panic if the direntry/xentry referred by @zbr does not exist on the media. + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. 
Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr + * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA + * if xentry/direntry referred by @zbr does not exist on the media. A negative + * error code is returned in case of failure. + */ +static int fallible_matches_name(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = fallible_read_node(c, &zbr->key, zbr, dent); + if (err < 0) + goto out_free; + if (err == 0) { + /* The node was not present */ + err = NOT_ON_MEDIA; + goto out_free; + } + ubifs_assert(err == 1); + + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * fallible_resolve_collision - resolve a collision even if nodes are missing. + * @c: UBIFS file-system description object + * @key: key + * @zn: znode is returned here + * @n: branch number is passed and returned here + * @nm: name of directory entry + * @adding: indicates caller is adding a key to the TNC + * + * This is a "fallible" version of the 'resolve_collision()' function which + * does not panic if one of the nodes referred to by TNC does not exist on the + * media. This may happen when replaying the journal if a deleted node was + * Garbage-collected and the commit was not done. A branch that refers to a node + * that is not present is called a dangling branch. The following are the return + * codes for this function: + * o if @nm was found, %1 is returned and @zn and @n are set to the found + * branch; + * o if we are @adding and @nm was not found, %0 is returned; + * o if we are not @adding and @nm was not found, but a dangling branch was + * found, then %1 is returned and @zn and @n are set to the dangling branch; + * o a negative error code is returned in case of failure. + */ +static int fallible_resolve_collision(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm, int adding) +{ + struct ubifs_znode *o_znode = NULL, *znode = *zn; + int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; + + cmp = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (unlikely(cmp < 0)) + return cmp; + if (cmp == NAME_MATCHES) + return 1; + if (cmp == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + /* + * We are unlucky and hit a dangling branch straight away. + * Now we do not really know where to go to find the needed + * branch - to the left or to the right. Well, let's try left. 
+ */ + unsure = 1; + } else if (!adding) + unsure = 1; /* Remove a dangling branch wherever it is */ + + if (cmp == NAME_GREATER || unsure) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + break; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* See comments in 'resolve_collision()' */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + break; + } + err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = *zn; + o_n = *n; + continue; + } + if (!adding) + continue; + if (err == NAME_LESS) + break; + else + unsure = 0; + } + } + + if (cmp == NAME_LESS || unsure) { + /* Look right */ + *zn = znode; + *n = nn; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + err = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + break; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + } + } + } + + /* Never match a dangling branch when adding */ + if (adding || !o_znode) + return 0; + + dbg_mnt("dangling match LEB %d:%d len %d %s", + o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, + o_znode->zbranch[o_n].len, DBGKEY(key)); + *zn = o_znode; + *n = o_n; + return 1; +} + +/** + * matches_position - determine if a zbranch matches a given position. + * @zbr: zbranch of dent + * @lnum: LEB number of dent to match + * @offs: offset of dent to match + * + * This function returns %1 if @lnum:@offs matches, and %0 otherwise. + */ +static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs) +{ + if (zbr->lnum == lnum && zbr->offs == offs) + return 1; + else + return 0; +} + +/** + * resolve_collision_directly - resolve a collision directly. + * @c: UBIFS file-system description object + * @key: key of directory entry + * @zn: znode is passed and returned here + * @n: zbranch number is passed and returned here + * @lnum: LEB number of dent node to match + * @offs: offset of dent node to match + * + * This function is used for "hashed" keys to make sure the found directory or + * extended attribute entry node is what was looked for. It is used when the + * flash address of the right node is known (@lnum:@offs) which makes it much + * easier to resolve collisions (no need to read entries and match full + * names). This function returns %1 and sets @zn and @n if the collision is + * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the + * previous directory entry. Otherwise a negative error code is returned. 
+ */ +static int resolve_collision_directly(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int nn, err; + + znode = *zn; + nn = *n; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + if (matches_position(&znode->zbranch[nn], lnum, offs)) { + *zn = znode; + *n = nn; + return 1; + } + } + + /* Look right */ + znode = *zn; + nn = *n; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + *zn = znode; + *n = nn; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + } +} + +/** + * dirty_cow_bottom_up - dirty a znode and its ancestors. + * @c: UBIFS file-system description object + * @znode: znode to dirty + * + * If we do not have a unique key that resides in a znode, then we cannot + * dirty that znode from the top down (i.e. by using lookup_level0_dirty) + * This function records the path back to the last dirty ancestor, and then + * dirties the znodes on that path. + */ +static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zp; + int *path = c->bottom_up_buf, p = 0; + + ubifs_assert(c->zroot.znode); + ubifs_assert(znode); + if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) { + kfree(c->bottom_up_buf); + c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int), + GFP_NOFS); + if (!c->bottom_up_buf) + return ERR_PTR(-ENOMEM); + path = c->bottom_up_buf; + } + if (c->zroot.znode->level) { + /* Go up until parent is dirty */ + while (1) { + int n; + + zp = znode->parent; + if (!zp) + break; + n = znode->iip; + ubifs_assert(p < c->zroot.znode->level); + path[p++] = n; + if (!zp->cnext && ubifs_zn_dirty(znode)) + break; + znode = zp; + } + } + + /* Come back down, dirtying as we go */ + while (1) { + struct ubifs_zbranch *zbr; + + zp = znode->parent; + if (zp) { + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < zp->child_cnt); + zbr = &zp->zbranch[path[--p]]; + znode = dirty_cow_znode(c, zbr); + } else { + ubifs_assert(znode == c->zroot.znode); + znode = dirty_cow_znode(c, &c->zroot); + } + if (IS_ERR(znode) || !p) + break; + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < znode->child_cnt); + znode = znode->zbranch[path[p - 1]].znode; + } + + return znode; +} + +/** + * ubifs_lookup_level0 - search for zero-level znode. + * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain + * @key, then %0 is returned and slot number of the closest branch is stored + * in @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %0 is stored in @n. 
+ * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search key %s", DBGKEY(key)); + ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = zbr->znode; + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * Here is a tricky place. We have not found the key and this is a + * "hashed" key, which may collide. The rest of the code deals with + * situations like this: + * + * | 3 | 5 | + * / \ + * | 3 | 5 | | 6 | 7 | (x) + * + * Or more a complex example: + * + * | 1 | 5 | + * / \ + * | 1 | 3 | | 5 | 8 | + * \ / + * | 5 | 5 | | 6 | 7 | (x) + * + * In the examples, if we are looking for key "5", we may reach nodes + * marked with "(x)". In this case what we have do is to look at the + * left and see if there is "5" key there. If there is, we have to + * return it. + * + * Note, this whole situation is possible because we allow to have + * elements which are equivalent to the next key in the parent in the + * children of current znode. For example, this happens if we split a + * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something + * like this: + * | 3 | 5 | + * / \ + * | 3 | 5 | | 5 | 6 | 7 | + * ^ + * And this becomes what is at the first "picture" after key "5" marked + * with "^" is removed. What could be done is we could prohibit + * splitting in the middle of the colliding sequence. Also, when + * removing the leftmost key, we would have to correct the key of the + * parent node, which would introduce additional complications. Namely, + * if we changed the leftmost key of the parent znode, the garbage + * collector would be unable to find it (GC is doing this when GC'ing + * indexing LEBs). Although we already have an additional RB-tree where + * we save such changed znodes (see 'ins_clr_old_idx_znode()') until + * after the commit. But anyway, this does not look easy to implement + * so we did not try this. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * lookup_level0_dirty - search for zero-level znode dirtying. 
+ * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain @key + * then %0 is returned and slot number of the closed branch is stored in + * @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %-1 is stored in @n. + * + * Additionally all znodes in the path from the root to the located zero-level + * znode are marked as dirty. + * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search and dirty key %s", DBGKEY(key)); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode = dirty_cow_znode(c, &c->zroot); + if (IS_ERR(znode)) + return PTR_ERR(znode); + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * See huge comment at 'lookup_level0_dirty()' what is the rest of the + * code. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * maybe_leb_gced - determine if a LEB may have been garbage collected. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number + * + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. 
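maybe_leb_gced() below is the read side of a lockless handshake: the garbage collector publishes the LEB it moved and only then bumps a sequence counter, so a reader that sees an unchanged counter knows the LEB it read from was not collected in between. A user-space model of both sides using C11 atomics (illustrative only; the kernel code relies on smp_rmb()/smp_wmb() pairs instead):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int gced_lnum;	/* last LEB that was garbage collected */
static _Atomic int gc_seq;	/* bumped after every GC pass */

/* Writer (GC): publish the LEB number before bumping the counter. */
static void gc_publish(int lnum)
{
	atomic_store_explicit(&gced_lnum, lnum, memory_order_release);
	atomic_fetch_add_explicit(&gc_seq, 1, memory_order_release);
}

/* Reader: same structure as maybe_leb_gced() below. */
static int maybe_gced(int lnum, int seq_seen)
{
	int lnum_now = atomic_load_explicit(&gced_lnum, memory_order_acquire);
	int seq_now = atomic_load_explicit(&gc_seq, memory_order_acquire);

	if (seq_seen == seq_now)
		return 0;	/* no GC pass since the sequence was sampled */
	if (seq_seen + 1 != seq_now)
		return 1;	/* more than one pass: assume the worst */
	if (lnum_now != atomic_load_explicit(&gced_lnum, memory_order_acquire))
		return 1;	/* raced with yet another pass */
	return lnum_now == lnum;	/* exactly one pass: compare the LEB */
}

int main(void)
{
	int seq = atomic_load(&gc_seq);

	gc_publish(7);
	printf("LEB 7: %d\n", maybe_gced(7, seq));	/* 1: may have been GC'ed */
	printf("LEB 9: %d\n", maybe_gced(9, seq));	/* 0: untouched */
	return 0;
}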
+ */ +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) +{ + int gc_seq2, gced_lnum; + + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same seq means no GC */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check lnum */ + if (gced_lnum == lnum) + return 1; + return 0; +} + +/** + * ubifs_tnc_locate - look up a file-system node and return it and its location. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @lnum: LEB number is returned here + * @offs: offset is returned here + * + * This function looks up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs. + */ +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs) +{ + int found, n, err, safely = 0, gc_seq1; + struct ubifs_znode *znode; + struct ubifs_zbranch zbr, *zt; + +again: + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out; + } else if (found < 0) { + err = found; + goto out; + } + zt = &znode->zbranch[n]; + if (lnum) { + *lnum = zt->lnum; + *offs = zt->offs; + } + if (is_hash_key(c, key)) { + /* + * In this case the leaf node cache gets used, so we pass the + * address of the zbranch and keep the mutex locked + */ + err = tnc_read_node_nm(c, zt, node); + goto out; + } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* Drop the TNC mutex prematurely and race with garbage collection */ + zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } + + err = fallible_read_node(c, key, &zbr, node); + if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. + */ + safely = 1; + goto again; + } + return 0; + +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_get_bu_keys - lookup keys for bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * Lookup consecutive data node keys for the same inode that reside + * consecutively in the same LEB. This function returns zero in case of success + * and a negative error code in case of failure. + * + * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function + * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares + * maximum possible amount of nodes for bulk-read. 
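For context, a hedged sketch of how a caller strings the two TNC bulk-read helpers together; 'data_key_init()' and 'c->max_bu_buf_len' are assumed from elsewhere in UBIFS, and the real read-path caller is not shown here:

/* Sketch only: collect consecutive data-node keys, then read them in one go. */
static int bulk_read_sketch(struct ubifs_info *c, struct bu_info *bu,
			    ino_t inum, unsigned int first_block)
{
	int err;

	data_key_init(c, &bu->key, inum, first_block);	/* first data key */
	bu->buf_len = c->max_bu_buf_len;		/* total byte budget */

	err = ubifs_tnc_get_bu_keys(c, bu);		/* collect zbranches */
	if (err || !bu->cnt)
		return err;				/* error or nothing contiguous */

	bu->buf = kmalloc(bu->buf_len, GFP_NOFS);
	if (!bu->buf)
		return 0;				/* fall back to single-node reads */

	err = ubifs_tnc_bulk_read(c, bu);		/* one LEB read plus validation */
	if (err == -EAGAIN)
		err = 0;				/* raced with GC: caller falls back */

	kfree(bu->buf);
	bu->buf = NULL;
	return err;
}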
+ */ +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) +{ + int n, err = 0, lnum = -1, uninitialized_var(offs); + int uninitialized_var(len); + unsigned int block = key_block(c, &bu->key); + struct ubifs_znode *znode; + + bu->cnt = 0; + bu->blk_cnt = 0; + bu->eof = 0; + + mutex_lock(&c->tnc_mutex); + /* Find first key */ + err = ubifs_lookup_level0(c, &bu->key, &znode, &n); + if (err < 0) + goto out; + if (err) { + /* Key found */ + len = znode->zbranch[n].len; + /* The buffer must be big enough for at least 1 node */ + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + /* Add this key */ + bu->zbranch[bu->cnt++] = znode->zbranch[n]; + bu->blk_cnt += 1; + lnum = znode->zbranch[n].lnum; + offs = ALIGN(znode->zbranch[n].offs + len, 8); + } + while (1) { + struct ubifs_zbranch *zbr; + union ubifs_key *key; + unsigned int next_block; + + /* Find next key */ + err = tnc_next(c, &znode, &n); + if (err) + goto out; + zbr = &znode->zbranch[n]; + key = &zbr->key; + /* See if there is another data key for this file */ + if (key_inum(c, key) != key_inum(c, &bu->key) || + key_type(c, key) != UBIFS_DATA_KEY) { + err = -ENOENT; + goto out; + } + if (lnum < 0) { + /* First key found */ + lnum = zbr->lnum; + offs = ALIGN(zbr->offs + zbr->len, 8); + len = zbr->len; + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + } else { + /* + * The data nodes must be in consecutive positions in + * the same LEB. + */ + if (zbr->lnum != lnum || zbr->offs != offs) + goto out; + offs += ALIGN(zbr->len, 8); + len = ALIGN(len, 8) + zbr->len; + /* Must not exceed buffer length */ + if (len > bu->buf_len) + goto out; + } + /* Allow for holes */ + next_block = key_block(c, key); + bu->blk_cnt += (next_block - block - 1); + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + block = next_block; + /* Add this key */ + bu->zbranch[bu->cnt++] = *zbr; + bu->blk_cnt += 1; + /* See if we have room for more */ + if (bu->cnt >= UBIFS_MAX_BULK_READ) + goto out; + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + } +out: + if (err == -ENOENT) { + bu->eof = 1; + err = 0; + } + bu->gc_seq = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + /* + * An enormous hole could cause bulk-read to encompass too many + * page cache pages, so limit the number here. + */ + if (bu->blk_cnt > UBIFS_MAX_BULK_READ) + bu->blk_cnt = UBIFS_MAX_BULK_READ; + /* + * Ensure that bulk-read covers a whole number of page cache + * pages. + */ + if (UBIFS_BLOCKS_PER_PAGE == 1 || + !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1))) + return 0; + if (bu->eof) { + /* At the end of file we can round up */ + bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1; + return 0; + } + /* Exclude data nodes that do not make up a whole page cache page */ + block = key_block(c, &bu->key) + bu->blk_cnt; + block &= ~(UBIFS_BLOCKS_PER_PAGE - 1); + while (bu->cnt) { + if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block) + break; + bu->cnt -= 1; + } + return 0; +} + +/** + * read_wbuf - bulk-read from a LEB with a wbuf. + * @wbuf: wbuf that may overlap the read + * @buf: buffer into which to read + * @len: read length + * @lnum: LEB number from which to read + * @offs: offset from which to read + * + * This functions returns %0 on success or a negative error code on failure. 
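+ *
+ * If the requested range overlaps the write-buffer, the part of the range
+ * which precedes @wbuf->offs (if any) is read from the flash media and the
+ * rest is copied straight out of @wbuf->buf, because that data has not
+ * reached the media yet.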
+ */ +static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, + int offs) +{ + const struct ubifs_info *c = wbuf->c; + int rlen, overlap; + + dbg_io("LEB %d:%d, length %d", lnum, offs, len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(offs + len <= c->leb_size); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubi_read(c->ubi, lnum, buf, offs, len); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) + /* Read everything that goes before write-buffer */ + return ubi_read(c->ubi, lnum, buf, offs, rlen); + + return 0; +} + +/** + * validate_data_node - validate data nodes for bulk-read. + * @c: UBIFS file-system description object + * @buf: buffer containing data node to validate + * @zbr: zbranch of data node to validate + * + * This functions returns %0 on success or a negative error code on failure. + */ +static int validate_data_node(struct ubifs_info *c, void *buf, + struct ubifs_zbranch *zbr) +{ + union ubifs_key key1; + struct ubifs_ch *ch = buf; + int err, len; + + if (ch->node_type != UBIFS_DATA_NODE) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, UBIFS_DATA_NODE); + goto out_err; + } + + err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", UBIFS_DATA_NODE); + goto out; + } + + len = le32_to_cpu(ch->len); + if (len != zbr->len) { + ubifs_err("bad node length %d, expected %d", len, zbr->len); + goto out_err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, buf + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, &zbr->key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(&zbr->key), DBGKEY1(&key1)); + goto out_err; + } + + return 0; + +out_err: + err = -EINVAL; +out: + ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return err; +} + +/** + * ubifs_tnc_bulk_read - read a number of data nodes in one go. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * This functions reads and validates the data nodes that were identified by the + * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success, + * -EAGAIN to indicate a race with GC, or another negative error code on + * failure. 
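+ *
+ * The nodes described by @bu->zbranch[] are expected to sit consecutively in
+ * one LEB, which is what 'ubifs_tnc_get_bu_keys()' arranges, so the whole
+ * range is read in a single read operation and then validated node by node.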
+ */ +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) +{ + int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i; + struct ubifs_wbuf *wbuf; + void *buf; + + len = bu->zbranch[bu->cnt - 1].offs; + len += bu->zbranch[bu->cnt - 1].len - offs; + if (len > bu->buf_len) { + ubifs_err("buffer too small %d vs %d", bu->buf_len, len); + return -EINVAL; + } + + /* Do the read */ + wbuf = ubifs_get_wbuf(c, lnum); + if (wbuf) + err = read_wbuf(wbuf, bu->buf, len, lnum, offs); + else + err = ubi_read(c->ubi, lnum, bu->buf, offs, len); + + /* Check for a race with GC */ + if (maybe_leb_gced(c, lnum, bu->gc_seq)) + return -EAGAIN; + + if (err && err != -EBADMSG) { + ubifs_err("failed to read from LEB %d:%d, error %d", + lnum, offs, err); + dbg_dump_stack(); + dbg_tnc("key %s", DBGKEY(&bu->key)); + return err; + } + + /* Validate the nodes read */ + buf = bu->buf; + for (i = 0; i < bu->cnt; i++) { + err = validate_data_node(c, buf, &bu->zbranch[i]); + if (err) + return err; + buf = buf + ALIGN(bu->zbranch[i].len, 8); + } + + return 0; +} + +/** + * do_lookup_nm- look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int found, n, err; + struct ubifs_znode *znode; + + dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out_unlock; + } else if (found < 0) { + err = found; + goto out_unlock; + } + + ubifs_assert(n >= 0); + + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + if (err == 0) { + err = -ENOENT; + goto out_unlock; + } + + err = tnc_read_node_nm(c, &znode->zbranch[n], node); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_lookup_nm - look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int err, len; + const struct ubifs_dent_node *dent = node; + + /* + * We assume that in most of the cases there are no name collisions and + * 'ubifs_tnc_lookup()' returns us the right direntry. 
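+ * Since 'ubifs_tnc_lookup()' has already matched the key, comparing the name
+ * length and the name itself is enough to detect whether we got the entry we
+ * were asked for.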
+ */ + err = ubifs_tnc_lookup(c, key, node); + if (err) + return err; + + len = le16_to_cpu(dent->nlen); + if (nm->len == len && !memcmp(dent->name, nm->name, len)) + return 0; + + /* + * Unluckily, there are hash collisions and we have to iterate over + * them look at each direntry with colliding name hash sequentially. + */ + return do_lookup_nm(c, key, node, nm); +} + +/** + * correct_parent_keys - correct parent znodes' keys. + * @c: UBIFS file-system description object + * @znode: znode to correct parent znodes for + * + * This is a helper function for 'tnc_insert()'. When the key of the leftmost + * zbranch changes, keys of parent znodes have to be corrected. This helper + * function is called in such situations and corrects the keys if needed. + */ +static void correct_parent_keys(const struct ubifs_info *c, + struct ubifs_znode *znode) +{ + union ubifs_key *key, *key1; + + ubifs_assert(znode->parent); + ubifs_assert(znode->iip == 0); + + key = &znode->zbranch[0].key; + key1 = &znode->parent->zbranch[0].key; + + while (keys_cmp(c, key, key1) < 0) { + key_copy(c, key, key1); + znode = znode->parent; + znode->alt = 1; + if (!znode->parent || znode->iip) + break; + key1 = &znode->parent->zbranch[0].key; + } +} + +/** + * insert_zbranch - insert a zbranch into a znode. + * @znode: znode into which to insert + * @zbr: zbranch to insert + * @n: slot number to insert to + * + * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in + * znode's array of zbranches and keeps zbranches consolidated, so when a new + * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th + * slot, zbranches starting from @n have to be moved right. + */ +static void insert_zbranch(struct ubifs_znode *znode, + const struct ubifs_zbranch *zbr, int n) +{ + int i; + + ubifs_assert(ubifs_zn_dirty(znode)); + + if (znode->level) { + for (i = znode->child_cnt; i > n; i--) { + znode->zbranch[i] = znode->zbranch[i - 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + if (zbr->znode) + zbr->znode->iip = n; + } else + for (i = znode->child_cnt; i > n; i--) + znode->zbranch[i] = znode->zbranch[i - 1]; + + znode->zbranch[n] = *zbr; + znode->child_cnt += 1; + + /* + * After inserting at slot zero, the lower bound of the key range of + * this znode may have changed. If this znode is subsequently split + * then the upper bound of the key range may change, and furthermore + * it could change to be lower than the original lower bound. If that + * happens, then it will no longer be possible to find this znode in the + * TNC using the key from the index node on flash. That is bad because + * if it is not found, we will assume it is obsolete and may overwrite + * it. Then if there is an unclean unmount, we will start using the + * old index which will be broken. + * + * So we first mark znodes that have insertions at slot zero, and then + * if they are split we add their lnum/offs to the old_idx tree. + */ + if (n == 0) + znode->alt = 1; +} + +/** + * tnc_insert - insert a node into TNC. + * @c: UBIFS file-system description object + * @znode: znode to insert into + * @zbr: branch to insert + * @n: slot number to insert new zbranch to + * + * This function inserts a new node described by @zbr into znode @znode. If + * znode does not have a free slot for new zbranch, it is split. Parent znodes + * are splat as well if needed. Returns zero in case of success or a negative + * error code in case of failure. 
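+ *
+ * For example, splitting a full znode normally keeps about half of the
+ * zbranches in the original znode and moves the rest into a newly allocated
+ * znode, which is then inserted into the parent. The parent may have to be
+ * split in turn, and if the split propagates all the way up, a new root with
+ * two children is created and the tree grows by one level.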
+ */ +static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, + struct ubifs_zbranch *zbr, int n) +{ + struct ubifs_znode *zn, *zi, *zp; + int i, keep, move, appending = 0; + union ubifs_key *key = &zbr->key, *key1; + + ubifs_assert(n >= 0 && n <= c->fanout); + + /* Implement naive insert for now */ +again: + zp = znode->parent; + if (znode->child_cnt < c->fanout) { + ubifs_assert(n != c->fanout); + dbg_tnc("inserted at %d level %d, key %s", n, znode->level, + DBGKEY(key)); + + insert_zbranch(znode, zbr, n); + + /* Ensure parent's key is correct */ + if (n == 0 && zp && znode->iip == 0) + correct_parent_keys(c, znode); + + return 0; + } + + /* + * Unfortunately, @znode does not have more empty slots and we have to + * split it. + */ + dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); + + if (znode->alt) + /* + * We can no longer be sure of finding this znode by key, so we + * record it in the old_idx tree. + */ + ins_clr_old_idx_znode(c, znode); + + zn = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zn) + return -ENOMEM; + zn->parent = zp; + zn->level = znode->level; + + /* Decide where to split */ + if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) { + /* Try not to split consecutive data keys */ + if (n == c->fanout) { + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) + appending = 1; + } else + goto check_split; + } else if (appending && n != c->fanout) { + /* Try not to split consecutive data keys */ + appending = 0; +check_split: + if (n >= (c->fanout + 1) / 2) { + key1 = &znode->zbranch[0].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) { + key1 = &znode->zbranch[n].key; + if (key_inum(c, key1) != key_inum(c, key) || + key_type(c, key1) != UBIFS_DATA_KEY) { + keep = n; + move = c->fanout - keep; + zi = znode; + goto do_split; + } + } + } + } + + if (appending) { + keep = c->fanout; + move = 0; + } else { + keep = (c->fanout + 1) / 2; + move = c->fanout - keep; + } + + /* + * Although we don't at present, we could look at the neighbors and see + * if we can move some zbranches there. 
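+ *
+ * Note, in the appending case the split point chosen above is the very end
+ * of the znode (keep == c->fanout, move == 0): the new data key starts a
+ * znode of its own, and the znodes holding the already-written consecutive
+ * data keys stay completely full.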
+ */ + + if (n < keep) { + /* Insert into existing znode */ + zi = znode; + move += 1; + keep -= 1; + } else { + /* Insert into new znode */ + zi = zn; + n -= keep; + /* Re-parent */ + if (zn->level != 0) + zbr->znode->parent = zn; + } + +do_split: + + __set_bit(DIRTY_ZNODE, &zn->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zn->child_cnt = move; + znode->child_cnt = keep; + + dbg_tnc("moving %d, keeping %d", move, keep); + + /* Move zbranch */ + for (i = 0; i < move; i++) { + zn->zbranch[i] = znode->zbranch[keep + i]; + /* Re-parent */ + if (zn->level != 0) + if (zn->zbranch[i].znode) { + zn->zbranch[i].znode->parent = zn; + zn->zbranch[i].znode->iip = i; + } + } + + /* Insert new key and branch */ + dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); + + insert_zbranch(zi, zbr, n); + + /* Insert new znode (produced by spitting) into the parent */ + if (zp) { + if (n == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + + /* Locate insertion point */ + n = znode->iip + 1; + + /* Tail recursion */ + zbr->key = zn->zbranch[0].key; + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + znode = zp; + + goto again; + } + + /* We have to split root znode */ + dbg_tnc("creating new zroot at level %d", znode->level + 1); + + zi = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zi) + return -ENOMEM; + + zi->child_cnt = 2; + zi->level = znode->level + 1; + + __set_bit(DIRTY_ZNODE, &zi->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zi->zbranch[0].key = znode->zbranch[0].key; + zi->zbranch[0].znode = znode; + zi->zbranch[0].lnum = c->zroot.lnum; + zi->zbranch[0].offs = c->zroot.offs; + zi->zbranch[0].len = c->zroot.len; + zi->zbranch[1].key = zn->zbranch[0].key; + zi->zbranch[1].znode = zn; + + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + c->zroot.znode = zi; + + zn->parent = zi; + zn->iip = 1; + znode->parent = zi; + znode->iip = 0; + + return 0; +} + +/** + * ubifs_tnc_add - add a node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function adds a node with key @key to TNC. The node may be new or it may + * obsolete some existing one. Returns %0 on success or negative error code on + * failure. + */ +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + } else if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else + err = found; + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + + return err; +} + +/** + * ubifs_tnc_replace - replace a node in the TNC only if the old node is found. + * @c: UBIFS file-system description object + * @key: key to add + * @old_lnum: LEB number of old node + * @old_offs: old node offset + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function replaces a node with key @key in the TNC only if the old node + * is found. 
This function is called by garbage collection when node are moved. + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, + old_offs, lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + found = 0; + if (zbr->lnum == old_lnum && zbr->offs == old_offs) { + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + found = 1; + } else if (is_hash_key(c, key)) { + found = resolve_collision_directly(c, key, &znode, &n, + old_lnum, old_offs); + dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d", + found, znode, n, old_lnum, old_offs); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + zbr = &znode->zbranch[n]; + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, + zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } + } + } + + if (!found) + err = ubifs_add_dirt(c, lnum, len); + + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_add_nm - add a "hashed" node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * @nm: node name + * + * This is the same as 'ubifs_tnc_add()' but it should be used with keys which + * may have collisions, like directory entry keys. + */ +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, + DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + if (c->replaying) + found = fallible_resolve_collision(c, key, &znode, &n, + nm, 1); + else + found = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n); + if (found < 0) { + err = found; + goto out_unlock; + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + goto out_unlock; + } + } + + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + if (err) + goto out_unlock; + if (c->replaying) { + /* + * We did not find it in the index so there may be a + * dangling branch still in the index. 
So we remove it + * by passing 'ubifs_tnc_remove_nm()' the same key but + * an unmatchable name. + */ + struct qstr noname = { .len = 0, .name = "" }; + + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + return ubifs_tnc_remove_nm(c, key, &noname); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * tnc_delete - delete a znode form TNC. + * @c: UBIFS file-system description object + * @znode: znode to delete from + * @n: zbranch slot number to delete + * + * This function deletes a leaf node from @n-th slot of @znode. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *zp; + int i, err; + + /* Delete without merge for now */ + ubifs_assert(znode->level == 0); + ubifs_assert(n >= 0 && n < c->fanout); + dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); + + zbr = &znode->zbranch[n]; + lnc_free(zbr); + + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) { + dbg_dump_znode(c, znode); + return err; + } + + /* We do not "gap" zbranch slots */ + for (i = n; i < znode->child_cnt - 1; i++) + znode->zbranch[i] = znode->zbranch[i + 1]; + znode->child_cnt -= 1; + + if (znode->child_cnt > 0) + return 0; + + /* + * This was the last zbranch, we have to delete this znode from the + * parent. + */ + + do { + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_dirty(znode)); + + zp = znode->parent; + n = znode->iip; + + atomic_long_dec(&c->dirty_zn_cnt); + + err = insert_old_idx_znode(c, znode); + if (err) + return err; + + if (znode->cnext) { + __set_bit(OBSOLETE_ZNODE, &znode->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(znode); + znode = zp; + } while (znode->child_cnt == 1); /* while removing last child */ + + /* Remove from znode, entry n - 1 */ + znode->child_cnt -= 1; + ubifs_assert(znode->level != 0); + for (i = n; i < znode->child_cnt; i++) { + znode->zbranch[i] = znode->zbranch[i + 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + + /* + * If this is the root and it has only 1 child then + * collapse the tree. + */ + if (!znode->parent) { + while (znode->child_cnt == 1 && znode->level != 0) { + zp = znode; + zbr = &znode->zbranch[0]; + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode->parent = NULL; + znode->iip = 0; + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + if (err) + return err; + } + c->zroot.lnum = zbr->lnum; + c->zroot.offs = zbr->offs; + c->zroot.len = zbr->len; + c->zroot.znode = znode; + ubifs_assert(!test_bit(OBSOLETE_ZNODE, + &zp->flags)); + ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags)); + atomic_long_dec(&c->dirty_zn_cnt); + + if (zp->cnext) { + __set_bit(OBSOLETE_ZNODE, &zp->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(zp); + } + } + + return 0; +} + +/** + * ubifs_tnc_remove - remove an index entry of a node. + * @c: UBIFS file-system description object + * @key: key of node + * + * Returns %0 on success or negative error code on failure. 
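+ *
+ * If the removed index entry was the last one in its znode, the znode itself
+ * is deleted from its parent, and this may repeat further up the tree; if the
+ * root znode is left with a single child, the tree is collapsed by one level.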
+ */ +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("key %s", DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + if (found == 1) + err = tnc_delete(c, znode, n); + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node. + * @c: UBIFS file-system description object + * @key: key of node + * @nm: directory entry name + * + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm) +{ + int n, err; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); + err = lookup_level0_dirty(c, key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (err < 0) + goto out_unlock; + if (err) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + err = tnc_delete(c, znode, n); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * key_in_range - determine if a key falls within a range of keys. + * @c: UBIFS file-system description object + * @key: key to check + * @from_key: lowest key in range + * @to_key: highest key in range + * + * This function returns %1 if the key is in range and %0 otherwise. + */ +static int key_in_range(struct ubifs_info *c, union ubifs_key *key, + union ubifs_key *from_key, union ubifs_key *to_key) +{ + if (keys_cmp(c, key, from_key) < 0) + return 0; + if (keys_cmp(c, key, to_key) > 0) + return 0; + return 1; +} + +/** + * ubifs_tnc_remove_range - remove index entries in range. + * @c: UBIFS file-system description object + * @from_key: lowest key to remove + * @to_key: highest key to remove + * + * This function removes index entries starting at @from_key and ending at + * @to_key. This function returns zero in case of success and a negative error + * code in case of failure. 
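+ *
+ * The removal proceeds level-0 znode by level-0 znode: all in-range keys of a
+ * znode except the first are dropped by shifting the zbranch array, the first
+ * one is then removed with 'tnc_delete()', and the lookup is repeated until
+ * no keys in the range are left.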
+ */ +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key) +{ + int i, n, k, err = 0; + struct ubifs_znode *znode; + union ubifs_key *key; + + mutex_lock(&c->tnc_mutex); + while (1) { + /* Find first level 0 znode that contains keys to remove */ + err = ubifs_lookup_level0(c, from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) + key = from_key; + else { + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, from_key, to_key)) { + err = 0; + goto out_unlock; + } + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + /* Remove all keys in range except the first */ + for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) { + key = &znode->zbranch[i].key; + if (!key_in_range(c, key, from_key, to_key)) + break; + lnc_free(&znode->zbranch[i]); + err = ubifs_add_dirt(c, znode->zbranch[i].lnum, + znode->zbranch[i].len); + if (err) { + dbg_dump_znode(c, znode); + goto out_unlock; + } + dbg_tnc("removing %s", DBGKEY(key)); + } + if (k) { + for (i = n + 1 + k; i < znode->child_cnt; i++) + znode->zbranch[i - k] = znode->zbranch[i]; + znode->child_cnt -= k; + } + + /* Now delete the first */ + err = tnc_delete(c, znode, n); + if (err) + goto out_unlock; + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_ino - remove an inode from TNC. + * @c: UBIFS file-system description object + * @inum: inode number to remove + * + * This function remove inode @inum and all the extended attributes associated + * with the anode from TNC and returns zero in case of success or a negative + * error code in case of failure. + */ +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) +{ + union ubifs_key key1, key2; + struct ubifs_dent_node *xent, *pxent = NULL; + struct qstr nm = { .name = NULL }; + + dbg_tnc("ino %lu", (unsigned long)inum); + + /* + * Walk all extended attribute entries and remove them together with + * corresponding extended attribute inodes. + */ + lowest_xent_key(c, &key1, inum); + while (1) { + ino_t xattr_inum; + int err; + + xent = ubifs_tnc_next_ent(c, &key1, &nm); + if (IS_ERR(xent)) { + err = PTR_ERR(xent); + if (err == -ENOENT) + break; + return err; + } + + xattr_inum = le64_to_cpu(xent->inum); + dbg_tnc("xent '%s', ino %lu", xent->name, + (unsigned long)xattr_inum); + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + err = ubifs_tnc_remove_nm(c, &key1, &nm); + if (err) { + kfree(xent); + return err; + } + + lowest_ino_key(c, &key1, xattr_inum); + highest_ino_key(c, &key2, xattr_inum); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) { + kfree(xent); + return err; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key1); + } + + kfree(pxent); + lowest_ino_key(c, &key1, inum); + highest_ino_key(c, &key2, inum); + + return ubifs_tnc_remove_range(c, &key1, &key2); +} + +/** + * ubifs_tnc_next_ent - walk directory or extended attribute entries. + * @c: UBIFS file-system description object + * @key: key of last entry + * @nm: name of last entry found or %NULL + * + * This function finds and reads the next directory or extended attribute entry + * after the given key (@key) if there is one. @nm is used to resolve + * collisions. 
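+ * This is how, for example, 'ubifs_tnc_remove_ino()' above walks all extended
+ * attribute entries of an inode, passing back the name of the previously
+ * returned entry on every iteration.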
+ * + * If the name of the current entry is not known and only the key is known, + * @nm->name has to be %NULL. In this case the semantics of this function is a + * little bit different and it returns the entry corresponding to this key, not + * the next one. If the key was not found, the closest "right" entry is + * returned. + * + * If the fist entry has to be found, @key has to contain the lowest possible + * key value for this inode and @name has to be %NULL. + * + * This function returns the found directory or extended attribute entry node + * in case of success, %-ENOENT is returned if no entry was found, and a + * negative error code is returned in case of failure. + */ +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm) +{ + int n, err, type = key_type(c, key); + struct ubifs_znode *znode; + struct ubifs_dent_node *dent; + struct ubifs_zbranch *zbr; + union ubifs_key *dkey; + + dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); + ubifs_assert(is_hash_key(c, key)); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, key, &znode, &n); + if (unlikely(err < 0)) + goto out_unlock; + + if (nm->name) { + if (err) { + /* Handle collisions */ + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", + err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + } + + /* Now find next entry */ + err = tnc_next(c, &znode, &n); + if (unlikely(err)) + goto out_unlock; + } else { + /* + * The full name of the entry was not given, in which case the + * behavior of this function is a little different and it + * returns current entry, not the next one. + */ + if (!err) { + /* + * However, the given key does not exist in the TNC + * tree and @znode/@n variables contain the closest + * "preceding" element. Switch to the next one. + */ + err = tnc_next(c, &znode, &n); + if (err) + goto out_unlock; + } + } + + zbr = &znode->zbranch[n]; + dent = kmalloc(zbr->len, GFP_NOFS); + if (unlikely(!dent)) { + err = -ENOMEM; + goto out_unlock; + } + + /* + * The above 'tnc_next()' call could lead us to the next inode, check + * this. + */ + dkey = &zbr->key; + if (key_inum(c, dkey) != key_inum(c, key) || + key_type(c, dkey) != type) { + err = -ENOENT; + goto out_free; + } + + err = tnc_read_node_nm(c, zbr, dent); + if (unlikely(err)) + goto out_free; + + mutex_unlock(&c->tnc_mutex); + return dent; + +out_free: + kfree(dent); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return ERR_PTR(err); +} + +/** + * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit. + * @c: UBIFS file-system description object + * + * Destroy left-over obsolete znodes from a failed commit. + */ +static void tnc_destroy_cnext(struct ubifs_info *c) +{ + struct ubifs_znode *cnext; + + if (!c->cnext) + return; + ubifs_assert(c->cmt_state == COMMIT_BROKEN); + cnext = c->cnext; + do { + struct ubifs_znode *znode = cnext; + + cnext = cnext->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + } while (cnext && cnext != c->cnext); +} + +/** + * ubifs_tnc_close - close TNC subsystem and free all related resources. 
+ * @c: UBIFS file-system description object + */ +void ubifs_tnc_close(struct ubifs_info *c) +{ + tnc_destroy_cnext(c); + if (c->zroot.znode) { + long n; + + ubifs_destroy_tnc_subtree(c->zroot.znode); + n = atomic_long_read(&c->clean_zn_cnt); + atomic_long_sub(n, &ubifs_clean_zn_cnt); + } + kfree(c->gap_lebs); + kfree(c->ilebs); + destroy_old_idx(c); +} + +/** + * left_znode - get the znode to the left. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the left of @znode or NULL if + * there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *left_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip - 1; + + /* Go up until we can go left */ + znode = znode->parent; + if (!znode) + return NULL; + if (n >= 0) { + /* Now go down the rightmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + n = znode->child_cnt - 1; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * right_znode - get the znode to the right. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the right of @znode or NULL + * if there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *right_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip + 1; + + /* Go up until we can go right */ + znode = znode->parent; + if (!znode) + return NULL; + if (n < znode->child_cnt) { + /* Now go down the leftmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * lookup_znode - find a particular indexing node from TNC. + * @c: UBIFS file-system description object + * @key: index node key to lookup + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function searches an indexing node by its first key @key and its + * address @lnum:@offs. It looks up the indexing tree by pulling all indexing + * nodes it traverses to TNC. This function is called for indexing nodes which + * were found on the media by scanning, for example when garbage-collecting or + * when doing in-the-gaps commit. This means that the indexing node which is + * looked for does not have to have exactly the same leftmost key @key, because + * the leftmost key may have been changed, in which case TNC will contain a + * dirty znode which still refers the same @lnum:@offs. This function is clever + * enough to recognize such indexing nodes. + * + * Note, if a znode was deleted or changed too much, then this function will + * not find it. For situations like this UBIFS has the old index RB-tree + * (indexed by @lnum:@offs). + * + * This function returns a pointer to the znode found or %NULL if it is not + * found. A negative error code is returned on failure. 
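+ *
+ * The lookup descends to level @level + 1 and checks whether the zbranch at
+ * that position points to @lnum:@offs. For non-unique (hashed) keys the
+ * neighbouring zbranches to the left and to the right are checked as well,
+ * because zbranches with the same key may end up in sibling znodes.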
+ */ +static struct ubifs_znode *lookup_znode(struct ubifs_info *c, + union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode, *zn; + int n, nn; + + ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); + + /* + * The arguments have probably been read off flash, so don't assume + * they are valid. + */ + if (level < 0) + return ERR_PTR(-EINVAL); + + /* Get the root znode */ + znode = c->zroot.znode; + if (!znode) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return znode; + } + /* Check if it is the one we are looking for */ + if (c->zroot.lnum == lnum && c->zroot.offs == offs) + return znode; + /* Descend to the parent level i.e. (level + 1) */ + if (level >= znode->level) + return NULL; + while (1) { + ubifs_search_zbranch(c, znode, key, &n); + if (n < 0) { + /* + * We reached a znode where the leftmost key is greater + * than the key we are searching for. This is the same + * situation as the one described in a huge comment at + * the end of the 'ubifs_lookup_level0()' function. And + * for exactly the same reasons we have to try to look + * left before giving up. + */ + znode = left_znode(c, znode); + if (!znode) + return NULL; + if (IS_ERR(znode)) + return znode; + ubifs_search_zbranch(c, znode, key, &n); + ubifs_assert(n >= 0); + } + if (znode->level == level + 1) + break; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + /* Check if the child is the one we are looking for */ + if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* If the key is unique, there is nowhere else to look */ + if (!is_hash_key(c, key)) + return NULL; + /* + * The key is not unique and so may be also in the znodes to either + * side. + */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + /* Move one branch to the left */ + if (n) + n -= 1; + else { + znode = left_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = znode->child_cnt - 1; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is less than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) < 0) + break; + } + /* Back to the middle */ + znode = zn; + n = nn; + /* Look right */ + while (1) { + /* Move one branch to the right */ + if (++n >= znode->child_cnt) { + znode = right_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = 0; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is greater than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) > 0) + break; + } + return NULL; +} + +/** + * is_idx_node_in_tnc - determine if an index node is in the TNC. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * This function returns %0 if the index node is not referred to in the TNC, %1 + * if the index node is referred to in the TNC and the corresponding znode is + * dirty, %2 if an index node is referred to in the TNC and the corresponding + * znode is clean, and a negative error code in case of failure. + * + * Note, the @key argument has to be the key of the first child. 
Also note, + * this function relies on the fact that 0:0 is never a valid LEB number and + * offset for a main-area node. + */ +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + return 0; + if (IS_ERR(znode)) + return PTR_ERR(znode); + + return ubifs_zn_dirty(znode) ? 1 : 2; +} + +/** + * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @lnum: node LEB number + * @offs: node offset + * + * This function returns %1 if the node is referred to in the TNC, %0 if it is + * not, and a negative error code in case of failure. + * + * Note, this function relies on the fact that 0:0 is never a valid LEB number + * and offset for a main-area node. + */ +static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, + int lnum, int offs) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *zn; + int n, found, err, nn; + const int unique = !is_hash_key(c, key); + + found = ubifs_lookup_level0(c, key, &znode, &n); + if (found < 0) + return found; /* Error code */ + if (!found) + return 0; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + if (unique) + return 0; + /* + * Because the key is not unique, we have to look left + * and right as well + */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &n); + if (err == -ENOENT) + break; + if (err) + return err; + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + /* Look right */ + znode = zn; + n = nn; + while (1) { + err = tnc_next(c, &znode, &n); + if (err) { + if (err == -ENOENT) + return 0; + return err; + } + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + return 0; +} + +/** + * ubifs_tnc_has_node - determine whether a node is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @level: index node level (if it is an index node) + * @lnum: node LEB number + * @offs: node offset + * @is_idx: non-zero if the node is an index node + * + * This function returns %1 if the node is in the TNC, %0 if it is not, and a + * negative error code in case of failure. For index nodes, @key has to be the + * key of the first child. An index node is considered to be in the TNC only if + * the corresponding znode is clean or has not been loaded. + */ +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx) +{ + int err; + + mutex_lock(&c->tnc_mutex); + if (is_idx) { + err = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (err < 0) + goto out_unlock; + if (err == 1) + /* The index node was found but it was dirty */ + err = 0; + else if (err == 2) + /* The index node was found and it was clean */ + err = 1; + else + BUG_ON(err != 0); + } else + err = is_leaf_node_in_tnc(c, key, lnum, offs); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_dirty_idx_node - dirty an index node. 
+ * @c: UBIFS file-system description object + * @key: index node key + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function loads and dirties an index node so that it can be garbage + * collected. The @key argument has to be the key of the first child. This + * function relies on the fact that 0:0 is never a valid LEB number and offset + * for a main-area node. Returns %0 on success and a negative error code on + * failure. + */ +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int err = 0; + + mutex_lock(&c->tnc_mutex); + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + goto out_unlock; + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_inode_size - check if inode size is correct. + * @c: UBIFS file-system description object + * @inum: inode number + * @size: inode size + * + * This function makes sure that the inode size (@size) is correct and it does + * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL + * if it has a data page beyond @size, and other negative error code in case of + * other errors. + */ +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size) +{ + int err, n; + union ubifs_key from_key, to_key, *key; + struct ubifs_znode *znode; + unsigned int block; + + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &from_key, inode->i_ino, block); + highest_data_key(c, &to_key, inode->i_ino); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, &from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + err = -EINVAL; + key = &from_key; + goto out_dump; + } + + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + + ubifs_assert(err == 0); + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, &from_key, &to_key)) + goto out_unlock; + +out_dump: + block = key_block(c, key); + ubifs_err("inode %lu has size %lld, but there are data at offset %lld " + "(data key %s)", (unsigned long)inode->i_ino, size, + ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); + dbg_dump_inode(c, inode); + dbg_dump_stack(); + err = -EINVAL; + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c new file mode 100644 index 00000000..41920f35 --- /dev/null +++ b/fs/ubifs/tnc_commit.c @@ -0,0 +1,1103 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* This file implements TNC functions for committing */ + +#include "ubifs.h" + +/** + * make_idx_node - make an index node for fill-the-gaps method of TNC commit. + * @c: UBIFS file-system description object + * @idx: buffer in which to place new index node + * @znode: znode from which to make new index node + * @lnum: LEB number where new index node will be written + * @offs: offset where new index node will be written + * @len: length of new index node + */ +static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, + struct ubifs_znode *znode, int lnum, int offs, int len) +{ + struct ubifs_znode *zp; + int i, err; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err("bad ref in znode"); + dbg_dump_znode(c, znode); + if (zbr->znode) + dbg_dump_znode(c, zbr->znode); + } + } + ubifs_prepare_node(c, idx, len, 0); + +#ifdef CONFIG_UBIFS_FS_DEBUG + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; +#endif + + err = insert_old_idx_znode(c, znode); + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + + zbr = &zp->zbranch[znode->iip]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + atomic_long_dec(&c->dirty_zn_cnt); + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + + __clear_bit(DIRTY_ZNODE, &znode->flags); + __clear_bit(COW_ZNODE, &znode->flags); + + return err; +} + +/** + * fill_gap - make index nodes in gaps in dirty index LEBs. + * @c: UBIFS file-system description object + * @lnum: LEB number that gap appears in + * @gap_start: offset of start of gap + * @gap_end: offset of end of gap + * @dirt: adds dirty space to this + * + * This function returns the number of index nodes written into the gap. 
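+ *
+ * Index nodes taken from @c->enext are packed into the gap for as long as
+ * they fit. Whatever is left of the gap is filled with padding; a gap which
+ * ends at the end of the LEB is only padded up to the next @c->min_io_size
+ * boundary.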
+ */ +static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end, + int *dirt) +{ + int len, gap_remains, gap_pos, written, pad_len; + + ubifs_assert((gap_start & 7) == 0); + ubifs_assert((gap_end & 7) == 0); + ubifs_assert(gap_end >= gap_start); + + gap_remains = gap_end - gap_start; + if (!gap_remains) + return 0; + gap_pos = gap_start; + written = 0; + while (c->enext) { + len = ubifs_idx_node_sz(c, c->enext->child_cnt); + if (len < gap_remains) { + struct ubifs_znode *znode = c->enext; + const int alen = ALIGN(len, 8); + int err; + + ubifs_assert(alen <= gap_remains); + err = make_idx_node(c, c->ileb_buf + gap_pos, znode, + lnum, gap_pos, len); + if (err) + return err; + gap_remains -= alen; + gap_pos += alen; + c->enext = znode->cnext; + if (c->enext == c->cnext) + c->enext = NULL; + written += 1; + } else + break; + } + if (gap_end == c->leb_size) { + c->ileb_len = ALIGN(gap_pos, c->min_io_size); + /* Pad to end of min_io_size */ + pad_len = c->ileb_len - gap_pos; + } else + /* Pad to end of gap */ + pad_len = gap_remains; + dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d", + lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len); + ubifs_pad(c, c->ileb_buf + gap_pos, pad_len); + *dirt += pad_len; + return written; +} + +/** + * find_old_idx - find an index node obsoleted since the last commit start. + * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %1 if found and %0 otherwise. + */ +static int find_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *o; + struct rb_node *p; + + p = c->old_idx.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = p->rb_left; + else if (lnum > o->lnum) + p = p->rb_right; + else if (offs < o->offs) + p = p->rb_left; + else if (offs > o->offs) + p = p->rb_right; + else + return 1; + } + return 0; +} + +/** + * is_idx_node_in_use - determine if an index node can be overwritten. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * If @key / @lnum / @offs identify an index node that was not part of the old + * index, then this function returns %0 (obsolete). Else if the index node was + * part of the old index but is now dirty %1 is returned, else if it is clean %2 + * is returned. A negative error code is returned on failure. + */ +static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key, + int level, int lnum, int offs) +{ + int ret; + + ret = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (ret < 0) + return ret; /* Error code */ + if (ret == 0) + if (find_old_idx(c, lnum, offs)) + return 1; + return ret; +} + +/** + * layout_leb_in_gaps - layout index nodes using in-the-gaps method. + * @c: UBIFS file-system description object + * @p: return LEB number here + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * This function merely puts the next znode into the next gap, making no attempt + * to try to maximise the number of znodes that fit. + * This function returns the number of index nodes written into the gaps, or a + * negative error code on failure. 
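+ *
+ * The chosen LEB is scanned; index nodes which are still in use delimit the
+ * gaps left by obsolete ones, each gap is filled with new index nodes by
+ * 'fill_gap()', and the resulting LEB buffer is then written back with
+ * 'ubifs_leb_change()'.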
+ */ +static int layout_leb_in_gaps(struct ubifs_info *c, int *p) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written; + + tot_written = 0; + /* Get an index LEB with lots of obsolete index nodes */ + lnum = ubifs_find_dirty_idx_leb(c); + if (lnum < 0) + /* + * There also may be dirt in the index head that could be + * filled, however we do not check there at present. + */ + return lnum; /* Error code */ + *p = lnum; + dbg_gc("LEB %d", lnum); + /* + * Scan the index LEB. We use the generic scan for this even though + * it is more comprehensive and less efficient than is needed for this + * purpose. + */ + sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0); + c->ileb_len = 0; + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + gap_start = 0; + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx; + int in_use, level; + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + idx = snod->node; + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + /* Determine if the index node is in use (not obsolete) */ + in_use = is_idx_node_in_use(c, &snod->key, level, lnum, + snod->offs); + if (in_use < 0) { + ubifs_scan_destroy(sleb); + return in_use; /* Error code */ + } + if (in_use) { + if (in_use == 1) + dirt += ALIGN(snod->len, 8); + /* + * The obsolete index nodes form gaps that can be + * overwritten. This gap has ended because we have + * found an index node that is still in use + * i.e. not obsolete + */ + gap_end = snod->offs; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) { + ubifs_scan_destroy(sleb); + return written; /* Error code */ + } + tot_written += written; + gap_start = ALIGN(snod->offs + snod->len, 8); + } + } + ubifs_scan_destroy(sleb); + c->ileb_len = c->leb_size; + gap_end = c->leb_size; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) + return written; /* Error code */ + tot_written += written; + if (tot_written == 0) { + struct ubifs_lprops lp; + + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + return err; + if (lp.free == c->leb_size) { + /* + * We must have snatched this LEB from the idx_gc list + * so we need to correct the free and dirty space. + */ + err = ubifs_change_one_lp(c, lnum, + c->leb_size - c->ileb_len, + dirt, 0, 0, 0); + if (err) + return err; + } + return 0; + } + err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt, + 0, 0, 0); + if (err) + return err; + err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len, + UBI_SHORTTERM); + if (err) + return err; + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + return tot_written; +} + +/** + * get_leb_cnt - calculate the number of empty LEBs needed to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns the number of empty LEBs needed to commit @cnt znodes + * to the current index head. The number is not exact and may be more than + * needed. + */ +static int get_leb_cnt(struct ubifs_info *c, int cnt) +{ + int d; + + /* Assume maximum index node size (i.e. overestimate space needed) */ + cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz; + if (cnt < 0) + cnt = 0; + d = c->leb_size / c->max_idx_node_sz; + return DIV_ROUND_UP(cnt, d); +} + +/** + * layout_in_gaps - in-the-gaps method of committing TNC. 
+ * @c: UBIFS file-system description object + * @cnt: number of dirty znodes to commit. + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_gaps(struct ubifs_info *c, int cnt) +{ + int err, leb_needed_cnt, written, *p; + + dbg_gc("%d znodes to write", cnt); + + c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS); + if (!c->gap_lebs) + return -ENOMEM; + + p = c->gap_lebs; + do { + ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + written = layout_leb_in_gaps(c, p); + if (written < 0) { + err = written; + if (err != -ENOSPC) { + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return err; + } + if (dbg_force_in_the_gaps_enabled()) { + /* + * Do not print scary warnings if the debugging + * option which forces in-the-gaps is enabled. + */ + ubifs_warn("out of space"); + dbg_dump_budg(c, &c->bi); + dbg_dump_lprops(c); + } + /* Try to commit anyway */ + err = 0; + break; + } + p++; + cnt -= written; + leb_needed_cnt = get_leb_cnt(c, cnt); + dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt, + leb_needed_cnt, c->ileb_cnt); + } while (leb_needed_cnt > c->ileb_cnt); + + *p = -1; + return 0; +} + +/** + * layout_in_empty_space - layout index nodes in empty space. + * @c: UBIFS file-system description object + * + * This function lays out new index nodes for dirty znodes using empty LEBs. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_empty_space(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext, *zp; + int lnum, offs, len, next_len, buf_len, buf_offs, used, avail; + int wlen, blen, err; + + cnext = c->enext; + if (!cnext) + return 0; + + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + buf_len = ubifs_idx_node_sz(c, c->fanout); + buf_len = ALIGN(buf_len, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) + lnum = -1; + + while (1) { + znode = cnext; + + len = ubifs_idx_node_sz(c, znode->child_cnt); + + /* Determine the index node position */ + if (lnum == -1) { + if (c->ileb_nxt >= c->ileb_cnt) { + ubifs_err("out of space"); + return -ENOSPC; + } + lnum = c->ilebs[c->ileb_nxt++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + + offs = buf_offs + used; + +#ifdef CONFIG_UBIFS_FS_DEBUG + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; +#endif + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + int i; + + i = znode->iip; + zbr = &zp->zbranch[i]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + /* + * Once lprops is updated, we can decrease the dirty znode count + * but it is easier to just do it here. 
+ */ + atomic_long_dec(&c->dirty_zn_cnt); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + cnext = znode->cnext; + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + if (c->min_io_size == 1) { + buf_offs += ALIGN(len, 8); + if (next_len) { + if (buf_offs + next_len <= c->leb_size) + continue; + err = ubifs_update_one_lp(c, lnum, 0, + c->leb_size - buf_offs, 0, 0); + if (err) + return err; + lnum = -1; + continue; + } + err = ubifs_update_one_lp(c, lnum, + c->leb_size - buf_offs, 0, 0, 0); + if (err) + return err; + break; + } + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + if (next_len != 0 && + buf_offs + used + next_len <= c->leb_size && + avail > 0) + continue; + + if (avail <= 0 && next_len && + buf_offs + used + next_len <= c->leb_size) + blen = buf_len; + else + blen = ALIGN(wlen, c->min_io_size); + + /* The buffer is full or there are no more znodes to do */ + buf_offs += blen; + if (next_len) { + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + c->leb_size - buf_offs, blen - used, + 0, 0); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + continue; + } + err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs, + blen - used, 0, 0); + if (err) + return err; + break; + } + +#ifdef CONFIG_UBIFS_FS_DEBUG + c->dbg->new_ihead_lnum = lnum; + c->dbg->new_ihead_offs = buf_offs; +#endif + + return 0; +} + +/** + * layout_commit - determine positions of index nodes to commit. + * @c: UBIFS file-system description object + * @no_space: indicates that insufficient empty LEBs were allocated + * @cnt: number of znodes to commit + * + * Calculate and update the positions of index nodes to commit. If there were + * an insufficient number of empty LEBs allocated, then index nodes are placed + * into the gaps created by obsolete index nodes in non-empty index LEBs. For + * this purpose, an obsolete index node is one that was not in the index as at + * the end of the last commit. To write "in-the-gaps" requires that those index + * LEBs are updated atomically in-place. + */ +static int layout_commit(struct ubifs_info *c, int no_space, int cnt) +{ + int err; + + if (no_space) { + err = layout_in_gaps(c, cnt); + if (err) + return err; + } + err = layout_in_empty_space(c); + return err; +} + +/** + * find_first_dirty - find first dirty znode. + * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode) +{ + int i, cont; + + if (!znode) + return NULL; + + while (1) { + if (znode->level == 0) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + cont = 0; + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) { + znode = zbr->znode; + cont = 1; + break; + } + } + if (!cont) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + } +} + +/** + * find_next_dirty - find next dirty znode. 
+ * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode) +{ + int n = znode->iip + 1; + + znode = znode->parent; + if (!znode) + return NULL; + for (; n < znode->child_cnt; n++) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) + return find_first_dirty(zbr->znode); + } + return znode; +} + +/** + * get_znodes_to_commit - create list of dirty znodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of znodes to commit. + */ +static int get_znodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + int cnt = 0; + + c->cnext = find_first_dirty(c->zroot.znode); + znode = c->enext = c->cnext; + if (!znode) { + dbg_cmt("no znodes to commit"); + return 0; + } + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_ZNODE, &znode->flags)); + __set_bit(COW_ZNODE, &znode->flags); + znode->alt = 0; + cnext = find_next_dirty(znode); + if (!cnext) { + znode->cnext = c->cnext; + break; + } + znode->cnext = cnext; + znode = cnext; + cnt += 1; + } + dbg_cmt("committing %d znodes", cnt); + ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt)); + return cnt; +} + +/** + * alloc_idx_lebs - allocate empty LEBs to be used to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns %-ENOSPC if it cannot allocate a sufficient number of + * empty LEBs. %0 is returned on success, otherwise a negative error code + * is returned. + */ +static int alloc_idx_lebs(struct ubifs_info *c, int cnt) +{ + int i, leb_cnt, lnum; + + c->ileb_cnt = 0; + c->ileb_nxt = 0; + leb_cnt = get_leb_cnt(c, cnt); + dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt); + if (!leb_cnt) + return 0; + c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS); + if (!c->ilebs) + return -ENOMEM; + for (i = 0; i < leb_cnt; i++) { + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) + return lnum; + c->ilebs[c->ileb_cnt++] = lnum; + dbg_cmt("LEB %d", lnum); + } + if (dbg_force_in_the_gaps()) + return -ENOSPC; + return 0; +} + +/** + * free_unused_idx_lebs - free unused LEBs that were allocated for the commit. + * @c: UBIFS file-system description object + * + * It is possible that we allocate more empty LEBs for the commit than we need. + * This functions frees the surplus. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_unused_idx_lebs(struct ubifs_info *c) +{ + int i, err = 0, lnum, er; + + for (i = c->ileb_nxt; i < c->ileb_cnt; i++) { + lnum = c->ilebs[i]; + dbg_cmt("LEB %d", lnum); + er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX | LPROPS_TAKEN, 0); + if (!err) + err = er; + } + return err; +} + +/** + * free_idx_lebs - free unused LEBs after commit end. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_idx_lebs(struct ubifs_info *c) +{ + int err; + + err = free_unused_idx_lebs(c); + kfree(c->ilebs); + c->ilebs = NULL; + return err; +} + +/** + * ubifs_tnc_start_commit - start TNC commit. + * @c: UBIFS file-system description object + * @zroot: new index root position is returned here + * + * This function prepares the list of indexing nodes to commit and lays out + * their positions on flash. If there is not enough free space it uses the + * in-gap commit method. 
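[Illustrative aside, not part of the patch] One detail worth spelling out: get_znodes_to_commit() links the dirty znodes into a ring through their ->cnext pointers, with the last entry pointing back at the first (c->cnext), so later stages such as write_index() and free_obsolete_znodes() simply walk until they arrive back at the head. The toy model below shows only that traversal pattern, with an invented node type.

#include <stdio.h>

/* Toy stand-in for znodes on the commit list */
struct node {
	int id;
	struct node *cnext;
};

int main(void)
{
	struct node n[3];
	struct node *head = &n[0], *cur;

	/* Build the ring 0 -> 1 -> 2 -> 0, as the commit list is built */
	for (int i = 0; i < 3; i++) {
		n[i].id = i;
		n[i].cnext = &n[(i + 1) % 3];
	}

	/* Walk around the ring exactly once, stopping back at the head */
	cur = head;
	do {
		printf("committing znode %d\n", cur->id);
		cur = cur->cnext;
	} while (cur != head);

	return 0;
}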
Returns zero in case of success and a negative error + * code in case of failure. + */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int err = 0, cnt; + + mutex_lock(&c->tnc_mutex); + err = dbg_check_tnc(c, 1); + if (err) + goto out; + cnt = get_znodes_to_commit(c); + if (cnt != 0) { + int no_space = 0; + + err = alloc_idx_lebs(c, cnt); + if (err == -ENOSPC) + no_space = 1; + else if (err) + goto out_free; + err = layout_commit(c, no_space, cnt); + if (err) + goto out_free; + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + err = free_unused_idx_lebs(c); + if (err) + goto out; + } + destroy_old_idx(c); + memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch)); + + err = ubifs_save_dirty_idx_lnums(c); + if (err) + goto out; + + spin_lock(&c->space_lock); + /* + * Although we have not finished committing yet, update size of the + * committed index ('c->bi.old_idx_sz') and zero out the index growth + * budget. It is OK to do this now, because we've reserved all the + * space which is needed to commit the index, and it is save for the + * budgeting subsystem to assume the index is already committed, + * even though it is not. + */ + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + c->bi.old_idx_sz = c->calc_idx_sz; + c->bi.uncommitted_idx = 0; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + spin_unlock(&c->space_lock); + mutex_unlock(&c->tnc_mutex); + + dbg_cmt("number of index LEBs %d", c->lst.idx_lebs); + dbg_cmt("size of index %llu", c->calc_idx_sz); + return err; + +out_free: + free_idx_lebs(c); +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * write_index - write index nodes. + * @c: UBIFS file-system description object + * + * This function writes the index nodes whose positions were laid out in the + * layout_in_empty_space function. + */ +static int write_index(struct ubifs_info *c) +{ + struct ubifs_idx_node *idx; + struct ubifs_znode *znode, *cnext; + int i, lnum, offs, len, next_len, buf_len, buf_offs, used; + int avail, wlen, err, lnum_pos = 0; + + cnext = c->enext; + if (!cnext) + return 0; + + /* + * Always write index nodes to the index head so that index nodes and + * other types of nodes are never mixed in the same erase block. 
+ */ + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + /* Allocate commit buffer */ + buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0, + LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + + while (1) { + cond_resched(); + + znode = cnext; + idx = c->cbuf + used; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err("bad ref in znode"); + dbg_dump_znode(c, znode); + if (zbr->znode) + dbg_dump_znode(c, zbr->znode); + } + } + len = ubifs_idx_node_sz(c, znode->child_cnt); + ubifs_prepare_node(c, idx, len, 0); + + /* Determine the index node position */ + if (lnum == -1) { + lnum = c->ilebs[lnum_pos++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + offs = buf_offs + used; + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != znode->lnum || offs != znode->offs || + len != znode->len) { + ubifs_err("inconsistent znode posn"); + return -EINVAL; + } +#endif + + /* Grab some stuff from znode while we still can */ + cnext = znode->cnext; + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + + /* + * It is important that other threads should see %DIRTY_ZNODE + * flag cleared before %COW_ZNODE. Specifically, it matters in + * the 'dirty_cow_znode()' function. This is the reason for the + * first barrier. Also, we want the bit changes to be seen to + * other threads ASAP, to avoid unnecesarry copying, which is + * the reason for the second barrier. 
+ */ + clear_bit(DIRTY_ZNODE, &znode->flags); + smp_mb__before_clear_bit(); + clear_bit(COW_ZNODE, &znode->flags); + smp_mb__after_clear_bit(); + + /* Do not access znode from this point on */ + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + if (c->min_io_size == 1) { + /* + * Write the prepared index node immediately if there is + * no minimum IO size + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + wlen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += ALIGN(wlen, 8); + if (next_len) { + used = 0; + avail = buf_len; + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + continue; + } + } else { + int blen, nxt_offs = buf_offs + used + next_len; + + if (next_len && nxt_offs <= c->leb_size) { + if (avail > 0) + continue; + else + blen = buf_len; + } else { + wlen = ALIGN(wlen, 8); + blen = ALIGN(wlen, c->min_io_size); + ubifs_pad(c, c->cbuf + wlen, blen - wlen); + } + /* + * The buffer is full or there are no more znodes + * to do + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + blen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += blen; + if (next_len) { + if (nxt_offs > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + memmove(c->cbuf, c->cbuf + blen, used); + continue; + } + } + break; + } + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != c->dbg->new_ihead_lnum || + buf_offs != c->dbg->new_ihead_offs) { + ubifs_err("inconsistent ihead"); + return -EINVAL; + } +#endif + + c->ihead_lnum = lnum; + c->ihead_offs = buf_offs; + + return 0; +} + +/** + * free_obsolete_znodes - free obsolete znodes. + * @c: UBIFS file-system description object + * + * At the end of commit end, obsolete znodes are freed. + */ +static void free_obsolete_znodes(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + + cnext = c->cnext; + do { + znode = cnext; + cnext = znode->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + else { + znode->cnext = NULL; + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } + } while (cnext != c->cnext); +} + +/** + * return_gap_lebs - return LEBs used by the in-gap commit method. + * @c: UBIFS file-system description object + * + * This function clears the "taken" flag for the LEBs which were used by the + * "commit in-the-gaps" method. + */ +static int return_gap_lebs(struct ubifs_info *c) +{ + int *p, err; + + if (!c->gap_lebs) + return 0; + + dbg_cmt(""); + for (p = c->gap_lebs; *p != -1; p++) { + err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); + if (err) + return err; + } + + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return 0; +} + +/** + * ubifs_tnc_end_commit - update the TNC for commit end. + * @c: UBIFS file-system description object + * + * Write the dirty znodes. 
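[Illustrative aside, not part of the patch] The write-buffer bookkeeping in write_index() for the min_io_size > 1 case is easier to follow with concrete numbers (all hypothetical): with a 2048-byte minimum I/O unit and three 700-byte index nodes, the nodes land at 8-byte aligned buffer offsets 0, 704 and 1408, the data length after the last one is 1408 + 700 = 2108, rounded to 2112 for 8-byte alignment, and the flush then writes ALIGN(2112, 2048) = 4096 bytes, the final 1984 of them padding. A stand-alone replay of that arithmetic:

#include <stdio.h>

#define ALIGN8(x)	(((x) + 7) & ~7)
#define ALIGN_TO(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	int min_io_size = 2048, node_len = 700;
	int used = 0, wlen = 0, blen;

	/* Pack three index nodes into the buffer at 8-byte slots */
	for (int i = 0; i < 3; i++) {
		printf("node %d at buffer offset %d\n", i, used);
		wlen = used + node_len;		/* real data written so far */
		used += ALIGN8(node_len);	/* next 8-byte slot         */
	}

	/* No more nodes: align, round up to the min I/O unit, pad the rest */
	wlen = ALIGN8(wlen);
	blen = ALIGN_TO(wlen, min_io_size);
	printf("flush %d bytes (%d data, %d padding)\n",
	       blen, wlen, blen - wlen);	/* 4096, 2112, 1984 */
	return 0;
}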
+ */ +int ubifs_tnc_end_commit(struct ubifs_info *c) +{ + int err; + + if (!c->cnext) + return 0; + + err = return_gap_lebs(c); + if (err) + return err; + + err = write_index(c); + if (err) + return err; + + mutex_lock(&c->tnc_mutex); + + dbg_cmt("TNC height is %d", c->zroot.znode->level + 1); + + free_obsolete_znodes(c); + + c->cnext = NULL; + kfree(c->ilebs); + c->ilebs = NULL; + + mutex_unlock(&c->tnc_mutex); + + return 0; +} diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c new file mode 100644 index 00000000..b48db999 --- /dev/null +++ b/fs/ubifs/tnc_misc.c @@ -0,0 +1,494 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains miscelanious TNC-related functions shared betweend + * different files. This file does not form any logically separate TNC + * sub-system. The file was created because there is a lot of TNC code and + * putting it all in one file would make that file too big and unreadable. + */ + +#include "ubifs.h" + +/** + * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal. + * @zr: root of the subtree to traverse + * @znode: previous znode + * + * This function implements levelorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode) +{ + int level, iip, level_search = 0; + struct ubifs_znode *zn; + + ubifs_assert(zr); + + if (unlikely(!znode)) + return zr; + + if (unlikely(znode == zr)) { + if (znode->level == 0) + return NULL; + return ubifs_tnc_find_child(zr, 0); + } + + level = znode->level; + + iip = znode->iip; + while (1) { + ubifs_assert(znode->level <= zr->level); + + /* + * First walk up until there is a znode with next branch to + * look at. + */ + while (znode->parent != zr && iip >= znode->parent->child_cnt) { + znode = znode->parent; + iip = znode->iip; + } + + if (unlikely(znode->parent == zr && + iip >= znode->parent->child_cnt)) { + /* This level is done, switch to the lower one */ + level -= 1; + if (level_search || level < 0) + /* + * We were already looking for znode at lower + * level ('level_search'). As we are here + * again, it just does not exist. Or all levels + * were finished ('level < 0'). 
+ */ + return NULL; + + level_search = 1; + iip = -1; + znode = ubifs_tnc_find_child(zr, 0); + ubifs_assert(znode); + } + + /* Switch to the next index */ + zn = ubifs_tnc_find_child(znode->parent, iip + 1); + if (!zn) { + /* No more children to look at, we have walk up */ + iip = znode->parent->child_cnt; + continue; + } + + /* Walk back down to the level we came from ('level') */ + while (zn->level != level) { + znode = zn; + zn = ubifs_tnc_find_child(zn, 0); + if (!zn) { + /* + * This path is not too deep so it does not + * reach 'level'. Try next path. + */ + iip = znode->iip; + break; + } + } + + if (zn) { + ubifs_assert(zn->level >= 0); + return zn; + } + } +} + +/** + * ubifs_search_zbranch - search znode branch. + * @c: UBIFS file-system description object + * @znode: znode to search in + * @key: key to search for + * @n: znode branch slot number is returned here + * + * This is a helper function which search branch with key @key in @znode using + * binary search. The result of the search may be: + * o exact match, then %1 is returned, and the slot number of the branch is + * stored in @n; + * o no exact match, then %0 is returned and the slot number of the left + * closest branch is returned in @n; the slot if all keys in this znode are + * greater than @key, then %-1 is returned in @n. + */ +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n) +{ + int beg = 0, end = znode->child_cnt, uninitialized_var(mid); + int uninitialized_var(cmp); + const struct ubifs_zbranch *zbr = &znode->zbranch[0]; + + ubifs_assert(end > beg); + + while (end > beg) { + mid = (beg + end) >> 1; + cmp = keys_cmp(c, key, &zbr[mid].key); + if (cmp > 0) + beg = mid + 1; + else if (cmp < 0) + end = mid; + else { + *n = mid; + return 1; + } + } + + *n = end - 1; + + /* The insert point is after *n */ + ubifs_assert(*n >= -1 && *n < znode->child_cnt); + if (*n == -1) + ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0); + else + ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0); + if (*n + 1 < znode->child_cnt) + ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0); + + return 0; +} + +/** + * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal. + * @znode: znode to start at (root of the sub-tree to traverse) + * + * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is + * ignored. + */ +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode) +{ + if (unlikely(!znode)) + return NULL; + + while (znode->level > 0) { + struct ubifs_znode *child; + + child = ubifs_tnc_find_child(znode, 0); + if (!child) + return znode; + znode = child; + } + + return znode; +} + +/** + * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal. + * @znode: previous znode + * + * This function implements postorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + ubifs_assert(znode); + if (unlikely(!znode->parent)) + return NULL; + + /* Switch to the next index in the parent */ + zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1); + if (!zn) + /* This is in fact the last child, return parent */ + return znode->parent; + + /* Go to the first znode in this new subtree */ + return ubifs_tnc_postorder_first(zn); +} + +/** + * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree. 
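[Illustrative aside, not part of the patch] The return convention of ubifs_search_zbranch(), an exact hit returning 1 with the matching slot in *n, a miss returning 0 with *n set to the closest slot on the left, or to -1 when every key in the znode is greater, can be modelled with an ordinary binary search over integers. The sketch below is only a user-space model of that contract, not the kernel routine itself.

#include <stdio.h>

/* Model of the zbranch search contract over a sorted array of int keys */
static int search(const int *keys, int child_cnt, int key, int *n)
{
	int beg = 0, end = child_cnt, mid;

	while (end > beg) {
		mid = (beg + end) / 2;
		if (key > keys[mid])
			beg = mid + 1;
		else if (key < keys[mid])
			end = mid;
		else {
			*n = mid;
			return 1;	/* exact match */
		}
	}
	*n = end - 1;			/* left-closest slot, or -1 */
	return 0;
}

int main(void)
{
	int keys[] = { 10, 20, 30, 40 }, n;

	printf("%d %d\n", search(keys, 4, 30, &n), n);	/* 1 2  */
	printf("%d %d\n", search(keys, 4, 25, &n), n);	/* 0 1  */
	printf("%d %d\n", search(keys, 4,  5, &n), n);	/* 0 -1 */
	return 0;
}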
+ * @znode: znode defining subtree to destroy + * + * This function destroys subtree of the TNC tree. Returns number of clean + * znodes in the subtree. + */ +long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode); + long clean_freed = 0; + int n; + + ubifs_assert(zn); + while (1) { + for (n = 0; n < zn->child_cnt; n++) { + if (!zn->zbranch[n].znode) + continue; + + if (zn->level > 0 && + !ubifs_zn_dirty(zn->zbranch[n].znode)) + clean_freed += 1; + + cond_resched(); + kfree(zn->zbranch[n].znode); + } + + if (zn == znode) { + if (!ubifs_zn_dirty(zn)) + clean_freed += 1; + kfree(zn); + return clean_freed; + } + + zn = ubifs_tnc_postorder_next(zn); + } +} + +/** + * read_znode - read an indexing node from flash and fill znode. + * @c: UBIFS file-system description object + * @lnum: LEB of the indexing node to read + * @offs: node offset + * @len: node length + * @znode: znode to read to + * + * This function reads an indexing node from the flash media and fills znode + * with the read data. Returns zero in case of success and a negative error + * code in case of failure. The read indexing node is validated and if anything + * is wrong with it, this function prints complaint messages and returns + * %-EINVAL. + */ +static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, + struct ubifs_znode *znode) +{ + int i, err, type, cmp; + struct ubifs_idx_node *idx; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err < 0) { + kfree(idx); + return err; + } + + znode->child_cnt = le16_to_cpu(idx->child_cnt); + znode->level = le16_to_cpu(idx->level); + + dbg_tnc("LEB %d:%d, level %d, %d branch", + lnum, offs, znode->level, znode->child_cnt); + + if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { + dbg_err("current fanout %d, branch count %d", + c->fanout, znode->child_cnt); + dbg_err("max levels %d, znode level %d", + UBIFS_MAX_LEVELS, znode->level); + err = 1; + goto out_dump; + } + + for (i = 0; i < znode->child_cnt; i++) { + const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_read(c, &br->key, &zbr->key); + zbr->lnum = le32_to_cpu(br->lnum); + zbr->offs = le32_to_cpu(br->offs); + zbr->len = le32_to_cpu(br->len); + zbr->znode = NULL; + + /* Validate branch */ + + if (zbr->lnum < c->main_first || + zbr->lnum >= c->leb_cnt || zbr->offs < 0 || + zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { + dbg_err("bad branch %d", i); + err = 2; + goto out_dump; + } + + switch (key_type(c, &zbr->key)) { + case UBIFS_INO_KEY: + case UBIFS_DATA_KEY: + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + break; + default: + dbg_msg("bad key type at slot %d: %s", i, + DBGKEY(&zbr->key)); + err = 3; + goto out_dump; + } + + if (znode->level) + continue; + + type = key_type(c, &zbr->key); + if (c->ranges[type].max_len == 0) { + if (zbr->len != c->ranges[type].len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be %d", c->ranges[type].len); + err = 4; + goto out_dump; + } + } else if (zbr->len < c->ranges[type].min_len || + zbr->len > c->ranges[type].max_len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be in range of %d-%d", + c->ranges[type].min_len, + c->ranges[type].max_len); + err = 5; + goto out_dump; + } + } + + /* + * Ensure that the next key is greater or 
equivalent to the + * previous one. + */ + for (i = 0; i < znode->child_cnt - 1; i++) { + const union ubifs_key *key1, *key2; + + key1 = &znode->zbranch[i].key; + key2 = &znode->zbranch[i + 1].key; + + cmp = keys_cmp(c, key1, key2); + if (cmp > 0) { + dbg_err("bad key order (keys %d and %d)", i, i + 1); + err = 6; + goto out_dump; + } else if (cmp == 0 && !is_hash_key(c, key1)) { + /* These can only be keys with colliding hash */ + dbg_err("keys %d and %d are not hashed but equivalent", + i, i + 1); + err = 7; + goto out_dump; + } + } + + kfree(idx); + return 0; + +out_dump: + ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); + dbg_dump_node(c, idx); + kfree(idx); + return -EINVAL; +} + +/** + * ubifs_load_znode - load znode to TNC cache. + * @c: UBIFS file-system description object + * @zbr: znode branch + * @parent: znode's parent + * @iip: index in parent + * + * This function loads znode pointed to by @zbr into the TNC cache and + * returns pointer to it in case of success and a negative error code in case + * of failure. + */ +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip) +{ + int err; + struct ubifs_znode *znode; + + ubifs_assert(!zbr->znode); + /* + * A slab cache is not presently used for znodes because the znode size + * depends on the fanout which is stored in the superblock. + */ + znode = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!znode) + return ERR_PTR(-ENOMEM); + + err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode); + if (err) + goto out; + + atomic_long_inc(&c->clean_zn_cnt); + + /* + * Increment the global clean znode counter as well. It is OK that + * global and per-FS clean znode counters may be inconsistent for some + * short time (because we might be preempted at this point), the global + * one is only used in shrinker. + */ + atomic_long_inc(&ubifs_clean_zn_cnt); + + zbr->znode = znode; + znode->parent = parent; + znode->time = get_seconds(); + znode->iip = iip; + + return znode; + +out: + kfree(znode); + return ERR_PTR(err); +} + +/** + * ubifs_tnc_read_node - read a leaf node from the flash media. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a node defined by @zbr from the flash media. Returns + * zero in case of success or a negative negative error code in case of + * failure. + */ +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + union ubifs_key key1, *key = &zbr->key; + int err, type = key_type(c, key); + struct ubifs_wbuf *wbuf; + + /* + * 'zbr' has to point to on-flash node. The node may sit in a bud and + * may even be in a write buffer, so we have to take care about this. 
+ */ + wbuf = ubifs_get_wbuf(c, zbr->lnum); + if (wbuf) + err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len, + zbr->lnum, zbr->offs); + else + err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum, + zbr->offs); + + if (err) { + dbg_tnc("key %s", DBGKEY(key)); + return err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, node + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(key), DBGKEY1(&key1)); + dbg_dump_node(c, node); + return -EINVAL; + } + + return 0; +} diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h new file mode 100644 index 00000000..e24380cf --- /dev/null +++ b/fs/ubifs/ubifs-media.h @@ -0,0 +1,784 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file describes UBIFS on-flash format and contains definitions of all the + * relevant data structures and constants. + * + * All UBIFS on-flash objects are stored in the form of nodes. All nodes start + * with the UBIFS node magic number and have the same common header. Nodes + * always sit at 8-byte aligned positions on the media and node header sizes are + * also 8-byte aligned (except for the indexing node and the padding node). + */ + +#ifndef __UBIFS_MEDIA_H__ +#define __UBIFS_MEDIA_H__ + +/* UBIFS node magic number (must not have the padding byte first or last) */ +#define UBIFS_NODE_MAGIC 0x06101831 + +/* + * UBIFS on-flash format version. This version is increased when the on-flash + * format is changing. If this happens, UBIFS is will support older versions as + * well. But older UBIFS code will not support newer formats. Format changes + * will be rare and only when absolutely necessary, e.g. to fix a bug or to add + * a new feature. + * + * UBIFS went into mainline kernel with format version 4. The older formats + * were development formats. + */ +#define UBIFS_FORMAT_VERSION 4 + +/* + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS + * implementations will not be able to mount newer formats in read-write mode. + * However, depending on the change, it may be possible to mount newer formats + * in R/O mode. This is indicated by the R/O compatibility version which is + * stored in the super-block. + * + * This is needed to support boot-loaders which only need R/O mounting. With + * this flag it is possible to do UBIFS format changes without a need to update + * boot-loaders. 
+ */ +#define UBIFS_RO_COMPAT_VERSION 0 + +/* Minimum logical eraseblock size in bytes */ +#define UBIFS_MIN_LEB_SZ (15*1024) + +/* Initial CRC32 value used when calculating CRC checksums */ +#define UBIFS_CRC32_INIT 0xFFFFFFFFU + +/* + * UBIFS does not try to compress data if its length is less than the below + * constant. + */ +#define UBIFS_MIN_COMPR_LEN 128 + +/* + * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes + * shorter than uncompressed data length, UBIFS prefers to leave this data + * node uncompress, because it'll be read faster. + */ +#define UBIFS_MIN_COMPRESS_DIFF 64 + +/* Root inode number */ +#define UBIFS_ROOT_INO 1 + +/* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */ +#define UBIFS_FIRST_INO 64 + +/* + * Maximum file name and extended attribute length (must be a multiple of 8, + * minus 1). + */ +#define UBIFS_MAX_NLEN 255 + +/* Maximum number of data journal heads */ +#define UBIFS_MAX_JHEADS 1 + +/* + * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system, + * which means that it does not treat the underlying media as consisting of + * blocks like in case of hard drives. Do not be confused. UBIFS block is just + * the maximum amount of data which one data node can have or which can be + * attached to an inode node. + */ +#define UBIFS_BLOCK_SIZE 4096 +#define UBIFS_BLOCK_SHIFT 12 + +/* UBIFS padding byte pattern (must not be first or last byte of node magic) */ +#define UBIFS_PADDING_BYTE 0xCE + +/* Maximum possible key length */ +#define UBIFS_MAX_KEY_LEN 16 + +/* Key length ("simple" format) */ +#define UBIFS_SK_LEN 8 + +/* Minimum index tree fanout */ +#define UBIFS_MIN_FANOUT 3 + +/* Maximum number of levels in UBIFS indexing B-tree */ +#define UBIFS_MAX_LEVELS 512 + +/* Maximum amount of data attached to an inode in bytes */ +#define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE + +/* LEB Properties Tree fanout (must be power of 2) and fanout shift */ +#define UBIFS_LPT_FANOUT 4 +#define UBIFS_LPT_FANOUT_SHIFT 2 + +/* LEB Properties Tree bit field sizes */ +#define UBIFS_LPT_CRC_BITS 16 +#define UBIFS_LPT_CRC_BYTES 2 +#define UBIFS_LPT_TYPE_BITS 4 + +/* The key is always at the same position in all keyed nodes */ +#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) + +/* Garbage collector journal head number */ +#define UBIFS_GC_HEAD 0 +/* Base journal head number */ +#define UBIFS_BASE_HEAD 1 +/* Data journal head number */ +#define UBIFS_DATA_HEAD 2 + +/* + * LEB Properties Tree node types. + * + * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties) + * UBIFS_LPT_NNODE: LPT internal node + * UBIFS_LPT_LTAB: LPT's own lprops table + * UBIFS_LPT_LSAVE: LPT's save table (big model only) + * UBIFS_LPT_NODE_CNT: count of LPT node types + * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type + */ +enum { + UBIFS_LPT_PNODE, + UBIFS_LPT_NNODE, + UBIFS_LPT_LTAB, + UBIFS_LPT_LSAVE, + UBIFS_LPT_NODE_CNT, + UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1, +}; + +/* + * UBIFS inode types. 
+ * + * UBIFS_ITYPE_REG: regular file + * UBIFS_ITYPE_DIR: directory + * UBIFS_ITYPE_LNK: soft link + * UBIFS_ITYPE_BLK: block device node + * UBIFS_ITYPE_CHR: character device node + * UBIFS_ITYPE_FIFO: fifo + * UBIFS_ITYPE_SOCK: socket + * UBIFS_ITYPES_CNT: count of supported file types + */ +enum { + UBIFS_ITYPE_REG, + UBIFS_ITYPE_DIR, + UBIFS_ITYPE_LNK, + UBIFS_ITYPE_BLK, + UBIFS_ITYPE_CHR, + UBIFS_ITYPE_FIFO, + UBIFS_ITYPE_SOCK, + UBIFS_ITYPES_CNT, +}; + +/* + * Supported key hash functions. + * + * UBIFS_KEY_HASH_R5: R5 hash + * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name + */ +enum { + UBIFS_KEY_HASH_R5, + UBIFS_KEY_HASH_TEST, +}; + +/* + * Supported key formats. + * + * UBIFS_SIMPLE_KEY_FMT: simple key format + */ +enum { + UBIFS_SIMPLE_KEY_FMT, +}; + +/* + * The simple key format uses 29 bits for storing UBIFS block number and hash + * value. + */ +#define UBIFS_S_KEY_BLOCK_BITS 29 +#define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF +#define UBIFS_S_KEY_HASH_BITS UBIFS_S_KEY_BLOCK_BITS +#define UBIFS_S_KEY_HASH_MASK UBIFS_S_KEY_BLOCK_MASK + +/* + * Key types. + * + * UBIFS_INO_KEY: inode node key + * UBIFS_DATA_KEY: data node key + * UBIFS_DENT_KEY: directory entry node key + * UBIFS_XENT_KEY: extended attribute entry key + * UBIFS_KEY_TYPES_CNT: number of supported key types + */ +enum { + UBIFS_INO_KEY, + UBIFS_DATA_KEY, + UBIFS_DENT_KEY, + UBIFS_XENT_KEY, + UBIFS_KEY_TYPES_CNT, +}; + +/* Count of LEBs reserved for the superblock area */ +#define UBIFS_SB_LEBS 1 +/* Count of LEBs reserved for the master area */ +#define UBIFS_MST_LEBS 2 + +/* First LEB of the superblock area */ +#define UBIFS_SB_LNUM 0 +/* First LEB of the master area */ +#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS) +/* First LEB of the log area */ +#define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS) + +/* + * The below constants define the absolute minimum values for various UBIFS + * media areas. Many of them actually depend of flash geometry and the FS + * configuration (number of journal heads, orphan LEBs, etc). This means that + * the smallest volume size which can be used for UBIFS cannot be pre-defined + * by these constants. The file-system that meets the below limitation will not + * necessarily mount. UBIFS does run-time calculations and validates the FS + * size. + */ + +/* Minimum number of logical eraseblocks in the log */ +#define UBIFS_MIN_LOG_LEBS 2 +/* Minimum number of bud logical eraseblocks (one for each head) */ +#define UBIFS_MIN_BUD_LEBS 3 +/* Minimum number of journal logical eraseblocks */ +#define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS) +/* Minimum number of LPT area logical eraseblocks */ +#define UBIFS_MIN_LPT_LEBS 2 +/* Minimum number of orphan area logical eraseblocks */ +#define UBIFS_MIN_ORPH_LEBS 1 +/* + * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1 + * for GC, 1 for deletions, and at least 1 for committed data). + */ +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6) + +/* Minimum number of logical eraseblocks */ +#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ + UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \ + UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS) + +/* Node sizes (N.B. 
these are guaranteed to be multiples of 8) */ +#define UBIFS_CH_SZ sizeof(struct ubifs_ch) +#define UBIFS_INO_NODE_SZ sizeof(struct ubifs_ino_node) +#define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node) +#define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node) +#define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node) +#define UBIFS_PAD_NODE_SZ sizeof(struct ubifs_pad_node) +#define UBIFS_SB_NODE_SZ sizeof(struct ubifs_sb_node) +#define UBIFS_MST_NODE_SZ sizeof(struct ubifs_mst_node) +#define UBIFS_REF_NODE_SZ sizeof(struct ubifs_ref_node) +#define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node) +#define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node) +#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node) +/* Extended attribute entry nodes are identical to directory entry nodes */ +#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ +/* Only this does not have to be multiple of 8 bytes */ +#define UBIFS_BRANCH_SZ sizeof(struct ubifs_branch) + +/* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */ +#define UBIFS_MAX_DATA_NODE_SZ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE) +#define UBIFS_MAX_INO_NODE_SZ (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA) +#define UBIFS_MAX_DENT_NODE_SZ (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1) +#define UBIFS_MAX_XENT_NODE_SZ UBIFS_MAX_DENT_NODE_SZ + +/* The largest UBIFS node */ +#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ + +/* + * On-flash inode flags. + * + * UBIFS_COMPR_FL: use compression for this inode + * UBIFS_SYNC_FL: I/O on this inode has to be synchronous + * UBIFS_IMMUTABLE_FL: inode is immutable + * UBIFS_APPEND_FL: writes to the inode may only append data + * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous + * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value + * + * Note, these are on-flash flags which correspond to ioctl flags + * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not + * have to be the same. + */ +enum { + UBIFS_COMPR_FL = 0x01, + UBIFS_SYNC_FL = 0x02, + UBIFS_IMMUTABLE_FL = 0x04, + UBIFS_APPEND_FL = 0x08, + UBIFS_DIRSYNC_FL = 0x10, + UBIFS_XATTR_FL = 0x20, +}; + +/* Inode flag bits used by UBIFS */ +#define UBIFS_FL_MASK 0x0000001F + +/* + * UBIFS compression algorithms. + * + * UBIFS_COMPR_NONE: no compression + * UBIFS_COMPR_LZO: LZO compression + * UBIFS_COMPR_ZLIB: ZLIB compression + * UBIFS_COMPR_TYPES_CNT: count of supported compression types + */ +enum { + UBIFS_COMPR_NONE, + UBIFS_COMPR_LZO, + UBIFS_COMPR_ZLIB, + UBIFS_COMPR_TYPES_CNT, +}; + +/* + * UBIFS node types. + * + * UBIFS_INO_NODE: inode node + * UBIFS_DATA_NODE: data node + * UBIFS_DENT_NODE: directory entry node + * UBIFS_XENT_NODE: extended attribute node + * UBIFS_TRUN_NODE: truncation node + * UBIFS_PAD_NODE: padding node + * UBIFS_SB_NODE: superblock node + * UBIFS_MST_NODE: master node + * UBIFS_REF_NODE: LEB reference node + * UBIFS_IDX_NODE: index node + * UBIFS_CS_NODE: commit start node + * UBIFS_ORPH_NODE: orphan node + * UBIFS_NODE_TYPES_CNT: count of supported node types + * + * Note, we index arrays by these numbers, so keep them low and contiguous. + * Node type constants for inodes, direntries and so on have to be the same as + * corresponding key type constants. + */ +enum { + UBIFS_INO_NODE, + UBIFS_DATA_NODE, + UBIFS_DENT_NODE, + UBIFS_XENT_NODE, + UBIFS_TRUN_NODE, + UBIFS_PAD_NODE, + UBIFS_SB_NODE, + UBIFS_MST_NODE, + UBIFS_REF_NODE, + UBIFS_IDX_NODE, + UBIFS_CS_NODE, + UBIFS_ORPH_NODE, + UBIFS_NODE_TYPES_CNT, +}; + +/* + * Master node flags. 
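[Illustrative aside, not part of the patch] To make the UBIFS_S_KEY_* constants a little above concrete: in the simple key format a key is 64 bits, the low word holding the inode number and the high word holding the key type in its top bits with the 29-bit block number (or name hash) below it. The exact packing lives in UBIFS's key helpers, which are outside this hunk, so treat the layout in this stand-alone sketch as an assumption made for illustration.

#include <stdio.h>
#include <stdint.h>

#define S_KEY_BLOCK_BITS	29
#define S_KEY_BLOCK_MASK	0x1FFFFFFFu

enum { INO_KEY, DATA_KEY, DENT_KEY, XENT_KEY };

/* Assumed packing: low word = inode number, high word = type | block */
static uint64_t make_data_key(uint32_t inum, uint32_t block)
{
	uint32_t hi = ((uint32_t)DATA_KEY << S_KEY_BLOCK_BITS) |
		      (block & S_KEY_BLOCK_MASK);

	return ((uint64_t)hi << 32) | inum;
}

int main(void)
{
	uint64_t key = make_data_key(65, 3);	/* inode 65, block 3 */

	printf("key   = 0x%016llx\n", (unsigned long long)key);
	printf("inum  = %u\n", (uint32_t)key);
	printf("type  = %u\n", (uint32_t)(key >> 32) >> S_KEY_BLOCK_BITS);
	printf("block = %u\n", (uint32_t)(key >> 32) & S_KEY_BLOCK_MASK);
	return 0;
}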
+ * + * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty + * UBIFS_MST_NO_ORPHS: no orphan inodes present + * UBIFS_MST_RCVRY: written by recovery + */ +enum { + UBIFS_MST_DIRTY = 1, + UBIFS_MST_NO_ORPHS = 2, + UBIFS_MST_RCVRY = 4, +}; + +/* + * Node group type (used by recovery to recover whole group or none). + * + * UBIFS_NO_NODE_GROUP: this node is not part of a group + * UBIFS_IN_NODE_GROUP: this node is a part of a group + * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group + */ +enum { + UBIFS_NO_NODE_GROUP = 0, + UBIFS_IN_NODE_GROUP, + UBIFS_LAST_OF_NODE_GROUP, +}; + +/* + * Superblock flags. + * + * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set + * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed + */ +enum { + UBIFS_FLG_BIGLPT = 0x02, + UBIFS_FLG_SPACE_FIXUP = 0x04, +}; + +/** + * struct ubifs_ch - common header node. + * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) + * @crc: CRC-32 checksum of the node header + * @sqnum: sequence number + * @len: full node length + * @node_type: node type + * @group_type: node group type + * @padding: reserved for future, zeroes + * + * Every UBIFS node starts with this common part. If the node has a key, the + * key always goes next. + */ +struct ubifs_ch { + __le32 magic; + __le32 crc; + __le64 sqnum; + __le32 len; + __u8 node_type; + __u8 group_type; + __u8 padding[2]; +} __packed; + +/** + * union ubifs_dev_desc - device node descriptor. + * @new: new type device descriptor + * @huge: huge type device descriptor + * + * This data structure describes major/minor numbers of a device node. In an + * inode is a device node then its data contains an object of this type. UBIFS + * uses standard Linux "new" and "huge" device node encodings. + */ +union ubifs_dev_desc { + __le32 new; + __le64 huge; +} __packed; + +/** + * struct ubifs_ino_node - inode node. + * @ch: common header + * @key: node key + * @creat_sqnum: sequence number at time of creation + * @size: inode size in bytes (amount of uncompressed data) + * @atime_sec: access time seconds + * @ctime_sec: creation time seconds + * @mtime_sec: modification time seconds + * @atime_nsec: access time nanoseconds + * @ctime_nsec: creation time nanoseconds + * @mtime_nsec: modification time nanoseconds + * @nlink: number of hard links + * @uid: owner ID + * @gid: group ID + * @mode: access flags + * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc) + * @data_len: inode data length + * @xattr_cnt: count of extended attributes this inode has + * @xattr_size: summarized size of all extended attributes in bytes + * @padding1: reserved for future, zeroes + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @compr_type: compression type used for this inode + * @padding2: reserved for future, zeroes + * @data: data attached to the inode + * + * Note, even though inode compression type is defined by @compr_type, some + * nodes of this inode may be compressed with different compressor - this + * happens if compression type is changed while the inode already has data + * nodes. But @compr_type will be use for further writes to the inode. + * + * Note, do not forget to amend 'zero_ino_node_unused()' function when changing + * the padding fields. 
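[Illustrative aside, not part of the patch] A quick user-space mirror of struct ubifs_ch shows the fixed 24-byte layout every node starts with, and why UBIFS_KEY_OFFSET resolves to 24 for the keyed nodes defined below. The comment that the CRC protects the node from byte 8 onward (everything after the magic and crc words) reflects how UBIFS prepares nodes elsewhere in the tree and is stated here as an assumption.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* User-space mirror of struct ubifs_ch; packed, little-endian on flash */
struct ch {
	uint32_t magic;		/* 0x06101831                            */
	uint32_t crc;		/* assumed to cover bytes [8, len)       */
	uint64_t sqnum;
	uint32_t len;
	uint8_t  node_type;
	uint8_t  group_type;
	uint8_t  padding[2];
} __attribute__((packed));

int main(void)
{
	printf("common header size = %zu\n", sizeof(struct ch));	/* 24 */
	printf("sqnum at %zu, len at %zu, node_type at %zu\n",
	       offsetof(struct ch, sqnum),	/*  8 */
	       offsetof(struct ch, len),	/* 16 */
	       offsetof(struct ch, node_type));	/* 20 */
	return 0;
}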
+ */ +struct ubifs_ino_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 creat_sqnum; + __le64 size; + __le64 atime_sec; + __le64 ctime_sec; + __le64 mtime_sec; + __le32 atime_nsec; + __le32 ctime_nsec; + __le32 mtime_nsec; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le32 flags; + __le32 data_len; + __le32 xattr_cnt; + __le32 xattr_size; + __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */ + __le32 xattr_names; + __le16 compr_type; + __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ + __u8 data[]; +} __packed; + +/** + * struct ubifs_dent_node - directory entry node. + * @ch: common header + * @key: node key + * @inum: target inode number + * @padding1: reserved for future, zeroes + * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) + * @nlen: name length + * @padding2: reserved for future, zeroes + * @name: zero-terminated name + * + * Note, do not forget to amend 'zero_dent_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_dent_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 inum; + __u8 padding1; + __u8 type; + __le16 nlen; + __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ + __u8 name[]; +} __packed; + +/** + * struct ubifs_data_node - data node. + * @ch: common header + * @key: node key + * @size: uncompressed data size in bytes + * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) + * @padding: reserved for future, zeroes + * @data: data + * + * Note, do not forget to amend 'zero_data_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_data_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le32 size; + __le16 compr_type; + __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ + __u8 data[]; +} __packed; + +/** + * struct ubifs_trun_node - truncation node. + * @ch: common header + * @inum: truncated inode number + * @padding: reserved for future, zeroes + * @old_size: size before truncation + * @new_size: size after truncation + * + * This node exists only in the journal and never goes to the main area. Note, + * do not forget to amend 'zero_trun_node_unused()' function when changing the + * padding fields. + */ +struct ubifs_trun_node { + struct ubifs_ch ch; + __le32 inum; + __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ + __le64 old_size; + __le64 new_size; +} __packed; + +/** + * struct ubifs_pad_node - padding node. + * @ch: common header + * @pad_len: how many bytes after this node are unused (because padded) + * @padding: reserved for future, zeroes + */ +struct ubifs_pad_node { + struct ubifs_ch ch; + __le32 pad_len; +} __packed; + +/** + * struct ubifs_sb_node - superblock node. + * @ch: common header + * @padding: reserved for future, zeroes + * @key_hash: type of hash function used in keys + * @key_fmt: format of the key + * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc) + * @min_io_size: minimal input/output unit size + * @leb_size: logical eraseblock size in bytes + * @leb_cnt: count of LEBs used by file-system + * @max_leb_cnt: maximum count of LEBs used by file-system + * @max_bud_bytes: maximum amount of data stored in buds + * @log_lebs: log size in logical eraseblocks + * @lpt_lebs: number of LEBs used for lprops table + * @orph_lebs: number of LEBs used for recording orphans + * @jhead_cnt: count of journal heads + * @fanout: tree fanout (max. 
number of links per indexing node) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @fmt_version: UBIFS on-flash format version + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @padding1: reserved for future, zeroes + * @rp_uid: reserve pool UID + * @rp_gid: reserve pool GID + * @rp_size: size of the reserved pool in bytes + * @padding2: reserved for future, zeroes + * @time_gran: time granularity in nanoseconds + * @uuid: UUID generated when the file system image was created + * @ro_compat_version: UBIFS R/O compatibility version + */ +struct ubifs_sb_node { + struct ubifs_ch ch; + __u8 padding[2]; + __u8 key_hash; + __u8 key_fmt; + __le32 flags; + __le32 min_io_size; + __le32 leb_size; + __le32 leb_cnt; + __le32 max_leb_cnt; + __le64 max_bud_bytes; + __le32 log_lebs; + __le32 lpt_lebs; + __le32 orph_lebs; + __le32 jhead_cnt; + __le32 fanout; + __le32 lsave_cnt; + __le32 fmt_version; + __le16 default_compr; + __u8 padding1[2]; + __le32 rp_uid; + __le32 rp_gid; + __le64 rp_size; + __le32 time_gran; + __u8 uuid[16]; + __le32 ro_compat_version; + __u8 padding2[3968]; +} __packed; + +/** + * struct ubifs_mst_node - master node. + * @ch: common header + * @highest_inum: highest inode number in the committed index + * @cmt_no: commit number + * @flags: various flags (%UBIFS_MST_DIRTY, etc) + * @log_lnum: start of the log + * @root_lnum: LEB number of the root indexing node + * @root_offs: offset within @root_lnum + * @root_len: root indexing node length + * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was + * not reserved and should be reserved on mount) + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @index_size: size of index on flash + * @total_free: total free space in bytes + * @total_dirty: total dirty space in bytes + * @total_used: total used space in bytes (includes only data LEBs) + * @total_dead: total dead space in bytes (includes only data LEBs) + * @total_dark: total dark space in bytes (includes only data LEBs) + * @lpt_lnum: LEB number of LPT root nnode + * @lpt_offs: offset of LPT root nnode + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @lsave_lnum: LEB number of LPT's save table (big model only) + * @lsave_offs: offset of LPT's save table (big model only) + * @lscan_lnum: LEB number of last LPT scan + * @empty_lebs: number of empty logical eraseblocks + * @idx_lebs: number of indexing logical eraseblocks + * @leb_cnt: count of LEBs used by file-system + * @padding: reserved for future, zeroes + */ +struct ubifs_mst_node { + struct ubifs_ch ch; + __le64 highest_inum; + __le64 cmt_no; + __le32 flags; + __le32 log_lnum; + __le32 root_lnum; + __le32 root_offs; + __le32 root_len; + __le32 gc_lnum; + __le32 ihead_lnum; + __le32 ihead_offs; + __le64 index_size; + __le64 total_free; + __le64 total_dirty; + __le64 total_used; + __le64 total_dead; + __le64 total_dark; + __le32 lpt_lnum; + __le32 lpt_offs; + __le32 nhead_lnum; + __le32 nhead_offs; + __le32 ltab_lnum; + __le32 ltab_offs; + __le32 lsave_lnum; + __le32 lsave_offs; + __le32 lscan_lnum; + __le32 empty_lebs; + __le32 idx_lebs; + __le32 leb_cnt; + __u8 padding[344]; +} __packed; + +/** + * struct ubifs_ref_node - logical eraseblock reference node. 
+ * @ch: common header + * @lnum: the referred logical eraseblock number + * @offs: start offset in the referred LEB + * @jhead: journal head number + * @padding: reserved for future, zeroes + */ +struct ubifs_ref_node { + struct ubifs_ch ch; + __le32 lnum; + __le32 offs; + __le32 jhead; + __u8 padding[28]; +} __packed; + +/** + * struct ubifs_branch - key/reference/length branch + * @lnum: LEB number of the target node + * @offs: offset within @lnum + * @len: target node length + * @key: key + */ +struct ubifs_branch { + __le32 lnum; + __le32 offs; + __le32 len; + __u8 key[]; +} __packed; + +/** + * struct ubifs_idx_node - indexing node. + * @ch: common header + * @child_cnt: number of child index nodes + * @level: tree level + * @branches: LEB number / offset / length / key branches + */ +struct ubifs_idx_node { + struct ubifs_ch ch; + __le16 child_cnt; + __le16 level; + __u8 branches[]; +} __packed; + +/** + * struct ubifs_cs_node - commit start node. + * @ch: common header + * @cmt_no: commit number + */ +struct ubifs_cs_node { + struct ubifs_ch ch; + __le64 cmt_no; +} __packed; + +/** + * struct ubifs_orph_node - orphan node. + * @ch: common header + * @cmt_no: commit number (also top bit is set on the last node of the commit) + * @inos: inode numbers of orphans + */ +struct ubifs_orph_node { + struct ubifs_ch ch; + __le64 cmt_no; + __le64 inos[]; +} __packed; + +#endif /* __UBIFS_MEDIA_H__ */ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h new file mode 100644 index 00000000..f79983d6 --- /dev/null +++ b/fs/ubifs/ubifs.h @@ -0,0 +1,1777 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_H__ +#define __UBIFS_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs-media.h" + +/* Version of this UBIFS implementation */ +#define UBIFS_VERSION 1 + +/* Normal UBIFS messages */ +#define ubifs_msg(fmt, ...) \ + printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__) +/* UBIFS error messages */ +#define ubifs_err(fmt, ...) \ + printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) +/* UBIFS warning messages */ +#define ubifs_warn(fmt, ...) 
\ + printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +/* UBIFS file system VFS magic number */ +#define UBIFS_SUPER_MAGIC 0x24051905 + +/* Number of UBIFS blocks per VFS page */ +#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE) +#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT) + +/* "File system end of life" sequence number watermark */ +#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL +#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL + +/* + * Minimum amount of LEBs reserved for the index. At present the index needs at + * least 2 LEBs: one for the index head and one for in-the-gaps method (which + * currently does not cater for the index head and so excludes it from + * consideration). + */ +#define MIN_INDEX_LEBS 2 + +/* Minimum amount of data UBIFS writes to the flash */ +#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) + +/* + * Currently we do not support inode number overlapping and re-using, so this + * watermark defines dangerous inode number level. This should be fixed later, + * although it is difficult to exceed current limit. Another option is to use + * 64-bit inode numbers, but this means more overhead. + */ +#define INUM_WARN_WATERMARK 0xFFF00000 +#define INUM_WATERMARK 0xFFFFFF00 + +/* Largest key size supported in this implementation */ +#define CUR_MAX_KEY_LEN UBIFS_SK_LEN + +/* Maximum number of entries in each LPT (LEB category) heap */ +#define LPT_HEAP_SZ 256 + +/* + * Background thread name pattern. The numbers are UBI device and volume + * numbers. + */ +#define BGT_NAME_PATTERN "ubifs_bgt%d_%d" + +/* Write-buffer synchronization timeout interval in seconds */ +#define WBUF_TIMEOUT_SOFTLIMIT 3 +#define WBUF_TIMEOUT_HARDLIMIT 5 + +/* Maximum possible inode number (only 32-bit inodes are supported now) */ +#define MAX_INUM 0xFFFFFFFF + +/* Number of non-data journal heads */ +#define NONDATA_JHEADS_CNT 2 + +/* Shorter names for journal head numbers for internal usage */ +#define GCHD UBIFS_GC_HEAD +#define BASEHD UBIFS_BASE_HEAD +#define DATAHD UBIFS_DATA_HEAD + +/* 'No change' value for 'ubifs_change_lp()' */ +#define LPROPS_NC 0x80000001 + +/* + * There is no notion of truncation key because truncation nodes do not exist + * in TNC. However, when replaying, it is handy to introduce fake "truncation" + * keys for truncation nodes because the code becomes simpler. So we define + * %UBIFS_TRUN_KEY type. + * + * But otherwise, out of the journal reply scope, the truncation keys are + * invalid. + */ +#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT +#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT + +/* + * How much a directory entry/extended attribute entry adds to the parent/host + * inode. + */ +#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8) + +/* How much an extended attribute adds to the host inode */ +#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8) + +/* + * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered + * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are + * considered "young". This is used by shrinker when selecting znode to trim + * off. + */ +#define OLD_ZNODE_AGE 20 +#define YOUNG_ZNODE_AGE 5 + +/* + * Some compressors, like LZO, may end up with more data then the input buffer. + * So UBIFS always allocates larger output buffer, to be sure the compressor + * will not corrupt memory in case of worst case compression. 
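[Illustrative aside, not part of the patch] As a concrete instance of the accounting macros above: struct ubifs_dent_node works out to 56 bytes (24-byte common header, 16-byte key, 8-byte inum, then type, nlen and padding), so a directory entry named "hello" charges CALC_DENT_SIZE(5) = ALIGN(56 + 5 + 1, 8) = 64 bytes to the parent directory. The 56-byte figure is derived from the structure layout in ubifs-media.h and restated here as an assumption; the sketch only replays the macro arithmetic.

#include <stdio.h>
#include <string.h>

#define ALIGN8(x)	(((x) + 7) & ~7)
#define DENT_NODE_SZ	56	/* assumed sizeof(struct ubifs_dent_node) */

/* Space one directory entry charges to its parent inode */
#define CALC_DENT_SIZE(name_len)  ALIGN8(DENT_NODE_SZ + (name_len) + 1)

int main(void)
{
	const char *name = "hello";

	printf("dent \"%s\" costs %d bytes\n",
	       name, CALC_DENT_SIZE((int)strlen(name)));	/* 64 */
	return 0;
}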
+ */ +#define WORST_COMPR_FACTOR 2 + +/* + * How much memory is needed for a buffer where we comress a data node. + */ +#define COMPRESSED_DATA_NODE_BUF_SZ \ + (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) + +/* Maximum expected tree height for use by bottom_up_buf */ +#define BOTTOM_UP_HEIGHT 64 + +/* Maximum number of data nodes to bulk-read */ +#define UBIFS_MAX_BULK_READ 32 + +/* + * Lockdep classes for UBIFS inode @ui_mutex. + */ +enum { + WB_MUTEX_1 = 0, + WB_MUTEX_2 = 1, + WB_MUTEX_3 = 2, +}; + +/* + * Znode flags (actually, bit numbers which store the flags). + * + * DIRTY_ZNODE: znode is dirty + * COW_ZNODE: znode is being committed and a new instance of this znode has to + * be created before changing this znode + * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is + * still in the commit list and the ongoing commit operation + * will commit it, and delete this znode after it is done + */ +enum { + DIRTY_ZNODE = 0, + COW_ZNODE = 1, + OBSOLETE_ZNODE = 2, +}; + +/* + * Commit states. + * + * COMMIT_RESTING: commit is not wanted + * COMMIT_BACKGROUND: background commit has been requested + * COMMIT_REQUIRED: commit is required + * COMMIT_RUNNING_BACKGROUND: background commit is running + * COMMIT_RUNNING_REQUIRED: commit is running and it is required + * COMMIT_BROKEN: commit failed + */ +enum { + COMMIT_RESTING = 0, + COMMIT_BACKGROUND, + COMMIT_REQUIRED, + COMMIT_RUNNING_BACKGROUND, + COMMIT_RUNNING_REQUIRED, + COMMIT_BROKEN, +}; + +/* + * 'ubifs_scan_a_node()' return values. + * + * SCANNED_GARBAGE: scanned garbage + * SCANNED_EMPTY_SPACE: scanned empty space + * SCANNED_A_NODE: scanned a valid node + * SCANNED_A_CORRUPT_NODE: scanned a corrupted node + * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length + * + * Greater than zero means: 'scanned that number of padding bytes' + */ +enum { + SCANNED_GARBAGE = 0, + SCANNED_EMPTY_SPACE = -1, + SCANNED_A_NODE = -2, + SCANNED_A_CORRUPT_NODE = -3, + SCANNED_A_BAD_PAD_NODE = -4, +}; + +/* + * LPT cnode flag bits. + * + * DIRTY_CNODE: cnode is dirty + * COW_CNODE: cnode is being committed and must be copied before writing + * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), + * so it can (and must) be freed when the commit is finished + */ +enum { + DIRTY_CNODE = 0, + COW_CNODE = 1, + OBSOLETE_CNODE = 2, +}; + +/* + * Dirty flag bits (lpt_drty_flgs) for LPT special nodes. + * + * LTAB_DIRTY: ltab node is dirty + * LSAVE_DIRTY: lsave node is dirty + */ +enum { + LTAB_DIRTY = 1, + LSAVE_DIRTY = 2, +}; + +/* + * Return codes used by the garbage collector. + * @LEB_FREED: the logical eraseblock was freed and is ready to use + * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit + * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes + */ +enum { + LEB_FREED, + LEB_FREED_IDX, + LEB_RETAINED, +}; + +/** + * struct ubifs_old_idx - index node obsoleted since last commit start. + * @rb: rb-tree node + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + */ +struct ubifs_old_idx { + struct rb_node rb; + int lnum; + int offs; +}; + +/* The below union makes it easier to deal with keys */ +union ubifs_key { + uint8_t u8[CUR_MAX_KEY_LEN]; + uint32_t u32[CUR_MAX_KEY_LEN/4]; + uint64_t u64[CUR_MAX_KEY_LEN/8]; + __le32 j32[CUR_MAX_KEY_LEN/4]; +}; + +/** + * struct ubifs_scan_node - UBIFS scanned node information. 
+ * @list: list of scanned nodes + * @key: key of node scanned (if it has one) + * @sqnum: sequence number + * @type: type of node scanned + * @offs: offset with LEB of node scanned + * @len: length of node scanned + * @node: raw node + */ +struct ubifs_scan_node { + struct list_head list; + union ubifs_key key; + unsigned long long sqnum; + int type; + int offs; + int len; + void *node; +}; + +/** + * struct ubifs_scan_leb - UBIFS scanned LEB information. + * @lnum: logical eraseblock number + * @nodes_cnt: number of nodes scanned + * @nodes: list of struct ubifs_scan_node + * @endpt: end point (and therefore the start of empty space) + * @ecc: read returned -EBADMSG + * @buf: buffer containing entire LEB scanned + */ +struct ubifs_scan_leb { + int lnum; + int nodes_cnt; + struct list_head nodes; + int endpt; + int ecc; + void *buf; +}; + +/** + * struct ubifs_gced_idx_leb - garbage-collected indexing LEB. + * @list: list + * @lnum: LEB number + * @unmap: OK to unmap this LEB + * + * This data structure is used to temporary store garbage-collected indexing + * LEBs - they are not released immediately, but only after the next commit. + * This is needed to guarantee recoverability. + */ +struct ubifs_gced_idx_leb { + struct list_head list; + int lnum; + int unmap; +}; + +/** + * struct ubifs_inode - UBIFS in-memory inode description. + * @vfs_inode: VFS inode description object + * @creat_sqnum: sequence number at time of creation + * @del_cmtno: commit number corresponding to the time the inode was deleted, + * protected by @c->commit_sem; + * @xattr_size: summarized size of all extended attributes in bytes + * @xattr_cnt: count of extended attributes this inode has + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @dirty: non-zero if the inode is dirty + * @xattr: non-zero if this is an extended attribute inode + * @bulk_read: non-zero if bulk-read should be used + * @ui_mutex: serializes inode write-back with the rest of VFS operations, + * serializes "clean <-> dirty" state changes, serializes bulk-read, + * protects @dirty, @bulk_read, @ui_size, and @xattr_size + * @ui_lock: protects @synced_i_size + * @synced_i_size: synchronized size of inode, i.e. the value of inode size + * currently stored on the flash; used only for regular file + * inodes + * @ui_size: inode size used by UBIFS when writing to flash + * @flags: inode flags (@UBIFS_COMPR_FL, etc) + * @compr_type: default compression type used for this inode + * @last_page_read: page number of last page read (for bulk read) + * @read_in_a_row: number of consecutive pages read in a row (for bulk read) + * @data_len: length of the data attached to the inode + * @data: inode's data + * + * @ui_mutex exists for two main reasons. At first it prevents inodes from + * being written back while UBIFS changing them, being in the middle of an VFS + * operation. This way UBIFS makes sure the inode fields are consistent. For + * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * write-back must not write any of them before we have finished. + * + * The second reason is budgeting - UBIFS has to budget all operations. If an + * operation is going to mark an inode dirty, it has to allocate budget for + * this. It cannot just mark it dirty because there is no guarantee there will + * be enough flash space to write the inode back later. This means UBIFS has + * to have full control over inode "clean <-> dirty" transitions (and pages + * actually). 
But unfortunately, VFS marks inodes dirty in many places, and it + * does not ask the file-system if it is allowed to do so (there is a notifier, + * but it is not enough), i.e., there is no mechanism to synchronize with this. + * So UBIFS has its own inode dirty flag and its own mutex to serialize + * "clean <-> dirty" transitions. + * + * The @synced_i_size field is used to make sure we never write pages which are + * beyond last synchronized inode size. See 'ubifs_writepage()' for more + * information. + * + * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses + * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot + * make sure @inode->i_size is always changed under @ui_mutex, because it + * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would + * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields + * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one + * could consider to rework locking and base it on "shadow" fields. + */ +struct ubifs_inode { + struct inode vfs_inode; + unsigned long long creat_sqnum; + unsigned long long del_cmtno; + unsigned int xattr_size; + unsigned int xattr_cnt; + unsigned int xattr_names; + unsigned int dirty:1; + unsigned int xattr:1; + unsigned int bulk_read:1; + unsigned int compr_type:2; + struct mutex ui_mutex; + spinlock_t ui_lock; + loff_t synced_i_size; + loff_t ui_size; + int flags; + pgoff_t last_page_read; + pgoff_t read_in_a_row; + int data_len; + void *data; +}; + +/** + * struct ubifs_unclean_leb - records a LEB recovered under read-only mode. + * @list: list + * @lnum: LEB number of recovered LEB + * @endpt: offset where recovery ended + * + * This structure records a LEB identified during recovery that needs to be + * cleaned but was not because UBIFS was mounted read-only. The information + * is used to clean the LEB when remounting to read-write mode. + */ +struct ubifs_unclean_leb { + struct list_head list; + int lnum; + int endpt; +}; + +/* + * LEB properties flags. + * + * LPROPS_UNCAT: not categorized + * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index + * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index + * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index + * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs + * LPROPS_EMPTY: LEB is empty, not taken + * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken + * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken + * LPROPS_CAT_MASK: mask for the LEB categories above + * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media) + * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash) + */ +enum { + LPROPS_UNCAT = 0, + LPROPS_DIRTY = 1, + LPROPS_DIRTY_IDX = 2, + LPROPS_FREE = 3, + LPROPS_HEAP_CNT = 3, + LPROPS_EMPTY = 4, + LPROPS_FREEABLE = 5, + LPROPS_FRDI_IDX = 6, + LPROPS_CAT_MASK = 15, + LPROPS_TAKEN = 16, + LPROPS_INDEX = 32, +}; + +/** + * struct ubifs_lprops - logical eraseblock properties. 
+ * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @flags: LEB properties flags (see above) + * @lnum: LEB number + * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE) + * @hpos: heap position in heap of same-category lprops (other categories) + */ +struct ubifs_lprops { + int free; + int dirty; + int flags; + int lnum; + union { + struct list_head list; + int hpos; + }; +}; + +/** + * struct ubifs_lpt_lprops - LPT logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @tgc: trivial GC flag (1 => unmap after commit end) + * @cmt: commit flag (1 => reserved for commit) + */ +struct ubifs_lpt_lprops { + int free; + int dirty; + unsigned tgc:1; + unsigned cmt:1; +}; + +/** + * struct ubifs_lp_stats - statistics of eraseblocks in the main area. + * @empty_lebs: number of empty LEBs + * @taken_empty_lebs: number of taken LEBs + * @idx_lebs: number of indexing LEBs + * @total_free: total free space in bytes (includes all LEBs) + * @total_dirty: total dirty space in bytes (includes all LEBs) + * @total_used: total used space in bytes (does not include index LEBs) + * @total_dead: total dead space in bytes (does not include index LEBs) + * @total_dark: total dark space in bytes (does not include index LEBs) + * + * The @taken_empty_lebs field counts the LEBs that are in the transient state + * of having been "taken" for use but not yet written to. @taken_empty_lebs is + * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be + * used by itself (in which case 'unused_lebs' would be a better name). In the + * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained + * by GC, but unlike other empty LEBs that are "taken", it may not be written + * straight away (i.e. before the next commit start or unmount), so either + * @gc_lnum must be specially accounted for, or the current approach followed + * i.e. count it under @taken_empty_lebs. + * + * @empty_lebs includes @taken_empty_lebs. + * + * @total_used, @total_dead and @total_dark fields do not account indexing + * LEBs. + */ +struct ubifs_lp_stats { + int empty_lebs; + int taken_empty_lebs; + int idx_lebs; + long long total_free; + long long total_dirty; + long long total_used; + long long total_dead; + long long total_dark; +}; + +struct ubifs_nnode; + +/** + * struct ubifs_cnode - LEB Properties Tree common node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (zero for pnodes, greater than zero for nnodes) + * @num: node number + */ +struct ubifs_cnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; +}; + +/** + * struct ubifs_pnode - LEB Properties Tree leaf node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always zero for pnodes) + * @num: node number + * @lprops: LEB properties array + */ +struct ubifs_pnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_lprops lprops[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_nbranch - LEB Properties Tree internal node branch. 
+ * @lnum: LEB number of child + * @offs: offset of child + * @nnode: nnode child + * @pnode: pnode child + * @cnode: cnode child + */ +struct ubifs_nbranch { + int lnum; + int offs; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + }; +}; + +/** + * struct ubifs_nnode - LEB Properties Tree internal node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always greater than zero for nnodes) + * @num: node number + * @nbranch: branches to child nodes + */ +struct ubifs_nnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_lpt_heap - heap of categorized lprops. + * @arr: heap array + * @cnt: number in heap + * @max_cnt: maximum number allowed in heap + * + * There are %LPROPS_HEAP_CNT heaps. + */ +struct ubifs_lpt_heap { + struct ubifs_lprops **arr; + int cnt; + int max_cnt; +}; + +/* + * Return codes for LPT scan callback function. + * + * LPT_SCAN_CONTINUE: continue scanning + * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory + * LPT_SCAN_STOP: stop scanning + */ +enum { + LPT_SCAN_CONTINUE = 0, + LPT_SCAN_ADD = 1, + LPT_SCAN_STOP = 2, +}; + +struct ubifs_info; + +/* Callback used by the 'ubifs_lpt_scan_nolock()' function */ +typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, + const struct ubifs_lprops *lprops, + int in_tree, void *data); + +/** + * struct ubifs_wbuf - UBIFS write-buffer. + * @c: UBIFS file-system description object + * @buf: write-buffer (of min. flash I/O unit size) + * @lnum: logical eraseblock number the write-buffer points to + * @offs: write-buffer offset in this logical eraseblock + * @avail: number of bytes available in the write-buffer + * @used: number of used bytes in the write-buffer + * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) + * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, + * %UBI_UNKNOWN) + * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep + * up by 'mutex_lock_nested()). + * @sync_callback: write-buffer synchronization callback + * @io_mutex: serializes write-buffer I/O + * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes + * fields + * @softlimit: soft write-buffer timeout interval + * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit + * and @softlimit + @delta) + * @timer: write-buffer timer + * @no_timer: non-zero if this write-buffer does not have a timer + * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing + * @next_ino: points to the next position of the following inode number + * @inodes: stores the inode numbers of the nodes which are in wbuf + * + * The write-buffer synchronization callback is called when the write-buffer is + * synchronized in order to notify how much space was wasted due to + * write-buffer padding and how much free space is left in the LEB. + * + * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under + * spin-lock or mutex because they are written under both mutex and spin-lock. + * @buf is appended to under mutex but overwritten under both mutex and + * spin-lock. Thus the data between @buf and @buf + @used can be read under + * spinlock. 
+ */ +struct ubifs_wbuf { + struct ubifs_info *c; + void *buf; + int lnum; + int offs; + int avail; + int used; + int size; + int dtype; + int jhead; + int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); + struct mutex io_mutex; + spinlock_t lock; + ktime_t softlimit; + unsigned long long delta; + struct hrtimer timer; + unsigned int no_timer:1; + unsigned int need_sync:1; + int next_ino; + ino_t *inodes; +}; + +/** + * struct ubifs_bud - bud logical eraseblock. + * @lnum: logical eraseblock number + * @start: where the (uncommitted) bud data starts + * @jhead: journal head number this bud belongs to + * @list: link in the list buds belonging to the same journal head + * @rb: link in the tree of all buds + */ +struct ubifs_bud { + int lnum; + int start; + int jhead; + struct list_head list; + struct rb_node rb; +}; + +/** + * struct ubifs_jhead - journal head. + * @wbuf: head's write-buffer + * @buds_list: list of bud LEBs belonging to this journal head + * @grouped: non-zero if UBIFS groups nodes when writing to this journal head + * + * Note, the @buds list is protected by the @c->buds_lock. + */ +struct ubifs_jhead { + struct ubifs_wbuf wbuf; + struct list_head buds_list; + unsigned int grouped:1; +}; + +/** + * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. + * @key: key + * @znode: znode address in memory + * @lnum: LEB number of the target node (indexing node or data node) + * @offs: target node offset within @lnum + * @len: target node length + */ +struct ubifs_zbranch { + union ubifs_key key; + union { + struct ubifs_znode *znode; + void *leaf; + }; + int lnum; + int offs; + int len; +}; + +/** + * struct ubifs_znode - in-memory representation of an indexing node. + * @parent: parent znode or NULL if it is the root + * @cnext: next znode to commit + * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE) + * @time: last access time (seconds) + * @level: level of the entry in the TNC tree + * @child_cnt: count of child znodes + * @iip: index in parent's zbranch array + * @alt: lower bound of key range has altered i.e. child inserted at slot 0 + * @lnum: LEB number of the corresponding indexing node + * @offs: offset of the corresponding indexing node + * @len: length of the corresponding indexing node + * @zbranch: array of znode branches (@c->fanout elements) + */ +struct ubifs_znode { + struct ubifs_znode *parent; + struct ubifs_znode *cnext; + unsigned long flags; + unsigned long time; + int level; + int child_cnt; + int iip; + int alt; +#ifdef CONFIG_UBIFS_FS_DEBUG + int lnum, offs, len; +#endif + struct ubifs_zbranch zbranch[]; +}; + +/** + * struct bu_info - bulk-read information. + * @key: first data node key + * @zbranch: zbranches of data nodes to bulk read + * @buf: buffer to read into + * @buf_len: buffer length + * @gc_seq: GC sequence number to detect races with GC + * @cnt: number of data nodes for bulk read + * @blk_cnt: number of data blocks including holes + * @oef: end of file reached + */ +struct bu_info { + union ubifs_key key; + struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ]; + void *buf; + int buf_len; + int gc_seq; + int cnt; + int blk_cnt; + int eof; +}; + +/** + * struct ubifs_node_range - node length range description data structure. + * @len: fixed node length + * @min_len: minimum possible node length + * @max_len: maximum possible node length + * + * If @max_len is %0, the node has fixed length @len. 
+ */ +struct ubifs_node_range { + union { + int len; + int min_len; + }; + int max_len; +}; + +/** + * struct ubifs_compressor - UBIFS compressor description structure. + * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc) + * @cc: cryptoapi compressor handle + * @comp_mutex: mutex used during compression + * @decomp_mutex: mutex used during decompression + * @name: compressor name + * @capi_name: cryptoapi compressor name + */ +struct ubifs_compressor { + int compr_type; + struct crypto_comp *cc; + struct mutex *comp_mutex; + struct mutex *decomp_mutex; + const char *name; + const char *capi_name; +}; + +/** + * struct ubifs_budget_req - budget requirements of an operation. + * + * @fast: non-zero if the budgeting should try to acquire budget quickly and + * should not try to call write-back + * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields + * have to be re-calculated + * @new_page: non-zero if the operation adds a new page + * @dirtied_page: non-zero if the operation makes a page dirty + * @new_dent: non-zero if the operation adds a new directory entry + * @mod_dent: non-zero if the operation removes or modifies an existing + * directory entry + * @new_ino: non-zero if the operation adds a new inode + * @new_ino_d: now much data newly created inode contains + * @dirtied_ino: how many inodes the operation makes dirty + * @dirtied_ino_d: now much data dirtied inode contains + * @idx_growth: how much the index will supposedly grow + * @data_growth: how much new data the operation will supposedly add + * @dd_growth: how much data that makes other data dirty the operation will + * supposedly add + * + * @idx_growth, @data_growth and @dd_growth are not used in budget request. The + * budgeting subsystem caches index and data growth values there to avoid + * re-calculating them when the budget is released. However, if @idx_growth is + * %-1, it is calculated by the release function using other fields. + * + * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d + * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made + * dirty by the re-name operation. + * + * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to + * make sure the amount of inode data which contribute to @new_ino_d and + * @dirtied_ino_d fields are aligned. + */ +struct ubifs_budget_req { + unsigned int fast:1; + unsigned int recalculate:1; +#ifndef UBIFS_DEBUG + unsigned int new_page:1; + unsigned int dirtied_page:1; + unsigned int new_dent:1; + unsigned int mod_dent:1; + unsigned int new_ino:1; + unsigned int new_ino_d:13; + unsigned int dirtied_ino:4; + unsigned int dirtied_ino_d:15; +#else + /* Not bit-fields to check for overflows */ + unsigned int new_page; + unsigned int dirtied_page; + unsigned int new_dent; + unsigned int mod_dent; + unsigned int new_ino; + unsigned int new_ino_d; + unsigned int dirtied_ino; + unsigned int dirtied_ino_d; +#endif + int idx_growth; + int data_growth; + int dd_growth; +}; + +/** + * struct ubifs_orphan - stores the inode number of an orphan. 
+ * @rb: rb-tree node of rb-tree of orphans sorted by inode number + * @list: list head of list of orphans in order added + * @new_list: list head of list of orphans added since the last commit + * @cnext: next orphan to commit + * @dnext: next orphan to delete + * @inum: inode number + * @new: %1 => added since the last commit, otherwise %0 + */ +struct ubifs_orphan { + struct rb_node rb; + struct list_head list; + struct list_head new_list; + struct ubifs_orphan *cnext; + struct ubifs_orphan *dnext; + ino_t inum; + int new; +}; + +/** + * struct ubifs_mount_opts - UBIFS-specific mount options information. + * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + * @bulk_read: enable/disable bulk-reads (%0 default, %1 disabe, %2 enable) + * @chk_data_crc: enable/disable CRC data checking when reading data nodes + * (%0 default, %1 disabe, %2 enable) + * @override_compr: override default compressor (%0 - do not override and use + * superblock compressor, %1 - override and use compressor + * specified in @compr_type) + * @compr_type: compressor type to override the superblock compressor with + * (%UBIFS_COMPR_NONE, etc) + */ +struct ubifs_mount_opts { + unsigned int unmount_mode:2; + unsigned int bulk_read:2; + unsigned int chk_data_crc:2; + unsigned int override_compr:1; + unsigned int compr_type:2; +}; + +/** + * struct ubifs_budg_info - UBIFS budgeting information. + * @idx_growth: amount of bytes budgeted for index growth + * @data_growth: amount of bytes budgeted for cached data + * @dd_growth: amount of bytes budgeted for cached data that will make + * other data dirty + * @uncommitted_idx: amount of bytes were budgeted for growth of the index, but + * which still have to be taken into account because the index + * has not been committed so far + * @old_idx_sz: size of index on flash + * @min_idx_lebs: minimum number of LEBs required for the index + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full + * @page_budget: budget for a page (constant, nenver changed after mount) + * @inode_budget: budget for an inode (constant, nenver changed after mount) + * @dent_budget: budget for a directory entry (constant, nenver changed after + * mount) + */ +struct ubifs_budg_info { + long long idx_growth; + long long data_growth; + long long dd_growth; + long long uncommitted_idx; + unsigned long long old_idx_sz; + int min_idx_lebs; + unsigned int nospace:1; + unsigned int nospace_rp:1; + int page_budget; + int inode_budget; + int dent_budget; +}; + +struct ubifs_debug_info; + +/** + * struct ubifs_info - UBIFS file-system description data structure + * (per-superblock). 
+ * @vfs_sb: VFS @struct super_block object + * @bdi: backing device info object to make VFS happy and disable read-ahead + * + * @highest_inum: highest used inode number + * @max_sqnum: current global sequence number + * @cmt_no: commit number of the last successfully completed commit, protected + * by @commit_sem + * @cnt_lock: protects @highest_inum and @max_sqnum counters + * @fmt_version: UBIFS on-flash format version + * @ro_compat_version: R/O compatibility version + * @uuid: UUID from super block + * + * @lhead_lnum: log head logical eraseblock number + * @lhead_offs: log head offset + * @ltail_lnum: log tail logical eraseblock number (offset is always 0) + * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and + * @bud_bytes + * @min_log_bytes: minimum required number of bytes in the log + * @cmt_bud_bytes: used during commit to temporarily amount of bytes in + * committed buds + * + * @buds: tree of all buds indexed by bud LEB number + * @bud_bytes: how many bytes of flash is used by buds + * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud + * lists + * @jhead_cnt: count of journal heads + * @jheads: journal heads (head zero is base head) + * @max_bud_bytes: maximum number of bytes allowed in buds + * @bg_bud_bytes: number of bud bytes when background commit is initiated + * @old_buds: buds to be released after commit ends + * @max_bud_cnt: maximum number of buds + * + * @commit_sem: synchronizes committer with other processes + * @cmt_state: commit state + * @cs_lock: commit state lock + * @cmt_wq: wait queue to sleep on if the log is full and a commit is running + * + * @big_lpt: flag that LPT is too big to write whole during commit + * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up + * @no_chk_data_crc: do not check CRCs when reading data nodes (except during + * recovery) + * @bulk_read: enable bulk-reads + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @rw_incompat: the media is not R/W compatible + * + * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and + * @calc_idx_sz + * @zroot: zbranch which points to the root index node and znode + * @cnext: next znode to commit + * @enext: next znode to commit to empty space + * @gap_lebs: array of LEBs used by the in-gaps commit method + * @cbuf: commit buffer + * @ileb_buf: buffer for commit in-the-gaps method + * @ileb_len: length of data in ileb_buf + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @ilebs: pre-allocated index LEBs + * @ileb_cnt: number of pre-allocated index LEBs + * @ileb_nxt: next pre-allocated index LEBs + * @old_idx: tree of index nodes obsoleted since the last commit start + * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c + * + * @mst_node: master node + * @mst_offs: offset of valid master node + * @mst_mutex: protects the master node area, @mst_node, and @mst_offs + * + * @max_bu_buf_len: maximum bulk-read buffer length + * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu + * @bu: pre-allocated bulk-read information + * + * @write_reserve_mutex: protects @write_reserve_buf + * @write_reserve_buf: on the write path we allocate memory, which might + * sometimes be unavailable, in which case we use this + * write reserve buffer + * + * @log_lebs: number of logical eraseblocks in the log + * @log_bytes: log size in bytes + * @log_last: last LEB of the log + * @lpt_lebs: number of LEBs used for lprops 
table + * @lpt_first: first LEB of the lprops table area + * @lpt_last: last LEB of the lprops table area + * @orph_lebs: number of LEBs used for the orphan area + * @orph_first: first LEB of the orphan area + * @orph_last: last LEB of the orphan area + * @main_lebs: count of LEBs in the main area + * @main_first: first LEB of the main area + * @main_bytes: main area size in bytes + * + * @key_hash_type: type of the key hash + * @key_hash: direntry key hash function + * @key_fmt: key format + * @key_len: key length + * @fanout: fanout of the index tree (number of links per indexing node) + * + * @min_io_size: minimal input/output unit size + * @min_io_shift: number of bits in @min_io_size minus one + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) + * @max_write_shift: number of bits in @max_write_size minus one + * @leb_size: logical eraseblock size in bytes + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks + * @half_leb_size: half LEB size + * @idx_leb_size: how many bytes of an LEB are effectively available when it is + * used to store indexing nodes (@leb_size - @max_idx_node_sz) + * @leb_cnt: count of logical eraseblocks + * @max_leb_cnt: maximum count of logical eraseblocks + * @old_leb_cnt: count of logical eraseblocks before re-size + * @ro_media: the underlying UBI volume is read-only + * @ro_mount: the file-system was mounted as read-only + * @ro_error: UBIFS switched to R/O mode because an error happened + * + * @dirty_pg_cnt: number of dirty pages (not used) + * @dirty_zn_cnt: number of dirty znodes + * @clean_zn_cnt: number of clean znodes + * + * @space_lock: protects @bi and @lst + * @lst: lprops statistics + * @bi: budgeting information + * @calc_idx_sz: temporary variable which is used to calculate new index size + * (contains accurate new index size at end of TNC commit start) + * + * @ref_node_alsz: size of the LEB reference node aligned to the min. 
flash + * I/O unit + * @mst_node_alsz: master node aligned size + * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary + * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary + * @max_inode_sz: maximum possible inode size in bytes + * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting + * @dead_wm: LEB dead space watermark + * @dark_wm: LEB dark space watermark + * @block_cnt: count of 4KiB blocks on the FS + * + * @ranges: UBIFS node length ranges + * @ubi: UBI volume descriptor + * @di: UBI device information + * @vi: UBI volume information + * + * @orph_tree: rb-tree of orphan inode numbers + * @orph_list: list of orphan inode numbers in order added + * @orph_new: list of orphan inode numbers added since last commit + * @orph_cnext: next orphan to commit + * @orph_dnext: next orphan to delete + * @orphan_lock: lock for orph_tree and orph_new + * @orph_buf: buffer for orphan nodes + * @new_orphans: number of orphans since last commit + * @cmt_orphans: number of orphans being committed + * @tot_orphans: number of orphans in the rb_tree + * @max_orphans: maximum number of orphans allowed + * @ohead_lnum: orphan head LEB number + * @ohead_offs: orphan head offset + * @no_orphs: non-zero if there are no orphans + * + * @bgt: UBIFS background thread + * @bgt_name: background thread name + * @need_bgt: if background thread should run + * @need_wbuf_sync: if write-buffers have to be synchronized + * + * @gc_lnum: LEB number used for garbage collection + * @sbuf: a buffer of LEB size used by GC and replay for scanning + * @idx_gc: list of index LEBs that have been garbage collected + * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected + * + * @infos_list: links all 'ubifs_info' objects + * @umount_mutex: serializes shrinker and un-mount + * @shrinker_run_no: shrinker run number + * + * @space_bits: number of bits needed to record free or dirty space + * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT + * @lpt_offs_bits: number of bits needed to record an offset in the LPT + * @lpt_spc_bits: number of bits needed to space in the LPT + * @pcnt_bits: number of bits needed to record pnode or nnode number + * @lnum_bits: number of bits needed to record LEB number + * @nnode_sz: size of on-flash nnode + * @pnode_sz: size of on-flash pnode + * @ltab_sz: size of on-flash LPT lprops table + * @lsave_sz: size of on-flash LPT save table + * @pnode_cnt: number of pnodes + * @nnode_cnt: number of nnodes + * @lpt_hght: height of the LPT + * @pnodes_have: number of pnodes in memory + * + * @lp_mutex: protects lprops table and all the other lprops-related fields + * @lpt_lnum: LEB number of the root nnode of the LPT + * @lpt_offs: offset of the root nnode of the LPT + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. 
ltab + * @dirty_nn_cnt: number of dirty nnodes + * @dirty_pn_cnt: number of dirty pnodes + * @check_lpt_free: flag that indicates LPT GC may be needed + * @lpt_sz: LPT size + * @lpt_nod_buf: buffer for an on-flash nnode or pnode + * @lpt_buf: buffer of LEB size used by LPT + * @nroot: address in memory of the root nnode of the LPT + * @lpt_cnext: next LPT node to commit + * @lpt_heap: array of heaps of categorized lprops + * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at + * previous commit start + * @uncat_list: list of un-categorized LEBs + * @empty_list: list of empty LEBs + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) + * @freeable_cnt: number of freeable LEBs in @freeable_list + * + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @ltab: LPT's own lprops table + * @ltab_cmt: LPT's own lprops table (commit copy) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @lsave_lnum: LEB number of LPT's save table + * @lsave_offs: offset of LPT's save table + * @lsave: LPT's save table + * @lscan_lnum: LEB number of last LPT scan + * + * @rp_size: size of the reserved pool in bytes + * @report_rp_size: size of the reserved pool reported to user-space + * @rp_uid: reserved pool user ID + * @rp_gid: reserved pool group ID + * + * @empty: %1 if the UBI device is empty + * @need_recovery: %1 if the file-system needs recovery + * @replaying: %1 during journal replay + * @mounting: %1 while mounting + * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode + * @replay_list: temporary list used during journal replay + * @replay_buds: list of buds to replay + * @cs_sqnum: sequence number of first node in the log (commit start node) + * @replay_sqnum: sequence number of node currently being replayed + * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W + * mode + * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted + * FS to R/W mode + * @size_tree: inode size information for recovery + * @mount_opts: UBIFS-specific mount options + * + * @dbg: debugging-related information + */ +struct ubifs_info { + struct super_block *vfs_sb; + struct backing_dev_info bdi; + + ino_t highest_inum; + unsigned long long max_sqnum; + unsigned long long cmt_no; + spinlock_t cnt_lock; + int fmt_version; + int ro_compat_version; + unsigned char uuid[16]; + + int lhead_lnum; + int lhead_offs; + int ltail_lnum; + struct mutex log_mutex; + int min_log_bytes; + long long cmt_bud_bytes; + + struct rb_root buds; + long long bud_bytes; + spinlock_t buds_lock; + int jhead_cnt; + struct ubifs_jhead *jheads; + long long max_bud_bytes; + long long bg_bud_bytes; + struct list_head old_buds; + int max_bud_cnt; + + struct rw_semaphore commit_sem; + int cmt_state; + spinlock_t cs_lock; + wait_queue_head_t cmt_wq; + + unsigned int big_lpt:1; + unsigned int space_fixup:1; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; + unsigned int default_compr:2; + unsigned int rw_incompat:1; + + struct mutex tnc_mutex; + struct ubifs_zbranch zroot; + struct ubifs_znode *cnext; + struct ubifs_znode *enext; + int *gap_lebs; + void *cbuf; + void *ileb_buf; + int ileb_len; + int ihead_lnum; + int ihead_offs; + int *ilebs; + int ileb_cnt; + int ileb_nxt; + struct rb_root old_idx; + int *bottom_up_buf; + + struct ubifs_mst_node *mst_node; + int mst_offs; + struct mutex mst_mutex; + + int 
max_bu_buf_len; + struct mutex bu_mutex; + struct bu_info bu; + + struct mutex write_reserve_mutex; + void *write_reserve_buf; + + int log_lebs; + long long log_bytes; + int log_last; + int lpt_lebs; + int lpt_first; + int lpt_last; + int orph_lebs; + int orph_first; + int orph_last; + int main_lebs; + int main_first; + long long main_bytes; + + uint8_t key_hash_type; + uint32_t (*key_hash)(const char *str, int len); + int key_fmt; + int key_len; + int fanout; + + int min_io_size; + int min_io_shift; + int max_write_size; + int max_write_shift; + int leb_size; + int leb_start; + int half_leb_size; + int idx_leb_size; + int leb_cnt; + int max_leb_cnt; + int old_leb_cnt; + unsigned int ro_media:1; + unsigned int ro_mount:1; + unsigned int ro_error:1; + + atomic_long_t dirty_pg_cnt; + atomic_long_t dirty_zn_cnt; + atomic_long_t clean_zn_cnt; + + spinlock_t space_lock; + struct ubifs_lp_stats lst; + struct ubifs_budg_info bi; + unsigned long long calc_idx_sz; + + int ref_node_alsz; + int mst_node_alsz; + int min_idx_node_sz; + int max_idx_node_sz; + long long max_inode_sz; + int max_znode_sz; + + int leb_overhead; + int dead_wm; + int dark_wm; + int block_cnt; + + struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT]; + struct ubi_volume_desc *ubi; + struct ubi_device_info di; + struct ubi_volume_info vi; + + struct rb_root orph_tree; + struct list_head orph_list; + struct list_head orph_new; + struct ubifs_orphan *orph_cnext; + struct ubifs_orphan *orph_dnext; + spinlock_t orphan_lock; + void *orph_buf; + int new_orphans; + int cmt_orphans; + int tot_orphans; + int max_orphans; + int ohead_lnum; + int ohead_offs; + int no_orphs; + + struct task_struct *bgt; + char bgt_name[sizeof(BGT_NAME_PATTERN) + 9]; + int need_bgt; + int need_wbuf_sync; + + int gc_lnum; + void *sbuf; + struct list_head idx_gc; + int idx_gc_cnt; + int gc_seq; + int gced_lnum; + + struct list_head infos_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + int space_bits; + int lpt_lnum_bits; + int lpt_offs_bits; + int lpt_spc_bits; + int pcnt_bits; + int lnum_bits; + int nnode_sz; + int pnode_sz; + int ltab_sz; + int lsave_sz; + int pnode_cnt; + int nnode_cnt; + int lpt_hght; + int pnodes_have; + + struct mutex lp_mutex; + int lpt_lnum; + int lpt_offs; + int nhead_lnum; + int nhead_offs; + int lpt_drty_flgs; + int dirty_nn_cnt; + int dirty_pn_cnt; + int check_lpt_free; + long long lpt_sz; + void *lpt_nod_buf; + void *lpt_buf; + struct ubifs_nnode *nroot; + struct ubifs_cnode *lpt_cnext; + struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT]; + struct ubifs_lpt_heap dirty_idx; + struct list_head uncat_list; + struct list_head empty_list; + struct list_head freeable_list; + struct list_head frdi_idx_list; + int freeable_cnt; + + int ltab_lnum; + int ltab_offs; + struct ubifs_lpt_lprops *ltab; + struct ubifs_lpt_lprops *ltab_cmt; + int lsave_cnt; + int lsave_lnum; + int lsave_offs; + int *lsave; + int lscan_lnum; + + long long rp_size; + long long report_rp_size; + uid_t rp_uid; + gid_t rp_gid; + + /* The below fields are used only during mounting and re-mounting */ + unsigned int empty:1; + unsigned int need_recovery:1; + unsigned int replaying:1; + unsigned int mounting:1; + unsigned int remounting_rw:1; + struct list_head replay_list; + struct list_head replay_buds; + unsigned long long cs_sqnum; + unsigned long long replay_sqnum; + struct list_head unclean_leb_list; + struct ubifs_mst_node *rcvrd_mst_node; + struct rb_root size_tree; + struct ubifs_mount_opts mount_opts; + +#ifdef CONFIG_UBIFS_FS_DEBUG + struct 
ubifs_debug_info *dbg; +#endif +}; + +extern struct list_head ubifs_infos; +extern spinlock_t ubifs_infos_lock; +extern atomic_long_t ubifs_clean_zn_cnt; +extern struct kmem_cache *ubifs_inode_slab; +extern const struct super_operations ubifs_super_operations; +extern const struct address_space_operations ubifs_file_address_operations; +extern const struct file_operations ubifs_file_operations; +extern const struct inode_operations ubifs_file_inode_operations; +extern const struct file_operations ubifs_dir_operations; +extern const struct inode_operations ubifs_dir_inode_operations; +extern const struct inode_operations ubifs_symlink_inode_operations; +extern struct backing_dev_info ubifs_backing_dev_info; +extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/* io.c */ +void ubifs_ro_mode(struct ubifs_info *c, int err); +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, + int dtype); +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs); +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs); +int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, + int offs, int dtype); +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet, int must_chk_crc); +void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); +int ubifs_io_init(struct ubifs_info *c); +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad); +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf); +int ubifs_bg_wbufs_sync(struct ubifs_info *c); +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum); +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode); + +/* scan.c */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf, int quiet); +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet); +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf); +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs); +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs); +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf); + +/* log.c */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud); +void ubifs_create_buds_lists(struct ubifs_info *c); +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs); +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum); +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum); +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum); +int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum); +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum); +int ubifs_consolidate_log(struct ubifs_info *c); + +/* journal.c */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent); +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode 
*inode, + const union ubifs_key *key, const void *buf, int len); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync); +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size); +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm); +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, + const struct inode *inode2); + +/* budget.c */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui); +int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +long long ubifs_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space_nolock(struct ubifs_info *c); +int ubifs_calc_min_idx_lebs(struct ubifs_info *c); +void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, long long free); +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); + +/* find.c */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, + int squeeze); +int ubifs_find_free_leb_for_idx(struct ubifs_info *c); +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free); +int ubifs_find_dirty_idx_leb(struct ubifs_info *c); +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); + +/* tnc.c */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n); +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm); +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs); +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len); +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len); +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm); +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm); +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key); +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm); +void ubifs_tnc_close(struct ubifs_info *c); +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx); +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +/* Shared by tnc.c for tnc_commit.c 
*/ +void destroy_old_idx(struct ubifs_info *c); +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu); +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu); + +/* tnc_misc.c */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode); +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n); +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode); +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode); +long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr); +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip); +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node); + +/* tnc_commit.c */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int ubifs_tnc_end_commit(struct ubifs_info *c); + +/* shrinker.c */ +int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc); + +/* commit.c */ +int ubifs_bg_thread(void *info); +void ubifs_commit_required(struct ubifs_info *c); +void ubifs_request_bg_commit(struct ubifs_info *c); +int ubifs_run_commit(struct ubifs_info *c); +void ubifs_recovery_commit(struct ubifs_info *c); +int ubifs_gc_should_commit(struct ubifs_info *c); +void ubifs_wait_for_commit(struct ubifs_info *c); + +/* master.c */ +int ubifs_read_master(struct ubifs_info *c); +int ubifs_write_master(struct ubifs_info *c); + +/* sb.c */ +int ubifs_read_superblock(struct ubifs_info *c); +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); +int ubifs_fixup_free_space(struct ubifs_info *c); + +/* replay.c */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent); +int ubifs_replay_journal(struct ubifs_info *c); + +/* gc.c */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway); +int ubifs_gc_start_commit(struct ubifs_info *c); +int ubifs_gc_end_commit(struct ubifs_info *c); +void ubifs_destroy_idx_gc(struct ubifs_info *c); +int ubifs_get_idx_gc_leb(struct ubifs_info *c); +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp); + +/* orphan.c */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum); +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum); +int ubifs_orphan_start_commit(struct ubifs_info *c); +int ubifs_orphan_end_commit(struct ubifs_info *c); +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); +int ubifs_clear_orphans(struct ubifs_info *c); + +/* lpt.c */ +int ubifs_calc_lpt_geom(struct ubifs_info *c); +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt); +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr); +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum); +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum); +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data); + +/* Shared by lpt.c for lpt_commit.c */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave); +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + 
struct ubifs_lpt_lprops *ltab); +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode); +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); +struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); +/* Needed only in debugging code in lpt_commit.c */ +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); + +/* lpt_commit.c */ +int ubifs_lpt_start_commit(struct ubifs_info *c); +int ubifs_lpt_end_commit(struct ubifs_info *c); +int ubifs_lpt_post_commit(struct ubifs_info *c); +void ubifs_lpt_free(struct ubifs_info *c, int wr_only); + +/* lprops.c */ +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt); +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst); +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat); +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops); +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops); +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops); +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt); +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean); +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp); +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); +int ubifs_calc_dark(const struct ubifs_info *c, int spc); + +/* file.c */ +int ubifs_fsync(struct file *file, int datasync); +int ubifs_setattr(struct dentry *dentry, struct iattr *attr); + +/* dir.c */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + int mode); +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ubifs_removexattr(struct dentry *dentry, const char *name); + +/* super.c */ +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); + +/* recovery.c */ +int ubifs_recover_master_node(struct ubifs_info *c); +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int jhead); +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf); 
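
[Editorial example, not part of the patch] The LPT scan interface declared above (the ubifs_lpt_scan_callback typedef, the LPT_SCAN_CONTINUE/LPT_SCAN_ADD/LPT_SCAN_STOP return codes, and ubifs_lpt_scan_nolock()) lets callers walk the per-LEB properties without loading the whole LPT into memory. The following minimal sketch shows how such a callback might be written and invoked; the function names first_empty_cb() and find_first_empty_lnum() are hypothetical, and the scan range and locking comment reflect an assumption that the caller already holds @c->lp_mutex, as the "_nolock" suffix suggests.

/*
 * Illustrative sketch only: find the first completely free, untaken
 * main-area LEB by scanning LEB properties.
 */
static int first_empty_cb(struct ubifs_info *c,
			  const struct ubifs_lprops *lprops,
			  int in_tree, void *data)
{
	int *lnum = data;

	/* Skip LEBs that are already reserved for some purpose */
	if (lprops->flags & LPROPS_TAKEN)
		return LPT_SCAN_CONTINUE;

	if (lprops->free == c->leb_size) {
		*lnum = lprops->lnum;
		/*
		 * Ask the scanner to keep these lprops in the in-memory LPT
		 * (only if they are not there yet) and stop scanning.
		 */
		return (in_tree ? 0 : LPT_SCAN_ADD) | LPT_SCAN_STOP;
	}
	return LPT_SCAN_CONTINUE;
}

static int find_first_empty_lnum(struct ubifs_info *c)
{
	int lnum = -1, err;

	/* Assumption: the caller holds c->lp_mutex, hence the "nolock" API */
	err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
				    first_empty_cb, &lnum);
	return err ? err : lnum;
}

The callback return codes may be OR-ed together, so a single callback invocation can both request that the scanned lprops be added to the in-memory tree and terminate the scan.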
+int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf); +int ubifs_rcvry_gc_commit(struct ubifs_info *c); +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size); +int ubifs_recover_size(struct ubifs_info *c); +void ubifs_destroy_size_tree(struct ubifs_info *c); + +/* ioctl.c */ +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void ubifs_set_inode_flags(struct inode *inode); +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#endif + +/* compressor.c */ +int __init ubifs_compressors_init(void); +void ubifs_compressors_exit(void); +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type); +int ubifs_decompress(const void *buf, int len, void *out, int *out_len, + int compr_type); + +#include "debug.h" +#include "misc.h" +#include "key.h" + +#endif /* !__UBIFS_H__ */ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c new file mode 100644 index 00000000..16f19f55 --- /dev/null +++ b/fs/ubifs/xattr.c @@ -0,0 +1,572 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS extended attributes support. + * + * Extended attributes are implemented as regular inodes with attached data, + * which limits extended attribute size to UBIFS block size (4KiB). Names of + * extended attributes are described by extended attribute entries (xentries), + * which are almost identical to directory entries, but have different key type. + * + * In other words, the situation with extended attributes is very similar to + * directories. Indeed, any inode (but of course not xattr inodes) may have a + * number of associated xentries, just like directory inodes have associated + * directory entries. Extended attribute entries store the name of the extended + * attribute, the host inode number, and the extended attribute inode number. + * Similarly, direntries store the name, the parent and the target inode + * numbers. Thus, most of the common UBIFS mechanisms may be re-used for + * extended attributes. + * + * The number of extended attributes is not limited, but there is Linux + * limitation on the maximum possible size of the list of all extended + * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure + * the sum of all extended attribute names of the inode does not exceed that + * limit. + * + * Extended attributes are synchronous, which means they are written to the + * flash media synchronously and there is no write-back for extended attribute + * inodes. The extended attribute values are not stored in compressed form on + * the media. 
+ *
+ * Since extended attributes are represented by regular inodes, they are cached
+ * in the VFS inode cache. The xentries are cached in the LNC cache (see
+ * tnc.c).
+ *
+ * ACL support is not implemented.
+ */
+
+#include "ubifs.h"
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+/*
+ * Limit the number of extended attributes per inode so that the total size
+ * (@xattr_size) is guaranteed to fit in an 'unsigned int'.
+ */
+#define MAX_XATTRS_PER_INODE 65535
+
+/*
+ * Extended attribute type constants.
+ *
+ * USER_XATTR: user extended attribute ("user.*")
+ * TRUSTED_XATTR: trusted extended attribute ("trusted.*")
+ * SECURITY_XATTR: security extended attribute ("security.*")
+ */
+enum {
+	USER_XATTR,
+	TRUSTED_XATTR,
+	SECURITY_XATTR,
+};
+
+static const struct inode_operations empty_iops;
+static const struct file_operations empty_fops;
+
+/**
+ * create_xattr - create an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @nm: extended attribute name
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This is a helper function which creates an extended attribute of name @nm
+ * and value @value for inode @host. The host inode is also updated on flash
+ * because the ctime and extended attribute accounting data changes. This
+ * function returns zero in case of success and a negative error code in case
+ * of failure.
+ */
+static int create_xattr(struct ubifs_info *c, struct inode *host,
+			const struct qstr *nm, const void *value, int size)
+{
+	int err;
+	struct inode *inode;
+	struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+				.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
+				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
+
+	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
+		return -ENOSPC;
+	/*
+	 * Linux limits the maximum size of the extended attribute names list
+	 * to %XATTR_LIST_MAX. This means we should not allow creating more
+	 * extended attributes if the name list becomes larger. This limitation
+	 * is artificial for UBIFS, though.
+ */ + if (host_ui->xattr_names + host_ui->xattr_cnt + + nm->len + 1 > XATTR_LIST_MAX) + return -ENOSPC; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + /* Re-define all operations to be "nothing" */ + inode->i_mapping->a_ops = &empty_aops; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + + inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; + ui = ubifs_inode(inode); + ui->xattr = 1; + ui->flags |= UBIFS_XATTR_FL; + ui->data = kmalloc(size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_free; + } + memcpy(ui->data, value, size); + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + host_ui->xattr_names += nm->len; + + err = ubifs_jnl_update(c, host, nm, inode, 0, 1); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + iput(inode); + return 0; + +out_cancel: + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(size); + mutex_unlock(&host_ui->ui_mutex); +out_free: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * change_xattr - change an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @value: extended attribute value + * @size: size of extended attribute value + * + * This helper function changes the value of extended attribute @inode with new + * data from @value. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int change_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const void *value, int size) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 2, + .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; + + ubifs_assert(ui->data_len == inode->i_size); + err = ubifs_budget_space(c, &req); + if (err) + return err; + + kfree(ui->data); + ui->data = kmalloc(size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_free; + } + memcpy(ui->data, value, size); + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + + /* + * It is important to write the host inode after the xattr inode + * because if the host inode gets synchronized (via 'fsync()'), then + * the extended attribute inode gets synchronized, because it goes + * before the host inode in the write-buffer. + */ + err = ubifs_jnl_change_xattr(c, inode, host); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_size -= CALC_XATTR_BYTES(size); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + mutex_unlock(&host_ui->ui_mutex); + make_bad_inode(inode); +out_free: + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_namespace - check extended attribute name-space. 
+ * @nm: extended attribute name + * + * This function makes sure the extended attribute name belongs to one of the + * supported extended attribute name-spaces. Returns name-space index in case + * of success and a negative error code in case of failure. + */ +static int check_namespace(const struct qstr *nm) +{ + int type; + + if (nm->len > UBIFS_MAX_NLEN) + return -ENAMETOOLONG; + + if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0') + return -EINVAL; + type = TRUSTED_XATTR; + } else if (!strncmp(nm->name, XATTR_USER_PREFIX, + XATTR_USER_PREFIX_LEN)) { + if (nm->name[XATTR_USER_PREFIX_LEN] == '\0') + return -EINVAL; + type = USER_XATTR; + } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0') + return -EINVAL; + type = SECURITY_XATTR; + } else + return -EOPNOTSUPP; + + return type; +} + +static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) +{ + struct inode *inode; + + inode = ubifs_iget(c->vfs_sb, inum); + if (IS_ERR(inode)) { + ubifs_err("dead extended attribute entry, error %d", + (int)PTR_ERR(inode)); + return inode; + } + if (ubifs_inode(inode)->xattr) + return inode; + ubifs_err("corrupt extended attribute entry"); + iput(inode); + return ERR_PTR(-EINVAL); +} + +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err, type; + + dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + ubifs_assert(mutex_is_locked(&host->i_mutex)); + + if (size > UBIFS_MAX_INO_DATA) + return -ERANGE; + + type = check_namespace(&nm); + if (type < 0) + return type; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* + * The extended attribute entries are stored in LNC, so multiple + * look-ups do not involve reading the flash. 
+ */ + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err != -ENOENT) + goto out_free; + + if (flags & XATTR_REPLACE) + /* We are asked not to create the xattr */ + err = -ENODATA; + else + err = create_xattr(c, host, &nm, value, size); + goto out_free; + } + + if (flags & XATTR_CREATE) { + /* We are asked not to replace the xattr */ + err = -EEXIST; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + err = change_xattr(c, host, inode, value, size); + iput(inode); + +out_free: + kfree(xent); + return err; +} + +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_inode *ui; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_unlock; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + ui = ubifs_inode(inode); + ubifs_assert(inode->i_size == ui->data_len); + ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len); + + if (buf) { + /* If @buf is %NULL we are supposed to return the length */ + if (ui->data_len > size) { + dbg_err("buffer size %zd, xattr len %d", + size, ui->data_len); + err = -ERANGE; + goto out_iput; + } + + memcpy(buf, ui->data, ui->data_len); + } + err = ui->data_len; + +out_iput: + iput(inode); +out_unlock: + kfree(xent); + return err; +} + +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + union ubifs_key key; + struct inode *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_dent_node *xent, *pxent = NULL; + int err, len, written = 0; + struct qstr nm = { .name = NULL }; + + dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino, + dentry->d_name.len, dentry->d_name.name, size); + + len = host_ui->xattr_names + host_ui->xattr_cnt; + if (!buffer) + /* + * We should return the minimum buffer size which will fit a + * null-terminated list of all the extended attribute names. 
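+		 * For example, an inode carrying "user.a" and "trusted.x" needs
+		 * "user.a\0trusted.x\0": 6 + 9 name bytes plus one terminating
+		 * '\0' per entry, which is exactly what @xattr_names +
+		 * @xattr_cnt (15 + 2 = 17 bytes) evaluates to.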
+ */ + return len; + + if (len > size) + return -ERANGE; + + lowest_xent_key(c, &key, host->i_ino); + while (1) { + int type; + + xent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(xent)) { + err = PTR_ERR(xent); + break; + } + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + + type = check_namespace(&nm); + if (unlikely(type < 0)) { + err = type; + break; + } + + /* Show trusted namespace only for "power" users */ + if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) { + memcpy(buffer + written, nm.name, nm.len + 1); + written += nm.len + 1; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key); + } + + kfree(pxent); + if (err != -ENOENT) { + ubifs_err("cannot find next direntry, error %d", err); + return err; + } + + ubifs_assert(written <= size); + return written; +} + +static int remove_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const struct qstr *nm) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; + + ubifs_assert(ui->data_len == inode->i_size); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_names -= nm->len; + + err = ubifs_jnl_delete_xattr(c, host, inode, nm); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + mutex_unlock(&host_ui->ui_mutex); + ubifs_release_budget(c, &req); + make_bad_inode(inode); + return err; +} + +int ubifs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%.*s')", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name); + ubifs_assert(mutex_is_locked(&host->i_mutex)); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + ubifs_assert(inode->i_nlink == 1); + inode->i_nlink = 0; + err = remove_xattr(c, host, inode, &nm); + if (err) + inode->i_nlink = 1; + + /* If @i_nlink is 0, 'iput()' will delete the inode */ + iput(inode); + +out_free: + kfree(xent); + return err; +} diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig new file mode 100644 index 00000000..0e0e99bd --- /dev/null +++ b/fs/udf/Kconfig @@ -0,0 +1,18 @@ +config UDF_FS + tristate "UDF file system support" + select CRC_ITU_T + help + This is the new file system used on some CD-ROMs and DVDs. Say Y if + you intend to mount DVD discs or CDRW's written in packet mode, or + if written to by other UDF utilities, such as DirectCD. + Please read . 
+ + To compile this file system support as a module, choose M here: the + module will be called udf. + + If unsure, say N. + +config UDF_NLS + bool + default y + depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y) diff --git a/fs/udf/Makefile b/fs/udf/Makefile new file mode 100644 index 00000000..eb880f66 --- /dev/null +++ b/fs/udf/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the linux udf-filesystem routines. +# + +obj-$(CONFIG_UDF_FS) += udf.o + +udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ + partition.o super.o truncate.o symlink.o \ + directory.o misc.o udftime.o unicode.o diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c new file mode 100644 index 00000000..95518a9f --- /dev/null +++ b/fs/udf/balloc.c @@ -0,0 +1,815 @@ +/* + * balloc.c + * + * PURPOSE + * Block allocation handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2001 Ben Fennema + * (C) 1999 Stelias Computing Inc + * + * HISTORY + * + * 02/24/99 blf Created. + * + */ + +#include "udfdecl.h" + +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +#define udf_clear_bit __test_and_clear_bit_le +#define udf_set_bit __test_and_set_bit_le +#define udf_test_bit test_bit_le +#define udf_find_next_one_bit find_next_bit_le + +static int read_block_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap, unsigned int block, + unsigned long bitmap_nr) +{ + struct buffer_head *bh = NULL; + int retval = 0; + struct kernel_lb_addr loc; + + loc.logicalBlockNum = bitmap->s_extPosition; + loc.partitionReferenceNum = UDF_SB(sb)->s_partition; + + bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block)); + if (!bh) + retval = -EIO; + + bitmap->s_block_bitmap[bitmap_nr] = bh; + return retval; +} + +static int __load_block_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap, + unsigned int block_group) +{ + int retval = 0; + int nr_groups = bitmap->s_nr_groups; + + if (block_group >= nr_groups) { + udf_debug("block_group (%d) > nr_groups (%d)\n", block_group, + nr_groups); + } + + if (bitmap->s_block_bitmap[block_group]) { + return block_group; + } else { + retval = read_block_bitmap(sb, bitmap, block_group, + block_group); + if (retval < 0) + return retval; + return block_group; + } +} + +static inline int load_block_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap, + unsigned int block_group) +{ + int slot; + + slot = __load_block_bitmap(sb, bitmap, block_group); + + if (slot < 0) + return slot; + + if (!bitmap->s_block_bitmap[slot]) + return -EIO; + + return slot; +} + +static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + + if (!sbi->s_lvid_bh) + return; + + lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; + le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); + udf_updated_lvid(sb); +} + +static void udf_bitmap_free_blocks(struct super_block *sb, + struct inode *inode, + struct udf_bitmap *bitmap, + struct kernel_lb_addr *bloc, + uint32_t offset, + uint32_t count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh = NULL; + struct udf_part_map *partmap; + unsigned long block; + unsigned long block_group; + unsigned long bit; + unsigned long i; + int bitmap_nr; + unsigned long overflow; 
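+
+	/*
+	 * A note on the arithmetic below: each bitmap block holds
+	 * sb->s_blocksize * 8 bits, so with a 2048-byte block size one bitmap
+	 * block describes 16384 partition blocks.  The first bits of the
+	 * bitmap are occupied by the space bitmap descriptor header, which is
+	 * why "(sizeof(struct spaceBitmapDesc) << 3)" is added before the
+	 * block number is split into a group index and a bit offset.
+	 */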
+ + mutex_lock(&sbi->s_alloc_mutex); + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum + count < count || + (bloc->logicalBlockNum + count) > partmap->s_partition_len) { + udf_debug("%d < %d || %d + %d > %d\n", + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, + count, partmap->s_partition_len); + goto error_return; + } + + block = bloc->logicalBlockNum + offset + + (sizeof(struct spaceBitmapDesc) << 3); + + do { + overflow = 0; + block_group = block >> (sb->s_blocksize_bits + 3); + bit = block % (sb->s_blocksize << 3); + + /* + * Check to see if we are freeing blocks across a group boundary. + */ + if (bit + count > (sb->s_blocksize << 3)) { + overflow = bit + count - (sb->s_blocksize << 3); + count -= overflow; + } + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto error_return; + + bh = bitmap->s_block_bitmap[bitmap_nr]; + for (i = 0; i < count; i++) { + if (udf_set_bit(bit + i, bh->b_data)) { + udf_debug("bit %ld already set\n", bit + i); + udf_debug("byte=%2x\n", + ((char *)bh->b_data)[(bit + i) >> 3]); + } + } + udf_add_free_space(sb, sbi->s_partition, count); + mark_buffer_dirty(bh); + if (overflow) { + block += count; + count = overflow; + } + } while (overflow); + +error_return: + mutex_unlock(&sbi->s_alloc_mutex); +} + +static int udf_bitmap_prealloc_blocks(struct super_block *sb, + struct inode *inode, + struct udf_bitmap *bitmap, + uint16_t partition, uint32_t first_block, + uint32_t block_count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int alloc_count = 0; + int bit, block, block_group, group_start; + int nr_groups, bitmap_nr; + struct buffer_head *bh; + __u32 part_len; + + mutex_lock(&sbi->s_alloc_mutex); + part_len = sbi->s_partmaps[partition].s_partition_len; + if (first_block >= part_len) + goto out; + + if (first_block + block_count > part_len) + block_count = part_len - first_block; + + do { + nr_groups = udf_compute_nr_groups(sb, partition); + block = first_block + (sizeof(struct spaceBitmapDesc) << 3); + block_group = block >> (sb->s_blocksize_bits + 3); + group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc); + + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto out; + bh = bitmap->s_block_bitmap[bitmap_nr]; + + bit = block % (sb->s_blocksize << 3); + + while (bit < (sb->s_blocksize << 3) && block_count > 0) { + if (!udf_clear_bit(bit, bh->b_data)) + goto out; + block_count--; + alloc_count++; + bit++; + block++; + } + mark_buffer_dirty(bh); + } while (block_count > 0); + +out: + udf_add_free_space(sb, partition, -alloc_count); + mutex_unlock(&sbi->s_alloc_mutex); + return alloc_count; +} + +static int udf_bitmap_new_block(struct super_block *sb, + struct inode *inode, + struct udf_bitmap *bitmap, uint16_t partition, + uint32_t goal, int *err) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int newbit, bit = 0, block, block_group, group_start; + int end_goal, nr_groups, bitmap_nr, i; + struct buffer_head *bh = NULL; + char *ptr; + int newblock = 0; + + *err = -ENOSPC; + mutex_lock(&sbi->s_alloc_mutex); + +repeat: + if (goal >= sbi->s_partmaps[partition].s_partition_len) + goal = 0; + + nr_groups = bitmap->s_nr_groups; + block = goal + (sizeof(struct spaceBitmapDesc) << 3); + block_group = block >> (sb->s_blocksize_bits + 3); + group_start = block_group ? 
0 : sizeof(struct spaceBitmapDesc); + + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto error_return; + bh = bitmap->s_block_bitmap[bitmap_nr]; + ptr = memscan((char *)bh->b_data + group_start, 0xFF, + sb->s_blocksize - group_start); + + if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) { + bit = block % (sb->s_blocksize << 3); + if (udf_test_bit(bit, bh->b_data)) + goto got_block; + + end_goal = (bit + 63) & ~63; + bit = udf_find_next_one_bit(bh->b_data, end_goal, bit); + if (bit < end_goal) + goto got_block; + + ptr = memscan((char *)bh->b_data + (bit >> 3), 0xFF, + sb->s_blocksize - ((bit + 7) >> 3)); + newbit = (ptr - ((char *)bh->b_data)) << 3; + if (newbit < sb->s_blocksize << 3) { + bit = newbit; + goto search_back; + } + + newbit = udf_find_next_one_bit(bh->b_data, + sb->s_blocksize << 3, bit); + if (newbit < sb->s_blocksize << 3) { + bit = newbit; + goto got_block; + } + } + + for (i = 0; i < (nr_groups * 2); i++) { + block_group++; + if (block_group >= nr_groups) + block_group = 0; + group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc); + + bitmap_nr = load_block_bitmap(sb, bitmap, block_group); + if (bitmap_nr < 0) + goto error_return; + bh = bitmap->s_block_bitmap[bitmap_nr]; + if (i < nr_groups) { + ptr = memscan((char *)bh->b_data + group_start, 0xFF, + sb->s_blocksize - group_start); + if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) { + bit = (ptr - ((char *)bh->b_data)) << 3; + break; + } + } else { + bit = udf_find_next_one_bit(bh->b_data, + sb->s_blocksize << 3, + group_start << 3); + if (bit < sb->s_blocksize << 3) + break; + } + } + if (i >= (nr_groups * 2)) { + mutex_unlock(&sbi->s_alloc_mutex); + return newblock; + } + if (bit < sb->s_blocksize << 3) + goto search_back; + else + bit = udf_find_next_one_bit(bh->b_data, sb->s_blocksize << 3, + group_start << 3); + if (bit >= sb->s_blocksize << 3) { + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } + +search_back: + i = 0; + while (i < 7 && bit > (group_start << 3) && + udf_test_bit(bit - 1, bh->b_data)) { + ++i; + --bit; + } + +got_block: + newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - + (sizeof(struct spaceBitmapDesc) << 3); + + if (!udf_clear_bit(bit, bh->b_data)) { + udf_debug("bit already cleared for block %d\n", bit); + goto repeat; + } + + mark_buffer_dirty(bh); + + udf_add_free_space(sb, partition, -1); + mutex_unlock(&sbi->s_alloc_mutex); + *err = 0; + return newblock; + +error_return: + *err = -EIO; + mutex_unlock(&sbi->s_alloc_mutex); + return 0; +} + +static void udf_table_free_blocks(struct super_block *sb, + struct inode *inode, + struct inode *table, + struct kernel_lb_addr *bloc, + uint32_t offset, + uint32_t count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *partmap; + uint32_t start, end; + uint32_t elen; + struct kernel_lb_addr eloc; + struct extent_position oepos, epos; + int8_t etype; + int i; + struct udf_inode_info *iinfo; + + mutex_lock(&sbi->s_alloc_mutex); + partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; + if (bloc->logicalBlockNum + count < count || + (bloc->logicalBlockNum + count) > partmap->s_partition_len) { + udf_debug("%d < %d || %d + %d > %d\n", + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, + partmap->s_partition_len); + goto error_return; + } + + iinfo = UDF_I(table); + udf_add_free_space(sb, sbi->s_partition, count); + + start = bloc->logicalBlockNum + offset; + end = bloc->logicalBlockNum + offset + count - 1; + + epos.offset = oepos.offset = sizeof(struct 
unallocSpaceEntry); + elen = 0; + epos.block = oepos.block = iinfo->i_location; + epos.bh = oepos.bh = NULL; + + while (count && + (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { + if (((eloc.logicalBlockNum + + (elen >> sb->s_blocksize_bits)) == start)) { + if ((0x3FFFFFFF - elen) < + (count << sb->s_blocksize_bits)) { + uint32_t tmp = ((0x3FFFFFFF - elen) >> + sb->s_blocksize_bits); + count -= tmp; + start += tmp; + elen = (etype << 30) | + (0x40000000 - sb->s_blocksize); + } else { + elen = (etype << 30) | + (elen + + (count << sb->s_blocksize_bits)); + start += count; + count = 0; + } + udf_write_aext(table, &oepos, &eloc, elen, 1); + } else if (eloc.logicalBlockNum == (end + 1)) { + if ((0x3FFFFFFF - elen) < + (count << sb->s_blocksize_bits)) { + uint32_t tmp = ((0x3FFFFFFF - elen) >> + sb->s_blocksize_bits); + count -= tmp; + end -= tmp; + eloc.logicalBlockNum -= tmp; + elen = (etype << 30) | + (0x40000000 - sb->s_blocksize); + } else { + eloc.logicalBlockNum = start; + elen = (etype << 30) | + (elen + + (count << sb->s_blocksize_bits)); + end -= count; + count = 0; + } + udf_write_aext(table, &oepos, &eloc, elen, 1); + } + + if (epos.bh != oepos.bh) { + i = -1; + oepos.block = epos.block; + brelse(oepos.bh); + get_bh(epos.bh); + oepos.bh = epos.bh; + oepos.offset = 0; + } else { + oepos.offset = epos.offset; + } + } + + if (count) { + /* + * NOTE: we CANNOT use udf_add_aext here, as it can try to + * allocate a new block, and since we hold the super block + * lock already very bad things would happen :) + * + * We copy the behavior of udf_add_aext, but instead of + * trying to allocate a new block close to the existing one, + * we just steal a block from the extent we are trying to add. + * + * It would be nice if the blocks were close together, but it + * isn't required. 
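+		 *
+		 * Concretely: if a new allocation extent block is needed while,
+		 * say, blocks 5000..5099 are being freed, block 5000 itself is
+		 * used for the new allocation extent descriptor and only
+		 * 5001..5099 end up recorded as free space.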
+ */ + + int adsize; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; + struct allocExtDesc *aed; + + eloc.logicalBlockNum = start; + elen = EXT_RECORDED_ALLOCATED | + (count << sb->s_blocksize_bits); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else { + brelse(oepos.bh); + brelse(epos.bh); + goto error_return; + } + + if (epos.offset + (2 * adsize) > sb->s_blocksize) { + unsigned char *sptr, *dptr; + int loffset; + + brelse(oepos.bh); + oepos = epos; + + /* Steal a block from the extent being free'd */ + epos.block.logicalBlockNum = eloc.logicalBlockNum; + eloc.logicalBlockNum++; + elen -= sb->s_blocksize; + + epos.bh = udf_tread(sb, + udf_get_lb_pblock(sb, &epos.block, 0)); + if (!epos.bh) { + brelse(oepos.bh); + goto error_return; + } + aed = (struct allocExtDesc *)(epos.bh->b_data); + aed->previousAllocExtLocation = + cpu_to_le32(oepos.block.logicalBlockNum); + if (epos.offset + adsize > sb->s_blocksize) { + loffset = epos.offset; + aed->lengthAllocDescs = cpu_to_le32(adsize); + sptr = iinfo->i_ext.i_data + epos.offset + - adsize; + dptr = epos.bh->b_data + + sizeof(struct allocExtDesc); + memcpy(dptr, sptr, adsize); + epos.offset = sizeof(struct allocExtDesc) + + adsize; + } else { + loffset = epos.offset + adsize; + aed->lengthAllocDescs = cpu_to_le32(0); + if (oepos.bh) { + sptr = oepos.bh->b_data + epos.offset; + aed = (struct allocExtDesc *) + oepos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, + adsize); + } else { + sptr = iinfo->i_ext.i_data + + epos.offset; + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(table); + } + epos.offset = sizeof(struct allocExtDesc); + } + if (sbi->s_udfrev >= 0x0200) + udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, + 3, 1, epos.block.logicalBlockNum, + sizeof(struct tag)); + else + udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, + 2, 1, epos.block.logicalBlockNum, + sizeof(struct tag)); + + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = (struct short_ad *)sptr; + sad->extLength = cpu_to_le32( + EXT_NEXT_EXTENT_ALLOCDECS | + sb->s_blocksize); + sad->extPosition = + cpu_to_le32(epos.block.logicalBlockNum); + break; + case ICBTAG_FLAG_AD_LONG: + lad = (struct long_ad *)sptr; + lad->extLength = cpu_to_le32( + EXT_NEXT_EXTENT_ALLOCDECS | + sb->s_blocksize); + lad->extLocation = + cpu_to_lelb(epos.block); + break; + } + if (oepos.bh) { + udf_update_tag(oepos.bh->b_data, loffset); + mark_buffer_dirty(oepos.bh); + } else { + mark_inode_dirty(table); + } + } + + /* It's possible that stealing the block emptied the extent */ + if (elen) { + udf_write_aext(table, &epos, &eloc, elen, 1); + + if (!epos.bh) { + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(table); + } else { + aed = (struct allocExtDesc *)epos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, adsize); + udf_update_tag(epos.bh->b_data, epos.offset); + mark_buffer_dirty(epos.bh); + } + } + } + + brelse(epos.bh); + brelse(oepos.bh); + +error_return: + mutex_unlock(&sbi->s_alloc_mutex); + return; +} + +static int udf_table_prealloc_blocks(struct super_block *sb, + struct inode *inode, + struct inode *table, uint16_t partition, + uint32_t first_block, uint32_t block_count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int alloc_count = 0; + uint32_t elen, adsize; + struct kernel_lb_addr eloc; + struct extent_position epos; + int8_t etype = -1; + struct udf_inode_info *iinfo; + + if (first_block >= 
sbi->s_partmaps[partition].s_partition_len) + return 0; + + iinfo = UDF_I(table); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return 0; + + mutex_lock(&sbi->s_alloc_mutex); + epos.offset = sizeof(struct unallocSpaceEntry); + epos.block = iinfo->i_location; + epos.bh = NULL; + eloc.logicalBlockNum = 0xFFFFFFFF; + + while (first_block != eloc.logicalBlockNum && + (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { + udf_debug("eloc=%d, elen=%d, first_block=%d\n", + eloc.logicalBlockNum, elen, first_block); + ; /* empty loop body */ + } + + if (first_block == eloc.logicalBlockNum) { + epos.offset -= adsize; + + alloc_count = (elen >> sb->s_blocksize_bits); + if (alloc_count > block_count) { + alloc_count = block_count; + eloc.logicalBlockNum += alloc_count; + elen -= (alloc_count << sb->s_blocksize_bits); + udf_write_aext(table, &epos, &eloc, + (etype << 30) | elen, 1); + } else + udf_delete_aext(table, epos, eloc, + (etype << 30) | elen); + } else { + alloc_count = 0; + } + + brelse(epos.bh); + + if (alloc_count) + udf_add_free_space(sb, partition, -alloc_count); + mutex_unlock(&sbi->s_alloc_mutex); + return alloc_count; +} + +static int udf_table_new_block(struct super_block *sb, + struct inode *inode, + struct inode *table, uint16_t partition, + uint32_t goal, int *err) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; + uint32_t newblock = 0, adsize; + uint32_t elen, goal_elen = 0; + struct kernel_lb_addr eloc, uninitialized_var(goal_eloc); + struct extent_position epos, goal_epos; + int8_t etype; + struct udf_inode_info *iinfo = UDF_I(table); + + *err = -ENOSPC; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return newblock; + + mutex_lock(&sbi->s_alloc_mutex); + if (goal >= sbi->s_partmaps[partition].s_partition_len) + goal = 0; + + /* We search for the closest matching block to goal. If we find + a exact hit, we stop. Otherwise we keep going till we run out + of extents. We store the buffer_head, bloc, and extoffset + of the current closest match and use that when we are done. + */ + epos.offset = sizeof(struct unallocSpaceEntry); + epos.block = iinfo->i_location; + epos.bh = goal_epos.bh = NULL; + + while (spread && + (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) { + if (goal >= eloc.logicalBlockNum) { + if (goal < eloc.logicalBlockNum + + (elen >> sb->s_blocksize_bits)) + nspread = 0; + else + nspread = goal - eloc.logicalBlockNum - + (elen >> sb->s_blocksize_bits); + } else { + nspread = eloc.logicalBlockNum - goal; + } + + if (nspread < spread) { + spread = nspread; + if (goal_epos.bh != epos.bh) { + brelse(goal_epos.bh); + goal_epos.bh = epos.bh; + get_bh(goal_epos.bh); + } + goal_epos.block = epos.block; + goal_epos.offset = epos.offset - adsize; + goal_eloc = eloc; + goal_elen = (etype << 30) | elen; + } + } + + brelse(epos.bh); + + if (spread == 0xFFFFFFFF) { + brelse(goal_epos.bh); + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } + + /* Only allocate blocks from the beginning of the extent. + That way, we only delete (empty) extents, never have to insert an + extent because of splitting */ + /* This works, but very poorly.... 
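+	   (For example, with goal block 1000 and candidate free extents
+	   covering blocks 900..949 and 1020..1099, the search loop above
+	   computes nspread = 50 for the first extent and nspread = 20 for the
+	   second; the second extent is chosen, its first block 1020 is
+	   returned, and the extent shrinks to 1021..1099.)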
*/ + + newblock = goal_eloc.logicalBlockNum; + goal_eloc.logicalBlockNum++; + goal_elen -= sb->s_blocksize; + + if (goal_elen) + udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); + else + udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); + brelse(goal_epos.bh); + + udf_add_free_space(sb, partition, -1); + + mutex_unlock(&sbi->s_alloc_mutex); + *err = 0; + return newblock; +} + +void udf_free_blocks(struct super_block *sb, struct inode *inode, + struct kernel_lb_addr *bloc, uint32_t offset, + uint32_t count) +{ + uint16_t partition = bloc->partitionReferenceNum; + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { + udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap, + bloc, offset, count); + } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { + udf_table_free_blocks(sb, inode, map->s_uspace.s_table, + bloc, offset, count); + } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { + udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap, + bloc, offset, count); + } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { + udf_table_free_blocks(sb, inode, map->s_fspace.s_table, + bloc, offset, count); + } +} + +inline int udf_prealloc_blocks(struct super_block *sb, + struct inode *inode, + uint16_t partition, uint32_t first_block, + uint32_t block_count) +{ + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + return udf_bitmap_prealloc_blocks(sb, inode, + map->s_uspace.s_bitmap, + partition, first_block, + block_count); + else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + return udf_table_prealloc_blocks(sb, inode, + map->s_uspace.s_table, + partition, first_block, + block_count); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + return udf_bitmap_prealloc_blocks(sb, inode, + map->s_fspace.s_bitmap, + partition, first_block, + block_count); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + return udf_table_prealloc_blocks(sb, inode, + map->s_fspace.s_table, + partition, first_block, + block_count); + else + return 0; +} + +inline int udf_new_block(struct super_block *sb, + struct inode *inode, + uint16_t partition, uint32_t goal, int *err) +{ + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + return udf_bitmap_new_block(sb, inode, + map->s_uspace.s_bitmap, + partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + return udf_table_new_block(sb, inode, + map->s_uspace.s_table, + partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + return udf_bitmap_new_block(sb, inode, + map->s_fspace.s_bitmap, + partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + return udf_table_new_block(sb, inode, + map->s_fspace.s_table, + partition, goal, err); + else { + *err = -EIO; + return 0; + } +} diff --git a/fs/udf/dir.c b/fs/udf/dir.c new file mode 100644 index 00000000..eb8bfe2b --- /dev/null +++ b/fs/udf/dir.c @@ -0,0 +1,210 @@ +/* + * dir.c + * + * PURPOSE + * Directory handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. 
+ * + * (C) 1998-2004 Ben Fennema + * + * HISTORY + * + * 10/05/98 dgb Split directory operations into its own file + * Implemented directory reads via do_udf_readdir + * 10/06/98 Made directory operations work! + * 11/17/98 Rewrote directory to support ICBTAG_FLAG_AD_LONG + * 11/25/98 blf Rewrote directory handling (readdir+lookup) to support reading + * across blocks. + * 12/12/98 Split out the lookup code to namei.c. bulk of directory + * code now in directory.c:udf_fileident_read. + */ + +#include "udfdecl.h" + +#include +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +static int do_udf_readdir(struct inode *dir, struct file *filp, + filldir_t filldir, void *dirent) +{ + struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; + struct fileIdentDesc *fi = NULL; + struct fileIdentDesc cfi; + int block, iblock; + loff_t nf_pos = (filp->f_pos - 1) << 2; + int flen; + unsigned char *fname = NULL; + unsigned char *nameptr; + uint16_t liu; + uint8_t lfi; + loff_t size = udf_ext0_offset(dir) + dir->i_size; + struct buffer_head *tmp, *bha[16]; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + int i, num, ret = 0; + unsigned int dt_type; + struct extent_position epos = { NULL, 0, {0, 0} }; + struct udf_inode_info *iinfo; + + if (nf_pos >= size) + goto out; + + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!fname) { + ret = -ENOMEM; + goto out; + } + + if (nf_pos == 0) + nf_pos = udf_ext0_offset(dir); + + fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); + iinfo = UDF_I(dir); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, + &epos, &eloc, &elen, &offset) + != (EXT_RECORDED_ALLOCATED >> 30)) { + ret = -ENOENT; + goto out; + } + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (iinfo->i_alloc_type == + ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else { + offset = 0; + } + + if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) { + ret = -EIO; + goto out; + } + + if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) { + i = 16 >> (dir->i_sb->s_blocksize_bits - 9); + if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) + i = (elen >> dir->i_sb->s_blocksize_bits) - offset; + for (num = 0; i > 0; i--) { + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i); + tmp = udf_tgetblk(dir->i_sb, block); + if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse(tmp); + } + if (num) { + ll_rw_block(READA, num, bha); + for (i = 0; i < num; i++) + brelse(bha[i]); + } + } + } + + while (nf_pos < size) { + filp->f_pos = (nf_pos >> 2) + 1; + + fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, + &elen, &offset); + if (!fi) + goto out; + + liu = le16_to_cpu(cfi.lengthOfImpUse); + lfi = cfi.lengthFileIdent; + + if (fibh.sbh == fibh.ebh) { + nameptr = fi->fileIdent + liu; + } else { + int poffset; /* Unpaded ending offset */ + + poffset = fibh.soffset + sizeof(struct fileIdentDesc) + liu + lfi; + + if (poffset >= lfi) { + nameptr = (char *)(fibh.ebh->b_data + poffset - lfi); + } else { + nameptr = fname; + memcpy(nameptr, fi->fileIdent + liu, + lfi - poffset); + memcpy(nameptr + lfi - poffset, + fibh.ebh->b_data, poffset); + } + } + + if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if 
(!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) + continue; + } + + if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) + continue; + } + + if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { + iblock = parent_ino(filp->f_path.dentry); + flen = 2; + memcpy(fname, "..", flen); + dt_type = DT_DIR; + } else { + struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); + + iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); + flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + dt_type = DT_UNKNOWN; + } + + if (flen && filldir(dirent, fname, flen, filp->f_pos, + iblock, dt_type) < 0) + goto out; + } /* end while */ + + filp->f_pos = (nf_pos >> 2) + 1; + +out: + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + kfree(fname); + + return ret; +} + +static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *dir = filp->f_path.dentry->d_inode; + int result; + + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { + return 0; + } + filp->f_pos++; + } + + result = do_udf_readdir(dir, filp, filldir, dirent); + return result; +} + +/* readdir and lookup functions */ +const struct file_operations udf_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = udf_readdir, + .unlocked_ioctl = udf_ioctl, + .fsync = generic_file_fsync, +}; diff --git a/fs/udf/directory.c b/fs/udf/directory.c new file mode 100644 index 00000000..2ffdb673 --- /dev/null +++ b/fs/udf/directory.c @@ -0,0 +1,241 @@ +/* + * directory.c + * + * PURPOSE + * Directory related functions + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + */ + +#include "udfdecl.h" +#include "udf_i.h" + +#include +#include +#include + +struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi, + struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t *elen, + sector_t *offset) +{ + struct fileIdentDesc *fi; + int i, num, block; + struct buffer_head *tmp, *bha[16]; + struct udf_inode_info *iinfo = UDF_I(dir); + + fibh->soffset = fibh->eoffset; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + fi = udf_get_fileident(iinfo->i_ext.i_data - + (iinfo->i_efe ? 
+ sizeof(struct extendedFileEntry) : + sizeof(struct fileEntry)), + dir->i_sb->s_blocksize, + &(fibh->eoffset)); + if (!fi) + return NULL; + + *nf_pos += fibh->eoffset - fibh->soffset; + + memcpy((uint8_t *)cfi, (uint8_t *)fi, + sizeof(struct fileIdentDesc)); + + return fi; + } + + if (fibh->eoffset == dir->i_sb->s_blocksize) { + int lextoffset = epos->offset; + unsigned char blocksize_bits = dir->i_sb->s_blocksize_bits; + + if (udf_next_aext(dir, epos, eloc, elen, 1) != + (EXT_RECORDED_ALLOCATED >> 30)) + return NULL; + + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); + + (*offset)++; + + if ((*offset << blocksize_bits) >= *elen) + *offset = 0; + else + epos->offset = lextoffset; + + brelse(fibh->sbh); + fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->sbh) + return NULL; + fibh->soffset = fibh->eoffset = 0; + + if (!(*offset & ((16 >> (blocksize_bits - 9)) - 1))) { + i = 16 >> (blocksize_bits - 9); + if (i + *offset > (*elen >> blocksize_bits)) + i = (*elen >> blocksize_bits)-*offset; + for (num = 0; i > 0; i--) { + block = udf_get_lb_pblock(dir->i_sb, eloc, + *offset + i); + tmp = udf_tgetblk(dir->i_sb, block); + if (tmp && !buffer_uptodate(tmp) && + !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse(tmp); + } + if (num) { + ll_rw_block(READA, num, bha); + for (i = 0; i < num; i++) + brelse(bha[i]); + } + } + } else if (fibh->sbh != fibh->ebh) { + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + } + + fi = udf_get_fileident(fibh->sbh->b_data, dir->i_sb->s_blocksize, + &(fibh->eoffset)); + + if (!fi) + return NULL; + + *nf_pos += fibh->eoffset - fibh->soffset; + + if (fibh->eoffset <= dir->i_sb->s_blocksize) { + memcpy((uint8_t *)cfi, (uint8_t *)fi, + sizeof(struct fileIdentDesc)); + } else if (fibh->eoffset > dir->i_sb->s_blocksize) { + int lextoffset = epos->offset; + + if (udf_next_aext(dir, epos, eloc, elen, 1) != + (EXT_RECORDED_ALLOCATED >> 30)) + return NULL; + + block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); + + (*offset)++; + + if ((*offset << dir->i_sb->s_blocksize_bits) >= *elen) + *offset = 0; + else + epos->offset = lextoffset; + + fibh->soffset -= dir->i_sb->s_blocksize; + fibh->eoffset -= dir->i_sb->s_blocksize; + + fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->ebh) + return NULL; + + if (sizeof(struct fileIdentDesc) > -fibh->soffset) { + int fi_len; + + memcpy((uint8_t *)cfi, (uint8_t *)fi, -fibh->soffset); + memcpy((uint8_t *)cfi - fibh->soffset, + fibh->ebh->b_data, + sizeof(struct fileIdentDesc) + fibh->soffset); + + fi_len = (sizeof(struct fileIdentDesc) + + cfi->lengthFileIdent + + le16_to_cpu(cfi->lengthOfImpUse) + 3) & ~3; + + *nf_pos += fi_len - (fibh->eoffset - fibh->soffset); + fibh->eoffset = fibh->soffset + fi_len; + } else { + memcpy((uint8_t *)cfi, (uint8_t *)fi, + sizeof(struct fileIdentDesc)); + } + } + return fi; +} + +struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) +{ + struct fileIdentDesc *fi; + int lengthThisIdent; + uint8_t *ptr; + int padlen; + + if ((!buffer) || (!offset)) { + udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer, + offset); + return NULL; + } + + ptr = buffer; + + if ((*offset > 0) && (*offset < bufsize)) + ptr += *offset; + fi = (struct fileIdentDesc *)ptr; + if (fi->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { + udf_debug("0x%x != TAG_IDENT_FID\n", + le16_to_cpu(fi->descTag.tagIdent)); + udf_debug("offset: %u sizeof: %lu bufsize: %u\n", + *offset, (unsigned long)sizeof(struct fileIdentDesc), + bufsize); + return NULL; + } + if ((*offset + 
sizeof(struct fileIdentDesc)) > bufsize) + lengthThisIdent = sizeof(struct fileIdentDesc); + else + lengthThisIdent = sizeof(struct fileIdentDesc) + + fi->lengthFileIdent + le16_to_cpu(fi->lengthOfImpUse); + + /* we need to figure padding, too! */ + padlen = lengthThisIdent % UDF_NAME_PAD; + if (padlen) + lengthThisIdent += (UDF_NAME_PAD - padlen); + *offset = *offset + lengthThisIdent; + + return fi; +} + +struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, + int inc) +{ + struct short_ad *sa; + + if ((!ptr) || (!offset)) { + printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); + return NULL; + } + + if ((*offset + sizeof(struct short_ad)) > maxoffset) + return NULL; + else { + sa = (struct short_ad *)ptr; + if (sa->extLength == 0) + return NULL; + } + + if (inc) + *offset += sizeof(struct short_ad); + return sa; +} + +struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) +{ + struct long_ad *la; + + if ((!ptr) || (!offset)) { + printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); + return NULL; + } + + if ((*offset + sizeof(struct long_ad)) > maxoffset) + return NULL; + else { + la = (struct long_ad *)ptr; + if (la->extLength == 0) + return NULL; + } + + if (inc) + *offset += sizeof(struct long_ad); + return la; +} diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h new file mode 100644 index 00000000..4792b771 --- /dev/null +++ b/fs/udf/ecma_167.h @@ -0,0 +1,796 @@ +/* + * ecma_167.h + * + * This file is based on ECMA-167 3rd edition (June 1997) + * http://www.ecma.ch + * + * Copyright (c) 2001-2002 Ben Fennema + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#ifndef _ECMA_167_H +#define _ECMA_167_H 1 + +/* Character set specification (ECMA 167r3 1/7.2.1) */ +struct charspec { + uint8_t charSetType; + uint8_t charSetInfo[63]; +} __attribute__ ((packed)); + +/* Character Set Type (ECMA 167r3 1/7.2.1.1) */ +#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ +#define CHARSPEC_TYPE_CS1 0x01 /* (1/7.2.3) */ +#define CHARSPEC_TYPE_CS2 0x02 /* (1/7.2.4) */ +#define CHARSPEC_TYPE_CS3 0x03 /* (1/7.2.5) */ +#define CHARSPEC_TYPE_CS4 0x04 /* (1/7.2.6) */ +#define CHARSPEC_TYPE_CS5 0x05 /* (1/7.2.7) */ +#define CHARSPEC_TYPE_CS6 0x06 /* (1/7.2.8) */ +#define CHARSPEC_TYPE_CS7 0x07 /* (1/7.2.9) */ +#define CHARSPEC_TYPE_CS8 0x08 /* (1/7.2.10) */ + +typedef uint8_t dstring; + +/* Timestamp (ECMA 167r3 1/7.3) */ +struct timestamp { + __le16 typeAndTimezone; + __le16 year; + uint8_t month; + uint8_t day; + uint8_t hour; + uint8_t minute; + uint8_t second; + uint8_t centiseconds; + uint8_t hundredsOfMicroseconds; + uint8_t microseconds; +} __attribute__ ((packed)); + +/* Type and Time Zone (ECMA 167r3 1/7.3.1) */ +#define TIMESTAMP_TYPE_MASK 0xF000 +#define TIMESTAMP_TYPE_CUT 0x0000 +#define TIMESTAMP_TYPE_LOCAL 0x1000 +#define TIMESTAMP_TYPE_AGREEMENT 0x2000 +#define TIMESTAMP_TIMEZONE_MASK 0x0FFF + +/* Entity identifier (ECMA 167r3 1/7.4) */ +struct regid { + uint8_t flags; + uint8_t ident[23]; + uint8_t identSuffix[8]; +} __attribute__ ((packed)); + +/* Flags (ECMA 167r3 1/7.4.1) */ +#define ENTITYID_FLAGS_DIRTY 0x00 +#define ENTITYID_FLAGS_PROTECTED 0x01 + +/* Volume Structure Descriptor (ECMA 167r3 2/9.1) */ +#define VSD_STD_ID_LEN 5 +struct volStructDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t structData[2041]; +} __attribute__ ((packed)); + +/* Standard Identifier (EMCA 167r2 2/9.1.2) */ +#define VSD_STD_ID_NSR02 "NSR02" /* (3/9.1) */ + +/* Standard Identifier (ECMA 167r3 2/9.1.2) */ +#define VSD_STD_ID_BEA01 "BEA01" /* (2/9.2) */ +#define VSD_STD_ID_BOOT2 "BOOT2" /* (2/9.4) */ +#define VSD_STD_ID_CD001 "CD001" /* (ECMA-119) */ +#define VSD_STD_ID_CDW02 "CDW02" /* (ECMA-168) */ +#define VSD_STD_ID_NSR03 "NSR03" /* (3/9.1) */ +#define VSD_STD_ID_TEA01 "TEA01" /* (2/9.3) */ + +/* Beginning Extended Area Descriptor (ECMA 167r3 2/9.2) */ +struct beginningExtendedAreaDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t structData[2041]; +} __attribute__ ((packed)); + +/* Terminating Extended Area Descriptor (ECMA 167r3 2/9.3) */ +struct terminatingExtendedAreaDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t structData[2041]; +} __attribute__ ((packed)); + +/* Boot Descriptor (ECMA 167r3 2/9.4) */ +struct bootDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t reserved1; + struct regid archType; + struct regid bootIdent; + __le32 bootExtLocation; + __le32 bootExtLength; + __le64 loadAddress; + __le64 startAddress; + struct timestamp descCreationDateAndTime; + __le16 flags; + uint8_t reserved2[32]; + uint8_t bootUse[1906]; +} __attribute__ ((packed)); + +/* Flags (ECMA 167r3 2/9.4.12) */ +#define BOOT_FLAGS_ERASE 0x01 + +/* Extent Descriptor (ECMA 167r3 3/7.1) */ +struct extent_ad { + __le32 extLength; + __le32 extLocation; +} __attribute__ ((packed)); + +struct kernel_extent_ad { + uint32_t extLength; + uint32_t extLocation; +}; + +/* Descriptor Tag (ECMA 167r3 3/7.2) */ +struct tag { + __le16 tagIdent; + __le16 descVersion; + uint8_t tagChecksum; 
+ uint8_t reserved; + __le16 tagSerialNum; + __le16 descCRC; + __le16 descCRCLength; + __le32 tagLocation; +} __attribute__ ((packed)); + +/* Tag Identifier (ECMA 167r3 3/7.2.1) */ +#define TAG_IDENT_PVD 0x0001 +#define TAG_IDENT_AVDP 0x0002 +#define TAG_IDENT_VDP 0x0003 +#define TAG_IDENT_IUVD 0x0004 +#define TAG_IDENT_PD 0x0005 +#define TAG_IDENT_LVD 0x0006 +#define TAG_IDENT_USD 0x0007 +#define TAG_IDENT_TD 0x0008 +#define TAG_IDENT_LVID 0x0009 + +/* NSR Descriptor (ECMA 167r3 3/9.1) */ +struct NSRDesc { + uint8_t structType; + uint8_t stdIdent[VSD_STD_ID_LEN]; + uint8_t structVersion; + uint8_t reserved; + uint8_t structData[2040]; +} __attribute__ ((packed)); + +/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ +struct primaryVolDesc { + struct tag descTag; + __le32 volDescSeqNum; + __le32 primaryVolDescNum; + dstring volIdent[32]; + __le16 volSeqNum; + __le16 maxVolSeqNum; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + dstring volSetIdent[128]; + struct charspec descCharSet; + struct charspec explanatoryCharSet; + struct extent_ad volAbstract; + struct extent_ad volCopyright; + struct regid appIdent; + struct timestamp recordingDateAndTime; + struct regid impIdent; + uint8_t impUse[64]; + __le32 predecessorVolDescSeqLocation; + __le16 flags; + uint8_t reserved[22]; +} __attribute__ ((packed)); + +/* Flags (ECMA 167r3 3/10.1.21) */ +#define PVD_FLAGS_VSID_COMMON 0x0001 + +/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ +struct anchorVolDescPtr { + struct tag descTag; + struct extent_ad mainVolDescSeqExt; + struct extent_ad reserveVolDescSeqExt; + uint8_t reserved[480]; +} __attribute__ ((packed)); + +/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ +struct volDescPtr { + struct tag descTag; + __le32 volDescSeqNum; + struct extent_ad nextVolDescSeqExt; + uint8_t reserved[484]; +} __attribute__ ((packed)); + +/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ +struct impUseVolDesc { + struct tag descTag; + __le32 volDescSeqNum; + struct regid impIdent; + uint8_t impUse[460]; +} __attribute__ ((packed)); + +/* Partition Descriptor (ECMA 167r3 3/10.5) */ +struct partitionDesc { + struct tag descTag; + __le32 volDescSeqNum; + __le16 partitionFlags; + __le16 partitionNumber; + struct regid partitionContents; + uint8_t partitionContentsUse[128]; + __le32 accessType; + __le32 partitionStartingLocation; + __le32 partitionLength; + struct regid impIdent; + uint8_t impUse[128]; + uint8_t reserved[156]; +} __attribute__ ((packed)); + +/* Partition Flags (ECMA 167r3 3/10.5.3) */ +#define PD_PARTITION_FLAGS_ALLOC 0x0001 + +/* Partition Contents (ECMA 167r2 3/10.5.3) */ +#define PD_PARTITION_CONTENTS_NSR02 "+NSR02" + +/* Partition Contents (ECMA 167r3 3/10.5.5) */ +#define PD_PARTITION_CONTENTS_FDC01 "+FDC01" +#define PD_PARTITION_CONTENTS_CD001 "+CD001" +#define PD_PARTITION_CONTENTS_CDW02 "+CDW02" +#define PD_PARTITION_CONTENTS_NSR03 "+NSR03" + +/* Access Type (ECMA 167r3 3/10.5.7) */ +#define PD_ACCESS_TYPE_NONE 0x00000000 +#define PD_ACCESS_TYPE_READ_ONLY 0x00000001 +#define PD_ACCESS_TYPE_WRITE_ONCE 0x00000002 +#define PD_ACCESS_TYPE_REWRITABLE 0x00000003 +#define PD_ACCESS_TYPE_OVERWRITABLE 0x00000004 + +/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ +struct logicalVolDesc { + struct tag descTag; + __le32 volDescSeqNum; + struct charspec descCharSet; + dstring logicalVolIdent[128]; + __le32 logicalBlockSize; + struct regid domainIdent; + uint8_t logicalVolContentsUse[16]; + __le32 mapTableLength; + __le32 
numPartitionMaps; + struct regid impIdent; + uint8_t impUse[128]; + struct extent_ad integritySeqExt; + uint8_t partitionMaps[0]; +} __attribute__ ((packed)); + +/* Generic Partition Map (ECMA 167r3 3/10.7.1) */ +struct genericPartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t partitionMapping[0]; +} __attribute__ ((packed)); + +/* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ +#define GP_PARTITION_MAP_TYPE_UNDEF 0x00 +#define GP_PARTIITON_MAP_TYPE_1 0x01 +#define GP_PARTITION_MAP_TYPE_2 0x02 + +/* Type 1 Partition Map (ECMA 167r3 3/10.7.2) */ +struct genericPartitionMap1 { + uint8_t partitionMapType; + uint8_t partitionMapLength; + __le16 volSeqNum; + __le16 partitionNum; +} __attribute__ ((packed)); + +/* Type 2 Partition Map (ECMA 167r3 3/10.7.3) */ +struct genericPartitionMap2 { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t partitionIdent[62]; +} __attribute__ ((packed)); + +/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */ +struct unallocSpaceDesc { + struct tag descTag; + __le32 volDescSeqNum; + __le32 numAllocDescs; + struct extent_ad allocDescs[0]; +} __attribute__ ((packed)); + +/* Terminating Descriptor (ECMA 167r3 3/10.9) */ +struct terminatingDesc { + struct tag descTag; + uint8_t reserved[496]; +} __attribute__ ((packed)); + +/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */ +struct logicalVolIntegrityDesc { + struct tag descTag; + struct timestamp recordingDateAndTime; + __le32 integrityType; + struct extent_ad nextIntegrityExt; + uint8_t logicalVolContentsUse[32]; + __le32 numOfPartitions; + __le32 lengthOfImpUse; + __le32 freeSpaceTable[0]; + __le32 sizeTable[0]; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Integrity Type (ECMA 167r3 3/10.10.3) */ +#define LVID_INTEGRITY_TYPE_OPEN 0x00000000 +#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001 + +/* Recorded Address (ECMA 167r3 4/7.1) */ +struct lb_addr { + __le32 logicalBlockNum; + __le16 partitionReferenceNum; +} __attribute__ ((packed)); + +/* ... 
and its in-core analog */ +struct kernel_lb_addr { + uint32_t logicalBlockNum; + uint16_t partitionReferenceNum; +}; + +/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ +struct short_ad { + __le32 extLength; + __le32 extPosition; +} __attribute__ ((packed)); + +/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ +struct long_ad { + __le32 extLength; + struct lb_addr extLocation; + uint8_t impUse[6]; +} __attribute__ ((packed)); + +struct kernel_long_ad { + uint32_t extLength; + struct kernel_lb_addr extLocation; + uint8_t impUse[6]; +}; + +/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ +struct ext_ad { + __le32 extLength; + __le32 recordedLength; + __le32 informationLength; + struct lb_addr extLocation; +} __attribute__ ((packed)); + +struct kernel_ext_ad { + uint32_t extLength; + uint32_t recordedLength; + uint32_t informationLength; + struct kernel_lb_addr extLocation; +}; + +/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */ + +/* Tag Identifier (ECMA 167r3 4/7.2.1) */ +#define TAG_IDENT_FSD 0x0100 +#define TAG_IDENT_FID 0x0101 +#define TAG_IDENT_AED 0x0102 +#define TAG_IDENT_IE 0x0103 +#define TAG_IDENT_TE 0x0104 +#define TAG_IDENT_FE 0x0105 +#define TAG_IDENT_EAHD 0x0106 +#define TAG_IDENT_USE 0x0107 +#define TAG_IDENT_SBD 0x0108 +#define TAG_IDENT_PIE 0x0109 +#define TAG_IDENT_EFE 0x010A + +/* File Set Descriptor (ECMA 167r3 4/14.1) */ +struct fileSetDesc { + struct tag descTag; + struct timestamp recordingDateAndTime; + __le16 interchangeLvl; + __le16 maxInterchangeLvl; + __le32 charSetList; + __le32 maxCharSetList; + __le32 fileSetNum; + __le32 fileSetDescNum; + struct charspec logicalVolIdentCharSet; + dstring logicalVolIdent[128]; + struct charspec fileSetCharSet; + dstring fileSetIdent[32]; + dstring copyrightFileIdent[32]; + dstring abstractFileIdent[32]; + struct long_ad rootDirectoryICB; + struct regid domainIdent; + struct long_ad nextExt; + struct long_ad streamDirectoryICB; + uint8_t reserved[32]; +} __attribute__ ((packed)); + +/* Partition Header Descriptor (ECMA 167r3 4/14.3) */ +struct partitionHeaderDesc { + struct short_ad unallocSpaceTable; + struct short_ad unallocSpaceBitmap; + struct short_ad partitionIntegrityTable; + struct short_ad freedSpaceTable; + struct short_ad freedSpaceBitmap; + uint8_t reserved[88]; +} __attribute__ ((packed)); + +/* File Identifier Descriptor (ECMA 167r3 4/14.4) */ +struct fileIdentDesc { + struct tag descTag; + __le16 fileVersionNum; + uint8_t fileCharacteristics; + uint8_t lengthFileIdent; + struct long_ad icb; + __le16 lengthOfImpUse; + uint8_t impUse[0]; + uint8_t fileIdent[0]; + uint8_t padding[0]; +} __attribute__ ((packed)); + +/* File Characteristics (ECMA 167r3 4/14.4.3) */ +#define FID_FILE_CHAR_HIDDEN 0x01 +#define FID_FILE_CHAR_DIRECTORY 0x02 +#define FID_FILE_CHAR_DELETED 0x04 +#define FID_FILE_CHAR_PARENT 0x08 +#define FID_FILE_CHAR_METADATA 0x10 + +/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */ +struct allocExtDesc { + struct tag descTag; + __le32 previousAllocExtLocation; + __le32 lengthAllocDescs; +} __attribute__ ((packed)); + +/* ICB Tag (ECMA 167r3 4/14.6) */ +struct icbtag { + __le32 priorRecordedNumDirectEntries; + __le16 strategyType; + __le16 strategyParameter; + __le16 numEntries; + uint8_t reserved; + uint8_t fileType; + struct lb_addr parentICBLocation; + __le16 flags; +} __attribute__ ((packed)); + +/* Strategy Type (ECMA 167r3 4/14.6.2) */ +#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000 +#define ICBTAG_STRATEGY_TYPE_1 0x0001 +#define ICBTAG_STRATEGY_TYPE_2 0x0002 +#define 
ICBTAG_STRATEGY_TYPE_3 0x0003 +#define ICBTAG_STRATEGY_TYPE_4 0x0004 + +/* File Type (ECMA 167r3 4/14.6.6) */ +#define ICBTAG_FILE_TYPE_UNDEF 0x00 +#define ICBTAG_FILE_TYPE_USE 0x01 +#define ICBTAG_FILE_TYPE_PIE 0x02 +#define ICBTAG_FILE_TYPE_IE 0x03 +#define ICBTAG_FILE_TYPE_DIRECTORY 0x04 +#define ICBTAG_FILE_TYPE_REGULAR 0x05 +#define ICBTAG_FILE_TYPE_BLOCK 0x06 +#define ICBTAG_FILE_TYPE_CHAR 0x07 +#define ICBTAG_FILE_TYPE_EA 0x08 +#define ICBTAG_FILE_TYPE_FIFO 0x09 +#define ICBTAG_FILE_TYPE_SOCKET 0x0A +#define ICBTAG_FILE_TYPE_TE 0x0B +#define ICBTAG_FILE_TYPE_SYMLINK 0x0C +#define ICBTAG_FILE_TYPE_STREAMDIR 0x0D + +/* Flags (ECMA 167r3 4/14.6.8) */ +#define ICBTAG_FLAG_AD_MASK 0x0007 +#define ICBTAG_FLAG_AD_SHORT 0x0000 +#define ICBTAG_FLAG_AD_LONG 0x0001 +#define ICBTAG_FLAG_AD_EXTENDED 0x0002 +#define ICBTAG_FLAG_AD_IN_ICB 0x0003 +#define ICBTAG_FLAG_SORTED 0x0008 +#define ICBTAG_FLAG_NONRELOCATABLE 0x0010 +#define ICBTAG_FLAG_ARCHIVE 0x0020 +#define ICBTAG_FLAG_SETUID 0x0040 +#define ICBTAG_FLAG_SETGID 0x0080 +#define ICBTAG_FLAG_STICKY 0x0100 +#define ICBTAG_FLAG_CONTIGUOUS 0x0200 +#define ICBTAG_FLAG_SYSTEM 0x0400 +#define ICBTAG_FLAG_TRANSFORMED 0x0800 +#define ICBTAG_FLAG_MULTIVERSIONS 0x1000 +#define ICBTAG_FLAG_STREAM 0x2000 + +/* Indirect Entry (ECMA 167r3 4/14.7) */ +struct indirectEntry { + struct tag descTag; + struct icbtag icbTag; + struct long_ad indirectICB; +} __attribute__ ((packed)); + +/* Terminal Entry (ECMA 167r3 4/14.8) */ +struct terminalEntry { + struct tag descTag; + struct icbtag icbTag; +} __attribute__ ((packed)); + +/* File Entry (ECMA 167r3 4/14.9) */ +struct fileEntry { + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp attrTime; + __le32 checkpoint; + struct long_ad extendedAttrICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; +} __attribute__ ((packed)); + +/* Permissions (ECMA 167r3 4/14.9.5) */ +#define FE_PERM_O_EXEC 0x00000001U +#define FE_PERM_O_WRITE 0x00000002U +#define FE_PERM_O_READ 0x00000004U +#define FE_PERM_O_CHATTR 0x00000008U +#define FE_PERM_O_DELETE 0x00000010U +#define FE_PERM_G_EXEC 0x00000020U +#define FE_PERM_G_WRITE 0x00000040U +#define FE_PERM_G_READ 0x00000080U +#define FE_PERM_G_CHATTR 0x00000100U +#define FE_PERM_G_DELETE 0x00000200U +#define FE_PERM_U_EXEC 0x00000400U +#define FE_PERM_U_WRITE 0x00000800U +#define FE_PERM_U_READ 0x00001000U +#define FE_PERM_U_CHATTR 0x00002000U +#define FE_PERM_U_DELETE 0x00004000U + +/* Record Format (ECMA 167r3 4/14.9.7) */ +#define FE_RECORD_FMT_UNDEF 0x00 +#define FE_RECORD_FMT_FIXED_PAD 0x01 +#define FE_RECORD_FMT_FIXED 0x02 +#define FE_RECORD_FMT_VARIABLE8 0x03 +#define FE_RECORD_FMT_VARIABLE16 0x04 +#define FE_RECORD_FMT_VARIABLE16_MSB 0x05 +#define FE_RECORD_FMT_VARIABLE32 0x06 +#define FE_RECORD_FMT_PRINT 0x07 +#define FE_RECORD_FMT_LF 0x08 +#define FE_RECORD_FMT_CR 0x09 +#define FE_RECORD_FMT_CRLF 0x0A +#define FE_RECORD_FMT_LFCR 0x0B + +/* Record Display Attributes (ECMA 167r3 4/14.9.8) */ +#define FE_RECORD_DISPLAY_ATTR_UNDEF 0x00 +#define FE_RECORD_DISPLAY_ATTR_1 0x01 +#define FE_RECORD_DISPLAY_ATTR_2 0x02 +#define FE_RECORD_DISPLAY_ATTR_3 0x03 + +/* Extended Attribute Header Descriptor 
(ECMA 167r3 4/14.10.1) */ +struct extendedAttrHeaderDesc { + struct tag descTag; + __le32 impAttrLocation; + __le32 appAttrLocation; +} __attribute__ ((packed)); + +/* Generic Format (ECMA 167r3 4/14.10.2) */ +struct genericFormat { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + uint8_t attrData[0]; +} __attribute__ ((packed)); + +/* Character Set Information (ECMA 167r3 4/14.10.3) */ +struct charSetInfo { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 escapeSeqLength; + uint8_t charSetType; + uint8_t escapeSeq[0]; +} __attribute__ ((packed)); + +/* Alternate Permissions (ECMA 167r3 4/14.10.4) */ +struct altPerms { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le16 ownerIdent; + __le16 groupIdent; + __le16 permission; +} __attribute__ ((packed)); + +/* File Times Extended Attribute (ECMA 167r3 4/14.10.5) */ +struct fileTimesExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 dataLength; + __le32 fileTimeExistence; + uint8_t fileTimes; +} __attribute__ ((packed)); + +/* FileTimeExistence (ECMA 167r3 4/14.10.5.6) */ +#define FTE_CREATION 0x00000001 +#define FTE_DELETION 0x00000004 +#define FTE_EFFECTIVE 0x00000008 +#define FTE_BACKUP 0x00000002 + +/* Information Times Extended Attribute (ECMA 167r3 4/14.10.6) */ +struct infoTimesExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 dataLength; + __le32 infoTimeExistence; + uint8_t infoTimes[0]; +} __attribute__ ((packed)); + +/* Device Specification (ECMA 167r3 4/14.10.7) */ +struct deviceSpec { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 impUseLength; + __le32 majorDeviceIdent; + __le32 minorDeviceIdent; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */ +struct impUseExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 impUseLength; + struct regid impIdent; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */ +struct appUseExtAttr { + __le32 attrType; + uint8_t attrSubtype; + uint8_t reserved[3]; + __le32 attrLength; + __le32 appUseLength; + struct regid appIdent; + uint8_t appUse[0]; +} __attribute__ ((packed)); + +#define EXTATTR_CHAR_SET 1 +#define EXTATTR_ALT_PERMS 3 +#define EXTATTR_FILE_TIMES 5 +#define EXTATTR_INFO_TIMES 6 +#define EXTATTR_DEV_SPEC 12 +#define EXTATTR_IMP_USE 2048 +#define EXTATTR_APP_USE 65536 + +/* Unallocated Space Entry (ECMA 167r3 4/14.11) */ +struct unallocSpaceEntry { + struct tag descTag; + struct icbtag icbTag; + __le32 lengthAllocDescs; + uint8_t allocDescs[0]; +} __attribute__ ((packed)); + +/* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ +struct spaceBitmapDesc { + struct tag descTag; + __le32 numOfBits; + __le32 numOfBytes; + uint8_t bitmap[0]; +} __attribute__ ((packed)); + +/* Partition Integrity Entry (ECMA 167r3 4/14.13) */ +struct partitionIntegrityEntry { + struct tag descTag; + struct icbtag icbTag; + struct timestamp recordingDateAndTime; + uint8_t integrityType; + uint8_t reserved[175]; + struct regid impIdent; + uint8_t impUse[256]; +} __attribute__ ((packed)); + +/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ + +/* Extent Length (ECMA 167r3 4/14.14.1.1) */ +#define EXT_RECORDED_ALLOCATED 0x00000000 +#define 
EXT_NOT_RECORDED_ALLOCATED 0x40000000 +#define EXT_NOT_RECORDED_NOT_ALLOCATED 0x80000000 +#define EXT_NEXT_EXTENT_ALLOCDECS 0xC0000000 + +/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ + +/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ + +/* Logical Volume Header Descriptor (ECMA 167r3 4/14.15) */ +struct logicalVolHeaderDesc { + __le64 uniqueID; + uint8_t reserved[24]; +} __attribute__ ((packed)); + +/* Path Component (ECMA 167r3 4/14.16.1) */ +struct pathComponent { + uint8_t componentType; + uint8_t lengthComponentIdent; + __le16 componentFileVersionNum; + dstring componentIdent[0]; +} __attribute__ ((packed)); + +/* File Entry (ECMA 167r3 4/14.17) */ +struct extendedFileEntry { + struct tag descTag; + struct icbtag icbTag; + __le32 uid; + __le32 gid; + __le32 permissions; + __le16 fileLinkCount; + uint8_t recordFormat; + uint8_t recordDisplayAttr; + __le32 recordLength; + __le64 informationLength; + __le64 objectSize; + __le64 logicalBlocksRecorded; + struct timestamp accessTime; + struct timestamp modificationTime; + struct timestamp createTime; + struct timestamp attrTime; + __le32 checkpoint; + __le32 reserved; + struct long_ad extendedAttrICB; + struct long_ad streamDirectoryICB; + struct regid impIdent; + __le64 uniqueID; + __le32 lengthExtendedAttr; + __le32 lengthAllocDescs; + uint8_t extendedAttr[0]; + uint8_t allocDescs[0]; +} __attribute__ ((packed)); + +#endif /* _ECMA_167_H */ diff --git a/fs/udf/file.c b/fs/udf/file.c new file mode 100644 index 00000000..3438b000 --- /dev/null +++ b/fs/udf/file.c @@ -0,0 +1,249 @@ +/* + * file.c + * + * PURPOSE + * File handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-1999 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 10/02/98 dgb Attempt to integrate into udf.o + * 10/07/98 Switched to using generic_readpage, etc., like isofs + * And it works! + * 12/06/98 blf Added udf_file_read. uses generic_file_read for all cases but + * ICBTAG_FLAG_AD_IN_ICB. 
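 *              (ICBTAG_FLAG_AD_IN_ICB marks files whose data is stored inline
 *              in the file entry ICB itself; udf_adinicb_readpage() and
 *              udf_adinicb_writepage() below copy that inline area to and
 *              from the page cache.)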
+ * 04/06/99 64 bit file handling on 32 bit systems taken from ext2 file.c + * 05/12/99 Preliminary file write support + */ + +#include "udfdecl.h" +#include +#include +#include +#include /* memset */ +#include +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +static int udf_adinicb_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + + BUG_ON(!PageLocked(page)); + + kaddr = kmap(page); + memset(kaddr, 0, PAGE_CACHE_SIZE); + memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + + return 0; +} + +static int udf_adinicb_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + + BUG_ON(!PageLocked(page)); + + kaddr = kmap(page); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); + mark_inode_dirty(inode); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + + return 0; +} + +static int udf_adinicb_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, + kaddr + offset, copied); + kunmap_atomic(kaddr, KM_USER0); + + return simple_write_end(file, mapping, pos, len, copied, page, fsdata); +} + +const struct address_space_operations udf_adinicb_aops = { + .readpage = udf_adinicb_readpage, + .writepage = udf_adinicb_writepage, + .write_begin = simple_write_begin, + .write_end = udf_adinicb_write_end, +}; + +static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t ppos) +{ + ssize_t retval; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode; + int err, pos; + size_t count = iocb->ki_left; + struct udf_inode_info *iinfo = UDF_I(inode); + + down_write(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + if (file->f_flags & O_APPEND) + pos = inode->i_size; + else + pos = ppos; + + if (inode->i_sb->s_blocksize < + (udf_file_entry_alloc_offset(inode) + + pos + count)) { + err = udf_expand_file_adinicb(inode); + if (err) { + udf_debug("udf_expand_adinicb: err=%d\n", err); + return err; + } + } else { + if (pos + count > inode->i_size) + iinfo->i_lenAlloc = pos + count; + else + iinfo->i_lenAlloc = inode->i_size; + up_write(&iinfo->i_data_sem); + } + } else + up_write(&iinfo->i_data_sem); + + retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); + if (retval > 0) + mark_inode_dirty(inode); + + return retval; +} + +long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + long old_block, new_block; + int result = -EINVAL; + + if (file_permission(filp, MAY_READ) != 0) { + udf_debug("no permission to access inode %lu\n", inode->i_ino); + result = -EPERM; + goto out; + } + + if (!arg) { + udf_debug("invalid argument to udf_ioctl\n"); + result = -EINVAL; + goto out; + } + + switch (cmd) { + case UDF_GETVOLIDENT: + if (copy_to_user((char __user *)arg, + UDF_SB(inode->i_sb)->s_volume_ident, 32)) + 
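/*
 * UDF_GETVOLIDENT hands the 32-byte volume identifier cached in the
 * superblock (s_volume_ident) back to user space. A hypothetical caller
 * might do something like
 *     char ident[32];
 *     ioctl(fd, UDF_GETVOLIDENT, ident);
 * (sketch only; the request codes come from the UDF ioctl headers).
 */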
result = -EFAULT; + else + result = 0; + goto out; + case UDF_RELOCATE_BLOCKS: + if (!capable(CAP_SYS_ADMIN)) { + result = -EACCES; + goto out; + } + if (get_user(old_block, (long __user *)arg)) { + result = -EFAULT; + goto out; + } + result = udf_relocate_blocks(inode->i_sb, + old_block, &new_block); + if (result == 0) + result = put_user(new_block, (long __user *)arg); + goto out; + case UDF_GETEASIZE: + result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); + goto out; + case UDF_GETEABLOCK: + result = copy_to_user((char __user *)arg, + UDF_I(inode)->i_ext.i_data, + UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; + goto out; + } + +out: + return result; +} + +static int udf_release_file(struct inode *inode, struct file *filp) +{ + if (filp->f_mode & FMODE_WRITE) { + down_write(&UDF_I(inode)->i_data_sem); + udf_discard_prealloc(inode); + udf_truncate_tail_extent(inode); + up_write(&UDF_I(inode)->i_data_sem); + } + return 0; +} + +const struct file_operations udf_file_operations = { + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .unlocked_ioctl = udf_ioctl, + .open = generic_file_open, + .mmap = generic_file_mmap, + .write = do_sync_write, + .aio_write = udf_file_aio_write, + .release = udf_release_file, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, + .llseek = generic_file_llseek, +}; + +static int udf_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = udf_setsize(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations udf_file_inode_operations = { + .setattr = udf_setattr, +}; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c new file mode 100644 index 00000000..6fb7e0ad --- /dev/null +++ b/fs/udf/ialloc.c @@ -0,0 +1,132 @@ +/* + * ialloc.c + * + * PURPOSE + * Inode allocation handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2001 Ben Fennema + * + * HISTORY + * + * 02/24/99 blf Created. 
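 *            (udf_new_inode() below reserves a block for the new file entry
 *            with udf_new_block() and updates the numFiles/numDirs counters
 *            kept in the logical volume integrity descriptor.)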
+ * + */ + +#include "udfdecl.h" +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +void udf_free_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + mutex_lock(&sbi->s_alloc_mutex); + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDescImpUse *lvidiu = + udf_sb_lvidiu(sbi); + if (S_ISDIR(inode->i_mode)) + le32_add_cpu(&lvidiu->numDirs, -1); + else + le32_add_cpu(&lvidiu->numFiles, -1); + udf_updated_lvid(sb); + } + mutex_unlock(&sbi->s_alloc_mutex); + + udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); +} + +struct inode *udf_new_inode(struct inode *dir, int mode, int *err) +{ + struct super_block *sb = dir->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + struct inode *inode; + int block; + uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; + struct udf_inode_info *iinfo; + struct udf_inode_info *dinfo = UDF_I(dir); + + inode = new_inode(sb); + + if (!inode) { + *err = -ENOMEM; + return NULL; + } + *err = -ENOSPC; + + iinfo = UDF_I(inode); + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { + iinfo->i_efe = 1; + if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) + sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry), + GFP_KERNEL); + } else { + iinfo->i_efe = 0; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct fileEntry), + GFP_KERNEL); + } + if (!iinfo->i_ext.i_data) { + iput(inode); + *err = -ENOMEM; + return NULL; + } + + block = udf_new_block(dir->i_sb, NULL, + dinfo->i_location.partitionReferenceNum, + start, err); + if (*err) { + iput(inode); + return NULL; + } + + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDescImpUse *lvidiu; + + iinfo->i_unique = lvid_get_unique_id(sb); + mutex_lock(&sbi->s_alloc_mutex); + lvidiu = udf_sb_lvidiu(sbi); + if (S_ISDIR(mode)) + le32_add_cpu(&lvidiu->numDirs, 1); + else + le32_add_cpu(&lvidiu->numFiles, 1); + udf_updated_lvid(sb); + mutex_unlock(&sbi->s_alloc_mutex); + } + + inode_init_owner(inode, dir, mode); + + iinfo->i_location.logicalBlockNum = block; + iinfo->i_location.partitionReferenceNum = + dinfo->i_location.partitionReferenceNum; + inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0); + inode->i_blocks = 0; + iinfo->i_lenEAttr = 0; + iinfo->i_lenAlloc = 0; + iinfo->i_use = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; + else + iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + inode->i_mtime = inode->i_atime = inode->i_ctime = + iinfo->i_crtime = current_fs_time(inode->i_sb); + insert_inode_hash(inode); + mark_inode_dirty(inode); + + *err = 0; + return inode; +} diff --git a/fs/udf/inode.c b/fs/udf/inode.c new file mode 100644 index 00000000..262050f2 --- /dev/null +++ b/fs/udf/inode.c @@ -0,0 +1,2166 @@ +/* + * inode.c + * + * PURPOSE + * Inode handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. 
+ * + * (C) 1998 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 10/04/98 dgb Added rudimentary directory functions + * 10/07/98 Fully working udf_block_map! It works! + * 11/25/98 bmap altered to better support extents + * 12/06/98 blf partition support in udf_iget, udf_block_map + * and udf_read_inode + * 12/12/98 rewrote udf_block_map to handle next extents and descs across + * block boundaries (which is not actually allowed) + * 12/20/98 added support for strategy 4096 + * 03/07/99 rewrote udf_block_map (again) + * New funcs, inode_bmap, udf_next_aext + * 04/19/99 Support for writing device EA's for major/minor # + */ + +#include "udfdecl.h" +#include +#include +#include +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +MODULE_AUTHOR("Ben Fennema"); +MODULE_DESCRIPTION("Universal Disk Format Filesystem"); +MODULE_LICENSE("GPL"); + +#define EXTENT_MERGE_SIZE 5 + +static mode_t udf_convert_permissions(struct fileEntry *); +static int udf_update_inode(struct inode *, int); +static void udf_fill_inode(struct inode *, struct buffer_head *); +static int udf_sync_inode(struct inode *inode); +static int udf_alloc_i_data(struct inode *inode, size_t size); +static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, + sector_t *, int *); +static int8_t udf_insert_aext(struct inode *, struct extent_position, + struct kernel_lb_addr, uint32_t); +static void udf_split_extents(struct inode *, int *, int, int, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); +static void udf_prealloc_extents(struct inode *, int, int, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); +static void udf_merge_extents(struct inode *, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int *); +static void udf_update_extents(struct inode *, + struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int, + struct extent_position *); +static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); + + +void udf_evict_inode(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) { + want_delete = 1; + udf_setsize(inode, 0); + udf_update_inode(inode, IS_SYNC(inode)); + } else + truncate_inode_pages(&inode->i_data, 0); + invalidate_inode_buffers(inode); + end_writeback(inode); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && + inode->i_size != iinfo->i_lenExtents) { + printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " + "inode size %llu different from extent length %llu. 
" + "Filesystem need not be standards compliant.\n", + inode->i_sb->s_id, inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); + } + kfree(iinfo->i_ext.i_data); + iinfo->i_ext.i_data = NULL; + if (want_delete) { + udf_free_inode(inode); + } +} + +static int udf_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, udf_get_block, wbc); +} + +static int udf_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, udf_get_block); +} + +static int udf_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); + if (unlikely(ret)) { + struct inode *inode = mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + loff_t isize = inode->i_size; + + if (pos + len > isize) { + truncate_pagecache(inode, pos + len, isize); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); + } + } + } + + return ret; +} + +static sector_t udf_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, udf_get_block); +} + +const struct address_space_operations udf_aops = { + .readpage = udf_readpage, + .writepage = udf_writepage, + .write_begin = udf_write_begin, + .write_end = generic_write_end, + .bmap = udf_bmap, +}; + +/* + * Expand file stored in ICB to a normal one-block-file + * + * This function requires i_data_sem for writing and releases it. + * This function requires i_mutex held + */ +int udf_expand_file_adinicb(struct inode *inode) +{ + struct page *page; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + int err; + struct writeback_control udf_wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + }; + + if (!iinfo->i_lenAlloc) { + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; + else + iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); + mark_inode_dirty(inode); + return 0; + } + /* + * Release i_data_sem so that we can lock a page - page lock ranks + * above i_data_sem. i_mutex still protects us against file changes. + */ + up_write(&iinfo->i_data_sem); + + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); + if (!page) + return -ENOMEM; + + if (!PageUptodate(page)) { + kaddr = kmap(page); + memset(kaddr + iinfo->i_lenAlloc, 0x00, + PAGE_CACHE_SIZE - iinfo->i_lenAlloc); + memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, + iinfo->i_lenAlloc); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap(page); + } + down_write(&iinfo->i_data_sem); + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, + iinfo->i_lenAlloc); + iinfo->i_lenAlloc = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; + else + iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); + err = inode->i_data.a_ops->writepage(page, &udf_wbc); + if (err) { + /* Restore everything back so that we don't lose data... 
*/ + lock_page(page); + kaddr = kmap(page); + down_write(&iinfo->i_data_sem); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, + inode->i_size); + kunmap(page); + unlock_page(page); + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + inode->i_data.a_ops = &udf_adinicb_aops; + up_write(&iinfo->i_data_sem); + } + page_cache_release(page); + mark_inode_dirty(inode); + + return err; +} + +struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, + int *err) +{ + int newblock; + struct buffer_head *dbh = NULL; + struct kernel_lb_addr eloc; + uint8_t alloctype; + struct extent_position epos; + + struct udf_fileident_bh sfibh, dfibh; + loff_t f_pos = udf_ext0_offset(inode); + int size = udf_ext0_offset(inode) + inode->i_size; + struct fileIdentDesc cfi, *sfi, *dfi; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + alloctype = ICBTAG_FLAG_AD_SHORT; + else + alloctype = ICBTAG_FLAG_AD_LONG; + + if (!inode->i_size) { + iinfo->i_alloc_type = alloctype; + mark_inode_dirty(inode); + return NULL; + } + + /* alloc block, and copy data to it */ + *block = udf_new_block(inode->i_sb, inode, + iinfo->i_location.partitionReferenceNum, + iinfo->i_location.logicalBlockNum, err); + if (!(*block)) + return NULL; + newblock = udf_get_pblock(inode->i_sb, *block, + iinfo->i_location.partitionReferenceNum, + 0); + if (!newblock) + return NULL; + dbh = udf_tgetblk(inode->i_sb, newblock); + if (!dbh) + return NULL; + lock_buffer(dbh); + memset(dbh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(dbh); + unlock_buffer(dbh); + mark_buffer_dirty_inode(dbh, inode); + + sfibh.soffset = sfibh.eoffset = + f_pos & (inode->i_sb->s_blocksize - 1); + sfibh.sbh = sfibh.ebh = NULL; + dfibh.soffset = dfibh.eoffset = 0; + dfibh.sbh = dfibh.ebh = dbh; + while (f_pos < size) { + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, + NULL, NULL, NULL); + if (!sfi) { + brelse(dbh); + return NULL; + } + iinfo->i_alloc_type = alloctype; + sfi->descTag.tagLocation = cpu_to_le32(*block); + dfibh.soffset = dfibh.eoffset; + dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); + dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); + if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, + sfi->fileIdent + + le16_to_cpu(sfi->lengthOfImpUse))) { + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + brelse(dbh); + return NULL; + } + } + mark_buffer_dirty_inode(dbh, inode); + + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0, + iinfo->i_lenAlloc); + iinfo->i_lenAlloc = 0; + eloc.logicalBlockNum = *block; + eloc.partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + iinfo->i_lenExtents = inode->i_size; + epos.bh = NULL; + epos.block = iinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(inode); + udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); + /* UniqueID stuff */ + + brelse(epos.bh); + mark_inode_dirty(inode); + return dbh; +} + +static int udf_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int err, new; + struct buffer_head *bh; + sector_t phys = 0; + struct udf_inode_info *iinfo; + + if (!create) { + phys = udf_block_map(inode, block); + if (phys) + map_bh(bh_result, inode->i_sb, phys); + return 0; + } + + err = -EIO; + new = 0; + bh = NULL; + iinfo = UDF_I(inode); + + down_write(&iinfo->i_data_sem); + if (block == iinfo->i_next_alloc_block + 1) { + iinfo->i_next_alloc_block++; + iinfo->i_next_alloc_goal++; + } + + 
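/*
 * Sequential allocation hint: when this block directly follows the last one
 * we mapped, advance i_next_alloc_block/i_next_alloc_goal so inode_getblk()
 * below tries to keep the file physically contiguous.
 */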
err = 0; + + bh = inode_getblk(inode, block, &err, &phys, &new); + BUG_ON(bh); + if (err) + goto abort; + BUG_ON(!phys); + + if (new) + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, phys); + +abort: + up_write(&iinfo->i_data_sem); + return err; +} + +static struct buffer_head *udf_getblk(struct inode *inode, long block, + int create, int *err) +{ + struct buffer_head *bh; + struct buffer_head dummy; + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + *err = udf_get_block(inode, block, &dummy, create); + if (!*err && buffer_mapped(&dummy)) { + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (buffer_new(&dummy)) { + lock_buffer(bh); + memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + } + return bh; + } + + return NULL; +} + +/* Extend the file by 'blocks' blocks, return the number of extents added */ +static int udf_do_extend_file(struct inode *inode, + struct extent_position *last_pos, + struct kernel_long_ad *last_ext, + sector_t blocks) +{ + sector_t add; + int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); + struct super_block *sb = inode->i_sb; + struct kernel_lb_addr prealloc_loc = {}; + int prealloc_len = 0; + struct udf_inode_info *iinfo; + int err; + + /* The previous extent is fake and we should not extend by anything + * - there's nothing to do... */ + if (!blocks && fake) + return 0; + + iinfo = UDF_I(inode); + /* Round the last extent up to a multiple of block size */ + if (last_ext->extLength & (sb->s_blocksize - 1)) { + last_ext->extLength = + (last_ext->extLength & UDF_EXTENT_FLAG_MASK) | + (((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) + + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1)); + iinfo->i_lenExtents = + (iinfo->i_lenExtents + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + } + + /* Last extent are just preallocated blocks? */ + if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == + EXT_NOT_RECORDED_ALLOCATED) { + /* Save the extent so that we can reattach it to the end */ + prealloc_loc = last_ext->extLocation; + prealloc_len = last_ext->extLength; + /* Mark the extent as a hole */ + last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); + last_ext->extLocation.logicalBlockNum = 0; + last_ext->extLocation.partitionReferenceNum = 0; + } + + /* Can we merge with the previous extent? */ + if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == + EXT_NOT_RECORDED_NOT_ALLOCATED) { + add = ((1 << 30) - sb->s_blocksize - + (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) >> + sb->s_blocksize_bits; + if (add > blocks) + add = blocks; + blocks -= add; + last_ext->extLength += add << sb->s_blocksize_bits; + } + + if (fake) { + udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + count++; + } else + udf_write_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + + /* Managed to do everything necessary? 
*/ + if (!blocks) + goto out; + + /* All further extents will be NOT_RECORDED_NOT_ALLOCATED */ + last_ext->extLocation.logicalBlockNum = 0; + last_ext->extLocation.partitionReferenceNum = 0; + add = (1 << (30-sb->s_blocksize_bits)) - 1; + last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + (add << sb->s_blocksize_bits); + + /* Create enough extents to cover the whole hole */ + while (blocks > add) { + blocks -= add; + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; + count++; + } + if (blocks) { + last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + (blocks << sb->s_blocksize_bits); + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; + count++; + } + +out: + /* Do we have some preallocated blocks saved? */ + if (prealloc_len) { + err = udf_add_aext(inode, last_pos, &prealloc_loc, + prealloc_len, 1); + if (err) + return err; + last_ext->extLocation = prealloc_loc; + last_ext->extLength = prealloc_len; + count++; + } + + /* last_pos should point to the last written extent... */ + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + last_pos->offset -= sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + last_pos->offset -= sizeof(struct long_ad); + else + return -EIO; + + return count; +} + +static int udf_extend_file(struct inode *inode, loff_t newsize) +{ + + struct extent_position epos; + struct kernel_lb_addr eloc; + uint32_t elen; + int8_t etype; + struct super_block *sb = inode->i_sb; + sector_t first_block = newsize >> sb->s_blocksize_bits, offset; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + struct kernel_long_ad extent; + int err; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); + + /* File has extent covering the new size (could happen when extending + * inside a block)? */ + if (etype != -1) + return 0; + if (newsize & (sb->s_blocksize - 1)) + offset++; + /* Extended file just to the boundary of the last file block? */ + if (offset == 0) + return 0; + + /* Truncate is extending the file by 'offset' blocks */ + if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || + (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { + /* File has no extents at all or has empty last + * indirect extent! Create a fake extent... 
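	 * (a zero-length EXT_NOT_RECORDED_NOT_ALLOCATED descriptor that
	 * udf_do_extend_file() can then grow to cover the requested size)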
*/ + extent.extLocation.logicalBlockNum = 0; + extent.extLocation.partitionReferenceNum = 0; + extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; + } else { + epos.offset -= adsize; + etype = udf_next_aext(inode, &epos, &extent.extLocation, + &extent.extLength, 0); + extent.extLength |= etype << 30; + } + err = udf_do_extend_file(inode, &epos, &extent, offset); + if (err < 0) + goto out; + err = 0; + iinfo->i_lenExtents = newsize; +out: + brelse(epos.bh); + return err; +} + +static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, + int *err, sector_t *phys, int *new) +{ + static sector_t last_block; + struct buffer_head *result = NULL; + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; + struct extent_position prev_epos, cur_epos, next_epos; + int count = 0, startnum = 0, endnum = 0; + uint32_t elen = 0, tmpelen; + struct kernel_lb_addr eloc, tmpeloc; + int c = 1; + loff_t lbcount = 0, b_off = 0; + uint32_t newblocknum, newblock; + sector_t offset = 0; + int8_t etype; + struct udf_inode_info *iinfo = UDF_I(inode); + int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; + int lastblock = 0; + + prev_epos.offset = udf_file_entry_alloc_offset(inode); + prev_epos.block = iinfo->i_location; + prev_epos.bh = NULL; + cur_epos = next_epos = prev_epos; + b_off = (loff_t)block << inode->i_sb->s_blocksize_bits; + + /* find the extent which contains the block we are looking for. + alternate between laarr[0] and laarr[1] for locations of the + current extent, and the previous extent */ + do { + if (prev_epos.bh != cur_epos.bh) { + brelse(prev_epos.bh); + get_bh(cur_epos.bh); + prev_epos.bh = cur_epos.bh; + } + if (cur_epos.bh != next_epos.bh) { + brelse(cur_epos.bh); + get_bh(next_epos.bh); + cur_epos.bh = next_epos.bh; + } + + lbcount += elen; + + prev_epos.block = cur_epos.block; + cur_epos.block = next_epos.block; + + prev_epos.offset = cur_epos.offset; + cur_epos.offset = next_epos.offset; + + etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 1); + if (etype == -1) + break; + + c = !c; + + laarr[c].extLength = (etype << 30) | elen; + laarr[c].extLocation = eloc; + + if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + pgoal = eloc.logicalBlockNum + + ((elen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + + count++; + } while (lbcount + elen <= b_off); + + b_off -= lbcount; + offset = b_off >> inode->i_sb->s_blocksize_bits; + /* + * Move prev_epos and cur_epos into indirect extent if we are at + * the pointer to it + */ + udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, 0); + udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, 0); + + /* if the extent is allocated and recorded, return the block + if the extent is not a multiple of the blocksize, round up */ + + if (etype == (EXT_RECORDED_ALLOCATED >> 30)) { + if (elen & (inode->i_sb->s_blocksize - 1)) { + elen = EXT_RECORDED_ALLOCATED | + ((elen + inode->i_sb->s_blocksize - 1) & + ~(inode->i_sb->s_blocksize - 1)); + udf_write_aext(inode, &cur_epos, &eloc, elen, 1); + } + brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); + newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + *phys = newblock; + return NULL; + } + + last_block = block; + /* Are we beyond EOF? 
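	   (etype == -1 means the extent walk ran past the last extent, so the
	   hole between the current end of file and the requested block has to
	   be created before a new block can be mapped)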
*/ + if (etype == -1) { + int ret; + + if (count) { + if (c) + laarr[0] = laarr[1]; + startnum = 1; + } else { + /* Create a fake extent when there's not one */ + memset(&laarr[0].extLocation, 0x00, + sizeof(struct kernel_lb_addr)); + laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; + /* Will udf_do_extend_file() create real extent from + a fake one? */ + startnum = (offset > 0); + } + /* Create extents for the hole between EOF and offset */ + ret = udf_do_extend_file(inode, &prev_epos, laarr, offset); + if (ret < 0) { + brelse(prev_epos.bh); + brelse(cur_epos.bh); + brelse(next_epos.bh); + *err = ret; + return NULL; + } + c = 0; + offset = 0; + count += ret; + /* We are not covered by a preallocated extent? */ + if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) != + EXT_NOT_RECORDED_ALLOCATED) { + /* Is there any real extent? - otherwise we overwrite + * the fake one... */ + if (count) + c = !c; + laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + inode->i_sb->s_blocksize; + memset(&laarr[c].extLocation, 0x00, + sizeof(struct kernel_lb_addr)); + count++; + endnum++; + } + endnum = c + 1; + lastblock = 1; + } else { + endnum = startnum = ((count > 2) ? 2 : count); + + /* if the current extent is in position 0, + swap it with the previous */ + if (!c && count != 1) { + laarr[2] = laarr[0]; + laarr[0] = laarr[1]; + laarr[1] = laarr[2]; + c = 1; + } + + /* if the current block is located in an extent, + read the next extent */ + etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 0); + if (etype != -1) { + laarr[c + 1].extLength = (etype << 30) | elen; + laarr[c + 1].extLocation = eloc; + count++; + startnum++; + endnum++; + } else + lastblock = 1; + } + + /* if the current extent is not recorded but allocated, get the + * block in the extent corresponding to the requested block */ + if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) + newblocknum = laarr[c].extLocation.logicalBlockNum + offset; + else { /* otherwise, allocate a new block */ + if (iinfo->i_next_alloc_block == block) + goal = iinfo->i_next_alloc_goal; + + if (!goal) { + if (!(goal = pgoal)) /* XXX: what was intended here? */ + goal = iinfo->i_location.logicalBlockNum + 1; + } + + newblocknum = udf_new_block(inode->i_sb, inode, + iinfo->i_location.partitionReferenceNum, + goal, err); + if (!newblocknum) { + brelse(prev_epos.bh); + *err = -ENOSPC; + return NULL; + } + iinfo->i_lenExtents += inode->i_sb->s_blocksize; + } + + /* if the extent the requsted block is located in contains multiple + * blocks, split the extent into at most three extents. blocks prior + * to requested block, requested block, and blocks after requested + * block */ + udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); + +#ifdef UDF_PREALLOCATE + /* We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. 
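 * (whatever is preallocated here is released again from udf_release_file()
 * via udf_discard_prealloc() when a writer closes the file)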
*/ + if (S_ISREG(inode->i_mode)) + udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); +#endif + + /* merge any continuous blocks in laarr */ + udf_merge_extents(inode, laarr, &endnum); + + /* write back the new extents, inserting new extents if the new number + * of extents is greater than the old number, and deleting extents if + * the new number of extents is less than the old number */ + udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); + + brelse(prev_epos.bh); + + newblock = udf_get_pblock(inode->i_sb, newblocknum, + iinfo->i_location.partitionReferenceNum, 0); + if (!newblock) + return NULL; + *phys = newblock; + *err = 0; + *new = 1; + iinfo->i_next_alloc_block = block; + iinfo->i_next_alloc_goal = newblocknum; + inode->i_ctime = current_fs_time(inode->i_sb); + + if (IS_SYNC(inode)) + udf_sync_inode(inode); + else + mark_inode_dirty(inode); + + return result; +} + +static void udf_split_extents(struct inode *inode, int *c, int offset, + int newblocknum, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int *endnum) +{ + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + + if ((laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30) || + (laarr[*c].extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { + int curr = *c; + int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits; + int8_t etype = (laarr[curr].extLength >> 30); + + if (blen == 1) + ; + else if (!offset || blen == offset + 1) { + laarr[curr + 2] = laarr[curr + 1]; + laarr[curr + 1] = laarr[curr]; + } else { + laarr[curr + 3] = laarr[curr + 1]; + laarr[curr + 2] = laarr[curr + 1] = laarr[curr]; + } + + if (offset) { + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + udf_free_blocks(inode->i_sb, inode, + &laarr[curr].extLocation, + 0, offset); + laarr[curr].extLength = + EXT_NOT_RECORDED_NOT_ALLOCATED | + (offset << blocksize_bits); + laarr[curr].extLocation.logicalBlockNum = 0; + laarr[curr].extLocation. 
+ partitionReferenceNum = 0; + } else + laarr[curr].extLength = (etype << 30) | + (offset << blocksize_bits); + curr++; + (*c)++; + (*endnum)++; + } + + laarr[curr].extLocation.logicalBlockNum = newblocknum; + if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + laarr[curr].extLocation.partitionReferenceNum = + UDF_I(inode)->i_location.partitionReferenceNum; + laarr[curr].extLength = EXT_RECORDED_ALLOCATED | + blocksize; + curr++; + + if (blen != offset + 1) { + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) + laarr[curr].extLocation.logicalBlockNum += + offset + 1; + laarr[curr].extLength = (etype << 30) | + ((blen - (offset + 1)) << blocksize_bits); + curr++; + (*endnum)++; + } + } +} + +static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int *endnum) +{ + int start, length = 0, currlength = 0, i; + + if (*endnum >= (c + 1)) { + if (!lastblock) + return; + else + start = c; + } else { + if ((laarr[c + 1].extLength >> 30) == + (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + start = c + 1; + length = currlength = + (((laarr[c + 1].extLength & + UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + } else + start = c; + } + + for (i = start + 1; i <= *endnum; i++) { + if (i == *endnum) { + if (lastblock) + length += UDF_DEFAULT_PREALLOC_BLOCKS; + } else if ((laarr[i].extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { + length += (((laarr[i].extLength & + UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + } else + break; + } + + if (length) { + int next = laarr[start].extLocation.logicalBlockNum + + (((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + int numalloc = udf_prealloc_blocks(inode->i_sb, inode, + laarr[start].extLocation.partitionReferenceNum, + next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ? + length : UDF_DEFAULT_PREALLOC_BLOCKS) - + currlength); + if (numalloc) { + if (start == (c + 1)) + laarr[start].extLength += + (numalloc << + inode->i_sb->s_blocksize_bits); + else { + memmove(&laarr[c + 2], &laarr[c + 1], + sizeof(struct long_ad) * (*endnum - (c + 1))); + (*endnum)++; + laarr[c + 1].extLocation.logicalBlockNum = next; + laarr[c + 1].extLocation.partitionReferenceNum = + laarr[c].extLocation. 
+ partitionReferenceNum; + laarr[c + 1].extLength = + EXT_NOT_RECORDED_ALLOCATED | + (numalloc << + inode->i_sb->s_blocksize_bits); + start = c + 1; + } + + for (i = start + 1; numalloc && i < *endnum; i++) { + int elen = ((laarr[i].extLength & + UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + + if (elen > numalloc) { + laarr[i].extLength -= + (numalloc << + inode->i_sb->s_blocksize_bits); + numalloc = 0; + } else { + numalloc -= elen; + if (*endnum > (i + 1)) + memmove(&laarr[i], + &laarr[i + 1], + sizeof(struct long_ad) * + (*endnum - (i + 1))); + i--; + (*endnum)--; + } + } + UDF_I(inode)->i_lenExtents += + numalloc << inode->i_sb->s_blocksize_bits; + } + } +} + +static void udf_merge_extents(struct inode *inode, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int *endnum) +{ + int i; + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + + for (i = 0; i < (*endnum - 1); i++) { + struct kernel_long_ad *li /*l[i]*/ = &laarr[i]; + struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; + + if (((li->extLength >> 30) == (lip1->extLength >> 30)) && + (((li->extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) || + ((lip1->extLocation.logicalBlockNum - + li->extLocation.logicalBlockNum) == + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits)))) { + + if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { + lip1->extLength = (lip1->extLength - + (li->extLength & + UDF_EXTENT_LENGTH_MASK) + + UDF_EXTENT_LENGTH_MASK) & + ~(blocksize - 1); + li->extLength = (li->extLength & + UDF_EXTENT_FLAG_MASK) + + (UDF_EXTENT_LENGTH_MASK + 1) - + blocksize; + lip1->extLocation.logicalBlockNum = + li->extLocation.logicalBlockNum + + ((li->extLength & + UDF_EXTENT_LENGTH_MASK) >> + blocksize_bits); + } else { + li->extLength = lip1->extLength + + (((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~(blocksize - 1)); + if (*endnum > (i + 2)) + memmove(&laarr[i + 1], &laarr[i + 2], + sizeof(struct long_ad) * + (*endnum - (i + 2))); + i--; + (*endnum)--; + } + } else if (((li->extLength >> 30) == + (EXT_NOT_RECORDED_ALLOCATED >> 30)) && + ((lip1->extLength >> 30) == + (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { + udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, + ((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits); + li->extLocation.logicalBlockNum = 0; + li->extLocation.partitionReferenceNum = 0; + + if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { + lip1->extLength = (lip1->extLength - + (li->extLength & + UDF_EXTENT_LENGTH_MASK) + + UDF_EXTENT_LENGTH_MASK) & + ~(blocksize - 1); + li->extLength = (li->extLength & + UDF_EXTENT_FLAG_MASK) + + (UDF_EXTENT_LENGTH_MASK + 1) - + blocksize; + } else { + li->extLength = lip1->extLength + + (((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) & ~(blocksize - 1)); + if (*endnum > (i + 2)) + memmove(&laarr[i + 1], &laarr[i + 2], + sizeof(struct long_ad) * + (*endnum - (i + 2))); + i--; + (*endnum)--; + } + } else if ((li->extLength >> 30) == + (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + udf_free_blocks(inode->i_sb, inode, + &li->extLocation, 0, + ((li->extLength & + UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) >> blocksize_bits); + li->extLocation.logicalBlockNum = 0; + 
li->extLocation.partitionReferenceNum = 0; + li->extLength = (li->extLength & + UDF_EXTENT_LENGTH_MASK) | + EXT_NOT_RECORDED_NOT_ALLOCATED; + } + } +} + +static void udf_update_extents(struct inode *inode, + struct kernel_long_ad laarr[EXTENT_MERGE_SIZE], + int startnum, int endnum, + struct extent_position *epos) +{ + int start = 0, i; + struct kernel_lb_addr tmploc; + uint32_t tmplen; + + if (startnum > endnum) { + for (i = 0; i < (startnum - endnum); i++) + udf_delete_aext(inode, *epos, laarr[i].extLocation, + laarr[i].extLength); + } else if (startnum < endnum) { + for (i = 0; i < (endnum - startnum); i++) { + udf_insert_aext(inode, *epos, laarr[i].extLocation, + laarr[i].extLength); + udf_next_aext(inode, epos, &laarr[i].extLocation, + &laarr[i].extLength, 1); + start++; + } + } + + for (i = start; i < endnum; i++) { + udf_next_aext(inode, epos, &tmploc, &tmplen, 0); + udf_write_aext(inode, epos, &laarr[i].extLocation, + laarr[i].extLength, 1); + } +} + +struct buffer_head *udf_bread(struct inode *inode, int block, + int create, int *err) +{ + struct buffer_head *bh = NULL; + + bh = udf_getblk(inode, block, create, err); + if (!bh) + return NULL; + + if (buffer_uptodate(bh)) + return bh; + + ll_rw_block(READ, 1, &bh); + + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + + brelse(bh); + *err = -EIO; + return NULL; +} + +int udf_setsize(struct inode *inode, loff_t newsize) +{ + int err; + struct udf_inode_info *iinfo; + int bsize = 1 << inode->i_blkbits; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + iinfo = UDF_I(inode); + if (newsize > inode->i_size) { + down_write(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + if (bsize < + (udf_file_entry_alloc_offset(inode) + newsize)) { + err = udf_expand_file_adinicb(inode); + if (err) + return err; + down_write(&iinfo->i_data_sem); + } else + iinfo->i_lenAlloc = newsize; + } + err = udf_extend_file(inode, newsize); + if (err) { + up_write(&iinfo->i_data_sem); + return err; + } + truncate_setsize(inode, newsize); + up_write(&iinfo->i_data_sem); + } else { + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, + 0x00, bsize - newsize - + udf_file_entry_alloc_offset(inode)); + iinfo->i_lenAlloc = newsize; + truncate_setsize(inode, newsize); + up_write(&iinfo->i_data_sem); + goto update_time; + } + err = block_truncate_page(inode->i_mapping, newsize, + udf_get_block); + if (err) + return err; + down_write(&iinfo->i_data_sem); + truncate_setsize(inode, newsize); + udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); + } +update_time: + inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); + if (IS_SYNC(inode)) + udf_sync_inode(inode); + else + mark_inode_dirty(inode); + return 0; +} + +static void __udf_read_inode(struct inode *inode) +{ + struct buffer_head *bh = NULL; + struct fileEntry *fe; + uint16_t ident; + struct udf_inode_info *iinfo = UDF_I(inode); + + /* + * Set defaults, but the inode is still incomplete! 
+ * Note: get_new_inode() sets the following on a new inode: + * i_sb = sb + * i_no = ino + * i_flags = sb->s_flags + * i_state = 0 + * clean_inode(): zero fills and sets + * i_count = 1 + * i_nlink = 1 + * i_op = NULL; + */ + bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); + if (!bh) { + printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", + inode->i_ino); + make_bad_inode(inode); + return; + } + + if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && + ident != TAG_IDENT_USE) { + printk(KERN_ERR "udf: udf_read_inode(ino %ld) " + "failed ident=%d\n", inode->i_ino, ident); + brelse(bh); + make_bad_inode(inode); + return; + } + + fe = (struct fileEntry *)bh->b_data; + + if (fe->icbTag.strategyType == cpu_to_le16(4096)) { + struct buffer_head *ibh; + + ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, + &ident); + if (ident == TAG_IDENT_IE && ibh) { + struct buffer_head *nbh = NULL; + struct kernel_lb_addr loc; + struct indirectEntry *ie; + + ie = (struct indirectEntry *)ibh->b_data; + loc = lelb_to_cpu(ie->indirectICB.extLocation); + + if (ie->indirectICB.extLength && + (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, + &ident))) { + if (ident == TAG_IDENT_FE || + ident == TAG_IDENT_EFE) { + memcpy(&iinfo->i_location, + &loc, + sizeof(struct kernel_lb_addr)); + brelse(bh); + brelse(ibh); + brelse(nbh); + __udf_read_inode(inode); + return; + } + brelse(nbh); + } + } + brelse(ibh); + } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { + printk(KERN_ERR "udf: unsupported strategy type: %d\n", + le16_to_cpu(fe->icbTag.strategyType)); + brelse(bh); + make_bad_inode(inode); + return; + } + udf_fill_inode(inode, bh); + + brelse(bh); +} + +static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) +{ + struct fileEntry *fe; + struct extendedFileEntry *efe; + int offset; + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + struct udf_inode_info *iinfo = UDF_I(inode); + + fe = (struct fileEntry *)bh->b_data; + efe = (struct extendedFileEntry *)bh->b_data; + + if (fe->icbTag.strategyType == cpu_to_le16(4)) + iinfo->i_strat4096 = 0; + else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ + iinfo->i_strat4096 = 1; + + iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & + ICBTAG_FLAG_AD_MASK; + iinfo->i_unique = 0; + iinfo->i_lenEAttr = 0; + iinfo->i_lenExtents = 0; + iinfo->i_lenAlloc = 0; + iinfo->i_next_alloc_block = 0; + iinfo->i_next_alloc_goal = 0; + if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { + iinfo->i_efe = 1; + iinfo->i_use = 0; + if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry))) { + make_bad_inode(inode); + return; + } + memcpy(iinfo->i_ext.i_data, + bh->b_data + sizeof(struct extendedFileEntry), + inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry)); + } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { + iinfo->i_efe = 0; + iinfo->i_use = 0; + if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct fileEntry))) { + make_bad_inode(inode); + return; + } + memcpy(iinfo->i_ext.i_data, + bh->b_data + sizeof(struct fileEntry), + inode->i_sb->s_blocksize - sizeof(struct fileEntry)); + } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { + iinfo->i_efe = 0; + iinfo->i_use = 1; + iinfo->i_lenAlloc = le32_to_cpu( + ((struct unallocSpaceEntry *)bh->b_data)-> + lengthAllocDescs); + if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry))) { + make_bad_inode(inode); + return; + } + memcpy(iinfo->i_ext.i_data, + 
bh->b_data + sizeof(struct unallocSpaceEntry), + inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry)); + return; + } + + read_lock(&sbi->s_cred_lock); + inode->i_uid = le32_to_cpu(fe->uid); + if (inode->i_uid == -1 || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) + inode->i_uid = UDF_SB(inode->i_sb)->s_uid; + + inode->i_gid = le32_to_cpu(fe->gid); + if (inode->i_gid == -1 || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) || + UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) + inode->i_gid = UDF_SB(inode->i_sb)->s_gid; + + if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_fmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_fmode; + else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY && + sbi->s_dmode != UDF_INVALID_MODE) + inode->i_mode = sbi->s_dmode; + else + inode->i_mode = udf_convert_permissions(fe); + inode->i_mode &= ~sbi->s_umask; + read_unlock(&sbi->s_cred_lock); + + inode->i_nlink = le16_to_cpu(fe->fileLinkCount); + if (!inode->i_nlink) + inode->i_nlink = 1; + + inode->i_size = le64_to_cpu(fe->informationLength); + iinfo->i_lenExtents = inode->i_size; + + if (iinfo->i_efe == 0) { + inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << + (inode->i_sb->s_blocksize_bits - 9); + + if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime)) + inode->i_atime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_mtime, + fe->modificationTime)) + inode->i_mtime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime)) + inode->i_ctime = sbi->s_record_time; + + iinfo->i_unique = le64_to_cpu(fe->uniqueID); + iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); + iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); + offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr; + } else { + inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << + (inode->i_sb->s_blocksize_bits - 9); + + if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime)) + inode->i_atime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_mtime, + efe->modificationTime)) + inode->i_mtime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime)) + iinfo->i_crtime = sbi->s_record_time; + + if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime)) + inode->i_ctime = sbi->s_record_time; + + iinfo->i_unique = le64_to_cpu(efe->uniqueID); + iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); + iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); + offset = sizeof(struct extendedFileEntry) + + iinfo->i_lenEAttr; + } + + switch (fe->icbTag.fileType) { + case ICBTAG_FILE_TYPE_DIRECTORY: + inode->i_op = &udf_dir_inode_operations; + inode->i_fop = &udf_dir_operations; + inode->i_mode |= S_IFDIR; + inc_nlink(inode); + break; + case ICBTAG_FILE_TYPE_REALTIME: + case ICBTAG_FILE_TYPE_REGULAR: + case ICBTAG_FILE_TYPE_UNDEF: + case ICBTAG_FILE_TYPE_VAT20: + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + inode->i_mode |= S_IFREG; + break; + case ICBTAG_FILE_TYPE_BLOCK: + inode->i_mode |= S_IFBLK; + break; + case ICBTAG_FILE_TYPE_CHAR: + inode->i_mode |= S_IFCHR; + break; + case ICBTAG_FILE_TYPE_FIFO: + init_special_inode(inode, inode->i_mode | S_IFIFO, 0); + break; + case ICBTAG_FILE_TYPE_SOCKET: + init_special_inode(inode, inode->i_mode | S_IFSOCK, 0); + break; + 
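+ /* Symlink targets are stored as pathComponent records in the file body
+  * (built by udf_symlink() in namei.c), so a symlink gets its own
+  * address_space operations much like a regular file.
+  */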
case ICBTAG_FILE_TYPE_SYMLINK: + inode->i_data.a_ops = &udf_symlink_aops; + inode->i_op = &udf_symlink_inode_operations; + inode->i_mode = S_IFLNK | S_IRWXUGO; + break; + case ICBTAG_FILE_TYPE_MAIN: + udf_debug("METADATA FILE-----\n"); + break; + case ICBTAG_FILE_TYPE_MIRROR: + udf_debug("METADATA MIRROR FILE-----\n"); + break; + case ICBTAG_FILE_TYPE_BITMAP: + udf_debug("METADATA BITMAP FILE-----\n"); + break; + default: + printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " + "file type=%d\n", inode->i_ino, + fe->icbTag.fileType); + make_bad_inode(inode); + return; + } + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + struct deviceSpec *dsea = + (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); + if (dsea) { + init_special_inode(inode, inode->i_mode, + MKDEV(le32_to_cpu(dsea->majorDeviceIdent), + le32_to_cpu(dsea->minorDeviceIdent))); + /* Developer ID ??? */ + } else + make_bad_inode(inode); + } +} + +static int udf_alloc_i_data(struct inode *inode, size_t size) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); + + if (!iinfo->i_ext.i_data) { + printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) " + "no free memory\n", inode->i_ino); + return -ENOMEM; + } + + return 0; +} + +static mode_t udf_convert_permissions(struct fileEntry *fe) +{ + mode_t mode; + uint32_t permissions; + uint32_t flags; + + permissions = le32_to_cpu(fe->permissions); + flags = le16_to_cpu(fe->icbTag.flags); + + mode = ((permissions) & S_IRWXO) | + ((permissions >> 2) & S_IRWXG) | + ((permissions >> 4) & S_IRWXU) | + ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | + ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | + ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); + + return mode; +} + +int udf_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +static int udf_sync_inode(struct inode *inode) +{ + return udf_update_inode(inode, 1); +} + +static int udf_update_inode(struct inode *inode, int do_sync) +{ + struct buffer_head *bh = NULL; + struct fileEntry *fe; + struct extendedFileEntry *efe; + uint32_t udfperms; + uint16_t icbflags; + uint16_t crclen; + int err = 0; + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + struct udf_inode_info *iinfo = UDF_I(inode); + + bh = udf_tgetblk(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); + if (!bh) { + udf_debug("getblk failure\n"); + return -ENOMEM; + } + + lock_buffer(bh); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + fe = (struct fileEntry *)bh->b_data; + efe = (struct extendedFileEntry *)bh->b_data; + + if (iinfo->i_use) { + struct unallocSpaceEntry *use = + (struct unallocSpaceEntry *)bh->b_data; + + use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), + iinfo->i_ext.i_data, inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry)); + use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); + use->descTag.tagLocation = + cpu_to_le32(iinfo->i_location.logicalBlockNum); + crclen = sizeof(struct unallocSpaceEntry) + + iinfo->i_lenAlloc - sizeof(struct tag); + use->descTag.descCRCLength = cpu_to_le16(crclen); + use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + + sizeof(struct tag), + crclen)); + use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); + + goto out; + } + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) + fe->uid = cpu_to_le32(-1); + else 
+ fe->uid = cpu_to_le32(inode->i_uid); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) + fe->gid = cpu_to_le32(-1); + else + fe->gid = cpu_to_le32(inode->i_gid); + + udfperms = ((inode->i_mode & S_IRWXO)) | + ((inode->i_mode & S_IRWXG) << 2) | + ((inode->i_mode & S_IRWXU) << 4); + + udfperms |= (le32_to_cpu(fe->permissions) & + (FE_PERM_O_DELETE | FE_PERM_O_CHATTR | + FE_PERM_G_DELETE | FE_PERM_G_CHATTR | + FE_PERM_U_DELETE | FE_PERM_U_CHATTR)); + fe->permissions = cpu_to_le32(udfperms); + + if (S_ISDIR(inode->i_mode)) + fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); + else + fe->fileLinkCount = cpu_to_le16(inode->i_nlink); + + fe->informationLength = cpu_to_le64(inode->i_size); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + struct regid *eid; + struct deviceSpec *dsea = + (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); + if (!dsea) { + dsea = (struct deviceSpec *) + udf_add_extendedattr(inode, + sizeof(struct deviceSpec) + + sizeof(struct regid), 12, 0x3); + dsea->attrType = cpu_to_le32(12); + dsea->attrSubtype = 1; + dsea->attrLength = cpu_to_le32( + sizeof(struct deviceSpec) + + sizeof(struct regid)); + dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); + } + eid = (struct regid *)dsea->impUse; + memset(eid, 0, sizeof(struct regid)); + strcpy(eid->ident, UDF_ID_DEVELOPER); + eid->identSuffix[0] = UDF_OS_CLASS_UNIX; + eid->identSuffix[1] = UDF_OS_ID_LINUX; + dsea->majorDeviceIdent = cpu_to_le32(imajor(inode)); + dsea->minorDeviceIdent = cpu_to_le32(iminor(inode)); + } + + if (iinfo->i_efe == 0) { + memcpy(bh->b_data + sizeof(struct fileEntry), + iinfo->i_ext.i_data, + inode->i_sb->s_blocksize - sizeof(struct fileEntry)); + fe->logicalBlocksRecorded = cpu_to_le64( + (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> + (blocksize_bits - 9)); + + udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); + udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); + memset(&(fe->impIdent), 0, sizeof(struct regid)); + strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); + fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + fe->uniqueID = cpu_to_le64(iinfo->i_unique); + fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); + fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); + crclen = sizeof(struct fileEntry); + } else { + memcpy(bh->b_data + sizeof(struct extendedFileEntry), + iinfo->i_ext.i_data, + inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry)); + efe->objectSize = cpu_to_le64(inode->i_size); + efe->logicalBlocksRecorded = cpu_to_le64( + (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> + (blocksize_bits - 9)); + + if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec || + (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec && + iinfo->i_crtime.tv_nsec > inode->i_atime.tv_nsec)) + iinfo->i_crtime = inode->i_atime; + + if (iinfo->i_crtime.tv_sec > inode->i_mtime.tv_sec || + (iinfo->i_crtime.tv_sec == inode->i_mtime.tv_sec && + iinfo->i_crtime.tv_nsec > inode->i_mtime.tv_nsec)) + iinfo->i_crtime = inode->i_mtime; + + if (iinfo->i_crtime.tv_sec > inode->i_ctime.tv_sec || + (iinfo->i_crtime.tv_sec == inode->i_ctime.tv_sec && + iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec)) + iinfo->i_crtime = inode->i_ctime; + + udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); + udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); + 
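+ /* i_crtime was clamped above, so the creation time written to disk
+  * never post-dates the access, modification or attribute-change times.
+  */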
udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); + udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); + + memset(&(efe->impIdent), 0, sizeof(struct regid)); + strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); + efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + efe->uniqueID = cpu_to_le64(iinfo->i_unique); + efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); + efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); + crclen = sizeof(struct extendedFileEntry); + } + if (iinfo->i_strat4096) { + fe->icbTag.strategyType = cpu_to_le16(4096); + fe->icbTag.strategyParameter = cpu_to_le16(1); + fe->icbTag.numEntries = cpu_to_le16(2); + } else { + fe->icbTag.strategyType = cpu_to_le16(4); + fe->icbTag.numEntries = cpu_to_le16(1); + } + + if (S_ISDIR(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; + else if (S_ISREG(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; + else if (S_ISLNK(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_SYMLINK; + else if (S_ISBLK(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_BLOCK; + else if (S_ISCHR(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_CHAR; + else if (S_ISFIFO(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_FIFO; + else if (S_ISSOCK(inode->i_mode)) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_SOCKET; + + icbflags = iinfo->i_alloc_type | + ((inode->i_mode & S_ISUID) ? ICBTAG_FLAG_SETUID : 0) | + ((inode->i_mode & S_ISGID) ? ICBTAG_FLAG_SETGID : 0) | + ((inode->i_mode & S_ISVTX) ? ICBTAG_FLAG_STICKY : 0) | + (le16_to_cpu(fe->icbTag.flags) & + ~(ICBTAG_FLAG_AD_MASK | ICBTAG_FLAG_SETUID | + ICBTAG_FLAG_SETGID | ICBTAG_FLAG_STICKY)); + + fe->icbTag.flags = cpu_to_le16(icbflags); + if (sbi->s_udfrev >= 0x0200) + fe->descTag.descVersion = cpu_to_le16(3); + else + fe->descTag.descVersion = cpu_to_le16(2); + fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); + fe->descTag.tagLocation = cpu_to_le32( + iinfo->i_location.logicalBlockNum); + crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag); + fe->descTag.descCRCLength = cpu_to_le16(crclen); + fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), + crclen)); + fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); + +out: + set_buffer_uptodate(bh); + unlock_buffer(bh); + + /* write the data blocks */ + mark_buffer_dirty(bh); + if (do_sync) { + sync_dirty_buffer(bh); + if (buffer_write_io_error(bh)) { + printk(KERN_WARNING "IO error syncing udf inode " + "[%s:%08lx]\n", inode->i_sb->s_id, + inode->i_ino); + err = -EIO; + } + } + brelse(bh); + + return err; +} + +struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) +{ + unsigned long block = udf_get_lb_pblock(sb, ino, 0); + struct inode *inode = iget_locked(sb, block); + + if (!inode) + return NULL; + + if (inode->i_state & I_NEW) { + memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); + __udf_read_inode(inode); + unlock_new_inode(inode); + } + + if (is_bad_inode(inode)) + goto out_iput; + + if (ino->logicalBlockNum >= UDF_SB(sb)-> + s_partmaps[ino->partitionReferenceNum].s_partition_len) { + udf_debug("block=%d, partition=%d out of range\n", + ino->logicalBlockNum, ino->partitionReferenceNum); + make_bad_inode(inode); + goto out_iput; + } + + return inode; + + out_iput: + iput(inode); + return NULL; +} + +int udf_add_aext(struct inode *inode, struct extent_position *epos, + struct 
kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + struct short_ad *sad = NULL; + struct long_ad *lad = NULL; + struct allocExtDesc *aed; + uint8_t *ptr; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (!epos->bh) + ptr = iinfo->i_ext.i_data + epos->offset - + udf_file_entry_alloc_offset(inode) + + iinfo->i_lenEAttr; + else + ptr = epos->bh->b_data + epos->offset; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return -EIO; + + if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { + unsigned char *sptr, *dptr; + struct buffer_head *nbh; + int err, loffset; + struct kernel_lb_addr obloc = epos->block; + + epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, + obloc.partitionReferenceNum, + obloc.logicalBlockNum, &err); + if (!epos->block.logicalBlockNum) + return -ENOSPC; + nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, + &epos->block, + 0)); + if (!nbh) + return -EIO; + lock_buffer(nbh); + memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(nbh); + unlock_buffer(nbh); + mark_buffer_dirty_inode(nbh, inode); + + aed = (struct allocExtDesc *)(nbh->b_data); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) + aed->previousAllocExtLocation = + cpu_to_le32(obloc.logicalBlockNum); + if (epos->offset + adsize > inode->i_sb->s_blocksize) { + loffset = epos->offset; + aed->lengthAllocDescs = cpu_to_le32(adsize); + sptr = ptr - adsize; + dptr = nbh->b_data + sizeof(struct allocExtDesc); + memcpy(dptr, sptr, adsize); + epos->offset = sizeof(struct allocExtDesc) + adsize; + } else { + loffset = epos->offset + adsize; + aed->lengthAllocDescs = cpu_to_le32(0); + sptr = ptr; + epos->offset = sizeof(struct allocExtDesc); + + if (epos->bh) { + aed = (struct allocExtDesc *)epos->bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, adsize); + } else { + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(inode); + } + } + if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) + udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, + epos->block.logicalBlockNum, sizeof(struct tag)); + else + udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, + epos->block.logicalBlockNum, sizeof(struct tag)); + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = (struct short_ad *)sptr; + sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | + inode->i_sb->s_blocksize); + sad->extPosition = + cpu_to_le32(epos->block.logicalBlockNum); + break; + case ICBTAG_FLAG_AD_LONG: + lad = (struct long_ad *)sptr; + lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | + inode->i_sb->s_blocksize); + lad->extLocation = cpu_to_lelb(epos->block); + memset(lad->impUse, 0x00, sizeof(lad->impUse)); + break; + } + if (epos->bh) { + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(epos->bh->b_data, loffset); + else + udf_update_tag(epos->bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(epos->bh, inode); + brelse(epos->bh); + } else { + mark_inode_dirty(inode); + } + epos->bh = nbh; + } + + udf_write_aext(inode, epos, eloc, elen, inc); + + if (!epos->bh) { + iinfo->i_lenAlloc += adsize; + mark_inode_dirty(inode); + } else { + aed = (struct allocExtDesc *)epos->bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, adsize); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(epos->bh->b_data, + 
epos->offset + (inc ? 0 : adsize)); + else + udf_update_tag(epos->bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(epos->bh, inode); + } + + return 0; +} + +void udf_write_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + uint8_t *ptr; + struct short_ad *sad; + struct long_ad *lad; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (!epos->bh) + ptr = iinfo->i_ext.i_data + epos->offset - + udf_file_entry_alloc_offset(inode) + + iinfo->i_lenEAttr; + else + ptr = epos->bh->b_data + epos->offset; + + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = (struct short_ad *)ptr; + sad->extLength = cpu_to_le32(elen); + sad->extPosition = cpu_to_le32(eloc->logicalBlockNum); + adsize = sizeof(struct short_ad); + break; + case ICBTAG_FLAG_AD_LONG: + lad = (struct long_ad *)ptr; + lad->extLength = cpu_to_le32(elen); + lad->extLocation = cpu_to_lelb(*eloc); + memset(lad->impUse, 0x00, sizeof(lad->impUse)); + adsize = sizeof(struct long_ad); + break; + default: + return; + } + + if (epos->bh) { + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) { + struct allocExtDesc *aed = + (struct allocExtDesc *)epos->bh->b_data; + udf_update_tag(epos->bh->b_data, + le32_to_cpu(aed->lengthAllocDescs) + + sizeof(struct allocExtDesc)); + } + mark_buffer_dirty_inode(epos->bh, inode); + } else { + mark_inode_dirty(inode); + } + + if (inc) + epos->offset += adsize; +} + +int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) +{ + int8_t etype; + + while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == + (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + int block; + epos->block = *eloc; + epos->offset = sizeof(struct allocExtDesc); + brelse(epos->bh); + block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); + epos->bh = udf_tread(inode->i_sb, block); + if (!epos->bh) { + udf_debug("reading block %d failed!\n", block); + return -1; + } + } + + return etype; +} + +int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t *elen, int inc) +{ + int alen; + int8_t etype; + uint8_t *ptr; + struct short_ad *sad; + struct long_ad *lad; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (!epos->bh) { + if (!epos->offset) + epos->offset = udf_file_entry_alloc_offset(inode); + ptr = iinfo->i_ext.i_data + epos->offset - + udf_file_entry_alloc_offset(inode) + + iinfo->i_lenEAttr; + alen = udf_file_entry_alloc_offset(inode) + + iinfo->i_lenAlloc; + } else { + if (!epos->offset) + epos->offset = sizeof(struct allocExtDesc); + ptr = epos->bh->b_data + epos->offset; + alen = sizeof(struct allocExtDesc) + + le32_to_cpu(((struct allocExtDesc *)epos->bh->b_data)-> + lengthAllocDescs); + } + + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc); + if (!sad) + return -1; + etype = le32_to_cpu(sad->extLength) >> 30; + eloc->logicalBlockNum = le32_to_cpu(sad->extPosition); + eloc->partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + *elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK; + break; + case ICBTAG_FLAG_AD_LONG: + lad = udf_get_filelongad(ptr, alen, &epos->offset, inc); + if (!lad) + return -1; + etype = le32_to_cpu(lad->extLength) >> 30; + *eloc = lelb_to_cpu(lad->extLocation); + *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; + break; 
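+ /* For both allocation descriptor forms above, the top two bits of
+  * extLength carry the extent type (0 = recorded and allocated,
+  * 1 = allocated but not recorded, 2 = neither, 3 = pointer to the next
+  * extent of allocation descriptors) and the low 30 bits the length;
+  * e.g. a recorded-and-allocated 2048-byte extent is encoded as 0x00000800.
+  */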
+ default: + udf_debug("alloc_type = %d unsupported\n", + iinfo->i_alloc_type); + return -1; + } + + return etype; +} + +static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, + struct kernel_lb_addr neloc, uint32_t nelen) +{ + struct kernel_lb_addr oeloc; + uint32_t oelen; + int8_t etype; + + if (epos.bh) + get_bh(epos.bh); + + while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) { + udf_write_aext(inode, &epos, &neloc, nelen, 1); + neloc = oeloc; + nelen = (etype << 30) | oelen; + } + udf_add_aext(inode, &epos, &neloc, nelen, 1); + brelse(epos.bh); + + return (nelen >> 30); +} + +int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, + struct kernel_lb_addr eloc, uint32_t elen) +{ + struct extent_position oepos; + int adsize; + int8_t etype; + struct allocExtDesc *aed; + struct udf_inode_info *iinfo; + + if (epos.bh) { + get_bh(epos.bh); + get_bh(epos.bh); + } + + iinfo = UDF_I(inode); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + adsize = 0; + + oepos = epos; + if (udf_next_aext(inode, &epos, &eloc, &elen, 1) == -1) + return -1; + + while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { + udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1); + if (oepos.bh != epos.bh) { + oepos.block = epos.block; + brelse(oepos.bh); + get_bh(epos.bh); + oepos.bh = epos.bh; + oepos.offset = epos.offset - adsize; + } + } + memset(&eloc, 0x00, sizeof(struct kernel_lb_addr)); + elen = 0; + + if (epos.bh != oepos.bh) { + udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); + udf_write_aext(inode, &oepos, &eloc, elen, 1); + if (!oepos.bh) { + iinfo->i_lenAlloc -= (adsize * 2); + mark_inode_dirty(inode); + } else { + aed = (struct allocExtDesc *)oepos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize)); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(oepos.bh->b_data, + oepos.offset - (2 * adsize)); + else + udf_update_tag(oepos.bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(oepos.bh, inode); + } + } else { + udf_write_aext(inode, &oepos, &eloc, elen, 1); + if (!oepos.bh) { + iinfo->i_lenAlloc -= adsize; + mark_inode_dirty(inode); + } else { + aed = (struct allocExtDesc *)oepos.bh->b_data; + le32_add_cpu(&aed->lengthAllocDescs, -adsize); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(oepos.bh->b_data, + epos.offset - adsize); + else + udf_update_tag(oepos.bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(oepos.bh, inode); + } + } + + brelse(epos.bh); + brelse(oepos.bh); + + return (elen >> 30); +} + +int8_t inode_bmap(struct inode *inode, sector_t block, + struct extent_position *pos, struct kernel_lb_addr *eloc, + uint32_t *elen, sector_t *offset) +{ + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + loff_t lbcount = 0, bcount = + (loff_t) block << blocksize_bits; + int8_t etype; + struct udf_inode_info *iinfo; + + iinfo = UDF_I(inode); + pos->offset = 0; + pos->block = iinfo->i_location; + pos->bh = NULL; + *elen = 0; + + do { + etype = udf_next_aext(inode, pos, eloc, elen, 1); + if (etype == -1) { + *offset = (bcount - lbcount) >> blocksize_bits; + iinfo->i_lenExtents = lbcount; + return -1; + } + lbcount += *elen; + } while 
(lbcount <= bcount); + + *offset = (bcount + *elen - lbcount) >> blocksize_bits; + + return etype; +} + +long udf_block_map(struct inode *inode, sector_t block) +{ + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + int ret; + + down_read(&UDF_I(inode)->i_data_sem); + + if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == + (EXT_RECORDED_ALLOCATED >> 30)) + ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + else + ret = 0; + + up_read(&UDF_I(inode)->i_data_sem); + brelse(epos.bh); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) + return udf_fixed_to_variable(ret); + else + return ret; +} diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c new file mode 100644 index 00000000..43e24a3b --- /dev/null +++ b/fs/udf/lowlevel.c @@ -0,0 +1,67 @@ +/* + * lowlevel.c + * + * PURPOSE + * Low Level Device Routines for the UDF filesystem + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2001 Ben Fennema + * + * HISTORY + * + * 03/26/99 blf Created. + */ + +#include "udfdecl.h" + +#include +#include +#include + +#include "udf_sb.h" + +unsigned int udf_get_last_session(struct super_block *sb) +{ + struct cdrom_multisession ms_info; + unsigned int vol_desc_start; + struct block_device *bdev = sb->s_bdev; + int i; + + vol_desc_start = 0; + ms_info.addr_format = CDROM_LBA; + i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); + + if (i == 0) { + udf_debug("XA disk: %s, vol_desc_start=%d\n", + (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); + if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ + vol_desc_start = ms_info.addr.lba; + } else { + udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); + } + return vol_desc_start; +} + +unsigned long udf_get_last_block(struct super_block *sb) +{ + struct block_device *bdev = sb->s_bdev; + unsigned long lblock = 0; + + /* + * ioctl failed or returned obviously bogus value? + * Try using the device size... + */ + if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) || + lblock == 0) + lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits; + + if (lblock) + return lblock - 1; + else + return 0; +} diff --git a/fs/udf/misc.c b/fs/udf/misc.c new file mode 100644 index 00000000..9215700c --- /dev/null +++ b/fs/udf/misc.c @@ -0,0 +1,296 @@ +/* + * misc.c + * + * PURPOSE + * Miscellaneous routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. 
+ * + * (C) 1998 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 04/19/99 blf partial support for reading/writing specific EA's + */ + +#include "udfdecl.h" + +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +struct buffer_head *udf_tgetblk(struct super_block *sb, int block) +{ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) + return sb_getblk(sb, udf_fixed_to_variable(block)); + else + return sb_getblk(sb, block); +} + +struct buffer_head *udf_tread(struct super_block *sb, int block) +{ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) + return sb_bread(sb, udf_fixed_to_variable(block)); + else + return sb_bread(sb, block); +} + +struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, + uint32_t type, uint8_t loc) +{ + uint8_t *ea = NULL, *ad = NULL; + int offset; + uint16_t crclen; + struct udf_inode_info *iinfo = UDF_I(inode); + + ea = iinfo->i_ext.i_data; + if (iinfo->i_lenEAttr) { + ad = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + } else { + ad = ea; + size += sizeof(struct extendedAttrHeaderDesc); + } + + offset = inode->i_sb->s_blocksize - udf_file_entry_alloc_offset(inode) - + iinfo->i_lenAlloc; + + /* TODO - Check for FreeEASpace */ + + if (loc & 0x01 && offset >= size) { + struct extendedAttrHeaderDesc *eahd; + eahd = (struct extendedAttrHeaderDesc *)ea; + + if (iinfo->i_lenAlloc) + memmove(&ad[size], ad, iinfo->i_lenAlloc); + + if (iinfo->i_lenEAttr) { + /* check checksum/crc */ + if (eahd->descTag.tagIdent != + cpu_to_le16(TAG_IDENT_EAHD) || + le32_to_cpu(eahd->descTag.tagLocation) != + iinfo->i_location.logicalBlockNum) + return NULL; + } else { + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + + size -= sizeof(struct extendedAttrHeaderDesc); + iinfo->i_lenEAttr += + sizeof(struct extendedAttrHeaderDesc); + eahd->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EAHD); + if (sbi->s_udfrev >= 0x0200) + eahd->descTag.descVersion = cpu_to_le16(3); + else + eahd->descTag.descVersion = cpu_to_le16(2); + eahd->descTag.tagSerialNum = + cpu_to_le16(sbi->s_serial_number); + eahd->descTag.tagLocation = cpu_to_le32( + iinfo->i_location.logicalBlockNum); + eahd->impAttrLocation = cpu_to_le32(0xFFFFFFFF); + eahd->appAttrLocation = cpu_to_le32(0xFFFFFFFF); + } + + offset = iinfo->i_lenEAttr; + if (type < 2048) { + if (le32_to_cpu(eahd->appAttrLocation) < + iinfo->i_lenEAttr) { + uint32_t aal = + le32_to_cpu(eahd->appAttrLocation); + memmove(&ea[offset - aal + size], + &ea[aal], offset - aal); + offset -= aal; + eahd->appAttrLocation = + cpu_to_le32(aal + size); + } + if (le32_to_cpu(eahd->impAttrLocation) < + iinfo->i_lenEAttr) { + uint32_t ial = + le32_to_cpu(eahd->impAttrLocation); + memmove(&ea[offset - ial + size], + &ea[ial], offset - ial); + offset -= ial; + eahd->impAttrLocation = + cpu_to_le32(ial + size); + } + } else if (type < 65536) { + if (le32_to_cpu(eahd->appAttrLocation) < + iinfo->i_lenEAttr) { + uint32_t aal = + le32_to_cpu(eahd->appAttrLocation); + memmove(&ea[offset - aal + size], + &ea[aal], offset - aal); + offset -= aal; + eahd->appAttrLocation = + cpu_to_le32(aal + size); + } + } + /* rewrite CRC + checksum of eahd */ + crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag); + eahd->descTag.descCRCLength = cpu_to_le16(crclen); + eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + + sizeof(struct tag), crclen)); + eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); + iinfo->i_lenEAttr += size; + return (struct genericFormat 
*)&ea[offset]; + } + if (loc & 0x02) + ; + + return NULL; +} + +struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, + uint8_t subtype) +{ + struct genericFormat *gaf; + uint8_t *ea = NULL; + uint32_t offset; + struct udf_inode_info *iinfo = UDF_I(inode); + + ea = iinfo->i_ext.i_data; + + if (iinfo->i_lenEAttr) { + struct extendedAttrHeaderDesc *eahd; + eahd = (struct extendedAttrHeaderDesc *)ea; + + /* check checksum/crc */ + if (eahd->descTag.tagIdent != + cpu_to_le16(TAG_IDENT_EAHD) || + le32_to_cpu(eahd->descTag.tagLocation) != + iinfo->i_location.logicalBlockNum) + return NULL; + + if (type < 2048) + offset = sizeof(struct extendedAttrHeaderDesc); + else if (type < 65536) + offset = le32_to_cpu(eahd->impAttrLocation); + else + offset = le32_to_cpu(eahd->appAttrLocation); + + while (offset < iinfo->i_lenEAttr) { + gaf = (struct genericFormat *)&ea[offset]; + if (le32_to_cpu(gaf->attrType) == type && + gaf->attrSubtype == subtype) + return gaf; + else + offset += le32_to_cpu(gaf->attrLength); + } + } + + return NULL; +} + +/* + * udf_read_tagged + * + * PURPOSE + * Read the first block of a tagged descriptor. + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. + */ +struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, + uint32_t location, uint16_t *ident) +{ + struct tag *tag_p; + struct buffer_head *bh = NULL; + + /* Read the block */ + if (block == 0xFFFFFFFF) + return NULL; + + bh = udf_tread(sb, block); + if (!bh) { + udf_debug("block=%d, location=%d: read failed\n", + block, location); + return NULL; + } + + tag_p = (struct tag *)(bh->b_data); + + *ident = le16_to_cpu(tag_p->tagIdent); + + if (location != le32_to_cpu(tag_p->tagLocation)) { + udf_debug("location mismatch block %u, tag %u != %u\n", + block, le32_to_cpu(tag_p->tagLocation), location); + goto error_out; + } + + /* Verify the tag checksum */ + if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) { + printk(KERN_ERR "udf: tag checksum failed block %d\n", block); + goto error_out; + } + + /* Verify the tag version */ + if (tag_p->descVersion != cpu_to_le16(0x0002U) && + tag_p->descVersion != cpu_to_le16(0x0003U)) { + udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n", + le16_to_cpu(tag_p->descVersion), block); + goto error_out; + } + + /* Verify the descriptor CRC */ + if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize || + le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, + bh->b_data + sizeof(struct tag), + le16_to_cpu(tag_p->descCRCLength))) + return bh; + + udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, + le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); + +error_out: + brelse(bh); + return NULL; +} + +struct buffer_head *udf_read_ptagged(struct super_block *sb, + struct kernel_lb_addr *loc, + uint32_t offset, uint16_t *ident) +{ + return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), + loc->logicalBlockNum + offset, ident); +} + +void udf_update_tag(char *data, int length) +{ + struct tag *tptr = (struct tag *)data; + length -= sizeof(struct tag); + + tptr->descCRCLength = cpu_to_le16(length); + tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length)); + tptr->tagChecksum = udf_tag_checksum(tptr); +} + +void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, + uint32_t loc, int length) +{ + struct tag *tptr = (struct tag *)data; + tptr->tagIdent = cpu_to_le16(ident); + tptr->descVersion = cpu_to_le16(version); + 
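+ /* The serial number and location below come from the caller; the CRC,
+  * CRC length and checksum are then filled in by udf_update_tag().
+  */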
tptr->tagSerialNum = cpu_to_le16(snum); + tptr->tagLocation = cpu_to_le32(loc); + udf_update_tag(data, length); +} + +u8 udf_tag_checksum(const struct tag *t) +{ + u8 *data = (u8 *)t; + u8 checksum = 0; + int i; + for (i = 0; i < sizeof(struct tag); ++i) + if (i != 4) /* position of checksum */ + checksum += data[i]; + return checksum; +} diff --git a/fs/udf/namei.c b/fs/udf/namei.c new file mode 100644 index 00000000..f1dce848 --- /dev/null +++ b/fs/udf/namei.c @@ -0,0 +1,1339 @@ +/* + * namei.c + * + * PURPOSE + * Inode name handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2004 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 12/12/98 blf Created. Split out the lookup code from dir.c + * 04/19/99 blf link, mknod, symlink support + */ + +#include "udfdecl.h" + +#include "udf_i.h" +#include "udf_sb.h" +#include +#include +#include +#include +#include +#include +#include +#include + +enum { UDF_MAX_LINKS = 0xffff }; + +static inline int udf_match(int len1, const unsigned char *name1, int len2, + const unsigned char *name2) +{ + if (len1 != len2) + return 0; + + return !memcmp(name1, name2, len1); +} + +int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, + struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, + uint8_t *impuse, uint8_t *fileident) +{ + uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag); + uint16_t crc; + int offset; + uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); + uint8_t lfi = cfi->lengthFileIdent; + int padlen = fibh->eoffset - fibh->soffset - liu - lfi - + sizeof(struct fileIdentDesc); + int adinicb = 0; + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + adinicb = 1; + + offset = fibh->soffset + sizeof(struct fileIdentDesc); + + if (impuse) { + if (adinicb || (offset + liu < 0)) { + memcpy((uint8_t *)sfi->impUse, impuse, liu); + } else if (offset >= 0) { + memcpy(fibh->ebh->b_data + offset, impuse, liu); + } else { + memcpy((uint8_t *)sfi->impUse, impuse, -offset); + memcpy(fibh->ebh->b_data, impuse - offset, + liu + offset); + } + } + + offset += liu; + + if (fileident) { + if (adinicb || (offset + lfi < 0)) { + memcpy((uint8_t *)sfi->fileIdent + liu, fileident, lfi); + } else if (offset >= 0) { + memcpy(fibh->ebh->b_data + offset, fileident, lfi); + } else { + memcpy((uint8_t *)sfi->fileIdent + liu, fileident, + -offset); + memcpy(fibh->ebh->b_data, fileident - offset, + lfi + offset); + } + } + + offset += lfi; + + if (adinicb || (offset + padlen < 0)) { + memset((uint8_t *)sfi->padding + liu + lfi, 0x00, padlen); + } else if (offset >= 0) { + memset(fibh->ebh->b_data + offset, 0x00, padlen); + } else { + memset((uint8_t *)sfi->padding + liu + lfi, 0x00, -offset); + memset(fibh->ebh->b_data, 0x00, padlen + offset); + } + + crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag), + sizeof(struct fileIdentDesc) - sizeof(struct tag)); + + if (fibh->sbh == fibh->ebh) { + crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, + crclen + sizeof(struct tag) - + sizeof(struct fileIdentDesc)); + } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { + crc = crc_itu_t(crc, fibh->ebh->b_data + + sizeof(struct fileIdentDesc) + + fibh->soffset, + crclen + sizeof(struct tag) - + sizeof(struct fileIdentDesc)); + } else { + crc = crc_itu_t(crc, 
(uint8_t *)sfi->impUse, + -fibh->soffset - sizeof(struct fileIdentDesc)); + crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset); + } + + cfi->descTag.descCRC = cpu_to_le16(crc); + cfi->descTag.descCRCLength = cpu_to_le16(crclen); + cfi->descTag.tagChecksum = udf_tag_checksum(&cfi->descTag); + + if (adinicb || (sizeof(struct fileIdentDesc) <= -fibh->soffset)) { + memcpy((uint8_t *)sfi, (uint8_t *)cfi, + sizeof(struct fileIdentDesc)); + } else { + memcpy((uint8_t *)sfi, (uint8_t *)cfi, -fibh->soffset); + memcpy(fibh->ebh->b_data, (uint8_t *)cfi - fibh->soffset, + sizeof(struct fileIdentDesc) + fibh->soffset); + } + + if (adinicb) { + mark_inode_dirty(inode); + } else { + if (fibh->sbh != fibh->ebh) + mark_buffer_dirty_inode(fibh->ebh, inode); + mark_buffer_dirty_inode(fibh->sbh, inode); + } + return 0; +} + +static struct fileIdentDesc *udf_find_entry(struct inode *dir, + const struct qstr *child, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi) +{ + struct fileIdentDesc *fi = NULL; + loff_t f_pos; + int block, flen; + unsigned char *fname = NULL; + unsigned char *nameptr; + uint8_t lfi; + uint16_t liu; + loff_t size; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + struct udf_inode_info *dinfo = UDF_I(dir); + int isdotdot = child->len == 2 && + child->name[0] == '.' && child->name[1] == '.'; + + size = udf_ext0_offset(dir) + dir->i_size; + f_pos = udf_ext0_offset(dir); + + fibh->sbh = fibh->ebh = NULL; + fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) + goto out_err; + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else + offset = 0; + + fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->sbh) + goto out_err; + } + + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!fname) + goto out_err; + + while (f_pos < size) { + fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, + &elen, &offset); + if (!fi) + goto out_err; + + liu = le16_to_cpu(cfi->lengthOfImpUse); + lfi = cfi->lengthFileIdent; + + if (fibh->sbh == fibh->ebh) { + nameptr = fi->fileIdent + liu; + } else { + int poffset; /* Unpaded ending offset */ + + poffset = fibh->soffset + sizeof(struct fileIdentDesc) + + liu + lfi; + + if (poffset >= lfi) + nameptr = (uint8_t *)(fibh->ebh->b_data + + poffset - lfi); + else { + nameptr = fname; + memcpy(nameptr, fi->fileIdent + liu, + lfi - poffset); + memcpy(nameptr + lfi - poffset, + fibh->ebh->b_data, poffset); + } + } + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) + continue; + } + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) + continue; + } + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && + isdotdot) + goto out_ok; + + if (!lfi) + continue; + + flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + if (flen && udf_match(flen, fname, child->len, child->name)) + goto out_ok; + } + +out_err: + fi = NULL; + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); +out_ok: + 
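+ /* Reached on success (fi points at the matching FID and the fibh
+  * buffers are left for the caller) and via fall-through from out_err
+  * with fi == NULL; either way drop the extent position and name buffer.
+  */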
brelse(epos.bh); + kfree(fname); + + return fi; +} + +static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + struct fileIdentDesc cfi; + struct udf_fileident_bh fibh; + + if (dentry->d_name.len > UDF_NAME_LEN - 2) + return ERR_PTR(-ENAMETOOLONG); + +#ifdef UDF_RECOVERY + /* temporary shorthand for specifying files by inode number */ + if (!strncmp(dentry->d_name.name, ".B=", 3)) { + struct kernel_lb_addr lb = { + .logicalBlockNum = 0, + .partitionReferenceNum = + simple_strtoul(dentry->d_name.name + 3, + NULL, 0), + }; + inode = udf_iget(dir->i_sb, lb); + if (!inode) { + return ERR_PTR(-EACCES); + } + } else +#endif /* UDF_RECOVERY */ + + if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { + struct kernel_lb_addr loc; + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + + loc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(dir->i_sb, &loc); + if (!inode) { + return ERR_PTR(-EACCES); + } + } + + return d_splice_alias(inode, dentry); +} + +static struct fileIdentDesc *udf_add_entry(struct inode *dir, + struct dentry *dentry, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi, int *err) +{ + struct super_block *sb = dir->i_sb; + struct fileIdentDesc *fi = NULL; + unsigned char *name = NULL; + int namelen; + loff_t f_pos; + loff_t size = udf_ext0_offset(dir) + dir->i_size; + int nfidlen; + uint8_t lfi; + uint16_t liu; + int block; + struct kernel_lb_addr eloc; + uint32_t elen = 0; + sector_t offset; + struct extent_position epos = {}; + struct udf_inode_info *dinfo; + + fibh->sbh = fibh->ebh = NULL; + name = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!name) { + *err = -ENOMEM; + goto out_err; + } + + if (dentry) { + if (!dentry->d_name.len) { + *err = -EINVAL; + goto out_err; + } + namelen = udf_put_filename(sb, dentry->d_name.name, name, + dentry->d_name.len); + if (!namelen) { + *err = -ENAMETOOLONG; + goto out_err; + } + } else { + namelen = 0; + } + + nfidlen = (sizeof(struct fileIdentDesc) + namelen + 3) & ~3; + + f_pos = udf_ext0_offset(dir); + + fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + dinfo = UDF_I(dir); + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { + block = udf_get_lb_pblock(dir->i_sb, + &dinfo->i_location, 0); + fibh->soffset = fibh->eoffset = sb->s_blocksize; + goto add; + } + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else + offset = 0; + + fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + if (!fibh->sbh) { + *err = -EIO; + goto out_err; + } + + block = dinfo->i_location.logicalBlockNum; + } + + while (f_pos < size) { + fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, + &elen, &offset); + + if (!fi) { + *err = -EIO; + goto out_err; + } + + liu = le16_to_cpu(cfi->lengthOfImpUse); + lfi = cfi->lengthFileIdent; + + if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (((sizeof(struct fileIdentDesc) + + liu + lfi + 3) & ~3) == nfidlen) { + cfi->descTag.tagSerialNum = cpu_to_le16(1); + cfi->fileVersionNum = cpu_to_le16(1); + cfi->fileCharacteristics = 0; + cfi->lengthFileIdent = namelen; + cfi->lengthOfImpUse = 
cpu_to_le16(0); + if (!udf_write_fi(dir, cfi, fi, fibh, NULL, + name)) + goto out_ok; + else { + *err = -EIO; + goto out_err; + } + } + } + } + +add: + f_pos += nfidlen; + + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && + sb->s_blocksize - fibh->eoffset < nfidlen) { + brelse(epos.bh); + epos.bh = NULL; + fibh->soffset -= udf_ext0_offset(dir); + fibh->eoffset -= udf_ext0_offset(dir); + f_pos -= udf_ext0_offset(dir); + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); + fibh->sbh = fibh->ebh = + udf_expand_dir_adinicb(dir, &block, err); + if (!fibh->sbh) + goto out_err; + epos.block = dinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(dir); + /* Load extent udf_expand_dir_adinicb() has created */ + udf_current_aext(dir, &epos, &eloc, &elen, 1); + } + + /* Entry fits into current block? */ + if (sb->s_blocksize - fibh->eoffset >= nfidlen) { + fibh->soffset = fibh->eoffset; + fibh->eoffset += nfidlen; + if (fibh->sbh != fibh->ebh) { + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + } + + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + block = dinfo->i_location.logicalBlockNum; + fi = (struct fileIdentDesc *) + (dinfo->i_ext.i_data + + fibh->soffset - + udf_ext0_offset(dir) + + dinfo->i_lenEAttr); + } else { + block = eloc.logicalBlockNum + + ((elen - 1) >> + dir->i_sb->s_blocksize_bits); + fi = (struct fileIdentDesc *) + (fibh->sbh->b_data + fibh->soffset); + } + } else { + /* Round up last extent in the file */ + elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize + - 1) & ~(sb->s_blocksize - 1); + + fibh->soffset = fibh->eoffset - sb->s_blocksize; + fibh->eoffset += nfidlen - sb->s_blocksize; + if (fibh->sbh != fibh->ebh) { + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + } + + block = eloc.logicalBlockNum + ((elen - 1) >> + dir->i_sb->s_blocksize_bits); + fibh->ebh = udf_bread(dir, + f_pos >> dir->i_sb->s_blocksize_bits, 1, err); + if (!fibh->ebh) + goto out_err; + /* Extents could have been merged, invalidate our position */ + brelse(epos.bh); + epos.bh = NULL; + epos.block = dinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(dir); + + if (!fibh->soffset) { + /* Find the freshly allocated block */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + (EXT_RECORDED_ALLOCATED >> 30)) + ; + block = eloc.logicalBlockNum + ((elen - 1) >> + dir->i_sb->s_blocksize_bits); + brelse(fibh->sbh); + fibh->sbh = fibh->ebh; + fi = (struct fileIdentDesc *)(fibh->sbh->b_data); + } else { + fi = (struct fileIdentDesc *) + (fibh->sbh->b_data + sb->s_blocksize + + fibh->soffset); + } + } + + memset(cfi, 0, sizeof(struct fileIdentDesc)); + if (UDF_SB(sb)->s_udfrev >= 0x0200) + udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, + sizeof(struct tag)); + else + udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, + sizeof(struct tag)); + cfi->fileVersionNum = cpu_to_le16(1); + cfi->lengthFileIdent = namelen; + cfi->lengthOfImpUse = cpu_to_le16(0); + if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) { + dir->i_size += nfidlen; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + dinfo->i_lenAlloc += nfidlen; + else { + /* Find the last extent and truncate it to proper size */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + 
(EXT_RECORDED_ALLOCATED >> 30)) + ; + elen -= dinfo->i_lenExtents - dir->i_size; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = dir->i_size; + } + + mark_inode_dirty(dir); + goto out_ok; + } else { + *err = -EIO; + goto out_err; + } + +out_err: + fi = NULL; + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); +out_ok: + brelse(epos.bh); + kfree(name); + return fi; +} + +static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, + struct udf_fileident_bh *fibh, + struct fileIdentDesc *cfi) +{ + cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) + memset(&(cfi->icb), 0x00, sizeof(struct long_ad)); + + return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); +} + +static int udf_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct udf_fileident_bh fibh; + struct inode *inode; + struct fileIdentDesc cfi, *fi; + int err; + struct udf_inode_info *iinfo; + + inode = udf_new_inode(dir, mode, &err); + if (!inode) { + return err; + } + + iinfo = UDF_I(inode); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + mark_inode_dirty(inode); + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + inode->i_nlink--; + mark_inode_dirty(inode); + iput(inode); + return err; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(dir); + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + d_instantiate(dentry, inode); + + return 0; +} + +static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc cfi, *fi; + int err; + struct udf_inode_info *iinfo; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + err = -EIO; + inode = udf_new_inode(dir, mode, &err); + if (!inode) + goto out; + + iinfo = UDF_I(inode); + init_special_inode(inode, mode, rdev); + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + inode->i_nlink--; + mark_inode_dirty(inode); + iput(inode); + return err; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(dir); + mark_inode_dirty(inode); + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + d_instantiate(dentry, inode); + err = 0; + +out: + return err; +} + +static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc cfi, *fi; + int err; + struct udf_inode_info *dinfo = UDF_I(dir); + struct udf_inode_info *iinfo; + + 
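+ /* A new directory is populated with two FIDs: a parent ('..') entry
+  * written into the new directory itself, then the named entry in the
+  * parent; the child starts with nlink == 2 and the parent's link count
+  * is bumped for the back-pointer.
+  */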
err = -EMLINK; + if (dir->i_nlink >= UDF_MAX_LINKS) + goto out; + + err = -EIO; + inode = udf_new_inode(dir, S_IFDIR | mode, &err); + if (!inode) + goto out; + + iinfo = UDF_I(inode); + inode->i_op = &udf_dir_inode_operations; + inode->i_fop = &udf_dir_operations; + fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); + if (!fi) { + inode->i_nlink--; + mark_inode_dirty(inode); + iput(inode); + goto out; + } + inode->i_nlink = 2; + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(dinfo->i_unique & 0x00000000FFFFFFFFUL); + cfi.fileCharacteristics = + FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; + udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); + brelse(fibh.sbh); + mark_inode_dirty(inode); + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + inode->i_nlink = 0; + mark_inode_dirty(inode); + iput(inode); + goto out; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); + cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + inc_nlink(dir); + mark_inode_dirty(dir); + d_instantiate(dentry, inode); + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + err = 0; + +out: + return err; +} + +static int empty_dir(struct inode *dir) +{ + struct fileIdentDesc *fi, cfi; + struct udf_fileident_bh fibh; + loff_t f_pos; + loff_t size = udf_ext0_offset(dir) + dir->i_size; + int block; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + struct udf_inode_info *dinfo = UDF_I(dir); + + f_pos = udf_ext0_offset(dir); + fibh.soffset = fibh.eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + fibh.sbh = fibh.ebh = NULL; + else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, + &epos, &eloc, &elen, &offset) == + (EXT_RECORDED_ALLOCATED >> 30)) { + block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); + if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + } else + offset = 0; + + fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block); + if (!fibh.sbh) { + brelse(epos.bh); + return 0; + } + } else { + brelse(epos.bh); + return 0; + } + + while (f_pos < size) { + fi = udf_fileident_read(dir, &f_pos, &fibh, &cfi, &epos, &eloc, + &elen, &offset); + if (!fi) { + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + return 0; + } + + if (cfi.lengthFileIdent && + (cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) == 0) { + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + return 0; + } + } + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + brelse(epos.bh); + + return 1; +} + +static int udf_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode = dentry->d_inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc *fi, cfi; + struct kernel_lb_addr tloc; + + retval = -ENOENT; + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); + if (!fi) + goto out; + + retval = -EIO; + tloc = 
lelb_to_cpu(cfi.icb.extLocation); + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) + goto end_rmdir; + retval = -ENOTEMPTY; + if (!empty_dir(inode)) + goto end_rmdir; + retval = udf_delete_entry(dir, fi, &fibh, &cfi); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + udf_warning(inode->i_sb, "udf_rmdir", + "empty directory has nlink != 2 (%d)", + inode->i_nlink); + clear_nlink(inode); + inode->i_size = 0; + inode_dec_link_count(dir); + inode->i_ctime = dir->i_ctime = dir->i_mtime = + current_fs_time(dir->i_sb); + mark_inode_dirty(dir); + +end_rmdir: + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + +out: + return retval; +} + +static int udf_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode = dentry->d_inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc *fi; + struct fileIdentDesc cfi; + struct kernel_lb_addr tloc; + + retval = -ENOENT; + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); + if (!fi) + goto out; + + retval = -EIO; + tloc = lelb_to_cpu(cfi.icb.extLocation); + if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + udf_debug("Deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = udf_delete_entry(dir, fi, &fibh, &cfi); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + mark_inode_dirty(dir); + inode_dec_link_count(inode); + inode->i_ctime = dir->i_ctime; + retval = 0; + +end_unlink: + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + +out: + return retval; +} + +static int udf_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct pathComponent *pc; + const char *compstart; + struct udf_fileident_bh fibh; + struct extent_position epos = {}; + int eoffset, elen = 0; + struct fileIdentDesc *fi; + struct fileIdentDesc cfi; + uint8_t *ea; + int err; + int block; + unsigned char *name = NULL; + int namelen; + struct udf_inode_info *iinfo; + struct super_block *sb = dir->i_sb; + + inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); + if (!inode) + goto out; + + iinfo = UDF_I(inode); + down_write(&iinfo->i_data_sem); + name = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto out_no_entry; + } + + inode->i_data.a_ops = &udf_symlink_aops; + inode->i_op = &udf_symlink_inode_operations; + + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + struct kernel_lb_addr eloc; + uint32_t bsize; + + block = udf_new_block(sb, inode, + iinfo->i_location.partitionReferenceNum, + iinfo->i_location.logicalBlockNum, &err); + if (!block) + goto out_no_entry; + epos.block = iinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(inode); + epos.bh = NULL; + eloc.logicalBlockNum = block; + eloc.partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + bsize = sb->s_blocksize; + iinfo->i_lenExtents = bsize; + udf_add_aext(inode, &epos, &eloc, bsize, 0); + brelse(epos.bh); + + block = udf_get_pblock(sb, block, + iinfo->i_location.partitionReferenceNum, + 0); + epos.bh = udf_tgetblk(sb, block); + lock_buffer(epos.bh); + memset(epos.bh->b_data, 0x00, bsize); + set_buffer_uptodate(epos.bh); + unlock_buffer(epos.bh); + mark_buffer_dirty_inode(epos.bh, inode); + ea = epos.bh->b_data + udf_ext0_offset(inode); + } else + ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + + eoffset = sb->s_blocksize - udf_ext0_offset(inode); + pc = (struct pathComponent *)ea; + 
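+ /* The symlink target is translated into UDF pathComponent records:
+  * type 1 marks an absolute path, type 3 is '..', type 4 is '.', and
+  * type 5 carries an ordinary component name (encoded by
+  * udf_put_filename() below).
+  */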
+ if (*symname == '/') { + do { + symname++; + } while (*symname == '/'); + + pc->componentType = 1; + pc->lengthComponentIdent = 0; + pc->componentFileVersionNum = 0; + elen += sizeof(struct pathComponent); + } + + err = -ENAMETOOLONG; + + while (*symname) { + if (elen + sizeof(struct pathComponent) > eoffset) + goto out_no_entry; + + pc = (struct pathComponent *)(ea + elen); + + compstart = symname; + + do { + symname++; + } while (*symname && *symname != '/'); + + pc->componentType = 5; + pc->lengthComponentIdent = 0; + pc->componentFileVersionNum = 0; + if (compstart[0] == '.') { + if ((symname - compstart) == 1) + pc->componentType = 4; + else if ((symname - compstart) == 2 && + compstart[1] == '.') + pc->componentType = 3; + } + + if (pc->componentType == 5) { + namelen = udf_put_filename(sb, compstart, name, + symname - compstart); + if (!namelen) + goto out_no_entry; + + if (elen + sizeof(struct pathComponent) + namelen > + eoffset) + goto out_no_entry; + else + pc->lengthComponentIdent = namelen; + + memcpy(pc->componentIdent, name, namelen); + } + + elen += sizeof(struct pathComponent) + pc->lengthComponentIdent; + + if (*symname) { + do { + symname++; + } while (*symname == '/'); + } + } + + brelse(epos.bh); + inode->i_size = elen; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + iinfo->i_lenAlloc = inode->i_size; + else + udf_truncate_tail_extent(inode); + mark_inode_dirty(inode); + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) + goto out_no_entry; + cfi.icb.extLength = cpu_to_le32(sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + if (UDF_SB(inode->i_sb)->s_lvid_bh) { + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(lvid_get_unique_id(sb)); + } + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(dir); + up_write(&iinfo->i_data_sem); + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + d_instantiate(dentry, inode); + err = 0; + +out: + kfree(name); + return err; + +out_no_entry: + up_write(&iinfo->i_data_sem); + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int udf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct udf_fileident_bh fibh; + struct fileIdentDesc cfi, *fi; + int err; + + if (inode->i_nlink >= UDF_MAX_LINKS) + return -EMLINK; + + fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); + if (!fi) { + return err; + } + cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); + if (UDF_SB(inode->i_sb)->s_lvid_bh) { + *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + cpu_to_le32(lvid_get_unique_id(inode->i_sb)); + } + udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(dir); + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + inc_nlink(inode); + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty(inode); + ihold(inode); + d_instantiate(dentry, inode); + + return 0; +} + +/* Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. 
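+ * Note that when a directory is moved to a new parent, the code below also
+ * re-points the directory's ".." file identifier (dir_fi) at the new parent
+ * and adjusts both parents' link counts, after checking that any existing
+ * target is an empty directory and that the new parent can take another link.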
+ */ +static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct udf_fileident_bh ofibh, nfibh; + struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL; + struct fileIdentDesc ocfi, ncfi; + struct buffer_head *dir_bh = NULL; + int retval = -ENOENT; + struct kernel_lb_addr tloc; + struct udf_inode_info *old_iinfo = UDF_I(old_inode); + + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); + if (ofi) { + if (ofibh.sbh != ofibh.ebh) + brelse(ofibh.ebh); + brelse(ofibh.sbh); + } + tloc = lelb_to_cpu(ocfi.icb.extLocation); + if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) + != old_inode->i_ino) + goto end_rename; + + nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); + if (nfi) { + if (!new_inode) { + if (nfibh.sbh != nfibh.ebh) + brelse(nfibh.ebh); + brelse(nfibh.sbh); + nfi = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + int offset = udf_ext0_offset(old_inode); + + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir(new_inode)) + goto end_rename; + } + retval = -EIO; + if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + dir_fi = udf_get_fileident( + old_iinfo->i_ext.i_data - + (old_iinfo->i_efe ? + sizeof(struct extendedFileEntry) : + sizeof(struct fileEntry)), + old_inode->i_sb->s_blocksize, &offset); + } else { + dir_bh = udf_bread(old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + dir_fi = udf_get_fileident(dir_bh->b_data, + old_inode->i_sb->s_blocksize, &offset); + } + if (!dir_fi) + goto end_rename; + tloc = lelb_to_cpu(dir_fi->icb.extLocation); + if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != + old_dir->i_ino) + goto end_rename; + + retval = -EMLINK; + if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS) + goto end_rename; + } + if (!nfi) { + nfi = udf_add_entry(new_dir, new_dentry, &nfibh, &ncfi, + &retval); + if (!nfi) + goto end_rename; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. 
+ */ + old_inode->i_ctime = current_fs_time(old_inode->i_sb); + mark_inode_dirty(old_inode); + + /* + * ok, that's it + */ + ncfi.fileVersionNum = ocfi.fileVersionNum; + ncfi.fileCharacteristics = ocfi.fileCharacteristics; + memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad)); + udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); + + /* The old fid may have moved - find it again */ + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); + udf_delete_entry(old_dir, ofi, &ofibh, &ocfi); + + if (new_inode) { + new_inode->i_ctime = current_fs_time(new_inode->i_sb); + inode_dec_link_count(new_inode); + } + old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb); + mark_inode_dirty(old_dir); + + if (dir_fi) { + dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); + udf_update_tag((char *)dir_fi, + (sizeof(struct fileIdentDesc) + + le16_to_cpu(dir_fi->lengthOfImpUse) + 3) & ~3); + if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + mark_inode_dirty(old_inode); + else + mark_buffer_dirty_inode(dir_bh, old_inode); + + inode_dec_link_count(old_dir); + if (new_inode) + inode_dec_link_count(new_inode); + else { + inc_nlink(new_dir); + mark_inode_dirty(new_dir); + } + } + + if (ofi) { + if (ofibh.sbh != ofibh.ebh) + brelse(ofibh.ebh); + brelse(ofibh.sbh); + } + + retval = 0; + +end_rename: + brelse(dir_bh); + if (nfi) { + if (nfibh.sbh != nfibh.ebh) + brelse(nfibh.ebh); + brelse(nfibh.sbh); + } + + return retval; +} + +static struct dentry *udf_get_parent(struct dentry *child) +{ + struct kernel_lb_addr tloc; + struct inode *inode = NULL; + struct qstr dotdot = {.name = "..", .len = 2}; + struct fileIdentDesc cfi; + struct udf_fileident_bh fibh; + + if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) + goto out_unlock; + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + + tloc = lelb_to_cpu(cfi.icb.extLocation); + inode = udf_iget(child->d_inode->i_sb, &tloc); + if (!inode) + goto out_unlock; + + return d_obtain_alias(inode); +out_unlock: + return ERR_PTR(-EACCES); +} + + +static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, + u16 partref, __u32 generation) +{ + struct inode *inode; + struct kernel_lb_addr loc; + + if (block == 0) + return ERR_PTR(-ESTALE); + + loc.logicalBlockNum = block; + loc.partitionReferenceNum = partref; + inode = udf_iget(sb, &loc); + + if (inode == NULL) + return ERR_PTR(-ENOMEM); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); +} + +static struct dentry *udf_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_len != 3 && fh_len != 5) || + (fh_type != FILEID_UDF_WITH_PARENT && + fh_type != FILEID_UDF_WITHOUT_PARENT)) + return NULL; + + return udf_nfs_get_inode(sb, fid->udf.block, fid->udf.partref, + fid->udf.generation); +} + +static struct dentry *udf_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT) + return NULL; + + return udf_nfs_get_inode(sb, fid->udf.parent_block, + fid->udf.parent_partref, + fid->udf.parent_generation); +} +static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, + int connectable) +{ + int len = *lenp; + struct inode *inode = de->d_inode; + struct kernel_lb_addr location = UDF_I(inode)->i_location; + struct fid *fid = (struct fid *)fh; + int type = FILEID_UDF_WITHOUT_PARENT; + + if (connectable && (len < 5)) { + *lenp = 5; + 
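+		/*
+		 * Note: the handle does not fit in the caller's buffer; *lenp
+		 * has been set to the required size (in 32-bit words) and the
+		 * 255 return value reports the overflow.  A plain handle takes
+		 * 3 words (block, partition reference, generation), a
+		 * connectable one 5.
+		 */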
return 255; + } else if (len < 3) { + *lenp = 3; + return 255; + } + + *lenp = 3; + fid->udf.block = location.logicalBlockNum; + fid->udf.partref = location.partitionReferenceNum; + fid->udf.generation = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + spin_lock(&de->d_lock); + inode = de->d_parent->d_inode; + location = UDF_I(inode)->i_location; + fid->udf.parent_block = location.logicalBlockNum; + fid->udf.parent_partref = location.partitionReferenceNum; + fid->udf.parent_generation = inode->i_generation; + spin_unlock(&de->d_lock); + *lenp = 5; + type = FILEID_UDF_WITH_PARENT; + } + + return type; +} + +const struct export_operations udf_export_ops = { + .encode_fh = udf_encode_fh, + .fh_to_dentry = udf_fh_to_dentry, + .fh_to_parent = udf_fh_to_parent, + .get_parent = udf_get_parent, +}; + +const struct inode_operations udf_dir_inode_operations = { + .lookup = udf_lookup, + .create = udf_create, + .link = udf_link, + .unlink = udf_unlink, + .symlink = udf_symlink, + .mkdir = udf_mkdir, + .rmdir = udf_rmdir, + .mknod = udf_mknod, + .rename = udf_rename, +}; +const struct inode_operations udf_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h new file mode 100644 index 00000000..fbff7465 --- /dev/null +++ b/fs/udf/osta_udf.h @@ -0,0 +1,279 @@ +/* + * osta_udf.h + * + * This file is based on OSTA UDF(tm) 2.50 (April 30, 2003) + * http://www.osta.org + * + * Copyright (c) 2001-2004 Ben Fennema + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU Public License ("GPL"). + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "ecma_167.h" + +#ifndef _OSTA_UDF_H +#define _OSTA_UDF_H 1 + +/* OSTA CS0 Charspec (UDF 2.50 2.1.2) */ +#define UDF_CHAR_SET_TYPE 0 +#define UDF_CHAR_SET_INFO "OSTA Compressed Unicode" + +/* Entity Identifier (UDF 2.50 2.1.5) */ +/* Identifiers (UDF 2.50 2.1.5.2) */ +#define UDF_ID_DEVELOPER "*Linux UDFFS" +#define UDF_ID_COMPLIANT "*OSTA UDF Compliant" +#define UDF_ID_LV_INFO "*UDF LV Info" +#define UDF_ID_FREE_EA "*UDF FreeEASpace" +#define UDF_ID_FREE_APP_EA "*UDF FreeAppEASpace" +#define UDF_ID_DVD_CGMS "*UDF DVD CGMS Info" +#define UDF_ID_OS2_EA "*UDF OS/2 EA" +#define UDF_ID_OS2_EA_LENGTH "*UDF OS/2 EALength" +#define UDF_ID_MAC_VOLUME "*UDF Mac VolumeInfo" +#define UDF_ID_MAC_FINDER "*UDF Mac FinderInfo" +#define UDF_ID_MAC_UNIQUE "*UDF Mac UniqueIDTable" +#define UDF_ID_MAC_RESOURCE "*UDF Mac ResourceFork" +#define UDF_ID_VIRTUAL "*UDF Virtual Partition" +#define UDF_ID_SPARABLE "*UDF Sparable Partition" +#define UDF_ID_ALLOC "*UDF Virtual Alloc Tbl" +#define UDF_ID_SPARING "*UDF Sparing Table" +#define UDF_ID_METADATA "*UDF Metadata Partition" + +/* Identifier Suffix (UDF 2.50 2.1.5.3) */ +#define IS_DF_HARD_WRITE_PROTECT 0x01 +#define IS_DF_SOFT_WRITE_PROTECT 0x02 + +struct UDFIdentSuffix { + __le16 UDFRevision; + uint8_t OSClass; + uint8_t OSIdentifier; + uint8_t reserved[4]; +} __attribute__ ((packed)); + +struct impIdentSuffix { + uint8_t OSClass; + uint8_t OSIdentifier; + uint8_t reserved[6]; +} __attribute__ ((packed)); + +struct appIdentSuffix { + uint8_t impUse[8]; +} __attribute__ ((packed)); + +/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ +/* Implementation Use (UDF 2.50 2.2.6.4) */ +struct logicalVolIntegrityDescImpUse { + struct regid impIdent; + __le32 numFiles; + __le32 numDirs; + __le16 minUDFReadRev; + __le16 minUDFWriteRev; + __le16 maxUDFWriteRev; + uint8_t impUse[0]; +} __attribute__ ((packed)); + +/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ +/* Implementation Use (UDF 2.50 2.2.7.2) */ +struct impUseVolDescImpUse { + struct charspec LVICharset; + dstring logicalVolIdent[128]; + dstring LVInfo1[36]; + dstring LVInfo2[36]; + dstring LVInfo3[36]; + struct regid impIdent; + uint8_t impUse[128]; +} __attribute__ ((packed)); + +struct udfPartitionMap2 { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; +} __attribute__ ((packed)); + +/* Virtual Partition Map (UDF 2.50 2.2.8) */ +struct virtualPartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; + uint8_t reserved2[24]; +} __attribute__ ((packed)); + +/* Sparable Partition Map (UDF 2.50 2.2.9) */ +struct sparablePartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; + __le16 packetLength; + uint8_t numSparingTables; + uint8_t reserved2[1]; + __le32 sizeSparingTable; + __le32 locSparingTable[4]; +} __attribute__ ((packed)); + +/* Metadata Partition Map (UDF 2.4.0 2.2.10) */ +struct metadataPartitionMap { + uint8_t partitionMapType; + uint8_t partitionMapLength; + uint8_t reserved1[2]; + struct regid partIdent; + __le16 volSeqNum; + __le16 partitionNum; + __le32 metadataFileLoc; + __le32 metadataMirrorFileLoc; + __le32 metadataBitmapFileLoc; + __le32 allocUnitSize; + __le16 alignUnitSize; + uint8_t flags; + uint8_t reserved2[5]; +} __attribute__ 
((packed)); + +/* Virtual Allocation Table (UDF 1.5 2.2.10) */ +struct virtualAllocationTable15 { + __le32 VirtualSector[0]; + struct regid vatIdent; + __le32 previousVATICBLoc; +} __attribute__ ((packed)); + +#define ICBTAG_FILE_TYPE_VAT15 0x00U + +/* Virtual Allocation Table (UDF 2.50 2.2.11) */ +struct virtualAllocationTable20 { + __le16 lengthHeader; + __le16 lengthImpUse; + dstring logicalVolIdent[128]; + __le32 previousVATICBLoc; + __le32 numFiles; + __le32 numDirs; + __le16 minReadRevision; + __le16 minWriteRevision; + __le16 maxWriteRevision; + __le16 reserved; + uint8_t impUse[0]; + __le32 vatEntry[0]; +} __attribute__ ((packed)); + +#define ICBTAG_FILE_TYPE_VAT20 0xF8U + +/* Sparing Table (UDF 2.50 2.2.12) */ +struct sparingEntry { + __le32 origLocation; + __le32 mappedLocation; +} __attribute__ ((packed)); + +struct sparingTable { + struct tag descTag; + struct regid sparingIdent; + __le16 reallocationTableLen; + __le16 reserved; + __le32 sequenceNum; + struct sparingEntry + mapEntry[0]; +} __attribute__ ((packed)); + +/* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */ +#define ICBTAG_FILE_TYPE_MAIN 0xFA +#define ICBTAG_FILE_TYPE_MIRROR 0xFB +#define ICBTAG_FILE_TYPE_BITMAP 0xFC + +/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ +struct allocDescImpUse { + __le16 flags; + uint8_t impUse[4]; +} __attribute__ ((packed)); + +#define AD_IU_EXT_ERASED 0x0001 + +/* Real-Time Files (UDF 2.50 6.11) */ +#define ICBTAG_FILE_TYPE_REALTIME 0xF9U + +/* Implementation Use Extended Attribute (UDF 2.50 3.3.4.5) */ +/* FreeEASpace (UDF 2.50 3.3.4.5.1.1) */ +struct freeEaSpace { + __le16 headerChecksum; + uint8_t freeEASpace[0]; +} __attribute__ ((packed)); + +/* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */ +struct DVDCopyrightImpUse { + __le16 headerChecksum; + uint8_t CGMSInfo; + uint8_t dataType; + uint8_t protectionSystemInfo[4]; +} __attribute__ ((packed)); + +/* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */ +/* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */ +struct freeAppEASpace { + __le16 headerChecksum; + uint8_t freeEASpace[0]; +} __attribute__ ((packed)); + +/* UDF Defined System Stream (UDF 2.50 3.3.7) */ +#define UDF_ID_UNIQUE_ID "*UDF Unique ID Mapping Data" +#define UDF_ID_NON_ALLOC "*UDF Non-Allocatable Space" +#define UDF_ID_POWER_CAL "*UDF Power Cal Table" +#define UDF_ID_BACKUP "*UDF Backup" + +/* Operating System Identifiers (UDF 2.50 6.3) */ +#define UDF_OS_CLASS_UNDEF 0x00U +#define UDF_OS_CLASS_DOS 0x01U +#define UDF_OS_CLASS_OS2 0x02U +#define UDF_OS_CLASS_MAC 0x03U +#define UDF_OS_CLASS_UNIX 0x04U +#define UDF_OS_CLASS_WIN9X 0x05U +#define UDF_OS_CLASS_WINNT 0x06U +#define UDF_OS_CLASS_OS400 0x07U +#define UDF_OS_CLASS_BEOS 0x08U +#define UDF_OS_CLASS_WINCE 0x09U + +#define UDF_OS_ID_UNDEF 0x00U +#define UDF_OS_ID_DOS 0x00U +#define UDF_OS_ID_OS2 0x00U +#define UDF_OS_ID_MAC 0x00U +#define UDF_OS_ID_MAX_OSX 0x01U +#define UDF_OS_ID_UNIX 0x00U +#define UDF_OS_ID_AIX 0x01U +#define UDF_OS_ID_SOLARIS 0x02U +#define UDF_OS_ID_HPUX 0x03U +#define UDF_OS_ID_IRIX 0x04U +#define UDF_OS_ID_LINUX 0x05U +#define UDF_OS_ID_MKLINUX 0x06U +#define UDF_OS_ID_FREEBSD 0x07U +#define UDF_OS_ID_WIN9X 0x00U +#define UDF_OS_ID_WINNT 0x00U +#define UDF_OS_ID_OS400 0x00U +#define UDF_OS_ID_BEOS 0x00U +#define UDF_OS_ID_WINCE 0x00U + +#endif /* _OSTA_UDF_H */ diff --git a/fs/udf/partition.c b/fs/udf/partition.c new file mode 100644 index 00000000..a71090ea --- /dev/null +++ b/fs/udf/partition.c @@ -0,0 +1,334 @@ +/* + * partition.c + * + * 
PURPOSE + * Partition handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2001 Ben Fennema + * + * HISTORY + * + * 12/06/98 blf Created file. + * + */ + +#include "udfdecl.h" +#include "udf_sb.h" +#include "udf_i.h" + +#include +#include +#include +#include + +uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + if (partition >= sbi->s_partitions) { + udf_debug("block=%d, partition=%d, offset=%d: " + "invalid partition\n", block, partition, offset); + return 0xFFFFFFFF; + } + map = &sbi->s_partmaps[partition]; + if (map->s_partition_func) + return map->s_partition_func(sb, block, partition, offset); + else + return map->s_partition_root + block + offset; +} + +uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct buffer_head *bh = NULL; + uint32_t newblock; + uint32_t index; + uint32_t loc; + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_virtual_data *vdata; + struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode); + + map = &sbi->s_partmaps[partition]; + vdata = &map->s_type_specific.s_virtual; + + if (block > vdata->s_num_entries) { + udf_debug("Trying to access block beyond end of VAT " + "(%d max %d)\n", block, vdata->s_num_entries); + return 0xFFFFFFFF; + } + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data + + vdata->s_start_offset))[block]); + goto translate; + } + index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t); + if (block >= index) { + block -= index; + newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t))); + index = block % (sb->s_blocksize / sizeof(uint32_t)); + } else { + newblock = 0; + index = vdata->s_start_offset / sizeof(uint32_t) + block; + } + + loc = udf_block_map(sbi->s_vat_inode, newblock); + + bh = sb_bread(sb, loc); + if (!bh) { + udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%d,%d) VAT: %d[%d]\n", + sb, block, partition, loc, index); + return 0xFFFFFFFF; + } + + loc = le32_to_cpu(((__le32 *)bh->b_data)[index]); + + brelse(bh); + +translate: + if (iinfo->i_location.partitionReferenceNum == partition) { + udf_debug("recursive call to udf_get_pblock!\n"); + return 0xFFFFFFFF; + } + + return udf_get_pblock(sb, loc, + iinfo->i_location.partitionReferenceNum, + offset); +} + +inline uint32_t udf_get_pblock_virt20(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + return udf_get_pblock_virt15(sb, block, partition, offset); +} + +uint32_t udf_get_pblock_spar15(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + int i; + struct sparingTable *st = NULL; + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + uint32_t packet; + struct udf_sparing_data *sdata; + + map = &sbi->s_partmaps[partition]; + sdata = &map->s_type_specific.s_sparing; + packet = (block + offset) & ~(sdata->s_packet_len - 1); + + for (i = 0; i < 4; i++) { + if (sdata->s_spar_map[i] != NULL) { + st = (struct sparingTable *) + sdata->s_spar_map[i]->b_data; + break; + } + } + + if (st) { + for (i = 0; i < le16_to_cpu(st->reallocationTableLen); i++) { + struct 
sparingEntry *entry = &st->mapEntry[i]; + u32 origLoc = le32_to_cpu(entry->origLocation); + if (origLoc >= 0xFFFFFFF0) + break; + else if (origLoc == packet) + return le32_to_cpu(entry->mappedLocation) + + ((block + offset) & + (sdata->s_packet_len - 1)); + else if (origLoc > packet) + break; + } + } + + return map->s_partition_root + block + offset; +} + +int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) +{ + struct udf_sparing_data *sdata; + struct sparingTable *st = NULL; + struct sparingEntry mapEntry; + uint32_t packet; + int i, j, k, l; + struct udf_sb_info *sbi = UDF_SB(sb); + u16 reallocationTableLen; + struct buffer_head *bh; + int ret = 0; + + mutex_lock(&sbi->s_alloc_mutex); + for (i = 0; i < sbi->s_partitions; i++) { + struct udf_part_map *map = &sbi->s_partmaps[i]; + if (old_block > map->s_partition_root && + old_block < map->s_partition_root + map->s_partition_len) { + sdata = &map->s_type_specific.s_sparing; + packet = (old_block - map->s_partition_root) & + ~(sdata->s_packet_len - 1); + + for (j = 0; j < 4; j++) + if (sdata->s_spar_map[j] != NULL) { + st = (struct sparingTable *) + sdata->s_spar_map[j]->b_data; + break; + } + + if (!st) { + ret = 1; + goto out; + } + + reallocationTableLen = + le16_to_cpu(st->reallocationTableLen); + for (k = 0; k < reallocationTableLen; k++) { + struct sparingEntry *entry = &st->mapEntry[k]; + u32 origLoc = le32_to_cpu(entry->origLocation); + + if (origLoc == 0xFFFFFFFF) { + for (; j < 4; j++) { + int len; + bh = sdata->s_spar_map[j]; + if (!bh) + continue; + + st = (struct sparingTable *) + bh->b_data; + entry->origLocation = + cpu_to_le32(packet); + len = + sizeof(struct sparingTable) + + reallocationTableLen * + sizeof(struct sparingEntry); + udf_update_tag((char *)st, len); + mark_buffer_dirty(bh); + } + *new_block = le32_to_cpu( + entry->mappedLocation) + + ((old_block - + map->s_partition_root) & + (sdata->s_packet_len - 1)); + ret = 0; + goto out; + } else if (origLoc == packet) { + *new_block = le32_to_cpu( + entry->mappedLocation) + + ((old_block - + map->s_partition_root) & + (sdata->s_packet_len - 1)); + ret = 0; + goto out; + } else if (origLoc > packet) + break; + } + + for (l = k; l < reallocationTableLen; l++) { + struct sparingEntry *entry = &st->mapEntry[l]; + u32 origLoc = le32_to_cpu(entry->origLocation); + + if (origLoc != 0xFFFFFFFF) + continue; + + for (; j < 4; j++) { + bh = sdata->s_spar_map[j]; + if (!bh) + continue; + + st = (struct sparingTable *)bh->b_data; + mapEntry = st->mapEntry[l]; + mapEntry.origLocation = + cpu_to_le32(packet); + memmove(&st->mapEntry[k + 1], + &st->mapEntry[k], + (l - k) * + sizeof(struct sparingEntry)); + st->mapEntry[k] = mapEntry; + udf_update_tag((char *)st, + sizeof(struct sparingTable) + + reallocationTableLen * + sizeof(struct sparingEntry)); + mark_buffer_dirty(bh); + } + *new_block = + le32_to_cpu( + st->mapEntry[k].mappedLocation) + + ((old_block - map->s_partition_root) & + (sdata->s_packet_len - 1)); + ret = 0; + goto out; + } + + ret = 1; + goto out; + } /* if old_block */ + } + + if (i == sbi->s_partitions) { + /* outside of partitions */ + /* for now, fail =) */ + ret = 1; + } + +out: + mutex_unlock(&sbi->s_alloc_mutex); + return ret; +} + +static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct super_block *sb = inode->i_sb; + struct udf_part_map *map; + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t ext_offset; + struct extent_position epos = {}; + uint32_t phyblock; 
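+	/*
+	 * Descriptive note: for a metadata partition the block number is an
+	 * offset into the metadata file, so the extent is looked up with
+	 * inode_bmap() and the resulting location is then translated through
+	 * the underlying sparable/physical partition.  0xFFFFFFFF is returned
+	 * on failure so that udf_get_pblock_meta25() can retry with the
+	 * mirror file.
+	 */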
+ + if (inode_bmap(inode, block, &epos, &eloc, &elen, &ext_offset) != + (EXT_RECORDED_ALLOCATED >> 30)) + phyblock = 0xFFFFFFFF; + else { + map = &UDF_SB(sb)->s_partmaps[partition]; + /* map to sparable/physical partition desc */ + phyblock = udf_get_pblock(sb, eloc.logicalBlockNum, + map->s_partition_num, ext_offset + offset); + } + + brelse(epos.bh); + return phyblock; +} + +uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_meta_data *mdata; + uint32_t retblk; + struct inode *inode; + + udf_debug("READING from METADATA\n"); + + map = &sbi->s_partmaps[partition]; + mdata = &map->s_type_specific.s_metadata; + inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe; + + /* We shouldn't mount such media... */ + BUG_ON(!inode); + retblk = udf_try_read_meta(inode, block, partition, offset); + if (retblk == 0xFFFFFFFF) { + udf_warning(sb, __func__, "error reading from METADATA, " + "trying to read from MIRROR"); + inode = mdata->s_mirror_fe; + if (!inode) + return 0xFFFFFFFF; + retblk = udf_try_read_meta(inode, block, partition, offset); + } + + return retblk; +} diff --git a/fs/udf/super.c b/fs/udf/super.c new file mode 100644 index 00000000..7f0e18aa --- /dev/null +++ b/fs/udf/super.c @@ -0,0 +1,2324 @@ +/* + * super.c + * + * PURPOSE + * Super block routines for the OSTA-UDF(tm) filesystem. + * + * DESCRIPTION + * OSTA-UDF(tm) = Optical Storage Technology Association + * Universal Disk Format. + * + * This code is based on version 2.00 of the UDF specification, + * and revision 3 of the ECMA 167 standard [equivalent to ISO 13346]. + * http://www.osta.org/ + * http://www.ecma.ch/ + * http://www.iso.org/ + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998 Dave Boynton + * (C) 1998-2004 Ben Fennema + * (C) 2000 Stelias Computing Inc + * + * HISTORY + * + * 09/24/98 dgb changed to allow compiling outside of kernel, and + * added some debugging. + * 10/01/98 dgb updated to allow (some) possibility of compiling w/2.0.34 + * 10/16/98 attempting some multi-session support + * 10/17/98 added freespace count for "df" + * 11/11/98 gr added novrs option + * 11/26/98 dgb added fileset,anchor mount options + * 12/06/98 blf really hosed things royally. vat/sparing support. sequenced + * vol descs. 
rewrote option handling based on isofs + * 12/20/98 find the free space bitmap (if it exists) + */ + +#include "udfdecl.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "udf_sb.h" +#include "udf_i.h" + +#include +#include + +#define VDS_POS_PRIMARY_VOL_DESC 0 +#define VDS_POS_UNALLOC_SPACE_DESC 1 +#define VDS_POS_LOGICAL_VOL_DESC 2 +#define VDS_POS_PARTITION_DESC 3 +#define VDS_POS_IMP_USE_VOL_DESC 4 +#define VDS_POS_VOL_DESC_PTR 5 +#define VDS_POS_TERMINATING_DESC 6 +#define VDS_POS_LENGTH 7 + +#define UDF_DEFAULT_BLOCKSIZE 2048 + +static char error_buf[1024]; + +/* These are the "meat" - everything else is stuffing */ +static int udf_fill_super(struct super_block *, void *, int); +static void udf_put_super(struct super_block *); +static int udf_sync_fs(struct super_block *, int); +static int udf_remount_fs(struct super_block *, int *, char *); +static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad); +static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *, + struct kernel_lb_addr *); +static void udf_load_fileset(struct super_block *, struct buffer_head *, + struct kernel_lb_addr *); +static void udf_open_lvid(struct super_block *); +static void udf_close_lvid(struct super_block *); +static unsigned int udf_count_free(struct super_block *); +static int udf_statfs(struct dentry *, struct kstatfs *); +static int udf_show_options(struct seq_file *, struct vfsmount *); +static void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...); + +struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) +{ + struct logicalVolIntegrityDesc *lvid = + (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; + __u32 number_of_partitions = le32_to_cpu(lvid->numOfPartitions); + __u32 offset = number_of_partitions * 2 * + sizeof(uint32_t)/sizeof(uint8_t); + return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]); +} + +/* UDF filesystem type */ +static struct dentry *udf_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super); +} + +static struct file_system_type udf_fstype = { + .owner = THIS_MODULE, + .name = "udf", + .mount = udf_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static struct kmem_cache *udf_inode_cachep; + +static struct inode *udf_alloc_inode(struct super_block *sb) +{ + struct udf_inode_info *ei; + ei = kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + + ei->i_unique = 0; + ei->i_lenExtents = 0; + ei->i_next_alloc_block = 0; + ei->i_next_alloc_goal = 0; + ei->i_strat4096 = 0; + init_rwsem(&ei->i_data_sem); + + return &ei->vfs_inode; +} + +static void udf_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(udf_inode_cachep, UDF_I(inode)); +} + +static void udf_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, udf_i_callback); +} + +static void init_once(void *foo) +{ + struct udf_inode_info *ei = (struct udf_inode_info *)foo; + + ei->i_ext.i_data = NULL; + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + udf_inode_cachep = kmem_cache_create("udf_inode_cache", + sizeof(struct udf_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + 
init_once); + if (!udf_inode_cachep) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(udf_inode_cachep); +} + +/* Superblock operations */ +static const struct super_operations udf_sb_ops = { + .alloc_inode = udf_alloc_inode, + .destroy_inode = udf_destroy_inode, + .write_inode = udf_write_inode, + .evict_inode = udf_evict_inode, + .put_super = udf_put_super, + .sync_fs = udf_sync_fs, + .statfs = udf_statfs, + .remount_fs = udf_remount_fs, + .show_options = udf_show_options, +}; + +struct udf_options { + unsigned char novrs; + unsigned int blocksize; + unsigned int session; + unsigned int lastblock; + unsigned int anchor; + unsigned int volume; + unsigned short partition; + unsigned int fileset; + unsigned int rootdir; + unsigned int flags; + mode_t umask; + gid_t gid; + uid_t uid; + mode_t fmode; + mode_t dmode; + struct nls_table *nls_map; +}; + +static int __init init_udf_fs(void) +{ + int err; + + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&udf_fstype); + if (err) + goto out; + + return 0; + +out: + destroy_inodecache(); + +out1: + return err; +} + +static void __exit exit_udf_fs(void) +{ + unregister_filesystem(&udf_fstype); + destroy_inodecache(); +} + +module_init(init_udf_fs) +module_exit(exit_udf_fs) + +static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + + sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), + GFP_KERNEL); + if (!sbi->s_partmaps) { + udf_error(sb, __func__, + "Unable to allocate space for %d partition maps", + count); + sbi->s_partitions = 0; + return -ENOMEM; + } + + sbi->s_partitions = count; + return 0; +} + +static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct super_block *sb = mnt->mnt_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) + seq_puts(seq, ",nostrict"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET)) + seq_printf(seq, ",bs=%lu", sb->s_blocksize); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) + seq_puts(seq, ",unhide"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) + seq_puts(seq, ",undelete"); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_USE_AD_IN_ICB)) + seq_puts(seq, ",noadinicb"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_USE_SHORT_AD)) + seq_puts(seq, ",shortad"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET)) + seq_puts(seq, ",uid=forget"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_IGNORE)) + seq_puts(seq, ",uid=ignore"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET)) + seq_puts(seq, ",gid=forget"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE)) + seq_puts(seq, ",gid=ignore"); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) + seq_printf(seq, ",uid=%u", sbi->s_uid); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) + seq_printf(seq, ",gid=%u", sbi->s_gid); + if (sbi->s_umask != 0) + seq_printf(seq, ",umask=%o", sbi->s_umask); + if (sbi->s_fmode != UDF_INVALID_MODE) + seq_printf(seq, ",mode=%o", sbi->s_fmode); + if (sbi->s_dmode != UDF_INVALID_MODE) + seq_printf(seq, ",dmode=%o", sbi->s_dmode); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) + seq_printf(seq, ",session=%u", sbi->s_session); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) + seq_printf(seq, ",lastblock=%u", sbi->s_last_block); + if (sbi->s_anchor != 0) + seq_printf(seq, ",anchor=%u", sbi->s_anchor); + /* + * volume, partition, fileset and rootdir seem to be ignored + * currently + */ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) + seq_puts(seq, ",utf8"); + if (UDF_QUERY_FLAG(sb, 
UDF_FLAG_NLS_MAP) && sbi->s_nls_map)
+		seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset);
+
+	return 0;
+}
+
+/*
+ * udf_parse_options
+ *
+ * PURPOSE
+ *	Parse mount options.
+ *
+ * DESCRIPTION
+ *	The following mount options are supported:
+ *
+ *	gid=		Set the default group.
+ *	umask=		Set the default umask.
+ *	mode=		Set the default file permissions.
+ *	dmode=		Set the default directory permissions.
+ *	uid=		Set the default user.
+ *	bs=		Set the block size.
+ *	unhide		Show otherwise hidden files.
+ *	undelete	Show deleted files in lists.
+ *	adinicb		Embed data in the inode (default)
+ *	noadinicb	Don't embed data in the inode
+ *	shortad		Use short ad's
+ *	longad		Use long ad's (default)
+ *	nostrict	Unset strict conformance
+ *	iocharset=	Set the NLS character set
+ *
+ *	The remaining are for debugging and disaster recovery:
+ *
+ *	novrs		Skip volume sequence recognition
+ *
+ *	The following expect an offset from 0.
+ *
+ *	session=	Set the CDROM session (default= last session)
+ *	anchor=		Override standard anchor location. (default= 256)
+ *	volume=		Override the VolumeDesc location. (unused)
+ *	partition=	Override the PartitionDesc location. (unused)
+ *	lastblock=	Set the last block of the filesystem.
+ *
+ *	The following expect an offset from the partition root.
+ *
+ *	fileset=	Override the fileset block location. (unused)
+ *	rootdir=	Override the root directory location. (unused)
+ *		WARNING: overriding the rootdir to a non-directory may
+ *		yield highly unpredictable results.
+ *
+ * PRE-CONDITIONS
+ *	options		Pointer to mount options string.
+ *	uopts		Pointer to mount options variable.
+ *
+ * POST-CONDITIONS
+ *	1	Mount options parsed okay.
+ *	0	Error parsing mount options.
+ *
+ * HISTORY
+ *	July 1, 1997 - Andrew E. Mileski
+ *	Written, tested, and released.
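+ *
+ *	Example (illustrative only; the device, mount point and option values
+ *	are arbitrary):
+ *
+ *		mount -t udf -o iocharset=utf8,umask=0022,unhide /dev/sr0 /mnt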
+ */ + +enum { + Opt_novrs, Opt_nostrict, Opt_bs, Opt_unhide, Opt_undelete, + Opt_noadinicb, Opt_adinicb, Opt_shortad, Opt_longad, + Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, + Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, + Opt_rootdir, Opt_utf8, Opt_iocharset, + Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore, + Opt_fmode, Opt_dmode +}; + +static const match_table_t tokens = { + {Opt_novrs, "novrs"}, + {Opt_nostrict, "nostrict"}, + {Opt_bs, "bs=%u"}, + {Opt_unhide, "unhide"}, + {Opt_undelete, "undelete"}, + {Opt_noadinicb, "noadinicb"}, + {Opt_adinicb, "adinicb"}, + {Opt_shortad, "shortad"}, + {Opt_longad, "longad"}, + {Opt_uforget, "uid=forget"}, + {Opt_uignore, "uid=ignore"}, + {Opt_gforget, "gid=forget"}, + {Opt_gignore, "gid=ignore"}, + {Opt_gid, "gid=%u"}, + {Opt_uid, "uid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_session, "session=%u"}, + {Opt_lastblock, "lastblock=%u"}, + {Opt_anchor, "anchor=%u"}, + {Opt_volume, "volume=%u"}, + {Opt_partition, "partition=%u"}, + {Opt_fileset, "fileset=%u"}, + {Opt_rootdir, "rootdir=%u"}, + {Opt_utf8, "utf8"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_fmode, "mode=%o"}, + {Opt_dmode, "dmode=%o"}, + {Opt_err, NULL} +}; + +static int udf_parse_options(char *options, struct udf_options *uopt, + bool remount) +{ + char *p; + int option; + + uopt->novrs = 0; + uopt->partition = 0xFFFF; + uopt->session = 0xFFFFFFFF; + uopt->lastblock = 0; + uopt->anchor = 0; + uopt->volume = 0xFFFFFFFF; + uopt->rootdir = 0xFFFFFFFF; + uopt->fileset = 0xFFFFFFFF; + uopt->nls_map = NULL; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_novrs: + uopt->novrs = 1; + break; + case Opt_bs: + if (match_int(&args[0], &option)) + return 0; + uopt->blocksize = option; + uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); + break; + case Opt_unhide: + uopt->flags |= (1 << UDF_FLAG_UNHIDE); + break; + case Opt_undelete: + uopt->flags |= (1 << UDF_FLAG_UNDELETE); + break; + case Opt_noadinicb: + uopt->flags &= ~(1 << UDF_FLAG_USE_AD_IN_ICB); + break; + case Opt_adinicb: + uopt->flags |= (1 << UDF_FLAG_USE_AD_IN_ICB); + break; + case Opt_shortad: + uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD); + break; + case Opt_longad: + uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD); + break; + case Opt_gid: + if (match_int(args, &option)) + return 0; + uopt->gid = option; + uopt->flags |= (1 << UDF_FLAG_GID_SET); + break; + case Opt_uid: + if (match_int(args, &option)) + return 0; + uopt->uid = option; + uopt->flags |= (1 << UDF_FLAG_UID_SET); + break; + case Opt_umask: + if (match_octal(args, &option)) + return 0; + uopt->umask = option; + break; + case Opt_nostrict: + uopt->flags &= ~(1 << UDF_FLAG_STRICT); + break; + case Opt_session: + if (match_int(args, &option)) + return 0; + uopt->session = option; + if (!remount) + uopt->flags |= (1 << UDF_FLAG_SESSION_SET); + break; + case Opt_lastblock: + if (match_int(args, &option)) + return 0; + uopt->lastblock = option; + if (!remount) + uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET); + break; + case Opt_anchor: + if (match_int(args, &option)) + return 0; + uopt->anchor = option; + break; + case Opt_volume: + if (match_int(args, &option)) + return 0; + uopt->volume = option; + break; + case Opt_partition: + if (match_int(args, &option)) + return 0; + uopt->partition = option; + break; + case Opt_fileset: + if (match_int(args, &option)) + return 0; + 
uopt->fileset = option; + break; + case Opt_rootdir: + if (match_int(args, &option)) + return 0; + uopt->rootdir = option; + break; + case Opt_utf8: + uopt->flags |= (1 << UDF_FLAG_UTF8); + break; +#ifdef CONFIG_UDF_NLS + case Opt_iocharset: + uopt->nls_map = load_nls(args[0].from); + uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + break; +#endif + case Opt_uignore: + uopt->flags |= (1 << UDF_FLAG_UID_IGNORE); + break; + case Opt_uforget: + uopt->flags |= (1 << UDF_FLAG_UID_FORGET); + break; + case Opt_gignore: + uopt->flags |= (1 << UDF_FLAG_GID_IGNORE); + break; + case Opt_gforget: + uopt->flags |= (1 << UDF_FLAG_GID_FORGET); + break; + case Opt_fmode: + if (match_octal(args, &option)) + return 0; + uopt->fmode = option & 0777; + break; + case Opt_dmode: + if (match_octal(args, &option)) + return 0; + uopt->dmode = option & 0777; + break; + default: + printk(KERN_ERR "udf: bad mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + return 1; +} + +static int udf_remount_fs(struct super_block *sb, int *flags, char *options) +{ + struct udf_options uopt; + struct udf_sb_info *sbi = UDF_SB(sb); + int error = 0; + + uopt.flags = sbi->s_flags; + uopt.uid = sbi->s_uid; + uopt.gid = sbi->s_gid; + uopt.umask = sbi->s_umask; + uopt.fmode = sbi->s_fmode; + uopt.dmode = sbi->s_dmode; + + if (!udf_parse_options(options, &uopt, true)) + return -EINVAL; + + write_lock(&sbi->s_cred_lock); + sbi->s_flags = uopt.flags; + sbi->s_uid = uopt.uid; + sbi->s_gid = uopt.gid; + sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; + write_unlock(&sbi->s_cred_lock); + + if (sbi->s_lvid_bh) { + int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); + if (write_rev > UDF_MAX_WRITE_VERSION) + *flags |= MS_RDONLY; + } + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out_unlock; + + if (*flags & MS_RDONLY) + udf_close_lvid(sb); + else + udf_open_lvid(sb); + +out_unlock: + return error; +} + +/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ +/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ +static loff_t udf_check_vsd(struct super_block *sb) +{ + struct volStructDesc *vsd = NULL; + loff_t sector = 32768; + int sectorsize; + struct buffer_head *bh = NULL; + int nsr02 = 0; + int nsr03 = 0; + struct udf_sb_info *sbi; + + sbi = UDF_SB(sb); + if (sb->s_blocksize < sizeof(struct volStructDesc)) + sectorsize = sizeof(struct volStructDesc); + else + sectorsize = sb->s_blocksize; + + sector += (sbi->s_session << sb->s_blocksize_bits); + + udf_debug("Starting at sector %u (%ld byte sectors)\n", + (unsigned int)(sector >> sb->s_blocksize_bits), + sb->s_blocksize); + /* Process the sequence (if applicable) */ + for (; !nsr02 && !nsr03; sector += sectorsize) { + /* Read a block */ + bh = udf_tread(sb, sector >> sb->s_blocksize_bits); + if (!bh) + break; + + /* Look for ISO descriptors */ + vsd = (struct volStructDesc *)(bh->b_data + + (sector & (sb->s_blocksize - 1))); + + if (vsd->stdIdent[0] == 0) { + brelse(bh); + break; + } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, + VSD_STD_ID_LEN)) { + switch (vsd->structType) { + case 0: + udf_debug("ISO9660 Boot Record found\n"); + break; + case 1: + udf_debug("ISO9660 Primary Volume Descriptor " + "found\n"); + break; + case 2: + udf_debug("ISO9660 Supplementary Volume " + "Descriptor found\n"); + break; + case 3: + udf_debug("ISO9660 Volume Partition Descriptor " + "found\n"); + break; + case 255: + udf_debug("ISO9660 Volume Descriptor Set " + "Terminator found\n"); + break; + 
default: + udf_debug("ISO9660 VRS (%u) found\n", + vsd->structType); + break; + } + } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BEA01, + VSD_STD_ID_LEN)) + ; /* nothing */ + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_TEA01, + VSD_STD_ID_LEN)) { + brelse(bh); + break; + } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR02, + VSD_STD_ID_LEN)) + nsr02 = sector; + else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR03, + VSD_STD_ID_LEN)) + nsr03 = sector; + brelse(bh); + } + + if (nsr03) + return nsr03; + else if (nsr02) + return nsr02; + else if (sector - (sbi->s_session << sb->s_blocksize_bits) == 32768) + return -1; + else + return 0; +} + +static int udf_find_fileset(struct super_block *sb, + struct kernel_lb_addr *fileset, + struct kernel_lb_addr *root) +{ + struct buffer_head *bh = NULL; + long lastblock; + uint16_t ident; + struct udf_sb_info *sbi; + + if (fileset->logicalBlockNum != 0xFFFFFFFF || + fileset->partitionReferenceNum != 0xFFFF) { + bh = udf_read_ptagged(sb, fileset, 0, &ident); + + if (!bh) { + return 1; + } else if (ident != TAG_IDENT_FSD) { + brelse(bh); + return 1; + } + + } + + sbi = UDF_SB(sb); + if (!bh) { + /* Search backwards through the partitions */ + struct kernel_lb_addr newfileset; + +/* --> cvg: FIXME - is it reasonable? */ + return 1; + + for (newfileset.partitionReferenceNum = sbi->s_partitions - 1; + (newfileset.partitionReferenceNum != 0xFFFF && + fileset->logicalBlockNum == 0xFFFFFFFF && + fileset->partitionReferenceNum == 0xFFFF); + newfileset.partitionReferenceNum--) { + lastblock = sbi->s_partmaps + [newfileset.partitionReferenceNum] + .s_partition_len; + newfileset.logicalBlockNum = 0; + + do { + bh = udf_read_ptagged(sb, &newfileset, 0, + &ident); + if (!bh) { + newfileset.logicalBlockNum++; + continue; + } + + switch (ident) { + case TAG_IDENT_SBD: + { + struct spaceBitmapDesc *sp; + sp = (struct spaceBitmapDesc *) + bh->b_data; + newfileset.logicalBlockNum += 1 + + ((le32_to_cpu(sp->numOfBytes) + + sizeof(struct spaceBitmapDesc) + - 1) >> sb->s_blocksize_bits); + brelse(bh); + break; + } + case TAG_IDENT_FSD: + *fileset = newfileset; + break; + default: + newfileset.logicalBlockNum++; + brelse(bh); + bh = NULL; + break; + } + } while (newfileset.logicalBlockNum < lastblock && + fileset->logicalBlockNum == 0xFFFFFFFF && + fileset->partitionReferenceNum == 0xFFFF); + } + } + + if ((fileset->logicalBlockNum != 0xFFFFFFFF || + fileset->partitionReferenceNum != 0xFFFF) && bh) { + udf_debug("Fileset at block=%d, partition=%d\n", + fileset->logicalBlockNum, + fileset->partitionReferenceNum); + + sbi->s_partition = fileset->partitionReferenceNum; + udf_load_fileset(sb, bh, root); + brelse(bh); + return 0; + } + return 1; +} + +static int udf_load_pvoldesc(struct super_block *sb, sector_t block) +{ + struct primaryVolDesc *pvoldesc; + struct ustr *instr, *outstr; + struct buffer_head *bh; + uint16_t ident; + int ret = 1; + + instr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!instr) + return 1; + + outstr = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!outstr) + goto out1; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + goto out2; + + BUG_ON(ident != TAG_IDENT_PVD); + + pvoldesc = (struct primaryVolDesc *)bh->b_data; + + if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, + pvoldesc->recordingDateAndTime)) { +#ifdef UDFFS_DEBUG + struct timestamp *ts = &pvoldesc->recordingDateAndTime; + udf_debug("recording time %04u/%02u/%02u" + " %02u:%02u (%x)\n", + le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, + ts->minute, 
le16_to_cpu(ts->typeAndTimezone)); +#endif + } + + if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) + if (udf_CS0toUTF8(outstr, instr)) { + strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, + outstr->u_len > 31 ? 31 : outstr->u_len); + udf_debug("volIdent[] = '%s'\n", + UDF_SB(sb)->s_volume_ident); + } + + if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) + if (udf_CS0toUTF8(outstr, instr)) + udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); + + brelse(bh); + ret = 0; +out2: + kfree(outstr); +out1: + kfree(instr); + return ret; +} + +static int udf_load_metadata_files(struct super_block *sb, int partition) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_meta_data *mdata; + struct kernel_lb_addr addr; + int fe_error = 0; + + map = &sbi->s_partmaps[partition]; + mdata = &map->s_type_specific.s_metadata; + + /* metadata address */ + addr.logicalBlockNum = mdata->s_meta_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Metadata file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + mdata->s_metadata_fe = udf_iget(sb, &addr); + + if (mdata->s_metadata_fe == NULL) { + udf_warning(sb, __func__, "metadata inode efe not found, " + "will try mirror inode."); + fe_error = 1; + } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type != + ICBTAG_FLAG_AD_SHORT) { + udf_warning(sb, __func__, "metadata inode efe does not have " + "short allocation descriptors!"); + fe_error = 1; + iput(mdata->s_metadata_fe); + mdata->s_metadata_fe = NULL; + } + + /* mirror file entry */ + addr.logicalBlockNum = mdata->s_mirror_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Mirror metadata file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + mdata->s_mirror_fe = udf_iget(sb, &addr); + + if (mdata->s_mirror_fe == NULL) { + if (fe_error) { + udf_error(sb, __func__, "mirror inode efe not found " + "and metadata inode is missing too, exiting..."); + goto error_exit; + } else + udf_warning(sb, __func__, "mirror inode efe not found," + " but metadata inode is OK"); + } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type != + ICBTAG_FLAG_AD_SHORT) { + udf_warning(sb, __func__, "mirror inode efe does not have " + "short allocation descriptors!"); + iput(mdata->s_mirror_fe); + mdata->s_mirror_fe = NULL; + if (fe_error) + goto error_exit; + } + + /* + * bitmap file entry + * Note: + * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102) + */ + if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) { + addr.logicalBlockNum = mdata->s_bitmap_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Bitmap file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + mdata->s_bitmap_fe = udf_iget(sb, &addr); + + if (mdata->s_bitmap_fe == NULL) { + if (sb->s_flags & MS_RDONLY) + udf_warning(sb, __func__, "bitmap inode efe " + "not found but it's ok since the disc" + " is mounted read-only"); + else { + udf_error(sb, __func__, "bitmap inode efe not " + "found and attempted read-write mount"); + goto error_exit; + } + } + } + + udf_debug("udf_load_metadata_files Ok\n"); + + return 0; + +error_exit: + return 1; +} + +static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, + struct kernel_lb_addr *root) +{ + struct fileSetDesc *fset; + + fset = (struct fileSetDesc *)bh->b_data; + + *root = lelb_to_cpu(fset->rootDirectoryICB.extLocation); + + UDF_SB(sb)->s_serial_number = 
le16_to_cpu(fset->descTag.tagSerialNum); + + udf_debug("Rootdir at block=%d, partition=%d\n", + root->logicalBlockNum, root->partitionReferenceNum); +} + +int udf_compute_nr_groups(struct super_block *sb, u32 partition) +{ + struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + return DIV_ROUND_UP(map->s_partition_len + + (sizeof(struct spaceBitmapDesc) << 3), + sb->s_blocksize * 8); +} + +static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) +{ + struct udf_bitmap *bitmap; + int nr_groups; + int size; + + nr_groups = udf_compute_nr_groups(sb, index); + size = sizeof(struct udf_bitmap) + + (sizeof(struct buffer_head *) * nr_groups); + + if (size <= PAGE_SIZE) + bitmap = kzalloc(size, GFP_KERNEL); + else + bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ + + if (bitmap == NULL) { + udf_error(sb, __func__, + "Unable to allocate space for bitmap " + "and %d buffer_head pointers", nr_groups); + return NULL; + } + + bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); + bitmap->s_nr_groups = nr_groups; + return bitmap; +} + +static int udf_fill_partdesc_info(struct super_block *sb, + struct partitionDesc *p, int p_index) +{ + struct udf_part_map *map; + struct udf_sb_info *sbi = UDF_SB(sb); + struct partitionHeaderDesc *phd; + + map = &sbi->s_partmaps[p_index]; + + map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */ + map->s_partition_root = le32_to_cpu(p->partitionStartingLocation); + + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY)) + map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE)) + map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE)) + map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) + map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; + + udf_debug("Partition (%d type %x) starts at physical %d, " + "block length %d\n", p_index, + map->s_partition_type, map->s_partition_root, + map->s_partition_len); + + if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && + strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) + return 0; + + phd = (struct partitionHeaderDesc *)p->partitionContentsUse; + if (phd->unallocSpaceTable.extLength) { + struct kernel_lb_addr loc = { + .logicalBlockNum = le32_to_cpu( + phd->unallocSpaceTable.extPosition), + .partitionReferenceNum = p_index, + }; + + map->s_uspace.s_table = udf_iget(sb, &loc); + if (!map->s_uspace.s_table) { + udf_debug("cannot load unallocSpaceTable (part %d)\n", + p_index); + return 1; + } + map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; + udf_debug("unallocSpaceTable (part %d) @ %ld\n", + p_index, map->s_uspace.s_table->i_ino); + } + + if (phd->unallocSpaceBitmap.extLength) { + struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); + if (!bitmap) + return 1; + map->s_uspace.s_bitmap = bitmap; + bitmap->s_extLength = le32_to_cpu( + phd->unallocSpaceBitmap.extLength); + bitmap->s_extPosition = le32_to_cpu( + phd->unallocSpaceBitmap.extPosition); + map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; + udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index, + bitmap->s_extPosition); + } + + if (phd->partitionIntegrityTable.extLength) + udf_debug("partitionIntegrityTable (part %d)\n", p_index); + + if (phd->freedSpaceTable.extLength) { + struct kernel_lb_addr loc = { + .logicalBlockNum = le32_to_cpu( + 
phd->freedSpaceTable.extPosition), + .partitionReferenceNum = p_index, + }; + + map->s_fspace.s_table = udf_iget(sb, &loc); + if (!map->s_fspace.s_table) { + udf_debug("cannot load freedSpaceTable (part %d)\n", + p_index); + return 1; + } + + map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; + udf_debug("freedSpaceTable (part %d) @ %ld\n", + p_index, map->s_fspace.s_table->i_ino); + } + + if (phd->freedSpaceBitmap.extLength) { + struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); + if (!bitmap) + return 1; + map->s_fspace.s_bitmap = bitmap; + bitmap->s_extLength = le32_to_cpu( + phd->freedSpaceBitmap.extLength); + bitmap->s_extPosition = le32_to_cpu( + phd->freedSpaceBitmap.extPosition); + map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; + udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index, + bitmap->s_extPosition); + } + return 0; +} + +static void udf_find_vat_block(struct super_block *sb, int p_index, + int type1_index, sector_t start_block) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; + sector_t vat_block; + struct kernel_lb_addr ino; + + /* + * VAT file entry is in the last recorded block. Some broken disks have + * it a few blocks before so try a bit harder... + */ + ino.partitionReferenceNum = type1_index; + for (vat_block = start_block; + vat_block >= map->s_partition_root && + vat_block >= start_block - 3 && + !sbi->s_vat_inode; vat_block--) { + ino.logicalBlockNum = vat_block - map->s_partition_root; + sbi->s_vat_inode = udf_iget(sb, &ino); + } +} + +static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; + struct buffer_head *bh = NULL; + struct udf_inode_info *vati; + uint32_t pos; + struct virtualAllocationTable20 *vat20; + sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + + udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); + if (!sbi->s_vat_inode && + sbi->s_last_block != blocks - 1) { + printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" + " last recorded block (%lu), retrying with the last " + "block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); + udf_find_vat_block(sb, p_index, type1_index, blocks - 1); + } + if (!sbi->s_vat_inode) + return 1; + + if (map->s_partition_type == UDF_VIRTUAL_MAP15) { + map->s_type_specific.s_virtual.s_start_offset = 0; + map->s_type_specific.s_virtual.s_num_entries = + (sbi->s_vat_inode->i_size - 36) >> 2; + } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) { + vati = UDF_I(sbi->s_vat_inode); + if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + pos = udf_block_map(sbi->s_vat_inode, 0); + bh = sb_bread(sb, pos); + if (!bh) + return 1; + vat20 = (struct virtualAllocationTable20 *)bh->b_data; + } else { + vat20 = (struct virtualAllocationTable20 *) + vati->i_ext.i_data; + } + + map->s_type_specific.s_virtual.s_start_offset = + le16_to_cpu(vat20->lengthHeader); + map->s_type_specific.s_virtual.s_num_entries = + (sbi->s_vat_inode->i_size - + map->s_type_specific.s_virtual. 
+ s_start_offset) >> 2; + brelse(bh); + } + return 0; +} + +static int udf_load_partdesc(struct super_block *sb, sector_t block) +{ + struct buffer_head *bh; + struct partitionDesc *p; + struct udf_part_map *map; + struct udf_sb_info *sbi = UDF_SB(sb); + int i, type1_idx; + uint16_t partitionNumber; + uint16_t ident; + int ret = 0; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 1; + if (ident != TAG_IDENT_PD) + goto out_bh; + + p = (struct partitionDesc *)bh->b_data; + partitionNumber = le16_to_cpu(p->partitionNumber); + + /* First scan for TYPE1, SPARABLE and METADATA partitions */ + for (i = 0; i < sbi->s_partitions; i++) { + map = &sbi->s_partmaps[i]; + udf_debug("Searching map: (%d == %d)\n", + map->s_partition_num, partitionNumber); + if (map->s_partition_num == partitionNumber && + (map->s_partition_type == UDF_TYPE1_MAP15 || + map->s_partition_type == UDF_SPARABLE_MAP15)) + break; + } + + if (i >= sbi->s_partitions) { + udf_debug("Partition (%d) not found in partition map\n", + partitionNumber); + goto out_bh; + } + + ret = udf_fill_partdesc_info(sb, p, i); + + /* + * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and + * PHYSICAL partitions are already set up + */ + type1_idx = i; + for (i = 0; i < sbi->s_partitions; i++) { + map = &sbi->s_partmaps[i]; + + if (map->s_partition_num == partitionNumber && + (map->s_partition_type == UDF_VIRTUAL_MAP15 || + map->s_partition_type == UDF_VIRTUAL_MAP20 || + map->s_partition_type == UDF_METADATA_MAP25)) + break; + } + + if (i >= sbi->s_partitions) + goto out_bh; + + ret = udf_fill_partdesc_info(sb, p, i); + if (ret) + goto out_bh; + + if (map->s_partition_type == UDF_METADATA_MAP25) { + ret = udf_load_metadata_files(sb, i); + if (ret) { + printk(KERN_ERR "UDF-fs: error loading MetaData " + "partition map %d\n", i); + goto out_bh; + } + } else { + ret = udf_load_vat(sb, i, type1_idx); + if (ret) + goto out_bh; + /* + * Mark filesystem read-only if we have a partition with + * virtual map since we don't handle writing to it (we + * overwrite blocks instead of relocating them). 
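+	 * A virtual (VAT) partition typically lives on write-once media
+	 * such as CD-R, where updating a block really means writing a new
+	 * block and remapping it in the VAT; since we do not do that
+	 * remapping, a read-write mount could not be honoured safely.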
+ */ + sb->s_flags |= MS_RDONLY; + printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only " + "because writing to pseudooverwrite partition is " + "not implemented.\n"); + } +out_bh: + /* In case loading failed, we handle cleanup in udf_fill_super */ + brelse(bh); + return ret; +} + +static int udf_load_logicalvol(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) +{ + struct logicalVolDesc *lvd; + int i, j, offset; + uint8_t type; + struct udf_sb_info *sbi = UDF_SB(sb); + struct genericPartitionMap *gpm; + uint16_t ident; + struct buffer_head *bh; + int ret = 0; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 1; + BUG_ON(ident != TAG_IDENT_LVD); + lvd = (struct logicalVolDesc *)bh->b_data; + + i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); + if (i != 0) { + ret = i; + goto out_bh; + } + + for (i = 0, offset = 0; + i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength); + i++, offset += gpm->partitionMapLength) { + struct udf_part_map *map = &sbi->s_partmaps[i]; + gpm = (struct genericPartitionMap *) + &(lvd->partitionMaps[offset]); + type = gpm->partitionMapType; + if (type == 1) { + struct genericPartitionMap1 *gpm1 = + (struct genericPartitionMap1 *)gpm; + map->s_partition_type = UDF_TYPE1_MAP15; + map->s_volumeseqnum = le16_to_cpu(gpm1->volSeqNum); + map->s_partition_num = le16_to_cpu(gpm1->partitionNum); + map->s_partition_func = NULL; + } else if (type == 2) { + struct udfPartitionMap2 *upm2 = + (struct udfPartitionMap2 *)gpm; + if (!strncmp(upm2->partIdent.ident, UDF_ID_VIRTUAL, + strlen(UDF_ID_VIRTUAL))) { + u16 suf = + le16_to_cpu(((__le16 *)upm2->partIdent. + identSuffix)[0]); + if (suf < 0x0200) { + map->s_partition_type = + UDF_VIRTUAL_MAP15; + map->s_partition_func = + udf_get_pblock_virt15; + } else { + map->s_partition_type = + UDF_VIRTUAL_MAP20; + map->s_partition_func = + udf_get_pblock_virt20; + } + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_SPARABLE, + strlen(UDF_ID_SPARABLE))) { + uint32_t loc; + struct sparingTable *st; + struct sparablePartitionMap *spm = + (struct sparablePartitionMap *)gpm; + + map->s_partition_type = UDF_SPARABLE_MAP15; + map->s_type_specific.s_sparing.s_packet_len = + le16_to_cpu(spm->packetLength); + for (j = 0; j < spm->numSparingTables; j++) { + struct buffer_head *bh2; + + loc = le32_to_cpu( + spm->locSparingTable[j]); + bh2 = udf_read_tagged(sb, loc, loc, + &ident); + map->s_type_specific.s_sparing. + s_spar_map[j] = bh2; + + if (bh2 == NULL) + continue; + + st = (struct sparingTable *)bh2->b_data; + if (ident != 0 || strncmp( + st->sparingIdent.ident, + UDF_ID_SPARING, + strlen(UDF_ID_SPARING))) { + brelse(bh2); + map->s_type_specific.s_sparing. 
+ s_spar_map[j] = NULL; + } + } + map->s_partition_func = udf_get_pblock_spar15; + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_METADATA, + strlen(UDF_ID_METADATA))) { + struct udf_meta_data *mdata = + &map->s_type_specific.s_metadata; + struct metadataPartitionMap *mdm = + (struct metadataPartitionMap *) + &(lvd->partitionMaps[offset]); + udf_debug("Parsing Logical vol part %d " + "type %d id=%s\n", i, type, + UDF_ID_METADATA); + + map->s_partition_type = UDF_METADATA_MAP25; + map->s_partition_func = udf_get_pblock_meta25; + + mdata->s_meta_file_loc = + le32_to_cpu(mdm->metadataFileLoc); + mdata->s_mirror_file_loc = + le32_to_cpu(mdm->metadataMirrorFileLoc); + mdata->s_bitmap_file_loc = + le32_to_cpu(mdm->metadataBitmapFileLoc); + mdata->s_alloc_unit_size = + le32_to_cpu(mdm->allocUnitSize); + mdata->s_align_unit_size = + le16_to_cpu(mdm->alignUnitSize); + mdata->s_dup_md_flag = + mdm->flags & 0x01; + + udf_debug("Metadata Ident suffix=0x%x\n", + (le16_to_cpu( + ((__le16 *) + mdm->partIdent.identSuffix)[0]))); + udf_debug("Metadata part num=%d\n", + le16_to_cpu(mdm->partitionNum)); + udf_debug("Metadata part alloc unit size=%d\n", + le32_to_cpu(mdm->allocUnitSize)); + udf_debug("Metadata file loc=%d\n", + le32_to_cpu(mdm->metadataFileLoc)); + udf_debug("Mirror file loc=%d\n", + le32_to_cpu(mdm->metadataMirrorFileLoc)); + udf_debug("Bitmap file loc=%d\n", + le32_to_cpu(mdm->metadataBitmapFileLoc)); + udf_debug("Duplicate Flag: %d %d\n", + mdata->s_dup_md_flag, mdm->flags); + } else { + udf_debug("Unknown ident: %s\n", + upm2->partIdent.ident); + continue; + } + map->s_volumeseqnum = le16_to_cpu(upm2->volSeqNum); + map->s_partition_num = le16_to_cpu(upm2->partitionNum); + } + udf_debug("Partition (%d:%d) type %d on volume %d\n", + i, map->s_partition_num, type, + map->s_volumeseqnum); + } + + if (fileset) { + struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); + + *fileset = lelb_to_cpu(la->extLocation); + udf_debug("FileSet found in LogicalVolDesc at block=%d, " + "partition=%d\n", fileset->logicalBlockNum, + fileset->partitionReferenceNum); + } + if (lvd->integritySeqExt.extLength) + udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); + +out_bh: + brelse(bh); + return ret; +} + +/* + * udf_load_logicalvolint + * + */ +static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc) +{ + struct buffer_head *bh = NULL; + uint16_t ident; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + + while (loc.extLength > 0 && + (bh = udf_read_tagged(sb, loc.extLocation, + loc.extLocation, &ident)) && + ident == TAG_IDENT_LVID) { + sbi->s_lvid_bh = bh; + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + + if (lvid->nextIntegrityExt.extLength) + udf_load_logicalvolint(sb, + leea_to_cpu(lvid->nextIntegrityExt)); + + if (sbi->s_lvid_bh != bh) + brelse(bh); + loc.extLength -= sb->s_blocksize; + loc.extLocation++; + } + if (sbi->s_lvid_bh != bh) + brelse(bh); +} + +/* + * udf_process_sequence + * + * PURPOSE + * Process a main/reserve volume descriptor sequence. + * + * PRE-CONDITIONS + * sb Pointer to _locked_ superblock. + * block First block of first extent of the sequence. + * lastblock Lastblock of first extent of the sequence. + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. 
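+ *
+ * NOTES
+ *	The sequence is walked once; for each descriptor type the block of
+ *	the instance with the highest volDescSeqNum is remembered in vds[]
+ *	(for Partition Descriptors the first occurrence is kept). A Volume
+ *	Descriptor Pointer records the location of the next extent, which
+ *	the walk jumps to when it reaches a Terminating Descriptor; a TD
+ *	with no pending pointer ends the scan. The remembered PVD, LVD and
+ *	PDs are then re-read and processed in that order.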
+ */ +static noinline int udf_process_sequence(struct super_block *sb, long block, + long lastblock, struct kernel_lb_addr *fileset) +{ + struct buffer_head *bh = NULL; + struct udf_vds_record vds[VDS_POS_LENGTH]; + struct udf_vds_record *curr; + struct generic_desc *gd; + struct volDescPtr *vdp; + int done = 0; + uint32_t vdsn; + uint16_t ident; + long next_s = 0, next_e = 0; + + memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); + + /* + * Read the main descriptor sequence and find which descriptors + * are in it. + */ + for (; (!done && block <= lastblock); block++) { + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) { + printk(KERN_ERR "udf: Block %Lu of volume descriptor " + "sequence is corrupted or we could not read " + "it.\n", (unsigned long long)block); + return 1; + } + + /* Process each descriptor (ISO 13346 3/8.3-8.4) */ + gd = (struct generic_desc *)bh->b_data; + vdsn = le32_to_cpu(gd->volDescSeqNum); + switch (ident) { + case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ + curr = &vds[VDS_POS_PRIMARY_VOL_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_VDP: /* ISO 13346 3/10.3 */ + curr = &vds[VDS_POS_VOL_DESC_PTR]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + + vdp = (struct volDescPtr *)bh->b_data; + next_s = le32_to_cpu( + vdp->nextVolDescSeqExt.extLocation); + next_e = le32_to_cpu( + vdp->nextVolDescSeqExt.extLength); + next_e = next_e >> sb->s_blocksize_bits; + next_e += next_s; + } + break; + case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */ + curr = &vds[VDS_POS_IMP_USE_VOL_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ + curr = &vds[VDS_POS_PARTITION_DESC]; + if (!curr->block) + curr->block = block; + break; + case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */ + curr = &vds[VDS_POS_LOGICAL_VOL_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_USD: /* ISO 13346 3/10.8 */ + curr = &vds[VDS_POS_UNALLOC_SPACE_DESC]; + if (vdsn >= curr->volDescSeqNum) { + curr->volDescSeqNum = vdsn; + curr->block = block; + } + break; + case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ + vds[VDS_POS_TERMINATING_DESC].block = block; + if (next_e) { + block = next_s; + lastblock = next_e; + next_s = next_e = 0; + } else + done = 1; + break; + } + brelse(bh); + } + /* + * Now read interesting descriptors again and process them + * in a suitable order + */ + if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { + printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n"); + return 1; + } + if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) + return 1; + + if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb, + vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset)) + return 1; + + if (vds[VDS_POS_PARTITION_DESC].block) { + /* + * We rescan the whole descriptor sequence to find + * partition descriptor blocks and process them. 
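+	 * Every block from the first Partition Descriptor found up to, but
+	 * not including, the Terminating Descriptor is handed to
+	 * udf_load_partdesc(), which quietly skips blocks of any other
+	 * descriptor type.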
+ */ + for (block = vds[VDS_POS_PARTITION_DESC].block; + block < vds[VDS_POS_TERMINATING_DESC].block; + block++) + if (udf_load_partdesc(sb, block)) + return 1; + } + + return 0; +} + +static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, + struct kernel_lb_addr *fileset) +{ + struct anchorVolDescPtr *anchor; + long main_s, main_e, reserve_s, reserve_e; + + anchor = (struct anchorVolDescPtr *)bh->b_data; + + /* Locate the main sequence */ + main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); + main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); + main_e = main_e >> sb->s_blocksize_bits; + main_e += main_s; + + /* Locate the reserve sequence */ + reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation); + reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength); + reserve_e = reserve_e >> sb->s_blocksize_bits; + reserve_e += reserve_s; + + /* Process the main & reserve sequences */ + /* responsible for finding the PartitionDesc(s) */ + if (!udf_process_sequence(sb, main_s, main_e, fileset)) + return 1; + return !udf_process_sequence(sb, reserve_s, reserve_e, fileset); +} + +/* + * Check whether there is an anchor block in the given block and + * load Volume Descriptor Sequence if so. + */ +static int udf_check_anchor_block(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) +{ + struct buffer_head *bh; + uint16_t ident; + int ret; + + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && + udf_fixed_to_variable(block) >= + sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + return 0; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 0; + if (ident != TAG_IDENT_AVDP) { + brelse(bh); + return 0; + } + ret = udf_load_sequence(sb, bh, fileset); + brelse(bh); + return ret; +} + +/* Search for an anchor volume descriptor pointer */ +static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, + struct kernel_lb_addr *fileset) +{ + sector_t last[6]; + int i; + struct udf_sb_info *sbi = UDF_SB(sb); + int last_count = 0; + + /* First try user provided anchor */ + if (sbi->s_anchor) { + if (udf_check_anchor_block(sb, sbi->s_anchor, fileset)) + return lastblock; + } + /* + * according to spec, anchor is in either: + * block 256 + * lastblock-256 + * lastblock + * however, if the disc isn't closed, it could be 512. + */ + if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset)) + return lastblock; + /* + * The trouble is which block is the last one. Drives often misreport + * this so we try various possibilities. + */ + last[last_count++] = lastblock; + if (lastblock >= 1) + last[last_count++] = lastblock - 1; + last[last_count++] = lastblock + 1; + if (lastblock >= 2) + last[last_count++] = lastblock - 2; + if (lastblock >= 150) + last[last_count++] = lastblock - 150; + if (lastblock >= 152) + last[last_count++] = lastblock - 152; + + for (i = 0; i < last_count; i++) { + if (last[i] >= sb->s_bdev->bd_inode->i_size >> + sb->s_blocksize_bits) + continue; + if (udf_check_anchor_block(sb, last[i], fileset)) + return last[i]; + if (last[i] < 256) + continue; + if (udf_check_anchor_block(sb, last[i] - 256, fileset)) + return last[i]; + } + + /* Finally try block 512 in case media is open */ + if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset)) + return last[0]; + return 0; +} + +/* + * Find an anchor volume descriptor and load Volume Descriptor Sequence from + * area specified by it. The function expects sbi->s_lastblock to be the last + * block on the media. 
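+ * udf_scan_anchors() first tries the user-supplied anchor, then block
+ * s_session + 256, then a handful of guesses around the reported last
+ * block (for lastblock == 1000 these are 1000, 999, 1001, 998, 850 and
+ * 848, each probed both as-is and, when large enough, at guess - 256),
+ * and finally s_session + 512 in case the media was left open.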
+ * + * Return 1 if ok, 0 if not found. + * + */ +static int udf_find_anchor(struct super_block *sb, + struct kernel_lb_addr *fileset) +{ + sector_t lastblock; + struct udf_sb_info *sbi = UDF_SB(sb); + + lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); + if (lastblock) + goto out; + + /* No anchor found? Try VARCONV conversion of block numbers */ + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + /* Firstly, we try to not convert number of the last block */ + lastblock = udf_scan_anchors(sb, + udf_variable_to_fixed(sbi->s_last_block), + fileset); + if (lastblock) + goto out; + + /* Secondly, we try with converted number of the last block */ + lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); + if (!lastblock) { + /* VARCONV didn't help. Clear it. */ + UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); + return 0; + } +out: + sbi->s_last_block = lastblock; + return 1; +} + +/* + * Check Volume Structure Descriptor, find Anchor block and load Volume + * Descriptor Sequence + */ +static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, + int silent, struct kernel_lb_addr *fileset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + loff_t nsr_off; + + if (!sb_set_blocksize(sb, uopt->blocksize)) { + if (!silent) + printk(KERN_WARNING "UDF-fs: Bad block size\n"); + return 0; + } + sbi->s_last_block = uopt->lastblock; + if (!uopt->novrs) { + /* Check that it is NSR02 compliant */ + nsr_off = udf_check_vsd(sb); + if (!nsr_off) { + if (!silent) + printk(KERN_WARNING "UDF-fs: No VRS found\n"); + return 0; + } + if (nsr_off == -1) + udf_debug("Failed to read byte 32768. Assuming open " + "disc. Skipping validity check\n"); + if (!sbi->s_last_block) + sbi->s_last_block = udf_get_last_block(sb); + } else { + udf_debug("Validity check skipped because of novrs option\n"); + } + + /* Look for anchor block and load Volume Descriptor Sequence */ + sbi->s_anchor = uopt->anchor; + if (!udf_find_anchor(sb, fileset)) { + if (!silent) + printk(KERN_WARNING "UDF-fs: No anchor found\n"); + return 0; + } + return 1; +} + +static void udf_open_lvid(struct super_block *sb) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh = sbi->s_lvid_bh; + struct logicalVolIntegrityDesc *lvid; + struct logicalVolIntegrityDescImpUse *lvidiu; + + if (!bh) + return; + + mutex_lock(&sbi->s_alloc_mutex); + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvidiu = udf_sb_lvidiu(sbi); + + lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, + CURRENT_TIME); + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); + + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(struct tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); +} + +static void udf_close_lvid(struct super_block *sb) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct buffer_head *bh = sbi->s_lvid_bh; + struct logicalVolIntegrityDesc *lvid; + struct logicalVolIntegrityDescImpUse *lvidiu; + + if (!bh) + return; + + mutex_lock(&sbi->s_alloc_mutex); + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvidiu = udf_sb_lvidiu(sbi); + lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME); + if (UDF_MAX_WRITE_VERSION > 
le16_to_cpu(lvidiu->maxUDFWriteRev)) + lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION); + if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) + lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev); + if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev)) + lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev); + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); + + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(struct tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + /* + * We set buffer uptodate unconditionally here to avoid spurious + * warnings from mark_buffer_dirty() when previous EIO has marked + * the buffer as !uptodate + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); +} + +u64 lvid_get_unique_id(struct super_block *sb) +{ + struct buffer_head *bh; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + struct logicalVolHeaderDesc *lvhd; + u64 uniqueID; + u64 ret; + + bh = sbi->s_lvid_bh; + if (!bh) + return 0; + + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse; + + mutex_lock(&sbi->s_alloc_mutex); + ret = uniqueID = le64_to_cpu(lvhd->uniqueID); + if (!(++uniqueID & 0xFFFFFFFF)) + uniqueID += 16; + lvhd->uniqueID = cpu_to_le64(uniqueID); + mutex_unlock(&sbi->s_alloc_mutex); + mark_buffer_dirty(bh); + + return ret; +} + +static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) +{ + int i; + int nr_groups = bitmap->s_nr_groups; + int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * + nr_groups); + + for (i = 0; i < nr_groups; i++) + if (bitmap->s_block_bitmap[i]) + brelse(bitmap->s_block_bitmap[i]); + + if (size <= PAGE_SIZE) + kfree(bitmap); + else + vfree(bitmap); +} + +static void udf_free_partition(struct udf_part_map *map) +{ + int i; + struct udf_meta_data *mdata; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + iput(map->s_uspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + iput(map->s_fspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + udf_sb_free_bitmap(map->s_uspace.s_bitmap); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + udf_sb_free_bitmap(map->s_fspace.s_bitmap); + if (map->s_partition_type == UDF_SPARABLE_MAP15) + for (i = 0; i < 4; i++) + brelse(map->s_type_specific.s_sparing.s_spar_map[i]); + else if (map->s_partition_type == UDF_METADATA_MAP25) { + mdata = &map->s_type_specific.s_metadata; + iput(mdata->s_metadata_fe); + mdata->s_metadata_fe = NULL; + + iput(mdata->s_mirror_fe); + mdata->s_mirror_fe = NULL; + + iput(mdata->s_bitmap_fe); + mdata->s_bitmap_fe = NULL; + } +} + +static int udf_fill_super(struct super_block *sb, void *options, int silent) +{ + int i; + int ret; + struct inode *inode = NULL; + struct udf_options uopt; + struct kernel_lb_addr rootdir, fileset; + struct udf_sb_info *sbi; + + uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); + uopt.uid = -1; + uopt.gid = -1; + uopt.umask = 0; + uopt.fmode = UDF_INVALID_MODE; + uopt.dmode = UDF_INVALID_MODE; + + sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + mutex_init(&sbi->s_alloc_mutex); + + if (!udf_parse_options((char *)options, &uopt, false)) + goto error_out; + + if (uopt.flags & (1 << UDF_FLAG_UTF8) && + uopt.flags & (1 << 
UDF_FLAG_NLS_MAP)) { + udf_error(sb, "udf_read_super", + "utf8 cannot be combined with iocharset\n"); + goto error_out; + } +#ifdef CONFIG_UDF_NLS + if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { + uopt.nls_map = load_nls_default(); + if (!uopt.nls_map) + uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP); + else + udf_debug("Using default NLS map\n"); + } +#endif + if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP))) + uopt.flags |= (1 << UDF_FLAG_UTF8); + + fileset.logicalBlockNum = 0xFFFFFFFF; + fileset.partitionReferenceNum = 0xFFFF; + + sbi->s_flags = uopt.flags; + sbi->s_uid = uopt.uid; + sbi->s_gid = uopt.gid; + sbi->s_umask = uopt.umask; + sbi->s_fmode = uopt.fmode; + sbi->s_dmode = uopt.dmode; + sbi->s_nls_map = uopt.nls_map; + rwlock_init(&sbi->s_cred_lock); + + if (uopt.session == 0xFFFFFFFF) + sbi->s_session = udf_get_last_session(sb); + else + sbi->s_session = uopt.session; + + udf_debug("Multi-session=%d\n", sbi->s_session); + + /* Fill in the rest of the superblock */ + sb->s_op = &udf_sb_ops; + sb->s_export_op = &udf_export_ops; + + sb->s_dirt = 0; + sb->s_magic = UDF_SUPER_MAGIC; + sb->s_time_gran = 1000; + + if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } else { + uopt.blocksize = bdev_logical_block_size(sb->s_bdev); + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { + if (!silent) + printk(KERN_NOTICE + "UDF-fs: Rescanning with blocksize " + "%d\n", UDF_DEFAULT_BLOCKSIZE); + uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; + ret = udf_load_vrs(sb, &uopt, silent, &fileset); + } + } + if (!ret) { + printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); + goto error_out; + } + + udf_debug("Lastblock=%d\n", sbi->s_last_block); + + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDescImpUse *lvidiu = + udf_sb_lvidiu(sbi); + uint16_t minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev); + uint16_t minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev); + /* uint16_t maxUDFWriteRev = + le16_to_cpu(lvidiu->maxUDFWriteRev); */ + + if (minUDFReadRev > UDF_MAX_READ_VERSION) { + printk(KERN_ERR "UDF-fs: minUDFReadRev=%x " + "(max is %x)\n", + le16_to_cpu(lvidiu->minUDFReadRev), + UDF_MAX_READ_VERSION); + goto error_out; + } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) + sb->s_flags |= MS_RDONLY; + + sbi->s_udfrev = minUDFWriteRev; + + if (minUDFReadRev >= UDF_VERS_USE_EXTENDED_FE) + UDF_SET_FLAG(sb, UDF_FLAG_USE_EXTENDED_FE); + if (minUDFReadRev >= UDF_VERS_USE_STREAMS) + UDF_SET_FLAG(sb, UDF_FLAG_USE_STREAMS); + } + + if (!sbi->s_partitions) { + printk(KERN_WARNING "UDF-fs: No partition found (2)\n"); + goto error_out; + } + + if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & + UDF_PART_FLAG_READ_ONLY) { + printk(KERN_NOTICE "UDF-fs: Partition marked readonly; " + "forcing readonly mount\n"); + sb->s_flags |= MS_RDONLY; + } + + if (udf_find_fileset(sb, &fileset, &rootdir)) { + printk(KERN_WARNING "UDF-fs: No fileset found\n"); + goto error_out; + } + + if (!silent) { + struct timestamp ts; + udf_time_to_disk_stamp(&ts, sbi->s_record_time); + udf_info("UDF: Mounting volume '%s', " + "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", + sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, + ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); + } + if (!(sb->s_flags & MS_RDONLY)) + udf_open_lvid(sb); + + /* Assign the root inode */ + /* assign inodes by physical block number */ + /* perhaps it's not extensible enough, but for now ... 
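+	   the root is simply looked up with udf_iget() using the (block,
+	   partition) address that udf_find_fileset() took from the File
+	   Set Descriptor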
*/ + inode = udf_iget(sb, &rootdir); + if (!inode) { + printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " + "partition=%d\n", + rootdir.logicalBlockNum, rootdir.partitionReferenceNum); + goto error_out; + } + + /* Allocate a dentry for the root inode */ + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n"); + iput(inode); + goto error_out; + } + sb->s_maxbytes = MAX_LFS_FILESIZE; + return 0; + +error_out: + if (sbi->s_vat_inode) + iput(sbi->s_vat_inode); + if (sbi->s_partitions) + for (i = 0; i < sbi->s_partitions; i++) + udf_free_partition(&sbi->s_partmaps[i]); +#ifdef CONFIG_UDF_NLS + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + unload_nls(sbi->s_nls_map); +#endif + if (!(sb->s_flags & MS_RDONLY)) + udf_close_lvid(sb); + brelse(sbi->s_lvid_bh); + + kfree(sbi->s_partmaps); + kfree(sbi); + sb->s_fs_info = NULL; + + return -EINVAL; +} + +static void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + if (!(sb->s_flags & MS_RDONLY)) { + /* mark sb error */ + sb->s_dirt = 1; + } + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n", + sb->s_id, function, error_buf); +} + +void udf_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n", + sb->s_id, function, error_buf); +} + +static void udf_put_super(struct super_block *sb) +{ + int i; + struct udf_sb_info *sbi; + + sbi = UDF_SB(sb); + + if (sbi->s_vat_inode) + iput(sbi->s_vat_inode); + if (sbi->s_partitions) + for (i = 0; i < sbi->s_partitions; i++) + udf_free_partition(&sbi->s_partmaps[i]); +#ifdef CONFIG_UDF_NLS + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + unload_nls(sbi->s_nls_map); +#endif + if (!(sb->s_flags & MS_RDONLY)) + udf_close_lvid(sb); + brelse(sbi->s_lvid_bh); + kfree(sbi->s_partmaps); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +static int udf_sync_fs(struct super_block *sb, int wait) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + + mutex_lock(&sbi->s_alloc_mutex); + if (sbi->s_lvid_dirty) { + /* + * Blockdevice will be synced later so we don't have to submit + * the buffer for IO + */ + mark_buffer_dirty(sbi->s_lvid_bh); + sb->s_dirt = 0; + sbi->s_lvid_dirty = 0; + } + mutex_unlock(&sbi->s_alloc_mutex); + + return 0; +} + +static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDescImpUse *lvidiu; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + if (sbi->s_lvid_bh != NULL) + lvidiu = udf_sb_lvidiu(sbi); + else + lvidiu = NULL; + + buf->f_type = UDF_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len; + buf->f_bfree = udf_count_free(sb); + buf->f_bavail = buf->f_bfree; + buf->f_files = (lvidiu != NULL ? 
(le32_to_cpu(lvidiu->numFiles) + + le32_to_cpu(lvidiu->numDirs)) : 0) + + buf->f_bfree; + buf->f_ffree = buf->f_bfree; + buf->f_namelen = UDF_NAME_LEN - 2; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static unsigned int udf_count_free_bitmap(struct super_block *sb, + struct udf_bitmap *bitmap) +{ + struct buffer_head *bh = NULL; + unsigned int accum = 0; + int index; + int block = 0, newblock; + struct kernel_lb_addr loc; + uint32_t bytes; + uint8_t *ptr; + uint16_t ident; + struct spaceBitmapDesc *bm; + + loc.logicalBlockNum = bitmap->s_extPosition; + loc.partitionReferenceNum = UDF_SB(sb)->s_partition; + bh = udf_read_ptagged(sb, &loc, 0, &ident); + + if (!bh) { + printk(KERN_ERR "udf: udf_count_free failed\n"); + goto out; + } else if (ident != TAG_IDENT_SBD) { + brelse(bh); + printk(KERN_ERR "udf: udf_count_free failed\n"); + goto out; + } + + bm = (struct spaceBitmapDesc *)bh->b_data; + bytes = le32_to_cpu(bm->numOfBytes); + index = sizeof(struct spaceBitmapDesc); /* offset in first block only */ + ptr = (uint8_t *)bh->b_data; + + while (bytes > 0) { + u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index); + accum += bitmap_weight((const unsigned long *)(ptr + index), + cur_bytes * 8); + bytes -= cur_bytes; + if (bytes) { + brelse(bh); + newblock = udf_get_lb_pblock(sb, &loc, ++block); + bh = udf_tread(sb, newblock); + if (!bh) { + udf_debug("read failed\n"); + goto out; + } + index = 0; + ptr = (uint8_t *)bh->b_data; + } + } + brelse(bh); +out: + return accum; +} + +static unsigned int udf_count_free_table(struct super_block *sb, + struct inode *table) +{ + unsigned int accum = 0; + uint32_t elen; + struct kernel_lb_addr eloc; + int8_t etype; + struct extent_position epos; + + mutex_lock(&UDF_SB(sb)->s_alloc_mutex); + epos.block = UDF_I(table)->i_location; + epos.offset = sizeof(struct unallocSpaceEntry); + epos.bh = NULL; + + while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) + accum += (elen >> table->i_sb->s_blocksize_bits); + + brelse(epos.bh); + mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); + + return accum; +} + +static unsigned int udf_count_free(struct super_block *sb) +{ + unsigned int accum = 0; + struct udf_sb_info *sbi; + struct udf_part_map *map; + + sbi = UDF_SB(sb); + if (sbi->s_lvid_bh) { + struct logicalVolIntegrityDesc *lvid = + (struct logicalVolIntegrityDesc *) + sbi->s_lvid_bh->b_data; + if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) { + accum = le32_to_cpu( + lvid->freeSpaceTable[sbi->s_partition]); + if (accum == 0xFFFFFFFF) + accum = 0; + } + } + + if (accum) + return accum; + + map = &sbi->s_partmaps[sbi->s_partition]; + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { + accum += udf_count_free_bitmap(sb, + map->s_uspace.s_bitmap); + } + if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { + accum += udf_count_free_bitmap(sb, + map->s_fspace.s_bitmap); + } + if (accum) + return accum; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { + accum += udf_count_free_table(sb, + map->s_uspace.s_table); + } + if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { + accum += udf_count_free_table(sb, + map->s_fspace.s_table); + } + + return accum; +} diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c new file mode 100644 index 00000000..b1d4488b --- /dev/null +++ b/fs/udf/symlink.c @@ -0,0 +1,119 @@ +/* + * symlink.c + * + * PURPOSE + * Symlink handling routines for the OSTA-UDF(tm) filesystem. 
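+ *	Only the read side lives here: udf_symlink_filler() fetches the
+ *	pathComponent sequence (stored in-ICB or in the first data block)
+ *	and udf_pc_to_char() flattens it into an ordinary path string
+ *	(component type 3 becomes "../", type 4 "./", type 5 the decoded
+ *	name, and a zero-length type 1 component resets the result to "/").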
+ * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998-2001 Ben Fennema + * (C) 1999 Stelias Computing Inc + * + * HISTORY + * + * 04/16/99 blf Created. + * + */ + +#include "udfdecl.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "udf_i.h" + +static void udf_pc_to_char(struct super_block *sb, unsigned char *from, + int fromlen, unsigned char *to) +{ + struct pathComponent *pc; + int elen = 0; + unsigned char *p = to; + + while (elen < fromlen) { + pc = (struct pathComponent *)(from + elen); + switch (pc->componentType) { + case 1: + if (pc->lengthComponentIdent == 0) { + p = to; + *p++ = '/'; + } + break; + case 3: + memcpy(p, "../", 3); + p += 3; + break; + case 4: + memcpy(p, "./", 2); + p += 2; + /* that would be . - just ignore */ + break; + case 5: + p += udf_get_filename(sb, pc->componentIdent, p, + pc->lengthComponentIdent); + *p++ = '/'; + break; + } + elen += sizeof(struct pathComponent) + pc->lengthComponentIdent; + } + if (p > to + 1) + p[-1] = '\0'; + else + p[0] = '\0'; +} + +static int udf_symlink_filler(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *bh = NULL; + unsigned char *symlink; + int err = -EIO; + unsigned char *p = kmap(page); + struct udf_inode_info *iinfo; + uint32_t pos; + + iinfo = UDF_I(inode); + pos = udf_block_map(inode, 0); + + down_read(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + } else { + bh = sb_bread(inode->i_sb, pos); + + if (!bh) + goto out; + + symlink = bh->b_data; + } + + udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); + brelse(bh); + + up_read(&iinfo->i_data_sem); + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; + +out: + up_read(&iinfo->i_data_sem); + SetPageError(page); + kunmap(page); + unlock_page(page); + return err; +} + +/* + * symlinks can't do much... + */ +const struct address_space_operations udf_symlink_aops = { + .readpage = udf_symlink_filler, +}; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c new file mode 100644 index 00000000..8424308d --- /dev/null +++ b/fs/udf/truncate.c @@ -0,0 +1,289 @@ +/* + * truncate.c + * + * PURPOSE + * Truncate handling routines for the OSTA-UDF(tm) filesystem. + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1999-2004 Ben Fennema + * (C) 1999 Stelias Computing Inc + * + * HISTORY + * + * 02/24/99 blf Created. 
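+ *
+ *	udf_truncate_tail_extent() trims the last extent so that it matches
+ *	i_size, udf_discard_prealloc() drops a trailing allocated-but-not-
+ *	recorded preallocation, and udf_truncate_extents() frees everything
+ *	beyond i_size, walking and releasing indirect allocation extents as
+ *	it goes.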
+ * + */ + +#include "udfdecl.h" +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +static void extent_trunc(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen, + uint32_t nelen) +{ + struct kernel_lb_addr neloc = {}; + int last_block = (elen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + + if (nelen) { + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + udf_free_blocks(inode->i_sb, inode, eloc, 0, + last_block); + etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); + } else + neloc = *eloc; + nelen = (etype << 30) | nelen; + } + + if (elen != nelen) { + udf_write_aext(inode, epos, &neloc, nelen, 0); + if (last_block - first_block > 0) { + if (etype == (EXT_RECORDED_ALLOCATED >> 30)) + mark_inode_dirty(inode); + + if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + udf_free_blocks(inode->i_sb, inode, eloc, + first_block, + last_block - first_block); + } + } +} + +/* + * Truncate the last extent to match i_size. This function assumes + * that preallocation extent is already truncated. + */ +void udf_truncate_tail_extent(struct inode *inode) +{ + struct extent_position epos = {}; + struct kernel_lb_addr eloc; + uint32_t elen, nelen; + uint64_t lbcount = 0; + int8_t etype = -1, netype; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || + inode->i_size == iinfo->i_lenExtents) + return; + /* Are we going to delete the file anyway? */ + if (inode->i_nlink == 0) + return; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + /* Find the last extent in the file */ + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { + etype = netype; + lbcount += elen; + if (lbcount > inode->i_size) { + if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) + printk(KERN_WARNING + "udf_truncate_tail_extent(): Too long " + "extent after EOF in inode %u: i_size: " + "%Ld lbcount: %Ld extent %u+%u\n", + (unsigned)inode->i_ino, + (long long)inode->i_size, + (long long)lbcount, + (unsigned)eloc.logicalBlockNum, + (unsigned)elen); + nelen = elen - (lbcount - inode->i_size); + epos.offset -= adsize; + extent_trunc(inode, &epos, &eloc, etype, elen, nelen); + epos.offset += adsize; + if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) + printk(KERN_ERR "udf_truncate_tail_extent(): " + "Extent after EOF in inode %u.\n", + (unsigned)inode->i_ino); + break; + } + } + /* This inode entry is in-memory only and thus we don't have to mark + * the inode dirty */ + iinfo->i_lenExtents = inode->i_size; + brelse(epos.bh); +} + +void udf_discard_prealloc(struct inode *inode) +{ + struct extent_position epos = { NULL, 0, {0, 0} }; + struct kernel_lb_addr eloc; + uint32_t elen; + uint64_t lbcount = 0; + int8_t etype = -1, netype; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || + inode->i_size == iinfo->i_lenExtents) + return; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + adsize = 0; + + epos.block = iinfo->i_location; + + /* Find the last extent in the file */ + while ((netype = 
udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { + etype = netype; + lbcount += elen; + } + if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + epos.offset -= adsize; + lbcount -= elen; + extent_trunc(inode, &epos, &eloc, etype, elen, 0); + if (!epos.bh) { + iinfo->i_lenAlloc = + epos.offset - + udf_file_entry_alloc_offset(inode); + mark_inode_dirty(inode); + } else { + struct allocExtDesc *aed = + (struct allocExtDesc *)(epos.bh->b_data); + aed->lengthAllocDescs = + cpu_to_le32(epos.offset - + sizeof(struct allocExtDesc)); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || + UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) + udf_update_tag(epos.bh->b_data, epos.offset); + else + udf_update_tag(epos.bh->b_data, + sizeof(struct allocExtDesc)); + mark_buffer_dirty_inode(epos.bh, inode); + } + } + /* This inode entry is in-memory only and thus we don't have to mark + * the inode dirty */ + iinfo->i_lenExtents = lbcount; + brelse(epos.bh); +} + +static void udf_update_alloc_ext_desc(struct inode *inode, + struct extent_position *epos, + u32 lenalloc) +{ + struct super_block *sb = inode->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data); + int len = sizeof(struct allocExtDesc); + + aed->lengthAllocDescs = cpu_to_le32(lenalloc); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201) + len += lenalloc; + + udf_update_tag(epos->bh->b_data, len); + mark_buffer_dirty_inode(epos->bh, inode); +} + +/* + * Truncate extents of inode to inode->i_size. This function can be used only + * for making file shorter. For making file longer, udf_extend_file() has to + * be used. + */ +void udf_truncate_extents(struct inode *inode) +{ + struct extent_position epos; + struct kernel_lb_addr eloc, neloc = {}; + uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; + int8_t etype; + struct super_block *sb = inode->i_sb; + sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset; + loff_t byte_offset; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); + byte_offset = (offset << sb->s_blocksize_bits) + + (inode->i_size & (sb->s_blocksize - 1)); + if (etype == -1) { + /* We should extend the file? 
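+		   i_size already lies beyond the last mapped extent, so the
+		   caller really wanted to grow the file; that is
+		   udf_extend_file()'s job (see the comment above), hence only
+		   warn about a stray partial-block tail and return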
*/ + WARN_ON(byte_offset); + return; + } + epos.offset -= adsize; + extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); + epos.offset += adsize; + if (byte_offset) + lenalloc = epos.offset; + else + lenalloc = epos.offset - adsize; + + if (!epos.bh) + lenalloc -= udf_file_entry_alloc_offset(inode); + else + lenalloc -= sizeof(struct allocExtDesc); + + while ((etype = udf_current_aext(inode, &epos, &eloc, + &elen, 0)) != -1) { + if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + udf_write_aext(inode, &epos, &neloc, nelen, 0); + if (indirect_ext_len) { + /* We managed to free all extents in the + * indirect extent - free it too */ + BUG_ON(!epos.bh); + udf_free_blocks(sb, inode, &epos.block, + 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, + &epos, lenalloc); + brelse(epos.bh); + epos.offset = sizeof(struct allocExtDesc); + epos.block = eloc; + epos.bh = udf_tread(sb, + udf_get_lb_pblock(sb, &eloc, 0)); + if (elen) + indirect_ext_len = + (elen + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + else + indirect_ext_len = 1; + } else { + extent_trunc(inode, &epos, &eloc, etype, elen, 0); + epos.offset += adsize; + } + } + + if (indirect_ext_len) { + BUG_ON(!epos.bh); + udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, &epos, lenalloc); + iinfo->i_lenExtents = inode->i_size; + + brelse(epos.bh); +} diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h new file mode 100644 index 00000000..d1bd31ea --- /dev/null +++ b/fs/udf/udf_i.h @@ -0,0 +1,45 @@ +#ifndef _UDF_I_H +#define _UDF_I_H + +/* + * The i_data_sem and i_mutex serve for protection of allocation information + * of a regular files and symlinks. This includes all extents belonging to + * the file/symlink, a fact whether data are in-inode or in external data + * blocks, preallocation, goal block information... When extents are read, + * i_mutex or i_data_sem must be held (for reading is enough in case of + * i_data_sem). When extents are changed, i_data_sem must be held for writing + * and also i_mutex must be held. + * + * For directories i_mutex is used for all the necessary protection. + */ + +struct udf_inode_info { + struct timespec i_crtime; + /* Physical address of inode */ + struct kernel_lb_addr i_location; + __u64 i_unique; + __u32 i_lenEAttr; + __u32 i_lenAlloc; + __u64 i_lenExtents; + __u32 i_next_alloc_block; + __u32 i_next_alloc_goal; + unsigned i_alloc_type : 3; + unsigned i_efe : 1; /* extendedFileEntry */ + unsigned i_use : 1; /* unallocSpaceEntry */ + unsigned i_strat4096 : 1; + unsigned reserved : 26; + union { + struct short_ad *i_sad; + struct long_ad *i_lad; + __u8 *i_data; + } i_ext; + struct rw_semaphore i_data_sem; + struct inode vfs_inode; +}; + +static inline struct udf_inode_info *UDF_I(struct inode *inode) +{ + return list_entry(inode, struct udf_inode_info, vfs_inode); +} + +#endif /* _UDF_I_H) */ diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h new file mode 100644 index 00000000..4858c191 --- /dev/null +++ b/fs/udf/udf_sb.h @@ -0,0 +1,182 @@ +#ifndef __LINUX_UDF_SB_H +#define __LINUX_UDF_SB_H + +#include +#include + +/* Since UDF 2.01 is ISO 13346 based... 
*/ +#define UDF_SUPER_MAGIC 0x15013346 + +#define UDF_MAX_READ_VERSION 0x0250 +#define UDF_MAX_WRITE_VERSION 0x0201 + +#define UDF_FLAG_USE_EXTENDED_FE 0 +#define UDF_VERS_USE_EXTENDED_FE 0x0200 +#define UDF_FLAG_USE_STREAMS 1 +#define UDF_VERS_USE_STREAMS 0x0200 +#define UDF_FLAG_USE_SHORT_AD 2 +#define UDF_FLAG_USE_AD_IN_ICB 3 +#define UDF_FLAG_USE_FILE_CTIME_EA 4 +#define UDF_FLAG_STRICT 5 +#define UDF_FLAG_UNDELETE 6 +#define UDF_FLAG_UNHIDE 7 +#define UDF_FLAG_VARCONV 8 +#define UDF_FLAG_NLS_MAP 9 +#define UDF_FLAG_UTF8 10 +#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ +#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */ +#define UDF_FLAG_GID_FORGET 13 +#define UDF_FLAG_GID_IGNORE 14 +#define UDF_FLAG_UID_SET 15 +#define UDF_FLAG_GID_SET 16 +#define UDF_FLAG_SESSION_SET 17 +#define UDF_FLAG_LASTBLOCK_SET 18 +#define UDF_FLAG_BLOCKSIZE_SET 19 + +#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 +#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 +#define UDF_PART_FLAG_FREED_BITMAP 0x0004 +#define UDF_PART_FLAG_FREED_TABLE 0x0008 +#define UDF_PART_FLAG_READ_ONLY 0x0010 +#define UDF_PART_FLAG_WRITE_ONCE 0x0020 +#define UDF_PART_FLAG_REWRITABLE 0x0040 +#define UDF_PART_FLAG_OVERWRITABLE 0x0080 + +#define UDF_MAX_BLOCK_LOADED 8 + +#define UDF_TYPE1_MAP15 0x1511U +#define UDF_VIRTUAL_MAP15 0x1512U +#define UDF_VIRTUAL_MAP20 0x2012U +#define UDF_SPARABLE_MAP15 0x1522U +#define UDF_METADATA_MAP25 0x2511U + +#define UDF_INVALID_MODE ((mode_t)-1) + +#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ + +struct udf_meta_data { + __u32 s_meta_file_loc; + __u32 s_mirror_file_loc; + __u32 s_bitmap_file_loc; + __u32 s_alloc_unit_size; + __u16 s_align_unit_size; + __u8 s_dup_md_flag; + struct inode *s_metadata_fe; + struct inode *s_mirror_fe; + struct inode *s_bitmap_fe; +}; + +struct udf_sparing_data { + __u16 s_packet_len; + struct buffer_head *s_spar_map[4]; +}; + +struct udf_virtual_data { + __u32 s_num_entries; + __u16 s_start_offset; +}; + +struct udf_bitmap { + __u32 s_extLength; + __u32 s_extPosition; + __u16 s_nr_groups; + struct buffer_head **s_block_bitmap; +}; + +struct udf_part_map { + union { + struct udf_bitmap *s_bitmap; + struct inode *s_table; + } s_uspace; + union { + struct udf_bitmap *s_bitmap; + struct inode *s_table; + } s_fspace; + __u32 s_partition_root; + __u32 s_partition_len; + __u16 s_partition_type; + __u16 s_partition_num; + union { + struct udf_sparing_data s_sparing; + struct udf_virtual_data s_virtual; + struct udf_meta_data s_metadata; + } s_type_specific; + __u32 (*s_partition_func)(struct super_block *, __u32, __u16, __u32); + __u16 s_volumeseqnum; + __u16 s_partition_flags; +}; + +#pragma pack() + +struct udf_sb_info { + struct udf_part_map *s_partmaps; + __u8 s_volume_ident[32]; + + /* Overall info */ + __u16 s_partitions; + __u16 s_partition; + + /* Sector headers */ + __s32 s_session; + __u32 s_anchor; + __u32 s_last_block; + + struct buffer_head *s_lvid_bh; + + /* Default permissions */ + mode_t s_umask; + gid_t s_gid; + uid_t s_uid; + mode_t s_fmode; + mode_t s_dmode; + /* Lock protecting consistency of above permission settings */ + rwlock_t s_cred_lock; + + /* Root Info */ + struct timespec s_record_time; + + /* Fileset Info */ + __u16 s_serial_number; + + /* highest UDF revision we have recorded to this media */ + __u16 s_udfrev; + + /* Miscellaneous flags */ + unsigned long s_flags; + + /* Encoding info */ + struct nls_table *s_nls_map; + + /* VAT inode */ + struct inode *s_vat_inode; + + struct mutex 
s_alloc_mutex; + /* Protected by s_alloc_mutex */ + unsigned int s_lvid_dirty; +}; + +static inline struct udf_sb_info *UDF_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi); + +int udf_compute_nr_groups(struct super_block *sb, u32 partition); + +static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag) +{ + return test_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_SET_FLAG(struct super_block *sb, int flag) +{ + set_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag) +{ + clear_bit(flag, &UDF_SB(sb)->s_flags); +} + +#endif /* __LINUX_UDF_SB_H */ diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h new file mode 100644 index 00000000..dbd52d4b --- /dev/null +++ b/fs/udf/udfdecl.h @@ -0,0 +1,241 @@ +#ifndef __UDF_DECL_H +#define __UDF_DECL_H + +#include "ecma_167.h" +#include "osta_udf.h" + +#include +#include +#include +#include + +#include "udf_sb.h" +#include "udfend.h" +#include "udf_i.h" + +#define UDF_PREALLOCATE +#define UDF_DEFAULT_PREALLOC_BLOCKS 8 + +#undef UDFFS_DEBUG + +#ifdef UDFFS_DEBUG +#define udf_debug(f, a...) \ +do { \ + printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \ + __FILE__, __LINE__, __func__); \ + printk(f, ##a); \ +} while (0) +#else +#define udf_debug(f, a...) /**/ +#endif + +#define udf_info(f, a...) \ + printk(KERN_INFO "UDF-fs INFO " f, ##a); + + +#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) +#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) + +#define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF +#define UDF_EXTENT_FLAG_MASK 0xC0000000 + +#define UDF_NAME_PAD 4 +#define UDF_NAME_LEN 256 +#define UDF_PATH_LEN 1023 + +static inline size_t udf_file_entry_alloc_offset(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + if (iinfo->i_use) + return sizeof(struct unallocSpaceEntry); + else if (iinfo->i_efe) + return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr; + else + return sizeof(struct fileEntry) + iinfo->i_lenEAttr; +} + +static inline size_t udf_ext0_offset(struct inode *inode) +{ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return udf_file_entry_alloc_offset(inode); + else + return 0; +} + +/* computes tag checksum */ +u8 udf_tag_checksum(const struct tag *t); + +struct dentry; +struct inode; +struct task_struct; +struct buffer_head; +struct super_block; + +extern const struct export_operations udf_export_ops; +extern const struct inode_operations udf_dir_inode_operations; +extern const struct file_operations udf_dir_operations; +extern const struct inode_operations udf_file_inode_operations; +extern const struct file_operations udf_file_operations; +extern const struct inode_operations udf_symlink_inode_operations; +extern const struct address_space_operations udf_aops; +extern const struct address_space_operations udf_adinicb_aops; +extern const struct address_space_operations udf_symlink_aops; + +struct udf_fileident_bh { + struct buffer_head *sbh; + struct buffer_head *ebh; + int soffset; + int eoffset; +}; + +struct udf_vds_record { + uint32_t block; + uint32_t volDescSeqNum; +}; + +struct generic_desc { + struct tag descTag; + __le32 volDescSeqNum; +}; + +struct ustr { + uint8_t u_cmpID; + uint8_t u_name[UDF_NAME_LEN - 2]; + uint8_t u_len; +}; + +struct extent_position { + struct buffer_head *bh; + uint32_t offset; + struct kernel_lb_addr block; +}; + +/* super.c */ + +__attribute__((format(printf, 3, 
4))) +extern void udf_warning(struct super_block *, const char *, const char *, ...); +static inline void udf_updated_lvid(struct super_block *sb) +{ + struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; + + BUG_ON(!bh); + WARN_ON_ONCE(((struct logicalVolIntegrityDesc *) + bh->b_data)->integrityType != + cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN)); + sb->s_dirt = 1; + UDF_SB(sb)->s_lvid_dirty = 1; +} +extern u64 lvid_get_unique_id(struct super_block *sb); + +/* namei.c */ +extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, + struct fileIdentDesc *, struct udf_fileident_bh *, + uint8_t *, uint8_t *); + +/* file.c */ +extern long udf_ioctl(struct file *, unsigned int, unsigned long); +/* inode.c */ +extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); +extern int udf_expand_file_adinicb(struct inode *); +extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); +extern struct buffer_head *udf_bread(struct inode *, int, int, int *); +extern int udf_setsize(struct inode *, loff_t); +extern void udf_read_inode(struct inode *); +extern void udf_evict_inode(struct inode *); +extern int udf_write_inode(struct inode *, struct writeback_control *wbc); +extern long udf_block_map(struct inode *, sector_t); +extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, + struct kernel_lb_addr *, uint32_t *, sector_t *); +extern int udf_add_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t, int); +extern void udf_write_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t, int); +extern int8_t udf_delete_aext(struct inode *, struct extent_position, + struct kernel_lb_addr, uint32_t); +extern int8_t udf_next_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t *, int); +extern int8_t udf_current_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t *, int); + +/* misc.c */ +extern struct buffer_head *udf_tgetblk(struct super_block *, int); +extern struct buffer_head *udf_tread(struct super_block *, int); +extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t, + uint32_t, uint8_t); +extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, + uint8_t); +extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, + uint32_t, uint16_t *); +extern struct buffer_head *udf_read_ptagged(struct super_block *, + struct kernel_lb_addr *, uint32_t, + uint16_t *); +extern void udf_update_tag(char *, int); +extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); + +/* lowlevel.c */ +extern unsigned int udf_get_last_session(struct super_block *); +extern unsigned long udf_get_last_block(struct super_block *); + +/* partition.c */ +extern uint32_t udf_get_pblock(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_virt15(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t, + uint32_t); +extern int udf_relocate_blocks(struct super_block *, long, long *); + +static inline uint32_t +udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc, + uint32_t offset) +{ + return udf_get_pblock(sb, loc->logicalBlockNum, + loc->partitionReferenceNum, offset); 
+} + +/* unicode.c */ +extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); +extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, + int); +extern int udf_build_ustr(struct ustr *, dstring *, int); +extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); + +/* ialloc.c */ +extern void udf_free_inode(struct inode *); +extern struct inode *udf_new_inode(struct inode *, int, int *); + +/* truncate.c */ +extern void udf_truncate_tail_extent(struct inode *); +extern void udf_discard_prealloc(struct inode *); +extern void udf_truncate_extents(struct inode *); + +/* balloc.c */ +extern void udf_free_blocks(struct super_block *, struct inode *, + struct kernel_lb_addr *, uint32_t, uint32_t); +extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, + uint32_t, uint32_t); +extern int udf_new_block(struct super_block *, struct inode *, uint16_t, + uint32_t, int *); + +/* directory.c */ +extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, + struct udf_fileident_bh *, + struct fileIdentDesc *, + struct extent_position *, + struct kernel_lb_addr *, uint32_t *, + sector_t *); +extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, + int *offset); +extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); +extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); + +/* udftime.c */ +extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, + struct timestamp src); +extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src); + +#endif /* __UDF_DECL_H */ diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h new file mode 100644 index 00000000..6a9f3a9c --- /dev/null +++ b/fs/udf/udfend.h @@ -0,0 +1,77 @@ +#ifndef __UDF_ENDIAN_H +#define __UDF_ENDIAN_H + +#include +#include + +static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in) +{ + struct kernel_lb_addr out; + + out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum); + out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum); + + return out; +} + +static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in) +{ + struct lb_addr out; + + out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum); + out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum); + + return out; +} + +static inline struct short_ad lesa_to_cpu(struct short_ad in) +{ + struct short_ad out; + + out.extLength = le32_to_cpu(in.extLength); + out.extPosition = le32_to_cpu(in.extPosition); + + return out; +} + +static inline struct short_ad cpu_to_lesa(struct short_ad in) +{ + struct short_ad out; + + out.extLength = cpu_to_le32(in.extLength); + out.extPosition = cpu_to_le32(in.extPosition); + + return out; +} + +static inline struct kernel_long_ad lela_to_cpu(struct long_ad in) +{ + struct kernel_long_ad out; + + out.extLength = le32_to_cpu(in.extLength); + out.extLocation = lelb_to_cpu(in.extLocation); + + return out; +} + +static inline struct long_ad cpu_to_lela(struct kernel_long_ad in) +{ + struct long_ad out; + + out.extLength = cpu_to_le32(in.extLength); + out.extLocation = cpu_to_lelb(in.extLocation); + + return out; +} + +static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in) +{ + struct kernel_extent_ad out; + + out.extLength = le32_to_cpu(in.extLength); + out.extLocation = le32_to_cpu(in.extLocation); + + return out; +} + +#endif /* __UDF_ENDIAN_H */ diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c new file mode 100644 index 
00000000..b8c828c4 --- /dev/null +++ b/fs/udf/udftime.c @@ -0,0 +1,171 @@ +/* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Paul Eggert (eggert@twinsun.com). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* + * dgb 10/02/98: ripped this from glibc source to help convert timestamps + * to unix time + * 10/04/98: added new table-based lookup after seeing how ugly + * the gnu code is + * blf 09/27/99: ripped out all the old code and inserted new table from + * John Brockmeyer (without leap second corrections) + * rewrote udf_stamp_to_time and fixed timezone accounting in + * udf_time_to_stamp. + */ + +/* + * We don't take into account leap seconds. This may be correct or incorrect. + * For more NIST information (especially dealing with leap seconds), see: + * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm + */ + +#include +#include +#include "udfdecl.h" + +#define EPOCH_YEAR 1970 + +#ifndef __isleap +/* Nonzero if YEAR is a leap year (every 4 years, + except every 100th isn't, and every 400th is). */ +#define __isleap(year) \ + ((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0)) +#endif + +/* How many days come before each month (0-12). */ +static const unsigned short int __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. 
*/ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define MAX_YEAR_SECONDS 69 +#define SPD 0x15180 /*3600*24 */ +#define SPY(y, l, s) (SPD * (365 * y + l) + s) + +static time_t year_seconds[MAX_YEAR_SECONDS] = { +/*1970*/ SPY(0, 0, 0), SPY(1, 0, 0), SPY(2, 0, 0), SPY(3, 1, 0), +/*1974*/ SPY(4, 1, 0), SPY(5, 1, 0), SPY(6, 1, 0), SPY(7, 2, 0), +/*1978*/ SPY(8, 2, 0), SPY(9, 2, 0), SPY(10, 2, 0), SPY(11, 3, 0), +/*1982*/ SPY(12, 3, 0), SPY(13, 3, 0), SPY(14, 3, 0), SPY(15, 4, 0), +/*1986*/ SPY(16, 4, 0), SPY(17, 4, 0), SPY(18, 4, 0), SPY(19, 5, 0), +/*1990*/ SPY(20, 5, 0), SPY(21, 5, 0), SPY(22, 5, 0), SPY(23, 6, 0), +/*1994*/ SPY(24, 6, 0), SPY(25, 6, 0), SPY(26, 6, 0), SPY(27, 7, 0), +/*1998*/ SPY(28, 7, 0), SPY(29, 7, 0), SPY(30, 7, 0), SPY(31, 8, 0), +/*2002*/ SPY(32, 8, 0), SPY(33, 8, 0), SPY(34, 8, 0), SPY(35, 9, 0), +/*2006*/ SPY(36, 9, 0), SPY(37, 9, 0), SPY(38, 9, 0), SPY(39, 10, 0), +/*2010*/ SPY(40, 10, 0), SPY(41, 10, 0), SPY(42, 10, 0), SPY(43, 11, 0), +/*2014*/ SPY(44, 11, 0), SPY(45, 11, 0), SPY(46, 11, 0), SPY(47, 12, 0), +/*2018*/ SPY(48, 12, 0), SPY(49, 12, 0), SPY(50, 12, 0), SPY(51, 13, 0), +/*2022*/ SPY(52, 13, 0), SPY(53, 13, 0), SPY(54, 13, 0), SPY(55, 14, 0), +/*2026*/ SPY(56, 14, 0), SPY(57, 14, 0), SPY(58, 14, 0), SPY(59, 15, 0), +/*2030*/ SPY(60, 15, 0), SPY(61, 15, 0), SPY(62, 15, 0), SPY(63, 16, 0), +/*2034*/ SPY(64, 16, 0), SPY(65, 16, 0), SPY(66, 16, 0), SPY(67, 17, 0), +/*2038*/ SPY(68, 17, 0) +}; + +extern struct timezone sys_tz; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +struct timespec * +udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src) +{ + int yday; + u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); + u16 year = le16_to_cpu(src.year); + uint8_t type = typeAndTimezone >> 12; + int16_t offset; + + if (type == 1) { + offset = typeAndTimezone << 4; + /* sign extent offset */ + offset = (offset >> 4); + if (offset == -2047) /* unspecified offset */ + offset = 0; + } else + offset = 0; + + if ((year < EPOCH_YEAR) || + (year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) { + return NULL; + } + dest->tv_sec = year_seconds[year - EPOCH_YEAR]; + dest->tv_sec -= offset * 60; + + yday = ((__mon_yday[__isleap(year)][src.month - 1]) + src.day - 1); + dest->tv_sec += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second; + dest->tv_nsec = 1000 * (src.centiseconds * 10000 + + src.hundredsOfMicroseconds * 100 + src.microseconds); + return dest; +} + +struct timestamp * +udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts) +{ + long int days, rem, y; + const unsigned short int *ip; + int16_t offset; + + offset = -sys_tz.tz_minuteswest; + + if (!dest) + return NULL; + + dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF)); + + ts.tv_sec += offset * 60; + days = ts.tv_sec / SECS_PER_DAY; + rem = ts.tv_sec % SECS_PER_DAY; + dest->hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + dest->minute = rem / 60; + dest->second = rem % 60; + y = 1970; + +#define DIV(a, b) ((a) / (b) - ((a) % (b) < 0)) +#define LEAPS_THRU_END_OF(y) (DIV (y, 4) - DIV (y, 100) + DIV (y, 400)) + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + long int yg = y + days / 365 - (days % 365 < 0); + + /* Adjust DAYS and Y to match the guessed year. 
*/ + days -= ((yg - y) * 365 + + LEAPS_THRU_END_OF(yg - 1) + - LEAPS_THRU_END_OF(y - 1)); + y = yg; + } + dest->year = cpu_to_le16(y); + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < (long int)ip[y]; --y) + continue; + days -= ip[y]; + dest->month = y + 1; + dest->day = days + 1; + + dest->centiseconds = ts.tv_nsec / 10000000; + dest->hundredsOfMicroseconds = (ts.tv_nsec / 1000 - + dest->centiseconds * 10000) / 100; + dest->microseconds = (ts.tv_nsec / 1000 - dest->centiseconds * 10000 - + dest->hundredsOfMicroseconds * 100); + return dest; +} + +/* EOF */ diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c new file mode 100644 index 00000000..d03a90b6 --- /dev/null +++ b/fs/udf/unicode.c @@ -0,0 +1,493 @@ +/* + * unicode.c + * + * PURPOSE + * Routines for converting between UTF-8 and OSTA Compressed Unicode. + * Also handles filename mangling + * + * DESCRIPTION + * OSTA Compressed Unicode is explained in the OSTA UDF specification. + * http://www.osta.org/ + * UTF-8 is explained in the IETF RFC XXXX. + * ftp://ftp.internic.net/rfc/rfcxxxx.txt + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + */ + +#include "udfdecl.h" + +#include +#include /* for memset */ +#include +#include +#include + +#include "udf_sb.h" + +static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int); + +static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) +{ + if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2)) + return 0; + + memset(dest, 0, sizeof(struct ustr)); + memcpy(dest->u_name, src, strlen); + dest->u_cmpID = 0x08; + dest->u_len = strlen; + + return strlen; +} + +/* + * udf_build_ustr + */ +int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) +{ + int usesize; + + if (!dest || !ptr || !size) + return -1; + BUG_ON(size < 2); + + usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name)); + usesize = min(usesize, size - 2); + dest->u_cmpID = ptr[0]; + dest->u_len = usesize; + memcpy(dest->u_name, ptr + 1, usesize); + memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize); + + return 0; +} + +/* + * udf_build_ustr_exact + */ +static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) +{ + if ((!dest) || (!ptr) || (!exactsize)) + return -1; + + memset(dest, 0, sizeof(struct ustr)); + dest->u_cmpID = ptr[0]; + dest->u_len = exactsize - 1; + memcpy(dest->u_name, ptr + 1, exactsize - 1); + + return 0; +} + +/* + * udf_ocu_to_utf8 + * + * PURPOSE + * Convert OSTA Compressed Unicode to the UTF-8 equivalent. + * + * PRE-CONDITIONS + * utf Pointer to UTF-8 output buffer. + * ocu Pointer to OSTA Compressed Unicode input buffer + * of size UDF_NAME_LEN bytes. + * both of type "struct ustr *" + * + * POST-CONDITIONS + * Zero on success. + * + * HISTORY + * November 12, 1997 - Andrew E. Mileski + * Written, tested, and released. 
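Illustrative aside, not part of the patch: two details of the timestamp conversion above are easy to miss. The timezone lives in the low 12 bits of typeAndTimezone as a signed minutes value, recovered by shifting left four and arithmetic-shifting right four, and tv_nsec is split across three one-byte fields (centiseconds, hundreds of microseconds, microseconds). A stand-alone sketch with invented demo_* names:

#include <stdint.h>
#include <stdio.h>

/* Recover the signed 12-bit timezone offset (minutes) from typeAndTimezone,
 * using the same shift-left-then-arithmetic-shift-right trick as the patch. */
static int16_t demo_tz_offset(uint16_t type_and_tz)
{
	int16_t offset = (int16_t)(type_and_tz << 4);

	offset >>= 4;          /* sign-extend the low 12 bits */
	if (offset == -2047)   /* -2047 means "offset not specified" */
		offset = 0;
	return offset;
}

/* Split nanoseconds into the three UDF sub-second fields. */
static void demo_split_nsec(long nsec, uint8_t *csec, uint8_t *husec, uint8_t *usec)
{
	*csec = nsec / 10000000;                             /* centiseconds */
	*husec = (nsec / 1000 - *csec * 10000) / 100;        /* hundreds of microseconds */
	*usec = nsec / 1000 - *csec * 10000 - *husec * 100;  /* microseconds */
}

int main(void)
{
	uint16_t tz = 0x1000 | ((-120) & 0x0FFF);  /* type 1, UTC minus 2 hours */
	uint8_t cs, hus, us;

	demo_split_nsec(123456789L, &cs, &hus, &us);
	/* prints: offset=-120 min, cs=12 hus=34 us=56 */
	printf("offset=%d min, cs=%d hus=%d us=%d\n", demo_tz_offset(tz), cs, hus, us);
	return 0;
}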
+ */ +int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) +{ + const uint8_t *ocu; + uint8_t cmp_id, ocu_len; + int i; + + ocu_len = ocu_i->u_len; + if (ocu_len == 0) { + memset(utf_o, 0, sizeof(struct ustr)); + return 0; + } + + cmp_id = ocu_i->u_cmpID; + if (cmp_id != 8 && cmp_id != 16) { + memset(utf_o, 0, sizeof(struct ustr)); + printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + cmp_id, ocu_i->u_name); + return 0; + } + + ocu = ocu_i->u_name; + utf_o->u_len = 0; + for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { + + /* Expand OSTA compressed Unicode to Unicode */ + uint32_t c = ocu[i++]; + if (cmp_id == 16) + c = (c << 8) | ocu[i++]; + + /* Compress Unicode to UTF-8 */ + if (c < 0x80U) + utf_o->u_name[utf_o->u_len++] = (uint8_t)c; + else if (c < 0x800U) { + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0xc0 | (c >> 6)); + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0x80 | (c & 0x3f)); + } else { + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0xe0 | (c >> 12)); + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0x80 | + ((c >> 6) & 0x3f)); + utf_o->u_name[utf_o->u_len++] = + (uint8_t)(0x80 | (c & 0x3f)); + } + } + utf_o->u_cmpID = 8; + + return utf_o->u_len; +} + +/* + * + * udf_utf8_to_ocu + * + * PURPOSE + * Convert UTF-8 to the OSTA Compressed Unicode equivalent. + * + * DESCRIPTION + * This routine is only called by udf_lookup(). + * + * PRE-CONDITIONS + * ocu Pointer to OSTA Compressed Unicode output + * buffer of size UDF_NAME_LEN bytes. + * utf Pointer to UTF-8 input buffer. + * utf_len Length of UTF-8 input buffer in bytes. + * + * POST-CONDITIONS + * Zero on success. + * + * HISTORY + * November 12, 1997 - Andrew E. Mileski + * Written, tested, and released. + */ +static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) +{ + unsigned c, i, max_val, utf_char; + int utf_cnt, u_len; + + memset(ocu, 0, sizeof(dstring) * length); + ocu[0] = 8; + max_val = 0xffU; + +try_again: + u_len = 0U; + utf_char = 0U; + utf_cnt = 0U; + for (i = 0U; i < utf->u_len; i++) { + c = (uint8_t)utf->u_name[i]; + + /* Complete a multi-byte UTF-8 character */ + if (utf_cnt) { + utf_char = (utf_char << 6) | (c & 0x3fU); + if (--utf_cnt) + continue; + } else { + /* Check for a multi-byte UTF-8 character */ + if (c & 0x80U) { + /* Start a multi-byte UTF-8 character */ + if ((c & 0xe0U) == 0xc0U) { + utf_char = c & 0x1fU; + utf_cnt = 1; + } else if ((c & 0xf0U) == 0xe0U) { + utf_char = c & 0x0fU; + utf_cnt = 2; + } else if ((c & 0xf8U) == 0xf0U) { + utf_char = c & 0x07U; + utf_cnt = 3; + } else if ((c & 0xfcU) == 0xf8U) { + utf_char = c & 0x03U; + utf_cnt = 4; + } else if ((c & 0xfeU) == 0xfcU) { + utf_char = c & 0x01U; + utf_cnt = 5; + } else { + goto error_out; + } + continue; + } else { + /* Single byte UTF-8 character (most common) */ + utf_char = c; + } + } + + /* Choose no compression if necessary */ + if (utf_char > max_val) { + if (max_val == 0xffU) { + max_val = 0xffffU; + ocu[0] = (uint8_t)0x10U; + goto try_again; + } + goto error_out; + } + + if (max_val == 0xffffU) + ocu[++u_len] = (uint8_t)(utf_char >> 8); + ocu[++u_len] = (uint8_t)(utf_char & 0xffU); + } + + if (utf_cnt) { +error_out: + ocu[++u_len] = '?'; + printk(KERN_DEBUG "udf: bad UTF-8 character\n"); + } + + ocu[length - 1] = (uint8_t)u_len + 1; + + return u_len + 1; +} + +static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, + const struct ustr *ocu_i) +{ + const uint8_t *ocu; + uint8_t cmp_id, ocu_len; + int i, len; + + + ocu_len = ocu_i->u_len; + if (ocu_len == 0) { + 
memset(utf_o, 0, sizeof(struct ustr)); + return 0; + } + + cmp_id = ocu_i->u_cmpID; + if (cmp_id != 8 && cmp_id != 16) { + memset(utf_o, 0, sizeof(struct ustr)); + printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + cmp_id, ocu_i->u_name); + return 0; + } + + ocu = ocu_i->u_name; + utf_o->u_len = 0; + for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { + /* Expand OSTA compressed Unicode to Unicode */ + uint32_t c = ocu[i++]; + if (cmp_id == 16) + c = (c << 8) | ocu[i++]; + + len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], + UDF_NAME_LEN - utf_o->u_len); + /* Valid character? */ + if (len >= 0) + utf_o->u_len += len; + else + utf_o->u_name[utf_o->u_len++] = '?'; + } + utf_o->u_cmpID = 8; + + return utf_o->u_len; +} + +static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, + int length) +{ + int len; + unsigned i, max_val; + uint16_t uni_char; + int u_len; + + memset(ocu, 0, sizeof(dstring) * length); + ocu[0] = 8; + max_val = 0xffU; + +try_again: + u_len = 0U; + for (i = 0U; i < uni->u_len; i++) { + len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); + if (!len) + continue; + /* Invalid character, deal with it */ + if (len < 0) { + len = 1; + uni_char = '?'; + } + + if (uni_char > max_val) { + max_val = 0xffffU; + ocu[0] = (uint8_t)0x10U; + goto try_again; + } + + if (max_val == 0xffffU) + ocu[++u_len] = (uint8_t)(uni_char >> 8); + ocu[++u_len] = (uint8_t)(uni_char & 0xffU); + i += len - 1; + } + + ocu[length - 1] = (uint8_t)u_len + 1; + return u_len + 1; +} + +int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, + int flen) +{ + struct ustr *filename, *unifilename; + int len = 0; + + filename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!filename) + return 0; + + unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS); + if (!unifilename) + goto out1; + + if (udf_build_ustr_exact(unifilename, sname, flen)) + goto out2; + + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { + if (!udf_CS0toUTF8(filename, unifilename)) { + udf_debug("Failed in udf_get_filename: sname = %s\n", + sname); + goto out2; + } + } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { + if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, + unifilename)) { + udf_debug("Failed in udf_get_filename: sname = %s\n", + sname); + goto out2; + } + } else + goto out2; + + len = udf_translate_to_linux(dname, filename->u_name, filename->u_len, + unifilename->u_name, unifilename->u_len); +out2: + kfree(unifilename); +out1: + kfree(filename); + return len; +} + +int udf_put_filename(struct super_block *sb, const uint8_t *sname, + uint8_t *dname, int flen) +{ + struct ustr unifilename; + int namelen; + + if (!udf_char_to_ustr(&unifilename, sname, flen)) + return 0; + + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { + namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN); + if (!namelen) + return 0; + } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { + namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, + &unifilename, UDF_NAME_LEN); + if (!namelen) + return 0; + } else + return 0; + + return namelen; +} + +#define ILLEGAL_CHAR_MARK '_' +#define EXT_MARK '.' +#define CRC_MARK '#' +#define EXT_SIZE 5 + +static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, + int udfLen, uint8_t *fidName, + int fidNameLen) +{ + int index, newIndex = 0, needsCRC = 0; + int extIndex = 0, newExtIndex = 0, hasExt = 0; + unsigned short valueCRC; + uint8_t curr; + const uint8_t hexChar[] = "0123456789ABCDEF"; + + if (udfName[0] == '.' 
&& + (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) { + needsCRC = 1; + newIndex = udfLen; + memcpy(newName, udfName, udfLen); + } else { + for (index = 0; index < udfLen; index++) { + curr = udfName[index]; + if (curr == '/' || curr == 0) { + needsCRC = 1; + curr = ILLEGAL_CHAR_MARK; + while (index + 1 < udfLen && + (udfName[index + 1] == '/' || + udfName[index + 1] == 0)) + index++; + } + if (curr == EXT_MARK && + (udfLen - index - 1) <= EXT_SIZE) { + if (udfLen == index + 1) + hasExt = 0; + else { + hasExt = 1; + extIndex = index; + newExtIndex = newIndex; + } + } + if (newIndex < 256) + newName[newIndex++] = curr; + else + needsCRC = 1; + } + } + if (needsCRC) { + uint8_t ext[EXT_SIZE]; + int localExtIndex = 0; + + if (hasExt) { + int maxFilenameLen; + for (index = 0; + index < EXT_SIZE && extIndex + index + 1 < udfLen; + index++) { + curr = udfName[extIndex + index + 1]; + + if (curr == '/' || curr == 0) { + needsCRC = 1; + curr = ILLEGAL_CHAR_MARK; + while (extIndex + index + 2 < udfLen && + (index + 1 < EXT_SIZE && + (udfName[extIndex + index + 2] == '/' || + udfName[extIndex + index + 2] == 0))) + index++; + } + ext[localExtIndex++] = curr; + } + maxFilenameLen = 250 - localExtIndex; + if (newIndex > maxFilenameLen) + newIndex = maxFilenameLen; + else + newIndex = newExtIndex; + } else if (newIndex > 250) + newIndex = 250; + newName[newIndex++] = CRC_MARK; + valueCRC = crc_itu_t(0, fidName, fidNameLen); + newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; + newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; + newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; + newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; + + if (hasExt) { + newName[newIndex++] = EXT_MARK; + for (index = 0; index < localExtIndex; index++) + newName[newIndex++] = ext[index]; + } + } + + return newIndex; +} diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig new file mode 100644 index 00000000..e4f10a40 --- /dev/null +++ b/fs/ufs/Kconfig @@ -0,0 +1,43 @@ +config UFS_FS + tristate "UFS file system support (read only)" + depends on BLOCK + help + BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, + OpenBSD and NeXTstep) use a file system called UFS. Some System V + Unixes can create and mount hard disk partitions and diskettes using + this file system as well. Saying Y here will allow you to read from + these partitions; if you also want to write to them, say Y to the + experimental "UFS file system write support", below. Please read the + file for more information. + + The recently released UFS2 variant (used in FreeBSD 5.x) is + READ-ONLY supported. + + Note that this option is generally not needed for floppies, since a + good portable way to transport files and directories between unixes + (and even other operating systems) is given by the tar program ("man + tar" or preferably "info tar"). + + When accessing NeXTstep files, you may need to convert them from the + NeXT character set to the Latin1 character set; use the program + recode ("info recode") for this purpose. + + To compile the UFS file system support as a module, choose M here: the + module will be called ufs. + + If you haven't heard about all of this before, it's safe to say N. + +config UFS_FS_WRITE + bool "UFS file system write support (DANGEROUS)" + depends on UFS_FS && EXPERIMENTAL + help + Say Y here if you want to try writing to UFS partitions. This is + experimental, so you should back up your UFS partitions beforehand. 
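Illustrative aside, not part of the patch: when udf_translate_to_linux() has to mangle a name (illegal characters or an over-long name), it replaces the offending bytes with '_', truncates, and appends '#' plus four hex digits of a CRC computed over the raw on-disk name, re-attaching a short extension when there is one. The sketch below uses a bitwise CRC with the ITU-T polynomial (0x1021, zero seed), which is my understanding of what crc_itu_t(0, ...) computes; the demo_* helpers and the simplified length handling are only illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bitwise CRC over the buffer, polynomial 0x1021, zero seed (the kernel uses
 * a table-driven routine for the same result). */
static uint16_t demo_crc_itu_t(const uint8_t *buf, size_t len)
{
	uint16_t crc = 0;
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= (uint16_t)buf[i] << 8;
		for (bit = 0; bit < 8; bit++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x1021 : crc << 1;
	}
	return crc;
}

/* Toy version of the mangling idea: replace '/' with '_' and append "#xxxx",
 * the hex CRC of the raw name.  The real code also preserves a short
 * extension and enforces the 255/256 byte name limits. */
static void demo_mangle(const char *raw, char *out, size_t outsz)
{
	uint16_t crc = demo_crc_itu_t((const uint8_t *)raw, strlen(raw));
	size_t j = 0;

	for (size_t i = 0; raw[i] && j + 6 < outsz; i++)
		out[j++] = (raw[i] == '/') ? '_' : raw[i];
	snprintf(out + j, outsz - j, "#%04X", crc);
}

int main(void)
{
	char buf[64];

	demo_mangle("bad/name", buf, sizeof(buf));
	printf("%s\n", buf);   /* e.g. bad_name#<4 hex digits> */
	return 0;
}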
+ +config UFS_DEBUG + bool "UFS debugging" + depends on UFS_FS + help + If you are experiencing any problems with the UFS filesystem, say + Y here. This will result in _many_ additional debugging messages to be + written to the system log. diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile new file mode 100644 index 00000000..dd399804 --- /dev/null +++ b/fs/ufs/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux ufs filesystem routines. +# + +obj-$(CONFIG_UFS_FS) += ufs.o + +ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ + namei.o super.o symlink.o truncate.o util.o diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c new file mode 100644 index 00000000..42694e11 --- /dev/null +++ b/fs/ufs/balloc.c @@ -0,0 +1,953 @@ +/* + * linux/fs/ufs/balloc.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * UFS2 write support Evgeniy Dushistov , 2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +#define INVBLOCK ((u64)-1L) + +static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned, int *); +static u64 ufs_alloc_fragments(struct inode *, unsigned, u64, unsigned, int *); +static u64 ufs_alloccg_block(struct inode *, struct ufs_cg_private_info *, u64, int *); +static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *, u64, unsigned); +static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[]; +static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int); + +/* + * Free 'count' fragments from fragment number 'fragment' + */ +void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned cgno, bit, end_bit, bbase, blkmap, i; + u64 blkno; + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + UFSD("ENTER, fragment %llu, count %u\n", + (unsigned long long)fragment, count); + + if (ufs_fragnum(fragment) + count > uspi->s_fpg) + ufs_error (sb, "ufs_free_fragments", "internal error"); + + lock_super(sb); + + cgno = ufs_dtog(uspi, fragment); + bit = ufs_dtogd(uspi, fragment); + if (cgno >= uspi->s_ncg) { + ufs_panic (sb, "ufs_free_fragments", "freeing blocks are outside device"); + goto failed; + } + + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + goto failed; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) { + ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno); + goto failed; + } + + end_bit = bit + count; + bbase = ufs_blknum (bit); + blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase); + ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1); + for (i = bit; i < end_bit; i++) { + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i)) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i); + else + ufs_error (sb, "ufs_free_fragments", + "bit already cleared for fragment %u", i); + } + + fs32_add(sb, &ucg->cg_cs.cs_nffree, count); + uspi->cs_total.cs_nffree += count; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); + blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase); + ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1); + + /* + * Trying to reassemble free fragments into block + */ + blkno = ufs_fragstoblks (bbase); + if 
(ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb); + uspi->cs_total.cs_nffree -= uspi->s_fpb; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb); + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct (sb, ucpi, blkno, 1); + fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); + uspi->cs_total.cs_nbfree++; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno (bbase); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(bbase)), 1); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } + } + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + sb->s_dirt = 1; + + unlock_super (sb); + UFSD("EXIT\n"); + return; + +failed: + unlock_super (sb); + UFSD("EXIT (FAILED)\n"); + return; +} + +/* + * Free 'count' fragments from fragment number 'fragment' (free whole blocks) + */ +void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned overflow, cgno, bit, end_bit, i; + u64 blkno; + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + UFSD("ENTER, fragment %llu, count %u\n", + (unsigned long long)fragment, count); + + if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) { + ufs_error (sb, "ufs_free_blocks", "internal error, " + "fragment %llu, count %u\n", + (unsigned long long)fragment, count); + goto failed; + } + + lock_super(sb); + +do_more: + overflow = 0; + cgno = ufs_dtog(uspi, fragment); + bit = ufs_dtogd(uspi, fragment); + if (cgno >= uspi->s_ncg) { + ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device"); + goto failed_unlock; + } + end_bit = bit + count; + if (end_bit > uspi->s_fpg) { + overflow = bit + count - uspi->s_fpg; + count -= overflow; + end_bit -= overflow; + } + + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + goto failed_unlock; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) { + ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno); + goto failed_unlock; + } + + for (i = bit; i < end_bit; i += uspi->s_fpb) { + blkno = ufs_fragstoblks(i); + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); + } + ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct (sb, ucpi, blkno, 1); + + fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); + uspi->cs_total.cs_nbfree++; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno(i); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(i)), 1); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } + } + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + + if (overflow) { + fragment += count; + count = overflow; + goto do_more; + } + + sb->s_dirt = 1; + unlock_super (sb); + UFSD("EXIT\n"); + return; + +failed_unlock: + unlock_super (sb); +failed: + UFSD("EXIT (FAILED)\n"); + return; +} + +/* + * Modify inode page cache in such way: + * have - blocks with b_blocknr 
equal to oldb...oldb+count-1 + * get - blocks with b_blocknr equal to newb...newb+count-1 + * also we suppose that oldb...oldb+count-1 blocks + * situated at the end of file. + * + * We can come here from ufs_writepage or ufs_prepare_write, + * locked_page is argument of these functions, so we already lock it. + */ +static void ufs_change_blocknr(struct inode *inode, sector_t beg, + unsigned int count, sector_t oldb, + sector_t newb, struct page *locked_page) +{ + const unsigned blks_per_page = + 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); + const unsigned mask = blks_per_page - 1; + struct address_space * const mapping = inode->i_mapping; + pgoff_t index, cur_index, last_index; + unsigned pos, j, lblock; + sector_t end, i; + struct page *page; + struct buffer_head *head, *bh; + + UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n", + inode->i_ino, count, + (unsigned long long)oldb, (unsigned long long)newb); + + BUG_ON(!locked_page); + BUG_ON(!PageLocked(locked_page)); + + cur_index = locked_page->index; + end = count + beg; + last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + for (i = beg; i < end; i = (i | mask) + 1) { + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (likely(cur_index != index)) { + page = ufs_get_locked_page(mapping, index); + if (!page)/* it was truncated */ + continue; + if (IS_ERR(page)) {/* or EIO */ + ufs_error(inode->i_sb, __func__, + "read of page %llu failed\n", + (unsigned long long)index); + continue; + } + } else + page = locked_page; + + head = page_buffers(page); + bh = head; + pos = i & mask; + for (j = 0; j < pos; ++j) + bh = bh->b_this_page; + + + if (unlikely(index == last_index)) + lblock = end & mask; + else + lblock = blks_per_page; + + do { + if (j >= lblock) + break; + pos = (i - beg) + j; + + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, oldb + pos); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ufs_error(inode->i_sb, __func__, + "read of block failed\n"); + break; + } + } + + UFSD(" change from %llu to %llu, pos %u\n", + (unsigned long long)(pos + oldb), + (unsigned long long)(pos + newb), pos); + + bh->b_blocknr = newb + pos; + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + mark_buffer_dirty(bh); + ++j; + bh = bh->b_this_page; + } while (bh != head); + + if (likely(cur_index != index)) + ufs_put_locked_page(page); + } + UFSD("EXIT\n"); +} + +static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n, + int sync) +{ + struct buffer_head *bh; + sector_t end = beg + n; + + for (; beg < end; ++beg) { + bh = sb_getblk(inode->i_sb, beg); + lock_buffer(bh); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (IS_SYNC(inode) || sync) + sync_dirty_buffer(bh); + brelse(bh); + } +} + +u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, + u64 goal, unsigned count, int *err, + struct page *locked_page) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + unsigned cgno, oldcount, newcount; + u64 tmp, request, result; + + UFSD("ENTER, ino %lu, fragment %llu, goal %llu, count %u\n", + inode->i_ino, (unsigned long long)fragment, + (unsigned long long)goal, count); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + *err = -ENOSPC; + + lock_super (sb); + tmp = ufs_data_ptr_to_cpu(sb, p); + + if (count + ufs_fragnum(fragment) > uspi->s_fpb) { + ufs_warning(sb, 
"ufs_new_fragments", "internal warning" + " fragment %llu, count %u", + (unsigned long long)fragment, count); + count = uspi->s_fpb - ufs_fragnum(fragment); + } + oldcount = ufs_fragnum (fragment); + newcount = oldcount + count; + + /* + * Somebody else has just allocated our fragments + */ + if (oldcount) { + if (!tmp) { + ufs_error(sb, "ufs_new_fragments", "internal error, " + "fragment %llu, tmp %llu\n", + (unsigned long long)fragment, + (unsigned long long)tmp); + unlock_super(sb); + return INVBLOCK; + } + if (fragment < UFS_I(inode)->i_lastfrag) { + UFSD("EXIT (ALREADY ALLOCATED)\n"); + unlock_super (sb); + return 0; + } + } + else { + if (tmp) { + UFSD("EXIT (ALREADY ALLOCATED)\n"); + unlock_super(sb); + return 0; + } + } + + /* + * There is not enough space for user on the device + */ + if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { + unlock_super (sb); + UFSD("EXIT (FAILED)\n"); + return 0; + } + + if (goal >= uspi->s_size) + goal = 0; + if (goal == 0) + cgno = ufs_inotocg (inode->i_ino); + else + cgno = ufs_dtog(uspi, goal); + + /* + * allocate new fragment + */ + if (oldcount == 0) { + result = ufs_alloc_fragments (inode, cgno, goal, count, err); + if (result) { + ufs_cpu_to_data_ptr(sb, p, result); + *err = 0; + UFS_I(inode)->i_lastfrag = + max(UFS_I(inode)->i_lastfrag, fragment + count); + ufs_clear_frags(inode, result + oldcount, + newcount - oldcount, locked_page != NULL); + } + unlock_super(sb); + UFSD("EXIT, result %llu\n", (unsigned long long)result); + return result; + } + + /* + * resize block + */ + result = ufs_add_fragments (inode, tmp, oldcount, newcount, err); + if (result) { + *err = 0; + UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, + fragment + count); + ufs_clear_frags(inode, result + oldcount, newcount - oldcount, + locked_page != NULL); + unlock_super(sb); + UFSD("EXIT, result %llu\n", (unsigned long long)result); + return result; + } + + /* + * allocate new block and move data + */ + switch (fs32_to_cpu(sb, usb1->fs_optim)) { + case UFS_OPTSPACE: + request = newcount; + if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree + > uspi->s_dsize * uspi->s_minfree / (2 * 100)) + break; + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + break; + default: + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + + case UFS_OPTTIME: + request = uspi->s_fpb; + if (uspi->cs_total.cs_nffree < uspi->s_dsize * + (uspi->s_minfree - 2) / 100) + break; + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + break; + } + result = ufs_alloc_fragments (inode, cgno, goal, request, err); + if (result) { + ufs_clear_frags(inode, result + oldcount, newcount - oldcount, + locked_page != NULL); + ufs_change_blocknr(inode, fragment - oldcount, oldcount, + uspi->s_sbbase + tmp, + uspi->s_sbbase + result, locked_page); + ufs_cpu_to_data_ptr(sb, p, result); + *err = 0; + UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, + fragment + count); + unlock_super(sb); + if (newcount < request) + ufs_free_fragments (inode, result + newcount, request - newcount); + ufs_free_fragments (inode, tmp, oldcount); + UFSD("EXIT, result %llu\n", (unsigned long long)result); + return result; + } + + unlock_super(sb); + UFSD("EXIT (FAILED)\n"); + return 0; +} + +static u64 ufs_add_fragments(struct inode *inode, u64 fragment, + unsigned oldcount, unsigned newcount, int *err) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned cgno, fragno, 
fragoff, count, fragsize, i; + + UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", + (unsigned long long)fragment, oldcount, newcount); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first (uspi); + count = newcount - oldcount; + + cgno = ufs_dtog(uspi, fragment); + if (fs32_to_cpu(sb, UFS_SB(sb)->fs_cs(cgno).cs_nffree) < count) + return 0; + if ((ufs_fragnum (fragment) + newcount) > uspi->s_fpb) + return 0; + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + return 0; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) { + ufs_panic (sb, "ufs_add_fragments", + "internal error, bad magic number on cg %u", cgno); + return 0; + } + + fragno = ufs_dtogd(uspi, fragment); + fragoff = ufs_fragnum (fragno); + for (i = oldcount; i < newcount; i++) + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) + return 0; + /* + * Block can be extended + */ + ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + for (i = newcount; i < (uspi->s_fpb - fragoff); i++) + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) + break; + fragsize = i - oldcount; + if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize])) + ufs_panic (sb, "ufs_add_fragments", + "internal error or corrupted bitmap on cg %u", cgno); + fs32_sub(sb, &ucg->cg_frsum[fragsize], 1); + if (fragsize != count) + fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); + for (i = oldcount; i < newcount; i++) + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); + + fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); + uspi->cs_total.cs_nffree -= count; + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + sb->s_dirt = 1; + + UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment); + + return fragment; +} + +#define UFS_TEST_FREE_SPACE_CG \ + ucg = (struct ufs_cylinder_group *) UFS_SB(sb)->s_ucg[cgno]->b_data; \ + if (fs32_to_cpu(sb, ucg->cg_cs.cs_nbfree)) \ + goto cg_found; \ + for (k = count; k < uspi->s_fpb; k++) \ + if (fs32_to_cpu(sb, ucg->cg_frsum[k])) \ + goto cg_found; + +static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, + u64 goal, unsigned count, int *err) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned oldcg, i, j, k, allocsize; + u64 result; + + UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", + inode->i_ino, cgno, (unsigned long long)goal, count); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + oldcg = cgno; + + /* + * 1. searching on preferred cylinder group + */ + UFS_TEST_FREE_SPACE_CG + + /* + * 2. quadratic rehash + */ + for (j = 1; j < uspi->s_ncg; j *= 2) { + cgno += j; + if (cgno >= uspi->s_ncg) + cgno -= uspi->s_ncg; + UFS_TEST_FREE_SPACE_CG + } + + /* + * 3. 
brute force search + * We start at i = 2 ( 0 is checked at 1.step, 1 at 2.step ) + */ + cgno = (oldcg + 1) % uspi->s_ncg; + for (j = 2; j < uspi->s_ncg; j++) { + cgno++; + if (cgno >= uspi->s_ncg) + cgno = 0; + UFS_TEST_FREE_SPACE_CG + } + + UFSD("EXIT (FAILED)\n"); + return 0; + +cg_found: + ucpi = ufs_load_cylinder (sb, cgno); + if (!ucpi) + return 0; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) + ufs_panic (sb, "ufs_alloc_fragments", + "internal error, bad magic number on cg %u", cgno); + ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + + if (count == uspi->s_fpb) { + result = ufs_alloccg_block (inode, ucpi, goal, err); + if (result == INVBLOCK) + return 0; + goto succed; + } + + for (allocsize = count; allocsize < uspi->s_fpb; allocsize++) + if (fs32_to_cpu(sb, ucg->cg_frsum[allocsize]) != 0) + break; + + if (allocsize == uspi->s_fpb) { + result = ufs_alloccg_block (inode, ucpi, goal, err); + if (result == INVBLOCK) + return 0; + goal = ufs_dtogd(uspi, result); + for (i = count; i < uspi->s_fpb; i++) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); + i = uspi->s_fpb - count; + + fs32_add(sb, &ucg->cg_cs.cs_nffree, i); + uspi->cs_total.cs_nffree += i; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); + fs32_add(sb, &ucg->cg_frsum[i], 1); + goto succed; + } + + result = ufs_bitmap_search (sb, ucpi, goal, allocsize); + if (result == INVBLOCK) + return 0; + for (i = 0; i < count; i++) + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); + + fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); + uspi->cs_total.cs_nffree -= count; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); + fs32_sub(sb, &ucg->cg_frsum[allocsize], 1); + + if (count != allocsize) + fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1); + +succed: + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + sb->s_dirt = 1; + + result += cgno * uspi->s_fpg; + UFSD("EXIT3, result %llu\n", (unsigned long long)result); + return result; +} + +static u64 ufs_alloccg_block(struct inode *inode, + struct ufs_cg_private_info *ucpi, + u64 goal, int *err) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cylinder_group * ucg; + u64 result, blkno; + + UFSD("ENTER, goal %llu\n", (unsigned long long)goal); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + + if (goal == 0) { + goal = ucpi->c_rotor; + goto norot; + } + goal = ufs_blknum (goal); + goal = ufs_dtogd(uspi, goal); + + /* + * If the requested block is available, use it. 
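Illustrative aside, not part of the patch: ufs_alloc_fragments() probes cylinder groups in three phases, the preferred group first, then a quadratic rehash with doubling strides, then a linear sweep starting just past the preferred group. The toy below only reproduces that probe order so it can be read in isolation; the names are invented.

#include <stdio.h>

/* Print the cylinder-group probe order for ncg groups and a preferred group. */
static void demo_cg_probe_order(unsigned ncg, unsigned preferred)
{
	unsigned cgno = preferred, j;

	printf("%u ", cgno);                    /* 1. preferred cylinder group */

	for (j = 1; j < ncg; j *= 2) {          /* 2. quadratic rehash */
		cgno += j;
		if (cgno >= ncg)
			cgno -= ncg;
		printf("%u ", cgno);
	}

	cgno = (preferred + 1) % ncg;           /* 3. brute force sweep */
	for (j = 2; j < ncg; j++) {
		cgno++;
		if (cgno >= ncg)
			cgno = 0;
		printf("%u ", cgno);
	}
	printf("\n");
}

int main(void)
{
	/* prints: 3 4 6 2 (rehash) followed by 5 6 7 0 1 2 (sweep) */
	demo_cg_probe_order(8, 3);
	return 0;
}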
+ */ + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) { + result = goal; + goto gotit; + } + +norot: + result = ufs_bitmap_search (sb, ucpi, goal, uspi->s_fpb); + if (result == INVBLOCK) + return INVBLOCK; + ucpi->c_rotor = result; +gotit: + blkno = ufs_fragstoblks(result); + ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct (sb, ucpi, blkno, -1); + + fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); + uspi->cs_total.cs_nbfree--; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno((unsigned)result); + + fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos((unsigned)result)), 1); + fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } + + UFSD("EXIT, result %llu\n", (unsigned long long)result); + + return result; +} + +static unsigned ubh_scanc(struct ufs_sb_private_info *uspi, + struct ufs_buffer_head *ubh, + unsigned begin, unsigned size, + unsigned char *table, unsigned char mask) +{ + unsigned rest, offset; + unsigned char *cp; + + + offset = begin & ~uspi->s_fmask; + begin >>= uspi->s_fshift; + for (;;) { + if ((offset + size) < uspi->s_fsize) + rest = size; + else + rest = uspi->s_fsize - offset; + size -= rest; + cp = ubh->bh[begin]->b_data + offset; + while ((table[*cp++] & mask) == 0 && --rest) + ; + if (rest || !size) + break; + begin++; + offset = 0; + } + return (size + rest); +} + +/* + * Find a block of the specified size in the specified cylinder group. + * @sp: pointer to super block + * @ucpi: pointer to cylinder group info + * @goal: near which block we want find new one + * @count: specified size + */ +static u64 ufs_bitmap_search(struct super_block *sb, + struct ufs_cg_private_info *ucpi, + u64 goal, unsigned count) +{ + /* + * Bit patterns for identifying fragments in the block map + * used as ((map & mask_arr) == want_arr) + */ + static const int mask_arr[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff + }; + static const int want_arr[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe + }; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_cylinder_group *ucg; + unsigned start, length, loc; + unsigned pos, want, blockmap, mask, end; + u64 result; + + UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, + (unsigned long long)goal, count); + + usb1 = ubh_get_usb_first (uspi); + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + + if (goal) + start = ufs_dtogd(uspi, goal) >> 3; + else + start = ucpi->c_frotor >> 3; + + length = ((uspi->s_fpg + 7) >> 3) - start; + loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length, + (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, + 1 << (count - 1 + (uspi->s_fpb & 7))); + if (loc == 0) { + length = start + 1; + loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length, + (uspi->s_fpb == 8) ? 
ufs_fragtable_8fpb : + ufs_fragtable_other, + 1 << (count - 1 + (uspi->s_fpb & 7))); + if (loc == 0) { + ufs_error(sb, "ufs_bitmap_search", + "bitmap corrupted on cg %u, start %u," + " length %u, count %u, freeoff %u\n", + ucpi->c_cgx, start, length, count, + ucpi->c_freeoff); + return INVBLOCK; + } + start = 0; + } + result = (start + length - loc) << 3; + ucpi->c_frotor = result; + + /* + * found the byte in the map + */ + + for (end = result + 8; result < end; result += uspi->s_fpb) { + blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result); + blockmap <<= 1; + mask = mask_arr[count]; + want = want_arr[count]; + for (pos = 0; pos <= uspi->s_fpb - count; pos++) { + if ((blockmap & mask) == want) { + UFSD("EXIT, result %llu\n", + (unsigned long long)result); + return result + pos; + } + mask <<= 1; + want <<= 1; + } + } + + ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n", + ucpi->c_cgx); + UFSD("EXIT (FAILED)\n"); + return INVBLOCK; +} + +static void ufs_clusteracct(struct super_block * sb, + struct ufs_cg_private_info * ucpi, unsigned blkno, int cnt) +{ + struct ufs_sb_private_info * uspi; + int i, start, end, forw, back; + + uspi = UFS_SB(sb)->s_uspi; + if (uspi->s_contigsumsize <= 0) + return; + + if (cnt > 0) + ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno); + else + ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno); + + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + uspi->s_contigsumsize; + if ( end >= ucpi->c_nclusterblks) + end = ucpi->c_nclusterblks; + i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start); + if (i > end) + i = end; + forw = i - start; + + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - uspi->s_contigsumsize; + if (end < 0 ) + end = -1; + i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end); + if ( i < end) + i = end; + back = start - i; + + /* + * Account for old cluster and the possibly new forward and + * back clusters. 
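Illustrative aside, not part of the patch: the mask_arr/want_arr pair in ufs_bitmap_search() encodes "count free fragments bounded by allocated fragments or the block edge" as a bit pattern, and the initial blockmap <<= 1 supplies the artificial boundary below position 0. A stand-alone version of that inner match for an 8-fragments-per-block map, with set bits meaning free fragments:

#include <stdio.h>

/* Return the position of the first run of exactly `count` free fragments in
 * an 8-bit free map, or -1 if there is none (count must be 1..8 here). */
static int demo_find_frag_run(unsigned char freemap, unsigned count)
{
	static const int mask_arr[9] = {
		0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
	};
	static const int want_arr[9] = {
		0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
	};
	unsigned blockmap = (unsigned)freemap << 1;   /* fake boundary below bit 0 */
	int mask, want;
	unsigned pos;

	if (count < 1 || count > 8)
		return -1;
	mask = mask_arr[count];
	want = want_arr[count];
	for (pos = 0; pos <= 8 - count; pos++) {
		if ((blockmap & mask) == want)
			return pos;
		mask <<= 1;
		want <<= 1;
	}
	return -1;
}

int main(void)
{
	/* 0x36 = 0b00110110: free runs at fragments 1-2 and 4-5. */
	printf("run of 2 starts at %d\n", demo_find_frag_run(0x36, 2));  /* 1 */
	printf("run of 3 starts at %d\n", demo_find_frag_run(0x36, 3));  /* -1 */
	return 0;
}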
+ */ + i = back + forw + 1; + if (i > uspi->s_contigsumsize) + i = uspi->s_contigsumsize; + fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt); + if (back > 0) + fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt); + if (forw > 0) + fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt); +} + + +static unsigned char ufs_fragtable_8fpb[] = { + 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0A, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x08, 0x09, 0x09, 0x0A, 0x10, 0x11, 0x20, 0x40, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0A, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0A, 0x12, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0C, + 0x08, 0x09, 0x09, 0x0A, 0x09, 0x09, 0x0A, 0x0C, 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, +}; + +static unsigned char ufs_fragtable_other[] = { + 0x00, 0x16, 0x16, 0x2A, 0x16, 0x16, 0x26, 0x4E, 0x16, 0x16, 0x16, 0x3E, 0x2A, 0x3E, 0x4E, 0x8A, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x2A, 0x3E, 0x3E, 0x2A, 0x3E, 0x3E, 0x2E, 0x6E, 0x3E, 0x3E, 0x3E, 0x3E, 0x2A, 0x3E, 0x6E, 0xAA, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x26, 0x36, 0x36, 0x2E, 0x36, 0x36, 0x26, 0x6E, 0x36, 0x36, 0x36, 0x3E, 0x2E, 0x3E, 0x6E, 0xAE, + 0x4E, 0x5E, 0x5E, 0x6E, 0x5E, 0x5E, 0x6E, 0x4E, 0x5E, 0x5E, 0x5E, 0x7E, 0x6E, 0x7E, 0x4E, 0xCE, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x16, 0x16, 0x16, 0x3E, 0x16, 0x16, 0x36, 0x5E, 0x16, 0x16, 0x16, 0x3E, 0x3E, 0x3E, 0x5E, 0x9E, + 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0xBE, + 0x2A, 0x3E, 0x3E, 0x2A, 0x3E, 0x3E, 0x2E, 0x6E, 0x3E, 0x3E, 0x3E, 0x3E, 0x2A, 0x3E, 0x6E, 0xAA, + 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x3E, 0x7E, 0xBE, + 0x4E, 0x5E, 0x5E, 0x6E, 0x5E, 0x5E, 0x6E, 0x4E, 0x5E, 0x5E, 0x5E, 0x7E, 0x6E, 0x7E, 0x4E, 
0xCE, + 0x8A, 0x9E, 0x9E, 0xAA, 0x9E, 0x9E, 0xAE, 0xCE, 0x9E, 0x9E, 0x9E, 0xBE, 0xAA, 0xBE, 0xCE, 0x8A, +}; diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c new file mode 100644 index 00000000..b4676322 --- /dev/null +++ b/fs/ufs/cylinder.c @@ -0,0 +1,201 @@ +/* + * linux/fs/ufs/cylinder.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * ext2 - inode (block) bitmap caching inspired + */ + +#include +#include +#include +#include +#include + +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * Read cylinder group into cache. The memory space for ufs_cg_private_info + * structure is already allocated during ufs_read_super. + */ +static void ufs_read_cylinder (struct super_block * sb, + unsigned cgno, unsigned bitmap_nr) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned i, j; + + UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr); + uspi = sbi->s_uspi; + ucpi = sbi->s_ucpi[bitmap_nr]; + ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data; + + UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno); + UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits; + /* + * We have already the first fragment of cylinder group block in buffer + */ + UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno]; + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) + if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i))) + goto failed; + sbi->s_cgno[bitmap_nr] = cgno; + + ucpi->c_cgx = fs32_to_cpu(sb, ucg->cg_cgx); + ucpi->c_ncyl = fs16_to_cpu(sb, ucg->cg_ncyl); + ucpi->c_niblk = fs16_to_cpu(sb, ucg->cg_niblk); + ucpi->c_ndblk = fs32_to_cpu(sb, ucg->cg_ndblk); + ucpi->c_rotor = fs32_to_cpu(sb, ucg->cg_rotor); + ucpi->c_frotor = fs32_to_cpu(sb, ucg->cg_frotor); + ucpi->c_irotor = fs32_to_cpu(sb, ucg->cg_irotor); + ucpi->c_btotoff = fs32_to_cpu(sb, ucg->cg_btotoff); + ucpi->c_boff = fs32_to_cpu(sb, ucg->cg_boff); + ucpi->c_iusedoff = fs32_to_cpu(sb, ucg->cg_iusedoff); + ucpi->c_freeoff = fs32_to_cpu(sb, ucg->cg_freeoff); + ucpi->c_nextfreeoff = fs32_to_cpu(sb, ucg->cg_nextfreeoff); + ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff); + ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff); + ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks); + UFSD("EXIT\n"); + return; + +failed: + for (j = 1; j < i; j++) + brelse (sbi->s_ucg[j]); + sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; + ufs_error (sb, "ufs_read_cylinder", "can't read cylinder group block %u", cgno); +} + +/* + * Remove cylinder group from cache, doesn't release memory + * allocated for cylinder group (this is done at ufs_put_super only). 
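Illustrative aside, not part of the patch: as far as I can tell, ufs_fragtable_8fpb[] maps a free-fragment byte to a bitmask of the run lengths it contains (bit k-1 set when a maximal free run of exactly k fragments exists), which is what lets ubh_scanc() test table[byte] & (1 << (count - 1)) while scanning. The entries can be regenerated like this:

#include <stdio.h>

/* Rebuild one ufs_fragtable_8fpb entry from first principles: walk the byte,
 * and for every maximal run of set bits record its length as a bit. */
static unsigned char demo_fragtable_entry(unsigned char freemap)
{
	unsigned char entry = 0;
	int run = 0;
	int bit;

	for (bit = 0; bit < 8; bit++) {
		if (freemap & (1 << bit)) {
			run++;
		} else if (run) {
			entry |= 1 << (run - 1);
			run = 0;
		}
	}
	if (run)
		entry |= 1 << (run - 1);
	return entry;
}

int main(void)
{
	/* Matches the table in the patch: 0x0B -> 0x03, 0x0F -> 0x08, 0xFF -> 0x80. */
	printf("0x0B -> 0x%02X\n", demo_fragtable_entry(0x0B));
	printf("0x0F -> 0x%02X\n", demo_fragtable_entry(0x0F));
	printf("0xFF -> 0x%02X\n", demo_fragtable_entry(0xFF));
	return 0;
}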
+ */ +void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + unsigned i; + + UFSD("ENTER, bitmap_nr %u\n", bitmap_nr); + + uspi = sbi->s_uspi; + if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) { + UFSD("EXIT\n"); + return; + } + ucpi = sbi->s_ucpi[bitmap_nr]; + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + + if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) { + ufs_panic (sb, "ufs_put_cylinder", "internal error"); + return; + } + /* + * rotor is not so important data, so we put it to disk + * at the end of working with cylinder + */ + ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor); + ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor); + ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) { + brelse (UCPI_UBH(ucpi)->bh[i]); + } + + sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; + UFSD("EXIT\n"); +} + +/* + * Find cylinder group in cache and return it as pointer. + * If cylinder group is not in cache, we will load it from disk. + * + * The cache is managed by LRU algorithm. + */ +struct ufs_cg_private_info * ufs_load_cylinder ( + struct super_block * sb, unsigned cgno) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi; + struct ufs_cg_private_info * ucpi; + unsigned cg, i, j; + + UFSD("ENTER, cgno %u\n", cgno); + + uspi = sbi->s_uspi; + if (cgno >= uspi->s_ncg) { + ufs_panic (sb, "ufs_load_cylinder", "internal error, high number of cg"); + return NULL; + } + /* + * Cylinder group number cg it in cache and it was last used + */ + if (sbi->s_cgno[0] == cgno) { + UFSD("EXIT\n"); + return sbi->s_ucpi[0]; + } + /* + * Number of cylinder groups is not higher than UFS_MAX_GROUP_LOADED + */ + if (uspi->s_ncg <= UFS_MAX_GROUP_LOADED) { + if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) { + if (sbi->s_cgno[cgno] != cgno) { + ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache"); + UFSD("EXIT (FAILED)\n"); + return NULL; + } + else { + UFSD("EXIT\n"); + return sbi->s_ucpi[cgno]; + } + } else { + ufs_read_cylinder (sb, cgno, cgno); + UFSD("EXIT\n"); + return sbi->s_ucpi[cgno]; + } + } + /* + * Cylinder group number cg is in cache but it was not last used, + * we will move to the first position + */ + for (i = 0; i < sbi->s_cg_loaded && sbi->s_cgno[i] != cgno; i++); + if (i < sbi->s_cg_loaded && sbi->s_cgno[i] == cgno) { + cg = sbi->s_cgno[i]; + ucpi = sbi->s_ucpi[i]; + for (j = i; j > 0; j--) { + sbi->s_cgno[j] = sbi->s_cgno[j-1]; + sbi->s_ucpi[j] = sbi->s_ucpi[j-1]; + } + sbi->s_cgno[0] = cg; + sbi->s_ucpi[0] = ucpi; + /* + * Cylinder group number cg is not in cache, we will read it from disk + * and put it to the first position + */ + } else { + if (sbi->s_cg_loaded < UFS_MAX_GROUP_LOADED) + sbi->s_cg_loaded++; + else + ufs_put_cylinder (sb, UFS_MAX_GROUP_LOADED-1); + ucpi = sbi->s_ucpi[sbi->s_cg_loaded - 1]; + for (j = sbi->s_cg_loaded - 1; j > 0; j--) { + sbi->s_cgno[j] = sbi->s_cgno[j-1]; + sbi->s_ucpi[j] = sbi->s_ucpi[j-1]; + } + sbi->s_ucpi[0] = ucpi; + ufs_read_cylinder (sb, cgno, 0); + } + UFSD("EXIT\n"); + return sbi->s_ucpi[0]; +} diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c new file mode 100644 index 00000000..dbc90994 --- /dev/null +++ b/fs/ufs/dir.c @@ -0,0 +1,666 @@ +/* + * linux/fs/ufs/ufs_dir.c + * + * Copyright (C) 1996 + * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) + * 
Laboratory for Computer Science Research Computing Facility + * Rutgers, The State University of New Jersey + * + * swab support by Francois-Rene Rideau 19970406 + * + * 4.4BSD (FreeBSD) support added on February 1st 1998 by + * Niels Kristian Bech Jensen partially based + * on code by Martin von Loewis . + * + * Migration to usage of "page cache" on May 2006 by + * Evgeniy Dushistov based on ext2 code base. + */ + +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure. + * + * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller. + */ +static inline int ufs_match(struct super_block *sb, int len, + const unsigned char *name, struct ufs_dir_entry *de) +{ + if (len != ufs_get_de_namlen(sb, de)) + return 0; + if (!de->d_ino) + return 0; + return !memcmp(name, de->d_name, len); +} + +static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) +{ + struct address_space *mapping = page->mapping; + struct inode *dir = mapping->host; + int err = 0; + + dir->i_version++; + block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos+len > dir->i_size) { + i_size_write(dir, pos+len); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} + +static inline void ufs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long ufs_dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) +{ + ino_t res = 0; + struct ufs_dir_entry *de; + struct page *page; + + de = ufs_find_entry(dir, qstr, &page); + if (de) { + res = fs32_to_cpu(dir->i_sb, de->d_ino); + ufs_put_page(page); + } + return res; +} + + +/* Releases the page */ +void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct page *page, struct inode *inode) +{ + loff_t pos = page_offset(page) + + (char *) de - (char *) page_address(page); + unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen); + int err; + + lock_page(page); + err = ufs_prepare_chunk(page, pos, len); + BUG_ON(err); + + de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); + ufs_set_de_type(dir->i_sb, de, inode->i_mode); + + err = ufs_commit_chunk(page, pos, len); + ufs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} + + +static void ufs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1; + struct ufs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & chunk_mask) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct ufs_dir_entry *)(kaddr + offs); + rec_len = fs16_to_cpu(sb, p->d_reclen); + + if (rec_len < UFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p))) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~chunk_mask) + goto Espan; + if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg * + UFS_SB(sb)->s_uspi->s_ncg)) + goto Einumber; + } + if (offs != limit) + 
goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + ufs_error(sb, "ufs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "inode out of bounds"; +bad_entry: + ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + rec_len, ufs_get_de_namlen(sb, p)); + goto fail; +Eend: + p = (struct ufs_dir_entry *)(kaddr + offs); + ufs_error(sb, __func__, + "entry in directory #%lu spans the page boundary" + "offset=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *ufs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (!PageChecked(page)) + ufs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + ufs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned +ufs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static inline struct ufs_dir_entry * +ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p) +{ + return (struct ufs_dir_entry *)((char *)p + + fs16_to_cpu(sb, p->d_reclen)); +} + +struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = ufs_get_page(dir, 0); + struct ufs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = ufs_next_entry(dir->i_sb, + (struct ufs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +/* + * ufs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, + struct page **res_page) +{ + struct super_block *sb = dir->i_sb; + const unsigned char *name = qstr->name; + int namelen = qstr->len; + unsigned reclen = UFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = ufs_dir_pages(dir); + struct page *page = NULL; + struct ufs_inode_info *ui = UFS_I(dir); + struct ufs_dir_entry *de; + + UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen); + + if (npages == 0 || namelen > UFS_MAXNAMLEN) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ui->i_dir_start_lookup; + + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = ufs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct ufs_dir_entry *) kaddr; + kaddr += ufs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->d_reclen == 0) { + ufs_error(dir->i_sb, __func__, + "zero-length directory entry"); + ufs_put_page(page); + goto out; + } + if (ufs_match(sb, namelen, name, de)) + goto found; + de = ufs_next_entry(sb, de); + } + ufs_put_page(page); + } + if (++n >= npages) + n = 0; + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ui->i_dir_start_lookup = n; + return de; +} + +/* + * Parent is locked.
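+ * + * Editor's note (not part of the original source): ufs_add_link() below scans the directory one page at a time with the page locked, reusing either a free slot (d_ino == 0) or the slack space at the end of an existing entry, and appends a fresh chunk past i_size when it runs off the current end of the directory.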
+ */ +int ufs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const unsigned char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct super_block *sb = dir->i_sb; + unsigned reclen = UFS_DIR_REC_LEN(namelen); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; + unsigned short rec_len, name_len; + struct page *page = NULL; + struct ufs_dir_entry *de; + unsigned long npages = ufs_dir_pages(dir); + unsigned long n; + char *kaddr; + loff_t pos; + int err; + + UFSD("ENTER, name %s, namelen %u\n", name, namelen); + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = ufs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + ufs_last_byte(dir, n); + de = (struct ufs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->d_reclen = cpu_to_fs16(sb, chunk_size); + de->d_ino = 0; + goto got_it; + } + if (de->d_reclen == 0) { + ufs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (ufs_match(sb, namelen, name, de)) + goto out_unlock; + name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)); + rec_len = fs16_to_cpu(sb, de->d_reclen); + if (!de->d_ino && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct ufs_dir_entry *) ((char *) de + rec_len); + } + unlock_page(page); + ufs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + pos = page_offset(page) + + (char*)de - (char*)page_address(page); + err = ufs_prepare_chunk(page, pos, rec_len); + if (err) + goto out_unlock; + if (de->d_ino) { + struct ufs_dir_entry *de1 = + (struct ufs_dir_entry *) ((char *) de + name_len); + de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len); + de->d_reclen = cpu_to_fs16(sb, name_len); + + de = de1; + } + + ufs_set_de_namlen(sb, de, namelen); + memcpy(de->d_name, name, namelen + 1); + de->d_ino = cpu_to_fs32(sb, inode->i_ino); + ufs_set_de_type(sb, de, inode->i_mode); + + err = ufs_commit_chunk(page, pos, rec_len); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + ufs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +static inline unsigned +ufs_validate_entry(struct super_block *sb, char *base, + unsigned offset, unsigned mask) +{ + struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset); + struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask)); + while ((char*)p < (char*)de) { + if (p->d_reclen == 0) + break; + p = ufs_next_entry(sb, p); + } + return (char *)p - base; +} + + +/* + * This is blatantly stolen from ext2fs + */ +static int +ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = ufs_dir_pages(inode); + unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); + int need_revalidate = filp->f_version != inode->i_version; + unsigned flags = 
UFS_SB(sb)->s_flags; + + UFSD("BEGIN\n"); + + if (pos > inode->i_size - UFS_DIR_REC_LEN(1)) + return 0; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct ufs_dir_entry *de; + + struct page *page = ufs_get_page(inode, n); + + if (IS_ERR(page)) { + ufs_error(sb, __func__, + "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + return -EIO; + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); + filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + } + filp->f_version = inode->i_version; + need_revalidate = 0; + } + de = (struct ufs_dir_entry *)(kaddr+offset); + limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1); + for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) { + if (de->d_reclen == 0) { + ufs_error(sb, __func__, + "zero-length directory entry"); + ufs_put_page(page); + return -EIO; + } + if (de->d_ino) { + int over; + unsigned char d_type = DT_UNKNOWN; + + offset = (char *)de - kaddr; + + UFSD("filldir(%s,%u)\n", de->d_name, + fs32_to_cpu(sb, de->d_ino)); + UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); + + if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) + d_type = de->d_u.d_44.d_type; + + over = filldir(dirent, de->d_name, + ufs_get_de_namlen(sb, de), + (n<<PAGE_CACHE_SHIFT) | offset, + fs32_to_cpu(sb, de->d_ino), d_type); + if (over) { + ufs_put_page(page); + return 0; + } + } + filp->f_pos += fs16_to_cpu(sb, de->d_reclen); + } + ufs_put_page(page); + } + return 0; +} + + +/* + * ufs_delete_entry deletes a directory entry by merging it with the + * previous entry. + */ +int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, + struct page * page) +{ + struct super_block *sb = inode->i_sb; + char *kaddr = page_address(page); + unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); + unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); + loff_t pos; + struct ufs_dir_entry *pde = NULL; + struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from); + int err; + + UFSD("ENTER\n"); + + UFSD("ino %u, reclen %u, namlen %u, name %s\n", + fs32_to_cpu(sb, de->d_ino), + fs16_to_cpu(sb, de->d_reclen), + ufs_get_de_namlen(sb, de), de->d_name); + + while ((char*)de < (char*)dir) { + if (de->d_reclen == 0) { + ufs_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = ufs_next_entry(sb, de); + } + if (pde) + from = (char*)pde - (char*)page_address(page); + + pos = page_offset(page) + from; + lock_page(page); + err = ufs_prepare_chunk(page, pos, to - from); + BUG_ON(err); + if (pde) + pde->d_reclen = cpu_to_fs16(sb, to - from); + dir->d_ino = 0; + err = ufs_commit_chunk(page, pos, to - from); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + ufs_put_page(page); + UFSD("EXIT\n"); + return err; +} + +int ufs_make_empty(struct inode * inode, struct inode *dir) +{ + struct super_block * sb = dir->i_sb; + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; + struct ufs_dir_entry * de; + char *base; + int err; + + if (!page) + return -ENOMEM; + + err = ufs_prepare_chunk(page, 0, chunk_size); + if (err) { + unlock_page(page); + goto fail; + } + + kmap(page); + base = (char*)page_address(page); + memset(base, 0, PAGE_CACHE_SIZE); + + de = (struct ufs_dir_entry *) base; + + de->d_ino = cpu_to_fs32(sb, inode->i_ino); + ufs_set_de_type(sb, de, inode->i_mode); + ufs_set_de_namlen(sb,
de, 1); + de->d_reclen = cpu_to_fs16(sb, UFS_DIR_REC_LEN(1)); + strcpy (de->d_name, "."); + de = (struct ufs_dir_entry *) + ((char *)de + fs16_to_cpu(sb, de->d_reclen)); + de->d_ino = cpu_to_fs32(sb, dir->i_ino); + ufs_set_de_type(sb, de, dir->i_mode); + de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1)); + ufs_set_de_namlen(sb, de, 2); + strcpy (de->d_name, ".."); + kunmap(page); + + err = ufs_commit_chunk(page, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ufs_empty_dir(struct inode * inode) +{ + struct super_block *sb = inode->i_sb; + struct page *page = NULL; + unsigned long i, npages = ufs_dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct ufs_dir_entry *de; + page = ufs_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct ufs_dir_entry *)kaddr; + kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->d_reclen == 0) { + ufs_error(inode->i_sb, __func__, + "zero-length directory entry: " + "kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; + } + if (de->d_ino) { + u16 namelen=ufs_get_de_namlen(sb, de); + /* check for . and .. */ + if (de->d_name[0] != '.') + goto not_empty; + if (namelen > 2) + goto not_empty; + if (namelen < 2) { + if (inode->i_ino != + fs32_to_cpu(sb, de->d_ino)) + goto not_empty; + } else if (de->d_name[1] != '.') + goto not_empty; + } + de = ufs_next_entry(sb, de); + } + ufs_put_page(page); + } + return 1; + +not_empty: + ufs_put_page(page); + return 0; +} + +const struct file_operations ufs_dir_operations = { + .read = generic_read_dir, + .readdir = ufs_readdir, + .fsync = generic_file_fsync, + .llseek = generic_file_llseek, +}; diff --git a/fs/ufs/file.c b/fs/ufs/file.c new file mode 100644 index 00000000..33afa20d --- /dev/null +++ b/fs/ufs/file.c @@ -0,0 +1,46 @@ +/* + * linux/fs/ufs/file.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 fs regular file handling primitives + */ + +#include + +#include "ufs_fs.h" +#include "ufs.h" + +/* + * We have mostly NULL's here: the current defaults are ok for + * the ufs filesystem. 
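+ * + * Editor's note (not part of the original source): every entry in the table below delegates to a generic VFS/page-cache helper; the filesystem-specific work happens through ufs_aops and ufs_getfrag_block() in inode.c.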
+ */ + +const struct file_operations ufs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .open = generic_file_open, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c new file mode 100644 index 00000000..2eabf04a --- /dev/null +++ b/fs/ufs/ialloc.c @@ -0,0 +1,354 @@ +/* + * linux/fs/ufs/ialloc.c + * + * Copyright (c) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * UFS2 write support added by + * Evgeniy Dushistov , 2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. 
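+ * + * Editor's note (not part of the original source): accordingly, ufs_free_inode() below only clears the inode's bit in the cylinder-group bitmap and updates the free-inode (and, for directories, directory) counters; the in-core inode has already been detached and evicted by the VFS.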
+ */ +void ufs_free_inode (struct inode * inode) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + int is_directory; + unsigned ino, cg, bit; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + ino = inode->i_ino; + + lock_super (sb); + + if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { + ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); + unlock_super (sb); + return; + } + + cg = ufs_inotocg (ino); + bit = ufs_inotocgoff (ino); + ucpi = ufs_load_cylinder (sb, cg); + if (!ucpi) { + unlock_super (sb); + return; + } + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) + ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number"); + + ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + + is_directory = S_ISDIR(inode->i_mode); + + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) + ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino); + else { + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit); + if (ino < ucpi->c_irotor) + ucpi->c_irotor = ino; + fs32_add(sb, &ucg->cg_cs.cs_nifree, 1); + uspi->cs_total.cs_nifree++; + fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1); + + if (is_directory) { + fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1); + uspi->cs_total.cs_ndir--; + fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1); + } + } + + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + + sb->s_dirt = 1; + unlock_super (sb); + UFSD("EXIT\n"); +} + +/* + * Nullify new chunk of inodes, + * BSD people also set ui_gen field of inode + * during nullification, but we not care about + * that because of linux ufs do not support NFS + */ +static void ufs2_init_inodes_chunk(struct super_block *sb, + struct ufs_cg_private_info *ucpi, + struct ufs_cylinder_group *ucg) +{ + struct buffer_head *bh; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + sector_t beg = uspi->s_sbbase + + ufs_inotofsba(ucpi->c_cgx * uspi->s_ipg + + fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk)); + sector_t end = beg + uspi->s_fpb; + + UFSD("ENTER cgno %d\n", ucpi->c_cgx); + + for (; beg < end; ++beg) { + bh = sb_getblk(sb, beg); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bh); + brelse(bh); + } + + fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); + ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + + UFSD("EXIT\n"); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. 
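+ * + * Editor's note (not part of the original source): concretely, the search below first tries the parent's own cylinder group, then probes groups parent+1, parent+3, parent+7, ... (the step doubles on each pass, wrapping modulo the number of groups), and finally falls back to a linear scan before giving up with -ENOSPC.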
+ */ +struct inode * ufs_new_inode(struct inode * dir, int mode) +{ + struct super_block * sb; + struct ufs_sb_info * sbi; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_cg_private_info * ucpi; + struct ufs_cylinder_group * ucg; + struct inode * inode; + unsigned cg, bit, i, j, start; + struct ufs_inode_info *ufsi; + int err = -ENOSPC; + + UFSD("ENTER\n"); + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ufsi = UFS_I(inode); + sbi = UFS_SB(sb); + uspi = sbi->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + lock_super (sb); + + /* + * Try to place the inode in its parent directory + */ + i = ufs_inotocg(dir->i_ino); + if (sbi->fs_cs(i).cs_nifree) { + cg = i; + goto cg_found; + } + + /* + * Use a quadratic hash to find a group with a free inode + */ + for ( j = 1; j < uspi->s_ncg; j <<= 1 ) { + i += j; + if (i >= uspi->s_ncg) + i -= uspi->s_ncg; + if (sbi->fs_cs(i).cs_nifree) { + cg = i; + goto cg_found; + } + } + + /* + * That failed: try linear search for a free inode + */ + i = ufs_inotocg(dir->i_ino) + 1; + for (j = 2; j < uspi->s_ncg; j++) { + i++; + if (i >= uspi->s_ncg) + i = 0; + if (sbi->fs_cs(i).cs_nifree) { + cg = i; + goto cg_found; + } + } + + goto failed; + +cg_found: + ucpi = ufs_load_cylinder (sb, cg); + if (!ucpi) { + err = -EIO; + goto failed; + } + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); + if (!ufs_cg_chkmagic(sb, ucg)) + ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number"); + + start = ucpi->c_irotor; + bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start); + if (!(bit < uspi->s_ipg)) { + bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start); + if (!(bit < start)) { + ufs_error (sb, "ufs_new_inode", + "cylinder group %u corrupted - error in inode bitmap\n", cg); + err = -EIO; + goto failed; + } + } + UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg); + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit); + else { + ufs_panic (sb, "ufs_new_inode", "internal error"); + err = -EIO; + goto failed; + } + + if (uspi->fs_magic == UFS2_MAGIC) { + u32 initediblk = fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk); + + if (bit + uspi->s_inopb > initediblk && + initediblk < fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_niblk)) + ufs2_init_inodes_chunk(sb, ucpi, ucg); + } + + fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1); + uspi->cs_total.cs_nifree--; + fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1); + + if (S_ISDIR(mode)) { + fs32_add(sb, &ucg->cg_cs.cs_ndir, 1); + uspi->cs_total.cs_ndir++; + fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1); + } + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); + sb->s_dirt = 1; + + inode->i_ino = cg * uspi->s_ipg + bit; + inode_init_owner(inode, dir, mode); + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + ufsi->i_flags = UFS_I(dir)->i_flags; + ufsi->i_lastfrag = 0; + ufsi->i_shadow = 0; + ufsi->i_osync = 0; + ufsi->i_oeftflag = 0; + ufsi->i_dir_start_lookup = 0; + memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1)); + insert_inode_hash(inode); + mark_inode_dirty(inode); + + if (uspi->fs_magic == UFS2_MAGIC) { + struct buffer_head *bh; + struct ufs2_inode *ufs2_inode; + + /* + * setup birth date, we do it 
here because of there is no sense + * to hold it in struct ufs_inode_info, and lose 64 bit + */ + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", + "unable to read inode %lu\n", + inode->i_ino); + err = -EIO; + goto fail_remove_inode; + } + lock_buffer(bh); + ufs2_inode = (struct ufs2_inode *)bh->b_data; + ufs2_inode += ufs_inotofsbo(inode->i_ino); + ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec); + ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bh); + brelse(bh); + } + + unlock_super (sb); + + UFSD("allocating inode %lu\n", inode->i_ino); + UFSD("EXIT\n"); + return inode; + +fail_remove_inode: + unlock_super(sb); + inode->i_nlink = 0; + iput(inode); + UFSD("EXIT (FAILED): err %d\n", err); + return ERR_PTR(err); +failed: + unlock_super (sb); + make_bad_inode(inode); + iput (inode); + UFSD("EXIT (FAILED): err %d\n", err); + return ERR_PTR(err); +} diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c new file mode 100644 index 00000000..b4d791a8 --- /dev/null +++ b/fs/ufs/inode.c @@ -0,0 +1,906 @@ +/* + * linux/fs/ufs/inode.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); + +static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) +{ + struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; + int ptrs = uspi->s_apb; + int ptrs_bits = uspi->s_apbshift; + const long direct_blocks = UFS_NDADDR, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + + + UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks); + if (i_block < direct_blocks) { + offsets[n++] = i_block; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = UFS_IND_BLOCK; + offsets[n++] = i_block; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = UFS_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = UFS_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + } else { + ufs_warning(inode->i_sb, "ufs_block_to_path", "block > big"); + } + return n; +} + +/* + * Returns the location of the fragment from + * the beginning of the filesystem. 
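+ * + * Editor's note (not part of the original source): ufs_block_to_path() above splits the block number into at most four offsets (direct slot, then single-, double- and triple-indirect indices); ufs_frag_map() below reads one indirect block per remaining level and finally adds the fragment's offset within the block (frag & s_fpbmask).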
+ */ + +static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; + int shift = uspi->s_apbshift-uspi->s_fpbshift; + sector_t offsets[4], *p; + int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); + u64 ret = 0L; + __fs32 block; + __fs64 u2_block = 0L; + unsigned flags = UFS_SB(sb)->s_flags; + u64 temp = 0L; + + UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); + UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n", + uspi->s_fpbshift, uspi->s_apbmask, + (unsigned long long)mask); + + if (depth == 0) + return 0; + + p = offsets; + + if (needs_lock) + lock_ufs(sb); + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) + goto ufs2; + + block = ufsi->i_u1.i_data[*p++]; + if (!block) + goto out; + while (--depth) { + struct buffer_head *bh; + sector_t n = *p++; + + bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift)); + if (!bh) + goto out; + block = ((__fs32 *) bh->b_data)[n & mask]; + brelse (bh); + if (!block) + goto out; + } + ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask)); + goto out; +ufs2: + u2_block = ufsi->i_u1.u2_i_data[*p++]; + if (!u2_block) + goto out; + + + while (--depth) { + struct buffer_head *bh; + sector_t n = *p++; + + + temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block); + bh = sb_bread(sb, temp +(u64) (n>>shift)); + if (!bh) + goto out; + u2_block = ((__fs64 *)bh->b_data)[n & mask]; + brelse(bh); + if (!u2_block) + goto out; + } + temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block); + ret = temp + (u64) (frag & uspi->s_fpbmask); + +out: + if (needs_lock) + unlock_ufs(sb); + return ret; +} + +/** + * ufs_inode_getfrag() - allocate new fragment(s) + * @inode - pointer to inode + * @fragment - number of `fragment' which hold pointer + * to new allocated fragment(s) + * @new_fragment - number of new allocated fragment(s) + * @required - how many fragment(s) we require + * @err - we set it if something wrong + * @phys - pointer to where we save physical number of new allocated fragments, + * NULL if we allocate not data(indirect blocks for example). 
+ * @new - we set it if we allocate new block + * @locked_page - for ufs_new_fragments() + */ +static struct buffer_head * +ufs_inode_getfrag(struct inode *inode, u64 fragment, + sector_t new_fragment, unsigned int required, int *err, + long *phys, int *new, struct page *locked_page) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * result; + unsigned blockoff, lastblockoff; + u64 tmp, goal, lastfrag, block, lastblock; + void *p, *p2; + + UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, " + "metadata %d\n", inode->i_ino, (unsigned long long)fragment, + (unsigned long long)new_fragment, required, !phys); + + /* TODO : to be done for write support + if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) + goto ufs2; + */ + + block = ufs_fragstoblks (fragment); + blockoff = ufs_fragnum (fragment); + p = ufs_get_direct_data_ptr(uspi, ufsi, block); + + goal = 0; + +repeat: + tmp = ufs_data_ptr_to_cpu(sb, p); + + lastfrag = ufsi->i_lastfrag; + if (tmp && fragment < lastfrag) { + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + if (tmp == ufs_data_ptr_to_cpu(sb, p)) { + UFSD("EXIT, result %llu\n", + (unsigned long long)tmp + blockoff); + return result; + } + brelse (result); + goto repeat; + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + return NULL; + } + } + + lastblock = ufs_fragstoblks (lastfrag); + lastblockoff = ufs_fragnum (lastfrag); + /* + * We will extend file into new block beyond last allocated block + */ + if (lastblock < block) { + /* + * We must reallocate last allocated block + */ + if (lastblockoff) { + p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock); + tmp = ufs_new_fragments(inode, p2, lastfrag, + ufs_data_ptr_to_cpu(sb, p2), + uspi->s_fpb - lastblockoff, + err, locked_page); + if (!tmp) { + if (lastfrag != ufsi->i_lastfrag) + goto repeat; + else + return NULL; + } + lastfrag = ufsi->i_lastfrag; + + } + tmp = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, + lastblock)); + if (tmp) + goal = tmp + uspi->s_fpb; + tmp = ufs_new_fragments (inode, p, fragment - blockoff, + goal, required + blockoff, + err, + phys != NULL ? locked_page : NULL); + } else if (lastblock == block) { + /* + * We will extend last allocated block + */ + tmp = ufs_new_fragments(inode, p, fragment - + (blockoff - lastblockoff), + ufs_data_ptr_to_cpu(sb, p), + required + (blockoff - lastblockoff), + err, phys != NULL ? locked_page : NULL); + } else /* (lastblock > block) */ { + /* + * We will allocate new block before last allocated block + */ + if (block) { + tmp = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); + if (tmp) + goal = tmp + uspi->s_fpb; + } + tmp = ufs_new_fragments(inode, p, fragment - blockoff, + goal, uspi->s_fpb, err, + phys != NULL ? locked_page : NULL); + } + if (!tmp) { + if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) || + (blockoff && lastfrag != ufsi->i_lastfrag)) + goto repeat; + *err = -ENOSPC; + return NULL; + } + + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + result = NULL; + *err = 0; + *new = 1; + } + + inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) + ufs_sync_inode (inode); + mark_inode_dirty(inode); + UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff); + return result; + + /* This part : To be implemented .... + Required only for writing, not required for READ-ONLY. 
+ufs2: + + u2_block = ufs_fragstoblks(fragment); + u2_blockoff = ufs_fragnum(fragment); + p = ufsi->i_u1.u2_i_data + block; + goal = 0; + +repeat2: + tmp = fs32_to_cpu(sb, *p); + lastfrag = ufsi->i_lastfrag; + + */ +} + +/** + * ufs_inode_getblock() - allocate new block + * @inode - pointer to inode + * @bh - pointer to block which hold "pointer" to new allocated block + * @fragment - number of `fragment' which hold pointer + * to new allocated block + * @new_fragment - number of new allocated fragment + * (block will hold this fragment and also uspi->s_fpb-1) + * @err - see ufs_inode_getfrag() + * @phys - see ufs_inode_getfrag() + * @new - see ufs_inode_getfrag() + * @locked_page - see ufs_inode_getfrag() + */ +static struct buffer_head * +ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, + u64 fragment, sector_t new_fragment, int *err, + long *phys, int *new, struct page *locked_page) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * result; + unsigned blockoff; + u64 tmp, goal, block; + void *p; + + block = ufs_fragstoblks (fragment); + blockoff = ufs_fragnum (fragment); + + UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n", + inode->i_ino, (unsigned long long)fragment, + (unsigned long long)new_fragment, !phys); + + result = NULL; + if (!bh) + goto out; + if (!buffer_uptodate(bh)) { + ll_rw_block (READ, 1, &bh); + wait_on_buffer (bh); + if (!buffer_uptodate(bh)) + goto out; + } + if (uspi->fs_magic == UFS2_MAGIC) + p = (__fs64 *)bh->b_data + block; + else + p = (__fs32 *)bh->b_data + block; +repeat: + tmp = ufs_data_ptr_to_cpu(sb, p); + if (tmp) { + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + if (tmp == ufs_data_ptr_to_cpu(sb, p)) + goto out; + brelse (result); + goto repeat; + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + goto out; + } + } + + if (block && (uspi->fs_magic == UFS2_MAGIC ? + (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) : + (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1])))) + goal = tmp + uspi->s_fpb; + else + goal = bh->b_blocknr + uspi->s_fpb; + tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, + uspi->s_fpb, err, locked_page); + if (!tmp) { + if (ufs_data_ptr_to_cpu(sb, p)) + goto repeat; + goto out; + } + + + if (!phys) { + result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); + } else { + *phys = uspi->s_sbbase + tmp + blockoff; + *new = 1; + } + + mark_buffer_dirty(bh); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + UFSD("result %llu\n", (unsigned long long)tmp + blockoff); +out: + brelse (bh); + UFSD("EXIT\n"); + return result; +} + +/** + * ufs_getfrag_block() - `get_block_t' function, interface between UFS and + * readpage, writepage and so on + */ + +int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +{ + struct super_block * sb = inode->i_sb; + struct ufs_sb_info * sbi = UFS_SB(sb); + struct ufs_sb_private_info * uspi = sbi->s_uspi; + struct buffer_head * bh; + int ret, err, new; + unsigned long ptr,phys; + u64 phys64 = 0; + bool needs_lock = (sbi->mutex_owner != current); + + if (!create) { + phys64 = ufs_frag_map(inode, fragment, needs_lock); + UFSD("phys64 = %llu\n", (unsigned long long)phys64); + if (phys64) + map_bh(bh_result, sb, phys64); + return 0; + } + + /* This code entered only while writing ....? 
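+ Editor's note (not part of the original source): only the create != 0 path of ufs_getfrag_block() reaches this point; read-only lookups have already returned above via ufs_frag_map().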
*/ + + err = -EIO; + new = 0; + ret = 0; + bh = NULL; + + if (needs_lock) + lock_ufs(sb); + + UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); + if (fragment > + ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) + << uspi->s_fpbshift)) + goto abort_too_big; + + err = 0; + ptr = fragment; + + /* + * ok, these macros clean the logic up a bit and make + * it much more readable: + */ +#define GET_INODE_DATABLOCK(x) \ + ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\ + bh_result->b_page) +#define GET_INODE_PTR(x) \ + ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\ + bh_result->b_page) +#define GET_INDIRECT_DATABLOCK(x) \ + ufs_inode_getblock(inode, bh, x, fragment, \ + &err, &phys, &new, bh_result->b_page) +#define GET_INDIRECT_PTR(x) \ + ufs_inode_getblock(inode, bh, x, fragment, \ + &err, NULL, NULL, NULL) + + if (ptr < UFS_NDIR_FRAGMENT) { + bh = GET_INODE_DATABLOCK(ptr); + goto out; + } + ptr -= UFS_NDIR_FRAGMENT; + if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) { + bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift)); + goto get_indirect; + } + ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); + if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) { + bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift)); + goto get_double; + } + ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); + bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift)); + bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask); +get_double: + bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask); +get_indirect: + bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask); + +#undef GET_INODE_DATABLOCK +#undef GET_INODE_PTR +#undef GET_INDIRECT_DATABLOCK +#undef GET_INDIRECT_PTR + +out: + if (err) + goto abort; + if (new) + set_buffer_new(bh_result); + map_bh(bh_result, sb, phys); +abort: + if (needs_lock) + unlock_ufs(sb); + + return err; + +abort_too_big: + ufs_warning(sb, "ufs_get_block", "block > big"); + goto abort; +} + +static int ufs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page,ufs_getfrag_block,wbc); +} + +static int ufs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,ufs_getfrag_block); +} + +int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) +{ + return __block_write_begin(page, pos, len, ufs_getfrag_block); +} + +static int ufs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags, pagep, + ufs_getfrag_block); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} + +static sector_t ufs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping,block,ufs_getfrag_block); +} + +const struct address_space_operations ufs_aops = { + .readpage = ufs_readpage, + .writepage = ufs_writepage, + .write_begin = ufs_write_begin, + .write_end = generic_write_end, + .bmap = ufs_bmap +}; + +static void ufs_set_inode_ops(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ufs_dir_inode_operations; + inode->i_fop = 
&ufs_dir_operations; + inode->i_mapping->a_ops = &ufs_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (!inode->i_blocks) + inode->i_op = &ufs_fast_symlink_inode_operations; + else { + inode->i_op = &ufs_symlink_inode_operations; + inode->i_mapping->a_ops = &ufs_aops; + } + } else + init_special_inode(inode, inode->i_mode, + ufs_get_inode_dev(inode->i_sb, UFS_I(inode))); +} + +static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + mode_t mode; + + /* + * Copy data to the in-core inode. + */ + inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); + inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); + if (inode->i_nlink == 0) { + ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); + return -1; + } + + /* + * Linux now has 32-bit uid and gid, so we can support EFT. + */ + inode->i_uid = ufs_get_inode_uid(sb, ufs_inode); + inode->i_gid = ufs_get_inode_gid(sb, ufs_inode); + + inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); + inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); + inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); + inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); + inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen); + ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); + ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); + ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); + + + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { + memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr, + sizeof(ufs_inode->ui_u2.ui_addr)); + } else { + memcpy(ufsi->i_u1.i_symlink, ufs_inode->ui_u2.ui_symlink, + sizeof(ufs_inode->ui_u2.ui_symlink) - 1); + ufsi->i_u1.i_symlink[sizeof(ufs_inode->ui_u2.ui_symlink) - 1] = 0; + } + return 0; +} + +static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + mode_t mode; + + UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); + /* + * Copy data to the in-core inode. + */ + inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); + inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); + if (inode->i_nlink == 0) { + ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); + return -1; + } + + /* + * Linux now has 32-bit uid and gid, so we can support EFT. 
+ */ + inode->i_uid = fs32_to_cpu(sb, ufs2_inode->ui_uid); + inode->i_gid = fs32_to_cpu(sb, ufs2_inode->ui_gid); + + inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size); + inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime); + inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime); + inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime); + inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec); + inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec); + inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec); + inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); + inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen); + ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); + /* + ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); + ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); + */ + + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { + memcpy(ufsi->i_u1.u2_i_data, &ufs2_inode->ui_u2.ui_addr, + sizeof(ufs2_inode->ui_u2.ui_addr)); + } else { + memcpy(ufsi->i_u1.i_symlink, ufs2_inode->ui_u2.ui_symlink, + sizeof(ufs2_inode->ui_u2.ui_symlink) - 1); + ufsi->i_u1.i_symlink[sizeof(ufs2_inode->ui_u2.ui_symlink) - 1] = 0; + } + return 0; +} + +struct inode *ufs_iget(struct super_block *sb, unsigned long ino) +{ + struct ufs_inode_info *ufsi; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * bh; + struct inode *inode; + int err; + + UFSD("ENTER, ino %lu\n", ino); + + if (ino < UFS_ROOTINO || ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n", + ino); + return ERR_PTR(-EIO); + } + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ufsi = UFS_I(inode); + + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n", + inode->i_ino); + goto bad_inode; + } + if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + err = ufs2_read_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data; + + err = ufs1_read_inode(inode, + ufs_inode + ufs_inotofsbo(inode->i_ino)); + } + + if (err) + goto bad_inode; + inode->i_version++; + ufsi->i_lastfrag = + (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; + ufsi->i_dir_start_lookup = 0; + ufsi->i_osync = 0; + + ufs_set_inode_ops(inode); + + brelse(bh); + + UFSD("EXIT\n"); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_inode_info *ufsi = UFS_I(inode); + + ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); + ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); + + ufs_set_inode_uid(sb, ufs_inode, inode->i_uid); + ufs_set_inode_gid(sb, ufs_inode, inode->i_gid); + + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); + ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); + ufs_inode->ui_atime.tv_usec = 0; + ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctime.tv_usec = 0; + ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec); + ufs_inode->ui_mtime.tv_usec = 0; + ufs_inode->ui_blocks = 
cpu_to_fs32(sb, inode->i_blocks); + ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); + ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation); + + if ((UFS_SB(sb)->s_flags & UFS_UID_MASK) == UFS_UID_EFT) { + ufs_inode->ui_u3.ui_sun.ui_shadow = cpu_to_fs32(sb, ufsi->i_shadow); + ufs_inode->ui_u3.ui_sun.ui_oeftflag = cpu_to_fs32(sb, ufsi->i_oeftflag); + } + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */ + ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.i_data[0]; + } else if (inode->i_blocks) { + memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.i_data, + sizeof(ufs_inode->ui_u2.ui_addr)); + } + else { + memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink, + sizeof(ufs_inode->ui_u2.ui_symlink)); + } + + if (!inode->i_nlink) + memset (ufs_inode, 0, sizeof(struct ufs_inode)); +} + +static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_inode_info *ufsi = UFS_I(inode); + + UFSD("ENTER\n"); + ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); + ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); + + ufs_inode->ui_uid = cpu_to_fs32(sb, inode->i_uid); + ufs_inode->ui_gid = cpu_to_fs32(sb, inode->i_gid); + + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); + ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec); + ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec); + ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec); + ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec); + ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec); + + ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks); + ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); + ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */ + ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.u2_i_data[0]; + } else if (inode->i_blocks) { + memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.u2_i_data, + sizeof(ufs_inode->ui_u2.ui_addr)); + } else { + memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink, + sizeof(ufs_inode->ui_u2.ui_symlink)); + } + + if (!inode->i_nlink) + memset (ufs_inode, 0, sizeof(struct ufs2_inode)); + UFSD("EXIT\n"); +} + +static int ufs_update_inode(struct inode * inode, int do_sync) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * bh; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + if (inode->i_ino < UFS_ROOTINO || + inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); + return -1; + } + + bh = sb_bread(sb, ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); + return -1; + } + if (uspi->fs_magic == UFS2_MAGIC) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + ufs2_update_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *) bh->b_data; + + ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); + } + + mark_buffer_dirty(bh); + if (do_sync) + sync_dirty_buffer(bh); + brelse (bh); + + UFSD("EXIT\n"); + return 0; +} + +int ufs_write_inode(struct inode *inode, struct 
writeback_control *wbc) +{ + int ret; + lock_ufs(inode->i_sb); + ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); + unlock_ufs(inode->i_sb); + return ret; +} + +int ufs_sync_inode (struct inode *inode) +{ + return ufs_update_inode (inode, 1); +} + +void ufs_evict_inode(struct inode * inode) +{ + int want_delete = 0; + + if (!inode->i_nlink && !is_bad_inode(inode)) + want_delete = 1; + + truncate_inode_pages(&inode->i_data, 0); + if (want_delete) { + loff_t old_i_size; + /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ + lock_ufs(inode->i_sb); + mark_inode_dirty(inode); + ufs_update_inode(inode, IS_SYNC(inode)); + old_i_size = inode->i_size; + inode->i_size = 0; + if (inode->i_blocks && ufs_truncate(inode, old_i_size)) + ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); + unlock_ufs(inode->i_sb); + } + + invalidate_inode_buffers(inode); + end_writeback(inode); + + if (want_delete) { + lock_ufs(inode->i_sb); + ufs_free_inode (inode); + unlock_ufs(inode->i_sb); + } +} diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c new file mode 100644 index 00000000..b57aab9a --- /dev/null +++ b/fs/ufs/namei.c @@ -0,0 +1,360 @@ +/* + * linux/fs/ufs/namei.c + * + * Migration to usage of "page cache" on May 2006 by + * Evgeniy Dushistov based on ext2 code base. + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "util.h" + +static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = ufs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode * inode = NULL; + ino_t ino; + + if (dentry->d_name.len > UFS_MAXNAMLEN) + return ERR_PTR(-ENAMETOOLONG); + + lock_ufs(dir->i_sb); + ino = ufs_inode_by_name(dir, &dentry->d_name); + if (ino) + inode = ufs_iget(dir->i_sb, ino); + unlock_ufs(dir->i_sb); + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_splice_alias(inode, dentry); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
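+ * + * Editor's note (not part of the original source): on failure, ufs_add_nondir() above has already dropped the new inode's link count and released it with iput(), so ufs_create(), ufs_mknod() and ufs_symlink() only need to propagate the error code.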
+ */ +static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + int err; + + UFSD("BEGIN\n"); + + inode = ufs_new_inode(dir, mode); + err = PTR_ERR(inode); + + if (!IS_ERR(inode)) { + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + mark_inode_dirty(inode); + lock_ufs(dir->i_sb); + err = ufs_add_nondir(dentry, inode); + unlock_ufs(dir->i_sb); + } + UFSD("END: err=%d\n", err); + return err; +} + +static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode *inode; + int err; + + if (!old_valid_dev(rdev)) + return -EINVAL; + + inode = ufs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, mode, rdev); + ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); + mark_inode_dirty(inode); + lock_ufs(dir->i_sb); + err = ufs_add_nondir(dentry, inode); + unlock_ufs(dir->i_sb); + } + return err; +} + +static int ufs_symlink (struct inode * dir, struct dentry * dentry, + const char * symname) +{ + struct super_block * sb = dir->i_sb; + int err = -ENAMETOOLONG; + unsigned l = strlen(symname)+1; + struct inode * inode; + + if (l > sb->s_blocksize) + goto out_notlocked; + + lock_ufs(dir->i_sb); + inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { + /* slow symlink */ + inode->i_op = &ufs_symlink_inode_operations; + inode->i_mapping->a_ops = &ufs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + } else { + /* fast symlink */ + inode->i_op = &ufs_fast_symlink_inode_operations; + memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l); + inode->i_size = l-1; + } + mark_inode_dirty(inode); + + err = ufs_add_nondir(dentry, inode); +out: + unlock_ufs(dir->i_sb); +out_notlocked: + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int ufs_link (struct dentry * old_dentry, struct inode * dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + lock_ufs(dir->i_sb); + if (inode->i_nlink >= UFS_LINK_MAX) { + unlock_ufs(dir->i_sb); + return -EMLINK; + } + + inode->i_ctime = CURRENT_TIME_SEC; + inode_inc_link_count(inode); + ihold(inode); + + error = ufs_add_nondir(dentry, inode); + unlock_ufs(dir->i_sb); + return error; +} + +static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + struct inode * inode; + int err = -EMLINK; + + if (dir->i_nlink >= UFS_LINK_MAX) + goto out; + + lock_ufs(dir->i_sb); + inode_inc_link_count(dir); + + inode = ufs_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &ufs_dir_inode_operations; + inode->i_fop = &ufs_dir_operations; + inode->i_mapping->a_ops = &ufs_aops; + + inode_inc_link_count(inode); + + err = ufs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = ufs_add_link(dentry, inode); + if (err) + goto out_fail; + unlock_ufs(dir->i_sb); + + d_instantiate(dentry, inode); +out: + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput (inode); +out_dir: + inode_dec_link_count(dir); + unlock_ufs(dir->i_sb); + goto out; +} + +static int ufs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + struct ufs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + de = 
ufs_find_entry(dir, &dentry->d_name, &page); + if (!de) + goto out; + + err = ufs_delete_entry(dir, de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + return err; +} + +static int ufs_rmdir (struct inode * dir, struct dentry *dentry) +{ + struct inode * inode = dentry->d_inode; + int err= -ENOTEMPTY; + + lock_ufs(dir->i_sb); + if (ufs_empty_dir (inode)) { + err = ufs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + unlock_ufs(dir->i_sb); + return err; +} + +static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct ufs_dir_entry * dir_de = NULL; + struct page *old_page; + struct ufs_dir_entry *old_de; + int err = -ENOENT; + + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = ufs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct ufs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !ufs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_de) + goto out_dir; + ufs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= UFS_LINK_MAX) + goto out_dir; + } + err = ufs_add_link(new_dentry, old_inode); + if (err) + goto out_dir; + if (dir_de) + inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + + ufs_delete_entry(old_dir, old_de, old_page); + mark_inode_dirty(old_inode); + + if (dir_de) { + ufs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + return 0; + + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + return err; +} + +const struct inode_operations ufs_dir_inode_operations = { + .create = ufs_create, + .lookup = ufs_lookup, + .link = ufs_link, + .unlink = ufs_unlink, + .symlink = ufs_symlink, + .mkdir = ufs_mkdir, + .rmdir = ufs_rmdir, + .mknod = ufs_mknod, + .rename = ufs_rename, +}; diff --git a/fs/ufs/super.c b/fs/ufs/super.c new file mode 100644 index 00000000..3915ade6 --- /dev/null +++ b/fs/ufs/super.c @@ -0,0 +1,1511 @@ +/* + * linux/fs/ufs/super.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + */ + +/* Derived from + * + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + +/* + * Inspired by + * + * linux/fs/ufs/super.c + * + * Copyright (C) 1996 + * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) + * Laboratory for Computer Science Research Computing Facility + * Rutgers, The State University of New Jersey + * + * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) + * + * Kernel module support added on 96/04/26 by + * Stefan Reinauer + * + * Module usage counts added on 96/04/29 by + * Gertjan van Wingerde + * + * Clean swab support on 19970406 by + * Francois-Rene Rideau + * + * 4.4BSD (FreeBSD) support added on February 1st 1998 by + * Niels Kristian Bech Jensen partially based + * on code by Martin von Loewis . + * + * NeXTstep support added on February 5th 1998 by + * Niels Kristian Bech Jensen . + * + * write support Daniel Pirkl 1998 + * + * HP/UX hfs filesystem support added by + * Martin K. Petersen , August 1999 + * + * UFS2 (of FreeBSD 5.x) support added by + * Niraj Kumar , Jan 2004 + * + * UFS2 write support added by + * Evgeniy Dushistov , 2007 + */ + + +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +void lock_ufs(struct super_block *sb) +{ +#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) + struct ufs_sb_info *sbi = UFS_SB(sb); + + mutex_lock(&sbi->mutex); + sbi->mutex_owner = current; +#endif +} + +void unlock_ufs(struct super_block *sb) +{ +#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) + struct ufs_sb_info *sbi = UFS_SB(sb); + + sbi->mutex_owner = NULL; + mutex_unlock(&sbi->mutex); +#endif +} + +static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct inode *inode; + + if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg) + return ERR_PTR(-ESTALE); + + inode = ufs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); +} + +static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); +} + +static struct dentry *ufs_get_parent(struct dentry *child) +{ + struct qstr dot_dot = { + .name = "..", + .len = 2, + }; + ino_t ino; + + ino = ufs_inode_by_name(child->d_inode, &dot_dot); + if (!ino) + return ERR_PTR(-ENOENT); + return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino)); +} + +static const struct export_operations ufs_export_ops = { + .fh_to_dentry = ufs_fh_to_dentry, + .fh_to_parent = ufs_fh_to_parent, + .get_parent = ufs_get_parent, +}; + +#ifdef CONFIG_UFS_DEBUG +/* + * Print contents of ufs_super_block, useful for debugging + */ +static void ufs_print_super_stuff(struct super_block *sb, + struct ufs_super_block_first *usb1, + struct ufs_super_block_second *usb2, + struct ufs_super_block_third *usb3) +{ + u32 magic = fs32_to_cpu(sb, usb3->fs_magic); + + printk("ufs_print_super_stuff\n"); + printk(" magic: 0x%x\n", magic); + if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { + printk(" fs_size: %llu\n", 
(unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); + printk(" fs_dsize: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); + printk(" bsize: %u\n", + fs32_to_cpu(sb, usb1->fs_bsize)); + printk(" fsize: %u\n", + fs32_to_cpu(sb, usb1->fs_fsize)); + printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); + printk(" fs_sblockloc: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); + printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); + printk(" cs_nbfree(No of free blocks): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); + printk(KERN_INFO" cs_nifree(Num of free inodes): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); + printk(KERN_INFO" cs_nffree(Num of free frags): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); + printk(KERN_INFO" fs_maxsymlinklen: %u\n", + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); + } else { + printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); + printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); + printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); + printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); + printk(" cgoffset: %u\n", + fs32_to_cpu(sb, usb1->fs_cgoffset)); + printk(" ~cgmask: 0x%x\n", + ~fs32_to_cpu(sb, usb1->fs_cgmask)); + printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); + printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); + printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); + printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); + printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); + printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); + printk(" fragshift: %u\n", + fs32_to_cpu(sb, usb1->fs_fragshift)); + printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); + printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); + printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); + printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); + printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); + printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); + printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); + printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); + printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); + printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); + printk(" fstodb: %u\n", + fs32_to_cpu(sb, usb1->fs_fsbtodb)); + printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); + printk(" ndir %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); + printk(" nifree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); + printk(" nbfree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); + printk(" nffree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); + } + printk("\n"); +} + +/* + * Print contents of ufs_cylinder_group, useful for debugging + */ +static void ufs_print_cylinder_stuff(struct super_block *sb, + struct ufs_cylinder_group *cg) +{ + printk("\nufs_print_cylinder_stuff\n"); + printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); + printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); + printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); + printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); + printk(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); + printk(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); + printk(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); + printk(" cs_ndir: %u\n", fs32_to_cpu(sb, 
cg->cg_cs.cs_ndir)); + printk(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); + printk(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); + printk(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); + printk(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); + printk(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); + printk(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); + printk(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", + fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), + fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), + fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), + fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); + printk(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); + printk(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); + printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); + printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); + printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); + printk(" clustersumoff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); + printk(" clusteroff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); + printk(" nclusterblks %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); + printk("\n"); +} +#else +# define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ +# define ufs_print_cylinder_stuff(sb, cg) /**/ +#endif /* CONFIG_UFS_DEBUG */ + +static const struct super_operations ufs_super_ops; + +static char error_buf[1024]; + +void ufs_error (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + va_list args; + + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + if (!(sb->s_flags & MS_RDONLY)) { + usb1->fs_clean = UFS_FSBAD; + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + sb->s_dirt = 1; + sb->s_flags |= MS_RDONLY; + } + va_start (args, fmt); + vsnprintf (error_buf, sizeof(error_buf), fmt, args); + va_end (args); + switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { + case UFS_MOUNT_ONERROR_PANIC: + panic ("UFS-fs panic (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + case UFS_MOUNT_ONERROR_LOCK: + case UFS_MOUNT_ONERROR_UMOUNT: + case UFS_MOUNT_ONERROR_REPAIR: + printk (KERN_CRIT "UFS-fs error (device %s): %s: %s\n", + sb->s_id, function, error_buf); + } +} + +void ufs_panic (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + va_list args; + + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + + if (!(sb->s_flags & MS_RDONLY)) { + usb1->fs_clean = UFS_FSBAD; + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + sb->s_dirt = 1; + } + va_start (args, fmt); + vsnprintf (error_buf, sizeof(error_buf), fmt, args); + va_end (args); + sb->s_flags |= MS_RDONLY; + printk (KERN_CRIT "UFS-fs panic (device %s): %s: %s\n", + sb->s_id, function, error_buf); +} + +void ufs_warning (struct super_block * sb, const char * function, + const char * fmt, ...) 
+{ + va_list args; + + va_start (args, fmt); + vsnprintf (error_buf, sizeof(error_buf), fmt, args); + va_end (args); + printk (KERN_WARNING "UFS-fs warning (device %s): %s: %s\n", + sb->s_id, function, error_buf); +} + +enum { + Opt_type_old = UFS_MOUNT_UFSTYPE_OLD, + Opt_type_sunx86 = UFS_MOUNT_UFSTYPE_SUNx86, + Opt_type_sun = UFS_MOUNT_UFSTYPE_SUN, + Opt_type_sunos = UFS_MOUNT_UFSTYPE_SUNOS, + Opt_type_44bsd = UFS_MOUNT_UFSTYPE_44BSD, + Opt_type_ufs2 = UFS_MOUNT_UFSTYPE_UFS2, + Opt_type_hp = UFS_MOUNT_UFSTYPE_HP, + Opt_type_nextstepcd = UFS_MOUNT_UFSTYPE_NEXTSTEP_CD, + Opt_type_nextstep = UFS_MOUNT_UFSTYPE_NEXTSTEP, + Opt_type_openstep = UFS_MOUNT_UFSTYPE_OPENSTEP, + Opt_onerror_panic = UFS_MOUNT_ONERROR_PANIC, + Opt_onerror_lock = UFS_MOUNT_ONERROR_LOCK, + Opt_onerror_umount = UFS_MOUNT_ONERROR_UMOUNT, + Opt_onerror_repair = UFS_MOUNT_ONERROR_REPAIR, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_type_old, "ufstype=old"}, + {Opt_type_sunx86, "ufstype=sunx86"}, + {Opt_type_sun, "ufstype=sun"}, + {Opt_type_sunos, "ufstype=sunos"}, + {Opt_type_44bsd, "ufstype=44bsd"}, + {Opt_type_ufs2, "ufstype=ufs2"}, + {Opt_type_ufs2, "ufstype=5xbsd"}, + {Opt_type_hp, "ufstype=hp"}, + {Opt_type_nextstepcd, "ufstype=nextstep-cd"}, + {Opt_type_nextstep, "ufstype=nextstep"}, + {Opt_type_openstep, "ufstype=openstep"}, +/*end of possible ufs types */ + {Opt_onerror_panic, "onerror=panic"}, + {Opt_onerror_lock, "onerror=lock"}, + {Opt_onerror_umount, "onerror=umount"}, + {Opt_onerror_repair, "onerror=repair"}, + {Opt_err, NULL} +}; + +static int ufs_parse_options (char * options, unsigned * mount_options) +{ + char * p; + + UFSD("ENTER\n"); + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_type_old: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_OLD); + break; + case Opt_type_sunx86: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_SUNx86); + break; + case Opt_type_sun: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_SUN); + break; + case Opt_type_sunos: + ufs_clear_opt(*mount_options, UFSTYPE); + ufs_set_opt(*mount_options, UFSTYPE_SUNOS); + break; + case Opt_type_44bsd: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_44BSD); + break; + case Opt_type_ufs2: + ufs_clear_opt(*mount_options, UFSTYPE); + ufs_set_opt(*mount_options, UFSTYPE_UFS2); + break; + case Opt_type_hp: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_HP); + break; + case Opt_type_nextstepcd: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP_CD); + break; + case Opt_type_nextstep: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP); + break; + case Opt_type_openstep: + ufs_clear_opt (*mount_options, UFSTYPE); + ufs_set_opt (*mount_options, UFSTYPE_OPENSTEP); + break; + case Opt_onerror_panic: + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_PANIC); + break; + case Opt_onerror_lock: + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_LOCK); + break; + case Opt_onerror_umount: + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_UMOUNT); + break; + case Opt_onerror_repair: + printk("UFS-fs: Unable to do repair on error, " + 
"will lock lock instead\n"); + ufs_clear_opt (*mount_options, ONERROR); + ufs_set_opt (*mount_options, ONERROR_REPAIR); + break; + default: + printk("UFS-fs: Invalid option: \"%s\" " + "or missing value\n", p); + return 0; + } + } + return 1; +} + +/* + * Different types of UFS hold fs_cstotal in different + * places, and use different data structure for it. + * To make things simpler we just copy fs_cstotal to ufs_sb_private_info + */ +static void ufs_setup_cstotal(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; + + UFSD("ENTER, mtype=%u\n", mtype); + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + mtype == UFS_MOUNT_UFSTYPE_UFS2) { + /*we have statistic in different place, then usual*/ + uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir); + uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree); + uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree); + uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree); + } else { + uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir); + uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree); + uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree); + uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree); + } + UFSD("EXIT\n"); +} + +/* + * Read on-disk structures associated with cylinder groups + */ +static int ufs_read_cylinder_structures(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_buffer_head * ubh; + unsigned char * base, * space; + unsigned size, blks, i; + struct ufs_super_block_third *usb3; + + UFSD("ENTER\n"); + + usb3 = ubh_get_usb_third(uspi); + /* + * Read cs structures from (usually) first data block + * on the device. + */ + size = uspi->s_cssize; + blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + base = space = kmalloc(size, GFP_NOFS); + if (!base) + goto failed; + sbi->s_csp = (struct ufs_csum *)space; + for (i = 0; i < blks; i += uspi->s_fpb) { + size = uspi->s_bsize; + if (i + uspi->s_fpb > blks) + size = (blks - i) * uspi->s_fsize; + + ubh = ubh_bread(sb, uspi->s_csaddr + i, size); + + if (!ubh) + goto failed; + + ubh_ubhcpymem (space, ubh, size); + + space += size; + ubh_brelse (ubh); + ubh = NULL; + } + + /* + * Read cylinder group (we read only first fragment from block + * at this time) and prepare internal data structures for cg caching. 
+ */ + if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS))) + goto failed; + for (i = 0; i < uspi->s_ncg; i++) + sbi->s_ucg[i] = NULL; + for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { + sbi->s_ucpi[i] = NULL; + sbi->s_cgno[i] = UFS_CGNO_EMPTY; + } + for (i = 0; i < uspi->s_ncg; i++) { + UFSD("read cg %u\n", i); + if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i)))) + goto failed; + if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data)) + goto failed; + + ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); + } + for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { + if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS))) + goto failed; + sbi->s_cgno[i] = UFS_CGNO_EMPTY; + } + sbi->s_cg_loaded = 0; + UFSD("EXIT\n"); + return 1; + +failed: + kfree (base); + if (sbi->s_ucg) { + for (i = 0; i < uspi->s_ncg; i++) + if (sbi->s_ucg[i]) + brelse (sbi->s_ucg[i]); + kfree (sbi->s_ucg); + for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) + kfree (sbi->s_ucpi[i]); + } + UFSD("EXIT (FAILED)\n"); + return 0; +} + +/* + * Sync our internal copy of fs_cstotal with disk + */ +static void ufs_put_cstotal(struct super_block *sb) +{ + unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + + UFSD("ENTER\n"); + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + mtype == UFS_MOUNT_UFSTYPE_UFS2) { + /*we have statistic in different place, then usual*/ + usb2->fs_un.fs_u2.cs_ndir = + cpu_to_fs64(sb, uspi->cs_total.cs_ndir); + usb2->fs_un.fs_u2.cs_nbfree = + cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); + usb3->fs_un1.fs_u2.cs_nifree = + cpu_to_fs64(sb, uspi->cs_total.cs_nifree); + usb3->fs_un1.fs_u2.cs_nffree = + cpu_to_fs64(sb, uspi->cs_total.cs_nffree); + } else { + usb1->fs_cstotal.cs_ndir = + cpu_to_fs32(sb, uspi->cs_total.cs_ndir); + usb1->fs_cstotal.cs_nbfree = + cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); + usb1->fs_cstotal.cs_nifree = + cpu_to_fs32(sb, uspi->cs_total.cs_nifree); + usb1->fs_cstotal.cs_nffree = + cpu_to_fs32(sb, uspi->cs_total.cs_nffree); + } + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + ufs_print_super_stuff(sb, usb1, usb2, usb3); + UFSD("EXIT\n"); +} + +/** + * ufs_put_super_internal() - put on-disk intrenal structures + * @sb: pointer to super_block structure + * Put on-disk structures associated with cylinder groups + * and write them back to disk, also update cs_total on disk + */ +static void ufs_put_super_internal(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_buffer_head * ubh; + unsigned char * base, * space; + unsigned blks, size, i; + + + UFSD("ENTER\n"); + + ufs_put_cstotal(sb); + size = uspi->s_cssize; + blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + base = space = (char*) sbi->s_csp; + for (i = 0; i < blks; i += uspi->s_fpb) { + size = uspi->s_bsize; + if (i + uspi->s_fpb > blks) + size = (blks - i) * uspi->s_fsize; + + ubh = ubh_bread(sb, uspi->s_csaddr + i, size); + + ubh_memcpyubh (ubh, space, size); + space += size; + ubh_mark_buffer_uptodate (ubh, 1); + ubh_mark_buffer_dirty (ubh); + ubh_brelse (ubh); + } + for (i = 0; i < sbi->s_cg_loaded; i++) { + ufs_put_cylinder (sb, i); + kfree 
(sbi->s_ucpi[i]); + } + for (; i < UFS_MAX_GROUP_LOADED; i++) + kfree (sbi->s_ucpi[i]); + for (i = 0; i < uspi->s_ncg; i++) + brelse (sbi->s_ucg[i]); + kfree (sbi->s_ucg); + kfree (base); + + UFSD("EXIT\n"); +} + +static int ufs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ufs_sb_info * sbi; + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_second * usb2; + struct ufs_super_block_third * usb3; + struct ufs_buffer_head * ubh; + struct inode *inode; + unsigned block_size, super_block_size; + unsigned flags; + unsigned super_block_offset; + unsigned maxsymlen; + int ret = -EINVAL; + + uspi = NULL; + ubh = NULL; + flags = 0; + + UFSD("ENTER\n"); + + sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); + if (!sbi) + goto failed_nomem; + sb->s_fs_info = sbi; + + UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); + +#ifndef CONFIG_UFS_FS_WRITE + if (!(sb->s_flags & MS_RDONLY)) { + printk("ufs was compiled with read-only support, " + "can't be mounted as read-write\n"); + goto failed; + } +#endif + mutex_init(&sbi->mutex); + /* + * Set default mount options + * Parse mount options + */ + sbi->s_mount_opt = 0; + ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); + if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { + printk("wrong mount options\n"); + goto failed; + } + if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { + if (!silent) + printk("You didn't specify the type of your ufs filesystem\n\n" + "mount -t ufs -o ufstype=" + "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" + ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " + "default is ufstype=old\n"); + ufs_set_opt (sbi->s_mount_opt, UFSTYPE_OLD); + } + + uspi = kzalloc(sizeof(struct ufs_sb_private_info), GFP_KERNEL); + sbi->s_uspi = uspi; + if (!uspi) + goto failed; + uspi->s_dirblksize = UFS_SECTOR_SIZE; + super_block_offset=UFS_SBLOCK; + + /* Keep 2Gig file limit. 
Some UFS variants need to override + this but as I don't know which I'll let those in the know loosen + the rules */ + switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { + case UFS_MOUNT_UFSTYPE_44BSD: + UFSD("ufstype=44bsd\n"); + uspi->s_fsize = block_size = 512; + uspi->s_fmask = ~(512 - 1); + uspi->s_fshift = 9; + uspi->s_sbsize = super_block_size = 1536; + uspi->s_sbbase = 0; + flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; + break; + case UFS_MOUNT_UFSTYPE_UFS2: + UFSD("ufstype=ufs2\n"); + super_block_offset=SBLOCK_UFS2; + uspi->s_fsize = block_size = 512; + uspi->s_fmask = ~(512 - 1); + uspi->s_fshift = 9; + uspi->s_sbsize = super_block_size = 1536; + uspi->s_sbbase = 0; + flags |= UFS_TYPE_UFS2 | UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; + break; + + case UFS_MOUNT_UFSTYPE_SUN: + UFSD("ufstype=sun\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_maxsymlinklen = 0; /* Not supported on disk */ + flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUN | UFS_CG_SUN; + break; + + case UFS_MOUNT_UFSTYPE_SUNOS: + UFSD(("ufstype=sunos\n")) + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = 2048; + super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_maxsymlinklen = 0; /* Not supported on disk */ + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_SUNOS | UFS_CG_SUN; + break; + + case UFS_MOUNT_UFSTYPE_SUNx86: + UFSD("ufstype=sunx86\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_maxsymlinklen = 0; /* Not supported on disk */ + flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUNx86 | UFS_CG_SUN; + break; + + case UFS_MOUNT_UFSTYPE_OLD: + UFSD("ufstype=old\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=old is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_NEXTSTEP: + UFSD("ufstype=nextstep\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=nextstep is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: + UFSD("ufstype=nextstep-cd\n"); + uspi->s_fsize = block_size = 2048; + uspi->s_fmask = ~(2048 - 1); + uspi->s_fshift = 11; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=nextstep-cd is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_OPENSTEP: + UFSD("ufstype=openstep\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; + flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | 
UFS_CG_44BSD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=openstep is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + + case UFS_MOUNT_UFSTYPE_HP: + UFSD("ufstype=hp\n"); + uspi->s_fsize = block_size = 1024; + uspi->s_fmask = ~(1024 - 1); + uspi->s_fshift = 10; + uspi->s_sbsize = super_block_size = 2048; + uspi->s_sbbase = 0; + flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; + if (!(sb->s_flags & MS_RDONLY)) { + if (!silent) + printk(KERN_INFO "ufstype=hp is supported read-only\n"); + sb->s_flags |= MS_RDONLY; + } + break; + default: + if (!silent) + printk("unknown ufstype\n"); + goto failed; + } + +again: + if (!sb_set_blocksize(sb, block_size)) { + printk(KERN_ERR "UFS: failed to set blocksize\n"); + goto failed; + } + + /* + * read ufs super block from device + */ + + ubh = ubh_bread_uspi(uspi, sb, uspi->s_sbbase + super_block_offset/block_size, super_block_size); + + if (!ubh) + goto failed; + + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + /* Sort out mod used on SunOS 4.1.3 for fs_state */ + uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat); + if (((flags & UFS_ST_MASK) == UFS_ST_SUNOS) && + (uspi->s_postblformat != UFS_42POSTBLFMT)) { + flags &= ~UFS_ST_MASK; + flags |= UFS_ST_SUN; + } + + /* + * Check ufs magic number + */ + sbi->s_bytesex = BYTESEX_LE; + switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { + case UFS_MAGIC: + case UFS_MAGIC_BW: + case UFS2_MAGIC: + case UFS_MAGIC_LFN: + case UFS_MAGIC_FEA: + case UFS_MAGIC_4GB: + goto magic_found; + } + sbi->s_bytesex = BYTESEX_BE; + switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { + case UFS_MAGIC: + case UFS_MAGIC_BW: + case UFS2_MAGIC: + case UFS_MAGIC_LFN: + case UFS_MAGIC_FEA: + case UFS_MAGIC_4GB: + goto magic_found; + } + + if ((((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP) + || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD) + || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_OPENSTEP)) + && uspi->s_sbbase < 256) { + ubh_brelse_uspi(uspi); + ubh = NULL; + uspi->s_sbbase += 8; + goto again; + } + if (!silent) + printk("ufs_read_super: bad magic number\n"); + goto failed; + +magic_found: + /* + * Check block and fragment sizes + */ + uspi->s_bsize = fs32_to_cpu(sb, usb1->fs_bsize); + uspi->s_fsize = fs32_to_cpu(sb, usb1->fs_fsize); + uspi->s_sbsize = fs32_to_cpu(sb, usb1->fs_sbsize); + uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); + uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); + + if (!is_power_of_2(uspi->s_fsize)) { + printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", + uspi->s_fsize); + goto failed; + } + if (uspi->s_fsize < 512) { + printk(KERN_ERR "ufs_read_super: fragment size %u is too small\n", + uspi->s_fsize); + goto failed; + } + if (uspi->s_fsize > 4096) { + printk(KERN_ERR "ufs_read_super: fragment size %u is too large\n", + uspi->s_fsize); + goto failed; + } + if (!is_power_of_2(uspi->s_bsize)) { + printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", + uspi->s_bsize); + goto failed; + } + if (uspi->s_bsize < 4096) { + printk(KERN_ERR "ufs_read_super: block size %u is too small\n", + uspi->s_bsize); + goto failed; + } + if (uspi->s_bsize / uspi->s_fsize > 8) { + printk(KERN_ERR "ufs_read_super: too many fragments per block (%u)\n", + uspi->s_bsize / uspi->s_fsize); + goto failed; + } + if (uspi->s_fsize != block_size || uspi->s_sbsize != 
super_block_size) { + ubh_brelse_uspi(uspi); + ubh = NULL; + block_size = uspi->s_fsize; + super_block_size = uspi->s_sbsize; + UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size); + goto again; + } + + sbi->s_flags = flags;/*after that line some functions use s_flags*/ + ufs_print_super_stuff(sb, usb1, usb2, usb3); + + /* + * Check, if file system was correctly unmounted. + * If not, make it read only. + */ + if (((flags & UFS_ST_MASK) == UFS_ST_44BSD) || + ((flags & UFS_ST_MASK) == UFS_ST_OLD) || + (((flags & UFS_ST_MASK) == UFS_ST_SUN || + (flags & UFS_ST_MASK) == UFS_ST_SUNOS || + (flags & UFS_ST_MASK) == UFS_ST_SUNx86) && + (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) { + switch(usb1->fs_clean) { + case UFS_FSCLEAN: + UFSD("fs is clean\n"); + break; + case UFS_FSSTABLE: + UFSD("fs is stable\n"); + break; + case UFS_FSLOG: + UFSD("fs is logging fs\n"); + break; + case UFS_FSOSF1: + UFSD("fs is DEC OSF/1\n"); + break; + case UFS_FSACTIVE: + printk("ufs_read_super: fs is active\n"); + sb->s_flags |= MS_RDONLY; + break; + case UFS_FSBAD: + printk("ufs_read_super: fs is bad\n"); + sb->s_flags |= MS_RDONLY; + break; + default: + printk("ufs_read_super: can't grok fs_clean 0x%x\n", usb1->fs_clean); + sb->s_flags |= MS_RDONLY; + break; + } + } else { + printk("ufs_read_super: fs needs fsck\n"); + sb->s_flags |= MS_RDONLY; + } + + /* + * Read ufs_super_block into internal data structures + */ + sb->s_op = &ufs_super_ops; + sb->s_export_op = &ufs_export_ops; + + sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); + + uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); + uspi->s_cblkno = fs32_to_cpu(sb, usb1->fs_cblkno); + uspi->s_iblkno = fs32_to_cpu(sb, usb1->fs_iblkno); + uspi->s_dblkno = fs32_to_cpu(sb, usb1->fs_dblkno); + uspi->s_cgoffset = fs32_to_cpu(sb, usb1->fs_cgoffset); + uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); + + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); + uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + } else { + uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); + uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); + } + + uspi->s_ncg = fs32_to_cpu(sb, usb1->fs_ncg); + /* s_bsize already set */ + /* s_fsize already set */ + uspi->s_fpb = fs32_to_cpu(sb, usb1->fs_frag); + uspi->s_minfree = fs32_to_cpu(sb, usb1->fs_minfree); + uspi->s_bmask = fs32_to_cpu(sb, usb1->fs_bmask); + uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); + uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift); + uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); + UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift, + uspi->s_fshift); + uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift); + uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb); + /* s_sbsize already set */ + uspi->s_csmask = fs32_to_cpu(sb, usb1->fs_csmask); + uspi->s_csshift = fs32_to_cpu(sb, usb1->fs_csshift); + uspi->s_nindir = fs32_to_cpu(sb, usb1->fs_nindir); + uspi->s_inopb = fs32_to_cpu(sb, usb1->fs_inopb); + uspi->s_nspf = fs32_to_cpu(sb, usb1->fs_nspf); + uspi->s_npsect = ufs_get_fs_npsect(sb, usb1, usb3); + uspi->s_interleave = fs32_to_cpu(sb, usb1->fs_interleave); + uspi->s_trackskew = fs32_to_cpu(sb, usb1->fs_trackskew); + + if (uspi->fs_magic == UFS2_MAGIC) + uspi->s_csaddr = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr); + else + uspi->s_csaddr = fs32_to_cpu(sb, usb1->fs_csaddr); + + uspi->s_cssize = fs32_to_cpu(sb, usb1->fs_cssize); + uspi->s_cgsize = 
fs32_to_cpu(sb, usb1->fs_cgsize); + uspi->s_ntrak = fs32_to_cpu(sb, usb1->fs_ntrak); + uspi->s_nsect = fs32_to_cpu(sb, usb1->fs_nsect); + uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc); + uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg); + uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg); + uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc); + uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize); + uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3); + uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3); + uspi->s_nrpos = fs32_to_cpu(sb, usb3->fs_nrpos); + uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff); + uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff); + + /* + * Compute another frequently used values + */ + uspi->s_fpbmask = uspi->s_fpb - 1; + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) + uspi->s_apbshift = uspi->s_bshift - 3; + else + uspi->s_apbshift = uspi->s_bshift - 2; + + uspi->s_2apbshift = uspi->s_apbshift * 2; + uspi->s_3apbshift = uspi->s_apbshift * 3; + uspi->s_apb = 1 << uspi->s_apbshift; + uspi->s_2apb = 1 << uspi->s_2apbshift; + uspi->s_3apb = 1 << uspi->s_3apbshift; + uspi->s_apbmask = uspi->s_apb - 1; + uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS; + uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift; + uspi->s_inopf = uspi->s_inopb >> uspi->s_fpbshift; + uspi->s_bpf = uspi->s_fsize << 3; + uspi->s_bpfshift = uspi->s_fshift + 3; + uspi->s_bpfmask = uspi->s_bpf - 1; + if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_44BSD || + (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_UFS2) + uspi->s_maxsymlinklen = + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen); + + if (uspi->fs_magic == UFS2_MAGIC) + maxsymlen = 2 * 4 * (UFS_NDADDR + UFS_NINDIR); + else + maxsymlen = 4 * (UFS_NDADDR + UFS_NINDIR); + if (uspi->s_maxsymlinklen > maxsymlen) { + ufs_warning(sb, __func__, "ufs_read_super: excessive maximum " + "fast symlink size (%u)\n", uspi->s_maxsymlinklen); + uspi->s_maxsymlinklen = maxsymlen; + } + + inode = ufs_iget(sb, UFS_ROOTINO); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto failed; + } + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + ret = -ENOMEM; + goto dalloc_failed; + } + + ufs_setup_cstotal(sb); + /* + * Read cylinder group structures + */ + if (!(sb->s_flags & MS_RDONLY)) + if (!ufs_read_cylinder_structures(sb)) + goto failed; + + UFSD("EXIT\n"); + return 0; + +dalloc_failed: + iput(inode); +failed: + if (ubh) + ubh_brelse_uspi (uspi); + kfree (uspi); + kfree(sbi); + sb->s_fs_info = NULL; + UFSD("EXIT (FAILED)\n"); + return ret; + +failed_nomem: + UFSD("EXIT (NOMEM)\n"); + return -ENOMEM; +} + +static int ufs_sync_fs(struct super_block *sb, int wait) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_third * usb3; + unsigned flags; + + lock_ufs(sb); + lock_super(sb); + + UFSD("ENTER\n"); + + flags = UFS_SB(sb)->s_flags; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + usb3 = ubh_get_usb_third(uspi); + + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN || + (flags & UFS_ST_MASK) == UFS_ST_SUNOS || + (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ufs_put_cstotal(sb); + sb->s_dirt = 0; + + UFSD("EXIT\n"); + unlock_super(sb); + unlock_ufs(sb); + + return 0; +} + +static void ufs_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + ufs_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + +static void 
ufs_put_super(struct super_block *sb) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + + UFSD("ENTER\n"); + + if (sb->s_dirt) + ufs_write_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) + ufs_put_super_internal(sb); + + ubh_brelse_uspi (sbi->s_uspi); + kfree (sbi->s_uspi); + kfree (sbi); + sb->s_fs_info = NULL; + UFSD("EXIT\n"); + return; +} + + +static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_third * usb3; + unsigned new_mount_opt, ufstype; + unsigned flags; + + lock_ufs(sb); + lock_super(sb); + uspi = UFS_SB(sb)->s_uspi; + flags = UFS_SB(sb)->s_flags; + usb1 = ubh_get_usb_first(uspi); + usb3 = ubh_get_usb_third(uspi); + + /* + * Allow the "check" option to be passed as a remount option. + * It is not possible to change ufstype option during remount + */ + ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; + new_mount_opt = 0; + ufs_set_opt (new_mount_opt, ONERROR_LOCK); + if (!ufs_parse_options (data, &new_mount_opt)) { + unlock_super(sb); + unlock_ufs(sb); + return -EINVAL; + } + if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { + new_mount_opt |= ufstype; + } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { + printk("ufstype can't be changed during remount\n"); + unlock_super(sb); + unlock_ufs(sb); + return -EINVAL; + } + + if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + UFS_SB(sb)->s_mount_opt = new_mount_opt; + unlock_super(sb); + unlock_ufs(sb); + return 0; + } + + /* + * fs was mouted as rw, remounting ro + */ + if (*mount_flags & MS_RDONLY) { + ufs_put_super_internal(sb); + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN + || (flags & UFS_ST_MASK) == UFS_ST_SUNOS + || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + sb->s_dirt = 0; + sb->s_flags |= MS_RDONLY; + } else { + /* + * fs was mounted as ro, remounting rw + */ +#ifndef CONFIG_UFS_FS_WRITE + printk("ufs was compiled with read-only support, " + "can't be mounted as read-write\n"); + unlock_super(sb); + unlock_ufs(sb); + return -EINVAL; +#else + if (ufstype != UFS_MOUNT_UFSTYPE_SUN && + ufstype != UFS_MOUNT_UFSTYPE_SUNOS && + ufstype != UFS_MOUNT_UFSTYPE_44BSD && + ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && + ufstype != UFS_MOUNT_UFSTYPE_UFS2) { + printk("this ufstype is read-only supported\n"); + unlock_super(sb); + unlock_ufs(sb); + return -EINVAL; + } + if (!ufs_read_cylinder_structures(sb)) { + printk("failed during remounting\n"); + unlock_super(sb); + unlock_ufs(sb); + return -EPERM; + } + sb->s_flags &= ~MS_RDONLY; +#endif + } + UFS_SB(sb)->s_mount_opt = new_mount_opt; + unlock_super(sb); + unlock_ufs(sb); + return 0; +} + +static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); + unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; + const struct match_token *tp = tokens; + + while (tp->token != Opt_onerror_panic && tp->token != mval) + ++tp; + BUG_ON(tp->token == Opt_onerror_panic); + seq_printf(seq, ",%s", tp->pattern); + + mval = sbi->s_mount_opt & UFS_MOUNT_ONERROR; + while (tp->token != Opt_err && tp->token != mval) + ++tp; + BUG_ON(tp->token == Opt_err); + seq_printf(seq, ",%s", tp->pattern); + + return 0; +} + +static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct 
ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; + unsigned flags = UFS_SB(sb)->s_flags; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + lock_ufs(sb); + + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + buf->f_type = UFS2_MAGIC; + buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + } else { + buf->f_type = UFS_MAGIC; + buf->f_blocks = uspi->s_dsize; + } + buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) + + uspi->cs_total.cs_nffree; + buf->f_ffree = uspi->cs_total.cs_nifree; + buf->f_bsize = sb->s_blocksize; + buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree)) + ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; + buf->f_files = uspi->s_ncg * uspi->s_ipg; + buf->f_namelen = UFS_MAXNAMLEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + unlock_ufs(sb); + + return 0; +} + +static struct kmem_cache * ufs_inode_cachep; + +static struct inode *ufs_alloc_inode(struct super_block *sb) +{ + struct ufs_inode_info *ei; + ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void ufs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); +} + +static void ufs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ufs_i_callback); +} + +static void init_once(void *foo) +{ + struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", + sizeof(struct ufs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ufs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ufs_inode_cachep); +} + +static const struct super_operations ufs_super_ops = { + .alloc_inode = ufs_alloc_inode, + .destroy_inode = ufs_destroy_inode, + .write_inode = ufs_write_inode, + .evict_inode = ufs_evict_inode, + .put_super = ufs_put_super, + .write_super = ufs_write_super, + .sync_fs = ufs_sync_fs, + .statfs = ufs_statfs, + .remount_fs = ufs_remount, + .show_options = ufs_show_options, +}; + +static struct dentry *ufs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super); +} + +static struct file_system_type ufs_fs_type = { + .owner = THIS_MODULE, + .name = "ufs", + .mount = ufs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_ufs_fs(void) +{ + int err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ufs_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + return err; +} + +static void __exit exit_ufs_fs(void) +{ + unregister_filesystem(&ufs_fs_type); + destroy_inodecache(); +} + +module_init(init_ufs_fs) +module_exit(exit_ufs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/ufs/swab.h b/fs/ufs/swab.h new file mode 100644 index 00000000..8d974c4f --- /dev/null +++ b/fs/ufs/swab.h @@ -0,0 +1,115 @@ +/* + * linux/fs/ufs/swab.h + * + * 
Copyright (C) 1997, 1998 Francois-Rene Rideau + * Copyright (C) 1998 Jakub Jelinek + * Copyright (C) 2001 Christoph Hellwig + */ + +#ifndef _UFS_SWAB_H +#define _UFS_SWAB_H + +/* + * Notes: + * HERE WE ASSUME EITHER BIG OR LITTLE ENDIAN UFSes + * in case there are ufs implementations that have strange bytesexes, + * you'll need to modify code here as well as in ufs_super.c and ufs_fs.h + * to support them. + */ + +enum { + BYTESEX_LE, + BYTESEX_BE +}; + +static inline u64 +fs64_to_cpu(struct super_block *sbp, __fs64 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return le64_to_cpu((__force __le64)n); + else + return be64_to_cpu((__force __be64)n); +} + +static inline __fs64 +cpu_to_fs64(struct super_block *sbp, u64 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return (__force __fs64)cpu_to_le64(n); + else + return (__force __fs64)cpu_to_be64(n); +} + +static inline u32 +fs32_to_cpu(struct super_block *sbp, __fs32 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return le32_to_cpu((__force __le32)n); + else + return be32_to_cpu((__force __be32)n); +} + +static inline __fs32 +cpu_to_fs32(struct super_block *sbp, u32 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return (__force __fs32)cpu_to_le32(n); + else + return (__force __fs32)cpu_to_be32(n); +} + +static inline void +fs32_add(struct super_block *sbp, __fs32 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le32_add_cpu((__le32 *)n, d); + else + be32_add_cpu((__be32 *)n, d); +} + +static inline void +fs32_sub(struct super_block *sbp, __fs32 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le32_add_cpu((__le32 *)n, -d); + else + be32_add_cpu((__be32 *)n, -d); +} + +static inline u16 +fs16_to_cpu(struct super_block *sbp, __fs16 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return le16_to_cpu((__force __le16)n); + else + return be16_to_cpu((__force __be16)n); +} + +static inline __fs16 +cpu_to_fs16(struct super_block *sbp, u16 n) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + return (__force __fs16)cpu_to_le16(n); + else + return (__force __fs16)cpu_to_be16(n); +} + +static inline void +fs16_add(struct super_block *sbp, __fs16 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le16_add_cpu((__le16 *)n, d); + else + be16_add_cpu((__be16 *)n, d); +} + +static inline void +fs16_sub(struct super_block *sbp, __fs16 *n, int d) +{ + if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) + le16_add_cpu((__le16 *)n, -d); + else + be16_add_cpu((__be16 *)n, -d); +} + +#endif /* _UFS_SWAB_H */ diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c new file mode 100644 index 00000000..d283628b --- /dev/null +++ b/fs/ufs/symlink.c @@ -0,0 +1,53 @@ +/* + * linux/fs/ufs/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/symlink.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 symlink handling code + */ + +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" + + +static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ufs_inode_info *p = UFS_I(dentry->d_inode); + nd_set_link(nd, (char*)p->i_u1.i_symlink); + return NULL; +} + +const struct inode_operations ufs_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ufs_follow_link, + .setattr = ufs_setattr, +}; + +const struct inode_operations ufs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ufs_setattr, +}; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c new file mode 100644 index 00000000..f04f89fb --- /dev/null +++ b/fs/ufs/truncate.c @@ -0,0 +1,523 @@ +/* + * linux/fs/ufs/truncate.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/truncate.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/truncate.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +/* + * Real random numbers for secure rm added 94/02/18 + * Idea from Pierre del Perugia + */ + +/* + * Adoptation to use page cache and UFS2 write support by + * Evgeniy Dushistov , 2006-2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +/* + * Secure deletion currently doesn't work. It interacts very badly + * with buffers shared with memory mappings, and for that reason + * can't be done in the truncate() routines. It should instead be + * done separately in "release()" before calling the truncate routines + * that will release the actual file blocks. + * + * Linus + */ + +#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) +#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) + + +static int ufs_trunc_direct(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + void *p; + u64 frag1, frag2, frag3, frag4, block1, block2; + unsigned frag_to_free, free_count; + unsigned i, tmp; + int retry; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag_to_free = 0; + free_count = 0; + retry = 0; + + frag1 = DIRECT_FRAGMENT; + frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + frag2 = ((frag1 & uspi->s_fpbmask) ? 
((frag1 | uspi->s_fpbmask) + 1) : frag1); + frag3 = frag4 & ~uspi->s_fpbmask; + block1 = block2 = 0; + if (frag2 > frag3) { + frag2 = frag4; + frag3 = frag4 = 0; + } else if (frag2 < frag3) { + block1 = ufs_fragstoblks (frag2); + block2 = ufs_fragstoblks (frag3); + } + + UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," + " frag3 %llu, frag4 %llu\n", inode->i_ino, + (unsigned long long)frag1, (unsigned long long)frag2, + (unsigned long long)block1, (unsigned long long)block2, + (unsigned long long)frag3, (unsigned long long)frag4); + + if (frag1 >= frag2) + goto next1; + + /* + * Free first free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic (sb, "ufs_trunc_direct", "internal error"); + frag2 -= frag1; + frag1 = ufs_fragnum (frag1); + + ufs_free_fragments(inode, tmp + frag1, frag2); + mark_inode_dirty(inode); + frag_to_free = tmp + frag1; + +next1: + /* + * Free whole blocks + */ + for (i = block1 ; i < block2; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + continue; + ufs_data_ptr_clear(uspi, p); + + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + mark_inode_dirty(inode); + } + + if (free_count > 0) + ufs_free_blocks (inode, frag_to_free, free_count); + + if (frag3 >= frag4) + goto next3; + + /* + * Free last free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic(sb, "ufs_truncate_direct", "internal error"); + frag4 = ufs_fragnum (frag4); + ufs_data_ptr_clear(uspi, p); + + ufs_free_fragments (inode, tmp, frag4); + mark_inode_dirty(inode); + next3: + + UFSD("EXIT: ino %lu\n", inode->i_ino); + return retry; +} + + +static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head * ind_ubh; + void *ind; + u64 tmp, indirect_block, i, frag_to_free; + unsigned free_count; + int retry; + + UFSD("ENTER: ino %lu, offset %llu, p: %p\n", + inode->i_ino, (unsigned long long)offset, p); + + BUG_ON(!p); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag_to_free = 0; + free_count = 0; + retry = 0; + + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + return 0; + ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { + ubh_brelse (ind_ubh); + return 1; + } + if (!ind_ubh) { + ufs_data_ptr_clear(uspi, p); + return 0; + } + + indirect_block = (DIRECT_BLOCK > offset) ? 
(DIRECT_BLOCK - offset) : 0; + for (i = indirect_block; i < uspi->s_apb; i++) { + ind = ubh_get_data_ptr(uspi, ind_ubh, i); + tmp = ufs_data_ptr_to_cpu(sb, ind); + if (!tmp) + continue; + + ufs_data_ptr_clear(uspi, ind); + ubh_mark_buffer_dirty(ind_ubh); + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + + mark_inode_dirty(inode); + } + + if (free_count > 0) { + ufs_free_blocks (inode, frag_to_free, free_count); + } + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, ind_ubh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); + + ufs_free_blocks (inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(ind_ubh); + ind_ubh = NULL; + } + if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) + ubh_sync_block(ind_ubh); + ubh_brelse (ind_ubh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); + + return retry; +} + +static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head *dind_bh; + u64 i, tmp, dindirect_block; + void *dind; + int retry = 0; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + dindirect_block = (DIRECT_BLOCK > offset) + ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; + retry = 0; + + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + return 0; + dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { + ubh_brelse (dind_bh); + return 1; + } + if (!dind_bh) { + ufs_data_ptr_clear(uspi, p); + return 0; + } + + for (i = dindirect_block ; i < uspi->s_apb ; i++) { + dind = ubh_get_data_ptr(uspi, dind_bh, i); + tmp = ufs_data_ptr_to_cpu(sb, dind); + if (!tmp) + continue; + retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); + ubh_mark_buffer_dirty(dind_bh); + } + + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, dind_bh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); + + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(dind_bh); + dind_bh = NULL; + } + if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) + ubh_sync_block(dind_bh); + ubh_brelse (dind_bh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); + + return retry; +} + +static int ufs_trunc_tindirect(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_inode_info *ufsi = UFS_I(inode); + struct ufs_buffer_head * tind_bh; + u64 tindirect_block, tmp, i; + void *tind, *p; + int retry; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + retry = 0; + + tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) + ? 
((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; + + p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); + if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) + return 0; + tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { + ubh_brelse (tind_bh); + return 1; + } + if (!tind_bh) { + ufs_data_ptr_clear(uspi, p); + return 0; + } + + for (i = tindirect_block ; i < uspi->s_apb ; i++) { + tind = ubh_get_data_ptr(uspi, tind_bh, i); + retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + + uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); + ubh_mark_buffer_dirty(tind_bh); + } + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, tind_bh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); + + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(tind_bh); + tind_bh = NULL; + } + if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) + ubh_sync_block(tind_bh); + ubh_brelse (tind_bh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); + return retry; +} + +static int ufs_alloc_lastblock(struct inode *inode) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i, end; + sector_t lastfrag; + struct page *lastpage; + struct buffer_head *bh; + u64 phys64; + + lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; + + if (!lastfrag) + goto out; + + lastfrag--; + + lastpage = ufs_get_locked_page(mapping, lastfrag >> + (PAGE_CACHE_SHIFT - inode->i_blkbits)); + if (IS_ERR(lastpage)) { + err = -EIO; + goto out; + } + + end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); + bh = page_buffers(lastpage); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; + + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); + } + + if (lastfrag >= UFS_IND_FRAGMENT) { + end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; + phys64 = bh->b_blocknr + 1; + for (i = 0; i < end; ++i) { + bh = sb_getblk(sb, i + phys64); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +out_unlock: + ufs_put_locked_page(lastpage); +out: + return err; +} + +int ufs_truncate(struct inode *inode, loff_t old_i_size) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int retry, err = 0; + + UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + inode->i_ino, (unsigned long long)i_size_read(inode), + (unsigned long long)old_i_size); + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + err = ufs_alloc_lastblock(inode); + + if (err) { + i_size_write(inode, old_i_size); + goto out; + } + + block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); + + while (1) { + retry = ufs_trunc_direct(inode); + retry |= ufs_trunc_indirect(inode, 
UFS_IND_BLOCK, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_IND_BLOCK)); + retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_DIND_BLOCK)); + retry |= ufs_trunc_tindirect (inode); + if (!retry) + break; + if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) + ufs_sync_inode (inode); + yield(); + } + + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ufsi->i_lastfrag = DIRECT_FRAGMENT; + mark_inode_dirty(inode); +out: + UFSD("EXIT: err %d\n", err); + return err; +} + +int ufs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + loff_t old_i_size = inode->i_size; + + /* XXX(truncate): truncate_setsize should be called last */ + truncate_setsize(inode, attr->ia_size); + + lock_ufs(inode->i_sb); + error = ufs_truncate(inode, old_i_size); + unlock_ufs(inode->i_sb); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations ufs_file_inode_operations = { + .setattr = ufs_setattr, +}; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h new file mode 100644 index 00000000..5be2755d --- /dev/null +++ b/fs/ufs/ufs.h @@ -0,0 +1,161 @@ +#ifndef _UFS_UFS_H +#define _UFS_UFS_H 1 + +#define UFS_MAX_GROUP_LOADED 8 +#define UFS_CGNO_EMPTY ((unsigned)-1) + +struct ufs_sb_private_info; +struct ufs_cg_private_info; +struct ufs_csum; + +struct ufs_sb_info { + struct ufs_sb_private_info * s_uspi; + struct ufs_csum * s_csp; + unsigned s_bytesex; + unsigned s_flags; + struct buffer_head ** s_ucg; + struct ufs_cg_private_info * s_ucpi[UFS_MAX_GROUP_LOADED]; + unsigned s_cgno[UFS_MAX_GROUP_LOADED]; + unsigned short s_cg_loaded; + unsigned s_mount_opt; + struct mutex mutex; + struct task_struct *mutex_owner; +}; + +struct ufs_inode_info { + union { + __fs32 i_data[15]; + __u8 i_symlink[2 * 4 * 15]; + __fs64 u2_i_data[15]; + } i_u1; + __u32 i_flags; + __u32 i_shadow; + __u32 i_unused1; + __u32 i_unused2; + __u32 i_oeftflag; + __u16 i_osync; + __u64 i_lastfrag; + __u32 i_dir_start_lookup; + struct inode vfs_inode; +}; + +/* mount options */ +#define UFS_MOUNT_ONERROR 0x0000000F +#define UFS_MOUNT_ONERROR_PANIC 0x00000001 +#define UFS_MOUNT_ONERROR_LOCK 0x00000002 +#define UFS_MOUNT_ONERROR_UMOUNT 0x00000004 +#define UFS_MOUNT_ONERROR_REPAIR 0x00000008 + +#define UFS_MOUNT_UFSTYPE 0x0000FFF0 +#define UFS_MOUNT_UFSTYPE_OLD 0x00000010 +#define UFS_MOUNT_UFSTYPE_44BSD 0x00000020 +#define UFS_MOUNT_UFSTYPE_SUN 0x00000040 +#define UFS_MOUNT_UFSTYPE_NEXTSTEP 0x00000080 +#define UFS_MOUNT_UFSTYPE_NEXTSTEP_CD 0x00000100 +#define UFS_MOUNT_UFSTYPE_OPENSTEP 0x00000200 +#define UFS_MOUNT_UFSTYPE_SUNx86 0x00000400 +#define UFS_MOUNT_UFSTYPE_HP 0x00000800 +#define UFS_MOUNT_UFSTYPE_UFS2 0x00001000 +#define UFS_MOUNT_UFSTYPE_SUNOS 0x00002000 + +#define ufs_clear_opt(o,opt) o &= ~UFS_MOUNT_##opt +#define ufs_set_opt(o,opt) o |= UFS_MOUNT_##opt +#define ufs_test_opt(o,opt) ((o) & UFS_MOUNT_##opt) + +/* + * Debug code + */ +#ifdef CONFIG_UFS_DEBUG +# define UFSD(f, a...) { \ + printk ("UFSD (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk (f, ## a); \ + } +#else +# define UFSD(f, a...) 
/**/ +#endif + +/* balloc.c */ +extern void ufs_free_fragments (struct inode *, u64, unsigned); +extern void ufs_free_blocks (struct inode *, u64, unsigned); +extern u64 ufs_new_fragments(struct inode *, void *, u64, u64, + unsigned, int *, struct page *); + +/* cylinder.c */ +extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block *, unsigned); +extern void ufs_put_cylinder (struct super_block *, unsigned); + +/* dir.c */ +extern const struct inode_operations ufs_dir_inode_operations; +extern int ufs_add_link (struct dentry *, struct inode *); +extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *); +extern int ufs_make_empty(struct inode *, struct inode *); +extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **); +extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); +extern int ufs_empty_dir (struct inode *); +extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); +extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct page *page, struct inode *inode); + +/* file.c */ +extern const struct inode_operations ufs_file_inode_operations; +extern const struct file_operations ufs_file_operations; +extern const struct address_space_operations ufs_aops; + +/* ialloc.c */ +extern void ufs_free_inode (struct inode *inode); +extern struct inode * ufs_new_inode (struct inode *, int); + +/* inode.c */ +extern struct inode *ufs_iget(struct super_block *, unsigned long); +extern int ufs_write_inode (struct inode *, struct writeback_control *); +extern int ufs_sync_inode (struct inode *); +extern void ufs_evict_inode (struct inode *); +extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); + +/* namei.c */ +extern const struct file_operations ufs_dir_operations; + +/* super.c */ +extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); + +/* symlink.c */ +extern const struct inode_operations ufs_fast_symlink_inode_operations; +extern const struct inode_operations ufs_symlink_inode_operations; + +/* truncate.c */ +extern int ufs_truncate (struct inode *, loff_t); +extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); + +static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct ufs_inode_info *UFS_I(struct inode *inode) +{ + return container_of(inode, struct ufs_inode_info, vfs_inode); +} + +/* + * Give cylinder group number for a file system block. + * Give cylinder group block number for a file system block. 
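+ * (Illustrative example, using an assumed value of s_fpg = 8192 fragments
+ *  per group: block 20000 lands in cylinder group ufs_dtog() == 2, at
+ *  offset ufs_dtogd() == 3616 within that group.)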
+ */ +/* #define ufs_dtog(d) ((d) / uspi->s_fpg) */ +static inline u64 ufs_dtog(struct ufs_sb_private_info * uspi, u64 b) +{ + do_div(b, uspi->s_fpg); + return b; +} +/* #define ufs_dtogd(d) ((d) % uspi->s_fpg) */ +static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b) +{ + return do_div(b, uspi->s_fpg); +} + +extern void lock_ufs(struct super_block *sb); +extern void unlock_ufs(struct super_block *sb); + +#endif /* _UFS_UFS_H */ diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h new file mode 100644 index 00000000..8aba544f --- /dev/null +++ b/fs/ufs/ufs_fs.h @@ -0,0 +1,959 @@ +/* + * linux/include/linux/ufs_fs.h + * + * Copyright (C) 1996 + * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) + * Laboratory for Computer Science Research Computing Facility + * Rutgers, The State University of New Jersey + * + * Clean swab support by Fare + * just hope no one is using NNUUXXI on __?64 structure elements + * 64-bit clean thanks to Maciej W. Rozycki + * + * 4.4BSD (FreeBSD) support added on February 1st 1998 by + * Niels Kristian Bech Jensen partially based + * on code by Martin von Loewis . + * + * NeXTstep support added on February 5th 1998 by + * Niels Kristian Bech Jensen . + * + * Write support by Daniel Pirkl + * + * HP/UX hfs filesystem support added by + * Martin K. Petersen , August 1999 + * + * UFS2 (of FreeBSD 5.x) support added by + * Niraj Kumar , Jan 2004 + * + */ + +#ifndef __LINUX_UFS_FS_H +#define __LINUX_UFS_FS_H + +#include +#include +#include +#include + +#include +typedef __u64 __bitwise __fs64; +typedef __u32 __bitwise __fs32; +typedef __u16 __bitwise __fs16; + +#define UFS_BBLOCK 0 +#define UFS_BBSIZE 8192 +#define UFS_SBLOCK 8192 +#define UFS_SBSIZE 8192 + +#define UFS_SECTOR_SIZE 512 +#define UFS_SECTOR_BITS 9 +#define UFS_MAGIC 0x00011954 +#define UFS_MAGIC_BW 0x0f242697 +#define UFS2_MAGIC 0x19540119 +#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ + +/* Copied from FreeBSD */ +/* + * Each disk drive contains some number of filesystems. + * A filesystem consists of a number of cylinder groups. + * Each cylinder group has inodes and data. + * + * A filesystem is described by its super-block, which in turn + * describes the cylinder groups. The super-block is critical + * data and is replicated in each cylinder group to protect against + * catastrophic loss. This is done at `newfs' time and the critical + * super-block data does not change, so the copies need not be + * referenced further unless disaster strikes. + * + * For filesystem fs, the offsets of the various blocks of interest + * are given in the super block as: + * [fs->fs_sblkno] Super-block + * [fs->fs_cblkno] Cylinder group block + * [fs->fs_iblkno] Inode blocks + * [fs->fs_dblkno] Data blocks + * The beginning of cylinder group cg in fs, is given by + * the ``cgbase(fs, cg)'' macro. + * + * Depending on the architecture and the media, the superblock may + * reside in any one of four places. For tiny media where every block + * counts, it is placed at the very front of the partition. Historically, + * UFS1 placed it 8K from the front to leave room for the disk label and + * a small bootstrap. For UFS2 it got moved to 64K from the front to leave + * room for the disk label and a bigger bootstrap, and for really piggy + * systems we check at 256K from the front if the first three fail. In + * all cases the size of the superblock will be SBLOCKSIZE. All values are + * given in byte-offset form, so they do not imply a sector size. 
The + * SBLOCKSEARCH specifies the order in which the locations should be searched. + */ +#define SBLOCK_FLOPPY 0 +#define SBLOCK_UFS1 8192 +#define SBLOCK_UFS2 65536 +#define SBLOCK_PIGGY 262144 +#define SBLOCKSIZE 8192 +#define SBLOCKSEARCH \ + { SBLOCK_UFS2, SBLOCK_UFS1, SBLOCK_FLOPPY, SBLOCK_PIGGY, -1 } + + +/* HP specific MAGIC values */ + +#define UFS_MAGIC_LFN 0x00095014 /* fs supports filenames > 14 chars */ +#define UFS_CIGAM_LFN 0x14500900 /* srahc 41 < semanelif stroppus sf */ + +#define UFS_MAGIC_SEC 0x00612195 /* B1 security fs */ +#define UFS_CIGAM_SEC 0x95216100 + +#define UFS_MAGIC_FEA 0x00195612 /* fs_featurebits supported */ +#define UFS_CIGAM_FEA 0x12561900 + +#define UFS_MAGIC_4GB 0x05231994 /* fs > 4 GB && fs_featurebits */ +#define UFS_CIGAM_4GB 0x94192305 + +/* Seems somebody at HP goofed here. B1 and lfs are both 0x2 !?! */ +#define UFS_FSF_LFN 0x00000001 /* long file names */ +#define UFS_FSF_B1 0x00000002 /* B1 security */ +#define UFS_FSF_LFS 0x00000002 /* large files */ +#define UFS_FSF_LUID 0x00000004 /* large UIDs */ + +/* End of HP stuff */ + + +#define UFS_BSIZE 8192 +#define UFS_MINBSIZE 4096 +#define UFS_FSIZE 1024 +#define UFS_MAXFRAG (UFS_BSIZE / UFS_FSIZE) + +#define UFS_NDADDR 12 +#define UFS_NINDIR 3 + +#define UFS_IND_BLOCK (UFS_NDADDR + 0) +#define UFS_DIND_BLOCK (UFS_NDADDR + 1) +#define UFS_TIND_BLOCK (UFS_NDADDR + 2) + +#define UFS_NDIR_FRAGMENT (UFS_NDADDR << uspi->s_fpbshift) +#define UFS_IND_FRAGMENT (UFS_IND_BLOCK << uspi->s_fpbshift) +#define UFS_DIND_FRAGMENT (UFS_DIND_BLOCK << uspi->s_fpbshift) +#define UFS_TIND_FRAGMENT (UFS_TIND_BLOCK << uspi->s_fpbshift) + +#define UFS_ROOTINO 2 +#define UFS_FIRST_INO (UFS_ROOTINO + 1) + +#define UFS_USEEFT ((__u16)65535) + +/* fs_clean values */ +#define UFS_FSOK 0x7c269d38 +#define UFS_FSACTIVE ((__s8)0x00) +#define UFS_FSCLEAN ((__s8)0x01) +#define UFS_FSSTABLE ((__s8)0x02) +#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */ +#define UFS_FSBAD ((__s8)0xff) + +/* Solaris-specific fs_clean values */ +#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */ +#define UFS_FSLOG ((__s8)0xfd) /* logging fs */ +#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */ + +/* From here to next blank line, s_flags for ufs_sb_info */ +/* directory entry encoding */ +#define UFS_DE_MASK 0x00000010 /* mask for the following */ +#define UFS_DE_OLD 0x00000000 +#define UFS_DE_44BSD 0x00000010 +/* uid encoding */ +#define UFS_UID_MASK 0x00000060 /* mask for the following */ +#define UFS_UID_OLD 0x00000000 +#define UFS_UID_44BSD 0x00000020 +#define UFS_UID_EFT 0x00000040 +/* superblock state encoding */ +#define UFS_ST_MASK 0x00000700 /* mask for the following */ +#define UFS_ST_OLD 0x00000000 +#define UFS_ST_44BSD 0x00000100 +#define UFS_ST_SUN 0x00000200 /* Solaris */ +#define UFS_ST_SUNOS 0x00000300 +#define UFS_ST_SUNx86 0x00000400 /* Solaris x86 */ +/*cylinder group encoding */ +#define UFS_CG_MASK 0x00003000 /* mask for the following */ +#define UFS_CG_OLD 0x00000000 +#define UFS_CG_44BSD 0x00002000 +#define UFS_CG_SUN 0x00001000 +/* filesystem type encoding */ +#define UFS_TYPE_MASK 0x00010000 /* mask for the following */ +#define UFS_TYPE_UFS1 0x00000000 +#define UFS_TYPE_UFS2 0x00010000 + + +/* fs_inodefmt options */ +#define UFS_42INODEFMT -1 +#define UFS_44INODEFMT 2 + +/* + * MINFREE gives the minimum acceptable percentage of file system + * blocks which may be free. If the freelist drops below this level + * only the superuser may continue to allocate blocks. 
This may + * be set to 0 if no reserve of free blocks is deemed necessary, + * however throughput drops by fifty percent if the file system + * is run at between 95% and 100% full; thus the minimum default + * value of fs_minfree is 5%. However, to get good clustering + * performance, 10% is a better choice. hence we use 10% as our + * default value. With 10% free space, fragmentation is not a + * problem, so we choose to optimize for time. + */ +#define UFS_MINFREE 5 +#define UFS_DEFAULTOPT UFS_OPTTIME + +/* + * Turn file system block numbers into disk block addresses. + * This maps file system blocks to device size blocks. + */ +#define ufs_fsbtodb(uspi, b) ((b) << (uspi)->s_fsbtodb) +#define ufs_dbtofsb(uspi, b) ((b) >> (uspi)->s_fsbtodb) + +/* + * Cylinder group macros to locate things in cylinder groups. + * They calc file system addresses of cylinder group data structures. + */ +#define ufs_cgbase(c) (uspi->s_fpg * (c)) +#define ufs_cgstart(c) ((uspi)->fs_magic == UFS2_MAGIC ? ufs_cgbase(c) : \ + (ufs_cgbase(c) + uspi->s_cgoffset * ((c) & ~uspi->s_cgmask))) +#define ufs_cgsblock(c) (ufs_cgstart(c) + uspi->s_sblkno) /* super blk */ +#define ufs_cgcmin(c) (ufs_cgstart(c) + uspi->s_cblkno) /* cg block */ +#define ufs_cgimin(c) (ufs_cgstart(c) + uspi->s_iblkno) /* inode blk */ +#define ufs_cgdmin(c) (ufs_cgstart(c) + uspi->s_dblkno) /* 1st data */ + +/* + * Macros for handling inode numbers: + * inode number to file system block offset. + * inode number to cylinder group number. + * inode number to file system block address. + */ +#define ufs_inotocg(x) ((x) / uspi->s_ipg) +#define ufs_inotocgoff(x) ((x) % uspi->s_ipg) +#define ufs_inotofsba(x) (((u64)ufs_cgimin(ufs_inotocg(x))) + ufs_inotocgoff(x) / uspi->s_inopf) +#define ufs_inotofsbo(x) ((x) % uspi->s_inopf) + +/* + * Compute the cylinder and rotational position of a cyl block addr. + */ +#define ufs_cbtocylno(bno) \ + ((bno) * uspi->s_nspf / uspi->s_spc) +#define ufs_cbtorpos(bno) \ + ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \ + (((((bno) * uspi->s_nspf % uspi->s_spc) % \ + uspi->s_nsect) * \ + uspi->s_nrpos) / uspi->s_nsect) \ + : \ + ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ + * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ + % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ + * uspi->s_nrpos) / uspi->s_npsect)) + +/* + * The following macros optimize certain frequently calculated + * quantities by using shifts and masks in place of divisions + * modulos and multiplications. 
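+ * (Illustration only, assuming an 8 KiB block / 1 KiB fragment layout,
+ *  i.e. s_bshift = 13, s_fshift = 10, s_fpbshift = 3: ufs_lblkno(20000)
+ *  is 2, ufs_blkoff(20000) is 3616 and ufs_fragstoblks(17) is 2.)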
+ */ +#define ufs_blkoff(loc) ((loc) & uspi->s_qbmask) +#define ufs_fragoff(loc) ((loc) & uspi->s_qfmask) +#define ufs_lblktosize(blk) ((blk) << uspi->s_bshift) +#define ufs_lblkno(loc) ((loc) >> uspi->s_bshift) +#define ufs_numfrags(loc) ((loc) >> uspi->s_fshift) +#define ufs_blkroundup(size) (((size) + uspi->s_qbmask) & uspi->s_bmask) +#define ufs_fragroundup(size) (((size) + uspi->s_qfmask) & uspi->s_fmask) +#define ufs_fragstoblks(frags) ((frags) >> uspi->s_fpbshift) +#define ufs_blkstofrags(blks) ((blks) << uspi->s_fpbshift) +#define ufs_fragnum(fsb) ((fsb) & uspi->s_fpbmask) +#define ufs_blknum(fsb) ((fsb) & ~uspi->s_fpbmask) + +#define UFS_MAXNAMLEN 255 +#define UFS_MAXMNTLEN 512 +#define UFS2_MAXMNTLEN 468 +#define UFS2_MAXVOLLEN 32 +#define UFS_MAXCSBUFS 31 +#define UFS_LINK_MAX 32000 +/* +#define UFS2_NOCSPTRS ((128 / sizeof(void *)) - 4) +*/ +#define UFS2_NOCSPTRS 28 + +/* + * UFS_DIR_PAD defines the directory entries boundaries + * (must be a multiple of 4) + */ +#define UFS_DIR_PAD 4 +#define UFS_DIR_ROUND (UFS_DIR_PAD - 1) +#define UFS_DIR_REC_LEN(name_len) (((name_len) + 1 + 8 + UFS_DIR_ROUND) & ~UFS_DIR_ROUND) + +struct ufs_timeval { + __fs32 tv_sec; + __fs32 tv_usec; +}; + +struct ufs_dir_entry { + __fs32 d_ino; /* inode number of this entry */ + __fs16 d_reclen; /* length of this entry */ + union { + __fs16 d_namlen; /* actual length of d_name */ + struct { + __u8 d_type; /* file type */ + __u8 d_namlen; /* length of string in d_name */ + } d_44; + } d_u; + __u8 d_name[UFS_MAXNAMLEN + 1]; /* file name */ +}; + +struct ufs_csum { + __fs32 cs_ndir; /* number of directories */ + __fs32 cs_nbfree; /* number of free blocks */ + __fs32 cs_nifree; /* number of free inodes */ + __fs32 cs_nffree; /* number of free frags */ +}; +struct ufs2_csum_total { + __fs64 cs_ndir; /* number of directories */ + __fs64 cs_nbfree; /* number of free blocks */ + __fs64 cs_nifree; /* number of free inodes */ + __fs64 cs_nffree; /* number of free frags */ + __fs64 cs_numclusters; /* number of free clusters */ + __fs64 cs_spare[3]; /* future expansion */ +}; + +struct ufs_csum_core { + __u64 cs_ndir; /* number of directories */ + __u64 cs_nbfree; /* number of free blocks */ + __u64 cs_nifree; /* number of free inodes */ + __u64 cs_nffree; /* number of free frags */ + __u64 cs_numclusters; /* number of free clusters */ +}; + +/* + * File system flags + */ +#define UFS_UNCLEAN 0x01 /* file system not clean at mount (unused) */ +#define UFS_DOSOFTDEP 0x02 /* file system using soft dependencies */ +#define UFS_NEEDSFSCK 0x04 /* needs sync fsck (FreeBSD compat, unused) */ +#define UFS_INDEXDIRS 0x08 /* kernel supports indexed directories */ +#define UFS_ACLS 0x10 /* file system has ACLs enabled */ +#define UFS_MULTILABEL 0x20 /* file system is MAC multi-label */ +#define UFS_FLAGS_UPDATED 0x80 /* flags have been moved to new location */ + +#if 0 +/* + * This is the actual superblock, as it is laid out on the disk. + * Do NOT use this structure, because of sizeof(ufs_super_block) > 512 and + * it may occupy several blocks, use + * struct ufs_super_block_(first,second,third) instead. 
+ */ +struct ufs_super_block { + union { + struct { + __fs32 fs_link; /* UNUSED */ + } fs_42; + struct { + __fs32 fs_state; /* file system state flag */ + } fs_sun; + } fs_u0; + __fs32 fs_rlink; /* UNUSED */ + __fs32 fs_sblkno; /* addr of super-block in filesys */ + __fs32 fs_cblkno; /* offset of cyl-block in filesys */ + __fs32 fs_iblkno; /* offset of inode-blocks in filesys */ + __fs32 fs_dblkno; /* offset of first data after cg */ + __fs32 fs_cgoffset; /* cylinder group offset in cylinder */ + __fs32 fs_cgmask; /* used to calc mod fs_ntrak */ + __fs32 fs_time; /* last time written -- time_t */ + __fs32 fs_size; /* number of blocks in fs */ + __fs32 fs_dsize; /* number of data blocks in fs */ + __fs32 fs_ncg; /* number of cylinder groups */ + __fs32 fs_bsize; /* size of basic blocks in fs */ + __fs32 fs_fsize; /* size of frag blocks in fs */ + __fs32 fs_frag; /* number of frags in a block in fs */ +/* these are configuration parameters */ + __fs32 fs_minfree; /* minimum percentage of free blocks */ + __fs32 fs_rotdelay; /* num of ms for optimal next block */ + __fs32 fs_rps; /* disk revolutions per second */ +/* these fields can be computed from the others */ + __fs32 fs_bmask; /* ``blkoff'' calc of blk offsets */ + __fs32 fs_fmask; /* ``fragoff'' calc of frag offsets */ + __fs32 fs_bshift; /* ``lblkno'' calc of logical blkno */ + __fs32 fs_fshift; /* ``numfrags'' calc number of frags */ +/* these are configuration parameters */ + __fs32 fs_maxcontig; /* max number of contiguous blks */ + __fs32 fs_maxbpg; /* max number of blks per cyl group */ +/* these fields can be computed from the others */ + __fs32 fs_fragshift; /* block to frag shift */ + __fs32 fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + __fs32 fs_sbsize; /* actual size of super block */ + __fs32 fs_csmask; /* csum block offset */ + __fs32 fs_csshift; /* csum block number */ + __fs32 fs_nindir; /* value of NINDIR */ + __fs32 fs_inopb; /* value of INOPB */ + __fs32 fs_nspf; /* value of NSPF */ +/* yet another configuration parameter */ + __fs32 fs_optim; /* optimization preference, see below */ +/* these fields are derived from the hardware */ + union { + struct { + __fs32 fs_npsect; /* # sectors/track including spares */ + } fs_sun; + struct { + __fs32 fs_state; /* file system state time stamp */ + } fs_sunx86; + } fs_u1; + __fs32 fs_interleave; /* hardware sector interleave */ + __fs32 fs_trackskew; /* sector 0 skew, per track */ +/* a unique id for this filesystem (currently unused and unmaintained) */ +/* In 4.3 Tahoe this space is used by fs_headswitch and fs_trkseek */ +/* Neither of those fields is used in the Tahoe code right now but */ +/* there could be problems if they are. 
*/ + __fs32 fs_id[2]; /* file system id */ +/* sizes determined by number of cylinder groups and their sizes */ + __fs32 fs_csaddr; /* blk addr of cyl grp summary area */ + __fs32 fs_cssize; /* size of cyl grp summary area */ + __fs32 fs_cgsize; /* cylinder group size */ +/* these fields are derived from the hardware */ + __fs32 fs_ntrak; /* tracks per cylinder */ + __fs32 fs_nsect; /* sectors per track */ + __fs32 fs_spc; /* sectors per cylinder */ +/* this comes from the disk driver partitioning */ + __fs32 fs_ncyl; /* cylinders in file system */ +/* these fields can be computed from the others */ + __fs32 fs_cpg; /* cylinders per group */ + __fs32 fs_ipg; /* inodes per cylinder group */ + __fs32 fs_fpg; /* blocks per group * fs_frag */ +/* this data must be re-computed after crashes */ + struct ufs_csum fs_cstotal; /* cylinder summary information */ +/* these fields are cleared at mount time */ + __s8 fs_fmod; /* super block modified flag */ + __s8 fs_clean; /* file system is clean flag */ + __s8 fs_ronly; /* mounted read-only flag */ + __s8 fs_flags; + union { + struct { + __s8 fs_fsmnt[UFS_MAXMNTLEN];/* name mounted on */ + __fs32 fs_cgrotor; /* last cg searched */ + __fs32 fs_csp[UFS_MAXCSBUFS];/*list of fs_cs info buffers */ + __fs32 fs_maxcluster; + __fs32 fs_cpc; /* cyl per cycle in postbl */ + __fs16 fs_opostbl[16][8]; /* old rotation block list head */ + } fs_u1; + struct { + __s8 fs_fsmnt[UFS2_MAXMNTLEN]; /* name mounted on */ + __u8 fs_volname[UFS2_MAXVOLLEN]; /* volume name */ + __fs64 fs_swuid; /* system-wide uid */ + __fs32 fs_pad; /* due to alignment of fs_swuid */ + __fs32 fs_cgrotor; /* last cg searched */ + __fs32 fs_ocsp[UFS2_NOCSPTRS]; /*list of fs_cs info buffers */ + __fs32 fs_contigdirs;/*# of contiguously allocated dirs */ + __fs32 fs_csp; /* cg summary info buffer for fs_cs */ + __fs32 fs_maxcluster; + __fs32 fs_active;/* used by snapshots to track fs */ + __fs32 fs_old_cpc; /* cyl per cycle in postbl */ + __fs32 fs_maxbsize;/*maximum blocking factor permitted */ + __fs64 fs_sparecon64[17];/*old rotation block list head */ + __fs64 fs_sblockloc; /* byte offset of standard superblock */ + struct ufs2_csum_total fs_cstotal;/*cylinder summary information*/ + struct ufs_timeval fs_time; /* last time written */ + __fs64 fs_size; /* number of blocks in fs */ + __fs64 fs_dsize; /* number of data blocks in fs */ + __fs64 fs_csaddr; /* blk addr of cyl grp summary area */ + __fs64 fs_pendingblocks;/* blocks in process of being freed */ + __fs32 fs_pendinginodes;/*inodes in process of being freed */ + } fs_u2; + } fs_u11; + union { + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_state; /* file system state time stamp */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sun; + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_npsect; /* # sectors/track including spares */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sunx86; + struct { + __fs32 fs_sparecon[50];/* reserved for future constants */ + __fs32 fs_contigsumsize;/* size of cluster summary array */ + __fs32 fs_maxsymlinklen;/* max length of an internal symlink */ + __fs32 fs_inodefmt; /* format of on-disk inodes */ + __fs32 fs_maxfilesize[2]; /* max representable file size */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + __fs32 fs_state; /* file 
system state time stamp */ + } fs_44; + } fs_u2; + __fs32 fs_postblformat; /* format of positional layout tables */ + __fs32 fs_nrpos; /* number of rotational positions */ + __fs32 fs_postbloff; /* (__s16) rotation block list head */ + __fs32 fs_rotbloff; /* (__u8) blocks for each rotation */ + __fs32 fs_magic; /* magic number */ + __u8 fs_space[1]; /* list of blocks for each rotation */ +}; +#endif/*struct ufs_super_block*/ + +/* + * Preference for optimization. + */ +#define UFS_OPTTIME 0 /* minimize allocation time */ +#define UFS_OPTSPACE 1 /* minimize disk fragmentation */ + +/* + * Rotational layout table format types + */ +#define UFS_42POSTBLFMT -1 /* 4.2BSD rotational table format */ +#define UFS_DYNAMICPOSTBLFMT 1 /* dynamic rotational table format */ + +/* + * Convert cylinder group to base address of its global summary info. + */ +#define fs_cs(indx) s_csp[(indx)] + +/* + * Cylinder group block for a file system. + * + * Writable fields in the cylinder group are protected by the associated + * super block lock fs->fs_lock. + */ +#define CG_MAGIC 0x090255 +#define ufs_cg_chkmagic(sb, ucg) \ + (fs32_to_cpu((sb), (ucg)->cg_magic) == CG_MAGIC) +/* + * Macros for access to old cylinder group array structures + */ +#define ufs_ocg_blktot(sb, ucg) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_btot) +#define ufs_ocg_blks(sb, ucg, cylno) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_b[cylno]) +#define ufs_ocg_inosused(sb, ucg) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_iused) +#define ufs_ocg_blksfree(sb, ucg) fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_free) +#define ufs_ocg_chkmagic(sb, ucg) \ + (fs32_to_cpu((sb), ((struct ufs_old_cylinder_group *)(ucg))->cg_magic) == CG_MAGIC) + +/* + * size of this structure is 172 B + */ +struct ufs_cylinder_group { + __fs32 cg_link; /* linked list of cyl groups */ + __fs32 cg_magic; /* magic number */ + __fs32 cg_time; /* time last written */ + __fs32 cg_cgx; /* we are the cgx'th cylinder group */ + __fs16 cg_ncyl; /* number of cyl's this cg */ + __fs16 cg_niblk; /* number of inode blocks this cg */ + __fs32 cg_ndblk; /* number of data blocks this cg */ + struct ufs_csum cg_cs; /* cylinder summary information */ + __fs32 cg_rotor; /* position of last used block */ + __fs32 cg_frotor; /* position of last used frag */ + __fs32 cg_irotor; /* position of last used inode */ + __fs32 cg_frsum[UFS_MAXFRAG]; /* counts of available frags */ + __fs32 cg_btotoff; /* (__u32) block totals per cylinder */ + __fs32 cg_boff; /* (short) free block positions */ + __fs32 cg_iusedoff; /* (char) used inode map */ + __fs32 cg_freeoff; /* (u_char) free block map */ + __fs32 cg_nextfreeoff; /* (u_char) next available space */ + union { + struct { + __fs32 cg_clustersumoff; /* (u_int32) counts of avail clusters */ + __fs32 cg_clusteroff; /* (u_int8) free cluster map */ + __fs32 cg_nclusterblks; /* number of clusters this cg */ + __fs32 cg_sparecon[13]; /* reserved for future use */ + } cg_44; + struct { + __fs32 cg_clustersumoff;/* (u_int32) counts of avail clusters */ + __fs32 cg_clusteroff; /* (u_int8) free cluster map */ + __fs32 cg_nclusterblks;/* number of clusters this cg */ + __fs32 cg_niblk; /* number of inode blocks this cg */ + __fs32 cg_initediblk; /* last initialized inode */ + __fs32 cg_sparecon32[3];/* reserved for future use */ + __fs64 cg_time; /* time last written */ + __fs64 cg_sparecon[3]; /* reserved for future use */ + } cg_u2; + __fs32 cg_sparecon[16]; /* reserved for future use 
*/ + } cg_u; + __u8 cg_space[1]; /* space for cylinder group maps */ +/* actually longer */ +}; + +/* Historic Cylinder group info */ +struct ufs_old_cylinder_group { + __fs32 cg_link; /* linked list of cyl groups */ + __fs32 cg_rlink; /* for incore cyl groups */ + __fs32 cg_time; /* time last written */ + __fs32 cg_cgx; /* we are the cgx'th cylinder group */ + __fs16 cg_ncyl; /* number of cyl's this cg */ + __fs16 cg_niblk; /* number of inode blocks this cg */ + __fs32 cg_ndblk; /* number of data blocks this cg */ + struct ufs_csum cg_cs; /* cylinder summary information */ + __fs32 cg_rotor; /* position of last used block */ + __fs32 cg_frotor; /* position of last used frag */ + __fs32 cg_irotor; /* position of last used inode */ + __fs32 cg_frsum[8]; /* counts of available frags */ + __fs32 cg_btot[32]; /* block totals per cylinder */ + __fs16 cg_b[32][8]; /* positions of free blocks */ + __u8 cg_iused[256]; /* used inode map */ + __fs32 cg_magic; /* magic number */ + __u8 cg_free[1]; /* free block map */ +/* actually longer */ +}; + +/* + * structure of an on-disk inode + */ +struct ufs_inode { + __fs16 ui_mode; /* 0x0 */ + __fs16 ui_nlink; /* 0x2 */ + union { + struct { + __fs16 ui_suid; /* 0x4 */ + __fs16 ui_sgid; /* 0x6 */ + } oldids; + __fs32 ui_inumber; /* 0x4 lsf: inode number */ + __fs32 ui_author; /* 0x4 GNU HURD: author */ + } ui_u1; + __fs64 ui_size; /* 0x8 */ + struct ufs_timeval ui_atime; /* 0x10 access */ + struct ufs_timeval ui_mtime; /* 0x18 modification */ + struct ufs_timeval ui_ctime; /* 0x20 creation */ + union { + struct { + __fs32 ui_db[UFS_NDADDR];/* 0x28 data blocks */ + __fs32 ui_ib[UFS_NINDIR];/* 0x58 indirect blocks */ + } ui_addr; + __u8 ui_symlink[4*(UFS_NDADDR+UFS_NINDIR)];/* 0x28 fast symlink */ + } ui_u2; + __fs32 ui_flags; /* 0x64 immutable, append-only... */ + __fs32 ui_blocks; /* 0x68 blocks in use */ + __fs32 ui_gen; /* 0x6c like ext2 i_version, for NFS support */ + union { + struct { + __fs32 ui_shadow; /* 0x70 shadow inode with security data */ + __fs32 ui_uid; /* 0x74 long EFT version of uid */ + __fs32 ui_gid; /* 0x78 long EFT version of gid */ + __fs32 ui_oeftflag; /* 0x7c reserved */ + } ui_sun; + struct { + __fs32 ui_uid; /* 0x70 File owner */ + __fs32 ui_gid; /* 0x74 File group */ + __fs32 ui_spare[2]; /* 0x78 reserved */ + } ui_44; + struct { + __fs32 ui_uid; /* 0x70 */ + __fs32 ui_gid; /* 0x74 */ + __fs16 ui_modeh; /* 0x78 mode high bits */ + __fs16 ui_spare; /* 0x7A unused */ + __fs32 ui_trans; /* 0x7c filesystem translator */ + } ui_hurd; + } ui_u3; +}; + +#define UFS_NXADDR 2 /* External addresses in inode. */ +struct ufs2_inode { + __fs16 ui_mode; /* 0: IFMT, permissions; see below. */ + __fs16 ui_nlink; /* 2: File link count. */ + __fs32 ui_uid; /* 4: File owner. */ + __fs32 ui_gid; /* 8: File group. */ + __fs32 ui_blksize; /* 12: Inode blocksize. */ + __fs64 ui_size; /* 16: File byte count. */ + __fs64 ui_blocks; /* 24: Bytes actually held. */ + __fs64 ui_atime; /* 32: Last access time. */ + __fs64 ui_mtime; /* 40: Last modified time. */ + __fs64 ui_ctime; /* 48: Last inode change time. */ + __fs64 ui_birthtime; /* 56: Inode creation time. */ + __fs32 ui_mtimensec; /* 64: Last modified time. */ + __fs32 ui_atimensec; /* 68: Last access time. */ + __fs32 ui_ctimensec; /* 72: Last inode change time. */ + __fs32 ui_birthnsec; /* 76: Inode creation time. */ + __fs32 ui_gen; /* 80: Generation number. */ + __fs32 ui_kernflags; /* 84: Kernel flags. */ + __fs32 ui_flags; /* 88: Status flags (chflags). 
*/ + __fs32 ui_extsize; /* 92: External attributes block. */ + __fs64 ui_extb[UFS_NXADDR];/* 96: External attributes block. */ + union { + struct { + __fs64 ui_db[UFS_NDADDR]; /* 112: Direct disk blocks. */ + __fs64 ui_ib[UFS_NINDIR];/* 208: Indirect disk blocks.*/ + } ui_addr; + __u8 ui_symlink[2*4*(UFS_NDADDR+UFS_NINDIR)];/* 0x28 fast symlink */ + } ui_u2; + __fs64 ui_spare[3]; /* 232: Reserved; currently unused */ +}; + + +/* FreeBSD has these in sys/stat.h */ +/* ui_flags that can be set by a file owner */ +#define UFS_UF_SETTABLE 0x0000ffff +#define UFS_UF_NODUMP 0x00000001 /* do not dump */ +#define UFS_UF_IMMUTABLE 0x00000002 /* immutable (can't "change") */ +#define UFS_UF_APPEND 0x00000004 /* append-only */ +#define UFS_UF_OPAQUE 0x00000008 /* directory is opaque (unionfs) */ +#define UFS_UF_NOUNLINK 0x00000010 /* can't be removed or renamed */ +/* ui_flags that only root can set */ +#define UFS_SF_SETTABLE 0xffff0000 +#define UFS_SF_ARCHIVED 0x00010000 /* archived */ +#define UFS_SF_IMMUTABLE 0x00020000 /* immutable (can't "change") */ +#define UFS_SF_APPEND 0x00040000 /* append-only */ +#define UFS_SF_NOUNLINK 0x00100000 /* can't be removed or renamed */ + +/* + * This structure is used for reading disk structures larger + * than the size of fragment. + */ +struct ufs_buffer_head { + __u64 fragment; /* first fragment */ + __u64 count; /* number of fragments */ + struct buffer_head * bh[UFS_MAXFRAG]; /* buffers */ +}; + +struct ufs_cg_private_info { + struct ufs_buffer_head c_ubh; + __u32 c_cgx; /* number of cylidner group */ + __u16 c_ncyl; /* number of cyl's this cg */ + __u16 c_niblk; /* number of inode blocks this cg */ + __u32 c_ndblk; /* number of data blocks this cg */ + __u32 c_rotor; /* position of last used block */ + __u32 c_frotor; /* position of last used frag */ + __u32 c_irotor; /* position of last used inode */ + __u32 c_btotoff; /* (__u32) block totals per cylinder */ + __u32 c_boff; /* (short) free block positions */ + __u32 c_iusedoff; /* (char) used inode map */ + __u32 c_freeoff; /* (u_char) free block map */ + __u32 c_nextfreeoff; /* (u_char) next available space */ + __u32 c_clustersumoff;/* (u_int32) counts of avail clusters */ + __u32 c_clusteroff; /* (u_int8) free cluster map */ + __u32 c_nclusterblks; /* number of clusters this cg */ +}; + + +struct ufs_sb_private_info { + struct ufs_buffer_head s_ubh; /* buffer containing super block */ + struct ufs_csum_core cs_total; + __u32 s_sblkno; /* offset of super-blocks in filesys */ + __u32 s_cblkno; /* offset of cg-block in filesys */ + __u32 s_iblkno; /* offset of inode-blocks in filesys */ + __u32 s_dblkno; /* offset of first data after cg */ + __u32 s_cgoffset; /* cylinder group offset in cylinder */ + __u32 s_cgmask; /* used to calc mod fs_ntrak */ + __u32 s_size; /* number of blocks (fragments) in fs */ + __u32 s_dsize; /* number of data blocks in fs */ + __u64 s_u2_size; /* ufs2: number of blocks (fragments) in fs */ + __u64 s_u2_dsize; /*ufs2: number of data blocks in fs */ + __u32 s_ncg; /* number of cylinder groups */ + __u32 s_bsize; /* size of basic blocks */ + __u32 s_fsize; /* size of fragments */ + __u32 s_fpb; /* fragments per block */ + __u32 s_minfree; /* minimum percentage of free blocks */ + __u32 s_bmask; /* `blkoff'' calc of blk offsets */ + __u32 s_fmask; /* s_fsize mask */ + __u32 s_bshift; /* `lblkno'' calc of logical blkno */ + __u32 s_fshift; /* s_fsize shift */ + __u32 s_fpbshift; /* fragments per block shift */ + __u32 s_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + __u32 
s_sbsize; /* actual size of super block */ + __u32 s_csmask; /* csum block offset */ + __u32 s_csshift; /* csum block number */ + __u32 s_nindir; /* value of NINDIR */ + __u32 s_inopb; /* value of INOPB */ + __u32 s_nspf; /* value of NSPF */ + __u32 s_npsect; /* # sectors/track including spares */ + __u32 s_interleave; /* hardware sector interleave */ + __u32 s_trackskew; /* sector 0 skew, per track */ + __u64 s_csaddr; /* blk addr of cyl grp summary area */ + __u32 s_cssize; /* size of cyl grp summary area */ + __u32 s_cgsize; /* cylinder group size */ + __u32 s_ntrak; /* tracks per cylinder */ + __u32 s_nsect; /* sectors per track */ + __u32 s_spc; /* sectors per cylinder */ + __u32 s_ipg; /* inodes per cylinder group */ + __u32 s_fpg; /* fragments per group */ + __u32 s_cpc; /* cyl per cycle in postbl */ + __s32 s_contigsumsize;/* size of cluster summary array, 44bsd */ + __s64 s_qbmask; /* ~usb_bmask */ + __s64 s_qfmask; /* ~usb_fmask */ + __s32 s_postblformat; /* format of positional layout tables */ + __s32 s_nrpos; /* number of rotational positions */ + __s32 s_postbloff; /* (__s16) rotation block list head */ + __s32 s_rotbloff; /* (__u8) blocks for each rotation */ + + __u32 s_fpbmask; /* fragments per block mask */ + __u32 s_apb; /* address per block */ + __u32 s_2apb; /* address per block^2 */ + __u32 s_3apb; /* address per block^3 */ + __u32 s_apbmask; /* address per block mask */ + __u32 s_apbshift; /* address per block shift */ + __u32 s_2apbshift; /* address per block shift * 2 */ + __u32 s_3apbshift; /* address per block shift * 3 */ + __u32 s_nspfshift; /* number of sector per fragment shift */ + __u32 s_nspb; /* number of sector per block */ + __u32 s_inopf; /* inodes per fragment */ + __u32 s_sbbase; /* offset of NeXTstep superblock */ + __u32 s_bpf; /* bits per fragment */ + __u32 s_bpfshift; /* bits per fragment shift*/ + __u32 s_bpfmask; /* bits per fragment mask */ + + __u32 s_maxsymlinklen;/* upper limit on fast symlinks' size */ + __s32 fs_magic; /* filesystem magic */ + unsigned int s_dirblksize; +}; + +/* + * Sizes of this structures are: + * ufs_super_block_first 512 + * ufs_super_block_second 512 + * ufs_super_block_third 356 + */ +struct ufs_super_block_first { + union { + struct { + __fs32 fs_link; /* UNUSED */ + } fs_42; + struct { + __fs32 fs_state; /* file system state flag */ + } fs_sun; + } fs_u0; + __fs32 fs_rlink; + __fs32 fs_sblkno; + __fs32 fs_cblkno; + __fs32 fs_iblkno; + __fs32 fs_dblkno; + __fs32 fs_cgoffset; + __fs32 fs_cgmask; + __fs32 fs_time; + __fs32 fs_size; + __fs32 fs_dsize; + __fs32 fs_ncg; + __fs32 fs_bsize; + __fs32 fs_fsize; + __fs32 fs_frag; + __fs32 fs_minfree; + __fs32 fs_rotdelay; + __fs32 fs_rps; + __fs32 fs_bmask; + __fs32 fs_fmask; + __fs32 fs_bshift; + __fs32 fs_fshift; + __fs32 fs_maxcontig; + __fs32 fs_maxbpg; + __fs32 fs_fragshift; + __fs32 fs_fsbtodb; + __fs32 fs_sbsize; + __fs32 fs_csmask; + __fs32 fs_csshift; + __fs32 fs_nindir; + __fs32 fs_inopb; + __fs32 fs_nspf; + __fs32 fs_optim; + union { + struct { + __fs32 fs_npsect; + } fs_sun; + struct { + __fs32 fs_state; + } fs_sunx86; + } fs_u1; + __fs32 fs_interleave; + __fs32 fs_trackskew; + __fs32 fs_id[2]; + __fs32 fs_csaddr; + __fs32 fs_cssize; + __fs32 fs_cgsize; + __fs32 fs_ntrak; + __fs32 fs_nsect; + __fs32 fs_spc; + __fs32 fs_ncyl; + __fs32 fs_cpg; + __fs32 fs_ipg; + __fs32 fs_fpg; + struct ufs_csum fs_cstotal; + __s8 fs_fmod; + __s8 fs_clean; + __s8 fs_ronly; + __s8 fs_flags; + __s8 fs_fsmnt[UFS_MAXMNTLEN - 212]; + +}; + +struct ufs_super_block_second { + union { + 
struct { + __s8 fs_fsmnt[212]; + __fs32 fs_cgrotor; + __fs32 fs_csp[UFS_MAXCSBUFS]; + __fs32 fs_maxcluster; + __fs32 fs_cpc; + __fs16 fs_opostbl[82]; + } fs_u1; + struct { + __s8 fs_fsmnt[UFS2_MAXMNTLEN - UFS_MAXMNTLEN + 212]; + __u8 fs_volname[UFS2_MAXVOLLEN]; + __fs64 fs_swuid; + __fs32 fs_pad; + __fs32 fs_cgrotor; + __fs32 fs_ocsp[UFS2_NOCSPTRS]; + __fs32 fs_contigdirs; + __fs32 fs_csp; + __fs32 fs_maxcluster; + __fs32 fs_active; + __fs32 fs_old_cpc; + __fs32 fs_maxbsize; + __fs64 fs_sparecon64[17]; + __fs64 fs_sblockloc; + __fs64 cs_ndir; + __fs64 cs_nbfree; + } fs_u2; + } fs_un; +}; + +struct ufs_super_block_third { + union { + struct { + __fs16 fs_opostbl[46]; + } fs_u1; + struct { + __fs64 cs_nifree; /* number of free inodes */ + __fs64 cs_nffree; /* number of free frags */ + __fs64 cs_numclusters; /* number of free clusters */ + __fs64 cs_spare[3]; /* future expansion */ + struct ufs_timeval fs_time; /* last time written */ + __fs64 fs_size; /* number of blocks in fs */ + __fs64 fs_dsize; /* number of data blocks in fs */ + __fs64 fs_csaddr; /* blk addr of cyl grp summary area */ + __fs64 fs_pendingblocks;/* blocks in process of being freed */ + __fs32 fs_pendinginodes;/*inodes in process of being freed */ + } __attribute__ ((packed)) fs_u2; + } fs_un1; + union { + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_state; /* file system state time stamp */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sun; + struct { + __fs32 fs_sparecon[53];/* reserved for future constants */ + __fs32 fs_reclaim; + __fs32 fs_sparecon2[1]; + __fs32 fs_npsect; /* # sectors/track including spares */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + } fs_sunx86; + struct { + __fs32 fs_sparecon[50];/* reserved for future constants */ + __fs32 fs_contigsumsize;/* size of cluster summary array */ + __fs32 fs_maxsymlinklen;/* max length of an internal symlink */ + __fs32 fs_inodefmt; /* format of on-disk inodes */ + __fs32 fs_maxfilesize[2]; /* max representable file size */ + __fs32 fs_qbmask[2]; /* ~usb_bmask */ + __fs32 fs_qfmask[2]; /* ~usb_fmask */ + __fs32 fs_state; /* file system state time stamp */ + } fs_44; + } fs_un2; + __fs32 fs_postblformat; + __fs32 fs_nrpos; + __fs32 fs_postbloff; + __fs32 fs_rotbloff; + __fs32 fs_magic; + __u8 fs_space[1]; +}; + +#endif /* __LINUX_UFS_FS_H */ diff --git a/fs/ufs/util.c b/fs/ufs/util.c new file mode 100644 index 00000000..95425b59 --- /dev/null +++ b/fs/ufs/util.c @@ -0,0 +1,283 @@ +/* + * linux/fs/ufs/util.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + */ + +#include +#include +#include + +#include "ufs_fs.h" +#include "ufs.h" +#include "swab.h" +#include "util.h" + +struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, + struct super_block *sb, u64 fragment, u64 size) +{ + struct ufs_buffer_head * ubh; + unsigned i, j ; + u64 count = 0; + if (size & ~uspi->s_fmask) + return NULL; + count = size >> uspi->s_fshift; + if (count > UFS_MAXFRAG) + return NULL; + ubh = (struct ufs_buffer_head *) + kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); + if (!ubh) + return NULL; + ubh->fragment = fragment; + ubh->count = count; + for (i = 0; i < count; i++) + if (!(ubh->bh[i] = sb_bread(sb, fragment + i))) + goto failed; + for (; i < UFS_MAXFRAG; i++) + ubh->bh[i] = NULL; + return ubh; +failed: + for (j = 0; j < i; j++) + brelse 
(ubh->bh[j]); + kfree(ubh); + return NULL; +} + +struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi, + struct super_block *sb, u64 fragment, u64 size) +{ + unsigned i, j; + u64 count = 0; + if (size & ~uspi->s_fmask) + return NULL; + count = size >> uspi->s_fshift; + if (count <= 0 || count > UFS_MAXFRAG) + return NULL; + USPI_UBH(uspi)->fragment = fragment; + USPI_UBH(uspi)->count = count; + for (i = 0; i < count; i++) + if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i))) + goto failed; + for (; i < UFS_MAXFRAG; i++) + USPI_UBH(uspi)->bh[i] = NULL; + return USPI_UBH(uspi); +failed: + for (j = 0; j < i; j++) + brelse (USPI_UBH(uspi)->bh[j]); + return NULL; +} + +void ubh_brelse (struct ufs_buffer_head * ubh) +{ + unsigned i; + if (!ubh) + return; + for (i = 0; i < ubh->count; i++) + brelse (ubh->bh[i]); + kfree (ubh); +} + +void ubh_brelse_uspi (struct ufs_sb_private_info * uspi) +{ + unsigned i; + if (!USPI_UBH(uspi)) + return; + for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) { + brelse (USPI_UBH(uspi)->bh[i]); + USPI_UBH(uspi)->bh[i] = NULL; + } +} + +void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh) +{ + unsigned i; + if (!ubh) + return; + for ( i = 0; i < ubh->count; i++ ) + mark_buffer_dirty (ubh->bh[i]); +} + +void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag) +{ + unsigned i; + if (!ubh) + return; + if (flag) { + for ( i = 0; i < ubh->count; i++ ) + set_buffer_uptodate (ubh->bh[i]); + } else { + for ( i = 0; i < ubh->count; i++ ) + clear_buffer_uptodate (ubh->bh[i]); + } +} + +void ubh_sync_block(struct ufs_buffer_head *ubh) +{ + if (ubh) { + unsigned i; + + for (i = 0; i < ubh->count; i++) + write_dirty_buffer(ubh->bh[i], WRITE); + + for (i = 0; i < ubh->count; i++) + wait_on_buffer(ubh->bh[i]); + } +} + +void ubh_bforget (struct ufs_buffer_head * ubh) +{ + unsigned i; + if (!ubh) + return; + for ( i = 0; i < ubh->count; i++ ) if ( ubh->bh[i] ) + bforget (ubh->bh[i]); +} + +int ubh_buffer_dirty (struct ufs_buffer_head * ubh) +{ + unsigned i; + unsigned result = 0; + if (!ubh) + return 0; + for ( i = 0; i < ubh->count; i++ ) + result |= buffer_dirty(ubh->bh[i]); + return result; +} + +void _ubh_ubhcpymem_(struct ufs_sb_private_info * uspi, + unsigned char * mem, struct ufs_buffer_head * ubh, unsigned size) +{ + unsigned len, bhno; + if (size > (ubh->count << uspi->s_fshift)) + size = ubh->count << uspi->s_fshift; + bhno = 0; + while (size) { + len = min_t(unsigned int, size, uspi->s_fsize); + memcpy (mem, ubh->bh[bhno]->b_data, len); + mem += uspi->s_fsize; + size -= len; + bhno++; + } +} + +void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size) +{ + unsigned len, bhno; + if (size > (ubh->count << uspi->s_fshift)) + size = ubh->count << uspi->s_fshift; + bhno = 0; + while (size) { + len = min_t(unsigned int, size, uspi->s_fsize); + memcpy (ubh->bh[bhno]->b_data, mem, len); + mem += uspi->s_fsize; + size -= len; + bhno++; + } +} + +dev_t +ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi) +{ + __u32 fs32; + dev_t dev; + + if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) + fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[1]); + else + fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[0]); + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNx86: + case UFS_ST_SUN: + if ((fs32 & 0xffff0000) == 0 || + (fs32 & 0xffff0000) == 0xffff0000) + dev = old_decode_dev(fs32 & 0x7fff); + else + dev = MKDEV(sysv_major(fs32), sysv_minor(fs32)); + break; 
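+	/*
+	 * On the Sun variants a value whose upper 16 bits are all zero or
+	 * all one is an old-style 16-bit dev_t and goes through
+	 * old_decode_dev(); e.g. 0x00000801 would decode as major 8,
+	 * minor 1.  Anything else is taken as a 32-bit SysV encoding.
+	 */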
+ + default: + dev = old_decode_dev(fs32); + break; + } + return dev; +} + +void +ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev) +{ + __u32 fs32; + + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNx86: + case UFS_ST_SUN: + fs32 = sysv_encode_dev(dev); + if ((fs32 & 0xffff8000) == 0) { + fs32 = old_encode_dev(dev); + } + break; + + default: + fs32 = old_encode_dev(dev); + break; + } + if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufsi->i_u1.i_data[1] = cpu_to_fs32(sb, fs32); + else + ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32); +} + +/** + * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist + * read it from disk. + * @mapping: the address_space to search + * @index: the page index + * + * Locates the desired pagecache page, if not exist we'll read it, + * locks it, increments its reference + * count and returns its address. + * + */ + +struct page *ufs_get_locked_page(struct address_space *mapping, + pgoff_t index) +{ + struct page *page; + + page = find_lock_page(mapping, index); + if (!page) { + page = read_mapping_page(mapping, index, NULL); + + if (IS_ERR(page)) { + printk(KERN_ERR "ufs_change_blocknr: " + "read_mapping_page error: ino %lu, index: %lu\n", + mapping->host->i_ino, index); + goto out; + } + + lock_page(page); + + if (unlikely(page->mapping == NULL)) { + /* Truncate got there first */ + unlock_page(page); + page_cache_release(page); + page = NULL; + goto out; + } + + if (!PageUptodate(page) || PageError(page)) { + unlock_page(page); + page_cache_release(page); + + printk(KERN_ERR "ufs_change_blocknr: " + "can not read page: ino %lu, index: %lu\n", + mapping->host->i_ino, index); + + page = ERR_PTR(-EIO); + } + } +out: + return page; +} diff --git a/fs/ufs/util.h b/fs/ufs/util.h new file mode 100644 index 00000000..95417592 --- /dev/null +++ b/fs/ufs/util.h @@ -0,0 +1,592 @@ +/* + * linux/fs/ufs/util.h + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + */ + +#include +#include +#include "swab.h" + + +/* + * some useful macros + */ +#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) + +/* + * functions used for retyping + */ +static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi) +{ + return &cpi->c_ubh; +} +static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi) +{ + return &spi->s_ubh; +} + + + +/* + * macros used for accessing structures + */ +static inline s32 +ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, + struct ufs_super_block_third *usb3) +{ + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) + return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state); + /* Fall Through to UFS_ST_SUN */ + case UFS_ST_SUN: + return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state); + case UFS_ST_SUNx86: + return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state); + case UFS_ST_44BSD: + default: + return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state); + } +} + +static inline void +ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, + struct ufs_super_block_third *usb3, s32 value) +{ + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) { + usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value); + break; + } + /* Fall Through to UFS_ST_SUN */ + case UFS_ST_SUN: + 
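+	/* Plain Sun (and later SunOS) superblocks keep fs_state in the
+	 * third superblock chunk; only the old 4.2-style SunOS layout
+	 * handled above uses the copy in the first chunk. */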
usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value); + break; + case UFS_ST_SUNx86: + usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value); + break; + case UFS_ST_44BSD: + usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value); + break; + } +} + +static inline u32 +ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1, + struct ufs_super_block_third *usb3) +{ + if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) + return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect); + else + return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect); +} + +static inline u64 +ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3) +{ + __fs64 tmp; + + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + case UFS_ST_SUN: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1]; + break; + case UFS_ST_SUNx86: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1]; + break; + case UFS_ST_44BSD: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1]; + break; + } + + return fs64_to_cpu(sb, tmp); +} + +static inline u64 +ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3) +{ + __fs64 tmp; + + switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { + case UFS_ST_SUNOS: + case UFS_ST_SUN: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1]; + break; + case UFS_ST_SUNx86: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1]; + break; + case UFS_ST_44BSD: + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1]; + break; + } + + return fs64_to_cpu(sb, tmp); +} + +static inline u16 +ufs_get_de_namlen(struct super_block *sb, struct ufs_dir_entry *de) +{ + if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) + return fs16_to_cpu(sb, de->d_u.d_namlen); + else + return de->d_u.d_44.d_namlen; /* XXX this seems wrong */ +} + +static inline void +ufs_set_de_namlen(struct super_block *sb, struct ufs_dir_entry *de, u16 value) +{ + if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) == UFS_DE_OLD) + de->d_u.d_namlen = cpu_to_fs16(sb, value); + else + de->d_u.d_44.d_namlen = value; /* XXX this seems wrong */ +} + +static inline void +ufs_set_de_type(struct super_block *sb, struct ufs_dir_entry *de, int mode) +{ + if ((UFS_SB(sb)->s_flags & UFS_DE_MASK) != UFS_DE_44BSD) + return; + + /* + * TODO turn this into a table lookup + */ + switch (mode & S_IFMT) { + case S_IFSOCK: + de->d_u.d_44.d_type = DT_SOCK; + break; + case S_IFLNK: + de->d_u.d_44.d_type = DT_LNK; + break; + case S_IFREG: + de->d_u.d_44.d_type = DT_REG; + break; + case S_IFBLK: + de->d_u.d_44.d_type = DT_BLK; + break; + case S_IFDIR: + de->d_u.d_44.d_type = DT_DIR; + break; + case S_IFCHR: + de->d_u.d_44.d_type = DT_CHR; + break; + case S_IFIFO: + de->d_u.d_44.d_type = DT_FIFO; + break; + default: + de->d_u.d_44.d_type = DT_UNKNOWN; + } +} + +static inline u32 +ufs_get_inode_uid(struct super_block *sb, struct ufs_inode *inode) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_uid); + case UFS_UID_EFT: + if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_uid); + /* Fall through */ + default: + return fs16_to_cpu(sb, 
inode->ui_u1.oldids.ui_suid); + } +} + +static inline void +ufs_set_inode_uid(struct super_block *sb, struct ufs_inode *inode, u32 value) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + inode->ui_u3.ui_44.ui_uid = cpu_to_fs32(sb, value); + inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); + break; + case UFS_UID_EFT: + inode->ui_u3.ui_sun.ui_uid = cpu_to_fs32(sb, value); + if (value > 0xFFFF) + value = 0xFFFF; + /* Fall through */ + default: + inode->ui_u1.oldids.ui_suid = cpu_to_fs16(sb, value); + break; + } +} + +static inline u32 +ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); + case UFS_UID_EFT: + if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); + /* Fall through */ + default: + return fs16_to_cpu(sb, inode->ui_u1.oldids.ui_sgid); + } +} + +static inline void +ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value) +{ + switch (UFS_SB(sb)->s_flags & UFS_UID_MASK) { + case UFS_UID_44BSD: + inode->ui_u3.ui_44.ui_gid = cpu_to_fs32(sb, value); + inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); + break; + case UFS_UID_EFT: + inode->ui_u3.ui_sun.ui_gid = cpu_to_fs32(sb, value); + if (value > 0xFFFF) + value = 0xFFFF; + /* Fall through */ + default: + inode->ui_u1.oldids.ui_sgid = cpu_to_fs16(sb, value); + break; + } +} + +extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); +extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); +extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len); + +/* + * These functions manipulate ufs buffers + */ +#define ubh_bread(sb,fragment,size) _ubh_bread_(uspi,sb,fragment,size) +extern struct ufs_buffer_head * _ubh_bread_(struct ufs_sb_private_info *, struct super_block *, u64 , u64); +extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, struct super_block *, u64, u64); +extern void ubh_brelse (struct ufs_buffer_head *); +extern void ubh_brelse_uspi (struct ufs_sb_private_info *); +extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); +extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); +extern void ubh_sync_block(struct ufs_buffer_head *); +extern void ubh_bforget (struct ufs_buffer_head *); +extern int ubh_buffer_dirty (struct ufs_buffer_head *); +#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) +extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struct ufs_buffer_head *, unsigned); +#define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size) +extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); + +/* This functions works with cache pages*/ +extern struct page *ufs_get_locked_page(struct address_space *mapping, + pgoff_t index); +static inline void ufs_put_locked_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + + +/* + * macros and inline function to get important structures from ufs_sb_private_info + */ + +static inline void *get_usb_offset(struct ufs_sb_private_info *uspi, + unsigned int offset) +{ + unsigned int index; + + index = offset >> uspi->s_fshift; + offset &= ~uspi->s_fmask; + return uspi->s_ubh.bh[index]->b_data + offset; +} + +#define ubh_get_usb_first(uspi) \ + ((struct ufs_super_block_first *)get_usb_offset((uspi), 0)) + 
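+/*
+ * get_usb_offset() maps a byte offset within the on-disk superblock to an
+ * address inside the buffers held by uspi->s_ubh: the fragment index is
+ * offset >> s_fshift, the remainder is the byte offset within that fragment.
+ * For illustration, with 1 KiB fragments UFS_SECTOR_SIZE (512) is still in
+ * bh[0] at byte 512, while 2*UFS_SECTOR_SIZE starts bh[1] at byte 0.
+ */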
+#define ubh_get_usb_second(uspi) \ + ((struct ufs_super_block_second *)get_usb_offset((uspi), UFS_SECTOR_SIZE)) + +#define ubh_get_usb_third(uspi) \ + ((struct ufs_super_block_third *)get_usb_offset((uspi), 2*UFS_SECTOR_SIZE)) + + +#define ubh_get_ucg(ubh) \ + ((struct ufs_cylinder_group *)((ubh)->bh[0]->b_data)) + + +/* + * Extract byte from ufs_buffer_head + * Extract the bits for a block from a map inside ufs_buffer_head + */ +#define ubh_get_addr8(ubh,begin) \ + ((u8*)(ubh)->bh[(begin) >> uspi->s_fshift]->b_data + \ + ((begin) & ~uspi->s_fmask)) + +#define ubh_get_addr16(ubh,begin) \ + (((__fs16*)((ubh)->bh[(begin) >> (uspi->s_fshift-1)]->b_data)) + \ + ((begin) & ((uspi->fsize>>1) - 1))) + +#define ubh_get_addr32(ubh,begin) \ + (((__fs32*)((ubh)->bh[(begin) >> (uspi->s_fshift-2)]->b_data)) + \ + ((begin) & ((uspi->s_fsize>>2) - 1))) + +#define ubh_get_addr64(ubh,begin) \ + (((__fs64*)((ubh)->bh[(begin) >> (uspi->s_fshift-3)]->b_data)) + \ + ((begin) & ((uspi->s_fsize>>3) - 1))) + +#define ubh_get_addr ubh_get_addr8 + +static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, + struct ufs_buffer_head *ubh, + u64 blk) +{ + if (uspi->fs_magic == UFS2_MAGIC) + return ubh_get_addr64(ubh, blk); + else + return ubh_get_addr32(ubh, blk); +} + +#define ubh_blkmap(ubh,begin,bit) \ + ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) + +/* + * Determine the number of available frags given a + * percentage to hold in reserve. + */ +static inline u64 +ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved) +{ + return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + + uspi->cs_total.cs_nffree - + (uspi->s_dsize * (percentreserved) / 100); +} + +/* + * Macros to access cylinder group array structures + */ +#define ubh_cg_blktot(ucpi,cylno) \ + (*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2)))) + +#define ubh_cg_blks(ucpi,cylno,rpos) \ + (*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \ + (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 )))) + +/* + * Bitmap operations + * These functions work like classical bitmap operations. + * The difference is that we don't have the whole bitmap + * in one contiguous chunk of memory, but in several buffers. + * The parameters of each function are super_block, ufs_buffer_head and + * position of the beginning of the bitmap. 
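+ *
+ * Illustrative use, assuming the usual UFS convention that a set bit in a
+ * cylinder group's free-fragment map means "free" (the c_freeoff name
+ * below just stands for the byte offset of that map):
+ *
+ *	if (ubh_isclr(UCPI_UBH(ucpi), ucpi->c_freeoff, fragno))
+ *		the fragment is still allocated;
+ *	ubh_setbit(UCPI_UBH(ucpi), ucpi->c_freeoff, fragno)
+ *		marks it free.
+ *
+ * ubh_get_addr() picks the right buffer head for each byte, so callers
+ * never see the split across buffers.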
+ */ +#define ubh_setbit(ubh,begin,bit) \ + (*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) |= (1 << ((bit) & 7))) + +#define ubh_clrbit(ubh,begin,bit) \ + (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) &= ~(1 << ((bit) & 7))) + +#define ubh_isset(ubh,begin,bit) \ + (*ubh_get_addr (ubh, (begin) + ((bit) >> 3)) & (1 << ((bit) & 7))) + +#define ubh_isclr(ubh,begin,bit) (!ubh_isset(ubh,begin,bit)) + +#define ubh_find_first_zero_bit(ubh,begin,size) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,0) + +#define ubh_find_next_zero_bit(ubh,begin,size,offset) _ubh_find_next_zero_bit_(uspi,ubh,begin,size,offset) +static inline unsigned _ubh_find_next_zero_bit_( + struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, + unsigned begin, unsigned size, unsigned offset) +{ + unsigned base, count, pos; + + size -= offset; + begin <<= 3; + offset += begin; + base = offset >> uspi->s_bpfshift; + offset &= uspi->s_bpfmask; + for (;;) { + count = min_t(unsigned int, size + offset, uspi->s_bpf); + size -= count - offset; + pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset); + if (pos < count || !size) + break; + base++; + offset = 0; + } + return (base << uspi->s_bpfshift) + pos - begin; +} + +static inline unsigned find_last_zero_bit (unsigned char * bitmap, + unsigned size, unsigned offset) +{ + unsigned bit, i; + unsigned char * mapp; + unsigned char map; + + mapp = bitmap + (size >> 3); + map = *mapp--; + bit = 1 << (size & 7); + for (i = size; i > offset; i--) { + if ((map & bit) == 0) + break; + if ((i & 7) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << 7; + } + } + return i; +} + +#define ubh_find_last_zero_bit(ubh,begin,size,offset) _ubh_find_last_zero_bit_(uspi,ubh,begin,size,offset) +static inline unsigned _ubh_find_last_zero_bit_( + struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, + unsigned begin, unsigned start, unsigned end) +{ + unsigned base, count, pos, size; + + size = start - end; + begin <<= 3; + start += begin; + base = start >> uspi->s_bpfshift; + start &= uspi->s_bpfmask; + for (;;) { + count = min_t(unsigned int, + size + (uspi->s_bpf - start), uspi->s_bpf) + - (uspi->s_bpf - start); + size -= count; + pos = find_last_zero_bit (ubh->bh[base]->b_data, + start, start - count); + if (pos > start - count || !size) + break; + base--; + start = uspi->s_bpf; + } + return (base << uspi->s_bpfshift) + pos - begin; +} + +#define ubh_isblockclear(ubh,begin,block) (!_ubh_isblockset_(uspi,ubh,begin,block)) + +#define ubh_isblockset(ubh,begin,block) _ubh_isblockset_(uspi,ubh,begin,block) +static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +{ + switch (uspi->s_fpb) { + case 8: + return (*ubh_get_addr (ubh, begin + block) == 0xff); + case 4: + return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2))); + case 2: + return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1))); + case 1: + return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07))); + } + return 0; +} + +#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block) +static inline void _ubh_clrblock_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +{ + switch (uspi->s_fpb) { + case 8: + *ubh_get_addr (ubh, begin + block) = 0x00; + return; + case 4: + *ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f << ((block & 0x01) << 2)); + return; + case 2: + *ubh_get_addr (ubh, begin + 
(block >> 2)) &= ~(0x03 << ((block & 0x03) << 1)); + return; + case 1: + *ubh_get_addr (ubh, begin + (block >> 3)) &= ~(0x01 << ((block & 0x07))); + return; + } +} + +#define ubh_setblock(ubh,begin,block) _ubh_setblock_(uspi,ubh,begin,block) +static inline void _ubh_setblock_(struct ufs_sb_private_info * uspi, + struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +{ + switch (uspi->s_fpb) { + case 8: + *ubh_get_addr(ubh, begin + block) = 0xff; + return; + case 4: + *ubh_get_addr(ubh, begin + (block >> 1)) |= (0x0f << ((block & 0x01) << 2)); + return; + case 2: + *ubh_get_addr(ubh, begin + (block >> 2)) |= (0x03 << ((block & 0x03) << 1)); + return; + case 1: + *ubh_get_addr(ubh, begin + (block >> 3)) |= (0x01 << ((block & 0x07))); + return; + } +} + +static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap, + __fs32 * fraglist, int cnt) +{ + struct ufs_sb_private_info * uspi; + unsigned fragsize, pos; + + uspi = UFS_SB(sb)->s_uspi; + + fragsize = 0; + for (pos = 0; pos < uspi->s_fpb; pos++) { + if (blockmap & (1 << pos)) { + fragsize++; + } + else if (fragsize > 0) { + fs32_add(sb, &fraglist[fragsize], cnt); + fragsize = 0; + } + } + if (fragsize > 0 && fragsize < uspi->s_fpb) + fs32_add(sb, &fraglist[fragsize], cnt); +} + +static inline void *ufs_get_direct_data_ptr(struct ufs_sb_private_info *uspi, + struct ufs_inode_info *ufsi, + unsigned blk) +{ + BUG_ON(blk > UFS_TIND_BLOCK); + return uspi->fs_magic == UFS2_MAGIC ? + (void *)&ufsi->i_u1.u2_i_data[blk] : + (void *)&ufsi->i_u1.i_data[blk]; +} + +static inline u64 ufs_data_ptr_to_cpu(struct super_block *sb, void *p) +{ + return UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC ? + fs64_to_cpu(sb, *(__fs64 *)p) : + fs32_to_cpu(sb, *(__fs32 *)p); +} + +static inline void ufs_cpu_to_data_ptr(struct super_block *sb, void *p, u64 val) +{ + if (UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC) + *(__fs64 *)p = cpu_to_fs64(sb, val); + else + *(__fs32 *)p = cpu_to_fs32(sb, val); +} + +static inline void ufs_data_ptr_clear(struct ufs_sb_private_info *uspi, + void *p) +{ + if (uspi->fs_magic == UFS2_MAGIC) + *(__fs64 *)p = 0; + else + *(__fs32 *)p = 0; +} + +static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi, + void *p) +{ + if (uspi->fs_magic == UFS2_MAGIC) + return *(__fs64 *)p == 0; + else + return *(__fs32 *)p == 0; +} diff --git a/fs/utimes.c b/fs/utimes.c new file mode 100644 index 00000000..ba653f3d --- /dev/null +++ b/fs/utimes.c @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __ARCH_WANT_SYS_UTIME + +/* + * sys_utime() can be implemented in user-level using sys_utimes(). + * Is this for backwards compatibility? If so, why not move it + * into the appropriate arch directory (for those architectures that + * need it). + */ + +/* If times==NULL, set access and modification to current time, + * must be owner or have write permission. + * Else, update from *times, must be owner or super user. + */ +SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) +{ + struct timespec tv[2]; + + if (times) { + if (get_user(tv[0].tv_sec, ×->actime) || + get_user(tv[1].tv_sec, ×->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, times ? 
tv : NULL, 0); +} + +#endif + +static bool nsec_valid(long nsec) +{ + if (nsec == UTIME_OMIT || nsec == UTIME_NOW) + return true; + + return nsec >= 0 && nsec <= 999999999; +} + +static int utimes_common(struct path *path, struct timespec *times) +{ + int error; + struct iattr newattrs; + struct inode *inode = path->dentry->d_inode; + + error = mnt_want_write(path->mnt); + if (error) + goto out; + + if (times && times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_NOW) + times = NULL; + + newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; + if (times) { + if (times[0].tv_nsec == UTIME_OMIT) + newattrs.ia_valid &= ~ATTR_ATIME; + else if (times[0].tv_nsec != UTIME_NOW) { + newattrs.ia_atime.tv_sec = times[0].tv_sec; + newattrs.ia_atime.tv_nsec = times[0].tv_nsec; + newattrs.ia_valid |= ATTR_ATIME_SET; + } + + if (times[1].tv_nsec == UTIME_OMIT) + newattrs.ia_valid &= ~ATTR_MTIME; + else if (times[1].tv_nsec != UTIME_NOW) { + newattrs.ia_mtime.tv_sec = times[1].tv_sec; + newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; + newattrs.ia_valid |= ATTR_MTIME_SET; + } + /* + * Tell inode_change_ok(), that this is an explicit time + * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET + * were used. + */ + newattrs.ia_valid |= ATTR_TIMES_SET; + } else { + /* + * If times is NULL (or both times are UTIME_NOW), + * then we need to check permissions, because + * inode_change_ok() won't do it. + */ + error = -EACCES; + if (IS_IMMUTABLE(inode)) + goto mnt_drop_write_and_out; + + if (!inode_owner_or_capable(inode)) { + error = inode_permission(inode, MAY_WRITE); + if (error) + goto mnt_drop_write_and_out; + } + } + mutex_lock(&inode->i_mutex); + error = notify_change(path->dentry, &newattrs); + mutex_unlock(&inode->i_mutex); + +mnt_drop_write_and_out: + mnt_drop_write(path->mnt); +out: + return error; +} + +/* + * do_utimes - change times on filename or file descriptor + * @dfd: open file descriptor, -1 or AT_FDCWD + * @filename: path name or NULL + * @times: new times or NULL + * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) + * + * If filename is NULL and dfd refers to an open file, then operate on + * the file. Otherwise look up filename, possibly using dfd as a + * starting point. + * + * If times==NULL, set access and modification to current time, + * must be owner or have write permission. + * Else, update from *times, must be owner or super user. + */ +long do_utimes(int dfd, const char __user *filename, struct timespec *times, + int flags) +{ + int error = -EINVAL; + + if (times && (!nsec_valid(times[0].tv_nsec) || + !nsec_valid(times[1].tv_nsec))) { + goto out; + } + + if (flags & ~AT_SYMLINK_NOFOLLOW) + goto out; + + if (filename == NULL && dfd != AT_FDCWD) { + struct file *file; + + if (flags & AT_SYMLINK_NOFOLLOW) + goto out; + + file = fget(dfd); + error = -EBADF; + if (!file) + goto out; + + error = utimes_common(&file->f_path, times); + fput(file); + } else { + struct path path; + int lookup_flags = 0; + + if (!(flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = utimes_common(&path, times); + path_put(&path); + } + +out: + return error; +} + +SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, + struct timespec __user *, utimes, int, flags) +{ + struct timespec tstimes[2]; + + if (utimes) { + if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) + return -EFAULT; + + /* Nothing to do, we must not even check the path. 
*/ + if (tstimes[0].tv_nsec == UTIME_OMIT && + tstimes[1].tv_nsec == UTIME_OMIT) + return 0; + } + + return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); +} + +SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, + struct timeval __user *, utimes) +{ + struct timeval times[2]; + struct timespec tstimes[2]; + + if (utimes) { + if (copy_from_user(×, utimes, sizeof(times))) + return -EFAULT; + + /* This test is needed to catch all invalid values. If we + would test only in do_utimes we would miss those invalid + values truncated by the multiplication with 1000. Note + that we also catch UTIME_{NOW,OMIT} here which are only + valid for utimensat. */ + if (times[0].tv_usec >= 1000000 || times[0].tv_usec < 0 || + times[1].tv_usec >= 1000000 || times[1].tv_usec < 0) + return -EINVAL; + + tstimes[0].tv_sec = times[0].tv_sec; + tstimes[0].tv_nsec = 1000 * times[0].tv_usec; + tstimes[1].tv_sec = times[1].tv_sec; + tstimes[1].tv_nsec = 1000 * times[1].tv_usec; + } + + return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); +} + +SYSCALL_DEFINE2(utimes, char __user *, filename, + struct timeval __user *, utimes) +{ + return sys_futimesat(AT_FDCWD, filename, utimes); +} diff --git a/fs/xattr.c b/fs/xattr.c new file mode 100644 index 00000000..f060663a --- /dev/null +++ b/fs/xattr.c @@ -0,0 +1,698 @@ +/* + File: fs/xattr.c + + Extended attribute handling. + + Copyright (C) 2001 by Andreas Gruenbacher + Copyright (C) 2001 SGI - Silicon Graphics, Inc + Copyright (c) 2004 Red Hat, Inc., James Morris + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Check permissions for extended attribute access. This is a bit complicated + * because different namespaces have very different rules. + */ +static int +xattr_permission(struct inode *inode, const char *name, int mask) +{ + /* + * We can never set or remove an extended attribute on a read-only + * filesystem or on an immutable / append-only inode. + */ + if (mask & MAY_WRITE) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + /* + * No restriction for security.* and system.* from the VFS. Decision + * on these is left to the underlying filesystem / security module. + */ + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return 0; + + /* + * The trusted.* namespace can only be accessed by privileged users. + */ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + if (!capable(CAP_SYS_ADMIN)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; + return 0; + } + + /* + * In the user.* namespace, only regular files and directories can have + * extended attributes. For sticky directories, only the owner and + * privileged users can write attributes. + */ + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; + if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && + (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) + return -EPERM; + } + + return inode_permission(inode, mask); +} + +/** + * __vfs_setxattr_noperm - perform setxattr operation without performing + * permission checks. 
+ * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * + * returns the result of the internal setxattr or setsecurity operations. + * + * This function requires the caller to lock the inode's i_mutex before it + * is executed. It also assumes that the caller will make the appropriate + * permission checks. + */ +int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error = -EOPNOTSUPP; + int issec = !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); + + if (issec) + inode->i_flags &= ~S_NOSEC; + if (inode->i_op->setxattr) { + error = inode->i_op->setxattr(dentry, name, value, size, flags); + if (!error) { + fsnotify_xattr(dentry); + security_inode_post_setxattr(dentry, name, value, + size, flags); + } + } else if (issec) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(inode, suffix, value, + size, flags); + if (!error) + fsnotify_xattr(dentry); + } + + return error; +} + + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + +out: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(vfs_setxattr); + +ssize_t +xattr_getsecurity(struct inode *inode, const char *name, void *value, + size_t size) +{ + void *buffer = NULL; + ssize_t len; + + if (!value || !size) { + len = security_inode_getsecurity(inode, name, &buffer, false); + goto out_noalloc; + } + + len = security_inode_getsecurity(inode, name, &buffer, true); + if (len < 0) + return len; + if (size < len) { + len = -ERANGE; + goto out; + } + memcpy(value, buffer, len); +out: + security_release_secctx(buffer, len); +out_noalloc: + return len; +} +EXPORT_SYMBOL_GPL(xattr_getsecurity); + +ssize_t +vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_READ); + if (error) + return error; + + error = security_inode_getxattr(dentry, name); + if (error) + return error; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + int ret = xattr_getsecurity(inode, suffix, value, size); + /* + * Only overwrite the return value if a security module + * is actually active. 
+ */ + if (ret == -EOPNOTSUPP) + goto nolsm; + return ret; + } +nolsm: + if (inode->i_op->getxattr) + error = inode->i_op->getxattr(dentry, name, value, size); + else + error = -EOPNOTSUPP; + + return error; +} +EXPORT_SYMBOL_GPL(vfs_getxattr); + +ssize_t +vfs_listxattr(struct dentry *d, char *list, size_t size) +{ + ssize_t error; + + error = security_inode_listxattr(d); + if (error) + return error; + error = -EOPNOTSUPP; + if (d->d_inode->i_op->listxattr) { + error = d->d_inode->i_op->listxattr(d, list, size); + } else { + error = security_inode_listsecurity(d->d_inode, list, size); + if (size && error > size) + error = -ERANGE; + } + return error; +} +EXPORT_SYMBOL_GPL(vfs_listxattr); + +int +vfs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + int error; + + if (!inode->i_op->removexattr) + return -EOPNOTSUPP; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + error = security_inode_removexattr(dentry, name); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = inode->i_op->removexattr(dentry, name); + mutex_unlock(&inode->i_mutex); + + if (!error) + fsnotify_xattr(dentry); + return error; +} +EXPORT_SYMBOL_GPL(vfs_removexattr); + + +/* + * Extended attribute SET operations + */ +static long +setxattr(struct dentry *d, const char __user *name, const void __user *value, + size_t size, int flags) +{ + int error; + void *kvalue = NULL; + char kname[XATTR_NAME_MAX + 1]; + + if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) + return -EINVAL; + + error = strncpy_from_user(kname, name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + if (size) { + if (size > XATTR_SIZE_MAX) + return -E2BIG; + kvalue = memdup_user(value, size); + if (IS_ERR(kvalue)) + return PTR_ERR(kvalue); + } + + error = vfs_setxattr(d, kname, kvalue, size, flags); + kfree(kvalue); + return error; +} + +SYSCALL_DEFINE5(setxattr, const char __user *, pathname, + const char __user *, name, const void __user *, value, + size_t, size, int, flags) +{ + struct path path; + int error; + + error = user_path(pathname, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = setxattr(path.dentry, name, value, size, flags); + mnt_drop_write(path.mnt); + } + path_put(&path); + return error; +} + +SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, + const char __user *, name, const void __user *, value, + size_t, size, int, flags) +{ + struct path path; + int error; + + error = user_lpath(pathname, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = setxattr(path.dentry, name, value, size, flags); + mnt_drop_write(path.mnt); + } + path_put(&path); + return error; +} + +SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, + const void __user *,value, size_t, size, int, flags) +{ + struct file *f; + struct dentry *dentry; + int error = -EBADF; + + f = fget(fd); + if (!f) + return error; + dentry = f->f_path.dentry; + audit_inode(NULL, dentry); + error = mnt_want_write_file(f); + if (!error) { + error = setxattr(dentry, name, value, size, flags); + mnt_drop_write(f->f_path.mnt); + } + fput(f); + return error; +} + +/* + * Extended attribute GET operations + */ +static ssize_t +getxattr(struct dentry *d, const char __user *name, void __user *value, + size_t size) +{ + ssize_t error; + void *kvalue = NULL; + char kname[XATTR_NAME_MAX + 1]; + + error = strncpy_from_user(kname, 
name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + if (size) { + if (size > XATTR_SIZE_MAX) + size = XATTR_SIZE_MAX; + kvalue = kzalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; + } + + error = vfs_getxattr(d, kname, kvalue, size); + if (error > 0) { + if (size && copy_to_user(value, kvalue, error)) + error = -EFAULT; + } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { + /* The file system tried to returned a value bigger + than XATTR_SIZE_MAX bytes. Not possible. */ + error = -E2BIG; + } + kfree(kvalue); + return error; +} + +SYSCALL_DEFINE4(getxattr, const char __user *, pathname, + const char __user *, name, void __user *, value, size_t, size) +{ + struct path path; + ssize_t error; + + error = user_path(pathname, &path); + if (error) + return error; + error = getxattr(path.dentry, name, value, size); + path_put(&path); + return error; +} + +SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, + const char __user *, name, void __user *, value, size_t, size) +{ + struct path path; + ssize_t error; + + error = user_lpath(pathname, &path); + if (error) + return error; + error = getxattr(path.dentry, name, value, size); + path_put(&path); + return error; +} + +SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, + void __user *, value, size_t, size) +{ + struct file *f; + ssize_t error = -EBADF; + + f = fget(fd); + if (!f) + return error; + audit_inode(NULL, f->f_path.dentry); + error = getxattr(f->f_path.dentry, name, value, size); + fput(f); + return error; +} + +/* + * Extended attribute LIST operations + */ +static ssize_t +listxattr(struct dentry *d, char __user *list, size_t size) +{ + ssize_t error; + char *klist = NULL; + + if (size) { + if (size > XATTR_LIST_MAX) + size = XATTR_LIST_MAX; + klist = kmalloc(size, GFP_KERNEL); + if (!klist) + return -ENOMEM; + } + + error = vfs_listxattr(d, klist, size); + if (error > 0) { + if (size && copy_to_user(list, klist, error)) + error = -EFAULT; + } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { + /* The file system tried to returned a list bigger + than XATTR_LIST_MAX bytes. Not possible. 
*/ + error = -E2BIG; + } + kfree(klist); + return error; +} + +SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, + size_t, size) +{ + struct path path; + ssize_t error; + + error = user_path(pathname, &path); + if (error) + return error; + error = listxattr(path.dentry, list, size); + path_put(&path); + return error; +} + +SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, + size_t, size) +{ + struct path path; + ssize_t error; + + error = user_lpath(pathname, &path); + if (error) + return error; + error = listxattr(path.dentry, list, size); + path_put(&path); + return error; +} + +SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) +{ + struct file *f; + ssize_t error = -EBADF; + + f = fget(fd); + if (!f) + return error; + audit_inode(NULL, f->f_path.dentry); + error = listxattr(f->f_path.dentry, list, size); + fput(f); + return error; +} + +/* + * Extended attribute REMOVE operations + */ +static long +removexattr(struct dentry *d, const char __user *name) +{ + int error; + char kname[XATTR_NAME_MAX + 1]; + + error = strncpy_from_user(kname, name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + return vfs_removexattr(d, kname); +} + +SYSCALL_DEFINE2(removexattr, const char __user *, pathname, + const char __user *, name) +{ + struct path path; + int error; + + error = user_path(pathname, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = removexattr(path.dentry, name); + mnt_drop_write(path.mnt); + } + path_put(&path); + return error; +} + +SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, + const char __user *, name) +{ + struct path path; + int error; + + error = user_lpath(pathname, &path); + if (error) + return error; + error = mnt_want_write(path.mnt); + if (!error) { + error = removexattr(path.dentry, name); + mnt_drop_write(path.mnt); + } + path_put(&path); + return error; +} + +SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) +{ + struct file *f; + struct dentry *dentry; + int error = -EBADF; + + f = fget(fd); + if (!f) + return error; + dentry = f->f_path.dentry; + audit_inode(NULL, dentry); + error = mnt_want_write_file(f); + if (!error) { + error = removexattr(dentry, name); + mnt_drop_write(f->f_path.mnt); + } + fput(f); + return error; +} + + +static const char * +strcmp_prefix(const char *a, const char *a_prefix) +{ + while (*a_prefix && *a == *a_prefix) { + a++; + a_prefix++; + } + return *a_prefix ? NULL : a; +} + +/* + * In order to implement different sets of xattr operations for each xattr + * prefix with the generic xattr API, a filesystem should create a + * null-terminated array of struct xattr_handler (one for each prefix) and + * hang a pointer to it off of the s_xattr field of the superblock. + * + * The generic_fooxattr() functions will use this list to dispatch xattr + * operations to the correct xattr_handler. + */ +#define for_each_xattr_handler(handlers, handler) \ + for ((handler) = *(handlers)++; \ + (handler) != NULL; \ + (handler) = *(handlers)++) + +/* + * Find the xattr_handler with the matching prefix. 
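+ *
+ * For example, with a handler whose ->prefix is "user." hanging off
+ * sb->s_xattr, resolving the (illustrative) name "user.mime_type"
+ * returns that handler and advances *name to the remaining suffix
+ * "mime_type".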
+ */ +static const struct xattr_handler * +xattr_resolve_name(const struct xattr_handler **handlers, const char **name) +{ + const struct xattr_handler *handler; + + if (!*name) + return NULL; + + for_each_xattr_handler(handlers, handler) { + const char *n = strcmp_prefix(*name, handler->prefix); + if (n) { + *name = n; + break; + } + } + return handler; +} + +/* + * Find the handler for the prefix and dispatch its get() operation. + */ +ssize_t +generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) +{ + const struct xattr_handler *handler; + + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); + if (!handler) + return -EOPNOTSUPP; + return handler->get(dentry, name, buffer, size, handler->flags); +} + +/* + * Combine the results of the list() operation from every xattr_handler in the + * list. + */ +ssize_t +generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; + unsigned int size = 0; + + if (!buffer) { + for_each_xattr_handler(handlers, handler) { + size += handler->list(dentry, NULL, 0, NULL, 0, + handler->flags); + } + } else { + char *buf = buffer; + + for_each_xattr_handler(handlers, handler) { + size = handler->list(dentry, buf, buffer_size, + NULL, 0, handler->flags); + if (size > buffer_size) + return -ERANGE; + buf += size; + buffer_size -= size; + } + size = buf - buffer; + } + return size; +} + +/* + * Find the handler for the prefix and dispatch its set() operation. + */ +int +generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) +{ + const struct xattr_handler *handler; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); + if (!handler) + return -EOPNOTSUPP; + return handler->set(dentry, name, value, size, flags, handler->flags); +} + +/* + * Find the handler for the prefix and dispatch its set() operation to remove + * any associated extended attribute. + */ +int +generic_removexattr(struct dentry *dentry, const char *name) +{ + const struct xattr_handler *handler; + + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); + if (!handler) + return -EOPNOTSUPP; + return handler->set(dentry, name, NULL, 0, + XATTR_REPLACE, handler->flags); +} + +EXPORT_SYMBOL(generic_getxattr); +EXPORT_SYMBOL(generic_listxattr); +EXPORT_SYMBOL(generic_setxattr); +EXPORT_SYMBOL(generic_removexattr); diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c new file mode 100644 index 00000000..8d5a506c --- /dev/null +++ b/fs/xattr_acl.c @@ -0,0 +1,98 @@ +/* + * linux/fs/xattr_acl.c + * + * Almost all from linux/fs/ext2/acl.c: + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include + + +/* + * Convert from extended attribute to in-memory representation. 
+ */ +struct posix_acl * +posix_acl_from_xattr(const void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + struct posix_acl *acl; + struct posix_acl_entry *acl_e; + + if (!value) + return NULL; + if (size < sizeof(posix_acl_xattr_header)) + return ERR_PTR(-EINVAL); + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return ERR_PTR(-EOPNOTSUPP); + + count = posix_acl_xattr_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + acl_e = acl->a_entries; + + for (end = entry + count; entry != end; acl_e++, entry++) { + acl_e->e_tag = le16_to_cpu(entry->e_tag); + acl_e->e_perm = le16_to_cpu(entry->e_perm); + + switch(acl_e->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + acl_e->e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + acl_e->e_id = le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL (posix_acl_from_xattr); + +/* + * Convert from in-memory to extended attribute representation. + */ +int +posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size) +{ + posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; + posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; + int real_size, n; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + for (n=0; n < acl->a_count; n++, ext_entry++) { + ext_entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + ext_entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + ext_entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + } + return real_size; +} +EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig new file mode 100644 index 00000000..6100ec0f --- /dev/null +++ b/fs/xfs/Kconfig @@ -0,0 +1,82 @@ +config XFS_FS + tristate "XFS filesystem support" + depends on BLOCK + select EXPORTFS + help + XFS is a high performance journaling filesystem which originated + on the SGI IRIX platform. It is completely multi-threaded, can + support large files and large filesystems, extended attributes, + variable block sizes, is extent based, and makes extensive use of + Btrees (directories, extents, free space) to aid both performance + and scalability. + + Refer to the documentation at + for complete details. This implementation is on-disk compatible + with the IRIX version of XFS. + + To compile this file system support as a module, choose M here: the + module will be called xfs. Be aware, however, that if the file + system of your root partition is compiled as a module, you'll need + to use an initial ramdisk (initrd) to boot. + +config XFS_QUOTA + bool "XFS Quota support" + depends on XFS_FS + select QUOTACTL + help + If you say Y here, you will be able to set limits for disk usage on + a per user and/or a per group basis under XFS. XFS considers quota + information as filesystem metadata and uses journaling to provide a + higher level guarantee of consistency. 
The on-disk data format for + quota is also compatible with the IRIX version of XFS, allowing a + filesystem to be migrated between Linux and IRIX without any need + for conversion. + + If unsure, say N. More comprehensive documentation can be found in + README.quota in the xfsprogs package. XFS quota can be used either + with or without the generic quota support enabled (CONFIG_QUOTA) - + they are completely independent subsystems. + +config XFS_POSIX_ACL + bool "XFS POSIX ACL support" + depends on XFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N. + +config XFS_RT + bool "XFS Realtime subvolume support" + depends on XFS_FS + help + If you say Y here you will be able to mount and use XFS filesystems + which contain a realtime subvolume. The realtime subvolume is a + separate area of disk space where only file data is stored. It was + originally designed to provide deterministic data rates suitable + for media streaming applications, but is also useful as a generic + mechanism for ensuring data and metadata/log I/Os are completely + separated. Regular file I/Os are isolated to a separate device + from all other requests, and this can be done quite transparently + to applications via the inherit-realtime directory inode flag. + + See the xfs man page in section 5 for additional information. + + If unsure, say N. + +config XFS_DEBUG + bool "XFS Debugging support (EXPERIMENTAL)" + depends on XFS_FS && EXPERIMENTAL + help + Say Y here to get an XFS build with many debugging features, + including ASSERT checks, function wrappers around macros, + and extra sanity-checking functions in various code paths. + + Note that the resulting code will be HUGE and SLOW, and probably + not useful unless you are debugging a particular problem. + + Say N unless you are an XFS developer, or you play one on TV. diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile new file mode 100644 index 00000000..284a7c89 --- /dev/null +++ b/fs/xfs/Makefile @@ -0,0 +1,111 @@ +# +# Copyright (c) 2000-2005 Silicon Graphics, Inc. +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# + +ccflags-y := -I$(src) -I$(src)/linux-2.6 +ccflags-$(CONFIG_XFS_DEBUG) += -g + +XFS_LINUX := linux-2.6 + +obj-$(CONFIG_XFS_FS) += xfs.o + +xfs-y += linux-2.6/xfs_trace.o + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ + xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm_bhv.o \ + xfs_qm.o) +xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o + +ifeq ($(CONFIG_XFS_QUOTA),y) +xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o +endif + +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o +xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o +xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o +xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o + + +xfs-y += xfs_alloc.o \ + xfs_alloc_btree.o \ + xfs_attr.o \ + xfs_attr_leaf.o \ + xfs_bit.o \ + xfs_bmap.o \ + xfs_bmap_btree.o \ + xfs_btree.o \ + xfs_buf_item.o \ + xfs_da_btree.o \ + xfs_dir2.o \ + xfs_dir2_block.o \ + xfs_dir2_data.o \ + xfs_dir2_leaf.o \ + xfs_dir2_node.o \ + xfs_dir2_sf.o \ + xfs_error.o \ + xfs_extfree_item.o \ + xfs_filestream.o \ + xfs_fsops.o \ + xfs_ialloc.o \ + xfs_ialloc_btree.o \ + xfs_iget.o \ + xfs_inode.o \ + xfs_inode_item.o \ + xfs_iomap.o \ + xfs_itable.o \ + xfs_dfrag.o \ + xfs_log.o \ + xfs_log_cil.o \ + xfs_log_recover.o \ + xfs_mount.o \ + xfs_mru_cache.o \ + xfs_rename.o \ + xfs_trans.o \ + xfs_trans_ail.o \ + xfs_trans_buf.o \ + xfs_trans_extfree.o \ + xfs_trans_inode.o \ + xfs_utils.o \ + xfs_vnodeops.o \ + xfs_rw.o + +xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o + +# Objects in linux/ +xfs-y += $(addprefix $(XFS_LINUX)/, \ + kmem.o \ + xfs_aops.o \ + xfs_buf.o \ + xfs_discard.o \ + xfs_export.o \ + xfs_file.o \ + xfs_fs_subr.o \ + xfs_globals.o \ + xfs_ioctl.o \ + xfs_iops.o \ + xfs_message.o \ + xfs_super.o \ + xfs_sync.o \ + xfs_xattr.o) + +# Objects in support/ +xfs-y += support/uuid.o diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c new file mode 100644 index 00000000..a907de56 --- /dev/null +++ b/fs/xfs/linux-2.6/kmem.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include "time.h" +#include "kmem.h" +#include "xfs_message.h" + +/* + * Greedy allocation. May fail and may return vmalloced memory. + * + * Must be freed using kmem_free_large. 
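+ *
+ * Illustrative use (the sizes are made up): ask for up to sixteen pages
+ * but accept as little as one:
+ *
+ *	size_t size;
+ *	void *buf = kmem_zalloc_greedy(&size, PAGE_SIZE, 16 * PAGE_SIZE);
+ *
+ * On success *size holds the amount actually obtained; buf is released
+ * later with kmem_free_large(buf).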
+ */ +void * +kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) +{ + void *ptr; + size_t kmsize = maxsize; + + while (!(ptr = kmem_zalloc_large(kmsize))) { + if ((kmsize >>= 1) <= minsize) + kmsize = minsize; + } + if (ptr) + *size = kmsize; + return ptr; +} + +void * +kmem_alloc(size_t size, unsigned int __nocast flags) +{ + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmalloc(size, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); +} + +void * +kmem_zalloc(size_t size, unsigned int __nocast flags) +{ + void *ptr; + + ptr = kmem_alloc(size, flags); + if (ptr) + memset((char *)ptr, 0, (int)size); + return ptr; +} + +void +kmem_free(const void *ptr) +{ + if (!is_vmalloc_addr(ptr)) { + kfree(ptr); + } else { + vfree(ptr); + } +} + +void * +kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, + unsigned int __nocast flags) +{ + void *new; + + new = kmem_alloc(newsize, flags); + if (ptr) { + if (new) + memcpy(new, ptr, + ((oldsize < newsize) ? oldsize : newsize)); + kmem_free(ptr); + } + return new; +} + +void * +kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) +{ + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmem_cache_alloc(zone, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); +} + +void * +kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags) +{ + void *ptr; + + ptr = kmem_zone_alloc(zone, flags); + if (ptr) + memset((char *)ptr, 0, kmem_cache_size(zone)); + return ptr; +} diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h new file mode 100644 index 00000000..f7c8f7a9 --- /dev/null +++ b/fs/xfs/linux-2.6/kmem.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_KMEM_H__ +#define __XFS_SUPPORT_KMEM_H__ + +#include +#include +#include +#include + +/* + * General memory allocation interfaces + */ + +#define KM_SLEEP 0x0001u +#define KM_NOSLEEP 0x0002u +#define KM_NOFS 0x0004u +#define KM_MAYFAIL 0x0008u + +/* + * We use a special process flag to avoid recursive callbacks into + * the filesystem during transactions. We will also issue our own + * warnings, so we explicitly skip any generic ones (silly of us). 
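+ *
+ * In short (see kmem_flags_convert() below): KM_NOSLEEP maps to
+ * GFP_ATOMIC, everything else to GFP_KERNEL; __GFP_NOWARN is always set;
+ * and __GFP_FS is cleared for KM_NOFS allocations or when the task is
+ * already inside a transaction (PF_FSTRANS).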
+ */ +static inline gfp_t +kmem_flags_convert(unsigned int __nocast flags) +{ + gfp_t lflags; + + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); + + if (flags & KM_NOSLEEP) { + lflags = GFP_ATOMIC | __GFP_NOWARN; + } else { + lflags = GFP_KERNEL | __GFP_NOWARN; + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + lflags &= ~__GFP_FS; + } + return lflags; +} + +extern void *kmem_alloc(size_t, unsigned int __nocast); +extern void *kmem_zalloc(size_t, unsigned int __nocast); +extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); +extern void kmem_free(const void *); + +static inline void *kmem_zalloc_large(size_t size) +{ + void *ptr; + + ptr = vmalloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; +} +static inline void kmem_free_large(void *ptr) +{ + vfree(ptr); +} + +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); + +/* + * Zone interfaces + */ + +#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN +#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT +#define KM_ZONE_SPREAD SLAB_MEM_SPREAD + +#define kmem_zone kmem_cache +#define kmem_zone_t struct kmem_cache + +static inline kmem_zone_t * +kmem_zone_init(int size, char *zone_name) +{ + return kmem_cache_create(zone_name, size, 0, 0, NULL); +} + +static inline kmem_zone_t * +kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, + void (*construct)(void *)) +{ + return kmem_cache_create(zone_name, size, 0, flags, construct); +} + +static inline void +kmem_zone_free(kmem_zone_t *zone, void *ptr) +{ + kmem_cache_free(zone, ptr); +} + +static inline void +kmem_zone_destroy(kmem_zone_t *zone) +{ + if (zone) + kmem_cache_destroy(zone); +} + +extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); +extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); + +static inline int +kmem_shake_allow(gfp_t gfp_mask) +{ + return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); +} + +#endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h new file mode 100644 index 00000000..ff6a1987 --- /dev/null +++ b/fs/xfs/linux-2.6/mrlock.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_MRLOCK_H__ +#define __XFS_SUPPORT_MRLOCK_H__ + +#include + +typedef struct { + struct rw_semaphore mr_lock; +#ifdef DEBUG + int mr_writer; +#endif +} mrlock_t; + +#ifdef DEBUG +#define mrinit(mrp, name) \ + do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) +#else +#define mrinit(mrp, name) \ + do { init_rwsem(&(mrp)->mr_lock); } while (0) +#endif + +#define mrlock_init(mrp, t,n,s) mrinit(mrp, n) +#define mrfree(mrp) do { } while (0) + +static inline void mraccess_nested(mrlock_t *mrp, int subclass) +{ + down_read_nested(&mrp->mr_lock, subclass); +} + +static inline void mrupdate_nested(mrlock_t *mrp, int subclass) +{ + down_write_nested(&mrp->mr_lock, subclass); +#ifdef DEBUG + mrp->mr_writer = 1; +#endif +} + +static inline int mrtryaccess(mrlock_t *mrp) +{ + return down_read_trylock(&mrp->mr_lock); +} + +static inline int mrtryupdate(mrlock_t *mrp) +{ + if (!down_write_trylock(&mrp->mr_lock)) + return 0; +#ifdef DEBUG + mrp->mr_writer = 1; +#endif + return 1; +} + +static inline void mrunlock_excl(mrlock_t *mrp) +{ +#ifdef DEBUG + mrp->mr_writer = 0; +#endif + up_write(&mrp->mr_lock); +} + +static inline void mrunlock_shared(mrlock_t *mrp) +{ + up_read(&mrp->mr_lock); +} + +static inline void mrdemote(mrlock_t *mrp) +{ +#ifdef DEBUG + mrp->mr_writer = 0; +#endif + downgrade_write(&mrp->mr_lock); +} + +#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/linux-2.6/time.h new file mode 100644 index 00000000..387e695a --- /dev/null +++ b/fs/xfs/linux-2.6/time.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_TIME_H__ +#define __XFS_SUPPORT_TIME_H__ + +#include +#include + +typedef struct timespec timespec_t; + +static inline void delay(long ticks) +{ + schedule_timeout_uninterruptible(ticks); +} + +static inline void nanotime(struct timespec *tvp) +{ + *tvp = CURRENT_TIME; +} + +#endif /* __XFS_SUPPORT_TIME_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c new file mode 100644 index 00000000..f86e0348 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_acl.h" +#include "xfs_attr.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" +#include +#include +#include + + +/* + * Locking scheme: + * - all ACL updates are protected by inode->i_mutex, which is taken before + * calling into this file. + */ + +STATIC struct posix_acl * +xfs_acl_from_disk(struct xfs_acl *aclp) +{ + struct posix_acl_entry *acl_e; + struct posix_acl *acl; + struct xfs_acl_entry *ace; + unsigned int count, i; + + count = be32_to_cpu(aclp->acl_cnt); + if (count > XFS_ACL_MAX_ENTRIES) + return ERR_PTR(-EFSCORRUPTED); + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + acl_e = &acl->a_entries[i]; + ace = &aclp->acl_entry[i]; + + /* + * The tag is 32 bits on disk and 16 bits in core. + * + * Because every access to it goes through the core + * format first this is not a problem. + */ + acl_e->e_tag = be32_to_cpu(ace->ae_tag); + acl_e->e_perm = be16_to_cpu(ace->ae_perm); + + switch (acl_e->e_tag) { + case ACL_USER: + case ACL_GROUP: + acl_e->e_id = be32_to_cpu(ace->ae_id); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + acl_e->e_id = ACL_UNDEFINED_ID; + break; + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +STATIC void +xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) +{ + const struct posix_acl_entry *acl_e; + struct xfs_acl_entry *ace; + int i; + + aclp->acl_cnt = cpu_to_be32(acl->a_count); + for (i = 0; i < acl->a_count; i++) { + ace = &aclp->acl_entry[i]; + acl_e = &acl->a_entries[i]; + + ace->ae_tag = cpu_to_be32(acl_e->e_tag); + ace->ae_id = cpu_to_be32(acl_e->e_id); + ace->ae_perm = cpu_to_be16(acl_e->e_perm); + } +} + +struct posix_acl * +xfs_get_acl(struct inode *inode, int type) +{ + struct xfs_inode *ip = XFS_I(inode); + struct posix_acl *acl; + struct xfs_acl *xfs_acl; + int len = sizeof(struct xfs_acl); + unsigned char *ea_name; + int error; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + ea_name = SGI_ACL_DEFAULT; + break; + default: + BUG(); + } + + /* + * If we have a cached ACLs value just return it, not need to + * go out to the disk. + */ + + xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + if (!xfs_acl) + return ERR_PTR(-ENOMEM); + + error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, + &len, ATTR_ROOT); + if (error) { + /* + * If the attribute doesn't exist make sure we have a negative + * cache entry, for any other error assume it is transient and + * leave the cache entry as ACL_NOT_CACHED. 
+ */ + if (error == -ENOATTR) { + acl = NULL; + goto out_update_cache; + } + goto out; + } + + acl = xfs_acl_from_disk(xfs_acl); + if (IS_ERR(acl)) + goto out; + + out_update_cache: + set_cached_acl(inode, type, acl); + out: + kfree(xfs_acl); + return acl; +} + +STATIC int +xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct xfs_inode *ip = XFS_I(inode); + unsigned char *ea_name; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + ea_name = SGI_ACL_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + struct xfs_acl *xfs_acl; + int len; + + xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + if (!xfs_acl) + return -ENOMEM; + + xfs_acl_to_disk(xfs_acl, acl); + len = sizeof(struct xfs_acl) - + (sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES - acl->a_count)); + + error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, + len, ATTR_ROOT); + + kfree(xfs_acl); + } else { + /* + * A NULL ACL argument means we want to remove the ACL. + */ + error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT); + + /* + * If the attribute didn't exist to start with that's fine. + */ + if (error == -ENOATTR) + error = 0; + } + + if (!error) + set_cached_acl(inode, type, acl); + return error; +} + +int +xfs_check_acl(struct inode *inode, int mask, unsigned int flags) +{ + struct xfs_inode *ip; + struct posix_acl *acl; + int error = -EAGAIN; + + ip = XFS_I(inode); + trace_xfs_check_acl(ip); + + /* + * If there is no attribute fork no ACL exists on this inode and + * we can skip the whole exercise. + */ + if (!XFS_IFORK_Q(ip)) + return -EAGAIN; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + + acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + + return error; +} + +static int +xfs_set_mode(struct inode *inode, mode_t mode) +{ + int error = 0; + + if (mode != inode->i_mode) { + struct iattr iattr; + + iattr.ia_valid = ATTR_MODE | ATTR_CTIME; + iattr.ia_mode = mode; + iattr.ia_ctime = current_fs_time(inode->i_sb); + + error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); + } + + return error; +} + +static int +xfs_acl_exists(struct inode *inode, unsigned char *name) +{ + int len = sizeof(struct xfs_acl); + + return (xfs_attr_get(XFS_I(inode), name, NULL, &len, + ATTR_ROOT|ATTR_KERNOVAL) == 0); +} + +int +posix_acl_access_exists(struct inode *inode) +{ + return xfs_acl_exists(inode, SGI_ACL_FILE); +} + +int +posix_acl_default_exists(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode)) + return 0; + return xfs_acl_exists(inode, SGI_ACL_DEFAULT); +} + +/* + * No need for i_mutex because the inode is not yet exposed to the VFS. 
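+ *
+ * The create-time inheritance below follows the usual POSIX ACL pattern:
+ * store the inherited default ACL on a new directory, mask a copy of it
+ * against the new inode's mode (posix_acl_create_masq), write the
+ * adjusted mode back, and store an access ACL only if permissions remain
+ * that the mode bits cannot express.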
+ */ +int +xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl) +{ + struct posix_acl *clone; + mode_t mode; + int error = 0, inherit = 0; + + if (S_ISDIR(inode->i_mode)) { + error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl); + if (error) + return error; + } + + clone = posix_acl_clone(default_acl, GFP_KERNEL); + if (!clone) + return -ENOMEM; + + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error < 0) + goto out_release_clone; + + /* + * If posix_acl_create_masq returns a positive value we need to + * inherit a permission that can't be represented using the Unix + * mode bits and we actually need to set an ACL. + */ + if (error > 0) + inherit = 1; + + error = xfs_set_mode(inode, mode); + if (error) + goto out_release_clone; + + if (inherit) + error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + + out_release_clone: + posix_acl_release(clone); + return error; +} + +int +xfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) + error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + + posix_acl_release(clone); + return error; +} + +static int +xfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int error; + + acl = xfs_get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return error; +} + +static int +xfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int error = 0; + + if (flags & XATTR_CREATE) + return -EINVAL; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (!value) + goto set_acl; + + acl = posix_acl_from_xattr(value, size); + if (!acl) { + /* + * acl_set_file(3) may request that we set default ACLs with + * zero length -- defend (gracefully) against that here. 
+ */ + goto out; + } + if (IS_ERR(acl)) { + error = PTR_ERR(acl); + goto out; + } + + error = posix_acl_valid(acl); + if (error) + goto out_release; + + error = -EINVAL; + if (acl->a_count > XFS_ACL_MAX_ENTRIES) + goto out_release; + + if (type == ACL_TYPE_ACCESS) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + + if (error <= 0) { + posix_acl_release(acl); + acl = NULL; + + if (error < 0) + return error; + } + + error = xfs_set_mode(inode, mode); + if (error) + goto out_release; + } + + set_acl: + error = xfs_set_acl(inode, type, acl); + out_release: + posix_acl_release(acl); + out: + return error; +} + +const struct xattr_handler xfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, +}; + +const struct xattr_handler xfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, +}; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c new file mode 100644 index 00000000..79ce38be --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -0,0 +1,1506 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_trans.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_iomap.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include +#include +#include +#include + + +/* + * Prime number of hash buckets since address is used as the key. 
+ */ +#define NVSYNC 37 +#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) +static wait_queue_head_t xfs_ioend_wq[NVSYNC]; + +void __init +xfs_ioend_init(void) +{ + int i; + + for (i = 0; i < NVSYNC; i++) + init_waitqueue_head(&xfs_ioend_wq[i]); +} + +void +xfs_ioend_wait( + xfs_inode_t *ip) +{ + wait_queue_head_t *wq = to_ioend_wq(ip); + + wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); +} + +STATIC void +xfs_ioend_wake( + xfs_inode_t *ip) +{ + if (atomic_dec_and_test(&ip->i_iocount)) + wake_up(to_ioend_wq(ip)); +} + +void +xfs_count_page_state( + struct page *page, + int *delalloc, + int *unwritten) +{ + struct buffer_head *bh, *head; + + *delalloc = *unwritten = 0; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) + (*unwritten) = 1; + else if (buffer_delay(bh)) + (*delalloc) = 1; + } while ((bh = bh->b_this_page) != head); +} + +STATIC struct block_device * +xfs_find_bdev_for_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) + return mp->m_rtdev_targp->bt_bdev; + else + return mp->m_ddev_targp->bt_bdev; +} + +/* + * We're now finished for good with this ioend structure. + * Update the page state via the associated buffer_heads, + * release holds on the inode and bio, and finally free + * up memory. Do not use the ioend after this. + */ +STATIC void +xfs_destroy_ioend( + xfs_ioend_t *ioend) +{ + struct buffer_head *bh, *next; + struct xfs_inode *ip = XFS_I(ioend->io_inode); + + for (bh = ioend->io_buffer_head; bh; bh = next) { + next = bh->b_private; + bh->b_end_io(bh, !ioend->io_error); + } + + /* + * Volume managers supporting multiple paths can send back ENODEV + * when the final path disappears. In this case continuing to fill + * the page cache with dirty data which cannot be written out is + * evil, so prevent that. + */ + if (unlikely(ioend->io_error == -ENODEV)) { + xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, + __FILE__, __LINE__); + } + + xfs_ioend_wake(ip); + mempool_free(ioend, xfs_ioend_pool); +} + +/* + * If the end of the current ioend is beyond the current EOF, + * return the new EOF value, otherwise zero. + */ +STATIC xfs_fsize_t +xfs_ioend_new_eof( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + xfs_fsize_t bsize; + + bsize = ioend->io_offset + ioend->io_size; + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + return isize > ip->i_d.di_size ? isize : 0; +} + +/* + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. + * + * This function does not block as blocking on the inode lock in IO completion + * can lead to IO completion order dependency deadlocks.. If it can't get the + * inode ilock it will return EAGAIN. Callers must handle this. 
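A minimal user-space sketch of the wait-queue hashing used by to_ioend_wq() above: the modulo-37 bucket choice is taken from the patch, while the slab-like array and its 512-byte stride are invented for illustration. Because 37 is prime, objects allocated at a regular stride still land in distinct buckets, whereas a power-of-two bucket count such as 32 would put these all in the same one.

/* toy model of to_ioend_wq(): hash an object's address into one of 37 buckets */
#include <stdio.h>

#define NVSYNC 37   /* prime, as in the patch, so regularly spaced addresses spread out */

static unsigned int bucket(const void *p)
{
    return (unsigned int)((unsigned long)p % NVSYNC);
}

int main(void)
{
    /* pretend these are inodes allocated from the same slab, 512 bytes apart */
    static char slab[8][512];
    int i;

    for (i = 0; i < 8; i++)
        printf("object %d -> wait-queue bucket %u\n", i, bucket(&slab[i]));
    return 0;
}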
+ */ +STATIC int +xfs_setfilesize( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + + if (unlikely(ioend->io_error)) + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return EAGAIN; + + isize = xfs_ioend_new_eof(ioend); + if (isize) { + ip->i_d.di_size = isize; + xfs_mark_inode_dirty(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Schedule IO completion handling on the final put of an ioend. + */ +STATIC void +xfs_finish_ioend( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + if (ioend->io_type == IO_UNWRITTEN) + queue_work(xfsconvertd_workqueue, &ioend->io_work); + else + queue_work(xfsdatad_workqueue, &ioend->io_work); + } +} + +/* + * IO write completion. + */ +STATIC void +xfs_end_io( + struct work_struct *work) +{ + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; + + /* + * For unwritten extents we need to issue transactions to convert a + * range to normal written extents after the data I/O has finished. + */ + if (ioend->io_type == IO_UNWRITTEN && + likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { + + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + if (error) + ioend->io_error = error; + } + + /* + * We might have to update the on-disk file size after extending + * writes. + */ + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); + + /* + * If we didn't complete processing of the ioend, requeue it to the + * tail of the workqueue for another attempt later. Otherwise destroy + * it. + */ + if (error == EAGAIN) { + atomic_inc(&ioend->io_remaining); + xfs_finish_ioend(ioend); + /* ensure we don't spin on blocked ioends */ + delay(1); + } else { + if (ioend->io_iocb) + aio_complete(ioend->io_iocb, ioend->io_result, 0); + xfs_destroy_ioend(ioend); + } +} + +/* + * Call IO completion handling in caller context on the final put of an ioend. + */ +STATIC void +xfs_finish_ioend_sync( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) + xfs_end_io(&ioend->io_work); +} + +/* + * Allocate and initialise an IO completion structure. + * We need to track unwritten extent write completion here initially. + * We'll need to extend this for updating the ondisk inode size later + * (vs. incore size). + */ +STATIC xfs_ioend_t * +xfs_alloc_ioend( + struct inode *inode, + unsigned int type) +{ + xfs_ioend_t *ioend; + + ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); + + /* + * Set the count to 1 initially, which will prevent the I/O + * completion callback from running before we have started + * all the I/O, i.e. from calling the completion routine too early.
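A single-threaded toy model of this io_remaining scheme (the real code uses atomic_t and atomic_dec_and_test(); the struct and names below are invented): the count starts at 1 so that a bio completing immediately cannot drive it to zero while further bios for the same ioend are still being built.

/* toy model of the io_remaining scheme: hold an extra reference while bios
 * are being built so the ioend cannot complete before submission finishes. */
#include <stdio.h>

struct ioend { int remaining; int done; };

static void finish(struct ioend *io)        /* xfs_finish_ioend() analogue */
{
    if (--io->remaining == 0) {
        io->done = 1;
        printf("ioend completed\n");
    }
}

int main(void)
{
    struct ioend io = { .remaining = 1 };    /* initial hold, as in xfs_alloc_ioend() */
    int i;

    for (i = 0; i < 3; i++) {
        io.remaining++;                      /* one reference per bio submitted */
        finish(&io);                         /* bio completes, possibly immediately */
        printf("after bio %d: remaining=%d done=%d\n", i, io.remaining, io.done);
    }
    finish(&io);    /* drop the initial hold once everything is submitted */
    return 0;
}

Without the initial bias the first completion would reach zero and run the completion handler while later bios for the same ioend were still being assembled.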
+ */ + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_list = NULL; + ioend->io_type = type; + ioend->io_inode = inode; + ioend->io_buffer_head = NULL; + ioend->io_buffer_tail = NULL; + atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); + ioend->io_offset = 0; + ioend->io_size = 0; + ioend->io_iocb = NULL; + ioend->io_result = 0; + + INIT_WORK(&ioend->io_work, xfs_end_io); + return ioend; +} + +STATIC int +xfs_map_blocks( + struct inode *inode, + loff_t offset, + struct xfs_bmbt_irec *imap, + int type, + int nonblocking) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t count = 1 << inode->i_blkbits; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (type == IO_UNWRITTEN) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (nonblocking) + return -XFS_ERROR(EAGAIN); + xfs_ilock(ip, XFS_ILOCK_SHARED); + } + + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + (ip->i_df.if_flags & XFS_IFEXTENTS)); + ASSERT(offset <= mp->m_maxioffset); + + if (offset + count > mp->m_maxioffset) + count = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, + bmapi_flags, NULL, 0, imap, &nimaps, NULL); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (error) + return -XFS_ERROR(error); + + if (type == IO_DELALLOC && + (!nimaps || isnullstartblock(imap->br_startblock))) { + error = xfs_iomap_write_allocate(ip, offset, count, imap); + if (!error) + trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); + return -XFS_ERROR(error); + } + +#ifdef DEBUG + if (type == IO_UNWRITTEN) { + ASSERT(nimaps); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + } +#endif + if (nimaps) + trace_xfs_map_blocks_found(ip, offset, count, type, imap); + return 0; +} + +STATIC int +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; +} + +/* + * BIO completion handler for buffered IO. + */ +STATIC void +xfs_end_bio( + struct bio *bio, + int error) +{ + xfs_ioend_t *ioend = bio->bi_private; + + ASSERT(atomic_read(&bio->bi_cnt) >= 1); + ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; + + /* Toss bio and pass work off to an xfsdatad thread */ + bio->bi_private = NULL; + bio->bi_end_io = NULL; + bio_put(bio); + + xfs_finish_ioend(ioend); +} + +STATIC void +xfs_submit_ioend_bio( + struct writeback_control *wbc, + xfs_ioend_t *ioend, + struct bio *bio) +{ + atomic_inc(&ioend->io_remaining); + bio->bi_private = ioend; + bio->bi_end_io = xfs_end_bio; + + /* + * If the I/O is beyond EOF we mark the inode dirty immediately + * but don't update the inode size until I/O completion. + */ + if (xfs_ioend_new_eof(ioend)) + xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); + + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE, bio); +} + +STATIC struct bio * +xfs_alloc_ioend_bio( + struct buffer_head *bh) +{ + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio = bio_alloc(GFP_NOIO, nvecs); + + ASSERT(bio->bi_private == NULL); + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + return bio; +} + +STATIC void +xfs_start_buffer_writeback( + struct buffer_head *bh) +{ + ASSERT(buffer_mapped(bh)); + ASSERT(buffer_locked(bh)); + ASSERT(!buffer_delay(bh)); + ASSERT(!buffer_unwritten(bh)); + + mark_buffer_async_write(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); +} + +STATIC void +xfs_start_page_writeback( + struct page *page, + int clear_dirty, + int buffers) +{ + ASSERT(PageLocked(page)); + ASSERT(!PageWriteback(page)); + if (clear_dirty) + clear_page_dirty_for_io(page); + set_page_writeback(page); + unlock_page(page); + /* If no buffers on the page are to be written, finish it here */ + if (!buffers) + end_page_writeback(page); +} + +static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) +{ + return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); +} + +/* + * Submit all of the bios for all of the ioends we have saved up, covering the + * initial writepage page and also any probed pages. + * + * Because we may have multiple ioends spanning a page, we need to start + * writeback on all the buffers before we submit them for I/O. If we mark the + * buffers as we got, then we can end up with a page that only has buffers + * marked async write and I/O complete on can occur before we mark the other + * buffers async write. + * + * The end result of this is that we trip a bug in end_page_writeback() because + * we call it twice for the one page as the code in end_buffer_async_write() + * assumes that all buffers on the page are started at the same time. + * + * The fix is two passes across the ioend list - one to start writeback on the + * buffer_heads, and then submit them for I/O on the second pass. + */ +STATIC void +xfs_submit_ioend( + struct writeback_control *wbc, + xfs_ioend_t *ioend) +{ + xfs_ioend_t *head = ioend; + xfs_ioend_t *next; + struct buffer_head *bh; + struct bio *bio; + sector_t lastblock = 0; + + /* Pass 1 - start writeback */ + do { + next = ioend->io_list; + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) + xfs_start_buffer_writeback(bh); + } while ((ioend = next) != NULL); + + /* Pass 2 - submit I/O */ + ioend = head; + do { + next = ioend->io_list; + bio = NULL; + + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + + if (!bio) { + retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + if (bio_add_buffer(bio, bh) != bh->b_size) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(wbc, ioend, bio); + xfs_finish_ioend(ioend); + } while ((ioend = next) != NULL); +} + +/* + * Cancel submission of all buffer_heads so far in this endio. + * Toss the endio too. Only ever called for the initial page + * in a writepage request, so only ever one page. 
+ */ +STATIC void +xfs_cancel_ioend( + xfs_ioend_t *ioend) +{ + xfs_ioend_t *next; + struct buffer_head *bh, *next_bh; + + do { + next = ioend->io_list; + bh = ioend->io_buffer_head; + do { + next_bh = bh->b_private; + clear_buffer_async_write(bh); + unlock_buffer(bh); + } while ((bh = next_bh) != NULL); + + xfs_ioend_wake(XFS_I(ioend->io_inode)); + mempool_free(ioend, xfs_ioend_pool); + } while ((ioend = next) != NULL); +} + +/* + * Test to see if we've been building up a completion structure for + * earlier buffers -- if so, we try to append to this ioend if we + * can, otherwise we finish off any current ioend and start another. + * Return true if we've finished the given ioend. + */ +STATIC void +xfs_add_to_ioend( + struct inode *inode, + struct buffer_head *bh, + xfs_off_t offset, + unsigned int type, + xfs_ioend_t **result, + int need_ioend) +{ + xfs_ioend_t *ioend = *result; + + if (!ioend || need_ioend || type != ioend->io_type) { + xfs_ioend_t *previous = *result; + + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_buffer_head = bh; + ioend->io_buffer_tail = bh; + if (previous) + previous->io_list = ioend; + *result = ioend; + } else { + ioend->io_buffer_tail->b_private = bh; + ioend->io_buffer_tail = bh; + } + + bh->b_private = NULL; + ioend->io_size += bh->b_size; +} + +STATIC void +xfs_map_buffer( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); + + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); + + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); + + bh->b_blocknr = bn; + set_buffer_mapped(bh); +} + +STATIC void +xfs_map_at_offset( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + xfs_map_buffer(inode, bh, imap, offset); + set_buffer_mapped(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); +} + +/* + * Test if a given page is suitable for writing as part of an unwritten + * or delayed allocate extent. + */ +STATIC int +xfs_is_delayed_page( + struct page *page, + unsigned int type) +{ + if (PageWriteback(page)) + return 0; + + if (page->mapping && page_has_buffers(page)) { + struct buffer_head *bh, *head; + int acceptable = 0; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) + acceptable = (type == IO_UNWRITTEN); + else if (buffer_delay(bh)) + acceptable = (type == IO_DELALLOC); + else if (buffer_dirty(bh) && buffer_mapped(bh)) + acceptable = (type == IO_OVERWRITE); + else + break; + } while ((bh = bh->b_this_page) != head); + + if (acceptable) + return 1; + } + + return 0; +} + +/* + * Allocate & map buffers for page given the extent map. Write it out. + * except for the original page of a writepage, this is called on + * delalloc/unwritten pages only, for the original page it is possible + * that the page has no mapping at all. 
+ */ +STATIC int +xfs_convert_page( + struct inode *inode, + struct page *page, + loff_t tindex, + struct xfs_bmbt_irec *imap, + xfs_ioend_t **ioendp, + struct writeback_control *wbc) +{ + struct buffer_head *bh, *head; + xfs_off_t end_offset; + unsigned long p_offset; + unsigned int type; + int len, page_dirty; + int count = 0, done = 0, uptodate = 1; + xfs_off_t offset = page_offset(page); + + if (page->index != tindex) + goto fail; + if (!trylock_page(page)) + goto fail; + if (PageWriteback(page)) + goto fail_unlock_page; + if (page->mapping != inode->i_mapping) + goto fail_unlock_page; + if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) + goto fail_unlock_page; + + /* + * page_dirty is initially a count of buffers on the page before + * EOF and is decremented as we move each into a cleanable state. + * + * Derivation: + * + * End offset is the highest offset that this page should represent. + * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) + * will evaluate non-zero and be less than PAGE_CACHE_SIZE and + * hence give us the correct page_dirty count. On any other page, + * it will be zero and in that case we need page_dirty to be the + * count of buffers on the page. + */ + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + i_size_read(inode)); + + len = 1 << inode->i_blkbits; + p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), + PAGE_CACHE_SIZE); + p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; + page_dirty = p_offset / len; + + bh = head = page_buffers(page); + do { + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + if (!(PageUptodate(page) || buffer_uptodate(bh))) { + done = 1; + continue; + } + + if (buffer_unwritten(bh) || buffer_delay(bh) || + buffer_mapped(bh)) { + if (buffer_unwritten(bh)) + type = IO_UNWRITTEN; + else if (buffer_delay(bh)) + type = IO_DELALLOC; + else + type = IO_OVERWRITE; + + if (!xfs_imap_valid(inode, imap, offset)) { + done = 1; + continue; + } + + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, + ioendp, done); + + page_dirty--; + count++; + } else { + done = 1; + } + } while (offset += len, (bh = bh->b_this_page) != head); + + if (uptodate && bh == head) + SetPageUptodate(page); + + if (count) { + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) + done = 1; + } + xfs_start_page_writeback(page, !page_dirty, count); + + return done; + fail_unlock_page: + unlock_page(page); + fail: + return 1; +} + +/* + * Convert & write out a cluster of pages in the same extent as defined + * by mp and following the start page. 
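A worked example of the page_dirty derivation above, for an assumed geometry of 4096-byte pages and 1024-byte blocks with i_size = 10000 and page index 2 (all values invented for illustration):

/* worked example of the page_dirty calculation in xfs_convert_page() */
#include <stdio.h>

int main(void)
{
    unsigned long long page_size = 4096, blocksize = 1024;   /* assumed geometry */
    unsigned long long i_size = 10000;                        /* assumed file size */
    unsigned long long index = 2;                             /* last, partial page */
    unsigned long long end_offset, p_offset, page_dirty;

    end_offset = (index + 1) * page_size;
    if (end_offset > i_size)
        end_offset = i_size;                                  /* -> 10000 */

    p_offset = end_offset & (page_size - 1);                  /* 10000 & 4095 = 1808 */
    if (p_offset)
        p_offset = (p_offset + blocksize - 1) / blocksize * blocksize; /* roundup -> 2048 */
    else
        p_offset = page_size;

    page_dirty = p_offset / blocksize;                        /* 2 buffers before EOF */
    printf("end_offset=%llu p_offset=%llu page_dirty=%llu\n",
           end_offset, p_offset, page_dirty);
    return 0;
}

Bytes 8192..9999 of the file fall on this page and cover two 1024-byte buffers, so page_dirty starts at 2 and is decremented as each of those buffers is moved into a cleanable state.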
+ */ +STATIC void +xfs_cluster_write( + struct inode *inode, + pgoff_t tindex, + struct xfs_bmbt_irec *imap, + xfs_ioend_t **ioendp, + struct writeback_control *wbc, + pgoff_t tlast) +{ + struct pagevec pvec; + int done = 0, i; + + pagevec_init(&pvec, 0); + while (!done && tindex <= tlast) { + unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); + + if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + done = xfs_convert_page(inode, pvec.pages[i], tindex++, + imap, ioendp, wbc); + if (done) + break; + } + + pagevec_release(&pvec); + cond_resched(); + } +} + +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned long offset) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset); + block_invalidatepage(page, offset); +} + +/* + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. + * + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. + */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + + if (!xfs_is_delayed_page(page, IO_DELALLOC)) + goto out_invalidate; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_invalidate; + + xfs_alert(ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int error; + xfs_fileoff_t start_fsb; + + if (!buffer_delay(bh)) + goto next_buffer; + + start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "page discard unable to remove delalloc mapping."); + } + break; + } +next_buffer: + offset += 1 << inode->i_blkbits; + + } while ((bh = bh->b_this_page) != head); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_invalidate: + xfs_vm_invalidatepage(page, 0); + return; +} + +/* + * Write out a dirty page. + * + * For delalloc space on the page we need to allocate space and flush it. + * For unwritten space on the page we need to start the conversion to + * regular allocated space. + * For any other dirty buffer heads on the page we should flush them. + * + * If we detect that a transaction would be required to flush the page, we + * have to check the process flags first, if we are already in a transaction + * or disk I/O during allocations is off, we need to fail the writepage and + * redirty the page. 
+ */ +STATIC int +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int delalloc, unwritten; + struct buffer_head *bh, *head; + struct xfs_bmbt_irec imap; + xfs_ioend_t *ioend = NULL, *iohead = NULL; + loff_t offset; + unsigned int type; + __uint64_t end_offset; + pgoff_t end_index, last_index; + ssize_t len; + int err, imap_valid = 0, uptodate = 1; + int count = 0; + int nonblocking = 0; + + trace_xfs_writepage(inode, page, 0); + + ASSERT(page_has_buffers(page)); + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should really be done by the core VM, but until that happens + * filesystems like XFS, btrfs and ext4 have to take care of this + * by themselves. + */ + if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) + goto redirty; + + /* + * We need a transaction if there are delalloc or unwritten buffers + * on the page. + * + * If we need a transaction and the process flags say we are already + * in a transaction, or no IO is allowed then mark the page dirty + * again and leave the page as is. + */ + xfs_count_page_state(page, &delalloc, &unwritten); + if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) + goto redirty; + + /* Is this page beyond the end of the file? */ + offset = i_size_read(inode); + end_index = offset >> PAGE_CACHE_SHIFT; + last_index = (offset - 1) >> PAGE_CACHE_SHIFT; + if (page->index >= end_index) { + if ((page->index >= end_index + 1) || + !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { + unlock_page(page); + return 0; + } + } + + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + offset); + len = 1 << inode->i_blkbits; + + bh = head = page_buffers(page); + offset = page_offset(page); + type = IO_OVERWRITE; + + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + nonblocking = 1; + + do { + int new_ioend = 0; + + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + + /* + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. + */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + imap_valid = 0; + continue; + } + + if (buffer_unwritten(bh)) { + if (type != IO_UNWRITTEN) { + type = IO_UNWRITTEN; + imap_valid = 0; + } + } else if (buffer_delay(bh)) { + if (type != IO_DELALLOC) { + type = IO_DELALLOC; + imap_valid = 0; + } + } else if (buffer_uptodate(bh)) { + if (type != IO_OVERWRITE) { + type = IO_OVERWRITE; + imap_valid = 0; + } + } else { + if (PageUptodate(page)) { + ASSERT(buffer_mapped(bh)); + imap_valid = 0; + } + continue; + } + + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); + if (!imap_valid) { + /* + * If we didn't have a valid mapping then we need to + * put the new mapping into a separate ioend structure. + * This ensures non-contiguous extents always have + * separate ioends, which is particularly important + * for unwritten extent conversion at I/O completion + * time. 
+ */ + new_ioend = 1; + err = xfs_map_blocks(inode, offset, &imap, type, + nonblocking); + if (err) + goto error; + imap_valid = xfs_imap_valid(inode, &imap, offset); + } + if (imap_valid) { + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, &ioend, + new_ioend); + count++; + } + + if (!iohead) + iohead = ioend; + + } while (offset += len, ((bh = bh->b_this_page) != head)); + + if (uptodate && bh == head) + SetPageUptodate(page); + + xfs_start_page_writeback(page, 1, count); + + if (ioend && imap_valid) { + xfs_off_t end_index; + + end_index = imap.br_startoff + imap.br_blockcount; + + /* to bytes */ + end_index <<= inode->i_blkbits; + + /* to pages */ + end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; + + /* check against file size */ + if (end_index > last_index) + end_index = last_index; + + xfs_cluster_write(inode, page->index + 1, &imap, &ioend, + wbc, end_index); + } + + if (iohead) + xfs_submit_ioend(wbc, iohead); + + return 0; + +error: + if (iohead) + xfs_cancel_ioend(iohead); + + if (err == -EAGAIN) + goto redirty; + + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + return err; + +redirty: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +STATIC int +xfs_vm_writepages( + struct address_space *mapping, + struct writeback_control *wbc) +{ + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + return generic_writepages(mapping, wbc); +} + +/* + * Called to move a page into cleanable state - and from there + * to be released. The page should already be clean. We always + * have buffer heads in this call. + * + * Returns 1 if the page is ok to release, 0 otherwise. + */ +STATIC int +xfs_vm_releasepage( + struct page *page, + gfp_t gfp_mask) +{ + int delalloc, unwritten; + + trace_xfs_releasepage(page->mapping->host, page, 0); + + xfs_count_page_state(page, &delalloc, &unwritten); + + if (WARN_ON(delalloc)) + return 0; + if (WARN_ON(unwritten)) + return 0; + + return try_to_free_buffers(page); +} + +STATIC int +__xfs_get_blocks( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create, + int direct) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + struct xfs_bmbt_irec imap; + int nimaps = 1; + xfs_off_t offset; + ssize_t size; + int new = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + offset = (xfs_off_t)iblock << inode->i_blkbits; + ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); + size = bh_result->b_size; + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + + if (create) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); + } else { + lockmode = xfs_ilock_map_shared(ip); + } + + ASSERT(offset <= mp->m_maxioffset); + if (offset + size > mp->m_maxioffset) + size = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); + if (error) + goto out_unlock; + + if (create && + (!nimaps || + (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK))) { + if (direct) { + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, size, &imap); + } + if (error) + goto out_unlock; + + 
trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + } else if (nimaps) { + trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + } else { + trace_xfs_get_blocks_notfound(ip, offset, size); + goto out_unlock; + } + xfs_iunlock(ip, lockmode); + + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK) { + /* + * For unwritten extents do not report a disk address on + * the read case (treat as if we're reading into a hole). + */ + if (create || !ISUNWRITTEN(&imap)) + xfs_map_buffer(inode, bh_result, &imap, offset); + if (create && ISUNWRITTEN(&imap)) { + if (direct) + bh_result->b_private = inode; + set_buffer_unwritten(bh_result); + } + } + + /* + * If this is a realtime file, data may be on a different device. + * to that pointed to from the buffer_head b_bdev currently. + */ + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); + + /* + * If we previously allocated a block out beyond eof and we are now + * coming back to use it then we will need to flag it as new even if it + * has a disk address. + * + * With sub-block writes into unwritten extents we also need to mark + * the buffer as new so that the unwritten parts of the buffer gets + * correctly zeroed. + */ + if (create && + ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || + (offset >= i_size_read(inode)) || + (new || ISUNWRITTEN(&imap)))) + set_buffer_new(bh_result); + + if (imap.br_startblock == DELAYSTARTBLOCK) { + BUG_ON(direct); + if (create) { + set_buffer_uptodate(bh_result); + set_buffer_mapped(bh_result); + set_buffer_delay(bh_result); + } + } + + /* + * If this is O_DIRECT or the mpage code calling tell them how large + * the mapping is, so that we can avoid repeated get_blocks calls. + */ + if (direct || size > (1 << inode->i_blkbits)) { + xfs_off_t mapping_size; + + mapping_size = imap.br_startoff + imap.br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; + } + + return 0; + +out_unlock: + xfs_iunlock(ip, lockmode); + return -error; +} + +int +xfs_get_blocks( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, 0); +} + +STATIC int +xfs_get_blocks_direct( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, 1); +} + +/* + * Complete a direct I/O write request. + * + * If the private argument is non-NULL __xfs_get_blocks signals us that we + * need to issue a transaction to convert the range from unwritten to written + * extents. In case this is regular synchronous I/O we just call xfs_end_io + * to do this and we are done. But in case this was a successful AIO + * request this handler is called from interrupt context, from which we + * can't start transactions. In that case offload the I/O completion to + * the workqueues we also use for buffered I/O completion. + */ +STATIC void +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private, + int ret, + bool is_async) +{ + struct xfs_ioend *ioend = iocb->private; + + /* + * blockdev_direct_IO can return an error even after the I/O + * completion handler was called. Thus we need to protect + * against double-freeing. 
+ */ + iocb->private = NULL; + + ioend->io_offset = offset; + ioend->io_size = size; + if (private && size > 0) + ioend->io_type = IO_UNWRITTEN; + + if (is_async) { + /* + * If we are converting an unwritten extent we need to delay + * the AIO completion until after the unwritten extent + * conversion has completed, otherwise do it ASAP. + */ + if (ioend->io_type == IO_UNWRITTEN) { + ioend->io_iocb = iocb; + ioend->io_result = ret; + } else { + aio_complete(iocb, ret, 0); + } + xfs_finish_ioend(ioend); + } else { + xfs_finish_ioend_sync(ioend); + } +} + +STATIC ssize_t +xfs_vm_direct_IO( + int rw, + struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, + unsigned long nr_segs) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct block_device *bdev = xfs_find_bdev_for_inode(inode); + ssize_t ret; + + if (rw & WRITE) { + iocb->private = xfs_alloc_ioend(inode, IO_DIRECT); + + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + xfs_end_io_direct_write, NULL, 0); + if (ret != -EIOCBQUEUED && iocb->private) + xfs_destroy_ioend(iocb->private); + } else { + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + NULL, NULL, 0); + } + + return ret; +} + +STATIC void +xfs_vm_write_failed( + struct address_space *mapping, + loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + /* + * Punch out the delalloc blocks we have already allocated. We + * don't call xfs_setattr() to do this as we may be in the + * middle of a multi-iovec write and so the vfs inode->i_size + * will not match the xfs ip->i_size and so it will zero too + * much. Hence we just truncate the page cache to zero what is + * necessary and punch the delalloc blocks directly. + */ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + truncate_pagecache(inode, to, inode->i_size); + + /* + * Check if there are any blocks that are outside of i_size + * that need to be trimmed back.
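For a concrete feel for the range computed next, assume 4096-byte filesystem blocks and that XFS_B_TO_FSB() rounds a byte count up to whole blocks (an assumption about that helper, not something this hunk spells out): with inode->i_size = 8192 and a failed write that had reserved delalloc space out to to = 20000, start_fsb = XFS_B_TO_FSB(8192) + 1 = 3 and end_fsb = XFS_B_TO_FSB(20000) = 5, so the two file system blocks at offsets 3 and 4 are handed to xfs_bmap_punch_delalloc_range() below; if to had still been inside that first block past EOF, end_fsb would not exceed start_fsb and the function would return without punching anything.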
+ */ + start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; + end_fsb = XFS_B_TO_FSB(ip->i_mount, to); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +STATIC int +xfs_vm_write_begin( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + int ret; + + ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, + pagep, xfs_get_blocks); + if (unlikely(ret)) + xfs_vm_write_failed(mapping, pos + len); + return ret; +} + +STATIC int +xfs_vm_write_end( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned copied, + struct page *page, + void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (unlikely(ret < len)) + xfs_vm_write_failed(mapping, pos + len); + return ret; +} + +STATIC sector_t +xfs_vm_bmap( + struct address_space *mapping, + sector_t block) +{ + struct inode *inode = (struct inode *)mapping->host; + struct xfs_inode *ip = XFS_I(inode); + + trace_xfs_vm_bmap(XFS_I(inode)); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return generic_block_bmap(mapping, block, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpage( + struct file *unused, + struct page *page) +{ + return mpage_readpage(page, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpages( + struct file *unused, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); +} + +const struct address_space_operations xfs_address_space_operations = { + .readpage = xfs_vm_readpage, + .readpages = xfs_vm_readpages, + .writepage = xfs_vm_writepage, + .writepages = xfs_vm_writepages, + .releasepage = xfs_vm_releasepage, + .invalidatepage = xfs_vm_invalidatepage, + .write_begin = xfs_vm_write_begin, + .write_end = xfs_vm_write_end, + .bmap = xfs_vm_bmap, + .direct_IO = xfs_vm_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h new file mode 100644 index 00000000..71f721e1 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2005-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_AOPS_H__ +#define __XFS_AOPS_H__ + +extern struct workqueue_struct *xfsdatad_workqueue; +extern struct workqueue_struct *xfsconvertd_workqueue; +extern mempool_t *xfs_ioend_pool; + +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_DIRECT = 0, /* special case for direct I/O ioends */ + IO_DELALLOC, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_OVERWRITE, /* mapping covers already allocated extent */ +}; + +#define XFS_IO_TYPES \ + { 0, "" }, \ + { IO_DELALLOC, "delalloc" }, \ + { IO_UNWRITTEN, "unwritten" }, \ + { IO_OVERWRITE, "overwrite" } + +/* + * xfs_ioend struct manages large extent writes for XFS. + * It can manage several multi-page bio's at once. + */ +typedef struct xfs_ioend { + struct xfs_ioend *io_list; /* next ioend in chain */ + unsigned int io_type; /* delalloc / unwritten */ + int io_error; /* I/O error code */ + atomic_t io_remaining; /* hold count */ + struct inode *io_inode; /* file being written to */ + struct buffer_head *io_buffer_head;/* buffer linked list head */ + struct buffer_head *io_buffer_tail;/* buffer linked list tail */ + size_t io_size; /* size of the extent */ + xfs_off_t io_offset; /* offset in the file */ + struct work_struct io_work; /* xfsdatad work queue */ + struct kiocb *io_iocb; + int io_result; +} xfs_ioend_t; + +extern const struct address_space_operations xfs_address_space_operations; +extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +extern void xfs_ioend_init(void); +extern void xfs_ioend_wait(struct xfs_inode *); + +extern void xfs_count_page_state(struct page *, int *, int *); + +#endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c new file mode 100644 index 00000000..5e68099d --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -0,0 +1,1899 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trace.h" + +static kmem_zone_t *xfs_buf_zone; +STATIC int xfsbufd(void *); +STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); + +static struct workqueue_struct *xfslogd_workqueue; +struct workqueue_struct *xfsdatad_workqueue; +struct workqueue_struct *xfsconvertd_workqueue; + +#ifdef XFS_BUF_LOCK_TRACKING +# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) +# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) +# define XB_GET_OWNER(bp) ((bp)->b_last_holder) +#else +# define XB_SET_OWNER(bp) do { } while (0) +# define XB_CLEAR_OWNER(bp) do { } while (0) +# define XB_GET_OWNER(bp) do { } while (0) +#endif + +#define xb_to_gfp(flags) \ + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ + ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) + +#define xb_to_km(flags) \ + (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + +#define xfs_buf_allocate(flags) \ + kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) +#define xfs_buf_deallocate(bp) \ + kmem_zone_free(xfs_buf_zone, (bp)); + +static inline int +xfs_buf_is_vmapped( + struct xfs_buf *bp) +{ + /* + * Return true if the buffer is vmapped. + * + * The XBF_MAPPED flag is set if the buffer should be mapped, but the + * code is clever enough to know it doesn't have to map a single page, + * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. + */ + return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; +} + +static inline int +xfs_buf_vmap_len( + struct xfs_buf *bp) +{ + return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; +} + +/* + * xfs_buf_lru_add - add a buffer to the LRU. + * + * The LRU takes a new reference to the buffer so that it will only be freed + * once the shrinker takes the buffer off the LRU. + */ +STATIC void +xfs_buf_lru_add( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + spin_lock(&btp->bt_lru_lock); + if (list_empty(&bp->b_lru)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_lru, &btp->bt_lru); + btp->bt_lru_nr++; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * xfs_buf_lru_del - remove a buffer from the LRU + * + * The unlocked check is safe here because it only occurs when there are not + * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there + * to optimise the shrinker removing the buffer from the LRU and calling + * xfs_buf_free(). i.e. it removes an unnecessary round trip on the + * bt_lru_lock. + */ +STATIC void +xfs_buf_lru_del( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + if (list_empty(&bp->b_lru)) + return; + + spin_lock(&btp->bt_lru_lock); + if (!list_empty(&bp->b_lru)) { + list_del_init(&bp->b_lru); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * When we mark a buffer stale, we remove the buffer from the LRU and clear the + * b_lru_ref count so that the buffer is freed immediately when the buffer + * reference count falls to zero. If the buffer is already on the LRU, we need + * to remove the reference that LRU holds on the buffer. 
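A toy, single-threaded model of the hold-count rule used by the LRU code above (the real code uses atomic_t, b_lru_ref and bt_lru_lock; the struct and names below are invented): while a buffer sits on the LRU the list owns one hold, so the last user dropping its reference does not free the buffer, only the shrinker's later removal does.

/* toy model of b_hold vs. the LRU reference: the buffer is only freed once
 * both the last user and the LRU/shrinker have dropped their holds. */
#include <stdio.h>

struct buf { int hold; };

static void rele(struct buf *b, const char *who)
{
    if (--b->hold == 0)
        printf("%s dropped the last hold: buffer freed\n", who);
    else
        printf("%s dropped a hold, %d remaining (still cached)\n", who, b->hold);
}

int main(void)
{
    struct buf b = { .hold = 1 };   /* hold owned by the current user */

    b.hold++;                       /* xfs_buf_lru_add(): the LRU takes its own hold */
    rele(&b, "last user");          /* buffer stays allocated, the LRU still references it */
    rele(&b, "shrinker");           /* LRU removal drops the final hold -> freed */
    return 0;
}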
+ * + * This prevents build-up of stale buffers on the LRU. + */ +void +xfs_buf_stale( + struct xfs_buf *bp) +{ + bp->b_flags |= XBF_STALE; + atomic_set(&(bp)->b_lru_ref, 0); + if (!list_empty(&bp->b_lru)) { + struct xfs_buftarg *btp = bp->b_target; + + spin_lock(&btp->bt_lru_lock); + if (!list_empty(&bp->b_lru)) { + list_del_init(&bp->b_lru); + btp->bt_lru_nr--; + atomic_dec(&bp->b_hold); + } + spin_unlock(&btp->bt_lru_lock); + } + ASSERT(atomic_read(&bp->b_hold) >= 1); +} + +STATIC void +_xfs_buf_initialize( + xfs_buf_t *bp, + xfs_buftarg_t *target, + xfs_off_t range_base, + size_t range_length, + xfs_buf_flags_t flags) +{ + /* + * We don't want certain flags to appear in b_flags. + */ + flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); + + memset(bp, 0, sizeof(xfs_buf_t)); + atomic_set(&bp->b_hold, 1); + atomic_set(&bp->b_lru_ref, 1); + init_completion(&bp->b_iowait); + INIT_LIST_HEAD(&bp->b_lru); + INIT_LIST_HEAD(&bp->b_list); + RB_CLEAR_NODE(&bp->b_rbnode); + sema_init(&bp->b_sema, 0); /* held, no waiters */ + XB_SET_OWNER(bp); + bp->b_target = target; + bp->b_file_offset = range_base; + /* + * Set buffer_length and count_desired to the same value initially. + * I/O routines should use count_desired, which will be the same in + * most cases but may be reset (e.g. XFS recovery). + */ + bp->b_buffer_length = bp->b_count_desired = range_length; + bp->b_flags = flags; + bp->b_bn = XFS_BUF_DADDR_NULL; + atomic_set(&bp->b_pin_count, 0); + init_waitqueue_head(&bp->b_waiters); + + XFS_STATS_INC(xb_create); + + trace_xfs_buf_init(bp, _RET_IP_); +} + +/* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. + */ +STATIC int +_xfs_buf_get_pages( + xfs_buf_t *bp, + int page_count, + xfs_buf_flags_t flags) +{ + /* Make sure that we have a page list */ + if (bp->b_pages == NULL) { + bp->b_offset = xfs_buf_poff(bp->b_file_offset); + bp->b_page_count = page_count; + if (page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; + } else { + bp->b_pages = kmem_alloc(sizeof(struct page *) * + page_count, xb_to_km(flags)); + if (bp->b_pages == NULL) + return -ENOMEM; + } + memset(bp->b_pages, 0, sizeof(struct page *) * page_count); + } + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +STATIC void +_xfs_buf_free_pages( + xfs_buf_t *bp) +{ + if (bp->b_pages != bp->b_page_array) { + kmem_free(bp->b_pages); + bp->b_pages = NULL; + } +} + +/* + * Releases the specified buffer. + * + * The modification state of any associated pages is left unchanged. + * The buffer must not be on any hash - use xfs_buf_rele instead for + * hashed and refcounted buffers. + */ +void +xfs_buf_free( + xfs_buf_t *bp) +{ + trace_xfs_buf_free(bp, _RET_IP_); + + ASSERT(list_empty(&bp->b_lru)); + + if (bp->b_flags & _XBF_PAGES) { + uint i; + + if (xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr - bp->b_offset, + bp->b_page_count); + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page = bp->b_pages[i]; + + __free_page(page); + } + } else if (bp->b_flags & _XBF_KMEM) + kmem_free(bp->b_addr); + _xfs_buf_free_pages(bp); + xfs_buf_deallocate(bp); +} + +/* + * Allocates all the pages for the buffer in question and builds its page list.
+ */ +STATIC int +xfs_buf_allocate_memory( + xfs_buf_t *bp, + uint flags) +{ + size_t size = bp->b_count_desired; + size_t nbytes, offset; + gfp_t gfp_mask = xb_to_gfp(flags); + unsigned short page_count, i; + xfs_off_t end; + int error; + + /* + * for buffers that are contained within a single page, just allocate + * the memory from the heap - there's no need for the complexity of + * page arrays to keep allocation down to order 0. + */ + if (bp->b_buffer_length < PAGE_SIZE) { + bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); + if (!bp->b_addr) { + /* low memory - use alloc_page loop instead */ + goto use_alloc_page; + } + + if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & + PAGE_MASK) != + ((unsigned long)bp->b_addr & PAGE_MASK)) { + /* b_addr spans two pages - use alloc_page instead */ + kmem_free(bp->b_addr); + bp->b_addr = NULL; + goto use_alloc_page; + } + bp->b_offset = offset_in_page(bp->b_addr); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_page_count = 1; + bp->b_flags |= XBF_MAPPED | _XBF_KMEM; + return 0; + } + +use_alloc_page: + end = bp->b_file_offset + bp->b_buffer_length; + page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); + error = _xfs_buf_get_pages(bp, page_count, flags); + if (unlikely(error)) + return error; + + offset = bp->b_offset; + bp->b_flags |= _XBF_PAGES; + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page; + uint retries = 0; +retry: + page = alloc_page(gfp_mask); + if (unlikely(page == NULL)) { + if (flags & XBF_READ_AHEAD) { + bp->b_page_count = i; + error = ENOMEM; + goto out_free_pages; + } + + /* + * This could deadlock. + * + * But until all the XFS lowlevel code is revamped to + * handle buffer allocation failures we can't do much. + */ + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, gfp_mask); + + XFS_STATS_INC(xb_page_retries); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + + XFS_STATS_INC(xb_page_found); + + nbytes = min_t(size_t, size, PAGE_SIZE - offset); + size -= nbytes; + bp->b_pages[i] = page; + offset = 0; + } + return 0; + +out_free_pages: + for (i = 0; i < bp->b_page_count; i++) + __free_page(bp->b_pages[i]); + return error; +} + +/* + * Map buffer into kernel address-space if necessary. + */ +STATIC int +_xfs_buf_map_pages( + xfs_buf_t *bp, + uint flags) +{ + ASSERT(bp->b_flags & _XBF_PAGES); + if (bp->b_page_count == 1) { + /* A single page buffer is always mappable */ + bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; + bp->b_flags |= XBF_MAPPED; + } else if (flags & XBF_MAPPED) { + int retried = 0; + + do { + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); + if (bp->b_addr) + break; + vm_unmap_aliases(); + } while (retried++ <= 1); + + if (!bp->b_addr) + return -ENOMEM; + bp->b_addr += bp->b_offset; + bp->b_flags |= XBF_MAPPED; + } + + return 0; +} + +/* + * Finding and Reading Buffers + */ + +/* + * Look up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. If other overlapping buffers exist, they are + * released before the new buffer is created and locked, + * which may imply that this call will block until those buffers + * are unlocked. No I/O is implied by this call. 
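The single-page fast path in xfs_buf_allocate_memory() above relies on a heap allocation not crossing a page boundary. A minimal user-space version of that check (sysconf() stands in for PAGE_SIZE; the 2048-byte malloc is an arbitrary example, and whether it straddles depends on where the allocator happens to place it):

/* the page-straddle test used by xfs_buf_allocate_memory(): a kmalloc'd buffer
 * is only usable directly if its first and last bytes share one page. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int spans_two_pages(const void *addr, size_t len, unsigned long page_mask)
{
    unsigned long first = (unsigned long)addr & page_mask;
    unsigned long last  = ((unsigned long)addr + len - 1) & page_mask;

    return first != last;
}

int main(void)
{
    unsigned long page_mask = ~((unsigned long)sysconf(_SC_PAGESIZE) - 1);
    char *p = malloc(2048);

    if (!p)
        return 1;
    printf("2048-byte buffer at %p %s a page boundary\n", (void *)p,
           spans_two_pages(p, 2048, page_mask) ? "straddles" : "fits within");
    free(p);
    return 0;
}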
+ */ +xfs_buf_t * +_xfs_buf_find( + xfs_buftarg_t *btp, /* block device target */ + xfs_off_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + xfs_buf_flags_t flags, + xfs_buf_t *new_bp) +{ + xfs_off_t range_base; + size_t range_length; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent; + xfs_buf_t *bp; + + range_base = (ioff << BBSHIFT); + range_length = (isize << BBSHIFT); + + /* Check for IOs smaller than the sector size / not sector aligned */ + ASSERT(!(range_length < (1 << btp->bt_sshift))); + ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); + + /* get tree root */ + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, ioff)); + + /* walk tree */ + spin_lock(&pag->pag_buf_lock); + rbp = &pag->pag_buf_tree.rb_node; + parent = NULL; + bp = NULL; + while (*rbp) { + parent = *rbp; + bp = rb_entry(parent, struct xfs_buf, b_rbnode); + + if (range_base < bp->b_file_offset) + rbp = &(*rbp)->rb_left; + else if (range_base > bp->b_file_offset) + rbp = &(*rbp)->rb_right; + else { + /* + * found a block offset match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching to the right for an exact match. + */ + if (bp->b_buffer_length != range_length) { + ASSERT(bp->b_flags & XBF_STALE); + rbp = &(*rbp)->rb_right; + continue; + } + atomic_inc(&bp->b_hold); + goto found; + } + } + + /* No match found */ + if (new_bp) { + _xfs_buf_initialize(new_bp, btp, range_base, + range_length, flags); + rb_link_node(&new_bp->b_rbnode, parent, rbp); + rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + spin_unlock(&pag->pag_buf_lock); + } else { + XFS_STATS_INC(xb_miss_locked); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + } + return new_bp; + +found: + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + + if (xfs_buf_cond_lock(bp)) { + /* failed, so wait for the lock if requested. */ + if (!(flags & XBF_TRYLOCK)) { + xfs_buf_lock(bp); + XFS_STATS_INC(xb_get_locked_waited); + } else { + xfs_buf_rele(bp); + XFS_STATS_INC(xb_busy_locked); + return NULL; + } + } + + /* + * if the buffer is stale, clear all the external state associated with + * it. We need to keep flags such as how we allocated the buffer memory + * intact here. + */ + if (bp->b_flags & XBF_STALE) { + ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; + } + + trace_xfs_buf_find(bp, flags, _RET_IP_); + XFS_STATS_INC(xb_get_locked); + return bp; +} + +/* + * Assembles a buffer covering the specified range. + * Storage in memory for all portions of the buffer will be allocated, + * although backing storage may not be. 
+ */ +xfs_buf_t * +xfs_buf_get( + xfs_buftarg_t *target,/* target for buffer */ + xfs_off_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + xfs_buf_flags_t flags) +{ + xfs_buf_t *bp, *new_bp; + int error = 0; + + new_bp = xfs_buf_allocate(flags); + if (unlikely(!new_bp)) + return NULL; + + bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); + if (bp == new_bp) { + error = xfs_buf_allocate_memory(bp, flags); + if (error) + goto no_buffer; + } else { + xfs_buf_deallocate(new_bp); + if (unlikely(bp == NULL)) + return NULL; + } + + if (!(bp->b_flags & XBF_MAPPED)) { + error = _xfs_buf_map_pages(bp, flags); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pages\n", __func__); + goto no_buffer; + } + } + + XFS_STATS_INC(xb_get); + + /* + * Always fill in the block number now, the mapped cases can do + * their own overlay of this later. + */ + bp->b_bn = ioff; + bp->b_count_desired = bp->b_buffer_length; + + trace_xfs_buf_get(bp, flags, _RET_IP_); + return bp; + + no_buffer: + if (flags & (XBF_LOCK | XBF_TRYLOCK)) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); + return NULL; +} + +STATIC int +_xfs_buf_read( + xfs_buf_t *bp, + xfs_buf_flags_t flags) +{ + int status; + + ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); + ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); + + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); + bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); + + status = xfs_buf_iorequest(bp); + if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) + return status; + return xfs_buf_iowait(bp); +} + +xfs_buf_t * +xfs_buf_read( + xfs_buftarg_t *target, + xfs_off_t ioff, + size_t isize, + xfs_buf_flags_t flags) +{ + xfs_buf_t *bp; + + flags |= XBF_READ; + + bp = xfs_buf_get(target, ioff, isize, flags); + if (bp) { + trace_xfs_buf_read(bp, flags, _RET_IP_); + + if (!XFS_BUF_ISDONE(bp)) { + XFS_STATS_INC(xb_get_read); + _xfs_buf_read(bp, flags); + } else if (flags & XBF_ASYNC) { + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + goto no_buffer; + } else { + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + } + } + + return bp; + + no_buffer: + if (flags & (XBF_LOCK | XBF_TRYLOCK)) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); + return NULL; +} + +/* + * If we are not low on memory then do the readahead in a deadlock + * safe manner. + */ +void +xfs_buf_readahead( + xfs_buftarg_t *target, + xfs_off_t ioff, + size_t isize) +{ + if (bdi_read_congested(target->bt_bdi)) + return; + + xfs_buf_read(target, ioff, isize, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); +} + +/* + * Read an uncached buffer from disk. Allocates and returns a locked + * buffer containing the disk contents or nothing. 
+ */ +struct xfs_buf * +xfs_buf_read_uncached( + struct xfs_mount *mp, + struct xfs_buftarg *target, + xfs_daddr_t daddr, + size_t length, + int flags) +{ + xfs_buf_t *bp; + int error; + + bp = xfs_buf_get_uncached(target, length, flags); + if (!bp) + return NULL; + + /* set up the buffer for a read IO */ + xfs_buf_lock(bp); + XFS_BUF_SET_ADDR(bp, daddr); + XFS_BUF_READ(bp); + XFS_BUF_BUSY(bp); + + xfsbdstrat(mp, bp); + error = xfs_buf_iowait(bp); + if (error || bp->b_error) { + xfs_buf_relse(bp); + return NULL; + } + return bp; +} + +xfs_buf_t * +xfs_buf_get_empty( + size_t len, + xfs_buftarg_t *target) +{ + xfs_buf_t *bp; + + bp = xfs_buf_allocate(0); + if (bp) + _xfs_buf_initialize(bp, target, 0, len, 0); + return bp; +} + +/* + * Return a buffer allocated as an empty buffer and associated to external + * memory via xfs_buf_associate_memory() back to it's empty state. + */ +void +xfs_buf_set_empty( + struct xfs_buf *bp, + size_t len) +{ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_page_count = 0; + bp->b_addr = NULL; + bp->b_file_offset = 0; + bp->b_buffer_length = bp->b_count_desired = len; + bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_flags &= ~XBF_MAPPED; +} + +static inline struct page * +mem_to_page( + void *addr) +{ + if ((!is_vmalloc_addr(addr))) { + return virt_to_page(addr); + } else { + return vmalloc_to_page(addr); + } +} + +int +xfs_buf_associate_memory( + xfs_buf_t *bp, + void *mem, + size_t len) +{ + int rval; + int i = 0; + unsigned long pageaddr; + unsigned long offset; + size_t buflen; + int page_count; + + pageaddr = (unsigned long)mem & PAGE_MASK; + offset = (unsigned long)mem - pageaddr; + buflen = PAGE_ALIGN(len + offset); + page_count = buflen >> PAGE_SHIFT; + + /* Free any previous set of page pointers */ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_addr = mem; + + rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); + if (rval) + return rval; + + bp->b_offset = offset; + + for (i = 0; i < bp->b_page_count; i++) { + bp->b_pages[i] = mem_to_page((void *)pageaddr); + pageaddr += PAGE_SIZE; + } + + bp->b_count_desired = len; + bp->b_buffer_length = buflen; + bp->b_flags |= XBF_MAPPED; + + return 0; +} + +xfs_buf_t * +xfs_buf_get_uncached( + struct xfs_buftarg *target, + size_t len, + int flags) +{ + unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; + int error, i; + xfs_buf_t *bp; + + bp = xfs_buf_allocate(0); + if (unlikely(bp == NULL)) + goto fail; + _xfs_buf_initialize(bp, target, 0, len, 0); + + error = _xfs_buf_get_pages(bp, page_count, 0); + if (error) + goto fail_free_buf; + + for (i = 0; i < page_count; i++) { + bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); + if (!bp->b_pages[i]) + goto fail_free_mem; + } + bp->b_flags |= _XBF_PAGES; + + error = _xfs_buf_map_pages(bp, XBF_MAPPED); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pages\n", __func__); + goto fail_free_mem; + } + + xfs_buf_unlock(bp); + + trace_xfs_buf_get_uncached(bp, _RET_IP_); + return bp; + + fail_free_mem: + while (--i >= 0) + __free_page(bp->b_pages[i]); + _xfs_buf_free_pages(bp); + fail_free_buf: + xfs_buf_deallocate(bp); + fail: + return NULL; +} + +/* + * Increment reference count on buffer, to hold the buffer concurrently + * with another thread which may release (free) the buffer asynchronously. + * Must hold the buffer already to call this function. 
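Uncached buffers skip the per-AG rbtree entirely and are meant for one-off I/O such as probing device geometry. The read side returns the buffer locked on success and cleans up after itself on failure; a sketch, with the disk address and length as assumptions:

/* Illustrative only: read one 512-byte sector, uncached. */
static int example_probe_sector(struct xfs_mount *mp, struct xfs_buftarg *target,
				xfs_daddr_t daddr)
{
	struct xfs_buf	*bp;

	bp = xfs_buf_read_uncached(mp, target, daddr, 1 << BBSHIFT, 0);
	if (!bp)
		return EIO;	/* allocation or I/O error; buffer already released */

	/* contents are at bp->b_addr; the buffer is still locked and held */
	xfs_buf_relse(bp);
	return 0;
}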
+ */ +void +xfs_buf_hold( + xfs_buf_t *bp) +{ + trace_xfs_buf_hold(bp, _RET_IP_); + atomic_inc(&bp->b_hold); +} + +/* + * Releases a hold on the specified buffer. If the + * the hold count is 1, calls xfs_buf_free. + */ +void +xfs_buf_rele( + xfs_buf_t *bp) +{ + struct xfs_perag *pag = bp->b_pag; + + trace_xfs_buf_rele(bp, _RET_IP_); + + if (!pag) { + ASSERT(list_empty(&bp->b_lru)); + ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); + if (atomic_dec_and_test(&bp->b_hold)) + xfs_buf_free(bp); + return; + } + + ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); + + ASSERT(atomic_read(&bp->b_hold) > 0); + if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { + if (!(bp->b_flags & XBF_STALE) && + atomic_read(&bp->b_lru_ref)) { + xfs_buf_lru_add(bp); + spin_unlock(&pag->pag_buf_lock); + } else { + xfs_buf_lru_del(bp); + ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + xfs_buf_free(bp); + } + } +} + + +/* + * Lock a buffer object, if it is not already locked. + * + * If we come across a stale, pinned, locked buffer, we know that we are + * being asked to lock a buffer that has been reallocated. Because it is + * pinned, we know that the log has not been pushed to disk and hence it + * will still be locked. Rather than continuing to have trylock attempts + * fail until someone else pushes the log, push it ourselves before + * returning. This means that the xfsaild will not get stuck trying + * to push on stale inode buffers. + */ +int +xfs_buf_cond_lock( + xfs_buf_t *bp) +{ + int locked; + + locked = down_trylock(&bp->b_sema) == 0; + if (locked) + XB_SET_OWNER(bp); + else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); + + trace_xfs_buf_cond_lock(bp, _RET_IP_); + return locked ? 0 : -EBUSY; +} + +int +xfs_buf_lock_value( + xfs_buf_t *bp) +{ + return bp->b_sema.count; +} + +/* + * Lock a buffer object. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. + */ +void +xfs_buf_lock( + xfs_buf_t *bp) +{ + trace_xfs_buf_lock(bp, _RET_IP_); + + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); + down(&bp->b_sema); + XB_SET_OWNER(bp); + + trace_xfs_buf_lock_done(bp, _RET_IP_); +} + +/* + * Releases the lock on the buffer object. + * If the buffer is marked delwri but is not queued, do so before we + * unlock the buffer as we need to set flags correctly. We also need to + * take a reference for the delwri queue because the unlocker is going to + * drop their's and they don't know we just queued it. 
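Callers that must not sleep use the conditional lock and simply retry later when it reports the buffer busy; the log force hidden inside it is what stops xfsaild from spinning on stale, pinned buffers. A sketch of the non-blocking pattern (illustrative, not a caller taken from the patch):

/* Illustrative only: do some work under the buffer lock without blocking. */
static int example_try_work(xfs_buf_t *bp)
{
	if (xfs_buf_cond_lock(bp))
		return 0;		/* -EBUSY: locked elsewhere, try again later */

	/* ... work on the locked buffer ... */
	xfs_buf_unlock(bp);
	return 1;
}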
+ */ +void +xfs_buf_unlock( + xfs_buf_t *bp) +{ + if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { + atomic_inc(&bp->b_hold); + bp->b_flags |= XBF_ASYNC; + xfs_buf_delwri_queue(bp, 0); + } + + XB_CLEAR_OWNER(bp); + up(&bp->b_sema); + + trace_xfs_buf_unlock(bp, _RET_IP_); +} + +STATIC void +xfs_buf_wait_unpin( + xfs_buf_t *bp) +{ + DECLARE_WAITQUEUE (wait, current); + + if (atomic_read(&bp->b_pin_count) == 0) + return; + + add_wait_queue(&bp->b_waiters, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&bp->b_pin_count) == 0) + break; + io_schedule(); + } + remove_wait_queue(&bp->b_waiters, &wait); + set_current_state(TASK_RUNNING); +} + +/* + * Buffer Utility Routines + */ + +STATIC void +xfs_buf_iodone_work( + struct work_struct *work) +{ + xfs_buf_t *bp = + container_of(work, xfs_buf_t, b_iodone_work); + + if (bp->b_iodone) + (*(bp->b_iodone))(bp); + else if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); +} + +void +xfs_buf_ioend( + xfs_buf_t *bp, + int schedule) +{ + trace_xfs_buf_iodone(bp, _RET_IP_); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + if (bp->b_error == 0) + bp->b_flags |= XBF_DONE; + + if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { + if (schedule) { + INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); + queue_work(xfslogd_workqueue, &bp->b_iodone_work); + } else { + xfs_buf_iodone_work(&bp->b_iodone_work); + } + } else { + complete(&bp->b_iowait); + } +} + +void +xfs_buf_ioerror( + xfs_buf_t *bp, + int error) +{ + ASSERT(error >= 0 && error <= 0xffff); + bp->b_error = (unsigned short)error; + trace_xfs_buf_ioerror(bp, error, _RET_IP_); +} + +int +xfs_bwrite( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + int error; + + bp->b_flags |= XBF_WRITE; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ); + + xfs_buf_delwri_dequeue(bp); + xfs_bdstrat_cb(bp); + + error = xfs_buf_iowait(bp); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + return error; +} + +void +xfs_bdwrite( + void *mp, + struct xfs_buf *bp) +{ + trace_xfs_buf_bdwrite(bp, _RET_IP_); + + bp->b_flags &= ~XBF_READ; + bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); + + xfs_buf_delwri_queue(bp, 1); +} + +/* + * Called when we want to stop a buffer from getting written or read. + * We attach the EIO error, muck with its flags, and call xfs_buf_ioend + * so that the proper iodone callbacks get called. + */ +STATIC int +xfs_bioerror( + xfs_buf_t *bp) +{ +#ifdef XFSERRORDEBUG + ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); +#endif + + /* + * No need to wait until the buffer is unpinned, we aren't flushing it. + */ + XFS_BUF_ERROR(bp, EIO); + + /* + * We're calling xfs_buf_ioend, so delete XBF_DONE flag. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + + xfs_buf_ioend(bp, 0); + + return EIO; +} + +/* + * Same as xfs_bioerror, except that we are releasing the buffer + * here ourselves, and avoiding the xfs_buf_ioend call. + * This is meant for userdata errors; metadata bufs come with + * iodone functions attached, so that we can track down errors. + */ +STATIC int +xfs_bioerror_relse( + struct xfs_buf *bp) +{ + int64_t fl = XFS_BUF_BFLAGS(bp); + /* + * No need to wait until the buffer is unpinned. + * We aren't flushing it. + * + * chunkhold expects B_DONE to be set, whether + * we actually finish the I/O or not. We don't want to + * change that interface. 
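The two write entry points above differ only in when the I/O happens: xfs_bwrite() pulls the buffer off any delayed-write queue, issues it and waits, while xfs_bdwrite() only parks it on the target's delwri queue for xfsbufd to flush later. Either way the caller gives up its lock on the buffer. A sketch of choosing between them (illustrative):

/* Illustrative only: write back a dirty, locked buffer. */
static int example_write_buf(struct xfs_mount *mp, struct xfs_buf *bp, int sync)
{
	if (sync)
		return xfs_bwrite(mp, bp);	/* issues I/O, waits, releases bp */

	xfs_bdwrite(mp, bp);			/* queues bp; xfsbufd writes it later */
	return 0;
}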
+ */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_CLR_IODONE_FUNC(bp); + if (!(fl & XBF_ASYNC)) { + /* + * Mark b_error and B_ERROR _both_. + * Lot's of chunkcache code assumes that. + * There's no reason to mark error for + * ASYNC buffers. + */ + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_FINISH_IOWAIT(bp); + } else { + xfs_buf_relse(bp); + } + + return EIO; +} + + +/* + * All xfs metadata buffers except log state machine buffers + * get this attached as their b_bdstrat callback function. + * This is so that we can catch a buffer + * after prematurely unpinning it to forcibly shutdown the filesystem. + */ +int +xfs_bdstrat_cb( + struct xfs_buf *bp) +{ + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + /* + * Metadata write that didn't get logged but + * written delayed anyway. These aren't associated + * with a transaction, and can be ignored. + */ + if (!bp->b_iodone && !XFS_BUF_ISREAD(bp)) + return xfs_bioerror_relse(bp); + else + return xfs_bioerror(bp); + } + + xfs_buf_iorequest(bp); + return 0; +} + +/* + * Wrapper around bdstrat so that we can stop data from going to disk in case + * we are shutting down the filesystem. Typically user data goes thru this + * path; one of the exceptions is the superblock. + */ +void +xfsbdstrat( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + xfs_bioerror_relse(bp); + return; + } + + xfs_buf_iorequest(bp); +} + +STATIC void +_xfs_buf_ioend( + xfs_buf_t *bp, + int schedule) +{ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend(bp, schedule); +} + +STATIC void +xfs_buf_bio_end_io( + struct bio *bio, + int error) +{ + xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + + xfs_buf_ioerror(bp, -error); + + if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); + + _xfs_buf_ioend(bp, 1); + bio_put(bio); +} + +STATIC void +_xfs_buf_ioapply( + xfs_buf_t *bp) +{ + int rw, map_i, total_nr_pages, nr_pages; + struct bio *bio; + int offset = bp->b_offset; + int size = bp->b_count_desired; + sector_t sector = bp->b_bn; + + total_nr_pages = bp->b_page_count; + map_i = 0; + + if (bp->b_flags & XBF_ORDERED) { + ASSERT(!(bp->b_flags & XBF_READ)); + rw = WRITE_FLUSH_FUA; + } else if (bp->b_flags & XBF_LOG_BUFFER) { + ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; + } else if (bp->b_flags & _XBF_RUN_QUEUES) { + ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META; + } else { + rw = (bp->b_flags & XBF_WRITE) ? WRITE : + (bp->b_flags & XBF_READ_AHEAD) ? 
READA : READ; + } + + +next_chunk: + atomic_inc(&bp->b_io_remaining); + nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); + if (nr_pages > total_nr_pages) + nr_pages = total_nr_pages; + + bio = bio_alloc(GFP_NOIO, nr_pages); + bio->bi_bdev = bp->b_target->bt_bdev; + bio->bi_sector = sector; + bio->bi_end_io = xfs_buf_bio_end_io; + bio->bi_private = bp; + + + for (; size && nr_pages; nr_pages--, map_i++) { + int rbytes, nbytes = PAGE_SIZE - offset; + + if (nbytes > size) + nbytes = size; + + rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); + if (rbytes < nbytes) + break; + + offset = 0; + sector += nbytes >> BBSHIFT; + size -= nbytes; + total_nr_pages--; + } + + if (likely(bio->bi_size)) { + if (xfs_buf_is_vmapped(bp)) { + flush_kernel_vmap_range(bp->b_addr, + xfs_buf_vmap_len(bp)); + } + submit_bio(rw, bio); + if (size) + goto next_chunk; + } else { + xfs_buf_ioerror(bp, EIO); + bio_put(bio); + } +} + +int +xfs_buf_iorequest( + xfs_buf_t *bp) +{ + trace_xfs_buf_iorequest(bp, _RET_IP_); + + if (bp->b_flags & XBF_DELWRI) { + xfs_buf_delwri_queue(bp, 1); + return 0; + } + + if (bp->b_flags & XBF_WRITE) { + xfs_buf_wait_unpin(bp); + } + + xfs_buf_hold(bp); + + /* Set the count to 1 initially, this will stop an I/O + * completion callout which happens before we have started + * all the I/O from calling xfs_buf_ioend too early. + */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + _xfs_buf_ioend(bp, 0); + + xfs_buf_rele(bp); + return 0; +} + +/* + * Waits for I/O to complete on the buffer supplied. + * It returns immediately if no I/O is pending. + * It returns the I/O error code, if any, or 0 if there was no error. + */ +int +xfs_buf_iowait( + xfs_buf_t *bp) +{ + trace_xfs_buf_iowait(bp, _RET_IP_); + + wait_for_completion(&bp->b_iowait); + + trace_xfs_buf_iowait_done(bp, _RET_IP_); + return bp->b_error; +} + +xfs_caddr_t +xfs_buf_offset( + xfs_buf_t *bp, + size_t offset) +{ + struct page *page; + + if (bp->b_flags & XBF_MAPPED) + return XFS_BUF_PTR(bp) + offset; + + offset += bp->b_offset; + page = bp->b_pages[offset >> PAGE_SHIFT]; + return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); +} + +/* + * Move data into or out of a buffer. + */ +void +xfs_buf_iomove( + xfs_buf_t *bp, /* buffer to process */ + size_t boff, /* starting buffer offset */ + size_t bsize, /* length to copy */ + void *data, /* data address */ + xfs_buf_rw_t mode) /* read/write/zero flag */ +{ + size_t bend, cpoff, csize; + struct page *page; + + bend = boff + bsize; + while (boff < bend) { + page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; + cpoff = xfs_buf_poff(boff + bp->b_offset); + csize = min_t(size_t, + PAGE_SIZE-cpoff, bp->b_count_desired-boff); + + ASSERT(((csize + cpoff) <= PAGE_SIZE)); + + switch (mode) { + case XBRW_ZERO: + memset(page_address(page) + cpoff, 0, csize); + break; + case XBRW_READ: + memcpy(data, page_address(page) + cpoff, csize); + break; + case XBRW_WRITE: + memcpy(page_address(page) + cpoff, data, csize); + } + + boff += csize; + data += csize; + } +} + +/* + * Handling of buffer targets (buftargs). + */ + +/* + * Wait for any bufs with callbacks that have been submitted but have not yet + * returned. These buffers will have an elevated hold count, so wait on those + * while freeing all the buffers only held by the LRU. 
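Because a buffer's pages are only virtually contiguous when XBF_MAPPED is set, callers copy data in and out through xfs_buf_iomove(), which walks the page array a page at a time; the xfs_buf_zero() macro in xfs_buf.h is a thin wrapper over the zeroing mode. Two illustrative uses:

/* Illustrative only: zero part of a buffer, then copy another part out. */
static void example_iomove(xfs_buf_t *bp, size_t off, size_t len, void *dst)
{
	xfs_buf_iomove(bp, off, len, NULL, XBRW_ZERO);	/* zero [off, off+len) */
	xfs_buf_iomove(bp, off, len, dst, XBRW_READ);	/* buffer -> dst */
}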
+ */ +void +xfs_wait_buftarg( + struct xfs_buftarg *btp) +{ + struct xfs_buf *bp; + +restart: + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + if (atomic_read(&bp->b_hold) > 1) { + spin_unlock(&btp->bt_lru_lock); + delay(100); + goto restart; + } + /* + * clear the LRU reference count so the bufer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + spin_unlock(&btp->bt_lru_lock); + xfs_buf_rele(bp); + spin_lock(&btp->bt_lru_lock); + } + spin_unlock(&btp->bt_lru_lock); +} + +int +xfs_buftarg_shrink( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + struct xfs_buf *bp; + int nr_to_scan = sc->nr_to_scan; + LIST_HEAD(dispose); + + if (!nr_to_scan) + return btp->bt_lru_nr; + + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + if (nr_to_scan-- <= 0) + break; + + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + list_move_tail(&bp->b_lru, &btp->bt_lru); + continue; + } + + /* + * remove the buffer from the LRU now to avoid needing another + * lock round trip inside xfs_buf_rele(). + */ + list_move(&bp->b_lru, &dispose); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); + + while (!list_empty(&dispose)) { + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); + } + + return btp->bt_lru_nr; +} + +void +xfs_free_buftarg( + struct xfs_mount *mp, + struct xfs_buftarg *btp) +{ + unregister_shrinker(&btp->bt_shrinker); + + xfs_flush_buftarg(btp, 1); + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); + + kthread_stop(btp->bt_task); + kmem_free(btp); +} + +STATIC int +xfs_setsize_buftarg_flags( + xfs_buftarg_t *btp, + unsigned int blocksize, + unsigned int sectorsize, + int verbose) +{ + btp->bt_bsize = blocksize; + btp->bt_sshift = ffs(sectorsize) - 1; + btp->bt_smask = sectorsize - 1; + + if (set_blocksize(btp->bt_bdev, sectorsize)) { + xfs_warn(btp->bt_mount, + "Cannot set_blocksize to %u on device %s\n", + sectorsize, XFS_BUFTARG_NAME(btp)); + return EINVAL; + } + + return 0; +} + +/* + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used is at this early stage. Play safe. 
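The shift and mask stored in the buftarg are what the alignment asserts at the top of _xfs_buf_find() check against: for 512-byte sectors bt_sshift is 9 and bt_smask is 0x1ff, for 4096-byte sectors they are 12 and 0xfff. A small restatement of that arithmetic (the sector sizes are examples, not values from the patch):

/* Illustrative only: mirrors the bt_sshift/bt_smask computation above. */
static int example_is_sector_aligned(xfs_off_t offset, unsigned int sectorsize)
{
	unsigned int	sshift = ffs(sectorsize) - 1;	/* 512 -> 9, 4096 -> 12 */
	xfs_off_t	smask = sectorsize - 1;		/* 512 -> 0x1ff, 4096 -> 0xfff */

	return sshift > 0 && (offset & smask) == 0;
}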
+ */ +STATIC int +xfs_setsize_buftarg_early( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + return xfs_setsize_buftarg_flags(btp, + PAGE_SIZE, bdev_logical_block_size(bdev), 0); +} + +int +xfs_setsize_buftarg( + xfs_buftarg_t *btp, + unsigned int blocksize, + unsigned int sectorsize) +{ + return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); +} + +STATIC int +xfs_alloc_delwrite_queue( + xfs_buftarg_t *btp, + const char *fsname) +{ + INIT_LIST_HEAD(&btp->bt_delwrite_queue); + spin_lock_init(&btp->bt_delwrite_lock); + btp->bt_flags = 0; + btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); + if (IS_ERR(btp->bt_task)) + return PTR_ERR(btp->bt_task); + return 0; +} + +xfs_buftarg_t * +xfs_alloc_buftarg( + struct xfs_mount *mp, + struct block_device *bdev, + int external, + const char *fsname) +{ + xfs_buftarg_t *btp; + + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + + btp->bt_mount = mp; + btp->bt_dev = bdev->bd_dev; + btp->bt_bdev = bdev; + btp->bt_bdi = blk_get_backing_dev_info(bdev); + if (!btp->bt_bdi) + goto error; + + INIT_LIST_HEAD(&btp->bt_lru); + spin_lock_init(&btp->bt_lru_lock); + if (xfs_setsize_buftarg_early(btp, bdev)) + goto error; + if (xfs_alloc_delwrite_queue(btp, fsname)) + goto error; + btp->bt_shrinker.shrink = xfs_buftarg_shrink; + btp->bt_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&btp->bt_shrinker); + return btp; + +error: + kmem_free(btp); + return NULL; +} + + +/* + * Delayed write buffer handling + */ +STATIC void +xfs_buf_delwri_queue( + xfs_buf_t *bp, + int unlock) +{ + struct list_head *dwq = &bp->b_target->bt_delwrite_queue; + spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + + trace_xfs_buf_delwri_queue(bp, _RET_IP_); + + ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); + + spin_lock(dwlk); + /* If already in the queue, dequeue and place at tail */ + if (!list_empty(&bp->b_list)) { + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + if (unlock) + atomic_dec(&bp->b_hold); + list_del(&bp->b_list); + } + + if (list_empty(dwq)) { + /* start xfsbufd as it is about to have something to do */ + wake_up_process(bp->b_target->bt_task); + } + + bp->b_flags |= _XBF_DELWRI_Q; + list_add_tail(&bp->b_list, dwq); + bp->b_queuetime = jiffies; + spin_unlock(dwlk); + + if (unlock) + xfs_buf_unlock(bp); +} + +void +xfs_buf_delwri_dequeue( + xfs_buf_t *bp) +{ + spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + int dequeued = 0; + + spin_lock(dwlk); + if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + list_del_init(&bp->b_list); + dequeued = 1; + } + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); + spin_unlock(dwlk); + + if (dequeued) + xfs_buf_rele(bp); + + trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); +} + +/* + * If a delwri buffer needs to be pushed before it has aged out, then promote + * it to the head of the delwri queue so that it will be flushed on the next + * xfsbufd run. We do this by resetting the queuetime of the buffer to be older + * than the age currently needed to flush the buffer. Hence the next time the + * xfsbufd sees it is guaranteed to be considered old enough to flush. + */ +void +xfs_buf_delwri_promote( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1; + + ASSERT(bp->b_flags & XBF_DELWRI); + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + + /* + * Check the buffer age before locking the delayed write queue as we + * don't need to promote buffers that are already past the flush age. 
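The age used below, like the timeout in xfsbufd, is a tunable expressed in centiseconds; since one centisecond is ten milliseconds, converting it to jiffies is a multiply by msecs_to_jiffies(10). A worked illustration (the 1500-centisecond figure, i.e. 15 seconds, is an assumption about typical tuning, not taken from this patch):

/* Illustrative only: convert a centisecond tunable such as
 * xfs_buf_age_centisecs into jiffies. */
static long example_age_to_jiffies(int age_centisecs)
{
	return age_centisecs * msecs_to_jiffies(10);	/* e.g. 1500 -> 15 seconds of jiffies */
}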
+ */ + if (bp->b_queuetime < jiffies - age) + return; + bp->b_queuetime = jiffies - age; + spin_lock(&btp->bt_delwrite_lock); + list_move(&bp->b_list, &btp->bt_delwrite_queue); + spin_unlock(&btp->bt_delwrite_lock); +} + +STATIC void +xfs_buf_runall_queues( + struct workqueue_struct *queue) +{ + flush_workqueue(queue); +} + +/* + * Move as many buffers as specified to the supplied list + * idicating if we skipped any buffers to prevent deadlocks. + */ +STATIC int +xfs_buf_delwri_split( + xfs_buftarg_t *target, + struct list_head *list, + unsigned long age) +{ + xfs_buf_t *bp, *n; + struct list_head *dwq = &target->bt_delwrite_queue; + spinlock_t *dwlk = &target->bt_delwrite_lock; + int skipped = 0; + int force; + + force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); + INIT_LIST_HEAD(list); + spin_lock(dwlk); + list_for_each_entry_safe(bp, n, dwq, b_list) { + ASSERT(bp->b_flags & XBF_DELWRI); + + if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { + if (!force && + time_before(jiffies, bp->b_queuetime + age)) { + xfs_buf_unlock(bp); + break; + } + + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| + _XBF_RUN_QUEUES); + bp->b_flags |= XBF_WRITE; + list_move_tail(&bp->b_list, list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } else + skipped++; + } + spin_unlock(dwlk); + + return skipped; + +} + +/* + * Compare function is more complex than it needs to be because + * the return value is only 32 bits and we are doing comparisons + * on 64 bit values + */ +static int +xfs_buf_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); + struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); + xfs_daddr_t diff; + + diff = ap->b_bn - bp->b_bn; + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +void +xfs_buf_delwri_sort( + xfs_buftarg_t *target, + struct list_head *list) +{ + list_sort(NULL, list, xfs_buf_cmp); +} + +STATIC int +xfsbufd( + void *data) +{ + xfs_buftarg_t *target = (xfs_buftarg_t *)data; + + current->flags |= PF_MEMALLOC; + + set_freezable(); + + do { + long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); + long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); + struct list_head tmp; + struct blk_plug plug; + + if (unlikely(freezing(current))) { + set_bit(XBT_FORCE_SLEEP, &target->bt_flags); + refrigerator(); + } else { + clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); + } + + /* sleep for a long time if there is nothing to do. */ + if (list_empty(&target->bt_delwrite_queue)) + tout = MAX_SCHEDULE_TIMEOUT; + schedule_timeout_interruptible(tout); + + xfs_buf_delwri_split(target, &tmp, age); + list_sort(NULL, &tmp, xfs_buf_cmp); + + blk_start_plug(&plug); + while (!list_empty(&tmp)) { + struct xfs_buf *bp; + bp = list_first_entry(&tmp, struct xfs_buf, b_list); + list_del_init(&bp->b_list); + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); + } while (!kthread_should_stop()); + + return 0; +} + +/* + * Go through all incore buffers, and release buffers if they belong to + * the given device. This is used in filesystem error handling to + * preserve the consistency of its metadata. 
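Both xfsbufd above and xfs_flush_buftarg() below sort the list they split off by b_bn before submitting it, so the block layer sees roughly ascending disk addresses and can merge adjacent requests. A condensed sketch of that submit loop (plugging and the wait path are left out):

/* Illustrative only: sort a private delwri list and push it to disk. */
static void example_flush_list(struct list_head *list)
{
	struct xfs_buf	*bp;

	list_sort(NULL, list, xfs_buf_cmp);		/* ascending b_bn order */
	while (!list_empty(list)) {
		bp = list_first_entry(list, struct xfs_buf, b_list);
		list_del_init(&bp->b_list);
		xfs_bdstrat_cb(bp);			/* issue the (async) write */
	}
}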
+ */ +int +xfs_flush_buftarg( + xfs_buftarg_t *target, + int wait) +{ + xfs_buf_t *bp; + int pincount = 0; + LIST_HEAD(tmp_list); + LIST_HEAD(wait_list); + struct blk_plug plug; + + xfs_buf_runall_queues(xfsconvertd_workqueue); + xfs_buf_runall_queues(xfsdatad_workqueue); + xfs_buf_runall_queues(xfslogd_workqueue); + + set_bit(XBT_FORCE_FLUSH, &target->bt_flags); + pincount = xfs_buf_delwri_split(target, &tmp_list, 0); + + /* + * Dropped the delayed write list lock, now walk the temporary list. + * All I/O is issued async and then if we need to wait for completion + * we do that after issuing all the IO. + */ + list_sort(NULL, &tmp_list, xfs_buf_cmp); + + blk_start_plug(&plug); + while (!list_empty(&tmp_list)) { + bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); + ASSERT(target == bp->b_target); + list_del_init(&bp->b_list); + if (wait) { + bp->b_flags &= ~XBF_ASYNC; + list_add(&bp->b_list, &wait_list); + } + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); + + if (wait) { + /* Wait for IO to complete. */ + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); + + list_del_init(&bp->b_list); + xfs_buf_iowait(bp); + xfs_buf_relse(bp); + } + } + + return pincount; +} + +int __init +xfs_buf_init(void) +{ + xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", + KM_ZONE_HWALIGN, NULL); + if (!xfs_buf_zone) + goto out; + + xfslogd_workqueue = alloc_workqueue("xfslogd", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); + if (!xfslogd_workqueue) + goto out_free_buf_zone; + + xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1); + if (!xfsdatad_workqueue) + goto out_destroy_xfslogd_workqueue; + + xfsconvertd_workqueue = alloc_workqueue("xfsconvertd", + WQ_MEM_RECLAIM, 1); + if (!xfsconvertd_workqueue) + goto out_destroy_xfsdatad_workqueue; + + return 0; + + out_destroy_xfsdatad_workqueue: + destroy_workqueue(xfsdatad_workqueue); + out_destroy_xfslogd_workqueue: + destroy_workqueue(xfslogd_workqueue); + out_free_buf_zone: + kmem_zone_destroy(xfs_buf_zone); + out: + return -ENOMEM; +} + +void +xfs_buf_terminate(void) +{ + destroy_workqueue(xfsconvertd_workqueue); + destroy_workqueue(xfsdatad_workqueue); + destroy_workqueue(xfslogd_workqueue); + kmem_zone_destroy(xfs_buf_zone); +} + +#ifdef CONFIG_KDB_MODULES +struct list_head * +xfs_get_buftarg_list(void) +{ + return &xfs_buftarg_list; +} +#endif diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h new file mode 100644 index 00000000..36d6ee44 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BUF_H__ +#define __XFS_BUF_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Base types + */ + +#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) + +#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) +#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) +#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) +#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) + +typedef enum { + XBRW_READ = 1, /* transfer into target memory */ + XBRW_WRITE = 2, /* transfer from target memory */ + XBRW_ZERO = 3, /* Zero target memory */ +} xfs_buf_rw_t; + +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ +#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ +#define XBF_ORDERED (1 << 11)/* use ordered writes */ +#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ +#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ + +/* flags used only as arguments to access routines */ +#define XBF_LOCK (1 << 14)/* lock requested */ +#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */ +#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ + +/* flags used only internally */ +#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ +#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ +#define _XBF_KMEM (1 << 20)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ + +typedef unsigned int xfs_buf_flags_t; + +#define XFS_BUF_FLAGS \ + { XBF_READ, "READ" }, \ + { XBF_WRITE, "WRITE" }, \ + { XBF_MAPPED, "MAPPED" }, \ + { XBF_ASYNC, "ASYNC" }, \ + { XBF_DONE, "DONE" }, \ + { XBF_DELWRI, "DELWRI" }, \ + { XBF_STALE, "STALE" }, \ + { XBF_ORDERED, "ORDERED" }, \ + { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_LOCK, "LOCK" }, /* should never be set */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ + { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ + { _XBF_PAGES, "PAGES" }, \ + { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ + { _XBF_KMEM, "KMEM" }, \ + { _XBF_DELWRI_Q, "DELWRI_Q" } + +typedef enum { + XBT_FORCE_SLEEP = 0, + XBT_FORCE_FLUSH = 1, +} xfs_buftarg_flags_t; + +typedef struct xfs_bufhash { + struct list_head bh_list; + spinlock_t bh_lock; +} xfs_bufhash_t; + +typedef struct xfs_buftarg { + dev_t bt_dev; + struct block_device *bt_bdev; + struct backing_dev_info *bt_bdi; + struct xfs_mount *bt_mount; + unsigned int bt_bsize; + unsigned int bt_sshift; + size_t bt_smask; + + /* per device delwri queue */ + struct task_struct *bt_task; + struct list_head bt_delwrite_queue; + spinlock_t bt_delwrite_lock; + unsigned long bt_flags; + + /* LRU control structures */ + struct shrinker bt_shrinker; + struct list_head bt_lru; + spinlock_t bt_lru_lock; + unsigned int bt_lru_nr; +} xfs_buftarg_t; + +struct xfs_buf; +typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + +#define XB_PAGES 2 + +typedef struct xfs_buf { + /* + * first cacheline holds all the fields needed for an 
uncontended cache + * hit to be fully processed. The semaphore straddles the cacheline + * boundary, but the counter and lock sits on the first cacheline, + * which is the only bit that is touched if we hit the semaphore + * fast-path on locking. + */ + struct rb_node b_rbnode; /* rbtree node */ + xfs_off_t b_file_offset; /* offset in file */ + size_t b_buffer_length;/* size of buffer in bytes */ + atomic_t b_hold; /* reference count */ + atomic_t b_lru_ref; /* lru reclaim ref count */ + xfs_buf_flags_t b_flags; /* status flags */ + struct semaphore b_sema; /* semaphore for lockables */ + + struct list_head b_lru; /* lru list */ + wait_queue_head_t b_waiters; /* unpin waiters */ + struct list_head b_list; + struct xfs_perag *b_pag; /* contains rbtree root */ + xfs_buftarg_t *b_target; /* buffer target (device) */ + xfs_daddr_t b_bn; /* block number for I/O */ + size_t b_count_desired;/* desired transfer size */ + void *b_addr; /* virtual address of buffer */ + struct work_struct b_iodone_work; + xfs_buf_iodone_t b_iodone; /* I/O completion function */ + struct completion b_iowait; /* queue for I/O waiters */ + void *b_fspriv; + void *b_fspriv2; + struct page **b_pages; /* array of page pointers */ + struct page *b_page_array[XB_PAGES]; /* inline pages */ + unsigned long b_queuetime; /* time buffer was queued */ + atomic_t b_pin_count; /* pin count */ + atomic_t b_io_remaining; /* #outstanding I/O requests */ + unsigned int b_page_count; /* size of page array */ + unsigned int b_offset; /* page offset in first page */ + unsigned short b_error; /* error code on I/O */ +#ifdef XFS_BUF_LOCK_TRACKING + int b_last_holder; +#endif +} xfs_buf_t; + + +/* Finding and Reading Buffers */ +extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t, xfs_buf_t *); +#define xfs_incore(buftarg,blkno,len,lockit) \ + _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) + +extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); +extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); + +extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); +extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); +extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); +extern void xfs_buf_hold(xfs_buf_t *); +extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); +struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, + struct xfs_buftarg *target, + xfs_daddr_t daddr, size_t length, int flags); + +/* Releasing Buffers */ +extern void xfs_buf_free(xfs_buf_t *); +extern void xfs_buf_rele(xfs_buf_t *); + +/* Locking and Unlocking Buffers */ +extern int xfs_buf_cond_lock(xfs_buf_t *); +extern int xfs_buf_lock_value(xfs_buf_t *); +extern void xfs_buf_lock(xfs_buf_t *); +extern void xfs_buf_unlock(xfs_buf_t *); + +/* Buffer Read and Write Routines */ +extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); +extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); + +extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); +extern int xfs_bdstrat_cb(struct xfs_buf *); + +extern void xfs_buf_ioend(xfs_buf_t *, int); +extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern int xfs_buf_iorequest(xfs_buf_t *); +extern int xfs_buf_iowait(xfs_buf_t *); +extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, + xfs_buf_rw_t); +#define xfs_buf_zero(bp, off, len) \ + xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) + +static 
inline int xfs_buf_geterror(xfs_buf_t *bp) +{ + return bp ? bp->b_error : ENOMEM; +} + +/* Buffer Utility Routines */ +extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); + +/* Delayed Write Buffer Routines */ +extern void xfs_buf_delwri_dequeue(xfs_buf_t *); +extern void xfs_buf_delwri_promote(xfs_buf_t *); + +/* Buffer Daemon Setup Routines */ +extern int xfs_buf_init(void); +extern void xfs_buf_terminate(void); + +#define xfs_buf_target_name(target) \ + ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) + + +#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) +#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ + ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) + +void xfs_buf_stale(struct xfs_buf *bp); +#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); +#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) +#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) +#define XFS_BUF_SUPER_STALE(bp) do { \ + XFS_BUF_STALE(bp); \ + xfs_buf_delwri_dequeue(bp); \ + XFS_BUF_DONE(bp); \ + } while (0) + +#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) +#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) +#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) + +#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) +#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) +#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) + +#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) +#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) +#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) + +#define XFS_BUF_BUSY(bp) do { } while (0) +#define XFS_BUF_UNBUSY(bp) do { } while (0) +#define XFS_BUF_ISBUSY(bp) (1) + +#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) +#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) +#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) + +#define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED) +#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) +#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) + +#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) +#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) +#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) +#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) + +#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) +#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) +#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) + +#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) +#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) +#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) + +#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) +#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) +#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) +#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) +#define XFS_BUF_SET_START(bp) do { } while (0) + +#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) +#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) +#define XFS_BUF_ADDR(bp) ((bp)->b_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) +#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) +#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) +#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) +#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) +#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) +#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) + +static inline void +xfs_buf_set_ref( + 
struct xfs_buf *bp, + int lru_ref) +{ + atomic_set(&bp->b_lru_ref, lru_ref); +} +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) +#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) + +#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) + +#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) +#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) +#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) +#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) +#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); + +#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) +#define XFS_BUF_TARGET(bp) ((bp)->b_target) +#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) + +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + xfs_buf_unlock(bp); + xfs_buf_rele(bp); +} + +/* + * Handling of buftargs. + */ +extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, + struct block_device *, int, const char *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); +extern void xfs_wait_buftarg(xfs_buftarg_t *); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); +extern int xfs_flush_buftarg(xfs_buftarg_t *, int); + +#ifdef CONFIG_KDB_MODULES +extern struct list_head *xfs_get_buftarg_list(void); +#endif + +#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) +#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) + +#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) + +#endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c new file mode 100644 index 00000000..572494fa --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_discard.h" +#include "xfs_trace.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_fsblock_t start, + xfs_fsblock_t len, + xfs_fsblock_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. 
This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_le(cur, 0, + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + + /* + * Too small? Give up. + */ + if (flen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || + XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = -blkdev_issue_discard(bdev, + XFS_AGB_TO_DADDR(mp, agno, fbno), + XFS_FSB_TO_BB(mp, flen), + GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_fsblock_t start, len, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (!blk_queue_discard(q)) + return -XFS_ERROR(EOPNOTSUPP); + if (copy_from_user(&range, urange, sizeof(range))) + return -XFS_ERROR(EFAULT); + + /* + * Truncating down the len isn't actually quite correct, but using + * XFS_B_TO_FSB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. 
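This handler is reached through the generic FITRIM ioctl, which hands it the fstrim_range copied in above. A minimal userspace sketch of a caller (not part of this patch; the mount point path is a placeholder):

/* Illustrative only: trim every free extent of a mounted XFS filesystem. */
#include <fcntl.h>
#include <limits.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int example_trim_all(const char *mountpoint)
{
	struct fstrim_range range = { .start = 0, .len = ULLONG_MAX, .minlen = 0 };
	int fd = open(mountpoint, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, FITRIM, &range);	/* on success range.len holds bytes trimmed */
	close(fd);
	return ret;
}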
+ */ + start = XFS_B_TO_FSBT(mp, range.start); + len = XFS_B_TO_FSBT(mp, range.len); + minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + + start_agno = XFS_FSB_TO_AGNO(mp, start); + if (start_agno >= mp->m_sb.sb_agcount) + return -XFS_ERROR(EINVAL); + + end_agno = XFS_FSB_TO_AGNO(mp, start + len); + if (end_agno >= mp->m_sb.sb_agcount) + end_agno = mp->m_sb.sb_agcount - 1; + + for (agno = start_agno; agno <= end_agno; agno++) { + error = -xfs_trim_extents(mp, agno, start, len, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -XFS_ERROR(EFAULT); + return 0; +} + +int +xfs_discard_extents( + struct xfs_mount *mp, + struct list_head *list) +{ + struct xfs_busy_extent *busyp; + int error = 0; + + list_for_each_entry(busyp, list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, 0); + if (error && error != EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llu,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + return error; + } + } + + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h new file mode 100644 index 00000000..344879ae --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.h @@ -0,0 +1,10 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; +struct list_head; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); + +#endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c new file mode 100644 index 00000000..fed3f3c8 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_export.h" +#include "xfs_vnodeops.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" + +/* + * Note that we only accept fileids which are long enough rather than allow + * the parent generation number to default to zero. XFS considers zero a + * valid generation number not an invalid/wildcard value. 
+ */ +static int xfs_fileid_length(int fileid_type) +{ + switch (fileid_type) { + case FILEID_INO32_GEN: + return 2; + case FILEID_INO32_GEN_PARENT: + return 4; + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + return 3; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + return 6; + } + return 255; /* invalid */ +} + +STATIC int +xfs_fs_encode_fh( + struct dentry *dentry, + __u32 *fh, + int *max_len, + int connectable) +{ + struct fid *fid = (struct fid *)fh; + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; + struct inode *inode = dentry->d_inode; + int fileid_type; + int len; + + /* Directories don't need their parent encoded, they have ".." */ + if (S_ISDIR(inode->i_mode) || !connectable) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + + /* + * If the the filesystem may contain 64bit inode numbers, we need + * to use larger file handles that can represent them. + * + * While we only allocate inodes that do not fit into 32 bits any + * large enough filesystem may contain them, thus the slightly + * confusing looking conditional below. + */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || + (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) + fileid_type |= XFS_FILEID_TYPE_64FLAG; + + /* + * Only encode if there is enough space given. In practice + * this means we can't export a filesystem with 64bit inodes + * over NFSv2 with the subtree_check export option; the other + * seven combinations work. The real answer is "don't use v2". + */ + len = xfs_fileid_length(fileid_type); + if (*max_len < len) { + *max_len = len; + return 255; + } + *max_len = len; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + spin_lock(&dentry->d_lock); + fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; + fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; + spin_unlock(&dentry->d_lock); + /*FALLTHRU*/ + case FILEID_INO32_GEN: + fid->i32.ino = XFS_I(inode)->i_ino; + fid->i32.gen = inode->i_generation; + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + spin_lock(&dentry->d_lock); + fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; + fid64->parent_gen = dentry->d_parent->d_inode->i_generation; + spin_unlock(&dentry->d_lock); + /*FALLTHRU*/ + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + fid64->ino = XFS_I(inode)->i_ino; + fid64->gen = inode->i_generation; + break; + } + + return fileid_type; +} + +STATIC struct inode * +xfs_nfs_get_inode( + struct super_block *sb, + u64 ino, + u32 generation) + { + xfs_mount_t *mp = XFS_M(sb); + xfs_inode_t *ip; + int error; + + /* + * NFS can sometimes send requests for ino 0. Fail them gracefully. + */ + if (ino == 0) + return ERR_PTR(-ESTALE); + + /* + * The XFS_IGET_UNTRUSTED means that an invalid inode number is just + * fine and not an indication of a corrupted filesystem as clients can + * send invalid file handles and we have to handle it gracefully.. + */ + error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip); + if (error) { + /* + * EINVAL means the inode cluster doesn't exist anymore. + * This implies the filehandle is stale, so we should + * translate it here. + * We don't use ESTALE directly down the chain to not + * confuse applications using bulkstat that expect EINVAL. 
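The lengths returned by xfs_fileid_length() are counted in 32-bit words, the unit the exportfs *max_len argument uses; the largest XFS variant, type 0x82, therefore needs six words, i.e. 24 bytes of fileid data. A small restatement (the byte figures follow directly from the function above):

/* Illustrative only: fileid payload size in bytes for a given handle type. */
static int example_fileid_bytes(int fileid_type)
{
	/* FILEID_INO32_GEN -> 8, 0x81 -> 12, FILEID_INO32_GEN_PARENT -> 16, 0x82 -> 24 */
	return xfs_fileid_length(fileid_type) * sizeof(__u32);
}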
+ */ + if (error == EINVAL) + error = ESTALE; + return ERR_PTR(-error); + } + + if (ip->i_d.di_gen != generation) { + IRELE(ip); + return ERR_PTR(-ENOENT); + } + + return VFS_I(ip); +} + +STATIC struct dentry * +xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + case FILEID_INO32_GEN: + inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, + fid->i32.parent_gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->parent_ino, + fid64->parent_gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_get_parent( + struct dentry *child) +{ + int error; + struct xfs_inode *cip; + + error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); + if (unlikely(error)) + return ERR_PTR(-error); + + return d_obtain_alias(VFS_I(cip)); +} + +STATIC int +xfs_fs_nfs_commit_metadata( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) { + error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, NULL); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + return error; +} + +const struct export_operations xfs_export_operations = { + .encode_fh = xfs_fs_encode_fh, + .fh_to_dentry = xfs_fs_fh_to_dentry, + .fh_to_parent = xfs_fs_fh_to_parent, + .get_parent = xfs_fs_get_parent, + .commit_metadata = xfs_fs_nfs_commit_metadata, +}; diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h new file mode 100644 index 00000000..3272b6ae --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_export.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXPORT_H__ +#define __XFS_EXPORT_H__ + +/* + * Common defines for code related to exporting XFS filesystems over NFS. + * + * The NFS fileid goes out on the wire as an array of + * 32bit unsigned ints in host order. There are 5 possible + * formats. 
+ * + * (1) fileid_type=0x00 + * (no fileid data; handled by the generic code) + * + * (2) fileid_type=0x01 + * inode-num + * generation + * + * (3) fileid_type=0x02 + * inode-num + * generation + * parent-inode-num + * parent-generation + * + * (4) fileid_type=0x81 + * inode-num-lo32 + * inode-num-hi32 + * generation + * + * (5) fileid_type=0x82 + * inode-num-lo32 + * inode-num-hi32 + * generation + * parent-inode-num-lo32 + * parent-inode-num-hi32 + * parent-generation + * + * Note, the NFS filehandle also includes an fsid portion which + * may have an inode number in it. That number is hardcoded to + * 32bits and there is no way for XFS to intercept it. In + * practice this means when exporting an XFS filesystem with 64bit + * inodes you should either export the mountpoint (rather than + * a subdirectory) or use the "fsid" export option. + */ + +struct xfs_fid64 { + u64 ino; + u32 gen; + u64 parent_ino; + u32 parent_gen; +} __attribute__((packed)); + +/* This flag goes on the wire. Don't play with it. */ +#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ + +#endif /* __XFS_EXPORT_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c new file mode 100644 index 00000000..b679198d --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -0,0 +1,1114 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_trans.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_vnodeops.h" +#include "xfs_da_btree.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" + +#include +#include + +static const struct vm_operations_struct xfs_file_vm_ops; + +/* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. 
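The xfs_rw_ilock helpers above exist to pin down lock ordering on the read and write paths: when the exclusive iolock is wanted, the VFS i_mutex is taken first and released last. A sketch of the resulting critical-section shape (illustrative; the real callers appear later in this file):

/* Illustrative only: exclusive-I/O critical section using the helpers above. */
static void example_exclusive_io(struct xfs_inode *ip)
{
	xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);	/* i_mutex first, then the iolock */
	/* ... work serialised against other readers and writers ... */
	xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);	/* iolock dropped first, then i_mutex */
}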
If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +STATIC int +xfs_iozero( + struct xfs_inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count) /* size of data to zero */ +{ + struct page *page; + struct address_space *mapping; + int status; + + mapping = VFS_I(ip)->i_mapping; + do { + unsigned offset, bytes; + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; + + zero_user(page, offset, bytes); + + status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, + page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + pos += bytes; + count -= bytes; + status = 0; + } while (count); + + return (-status); +} + +STATIC int +xfs_file_fsync( + struct file *file, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error = 0; + int log_flushed = 0; + + trace_xfs_file_fsync(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + + xfs_ioend_wait(ip); + + if (mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an RT and/or log subvolume we need to make sure + * to flush the write cache of the device used for file data + * first. This is to ensure newly written file data makes + * it to disk before logging the new inode size in case of + * an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + } + + /* + * We always need to make sure that the required inode state is safe on + * disk. The inode might be clean but we still might need to force the + * log because of committed transactions that haven't hit the disk yet. + * Likewise, there could be unflushed non-transactional changes to the + * inode core that have to go to disk and this requires us to issue + * a synchronous transaction to capture these changes correctly. + * + * This code relies on the assumption that if the i_update_core field + * of the inode is clear and the inode is unpinned then it is clean + * and no action is required. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + + /* + * First check if the VFS inode is marked dirty. All the dirtying + * of non-transactional updates now goes through mark_inode_dirty*, + * which allows us to distinguish between pure timestamp updates + * and i_size updates which need to be caught for fdatasync. + * After that also check for the dirty state in the XFS inode, which + * might get cleared when the inode gets written out via the AIL + * or xfs_iflush_cluster. + */ + if (((inode->i_state & I_DIRTY_DATASYNC) || + ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && + ip->i_update_core) { + /* + * Kick off a transaction to log the inode core to get the + * updates. The sync transaction will also force the log.
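 + * (For illustration: the transaction built below logs only XFS_ILOG_CORE and + * is marked synchronous, so committing it both captures the in-core inode + * fields and waits for the resulting log I/O to complete.)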
+ */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, + XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return -error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Note - it's possible that we might have pushed ourselves out + * of the way during trans_reserve which would flush the inode. + * But there's no guarantee that the inode buffer has actually + * gone out yet (it's delwri). Plus the buffer could be pinned + * anyway if it's part of an inode in another recent + * transaction. So we play it safe and fire off the + * transaction anyway. + */ + xfs_trans_ijoin(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = _xfs_trans_commit(tp, 0, &log_flushed); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } else { + /* + * Timestamps/size haven't changed since last inode flush or + * inode transaction commit. That means either nothing got + * written or a transaction committed which caught the updates. + * If the latter happened and the transaction hasn't hit the + * disk yet, the inode will be still be pinned. If it is, + * force the log. + */ + if (xfs_ipincount(ip)) { + error = _xfs_log_force_lsn(mp, + ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, &log_flushed); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + } + + /* + * If we only have a single device, and the log force about was + * a no-op we might have to flush the data device cache here. + * This can only happen for fdatasync/O_DSYNC if we were overwriting + * an already allocated file and thus do not have any metadata to + * commit. + */ + if ((mp->m_flags & XFS_MOUNT_BARRIER) && + mp->m_logdev_targp == mp->m_ddev_targp && + !XFS_IS_REALTIME_INODE(ip) && + !log_flushed) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + + return -error; +} + +STATIC ssize_t +xfs_file_aio_read( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + size_t size = 0; + ssize_t ret = 0; + int ioflags = 0; + xfs_fsize_t n; + unsigned long seg; + + XFS_STATS_INC(xs_read_calls); + + BUG_ON(iocb->ki_pos != pos); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + /* START copy & waste from filemap.c */ + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + size += iv->iov_len; + if (unlikely((ssize_t)(size|iv->iov_len) < 0)) + return XFS_ERROR(-EINVAL); + } + /* END copy & waste from filemap.c */ + + if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + if ((iocb->ki_pos & target->bt_smask) || + (size & target->bt_smask)) { + if (iocb->ki_pos == ip->i_size) + return 0; + return -XFS_ERROR(EINVAL); + } + } + + n = XFS_MAXIOFFSET(mp) - iocb->ki_pos; + if (n <= 0 || size == 0) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * Locking is a bit tricky here. If we take an exclusive lock + * for direct IO, we effectively serialise all new concurrent + * read IO to this file and block it behind IO that is currently in + * progress because IO in progress holds the IO lock shared. 
We only + * need to hold the lock exclusive to blow away the page cache, so + * only take lock exclusively if the page cache needs invalidation. + * This allows the normal direct IO case of no page cache pages to + * proceeed concurrently without serialisation. + */ + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + + if (inode->i_mapping->nrpages) { + ret = -xfs_flushinval_pages(ip, + (iocb->ki_pos & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } + } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } + + trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); + + ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +STATIC ssize_t +xfs_file_splice_read( + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, + unsigned int flags) +{ + struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); + int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); + + if (infilp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + + trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occurred. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip) +{ + if (ip->i_new_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * xfs_file_splice_write() does not use xfs_rw_ilock() because + * generic_file_splice_write() takes the i_mutex itself. This, in theory, + * couuld cause lock inversions between the aio_write path and the splice path + * if someone is doing concurrent splice(2) based writes and write(2) based + * writes to the same inode. The only real way to fix this is to re-implement + * the generic code here with correct locking orders. 
+ */ +STATIC ssize_t +xfs_file_splice_write( + struct pipe_inode_info *pipe, + struct file *outfilp, + loff_t *ppos, + size_t count, + unsigned int flags) +{ + struct inode *inode = outfilp->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t new_size; + int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_write_calls); + + if (outfilp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + ip->i_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + trace_xfs_file_splice_write(ip, count, *ppos, ioflags); + + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + + xfs_aio_write_isize_update(inode, ppos, ret); + xfs_aio_write_newsize_update(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last + * block of the file that is beyond the EOF. We do this since the + * size is being increased without writing anything to that block + * and we don't want anyone to read the garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + xfs_inode_t *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) +{ + xfs_fileoff_t last_fsb; + xfs_mount_t *mp = ip->i_mount; + int nimaps; + int zero_offset; + int zero_len; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + zero_offset = XFS_B_FSB_OFFSET(mp, isize); + if (zero_offset == 0) { + /* + * There are no extra bytes in the last block on disk to + * zero, so return. + */ + return 0; + } + + last_fsb = XFS_B_TO_FSBT(mp, isize); + nimaps = 1; + error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, + &nimaps, NULL); + if (error) { + return error; + } + ASSERT(nimaps > 0); + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) { + return 0; + } + /* + * Zero the part of the last block beyond the EOF, and write it + * out sync. We need to drop the ilock while we do this so we + * don't deadlock when the buffer cache calls back to us. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_len = mp->m_sb.sb_blocksize - zero_offset; + if (isize + zero_len > offset) + zero_len = offset - isize; + error = xfs_iozero(ip, isize, zero_len); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +/* + * Zero any on disk space between the current EOF and the new, + * larger EOF. This handles the normal case of zeroing the remainder + * of the last block in the file and the unusual case of zeroing blocks + * out beyond the size of the file. This second case only happens + * with fixed size extents and when the system crashes before the inode + * size was updated but after blocks were allocated. If fill is set, + * then any holes in the range are filled and zeroed. If not, the holes + * are left alone as holes. 
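 + * (As implemented below there is no fill argument; holes and unwritten + * extents are simply skipped, so only allocated, written blocks get zeroed.)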
+ */ + +int /* error (positive) */ +xfs_zero_eof( + xfs_inode_t *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT(offset > isize); + + /* + * First handle zeroing the block on which isize resides. + * We only zero a part of that block so it is handled specially. + */ + error = xfs_zero_last_block(ip, offset, isize); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + + /* + * Calculate the range between the new size and the old + * where blocks needing to be zeroed may exist. To get the + * block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back + * to a block boundary. We subtract 1 in case the size is + * exactly on a block boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, + 0, NULL, 0, &imap, &nimaps, NULL); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + /* + * This loop handles initializing pages that were + * partially initialized by the code below this + * loop. It basically zeroes the part of the page + * that sits on a hole and sets the page as P_HOLE + * and calls remapf if it is a mapped file. + */ + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks we need to zero. + * Drop the inode lock while we're doing the I/O. + * We'll still have the iolock to protect us. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); + zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + + if ((zero_off + zero_len) > offset) + zero_len = offset - zero_off; + + error = xfs_iozero(ip, zero_off, zero_len); + if (error) { + goto out_lock; + } + + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + } + + return 0; + +out_lock: + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +/* + * Common pre-write limit and setup checks. + * + * Returns with iolock held according to @iolock. 
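 + * (Note: if generic_write_checks() fails, the iolock is dropped here as well + * and @iolock is cleared to 0 before returning.)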
+ */ +STATIC ssize_t +xfs_file_aio_write_checks( + struct file *file, + loff_t *pos, + size_t *count, + int *iolock) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t new_size; + int error = 0; + + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); + if (error) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; + return error; + } + + new_size = *pos + *count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; + + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write. + */ + if (*pos > ip->i_size) + error = -xfs_zero_eof(ip, *pos, ip->i_size); + + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + return file_remove_suid(file); + +} + +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By separating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. + */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + size_t count = ocount; + int unaligned_io = 0; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 
+ mp->m_rtdev_targp : mp->m_ddev_targp; + + *iolock = 0; + if ((pos & target->bt_smask) || (count & target->bt_smask)) + return -XFS_ERROR(EINVAL); + + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + if (unaligned_io || mapping->nrpages || pos > ip->i_size) + *iolock = XFS_IOLOCK_EXCL; + else + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, *iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + + /* + * Recheck if there are cached pages that need invalidate after we got + * the iolock to protect against other threads adding new pages while + * we were waiting for the iolock. + */ + if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + } + + if (mapping->nrpages) { + ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, + FI_REMAPF_LOCKED); + if (ret) + return ret; + } + + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + xfs_ioend_wait(ip); + else if (*iolock == XFS_IOLOCK_EXCL) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + *iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); + + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; + size_t count = ocount; + + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we aren't holding any + * page locks and retry *once* + */ + if (ret == -ENOSPC && !enospc) { + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (ret) + return ret; + enospc = 1; + goto write_retry; + } + current->backing_dev_info = NULL; + return ret; +} + +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int iolock; + size_t ocount = 0; + + XFS_STATS_INC(xs_write_calls); + + BUG_ON(iocb->ki_pos != pos); + + ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return ret; + + if (ocount == 0) + return 0; + + xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (unlikely(file->f_flags & O_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + else + ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + + 
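 + /* + * Both the direct and buffered paths come back with whatever iolock they + * still hold recorded in @iolock. Fold any new EOF into the in-core inode + * size first, then deal with the sync-write cases below. + */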
xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); + + if (ret <= 0) + goto out_unlock; + + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error, error2; + + xfs_rw_iunlock(ip, iolock); + error = filemap_write_and_wait_range(mapping, pos, end); + xfs_rw_ilock(ip, iolock); + + error2 = -xfs_file_fsync(file, + (file->f_flags & __O_SYNC) ? 0 : 1); + if (error) + ret = error; + else if (error2) + ret = error2; + } + +out_unlock: + xfs_aio_write_newsize_update(ip); + xfs_rw_iunlock(ip, iolock); + return ret; +} + +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int cmd = XFS_IOC_RESVSP; + int attr_flags = XFS_ATTR_NOLOCK; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + + if (file->f_flags & O_DSYNC) + attr_flags |= XFS_ATTR_SYNC; + + error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + } + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + + +STATIC int +xfs_file_open( + struct inode *inode, + struct file *file) +{ + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EFBIG; + if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) + return -EIO; + return 0; +} + +STATIC int +xfs_dir_open( + struct inode *inode, + struct file *file) +{ + struct xfs_inode *ip = XFS_I(inode); + int mode; + int error; + + error = xfs_file_open(inode, file); + if (error) + return error; + + /* + * If there are any blocks, read-ahead block 0 as we're almost + * certain to have the next operation be a read there. + */ + mode = xfs_ilock_map_shared(ip); + if (ip->i_d.di_nextents > 0) + xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); + xfs_iunlock(ip, mode); + return 0; +} + +STATIC int +xfs_file_release( + struct inode *inode, + struct file *filp) +{ + return -xfs_release(XFS_I(inode)); +} + +STATIC int +xfs_file_readdir( + struct file *filp, + void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + xfs_inode_t *ip = XFS_I(inode); + int error; + size_t bufsize; + + /* + * The Linux API doesn't pass down the total size of the buffer + * we read into down to the filesystem. With the filldir concept + * it's not needed for correct information, but the XFS dir2 leaf + * code wants an estimate of the buffer size to calculate it's + * readahead window and size the buffers used for mapping to + * physical blocks. + * + * Try to give it an estimate that's good enough, maybe at some + * point we can change the ->readdir prototype to include the + * buffer size. For now we use the current glibc buffer size. 
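 + * (That is the 32768-byte cap applied below; directories smaller than that + * simply use their on-disk size as the estimate.)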
+ */ + bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); + + error = xfs_readdir(ip, dirent, bufsize, + (xfs_off_t *)&filp->f_pos, filldir); + if (error) + return -error; + return 0; +} + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + vma->vm_ops = &xfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + + file_accessed(filp); + return 0; +} + +/* + * mmap()d file has taken write protection fault and is being made + * writable. We can set the page state up correctly for a writable + * page, which means we can do correct delalloc accounting (ENOSPC + * checking!) and unwritten extent mapping. + */ +STATIC int +xfs_vm_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + return block_page_mkwrite(vma, vmf, xfs_get_blocks); +} + +const struct file_operations xfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = xfs_file_aio_read, + .aio_write = xfs_file_aio_write, + .splice_read = xfs_file_splice_read, + .splice_write = xfs_file_splice_write, + .unlocked_ioctl = xfs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_file_compat_ioctl, +#endif + .mmap = xfs_file_mmap, + .open = xfs_file_open, + .release = xfs_file_release, + .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, +}; + +const struct file_operations xfs_dir_file_operations = { + .open = xfs_dir_open, + .read = generic_read_dir, + .readdir = xfs_file_readdir, + .llseek = generic_file_llseek, + .unlocked_ioctl = xfs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_file_compat_ioctl, +#endif + .fsync = xfs_file_fsync, +}; + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = xfs_vm_page_mkwrite, +}; diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c new file mode 100644 index 00000000..ed88ed16 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_vnodeops.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_trace.h" + +/* + * note: all filemap functions return negative error codes. These + * need to be inverted before returning to the xfs core functions. 
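 + * (Hence the explicit negations below, e.g. the "return -ret" in + * xfs_flushinval_pages() and the negated filemap_fdatawrite_range() and + * filemap_fdatawait_range() calls.)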
+ */ +void +xfs_tosspages( + xfs_inode_t *ip, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + /* can't toss partial tail pages, so mask them out */ + last &= ~(PAGE_SIZE - 1); + truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); +} + +int +xfs_flushinval_pages( + xfs_inode_t *ip, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + struct address_space *mapping = VFS_I(ip)->i_mapping; + int ret = 0; + + trace_xfs_pagecache_inval(ip, first, last); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = filemap_write_and_wait_range(mapping, first, + last == -1 ? LLONG_MAX : last); + if (!ret) + truncate_inode_pages_range(mapping, first, last); + return -ret; +} + +int +xfs_flush_pages( + xfs_inode_t *ip, + xfs_off_t first, + xfs_off_t last, + uint64_t flags, + int fiopt) +{ + struct address_space *mapping = VFS_I(ip)->i_mapping; + int ret = 0; + int ret2; + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = -filemap_fdatawrite_range(mapping, first, + last == -1 ? LLONG_MAX : last); + if (flags & XBF_ASYNC) + return ret; + ret2 = xfs_wait_on_pages(ip, first, last); + if (!ret) + ret = ret2; + return ret; +} + +int +xfs_wait_on_pages( + xfs_inode_t *ip, + xfs_off_t first, + xfs_off_t last) +{ + struct address_space *mapping = VFS_I(ip)->i_mapping; + + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + return -filemap_fdatawait_range(mapping, first, + last == -1 ? ip->i_size - 1 : last); + } + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c new file mode 100644 index 00000000..76e81cff --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sysctl.h" + +/* + * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, + * other XFS code uses these values. Times are measured in centisecs (i.e. + * 100ths of a second). + */ +xfs_param_t xfs_params = { + /* MIN DFLT MAX */ + .sgid_inherit = { 0, 0, 1 }, + .symlink_mode = { 0, 0, 1 }, + .panic_mask = { 0, 0, 255 }, + .error_level = { 0, 3, 11 }, + .syncd_timer = { 1*100, 30*100, 7200*100}, + .stats_clear = { 0, 0, 1 }, + .inherit_sync = { 0, 1, 1 }, + .inherit_nodump = { 0, 1, 1 }, + .inherit_noatim = { 0, 1, 1 }, + .xfs_buf_timer = { 100/2, 1*100, 30*100 }, + .xfs_buf_age = { 1*100, 15*100, 7200*100}, + .inherit_nosym = { 0, 0, 1 }, + .rotorstep = { 1, 1, 255 }, + .inherit_nodfrg = { 0, 1, 1 }, + .fstrm_timer = { 1, 30*100, 3600*100}, +}; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c new file mode 100644 index 00000000..acca2c5c --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -0,0 +1,1556 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ioctl.h" +#include "xfs_rtalloc.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_bmap.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_dfrag.h" +#include "xfs_fsops.h" +#include "xfs_vnodeops.h" +#include "xfs_discard.h" +#include "xfs_quota.h" +#include "xfs_inode_item.h" +#include "xfs_export.h" +#include "xfs_trace.h" + +#include +#include +#include +#include +#include +#include +#include + +/* + * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to + * a file or fs handle. + * + * XFS_IOC_PATH_TO_FSHANDLE + * returns fs handle for a mount point or path within that mount point + * XFS_IOC_FD_TO_HANDLE + * returns full handle for a FD opened in user space + * XFS_IOC_PATH_TO_HANDLE + * returns full handle for a path + */ +int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq) +{ + int hsize; + xfs_handle_t handle; + struct inode *inode; + struct file *file = NULL; + struct path path; + int error; + struct xfs_inode *ip; + + if (cmd == XFS_IOC_FD_TO_HANDLE) { + file = fget(hreq->fd); + if (!file) + return -EBADF; + inode = file->f_path.dentry->d_inode; + } else { + error = user_lpath((const char __user *)hreq->path, &path); + if (error) + return error; + inode = path.dentry->d_inode; + } + ip = XFS_I(inode); + + /* + * We can only generate handles for inodes residing on a XFS filesystem, + * and only for regular files, directories or symbolic links. + */ + error = -EINVAL; + if (inode->i_sb->s_magic != XFS_SB_MAGIC) + goto out_put; + + error = -EBADF; + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + goto out_put; + + + memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); + + if (cmd == XFS_IOC_PATH_TO_FSHANDLE) { + /* + * This handle only contains an fsid, zero the rest. 
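 + * (In other words, an fshandle is just the fsid copied above; a full file + * handle additionally carries the xfs_fid_t with inode number and + * generation, as filled in by the else branch.)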
+ */ + memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); + hsize = sizeof(xfs_fsid_t); + } else { + int lock_mode; + + lock_mode = xfs_ilock_map_shared(ip); + handle.ha_fid.fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.fid_len); + handle.ha_fid.fid_pad = 0; + handle.ha_fid.fid_gen = ip->i_d.di_gen; + handle.ha_fid.fid_ino = ip->i_ino; + xfs_iunlock_map_shared(ip, lock_mode); + + hsize = XFS_HSIZE(handle); + } + + error = -EFAULT; + if (copy_to_user(hreq->ohandle, &handle, hsize) || + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; + + error = 0; + + out_put: + if (cmd == XFS_IOC_FD_TO_HANDLE) + fput(file); + else + path_put(&path); + return error; +} + +/* + * No need to do permission checks on the various pathname components + * as the handle operations are privileged. + */ +STATIC int +xfs_handle_acceptable( + void *context, + struct dentry *dentry) +{ + return 1; +} + +/* + * Convert userspace handle data into a dentry. + */ +struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen) +{ + xfs_handle_t handle; + struct xfs_fid64 fid; + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (hlen != sizeof(xfs_handle_t)) + return ERR_PTR(-EINVAL); + if (copy_from_user(&handle, uhandle, hlen)) + return ERR_PTR(-EFAULT); + if (handle.ha_fid.fid_len != + sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len)) + return ERR_PTR(-EINVAL); + + memset(&fid, 0, sizeof(struct fid)); + fid.ino = handle.ha_fid.fid_ino; + fid.gen = handle.ha_fid.fid_gen; + + return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3, + FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, + xfs_handle_acceptable, NULL); +} + +STATIC struct dentry * +xfs_handlereq_to_dentry( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); +} + +int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + const struct cred *cred = current_cred(); + int error; + int fd; + int permflag; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + inode = dentry->d_inode; + + /* Restrict xfs_open_by_handle to directories & regular files. */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + error = -XFS_ERROR(EPERM); + goto out_dput; + } + +#if BITS_PER_LONG != 32 + hreq->oflags |= O_LARGEFILE; +#endif + + /* Put open permission in namei format. */ + permflag = hreq->oflags; + if ((permflag+1) & O_ACCMODE) + permflag++; + if (permflag & O_TRUNC) + permflag |= 2; + + if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && + (permflag & FMODE_WRITE) && IS_APPEND(inode)) { + error = -XFS_ERROR(EPERM); + goto out_dput; + } + + if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + error = -XFS_ERROR(EACCES); + goto out_dput; + } + + /* Can't write directories. 
*/ + if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { + error = -XFS_ERROR(EISDIR); + goto out_dput; + } + + fd = get_unused_fd(); + if (fd < 0) { + error = fd; + goto out_dput; + } + + filp = dentry_open(dentry, mntget(parfilp->f_path.mnt), + hreq->oflags, cred); + if (IS_ERR(filp)) { + put_unused_fd(fd); + return PTR_ERR(filp); + } + + if (inode->i_mode & S_IFREG) { + filp->f_flags |= O_NOATIME; + filp->f_mode |= FMODE_NOCMTIME; + } + + fd_install(fd, filp); + return fd; + + out_dput: + dput(dentry); + return error; +} + +/* + * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's + * unused first argument. + */ +STATIC int +do_readlink( + char __user *buffer, + int buflen, + const char *link) +{ + int len; + + len = PTR_ERR(link); + if (IS_ERR(link)) + goto out; + + len = strlen(link); + if (len > (unsigned) buflen) + len = buflen; + if (copy_to_user(buffer, link, len)) + len = -EFAULT; + out: + return len; +} + + +int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + struct dentry *dentry; + __u32 olen; + void *link; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + /* Restrict this handle operation to symlinks only. */ + if (!S_ISLNK(dentry->d_inode->i_mode)) { + error = -XFS_ERROR(EINVAL); + goto out_dput; + } + + if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { + error = -XFS_ERROR(EFAULT); + goto out_dput; + } + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) { + error = -XFS_ERROR(ENOMEM); + goto out_dput; + } + + error = -xfs_readlink(XFS_I(dentry->d_inode), link); + if (error) + goto out_kfree; + error = do_readlink(hreq->ohandle, olen, link); + if (error) + goto out_kfree; + + out_kfree: + kfree(link); + out_dput: + dput(dentry); + return error; +} + +STATIC int +xfs_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) + return -XFS_ERROR(EFAULT); + + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + error = -XFS_ERROR(EPERM); + goto out; + } + + if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { + error = -XFS_ERROR(EFAULT); + goto out; + } + + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + + out: + dput(dentry); + return error; +} + +STATIC int +xfs_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error = -ENOMEM; + attrlist_cursor_kern_t *cursor; + xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) + return -XFS_ERROR(EFAULT); + if (al_hreq.buflen > XATTR_LIST_MAX) + return -XFS_ERROR(EINVAL); + + /* + * Reject flags, only allow namespaces. 
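 + * (Roughly: ATTR_ROOT selects the trusted namespace and ATTR_SECURE the + * security namespace; with neither flag set the user namespace is listed.)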
+ */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -XFS_ERROR(EINVAL); + + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); + if (!kbuf) + goto out_dput; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) + error = -EFAULT; + + out_kfree: + kfree(kbuf); + out_dput: + dput(dentry); + return error; +} + +int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + __uint32_t *len, + __uint32_t flags) +{ + unsigned char *kbuf; + int error = EFAULT; + + if (*len > XATTR_SIZE_MAX) + return EINVAL; + kbuf = kmalloc(*len, GFP_KERNEL); + if (!kbuf) + return ENOMEM; + + error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); + if (error) + goto out_kfree; + + if (copy_to_user(ubuf, kbuf, *len)) + error = EFAULT; + + out_kfree: + kfree(kbuf); + return error; +} + +int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + __uint32_t len, + __uint32_t flags) +{ + unsigned char *kbuf; + int error = EFAULT; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return EPERM; + if (len > XATTR_SIZE_MAX) + return EINVAL; + + kbuf = memdup_user(ubuf, len); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + + return error; +} + +int +xfs_attrmulti_attr_remove( + struct inode *inode, + unsigned char *name, + __uint32_t flags) +{ + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return EPERM; + return xfs_attr_remove(XFS_I(inode), name, flags); +} + +STATIC int +xfs_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + xfs_attr_multiop_t *ops; + xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; + unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) + return -XFS_ERROR(EFAULT); + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = E2BIG; + size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + ops[i].am_attrname, MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, &ops[i].am_length, + ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, ops[i].am_length, + ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); + break; + case ATTR_OP_REMOVE: 
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = XFS_ERROR(EFAULT); + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return -error; +} + +int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf) +{ + int attr_flags = 0; + int error; + + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) + return -XFS_ERROR(EPERM); + + if (!(filp->f_mode & FMODE_WRITE)) + return -XFS_ERROR(EBADF); + + if (!S_ISREG(inode->i_mode)) + return -XFS_ERROR(EINVAL); + + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= XFS_ATTR_NONBLOCK; + + if (filp->f_flags & O_DSYNC) + attr_flags |= XFS_ATTR_SYNC; + + if (ioflags & IO_INVIS) + attr_flags |= XFS_ATTR_DMI; + + error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); + return -error; +} + +STATIC int +xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg) +{ + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) + return -XFS_ERROR(EFAULT); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt); + else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) + error = xfs_bulkstat_single(mp, &inlast, + bulkreq.ubuffer, &done); + else /* XFS_IOC_FSBULKSTAT */ + error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + &done); + + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + +STATIC int +xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return -error; + + /* + * Caller should have passed an argument of type + * xfs_fsop_geom_v1_t. This is a proper subset of the + * xfs_fsop_geom_t that xfs_fs_geometry() fills in. 
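 + * (Because the v1 structure is a prefix of the full xfs_fsop_geom_t, copying + * back only sizeof(xfs_fsop_geom_v1_t) bytes gives the v1 caller exactly the + * layout it expects.)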
+ */ + if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_ioc_fsgeometry( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 4); + if (error) + return -error; + + if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* + * Linux extended inode flags interface. + */ + +STATIC unsigned int +xfs_merge_ioc_xflags( + unsigned int flags, + unsigned int start) +{ + unsigned int xflags = start; + + if (flags & FS_IMMUTABLE_FL) + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; + if (flags & FS_APPEND_FL) + xflags |= XFS_XFLAG_APPEND; + else + xflags &= ~XFS_XFLAG_APPEND; + if (flags & FS_SYNC_FL) + xflags |= XFS_XFLAG_SYNC; + else + xflags &= ~XFS_XFLAG_SYNC; + if (flags & FS_NOATIME_FL) + xflags |= XFS_XFLAG_NOATIME; + else + xflags &= ~XFS_XFLAG_NOATIME; + if (flags & FS_NODUMP_FL) + xflags |= XFS_XFLAG_NODUMP; + else + xflags &= ~XFS_XFLAG_NODUMP; + + return xflags; +} + +STATIC unsigned int +xfs_di2lxflags( + __uint16_t di_flags) +{ + unsigned int flags = 0; + + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= FS_APPEND_FL; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= FS_SYNC_FL; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= FS_NOATIME_FL; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= FS_NODUMP_FL; + return flags; +} + +STATIC int +xfs_ioc_fsgetxattr( + xfs_inode_t *ip, + int attr, + void __user *arg) +{ + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + fa.fsx_xflags = xfs_ip2xflags(ip); + fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_projid = xfs_get_projid(ip); + + if (attr) { + if (ip->i_afp) { + if (ip->i_afp->if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_afp->if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_anextents; + } else + fa.fsx_nextents = 0; + } else { + if (ip->i_df.if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_df.if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_nextents; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} + +STATIC void +xfs_set_diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + unsigned int di_flags; + + /* can't set PREALLOC this way, just preserve it */ + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & XFS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & XFS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + if (xflags & XFS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & XFS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (xflags & XFS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & XFS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & XFS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (xflags & XFS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & XFS_XFLAG_EXTSIZE) + 
di_flags |= XFS_DIFLAG_EXTSIZE; + } + + ip->i_d.di_flags = di_flags; +} + +STATIC void +xfs_diflags_to_linux( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + + if (xflags & XFS_XFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (xflags & XFS_XFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +#define FSX_PROJID 1 +#define FSX_EXTSIZE 2 +#define FSX_XFLAGS 4 +#define FSX_NONBLOCK 8 + +STATIC int +xfs_ioctl_setattr( + xfs_inode_t *ip, + struct fsxattr *fa, + int mask) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int lock_flags = 0; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *olddquot = NULL; + int code; + + trace_xfs_ioctl_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Disallow 32bit project ids when projid32bit feature is not enabled. + */ + if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && + !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) + return XFS_ERROR(EINVAL); + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { + code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, + ip->i_d.di_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, &gdqp); + if (code) + return code; + } + + /* + * For the other attributes, we acquire the inode lock and + * first do an error checking pass. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (code) + goto error_return; + + lock_flags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_flags); + + /* + * CAP_FOWNER overrides the following restrictions: + * + * The user ID of the calling process must be equal + * to the file owner ID, except in cases where the + * CAP_FSETID capability is applicable. + */ + if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Do a quota reservation only if projid is actually going to change. + */ + if (mask & FSX_PROJID) { + if (XFS_IS_QUOTA_RUNNING(mp) && + XFS_IS_PQUOTA_ON(mp) && + xfs_get_projid(ip) != fa->fsx_projid) { + ASSERT(tp); + code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, + capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_return; + } + } + + if (mask & FSX_EXTSIZE) { + /* + * Can't change extent size if any extents are allocated. + */ + if (ip->i_d.di_nextents && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != + fa->fsx_extsize)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * Extent size must be a multiple of the appropriate block + * size, if set at all. It must also be smaller than the + * maximum extent size supported by the filesystem. 
+ * + * Also, for non-realtime files, limit the extent size hint to + * half the size of the AGs in the filesystem so alignment + * doesn't result in extents larger than an AG. + */ + if (fa->fsx_extsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; + + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + + if (XFS_IS_REALTIME_INODE(ip) || + ((mask & FSX_XFLAGS) && + (fa->fsx_xflags & XFS_XFLAG_REALTIME))) { + size = mp->m_sb.sb_rextsize << + mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + + if (fa->fsx_extsize % size) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + } + + + if (mask & FSX_XFLAGS) { + /* + * Can't change realtime flag if any extents are allocated. + */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + (XFS_IS_REALTIME_INODE(ip)) != + (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * If realtime flag is set then must have realtime data. + */ + if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + if ((mp->m_sb.sb_rblocks == 0) || + (mp->m_sb.sb_rextsize == 0) || + (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + + /* + * Can't modify an immutable/append-only file unless + * we have appropriate permission. + */ + if ((ip->i_d.di_flags & + (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || + (fa->fsx_xflags & + (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + } + + xfs_trans_ijoin(tp, ip); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & FSX_PROJID) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (xfs_get_projid(ip) != fa->fsx_projid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { + olddquot = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } + xfs_set_projid(ip, fa->fsx_projid); + + /* + * We may have to rev the inode as well as + * the superblock version number since projids didn't + * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. + */ + if (ip->i_d.di_version == 1) + xfs_bump_ino_vers2(tp, ip); + } + + } + + if (mask & FSX_EXTSIZE) + ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + if (mask & FSX_XFLAGS) { + xfs_set_diflags(ip, fa->fsx_xflags); + xfs_diflags_to_linux(ip); + } + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + * This is slightly sub-optimal in that truncates require + * two sync transactions instead of one for wsync filesystems. + * One for the truncate and one for the timestamps since we + * don't want to change the timestamps unless we're sure the + * truncate worked. Truncates are less than 1% of the laddis + * mix so this probably isn't worth the trouble to optimize. 
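 + * (On a wsync mount the commit below is therefore made synchronous, so the + * attribute change is stable in the log before the ioctl returns.)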
+ */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + code = xfs_trans_commit(tp, 0); + xfs_iunlock(ip, lock_flags); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + xfs_qm_dqrele(olddquot); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + return code; + + error_return: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_trans_cancel(tp, 0); + if (lock_flags) + xfs_iunlock(ip, lock_flags); + return code; +} + +STATIC int +xfs_ioc_fssetxattr( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + unsigned int mask; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + mask |= FSX_NONBLOCK; + + return -xfs_ioctl_setattr(ip, &fa, mask); +} + +STATIC int +xfs_ioc_getxflags( + xfs_inode_t *ip, + void __user *arg) +{ + unsigned int flags; + + flags = xfs_di2lxflags(ip->i_d.di_flags); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_ioc_setxflags( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + unsigned int flags; + unsigned int mask; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL)) + return -EOPNOTSUPP; + + mask = FSX_XFLAGS; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + mask |= FSX_NONBLOCK; + fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); + + return -xfs_ioctl_setattr(ip, &fa, mask); +} + +STATIC int +xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmap __user *base = *ap; + + /* copy only getbmap portion (not getbmapx) */ + if (copy_to_user(base, bmv, sizeof(struct getbmap))) + return XFS_ERROR(EFAULT); + + *ap += sizeof(struct getbmap); + return 0; +} + +STATIC int +xfs_ioc_getbmap( + struct xfs_inode *ip, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + return -XFS_ERROR(EFAULT); + + if (bmx.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); + if (ioflags & IO_INVIS) + bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; + + error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + (struct getbmap *)arg+1); + if (error) + return -error; + + /* copy back header - only size of getbmap */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmapx __user *base = *ap; + + if (copy_to_user(base, bmv, sizeof(struct getbmapx))) + return XFS_ERROR(EFAULT); + + *ap += sizeof(struct getbmapx); + return 0; +} + +STATIC int +xfs_ioc_getbmapx( + struct xfs_inode *ip, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(bmx))) + return -XFS_ERROR(EFAULT); + + if (bmx.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + if (bmx.bmv_iflags & (~BMV_IF_VALID)) + return -XFS_ERROR(EINVAL); + + error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, + (struct getbmapx *)arg+1); + if (error) + return -error; + + /* copy back header */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) + return -XFS_ERROR(EFAULT); + + return 0; +} + +/* + * Note: some of the ioctl's return positive numbers as a + * byte count indicating success, such as readlink_by_handle. 
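One note on xfs_ioc_getxflags()/xfs_ioc_setxflags() above: they back the generic chattr-style flag ioctls and accept only the immutable, append, noatime, nodump and sync bits. From user space the interface is FS_IOC_GETFLAGS/FS_IOC_SETFLAGS from <linux/fs.h>; a minimal sketch:

/* Minimal sketch: mark a file append-only through the generic flag
 * ioctls.  XFS copies only 32 bits regardless of the ioctl's nominal
 * 'long' argument, so an unsigned int matches the handler above.
 * Needs CAP_LINUX_IMMUTABLE, as checked in xfs_ioctl_setattr(). */
#include <linux/fs.h>
#include <sys/ioctl.h>

static int make_append_only(int fd)
{
        unsigned int flags;

        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
                return -1;
        flags |= FS_APPEND_FL;          /* one of the bits XFS accepts */
        return ioctl(fd, FS_IOC_SETFLAGS, &flags);
}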
+ * So we don't "sign flip" like most other routines. This means + * true errors need to be returned as a negative value. + */ +long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + trace_xfs_file_ioctl(ip); + + switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_ZERO_RANGE: { + xfs_flock64_t bf; + + if (copy_from_user(&bf, arg, sizeof(bf))) + return -XFS_ERROR(EFAULT); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_DIOINFO: { + struct dioattr da; + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); + + if (copy_to_user(arg, &da, sizeof(da))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *)arg); + + case XFS_IOC_FSGETXATTR: + return xfs_ioc_fsgetxattr(ip, 0, arg); + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, arg, sizeof(dmi))) + return -XFS_ERROR(EFAULT); + + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, + dmi.fsd_dmstate); + return -error; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(ip, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(hreq))) + return -XFS_ERROR(EFAULT); + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(filp, arg); + + case XFS_IOC_READLINK_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(filp, arg); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(filp, arg); + + case XFS_IOC_SWAPEXT: { + struct xfs_swapext sxp; + + if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) + return -XFS_ERROR(EFAULT); + error = xfs_swapext(&sxp); + return -error; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, 
&out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + xfs_fsop_resblks_t inout; + __uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -XFS_ERROR(EROFS); + + if (copy_from_user(&inout, arg, sizeof(inout))) + return -XFS_ERROR(EFAULT); + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + if (error) + return -error; + + if (copy_to_user(arg, &inout, sizeof(inout))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_data(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_log(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_rt(mp, &in); + return -error; + } + + case XFS_IOC_GOINGDOWN: { + __uint32_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__uint32_t __user *)arg)) + return -XFS_ERROR(EFAULT); + + error = xfs_fs_goingdown(mp, in); + return -error; + } + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_errortag_add(in.errtag, mp); + return -error; + } + + case XFS_IOC_ERROR_CLEARALL: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_errortag_clearall(mp, 1); + return -error; + + default: + return -ENOTTY; + } +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h new file mode 100644 index 00000000..d56173b3 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
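Returning to xfs_file_ioctl() for a moment: the XFS_IOC_DIOINFO case reports the memory and I/O alignment required for direct I/O on this inode's device. A hedged user-space sketch, assuming struct dioattr and the ioctl number come from the XFS user headers:

/* Sketch: query direct-I/O constraints before doing O_DIRECT reads or
 * writes.  Assumes struct dioattr and XFS_IOC_DIOINFO from <xfs/xfs_fs.h>. */
#include <xfs/xfs_fs.h>
#include <sys/ioctl.h>
#include <stdio.h>

static void print_dio_geometry(int fd)
{
        struct dioattr da;

        if (ioctl(fd, XFS_IOC_DIOINFO, &da) == 0)
                printf("mem align %u, min io %u, max io %u\n",
                       da.d_mem, da.d_miniosz, da.d_maxiosz);
}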
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL_H__ +#define __XFS_IOCTL_H__ + +extern int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf); + +extern int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + __uint32_t *len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + __uint32_t len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_remove( + struct inode *inode, + unsigned char *name, + __uint32_t flags); + +extern struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen); + +extern long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p); + +extern long +xfs_file_compat_ioctl( + struct file *file, + unsigned int cmd, + unsigned long arg); + +#endif diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c new file mode 100644 index 00000000..54e623bf --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -0,0 +1,672 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
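Neither xfs_file_ioctl() nor the 32-bit compat entry point implemented in this file is called directly; both are plugged into the file operations table. A rough orientation sketch of that wiring follows; the real table, with its many other methods, lives in xfs_file.c.

/* Orientation sketch only: how the two ioctl entry points are hooked up.
 * The structure name here is illustrative, not the real xfs table. */
#include <linux/fs.h>
#include "xfs_ioctl.h"

static const struct file_operations example_xfs_file_operations = {
        .unlocked_ioctl = xfs_file_ioctl,        /* native-width callers */
#ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl, /* 32-bit callers on a 64-bit kernel */
#endif
        /* read/write/mmap/fsync etc. omitted */
};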
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_vnode.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_dfrag.h" +#include "xfs_vnodeops.h" +#include "xfs_fsops.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_attr.h" +#include "xfs_ioctl.h" +#include "xfs_ioctl32.h" +#include "xfs_trace.h" + +#define _NATIVE_IOC(cmd, type) \ + _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) + +#ifdef BROKEN_X86_ALIGNMENT +STATIC int +xfs_compat_flock64_copyin( + xfs_flock64_t *bf, + compat_xfs_flock64_t __user *arg32) +{ + if (get_user(bf->l_type, &arg32->l_type) || + get_user(bf->l_whence, &arg32->l_whence) || + get_user(bf->l_start, &arg32->l_start) || + get_user(bf->l_len, &arg32->l_len) || + get_user(bf->l_sysid, &arg32->l_sysid) || + get_user(bf->l_pid, &arg32->l_pid) || + copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_ioc_fsgeometry_v1( + struct xfs_mount *mp, + compat_xfs_fsop_geom_v1_t __user *arg32) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return -error; + /* The 32-bit variant simply has some padding at the end */ + if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_growfs_data_copyin( + struct xfs_growfs_data *in, + compat_xfs_growfs_data_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->imaxpct, &arg32->imaxpct)) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_growfs_rt_copyin( + struct xfs_growfs_rt *in, + compat_xfs_growfs_rt_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->extsize, &arg32->extsize)) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_inumbers_fmt_compat( + void __user *ubuffer, + const xfs_inogrp_t *buffer, + long count, + long *written) +{ + compat_xfs_inogrp_t __user *p32 = ubuffer; + long i; + + for (i = 0; i < count; i++) { + if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || + put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || + put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) + return -XFS_ERROR(EFAULT); + } + *written = count * sizeof(*p32); + return 0; +} + +#else +#define xfs_inumbers_fmt_compat xfs_inumbers_fmt +#endif /* BROKEN_X86_ALIGNMENT */ + +STATIC int +xfs_ioctl32_bstime_copyin( + xfs_bstime_t *bstime, + compat_xfs_bstime_t __user *bstime32) +{ + compat_time_t sec32; /* tv_sec differs on 64 vs. 
32 */ + + if (get_user(sec32, &bstime32->tv_sec) || + get_user(bstime->tv_nsec, &bstime32->tv_nsec)) + return -XFS_ERROR(EFAULT); + bstime->tv_sec = sec32; + return 0; +} + +/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */ +STATIC int +xfs_ioctl32_bstat_copyin( + xfs_bstat_t *bstat, + compat_xfs_bstat_t __user *bstat32) +{ + if (get_user(bstat->bs_ino, &bstat32->bs_ino) || + get_user(bstat->bs_mode, &bstat32->bs_mode) || + get_user(bstat->bs_nlink, &bstat32->bs_nlink) || + get_user(bstat->bs_uid, &bstat32->bs_uid) || + get_user(bstat->bs_gid, &bstat32->bs_gid) || + get_user(bstat->bs_rdev, &bstat32->bs_rdev) || + get_user(bstat->bs_blksize, &bstat32->bs_blksize) || + get_user(bstat->bs_size, &bstat32->bs_size) || + xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) || + get_user(bstat->bs_blocks, &bstat32->bs_size) || + get_user(bstat->bs_xflags, &bstat32->bs_size) || + get_user(bstat->bs_extsize, &bstat32->bs_extsize) || + get_user(bstat->bs_extents, &bstat32->bs_extents) || + get_user(bstat->bs_gen, &bstat32->bs_gen) || + get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || + get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || + get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || + get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || + get_user(bstat->bs_aextents, &bstat32->bs_aextents)) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* XFS_IOC_FSBULKSTAT and friends */ + +STATIC int +xfs_bstime_store_compat( + compat_xfs_bstime_t __user *p32, + const xfs_bstime_t *p) +{ + __s32 sec32; + + sec32 = p->tv_sec; + if (put_user(sec32, &p32->tv_sec) || + put_user(p->tv_nsec, &p32->tv_nsec)) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* Return 0 on success or positive error (to xfs_bulkstat()) */ +STATIC int +xfs_bulkstat_one_fmt_compat( + void __user *ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) +{ + compat_xfs_bstat_t __user *p32 = ubuffer; + + if (ubsize < sizeof(*p32)) + return XFS_ERROR(ENOMEM); + + if (put_user(buffer->bs_ino, &p32->bs_ino) || + put_user(buffer->bs_mode, &p32->bs_mode) || + put_user(buffer->bs_nlink, &p32->bs_nlink) || + put_user(buffer->bs_uid, &p32->bs_uid) || + put_user(buffer->bs_gid, &p32->bs_gid) || + put_user(buffer->bs_rdev, &p32->bs_rdev) || + put_user(buffer->bs_blksize, &p32->bs_blksize) || + put_user(buffer->bs_size, &p32->bs_size) || + xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || + xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || + xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || + put_user(buffer->bs_blocks, &p32->bs_blocks) || + put_user(buffer->bs_xflags, &p32->bs_xflags) || + put_user(buffer->bs_extsize, &p32->bs_extsize) || + put_user(buffer->bs_extents, &p32->bs_extents) || + put_user(buffer->bs_gen, &p32->bs_gen) || + put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || + put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || + put_user(buffer->bs_dmstate, &p32->bs_dmstate) || + put_user(buffer->bs_aextents, &p32->bs_aextents)) + return XFS_ERROR(EFAULT); + if (ubused) + *ubused = sizeof(*p32); + return 0; +} + +STATIC int +xfs_bulkstat_one_compat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + int 
*ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt_compat, + ubused, stat); +} + +/* copied from xfs_ioctl.c */ +STATIC int +xfs_compat_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + compat_xfs_fsop_bulkreq_t __user *p32) +{ + u32 addr; + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (get_user(addr, &p32->lastip)) + return -XFS_ERROR(EFAULT); + bulkreq.lastip = compat_ptr(addr); + if (get_user(bulkreq.icount, &p32->icount) || + get_user(addr, &p32->ubuffer)) + return -XFS_ERROR(EFAULT); + bulkreq.ubuffer = compat_ptr(addr); + if (get_user(addr, &p32->ocount)) + return -XFS_ERROR(EFAULT); + bulkreq.ocount = compat_ptr(addr); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS_32) { + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt_compat); + } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) { + int res; + + error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, + sizeof(compat_xfs_bstat_t), 0, &res); + } else if (cmd == XFS_IOC_FSBULKSTAT_32) { + error = xfs_bulkstat(mp, &inlast, &count, + xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), + bulkreq.ubuffer, &done); + } else + error = XFS_ERROR(EINVAL); + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + +STATIC int +xfs_compat_handlereq_copyin( + xfs_fsop_handlereq_t *hreq, + compat_xfs_fsop_handlereq_t __user *arg32) +{ + compat_xfs_fsop_handlereq_t hreq32; + + if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + + hreq->fd = hreq32.fd; + hreq->path = compat_ptr(hreq32.path); + hreq->oflags = hreq32.oflags; + hreq->ihandle = compat_ptr(hreq32.ihandle); + hreq->ihandlen = hreq32.ihandlen; + hreq->ohandle = compat_ptr(hreq32.ohandle); + hreq->ohandlen = compat_ptr(hreq32.ohandlen); + + return 0; +} + +STATIC struct dentry * +xfs_compat_handlereq_to_dentry( + struct file *parfilp, + compat_xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, + compat_ptr(hreq->ihandle), hreq->ihandlen); +} + +STATIC int +xfs_compat_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + attrlist_cursor_kern_t *cursor; + compat_xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&al_hreq, arg, + sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + return -XFS_ERROR(EFAULT); + if (al_hreq.buflen > XATTR_LIST_MAX) + return -XFS_ERROR(EINVAL); + + /* + * Reject flags, only allow namespaces. 
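All of the compat copy-in helpers in this file share one idiom: a 32-bit user pointer arrives as a compat_uptr_t, is fetched with get_user(), and is widened with compat_ptr() before being used as an ordinary __user pointer. A stripped-down sketch of that idiom, with hypothetical structure names:

/* Generic shape of the compat copy-in pattern used above.  The structure
 * and function names are hypothetical; the compat_uptr_t/compat_ptr()
 * conversion is the point. */
#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

struct compat_example_req {
        compat_uptr_t   buffer;         /* 32-bit user pointer */
        __u32           buflen;
};

struct example_req {
        void __user     *buffer;        /* native-width user pointer */
        __u32           buflen;
};

static int example_req_copyin(struct example_req *req,
                              struct compat_example_req __user *arg32)
{
        compat_uptr_t   addr;

        if (get_user(addr, &arg32->buffer) ||
            get_user(req->buflen, &arg32->buflen))
                return -EFAULT;
        req->buffer = compat_ptr(addr); /* widen the 32-bit pointer */
        return 0;
}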
+ */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -XFS_ERROR(EINVAL); + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -ENOMEM; + kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + if (!kbuf) + goto out_dput; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) + error = -EFAULT; + + out_kfree: + kfree(kbuf); + out_dput: + dput(dentry); + return error; +} + +STATIC int +xfs_compat_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + compat_xfs_attr_multiop_t *ops; + compat_xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; + unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&am_hreq, arg, + sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) + return -XFS_ERROR(EFAULT); + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = E2BIG; + size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(compat_ptr(am_hreq.ops), size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + compat_ptr(ops[i].am_attrname), + MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, + compat_ptr(ops[i].am_attrvalue), + ops[i].am_length, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) + error = XFS_ERROR(EFAULT); + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return -error; +} + +STATIC int +xfs_compat_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + compat_xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&dmhreq, arg, + sizeof(compat_xfs_fsop_setdm_handlereq_t))) + return -XFS_ERROR(EFAULT); + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + error = -XFS_ERROR(EPERM); + goto out; + } + + if 
(copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { + error = -XFS_ERROR(EFAULT); + goto out; + } + + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + +out: + dput(dentry); + return error; +} + +long +xfs_file_compat_ioctl( + struct file *filp, + unsigned cmd, + unsigned long p) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + trace_xfs_file_compat_ioctl(ip); + + switch (cmd) { + /* No size or alignment issues on any arch */ + case XFS_IOC_DIOINFO: + case XFS_IOC_FSGEOMETRY: + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + case XFS_IOC_FSSETDM: + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + case XFS_IOC_GETBMAPX: + case XFS_IOC_FSCOUNTS: + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + case XFS_IOC_FSGROWFSLOG: + case XFS_IOC_GOINGDOWN: + case XFS_IOC_ERROR_INJECTION: + case XFS_IOC_ERROR_CLEARALL: + return xfs_file_ioctl(filp, cmd, p); +#ifndef BROKEN_X86_ALIGNMENT + /* These are handled fine if no alignment issues */ + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_FSGEOMETRY_V1: + case XFS_IOC_FSGROWFSDATA: + case XFS_IOC_FSGROWFSRT: + case XFS_IOC_ZERO_RANGE: + return xfs_file_ioctl(filp, cmd, p); +#else + case XFS_IOC_ALLOCSP_32: + case XFS_IOC_FREESP_32: + case XFS_IOC_ALLOCSP64_32: + case XFS_IOC_FREESP64_32: + case XFS_IOC_RESVSP_32: + case XFS_IOC_UNRESVSP_32: + case XFS_IOC_RESVSP64_32: + case XFS_IOC_UNRESVSP64_32: + case XFS_IOC_ZERO_RANGE_32: { + struct xfs_flock64 bf; + + if (xfs_compat_flock64_copyin(&bf, arg)) + return -XFS_ERROR(EFAULT); + cmd = _NATIVE_IOC(cmd, struct xfs_flock64); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_FSGEOMETRY_V1_32: + return xfs_compat_ioc_fsgeometry_v1(mp, arg); + case XFS_IOC_FSGROWFSDATA_32: { + struct xfs_growfs_data in; + + if (xfs_compat_growfs_data_copyin(&in, arg)) + return -XFS_ERROR(EFAULT); + error = xfs_growfs_data(mp, &in); + return -error; + } + case XFS_IOC_FSGROWFSRT_32: { + struct xfs_growfs_rt in; + + if (xfs_compat_growfs_rt_copyin(&in, arg)) + return -XFS_ERROR(EFAULT); + error = xfs_growfs_rt(mp, &in); + return -error; + } +#endif + /* long changes size, but xfs only copiese out 32 bits */ + case XFS_IOC_GETXFLAGS_32: + case XFS_IOC_SETXFLAGS_32: + case XFS_IOC_GETVERSION_32: + cmd = _NATIVE_IOC(cmd, long); + return xfs_file_ioctl(filp, cmd, p); + case XFS_IOC_SWAPEXT_32: { + struct xfs_swapext sxp; + struct compat_xfs_swapext __user *sxu = arg; + + /* Bulk copy in up to the sx_stat field, then copy bstat */ + if (copy_from_user(&sxp, sxu, + offsetof(struct xfs_swapext, sx_stat)) || + xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) + return -XFS_ERROR(EFAULT); + error = xfs_swapext(&sxp); + return -error; + } + case XFS_IOC_FSBULKSTAT_32: + case XFS_IOC_FSBULKSTAT_SINGLE_32: + case XFS_IOC_FSINUMBERS_32: + return xfs_compat_ioc_bulkstat(mp, cmd, arg); + case XFS_IOC_FD_TO_HANDLE_32: + case XFS_IOC_PATH_TO_HANDLE_32: + case XFS_IOC_PATH_TO_FSHANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + cmd = _NATIVE_IOC(cmd, struct 
xfs_fsop_handlereq); + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_READLINK_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE_32: + return xfs_compat_attrlist_by_handle(filp, arg); + case XFS_IOC_ATTRMULTI_BY_HANDLE_32: + return xfs_compat_attrmulti_by_handle(filp, arg); + case XFS_IOC_FSSETDM_BY_HANDLE_32: + return xfs_compat_fssetdm_by_handle(filp, arg); + default: + return -XFS_ERROR(ENOIOCTLCMD); + } +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h new file mode 100644 index 00000000..80f4060e --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL32_H__ +#define __XFS_IOCTL32_H__ + +#include + +/* + * on 32-bit arches, ioctl argument structures may have different sizes + * and/or alignment. We define compat structures which match the + * 32-bit sizes/alignments here, and their associated ioctl numbers. + * + * xfs_ioctl32.c contains routines to copy these structures in and out. + */ + +/* stock kernel-level ioctls we support */ +#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS +#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS +#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION + +/* + * On intel, even if sizes match, alignment and/or padding may differ. 
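Concretely, i386 aligns 64-bit members on 4-byte boundaries while x86-64 uses 8, so the same field list can produce a different size and different member offsets on the two ABIs; that is what BROKEN_X86_ALIGNMENT and __compat_packed compensate for. A small stand-alone illustration, not taken from the kernel:

/* Illustration of the alignment difference behind the packed compat
 * structures.  On i386 the layout below is 12 bytes with no padding; on
 * x86-64 the compiler inserts 4 bytes of padding after 'count' and the
 * struct grows to 16 bytes unless it is declared packed. */
#include <stdio.h>
#include <stdint.h>

struct demo_native {
        uint32_t count;
        uint64_t offset;                /* 8-byte aligned on x86-64, 4 on i386 */
};

struct demo_packed {
        uint32_t count;
        uint64_t offset;
} __attribute__((packed));              /* same 12-byte layout everywhere */

int main(void)
{
        printf("native %zu bytes, packed %zu bytes\n",
               sizeof(struct demo_native), sizeof(struct demo_packed));
        return 0;
}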
+ */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#define BROKEN_X86_ALIGNMENT +#define __compat_packed __attribute__((packed)) +#else +#define __compat_packed +#endif + +typedef struct compat_xfs_bstime { + compat_time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} compat_xfs_bstime_t; + +typedef struct compat_xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + compat_xfs_bstime_t bs_atime; /* access time */ + compat_xfs_bstime_t bs_mtime; /* modify time */ + compat_xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_projid_hi; /* high part of project id */ + unsigned char bs_pad[12]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} __compat_packed compat_xfs_bstat_t; + +typedef struct compat_xfs_fsop_bulkreq { + compat_uptr_t lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + compat_uptr_t ubuffer; /* user buffer for inode desc. */ + compat_uptr_t ocount; /* output count pointer */ +} compat_xfs_fsop_bulkreq_t; + +#define XFS_IOC_FSBULKSTAT_32 \ + _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ + _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS_32 \ + _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) + +typedef struct compat_xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + compat_uptr_t path; /* user pathname */ + __u32 oflags; /* open flags */ + compat_uptr_t ihandle; /* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + compat_uptr_t ohandle; /* user buffer for handle */ + compat_uptr_t ohandlen; /* user buffer length */ +} compat_xfs_fsop_handlereq_t; + +#define XFS_IOC_PATH_TO_FSHANDLE_32 \ + _IOWR('X', 104, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE_32 \ + _IOWR('X', 105, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE_32 \ + _IOWR('X', 106, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE_32 \ + _IOWR('X', 107, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE_32 \ + _IOWR('X', 108, struct compat_xfs_fsop_handlereq) + +/* The bstat field in the swapext struct needs translation */ +typedef struct compat_xfs_swapext { + __int64_t sx_version; /* version */ + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} __compat_packed compat_xfs_swapext_t; + +#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext) + +typedef struct compat_xfs_fsop_attrlist_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to 
use */ + __u32 buflen; /* length of buffer supplied */ + compat_uptr_t buffer; /* returned names */ +} __compat_packed compat_xfs_fsop_attrlist_handlereq_t; + +/* Note: actually this is read/write */ +#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \ + _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq) + +/* am_opcodes defined in xfs_fs.h */ +typedef struct compat_xfs_attr_multiop { + __u32 am_opcode; + __s32 am_error; + compat_uptr_t am_attrname; + compat_uptr_t am_attrvalue; + __u32 am_length; + __u32 am_flags; +} compat_xfs_attr_multiop_t; + +typedef struct compat_xfs_fsop_attrmulti_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + /* ptr to compat_xfs_attr_multiop */ + compat_uptr_t ops; /* attr_multi data */ +} compat_xfs_fsop_attrmulti_handlereq_t; + +#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \ + _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) + +typedef struct compat_xfs_fsop_setdm_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle information */ + /* ptr to struct fsdmidata */ + compat_uptr_t data; /* DMAPI data */ +} compat_xfs_fsop_setdm_handlereq_t; + +#define XFS_IOC_FSSETDM_BY_HANDLE_32 \ + _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq) + +#ifdef BROKEN_X86_ALIGNMENT +/* on ia32 l_start is on a 32-bit boundary */ +typedef struct compat_xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} compat_xfs_flock64_t; + +#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) +#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) +#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) +#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) +#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) + +typedef struct compat_xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} __attribute__((packed)) compat_xfs_fsop_geom_v1_t; + +#define XFS_IOC_FSGEOMETRY_V1_32 \ + _IOR('X', 100, struct compat_xfs_fsop_geom_v1) + +typedef struct compat_xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 
xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} __attribute__((packed)) compat_xfs_inogrp_t; + +/* These growfs input structures have padding on the end, so must translate */ +typedef struct compat_xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} __attribute__((packed)) compat_xfs_growfs_data_t; + +typedef struct compat_xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} __attribute__((packed)) compat_xfs_growfs_rt_t; + +#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data) +#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt) + +#endif /* BROKEN_X86_ALIGNMENT */ + +#endif /* __XFS_IOCTL32_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c new file mode 100644 index 00000000..f5b697bf --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -0,0 +1,778 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_acl.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +#include +#include +#include +#include +#include +#include +#include + +/* + * Bring the timestamps in the XFS inode uptodate. + * + * Used before writing the inode to disk. + */ +void +xfs_synchronize_times( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; +} + +/* + * If the linux inode is valid, mark it dirty, else mark the dirty state + * in the XFS inode to make sure we pick it up when reclaiming the inode. 
+ */ +void +xfs_mark_inode_dirty_sync( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) + mark_inode_dirty_sync(inode); + else { + barrier(); + ip->i_update_core = 1; + } +} + +void +xfs_mark_inode_dirty( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) + mark_inode_dirty(inode); + else { + barrier(); + ip->i_update_core = 1; + } + +} + +/* + * Hook in SELinux. This is not quite correct yet, what we really need + * here (as we do for default ACLs) is a mechanism by which creation of + * these attrs can be journalled at inode creation time (along with the + * inode, of course, such that log replay can't cause these to be lost). + */ +STATIC int +xfs_init_security( + struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + struct xfs_inode *ip = XFS_I(inode); + size_t length; + void *value; + unsigned char *name; + int error; + + error = security_inode_init_security(inode, dir, qstr, (char **)&name, + &value, &length); + if (error) { + if (error == -EOPNOTSUPP) + return 0; + return -error; + } + + error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); + + kfree(name); + kfree(value); + return error; +} + +static void +xfs_dentry_to_name( + struct xfs_name *namep, + struct dentry *dentry) +{ + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; +} + +STATIC void +xfs_cleanup_inode( + struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + struct xfs_name teardown; + + /* Oh, the horror. + * If we can't add the ACL or we fail in + * xfs_init_security we must back out. + * ENOSPC can hit here, among other things. + */ + xfs_dentry_to_name(&teardown, dentry); + + xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); + iput(inode); +} + +STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + int mode, + dev_t rdev) +{ + struct inode *inode; + struct xfs_inode *ip = NULL; + struct posix_acl *default_acl = NULL; + struct xfs_name name; + int error; + + /* + * Irix uses Missed'em'V split, but doesn't want to see + * the upper 5 bits of (14bit) major. 
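The device-number check that follows packs the dev_t into the 32-bit System V layout XFS stores on disk: 14 bits of major above 18 bits of minor, with XFS further refusing majors that need more than 9 bits. An illustrative restatement of the packing; the real helpers are sysv_encode_dev()/sysv_major()/sysv_minor() in <linux/kdev_t.h>.

/* Illustrative restatement of the on-disk device number packing used by
 * xfs_vn_mknod() and xfs_setup_inode(): 14-bit major in the high bits,
 * 18-bit minor in the low bits.  Not the kernel implementation. */
static inline unsigned int demo_sysv_encode_dev(unsigned int major,
                                                unsigned int minor)
{
        return (minor & 0x3ffff) | (major << 18);
}

static inline unsigned int demo_sysv_major(unsigned int dev)
{
        return (dev >> 18) & 0x3fff;
}

static inline unsigned int demo_sysv_minor(unsigned int dev)
{
        return dev & 0x3ffff;
}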
+ */ + if (S_ISCHR(mode) || S_ISBLK(mode)) { + if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) + return -EINVAL; + rdev = sysv_encode_dev(rdev); + } else { + rdev = 0; + } + + if (IS_POSIXACL(dir)) { + default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(default_acl)) + return PTR_ERR(default_acl); + + if (!default_acl) + mode &= ~current_umask(); + } + + xfs_dentry_to_name(&name, dentry); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + if (unlikely(error)) + goto out_free_acl; + + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + + if (default_acl) { + error = -xfs_inherit_acl(inode, default_acl); + if (unlikely(error)) + goto out_cleanup_inode; + posix_acl_release(default_acl); + } + + + d_instantiate(dentry, inode); + return -error; + + out_cleanup_inode: + xfs_cleanup_inode(dir, inode, dentry); + out_free_acl: + posix_acl_release(default_acl); + return -error; +} + +STATIC int +xfs_vn_create( + struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + return xfs_vn_mknod(dir, dentry, mode, 0); +} + +STATIC int +xfs_vn_mkdir( + struct inode *dir, + struct dentry *dentry, + int mode) +{ + return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); +} + +STATIC struct dentry * +xfs_vn_lookup( + struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct xfs_inode *cip; + struct xfs_name name; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&name, dentry); + error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); + if (unlikely(error)) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + d_add(dentry, NULL); + return NULL; + } + + return d_splice_alias(VFS_I(cip), dentry); +} + +STATIC struct dentry * +xfs_vn_ci_lookup( + struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct xfs_inode *ip; + struct xfs_name xname; + struct xfs_name ci_name; + struct qstr dname; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&xname, dentry); + error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); + if (unlikely(error)) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + /* + * call d_add(dentry, NULL) here when d_drop_negative_children + * is called in xfs_vn_mknod (ie. allow negative dentries + * with CI filesystems). + */ + return NULL; + } + + /* if exact match, just splice and exit */ + if (!ci_name.name) + return d_splice_alias(VFS_I(ip), dentry); + + /* else case-insensitive match... */ + dname.name = ci_name.name; + dname.len = ci_name.len; + dentry = d_add_ci(dentry, VFS_I(ip), &dname); + kmem_free(ci_name.name); + return dentry; +} + +STATIC int +xfs_vn_link( + struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry); + + error = xfs_link(XFS_I(dir), XFS_I(inode), &name); + if (unlikely(error)) + return -error; + + ihold(inode); + d_instantiate(dentry, inode); + return 0; +} + +STATIC int +xfs_vn_unlink( + struct inode *dir, + struct dentry *dentry) +{ + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry); + + error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + if (error) + return error; + + /* + * With unlink, the VFS makes the dentry "negative": no inode, + * but still hashed. 
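One convention visible in xfs_vn_unlink() and its neighbours: the XFS core routines (xfs_remove(), xfs_link(), xfs_create(), ...) report errors as positive errnos, and these VFS entry points negate them on the way out. A tiny self-contained sketch of the pattern:

/* Stand-alone sketch of the sign-flip convention used throughout this
 * file: the XFS core returns positive errnos, the VFS expects negative
 * ones.  Function names are illustrative. */
#include <errno.h>

static int demo_core_remove(int should_fail)
{
        return should_fail ? EPERM : 0; /* core routine: positive errno */
}

static int demo_vfs_remove(int should_fail)
{
        int error = demo_core_remove(should_fail);

        return error ? -error : 0;      /* VFS wants -EPERM, not EPERM */
}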
This is incompatible with case-insensitive + * mode, so invalidate (unhash) the dentry in CI-mode. + */ + if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + d_invalidate(dentry); + return 0; +} + +STATIC int +xfs_vn_symlink( + struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + mode_t mode; + + mode = S_IFLNK | + (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); + xfs_dentry_to_name(&name, dentry); + + error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); + if (unlikely(error)) + goto out; + + inode = VFS_I(cip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + + d_instantiate(dentry, inode); + return 0; + + out_cleanup_inode: + xfs_cleanup_inode(dir, inode, dentry); + out: + return -error; +} + +STATIC int +xfs_vn_rename( + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry) +{ + struct inode *new_inode = ndentry->d_inode; + struct xfs_name oname; + struct xfs_name nname; + + xfs_dentry_to_name(&oname, odentry); + xfs_dentry_to_name(&nname, ndentry); + + return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + XFS_I(ndir), &nname, new_inode ? + XFS_I(new_inode) : NULL); +} + +/* + * careful here - this function can get called recursively, so + * we need to be very careful about how much stack we use. + * uio is kmalloced for this reason... + */ +STATIC void * +xfs_vn_follow_link( + struct dentry *dentry, + struct nameidata *nd) +{ + char *link; + int error = -ENOMEM; + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) + goto out_err; + + error = -xfs_readlink(XFS_I(dentry->d_inode), link); + if (unlikely(error)) + goto out_kfree; + + nd_set_link(nd, link); + return NULL; + + out_kfree: + kfree(link); + out_err: + nd_set_link(nd, ERR_PTR(error)); + return NULL; +} + +STATIC void +xfs_vn_put_link( + struct dentry *dentry, + struct nameidata *nd, + void *p) +{ + char *s = nd_get_link(nd); + + if (!IS_ERR(s)) + kfree(s); +} + +STATIC int +xfs_vn_getattr( + struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + trace_xfs_getattr(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + stat->size = XFS_ISIZE(ip); + stat->dev = inode->i_sb->s_dev; + stat->mode = ip->i_d.di_mode; + stat->nlink = ip->i_d.di_nlink; + stat->uid = ip->i_d.di_uid; + stat->gid = ip->i_d.di_gid; + stat->ino = ip->i_ino; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blocks = + XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + stat->blksize = BLKDEV_IOSIZE; + stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * If the file blocks are being allocated from a + * realtime volume, then return the inode's realtime + * extent size or the realtime volume's extent size. 
+ */ + stat->blksize = + xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + } else + stat->blksize = xfs_preferred_iosize(mp); + stat->rdev = 0; + break; + } + + return 0; +} + +STATIC int +xfs_vn_setattr( + struct dentry *dentry, + struct iattr *iattr) +{ + return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); +} + +#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +/* + * Call fiemap helper to fill in user data. + * Returns positive errors to xfs_getbmap. + */ +STATIC int +xfs_fiemap_format( + void **arg, + struct getbmapx *bmv, + int *full) +{ + int error; + struct fiemap_extent_info *fieinfo = *arg; + u32 fiemap_flags = 0; + u64 logical, physical, length; + + /* Do nothing for a hole */ + if (bmv->bmv_block == -1LL) + return 0; + + logical = BBTOB(bmv->bmv_offset); + physical = BBTOB(bmv->bmv_block); + length = BBTOB(bmv->bmv_length); + + if (bmv->bmv_oflags & BMV_OF_PREALLOC) + fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; + else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { + fiemap_flags |= FIEMAP_EXTENT_DELALLOC; + physical = 0; /* no block yet */ + } + if (bmv->bmv_oflags & BMV_OF_LAST) + fiemap_flags |= FIEMAP_EXTENT_LAST; + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, fiemap_flags); + if (error > 0) { + error = 0; + *full = 1; /* user array now full */ + } + + return -error; +} + +STATIC int +xfs_vn_fiemap( + struct inode *inode, + struct fiemap_extent_info *fieinfo, + u64 start, + u64 length) +{ + xfs_inode_t *ip = XFS_I(inode); + struct getbmapx bm; + int error; + + error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); + if (error) + return error; + + /* Set up bmap header for xfs internal routine */ + bm.bmv_offset = BTOBB(start); + /* Special case for whole file */ + if (length == FIEMAP_MAX_OFFSET) + bm.bmv_length = -1LL; + else + bm.bmv_length = BTOBB(length); + + /* We add one because in getbmap world count includes the header */ + bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : + fieinfo->fi_extents_max + 1; + bm.bmv_count = min_t(__s32, bm.bmv_count, + (PAGE_SIZE * 16 / sizeof(struct getbmapx))); + bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) + bm.bmv_iflags |= BMV_IF_ATTRFORK; + if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) + bm.bmv_iflags |= BMV_IF_DELALLOC; + + error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); + if (error) + return -error; + + return 0; +} + +static const struct inode_operations xfs_inode_operations = { + .check_acl = xfs_check_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .fiemap = xfs_vn_fiemap, +}; + +static const struct inode_operations xfs_dir_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. 
+ */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename = xfs_vn_rename, + .check_acl = xfs_check_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, +}; + +static const struct inode_operations xfs_dir_ci_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_ci_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename = xfs_vn_rename, + .check_acl = xfs_check_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, +}; + +static const struct inode_operations xfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = xfs_vn_follow_link, + .put_link = xfs_vn_put_link, + .check_acl = xfs_check_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, +}; + +STATIC void +xfs_diflags_to_iflags( + struct inode *inode, + struct xfs_inode *ip) +{ + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +/* + * Initialize the Linux inode, set up the operation vectors and + * unlock the inode. + * + * When reading existing inodes from disk this is called directly + * from xfs_iget, when creating a new inode it is called from + * xfs_ialloc after setting up the inode. + * + * We are always called with an uninitialised linux inode here. + * We need to initialise the necessary fields and take a reference + * on it. 
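Before moving on, a user-space view of xfs_vn_fiemap() above, which maps FIEMAP requests onto xfs_getbmap(): the caller issues FS_IOC_FIEMAP with a struct fiemap from <linux/fiemap.h>. A minimal sketch, with error handling trimmed:

/* Minimal FIEMAP consumer; this is the user-space counterpart of what
 * xfs_vn_fiemap()/xfs_fiemap_format() produce. */
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <stdlib.h>

static void dump_extents(int fd)
{
        const unsigned int next = 32;
        struct fiemap *fm = calloc(1, sizeof(*fm) +
                                   next * sizeof(struct fiemap_extent));
        unsigned int i;

        if (!fm)
                return;
        fm->fm_start = 0;
        fm->fm_length = FIEMAP_MAX_OFFSET;      /* whole file */
        fm->fm_extent_count = next;

        if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
                for (i = 0; i < fm->fm_mapped_extents; i++)
                        printf("logical %llu physical %llu length %llu\n",
                               (unsigned long long)fm->fm_extents[i].fe_logical,
                               (unsigned long long)fm->fm_extents[i].fe_physical,
                               (unsigned long long)fm->fm_extents[i].fe_length);
        free(fm);
}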
+ */ +void +xfs_setup_inode( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; + + inode->i_ino = ip->i_ino; + inode->i_state = I_NEW; + + inode_sb_list_add(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); + + inode->i_mode = ip->i_d.di_mode; + inode->i_nlink = ip->i_d.di_nlink; + inode->i_uid = ip->i_d.di_uid; + inode->i_gid = ip->i_d.di_gid; + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = + MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + inode->i_rdev = 0; + break; + } + + inode->i_generation = ip->i_d.di_gen; + i_size_write(inode, ip->i_d.di_size); + inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; + inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + xfs_diflags_to_iflags(inode, ip); + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + break; + case S_IFLNK: + inode->i_op = &xfs_symlink_inode_operations; + if (!(ip->i_df.if_flags & XFS_IFINLINE)) + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } + + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + + unlock_new_inode(inode); +} diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h new file mode 100644 index 00000000..ef41c92c --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOPS_H__ +#define __XFS_IOPS_H__ + +struct xfs_inode; + +extern const struct file_operations xfs_file_operations; +extern const struct file_operations xfs_dir_file_operations; + +extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); + +extern void xfs_setup_inode(struct xfs_inode *); + +#endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h new file mode 100644 index 00000000..87315168 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LINUX__ +#define __XFS_LINUX__ + +#include + +/* + * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. + * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set. + */ +#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) +# define XFS_BIG_BLKNOS 1 +# define XFS_BIG_INUMS 1 +#else +# define XFS_BIG_BLKNOS 0 +# define XFS_BIG_INUMS 0 +#endif + +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Feature macros (disable/enable) + */ +#ifdef CONFIG_SMP +#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ +#else +#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ +#endif + +#define irix_sgid_inherit xfs_params.sgid_inherit.val +#define irix_symlink_mode xfs_params.symlink_mode.val +#define xfs_panic_mask xfs_params.panic_mask.val +#define xfs_error_level xfs_params.error_level.val +#define xfs_syncd_centisecs xfs_params.syncd_timer.val +#define xfs_stats_clear xfs_params.stats_clear.val +#define xfs_inherit_sync xfs_params.inherit_sync.val +#define xfs_inherit_nodump xfs_params.inherit_nodump.val +#define xfs_inherit_noatime xfs_params.inherit_noatim.val +#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val +#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val +#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val +#define xfs_rotorstep xfs_params.rotorstep.val +#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val +#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val + +#define current_cpu() (raw_smp_processor_id()) +#define current_pid() (current->pid) +#define current_test_flags(f) (current->flags & (f)) +#define current_set_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags |= (f)) +#define current_clear_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags &= ~(f)) +#define current_restore_flags_nested(sp, f) \ + (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) + +#define spinlock_destroy(lock) + +#define NBBY 8 /* number of bits per byte */ + +/* + * Size of block device i/o is parameterized here. + * Currently the system supports page-sized i/o. 
+ */ +#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT +#define BLKDEV_IOSIZE (1<> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + *(__u64 *)a = c; + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + unsigned long __upper, __low, __high, __mod; + __u64 c = *(__u64 *)a; + __upper = __high = c >> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} +#else +static inline __u32 xfs_do_div(void *a, __u32 b, int n) +{ + __u32 mod; + + switch (n) { + case 4: + mod = *(__u32 *)a % b; + *(__u32 *)a = *(__u32 *)a / b; + return mod; + case 8: + mod = do_div(*(__u64 *)a, b); + return mod; + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + __u64 c = *(__u64 *)a; + return do_div(c, b); + } + } + + /* NOTREACHED */ + return 0; +} +#endif + +#undef do_div +#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) +#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) + +static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return(x * y); +} + +static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return x; +} + +/* ARM old ABI has some weird alignment/padding */ +#if defined(__arm__) && !defined(__ARM_EABI__) +#define __arch_pack __attribute__((packed)) +#else +#define __arch_pack +#endif + +#define ASSERT_ALWAYS(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifndef DEBUG +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#else /* DEBUG */ + +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifndef STATIC +# define STATIC noinline +#endif + +#endif /* DEBUG */ + +#endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c new file mode 100644 index 00000000..bd672def --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_message.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" + +/* + * XFS logging functions + */ +static void +__xfs_printk( + const char *level, + const struct xfs_mount *mp, + struct va_format *vaf) +{ + if (mp && mp->m_fsname) { + printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); + return; + } + printk("%sXFS: %pV\n", level, vaf); +} + +#define define_xfs_printk_level(func, kern_level) \ +void func(const struct xfs_mount *mp, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + __xfs_printk(kern_level, mp, &vaf); \ + va_end(args); \ +} \ + +define_xfs_printk_level(xfs_emerg, KERN_EMERG); +define_xfs_printk_level(xfs_alert, KERN_ALERT); +define_xfs_printk_level(xfs_crit, KERN_CRIT); +define_xfs_printk_level(xfs_err, KERN_ERR); +define_xfs_printk_level(xfs_warn, KERN_WARNING); +define_xfs_printk_level(xfs_notice, KERN_NOTICE); +define_xfs_printk_level(xfs_info, KERN_INFO); +#ifdef DEBUG +define_xfs_printk_level(xfs_debug, KERN_DEBUG); +#endif + +void +xfs_alert_tag( + const struct xfs_mount *mp, + int panic_tag, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int do_panic = 0; + + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + xfs_alert(mp, "Transforming an alert into a BUG."); + do_panic = 1; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(KERN_ALERT, mp, &vaf); + va_end(args); + + BUG_ON(do_panic); +} + +void +assfail(char *expr, char *file, int line) +{ + xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + BUG(); +} + +void +xfs_hex_dump(void *p, int length) +{ + print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); +} diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h new file mode 100644 index 00000000..7fb7ea00 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_message.h @@ -0,0 +1,39 @@ +#ifndef __XFS_MESSAGE_H +#define __XFS_MESSAGE_H 1 + +struct xfs_mount; + +extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); +extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + +#ifdef DEBUG +extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) 
+ __attribute__ ((format (printf, 2, 3))); +#else +static inline void +__attribute__ ((format (printf, 2, 3))) +xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +{ +} +#endif + +extern void assfail(char *expr, char *f, int l); + +extern void xfs_hex_dump(void *p, int length); + +#endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c new file mode 100644 index 00000000..29b9d642 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "quota/xfs_qm.h" +#include + + +STATIC int +xfs_quota_type(int type) +{ + switch (type) { + case USRQUOTA: + return XFS_DQ_USER; + case GRPQUOTA: + return XFS_DQ_GROUP; + default: + return XFS_DQ_PROJ; + } +} + +STATIC int +xfs_fs_get_xstate( + struct super_block *sb, + struct fs_quota_stat *fqs) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_qm_scall_getqstat(mp, fqs); +} + +STATIC int +xfs_fs_set_xstate( + struct super_block *sb, + unsigned int uflags, + int op) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + + if (uflags & FS_QUOTA_UDQ_ACCT) + flags |= XFS_UQUOTA_ACCT; + if (uflags & FS_QUOTA_PDQ_ACCT) + flags |= XFS_PQUOTA_ACCT; + if (uflags & FS_QUOTA_GDQ_ACCT) + flags |= XFS_GQUOTA_ACCT; + if (uflags & FS_QUOTA_UDQ_ENFD) + flags |= XFS_UQUOTA_ENFD; + if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) + flags |= XFS_OQUOTA_ENFD; + + switch (op) { + case Q_XQUOTAON: + return -xfs_qm_scall_quotaon(mp, flags); + case Q_XQUOTAOFF: + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + return -xfs_qm_scall_quotaoff(mp, flags); + case Q_XQUOTARM: + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + return -xfs_qm_scall_trunc_qfiles(mp, flags); + } + + return -EINVAL; +} + +STATIC int +xfs_fs_get_dqblk( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq); +} + +STATIC int +xfs_fs_set_dqblk( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); +} + 
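The quota handlers above follow the convention of returning XFS's positive error numbers negated (e.g. "return -xfs_qm_scall_getqstat(...)"), which is what the generic VFS quota layer expects before it propagates errno to user space. As a rough illustration of how this surface is reached, the sketch below issues Q_XGETQSTAT through quotactl(2), which the generic quota code dispatches to the ->get_xstate method registered in the table that follows; the device path is hypothetical and the example assumes an XFS filesystem with quota support built in — it is not part of this patch.

	#include <stdio.h>
	#include <sys/quota.h>
	#include <linux/dqblk_xfs.h>

	/*
	 * Minimal user-space sketch: query XFS quota state on a
	 * hypothetical device.  QCMD(Q_XGETQSTAT, USRQUOTA) is routed by
	 * the VFS to sb->s_qcop->get_xstate, i.e. xfs_fs_get_xstate()
	 * above; if quota accounting is not running, the -ENOSYS returned
	 * there shows up here as errno == ENOSYS.
	 */
	int main(void)
	{
		struct fs_quota_stat qstat;

		if (quotactl(QCMD(Q_XGETQSTAT, USRQUOTA), "/dev/sdb1", 0,
			     (void *)&qstat) != 0) {
			perror("Q_XGETQSTAT");
			return 1;
		}
		printf("qs_version=%d qs_flags=0x%x\n",
		       qstat.qs_version, qstat.qs_flags);
		return 0;
	}
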
+const struct quotactl_ops xfs_quotactl_operations = { + .get_xstate = xfs_fs_get_xstate, + .set_xstate = xfs_fs_set_xstate, + .get_dqblk = xfs_fs_get_dqblk, + .set_dqblk = xfs_fs_set_dqblk, +}; diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c new file mode 100644 index 00000000..76fdc586 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include + +DEFINE_PER_CPU(struct xfsstats, xfsstats); + +static int xfs_stat_proc_show(struct seq_file *m, void *v) +{ + int c, i, j, val; + __uint64_t xs_xstrat_bytes = 0; + __uint64_t xs_write_bytes = 0; + __uint64_t xs_read_bytes = 0; + + static const struct xstats_entry { + char *desc; + int endpoint; + } xstats[] = { + { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, + { "abt", XFSSTAT_END_ALLOC_BTREE }, + { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, + { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, + { "dir", XFSSTAT_END_DIRECTORY_OPS }, + { "trans", XFSSTAT_END_TRANSACTIONS }, + { "ig", XFSSTAT_END_INODE_OPS }, + { "log", XFSSTAT_END_LOG_OPS }, + { "push_ail", XFSSTAT_END_TAIL_PUSHING }, + { "xstrat", XFSSTAT_END_WRITE_CONVERT }, + { "rw", XFSSTAT_END_READ_WRITE_OPS }, + { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, + { "icluster", XFSSTAT_END_INODE_CLUSTER }, + { "vnodes", XFSSTAT_END_VNODE_OPS }, + { "buf", XFSSTAT_END_BUF }, + { "abtb2", XFSSTAT_END_ABTB_V2 }, + { "abtc2", XFSSTAT_END_ABTC_V2 }, + { "bmbt2", XFSSTAT_END_BMBT_V2 }, + { "ibt2", XFSSTAT_END_IBT_V2 }, + }; + + /* Loop over all stats groups */ + for (i=j = 0; i < ARRAY_SIZE(xstats); i++) { + seq_printf(m, "%s", xstats[i].desc); + /* inner loop does each group */ + while (j < xstats[i].endpoint) { + val = 0; + /* sum over all cpus */ + for_each_possible_cpu(c) + val += *(((__u32*)&per_cpu(xfsstats, c) + j)); + seq_printf(m, " %u", val); + j++; + } + seq_putc(m, '\n'); + } + /* extra precision counters */ + for_each_possible_cpu(i) { + xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; + xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; + xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + } + + seq_printf(m, "xpc %Lu %Lu %Lu\n", + xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + seq_printf(m, "debug %u\n", +#if defined(DEBUG) + 1); +#else + 0); +#endif + return 0; +} + +static int xfs_stat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xfs_stat_proc_show, NULL); +} + +static const struct file_operations xfs_stat_proc_fops = { + .owner = THIS_MODULE, + .open = xfs_stat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int +xfs_init_procfs(void) +{ + if (!proc_mkdir("fs/xfs", NULL)) + goto out; + + if (!proc_create("fs/xfs/stat", 0, NULL, + &xfs_stat_proc_fops)) + goto out_remove_entry; + return 0; + + out_remove_entry: + 
remove_proc_entry("fs/xfs", NULL); + out: + return -ENOMEM; +} + +void +xfs_cleanup_procfs(void) +{ + remove_proc_entry("fs/xfs/stat", NULL); + remove_proc_entry("fs/xfs", NULL); +} diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h new file mode 100644 index 00000000..736854b1 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_STATS_H__ +#define __XFS_STATS_H__ + + +#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) + +#include + +/* + * XFS global statistics + */ +struct xfsstats { +# define XFSSTAT_END_EXTENT_ALLOC 4 + __uint32_t xs_allocx; + __uint32_t xs_allocb; + __uint32_t xs_freex; + __uint32_t xs_freeb; +# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) + __uint32_t xs_abt_lookup; + __uint32_t xs_abt_compare; + __uint32_t xs_abt_insrec; + __uint32_t xs_abt_delrec; +# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) + __uint32_t xs_blk_mapr; + __uint32_t xs_blk_mapw; + __uint32_t xs_blk_unmap; + __uint32_t xs_add_exlist; + __uint32_t xs_del_exlist; + __uint32_t xs_look_exlist; + __uint32_t xs_cmp_exlist; +# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) + __uint32_t xs_bmbt_lookup; + __uint32_t xs_bmbt_compare; + __uint32_t xs_bmbt_insrec; + __uint32_t xs_bmbt_delrec; +# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) + __uint32_t xs_dir_lookup; + __uint32_t xs_dir_create; + __uint32_t xs_dir_remove; + __uint32_t xs_dir_getdents; +# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) + __uint32_t xs_trans_sync; + __uint32_t xs_trans_async; + __uint32_t xs_trans_empty; +# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) + __uint32_t xs_ig_attempts; + __uint32_t xs_ig_found; + __uint32_t xs_ig_frecycle; + __uint32_t xs_ig_missed; + __uint32_t xs_ig_dup; + __uint32_t xs_ig_reclaims; + __uint32_t xs_ig_attrchg; +# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) + __uint32_t xs_log_writes; + __uint32_t xs_log_blocks; + __uint32_t xs_log_noiclogs; + __uint32_t xs_log_force; + __uint32_t xs_log_force_sleep; +# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) + __uint32_t xs_try_logspace; + __uint32_t xs_sleep_logspace; + __uint32_t xs_push_ail; + __uint32_t xs_push_ail_success; + __uint32_t xs_push_ail_pushbuf; + __uint32_t xs_push_ail_pinned; + __uint32_t xs_push_ail_locked; + __uint32_t xs_push_ail_flushing; + __uint32_t xs_push_ail_restarts; + __uint32_t xs_push_ail_flush; +# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) + __uint32_t xs_xstrat_quick; + __uint32_t xs_xstrat_split; +# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) + __uint32_t xs_write_calls; + __uint32_t xs_read_calls; +# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) + __uint32_t xs_attr_get; + 
__uint32_t xs_attr_set; + __uint32_t xs_attr_remove; + __uint32_t xs_attr_list; +# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) + __uint32_t xs_iflush_count; + __uint32_t xs_icluster_flushcnt; + __uint32_t xs_icluster_flushinode; +# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) + __uint32_t vn_active; /* # vnodes not on free lists */ + __uint32_t vn_alloc; /* # times vn_alloc called */ + __uint32_t vn_get; /* # times vn_get called */ + __uint32_t vn_hold; /* # times vn_hold called */ + __uint32_t vn_rele; /* # times vn_rele called */ + __uint32_t vn_reclaim; /* # times vn_reclaim called */ + __uint32_t vn_remove; /* # times vn_remove called */ + __uint32_t vn_free; /* # times vn_free called */ +#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) + __uint32_t xb_get; + __uint32_t xb_create; + __uint32_t xb_get_locked; + __uint32_t xb_get_locked_waited; + __uint32_t xb_busy_locked; + __uint32_t xb_miss_locked; + __uint32_t xb_page_retries; + __uint32_t xb_page_found; + __uint32_t xb_get_read; +/* Version 2 btree counters */ +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) + __uint32_t xs_abtb_2_lookup; + __uint32_t xs_abtb_2_compare; + __uint32_t xs_abtb_2_insrec; + __uint32_t xs_abtb_2_delrec; + __uint32_t xs_abtb_2_newroot; + __uint32_t xs_abtb_2_killroot; + __uint32_t xs_abtb_2_increment; + __uint32_t xs_abtb_2_decrement; + __uint32_t xs_abtb_2_lshift; + __uint32_t xs_abtb_2_rshift; + __uint32_t xs_abtb_2_split; + __uint32_t xs_abtb_2_join; + __uint32_t xs_abtb_2_alloc; + __uint32_t xs_abtb_2_free; + __uint32_t xs_abtb_2_moves; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) + __uint32_t xs_abtc_2_lookup; + __uint32_t xs_abtc_2_compare; + __uint32_t xs_abtc_2_insrec; + __uint32_t xs_abtc_2_delrec; + __uint32_t xs_abtc_2_newroot; + __uint32_t xs_abtc_2_killroot; + __uint32_t xs_abtc_2_increment; + __uint32_t xs_abtc_2_decrement; + __uint32_t xs_abtc_2_lshift; + __uint32_t xs_abtc_2_rshift; + __uint32_t xs_abtc_2_split; + __uint32_t xs_abtc_2_join; + __uint32_t xs_abtc_2_alloc; + __uint32_t xs_abtc_2_free; + __uint32_t xs_abtc_2_moves; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) + __uint32_t xs_bmbt_2_lookup; + __uint32_t xs_bmbt_2_compare; + __uint32_t xs_bmbt_2_insrec; + __uint32_t xs_bmbt_2_delrec; + __uint32_t xs_bmbt_2_newroot; + __uint32_t xs_bmbt_2_killroot; + __uint32_t xs_bmbt_2_increment; + __uint32_t xs_bmbt_2_decrement; + __uint32_t xs_bmbt_2_lshift; + __uint32_t xs_bmbt_2_rshift; + __uint32_t xs_bmbt_2_split; + __uint32_t xs_bmbt_2_join; + __uint32_t xs_bmbt_2_alloc; + __uint32_t xs_bmbt_2_free; + __uint32_t xs_bmbt_2_moves; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) + __uint32_t xs_ibt_2_lookup; + __uint32_t xs_ibt_2_compare; + __uint32_t xs_ibt_2_insrec; + __uint32_t xs_ibt_2_delrec; + __uint32_t xs_ibt_2_newroot; + __uint32_t xs_ibt_2_killroot; + __uint32_t xs_ibt_2_increment; + __uint32_t xs_ibt_2_decrement; + __uint32_t xs_ibt_2_lshift; + __uint32_t xs_ibt_2_rshift; + __uint32_t xs_ibt_2_split; + __uint32_t xs_ibt_2_join; + __uint32_t xs_ibt_2_alloc; + __uint32_t xs_ibt_2_free; + __uint32_t xs_ibt_2_moves; +/* Extra precision counters */ + __uint64_t xs_xstrat_bytes; + __uint64_t xs_write_bytes; + __uint64_t xs_read_bytes; +}; + +DECLARE_PER_CPU(struct xfsstats, xfsstats); + +/* + * We don't disable preempt, not too worried about poking the + * wrong CPU's stat for now (also aggregated before reporting). 
+ */ +#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++) +#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) +#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) + +extern int xfs_init_procfs(void); +extern void xfs_cleanup_procfs(void); + + +#else /* !CONFIG_PROC_FS */ + +# define XFS_STATS_INC(count) +# define XFS_STATS_DEC(count) +# define XFS_STATS_ADD(count, inc) + +static inline int xfs_init_procfs(void) +{ + return 0; +} + +static inline void xfs_cleanup_procfs(void) +{ +} + +#endif /* !CONFIG_PROC_FS */ + +#endif /* __XFS_STATS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c new file mode 100644 index 00000000..e6ac98c1 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -0,0 +1,1720 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_ialloc.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_fsops.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_vnodeops.h" +#include "xfs_log_priv.h" +#include "xfs_trans_priv.h" +#include "xfs_filestream.h" +#include "xfs_da_btree.h" +#include "xfs_extfree_item.h" +#include "xfs_mru_cache.h" +#include "xfs_inode_item.h" +#include "xfs_sync.h" +#include "xfs_trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct super_operations xfs_super_operations; +static kmem_zone_t *xfs_ioend_zone; +mempool_t *xfs_ioend_pool; + +#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ +#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ +#define MNTOPT_LOGDEV "logdev" /* log device */ +#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ +#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ +#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ +#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ +#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ +#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ +#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ +#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ +#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ +#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ +#define MNTOPT_NOGRPID "nogrpid" /* group-ID from 
current process */ +#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ +#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ +#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ +#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ +#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and + * unwritten extent conversion */ +#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ +#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ +#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ +#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ +#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes + * in stat(). */ +#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ +#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ +#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ +#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ +#define MNTOPT_NOQUOTA "noquota" /* no quotas */ +#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ +#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ +#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ +#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ +#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ +#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ +#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ +#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ +#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ +#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ +#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ +#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ + +/* + * Table driven mount option parser. + * + * Currently only used for remount, but it will be used for mount + * in the future, too. + */ +enum { + Opt_barrier, Opt_nobarrier, Opt_err +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_err, NULL} +}; + + +STATIC unsigned long +suffix_strtoul(char *s, char **endp, unsigned int base) +{ + int last, shift_left_factor = 0; + char *value = s; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + return simple_strtoul((const char *)s, endp, base) << shift_left_factor; +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock has _not_ yet been read in. + * + * Note that this function leaks the various device name allocations on + * failure. The caller takes care of them. 
+ */ +STATIC int +xfs_parseargs( + struct xfs_mount *mp, + char *options) +{ + struct super_block *sb = mp->m_super; + char *this_char, *value, *eov; + int dsunit = 0; + int dswidth = 0; + int iosize = 0; + __uint8_t iosizelog = 0; + + /* + * set up the mount name first so all the errors will refer to the + * correct device. + */ + mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_fsname) + return ENOMEM; + mp->m_fsname_len = strlen(mp->m_fsname) + 1; + + /* + * Copy binary VFS mount flags we are interested in. + */ + if (sb->s_flags & MS_RDONLY) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (sb->s_flags & MS_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + if (sb->s_flags & MS_SYNCHRONOUS) + mp->m_flags |= XFS_MOUNT_WSYNC; + + /* + * Set some default flags that could be cleared by the mount option + * parsing. + */ + mp->m_flags |= XFS_MOUNT_BARRIER; + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + mp->m_flags |= XFS_MOUNT_DELAYLOG; + + /* + * These can be overridden by the mount option parsing. + */ + mp->m_logbufs = -1; + mp->m_logbsize = -1; + + if (!options) + goto done; + + while ((this_char = strsep(&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, MNTOPT_LOGBUFS)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_logbufs = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_logbsize = suffix_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_logname) + return ENOMEM; + } else if (!strcmp(this_char, MNTOPT_MTPT)) { + xfs_warn(mp, "%s option not allowed on this system", + this_char); + return EINVAL; + } else if (!strcmp(this_char, MNTOPT_RTDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_rtname) + return ENOMEM; + } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + iosize = simple_strtoul(value, &eov, 10); + iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + iosize = suffix_strtoul(value, &eov, 10); + iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_GRPID) || + !strcmp(this_char, MNTOPT_BSDGROUPS)) { + mp->m_flags |= XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_NOGRPID) || + !strcmp(this_char, MNTOPT_SYSVGROUPS)) { + mp->m_flags &= ~XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_WSYNC)) { + mp->m_flags |= XFS_MOUNT_WSYNC; + } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { + mp->m_flags |= XFS_MOUNT_NORECOVERY; + } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { + mp->m_flags |= XFS_MOUNT_NOALIGN; + } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { + mp->m_flags |= XFS_MOUNT_SWALLOC; + } else if (!strcmp(this_char, MNTOPT_SUNIT)) { + if (!value || !*value) { + xfs_warn(mp, "%s option 
requires an argument", + this_char); + return EINVAL; + } + dsunit = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + dswidth = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; +#if !XFS_BIG_INUMS + xfs_warn(mp, "%s option not allowed on this system", + this_char); + return EINVAL; +#endif + } else if (!strcmp(this_char, MNTOPT_NOUUID)) { + mp->m_flags |= XFS_MOUNT_NOUUID; + } else if (!strcmp(this_char, MNTOPT_BARRIER)) { + mp->m_flags |= XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + mp->m_flags |= XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { + mp->m_flags &= ~XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { + mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_ATTR2)) { + mp->m_flags |= XFS_MOUNT_ATTR2; + } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { + mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_QUOTA) || + !strcmp(this_char, MNTOPT_UQUOTA) || + !strcmp(this_char, MNTOPT_USRQUOTA)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_UQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || + !strcmp(this_char, MNTOPT_UQUOTANOENF)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_UQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_PQUOTA) || + !strcmp(this_char, MNTOPT_PRJQUOTA)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_OQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_OQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_GQUOTA) || + !strcmp(this_char, MNTOPT_GRPQUOTA)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_OQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_OQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { + mp->m_flags |= XFS_MOUNT_DELAYLOG; + } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { + mp->m_flags &= ~XFS_MOUNT_DELAYLOG; + } else if (!strcmp(this_char, MNTOPT_DISCARD)) { + mp->m_flags |= XFS_MOUNT_DISCARD; + } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { + mp->m_flags &= ~XFS_MOUNT_DISCARD; + } else if (!strcmp(this_char, "ihashsize")) { + xfs_warn(mp, + "ihashsize no longer used, option is deprecated."); + } else if (!strcmp(this_char, "osyncisdsync")) { + xfs_warn(mp, + "osyncisdsync has no effect, option is deprecated."); + } else if (!strcmp(this_char, "osyncisosync")) { + xfs_warn(mp, + "osyncisosync has no effect, option is deprecated."); + } else if (!strcmp(this_char, "irixsgid")) { + xfs_warn(mp, + "irixsgid is now a sysctl(2) variable, option is deprecated."); + } else { + 
xfs_warn(mp, "unknown mount option [%s].", this_char); + return EINVAL; + } + } + + /* + * no recovery flag requires a read-only mount + */ + if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { + xfs_warn(mp, + "sunit and swidth options incompatible with the noalign option"); + return EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_DISCARD) && + !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { + xfs_warn(mp, + "the discard option is incompatible with the nodelaylog option"); + return EINVAL; + } + +#ifndef CONFIG_XFS_QUOTA + if (XFS_IS_QUOTA_RUNNING(mp)) { + xfs_warn(mp, "quota support not available in this kernel."); + return EINVAL; + } +#endif + + if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && + (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { + xfs_warn(mp, "cannot mount with both project and group quota"); + return EINVAL; + } + + if ((dsunit && !dswidth) || (!dsunit && dswidth)) { + xfs_warn(mp, "sunit and swidth must be specified together"); + return EINVAL; + } + + if (dsunit && (dswidth % dsunit != 0)) { + xfs_warn(mp, + "stripe width (%d) must be a multiple of the stripe unit (%d)", + dswidth, dsunit); + return EINVAL; + } + +done: + if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { + /* + * At this point the superblock has not been read + * in, therefore we do not know the block size. + * Before the mount call ends we will convert + * these to FSBs. + */ + if (dsunit) { + mp->m_dalign = dsunit; + mp->m_flags |= XFS_MOUNT_RETERR; + } + + if (dswidth) + mp->m_swidth = dswidth; + } + + if (mp->m_logbufs != -1 && + mp->m_logbufs != 0 && + (mp->m_logbufs < XLOG_MIN_ICLOGS || + mp->m_logbufs > XLOG_MAX_ICLOGS)) { + xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", + mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return XFS_ERROR(EINVAL); + } + if (mp->m_logbsize != -1 && + mp->m_logbsize != 0 && + (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || + mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(mp->m_logbsize))) { + xfs_warn(mp, + "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + mp->m_logbsize); + return XFS_ERROR(EINVAL); + } + + if (iosizelog) { + if (iosizelog > XFS_MAX_IO_LOG || + iosizelog < XFS_MIN_IO_LOG) { + xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", + iosizelog, XFS_MIN_IO_LOG, + XFS_MAX_IO_LOG); + return XFS_ERROR(EINVAL); + } + + mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; + mp->m_readio_log = iosizelog; + mp->m_writeio_log = iosizelog; + } + + return 0; +} + +struct proc_xfs_info { + int flag; + char *str; +}; + +STATIC int +xfs_showargs( + struct xfs_mount *mp, + struct seq_file *m) +{ + static struct proc_xfs_info xfs_info_set[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, + { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, + { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, + { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, + { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, + { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, + { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, + { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, + { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, + { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, + { 0, NULL } + }; + static struct proc_xfs_info xfs_info_unset[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO }, + { 
XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE }, + { 0, NULL } + }; + struct proc_xfs_info *xfs_infop; + + for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { + if (mp->m_flags & xfs_infop->flag) + seq_puts(m, xfs_infop->str); + } + for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) { + if (!(mp->m_flags & xfs_infop->flag)) + seq_puts(m, xfs_infop->str); + } + + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", + (int)(1 << mp->m_writeio_log) >> 10); + + if (mp->m_logbufs > 0) + seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); + if (mp->m_logbsize > 0) + seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); + + if (mp->m_logname) + seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + if (mp->m_rtname) + seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + + if (mp->m_dalign > 0) + seq_printf(m, "," MNTOPT_SUNIT "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); + if (mp->m_swidth > 0) + seq_printf(m, "," MNTOPT_SWIDTH "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); + + if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) + seq_puts(m, "," MNTOPT_USRQUOTA); + else if (mp->m_qflags & XFS_UQUOTA_ACCT) + seq_puts(m, "," MNTOPT_UQUOTANOENF); + + /* Either project or group quotas can be active, not both */ + + if (mp->m_qflags & XFS_PQUOTA_ACCT) { + if (mp->m_qflags & XFS_OQUOTA_ENFD) + seq_puts(m, "," MNTOPT_PRJQUOTA); + else + seq_puts(m, "," MNTOPT_PQUOTANOENF); + } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { + if (mp->m_qflags & XFS_OQUOTA_ENFD) + seq_puts(m, "," MNTOPT_GRPQUOTA); + else + seq_puts(m, "," MNTOPT_GQUOTANOENF); + } + + if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) + seq_puts(m, "," MNTOPT_NOQUOTA); + + return 0; +} +__uint64_t +xfs_max_file_offset( + unsigned int blockshift) +{ + unsigned int pagefactor = 1; + unsigned int bitshift = BITS_PER_LONG - 1; + + /* Figure out maximum filesize, on Linux this can depend on + * the filesystem blocksize (on 32 bit platforms). + * __block_write_begin does this in an [unsigned] long... + * page->index << (PAGE_CACHE_SHIFT - bbits) + * So, for page sized blocks (4K on 32 bit platforms), + * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is + * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * but for smaller blocksizes it is less (bbits = log2 bsize). + * Note1: get_block_t takes a long (implicit cast from above) + * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch + * can optionally convert the [unsigned] long from above into + * an [unsigned] long long. 
+ */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBDAF) + ASSERT(sizeof(sector_t) == 8); + pagefactor = PAGE_CACHE_SIZE; + bitshift = BITS_PER_LONG; +# else + pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); +# endif +#endif + + return (((__uint64_t)pagefactor) << bitshift) - 1; +} + +STATIC int +xfs_blkdev_get( + xfs_mount_t *mp, + const char *name, + struct block_device **bdevp) +{ + int error = 0; + + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); + if (IS_ERR(*bdevp)) { + error = PTR_ERR(*bdevp); + xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); + } + + return -error; +} + +STATIC void +xfs_blkdev_put( + struct block_device *bdev) +{ + if (bdev) + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +void +xfs_blkdev_issue_flush( + xfs_buftarg_t *buftarg) +{ + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); +} + +STATIC void +xfs_close_devices( + struct xfs_mount *mp) +{ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + struct block_device *logdev = mp->m_logdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_logdev_targp); + xfs_blkdev_put(logdev); + } + if (mp->m_rtdev_targp) { + struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_blkdev_put(rtdev); + } + xfs_free_buftarg(mp, mp->m_ddev_targp); +} + +/* + * The file system configurations are: + * (1) device (partition) with data and internal log + * (2) logical volume with data and log subvolumes. + * (3) logical volume with data, log, and realtime subvolumes. + * + * We only have to handle opening the log and realtime volumes here if + * they are present. The data subvolume has already been opened by + * get_sb_bdev() and is stored in sb->s_bdev. + */ +STATIC int +xfs_open_devices( + struct xfs_mount *mp) +{ + struct block_device *ddev = mp->m_super->s_bdev; + struct block_device *logdev = NULL, *rtdev = NULL; + int error; + + /* + * Open real time and log devices - order is important. 
+ */ + if (mp->m_logname) { + error = xfs_blkdev_get(mp, mp->m_logname, &logdev); + if (error) + goto out; + } + + if (mp->m_rtname) { + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); + if (error) + goto out_close_logdev; + + if (rtdev == ddev || rtdev == logdev) { + xfs_warn(mp, + "Cannot mount filesystem with identical rtdev and ddev/logdev."); + error = EINVAL; + goto out_close_rtdev; + } + } + + /* + * Setup xfs_mount buffer target pointers + */ + error = ENOMEM; + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname); + if (!mp->m_ddev_targp) + goto out_close_rtdev; + + if (rtdev) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, + mp->m_fsname); + if (!mp->m_rtdev_targp) + goto out_free_ddev_targ; + } + + if (logdev && logdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, + mp->m_fsname); + if (!mp->m_logdev_targp) + goto out_free_rtdev_targ; + } else { + mp->m_logdev_targp = mp->m_ddev_targp; + } + + return 0; + + out_free_rtdev_targ: + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp, mp->m_rtdev_targp); + out_free_ddev_targ: + xfs_free_buftarg(mp, mp->m_ddev_targp); + out_close_rtdev: + if (rtdev) + xfs_blkdev_put(rtdev); + out_close_logdev: + if (logdev && logdev != ddev) + xfs_blkdev_put(logdev); + out: + return error; +} + +/* + * Setup xfs_mount buffer target pointers based on superblock + */ +STATIC int +xfs_setup_devices( + struct xfs_mount *mp) +{ + int error; + + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (error) + return error; + + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + unsigned int log_sector_size = BBSIZE; + + if (xfs_sb_version_hassector(&mp->m_sb)) + log_sector_size = mp->m_sb.sb_logsectsize; + error = xfs_setsize_buftarg(mp->m_logdev_targp, + mp->m_sb.sb_blocksize, + log_sector_size); + if (error) + return error; + } + if (mp->m_rtdev_targp) { + error = xfs_setsize_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (error) + return error; + } + + return 0; +} + +/* Catch misguided souls that try to use this interface on XFS */ +STATIC struct inode * +xfs_fs_alloc_inode( + struct super_block *sb) +{ + BUG(); + return NULL; +} + +/* + * Now that the generic code is guaranteed not to be accessing + * the linux inode, we can reclaim the inode. + */ +STATIC void +xfs_fs_destroy_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + trace_xfs_destroy_inode(ip); + + XFS_STATS_INC(vn_reclaim); + + /* bad inode, get out here ASAP */ + if (is_bad_inode(inode)) + goto out_reclaim; + + xfs_ioend_wait(ip); + + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + + /* + * We should never get here with one of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* + * We always use background reclaim here because even if the + * inode is clean, it still may be under IO and hence we have + * to take the flush lock. The background reclaim path handles + * this more efficiently than we can here, so simply let background + * reclaim tear down all inodes. + */ +out_reclaim: + xfs_inode_set_reclaim_tag(ip); +} + +/* + * Slab object creation initialisation for the XFS inode. + * This covers only the idempotent fields in the XFS inode; + * all other fields need to be initialised on allocation + * from the slab. 
This avoids the need to repeatedly initialise + * fields in the xfs inode that left in the initialise state + * when freeing the inode. + */ +STATIC void +xfs_fs_inode_init_once( + void *inode) +{ + struct xfs_inode *ip = inode; + + memset(ip, 0, sizeof(struct xfs_inode)); + + /* vfs inode */ + inode_init_once(VFS_I(ip)); + + /* xfs inode */ + atomic_set(&ip->i_iocount, 0); + atomic_set(&ip->i_pincount, 0); + spin_lock_init(&ip->i_flags_lock); + init_waitqueue_head(&ip->i_ipin_wait); + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&ip->i_flush); + complete(&ip->i_flush); + + mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); +} + +/* + * Dirty the XFS inode when mark_inode_dirty_sync() is called so that + * we catch unlogged VFS level updates to the inode. + * + * We need the barrier() to maintain correct ordering between unlogged + * updates and the transaction commit code that clears the i_update_core + * field. This requires all updates to be completed before marking the + * inode dirty. + */ +STATIC void +xfs_fs_dirty_inode( + struct inode *inode, + int flags) +{ + barrier(); + XFS_I(inode)->i_update_core = 1; +} + +STATIC int +xfs_fs_write_inode( + struct inode *inode, + struct writeback_control *wbc) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + int error = EAGAIN; + + trace_xfs_write_inode(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { + /* + * Make sure the inode has made it it into the log. Instead + * of forcing it all the way to stable storage using a + * synchronous transaction we let the log force inside the + * ->sync_fs call do that for thus, which reduces the number + * of synchronous log foces dramatically. + */ + xfs_ioend_wait(ip); + error = xfs_log_dirty_inode(ip, NULL, 0); + if (error) + goto out; + return 0; + } else { + if (!ip->i_update_core) + return 0; + + /* + * We make this non-blocking if the inode is contended, return + * EAGAIN to indicate to the caller that they did not succeed. + * This prevents the flush path from blocking on inodes inside + * another operation right now, they get caught later by + * xfs_sync. + */ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + goto out; + + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) + goto out_unlock; + + /* + * Now we have the flush lock and the inode is not pinned, we + * can check if the inode is really clean as we know that + * there are no pending transaction completions, it is not + * waiting on the delayed write queue and there is no IO in + * progress. + */ + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + error = 0; + goto out_unlock; + } + error = xfs_iflush(ip, SYNC_TRYLOCK); + } + + out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + out: + /* + * if we failed to write out the inode then mark + * it dirty again so we'll try again later. + */ + if (error) + xfs_mark_inode_dirty_sync(ip); + return -error; +} + +STATIC void +xfs_fs_evict_inode( + struct inode *inode) +{ + xfs_inode_t *ip = XFS_I(inode); + + trace_xfs_evict_inode(ip); + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + XFS_STATS_INC(vn_rele); + XFS_STATS_INC(vn_remove); + XFS_STATS_DEC(vn_active); + + /* + * The iolock is used by the file system to coordinate reads, + * writes, and block truncates. 
Up to this point the lock + * protected concurrent accesses by users of the inode. But + * from here forward we're doing some final processing of the + * inode because we're done with it, and although we reuse the + * iolock for protection it is really a distinct lock class + * (in the lockdep sense) from before. To keep lockdep happy + * (and basically indicate what we are doing), we explicitly + * re-init the iolock here. + */ + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); + + xfs_inactive(ip); +} + +STATIC void +xfs_free_fsname( + struct xfs_mount *mp) +{ + kfree(mp->m_fsname); + kfree(mp->m_rtname); + kfree(mp->m_logname); +} + +STATIC void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + /* + * Unregister the memory shrinker before we tear down the mount + * structure so we don't have memory reclaim racing with us here. + */ + xfs_inode_shrinker_unregister(mp); + xfs_syncd_stop(mp); + + /* + * Blow away any referenced inode in the filestreams cache. + * This can and will cause log traffic as inodes go inactive + * here. + */ + xfs_filestream_unmount(mp); + + XFS_bflush(mp->m_ddev_targp); + + xfs_unmountfs(mp); + xfs_freesb(mp); + xfs_icsb_destroy_counters(mp); + xfs_close_devices(mp); + xfs_free_fsname(mp); + kfree(mp); +} + +STATIC int +xfs_fs_sync_fs( + struct super_block *sb, + int wait) +{ + struct xfs_mount *mp = XFS_M(sb); + int error; + + /* + * Not much we can do for the first async pass. Writing out the + * superblock would be counter-productive as we are going to redirty + * when writing out other data and metadata (and writing out a single + * block is quite fast anyway). + * + * Try to asynchronously kick off quota syncing at least. + */ + if (!wait) { + xfs_qm_sync(mp, SYNC_TRYLOCK); + return 0; + } + + error = xfs_quiesce_data(mp); + if (error) + return -error; + + if (laptop_mode) { + /* + * The disk must be active because we're syncing. + * We schedule xfssyncd now (now that the disk is + * active) instead of later (when it might not be). + */ + flush_delayed_work_sync(&mp->m_sync_work); + } + + return 0; +} + +STATIC int +xfs_fs_statfs( + struct dentry *dentry, + struct kstatfs *statp) +{ + struct xfs_mount *mp = XFS_M(dentry->d_sb); + xfs_sb_t *sbp = &mp->m_sb; + struct xfs_inode *ip = XFS_I(dentry->d_inode); + __uint64_t fakeinos, id; + xfs_extlen_t lsize; + __int64_t ffree; + + statp->f_type = XFS_SB_MAGIC; + statp->f_namelen = MAXNAMELEN - 1; + + id = huge_encode_dev(mp->m_ddev_targp->bt_dev); + statp->f_fsid.val[0] = (u32)id; + statp->f_fsid.val[1] = (u32)(id >> 32); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + + spin_lock(&mp->m_sb_lock); + statp->f_bsize = sbp->sb_blocksize; + lsize = sbp->sb_logstart ? 
sbp->sb_logblocks : 0; + statp->f_blocks = sbp->sb_dblocks - lsize; + statp->f_bfree = statp->f_bavail = + sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + fakeinos = statp->f_bfree << sbp->sb_inopblog; + statp->f_files = + MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + if (mp->m_maxicount) + statp->f_files = min_t(typeof(statp->f_files), + statp->f_files, + mp->m_maxicount); + + /* make sure statp->f_ffree does not underflow */ + ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + statp->f_ffree = max_t(__int64_t, ffree, 0); + + spin_unlock(&mp->m_sb_lock); + + if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) + xfs_qm_statvfs(ip, statp); + return 0; +} + +STATIC void +xfs_save_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks = 0; + + mp->m_resblks_save = mp->m_resblks; + xfs_reserve_blocks(mp, &resblks, NULL); +} + +STATIC void +xfs_restore_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks; + + if (mp->m_resblks_save) { + resblks = mp->m_resblks_save; + mp->m_resblks_save = 0; + } else + resblks = xfs_default_resblks(mp); + + xfs_reserve_blocks(mp, &resblks, NULL); +} + +STATIC int +xfs_fs_remount( + struct super_block *sb, + int *flags, + char *options) +{ + struct xfs_mount *mp = XFS_M(sb); + substring_t args[MAX_OPT_ARGS]; + char *p; + int error; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + mp->m_flags |= XFS_MOUNT_BARRIER; + break; + case Opt_nobarrier: + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; + default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 + xfs_info(mp, + "mount option \"%s\" not supported for remount\n", p); + return -EINVAL; +#else + break; +#endif + } + } + + /* ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + /* + * If this is the first remount to writeable state we + * might have some superblock changes to update. + */ + if (mp->m_update_flags) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + return error; + } + mp->m_update_flags = 0; + } + + /* + * Fill out the reserve pool if it is empty. Use the stashed + * value if it is non-zero, otherwise go with the default. + */ + xfs_restore_resvblks(mp); + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* + * After we have synced the data but before we sync the + * metadata, we need to free up the reserve block pool so that + * the used block count in the superblock on disk is correct at + * the end of the remount. Stash the current reserve pool size + * so that if we get remounted rw, we can return it to the same + * size. 
+ */ + + xfs_quiesce_data(mp); + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + } + + return 0; +} + +/* + * Second stage of a freeze. The data is already frozen so we only + * need to take care of the metadata. Once that's done write a dummy + * record to dirty the log in case of a crash while frozen. + */ +STATIC int +xfs_fs_freeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + return -xfs_fs_log_dummy(mp); +} + +STATIC int +xfs_fs_unfreeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_restore_resvblks(mp); + return 0; +} + +STATIC int +xfs_fs_show_options( + struct seq_file *m, + struct vfsmount *mnt) +{ + return -xfs_showargs(XFS_M(mnt->mnt_sb), m); +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock _has_ now been read in. + */ +STATIC int +xfs_finish_flags( + struct xfs_mount *mp) +{ + int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); + + /* Fail a mount where the logbuf is smaller than the log stripe */ + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if (mp->m_logbsize <= 0 && + mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { + mp->m_logbsize = mp->m_sb.sb_logsunit; + } else if (mp->m_logbsize > 0 && + mp->m_logbsize < mp->m_sb.sb_logsunit) { + xfs_warn(mp, + "logbuf size must be greater than or equal to log stripe size"); + return XFS_ERROR(EINVAL); + } + } else { + /* Fail a mount if the logbuf is larger than 32K */ + if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { + xfs_warn(mp, + "logbuf size for version 1 logs must be 16K or 32K"); + return XFS_ERROR(EINVAL); + } + } + + /* + * mkfs'ed attr2 will turn on attr2 mount unless explicitly + * told by noattr2 to turn it off + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + + /* + * prohibit r/w mounts of read-only filesystems + */ + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + xfs_warn(mp, + "cannot mount a read-only filesystem as read-write"); + return XFS_ERROR(EROFS); + } + + return 0; +} + +STATIC int +xfs_fs_fill_super( + struct super_block *sb, + void *data, + int silent) +{ + struct inode *root; + struct xfs_mount *mp = NULL; + int flags = 0, error = ENOMEM; + + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); + if (!mp) + goto out; + + spin_lock_init(&mp->m_sb_lock); + mutex_init(&mp->m_growlock); + atomic_set(&mp->m_active_trans, 0); + + mp->m_super = sb; + sb->s_fs_info = mp; + + error = xfs_parseargs(mp, (char *)data); + if (error) + goto out_free_fsname; + + sb_min_blocksize(sb, BBSIZE); + sb->s_xattr = xfs_xattr_handlers; + sb->s_export_op = &xfs_export_operations; +#ifdef CONFIG_XFS_QUOTA + sb->s_qcop = &xfs_quotactl_operations; +#endif + sb->s_op = &xfs_super_operations; + + if (silent) + flags |= XFS_MFSI_QUIET; + + error = xfs_open_devices(mp); + if (error) + goto out_free_fsname; + + error = xfs_icsb_init_counters(mp); + if (error) + goto out_close_devices; + + error = xfs_readsb(mp, flags); + if (error) + goto out_destroy_counters; + + error = xfs_finish_flags(mp); + if (error) + goto out_free_sb; + + error = xfs_setup_devices(mp); + if (error) + goto out_free_sb; + + error = xfs_filestream_mount(mp); + if (error) + goto out_free_sb; + + /* + * we must configure the block size in the superblock before we run the + * full mount process as the mount process can lookup and cache inodes. 
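xfs_finish_flags() above enforces two simple rules on the log buffer size: on version 2 logs it must be at least the log stripe unit (and defaults to it when unset), while version 1 logs only allow 16K/32K buffers. A compact sketch of that check, where the 32K constant stands in for XLOG_BIG_RECORD_BSIZE and the test inputs are assumptions:

/* sketch of the logbuf-vs-log-stripe validation in xfs_finish_flags */
#include <stdio.h>

#define BIG_RECORD_BSIZE	(32 * 1024)

static int check_logbsize(int v2_log, int logbsize, int logsunit)
{
	if (v2_log) {
		if (logbsize <= 0 && logsunit > BIG_RECORD_BSIZE)
			return logsunit;	/* default to the stripe unit */
		if (logbsize > 0 && logbsize < logsunit)
			return -1;		/* smaller than log stripe: reject */
		return logbsize;
	}
	/* version 1 logs only support 16K/32K buffers */
	if (logbsize > BIG_RECORD_BSIZE)
		return -1;
	return logbsize;
}

int main(void)
{
	printf("%d\n", check_logbsize(1, 0, 64 * 1024));	 /* -> 65536 */
	printf("%d\n", check_logbsize(1, 16 * 1024, 64 * 1024)); /* -> -1    */
	printf("%d\n", check_logbsize(0, 64 * 1024, 0));	 /* -> -1    */
	return 0;
}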
+ * For the same reason we must also initialise the syncd and register + * the inode cache shrinker so that inodes can be reclaimed during + * operations like a quotacheck that iterate all inodes in the + * filesystem. + */ + sb->s_magic = XFS_SB_MAGIC; + sb->s_blocksize = mp->m_sb.sb_blocksize; + sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; + sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_time_gran = 1; + set_posix_acl_flag(sb); + + xfs_inode_shrinker_register(mp); + + error = xfs_mountfs(mp); + if (error) + goto out_filestream_unmount; + + error = xfs_syncd_init(mp); + if (error) + goto out_unmount; + + root = igrab(VFS_I(mp->m_rootip)); + if (!root) { + error = ENOENT; + goto out_syncd_stop; + } + if (is_bad_inode(root)) { + error = EINVAL; + goto out_syncd_stop; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + error = ENOMEM; + goto out_iput; + } + + return 0; + + out_filestream_unmount: + xfs_inode_shrinker_unregister(mp); + xfs_filestream_unmount(mp); + out_free_sb: + xfs_freesb(mp); + out_destroy_counters: + xfs_icsb_destroy_counters(mp); + out_close_devices: + xfs_close_devices(mp); + out_free_fsname: + xfs_free_fsname(mp); + kfree(mp); + out: + return -error; + + out_iput: + iput(root); + out_syncd_stop: + xfs_syncd_stop(mp); + out_unmount: + xfs_inode_shrinker_unregister(mp); + + /* + * Blow away any referenced inode in the filestreams cache. + * This can and will cause log traffic as inodes go inactive + * here. + */ + xfs_filestream_unmount(mp); + + XFS_bflush(mp->m_ddev_targp); + + xfs_unmountfs(mp); + goto out_free_sb; +} + +STATIC struct dentry * +xfs_fs_mount( + struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); +} + +static const struct super_operations xfs_super_operations = { + .alloc_inode = xfs_fs_alloc_inode, + .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, + .write_inode = xfs_fs_write_inode, + .evict_inode = xfs_fs_evict_inode, + .put_super = xfs_fs_put_super, + .sync_fs = xfs_fs_sync_fs, + .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, + .statfs = xfs_fs_statfs, + .remount_fs = xfs_fs_remount, + .show_options = xfs_fs_show_options, +}; + +static struct file_system_type xfs_fs_type = { + .owner = THIS_MODULE, + .name = "xfs", + .mount = xfs_fs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +STATIC int __init +xfs_init_zones(void) +{ + + xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); + if (!xfs_ioend_zone) + goto out; + + xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, + xfs_ioend_zone); + if (!xfs_ioend_pool) + goto out_destroy_ioend_zone; + + xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), + "xfs_log_ticket"); + if (!xfs_log_ticket_zone) + goto out_destroy_ioend_pool; + + xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), + "xfs_bmap_free_item"); + if (!xfs_bmap_free_item_zone) + goto out_destroy_log_ticket_zone; + + xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), + "xfs_btree_cur"); + if (!xfs_btree_cur_zone) + goto out_destroy_bmap_free_item_zone; + + xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), + "xfs_da_state"); + if (!xfs_da_state_zone) + goto out_destroy_btree_cur_zone; + + xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); + if (!xfs_dabuf_zone) + goto out_destroy_da_state_zone; + + xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), 
"xfs_ifork"); + if (!xfs_ifork_zone) + goto out_destroy_dabuf_zone; + + xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); + if (!xfs_trans_zone) + goto out_destroy_ifork_zone; + + xfs_log_item_desc_zone = + kmem_zone_init(sizeof(struct xfs_log_item_desc), + "xfs_log_item_desc"); + if (!xfs_log_item_desc_zone) + goto out_destroy_trans_zone; + + /* + * The size of the zone allocated buf log item is the maximum + * size possible under XFS. This wastes a little bit of memory, + * but it is much faster. + */ + xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + + (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / + NBWORD) * sizeof(int))), "xfs_buf_item"); + if (!xfs_buf_item_zone) + goto out_destroy_log_item_desc_zone; + + xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efd_item"); + if (!xfs_efd_zone) + goto out_destroy_buf_item_zone; + + xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efi_item"); + if (!xfs_efi_zone) + goto out_destroy_efd_zone; + + xfs_inode_zone = + kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, + xfs_fs_inode_init_once); + if (!xfs_inode_zone) + goto out_destroy_efi_zone; + + xfs_ili_zone = + kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", + KM_ZONE_SPREAD, NULL); + if (!xfs_ili_zone) + goto out_destroy_inode_zone; + + return 0; + + out_destroy_inode_zone: + kmem_zone_destroy(xfs_inode_zone); + out_destroy_efi_zone: + kmem_zone_destroy(xfs_efi_zone); + out_destroy_efd_zone: + kmem_zone_destroy(xfs_efd_zone); + out_destroy_buf_item_zone: + kmem_zone_destroy(xfs_buf_item_zone); + out_destroy_log_item_desc_zone: + kmem_zone_destroy(xfs_log_item_desc_zone); + out_destroy_trans_zone: + kmem_zone_destroy(xfs_trans_zone); + out_destroy_ifork_zone: + kmem_zone_destroy(xfs_ifork_zone); + out_destroy_dabuf_zone: + kmem_zone_destroy(xfs_dabuf_zone); + out_destroy_da_state_zone: + kmem_zone_destroy(xfs_da_state_zone); + out_destroy_btree_cur_zone: + kmem_zone_destroy(xfs_btree_cur_zone); + out_destroy_bmap_free_item_zone: + kmem_zone_destroy(xfs_bmap_free_item_zone); + out_destroy_log_ticket_zone: + kmem_zone_destroy(xfs_log_ticket_zone); + out_destroy_ioend_pool: + mempool_destroy(xfs_ioend_pool); + out_destroy_ioend_zone: + kmem_zone_destroy(xfs_ioend_zone); + out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_zones(void) +{ + kmem_zone_destroy(xfs_ili_zone); + kmem_zone_destroy(xfs_inode_zone); + kmem_zone_destroy(xfs_efi_zone); + kmem_zone_destroy(xfs_efd_zone); + kmem_zone_destroy(xfs_buf_item_zone); + kmem_zone_destroy(xfs_log_item_desc_zone); + kmem_zone_destroy(xfs_trans_zone); + kmem_zone_destroy(xfs_ifork_zone); + kmem_zone_destroy(xfs_dabuf_zone); + kmem_zone_destroy(xfs_da_state_zone); + kmem_zone_destroy(xfs_btree_cur_zone); + kmem_zone_destroy(xfs_bmap_free_item_zone); + kmem_zone_destroy(xfs_log_ticket_zone); + mempool_destroy(xfs_ioend_pool); + kmem_zone_destroy(xfs_ioend_zone); + +} + +STATIC int __init +xfs_init_workqueues(void) +{ + /* + * max_active is set to 8 to give enough concurency to allow + * multiple work operations on each CPU to run. This allows multiple + * filesystems to be running sync work concurrently, and scales with + * the number of CPUs in the system. 
+ */ + xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); + if (!xfs_syncd_wq) + return -ENOMEM; + return 0; +} + +STATIC void +xfs_destroy_workqueues(void) +{ + destroy_workqueue(xfs_syncd_wq); +} + +STATIC int __init +init_xfs_fs(void) +{ + int error; + + printk(KERN_INFO XFS_VERSION_STRING " with " + XFS_BUILD_OPTIONS " enabled\n"); + + xfs_ioend_init(); + xfs_dir_startup(); + + error = xfs_init_zones(); + if (error) + goto out; + + error = xfs_init_workqueues(); + if (error) + goto out_destroy_zones; + + error = xfs_mru_cache_init(); + if (error) + goto out_destroy_wq; + + error = xfs_filestream_init(); + if (error) + goto out_mru_cache_uninit; + + error = xfs_buf_init(); + if (error) + goto out_filestream_uninit; + + error = xfs_init_procfs(); + if (error) + goto out_buf_terminate; + + error = xfs_sysctl_register(); + if (error) + goto out_cleanup_procfs; + + vfs_initquota(); + + error = register_filesystem(&xfs_fs_type); + if (error) + goto out_sysctl_unregister; + return 0; + + out_sysctl_unregister: + xfs_sysctl_unregister(); + out_cleanup_procfs: + xfs_cleanup_procfs(); + out_buf_terminate: + xfs_buf_terminate(); + out_filestream_uninit: + xfs_filestream_uninit(); + out_mru_cache_uninit: + xfs_mru_cache_uninit(); + out_destroy_wq: + xfs_destroy_workqueues(); + out_destroy_zones: + xfs_destroy_zones(); + out: + return error; +} + +STATIC void __exit +exit_xfs_fs(void) +{ + vfs_exitquota(); + unregister_filesystem(&xfs_fs_type); + xfs_sysctl_unregister(); + xfs_cleanup_procfs(); + xfs_buf_terminate(); + xfs_filestream_uninit(); + xfs_mru_cache_uninit(); + xfs_destroy_workqueues(); + xfs_destroy_zones(); +} + +module_init(init_xfs_fs); +module_exit(exit_xfs_fs); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); +MODULE_LICENSE("GPL"); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h new file mode 100644 index 00000000..50a3266c --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
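init_xfs_fs()/exit_xfs_fs() above follow the usual rule that module exit mirrors module init in strict reverse order, with the same goto-unwind on partial failure. A minimal module skeleton showing just that shape; the demo_* names and stub stages are assumptions, not XFS code:

/*
 * Minimal module skeleton: exit is the strict reverse of init, and a
 * failure part-way through init unwinds only the stages already done.
 * The demo_* names and stub stages are assumptions.
 */
#include <linux/module.h>
#include <linux/init.h>

static int  demo_zones_init(void)    { return 0; }	/* stand-in stage 1 */
static void demo_zones_destroy(void) { }
static int  demo_wq_init(void)       { return 0; }	/* stand-in stage 2 */
static void demo_wq_destroy(void)    { }

static int __init demo_init(void)
{
	int error;

	error = demo_zones_init();
	if (error)
		goto out;

	error = demo_wq_init();
	if (error)
		goto out_destroy_zones;

	return 0;

 out_destroy_zones:
	demo_zones_destroy();
 out:
	return error;
}

static void __exit demo_exit(void)
{
	/* tear down in the reverse order of demo_init() */
	demo_wq_destroy();
	demo_zones_destroy();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("init/exit ordering sketch");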
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPER_H__ +#define __XFS_SUPER_H__ + +#include + +#ifdef CONFIG_XFS_QUOTA +extern void xfs_qm_init(void); +extern void xfs_qm_exit(void); +# define vfs_initquota() xfs_qm_init() +# define vfs_exitquota() xfs_qm_exit() +#else +# define vfs_initquota() do { } while (0) +# define vfs_exitquota() do { } while (0) +#endif + +#ifdef CONFIG_XFS_POSIX_ACL +# define XFS_ACL_STRING "ACLs, " +# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +#else +# define XFS_ACL_STRING +# define set_posix_acl_flag(sb) do { } while (0) +#endif + +#define XFS_SECURITY_STRING "security attributes, " + +#ifdef CONFIG_XFS_RT +# define XFS_REALTIME_STRING "realtime, " +#else +# define XFS_REALTIME_STRING +#endif + +#if XFS_BIG_BLKNOS +# if XFS_BIG_INUMS +# define XFS_BIGFS_STRING "large block/inode numbers, " +# else +# define XFS_BIGFS_STRING "large block numbers, " +# endif +#else +# define XFS_BIGFS_STRING +#endif + +#ifdef DEBUG +# define XFS_DBG_STRING "debug" +#else +# define XFS_DBG_STRING "no debug" +#endif + +#define XFS_VERSION_STRING "SGI XFS" +#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ + XFS_SECURITY_STRING \ + XFS_REALTIME_STRING \ + XFS_BIGFS_STRING \ + XFS_DBG_STRING /* DBG must be last */ + +struct xfs_inode; +struct xfs_mount; +struct xfs_buftarg; +struct block_device; + +extern __uint64_t xfs_max_file_offset(unsigned int); + +extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); + +extern const struct export_operations xfs_export_operations; +extern const struct xattr_handler *xfs_xattr_handlers[]; +extern const struct quotactl_ops xfs_quotactl_operations; + +#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) + +#endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c new file mode 100644 index 00000000..2f277a04 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -0,0 +1,1132 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_fsops.h" + +#include +#include + +struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ + +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. 
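Two header idioms carry most of xfs_super.h above: config-gated helpers that compile away to do { } while (0) no-ops, and a build-options banner assembled from adjacent string literals. A tiny userspace sketch of both, with invented names and a fake config symbol:

/* sketch of the conditional no-op macro and string-banner idioms; names assumed */
#include <stdio.h>

#define CONFIG_DEMO_ACL 1

#ifdef CONFIG_DEMO_ACL
# define DEMO_ACL_STRING	"ACLs, "
# define set_acl_flag(flags)	((flags) |= 0x1)
#else
# define DEMO_ACL_STRING
# define set_acl_flag(flags)	do { } while (0)	/* compiles away entirely */
#endif

#ifdef DEBUG
# define DEMO_DBG_STRING	"debug"
#else
# define DEMO_DBG_STRING	"no debug"
#endif

/* adjacent literals concatenate at compile time, like XFS_BUILD_OPTIONS */
#define DEMO_BUILD_OPTIONS	DEMO_ACL_STRING DEMO_DBG_STRING

int main(void)
{
	unsigned int flags = 0;

	set_acl_flag(flags);
	printf("demo fs with " DEMO_BUILD_OPTIONS " enabled (flags=%#x)\n", flags);
	return 0;
}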
The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + +STATIC int +xfs_inode_ag_walk_grab( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EFSCORRUPTED; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return ENOENT; + + if (is_bad_inode(inode)) { + IRELE(ip); + return ENOENT; + } + + /* inode is valid */ + return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return ENOENT; +} + +STATIC int +xfs_inode_ag_walk( + struct xfs_mount *mp, + struct xfs_perag *pag, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags) +{ + uint32_t first_index; + int last_error = 0; + int skipped; + int done; + int nr_found; + +restart: + done = 0; + skipped = 0; + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int error = 0; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + if (!nr_found) { + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_inode_ag_walk_grab(ip)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = execute(batch[i], pag, flags); + IRELE(batch[i]); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. 
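The walk above pulls inodes out of the per-AG radix tree in small batches: look up a batch under the RCU read lock, take references and advance the cursor, drop the lock, and only then do the real work. A userspace sketch of that shape, where a plain array and a mutex stand in for the radix tree and RCU (all names and sizes are assumptions):

/* sketch of the batched lookup-then-process loop in xfs_inode_ag_walk */
#include <pthread.h>
#include <stdio.h>

#define NITEMS		10
#define LOOKUP_BATCH	4

static int items[NITEMS] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
static pthread_mutex_t lookup_lock = PTHREAD_MUTEX_INITIALIZER;

static void walk(void (*execute)(int item))
{
	unsigned int first_index = 0;
	int batch[LOOKUP_BATCH];
	int nr_found;

	do {
		int i;

		pthread_mutex_lock(&lookup_lock);	/* rcu_read_lock() in XFS */
		nr_found = 0;
		while (nr_found < LOOKUP_BATCH &&
		       first_index + nr_found < NITEMS) {
			batch[nr_found] = items[first_index + nr_found];
			nr_found++;
		}
		first_index += nr_found;		/* cursor for the next pass */
		pthread_mutex_unlock(&lookup_lock);

		for (i = 0; i < nr_found; i++)		/* do the work unlocked */
			execute(batch[i]);
	} while (nr_found);
}

static void print_item(int item) { printf("%d\n", item); }

int main(void) { walk(print_item); return 0; }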
*/ + if (error == EFSCORRUPTED) + break; + + } while (nr_found && !done); + + if (skipped) { + delay(1); + goto restart; + } + return last_error; +} + +int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } + } + return XFS_ERROR(last_error); +} + +STATIC int +xfs_sync_inode_data( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct inode *inode = VFS_I(ip); + struct address_space *mapping = inode->i_mapping; + int error = 0; + + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + goto out_wait; + + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { + if (flags & SYNC_TRYLOCK) + goto out_wait; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } + + error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? + 0 : XBF_ASYNC, FI_NONE); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + out_wait: + if (flags & SYNC_WAIT) + xfs_ioend_wait(ip); + return error; +} + +STATIC int +xfs_sync_inode_attr( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_inode_clean(ip)) + goto out_unlock; + if (!xfs_iflock_nowait(ip)) { + if (!(flags & SYNC_WAIT)) + goto out_unlock; + xfs_iflock(ip); + } + + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + goto out_unlock; + } + + error = xfs_iflush(ip, flags); + + /* + * We don't want to try again on non-blocking flushes that can't run + * again immediately. If an inode really must be written, then that's + * what the SYNC_WAIT flag is for. + */ + if (error == EAGAIN) { + ASSERT(!(flags & SYNC_WAIT)); + error = 0; + } + + out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Write out pagecache data for the whole filesystem. + */ +STATIC int +xfs_sync_data( + struct xfs_mount *mp, + int flags) +{ + int error; + + ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + + error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); + if (error) + return XFS_ERROR(error); + + xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); + return 0; +} + +/* + * Write out inode metadata (attributes) for the whole filesystem. + */ +STATIC int +xfs_sync_attr( + struct xfs_mount *mp, + int flags) +{ + ASSERT((flags & ~SYNC_WAIT) == 0); + + return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); +} + +STATIC int +xfs_sync_fsdata( + struct xfs_mount *mp) +{ + struct xfs_buf *bp; + + /* + * If the buffer is pinned then push on the log so we won't get stuck + * waiting in the write for someone, maybe ourselves, to flush the log. + * + * Even though we just pushed the log above, we did not have the + * superblock buffer locked at that point so it can become pinned in + * between there and here. 
+ */ + bp = xfs_getsb(mp, 0); + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); + + return xfs_bwrite(mp, bp); +} + +int +xfs_log_dirty_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (!ip->i_update_core) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp, 0); +} + +/* + * When remounting a filesystem read-only or freezing the filesystem, we have + * two phases to execute. This first phase is syncing the data before we + * quiesce the filesystem, and the second is flushing all the inodes out after + * we've waited for all the transactions created by the first phase to + * complete. The second phase ensures that the inodes are written to their + * location on disk rather than just existing in transactions in the log. This + * means after a quiesce there is no log replay required to write the inodes to + * disk (this is the main difference between a sync and a quiesce). + */ +/* + * First stage of freeze - no writers will make progress now we are here, + * so we flush delwri and delalloc buffers here, then wait for all I/O to + * complete. Data is frozen at that point. Metadata is not frozen, + * transactions can still occur here so don't bother flushing the buftarg + * because it'll just get dirty again. + */ +int +xfs_quiesce_data( + struct xfs_mount *mp) +{ + int error, error2 = 0; + + /* push non-blocking */ + xfs_sync_data(mp, 0); + xfs_qm_sync(mp, SYNC_TRYLOCK); + + /* push and block till complete */ + xfs_sync_data(mp, SYNC_WAIT); + + /* + * Log all pending size and timestamp updates. The vfs writeback + * code is supposed to do this, but due to its overagressive + * livelock detection it will skip inodes where appending writes + * were written out in the first non-blocking sync phase if their + * completion took long enough that it happened after taking the + * timestamp for the cut-off in the blocking phase. + */ + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); + + xfs_qm_sync(mp, SYNC_WAIT); + + /* write superblock and hoover up shutdown errors */ + error = xfs_sync_fsdata(mp); + + /* make sure all delwri buffers are written out */ + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + /* mark the log as covered if needed */ + if (xfs_log_need_covered(mp)) + error2 = xfs_fs_log_dummy(mp); + + /* flush data-only devices */ + if (mp->m_rtdev_targp) + XFS_bflush(mp->m_rtdev_targp); + + return error ? error : error2; +} + +STATIC void +xfs_quiesce_fs( + struct xfs_mount *mp) +{ + int count = 0, pincount; + + xfs_reclaim_inodes(mp, 0); + xfs_flush_buftarg(mp->m_ddev_targp, 0); + + /* + * This loop must run at least twice. The first instance of the loop + * will flush most meta data but that will generate more meta data + * (typically directory updates). Which then must be flushed and + * logged before we can write the unmount record. We also so sync + * reclaim of inodes to catch any that the above delwri flush skipped. + */ + do { + xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_sync_attr(mp, SYNC_WAIT); + pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (!pincount) { + delay(50); + count++; + } + } while (count < 2); +} + +/* + * Second stage of a quiesce. 
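xfs_quiesce_fs() above flushes metadata in a loop because flushing generates more metadata (directory updates), and a pass only counts when nothing was left pinned; two such clean passes are required before the unmount record can be written. A small userspace sketch of that termination rule; flush_metadata() and its countdown are stand-ins for the real flush:

/* sketch of the "flush until it stays clean twice" loop in xfs_quiesce_fs */
#include <stdio.h>
#include <unistd.h>

static int dirty = 3;	/* pretend three rounds of metadata remain (assumed) */

static int flush_metadata(void)	/* returns the number of still-pinned buffers */
{
	if (dirty > 0)
		dirty--;
	return dirty;
}

int main(void)
{
	int count = 0, pincount;

	do {
		pincount = flush_metadata();
		if (!pincount) {
			usleep(50 * 1000);	/* delay(50) in the kernel code */
			count++;
		}
	} while (count < 2);

	printf("quiesced after the metadata stayed clean twice\n");
	return 0;
}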
The data is already synced, now we have to take + * care of the metadata. New transactions are already blocked, so we need to + * wait for any remaining transactions to drain out before proceeding. + */ +void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* flush inodes and push all remaining buffers out to disk */ + xfs_quiesce_fs(mp); + + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp, 1); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + xfs_log_unmount_write(mp); + xfs_unmountfs_writesb(mp); +} + +static void +xfs_syncd_queue_sync( + struct xfs_mount *mp) +{ + queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items, reclaim inodes and sync + * disk quotas. We might need to cover the log to indicate that the + * filesystem is idle and not frozen. + */ +STATIC void +xfs_sync_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_sync_work); + int error; + + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + /* dgc: errors ignored here */ + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + error = xfs_qm_sync(mp, SYNC_TRYLOCK); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + } + + /* queue us up again */ + xfs_syncd_queue_sync(mp); +} + +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs syncd work default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_syncd_queue_reclaim( + struct xfs_mount *mp) +{ + + /* + * We can have inodes enter reclaim after we've shut down the syncd + * workqueue during unmount, so don't allow reclaim work to be queued + * during unmount. + */ + if (!(mp->m_super->s_flags & MS_ACTIVE)) + return; + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +STATIC void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_syncd_queue_reclaim(mp); +} + +/* + * Flush delayed allocate data, attempting to free up reserved space + * from existing allocations. 
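The periodic sync above is driven by a delayed work item that re-queues itself at the end of every pass, so the cadence survives without a dedicated kernel thread. A sketch of that hookup against the same workqueue calls the patch uses; struct demo_mount, its names and the 30 second interval are assumptions:

/* sketch of a self-rescheduling delayed work item, as in xfs_sync_worker */
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/errno.h>

struct demo_mount {
	struct workqueue_struct	*wq;
	struct delayed_work	sync_work;
};

static void demo_queue_sync(struct demo_mount *dm)
{
	queue_delayed_work(dm->wq, &dm->sync_work, msecs_to_jiffies(30000));
}

static void demo_sync_worker(struct work_struct *work)
{
	struct demo_mount *dm = container_of(to_delayed_work(work),
					     struct demo_mount, sync_work);

	/* ... push the log, sync quotas, etc. would go here ... */

	demo_queue_sync(dm);		/* queue the next periodic pass */
}

static int demo_syncd_init(struct demo_mount *dm)
{
	dm->wq = alloc_workqueue("demo_syncd", WQ_CPU_INTENSIVE, 8);
	if (!dm->wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&dm->sync_work, demo_sync_worker);
	demo_queue_sync(dm);
	return 0;
}

static void demo_syncd_stop(struct demo_mount *dm)
{
	cancel_delayed_work_sync(&dm->sync_work);
	destroy_workqueue(dm->wq);
}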
At this point a new allocation attempt + * has failed with ENOSPC and we are in the process of scratching our + * heads, looking about for more room. + * + * Queue a new data flush if there isn't one already in progress and + * wait for completion of the flush. This means that we only ever have one + * inode flush in progress no matter how many ENOSPC events are occurring and + * so will prevent the system from bogging down due to every concurrent + * ENOSPC event scanning all the active inodes in the system for writeback. + */ +void +xfs_flush_inodes( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + queue_work(xfs_syncd_wq, &mp->m_flush_work); + flush_work_sync(&mp->m_flush_work); +} + +STATIC void +xfs_flush_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(work, + struct xfs_mount, m_flush_work); + + xfs_sync_data(mp, SYNC_TRYLOCK); + xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); +} + +int +xfs_syncd_init( + struct xfs_mount *mp) +{ + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + + xfs_syncd_queue_sync(mp); + xfs_syncd_queue_reclaim(mp); + + return 0; +} + +void +xfs_syncd_stop( + struct xfs_mount *mp) +{ + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); +} + +void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_syncd_queue_reclaim(ip->i_mount); + + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + pag->pag_ici_reclaimable++; +} + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_set_reclaim_tag(pag, ip); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +__xfs_inode_clear_reclaim( + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } +} + +void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + +/* + * Grab the inode for reclaim exclusively. + * Return 0 if we grabbed it, non-zero otherwise. 
+ */ +STATIC int +xfs_reclaim_inode_grab( + struct xfs_inode *ip, + int flags) +{ + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; + + /* + * do some unlocked checks first to avoid unnecessary lock traffic. + * The first is a flush lock check, the second is a already in reclaim + * check. Only do these checks if we are not going to block on locks. + */ + if ((flags & SYNC_TRYLOCK) && + (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { + return 1; + } + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check the + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. + */ + spin_lock(&ip->i_flags_lock); + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ + spin_unlock(&ip->i_flags_lock); + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + return 0; +} + +/* + * Inodes in different states need to be treated differently, and the return + * value of xfs_iflush is not sufficient to get this right. The following table + * lists the inode states and the reclaim actions necessary for non-blocking + * reclaim: + * + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, delwri ok 0 requeue + * dirty, delwri blocked EAGAIN requeue + * dirty, sync flush 0 reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * As can be seen from the table, the return value of xfs_iflush() is not + * sufficient to correctly decide the reclaim action here. The checks in + * xfs_iflush() might look like duplicates, but they are not. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. The clean inode check needs to be done before flushing + * the inode delwri otherwise we would loop forever requeuing clean inodes as + * we cannot tell apart a successful delwri flush and a clean inode from the + * return value of xfs_iflush(). + * + * Note that because the inode is flushed delayed write by background + * writeback, the flush lock may already be held here and waiting on it can + * result in very long latencies. Hence for sync reclaims, where we wait on the + * flush lock, the caller should push out delayed write inodes first before + * trying to reclaim them to minimise the amount of time spent waiting. For + * background relaim, we just requeue the inode for the next pass. 
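The state table above is easier to follow as straight-line checks in the order the code applies them. A sketch of that decision as a plain function; the flag structure and the action enum are assumptions made only to mirror the ordering, not real XFS types:

/* sketch of the reclaim decision table; types and names are assumed */
#include <stdio.h>

enum action { RECLAIM, REQUEUE, UNPIN_AND_RECLAIM, FLUSH };

struct istate {
	int bad, shutdown, pinned, stale, clean, sync_wait;
};

static enum action reclaim_action(const struct istate *st)
{
	if (st->bad)
		return RECLAIM;
	if (st->shutdown)
		return UNPIN_AND_RECLAIM;
	if (st->pinned && !st->sync_wait)
		return REQUEUE;		/* non-blocking pass: try again later */
	if (st->stale || st->clean)
		return RECLAIM;
	return FLUSH;			/* dirty: flush, then reclaim or requeue */
}

int main(void)
{
	struct istate dirty_background = { .pinned = 1, .sync_wait = 0 };

	printf("%d\n", reclaim_action(&dirty_background));	/* REQUEUE */
	return 0;
}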
+ * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned, delwri => requeue + * pinned, sync => unpin + * stale => reclaim + * clean => reclaim + * dirty, delwri => flush and requeue + * dirty, sync => flush, wait and reclaim + */ +STATIC int +xfs_reclaim_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int sync_mode) +{ + int error; + +restart: + error = 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + + /* + * If we only have a single dirty inode in a cluster there is + * a fair chance that the AIL push may have pushed it into + * the buffer, but xfsbufd won't touch it until 30 seconds + * from now, and thus we will lock up here. + * + * Promote the inode buffer to the front of the delwri list + * and wake up xfsbufd now. + */ + xfs_promote_inode(ip); + xfs_iflock(ip); + } + + if (is_bad_inode(VFS_I(ip))) + goto reclaim; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + goto reclaim; + } + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) { + xfs_ifunlock(ip); + goto out; + } + xfs_iunpin_wait(ip); + } + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* + * Now we have an inode that needs flushing. + * + * We do a nonblocking flush here even if we are doing a SYNC_WAIT + * reclaim as we can deadlock with inode cluster removal. + * xfs_ifree_cluster() can lock the inode buffer before it locks the + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_itobp() to get the cluster buffer will result + * in an ABBA deadlock with xfs_ifree_cluster(). + * + * As xfs_ifree_cluser() must gather all inodes that are active in the + * cache to mark them stale, if we hit this case we don't actually want + * to do IO here - we want the inode marked stale so we can simply + * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, + * just unlock the inode, back off and try again. Hopefully the next + * pass through will see the stale flag set on the inode. + */ + error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); + if (sync_mode & SYNC_WAIT) { + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; + } + xfs_iflock(ip); + goto reclaim; + } + + /* + * When we have to flush an inode but don't have SYNC_WAIT set, we + * flush the inode out using a delwri buffer and wait for the next + * call into reclaim to find it in a clean state instead of waiting for + * it now. We also don't return errors here - if the error is transient + * then the next reclaim pass will flush the inode, and if the error + * is permanent then the next sync reclaim will reclaim the inode and + * pass on the error. + */ + if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_warn(ip->i_mount, + "inode 0x%llx background reclaim flush failed with %d", + (long long)ip->i_ino, error); + } +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and xfssyncd never goes back to the idle + * state. Instead, return 0 to let the next scheduled background reclaim + * attempt to reclaim the inode again. 
+ */ + return 0; + +reclaim: + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + XFS_STATS_INC(xs_ig_reclaims); + /* + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. + */ + spin_lock(&pag->pag_ici_lock); + if (!radix_tree_delete(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); + spin_unlock(&pag->pag_ici_lock); + + /* + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. We get + * both the ilock and the iolock because the code may need to drop the + * ilock one but will still hold the iolock. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + xfs_inode_free(ip); + return error; + +} + +/* + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is + * corrupted, we still want to try to reclaim all the inodes. If we don't, + * then a shut down during filesystem unmount reclaim walk leak all the + * unreclaimed inodes. + */ +int +xfs_reclaim_inodes_ag( + struct xfs_mount *mp, + int flags, + int *nr_to_scan) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + int trylock = flags & SYNC_TRYLOCK; + int skipped; + +restart: + ag = 0; + skipped = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + unsigned long first_index = 0; + int done = 0; + int nr_found = 0; + + ag = pag->pag_agno + 1; + + if (trylock) { + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { + skipped++; + xfs_perag_put(pag); + continue; + } + first_index = pag->pag_ici_reclaim_cursor; + } else + mutex_lock(&pag->pag_ici_reclaim_lock); + + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH, + XFS_ICI_RECLAIM_TAG); + if (!nr_found) { + done = 1; + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_reclaim_inode_grab(ip, flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can + * occur if we have inodes in the last block of + * the AG and we are currently pointing to the + * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that lead us + * to see this inode, so another lookup from + * the same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. 
*/ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_reclaim_inode(batch[i], pag, flags); + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + *nr_to_scan -= XFS_LOOKUP_BATCH; + + } while (nr_found && !done && *nr_to_scan > 0); + + if (trylock && !done) + pag->pag_ici_reclaim_cursor = first_index; + else + pag->pag_ici_reclaim_cursor = 0; + mutex_unlock(&pag->pag_ici_reclaim_lock); + xfs_perag_put(pag); + } + + /* + * if we skipped any AG, and we still have scan count remaining, do + * another pass this time using blocking reclaim semantics (i.e + * waiting on the reclaim locks and ignoring the reclaim cursors). This + * ensure that when we get more reclaimers than AGs we block rather + * than spin trying to execute reclaim. + */ + if (trylock && skipped && *nr_to_scan > 0) { + trylock = 0; + goto restart; + } + return XFS_ERROR(last_error); +} + +int +xfs_reclaim_inodes( + xfs_mount_t *mp, + int mode) +{ + int nr_to_scan = INT_MAX; + + return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); +} + +/* + * Inode cache shrinker. + * + * When called we make sure that there is a background (fast) inode reclaim in + * progress, while we will throttle the speed of reclaim via doiing synchronous + * reclaim of inodes. That means if we come across dirty inodes, we wait for + * them to be cleaned, which we hope will not be very long due to the + * background walker having already kicked the IO off on those dirty inodes. + */ +static int +xfs_reclaim_inode_shrink( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_mount *mp; + struct xfs_perag *pag; + xfs_agnumber_t ag; + int reclaimable; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + mp = container_of(shrink, struct xfs_mount, m_inode_shrink); + if (nr_to_scan) { + /* kick background reclaimer and push the AIL */ + xfs_syncd_queue_reclaim(mp); + xfs_ail_push_all(mp->m_ail); + + if (!(gfp_mask & __GFP_FS)) + return -1; + + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, + &nr_to_scan); + /* terminate if we don't exhaust the scan */ + if (nr_to_scan > 0) + return -1; + } + + reclaimable = 0; + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + ag = pag->pag_agno + 1; + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + return reclaimable; +} + +void +xfs_inode_shrinker_register( + struct xfs_mount *mp) +{ + mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink; + mp->m_inode_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&mp->m_inode_shrink); +} + +void +xfs_inode_shrinker_unregister( + struct xfs_mount *mp) +{ + unregister_shrinker(&mp->m_inode_shrink); +} diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h new file mode 100644 index 00000000..ef5b2ce4 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_SYNC_H +#define XFS_SYNC_H 1 + +struct xfs_mount; +struct xfs_perag; + +typedef struct xfs_sync_work { + struct list_head w_list; + struct xfs_mount *w_mount; + void *w_data; /* syncer routine argument */ + void (*w_syncer)(struct xfs_mount *, void *); + struct completion *w_completion; +} xfs_sync_work_t; + +#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ +#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ + +extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ + +int xfs_syncd_init(struct xfs_mount *mp); +void xfs_syncd_stop(struct xfs_mount *mp); + +int xfs_quiesce_data(struct xfs_mount *mp); +void xfs_quiesce_attr(struct xfs_mount *mp); + +void xfs_flush_inodes(struct xfs_inode *ip); + +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); + +int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); + +void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); +void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, + struct xfs_inode *ip); + +int xfs_sync_inode_grab(struct xfs_inode *ip); +int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), + int flags); + +void xfs_inode_shrinker_register(struct xfs_mount *mp); +void xfs_inode_shrinker_unregister(struct xfs_mount *mp); + +#endif diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c new file mode 100644 index 00000000..ee2d2ada --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include +#include +#include "xfs_error.h" + +static struct ctl_table_header *xfs_table_header; + +#ifdef CONFIG_PROC_FS +STATIC int +xfs_stats_clear_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int c, ret, *valp = ctl->data; + __uint32_t vn_active; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + + if (!ret && write && *valp) { + xfs_notice(NULL, "Clearing xfsstats"); + for_each_possible_cpu(c) { + preempt_disable(); + /* save vn_active, it's a universal truth! 
*/ + vn_active = per_cpu(xfsstats, c).vn_active; + memset(&per_cpu(xfsstats, c), 0, + sizeof(struct xfsstats)); + per_cpu(xfsstats, c).vn_active = vn_active; + preempt_enable(); + } + xfs_stats_clear = 0; + } + + return ret; +} + +STATIC int +xfs_panic_mask_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} +#endif /* CONFIG_PROC_FS */ + +static ctl_table xfs_table[] = { + { + .procname = "irix_sgid_inherit", + .data = &xfs_params.sgid_inherit.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.sgid_inherit.min, + .extra2 = &xfs_params.sgid_inherit.max + }, + { + .procname = "irix_symlink_mode", + .data = &xfs_params.symlink_mode.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.symlink_mode.min, + .extra2 = &xfs_params.symlink_mode.max + }, + { + .procname = "panic_mask", + .data = &xfs_params.panic_mask.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_panic_mask_proc_handler, + .extra1 = &xfs_params.panic_mask.min, + .extra2 = &xfs_params.panic_mask.max + }, + + { + .procname = "error_level", + .data = &xfs_params.error_level.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.error_level.min, + .extra2 = &xfs_params.error_level.max + }, + { + .procname = "xfssyncd_centisecs", + .data = &xfs_params.syncd_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.syncd_timer.min, + .extra2 = &xfs_params.syncd_timer.max + }, + { + .procname = "inherit_sync", + .data = &xfs_params.inherit_sync.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_sync.min, + .extra2 = &xfs_params.inherit_sync.max + }, + { + .procname = "inherit_nodump", + .data = &xfs_params.inherit_nodump.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodump.min, + .extra2 = &xfs_params.inherit_nodump.max + }, + { + .procname = "inherit_noatime", + .data = &xfs_params.inherit_noatim.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_noatim.min, + .extra2 = &xfs_params.inherit_noatim.max + }, + { + .procname = "xfsbufd_centisecs", + .data = &xfs_params.xfs_buf_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.xfs_buf_timer.min, + .extra2 = &xfs_params.xfs_buf_timer.max + }, + { + .procname = "age_buffer_centisecs", + .data = &xfs_params.xfs_buf_age.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.xfs_buf_age.min, + .extra2 = &xfs_params.xfs_buf_age.max + }, + { + .procname = "inherit_nosymlinks", + .data = &xfs_params.inherit_nosym.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nosym.min, + .extra2 = &xfs_params.inherit_nosym.max + }, + { + .procname = "rotorstep", + .data = &xfs_params.rotorstep.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + 
.extra1 = &xfs_params.rotorstep.min, + .extra2 = &xfs_params.rotorstep.max + }, + { + .procname = "inherit_nodefrag", + .data = &xfs_params.inherit_nodfrg.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodfrg.min, + .extra2 = &xfs_params.inherit_nodfrg.max + }, + { + .procname = "filestream_centisecs", + .data = &xfs_params.fstrm_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.fstrm_timer.min, + .extra2 = &xfs_params.fstrm_timer.max, + }, + /* please keep this the last entry */ +#ifdef CONFIG_PROC_FS + { + .procname = "stats_clear", + .data = &xfs_params.stats_clear.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_stats_clear_proc_handler, + .extra1 = &xfs_params.stats_clear.min, + .extra2 = &xfs_params.stats_clear.max + }, +#endif /* CONFIG_PROC_FS */ + + {} +}; + +static ctl_table xfs_dir_table[] = { + { + .procname = "xfs", + .mode = 0555, + .child = xfs_table + }, + {} +}; + +static ctl_table xfs_root_table[] = { + { + .procname = "fs", + .mode = 0555, + .child = xfs_dir_table + }, + {} +}; + +int +xfs_sysctl_register(void) +{ + xfs_table_header = register_sysctl_table(xfs_root_table); + if (!xfs_table_header) + return -ENOMEM; + return 0; +} + +void +xfs_sysctl_unregister(void) +{ + unregister_sysctl_table(xfs_table_header); +} diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h new file mode 100644 index 00000000..b9937d45 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sysctl.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYSCTL_H__ +#define __XFS_SYSCTL_H__ + +#include + +/* + * Tunable xfs parameters + */ + +typedef struct xfs_sysctl_val { + int min; + int val; + int max; +} xfs_sysctl_val_t; + +typedef struct xfs_param { + xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is + * not a member of parent dir GID. */ + xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ + xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ + xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ + xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ + xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */ + xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ + xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ + xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ + xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ + xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ + xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. 
*/ + xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ + xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ + xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ +} xfs_param_t; + +/* + * xfs_error_level: + * + * How much error reporting will be done when internal problems are + * encountered. These problems normally return an EFSCORRUPTED to their + * caller, with no other information reported. + * + * 0 No error reports + * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown + * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any + * additional errors that are known to not cause shutdowns) + * + * xfs_panic_mask bit 0x8 turns the error reports into panics + */ + +enum { + /* XFS_REFCACHE_SIZE = 1 */ + /* XFS_REFCACHE_PURGE = 2 */ + /* XFS_RESTRICT_CHOWN = 3 */ + XFS_SGID_INHERIT = 4, + XFS_SYMLINK_MODE = 5, + XFS_PANIC_MASK = 6, + XFS_ERRLEVEL = 7, + XFS_SYNCD_TIMER = 8, + /* XFS_PROBE_DMAPI = 9 */ + /* XFS_PROBE_IOOPS = 10 */ + /* XFS_PROBE_QUOTA = 11 */ + XFS_STATS_CLEAR = 12, + XFS_INHERIT_SYNC = 13, + XFS_INHERIT_NODUMP = 14, + XFS_INHERIT_NOATIME = 15, + XFS_BUF_TIMER = 16, + XFS_BUF_AGE = 17, + /* XFS_IO_BYPASS = 18 */ + XFS_INHERIT_NOSYM = 19, + XFS_ROTORSTEP = 20, + XFS_INHERIT_NODFRG = 21, + XFS_FILESTREAM_TIMER = 22, +}; + +extern xfs_param_t xfs_params; + +#ifdef CONFIG_SYSCTL +extern int xfs_sysctl_register(void); +extern void xfs_sysctl_unregister(void); +#else +# define xfs_sysctl_register() (0) +# define xfs_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +#endif /* __XFS_SYSCTL_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c new file mode 100644 index 00000000..88d25d4a --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_mount.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_quota.h" +#include "xfs_iomap.h" +#include "xfs_aops.h" +#include "quota/xfs_dquot_item.h" +#include "quota/xfs_dquot.h" +#include "xfs_log_recover.h" +#include "xfs_inode_item.h" + +/* + * We include this last to have the helpers above available for the trace + * event implementations. 
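Each tunable in xfs_param_t is an xfs_sysctl_val_t triple whose min and max members are wired into the extra1/extra2 pointers of the table entries above, so proc_dointvec_minmax() rejects out-of-range writes while val holds the live setting. A self-contained sketch of that contract (the limits used here are illustrative, not the kernel's defaults):

    #include <stdio.h>

    typedef struct demo_sysctl_val {
        int min;
        int val;
        int max;
    } demo_sysctl_val_t;

    /* Mirror of the proc_dointvec_minmax contract: accept a write only if
     * the new value lies within [min, max]; otherwise report an error and
     * leave the current value untouched. */
    static int demo_set(demo_sysctl_val_t *v, int new_val)
    {
        if (new_val < v->min || new_val > v->max)
            return -1;      /* the kernel handler returns -EINVAL here */
        v->val = new_val;
        return 0;
    }

    int main(void)
    {
        demo_sysctl_val_t panic_mask = { .min = 0, .val = 0, .max = 255 };

        printf("set 8   -> %d (val=%d)\n", demo_set(&panic_mask, 8), panic_mask.val);
        printf("set 999 -> %d (val=%d)\n", demo_set(&panic_mask, 999), panic_mask.val);
        return 0;
    }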
+ */ +#define CREATE_TRACE_POINTS +#include "xfs_trace.h" diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h new file mode 100644 index 00000000..d48b7a57 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -0,0 +1,1776 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xfs + +#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XFS_H + +#include + +struct xfs_agf; +struct xfs_alloc_arg; +struct xfs_attr_list_context; +struct xfs_buf_log_item; +struct xfs_da_args; +struct xfs_da_node_entry; +struct xfs_dquot; +struct xlog_ticket; +struct log; +struct xlog_recover; +struct xlog_recover_item; +struct xfs_buf_log_format; +struct xfs_inode_log_format; + +DECLARE_EVENT_CLASS(xfs_attr_list_class, + TP_PROTO(struct xfs_attr_list_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + ) +) + +#define DEFINE_ATTR_LIST_EVENT(name) \ +DEFINE_EVENT(xfs_attr_list_class, name, \ + TP_PROTO(struct xfs_attr_list_context *ctx), \ + TP_ARGS(ctx)) +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); + +DECLARE_EVENT_CLASS(xfs_perag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, + unsigned long caller_ip), + TP_ARGS(mp, agno, refcount, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, refcount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->refcount 
= refcount; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u refcount %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->refcount, + (char *)__entry->caller_ip) +); + +#define DEFINE_PERAG_REF_EVENT(name) \ +DEFINE_EVENT(xfs_perag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, refcount, caller_ip)) +DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); + +TRACE_EVENT(xfs_attr_list_node_descend, + TP_PROTO(struct xfs_attr_list_context *ctx, + struct xfs_da_node_entry *btree), + TP_ARGS(ctx, btree), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + __field(u32, bt_hashval) + __field(u32, bt_before) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + __entry->bt_hashval = be32_to_cpu(btree->hashval); + __entry->bt_before = be32_to_cpu(btree->before); + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s " + "node hashval %u, node before %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __entry->bt_hashval, + __entry->bt_before) +); + +TRACE_EVENT(xfs_iext_insert, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, + struct xfs_bmbt_irec *r, int state, unsigned long caller_ip), + TP_ARGS(ip, idx, r, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r->br_startoff; + __entry->startblock = r->br_startblock; + __entry->blockcount = r->br_blockcount; + __entry->state = r->br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_bmap_class, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + unsigned long caller_ip), + TP_ARGS(ip, idx, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, 
dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? + ip->i_afp : &ip->i_df; + struct xfs_bmbt_irec r; + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r.br_startoff; + __entry->startblock = r.br_startblock; + __entry->blockcount = r.br_blockcount; + __entry->state = r.br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +) + +#define DEFINE_BMAP_EVENT(name) \ +DEFINE_EVENT(xfs_bmap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ + unsigned long caller_ip), \ + TP_ARGS(ip, idx, state, caller_ip)) +DEFINE_BMAP_EVENT(xfs_iext_remove); +DEFINE_BMAP_EVENT(xfs_bmap_pre_update); +DEFINE_BMAP_EVENT(xfs_bmap_post_update); +DEFINE_BMAP_EVENT(xfs_extlist); + +DECLARE_EVENT_CLASS(xfs_buf_class, + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), + TP_ARGS(bp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_EVENT(name) \ +DEFINE_EVENT(xfs_buf_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ + TP_ARGS(bp, caller_ip)) +DEFINE_BUF_EVENT(xfs_buf_init); +DEFINE_BUF_EVENT(xfs_buf_free); +DEFINE_BUF_EVENT(xfs_buf_hold); +DEFINE_BUF_EVENT(xfs_buf_rele); +DEFINE_BUF_EVENT(xfs_buf_iodone); +DEFINE_BUF_EVENT(xfs_buf_iorequest); +DEFINE_BUF_EVENT(xfs_buf_bawrite); +DEFINE_BUF_EVENT(xfs_buf_bdwrite); +DEFINE_BUF_EVENT(xfs_buf_lock); +DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_cond_lock); +DEFINE_BUF_EVENT(xfs_buf_unlock); +DEFINE_BUF_EVENT(xfs_buf_iowait); +DEFINE_BUF_EVENT(xfs_buf_iowait_done); +DEFINE_BUF_EVENT(xfs_buf_delwri_queue); +DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); +DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_get_uncached); +DEFINE_BUF_EVENT(xfs_bdstrat_shut); +DEFINE_BUF_EVENT(xfs_buf_item_relse); +DEFINE_BUF_EVENT(xfs_buf_item_iodone); +DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); 
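The TP_printk() strings above decode flag words with __print_flags(), which turns a bitmask into a "FLAG1|FLAG2" string from a { mask, "name" } table when the trace buffer is read. A user-space approximation of that decoding, with a made-up table standing in for XFS_BUF_FLAGS:

    #include <stdio.h>

    struct flag_name {
        unsigned int mask;
        const char *name;
    };

    /* Stand-in for a table like XFS_BUF_FLAGS: { mask, "NAME" } pairs. */
    static const struct flag_name demo_flags[] = {
        { 0x1, "READ" },
        { 0x2, "WRITE" },
        { 0x4, "ASYNC" },
        { 0 },
    };

    /* Roughly what __print_flags(value, "|", table) produces. */
    static void print_flags(unsigned int value, const char *delim)
    {
        int printed = 0;

        for (const struct flag_name *f = demo_flags; f->mask; f++) {
            if (value & f->mask) {
                printf("%s%s", printed ? delim : "", f->name);
                printed = 1;
            }
        }
        if (!printed)
            printf("none");
        printf("\n");
    }

    int main(void)
    {
        print_flags(0x5, "|");      /* prints READ|ASYNC */
        return 0;
    }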
+DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_trans_read_buf_io); +DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); + +/* not really buffer traces, but the buf provides useful information */ +DEFINE_BUF_EVENT(xfs_btree_corrupt); +DEFINE_BUF_EVENT(xfs_da_btree_corrupt); +DEFINE_BUF_EVENT(xfs_reset_dqcounts); +DEFINE_BUF_EVENT(xfs_inode_item_push); + +/* pass flags explicitly */ +DECLARE_EVENT_CLASS(xfs_buf_flags_class, + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), + TP_ARGS(bp, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->flags = flags; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_FLAGS_EVENT(name) \ +DEFINE_EVENT(xfs_buf_flags_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ + TP_ARGS(bp, flags, caller_ip)) +DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); + +TRACE_EVENT(xfs_buf_ioerror, + TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip), + TP_ARGS(bp, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(unsigned, flags) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->error = error; + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d error %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __entry->error, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_buf_item_class, + TP_PROTO(struct xfs_buf_log_item *bip), + TP_ARGS(bip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, buf_bno) + __field(size_t, buf_len) + __field(int, buf_hold) + __field(int, buf_pincount) + __field(int, buf_lockval) + __field(unsigned, buf_flags) + __field(unsigned, bli_recur) + __field(int, bli_refcount) + __field(unsigned, bli_flags) + __field(void *, li_desc) + __field(unsigned, li_flags) + ), + TP_fast_assign( + __entry->dev = bip->bli_buf->b_target->bt_dev; + __entry->bli_flags = bip->bli_flags; + __entry->bli_recur = bip->bli_recur; + 
__entry->bli_refcount = atomic_read(&bip->bli_refcount); + __entry->buf_bno = bip->bli_buf->b_bn; + __entry->buf_len = bip->bli_buf->b_buffer_length; + __entry->buf_flags = bip->bli_buf->b_flags; + __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); + __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); + __entry->li_desc = bip->bli_item.li_desc; + __entry->li_flags = bip->bli_item.li_flags; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s recur %d refcount %d bliflags %s " + "lidesc 0x%p liflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->buf_bno, + __entry->buf_len, + __entry->buf_hold, + __entry->buf_pincount, + __entry->buf_lockval, + __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), + __entry->bli_recur, + __entry->bli_refcount, + __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), + __entry->li_desc, + __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_BUF_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_buf_item_class, name, \ + TP_PROTO(struct xfs_buf_log_item *bip), \ + TP_ARGS(bip)) +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); +DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); + +DECLARE_EVENT_CLASS(xfs_lock_class, + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, + unsigned long caller_ip), + TP_ARGS(ip, lock_flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, lock_flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lock_flags = lock_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_LOCK_EVENT(name) \ +DEFINE_EVENT(xfs_lock_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ + unsigned long caller_ip), \ + TP_ARGS(ip, lock_flags, caller_ip)) +DEFINE_LOCK_EVENT(xfs_ilock); +DEFINE_LOCK_EVENT(xfs_ilock_nowait); +DEFINE_LOCK_EVENT(xfs_ilock_demote); +DEFINE_LOCK_EVENT(xfs_iunlock); + +DECLARE_EVENT_CLASS(xfs_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino 
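The lock, buffer and iref classes all record a caller_ip so the trace shows who took the lock or reference rather than the wrapper that emitted the event; in the kernel the call sites pass _RET_IP_ for this. The same idea in plain C uses the compiler's return-address builtin; a small sketch:

    #include <stdio.h>

    /* Record who called us, in the spirit of passing _RET_IP_ to a tracepoint. */
    static void trace_lock(const char *what, unsigned long caller_ip)
    {
        printf("%s caller %#lx\n", what, caller_ip);
    }

    static void take_lock(void)
    {
        /* __builtin_return_address(0) is the address we will return to,
         * i.e. our caller, the analogue of _RET_IP_. */
        trace_lock("ilock", (unsigned long)__builtin_return_address(0));
    }

    int main(void)
    {
        take_lock();
        return 0;
    }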
= ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +) + +#define DEFINE_INODE_EVENT(name) \ +DEFINE_EVENT(xfs_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_INODE_EVENT(xfs_iget_skip); +DEFINE_INODE_EVENT(xfs_iget_reclaim); +DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_hit); +DEFINE_INODE_EVENT(xfs_iget_miss); + +DEFINE_INODE_EVENT(xfs_getattr); +DEFINE_INODE_EVENT(xfs_setattr); +DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_alloc_file_space); +DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_readdir); +#ifdef CONFIG_XFS_POSIX_ACL +DEFINE_INODE_EVENT(xfs_check_acl); +#endif +DEFINE_INODE_EVENT(xfs_vm_bmap); +DEFINE_INODE_EVENT(xfs_file_ioctl); +DEFINE_INODE_EVENT(xfs_file_compat_ioctl); +DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_file_fsync); +DEFINE_INODE_EVENT(xfs_destroy_inode); +DEFINE_INODE_EVENT(xfs_write_inode); +DEFINE_INODE_EVENT(xfs_evict_inode); + +DEFINE_INODE_EVENT(xfs_dquot_dqalloc); +DEFINE_INODE_EVENT(xfs_dquot_dqdetach); + +DECLARE_EVENT_CLASS(xfs_iref_class, + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), + TP_ARGS(ip, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, count) + __field(int, pincount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->pincount = atomic_read(&ip->i_pincount); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->count, + __entry->pincount, + (char *)__entry->caller_ip) +) + +#define DEFINE_IREF_EVENT(name) \ +DEFINE_EVENT(xfs_iref_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ + TP_ARGS(ip, caller_ip)) +DEFINE_IREF_EVENT(xfs_ihold); +DEFINE_IREF_EVENT(xfs_irele); +DEFINE_IREF_EVENT(xfs_inode_pin); +DEFINE_IREF_EVENT(xfs_inode_unpin); +DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); + +DECLARE_EVENT_CLASS(xfs_namespace_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_ARGS(dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(dp)->i_sb->s_dev; + __entry->dp_ino = dp->i_ino; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dp ino 0x%llx name %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __get_str(name)) +) + +#define DEFINE_NAMESPACE_EVENT(name) \ +DEFINE_EVENT(xfs_namespace_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_ARGS(dp, name)) +DEFINE_NAMESPACE_EVENT(xfs_remove); +DEFINE_NAMESPACE_EVENT(xfs_link); +DEFINE_NAMESPACE_EVENT(xfs_lookup); +DEFINE_NAMESPACE_EVENT(xfs_create); +DEFINE_NAMESPACE_EVENT(xfs_symlink); + +TRACE_EVENT(xfs_rename, + TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp, + struct xfs_name *src_name, struct xfs_name *target_name), + TP_ARGS(src_dp, target_dp, src_name, target_name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_dp_ino) + __field(xfs_ino_t, target_dp_ino) + __dynamic_array(char, src_name, src_name->len) + __dynamic_array(char, target_name, target_name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(src_dp)->i_sb->s_dev; + __entry->src_dp_ino = 
src_dp->i_ino; + __entry->target_dp_ino = target_dp->i_ino; + memcpy(__get_str(src_name), src_name->name, src_name->len); + memcpy(__get_str(target_name), target_name->name, target_name->len); + ), + TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" + " src name %s target name %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_dp_ino, + __entry->target_dp_ino, + __get_str(src_name), + __get_str(target_name)) +) + +DECLARE_EVENT_CLASS(xfs_dquot_class, + TP_PROTO(struct xfs_dquot *dqp), + TP_ARGS(dqp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, id) + __field(unsigned, flags) + __field(unsigned, nrefs) + __field(unsigned long long, res_bcount) + __field(unsigned long long, bcount) + __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) + __field(unsigned long long, blk_softlimit) + __field(unsigned long long, ino_hardlimit) + __field(unsigned long long, ino_softlimit) + ), \ + TP_fast_assign( + __entry->dev = dqp->q_mount->m_super->s_dev; + __entry->id = be32_to_cpu(dqp->q_core.d_id); + __entry->flags = dqp->dq_flags; + __entry->nrefs = dqp->q_nrefs; + __entry->res_bcount = dqp->q_res_bcount; + __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); + __entry->icount = be64_to_cpu(dqp->q_core.d_icount); + __entry->blk_hardlimit = + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + __entry->blk_softlimit = + be64_to_cpu(dqp->q_core.d_blk_softlimit); + __entry->ino_hardlimit = + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + __entry->ino_softlimit = + be64_to_cpu(dqp->q_core.d_ino_softlimit); + ), + TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->id, + __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __entry->nrefs, + __entry->res_bcount, + __entry->bcount, + __entry->blk_hardlimit, + __entry->blk_softlimit, + __entry->icount, + __entry->ino_hardlimit, + __entry->ino_softlimit) +) + +#define DEFINE_DQUOT_EVENT(name) \ +DEFINE_EVENT(xfs_dquot_class, name, \ + TP_PROTO(struct xfs_dquot *dqp), \ + TP_ARGS(dqp)) +DEFINE_DQUOT_EVENT(xfs_dqadjust); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); +DEFINE_DQUOT_EVENT(xfs_dqattach_found); +DEFINE_DQUOT_EVENT(xfs_dqattach_get); +DEFINE_DQUOT_EVENT(xfs_dqinit); +DEFINE_DQUOT_EVENT(xfs_dqreuse); +DEFINE_DQUOT_EVENT(xfs_dqalloc); +DEFINE_DQUOT_EVENT(xfs_dqtobp_read); +DEFINE_DQUOT_EVENT(xfs_dqread); +DEFINE_DQUOT_EVENT(xfs_dqread_fail); +DEFINE_DQUOT_EVENT(xfs_dqlookup_found); +DEFINE_DQUOT_EVENT(xfs_dqlookup_want); +DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); +DEFINE_DQUOT_EVENT(xfs_dqlookup_done); +DEFINE_DQUOT_EVENT(xfs_dqget_hit); +DEFINE_DQUOT_EVENT(xfs_dqget_miss); +DEFINE_DQUOT_EVENT(xfs_dqput); +DEFINE_DQUOT_EVENT(xfs_dqput_wait); +DEFINE_DQUOT_EVENT(xfs_dqput_free); +DEFINE_DQUOT_EVENT(xfs_dqrele); +DEFINE_DQUOT_EVENT(xfs_dqflush); +DEFINE_DQUOT_EVENT(xfs_dqflush_force); +DEFINE_DQUOT_EVENT(xfs_dqflush_done); + +DECLARE_EVENT_CLASS(xfs_loggrant_class, + TP_PROTO(struct log *log, struct xlog_ticket *tic), + TP_ARGS(log, tic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned, trans_type) + __field(char, ocnt) + __field(char, cnt) + __field(int, curr_res) + __field(int, unit_res) + __field(unsigned int, flags) + __field(int, reserveq) + __field(int, writeq) + __field(int, grant_reserve_cycle) + __field(int, grant_reserve_bytes) + 
__field(int, grant_write_cycle) + __field(int, grant_write_bytes) + __field(int, curr_cycle) + __field(int, curr_block) + __field(xfs_lsn_t, tail_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->trans_type = tic->t_trans_type; + __entry->ocnt = tic->t_ocnt; + __entry->cnt = tic->t_cnt; + __entry->curr_res = tic->t_curr_res; + __entry->unit_res = tic->t_unit_res; + __entry->flags = tic->t_flags; + __entry->reserveq = list_empty(&log->l_reserveq); + __entry->writeq = list_empty(&log->l_writeq); + xlog_crack_grant_head(&log->l_grant_reserve_head, + &__entry->grant_reserve_cycle, + &__entry->grant_reserve_bytes); + xlog_crack_grant_head(&log->l_grant_write_head, + &__entry->grant_write_cycle, + &__entry->grant_write_bytes); + __entry->curr_cycle = log->l_curr_cycle; + __entry->curr_block = log->l_curr_block; + __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); + ), + TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + "t_unit_res %u t_flags %s reserveq %s " + "writeq %s grant_reserve_cycle %d " + "grant_reserve_bytes %d grant_write_cycle %d " + "grant_write_bytes %d curr_cycle %d curr_block %d " + "tail_cycle %d tail_block %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), + __entry->ocnt, + __entry->cnt, + __entry->curr_res, + __entry->unit_res, + __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), + __entry->reserveq ? "empty" : "active", + __entry->writeq ? "empty" : "active", + __entry->grant_reserve_cycle, + __entry->grant_reserve_bytes, + __entry->grant_write_cycle, + __entry->grant_write_bytes, + __entry->curr_cycle, + __entry->curr_block, + CYCLE_LSN(__entry->tail_lsn), + BLOCK_LSN(__entry->tail_lsn) + ) +) + +#define DEFINE_LOGGRANT_EVENT(name) \ +DEFINE_EVENT(xfs_loggrant_class, name, \ + TP_PROTO(struct log *log, struct xlog_ticket *tic), \ + TP_ARGS(log, tic)) +DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); +DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve); +DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); + +DECLARE_EVENT_CLASS(xfs_file_class, + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), + TP_ARGS(ip, count, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + TP_fast_assign( + 
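The log grant class above pulls the reserve and write grant heads apart with xlog_crack_grant_head(), which unpacks a single 64-bit word into a cycle number and a byte count so both move together under one atomic update. A sketch of that pack/unpack idea; the 32/32 split shown here is an assumption for illustration, not necessarily the log code's exact layout:

    #include <stdio.h>
    #include <stdint.h>

    /* Pack a cycle number and a byte offset into one 64-bit word so both
     * can be read or swapped atomically; split them back out for display. */
    static int64_t pack_grant_head(int cycle, int bytes)
    {
        return ((int64_t)cycle << 32) | (uint32_t)bytes;
    }

    static void crack_grant_head(int64_t head, int *cycle, int *bytes)
    {
        *cycle = (int)(head >> 32);
        *bytes = (int)(head & 0xffffffff);
    }

    int main(void)
    {
        int cycle, bytes;
        int64_t head = pack_grant_head(7, 4096);

        crack_grant_head(head, &cycle, &bytes);
        printf("cycle %d bytes %d\n", cycle, bytes);
        return 0;
    }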
__entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count 0x%zx ioflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count, + __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) +) + +#define DEFINE_RW_EVENT(name) \ +DEFINE_EVENT(xfs_file_class, name, \ + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ + TP_ARGS(ip, count, offset, flags)) +DEFINE_RW_EVENT(xfs_file_read); +DEFINE_RW_EVENT(xfs_file_buffered_write); +DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_splice_read); +DEFINE_RW_EVENT(xfs_file_splice_write); + +DECLARE_EVENT_CLASS(xfs_page_class, + TP_PROTO(struct inode *inode, struct page *page, unsigned long off), + TP_ARGS(inode, page, off), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(pgoff_t, pgoff) + __field(loff_t, size) + __field(unsigned long, offset) + __field(int, delalloc) + __field(int, unwritten) + ), + TP_fast_assign( + int delalloc = -1, unwritten = -1; + + if (page_has_buffers(page)) + xfs_count_page_state(page, &delalloc, &unwritten); + __entry->dev = inode->i_sb->s_dev; + __entry->ino = XFS_I(inode)->i_ino; + __entry->pgoff = page_offset(page); + __entry->size = i_size_read(inode); + __entry->offset = off; + __entry->delalloc = delalloc; + __entry->unwritten = unwritten; + ), + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + "delalloc %d unwritten %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->size, + __entry->offset, + __entry->delalloc, + __entry->unwritten) +) + +#define DEFINE_PAGE_EVENT(name) \ +DEFINE_EVENT(xfs_page_class, name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ + TP_ARGS(inode, page, off)) +DEFINE_PAGE_EVENT(xfs_writepage); +DEFINE_PAGE_EVENT(xfs_releasepage); +DEFINE_PAGE_EVENT(xfs_invalidatepage); + +DECLARE_EVENT_CLASS(xfs_imap_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int type, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, type, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, type) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + __entry->type = type; + __entry->startoff = irec ? irec->br_startoff : 0; + __entry->startblock = irec ? irec->br_startblock : 0; + __entry->blockcount = irec ? 
irec->br_blockcount : 0; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd type %s " + "startoff 0x%llx startblock %lld blockcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count, + __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount) +) + +#define DEFINE_IOMAP_EVENT(name) \ +DEFINE_EVENT(xfs_imap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ + int type, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, type, irec)) +DEFINE_IOMAP_EVENT(xfs_map_blocks_found); +DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_get_blocks_found); +DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); + +DECLARE_EVENT_CLASS(xfs_simple_io_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count) +); + +#define DEFINE_SIMPLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_simple_io_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_ARGS(ip, offset, count)) +DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); +DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); +DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); + + +TRACE_EVENT(xfs_itruncate_start, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size, int flag, + xfs_off_t toss_start, xfs_off_t toss_finish), + TP_ARGS(ip, new_size, flag, toss_start, toss_finish), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + __field(xfs_off_t, toss_start) + __field(xfs_off_t, toss_finish) + __field(int, flag) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + __entry->toss_start = toss_start; + __entry->toss_finish = toss_finish; + __entry->flag = flag; + ), + TP_printk("dev %d:%d ino 0x%llx %s size 0x%llx new_size 0x%llx " + "toss start 0x%llx toss finish 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->flag, "|", XFS_ITRUNC_FLAGS), + __entry->size, + __entry->new_size, + __entry->toss_start, + __entry->toss_finish) +); + +DECLARE_EVENT_CLASS(xfs_itrunc_class, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), + TP_ARGS(ip, new_size), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size) +) + +#define 
DEFINE_ITRUNC_EVENT(name) \ +DEFINE_EVENT(xfs_itrunc_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ + TP_ARGS(ip, new_size)) +DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end); + +TRACE_EVENT(xfs_pagecache_inval, + TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), + TP_ARGS(ip, start, finish), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_off_t, start) + __field(xfs_off_t, finish) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->start = start; + __entry->finish = finish; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->start, + __entry->finish) +); + +TRACE_EVENT(xfs_bunmap, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, + int flags, unsigned long caller_ip), + TP_ARGS(ip, bno, len, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fileoff_t, bno) + __field(xfs_filblks_t, len) + __field(unsigned long, caller_ip) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->bno = bno; + __entry->len = len; + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" + "flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->bno, + __entry->len, + __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS), + (void *)__entry->caller_ip) + +); + +DECLARE_EVENT_CLASS(xfs_busy_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_BUSY_EVENT(name) \ +DEFINE_EVENT(xfs_busy_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_BUSY_EVENT(xfs_alloc_busy); +DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); +DEFINE_BUSY_EVENT(xfs_alloc_busy_force); +DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); +DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); + +TRACE_EVENT(xfs_alloc_busy_trim, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t tbno, xfs_extlen_t tlen), + TP_ARGS(mp, agno, agbno, len, tbno, tlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(xfs_agblock_t, tbno) + __field(xfs_extlen_t, tlen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->tbno = tbno; + __entry->tlen = tlen; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + 
__entry->agno, + __entry->agbno, + __entry->len, + __entry->tbno, + __entry->tlen) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->lsn) +); + +TRACE_EVENT(xfs_agf, + TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, + unsigned long caller_ip), + TP_ARGS(mp, agf, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, flags) + __field(__u32, length) + __field(__u32, bno_root) + __field(__u32, cnt_root) + __field(__u32, bno_level) + __field(__u32, cnt_level) + __field(__u32, flfirst) + __field(__u32, fllast) + __field(__u32, flcount) + __field(__u32, freeblks) + __field(__u32, longest) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = be32_to_cpu(agf->agf_seqno), + __entry->flags = flags; + __entry->length = be32_to_cpu(agf->agf_length), + __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), + __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), + __entry->bno_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), + __entry->cnt_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), + __entry->flfirst = be32_to_cpu(agf->agf_flfirst), + __entry->fllast = be32_to_cpu(agf->agf_fllast), + __entry->flcount = be32_to_cpu(agf->agf_flcount), + __entry->freeblks = be32_to_cpu(agf->agf_freeblks), + __entry->longest = be32_to_cpu(agf->agf_longest); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " + "levels b %u c %u flfirst %u fllast %u flcount %u " + "freeblks %u longest %u caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), + __entry->length, + __entry->bno_root, + __entry->cnt_root, + __entry->bno_level, + __entry->cnt_level, + __entry->flfirst, + __entry->fllast, + __entry->flcount, + __entry->freeblks, + __entry->longest, + (void *)__entry->caller_ip) +); + +TRACE_EVENT(xfs_free_extent, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_extlen_t len, bool isfl, int haveleft, int haveright), + TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int, isfl) + __field(int, haveleft) + __field(int, haveright) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->isfl = isfl; + __entry->haveleft = haveleft; + __entry->haveright = haveright; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->isfl, + __entry->haveleft ? + (__entry->haveright ? "both" : "left") : + (__entry->haveright ? 
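The xfs_agf event converts every field it samples with be32_to_cpu() because XFS metadata is stored big-endian on disk regardless of host byte order. A portable user-space equivalent of that conversion:

    #include <stdio.h>
    #include <stdint.h>

    /* XFS metadata is big-endian on disk; be32_to_cpu() turns an on-disk
     * 32-bit field into host order.  A portable user-space equivalent: */
    static uint32_t demo_be32_to_cpu(const unsigned char b[4])
    {
        return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
               ((uint32_t)b[2] << 8)  |  (uint32_t)b[3];
    }

    int main(void)
    {
        /* 0x00000010 stored big-endian, e.g. an agf_length of 16 blocks */
        unsigned char disk[4] = { 0x00, 0x00, 0x00, 0x10 };

        printf("host value: %u\n", demo_be32_to_cpu(disk));
        return 0;
    }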
"right" : "none")) + +); + +DECLARE_EVENT_CLASS(xfs_alloc_class, + TP_PROTO(struct xfs_alloc_arg *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, minlen) + __field(xfs_extlen_t, maxlen) + __field(xfs_extlen_t, mod) + __field(xfs_extlen_t, prod) + __field(xfs_extlen_t, minleft) + __field(xfs_extlen_t, total) + __field(xfs_extlen_t, alignment) + __field(xfs_extlen_t, minalignslop) + __field(xfs_extlen_t, len) + __field(short, type) + __field(short, otype) + __field(char, wasdel) + __field(char, wasfromfl) + __field(char, isfl) + __field(char, userdata) + __field(xfs_fsblock_t, firstblock) + ), + TP_fast_assign( + __entry->dev = args->mp->m_super->s_dev; + __entry->agno = args->agno; + __entry->agbno = args->agbno; + __entry->minlen = args->minlen; + __entry->maxlen = args->maxlen; + __entry->mod = args->mod; + __entry->prod = args->prod; + __entry->minleft = args->minleft; + __entry->total = args->total; + __entry->alignment = args->alignment; + __entry->minalignslop = args->minalignslop; + __entry->len = args->len; + __entry->type = args->type; + __entry->otype = args->otype; + __entry->wasdel = args->wasdel; + __entry->wasfromfl = args->wasfromfl; + __entry->isfl = args->isfl; + __entry->userdata = args->userdata; + __entry->firstblock = args->firstblock; + ), + TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + "prod %u minleft %u total %u alignment %u minalignslop %u " + "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " + "userdata %d firstblock 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->minlen, + __entry->maxlen, + __entry->mod, + __entry->prod, + __entry->minleft, + __entry->total, + __entry->alignment, + __entry->minalignslop, + __entry->len, + __print_symbolic(__entry->type, XFS_ALLOC_TYPES), + __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), + __entry->wasdel, + __entry->wasfromfl, + __entry->isfl, + __entry->userdata, + (unsigned long long)__entry->firstblock) +) + +#define DEFINE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_alloc_class, name, \ + TP_PROTO(struct xfs_alloc_arg *args), \ + TP_ARGS(args)) +DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_near_first); +DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); +DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); +DEFINE_ALLOC_EVENT(xfs_alloc_near_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); +DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_size_done); +DEFINE_ALLOC_EVENT(xfs_alloc_size_error); +DEFINE_ALLOC_EVENT(xfs_alloc_size_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); +DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); +DEFINE_ALLOC_EVENT(xfs_alloc_small_done); +DEFINE_ALLOC_EVENT(xfs_alloc_small_error); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); + +DECLARE_EVENT_CLASS(xfs_dir2_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, 
args->namelen) + __field(int, namelen) + __field(xfs_dahash_t, hashval) + __field(xfs_ino_t, inumber) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->hashval = args->hashval; + __entry->inumber = args->inumber; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " + "inumber 0x%llx op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->hashval, + __entry->inumber, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_DIR2_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_sf_create); +DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_sf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_sf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8); +DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_block_addname); +DEFINE_DIR2_EVENT(xfs_dir2_block_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_block_replace); +DEFINE_DIR2_EVENT(xfs_dir2_block_removename); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node); +DEFINE_DIR2_EVENT(xfs_dir2_node_addname); +DEFINE_DIR2_EVENT(xfs_dir2_node_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_node_replace); +DEFINE_DIR2_EVENT(xfs_dir2_node_removename); +DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); + +DECLARE_EVENT_CLASS(xfs_dir2_space_class, + TP_PROTO(struct xfs_da_args *args, int idx), + TP_ARGS(args, idx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, idx) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->idx = idx; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->idx) +) + +#define DEFINE_DIR2_SPACE_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_space_class, name, \ + TP_PROTO(struct xfs_da_args *args, int idx), \ + TP_ARGS(args, idx)) +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode); + +TRACE_EVENT(xfs_dir2_leafn_moveents, + TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), + TP_ARGS(args, src_idx, dst_idx, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, src_idx) + __field(int, dst_idx) + __field(int, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->src_idx = src_idx; + __entry->dst_idx = dst_idx; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s " + 
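The namespace and dir2 classes copy the directory entry name into a __dynamic_array() without a terminating NUL, which is why the dir2 format string bounds it with "%.*s" and the recorded namelen. The same bounded print in ordinary C:

    #include <stdio.h>

    int main(void)
    {
        /* XFS directory names travel as (pointer, length) pairs with no
         * terminating NUL, so a bounded "%.*s" is used to print them. */
        const char raw[] = { 'l', 'o', 's', 't', '+', 'f', 'o', 'u', 'n', 'd' };
        int namelen = (int)sizeof(raw);

        printf("name %.*s namelen %d\n", namelen, raw, namelen);
        return 0;
    }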
"src_idx %d dst_idx %d count %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->src_idx, + __entry->dst_idx, + __entry->count) +); + +#define XFS_SWAPEXT_INODES \ + { 0, "target" }, \ + { 1, "temp" } + +#define XFS_INODE_FORMAT_STR \ + { 0, "invalid" }, \ + { 1, "local" }, \ + { 2, "extent" }, \ + { 3, "btree" } + +DECLARE_EVENT_CLASS(xfs_swap_extent_class, + TP_PROTO(struct xfs_inode *ip, int which), + TP_ARGS(ip, which), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, which) + __field(xfs_ino_t, ino) + __field(int, format) + __field(int, nex) + __field(int, max_nex) + __field(int, broot_size) + __field(int, fork_off) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->which = which; + __entry->ino = ip->i_ino; + __entry->format = ip->i_d.di_format; + __entry->nex = ip->i_d.di_nextents; + __entry->max_nex = ip->i_df.if_ext_max; + __entry->broot_size = ip->i_df.if_broot_bytes; + __entry->fork_off = XFS_IFORK_BOFF(ip); + ), + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + "Max in-fork extents %d, broot size %d, fork offset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), + __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), + __entry->nex, + __entry->max_nex, + __entry->broot_size, + __entry->fork_off) +) + +#define DEFINE_SWAPEXT_EVENT(name) \ +DEFINE_EVENT(xfs_swap_extent_class, name, \ + TP_PROTO(struct xfs_inode *ip, int which), \ + TP_ARGS(ip, which)) + +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); + +DECLARE_EVENT_CLASS(xfs_log_recover_item_class, + TP_PROTO(struct log *log, struct xlog_recover *trans, + struct xlog_recover_item *item, int pass), + TP_ARGS(log, trans, item, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, item) + __field(xlog_tid_t, tid) + __field(int, type) + __field(int, pass) + __field(int, count) + __field(int, total) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->item = (unsigned long)item; + __entry->tid = trans->r_log_tid; + __entry->type = ITEM_TYPE(item); + __entry->pass = pass; + __entry->count = item->ri_cnt; + __entry->total = item->ri_total; + ), + TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " + "item region count/total %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->pass, + (void *)__entry->item, + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __entry->count, + __entry->total) +) + +#define DEFINE_LOG_RECOVER_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_item_class, name, \ + TP_PROTO(struct log *log, struct xlog_recover *trans, \ + struct xlog_recover_item *item, int pass), \ + TP_ARGS(log, trans, item, pass)) + +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); + +DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), + TP_ARGS(log, buf_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__int64_t, blkno) + __field(unsigned short, len) + __field(unsigned short, flags) + __field(unsigned short, size) + __field(unsigned int, map_size) + ), + TP_fast_assign( + __entry->dev = 
log->l_mp->m_super->s_dev; + __entry->blkno = buf_f->blf_blkno; + __entry->len = buf_f->blf_len; + __entry->flags = buf_f->blf_flags; + __entry->size = buf_f->blf_size; + __entry->map_size = buf_f->blf_map_size; + ), + TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + "map_size %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blkno, + __entry->len, + __entry->flags, + __entry->size, + __entry->map_size) +) + +#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \ + TP_ARGS(log, buf_f)) + +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); + +DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, size) + __field(int, fields) + __field(unsigned short, asize) + __field(unsigned short, dsize) + __field(__int64_t, blkno) + __field(int, len) + __field(int, boffset) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->ino = in_f->ilf_ino; + __entry->size = in_f->ilf_size; + __entry->fields = in_f->ilf_fields; + __entry->asize = in_f->ilf_asize; + __entry->dsize = in_f->ilf_dsize; + __entry->blkno = in_f->ilf_blkno; + __entry->len = in_f->ilf_len; + __entry->boffset = in_f->ilf_boffset; + ), + TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " + "dsize %d, blkno 0x%llx, len %d, boffset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->fields, + __entry->asize, + __entry->dsize, + __entry->blkno, + __entry->len, + __entry->boffset) +) +#define DEFINE_LOG_RECOVER_INO_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); + +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + +#endif /* _TRACE_XFS_H 
*/ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE xfs_trace +#include <trace/define_trace.h> diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h new file mode 100644 index 00000000..7c220b42 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_VNODE_H__ +#define __XFS_VNODE_H__ + +#include "xfs_fs.h" + +struct file; +struct xfs_inode; +struct xfs_iomap; +struct attrlist_cursor_kern; + +/* + * Return values for xfs_inactive. A return value of + * VN_INACTIVE_NOCACHE implies that the file system behavior + * has disassociated its state and bhv_desc_t from the vnode. + */ +#define VN_INACTIVE_CACHE 0 +#define VN_INACTIVE_NOCACHE 1 + +/* + * Flags for read/write calls - same values as IRIX + */ +#define IO_ISDIRECT 0x00004 /* bypass page cache */ +#define IO_INVIS 0x00020 /* don't update inode timestamps */ + +#define XFS_IO_FLAGS \ + { IO_ISDIRECT, "DIRECT" }, \ + { IO_INVIS, "INVIS"} + +/* + * Flush/Invalidate options for vop_toss/flush/flushinval_pages. + */ +#define FI_NONE 0 /* none */ +#define FI_REMAPF 1 /* Do a remapf prior to the operation */ +#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. + Prevent VM access to the pages until + the operation completes. */ + +/* + * Some useful predicates. + */ +#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) +#define VN_CACHED(vp) (vp->i_mapping->nrpages) +#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \ + PAGECACHE_TAG_DIRTY) + + +#endif /* __XFS_VNODE_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c new file mode 100644 index 00000000..87d3e038 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -0,0 +1,241 @@ +/* + * Copyright (C) 2008 Christoph Hellwig. + * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_acl.h" +#include "xfs_vnodeops.h" + +#include +#include + + +static int +xfs_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int xflags) +{ + struct xfs_inode *ip = XFS_I(dentry->d_inode); + int error, asize = size; + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (!size) { + xflags |= ATTR_KERNOVAL; + value = NULL; + } + + error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); + if (error) + return error; + return asize; +} + +static int +xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int xflags) +{ + struct xfs_inode *ip = XFS_I(dentry->d_inode); + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (flags & XATTR_CREATE) + xflags |= ATTR_CREATE; + if (flags & XATTR_REPLACE) + xflags |= ATTR_REPLACE; + + if (!value) + return -xfs_attr_remove(ip, (unsigned char *)name, xflags); + return -xfs_attr_set(ip, (unsigned char *)name, + (void *)value, size, xflags); +} + +static const struct xattr_handler xfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = 0, /* no flags implies user namespace */ + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +static const struct xattr_handler xfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = ATTR_ROOT, + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +static const struct xattr_handler xfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = ATTR_SECURE, + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +const struct xattr_handler *xfs_xattr_handlers[] = { + &xfs_xattr_user_handler, + &xfs_xattr_trusted_handler, + &xfs_xattr_security_handler, +#ifdef CONFIG_XFS_POSIX_ACL + &xfs_xattr_acl_access_handler, + &xfs_xattr_acl_default_handler, +#endif + NULL +}; + +static unsigned int xfs_xattr_prefix_len(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return sizeof("security"); + else if (flags & XFS_ATTR_ROOT) + return sizeof("trusted"); + else + return sizeof("user"); +} + +static const char *xfs_xattr_prefix(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return xfs_xattr_security_handler.prefix; + else if (flags & XFS_ATTR_ROOT) + return xfs_xattr_trusted_handler.prefix; + else + return xfs_xattr_user_handler.prefix; +} + +static int +xfs_xattr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + unsigned int prefix_len = xfs_xattr_prefix_len(flags); + char *offset; + int arraytop; + + ASSERT(context->count >= 0); + + /* + * Only show root namespace entries if we are actually allowed to + * see them. 
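+ * (XFS_ATTR_ROOT maps to the trusted.* prefix in xfs_xattr_prefix() above, so this check effectively hides trusted.* entries from callers without CAP_SYS_ADMIN.)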
+ */ + if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) + return 0; + + arraytop = context->count + prefix_len + namelen + 1; + if (arraytop > context->firstu) { + context->count = -1; /* insufficient space */ + return 1; + } + offset = (char *)context->alist + context->count; + strncpy(offset, xfs_xattr_prefix(flags), prefix_len); + offset += prefix_len; + strncpy(offset, (char *)name, namelen); /* real name */ + offset += namelen; + *offset = '\0'; + context->count += prefix_len + namelen + 1; + return 0; +} + +static int +xfs_xattr_put_listent_sizes( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + context->count += xfs_xattr_prefix_len(flags) + namelen + 1; + return 0; +} + +static int +list_one_attr(const char *name, const size_t len, void *data, + size_t size, ssize_t *result) +{ + char *p = data + *result; + + *result += len; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct xfs_attr_list_context context; + struct attrlist_cursor_kern cursor = { 0 }; + struct inode *inode = dentry->d_inode; + int error; + + /* + * First read the regular on-disk attributes. + */ + memset(&context, 0, sizeof(context)); + context.dp = XFS_I(inode); + context.cursor = &cursor; + context.resynch = 1; + context.alist = data; + context.bufsize = size; + context.firstu = context.bufsize; + + if (size) + context.put_listent = xfs_xattr_put_listent; + else + context.put_listent = xfs_xattr_put_listent_sizes; + + xfs_attr_list_int(&context); + if (context.count < 0) + return -ERANGE; + + /* + * Then add the two synthetic ACL attributes. + */ + if (posix_acl_access_exists(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS) + 1, + data, size, &context.count); + if (error) + return error; + } + + if (posix_acl_default_exists(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT) + 1, + data, size, &context.count); + if (error) + return error; + } + + return context.count; +} diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c new file mode 100644 index 00000000..6fa21460 --- /dev/null +++ b/fs/xfs/quota/xfs_dquot.c @@ -0,0 +1,1496 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_trace.h" + + +/* + LOCK ORDER + + inode lock (ilock) + dquot hash-chain lock (hashlock) + xqm dquot freelist lock (freelistlock + mount's dquot list lock (mplistlock) + user dquot lock - lock ordering among dquots is based on the uid or gid + group dquot lock - similar to udquots. Between the two dquots, the udquot + has to be locked first. + pin lock - the dquot lock must be held to take this lock. + flush lock - ditto. +*/ + +#ifdef DEBUG +xfs_buftarg_t *xfs_dqerror_target; +int xfs_do_dqerror; +int xfs_dqreq_num; +int xfs_dqerror_mod = 33; +#endif + +static struct lock_class_key xfs_dquot_other_class; + +/* + * Allocate and initialize a dquot. We don't always allocate fresh memory; + * we try to reclaim a free dquot if the number of incore dquots are above + * a threshold. + * The only field inside the core that gets initialized at this point + * is the d_id field. The idea is to fill in the entire q_core + * when we read in the on disk dquot. + */ +STATIC xfs_dquot_t * +xfs_qm_dqinit( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type) +{ + xfs_dquot_t *dqp; + boolean_t brandnewdquot; + + brandnewdquot = xfs_qm_dqalloc_incore(&dqp); + dqp->dq_flags = type; + dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_mount = mp; + + /* + * No need to re-initialize these if this is a reclaimed dquot. + */ + if (brandnewdquot) { + INIT_LIST_HEAD(&dqp->q_freelist); + mutex_init(&dqp->q_qlock); + init_waitqueue_head(&dqp->q_pinwait); + + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + + trace_xfs_dqinit(dqp); + } else { + /* + * Only the q_core portion was zeroed in dqreclaim_one(). + * So, we need to reset others. + */ + dqp->q_nrefs = 0; + dqp->q_blkno = 0; + INIT_LIST_HEAD(&dqp->q_mplist); + INIT_LIST_HEAD(&dqp->q_hashlist); + dqp->q_bufoffset = 0; + dqp->q_fileoffset = 0; + dqp->q_transp = NULL; + dqp->q_gdquot = NULL; + dqp->q_res_bcount = 0; + dqp->q_res_icount = 0; + dqp->q_res_rtbcount = 0; + atomic_set(&dqp->q_pincount, 0); + dqp->q_hash = NULL; + ASSERT(list_empty(&dqp->q_freelist)); + + trace_xfs_dqreuse(dqp); + } + + /* + * In either case we need to make sure group quotas have a different + * lock class than user quotas, to make sure lockdep knows we can + * locks of one of each at the same time. 
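+ * (xfs_qm_dqput(), for example, takes the group dquot hint's lock while the user dquot is still locked; with a single class lockdep would see that as nested locking of the same lock class.)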
+ */ + if (!(type & XFS_DQ_USER)) + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); + + /* + * log item gets initialized later + */ + return (dqp); +} + +/* + * This is called to free all the memory associated with a dquot + */ +void +xfs_qm_dqdestroy( + xfs_dquot_t *dqp) +{ + ASSERT(list_empty(&dqp->q_freelist)); + + mutex_destroy(&dqp->q_qlock); + kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); + + atomic_dec(&xfs_Gqm->qm_totaldquots); +} + +/* + * This is what a 'fresh' dquot inside a dquot chunk looks like on disk. + */ +STATIC void +xfs_qm_dqinit_core( + xfs_dqid_t id, + uint type, + xfs_dqblk_t *d) +{ + /* + * Caller has zero'd the entire dquot 'chunk' already. + */ + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_id = cpu_to_be32(id); + d->dd_diskdq.d_flags = type; +} + +/* + * If default limits are in force, push them into the dquot now. + * We overwrite the dquot limits only if they are zero and this + * is not the root dquot. + */ +void +xfs_qm_adjust_dqlimits( + xfs_mount_t *mp, + xfs_disk_dquot_t *d) +{ + xfs_quotainfo_t *q = mp->m_quotainfo; + + ASSERT(d->d_id); + + if (q->qi_bsoftlimit && !d->d_blk_softlimit) + d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); + if (q->qi_bhardlimit && !d->d_blk_hardlimit) + d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + if (q->qi_isoftlimit && !d->d_ino_softlimit) + d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); + if (q->qi_ihardlimit && !d->d_ino_hardlimit) + d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); + if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) + d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); + if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) + d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); +} + +/* + * Check the limits and timers of a dquot and start or reset timers + * if necessary. + * This gets called even when quota enforcement is OFF, which makes our + * life a little less complicated. (We just don't reject any quota + * reservations in that case, when enforcement is off). + * We also return 0 as the values of the timers in Q_GETQUOTA calls, when + * enforcement's off. + * In contrast, warnings are a little different in that they don't + * 'automatically' get started when limits get exceeded. They do + * get reset to zero, however, when we find the count to be under + * the soft limit (they are only ever set non-zero via userspace). 
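+ * In short: a timer is armed (now plus the per-mount time limit) the first time the corresponding count crosses a soft or hard limit, and is cleared again once the count drops back under both limits.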
+ */ +void +xfs_qm_adjust_dqtimers( + xfs_mount_t *mp, + xfs_disk_dquot_t *d) +{ + ASSERT(d->d_id); + +#ifdef QUOTADEBUG + if (d->d_blk_hardlimit) + ASSERT(be64_to_cpu(d->d_blk_softlimit) <= + be64_to_cpu(d->d_blk_hardlimit)); + if (d->d_ino_hardlimit) + ASSERT(be64_to_cpu(d->d_ino_softlimit) <= + be64_to_cpu(d->d_ino_hardlimit)); + if (d->d_rtb_hardlimit) + ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= + be64_to_cpu(d->d_rtb_hardlimit)); +#endif + if (!d->d_btimer) { + if ((d->d_blk_softlimit && + (be64_to_cpu(d->d_bcount) >= + be64_to_cpu(d->d_blk_softlimit))) || + (d->d_blk_hardlimit && + (be64_to_cpu(d->d_bcount) >= + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_btimelimit); + } else { + d->d_bwarns = 0; + } + } else { + if ((!d->d_blk_softlimit || + (be64_to_cpu(d->d_bcount) < + be64_to_cpu(d->d_blk_softlimit))) && + (!d->d_blk_hardlimit || + (be64_to_cpu(d->d_bcount) < + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = 0; + } + } + + if (!d->d_itimer) { + if ((d->d_ino_softlimit && + (be64_to_cpu(d->d_icount) >= + be64_to_cpu(d->d_ino_softlimit))) || + (d->d_ino_hardlimit && + (be64_to_cpu(d->d_icount) >= + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_itimelimit); + } else { + d->d_iwarns = 0; + } + } else { + if ((!d->d_ino_softlimit || + (be64_to_cpu(d->d_icount) < + be64_to_cpu(d->d_ino_softlimit))) && + (!d->d_ino_hardlimit || + (be64_to_cpu(d->d_icount) < + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = 0; + } + } + + if (!d->d_rtbtimer) { + if ((d->d_rtb_softlimit && + (be64_to_cpu(d->d_rtbcount) >= + be64_to_cpu(d->d_rtb_softlimit))) || + (d->d_rtb_hardlimit && + (be64_to_cpu(d->d_rtbcount) >= + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_rtbtimelimit); + } else { + d->d_rtbwarns = 0; + } + } else { + if ((!d->d_rtb_softlimit || + (be64_to_cpu(d->d_rtbcount) < + be64_to_cpu(d->d_rtb_softlimit))) && + (!d->d_rtb_hardlimit || + (be64_to_cpu(d->d_rtbcount) < + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = 0; + } + } +} + +/* + * initialize a buffer full of dquots and log the whole thing + */ +STATIC void +xfs_qm_init_dquot_blk( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + xfs_buf_t *bp) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + xfs_dqblk_t *d; + int curid, i; + + ASSERT(tp); + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); + + /* + * ID of the first dquot in the block - id's are zero based. + */ + curid = id - (id % q->qi_dqperchunk); + ASSERT(curid >= 0); + memset(d, 0, BBTOB(q->qi_dqchunklen)); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) + xfs_qm_dqinit_core(curid, type, d); + xfs_trans_dquot_buf(tp, bp, + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); +} + + + +/* + * Allocate a block and fill it with dquots. + * This is called when the bmapi finds a hole. 
+ */ +STATIC int +xfs_qm_dqalloc( + xfs_trans_t **tpp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + xfs_inode_t *quotip, + xfs_fileoff_t offset_fsb, + xfs_buf_t **O_bpp) +{ + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + xfs_bmbt_irec_t map; + int nmaps, error, committed; + xfs_buf_t *bp; + xfs_trans_t *tp = *tpp; + + ASSERT(tp != NULL); + + trace_xfs_dqalloc(dqp); + + /* + * Initialize the bmap freelist prior to calling bmapi code. + */ + xfs_bmap_init(&flist, &firstblock); + xfs_ilock(quotip, XFS_ILOCK_EXCL); + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ + if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + return (ESRCH); + } + + xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); + nmaps = 1; + if ((error = xfs_bmapi(tp, quotip, + offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, + XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, + &firstblock, + XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist))) { + goto error0; + } + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); + ASSERT(nmaps == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + /* + * Keep track of the blkno to save a lookup later + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + /* now we can just get the buffer (there's nothing to read yet) */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0); + if (!bp || (error = XFS_BUF_GETERROR(bp))) + goto error1; + /* + * Make a chunk of dquots out of this buffer and log + * the entire thing. + */ + xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), + dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + + /* + * xfs_bmap_finish() may commit the current transaction and + * start a second transaction if the freelist is not empty. + * + * Since we still want to modify this buffer, we need to + * ensure that the buffer is not released on commit of + * the first transaction and ensure the buffer is added to the + * second transaction. + * + * If there is only one transaction then don't stop the buffer + * from being released when it commits later on. + */ + + xfs_trans_bhold(tp, bp); + + if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { + goto error1; + } + + if (committed) { + tp = *tpp; + xfs_trans_bjoin(tp, bp); + } else { + xfs_trans_bhold_release(tp, bp); + } + + *O_bpp = bp; + return 0; + + error1: + xfs_bmap_cancel(&flist); + error0: + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + + return (error); +} + +/* + * Maps a dquot to the buffer containing its on-disk version. + * This returns a ptr to the buffer containing the on-disk dquot + * in the bpp param, and a ptr to the on-disk dquot within that buffer + */ +STATIC int +xfs_qm_dqtobp( + xfs_trans_t **tpp, + xfs_dquot_t *dqp, + xfs_disk_dquot_t **O_ddpp, + xfs_buf_t **O_bpp, + uint flags) +{ + xfs_bmbt_irec_t map; + int nmaps = 1, error; + xfs_buf_t *bp; + xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); + xfs_mount_t *mp = dqp->q_mount; + xfs_disk_dquot_t *ddq; + xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); + xfs_trans_t *tp = (tpp ? *tpp : NULL); + + dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + + xfs_ilock(quotip, XFS_ILOCK_SHARED); + if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + /* + * Return if this type of quotas is turned off while we + * didn't have the quota inode lock. 
+ */ + xfs_iunlock(quotip, XFS_ILOCK_SHARED); + return ESRCH; + } + + /* + * Find the block map; no allocations yet + */ + error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + NULL, 0, &map, &nmaps, NULL); + + xfs_iunlock(quotip, XFS_ILOCK_SHARED); + if (error) + return error; + + ASSERT(nmaps == 1); + ASSERT(map.br_blockcount == 1); + + /* + * Offset of dquot in the (fixed sized) dquot chunk. + */ + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * + sizeof(xfs_dqblk_t); + + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (map.br_startblock == HOLESTARTBLOCK) { + /* + * We don't allocate unless we're asked to + */ + if (!(flags & XFS_QMOPT_DQALLOC)) + return ENOENT; + + ASSERT(tp); + error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, + dqp->q_fileoffset, &bp); + if (error) + return error; + tp = *tpp; + } else { + trace_xfs_dqtobp_read(dqp); + + /* + * store the blkno etc so that we don't have to do the + * mapping all the time + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, &bp); + if (error || !bp) + return XFS_ERROR(error); + } + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + /* + * calculate the location of the dquot inside the buffer. + */ + ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + + /* + * A simple sanity check in case we got a corrupted dquot... + */ + error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, + flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), + "dqtobp"); + if (error) { + if (!(flags & XFS_QMOPT_DQREPAIR)) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EIO); + } + XFS_BUF_BUSY(bp); /* We dirtied this */ + } + + *O_bpp = bp; + *O_ddpp = ddq; + + return (0); +} + + +/* + * Read in the ondisk dquot using dqtobp() then copy it to an incore version, + * and release the buffer immediately. + * + */ +/* ARGSUSED */ +STATIC int +xfs_qm_dqread( + xfs_trans_t **tpp, + xfs_dqid_t id, + xfs_dquot_t *dqp, /* dquot to get filled in */ + uint flags) +{ + xfs_disk_dquot_t *ddqp; + xfs_buf_t *bp; + int error; + xfs_trans_t *tp; + + ASSERT(tpp); + + trace_xfs_dqread(dqp); + + /* + * get a pointer to the on-disk dquot and the buffer containing it + * dqp already knows its own type (GROUP/USER). + */ + if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) { + return (error); + } + tp = *tpp; + + /* copy everything from disk dquot to the incore dquot */ + memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); + ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); + xfs_qm_dquot_logitem_init(dqp); + + /* + * Reservation counters are defined as reservation plus current usage + * to avoid having to add every time. + */ + dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); + dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); + dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + + /* Mark the buf so that this will stay incore a little longer */ + XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); + + /* + * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) + * So we need to release with xfs_trans_brelse(). + * The strategy here is identical to that of inodes; we lock + * the dquot in xfs_qm_dqget() before making it accessible to + * others. This is because dquots, like inodes, need a good level of + * concurrency, and we don't want to take locks on the entire buffers + * for dquot accesses. 
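+ * Instead, the per-dquot q_qlock protects the incore copy.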
+ * Note also that the dquot buffer may even be dirty at this point, if + * this particular dquot was repaired. We still aren't afraid to + * brelse it because we have the changes incore. + */ + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + xfs_trans_brelse(tp, bp); + + return (error); +} + + +/* + * allocate an incore dquot from the kernel heap, + * and fill its core with quota information kept on disk. + * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk + * if it wasn't already allocated. + */ +STATIC int +xfs_qm_idtodq( + xfs_mount_t *mp, + xfs_dqid_t id, /* gid or uid, depending on type */ + uint type, /* UDQUOT or GDQUOT */ + uint flags, /* DQALLOC, DQREPAIR */ + xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */ +{ + xfs_dquot_t *dqp; + int error; + xfs_trans_t *tp; + int cancelflags=0; + + dqp = xfs_qm_dqinit(mp, id, type); + tp = NULL; + if (flags & XFS_QMOPT_DQALLOC) { + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { + cancelflags = 0; + goto error0; + } + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + + /* + * Read it from disk; xfs_dqread() takes care of + * all the necessary initialization of dquot's fields (locks, etc) + */ + if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we ask to + * allocate (ENOENT). + */ + trace_xfs_dqread_fail(dqp); + cancelflags |= XFS_TRANS_ABORT; + goto error0; + } + if (tp) { + if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) + goto error1; + } + + *O_dqpp = dqp; + return (0); + + error0: + ASSERT(error); + if (tp) + xfs_trans_cancel(tp, cancelflags); + error1: + xfs_qm_dqdestroy(dqp); + *O_dqpp = NULL; + return (error); +} + +/* + * Lookup a dquot in the incore dquot hashtable. We keep two separate + * hashtables for user and group dquots; and, these are global tables + * inside the XQM, not per-filesystem tables. + * The hash chain must be locked by caller, and it is left locked + * on return. Returning dquot is locked. + */ +STATIC int +xfs_qm_dqlookup( + xfs_mount_t *mp, + xfs_dqid_t id, + xfs_dqhash_t *qh, + xfs_dquot_t **O_dqpp) +{ + xfs_dquot_t *dqp; + uint flist_locked; + + ASSERT(mutex_is_locked(&qh->qh_lock)); + + flist_locked = B_FALSE; + + /* + * Traverse the hashchain looking for a match + */ + list_for_each_entry(dqp, &qh->qh_list, q_hashlist) { + /* + * We already have the hashlock. We don't need the + * dqlock to look at the id field of the dquot, since the + * id can't be modified without the hashlock anyway. + */ + if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) { + trace_xfs_dqlookup_found(dqp); + + /* + * All in core dquots must be on the dqlist of mp + */ + ASSERT(!list_empty(&dqp->q_mplist)); + + xfs_dqlock(dqp); + if (dqp->q_nrefs == 0) { + ASSERT(!list_empty(&dqp->q_freelist)); + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { + trace_xfs_dqlookup_want(dqp); + + /* + * We may have raced with dqreclaim_one() + * (and lost). So, flag that we don't + * want the dquot to be reclaimed. 
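+ * Here XFS_DQ_WANT covers just the window in which we drop and retake the dquot lock to acquire the freelist lock in the correct order; it is cleared again once both locks are held.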
+ */ + dqp->dq_flags |= XFS_DQ_WANT; + xfs_dqunlock(dqp); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + xfs_dqlock(dqp); + dqp->dq_flags &= ~(XFS_DQ_WANT); + } + flist_locked = B_TRUE; + } + + /* + * id couldn't have changed; we had the hashlock all + * along + */ + ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); + + if (flist_locked) { + if (dqp->q_nrefs != 0) { + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + flist_locked = B_FALSE; + } else { + /* take it off the freelist */ + trace_xfs_dqlookup_freelist(dqp); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + } + } + + XFS_DQHOLD(dqp); + + if (flist_locked) + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + /* + * move the dquot to the front of the hashchain + */ + ASSERT(mutex_is_locked(&qh->qh_lock)); + list_move(&dqp->q_hashlist, &qh->qh_list); + trace_xfs_dqlookup_done(dqp); + *O_dqpp = dqp; + return 0; + } + } + + *O_dqpp = NULL; + ASSERT(mutex_is_locked(&qh->qh_lock)); + return (1); +} + +/* + * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a + * a locked dquot, doing an allocation (if requested) as needed. + * When both an inode and an id are given, the inode's id takes precedence. + * That is, if the id changes while we don't hold the ilock inside this + * function, the new dquot is returned, not necessarily the one requested + * in the id argument. + */ +int +xfs_qm_dqget( + xfs_mount_t *mp, + xfs_inode_t *ip, /* locked inode (optional) */ + xfs_dqid_t id, /* uid/projid/gid depending on type */ + uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ + uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ + xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ +{ + xfs_dquot_t *dqp; + xfs_dqhash_t *h; + uint version; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || + (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || + (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { + return (ESRCH); + } + h = XFS_DQ_HASH(mp, id, type); + +#ifdef DEBUG + if (xfs_do_dqerror) { + if ((xfs_dqerror_target == mp->m_ddev_targp) && + (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { + xfs_debug(mp, "Returning error in dqget"); + return (EIO); + } + } +#endif + + again: + +#ifdef DEBUG + ASSERT(type == XFS_DQ_USER || + type == XFS_DQ_PROJ || + type == XFS_DQ_GROUP); + if (ip) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (type == XFS_DQ_USER) + ASSERT(ip->i_udquot == NULL); + else + ASSERT(ip->i_gdquot == NULL); + } +#endif + mutex_lock(&h->qh_lock); + + /* + * Look in the cache (hashtable). + * The chain is kept locked during lookup. + */ + if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) { + XQM_STATS_INC(xqmstats.xs_qm_dqcachehits); + /* + * The dquot was found, moved to the front of the chain, + * taken off the freelist if it was on it, and locked + * at this point. Just unlock the hashchain and return. + */ + ASSERT(*O_dqpp); + ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); + mutex_unlock(&h->qh_lock); + trace_xfs_dqget_hit(*O_dqpp); + return (0); /* success */ + } + XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); + + /* + * Dquot cache miss. We don't want to keep the inode lock across + * a (potential) disk read. Also we don't want to deal with the lock + * ordering between quotainode and this inode. OTOH, dropping the inode + * lock here means dealing with a chown that can happen before + * we re-acquire the lock. 
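+ * (That race is handled further down: after re-taking the ilock we recheck ip->i_udquot / ip->i_gdquot and, if a dquot got attached in the meantime, destroy the one we just read in and return the attached one instead.)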
+ */ + if (ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * Save the hashchain version stamp, and unlock the chain, so that + * we don't keep the lock across a disk read + */ + version = h->qh_version; + mutex_unlock(&h->qh_lock); + + /* + * Allocate the dquot on the kernel heap, and read the ondisk + * portion off the disk. Also, do all the necessary initialization + * This can return ENOENT if dquot didn't exist on disk and we didn't + * ask it to allocate; ESRCH if quotas got turned off suddenly. + */ + if ((error = xfs_qm_idtodq(mp, id, type, + flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR| + XFS_QMOPT_DOWARN), + &dqp))) { + if (ip) + xfs_ilock(ip, XFS_ILOCK_EXCL); + return (error); + } + + /* + * See if this is mount code calling to look at the overall quota limits + * which are stored in the id == 0 user or group's dquot. + * Since we may not have done a quotacheck by this point, just return + * the dquot without attaching it to any hashtables, lists, etc, or even + * taking a reference. + * The caller must dqdestroy this once done. + */ + if (flags & XFS_QMOPT_DQSUSER) { + ASSERT(id == 0); + ASSERT(! ip); + goto dqret; + } + + /* + * Dquot lock comes after hashlock in the lock ordering + */ + if (ip) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * A dquot could be attached to this inode by now, since + * we had dropped the ilock. + */ + if (type == XFS_DQ_USER) { + if (!XFS_IS_UQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } + if (ip->i_udquot) { + xfs_qm_dqdestroy(dqp); + dqp = ip->i_udquot; + xfs_dqlock(dqp); + goto dqret; + } + } else { + if (!XFS_IS_OQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } + if (ip->i_gdquot) { + xfs_qm_dqdestroy(dqp); + dqp = ip->i_gdquot; + xfs_dqlock(dqp); + goto dqret; + } + } + } + + /* + * Hashlock comes after ilock in lock order + */ + mutex_lock(&h->qh_lock); + if (version != h->qh_version) { + xfs_dquot_t *tmpdqp; + /* + * Now, see if somebody else put the dquot in the + * hashtable before us. This can happen because we didn't + * keep the hashchain lock. We don't have to worry about + * lock order between the two dquots here since dqp isn't + * on any findable lists yet. + */ + if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) { + /* + * Duplicate found. Just throw away the new dquot + * and start over. + */ + xfs_qm_dqput(tmpdqp); + mutex_unlock(&h->qh_lock); + xfs_qm_dqdestroy(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); + goto again; + } + } + + /* + * Put the dquot at the beginning of the hash-chain and mp's list + * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. + */ + ASSERT(mutex_is_locked(&h->qh_lock)); + dqp->q_hash = h; + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; + + /* + * Attach this dquot to this filesystem's list of all dquots, + * kept inside the mount structure in m_quotainfo field + */ + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); + + /* + * We return a locked dquot to the caller, with a reference taken + */ + xfs_dqlock(dqp); + dqp->q_nrefs = 1; + + list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist); + mp->m_quotainfo->qi_dquots++; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + mutex_unlock(&h->qh_lock); + dqret: + ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return (0); +} + + +/* + * Release a reference to the dquot (decrement ref-count) + * and unlock it. 
If there is a group quota attached to this + * dquot, carefully release that too without tripping over + * deadlocks'n'stuff. + */ +void +xfs_qm_dqput( + xfs_dquot_t *dqp) +{ + xfs_dquot_t *gdqp; + + ASSERT(dqp->q_nrefs > 0); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + trace_xfs_dqput(dqp); + + if (dqp->q_nrefs != 1) { + dqp->q_nrefs--; + xfs_dqunlock(dqp); + return; + } + + /* + * drop the dqlock and acquire the freelist and dqlock + * in the right order; but try to get it out-of-order first + */ + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { + trace_xfs_dqput_wait(dqp); + xfs_dqunlock(dqp); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + xfs_dqlock(dqp); + } + + while (1) { + gdqp = NULL; + + /* We can't depend on nrefs being == 1 here */ + if (--dqp->q_nrefs == 0) { + trace_xfs_dqput_free(dqp); + + list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + xfs_Gqm->qm_dqfrlist_cnt++; + + /* + * If we just added a udquot to the freelist, then + * we want to release the gdquot reference that + * it (probably) has. Otherwise it'll keep the + * gdquot from getting reclaimed. + */ + if ((gdqp = dqp->q_gdquot)) { + /* + * Avoid a recursive dqput call + */ + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; + } + } + xfs_dqunlock(dqp); + + /* + * If we had a group quota inside the user quota as a hint, + * release it now. + */ + if (! gdqp) + break; + dqp = gdqp; + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); +} + +/* + * Release a dquot. Flush it if dirty, then dqput() it. + * dquot must not be locked. + */ +void +xfs_qm_dqrele( + xfs_dquot_t *dqp) +{ + if (!dqp) + return; + + trace_xfs_dqrele(dqp); + + xfs_dqlock(dqp); + /* + * We don't care to flush it if the dquot is dirty here. + * That will create stutters that we want to avoid. + * Instead we do a delayed write when we try to reclaim + * a dirty dquot. Also xfs_sync will take part of the burden... + */ + xfs_qm_dqput(dqp); +} + +/* + * This is the dquot flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the dquot is + * flushed to disk. It is responsible for removing the dquot logitem + * from the AIL if it has not been re-logged, and unlocking the dquot's + * flush lock. This behavior is very similar to that of inodes.. + */ +STATIC void +xfs_qm_dqflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; + xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_ail *ailp = lip->li_ailp; + + /* + * We only want to pull the item from the AIL if its + * location in the log has not changed since we started the flush. + * Thus, we only bother if the dquot's lsn has + * not changed. First we check the lsn outside the lock + * since it's cheaper, and then we recheck while + * holding the lock before removing the dquot from the AIL. + */ + if ((lip->li_flags & XFS_LI_IN_AIL) && + lip->li_lsn == qip->qli_flush_lsn) { + + /* xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); + if (lip->li_lsn == qip->qli_flush_lsn) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + + /* + * Release the dq's flush lock since we're done with it. + */ + xfs_dqfunlock(dqp); +} + +/* + * Write a modified dquot to disk. + * The dquot must be locked and the flush lock too taken by caller. + * The flush lock will not be unlocked until the dquot reaches the disk, + * but the dquot is free to be unlocked and modified by the caller + * in the interim. Dquot is still locked on return. This behavior is + * identical to that of inodes. 
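+ * The flush lock itself is released from xfs_qm_dqflush_done() once the buffer I/O carrying the dquot completes.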
+ */ +int +xfs_qm_dqflush( + xfs_dquot_t *dqp, + uint flags) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp; + struct xfs_disk_dquot *ddqp; + int error; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); + + trace_xfs_dqflush(dqp); + + /* + * If not dirty, or it's pinned and we are not supposed to block, nada. + */ + if (!XFS_DQ_IS_DIRTY(dqp) || + (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { + xfs_dqfunlock(dqp); + return 0; + } + xfs_qm_dqunpin_wait(dqp); + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this dquot + * to disk, because the log record didn't make it to disk! + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + dqp->dq_flags &= ~XFS_DQ_DIRTY; + xfs_dqfunlock(dqp); + return XFS_ERROR(EIO); + } + + /* + * Get the buffer containing the on-disk dquot + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) { + ASSERT(error != ENOENT); + xfs_dqfunlock(dqp); + return error; + } + + /* + * Calculate the location of the dquot inside the buffer. + */ + ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + + /* + * A simple sanity check in case we got a corrupted dquot.. + */ + error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, + XFS_QMOPT_DOWARN, "dqflush (incore copy)"); + if (error) { + xfs_buf_relse(bp); + xfs_dqfunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return XFS_ERROR(EIO); + } + + /* This is the only portion of data that needs to persist */ + memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); + + /* + * Clear the dirty field and remember the flush lsn for later use. + */ + dqp->dq_flags &= ~XFS_DQ_DIRTY; + + xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, + &dqp->q_logitem.qli_item.li_lsn); + + /* + * Attach an iodone routine so that we can remove this dquot from the + * AIL and release the flush lock once the dquot is synced to disk. + */ + xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, + &dqp->q_logitem.qli_item); + + /* + * If the buffer is pinned then push on the log so we won't + * get stuck waiting in the write for too long. + */ + if (XFS_BUF_ISPINNED(bp)) { + trace_xfs_dqflush_force(dqp); + xfs_log_force(mp, 0); + } + + if (flags & SYNC_WAIT) + error = xfs_bwrite(mp, bp); + else + xfs_bdwrite(mp, bp); + + trace_xfs_dqflush_done(dqp); + + /* + * dqp is still locked, but caller is free to unlock it now. + */ + return error; + +} + +int +xfs_qm_dqlock_nowait( + xfs_dquot_t *dqp) +{ + return mutex_trylock(&dqp->q_qlock); +} + +void +xfs_dqlock( + xfs_dquot_t *dqp) +{ + mutex_lock(&dqp->q_qlock); +} + +void +xfs_dqunlock( + xfs_dquot_t *dqp) +{ + mutex_unlock(&(dqp->q_qlock)); + if (dqp->q_logitem.qli_dquot == dqp) { + /* Once was dqp->q_mount, but might just have been cleared */ + xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp, + (xfs_log_item_t*)&(dqp->q_logitem)); + } +} + + +void +xfs_dqunlock_nonotify( + xfs_dquot_t *dqp) +{ + mutex_unlock(&(dqp->q_qlock)); +} + +/* + * Lock two xfs_dquot structures. + * + * To avoid deadlocks we always lock the quota structure with + * the lowerd id first. 
+ */ +void +xfs_dqlock2( + xfs_dquot_t *d1, + xfs_dquot_t *d2) +{ + if (d1 && d2) { + ASSERT(d1 != d2); + if (be32_to_cpu(d1->q_core.d_id) > + be32_to_cpu(d2->q_core.d_id)) { + mutex_lock(&d2->q_qlock); + mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); + } else { + mutex_lock(&d1->q_qlock); + mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); + } + } else if (d1) { + mutex_lock(&d1->q_qlock); + } else if (d2) { + mutex_lock(&d2->q_qlock); + } +} + + +/* + * Take a dquot out of the mount's dqlist as well as the hashlist. + * This is called via unmount as well as quotaoff, and the purge + * will always succeed unless there are soft (temp) references + * outstanding. + * + * This returns 0 if it was purged, 1 if it wasn't. It's not an error code + * that we're returning! XXXsup - not cool. + */ +/* ARGSUSED */ +int +xfs_qm_dqpurge( + xfs_dquot_t *dqp) +{ + xfs_dqhash_t *qh = dqp->q_hash; + xfs_mount_t *mp = dqp->q_mount; + + ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); + ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); + + xfs_dqlock(dqp); + /* + * We really can't afford to purge a dquot that is + * referenced, because these are hard refs. + * It shouldn't happen in general because we went thru _all_ inodes in + * dqrele_all_inodes before calling this and didn't let the mountlock go. + * However it is possible that we have dquots with temporary + * references that are not attached to an inode. e.g. see xfs_setattr(). + */ + if (dqp->q_nrefs != 0) { + xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_hash->qh_lock); + return (1); + } + + ASSERT(!list_empty(&dqp->q_freelist)); + + /* + * If we're turning off quotas, we have to make sure that, for + * example, we don't delete quota disk blocks while dquots are + * in the process of getting written to those disk blocks. + * This dquot might well be on AIL, and we can't leave it there + * if we're turning off quotas. Basically, we need this flush + * lock, and are willing to block on it. + */ + if (!xfs_dqflock_nowait(dqp)) { + /* + * Block on the flush lock after nudging dquot buffer, + * if it is incore. + */ + xfs_qm_dqflock_pushbuf_wait(dqp); + } + + /* + * XXXIf we're turning this type of quotas off, we don't care + * about the dirty metadata sitting in this dquot. OTOH, if + * we're unmounting, we do care, so we flush it and wait. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + int error; + + /* dqflush unlocks dqflock */ + /* + * Given that dqpurge is a very rare occurrence, it is OK + * that we're holding the hashlist and mplist locks + * across the disk write. But, ... XXXsup + * + * We don't care about getting disk errors here. We need + * to purge this dquot anyway, so we go ahead regardless. + */ + error = xfs_qm_dqflush(dqp, SYNC_WAIT); + if (error) + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + xfs_dqflock(dqp); + } + ASSERT(atomic_read(&dqp->q_pincount) == 0); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + + list_del_init(&dqp->q_hashlist); + qh->qh_version++; + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dqreclaims++; + mp->m_quotainfo->qi_dquots--; + /* + * XXX Move this to the front of the freelist, if we can get the + * freelist lock. 
+ */ + ASSERT(!list_empty(&dqp->q_freelist)); + + dqp->q_mount = NULL; + dqp->q_hash = NULL; + dqp->dq_flags = XFS_DQ_INACTIVE; + memset(&dqp->q_core, 0, sizeof(dqp->q_core)); + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + mutex_unlock(&qh->qh_lock); + return (0); +} + + +#ifdef QUOTADEBUG +void +xfs_qm_dqprint(xfs_dquot_t *dqp) +{ + struct xfs_mount *mp = dqp->q_mount; + + xfs_debug(mp, "-----------KERNEL DQUOT----------------"); + xfs_debug(mp, "---- dquotID = %d", + (int)be32_to_cpu(dqp->q_core.d_id)); + xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); + xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount); + xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno); + xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset); + xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_blk_hardlimit), + (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); + xfs_debug(mp, "---- blkslimit = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_blk_softlimit), + (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); + xfs_debug(mp, "---- inohlimit = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_ino_hardlimit), + (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); + xfs_debug(mp, "---- inoslimit = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_ino_softlimit), + (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); + xfs_debug(mp, "---- bcount = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_bcount), + (int)be64_to_cpu(dqp->q_core.d_bcount)); + xfs_debug(mp, "---- icount = %Lu (0x%x)", + be64_to_cpu(dqp->q_core.d_icount), + (int)be64_to_cpu(dqp->q_core.d_icount)); + xfs_debug(mp, "---- btimer = %d", + (int)be32_to_cpu(dqp->q_core.d_btimer)); + xfs_debug(mp, "---- itimer = %d", + (int)be32_to_cpu(dqp->q_core.d_itimer)); + xfs_debug(mp, "---------------------------"); +} +#endif + +/* + * Give the buffer a little push if it is incore and + * wait on the flush lock. + */ +void +xfs_qm_dqflock_pushbuf_wait( + xfs_dquot_t *dqp) +{ + xfs_mount_t *mp = dqp->q_mount; + xfs_buf_t *bp; + + /* + * Check to see if the dquot has been flushed delayed + * write. If so, grab its buffer and send it + * out immediately. We'll be able to acquire + * the flush lock when the I/O completes. + */ + bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); + if (!bp) + goto out_lock; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); + xfs_buf_delwri_promote(bp); + wake_up_process(bp->b_target->bt_task); + } + xfs_buf_relse(bp); +out_lock: + xfs_dqflock(dqp); +} diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h new file mode 100644 index 00000000..5da3a23b --- /dev/null +++ b/fs/xfs/quota/xfs_dquot.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DQUOT_H__ +#define __XFS_DQUOT_H__ + +/* + * Dquots are structures that hold quota information about a user or a group, + * much like inodes are for files. In fact, dquots share many characteristics + * with inodes. However, dquots can also be a centralized resource, relative + * to a collection of inodes. In this respect, dquots share some characteristics + * of the superblock. + * XFS dquots exploit both those in its algorithms. They make every attempt + * to not be a bottleneck when quotas are on and have minimal impact, if any, + * when quotas are off. + */ + +/* + * The hash chain headers (hash buckets) + */ +typedef struct xfs_dqhash { + struct list_head qh_list; + struct mutex qh_lock; + uint qh_version; /* ever increasing version */ + uint qh_nelems; /* number of dquots on the list */ +} xfs_dqhash_t; + +struct xfs_mount; +struct xfs_trans; + +/* + * The incore dquot structure + */ +typedef struct xfs_dquot { + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_freelist; /* global free list of dquots */ + struct list_head q_mplist; /* mount's list of dquots */ + struct list_head q_hashlist; /* gloabl hash list of dquots */ + xfs_dqhash_t *q_hash; /* the hashchain header */ + struct xfs_mount*q_mount; /* filesystem this relates to */ + struct xfs_trans*q_transp; /* trans this belongs to currently */ + uint q_nrefs; /* # active refs from inodes */ + xfs_daddr_t q_blkno; /* blkno of dquot buffer */ + int q_bufoffset; /* off of dq in buffer (# dquots) */ + xfs_fileoff_t q_fileoffset; /* offset in quotas file */ + + struct xfs_dquot*q_gdquot; /* group dquot, hint only */ + xfs_disk_dquot_t q_core; /* actual usage & quotas */ + xfs_dq_logitem_t q_logitem; /* dquot log item */ + xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ + xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ + xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + struct mutex q_qlock; /* quota lock */ + struct completion q_flush; /* flush completion queue */ + atomic_t q_pincount; /* dquot pin count */ + wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ +} xfs_dquot_t; + +/* + * Lock hierarchy for q_qlock: + * XFS_QLOCK_NORMAL is the implicit default, + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + */ +enum { + XFS_QLOCK_NORMAL = 0, + XFS_QLOCK_NESTED, +}; + +#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) + +/* + * Manage the q_flush completion queue embedded in the dquot. This completion + * queue synchronizes processes attempting to flush the in-core dquot back to + * disk. + */ +static inline void xfs_dqflock(xfs_dquot_t *dqp) +{ + wait_for_completion(&dqp->q_flush); +} + +static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp) +{ + return try_wait_for_completion(&dqp->q_flush); +} + +static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +{ + complete(&dqp->q_flush); +} + +#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) +#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) +#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) +#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) +#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) +#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo) +#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? 
\ + XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ + XFS_DQ_TO_QINF(dqp)->qi_gquotaip) + +#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \ + (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ + (XFS_IS_OQUOTA_ON((d)->q_mount)))) + +#ifdef QUOTADEBUG +extern void xfs_qm_dqprint(xfs_dquot_t *); +#else +#define xfs_qm_dqprint(a) +#endif + +extern void xfs_qm_dqdestroy(xfs_dquot_t *); +extern int xfs_qm_dqflush(xfs_dquot_t *, uint); +extern int xfs_qm_dqpurge(xfs_dquot_t *); +extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); +extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); +extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); +extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, + xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, + xfs_disk_dquot_t *); +extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, + xfs_dqid_t, uint, uint, xfs_dquot_t **); +extern void xfs_qm_dqput(xfs_dquot_t *); +extern void xfs_dqlock(xfs_dquot_t *); +extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); +extern void xfs_dqunlock(xfs_dquot_t *); +extern void xfs_dqunlock_nonotify(xfs_dquot_t *); + +#endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c new file mode 100644 index 00000000..8126fc2e --- /dev/null +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" + +static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_dq_logitem, qli_item); +} + +/* + * returns the number of iovecs needed to log the given dquot item. + */ +STATIC uint +xfs_qm_dquot_logitem_size( + struct xfs_log_item *lip) +{ + /* + * we need only two iovecs, one for the format, one for the real thing + */ + return 2; +} + +/* + * fills in the vector of log iovecs for the given dquot log item. 
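+ * One iovec points at the qli_format header, the other at the dquot's q_core on-disk image, matching the count of two returned by the size routine above.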
+ */ +STATIC void +xfs_qm_dquot_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *logvec) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + + logvec->i_addr = &qlip->qli_format; + logvec->i_len = sizeof(xfs_dq_logformat_t); + logvec->i_type = XLOG_REG_TYPE_QFORMAT; + logvec++; + logvec->i_addr = &qlip->qli_dquot->q_core; + logvec->i_len = sizeof(xfs_disk_dquot_t); + logvec->i_type = XLOG_REG_TYPE_DQUOT; + + ASSERT(2 == lip->li_desc->lid_size); + qlip->qli_format.qlf_size = 2; + +} + +/* + * Increment the pin count of the given dquot. + */ +STATIC void +xfs_qm_dquot_logitem_pin( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + atomic_inc(&dqp->q_pincount); +} + +/* + * Decrement the pin count of the given dquot, and wake up + * anyone in xfs_dqwait_unpin() if the count goes to 0. The + * dquot must have been previously pinned with a call to + * xfs_qm_dquot_logitem_pin(). + */ +STATIC void +xfs_qm_dquot_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(atomic_read(&dqp->q_pincount) > 0); + if (atomic_dec_and_test(&dqp->q_pincount)) + wake_up(&dqp->q_pinwait); +} + +/* + * Given the logitem, this writes the corresponding dquot entry to disk + * asynchronously. This is called with the dquot entry securely locked; + * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot + * at the end. + */ +STATIC void +xfs_qm_dquot_logitem_push( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + int error; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); + + /* + * Since we were able to lock the dquot's flush lock and + * we found it on the AIL, the dquot must be dirty. This + * is because the dquot is removed from the AIL while still + * holding the flush lock in xfs_dqflush_done(). Thus, if + * we found it in the AIL and were able to obtain the flush + * lock without sleeping, then there must not have been + * anyone in the process of flushing the dquot. + */ + error = xfs_qm_dqflush(dqp, 0); + if (error) + xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + __func__, error, dqp); + xfs_dqunlock(dqp); +} + +STATIC xfs_lsn_t +xfs_qm_dquot_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + /* + * We always re-log the entire dquot when it becomes dirty, + * so, the latest copy _is_ the only one that matters. + */ + return lsn; +} + +/* + * This is called to wait for the given dquot to be unpinned. + * Most of these pin/unpin routines are plagiarized from inode code. + */ +void +xfs_qm_dqunpin_wait( + struct xfs_dquot *dqp) +{ + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (atomic_read(&dqp->q_pincount) == 0) + return; + + /* + * Give the log a push so we don't wait here too long. + */ + xfs_log_force(dqp->q_mount, 0); + wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); +} + +/* + * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that + * the dquot is locked by us, but the flush lock isn't. So, here we are + * going to see if the relevant dquot buffer is incore, waiting on DELWRI. + * If so, we want to push it out to help us take this item off the AIL as soon + * as possible. + * + * We must not be holding the AIL lock at this point. Calling incore() to + * search the buffer cache can be a time consuming thing, and AIL lock is a + * spinlock. 
+ */ +STATIC bool +xfs_qm_dquot_logitem_pushbuf( + struct xfs_log_item *lip) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + struct xfs_buf *bp; + bool ret = true; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + /* + * If flushlock isn't locked anymore, chances are that the + * inode flush completed and the inode was taken off the AIL. + * So, just get out. + */ + if (completion_done(&dqp->q_flush) || + !(lip->li_flags & XFS_LI_IN_AIL)) { + xfs_dqunlock(dqp); + return true; + } + + bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, + dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); + xfs_dqunlock(dqp); + if (!bp) + return true; + if (XFS_BUF_ISDELAYWRITE(bp)) + xfs_buf_delwri_promote(bp); + if (XFS_BUF_ISPINNED(bp)) + ret = false; + xfs_buf_relse(bp); + return ret; +} + +/* + * This is called to attempt to lock the dquot associated with this + * dquot log item. Don't sleep on the dquot lock or the flush lock. + * If the flush lock is already held, indicating that the dquot has + * been or is in the process of being flushed, then see if we can + * find the dquot's buffer in the buffer cache without sleeping. If + * we can and it is marked delayed write, then we want to send it out. + * We delay doing so until the push routine, though, to avoid sleeping + * in any device strategy routines. + */ +STATIC uint +xfs_qm_dquot_logitem_trylock( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + if (atomic_read(&dqp->q_pincount) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_qm_dqlock_nowait(dqp)) + return XFS_ITEM_LOCKED; + + if (!xfs_dqflock_nowait(dqp)) { + /* + * dquot has already been flushed to the backing buffer, + * leave it locked, pushbuf routine will unlock it. + */ + return XFS_ITEM_PUSHBUF; + } + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + return XFS_ITEM_SUCCESS; +} + +/* + * Unlock the dquot associated with the log item. + * Clear the fields of the dquot and dquot log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the dquot. + */ +STATIC void +xfs_qm_dquot_logitem_unlock( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + /* + * Clear the transaction pointer in the dquot + */ + dqp->q_transp = NULL; + + /* + * dquots are never 'held' from getting unlocked at the end of + * a transaction. Their locking and unlocking is hidden inside the + * transaction layer, within trans_commit. Hence, no LI_HOLD flag + * for the logitem. + */ + xfs_dqunlock(dqp); +} + +/* + * this needs to stamp an lsn into the dquot, I think. + * rpc's that look at user dquot's would then have to + * push on the dependency recorded in the dquot + */ +STATIC void +xfs_qm_dquot_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector for dquots + */ +static struct xfs_item_ops xfs_dquot_item_ops = { + .iop_size = xfs_qm_dquot_logitem_size, + .iop_format = xfs_qm_dquot_logitem_format, + .iop_pin = xfs_qm_dquot_logitem_pin, + .iop_unpin = xfs_qm_dquot_logitem_unpin, + .iop_trylock = xfs_qm_dquot_logitem_trylock, + .iop_unlock = xfs_qm_dquot_logitem_unlock, + .iop_committed = xfs_qm_dquot_logitem_committed, + .iop_push = xfs_qm_dquot_logitem_push, + .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf, + .iop_committing = xfs_qm_dquot_logitem_committing +}; + +/* + * Initialize the dquot log item for a newly allocated dquot. 
+ * The dquot isn't locked at this point, but it isn't on any of the lists + * either, so we don't care. + */ +void +xfs_qm_dquot_logitem_init( + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *lp = &dqp->q_logitem; + + xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, + &xfs_dquot_item_ops); + lp->qli_dquot = dqp; + lp->qli_format.qlf_type = XFS_LI_DQUOT; + lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); + lp->qli_format.qlf_blkno = dqp->q_blkno; + lp->qli_format.qlf_len = 1; + /* + * This is just the offset of this dquot within its buffer + * (which is currently 1 FSB and probably won't change). + * Hence 32 bits for this offset should be just fine. + * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t)) + * here, and recompute it at recovery time. + */ + lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset; +} + +/*------------------ QUOTAOFF LOG ITEMS -------------------*/ + +static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_qoff_logitem, qql_item); +} + + +/* + * This returns the number of iovecs needed to log the given quotaoff item. + * We only need 1 iovec for an quotaoff item. It just logs the + * quotaoff_log_format structure. + */ +STATIC uint +xfs_qm_qoff_logitem_size( + struct xfs_log_item *lip) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given quotaoff log item. We use only 1 iovec, and we point that + * at the quotaoff_log_format structure embedded in the quotaoff item. + * It is at this point that we assert that all of the extent + * slots in the quotaoff item have been filled. + */ +STATIC void +xfs_qm_qoff_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) +{ + struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); + + ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); + + log_vector->i_addr = &qflip->qql_format; + log_vector->i_len = sizeof(xfs_qoff_logitem_t); + log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; + qflip->qql_format.qf_size = 1; +} + +/* + * Pinning has no meaning for an quotaoff item, so just return. + */ +STATIC void +xfs_qm_qoff_logitem_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an quotaoff item, unpinning does + * not either. + */ +STATIC void +xfs_qm_qoff_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * Quotaoff items have no locking, so just return success. + */ +STATIC uint +xfs_qm_qoff_logitem_trylock( + struct xfs_log_item *lip) +{ + return XFS_ITEM_LOCKED; +} + +/* + * Quotaoff items have no locking or pushing, so return failure + * so that the caller doesn't bother with us. + */ +STATIC void +xfs_qm_qoff_logitem_unlock( + struct xfs_log_item *lip) +{ +} + +/* + * The quotaoff-start-item is logged only once and cannot be moved in the log, + * so simply return the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_qm_qoff_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * There isn't much you can do to push on an quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. 
+ */ +STATIC void +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip) +{ +} + + +STATIC xfs_lsn_t +xfs_qm_qoffend_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); + struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; + struct xfs_ail *ailp = qfs->qql_item.li_ailp; + + /* + * Delete the qoff-start logitem from the AIL. + * xfs_trans_ail_delete() drops the AIL lock. + */ + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); + + kmem_free(qfs); + kmem_free(qfe); + return (xfs_lsn_t)-1; +} + +/* + * XXX rcc - don't know quite what to do with this. I think we can + * just ignore it. The only time that isn't the case is if we allow + * the client to somehow see that quotas have been turned off in which + * we can't allow that to get back until the quotaoff hits the disk. + * So how would that happen? Also, do we need different routines for + * quotaoff start and quotaoff end? I suspect the answer is yes but + * to be sure, I need to look at the recovery code and see how quota off + * recovery is handled (do we roll forward or back or do something else). + * If we roll forwards or backwards, then we need two separate routines, + * one that does nothing and one that stamps in the lsn that matters + * (truly makes the quotaoff irrevocable). If we do something else, + * then maybe we don't need two. + */ +STATIC void +xfs_qm_qoff_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) +{ +} + +static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_trylock = xfs_qm_qoff_logitem_trylock, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoffend_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * This is the ops vector shared by all quotaoff-start log items. + */ +static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_trylock = xfs_qm_qoff_logitem_trylock, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoff_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * Allocate and initialize an quotaoff item of the correct quota type(s). + */ +struct xfs_qoff_logitem * +xfs_qm_qoff_logitem_init( + struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags) +{ + struct xfs_qoff_logitem *qf; + + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); + + xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? + &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); + qf->qql_item.li_mountp = mp; + qf->qql_format.qf_type = XFS_LI_QUOTAOFF; + qf->qql_format.qf_flags = flags; + qf->qql_start_lip = start; + return qf; +} diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h new file mode 100644 index 00000000..5acae2ad --- /dev/null +++ b/fs/xfs/quota/xfs_dquot_item.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DQUOT_ITEM_H__ +#define __XFS_DQUOT_ITEM_H__ + +struct xfs_dquot; +struct xfs_trans; +struct xfs_mount; +struct xfs_qoff_logitem; + +typedef struct xfs_dq_logitem { + xfs_log_item_t qli_item; /* common portion */ + struct xfs_dquot *qli_dquot; /* dquot ptr */ + xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ + xfs_dq_logformat_t qli_format; /* logged structure */ +} xfs_dq_logitem_t; + +typedef struct xfs_qoff_logitem { + xfs_log_item_t qql_item; /* common portion */ + struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ + xfs_qoff_logformat_t qql_format; /* logged structure */ +} xfs_qoff_logitem_t; + + +extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *); +extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *, + struct xfs_qoff_logitem *, uint); +extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *, uint); +extern void xfs_trans_log_quotaoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *); + +#endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c new file mode 100644 index 00000000..e70c7fc9 --- /dev/null +++ b/fs/xfs/quota/xfs_qm.c @@ -0,0 +1,2462 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_utils.h" +#include "xfs_qm.h" +#include "xfs_trace.h" + +/* + * The global quota manager. There is only one of these for the entire + * system, _not_ one per file system. XQM keeps track of the overall + * quota functionality, including maintaining the freelist and hash + * tables of dquots. 
+ */ +struct mutex xfs_Gqm_lock; +struct xfs_qm *xfs_Gqm; +uint ndquot; + +kmem_zone_t *qm_dqzone; +kmem_zone_t *qm_dqtrxzone; + +STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); +STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); + +STATIC int xfs_qm_init_quotainos(xfs_mount_t *); +STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); +STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); + +static struct shrinker xfs_qm_shaker = { + .shrink = xfs_qm_shake, + .seeks = DEFAULT_SEEKS, +}; + +#ifdef DEBUG +extern struct mutex qcheck_lock; +#endif + +#ifdef QUOTADEBUG +static void +xfs_qm_dquot_list_print( + struct xfs_mount *mp) +{ + xfs_dquot_t *dqp; + int i = 0; + + list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) { + xfs_debug(mp, " %d. \"%d (%s)\" " + "bcnt = %lld, icnt = %lld, refs = %d", + i++, be32_to_cpu(dqp->q_core.d_id), + DQFLAGTO_TYPESTR(dqp), + (long long)be64_to_cpu(dqp->q_core.d_bcount), + (long long)be64_to_cpu(dqp->q_core.d_icount), + dqp->q_nrefs); + } +} +#else +static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { } +#endif + +/* + * Initialize the XQM structure. + * Note that there is not one quota manager per file system. + */ +STATIC struct xfs_qm * +xfs_Gqm_init(void) +{ + xfs_dqhash_t *udqhash, *gdqhash; + xfs_qm_t *xqm; + size_t hsize; + uint i; + + /* + * Initialize the dquot hash tables. + */ + udqhash = kmem_zalloc_greedy(&hsize, + XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), + XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t)); + if (!udqhash) + goto out; + + gdqhash = kmem_zalloc_large(hsize); + if (!gdqhash) + goto out_free_udqhash; + + hsize /= sizeof(xfs_dqhash_t); + ndquot = hsize << 8; + + xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); + xqm->qm_dqhashmask = hsize - 1; + xqm->qm_usr_dqhtable = udqhash; + xqm->qm_grp_dqhtable = gdqhash; + ASSERT(xqm->qm_usr_dqhtable != NULL); + ASSERT(xqm->qm_grp_dqhtable != NULL); + + for (i = 0; i < hsize; i++) { + xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i); + xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i); + } + + /* + * Freelist of all dquots of all file systems + */ + INIT_LIST_HEAD(&xqm->qm_dqfrlist); + xqm->qm_dqfrlist_cnt = 0; + mutex_init(&xqm->qm_dqfrlist_lock); + + /* + * dquot zone. we register our own low-memory callback. + */ + if (!qm_dqzone) { + xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t), + "xfs_dquots"); + qm_dqzone = xqm->qm_dqzone; + } else + xqm->qm_dqzone = qm_dqzone; + + register_shrinker(&xfs_qm_shaker); + + /* + * The t_dqinfo portion of transactions. + */ + if (!qm_dqtrxzone) { + xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t), + "xfs_dqtrx"); + qm_dqtrxzone = xqm->qm_dqtrxzone; + } else + xqm->qm_dqtrxzone = qm_dqtrxzone; + + atomic_set(&xqm->qm_totaldquots, 0); + xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; + xqm->qm_nrefs = 0; +#ifdef DEBUG + mutex_init(&qcheck_lock); +#endif + return xqm; + + out_free_udqhash: + kmem_free_large(udqhash); + out: + return NULL; +} + +/* + * Destroy the global quota manager when its reference count goes to zero.
+ */ +STATIC void +xfs_qm_destroy( + struct xfs_qm *xqm) +{ + struct xfs_dquot *dqp, *n; + int hsize, i; + + ASSERT(xqm != NULL); + ASSERT(xqm->qm_nrefs == 0); + unregister_shrinker(&xfs_qm_shaker); + hsize = xqm->qm_dqhashmask + 1; + for (i = 0; i < hsize; i++) { + xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); + xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); + } + kmem_free_large(xqm->qm_usr_dqhtable); + kmem_free_large(xqm->qm_grp_dqhtable); + xqm->qm_usr_dqhtable = NULL; + xqm->qm_grp_dqhtable = NULL; + xqm->qm_dqhashmask = 0; + + /* frlist cleanup */ + mutex_lock(&xqm->qm_dqfrlist_lock); + list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { + xfs_dqlock(dqp); +#ifdef QUOTADEBUG + xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp); +#endif + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + } + mutex_unlock(&xqm->qm_dqfrlist_lock); + mutex_destroy(&xqm->qm_dqfrlist_lock); +#ifdef DEBUG + mutex_destroy(&qcheck_lock); +#endif + kmem_free(xqm); +} + +/* + * Called at mount time to let XQM know that another file system is + * starting quotas. This isn't crucial information as the individual mount + * structures are pretty independent, but it helps the XQM keep a + * global view of what's going on. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_hold_quotafs_ref( + struct xfs_mount *mp) +{ + /* + * Need to lock the xfs_Gqm structure for things like this. For example, + * the structure could disappear between the entry to this routine and + * a HOLD operation if not locked. + */ + mutex_lock(&xfs_Gqm_lock); + + if (!xfs_Gqm) { + xfs_Gqm = xfs_Gqm_init(); + if (!xfs_Gqm) { + mutex_unlock(&xfs_Gqm_lock); + return ENOMEM; + } + } + + /* + * We can keep a list of all filesystems with quotas mounted for + * debugging and statistical purposes, but ... + * Just take a reference and get out. + */ + xfs_Gqm->qm_nrefs++; + mutex_unlock(&xfs_Gqm_lock); + + return 0; +} + + +/* + * Release the reference that a filesystem took at mount time, + * so that we know when we need to destroy the entire quota manager. + */ +/* ARGSUSED */ +STATIC void +xfs_qm_rele_quotafs_ref( + struct xfs_mount *mp) +{ + xfs_dquot_t *dqp, *n; + + ASSERT(xfs_Gqm); + ASSERT(xfs_Gqm->qm_nrefs > 0); + + /* + * Go thru the freelist and destroy all inactive dquots. + */ + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + + list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_INACTIVE) { + ASSERT(dqp->q_mount == NULL); + ASSERT(! XFS_DQ_IS_DIRTY(dqp)); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + } else { + xfs_dqunlock(dqp); + } + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + + /* + * Destroy the entire XQM. If somebody mounts with quotaon, this'll + * be restarted. + */ + mutex_lock(&xfs_Gqm_lock); + if (--xfs_Gqm->qm_nrefs == 0) { + xfs_qm_destroy(xfs_Gqm); + xfs_Gqm = NULL; + } + mutex_unlock(&xfs_Gqm_lock); +} + +/* + * Just destroy the quotainfo structure. + */ +void +xfs_qm_unmount( + struct xfs_mount *mp) +{ + if (mp->m_quotainfo) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_destroy_quotainfo(mp); + } +} + + +/* + * This is called from xfs_mountfs to start quotas and initialize all + * necessary data structures like quotainfo. This is also responsible for + * running a quotacheck as necessary. 
We are guaranteed that the superblock + * is consistently read in at this point. + * + * If we fail here, the mount will continue with quota turned off. We don't + * need to inidicate success or failure at all. + */ +void +xfs_qm_mount_quotas( + xfs_mount_t *mp) +{ + int error = 0; + uint sbf; + + /* + * If quotas on realtime volumes is not supported, we disable + * quotas immediately. + */ + if (mp->m_sb.sb_rextents) { + xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); + mp->m_qflags = 0; + goto write_changes; + } + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * Allocate the quotainfo structure inside the mount struct, and + * create quotainode(s), and change/rev superblock if necessary. + */ + error = xfs_qm_init_quotainfo(mp); + if (error) { + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo == NULL); + mp->m_qflags = 0; + goto write_changes; + } + /* + * If any of the quotas are not consistent, do a quotacheck. + */ + if (XFS_QM_NEED_QUOTACHECK(mp)) { + error = xfs_qm_quotacheck(mp); + if (error) { + /* Quotacheck failed and disabled quotas. */ + return; + } + } + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + if (!XFS_IS_UQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_UQUOTA_CHKD; + if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) + mp->m_qflags &= ~XFS_OQUOTA_CHKD; + + write_changes: + /* + * We actually don't have to acquire the m_sb_lock at all. + * This can only be called from mount, and that's single threaded. XXX + */ + spin_lock(&mp->m_sb_lock); + sbf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { + if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { + /* + * We could only have been turning quotas off. + * We aren't in very good shape actually because + * the incore structures are convinced that quotas are + * off, but the on disk superblock doesn't know that ! + */ + ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + xfs_alert(mp, "%s: Superblock update failed!", + __func__); + } + } + + if (error) { + xfs_warn(mp, "Failed to initialize disk quotas."); + return; + } + +#ifdef QUOTADEBUG + if (XFS_IS_QUOTA_ON(mp)) + xfs_qm_internalqcheck(mp); +#endif +} + +/* + * Called from the vfsops layer. + */ +void +xfs_qm_unmount_quotas( + xfs_mount_t *mp) +{ + /* + * Release the dquots that root inode, et al might be holding, + * before we flush quotas and blow away the quotainfo structure. + */ + ASSERT(mp->m_rootip); + xfs_qm_dqdetach(mp->m_rootip); + if (mp->m_rbmip) + xfs_qm_dqdetach(mp->m_rbmip); + if (mp->m_rsumip) + xfs_qm_dqdetach(mp->m_rsumip); + + /* + * Release the quota inodes. + */ + if (mp->m_quotainfo) { + if (mp->m_quotainfo->qi_uquotaip) { + IRELE(mp->m_quotainfo->qi_uquotaip); + mp->m_quotainfo->qi_uquotaip = NULL; + } + if (mp->m_quotainfo->qi_gquotaip) { + IRELE(mp->m_quotainfo->qi_gquotaip); + mp->m_quotainfo->qi_gquotaip = NULL; + } + } +} + +/* + * Flush all dquots of the given file system to disk. The dquots are + * _not_ purged from memory here, just their data written to disk. + */ +STATIC int +xfs_qm_dqflush_all( + struct xfs_mount *mp, + int sync_mode) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl; + struct xfs_dquot *dqp; + int error; + + if (!q) + return 0; +again: + mutex_lock(&q->qi_dqlist_lock); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { + xfs_dqlock(dqp); + if (! 
XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqunlock(dqp); + continue; + } + + /* XXX a sentinel would be better */ + recl = q->qi_dqreclaims; + if (!xfs_dqflock_nowait(dqp)) { + /* + * If we can't grab the flush lock then check + * to see if the dquot has been flushed delayed + * write. If so, grab its buffer and send it + * out immediately. We'll be able to acquire + * the flush lock when the I/O completes. + */ + xfs_qm_dqflock_pushbuf_wait(dqp); + } + /* + * Let go of the mplist lock. We don't want to hold it + * across a disk write. + */ + mutex_unlock(&q->qi_dqlist_lock); + error = xfs_qm_dqflush(dqp, sync_mode); + xfs_dqunlock(dqp); + if (error) + return error; + + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { + mutex_unlock(&q->qi_dqlist_lock); + /* XXX restart limit */ + goto again; + } + } + + mutex_unlock(&q->qi_dqlist_lock); + /* return ! busy */ + return 0; +} +/* + * Release the group dquot pointers the user dquots may be + * carrying around as a hint. mplist is locked on entry and exit. + */ +STATIC void +xfs_qm_detach_gdquots( + struct xfs_mount *mp) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *gdqp; + int nrecl; + + again: + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { + xfs_dqlock(dqp); + if ((gdqp = dqp->q_gdquot)) { + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; + } + xfs_dqunlock(dqp); + + if (gdqp) { + /* + * Can't hold the mplist lock across a dqput. + * XXXmust convert to marker based iterations here. + */ + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); + xfs_qm_dqput(gdqp); + + mutex_lock(&q->qi_dqlist_lock); + if (nrecl != q->qi_dqreclaims) + goto again; + } + } +} + +/* + * Go through all the incore dquots of this file system and take them + * off the mplist and hashlist, if the dquot type matches the dqtype + * parameter. This is used when turning off quota accounting for + * users and/or groups, as well as when the filesystem is unmounting. + */ +STATIC int +xfs_qm_dqpurge_int( + struct xfs_mount *mp, + uint flags) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *n; + uint dqtype; + int nrecl; + int nmisses; + + if (!q) + return 0; + + dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; + dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; + dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; + + mutex_lock(&q->qi_dqlist_lock); + + /* + * In the first pass through all incore dquots of this filesystem, + * we release the group dquot pointers the user dquots may be + * carrying around as a hint. We need to do this irrespective of + * what's being turned off. + */ + xfs_qm_detach_gdquots(mp); + + again: + nmisses = 0; + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + /* + * Try to get rid of all of the unwanted dquots. The idea is to + * get them off mplist and hashlist, but leave them on freelist. + */ + list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { + /* + * It's OK to look at the type without taking dqlock here. + * We're holding the mplist lock here, and that's needed for + * a dqreclaim. + */ + if ((dqp->dq_flags & dqtype) == 0) + continue; + + if (!mutex_trylock(&dqp->q_hash->qh_lock)) { + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); + mutex_lock(&dqp->q_hash->qh_lock); + mutex_lock(&q->qi_dqlist_lock); + + /* + * XXXTheoretically, we can get into a very long + * ping pong game here. + * No one can be adding dquots to the mplist at + * this point, but somebody might be taking things off. 
+ */ + if (nrecl != q->qi_dqreclaims) { + mutex_unlock(&dqp->q_hash->qh_lock); + goto again; + } + } + + /* + * Take the dquot off the mplist and hashlist. It may remain on + * freelist in INACTIVE state. + */ + nmisses += xfs_qm_dqpurge(dqp); + } + mutex_unlock(&q->qi_dqlist_lock); + return nmisses; +} + +int +xfs_qm_dqpurge_all( + xfs_mount_t *mp, + uint flags) +{ + int ndquots; + + /* + * Purge the dquot cache. + * None of the dquots should really be busy at this point. + */ + if (mp->m_quotainfo) { + while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) { + delay(ndquots * 10); + } + } + return 0; +} + +STATIC int +xfs_qm_dqattach_one( + xfs_inode_t *ip, + xfs_dqid_t id, + uint type, + uint doalloc, + xfs_dquot_t *udqhint, /* hint */ + xfs_dquot_t **IO_idqpp) +{ + xfs_dquot_t *dqp; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + error = 0; + + /* + * See if we already have it in the inode itself. IO_idqpp is + * &i_udquot or &i_gdquot. This made the code look weird, but + * made the logic a lot simpler. + */ + dqp = *IO_idqpp; + if (dqp) { + trace_xfs_dqattach_found(dqp); + return 0; + } + + /* + * udqhint is the i_udquot field in inode, and is non-NULL only + * when the type arg is group/project. Its purpose is to save a + * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside + * the user dquot. + */ + if (udqhint) { + ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); + xfs_dqlock(udqhint); + + /* + * No need to take dqlock to look at the id. + * + * The ID can't change until it gets reclaimed, and it won't + * be reclaimed as long as we have a ref from inode and we + * hold the ilock. + */ + dqp = udqhint->q_gdquot; + if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { + xfs_dqlock(dqp); + XFS_DQHOLD(dqp); + ASSERT(*IO_idqpp == NULL); + *IO_idqpp = dqp; + + xfs_dqunlock(dqp); + xfs_dqunlock(udqhint); + return 0; + } + + /* + * We can't hold a dquot lock when we call the dqget code. + * We'll deadlock in no time, because of (not conforming to) + * lock ordering - the inodelock comes before any dquot lock, + * and we may drop and reacquire the ilock in xfs_qm_dqget(). + */ + xfs_dqunlock(udqhint); + } + + /* + * Find the dquot from somewhere. This bumps the + * reference count of dquot and returns it locked. + * This can return ENOENT if dquot didn't exist on + * disk and we didn't ask it to allocate; + * ESRCH if quotas got turned off suddenly. + */ + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); + if (error) + return error; + + trace_xfs_dqattach_get(dqp); + + /* + * dqget may have dropped and re-acquired the ilock, but it guarantees + * that the dquot returned is the one that should go in the inode. + */ + *IO_idqpp = dqp; + xfs_dqunlock(dqp); + return 0; +} + + +/* + * Given a udquot and gdquot, attach a ptr to the group dquot in the + * udquot as a hint for future lookups. The idea sounds simple, but the + * execution isn't, because the udquot might have a group dquot attached + * already and getting rid of that gets us into lock ordering constraints. + * The process is complicated more by the fact that the dquots may or may not + * be locked on entry. + */ +STATIC void +xfs_qm_dqattach_grouphint( + xfs_dquot_t *udq, + xfs_dquot_t *gdq) +{ + xfs_dquot_t *tmp; + + xfs_dqlock(udq); + + if ((tmp = udq->q_gdquot)) { + if (tmp == gdq) { + xfs_dqunlock(udq); + return; + } + + udq->q_gdquot = NULL; + /* + * We can't keep any dqlocks when calling dqrele, + * because the freelist lock comes before dqlocks. 
+ */ + xfs_dqunlock(udq); + /* + * we took a hard reference once upon a time in dqget, + * so give it back when the udquot no longer points at it + * dqput() does the unlocking of the dquot. + */ + xfs_qm_dqrele(tmp); + + xfs_dqlock(udq); + xfs_dqlock(gdq); + + } else { + ASSERT(XFS_DQ_IS_LOCKED(udq)); + xfs_dqlock(gdq); + } + + ASSERT(XFS_DQ_IS_LOCKED(udq)); + ASSERT(XFS_DQ_IS_LOCKED(gdq)); + /* + * Somebody could have attached a gdquot here, + * when we dropped the uqlock. If so, just do nothing. + */ + if (udq->q_gdquot == NULL) { + XFS_DQHOLD(gdq); + udq->q_gdquot = gdq; + } + + xfs_dqunlock(gdq); + xfs_dqunlock(udq); +} + + +/* + * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON + * into account. + * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. + * Inode may get unlocked and relocked in here, and the caller must deal with + * the consequences. + */ +int +xfs_qm_dqattach_locked( + xfs_inode_t *ip, + uint flags) +{ + xfs_mount_t *mp = ip->i_mount; + uint nquotas = 0; + int error = 0; + + if (!XFS_IS_QUOTA_RUNNING(mp) || + !XFS_IS_QUOTA_ON(mp) || + !XFS_NOT_DQATTACHED(mp, ip) || + ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino) + return 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, + flags & XFS_QMOPT_DQALLOC, + NULL, &ip->i_udquot); + if (error) + goto done; + nquotas++; + } + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (XFS_IS_OQUOTA_ON(mp)) { + error = XFS_IS_GQUOTA_ON(mp) ? + xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, + flags & XFS_QMOPT_DQALLOC, + ip->i_udquot, &ip->i_gdquot) : + xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, + flags & XFS_QMOPT_DQALLOC, + ip->i_udquot, &ip->i_gdquot); + /* + * Don't worry about the udquot that we may have + * attached above. It'll get detached, if not already. + */ + if (error) + goto done; + nquotas++; + } + + /* + * Attach this group quota to the user quota as a hint. + * This WON'T, in general, result in a thrash. + */ + if (nquotas == 2) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_udquot); + ASSERT(ip->i_gdquot); + + /* + * We may or may not have the i_udquot locked at this point, + * but this check is OK since we don't depend on the i_gdquot to + * be accurate 100% all the time. It is just a hint, and this + * will succeed in general. + */ + if (ip->i_udquot->q_gdquot == ip->i_gdquot) + goto done; + /* + * Attach i_gdquot to the gdquot hint inside the i_udquot. + */ + xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); + } + + done: +#ifdef QUOTADEBUG + if (! error) { + if (XFS_IS_UQUOTA_ON(mp)) + ASSERT(ip->i_udquot); + if (XFS_IS_OQUOTA_ON(mp)) + ASSERT(ip->i_gdquot); + } + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); +#endif + return error; +} + +int +xfs_qm_dqattach( + struct xfs_inode *ip, + uint flags) +{ + int error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqattach_locked(ip, flags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + +/* + * Release dquots (and their references) if any. + * The inode should be locked EXCL except when this's called by + * xfs_ireclaim. 
+ */ +void +xfs_qm_dqdetach( + xfs_inode_t *ip) +{ + if (!(ip->i_udquot || ip->i_gdquot)) + return; + + trace_xfs_dquot_dqdetach(ip); + + ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); + ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); + if (ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } +} + +int +xfs_qm_sync( + struct xfs_mount *mp, + int flags) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl, restarts; + struct xfs_dquot *dqp; + int error; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + restarts = 0; + + again: + mutex_lock(&q->qi_dqlist_lock); + /* + * dqpurge_all() also takes the mplist lock and iterate thru all dquots + * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared + * when we have the mplist lock, we know that dquots will be consistent + * as long as we have it locked. + */ + if (!XFS_IS_QUOTA_ON(mp)) { + mutex_unlock(&q->qi_dqlist_lock); + return 0; + } + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { + /* + * If this is vfs_sync calling, then skip the dquots that + * don't 'seem' to be dirty. ie. don't acquire dqlock. + * This is very similar to what xfs_sync does with inodes. + */ + if (flags & SYNC_TRYLOCK) { + if (!XFS_DQ_IS_DIRTY(dqp)) + continue; + if (!xfs_qm_dqlock_nowait(dqp)) + continue; + } else { + xfs_dqlock(dqp); + } + + /* + * Now, find out for sure if this dquot is dirty or not. + */ + if (! XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqunlock(dqp); + continue; + } + + /* XXX a sentinel would be better */ + recl = q->qi_dqreclaims; + if (!xfs_dqflock_nowait(dqp)) { + if (flags & SYNC_TRYLOCK) { + xfs_dqunlock(dqp); + continue; + } + /* + * If we can't grab the flush lock then if the caller + * really wanted us to give this our best shot, so + * see if we can give a push to the buffer before we wait + * on the flush lock. At this point, we know that + * even though the dquot is being flushed, + * it has (new) dirty data. + */ + xfs_qm_dqflock_pushbuf_wait(dqp); + } + /* + * Let go of the mplist lock. We don't want to hold it + * across a disk write + */ + mutex_unlock(&q->qi_dqlist_lock); + error = xfs_qm_dqflush(dqp, flags); + xfs_dqunlock(dqp); + if (error && XFS_FORCED_SHUTDOWN(mp)) + return 0; /* Need to prevent umount failure */ + else if (error) + return error; + + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { + if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) + break; + + mutex_unlock(&q->qi_dqlist_lock); + goto again; + } + } + + mutex_unlock(&q->qi_dqlist_lock); + return 0; +} + +/* + * The hash chains and the mplist use the same xfs_dqhash structure as + * their list head, but we can take the mplist qh_lock and one of the + * hash qh_locks at the same time without any problem as they aren't + * related. + */ +static struct lock_class_key xfs_quota_mplist_class; + +/* + * This initializes all the quota information that's kept in the + * mount structure + */ +STATIC int +xfs_qm_init_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qinf; + int error; + xfs_dquot_t *dqp; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * Tell XQM that we exist as soon as possible. + */ + if ((error = xfs_qm_hold_quotafs_ref(mp))) { + return error; + } + + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + + /* + * See if quotainodes are setup, and if not, allocate them, + * and change the superblock accordingly. 
+ */ + if ((error = xfs_qm_init_quotainos(mp))) { + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; + } + + INIT_LIST_HEAD(&qinf->qi_dqlist); + mutex_init(&qinf->qi_dqlist_lock); + lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); + + qinf->qi_dqreclaims = 0; + + /* mutex used to serialize quotaoffs */ + mutex_init(&qinf->qi_quotaofflock); + + /* Precalc some constants */ + qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + ASSERT(qinf->qi_dqchunklen); + qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen); + do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t)); + + mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); + + /* + * We try to get the limits from the superuser's limits fields. + * This is quite hacky, but it is standard quota practice. + * We look at the USR dquot with id == 0 first, but if user quotas + * are not enabled we goto the GRP dquot with id == 0. + * We don't really care to keep separate default limits for user + * and group quotas, at least not at this point. + */ + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : + XFS_DQ_PROJ), + XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN, + &dqp); + if (! error) { + xfs_disk_dquot_t *ddqp = &dqp->q_core; + + /* + * The warnings and timers set the grace period given to + * a user or group before he or she can not perform any + * more writing. If it is zero, a default is used. + */ + qinf->qi_btimelimit = ddqp->d_btimer ? + be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = ddqp->d_itimer ? + be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? + be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = ddqp->d_bwarns ? + be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = ddqp->d_iwarns ? + be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? + be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; + qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + + /* + * We sent the XFS_QMOPT_DQSUSER flag to dqget because + * we don't want this dquot cached. We haven't done a + * quotacheck yet, and quotacheck doesn't like incore dquots. + */ + xfs_qm_dqdestroy(dqp); + } else { + qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + } + + return 0; +} + + +/* + * Gets called when unmounting a filesystem or when all quotas get + * turned off. + * This purges the quota inodes, destroys locks and frees itself. + */ +void +xfs_qm_destroy_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qi; + + qi = mp->m_quotainfo; + ASSERT(qi != NULL); + ASSERT(xfs_Gqm != NULL); + + /* + * Release the reference that XQM kept, so that we know + * when the XQM structure should be freed. We cannot assume + * that xfs_Gqm is non-null after this point. 
+ */ + xfs_qm_rele_quotafs_ref(mp); + + ASSERT(list_empty(&qi->qi_dqlist)); + mutex_destroy(&qi->qi_dqlist_lock); + + if (qi->qi_uquotaip) { + IRELE(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + IRELE(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + mutex_destroy(&qi->qi_quotaofflock); + kmem_free(qi); + mp->m_quotainfo = NULL; +} + + + +/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */ + +/* ARGSUSED */ +STATIC void +xfs_qm_list_init( + xfs_dqlist_t *list, + char *str, + int n) +{ + mutex_init(&list->qh_lock); + INIT_LIST_HEAD(&list->qh_list); + list->qh_version = 0; + list->qh_nelems = 0; +} + +STATIC void +xfs_qm_list_destroy( + xfs_dqlist_t *list) +{ + mutex_destroy(&(list->qh_lock)); +} + +/* + * Create an inode and return with a reference already taken, but unlocked + * This is how we create quota inodes + */ +STATIC int +xfs_qm_qino_alloc( + xfs_mount_t *mp, + xfs_inode_t **ip, + __int64_t sbfields, + uint flags) +{ + xfs_trans_t *tp; + int error; + int committed; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); + if ((error = xfs_trans_reserve(tp, + XFS_QM_QINOCREATE_SPACE_RES(mp), + XFS_CREATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_CREATE_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return error; + } + + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + return error; + } + + /* + * Make the changes in the superblock, and log those too. + * sbfields arg may contain fields other than *QUOTINO; + * VERSIONNUM for example. + */ + spin_lock(&mp->m_sb_lock); + if (flags & XFS_QMOPT_SBVERSION) { + ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); + ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == + (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); + + xfs_sb_version_addquota(&mp->m_sb); + mp->m_sb.sb_uquotino = NULLFSINO; + mp->m_sb.sb_gquotino = NULLFSINO; + + /* qflags will get updated _after_ quotacheck */ + mp->m_sb.sb_qflags = 0; + } + if (flags & XFS_QMOPT_UQUOTA) + mp->m_sb.sb_uquotino = (*ip)->i_ino; + else + mp->m_sb.sb_gquotino = (*ip)->i_ino; + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, sbfields); + + if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { + xfs_alert(mp, "%s failed (error %d)!", __func__, error); + return error; + } + return 0; +} + + +STATIC void +xfs_qm_reset_dqcounts( + xfs_mount_t *mp, + xfs_buf_t *bp, + xfs_dqid_t id, + uint type) +{ + xfs_disk_dquot_t *ddq; + int j; + + trace_xfs_reset_dqcounts(bp, _RET_IP_); + + /* + * Reset all counters and timers. They'll be + * started afresh by xfs_qm_quotacheck. + */ +#ifdef DEBUG + j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + do_div(j, sizeof(xfs_dqblk_t)); + ASSERT(mp->m_quotainfo->qi_dqperchunk == j); +#endif + ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); + for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + /* + * Do a sanity check, and if needed, repair the dqblk. Don't + * output any warnings because it's perfectly possible to + * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 
+ */ + (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, + "xfs_quotacheck"); + ddq->d_bcount = 0; + ddq->d_icount = 0; + ddq->d_rtbcount = 0; + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); + } +} + +STATIC int +xfs_qm_dqiter_bufs( + xfs_mount_t *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags) +{ + xfs_buf_t *bp; + int error; + int type; + + ASSERT(blkcnt > 0); + type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : + (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); + error = 0; + + /* + * Blkcnt arg can be a very big number, and might even be + * larger than the log itself. So, we have to break it up into + * manageable-sized transactions. + * Note that we don't start a permanent transaction here; we might + * not be able to get a log reservation for the whole thing up front, + * and we don't really care to either, because we just discard + * everything if we were to crash in the middle of this loop. + */ + while (blkcnt--) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + break; + + xfs_qm_reset_dqcounts(mp, bp, firstid, type); + xfs_bdwrite(mp, bp); + /* + * goto the next block. + */ + bno++; + firstid += mp->m_quotainfo->qi_dqperchunk; + } + return error; +} + +/* + * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a + * caller supplied function for every chunk of dquots that we find. + */ +STATIC int +xfs_qm_dqiterate( + xfs_mount_t *mp, + xfs_inode_t *qip, + uint flags) +{ + xfs_bmbt_irec_t *map; + int i, nmaps; /* number of map entries */ + int error; /* return value */ + xfs_fileoff_t lblkno; + xfs_filblks_t maxlblkcnt; + xfs_dqid_t firstid; + xfs_fsblock_t rablkno; + xfs_filblks_t rablkcnt; + + error = 0; + /* + * This looks racy, but we can't keep an inode lock across a + * trans_reserve. But, this gets called during quotacheck, and that + * happens only at mount time which is single threaded. + */ + if (qip->i_d.di_nblocks == 0) + return 0; + + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); + + lblkno = 0; + maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + do { + nmaps = XFS_DQITER_MAP_SIZE; + /* + * We aren't changing the inode itself. Just changing + * some of its data. No new blocks are added here, and + * the inode is never added to the transaction. + */ + xfs_ilock(qip, XFS_ILOCK_SHARED); + error = xfs_bmapi(NULL, qip, lblkno, + maxlblkcnt - lblkno, + XFS_BMAPI_METADATA, + NULL, + 0, map, &nmaps, NULL); + xfs_iunlock(qip, XFS_ILOCK_SHARED); + if (error) + break; + + ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); + for (i = 0; i < nmaps; i++) { + ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); + ASSERT(map[i].br_blockcount); + + + lblkno += map[i].br_blockcount; + + if (map[i].br_startblock == HOLESTARTBLOCK) + continue; + + firstid = (xfs_dqid_t) map[i].br_startoff * + mp->m_quotainfo->qi_dqperchunk; + /* + * Do a read-ahead on the next extent. + */ + if ((i+1 < nmaps) && + (map[i+1].br_startblock != HOLESTARTBLOCK)) { + rablkcnt = map[i+1].br_blockcount; + rablkno = map[i+1].br_startblock; + while (rablkcnt--) { + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, rablkno), + mp->m_quotainfo->qi_dqchunklen); + rablkno++; + } + } + /* + * Iterate thru all the blks in the extent and + * reset the counters of all the dquots inside them. 
+ */ + if ((error = xfs_qm_dqiter_bufs(mp, + firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags))) { + break; + } + } + + if (error) + break; + } while (nmaps > 0); + + kmem_free(map); + + return error; +} + +/* + * Called by dqusage_adjust in doing a quotacheck. + * + * Given the inode, and a dquot id this updates both the incore dqout as well + * as the buffer copy. This is so that once the quotacheck is done, we can + * just log all the buffers, as opposed to logging numerous updates to + * individual dquots. + */ +STATIC int +xfs_qm_quotacheck_dqadjust( + struct xfs_inode *ip, + xfs_dqid_t id, + uint type, + xfs_qcnt_t nblks, + xfs_qcnt_t rtblks) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *dqp; + int error; + + error = xfs_qm_dqget(mp, ip, id, type, + XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); + if (error) { + /* + * Shouldn't be able to turn off quotas here. + */ + ASSERT(error != ESRCH); + ASSERT(error != ENOENT); + return error; + } + + trace_xfs_dqadjust(dqp); + + /* + * Adjust the inode count and the block count to reflect this inode's + * resource usage. + */ + be64_add_cpu(&dqp->q_core.d_icount, 1); + dqp->q_res_icount++; + if (nblks) { + be64_add_cpu(&dqp->q_core.d_bcount, nblks); + dqp->q_res_bcount += nblks; + } + if (rtblks) { + be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); + dqp->q_res_rtbcount += rtblks; + } + + /* + * Set default limits, adjust timers (since we changed usages) + * + * There are no timers for the default values set in the root dquot. + */ + if (dqp->q_core.d_id) { + xfs_qm_adjust_dqlimits(mp, &dqp->q_core); + xfs_qm_adjust_dqtimers(mp, &dqp->q_core); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_qm_dqput(dqp); + return 0; +} + +STATIC int +xfs_qm_get_rtblks( + xfs_inode_t *ip, + xfs_qcnt_t *O_rtblks) +{ + xfs_filblks_t rtblks; /* total rt blks */ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extent entries */ + int error; + + ASSERT(XFS_IS_REALTIME_INODE(ip)); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) + return error; + } + rtblks = 0; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0; idx < nextents; idx++) + rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); + *O_rtblks = (xfs_qcnt_t)rtblks; + return 0; +} + +/* + * callback routine supplied to bulkstat(). Given an inumber, find its + * dquots and update them to account for resources taken by that inode. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_dqusage_adjust( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* not used */ + int ubsize, /* not used */ + int *ubused, /* not used */ + int *res) /* result code value */ +{ + xfs_inode_t *ip; + xfs_qcnt_t nblks, rtblks = 0; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * rootino must have its resources accounted for, not so with the quota + * inodes. + */ + if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { + *res = BULKSTAT_RV_NOTHING; + return XFS_ERROR(EINVAL); + } + + /* + * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget + * interface expects the inode to be exclusively locked because that's + * the case in all other instances. It's OK that we do this because + * quotacheck is done only at mount time. 
+ */ + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); + if (error) { + *res = BULKSTAT_RV_NOTHING; + return error; + } + + ASSERT(ip->i_delayed_blks == 0); + + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * Walk thru the extent list and count the realtime blocks. + */ + error = xfs_qm_get_rtblks(ip, &rtblks); + if (error) + goto error0; + } + + nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; + + /* + * Add the (disk blocks and inode) resources occupied by this + * inode to its dquots. We do this adjustment in the incore dquot, + * and also copy the changes to its buffer. + * We don't care about putting these changes in a transaction + * envelope because if we crash in the middle of a 'quotacheck' + * we have to start from the beginning anyway. + * Once we're done, we'll log all the dquot bufs. + * + * The *QUOTA_ON checks below may look pretty racy, but quotachecks + * and quotaoffs don't race. (Quotachecks happen at mount time only). + */ + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, + XFS_DQ_USER, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_GQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, + XFS_DQ_GROUP, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_PQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), + XFS_DQ_PROJ, nblks, rtblks); + if (error) + goto error0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_DIDONE; + return 0; + +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_GIVEUP; + return error; +} + +/* + * Walk thru all the filesystem inodes and construct a consistent view + * of the disk quota world. If the quotacheck fails, disable quotas. + */ +int +xfs_qm_quotacheck( + xfs_mount_t *mp) +{ + int done, count, error; + xfs_ino_t lastino; + size_t structsz; + xfs_inode_t *uip, *gip; + uint flags; + + count = INT_MAX; + structsz = 1; + lastino = 0; + flags = 0; + + ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * There should be no cached dquots. The (simplistic) quotacheck + * algorithm doesn't like that. + */ + ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); + + xfs_notice(mp, "Quotacheck needed: Please wait."); + + /* + * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset + * their counters to zero. We need a clean slate. + * We don't log our changes till later. + */ + uip = mp->m_quotainfo->qi_uquotaip; + if (uip) { + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + if (error) + goto error_return; + flags |= XFS_UQUOTA_CHKD; + } + + gip = mp->m_quotainfo->qi_gquotaip; + if (gip) { + error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + if (error) + goto error_return; + flags |= XFS_OQUOTA_CHKD; + } + + do { + /* + * Iterate thru all the inodes in the file system, + * adjusting the corresponding dquot counters in core. + */ + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_dqusage_adjust, + structsz, NULL, &done); + if (error) + break; + + } while (!done); + + /* + * We've made all the changes that we need to make incore. + * Flush them down to disk buffers if everything was updated + * successfully. + */ + if (!error) + error = xfs_qm_dqflush_all(mp, 0); + + /* + * We can get this error if we couldn't do a dquot allocation inside + * xfs_qm_dqusage_adjust (via bulkstat). 
We don't care about the + * dirty dquots that might be cached, we just want to get rid of them + * and turn quotaoff. The dquots won't be attached to any of the inodes + * at this point (because we intentionally didn't in dqget_noattach). + */ + if (error) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + goto error_return; + } + + /* + * We didn't log anything, because if we crashed, we'll have to + * start the quotacheck from scratch anyway. However, we must make + * sure that our dquot changes are secure before we put the + * quotacheck'd stamp on the superblock. So, here we do a synchronous + * flush. + */ + XFS_bflush(mp->m_ddev_targp); + + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); + mp->m_qflags |= flags; + + xfs_qm_dquot_list_print(mp); + + error_return: + if (error) { + xfs_warn(mp, + "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", + error); + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo != NULL); + ASSERT(xfs_Gqm != NULL); + xfs_qm_destroy_quotainfo(mp); + if (xfs_mount_reset_sbqflags(mp)) { + xfs_warn(mp, + "Quotacheck: Failed to reset quota flags."); + } + } else + xfs_notice(mp, "Quotacheck: Done."); + return (error); +} + +/* + * This is called after the superblock has been read in and we're ready to + * iget the quota inodes. + */ +STATIC int +xfs_qm_init_quotainos( + xfs_mount_t *mp) +{ + xfs_inode_t *uip, *gip; + int error; + __int64_t sbflags; + uint flags; + + ASSERT(mp->m_quotainfo); + uip = gip = NULL; + sbflags = 0; + flags = 0; + + /* + * Get the uquota and gquota inodes + */ + if (xfs_sb_version_hasquota(&mp->m_sb)) { + if (XFS_IS_UQUOTA_ON(mp) && + mp->m_sb.sb_uquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_uquotino > 0); + if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip))) + return XFS_ERROR(error); + } + if (XFS_IS_OQUOTA_ON(mp) && + mp->m_sb.sb_gquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_gquotino > 0); + if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip))) { + if (uip) + IRELE(uip); + return XFS_ERROR(error); + } + } + } else { + flags |= XFS_QMOPT_SBVERSION; + sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_QFLAGS); + } + + /* + * Create the two inodes, if they don't exist already. The changes + * made above will get added to a transaction and logged in one of + * the qino_alloc calls below. If the device is readonly, + * temporarily switch to read-write to do this. + */ + if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { + if ((error = xfs_qm_qino_alloc(mp, &uip, + sbflags | XFS_SB_UQUOTINO, + flags | XFS_QMOPT_UQUOTA))) + return XFS_ERROR(error); + + flags &= ~XFS_QMOPT_SBVERSION; + } + if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) { + flags |= (XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + error = xfs_qm_qino_alloc(mp, &gip, + sbflags | XFS_SB_GQUOTINO, flags); + if (error) { + if (uip) + IRELE(uip); + + return XFS_ERROR(error); + } + } + + mp->m_quotainfo->qi_uquotaip = uip; + mp->m_quotainfo->qi_gquotaip = gip; + + return 0; +} + + + +/* + * Just pop the least recently used dquot off the freelist and + * recycle it. The returned dquot is locked. 
+ */ +STATIC xfs_dquot_t * +xfs_qm_dqreclaim_one(void) +{ + xfs_dquot_t *dqpout; + xfs_dquot_t *dqp; + int restarts; + int startagain; + + restarts = 0; + dqpout = NULL; + + /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ +again: + startagain = 0; + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + + list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { + struct xfs_mount *mp = dqp->q_mount; + xfs_dqlock(dqp); + + /* + * We are racing with dqlookup here. Naturally we don't + * want to reclaim a dquot that lookup wants. We release the + * freelist lock and start over, so that lookup will grab + * both the dquot and the freelistlock. + */ + if (dqp->dq_flags & XFS_DQ_WANT) { + ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); + + trace_xfs_dqreclaim_want(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqwants); + restarts++; + startagain = 1; + goto dqunlock; + } + + /* + * If the dquot is inactive, we are assured that it is + * not on the mplist or the hashlist, and that makes our + * life easier. + */ + if (dqp->dq_flags & XFS_DQ_INACTIVE) { + ASSERT(mp == NULL); + ASSERT(! XFS_DQ_IS_DIRTY(dqp)); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + dqpout = dqp; + XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); + goto dqunlock; + } + + ASSERT(dqp->q_hash); + ASSERT(!list_empty(&dqp->q_mplist)); + + /* + * Try to grab the flush lock. If this dquot is in the process + * of getting flushed to disk, we don't want to reclaim it. + */ + if (!xfs_dqflock_nowait(dqp)) + goto dqunlock; + + /* + * We have the flush lock so we know that this is not in the + * process of being flushed. So, if this is dirty, flush it + * DELWRI so that we don't get a freelist infested with + * dirty dquots. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + int error; + + trace_xfs_dqreclaim_dirty(dqp); + + /* + * We flush it delayed write, so don't bother + * releasing the freelist lock. + */ + error = xfs_qm_dqflush(dqp, 0); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + } + goto dqunlock; + } + + /* + * We're trying to get the hashlock out of order. This races + * with dqlookup; so, we giveup and goto the next dquot if + * we couldn't get the hashlock. This way, we won't starve + * a dqlookup process that holds the hashlock that is + * waiting for the freelist lock. + */ + if (!mutex_trylock(&dqp->q_hash->qh_lock)) { + restarts++; + goto dqfunlock; + } + + /* + * This races with dquot allocation code as well as dqflush_all + * and reclaim code. So, if we failed to grab the mplist lock, + * giveup everything and start over. + */ + if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { + restarts++; + startagain = 1; + goto qhunlock; + } + + ASSERT(dqp->q_nrefs == 0); + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dquots--; + mp->m_quotainfo->qi_dqreclaims++; + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + dqpout = dqp; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); +qhunlock: + mutex_unlock(&dqp->q_hash->qh_lock); +dqfunlock: + xfs_dqfunlock(dqp); +dqunlock: + xfs_dqunlock(dqp); + if (dqpout) + break; + if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) + break; + if (startagain) { + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + goto again; + } + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + return dqpout; +} + +/* + * Traverse the freelist of dquots and attempt to reclaim a maximum of + * 'howmany' dquots. 
This operation races with dqlookup(), and attempts to + * favor the lookup function ... + */ +STATIC int +xfs_qm_shake_freelist( + int howmany) +{ + int nreclaimed = 0; + xfs_dquot_t *dqp; + + if (howmany <= 0) + return 0; + + while (nreclaimed < howmany) { + dqp = xfs_qm_dqreclaim_one(); + if (!dqp) + return nreclaimed; + xfs_qm_dqdestroy(dqp); + nreclaimed++; + } + return nreclaimed; +} + +/* + * The kmem_shake interface is invoked when memory is running low. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_shake( + struct shrinker *shrink, + struct shrink_control *sc) +{ + int ndqused, nfree, n; + gfp_t gfp_mask = sc->gfp_mask; + + if (!kmem_shake_allow(gfp_mask)) + return 0; + if (!xfs_Gqm) + return 0; + + nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ + /* incore dquots in all f/s's */ + ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; + + ASSERT(ndqused >= 0); + + if (nfree <= ndqused && nfree < ndquot) + return 0; + + ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */ + n = nfree - ndqused - ndquot; /* # over target */ + + return xfs_qm_shake_freelist(MAX(nfree, n)); +} + + +/*------------------------------------------------------------------*/ + +/* + * Return a new incore dquot. Depending on the number of + * dquots in the system, we either allocate a new one on the kernel heap, + * or reclaim a free one. + * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed + * to reclaim an existing one from the freelist. + */ +boolean_t +xfs_qm_dqalloc_incore( + xfs_dquot_t **O_dqpp) +{ + xfs_dquot_t *dqp; + + /* + * Check against high water mark to see if we want to pop + * a nincompoop dquot off the freelist. + */ + if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) { + /* + * Try to recycle a dquot from the freelist. + */ + if ((dqp = xfs_qm_dqreclaim_one())) { + XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); + /* + * Just zero the core here. The rest will get + * reinitialized by caller. XXX we shouldn't even + * do this zero ... + */ + memset(&dqp->q_core, 0, sizeof(dqp->q_core)); + *O_dqpp = dqp; + return B_FALSE; + } + XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); + } + + /* + * Allocate a brand new dquot on the kernel heap and return it + * to the caller to initialize. + */ + ASSERT(xfs_Gqm->qm_dqzone != NULL); + *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); + atomic_inc(&xfs_Gqm->qm_totaldquots); + + return B_TRUE; +} + + +/* + * Start a transaction and write the incore superblock changes to + * disk. flags parameter indicates which fields have changed. + */ +int +xfs_qm_write_sb_changes( + xfs_mount_t *mp, + __int64_t flags) +{ + xfs_trans_t *tp; + int error; + +#ifdef QUOTADEBUG + xfs_notice(mp, "Writing superblock quota changes"); +#endif + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); + if ((error = xfs_trans_reserve(tp, 0, + mp->m_sb.sb_sectsize + 128, 0, + 0, + XFS_DEFAULT_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_mod_sb(tp, flags); + error = xfs_trans_commit(tp, 0); + + return error; +} + + +/* --------------- utility functions for vnodeops ---------------- */ + + +/* + * Given an inode, a uid, gid and prid make sure that we have + * allocated relevant dquot(s) on disk, and that we won't exceed inode + * quotas by creating this file. + * This also attaches dquot(s) to the given inode after locking it, + * and returns the dquots corresponding to the uid and/or gid. 
+ * + * in : inode (unlocked) + * out : udquot, gdquot with references taken and unlocked + */ +int +xfs_qm_vop_dqalloc( + struct xfs_inode *ip, + uid_t uid, + gid_t gid, + prid_t prid, + uint flags, + struct xfs_dquot **O_udqpp, + struct xfs_dquot **O_gdqpp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *uq, *gq; + int error; + uint lockflags; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + lockflags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockflags); + + if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) + gid = ip->i_d.di_gid; + + /* + * Attach the dquot(s) to this inode, doing a dquot allocation + * if necessary. The dquot(s) will not be locked. + */ + if (XFS_NOT_DQATTACHED(mp, ip)) { + error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC); + if (error) { + xfs_iunlock(ip, lockflags); + return error; + } + } + + uq = gq = NULL; + if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { + if (ip->i_d.di_uid != uid) { + /* + * What we need is the dquot that has this uid, and + * if we send the inode to dqget, the uid of the inode + * takes priority over what's sent in the uid argument. + * We must unlock inode here before calling dqget if + * we're not sending the inode, because otherwise + * we'll deadlock by doing trans_reserve while + * holding ilock. + */ + xfs_iunlock(ip, lockflags); + if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, + XFS_DQ_USER, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &uq))) { + ASSERT(error != ENOENT); + return error; + } + /* + * Get the ilock in the right order. + */ + xfs_dqunlock(uq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + /* + * Take an extra reference, because we'll return + * this to caller + */ + ASSERT(ip->i_udquot); + uq = ip->i_udquot; + xfs_dqlock(uq); + XFS_DQHOLD(uq); + xfs_dqunlock(uq); + } + } + if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { + if (ip->i_d.di_gid != gid) { + xfs_iunlock(ip, lockflags); + if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, + XFS_DQ_GROUP, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &gq))) { + if (uq) + xfs_qm_dqrele(uq); + ASSERT(error != ENOENT); + return error; + } + xfs_dqunlock(gq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_gdquot); + gq = ip->i_gdquot; + xfs_dqlock(gq); + XFS_DQHOLD(gq); + xfs_dqunlock(gq); + } + } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { + if (xfs_get_projid(ip) != prid) { + xfs_iunlock(ip, lockflags); + if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, + XFS_DQ_PROJ, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &gq))) { + if (uq) + xfs_qm_dqrele(uq); + ASSERT(error != ENOENT); + return (error); + } + xfs_dqunlock(gq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_gdquot); + gq = ip->i_gdquot; + xfs_dqlock(gq); + XFS_DQHOLD(gq); + xfs_dqunlock(gq); + } + } + if (uq) + trace_xfs_dquot_dqalloc(ip); + + xfs_iunlock(ip, lockflags); + if (O_udqpp) + *O_udqpp = uq; + else if (uq) + xfs_qm_dqrele(uq); + if (O_gdqpp) + *O_gdqpp = gq; + else if (gq) + xfs_qm_dqrele(gq); + return 0; +} + +/* + * Actually transfer ownership, and do dquot modifications. + * These were already reserved. + */ +xfs_dquot_t * +xfs_qm_vop_chown( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dquot_t **IO_olddq, + xfs_dquot_t *newdq) +{ + xfs_dquot_t *prevdq; + uint bfield = XFS_IS_REALTIME_INODE(ip) ? 
+ XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; + + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + + /* old dquot */ + prevdq = *IO_olddq; + ASSERT(prevdq); + ASSERT(prevdq != newdq); + + xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); + xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); + + /* the sparkling new dquot */ + xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); + xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); + + /* + * Take an extra reference, because the inode + * is going to keep this dquot pointer even + * after the trans_commit. + */ + xfs_dqlock(newdq); + XFS_DQHOLD(newdq); + xfs_dqunlock(newdq); + *IO_olddq = newdq; + + return prevdq; +} + +/* + * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). + */ +int +xfs_qm_vop_chown_reserve( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dquot_t *udqp, + xfs_dquot_t *gdqp, + uint flags) +{ + xfs_mount_t *mp = ip->i_mount; + uint delblks, blkflags, prjflags = 0; + xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; + int error; + + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + delblks = ip->i_delayed_blks; + delblksudq = delblksgdq = unresudq = unresgdq = NULL; + blkflags = XFS_IS_REALTIME_INODE(ip) ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; + + if (XFS_IS_UQUOTA_ON(mp) && udqp && + ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { + delblksudq = udqp; + /* + * If there are delayed allocation blocks, then we have to + * unreserve those from the old dquot, and add them to the + * new dquot. + */ + if (delblks) { + ASSERT(ip->i_udquot); + unresudq = ip->i_udquot; + } + } + if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { + if (XFS_IS_PQUOTA_ON(ip->i_mount) && + xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) + prjflags = XFS_QMOPT_ENOSPC; + + if (prjflags || + (XFS_IS_GQUOTA_ON(ip->i_mount) && + ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { + delblksgdq = gdqp; + if (delblks) { + ASSERT(ip->i_gdquot); + unresgdq = ip->i_gdquot; + } + } + } + + if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, + delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, + flags | blkflags | prjflags))) + return (error); + + /* + * Do the delayed blks reservations/unreservations now. Since, these + * are done without the help of a transaction, if a reservation fails + * its previous reservations won't be automatically undone by trans + * code. So, we have to do it manually here. + */ + if (delblks) { + /* + * Do the reservations first. Unreservation can't fail. + */ + ASSERT(delblksudq || delblksgdq); + ASSERT(unresudq || unresgdq); + if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, + flags | blkflags | prjflags))) + return (error); + xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, + blkflags); + } + + return (0); +} + +int +xfs_qm_vop_rename_dqattach( + struct xfs_inode **i_tab) +{ + struct xfs_mount *mp = i_tab[0]->i_mount; + int i; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + for (i = 0; (i < 4 && i_tab[i]); i++) { + struct xfs_inode *ip = i_tab[i]; + int error; + + /* + * Watch out for duplicate entries in the table. 
+ */ + if (i == 0 || ip != i_tab[i-1]) { + if (XFS_NOT_DQATTACHED(mp, ip)) { + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + } + } + } + return 0; +} + +void +xfs_qm_vop_create_dqattach( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + if (udqp) { + xfs_dqlock(udqp); + XFS_DQHOLD(udqp); + xfs_dqunlock(udqp); + ASSERT(ip->i_udquot == NULL); + ip->i_udquot = udqp; + ASSERT(XFS_IS_UQUOTA_ON(mp)); + ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); + } + if (gdqp) { + xfs_dqlock(gdqp); + XFS_DQHOLD(gdqp); + xfs_dqunlock(gdqp); + ASSERT(ip->i_gdquot == NULL); + ip->i_gdquot = gdqp; + ASSERT(XFS_IS_OQUOTA_ON(mp)); + ASSERT((XFS_IS_GQUOTA_ON(mp) ? + ip->i_d.di_gid : xfs_get_projid(ip)) == + be32_to_cpu(gdqp->q_core.d_id)); + xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); + } +} + diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h new file mode 100644 index 00000000..567b29b9 --- /dev/null +++ b/fs/xfs/quota/xfs_qm.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QM_H__ +#define __XFS_QM_H__ + +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" +#include "xfs_quota_priv.h" +#include "xfs_qm_stats.h" + +struct xfs_qm; +struct xfs_inode; + +extern uint ndquot; +extern struct mutex xfs_Gqm_lock; +extern struct xfs_qm *xfs_Gqm; +extern kmem_zone_t *qm_dqzone; +extern kmem_zone_t *qm_dqtrxzone; + +/* + * Used in xfs_qm_sync called by xfs_sync to count the max times that it can + * iterate over the mountpt's dquot list in one call. + */ +#define XFS_QM_SYNC_MAX_RESTARTS 7 + +/* + * Ditto, for xfs_qm_dqreclaim_one. + */ +#define XFS_QM_RECLAIM_MAX_RESTARTS 4 + +/* + * Ideal ratio of free to in use dquots. Quota manager makes an attempt + * to keep this balance. + */ +#define XFS_QM_DQFREE_RATIO 2 + +/* + * Dquot hashtable constants/threshold values. + */ +#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t)) +#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t)) + +/* + * This defines the unit of allocation of dquots. + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. + * XXXsup However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + +typedef xfs_dqhash_t xfs_dqlist_t; + +/* + * Quota Manager (global) structure. Lives only in core. 
+ */ +typedef struct xfs_qm { + xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ + xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ + uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ + struct list_head qm_dqfrlist; /* freelist of dquots */ + struct mutex qm_dqfrlist_lock; + int qm_dqfrlist_cnt; + atomic_t qm_totaldquots; /* total incore dquots */ + uint qm_nrefs; /* file systems with quota on */ + int qm_dqfree_ratio;/* ratio of free to inuse dquots */ + kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */ + kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */ +} xfs_qm_t; + +/* + * Various quota information for individual filesystems. + * The mount structure keeps a pointer to this. + */ +typedef struct xfs_quotainfo { + xfs_inode_t *qi_uquotaip; /* user quota inode */ + xfs_inode_t *qi_gquotaip; /* group quota inode */ + struct list_head qi_dqlist; /* all dquots in filesys */ + struct mutex qi_dqlist_lock; + int qi_dquots; + int qi_dqreclaims; /* a change here indicates + a removal in the dqlist */ + time_t qi_btimelimit; /* limit for blks timer */ + time_t qi_itimelimit; /* limit for inodes timer */ + time_t qi_rtbtimelimit;/* limit for rt blks timer */ + xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ + xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ + xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ + struct mutex qi_quotaofflock;/* to serialize quotaoff */ + xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ + uint qi_dqperchunk; /* # ondisk dqs in above chunk */ + xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ + xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ +} xfs_quotainfo_t; + + +extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); +extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, + xfs_dquot_t *, xfs_dquot_t *, long, long, uint); +extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); +extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); + +/* + * We keep the usr and grp dquots separately so that locking will be easier + * to do at commit time. All transactions that we know of at this point + * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. + */ +#define XFS_QM_TRANS_MAXDQS 2 +typedef struct xfs_dquot_acct { + xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; + xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; +} xfs_dquot_acct_t; + +/* + * Users are allowed to have a usage exceeding their softlimit for + * a period this long. 
+ */ +#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ + +#define XFS_QM_BWARNLIMIT 5 +#define XFS_QM_IWARNLIMIT 5 +#define XFS_QM_RTBWARNLIMIT 5 + +extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); +extern int xfs_qm_quotacheck(xfs_mount_t *); +extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); + +/* dquot stuff */ +extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); +extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); +extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); + +/* quota ops */ +extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); +extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); +extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); +extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); + +#ifdef DEBUG +extern int xfs_qm_internalqcheck(xfs_mount_t *); +#else +#define xfs_qm_internalqcheck(mp) (0) +#endif + +#endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c new file mode 100644 index 00000000..a0a829ad --- /dev/null +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_qm.h" + + +STATIC void +xfs_fill_statvfs_from_dquot( + struct kstatfs *statp, + xfs_disk_dquot_t *dp) +{ + __uint64_t limit; + + limit = dp->d_blk_softlimit ? + be64_to_cpu(dp->d_blk_softlimit) : + be64_to_cpu(dp->d_blk_hardlimit); + if (limit && statp->f_blocks > limit) { + statp->f_blocks = limit; + statp->f_bfree = statp->f_bavail = + (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? + (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; + } + + limit = dp->d_ino_softlimit ? + be64_to_cpu(dp->d_ino_softlimit) : + be64_to_cpu(dp->d_ino_hardlimit); + if (limit && statp->f_files > limit) { + statp->f_files = limit; + statp->f_ffree = + (statp->f_files > be64_to_cpu(dp->d_icount)) ? + (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; + } +} + + +/* + * Directory tree accounting is implemented using project quotas, where + * the project identifier is inherited from parent directories. + * A statvfs (df, etc.) 
of a directory that is using project quota should + * return a statvfs of the project, not the entire filesystem. + * This makes such trees appear as if they are filesystems in themselves. + */ +void +xfs_qm_statvfs( + xfs_inode_t *ip, + struct kstatfs *statp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_dquot_t *dqp; + + if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { + xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); + xfs_qm_dqput(dqp); + } +} + +int +xfs_qm_newmount( + xfs_mount_t *mp, + uint *needquotamount, + uint *quotaflags) +{ + uint quotaondisk; + uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; + + quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && + (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); + + if (quotaondisk) { + uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT; + pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT; + gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT; + } + + /* + * If the device itself is read-only, we can't allow + * the user to change the state of quota on the mount - + * this would generate a transaction on the ro device, + * which would lead to an I/O error and shutdown + */ + + if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || + (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || + (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || + (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) || + (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || + (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && + xfs_dev_is_read_only(mp, "changing quota state")) { + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (pquotaondisk ? " prjquota" : ""), + (gquotaondisk ? " grpquota" : "")); + return XFS_ERROR(EPERM); + } + + if (XFS_IS_QUOTA_ON(mp) || quotaondisk) { + /* + * Call mount_quotas at this point only if we won't have to do + * a quotacheck. + */ + if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { + /* + * If an error occurred, qm_mount_quotas code + * has already disabled quotas. So, just finish + * mounting, and get on with the boring life + * without disk quotas. + */ + xfs_qm_mount_quotas(mp); + } else { + /* + * Clear the quota flags, but remember them. This + * is so that the quota code doesn't get invoked + * before we're ready. This can happen when an + * inode goes inactive and wants to free blocks, + * or via xfs_log_mount_finish. + */ + *needquotamount = B_TRUE; + *quotaflags = mp->m_qflags; + mp->m_qflags = 0; + } + } + + return 0; +} + +void __init +xfs_qm_init(void) +{ + printk(KERN_INFO "SGI XFS Quota Management subsystem\n"); + mutex_init(&xfs_Gqm_lock); + xfs_qm_init_procfs(); +} + +void __exit +xfs_qm_exit(void) +{ + xfs_qm_cleanup_procfs(); + if (qm_dqzone) + kmem_zone_destroy(qm_dqzone); + if (qm_dqtrxzone) + kmem_zone_destroy(qm_dqtrxzone); +} diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c new file mode 100644 index 00000000..8671a0b3 --- /dev/null +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_qm.h" + +struct xqmstats xqmstats; + +static int xqm_proc_show(struct seq_file *m, void *v) +{ + /* maximum; incore; ratio free to inuse; freelist */ + seq_printf(m, "%d\t%d\t%d\t%u\n", + ndquot, + xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, + xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, + xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); + return 0; +} + +static int xqm_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqm_proc_show, NULL); +} + +static const struct file_operations xqm_proc_fops = { + .owner = THIS_MODULE, + .open = xqm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int xqmstat_proc_show(struct seq_file *m, void *v) +{ + /* quota performance statistics */ + seq_printf(m, "qm %u %u %u %u %u %u %u %u\n", + xqmstats.xs_qm_dqreclaims, + xqmstats.xs_qm_dqreclaim_misses, + xqmstats.xs_qm_dquot_dups, + xqmstats.xs_qm_dqcachemisses, + xqmstats.xs_qm_dqcachehits, + xqmstats.xs_qm_dqwants, + xqmstats.xs_qm_dqshake_reclaims, + xqmstats.xs_qm_dqinact_reclaims); + return 0; +} + +static int xqmstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqmstat_proc_show, NULL); +} + +static const struct file_operations xqmstat_proc_fops = { + .owner = THIS_MODULE, + .open = xqmstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void +xfs_qm_init_procfs(void) +{ + proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops); + proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops); +} + +void +xfs_qm_cleanup_procfs(void) +{ + remove_proc_entry("fs/xfs/xqm", NULL); + remove_proc_entry("fs/xfs/xqmstat", NULL); +} diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h new file mode 100644 index 00000000..5b964fc0 --- /dev/null +++ b/fs/xfs/quota/xfs_qm_stats.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2002 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QM_STATS_H__ +#define __XFS_QM_STATS_H__ + +#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) + +/* + * XQM global statistics + */ +struct xqmstats { + __uint32_t xs_qm_dqreclaims; + __uint32_t xs_qm_dqreclaim_misses; + __uint32_t xs_qm_dquot_dups; + __uint32_t xs_qm_dqcachemisses; + __uint32_t xs_qm_dqcachehits; + __uint32_t xs_qm_dqwants; + __uint32_t xs_qm_dqshake_reclaims; + __uint32_t xs_qm_dqinact_reclaims; +}; + +extern struct xqmstats xqmstats; + +# define XQM_STATS_INC(count) ( (count)++ ) + +extern void xfs_qm_init_procfs(void); +extern void xfs_qm_cleanup_procfs(void); + +#else + +# define XQM_STATS_INC(count) do { } while (0) + +static inline void xfs_qm_init_procfs(void) { }; +static inline void xfs_qm_cleanup_procfs(void) { }; + +#endif + +#endif /* __XFS_QM_STATS_H__ */ diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c new file mode 100644 index 00000000..2dadb15d --- /dev/null +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -0,0 +1,1259 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_qm.h" +#include "xfs_trace.h" + +STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); +STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, + uint); +STATIC uint xfs_qm_export_flags(uint); +STATIC uint xfs_qm_export_qtype_flags(uint); +STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, + fs_disk_quota_t *); + + +/* + * Turn off quota accounting and/or enforcement for all udquots and/or + * gdquots. Called only at unmount time. + * + * This assumes that there are no dquots of this file system cached + * incore, and modifies the ondisk dquot directly. Therefore, for example, + * it is an error to call this twice, without purging the cache. + */ +int +xfs_qm_scall_quotaoff( + xfs_mount_t *mp, + uint flags) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + uint dqtype; + int error; + uint inactivate_flags; + xfs_qoff_logitem_t *qoffstart; + int nculprits; + + /* + * No file system can have quotas enabled on disk but not in core. + * Note that quota utilities (like quotaoff) _expect_ + * errno == EEXIST here. 
+ */ + if ((mp->m_qflags & flags) == 0) + return XFS_ERROR(EEXIST); + error = 0; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + + /* + * We don't want to deal with two quotaoffs messing up each other, + * so we're going to serialize it. quotaoff isn't exactly a performance + * critical thing. + * If quotaoff, then we must be dealing with the root filesystem. + */ + ASSERT(q); + mutex_lock(&q->qi_quotaofflock); + + /* + * If we're just turning off quota enforcement, change mp and go. + */ + if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { + mp->m_qflags &= ~(flags); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = mp->m_qflags; + spin_unlock(&mp->m_sb_lock); + mutex_unlock(&q->qi_quotaofflock); + + /* XXX what to do if error ? Revert back to old vals incore ? */ + error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); + return (error); + } + + dqtype = 0; + inactivate_flags = 0; + /* + * If accounting is off, we must turn enforcement off, clear the + * quota 'CHKD' certificate to make it known that we have to + * do a quotacheck the next time this quota is turned on. + */ + if (flags & XFS_UQUOTA_ACCT) { + dqtype |= XFS_QMOPT_UQUOTA; + flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); + inactivate_flags |= XFS_UQUOTA_ACTIVE; + } + if (flags & XFS_GQUOTA_ACCT) { + dqtype |= XFS_QMOPT_GQUOTA; + flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + inactivate_flags |= XFS_GQUOTA_ACTIVE; + } else if (flags & XFS_PQUOTA_ACCT) { + dqtype |= XFS_QMOPT_PQUOTA; + flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + inactivate_flags |= XFS_PQUOTA_ACTIVE; + } + + /* + * Nothing to do? Don't complain. This happens when we're just + * turning off quota enforcement. + */ + if ((mp->m_qflags & flags) == 0) + goto out_unlock; + + /* + * Write the LI_QUOTAOFF log record, and do SB changes atomically, + * and synchronously. If we fail to write, we should abort the + * operation as it cannot be recovered safely if we crash. + */ + error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); + if (error) + goto out_unlock; + + /* + * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct + * to take care of the race between dqget and quotaoff. We don't take + * any special locks to reset these bits. All processes need to check + * these bits *after* taking inode lock(s) to see if the particular + * quota type is in the process of being turned off. If *ACTIVE, it is + * guaranteed that all dquot structures and all quotainode ptrs will all + * stay valid as long as that inode is kept locked. + * + * There is no turning back after this. + */ + mp->m_qflags &= ~inactivate_flags; + + /* + * Give back all the dquot reference(s) held by inodes. + * Here we go thru every single incore inode in this file system, and + * do a dqrele on the i_udquot/i_gdquot that it may have. + * Essentially, as long as somebody has an inode locked, this guarantees + * that quotas will not be turned off. This is handy because in a + * transaction once we lock the inode(s) and check for quotaon, we can + * depend on the quota inodes (and other things) being valid as long as + * we keep the lock(s). + */ + xfs_qm_dqrele_all_inodes(mp, flags); + + /* + * Next we make the changes in the quota flag in the mount struct. + * This isn't protected by a particular lock directly, because we + * don't want to take a mrlock every time we depend on quotas being on. + */ + mp->m_qflags &= ~(flags); + + /* + * Go through all the dquots of this file system and purge them, + * according to what was turned off. 
We may not be able to get rid + * of all dquots, because dquots can have temporary references that + * are not attached to inodes. eg. xfs_setattr, xfs_create. + * So, if we couldn't purge all the dquots from the filesystem, + * we can't get rid of the incore data structures. + */ + while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) + delay(10 * nculprits); + + /* + * Transactions that had started before ACTIVE state bit was cleared + * could have logged many dquots, so they'd have higher LSNs than + * the first QUOTAOFF log record does. If we happen to crash when + * the tail of the log has gone past the QUOTAOFF record, but + * before the last dquot modification, those dquots __will__ + * recover, and that's not good. + * + * So, we have QUOTAOFF start and end logitems; the start + * logitem won't get overwritten until the end logitem appears... + */ + error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + if (error) { + /* We're screwed now. Shutdown is the only option. */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_unlock; + } + + /* + * If quotas is completely disabled, close shop. + */ + if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || + ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { + mutex_unlock(&q->qi_quotaofflock); + xfs_qm_destroy_quotainfo(mp); + return (0); + } + + /* + * Release our quotainode references if we don't need them anymore. + */ + if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { + IRELE(q->qi_uquotaip); + q->qi_uquotaip = NULL; + } + if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { + IRELE(q->qi_gquotaip); + q->qi_gquotaip = NULL; + } + +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +STATIC int +xfs_qm_scall_trunc_qfile( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + struct xfs_trans *tp; + int error; + + if (ino == NULLFSINO) + return 0; + + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + return error; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); + error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + goto out_put; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + goto out_unlock; + } + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_put: + IRELE(ip); + return error; +} + +int +xfs_qm_scall_trunc_qfiles( + xfs_mount_t *mp, + uint flags) +{ + int error = 0, error2 = 0; + + if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { + xfs_debug(mp, "%s: flags=%x m_qflags=%x\n", + __func__, flags, mp->m_qflags); + return XFS_ERROR(EINVAL); + } + + if (flags & XFS_DQ_USER) + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); + if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) + error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + + return error ? error : error2; +} + +/* + * Switch on (a given) quota enforcement for a filesystem. This takes + * effect immediately. + * (Switching on quota accounting must be done at mount time.) 
+ */ +int +xfs_qm_scall_quotaon( + xfs_mount_t *mp, + uint flags) +{ + int error; + uint qf; + __int64_t sbflags; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + /* + * Switching on quota accounting must be done at mount time. + */ + flags &= ~(XFS_ALL_QUOTA_ACCT); + + sbflags = 0; + + if (flags == 0) { + xfs_debug(mp, "%s: zero flags, m_qflags=%x\n", + __func__, mp->m_qflags); + return XFS_ERROR(EINVAL); + } + + /* No fs can turn on quotas with a delayed effect */ + ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0); + + /* + * Can't enforce without accounting. We check the superblock + * qflags here instead of m_qflags because rootfs can have + * quota acct on ondisk without m_qflags' knowing. + */ + if (((flags & XFS_UQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + (flags & XFS_UQUOTA_ENFD)) + || + ((flags & XFS_PQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_OQUOTA_ENFD))) { + xfs_debug(mp, + "%s: Can't enforce without acct, flags=%x sbflags=%x\n", + __func__, flags, mp->m_sb.sb_qflags); + return XFS_ERROR(EINVAL); + } + /* + * If everything's up to-date incore, then don't waste time. + */ + if ((mp->m_qflags & flags) == flags) + return XFS_ERROR(EEXIST); + + /* + * Change sb_qflags on disk but not incore mp->qflags + * if this is the root filesystem. + */ + spin_lock(&mp->m_sb_lock); + qf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = qf | flags; + spin_unlock(&mp->m_sb_lock); + + /* + * There's nothing to change if it's the same. + */ + if ((qf & flags) == flags && sbflags == 0) + return XFS_ERROR(EEXIST); + sbflags |= XFS_SB_QFLAGS; + + if ((error = xfs_qm_write_sb_changes(mp, sbflags))) + return (error); + /* + * If we aren't trying to switch on quota enforcement, we are done. + */ + if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) != + (mp->m_qflags & XFS_UQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != + (mp->m_qflags & XFS_PQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != + (mp->m_qflags & XFS_GQUOTA_ACCT)) || + (flags & XFS_ALL_QUOTA_ENFD) == 0) + return (0); + + if (! XFS_IS_QUOTA_RUNNING(mp)) + return XFS_ERROR(ESRCH); + + /* + * Switch on quota enforcement in core. + */ + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); + + return (0); +} + + +/* + * Return quota status information, such as uquota-off, enforcements, etc. 
+ */ +int +xfs_qm_scall_getqstat( + struct xfs_mount *mp, + struct fs_quota_stat *out) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip, *gip; + boolean_t tempuqip, tempgqip; + + uip = gip = NULL; + tempuqip = tempgqip = B_FALSE; + memset(out, 0, sizeof(fs_quota_stat_t)); + + out->qs_version = FS_QSTAT_VERSION; + if (!xfs_sb_version_hasquota(&mp->m_sb)) { + out->qs_uquota.qfs_ino = NULLFSINO; + out->qs_gquota.qfs_ino = NULLFSINO; + return (0); + } + out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & + (XFS_ALL_QUOTA_ACCT| + XFS_ALL_QUOTA_ENFD)); + out->qs_pad = 0; + out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; + } + if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip) == 0) + tempuqip = B_TRUE; + } + if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip) == 0) + tempgqip = B_TRUE; + } + if (uip) { + out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; + out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; + if (tempuqip) + IRELE(uip); + } + if (gip) { + out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; + if (tempgqip) + IRELE(gip); + } + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; + } + return 0; +} + +#define XFS_DQ_MASK \ + (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) + +/* + * Adjust quota limits, and start/stop timers accordingly. + */ +int +xfs_qm_scall_setqlim( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + fs_disk_quota_t *newlim) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + xfs_disk_dquot_t *ddq; + xfs_dquot_t *dqp; + xfs_trans_t *tp; + int error; + xfs_qcnt_t hard, soft; + + if (newlim->d_fieldmask & ~XFS_DQ_MASK) + return EINVAL; + if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, + 0, 0, XFS_DEFAULT_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return (error); + } + + /* + * We don't want to race with a quotaoff so take the quotaoff lock. + * (We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening). (XXXThis doesn't currently happen + * because we take the vfslock before calling xfs_qm_sysent). + */ + mutex_lock(&q->qi_quotaofflock); + + /* + * Get the dquot (locked), and join it to the transaction. + * Allocate the dquot if this doesn't exist. + */ + if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + ASSERT(error != ENOENT); + goto out_unlock; + } + xfs_trans_dqjoin(tp, dqp); + ddq = &dqp->q_core; + + /* + * Make sure that hardlimits are >= soft limits before changing. + */ + hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : + be64_to_cpu(ddq->d_blk_hardlimit); + soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? 
+ (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : + be64_to_cpu(ddq->d_blk_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_blk_hardlimit = cpu_to_be64(hard); + ddq->d_blk_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_bhardlimit = hard; + q->qi_bsoftlimit = soft; + } + } else { + xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft); + } + hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : + be64_to_cpu(ddq->d_rtb_hardlimit); + soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : + be64_to_cpu(ddq->d_rtb_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_rtb_hardlimit = cpu_to_be64(hard); + ddq->d_rtb_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_rtbhardlimit = hard; + q->qi_rtbsoftlimit = soft; + } + } else { + xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft); + } + + hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? + (xfs_qcnt_t) newlim->d_ino_hardlimit : + be64_to_cpu(ddq->d_ino_hardlimit); + soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? + (xfs_qcnt_t) newlim->d_ino_softlimit : + be64_to_cpu(ddq->d_ino_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_ino_hardlimit = cpu_to_be64(hard); + ddq->d_ino_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_ihardlimit = hard; + q->qi_isoftlimit = soft; + } + } else { + xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft); + } + + /* + * Update warnings counter(s) if requested + */ + if (newlim->d_fieldmask & FS_DQ_BWARNS) + ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns); + if (newlim->d_fieldmask & FS_DQ_IWARNS) + ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns); + if (newlim->d_fieldmask & FS_DQ_RTBWARNS) + ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns); + + if (id == 0) { + /* + * Timelimits for the super user set the relative time + * the other users can be over quota for this file system. + * If it is zero a default is used. Ditto for the default + * soft and hard limit values (already done, above), and + * for warnings. + */ + if (newlim->d_fieldmask & FS_DQ_BTIMER) { + q->qi_btimelimit = newlim->d_btimer; + ddq->d_btimer = cpu_to_be32(newlim->d_btimer); + } + if (newlim->d_fieldmask & FS_DQ_ITIMER) { + q->qi_itimelimit = newlim->d_itimer; + ddq->d_itimer = cpu_to_be32(newlim->d_itimer); + } + if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { + q->qi_rtbtimelimit = newlim->d_rtbtimer; + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); + } + if (newlim->d_fieldmask & FS_DQ_BWARNS) + q->qi_bwarnlimit = newlim->d_bwarns; + if (newlim->d_fieldmask & FS_DQ_IWARNS) + q->qi_iwarnlimit = newlim->d_iwarns; + if (newlim->d_fieldmask & FS_DQ_RTBWARNS) + q->qi_rtbwarnlimit = newlim->d_rtbwarns; + } else { + /* + * If the user is now over quota, start the timelimit. + * The user will not be 'warned'. + * Note that we keep the timers ticking, whether enforcement + * is on or off. We don't really want to bother with iterating + * over all ondisk dquots and turning the timers on/off. + */ + xfs_qm_adjust_dqtimers(mp, ddq); + } + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_trans_log_dquot(tp, dqp); + + error = xfs_trans_commit(tp, 0); + xfs_qm_dqprint(dqp); + xfs_qm_dqrele(dqp); + + out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +int +xfs_qm_scall_getquota( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + fs_disk_quota_t *out) +{ + xfs_dquot_t *dqp; + int error; + + /* + * Try to get the dquot. 
We don't want it allocated on disk, so + * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't + * exist, we'll get ENOENT back. + */ + if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) { + return (error); + } + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + xfs_qm_dqput(dqp); + return XFS_ERROR(ENOENT); + } + /* xfs_qm_dqprint(dqp); */ + /* + * Convert the disk dquot to the exportable format + */ + xfs_qm_export_dquot(mp, &dqp->q_core, out); + xfs_qm_dqput(dqp); + return (error ? XFS_ERROR(EFAULT) : 0); +} + + +STATIC int +xfs_qm_log_quotaoff_end( + xfs_mount_t *mp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); + + if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, + 0, 0, XFS_DEFAULT_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return (error); + } + + qoffi = xfs_trans_get_qoff_item(tp, startqoff, + flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + return (error); +} + + +STATIC int +xfs_qm_log_quotaoff( + xfs_mount_t *mp, + xfs_qoff_logitem_t **qoffstartp, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi=NULL; + uint oldsbqflag=0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); + if ((error = xfs_trans_reserve(tp, 0, + sizeof(xfs_qoff_logitem_t) * 2 + + mp->m_sb.sb_sectsize + 128, + 0, + 0, + XFS_DEFAULT_LOG_COUNT))) { + goto error0; + } + + qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + spin_lock(&mp->m_sb_lock); + oldsbqflag = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + xfs_mod_sb(tp, XFS_SB_QFLAGS); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +error0: + if (error) { + xfs_trans_cancel(tp, 0); + /* + * No one else is modifying sb_qflags, so this is OK. + * We still hold the quotaofflock. + */ + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = oldsbqflag; + spin_unlock(&mp->m_sb_lock); + } + *qoffstartp = qoffi; + return (error); +} + + +/* + * Translate an internal style on-disk-dquot to the exportable format. + * The main differences are that the counters/limits are all in Basic + * Blocks (BBs) instead of the internal FSBs, and all on-disk data has + * to be converted to the native endianness. 
+ */ +STATIC void +xfs_qm_export_dquot( + xfs_mount_t *mp, + xfs_disk_dquot_t *src, + struct fs_disk_quota *dst) +{ + memset(dst, 0, sizeof(*dst)); + dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */ + dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags); + dst->d_id = be32_to_cpu(src->d_id); + dst->d_blk_hardlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit)); + dst->d_blk_softlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit)); + dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit); + dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit); + dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount)); + dst->d_icount = be64_to_cpu(src->d_icount); + dst->d_btimer = be32_to_cpu(src->d_btimer); + dst->d_itimer = be32_to_cpu(src->d_itimer); + dst->d_iwarns = be16_to_cpu(src->d_iwarns); + dst->d_bwarns = be16_to_cpu(src->d_bwarns); + dst->d_rtb_hardlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit)); + dst->d_rtb_softlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit)); + dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount)); + dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer); + dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns); + + /* + * Internally, we don't reset all the timers when quota enforcement + * gets turned off. No need to confuse the user level code, + * so return zeroes in that case. + */ + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) || + (!XFS_IS_OQUOTA_ENFORCED(mp) && + (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { + dst->d_btimer = 0; + dst->d_itimer = 0; + dst->d_rtbtimer = 0; + } + +#ifdef DEBUG + if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || + (XFS_IS_OQUOTA_ENFORCED(mp) && + (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && + dst->d_id != 0) { + if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && + (dst->d_blk_softlimit > 0)) { + ASSERT(dst->d_btimer != 0); + } + if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) && + (dst->d_ino_softlimit > 0)) { + ASSERT(dst->d_itimer != 0); + } + } +#endif +} + +STATIC uint +xfs_qm_export_qtype_flags( + uint flags) +{ + /* + * Can't be more than one, or none. + */ + ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) != + (FS_PROJ_QUOTA | FS_USER_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) != + (FS_PROJ_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) != + (FS_USER_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0); + + return (flags & XFS_DQ_USER) ? + FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? + FS_PROJ_QUOTA : FS_GROUP_QUOTA; +} + +STATIC uint +xfs_qm_export_flags( + uint flags) +{ + uint uflags; + + uflags = 0; + if (flags & XFS_UQUOTA_ACCT) + uflags |= FS_QUOTA_UDQ_ACCT; + if (flags & XFS_PQUOTA_ACCT) + uflags |= FS_QUOTA_PDQ_ACCT; + if (flags & XFS_GQUOTA_ACCT) + uflags |= FS_QUOTA_GDQ_ACCT; + if (flags & XFS_UQUOTA_ENFD) + uflags |= FS_QUOTA_UDQ_ENFD; + if (flags & (XFS_OQUOTA_ENFD)) { + uflags |= (flags & XFS_GQUOTA_ACCT) ? 
+ FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; + } + return (uflags); +} + + +STATIC int +xfs_dqrele_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + /* skip quota inodes */ + if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || + ip == ip->i_mount->m_quotainfo->qi_gquotaip) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_gdquot == NULL); + return 0; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + + +/* + * Go thru all the inodes in the file system, releasing their dquots. + * + * Note that the mount structure gets modified to indicate that quotas are off + * AFTER this, in the case of quotaoff. + */ +void +xfs_qm_dqrele_all_inodes( + struct xfs_mount *mp, + uint flags) +{ + ASSERT(mp->m_quotainfo); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); +} + +/*------------------------------------------------------------------------*/ +#ifdef DEBUG +/* + * This contains all the test functions for XFS disk quotas. + * Currently it does a quota accounting check. ie. it walks through + * all inodes in the file system, calculating the dquot accounting fields, + * and prints out any inconsistencies. + */ +xfs_dqhash_t *qmtest_udqtab; +xfs_dqhash_t *qmtest_gdqtab; +int qmtest_hashmask; +int qmtest_nfails; +struct mutex qcheck_lock; + +#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ + (__psunsigned_t)(id)) & \ + (qmtest_hashmask - 1)) + +#define DQTEST_HASH(mp, id, type) ((type & XFS_DQ_USER) ? \ + (qmtest_udqtab + \ + DQTEST_HASHVAL(mp, id)) : \ + (qmtest_gdqtab + \ + DQTEST_HASHVAL(mp, id))) + +#define DQTEST_LIST_PRINT(l, NXT, title) \ +{ \ + xfs_dqtest_t *dqp; int i = 0;\ + xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \ + for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ + dqp = (xfs_dqtest_t *)dqp->NXT) { \ + xfs_debug(dqp->q_mount, \ + " %d. 
\"%d (%s)\" bcnt = %d, icnt = %d", \ + ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ + dqp->d_bcount, dqp->d_icount); } \ +} + +typedef struct dqtest { + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_hashlist; + xfs_dqhash_t *q_hash; /* the hashchain header */ + xfs_mount_t *q_mount; /* filesystem this relates to */ + xfs_dqid_t d_id; /* user id or group id */ + xfs_qcnt_t d_bcount; /* # disk blocks owned by the user */ + xfs_qcnt_t d_icount; /* # inodes owned by the user */ +} xfs_dqtest_t; + +STATIC void +xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) +{ + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; + h->qh_nelems++; +} +STATIC void +xfs_qm_dqtest_print( + struct xfs_mount *mp, + struct dqtest *d) +{ + xfs_debug(mp, "-----------DQTEST DQUOT----------------"); + xfs_debug(mp, "---- dquot ID = %d", d->d_id); + xfs_debug(mp, "---- fs = 0x%p", d->q_mount); + xfs_debug(mp, "---- bcount = %Lu (0x%x)", + d->d_bcount, (int)d->d_bcount); + xfs_debug(mp, "---- icount = %Lu (0x%x)", + d->d_icount, (int)d->d_icount); + xfs_debug(mp, "---------------------------"); +} + +STATIC void +xfs_qm_dqtest_failed( + xfs_dqtest_t *d, + xfs_dquot_t *dqp, + char *reason, + xfs_qcnt_t a, + xfs_qcnt_t b, + int error) +{ + qmtest_nfails++; + if (error) + xfs_debug(dqp->q_mount, + "quotacheck failed id=%d, err=%d\nreason: %s", + d->d_id, error, reason); + else + xfs_debug(dqp->q_mount, + "quotacheck failed id=%d (%s) [%d != %d]", + d->d_id, reason, (int)a, (int)b); + xfs_qm_dqtest_print(dqp->q_mount, d); + if (dqp) + xfs_qm_dqprint(dqp); +} + +STATIC int +xfs_dqtest_cmp2( + xfs_dqtest_t *d, + xfs_dquot_t *dqp) +{ + int err = 0; + if (be64_to_cpu(dqp->q_core.d_icount) != d->d_icount) { + xfs_qm_dqtest_failed(d, dqp, "icount mismatch", + be64_to_cpu(dqp->q_core.d_icount), + d->d_icount, 0); + err++; + } + if (be64_to_cpu(dqp->q_core.d_bcount) != d->d_bcount) { + xfs_qm_dqtest_failed(d, dqp, "bcount mismatch", + be64_to_cpu(dqp->q_core.d_bcount), + d->d_bcount, 0); + err++; + } + if (dqp->q_core.d_blk_softlimit && + be64_to_cpu(dqp->q_core.d_bcount) >= + be64_to_cpu(dqp->q_core.d_blk_softlimit)) { + if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { + xfs_debug(dqp->q_mount, + "%d [%s] BLK TIMER NOT STARTED", + d->d_id, DQFLAGTO_TYPESTR(d)); + err++; + } + } + if (dqp->q_core.d_ino_softlimit && + be64_to_cpu(dqp->q_core.d_icount) >= + be64_to_cpu(dqp->q_core.d_ino_softlimit)) { + if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { + xfs_debug(dqp->q_mount, + "%d [%s] INO TIMER NOT STARTED", + d->d_id, DQFLAGTO_TYPESTR(d)); + err++; + } + } +#ifdef QUOTADEBUG + if (!err) { + xfs_debug(dqp->q_mount, "%d [%s] qchecked", + d->d_id, DQFLAGTO_TYPESTR(d)); + } +#endif + return (err); +} + +STATIC void +xfs_dqtest_cmp( + xfs_dqtest_t *d) +{ + xfs_dquot_t *dqp; + int error; + + /* xfs_qm_dqtest_print(d); */ + if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0, + &dqp))) { + xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error); + return; + } + xfs_dqtest_cmp2(d, dqp); + xfs_qm_dqput(dqp); +} + +STATIC int +xfs_qm_internalqcheck_dqget( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + xfs_dqtest_t **O_dq) +{ + xfs_dqtest_t *d; + xfs_dqhash_t *h; + + h = DQTEST_HASH(mp, id, type); + list_for_each_entry(d, &h->qh_list, q_hashlist) { + if (d->d_id == id && mp == d->q_mount) { + *O_dq = d; + return (0); + } + } + d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP); + d->dq_flags = type; + d->d_id = id; + d->q_mount = mp; + d->q_hash = h; + INIT_LIST_HEAD(&d->q_hashlist); + 
xfs_qm_hashinsert(h, d); + *O_dq = d; + return (0); +} + +STATIC void +xfs_qm_internalqcheck_get_dquots( + xfs_mount_t *mp, + xfs_dqid_t uid, + xfs_dqid_t projid, + xfs_dqid_t gid, + xfs_dqtest_t **ud, + xfs_dqtest_t **gd) +{ + if (XFS_IS_UQUOTA_ON(mp)) + xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud); + if (XFS_IS_GQUOTA_ON(mp)) + xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd); + else if (XFS_IS_PQUOTA_ON(mp)) + xfs_qm_internalqcheck_dqget(mp, projid, XFS_DQ_PROJ, gd); +} + + +STATIC void +xfs_qm_internalqcheck_dqadjust( + xfs_inode_t *ip, + xfs_dqtest_t *d) +{ + d->d_icount++; + d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks; +} + +STATIC int +xfs_qm_internalqcheck_adjust( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* not used */ + int ubsize, /* not used */ + int *ubused, /* not used */ + int *res) /* bulkstat result code */ +{ + xfs_inode_t *ip; + xfs_dqtest_t *ud, *gd; + uint lock_flags; + boolean_t ipreleased; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { + *res = BULKSTAT_RV_NOTHING; + xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n", + __func__, (unsigned long long) ino, + (unsigned long long) mp->m_sb.sb_uquotino, + (unsigned long long) mp->m_sb.sb_gquotino); + return XFS_ERROR(EINVAL); + } + ipreleased = B_FALSE; + again: + lock_flags = XFS_ILOCK_SHARED; + if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) { + *res = BULKSTAT_RV_NOTHING; + return (error); + } + + /* + * This inode can have blocks after eof which can get released + * when we send it to inactive. Since we don't check the dquot + * until the after all our calculations are done, we must get rid + * of those now. + */ + if (! ipreleased) { + xfs_iunlock(ip, lock_flags); + IRELE(ip); + ipreleased = B_TRUE; + goto again; + } + xfs_qm_internalqcheck_get_dquots(mp, + (xfs_dqid_t) ip->i_d.di_uid, + (xfs_dqid_t) xfs_get_projid(ip), + (xfs_dqid_t) ip->i_d.di_gid, + &ud, &gd); + if (XFS_IS_UQUOTA_ON(mp)) { + ASSERT(ud); + xfs_qm_internalqcheck_dqadjust(ip, ud); + } + if (XFS_IS_OQUOTA_ON(mp)) { + ASSERT(gd); + xfs_qm_internalqcheck_dqadjust(ip, gd); + } + xfs_iunlock(ip, lock_flags); + IRELE(ip); + *res = BULKSTAT_RV_DIDONE; + return (0); +} + + +/* PRIVATE, debugging */ +int +xfs_qm_internalqcheck( + xfs_mount_t *mp) +{ + xfs_ino_t lastino; + int done, count; + int i; + int error; + + lastino = 0; + qmtest_hashmask = 32; + count = 5; + done = 0; + qmtest_nfails = 0; + + if (! XFS_IS_QUOTA_ON(mp)) + return XFS_ERROR(ESRCH); + + xfs_log_force(mp, XFS_LOG_SYNC); + XFS_bflush(mp->m_ddev_targp); + xfs_log_force(mp, XFS_LOG_SYNC); + XFS_bflush(mp->m_ddev_targp); + + mutex_lock(&qcheck_lock); + /* There should be absolutely no quota activity while this + is going on. 
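[Annotation] The debug quotacheck above works in two passes: bulkstat every inode, charging one inode and di_nblocks blocks to the owning user and group/project test dquots, then compare the accumulated totals against the on-disk dquots. A simplified, hypothetical mirror of the charging step:

#include <stdint.h>

/* Simplified stand-in for xfs_qm_internalqcheck_dqadjust():
 * charge one inode and its block count to the accumulator for a quota id. */
struct qcheck_acct {
	uint64_t icount;	/* inodes owned by this id */
	uint64_t bcount;	/* blocks owned by this id */
};

static void qcheck_charge(struct qcheck_acct *acct, uint64_t di_nblocks)
{
	acct->icount++;
	acct->bcount += di_nblocks;
}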
*/ + qmtest_udqtab = kmem_zalloc(qmtest_hashmask * + sizeof(xfs_dqhash_t), KM_SLEEP); + qmtest_gdqtab = kmem_zalloc(qmtest_hashmask * + sizeof(xfs_dqhash_t), KM_SLEEP); + do { + /* + * Iterate thru all the inodes in the file system, + * adjusting the corresponding dquot counters + */ + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_internalqcheck_adjust, + 0, NULL, &done); + if (error) { + xfs_debug(mp, "Bulkstat returned error 0x%x", error); + break; + } + } while (!done); + + xfs_debug(mp, "Checking results against system dquots"); + for (i = 0; i < qmtest_hashmask; i++) { + xfs_dqtest_t *d, *n; + xfs_dqhash_t *h; + + h = &qmtest_udqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { + xfs_dqtest_cmp(d); + kmem_free(d); + } + h = &qmtest_gdqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { + xfs_dqtest_cmp(d); + kmem_free(d); + } + } + + if (qmtest_nfails) { + xfs_debug(mp, "******** quotacheck failed ********"); + xfs_debug(mp, "failures = %d", qmtest_nfails); + } else { + xfs_debug(mp, "******** quotacheck successful! ********"); + } + kmem_free(qmtest_udqtab); + kmem_free(qmtest_gdqtab); + mutex_unlock(&qcheck_lock); + return (qmtest_nfails); +} + +#endif /* DEBUG */ diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h new file mode 100644 index 00000000..94a3d927 --- /dev/null +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_PRIV_H__ +#define __XFS_QUOTA_PRIV_H__ + +/* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +/* + * Hash into a bucket in the dquot hash table, based on . + */ +#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ + (__psunsigned_t)(id)) & \ + (xfs_Gqm->qm_dqhashmask - 1)) +#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \ + (xfs_Gqm->qm_usr_dqhtable + \ + XFS_DQ_HASHVAL(mp, id)) : \ + (xfs_Gqm->qm_grp_dqhtable + \ + XFS_DQ_HASHVAL(mp, id))) +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ + (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ + (((d)->dq_flags & XFS_DQ_PROJ) ? 
"PRJ":"???"))) + +#endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c new file mode 100644 index 00000000..2a364873 --- /dev/null +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -0,0 +1,895 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" + +STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); + +/* + * Add the locked dquot to the transaction. + * The dquot must be locked, and it cannot be associated with any + * transaction. + */ +void +xfs_trans_dqjoin( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + ASSERT(dqp->q_transp != tp); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(dqp->q_logitem.qli_dquot == dqp); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &dqp->q_logitem.qli_item); + + /* + * Initialize i_transp so we can later determine if this dquot is + * associated with this transaction. + */ + dqp->q_transp = tp; +} + + +/* + * This is called to mark the dquot as needing + * to be logged when the transaction is committed. The dquot must + * already be associated with the given transaction. + * Note that it marks the entire transaction as dirty. In the ordinary + * case, this gets called via xfs_trans_commit, after the transaction + * is already dirty. However, there's nothing stop this from getting + * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY + * flag. + */ +void +xfs_trans_log_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + ASSERT(dqp->q_transp == tp); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + tp->t_flags |= XFS_TRANS_DIRTY; + dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} + +/* + * Carry forward whatever is left of the quota blk reservation to + * the spanky new transaction + */ +void +xfs_trans_dup_dqinfo( + xfs_trans_t *otp, + xfs_trans_t *ntp) +{ + xfs_dqtrx_t *oq, *nq; + int i,j; + xfs_dqtrx_t *oqa, *nqa; + + if (!otp->t_dqinfo) + return; + + xfs_trans_alloc_dqinfo(ntp); + oqa = otp->t_dqinfo->dqa_usrdquots; + nqa = ntp->t_dqinfo->dqa_usrdquots; + + /* + * Because the quota blk reservation is carried forward, + * it is also necessary to carry forward the DQ_DIRTY flag. 
+ */ + if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + ntp->t_flags |= XFS_TRANS_DQ_DIRTY; + + for (j = 0; j < 2; j++) { + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (oqa[i].qt_dquot == NULL) + break; + oq = &oqa[i]; + nq = &nqa[i]; + + nq->qt_dquot = oq->qt_dquot; + nq->qt_bcount_delta = nq->qt_icount_delta = 0; + nq->qt_rtbcount_delta = 0; + + /* + * Transfer whatever is left of the reservations. + */ + nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; + oq->qt_blk_res = oq->qt_blk_res_used; + + nq->qt_rtblk_res = oq->qt_rtblk_res - + oq->qt_rtblk_res_used; + oq->qt_rtblk_res = oq->qt_rtblk_res_used; + + nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used; + oq->qt_ino_res = oq->qt_ino_res_used; + + } + oqa = otp->t_dqinfo->dqa_grpdquots; + nqa = ntp->t_dqinfo->dqa_grpdquots; + } +} + +/* + * Wrap around mod_dquot to account for both user and group quotas. + */ +void +xfs_trans_mod_dquot_byino( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint field, + long delta) +{ + xfs_mount_t *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || + !XFS_IS_QUOTA_ON(mp) || + ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino) + return; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + + if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) + (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); + if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot) + (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); +} + +STATIC xfs_dqtrx_t * +xfs_trans_get_dqtrx( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + int i; + xfs_dqtrx_t *qa; + + qa = XFS_QM_ISUDQ(dqp) ? + tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (qa[i].qt_dquot == NULL || + qa[i].qt_dquot == dqp) + return &qa[i]; + } + + return NULL; +} + +/* + * Make the changes in the transaction structure. + * The moral equivalent to xfs_trans_mod_sb(). + * We don't touch any fields in the dquot, so we don't care + * if it's locked or not (most of the time it won't be). + */ +void +xfs_trans_mod_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp, + uint field, + long delta) +{ + xfs_dqtrx_t *qtrx; + + ASSERT(tp); + ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); + qtrx = NULL; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + /* + * Find either the first free slot or the slot that belongs + * to this dquot. + */ + qtrx = xfs_trans_get_dqtrx(tp, dqp); + ASSERT(qtrx); + if (qtrx->qt_dquot == NULL) + qtrx->qt_dquot = dqp; + + switch (field) { + + /* + * regular disk blk reservation + */ + case XFS_TRANS_DQ_RES_BLKS: + qtrx->qt_blk_res += (ulong)delta; + break; + + /* + * inode reservation + */ + case XFS_TRANS_DQ_RES_INOS: + qtrx->qt_ino_res += (ulong)delta; + break; + + /* + * disk blocks used. 
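[Annotation] xfs_trans_get_dqtrx() above relies on the per-transaction dquot array being filled densely: it returns either the slot already bound to this dquot or the first empty slot. A standalone sketch of that find-or-claim pattern, with hypothetical names:

#include <stddef.h>

#define MAXDQS 2	/* stands in for XFS_QM_TRANS_MAXDQS */

struct slot { void *dquot; long bcount_delta; };

/* Return the slot already owned by dqp, or the first free slot, or NULL. */
static struct slot *find_or_claim(struct slot *qa, void *dqp)
{
	int i;

	for (i = 0; i < MAXDQS; i++) {
		if (qa[i].dquot == NULL || qa[i].dquot == dqp)
			return &qa[i];
	}
	return NULL;
}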
+ */ + case XFS_TRANS_DQ_BCOUNT: + if (qtrx->qt_blk_res && delta > 0) { + qtrx->qt_blk_res_used += (ulong)delta; + ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); + } + qtrx->qt_bcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELBCOUNT: + qtrx->qt_delbcnt_delta += delta; + break; + + /* + * Inode Count + */ + case XFS_TRANS_DQ_ICOUNT: + if (qtrx->qt_ino_res && delta > 0) { + qtrx->qt_ino_res_used += (ulong)delta; + ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); + } + qtrx->qt_icount_delta += delta; + break; + + /* + * rtblk reservation + */ + case XFS_TRANS_DQ_RES_RTBLKS: + qtrx->qt_rtblk_res += (ulong)delta; + break; + + /* + * rtblk count + */ + case XFS_TRANS_DQ_RTBCOUNT: + if (qtrx->qt_rtblk_res && delta > 0) { + qtrx->qt_rtblk_res_used += (ulong)delta; + ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); + } + qtrx->qt_rtbcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELRTBCOUNT: + qtrx->qt_delrtb_delta += delta; + break; + + default: + ASSERT(0); + } + tp->t_flags |= XFS_TRANS_DQ_DIRTY; +} + + +/* + * Given an array of dqtrx structures, lock all the dquots associated + * and join them to the transaction, provided they have been modified. + * We know that the highest number of dquots (of one type - usr OR grp), + * involved in a transaction is 2 and that both usr and grp combined - 3. + * So, we don't attempt to make this very generic. + */ +STATIC void +xfs_trans_dqlockedjoin( + xfs_trans_t *tp, + xfs_dqtrx_t *q) +{ + ASSERT(q[0].qt_dquot != NULL); + if (q[1].qt_dquot == NULL) { + xfs_dqlock(q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + } else { + ASSERT(XFS_QM_TRANS_MAXDQS == 2); + xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[1].qt_dquot); + } +} + + +/* + * Called by xfs_trans_commit() and similar in spirit to + * xfs_trans_apply_sb_deltas(). + * Go thru all the dquots belonging to this transaction and modify the + * INCORE dquot to reflect the actual usages. + * Unreserve just the reservations done by this transaction. + * dquot is still left locked at exit. + */ +void +xfs_trans_apply_dquot_deltas( + xfs_trans_t *tp) +{ + int i, j; + xfs_dquot_t *dqp; + xfs_dqtrx_t *qtrx, *qa; + xfs_disk_dquot_t *d; + long totalbdelta; + long totalrtbdelta; + + if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + ASSERT(tp->t_dqinfo); + qa = tp->t_dqinfo->dqa_usrdquots; + for (j = 0; j < 2; j++) { + if (qa[0].qt_dquot == NULL) { + qa = tp->t_dqinfo->dqa_grpdquots; + continue; + } + + /* + * Lock all of the dquots and join them to the transaction. + */ + xfs_trans_dqlockedjoin(tp, qa); + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * The array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(dqp->q_transp == tp); + + /* + * adjust the actual number of blocks used + */ + d = &dqp->q_core; + + /* + * The issue here is - sometimes we don't make a blkquota + * reservation intentionally to be fair to users + * (when the amount is small). On the other hand, + * delayed allocs do make reservations, but that's + * outside of a transaction, so we have no + * idea how much was really reserved. + * So, here we've accumulated delayed allocation blks and + * non-delay blks. The assumption is that the + * delayed ones are always reserved (outside of a + * transaction), and the others may or may not have + * quota reservations. 
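[Annotation] The deltas gathered by xfs_trans_mod_dquot() are only folded into the incore copy of the on-disk dquot at commit time, with the ordinary and delayed-allocation block deltas summed first. A simplified host-endian sketch of that folding (the real code below uses be64_add_cpu() on the big-endian fields):

#include <stdint.h>

struct dq_counts { uint64_t bcount; uint64_t icount; };

/* Fold per-transaction deltas into the dquot counters at commit. */
static void apply_deltas(struct dq_counts *d,
			 long bcount_delta, long delbcnt_delta, long icount_delta)
{
	long totalbdelta = bcount_delta + delbcnt_delta;	/* e.g. 8 + 4 = 12 */

	d->bcount += totalbdelta;	/* negative deltas shrink the count */
	d->icount += icount_delta;
}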
+ */ + totalbdelta = qtrx->qt_bcount_delta + + qtrx->qt_delbcnt_delta; + totalrtbdelta = qtrx->qt_rtbcount_delta + + qtrx->qt_delrtb_delta; +#ifdef QUOTADEBUG + if (totalbdelta < 0) + ASSERT(be64_to_cpu(d->d_bcount) >= + (xfs_qcnt_t) -totalbdelta); + + if (totalrtbdelta < 0) + ASSERT(be64_to_cpu(d->d_rtbcount) >= + (xfs_qcnt_t) -totalrtbdelta); + + if (qtrx->qt_icount_delta < 0) + ASSERT(be64_to_cpu(d->d_icount) >= + (xfs_qcnt_t) -qtrx->qt_icount_delta); +#endif + if (totalbdelta) + be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); + + if (qtrx->qt_icount_delta) + be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); + + if (totalrtbdelta) + be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); + + /* + * Get any default limits in use. + * Start/reset the timer(s) if needed. + */ + if (d->d_id) { + xfs_qm_adjust_dqlimits(tp->t_mountp, d); + xfs_qm_adjust_dqtimers(tp->t_mountp, d); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + /* + * add this to the list of items to get logged + */ + xfs_trans_log_dquot(tp, dqp); + /* + * Take off what's left of the original reservation. + * In case of delayed allocations, there's no + * reservation that a transaction structure knows of. + */ + if (qtrx->qt_blk_res != 0) { + if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { + if (qtrx->qt_blk_res > + qtrx->qt_blk_res_used) + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res - + qtrx->qt_blk_res_used); + else + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res_used - + qtrx->qt_blk_res); + } + } else { + /* + * These blks were never reserved, either inside + * a transaction or outside one (in a delayed + * allocation). Also, this isn't always a + * negative number since we sometimes + * deliberately skip quota reservations. + */ + if (qtrx->qt_bcount_delta) { + dqp->q_res_bcount += + (xfs_qcnt_t)qtrx->qt_bcount_delta; + } + } + /* + * Adjust the RT reservation. + */ + if (qtrx->qt_rtblk_res != 0) { + if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) { + if (qtrx->qt_rtblk_res > + qtrx->qt_rtblk_res_used) + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res - + qtrx->qt_rtblk_res_used); + else + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res_used - + qtrx->qt_rtblk_res); + } + } else { + if (qtrx->qt_rtbcount_delta) + dqp->q_res_rtbcount += + (xfs_qcnt_t)qtrx->qt_rtbcount_delta; + } + + /* + * Adjust the inode reservation. + */ + if (qtrx->qt_ino_res != 0) { + ASSERT(qtrx->qt_ino_res >= + qtrx->qt_ino_res_used); + if (qtrx->qt_ino_res > qtrx->qt_ino_res_used) + dqp->q_res_icount -= (xfs_qcnt_t) + (qtrx->qt_ino_res - + qtrx->qt_ino_res_used); + } else { + if (qtrx->qt_icount_delta) + dqp->q_res_icount += + (xfs_qcnt_t)qtrx->qt_icount_delta; + } + + ASSERT(dqp->q_res_bcount >= + be64_to_cpu(dqp->q_core.d_bcount)); + ASSERT(dqp->q_res_icount >= + be64_to_cpu(dqp->q_core.d_icount)); + ASSERT(dqp->q_res_rtbcount >= + be64_to_cpu(dqp->q_core.d_rtbcount)); + } + /* + * Do the group quotas next + */ + qa = tp->t_dqinfo->dqa_grpdquots; + } +} + +/* + * Release the reservations, and adjust the dquots accordingly. + * This is called only when the transaction is being aborted. If by + * any chance we have done dquot modifications incore (ie. deltas) already, + * we simply throw those away, since that's the expected behavior + * when a transaction is curtailed without a commit. 
+ */ +void +xfs_trans_unreserve_and_mod_dquots( + xfs_trans_t *tp) +{ + int i, j; + xfs_dquot_t *dqp; + xfs_dqtrx_t *qtrx, *qa; + boolean_t locked; + + if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + qa = tp->t_dqinfo->dqa_usrdquots; + + for (j = 0; j < 2; j++) { + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * We assume that the array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + /* + * Unreserve the original reservation. We don't care + * about the number of blocks used field, or deltas. + * Also we don't bother to zero the fields. + */ + locked = B_FALSE; + if (qtrx->qt_blk_res) { + xfs_dqlock(dqp); + locked = B_TRUE; + dqp->q_res_bcount -= + (xfs_qcnt_t)qtrx->qt_blk_res; + } + if (qtrx->qt_ino_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = B_TRUE; + } + dqp->q_res_icount -= + (xfs_qcnt_t)qtrx->qt_ino_res; + } + + if (qtrx->qt_rtblk_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = B_TRUE; + } + dqp->q_res_rtbcount -= + (xfs_qcnt_t)qtrx->qt_rtblk_res; + } + if (locked) + xfs_dqunlock(dqp); + + } + qa = tp->t_dqinfo->dqa_grpdquots; + } +} + +STATIC void +xfs_quota_warn( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int type) +{ + /* no warnings for project quotas - we just return ENOSPC later */ + if (dqp->dq_flags & XFS_DQ_PROJ) + return; + quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA, + be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev, + type); +} + +/* + * This reserves disk blocks and inodes against a dquot. + * Flags indicate if the dquot is to be locked here and also + * if the blk reservation is for RT or regular blocks. + * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check. + */ +STATIC int +xfs_trans_dqresv( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + long nblks, + long ninos, + uint flags) +{ + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + time_t timer; + xfs_qwarncnt_t warns; + xfs_qwarncnt_t warnlimit; + xfs_qcnt_t count; + xfs_qcnt_t *resbcountp; + xfs_quotainfo_t *q = mp->m_quotainfo; + + + xfs_dqlock(dqp); + + if (flags & XFS_TRANS_DQ_RES_BLKS) { + hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + if (!hardlimit) + hardlimit = q->qi_bhardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!softlimit) + softlimit = q->qi_bsoftlimit; + timer = be32_to_cpu(dqp->q_core.d_btimer); + warns = be16_to_cpu(dqp->q_core.d_bwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; + resbcountp = &dqp->q_res_bcount; + } else { + ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); + hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit); + if (!hardlimit) + hardlimit = q->qi_rtbhardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit); + if (!softlimit) + softlimit = q->qi_rtbsoftlimit; + timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; + resbcountp = &dqp->q_res_rtbcount; + } + + if ((flags & XFS_QMOPT_FORCE_RES) == 0 && + dqp->q_core.d_id && + ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || + (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && + (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { +#ifdef QUOTADEBUG + xfs_debug(mp, + "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?", + nblks, *resbcountp, hardlimit); +#endif + if (nblks > 0) { + /* + * dquot is locked already. See if we'd go over the + * hardlimit or exceed the timelimit if we allocate + * nblks. 
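[Annotation] The enforcement rule that the block/inode checks below implement: an allocation that would reach the hard limit is always refused; one that only reaches the soft limit is refused only once the grace timer has expired or the warning budget is exhausted, and otherwise merely triggers a warning. A condensed, hypothetical restatement of that decision (limits of 0 mean "no limit", as in the code):

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

static bool quota_would_deny(uint64_t resv, int64_t nblks,
			     uint64_t hard, uint64_t soft,
			     time_t timer, unsigned warns, unsigned warnlimit,
			     time_t now)
{
	if (nblks <= 0)
		return false;
	if (hard && resv + (uint64_t)nblks >= hard)
		return true;				/* hard limit: always deny */
	if (soft && resv + (uint64_t)nblks >= soft)
		return (timer && now > timer) ||	/* grace period expired    */
		       (warns && warns >= warnlimit);	/* out of warnings         */
	return false;					/* under soft limit        */
}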
+ */ + if (hardlimit > 0ULL && + hardlimit <= nblks + *resbcountp) { + xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); + goto error_return; + } + if (softlimit > 0ULL && + softlimit <= nblks + *resbcountp) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_BSOFTLONGWARN); + goto error_return; + } + + xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN); + } + } + if (ninos > 0) { + count = be64_to_cpu(dqp->q_core.d_icount); + timer = be32_to_cpu(dqp->q_core.d_itimer); + warns = be16_to_cpu(dqp->q_core.d_iwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; + hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + if (!hardlimit) + hardlimit = q->qi_ihardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + if (!softlimit) + softlimit = q->qi_isoftlimit; + + if (hardlimit > 0ULL && count >= hardlimit) { + xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); + goto error_return; + } + if (softlimit > 0ULL && count >= softlimit) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_ISOFTLONGWARN); + goto error_return; + } + xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN); + } + } + } + + /* + * Change the reservation, but not the actual usage. + * Note that q_res_bcount = q_core.d_bcount + resv + */ + (*resbcountp) += (xfs_qcnt_t)nblks; + if (ninos != 0) + dqp->q_res_icount += (xfs_qcnt_t)ninos; + + /* + * note the reservation amt in the trans struct too, + * so that the transaction knows how much was reserved by + * it against this particular dquot. + * We don't do this when we are reserving for a delayed allocation, + * because we don't have the luxury of a transaction envelope then. + */ + if (tp) { + ASSERT(tp->t_dqinfo); + ASSERT(flags & XFS_QMOPT_RESBLK_MASK); + if (nblks != 0) + xfs_trans_mod_dquot(tp, dqp, + flags & XFS_QMOPT_RESBLK_MASK, + nblks); + if (ninos != 0) + xfs_trans_mod_dquot(tp, dqp, + XFS_TRANS_DQ_RES_INOS, + ninos); + } + ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount)); + ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); + ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); + + xfs_dqunlock(dqp); + return 0; + +error_return: + xfs_dqunlock(dqp); + if (flags & XFS_QMOPT_ENOSPC) + return ENOSPC; + return EDQUOT; +} + + +/* + * Given dquot(s), make disk block and/or inode reservations against them. + * The fact that this does the reservation against both the usr and + * grp/prj quotas is important, because this follows a both-or-nothing + * approach. + * + * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. + * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks + * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks + * dquots are unlocked on return, if they were not locked by caller. 
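[Annotation] The function below reserves against the user dquot first and only then against the group/project dquot; if the second reservation fails, the first is undone by reserving a negative amount with XFS_QMOPT_FORCE_RES so the back-out itself cannot fail. A generic sketch of that both-or-nothing pattern, with hypothetical names:

static int reserve_pair(int (*resv)(void *dq, long nblks, unsigned flags),
			void *udq, void *gdq, long nblks, unsigned flags,
			unsigned force_flag)
{
	int error;

	if (udq) {
		error = resv(udq, nblks, flags);
		if (error)
			return error;
	}
	if (gdq) {
		error = resv(gdq, nblks, flags);
		if (error) {
			if (udq)	/* back out the first reservation */
				resv(udq, -nblks, flags | force_flag);
			return error;
		}
	}
	return 0;	/* both reserved, or nothing was */
}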
+ */ +int +xfs_trans_reserve_quota_bydquots( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dquot_t *udqp, + xfs_dquot_t *gdqp, + long nblks, + long ninos, + uint flags) +{ + int resvd = 0, error; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + if (tp && tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + + ASSERT(flags & XFS_QMOPT_RESBLK_MASK); + + if (udqp) { + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, + (flags & ~XFS_QMOPT_ENOSPC)); + if (error) + return error; + resvd = 1; + } + + if (gdqp) { + error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); + if (error) { + /* + * can't do it, so backout previous reservation + */ + if (resvd) { + flags |= XFS_QMOPT_FORCE_RES; + xfs_trans_dqresv(tp, mp, udqp, + -nblks, -ninos, flags); + } + return error; + } + } + + /* + * Didn't change anything critical, so, no need to log + */ + return 0; +} + + +/* + * Lock the dquot and change the reservation if we can. + * This doesn't change the actual usage, just the reservation. + * The inode sent in is locked. + */ +int +xfs_trans_reserve_quota_nblks( + struct xfs_trans *tp, + struct xfs_inode *ip, + long nblks, + long ninos, + uint flags) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + if (XFS_IS_PQUOTA_ON(mp)) + flags |= XFS_QMOPT_ENOSPC; + + ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); + ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_RTBLKS || + (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_BLKS); + + /* + * Reserve nblks against these dquots, with trans as the mediator. + */ + return xfs_trans_reserve_quota_bydquots(tp, mp, + ip->i_udquot, ip->i_gdquot, + nblks, ninos, flags); +} + +/* + * This routine is called to allocate a quotaoff log item. + */ +xfs_qoff_logitem_t * +xfs_trans_get_qoff_item( + xfs_trans_t *tp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_qoff_logitem_t *q; + + ASSERT(tp != NULL); + + q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags); + ASSERT(q != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &q->qql_item); + return q; +} + + +/* + * This is called to mark the quotaoff logitem as needing + * to be logged when the transaction is committed. The logitem must + * already be associated with the given transaction. + */ +void +xfs_trans_log_quotaoff_item( + xfs_trans_t *tp, + xfs_qoff_logitem_t *qlp) +{ + tp->t_flags |= XFS_TRANS_DIRTY; + qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} + +STATIC void +xfs_trans_alloc_dqinfo( + xfs_trans_t *tp) +{ + tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP); +} + +void +xfs_trans_free_dqinfo( + xfs_trans_t *tp) +{ + if (!tp->t_dqinfo) + return; + kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo); + tp->t_dqinfo = NULL; +} diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c new file mode 100644 index 00000000..b83f76b6 --- /dev/null +++ b/fs/xfs/support/uuid.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include + +/* IRIX interpretation of an uuid_t */ +typedef struct { + __be32 uu_timelow; + __be16 uu_timemid; + __be16 uu_timehi; + __be16 uu_clockseq; + __be16 uu_node[3]; +} xfs_uu_t; + +/* + * uuid_getnodeuniq - obtain the node unique fields of a UUID. + * + * This is not in any way a standard or condoned UUID function; + * it just something that's needed for user-level file handles. + */ +void +uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) +{ + xfs_uu_t *uup = (xfs_uu_t *)uuid; + + fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) | + be16_to_cpu(uup->uu_timemid); + fsid[1] = be32_to_cpu(uup->uu_timelow); +} + +int +uuid_is_nil(uuid_t *uuid) +{ + int i; + char *cp = (char *)uuid; + + if (uuid == NULL) + return 0; + /* implied check of version number here... */ + for (i = 0; i < sizeof *uuid; i++) + if (*cp++) return 0; /* not nil */ + return 1; /* is nil */ +} + +int +uuid_equal(uuid_t *uuid1, uuid_t *uuid2) +{ + return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; +} diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h new file mode 100644 index 00000000..4732d712 --- /dev/null +++ b/fs/xfs/support/uuid.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_UUID_H__ +#define __XFS_SUPPORT_UUID_H__ + +typedef struct { + unsigned char __u_bits[16]; +} uuid_t; + +extern int uuid_is_nil(uuid_t *uuid); +extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); +extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); + +#endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h new file mode 100644 index 00000000..5ad8ad3a --- /dev/null +++ b/fs/xfs/xfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
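[Annotation] The three helpers in support/uuid.c above are small enough to use directly: uuid_is_nil() and uuid_equal() return 1 for "nil" and "equal" respectively, and uuid_getnodeuniq() derives a two-word fsid from the time fields of the UUID. A usage sketch with made-up byte values:

static void uuid_demo(void)
{
	uuid_t nil = { { 0 } };
	uuid_t fsuuid = { { 0x12, 0x34, 0x56, 0x78 } };	/* remaining bytes zero */
	int fsid[2];

	if (!uuid_is_nil(&fsuuid) && !uuid_equal(&nil, &fsuuid))
		uuid_getnodeuniq(&fsuuid, fsid);	/* fsid[] built from time fields */
}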
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_H__ +#define __XFS_H__ + +#ifdef CONFIG_XFS_DEBUG +#define STATIC +#define DEBUG 1 +#define XFS_BUF_LOCK_TRACKING 1 +/* #define QUOTADEBUG 1 */ +#endif + +#include +#endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h new file mode 100644 index 00000000..11dd7207 --- /dev/null +++ b/fs/xfs/xfs_acl.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ACL_H__ +#define __XFS_ACL_H__ + +struct inode; +struct posix_acl; +struct xfs_inode; + +#define XFS_ACL_MAX_ENTRIES 25 +#define XFS_ACL_NOT_PRESENT (-1) + +/* On-disk XFS access control list structure */ +struct xfs_acl { + __be32 acl_cnt; + struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + } acl_entry[XFS_ACL_MAX_ENTRIES]; +}; + +/* On-disk XFS extended attribute names */ +#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" +#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) +#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) + +#ifdef CONFIG_XFS_POSIX_ACL +extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags); +extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); +extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); +extern int xfs_acl_chmod(struct inode *inode); +extern int posix_acl_access_exists(struct inode *inode); +extern int posix_acl_default_exists(struct inode *inode); + +extern const struct xattr_handler xfs_xattr_acl_access_handler; +extern const struct xattr_handler xfs_xattr_acl_default_handler; +#else +# define xfs_check_acl NULL +# define xfs_get_acl(inode, type) NULL +# define xfs_inherit_acl(inode, default_acl) 0 +# define xfs_acl_chmod(inode) 0 +# define posix_acl_access_exists(inode) 0 +# define posix_acl_default_exists(inode) 0 +#endif /* CONFIG_XFS_POSIX_ACL */ +#endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h new file mode 100644 index 00000000..6530769a --- /dev/null +++ b/fs/xfs/xfs_ag.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_AG_H__ +#define __XFS_AG_H__ + +/* + * Allocation group header + * This is divided into three structures, placed in sequential 512-byte + * buffers after a copy of the superblock (also in a 512-byte buffer). + */ + +struct xfs_buf; +struct xfs_mount; +struct xfs_trans; + +#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ +#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGF_VERSION 1 +#define XFS_AGI_VERSION 1 + +#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) +#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) + +/* + * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * arrays below. + */ +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) + +/* + * The second word of agf_levels in the first a.g. overlaps the EFS + * superblock's magic number. Since the magic numbers valid for EFS + * are > 64k, our value cannot be confused for an EFS superblock's. + */ + +typedef struct xfs_agf { + /* + * Common allocation group header information + */ + __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */ + __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */ + __be32 agf_seqno; /* sequence # starting from 0 */ + __be32 agf_length; /* size in blocks of a.g. */ + /* + * Freespace information + */ + __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ + __be32 agf_spare0; /* spare field */ + __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __be32 agf_spare1; /* spare field */ + __be32 agf_flfirst; /* first freelist block's index */ + __be32 agf_fllast; /* last freelist block's index */ + __be32 agf_flcount; /* count of blocks in freelist */ + __be32 agf_freeblks; /* total free blocks */ + __be32 agf_longest; /* longest free space */ + __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ +} xfs_agf_t; + +#define XFS_AGF_MAGICNUM 0x00000001 +#define XFS_AGF_VERSIONNUM 0x00000002 +#define XFS_AGF_SEQNO 0x00000004 +#define XFS_AGF_LENGTH 0x00000008 +#define XFS_AGF_ROOTS 0x00000010 +#define XFS_AGF_LEVELS 0x00000020 +#define XFS_AGF_FLFIRST 0x00000040 +#define XFS_AGF_FLLAST 0x00000080 +#define XFS_AGF_FLCOUNT 0x00000100 +#define XFS_AGF_FREEBLKS 0x00000200 +#define XFS_AGF_LONGEST 0x00000400 +#define XFS_AGF_BTREEBLKS 0x00000800 +#define XFS_AGF_NUM_BITS 12 +#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) + +#define XFS_AGF_FLAGS \ + { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ + { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ + { XFS_AGF_SEQNO, "SEQNO" }, \ + { XFS_AGF_LENGTH, "LENGTH" }, \ + { XFS_AGF_ROOTS, "ROOTS" }, \ + { XFS_AGF_LEVELS, "LEVELS" }, \ + { XFS_AGF_FLFIRST, "FLFIRST" }, \ + { XFS_AGF_FLLAST, "FLLAST" }, \ + { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ + { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ + { XFS_AGF_LONGEST, "LONGEST" }, \ + { XFS_AGF_BTREEBLKS, "BTREEBLKS" } + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) +#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) + +extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); + +/* + * Size of the unlinked inode hash table in the agi. 
+ */ +#define XFS_AGI_UNLINKED_BUCKETS 64 + +typedef struct xfs_agi { + /* + * Common allocation group header information + */ + __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */ + __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */ + __be32 agi_seqno; /* sequence # starting from 0 */ + __be32 agi_length; /* size in blocks of a.g. */ + /* + * Inode information + * Inodes are mapped by interpreting the inode number, so no + * mapping data is needed here. + */ + __be32 agi_count; /* count of allocated inodes */ + __be32 agi_root; /* root of inode btree */ + __be32 agi_level; /* levels in inode btree */ + __be32 agi_freecount; /* number of free inodes */ + __be32 agi_newino; /* new inode just allocated */ + __be32 agi_dirino; /* last directory inode chunk */ + /* + * Hash table of inodes which have been unlinked but are + * still being referenced. + */ + __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; +} xfs_agi_t; + +#define XFS_AGI_MAGICNUM 0x00000001 +#define XFS_AGI_VERSIONNUM 0x00000002 +#define XFS_AGI_SEQNO 0x00000004 +#define XFS_AGI_LENGTH 0x00000008 +#define XFS_AGI_COUNT 0x00000010 +#define XFS_AGI_ROOT 0x00000020 +#define XFS_AGI_LEVEL 0x00000040 +#define XFS_AGI_FREECOUNT 0x00000080 +#define XFS_AGI_NEWINO 0x00000100 +#define XFS_AGI_DIRINO 0x00000200 +#define XFS_AGI_UNLINKED 0x00000400 +#define XFS_AGI_NUM_BITS 11 +#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1) + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) +#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) + +extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); + +/* + * The third a.g. block contains the a.g. freelist, an array + * of block pointers to blocks owned by the allocation btree code. + */ +#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) +#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) +#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) + +typedef struct xfs_agfl { + __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ +} xfs_agfl_t; + +/* + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_alloc_busy_insert() for details. + */ +struct xfs_busy_extent { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + unsigned int flags; +#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ +#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ +}; + +/* + * Per-ag incore structure, copies of information in agf and agi, + * to improve the performance of allocation group selection. 
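[Annotation] The first four sectors of every allocation group hold, in order, a superblock copy, the AGF, the AGI and the AGFL; the *_DADDR macros above simply scale the index 0..3 by the sector size expressed in 512-byte basic blocks. With 512-byte sectors (m_sectbb_log == 0) that is daddrs 0, 1, 2 and 3; with 4096-byte sectors (m_sectbb_log == 3) it is 0, 8, 16 and 24. A sketch of the same arithmetic with the log value supplied directly:

/* idx: 0 = SB copy, 1 = AGF, 2 = AGI, 3 = AGFL (relative to the AG start) */
static inline unsigned long ag_hdr_daddr(unsigned int idx, unsigned int sectbb_log)
{
	return (unsigned long)idx << sectbb_log;
}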
+ */ +#define XFS_PAGB_NUM_SLOTS 128 + +typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + __uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + __uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; +#ifdef __KERNEL__ + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + struct mutex pag_ici_reclaim_lock; /* serialisation point */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ + struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; +#endif + int pagb_count; /* pagb slots in use */ +} xfs_perag_t; + +/* + * tags for inode radix tree + */ +#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup + in xfs_inode_ag_iterator */ +#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ + +#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ + (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) +#define XFS_MIN_FREELIST(a,mp) \ + (XFS_MIN_FREELIST_RAW( \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) +#define XFS_MIN_FREELIST_PAG(pag,mp) \ + (XFS_MIN_FREELIST_RAW( \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) + +#define XFS_AGB_TO_FSB(mp,agno,agbno) \ + (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) +#define XFS_FSB_TO_AGNO(mp,fsbno) \ + ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) +#define XFS_FSB_TO_AGBNO(mp,fsbno) \ + ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) +#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ + ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ + (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) +#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) + +/* + * For checking for bad ranges of xfs_daddr_t's, covering multiple + * allocation groups or a single xfs_daddr_t that's a superblock copy. + */ +#define XFS_AG_CHECK_DADDR(mp,d,len) \ + ((len) == 1 ? 
\ + ASSERT((d) == XFS_SB_DADDR || \ + xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ + ASSERT(xfs_daddr_to_agno(mp, d) == \ + xfs_daddr_to_agno(mp, (d) + (len) - 1))) + +#endif /* __XFS_AG_H__ */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c new file mode 100644 index 00000000..95862bbf --- /dev/null +++ b/fs/xfs/xfs_alloc.c @@ -0,0 +1,3047 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" + + +#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) + +#define XFSA_FIXUP_BNO_OK 1 +#define XFSA_FIXUP_CNT_OK 2 + +STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, + xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, + xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); + +/* + * Lookup the record equal to [bno, len] in the btree given by cur. + */ +STATIC int /* error */ +xfs_alloc_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur. + */ +STATIC int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len]. 
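[Annotation] The three lookup wrappers above differ only in the mode passed to xfs_btree_lookup(): XFS_LOOKUP_EQ finds exactly [bno, len], XFS_LOOKUP_GE the first record at or after it, and XFS_LOOKUP_LE the last record at or before it, with *stat reporting whether a record was found. A usage fragment, assuming a by-bno cursor 'cur' has already been built (e.g. via xfs_allocbt_init_cursor(), not shown):

	int		stat = 0;
	xfs_agblock_t	fbno;
	xfs_extlen_t	flen;
	int		error;

	/* Find the last free extent starting at or before 'target'. */
	error = xfs_alloc_lookup_le(cur, target, 0, &stat);
	if (!error && stat)
		error = xfs_alloc_get_rec(cur, &fbno, &flen, &stat);	/* read it back */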
+ * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_alloc_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len) /* length of extent */ +{ + union xfs_btree_rec rec; + + rec.alloc.ar_startblock = cpu_to_be32(bno); + rec.alloc.ar_blockcount = cpu_to_be32(len); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + *bno = be32_to_cpu(rec->alloc.ar_startblock); + *len = be32_to_cpu(rec->alloc.ar_blockcount); + } + return error; +} + +/* + * Compute aligned version of the found extent. + * Takes alignment and min length into account. + */ +STATIC void +xfs_alloc_compute_aligned( + xfs_alloc_arg_t *args, /* allocation argument structure */ + xfs_agblock_t foundbno, /* starting block in found extent */ + xfs_extlen_t foundlen, /* length in found extent */ + xfs_agblock_t *resbno, /* result block number */ + xfs_extlen_t *reslen) /* result length */ +{ + xfs_agblock_t bno; + xfs_extlen_t len; + + /* Trim busy sections out of found extent */ + xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); + + if (args->alignment > 1 && len >= args->minlen) { + xfs_agblock_t aligned_bno = roundup(bno, args->alignment); + xfs_extlen_t diff = aligned_bno - bno; + + *resbno = aligned_bno; + *reslen = diff >= len ? 0 : len - diff; + } else { + *resbno = bno; + *reslen = len; + } +} + +/* + * Compute best start block and diff for "near" allocations. + * freelen >= wantlen already checked by caller. 
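[Annotation] xfs_alloc_compute_aligned() above first trims busy ranges out of the candidate extent and then, if an alignment was requested, rounds the start up and shortens the usable length by the blocks skipped; if the round-up consumes the whole extent the result length is 0 and the caller discards it. Illustrative numbers:

/* Found extent at block 1001, length 16, requested alignment 8. */
unsigned int bno = 1001, len = 16, align = 8;
unsigned int aligned_bno = ((bno + align - 1) / align) * align;	/* 1008 */
unsigned int diff = aligned_bno - bno;				/* 7    */
unsigned int reslen = diff >= len ? 0 : len - diff;		/* 9    */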
+ */ +STATIC xfs_extlen_t /* difference value (absolute) */ +xfs_alloc_compute_diff( + xfs_agblock_t wantbno, /* target starting block */ + xfs_extlen_t wantlen, /* target length */ + xfs_extlen_t alignment, /* target alignment */ + xfs_agblock_t freebno, /* freespace's starting block */ + xfs_extlen_t freelen, /* freespace's length */ + xfs_agblock_t *newbnop) /* result: best start block from free */ +{ + xfs_agblock_t freeend; /* end of freespace extent */ + xfs_agblock_t newbno1; /* return block number */ + xfs_agblock_t newbno2; /* other new block number */ + xfs_extlen_t newlen1=0; /* length with newbno1 */ + xfs_extlen_t newlen2=0; /* length with newbno2 */ + xfs_agblock_t wantend; /* end of target extent */ + + ASSERT(freelen >= wantlen); + freeend = freebno + freelen; + wantend = wantbno + wantlen; + if (freebno >= wantbno) { + if ((newbno1 = roundup(freebno, alignment)) >= freeend) + newbno1 = NULLAGBLOCK; + } else if (freeend >= wantend && alignment > 1) { + newbno1 = roundup(wantbno, alignment); + newbno2 = newbno1 - alignment; + if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + else + newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1); + if (newbno2 < freebno) + newbno2 = NULLAGBLOCK; + else + newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); + if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { + if (newlen1 < newlen2 || + (newlen1 == newlen2 && + XFS_ABSDIFF(newbno1, wantbno) > + XFS_ABSDIFF(newbno2, wantbno))) + newbno1 = newbno2; + } else if (newbno2 != NULLAGBLOCK) + newbno1 = newbno2; + } else if (freeend >= wantend) { + newbno1 = wantbno; + } else if (alignment > 1) { + newbno1 = roundup(freeend - wantlen, alignment); + if (newbno1 > freeend - wantlen && + newbno1 - alignment >= freebno) + newbno1 -= alignment; + else if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + } else + newbno1 = freeend - wantlen; + *newbnop = newbno1; + return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); +} + +/* + * Fix up the length, based on mod and prod. + * len should be k * prod + mod for some k. + * If len is too small it is returned unchanged. + * If len hits maxlen it is left alone. + */ +STATIC void +xfs_alloc_fix_len( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_extlen_t k; + xfs_extlen_t rlen; + + ASSERT(args->mod < args->prod); + rlen = args->len; + ASSERT(rlen >= args->minlen); + ASSERT(rlen <= args->maxlen); + if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || + (args->mod == 0 && rlen < args->prod)) + return; + k = rlen % args->prod; + if (k == args->mod) + return; + if (k > args->mod) { + if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen) + return; + } else { + if ((int)(rlen = rlen - args->prod - (args->mod - k)) < + (int)args->minlen) + return; + } + ASSERT(rlen >= args->minlen); + ASSERT(rlen <= args->maxlen); + args->len = rlen; +} + +/* + * Fix up length if there is too little space left in the a.g. + * Return 1 if ok, 0 if too little, should give up. + */ +STATIC int +xfs_alloc_fix_minleft( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agf_t *agf; /* a.g. 
freelist header */ + int diff; /* free space difference */ + + if (args->minleft == 0) + return 1; + agf = XFS_BUF_TO_AGF(args->agbp); + diff = be32_to_cpu(agf->agf_freeblks) + - args->len - args->minleft; + if (diff >= 0) + return 1; + args->len += diff; /* shrink the allocated space */ + if (args->len >= args->minlen) + return 1; + args->agbno = NULLAGBLOCK; + return 0; +} + +/* + * Update the two btrees, logically removing from freespace the extent + * starting at rbno, rlen blocks. The extent is contained within the + * actual (current) free extent fbno for flen blocks. + * Flags are passed in indicating whether the cursors are set to the + * relevant records. + */ +STATIC int /* error code */ +xfs_alloc_fixup_trees( + xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */ + xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */ + xfs_agblock_t fbno, /* starting block of free extent */ + xfs_extlen_t flen, /* length of free extent */ + xfs_agblock_t rbno, /* starting block of returned extent */ + xfs_extlen_t rlen, /* length of returned extent */ + int flags) /* flags, XFSA_FIXUP_... */ +{ + int error; /* error code */ + int i; /* operation results */ + xfs_agblock_t nfbno1; /* first new free startblock */ + xfs_agblock_t nfbno2; /* second new free startblock */ + xfs_extlen_t nflen1=0; /* first new free length */ + xfs_extlen_t nflen2=0; /* second new free length */ + + /* + * Look up the record in the by-size tree if necessary. + */ + if (flags & XFSA_FIXUP_CNT_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN( + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + /* + * Look up the record in the by-block tree if necessary. + */ + if (flags & XFSA_FIXUP_BNO_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN( + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + +#ifdef DEBUG + if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { + struct xfs_btree_block *bnoblock; + struct xfs_btree_block *cntblock; + + bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); + cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); + + XFS_WANT_CORRUPTED_RETURN( + bnoblock->bb_numrecs == cntblock->bb_numrecs); + } +#endif + + /* + * Deal with all four cases: the allocated record is contained + * within the freespace record, so we can have new freespace + * at either (or both) end, or no freespace remaining. + */ + if (rbno == fbno && rlen == flen) + nfbno1 = nfbno2 = NULLAGBLOCK; + else if (rbno == fbno) { + nfbno1 = rbno + rlen; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else if (rbno + rlen == fbno + flen) { + nfbno1 = fbno; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else { + nfbno1 = fbno; + nflen1 = rbno - fbno; + nfbno2 = rbno + rlen; + nflen2 = (fbno + flen) - nfbno2; + } + /* + * Delete the entry from the by-size btree. + */ + if ((error = xfs_btree_delete(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + /* + * Add new by-size btree entry(s). 
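+ * The lookups below are expected to find nothing (i == 0), since the
+ * remainder records cannot exist yet; xfs_btree_insert() then adds them.
+ * For illustration (hypothetical values): carving rbno = 103, rlen = 4
+ * out of the free record fbno = 100, flen = 10 leaves nfbno1 = 100,
+ * nflen1 = 3 and nfbno2 = 107, nflen2 = 3, so both inserts are taken.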
+ */ + if (nfbno1 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + if (nfbno2 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + /* + * Fix up the by-block btree entry(s). + */ + if (nfbno1 == NULLAGBLOCK) { + /* + * No remaining freespace, just delete the by-block tree entry. + */ + if ((error = xfs_btree_delete(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } else { + /* + * Update the by-block entry to start later|be shorter. + */ + if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1))) + return error; + } + if (nfbno2 != NULLAGBLOCK) { + /* + * 2 resulting free entries, need to add one. + */ + if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_btree_insert(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + return 0; +} + +/* + * Read in the allocation group free block array. + */ +STATIC int /* error */ +xfs_alloc_read_agfl( + xfs_mount_t *mp, /* mount point structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_buf_t **bpp) /* buffer for the ag free block array */ +{ + xfs_buf_t *bp; /* return value */ + int error; + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) + return error; + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + *bpp = bp; + return 0; +} + +STATIC int +xfs_alloc_update_counters( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agbp, + long len) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + + pag->pagf_freeblks += len; + be32_add_cpu(&agf->agf_freeblks, len); + + xfs_trans_agblocks_delta(tp, len); + if (unlikely(be32_to_cpu(agf->agf_freeblks) > + be32_to_cpu(agf->agf_length))) + return EFSCORRUPTED; + + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); + return 0; +} + +/* + * Allocation group level functions. + */ + +/* + * Allocate a variable extent in the allocation group agno. + * Type and bno are used to determine where in the allocation group the + * extent will start. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent( + xfs_alloc_arg_t *args) /* argument structure for allocation */ +{ + int error=0; + + ASSERT(args->minlen > 0); + ASSERT(args->maxlen > 0); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->mod < args->prod); + ASSERT(args->alignment > 0); + /* + * Branch to correct routine based on the type. 
+ */ + args->wasfromfl = 0; + switch (args->type) { + case XFS_ALLOCTYPE_THIS_AG: + error = xfs_alloc_ag_vextent_size(args); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_alloc_ag_vextent_near(args); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_alloc_ag_vextent_exact(args); + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + + if (error || args->agbno == NULLAGBLOCK) + return error; + + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(!args->wasfromfl || !args->isfl); + ASSERT(args->agbno % args->alignment == 0); + + if (!args->wasfromfl) { + error = xfs_alloc_update_counters(args->tp, args->pag, + args->agbp, + -((long)(args->len))); + if (error) + return error; + + ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)); + } + + if (!args->isfl) { + xfs_trans_mod_sb(args->tp, args->wasdel ? + XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS, + -((long)(args->len))); + } + + XFS_STATS_INC(xs_allocx); + XFS_STATS_ADD(xs_allocb, args->len); + return error; +} + +/* + * Allocate a variable extent at exactly agno/bno. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_exact( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ + xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ + int error; + xfs_agblock_t fbno; /* start block of found extent */ + xfs_extlen_t flen; /* length of found extent */ + xfs_agblock_t tbno; /* start block of trimmed extent */ + xfs_extlen_t tlen; /* length of trimmed extent */ + xfs_agblock_t tend; /* end block of trimmed extent */ + xfs_agblock_t end; /* end of allocated extent */ + int i; /* success/failure of operation */ + xfs_extlen_t rlen; /* length of returned extent */ + + ASSERT(args->alignment == 1); + + /* + * Allocate/initialize a cursor for the by-number freespace btree. + */ + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + + /* + * Lookup bno and minlen in the btree (minlen is irrelevant, really). + * Look for the closest free block <= bno, it must contain bno + * if any free block does. + */ + error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); + if (error) + goto error0; + if (!i) + goto not_found; + + /* + * Grab the freespace record. + */ + error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + ASSERT(fbno <= args->agbno); + + /* + * Check for overlapping busy extents. + */ + xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); + + /* + * Give up if the start of the extent is busy, or the freespace isn't + * long enough for the minimum request. + */ + if (tbno > args->agbno) + goto not_found; + if (tlen < args->minlen) + goto not_found; + tend = tbno + tlen; + if (tend < args->agbno + args->minlen) + goto not_found; + + /* + * End of extent will be smaller of the freespace end and the + * maximal requested end. + * + * Fix the length according to mod and prod if given. 
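+ *
+ * For illustration (hypothetical numbers): with agbno = 50, maxlen = 16
+ * and a trimmed free extent [48, 60), tend = 60 is smaller than
+ * 50 + 16 = 66, so end = 60 and args->len starts out as 10 before
+ * xfs_alloc_fix_len() adjusts it for any requested prod/mod pattern.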
+ */ + end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen); + args->len = end - args->agbno; + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) + goto not_found; + + rlen = args->len; + ASSERT(args->agbno + rlen <= tend); + end = args->agbno + rlen; + + /* + * We are allocating agbno for rlen [agbno .. end] + * Allocate/initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + ASSERT(args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, + args->len, XFSA_FIXUP_BNO_OK); + if (error) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + goto error0; + } + + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + + args->wasfromfl = 0; + trace_xfs_alloc_exact_done(args); + return 0; + +not_found: + /* Didn't find it, return null. */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_exact_notfound(args); + return 0; + +error0: + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + trace_xfs_alloc_exact_error(args); + return error; +} + +/* + * Search the btree in a given direction via the search cursor and compare + * the records found against the good extent we've already found. + */ +STATIC int +xfs_alloc_find_best_extent( + struct xfs_alloc_arg *args, /* allocation argument structure */ + struct xfs_btree_cur **gcur, /* good cursor */ + struct xfs_btree_cur **scur, /* searching cursor */ + xfs_agblock_t gdiff, /* difference for search comparison */ + xfs_agblock_t *sbno, /* extent found by search */ + xfs_extlen_t *slen, /* extent length */ + xfs_agblock_t *sbnoa, /* aligned extent found by search */ + xfs_extlen_t *slena, /* aligned extent length */ + int dir) /* 0 = search right, 1 = search left */ +{ + xfs_agblock_t new; + xfs_agblock_t sdiff; + int error; + int i; + + /* The good extent is perfect, no need to search. */ + if (!gdiff) + goto out_use_good; + + /* + * Look until we find a better one, run out of space or run off the end. + */ + do { + error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); + + /* + * The good extent is closer than this one. + */ + if (!dir) { + if (*sbnoa >= args->agbno + gdiff) + goto out_use_good; + } else { + if (*sbnoa <= args->agbno - gdiff) + goto out_use_good; + } + + /* + * Same distance, compare length and pick the best. + */ + if (*slena >= args->minlen) { + args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); + xfs_alloc_fix_len(args); + + sdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, *sbnoa, + *slena, &new); + + /* + * Choose closer size and invalidate other cursor. + */ + if (sdiff < gdiff) + goto out_use_search; + goto out_use_good; + } + + if (!dir) + error = xfs_btree_increment(*scur, 0, &i); + else + error = xfs_btree_decrement(*scur, 0, &i); + if (error) + goto error0; + } while (i); + +out_use_good: + xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); + *scur = NULL; + return 0; + +out_use_search: + xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); + *gcur = NULL; + return 0; + +error0: + /* caller invalidates cursors */ + return error; +} + +/* + * Allocate a variable extent near bno in the allocation group agno. 
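+ * Two strategies are tried, as described in the comments inside the
+ * function: if the by-size lookup lands in the last leaf block, that
+ * block is scanned for the best-fitting record; otherwise the by-bno
+ * tree is walked left and right from bno and the closer usable extent
+ * wins.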
+ * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_near( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ + xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ + xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ + xfs_agblock_t gtbno; /* start bno of right side entry */ + xfs_agblock_t gtbnoa; /* aligned ... */ + xfs_extlen_t gtdiff; /* difference to right side entry */ + xfs_extlen_t gtlen; /* length of right side entry */ + xfs_extlen_t gtlena; /* aligned ... */ + xfs_agblock_t gtnew; /* useful start bno of right side */ + int error; /* error code */ + int i; /* result code, temporary */ + int j; /* result code, temporary */ + xfs_agblock_t ltbno; /* start bno of left side entry */ + xfs_agblock_t ltbnoa; /* aligned ... */ + xfs_extlen_t ltdiff; /* difference to left side entry */ + xfs_extlen_t ltlen; /* length of left side entry */ + xfs_extlen_t ltlena; /* aligned ... */ + xfs_agblock_t ltnew; /* useful start bno of left side */ + xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; +#if defined(DEBUG) && defined(__KERNEL__) + /* + * Randomly don't execute the first algorithm. + */ + int dofirst; /* set to do first algorithm */ + + dofirst = random32() & 1; +#endif + +restart: + bno_cur_lt = NULL; + bno_cur_gt = NULL; + ltlen = 0; + gtlena = 0; + ltlena = 0; + + /* + * Get a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + + /* + * See if there are any free extents as big as maxlen. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i))) + goto error0; + /* + * If none, then pick up the last entry in the tree unless the + * tree is empty. + */ + if (!i) { + if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, <bno, + <len, &i))) + goto error0; + if (i == 0 || ltlen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_noentry(args); + return 0; + } + ASSERT(i == 1); + } + args->wasfromfl = 0; + + /* + * First algorithm. + * If the requested extent is large wrt the freespaces available + * in this a.g., then the cursor will be pointing to a btree entry + * near the right edge of the tree. If it's in the last btree leaf + * block, then we just examine all the entries in that block + * that are big enough, and pick the best one. + * This is written as a while loop so we can break out of it, + * but we never loop back to the top. + */ + while (xfs_btree_islastblock(cnt_cur, 0)) { + xfs_extlen_t bdiff; + int besti=0; + xfs_extlen_t blen=0; + xfs_agblock_t bnew=0; + +#if defined(DEBUG) && defined(__KERNEL__) + if (!dofirst) + break; +#endif + /* + * Start from the entry that lookup found, sequence through + * all larger free blocks. If we're actually pointing at a + * record smaller than maxlen, go to the start of this block, + * and skip all those smaller than minlen. 
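+ * (Rewinding bc_ptrs[0] to the first slot of this last leaf lets the
+ * loop below weigh every candidate that is at least minlen blocks long
+ * and keep the one with the smallest diff, instead of only looking at
+ * the record the lookup happened to land on.)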
+ */ + if (ltlen || args->alignment > 1) { + cnt_cur->bc_ptrs[0] = 1; + do { + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, + <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (ltlen >= args->minlen) + break; + if ((error = xfs_btree_increment(cnt_cur, 0, &i))) + goto error0; + } while (i); + ASSERT(ltlen >= args->minlen); + if (!i) + break; + } + i = cnt_cur->bc_ptrs[0]; + for (j = 1, blen = 0, bdiff = 0; + !error && j && (blen < args->maxlen || bdiff > 0); + error = xfs_btree_increment(cnt_cur, 0, &j)) { + /* + * For each entry, decide if it's better than + * the previous best entry. + */ + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena < args->minlen) + continue; + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ASSERT(args->len >= args->minlen); + if (args->len < blen) + continue; + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, ltbnoa, ltlena, <new); + if (ltnew != NULLAGBLOCK && + (args->len > blen || ltdiff < bdiff)) { + bdiff = ltdiff; + bnew = ltnew; + blen = args->len; + besti = cnt_cur->bc_ptrs[0]; + } + } + /* + * It didn't work. We COULD be in a case where + * there's a good record somewhere, so try again. + */ + if (blen == 0) + break; + /* + * Point at the best entry, and retrieve it again. + */ + cnt_cur->bc_ptrs[0] = besti; + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + args->len = blen; + if (!xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_nominleft(args); + return 0; + } + blen = args->len; + /* + * We are allocating starting at bnew for blen blocks. + */ + args->agbno = bnew; + ASSERT(bnew >= ltbno); + ASSERT(bnew + blen <= ltbno + ltlen); + /* + * Set up a cursor for the by-bno tree. + */ + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); + /* + * Fix up the btree entries. + */ + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, + ltlen, bnew, blen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + + trace_xfs_alloc_near_first(args); + return 0; + } + /* + * Second algorithm. + * Search in the by-bno tree to the left and to the right + * simultaneously, until in each case we find a space big enough, + * or run into the edge of the tree. When we run into the edge, + * we deallocate that cursor. + * If both searches succeed, we compare the two spaces and pick + * the better one. + * With alignment, it's possible for both to fail; the upper + * level algorithm that picks allocation groups for allocations + * is not supposed to do this. + */ + /* + * Allocate and initialize the cursor for the leftward search. + */ + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + /* + * Lookup <= bno to find the leftward search's starting point. + */ + if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i))) + goto error0; + if (!i) { + /* + * Didn't find anything; use this cursor for the rightward + * search. + */ + bno_cur_gt = bno_cur_lt; + bno_cur_lt = NULL; + } + /* + * Found something. 
Duplicate the cursor for the rightward search. + */ + else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt))) + goto error0; + /* + * Increment the cursor, so we will point at the entry just right + * of the leftward entry if any, or to the leftmost entry. + */ + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + /* + * It failed, there are no rightward entries. + */ + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + /* + * Loop going left with the leftward cursor, right with the + * rightward cursor, until either both directions give up or + * we find an entry at least as big as minlen. + */ + do { + if (bno_cur_lt) { + if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena >= args->minlen) + break; + if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + } + if (bno_cur_gt) { + if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(args, gtbno, gtlen, + >bnoa, >lena); + if (gtlena >= args->minlen) + break; + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + } + } while (bno_cur_lt || bno_cur_gt); + + /* + * Got both cursors still active, need to find better entry. + */ + if (bno_cur_lt && bno_cur_gt) { + if (ltlena >= args->minlen) { + /* + * Left side is good, look for a right side entry. + */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, ltbnoa, ltlena, <new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_lt, &bno_cur_gt, + ltdiff, >bno, >len, + >bnoa, >lena, + 0 /* search right */); + } else { + ASSERT(gtlena >= args->minlen); + + /* + * Right side is good, look for a left side entry. + */ + args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); + xfs_alloc_fix_len(args); + gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, gtbnoa, gtlena, >new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_gt, &bno_cur_lt, + gtdiff, <bno, <len, + <bnoa, <lena, + 1 /* search left */); + } + + if (error) + goto error0; + } + + /* + * If we couldn't get anything, give up. + */ + if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + if (!forced++) { + trace_xfs_alloc_near_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + + trace_xfs_alloc_size_neither(args); + args->agbno = NULLAGBLOCK; + return 0; + } + + /* + * At this point we have selected a freespace entry, either to the + * left or to the right. If it's on the right, copy all the + * useful variables to the "left" set so we only have one + * copy of this code. + */ + if (bno_cur_gt) { + bno_cur_lt = bno_cur_gt; + bno_cur_gt = NULL; + ltbno = gtbno; + ltbnoa = gtbnoa; + ltlen = gtlen; + ltlena = gtlena; + j = 1; + } else + j = 0; + + /* + * Fix up the length and compute the useful address. 
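+ * Only the lt* variables are used from here on; if the right-hand
+ * extent won, its values were copied into them just above, and j only
+ * records which side was chosen for the trace point below.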
+ */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) { + trace_xfs_alloc_near_nominleft(args); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + return 0; + } + rlen = args->len; + (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, + ltbnoa, ltlena, <new); + ASSERT(ltnew >= ltbno); + ASSERT(ltnew + rlen <= ltbnoa + ltlena); + ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + args->agbno = ltnew; + + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, + ltnew, rlen, XFSA_FIXUP_BNO_OK))) + goto error0; + + if (j) + trace_xfs_alloc_near_greater(args); + else + trace_xfs_alloc_near_lesser(args); + + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + return 0; + + error0: + trace_xfs_alloc_near_error(args); + if (cnt_cur != NULL) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur_lt != NULL) + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR); + if (bno_cur_gt != NULL) + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR); + return error; +} + +/* + * Allocate a variable extent anywhere in the allocation group agno. + * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_size( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ + int error; /* error result */ + xfs_agblock_t fbno; /* start of found freespace */ + xfs_extlen_t flen; /* length of found freespace */ + int i; /* temp status variable */ + xfs_agblock_t rbno; /* returned block number */ + xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; + +restart: + /* + * Allocate and initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + bno_cur = NULL; + + /* + * Look for an entry >= maxlen+alignment-1 blocks. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, + args->maxlen + args->alignment - 1, &i))) + goto error0; + + /* + * If none or we have busy extents that we cannot allocate from, then + * we have to settle for a smaller extent. In the case that there are + * no large extents, this will return the last entry in the tree unless + * the tree is empty. In the case that there are only busy large + * extents, this will return the largest small extent unless there + * are no smaller extents available. + */ + if (!i || forced > 1) { + error = xfs_alloc_ag_vextent_small(args, cnt_cur, + &fbno, &flen, &i); + if (error) + goto error0; + if (i == 0 || flen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_noentry(args); + return 0; + } + ASSERT(i == 1); + xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + } else { + /* + * Search for a non-busy extent that is large enough. + * If we are at low space, don't check, or if we fall of + * the end of the btree, turn off the busy check and + * restart. 
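+ * (forced counts the restarts taken from this function; the log is
+ * forced only on the first one, and once forced is greater than 1 the
+ * check above stops insisting on a non-busy maxlen extent and falls
+ * back to xfs_alloc_ag_vextent_small().)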
+ */ + for (;;) { + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + + if (rlen >= args->maxlen) + break; + + error = xfs_btree_increment(cnt_cur, 0, &i); + if (error) + goto error0; + if (i == 0) { + /* + * Our only valid extents must have been busy. + * Make it unbusy by forcing the log out and + * retrying. If we've been here before, forcing + * the log isn't making the extents available, + * which means they have probably been freed in + * this transaction. In that case, we have to + * give up on them and we'll attempt a minlen + * allocation the next time around. + */ + xfs_btree_del_cursor(cnt_cur, + XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + if (!forced++) + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + } + } + + /* + * In the first case above, we got the last entry in the + * by-size btree. Now we check to see if the space hits maxlen + * once aligned; if not, we search left for something better. + * This can't happen in the second case above. + */ + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), error0); + if (rlen < args->maxlen) { + xfs_agblock_t bestfbno; + xfs_extlen_t bestflen; + xfs_agblock_t bestrbno; + xfs_extlen_t bestrlen; + + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + for (;;) { + if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) + goto error0; + if (i == 0) + break; + if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (flen < bestrlen) + break; + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), + error0); + if (rlen > bestrlen) { + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + if (rlen == args->maxlen) + break; + } + } + if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + rlen = bestrlen; + rbno = bestrbno; + flen = bestflen; + fbno = bestfbno; + } + args->wasfromfl = 0; + /* + * Fix up the length. + */ + args->len = rlen; + if (rlen < args->minlen) { + if (!forced++) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + goto out_nominleft; + } + xfs_alloc_fix_len(args); + + if (!xfs_alloc_fix_minleft(args)) + goto out_nominleft; + rlen = args->len; + XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); + /* + * Allocate and initialize a cursor for the by-block tree. 
+ */ + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, + rbno, rlen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + cnt_cur = bno_cur = NULL; + args->len = rlen; + args->agbno = rbno; + XFS_WANT_CORRUPTED_GOTO( + args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), + error0); + trace_xfs_alloc_size_done(args); + return 0; + +error0: + trace_xfs_alloc_size_error(args); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + return error; + +out_nominleft: + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_nominleft(args); + args->agbno = NULLAGBLOCK; + return 0; +} + +/* + * Deal with the case where only small freespaces remain. + * Either return the contents of the last freespace record, + * or allocate space from the freelist if there is nothing in the tree. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_small( + xfs_alloc_arg_t *args, /* allocation argument structure */ + xfs_btree_cur_t *ccur, /* by-size cursor */ + xfs_agblock_t *fbnop, /* result block number */ + xfs_extlen_t *flenp, /* result length */ + int *stat) /* status: 0-freelist, 1-normal/none */ +{ + int error; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int i; + + if ((error = xfs_btree_decrement(ccur, 0, &i))) + goto error0; + if (i) { + if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + /* + * Nothing in the btree, try the freelist. Make sure + * to respect minleft even when pulling from the + * freelist. + */ + else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) + > args->minleft)) { + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + if (error) + goto error0; + if (fbno != NULLAGBLOCK) { + xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, + args->userdata); + + if (args->userdata) { + xfs_buf_t *bp; + + bp = xfs_btree_get_bufs(args->mp, args->tp, + args->agno, fbno, 0); + xfs_trans_binval(args->tp, bp); + } + args->len = 1; + args->agbno = fbno; + XFS_WANT_CORRUPTED_GOTO( + args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), + error0); + args->wasfromfl = 1; + trace_xfs_alloc_small_freelist(args); + *stat = 0; + return 0; + } + /* + * Nothing in the freelist. + */ + else + flen = 0; + } + /* + * Can't allocate from the freelist for some reason. + */ + else { + fbno = NULLAGBLOCK; + flen = 0; + } + /* + * Can't do the allocation, give up. + */ + if (flen < args->minlen) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_small_notenough(args); + flen = 0; + } + *fbnop = fbno; + *flenp = flen; + *stat = 1; + trace_xfs_alloc_small_done(args); + return 0; + +error0: + trace_xfs_alloc_small_error(args); + return error; +} + +/* + * Free the extent starting at agno/bno for length. + */ +STATIC int /* error */ +xfs_free_ag_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. 
freelist header */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t bno, /* starting block number */ + xfs_extlen_t len, /* length of extent */ + int isfl) /* set if is freelist blocks - no sb acctg */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ + int error; /* error return value */ + xfs_agblock_t gtbno; /* start of right neighbor block */ + xfs_extlen_t gtlen; /* length of right neighbor block */ + int haveleft; /* have a left neighbor block */ + int haveright; /* have a right neighbor block */ + int i; /* temp, result code */ + xfs_agblock_t ltbno; /* start of left neighbor block */ + xfs_extlen_t ltlen; /* length of left neighbor block */ + xfs_mount_t *mp; /* mount point struct for filesystem */ + xfs_agblock_t nbno; /* new starting block of freespace */ + xfs_extlen_t nlen; /* new length of freespace */ + xfs_perag_t *pag; /* per allocation group data */ + + mp = tp->t_mountp; + /* + * Allocate and initialize a cursor for the by-block btree. + */ + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); + cnt_cur = NULL; + /* + * Look for a neighboring block on the left (lower block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) + goto error0; + if (haveleft) { + /* + * There is a block to our left. + */ + if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * It's not contiguous, though. + */ + if (ltbno + ltlen < bno) + haveleft = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); + } + } + /* + * Look for a neighboring block on the right (higher block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) + goto error0; + if (haveright) { + /* + * There is a block to our right. + */ + if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * It's not contiguous, though. + */ + if (bno + len < gtbno) + haveright = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); + } + } + /* + * Now allocate and initialize a cursor for the by-size tree. + */ + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); + /* + * Have both left and right contiguous neighbors. + * Merge all three into a single free block. + */ + if (haveleft && haveright) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Delete the old by-block entry for the right block. 
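+ * For illustration (hypothetical blocks): freeing [120, 130) when the
+ * left neighbor is [100, 120) and the right neighbor is [130, 145)
+ * collapses all three into the single record nbno = 100, nlen = 45
+ * set up below.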
+ */ + if ((error = xfs_btree_delete(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Move the by-block cursor back to the left neighbor. + */ + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); +#ifdef DEBUG + /* + * Check that this is the right record: delete didn't + * mangle the cursor. + */ + { + xfs_agblock_t xxbno; + xfs_extlen_t xxlen; + + if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO( + i == 1 && xxbno == ltbno && xxlen == ltlen, + error0); + } +#endif + /* + * Update remaining by-block entry to the new, joined block. + */ + nbno = ltbno; + nlen = len + ltlen + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a left contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveleft) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Back up the by-block cursor to the left neighbor, and + * update its length. + */ + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + nbno = ltbno; + nlen = len + ltlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a right contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveright) { + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Update the starting block and length of the right + * neighbor in the by-block tree. + */ + nbno = bno; + nlen = len + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * No contiguous neighbors. + * Insert the new freespace into the by-block tree. + */ + else { + nbno = bno; + nlen = len; + if ((error = xfs_btree_insert(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + bno_cur = NULL; + /* + * In all cases we need to insert the new freespace in the by-size tree. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 0, error0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + cnt_cur = NULL; + + /* + * Update the freespace totals in the ag and superblock. + */ + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_update_counters(tp, pag, agbp, len); + xfs_perag_put(pag); + if (error) + goto error0; + + if (!isfl) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); + XFS_STATS_INC(xs_freex); + XFS_STATS_ADD(xs_freeb, len); + + trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); + + return 0; + + error0: + trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Visible (exported) allocation/free functions. 
+ * Some of these are used just by xfs_alloc_btree.c and this file. + */ + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_ag_maxlevels = level; +} + +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + xfs_extlen_t need, delta = 0; + + need = XFS_MIN_FREELIST_PAG(pag, mp); + if (need > pag->pagf_flcount) + delta = need - pag->pagf_flcount; + + if (pag->pagf_longest > delta) + return pag->pagf_longest - delta; + return pag->pagf_flcount > 0 || pag->pagf_longest > 0; +} + +/* + * Decide whether to use this allocation group for this allocation. + * If so, fix up the btree freelist's size. + */ +STATIC int /* error */ +xfs_alloc_fix_freelist( + xfs_alloc_arg_t *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ +{ + xfs_buf_t *agbp; /* agf buffer pointer */ + xfs_agf_t *agf; /* a.g. freespace structure pointer */ + xfs_buf_t *agflbp;/* agfl buffer pointer */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t delta; /* new blocks needed in freelist */ + int error; /* error result code */ + xfs_extlen_t longest;/* longest extent in allocation group */ + xfs_mount_t *mp; /* file system mount point structure */ + xfs_extlen_t need; /* total blocks needed in freelist */ + xfs_perag_t *pag; /* per-ag information structure */ + xfs_alloc_arg_t targs; /* local allocation arguments */ + xfs_trans_t *tp; /* transaction pointer */ + + mp = args->mp; + + pag = args->pag; + tp = args->tp; + if (!pag->pagf_init) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (!pag->pagf_init) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + } else + agbp = NULL; + + /* + * If this is a metadata preferred pag and we are user data + * then try somewhere else if we are not being asked to + * try harder at this point + */ + if (pag->pagf_metadata && args->userdata && + (flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + /* + * If it looks like there isn't a long enough extent, or enough + * total blocks, reject it. + */ + need = XFS_MIN_FREELIST_PAG(pag, mp); + longest = xfs_alloc_longest_free_extent(mp, pag); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(pag->pagf_freeblks + pag->pagf_flcount - + need - args->total) < (int)args->minleft)) { + if (agbp) + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + } + + /* + * Get the a.g. freespace buffer. + * Can fail if we're not blocking on locks, and it's held. 
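+ * (With XFS_ALLOC_FLAG_TRYLOCK the AGF read below uses a trylock, so a
+ * NULL agbp just means the AGF is busy and this AG is skipped for now,
+ * not that an error occurred.)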
+ */ + if (agbp == NULL) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (agbp == NULL) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + } + /* + * Figure out how many blocks we should have in the freelist. + */ + agf = XFS_BUF_TO_AGF(agbp); + need = XFS_MIN_FREELIST(agf, mp); + /* + * If there isn't enough total or single-extent, reject it. + */ + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + delta = need > be32_to_cpu(agf->agf_flcount) ? + (need - be32_to_cpu(agf->agf_flcount)) : 0; + longest = be32_to_cpu(agf->agf_longest); + longest = (longest > delta) ? (longest - delta) : + (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(be32_to_cpu(agf->agf_freeblks) + + be32_to_cpu(agf->agf_flcount) - need - args->total) < + (int)args->minleft)) { + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + } + /* + * Make the freelist shorter if it's too long. + */ + while (be32_to_cpu(agf->agf_flcount) > need) { + xfs_buf_t *bp; + + error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + if (error) + return error; + if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) + return error; + bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + xfs_trans_binval(tp, bp); + } + /* + * Initialize the args structure. + */ + targs.tp = tp; + targs.mp = mp; + targs.agbp = agbp; + targs.agno = args->agno; + targs.mod = targs.minleft = targs.wasdel = targs.userdata = + targs.minalignslop = 0; + targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.type = XFS_ALLOCTYPE_THIS_AG; + targs.pag = pag; + if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) + return error; + /* + * Make the freelist longer if it's too short. + */ + while (be32_to_cpu(agf->agf_flcount) < need) { + targs.agbno = 0; + targs.maxlen = need - be32_to_cpu(agf->agf_flcount); + /* + * Allocate as many blocks as possible at once. + */ + if ((error = xfs_alloc_ag_vextent(&targs))) { + xfs_trans_brelse(tp, agflbp); + return error; + } + /* + * Stop if we run out. Won't happen if callers are obeying + * the restrictions correctly. Can happen for free calls + * on a completely full ag. + */ + if (targs.agbno == NULLAGBLOCK) { + if (flags & XFS_ALLOC_FLAG_FREEING) + break; + xfs_trans_brelse(tp, agflbp); + args->agbp = NULL; + return 0; + } + /* + * Put each allocated block on the list. + */ + for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { + error = xfs_alloc_put_freelist(tp, agbp, + agflbp, bno, 0); + if (error) + return error; + } + } + xfs_trans_brelse(tp, agflbp); + args->agbp = agbp; + return 0; +} + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. + */ +int /* error */ +xfs_alloc_get_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk) /* destination is a AGF btree */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_agfl_t *agfl; /* a.g. freelist structure */ + xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ + xfs_agblock_t bno; /* block number returned */ + int error; + int logflags; + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + /* + * Freelist is empty, give up. 
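+ * (The freelist is a circular array indexed by agf_flfirst/agf_fllast;
+ * taking a block below advances agf_flfirst and wraps it back to 0 once
+ * it reaches XFS_AGFL_SIZE(mp).)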
+ */ + if (!agf->agf_flcount) { + *bnop = NULLAGBLOCK; + return 0; + } + /* + * Read the array of free blocks. + */ + mp = tp->t_mountp; + if ((error = xfs_alloc_read_agfl(mp, tp, + be32_to_cpu(agf->agf_seqno), &agflbp))) + return error; + agfl = XFS_BUF_TO_AGFL(agflbp); + /* + * Get the block number and update the data structures. + */ + bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + be32_add_cpu(&agf->agf_flfirst, 1); + xfs_trans_brelse(tp, agflbp); + if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) + agf->agf_flfirst = 0; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, -1); + xfs_trans_agflist_delta(tp, -1); + pag->pagf_flcount--; + xfs_perag_put(pag); + + logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, 1); + pag->pagf_btreeblks++; + logflags |= XFS_AGF_BTREEBLKS; + } + + xfs_alloc_log_agf(tp, agbp, logflags); + *bnop = bno; + + return 0; +} + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* buffer for a.g. freelist header */ + int fields) /* mask of fields to be logged (XFS_AGF_...) */ +{ + int first; /* first byte offset */ + int last; /* last byte offset */ + static const short offsets[] = { + offsetof(xfs_agf_t, agf_magicnum), + offsetof(xfs_agf_t, agf_versionnum), + offsetof(xfs_agf_t, agf_seqno), + offsetof(xfs_agf_t, agf_length), + offsetof(xfs_agf_t, agf_roots[0]), + offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_flfirst), + offsetof(xfs_agf_t, agf_fllast), + offsetof(xfs_agf_t, agf_flcount), + offsetof(xfs_agf_t, agf_freeblks), + offsetof(xfs_agf_t, agf_longest), + offsetof(xfs_agf_t, agf_btreeblks), + sizeof(xfs_agf_t) + }; + + trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); + xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); +} + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags) /* XFS_ALLOC_FLAGS_... */ +{ + xfs_buf_t *bp; + int error; + + if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. freelist header */ + xfs_buf_t *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk) /* block came from a AGF btree */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_agfl_t *agfl; /* a.g. 
free block array */ + __be32 *blockp;/* pointer to array entry */ + int error; + int logflags; + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + mp = tp->t_mountp; + + if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, + be32_to_cpu(agf->agf_seqno), &agflbp))) + return error; + agfl = XFS_BUF_TO_AGFL(agflbp); + be32_add_cpu(&agf->agf_fllast, 1); + if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) + agf->agf_fllast = 0; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, 1); + xfs_trans_agflist_delta(tp, 1); + pag->pagf_flcount++; + + logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, -1); + pag->pagf_btreeblks--; + logflags |= XFS_AGF_BTREEBLKS; + } + xfs_perag_put(pag); + + xfs_alloc_log_agf(tp, agbp, logflags); + + ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); + blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; + *blockp = cpu_to_be32(bno); + xfs_alloc_log_agf(tp, agbp, logflags); + xfs_trans_log_buf(tp, agflbp, + (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), + (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + + sizeof(xfs_agblock_t) - 1)); + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_BUF_ */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + struct xfs_agf *agf; /* ag freelist header */ + int agf_ok; /* set if agf is consistent */ + int error; + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, bpp); + if (error) + return error; + if (!*bpp) + return 0; + + ASSERT(!XFS_BUF_GETERROR(*bpp)); + agf = XFS_BUF_TO_AGF(*bpp); + + /* + * Validate the magic number of the agf block. + */ + agf_ok = + be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_seqno) == agno; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= + be32_to_cpu(agf->agf_length); + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", + XFS_ERRLEVEL_LOW, mp, agf); + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EFSCORRUPTED); + } + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + struct xfs_agf *agf; /* ag freelist header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + ASSERT(agno != NULLAGNUMBER); + + error = xfs_read_agf(mp, tp, agno, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? 
XBF_TRYLOCK : 0, + bpp); + if (error) + return error; + if (!*bpp) + return 0; + ASSERT(!XFS_BUF_GETERROR(*bpp)); + + agf = XFS_BUF_TO_AGF(*bpp); + pag = xfs_perag_get(mp, agno); + if (!pag->pagf_init) { + pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); + pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); + pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); + pag->pagf_longest = be32_to_cpu(agf->agf_longest); + pag->pagf_levels[XFS_BTNUM_BNOi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); + pag->pagf_levels[XFS_BTNUM_CNTi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); + spin_lock_init(&pag->pagb_lock); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; + pag->pagf_init = 1; + } +#ifdef DEBUG + else if (!XFS_FORCED_SHUTDOWN(mp)) { + ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); + ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); + ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); + ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); + ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); + ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); + } +#endif + xfs_perag_put(pag); + return 0; +} + +/* + * Allocate an extent (variable-size). + * Depending on the allocation type, we either look in a single allocation + * group or loop over the allocation groups to find the result. + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agblock_t agsize; /* allocation group size */ + int error; + int flags; /* XFS_ALLOC_FLAG_... locking flags */ + xfs_extlen_t minleft;/* minimum left value, temp copy */ + xfs_mount_t *mp; /* mount structure pointer */ + xfs_agnumber_t sagno; /* starting allocation group number */ + xfs_alloctype_t type; /* input allocation type */ + int bump_rotor = 0; + int no_min = 0; + xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ + + mp = args->mp; + type = args->otype = args->type; + args->agbno = NULLAGBLOCK; + /* + * Just fix this up, for the case where the last a.g. is shorter + * (or there's only one a.g.) and the caller couldn't easily figure + * that out (xfs_bmap_alloc). + */ + agsize = mp->m_sb.sb_agblocks; + if (args->maxlen > agsize) + args->maxlen = agsize; + if (args->alignment == 0) + args->alignment = 1; + ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->minlen <= agsize); + ASSERT(args->mod < args->prod); + if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || + XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || + args->minlen > args->maxlen || args->minlen > agsize || + args->mod >= args->prod) { + args->fsbno = NULLFSBLOCK; + trace_xfs_alloc_vextent_badargs(args); + return 0; + } + minleft = args->minleft; + + switch (type) { + case XFS_ALLOCTYPE_THIS_AG: + case XFS_ALLOCTYPE_NEAR_BNO: + case XFS_ALLOCTYPE_THIS_BNO: + /* + * These three force us into a single a.g. 
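+ * The AG is taken from args->fsbno and no other AG is tried: if
+ * xfs_alloc_fix_freelist() refuses the group (args->agbp == NULL), the
+ * allocation simply fails with a NULLAGBLOCK/NULLFSBLOCK result.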
+ */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->pag = xfs_perag_get(mp, args->agno); + args->minleft = 0; + error = xfs_alloc_fix_freelist(args, 0); + args->minleft = minleft; + if (error) { + trace_xfs_alloc_vextent_nofix(args); + goto error0; + } + if (!args->agbp) { + trace_xfs_alloc_vextent_noagbp(args); + break; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + case XFS_ALLOCTYPE_START_BNO: + /* + * Try near allocation first, then anywhere-in-ag after + * the first a.g. fails. + */ + if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) && + (mp->m_flags & XFS_MOUNT_32BITINODES)) { + args->fsbno = XFS_AGB_TO_FSB(mp, + ((mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount), 0); + bump_rotor = 1; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + /* FALLTHROUGH */ + case XFS_ALLOCTYPE_ANY_AG: + case XFS_ALLOCTYPE_START_AG: + case XFS_ALLOCTYPE_FIRST_AG: + /* + * Rotate through the allocation groups looking for a winner. + */ + if (type == XFS_ALLOCTYPE_ANY_AG) { + /* + * Start with the last place we left off. + */ + args->agno = sagno = (mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount; + args->type = XFS_ALLOCTYPE_THIS_AG; + flags = XFS_ALLOC_FLAG_TRYLOCK; + } else if (type == XFS_ALLOCTYPE_FIRST_AG) { + /* + * Start with allocation group given by bno. + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_THIS_AG; + sagno = 0; + flags = 0; + } else { + if (type == XFS_ALLOCTYPE_START_AG) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * Start with the given allocation group. + */ + args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); + flags = XFS_ALLOC_FLAG_TRYLOCK; + } + /* + * Loop over allocation groups twice; first time with + * trylock set, second time without. + */ + for (;;) { + args->pag = xfs_perag_get(mp, args->agno); + if (no_min) args->minleft = 0; + error = xfs_alloc_fix_freelist(args, flags); + args->minleft = minleft; + if (error) { + trace_xfs_alloc_vextent_nofix(args); + goto error0; + } + /* + * If we get a buffer back then the allocation will fly. + */ + if (args->agbp) { + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + } + + trace_xfs_alloc_vextent_loopfailed(args); + + /* + * Didn't work, figure out the next iteration. + */ + if (args->agno == sagno && + type == XFS_ALLOCTYPE_START_BNO) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * For the first allocation, we can try any AG to get + * space. However, if we already have allocated a + * block, we don't want to try AGs whose number is below + * sagno. Otherwise, we may end up with out-of-order + * locking of AGF, which might cause deadlock. + */ + if (++(args->agno) == mp->m_sb.sb_agcount) { + if (args->firstblock != NULLFSBLOCK) + args->agno = sagno; + else + args->agno = 0; + } + /* + * Reached the starting a.g., must either be done + * or switch to non-trylock mode. 
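+ * (There are at most three sweeps over the allocation groups: first
+ * with trylock set, then blocking (flags == 0), and finally with
+ * no_min == 1 so that minleft is ignored; only after that does the
+ * allocation give up.)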
+ */ + if (args->agno == sagno) { + if (no_min == 1) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_vextent_allfailed(args); + break; + } + if (flags == 0) { + no_min = 1; + } else { + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + } + } + } + xfs_perag_put(args->pag); + } + if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { + if (args->agno == sagno) + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + else + mp->m_agfrotor = (args->agno * rotorstep + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + if (args->agbno == NULLAGBLOCK) + args->fsbno = NULLFSBLOCK; + else { + args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); +#ifdef DEBUG + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(args->agbno % args->alignment == 0); + XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), + args->len); +#endif + } + xfs_perag_put(args->pag); + return 0; +error0: + xfs_perag_put(args->pag); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_alloc_arg_t args; + int error; + + ASSERT(len != 0); + memset(&args, 0, sizeof(xfs_alloc_arg_t)); + args.tp = tp; + args.mp = tp->t_mountp; + + /* + * validate that the block number is legal - the enables us to detect + * and handle a silent filesystem corruption rather than crashing. + */ + args.agno = XFS_FSB_TO_AGNO(args.mp, bno); + if (args.agno >= args.mp->m_sb.sb_agcount) + return EFSCORRUPTED; + + args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); + if (args.agbno >= args.mp->m_sb.sb_agblocks) + return EFSCORRUPTED; + + args.pag = xfs_perag_get(args.mp, args.agno); + ASSERT(args.pag); + + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + if (error) + goto error0; + + /* validate the extent size is legal now we have the agf locked */ + if (args.agbno + len > + be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { + error = EFSCORRUPTED; + goto error0; + } + + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); + if (!error) + xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); +error0: + xfs_perag_put(args.pag); + return error; +} + +void +xfs_alloc_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + struct xfs_busy_extent *new; + struct xfs_busy_extent *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent = NULL; + + new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. 
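+		 * (A synchronous commit closes the window in which another
+		 * transaction could reallocate and overwrite the freed blocks
+		 * before this free is permanently in the log.)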
+ */ + trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); + xfs_trans_set_sync(tp); + return; + } + + new->agno = agno; + new->bno = bno; + new->length = len; + INIT_LIST_HEAD(&new->list); + new->flags = flags; + + /* trace before insert to be able to see failed inserts */ + trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); + + pag = xfs_perag_get(tp->t_mountp, new->agno); + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + while (*rbp) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); + + if (new->bno < busyp->bno) { + rbp = &(*rbp)->rb_left; + ASSERT(new->bno + new->length <= busyp->bno); + } else if (new->bno > busyp->bno) { + rbp = &(*rbp)->rb_right; + ASSERT(bno >= busyp->bno + busyp->length); + } else { + ASSERT(0); + } + } + + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +int +xfs_alloc_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_busy_extent *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + return match; +} + +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entries block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_alloc_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_busy_extent *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry. 
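+	 * (Drop pagb_lock, back off briefly, and return false so the
+	 * caller restarts its search from the tree root.)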
+ */ + if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. 
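+ * Walk the AG's busy tree and, for each busy extent overlapping the
+ * range, let xfs_alloc_busy_update_extent() either trim/remove the
+ * busy record or force the log; restart from the tree root whenever
+ * it reports that the tree or lock state may have changed.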
+ */ +void +xfs_alloc_busy_reuse( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + + ASSERT(flen > 0); + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_busy_extent *busyp = + rb_entry(rbp, struct xfs_busy_extent, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +STATIC void +xfs_alloc_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_busy_extent *busyp = + rb_entry(rbp, struct xfs_busy_extent, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { + if (!xfs_alloc_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. 
+ */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +static void +xfs_alloc_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_busy_extent *busyp) +{ + if (busyp->length) { + trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); + kmem_free(busyp); +} + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead. 
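+ * pagb_lock is retaken whenever the AG changes, so callers can keep
+ * that cheap by sorting the list by AG with xfs_alloc_busy_sort()
+ * first.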
+ */ +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_busy_extent *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_ALLOC_BUSY_DISCARDED; + else + xfs_alloc_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_busy_extent_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_busy_extent, list)->agno - + container_of(b, struct xfs_busy_extent, list)->agno; +} diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h new file mode 100644 index 00000000..2f52b924 --- /dev/null +++ b/fs/xfs/xfs_alloc.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ALLOC_H__ +#define __XFS_ALLOC_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xfs_perag; +struct xfs_trans; +struct xfs_busy_extent; + +/* + * Freespace allocation types. Argument to xfs_alloc_[v]extent. + */ +#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */ +#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ +#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */ +#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ +#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ +#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ +#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */ + +/* this should become an enum again when the tracing code is fixed */ +typedef unsigned int xfs_alloctype_t; + +#define XFS_ALLOC_TYPES \ + { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ + { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ + { XFS_ALLOCTYPE_START_AG, "START_AG" }, \ + { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ + { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ + { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ + { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" } + +/* + * Flags for xfs_alloc_fix_freelist. 
+ */ +#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ +#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ + +/* + * In order to avoid ENOSPC-related deadlock caused by + * out-of-order locking of AGF buffer (PV 947395), we place + * constraints on the relationship among actual allocations for + * data blocks, freelist blocks, and potential file data bmap + * btree blocks. However, these restrictions may result in no + * actual space allocated for a delayed extent, for example, a data + * block in a certain AG is allocated but there is no additional + * block for the additional bmap btree block due to a split of the + * bmap btree of the file. The result of this may lead to an + * infinite loop in xfssyncd when the file gets flushed to disk and + * all delayed extents need to be actually allocated. To get around + * this, we explicitly set aside a few blocks which will not be + * reserved in delayed allocation. Considering the minimum number of + * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap + * btree requires 1 fsb, so we set the number of set-aside blocks + * to 4 + 4*agcount. + */ +#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) + +/* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks + * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +#define XFS_ALLOC_AG_MAX_USABLE(mp) \ + ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) + + +/* + * Argument structure for xfs_alloc routines. + * This is turned into a structure to avoid having 20 arguments passed + * down several levels of the stack. + */ +typedef struct xfs_alloc_arg { + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_mount *mp; /* file system mount point */ + struct xfs_buf *agbp; /* buffer for a.g. freelist header */ + struct xfs_perag *pag; /* per-ag struct for this agno */ + xfs_fsblock_t fsbno; /* file system block number */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agblock_t agbno; /* allocation group-relative block # */ + xfs_extlen_t minlen; /* minimum size of extent */ + xfs_extlen_t maxlen; /* maximum size of extent */ + xfs_extlen_t mod; /* mod value for extent size */ + xfs_extlen_t prod; /* prod value for extent size */ + xfs_extlen_t minleft; /* min blocks must be left after us */ + xfs_extlen_t total; /* total blocks needed in xaction */ + xfs_extlen_t alignment; /* align answer to multiple of this */ + xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_extlen_t len; /* output: actual size of extent */ + xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... 
*/ + xfs_alloctype_t otype; /* original allocation type */ + char wasdel; /* set if allocation was prev delayed */ + char wasfromfl; /* set if allocation is from freelist */ + char isfl; /* set if is freelist blocks - !acctg */ + char userdata; /* set if this is user data */ + xfs_fsblock_t firstblock; /* io first block allocated */ +} xfs_alloc_arg_t; + +/* + * Defines for userdata + */ +#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ + +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag); + +#ifdef __KERNEL__ +void +xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + +void +xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); + +int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +int +xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_alloc_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_busy_extent_ag_cmp); +} + +#endif /* __KERNEL__ */ + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. + */ +int /* error */ +xfs_alloc_get_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk); /* destination is a AGF btree */ + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* buffer for a.g. freelist header */ + int fields);/* mask of fields to be logged (XFS_AGF_...) */ + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags); /* XFS_ALLOC_FLAGS_... */ + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for a.g. freelist header */ + struct xfs_buf *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk); /* owner was a AGF btree */ + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp); /* buffer for the ag freelist header */ + +/* + * Allocate an extent (variable-size). + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args); /* allocation argument structure */ + +/* + * Free an extent. 
+ */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len); /* length of extent */ + +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + +#endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c new file mode 100644 index 00000000..2b351882 --- /dev/null +++ b/fs/xfs/xfs_alloc_btree.c @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" + + +STATIC struct xfs_btree_cur * +xfs_allocbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); +} + +STATIC void +xfs_allocbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_roots[btnum] = ptr->s; + be32_add_cpu(&agf->agf_levels[btnum], inc); + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); +} + +STATIC int +xfs_allocbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int length, + int *stat) +{ + int error; + xfs_agblock_t bno; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* Allocate the new block from the freelist. If we can't, give up. 
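+	 * A NULLAGBLOCK result means the AGFL is empty; report *stat = 0 so
+	 * the generic btree code knows that no block was allocated.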
*/ + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &bno, 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + if (bno == NULLAGBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + + xfs_trans_agbtree_delta(cur->bc_tp, 1); + new->s = cpu_to_be32(bno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +STATIC int +xfs_allocbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agblock_t bno; + int error; + + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + if (error) + return error; + + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_ALLOC_BUSY_SKIP_DISCARD); + xfs_trans_agbtree_delta(cur->bc_tp, -1); + return 0; +} + +/* + * Update the longest extent in the AGF + */ +STATIC void +xfs_allocbt_update_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, + int reason) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag; + __be32 len; + int numrecs; + + ASSERT(cur->bc_btnum == XFS_BTNUM_CNT); + + switch (reason) { + case LASTREC_UPDATE: + /* + * If this is the last leaf block and it's the last record, + * then update the size of the longest extent in the AG. + */ + if (ptr != xfs_btree_get_numrecs(block)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_INSREC: + if (be32_to_cpu(rec->alloc.ar_blockcount) <= + be32_to_cpu(agf->agf_longest)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_DELREC: + numrecs = xfs_btree_get_numrecs(block); + if (ptr <= numrecs) + return; + ASSERT(ptr == numrecs + 1); + + if (numrecs) { + xfs_alloc_rec_t *rrp; + + rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs); + len = rrp->ar_blockcount; + } else { + len = 0; + } + + break; + default: + ASSERT(0); + return; + } + + agf->agf_longest = len; + pag = xfs_perag_get(cur->bc_mp, seqno); + pag->pagf_longest = be32_to_cpu(len); + xfs_perag_put(pag); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); +} + +STATIC int +xfs_allocbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_alloc_mnr[level != 0]; +} + +STATIC int +xfs_allocbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_alloc_mxr[level != 0]; +} + +STATIC void +xfs_allocbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(rec->alloc.ar_startblock != 0); + + key->alloc.ar_startblock = rec->alloc.ar_startblock; + key->alloc.ar_blockcount = rec->alloc.ar_blockcount; +} + +STATIC void +xfs_allocbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->alloc.ar_startblock != 0); + + rec->alloc.ar_startblock = key->alloc.ar_startblock; + rec->alloc.ar_blockcount = key->alloc.ar_blockcount; +} + +STATIC void +xfs_allocbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + ASSERT(cur->bc_rec.a.ar_startblock != 0); + + rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); + rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); +} + +STATIC void +xfs_allocbt_init_ptr_from_cur( 
+ struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_roots[cur->bc_btnum] != 0); + + ptr->s = agf->agf_roots[cur->bc_btnum]; +} + +STATIC __int64_t +xfs_allocbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; + xfs_alloc_key_t *kp = &key->alloc; + __int64_t diff; + + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return (__int64_t)be32_to_cpu(kp->ar_startblock) - + rec->ar_startblock; + } + + diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; + if (diff) + return diff; + + return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; +} + +#ifdef DEBUG +STATIC int +xfs_allocbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock); + } else { + return be32_to_cpu(k1->alloc.ar_blockcount) < + be32_to_cpu(k2->alloc.ar_blockcount) || + (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && + be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock)); + } +} + +STATIC int +xfs_allocbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(r1->alloc.ar_startblock) + + be32_to_cpu(r1->alloc.ar_blockcount) <= + be32_to_cpu(r2->alloc.ar_startblock); + } else { + return be32_to_cpu(r1->alloc.ar_blockcount) < + be32_to_cpu(r2->alloc.ar_blockcount) || + (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && + be32_to_cpu(r1->alloc.ar_startblock) < + be32_to_cpu(r2->alloc.ar_startblock)); + } +} +#endif /* DEBUG */ + +#ifdef XFS_BTREE_TRACE +ktrace_t *xfs_allocbt_trace_buf; + +STATIC void +xfs_allocbt_trace_enter( + struct xfs_btree_cur *cur, + const char *func, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) +{ + ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type, + (void *)func, (void *)s, NULL, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); +} + +STATIC void +xfs_allocbt_trace_cursor( + struct xfs_btree_cur *cur, + __uint32_t *s0, + __uint64_t *l0, + __uint64_t *l1) +{ + *s0 = cur->bc_private.a.agno; + *l0 = cur->bc_rec.a.ar_startblock; + *l1 = cur->bc_rec.a.ar_blockcount; +} + +STATIC void +xfs_allocbt_trace_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + __uint64_t *l0, + __uint64_t *l1) +{ + *l0 = be32_to_cpu(key->alloc.ar_startblock); + *l1 = be32_to_cpu(key->alloc.ar_blockcount); +} + +STATIC void +xfs_allocbt_trace_record( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + __uint64_t *l0, + __uint64_t *l1, + __uint64_t *l2) +{ + *l0 = be32_to_cpu(rec->alloc.ar_startblock); + *l1 = be32_to_cpu(rec->alloc.ar_blockcount); + *l2 = 0; +} +#endif /* XFS_BTREE_TRACE */ + +static const struct xfs_btree_ops xfs_allocbt_ops = { + .rec_len = sizeof(xfs_alloc_rec_t), + .key_len = sizeof(xfs_alloc_key_t), + + .dup_cursor = xfs_allocbt_dup_cursor, + .set_root = xfs_allocbt_set_root, + .alloc_block = 
xfs_allocbt_alloc_block, + .free_block = xfs_allocbt_free_block, + .update_lastrec = xfs_allocbt_update_lastrec, + .get_minrecs = xfs_allocbt_get_minrecs, + .get_maxrecs = xfs_allocbt_get_maxrecs, + .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_rec_from_key = xfs_allocbt_init_rec_from_key, + .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, + .key_diff = xfs_allocbt_key_diff, + +#ifdef DEBUG + .keys_inorder = xfs_allocbt_keys_inorder, + .recs_inorder = xfs_allocbt_recs_inorder, +#endif + +#ifdef XFS_BTREE_TRACE + .trace_enter = xfs_allocbt_trace_enter, + .trace_cursor = xfs_allocbt_trace_cursor, + .trace_key = xfs_allocbt_trace_key, + .trace_record = xfs_allocbt_trace_record, +#endif +}; + +/* + * Allocate a new allocation btree cursor. + */ +struct xfs_btree_cur * /* new alloc btree cursor */ +xfs_allocbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agf structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* btree identifier */ +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]); + cur->bc_btnum = btnum; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_CNT) + cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; +} + +/* + * Calculate number of records in an alloc btree block. + */ +int +xfs_allocbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_ALLOC_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_alloc_rec_t); + return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); +} diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h new file mode 100644 index 00000000..a6caa002 --- /dev/null +++ b/fs/xfs/xfs_alloc_btree.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ALLOC_BTREE_H__ +#define __XFS_ALLOC_BTREE_H__ + +/* + * Freespace on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * There are two on-disk btrees, one sorted by blockno and one sorted + * by blockcount and blockno. All blocks look the same to make the code + * simpler; if we have time later, we'll make the optimizations. 
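+ * (The by-bno tree is used for near-block allocation and for
+ * coalescing adjacent extents on free; the by-cnt tree is used for
+ * by-size allocation. Both index the same set of free extents.)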
+ */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ + +/* + * Data record/key structure + */ +typedef struct xfs_alloc_rec { + __be32 ar_startblock; /* starting block number */ + __be32 ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_t, xfs_alloc_key_t; + +typedef struct xfs_alloc_rec_incore { + xfs_agblock_t ar_startblock; /* starting block number */ + xfs_extlen_t ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_incore_t; + +/* btree pointer type */ +typedef __be32 xfs_alloc_ptr_t; + +/* + * Minimum and maximum blocksize and sectorsize. + * The blocksize upper limit is pretty much arbitrary. + * The sectorsize upper limit is due to sizeof(sb_sectsize). + */ +#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ +#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) +#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ +#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) +#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) + +/* + * Block numbers in the AG: + * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. + */ +#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) +#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) + +/* + * Btree block header size depends on a superblock flag. + * + * (not quite yet, but soon) + */ +#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_ALLOC_REC_ADDR(mp, block, index) \ + ((xfs_alloc_rec_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_alloc_rec_t)))) + +#define XFS_ALLOC_KEY_ADDR(mp, block, index) \ + ((xfs_alloc_key_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_alloc_key_t))) + +#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_alloc_ptr_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_alloc_key_t) + \ + ((index) - 1) * sizeof(xfs_alloc_ptr_t))) + +extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, + xfs_agnumber_t, xfs_btnum_t); +extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); + +#endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h new file mode 100644 index 00000000..09022493 --- /dev/null +++ b/fs/xfs/xfs_arch.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ARCH_H__ +#define __XFS_ARCH_H__ + +#ifndef XFS_BIG_INUMS +# error XFS_BIG_INUMS must be defined true or false +#endif + +#ifdef __KERNEL__ + +#include + +#ifdef __BIG_ENDIAN +#define XFS_NATIVE_HOST 1 +#else +#undef XFS_NATIVE_HOST +#endif + +#else /* __KERNEL__ */ + +#if __BYTE_ORDER == __BIG_ENDIAN +#define XFS_NATIVE_HOST 1 +#else +#undef XFS_NATIVE_HOST +#endif + +#ifdef XFS_NATIVE_HOST +#define cpu_to_be16(val) ((__force __be16)(__u16)(val)) +#define cpu_to_be32(val) ((__force __be32)(__u32)(val)) +#define cpu_to_be64(val) ((__force __be64)(__u64)(val)) +#define be16_to_cpu(val) ((__force __u16)(__be16)(val)) +#define be32_to_cpu(val) ((__force __u32)(__be32)(val)) +#define be64_to_cpu(val) ((__force __u64)(__be64)(val)) +#else +#define cpu_to_be16(val) ((__force __be16)__swab16((__u16)(val))) +#define cpu_to_be32(val) ((__force __be32)__swab32((__u32)(val))) +#define cpu_to_be64(val) ((__force __be64)__swab64((__u64)(val))) +#define be16_to_cpu(val) (__swab16((__force __u16)(__be16)(val))) +#define be32_to_cpu(val) (__swab32((__force __u32)(__be32)(val))) +#define be64_to_cpu(val) (__swab64((__force __u64)(__be64)(val))) +#endif + +static inline void be16_add_cpu(__be16 *a, __s16 b) +{ + *a = cpu_to_be16(be16_to_cpu(*a) + b); +} + +static inline void be32_add_cpu(__be32 *a, __s32 b) +{ + *a = cpu_to_be32(be32_to_cpu(*a) + b); +} + +static inline void be64_add_cpu(__be64 *a, __s64 b) +{ + *a = cpu_to_be64(be64_to_cpu(*a) + b); +} + +#endif /* __KERNEL__ */ + +/* + * get and set integers from potentially unaligned locations + */ + +#define INT_GET_UNALIGNED_16_BE(pointer) \ + ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1]))) +#define INT_SET_UNALIGNED_16_BE(pointer,value) \ + { \ + ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \ + ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ + } + +/* + * In directories inode numbers are stored as unaligned arrays of unsigned + * 8bit integers on disk. 
+ * + * For v1 directories or v2 directories that contain inode numbers that + * do not fit into 32bit the array has eight members, but the first member + * is always zero: + * + * |unused|48-55|40-47|32-39|24-31|16-23| 8-15| 0- 7| + * + * For v2 directories that only contain entries with inode numbers that fit + * into 32bits a four-member array is used: + * + * |24-31|16-23| 8-15| 0- 7| + */ + +#define XFS_GET_DIR_INO4(di) \ + (((__u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) + +#define XFS_PUT_DIR_INO4(from, di) \ +do { \ + (di).i[0] = (((from) & 0xff000000ULL) >> 24); \ + (di).i[1] = (((from) & 0x00ff0000ULL) >> 16); \ + (di).i[2] = (((from) & 0x0000ff00ULL) >> 8); \ + (di).i[3] = ((from) & 0x000000ffULL); \ +} while (0) + +#define XFS_DI_HI(di) \ + (((__u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) +#define XFS_DI_LO(di) \ + (((__u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) + +#define XFS_GET_DIR_INO8(di) \ + (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \ + ((xfs_ino_t)XFS_DI_HI(di) << 32)) + +#define XFS_PUT_DIR_INO8(from, di) \ +do { \ + (di).i[0] = 0; \ + (di).i[1] = (((from) & 0x00ff000000000000ULL) >> 48); \ + (di).i[2] = (((from) & 0x0000ff0000000000ULL) >> 40); \ + (di).i[3] = (((from) & 0x000000ff00000000ULL) >> 32); \ + (di).i[4] = (((from) & 0x00000000ff000000ULL) >> 24); \ + (di).i[5] = (((from) & 0x0000000000ff0000ULL) >> 16); \ + (di).i[6] = (((from) & 0x000000000000ff00ULL) >> 8); \ + (di).i[7] = ((from) & 0x00000000000000ffULL); \ +} while (0) + +#endif /* __XFS_ARCH_H__ */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c new file mode 100644 index 00000000..99d40116 --- /dev/null +++ b/fs/xfs/xfs_attr.c @@ -0,0 +1,2230 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_rw.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +/* + * xfs_attr.c + * + * Provide the external interfaces to manage attribute lists. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when attribute list fits inside the inode. 
+ */ +STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); + +/* + * Internal routines when attribute list is one block. + */ +STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); + +/* + * Internal routines when attribute list is more than one block. + */ +STATIC int xfs_attr_node_get(xfs_da_args_t *args); +STATIC int xfs_attr_node_addname(xfs_da_args_t *args); +STATIC int xfs_attr_node_removename(xfs_da_args_t *args); +STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); +STATIC int xfs_attr_fillstate(xfs_da_state_t *state); +STATIC int xfs_attr_refillstate(xfs_da_state_t *state); + +/* + * Routines to manipulate out-of-line attribute values. + */ +STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); +STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +STATIC int +xfs_attr_name_to_xname( + struct xfs_name *xname, + const unsigned char *aname) +{ + if (!aname) + return EINVAL; + xname->name = aname; + xname->len = strlen((char *)aname); + if (xname->len >= MAXNAMELEN) + return EFAULT; /* match IRIX behaviour */ + + return 0; +} + +STATIC int +xfs_inode_hasattr( + struct xfs_inode *ip) +{ + if (!XFS_IFORK_Q(ip) || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) + return 0; + return 1; +} + +/*======================================================================== + * Overall external interface routines. + *========================================================================*/ + +STATIC int +xfs_attr_get_int( + struct xfs_inode *ip, + struct xfs_name *name, + unsigned char *value, + int *valuelenp, + int flags) +{ + xfs_da_args_t args; + int error; + + if (!xfs_inode_hasattr(ip)) + return ENOATTR; + + /* + * Fill in the arg structure for this request. + */ + memset((char *)&args, 0, sizeof(args)); + args.name = name->name; + args.namelen = name->len; + args.value = value; + args.valuelen = *valuelenp; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = ip; + args.whichfork = XFS_ATTR_FORK; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_getvalue(&args); + } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_get(&args); + } else { + error = xfs_attr_node_get(&args); + } + + /* + * Return the number of bytes in the value to the caller. 
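+	 * (The lookup routines signal "attribute found" with EEXIST, which
+	 * is mapped back to success just below.)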
+ */ + *valuelenp = args.valuelen; + + if (error == EEXIST) + error = 0; + return(error); +} + +int +xfs_attr_get( + xfs_inode_t *ip, + const unsigned char *name, + unsigned char *value, + int *valuelenp, + int flags) +{ + int error; + struct xfs_name xname; + + XFS_STATS_INC(xs_attr_get); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return(EIO); + + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return(error); +} + +/* + * Calculate how many blocks we need for the new attribute, + */ +STATIC int +xfs_attr_calc_size( + struct xfs_inode *ip, + int namelen, + int valuelen, + int *local) +{ + struct xfs_mount *mp = ip->i_mount; + int size; + int nblks; + + /* + * Determine space new attribute will use, and if it would be + * "local" or "remote" (note: local != inline). + */ + size = xfs_attr_leaf_newentsize(namelen, valuelen, + mp->m_sb.sb_blocksize, local); + + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + if (*local) { + if (size > (mp->m_sb.sb_blocksize >> 1)) { + /* Double split possible */ + nblks *= 2; + } + } else { + /* + * Out of line attribute, cannot double split, but + * make room for the attribute value itself. + */ + uint dblocks = XFS_B_TO_FSB(mp, valuelen); + nblks += dblocks; + nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + } + + return nblks; +} + +STATIC int +xfs_attr_set_int( + struct xfs_inode *dp, + struct xfs_name *name, + unsigned char *value, + int valuelen, + int flags) +{ + xfs_da_args_t args; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + int error, err2, committed; + xfs_mount_t *mp = dp->i_mount; + int rsvd = (flags & ATTR_ROOT) != 0; + int local; + + /* + * Attach the dquots to the inode. + */ + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; + + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (XFS_IFORK_Q(dp) == 0) { + int sf_size = sizeof(xfs_attr_sf_hdr_t) + + XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen); + + if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd))) + return(error); + } + + /* + * Fill in the arg structure for this request. + */ + memset((char *)&args, 0, sizeof(args)); + args.name = name->name; + args.namelen = name->len; + args.value = value; + args.valuelen = valuelen; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = dp; + args.firstblock = &firstblock; + args.flist = &flist; + args.whichfork = XFS_ATTR_FORK; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + + /* Size is now blocks for attribute data */ + args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. 
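+	 * (Re-logging the inode in each chained transaction moves its log
+	 * item forward, so this long-running chain never pins the log tail.)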
+ */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (rsvd) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + if ((error = xfs_trans_reserve(args.trans, args.total, + XFS_ATTRSET_LOG_RES(mp, args.total), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { + xfs_trans_cancel(args.trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, + rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + return (error); + } + + xfs_trans_ijoin(args.trans, dp); + + /* + * If the attribute list is non-existent or a shortform list, + * upgrade it to a single-leaf-block attribute list. + */ + if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || + ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) && + (dp->i_d.di_anextents == 0))) { + + /* + * Build initial attribute list (if required). + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(&args); + + /* + * Try to add the attr to the attribute list in + * the inode. + */ + error = xfs_attr_shortform_addname(&args); + if (error != ENOSPC) { + /* + * Commit the shortform mods, and we're done. + * NOTE: this is also the error path (EEXIST, etc). + */ + ASSERT(args.trans != NULL); + + /* + * If this is a synchronous mount, make sure that + * the transaction goes to disk before returning + * to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_trans_ichgtime(args.trans, dp, + XFS_ICHGTIME_CHG); + } + err2 = xfs_trans_commit(args.trans, + XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error == 0 ? err2 : error); + } + + /* + * It won't fit in the shortform, transform to a leaf block. + * GROT: another possible req'mt for a double-split btree op. + */ + xfs_bmap_init(args.flist, args.firstblock); + error = xfs_attr_shortform_to_leaf(&args); + if (!error) { + error = xfs_bmap_finish(&args.trans, args.flist, + &committed); + } + if (error) { + ASSERT(committed); + args.trans = NULL; + xfs_bmap_cancel(&flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args.trans, dp); + + /* + * Commit the leaf transformation. We'll need another (linked) + * transaction to add the new attribute to the leaf. + */ + + error = xfs_trans_roll(&args.trans, dp); + if (error) + goto out; + + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_addname(&args); + } else { + error = xfs_attr_node_addname(&args); + } + if (error) { + goto out; + } + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + + /* + * Commit the last in the sequence of transactions. 
+ */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + if (args.trans) + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + +int +xfs_attr_set( + xfs_inode_t *dp, + const unsigned char *name, + unsigned char *value, + int valuelen, + int flags) +{ + int error; + struct xfs_name xname; + + XFS_STATS_INC(xs_attr_set); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return (EIO); + + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + + return xfs_attr_set_int(dp, &xname, value, valuelen, flags); +} + +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. + */ +STATIC int +xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) +{ + xfs_da_args_t args; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + int error; + xfs_mount_t *mp = dp->i_mount; + + /* + * Fill in the arg structure for this request. + */ + memset((char *)&args, 0, sizeof(args)); + args.name = name->name; + args.namelen = name->len; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = dp; + args.firstblock = &firstblock; + args.flist = &flist; + args.total = 0; + args.whichfork = XFS_ATTR_FORK; + + /* + * we have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail. + */ + args.op_flags = XFS_DA_OP_OKNOENT; + + /* + * Attach the dquots to the inode. + */ + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (flags & ATTR_ROOT) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + if ((error = xfs_trans_reserve(args.trans, + XFS_ATTRRM_SPACE_RES(mp), + XFS_ATTRRM_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ATTRRM_LOG_COUNT))) { + xfs_trans_cancel(args.trans, 0); + return(error); + } + + xfs_ilock(dp, XFS_ILOCK_EXCL); + /* + * No need to make quota reservations here. We expect to release some + * blocks not allocate in the common case. + */ + xfs_trans_ijoin(args.trans, dp); + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp)) { + error = XFS_ERROR(ENOATTR); + goto out; + } + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + error = xfs_attr_shortform_remove(&args); + if (error) { + goto out; + } + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_removename(&args); + } else { + error = xfs_attr_node_removename(&args); + } + if (error) { + goto out; + } + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. 
+ */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + if (args.trans) + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + +int +xfs_attr_remove( + xfs_inode_t *dp, + const unsigned char *name, + int flags) +{ + int error; + struct xfs_name xname; + + XFS_STATS_INC(xs_attr_remove); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return (EIO); + + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!xfs_inode_hasattr(dp)) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return XFS_ERROR(ENOATTR); + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + return xfs_attr_remove_int(dp, &xname, flags); +} + +int +xfs_attr_list_int(xfs_attr_list_context_t *context) +{ + int error; + xfs_inode_t *dp = context->dp; + + XFS_STATS_INC(xs_attr_list); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + xfs_ilock(dp, XFS_ILOCK_SHARED); + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(context); + } else { + error = xfs_attr_node_list(context); + } + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + return error; +} + +#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ + (((struct attrlist_ent *) 0)->a_name - (char *) 0) +#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ + & ~(sizeof(u_int32_t)-1)) + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +/*ARGSUSED*/ +STATIC int +xfs_attr_put_listent( + xfs_attr_list_context_t *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + struct attrlist *alist = (struct attrlist *)context->alist; + attrlist_ent_t *aep; + int arraytop; + + ASSERT(!(context->flags & ATTR_KERNOVAL)); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. 
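+ * The on-disk ROOT and SECURE flags of the entry must match the flags
+ * the caller passed in; entries with neither bit set belong to the user
+ * namespace and are only reported when neither ATTR_ROOT nor ATTR_SECURE
+ * was requested.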
+ */ + if (((context->flags & ATTR_SECURE) == 0) != + ((flags & XFS_ATTR_SECURE) == 0)) + return 0; + if (((context->flags & ATTR_ROOT) == 0) != + ((flags & XFS_ATTR_ROOT) == 0)) + return 0; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + context->firstu -= ATTR_ENTSIZE(namelen); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return 1; + } + + aep = (attrlist_ent_t *)&context->alist[context->firstu]; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); + return 0; +} + +/* + * Generate a list of extended attribute names and optionally + * also value lengths. Positive return value follows the XFS + * convention of being an error, zero or negative return code + * is the length of the buffer returned (negated), indicating + * success. + */ +int +xfs_attr_list( + xfs_inode_t *dp, + char *buffer, + int bufsize, + int flags, + attrlist_cursor_kern_t *cursor) +{ + xfs_attr_list_context_t context; + struct attrlist *alist; + int error; + + /* + * Validate the cursor. + */ + if (cursor->pad1 || cursor->pad2) + return(XFS_ERROR(EINVAL)); + if ((cursor->initted == 0) && + (cursor->hashval || cursor->blkno || cursor->offset)) + return XFS_ERROR(EINVAL); + + /* + * Check for a properly aligned buffer. + */ + if (((long)buffer) & (sizeof(int)-1)) + return XFS_ERROR(EFAULT); + if (flags & ATTR_KERNOVAL) + bufsize = 0; + + /* + * Initialize the output buffer. + */ + memset(&context, 0, sizeof(context)); + context.dp = dp; + context.cursor = cursor; + context.resynch = 1; + context.flags = flags; + context.alist = buffer; + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.put_listent = xfs_attr_put_listent; + + alist = (struct attrlist *)context.alist; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list_int(&context); + ASSERT(error >= 0); + return error; +} + +int /* error */ +xfs_attr_inactive(xfs_inode_t *dp) +{ + xfs_trans_t *trans; + xfs_mount_t *mp; + int error; + + mp = dp->i_mount; + ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return 0; + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); + if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ATTRINVAL_LOG_COUNT))) { + xfs_trans_cancel(trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. + */ + xfs_trans_ijoin(trans, dp); + + /* + * Decide on what work routines to call based on the inode size. 
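+ * A missing or shortform (inline) attribute fork lives entirely inside
+ * the inode and has no blocks to invalidate; only leaf and node forms
+ * need xfs_attr_root_inactive() to walk and invalidate their blocks
+ * before the fork is truncated away.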
+ */ + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = 0; + goto out; + } + error = xfs_attr_root_inactive(&trans, dp); + if (error) + goto out; + + error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK, 0); + if (error) + goto out; + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + + + +/*======================================================================== + * External routines when attribute list is inside the inode + *========================================================================*/ + +/* + * Add a name to the shortform attribute list structure + * This is the external routine. + */ +STATIC int +xfs_attr_shortform_addname(xfs_da_args_t *args) +{ + int newsize, forkoff, retval; + + retval = xfs_attr_shortform_lookup(args); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + return(retval); + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) + return(retval); + retval = xfs_attr_shortform_remove(args); + ASSERT(retval == 0); + } + + if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || + args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) + return(XFS_ERROR(ENOSPC)); + + newsize = XFS_ATTR_SF_TOTSIZE(args->dp); + newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + + forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); + if (!forkoff) + return(XFS_ERROR(ENOSPC)); + + xfs_attr_shortform_add(args, forkoff); + return(0); +} + + +/*======================================================================== + * External routines when attribute list is one block + *========================================================================*/ + +/* + * Add a name to the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_addname(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int retval, error, committed, forkoff; + + /* + * Read the (only) block in the attribute list in. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + /* + * Look up the given attribute in the leaf block. Figure out if + * the given flags produce an error or call for an atomic rename. + */ + retval = xfs_attr_leaf_lookup_int(bp, args); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + xfs_da_brelse(args->trans, bp); + return(retval); + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) { /* pure create op */ + xfs_da_brelse(args->trans, bp); + return(retval); + } + args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + } + + /* + * Add the attribute to the leaf block, transitioning to a Btree + * if required. 
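+ * xfs_attr_leaf_add() returns ENOSPC when the entry will not fit in the
+ * single leaf block; in that case the leaf is converted to node (Btree)
+ * format below and the add is retried through xfs_attr_node_addname().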
+ */ + retval = xfs_attr_leaf_add(bp, args); + xfs_da_buf_done(bp); + if (retval == ENOSPC) { + /* + * Promote the attribute list to the Btree format, then + * Commit that transaction so that the node_addname() call + * can manage its own transactions. + */ + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + + /* + * Commit the current trans (including the inode) and start + * a new one. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + + /* + * Fob the whole rest of the problem off on the Btree code. + */ + error = xfs_attr_node_addname(args); + return(error); + } + + /* + * Commit the transaction that added the attr name so that + * later routines can manage their own transactions. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return(error); + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->op_flags & XFS_DA_OP_RENAME) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr_leaf_flipflags(args); + if (error) + return(error); + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return(error); + } + + /* + * Read in the block containing the "old" attr, then + * remove the "old" attr from that block (neat, huh!) + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, + &bp, XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + (void)xfs_attr_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. + */ + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } else + xfs_da_buf_done(bp); + + /* + * Commit the remove and start the next trans in series. 
+ */ + error = xfs_trans_roll(&args->trans, dp); + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr_leaf_clearflag(args); + } + return(error); +} + +/* + * Remove a name from the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_removename(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int error, committed, forkoff; + + /* + * Remove the attribute. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) { + return(error); + } + + ASSERT(bp != NULL); + error = xfs_attr_leaf_lookup_int(bp, args); + if (error == ENOATTR) { + xfs_da_brelse(args->trans, bp); + return(error); + } + + (void)xfs_attr_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. + */ + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } else + xfs_da_buf_done(bp); + return(0); +} + +/* + * Look up a name in a leaf attribute list structure. + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_get(xfs_da_args_t *args) +{ + xfs_dabuf_t *bp; + int error; + + args->blkno = 0; + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + error = xfs_attr_leaf_lookup_int(bp, args); + if (error != EEXIST) { + xfs_da_brelse(args->trans, bp); + return(error); + } + error = xfs_attr_leaf_getvalue(bp, args); + xfs_da_brelse(args->trans, bp); + if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { + error = xfs_attr_rmtval_get(args); + } + return(error); +} + +/* + * Copy out attribute entries for attr_list(), for leaf attribute lists. + */ +STATIC int +xfs_attr_leaf_list(xfs_attr_list_context_t *context) +{ + xfs_attr_leafblock_t *leaf; + int error; + xfs_dabuf_t *bp; + + context->cursor->blkno = 0; + error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return XFS_ERROR(error); + ASSERT(bp != NULL); + leaf = bp->data; + if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, + context->dp->i_mount, leaf); + xfs_da_brelse(NULL, bp); + return XFS_ERROR(EFSCORRUPTED); + } + + error = xfs_attr_leaf_list_int(bp, context); + xfs_da_brelse(NULL, bp); + return XFS_ERROR(error); +} + + +/*======================================================================== + * External routines when attribute list size > XFS_LBSIZE(mp). + *========================================================================*/ + +/* + * Add a name to a Btree-format attribute list. 
+ * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). + * + * "Remote" attribute values confuse the issue and atomic rename operations + * add a whole extra layer of confusion on top of that. + */ +STATIC int +xfs_attr_node_addname(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + xfs_mount_t *mp; + int committed, retval, error; + + /* + * Fill in bucket of arguments/results/context to carry around. + */ + dp = args->dp; + mp = dp->i_mount; +restart: + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + + /* + * Search to see if name already exists, and get back a pointer + * to where it should go. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) + goto out; + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + goto out; + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) + goto out; + args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + } + + retval = xfs_attr_leaf_add(blk->bp, state->args); + if (retval == ENOSPC) { + if (state->path.active == 1) { + /* + * Its really a single leaf node, but it had + * out-of-line values so it looked like it *might* + * have been a b-tree. + */ + xfs_da_state_free(state); + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + + /* + * Commit the node conversion and start the next + * trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + goto restart; + } + + /* + * Split as many Btree elements as required. + * This code tracks the new and old attr's location + * in the index/blkno/rmtblkno/rmtblkcnt fields and + * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. + */ + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da_split(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } else { + /* + * Addition succeeded, update Btree hashvals. + */ + xfs_da_fixhashpath(state, &state->path); + } + + /* + * Kill the state structure, we're done with it and need to + * allow the buffers to come back later. + */ + xfs_da_state_free(state); + state = NULL; + + /* + * Commit the leaf addition or btree split and start the next + * trans in the chain. 
+ */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return(error); + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->op_flags & XFS_DA_OP_RENAME) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr_leaf_flipflags(args); + if (error) + goto out; + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return(error); + } + + /* + * Re-find the "old" attribute entry after any split ops. + * The INCOMPLETE flag means that we will find the "old" + * attr, not the "new" one. + */ + args->flags |= XFS_ATTR_INCOMPLETE; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + state->inleaf = 0; + error = xfs_da_node_lookup_int(state, &retval); + if (error) + goto out; + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_attr_leaf_remove(blk->bp, args); + xfs_da_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } + + /* + * Commit and start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr_leaf_clearflag(args); + if (error) + goto out; + } + retval = error = 0; + +out: + if (state) + xfs_da_state_free(state); + if (error) + return(error); + return(retval); +} + +/* + * Remove a name from a B-tree attribute list. + * + * This will involve walking down the Btree, and may involve joining + * leaf nodes and even joining intermediate nodes up to and including + * the root node (a special case of an intermediate node). + */ +STATIC int +xfs_attr_node_removename(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int retval, error, committed, forkoff; + + /* + * Tie a string around our finger to remind us where we are. 
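+ * The removal proceeds in stages: any remote value blocks are freed
+ * first (with the entry marked INCOMPLETE), then the name itself is
+ * removed, the Btree is joined if it became too empty, and finally the
+ * fork is collapsed back to a single leaf or shortform if it now fits.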
+ */ + dp = args->dp; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error || (retval != EEXIST)) { + if (error == 0) + error = retval; + goto out; + } + + /* + * If there is an out-of-line value, de-allocate the blocks. + * This is done before we remove the attribute so that we don't + * overflow the maximum size of a transaction and/or hit a deadlock. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if (args->rmtblkno > 0) { + /* + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. + */ + error = xfs_attr_fillstate(state); + if (error) + goto out; + + /* + * Mark the attribute as INCOMPLETE, then bunmapi() the + * remote value. + */ + error = xfs_attr_leaf_setflag(args); + if (error) + goto out; + error = xfs_attr_rmtval_remove(args); + if (error) + goto out; + + /* + * Refill the state structure with buffers, the prior calls + * released our buffers. + */ + error = xfs_attr_refillstate(state); + if (error) + goto out; + } + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr_leaf_remove(blk->bp, args); + xfs_da_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + + /* + * Commit the Btree join operation and start a new trans. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + } + + /* + * If the result is small enough, push it all into the inode. + */ + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + /* + * Have to get rid of the copy of this dabuf in the state. + */ + ASSERT(state->path.active == 1); + ASSERT(state->path.blk[0].bp); + xfs_da_buf_done(state->path.blk[0].bp); + state->path.blk[0].bp = NULL; + + error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_ATTR_FORK); + if (error) + goto out; + ASSERT(be16_to_cpu(((xfs_attr_leafblock_t *) + bp->data)->hdr.info.magic) + == XFS_ATTR_LEAF_MAGIC); + + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. 
+ */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + } else + xfs_da_brelse(args->trans, bp); + } + error = 0; + +out: + xfs_da_state_free(state); + return(error); +} + +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. + */ +STATIC int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_da_blkno(blk->bp); + xfs_da_buf_done(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_da_blkno(blk->bp); + xfs_da_buf_done(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return(0); +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +STATIC int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da_read_buf(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return(error); + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da_read_buf(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return(error); + } else { + blk->bp = NULL; + } + } + + return(0); +} + +/* + * Look up a filename in a node attribute list. + * + * This routine gets called for any attribute fork that has more than one + * block, ie: both true Btree attr lists and for single-leaf-blocks with + * "remote" values taking up more blocks. 
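+ * The lookup walks the da-btree down to the leaf that should hold the
+ * name, copies the value out (from the leaf itself or from its remote
+ * blocks), and releases all path buffers before returning.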
+ */ +STATIC int +xfs_attr_node_get(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + int error, retval; + int i; + + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) { + retval = error; + } else if (retval == EEXIST) { + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + + /* + * Get the value, local or "remote" + */ + retval = xfs_attr_leaf_getvalue(blk->bp, args); + if (!retval && (args->rmtblkno > 0) + && !(args->flags & ATTR_KERNOVAL)) { + retval = xfs_attr_rmtval_get(args); + } + } + + /* + * If not in a transaction, we have to release all the buffers. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + + xfs_da_state_free(state); + return(retval); +} + +STATIC int /* error */ +xfs_attr_node_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int error, i; + xfs_dabuf_t *bp; + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Do all sorts of validation on the passed-in cursor structure. + * If anything is amiss, ignore the cursor and look up the hashval + * starting from the btree root. + */ + bp = NULL; + if (cursor->blkno > 0) { + error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if ((error != 0) && (error != EFSCORRUPTED)) + return(error); + if (bp) { + node = bp->data; + switch (be16_to_cpu(node->hdr.info.magic)) { + case XFS_DA_NODE_MAGIC: + trace_xfs_attr_list_wrong_blk(context); + xfs_da_brelse(NULL, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + leaf = bp->data; + if (cursor->hashval > be32_to_cpu(leaf->entries[ + be16_to_cpu(leaf->hdr.count)-1].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_da_brelse(NULL, bp); + bp = NULL; + } else if (cursor->hashval <= + be32_to_cpu(leaf->entries[0].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_da_brelse(NULL, bp); + bp = NULL; + } + break; + default: + trace_xfs_attr_list_wrong_blk(context); + xfs_da_brelse(NULL, bp); + bp = NULL; + } + } + } + + /* + * We did not find what we expected given the cursor's contents, + * so we start from the top and work down based on the hash value. + * Note that start of node block is same as start of leaf block. 
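+ * The descent starts at block 0 and, at each node, follows the first
+ * entry whose hashval is >= the cursor's hashval, until a leaf block is
+ * reached.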
+ */ + if (bp == NULL) { + cursor->blkno = 0; + for (;;) { + error = xfs_da_read_buf(NULL, context->dp, + cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + if (unlikely(bp == NULL)) { + XFS_ERROR_REPORT("xfs_attr_node_list(2)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount); + return(XFS_ERROR(EFSCORRUPTED)); + } + node = bp->data; + if (be16_to_cpu(node->hdr.info.magic) + == XFS_ATTR_LEAF_MAGIC) + break; + if (unlikely(be16_to_cpu(node->hdr.info.magic) + != XFS_DA_NODE_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, + node); + xfs_da_brelse(NULL, bp); + return(XFS_ERROR(EFSCORRUPTED)); + } + btree = node->btree; + for (i = 0; i < be16_to_cpu(node->hdr.count); + btree++, i++) { + if (cursor->hashval + <= be32_to_cpu(btree->hashval)) { + cursor->blkno = be32_to_cpu(btree->before); + trace_xfs_attr_list_node_descend(context, + btree); + break; + } + } + if (i == be16_to_cpu(node->hdr.count)) { + xfs_da_brelse(NULL, bp); + return(0); + } + xfs_da_brelse(NULL, bp); + } + } + ASSERT(bp != NULL); + + /* + * Roll upward through the blocks, processing each leaf block in + * order. As long as there is space in the result buffer, keep + * adding the information. + */ + for (;;) { + leaf = bp->data; + if (unlikely(be16_to_cpu(leaf->hdr.info.magic) + != XFS_ATTR_LEAF_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, leaf); + xfs_da_brelse(NULL, bp); + return(XFS_ERROR(EFSCORRUPTED)); + } + error = xfs_attr_leaf_list_int(bp, context); + if (error) { + xfs_da_brelse(NULL, bp); + return error; + } + if (context->seen_enough || leaf->hdr.info.forw == 0) + break; + cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); + xfs_da_brelse(NULL, bp); + error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if (error) + return(error); + if (unlikely((bp == NULL))) { + XFS_ERROR_REPORT("xfs_attr_node_list(5)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount); + return(XFS_ERROR(EFSCORRUPTED)); + } + } + xfs_da_brelse(NULL, bp); + return(0); +} + + +/*======================================================================== + * External routines for manipulating out-of-line attribute values. + *========================================================================*/ + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. + */ +int +xfs_attr_rmtval_get(xfs_da_args_t *args) +{ + xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; + xfs_mount_t *mp; + xfs_daddr_t dblkno; + void *dst; + xfs_buf_t *bp; + int nmap, error, tmp, valuelen, blkcnt, i; + xfs_dablk_t lblkno; + + ASSERT(!(args->flags & ATTR_KERNOVAL)); + + mp = args->dp->i_mount; + dst = args->value; + valuelen = args->valuelen; + lblkno = args->rmtblkno; + while (valuelen > 0) { + nmap = ATTR_RMTVALUE_MAPSIZE; + error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + NULL, 0, map, &nmap, NULL); + if (error) + return(error); + ASSERT(nmap >= 1); + + for (i = 0; (i < nmap) && (valuelen > 0); i++) { + ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, + blkcnt, XBF_LOCK | XBF_DONT_BLOCK, + &bp); + if (error) + return(error); + + tmp = (valuelen < XFS_BUF_SIZE(bp)) + ? 
valuelen : XFS_BUF_SIZE(bp); + xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); + xfs_buf_relse(bp); + dst += tmp; + valuelen -= tmp; + + lblkno += map[i].br_blockcount; + } + } + ASSERT(valuelen == 0); + return(0); +} + +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +STATIC int +xfs_attr_rmtval_set(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_fileoff_t lfileoff; + xfs_inode_t *dp; + xfs_bmbt_irec_t map; + xfs_daddr_t dblkno; + void *src; + xfs_buf_t *bp; + xfs_dablk_t lblkno; + int blkcnt, valuelen, nmap, error, tmp, committed; + + dp = args->dp; + mp = dp->i_mount; + src = args->value; + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. + */ + blkcnt = XFS_B_TO_FSB(mp, args->valuelen); + lfileoff = 0; + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) { + return(error); + } + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + /* + * Allocate a single extent, up to the size of the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | + XFS_BMAPI_WRITE, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp); + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + valuelen = args->valuelen; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, 0, &map, &nmap, + NULL); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, + XBF_LOCK | XBF_DONT_BLOCK); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + + tmp = (valuelen < XFS_BUF_SIZE(bp)) ? 
valuelen : + XFS_BUF_SIZE(bp); + xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); + if (tmp < XFS_BUF_SIZE(bp)) + xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); + if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ + return (error); + } + src += tmp; + valuelen -= tmp; + + lblkno += map.br_blockcount; + } + ASSERT(valuelen == 0); + return(0); +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +STATIC int +xfs_attr_rmtval_remove(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_bmbt_irec_t map; + xfs_buf_t *bp; + xfs_daddr_t dblkno; + xfs_dablk_t lblkno; + int valuelen, blkcnt, nmap, error, done, committed; + + mp = args->dp->i_mount; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + lblkno = args->rmtblkno; + valuelen = args->rmtblkcnt; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, 0, &map, &nmap, + args->flist); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); + if (bp) { + XFS_BUF_STALE(bp); + XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + valuelen -= map.br_blockcount; + + lblkno += map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, + &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, args->dp); + + /* + * Close out trans and start the next one in the chain. + */ + error = xfs_trans_roll(&args->trans, args->dp); + if (error) + return (error); + } + return(0); +} diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h new file mode 100644 index 00000000..e920d68e --- /dev/null +++ b/fs/xfs/xfs_attr.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_H__ +#define __XFS_ATTR_H__ + +struct xfs_inode; +struct xfs_da_args; +struct xfs_attr_list_context; + +/* + * Large attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. + * The internal links in the Btree are logical block offsets into the file. + * + * Small attribute lists use a different format and are packed as tightly + * as possible so as to fit into the literal area of the inode. + */ + +/*======================================================================== + * External interfaces + *========================================================================*/ + + +#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ +#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ +#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ +#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ + +#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ +#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ + +#define XFS_ATTR_FLAGS \ + { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ + { ATTR_ROOT, "ROOT" }, \ + { ATTR_TRUST, "TRUST" }, \ + { ATTR_SECURE, "SECURE" }, \ + { ATTR_CREATE, "CREATE" }, \ + { ATTR_REPLACE, "REPLACE" }, \ + { ATTR_KERNOTIME, "KERNOTIME" }, \ + { ATTR_KERNOVAL, "KERNOVAL" } + +/* + * The maximum size (into the kernel or returned from the kernel) of an + * attribute value or the buffer used for an attr_list() call. Larger + * sizes will result in an ERANGE return code. + */ +#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ + +/* + * Define how lists of attribute names are returned to the user from + * the attr_list() call. A large, 32bit aligned, buffer is passed in + * along with its size. We put an array of offsets at the top that each + * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. + */ +typedef struct attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +} attrlist_t; + +/* + * Show the interesting info about one attribute. This is what the + * al_offset[i] entry points to. + */ +typedef struct attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +} attrlist_ent_t; + +/* + * Given a pointer to the (char*) buffer containing the attr_list() result, + * and an index, return a pointer to the indicated attribute in the buffer. + */ +#define ATTR_ENTRY(buffer, index) \ + ((attrlist_ent_t *) \ + &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) + +/* + * Kernel-internal version of the attrlist cursor. 
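+ * The pad fields must be zero and "initted" says whether hashval, blkno
+ * and offset carry a valid resume position; xfs_attr_list() rejects a
+ * cursor that violates either rule with EINVAL.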
+ */ +typedef struct attrlist_cursor_kern { + __u32 hashval; /* hash value of next entry to add */ + __u32 blkno; /* block containing entry (suggestion) */ + __u32 offset; /* offset in list of equal-hashvals */ + __u16 pad1; /* padding to match user-level */ + __u8 pad2; /* padding to match user-level */ + __u8 initted; /* T/F: cursor has been initialized */ +} attrlist_cursor_kern_t; + + +/*======================================================================== + * Structure used to pass context around among the routines. + *========================================================================*/ + + +typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, + unsigned char *, int, int, unsigned char *); + +typedef struct xfs_attr_list_context { + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor; /* position in list */ + char *alist; /* output buffer */ + int seen_enough; /* T/F: seen enough of list? */ + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch; /* T/F: resynch with cursor */ + int put_value; /* T/F: need value for listent */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +} xfs_attr_list_context_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Overall external interface routines. + */ +int xfs_attr_inactive(struct xfs_inode *dp); +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_list_int(struct xfs_attr_list_context *); + +#endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c new file mode 100644 index 00000000..f49ecf2e --- /dev/null +++ b/fs/xfs/xfs_attr_leaf.c @@ -0,0 +1,2979 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * xfs_attr_leaf.c + * + * Routines to implement leaf blocks of attributes as Btrees of hashed names. 
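+ * Each leaf block carries a header and a hash-ordered table of entries
+ * at the front, with the name/value bytes packed in from the end of the
+ * block; values too large to sit in the leaf are kept in separate
+ * "remote" blocks and only referenced from the entry.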
+ */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, + xfs_dabuf_t **bpp); +STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, + int freemap_index); +STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer); +STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + xfs_da_state_blk_t *leaf_blk_2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); + +/* + * Routines used for shrinking the Btree. + */ +STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dabuf_t *bp, int level); +STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dabuf_t *bp); +STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dablk_t blkno, int blkcnt); + +/* + * Utility routines. + */ +STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, + int src_start, + xfs_attr_leafblock_t *dst_leaf, + int dst_start, int move_count, + xfs_mount_t *mp); +STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); + +/*======================================================================== + * Namespace helper routines + *========================================================================*/ + +/* + * If namespace bits don't match return 0. + * If all match then return 1. + */ +STATIC int +xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +{ + return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); +} + + +/*======================================================================== + * External routines when attribute fork size < XFS_LITINO(mp). + *========================================================================*/ + +/* + * Query whether the requested number of additional bytes of extended + * attribute space will be able to fit inline. + * + * Returns zero if not, else the di_forkoff fork offset to be used in the + * literal area for attribute data once the new bytes have been added. + * + * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value; + * special case for dev/uuid inodes, they have fixed size data forks. + */ +int +xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) +{ + int offset; + int minforkoff; /* lower limit on valid forkoff locations */ + int maxforkoff; /* upper limit on valid forkoff locations */ + int dsize; + xfs_mount_t *mp = dp->i_mount; + + offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ + + switch (dp->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + return (offset >= minforkoff) ? minforkoff : 0; + case XFS_DINODE_FMT_UUID: + minforkoff = roundup(sizeof(uuid_t), 8) >> 3; + return (offset >= minforkoff) ? minforkoff : 0; + } + + /* + * If the requested numbers of bytes is smaller or equal to the + * current attribute fork size we can always proceed. + * + * Note that if_bytes in the data fork might actually be larger than + * the current data fork size is due to delalloc extents. 
In that + * case either the extent count will go down when they are converted + * to real extents, or the delalloc conversion will take care of the + * literal area rebalancing. + */ + if (bytes <= XFS_IFORK_ASIZE(dp)) + return dp->i_d.di_forkoff; + + /* + * For attr2 we can try to move the forkoff if there is space in the + * literal area, but for the old format we are done if there is no + * space in the fixed attribute fork. + */ + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) + return 0; + + dsize = dp->i_df.if_bytes; + + switch (dp->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* + * If there is no attr fork and the data fork is extents, + * determine if creating the default attr fork will result + * in the extents form migrating to btree. If so, the + * minimum offset only needs to be the space required for + * the btree root. + */ + if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > + xfs_default_attroffset(dp)) + dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + break; + case XFS_DINODE_FMT_BTREE: + /* + * If we have a data btree then keep forkoff if we have one, + * otherwise we are adding a new attr, so then we set + * minforkoff to where the btree root can finish so we have + * plenty of room for attrs + */ + if (dp->i_d.di_forkoff) { + if (offset < dp->i_d.di_forkoff) + return 0; + return dp->i_d.di_forkoff; + } + dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + break; + } + + /* + * A data fork btree root must have space for at least + * MINDBTPTRS key/ptr pairs if the data fork is small or empty. + */ + minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + minforkoff = roundup(minforkoff, 8) >> 3; + + /* attr fork btree root can have at least this many key/ptr pairs */ + maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = maxforkoff >> 3; /* rounded down */ + + if (offset >= maxforkoff) + return maxforkoff; + if (offset >= minforkoff) + return offset; + return 0; +} + +/* + * Switch on the ATTR2 superblock bit (implies also FEATURES2) + */ +STATIC void +xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) +{ + if ((mp->m_flags & XFS_MOUNT_ATTR2) && + !(xfs_sb_version_hasattr2(&mp->m_sb))) { + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr2(&mp->m_sb)) { + xfs_sb_version_addattr2(&mp->m_sb); + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + } else + spin_unlock(&mp->m_sb_lock); + } +} + +/* + * Create the initial contents of a shortform attribute list. + */ +void +xfs_attr_shortform_create(xfs_da_args_t *args) +{ + xfs_attr_sf_hdr_t *hdr; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + dp = args->dp; + ASSERT(dp != NULL); + ifp = dp->i_afp; + ASSERT(ifp != NULL); + ASSERT(ifp->if_bytes == 0); + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) { + ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL; + ifp->if_flags |= XFS_IFINLINE; + } else { + ASSERT(ifp->if_flags & XFS_IFINLINE); + } + xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); + hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data; + hdr->count = 0; + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); +} + +/* + * Add a name/value pair to the shortform attribute list. + * Overflow from the inode has already been checked for. 
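+ * The caller (xfs_attr_shortform_addname) has already verified the new
+ * entry fits via xfs_attr_shortform_bytesfit(), so this routine only has
+ * to grow the inline fork and append the entry after the existing ones.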
+ */ +void +xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i, offset, size; + xfs_mount_t *mp; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + dp = args->dp; + mp = dp->i_mount; + dp->i_d.di_forkoff = forkoff; + dp->i_df.if_ext_max = + XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + dp->i_afp->if_ext_max = + XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + + ifp = dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { +#ifdef DEBUG + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + ASSERT(0); +#endif + } + + offset = (char *)sfe - (char *)sf; + size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset); + + sfe->namelen = args->namelen; + sfe->valuelen = args->valuelen; + sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + memcpy(sfe->nameval, args->name, args->namelen); + memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); + sf->hdr.count++; + be16_add_cpu(&sf->hdr.totsize, size); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + xfs_sbversion_add_attr2(mp, args->trans); +} + +/* + * After the last attribute is removed revert to original inode format, + * making all literal area available to the data fork once more. + */ +STATIC void +xfs_attr_fork_reset( + struct xfs_inode *ip, + struct xfs_trans *tp) +{ + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + ip->i_d.di_forkoff = 0; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_afp == NULL); + + ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* + * Remove an attribute from the shortform attribute list structure. 
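+ * Returns ENOATTR if the name is not present. Otherwise the hole left
+ * by the entry is closed up and, when the last attribute is removed on
+ * an attr2 filesystem, the attribute fork itself may be reset to an
+ * empty extents fork.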
+ */ +int +xfs_attr_shortform_remove(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int base, size=0, end, totsize, i; + xfs_mount_t *mp; + xfs_inode_t *dp; + + dp = args->dp; + mp = dp->i_mount; + base = sizeof(xfs_attr_sf_hdr_t); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + end = sf->hdr.count; + for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + base += size, i++) { + size = XFS_ATTR_SF_ENTSIZE(sfe); + if (sfe->namelen != args->namelen) + continue; + if (memcmp(sfe->nameval, args->name, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + break; + } + if (i == end) + return(XFS_ERROR(ENOATTR)); + + /* + * Fix up the attribute fork data, covering the hole + */ + end = base + size; + totsize = be16_to_cpu(sf->hdr.totsize); + if (end != totsize) + memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); + sf->hdr.count--; + be16_add_cpu(&sf->hdr.totsize, -size); + + /* + * Fix up the start offset of the attribute fork + */ + totsize -= size; + if (totsize == sizeof(xfs_attr_sf_hdr_t) && + (mp->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + !(args->op_flags & XFS_DA_OP_ADDNAME)) { + xfs_attr_fork_reset(dp, args->trans); + } else { + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); + ASSERT(dp->i_d.di_forkoff); + ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + (args->op_flags & XFS_DA_OP_ADDNAME) || + !(mp->m_flags & XFS_MOUNT_ATTR2) || + dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + dp->i_afp->if_ext_max = + XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + dp->i_df.if_ext_max = + XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + xfs_trans_log_inode(args->trans, dp, + XFS_ILOG_CORE | XFS_ILOG_ADATA); + } + + xfs_sbversion_add_attr2(mp, args->trans); + + return(0); +} + +/* + * Look up a name in a shortform attribute list structure. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_lookup(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + xfs_ifork_t *ifp; + + ifp = args->dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + return(XFS_ERROR(EEXIST)); + } + return(XFS_ERROR(ENOATTR)); +} + +/* + * Look up a name in a shortform attribute list structure. 
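The shortform add, remove and lookup routines above all walk the same packed array of variable-length entries by advancing with XFS_ATTR_SF_NEXTENTRY(). A minimal user-space sketch of that traversal pattern, using a simplified stand-in struct (this is not the on-disk xfs_attr_sf_entry_t layout):

#include <stddef.h>

struct sf_entry {
	unsigned char namelen;
	unsigned char valuelen;
	unsigned char flags;
	unsigned char nameval[];	/* namelen name bytes, then valuelen value bytes */
};

/* total bytes consumed by one packed entry */
static size_t sf_entsize(const struct sf_entry *e)
{
	return sizeof(*e) + e->namelen + e->valuelen;
}

/* step to the entry that immediately follows e in the packed array */
static struct sf_entry *sf_next(struct sf_entry *e)
{
	return (struct sf_entry *)((char *)e + sf_entsize(e));
}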
+ */ +/*ARGSUSED*/ +int +xfs_attr_shortform_getvalue(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + + ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) + continue; + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = sfe->valuelen; + return(XFS_ERROR(EEXIST)); + } + if (args->valuelen < sfe->valuelen) { + args->valuelen = sfe->valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = sfe->valuelen; + memcpy(args->value, &sfe->nameval[args->namelen], + args->valuelen); + return(XFS_ERROR(EEXIST)); + } + return(XFS_ERROR(ENOATTR)); +} + +/* + * Convert from using the shortform to the leaf. + */ +int +xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_da_args_t nargs; + char *tmpbuffer; + int error, i, size; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + xfs_ifork_t *ifp; + + dp = args->dp; + ifp = dp->i_afp; + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + size = be16_to_cpu(sf->hdr.totsize); + tmpbuffer = kmem_alloc(size, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memcpy(tmpbuffer, ifp->if_u1.if_data, size); + sf = (xfs_attr_shortform_t *)tmpbuffer; + + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + bp = NULL; + error = xfs_da_grow_inode(args, &blkno); + if (error) { + /* + * If we hit an IO error middle of the transaction inside + * grow_inode(), we may have inconsistent data. Bail out. + */ + if (error == EIO) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + goto out; + } + + ASSERT(blkno == 0); + error = xfs_attr_leaf_create(args, blkno, &bp); + if (error) { + error = xfs_da_shrink_inode(args, 0, bp); + bp = NULL; + if (error) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + goto out; + } + + memset((char *)&nargs, 0, sizeof(nargs)); + nargs.dp = dp; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.op_flags = XFS_DA_OP_OKNOENT; + + sfe = &sf->list[0]; + for (i = 0; i < sf->hdr.count; i++) { + nargs.name = sfe->nameval; + nargs.namelen = sfe->namelen; + nargs.value = &sfe->nameval[nargs.namelen]; + nargs.valuelen = sfe->valuelen; + nargs.hashval = xfs_da_hashname(sfe->nameval, + sfe->namelen); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); + error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ + ASSERT(error == ENOATTR); + error = xfs_attr_leaf_add(bp, &nargs); + ASSERT(error != ENOSPC); + if (error) + goto out; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + error = 0; + +out: + if(bp) + xfs_da_buf_done(bp); + kmem_free(tmpbuffer); + return(error); +} + +STATIC int +xfs_attr_shortform_compare(const void *a, const void *b) +{ + xfs_attr_sf_sort_t *sa, *sb; + + sa = (xfs_attr_sf_sort_t *)a; + sb = (xfs_attr_sf_sort_t *)b; + if (sa->hash < sb->hash) { + return(-1); + } else if (sa->hash > sb->hash) { + return(1); + } else { + return(sa->entno - sb->entno); + } +} + + +#define XFS_ISRESET_CURSOR(cursor) \ + 
(!((cursor)->initted) && !((cursor)->hashval) && \ + !((cursor)->blkno) && !((cursor)->offset)) +/* + * Copy out entries of shortform attribute lists for attr_list(). + * Shortform attribute lists are not stored in hashval sorted order. + * If the output buffer is not large enough to hold them all, then we + * we have to calculate each entries' hashvalue and sort them before + * we can begin returning them to the user. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_sf_sort_t *sbuf, *sbp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_inode_t *dp; + int sbsize, nsbuf, count, i; + int error; + + ASSERT(context != NULL); + dp = context->dp; + ASSERT(dp != NULL); + ASSERT(dp->i_afp != NULL); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + ASSERT(sf != NULL); + if (!sf->hdr.count) + return(0); + cursor = context->cursor; + ASSERT(cursor != NULL); + + trace_xfs_attr_list_sf(context); + + /* + * If the buffer is large enough and the cursor is at the start, + * do not bother with sorting since we will return everything in + * one buffer and another call using the cursor won't need to be + * made. + * Note the generous fudge factor of 16 overhead bytes per entry. + * If bufsize is zero then put_listent must be a search function + * and can just scan through what we have. + */ + if (context->bufsize == 0 || + (XFS_ISRESET_CURSOR(cursor) && + (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + error = context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen, + &sfe->nameval[sfe->namelen]); + + /* + * Either search callback finished early or + * didn't fit it all in the buffer after all. + */ + if (context->seen_enough) + break; + + if (error) + return error; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + trace_xfs_attr_list_sf_all(context); + return(0); + } + + /* do no more for a search callback */ + if (context->bufsize == 0) + return 0; + + /* + * It didn't all fit, so we have to sort everything on hashval. + */ + sbsize = sf->hdr.count * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + + /* + * Scan the attribute list for the rest of the entries, storing + * the relevant info from only those that match into a buffer. + */ + nsbuf = 0; + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (unlikely( + ((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, sfe); + kmem_free(sbuf); + return XFS_ERROR(EFSCORRUPTED); + } + + sbp->entno = i; + sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); + sbp->name = sfe->nameval; + sbp->namelen = sfe->namelen; + /* These are bytes, and both on-disk, don't endian-flip */ + sbp->valuelen = sfe->valuelen; + sbp->flags = sfe->flags; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sbp++; + nsbuf++; + } + + /* + * Sort the entries on hash then entno. + */ + xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); + + /* + * Re-find our place IN THE SORTED LIST. 
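A stand-alone sketch of the sort rule fed to xfs_sort() above: the primary key is the name hash, with ties broken by the original entry number so equal-hash entries keep a stable order across calls. Struct and function names here are illustrative, not the kernel's:

#include <stdlib.h>

struct sf_sort {
	unsigned int hash;	/* xfs_da_hashname() of the attribute name */
	int entno;		/* position in the unsorted shortform list */
};

static int sf_sort_cmp(const void *a, const void *b)
{
	const struct sf_sort *sa = a, *sb = b;

	if (sa->hash < sb->hash)
		return -1;
	if (sa->hash > sb->hash)
		return 1;
	return sa->entno - sb->entno;
}

/* usage, mirroring the call above: qsort(sbuf, nsbuf, sizeof(*sbuf), sf_sort_cmp); */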
+ */ + count = 0; + cursor->initted = 1; + cursor->blkno = 0; + for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { + if (sbp->hash == cursor->hashval) { + if (cursor->offset == count) { + break; + } + count++; + } else if (sbp->hash > cursor->hashval) { + break; + } + } + if (i == nsbuf) { + kmem_free(sbuf); + return(0); + } + + /* + * Loop putting entries into the user buffer. + */ + for ( ; i < nsbuf; i++, sbp++) { + if (cursor->hashval != sbp->hash) { + cursor->hashval = sbp->hash; + cursor->offset = 0; + } + error = context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen, + &sbp->name[sbp->namelen]); + if (error) + return error; + if (context->seen_enough) + break; + cursor->offset++; + } + + kmem_free(sbuf); + return(0); +} + +/* + * Check a leaf attribute block to see if all the entries would fit into + * a shortform attribute list. + */ +int +xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + int bytes, i; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + + entry = &leaf->entries[0]; + bytes = sizeof(struct xfs_attr_sf_hdr); + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (!(entry->flags & XFS_ATTR_LOCAL)) + return(0); + name_loc = xfs_attr_leaf_name_local(leaf, i); + if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) + return(0); + if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) + return(0); + bytes += sizeof(struct xfs_attr_sf_entry)-1 + + name_loc->namelen + + be16_to_cpu(name_loc->valuelen); + } + if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (bytes == sizeof(struct xfs_attr_sf_hdr))) + return(-1); + return(xfs_attr_shortform_bytesfit(dp, bytes)); +} + +/* + * Convert a leaf attribute list to shortform attribute list + */ +int +xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_da_args_t nargs; + xfs_inode_t *dp; + char *tmpbuffer; + int error, i; + + dp = args->dp; + tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); + ASSERT(tmpbuffer != NULL); + + ASSERT(bp != NULL); + memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount)); + leaf = (xfs_attr_leafblock_t *)tmpbuffer; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + memset(bp->data, 0, XFS_LBSIZE(dp->i_mount)); + + /* + * Clean out the prior contents of the attribute list. 
+ */ + error = xfs_da_shrink_inode(args, 0, bp); + if (error) + goto out; + + if (forkoff == -1) { + ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); + ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_reset(dp, args->trans); + goto out; + } + + xfs_attr_shortform_create(args); + + /* + * Copy the attributes + */ + memset((char *)&nargs, 0, sizeof(nargs)); + nargs.dp = dp; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.op_flags = XFS_DA_OP_OKNOENT; + entry = &leaf->entries[0]; + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (!entry->nameidx) + continue; + ASSERT(entry->flags & XFS_ATTR_LOCAL); + name_loc = xfs_attr_leaf_name_local(leaf, i); + nargs.name = name_loc->nameval; + nargs.namelen = name_loc->namelen; + nargs.value = &name_loc->nameval[nargs.namelen]; + nargs.valuelen = be16_to_cpu(name_loc->valuelen); + nargs.hashval = be32_to_cpu(entry->hashval); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); + xfs_attr_shortform_add(&nargs, forkoff); + } + error = 0; + +out: + kmem_free(tmpbuffer); + return(error); +} + +/* + * Convert from using a single leaf to a root node and a leaf. + */ +int +xfs_attr_leaf_to_node(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + xfs_inode_t *dp; + xfs_dabuf_t *bp1, *bp2; + xfs_dablk_t blkno; + int error; + + dp = args->dp; + bp1 = bp2 = NULL; + error = xfs_da_grow_inode(args, &blkno); + if (error) + goto out; + error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, + XFS_ATTR_FORK); + if (error) + goto out; + ASSERT(bp1 != NULL); + bp2 = NULL; + error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, + XFS_ATTR_FORK); + if (error) + goto out; + ASSERT(bp2 != NULL); + memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount)); + xfs_da_buf_done(bp1); + bp1 = NULL; + xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + + /* + * Set up the new root node. + */ + error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); + if (error) + goto out; + node = bp1->data; + leaf = bp2->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + /* both on-disk, don't endian-flip twice */ + node->btree[0].hashval = + leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; + node->btree[0].before = cpu_to_be32(blkno); + node->hdr.count = cpu_to_be16(1); + xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + error = 0; +out: + if (bp1) + xfs_da_buf_done(bp1); + if (bp2) + xfs_da_buf_done(bp2); + return(error); +} + + +/*======================================================================== + * Routines used for growing the Btree. + *========================================================================*/ + +/* + * Create the initial contents of a leaf attribute list + * or a leaf in a node attribute list. 
+ */ +STATIC int +xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int error; + + dp = args->dp; + ASSERT(dp != NULL); + error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + leaf = bp->data; + memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); + hdr = &leaf->hdr; + hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); + hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount)); + if (!hdr->firstused) { + hdr->firstused = cpu_to_be16( + XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN); + } + + hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); + hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - + sizeof(xfs_attr_leaf_hdr_t)); + + xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + + *bpp = bp; + return(0); +} + +/* + * Split the leaf node, rebalance, then add the new entry. + */ +int +xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk) +{ + xfs_dablk_t blkno; + int error; + + /* + * Allocate space for a new leaf node. + */ + ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return(error); + error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp); + if (error) + return(error); + newblk->blkno = blkno; + newblk->magic = XFS_ATTR_LEAF_MAGIC; + + /* + * Rebalance the entries across the two leaves. + * NOTE: rebalance() currently depends on the 2nd block being empty. + */ + xfs_attr_leaf_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) + return(error); + + /* + * Save info on "old" attribute for "atomic rename" ops, leaf_add() + * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the + * "new" attrs info. Will need the "old" info to remove it later. + * + * Insert the "new" entry in the correct block. + */ + if (state->inleaf) + error = xfs_attr_leaf_add(oldblk->bp, state->args); + else + error = xfs_attr_leaf_add(newblk->bp, state->args); + + /* + * Update last hashval in each block since we added the name. + */ + oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); + return(error); +} + +/* + * Add a name to the leaf attribute list structure. + */ +int +xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_attr_leaf_map_t *map; + int tablesize, entsize, sum, tmp, i; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT((args->index >= 0) + && (args->index <= be16_to_cpu(leaf->hdr.count))); + hdr = &leaf->hdr; + entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen, + args->trans->t_mountp->m_sb.sb_blocksize, NULL); + + /* + * Search through freemap for first-fit on new name length. 
+ * (may need to figure in size of entry struct too) + */ + tablesize = (be16_to_cpu(hdr->count) + 1) + * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1]; + for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) { + if (tablesize > be16_to_cpu(hdr->firstused)) { + sum += be16_to_cpu(map->size); + continue; + } + if (!map->size) + continue; /* no space in this map */ + tmp = entsize; + if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused)) + tmp += sizeof(xfs_attr_leaf_entry_t); + if (be16_to_cpu(map->size) >= tmp) { + tmp = xfs_attr_leaf_add_work(bp, args, i); + return(tmp); + } + sum += be16_to_cpu(map->size); + } + + /* + * If there are no holes in the address space of the block, + * and we don't have enough freespace, then compaction will do us + * no good and we should just give up. + */ + if (!hdr->holes && (sum < entsize)) + return(XFS_ERROR(ENOSPC)); + + /* + * Compact the entries to coalesce free space. + * This may change the hdr->count via dropping INCOMPLETE entries. + */ + xfs_attr_leaf_compact(args->trans, bp); + + /* + * After compaction, the block is guaranteed to have only one + * free region, in freemap[0]. If it is not big enough, give up. + */ + if (be16_to_cpu(hdr->freemap[0].size) + < (entsize + sizeof(xfs_attr_leaf_entry_t))) + return(XFS_ERROR(ENOSPC)); + + return(xfs_attr_leaf_add_work(bp, args, 0)); +} + +/* + * Add a name to a leaf attribute list structure. + */ +STATIC int +xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_attr_leaf_map_t *map; + xfs_mount_t *mp; + int tmp, i; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + hdr = &leaf->hdr; + ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); + ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count))); + + /* + * Force open some space in the entry array and fill it in. + */ + entry = &leaf->entries[args->index]; + if (args->index < be16_to_cpu(hdr->count)) { + tmp = be16_to_cpu(hdr->count) - args->index; + tmp *= sizeof(xfs_attr_leaf_entry_t); + memmove((char *)(entry+1), (char *)entry, tmp); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); + } + be16_add_cpu(&hdr->count, 1); + + /* + * Allocate space for the new string (at the end of the run). + */ + map = &hdr->freemap[mapindex]; + mp = args->trans->t_mountp; + ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); + ASSERT((be16_to_cpu(map->base) & 0x3) == 0); + ASSERT(be16_to_cpu(map->size) >= + xfs_attr_leaf_newentsize(args->namelen, args->valuelen, + mp->m_sb.sb_blocksize, NULL)); + ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); + ASSERT((be16_to_cpu(map->size) & 0x3) == 0); + be16_add_cpu(&map->size, + -xfs_attr_leaf_newentsize(args->namelen, args->valuelen, + mp->m_sb.sb_blocksize, &tmp)); + entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) + + be16_to_cpu(map->size)); + entry->hashval = cpu_to_be32(args->hashval); + entry->flags = tmp ? 
XFS_ATTR_LOCAL : 0; + entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + if (args->op_flags & XFS_DA_OP_RENAME) { + entry->flags |= XFS_ATTR_INCOMPLETE; + if ((args->blkno2 == args->blkno) && + (args->index2 <= args->index)) { + args->index2++; + } + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + ASSERT((args->index == 0) || + (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); + ASSERT((args->index == be16_to_cpu(hdr->count)-1) || + (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); + + /* + * Copy the attribute name and value into the new space. + * + * For "remote" attribute values, simply note that we need to + * allocate space for the "remote" value. We can't actually + * allocate the extents in this transaction, and we can't decide + * which blocks they should be as we might allocate more blocks + * as part of this transaction (a split operation for example). + */ + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc->namelen = args->namelen; + name_loc->valuelen = cpu_to_be16(args->valuelen); + memcpy((char *)name_loc->nameval, args->name, args->namelen); + memcpy((char *)&name_loc->nameval[args->namelen], args->value, + be16_to_cpu(name_loc->valuelen)); + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt->namelen = args->namelen; + memcpy((char *)name_rmt->name, args->name, args->namelen); + entry->flags |= XFS_ATTR_INCOMPLETE; + /* just in case */ + name_rmt->valuelen = 0; + name_rmt->valueblk = 0; + args->rmtblkno = 1; + args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + xfs_attr_leaf_entsize(leaf, args->index))); + + /* + * Update the control info for this leaf node + */ + if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) { + /* both on-disk, don't endian-flip twice */ + hdr->firstused = entry->nameidx; + } + ASSERT(be16_to_cpu(hdr->firstused) >= + ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); + tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + map = &hdr->freemap[0]; + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { + if (be16_to_cpu(map->base) == tmp) { + be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t)); + be16_add_cpu(&map->size, + -((int)sizeof(xfs_attr_leaf_entry_t))); + } + } + be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + return(0); +} + +/* + * Garbage collect a leaf attribute list block by copying it to a new buffer. 
+ */ +STATIC void +xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) +{ + xfs_attr_leafblock_t *leaf_s, *leaf_d; + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + xfs_mount_t *mp; + char *tmpbuffer; + + mp = trans->t_mountp; + tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp)); + memset(bp->data, 0, XFS_LBSIZE(mp)); + + /* + * Copy basic information + */ + leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_d = bp->data; + hdr_s = &leaf_s->hdr; + hdr_d = &leaf_d->hdr; + hdr_d->info = hdr_s->info; /* struct copy */ + hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp)); + /* handle truncation gracefully */ + if (!hdr_d->firstused) { + hdr_d->firstused = cpu_to_be16( + XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN); + } + hdr_d->usedbytes = 0; + hdr_d->count = 0; + hdr_d->holes = 0; + hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); + hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - + sizeof(xfs_attr_leaf_hdr_t)); + + /* + * Copy all entry's in the same (sorted) order, + * but allocate name/value pairs packed and in sequence. + */ + xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, + be16_to_cpu(hdr_s->count), mp); + xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + + kmem_free(tmpbuffer); +} + +/* + * Redistribute the attribute list entries between two leaf nodes, + * taking into account the size of the new entry. + * + * NOTE: if new block is empty, then it will get the upper half of the + * old block. At present, all (one) callers pass in an empty second block. + * + * This code adjusts the args->index/blkno and args->index2/blkno2 fields + * to match what it is doing in splitting the attribute leaf block. Those + * values are used in "atomic rename" operations on attributes. Note that + * the "new" and "old" values can end up in different blocks. + */ +STATIC void +xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_args_t *args; + xfs_da_state_blk_t *tmp_blk; + xfs_attr_leafblock_t *leaf1, *leaf2; + xfs_attr_leaf_hdr_t *hdr1, *hdr2; + int count, totallen, max, space, swap; + + /* + * Set up environment. + */ + ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + args = state->args; + + /* + * Check ordering of blocks, reverse if it makes things simpler. + * + * NOTE: Given that all (current) callers pass in an empty + * second block, this code should never set "swap". + */ + swap = 0; + if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) { + tmp_blk = blk1; + blk1 = blk2; + blk2 = tmp_blk; + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + swap = 1; + } + hdr1 = &leaf1->hdr; + hdr2 = &leaf2->hdr; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. Then get + * the direction to copy and the number of elements to move. + * + * "inleaf" is true if the new entry should be inserted into blk1. + * If "swap" is also true, then reverse the sense of "inleaf". 
+ */ + state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2, + &count, &totallen); + if (swap) + state->inleaf = !state->inleaf; + + /* + * Move any entries required from leaf to leaf: + */ + if (count < be16_to_cpu(hdr1->count)) { + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count = be16_to_cpu(hdr1->count) - count; + space = be16_to_cpu(hdr1->usedbytes) - totallen; + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf2 is the destination, compact it if it looks tight. + */ + max = be16_to_cpu(hdr2->firstused) + - sizeof(xfs_attr_leaf_hdr_t); + max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); + if (space > max) { + xfs_attr_leaf_compact(args->trans, blk2->bp); + } + + /* + * Move high entries from leaf1 to low end of leaf2. + */ + xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, + leaf2, 0, count, state->mp); + + xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + } else if (count > be16_to_cpu(hdr1->count)) { + /* + * I assert that since all callers pass in an empty + * second buffer, this code should never execute. + */ + + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count -= be16_to_cpu(hdr1->count); + space = totallen - be16_to_cpu(hdr1->usedbytes); + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf1 is the destination, compact it if it looks tight. + */ + max = be16_to_cpu(hdr1->firstused) + - sizeof(xfs_attr_leaf_hdr_t); + max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); + if (space > max) { + xfs_attr_leaf_compact(args->trans, blk1->bp); + } + + /* + * Move low entries from leaf2 to high end of leaf1. + */ + xfs_attr_leaf_moveents(leaf2, 0, leaf1, + be16_to_cpu(hdr1->count), count, state->mp); + + xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + } + + /* + * Copy out last hashval in each block for B-tree code. + */ + blk1->hashval = be32_to_cpu( + leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval); + blk2->hashval = be32_to_cpu( + leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval); + + /* + * Adjust the expected index for insertion. + * NOTE: this code depends on the (current) situation that the + * second block was originally empty. + * + * If the insertion point moved to the 2nd block, we must adjust + * the index. We must also track the entry just following the + * new entry for use in an "atomic rename" operation, that entry + * is always the "old" entry and the "new" entry is what we are + * inserting. The index/blkno fields refer to the "old" entry, + * while the index2/blkno2 fields refer to the "new" entry. 
+ */ + if (blk1->index > be16_to_cpu(leaf1->hdr.count)) { + ASSERT(state->inleaf == 0); + blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + args->index = args->index2 = blk2->index; + args->blkno = args->blkno2 = blk2->blkno; + } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) { + if (state->inleaf) { + args->index = blk1->index; + args->blkno = blk1->blkno; + args->index2 = 0; + args->blkno2 = blk2->blkno; + } else { + blk2->index = blk1->index + - be16_to_cpu(leaf1->hdr.count); + args->index = args->index2 = blk2->index; + args->blkno = args->blkno2 = blk2->blkno; + } + } else { + ASSERT(state->inleaf == 1); + args->index = args->index2 = blk1->index; + args->blkno = args->blkno2 = blk1->blkno; + } +} + +/* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + * GROT: Is this really necessary? With other than a 512 byte blocksize, + * GROT: there will always be enough room in either block for a new entry. + * GROT: Do a double-split for this case? + */ +STATIC int +xfs_attr_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2, + int *countarg, int *usedbytesarg) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + xfs_attr_leaf_hdr_t *hdr1, *hdr2; + xfs_attr_leaf_entry_t *entry; + int count, max, index, totallen, half; + int lastdelta, foundit, tmp; + + /* + * Set up environment. + */ + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + hdr1 = &leaf1->hdr; + hdr2 = &leaf2->hdr; + foundit = 0; + totallen = 0; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + */ + max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count); + half = (max+1) * sizeof(*entry); + half += be16_to_cpu(hdr1->usedbytes) + + be16_to_cpu(hdr2->usedbytes) + + xfs_attr_leaf_newentsize( + state->args->namelen, + state->args->valuelen, + state->blocksize, NULL); + half /= 2; + lastdelta = state->blocksize; + entry = &leaf1->entries[0]; + for (count = index = 0; count < max; entry++, index++, count++) { + +#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) + /* + * The new entry is in the first block, account for it. + */ + if (count == blk1->index) { + tmp = totallen + sizeof(*entry) + + xfs_attr_leaf_newentsize( + state->args->namelen, + state->args->valuelen, + state->blocksize, NULL); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; + foundit = 1; + } + + /* + * Wrap around into the second block if necessary. + */ + if (count == be16_to_cpu(hdr1->count)) { + leaf1 = leaf2; + entry = &leaf1->entries[0]; + index = 0; + } + + /* + * Figure out if next leaf entry would be too much. + */ + tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1, + index); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; +#undef XFS_ATTR_ABS + } + + /* + * Calculate the number of usedbytes that will end up in lower block. + * If new entry not in lower block, fix up the count. + */ + totallen -= count * sizeof(*entry); + if (foundit) { + totallen -= sizeof(*entry) + + xfs_attr_leaf_newentsize( + state->args->namelen, + state->args->valuelen, + state->blocksize, NULL); + } + + *countarg = count; + *usedbytesarg = totallen; + return(foundit); +} + +/*======================================================================== + * Routines used for shrinking the Btree. 
+ *========================================================================*/ + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + * + * GROT: allow for INCOMPLETE entries in calculation. + */ +int +xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) +{ + xfs_attr_leafblock_t *leaf; + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + int count, bytes, forward, error, retval, i; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->data; + ASSERT(be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC); + leaf = (xfs_attr_leafblock_t *)info; + count = be16_to_cpu(leaf->hdr.count); + bytes = sizeof(xfs_attr_leaf_hdr_t) + + count * sizeof(xfs_attr_leaf_entry_t) + + be16_to_cpu(leaf->hdr.usedbytes); + if (bytes > (state->blocksize >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return(0); + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (info->forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 2; + } + return(0); + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink an attribute list over time. + */ + /* start with smaller blk num */ + forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + for (i = 0; i < 2; forward = !forward, i++) { + if (forward) + blkno = be32_to_cpu(info->forw); + else + blkno = be32_to_cpu(info->back); + if (blkno == 0) + continue; + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + leaf = (xfs_attr_leafblock_t *)info; + count = be16_to_cpu(leaf->hdr.count); + bytes = state->blocksize - (state->blocksize>>2); + bytes -= be16_to_cpu(leaf->hdr.usedbytes); + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + count += be16_to_cpu(leaf->hdr.count); + bytes -= be16_to_cpu(leaf->hdr.usedbytes); + bytes -= count * sizeof(xfs_attr_leaf_entry_t); + bytes -= sizeof(xfs_attr_leaf_hdr_t); + xfs_da_brelse(state->args->trans, bp); + if (bytes >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { + *action = 0; + return(0); + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. 
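The sibling scan in xfs_attr_leaf_toosmall() above reduces to one budget test: merge two leaves only if their combined entry table and name/value bytes fit in 75% of a single block, i.e. with at least 25% left to spare. A hedged stand-alone restatement of that test (function and parameter names are illustrative):

static int leaves_can_merge(int blocksize, int hdr_bytes, int entry_bytes,
			    int total_entries, int used1, int used2)
{
	int budget = blocksize - (blocksize >> 2);	/* 75% of one block */

	budget -= hdr_bytes;			/* one leaf header remains */
	budget -= total_entries * entry_bytes;	/* combined entry table */
	budget -= used1 + used2;		/* combined name/value bytes */
	return budget >= 0;
}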
+ */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + } else { + error = xfs_da_path_shift(state, &state->path, forward, + 0, &retval); + } + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 1; + } + return(0); +} + +/* + * Remove a name from the leaf attribute list structure. + * + * Return 1 if leaf is less than 37% full, 0 if >= 37% full. + * If two leaves are 37% full, when combined they will leave 25% free. + */ +int +xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_attr_leaf_map_t *map; + xfs_attr_leaf_entry_t *entry; + int before, after, smallest, entsize; + int tablesize, tmp, i; + xfs_mount_t *mp; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + hdr = &leaf->hdr; + mp = args->trans->t_mountp; + ASSERT((be16_to_cpu(hdr->count) > 0) + && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8))); + ASSERT((args->index >= 0) + && (args->index < be16_to_cpu(hdr->count))); + ASSERT(be16_to_cpu(hdr->firstused) >= + ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); + entry = &leaf->entries[args->index]; + ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused)); + ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + + /* + * Scan through free region table: + * check for adjacency of free'd entry with an existing one, + * find smallest free region in case we need to replace it, + * adjust any map that borders the entry table, + */ + tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + map = &hdr->freemap[0]; + tmp = be16_to_cpu(map->size); + before = after = -1; + smallest = XFS_ATTR_LEAF_MAPSIZE - 1; + entsize = xfs_attr_leaf_entsize(leaf, args->index); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { + ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); + ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); + if (be16_to_cpu(map->base) == tablesize) { + be16_add_cpu(&map->base, + -((int)sizeof(xfs_attr_leaf_entry_t))); + be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t)); + } + + if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) + == be16_to_cpu(entry->nameidx)) { + before = i; + } else if (be16_to_cpu(map->base) + == (be16_to_cpu(entry->nameidx) + entsize)) { + after = i; + } else if (be16_to_cpu(map->size) < tmp) { + tmp = be16_to_cpu(map->size); + smallest = i; + } + } + + /* + * Coalesce adjacent freemap regions, + * or replace the smallest region. + */ + if ((before >= 0) || (after >= 0)) { + if ((before >= 0) && (after >= 0)) { + map = &hdr->freemap[before]; + be16_add_cpu(&map->size, entsize); + be16_add_cpu(&map->size, + be16_to_cpu(hdr->freemap[after].size)); + hdr->freemap[after].base = 0; + hdr->freemap[after].size = 0; + } else if (before >= 0) { + map = &hdr->freemap[before]; + be16_add_cpu(&map->size, entsize); + } else { + map = &hdr->freemap[after]; + /* both on-disk, don't endian flip twice */ + map->base = entry->nameidx; + be16_add_cpu(&map->size, entsize); + } + } else { + /* + * Replace smallest region (if it is smaller than free'd entry) + */ + map = &hdr->freemap[smallest]; + if (be16_to_cpu(map->size) < entsize) { + map->base = cpu_to_be16(be16_to_cpu(entry->nameidx)); + map->size = cpu_to_be16(entsize); + } + } + + /* + * Did we remove the first entry? 
+ */ + if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused)) + smallest = 1; + else + smallest = 0; + + /* + * Compress the remaining entries and zero out the removed stuff. + */ + memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize); + be16_add_cpu(&hdr->usedbytes, -entsize); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + entsize)); + + tmp = (be16_to_cpu(hdr->count) - args->index) + * sizeof(xfs_attr_leaf_entry_t); + memmove((char *)entry, (char *)(entry+1), tmp); + be16_add_cpu(&hdr->count, -1); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); + entry = &leaf->entries[be16_to_cpu(hdr->count)]; + memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); + + /* + * If we removed the first entry, re-find the first used byte + * in the name area. Note that if the entry was the "firstused", + * then we don't have a "hole" in our block resulting from + * removing the name. + */ + if (smallest) { + tmp = XFS_LBSIZE(mp); + entry = &leaf->entries[0]; + for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) { + ASSERT(be16_to_cpu(entry->nameidx) >= + be16_to_cpu(hdr->firstused)); + ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + + if (be16_to_cpu(entry->nameidx) < tmp) + tmp = be16_to_cpu(entry->nameidx); + } + hdr->firstused = cpu_to_be16(tmp); + if (!hdr->firstused) { + hdr->firstused = cpu_to_be16( + tmp - XFS_ATTR_LEAF_NAME_ALIGN); + } + } else { + hdr->holes = 1; /* mark as needing compaction */ + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + + /* + * Check if leaf is less than 50% full, caller may want to + * "join" the leaf with a sibling if so. + */ + tmp = sizeof(xfs_attr_leaf_hdr_t); + tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t); + tmp += be16_to_cpu(leaf->hdr.usedbytes); + return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */ +} + +/* + * Move all the attribute list entries from drop_leaf into save_leaf. + */ +void +xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; + xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; + xfs_mount_t *mp; + char *tmpbuffer; + + /* + * Set up environment. + */ + mp = state->mp; + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); + drop_leaf = drop_blk->bp->data; + save_leaf = save_blk->bp->data; + ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + drop_hdr = &drop_leaf->hdr; + save_hdr = &save_leaf->hdr; + + /* + * Save last hashval from dying block for later Btree fixup. + */ + drop_blk->hashval = be32_to_cpu( + drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval); + + /* + * Check if we need a temp buffer, or can we do it in place. + * Note that we don't check "leaf" for holes because we will + * always be dropping it, toosmall() decided that for us already. + */ + if (save_hdr->holes == 0) { + /* + * dest leaf has no holes, so we add there. May need + * to make some room in the entry array. 
+ */ + if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0, + be16_to_cpu(drop_hdr->count), mp); + } else { + xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, + be16_to_cpu(save_hdr->count), + be16_to_cpu(drop_hdr->count), mp); + } + } else { + /* + * Destination has holes, so we make a temporary copy + * of the leaf and add them both to that. + */ + tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memset(tmpbuffer, 0, state->blocksize); + tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer; + tmp_hdr = &tmp_leaf->hdr; + tmp_hdr->info = save_hdr->info; /* struct copy */ + tmp_hdr->count = 0; + tmp_hdr->firstused = cpu_to_be16(state->blocksize); + if (!tmp_hdr->firstused) { + tmp_hdr->firstused = cpu_to_be16( + state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN); + } + tmp_hdr->usedbytes = 0; + if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, + be16_to_cpu(drop_hdr->count), mp); + xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, + be16_to_cpu(tmp_leaf->hdr.count), + be16_to_cpu(save_hdr->count), mp); + } else { + xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0, + be16_to_cpu(save_hdr->count), mp); + xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, + be16_to_cpu(tmp_leaf->hdr.count), + be16_to_cpu(drop_hdr->count), mp); + } + memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); + kmem_free(tmpbuffer); + } + + xfs_da_log_buf(state->args->trans, save_blk->bp, 0, + state->blocksize - 1); + + /* + * Copy out last hashval in each block for B-tree code. + */ + save_blk->hashval = be32_to_cpu( + save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Look up a name in a leaf attribute list structure. + * This is the internal routine, it uses the caller's buffer. + * + * Note that duplicate keys are allowed, but only check within the + * current leaf node. The Btree code must check in adjacent leaf nodes. + * + * Return in args->index the index into the entry[] array of either + * the found entry, or where the entry should have been (insert before + * that entry). + * + * Don't change the args->value unless we find the attribute. + */ +int +xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int probe, span; + xfs_dahash_t hashval; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(leaf->hdr.count) + < (XFS_LBSIZE(args->dp->i_mount)/8)); + + /* + * Binary search. (note: small blocks will skip this loop) + */ + hashval = args->hashval; + probe = span = be16_to_cpu(leaf->hdr.count) / 2; + for (entry = &leaf->entries[probe]; span > 4; + entry = &leaf->entries[probe]) { + span /= 2; + if (be32_to_cpu(entry->hashval) < hashval) + probe += span; + else if (be32_to_cpu(entry->hashval) > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && + (!leaf->hdr.count + || (probe < be16_to_cpu(leaf->hdr.count)))); + ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); + + /* + * Since we may have duplicate hashval's, find the first matching + * hashval in the leaf. 
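The lookup here narrows in with a coarse binary search, and the loops that follow then step to the first entry carrying the wanted hash, since duplicate hashvals are allowed within a leaf. A minimal sketch of that "find the first duplicate" pattern over a plain sorted array (an assumed helper, not the kernel routine):

/* return the index of the first element equal to want, or -1 if absent */
static int find_first_hash(const unsigned int *hash, int count,
			   unsigned int want)
{
	int lo = 0, hi = count - 1, first = -1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (hash[mid] < want)
			lo = mid + 1;
		else if (hash[mid] > want)
			hi = mid - 1;
		else {
			first = mid;	/* remember it, keep searching left */
			hi = mid - 1;
		}
	}
	return first;
}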
+ */ + while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) { + entry--; + probe--; + } + while ((probe < be16_to_cpu(leaf->hdr.count)) && + (be32_to_cpu(entry->hashval) < hashval)) { + entry++; + probe++; + } + if ((probe == be16_to_cpu(leaf->hdr.count)) || + (be32_to_cpu(entry->hashval) != hashval)) { + args->index = probe; + return(XFS_ERROR(ENOATTR)); + } + + /* + * Duplicate keys may be present, so search all of them for a match. + */ + for ( ; (probe < be16_to_cpu(leaf->hdr.count)) && + (be32_to_cpu(entry->hashval) == hashval); + entry++, probe++) { +/* + * GROT: Add code to remove incomplete entries. + */ + /* + * If we are looking for INCOMPLETE entries, show only those. + * If we are looking for complete entries, show only those. + */ + if ((args->flags & XFS_ATTR_INCOMPLETE) != + (entry->flags & XFS_ATTR_INCOMPLETE)) { + continue; + } + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, probe); + if (name_loc->namelen != args->namelen) + continue; + if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, entry->flags)) + continue; + args->index = probe; + return(XFS_ERROR(EEXIST)); + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, probe); + if (name_rmt->namelen != args->namelen) + continue; + if (memcmp(args->name, (char *)name_rmt->name, + args->namelen) != 0) + continue; + if (!xfs_attr_namesp_match(args->flags, entry->flags)) + continue; + args->index = probe; + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + return(XFS_ERROR(EEXIST)); + } + } + args->index = probe; + return(XFS_ERROR(ENOATTR)); +} + +/* + * Get the value associated with an attribute name from a leaf attribute + * list structure. + */ +int +xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + int valuelen; + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(leaf->hdr.count) + < (XFS_LBSIZE(args->dp->i_mount)/8)); + ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + + entry = &leaf->entries[args->index]; + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, args->index); + ASSERT(name_loc->namelen == args->namelen); + ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); + valuelen = be16_to_cpu(name_loc->valuelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return(0); + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = valuelen; + memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + ASSERT(name_rmt->namelen == args->namelen); + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); + valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return(0); + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = valuelen; + } + return(0); +} + +/*======================================================================== + * Utility routines. 
+ *========================================================================*/ + +/* + * Move the indicated entries from one leaf to another. + * NOTE: this routine modifies both source and destination leaves. + */ +/*ARGSUSED*/ +STATIC void +xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, + xfs_attr_leafblock_t *leaf_d, int start_d, + int count, xfs_mount_t *mp) +{ + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + xfs_attr_leaf_entry_t *entry_s, *entry_d; + int desti, tmp, i; + + /* + * Check for nothing to do. + */ + if (count == 0) + return; + + /* + * Set up environment. + */ + ASSERT(be16_to_cpu(leaf_s->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(be16_to_cpu(leaf_d->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + hdr_s = &leaf_s->hdr; + hdr_d = &leaf_d->hdr; + ASSERT((be16_to_cpu(hdr_s->count) > 0) && + (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8))); + ASSERT(be16_to_cpu(hdr_s->firstused) >= + ((be16_to_cpu(hdr_s->count) + * sizeof(*entry_s))+sizeof(*hdr_s))); + ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8)); + ASSERT(be16_to_cpu(hdr_d->firstused) >= + ((be16_to_cpu(hdr_d->count) + * sizeof(*entry_d))+sizeof(*hdr_d))); + + ASSERT(start_s < be16_to_cpu(hdr_s->count)); + ASSERT(start_d <= be16_to_cpu(hdr_d->count)); + ASSERT(count <= be16_to_cpu(hdr_s->count)); + + /* + * Move the entries in the destination leaf up to make a hole? + */ + if (start_d < be16_to_cpu(hdr_d->count)) { + tmp = be16_to_cpu(hdr_d->count) - start_d; + tmp *= sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_d->entries[start_d]; + entry_d = &leaf_d->entries[start_d + count]; + memmove((char *)entry_d, (char *)entry_s, tmp); + } + + /* + * Copy all entry's in the same (sorted) order, + * but allocate attribute info packed and in sequence. + */ + entry_s = &leaf_s->entries[start_s]; + entry_d = &leaf_d->entries[start_d]; + desti = start_d; + for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { + ASSERT(be16_to_cpu(entry_s->nameidx) + >= be16_to_cpu(hdr_s->firstused)); + tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); +#ifdef GROT + /* + * Code to drop INCOMPLETE entries. Difficult to use as we + * may also need to change the insertion index. Code turned + * off for 6.2, should be revisited later. + */ + if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ + memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); + be16_add_cpu(&hdr_s->usedbytes, -tmp); + be16_add_cpu(&hdr_s->count, -1); + entry_d--; /* to compensate for ++ in loop hdr */ + desti--; + if ((start_s + i) < offset) + result++; /* insertion index adjustment */ + } else { +#endif /* GROT */ + be16_add_cpu(&hdr_d->firstused, -tmp); + /* both on-disk, don't endian flip twice */ + entry_d->hashval = entry_s->hashval; + /* both on-disk, don't endian flip twice */ + entry_d->nameidx = hdr_d->firstused; + entry_d->flags = entry_s->flags; + ASSERT(be16_to_cpu(entry_d->nameidx) + tmp + <= XFS_LBSIZE(mp)); + memmove(xfs_attr_leaf_name(leaf_d, desti), + xfs_attr_leaf_name(leaf_s, start_s + i), tmp); + ASSERT(be16_to_cpu(entry_s->nameidx) + tmp + <= XFS_LBSIZE(mp)); + memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); + be16_add_cpu(&hdr_s->usedbytes, -tmp); + be16_add_cpu(&hdr_d->usedbytes, tmp); + be16_add_cpu(&hdr_s->count, -1); + be16_add_cpu(&hdr_d->count, 1); + tmp = be16_to_cpu(hdr_d->count) + * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp); +#ifdef GROT + } +#endif /* GROT */ + } + + /* + * Zero out the entries we just copied. 
+ */ + if (start_s == be16_to_cpu(hdr_s->count)) { + tmp = count * sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_s->entries[start_s]; + ASSERT(((char *)entry_s + tmp) <= + ((char *)leaf_s + XFS_LBSIZE(mp))); + memset((char *)entry_s, 0, tmp); + } else { + /* + * Move the remaining entries down to fill the hole, + * then zero the entries at the top. + */ + tmp = be16_to_cpu(hdr_s->count) - count; + tmp *= sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_s->entries[start_s + count]; + entry_d = &leaf_s->entries[start_s]; + memmove((char *)entry_d, (char *)entry_s, tmp); + + tmp = count * sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)]; + ASSERT(((char *)entry_s + tmp) <= + ((char *)leaf_s + XFS_LBSIZE(mp))); + memset((char *)entry_s, 0, tmp); + } + + /* + * Fill in the freemap information + */ + hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); + be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) * + sizeof(xfs_attr_leaf_entry_t)); + hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) + - be16_to_cpu(hdr_d->freemap[0].base)); + hdr_d->freemap[1].base = 0; + hdr_d->freemap[2].base = 0; + hdr_d->freemap[1].size = 0; + hdr_d->freemap[2].size = 0; + hdr_s->holes = 1; /* leaf may not be compact */ +} + +/* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +int +xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + + leaf1 = leaf1_bp->data; + leaf2 = leaf2_bp->data; + ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC) && + (be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC)); + if ((be16_to_cpu(leaf1->hdr.count) > 0) && + (be16_to_cpu(leaf2->hdr.count) > 0) && + ((be32_to_cpu(leaf2->entries[0].hashval) < + be32_to_cpu(leaf1->entries[0].hashval)) || + (be32_to_cpu(leaf2->entries[ + be16_to_cpu(leaf2->hdr.count)-1].hashval) < + be32_to_cpu(leaf1->entries[ + be16_to_cpu(leaf1->hdr.count)-1].hashval)))) { + return(1); + } + return(0); +} + +/* + * Pick up the last hashvalue from a leaf block. + */ +xfs_dahash_t +xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) +{ + xfs_attr_leafblock_t *leaf; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + if (count) + *count = be16_to_cpu(leaf->hdr.count); + if (!leaf->hdr.count) + return(0); + return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval); +} + +/* + * Calculate the number of bytes used to store the indicated attribute + * (whether local or remote only calculate bytes in this block). + */ +STATIC int +xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) +{ + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int size; + + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, index); + size = xfs_attr_leaf_entsize_local(name_loc->namelen, + be16_to_cpu(name_loc->valuelen)); + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, index); + size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); + } + return(size); +} + +/* + * Calculate the number of bytes that would be required to store the new + * attribute (whether local or remote only calculate bytes in this block). + * This routine decides as a side effect whether the attribute will be + * a "local" or a "remote" attribute. 
+ */ +int +xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) +{ + int size; + + size = xfs_attr_leaf_entsize_local(namelen, valuelen); + if (size < xfs_attr_leaf_entsize_local_max(blocksize)) { + if (local) { + *local = 1; + } + } else { + size = xfs_attr_leaf_entsize_remote(namelen); + if (local) { + *local = 0; + } + } + return(size); +} + +/* + * Copy out attribute list entries for attr_list(), for leaf attribute lists. + */ +int +xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + int retval, i; + + ASSERT(bp != NULL); + leaf = bp->data; + cursor = context->cursor; + cursor->initted = 1; + + trace_xfs_attr_list_leaf(context); + + /* + * Re-find our place in the leaf block if this is a new syscall. + */ + if (context->resynch) { + entry = &leaf->entries[0]; + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (be32_to_cpu(entry->hashval) == cursor->hashval) { + if (cursor->offset == context->dupcnt) { + context->dupcnt = 0; + break; + } + context->dupcnt++; + } else if (be32_to_cpu(entry->hashval) > + cursor->hashval) { + context->dupcnt = 0; + break; + } + } + if (i == be16_to_cpu(leaf->hdr.count)) { + trace_xfs_attr_list_notfound(context); + return(0); + } + } else { + entry = &leaf->entries[0]; + i = 0; + } + context->resynch = 0; + + /* + * We have found our place, start copying out the new attributes. + */ + retval = 0; + for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { + if (be32_to_cpu(entry->hashval) != cursor->hashval) { + cursor->hashval = be32_to_cpu(entry->hashval); + cursor->offset = 0; + } + + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* skip incomplete entries */ + + if (entry->flags & XFS_ATTR_LOCAL) { + xfs_attr_leaf_name_local_t *name_loc = + xfs_attr_leaf_name_local(leaf, i); + + retval = context->put_listent(context, + entry->flags, + name_loc->nameval, + (int)name_loc->namelen, + be16_to_cpu(name_loc->valuelen), + &name_loc->nameval[name_loc->namelen]); + if (retval) + return retval; + } else { + xfs_attr_leaf_name_remote_t *name_rmt = + xfs_attr_leaf_name_remote(leaf, i); + + int valuelen = be32_to_cpu(name_rmt->valuelen); + + if (context->put_value) { + xfs_da_args_t args; + + memset((char *)&args, 0, sizeof(args)); + args.dp = context->dp; + args.whichfork = XFS_ATTR_FORK; + args.valuelen = valuelen; + args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); + args.rmtblkno = be32_to_cpu(name_rmt->valueblk); + args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + retval = xfs_attr_rmtval_get(&args); + if (retval) + return retval; + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); + kmem_free(args.value); + } else { + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + NULL); + } + if (retval) + return retval; + } + if (context->seen_enough) + break; + cursor->offset++; + } + trace_xfs_attr_list_leaf_end(context); + return(retval); +} + + +/*======================================================================== + * Manage the INCOMPLETE flag in a leaf entry + *========================================================================*/ + +/* + * Clear the INCOMPLETE flag on an entry in a leaf block. 
+ */ +int +xfs_attr_leaf_clearflag(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_dabuf_t *bp; + int error; +#ifdef DEBUG + xfs_attr_leaf_name_local_t *name_loc; + int namelen; + char *name; +#endif /* DEBUG */ + + /* + * Set up the operation. + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp != NULL); + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + ASSERT(args->index >= 0); + entry = &leaf->entries[ args->index ]; + ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); + +#ifdef DEBUG + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf, args->index); + namelen = name_loc->namelen; + name = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + namelen = name_rmt->namelen; + name = (char *)name_rmt->name; + } + ASSERT(be32_to_cpu(entry->hashval) == args->hashval); + ASSERT(namelen == args->namelen); + ASSERT(memcmp(name, args->name, namelen) == 0); +#endif /* DEBUG */ + + entry->flags &= ~XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + + if (args->rmtblkno) { + ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt->valueblk = cpu_to_be32(args->rmtblkno); + name_rmt->valuelen = cpu_to_be32(args->valuelen); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + xfs_da_buf_done(bp); + + /* + * Commit the flag value change and start the next trans in series. + */ + return xfs_trans_roll(&args->trans, args->dp); +} + +/* + * Set the INCOMPLETE flag on an entry in a leaf block. + */ +int +xfs_attr_leaf_setflag(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_dabuf_t *bp; + int error; + + /* + * Set up the operation. + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp != NULL); + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + ASSERT(args->index >= 0); + entry = &leaf->entries[ args->index ]; + + ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); + entry->flags |= XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + if ((entry->flags & XFS_ATTR_LOCAL) == 0) { + name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt->valueblk = 0; + name_rmt->valuelen = 0; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + xfs_da_buf_done(bp); + + /* + * Commit the flag value change and start the next trans in series. + */ + return xfs_trans_roll(&args->trans, args->dp); +} + +/* + * In a single transaction, clear the INCOMPLETE flag on the leaf entry + * given by args->blkno/index and set the INCOMPLETE flag on the leaf + * entry given by args->blkno2/index2. + * + * Note that they could be in different blocks, or in the same block. 
+ */ +int +xfs_attr_leaf_flipflags(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + xfs_attr_leaf_entry_t *entry1, *entry2; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_dabuf_t *bp1, *bp2; + int error; +#ifdef DEBUG + xfs_attr_leaf_name_local_t *name_loc; + int namelen1, namelen2; + char *name1, *name2; +#endif /* DEBUG */ + + /* + * Read the block containing the "old" attr + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, + XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp1 != NULL); + + /* + * Read the block containing the "new" attr, if it is different + */ + if (args->blkno2 != args->blkno) { + error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, + -1, &bp2, XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp2 != NULL); + } else { + bp2 = bp1; + } + + leaf1 = bp1->data; + ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); + ASSERT(args->index >= 0); + entry1 = &leaf1->entries[ args->index ]; + + leaf2 = bp2->data; + ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); + ASSERT(args->index2 >= 0); + entry2 = &leaf2->entries[ args->index2 ]; + +#ifdef DEBUG + if (entry1->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf1, args->index); + namelen1 = name_loc->namelen; + name1 = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + namelen1 = name_rmt->namelen; + name1 = (char *)name_rmt->name; + } + if (entry2->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr_leaf_name_local(leaf2, args->index2); + namelen2 = name_loc->namelen; + name2 = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + namelen2 = name_rmt->namelen; + name2 = (char *)name_rmt->name; + } + ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval)); + ASSERT(namelen1 == namelen2); + ASSERT(memcmp(name1, name2, namelen1) == 0); +#endif /* DEBUG */ + + ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE); + ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0); + + entry1->flags &= ~XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp1, + XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); + if (args->rmtblkno) { + ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); + name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt->valueblk = cpu_to_be32(args->rmtblkno); + name_rmt->valuelen = cpu_to_be32(args->valuelen); + xfs_da_log_buf(args->trans, bp1, + XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); + } + + entry2->flags |= XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp2, + XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); + if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { + name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt->valueblk = 0; + name_rmt->valuelen = 0; + xfs_da_log_buf(args->trans, bp2, + XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); + } + xfs_da_buf_done(bp1); + if (bp1 != bp2) + xfs_da_buf_done(bp2); + + /* + * Commit the flag value change and start the next trans in series. + */ + error = xfs_trans_roll(&args->trans, args->dp); + + return(error); +} + +/*======================================================================== + * Indiscriminately delete the entire attribute fork + *========================================================================*/ + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. 
+ * We're doing a depth-first traversal in order to invalidate everything. + */ +int +xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) +{ + xfs_da_blkinfo_t *info; + xfs_daddr_t blkno; + xfs_dabuf_t *bp; + int error; + + /* + * Read block 0 to see what we have to work with. + * We only get here if we have extents, since we remove + * the extents in reverse order the extent containing + * block 0 must still be there. + */ + error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return(error); + blkno = xfs_da_blkno(bp); + + /* + * Invalidate the tree, even if the "tree" is only a single leaf block. + * This is a depth-first traversal! + */ + info = bp->data; + if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) { + error = xfs_attr_node_inactive(trans, dp, bp, 1); + } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) { + error = xfs_attr_leaf_inactive(trans, dp, bp); + } else { + error = XFS_ERROR(EIO); + xfs_da_brelse(*trans, bp); + } + if (error) + return(error); + + /* + * Invalidate the incore copy of the root block. + */ + error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + if (error) + return(error); + xfs_da_binval(*trans, bp); /* remove from cache */ + /* + * Commit the invalidate and start the next transaction. + */ + error = xfs_trans_roll(trans, dp); + + return (error); +} + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +STATIC int +xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, + int level) +{ + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + int error, count, i; + xfs_dabuf_t *child_bp; + + /* + * Since this code is recursive (gasp!) we must protect ourselves. + */ + if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_da_brelse(*trans, bp); /* no locks for later trans */ + return(XFS_ERROR(EIO)); + } + + node = bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + parent_blkno = xfs_da_blkno(bp); /* save for re-read later */ + count = be16_to_cpu(node->hdr.count); + if (!count) { + xfs_da_brelse(*trans, bp); + return(0); + } + child_fsb = be32_to_cpu(node->btree[0].before); + xfs_da_brelse(*trans, bp); /* no locks for later trans */ + + /* + * If this is the node level just above the leaves, simply loop + * over the leaves removing all of them. If this is higher up + * in the tree, recurse downward. + */ + for (i = 0; i < count; i++) { + /* + * Read the subsidiary block to see what we have to work with. + * Don't do this in a transaction. This is a depth-first + * traversal of the tree so we may deal with many blocks + * before we come back to this one. + */ + error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); + if (error) + return(error); + if (child_bp) { + /* save for re-read later */ + child_blkno = xfs_da_blkno(child_bp); + + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->data; + if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) { + error = xfs_attr_node_inactive(trans, dp, + child_bp, level+1); + } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) { + error = xfs_attr_leaf_inactive(trans, dp, + child_bp); + } else { + error = XFS_ERROR(EIO); + xfs_da_brelse(*trans, child_bp); + } + if (error) + return(error); + + /* + * Remove the subsidiary block from the cache + * and from the log. 
+ */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, + &child_bp, XFS_ATTR_FORK); + if (error) + return(error); + xfs_da_binval(*trans, child_bp); + } + + /* + * If we're not done, re-read the parent to get the next + * child block number. + */ + if ((i+1) < count) { + error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); + if (error) + return(error); + child_fsb = be32_to_cpu(node->btree[i+1].before); + xfs_da_brelse(*trans, bp); + } + /* + * Atomically commit the whole invalidate stuff. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return (error); + } + + return(0); +} + +/* + * Invalidate all of the "remote" value regions pointed to by a particular + * leaf block. + * Note that we must release the lock on the buffer so that we are not + * caught holding something that the logging code wants to flush to disk. + */ +STATIC int +xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_attr_inactive_list_t *list, *lp; + int error, count, size, tmp, i; + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + + /* + * Count the number of "remote" value extents. + */ + count = 0; + entry = &leaf->entries[0]; + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) + count++; + } + } + + /* + * If there are no "remote" values, we're done. + */ + if (count == 0) { + xfs_da_brelse(*trans, bp); + return(0); + } + + /* + * Allocate storage for a list of all the "remote" value extents. + */ + size = count * sizeof(xfs_attr_inactive_list_t); + list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP); + + /* + * Identify each of the "remote" value extents. + */ + lp = list; + entry = &leaf->entries[0]; + for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) { + lp->valueblk = be32_to_cpu(name_rmt->valueblk); + lp->valuelen = XFS_B_TO_FSB(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + lp++; + } + } + } + xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */ + + /* + * Invalidate each of the "remote" value extents. + */ + error = 0; + for (lp = list, i = 0; i < count; i++, lp++) { + tmp = xfs_attr_leaf_freextent(trans, dp, + lp->valueblk, lp->valuelen); + + if (error == 0) + error = tmp; /* save only the 1st errno */ + } + + kmem_free((xfs_caddr_t)list); + return(error); +} + +/* + * Look at all the extents for this logical region, + * invalidate any buffers that are incore/in transactions. + */ +STATIC int +xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dablk_t blkno, int blkcnt) +{ + xfs_bmbt_irec_t map; + xfs_dablk_t tblkno; + int tblkcnt, dblkcnt, nmap, error; + xfs_daddr_t dblkno; + xfs_buf_t *bp; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + tblkno = blkno; + tblkcnt = blkcnt; + while (tblkcnt > 0) { + /* + * Try to remember where we decided to put the value. 
+ */ + nmap = 1; + error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + NULL, 0, &map, &nmap, NULL); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + + /* + * If it's a hole, these are already unmapped + * so there's nothing to invalidate. + */ + if (map.br_startblock != HOLESTARTBLOCK) { + + dblkno = XFS_FSB_TO_DADDR(dp->i_mount, + map.br_startblock); + dblkcnt = XFS_FSB_TO_BB(dp->i_mount, + map.br_blockcount); + bp = xfs_trans_get_buf(*trans, + dp->i_mount->m_ddev_targp, + dblkno, dblkcnt, XBF_LOCK); + xfs_trans_binval(*trans, bp); + /* + * Roll to next transaction. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return (error); + } + + tblkno += map.br_blockcount; + tblkcnt -= map.br_blockcount; + } + + return(0); +} diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h new file mode 100644 index 00000000..9c7d22fd --- /dev/null +++ b/fs/xfs/xfs_attr_leaf.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_LEAF_H__ +#define __XFS_ATTR_LEAF_H__ + +/* + * Attribute storage layout, internal structure, access macros, etc. + * + * Attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + */ + +struct attrlist; +struct attrlist_cursor_kern; +struct xfs_attr_list_context; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_da_state; +struct xfs_da_state_blk; +struct xfs_inode; +struct xfs_trans; + +/*======================================================================== + * Attribute structure when equal to XFS_LBSIZE(mp) bytes. + *========================================================================*/ + +/* + * This is the structure of the leaf nodes in the Btree. + * + * Struct leaf_entry's are packed from the top. Name/values grow from the + * bottom but are not packed. The freemap contains run-length-encoded entries + * for the free bytes after the leaf_entry's, but only the N largest such, + * smaller runs are dropped. When the freemap doesn't show enough space + * for an allocation, we compact the name/value area and try again. If we + * still don't have enough space, then we have to split the block. The + * name/value structs (both local and remote versions) must be 32bit aligned. + * + * Since we have duplicate hash keys, for each key that matches, compare + * the actual name string. The root and intermediate node search always + * takes the first-in-the-block key match found, so we should only have + * to work "forw"ard. 
If none matches, continue with the "forw"ard leaf + * nodes until the hash key changes or the attribute name is found. + * + * We store the fact that an attribute is a ROOT/USER/SECURE attribute in + * the leaf_entry. The namespaces are independent only because we also look + * at the namespace bit when we are looking for a matching attribute name. + * + * We also store an "incomplete" bit in the leaf_entry. It shows that an + * attribute is in the middle of being created and should not be shown to + * the user if we crash during the time that the bit is set. We clear the + * bit when we have finished setting up the attribute. We do this because + * we cannot create some large attributes inside a single transaction, and we + * need some indication that we weren't finished if we crash in the middle. + */ +#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ + +typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ + __be16 base; /* base of free region */ + __be16 size; /* length of free region */ +} xfs_attr_leaf_map_t; + +typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __be16 count; /* count of active leaf_entry's */ + __be16 usedbytes; /* num bytes of names/values stored */ + __be16 firstused; /* first used byte in name area */ + __u8 holes; /* != 0 if blk needs compaction */ + __u8 pad1; + xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; + /* N largest free regions */ +} xfs_attr_leaf_hdr_t; + +typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ + __be32 hashval; /* hash value of name */ + __be16 nameidx; /* index into buffer of name/value */ + __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ + __u8 pad2; /* unused pad byte */ +} xfs_attr_leaf_entry_t; + +typedef struct xfs_attr_leaf_name_local { + __be16 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 nameval[1]; /* name/value bytes */ +} xfs_attr_leaf_name_local_t; + +typedef struct xfs_attr_leaf_name_remote { + __be32 valueblk; /* block number of value bytes */ + __be32 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 name[1]; /* name bytes */ +} xfs_attr_leaf_name_remote_t; + +typedef struct xfs_attr_leafblock { + xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ + xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ +} xfs_attr_leafblock_t; + +/* + * Flags used in the leaf_entry[i].flags field. + * NOTE: the INCOMPLETE bit must not collide with the flags bits specified + * on the system call, they are "or"ed together for various operations. + */ +#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ +#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ +#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) + +/* + * Conversion macros for converting namespace bits from argument flags + * to ondisk flags. 
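+ *
+ * For example (illustrative only), a request carrying ATTR_SECURE maps
+ * to the on-disk XFS_ATTR_SECURE bit, and the reverse conversion
+ * recovers the argument flag:
+ *
+ *	XFS_ATTR_NSP_ARGS_TO_ONDISK(ATTR_SECURE) == XFS_ATTR_SECURE
+ *	XFS_ATTR_NSP_ONDISK_TO_ARGS(XFS_ATTR_ROOT) == ATTR_ROOT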
+ */ +#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) +#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) +#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ + ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) +#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ + ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) + +/* + * Alignment for namelist and valuelist entries (since they are mixed + * there can be only one alignment value) + */ +#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) + +/* + * Cast typed pointers for "local" and "remote" name/value structs. + */ +static inline xfs_attr_leaf_name_remote_t * +xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_remote_t *) + &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; +} + +static inline xfs_attr_leaf_name_local_t * +xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_local_t *) + &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; +} + +static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) +{ + return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; +} + +/* + * Calculate total bytes used (including trailing pad for alignment) for + * a "local" name/value structure, a "remote" name/value structure, and + * a pointer which might be either. + */ +static inline int xfs_attr_leaf_entsize_remote(int nlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local_max(int bsize) +{ + return (((bsize) >> 1) + ((bsize) >> 2)); +} + +/* + * Used to keep a list of "remote value" extents when unlinking an inode. + */ +typedef struct xfs_attr_inactive_list { + xfs_dablk_t valueblk; /* block number of value bytes */ + int valuelen; /* number of bytes in value */ +} xfs_attr_inactive_list_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when attribute fork size < XFS_LITINO(mp). + */ +void xfs_attr_shortform_create(struct xfs_da_args *args); +void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); +int xfs_attr_shortform_lookup(struct xfs_da_args *args); +int xfs_attr_shortform_getvalue(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_remove(struct xfs_da_args *args); +int xfs_attr_shortform_list(struct xfs_attr_list_context *context); +int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp); +int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); + + +/* + * Internal routines when attribute fork size == XFS_LBSIZE(mp). 
+ */ +int xfs_attr_leaf_to_node(struct xfs_da_args *args); +int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp, + struct xfs_da_args *args, int forkoff); +int xfs_attr_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr_leaf_setflag(struct xfs_da_args *args); +int xfs_attr_leaf_flipflags(xfs_da_args_t *args); + +/* + * Routines used for growing the Btree. + */ +int xfs_attr_leaf_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk); +int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf, + struct xfs_da_args *args); +int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args); +int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr_leaf_list_int(struct xfs_dabuf *bp, + struct xfs_attr_list_context *context); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr_leaf_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); + +/* + * Utility routines. + */ +xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count); +int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, + struct xfs_dabuf *leaf2_bp); +int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, + int *local); +#endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h new file mode 100644 index 00000000..919756e3 --- /dev/null +++ b/fs/xfs/xfs_attr_sf.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_SF_H__ +#define __XFS_ATTR_SF_H__ + +/* + * Attribute storage when stored inside the inode. + * + * Small attribute lists are packed as tightly as possible so as + * to fit into the literal area of the inode. + */ + +/* + * Entries are packed toward the top as tight as possible. + */ +typedef struct xfs_attr_shortform { + struct xfs_attr_sf_hdr { /* constant-structure header block */ + __be16 totsize; /* total bytes in shortform list */ + __u8 count; /* count of active entries */ + } hdr; + struct xfs_attr_sf_entry { + __uint8_t namelen; /* actual length of name (no NULL) */ + __uint8_t valuelen; /* actual length of value (no NULL) */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + __uint8_t nameval[1]; /* name & value bytes concatenated */ + } list[1]; /* variable sized array */ +} xfs_attr_shortform_t; +typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; +typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; + +/* + * We generate this then sort it, attr_list() must return things in hash-order. 
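+ *
+ * Illustrative only, with made-up hash values: if "user.a" hashed to
+ * 0x2000 and "user.b" to 0x1000, attr_list() must return "user.b"
+ * first even if "user.a" was created first; the sort on the hash field
+ * of these records is what provides that ordering.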
+ */ +typedef struct xfs_attr_sf_sort { + __uint8_t entno; /* entry number in original list */ + __uint8_t namelen; /* length of name value (no null) */ + __uint8_t valuelen; /* length of value */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + xfs_dahash_t hash; /* this entry's hash value */ + unsigned char *name; /* name value, pointer into buffer */ +} xfs_attr_sf_sort_t; + +#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ + (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) +#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ + ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1) +#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ + ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) +#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ + ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep))) +#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \ + (be16_to_cpu(((xfs_attr_shortform_t *) \ + ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) + +#endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c new file mode 100644 index 00000000..48228848 --- /dev/null +++ b/fs/xfs/xfs_bit.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" + +/* + * XFS bit manipulation routines, used in non-realtime code. + */ + +/* + * Return whether bitmap is empty. + * Size is number of words in the bitmap, which is padded to word boundary + * Returns 1 for empty, 0 for non-empty. + */ +int +xfs_bitmap_empty(uint *map, uint size) +{ + uint i; + uint ret = 0; + + for (i = 0; i < size; i++) { + ret |= map[i]; + } + + return (ret == 0); +} + +/* + * Count the number of contiguous bits set in the bitmap starting with bit + * start_bit. Size is the size of the bitmap in words. + */ +int +xfs_contig_bits(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = 0; + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + ASSERT(start_bit < size); + size -= start_bit & ~(NBWORD - 1); + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to one first offset bits prior to start */ + tmp |= (~0U >> (NBWORD-start_bit)); + if (tmp != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return result - start_bit; +found: + return result + ffz(tmp) - start_bit; +} + +/* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. 
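+ *
+ * For example (illustrative only), with a single-word map whose value
+ * is 0x9 (bits 0 and 3 set):
+ *
+ *	xfs_contig_bits(map, 1, 0) returns 1   (only bit 0 is contiguous)
+ *	xfs_next_bit(map, 1, 1)    returns 3
+ *	xfs_next_bit(map, 1, 4)    returns -1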
+ * + * Size is the number of words, not bytes, in the bitmap. + */ +int xfs_next_bit(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = start_bit & ~(NBWORD - 1); + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + if (start_bit >= size) + return -1; + size -= result; + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to zero first offset bits prior to start */ + tmp &= (~0U << start_bit); + if (tmp != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return -1; +found: + return result + ffs(tmp) - 1; +} diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h new file mode 100644 index 00000000..f1e3c907 --- /dev/null +++ b/fs/xfs/xfs_bit.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BIT_H__ +#define __XFS_BIT_H__ + +/* + * XFS bit manipulation routines. + */ + +/* + * masks with n high/low bits set, 64-bit values + */ +static inline __uint64_t xfs_mask64hi(int n) +{ + return (__uint64_t)-1 << (64 - (n)); +} +static inline __uint32_t xfs_mask32lo(int n) +{ + return ((__uint32_t)1 << (n)) - 1; +} +static inline __uint64_t xfs_mask64lo(int n) +{ + return ((__uint64_t)1 << (n)) - 1; +} + +/* Get high bit set out of 32-bit argument, -1 if none set */ +static inline int xfs_highbit32(__uint32_t v) +{ + return fls(v) - 1; +} + +/* Get high bit set out of 64-bit argument, -1 if none set */ +static inline int xfs_highbit64(__uint64_t v) +{ + return fls64(v) - 1; +} + +/* Get low bit set out of 32-bit argument, -1 if none set */ +static inline int xfs_lowbit32(__uint32_t v) +{ + return ffs(v) - 1; +} + +/* Get low bit set out of 64-bit argument, -1 if none set */ +static inline int xfs_lowbit64(__uint64_t v) +{ + __uint32_t w = (__uint32_t)v; + int n = 0; + + if (w) { /* lower bits */ + n = ffs(w); + } else { /* upper bits */ + w = (__uint32_t)(v >> 32); + if (w && (n = ffs(w))) + n += 32; + } + return n - 1; +} + +/* Return whether bitmap is empty (1 == empty) */ +extern int xfs_bitmap_empty(uint *map, uint size); + +/* Count continuous one bits in map starting with start_bit */ +extern int xfs_contig_bits(uint *map, uint size, uint start_bit); + +/* Find next set bit in map */ +extern int xfs_next_bit(uint *map, uint size, uint start_bit); + +#endif /* __XFS_BIT_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c new file mode 100644 index 00000000..a175933a --- /dev/null +++ b/fs/xfs/xfs_bmap.c @@ -0,0 +1,6139 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_mount.h" +#include "xfs_itable.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_inode_item.h" +#include "xfs_extfree_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_attr_leaf.h" +#include "xfs_rw.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_buf_item.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + + +#ifdef DEBUG +STATIC void +xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); +#endif + +kmem_zone_t *xfs_bmap_free_item_zone; + +/* + * Prototypes for internal bmap routines. + */ + + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags); /* inode logging flags */ + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags); /* inode logging flags */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a delayed + * allocation to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp); /* inode logging flags */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a delayed allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp); /* inode logging flags */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a real allocation. 
+ */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting an unwritten + * allocation to a real allocation or vice versa. + */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp); /* inode logging flags */ + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. + */ +STATIC int /* error */ +xfs_bmap_alloc( + xfs_bmalloca_t *ap); /* bmap alloc argument struct */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free); /* list item to be freed */ + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. + */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Convert a local file to an extents file. + * This code is sort of bogus, since the file data needs to get + * logged so it won't be lost. The bmap-level manipulations are ok, though. + */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. 
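+ *
+ * For example (illustrative only), if the fork maps file blocks
+ * [0, 10) and [20, 25): a search for bno 12 falls in the hole, so
+ * *gotp describes the extent starting at 20 and *eofp is clear; a
+ * search for bno 40 is past EOF, so *eofp is set and *prevp holds
+ * the [20, 25) entry.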
+ */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int whichfork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + */ +STATIC int /* error */ +xfs_bmap_isaeof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t off, /* file offset in fsblocks */ + int whichfork, /* data or attribute fork */ + char *aeof); /* return value */ + +/* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len); /* delayed extent length */ + +#ifdef DEBUG +/* + * Perform various validation checks on the values being returned + * from xfs_bmapi(). + */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap); +#else +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ + +STATIC int +xfs_bmap_count_tree( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ifork_t *ifp, + xfs_fsblock_t blockno, + int levelin, + int *count); + +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count); + +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count); + +/* + * Bmap internal routines. + */ + +STATIC int /* error */ +xfs_bmbt_lookup_eq( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +STATIC int /* error */ +xfs_bmbt_lookup_ge( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* +* Update the record referred to by cur to the value given + * by [off, bno, len, state]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_bmbt_update( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + xfs_exntst_t state) +{ + union xfs_btree_rec rec; + + xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state); + return xfs_btree_update(cur, &rec); +} + +/* + * Called from xfs_bmap_add_attrfork to handle btree format files. 
+ */ +STATIC int /* error */ +xfs_bmap_add_attrfork_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + int error; /* error return value */ + xfs_mount_t *mp; /* file system mount struct */ + int stat; /* newroot status */ + + mp = ip->i_mount; + if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) + *flags |= XFS_ILOG_DBROOT; + else { + cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur->bc_private.b.flist = flist; + cur->bc_private.b.firstblock = *firstblock; + if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) + goto error0; + /* must be at least one entry */ + XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); + if ((error = xfs_btree_new_iroot(cur, flags, &stat))) + goto error0; + if (stat == 0) { + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return XFS_ERROR(ENOSPC); + } + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + } + return 0; +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + int error; /* error return value */ + + if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + return 0; + cur = NULL; + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, + flags, XFS_DATA_FORK); + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_da_args_t dargs; /* args for dir/attr code */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount structure pointer */ + + if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + return 0; + if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + mp = ip->i_mount; + memset(&dargs, 0, sizeof(dargs)); + dargs.dp = ip; + dargs.firstblock = firstblock; + dargs.flist = flist; + dargs.total = mp->m_dirblkfsbs; + dargs.whichfork = XFS_DATA_FORK; + dargs.trans = tp; + error = xfs_dir2_sf_to_block(&dargs); + } else + error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, + XFS_DATA_FORK); + return error; +} + +/* + * Called by xfs_bmapi to update file extent records and the btree + * after allocating space (or doing a delayed allocation). 
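+ *
+ * Roughly (illustrative summary of the cases handled below): the first
+ * extent added to an empty fork is simply inserted; a new delayed
+ * allocation goes through xfs_bmap_add_extent_hole_delay(); a real
+ * allocation past the last extent goes through
+ * xfs_bmap_add_extent_hole_real(); and a real allocation overlapping an
+ * existing delayed or unwritten record is handled by
+ * xfs_bmap_add_extent_delay_real() or
+ * xfs_bmap_add_extent_unwritten_real() respectively.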
+ */ +STATIC int /* error */ +xfs_bmap_add_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_btree_cur_t *cur; /* btree cursor or null */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork ptr */ + int logflags; /* returned value */ + xfs_extnum_t nextents; /* number of extents in file now */ + + XFS_STATS_INC(xs_add_exlist); + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, whichfork); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + da_old = da_new = 0; + error = 0; + + ASSERT(*idx >= 0); + ASSERT(*idx <= nextents); + + /* + * This is the first extent added to a new/empty file. + * Special case this one, so other routines get to assume there are + * already extents in the list. + */ + if (nextents == 0) { + xfs_iext_insert(ip, *idx, 1, new, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + + ASSERT(cur == NULL); + + if (!isnullstartblock(new->br_startblock)) { + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else + logflags = 0; + } + /* + * Any kind of new delayed allocation goes here. + */ + else if (isnullstartblock(new->br_startblock)) { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + error = xfs_bmap_add_extent_hole_delay(ip, idx, new, + &logflags); + } + /* + * Real allocation off the end of the file. + */ + else if (*idx == nextents) { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, + &logflags, whichfork); + } else { + xfs_bmbt_irec_t prev; /* old extent at offset idx */ + + /* + * Get the record referred to by idx. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); + /* + * If it's a real allocation record, and the new allocation ends + * after the start of the referred to record, then we're filling + * in a delayed or unwritten allocation with a real one, or + * converting real back to unwritten. + */ + if (!isnullstartblock(new->br_startblock) && + new->br_startoff + new->br_blockcount > prev.br_startoff) { + if (prev.br_state != XFS_EXT_UNWRITTEN && + isnullstartblock(prev.br_startblock)) { + da_old = startblockval(prev.br_startblock); + if (cur) + ASSERT(cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL); + error = xfs_bmap_add_extent_delay_real(ip, + idx, &cur, new, &da_new, + first, flist, &logflags); + } else { + ASSERT(new->br_state == XFS_EXT_NORM || + new->br_state == XFS_EXT_UNWRITTEN); + + error = xfs_bmap_add_extent_unwritten_real(ip, + idx, &cur, new, &logflags); + if (error) + goto done; + } + } + /* + * Otherwise we're filling in a hole with an allocation. + */ + else { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + error = xfs_bmap_add_extent_hole_real(ip, idx, cur, + new, &logflags, whichfork); + } + } + + if (error) + goto done; + ASSERT(*curp == cur || *curp == NULL); + + /* + * Convert to a btree if necessary. 
+ */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first, + flist, &cur, da_old > 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto done; + } + /* + * Adjust for changes in reserved delayed indirect blocks. + * Nothing to do for disk quotas here. + */ + if (da_old || da_new) { + xfs_filblks_t nblks; + + nblks = da_new; + if (cur) + nblks += cur->bc_private.b.allocated; + ASSERT(nblks <= da_old); + if (nblks < da_old) + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - nblks), 0); + } + /* + * Clear out the allocated field, done with it now in any case. + */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } +done: +#ifdef DEBUG + if (!error) + xfs_bmap_check_leaf_extents(*curp, ip, whichfork); +#endif + *logflagsp = logflags; + return error; +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a delayed + * allocation to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + int diff; /* temp value */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + xfs_filblks_t temp=0; /* value for dnew calculations */ + xfs_filblks_t temp2=0;/* value for dnew calculations */ + int tmp_rval; /* partial logging flags */ + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] + + /* + * Set up a bunch of variables to make the tests simpler. + */ + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &PREV); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + + /* + * Set flags determining what part of the previous delayed allocation + * extent is being replaced by a real allocation. + */ + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. 
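+	 *
+	 * Illustrative only: if the delayed extent (PREV) covers file
+	 * blocks [100, 108) and the new real allocation covers exactly
+	 * that range, both BMAP_LEFT_FILLING and BMAP_RIGHT_FILLING are
+	 * set; if the extent to the left is real, has the same state,
+	 * ends at offset 100, is block-contiguous with new->br_startblock
+	 * and the combined length fits in MAXEXTLEN, BMAP_LEFT_CONTIG is
+	 * set as well and the switch below merges LEFT and PREV into a
+	 * single extent.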
+ */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == new->br_state && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + new->br_state == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + error = 0; + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The left and right neighbors are both contiguous with new. + */ + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 2, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + *dnew = 0; + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The left neighbor is contiguous, the right is not. 
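+ * PREV is merged into the left neighbor and its in-core record is
+ * removed; only the left extent's btree record needs rewriting.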
+ */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount, LEFT.br_state))) + goto done; + } + *dnew = 0; + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The right neighbor is contiguous, the left is not. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + new->br_startblock, + PREV.br_blockcount + + RIGHT.br_blockcount, PREV.br_state))) + goto done; + } + + *dnew = 0; + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: + /* + * Filling in all of a previously delayed allocation extent. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + + *dnew = 0; + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is contiguous. 
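+ * The left neighbor absorbs the newly allocated blocks; the rest of
+ * PREV stays delayed, so its start offset moves up and its startblock
+ * is reset to a recalculated worst-case indirect reservation.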
+ */ + trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + new->br_blockcount, + LEFT.br_state))) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + startblockval(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + --*idx; + *dnew = temp; + break; + + case BMAP_LEFT_FILLING: + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is not contiguous. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, new_endoff); + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + xfs_iext_insert(ip, *idx, 1, new, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + startblockval(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, *idx + 1); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); + + *dnew = temp; + break; + + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is contiguous with the new allocation. 
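+ * The new blocks are merged into the right neighbor; the shrunken
+ * delayed extent in front of them keeps a recalculated indirect
+ * reservation.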
+ */ + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), + new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + RIGHT.br_state); + trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + RIGHT.br_blockcount, + RIGHT.br_state))) + goto done; + } + + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + startblockval(PREV.br_startblock)); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + *dnew = temp; + break; + + case BMAP_RIGHT_FILLING: + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is not contiguous. + */ + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + xfs_iext_insert(ip, *idx + 1, 1, new, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + startblockval(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + *dnew = temp; + break; + + case 0: + /* + * Filling in the middle part of a previous delayed allocation. + * Contiguity is impossible here. + * This case is avoided almost all the time. 
+ * + * We start with a delayed allocation: + * + * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+ + * PREV @ idx + * + * and we are allocating: + * +rrrrrrrrrrrrrrrrr+ + * new + * + * and we set it up for insertion as: + * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+ + * new + * PREV @ idx LEFT RIGHT + * inserted at idx + 1 + */ + temp = new->br_startoff - PREV.br_startoff; + temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; + trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + LEFT = *new; + RIGHT.br_state = PREV.br_state; + RIGHT.br_startblock = nullstartblock( + (int)xfs_bmap_worst_indlen(ip, temp2)); + RIGHT.br_startoff = new_endoff; + RIGHT.br_blockcount = temp2; + /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ + xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = xfs_bmap_worst_indlen(ip, temp); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + if (diff > 0 && + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0)) { + /* + * Ick gross gag me with a spoon. + */ + ASSERT(0); /* want to see if this ever happens! */ + while (diff > 0) { + if (temp) { + temp--; + diff--; + if (!diff || + !xfs_icsb_modify_counters(ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0)) + break; + } + if (temp2) { + temp2--; + diff--; + if (!diff || + !xfs_icsb_modify_counters(ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0)) + break; + } + } + } + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), + nullstartblock((int)temp2)); + trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); + + ++*idx; + *dnew = temp + temp2; + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + *curp = cur; +done: + *logflagsp = rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting an unwritten + * allocation to a real allocation or vice versa. 
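+ * The new extent must lie entirely within the extent at *idx and must
+ * carry the opposite br_state; both the in-core extent list and, when
+ * present, the extent btree are updated to match.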
+ */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_exntst_t newext; /* new extent state */ + xfs_exntst_t oldext; /* old extent state */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] + /* + * Set up a bunch of variables to make the tests simpler. + */ + error = 0; + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &PREV); + newext = new->br_state; + oldext = (newext == XFS_EXT_UNWRITTEN) ? + XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + ASSERT(PREV.br_state == oldext); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == newext && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + newext == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the FILLING and CONTIG state bits. 
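+ * Only the FILLING and CONTIG bits select a case; the VALID and DELAY
+ * bits were already folded into the contiguity decisions above.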
+ */ + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. + */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 2, state); + ip->i_d.di_nextents -= 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount, + LEFT.br_state))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. 
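+ * PREV absorbs the right neighbor: one in-core record is removed,
+ * di_nextents drops by one and the right record is deleted from the
+ * btree before the merged extent is rewritten.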
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + --*idx; + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + if (xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + new->br_blockcount, + LEFT.br_state)) + goto done; + } + break; + + case BMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. 
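+ * PREV is trimmed from the front and a new record for the converted
+ * range is inserted ahead of it at *idx.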
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); + xfs_bmbt_set_startoff(ep, new_endoff); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_insert(ip, *idx, 1, new, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + cur->bc_rec.b = *new; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + break; + + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + xfs_iext_insert(ip, *idx, 1, new, state); + + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. 
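+ * PREV is cut back to the part in front of the new range, then the
+ * converted middle and the remaining oldext tail are inserted as two
+ * new records after it.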
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + new->br_startoff - PREV.br_startoff); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + r[0] = *new; + r[1].br_startoff = new_endoff; + r[1].br_blockcount = + PREV.br_startoff + PREV.br_blockcount - new_endoff; + r[1].br_startblock = new->br_startblock + new->br_blockcount; + r[1].br_state = oldext; + + ++*idx; + xfs_iext_insert(ip, *idx, 2, &r[0], state); + + ip->i_d.di_nextents += 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* new right extent - oldext */ + if ((error = xfs_bmbt_update(cur, r[1].br_startoff, + r[1].br_startblock, r[1].br_blockcount, + r[1].br_state))) + goto done; + /* new left extent - oldext */ + cur->bc_rec.b = PREV; + cur->bc_rec.b.br_blockcount = + new->br_startoff - PREV.br_startoff; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Reset the cursor to the position of the new extent + * we are about to insert as we can't trust it after + * the previous insert. + */ + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + /* new middle extent - newext */ + cur->bc_rec.b.br_state = new->br_state; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + *curp = cur; +done: + *logflagsp = rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a delayed allocation. + */ +/*ARGSUSED*/ +STATIC int /* error */ +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp) /* inode logging flags */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int state; /* state bits, accessed thru macros */ + xfs_filblks_t temp=0; /* temp for indirect calculations */ + + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + state = 0; + ASSERT(isnullstartblock(new->br_startblock)); + + /* + * Check and set flags if this segment has a left neighbor + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. 
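+ * Only neighbors that are themselves delayed allocations are merged
+ * with the new extent below.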
+ */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + */ + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN))) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the contiguity flags. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent record. + */ + --*idx; + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + --*idx; + temp = left.br_blockcount + new->br_blockcount; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, + nullstartblock((int)newlen), temp, right.br_state); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_iext_insert(ip, *idx, 1, new, state); + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + (int64_t)(oldlen - newlen), 0); + /* + * Nothing to do for disk quota accounting here. 
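+ * Merging can only shrink the worst-case indirect reservation, so
+ * the surplus blocks are returned to the in-core free space counter
+ * above.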
+ */ + } + *logflagsp = 0; + return 0; +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int rval=0; /* return value (logging flags) */ + int state; /* state bits, accessed thru macros */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); + state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + /* + * Check and set flags if this segment has a left neighbor. + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if this segment has a current value. + * Not true if we're inserting into the "hole" at eof. + */ + if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * We're inserting a real allocation between "left" and "right". + * Set the contiguity flags. Don't let extents get too large. + */ + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_startblock + left.br_blockcount == new->br_startblock && + left.br_state == new->br_state && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_startblock + new->br_blockcount == right.br_startblock && + new->br_state == right.br_state && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + error = 0; + /* + * Select which case we're in here, and implement it. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with real allocations on the + * left and on the right. + * Merge all three into a single extent record. 
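+ * The left and right records collapse into one covering all three
+ * ranges, so the fork's extent count drops by one and the right
+ * record is deleted from the btree.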
+ */ + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + left.br_blockcount + new->br_blockcount + + right.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, + right.br_startoff, + right.br_startblock, + right.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + + new->br_blockcount + + right.br_blockcount, + left.br_state))) + goto done; + } + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a real allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + left.br_blockcount + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) { + rval = xfs_ilog_fext(whichfork); + } else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, + left.br_startoff, + left.br_startblock, + left.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + + new->br_blockcount, + left.br_state))) + goto done; + } + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a real allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, new->br_startblock, + new->br_blockcount + right.br_blockcount, + right.br_state); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) { + rval = xfs_ilog_fext(whichfork); + } else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, + right.br_startoff, + right.br_startblock, + right.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + right.br_blockcount, + right.br_state))) + goto done; + } + break; + + case 0: + /* + * New allocation is not contiguous with another + * real allocation. + * Insert a new entry. + */ + xfs_iext_insert(ip, *idx, 1, new, state); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, + new->br_startoff, + new->br_startblock, + new->br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + cur->bc_rec.b.br_state = new->br_state; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + break; + } +done: + *logflagsp = rval; + return error; +} + +/* + * Adjust the size of the new extent based on di_extsize and rt extsize. 
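+ * On return *offp and *lenp describe a request aligned to extsz that
+ * still covers the caller's range; EINVAL is returned if a realtime
+ * request cannot be aligned without losing that coverage.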
+ */ +STATIC int +xfs_bmap_extsize_align( + xfs_mount_t *mp, + xfs_bmbt_irec_t *gotp, /* next extent pointer */ + xfs_bmbt_irec_t *prevp, /* previous extent pointer */ + xfs_extlen_t extsz, /* align to this extent size */ + int rt, /* is this a realtime inode? */ + int eof, /* is extent at end-of-file? */ + int delay, /* creating delalloc extent? */ + int convert, /* overwriting unwritten extent? */ + xfs_fileoff_t *offp, /* in/out: aligned offset */ + xfs_extlen_t *lenp) /* in/out: aligned length */ +{ + xfs_fileoff_t orig_off; /* original offset */ + xfs_extlen_t orig_alen; /* original length */ + xfs_fileoff_t orig_end; /* original off+len */ + xfs_fileoff_t nexto; /* next file offset */ + xfs_fileoff_t prevo; /* previous file offset */ + xfs_fileoff_t align_off; /* temp for offset */ + xfs_extlen_t align_alen; /* temp for length */ + xfs_extlen_t temp; /* temp for calculations */ + + if (convert) + return 0; + + orig_off = align_off = *offp; + orig_alen = align_alen = *lenp; + orig_end = orig_off + orig_alen; + + /* + * If this request overlaps an existing extent, then don't + * attempt to perform any additional alignment. + */ + if (!delay && !eof && + (orig_off >= gotp->br_startoff) && + (orig_end <= gotp->br_startoff + gotp->br_blockcount)) { + return 0; + } + + /* + * If the file offset is unaligned vs. the extent size + * we need to align it. This will be possible unless + * the file was previously written with a kernel that didn't + * perform this alignment, or if a truncate shot us in the + * foot. + */ + temp = do_mod(orig_off, extsz); + if (temp) { + align_alen += temp; + align_off -= temp; + } + /* + * Same adjustment for the end of the requested area. + */ + if ((temp = (align_alen % extsz))) { + align_alen += extsz - temp; + } + /* + * If the previous block overlaps with this proposed allocation + * then move the start forward without adjusting the length. + */ + if (prevp->br_startoff != NULLFILEOFF) { + if (prevp->br_startblock == HOLESTARTBLOCK) + prevo = prevp->br_startoff; + else + prevo = prevp->br_startoff + prevp->br_blockcount; + } else + prevo = 0; + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + /* + * If the next block overlaps with this proposed allocation + * then move the start back without adjusting the length, + * but not before offset 0. + * This may of course make the start overlap previous block, + * and if we hit the offset 0 limit then the next block + * can still overlap too. + */ + if (!eof && gotp->br_startoff != NULLFILEOFF) { + if ((delay && gotp->br_startblock == HOLESTARTBLOCK) || + (!delay && gotp->br_startblock == DELAYSTARTBLOCK)) + nexto = gotp->br_startoff + gotp->br_blockcount; + else + nexto = gotp->br_startoff; + } else + nexto = NULLFILEOFF; + if (!eof && + align_off + align_alen != orig_end && + align_off + align_alen > nexto) + align_off = nexto > align_alen ? nexto - align_alen : 0; + /* + * If we're now overlapping the next or previous extent that + * means we can't fit an extsz piece in this hole. Just move + * the start forward to the first valid spot and set + * the length so we hit the end. + */ + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + if (align_off + align_alen != orig_end && + align_off + align_alen > nexto && + nexto != NULLFILEOFF) { + ASSERT(nexto > prevo); + align_alen = nexto - align_off; + } + + /* + * If realtime, and the result isn't a multiple of the realtime + * extent size we need to remove blocks until it is. 
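+ * The trim is tried at the front first, then at the tail, then by
+ * pinning the start back to the original offset; if the result still
+ * fails to cover the request the allocation is rejected.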
+ */ + if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) { + /* + * We're not covering the original request, or + * we won't be able to once we fix the length. + */ + if (orig_off < align_off || + orig_end > align_off + align_alen || + align_alen - temp < orig_alen) + return XFS_ERROR(EINVAL); + /* + * Try to fix it by moving the start up. + */ + if (align_off + temp <= orig_off) { + align_alen -= temp; + align_off += temp; + } + /* + * Try to fix it by moving the end in. + */ + else if (align_off + align_alen - temp >= orig_end) + align_alen -= temp; + /* + * Set the start to the minimum then trim the length. + */ + else { + align_alen -= orig_off - align_off; + align_off = orig_off; + align_alen -= align_alen % mp->m_sb.sb_rextsize; + } + /* + * Result doesn't cover the request, fail it. + */ + if (orig_off < align_off || orig_end > align_off + align_alen) + return XFS_ERROR(EINVAL); + } else { + ASSERT(orig_off >= align_off); + ASSERT(orig_end <= align_off + align_alen); + } + +#ifdef DEBUG + if (!eof && gotp->br_startoff != NULLFILEOFF) + ASSERT(align_off + align_alen <= gotp->br_startoff); + if (prevp->br_startoff != NULLFILEOFF) + ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount); +#endif + + *lenp = align_alen; + *offp = align_off; + return 0; +} + +#define XFS_ALLOC_GAP_UNITS 4 + +STATIC void +xfs_bmap_adjacent( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + xfs_fsblock_t adjust; /* adjustment to block numbers */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_mount_t *mp; /* mount point structure */ + int nullfb; /* true if ap->firstblock isn't set */ + int rt; /* true if inode is realtime */ + +#define ISVALID(x,y) \ + (rt ? \ + (x) < mp->m_sb.sb_rblocks : \ + XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \ + XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) + + mp = ap->ip->i_mount; + nullfb = ap->firstblock == NULLFSBLOCK; + rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + /* + * If allocating at eof, and there's a previous real block, + * try to use its last block as our starting point. + */ + if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prevp->br_startblock) && + ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, + ap->prevp->br_startblock)) { + ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + /* + * Adjust for the gap between prevp and us. + */ + adjust = ap->off - + (ap->prevp->br_startoff + ap->prevp->br_blockcount); + if (adjust && + ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) + ap->rval += adjust; + } + /* + * If not at eof, then compare the two neighbor blocks. + * Figure out whether either one gives us a good starting point, + * and pick the better one. + */ + else if (!ap->eof) { + xfs_fsblock_t gotbno; /* right side block number */ + xfs_fsblock_t gotdiff=0; /* right side difference */ + xfs_fsblock_t prevbno; /* left side block number */ + xfs_fsblock_t prevdiff=0; /* left side difference */ + + /* + * If there's a previous (left) block, select a requested + * start block based on it. + */ + if (ap->prevp->br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prevp->br_startblock) && + (prevbno = ap->prevp->br_startblock + + ap->prevp->br_blockcount) && + ISVALID(prevbno, ap->prevp->br_startblock)) { + /* + * Calculate gap to end of previous block. 
+ */ + adjust = prevdiff = ap->off - + (ap->prevp->br_startoff + + ap->prevp->br_blockcount); + /* + * Figure the startblock based on the previous block's + * end and the gap size. + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the end of the previous block. + */ + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + ISVALID(prevbno + prevdiff, + ap->prevp->br_startblock)) + prevbno += adjust; + else + prevdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno) + prevbno = NULLFSBLOCK; + } + /* + * No previous block or can't follow it, just default. + */ + else + prevbno = NULLFSBLOCK; + /* + * If there's a following (right) block, select a requested + * start block based on it. + */ + if (!isnullstartblock(ap->gotp->br_startblock)) { + /* + * Calculate gap to start of next block. + */ + adjust = gotdiff = ap->gotp->br_startoff - ap->off; + /* + * Figure the startblock based on the next block's + * start and the gap size. + */ + gotbno = ap->gotp->br_startblock; + /* + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the start of the next block + * offset by our length. + */ + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + ISVALID(gotbno - gotdiff, gotbno)) + gotbno -= adjust; + else if (ISVALID(gotbno - ap->alen, gotbno)) { + gotbno -= ap->alen; + gotdiff += adjust - ap->alen; + } else + gotdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno) + gotbno = NULLFSBLOCK; + } + /* + * No next block, just default. + */ + else + gotbno = NULLFSBLOCK; + /* + * If both valid, pick the better one, else the only good + * one, else ap->rval is already set (to 0 or the inode block). + */ + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; + else if (prevbno != NULLFSBLOCK) + ap->rval = prevbno; + else if (gotbno != NULLFSBLOCK) + ap->rval = gotbno; + } +#undef ISVALID +} + +STATIC int +xfs_bmap_rtalloc( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount point structure */ + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_rtblock_t rtb; + + mp = ap->ip->i_mount; + align = xfs_get_extsz_hint(ap->ip); + prod = align / mp->m_sb.sb_rextsize; + error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + align, 1, ap->eof, 0, + ap->conv, &ap->off, &ap->alen); + if (error) + return error; + ASSERT(ap->alen); + ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + + /* + * If the offset & length are not perfectly aligned + * then kill prod, it will just get us in trouble. + */ + if (do_mod(ap->off, align) || ap->alen % align) + prod = 1; + /* + * Set ralen to be the actual requested length in rtextents. + */ + ralen = ap->alen / mp->m_sb.sb_rextsize; + /* + * If the old value was close enough to MAXEXTLEN that + * we rounded up to it, cut it back so it's valid again. 
+ * Note that if it's a really large request (bigger than + * MAXEXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) + ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + + /* + * Lock out other modifications to the RT bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + /* + * If it's an allocation to an empty file at offset 0, + * pick an extent that will space things out in the rt area. + */ + if (ap->eof && ap->off == 0) { + xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (error) + return error; + ap->rval = rtx * mp->m_sb.sb_rextsize; + } else { + ap->rval = 0; + } + + xfs_bmap_adjacent(ap); + + /* + * Realtime allocation, done through xfs_rtallocate_extent. + */ + atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->rval, mp->m_sb.sb_rextsize); + rtb = ap->rval; + ap->alen = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, + &ralen, atype, ap->wasdel, prod, &rtb))) + return error; + if (rtb == NULLFSBLOCK && prod > 1 && + (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, + ap->alen, &ralen, atype, + ap->wasdel, 1, &rtb))) + return error; + ap->rval = rtb; + if (ap->rval != NULLFSBLOCK) { + ap->rval *= mp->m_sb.sb_rextsize; + ralen *= mp->m_sb.sb_rextsize; + ap->alen = ralen; + ap->ip->i_d.di_nblocks += ralen; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= ralen; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT, (long) ralen); + } else { + ap->alen = 0; + } + return 0; +} + +STATIC int +xfs_bmap_btalloc_nullfb( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_perag *pag; + xfs_agnumber_t ag, startag; + int notinit = 0; + int error; + + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + args->type = XFS_ALLOCTYPE_NEAR_BNO; + else + args->type = XFS_ALLOCTYPE_START_BNO; + args->total = ap->total; + + /* + * Search for an allocation group with a single extent large enough + * for the request. If one isn't found, then adjust the minimum + * allocation size to the largest space found. + */ + startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; + + pag = xfs_perag_get(mp, ag); + while (*blen < args->maxlen) { + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, args->tp, ag, + XFS_ALLOC_FLAG_TRYLOCK); + if (error) { + xfs_perag_put(pag); + return error; + } + } + + /* + * See xfs_alloc_fix_freelist... + */ + if (pag->pagf_init) { + xfs_extlen_t longest; + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; + } else + notinit = 1; + + if (xfs_inode_is_filestream(ap->ip)) { + if (*blen >= args->maxlen) + break; + + if (ap->userdata) { + /* + * If startag is an invalid AG, we've + * come here once before and + * xfs_filestream_new_ag picked the + * best currently available. + * + * Don't continue looping, since we + * could loop forever. 
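+ * startag is set to NULLAGNUMBER once xfs_filestream_new_ag() has
+ * been asked for an AG, so a second visit to this branch breaks out
+ * instead of searching again.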
+ */ + if (startag == NULLAGNUMBER) + break; + + error = xfs_filestream_new_ag(ap, &ag); + xfs_perag_put(pag); + if (error) + return error; + + /* loop again to set 'blen'*/ + startag = NULLAGNUMBER; + pag = xfs_perag_get(mp, ag); + continue; + } + } + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + xfs_perag_put(pag); + pag = xfs_perag_get(mp, ag); + } + xfs_perag_put(pag); + + /* + * Since the above loop did a BUF_TRYLOCK, it is + * possible that there is space for this request. + */ + if (notinit || *blen < ap->minlen) + args->minlen = ap->minlen; + /* + * If the best seen length is less than the request + * length, use the best as the minimum. + */ + else if (*blen < args->maxlen) + args->minlen = *blen; + /* + * Otherwise we've seen an extent as big as maxlen, + * use that as the minimum. + */ + else + args->minlen = args->maxlen; + + /* + * set the failure fallback case to look in the selected + * AG as the stream may have moved. + */ + if (xfs_inode_is_filestream(ap->ip)) + ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + + return 0; +} + +STATIC int +xfs_bmap_btalloc( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + xfs_mount_t *mp; /* mount point structure */ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_agnumber_t ag; + xfs_alloc_arg_t args; + xfs_extlen_t blen; + xfs_extlen_t nextminlen = 0; + int nullfb; /* true if ap->firstblock isn't set */ + int isaligned; + int tryagain; + int error; + + mp = ap->ip->i_mount; + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; + if (unlikely(align)) { + error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + align, 0, ap->eof, 0, ap->conv, + &ap->off, &ap->alen); + ASSERT(!error); + ASSERT(ap->alen); + } + nullfb = ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + if (nullfb) { + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + ag = xfs_filestream_lookup_ag(ap->ip); + ag = (ag != NULLAGNUMBER) ? ag : 0; + ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); + } else { + ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + } + } else + ap->rval = ap->firstblock; + + xfs_bmap_adjacent(ap); + + /* + * If allowed, use ap->rval; otherwise must use firstblock since + * it's in the right allocation group. + */ + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + ; + else + ap->rval = ap->firstblock; + /* + * Normal allocation, done through xfs_alloc_vextent. + */ + tryagain = isaligned = 0; + args.tp = ap->tp; + args.mp = mp; + args.fsbno = ap->rval; + + /* Trim the allocation back to the maximum an AG can fit. 
*/ + args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.firstblock = ap->firstblock; + blen = 0; + if (nullfb) { + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + if (error) + return error; + } else if (ap->low) { + if (xfs_inode_is_filestream(ap->ip)) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_START_BNO; + args.total = args.minlen = ap->minlen; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.total = ap->total; + args.minlen = ap->minlen; + } + /* apply extent size hints if obtained earlier */ + if (unlikely(align)) { + args.prod = align; + if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { + args.prod = 1; + args.mod = 0; + } else { + args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; + if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } + /* + * If we are not low on available data blocks, and the + * underlying logical volume manager is a stripe, and + * the file offset is zero then try to allocate data + * blocks on stripe unit boundary. + * NOTE: ap->aeof is only set if the allocation length + * is >= the stripe unit and the allocation offset is + * at the end of file. + */ + if (!ap->low && ap->aeof) { + if (!ap->off) { + args.alignment = mp->m_dalign; + atype = args.type; + isaligned = 1; + /* + * Adjust for alignment + */ + if (blen > args.alignment && blen <= args.maxlen) + args.minlen = blen - args.alignment; + args.minalignslop = 0; + } else { + /* + * First try an exact bno allocation. + * If it fails then do a near or start bno + * allocation with alignment turned on. + */ + atype = args.type; + tryagain = 1; + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.alignment = 1; + /* + * Compute the minlen+alignment for the + * next case. Set slop so that the value + * of minlen+alignment+slop doesn't go up + * between the calls. + */ + if (blen > mp->m_dalign && blen <= args.maxlen) + nextminlen = blen - mp->m_dalign; + else + nextminlen = args.minlen; + if (nextminlen + mp->m_dalign > args.minlen + 1) + args.minalignslop = + nextminlen + mp->m_dalign - + args.minlen - 1; + else + args.minalignslop = 0; + } + } else { + args.alignment = 1; + args.minalignslop = 0; + } + args.minleft = ap->minleft; + args.wasdel = ap->wasdel; + args.isfl = 0; + args.userdata = ap->userdata; + if ((error = xfs_alloc_vextent(&args))) + return error; + if (tryagain && args.fsbno == NULLFSBLOCK) { + /* + * Exact allocation failed. Now try with alignment + * turned on. + */ + args.type = atype; + args.fsbno = ap->rval; + args.alignment = mp->m_dalign; + args.minlen = nextminlen; + args.minalignslop = 0; + isaligned = 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (isaligned && args.fsbno == NULLFSBLOCK) { + /* + * allocation failed, so turn off alignment and + * try again. 
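+ * The remaining fallbacks below retry with the caller's minimum
+ * length and finally with an XFS_ALLOCTYPE_FIRST_AG scan, setting
+ * ap->low once space is that tight.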
+ */ + args.type = atype; + args.fsbno = ap->rval; + args.alignment = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb && + args.minlen > ap->minlen) { + args.minlen = ap->minlen; + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = ap->rval; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb) { + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = ap->minlen; + args.minleft = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + ap->low = 1; + } + if (args.fsbno != NULLFSBLOCK) { + ap->firstblock = ap->rval = args.fsbno; + ASSERT(nullfb || fb_agno == args.agno || + (ap->low && fb_agno < args.agno)); + ap->alen = args.len; + ap->ip->i_d.di_nblocks += args.len; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= args.len; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : + XFS_TRANS_DQ_BCOUNT, + (long) args.len); + } else { + ap->rval = NULLFSBLOCK; + ap->alen = 0; + } + return 0; +} + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. + */ +STATIC int +xfs_bmap_alloc( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) + return xfs_bmap_rtalloc(ap); + return xfs_bmap_btalloc(ap); +} + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. 
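+ * The freed leaf block is queued on the transaction's free list via
+ * xfs_bmap_add_free(), xfs_iroot_realloc(ip, -1, whichfork) gives back
+ * the incore root, and the fork format is then set to
+ * XFS_DINODE_FMT_EXTENTS.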
+ */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + /* REFERENCED */ + struct xfs_btree_block *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + __be64 *pp; /* ptr to block address */ + struct xfs_btree_block *rblock;/* root btree block */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(be16_to_cpu(rblock->bb_level) == 1); + ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) + return error; +#endif + if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, + XFS_BMAP_BTREE_REF))) + return error; + cblock = XFS_BUF_TO_BLOCK(cbp); + if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + return 0; +} + +/* + * Called by xfs_bmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). 
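+ * The range to remove is passed in *del; depending on how it lines up
+ * with the existing record, that record is deleted outright, trimmed
+ * at the front or the back, or split into two records (see the
+ * contiguity switch below).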
+ */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extents */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; + + XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. + */ + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + if ((error = xfs_rtfree_extent(ip->i_transp, bno, + (xfs_extlen_t)len))) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. 
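+	 * The cases below therefore decode as:
+	 *
+	 *	3 (left|right): del matches the whole extent - delete the record
+	 *	2 (left only):  del starts where the extent starts - trim the front
+	 *	1 (right only): del ends where the extent ends - trim the back
+	 *	0 (neither):    del is in the middle - split into two records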
+ */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + --*idx; + if (delay) + break; + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. + */ + temp = del->br_startoff - got.br_startoff; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + flags |= XFS_ILOG_CORE; + if (cur) { + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, temp, + got.br_state))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + cur->bc_rec.b = new; + error = xfs_btree_insert(cur, &i); + if (error && error != ENOSPC) + goto done; + /* + * If get no-space back from btree insert, + * it tried a split, and we have a zero + * block reservation. + * Fix up our state and return the error. + */ + if (error == ENOSPC) { + /* + * Reset the cursor, don't trust + * it after any insert operation. + */ + if ((error = xfs_bmbt_lookup_eq(cur, + got.br_startoff, + got.br_startblock, + temp, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Update the btree record back + * to the original value. + */ + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state))) + goto done; + /* + * Reset the extent record back + * to the original value. 
+ */ + xfs_bmbt_set_blockcount(ep, + got.br_blockcount); + flags = 0; + error = XFS_ERROR(ENOSPC); + goto done; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } else + flags |= xfs_ilog_fext(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + } else { + ASSERT(whichfork == XFS_DATA_FORK); + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_insert(ip, *idx + 1, 1, &new, state); + ++*idx; + break; + } + /* + * If we need to, add to list of extents to delete. + */ + if (do_fx) + xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, + mp); + /* + * Adjust inode # blocks in the file. + */ + if (nblks) + ip->i_d.di_nblocks -= nblks; + /* + * Adjust quota data. + */ + if (qfield) + xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) { + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - da_new), 0); + } +done: + *logflagsp = flags; + return error; +} + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free) /* list item to be freed */ +{ + if (prev) + prev->xbfi_next = free->xbfi_next; + else + flist->xbf_first = free->xbfi_next; + flist->xbf_count--; + kmem_zone_free(xfs_bmap_free_item_zone, free); +} + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. 
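+ * This is the reverse of xfs_bmap_btree_to_extents() above: every
+ * incore extent record is copied into one newly allocated child block,
+ * and the inode fork root becomes a level-1 btree root holding a
+ * single key/pointer that refers to that block.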
+ */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *ablock; /* allocated (child) bt block */ + xfs_buf_t *abp; /* buffer for ablock */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_rec_t *arp; /* child record pointer */ + struct xfs_btree_block *block; /* btree root block */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t i, cnt; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_key_t *kp; /* root block key pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_ptr_t *pp; /* root block address pointer */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* + * Make space in the inode incore. + */ + xfs_iroot_realloc(ip, 1, whichfork); + ifp->if_flags |= XFS_IFBROOT; + + /* + * Fill in the root. + */ + block = ifp->if_broot; + block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + block->bb_level = cpu_to_be16(1); + block->bb_numrecs = cpu_to_be16(1); + block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + + /* + * Need a cursor. Can't allocate until bb_level is filled in. + */ + mp = ip->i_mount; + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + /* + * Convert to a btree with two levels, one record in root. + */ + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + args.tp = tp; + args.mp = mp; + args.firstblock = *firstblock; + if (*firstblock == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = *firstblock; + } + args.minlen = args.maxlen = args.prod = 1; + args.total = args.minleft = args.alignment = args.mod = args.isfl = + args.minalignslop = 0; + args.wasdel = wasdel; + *logflagsp = 0; + if ((error = xfs_alloc_vextent(&args))) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + /* + * Allocation can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || + (flist->xbf_low && + args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); + abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); + /* + * Fill in the child block. 
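+	 * Only real extents are copied; delayed-allocation records have no
+	 * on-disk block yet (isnullstartblock()), so they are skipped and
+	 * the child's record count is taken from how many records were
+	 * actually copied.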
+ */ + ablock = XFS_BUF_TO_BLOCK(abp); + ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + ablock->bb_level = 0; + ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (cnt = i = 0; i < nextents; i++) { + ep = xfs_iext_get_ext(ifp, i); + if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { + arp->l0 = cpu_to_be64(ep->l0); + arp->l1 = cpu_to_be64(ep->l1); + arp++; cnt++; + } + } + ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + xfs_btree_set_numrecs(ablock, cnt); + + /* + * Fill in the root key and pointer. + */ + kp = XFS_BMBT_KEY_ADDR(mp, block, 1); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + be16_to_cpu(block->bb_level))); + *pp = cpu_to_be64(args.fsbno); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); + ASSERT(*curp == NULL); + *curp = cur; + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); + return 0; +} + +/* + * Calculate the default attribute fork offset for newly created inodes. + */ +uint +xfs_default_attroffset( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint offset; + + if (mp->m_sb.sb_inodesize == 256) { + offset = XFS_LITINO(mp) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); + } else { + offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + } + + ASSERT(offset < XFS_LITINO(mp)); + return offset; +} + +/* + * Helper routine to reset inode di_forkoff field when switching + * attribute fork from local to extent format - we reset it where + * possible to make space available for inline data fork extents. + */ +STATIC void +xfs_bmap_forkoff_reset( + xfs_mount_t *mp, + xfs_inode_t *ip, + int whichfork) +{ + if (whichfork == XFS_ATTR_FORK && + ip->i_d.di_format != XFS_DINODE_FMT_DEV && + ip->i_d.di_format != XFS_DINODE_FMT_UUID && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; + + if (dfl_forkoff > ip->i_d.di_forkoff) { + ip->i_d.di_forkoff = dfl_forkoff; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); + } + } +} + +/* + * Convert a local file to an extents file. + * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. + * (The bmap-level manipulations are ok, though). + */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int flags; /* logging flags returned */ + xfs_ifork_t *ifp; /* inode fork pointer */ + + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is invalid. 
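+	 * (The conversion below is otherwise generic: if the fork holds
+	 * inline data, one block is allocated, the bytes are copied into it
+	 * and logged, and the fork ends up as a one-extent, extents-format
+	 * file; an empty local fork just has its format changed.)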
+ */ + ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && + whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + flags = 0; + error = 0; + if (ifp->if_bytes) { + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent block */ + xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + + args.tp = tp; + args.mp = ip->i_mount; + args.firstblock = *firstblock; + ASSERT((ifp->if_flags & + (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.mod = args.minleft = args.alignment = args.wasdel = + args.isfl = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; + if ((error = xfs_alloc_vextent(&args))) + goto done; + /* + * Can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, + ifp->if_bytes); + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_bmap_forkoff_reset(args.mp, ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_iext_add(ifp, 0, 1); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + trace_xfs_bmap_post_update(ip, 0, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, + _THIS_IP_); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + xfs_trans_mod_dquot_byino(tp, ip, + XFS_TRANS_DQ_BCOUNT, 1L); + flags |= xfs_ilog_fext(whichfork); + } else { + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); + } + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + flags |= XFS_ILOG_CORE; +done: + *logflagsp = flags; + return error; +} + +/* + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_multi_extents( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t lastx; /* last extent index */ + + /* + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. 
+ */ + gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; + gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; + gotp->br_state = XFS_EXT_INVALID; +#if XFS_BIG_BLKNOS + gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; +#else + gotp->br_startblock = 0xffffa5a5; +#endif + prevp->br_startoff = NULLFILEOFF; + + ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); + if (lastx > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); + } + if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, gotp); + *eofp = 0; + } else { + if (lastx > 0) { + *gotp = *prevp; + } + *eofp = 1; + ep = NULL; + } + *lastxp = lastx; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int fork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + XFS_STATS_INC(xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, fork); + + ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); + + if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x\n", + (unsigned long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; +} + +/* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ +{ + int level; /* btree level number */ + int maxrecs; /* maximum record count at this level */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t rval; /* return value */ + + mp = ip->i_mount; + maxrecs = mp->m_bmap_dmxr[0]; + for (level = 0, rval = 0; + level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); + level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + rval += len; + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + level - 1; + if (level == 0) + maxrecs = mp->m_bmap_dmxr[1]; + } + return rval; +} + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. 
+ */ +int /* error code */ +xfs_bmap_add_attrfork( + xfs_inode_t *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ +{ + xfs_fsblock_t firstblock; /* 1st block/ag allocated */ + xfs_bmap_free_t flist; /* freed extent records */ + xfs_mount_t *mp; /* mount structure */ + xfs_trans_t *tp; /* transaction pointer */ + int blks; /* space reservation */ + int version = 1; /* superblock attr version */ + int committed; /* xaction was committed */ + int logflags; /* logging flags */ + int error; /* error return value */ + + ASSERT(XFS_IFORK_Q(ip) == 0); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + + mp = ip->i_mount; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); + if (rsvd) + tp->t_flags |= XFS_TRANS_RESERVE; + if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) + goto error0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + if (XFS_IFORK_Q(ip)) + goto error1; + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + /* + * For inodes coming from pre-6.2 filesystems. + */ + ASSERT(ip->i_d.di_aformat == 0); + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + } + ASSERT(ip->i_d.di_anextents == 0); + + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_UUID: + ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if (mp->m_flags & XFS_MOUNT_ATTR2) + version = 2; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + goto error1; + } + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ip->i_afp->if_flags = XFS_IFEXTENTS; + logflags = 0; + xfs_bmap_init(&flist, &firstblock); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + &logflags); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, + &flist, &logflags); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + &logflags); + break; + default: + error = 0; + break; + } + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (error) + goto error2; + if (!xfs_sb_version_hasattr(&mp->m_sb) || + (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + __int64_t sbfields = 0; + + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr(&mp->m_sb)) { + xfs_sb_version_addattr(&mp->m_sb); + sbfields |= XFS_SB_VERSIONNUM; + } + if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { + xfs_sb_version_addattr2(&mp->m_sb); + sbfields |= 
(XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + } + if (sbfields) { + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, sbfields); + } else + spin_unlock(&mp->m_sb_lock); + } + if ((error = xfs_bmap_finish(&tp, &flist, &committed))) + goto error2; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + return error; +error2: + xfs_bmap_cancel(&flist); +error1: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +error0: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + return error; +} + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +/* ARGSUSED */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + xfs_mount_t *mp) /* mount point structure */ +{ + xfs_bmap_free_item_t *cur; /* current (next) element */ + xfs_bmap_free_item_t *new; /* new element */ + xfs_bmap_free_item_t *prev; /* previous element */ +#ifdef DEBUG + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_bmap_free_item_zone != NULL); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xbfi_startblock = bno; + new->xbfi_blockcount = (xfs_extlen_t)len; + for (prev = NULL, cur = flist->xbf_first; + cur != NULL; + prev = cur, cur = cur->xbfi_next) { + if (cur->xbfi_startblock >= bno) + break; + } + if (prev) + prev->xbfi_next = new; + else + flist->xbf_first = new; + new->xbfi_next = cur; + flist->xbf_count++; +} + +/* + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. + */ +void +xfs_bmap_compute_maxlevels( + xfs_mount_t *mp, /* file system mount structure */ + int whichfork) /* data or attr fork */ +{ + int level; /* btree level */ + uint maxblocks; /* max blocks at this level */ + uint maxleafents; /* max leaf entries possible */ + int maxrootrecs; /* max records in root block */ + int minleafrecs; /* min records in leaf block */ + int minnoderecs; /* min records in node block */ + int sz; /* root block size */ + + /* + * The maximum number of extents in a file, hence the maximum + * number of leaf entries, is controlled by the type of di_nextents + * (a signed 32-bit number, xfs_extnum_t), or by di_anextents + * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted + * with ATTR2 and then mounted back with ATTR1, keeping the + * di_forkoff's fixed but probably at various positions. Therefore, + * for both ATTR1 and ATTR2 we have to assume the worst case scenario + * of a minimum size available. 
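+ *
+ * As a rough illustration (the per-block record limits depend on the
+ * filesystem block size; the figures here are purely hypothetical):
+ * with maxleafents = 2^31 - 1, minleafrecs = minnoderecs = 125 and
+ * maxrootrecs = 9, the loop below needs ceil((2^31 - 1) / 125) =
+ * 17,179,870 leaf blocks, then 137,439 -> 1,100 -> 9 node blocks at
+ * successive levels; those 9 fit under the root, for a maximum of 5
+ * btree levels.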
+ */ + if (whichfork == XFS_DATA_FORK) { + maxleafents = MAXEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + } else { + maxleafents = MAXAEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + } + maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); + minleafrecs = mp->m_bmap_dmnr[0]; + minnoderecs = mp->m_bmap_dmnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) { + if (maxblocks <= maxrootrecs) + maxblocks = 1; + else + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + } + mp->m_bm_maxlevels[whichfork] = level; +} + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. We never free any extents in + * the first transaction. + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent item */ + unsigned int logres; /* new log reservation */ + unsigned int logcount; /* new log count */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + logres = ntp->t_log_res; + logcount = ntp->t_log_count; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. + */ + if (error) + return error; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + + if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, + logcount))) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +/* + * Free up any items left in the list. 
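+ * This is the error-path counterpart of xfs_bmap_finish(): see the
+ * error2: unwind in xfs_bmap_add_attrfork() above, where the queued
+ * extents are dropped rather than freed because the transaction is
+ * about to be cancelled.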
+ */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist) /* list of bmap_free_items */ +{ + xfs_bmap_free_item_t *free; /* free list item */ + xfs_bmap_free_item_t *next; + + if (flist->xbf_count == 0) + return; + ASSERT(flist->xbf_first != NULL); + for (free = flist->xbf_first; free; free = next) { + next = free->xbfi_next; + xfs_bmap_del_free(flist, NULL, free); + } + ASSERT(flist->xbf_count == 0); +} + +/* + * Returns the file-relative block number of the first unused block(s) + * in the file with at least "len" logically contiguous blocks free. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + * Return 0 if the file is currently local (in-inode). + */ +int /* error */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. 
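+ * For example, with a single extent covering file blocks 90..99, a
+ * last_block of 200 comes back as 100 (one past the extent), while a
+ * last_block of 95, which already falls inside the extent, is left
+ * unchanged.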
+ */ +int /* error */ +xfs_bmap_last_before( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_fileoff_t bno; /* input file offset */ + int eof; /* hit end of file */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_bmbt_irec_t got; /* current extent value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent used */ + xfs_bmbt_irec_t prev; /* previous extent value */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + bno = *last_block - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + if (eof || xfs_bmbt_get_startoff(ep) > bno) { + if (prev.br_startoff == NULLFILEOFF) + *last_block = 0; + else + *last_block = prev.br_startoff + prev.br_blockcount; + } + /* + * Otherwise *last_block is already the right answer. + */ + return 0; +} + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int /* error */ +xfs_bmap_last_offset( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extent entries */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (!nextents) { + *last_block = 0; + return 0; + } + ep = xfs_iext_get_ext(ifp, nextents - 1); + *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); + return 0; +} + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int /* 1=>1 block, 0=>otherwise */ +xfs_bmap_one_block( + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int rval; /* return value */ + xfs_bmbt_irec_t s; /* internal version of extent */ + +#ifndef DEBUG + if (whichfork == XFS_DATA_FORK) { + return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? 
+ (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : + (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); + } +#endif /* !DEBUG */ + if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + return 0; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_get_all(ep, &s); + rval = s.br_startoff == 0 && s.br_blockcount == 1; + if (rval && whichfork == XFS_DATA_FORK) + ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); + return rval; +} + +STATIC int +xfs_bmap_sanity_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + int level) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC || + be16_to_cpu(block->bb_level) != level || + be16_to_cpu(block->bb_numrecs) == 0 || + be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return 0; + return 1; +} + +/* + * Read in the extents to if_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. If the file system cannot contain unwritten + * extents, the records are checked for no "state" flags. + */ +int /* error */ +xfs_bmap_read_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ + xfs_extnum_t i, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : + XFS_EXTFMT_INODE(ip); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + return error; + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_trans_brelse(tp, bp); + } + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + i = 0; + /* + * Loop over all leaf nodes. Copy information to the extent records. 
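+	 * Having descended to the leftmost leaf above, follow the
+	 * bb_rightsib chain across the leaf level, issuing read-ahead for
+	 * the next leaf and copying each block's records into the incore
+	 * extent list as we go.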
+ */ + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + xfs_extnum_t start; + + + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(i + num_recs > room)) { + ASSERT(i + num_recs <= room); + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, (btree extents).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_ERRLEVEL_LOW, ip->i_mount, block); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, 0), + error0); + /* + * Read-ahead the next leaf block, if any. + */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + if (nextbno != NULLFSBLOCK) + xfs_btree_reada_bufl(mp, nextbno, 1); + /* + * Copy records into the extent records. + */ + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + start = i; + for (j = 0; j < num_recs; j++, i++, frp++) { + xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); + trp->l0 = be64_to_cpu(frp->l0); + trp->l1 = be64_to_cpu(frp->l1); + } + if (exntf == XFS_EXTFMT_NOSTATE) { + /* + * Check all attribute bmap btree records and + * any "older" data bmap btree records for a + * set bit in the "extent flag" position. + */ + if (unlikely(xfs_check_nostate_extents(ifp, + start, num_recs))) { + XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; + } + } + xfs_trans_brelse(tp, bp); + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + return error; + block = XFS_BUF_TO_BLOCK(bp); + } + ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + return 0; +error0: + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); +} + +#ifdef DEBUG +/* + * Add bmap trace insert entries for all the contents of the extent records. + */ +void +xfs_bmap_trace_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in the list */ + int whichfork, /* data or attr fork */ + unsigned long caller_ip) +{ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + for (idx = 0; idx < cnt; idx++) + trace_xfs_extlist(ip, idx, whichfork, caller_ip); +} + +/* + * Validate that the bmbt_irecs being returned from bmapi are valid + * given the callers original parameters. Specifically check the + * ranges of the returned irecs to ensure that they only extent beyond + * the given parameters if the XFS_BMAPI_ENTIRE flag was set. 
+ */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} +#endif /* DEBUG */ + + +/* + * Map file blocks to filesystem blocks. + * File range is given by the bno/len pair. + * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) + * into a hole or past eof. + * Only allocates blocks from a single allocation group, + * to avoid locking problems. + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int /* error */ +xfs_bmapi( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_extlen_t total, /* total blocks needed */ + xfs_bmbt_irec_t *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + xfs_bmap_free_t *flist) /* i/o: list extents to free */ +{ + xfs_fsblock_t abno; /* allocated block number */ + xfs_extlen_t alen; /* allocated extent length */ + xfs_fileoff_t aoff; /* allocated file offset */ + xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* we've hit the end of extents */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return */ + xfs_bmbt_irec_t got; /* current file extent record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extlen_t indlen; /* indirect blocks length */ + xfs_extnum_t lastx; /* last useful extent number */ + int logflags; /* flags for transaction logging */ + xfs_extlen_t minleft; /* min blocks left after allocation */ + xfs_extlen_t minlen; /* min allocation size */ + xfs_mount_t *mp; /* xfs mount structure */ + int n; /* current extent index */ + int nallocs; /* number of extents alloc'd */ + xfs_extnum_t nextents; /* number of extents in file */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_bmbt_irec_t prev; /* previous file extent record */ + int tmp_logflags; /* temp flags holder */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + char wr; /* this is a write request */ + char rt; /* this is a realtime file */ +#ifdef DEBUG + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + xfs_bmbt_irec_t *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ + + orig_bno = bno; + orig_len = len; + orig_flags = flags; + orig_mval = mval; + orig_nmap = *nmap; +#endif + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + mp = ip->i_mount; + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) + XFS_STATS_INC(xs_blk_mapw); + else + XFS_STATS_INC(xs_blk_mapr); + /* + * IGSTATE flag is used to combine extents which + * differ only due to the state of the extents. + * This technique is used from xfs_getbmap() + * when the caller does not wish to see the + * separation (which is the default). + * + * This technique is also used when writing a + * buffer which has been partially written, + * (usually by being flushed during a chunkread), + * to ensure one write takes place. This also + * prevents a change in the xfs inode extents at + * this time, intentionally. 
This change occurs + * on completion of the write operation, in + * xfs_strat_comp(), where the xfs_bmapi() call + * is transactioned, and the extents combined. + */ + if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ + wr = 0; /* no allocations are allowed */ + ASSERT(wr || !(flags & XFS_BMAPI_DELAY)); + logflags = 0; + nallocs = 0; + cur = NULL; + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + ASSERT(wr && tp); + if ((error = xfs_bmap_local_to_extents(tp, ip, + firstblock, total, &logflags, whichfork))) + goto error0; + } + if (wr && *firstblock == NULLFSBLOCK) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) + minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + else + minleft = 1; + } else + minleft = 0; + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + goto error0; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + n = 0; + end = bno + len; + obno = bno; + bma.ip = NULL; + + while (bno < end && n < *nmap) { + /* + * Reading past eof, act as though there's a hole + * up to end. + */ + if (eof && !wr) + got.br_startoff = end; + inhole = eof || got.br_startoff > bno; + wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && + isnullstartblock(got.br_startblock); + /* + * First, deal with the hole before the allocated space + * that we found, if any. + */ + if (wr && (inhole || wasdelay)) { + /* + * For the wasdelay case, we could also just + * allocate the stuff asked for in this bmap call + * but that wouldn't be as good. + */ + if (wasdelay) { + alen = (xfs_extlen_t)got.br_blockcount; + aoff = got.br_startoff; + if (lastx != NULLEXTNUM && lastx) { + ep = xfs_iext_get_ext(ifp, lastx - 1); + xfs_bmbt_get_all(ep, &prev); + } + } else { + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(alen, + got.br_startoff - bno); + aoff = bno; + } + minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; + if (flags & XFS_BMAPI_DELAY) { + xfs_extlen_t extsz; + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + /* + * make sure we don't exceed a single + * extent length when we align the + * extent by reducing length we are + * going to allocate by the maximum + * amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, + MAXEXTLEN - (2 * extsz - 1)); + error = xfs_bmap_extsize_align(mp, + &got, &prev, extsz, + rt, eof, + flags&XFS_BMAPI_DELAY, + flags&XFS_BMAPI_CONVERT, + &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for + * delayed allocation blocks. This number gets + * adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks( + NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : + XFS_QMOPT_RES_REGBLKS); + if (error) { + if (n == 0) { + *nmap = 0; + ASSERT(cur == NULL); + return error; + } + break; + } + + /* + * Split changing sb for alen and indlen since + * they could be coming from different places. 
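+			 * For a realtime file the data blocks (alen) come out
+			 * of the free rt-extent counter while the worst-case
+			 * indirect btree blocks (indlen) always come out of
+			 * the ordinary free-block counter, so the two updates
+			 * below are made (and, on failure, undone) separately.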
+ */ + indlen = (xfs_extlen_t) + xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_incore_sb(mp, + XFS_SBS_FREXTENTS, + -((int64_t)extsz), 0); + } else { + error = xfs_icsb_modify_counters(mp, + XFS_SBS_FDBLOCKS, + -((int64_t)alen), 0); + } + if (!error) { + error = xfs_icsb_modify_counters(mp, + XFS_SBS_FDBLOCKS, + -((int64_t)indlen), 0); + if (error && rt) + xfs_mod_incore_sb(mp, + XFS_SBS_FREXTENTS, + (int64_t)extsz, 0); + else if (error) + xfs_icsb_modify_counters(mp, + XFS_SBS_FDBLOCKS, + (int64_t)alen, 0); + } + + if (error) { + if (XFS_IS_QUOTA_ON(mp)) + /* unreserve the blocks now */ + (void) + xfs_trans_unreserve_quota_nblks( + NULL, ip, + (long)alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : + XFS_QMOPT_RES_REGBLKS); + break; + } + + ip->i_delayed_blks += alen; + abno = nullstartblock(indlen); + } else { + /* + * If first time, allocate and fill in + * once-only bma fields. + */ + if (bma.ip == NULL) { + bma.tp = tp; + bma.ip = ip; + bma.prevp = &prev; + bma.gotp = &got; + bma.total = total; + bma.userdata = 0; + } + /* Indicate if this is the first user data + * in the file, or just any user data. + */ + if (!(flags & XFS_BMAPI_METADATA)) { + bma.userdata = (aoff == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : + XFS_ALLOC_USERDATA; + } + /* + * Fill in changeable bma fields. + */ + bma.eof = eof; + bma.firstblock = *firstblock; + bma.alen = alen; + bma.off = aoff; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.minlen = minlen; + bma.low = flist->xbf_low; + bma.minleft = minleft; + /* + * Only want to do the alignment at the + * eof if it is userdata and allocation length + * is larger than a stripe unit. + */ + if (mp->m_dalign && alen >= mp->m_dalign && + (!(flags & XFS_BMAPI_METADATA)) && + (whichfork == XFS_DATA_FORK)) { + if ((error = xfs_bmap_isaeof(ip, aoff, + whichfork, &bma.aeof))) + goto error0; + } else + bma.aeof = 0; + /* + * Call allocator. + */ + if ((error = xfs_bmap_alloc(&bma))) + goto error0; + /* + * Copy out result fields. + */ + abno = bma.rval; + if ((flist->xbf_low = bma.low)) + minleft = 0; + alen = bma.alen; + aoff = bma.off; + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, bma.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, bma.firstblock))); + *firstblock = bma.firstblock; + if (cur) + cur->bc_private.b.firstblock = + *firstblock; + if (abno == NULLFSBLOCK) + break; + if ((ifp->if_flags & XFS_IFBROOT) && !cur) { + cur = xfs_bmbt_init_cursor(mp, tp, + ip, whichfork); + cur->bc_private.b.firstblock = + *firstblock; + cur->bc_private.b.flist = flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + nallocs++; + } + if (cur) + cur->bc_private.b.flags = + wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; + got.br_startoff = aoff; + got.br_startblock = abno; + got.br_blockcount = alen; + got.br_state = XFS_EXT_NORM; /* assume normal */ + /* + * Determine state of extent, and the filesystem. + * A wasdelay extent has been initialized, so + * shouldn't be flagged as unwritten. 
+ */ + if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) { + if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) + got.br_state = XFS_EXT_UNWRITTEN; + } + error = xfs_bmap_add_extent(ip, &lastx, &cur, &got, + firstblock, flist, &tmp_logflags, + whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + ep = xfs_iext_get_ext(ifp, lastx); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= aoff); + ASSERT(got.br_startoff + got.br_blockcount >= + aoff + alen); +#ifdef DEBUG + if (flags & XFS_BMAPI_DELAY) { + ASSERT(isnullstartblock(got.br_startblock)); + ASSERT(startblockval(got.br_startblock) > 0); + } + ASSERT(got.br_state == XFS_EXT_NORM || + got.br_state == XFS_EXT_UNWRITTEN); +#endif + /* + * Fall down into the found allocated space case. + */ + } else if (inhole) { + /* + * Reading in a hole. + */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + /* + * Then deal with the allocated space we found. + */ + ASSERT(ep != NULL); + if (!(flags & XFS_BMAPI_ENTIRE) && + (got.br_startoff + got.br_blockcount > obno)) { + if (obno > bno) + bno = obno; + ASSERT((bno >= obno) || (n == 0)); + ASSERT(bno < end); + mval->br_startoff = bno; + if (isnullstartblock(got.br_startblock)) { + ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); + mval->br_startblock = DELAYSTARTBLOCK; + } else + mval->br_startblock = + got.br_startblock + + (bno - got.br_startoff); + /* + * Return the minimum of what we got and what we + * asked for for the length. We can use the len + * variable here because it is modified below + * and we could have been there before coming + * here if the first part of the allocation + * didn't overlap what was asked for. + */ + mval->br_blockcount = + XFS_FILBLKS_MIN(end - bno, got.br_blockcount - + (bno - got.br_startoff)); + mval->br_state = got.br_state; + ASSERT(mval->br_blockcount <= len); + } else { + *mval = got; + if (isnullstartblock(mval->br_startblock)) { + ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); + mval->br_startblock = DELAYSTARTBLOCK; + } + } + + /* + * Check if writing previously allocated but + * unwritten extents. + */ + if (wr && + ((mval->br_state == XFS_EXT_UNWRITTEN && + ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || + (mval->br_state == XFS_EXT_NORM && + ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == + (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !cur) { + cur = xfs_bmbt_init_cursor(mp, + tp, ip, whichfork); + cur->bc_private.b.firstblock = + *firstblock; + cur->bc_private.b.flist = flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM + : XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, &lastx, &cur, mval, + firstblock, flist, &tmp_logflags, + whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + ep = xfs_iext_get_ext(ifp, lastx); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + xfs_bmbt_get_all(ep, &got); + /* + * We may have combined previously unwritten + * space with written space, so generate + * another request. 
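[Illustrative aside, not part of this patch] The mapping handed back to the caller is simply the found extent trimmed to the requested range: skip the blocks of the extent that lie before bno, then cap the length at what was asked for. A self-contained sketch of the same arithmetic with ordinary integers; the function name, parameter names and the worked numbers are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Trim a found extent (gotoff/gotblock/gotlen) to the requested range
 * [bno, end), mirroring the br_startblock/br_blockcount math above.
 * Assumes bno falls inside the extent, i.e. gotoff <= bno < gotoff + gotlen.
 */
static void trim_mapping(uint64_t gotoff, uint64_t gotblock, uint64_t gotlen,
			 uint64_t bno, uint64_t end,
			 uint64_t *mapblock, uint64_t *maplen)
{
	uint64_t skip = bno - gotoff;		/* blocks of "got" before bno */

	*mapblock = gotblock + skip;
	*maplen = (end - bno < gotlen - skip) ?	/* XFS_FILBLKS_MIN() */
		  (end - bno) : (gotlen - skip);
}

int main(void)
{
	uint64_t block, len;

	/*
	 * Extent covering file blocks 100..199 at disk block 5000;
	 * the caller asked for file blocks 120..149.
	 */
	trim_mapping(100, 5000, 100, 120, 150, &block, &len);
	printf("maps to disk block %llu for %llu blocks\n",
	       (unsigned long long)block, (unsigned long long)len);
	/* prints: maps to disk block 5020 for 30 blocks */
	return 0;
}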
+ */ + if (mval->br_blockcount < len) + continue; + } + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || + (mval->br_blockcount <= len) || + (mval->br_startoff < obno)); + bno = mval->br_startoff + mval->br_blockcount; + len = end - bno; + if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == + mval[-1].br_startblock + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + n++; + } + /* + * If we're done, stop now. Stop when we've allocated + * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise + * the transaction may get too big. + */ + if (bno >= end || n >= *nmap || nallocs >= *nmap) + break; + /* + * Else go on to the next record. + */ + prev = got; + if (++lastx < nextents) { + ep = xfs_iext_get_ext(ifp, lastx); + xfs_bmbt_get_all(ep, &got); + } else { + eof = 1; + } + } + *nmap = n; + /* + * Transform from btree to extents, give it cur. + */ + if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + ASSERT(wr && cur); + error = xfs_bmap_btree_to_extents(tp, ip, cur, + &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent records if we've converted to btree format. + */ + if ((logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~xfs_ilog_fbroot(whichfork); + /* + * Log whatever the flags say, even if error. Otherwise we might miss + * detecting a case where the data is changed, there's an error, + * and it's not logged so we don't shutdown when we should. 
+ */ + if (logflags) { + ASSERT(tp && wr); + xfs_trans_log_inode(tp, ip, logflags); + } + if (cur) { + if (!error) { + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, + cur->bc_private.b.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, + cur->bc_private.b.firstblock))); + *firstblock = cur->bc_private.b.firstblock; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + if (!error) + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return error; +} + +/* + * Map file blocks to filesystem blocks, simple version. + * One block (extent) only, read-only. + * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. + * For the other flag values, the effect is as if XFS_BMAPI_METADATA + * was set and all the others were clear. + */ +int /* error */ +xfs_bmapi_single( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + xfs_fsblock_t *fsb, /* output: mapped block */ + xfs_fileoff_t bno) /* starting file offs. mapped */ +{ + int eof; /* we've hit the end of extents */ + int error; /* error return */ + xfs_bmbt_irec_t got; /* current file extent record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last useful extent number */ + xfs_bmbt_irec_t prev; /* previous file extent record */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (unlikely( + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return XFS_ERROR(EIO); + XFS_STATS_INC(xs_blk_mapr); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + /* + * Reading past eof, act as though there's a hole + * up to end. + */ + if (eof || got.br_startoff > bno) { + *fsb = NULLFSBLOCK; + return 0; + } + ASSERT(!isnullstartblock(got.br_startblock)); + ASSERT(bno < got.br_startoff + got.br_blockcount); + *fsb = got.br_startblock + (bno - got.br_startoff); + return 0; +} + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. If not all extents in the block range can be removed then + * *done is set. + */ +int /* error */ +xfs_bunmapi( + xfs_trans_t *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* misc flags */ + xfs_extnum_t nexts, /* number of extents max */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done) /* set if not done yet */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_irec_t del; /* extent being deleted */ + int eof; /* is deleting at eof */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t extno; /* extent number in list */ + xfs_bmbt_irec_t got; /* current extent record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int isrt; /* freeing in rt area */ + xfs_extnum_t lastx; /* last extent index used */ + int logflags; /* transaction logging flags */ + xfs_extlen_t mod; /* rt extent offset */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_irec_t prev; /* previous extent record */ + xfs_fileoff_t start; /* first file offset deleted */ + int tmp_logflags; /* partial logging flags */ + int wasdel; /* was a delayed alloc extent */ + int whichfork; /* data or attribute fork */ + xfs_fsblock_t sum; + + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ifp = XFS_IFORK_PTR(ip, whichfork); + if (unlikely( + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + mp = ip->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + ASSERT(len > 0); + ASSERT(nexts >= 0); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *done = 1; + return 0; + } + XFS_STATS_INC(xs_blk_unmap); + isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + start = bno; + bno = start + len - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + + /* + * Check to see if the given block number is past the end of the + * file, back up to the last block if so... + */ + if (eof) { + ep = xfs_iext_get_ext(ifp, --lastx); + xfs_bmbt_get_all(ep, &got); + bno = got.br_startoff + got.br_blockcount - 1; + } + logflags = 0; + if (ifp->if_flags & XFS_IFBROOT) { + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else + cur = NULL; + extno = 0; + while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && + (nexts == 0 || extno < nexts)) { + /* + * Is the found extent after a hole in which bno lives? + * Just back up to the previous extent, if so. + */ + if (got.br_startoff > bno) { + if (--lastx < 0) + break; + ep = xfs_iext_get_ext(ifp, lastx); + xfs_bmbt_get_all(ep, &got); + } + /* + * Is the last block of this extent before the range + * we're supposed to delete? If so, we're done. + */ + bno = XFS_FILEOFF_MIN(bno, + got.br_startoff + got.br_blockcount - 1); + if (bno < start) + break; + /* + * Then deal with the (possibly delayed) allocated space + * we found. 
+ */ + ASSERT(ep != NULL); + del = got; + wasdel = isnullstartblock(del.br_startblock); + if (got.br_startoff < start) { + del.br_startoff = start; + del.br_blockcount -= start - got.br_startoff; + if (!wasdel) + del.br_startblock += start - got.br_startoff; + } + if (del.br_startoff + del.br_blockcount > bno + 1) + del.br_blockcount = bno + 1 - del.br_startoff; + sum = del.br_startblock + del.br_blockcount; + if (isrt && + (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent not lined up at the end. + * The extent could have been split into written + * and unwritten pieces, or we could just be + * unmapping part of it. But we can't really + * get rid of part of a realtime extent. + */ + if (del.br_state == XFS_EXT_UNWRITTEN || + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + /* + * This piece is unwritten, or we're not + * using unwritten extents. Skip over it. + */ + ASSERT(bno >= mod); + bno -= mod > del.br_blockcount ? + del.br_blockcount : mod; + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(xfs_iext_get_ext( + ifp, lastx), &got); + } + continue; + } + /* + * It's written, turn it unwritten. + * This is better than zeroing it. + */ + ASSERT(del.br_state == XFS_EXT_NORM); + ASSERT(xfs_trans_get_block_res(tp) > 0); + /* + * If this spans a realtime extent boundary, + * chop it back to the start of the one we end at. + */ + if (del.br_blockcount > mod) { + del.br_startoff += del.br_blockcount - mod; + del.br_startblock += del.br_blockcount - mod; + del.br_blockcount = mod; + } + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, &lastx, &cur, &del, + firstblock, flist, &logflags, + XFS_DATA_FORK); + if (error) + goto error0; + goto nodelete; + } + if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent is lined up at the end but not + * at the front. We'll get rid of full extents if + * we can. + */ + mod = mp->m_sb.sb_rextsize - mod; + if (del.br_blockcount > mod) { + del.br_blockcount -= mod; + del.br_startoff += mod; + del.br_startblock += mod; + } else if ((del.br_startoff == start && + (del.br_state == XFS_EXT_UNWRITTEN || + xfs_trans_get_block_res(tp) == 0)) || + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + /* + * Can't make it unwritten. There isn't + * a full extent here so just skip it. + */ + ASSERT(bno >= del.br_blockcount); + bno -= del.br_blockcount; + if (got.br_startoff > bno) { + if (--lastx >= 0) { + ep = xfs_iext_get_ext(ifp, + lastx); + xfs_bmbt_get_all(ep, &got); + } + } + continue; + } else if (del.br_state == XFS_EXT_UNWRITTEN) { + /* + * This one is already unwritten. + * It must have a written left neighbor. + * Unwrite the killed part of that one and + * try again. 
+ */ + ASSERT(lastx > 0); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, + lastx - 1), &prev); + ASSERT(prev.br_state == XFS_EXT_NORM); + ASSERT(!isnullstartblock(prev.br_startblock)); + ASSERT(del.br_startblock == + prev.br_startblock + prev.br_blockcount); + if (prev.br_startoff < start) { + mod = start - prev.br_startoff; + prev.br_blockcount -= mod; + prev.br_startblock += mod; + prev.br_startoff = start; + } + prev.br_state = XFS_EXT_UNWRITTEN; + lastx--; + error = xfs_bmap_add_extent(ip, &lastx, &cur, + &prev, firstblock, flist, &logflags, + XFS_DATA_FORK); + if (error) + goto error0; + goto nodelete; + } else { + ASSERT(del.br_state == XFS_EXT_NORM); + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, &lastx, &cur, + &del, firstblock, flist, &logflags, + XFS_DATA_FORK); + if (error) + goto error0; + goto nodelete; + } + } + if (wasdel) { + ASSERT(startblockval(del.br_startblock) > 0); + /* Update realtime/data freespace, unreserve quota */ + if (isrt) { + xfs_filblks_t rtexts; + + rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, + (int64_t)rtexts, 0); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_RTBLKS); + } else { + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)del.br_blockcount, 0); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_REGBLKS); + } + ip->i_delayed_blks -= del.br_blockcount; + if (cur) + cur->bc_private.b.flags |= + XFS_BTCUR_BPRV_WASDEL; + } else if (cur) + cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; + /* + * If it's the case where the directory code is running + * with no block reservation, and the deleted block is in + * the middle of its extent, and the resulting insert + * of an extent would cause transformation to btree format, + * then reject it. The calling code will then swap + * blocks around instead. + * We have to do this now, rather than waiting for the + * conversion to btree format, since the transaction + * will be dirty. + */ + if (!wasdel && xfs_trans_get_block_res(tp) == 0 && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && + del.br_startoff > got.br_startoff && + del.br_startoff + del.br_blockcount < + got.br_startoff + got.br_blockcount) { + error = XFS_ERROR(ENOSPC); + goto error0; + } + error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, + &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + bno = del.br_startoff - 1; +nodelete: + /* + * If not done go on to the next (previous) record. + */ + if (bno != (xfs_fileoff_t)-1 && bno >= start) { + if (lastx >= 0) { + ep = xfs_iext_get_ext(ifp, lastx); + if (xfs_bmbt_get_startoff(ep) > bno) { + if (--lastx >= 0) + ep = xfs_iext_get_ext(ifp, + lastx); + } + xfs_bmbt_get_all(ep, &got); + } + extno++; + } + } + *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* + * Convert to a btree if necessary. 
+ */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from btree to extents, give it cur + */ + else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + ASSERT(cur != NULL); + error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from extents to local? + */ + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent records if we've converted to btree format. + */ + if ((logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~xfs_ilog_fbroot(whichfork); + /* + * Log inode even in the error case, if the transaction + * is dirty we'll need to shut down the filesystem. + */ + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (cur) { + if (!error) { + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * returns 1 for success, 0 if we failed to map the extent. + */ +STATIC int +xfs_getbmapx_fix_eof_hole( + xfs_inode_t *ip, /* xfs incore inode pointer */ + struct getbmapx *out, /* output structure */ + int prealloced, /* this is a file with + * preallocated data space */ + __int64_t end, /* last block requested */ + xfs_fsblock_t startblock) +{ + __int64_t fixlen; + xfs_mount_t *mp; /* file system mount point */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent pointer */ + xfs_fileoff_t fileblock; + + if (startblock == HOLESTARTBLOCK) { + mp = ip->i_mount; + out->bmv_block = -1; + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); + fixlen -= out->bmv_offset; + if (prealloced && out->bmv_offset + out->bmv_length == end) { + /* Came to hole at EOF. Trim it. */ + if (fixlen <= 0) + return 0; + out->bmv_length = fixlen; + } + } else { + if (startblock == DELAYSTARTBLOCK) + out->bmv_block = -2; + else + out->bmv_block = xfs_fsb_to_db(ip, startblock); + fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + out->bmv_oflags |= BMV_OF_LAST; + } + + return 1; +} + +/* + * Get inode's extents as described in bmv, and format for output. + * Calls formatter to fill the user's buffer until all extents + * are mapped, until the passed-in bmv->bmv_count slots have + * been filled, or until the formatter short-circuits the loop, + * if it is tracking filled-in extents on its own. 
+ */ +int /* error code */ +xfs_getbmap( + xfs_inode_t *ip, + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg) /* formatter arg */ +{ + __int64_t bmvend; /* last block requested */ + int error = 0; /* return value */ + __int64_t fixlen; /* length for -1 case */ + int i; /* extent number */ + int lock; /* lock state */ + xfs_bmbt_irec_t *map; /* buffer for user's data */ + xfs_mount_t *mp; /* file system mount point */ + int nex; /* # of user extents can do */ + int nexleft; /* # of user extents left */ + int subnex; /* # of bmapi's can do */ + int nmap; /* number of map entries */ + struct getbmapx *out; /* output structure */ + int whichfork; /* data or attr fork */ + int prealloced; /* this is a file with + * preallocated data space */ + int iflags; /* interface flags */ + int bmapi_flags; /* flags for xfs_bmapi */ + int cur_ext = 0; + + mp = ip->i_mount; + iflags = bmv->bmv_iflags; + whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; + + if (whichfork == XFS_ATTR_FORK) { + if (XFS_IFORK_Q(ip)) { + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && + ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + } else if (unlikely( + ip->i_d.di_aformat != 0 && + ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + + prealloced = 0; + fixlen = 1LL << 32; + } else { + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + + if (xfs_get_extsz_hint(ip) || + ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ + prealloced = 1; + fixlen = XFS_MAXIOFFSET(mp); + } else { + prealloced = 0; + fixlen = ip->i_size; + } + } + + if (bmv->bmv_length == -1) { + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); + bmv->bmv_length = + max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + } else if (bmv->bmv_length == 0) { + bmv->bmv_entries = 0; + return 0; + } else if (bmv->bmv_length < 0) { + return XFS_ERROR(EINVAL); + } + + nex = bmv->bmv_count - 1; + if (nex <= 0) + return XFS_ERROR(EINVAL); + bmvend = bmv->bmv_offset + bmv->bmv_length; + + + if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) + return XFS_ERROR(ENOMEM); + out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); + if (!out) + return XFS_ERROR(ENOMEM); + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { + if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { + error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + if (error) + goto out_unlock_iolock; + } + /* + * even after flushing the inode, there can still be delalloc + * blocks on the inode beyond EOF due to speculative + * preallocation. These are not removed until the release + * function is called or the inode is inactivated. Hence we + * cannot assert here that ip->i_delayed_blks == 0. + */ + } + + lock = xfs_ilock_map_shared(ip); + + /* + * Don't let nex be bigger than the number of extents + * we can have assuming alternating holes and real extents. 
+ */ + if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) + nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; + + bmapi_flags = xfs_bmapi_aflag(whichfork); + if (!(iflags & BMV_IF_PREALLOC)) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + /* + * Allocate enough space to handle "subnex" maps at a time. + */ + error = ENOMEM; + subnex = 16; + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); + if (!map) + goto out_unlock_ilock; + + bmv->bmv_entries = 0; + + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && + (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { + error = 0; + goto out_free_map; + } + + nexleft = nex; + + do { + nmap = (nexleft > subnex) ? subnex : nexleft; + error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + bmapi_flags, NULL, 0, map, &nmap, + NULL); + if (error) + goto out_free_map; + ASSERT(nmap <= subnex); + + for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + out[cur_ext].bmv_oflags = 0; + if (map[i].br_state == XFS_EXT_UNWRITTEN) + out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; + else if (map[i].br_startblock == DELAYSTARTBLOCK) + out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; + out[cur_ext].bmv_offset = + XFS_FSB_TO_BB(mp, map[i].br_startoff); + out[cur_ext].bmv_length = + XFS_FSB_TO_BB(mp, map[i].br_blockcount); + out[cur_ext].bmv_unused1 = 0; + out[cur_ext].bmv_unused2 = 0; + ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || + (map[i].br_startblock != DELAYSTARTBLOCK)); + if (map[i].br_startblock == HOLESTARTBLOCK && + whichfork == XFS_ATTR_FORK) { + /* came to the end of attribute fork */ + out[cur_ext].bmv_oflags |= BMV_OF_LAST; + goto out_free_map; + } + + if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], + prealloced, bmvend, + map[i].br_startblock)) + goto out_free_map; + + bmv->bmv_offset = + out[cur_ext].bmv_offset + + out[cur_ext].bmv_length; + bmv->bmv_length = + max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + + /* + * In case we don't want to return the hole, + * don't increase cur_ext so that we can reuse + * it in the next loop. + */ + if ((iflags & BMV_IF_NO_HOLES) && + map[i].br_startblock == HOLESTARTBLOCK) { + memset(&out[cur_ext], 0, sizeof(out[cur_ext])); + continue; + } + + nexleft--; + bmv->bmv_entries++; + cur_ext++; + } + } while (nmap && nexleft && bmv->bmv_length); + + out_free_map: + kmem_free(map); + out_unlock_ilock: + xfs_iunlock_map_shared(ip, lock); + out_unlock_iolock: + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + for (i = 0; i < cur_ext; i++) { + int full = 0; /* user array is full */ + + /* format results & advance arg */ + error = formatter(&arg, &out[i], &full); + if (error || full) + break; + } + + kmem_free(out); + return error; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. 
+ */ +STATIC int /* error */ +xfs_bmap_isaeof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t off, /* file offset in fsblocks */ + int whichfork, /* data or attribute fork */ + char *aeof) /* return value */ +{ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_irec_t s; /* expanded extent record */ + + ASSERT(whichfork == XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(NULL, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *aeof = 1; + return 0; + } + /* + * Go to the last extent + */ + lastrec = xfs_iext_get_ext(ifp, nextents - 1); + xfs_bmbt_get_all(lastrec, &s); + /* + * Check we are allocating in the last extent (for delayed allocations) + * or past the last extent for non-delayed allocations. + */ + *aeof = (off >= s.br_startoff && + off < s.br_startoff + s.br_blockcount && + isnullstartblock(s.br_startblock)) || + off >= s.br_startoff + s.br_blockcount; + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. + */ +int /* error */ +xfs_bmap_eof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t endoff, /* file offset in fsblocks */ + int whichfork, /* data or attribute fork */ + int *eof) /* result value */ +{ + xfs_fsblock_t blockcount; /* extent block count */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_fileoff_t startoff; /* extent starting file offset */ + + ASSERT(whichfork == XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(NULL, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *eof = 1; + return 0; + } + /* + * Go to the last extent + */ + lastrec = xfs_iext_get_ext(ifp, nextents - 1); + startoff = xfs_bmbt_get_startoff(lastrec); + blockcount = xfs_bmbt_get_blockcount(lastrec); + *eof = endoff >= startoff + blockcount; + return 0; +} + +#ifdef DEBUG +STATIC struct xfs_buf * +xfs_bmap_get_bp( + struct xfs_btree_cur *cur, + xfs_fsblock_t bno) +{ + struct xfs_log_item_desc *lidp; + int i; + + if (!cur) + return NULL; + + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } + + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; + } + + return NULL; +} + +STATIC void +xfs_check_block( + struct xfs_btree_block *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + __be64 *pp, *thispa; /* pointer to block address */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(be16_to_cpu(block->bb_level) > 0); + + prevp = NULL; + for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { + dmxr = mp->m_bmap_dmxr[0]; + keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + + if (prevp) { + ASSERT(be64_to_cpu(prevp->br_startoff) < + 
be64_to_cpu(keyp->br_startoff)); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + if (root) + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + else + pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + + for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { + if (root) + thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + else + thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + if (*thispa == *pp) { + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + __func__, j, i, + (unsigned long long)be64_to_cpu(*thispa)); + panic("%s: ptrs are equal in node\n", + __func__); + } + } + } +} + +/* + * Check that the extents for the inode ip are in the right order in all + * btree leaves. + */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep; /* pointer to current extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ + xfs_bmbt_rec_t *nextp; /* pointer to next extent */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (bp) { + bp_release = 0; + } else { + bp_release = 1; + } + if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + goto error_norelse; + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + for (;;) { + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = xfs_btree_get_numrecs(block); + + /* + * Read-ahead the next leaf block, if any. + */ + + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + + /* + * Check all the extents to make sure they are OK. 
+ * If we had a previous block, the last entry should + * conform with the first entry in this one. + */ + + ep = XFS_BMBT_REC_ADDR(mp, block, 1); + if (i) { + ASSERT(xfs_bmbt_disk_get_startoff(&last) + + xfs_bmbt_disk_get_blockcount(&last) <= + xfs_bmbt_disk_get_startoff(ep)); + } + for (j = 1; j < num_recs; j++) { + nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + ASSERT(xfs_bmbt_disk_get_startoff(ep) + + xfs_bmbt_disk_get_blockcount(ep) <= + xfs_bmbt_disk_get_startoff(nextp)); + ep = nextp; + } + + last = *ep; + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (bp) { + bp_release = 0; + } else { + bp_release = 1; + } + if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + goto error_norelse; + block = XFS_BUF_TO_BLOCK(bp); + } + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + return; + +error0: + xfs_warn(mp, "%s: at error0", __func__); + if (bp_release) + xfs_trans_brelse(NULL, bp); +error_norelse: + xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + __func__, i); + panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + return; +} +#endif + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + xfs_bmap_count_leaves(ifp, 0, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count); + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + + return 0; +} + +/* + * Recursively walks each level of a btree + * to count total fsblocks is use. 
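[Illustrative aside, not part of this patch] From a caller's side, xfs_bmap_count_blocks() is a single call plus an error check; note that for a btree-format fork the returned count also includes the bmbt blocks themselves, since xfs_bmap_count_tree() below bumps the count once per btree block visited. A minimal, hedged fragment; the surrounding locking and the tp/ip setup are assumed to exist around it:

	int	count = 0;
	int	error;

	/* count the fsblocks referenced by the data fork of "ip" */
	error = xfs_bmap_count_blocks(tp, ip, XFS_DATA_FORK, &count);
	if (error)
		return error;
	/* for a btree-format fork, "count" includes the bmbt blocks too */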
+ */ +STATIC int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + __be64 *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + struct xfs_btree_block *block, *nextblock; + int numrecs; + + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + + if (--level) { + /* Not at node above leaves, count this level of nodes */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + while (nextbno != NULLFSBLOCK) { + if ((error = xfs_btree_read_bufl(mp, tp, nextbno, + 0, &nbp, XFS_BMAP_BTREE_REF))) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + if (unlikely((error = + xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + numrecs = be16_to_cpu(block->bb_numrecs); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + } + } + return 0; +} + +/* + * Count leaf blocks given a range of extent records. + */ +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count) +{ + int b; + + for (b = 0; b < numrecs; b++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); + *count += xfs_bmbt_get_blockcount(frp); + } +} + +/* + * Count leaf blocks given a range of extent records originally + * in btree format. + */ +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count) +{ + int b; + xfs_bmbt_rec_t *frp; + + for (b = 1; b <= numrecs; b++) { + frp = XFS_BMBT_REC_ADDR(mp, block, b); + *count += xfs_bmbt_disk_get_blockcount(frp); + } +} + +/* + * dead simple method of punching delalyed allocation blocks from a range in + * the inode. Walks a block at a time so will be slow, but is only executed in + * rare error cases so the overhead is not critical. This will alays punch out + * both the start and end blocks, even if the ranges only partially overlap + * them, so it is up to the caller to ensure that partial blocks are not + * passed in. + */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. 
Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, start_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h new file mode 100644 index 00000000..c62234bd --- /dev/null +++ b/fs/xfs/xfs_bmap.h @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_H__ +#define __XFS_BMAP_H__ + +struct getbmap; +struct xfs_bmbt_irec; +struct xfs_ifork; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +extern kmem_zone_t *xfs_bmap_free_item_zone; + +/* + * List of extents to be free "later". + * The list is kept sorted on xbf_startblock. + */ +typedef struct xfs_bmap_free_item +{ + xfs_fsblock_t xbfi_startblock;/* starting fs block number */ + xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ + struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ +} xfs_bmap_free_item_t; + +/* + * Header for free extent list. + * + * xbf_low is used by the allocator to activate the lowspace algorithm - + * when free space is running low the extent allocator may choose to + * allocate an extent from an AG without leaving sufficient space for + * a btree split when inserting the new extent. In this case the allocator + * will enable the lowspace algorithm which is supposed to allow further + * allocations (such as btree splits and newroots) to allocate from + * sequential AGs. In order to avoid locking AGs out of order the lowspace + * algorithm will start searching for free space from AG 0. If the correct + * transaction reservations have been made then this algorithm will eventually + * find all the space it needs. 
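[Illustrative aside, not part of this patch] The free list described here is always driven through the same lifecycle: initialise it together with firstblock, thread both through the xfs_bmapi()/xfs_bunmapi() calls made under the transaction, then finish the list, or cancel it on error. A hedged sketch of that pattern; the transaction setup ("tp") and the error unwinding are assumed to exist around it, and the variable names are chosen for illustration:

	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		firstfsb;
	int			committed;
	int			error;

	xfs_bmap_init(&free_list, &firstfsb);

	/*
	 * One or more xfs_bmapi()/xfs_bunmapi() calls go here, each passing
	 * &firstfsb and &free_list so they stay in the same allocation group
	 * and queue up the extents to be freed at transaction end.
	 */

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);	/* drop the queued extents */
		/* ...then back out the transaction (error path assumed) */
	}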
+ */ +typedef struct xfs_bmap_free +{ + xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ + int xbf_count; /* count of items on list */ + int xbf_low; /* alloc in low mode */ +} xfs_bmap_free_t; + +#define XFS_BMAP_MAX_NMAP 4 + +/* + * Flags for xfs_bmapi + */ +#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ +#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ +#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ + /* combine contig. space */ +#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ +/* + * unwritten extent conversion - this needs write cache flushing and no additional + * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts + * from written to unwritten, otherwise convert from unwritten to written. + */ +#define XFS_BMAPI_CONVERT 0x200 + +#define XFS_BMAPI_FLAGS \ + { XFS_BMAPI_WRITE, "WRITE" }, \ + { XFS_BMAPI_DELAY, "DELAY" }, \ + { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ + { XFS_BMAPI_METADATA, "METADATA" }, \ + { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ + { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ + { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ + { XFS_BMAPI_CONTIG, "CONTIG" }, \ + { XFS_BMAPI_CONVERT, "CONVERT" } + + +static inline int xfs_bmapi_aflag(int w) +{ + return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); +} + +/* + * Special values for xfs_bmbt_irec_t br_startblock field. + */ +#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) +#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) + +static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) +{ + ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ + (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); +} + +/* + * Argument structure for xfs_bmap_alloc. + */ +typedef struct xfs_bmalloca { + xfs_fsblock_t firstblock; /* i/o first block allocated */ + xfs_fsblock_t rval; /* starting block of new extent */ + xfs_fileoff_t off; /* offset in file filling in */ + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bmbt_irec *prevp; /* extent before the new one */ + struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ + xfs_extlen_t alen; /* i/o length asked/allocated */ + xfs_extlen_t total; /* total blocks needed for xaction */ + xfs_extlen_t minlen; /* minimum allocation size (blocks) */ + xfs_extlen_t minleft; /* amount must be left after alloc */ + char eof; /* set if allocating past last extent */ + char wasdel; /* replacing a delayed allocation */ + char userdata;/* set if is user data */ + char low; /* low on space, using seq'l ags */ + char aeof; /* allocated space at eof */ + char conv; /* overwriting unwritten extents */ +} xfs_bmalloca_t; + +/* + * Flags for xfs_bmap_add_extent*. 
+ */ +#define BMAP_LEFT_CONTIG (1 << 0) +#define BMAP_RIGHT_CONTIG (1 << 1) +#define BMAP_LEFT_FILLING (1 << 2) +#define BMAP_RIGHT_FILLING (1 << 3) +#define BMAP_LEFT_DELAY (1 << 4) +#define BMAP_RIGHT_DELAY (1 << 5) +#define BMAP_LEFT_VALID (1 << 6) +#define BMAP_RIGHT_VALID (1 << 7) +#define BMAP_ATTRFORK (1 << 8) + +#define XFS_BMAP_EXT_FLAGS \ + { BMAP_LEFT_CONTIG, "LC" }, \ + { BMAP_RIGHT_CONTIG, "RC" }, \ + { BMAP_LEFT_FILLING, "LF" }, \ + { BMAP_RIGHT_FILLING, "RF" }, \ + { BMAP_ATTRFORK, "ATTR" } + +/* + * Add bmap trace insert entries for all the contents of the extent list. + * + * Quite excessive tracing. Only do this for debug builds. + */ +#if defined(__KERNEL) && defined(DEBUG) +void +xfs_bmap_trace_exlist( + struct xfs_inode *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in list */ + int whichfork, + unsigned long caller_ip); /* data or attr fork */ +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ + xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) +#else +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) +#endif + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + struct xfs_inode *ip, /* incore inode pointer */ + int size, /* space needed for new attribute */ + int rsvd); /* flag for reserved block allocation */ + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + struct xfs_mount *mp); /* mount point structure */ + +/* + * Routine to clean up the free list data structure when + * an error occurs during a transaction. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist); /* free list to clean up */ + +/* + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. + */ +void +xfs_bmap_compute_maxlevels( + struct xfs_mount *mp, /* file system mount structure */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the first unused block in the file. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + */ +int /* error */ +xfs_bmap_first_unused( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *unused, /* unused block num */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. + */ +int /* error */ +xfs_bmap_last_before( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. 
+ */ +int /* error */ +xfs_bmap_last_offset( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *unused, /* last block num */ + int whichfork); /* data or attr fork */ + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int +xfs_bmap_one_block( + struct xfs_inode *ip, /* incore inode */ + int whichfork); /* data or attr fork */ + +/* + * Read in the extents to iu_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. + */ +int /* error */ +xfs_bmap_read_extents( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + int whichfork); /* data or attr fork */ + +/* + * Map file blocks to filesystem blocks. + * File range is given by the bno/len pair. + * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) + * into a hole or past eof. + * Only allocates blocks from a single allocation group, + * to avoid locking problems. + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int /* error */ +xfs_bmapi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + xfs_bmap_free_t *flist); /* i/o: list extents to free */ + +/* + * Map file blocks to filesystem blocks, simple version. + * One block only, read-only. + * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. + * For the other flag values, the effect is as if XFS_BMAPI_METADATA + * was set and all the others were clear. + */ +int /* error */ +xfs_bmapi_single( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + xfs_fsblock_t *fsb, /* output: mapped block */ + xfs_fileoff_t bno); /* starting file offs. mapped */ + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. If not all extents in the block range can be removed then + * *done is set. + */ +int /* error */ +xfs_bunmapi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* XFS_BMAPI_... */ + xfs_extnum_t nexts, /* number of extents max */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done); /* set if not done yet */ + +/* + * Check an extent list, which has just been read, for + * any bit in the extent flag field. 
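[Illustrative aside, not part of this patch] For a pure lookup, xfs_bmapi() is called with no transaction, no XFS_BMAPI_WRITE and no free list, exactly as xfs_bmap_punch_delalloc_range() does in xfs_bmap.c above. A hedged caller sketch; "ip" and "offset_fsb" are assumed to be set up and locked elsewhere:

	xfs_bmbt_irec_t		imap;
	int			nimaps = 1;
	int			error;

	/* read-only lookup: no transaction, no allocation, no free list */
	error = xfs_bmapi(NULL, ip, offset_fsb, 1, XFS_BMAPI_ENTIRE,
			  NULL, 0, &imap, &nimaps, NULL);
	if (error)
		return error;

	if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
		/* hole: nothing mapped at offset_fsb */
	} else if (imap.br_startblock == DELAYSTARTBLOCK) {
		/* delayed allocation: space reserved but not yet on disk */
	} else {
		/* real extent: imap.br_startblock is the filesystem block */
	}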
+ */ +int +xfs_check_nostate_extents( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + xfs_extnum_t num); + +uint +xfs_default_attroffset( + struct xfs_inode *ip); + +#ifdef __KERNEL__ + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. + * + * Return 1 if the given transaction was committed and a new one allocated, + * and 0 otherwise. + */ +int /* error */ +xfs_bmap_finish( + struct xfs_trans **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed); /* xact committed or not */ + +/* bmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); + +/* + * Get inode's extents as described in bmv, and format for output. + */ +int /* error code */ +xfs_getbmap( + xfs_inode_t *ip, + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg); /* formatter arg */ + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof); + +/* + * Count fsblocks of the given fork. + */ +int +xfs_bmap_count_blocks( + xfs_trans_t *tp, + struct xfs_inode *ip, + int whichfork, + int *count); + +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length); +#endif /* __KERNEL__ */ + +#endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c new file mode 100644 index 00000000..87d3c10b --- /dev/null +++ b/fs/xfs/xfs_bmap_btree.c @@ -0,0 +1,919 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_itable.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" + +/* + * Determine the extent state. + */ +/* ARGSUSED */ +STATIC xfs_exntst_t +xfs_extent_state( + xfs_filblks_t blks, + int extent_flag) +{ + if (extent_flag) { + ASSERT(blks != 0); /* saved for DMIG */ + return XFS_EXT_UNWRITTEN; + } + return XFS_EXT_NORM; +} + +/* + * Convert on-disk form of btree root to in-memory form. 
+ */ +void +xfs_bmdr_to_bmbt( + struct xfs_mount *mp, + xfs_bmdr_block_t *dblock, + int dblocklen, + struct xfs_btree_block *rblock, + int rblocklen) +{ + int dmxr; + xfs_bmbt_key_t *fkp; + __be64 *fpp; + xfs_bmbt_key_t *tkp; + __be64 *tpp; + + rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + rblock->bb_level = dblock->bb_level; + ASSERT(be16_to_cpu(rblock->bb_level) > 0); + rblock->bb_numrecs = dblock->bb_numrecs; + rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + fkp = XFS_BMDR_KEY_ADDR(dblock, 1); + tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + dmxr = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, sizeof(*fkp) * dmxr); + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); +} + +/* + * Convert a compressed bmap extent record to an uncompressed form. + * This code must be in sync with the routines xfs_bmbt_get_startoff, + * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. + */ +STATIC void +__xfs_bmbt_get_all( + __uint64_t l0, + __uint64_t l1, + xfs_bmbt_irec_t *s) +{ + int ext_flag; + xfs_exntst_t st; + + ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); + s->br_startoff = ((xfs_fileoff_t)l0 & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +#if XFS_BIG_BLKNOS + s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | + (((xfs_fsblock_t)l1) >> 21); +#else +#ifdef DEBUG + { + xfs_dfsbno_t b; + + b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) | + (((xfs_dfsbno_t)l1) >> 21); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); + s->br_startblock = (xfs_fsblock_t)b; + } +#else /* !DEBUG */ + s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21); +#endif /* DEBUG */ +#endif /* XFS_BIG_BLKNOS */ + s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); + /* This is xfs_extent_state() in-line */ + if (ext_flag) { + ASSERT(s->br_blockcount != 0); /* saved for DMIG */ + st = XFS_EXT_UNWRITTEN; + } else + st = XFS_EXT_NORM; + s->br_state = st; +} + +void +xfs_bmbt_get_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(r->l0, r->l1, s); +} + +/* + * Extract the blockcount field from an in memory bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_get_blockcount( + xfs_bmbt_rec_host_t *r) +{ + return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21)); +} + +/* + * Extract the startblock field from an in memory bmap extent record. + */ +xfs_fsblock_t +xfs_bmbt_get_startblock( + xfs_bmbt_rec_host_t *r) +{ +#if XFS_BIG_BLKNOS + return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | + (((xfs_fsblock_t)r->l1) >> 21); +#else +#ifdef DEBUG + xfs_dfsbno_t b; + + b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) | + (((xfs_dfsbno_t)r->l1) >> 21); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); + return (xfs_fsblock_t)b; +#else /* !DEBUG */ + return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21); +#endif /* DEBUG */ +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Extract the startoff field from an in memory bmap extent record. 
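The extraction helpers above and the xfs_bmbt_set_allf/xfs_bmbt_disk_set_allf routines below all work on the same 128-bit packing: bit 63 of l0 is the unwritten flag, bits 9..62 of l0 hold the 54-bit startoff, the low 9 bits of l0 plus the top 43 bits of l1 form the 52-bit startblock, and the low 21 bits of l1 are the blockcount. A user-space round-trip of that packing (big-block variant only, host byte order, no kernel types):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MASK64LO(n) ((1ULL << (n)) - 1)         /* low n bits, n < 64 */

/* flag:1 | startoff:54 | startblock top 9 bits -> l0;
 * startblock low 43 bits | blockcount:21 -> l1 */
static void pack(uint64_t *l0, uint64_t *l1, uint64_t startoff,
                 uint64_t startblock, uint64_t blockcount, int unwritten)
{
    *l0 = ((uint64_t)unwritten << 63) | (startoff << 9) | (startblock >> 43);
    *l1 = (startblock << 21) | (blockcount & MASK64LO(21));
}

static void unpack(uint64_t l0, uint64_t l1, uint64_t *startoff,
                   uint64_t *startblock, uint64_t *blockcount, int *unwritten)
{
    *unwritten = (int)(l0 >> 63);
    *startoff = (l0 & MASK64LO(63)) >> 9;
    *startblock = ((l0 & MASK64LO(9)) << 43) | (l1 >> 21);
    *blockcount = l1 & MASK64LO(21);
}

int main(void)
{
    uint64_t l0, l1, off, blk, len;
    int unwr;

    pack(&l0, &l1, 123456, 0x123456789abULL, 2000, 1);
    unpack(l0, l1, &off, &blk, &len, &unwr);
    assert(off == 123456 && blk == 0x123456789abULL && len == 2000 && unwr == 1);
    printf("l0 = %016llx  l1 = %016llx\n",
           (unsigned long long)l0, (unsigned long long)l1);
    return 0;
}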
+ */ +xfs_fileoff_t +xfs_bmbt_get_startoff( + xfs_bmbt_rec_host_t *r) +{ + return ((xfs_fileoff_t)r->l0 & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + +xfs_exntst_t +xfs_bmbt_get_state( + xfs_bmbt_rec_host_t *r) +{ + int ext_flag; + + ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN)); + return xfs_extent_state(xfs_bmbt_get_blockcount(r), + ext_flag); +} + +/* + * Extract the blockcount field from an on disk bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_disk_get_blockcount( + xfs_bmbt_rec_t *r) +{ + return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21)); +} + +/* + * Extract the startoff field from a disk format bmap extent record. + */ +xfs_fileoff_t +xfs_bmbt_disk_get_startoff( + xfs_bmbt_rec_t *r) +{ + return ((xfs_fileoff_t)be64_to_cpu(r->l0) & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + + +/* + * Set all the fields in a bmap extent record from the arguments. + */ +void +xfs_bmbt_set_allf( + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); + +#if XFS_BIG_BLKNOS + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); +#else /* !XFS_BIG_BLKNOS */ + if (isnullstartblock(startblock)) { + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + (xfs_bmbt_rec_base_t)xfs_mask64lo(9); + r->l1 = xfs_mask64hi(11) | + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); + } else { + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); + } +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +void +xfs_bmbt_set_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) +{ + xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); +} + + +/* + * Set all the fields in a disk format bmap extent record from the arguments. + */ +void +xfs_bmbt_disk_set_allf( + xfs_bmbt_rec_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int extent_flag = (state == XFS_EXT_NORM) ? 
0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); + +#if XFS_BIG_BLKNOS + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); +#else /* !XFS_BIG_BLKNOS */ + if (isnullstartblock(startblock)) { + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); + r->l1 = cpu_to_be64(xfs_mask64hi(11) | + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); + } else { + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); + } +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +STATIC void +xfs_bmbt_disk_set_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); +} + +/* + * Set the blockcount field in a bmap extent record. + */ +void +xfs_bmbt_set_blockcount( + xfs_bmbt_rec_host_t *r, + xfs_filblks_t v) +{ + ASSERT((v & xfs_mask64hi(43)) == 0); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) | + (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21)); +} + +/* + * Set the startblock field in a bmap extent record. + */ +void +xfs_bmbt_set_startblock( + xfs_bmbt_rec_host_t *r, + xfs_fsblock_t v) +{ +#if XFS_BIG_BLKNOS + ASSERT((v & xfs_mask64hi(12)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | + (xfs_bmbt_rec_base_t)(v >> 43); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | + (xfs_bmbt_rec_base_t)(v << 21); +#else /* !XFS_BIG_BLKNOS */ + if (isnullstartblock(v)) { + r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9); + r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) | + ((xfs_bmbt_rec_base_t)v << 21) | + (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); + } else { + r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9); + r->l1 = ((xfs_bmbt_rec_base_t)v << 21) | + (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); + } +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Set the startoff field in a bmap extent record. + */ +void +xfs_bmbt_set_startoff( + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t v) +{ + ASSERT((v & xfs_mask64hi(9)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) | + ((xfs_bmbt_rec_base_t)v << 9) | + (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); +} + +/* + * Set the extent state field in a bmap extent record. + */ +void +xfs_bmbt_set_state( + xfs_bmbt_rec_host_t *r, + xfs_exntst_t v) +{ + ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); + if (v == XFS_EXT_NORM) + r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN); + else + r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN); +} + +/* + * Convert in-memory form of btree root to on-disk form. 
+ */ +void +xfs_bmbt_to_bmdr( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + int rblocklen, + xfs_bmdr_block_t *dblock, + int dblocklen) +{ + int dmxr; + xfs_bmbt_key_t *fkp; + __be64 *fpp; + xfs_bmbt_key_t *tkp; + __be64 *tpp; + + ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); + ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO); + ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO); + ASSERT(be16_to_cpu(rblock->bb_level) > 0); + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + tkp = XFS_BMDR_KEY_ADDR(dblock, 1); + fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + dmxr = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, sizeof(*fkp) * dmxr); + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); +} + +/* + * Check extent records, which have just been read, for + * any bit in the extent flag field. ASSERT on debug + * kernels, as this condition should not occur. + * Return an error condition (1) if any flags found, + * otherwise return 0. + */ + +int +xfs_check_nostate_extents( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + xfs_extnum_t num) +{ + for (; num > 0; num--, idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + if ((ep->l0 >> + (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { + ASSERT(0); + return 1; + } + } + return 0; +} + + +STATIC struct xfs_btree_cur * +xfs_bmbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_cur *new; + + new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.b.ip, cur->bc_private.b.whichfork); + + /* + * Copy the firstblock, flist, and flags values, + * since init cursor doesn't get them. + */ + new->bc_private.b.firstblock = cur->bc_private.b.firstblock; + new->bc_private.b.flist = cur->bc_private.b.flist; + new->bc_private.b.flags = cur->bc_private.b.flags; + + return new; +} + +STATIC void +xfs_bmbt_update_cursor( + struct xfs_btree_cur *src, + struct xfs_btree_cur *dst) +{ + ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) || + (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist); + + dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_private.b.firstblock = src->bc_private.b.firstblock; + + src->bc_private.b.allocated = 0; +} + +STATIC int +xfs_bmbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int length, + int *stat) +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = cur->bc_private.b.firstblock; + args.firstblock = args.fsbno; + + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = be64_to_cpu(start->l); + args.type = XFS_ALLOCTYPE_START_BNO; + /* + * Make sure there is sufficient room left in the AG to + * complete a full tree split for an extent insert. If + * we are converting the middle part of an extent then + * we may need space for two tree splits. + * + * We are relying on the caller to make the correct block + * reservation for this operation to succeed. If the + * reservation amount is insufficient then we may fail a + * block allocation here and corrupt the filesystem. 
+ */ + args.minleft = xfs_trans_get_block_res(args.tp); + } else if (cur->bc_private.b.flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { + error = XFS_ERROR(ENOSPC); + goto error0; + } + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + + if (args.fsbno == NULLFSBLOCK && args.minleft) { + /* + * Could not find an AG with enough free space to satisfy + * a full btree split. Try again without minleft and if + * successful activate the lowspace algorithm. + */ + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.minleft = 0; + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + cur->bc_private.b.flist->xbf_low = 1; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + cur->bc_private.b.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, 1L); + + new->l = cpu_to_be64(args.fsbno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + + error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_bmbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_trans *tp = cur->bc_tp; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + + xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, bp); + return 0; +} + +STATIC int +xfs_bmbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0) / 2; + } + + return cur->bc_mp->m_bmap_dmnr[level != 0]; +} + +int +xfs_bmbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0); + } + + return cur->bc_mp->m_bmap_dmxr[level != 0]; + +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork + * so that we can resize the in-memory buffer to match it. After a + * resize to the maximum size this function returns the same value + * as xfs_bmbt_get_maxrecs for the root node, too. 
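The level checks in xfs_bmbt_get_minrecs/xfs_bmbt_get_maxrecs above exist because the root block does not live in a filesystem block at all: it sits in the inode fork, so its capacity comes from the fork size (get_dmaxrecs uses bc_private.b.forksize), and the usual B-tree "at least half full" bound becomes maxrecs / 2. A rough user-space illustration, assuming a hypothetical 256-byte data-fork root area and the 4-byte on-disk root header (see xfs_bmdr_block in xfs_bmap_btree.h below):

#include <stdint.h>
#include <stdio.h>

/* On-disk root header is just level + numrecs. */
struct bmdr_block {
    uint16_t bb_level;
    uint16_t bb_numrecs;
};

static int bmdr_maxrecs(int blocklen, int leaf)
{
    blocklen -= sizeof(struct bmdr_block);
    if (leaf)
        return blocklen / 16;           /* one 16-byte extent record */
    return blocklen / (8 + 8);          /* one key plus one pointer */
}

int main(void)
{
    int forksize = 256;                 /* hypothetical data-fork root area */
    int maxrecs = bmdr_maxrecs(forksize, 0);

    printf("root leaf capacity = %d records\n", bmdr_maxrecs(forksize, 1));
    printf("root node capacity = %d entries\n", maxrecs);
    printf("half-full minimum  = %d entries\n", maxrecs / 2);
    return 0;
}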
+ */ +STATIC int +xfs_bmbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_bmap_dmxr[level != 0]; + return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize, + level == 0); +} + +STATIC void +xfs_bmbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->bmbt.br_startoff = + cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); +} + +STATIC void +xfs_bmbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->bmbt.br_startoff != 0); + + xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff), + 0, 0, XFS_EXT_NORM); +} + +STATIC void +xfs_bmbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); +} + +STATIC void +xfs_bmbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC __int64_t +xfs_bmbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) - + cur->bc_rec.b.br_startoff; +} + +#ifdef DEBUG +STATIC int +xfs_bmbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be64_to_cpu(k1->bmbt.br_startoff) < + be64_to_cpu(k2->bmbt.br_startoff); +} + +STATIC int +xfs_bmbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return xfs_bmbt_disk_get_startoff(&r1->bmbt) + + xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= + xfs_bmbt_disk_get_startoff(&r2->bmbt); +} +#endif /* DEBUG */ + +#ifdef XFS_BTREE_TRACE +ktrace_t *xfs_bmbt_trace_buf; + +STATIC void +xfs_bmbt_trace_enter( + struct xfs_btree_cur *cur, + const char *func, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) +{ + struct xfs_inode *ip = cur->bc_private.b.ip; + int whichfork = cur->bc_private.b.whichfork; + + ktrace_enter(xfs_bmbt_trace_buf, + (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), + (void *)func, (void *)s, (void *)ip, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); +} + +STATIC void +xfs_bmbt_trace_cursor( + struct xfs_btree_cur *cur, + __uint32_t *s0, + __uint64_t *l0, + __uint64_t *l1) +{ + struct xfs_bmbt_rec_host r; + + xfs_bmbt_set_all(&r, &cur->bc_rec.b); + + *s0 = (cur->bc_nlevels << 24) | + (cur->bc_private.b.flags << 16) | + cur->bc_private.b.allocated; + *l0 = r.l0; + *l1 = r.l1; +} + +STATIC void +xfs_bmbt_trace_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + __uint64_t *l0, + __uint64_t *l1) +{ + *l0 = be64_to_cpu(key->bmbt.br_startoff); + *l1 = 0; +} + +/* Endian flipping versions of the bmbt extraction functions */ +STATIC void +xfs_bmbt_disk_get_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), + get_unaligned_be64(&r->l1), s); +} + +STATIC void +xfs_bmbt_trace_record( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + __uint64_t *l0, + __uint64_t *l1, + __uint64_t *l2) +{ + struct xfs_bmbt_irec irec; + + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + *l0 = irec.br_startoff; + *l1 = irec.br_startblock; + *l2 = irec.br_blockcount; +} +#endif /* 
XFS_BTREE_TRACE */ + +static const struct xfs_btree_ops xfs_bmbt_ops = { + .rec_len = sizeof(xfs_bmbt_rec_t), + .key_len = sizeof(xfs_bmbt_key_t), + + .dup_cursor = xfs_bmbt_dup_cursor, + .update_cursor = xfs_bmbt_update_cursor, + .alloc_block = xfs_bmbt_alloc_block, + .free_block = xfs_bmbt_free_block, + .get_maxrecs = xfs_bmbt_get_maxrecs, + .get_minrecs = xfs_bmbt_get_minrecs, + .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, + .init_key_from_rec = xfs_bmbt_init_key_from_rec, + .init_rec_from_key = xfs_bmbt_init_rec_from_key, + .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, + .key_diff = xfs_bmbt_key_diff, + +#ifdef DEBUG + .keys_inorder = xfs_bmbt_keys_inorder, + .recs_inorder = xfs_bmbt_recs_inorder, +#endif + +#ifdef XFS_BTREE_TRACE + .trace_enter = xfs_bmbt_trace_enter, + .trace_cursor = xfs_bmbt_trace_cursor, + .trace_key = xfs_bmbt_trace_key, + .trace_record = xfs_bmbt_trace_record, +#endif +}; + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * /* new bmap btree cursor */ +xfs_bmbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* inode owning the btree */ + int whichfork) /* data or attr fork */ +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_btnum = XFS_BTNUM_BMAP; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_bmbt_ops; + cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + + cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_private.b.ip = ip; + cur->bc_private.b.firstblock = NULLFSBLOCK; + cur->bc_private.b.flist = NULL; + cur->bc_private.b.allocated = 0; + cur->bc_private.b.flags = 0; + cur->bc_private.b.whichfork = whichfork; + + return cur; +} + +/* + * Calculate number of records in a bmap btree block. + */ +int +xfs_bmbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_BMBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_bmbt_rec_t); + return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Calculate number of records in a bmap btree inode root. + */ +int +xfs_bmdr_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= sizeof(xfs_bmdr_block_t); + + if (leaf) + return blocklen / sizeof(xfs_bmdr_rec_t); + return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); +} diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h new file mode 100644 index 00000000..0e66c4ea --- /dev/null +++ b/fs/xfs/xfs_bmap_btree.h @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_BTREE_H__ +#define __XFS_BMAP_BTREE_H__ + +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ + +struct xfs_btree_cur; +struct xfs_btree_block; +struct xfs_mount; +struct xfs_inode; +struct xfs_trans; + +/* + * Bmap root header, on-disk form only. + */ +typedef struct xfs_bmdr_block { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +} xfs_bmdr_block_t; + +/* + * Bmap btree record and extent descriptor. + * l0:63 is an extent flag (value 1 indicates non-normal). + * l0:9-62 are startoff. + * l0:0-8 and l1:21-63 are startblock. + * l1:0-20 are blockcount. + */ +#define BMBT_EXNTFLAG_BITLEN 1 +#define BMBT_STARTOFF_BITLEN 54 +#define BMBT_STARTBLOCK_BITLEN 52 +#define BMBT_BLOCKCOUNT_BITLEN 21 + +typedef struct xfs_bmbt_rec { + __be64 l0, l1; +} xfs_bmbt_rec_t; + +typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; + +typedef struct xfs_bmbt_rec_host { + __uint64_t l0, l1; +} xfs_bmbt_rec_host_t; + +/* + * Values and macros for delayed-allocation startblock fields. + */ +#define STARTBLOCKVALBITS 17 +#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) +#define DSTARTBLOCKMASKBITS (15 + 20) +#define STARTBLOCKMASK \ + (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) +#define DSTARTBLOCKMASK \ + (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) + +static inline int isnullstartblock(xfs_fsblock_t x) +{ + return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; +} + +static inline int isnulldstartblock(xfs_dfsbno_t x) +{ + return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; +} + +static inline xfs_fsblock_t nullstartblock(int k) +{ + ASSERT(k < (1 << STARTBLOCKVALBITS)); + return STARTBLOCKMASK | (k); +} + +static inline xfs_filblks_t startblockval(xfs_fsblock_t x) +{ + return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); +} + +/* + * Possible extent formats. + */ +typedef enum { + XFS_EXTFMT_NOSTATE = 0, + XFS_EXTFMT_HASSTATE +} xfs_exntfmt_t; + +/* + * Possible extent states. + */ +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, + XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID +} xfs_exntst_t; + +/* + * Extent state and extent format macros. + */ +#define XFS_EXTFMT_INODE(x) \ + (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \ + XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE) +#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) + +/* + * Incore version of above. + */ +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; + +/* + * Key structure for non-leaf levels of the tree. + */ +typedef struct xfs_bmbt_key { + __be64 br_startoff; /* starting file offset */ +} xfs_bmbt_key_t, xfs_bmdr_key_t; + +/* btree pointer type */ +typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; + +/* + * Btree block header size depends on a superblock flag. 
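A delayed-allocation extent has no real startblock yet; the nullstartblock()/startblockval() helpers above encode "not allocated, but k worst-case indirect blocks are reserved" by setting every bit of STARTBLOCKMASK and keeping the reservation count in the low STARTBLOCKVALBITS bits. A stand-alone check of that encoding, assuming the XFS_BIG_BLKNOS == 1 value of STARTBLOCKMASKBITS:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define STARTBLOCKVALBITS  17
#define STARTBLOCKMASKBITS (15 + 20)    /* assuming XFS_BIG_BLKNOS == 1 */
#define STARTBLOCKMASK \
    ((((uint64_t)1 << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)

static int isnullstartblock(uint64_t x)
{
    return (x & STARTBLOCKMASK) == STARTBLOCKMASK;
}

static uint64_t nullstartblock(int k)
{
    assert(k < (1 << STARTBLOCKVALBITS));
    return STARTBLOCKMASK | (uint64_t)k;
}

static uint64_t startblockval(uint64_t x)
{
    return x & ~STARTBLOCKMASK;
}

int main(void)
{
    /* A delayed-allocation extent remembering 5 reserved indirect blocks. */
    uint64_t sb = nullstartblock(5);

    assert(isnullstartblock(sb));
    assert(startblockval(sb) == 5);
    /* An ordinary block number never has all of the mask bits set. */
    assert(!isnullstartblock(0x12345678ULL));
    printf("delalloc startblock = %#llx, indlen = %llu\n",
           (unsigned long long)sb, (unsigned long long)startblockval(sb));
    return 0;
}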
+ * + * (not quite yet, but soon) + */ +#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN + +#define XFS_BMBT_REC_ADDR(mp, block, index) \ + ((xfs_bmbt_rec_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_rec_t))) + +#define XFS_BMBT_KEY_ADDR(mp, block, index) \ + ((xfs_bmbt_key_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_key_t))) + +#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_bmbt_ptr_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_bmbt_key_t) + \ + ((index) - 1) * sizeof(xfs_bmbt_ptr_t))) + +#define XFS_BMDR_REC_ADDR(block, index) \ + ((xfs_bmdr_rec_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_rec_t))) + +#define XFS_BMDR_KEY_ADDR(block, index) \ + ((xfs_bmdr_key_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_key_t))) + +#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \ + ((xfs_bmdr_ptr_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + (maxrecs) * sizeof(xfs_bmdr_key_t) + \ + ((index) - 1) * sizeof(xfs_bmdr_ptr_t))) + +/* + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ + XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) + +#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ + (int)(XFS_BTREE_LBLOCK_LEN + \ + ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) + +#define XFS_BMAP_BROOT_SPACE(bb) \ + (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) +#define XFS_BMDR_SPACE_CALC(nrecs) \ + (int)(sizeof(xfs_bmdr_block_t) + \ + ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) + +/* + * Maximum number of bmap btree levels. + */ +#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) + +/* + * Prototypes for xfs_bmap.c to call. 
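The XFS_BMBT_*_ADDR macros above lay a block out as header, then records (leaf) or keys followed by pointers (node), with 1-based indices. How many entries fit, and where the pointer area starts, falls straight out of the sizes; the sketch below assumes a 4096-byte block and the 24-byte pre-CRC long-format header (records are two __be64 words, keys and pointers one each):

#include <stdio.h>

#define LBLOCK_HDR_LEN  24      /* magic(4)+level(2)+numrecs(2)+2 x 8-byte sibs */
#define BMBT_REC_LEN    16      /* xfs_bmbt_rec_t: two __be64 words */
#define BMBT_KEY_LEN    8       /* xfs_bmbt_key_t: one __be64 startoff */
#define BMBT_PTR_LEN    8       /* xfs_bmbt_ptr_t: one __be64 block number */

int main(void)
{
    int blocklen = 4096;        /* assumed filesystem block size */
    int space = blocklen - LBLOCK_HDR_LEN;
    int node_maxrecs = space / (BMBT_KEY_LEN + BMBT_PTR_LEN);

    /* Same arithmetic as xfs_bmbt_maxrecs(). */
    printf("leaf maxrecs = %d\n", space / BMBT_REC_LEN);        /* 254 */
    printf("node maxrecs = %d\n", node_maxrecs);                /* 254 */

    /* XFS_BMBT_PTR_ADDR: pointers start after all maxrecs keys. */
    printf("ptr[1] starts at byte %d\n",
           LBLOCK_HDR_LEN + node_maxrecs * BMBT_KEY_LEN);       /* 2056 */
    return 0;
}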
+ */ +extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int, + struct xfs_btree_block *, int); +extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); +extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); +extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); +extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); +extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); + +extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); +extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); + +extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); +extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, + xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); +extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v); +extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v); +extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); +extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); + +extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, + xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); + +extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, + xfs_bmdr_block_t *, int); + +extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); +extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); + +extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_inode *, int); + + +#endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c new file mode 100644 index 00000000..2f9e97c1 --- /dev/null +++ b/fs/xfs/xfs_btree.c @@ -0,0 +1,3679 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Cursor allocation zone. + */ +kmem_zone_t *xfs_btree_cur_zone; + +/* + * Btree magic numbers. 
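xfs_magics[] just below pairs each cursor btree number with the on-disk magic that xfs_btree_check_lblock/xfs_btree_check_sblock compare against bb_magic. The magics are ASCII tags; XFS_BMAP_MAGIC from xfs_bmap_btree.h above decodes like this:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t magic = 0x424d4150;        /* XFS_BMAP_MAGIC */
    char s[5];

    s[0] = (char)(magic >> 24);         /* 'B' */
    s[1] = (char)(magic >> 16);         /* 'M' */
    s[2] = (char)(magic >> 8);          /* 'A' */
    s[3] = (char)magic;                 /* 'P' */
    s[4] = '\0';
    printf("0x%08x spells \"%s\"\n", magic, s);
    return 0;
}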
+ */ +const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { + XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC +}; + + +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree long form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer for block, if any */ +{ + int lblock_ok; /* block passes checks */ + struct xfs_mount *mp; /* file system mount point */ + + mp = cur->bc_mp; + lblock_ok = + be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + block->bb_u.l.bb_leftsib && + (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, + XFS_ERRTAG_BTREE_CHECK_LBLOCK, + XFS_RANDOM_BTREE_CHECK_LBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree short form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block */ +{ + struct xfs_buf *agbp; /* buffer for ag. freespace struct */ + struct xfs_agf *agf; /* ag. freespace structure */ + xfs_agblock_t agflen; /* native ag. freespace length */ + int sblock_ok; /* block passes checks */ + + agbp = cur->bc_private.a.agbp; + agf = XFS_BUF_TO_AGF(agbp); + agflen = be32_to_cpu(agf->agf_length); + sblock_ok = + be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK || + be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && + block->bb_u.s.bb_leftsib && + (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK || + be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && + block->bb_u.s.bb_rightsib; + if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, + XFS_ERRTAG_BTREE_CHECK_SBLOCK, + XFS_RANDOM_BTREE_CHECK_SBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", + XFS_ERRLEVEL_LOW, cur->bc_mp, block); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Debug routine: check that block header is ok. + */ +int +xfs_btree_check_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block, if any */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_check_lblock(cur, block, level, bp); + else + return xfs_btree_check_sblock(cur, block, level, bp); +} + +/* + * Check that (long) pointer is ok. 
+ */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_dfsbno_t bno, /* btree block disk address */ + int level) /* btree block level */ +{ + XFS_WANT_CORRUPTED_RETURN( + level > 0 && + bno != NULLDFSBNO && + XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); + return 0; +} + +#ifdef DEBUG +/* + * Check that (short) pointer is ok. + */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* btree block disk address */ + int level) /* btree block level */ +{ + xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; + + XFS_WANT_CORRUPTED_RETURN( + level > 0 && + bno != NULLAGBLOCK && + bno != 0 && + bno < agblocks); + return 0; +} + +/* + * Check that block ptr is ok. + */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_ptr( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_ptr *ptr, /* btree block disk address */ + int index, /* offset from ptr to check */ + int level) /* btree block level */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + return xfs_btree_check_lptr(cur, + be64_to_cpu((&ptr->l)[index]), level); + } else { + return xfs_btree_check_sptr(cur, + be32_to_cpu((&ptr->s)[index]), level); + } +} +#endif + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error) /* del because of error */ +{ + int i; /* btree level */ + + /* + * Clear the buffer pointers, and release the buffers. + * If we're doing this in the face of an error, we + * need to make sure to inspect all of the entries + * in the bc_bufs array for buffers to be unlocked. + * This is because some of the btree code works from + * level n down to 0, and if we get an error along + * the way we won't have initialized all the entries + * down to 0. + */ + for (i = 0; i < cur->bc_nlevels; i++) { + if (cur->bc_bufs[i]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); + else if (!error) + break; + } + /* + * Can't free a bmap cursor without having dealt with the + * allocated indirect blocks' accounting. + */ + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || + cur->bc_private.b.allocated == 0); + /* + * Free the cursor. + */ + kmem_zone_free(xfs_btree_cur_zone, cur); +} + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur) /* output cursor */ +{ + xfs_buf_t *bp; /* btree block's buffer pointer */ + int error; /* error return value */ + int i; /* level number of btree block */ + xfs_mount_t *mp; /* mount structure for filesystem */ + xfs_btree_cur_t *new; /* new cursor value */ + xfs_trans_t *tp; /* transaction pointer, can be NULL */ + + tp = cur->bc_tp; + mp = cur->bc_mp; + + /* + * Allocate a new cursor like the old one. + */ + new = cur->bc_ops->dup_cursor(cur); + + /* + * Copy the record currently in the cursor. + */ + new->bc_rec = cur->bc_rec; + + /* + * For each level current, re-get the buffer and copy the ptr value. 
+ */ + for (i = 0; i < new->bc_nlevels; i++) { + new->bc_ptrs[i] = cur->bc_ptrs[i]; + new->bc_ra[i] = cur->bc_ra[i]; + if ((bp = cur->bc_bufs[i])) { + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + xfs_btree_del_cursor(new, error); + *ncur = NULL; + return error; + } + new->bc_bufs[i] = bp; + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + } else + new->bc_bufs[i] = NULL; + } + *ncur = new; + return 0; +} + +/* + * XFS btree block layout and addressing: + * + * There are two types of blocks in the btree: leaf and non-leaf blocks. + * + * The leaf record start with a header then followed by records containing + * the values. A non-leaf block also starts with the same header, and + * then first contains lookup keys followed by an equal number of pointers + * to the btree blocks at the previous level. + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * The header is called struct xfs_btree_block for reasons better left unknown + * and comes in different versions for short (32bit) and long (64bit) block + * pointers. The record and key structures are defined by the btree instances + * and opaque to the btree core. The block pointers are simple disk endian + * integers, available in a short (32bit) and long (64bit) variant. + * + * The helpers below calculate the offset of a given record, key or pointer + * into a btree block (xfs_btree_*_offset) or return a pointer to the given + * record, key or pointer (xfs_btree_*_addr). Note that all addressing + * inside the btree block is done using indices starting at one, not zero! + */ + +/* + * Return size of the btree block header for this btree instance. + */ +static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) +{ + return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + XFS_BTREE_LBLOCK_LEN : + XFS_BTREE_SBLOCK_LEN; +} + +/* + * Return size of btree block pointers for this btree instance. + */ +static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) +{ + return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + sizeof(__be64) : sizeof(__be32); +} + +/* + * Calculate offset of the n-th record in a btree block. + */ +STATIC size_t +xfs_btree_rec_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->rec_len; +} + +/* + * Calculate offset of the n-th key in a btree block. + */ +STATIC size_t +xfs_btree_key_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->key_len; +} + +/* + * Calculate offset of the n-th block pointer in a btree block. + */ +STATIC size_t +xfs_btree_ptr_offset( + struct xfs_btree_cur *cur, + int n, + int level) +{ + return xfs_btree_block_len(cur) + + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + + (n - 1) * xfs_btree_ptr_len(cur); +} + +/* + * Return a pointer to the n-th record in the btree block. + */ +STATIC union xfs_btree_rec * +xfs_btree_rec_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_rec *) + ((char *)block + xfs_btree_rec_offset(cur, n)); +} + +/* + * Return a pointer to the n-th key in the btree block. 
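The layout comment above is the key to the generic addressing helpers: every block is header, then keys (or records), then pointers, and only the header length and pointer width differ between the short-form (per-AG, 32-bit pointers) and long-form (bmap, 64-bit pointers) flavours, which is what xfs_btree_block_len() and xfs_btree_ptr_len() select. A stand-alone version of xfs_btree_ptr_offset() under the assumption of the pre-CRC header sizes (16 bytes short form, 24 bytes long form) and made-up key sizes and record counts:

#include <stdio.h>

#define SBLOCK_HDR_LEN  16      /* magic(4)+level(2)+numrecs(2)+2 x 4-byte sibs */
#define LBLOCK_HDR_LEN  24      /* magic(4)+level(2)+numrecs(2)+2 x 8-byte sibs */

/* Same shape as xfs_btree_ptr_offset(): header, then maxrecs keys, then the
 * pointers; indices are 1-based. */
static int ptr_offset(int long_ptrs, int n, int maxrecs, int key_len)
{
    int hdr = long_ptrs ? LBLOCK_HDR_LEN : SBLOCK_HDR_LEN;
    int ptr_len = long_ptrs ? 8 : 4;

    return hdr + maxrecs * key_len + (n - 1) * ptr_len;
}

int main(void)
{
    /* Illustrative numbers only: a 100-entry node with 4-byte keys in the
     * short-form tree and 8-byte keys in the long-form tree. */
    printf("short form: ptr[1] at byte %d\n", ptr_offset(0, 1, 100, 4)); /* 416 */
    printf("long  form: ptr[1] at byte %d\n", ptr_offset(1, 1, 100, 8)); /* 824 */
    return 0;
}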
+ */ +STATIC union xfs_btree_key * +xfs_btree_key_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_key *) + ((char *)block + xfs_btree_key_offset(cur, n)); +} + +/* + * Return a pointer to the n-th block pointer in the btree block. + */ +STATIC union xfs_btree_ptr * +xfs_btree_ptr_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + int level = xfs_btree_get_level(block); + + ASSERT(block->bb_level != 0); + + return (union xfs_btree_ptr *) + ((char *)block + xfs_btree_ptr_offset(cur, n, level)); +} + +/* + * Get a the root block which is stored in the inode. + * + * For now this btree implementation assumes the btree root is always + * stored in the if_broot field of an inode fork. + */ +STATIC struct xfs_btree_block * +xfs_btree_get_iroot( + struct xfs_btree_cur *cur) +{ + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; +} + +/* + * Retrieve the block pointer from the cursor at the given level. + * This may be an inode btree root or from a buffer. + */ +STATIC struct xfs_btree_block * /* generic btree block pointer */ +xfs_btree_get_block( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree */ + struct xfs_buf **bpp) /* buffer containing the block */ +{ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *bpp = NULL; + return xfs_btree_get_iroot(cur); + } + + *bpp = cur->bc_bufs[level]; + return XFS_BUF_TO_BLOCK(*bpp); +} + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +xfs_buf_t * /* buffer for fsbno */ +xfs_btree_get_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_buf_t *bp; /* buffer pointer (return value) */ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + return bp; +} + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +xfs_buf_t * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_buf_t *bp; /* buffer pointer (return value) */ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + return bp; +} + +/* + * Check for the cursor referring to the last block at the given level. 
+ */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to check */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO; + else + return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK; +} + +/* + * Change the cursor to point to the first record at the given level. + * Other levels are unaffected. + */ +STATIC int /* success=1, failure=0 */ +xfs_btree_firstrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_numrecs) + return 0; + /* + * Set the ptr value to 1, that's the first record/key. + */ + cur->bc_ptrs[level] = 1; + return 1; +} + +/* + * Change the cursor to point to the last record in the current block + * at the given level. Other levels are unaffected. + */ +STATIC int /* success=1, failure=0 */ +xfs_btree_lastrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_numrecs) + return 0; + /* + * Set the ptr value to numrecs, that's the last record/key. + */ + cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs); + return 1; +} + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets, /* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last) /* output: last byte offset */ +{ + int i; /* current bit number */ + __int64_t imask; /* mask for current bit number */ + + ASSERT(fields != 0); + /* + * Find the lowest bit, so the first byte offset. + */ + for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + if (imask & fields) { + *first = offsets[i]; + break; + } + } + /* + * Find the highest bit, so the last byte offset. + */ + for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + if (imask & fields) { + *last = offsets[i + 1] - 1; + break; + } + } +} + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. 
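xfs_btree_offsets() above converts a bitmask of dirty fields into a single contiguous byte range that can be handed to xfs_trans_log_buf(): the lowest set bit picks the first byte, and the highest set bit picks the last byte via the start of the next field. A self-contained copy of that logic over a toy structure; the structure and its offset table are hypothetical, but the table carries one extra entry (the total size) so "start of the next field" is defined even for the last bit, as the algorithm requires:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* A toy on-disk header and its per-field byte offsets, in bit order. */
struct toy {
    uint32_t magic;         /* field bit 0 */
    uint16_t level;         /* field bit 1 */
    uint16_t numrecs;       /* field bit 2 */
    uint32_t leftsib;       /* field bit 3 */
    uint32_t rightsib;      /* field bit 4 */
};

static const short toy_offsets[] = {
    offsetof(struct toy, magic),
    offsetof(struct toy, level),
    offsetof(struct toy, numrecs),
    offsetof(struct toy, leftsib),
    offsetof(struct toy, rightsib),
    sizeof(struct toy),
};

static void field_range(long long fields, const short *offsets, int nbits,
                        int *first, int *last)
{
    long long imask;
    int i;

    /* lowest set bit -> first byte of the range */
    for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
        if (imask & fields) {
            *first = offsets[i];
            break;
        }
    }
    /* highest set bit -> last byte (start of the next field minus one) */
    for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
        if (imask & fields) {
            *last = offsets[i + 1] - 1;
            break;
        }
    }
}

int main(void)
{
    int first, last;

    /* Only numrecs (bit 2) and rightsib (bit 4) changed. */
    field_range((1 << 2) | (1 << 4), toy_offsets, 5, &first, &last);
    printf("log bytes %d..%d of the header\n", first, last);    /* 6..15 */
    return 0;
}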
+ */ +int /* error */ +xfs_btree_read_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + xfs_buf_t **bpp, /* buffer for fsbno */ + int refval) /* ref count value for buffer */ +{ + xfs_buf_t *bp; /* return value */ + xfs_daddr_t d; /* real disk block address */ + int error; + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp))) { + return error; + } + ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + if (bp) + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + *bpp = bp; + return 0; +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +/* ARGSUSED */ +void +xfs_btree_reada_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count) /* count of filesystem blocks */ +{ + xfs_daddr_t d; + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing. + */ +/* ARGSUSED */ +void +xfs_btree_reada_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count) /* count of filesystem blocks */ +{ + xfs_daddr_t d; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); +} + +STATIC int +xfs_btree_readahead_lblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, left, 1); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, right, 1); + rval++; + } + + return rval; +} + +STATIC int +xfs_btree_readahead_sblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); + xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); + + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + left, 1); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + right, 1); + rval++; + } + + return rval; +} + +/* + * Read-ahead btree blocks, at the given level. + * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. + */ +STATIC int +xfs_btree_readahead( + struct xfs_btree_cur *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr) /* left/right bits */ +{ + struct xfs_btree_block *block; + + /* + * No readahead needed if we are at the root level and the + * btree root is stored in the inode. 
+ */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (lev == cur->bc_nlevels - 1)) + return 0; + + if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) + return 0; + + cur->bc_ra[lev] |= lr; + block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_readahead_lblock(cur, lr, block); + return xfs_btree_readahead_sblock(cur, lr, block); +} + +/* + * Set the buffer for level "lev" in the cursor to bp, releasing + * any previous buffer. + */ +STATIC void +xfs_btree_setbuf( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + xfs_buf_t *bp) /* new buffer to set */ +{ + struct xfs_btree_block *b; /* btree block */ + + if (cur->bc_bufs[lev]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); + cur->bc_bufs[lev] = bp; + cur->bc_ra[lev] = 0; + + b = XFS_BUF_TO_BLOCK(bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } else { + if (be32_to_cpu(b->bb_u.s.bb_leftsib) == NULLAGBLOCK) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (be32_to_cpu(b->bb_u.s.bb_rightsib) == NULLAGBLOCK) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } +} + +STATIC int +xfs_btree_ptr_is_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return be64_to_cpu(ptr->l) == NULLDFSBNO; + else + return be32_to_cpu(ptr->s) == NULLAGBLOCK; +} + +STATIC void +xfs_btree_set_ptr_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(NULLDFSBNO); + else + ptr->s = cpu_to_be32(NULLAGBLOCK); +} + +/* + * Get/set/init sibling pointers + */ +STATIC void +xfs_btree_get_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + ptr->l = block->bb_u.l.bb_rightsib; + else + ptr->l = block->bb_u.l.bb_leftsib; + } else { + if (lr == XFS_BB_RIGHTSIB) + ptr->s = block->bb_u.s.bb_rightsib; + else + ptr->s = block->bb_u.s.bb_leftsib; + } +} + +STATIC void +xfs_btree_set_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.l.bb_rightsib = ptr->l; + else + block->bb_u.l.bb_leftsib = ptr->l; + } else { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.s.bb_rightsib = ptr->s; + else + block->bb_u.s.bb_leftsib = ptr->s; + } +} + +STATIC void +xfs_btree_init_block( + struct xfs_btree_cur *cur, + int level, + int numrecs, + struct xfs_btree_block *new) /* new block */ +{ + new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); + new->bb_level = cpu_to_be16(level); + new->bb_numrecs = cpu_to_be16(numrecs); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + } else { + new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + } +} + +/* + * Return true if ptr is the last record in the btree and + * we need to track updateѕ to this record. The decision + * will be further refined in the update_lastrec method. 
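xfs_btree_readahead() above keeps one bit per direction in bc_ra[level] so it never issues the same left or right readahead twice for a block; the early return relies on the fact that OR-ing in an already-present bit cannot change the word. The same idiom in isolation (the flag names are stand-ins for XFS_BTCUR_LEFTRA/XFS_BTCUR_RIGHTRA):

#include <stdio.h>

#define LEFTRA  (1 << 0)        /* stand-in for XFS_BTCUR_LEFTRA */
#define RIGHTRA (1 << 1)        /* stand-in for XFS_BTCUR_RIGHTRA */

static int ra_done;             /* plays the role of one bc_ra[level] word */

static int readahead(int lr)
{
    /* Every requested bit already set?  Then OR-ing lr in changes nothing
     * and there is no new readahead to issue. */
    if ((ra_done | lr) == ra_done)
        return 0;
    ra_done |= lr;
    return 1;                   /* pretend the readahead was issued */
}

int main(void)
{
    printf("%d\n", readahead(LEFTRA));           /* 1: issued */
    printf("%d\n", readahead(LEFTRA));           /* 0: already done */
    printf("%d\n", readahead(LEFTRA | RIGHTRA)); /* 1: right side is new */
    printf("%d\n", readahead(RIGHTRA));          /* 0: already done */
    return 0;
}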
+ */ +STATIC int +xfs_btree_is_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level) +{ + union xfs_btree_ptr ptr; + + if (level > 0) + return 0; + if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) + return 0; + + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &ptr)) + return 0; + return 1; +} + +STATIC void +xfs_btree_buf_to_ptr( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + XFS_BUF_ADDR(bp))); + else { + ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, + XFS_BUF_ADDR(bp))); + } +} + +STATIC xfs_daddr_t +xfs_btree_ptr_to_daddr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO); + + return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + } else { + ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); + ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK); + + return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + be32_to_cpu(ptr->s)); + } +} + +STATIC void +xfs_btree_set_refs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + switch (cur->bc_btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + break; + case XFS_BTNUM_INO: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); + break; + case XFS_BTNUM_BMAP: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); + break; + default: + ASSERT(0); + } +} + +STATIC int +xfs_btree_get_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags); + + ASSERT(*bpp); + ASSERT(!XFS_BUF_GETERROR(*bpp)); + + *block = XFS_BUF_TO_BLOCK(*bpp); + return 0; +} + +/* + * Read in the buffer at the given ptr and return the buffer and + * the block pointer within the buffer. + */ +STATIC int +xfs_btree_read_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int level, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + int error; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags, bpp); + if (error) + return error; + + ASSERT(*bpp != NULL); + ASSERT(!XFS_BUF_GETERROR(*bpp)); + + xfs_btree_set_refs(cur, *bpp); + *block = XFS_BUF_TO_BLOCK(*bpp); + + error = xfs_btree_check_block(cur, *block, level, *bpp); + if (error) + xfs_trans_brelse(cur->bc_tp, *bpp); + return error; +} + +/* + * Copy keys from one btree block to another. + */ +STATIC void +xfs_btree_copy_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + union xfs_btree_key *src_key, + int numkeys) +{ + ASSERT(numkeys >= 0); + memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); +} + +/* + * Copy records from one btree block to another. 
+ */ +STATIC void +xfs_btree_copy_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *dst_rec, + union xfs_btree_rec *src_rec, + int numrecs) +{ + ASSERT(numrecs >= 0); + memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Copy block pointers from one btree block to another. + */ +STATIC void +xfs_btree_copy_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + union xfs_btree_ptr *src_ptr, + int numptrs) +{ + ASSERT(numptrs >= 0); + memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Shift keys one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + int dir, + int numkeys) +{ + char *dst_key; + + ASSERT(numkeys >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_key = (char *)key + (dir * cur->bc_ops->key_len); + memmove(dst_key, key, numkeys * cur->bc_ops->key_len); +} + +/* + * Shift records one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + int dir, + int numrecs) +{ + char *dst_rec; + + ASSERT(numrecs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); + memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Shift block pointers one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int dir, + int numptrs) +{ + char *dst_ptr; + + ASSERT(numptrs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); + memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Log key values from the btree block. + */ +STATIC void +xfs_btree_log_keys( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_key_offset(cur, first), + xfs_btree_key_offset(cur, last + 1) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log record values from the btree block. + */ +void +xfs_btree_log_recs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_rec_offset(cur, first), + xfs_btree_rec_offset(cur, last + 1) - 1); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log block pointer fields from a btree block (nonleaf). 
+ */ +STATIC void +xfs_btree_log_ptrs( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int first, /* index of first pointer to log */ + int last) /* index of last pointer to log */ +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + int level = xfs_btree_get_level(block); + + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_ptr_offset(cur, first, level), + xfs_btree_ptr_offset(cur, last + 1, level) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log fields from a btree block header. + */ +void +xfs_btree_log_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int fields) /* mask of fields: XFS_BB_... */ +{ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + static const short soffsets[] = { /* table of offsets (short) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), + XFS_BTREE_SBLOCK_LEN + }; + static const short loffsets[] = { /* table of offsets (long) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), + XFS_BTREE_LBLOCK_LEN + }; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBI(cur, bp, fields); + + if (bp) { + xfs_btree_offsets(fields, + (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + loffsets : soffsets, + XFS_BB_NUM_BITS, &first, &last); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_increment( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + union xfs_btree_ptr ptr; + struct xfs_buf *bp; + int error; /* error return value */ + int lev; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the right at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* We're done if we remain in the block after the increment. */ + if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block)) + goto out1; + + /* Fail if we just went off the right edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, increment); + + /* + * March up the tree incrementing pointers. + * Stop when we don't go off the right edge of a block. 
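+ *
+ * As a rough worked example (record counts are hypothetical): with
+ * bc_nlevels == 3 and bc_ptrs == { 4, 2, 1 } over a leaf holding
+ * only 4 records, bumping the leaf index to 5 runs off the block,
+ * so this loop bumps level 1 from 2 to 3; if that block has at
+ * least 3 entries we stop there, and the walk back down below
+ * reads the new child and resets bc_ptrs[0] to 1.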
+ */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + block = xfs_btree_get_block(cur, lev, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, lev, bp); + if (error) + goto error0; +#endif + + if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block)) + break; + + /* Read-ahead the right block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + + /* + * If we went off the root then we are either seriously + * confused or have the tree root in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + error = xfs_btree_read_buf_block(cur, ptrp, --lev, + 0, &block, &bp); + if (error) + goto error0; + + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = 1; + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_decrement( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + xfs_buf_t *bp; + int error; /* error return value */ + int lev; + union xfs_btree_ptr ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the left at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + + /* We're done if we remain in the block after the decrement. */ + if (--cur->bc_ptrs[level] > 0) + goto out1; + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we just went off the left edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, decrement); + + /* + * March up the tree decrementing pointers. + * Stop when we don't go off the left edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + if (--cur->bc_ptrs[lev] > 0) + break; + /* Read-ahead the left block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + + /* + * If we went off the root then we are seriously confused. + * or the root of the tree is in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. 
+ */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + error = xfs_btree_read_buf_block(cur, ptrp, --lev, + 0, &block, &bp); + if (error) + goto error0; + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_btree_lookup_get_block( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in the btree */ + union xfs_btree_ptr *pp, /* ptr to btree block */ + struct xfs_btree_block **blkp) /* return btree block */ +{ + struct xfs_buf *bp; /* buffer pointer for btree block */ + int error = 0; + + /* special case the root block if in an inode */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *blkp = xfs_btree_get_iroot(cur); + return 0; + } + + /* + * If the old buffer at this level for the disk address we are + * looking for re-use it. + * + * Otherwise throw it away and get a new one. + */ + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) { + *blkp = XFS_BUF_TO_BLOCK(bp); + return 0; + } + + error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp); + if (error) + return error; + + xfs_btree_setbuf(cur, level, bp); + return 0; +} + +/* + * Get current search key. For level 0 we don't actually have a key + * structure so we make one up from the record. For all other levels + * we just return the right key. + */ +STATIC union xfs_btree_key * +xfs_lookup_get_search_key( + struct xfs_btree_cur *cur, + int level, + int keyno, + struct xfs_btree_block *block, + union xfs_btree_key *kp) +{ + if (level == 0) { + cur->bc_ops->init_key_from_rec(kp, + xfs_btree_rec_addr(cur, keyno, block)); + return kp; + } + + return xfs_btree_key_addr(cur, keyno, block); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. + * Return 0 if can't find any such record, 1 for success. + */ +int /* error */ +xfs_btree_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_lookup_t dir, /* <=, ==, or >= */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* current btree block */ + __int64_t diff; /* difference for the current key */ + int error; /* error return value */ + int keyno; /* current key number */ + int level; /* level in the btree */ + union xfs_btree_ptr *pp; /* ptr to btree block */ + union xfs_btree_ptr ptr; /* ptr to btree block */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, dir); + + XFS_BTREE_STATS_INC(cur, lookup); + + block = NULL; + keyno = 0; + + /* initialise start pointer from cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + pp = &ptr; + + /* + * Iterate over each level in the btree, starting at the root. + * For each level above the leaves, find the key we need, based + * on the lookup record, then follow the corresponding block + * pointer down to the next level. + */ + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + /* Get the block we need to do the lookup on. */ + error = xfs_btree_lookup_get_block(cur, level, pp, &block); + if (error) + goto error0; + + if (diff == 0) { + /* + * If we already had a key match at a higher level, we + * know we need to use the first entry in this block. 
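+ * An exact key match at an interior level means the lookup key
+ * equals the smallest key of that child's subtree, so every block
+ * we descend into from here is entered at its first entry.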
+ */ + keyno = 1; + } else { + /* Otherwise search this block. Do a binary search. */ + + int high; /* high entry number */ + int low; /* low entry number */ + + /* Set low and high entry numbers, 1-based. */ + low = 1; + high = xfs_btree_get_numrecs(block); + if (!high) { + /* Block is empty, must be an empty leaf. */ + ASSERT(level == 0 && cur->bc_nlevels == 1); + + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Binary search the block. */ + while (low <= high) { + union xfs_btree_key key; + union xfs_btree_key *kp; + + XFS_BTREE_STATS_INC(cur, compare); + + /* keyno is average of low and high. */ + keyno = (low + high) >> 1; + + /* Get current search key */ + kp = xfs_lookup_get_search_key(cur, level, + keyno, block, &key); + + /* + * Compute difference to get next direction: + * - less than, move right + * - greater than, move left + * - equal, we're done + */ + diff = cur->bc_ops->key_diff(cur, kp); + if (diff < 0) + low = keyno + 1; + else if (diff > 0) + high = keyno - 1; + else + break; + } + } + + /* + * If there are more levels, set up for the next level + * by getting the block number and filling in the cursor. + */ + if (level > 0) { + /* + * If we moved left, need the previous key number, + * unless there isn't one. + */ + if (diff > 0 && --keyno < 1) + keyno = 1; + pp = xfs_btree_ptr_addr(cur, keyno, block); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, pp, 0, level); + if (error) + goto error0; +#endif + cur->bc_ptrs[level] = keyno; + } + } + + /* Done with the search. See if we need to adjust the results. */ + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. + */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (dir == XFS_LOOKUP_GE && + keyno > xfs_btree_get_numrecs(block) && + !xfs_btree_ptr_is_null(cur, &ptr)) { + int i; + + cur->bc_ptrs[0] = keyno; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + + /* Return if we succeeded or not. */ + if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) + *stat = 0; + else if (dir != XFS_LOOKUP_EQ || diff == 0) + *stat = 1; + else + *stat = 0; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Update keys at all levels from here to the root along the cursor's path. + */ +STATIC int +xfs_btree_updkey( + struct xfs_btree_cur *cur, + union xfs_btree_key *keyp, + int level) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_key *kp; + int ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIK(cur, level, keyp); + + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1); + + /* + * Go up the tree from this level toward the root. + * At each level, update the key value to the value input. + * Stop when we reach a level where the cursor isn't pointing + * at the first entry in the block. 
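+ * Only a block's first key is duplicated in its parent, so once the
+ * cursor sits past entry 1 at some level nothing above that level
+ * can change; that is also why xfs_btree_update() and
+ * xfs_btree_insrec() below only call this for ptr == 1.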
+ */ + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { +#ifdef DEBUG + int error; +#endif + block = xfs_btree_get_block(cur, level, &bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + kp = xfs_btree_key_addr(cur, ptr, block); + xfs_btree_copy_keys(cur, kp, keyp, 1); + xfs_btree_log_keys(cur, bp, ptr, ptr); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Update the record referred to by cur to the value in the + * given record. This either works (return 0) or gets an + * EFSCORRUPTED error. + */ +int +xfs_btree_update( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + int error; + int ptr; + union xfs_btree_rec *rp; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGR(cur, rec); + + /* Pick up the current block. */ + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + goto error0; +#endif + /* Get the address of the rec to be updated. */ + ptr = cur->bc_ptrs[0]; + rp = xfs_btree_rec_addr(cur, ptr, block); + + /* Fill in the new contents and log them. */ + xfs_btree_copy_recs(cur, rp, rec, 1); + xfs_btree_log_recs(cur, bp, ptr, ptr); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, 0)) { + cur->bc_ops->update_lastrec(cur, block, rec, + ptr, LASTREC_UPDATE); + } + + /* Updating first rec in leaf. Pass new key value up to our parent. */ + if (ptr == 1) { + union xfs_btree_key key; + + cur->bc_ops->init_key_from_rec(&key, rec); + error = xfs_btree_updkey(cur, &key, 1); + if (error) + goto error0; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_btree_lshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs; /* left record count */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + int rrecs; /* right record count */ + union xfs_btree_ptr lptr; /* left btree pointer */ + union xfs_btree_key *rkp = NULL; /* right btree key */ + union xfs_btree_ptr *rpp = NULL; /* right address pointer */ + union xfs_btree_rec *rrp = NULL; /* right record pointer */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) + goto out0; + + /* Set up variables for this block as "right". */ + right = xfs_btree_get_block(cur, level, &rbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, right, level, rbp); + if (error) + goto error0; +#endif + + /* If we've got no left sibling then we can't shift an entry left. */ + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &lptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. 
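+ * That is, when bc_ptrs[level] is 1 the entry under the cursor is
+ * the one that would migrate into the left sibling and the cursor
+ * would have to follow it into another block; callers such as
+ * xfs_btree_make_block_unfull() simply fall back to splitting
+ * instead.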
+ */ + if (cur->bc_ptrs[level] <= 1) + goto out0; + + /* Set up the left neighbor as "left". */ + error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + lrecs = xfs_btree_get_numrecs(left); + if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + rrecs = xfs_btree_get_numrecs(right); + + /* + * We add one entry to the left side and remove one for the right side. + * Account for it here, the changes will be updated on disk and logged + * later. + */ + lrecs++; + rrecs--; + + XFS_BTREE_STATS_INC(cur, lshift); + XFS_BTREE_STATS_ADD(cur, moves, 1); + + /* + * If non-leaf, copy a key and a ptr to the left block. + * Log the changes to the left block. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, rpp, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_keys(cur, lkp, rkp, 1); + xfs_btree_copy_ptrs(cur, lpp, rpp, 1); + + xfs_btree_log_keys(cur, lbp, lrecs, lrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->keys_inorder(cur, + xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, 1); + xfs_btree_log_recs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->recs_inorder(cur, + xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); + } + + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Slide the contents of right down one entry. + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ +#ifdef DEBUG + int i; /* loop index */ + + for (i = 0; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i + 1, level); + if (error) + goto error0; + } +#endif + xfs_btree_shift_keys(cur, + xfs_btree_key_addr(cur, 2, right), + -1, rrecs); + xfs_btree_shift_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, right), + -1, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + } else { + /* It's a leaf. operate on records */ + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, 2, right), + -1, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, right)); + rkp = &key; + } + + /* Update the parent key values of right. */ + error = xfs_btree_updkey(cur, rkp, level + 1); + if (error) + goto error0; + + /* Slide the cursor value left one. */ + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. 
+ */ +STATIC int /* error */ +xfs_btree_rshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + union xfs_btree_ptr rptr; /* right block pointer */ + union xfs_btree_key *rkp; /* right btree key */ + int rrecs; /* right record count */ + int lrecs; /* left record count */ + int error; /* error return value */ + int i; /* loop counter */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) + goto out0; + + /* Set up variables for this block as "left". */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + /* If we've got no right sibling then we can't shift an entry right. */ + xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + lrecs = xfs_btree_get_numrecs(left); + if (cur->bc_ptrs[level] >= lrecs) + goto out0; + + /* Set up the right neighbor as "right". */ + error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + rrecs = xfs_btree_get_numrecs(right); + if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, rshift); + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Make a hole at the start of the right neighbor block, then + * copy the last left block entry to the hole. + */ + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + union xfs_btree_ptr *rpp; + + lkp = xfs_btree_key_addr(cur, lrecs, left); + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = rrecs - 1; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_shift_keys(cur, rkp, 1, rrecs); + xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, lpp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, and log it. */ + xfs_btree_copy_keys(cur, rkp, lkp, 1); + xfs_btree_copy_ptrs(cur, rpp, lpp, 1); + + xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); + + ASSERT(cur->bc_ops->keys_inorder(cur, rkp, + xfs_btree_key_addr(cur, 2, right))); + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *lrp; + union xfs_btree_rec *rrp; + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_shift_recs(cur, rrp, 1, rrecs); + + /* Now put the new data in, and log it. 
*/ + xfs_btree_copy_recs(cur, rrp, lrp, 1); + xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); + + cur->bc_ops->init_key_from_rec(&key, rrp); + rkp = &key; + + ASSERT(cur->bc_ops->recs_inorder(cur, rrp, + xfs_btree_rec_addr(cur, 2, right))); + } + + /* + * Decrement and log left's numrecs, bump and log right's numrecs. + */ + xfs_btree_set_numrecs(left, --lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, ++rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Using a temporary cursor, update the parent key values of the + * block on the right. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error1; + + error = xfs_btree_updkey(tcur, rkp, level + 1); + if (error) + goto error1; + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + +error1: + XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Split cur/level block in half. + * Return new block number and the key to its first + * record (to be inserted into parent). + */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rrptr; /* right-right sibling ptr */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + int lrecs; + int rrecs; + int src_index; + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key); + + XFS_BTREE_STATS_INC(cur, split); + + /* Set up left block (current one). */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block as "right". */ + error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* Fill in the btree header for the new right block. */ + xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); + + /* + * Split the entries between the old and the new block evenly. + * Make sure that if there's an odd number of entries now, that + * each new block will have the same number of entries. 
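+ *
+ * For example (hypothetical counts): with lrecs == 7, rrecs starts
+ * at 3; if the cursor points into the first half
+ * (bc_ptrs[level] <= 4) rrecs is bumped to 4, leaving only 3
+ * records in the block that is about to receive the new entry, so
+ * both blocks hold 4 records once the pending insert completes.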
+ */ + lrecs = xfs_btree_get_numrecs(left); + rrecs = lrecs / 2; + if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1) + rrecs++; + src_index = (lrecs - rrecs + 1); + + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Copy btree block entries from the left block over to the + * new block, the right. Update the right block and log the + * changes. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, src_index, left); + lpp = xfs_btree_ptr_addr(cur, src_index, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = src_index; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_copy_keys(cur, rkp, lkp, rrecs); + xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + + /* Grab the keys to the entries moved to the right block */ + xfs_btree_copy_keys(cur, key, rkp, 1); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, src_index, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, rrp, lrp, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + cur->bc_ops->init_key_from_rec(key, + xfs_btree_rec_addr(cur, 1, right)); + } + + + /* + * Find the left block number by looking in the buffer. + * Adjust numrecs, sibling pointers. + */ + xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + + lrecs -= rrecs; + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); + + xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* + * If there's a block to the new block's right, make that block + * point back to right instead of to left. + */ + if (!xfs_btree_ptr_is_null(cur, &rrptr)) { + error = xfs_btree_read_buf_block(cur, &rrptr, level, + 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + /* + * If the cursor is really in the right block, move it there. + * If it's just pointing past the last entry in left, then we'll + * insert there, so don't change anything in that case. + */ + if (cur->bc_ptrs[level] > lrecs + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= lrecs; + } + /* + * If there are more levels, we'll need another cursor which refers + * the right block, no matter where this cursor was. 
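+ * The caller still has to insert the new block's key at the parent
+ * level; it does so through this duplicated cursor, whose parent
+ * slot is bumped by one below so that it points just past the
+ * original block's entry.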
+ */ + if (level + 1 < cur->bc_nlevels) { + error = xfs_btree_dup_cursor(cur, curp); + if (error) + goto error0; + (*curp)->bc_ptrs[level + 1]++; + } + *ptrp = rptr; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Copy the old inode root contents into a real block and make the + * broot point to it. + */ +int /* error */ +xfs_btree_new_iroot( + struct xfs_btree_cur *cur, /* btree cursor */ + int *logflags, /* logging flags for inode */ + int *stat) /* return status - 0 fail */ +{ + struct xfs_buf *cbp; /* buffer for cblock */ + struct xfs_btree_block *block; /* btree block */ + struct xfs_btree_block *cblock; /* child btree block */ + union xfs_btree_key *ckp; /* child key pointer */ + union xfs_btree_ptr *cpp; /* child ptr pointer */ + union xfs_btree_key *kp; /* pointer to btree key */ + union xfs_btree_ptr *pp; /* pointer to block addr */ + union xfs_btree_ptr nptr; /* new block addr */ + int level; /* btree level */ + int error; /* error return code */ +#ifdef DEBUG + int i; /* loop counter */ +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + level = cur->bc_nlevels - 1; + + block = xfs_btree_get_iroot(cur); + pp = xfs_btree_ptr_addr(cur, 1, block); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat); + if (error) + goto error0; + if (*stat == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + } + XFS_BTREE_STATS_INC(cur, alloc); + + /* Copy the root into a real block. */ + error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp); + if (error) + goto error0; + + memcpy(cblock, block, xfs_btree_block_len(cur)); + + be16_add_cpu(&block->bb_level, 1); + xfs_btree_set_numrecs(block, 1); + cur->bc_nlevels++; + cur->bc_ptrs[level + 1] = 1; + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); + + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, &nptr, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_ptrs(cur, pp, &nptr, 1); + + xfs_iroot_realloc(cur->bc_private.b.ip, + 1 - xfs_btree_get_numrecs(cblock), + cur->bc_private.b.whichfork); + + xfs_btree_setbuf(cur, level, cbp); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + + *logflags |= + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); + *stat = 1; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Allocate a new root block, fill it in. 
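+ * This is the external-root counterpart of xfs_btree_new_iroot()
+ * above: a freshly allocated block becomes the new root, and the
+ * two halves of the just-split old root become its only children.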
+ */ +STATIC int /* error */ +xfs_btree_new_root( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* one half of the old root block */ + struct xfs_buf *bp; /* buffer containing block */ + int error; /* error return value */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *nbp; /* new (root) buffer */ + struct xfs_btree_block *new; /* new (root) btree block */ + int nptr; /* new value for key index, 1 or 2 */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rptr; + union xfs_btree_ptr lptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + /* initialise our start point from the cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &rptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block. */ + error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp); + if (error) + goto error0; + + /* Set the root in the holding structure increasing the level by 1. */ + cur->bc_ops->set_root(cur, &lptr, 1); + + /* + * At the previous root level there are now two blocks: the old root, + * and the new block generated when it was split. We don't know which + * one the cursor is pointing at, so we set up variables "left" and + * "right" for each case. + */ + block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); + if (error) + goto error0; +#endif + + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* Our block is left, pick up the right block. */ + lbp = bp; + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + left = block; + error = xfs_btree_read_buf_block(cur, &rptr, + cur->bc_nlevels - 1, 0, &right, &rbp); + if (error) + goto error0; + bp = rbp; + nptr = 1; + } else { + /* Our block is right, pick up the left block. */ + rbp = bp; + xfs_btree_buf_to_ptr(cur, rbp, &rptr); + right = block; + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + error = xfs_btree_read_buf_block(cur, &lptr, + cur->bc_nlevels - 1, 0, &left, &lbp); + if (error) + goto error0; + bp = lbp; + nptr = 2; + } + /* Fill in the new block's btree header and log it. */ + xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); + xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); + ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && + !xfs_btree_ptr_is_null(cur, &rptr)); + + /* Fill in the key data in the new root. */ + if (xfs_btree_get_level(left) > 0) { + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 1, new), + xfs_btree_key_addr(cur, 1, left), 1); + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 2, new), + xfs_btree_key_addr(cur, 1, right), 1); + } else { + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 1, new), + xfs_btree_rec_addr(cur, 1, left)); + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 2, new), + xfs_btree_rec_addr(cur, 1, right)); + } + xfs_btree_log_keys(cur, nbp, 1, 2); + + /* Fill in the pointer data in the new root. 
*/ + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); + xfs_btree_log_ptrs(cur, nbp, 1, 2); + + /* Fix up the cursor. */ + xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); + cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_nlevels++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; +} + +STATIC int +xfs_btree_make_block_unfull( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* btree level */ + int numrecs,/* # of recs in block */ + int *oindex,/* old tree index */ + int *index, /* new tree index */ + union xfs_btree_ptr *nptr, /* new btree ptr */ + struct xfs_btree_cur **ncur, /* new btree cursor */ + union xfs_btree_rec *nrec, /* new record */ + int *stat) +{ + union xfs_btree_key key; /* new btree key value */ + int error = 0; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_inode *ip = cur->bc_private.b.ip; + + if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { + /* A root block that can be made bigger. */ + + xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + } else { + /* A root block that needs replacing */ + int logflags = 0; + + error = xfs_btree_new_iroot(cur, &logflags, stat); + if (error || *stat == 0) + return error; + + xfs_trans_log_inode(cur->bc_tp, ip, logflags); + } + + return 0; + } + + /* First, try shifting an entry to the right neighbor. */ + error = xfs_btree_rshift(cur, level, stat); + if (error || *stat) + return error; + + /* Next, try shifting an entry to the left neighbor. */ + error = xfs_btree_lshift(cur, level, stat); + if (error) + return error; + + if (*stat) { + *oindex = *index = cur->bc_ptrs[level]; + return 0; + } + + /* + * Next, try splitting the current block in half. + * + * If this works we have to re-set our variables because we + * could be in a different block now. + */ + error = xfs_btree_split(cur, level, nptr, &key, ncur, stat); + if (error || *stat == 0) + return error; + + + *index = cur->bc_ptrs[level]; + cur->bc_ops->init_rec_from_key(&key, nrec); + return 0; +} + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int +xfs_btree_insrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level to insert record at */ + union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ + union xfs_btree_rec *recp, /* i/o: record data inserted */ + struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer for block */ + union xfs_btree_key key; /* btree key */ + union xfs_btree_ptr nptr; /* new block ptr */ + struct xfs_btree_cur *ncur; /* new btree cursor */ + union xfs_btree_rec nrec; /* new record count */ + int optr; /* old key/record index */ + int ptr; /* key/record index */ + int numrecs;/* number of records */ + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp); + + ncur = NULL; + + /* + * If we have an external root pointer, and we've made it to the + * root level, allocate a new root block and we're done. 
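+ * (level can only reach bc_nlevels here when a split of the old
+ * top level handed xfs_btree_insert() a new block pointer to push
+ * up; inode-rooted trees never take this path, they grow through
+ * xfs_btree_new_iroot() from xfs_btree_make_block_unfull() instead.)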
+ */ + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level >= cur->bc_nlevels)) { + error = xfs_btree_new_root(cur, stat); + xfs_btree_set_ptr_null(cur, ptrp); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return error; + } + + /* If we're off the left edge, return failure. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Make a key out of the record data to be inserted, and save it. */ + cur->bc_ops->init_key_from_rec(&key, recp); + + optr = ptr; + + XFS_BTREE_STATS_INC(cur, insrec); + + /* Get pointers to the btree buffer and block. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; + + /* Check that the new entry is being inserted in the right place. */ + if (ptr <= numrecs) { + if (level == 0) { + ASSERT(cur->bc_ops->recs_inorder(cur, recp, + xfs_btree_rec_addr(cur, ptr, block))); + } else { + ASSERT(cur->bc_ops->keys_inorder(cur, &key, + xfs_btree_key_addr(cur, ptr, block))); + } + } +#endif + + /* + * If the block is full, we can't insert the new entry until we + * make the block un-full. + */ + xfs_btree_set_ptr_null(cur, &nptr); + if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { + error = xfs_btree_make_block_unfull(cur, level, numrecs, + &optr, &ptr, &nptr, &ncur, &nrec, stat); + if (error || *stat == 0) + goto error0; + } + + /* + * The current block may have changed if the block was + * previously full and we have just made space in it. + */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + return error; +#endif + + /* + * At this point we know there's room for our new entry in the block + * we're pointing at. + */ + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); + + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + + kp = xfs_btree_key_addr(cur, ptr, block); + pp = xfs_btree_ptr_addr(cur, ptr, block); + +#ifdef DEBUG + for (i = numrecs - ptr; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + return error; + } +#endif + + xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); + xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, ptrp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_keys(cur, kp, &key, 1); + xfs_btree_copy_ptrs(cur, pp, ptrp, 1); + numrecs++; + xfs_btree_set_numrecs(block, numrecs); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs); + xfs_btree_log_keys(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->keys_inorder(cur, kp, + xfs_btree_key_addr(cur, ptr + 1, block))); + } +#endif + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *rp; + + rp = xfs_btree_rec_addr(cur, ptr, block); + + xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_recs(cur, rp, recp, 1); + xfs_btree_set_numrecs(block, ++numrecs); + xfs_btree_log_recs(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->recs_inorder(cur, rp, + xfs_btree_rec_addr(cur, ptr + 1, block))); + } +#endif + } + + /* Log the new number of records in the btree header. 
*/ + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* If we inserted at the start of a block, update the parents' keys. */ + if (optr == 1) { + error = xfs_btree_updkey(cur, &key, level + 1); + if (error) + goto error0; + } + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, recp, + ptr, LASTREC_INSREC); + } + + /* + * Return the new block number, if any. + * If there is one, give back a record value and a cursor too. + */ + *ptrp = nptr; + if (!xfs_btree_ptr_is_null(cur, &nptr)) { + *recp = nrec; + *curp = ncur; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Insert the record at the point referenced by cur. + * + * A multi-level split of the tree on insert will invalidate the original + * cursor. All callers of this function should assume that the cursor is + * no longer valid and revalidate it. + */ +int +xfs_btree_insert( + struct xfs_btree_cur *cur, + int *stat) +{ + int error; /* error return value */ + int i; /* result value, 0 for failure */ + int level; /* current level number in btree */ + union xfs_btree_ptr nptr; /* new block number (split result) */ + struct xfs_btree_cur *ncur; /* new cursor (split result) */ + struct xfs_btree_cur *pcur; /* previous level's cursor */ + union xfs_btree_rec rec; /* record to insert */ + + level = 0; + ncur = NULL; + pcur = cur; + + xfs_btree_set_ptr_null(cur, &nptr); + cur->bc_ops->init_rec_from_cur(cur, &rec); + + /* + * Loop going up the tree, starting at the leaf level. + * Stop when we don't get a split block, that must mean that + * the insert is finished with this level. + */ + do { + /* + * Insert nrec/nptr into this level of the tree. + * Note if we fail, nptr will be null. + */ + error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i); + if (error) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + goto error0; + } + + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + level++; + + /* + * See if the cursor we just used is trash. + * Can't trash the caller's cursor, but otherwise we should + * if ncur is a new cursor or we're about to be done. + */ + if (pcur != cur && + (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { + /* Save the state from the cursor before we trash it */ + if (cur->bc_ops->update_cursor) + cur->bc_ops->update_cursor(pcur, cur); + cur->bc_nlevels = pcur->bc_nlevels; + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + /* If we got a new cursor, switch to it. */ + if (ncur) { + pcur = ncur; + ncur = NULL; + } + } while (!xfs_btree_ptr_is_null(cur, &nptr)); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Try to merge a non-leaf block back into the inode root. + * + * Note: the killroot names comes from the fact that we're effectively + * killing the old root block. But because we can't just delete the + * inode we have to copy the single block it was pointing to into the + * inode. 
+ */ +STATIC int +xfs_btree_kill_iroot( + struct xfs_btree_cur *cur) +{ + int whichfork = cur->bc_private.b.whichfork; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_block *block; + struct xfs_btree_block *cblock; + union xfs_btree_key *kp; + union xfs_btree_key *ckp; + union xfs_btree_ptr *pp; + union xfs_btree_ptr *cpp; + struct xfs_buf *cbp; + int level; + int index; + int numrecs; +#ifdef DEBUG + union xfs_btree_ptr ptr; + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_nlevels > 1); + + /* + * Don't deal with the root block needs to be a leaf case. + * We're just going to turn the thing back into extents anyway. + */ + level = cur->bc_nlevels - 1; + if (level == 1) + goto out0; + + /* + * Give up if the root has multiple children. + */ + block = xfs_btree_get_iroot(cur); + if (xfs_btree_get_numrecs(block) != 1) + goto out0; + + cblock = xfs_btree_get_block(cur, level - 1, &cbp); + numrecs = xfs_btree_get_numrecs(cblock); + + /* + * Only do this if the next level will fit. + * Then the data must be copied up to the inode, + * instead of freeing the root you free the next level. + */ + if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, killroot); + +#ifdef DEBUG + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); +#endif + + index = numrecs - cur->bc_ops->get_maxrecs(cur, level); + if (index) { + xfs_iroot_realloc(cur->bc_private.b.ip, index, + cur->bc_private.b.whichfork); + block = ifp->if_broot; + } + + be16_add_cpu(&block->bb_numrecs, index); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < numrecs; i++) { + int error; + + error = xfs_btree_check_ptr(cur, cpp, i, level - 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + } +#endif + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + cur->bc_ops->free_block(cur, cbp); + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level - 1] = NULL; + be16_add_cpu(&block->bb_level, -1); + xfs_trans_log_inode(cur->bc_tp, ip, + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + cur->bc_nlevels--; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Kill the current root node, and replace it with it's only child node. + */ +STATIC int +xfs_btree_kill_root( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + union xfs_btree_ptr *newroot) +{ + int error; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, killroot); + + /* + * Update the root pointer, decreasing the level by 1 and then + * free the old root. 
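+ * (The -1 passed to set_root below is the change in tree height,
+ * mirroring the +1 used when xfs_btree_new_root() grows the tree.)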
+ */ + cur->bc_ops->set_root(cur, newroot, -1); + + error = cur->bc_ops->free_block(cur, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level] = NULL; + cur->bc_ra[level] = 0; + cur->bc_nlevels--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +STATIC int +xfs_btree_dec_cursor( + struct xfs_btree_cur *cur, + int level, + int *stat) +{ + int error; + int i; + + if (level > 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + return error; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +/* + * Single level of the btree record deletion routine. + * Delete record pointed to by cur/level. + * Remove the record from its block then rebalance the tree. + * Return 0 for error, 1 for done, 2 to go on to the next level. + */ +STATIC int /* error */ +xfs_btree_delrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level removing record from */ + int *stat) /* fail/done/go-on */ +{ + struct xfs_btree_block *block; /* btree block */ + union xfs_btree_ptr cptr; /* current block ptr */ + struct xfs_buf *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop counter */ + union xfs_btree_key key; /* storage for keyp */ + union xfs_btree_key *keyp = &key; /* passed to the next level */ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs = 0; /* left record count */ + int ptr; /* key/record index */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + int rrecs = 0; /* right record count */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + int numrecs; /* temporary numrec count */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + tcur = NULL; + + /* Get the index of the entry being deleted, check for nothing there. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Get the buffer & block containing the record or key/ptr. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we're off the end of the block. */ + if (ptr > numrecs) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + XFS_BTREE_STATS_INC(cur, delrec); + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); + + /* Excise the entries being deleted. */ + if (level > 0) { + /* It's a nonleaf. 
operate on keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + + lkp = xfs_btree_key_addr(cur, ptr + 1, block); + lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); + +#ifdef DEBUG + for (i = 0; i < numrecs - ptr; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + if (ptr < numrecs) { + xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); + xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); + xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need to pass a + * key up to the next level (updkey). + */ + if (ptr == 1) + keyp = xfs_btree_key_addr(cur, 1, block); + } else { + /* It's a leaf. operate on records */ + if (ptr < numrecs) { + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, ptr + 1, block), + -1, numrecs - ptr); + xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + if (ptr == 1) { + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, block)); + keyp = &key; + } + } + + /* + * Decrement and log the number of entries in the block. + */ + xfs_btree_set_numrecs(block, --numrecs); + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, NULL, + ptr, LASTREC_DELREC); + } + + /* + * We're at the root level. First, shrink the root block in-memory. + * Try to get rid of the next level down. If we can't then there's + * nothing left to do. + */ + if (level == cur->bc_nlevels - 1) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + xfs_iroot_realloc(cur->bc_private.b.ip, -1, + cur->bc_private.b.whichfork); + + error = xfs_btree_kill_iroot(cur); + if (error) + goto error0; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + *stat = 1; + return 0; + } + + /* + * If this is the root level, and there's only one entry left, + * and it's NOT the leaf level, then we can get rid of this + * level. + */ + if (numrecs == 1 && level > 0) { + union xfs_btree_ptr *pp; + /* + * pp is still set to the first pointer in the block. + * Make it the new root of the btree. + */ + pp = xfs_btree_ptr_addr(cur, 1, block); + error = xfs_btree_kill_root(cur, bp, level, pp); + if (error) + goto error0; + } else if (level > 0) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + } + *stat = 1; + return 0; + } + + /* + * If we deleted the leftmost entry in the block, update the + * key values above us in the tree. + */ + if (ptr == 1) { + error = xfs_btree_updkey(cur, keyp, level + 1); + if (error) + goto error0; + } + + /* + * If the number of records remaining in the block is at least + * the minimum, we're done. + */ + if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + /* + * Otherwise, we have to move some records around to keep the + * tree balanced. Look at the left and right sibling blocks to + * see if we can re-balance by moving only one record. 
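+	 * A sibling that holds more than the minimum record count can donate
+	 * a single entry via lshift/rshift below; only if neither sibling can
+	 * spare one do we fall back to joining two blocks into one.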
+ */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * One child of root, need to get a chance to copy its contents + * into the root and delete it. Can't go up to next level, + * there's nothing to delete there. + */ + if (xfs_btree_ptr_is_null(cur, &rptr) && + xfs_btree_ptr_is_null(cur, &lptr) && + level == cur->bc_nlevels - 2) { + error = xfs_btree_kill_iroot(cur); + if (!error) + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || + !xfs_btree_ptr_is_null(cur, &lptr)); + + /* + * Duplicate the cursor so our btree manipulations here won't + * disrupt the next level up. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * If there's a right sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* + * Move the temp cursor to the last entry in the next block. + * Actually any entry but the first would suffice. + */ + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + /* Grab a pointer to the block. */ + right = xfs_btree_get_block(tcur, level, &rbp); +#ifdef DEBUG + error = xfs_btree_check_block(tcur, right, level, rbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); + + /* + * If right block is full enough so that removing one entry + * won't make it too empty, and left-shifting an entry out + * of right to us works, we're done. + */ + if (xfs_btree_get_numrecs(right) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_lshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference, and fix up the temp cursor to point + * to our block again (last record). + */ + rrecs = xfs_btree_get_numrecs(right); + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + } + + /* + * If there's a left sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + /* + * Move the temp cursor to the first entry in the + * previous block. + */ + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + /* Grab a pointer to the block. */ + left = xfs_btree_get_block(tcur, level, &lbp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. 
*/ + xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); + + /* + * If left block is full enough so that removing one entry + * won't make it too empty, and right-shifting an entry out + * of left to us works, we're done. + */ + if (xfs_btree_get_numrecs(left) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_rshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level == 0) + cur->bc_ptrs[0]++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference. + */ + lrecs = xfs_btree_get_numrecs(left); + } + + /* Delete the temp cursor, we're done with it. */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + /* If here, we need to do a join to keep the tree balanced. */ + ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); + + if (!xfs_btree_ptr_is_null(cur, &lptr) && + lrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "right" to be the starting block, + * "left" to be the left neighbor. + */ + rptr = cptr; + right = block; + rbp = bp; + error = xfs_btree_read_buf_block(cur, &lptr, level, + 0, &left, &lbp); + if (error) + goto error0; + + /* + * If that won't work, see if we can join with the right neighbor block. + */ + } else if (!xfs_btree_ptr_is_null(cur, &rptr) && + rrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "left" to be the starting block, + * "right" to be the right neighbor. + */ + lptr = cptr; + left = block; + lbp = bp; + error = xfs_btree_read_buf_block(cur, &rptr, level, + 0, &right, &rbp); + if (error) + goto error0; + + /* + * Otherwise, we can't fix the imbalance. + * Just return. This is probably a logic error, but it's not fatal. + */ + } else { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + rrecs = xfs_btree_get_numrecs(right); + lrecs = xfs_btree_get_numrecs(left); + + /* + * We're now going to join "left" and "right" by moving all the stuff + * in "right" to "left" and deleting "right". + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs + 1, left); + lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + for (i = 1; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_keys(cur, lkp, rkp, rrecs); + xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); + + xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); + } else { + /* It's a leaf. Move records. 
*/ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, rrecs); + xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); + } + + XFS_BTREE_STATS_INC(cur, join); + + /* + * Fix up the number of records and right block pointer in the + * surviving block, and log it. + */ + xfs_btree_set_numrecs(left, lrecs + rrecs); + xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB), + xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* If there is a right sibling, point it to the remaining block. */ + xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &cptr)) { + error = xfs_btree_read_buf_block(cur, &cptr, level, + 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + + /* Free the deleted block. */ + error = cur->bc_ops->free_block(cur, rbp); + if (error) + goto error0; + XFS_BTREE_STATS_INC(cur, free); + + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. + */ + if (bp != lbp) { + cur->bc_bufs[level] = lbp; + cur->bc_ptrs[level] += lrecs; + cur->bc_ra[level] = 0; + } + /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || + (level + 1 < cur->bc_nlevels)) { + error = xfs_btree_increment(cur, level + 1, &i); + if (error) + goto error0; + } + + /* + * Readjust the ptr at this level if it's not a leaf, since it's + * still pointing at the deletion point, which makes the cursor + * inconsistent. If this makes the ptr 0, the caller fixes it up. + * We can't use decrement because it would change the next level up. + */ + if (level > 0) + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + /* Return value means the next level up has something to do. */ + *stat = 2; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (tcur) + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_btree_delete( + struct xfs_btree_cur *cur, + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int level; + int i; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* + * Go up the tree, starting at leaf level. + * + * If 2 is returned then a join was done; go to the next level. + * Otherwise we are done. + */ + for (level = 0, i = 2; i == 2; level++) { + error = xfs_btree_delrec(cur, level, &i); + if (error) + goto error0; + } + + if (i == 0) { + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + goto error0; + break; + } + } + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Get the data from the pointed-to record. 
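+ * The cursor must refer to a leaf entry (level 0); if bc_ptrs[0] is off
+ * either end of the block, *stat is set to 0 and no record is returned.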
+ */ +int /* error */ +xfs_btree_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_rec **recp, /* output: btree record */ + int *stat) /* output: success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer pointer */ + int ptr; /* record number */ +#ifdef DEBUG + int error; /* error return value */ +#endif + + ptr = cur->bc_ptrs[0]; + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + return error; +#endif + + /* + * Off the right end or left end, return failure. + */ + if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { + *stat = 0; + return 0; + } + + /* + * Point to the record and extract its data. + */ + *recp = xfs_btree_rec_addr(cur, ptr, block); + *stat = 1; + return 0; +} diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h new file mode 100644 index 00000000..82fafc66 --- /dev/null +++ b/fs/xfs/xfs_btree.h @@ -0,0 +1,455 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BTREE_H__ +#define __XFS_BTREE_H__ + +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +extern kmem_zone_t *xfs_btree_cur_zone; + +/* + * This nonsense is to make -wlint happy. + */ +#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) +#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) +#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) + +#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) +#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) +#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) +#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) + +/* + * Generic btree header. + * + * This is a combination of the actual format used on disk for short and long + * format btrees. The first three fields are shared by both format, but + * the pointers are different and should be used with care. + * + * To get the size of the actual short or long form headers please use + * the size macros below. Never use sizeof(xfs_btree_block). + */ +struct xfs_btree_block { + __be32 bb_magic; /* magic number for block type */ + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ + union { + struct { + __be32 bb_leftsib; + __be32 bb_rightsib; + } s; /* short form pointers */ + struct { + __be64 bb_leftsib; + __be64 bb_rightsib; + } l; /* long form pointers */ + } bb_u; /* rest */ +}; + +#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ +#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ + + +/* + * Generic key, ptr and record wrapper structures. + * + * These are disk format structures, and are converted where necessary + * by the btree specific code that needs to interpret them. 
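+ * They are stored big-endian on disk, so users go through the endian
+ * conversion helpers rather than reading the raw fields directly.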
+ */ +union xfs_btree_ptr { + __be32 s; /* short form ptr */ + __be64 l; /* long form ptr */ +}; + +union xfs_btree_key { + xfs_bmbt_key_t bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + xfs_inobt_key_t inobt; +}; + +union xfs_btree_rec { + xfs_bmbt_rec_t bmbt; + xfs_bmdr_rec_t bmbr; /* bmbt root block */ + xfs_alloc_rec_t alloc; + xfs_inobt_rec_t inobt; +}; + +/* + * For logging record fields. + */ +#define XFS_BB_MAGIC 0x01 +#define XFS_BB_LEVEL 0x02 +#define XFS_BB_NUMRECS 0x04 +#define XFS_BB_LEFTSIB 0x08 +#define XFS_BB_RIGHTSIB 0x10 +#define XFS_BB_NUM_BITS 5 +#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) + +/* + * Magic numbers for btree blocks. + */ +extern const __uint32_t xfs_magics[]; + +/* + * Generic stats interface + */ +#define __XFS_BTREE_STATS_INC(type, stat) \ + XFS_STATS_INC(xs_ ## type ## _2_ ## stat) +#define XFS_BTREE_STATS_INC(cur, stat) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) + +#define __XFS_BTREE_STATS_ADD(type, stat, val) \ + XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) + +#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ + +struct xfs_btree_ops { + /* size of the key and record structures */ + size_t key_len; + size_t rec_len; + + /* cursor operations */ + struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); + void (*update_cursor)(struct xfs_btree_cur *src, + struct xfs_btree_cur *dst); + + /* update btree root pointer */ + void (*set_root)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, int level_change); + + /* block allocation / freeing */ + int (*alloc_block)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int length, int *stat); + int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); + + /* update last record information */ + void (*update_lastrec)(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, int reason); + + /* records in block/level */ + int (*get_minrecs)(struct xfs_btree_cur *cur, int level); + int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); + + /* records on disk. Matter for the root in inode case. 
*/ + int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); + + /* init values of btree structures */ + void (*init_key_from_rec)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_key)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_rec *rec); + void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); + + /* difference between key value and cursor value */ + __int64_t (*key_diff)(struct xfs_btree_cur *cur, + union xfs_btree_key *key); + +#ifdef DEBUG + /* check that k1 is lower than k2 */ + int (*keys_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2); + + /* check that r1 is lower than r2 */ + int (*recs_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2); +#endif + + /* btree tracing */ +#ifdef XFS_BTREE_TRACE + void (*trace_enter)(struct xfs_btree_cur *, const char *, + char *, int, int, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t); + void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *, + __uint64_t *, __uint64_t *); + void (*trace_key)(struct xfs_btree_cur *, + union xfs_btree_key *, __uint64_t *, + __uint64_t *); + void (*trace_record)(struct xfs_btree_cur *, + union xfs_btree_rec *, __uint64_t *, + __uint64_t *, __uint64_t *); +#endif +}; + +/* + * Reasons for the update_lastrec method to be called. + */ +#define LASTREC_UPDATE 0 +#define LASTREC_INSREC 1 +#define LASTREC_DELREC 2 + + +/* + * Btree cursor structure. + * This collects all information needed by the btree code in one place. + */ +typedef struct xfs_btree_cur +{ + struct xfs_trans *bc_tp; /* transaction we're in, if any */ + struct xfs_mount *bc_mp; /* file system mount struct */ + const struct xfs_btree_ops *bc_ops; + uint bc_flags; /* btree features - below */ + union { + xfs_alloc_rec_incore_t a; + xfs_bmbt_irec_t b; + xfs_inobt_rec_incore_t i; + } bc_rec; /* current insert/search record value */ + struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ + int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ + __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ +#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ +#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ + __uint8_t bc_nlevels; /* number of levels in the tree */ + __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ + xfs_btnum_t bc_btnum; /* identifies which btree type */ + union { + struct { /* needed for BNO, CNT, INO */ + struct xfs_buf *agbp; /* agf/agi buffer pointer */ + xfs_agnumber_t agno; /* ag number */ + } a; + struct { /* needed for BMAP */ + struct xfs_inode *ip; /* pointer to our inode */ + struct xfs_bmap_free *flist; /* list to free after */ + xfs_fsblock_t firstblock; /* 1st blk allocated */ + int allocated; /* count of alloced */ + short forksize; /* fork's inode space */ + char whichfork; /* data or attr fork */ + char flags; /* flags */ +#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ + } b; + } bc_private; /* per-btree type data */ +} xfs_btree_cur_t; + +/* cursor flags */ +#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ +#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ +#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ + + +#define XFS_BTREE_NOERROR 0 +#define 
XFS_BTREE_ERROR 1 + +/* + * Convert from buffer to btree block header. + */ +#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp)) + + +/* + * Check that block header is ok. + */ +int +xfs_btree_check_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block, if any */ + +/* + * Check that (long) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_dfsbno_t ptr, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error); /* del because of error */ + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur);/* output cursor */ + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +struct xfs_buf * /* buffer for fsbno */ +xfs_btree_get_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +struct xfs_buf * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Check for the cursor referring to the last block at the given level. + */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to check */ + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets,/* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last); /* output: last byte offset */ + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. + */ +int /* error */ +xfs_btree_read_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval);/* ref count value for buffer */ + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +void /* error */ +xfs_btree_reada_bufl( + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count); /* count of filesystem blocks */ + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing. 
+ */ +void /* error */ +xfs_btree_reada_bufs( + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count); /* count of filesystem blocks */ + + +/* + * Common btree core entry points. + */ +int xfs_btree_increment(struct xfs_btree_cur *, int, int *); +int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); +int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); +int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); +int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); +int xfs_btree_insert(struct xfs_btree_cur *, int *); +int xfs_btree_delete(struct xfs_btree_cur *, int *); +int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); + +/* + * Internal btree helpers also used by xfs_bmap.c. + */ +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); + +/* + * Helpers. + */ +static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_numrecs); +} + +static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, + __uint16_t numrecs) +{ + block->bb_numrecs = cpu_to_be16(numrecs); +} + +static inline int xfs_btree_get_level(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_level); +} + + +/* + * Min and max functions for extlen, agblock, fileoff, and filblks types. + */ +#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) +#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) +#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) +#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) +#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) +#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) + +#define XFS_FSB_SANITY_CHECK(mp,fsb) \ + (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) + +#endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c new file mode 100644 index 00000000..44ff942a --- /dev/null +++ b/fs/xfs/xfs_btree_trace.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" + +STATIC void +xfs_btree_trace_ptr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr ptr, + __psunsigned_t *high, + __psunsigned_t *low) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + __u64 val = be64_to_cpu(ptr.l); + *high = val >> 32; + *low = (int)val; + } else { + *high = 0; + *low = be32_to_cpu(ptr.s); + } +} + +/* + * Add a trace buffer entry for arguments, for a buffer & 1 integer arg. + */ +void +xfs_btree_trace_argbi( + const char *func, + struct xfs_btree_cur *cur, + struct xfs_buf *b, + int i, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI, + line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0, + 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for a buffer & 2 integer args. + */ +void +xfs_btree_trace_argbii( + const char *func, + struct xfs_btree_cur *cur, + struct xfs_buf *b, + int i0, + int i1, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII, + line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0, + 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for 3 block-length args + * and an integer arg. + */ +void +xfs_btree_trace_argfffi( + const char *func, + struct xfs_btree_cur *cur, + xfs_dfiloff_t o, + xfs_dfsbno_t b, + xfs_dfilblks_t i, + int j, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI, + line, + o >> 32, (int)o, + b >> 32, (int)b, + i >> 32, (int)i, + (int)j, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for one integer arg. + */ +void +xfs_btree_trace_argi( + const char *func, + struct xfs_btree_cur *cur, + int i, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI, + line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, fsblock, key. + */ +void +xfs_btree_trace_argipk( + const char *func, + struct xfs_btree_cur *cur, + int i, + union xfs_btree_ptr ptr, + union xfs_btree_key *key, + int line) +{ + __psunsigned_t high, low; + __uint64_t l0, l1; + + xfs_btree_trace_ptr(cur, ptr, &high, &low); + cur->bc_ops->trace_key(cur, key, &l0, &l1); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK, + line, i, high, low, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, fsblock, rec. + */ +void +xfs_btree_trace_argipr( + const char *func, + struct xfs_btree_cur *cur, + int i, + union xfs_btree_ptr ptr, + union xfs_btree_rec *rec, + int line) +{ + __psunsigned_t high, low; + __uint64_t l0, l1, l2; + + xfs_btree_trace_ptr(cur, ptr, &high, &low); + cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR, + line, i, + high, low, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + l2 >> 32, (int)l2, + 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, key. 
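+ * The key is flattened into two 64-bit words by the per-btree trace_key
+ * callback before being packed into the trace entry.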
+ */ +void +xfs_btree_trace_argik( + const char *func, + struct xfs_btree_cur *cur, + int i, + union xfs_btree_key *key, + int line) +{ + __uint64_t l0, l1; + + cur->bc_ops->trace_key(cur, key, &l0, &l1); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK, + line, i, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + 0, 0, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for record. + */ +void +xfs_btree_trace_argr( + const char *func, + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + int line) +{ + __uint64_t l0, l1, l2; + + cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR, + line, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + l2 >> 32, (int)l2, + 0, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for the cursor/operation. + */ +void +xfs_btree_trace_cursor( + const char *func, + struct xfs_btree_cur *cur, + int type, + int line) +{ + __uint32_t s0; + __uint64_t l0, l1; + char *s; + + switch (type) { + case XBT_ARGS: + s = "args"; + break; + case XBT_ENTRY: + s = "entry"; + break; + case XBT_ERROR: + s = "error"; + break; + case XBT_EXIT: + s = "exit"; + break; + default: + s = "unknown"; + break; + } + + cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1); + cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line, + s0, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + (__psunsigned_t)cur->bc_bufs[0], + (__psunsigned_t)cur->bc_bufs[1], + (__psunsigned_t)cur->bc_bufs[2], + (__psunsigned_t)cur->bc_bufs[3], + (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1], + (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]); +} diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h new file mode 100644 index 00000000..2d8a3098 --- /dev/null +++ b/fs/xfs/xfs_btree_trace.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BTREE_TRACE_H__ +#define __XFS_BTREE_TRACE_H__ + +struct xfs_btree_cur; +struct xfs_buf; + + +/* + * Trace hooks. + * i,j = integer (32 bit) + * b = btree block buffer (xfs_buf_t) + * p = btree ptr + * r = btree record + * k = btree key + */ + +#ifdef XFS_BTREE_TRACE + +/* + * Trace buffer entry types. + */ +#define XFS_BTREE_KTRACE_ARGBI 1 +#define XFS_BTREE_KTRACE_ARGBII 2 +#define XFS_BTREE_KTRACE_ARGFFFI 3 +#define XFS_BTREE_KTRACE_ARGI 4 +#define XFS_BTREE_KTRACE_ARGIPK 5 +#define XFS_BTREE_KTRACE_ARGIPR 6 +#define XFS_BTREE_KTRACE_ARGIK 7 +#define XFS_BTREE_KTRACE_ARGR 8 +#define XFS_BTREE_KTRACE_CUR 9 + +/* + * Sub-types for cursor traces. 
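+ * These select the label ("args", "entry", "error", "exit") recorded by
+ * xfs_btree_trace_cursor().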
+ */ +#define XBT_ARGS 0 +#define XBT_ENTRY 1 +#define XBT_ERROR 2 +#define XBT_EXIT 3 + +void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *, + struct xfs_buf *, int, int); +void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *, + struct xfs_buf *, int, int, int); +void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int); +void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int, + union xfs_btree_ptr, union xfs_btree_key *, int); +void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int, + union xfs_btree_ptr, union xfs_btree_rec *, int); +void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int, + union xfs_btree_key *, int); +void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *, + union xfs_btree_rec *, int); +void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int); + +#define XFS_BTREE_TRACE_ARGBI(c, b, i) \ + xfs_btree_trace_argbi(__func__, c, b, i, __LINE__) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \ + xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__) +#define XFS_BTREE_TRACE_ARGI(c, i) \ + xfs_btree_trace_argi(__func__, c, i, __LINE__) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \ + xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) \ + xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) \ + xfs_btree_trace_argik(__func__, c, i, k, __LINE__) +#define XFS_BTREE_TRACE_ARGR(c, r) \ + xfs_btree_trace_argr(__func__, c, r, __LINE__) +#define XFS_BTREE_TRACE_CURSOR(c, t) \ + xfs_btree_trace_cursor(__func__, c, t, __LINE__) +#else +#define XFS_BTREE_TRACE_ARGBI(c, b, i) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) +#define XFS_BTREE_TRACE_ARGI(c, i) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) +#define XFS_BTREE_TRACE_ARGR(c, r) +#define XFS_BTREE_TRACE_CURSOR(c, t) +#endif /* XFS_BTREE_TRACE */ + +#endif /* __XFS_BTREE_TRACE_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c new file mode 100644 index 00000000..7888a756 --- /dev/null +++ b/fs/xfs/xfs_buf_item.c @@ -0,0 +1,1064 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" + + +kmem_zone_t *xfs_buf_item_zone; + +static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_buf_log_item, bli_item); +} + + +#ifdef XFS_TRANS_DEBUG +/* + * This function uses an alternate strategy for tracking the bytes + * that the user requests to be logged. This can then be used + * in conjunction with the bli_orig array in the buf log item to + * catch bugs in our callers' code. + * + * We also double check the bits set in xfs_buf_item_log using a + * simple algorithm to check that every byte is accounted for. + */ +STATIC void +xfs_buf_item_log_debug( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + uint x; + uint byte; + uint nbytes; + uint chunk_num; + uint word_num; + uint bit_num; + uint bit_set; + uint *wordp; + + ASSERT(bip->bli_logged != NULL); + byte = first; + nbytes = last - first + 1; + bfset(bip->bli_logged, first, nbytes); + for (x = 0; x < nbytes; x++) { + chunk_num = byte >> XFS_BLF_SHIFT; + word_num = chunk_num >> BIT_TO_WORD_SHIFT; + bit_num = chunk_num & (NBWORD - 1); + wordp = &(bip->bli_format.blf_data_map[word_num]); + bit_set = *wordp & (1 << bit_num); + ASSERT(bit_set); + byte++; + } +} + +/* + * This function is called when we flush something into a buffer without + * logging it. This happens for things like inodes which are logged + * separately from the buffer. + */ +void +xfs_buf_item_flush_log_debug( + xfs_buf_t *bp, + uint first, + uint last) +{ + xfs_buf_log_item_t *bip; + uint nbytes; + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) { + return; + } + + ASSERT(bip->bli_logged != NULL); + nbytes = last - first + 1; + bfset(bip->bli_logged, first, nbytes); +} + +/* + * This function is called to verify that our callers have logged + * all the bytes that they changed. + * + * It does this by comparing the original copy of the buffer stored in + * the buf log item's bli_orig array to the current copy of the buffer + * and ensuring that all bytes which mismatch are set in the bli_logged + * array of the buf log item. + */ +STATIC void +xfs_buf_item_log_check( + xfs_buf_log_item_t *bip) +{ + char *orig; + char *buffer; + int x; + xfs_buf_t *bp; + + ASSERT(bip->bli_orig != NULL); + ASSERT(bip->bli_logged != NULL); + + bp = bip->bli_buf; + ASSERT(XFS_BUF_COUNT(bp) > 0); + ASSERT(XFS_BUF_PTR(bp) != NULL); + orig = bip->bli_orig; + buffer = XFS_BUF_PTR(bp); + for (x = 0; x < XFS_BUF_COUNT(bp); x++) { + if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { + xfs_emerg(bp->b_mount, + "%s: bip %x buffer %x orig %x index %d", + __func__, bip, bp, orig, x); + ASSERT(0); + } + } +} +#else +#define xfs_buf_item_log_debug(x,y,z) +#define xfs_buf_item_log_check(x) +#endif + +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); + +/* + * This returns the number of log iovecs needed to log the + * given buf log item. 
+ * + * It calculates this as 1 iovec for the buf log format structure + * and 1 for each stretch of non-contiguous chunks to be logged. + * Contiguous chunks are logged in a single iovec. + * + * If the XFS_BLI_STALE flag has been set, then log nothing. + */ +STATIC uint +xfs_buf_item_size( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint nvecs; + int next_bit; + int last_bit; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_size_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + return 1; + } + + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + nvecs = 1; + last_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0); + ASSERT(last_bit != -1); + nvecs++; + while (last_bit != -1) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, + last_bit + 1); + /* + * If we run out of bits, leave the loop, + * else if we find a new set of bits bump the number of vecs, + * else keep scanning the current set of bits. + */ + if (next_bit == -1) { + last_bit = -1; + } else if (next_bit != last_bit + 1) { + last_bit = next_bit; + nvecs++; + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { + last_bit = next_bit; + nvecs++; + } else { + last_bit++; + } + } + + trace_xfs_buf_item_size(bip); + return nvecs; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. + */ +STATIC void +xfs_buf_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint base_size; + uint nvecs; + int first_bit; + int last_bit; + int next_bit; + uint nbits; + uint buffer_offset; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + + /* + * The size of the base structure is the size of the + * declared structure plus the space for the extra words + * of the bitmap. We subtract one from the map size, because + * the first element of the bitmap is accounted for in the + * size of the base structure. + */ + base_size = + (uint)(sizeof(xfs_buf_log_format_t) + + ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); + vecp->i_addr = &bip->bli_format; + vecp->i_len = base_size; + vecp->i_type = XLOG_REG_TYPE_BFORMAT; + vecp++; + nvecs = 1; + + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. 
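+	 * xfs_log_item_in_current_chkpt() below is what tells us whether the
+	 * allocation is still sitting in the running checkpoint; only once it
+	 * is past that point do we set XFS_BLF_INODE_BUF in the format flags.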
+ */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(lip))) + bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_format_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + bip->bli_format.blf_size = nvecs; + return; + } + + /* + * Fill in an iovec for each set of contiguous chunks. + */ + first_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0); + ASSERT(first_bit != -1); + last_bit = first_bit; + nbits = 1; + for (;;) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, + (uint)last_bit + 1); + /* + * If we run out of bits fill in the last iovec and get + * out of the loop. + * Else if we start a new set of bits then fill in the + * iovec for the series we were looking at and start + * counting the bits in the new one. + * Else we're still in the same set of bits so just + * keep counting and scanning. + */ + if (next_bit == -1) { + buffer_offset = first_bit * XFS_BLF_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLF_CHUNK; + vecp->i_type = XLOG_REG_TYPE_BCHUNK; + nvecs++; + break; + } else if (next_bit != last_bit + 1) { + buffer_offset = first_bit * XFS_BLF_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLF_CHUNK; + vecp->i_type = XLOG_REG_TYPE_BCHUNK; + nvecs++; + vecp++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != + (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + XFS_BLF_CHUNK)) { + buffer_offset = first_bit * XFS_BLF_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLF_CHUNK; + vecp->i_type = XLOG_REG_TYPE_BCHUNK; +/* You would think we need to bump the nvecs here too, but we do not + * this number is used by recovery, and it gets confused by the boundary + * split here + * nvecs++; + */ + vecp++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else { + last_bit++; + nbits++; + } + } + bip->bli_format.blf_size = nvecs; + + /* + * Check to make sure everything is consistent. + */ + trace_xfs_buf_item_format(bip); + xfs_buf_item_log_check(bip); +} + +/* + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. 
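+ * The buffer's own b_pin_count is bumped alongside the bli reference; the
+ * matching decrement and waiter wake-up happen in xfs_buf_item_unpin().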
+ */ +STATIC void +xfs_buf_item_pin( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + + ASSERT(XFS_BUF_ISBUSY(bip->bli_buf)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + + trace_xfs_buf_item_pin(bip); + + atomic_inc(&bip->bli_refcount); + atomic_inc(&bip->bli_buf->b_pin_count); +} + +/* + * This is called to unpin the buffer associated with the buf log + * item which was previously pinned with a call to xfs_buf_item_pin(). + * + * Also drop the reference to the buf item for the current transaction. + * If the XFS_BLI_STALE flag is set and we are the last reference, + * then free up the buf log item and unlock the buffer. + * + * If the remove flag is set we are called from uncommit in the + * forced-shutdown path. If that is true and the reference count on + * the log item is going to drop to zero we need to free the item's + * descriptor in the transaction. + */ +STATIC void +xfs_buf_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + xfs_buf_t *bp = bip->bli_buf; + struct xfs_ail *ailp = lip->li_ailp; + int stale = bip->bli_flags & XFS_BLI_STALE; + int freed; + + ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_buf_item_unpin(bip); + + freed = atomic_dec_and_test(&bip->bli_refcount); + + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + + if (freed && stale) { + ASSERT(bip->bli_flags & XFS_BLI_STALE); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + + trace_xfs_buf_item_unpin_stale(bip); + + if (remove) { + /* + * If we are in a transaction context, we have to + * remove the log item from the transaction as we are + * about to release our reference to the buffer. If we + * don't, the unlock that occurs later in + * xfs_trans_uncommit() will try to reference the + * buffer which we no longer have a hold on. + */ + if (lip->li_desc) + xfs_trans_del_item(lip); + + /* + * Since the transaction no longer refers to the buffer, + * the buffer should no longer refer to the transaction. + */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + } + + /* + * If we get called here because of an IO error, we may + * or may not have the item on the AIL. xfs_trans_ail_delete() + * will take care of that situation. + * xfs_trans_ail_delete() drops the AIL lock. + */ + if (bip->bli_flags & XFS_BLI_STALE_INODE) { + xfs_buf_do_callbacks(bp); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + } else { + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); + xfs_buf_item_relse(bp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); + } + xfs_buf_relse(bp); + } +} + +/* + * This is called to attempt to lock the buffer associated with this + * buf log item. Don't sleep on the buffer lock. If we can't get + * the lock right away, return 0. If we can get the lock, take a + * reference to the buffer. If this is a delayed write buffer that + * needs AIL help to be written back, invoke the pushbuf routine + * rather than the normal success path. 
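+ * The return value is one of XFS_ITEM_PINNED, XFS_ITEM_LOCKED,
+ * XFS_ITEM_PUSHBUF or XFS_ITEM_SUCCESS.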
+ */ +STATIC uint +xfs_buf_item_trylock( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + + if (XFS_BUF_ISPINNED(bp)) + return XFS_ITEM_PINNED; + if (!XFS_BUF_CPSEMA(bp)) + return XFS_ITEM_LOCKED; + + /* take a reference to the buffer. */ + XFS_BUF_HOLD(bp); + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + trace_xfs_buf_item_trylock(bip); + if (XFS_BUF_ISDELAYWRITE(bp)) + return XFS_ITEM_PUSHBUF; + return XFS_ITEM_SUCCESS; +} + +/* + * Release the buffer associated with the buf log item. If there is no dirty + * logged data associated with the buffer recorded in the buf log item, then + * free the buf log item and remove the reference to it in the buffer. + * + * This call ignores the recursion count. It is only called when the buffer + * should REALLY be unlocked, regardless of the recursion count. + * + * We unconditionally drop the transaction's reference to the log item. If the + * item was logged, then another reference was taken when it was pinned, so we + * can safely drop the transaction reference now. This also allows us to avoid + * potential races with the unpin code freeing the bli by not referencing the + * bli after we've dropped the reference count. + * + * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item + * if necessary but do not unlock the buffer. This is for support of + * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't + * free the item. + */ +STATIC void +xfs_buf_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + int aborted; + uint hold; + + /* Clear the buffer's association with this transaction. */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + + /* + * If this is a transaction abort, don't return early. Instead, allow + * the brelse to happen. Normally it would be done for stale + * (cancelled) buffers at unpin time, but we'll never go through the + * pin/unpin cycle if we abort inside commit. + */ + aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; + + /* + * Before possibly freeing the buf item, determine if we should + * release the buffer at the end of this routine. + */ + hold = bip->bli_flags & XFS_BLI_HOLD; + + /* Clear the per transaction state. */ + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); + + /* + * If the buf item is marked stale, then don't do anything. We'll + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { + atomic_dec(&bip->bli_refcount); + return; + } + } + + trace_xfs_buf_item_unlock(bip); + + /* + * If the buf item isn't tracking any data, free it, otherwise drop the + * reference we hold to it. + */ + if (xfs_bitmap_empty(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size)) + xfs_buf_item_relse(bp); + else + atomic_dec(&bip->bli_refcount); + + if (!hold) + xfs_buf_relse(bp); +} + +/* + * This is called to find out where the oldest active copy of the + * buf log item in the on disk log resides now that the last log + * write of it completed at the given lsn. + * We always re-log all the dirty data in a buffer, so usually the + * latest copy in the on disk log is the only one that matters. For + * those cases we simply return the given lsn. + * + * The one exception to this is for buffers full of newly allocated + * inodes. 
These buffers are only relogged with the XFS_BLI_INODE_BUF + * flag set, indicating that only the di_next_unlinked fields from the + * inodes in the buffers will be replayed during recovery. If the + * original newly allocated inode images have not yet been flushed + * when the buffer is so relogged, then we need to make sure that we + * keep the old images in the 'active' portion of the log. We do this + * by returning the original lsn of that transaction here rather than + * the current one. + */ +STATIC xfs_lsn_t +xfs_buf_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + + trace_xfs_buf_item_committed(bip); + + if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0) + return lip->li_lsn; + return lsn; +} + +/* + * The buffer is locked, but is not a delayed write buffer. This happens + * if we race with IO completion and hence we don't want to try to write it + * again. Just release the buffer. + */ +STATIC void +xfs_buf_item_push( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); + + trace_xfs_buf_item_push(bip); + + xfs_buf_relse(bp); +} + +/* + * The buffer is locked and is a delayed write buffer. Promote the buffer + * in the delayed write queue as the caller knows that they must invoke + * the xfsbufd to get this buffer written. We have to unlock the buffer + * to allow the xfsbufd to write it, too. + */ +STATIC bool +xfs_buf_item_pushbuf( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(XFS_BUF_ISDELAYWRITE(bp)); + + trace_xfs_buf_item_pushbuf(bip); + + xfs_buf_delwri_promote(bp); + xfs_buf_relse(bp); + return true; +} + +STATIC void +xfs_buf_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +static struct xfs_item_ops xfs_buf_item_ops = { + .iop_size = xfs_buf_item_size, + .iop_format = xfs_buf_item_format, + .iop_pin = xfs_buf_item_pin, + .iop_unpin = xfs_buf_item_unpin, + .iop_trylock = xfs_buf_item_trylock, + .iop_unlock = xfs_buf_item_unlock, + .iop_committed = xfs_buf_item_committed, + .iop_push = xfs_buf_item_push, + .iop_pushbuf = xfs_buf_item_pushbuf, + .iop_committing = xfs_buf_item_committing +}; + + +/* + * Allocate a new buf log item to go with the given buffer. + * Set the buffer's b_fsprivate field to point to the new + * buf log item. If there are other item's attached to the + * buffer (see xfs_buf_attach_iodone() below), then put the + * buf log item at the front. + */ +void +xfs_buf_item_init( + xfs_buf_t *bp, + xfs_mount_t *mp) +{ + xfs_log_item_t *lip; + xfs_buf_log_item_t *bip; + int chunks; + int map_size; + + /* + * Check to see if there is already a buf log item for + * this buffer. If there is, it is guaranteed to be + * the first. If we do already have one, there is + * nothing to do here so return. + */ + ASSERT(bp->b_target->bt_mount == mp); + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (lip->li_type == XFS_LI_BUF) { + return; + } + } + + /* + * chunks is the number of XFS_BLF_CHUNK size pieces + * the buffer can be divided into. Make sure not to + * truncate any pieces. map_size is the size of the + * bitmap needed to describe the chunks of the buffer. 
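+	 * Rounding up by XFS_BLF_CHUNK - 1 before the shift below is what
+	 * keeps a partial trailing chunk from being dropped from the bitmap.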
+ */ + chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); + map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); + + bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, + KM_SLEEP); + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); + bip->bli_buf = bp; + xfs_buf_hold(bp); + bip->bli_format.blf_type = XFS_LI_BUF; + bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); + bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); + bip->bli_format.blf_map_size = map_size; + +#ifdef XFS_TRANS_DEBUG + /* + * Allocate the arrays for tracking what needs to be logged + * and what our callers request to be logged. bli_orig + * holds a copy of the original, clean buffer for comparison + * against, and bli_logged keeps a 1 bit flag per byte in + * the buffer to indicate which bytes the callers have asked + * to have logged. + */ + bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); + memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); + bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); +#endif + + /* + * Put the buf item into the list of items attached to the + * buffer at the front. + */ + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + bip->bli_item.li_bio_list = + XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + } + XFS_BUF_SET_FSPRIVATE(bp, bip); +} + + +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + uint first_bit; + uint last_bit; + uint bits_to_set; + uint bits_set; + uint word_num; + uint *wordp; + uint bit; + uint end_bit; + uint mask; + + /* + * Mark the item as having some dirty data for + * quick reference in xfs_buf_item_dirty. + */ + bip->bli_flags |= XFS_BLI_DIRTY; + + /* + * Convert byte offsets to bit numbers. + */ + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; + + /* + * Calculate the total number of bits to be set. + */ + bits_to_set = last_bit - first_bit + 1; + + /* + * Get a pointer to the first word in the bitmap + * to set a bit in. + */ + word_num = first_bit >> BIT_TO_WORD_SHIFT; + wordp = &(bip->bli_format.blf_data_map[word_num]); + + /* + * Calculate the starting bit in the first word. + */ + bit = first_bit & (uint)(NBWORD - 1); + + /* + * First set any bits in the first word of our range. + * If it starts at bit 0 of the word, it will be + * set below rather than here. That is what the variable + * bit tells us. The variable bits_set tracks the number + * of bits that have been set so far. End_bit is the number + * of the last bit to be set in this word plus one. + */ + if (bit) { + end_bit = MIN(bit + bits_to_set, (uint)NBWORD); + mask = ((1 << (end_bit - bit)) - 1) << bit; + *wordp |= mask; + wordp++; + bits_set = end_bit - bit; + } else { + bits_set = 0; + } + + /* + * Now set bits a whole word at a time that are between + * first_bit and last_bit. + */ + while ((bits_to_set - bits_set) >= NBWORD) { + *wordp |= 0xffffffff; + bits_set += NBWORD; + wordp++; + } + + /* + * Finally, set any bits left to be set in one last partial word. + */ + end_bit = bits_to_set - bits_set; + if (end_bit) { + mask = (1 << end_bit) - 1; + *wordp |= mask; + } + + xfs_buf_item_log_debug(bip, first, last); +} + + +/* + * Return 1 if the buffer has some data that has been logged (at any + * point, not just the current transaction) and 0 if not. 
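+ *
+ * Illustrative aside (not part of the original comment): a worked
+ * example of the byte-to-bit mapping in xfs_buf_item_log() above.
+ * Logging bytes 130..600 of a buffer gives
+ *
+ *	first_bit = 130 >> XFS_BLF_SHIFT = 1
+ *	last_bit  = 600 >> XFS_BLF_SHIFT = 4
+ *
+ * so bits 1 through 4 of word 0 are set, i.e. the four 128 byte
+ * chunks covering bytes 128..639 are marked dirty.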
+ */ +uint +xfs_buf_item_dirty( + xfs_buf_log_item_t *bip) +{ + return (bip->bli_flags & XFS_BLI_DIRTY); +} + +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig); + kmem_free(bip->bli_logged); +#endif /* XFS_TRANS_DEBUG */ + + kmem_zone_free(xfs_buf_item_zone, bip); +} + +/* + * This is called when the buf log item is no longer needed. It should + * free the buf log item associated with the given buffer and clear + * the buffer's pointer to the buf log item. If there are no more + * items in the list, clear the b_iodone field of the buffer (see + * xfs_buf_attach_iodone() below). + */ +void +xfs_buf_item_relse( + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + trace_xfs_buf_item_relse(bp, _RET_IP_); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); + if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && + (XFS_BUF_IODONE_FUNC(bp) != NULL)) { + XFS_BUF_CLR_IODONE_FUNC(bp); + } + xfs_buf_rele(bp); + xfs_buf_item_free(bip); +} + + +/* + * Add the given log item with its callback to the list of callbacks + * to be called when the buffer's I/O completes. If it is not set + * already, set the buffer's b_iodone() routine to be + * xfs_buf_iodone_callbacks() and link the log item into the list of + * items rooted at b_fsprivate. Items are always added as the second + * entry in the list if there is a first, because the buf item code + * assumes that the buf log item is first. + */ +void +xfs_buf_attach_iodone( + xfs_buf_t *bp, + void (*cb)(xfs_buf_t *, xfs_log_item_t *), + xfs_log_item_t *lip) +{ + xfs_log_item_t *head_lip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + lip->li_cb = cb; + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + lip->li_bio_list = head_lip->li_bio_list; + head_lip->li_bio_list = lip; + } else { + XFS_BUF_SET_FSPRIVATE(bp, lip); + } + + ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) || + (XFS_BUF_IODONE_FUNC(bp) == NULL)); + XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); +} + +/* + * We can have many callbacks on a buffer. Running the callbacks individually + * can cause a lot of contention on the AIL lock, so we allow for a single + * callback to be able to scan the remaining lip->li_bio_list for other items + * of the same type and callback to be processed in the first call. + * + * As a result, the loop walking the callback list below will also modify the + * list. it removes the first item from the list and then runs the callback. + * The loop then restarts from the new head of the list. This allows the + * callback to scan and modify the list attached to the buffer and we don't + * have to care about maintaining a next item pointer. + */ +STATIC void +xfs_buf_do_callbacks( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) { + XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list); + ASSERT(lip->li_cb != NULL); + /* + * Clear the next pointer so we don't have any + * confusion if the item is added to another buf. + * Don't touch the log item after calling its + * callback, because it could have freed itself. + */ + lip->li_bio_list = NULL; + lip->li_cb(bp, lip); + } +} + +/* + * This is the iodone() function for buffers which have had callbacks + * attached to them by xfs_buf_attach_iodone(). It should remove each + * log item from the buffer's list and call the callback of each in turn. 
+ * When done, the buffer's fsprivate field is set to NULL and the buffer + * is unlocked with a call to iodone(). + */ +void +xfs_buf_iodone_callbacks( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; + + if (likely(!XFS_BUF_GETERROR(bp))) + goto do_callbacks; + + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + XFS_BUF_SUPER_STALE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } + + if (XFS_BUF_TARGET(bp) != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + xfs_alert(mp, "Device %s: metadata write error block 0x%llx", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)XFS_BUF_ADDR(bp)); + } + lasttarg = XFS_BUF_TARGET(bp); + + /* + * If the write was asynchronous then no one will be looking for the + * error. Clear the error state and write the buffer out again. + * + * During sync or umount we'll write all pending buffers again + * synchronous, which will catch these errors if they keep hanging + * around. + */ + if (XFS_BUF_ISASYNC(bp)) { + XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + + if (!XFS_BUF_ISSTALE(bp)) { + XFS_BUF_DELAYWRITE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_SET_START(bp); + } + ASSERT(XFS_BUF_IODONE_FUNC(bp)); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_relse(bp); + return; + } + + /* + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). + */ + XFS_BUF_STALE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_UNDELAYWRITE(bp); + + trace_xfs_buf_error_relse(bp, _RET_IP_); + +do_callbacks: + xfs_buf_do_callbacks(bp); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + xfs_buf_ioend(bp, 0); +} + +/* + * This is the iodone() function for buffers which have been + * logged. It is called when they are eventually flushed out. + * It should remove the buf item from the AIL, and free the buf item. + * It is called by xfs_buf_iodone_callbacks() above which will take + * care of cleaning up the buffer itself. + */ +void +xfs_buf_iodone( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + struct xfs_ail *ailp = lip->li_ailp; + + ASSERT(BUF_ITEM(lip)->bli_buf == bp); + + xfs_buf_rele(bp); + + /* + * If we are forcibly shutting down, this may well be + * off the AIL already. That's because we simulate the + * log-committed callbacks to unpin these buffers. Or we may never + * have put this item on AIL because of the transaction was + * aborted forcibly. xfs_trans_ail_delete() takes care of these. + * + * Either way, AIL is useless if we're forcing a shutdown. + */ + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, lip); + xfs_buf_item_free(BUF_ITEM(lip)); +} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h new file mode 100644 index 00000000..b6ecd206 --- /dev/null +++ b/fs/xfs/xfs_buf_item.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BUF_ITEM_H__ +#define __XFS_BUF_ITEM_H__ + +extern kmem_zone_t *xfs_buf_item_zone; + +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. + */ +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* size of data bitmap in words */ + unsigned int blf_data_map[1];/* variable size bitmap of */ + /* regions of buffer in this item */ +} xfs_buf_log_format_t; + +/* + * This flag indicates that the buffer contains on disk inodes + * and requires special recovery handling. + */ +#define XFS_BLF_INODE_BUF 0x1 +/* + * This flag indicates that the buffer should not be replayed + * during recovery because its blocks are being freed. + */ +#define XFS_BLF_CANCEL 0x2 +/* + * This flag indicates that the buffer contains on disk + * user or group dquots and may require special recovery handling. + */ +#define XFS_BLF_UDQUOT_BUF 0x4 +#define XFS_BLF_PDQUOT_BUF 0x8 +#define XFS_BLF_GDQUOT_BUF 0x10 + +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 +#define BIT_TO_WORD_SHIFT 5 +#define NBWORD (NBBY * sizeof(unsigned int)) + +/* + * buf log item flags + */ +#define XFS_BLI_HOLD 0x01 +#define XFS_BLI_DIRTY 0x02 +#define XFS_BLI_STALE 0x04 +#define XFS_BLI_LOGGED 0x08 +#define XFS_BLI_INODE_ALLOC_BUF 0x10 +#define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_INODE_BUF 0x40 + +#define XFS_BLI_FLAGS \ + { XFS_BLI_HOLD, "HOLD" }, \ + { XFS_BLI_DIRTY, "DIRTY" }, \ + { XFS_BLI_STALE, "STALE" }, \ + { XFS_BLI_LOGGED, "LOGGED" }, \ + { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ + { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ + { XFS_BLI_INODE_BUF, "INODE_BUF" } + + +#ifdef __KERNEL__ + +struct xfs_buf; +struct xfs_mount; +struct xfs_buf_log_item; + +/* + * This is the in core log item structure used to track information + * needed to log buffers. It tracks how many times the lock has been + * locked, and which 128 byte chunks of the buffer are dirty. 
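+ *
+ * Illustrative aside (not from the original comment): bli_format below
+ * embeds the on-disk xfs_buf_log_format header declared above, whose
+ * one-element blf_data_map[] is the usual C idiom for a variable-length
+ * trailer; blf_map_size records how many bitmap words are actually
+ * valid (two words for the 4 KB buffer example used earlier).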
+ */ +typedef struct xfs_buf_log_item { + xfs_log_item_t bli_item; /* common item structure */ + struct xfs_buf *bli_buf; /* real buffer pointer */ + unsigned int bli_flags; /* misc flags */ + unsigned int bli_recur; /* lock recursion count */ + atomic_t bli_refcount; /* cnt of tp refs */ +#ifdef XFS_TRANS_DEBUG + char *bli_orig; /* original buffer copy */ + char *bli_logged; /* bytes logged (bitmap) */ +#endif + xfs_buf_log_format_t bli_format; /* in-log header */ +} xfs_buf_log_item_t; + +void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +void xfs_buf_item_relse(struct xfs_buf *); +void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +uint xfs_buf_item_dirty(xfs_buf_log_item_t *); +void xfs_buf_attach_iodone(struct xfs_buf *, + void(*)(struct xfs_buf *, xfs_log_item_t *), + xfs_log_item_t *); +void xfs_buf_iodone_callbacks(struct xfs_buf *); +void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); + +#ifdef XFS_TRANS_DEBUG +void +xfs_buf_item_flush_log_debug( + struct xfs_buf *bp, + uint first, + uint last); +#else +#define xfs_buf_item_flush_log_debug(bp, first, last) +#endif + +#endif /* __KERNEL__ */ + +#endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c new file mode 100644 index 00000000..6102ac6d --- /dev/null +++ b/fs/xfs/xfs_da_btree.c @@ -0,0 +1,2463 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * xfs_da_btree.c + * + * Routines to implement directories as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. 
+ */ +STATIC int xfs_da_root_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_root, + xfs_da_state_blk_t *new_child); +STATIC int xfs_da_node_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_blk, + xfs_da_state_blk_t *split_blk, + xfs_da_state_blk_t *blk_to_add, + int treelevel, + int *result); +STATIC void xfs_da_node_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *node_blk_1, + xfs_da_state_blk_t *node_blk_2); +STATIC void xfs_da_node_add(xfs_da_state_t *state, + xfs_da_state_blk_t *old_node_blk, + xfs_da_state_blk_t *new_node_blk); + +/* + * Routines used for shrinking the Btree. + */ +STATIC int xfs_da_root_join(xfs_da_state_t *state, + xfs_da_state_blk_t *root_blk); +STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da_node_remove(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk); +STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, + xfs_da_state_blk_t *src_node_blk, + xfs_da_state_blk_t *dst_node_blk); + +/* + * Utility routines. + */ +STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); +STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); +STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra); +STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk); +STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); + +/*======================================================================== + * Routines used for growing the Btree. + *========================================================================*/ + +/* + * Create the initial contents of an intermediate node. + */ +int +xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, + xfs_dabuf_t **bpp, int whichfork) +{ + xfs_da_intnode_t *node; + xfs_dabuf_t *bp; + int error; + xfs_trans_t *tp; + + tp = args->trans; + error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + node = bp->data; + node->hdr.info.forw = 0; + node->hdr.info.back = 0; + node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); + node->hdr.info.pad = 0; + node->hdr.count = 0; + node->hdr.level = cpu_to_be16(level); + + xfs_da_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + *bpp = bp; + return(0); +} + +/* + * Split a leaf node, rebalance, then possibly split + * intermediate nodes, rebalance, etc. + */ +int /* error */ +xfs_da_split(xfs_da_state_t *state) +{ + xfs_da_state_blk_t *oldblk, *newblk, *addblk; + xfs_da_intnode_t *node; + xfs_dabuf_t *bp; + int max, action, error, i; + + /* + * Walk back up the tree splitting/inserting/adjusting as necessary. + * If we need to insert and there isn't room, split the node, then + * decide which fragment to insert the new block from below into. + * Note that we may split the root this way, but we need more fixup. + */ + max = state->path.active - 1; + ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH)); + ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC || + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC); + + addblk = &state->path.blk[max]; /* initial dummy value */ + for (i = max; (i >= 0) && addblk; state->path.active--, i--) { + oldblk = &state->path.blk[i]; + newblk = &state->altpath.blk[i]; + + /* + * If a leaf node then + * Allocate a new leaf node, then rebalance across them. + * else if an intermediate node then + * We split on the last layer, must we split the node? 
+ */ + switch (oldblk->magic) { + case XFS_ATTR_LEAF_MAGIC: + error = xfs_attr_leaf_split(state, oldblk, newblk); + if ((error != 0) && (error != ENOSPC)) { + return(error); /* GROT: attr is inconsistent */ + } + if (!error) { + addblk = newblk; + break; + } + /* + * Entry wouldn't fit, split the leaf again. + */ + state->extravalid = 1; + if (state->inleaf) { + state->extraafter = 0; /* before newblk */ + error = xfs_attr_leaf_split(state, oldblk, + &state->extrablk); + } else { + state->extraafter = 1; /* after newblk */ + error = xfs_attr_leaf_split(state, newblk, + &state->extrablk); + } + if (error) + return(error); /* GROT: attr inconsistent */ + addblk = newblk; + break; + case XFS_DIR2_LEAFN_MAGIC: + error = xfs_dir2_leafn_split(state, oldblk, newblk); + if (error) + return error; + addblk = newblk; + break; + case XFS_DA_NODE_MAGIC: + error = xfs_da_node_split(state, oldblk, newblk, addblk, + max - i, &action); + xfs_da_buf_done(addblk->bp); + addblk->bp = NULL; + if (error) + return(error); /* GROT: dir is inconsistent */ + /* + * Record the newly split block for the next time thru? + */ + if (action) + addblk = newblk; + else + addblk = NULL; + break; + } + + /* + * Update the btree to show the new hashval for this child. + */ + xfs_da_fixhashpath(state, &state->path); + /* + * If we won't need this block again, it's getting dropped + * from the active path by the loop control, so we need + * to mark it done now. + */ + if (i > 0 || !addblk) + xfs_da_buf_done(oldblk->bp); + } + if (!addblk) + return(0); + + /* + * Split the root node. + */ + ASSERT(state->path.active == 0); + oldblk = &state->path.blk[0]; + error = xfs_da_root_split(state, oldblk, addblk); + if (error) { + xfs_da_buf_done(oldblk->bp); + xfs_da_buf_done(addblk->bp); + addblk->bp = NULL; + return(error); /* GROT: dir is inconsistent */ + } + + /* + * Update pointers to the node which used to be block 0 and + * just got bumped because of the addition of a new root node. + * There might be three blocks involved if a double split occurred, + * and the original block 0 could be at any position in the list. + */ + + node = oldblk->bp->data; + if (node->hdr.info.forw) { + if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->data; + node->hdr.info.back = cpu_to_be32(oldblk->blkno); + xfs_da_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + node = oldblk->bp->data; + if (node->hdr.info.back) { + if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->data; + node->hdr.info.forw = cpu_to_be32(oldblk->blkno); + xfs_da_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + xfs_da_buf_done(oldblk->bp); + xfs_da_buf_done(addblk->bp); + addblk->bp = NULL; + return(0); +} + +/* + * Split the root. We have to create a new root and point to the two + * parts (the split old root) that we just created. Copy block zero to + * the EOF, extending the inode in process. 
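+ *
+ * Illustrative aside (not from the original comment), with hypothetical
+ * block labels: if the old root lives in block 0 and the split produced
+ * a sibling B, the routine below copies the old root's contents into a
+ * freshly grown block A' at the end of the da btree space, then rewrites
+ * block 0 as a new two-entry root pointing at A' and B, so the tree
+ * grows one level taller while the root stays at block 0.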
+ */ +STATIC int /* error */ +xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_intnode_t *node, *oldroot; + xfs_da_args_t *args; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + int error, size; + xfs_inode_t *dp; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_dir2_leaf_t *leaf; + + /* + * Copy the existing (incorrect) block from the root node position + * to a free space somewhere. + */ + args = state->args; + ASSERT(args != NULL); + error = xfs_da_grow_inode(args, &blkno); + if (error) + return(error); + dp = args->dp; + tp = args->trans; + mp = state->mp; + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + node = bp->data; + oldroot = blk1->bp->data; + if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC) { + size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - + (char *)oldroot); + } else { + ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + leaf = (xfs_dir2_leaf_t *)oldroot; + size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] - + (char *)leaf); + } + memcpy(node, oldroot, size); + xfs_da_log_buf(tp, bp, 0, size - 1); + xfs_da_buf_done(blk1->bp); + blk1->bp = bp; + blk1->blkno = blkno; + + /* + * Set up the new root node. + */ + error = xfs_da_node_create(args, + (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0, + be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); + if (error) + return(error); + node = bp->data; + node->btree[0].hashval = cpu_to_be32(blk1->hashval); + node->btree[0].before = cpu_to_be32(blk1->blkno); + node->btree[1].hashval = cpu_to_be32(blk2->hashval); + node->btree[1].before = cpu_to_be32(blk2->blkno); + node->hdr.count = cpu_to_be16(2); + +#ifdef DEBUG + if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC) { + ASSERT(blk1->blkno >= mp->m_dirleafblk && + blk1->blkno < mp->m_dirfreeblk); + ASSERT(blk2->blkno >= mp->m_dirleafblk && + blk2->blkno < mp->m_dirfreeblk); + } +#endif + + /* Header is already logged by xfs_da_node_create */ + xfs_da_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, node->btree, + sizeof(xfs_da_node_entry_t) * 2)); + xfs_da_buf_done(bp); + + return(0); +} + +/* + * Split the node, rebalance, then add the new entry. + */ +STATIC int /* error */ +xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk, + xfs_da_state_blk_t *addblk, + int treelevel, int *result) +{ + xfs_da_intnode_t *node; + xfs_dablk_t blkno; + int newcount, error; + int useextra; + + node = oldblk->bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + + /* + * With V2 dirs the extra block is data or freespace. + */ + useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK; + newcount = 1 + useextra; + /* + * Do we have to split the node? + */ + if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) { + /* + * Allocate a new node, add to the doubly linked chain of + * nodes, then move some of our excess entries into it. 
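+ *
+ * Illustrative aside (not from the original comment), with made-up
+ * numbers: if state->node_ents is 8 and this node already holds 8
+ * entries, then count + newcount exceeds the limit, so a sibling node
+ * is created at the same tree level and xfs_da_node_rebalance() shifts
+ * roughly half of the entries into it before the new child is inserted.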
+ */ + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return(error); /* GROT: dir is inconsistent */ + + error = xfs_da_node_create(state->args, blkno, treelevel, + &newblk->bp, state->args->whichfork); + if (error) + return(error); /* GROT: dir is inconsistent */ + newblk->blkno = blkno; + newblk->magic = XFS_DA_NODE_MAGIC; + xfs_da_node_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) + return(error); + *result = 1; + } else { + *result = 0; + } + + /* + * Insert the new entry(s) into the correct block + * (updating last hashval in the process). + * + * xfs_da_node_add() inserts BEFORE the given index, + * and as a result of using node_lookup_int() we always + * point to a valid entry (not after one), but a split + * operation always results in a new block whose hashvals + * FOLLOW the current block. + * + * If we had double-split op below us, then add the extra block too. + */ + node = oldblk->bp->data; + if (oldblk->index <= be16_to_cpu(node->hdr.count)) { + oldblk->index++; + xfs_da_node_add(state, oldblk, addblk); + if (useextra) { + if (state->extraafter) + oldblk->index++; + xfs_da_node_add(state, oldblk, &state->extrablk); + state->extravalid = 0; + } + } else { + newblk->index++; + xfs_da_node_add(state, newblk, addblk); + if (useextra) { + if (state->extraafter) + newblk->index++; + xfs_da_node_add(state, newblk, &state->extrablk); + state->extravalid = 0; + } + } + + return(0); +} + +/* + * Balance the btree elements between two intermediate nodes, + * usually one full and one empty. + * + * NOTE: if blk2 is empty, then it will get the upper half of blk1. + */ +STATIC void +xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_intnode_t *node1, *node2, *tmpnode; + xfs_da_node_entry_t *btree_s, *btree_d; + int count, tmp; + xfs_trans_t *tp; + + node1 = blk1->bp->data; + node2 = blk2->bp->data; + /* + * Figure out how many entries need to move, and in which direction. + * Swap the nodes around if that makes it simpler. + */ + if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && + ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) || + (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < + be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { + tmpnode = node1; + node1 = node2; + node2 = tmpnode; + } + ASSERT(be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC); + count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2; + if (count == 0) + return; + tp = state->args->trans; + /* + * Two cases: high-to-low and low-to-high. + */ + if (count > 0) { + /* + * Move elements in node2 up to make a hole. + */ + if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) { + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[0]; + btree_d = &node2->btree[count]; + memmove(btree_d, btree_s, tmp); + } + + /* + * Move the req'd B-tree elements from high in node1 to + * low in node2. + */ + be16_add_cpu(&node2->hdr.count, count); + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count]; + btree_d = &node2->btree[0]; + memcpy(btree_d, btree_s, tmp); + be16_add_cpu(&node1->hdr.count, -count); + } else { + /* + * Move the req'd B-tree elements from low in node2 to + * high in node1. 
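+ *
+ * Illustrative aside (not from the original comment), with made-up
+ * counts: if node1 holds 2 entries and node2 holds 10, then
+ * count = (2 - 10) / 2 = -4, so this branch copies the first 4
+ * entries of node2 onto the end of node1 and slides node2's remaining
+ * 6 entries down to the front, leaving both nodes with 6 entries.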
+ */ + count = -count; + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[0]; + btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; + memcpy(btree_d, btree_s, tmp); + be16_add_cpu(&node1->hdr.count, count); + xfs_da_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, btree_d, tmp)); + + /* + * Move elements in node2 down to fill the hole. + */ + tmp = be16_to_cpu(node2->hdr.count) - count; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[count]; + btree_d = &node2->btree[0]; + memmove(btree_d, btree_s, tmp); + be16_add_cpu(&node2->hdr.count, -count); + } + + /* + * Log header of node 1 and all current bits of node 2. + */ + xfs_da_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); + xfs_da_log_buf(tp, blk2->bp, + XFS_DA_LOGRANGE(node2, &node2->hdr, + sizeof(node2->hdr) + + sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); + + /* + * Record the last hashval from each block for upward propagation. + * (note: don't use the swapped node pointers) + */ + node1 = blk1->bp->data; + node2 = blk2->bp->data; + blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); + blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); + + /* + * Adjust the expected index for insertion. + */ + if (blk1->index >= be16_to_cpu(node1->hdr.count)) { + blk2->index = blk1->index - be16_to_cpu(node1->hdr.count); + blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */ + } +} + +/* + * Add a new entry to an intermediate node. + */ +STATIC void +xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk) +{ + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int tmp; + + node = oldblk->bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); + ASSERT(newblk->blkno != 0); + if (state->args->whichfork == XFS_DATA_FORK) + ASSERT(newblk->blkno >= state->mp->m_dirleafblk && + newblk->blkno < state->mp->m_dirfreeblk); + + /* + * We may need to make some room before we insert the new node. + */ + tmp = 0; + btree = &node->btree[ oldblk->index ]; + if (oldblk->index < be16_to_cpu(node->hdr.count)) { + tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree); + memmove(btree + 1, btree, tmp); + } + btree->hashval = cpu_to_be32(newblk->hashval); + btree->before = cpu_to_be32(newblk->blkno); + xfs_da_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); + be16_add_cpu(&node->hdr.count, 1); + xfs_da_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + /* + * Copy the last hash value from the oldblk to propagate upwards. + */ + oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval); +} + +/*======================================================================== + * Routines used for shrinking the Btree. + *========================================================================*/ + +/* + * Deallocate an empty leaf node, remove it from its parent, + * possibly deallocating that block, etc... 
+ */ +int +xfs_da_join(xfs_da_state_t *state) +{ + xfs_da_state_blk_t *drop_blk, *save_blk; + int action, error; + + action = 0; + drop_blk = &state->path.blk[ state->path.active-1 ]; + save_blk = &state->altpath.blk[ state->path.active-1 ]; + ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC || + drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); + + /* + * Walk back up the tree joining/deallocating as necessary. + * When we stop dropping blocks, break out. + */ + for ( ; state->path.active >= 2; drop_blk--, save_blk--, + state->path.active--) { + /* + * See if we can combine the block with a neighbor. + * (action == 0) => no options, just leave + * (action == 1) => coalesce, then unlink + * (action == 2) => block empty, unlink it + */ + switch (drop_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + error = xfs_attr_leaf_toosmall(state, &action); + if (error) + return(error); + if (action == 0) + return(0); + xfs_attr_leaf_unbalance(state, drop_blk, save_blk); + break; + case XFS_DIR2_LEAFN_MAGIC: + error = xfs_dir2_leafn_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_dir2_leafn_unbalance(state, drop_blk, save_blk); + break; + case XFS_DA_NODE_MAGIC: + /* + * Remove the offending node, fixup hashvals, + * check for a toosmall neighbor. + */ + xfs_da_node_remove(state, drop_blk); + xfs_da_fixhashpath(state, &state->path); + error = xfs_da_node_toosmall(state, &action); + if (error) + return(error); + if (action == 0) + return 0; + xfs_da_node_unbalance(state, drop_blk, save_blk); + break; + } + xfs_da_fixhashpath(state, &state->altpath); + error = xfs_da_blk_unlink(state, drop_blk, save_blk); + xfs_da_state_kill_altpath(state); + if (error) + return(error); + error = xfs_da_shrink_inode(state->args, drop_blk->blkno, + drop_blk->bp); + drop_blk->bp = NULL; + if (error) + return(error); + } + /* + * We joined all the way to the top. If it turns out that + * we only have one entry in the root, make the child block + * the new root. + */ + xfs_da_node_remove(state, drop_blk); + xfs_da_fixhashpath(state, &state->path); + error = xfs_da_root_join(state, &state->path.blk[0]); + return(error); +} + +/* + * We have only one entry in the root. Copy the only remaining child of + * the old root to block 0 as the new root node. + */ +STATIC int +xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) +{ + xfs_da_intnode_t *oldroot; + /* REFERENCED */ + xfs_da_blkinfo_t *blkinfo; + xfs_da_args_t *args; + xfs_dablk_t child; + xfs_dabuf_t *bp; + int error; + + args = state->args; + ASSERT(args != NULL); + ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + oldroot = root_blk->bp->data; + ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(!oldroot->hdr.info.forw); + ASSERT(!oldroot->hdr.info.back); + + /* + * If the root has more than one child, then don't do anything. + */ + if (be16_to_cpu(oldroot->hdr.count) > 1) + return(0); + + /* + * Read in the (only) child block, then copy those bytes into + * the root block's buffer and free the original child block. 
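+ *
+ * Illustrative aside (not from the original comment), with a
+ * hypothetical block number: if the root's single entry points at
+ * block 7, that block is read in, its contents are copied over the
+ * root buffer at block 0, and block 7 is then released via
+ * xfs_da_shrink_inode(), shrinking the tree by one level.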
+ */ + child = be32_to_cpu(oldroot->btree[0].before); + ASSERT(child != 0); + error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + blkinfo = bp->data; + if (be16_to_cpu(oldroot->hdr.level) == 1) { + ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIR2_LEAFN_MAGIC || + be16_to_cpu(blkinfo->magic) == XFS_ATTR_LEAF_MAGIC); + } else { + ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DA_NODE_MAGIC); + } + ASSERT(!blkinfo->forw); + ASSERT(!blkinfo->back); + memcpy(root_blk->bp->data, bp->data, state->blocksize); + xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + error = xfs_da_shrink_inode(args, child, bp); + return(error); +} + +/* + * Check a node block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +STATIC int +xfs_da_node_toosmall(xfs_da_state_t *state, int *action) +{ + xfs_da_intnode_t *node; + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + int count, forward, error, retval, i; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->data; + ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC); + node = (xfs_da_intnode_t *)info; + count = be16_to_cpu(node->hdr.count); + if (count > (state->node_ents >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return(0); /* blk over 50%, don't try to join */ + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (info->forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 2; + } + return(0); + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. 
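+ *
+ * Illustrative aside (not from the original comment), with made-up
+ * numbers: if state->node_ents is 64, the budget below is
+ * 64 - (64 >> 2) = 48 entries; a 20-entry block may merge with a
+ * 25-entry sibling (48 - 20 - 25 = 3 >= 0) but not with a 30-entry
+ * one, which would leave less than 25% of the node free.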
+ */ + /* start with smaller blk num */ + forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + for (i = 0; i < 2; forward = !forward, i++) { + if (forward) + blkno = be32_to_cpu(info->forw); + else + blkno = be32_to_cpu(info->back); + if (blkno == 0) + continue; + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, state->args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + + node = (xfs_da_intnode_t *)info; + count = state->node_ents; + count -= state->node_ents >> 2; + count -= be16_to_cpu(node->hdr.count); + node = bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + count -= be16_to_cpu(node->hdr.count); + xfs_da_brelse(state->args->trans, bp); + if (count >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { + *action = 0; + return(0); + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) { + return(error); + } + if (retval) { + *action = 0; + return(0); + } + } else { + error = xfs_da_path_shift(state, &state->path, forward, + 0, &retval); + if (error) { + return(error); + } + if (retval) { + *action = 0; + return(0); + } + } + *action = 1; + return(0); +} + +/* + * Walk back up the tree adjusting hash values as necessary, + * when we stop making changes, return. + */ +void +xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) +{ + xfs_da_state_blk_t *blk; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + xfs_dahash_t lasthash=0; + int level, count; + + level = path->active-1; + blk = &path->blk[ level ]; + switch (blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + lasthash = xfs_attr_leaf_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DIR2_LEAFN_MAGIC: + lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DA_NODE_MAGIC: + lasthash = xfs_da_node_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + } + for (blk--, level--; level >= 0; blk--, level--) { + node = blk->bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + btree = &node->btree[ blk->index ]; + if (be32_to_cpu(btree->hashval) == lasthash) + break; + blk->hashval = lasthash; + btree->hashval = cpu_to_be32(lasthash); + xfs_da_log_buf(state->args->trans, blk->bp, + XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + + lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + } +} + +/* + * Remove an entry from an intermediate node. + */ +STATIC void +xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) +{ + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int tmp; + + node = drop_blk->bp->data; + ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); + ASSERT(drop_blk->index >= 0); + + /* + * Copy over the offending entry, or just zero it out. 
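+ *
+ * Illustrative aside (not from the original comment), with made-up
+ * numbers: with hdr.count == 5 and drop index 2, entries 3 and 4 are
+ * shifted down one slot, the now-duplicate last slot is zeroed, and
+ * the count drops to 4.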
+ */ + btree = &node->btree[drop_blk->index]; + if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) { + tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + memmove(btree, btree + 1, tmp); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, btree, tmp)); + btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; + } + memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + be16_add_cpu(&node->hdr.count, -1); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + /* + * Copy the last hash value from the block to propagate upwards. + */ + btree--; + drop_blk->hashval = be32_to_cpu(btree->hashval); +} + +/* + * Unbalance the btree elements between two intermediate nodes, + * move all Btree elements from one node into another. + */ +STATIC void +xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_da_intnode_t *drop_node, *save_node; + xfs_da_node_entry_t *btree; + int tmp; + xfs_trans_t *tp; + + drop_node = drop_blk->bp->data; + save_node = save_blk->bp->data; + ASSERT(be16_to_cpu(drop_node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + ASSERT(be16_to_cpu(save_node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + tp = state->args->trans; + + /* + * If the dying block has lower hashvals, then move all the + * elements in the remaining block up to make a hole. + */ + if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) || + (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) < + be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval))) + { + btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)]; + tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); + memmove(btree, &save_node->btree[0], tmp); + btree = &save_node->btree[0]; + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, btree, + (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * + sizeof(xfs_da_node_entry_t))); + } else { + btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, btree, + be16_to_cpu(drop_node->hdr.count) * + sizeof(xfs_da_node_entry_t))); + } + + /* + * Move all the B-tree elements from drop_blk to save_blk. + */ + tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); + memcpy(btree, &drop_node->btree[0], tmp); + be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); + + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_node->hdr, + sizeof(save_node->hdr))); + + /* + * Save the last hashval in the remaining block for upward propagation. + */ + save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Walk down the Btree looking for a particular filename, filling + * in the state structure as we go. + * + * We will set the state structure to point to each of the elements + * in each of the nodes where either the hashval is or should be. 
+ * + * We support duplicate hashval's so for each entry in the current + * node that could contain the desired hashval, descend. This is a + * pruned depth-first tree search. + */ +int /* error */ +xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) +{ + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *curr; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + xfs_dablk_t blkno; + int probe, span, max, error, retval; + xfs_dahash_t hashval, btreehashval; + xfs_da_args_t *args; + + args = state->args; + + /* + * Descend thru the B-tree searching each level for the right + * node to use, until the right hashval is found. + */ + blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0; + for (blk = &state->path.blk[0], state->path.active = 1; + state->path.active <= XFS_DA_NODE_MAXDEPTH; + blk++, state->path.active++) { + /* + * Read the next node down in the tree. + */ + blk->blkno = blkno; + error = xfs_da_read_buf(args->trans, args->dp, blkno, + -1, &blk->bp, args->whichfork); + if (error) { + blk->blkno = 0; + state->path.active--; + return(error); + } + curr = blk->bp->data; + blk->magic = be16_to_cpu(curr->magic); + ASSERT(blk->magic == XFS_DA_NODE_MAGIC || + blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_ATTR_LEAF_MAGIC); + + /* + * Search an intermediate node for a match. + */ + if (blk->magic == XFS_DA_NODE_MAGIC) { + node = blk->bp->data; + max = be16_to_cpu(node->hdr.count); + blk->hashval = be32_to_cpu(node->btree[max-1].hashval); + + /* + * Binary search. (note: small blocks will skip loop) + */ + probe = span = max / 2; + hashval = args->hashval; + for (btree = &node->btree[probe]; span > 4; + btree = &node->btree[probe]) { + span /= 2; + btreehashval = be32_to_cpu(btree->hashval); + if (btreehashval < hashval) + probe += span; + else if (btreehashval > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval)); + + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) { + btree--; + probe--; + } + while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) { + btree++; + probe++; + } + + /* + * Pick the right block to descend on. + */ + if (probe == max) { + blk->index = max-1; + blkno = be32_to_cpu(node->btree[max-1].before); + } else { + blk->index = probe; + blkno = be32_to_cpu(btree->before); + } + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); + break; + } + } + + /* + * A leaf block that ends in the hashval that we are interested in + * (final hashval == search hashval) means that the next block may + * contain more entries with the same hashval, shift upward to the + * next leaf and keep searching. 
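+ *
+ * Illustrative aside (not from the original comment): if the search
+ * hashval is H, the current leaf's last entry also hashes to H, and the
+ * leaf lookup below reports ENOENT/ENOATTR, the loop shifts to the next
+ * leaf with xfs_da_path_shift() and retries, since names colliding on H
+ * may have spilled into the following block.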
+ */ + for (;;) { + if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { + retval = xfs_dir2_leafn_lookup_int(blk->bp, args, + &blk->index, state); + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + retval = xfs_attr_leaf_lookup_int(blk->bp, args); + blk->index = args->index; + args->blkno = blk->blkno; + } else { + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + if (((retval == ENOENT) || (retval == ENOATTR)) && + (blk->hashval == args->hashval)) { + error = xfs_da_path_shift(state, &state->path, 1, 1, + &retval); + if (error) + return(error); + if (retval == 0) { + continue; + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + /* path_shift() gives ENOENT */ + retval = XFS_ERROR(ENOATTR); + } + } + break; + } + *result = retval; + return(0); +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Link a new block into a doubly linked list of blocks (of whatever type). + */ +int /* error */ +xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, + xfs_da_state_blk_t *new_blk) +{ + xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; + xfs_da_args_t *args; + int before=0, error; + xfs_dabuf_t *bp; + + /* + * Set up environment. + */ + args = state->args; + ASSERT(args != NULL); + old_info = old_blk->bp->data; + new_info = new_blk->bp->data; + ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || + old_blk->magic == XFS_DIR2_LEAFN_MAGIC || + old_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(old_blk->magic == be16_to_cpu(old_info->magic)); + ASSERT(new_blk->magic == be16_to_cpu(new_info->magic)); + ASSERT(old_blk->magic == new_blk->magic); + + switch (old_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp); + break; + case XFS_DIR2_LEAFN_MAGIC: + before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); + break; + case XFS_DA_NODE_MAGIC: + before = xfs_da_node_order(old_blk->bp, new_blk->bp); + break; + } + + /* + * Link blocks in appropriate order. + */ + if (before) { + /* + * Link new block in before existing block. + */ + new_info->forw = cpu_to_be32(old_blk->blkno); + new_info->back = old_info->back; + if (old_info->back) { + error = xfs_da_read_buf(args->trans, args->dp, + be32_to_cpu(old_info->back), + -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); + ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); + tmp_info->forw = cpu_to_be32(new_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + xfs_da_buf_done(bp); + } + old_info->back = cpu_to_be32(new_blk->blkno); + } else { + /* + * Link new block in after existing block. 
+ */ + new_info->forw = old_info->forw; + new_info->back = cpu_to_be32(old_blk->blkno); + if (old_info->forw) { + error = xfs_da_read_buf(args->trans, args->dp, + be32_to_cpu(old_info->forw), + -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(tmp_info->magic == old_info->magic); + ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno); + tmp_info->back = cpu_to_be32(new_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + xfs_da_buf_done(bp); + } + old_info->forw = cpu_to_be32(new_blk->blkno); + } + + xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); + return(0); +} + +/* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) +{ + xfs_da_intnode_t *node1, *node2; + + node1 = node1_bp->data; + node2 = node2_bp->data; + ASSERT((be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC) && + (be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC)); + if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && + ((be32_to_cpu(node2->btree[0].hashval) < + be32_to_cpu(node1->btree[0].hashval)) || + (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < + be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { + return(1); + } + return(0); +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count) +{ + xfs_da_intnode_t *node; + + node = bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + if (count) + *count = be16_to_cpu(node->hdr.count); + if (!node->hdr.count) + return(0); + return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); +} + +/* + * Unlink a block from a doubly linked list of blocks. + */ +STATIC int /* error */ +xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; + xfs_da_args_t *args; + xfs_dabuf_t *bp; + int error; + + /* + * Set up environment. + */ + args = state->args; + ASSERT(args != NULL); + save_info = save_blk->bp->data; + drop_info = drop_blk->bp->data; + ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || + save_blk->magic == XFS_DIR2_LEAFN_MAGIC || + save_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(save_blk->magic == be16_to_cpu(save_info->magic)); + ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic)); + ASSERT(save_blk->magic == drop_blk->magic); + ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) || + (be32_to_cpu(save_info->back) == drop_blk->blkno)); + ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) || + (be32_to_cpu(drop_info->back) == save_blk->blkno)); + + /* + * Unlink the leaf block from the doubly linked chain of leaves. 
+ */ + if (be32_to_cpu(save_info->back) == drop_blk->blkno) { + save_info->back = drop_info->back; + if (drop_info->back) { + error = xfs_da_read_buf(args->trans, args->dp, + be32_to_cpu(drop_info->back), + -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(tmp_info->magic == save_info->magic); + ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno); + tmp_info->forw = cpu_to_be32(save_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + xfs_da_buf_done(bp); + } + } else { + save_info->forw = drop_info->forw; + if (drop_info->forw) { + error = xfs_da_read_buf(args->trans, args->dp, + be32_to_cpu(drop_info->forw), + -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(tmp_info->magic == save_info->magic); + ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno); + tmp_info->back = cpu_to_be32(save_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + xfs_da_buf_done(bp); + } + } + + xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); + return(0); +} + +/* + * Move a path "forward" or "!forward" one block at the current level. + * + * This routine will adjust a "path" to point to the next block + * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the + * Btree, including updating pointers to the intermediate nodes between + * the new bottom and the root. + */ +int /* error */ +xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, + int forward, int release, int *result) +{ + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_da_args_t *args; + xfs_dablk_t blkno=0; + int level, error; + + /* + * Roll up the Btree looking for the first block where our + * current index is not at the edge of the block. Note that + * we skip the bottom layer because we want the sibling block. + */ + args = state->args; + ASSERT(args != NULL); + ASSERT(path != NULL); + ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + level = (path->active-1) - 1; /* skip bottom layer in path */ + for (blk = &path->blk[level]; level >= 0; blk--, level--) { + ASSERT(blk->bp != NULL); + node = blk->bp->data; + ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { + blk->index++; + blkno = be32_to_cpu(node->btree[blk->index].before); + break; + } else if (!forward && (blk->index > 0)) { + blk->index--; + blkno = be32_to_cpu(node->btree[blk->index].before); + break; + } + } + if (level < 0) { + *result = XFS_ERROR(ENOENT); /* we're out of our tree */ + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + return(0); + } + + /* + * Roll down the edge of the subtree until we reach the + * same depth we were at originally. + */ + for (blk++, level++; level < path->active; blk++, level++) { + /* + * Release the old block. + * (if it's dirty, trans won't actually let go) + */ + if (release) + xfs_da_brelse(args->trans, blk->bp); + + /* + * Read the next child block. 
+ */ + blk->blkno = blkno; + error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, + &blk->bp, args->whichfork); + if (error) + return(error); + ASSERT(blk->bp != NULL); + info = blk->bp->data; + ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC || + be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC || + be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC); + blk->magic = be16_to_cpu(info->magic); + if (blk->magic == XFS_DA_NODE_MAGIC) { + node = (xfs_da_intnode_t *)info; + blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + if (forward) + blk->index = 0; + else + blk->index = be16_to_cpu(node->hdr.count)-1; + blkno = be32_to_cpu(node->btree[blk->index].before); + } else { + ASSERT(level == path->active-1); + blk->index = 0; + switch(blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, + NULL); + break; + case XFS_DIR2_LEAFN_MAGIC: + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, + NULL); + break; + default: + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == XFS_DIR2_LEAFN_MAGIC); + break; + } + } + } + *result = 0; + return(0); +} + + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Implement a simple hash on a character string. + * Rotate the hash value by 7 bits, then XOR each character in. + * This is implemented with some source-level loop unrolling. + */ +xfs_dahash_t +xfs_da_hashname(const __uint8_t *name, int namelen) +{ + xfs_dahash_t hash; + + /* + * Do four characters at a time as long as we can. + */ + for (hash = 0; namelen >= 4; namelen -= 4, name += 4) + hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^ + (name[3] << 0) ^ rol32(hash, 7 * 4); + + /* + * Now do the rest of the characters. + */ + switch (namelen) { + case 3: + return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^ + rol32(hash, 7 * 3); + case 2: + return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2); + case 1: + return (name[0] << 0) ^ rol32(hash, 7 * 1); + default: /* case 0: */ + return hash; + } +} + +enum xfs_dacmp +xfs_da_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + return (args->namelen == len && memcmp(args->name, name, len) == 0) ? + XFS_CMP_EXACT : XFS_CMP_DIFFERENT; +} + +static xfs_dahash_t +xfs_default_hashname( + struct xfs_name *name) +{ + return xfs_da_hashname(name->name, name->len); +} + +const struct xfs_nameops xfs_default_nameops = { + .hashname = xfs_default_hashname, + .compname = xfs_da_compname +}; + +/* + * Add a block to the btree ahead of the file. + * Return the new block number to the caller. + */ +int +xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) +{ + xfs_fileoff_t bno, b; + xfs_bmbt_irec_t map; + xfs_bmbt_irec_t *mapp; + xfs_inode_t *dp; + int nmap, error, w, count, c, got, i, mapi; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_drfsbno_t nblks; + + dp = args->dp; + mp = dp->i_mount; + w = args->whichfork; + tp = args->trans; + nblks = dp->i_d.di_nblocks; + + /* + * For new directories adjust the file offset and block count. + */ + if (w == XFS_DATA_FORK) { + bno = mp->m_dirleafblk; + count = mp->m_dirblkfsbs; + } else { + bno = 0; + count = 1; + } + /* + * Find a spot in the file space to put the new block. 
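+ * For the data fork, bno was primed with m_dirleafblk above so the + * new block is placed in the directory's leaf/node address range + * (the ASSERT below checks it stays under m_dirfreeblk); for the + * attribute fork the search simply starts at offset zero for a + * single block.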
+ */ + if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) + return error; + if (w == XFS_DATA_FORK) + ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk); + /* + * Try mapping it in one filesystem block. + */ + nmap = 1; + ASSERT(args->firstblock != NULL); + if ((error = xfs_bmapi(tp, dp, bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| + XFS_BMAPI_CONTIG, + args->firstblock, args->total, &map, &nmap, + args->flist))) { + return error; + } + ASSERT(nmap <= 1); + if (nmap == 1) { + mapp = ↦ + mapi = 1; + } + /* + * If we didn't get it and the block might work if fragmented, + * try without the CONTIG flag. Loop until we get it all. + */ + else if (nmap == 0 && count > 1) { + mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + for (b = bno, mapi = 0; b < bno + count; ) { + nmap = MIN(XFS_BMAP_MAX_NMAP, count); + c = (int)(bno + count - b); + if ((error = xfs_bmapi(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| + XFS_BMAPI_METADATA, + args->firstblock, args->total, + &mapp[mapi], &nmap, args->flist))) { + kmem_free(mapp); + return error; + } + if (nmap < 1) + break; + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } + } else { + mapi = 0; + mapp = NULL; + } + /* + * Count the blocks we got, make sure it matches the total. + */ + for (i = 0, got = 0; i < mapi; i++) + got += mapp[i].br_blockcount; + if (got != count || mapp[0].br_startoff != bno || + mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != + bno + count) { + if (mapp != &map) + kmem_free(mapp); + return XFS_ERROR(ENOSPC); + } + if (mapp != &map) + kmem_free(mapp); + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; + *new_blkno = (xfs_dablk_t)bno; + return 0; +} + +/* + * Ick. We need to always be able to remove a btree block, even + * if there's no space reservation because the filesystem is full. + * This is called if xfs_bunmapi on a btree block fails due to ENOSPC. + * It swaps the target block with the last block in the file. The + * last block in the file can always be removed since it can't cause + * a bmap btree split to do that. + */ +STATIC int +xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, + xfs_dabuf_t **dead_bufp) +{ + xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; + xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf; + xfs_fileoff_t lastoff; + xfs_inode_t *ip; + xfs_trans_t *tp; + xfs_mount_t *mp; + int error, w, entno, level, dead_level; + xfs_da_blkinfo_t *dead_info, *sib_info; + xfs_da_intnode_t *par_node, *dead_node; + xfs_dir2_leaf_t *dead_leaf2; + xfs_dahash_t dead_hash; + + dead_buf = *dead_bufp; + dead_blkno = *dead_blknop; + tp = args->trans; + ip = args->dp; + w = args->whichfork; + ASSERT(w == XFS_DATA_FORK); + mp = ip->i_mount; + lastoff = mp->m_dirfreeblk; + error = xfs_bmap_last_before(tp, ip, &lastoff, w); + if (error) + return error; + if (unlikely(lastoff == 0)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + /* + * Read the last block in the btree space. + */ + last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; + if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) + return error; + /* + * Copy the last block into the dead buffer and log it. + */ + memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize); + xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); + dead_info = dead_buf->data; + /* + * Get values from the moved block. 
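+ * A DIR2_LEAFN block sits at level 0 and contributes the hashval of + * its final leaf entry; a DA_NODE block contributes its level and the + * hashval of its last btree entry. Both values are used below to + * locate the parent entry that still points at last_blkno.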
+ */ + if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) { + dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; + dead_level = 0; + dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval); + } else { + ASSERT(be16_to_cpu(dead_info->magic) == XFS_DA_NODE_MAGIC); + dead_node = (xfs_da_intnode_t *)dead_info; + dead_level = be16_to_cpu(dead_node->hdr.level); + dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval); + } + sib_buf = par_buf = NULL; + /* + * If the moved block has a left sibling, fix up the pointers. + */ + if ((sib_blkno = be32_to_cpu(dead_info->back))) { + if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + goto done; + sib_info = sib_buf->data; + if (unlikely( + be32_to_cpu(sib_info->forw) != last_blkno || + sib_info->magic != dead_info->magic)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + sib_info->forw = cpu_to_be32(dead_blkno); + xfs_da_log_buf(tp, sib_buf, + XFS_DA_LOGRANGE(sib_info, &sib_info->forw, + sizeof(sib_info->forw))); + xfs_da_buf_done(sib_buf); + sib_buf = NULL; + } + /* + * If the moved block has a right sibling, fix up the pointers. + */ + if ((sib_blkno = be32_to_cpu(dead_info->forw))) { + if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + goto done; + sib_info = sib_buf->data; + if (unlikely( + be32_to_cpu(sib_info->back) != last_blkno || + sib_info->magic != dead_info->magic)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + sib_info->back = cpu_to_be32(dead_blkno); + xfs_da_log_buf(tp, sib_buf, + XFS_DA_LOGRANGE(sib_info, &sib_info->back, + sizeof(sib_info->back))); + xfs_da_buf_done(sib_buf); + sib_buf = NULL; + } + par_blkno = mp->m_dirleafblk; + level = -1; + /* + * Walk down the tree looking for the parent of the moved block. + */ + for (;;) { + if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + goto done; + par_node = par_buf->data; + if (unlikely( + be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC || + (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + level = be16_to_cpu(par_node->hdr.level); + for (entno = 0; + entno < be16_to_cpu(par_node->hdr.count) && + be32_to_cpu(par_node->btree[entno].hashval) < dead_hash; + entno++) + continue; + if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + par_blkno = be32_to_cpu(par_node->btree[entno].before); + if (level == dead_level + 1) + break; + xfs_da_brelse(tp, par_buf); + par_buf = NULL; + } + /* + * We're in the right parent block. + * Look for the right entry. 
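+ * Scan the entries for the one whose 'before' pointer still names + * last_blkno; if this node runs out of entries, hop to its forw + * sibling (a missing sibling here means on-disk corruption) and + * restart the scan from entry zero.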
+ */ + for (;;) { + for (; + entno < be16_to_cpu(par_node->hdr.count) && + be32_to_cpu(par_node->btree[entno].before) != last_blkno; + entno++) + continue; + if (entno < be16_to_cpu(par_node->hdr.count)) + break; + par_blkno = be32_to_cpu(par_node->hdr.info.forw); + xfs_da_brelse(tp, par_buf); + par_buf = NULL; + if (unlikely(par_blkno == 0)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + goto done; + par_node = par_buf->data; + if (unlikely( + be16_to_cpu(par_node->hdr.level) != level || + be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + entno = 0; + } + /* + * Update the parent entry pointing to the moved block. + */ + par_node->btree[entno].before = cpu_to_be32(dead_blkno); + xfs_da_log_buf(tp, par_buf, + XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, + sizeof(par_node->btree[entno].before))); + xfs_da_buf_done(par_buf); + xfs_da_buf_done(dead_buf); + *dead_blknop = last_blkno; + *dead_bufp = last_buf; + return 0; +done: + if (par_buf) + xfs_da_brelse(tp, par_buf); + if (sib_buf) + xfs_da_brelse(tp, sib_buf); + xfs_da_brelse(tp, last_buf); + return error; +} + +/* + * Remove a btree block from a directory or attribute. + */ +int +xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, + xfs_dabuf_t *dead_buf) +{ + xfs_inode_t *dp; + int done, error, w, count; + xfs_trans_t *tp; + xfs_mount_t *mp; + + dp = args->dp; + w = args->whichfork; + tp = args->trans; + mp = dp->i_mount; + if (w == XFS_DATA_FORK) + count = mp->m_dirblkfsbs; + else + count = 1; + for (;;) { + /* + * Remove extents. If we get ENOSPC for a dir we have to move + * the last block to the place we want to kill. + */ + if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + 0, args->firstblock, args->flist, + &done)) == ENOSPC) { + if (w != XFS_DATA_FORK) + break; + if ((error = xfs_da_swap_lastblock(args, &dead_blkno, + &dead_buf))) + break; + } else { + break; + } + } + xfs_da_binval(tp, dead_buf); + return error; +} + +/* + * See if the mapping(s) for this btree block are valid, i.e. + * don't contain holes, are logically contiguous, and cover the whole range. + */ +STATIC int +xfs_da_map_covers_blocks( + int nmap, + xfs_bmbt_irec_t *mapp, + xfs_dablk_t bno, + int count) +{ + int i; + xfs_fileoff_t off; + + for (i = 0, off = bno; i < nmap; i++) { + if (mapp[i].br_startblock == HOLESTARTBLOCK || + mapp[i].br_startblock == DELAYSTARTBLOCK) { + return 0; + } + if (off != mapp[i].br_startoff) { + return 0; + } + off += mapp[i].br_blockcount; + } + return off == bno + count; +} + +/* + * Make a dabuf. + * Used for get_buf, read_buf, read_bufr, and reada_buf. + */ +STATIC int +xfs_da_do_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + xfs_daddr_t *mappedbnop, + xfs_dabuf_t **bpp, + int whichfork, + int caller, + inst_t *ra) +{ + xfs_buf_t *bp = NULL; + xfs_buf_t **bplist; + int error=0; + int i; + xfs_bmbt_irec_t map; + xfs_bmbt_irec_t *mapp; + xfs_daddr_t mappedbno; + xfs_mount_t *mp; + int nbplist=0; + int nfsb; + int nmap; + xfs_dabuf_t *rbp; + + mp = dp->i_mount; + nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1; + mappedbno = *mappedbnop; + /* + * Caller doesn't have a mapping. -2 means don't complain + * if we land in a hole. 
+ */ + if (mappedbno == -1 || mappedbno == -2) { + /* + * Optimize the one-block case. + */ + if (nfsb == 1) { + xfs_fsblock_t fsb; + + if ((error = + xfs_bmapi_single(trans, dp, whichfork, &fsb, + (xfs_fileoff_t)bno))) { + return error; + } + mapp = ↦ + if (fsb == NULLFSBLOCK) { + nmap = 0; + } else { + map.br_startblock = fsb; + map.br_startoff = (xfs_fileoff_t)bno; + map.br_blockcount = 1; + nmap = 1; + } + } else { + mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); + nmap = nfsb; + if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, + nfsb, + XFS_BMAPI_METADATA | + xfs_bmapi_aflag(whichfork), + NULL, 0, mapp, &nmap, NULL))) + goto exit0; + } + } else { + map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); + map.br_startoff = (xfs_fileoff_t)bno; + map.br_blockcount = nfsb; + mapp = ↦ + nmap = 1; + } + if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) { + error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); + if (unlikely(error == EFSCORRUPTED)) { + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "%s: bno %lld dir: inode %lld", + __func__, (long long)bno, + (long long)dp->i_ino); + for (i = 0; i < nmap; i++) { + xfs_alert(mp, +"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", + i, + (long long)mapp[i].br_startoff, + (long long)mapp[i].br_startblock, + (long long)mapp[i].br_blockcount, + mapp[i].br_state); + } + } + XFS_ERROR_REPORT("xfs_da_do_buf(1)", + XFS_ERRLEVEL_LOW, mp); + } + goto exit0; + } + if (caller != 3 && nmap > 1) { + bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP); + nbplist = 0; + } else + bplist = NULL; + /* + * Turn the mapping(s) into buffer(s). + */ + for (i = 0; i < nmap; i++) { + int nmapped; + + mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock); + if (i == 0) + *mappedbnop = mappedbno; + nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount); + switch (caller) { + case 0: + bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, + mappedbno, nmapped, 0); + error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); + break; + case 1: + case 2: + bp = NULL; + error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp, + mappedbno, nmapped, 0, &bp); + break; + case 3: + xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped); + error = 0; + bp = NULL; + break; + } + if (error) { + if (bp) + xfs_trans_brelse(trans, bp); + goto exit1; + } + if (!bp) + continue; + if (caller == 1) { + if (whichfork == XFS_ATTR_FORK) { + XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, + XFS_ATTR_BTREE_REF); + } else { + XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, + XFS_DIR_BTREE_REF); + } + } + if (bplist) { + bplist[nbplist++] = bp; + } + } + /* + * Build a dabuf structure. + */ + if (bplist) { + rbp = xfs_da_buf_make(nbplist, bplist, ra); + } else if (bp) + rbp = xfs_da_buf_make(1, &bp, ra); + else + rbp = NULL; + /* + * For read_buf, check the magic number. 
+ */ + if (caller == 1) { + xfs_dir2_data_t *data; + xfs_dir2_free_t *free; + xfs_da_blkinfo_t *info; + uint magic, magic1; + + info = rbp->data; + data = rbp->data; + free = rbp->data; + magic = be16_to_cpu(info->magic); + magic1 = be32_to_cpu(data->hdr.magic); + if (unlikely( + XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && + (magic != XFS_ATTR_LEAF_MAGIC) && + (magic != XFS_DIR2_LEAF1_MAGIC) && + (magic != XFS_DIR2_LEAFN_MAGIC) && + (magic1 != XFS_DIR2_BLOCK_MAGIC) && + (magic1 != XFS_DIR2_DATA_MAGIC) && + (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC), + mp, XFS_ERRTAG_DA_READ_BUF, + XFS_RANDOM_DA_READ_BUF))) { + trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", + XFS_ERRLEVEL_LOW, mp, info); + error = XFS_ERROR(EFSCORRUPTED); + xfs_da_brelse(trans, rbp); + nbplist = 0; + goto exit1; + } + } + if (bplist) { + kmem_free(bplist); + } + if (mapp != &map) { + kmem_free(mapp); + } + if (bpp) + *bpp = rbp; + return 0; +exit1: + if (bplist) { + for (i = 0; i < nbplist; i++) + xfs_trans_brelse(trans, bplist[i]); + kmem_free(bplist); + } +exit0: + if (mapp != &map) + kmem_free(mapp); + if (bpp) + *bpp = NULL; + return error; +} + +/* + * Get a buffer for the dir/attr block. + */ +int +xfs_da_get_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, + int whichfork) +{ + return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0, + (inst_t *)__return_address); +} + +/* + * Get a buffer for the dir/attr block, fill in the contents. + */ +int +xfs_da_read_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, + int whichfork) +{ + return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1, + (inst_t *)__return_address); +} + +/* + * Readahead the dir/attr block. + */ +xfs_daddr_t +xfs_da_reada_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + int whichfork) +{ + xfs_daddr_t rval; + + rval = -1; + if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3, + (inst_t *)__return_address)) + return -1; + else + return rval; +} + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ +kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +STATIC void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) { + if (state->altpath.blk[i].bp) { + if (state->altpath.blk[i].bp != state->path.blk[i].bp) + xfs_da_buf_done(state->altpath.blk[i].bp); + state->altpath.blk[i].bp = NULL; + } + } + state->altpath.active = 0; +} + +/* + * Free a da-state structure. + */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + int i; + + xfs_da_state_kill_altpath(state); + for (i = 0; i < state->path.active; i++) { + if (state->path.blk[i].bp) + xfs_da_buf_done(state->path.blk[i].bp); + } + if (state->extravalid && state->extrablk.bp) + xfs_da_buf_done(state->extrablk.bp); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + +#ifdef XFS_DABUF_DEBUG +xfs_dabuf_t *xfs_dabuf_global_list; +static DEFINE_SPINLOCK(xfs_dabuf_global_lock); +#endif + +/* + * Create a dabuf. 
+ */ +/* ARGSUSED */ +STATIC xfs_dabuf_t * +xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) +{ + xfs_buf_t *bp; + xfs_dabuf_t *dabuf; + int i; + int off; + + if (nbuf == 1) + dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); + else + dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); + dabuf->dirty = 0; +#ifdef XFS_DABUF_DEBUG + dabuf->ra = ra; + dabuf->target = XFS_BUF_TARGET(bps[0]); + dabuf->blkno = XFS_BUF_ADDR(bps[0]); +#endif + if (nbuf == 1) { + dabuf->nbuf = 1; + bp = bps[0]; + dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); + dabuf->data = XFS_BUF_PTR(bp); + dabuf->bps[0] = bp; + } else { + dabuf->nbuf = nbuf; + for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { + dabuf->bps[i] = bp = bps[i]; + dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp)); + } + dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); + for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { + bp = bps[i]; + memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), + XFS_BUF_COUNT(bp)); + } + } +#ifdef XFS_DABUF_DEBUG + { + xfs_dabuf_t *p; + + spin_lock(&xfs_dabuf_global_lock); + for (p = xfs_dabuf_global_list; p; p = p->next) { + ASSERT(p->blkno != dabuf->blkno || + p->target != dabuf->target); + } + dabuf->prev = NULL; + if (xfs_dabuf_global_list) + xfs_dabuf_global_list->prev = dabuf; + dabuf->next = xfs_dabuf_global_list; + xfs_dabuf_global_list = dabuf; + spin_unlock(&xfs_dabuf_global_lock); + } +#endif + return dabuf; +} + +/* + * Un-dirty a dabuf. + */ +STATIC void +xfs_da_buf_clean(xfs_dabuf_t *dabuf) +{ + xfs_buf_t *bp; + int i; + int off; + + if (dabuf->dirty) { + ASSERT(dabuf->nbuf > 1); + dabuf->dirty = 0; + for (i = off = 0; i < dabuf->nbuf; + i++, off += XFS_BUF_COUNT(bp)) { + bp = dabuf->bps[i]; + memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, + XFS_BUF_COUNT(bp)); + } + } +} + +/* + * Release a dabuf. + */ +void +xfs_da_buf_done(xfs_dabuf_t *dabuf) +{ + ASSERT(dabuf); + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if (dabuf->dirty) + xfs_da_buf_clean(dabuf); + if (dabuf->nbuf > 1) + kmem_free(dabuf->data); +#ifdef XFS_DABUF_DEBUG + { + spin_lock(&xfs_dabuf_global_lock); + if (dabuf->prev) + dabuf->prev->next = dabuf->next; + else + xfs_dabuf_global_list = dabuf->next; + if (dabuf->next) + dabuf->next->prev = dabuf->prev; + spin_unlock(&xfs_dabuf_global_lock); + } + memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf)); +#endif + if (dabuf->nbuf == 1) + kmem_zone_free(xfs_dabuf_zone, dabuf); + else + kmem_free(dabuf); +} + +/* + * Log transaction from a dabuf. + */ +void +xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) +{ + xfs_buf_t *bp; + uint f; + int i; + uint l; + int off; + + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if (dabuf->nbuf == 1) { + ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); + xfs_trans_log_buf(tp, dabuf->bps[0], first, last); + return; + } + dabuf->dirty = 1; + ASSERT(first <= last); + for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { + bp = dabuf->bps[i]; + f = off; + l = f + XFS_BUF_COUNT(bp) - 1; + if (f < first) + f = first; + if (l > last) + l = last; + if (f <= l) + xfs_trans_log_buf(tp, bp, f - off, l - off); + /* + * B_DONE is set by xfs_trans_log buf. + * If we don't set it on a new buffer (get not read) + * then if we don't put anything in the buffer it won't + * be set, and at commit it it released into the cache, + * and then a read will fail. 
+ */ + else if (!(XFS_BUF_ISDONE(bp))) + XFS_BUF_DONE(bp); + } + ASSERT(last < off); +} + +/* + * Release dabuf from a transaction. + * Have to free up the dabuf before the buffers are released, + * since the synchronization on the dabuf is really the lock on the buffer. + */ +void +xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) +{ + xfs_buf_t *bp; + xfs_buf_t **bplist; + int i; + int nbuf; + + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if ((nbuf = dabuf->nbuf) == 1) { + bplist = &bp; + bp = dabuf->bps[0]; + } else { + bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); + memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); + } + xfs_da_buf_done(dabuf); + for (i = 0; i < nbuf; i++) + xfs_trans_brelse(tp, bplist[i]); + if (bplist != &bp) + kmem_free(bplist); +} + +/* + * Invalidate dabuf from a transaction. + */ +void +xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) +{ + xfs_buf_t *bp; + xfs_buf_t **bplist; + int i; + int nbuf; + + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if ((nbuf = dabuf->nbuf) == 1) { + bplist = &bp; + bp = dabuf->bps[0]; + } else { + bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); + memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); + } + xfs_da_buf_done(dabuf); + for (i = 0; i < nbuf; i++) + xfs_trans_binval(tp, bplist[i]); + if (bplist != &bp) + kmem_free(bplist); +} + +/* + * Get the first daddr from a dabuf. + */ +xfs_daddr_t +xfs_da_blkno(xfs_dabuf_t *dabuf) +{ + ASSERT(dabuf->nbuf); + ASSERT(dabuf->data); + return XFS_BUF_ADDR(dabuf->bps[0]); +} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h new file mode 100644 index 00000000..fe9f5a8c --- /dev/null +++ b/fs/xfs/xfs_da_btree.h @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DA_BTREE_H__ +#define __XFS_DA_BTREE_H__ + +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; +struct zone; + +/*======================================================================== + * Directory Structure when greater than XFS_LBSIZE(mp) bytes. + *========================================================================*/ + +/* + * This structure is common to both leaf nodes and non-leaf nodes in the Btree. + * + * Is is used to manage a doubly linked list of all blocks at the same + * level in the Btree, and to identify which type of block this is. 
+ */ +#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ +#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ + +typedef struct xfs_da_blkinfo { + __be32 forw; /* previous block in list */ + __be32 back; /* following block in list */ + __be16 magic; /* validity check on block */ + __be16 pad; /* unused */ +} xfs_da_blkinfo_t; + +/* + * This is the structure of the root and intermediate nodes in the Btree. + * The leaf nodes are defined above. + * + * Entries are not packed. + * + * Since we have duplicate keys, use a binary search but always follow + * all match in the block, not just the first match found. + */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ + +typedef struct xfs_da_intnode { + struct xfs_da_node_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __be16 count; /* count of active entries */ + __be16 level; /* level above leaves (leaf == 0) */ + } hdr; + struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ + } btree[1]; /* variable sized array of keys */ +} xfs_da_intnode_t; +typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; +typedef struct xfs_da_node_entry xfs_da_node_entry_t; + +#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize + +/*======================================================================== + * Btree searching and modification structure definitions. + *========================================================================*/ + +/* + * Search comparison results + */ +enum xfs_dacmp { + XFS_CMP_DIFFERENT, /* names are completely different */ + XFS_CMP_EXACT, /* names are exactly the same */ + XFS_CMP_CASE /* names are same but differ in case */ +}; + +/* + * Structure to ease passing around component names. 
+ */ +typedef struct xfs_da_args { + const __uint8_t *name; /* string (maybe not NULL terminated) */ + int namelen; /* length of string (maybe no NULL) */ + __uint8_t *value; /* set of bytes (maybe contain NULLs) */ + int valuelen; /* length of value */ + int flags; /* argument flags (eg: ATTR_NOCREATE) */ + xfs_dahash_t hashval; /* hash value of name */ + xfs_ino_t inumber; /* input/output inode number */ + struct xfs_inode *dp; /* directory inode to manipulate */ + xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */ + struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */ + struct xfs_trans *trans; /* current trans (changes over time) */ + xfs_extlen_t total; /* total blocks needed, for 1st bmap */ + int whichfork; /* data or attribute fork */ + xfs_dablk_t blkno; /* blkno of attr leaf of interest */ + int index; /* index of attr of interest in blk */ + xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ + int rmtblkcnt; /* remote attr value block count */ + xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ + int index2; /* index of 2nd attr in blk */ + xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ + int rmtblkcnt2; /* remote attr value block count */ + int op_flags; /* operation flags */ + enum xfs_dacmp cmpresult; /* name compare result for lookups */ +} xfs_da_args_t; + +/* + * Operation flags: + */ +#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ +#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ +#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ +#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ + +#define XFS_DA_OP_FLAGS \ + { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ + { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ + { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ + { XFS_DA_OP_CILOOKUP, "CILOOKUP" } + +/* + * Structure to describe buffer(s) for a block. + * This is needed in the directory version 2 format case, when + * multiple non-contiguous fsblocks might be needed to cover one + * logical directory block. + * If the buffer count is 1 then the data pointer points to the + * same place as the b_addr field for the buffer, else to kmem_alloced memory. + */ +typedef struct xfs_dabuf { + int nbuf; /* number of buffer pointers present */ + short dirty; /* data needs to be copied back */ + short bbcount; /* how large is data in bbs */ + void *data; /* pointer for buffers' data */ +#ifdef XFS_DABUF_DEBUG + inst_t *ra; /* return address of caller to make */ + struct xfs_dabuf *next; /* next in global chain */ + struct xfs_dabuf *prev; /* previous in global chain */ + struct xfs_buftarg *target; /* device for buffer */ + xfs_daddr_t blkno; /* daddr first in bps[0] */ +#endif + struct xfs_buf *bps[1]; /* actually nbuf of these */ +} xfs_dabuf_t; +#define XFS_DA_BUF_SIZE(n) \ + (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1)) + +#ifdef XFS_DABUF_DEBUG +extern xfs_dabuf_t *xfs_dabuf_global_list; +#endif + +/* + * Storage for holding state during Btree searches and split/join ops. + * + * Only need space for 5 intermediate nodes. With a minimum of 62-way + * fanout to the Btree, we can support over 900 million directory blocks, + * which is slightly more than enough. 
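+ * (62^5 = 916,132,832, hence the "over 900 million" figure for a + * five-level tree at minimum fanout.)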
+ */ +typedef struct xfs_da_state_blk { + xfs_dabuf_t *bp; /* buffer containing block */ + xfs_dablk_t blkno; /* filesystem blkno of buffer */ + xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */ + int index; /* relevant index into block */ + xfs_dahash_t hashval; /* last hash value in block */ + int magic; /* blk's magic number, ie: blk type */ +} xfs_da_state_blk_t; + +typedef struct xfs_da_state_path { + int active; /* number of active levels */ + xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH]; +} xfs_da_state_path_t; + +typedef struct xfs_da_state { + xfs_da_args_t *args; /* filename arguments */ + struct xfs_mount *mp; /* filesystem mount point */ + unsigned int blocksize; /* logical block size */ + unsigned int node_ents; /* how many entries in danode */ + xfs_da_state_path_t path; /* search/split paths */ + xfs_da_state_path_t altpath; /* alternate path for join */ + unsigned char inleaf; /* insert into 1->lf, 0->splf */ + unsigned char extravalid; /* T/F: extrablk is in use */ + unsigned char extraafter; /* T/F: extrablk is after new */ + xfs_da_state_blk_t extrablk; /* for double-splits on leaves */ + /* for dirv2 extrablk is data */ +} xfs_da_state_t; + +/* + * Utility macros to aid in logging changed structure fields. + */ +#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE)) +#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \ + (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ + (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) + +/* + * Name ops for directory and/or attr name operations + */ +struct xfs_nameops { + xfs_dahash_t (*hashname)(struct xfs_name *); + enum xfs_dacmp (*compname)(struct xfs_da_args *, + const unsigned char *, int); +}; + + +/*======================================================================== + * Function prototypes. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, + xfs_dabuf_t **bpp, int whichfork); +int xfs_da_split(xfs_da_state_t *state); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_da_join(xfs_da_state_t *state); +void xfs_da_fixhashpath(xfs_da_state_t *state, + xfs_da_state_path_t *path_to_to_fix); + +/* + * Routines used for finding things in the Btree. + */ +int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result); +int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, + int forward, int release, int *result); +/* + * Utility routines. + */ +int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, + xfs_da_state_blk_t *new_blk); + +/* + * Utility routines. 
+ */ +int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); +int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + xfs_dabuf_t **bp, int whichfork); +int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, int whichfork); +xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, int whichfork); +int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, + xfs_dabuf_t *dead_buf); + +uint xfs_da_hashname(const __uint8_t *name_string, int name_length); +enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, + const unsigned char *name, int len); + + +xfs_da_state_t *xfs_da_state_alloc(void); +void xfs_da_state_free(xfs_da_state_t *state); + +void xfs_da_buf_done(xfs_dabuf_t *dabuf); +void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first, + uint last); +void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf); +void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf); +xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); + +extern struct kmem_zone *xfs_da_state_zone; +extern struct kmem_zone *xfs_dabuf_zone; +extern const struct xfs_nameops xfs_default_nameops; + +#endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c new file mode 100644 index 00000000..9a84a85c --- /dev/null +++ b/fs/xfs/xfs_dfrag.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_itable.h" +#include "xfs_dfrag.h" +#include "xfs_error.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + + +static int xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp); + +/* + * ioctl interface for swapext + */ +int +xfs_swapext( + xfs_swapext_t *sxp) +{ + xfs_inode_t *ip, *tip; + struct file *file, *tmp_file; + int error = 0; + + /* Pull information for the target fd */ + file = fget((int)sxp->sx_fdtarget); + if (!file) { + error = XFS_ERROR(EINVAL); + goto out; + } + + if (!(file->f_mode & FMODE_WRITE) || + !(file->f_mode & FMODE_READ) || + (file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_file; + } + + tmp_file = fget((int)sxp->sx_fdtmp); + if (!tmp_file) { + error = XFS_ERROR(EINVAL); + goto out_put_file; + } + + if (!(tmp_file->f_mode & FMODE_WRITE) || + !(tmp_file->f_mode & FMODE_READ) || + (tmp_file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_tmp_file; + } + + if (IS_SWAPFILE(file->f_path.dentry->d_inode) || + IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + ip = XFS_I(file->f_path.dentry->d_inode); + tip = XFS_I(tmp_file->f_path.dentry->d_inode); + + if (ip->i_mount != tip->i_mount) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (ip->i_ino == tip->i_ino) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = XFS_ERROR(EIO); + goto out_put_tmp_file; + } + + error = xfs_swap_extents(ip, tip, sxp); + + out_put_tmp_file: + fput(tmp_file); + out_put_file: + fput(file); + out: + return error; +} + +/* + * We need to check that the format of the data fork in the temporary inode is + * valid for the target inode before doing the swap. This is not a problem with + * attr1 because of the fixed fork offset, but attr2 has a dynamically sized + * data fork depending on the space the attribute fork is taking so we can get + * invalid formats on the target inode. + * + * E.g. target has space for 7 extents in extent format, temp inode only has + * space for 6. If we defragment down to 7 extents, then the tmp format is a + * btree, but when swapped it needs to be in extent format. Hence we can't just + * blindly swap data forks on attr2 filesystems. + * + * Note that we check the swap in both directions so that we don't end up with + * a corrupt temporary inode, either. + * + * Note that fixing the way xfs_fsr sets up the attribute fork in the source + * inode will prevent this situation from occurring, so all we do here is + * reject and log the attempt. basically we are putting the responsibility on + * userspace to get this right. 
+ */ +static int +xfs_swap_extents_check_format( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip) /* tmp inode */ +{ + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return EINVAL; + + /* + * if the target inode has less extents that then temporary inode then + * why did userspace call us? + */ + if (ip->i_d.di_nextents < tip->i_d.di_nextents) + return EINVAL; + + /* + * if the target inode is in extent form and the temp inode is in btree + * form then we will end up with the target inode in the wrong format + * as we already know there are less extents in the temp inode. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + return EINVAL; + + /* Check temp in extent form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) + return EINVAL; + + /* Check target in extent form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) + return EINVAL; + + /* + * If we are in a btree format, check that the temp root block will fit + * in the target and that it has enough extents to be in btree format + * in the target. + * + * Note that we have to be careful to allow btree->extent conversions + * (a common defrag case) which will occur when the temp inode is in + * extent format... + */ + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && + ((XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) + return EINVAL; + + /* Reciprocal target->temp btree format checks */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + ((XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) + return EINVAL; + + return 0; +} + +static int +xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + xfs_bstat_t *sbp = &sxp->sx_stat; + xfs_ifork_t *tempifp, *ifp, *tifp; + int ilf_fields, tilf_fields; + int error = 0; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); + if (!tempifp) { + error = XFS_ERROR(ENOMEM); + goto out; + } + + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. 
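+ * (Both IOLOCKs are taken in one call and both ILOCKs in a second; + * the ILOCKs are dropped and re-taken around the transaction + * allocation further down.)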
+ */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + + /* Verify that both files have the same format */ + if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + if (VN_CACHED(VFS_I(tip)) != 0) { + error = xfs_flushinval_pages(tip, 0, -1, + FI_REMAPF_LOCKED); + if (error) + goto out_unlock; + } + + /* Verify O_DIRECT for ftmp */ + if (VN_CACHED(VFS_I(tip)) != 0) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + /* Verify all data are being swapped */ + if (sxp->sx_offset != 0 || + sxp->sx_length != ip->i_d.di_size || + sxp->sx_length != tip->i_d.di_size) { + error = XFS_ERROR(EFAULT); + goto out_unlock; + } + + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_notice(mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __func__, ip->i_ino); + goto out_unlock; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. + */ + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + error = XFS_ERROR(EBUSY); + goto out_unlock; + } + + /* We need to fail if the file is memory mapped. Once we have tossed + * all existing pages, the page fault will have no option + * but to go to the filesystem for pages. By making the page fault call + * vop_read (or write in the case of autogrow) they block on the iolock + * until we have switched the extents. + */ + if (VN_MAPPED(VFS_I(ip))) { + error = XFS_ERROR(EBUSY); + goto out_unlock; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); + + /* + * There is a race condition here since we gave up the + * ilock. However, the data fork will not change since + * we have the iolock (locked for truncation too) so we + * are safe. We don't really care if non-io related + * fields change. 
+ */ + + xfs_tosspages(ip, 0, -1, FI_REMAPF); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + if ((error = xfs_trans_reserve(tp, 0, + XFS_ICHANGE_LOG_RES(mp), 0, + 0, 0))) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_IOLOCK_EXCL); + xfs_trans_cancel(tp, 0); + goto out; + } + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + + /* + * Count the number of extended attribute blocks + */ + if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && + (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + if (error) + goto out_trans_cancel; + } + if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && + (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + &taforkblks); + if (error) + goto out_trans_cancel; + } + + /* + * Swap the data forks of the inodes + */ + ifp = &ip->i_df; + tifp = &tip->i_df; + *tempifp = *ifp; /* struct copy */ + *ifp = *tifp; /* struct copy */ + *tifp = *tempifp; /* struct copy */ + + /* + * Fix the in-memory data fork values that are dependent on the fork + * offset in the inode. We can't assume they remain the same as attr2 + * has dynamic fork offsets. + */ + ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + + /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks; + ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; + tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + + tmp = (__uint64_t) ip->i_d.di_nextents; + ip->i_d.di_nextents = tip->i_d.di_nextents; + tip->i_d.di_nextents = tmp; + + tmp = (__uint64_t) ip->i_d.di_format; + ip->i_d.di_format = tip->i_d.di_format; + tip->i_d.di_format = tmp; + + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + + ilf_fields = XFS_ILOG_CORE; + + switch(ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + ifp->if_u1.if_extents = + ifp->if_u2.if_inline_ext; + } + ilf_fields |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + ilf_fields |= XFS_ILOG_DBROOT; + break; + } + + tilf_fields = XFS_ILOG_CORE; + + switch(tip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. 
+ */ + if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + tifp->if_u1.if_extents = + tifp->if_u2.if_inline_ext; + } + tilf_fields |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + tilf_fields |= XFS_ILOG_DBROOT; + break; + } + + + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + xfs_trans_log_inode(tp, ip, ilf_fields); + xfs_trans_log_inode(tp, tip, tilf_fields); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); + + trace_xfs_swap_extent_after(ip, 0); + trace_xfs_swap_extent_after(tip, 1); +out: + kmem_free(tempifp); + return error; + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + goto out; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + goto out_unlock; +} diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h new file mode 100644 index 00000000..20bdd935 --- /dev/null +++ b/fs/xfs/xfs_dfrag.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DFRAG_H__ +#define __XFS_DFRAG_H__ + +/* + * Structure passed to xfs_swapext + */ + +typedef struct xfs_swapext +{ + __int64_t sx_version; /* version */ + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} xfs_swapext_t; + +/* + * Version flag + */ +#define XFS_SX_VERSION 0 + +#ifdef __KERNEL__ +/* + * Prototypes for visible xfs_dfrag.c routines. + */ + +/* + * Syscall interface for xfs_swapext + */ +int xfs_swapext(struct xfs_swapext *sx); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_DFRAG_H__ */ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h new file mode 100644 index 00000000..dffba9ba --- /dev/null +++ b/fs/xfs/xfs_dinode.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DINODE_H__ +#define __XFS_DINODE_H__ + +#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ +#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2)) + +typedef struct xfs_timestamp { + __be32 t_sec; /* timestamp seconds */ + __be32 t_nsec; /* timestamp nanoseconds */ +} xfs_timestamp_t; + +/* + * On-disk inode structure. + * + * This is just the header or "dinode core", the inode is expanded to fill a + * variable size the leftover area split into a data and an attribute fork. + * The format of the data and attribute fork depends on the format of the + * inode as indicated by di_format and di_aformat. To access the data and + * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros + * below. + * + * There is a very similar struct icdinode in xfs_inode which matches the + * layout of the first 96 bytes of this structure, but is kept in native + * format instead of big endian. + */ +typedef struct xfs_dinode { + __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __be16 di_mode; /* mode and type of file */ + __u8 di_version; /* inode version */ + __u8 di_format; /* format of di_c data */ + __be16 di_onlink; /* old number of links to file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number of links to file */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ + __u8 di_pad[6]; /* unused, zeroed space */ + __be16 di_flushiter; /* incremented on flush */ + xfs_timestamp_t di_atime; /* time last accessed */ + xfs_timestamp_t di_mtime; /* time last modified */ + xfs_timestamp_t di_ctime; /* time created/inode modified */ + __be64 di_size; /* number of bytes in file */ + __be64 di_nblocks; /* # of direct & btree blocks used */ + __be32 di_extsize; /* basic/minimum extent size for file */ + __be32 di_nextents; /* number of extents in data fork */ + __be16 di_anextents; /* number of extents in attribute fork*/ + __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ + __s8 di_aformat; /* format of attr fork's data */ + __be32 di_dmevmask; /* DMIG event mask */ + __be16 di_dmstate; /* DMIG state info */ + __be16 di_flags; /* random flags, XFS_DIFLAG_... */ + __be32 di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + __be32 di_next_unlinked;/* agi unlinked list ptr */ +} __attribute__((packed)) xfs_dinode_t; + +#define DI_MAX_FLUSH 0xffff + +/* + * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. + * Since the pathconf interface is signed, we use 2^31 - 1 instead. + * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. + */ +#define XFS_MAXLINK ((1U << 31) - 1U) +#define XFS_MAXLINK_1 65535U + +/* + * Values for di_format + */ +typedef enum xfs_dinode_fmt { + XFS_DINODE_FMT_DEV, /* xfs_dev_t */ + XFS_DINODE_FMT_LOCAL, /* bulk data */ + XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ + XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ + XFS_DINODE_FMT_UUID /* uuid_t */ +} xfs_dinode_fmt_t; + +/* + * Inode minimum and maximum sizes. 
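+ * (MIN_LOG 8 and MAX_LOG 11 correspond to 256-byte and 2048-byte + * inodes respectively.)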
+ */ +#define XFS_DINODE_MIN_LOG 8 +#define XFS_DINODE_MAX_LOG 11 +#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) +#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) + +/* + * Inode size for given fs. + */ +#define XFS_LITINO(mp) \ + ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) + +#define XFS_BROOT_SIZE_ADJ \ + (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) + +/* + * Inode data & attribute fork sizes, per inode. + */ +#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) +#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) + +#define XFS_DFORK_DSIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp)) +#define XFS_DFORK_ASIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \ + 0) +#define XFS_DFORK_SIZE(dip,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_DFORK_DSIZE(dip, mp) : \ + XFS_DFORK_ASIZE(dip, mp)) + +/* + * Return pointers to the data or attribute forks. + */ +#define XFS_DFORK_DPTR(dip) \ + ((char *)(dip) + sizeof(struct xfs_dinode)) +#define XFS_DFORK_APTR(dip) \ + (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) +#define XFS_DFORK_PTR(dip,w) \ + ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) + +#define XFS_DFORK_FORMAT(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + (dip)->di_format : \ + (dip)->di_aformat) +#define XFS_DFORK_NEXTENTS(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + be32_to_cpu((dip)->di_nextents) : \ + be16_to_cpu((dip)->di_anextents)) + +#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) + +/* + * For block and character special files the 32bit dev_t is stored at the + * beginning of the data fork. + */ +static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) +{ + return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); +} + +static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) +{ + *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); +} + +/* + * Values for di_flags + * There should be a one-to-one correspondence between these flags and the + * XFS_XFLAG_s. 
+ */ +#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ +#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ +#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ +#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ +#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ +#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ +#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ +#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ +#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ +#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ +#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ +#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ +#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ +#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) +#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) +#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) +#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) +#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) +#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) +#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) +#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) +#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) +#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) +#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) +#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) +#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) +#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) + +#ifdef CONFIG_XFS_RT +#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +#else +#define XFS_IS_REALTIME_INODE(ip) (0) +#endif + +#define XFS_DIFLAG_ANY \ + (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ + XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) + +#endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c new file mode 100644 index 00000000..dba7a71c --- /dev/null +++ b/fs/xfs/xfs_dir2.c @@ -0,0 +1,764 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_error.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2}; + +/* + * ASCII case-insensitive (ie. A-Z) support for directories that was + * used in IRIX. + */ +STATIC xfs_dahash_t +xfs_ascii_ci_hashname( + struct xfs_name *name) +{ + xfs_dahash_t hash; + int i; + + for (i = 0, hash = 0; i < name->len; i++) + hash = tolower(name->name[i]) ^ rol32(hash, 7); + + return hash; +} + +STATIC enum xfs_dacmp +xfs_ascii_ci_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + enum xfs_dacmp result; + int i; + + if (args->namelen != len) + return XFS_CMP_DIFFERENT; + + result = XFS_CMP_EXACT; + for (i = 0; i < len; i++) { + if (args->name[i] == name[i]) + continue; + if (tolower(args->name[i]) != tolower(name[i])) + return XFS_CMP_DIFFERENT; + result = XFS_CMP_CASE; + } + + return result; +} + +static struct xfs_nameops xfs_ascii_ci_nameops = { + .hashname = xfs_ascii_ci_hashname, + .compname = xfs_ascii_ci_compname, +}; + +void +xfs_dir_mount( + xfs_mount_t *mp) +{ + ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb)); + ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= + XFS_MAX_BLOCKSIZE); + mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); + mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; + mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); + mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); + mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); + mp->m_attr_node_ents = + (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_node_ents = + (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; + if (xfs_sb_version_hasasciici(&mp->m_sb)) + mp->m_dirnameops = &xfs_ascii_ci_nameops; + else + mp->m_dirnameops = &xfs_default_nameops; +} + +/* + * Return 1 if directory contains only "." and "..". + */ +int +xfs_dir_isempty( + xfs_inode_t *dp) +{ + xfs_dir2_sf_t *sfp; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + if (dp->i_d.di_size == 0) /* might happen during shutdown. */ + return 1; + if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + return 0; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + return !sfp->hdr.count; +} + +/* + * Validate a given inode number. 
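+ * The number is split into AG, AG block and block offset; it is rejected
+ * as corrupt unless every piece is in range for this mount and the pieces
+ * reassemble to the original value.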
+ */ +int +xfs_dir_ino_validate( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + xfs_agblock_t agblkno; + xfs_agino_t agino; + xfs_agnumber_t agno; + int ino_ok; + int ioff; + + agno = XFS_INO_TO_AGNO(mp, ino); + agblkno = XFS_INO_TO_AGBNO(mp, ino); + ioff = XFS_INO_TO_OFFSET(mp, ino); + agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff); + ino_ok = + agno < mp->m_sb.sb_agcount && + agblkno < mp->m_sb.sb_agblocks && + agblkno != 0 && + ioff < (1 << mp->m_sb.sb_inopblog) && + XFS_AGINO_TO_INO(mp, agno, agino) == ino; + if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, + XFS_RANDOM_DIR_INO_VALIDATE))) { + xfs_warn(mp, "Invalid inode number 0x%Lx", + (unsigned long long) ino); + XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Initialize a directory with its "." and ".." entries. + */ +int +xfs_dir_init( + xfs_trans_t *tp, + xfs_inode_t *dp, + xfs_inode_t *pdp) +{ + xfs_da_args_t args; + int error; + + memset((char *)&args, 0, sizeof(args)); + args.dp = dp; + args.trans = tp; + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) + return error; + return xfs_dir2_sf_create(&args, pdp->i_ino); +} + +/* + Enter a name in a directory. + */ +int +xfs_dir_createname( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t inum, /* new entry inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + return rval; + XFS_STATS_INC(xs_dir_create); + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.inumber = inum; + args.dp = dp; + args.firstblock = first; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_addname(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_addname(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_addname(&args); + else + rval = xfs_dir2_node_addname(&args); + return rval; +} + +/* + * If doing a CI lookup and case-insensitive match, dup actual name into + * args.value. Return EEXIST for success (ie. name found) or an error. + */ +int +xfs_dir_cilookup_result( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (args->cmpresult == XFS_CMP_DIFFERENT) + return ENOENT; + if (args->cmpresult != XFS_CMP_CASE || + !(args->op_flags & XFS_DA_OP_CILOOKUP)) + return EEXIST; + + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); + if (!args->value) + return ENOMEM; + + memcpy(args->value, name, len); + args->valuelen = len; + return EEXIST; +} + +/* + * Lookup a name in a directory, give back the inode number. + * If ci_name is not NULL, returns the actual name in ci_name if it differs + * to name, or ci_name->name is set to NULL for an exact match. 
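+ * On a case-insensitive match ci_name->name points at a buffer allocated by
+ * xfs_dir_cilookup_result(); the caller is expected to free it.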
+ */ + +int +xfs_dir_lookup( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_lookup); + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.dp = dp; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.op_flags = XFS_DA_OP_OKNOENT; + if (ci_name) + args.op_flags |= XFS_DA_OP_CILOOKUP; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_lookup(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_lookup(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_lookup(&args); + else + rval = xfs_dir2_node_lookup(&args); + if (rval == EEXIST) + rval = 0; + if (!rval) { + *inum = args.inumber; + if (ci_name) { + ci_name->name = args.value; + ci_name->len = args.valuelen; + } + } + return rval; +} + +/* + * Remove an entry from a directory. + */ +int +xfs_dir_removename( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t ino, + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_remove); + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.inumber = ino; + args.dp = dp; + args.firstblock = first; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_removename(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_removename(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_removename(&args); + else + rval = xfs_dir2_node_removename(&args); + return rval; +} + +/* + * Read a directory. + */ +int +xfs_readdir( + xfs_inode_t *dp, + void *dirent, + size_t bufsize, + xfs_off_t *offset, + filldir_t filldir) +{ + int rval; /* return value */ + int v; /* type-checking value */ + + trace_xfs_readdir(dp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_getdents); + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); + else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) + ; + else if (v) + rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); + else + rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, + filldir); + return rval; +} + +/* + * Replace the inode number of a directory entry. 
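+ * The name and its hash stay as they are; only the inode number stored in
+ * the entry changes, and the new number is validated first.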
+ */ +int +xfs_dir_replace( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, /* name of entry to replace */ + xfs_ino_t inum, /* new inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + + if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + return rval; + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.inumber = inum; + args.dp = dp; + args.firstblock = first; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_replace(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_replace(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_replace(&args); + else + rval = xfs_dir2_node_replace(&args); + return rval; +} + +/* + * See if this entry can be added to the directory without allocating space. + * First checks that the caller couldn't reserve enough space (resblks = 0). + */ +int +xfs_dir_canenter( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, /* name of entry to add */ + uint resblks) +{ + xfs_da_args_t args; + int rval; + int v; /* type-checking value */ + + if (resblks) + return 0; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + + memset(&args, 0, sizeof(xfs_da_args_t)); + args.name = name->name; + args.namelen = name->len; + args.hashval = dp->i_mount->m_dirnameops->hashname(name); + args.dp = dp; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | + XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_addname(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_block_addname(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) + return rval; + else if (v) + rval = xfs_dir2_leaf_addname(&args); + else + rval = xfs_dir2_node_addname(&args); + return rval; +} + +/* + * Utility routines. + */ + +/* + * Add a block to the directory. + * This routine is for data and free blocks, not leaf/node blocks + * which are handled by xfs_da_grow_inode. + */ +int +xfs_dir2_grow_inode( + xfs_da_args_t *args, + int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ + xfs_dir2_db_t *dbp) /* out: block number added */ +{ + xfs_fileoff_t bno; /* directory offset of new block */ + int count; /* count of filesystem blocks */ + xfs_inode_t *dp; /* incore directory inode */ + int error; + int got; /* blocks actually mapped */ + int i; + xfs_bmbt_irec_t map; /* single structure for bmap */ + int mapi; /* mapping index */ + xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */ + xfs_mount_t *mp; + int nmap; /* number of bmap entries */ + xfs_trans_t *tp; + xfs_drfsbno_t nblks; + + trace_xfs_dir2_grow_inode(args, space); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + nblks = dp->i_d.di_nblocks; + /* + * Set lowest possible block in the space requested. + */ + bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); + count = mp->m_dirblkfsbs; + /* + * Find the first hole for our block. 
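+ * The search starts at the XFS_DIR2_*_SPACE offset set up above, so data,
+ * leaf and freeindex blocks end up in well-separated ranges of the file.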
+ */ + if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) + return error; + nmap = 1; + ASSERT(args->firstblock != NULL); + /* + * Try mapping the new block contiguously (one extent). + */ + if ((error = xfs_bmapi(tp, dp, bno, count, + XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, + args->firstblock, args->total, &map, &nmap, + args->flist))) + return error; + ASSERT(nmap <= 1); + if (nmap == 1) { + mapp = ↦ + mapi = 1; + } + /* + * Didn't work and this is a multiple-fsb directory block. + * Try again with contiguous flag turned on. + */ + else if (nmap == 0 && count > 1) { + xfs_fileoff_t b; /* current file offset */ + + /* + * Space for maximum number of mappings. + */ + mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + /* + * Iterate until we get to the end of our block. + */ + for (b = bno, mapi = 0; b < bno + count; ) { + int c; /* current fsb count */ + + /* + * Can't map more than MAX_NMAP at once. + */ + nmap = MIN(XFS_BMAP_MAX_NMAP, count); + c = (int)(bno + count - b); + if ((error = xfs_bmapi(tp, dp, b, c, + XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, + args->firstblock, args->total, + &mapp[mapi], &nmap, args->flist))) { + kmem_free(mapp); + return error; + } + if (nmap < 1) + break; + /* + * Add this bunch into our table, go to the next offset. + */ + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } + } + /* + * Didn't work. + */ + else { + mapi = 0; + mapp = NULL; + } + /* + * See how many fsb's we got. + */ + for (i = 0, got = 0; i < mapi; i++) + got += mapp[i].br_blockcount; + /* + * Didn't get enough fsb's, or the first/last block's are wrong. + */ + if (got != count || mapp[0].br_startoff != bno || + mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != + bno + count) { + if (mapp != &map) + kmem_free(mapp); + return XFS_ERROR(ENOSPC); + } + /* + * Done with the temporary mapping table. + */ + if (mapp != &map) + kmem_free(mapp); + + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; + *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); + + /* + * Update file's size if this is the data space and it grew. + */ + if (space == XFS_DIR2_DATA_SPACE) { + xfs_fsize_t size; /* directory file (data) size */ + + size = XFS_FSB_TO_B(mp, bno + count); + if (size > dp->i_d.di_size) { + dp->i_d.di_size = size; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + } + return 0; +} + +/* + * See if the directory is a single-block form directory. + */ +int +xfs_dir2_isblock( + xfs_trans_t *tp, + xfs_inode_t *dp, + int *vp) /* out: 1 is block, 0 is not block */ +{ + xfs_fileoff_t last; /* last file offset */ + xfs_mount_t *mp; + int rval; + + mp = dp->i_mount; + if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + return rval; + rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize; + ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize); + *vp = rval; + return 0; +} + +/* + * See if the directory is a single-leaf form directory. + */ +int +xfs_dir2_isleaf( + xfs_trans_t *tp, + xfs_inode_t *dp, + int *vp) /* out: 1 is leaf, 0 is not leaf */ +{ + xfs_fileoff_t last; /* last file offset */ + xfs_mount_t *mp; + int rval; + + mp = dp->i_mount; + if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + return rval; + *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog); + return 0; +} + +/* + * Remove the given block from the directory. + * This routine is used for data and free blocks, leaf/node are done + * by xfs_da_shrink_inode. 
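+ * Once the block is unmapped and its buffer invalidated, the directory size
+ * is trimmed back if the block removed was the last data block in the file.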
+ */ +int +xfs_dir2_shrink_inode( + xfs_da_args_t *args, + xfs_dir2_db_t db, + xfs_dabuf_t *bp) +{ + xfs_fileoff_t bno; /* directory file offset */ + xfs_dablk_t da; /* directory file offset */ + int done; /* bunmap is finished */ + xfs_inode_t *dp; + int error; + xfs_mount_t *mp; + xfs_trans_t *tp; + + trace_xfs_dir2_shrink_inode(args, db); + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + da = xfs_dir2_db_to_da(mp, db); + /* + * Unmap the fsblock(s). + */ + if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, + XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, + &done))) { + /* + * ENOSPC actually can happen if we're in a removename with + * no space reservation, and the resulting block removal + * would cause a bmap btree split or conversion from extents + * to btree. This can only happen for un-fragmented + * directory blocks, since you need to be punching out + * the middle of an extent. + * In this case we need to leave the block in the file, + * and not binval it. + * So the block has to be in a consistent empty state + * and appropriately logged. + * We don't free up the buffer, the caller can tell it + * hasn't happened since it got an error back. + */ + return error; + } + ASSERT(done); + /* + * Invalidate the buffer from the transaction. + */ + xfs_da_binval(tp, bp); + /* + * If it's not a data block, we're done. + */ + if (db >= XFS_DIR2_LEAF_FIRSTDB(mp)) + return 0; + /* + * If the block isn't the last one in the directory, we're done. + */ + if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0)) + return 0; + bno = da; + if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { + /* + * This can't really happen unless there's kernel corruption. + */ + return error; + } + if (db == mp->m_dirdatablk) + ASSERT(bno == 0); + else + ASSERT(bno > 0); + /* + * Set the size to the new last block. + */ + dp->i_d.di_size = XFS_FSB_TO_B(mp, bno); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h new file mode 100644 index 00000000..74a3b105 --- /dev/null +++ b/fs/xfs/xfs_dir2.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_H__ +#define __XFS_DIR2_H__ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_put_args; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Directory version 2. + * There are 4 possible formats: + * shortform + * single block - data with embedded leaf at the end + * multiple data blocks, single leaf+freeindex block + * data blocks, node&leaf blocks (btree), freeindex blocks + * + * The shortform format is in xfs_dir2_sf.h. + * The single block format is in xfs_dir2_block.h. + * The data block format is in xfs_dir2_data.h. + * The leaf and freeindex block formats are in xfs_dir2_leaf.h. 
+ * Node blocks are the same as the other version, in xfs_da_btree.h. + */ + +/* + * Byte offset in data block and shortform entry. + */ +typedef __uint16_t xfs_dir2_data_off_t; +#define NULLDATAOFF 0xffffU +typedef uint xfs_dir2_data_aoff_t; /* argument form */ + +/* + * Directory block number (logical dirblk in file) + */ +typedef __uint32_t xfs_dir2_db_t; + +/* + * Byte offset in a directory. + */ +typedef xfs_off_t xfs_dir2_off_t; + +extern struct xfs_name xfs_name_dotdot; + +/* + * Generic directory interface routines + */ +extern void xfs_dir_startup(void); +extern void xfs_dir_mount(struct xfs_mount *mp); +extern int xfs_dir_isempty(struct xfs_inode *dp); +extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_inode *pdp); +extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t inum, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, xfs_extlen_t tot); +extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t *inum, + struct xfs_name *ci_name); +extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t ino, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, xfs_extlen_t tot); +extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, xfs_ino_t inum, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, xfs_extlen_t tot); +extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_name *name, uint resblks); +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); + +/* + * Utility routines for v2 directories. + */ +extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, + xfs_dir2_db_t *dbp); +extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, + int *vp); +extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, + int *vp); +extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, + struct xfs_dabuf *bp); + +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, + const unsigned char *name, int len); + +#endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c new file mode 100644 index 00000000..580d99ce --- /dev/null +++ b/fs/xfs/xfs_dir2_block.c @@ -0,0 +1,1233 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Local function prototypes. + */ +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first, + int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp, + int *entno); +static int xfs_dir2_block_sort(const void *a, const void *b); + +static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; + +/* + * One-time startup routine called from xfs_init(). + */ +void +xfs_dir_startup(void) +{ + xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1); + xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); +} + +/* + * Add an entry to a block directory. + */ +int /* error */ +xfs_dir2_block_addname( + xfs_da_args_t *args) /* directory op arguments */ +{ + xfs_dir2_data_free_t *bf; /* bestfree table in block */ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + int compact; /* need to compact leaf ents */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* directory inode */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + int error; /* error return value */ + xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */ + xfs_dahash_t hash; /* hash value of found entry */ + int high; /* high index for binary srch */ + int highstale; /* high stale index */ + int lfloghigh=0; /* last final leaf to log */ + int lfloglow=0; /* first final leaf to log */ + int len; /* length of the new entry */ + int low; /* low index for binary srch */ + int lowstale; /* low stale index */ + int mid=0; /* midpoint for binary srch */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log header */ + int needscan; /* need to rescan freespace */ + __be16 *tagp; /* pointer to tag value */ + xfs_trans_t *tp; /* transaction structure */ + + trace_xfs_dir2_block_addname(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the (one and only) directory block into dabuf bp. + */ + if ((error = + xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + block = bp->data; + /* + * Check the magic number, corrupted if wrong. + */ + if (unlikely(be32_to_cpu(block->hdr.magic) != XFS_DIR2_BLOCK_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", + XFS_ERRLEVEL_LOW, mp, block); + xfs_da_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + len = xfs_dir2_data_entsize(args->namelen); + /* + * Set up pointers to parts of the block. + */ + bf = block->hdr.bestfree; + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * No stale entries? 
Need space for entry and new leaf. + */ + if (!btp->stale) { + /* + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + /* + * Data object just before the first leaf entry. + */ + enddup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry. + */ + if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG) + dup = enddup = NULL; + /* + * Check out the biggest freespace and see if it's the same one. + */ + else { + dup = (xfs_dir2_data_unused_t *) + ((char *)block + be16_to_cpu(bf[0].offset)); + if (dup == enddup) { + /* + * It is the biggest freespace, is it too small + * to hold the new leaf too? + */ + if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { + /* + * Yes, we use the second-largest + * entry instead if it works. + */ + if (be16_to_cpu(bf[1].length) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)block + + be16_to_cpu(bf[1].offset)); + else + dup = NULL; + } + } else { + /* + * Not the same free entry, + * just check its length. + */ + if (be16_to_cpu(dup->length) < len) { + dup = NULL; + } + } + } + compact = 0; + } + /* + * If there are stale entries we'll use one for the leaf. + * Is the biggest entry enough to avoid compaction? + */ + else if (be16_to_cpu(bf[0].length) >= len) { + dup = (xfs_dir2_data_unused_t *) + ((char *)block + be16_to_cpu(bf[0].offset)); + compact = 0; + } + /* + * Will need to compact to make this work. + */ + else { + /* + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + /* + * Data object just before the first leaf entry. + */ + dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + compact = 1; + } + /* + * If this isn't a real add, we're done with the buffer. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + xfs_da_brelse(tp, bp); + /* + * If we don't have space for the new entry & leaf ... + */ + if (!dup) { + /* + * Not trying to actually do anything, or don't have + * a space reservation: return no-space. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + return XFS_ERROR(ENOSPC); + /* + * Convert to the next larger format. + * Then add the new entry in that format. + */ + error = xfs_dir2_block_to_leaf(args, bp); + xfs_da_buf_done(bp); + if (error) + return error; + return xfs_dir2_leaf_addname(args); + } + /* + * Just checking, and it would work, so say so. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + needlog = needscan = 0; + /* + * If need to compact the leaf entries, do it now. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. 
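+ * One stale entry is kept because the insertion below reuses a stale slot
+ * for the new leaf entry instead of growing the leaf table again.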
+ */ + if (compact) { + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + + for (fromidx = toidx = be32_to_cpu(btp->count) - 1, + highstale = lfloghigh = -1; + fromidx >= 0; + fromidx--) { + if (be32_to_cpu(blp[fromidx].address) == XFS_DIR2_NULL_DATAPTR) { + if (highstale == -1) + highstale = toidx; + else { + if (lfloghigh == -1) + lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); + lfloghigh -= be32_to_cpu(btp->stale) - 1; + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), + (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), + &needlog, &needscan); + blp += be32_to_cpu(btp->stale) - 1; + btp->stale = cpu_to_be32(1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) { + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + needscan = 0; + } + } + /* + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. + */ + else if (btp->stale) { + lfloglow = be32_to_cpu(btp->count); + lfloghigh = -1; + } + /* + * Find the slot that's first lower than our hash value, -1 if none. + */ + for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) { + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) + break; + if (hash < args->hashval) + low = mid + 1; + else + high = mid - 1; + } + while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) { + mid--; + } + /* + * No stale entries, will use enddup space to hold new leaf. + */ + if (!btp->stale) { + /* + * Mark the space needed for the new leaf entry, now in use. + */ + xfs_dir2_data_use_free(tp, bp, enddup, + (xfs_dir2_data_aoff_t) + ((char *)enddup - (char *)block + be16_to_cpu(enddup->length) - + sizeof(*blp)), + (xfs_dir2_data_aoff_t)sizeof(*blp), + &needlog, &needscan); + /* + * Update the tail (entry count). + */ + be32_add_cpu(&btp->count, 1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) { + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, + &needlog); + needscan = 0; + } + /* + * Adjust pointer to the first leaf entry, we're about to move + * the table up one to open up space for the new leaf entry. + * Then adjust our index to match. + */ + blp--; + mid++; + if (mid) + memmove(blp, &blp[1], mid * sizeof(*blp)); + lfloglow = 0; + lfloghigh = mid; + } + /* + * Use a stale leaf for our new entry. + */ + else { + for (lowstale = mid; + lowstale >= 0 && + be32_to_cpu(blp[lowstale].address) != XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + for (highstale = mid + 1; + highstale < be32_to_cpu(btp->count) && + be32_to_cpu(blp[highstale].address) != XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || mid - lowstale > highstale - mid); + highstale++) + continue; + /* + * Move entries toward the low-numbered stale entry. + */ + if (lowstale >= 0 && + (highstale == be32_to_cpu(btp->count) || + mid - lowstale <= highstale - mid)) { + if (mid - lowstale) + memmove(&blp[lowstale], &blp[lowstale + 1], + (mid - lowstale) * sizeof(*blp)); + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(mid, lfloghigh); + } + /* + * Move entries toward the high-numbered stale entry. 
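+ * (Whichever stale slot sits closer to the insertion point is used, so the
+ * memmove shifts as few leaf entries as possible.)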
+ */ + else { + ASSERT(highstale < be32_to_cpu(btp->count)); + mid++; + if (highstale - mid) + memmove(&blp[mid + 1], &blp[mid], + (highstale - mid) * sizeof(*blp)); + lfloglow = MIN(mid, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + be32_add_cpu(&btp->stale, -1); + } + /* + * Point to the new data entry. + */ + dep = (xfs_dir2_data_entry_t *)dup; + /* + * Fill in the leaf entry. + */ + blp[mid].hashval = cpu_to_be32(args->hashval); + blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + (char *)dep - (char *)block)); + xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); + /* + * Mark space for the data entry used. + */ + xfs_dir2_data_use_free(tp, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + (xfs_dir2_data_aoff_t)len, &needlog, &needscan); + /* + * Create the new data entry. + */ + dep->inumber = cpu_to_be64(args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, args->namelen); + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)block); + /* + * Clean up the bestfree array and log the header, tail, and entry. + */ + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + if (needlog) + xfs_dir2_data_log_header(tp, bp); + xfs_dir2_block_log_tail(tp, bp); + xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_check(dp, bp); + xfs_da_buf_done(bp); + return 0; +} + +/* + * Readdir for block directories. + */ +int /* error */ +xfs_dir2_block_getdents( + xfs_inode_t *dp, /* incore inode */ + void *dirent, + xfs_off_t *offset, + filldir_t filldir) +{ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dabuf_t *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + char *endptr; /* end of the data entries */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + char *ptr; /* current data entry */ + int wantoff; /* starting block offset */ + xfs_off_t cook; + + mp = dp->i_mount; + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { + return 0; + } + /* + * Can't read the block, give up, else get dabuf in bp. + */ + error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, + &bp, XFS_DATA_FORK); + if (error) + return error; + + ASSERT(bp != NULL); + /* + * Extract the byte offset we start at from the seek pointer. + * We'll skip entries before this. + */ + wantoff = xfs_dir2_dataptr_to_off(mp, *offset); + block = bp->data; + xfs_dir2_data_check(dp, bp); + /* + * Set up values for the loop. + */ + btp = xfs_dir2_block_tail_p(mp, block); + ptr = (char *)block->u; + endptr = (char *)xfs_dir2_block_leaf_p(btp); + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + while (ptr < endptr) { + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * Unused, skip it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + + /* + * Bump pointer for the next iteration. + */ + ptr += xfs_dir2_data_entsize(dep->namelen); + /* + * The entry is before the desired starting point, skip it. 
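+ * The caller's offset is a dataptr cookie, so a restart resumes at the
+ * first entry at or beyond the saved block offset.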
+ */ + if ((char *)dep - (char *)block < wantoff) + continue; + + cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + (char *)dep - (char *)block); + + /* + * If it didn't fit, set the final offset to here & return. + */ + if (filldir(dirent, (char *)dep->name, dep->namelen, + cook & 0x7fffffff, be64_to_cpu(dep->inumber), + DT_UNKNOWN)) { + *offset = cook & 0x7fffffff; + xfs_da_brelse(NULL, bp); + return 0; + } + } + + /* + * Reached the end of the block. + * Set the offset to a non-existent block 1 and return. + */ + *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + 0x7fffffff; + xfs_da_brelse(NULL, bp); + return 0; +} + +/* + * Log leaf entries from the block. + */ +static void +xfs_dir2_block_log_leaf( + xfs_trans_t *tp, /* transaction structure */ + xfs_dabuf_t *bp, /* block buffer */ + int first, /* index of first logged leaf */ + int last) /* index of last logged leaf */ +{ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_mount_t *mp; /* filesystem mount point */ + + mp = tp->t_mountp; + block = bp->data; + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block), + (uint)((char *)&blp[last + 1] - (char *)block - 1)); +} + +/* + * Log the block tail. + */ +static void +xfs_dir2_block_log_tail( + xfs_trans_t *tp, /* transaction structure */ + xfs_dabuf_t *bp) /* block buffer */ +{ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_mount_t *mp; /* filesystem mount point */ + + mp = tp->t_mountp; + block = bp->data; + btp = xfs_dir2_block_tail_p(mp, block); + xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block), + (uint)((char *)(btp + 1) - (char *)block - 1)); +} + +/* + * Look up an entry in the block. This is the external routine, + * xfs_dir2_block_lookup_int does the real work. + */ +int /* error */ +xfs_dir2_block_lookup( + xfs_da_args_t *args) /* dir lookup arguments */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* entry index */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + + trace_xfs_dir2_block_lookup(args); + + /* + * Get the buffer, look up the entry. + * If not found (ENOENT) then return, have no buffer. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) + return error; + dp = args->dp; + mp = dp->i_mount; + block = bp->data; + xfs_dir2_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Get the offset from the leaf entry, to point to the data. + */ + dep = (xfs_dir2_data_entry_t *)((char *)block + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + /* + * Fill in inode number, CI name if appropriate, release the block. + */ + args->inumber = be64_to_cpu(dep->inumber); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + xfs_da_brelse(args->trans, bp); + return XFS_ERROR(error); +} + +/* + * Internal block lookup routine. 
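+ * On success the block buffer is handed back in *bpp and *entno is the index
+ * of the matching leaf entry; on ENOENT the buffer is released before return.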
+ */ +static int /* error */ +xfs_dir2_block_lookup_int( + xfs_da_args_t *args, /* dir lookup arguments */ + xfs_dabuf_t **bpp, /* returned block buffer */ + int *entno) /* returned entry number */ +{ + xfs_dir2_dataptr_t addr; /* data entry address */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int error; /* error return value */ + xfs_dahash_t hash; /* found hash value */ + int high; /* binary search high index */ + int low; /* binary search low index */ + int mid; /* binary search current idx */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the buffer, return error if we can't get it. + */ + if ((error = + xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + block = bp->data; + xfs_dir2_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Loop doing a binary search for our hash value. + * Find our entry, ENOENT if it's not there. + */ + for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) { + ASSERT(low <= high); + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) + break; + if (hash < args->hashval) + low = mid + 1; + else + high = mid - 1; + if (low > high) { + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + xfs_da_brelse(tp, bp); + return XFS_ERROR(ENOENT); + } + } + /* + * Back up to the first one with the right hash value. + */ + while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) { + mid--; + } + /* + * Now loop forward through all the entries with the + * right hash value looking for our name. + */ + do { + if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Get pointer to the entry from the leaf. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); + /* + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + *bpp = bp; + *entno = mid; + if (cmp == XFS_CMP_EXACT) + return 0; + } + } while (++mid < be32_to_cpu(btp->count) && + be32_to_cpu(blp[mid].hashval) == hash); + + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was found earlier, return success. + */ + if (args->cmpresult == XFS_CMP_CASE) + return 0; + /* + * No match, release the buffer and return ENOENT. + */ + xfs_da_brelse(tp, bp); + return XFS_ERROR(ENOENT); +} + +/* + * Remove an entry from a block format directory. + * If that makes the block small enough to fit in shortform, transform it. 
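+ * "Small enough" is judged by xfs_dir2_block_sfsize() against the space
+ * available in the inode's data fork (XFS_IFORK_DSIZE).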
+ */ +int /* error */ +xfs_dir2_block_removename( + xfs_da_args_t *args) /* directory operation args */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* block leaf entry index */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to fixup bestfree */ + xfs_dir2_sf_hdr_t sfh; /* shortform header */ + int size; /* shortform size */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_block_removename(args); + + /* + * Look up the entry in the block. Gets the buffer and entry index. + * It will always be there, the vnodeops level does a lookup first. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { + return error; + } + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + block = bp->data; + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Point to the data entry using the leaf entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + /* + * Mark the data entry's space free. + */ + needlog = needscan = 0; + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)dep - (char *)block), + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + /* + * Fix up the block tail. + */ + be32_add_cpu(&btp->stale, 1); + xfs_dir2_block_log_tail(tp, bp); + /* + * Remove the leaf entry by marking it stale. + */ + blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + xfs_dir2_block_log_leaf(tp, bp, ent, ent); + /* + * Fix up bestfree, log the header if necessary. + */ + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + if (needlog) + xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_check(dp, bp); + /* + * See if the size as a shortform is good enough. + */ + if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > + XFS_IFORK_DSIZE(dp)) { + xfs_da_buf_done(bp); + return 0; + } + /* + * If it works, do the conversion. + */ + return xfs_dir2_block_to_sf(args, bp, size, &sfh); +} + +/* + * Replace an entry in a V2 block directory. + * Change the inode number to the new value. + */ +int /* error */ +xfs_dir2_block_replace( + xfs_da_args_t *args) /* directory operation args */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* leaf entry index */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + + trace_xfs_dir2_block_replace(args); + + /* + * Lookup the entry in the directory. Get buffer and entry index. + * This will always succeed since the caller has already done a lookup. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { + return error; + } + dp = args->dp; + mp = dp->i_mount; + block = bp->data; + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Point to the data entry we need to change. 
+ */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + ASSERT(be64_to_cpu(dep->inumber) != args->inumber); + /* + * Change the inode number to the new value. + */ + dep->inumber = cpu_to_be64(args->inumber); + xfs_dir2_data_log_entry(args->trans, bp, dep); + xfs_dir2_data_check(dp, bp); + xfs_da_buf_done(bp); + return 0; +} + +/* + * Qsort comparison routine for the block leaf entries. + */ +static int /* sort order */ +xfs_dir2_block_sort( + const void *a, /* first leaf entry */ + const void *b) /* second leaf entry */ +{ + const xfs_dir2_leaf_entry_t *la; /* first leaf entry */ + const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */ + + la = a; + lb = b; + return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 : + (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0); +} + +/* + * Convert a V2 leaf directory to a V2 block directory if possible. + */ +int /* error */ +xfs_dir2_leaf_to_block( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp, /* leaf buffer */ + xfs_dabuf_t *dbp) /* data buffer */ +{ + __be16 *bestsp; /* leaf bests table */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused data entry */ + int error; /* error return value */ + int from; /* leaf from index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* file system mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to scan for bestfree */ + xfs_dir2_sf_hdr_t sfh; /* shortform header */ + int size; /* bytes used */ + __be16 *tagp; /* end of entry (tag) */ + int to; /* block/leaf to index */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leaf_to_block(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = lbp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + /* + * If there are data blocks other than the first one, take this + * opportunity to remove trailing empty data blocks that may have + * been left behind during no-space-reservation operations. + * These will show up in the leaf bests table. + */ + while (dp->i_d.di_size > mp->m_dirblksize) { + bestsp = xfs_dir2_leaf_bests_p(ltp); + if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == + mp->m_dirblksize - (uint)sizeof(block->hdr)) { + if ((error = + xfs_dir2_leaf_trim_data(args, lbp, + (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) + goto out; + } else { + error = 0; + goto out; + } + } + /* + * Read the data block if we don't already have it, give up if it fails. + */ + if (dbp == NULL && + (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, + XFS_DATA_FORK))) { + goto out; + } + block = dbp->data; + ASSERT(be32_to_cpu(block->hdr.magic) == XFS_DIR2_DATA_MAGIC); + /* + * Size of the "leaf" area in the block. + */ + size = (uint)sizeof(block->tail) + + (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + /* + * Look at the last data entry. + */ + tagp = (__be16 *)((char *)block + mp->m_dirblksize) - 1; + dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + /* + * If it's not free or is too short we can't do it. 
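+ * Bailing out here is not an error: the directory simply stays in leaf form
+ * and the caller carries on.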
+ */ + if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG || + be16_to_cpu(dup->length) < size) { + error = 0; + goto out; + } + /* + * Start converting it to block form. + */ + block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + needlog = 1; + needscan = 0; + /* + * Use up the space at the end of the block (blp/btp). + */ + xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size, + &needlog, &needscan); + /* + * Initialize the block tail. + */ + btp = xfs_dir2_block_tail_p(mp, block); + btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + btp->stale = 0; + xfs_dir2_block_log_tail(tp, dbp); + /* + * Initialize the block leaf area. We compact out stale entries. + */ + lep = xfs_dir2_block_leaf_p(btp); + for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { + if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) + continue; + lep[to++] = leaf->ents[from]; + } + ASSERT(to == be32_to_cpu(btp->count)); + xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); + /* + * Scan the bestfree if we need it and log the data block header. + */ + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + /* + * Pitch the old leaf block. + */ + error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); + lbp = NULL; + if (error) { + goto out; + } + /* + * Now see if the resulting block can be shrunken to shortform. + */ + if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > + XFS_IFORK_DSIZE(dp)) { + error = 0; + goto out; + } + return xfs_dir2_block_to_sf(args, dbp, size, &sfh); +out: + if (lbp) + xfs_da_buf_done(lbp); + if (dbp) + xfs_da_buf_done(dbp); + return error; +} + +/* + * Convert the shortform directory to block form. + */ +int /* error */ +xfs_dir2_sf_to_block( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dir2_db_t blkno; /* dir-relative block # (0) */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + char *buf; /* sf buffer */ + int buf_len; + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + int dummy; /* trash */ + xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + int endoffset; /* end of data objects */ + int error; /* error return value */ + int i; /* index */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to scan block freespc */ + int newoffset; /* offset from current entry */ + int offset; /* target block offset */ + xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + __be16 *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_name name; + + trace_xfs_dir2_sf_to_block(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bomb out if the shortform directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + /* + * Copy the directory into the stack buffer. 
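+ * (Despite the name, the copy below is made into a kmem_alloc()ed buffer
+ * rather than onto the stack.)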
+ * Then pitch the incore inode data so we can make extents. + */ + + buf_len = dp->i_df.if_bytes; + buf = kmem_alloc(buf_len, KM_SLEEP); + + memcpy(buf, sfp, buf_len); + xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK); + dp->i_d.di_size = 0; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + /* + * Reset pointer - old sfp is gone. + */ + sfp = (xfs_dir2_sf_t *)buf; + /* + * Add block 0 to the inode. + */ + error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); + if (error) { + kmem_free(buf); + return error; + } + /* + * Initialize the data block. + */ + error = xfs_dir2_data_init(args, blkno, &bp); + if (error) { + kmem_free(buf); + return error; + } + block = bp->data; + block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + /* + * Compute size of block "tail" area. + */ + i = (uint)sizeof(*btp) + + (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t); + /* + * The whole thing is initialized to free by the init routine. + * Say we're using the leaf and tail area. + */ + dup = (xfs_dir2_data_unused_t *)block->u; + needlog = needscan = 0; + xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, + &needscan); + ASSERT(needscan == 0); + /* + * Fill in the tail. + */ + btp = xfs_dir2_block_tail_p(mp, block); + btp->count = cpu_to_be32(sfp->hdr.count + 2); /* ., .. */ + btp->stale = 0; + blp = xfs_dir2_block_leaf_p(btp); + endoffset = (uint)((char *)blp - (char *)block); + /* + * Remove the freespace, we'll manage it. + */ + xfs_dir2_data_use_free(tp, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + be16_to_cpu(dup->length), &needlog, &needscan); + /* + * Create entry for . + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATA_DOT_OFFSET); + dep->inumber = cpu_to_be64(dp->i_ino); + dep->namelen = 1; + dep->name[0] = '.'; + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)block); + xfs_dir2_data_log_entry(tp, bp, dep); + blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + (char *)dep - (char *)block)); + /* + * Create entry for .. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET); + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); + dep->namelen = 2; + dep->name[0] = dep->name[1] = '.'; + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)block); + xfs_dir2_data_log_entry(tp, bp, dep); + blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + (char *)dep - (char *)block)); + offset = XFS_DIR2_DATA_FIRST_OFFSET; + /* + * Loop over existing entries, stuff them in. + */ + if ((i = 0) == sfp->hdr.count) + sfep = NULL; + else + sfep = xfs_dir2_sf_firstentry(sfp); + /* + * Need to preserve the existing offset values in the sf directory. + * Insert holes (unused entries) where necessary. + */ + while (offset < endoffset) { + /* + * sfep is null when we reach the end of the list. + */ + if (sfep == NULL) + newoffset = endoffset; + else + newoffset = xfs_dir2_sf_get_offset(sfep); + /* + * There should be a hole here, make one. 
+ */ + if (offset < newoffset) { + dup = (xfs_dir2_data_unused_t *) + ((char *)block + offset); + dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + dup->length = cpu_to_be16(newoffset - offset); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( + ((char *)dup - (char *)block)); + xfs_dir2_data_log_unused(tp, bp, dup); + (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block, + dup, &dummy); + offset += be16_to_cpu(dup->length); + continue; + } + /* + * Copy a real entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset); + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep))); + dep->namelen = sfep->namelen; + memcpy(dep->name, sfep->name, dep->namelen); + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)block); + xfs_dir2_data_log_entry(tp, bp, dep); + name.name = sfep->name; + name.len = sfep->namelen; + blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> + hashname(&name)); + blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + (char *)dep - (char *)block)); + offset = (int)((char *)(tagp + 1) - (char *)block); + if (++i == sfp->hdr.count) + sfep = NULL; + else + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + } + /* Done with the temporary buffer */ + kmem_free(buf); + /* + * Sort the leaf entries by hash value. + */ + xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort); + /* + * Log the leaf entry area and tail. + * Already logged the header in data_init, ignore needlog. + */ + ASSERT(needscan == 0); + xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); + xfs_dir2_block_log_tail(tp, bp); + xfs_dir2_data_check(dp, bp); + xfs_da_buf_done(bp); + return 0; +} diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h new file mode 100644 index 00000000..10e68967 --- /dev/null +++ b/fs/xfs/xfs_dir2_block.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_BLOCK_H__ +#define __XFS_DIR2_BLOCK_H__ + +/* + * xfs_dir2_block.h + * Directory version 2, single block format structures + */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_data_hdr; +struct xfs_dir2_leaf_entry; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * The single block format is as follows: + * xfs_dir2_data_hdr_t structure + * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures + * xfs_dir2_leaf_entry_t structures + * xfs_dir2_block_tail_t structure + */ + +#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: for one block dirs */ + +typedef struct xfs_dir2_block_tail { + __be32 count; /* count of leaf entries */ + __be32 stale; /* count of stale lf entries */ +} xfs_dir2_block_tail_t; + +/* + * Generic single-block structure, for xfs_db. 
+ */ +typedef struct xfs_dir2_block { + xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_union_t u[1]; + xfs_dir2_leaf_entry_t leaf[1]; + xfs_dir2_block_tail_t tail; +} xfs_dir2_block_t; + +/* + * Pointer to the leaf header embedded in a data block (1-block format) + */ +static inline xfs_dir2_block_tail_t * +xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block) +{ + return (((xfs_dir2_block_tail_t *) + ((char *)(block) + (mp)->m_dirblksize)) - 1); +} + +/* + * Pointer to the leaf entries embedded in a data block (1-block format) + */ +static inline struct xfs_dir2_leaf_entry * +xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp) +{ + return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); +} + +/* + * Function declarations. + */ +extern int xfs_dir2_block_addname(struct xfs_da_args *args); +extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, + xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_block_lookup(struct xfs_da_args *args); +extern int xfs_dir2_block_removename(struct xfs_da_args *args); +extern int xfs_dir2_block_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, + struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); +extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); + +#endif /* __XFS_DIR2_BLOCK_H__ */ diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c new file mode 100644 index 00000000..921595b8 --- /dev/null +++ b/fs/xfs/xfs_dir2_data.c @@ -0,0 +1,833 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_error.h" + +#ifdef DEBUG +/* + * Check the consistency of the data block. + * The input can also be a block-format directory. + * Pop an assert if we find anything bad. 
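The two inline helpers above carry the whole trick of the single-block format: the tail is found by backing up one tail-sized structure from the end of the directory block, and the leaf array by backing up btp->count entries from the tail. A minimal userspace sketch of that pointer arithmetic, using simplified host-endian stand-ins (blk_tail and leaf_ent are invented names, not the on-disk types, and the 4k block size is only illustrative):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Invented, host-endian stand-ins for the on-disk tail and leaf entry. */
struct blk_tail { uint32_t count; uint32_t stale; };
struct leaf_ent { uint32_t hashval; uint32_t address; };

int main(void)
{
	size_t dirblksize = 4096;		/* one directory block, illustrative */
	char *block = calloc(1, dirblksize);

	if (!block)
		return 1;
	/* The tail is the last structure in the block... */
	struct blk_tail *btp = (struct blk_tail *)(block + dirblksize) - 1;
	btp->count = 5;

	/* ...and the leaf array ends where the tail begins, count entries long. */
	struct leaf_ent *lep = (struct leaf_ent *)btp - btp->count;

	printf("tail at offset %ld, leaf array at offset %ld\n",
	       (long)((char *)btp - block), (long)((char *)lep - block));
	free(block);
	return 0;
}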
+ */ +void +xfs_dir2_data_check( + xfs_inode_t *dp, /* incore inode pointer */ + xfs_dabuf_t *bp) /* data block's buffer */ +{ + xfs_dir2_dataptr_t addr; /* addr for leaf lookup */ + xfs_dir2_data_free_t *bf; /* bestfree table */ + xfs_dir2_block_tail_t *btp=NULL; /* block tail */ + int count; /* count of entries found */ + xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + char *endp; /* end of useful data */ + int freeseen; /* mask of bestfrees seen */ + xfs_dahash_t hash; /* hash of current name */ + int i; /* leaf index */ + int lastfree; /* last entry was unused */ + xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */ + xfs_mount_t *mp; /* filesystem mount point */ + char *p; /* current data position */ + int stale; /* count of stale leaves */ + struct xfs_name name; + + mp = dp->i_mount; + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + bf = d->hdr.bestfree; + p = (char *)d->u; + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + lep = xfs_dir2_block_leaf_p(btp); + endp = (char *)lep; + } else + endp = (char *)d + mp->m_dirblksize; + count = lastfree = freeseen = 0; + /* + * Account for zero bestfree entries. + */ + if (!bf[0].length) { + ASSERT(!bf[0].offset); + freeseen |= 1 << 0; + } + if (!bf[1].length) { + ASSERT(!bf[1].offset); + freeseen |= 1 << 1; + } + if (!bf[2].length) { + ASSERT(!bf[2].offset); + freeseen |= 1 << 2; + } + ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); + ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); + /* + * Loop over the data/unused entries. + */ + while (p < endp) { + dup = (xfs_dir2_data_unused_t *)p; + /* + * If it's unused, look for the space in the bestfree table. + * If we find it, account for that, else make sure it + * doesn't need to be there. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ASSERT(lastfree == 0); + ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == + (char *)dup - (char *)d); + dfp = xfs_dir2_data_freefind(d, dup); + if (dfp) { + i = (int)(dfp - bf); + ASSERT((freeseen & (1 << i)) == 0); + freeseen |= 1 << i; + } else { + ASSERT(be16_to_cpu(dup->length) <= + be16_to_cpu(bf[2].length)); + } + p += be16_to_cpu(dup->length); + lastfree = 1; + continue; + } + /* + * It's a real entry. Validate the fields. + * If this is a block directory then make sure it's + * in the leaf section of the block. + * The linear search is crude but this is DEBUG code. + */ + dep = (xfs_dir2_data_entry_t *)p; + ASSERT(dep->namelen != 0); + ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); + ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == + (char *)dep - (char *)d); + count++; + lastfree = 0; + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)d)); + name.name = dep->name; + name.len = dep->namelen; + hash = mp->m_dirnameops->hashname(&name); + for (i = 0; i < be32_to_cpu(btp->count); i++) { + if (be32_to_cpu(lep[i].address) == addr && + be32_to_cpu(lep[i].hashval) == hash) + break; + } + ASSERT(i < be32_to_cpu(btp->count)); + } + p += xfs_dir2_data_entsize(dep->namelen); + } + /* + * Need to have seen all the entries and all the bestfree slots. 
+ */ + ASSERT(freeseen == 7); + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { + if (be32_to_cpu(lep[i].address) == XFS_DIR2_NULL_DATAPTR) + stale++; + if (i > 0) + ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); + } + ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); + ASSERT(stale == be32_to_cpu(btp->stale)); + } +} +#endif + +/* + * Given a data block and an unused entry from that block, + * return the bestfree entry if any that corresponds to it. + */ +xfs_dir2_data_free_t * +xfs_dir2_data_freefind( + xfs_dir2_data_t *d, /* data block */ + xfs_dir2_data_unused_t *dup) /* data unused entry */ +{ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_aoff_t off; /* offset value needed */ +#if defined(DEBUG) && defined(__KERNEL__) + int matched; /* matched the value */ + int seenzero; /* saw a 0 bestfree entry */ +#endif + + off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d); +#if defined(DEBUG) && defined(__KERNEL__) + /* + * Validate some consistency in the bestfree table. + * Check order, non-overlapping entries, and if we find the + * one we're looking for it has to be exact. + */ + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0; + dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; + dfp++) { + if (!dfp->offset) { + ASSERT(!dfp->length); + seenzero = 1; + continue; + } + ASSERT(seenzero == 0); + if (be16_to_cpu(dfp->offset) == off) { + matched = 1; + ASSERT(dfp->length == dup->length); + } else if (off < be16_to_cpu(dfp->offset)) + ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset)); + else + ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); + ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); + if (dfp > &d->hdr.bestfree[0]) + ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); + } +#endif + /* + * If this is smaller than the smallest bestfree entry, + * it can't be there since they're sorted. + */ + if (be16_to_cpu(dup->length) < + be16_to_cpu(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length)) + return NULL; + /* + * Look at the three bestfree entries for our guy. + */ + for (dfp = &d->hdr.bestfree[0]; + dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; + dfp++) { + if (!dfp->offset) + return NULL; + if (be16_to_cpu(dfp->offset) == off) + return dfp; + } + /* + * Didn't find it. This only happens if there are duplicate lengths. + */ + return NULL; +} + +/* + * Insert an unused-space entry into the bestfree table. + */ +xfs_dir2_data_free_t * /* entry inserted */ +xfs_dir2_data_freeinsert( + xfs_dir2_data_t *d, /* data block pointer */ + xfs_dir2_data_unused_t *dup, /* unused space */ + int *loghead) /* log the data header (out) */ +{ + xfs_dir2_data_free_t *dfp; /* bestfree table pointer */ + xfs_dir2_data_free_t new; /* new bestfree entry */ + +#ifdef __KERNEL__ + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); +#endif + dfp = d->hdr.bestfree; + new.length = dup->length; + new.offset = cpu_to_be16((char *)dup - (char *)d); + /* + * Insert at position 0, 1, or 2; or not at all. 
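The insert routine that follows keeps only the three largest free spans, sorted by descending length, and silently drops anything smaller than the current third slot. A small standalone sketch of that policy (host-endian values and an invented struct name, not the kernel routine itself):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified bestfree slot: offset and length of one free span. */
struct bestfree { uint16_t offset; uint16_t length; };

/* Keep the three largest free spans, sorted by descending length. */
static struct bestfree *bestfree_insert(struct bestfree bf[3], struct bestfree nw)
{
	int i;

	for (i = 0; i < 3; i++) {
		if (nw.length > bf[i].length) {
			/* Slide the smaller slots down, dropping the last one. */
			memmove(&bf[i + 1], &bf[i], (2 - i) * sizeof(bf[0]));
			bf[i] = nw;
			return &bf[i];
		}
	}
	return NULL;	/* too small to be tracked */
}

int main(void)
{
	struct bestfree bf[3] = { {16, 200}, {400, 120}, {0, 0} };
	struct bestfree nw = { 900, 160 };

	bestfree_insert(bf, nw);
	for (int i = 0; i < 3; i++)
		printf("slot %d: off %u len %u\n", i, bf[i].offset, bf[i].length);
	return 0;
}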
+ */ + if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) { + dfp[2] = dfp[1]; + dfp[1] = dfp[0]; + dfp[0] = new; + *loghead = 1; + return &dfp[0]; + } + if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) { + dfp[2] = dfp[1]; + dfp[1] = new; + *loghead = 1; + return &dfp[1]; + } + if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) { + dfp[2] = new; + *loghead = 1; + return &dfp[2]; + } + return NULL; +} + +/* + * Remove a bestfree entry from the table. + */ +STATIC void +xfs_dir2_data_freeremove( + xfs_dir2_data_t *d, /* data block pointer */ + xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ + int *loghead) /* out: log data header */ +{ +#ifdef __KERNEL__ + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); +#endif + /* + * It's the first entry, slide the next 2 up. + */ + if (dfp == &d->hdr.bestfree[0]) { + d->hdr.bestfree[0] = d->hdr.bestfree[1]; + d->hdr.bestfree[1] = d->hdr.bestfree[2]; + } + /* + * It's the second entry, slide the 3rd entry up. + */ + else if (dfp == &d->hdr.bestfree[1]) + d->hdr.bestfree[1] = d->hdr.bestfree[2]; + /* + * Must be the last entry. + */ + else + ASSERT(dfp == &d->hdr.bestfree[2]); + /* + * Clear the 3rd entry, must be zero now. + */ + d->hdr.bestfree[2].length = 0; + d->hdr.bestfree[2].offset = 0; + *loghead = 1; +} + +/* + * Given a data block, reconstruct its bestfree map. + */ +void +xfs_dir2_data_freescan( + xfs_mount_t *mp, /* filesystem mount point */ + xfs_dir2_data_t *d, /* data block pointer */ + int *loghead) /* out: log data header */ +{ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* active data entry */ + xfs_dir2_data_unused_t *dup; /* unused data entry */ + char *endp; /* end of block's data */ + char *p; /* current entry pointer */ + +#ifdef __KERNEL__ + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); +#endif + /* + * Start by clearing the table. + */ + memset(d->hdr.bestfree, 0, sizeof(d->hdr.bestfree)); + *loghead = 1; + /* + * Set up pointers. + */ + p = (char *)d->u; + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + endp = (char *)xfs_dir2_block_leaf_p(btp); + } else + endp = (char *)d + mp->m_dirblksize; + /* + * Loop over the block's entries. + */ + while (p < endp) { + dup = (xfs_dir2_data_unused_t *)p; + /* + * If it's a free entry, insert it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ASSERT((char *)dup - (char *)d == + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + xfs_dir2_data_freeinsert(d, dup, loghead); + p += be16_to_cpu(dup->length); + } + /* + * For active entries, check their tags and skip them. + */ + else { + dep = (xfs_dir2_data_entry_t *)p; + ASSERT((char *)dep - (char *)d == + be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep))); + p += xfs_dir2_data_entsize(dep->namelen); + } + } +} + +/* + * Initialize a data block at the given block number in the directory. + * Give back the buffer for the created block. 
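As background for the init routine below: a freshly initialized data block is just the header followed by one unused span covering the rest of the block, recorded both in bestfree[0] and in the span's own trailing tag word. A tiny sketch of that arithmetic, assuming an illustrative 4k block and a 16-byte header (not taken from the patch):

#include <stdio.h>

int main(void)
{
	/* Illustrative sizes: 4k directory block, 16-byte header
	 * (magic plus three offset/length bestfree slots). */
	unsigned dirblksize = 4096;
	unsigned hdrsize    = 4 + 3 * 4;

	/* A new block is one unused span right after the header... */
	unsigned free_off = hdrsize;
	unsigned free_len = dirblksize - hdrsize;

	/* ...and the span's tag, in its last two bytes, points back at its start. */
	unsigned tag_off = free_off + free_len - 2;

	printf("bestfree[0] = {off %u, len %u}, tag at %u holds %u\n",
	       free_off, free_len, tag_off, free_off);
	return 0;
}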
+ */ +int /* error */ +xfs_dir2_data_init( + xfs_da_args_t *args, /* directory operation args */ + xfs_dir2_db_t blkno, /* logical dir block number */ + xfs_dabuf_t **bpp) /* output block buffer */ +{ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_data_t *d; /* pointer to block */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + int error; /* error return value */ + int i; /* bestfree index */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + int t; /* temp */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Get the buffer set up for the block. + */ + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, + XFS_DATA_FORK); + if (error) { + return error; + } + ASSERT(bp != NULL); + /* + * Initialize the header. + */ + d = bp->data; + d->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + d->hdr.bestfree[0].offset = cpu_to_be16(sizeof(d->hdr)); + for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { + d->hdr.bestfree[i].length = 0; + d->hdr.bestfree[i].offset = 0; + } + /* + * Set up an unused entry for the block's body. + */ + dup = &d->u[0].unused; + dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + + t=mp->m_dirblksize - (uint)sizeof(d->hdr); + d->hdr.bestfree[0].length = cpu_to_be16(t); + dup->length = cpu_to_be16(t); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)d); + /* + * Log it and return it. + */ + xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_log_unused(tp, bp, dup); + *bpp = bp; + return 0; +} + +/* + * Log an active data entry from the block. + */ +void +xfs_dir2_data_log_entry( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* block buffer */ + xfs_dir2_data_entry_t *dep) /* data entry pointer */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d), + (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - + (char *)d - 1)); +} + +/* + * Log a data block header. + */ +void +xfs_dir2_data_log_header( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* block buffer */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d), + (uint)(sizeof(d->hdr) - 1)); +} + +/* + * Log a data unused entry. + */ +void +xfs_dir2_data_log_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* block buffer */ + xfs_dir2_data_unused_t *dup) /* data unused pointer */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + /* + * Log the first part of the unused entry. + */ + xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d), + (uint)((char *)&dup->length + sizeof(dup->length) - + 1 - (char *)d)); + /* + * Log the end (tag) of the unused entry. + */ + xfs_da_log_buf(tp, bp, + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d), + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d + + sizeof(xfs_dir2_data_off_t) - 1)); +} + +/* + * Make a byte range in the data block unused. + * Its current contents are unimportant. 
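The logging helpers above depend on the same invariant the next function exploits: every record, live or free, ends in a two-byte tag holding its own starting offset, so the record preceding any offset can be found by reading the two bytes just before that offset. A self-contained sketch of that lookup on a toy host-endian block (the constants and layout here are made up for illustration, not the on-disk encoding):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FREE_TAG 0xffff

int main(void)
{
	/* Toy block: a free span at offset 16 of length 48, followed by a
	 * range at offset 64 that we are about to free. */
	unsigned char blk[4096] = {0};
	uint16_t freetag = FREE_TAG, length = 48, tag = 16;

	memcpy(blk + 16, &freetag, 2);		/* unused-entry marker */
	memcpy(blk + 18, &length, 2);		/* span length */
	memcpy(blk + 16 + 48 - 2, &tag, 2);	/* tag = span's own offset */

	unsigned offset = 64;			/* start of range being freed */

	/* The two bytes just before 'offset' are the previous record's tag,
	 * so a single load tells us where that record starts. */
	uint16_t prev_tag, prev_marker;
	memcpy(&prev_tag, blk + offset - 2, 2);
	memcpy(&prev_marker, blk + prev_tag, 2);

	printf("previous record starts at %u, %s\n", prev_tag,
	       prev_marker == FREE_TAG ? "free: merge with it" : "in use");
	return 0;
}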
+ */ +void +xfs_dir2_data_make_free( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* block buffer */ + xfs_dir2_data_aoff_t offset, /* starting byte offset */ + xfs_dir2_data_aoff_t len, /* length in bytes */ + int *needlogp, /* out: log header */ + int *needscanp) /* out: regen bestfree */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + char *endptr; /* end of data area */ + xfs_mount_t *mp; /* filesystem mount point */ + int needscan; /* need to regen bestfree */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *postdup; /* unused entry after us */ + xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + + mp = tp->t_mountp; + d = bp->data; + /* + * Figure out where the end of the data area is. + */ + if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC) + endptr = (char *)d + mp->m_dirblksize; + else { + xfs_dir2_block_tail_t *btp; /* block tail */ + + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + } + /* + * If this isn't the start of the block, then back up to + * the previous entry and see if it's free. + */ + if (offset > sizeof(d->hdr)) { + __be16 *tagp; /* tag just before us */ + + tagp = (__be16 *)((char *)d + offset) - 1; + prevdup = (xfs_dir2_data_unused_t *)((char *)d + be16_to_cpu(*tagp)); + if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG) + prevdup = NULL; + } else + prevdup = NULL; + /* + * If this isn't the end of the block, see if the entry after + * us is free. + */ + if ((char *)d + offset + len < endptr) { + postdup = + (xfs_dir2_data_unused_t *)((char *)d + offset + len); + if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG) + postdup = NULL; + } else + postdup = NULL; + ASSERT(*needscanp == 0); + needscan = 0; + /* + * Previous and following entries are both free, + * merge everything into a single free entry. + */ + if (prevdup && postdup) { + xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ + + /* + * See if prevdup and/or postdup are in bestfree table. + */ + dfp = xfs_dir2_data_freefind(d, prevdup); + dfp2 = xfs_dir2_data_freefind(d, postdup); + /* + * We need a rescan unless there are exactly 2 free entries + * namely our two. Then we know what's happening, otherwise + * since the third bestfree is there, there might be more + * entries. + */ + needscan = (d->hdr.bestfree[2].length != 0); + /* + * Fix up the new big freespace. + */ + be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length)); + *xfs_dir2_data_unused_tag_p(prevdup) = + cpu_to_be16((char *)prevdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, prevdup); + if (!needscan) { + /* + * Has to be the case that entries 0 and 1 are + * dfp and dfp2 (don't know which is which), and + * entry 2 is empty. + * Remove entry 1 first then entry 0. + */ + ASSERT(dfp && dfp2); + if (dfp == &d->hdr.bestfree[1]) { + dfp = &d->hdr.bestfree[0]; + ASSERT(dfp2 == dfp); + dfp2 = &d->hdr.bestfree[1]; + } + xfs_dir2_data_freeremove(d, dfp2, needlogp); + xfs_dir2_data_freeremove(d, dfp, needlogp); + /* + * Now insert the new entry. + */ + dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp); + ASSERT(dfp == &d->hdr.bestfree[0]); + ASSERT(dfp->length == prevdup->length); + ASSERT(!dfp[1].length); + ASSERT(!dfp[2].length); + } + } + /* + * The entry before us is free, merge with it. 
+ */ + else if (prevdup) { + dfp = xfs_dir2_data_freefind(d, prevdup); + be16_add_cpu(&prevdup->length, len); + *xfs_dir2_data_unused_tag_p(prevdup) = + cpu_to_be16((char *)prevdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, prevdup); + /* + * If the previous entry was in the table, the new entry + * is longer, so it will be in the table too. Remove + * the old one and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + (void)xfs_dir2_data_freeinsert(d, prevdup, needlogp); + } + /* + * Otherwise we need a scan if the new entry is big enough. + */ + else { + needscan = be16_to_cpu(prevdup->length) > + be16_to_cpu(d->hdr.bestfree[2].length); + } + } + /* + * The following entry is free, merge with it. + */ + else if (postdup) { + dfp = xfs_dir2_data_freefind(d, postdup); + newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + /* + * If the following entry was in the table, the new entry + * is longer, so it will be in the table too. Remove + * the old one and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + } + /* + * Otherwise we need a scan if the new entry is big enough. + */ + else { + needscan = be16_to_cpu(newdup->length) > + be16_to_cpu(d->hdr.bestfree[2].length); + } + } + /* + * Neither neighbor is free. Make a new entry. + */ + else { + newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup->length = cpu_to_be16(len); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + } + *needscanp = needscan; +} + +/* + * Take a byte range out of an existing unused space and make it un-free. + */ +void +xfs_dir2_data_use_free( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* data block buffer */ + xfs_dir2_data_unused_t *dup, /* unused entry */ + xfs_dir2_data_aoff_t offset, /* starting offset to use */ + xfs_dir2_data_aoff_t len, /* length to use */ + int *needlogp, /* out: need to log header */ + int *needscanp) /* out: need regen bestfree */ +{ + xfs_dir2_data_t *d; /* data block */ + xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + int matchback; /* matches end of freespace */ + int matchfront; /* matches start of freespace */ + int needscan; /* need to regen bestfree */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ + int oldlen; /* old unused entry's length */ + + d = bp->data; + ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); + ASSERT(offset >= (char *)dup - (char *)d); + ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)d); + ASSERT((char *)dup - (char *)d == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + /* + * Look up the entry in the bestfree table. + */ + dfp = xfs_dir2_data_freefind(d, dup); + oldlen = be16_to_cpu(dup->length); + ASSERT(dfp || oldlen <= be16_to_cpu(d->hdr.bestfree[2].length)); + /* + * Check for alignment with front and back of the entry. 
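The code that follows carves the requested range out of an existing free span, which always reduces to one of four cases depending on whether the carved range touches the front and/or the back of the span. A hedged standalone sketch of that classification (invented helper name, plain integers instead of on-disk offsets):

#include <stdio.h>

/* Classify carving [astart, astart+alen) out of the free span
 * [fstart, fstart+flen). Illustrative only. */
static const char *carve_case(unsigned fstart, unsigned flen,
			      unsigned astart, unsigned alen)
{
	int matchfront = (astart == fstart);
	int matchback  = (astart + alen == fstart + flen);

	if (matchfront && matchback)
		return "exact: drop the free span entirely";
	if (matchfront)
		return "front: shrink the span from the front";
	if (matchback)
		return "back: trim the allocation off the tail";
	return "middle: split into two smaller free spans";
}

int main(void)
{
	printf("%s\n", carve_case(64, 128, 64, 128));
	printf("%s\n", carve_case(64, 128, 64, 32));
	printf("%s\n", carve_case(64, 128, 160, 32));
	printf("%s\n", carve_case(64, 128, 96, 32));
	return 0;
}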
+ */ + matchfront = (char *)dup - (char *)d == offset; + matchback = (char *)dup + oldlen - (char *)d == offset + len; + ASSERT(*needscanp == 0); + needscan = 0; + /* + * If we matched it exactly we just need to get rid of it from + * the bestfree table. + */ + if (matchfront && matchback) { + if (dfp) { + needscan = (d->hdr.bestfree[2].offset != 0); + if (!needscan) + xfs_dir2_data_freeremove(d, dfp, needlogp); + } + } + /* + * We match the first part of the entry. + * Make a new entry with the remaining freespace. + */ + else if (matchfront) { + newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup->length = cpu_to_be16(oldlen - len); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + /* + * If it was in the table, remove it and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + ASSERT(dfp != NULL); + ASSERT(dfp->length == newdup->length); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d); + /* + * If we got inserted at the last slot, + * that means we don't know if there was a better + * choice for the last slot, or not. Rescan. + */ + needscan = dfp == &d->hdr.bestfree[2]; + } + } + /* + * We match the last part of the entry. + * Trim the allocated space off the tail of the entry. + */ + else if (matchback) { + newdup = dup; + newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + /* + * If it was in the table, remove it and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + ASSERT(dfp != NULL); + ASSERT(dfp->length == newdup->length); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d); + /* + * If we got inserted at the last slot, + * that means we don't know if there was a better + * choice for the last slot, or not. Rescan. + */ + needscan = dfp == &d->hdr.bestfree[2]; + } + } + /* + * Poking out the middle of an entry. + * Make two new entries. + */ + else { + newdup = dup; + newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup); + newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); + newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); + *xfs_dir2_data_unused_tag_p(newdup2) = + cpu_to_be16((char *)newdup2 - (char *)d); + xfs_dir2_data_log_unused(tp, bp, newdup2); + /* + * If the old entry was in the table, we need to scan + * if the 3rd entry was valid, since these entries + * are smaller than the old one. + * If we don't need to scan that means there were 1 or 2 + * entries in the table, and removing the old and adding + * the 2 new will work. 
+ */ + if (dfp) { + needscan = (d->hdr.bestfree[2].length != 0); + if (!needscan) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + (void)xfs_dir2_data_freeinsert(d, newdup, + needlogp); + (void)xfs_dir2_data_freeinsert(d, newdup2, + needlogp); + } + } + } + *needscanp = needscan; +} diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h new file mode 100644 index 00000000..efbc290c --- /dev/null +++ b/fs/xfs/xfs_dir2_data.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_DATA_H__ +#define __XFS_DIR2_DATA_H__ + +/* + * Directory format 2, data block structures. + */ + +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_inode; +struct xfs_trans; + +/* + * Constants. + */ +#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: for multiblock dirs */ +#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ +#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) +#define XFS_DIR2_DATA_FREE_TAG 0xffff +#define XFS_DIR2_DATA_FD_COUNT 3 + +/* + * Directory address space divided into sections, + * spaces separated by 32GB. + */ +#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) +#define XFS_DIR2_DATA_SPACE 0 +#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_DATA_FIRSTDB(mp) \ + xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) + +/* + * Offsets of . and .. in data space (always block 0) + */ +#define XFS_DIR2_DATA_DOT_OFFSET \ + ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t)) +#define XFS_DIR2_DATA_DOTDOT_OFFSET \ + (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) +#define XFS_DIR2_DATA_FIRST_OFFSET \ + (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) + +/* + * Structures. + */ + +/* + * Describe a free area in the data block. + * The freespace will be formatted as a xfs_dir2_data_unused_t. + */ +typedef struct xfs_dir2_data_free { + __be16 offset; /* start of freespace */ + __be16 length; /* length of freespace */ +} xfs_dir2_data_free_t; + +/* + * Header for the data blocks. + * Always at the beginning of a directory-sized block. + * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. + */ +typedef struct xfs_dir2_data_hdr { + __be32 magic; /* XFS_DIR2_DATA_MAGIC */ + /* or XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; +} xfs_dir2_data_hdr_t; + +/* + * Active entry in a data block. Aligned to 8 bytes. + * Tag appears as the last 2 bytes. + */ +typedef struct xfs_dir2_data_entry { + __be64 inumber; /* inode number */ + __u8 namelen; /* name length */ + __u8 name[1]; /* name bytes, no null */ + /* variable offset */ + __be16 tag; /* starting offset of us */ +} xfs_dir2_data_entry_t; + +/* + * Unused entry in a data block. Aligned to 8 bytes. + * Tag appears as the last 2 bytes. 
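The entsize helper defined a little further down rounds every record, name included, up to the 8-byte alignment the format requires, with the tag occupying the last two bytes of the rounded size. A quick standalone illustration of that rounding (the 8 + 1 header size mirrors the inumber and namelen fields above; the function name is made up):

#include <stdio.h>

/* On-disk size of an entry with an n-byte name: 8-byte inode number,
 * 1-byte name length, the name, a 2-byte tag, rounded up to 8 bytes. */
static int entsize(int namelen)
{
	int raw = 8 + 1 + namelen + 2;

	return (raw + 7) & ~7;
}

int main(void)
{
	for (int n = 1; n <= 16; n += 5)
		printf("namelen %2d -> entry size %d\n", n, entsize(n));
	return 0;
}

For a one-byte name this gives 16 bytes, which is why the "." entry occupies offsets 16 through 31 of block 0 and ".." starts at 32.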
+ */ +typedef struct xfs_dir2_data_unused { + __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ + __be16 length; /* total free length */ + /* variable offset */ + __be16 tag; /* starting offset of us */ +} xfs_dir2_data_unused_t; + +typedef union { + xfs_dir2_data_entry_t entry; + xfs_dir2_data_unused_t unused; +} xfs_dir2_data_union_t; + +/* + * Generic data block structure, for xfs_db. + */ +typedef struct xfs_dir2_data { + xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_DATA_MAGIC */ + xfs_dir2_data_union_t u[1]; +} xfs_dir2_data_t; + +/* + * Macros. + */ + +/* + * Size of a data entry. + */ +static inline int xfs_dir2_data_entsize(int n) +{ + return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \ + (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN); +} + +/* + * Pointer to an entry's tag word. + */ +static inline __be16 * +xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); +} + +/* + * Pointer to a freespace's tag word. + */ +static inline __be16 * +xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup) +{ + return (__be16 *)((char *)dup + + be16_to_cpu(dup->length) - sizeof(__be16)); +} + +/* + * Function declarations. + */ +#ifdef DEBUG +extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp); +#else +#define xfs_dir2_data_check(dp,bp) +#endif +extern xfs_dir2_data_free_t *xfs_dir2_data_freefind(xfs_dir2_data_t *d, + xfs_dir2_data_unused_t *dup); +extern xfs_dir2_data_free_t *xfs_dir2_data_freeinsert(xfs_dir2_data_t *d, + xfs_dir2_data_unused_t *dup, int *loghead); +extern void xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d, + int *loghead); +extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, + struct xfs_dabuf **bpp); +extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_entry_t *dep); +extern void xfs_dir2_data_log_header(struct xfs_trans *tp, + struct xfs_dabuf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_unused_t *dup); +extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, + int *needscanp); +extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_unused_t *dup, + xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, + int *needscanp); + +#endif /* __XFS_DIR2_DATA_H__ */ diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c new file mode 100644 index 00000000..ae891223 --- /dev/null +++ b/fs/xfs/xfs_dir2_leaf.c @@ -0,0 +1,1881 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Local function declarations. + */ +#ifdef DEBUG +static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +#else +#define xfs_dir2_leaf_check(dp, bp) +#endif +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp, + int *indexp, xfs_dabuf_t **dbpp); +static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, + int first, int last); +static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); + + +/* + * Convert a block form directory to a leaf form directory. + */ +int /* error */ +xfs_dir2_block_to_leaf( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *dbp) /* input block's buffer */ +{ + __be16 *bestsp; /* leaf's bestsp entries */ + xfs_dablk_t blkno; /* leaf block's bno */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */ + xfs_dir2_block_tail_t *btp; /* block's tail */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dabuf_t *lbp; /* leaf block's buffer */ + xfs_dir2_db_t ldb; /* leaf block's bno */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to rescan bestfree */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_block_to_leaf(args); + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Add the leaf block to the inode. + * This interface will only put blocks in the leaf/node range. + * Since that's empty now, we'll get the root (block 0 in range). + */ + if ((error = xfs_da_grow_inode(args, &blkno))) { + return error; + } + ldb = xfs_dir2_da_to_db(mp, blkno); + ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); + /* + * Initialize the leaf block, get a buffer for it. + */ + if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) { + return error; + } + ASSERT(lbp != NULL); + leaf = lbp->data; + block = dbp->data; + xfs_dir2_data_check(dp, dbp); + btp = xfs_dir2_block_tail_p(mp, block); + blp = xfs_dir2_block_leaf_p(btp); + /* + * Set the counts in the leaf header. + */ + leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count)); + leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale)); + /* + * Could compact these but I think we always do the conversion + * after squeezing out stale entries. + */ + memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1); + needscan = 0; + needlog = 1; + /* + * Make the space formerly occupied by the leaf entries and block + * tail be free. 
+ */ + xfs_dir2_data_make_free(tp, dbp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), + (xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize - + (char *)blp), + &needlog, &needscan); + /* + * Fix up the block header, make it a data block. + */ + block->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog); + /* + * Set up leaf tail and bests table. + */ + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp->bestcount = cpu_to_be32(1); + bestsp = xfs_dir2_leaf_bests_p(ltp); + bestsp[0] = block->hdr.bestfree[0].length; + /* + * Log the data header and leaf bests table. + */ + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_leaf_check(dp, lbp); + xfs_dir2_data_check(dp, dbp); + xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); + xfs_da_buf_done(lbp); + return 0; +} + +/* + * Add an entry to a leaf form directory. + */ +int /* error */ +xfs_dir2_leaf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + __be16 *bestsp; /* freespace table in leaf */ + int compact; /* need to compact leaves */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry */ + int error; /* error return value */ + int grown; /* allocated new data block */ + int highstale; /* index of next stale leaf */ + int i; /* temporary, index */ + int index; /* leaf table position */ + xfs_dabuf_t *lbp; /* leaf's buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int length; /* length of new entry */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ + int lfloglow; /* low leaf logging index */ + int lfloghigh; /* high leaf logging index */ + int lowstale; /* index of prev stale leaf */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + xfs_mount_t *mp; /* filesystem mount point */ + int needbytes; /* leaf block bytes needed */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data free */ + __be16 *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t use_block; /* data block number */ + + trace_xfs_dir2_leaf_addname(args); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the leaf block. + */ + error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, + XFS_DATA_FORK); + if (error) { + return error; + } + ASSERT(lbp != NULL); + /* + * Look up the entry by hash value and name. + * We know it's not there, our caller has already done a lookup. + * So the index is of the entry to insert in front of. + * But if there are dup hash values the index is of the first of those. + */ + index = xfs_dir2_leaf_search_hash(args, lbp); + leaf = lbp->data; + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + length = xfs_dir2_data_entsize(args->namelen); + /* + * See if there are any entries with the same hash value + * and space in their block for the new entry. + * This is good because it puts multiple same-hash value entries + * in a data block, improving the lookup of those entries. 
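The search that follows works off the bests array kept in the leaf tail: one 16-bit value per data block recording that block's longest free span, with a sentinel marking a hole where a data block has been freed. A rough sketch of the selection policy, assuming a plain array and an invented NULLOFF sentinel (not the kernel's constant):

#include <stdio.h>

#define NULLOFF 0xffffu	/* stand-in for "no data block here" */

/* Pick a data block for an entry of 'length' bytes: take the first block
 * whose longest free span is big enough, remembering a hole as a fallback. */
static int choose_block(const unsigned short *bests, int nblocks, int length)
{
	int use_block = -1;

	for (int i = 0; i < nblocks; i++) {
		if (bests[i] == NULLOFF && use_block == -1)
			use_block = i;		/* hole: could allocate here */
		else if (bests[i] != NULLOFF && bests[i] >= length)
			return i;		/* enough contiguous free space */
	}
	return use_block;			/* -1 means grow the directory */
}

int main(void)
{
	unsigned short bests[] = { 24, NULLOFF, 120, 56 };

	printf("entry of 64 bytes -> block %d\n", choose_block(bests, 4, 64));
	printf("entry of 200 bytes -> block %d\n", choose_block(bests, 4, 200));
	return 0;
}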
+ */ + for (use_block = -1, lep = &leaf->ents[index]; + index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; + index++, lep++) { + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + ASSERT(i < be32_to_cpu(ltp->bestcount)); + ASSERT(be16_to_cpu(bestsp[i]) != NULLDATAOFF); + if (be16_to_cpu(bestsp[i]) >= length) { + use_block = i; + break; + } + } + /* + * Didn't find a block yet, linear search all the data blocks. + */ + if (use_block == -1) { + for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) { + /* + * Remember a block we see that's missing. + */ + if (be16_to_cpu(bestsp[i]) == NULLDATAOFF && use_block == -1) + use_block = i; + else if (be16_to_cpu(bestsp[i]) >= length) { + use_block = i; + break; + } + } + } + /* + * How many bytes do we need in the leaf block? + */ + needbytes = + (leaf->hdr.stale ? 0 : (uint)sizeof(leaf->ents[0])) + + (use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0])); + /* + * Now kill use_block if it refers to a missing block, so we + * can use it as an indication of allocation needed. + */ + if (use_block != -1 && be16_to_cpu(bestsp[use_block]) == NULLDATAOFF) + use_block = -1; + /* + * If we don't have enough free bytes but we can make enough + * by compacting out stale entries, we'll do that. + */ + if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < + needbytes && be16_to_cpu(leaf->hdr.stale) > 1) { + compact = 1; + } + /* + * Otherwise if we don't have enough free bytes we need to + * convert to node form. + */ + else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu( + leaf->hdr.count)] < needbytes) { + /* + * Just checking or no space reservation, give up. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { + xfs_da_brelse(tp, lbp); + return XFS_ERROR(ENOSPC); + } + /* + * Convert to node form. + */ + error = xfs_dir2_leaf_to_node(args, lbp); + xfs_da_buf_done(lbp); + if (error) + return error; + /* + * Then add the new entry. + */ + return xfs_dir2_node_addname(args); + } + /* + * Otherwise it will fit without compaction. + */ + else + compact = 0; + /* + * If just checking, then it will fit unless we needed to allocate + * a new data block. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + xfs_da_brelse(tp, lbp); + return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; + } + /* + * If no allocations are allowed, return now before we've + * changed anything. + */ + if (args->total == 0 && use_block == -1) { + xfs_da_brelse(tp, lbp); + return XFS_ERROR(ENOSPC); + } + /* + * Need to compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and we'll shift that one to our insertion + * point later. + */ + if (compact) { + xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale, + &lfloglow, &lfloghigh); + } + /* + * There are stale entries, so we'll need log-low and log-high + * impossibly bad values later. + */ + else if (be16_to_cpu(leaf->hdr.stale)) { + lfloglow = be16_to_cpu(leaf->hdr.count); + lfloghigh = -1; + } + /* + * If there was no data block space found, we need to allocate + * a new one. + */ + if (use_block == -1) { + /* + * Add the new data block. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, + &use_block))) { + xfs_da_brelse(tp, lbp); + return error; + } + /* + * Initialize the block. 
+ */
+ if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * If we're adding a new data block on the end we need to
+ * extend the bests table. Copy it up one entry.
+ */
+ if (use_block >= be32_to_cpu(ltp->bestcount)) {
+ bestsp--;
+ memmove(&bestsp[0], &bestsp[1],
+ be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+ be32_add_cpu(&ltp->bestcount, 1);
+ xfs_dir2_leaf_log_tail(tp, lbp);
+ xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ }
+ /*
+ * If we're filling in a previously empty block just log it.
+ */
+ else
+ xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ data = dbp->data;
+ bestsp[use_block] = data->hdr.bestfree[0].length;
+ grown = 1;
+ }
+ /*
+ * Already had space in some data block.
+ * Just read that one in.
+ */
+ else {
+ if ((error =
+ xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
+ -1, &dbp, XFS_DATA_FORK))) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ data = dbp->data;
+ grown = 0;
+ }
+ xfs_dir2_data_check(dp, dbp);
+ /*
+ * Point to the biggest freespace in our data block.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset));
+ ASSERT(be16_to_cpu(dup->length) >= length);
+ needscan = needlog = 0;
+ /*
+ * Mark the initial part of our freespace in use for the new entry.
+ */
+ xfs_dir2_data_use_free(tp, dbp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+ &needlog, &needscan);
+ /*
+ * Initialize our new entry (at last).
+ */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, dep->namelen);
+ tagp = xfs_dir2_data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)data);
+ /*
+ * Need to scan fix up the bestfree table.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(mp, data, &needlog);
+ /*
+ * Need to log the data block's header.
+ */
+ if (needlog)
+ xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_entry(tp, dbp, dep);
+ /*
+ * If the bests table needs to be changed, do it.
+ * Log the change unless we've already done that.
+ */
+ if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(data->hdr.bestfree[0].length)) {
+ bestsp[use_block] = data->hdr.bestfree[0].length;
+ if (!grown)
+ xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ }
+ /*
+ * Now we need to make room to insert the leaf entry.
+ * If there are no stale entries, we just insert a hole at index.
+ */
+ if (!leaf->hdr.stale) {
+ /*
+ * lep is still good as the index leaf entry.
+ */
+ if (index < be16_to_cpu(leaf->hdr.count))
+ memmove(lep + 1, lep,
+ (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep));
+ /*
+ * Record low and high logging indices for the leaf.
+ */
+ lfloglow = index;
+ lfloghigh = be16_to_cpu(leaf->hdr.count);
+ be16_add_cpu(&leaf->hdr.count, 1);
+ }
+ /*
+ * There are stale entries.
+ * We will use one of them for the new entry.
+ * It's probably not at the right location, so we'll have to
+ * shift some up or down first.
+ */
+ else {
+ /*
+ * If we didn't compact before, we need to find the nearest
+ * stale entries before and after our insertion point.
+ */
+ if (compact == 0) {
+ /*
+ * Find the first stale entry before the insertion
+ * point, if any.
+ */
+ for (lowstale = index - 1;
+ lowstale >= 0 &&
+ be32_to_cpu(leaf->ents[lowstale].address) !=
+ XFS_DIR2_NULL_DATAPTR;
+ lowstale--)
+ continue;
+ /*
+ * Find the next stale entry at or after the insertion
+ * point, if any.
Stop if we go so far that the + * lowstale entry would be better. + */ + for (highstale = index; + highstale < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(leaf->ents[highstale].address) != + XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || + index - lowstale - 1 >= highstale - index); + highstale++) + continue; + } + /* + * If the low one is better, use it. + */ + if (lowstale >= 0 && + (highstale == be16_to_cpu(leaf->hdr.count) || + index - lowstale - 1 < highstale - index)) { + ASSERT(index - lowstale - 1 >= 0); + ASSERT(be32_to_cpu(leaf->ents[lowstale].address) == + XFS_DIR2_NULL_DATAPTR); + /* + * Copy entries up to cover the stale entry + * and make room for the new entry. + */ + if (index - lowstale - 1 > 0) + memmove(&leaf->ents[lowstale], + &leaf->ents[lowstale + 1], + (index - lowstale - 1) * sizeof(*lep)); + lep = &leaf->ents[index - 1]; + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(index - 1, lfloghigh); + } + /* + * The high one is better, so use that one. + */ + else { + ASSERT(highstale - index >= 0); + ASSERT(be32_to_cpu(leaf->ents[highstale].address) == + XFS_DIR2_NULL_DATAPTR); + /* + * Copy entries down to cover the stale entry + * and make room for the new entry. + */ + if (highstale - index > 0) + memmove(&leaf->ents[index + 1], + &leaf->ents[index], + (highstale - index) * sizeof(*lep)); + lep = &leaf->ents[index]; + lfloglow = MIN(index, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + be16_add_cpu(&leaf->hdr.stale, -1); + } + /* + * Fill in the new leaf entry. + */ + lep->hashval = cpu_to_be32(args->hashval); + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block, + be16_to_cpu(*tagp))); + /* + * Log the leaf fields and give up the buffers. + */ + xfs_dir2_leaf_log_header(tp, lbp); + xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); + xfs_dir2_leaf_check(dp, lbp); + xfs_da_buf_done(lbp); + xfs_dir2_data_check(dp, dbp); + xfs_da_buf_done(dbp); + return 0; +} + +#ifdef DEBUG +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. + */ +STATIC void +xfs_dir2_leaf_check( + xfs_inode_t *dp, /* incore directory inode */ + xfs_dabuf_t *bp) /* leaf's buffer */ +{ + int i; /* leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + xfs_mount_t *mp; /* filesystem mount point */ + int stale; /* count of stale leaves */ + + leaf = bp->data; + mp = dp->i_mount; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + /* + * This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + /* + * Leaves and bests don't overlap. + */ + ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= + (char *)xfs_dir2_leaf_bests_p(ltp)); + /* + * Check hash value order, count stale entries. + */ + for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { + if (i + 1 < be16_to_cpu(leaf->hdr.count)) + ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= + be32_to_cpu(leaf->ents[i + 1].hashval)); + if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + stale++; + } + ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); +} +#endif /* DEBUG */ + +/* + * Compact out any stale entries in the leaf. + * Log the header and changed leaf entries, if any. 
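Both compaction routines below walk the leaf array once, copying live entries down over stale ones so that hash order is preserved and only the count changes. A compact standalone version of that in-place squeeze (invented struct name and sentinel value, host-endian, not the kernel code):

#include <stdio.h>

#define NULL_DATAPTR 0u	/* stand-in for a stale (deleted) leaf entry */

struct lent { unsigned hashval; unsigned address; };

/* Squeeze out stale entries in place, preserving hash order; return the
 * new count. */
static int compact(struct lent *ents, int count)
{
	int from, to;

	for (from = to = 0; from < count; from++) {
		if (ents[from].address == NULL_DATAPTR)
			continue;
		if (from > to)
			ents[to] = ents[from];
		to++;
	}
	return to;
}

int main(void)
{
	struct lent ents[] = {
		{10, 3}, {20, NULL_DATAPTR}, {30, 7}, {40, NULL_DATAPTR}, {50, 9},
	};
	int n = compact(ents, 5);

	for (int i = 0; i < n; i++)
		printf("%u -> %u\n", ents[i].hashval, ents[i].address);
	return 0;
}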
+ */ +void +xfs_dir2_leaf_compact( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + int from; /* source leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int loglow; /* first leaf entry to log */ + int to; /* target leaf index */ + + leaf = bp->data; + if (!leaf->hdr.stale) { + return; + } + /* + * Compress out the stale entries in place. + */ + for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) { + if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Only actually copy the entries that are different. + */ + if (from > to) { + if (loglow == -1) + loglow = to; + leaf->ents[to] = leaf->ents[from]; + } + to++; + } + /* + * Update and log the header, log the leaf entries. + */ + ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to); + be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale))); + leaf->hdr.stale = 0; + xfs_dir2_leaf_log_header(args->trans, bp); + if (loglow != -1) + xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1); +} + +/* + * Compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and the caller will shift that one to our insertion + * point later. + * Return new insertion index, where the remaining stale entry is, + * and leaf logging indices. + */ +void +xfs_dir2_leaf_compact_x1( + xfs_dabuf_t *bp, /* leaf buffer */ + int *indexp, /* insertion index */ + int *lowstalep, /* out: stale entry before us */ + int *highstalep, /* out: stale entry after us */ + int *lowlogp, /* out: low log index */ + int *highlogp) /* out: high log index */ +{ + int from; /* source copy index */ + int highstale; /* stale entry at/after index */ + int index; /* insertion index */ + int keepstale; /* source index of kept stale */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int lowstale; /* stale entry before index */ + int newindex=0; /* new insertion index */ + int to; /* destination copy index */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); + index = *indexp; + /* + * Find the first stale entry before our index, if any. + */ + for (lowstale = index - 1; + lowstale >= 0 && + be32_to_cpu(leaf->ents[lowstale].address) != XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + /* + * Find the first stale entry at or after our index, if any. + * Stop if the answer would be worse than lowstale. + */ + for (highstale = index; + highstale < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(leaf->ents[highstale].address) != XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || index - lowstale > highstale - index); + highstale++) + continue; + /* + * Pick the better of lowstale and highstale. + */ + if (lowstale >= 0 && + (highstale == be16_to_cpu(leaf->hdr.count) || + index - lowstale <= highstale - index)) + keepstale = lowstale; + else + keepstale = highstale; + /* + * Copy the entries in place, removing all the stale entries + * except keepstale. + */ + for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { + /* + * Notice the new value of index. + */ + if (index == from) + newindex = to; + if (from != keepstale && + be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) { + if (from == to) + *lowlogp = to; + continue; + } + /* + * Record the new keepstale value for the insertion. + */ + if (from == keepstale) + lowstale = highstale = to; + /* + * Copy only the entries that have moved. 
+ */ + if (from > to) + leaf->ents[to] = leaf->ents[from]; + to++; + } + ASSERT(from > to); + /* + * If the insertion point was past the last entry, + * set the new insertion point accordingly. + */ + if (index == from) + newindex = to; + *indexp = newindex; + /* + * Adjust the leaf header values. + */ + be16_add_cpu(&leaf->hdr.count, -(from - to)); + leaf->hdr.stale = cpu_to_be16(1); + /* + * Remember the low/high stale value only in the "right" + * direction. + */ + if (lowstale >= newindex) + lowstale = -1; + else + highstale = be16_to_cpu(leaf->hdr.count); + *highlogp = be16_to_cpu(leaf->hdr.count) - 1; + *lowstalep = lowstale; + *highstalep = highstale; +} + +/* + * Getdents (readdir) for leaf and node directories. + * This reads the data blocks only, so is the same for both forms. + */ +int /* error */ +xfs_dir2_leaf_getdents( + xfs_inode_t *dp, /* incore directory inode */ + void *dirent, + size_t bufsize, + xfs_off_t *offset, + filldir_t filldir) +{ + xfs_dabuf_t *bp; /* data block buffer */ + int byteoff; /* offset in current block */ + xfs_dir2_db_t curdb; /* db for current block */ + xfs_dir2_off_t curoff; /* current overall offset */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + int error = 0; /* error return value */ + int i; /* temporary loop index */ + int j; /* temporary loop index */ + int length; /* temporary length value */ + xfs_bmbt_irec_t *map; /* map vector for blocks */ + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_off_t newoff; /* new curoff after new blk */ + int nmap; /* mappings to ask xfs_bmapi */ + char *ptr = NULL; /* pointer to current data */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + + /* + * If the offset is at or past the largest allowed value, + * give up right away. + */ + if (*offset >= XFS_DIR2_MAX_DATAPTR) + return 0; + + mp = dp->i_mount; + + /* + * Set up to bmap a number of blocks based on the caller's + * buffer size, the directory block size, and the filesystem + * block size. + */ + map_size = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize); + map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP); + map_valid = ra_index = ra_offset = ra_current = map_blocks = 0; + bp = NULL; + + /* + * Inside the loop we keep the main offset value as a byte offset + * in the directory file. + */ + curoff = xfs_dir2_dataptr_to_byte(mp, *offset); + + /* + * Force this conversion through db so we truncate the offset + * down to get the start of the data block. + */ + map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff)); + /* + * Loop over directory entries until we reach the end offset. + * Get more blocks and readahead as necessary. + */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) { + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + if (bp) { + xfs_da_brelse(NULL, bp); + bp = NULL; + map_blocks -= mp->m_dirblkfsbs; + /* + * Loop to get rid of the extents for the + * directory block. 
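+ * That is, consume one directory block's worth of filesystem blocks
+ * (mp->m_dirblkfsbs) from the front of the mapping table, shrinking the
+ * first extent and dropping it from the table once it is fully used up.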
+ */ + for (i = mp->m_dirblkfsbs; i > 0; ) { + j = MIN((int)map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * + map_valid); + i -= j; + } + } + /* + * Recalculate the readahead blocks wanted. + */ + ra_want = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize) - 1; + ASSERT(ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + ra_want > map_blocks && + map_off < + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + nmap = map_size - map_valid; + error = xfs_bmapi(NULL, dp, + map_off, + xfs_dir2_byte_to_da(mp, + XFS_DIR2_LEAF_OFFSET) - map_off, + XFS_BMAPI_METADATA, NULL, 0, + &map[map_valid], &nmap, NULL); + /* + * Don't know if we should ignore this or + * try to return an error. + * The trouble with returning errors + * is that readdir will just stop without + * actually passing the error through. + */ + if (error) + break; /* XXX */ + /* + * If we got all the mappings we asked for, + * set the final map offset based on the + * last bmap value received. + * Otherwise, we've reached the end. + */ + if (nmap == map_size - map_valid) + map_off = + map[map_valid + nmap - 1].br_startoff + + map[map_valid + nmap - 1].br_blockcount; + else + map_off = + xfs_dir2_byte_to_da(mp, + XFS_DIR2_LEAF_OFFSET); + /* + * Look for holes in the mapping, and + * eliminate them. Count up the valid blocks. + */ + for (i = map_valid; i < map_valid + nmap; ) { + if (map[i].br_startblock == + HOLESTARTBLOCK) { + nmap--; + length = map_valid + nmap - i; + if (length) + memmove(&map[i], + &map[i + 1], + sizeof(map[i]) * + length); + } else { + map_blocks += + map[i].br_blockcount; + i++; + } + } + map_valid += nmap; + } + /* + * No valid mappings, so no more data blocks. + */ + if (!map_valid) { + curoff = xfs_dir2_da_to_byte(mp, map_off); + break; + } + /* + * Read the directory block starting at the first + * mapping. + */ + curdb = xfs_dir2_da_to_db(mp, map->br_startoff); + error = xfs_da_read_buf(NULL, dp, map->br_startoff, + map->br_blockcount >= mp->m_dirblkfsbs ? + XFS_FSB_TO_DADDR(mp, map->br_startblock) : + -1, + &bp, XFS_DATA_FORK); + /* + * Should just skip over the data block instead + * of giving up. + */ + if (error) + break; /* XXX */ + /* + * Adjust the current amount of read-ahead: we just + * read a block that was previously ra. + */ + if (ra_current) + ra_current -= mp->m_dirblkfsbs; + /* + * Do we need more readahead? + */ + for (ra_index = ra_offset = i = 0; + ra_want > ra_current && i < map_blocks; + i += mp->m_dirblkfsbs) { + ASSERT(ra_index < map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > ra_current && + map[ra_index].br_blockcount >= + mp->m_dirblkfsbs) { + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, + map[ra_index].br_startblock + + ra_offset), + (int)BTOBB(mp->m_dirblksize)); + ra_current = i; + } + /* + * Read-ahead a non-contiguous directory block. + * This doesn't use our mapping, but this + * is a very rare case. + */ + else if (i > ra_current) { + (void)xfs_da_reada_buf(NULL, dp, + map[ra_index].br_startoff + + ra_offset, XFS_DATA_FORK); + ra_current = i; + } + /* + * Advance offset through the mapping table. 
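+ * In other words, step ra_index/ra_offset forward by roughly one
+ * directory block's worth of filesystem blocks, moving on to the next
+ * mapping whenever the current extent is used up.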
+ */ + for (j = 0; j < mp->m_dirblkfsbs; j++) { + /* + * The rest of this extent but not + * more than a dir block. + */ + length = MIN(mp->m_dirblkfsbs, + (int)(map[ra_index].br_blockcount - + ra_offset)); + j += length; + ra_offset += length; + /* + * Advance to the next mapping if + * this one is used up. + */ + if (ra_offset == + map[ra_index].br_blockcount) { + ra_offset = 0; + ra_index++; + } + } + } + /* + * Having done a read, we need to set a new offset. + */ + newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0); + /* + * Start of the current block. + */ + if (curoff < newoff) + curoff = newoff; + /* + * Make sure we're in the right block. + */ + else if (curoff > newoff) + ASSERT(xfs_dir2_byte_to_db(mp, curoff) == + curdb); + data = bp->data; + xfs_dir2_data_check(dp, bp); + /* + * Find our position in the block. + */ + ptr = (char *)&data->u; + byteoff = xfs_dir2_byte_to_off(mp, curoff); + /* + * Skip past the header. + */ + if (byteoff == 0) + curoff += (uint)sizeof(data->hdr); + /* + * Skip past entries until we reach our offset. + */ + else { + while ((char *)ptr - (char *)data < byteoff) { + dup = (xfs_dir2_data_unused_t *)ptr; + + if (be16_to_cpu(dup->freetag) + == XFS_DIR2_DATA_FREE_TAG) { + + length = be16_to_cpu(dup->length); + ptr += length; + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + length = + xfs_dir2_data_entsize(dep->namelen); + ptr += length; + } + /* + * Now set our real offset. + */ + curoff = + xfs_dir2_db_off_to_byte(mp, + xfs_dir2_byte_to_db(mp, curoff), + (char *)ptr - (char *)data); + if (ptr >= (char *)data + mp->m_dirblksize) { + continue; + } + } + } + /* + * We have a pointer to an entry. + * Is it a live one? + */ + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * No, it's unused, skip over it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + ptr += length; + curoff += length; + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + length = xfs_dir2_data_entsize(dep->namelen); + + if (filldir(dirent, (char *)dep->name, dep->namelen, + xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, + be64_to_cpu(dep->inumber), DT_UNKNOWN)) + break; + + /* + * Advance to next entry in the block. + */ + ptr += length; + curoff += length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; + } + + /* + * All done. Set output offset value to current offset. + */ + if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) + *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; + else + *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + kmem_free(map); + if (bp) + xfs_da_brelse(NULL, bp); + return error; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. + */ +int +xfs_dir2_leaf_init( + xfs_da_args_t *args, /* operation arguments */ + xfs_dir2_db_t bno, /* directory block number */ + xfs_dabuf_t **bpp, /* out: leaf buffer */ + int magic) /* magic number for block */ +{ + xfs_dabuf_t *bp; /* leaf buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + ASSERT(dp != NULL); + tp = args->trans; + mp = dp->i_mount; + ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && + bno < XFS_DIR2_FREE_FIRSTDB(mp)); + /* + * Get the buffer for the block. 
+ */ + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, + XFS_DATA_FORK); + if (error) { + return error; + } + ASSERT(bp != NULL); + leaf = bp->data; + /* + * Initialize the header. + */ + leaf->hdr.info.magic = cpu_to_be16(magic); + leaf->hdr.info.forw = 0; + leaf->hdr.info.back = 0; + leaf->hdr.count = 0; + leaf->hdr.stale = 0; + xfs_dir2_leaf_log_header(tp, bp); + /* + * If it's a leaf-format directory initialize the tail. + * In this case our caller has the real bests table to copy into + * the block. + */ + if (magic == XFS_DIR2_LEAF1_MAGIC) { + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp->bestcount = 0; + xfs_dir2_leaf_log_tail(tp, bp); + } + *bpp = bp; + return 0; +} + +/* + * Log the bests entries indicated from a leaf1 block. + */ +static void +xfs_dir2_leaf_log_bests( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* leaf buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + __be16 *firstb; /* pointer to first entry */ + __be16 *lastb; /* pointer to last entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); + firstb = xfs_dir2_leaf_bests_p(ltp) + first; + lastb = xfs_dir2_leaf_bests_p(ltp) + last; + xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); +} + +/* + * Log the leaf entries indicated from a leaf1 or leafn block. + */ +void +xfs_dir2_leaf_log_ents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* leaf buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ + xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC || + be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + firstlep = &leaf->ents[first]; + lastlep = &leaf->ents[last]; + xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); +} + +/* + * Log the header of the leaf1 or leafn block. + */ +void +xfs_dir2_leaf_log_header( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC || + be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), + (uint)(sizeof(leaf->hdr) - 1)); +} + +/* + * Log the tail of the leaf1 block. + */ +STATIC void +xfs_dir2_leaf_log_tail( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + + mp = tp->t_mountp; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), + (uint)(mp->m_dirblksize - 1)); +} + +/* + * Look up the entry referred to by args in the leaf format directory. + * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which + * is also used by the node-format code. 
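+ * The flow inside xfs_dir2_leaf_lookup_int is: binary-search the leaf for
+ * the first entry with the wanted hash value, then walk forward over the
+ * entries sharing that hash (skipping stale ones), reading each referenced
+ * data block and comparing names until an exact match is found; a
+ * case-insensitive match is remembered as a fallback.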
+ */ +int +xfs_dir2_leaf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* found entry index */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leaf_lookup(args); + + /* + * Look up name in the leaf block, returning both buffers and index. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + tp = args->trans; + dp = args->dp; + xfs_dir2_leaf_check(dp, lbp); + leaf = lbp->data; + /* + * Get to the leaf entry and contained data entry address. + */ + lep = &leaf->ents[index]; + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->data + + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + /* + * Return the found inode number & CI name if appropriate + */ + args->inumber = be64_to_cpu(dep->inumber); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + xfs_da_brelse(tp, dbp); + xfs_da_brelse(tp, lbp); + return XFS_ERROR(error); +} + +/* + * Look up name/hash in the leaf block. + * Fill in indexp with the found index, and dbpp with the data buffer. + * If not found dbpp will be NULL, and ENOENT comes back. + * lbpp will always be filled in with the leaf buffer unless there's an error. + */ +static int /* error */ +xfs_dir2_leaf_lookup_int( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t **lbpp, /* out: leaf buffer */ + int *indexp, /* out: index in leaf block */ + xfs_dabuf_t **dbpp) /* out: data buffer */ +{ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dabuf_t *dbp = NULL; /* data buffer */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* index in leaf block */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t cidb = -1; /* case match data block no. */ + enum xfs_dacmp cmp; /* name compare result */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the leaf block into the buffer. + */ + error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, + XFS_DATA_FORK); + if (error) + return error; + *lbpp = lbp; + leaf = lbp->data; + xfs_dir2_leaf_check(dp, lbp); + /* + * Look for the first leaf entry with our hash value. + */ + index = xfs_dir2_leaf_search_hash(args, lbp); + /* + * Loop over all the entries with the right hash value + * looking to match the name. + */ + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip over stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Get the new data block number. + */ + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + /* + * If it's not the same as the old data block number, + * need to pitch the old one and read the new one. 
+ */ + if (newdb != curdb) { + if (dbp) + xfs_da_brelse(tp, dbp); + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &dbp, XFS_DATA_FORK); + if (error) { + xfs_da_brelse(tp, lbp); + return error; + } + xfs_dir2_data_check(dp, dbp); + curdb = newdb; + } + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)dbp->data + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + /* + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + *indexp = index; + /* case exact match: return the current buffer. */ + if (cmp == XFS_CMP_EXACT) { + *dbpp = dbp; + return 0; + } + cidb = curdb; + } + } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or remove). + * If a case-insensitive match was found earlier, re-read the + * appropriate data block if required and return it. + */ + if (args->cmpresult == XFS_CMP_CASE) { + ASSERT(cidb != -1); + if (cidb != curdb) { + xfs_da_brelse(tp, dbp); + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, cidb), + -1, &dbp, XFS_DATA_FORK); + if (error) { + xfs_da_brelse(tp, lbp); + return error; + } + } + *dbpp = dbp; + return 0; + } + /* + * No match found, return ENOENT. + */ + ASSERT(cidb == -1); + if (dbp) + xfs_da_brelse(tp, dbp); + xfs_da_brelse(tp, lbp); + return XFS_ERROR(ENOENT); +} + +/* + * Remove an entry from a leaf format directory. + */ +int /* error */ +xfs_dir2_leaf_removename( + xfs_da_args_t *args) /* operation arguments */ +{ + __be16 *bestsp; /* leaf block best freespace */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_db_t db; /* data block number */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry structure */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_db_t i; /* temporary data block # */ + int index; /* index into leaf entries */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_dir2_data_off_t oldbest; /* old value of best free */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leaf_removename(args); + + /* + * Lookup the leaf entry, get the leaf and data blocks read in. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = lbp->data; + data = dbp->data; + xfs_dir2_data_check(dp, dbp); + /* + * Point to the leaf entry, use that to point to the data entry. + */ + lep = &leaf->ents[index]; + db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + dep = (xfs_dir2_data_entry_t *) + ((char *)data + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + needscan = needlog = 0; + oldbest = be16_to_cpu(data->hdr.bestfree[0].length); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + ASSERT(be16_to_cpu(bestsp[db]) == oldbest); + /* + * Mark the former data entry unused. 
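+ * Removal never shuffles the live leaf entries around: the data entry's
+ * space is returned to the data block's free list, the leaf entry is just
+ * marked stale by storing the null dataptr in it, and the bests entry for
+ * the block is updated only if its longest free span changed.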
+	 */
+	xfs_dir2_data_make_free(tp, dbp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)data),
+		xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+	/*
+	 * We just mark the leaf entry stale by putting a null in it.
+	 */
+	be16_add_cpu(&leaf->hdr.stale, 1);
+	xfs_dir2_leaf_log_header(tp, lbp);
+	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_leaf_log_ents(tp, lbp, index, index);
+	/*
+	 * Scan the freespace in the data block again if necessary,
+	 * log the data block header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	/*
+	 * If the longest freespace in the data block has changed,
+	 * put the new value in the bests table and log that.
+	 */
+	if (be16_to_cpu(data->hdr.bestfree[0].length) != oldbest) {
+		bestsp[db] = data->hdr.bestfree[0].length;
+		xfs_dir2_leaf_log_bests(tp, lbp, db, db);
+	}
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * If the data block is now empty then get rid of the data block.
+	 */
+	if (be16_to_cpu(data->hdr.bestfree[0].length) ==
+	    mp->m_dirblksize - (uint)sizeof(data->hdr)) {
+		ASSERT(db != mp->m_dirdatablk);
+		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+			/*
+			 * Nope, can't get rid of it because it caused
+			 * allocation of a bmap btree block to do so.
+			 * Just go on, returning success, leaving the
+			 * empty block in place.
+			 */
+			if (error == ENOSPC && args->total == 0) {
+				xfs_da_buf_done(dbp);
+				error = 0;
+			}
+			xfs_dir2_leaf_check(dp, lbp);
+			xfs_da_buf_done(lbp);
+			return error;
+		}
+		dbp = NULL;
+		/*
+		 * If this is the last data block then compact the
+		 * bests table by getting rid of entries.
+		 */
+		if (db == be32_to_cpu(ltp->bestcount) - 1) {
+			/*
+			 * Look for the last active entry (i).
+			 */
+			for (i = db - 1; i > 0; i--) {
+				if (be16_to_cpu(bestsp[i]) != NULLDATAOFF)
+					break;
+			}
+			/*
+			 * Copy the table down so inactive entries at the
+			 * end are removed.
+			 */
+			memmove(&bestsp[db - i], bestsp,
+				(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+			be32_add_cpu(&ltp->bestcount, -(db - i));
+			xfs_dir2_leaf_log_tail(tp, lbp);
+			xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+		} else
+			bestsp[db] = cpu_to_be16(NULLDATAOFF);
+	}
+	/*
+	 * If the data block was not the first one, drop it.
+	 */
+	else if (db != mp->m_dirdatablk && dbp != NULL) {
+		xfs_da_buf_done(dbp);
+		dbp = NULL;
+	}
+	xfs_dir2_leaf_check(dp, lbp);
+	/*
+	 * See if we can convert to block form.
+	 */
+	return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int						/* error */
+xfs_dir2_leaf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index of leaf entry */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	trace_xfs_dir2_leaf_replace(args);
+
+	/*
+	 * Look up the entry.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	leaf = lbp->data;
+	/*
+	 * Point to the leaf entry, get data address from it.
+	 */
+	lep = &leaf->ents[index];
+	/*
+	 * Point to the data entry.
+ */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->data + + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + ASSERT(args->inumber != be64_to_cpu(dep->inumber)); + /* + * Put the new inode number in, log it. + */ + dep->inumber = cpu_to_be64(args->inumber); + tp = args->trans; + xfs_dir2_data_log_entry(tp, dbp, dep); + xfs_da_buf_done(dbp); + xfs_dir2_leaf_check(dp, lbp); + xfs_da_brelse(tp, lbp); + return 0; +} + +/* + * Return index in the leaf block (lbp) which is either the first + * one with this hash value, or if there are none, the insert point + * for that hash value. + */ +int /* index value */ +xfs_dir2_leaf_search_hash( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp) /* leaf buffer */ +{ + xfs_dahash_t hash=0; /* hash from this entry */ + xfs_dahash_t hashwant; /* hash value looking for */ + int high; /* high leaf index */ + int low; /* low leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int mid=0; /* current leaf index */ + + leaf = lbp->data; +#ifndef __KERNEL__ + if (!leaf->hdr.count) + return 0; +#endif + /* + * Note, the table cannot be empty, so we have to go through the loop. + * Binary search the leaf entries looking for our hash value. + */ + for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1, + hashwant = args->hashval; + low <= high; ) { + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant) + break; + if (hash < hashwant) + low = mid + 1; + else + high = mid - 1; + } + /* + * Found one, back up through all the equal hash values. + */ + if (hash == hashwant) { + while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) { + mid--; + } + } + /* + * Need to point to an entry higher than ours. + */ + else if (hash < hashwant) + mid++; + return mid; +} + +/* + * Trim off a trailing data block. We know it's empty since the leaf + * freespace table says so. + */ +int /* error */ +xfs_dir2_leaf_trim_data( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp, /* leaf buffer */ + xfs_dir2_db_t db) /* data block number */ +{ + __be16 *bestsp; /* leaf bests table */ +#ifdef DEBUG + xfs_dir2_data_t *data; /* data block structure */ +#endif + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Read the offending data block. We need its buffer. + */ + if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, + XFS_DATA_FORK))) { + return error; + } +#ifdef DEBUG + data = dbp->data; + ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); +#endif + /* this seems to be an error + * data is only valid if DEBUG is defined? + * RMC 09/08/1999 + */ + + leaf = lbp->data; + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) == + mp->m_dirblksize - (uint)sizeof(data->hdr)); + ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); + /* + * Get rid of the data block. + */ + if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { + ASSERT(error != ENOSPC); + xfs_da_brelse(tp, dbp); + return error; + } + /* + * Eliminate the last bests entry from the table. 
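+ * The bests array is addressed backward from the leaf tail, so shrinking
+ * bestcount moves the start of the array up by one slot; the memmove below
+ * slides the surviving entries up into that new position.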
+	 */
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	be32_add_cpu(&ltp->bestcount, -1);
+	memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+	xfs_dir2_leaf_log_tail(tp, lbp);
+	xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+	return 0;
+}
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int						/* error */
+xfs_dir2_node_to_leaf(
+	xfs_da_state_t		*state)		/* directory operation state */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dabuf_t		*fbp;		/* buffer for freespace block */
+	xfs_fileoff_t		fo;		/* freespace file offset */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_dabuf_t		*lbp;		/* buffer for leaf block */
+	xfs_dir2_leaf_tail_t	*ltp;		/* tail of leaf structure */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			rval;		/* successful free trim? */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	/*
+	 * There's more than a leaf level in the btree, so there must
+	 * be multiple leafn blocks. Give up.
+	 */
+	if (state->path.active > 1)
+		return 0;
+	args = state->args;
+
+	trace_xfs_dir2_node_to_leaf(args);
+
+	mp = state->mp;
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Get the last offset in the file.
+	 */
+	if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	fo -= mp->m_dirblkfsbs;
+	/*
+	 * If there are freespace blocks other than the first one,
+	 * take this opportunity to remove trailing empty freespace blocks
+	 * that may have been left behind during no-space-reservation
+	 * operations.
+	 */
+	while (fo > mp->m_dirfreeblk) {
+		if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+			return error;
+		}
+		if (rval)
+			fo -= mp->m_dirblkfsbs;
+		else
+			return 0;
+	}
+	/*
+	 * Now find the block just before the freespace block.
+	 */
+	if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	/*
+	 * If it's not the single leaf block, give up.
+	 */
+	if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
+		return 0;
+	lbp = state->path.blk[0].bp;
+	leaf = lbp->data;
+	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Read the freespace block.
+	 */
+	if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+	free = fbp->data;
+	ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+	ASSERT(!free->hdr.firstdb);
+	/*
+	 * Now see if the leafn and free data will fit in a leaf1.
+	 * If not, release the buffer and give up.
+	 */
+	if ((uint)sizeof(leaf->hdr) +
+	    (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)) * (uint)sizeof(leaf->ents[0]) +
+	    be32_to_cpu(free->hdr.nvalid) * (uint)sizeof(leaf->bests[0]) +
+	    (uint)sizeof(leaf->tail) >
+	    mp->m_dirblksize) {
+		xfs_da_brelse(tp, fbp);
+		return 0;
+	}
+	/*
+	 * If the leaf has any stale entries in it, compress them out.
+	 * The compact routine will log the header.
+	 */
+	if (be16_to_cpu(leaf->hdr.stale))
+		xfs_dir2_leaf_compact(args, lbp);
+	else
+		xfs_dir2_leaf_log_header(tp, lbp);
+	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+	/*
+	 * Set up the leaf tail from the freespace block.
+	 */
+	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+	ltp->bestcount = free->hdr.nvalid;
+	/*
+	 * Set up the leaf bests table.
+ */ + memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests, + be32_to_cpu(ltp->bestcount) * sizeof(leaf->bests[0])); + xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir2_leaf_log_tail(tp, lbp); + xfs_dir2_leaf_check(dp, lbp); + /* + * Get rid of the freespace block. + */ + error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp); + if (error) { + /* + * This can't fail here because it can only happen when + * punching out the middle of an extent, and this is an + * isolated block. + */ + ASSERT(error != ENOSPC); + return error; + } + fbp = NULL; + /* + * Now see if we can convert the single-leaf directory + * down to a block form directory. + * This routine always kills the dabuf for the leaf, so + * eliminate it from the path. + */ + error = xfs_dir2_leaf_to_block(args, lbp, NULL); + state->path.blk[0].bp = NULL; + return error; +} diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h new file mode 100644 index 00000000..6c9539f0 --- /dev/null +++ b/fs/xfs/xfs_dir2_leaf.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_LEAF_H__ +#define __XFS_DIR2_LEAF_H__ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Offset of the leaf/node space. First block in this space + * is the btree root. + */ +#define XFS_DIR2_LEAF_SPACE 1 +#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_LEAF_FIRSTDB(mp) \ + xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET) + +/* + * Offset in data space of a data entry. + */ +typedef __uint32_t xfs_dir2_dataptr_t; +#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) +#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) + +/* + * Leaf block header. + */ +typedef struct xfs_dir2_leaf_hdr { + xfs_da_blkinfo_t info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ +} xfs_dir2_leaf_hdr_t; + +/* + * Leaf block entry. + */ +typedef struct xfs_dir2_leaf_entry { + __be32 hashval; /* hash value of name */ + __be32 address; /* address of data entry */ +} xfs_dir2_leaf_entry_t; + +/* + * Leaf block tail. + */ +typedef struct xfs_dir2_leaf_tail { + __be32 bestcount; +} xfs_dir2_leaf_tail_t; + +/* + * Leaf block. + * bests and tail are at the end of the block for single-leaf only + * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC). + */ +typedef struct xfs_dir2_leaf { + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t ents[1]; /* entries */ + /* ... */ + xfs_dir2_data_off_t bests[1]; /* best free counts */ + xfs_dir2_leaf_tail_t tail; /* leaf tail */ +} xfs_dir2_leaf_t; + +/* + * DB blocks here are logical directory block numbers, not filesystem blocks. 
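+ * A dataptr is the byte offset of an entry within the directory's data
+ * space, shifted down by the usual 8-byte data alignment
+ * (XFS_DIR2_DATA_ALIGN_LOG); equivalently it packs a DB block number with
+ * an aligned in-block offset.  For example, assuming 4096-byte filesystem
+ * blocks and sb_dirblklog == 0, directory byte offset 8208 is DB block 2,
+ * in-block offset 16, dataptr 8208 >> 3 == 1026.  The helpers below are
+ * just these shifts and masks; a usage sketch of the round trip
+ * (hypothetical helper, never compiled) is:
+ */
+#if 0
+static void xfs_dir2_dataptr_roundtrip_sketch(struct xfs_mount *mp)
+{
+	xfs_dir2_dataptr_t	dp;
+
+	/* pack directory block 2, byte offset 16 into a dataptr ... */
+	dp = xfs_dir2_db_off_to_dataptr(mp, 2, 16);
+	/* ... and get the same pieces back out */
+	ASSERT(xfs_dir2_dataptr_to_db(mp, dp) == 2);
+	ASSERT(xfs_dir2_dataptr_to_off(mp, dp) == 16);
+}
+#endif
+/*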
+ */ + +static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) +{ + return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / + (uint)sizeof(xfs_dir2_leaf_entry_t)); +} + +/* + * Get address of the bestcount field in the single-leaf block. + */ +static inline xfs_dir2_leaf_tail_t * +xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp) +{ + return (xfs_dir2_leaf_tail_t *) + ((char *)(lp) + (mp)->m_dirblksize - + (uint)sizeof(xfs_dir2_leaf_tail_t)); +} + +/* + * Get address of the bests array in the single-leaf block. + */ +static inline __be16 * +xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp) +{ + return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); +} + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)((by) >> \ + ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)((by) & \ + ((1 << ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)(db) << \ + ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o); +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0); +} + +/* + * Function declarations. 
+ */ +extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, + struct xfs_dabuf *dbp); +extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); +extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, + struct xfs_dabuf *bp); +extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, + int *lowstalep, int *highstalep, + int *lowlogp, int *highlogp); +extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, + size_t bufsize, xfs_off_t *offset, + filldir_t filldir); +extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_dabuf **bpp, int magic); +extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, + int first, int last); +extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, + struct xfs_dabuf *bp); +extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); +extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, + struct xfs_dabuf *lbp); +extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, + struct xfs_dabuf *lbp, xfs_dir2_db_t db); +extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); + +#endif /* __XFS_DIR2_LEAF_H__ */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c new file mode 100644 index 00000000..a0aab7d3 --- /dev/null +++ b/fs/xfs/xfs_dir2_node.c @@ -0,0 +1,2076 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Function declarations. 
+ */ +static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp); +static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index); +#ifdef DEBUG +static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +#else +#define xfs_dir2_leafn_check(dp, bp) +#endif +static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s, + int start_s, xfs_dabuf_t *bp_d, int start_d, + int count); +static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp, + int index, xfs_da_state_blk_t *dblk, + int *rval); +static int xfs_dir2_node_addname_int(xfs_da_args_t *args, + xfs_da_state_blk_t *fblk); + +/* + * Log entries from a freespace block. + */ +STATIC void +xfs_dir2_free_log_bests( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* freespace buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + xfs_dir2_free_t *free; /* freespace structure */ + + free = bp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + xfs_da_log_buf(tp, bp, + (uint)((char *)&free->bests[first] - (char *)free), + (uint)((char *)&free->bests[last] - (char *)free + + sizeof(free->bests[0]) - 1)); +} + +/* + * Log header from a freespace block. + */ +static void +xfs_dir2_free_log_header( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* freespace buffer */ +{ + xfs_dir2_free_t *free; /* freespace structure */ + + free = bp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), + (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); +} + +/* + * Convert a leaf-format directory to a node-format directory. + * We need to change the magic number of the leaf block, and copy + * the freespace table out of the leaf block into its own block. + */ +int /* error */ +xfs_dir2_leaf_to_node( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp) /* leaf buffer */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + xfs_dabuf_t *fbp; /* freespace buffer */ + xfs_dir2_db_t fdb; /* freespace block number */ + xfs_dir2_free_t *free; /* freespace structure */ + __be16 *from; /* pointer to freespace entry */ + int i; /* leaf freespace index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int n; /* count of live freespc ents */ + xfs_dir2_data_off_t off; /* freespace entry value */ + __be16 *to; /* pointer to freespace entry */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leaf_to_node(args); + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Add a freespace block to the directory. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) { + return error; + } + ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp)); + /* + * Get the buffer for the new freespace block. + */ + if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, + XFS_DATA_FORK))) { + return error; + } + ASSERT(fbp != NULL); + free = fbp->data; + leaf = lbp->data; + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + /* + * Initialize the freespace block header. 
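+ * firstdb is the first data block this free block describes (0 here, since
+ * it is the first free block), nvalid is the number of bests entries
+ * carried over from the leaf tail, and nused (set below) counts the ones
+ * that are not NULLDATAOFF.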
+ */ + free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); + free->hdr.firstdb = 0; + ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize); + free->hdr.nvalid = ltp->bestcount; + /* + * Copy freespace entries from the leaf block to the new block. + * Count active entries. + */ + for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests; + i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { + if ((off = be16_to_cpu(*from)) != NULLDATAOFF) + n++; + *to = cpu_to_be16(off); + } + free->hdr.nused = cpu_to_be32(n); + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + /* + * Log everything. + */ + xfs_dir2_leaf_log_header(tp, lbp); + xfs_dir2_free_log_header(tp, fbp); + xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); + xfs_da_buf_done(fbp); + xfs_dir2_leafn_check(dp, lbp); + return 0; +} + +/* + * Add a leaf entry to a leaf block in a node-form directory. + * The other work necessary is done from the caller. + */ +static int /* error */ +xfs_dir2_leafn_add( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int index) /* insertion pt for new entry */ +{ + int compact; /* compacting stale leaves */ + xfs_inode_t *dp; /* incore directory inode */ + int highstale; /* next stale entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int lfloghigh; /* high leaf entry logging */ + int lfloglow; /* low leaf entry logging */ + int lowstale; /* previous stale entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leafn_add(args, index); + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + leaf = bp->data; + + /* + * Quick check just to make sure we are not going to index + * into other peoples memory + */ + if (index < 0) + return XFS_ERROR(EFSCORRUPTED); + + /* + * If there are already the maximum number of leaf entries in + * the block, if there are no stale entries it won't fit. + * Caller will do a split. If there are stale entries we'll do + * a compact. + */ + + if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) { + if (!leaf->hdr.stale) + return XFS_ERROR(ENOSPC); + compact = be16_to_cpu(leaf->hdr.stale) > 1; + } else + compact = 0; + ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval); + ASSERT(index == be16_to_cpu(leaf->hdr.count) || + be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); + + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + + /* + * Compact out all but one stale leaf entry. Leaves behind + * the entry closest to index. + */ + if (compact) { + xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale, + &lfloglow, &lfloghigh); + } + /* + * Set impossible logging indices for this case. + */ + else if (leaf->hdr.stale) { + lfloglow = be16_to_cpu(leaf->hdr.count); + lfloghigh = -1; + } + /* + * No stale entries, just insert a space for the new entry. + */ + if (!leaf->hdr.stale) { + lep = &leaf->ents[index]; + if (index < be16_to_cpu(leaf->hdr.count)) + memmove(lep + 1, lep, + (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep)); + lfloglow = index; + lfloghigh = be16_to_cpu(leaf->hdr.count); + be16_add_cpu(&leaf->hdr.count, 1); + } + /* + * There are stale entries. We'll use one for the new entry. + */ + else { + /* + * If we didn't do a compact then we need to figure out + * which stale entry will be used. + */ + if (compact == 0) { + /* + * Find first stale entry before our insertion point. 
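+ * (A matching scan for the first stale entry at or after the insertion
+ * point follows; whichever stale slot needs fewer live entries shifted,
+ * index - lowstale - 1 below versus highstale - index above, is the one
+ * that gets reused for the new entry.)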
+ */ + for (lowstale = index - 1; + lowstale >= 0 && + be32_to_cpu(leaf->ents[lowstale].address) != + XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + /* + * Find next stale entry after insertion point. + * Stop looking if the answer would be worse than + * lowstale already found. + */ + for (highstale = index; + highstale < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(leaf->ents[highstale].address) != + XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || + index - lowstale - 1 >= highstale - index); + highstale++) + continue; + } + /* + * Using the low stale entry. + * Shift entries up toward the stale slot. + */ + if (lowstale >= 0 && + (highstale == be16_to_cpu(leaf->hdr.count) || + index - lowstale - 1 < highstale - index)) { + ASSERT(be32_to_cpu(leaf->ents[lowstale].address) == + XFS_DIR2_NULL_DATAPTR); + ASSERT(index - lowstale - 1 >= 0); + if (index - lowstale - 1 > 0) + memmove(&leaf->ents[lowstale], + &leaf->ents[lowstale + 1], + (index - lowstale - 1) * sizeof(*lep)); + lep = &leaf->ents[index - 1]; + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(index - 1, lfloghigh); + } + /* + * Using the high stale entry. + * Shift entries down toward the stale slot. + */ + else { + ASSERT(be32_to_cpu(leaf->ents[highstale].address) == + XFS_DIR2_NULL_DATAPTR); + ASSERT(highstale - index >= 0); + if (highstale - index > 0) + memmove(&leaf->ents[index + 1], + &leaf->ents[index], + (highstale - index) * sizeof(*lep)); + lep = &leaf->ents[index]; + lfloglow = MIN(index, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + be16_add_cpu(&leaf->hdr.stale, -1); + } + /* + * Insert the new entry, log everything. + */ + lep->hashval = cpu_to_be32(args->hashval); + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, + args->blkno, args->index)); + xfs_dir2_leaf_log_header(tp, bp); + xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); + xfs_dir2_leafn_check(dp, bp); + return 0; +} + +#ifdef DEBUG +/* + * Check internal consistency of a leafn block. + */ +void +xfs_dir2_leafn_check( + xfs_inode_t *dp, /* incore directory inode */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + int i; /* leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int stale; /* count of stale leaves */ + + leaf = bp->data; + mp = dp->i_mount; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); + for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { + if (i + 1 < be16_to_cpu(leaf->hdr.count)) { + ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= + be32_to_cpu(leaf->ents[i + 1].hashval)); + } + if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + stale++; + } + ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); +} +#endif /* DEBUG */ + +/* + * Return the last hash value in the leaf. + * Stale entries are ok. + */ +xfs_dahash_t /* hash value */ +xfs_dir2_leafn_lasthash( + xfs_dabuf_t *bp, /* leaf buffer */ + int *count) /* count of entries in leaf */ +{ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + if (count) + *count = be16_to_cpu(leaf->hdr.count); + if (!leaf->hdr.count) + return 0; + return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval); +} + +/* + * Look up a leaf entry for space to add a name in a node-format leaf block. + * The extrablk in state is a freespace block. 
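+ * For each leaf entry with the right hash value, the candidate data block
+ * is mapped to the free block that describes it and that free block's
+ * bests entry is checked against the space the new name needs; the first
+ * data block with enough room wins.  Either way the free block buffer is
+ * handed back through state->extrablk, and ENOENT is the normal "not
+ * found, insert here" return with *indexp set to the insertion point.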
+ */ +STATIC int +xfs_dir2_leafn_lookup_for_addname( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_db_t curfdb = -1; /* current free block number */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int fi; /* free entry index */ + xfs_dir2_free_t *free = NULL; /* free block structure */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int length; /* length of new data entry */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_dir2_db_t newfdb; /* new free block number */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); +#ifdef __KERNEL__ + ASSERT(be16_to_cpu(leaf->hdr.count) > 0); +#endif + xfs_dir2_leafn_check(dp, bp); + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) { + /* If so, it's a free block buffer, get the block number. */ + curbp = state->extrablk.bp; + curfdb = state->extrablk.blkno; + free = curbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + } + length = xfs_dir2_data_entsize(args->namelen); + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Pull the data block number from the entry. + */ + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + /* + * For addname, we're looking for a place to put the new entry. + * We want to use a data block with an entry of equal + * hash value to ours if there is one with room. + * + * If this block isn't the data block we already have + * in hand, take a look at it. + */ + if (newdb != curdb) { + curdb = newdb; + /* + * Convert the data block to the free block + * holding its freespace information. + */ + newfdb = xfs_dir2_db_to_fdb(mp, newdb); + /* + * If it's not the one we have in hand, read it in. + */ + if (newfdb != curfdb) { + /* + * If we had one before, drop it. + */ + if (curbp) + xfs_da_brelse(tp, curbp); + /* + * Read the free block. + */ + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newfdb), + -1, &curbp, XFS_DATA_FORK); + if (error) + return error; + free = curbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == + XFS_DIR2_FREE_MAGIC); + ASSERT((be32_to_cpu(free->hdr.firstdb) % + XFS_DIR2_MAX_FREE_BESTS(mp)) == 0); + ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); + ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + + be32_to_cpu(free->hdr.nvalid)); + } + /* + * Get the index for our entry. + */ + fi = xfs_dir2_db_to_fdindex(mp, curdb); + /* + * If it has room, return it. 
+ */ + if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { + XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", + XFS_ERRLEVEL_LOW, mp); + if (curfdb != newfdb) + xfs_da_brelse(tp, curbp); + return XFS_ERROR(EFSCORRUPTED); + } + curfdb = newfdb; + if (be16_to_cpu(free->bests[fi]) >= length) + goto out; + } + } + /* Didn't find any space */ + fi = -1; +out: + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + if (curbp) { + /* Giving back a free block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = fi; + state->extrablk.blkno = curfdb; + state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + } else { + state->extravalid = 0; + } + /* + * Return the index, that will be the insertion point. + */ + *indexp = index; + return XFS_ERROR(ENOENT); +} + +/* + * Look up a leaf entry in a node-format leaf block. + * The extrablk in state a data block. + */ +STATIC int +xfs_dir2_leafn_lookup_for_entry( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); +#ifdef __KERNEL__ + ASSERT(be16_to_cpu(leaf->hdr.count) > 0); +#endif + xfs_dir2_leafn_check(dp, bp); + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) { + curbp = state->extrablk.bp; + curdb = state->extrablk.blkno; + } + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && + be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Pull the data block number from the entry. + */ + newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + /* + * Not adding a new entry, so we really want to find + * the name given to us. + * + * If it's a different data block, go get it. + */ + if (newdb != curdb) { + /* + * If we had a block before that we aren't saving + * for a CI name, drop it + */ + if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || + curdb != state->extrablk.blkno)) + xfs_da_brelse(tp, curbp); + /* + * If needing the block that is saved with a CI match, + * use it otherwise read in the new data block. + */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + newdb == state->extrablk.blkno) { + ASSERT(state->extravalid); + curbp = state->extrablk.bp; + } else { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &curbp, XFS_DATA_FORK); + if (error) + return error; + } + xfs_dir2_data_check(dp, curbp); + curdb = newdb; + } + /* + * Point to the data entry. 
+ */ + dep = (xfs_dir2_data_entry_t *)((char *)curbp->data + + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + /* + * Compare the entry and if it's an exact match, return + * EEXIST immediately. If it's the first case-insensitive + * match, store the block & inode number and continue looking. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + /* If there is a CI match block, drop it */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + curdb != state->extrablk.blkno) + xfs_da_brelse(tp, state->extrablk.bp); + args->cmpresult = cmp; + args->inumber = be64_to_cpu(dep->inumber); + *indexp = index; + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.blkno = curdb; + state->extrablk.index = (int)((char *)dep - + (char *)curbp->data); + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); + } + } + ASSERT(index == be16_to_cpu(leaf->hdr.count) || + (args->op_flags & XFS_DA_OP_OKNOENT)); + if (curbp) { + if (args->cmpresult == XFS_CMP_DIFFERENT) { + /* Giving back last used data block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = -1; + state->extrablk.blkno = curdb; + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + } else { + /* If the curbp is not the CI match block, drop it */ + if (state->extrablk.bp != curbp) + xfs_da_brelse(tp, curbp); + } + } else { + state->extravalid = 0; + } + *indexp = index; + return XFS_ERROR(ENOENT); +} + +/* + * Look up a leaf entry in a node-format leaf block. + * If this is an addname then the extrablk in state is a freespace block, + * otherwise it's a data block. + */ +int +xfs_dir2_leafn_lookup_int( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + if (args->op_flags & XFS_DA_OP_ADDNAME) + return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp, + state); + return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state); +} + +/* + * Move count leaf entries from source to destination leaf. + * Log entries and headers. Stale entries are preserved. + */ +static void +xfs_dir2_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp_s, /* source leaf buffer */ + int start_s, /* source leaf index */ + xfs_dabuf_t *bp_d, /* destination leaf buffer */ + int start_d, /* destination leaf index */ + int count) /* count of leaves to copy */ +{ + xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */ + xfs_dir2_leaf_t *leaf_s; /* source leaf structure */ + int stale; /* count stale leaves copied */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); + + /* + * Silently return if nothing to do. + */ + if (count == 0) { + return; + } + tp = args->trans; + leaf_s = bp_s->data; + leaf_d = bp_d->data; + /* + * If the destination index is not the end of the current + * destination leaf entries, open up a hole in the destination + * to hold the new entries. 
+ */ + if (start_d < be16_to_cpu(leaf_d->hdr.count)) { + memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d], + (be16_to_cpu(leaf_d->hdr.count) - start_d) * + sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count, + count + be16_to_cpu(leaf_d->hdr.count) - 1); + } + /* + * If the source has stale leaves, count the ones in the copy range + * so we can update the header correctly. + */ + if (leaf_s->hdr.stale) { + int i; /* temp leaf index */ + + for (i = start_s, stale = 0; i < start_s + count; i++) { + if (be32_to_cpu(leaf_s->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + stale++; + } + } else + stale = 0; + /* + * Copy the leaf entries from source to destination. + */ + memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + /* + * If there are source entries after the ones we copied, + * delete the ones we copied by sliding the next ones down. + */ + if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) { + memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); + } + /* + * Update the headers and log them. + */ + be16_add_cpu(&leaf_s->hdr.count, -(count)); + be16_add_cpu(&leaf_s->hdr.stale, -(stale)); + be16_add_cpu(&leaf_d->hdr.count, count); + be16_add_cpu(&leaf_d->hdr.stale, stale); + xfs_dir2_leaf_log_header(tp, bp_s); + xfs_dir2_leaf_log_header(tp, bp_d); + xfs_dir2_leafn_check(args->dp, bp_s); + xfs_dir2_leafn_check(args->dp, bp_d); +} + +/* + * Determine the sort order of two leaf blocks. + * Returns 1 if both are valid and leaf2 should be before leaf1, else 0. + */ +int /* sort order */ +xfs_dir2_leafn_order( + xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */ + xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */ +{ + xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ + xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ + + leaf1 = leaf1_bp->data; + leaf2 = leaf2_bp->data; + ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + if (be16_to_cpu(leaf1->hdr.count) > 0 && + be16_to_cpu(leaf2->hdr.count) > 0 && + (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) || + be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) < + be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval))) + return 1; + return 0; +} + +/* + * Rebalance leaf entries between two leaf blocks. + * This is actually only called when the second block is new, + * though the code deals with the general case. + * A new entry will be inserted in one of the blocks, and that + * entry is taken into account when balancing. + */ +static void +xfs_dir2_leafn_rebalance( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *blk1, /* first btree block */ + xfs_da_state_blk_t *blk2) /* second btree block */ +{ + xfs_da_args_t *args; /* operation arguments */ + int count; /* count (& direction) leaves */ + int isleft; /* new goes in left leaf */ + xfs_dir2_leaf_t *leaf1; /* first leaf structure */ + xfs_dir2_leaf_t *leaf2; /* second leaf structure */ + int mid; /* midpoint leaf index */ +#ifdef DEBUG + int oldstale; /* old count of stale leaves */ +#endif + int oldsum; /* old total leaf count */ + int swap; /* swapped leaf blocks */ + + args = state->args; + /* + * If the block order is wrong, swap the arguments. 
+ */ + if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) { + xfs_da_state_blk_t *tmp; /* temp for block swap */ + + tmp = blk1; + blk1 = blk2; + blk2 = tmp; + } + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); +#ifdef DEBUG + oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); +#endif + mid = oldsum >> 1; + /* + * If the old leaf count was odd then the new one will be even, + * so we need to divide the new count evenly. + */ + if (oldsum & 1) { + xfs_dahash_t midhash; /* middle entry hash value */ + + if (mid >= be16_to_cpu(leaf1->hdr.count)) + midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval); + else + midhash = be32_to_cpu(leaf1->ents[mid].hashval); + isleft = args->hashval <= midhash; + } + /* + * If the old count is even then the new count is odd, so there's + * no preferred side for the new entry. + * Pick the left one. + */ + else + isleft = 1; + /* + * Calculate moved entry count. Positive means left-to-right, + * negative means right-to-left. Then move the entries. + */ + count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0); + if (count > 0) + xfs_dir2_leafn_moveents(args, blk1->bp, + be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count); + else if (count < 0) + xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp, + be16_to_cpu(leaf1->hdr.count), count); + ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum); + ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale); + /* + * Mark whether we're inserting into the old or new leaf. + */ + if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count)) + state->inleaf = swap; + else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count)) + state->inleaf = !swap; + else + state->inleaf = + swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count)); + /* + * Adjust the expected index for insertion. + */ + if (!state->inleaf) + blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + + /* + * Finally sanity check just to make sure we are not returning a + * negative index + */ + if(blk2->index < 0) { + state->inleaf = 1; + blk2->index = 0; + xfs_alert(args->dp->i_mount, + "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n", + __func__, blk1->index); + } +} + +/* + * Remove an entry from a node directory. + * This removes the leaf entry and the data entry, + * and updates the free block if necessary. 
+ */ +static int /* error */ +xfs_dir2_leafn_remove( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp, /* leaf buffer */ + int index, /* leaf entry index */ + xfs_da_state_blk_t *dblk, /* data block */ + int *rval) /* resulting block needs join */ +{ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_db_t db; /* data block number */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int longest; /* longest data free entry */ + int off; /* data block entry offset */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_trans_t *tp; /* transaction pointer */ + + trace_xfs_dir2_leafn_remove(args, index); + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + /* + * Point to the entry we're removing. + */ + lep = &leaf->ents[index]; + /* + * Extract the data block and offset from the entry. + */ + db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + ASSERT(dblk->blkno == db); + off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); + ASSERT(dblk->index == off); + /* + * Kill the leaf entry by marking it stale. + * Log the leaf block changes. + */ + be16_add_cpu(&leaf->hdr.stale, 1); + xfs_dir2_leaf_log_header(tp, bp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + xfs_dir2_leaf_log_ents(tp, bp, index, index); + /* + * Make the data entry free. Keep track of the longest freespace + * in the data block in case it changes. + */ + dbp = dblk->bp; + data = dbp->data; + dep = (xfs_dir2_data_entry_t *)((char *)data + off); + longest = be16_to_cpu(data->hdr.bestfree[0].length); + needlog = needscan = 0; + xfs_dir2_data_make_free(tp, dbp, off, + xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + /* + * Rescan the data block freespaces for bestfree. + * Log the data block header if needed. + */ + if (needscan) + xfs_dir2_data_freescan(mp, data, &needlog); + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_check(dp, dbp); + /* + * If the longest data block freespace changes, need to update + * the corresponding freeblock entry. + */ + if (longest < be16_to_cpu(data->hdr.bestfree[0].length)) { + int error; /* error return value */ + xfs_dabuf_t *fbp; /* freeblock buffer */ + xfs_dir2_db_t fdb; /* freeblock block number */ + int findex; /* index in freeblock entries */ + xfs_dir2_free_t *free; /* freeblock structure */ + int logfree; /* need to log free entry */ + + /* + * Convert the data block number to a free block, + * read in the free block. + */ + fdb = xfs_dir2_db_to_fdb(mp, db); + if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), + -1, &fbp, XFS_DATA_FORK))) { + return error; + } + free = fbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + ASSERT(be32_to_cpu(free->hdr.firstdb) == + XFS_DIR2_MAX_FREE_BESTS(mp) * + (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); + /* + * Calculate which entry we need to fix. + */ + findex = xfs_dir2_db_to_fdindex(mp, db); + longest = be16_to_cpu(data->hdr.bestfree[0].length); + /* + * If the data block is now empty we can get rid of it + * (usually). + */ + if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) { + /* + * Try to punch out the data block. 
+ */ + error = xfs_dir2_shrink_inode(args, db, dbp); + if (error == 0) { + dblk->bp = NULL; + data = NULL; + } + /* + * We can get ENOSPC if there's no space reservation. + * In this case just drop the buffer and some one else + * will eventually get rid of the empty block. + */ + else if (error == ENOSPC && args->total == 0) + xfs_da_buf_done(dbp); + else + return error; + } + /* + * If we got rid of the data block, we can eliminate that entry + * in the free block. + */ + if (data == NULL) { + /* + * One less used entry in the free table. + */ + be32_add_cpu(&free->hdr.nused, -1); + xfs_dir2_free_log_header(tp, fbp); + /* + * If this was the last entry in the table, we can + * trim the table size back. There might be other + * entries at the end referring to non-existent + * data blocks, get those too. + */ + if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { + int i; /* free entry index */ + + for (i = findex - 1; + i >= 0 && be16_to_cpu(free->bests[i]) == NULLDATAOFF; + i--) + continue; + free->hdr.nvalid = cpu_to_be32(i + 1); + logfree = 0; + } + /* + * Not the last entry, just punch it out. + */ + else { + free->bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + /* + * If there are no useful entries left in the block, + * get rid of the block if we can. + */ + if (!free->hdr.nused) { + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + } + /* + * Data block is not empty, just set the free entry to + * the new value. + */ + else { + free->bests[findex] = cpu_to_be16(longest); + logfree = 1; + } + /* + * Log the free entry that changed, unless we got rid of it. + */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + /* + * Drop the buffer if we still have it. + */ + if (fbp) + xfs_da_buf_done(fbp); + } + xfs_dir2_leafn_check(dp, bp); + /* + * Return indication of whether this leaf block is empty enough + * to justify trying to join it with a neighbor. + */ + *rval = + ((uint)sizeof(leaf->hdr) + + (uint)sizeof(leaf->ents[0]) * + (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) < + mp->m_dir_magicpct; + return 0; +} + +/* + * Split the leaf entries in the old block into old and new blocks. + */ +int /* error */ +xfs_dir2_leafn_split( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *oldblk, /* original block */ + xfs_da_state_blk_t *newblk) /* newly created block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dablk_t blkno; /* new leaf block number */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + + /* + * Allocate space for a new leaf node. + */ + args = state->args; + mp = args->dp->i_mount; + ASSERT(args != NULL); + ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); + error = xfs_da_grow_inode(args, &blkno); + if (error) { + return error; + } + /* + * Initialize the new leaf block. + */ + error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) { + return error; + } + newblk->blkno = blkno; + newblk->magic = XFS_DIR2_LEAFN_MAGIC; + /* + * Rebalance the entries across the two leaves, link the new + * block into the leaves. 
+ */ + xfs_dir2_leafn_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) { + return error; + } + /* + * Insert the new entry in the correct block. + */ + if (state->inleaf) + error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index); + else + error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index); + /* + * Update last hashval in each block since we added the name. + */ + oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL); + xfs_dir2_leafn_check(args->dp, oldblk->bp); + xfs_dir2_leafn_check(args->dp, newblk->bp); + return error; +} + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +int /* error */ +xfs_dir2_leafn_toosmall( + xfs_da_state_t *state, /* btree cursor */ + int *action) /* resulting action to take */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dablk_t blkno; /* leaf block number */ + xfs_dabuf_t *bp; /* leaf buffer */ + int bytes; /* bytes in use */ + int count; /* leaf live entry count */ + int error; /* error return value */ + int forward; /* sibling block direction */ + int i; /* sibling counter */ + xfs_da_blkinfo_t *info; /* leaf block header */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int rval; /* result from path_shift */ + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[state->path.active - 1]; + info = blk->bp->data; + ASSERT(be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC); + leaf = (xfs_dir2_leaf_t *)info; + count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); + bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]); + if (bytes > (state->blocksize >> 1)) { + /* + * Blk over 50%, don't try to join. + */ + *action = 0; + return 0; + } + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (info->forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, 0, + &rval); + if (error) + return error; + *action = rval ? 2 : 0; + return 0; + } + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. + */ + forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back); + for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { + blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back); + if (blkno == 0) + continue; + /* + * Read the sibling leaf block. 
+ */ + if ((error = + xfs_da_read_buf(state->args->trans, state->args->dp, blkno, + -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + /* + * Count bytes in the two blocks combined. + */ + leaf = (xfs_dir2_leaf_t *)info; + count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); + bytes = state->blocksize - (state->blocksize >> 2); + leaf = bp->data; + ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); + bytes -= count * (uint)sizeof(leaf->ents[0]); + /* + * Fits with at least 25% to spare. + */ + if (bytes >= 0) + break; + xfs_da_brelse(state->args->trans, bp); + } + /* + * Didn't like either block, give up. + */ + if (i >= 2) { + *action = 0; + return 0; + } + /* + * Done with the sibling leaf block here, drop the dabuf + * so path_shift can get it. + */ + xfs_da_buf_done(bp); + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) + error = xfs_da_path_shift(state, &state->altpath, forward, 0, + &rval); + else + error = xfs_da_path_shift(state, &state->path, forward, 0, + &rval); + if (error) { + return error; + } + *action = rval ? 0 : 1; + return 0; +} + +/* + * Move all the leaf entries from drop_blk to save_blk. + * This is done as part of a join operation. + */ +void +xfs_dir2_leafn_unbalance( + xfs_da_state_t *state, /* cursor */ + xfs_da_state_blk_t *drop_blk, /* dead block */ + xfs_da_state_blk_t *save_blk) /* surviving block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ + xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + + args = state->args; + ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); + drop_leaf = drop_blk->bp->data; + save_leaf = save_blk->bp->data; + ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + /* + * If there are any stale leaf entries, take this opportunity + * to purge them. + */ + if (drop_leaf->hdr.stale) + xfs_dir2_leaf_compact(args, drop_blk->bp); + if (save_leaf->hdr.stale) + xfs_dir2_leaf_compact(args, save_blk->bp); + /* + * Move the entries from drop to the appropriate end of save. + */ + drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval); + if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp)) + xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0, + be16_to_cpu(drop_leaf->hdr.count)); + else + xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, + be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count)); + save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval); + xfs_dir2_leafn_check(args->dp, save_blk->bp); +} + +/* + * Top-level node form directory addname routine. + */ +int /* error */ +xfs_dir2_node_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block for insert */ + int error; /* error return value */ + int rval; /* sub-return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_addname(args); + + /* + * Allocate and initialize the state (btree cursor). 
+ */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + /* + * Look up the name. We're not supposed to find it, but + * this gives us the insertion point. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) + rval = error; + if (rval != ENOENT) { + goto done; + } + /* + * Add the data entry to a data block. + * Extravalid is set to a freeblock found by lookup. + */ + rval = xfs_dir2_node_addname_int(args, + state->extravalid ? &state->extrablk : NULL); + if (rval) { + goto done; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + /* + * Add the new leaf entry. + */ + rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); + if (rval == 0) { + /* + * It worked, fix the hash values up the btree. + */ + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) + xfs_da_fixhashpath(state, &state->path); + } else { + /* + * It didn't work, we need to split the leaf block. + */ + if (args->total == 0) { + ASSERT(rval == ENOSPC); + goto done; + } + /* + * Split the leaf block and insert the new entry. + */ + rval = xfs_da_split(state); + } +done: + xfs_da_state_free(state); + return rval; +} + +/* + * Add the data entry for a node-format directory name addition. + * The leaf entry is added in xfs_dir2_leafn_add. + * We may enter with a freespace block that the lookup found. + */ +static int /* error */ +xfs_dir2_node_addname_int( + xfs_da_args_t *args, /* operation arguments */ + xfs_da_state_blk_t *fblk) /* optional freespace block */ +{ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_db_t dbno; /* data block number */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ + int error; /* error return value */ + xfs_dir2_db_t fbno; /* freespace block number */ + xfs_dabuf_t *fbp; /* freespace buffer */ + int findex; /* freespace entry index */ + xfs_dir2_free_t *free=NULL; /* freespace block structure */ + xfs_dir2_db_t ifbno; /* initial freespace block no */ + xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ + int length; /* length of the new entry */ + int logfree; /* need to log free entry */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + __be16 *tagp; /* data entry tag pointer */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + length = xfs_dir2_data_entsize(args->namelen); + /* + * If we came in with a freespace block that means that lookup + * found an entry with our hash value. This is the freespace + * block for that data entry. + */ + if (fblk) { + fbp = fblk->bp; + /* + * Remember initial freespace block number. + */ + ifbno = fblk->blkno; + free = fbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + findex = fblk->index; + /* + * This means the free entry showed that the data block had + * space for our entry, so we remembered it. + * Use that data block. + */ + if (findex >= 0) { + ASSERT(findex < be32_to_cpu(free->hdr.nvalid)); + ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(free->bests[findex]) >= length); + dbno = be32_to_cpu(free->hdr.firstdb) + findex; + } + /* + * The data block looked at didn't have enough room. 
+ * We'll start at the beginning of the freespace entries. + */ + else { + dbno = -1; + findex = 0; + } + } + /* + * Didn't come in with a freespace block, so don't have a data block. + */ + else { + ifbno = dbno = -1; + fbp = NULL; + findex = 0; + } + /* + * If we don't have a data block yet, we're going to scan the + * freespace blocks looking for one. Figure out what the + * highest freespace block number is. + */ + if (dbno == -1) { + xfs_fileoff_t fo; /* freespace block number */ + + if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) + return error; + lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo); + fbno = ifbno; + } + /* + * While we haven't identified a data block, search the freeblock + * data for a good data block. If we find a null freeblock entry, + * indicating a hole in the data blocks, remember that. + */ + while (dbno == -1) { + /* + * If we don't have a freeblock in hand, get the next one. + */ + if (fbp == NULL) { + /* + * Happens the first time through unless lookup gave + * us a freespace block to start with. + */ + if (++fbno == 0) + fbno = XFS_DIR2_FREE_FIRSTDB(mp); + /* + * If it's ifbno we already looked at it. + */ + if (fbno == ifbno) + fbno++; + /* + * If it's off the end we're done. + */ + if (fbno >= lastfbno) + break; + /* + * Read the block. There can be holes in the + * freespace blocks, so this might not succeed. + * This should be really rare, so there's no reason + * to avoid it. + */ + if ((error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), -2, &fbp, + XFS_DATA_FORK))) { + return error; + } + if (unlikely(fbp == NULL)) { + continue; + } + free = fbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + findex = 0; + } + /* + * Look at the current free entry. Is it good enough? + */ + if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF && + be16_to_cpu(free->bests[findex]) >= length) + dbno = be32_to_cpu(free->hdr.firstdb) + findex; + else { + /* + * Are we done with the freeblock? + */ + if (++findex == be32_to_cpu(free->hdr.nvalid)) { + /* + * Drop the block. + */ + xfs_da_brelse(tp, fbp); + fbp = NULL; + if (fblk && fblk->bp) + fblk->bp = NULL; + } + } + } + /* + * If we don't have a data block, we need to allocate one and make + * the freespace entries refer to it. + */ + if (unlikely(dbno == -1)) { + /* + * Not allowed to allocate, return failure. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { + /* + * Drop the freespace buffer unless it came from our + * caller. + */ + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return XFS_ERROR(ENOSPC); + } + /* + * Allocate and initialize the new data block. + */ + if (unlikely((error = xfs_dir2_grow_inode(args, + XFS_DIR2_DATA_SPACE, + &dbno)) || + (error = xfs_dir2_data_init(args, dbno, &dbp)))) { + /* + * Drop the freespace buffer unless it came from our + * caller. + */ + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return error; + } + /* + * If (somehow) we have a freespace block, get rid of it. + */ + if (fbp) + xfs_da_brelse(tp, fbp); + if (fblk && fblk->bp) + fblk->bp = NULL; + + /* + * Get the freespace block corresponding to the data block + * that was just allocated. + */ + fbno = xfs_dir2_db_to_fdb(mp, dbno); + if (unlikely(error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), -2, &fbp, + XFS_DATA_FORK))) { + xfs_da_buf_done(dbp); + return error; + } + /* + * If there wasn't a freespace block, the read will + * return a NULL fbp. 
Allocate and initialize a new one. + */ + if( fbp == NULL ) { + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno))) { + return error; + } + + if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { + xfs_alert(mp, + "%s: dir ino " "%llu needed freesp block %lld for\n" + " data block %lld, got %lld ifbno %llu lastfbno %d", + __func__, (unsigned long long)dp->i_ino, + (long long)xfs_dir2_db_to_fdb(mp, dbno), + (long long)dbno, (long long)fbno, + (unsigned long long)ifbno, lastfbno); + if (fblk) { + xfs_alert(mp, + " fblk 0x%p blkno %llu index %d magic 0x%x", + fblk, + (unsigned long long)fblk->blkno, + fblk->index, + fblk->magic); + } else { + xfs_alert(mp, " ... fblk is NULL"); + } + XFS_ERROR_REPORT("xfs_dir2_node_addname_int", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Get a buffer for the new block. + */ + if ((error = xfs_da_get_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + -1, &fbp, XFS_DATA_FORK))) { + return error; + } + ASSERT(fbp != NULL); + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. + */ + free = fbp->data; + free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); + free->hdr.firstdb = cpu_to_be32( + (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * + XFS_DIR2_MAX_FREE_BESTS(mp)); + free->hdr.nvalid = 0; + free->hdr.nused = 0; + } else { + free = fbp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + } + + /* + * Set the freespace block index from the data block number. + */ + findex = xfs_dir2_db_to_fdindex(mp, dbno); + /* + * If it's after the end of the current entries in the + * freespace block, extend that table. + */ + if (findex >= be32_to_cpu(free->hdr.nvalid)) { + ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp)); + free->hdr.nvalid = cpu_to_be32(findex + 1); + /* + * Tag new entry so nused will go up. + */ + free->bests[findex] = cpu_to_be16(NULLDATAOFF); + } + /* + * If this entry was for an empty data block + * (this should always be true) then update the header. + */ + if (be16_to_cpu(free->bests[findex]) == NULLDATAOFF) { + be32_add_cpu(&free->hdr.nused, 1); + xfs_dir2_free_log_header(tp, fbp); + } + /* + * Update the real value in the table. + * We haven't allocated the data entry yet so this will + * change again. + */ + data = dbp->data; + free->bests[findex] = data->hdr.bestfree[0].length; + logfree = 1; + } + /* + * We had a data block so we don't have to make a new one. + */ + else { + /* + * If just checking, we succeeded. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return 0; + } + /* + * Read the data block in. + */ + if (unlikely( + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), + -1, &dbp, XFS_DATA_FORK))) { + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return error; + } + data = dbp->data; + logfree = 0; + } + ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) >= length); + /* + * Point to the existing unused space. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset)); + needscan = needlog = 0; + /* + * Mark the first part of the unused space, inuse for us. + */ + xfs_dir2_data_use_free(tp, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length, + &needlog, &needscan); + /* + * Fill in the new entry and log it. 
+ */ + dep = (xfs_dir2_data_entry_t *)dup; + dep->inumber = cpu_to_be64(args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, dep->namelen); + tagp = xfs_dir2_data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)data); + xfs_dir2_data_log_entry(tp, dbp, dep); + /* + * Rescan the block for bestfree if needed. + */ + if (needscan) + xfs_dir2_data_freescan(mp, data, &needlog); + /* + * Log the data block header if needed. + */ + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + /* + * If the freespace entry is now wrong, update it. + */ + if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(data->hdr.bestfree[0].length)) { + free->bests[findex] = data->hdr.bestfree[0].length; + logfree = 1; + } + /* + * Log the freespace entry if needed. + */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + /* + * If the caller didn't hand us the freespace block, drop it. + */ + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + /* + * Return the data block and offset in args, then drop the data block. + */ + args->blkno = (xfs_dablk_t)dbno; + args->index = be16_to_cpu(*tagp); + xfs_da_buf_done(dbp); + return 0; +} + +/* + * Lookup an entry in a node-format directory. + * All the real work happens in xfs_da_node_lookup_int. + * The only real output is the inode number of the entry. + */ +int /* error */ +xfs_dir2_node_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + int error; /* error return value */ + int i; /* btree level */ + int rval; /* operation return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_lookup(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + /* + * Fill in the path to the entry in the cursor. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) + rval = error; + else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { + /* If a CI match, dup the actual name and return EEXIST */ + xfs_dir2_data_entry_t *dep; + + dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp-> + data + state->extrablk.index); + rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + } + /* + * Release the btree blocks and leaf block. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + /* + * Release the data block if we have it. + */ + if (state->extravalid && state->extrablk.bp) { + xfs_da_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Remove an entry from a node-format directory. + */ +int /* error */ +xfs_dir2_node_removename( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + int error; /* error return value */ + int rval; /* operation return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_removename(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + /* + * Look up the entry we're deleting, set up the cursor. 
+ */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) + rval = error; + /* + * Didn't find it, upper layer screwed up. + */ + if (rval != EEXIST) { + xfs_da_state_free(state); + return rval; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(state->extravalid); + /* + * Remove the leaf and data entries. + * Extrablk refers to the data block. + */ + error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, + &state->extrablk, &rval); + if (error) + return error; + /* + * Fix the hash values up the btree. + */ + xfs_da_fixhashpath(state, &state->path); + /* + * If we need to join leaf blocks, do it. + */ + if (rval && state->path.active > 1) + error = xfs_da_join(state); + /* + * If no errors so far, try conversion to leaf format. + */ + if (!error) + error = xfs_dir2_node_to_leaf(state); + xfs_da_state_free(state); + return error; +} + +/* + * Replace an entry's inode number in a node-format directory. + */ +int /* error */ +xfs_dir2_node_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_entry_t *dep; /* data entry changed */ + int error; /* error return value */ + int i; /* btree level */ + xfs_ino_t inum; /* new inode number */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ + int rval; /* internal return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_replace(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + inum = args->inumber; + /* + * Lookup the entry to change in the btree. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) { + rval = error; + } + /* + * It should be found, since the vnodeops layer has looked it up + * and locked it. But paranoia is good. + */ + if (rval == EEXIST) { + /* + * Find the leaf entry. + */ + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + leaf = blk->bp->data; + lep = &leaf->ents[blk->index]; + ASSERT(state->extravalid); + /* + * Point to the data entry. + */ + data = state->extrablk.bp->data; + ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); + dep = (xfs_dir2_data_entry_t *) + ((char *)data + + xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); + ASSERT(inum != be64_to_cpu(dep->inumber)); + /* + * Fill in the new inode number and log the entry. + */ + dep->inumber = cpu_to_be64(inum); + xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep); + rval = 0; + } + /* + * Didn't find it, and we're holding a data block. Drop it. + */ + else if (state->extravalid) { + xfs_da_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + /* + * Release all the buffers in the cursor. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Trim off a trailing empty freespace block. + * Return (in rvalp) 1 if we did it, 0 if not. 
+ */ +int /* error */ +xfs_dir2_node_trim_free( + xfs_da_args_t *args, /* operation arguments */ + xfs_fileoff_t fo, /* free block number */ + int *rvalp) /* out: did something */ +{ + xfs_dabuf_t *bp; /* freespace buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_free_t *free; /* freespace structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Read the freespace block. + */ + if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, + XFS_DATA_FORK))) { + return error; + } + + /* + * There can be holes in freespace. If fo is a hole, there's + * nothing to do. + */ + if (bp == NULL) { + return 0; + } + free = bp->data; + ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + /* + * If there are used entries, there's nothing to do. + */ + if (be32_to_cpu(free->hdr.nused) > 0) { + xfs_da_brelse(tp, bp); + *rvalp = 0; + return 0; + } + /* + * Blow the block away. + */ + if ((error = + xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo), + bp))) { + /* + * Can't fail with ENOSPC since that only happens with no + * space reservation, when breaking up an extent into two + * pieces. This is the last block of an extent. + */ + ASSERT(error != ENOSPC); + xfs_da_brelse(tp, bp); + return error; + } + /* + * Return that we succeeded. + */ + *rvalp = 1; + return 0; +} diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h new file mode 100644 index 00000000..82dfe714 --- /dev/null +++ b/fs/xfs/xfs_dir2_node.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_NODE_H__ +#define __XFS_DIR2_NODE_H__ + +/* + * Directory version 2, btree node format structures + */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_da_state; +struct xfs_da_state_blk; +struct xfs_inode; +struct xfs_trans; + +/* + * Offset of the freespace index. + */ +#define XFS_DIR2_FREE_SPACE 2 +#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_FREE_FIRSTDB(mp) \ + xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET) + +#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */ + +typedef struct xfs_dir2_free_hdr { + __be32 magic; /* XFS_DIR2_FREE_MAGIC */ + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ +} xfs_dir2_free_hdr_t; + +typedef struct xfs_dir2_free { + xfs_dir2_free_hdr_t hdr; /* block header */ + __be16 bests[1]; /* best free counts */ + /* unused entries are -1 */ +} xfs_dir2_free_t; + +#define XFS_DIR2_MAX_FREE_BESTS(mp) \ + (((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \ + (uint)sizeof(xfs_dir2_data_off_t)) + +/* + * Convert data space db to the corresponding free db. 
+ */ +static inline xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) +{ + return (XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static inline int +xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) +{ + return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); +} + +extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, + struct xfs_dabuf *lbp); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); +extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp, + struct xfs_da_args *args, int *indexp, + struct xfs_da_state *state); +extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp, + struct xfs_dabuf *leaf2_bp); +extern int xfs_dir2_leafn_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk); +extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); +extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +extern int xfs_dir2_node_addname(struct xfs_da_args *args); +extern int xfs_dir2_node_lookup(struct xfs_da_args *args); +extern int xfs_dir2_node_removename(struct xfs_da_args *args); +extern int xfs_dir2_node_replace(struct xfs_da_args *args); +extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, + int *rvalp); + +#endif /* __XFS_DIR2_NODE_H__ */ diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c new file mode 100644 index 00000000..b1bae6b1 --- /dev/null +++ b/fs/xfs/xfs_dir2_sf.c @@ -0,0 +1,1267 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_error.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_trace.h" + +/* + * Prototypes for internal functions. 
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+				     xfs_dir2_sf_entry_t *sfep,
+				     xfs_dir2_data_aoff_t offset,
+				     int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+				     int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+				    xfs_dir2_sf_entry_t **sfepp,
+				    xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define	xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+#if XFS_BIG_INUMS
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+#endif /* XFS_BIG_INUMS */
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit in the
+ * space currently present in the inode.  If it won't fit, the output
+ * size is too big (but not accurate).
+ */
+int						/* size for sf form */
+xfs_dir2_block_sfsize(
+	xfs_inode_t		*dp,		/* incore inode pointer */
+	xfs_dir2_block_t	*block,		/* block directory data */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* output: header for sf form */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_leaf_entry_t	*blp;		/* leaf area of the block */
+	xfs_dir2_block_tail_t	*btp;		/* tail area of the block */
+	int			count;		/* shortform entry count */
+	xfs_dir2_data_entry_t	*dep;		/* data entry in the block */
+	int			i;		/* block entry index */
+	int			i8count;	/* count of big-inode entries */
+	int			isdot;		/* entry is "." */
+	int			isdotdot;	/* entry is ".." */
+	xfs_mount_t		*mp;		/* mount structure pointer */
+	int			namelen;	/* total name bytes */
+	xfs_ino_t		parent = 0;	/* parent inode number */
+	int			size=0;		/* total computed size */
+
+	mp = dp->i_mount;
+
+	count = i8count = namelen = 0;
+	btp = xfs_dir2_block_tail_p(mp, block);
+	blp = xfs_dir2_block_leaf_p(btp);
+
+	/*
+	 * Iterate over the block's data entries by using the leaf pointers.
+	 */
+	for (i = 0; i < be32_to_cpu(btp->count); i++) {
+		if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Calculate the pointer to the entry at hand.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)block + xfs_dir2_dataptr_to_off(mp, addr));
+		/*
+		 * Detect . and .., so we can special-case them.
+		 * . is not included in sf directories.
+		 * .. is included by just the parent inode number.
+		 */
+		isdot = dep->namelen == 1 && dep->name[0] == '.';
+		isdotdot =
+			dep->namelen == 2 &&
+			dep->name[0] == '.' && dep->name[1] == '.';
+#if XFS_BIG_INUMS
+		if (!isdot)
+			i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
+#endif
+		if (!isdot && !isdotdot) {
+			count++;
+			namelen += dep->namelen;
+		} else if (isdotdot)
+			parent = be64_to_cpu(dep->inumber);
+		/*
+		 * Calculate the new size, see if we should give up yet.
+		 */
+		size = xfs_dir2_sf_hdr_size(i8count) +		/* header */
+		       count +					/* namelen */
+		       count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
+		       namelen +				/* name */
+		       (i8count ?				/* inumber */
+				(uint)sizeof(xfs_dir2_ino8_t) * count :
+				(uint)sizeof(xfs_dir2_ino4_t) * count);
+		if (size > XFS_IFORK_DSIZE(dp))
+			return size;		/* size value is a failure */
+	}
+	/*
+	 * Create the output header, if it worked.
+	 */
+	sfhp->count = count;
+	sfhp->i8count = i8count;
+	xfs_dir2_sf_put_inumber((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent);
+	return size;
+}
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */ +int /* error */ +xfs_dir2_block_to_sf( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp, /* block buffer */ + int size, /* shortform directory size */ + xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused data pointer */ + char *endptr; /* end of data entries */ + int error; /* error return value */ + int logflags; /* inode logging flags */ + xfs_mount_t *mp; /* filesystem mount point */ + char *ptr; /* current data pointer */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_ino_t temp; + + trace_xfs_dir2_block_to_sf(args); + + dp = args->dp; + mp = dp->i_mount; + + /* + * Make a copy of the block data, so we can shrink the inode + * and add local data. + */ + block = kmem_alloc(mp->m_dirblksize, KM_SLEEP); + memcpy(block, bp->data, mp->m_dirblksize); + logflags = XFS_ILOG_CORE; + if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { + ASSERT(error != ENOSPC); + goto out; + } + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + logflags |= XFS_ILOG_DDATA; + /* + * Copy the header into the newly allocate local space. + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); + dp->i_d.di_size = size; + /* + * Set up to loop over the block's entries. + */ + btp = xfs_dir2_block_tail_p(mp, block); + ptr = (char *)block->u; + endptr = (char *)xfs_dir2_block_leaf_p(btp); + sfep = xfs_dir2_sf_firstentry(sfp); + /* + * Loop over the active and unused entries. + * Stop when we reach the leaf/tail portion of the block. + */ + while (ptr < endptr) { + /* + * If it's unused, just skip over it. + */ + dup = (xfs_dir2_data_unused_t *)ptr; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + /* + * Skip . + */ + if (dep->namelen == 1 && dep->name[0] == '.') + ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino); + /* + * Skip .., but make sure the inode number is right. + */ + else if (dep->namelen == 2 && + dep->name[0] == '.' && dep->name[1] == '.') + ASSERT(be64_to_cpu(dep->inumber) == + xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent)); + /* + * Normal entry, copy it into shortform. + */ + else { + sfep->namelen = dep->namelen; + xfs_dir2_sf_put_offset(sfep, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)block)); + memcpy(sfep->name, dep->name, dep->namelen); + temp = be64_to_cpu(dep->inumber); + xfs_dir2_sf_put_inumber(sfp, &temp, + xfs_dir2_sf_inumberp(sfep)); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + } + ptr += xfs_dir2_data_entsize(dep->namelen); + } + ASSERT((char *)sfep - (char *)sfp == size); + xfs_dir2_sf_check(args); +out: + xfs_trans_log_inode(args->trans, dp, logflags); + kmem_free(block); + return error; +} + +/* + * Add a name to a shortform directory. + * There are two algorithms, "easy" and "hard" which we decide on + * before changing anything. 
+ * Convert to block form if necessary, if the new entry won't fit. + */ +int /* error */ +xfs_dir2_sf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + int add_entsize; /* size of the new entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int incr_isize; /* total change in size */ + int new_isize; /* di_size after adding name */ + int objchange; /* changing to 8-byte inodes */ + xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ + int old_isize; /* di_size before adding name */ + int pick; /* which algorithm to use */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ + + trace_xfs_dir2_sf_addname(args); + + ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); + dp = args->dp; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Make sure the shortform value has some of its header. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + /* + * Compute entry (and change in) size. + */ + add_entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); + incr_isize = add_entsize; + objchange = 0; +#if XFS_BIG_INUMS + /* + * Do we have to change to 8 byte inodes? + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + /* + * Yes, adjust the entry size and the total size. + */ + add_entsize += + (uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t); + incr_isize += + (sfp->hdr.count + 2) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + objchange = 1; + } +#endif + old_isize = (int)dp->i_d.di_size; + new_isize = old_isize + incr_isize; + /* + * Won't fit as shortform any more (due to size), + * or the pick routine says it won't (due to offset values). + */ + if (new_isize > XFS_IFORK_DSIZE(dp) || + (pick = + xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) { + /* + * Just checking or no space reservation, it doesn't fit. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + return XFS_ERROR(ENOSPC); + /* + * Convert to block form then add the name. + */ + error = xfs_dir2_sf_to_block(args); + if (error) + return error; + return xfs_dir2_block_addname(args); + } + /* + * Just checking, it fits. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + /* + * Do it the easy way - just add it at the end. + */ + if (pick == 1) + xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize); + /* + * Do it the hard way - look for a place to insert the new entry. + * Convert to 8 byte inode numbers first if necessary. + */ + else { + ASSERT(pick == 2); +#if XFS_BIG_INUMS + if (objchange) + xfs_dir2_sf_toino8(args); +#endif + xfs_dir2_sf_addname_hard(args, objchange, new_isize); + } + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Add the new entry the "easy" way. + * This is copying the old directory and adding the new entry at the end. + * Since it's sorted by "offset" we need room after the last offset + * that's already there, and then room to convert to a block directory. + * This is already checked by the pick routine. 
+ */ +static void +xfs_dir2_sf_addname_easy( + xfs_da_args_t *args, /* operation arguments */ + xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */ + xfs_dir2_data_aoff_t offset, /* offset to use for new ent */ + int new_isize) /* new directory size */ +{ + int byteoff; /* byte offset in sf dir */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + byteoff = (int)((char *)sfep - (char *)sfp); + /* + * Grow the in-inode space. + */ + xfs_idata_realloc(dp, xfs_dir2_sf_entsize_byname(sfp, args->namelen), + XFS_DATA_FORK); + /* + * Need to set up again due to realloc of the inode data. + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); + /* + * Fill in the new entry. + */ + sfep->namelen = args->namelen; + xfs_dir2_sf_put_offset(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); + /* + * Update the header and inode. + */ + sfp->hdr.count++; +#if XFS_BIG_INUMS + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) + sfp->hdr.i8count++; +#endif + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Add the new entry the "hard" way. + * The caller has already converted to 8 byte inode numbers if necessary, + * in which case we need to leave the i8count at 1. + * Find a hole that the new entry will fit into, and copy + * the first part of the entries, the new entry, and the last part of + * the entries. + */ +/* ARGSUSED */ +static void +xfs_dir2_sf_addname_hard( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* changing inode number size */ + int new_isize) /* new directory size */ +{ + int add_datasize; /* data size need for new ent */ + char *buf; /* buffer for old */ + xfs_inode_t *dp; /* incore directory inode */ + int eof; /* reached end of old dir */ + int nbytes; /* temp for byte copies */ + xfs_dir2_data_aoff_t new_offset; /* next offset value */ + xfs_dir2_data_aoff_t offset; /* current offset value */ + int old_isize; /* previous di_size */ + xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */ + xfs_dir2_sf_t *oldsfp; /* original shortform dir */ + xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ + xfs_dir2_sf_t *sfp; /* new shortform dir */ + + /* + * Copy the old directory to the stack buffer. + */ + dp = args->dp; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + old_isize = (int)dp->i_d.di_size; + buf = kmem_alloc(old_isize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_t *)buf; + memcpy(oldsfp, sfp, old_isize); + /* + * Loop over the old directory finding the place we're going + * to insert the new entry. + * If it's going to end up at the end then oldsfep will point there. + */ + for (offset = XFS_DIR2_DATA_FIRST_OFFSET, + oldsfep = xfs_dir2_sf_firstentry(oldsfp), + add_datasize = xfs_dir2_data_entsize(args->namelen), + eof = (char *)oldsfep == &buf[old_isize]; + !eof; + offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep), + eof = (char *)oldsfep == &buf[old_isize]) { + new_offset = xfs_dir2_sf_get_offset(oldsfep); + if (offset + add_datasize <= new_offset) + break; + } + /* + * Get rid of the old directory, then allocate space for + * the new one. We do this so xfs_idata_realloc won't copy + * the data. 
+ */ + xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); + xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* + * Reset the pointer since the buffer was reallocated. + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + /* + * Copy the first part of the directory, including the header. + */ + nbytes = (int)((char *)oldsfep - (char *)oldsfp); + memcpy(sfp, oldsfp, nbytes); + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes); + /* + * Fill in the new entry, and update the header counts. + */ + sfep->namelen = args->namelen; + xfs_dir2_sf_put_offset(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); + sfp->hdr.count++; +#if XFS_BIG_INUMS + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) + sfp->hdr.i8count++; +#endif + /* + * If there's more left to copy, do that. + */ + if (!eof) { + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + memcpy(sfep, oldsfep, old_isize - nbytes); + } + kmem_free(buf); + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Decide if the new entry will fit at all. + * If it will fit, pick between adding the new entry to the end (easy) + * or somewhere else (hard). + * Return 0 (won't fit), 1 (easy), 2 (hard). + */ +/*ARGSUSED*/ +static int /* pick result */ +xfs_dir2_sf_addname_pick( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* inode # size changes */ + xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */ + xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int holefit; /* found hole it will fit in */ + int i; /* entry number */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_data_aoff_t offset; /* data block offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + int size; /* entry's data size */ + int used; /* data bytes used */ + + dp = args->dp; + mp = dp->i_mount; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + size = xfs_dir2_data_entsize(args->namelen); + offset = XFS_DIR2_DATA_FIRST_OFFSET; + sfep = xfs_dir2_sf_firstentry(sfp); + holefit = 0; + /* + * Loop over sf entries. + * Keep track of data offset and whether we've seen a place + * to insert the new entry. + */ + for (i = 0; i < sfp->hdr.count; i++) { + if (!holefit) + holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); + offset = xfs_dir2_sf_get_offset(sfep) + + xfs_dir2_data_entsize(sfep->namelen); + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + } + /* + * Calculate data bytes used excluding the new entry, if this + * was a data block (block form directory). + */ + used = offset + + (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t); + /* + * If it won't fit in a block form then we can't insert it, + * we'll go back, convert to block, then try the insert and convert + * to leaf. + */ + if (used + (holefit ? 0 : size) > mp->m_dirblksize) + return 0; + /* + * If changing the inode number size, do it the hard way. + */ +#if XFS_BIG_INUMS + if (objchange) { + return 2; + } +#else + ASSERT(objchange == 0); +#endif + /* + * If it won't fit at the end then do it the hard way (use the hole). + */ + if (used + size > mp->m_dirblksize) + return 2; + /* + * Do it the easy way. + */ + *sfepp = sfep; + *offsetp = offset; + return 1; +} + +#ifdef DEBUG +/* + * Check consistency of shortform directory, assert if bad. 
+ */ +static void +xfs_dir2_sf_check( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry number */ + int i8count; /* number of big inode#s */ + xfs_ino_t ino; /* entry inode number */ + int offset; /* data offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + offset = XFS_DIR2_DATA_FIRST_OFFSET; + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); + ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + offset = + xfs_dir2_sf_get_offset(sfep) + + xfs_dir2_data_entsize(sfep->namelen); + } + ASSERT(i8count == sfp->hdr.i8count); + ASSERT(XFS_BIG_INUMS || i8count == 0); + ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); + ASSERT(offset + + (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= + dp->i_mount->m_dirblksize); +} +#endif /* DEBUG */ + +/* + * Create a new (shortform) directory. + */ +int /* error, always 0 */ +xfs_dir2_sf_create( + xfs_da_args_t *args, /* operation arguments */ + xfs_ino_t pino) /* parent inode number */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i8count; /* parent inode is an 8-byte number */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + int size; /* directory size */ + + trace_xfs_dir2_sf_create(args); + + dp = args->dp; + + ASSERT(dp != NULL); + ASSERT(dp->i_d.di_size == 0); + /* + * If it's currently a zero-length extent file, + * convert it to local format. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { + dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + dp->i_df.if_flags |= XFS_IFINLINE; + } + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ASSERT(dp->i_df.if_bytes == 0); + i8count = pino > XFS_DIR2_MAX_SHORT_INUM; + size = xfs_dir2_sf_hdr_size(i8count); + /* + * Make a buffer for the data. + */ + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + /* + * Fill in the header, + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp->hdr.i8count = i8count; + /* + * Now can put in the inode number, since i8count is set. + */ + xfs_dir2_sf_put_inumber(sfp, &pino, &sfp->hdr.parent); + sfp->hdr.count = 0; + dp->i_d.di_size = size; + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +int /* error */ +xfs_dir2_sf_getdents( + xfs_inode_t *dp, /* incore directory inode */ + void *dirent, + xfs_off_t *offset, + filldir_t filldir) +{ + int i; /* shortform entry number */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_dataptr_t off; /* current entry's offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_dataptr_t dot_offset; + xfs_dir2_dataptr_t dotdot_offset; + xfs_ino_t ino; + + mp = dp->i_mount; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Give up if the directory is way too short. 
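xfs_dir2_sf_create() leaves the directory holding nothing but the three-field header, so an empty directory costs 6 bytes, or 10 when the parent needs an 8-byte inode number. A standalone sketch of that size calculation; the parent inode numbers below are invented.

/*
 * Sketch of the size of a freshly created shortform directory:
 * just the header - an entry count, an i8count, and the parent
 * inode number (4 or 8 bytes).  Illustration only.
 */
#include <stdio.h>
#include <stdint.h>

static int sf_hdr_size(int i8count)
{
	/* count + i8count + parent (8 bytes, minus 4 if all inums fit in 32 bits) */
	return 1 + 1 + 8 - (i8count == 0 ? 4 : 0);
}

int main(void)
{
	uint64_t parent_small = 128;		/* fits in 32 bits */
	uint64_t parent_big = 1ULL << 40;	/* needs 8-byte storage */

	printf("empty dir, small parent inum: di_size = %d bytes\n",
	       sf_hdr_size(parent_small > 0xffffffffULL));
	printf("empty dir, large parent inum: di_size = %d bytes\n",
	       sf_hdr_size(parent_big > 0xffffffffULL));
	return 0;
}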
+ */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return XFS_ERROR(EIO); + } + + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) + return 0; + + /* + * Precalculate offsets for . and .. as we will always need them. + * + * XXX(hch): the second argument is sometimes 0 and sometimes + * mp->m_dirdatablk. + */ + dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOT_OFFSET); + dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOTDOT_OFFSET); + + /* + * Put . entry unless we're starting past it. + */ + if (*offset <= dot_offset) { + if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { + *offset = dot_offset & 0x7fffffff; + return 0; + } + } + + /* + * Put .. entry unless we're starting past it. + */ + if (*offset <= dotdot_offset) { + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { + *offset = dotdot_offset & 0x7fffffff; + return 0; + } + } + + /* + * Loop while there are more entries and put'ing works. + */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->hdr.count; i++) { + off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + xfs_dir2_sf_get_offset(sfep)); + + if (*offset > off) { + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + continue; + } + + ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); + if (filldir(dirent, (char *)sfep->name, sfep->namelen, + off & 0x7fffffff, ino, DT_UNKNOWN)) { + *offset = off & 0x7fffffff; + return 0; + } + sfep = xfs_dir2_sf_nextentry(sfp, sfep); + } + + *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + 0x7fffffff; + return 0; +} + +/* + * Lookup an entry in a shortform directory. + * Returns EEXIST if found, ENOENT if not found. + */ +int /* error */ +xfs_dir2_sf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + int error; + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + enum xfs_dacmp cmp; /* comparison result */ + xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ + + trace_xfs_dir2_sf_lookup(args); + + xfs_dir2_sf_check(args); + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + /* + * Special case for . + */ + if (args->namelen == 1 && args->name[0] == '.') { + args->inumber = dp->i_ino; + args->cmpresult = XFS_CMP_EXACT; + return XFS_ERROR(EEXIST); + } + /* + * Special case for .. + */ + if (args->namelen == 2 && + args->name[0] == '.' 
&& args->name[1] == '.') { + args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + args->cmpresult = XFS_CMP_EXACT; + return XFS_ERROR(EEXIST); + } + /* + * Loop over all the entries trying to match ours. + */ + ci_sfep = NULL; + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + /* + * Compare name and if it's an exact match, return the inode + * number. If it's the first case-insensitive match, store the + * inode number and continue looking for an exact match. + */ + cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, + sfep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + args->inumber = xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)); + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); + ci_sfep = sfep; + } + } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was not found, return ENOENT. + */ + if (!ci_sfep) + return XFS_ERROR(ENOENT); + /* otherwise process the CI match as required by the caller */ + error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); + return XFS_ERROR(error); +} + +/* + * Remove an entry from a shortform directory. + */ +int /* error */ +xfs_dir2_sf_removename( + xfs_da_args_t *args) +{ + int byteoff; /* offset of removed entry */ + xfs_inode_t *dp; /* incore directory inode */ + int entsize; /* this entry's size */ + int i; /* shortform entry index */ + int newsize; /* new inode size */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_removename(args); + + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + oldsize = (int)dp->i_d.di_size; + /* + * Bail out if the directory is way too short. + */ + if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == oldsize); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); + /* + * Loop over the old directory entries. + * Find the one we're deleting. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { + ASSERT(xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)) == + args->inumber); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->hdr.count) + return XFS_ERROR(ENOENT); + /* + * Calculate sizes. + */ + byteoff = (int)((char *)sfep - (char *)sfp); + entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen); + newsize = oldsize - entsize; + /* + * Copy the part if any after the removed entry, sliding it down. + */ + if (byteoff + entsize < oldsize) + memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize, + oldsize - (byteoff + entsize)); + /* + * Fix up the header and file size. + */ + sfp->hdr.count--; + dp->i_d.di_size = newsize; + /* + * Reallocate, making it smaller. + */ + xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; +#if XFS_BIG_INUMS + /* + * Are we changing inode number size? 
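The lookup loop above implements a simple precedence rule: an exact match returns immediately, while the first case-insensitive match is remembered in ci_sfep and used only if no exact match exists. A generic standalone sketch of that policy; plain libc string functions stand in for the m_dirnameops comparison and the names are invented.

/*
 * Sketch of "exact match wins, first case-insensitive match is the
 * fallback".  Standalone example, not the kernel code.
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>	/* strcasecmp (POSIX) */

static const char *names[] = { "Makefile", "readme", "README" };

static int lookup(const char *want)
{
	int ci_hit = -1;
	for (int i = 0; i < 3; i++) {
		if (strcmp(names[i], want) == 0)
			return i;		/* exact match wins immediately */
		if (ci_hit < 0 && strcasecmp(names[i], want) == 0)
			ci_hit = i;		/* remember first CI match */
	}
	return ci_hit;				/* -1 means "not found" */
}

int main(void)
{
	printf("lookup(\"README\")  -> %d\n", lookup("README"));	/* exact: 2 */
	printf("lookup(\"ReadMe\")  -> %d\n", lookup("ReadMe"));	/* CI:    1 */
	printf("lookup(\"missing\") -> %d\n", lookup("missing"));	/* none: -1 */
	return 0;
}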
+ */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + if (sfp->hdr.i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->hdr.i8count--; + } +#endif + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Replace the inode number of an entry in a shortform directory. + */ +int /* error */ +xfs_dir2_sf_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ +#if XFS_BIG_INUMS || defined(DEBUG) + xfs_ino_t ino=0; /* entry old inode number */ +#endif +#if XFS_BIG_INUMS + int i8elevated; /* sf_toino8 set i8count=1 */ +#endif + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_replace(args); + + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the shortform directory is way too small. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count)); +#if XFS_BIG_INUMS + /* + * New inode number is large, and need to convert to 8-byte inodes. + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + int error; /* error return value */ + int newsize; /* new inode size */ + + newsize = + dp->i_df.if_bytes + + (sfp->hdr.count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + /* + * Won't fit as shortform, convert to block then do replace. + */ + if (newsize > XFS_IFORK_DSIZE(dp)) { + error = xfs_dir2_sf_to_block(args); + if (error) { + return error; + } + return xfs_dir2_block_replace(args); + } + /* + * Still fits, convert to 8-byte now. + */ + xfs_dir2_sf_toino8(args); + i8elevated = 1; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + } else + i8elevated = 0; +#endif + ASSERT(args->namelen != 1 || args->name[0] != '.'); + /* + * Replace ..'s entry. + */ + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { +#if XFS_BIG_INUMS || defined(DEBUG) + ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); + ASSERT(args->inumber != ino); +#endif + xfs_dir2_sf_put_inumber(sfp, &args->inumber, &sfp->hdr.parent); + } + /* + * Normal entry, look for the name. + */ + else { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { +#if XFS_BIG_INUMS || defined(DEBUG) + ino = xfs_dir2_sf_get_inumber(sfp, + xfs_dir2_sf_inumberp(sfep)); + ASSERT(args->inumber != ino); +#endif + xfs_dir2_sf_put_inumber(sfp, &args->inumber, + xfs_dir2_sf_inumberp(sfep)); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->hdr.count) { + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); +#if XFS_BIG_INUMS + if (i8elevated) + xfs_dir2_sf_toino4(args); +#endif + return XFS_ERROR(ENOENT); + } + } +#if XFS_BIG_INUMS + /* + * See if the old number was large, the new number is small. + */ + if (ino > XFS_DIR2_MAX_SHORT_INUM && + args->inumber <= XFS_DIR2_MAX_SHORT_INUM) { + /* + * And the old count was one, so need to convert to small. 
+ */ + if (sfp->hdr.i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->hdr.i8count--; + } + /* + * See if the old number was small, the new number is large. + */ + if (ino <= XFS_DIR2_MAX_SHORT_INUM && + args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + /* + * add to the i8count unless we just converted to 8-byte + * inodes (which does an implied i8count = 1) + */ + ASSERT(sfp->hdr.i8count != 0); + if (!i8elevated) + sfp->hdr.i8count++; + } +#endif + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); + return 0; +} + +#if XFS_BIG_INUMS +/* + * Convert from 8-byte inode numbers to 4-byte inode numbers. + * The last 8-byte inode number is gone, but the count is still 1. + */ +static void +xfs_dir2_sf_toino4( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + xfs_ino_t ino; /* entry inode number */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_t *sfp; /* new sf directory */ + + trace_xfs_dir2_sf_toino4(args); + + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. + */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->hdr.i8count == 1); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size. + */ + newsize = + oldsize - + (oldsfp->hdr.count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_t *)buf; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. + */ + sfp->hdr.count = oldsfp->hdr.count; + sfp->hdr.i8count = 0; + ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + ino = xfs_dir2_sf_get_inumber(oldsfp, + xfs_dir2_sf_inumberp(oldsfep)); + xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} + +/* + * Convert from 4-byte inode numbers to 8-byte inode numbers. + * The new 8-byte inode number is not there yet, we leave with the + * count 1 but no corresponding entry. 
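Both width conversions only change the size of the stored inode numbers, so the delta is four bytes per entry plus four for the parent pointer held in the header, i.e. (count + 1) * 4. Worked arithmetic with invented sizes:

/*
 * Illustrative arithmetic for xfs_dir2_sf_toino4()/toino8().
 * The entry count and byte counts are made up.
 */
#include <stdio.h>

int main(void)
{
	int count = 5;		/* entries in the shortform directory */
	int oldsize = 60;	/* current if_bytes with 4-byte inums */
	int delta = (count + 1) * (8 - 4);	/* +1 for the parent */

	printf("toino8: %d -> %d bytes\n", oldsize, oldsize + delta);
	printf("toino4: %d -> %d bytes\n", oldsize + delta, oldsize);
	return 0;
}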
+ */ +static void +xfs_dir2_sf_toino8( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + xfs_ino_t ino; /* entry inode number */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_t *sfp; /* new sf directory */ + + trace_xfs_dir2_sf_toino8(args); + + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. + */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->hdr.i8count == 0); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size. + */ + newsize = + oldsize + + (oldsfp->hdr.count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_t *)buf; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. + */ + sfp->hdr.count = oldsfp->hdr.count; + sfp->hdr.i8count = 1; + ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent); + xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->hdr.count; + i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + ino = xfs_dir2_sf_get_inumber(oldsfp, + xfs_dir2_sf_inumberp(oldsfep)); + xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} +#endif /* XFS_BIG_INUMS */ diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h new file mode 100644 index 00000000..6ac44b55 --- /dev/null +++ b/fs/xfs/xfs_dir2_sf.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_SF_H__ +#define __XFS_DIR2_SF_H__ + +/* + * Directory layout when stored internal to an inode. + * + * Small directories are packed as tightly as possible so as to + * fit into the literal area of the inode. 
+ */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_block; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Inode number stored as 8 8-bit values. + */ +typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; + +/* + * Inode number stored as 4 8-bit values. + * Works a lot of the time, when all the inode numbers in a directory + * fit in 32 bits. + */ +typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; + +typedef union { + xfs_dir2_ino8_t i8; + xfs_dir2_ino4_t i4; +} xfs_dir2_inou_t; +#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) + +/* + * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. + * Only need 16 bits, this is the byte offset into the single block form. + */ +typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; + +/* + * The parent directory has a dedicated field, and the self-pointer must + * be calculated on the fly. + * + * Entries are packed toward the top as tightly as possible. The header + * and the elements must be memcpy'd out into a work area to get correct + * alignment for the inode number fields. + */ +typedef struct xfs_dir2_sf_hdr { + __uint8_t count; /* count of entries */ + __uint8_t i8count; /* count of 8-byte inode #s */ + xfs_dir2_inou_t parent; /* parent dir inode number */ +} __arch_pack xfs_dir2_sf_hdr_t; + +typedef struct xfs_dir2_sf_entry { + __uint8_t namelen; /* actual name length */ + xfs_dir2_sf_off_t offset; /* saved offset */ + __uint8_t name[1]; /* name, variable size */ + xfs_dir2_inou_t inumber; /* inode number, var. offset */ +} __arch_pack xfs_dir2_sf_entry_t; + +typedef struct xfs_dir2_sf { + xfs_dir2_sf_hdr_t hdr; /* shortform header */ + xfs_dir2_sf_entry_t list[1]; /* shortform entries */ +} xfs_dir2_sf_t; + +static inline int xfs_dir2_sf_hdr_size(int i8count) +{ + return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \ + ((i8count) == 0) * \ + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); +} + +static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep) +{ + return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen]; +} + +static inline xfs_intino_t +xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from) +{ + return ((sfp)->hdr.i8count == 0 ? 
\ + (xfs_intino_t)XFS_GET_DIR_INO4((from)->i4) : \ + (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8)); +} + +static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, + xfs_dir2_inou_t *to) +{ + if ((sfp)->hdr.i8count == 0) + XFS_PUT_DIR_INO4(*(from), (to)->i4); + else + XFS_PUT_DIR_INO8(*(from), (to)->i8); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) +{ + return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i); +} + +static inline void +xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) +{ + INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off); +} + +static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len) +{ + return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \ + ((sfp)->hdr.i8count == 0) * \ + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); +} + +static inline int +xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) +{ + return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \ + ((sfp)->hdr.i8count == 0) * \ + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); +} + +static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp) +{ + return ((xfs_dir2_sf_entry_t *) \ + ((char *)(sfp) + xfs_dir2_sf_hdr_size(sfp->hdr.i8count))); +} + +static inline xfs_dir2_sf_entry_t * +xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) +{ + return ((xfs_dir2_sf_entry_t *) \ + ((char *)(sfep) + xfs_dir2_sf_entsize_byentry(sfp,sfep))); +} + +/* + * Functions. + */ +extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, + struct xfs_dir2_block *block, + xfs_dir2_sf_hdr_t *sfhp); +extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, + int size, xfs_dir2_sf_hdr_t *sfhp); +extern int xfs_dir2_sf_addname(struct xfs_da_args *args); +extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); +extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, + xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_sf_removename(struct xfs_da_args *args); +extern int xfs_dir2_sf_replace(struct xfs_da_args *args); + +#endif /* __XFS_DIR2_SF_H__ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c new file mode 100644 index 00000000..39f06336 --- /dev/null +++ b/fs/xfs/xfs_error.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
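Because shortform entries are packed byte-for-byte, the offset and inode-number fields can land at any alignment, which is why they are declared as byte arrays and read through the get/put helpers above rather than with plain loads. A standalone sketch of the byte-at-a-time big-endian access those helpers perform; the function names here are illustrative, not the kernel macros.

/*
 * Unaligned big-endian 16-bit get/put, the style of access behind
 * xfs_dir2_sf_get_offset()/put_offset().  Standalone sketch.
 */
#include <stdio.h>
#include <stdint.h>

static uint16_t get16_be(const uint8_t *p)
{
	return (uint16_t)((p[0] << 8) | p[1]);	/* safe at any alignment */
}

static void put16_be(uint8_t *p, uint16_t v)
{
	p[0] = (uint8_t)(v >> 8);
	p[1] = (uint8_t)v;
}

int main(void)
{
	uint8_t buf[5] = { 0 };		/* entry-like blob, deliberately odd-sized */
	put16_be(&buf[1], 0x0630);	/* store an offset at an odd address */
	printf("offset read back: 0x%04x\n", get16_be(&buf[1]));
	return 0;
}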
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_utils.h" +#include "xfs_error.h" + +#ifdef DEBUG + +int xfs_etrap[XFS_ERROR_NTRAP] = { + 0, +}; + +int +xfs_error_trap(int e) +{ + int i; + + if (!e) + return 0; + for (i = 0; i < XFS_ERROR_NTRAP; i++) { + if (xfs_etrap[i] == 0) + break; + if (e != xfs_etrap[i]) + continue; + xfs_notice(NULL, "%s: error %d", __func__, e); + BUG(); + break; + } + return e; +} + +int xfs_etest[XFS_NUM_INJECT_ERROR]; +int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; +char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; +int xfs_error_test_active; + +int +xfs_error_test(int error_tag, int *fsidp, char *expression, + int line, char *file, unsigned long randfactor) +{ + int i; + int64_t fsid; + + if (random32() % randfactor) + return 0; + + memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { + xfs_warn(NULL, + "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", + expression, file, line, xfs_etest_fsname[i]); + return 1; + } + } + + return 0; +} + +int +xfs_errortag_add(int error_tag, xfs_mount_t *mp) +{ + int i; + int len; + int64_t fsid; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { + xfs_warn(mp, "error tag #%d on", error_tag); + return 0; + } + } + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == 0) { + xfs_warn(mp, "Turned on XFS error tag #%d", + error_tag); + xfs_etest[i] = error_tag; + xfs_etest_fsid[i] = fsid; + len = strlen(mp->m_fsname); + xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); + strcpy(xfs_etest_fsname[i], mp->m_fsname); + xfs_error_test_active++; + return 0; + } + } + + xfs_warn(mp, "error tag overflow, too many turned on"); + + return 1; +} + +int +xfs_errortag_clearall(xfs_mount_t *mp, int loud) +{ + int64_t fsid; + int cleared = 0; + int i; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && + xfs_etest[i] != 0) { + cleared = 1; + xfs_warn(mp, "Clearing XFS error tag #%d", + xfs_etest[i]); + xfs_etest[i] = 0; + xfs_etest_fsid[i] = 0LL; + kmem_free(xfs_etest_fsname[i]); + xfs_etest_fsname[i] = NULL; + xfs_error_test_active--; + } + } + + if (loud || cleared) + xfs_warn(mp, "Cleared all XFS error tags for filesystem"); + + return 0; +} +#endif /* DEBUG */ + +void +xfs_error_report( + const char *tag, + int level, + struct xfs_mount *mp, + const char *filename, + int linenum, + inst_t *ra) +{ + if (level <= xfs_error_level) { + xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, + "Internal error %s at line %d of file %s. 
Caller 0x%p\n", + tag, linenum, filename, ra); + + xfs_stack_trace(); + } +} + +void +xfs_corruption_error( + const char *tag, + int level, + struct xfs_mount *mp, + void *p, + const char *filename, + int linenum, + inst_t *ra) +{ + if (level <= xfs_error_level) + xfs_hex_dump(p, 16); + xfs_error_report(tag, level, mp, filename, linenum, ra); + xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); +} diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h new file mode 100644 index 00000000..079a367f --- /dev/null +++ b/fs/xfs/xfs_error.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ERROR_H__ +#define __XFS_ERROR_H__ + +#ifdef DEBUG +#define XFS_ERROR_NTRAP 10 +extern int xfs_etrap[XFS_ERROR_NTRAP]; +extern int xfs_error_trap(int); +#define XFS_ERROR(e) xfs_error_trap(e) +#else +#define XFS_ERROR(e) (e) +#endif + +struct xfs_mount; + +extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, inst_t *ra); +extern void xfs_corruption_error(const char *tag, int level, + struct xfs_mount *mp, void *p, const char *filename, + int linenum, inst_t *ra); + +#define XFS_ERROR_REPORT(e, lvl, mp) \ + xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) +#define XFS_CORRUPTION_ERROR(e, lvl, mp, mem) \ + xfs_corruption_error(e, lvl, mp, mem, \ + __FILE__, __LINE__, __return_address) + +#define XFS_ERRLEVEL_OFF 0 +#define XFS_ERRLEVEL_LOW 1 +#define XFS_ERRLEVEL_HIGH 5 + +/* + * Macros to set EFSCORRUPTED & return/branch. 
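In DEBUG builds every error return is funnelled through XFS_ERROR(), so xfs_error_trap() can BUG() the first time a chosen errno is produced. A standalone sketch of that trap-on-errno pattern; the trap list and abort() stand in for xfs_etrap[] and BUG().

/*
 * Sketch of the XFS_ERROR()/xfs_error_trap() idea: route every error
 * return through one wrapper so a debug build can stop at the exact
 * point a chosen errno is generated.
 */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int trap_list[] = { EIO, 0 };	/* 0 terminates the list */

static int error_trap(int e)
{
	for (int i = 0; e && trap_list[i]; i++)
		if (e == trap_list[i]) {
			fprintf(stderr, "trapped error %d\n", e);
			abort();	/* BUG() equivalent */
		}
	return e;
}

#define ERR(e) error_trap(e)

int main(void)
{
	printf("returning %d\n", ERR(ENOENT));	/* passes through */
	printf("returning %d\n", ERR(EIO));	/* aborts here */
	return 0;
}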
+ */ +#define XFS_WANT_CORRUPTED_GOTO(x,l) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ + XFS_ERRLEVEL_LOW, NULL); \ + error = XFS_ERROR(EFSCORRUPTED); \ + goto l; \ + } \ + } + +#define XFS_WANT_CORRUPTED_RETURN(x) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ + XFS_ERRLEVEL_LOW, NULL); \ + return XFS_ERROR(EFSCORRUPTED); \ + } \ + } + +/* + * error injection tags - the labels can be anything you want + * but each tag should have its own unique number + */ + +#define XFS_ERRTAG_NOERROR 0 +#define XFS_ERRTAG_IFLUSH_1 1 +#define XFS_ERRTAG_IFLUSH_2 2 +#define XFS_ERRTAG_IFLUSH_3 3 +#define XFS_ERRTAG_IFLUSH_4 4 +#define XFS_ERRTAG_IFLUSH_5 5 +#define XFS_ERRTAG_IFLUSH_6 6 +#define XFS_ERRTAG_DA_READ_BUF 7 +#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8 +#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9 +#define XFS_ERRTAG_ALLOC_READ_AGF 10 +#define XFS_ERRTAG_IALLOC_READ_AGI 11 +#define XFS_ERRTAG_ITOBP_INOTOBP 12 +#define XFS_ERRTAG_IUNLINK 13 +#define XFS_ERRTAG_IUNLINK_REMOVE 14 +#define XFS_ERRTAG_DIR_INO_VALIDATE 15 +#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16 +#define XFS_ERRTAG_IODONE_IOERR 17 +#define XFS_ERRTAG_STRATREAD_IOERR 18 +#define XFS_ERRTAG_STRATCMPL_IOERR 19 +#define XFS_ERRTAG_DIOWRITE_IOERR 20 +#define XFS_ERRTAG_BMAPIFORMAT 21 +#define XFS_ERRTAG_MAX 22 + +/* + * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. + */ +#define XFS_RANDOM_DEFAULT 100 +#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) +#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT + +#ifdef DEBUG +extern int xfs_error_test_active; +extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); + +#define XFS_NUM_INJECT_ERROR 10 +#define XFS_TEST_ERROR(expr, mp, tag, rf) \ + ((expr) || (xfs_error_test_active && \ + xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ + (rf)))) + +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); +#else +#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) +#define xfs_errortag_add(tag, mp) (ENOSYS) +#define xfs_errortag_clearall(mp, loud) (ENOSYS) +#endif /* DEBUG */ + +/* + * XFS panic tags -- allow a call to xfs_alert_tag() be turned into + * a panic by setting xfs_panic_mask in a sysctl. 
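XFS_TEST_ERROR() turns an ordinary corruption check into an injection point: when the tag is armed for the filesystem, the check also fails roughly once every "randfactor" evaluations. A standalone sketch of that behaviour; the tag numbers and the way the tag is armed are invented for illustration.

/*
 * Probabilistic error-injection sketch in the spirit of
 * XFS_TEST_ERROR().  Standalone code, not the kernel macros.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define TAG_IFLUSH	1
#define RANDFACTOR	100	/* fire ~1% of the time, like XFS_RANDOM_DEFAULT */

static int armed_tag = TAG_IFLUSH;	/* armed via an ioctl in the real thing */

static int inject(int tag, unsigned randfactor)
{
	if (tag != armed_tag)
		return 0;
	return rand() % randfactor == 0;
}

#define TEST_ERROR(expr, tag) ((expr) || inject((tag), RANDFACTOR))

int main(void)
{
	srand((unsigned)time(NULL));
	int fired = 0;
	for (int i = 0; i < 10000; i++)
		if (TEST_ERROR(0 /* the real corruption check */, TAG_IFLUSH))
			fired++;
	printf("injected %d failures in 10000 checks (~100 expected)\n", fired);
	return 0;
}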
+ */ +#define XFS_NO_PTAG 0 +#define XFS_PTAG_IFLUSH 0x00000001 +#define XFS_PTAG_LOGRES 0x00000002 +#define XFS_PTAG_AILDELETE 0x00000004 +#define XFS_PTAG_ERROR_REPORT 0x00000008 +#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 +#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 +#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 +#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 + +#endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c new file mode 100644 index 00000000..d22e6262 --- /dev/null +++ b/fs/xfs/xfs_extfree_item.c @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_extfree_item.h" + + +kmem_zone_t *xfs_efi_zone; +kmem_zone_t *xfs_efd_zone; + +static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efi_log_item, efi_item); +} + +void +xfs_efi_item_free( + struct xfs_efi_log_item *efip) +{ + if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) + kmem_free(efip); + else + kmem_zone_free(xfs_efi_zone, efip); +} + +/* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the + * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees + * the EFI. + */ +STATIC void +__xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + struct xfs_ail *ailp = efip->efi_item.li_ailp; + + if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &efip->efi_item); + xfs_efi_item_free(efip); + } +} + +/* + * This returns the number of iovecs needed to log the given efi item. + * We only need 1 iovec for an efi item. It just logs the efi_log_format + * structure. + */ +STATIC uint +xfs_efi_item_size( + struct xfs_log_item *lip) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efi log item. We use only 1 iovec, and we point that + * at the efi_log_format structure embedded in the efi item. + * It is at this point that we assert that all of the extent + * slots in the efi item have been filled. 
+ */ +STATIC void +xfs_efi_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + uint size; + + ASSERT(atomic_read(&efip->efi_next_extent) == + efip->efi_format.efi_nextents); + + efip->efi_format.efi_type = XFS_LI_EFI; + + size = sizeof(xfs_efi_log_format_t); + size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); + efip->efi_format.efi_size = 1; + + log_vector->i_addr = &efip->efi_format; + log_vector->i_len = size; + log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; + ASSERT(size >= sizeof(xfs_efi_log_format_t)); +} + + +/* + * Pinning has no meaning for an efi item, so just return. + */ +STATIC void +xfs_efi_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * While EFIs cannot really be pinned, the unpin operation is the last place at + * which the EFI is manipulated during a transaction. If we are being asked to + * remove the EFI it's because the transaction has been cancelled and by + * definition that means the EFI cannot be in the AIL so remove it from the + * transaction and free it. Otherwise coordinate with xfs_efi_release() (via + * XFS_EFI_COMMITTED) to determine who gets to free the EFI. + */ +STATIC void +xfs_efi_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + + if (remove) { + ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); + if (lip->li_desc) + xfs_trans_del_item(lip); + xfs_efi_item_free(efip); + return; + } + __xfs_efi_release(efip); +} + +/* + * Efi items have no locking or pushing. However, since EFIs are + * pulled from the AIL when their corresponding EFDs are committed + * to disk, their situation is very similar to being pinned. Return + * XFS_ITEM_PINNED so that the caller will eventually flush the log. + * This should help in getting the EFI out of the AIL. + */ +STATIC uint +xfs_efi_item_trylock( + struct xfs_log_item *lip) +{ + return XFS_ITEM_PINNED; +} + +/* + * Efi items have no locking, so just return. + */ +STATIC void +xfs_efi_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efi_item_free(EFI_ITEM(lip)); +} + +/* + * The EFI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. For bulk transaction committed + * processing, the EFI may be processed but not yet unpinned prior to the EFD + * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected + * when processing the EFD. + */ +STATIC xfs_lsn_t +xfs_efi_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + + set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); + return lsn; +} + +/* + * There isn't much you can do to push on an efi item. It is simply + * stuck waiting for all of its corresponding efd items to be + * committed to disk. + */ +STATIC void +xfs_efi_item_push( + struct xfs_log_item *lip) +{ +} + +/* + * The EFI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_efi_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all efi log items. 
+ */ +static struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_pin = xfs_efi_item_pin, + .iop_unpin = xfs_efi_item_unpin, + .iop_trylock = xfs_efi_item_trylock, + .iop_unlock = xfs_efi_item_unlock, + .iop_committed = xfs_efi_item_committed, + .iop_push = xfs_efi_item_push, + .iop_committing = xfs_efi_item_committing +}; + + +/* + * Allocate and initialize an efi item with the given number of extents. + */ +struct xfs_efi_log_item * +xfs_efi_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_efi_log_item *efip; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efi_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efip = kmem_zalloc(size, KM_SLEEP); + } else { + efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); + efip->efi_format.efi_nextents = nextents; + efip->efi_format.efi_id = (__psint_t)(void*)efip; + atomic_set(&efip->efi_next_extent, 0); + + return efip; +} + +/* + * Copy an EFI format buffer from the given buf, and into the destination + * EFI format structure. + * The given buffer can be in 32 bit or 64 bit form (which has different padding), + * one of which will be the native format for this kernel. + * It will handle the conversion of formats if necessary. + */ +int +xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) +{ + xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; + uint i; + uint len = sizeof(xfs_efi_log_format_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); + uint len32 = sizeof(xfs_efi_log_format_32_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); + uint len64 = sizeof(xfs_efi_log_format_64_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + + if (buf->i_len == len) { + memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); + return 0; + } else if (buf->i_len == len32) { + xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; + + dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; + dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; + dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents; + dst_efi_fmt->efi_id = src_efi_fmt_32->efi_id; + for (i = 0; i < dst_efi_fmt->efi_nextents; i++) { + dst_efi_fmt->efi_extents[i].ext_start = + src_efi_fmt_32->efi_extents[i].ext_start; + dst_efi_fmt->efi_extents[i].ext_len = + src_efi_fmt_32->efi_extents[i].ext_len; + } + return 0; + } else if (buf->i_len == len64) { + xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr; + + dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; + dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; + dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents; + dst_efi_fmt->efi_id = src_efi_fmt_64->efi_id; + for (i = 0; i < dst_efi_fmt->efi_nextents; i++) { + dst_efi_fmt->efi_extents[i].ext_start = + src_efi_fmt_64->efi_extents[i].ext_start; + dst_efi_fmt->efi_extents[i].ext_len = + src_efi_fmt_64->efi_extents[i].ext_len; + } + return 0; + } + return EFSCORRUPTED; +} + +/* + * This is called by the efd item code below to release references to the given + * efi item. Each efd calls this with the number of extents that it has + * logged, and when the sum of these reaches the total number of extents logged + * by this efi item we can free the efi item. 
+ */ +void +xfs_efi_release(xfs_efi_log_item_t *efip, + uint nextents) +{ + ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) + __xfs_efi_release(efip); +} + +static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efd_log_item, efd_item); +} + +STATIC void +xfs_efd_item_free(struct xfs_efd_log_item *efdp) +{ + if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) + kmem_free(efdp); + else + kmem_zone_free(xfs_efd_zone, efdp); +} + +/* + * This returns the number of iovecs needed to log the given efd item. + * We only need 1 iovec for an efd item. It just logs the efd_log_format + * structure. + */ +STATIC uint +xfs_efd_item_size( + struct xfs_log_item *lip) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efd log item. We use only 1 iovec, and we point that + * at the efd_log_format structure embedded in the efd item. + * It is at this point that we assert that all of the extent + * slots in the efd item have been filled. + */ +STATIC void +xfs_efd_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) +{ + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + uint size; + + ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); + + efdp->efd_format.efd_type = XFS_LI_EFD; + + size = sizeof(xfs_efd_log_format_t); + size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); + efdp->efd_format.efd_size = 1; + + log_vector->i_addr = &efdp->efd_format; + log_vector->i_len = size; + log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; + ASSERT(size >= sizeof(xfs_efd_log_format_t)); +} + +/* + * Pinning has no meaning for an efd item, so just return. + */ +STATIC void +xfs_efd_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an efd item, unpinning does + * not either. + */ +STATIC void +xfs_efd_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * Efd items have no locking, so just return success. + */ +STATIC uint +xfs_efd_item_trylock( + struct xfs_log_item *lip) +{ + return XFS_ITEM_LOCKED; +} + +/* + * Efd items have no locking or pushing, so return failure + * so that the caller doesn't bother with us. + */ +STATIC void +xfs_efd_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efd_item_free(EFD_ITEM(lip)); +} + +/* + * When the efd item is committed to disk, all we need to do + * is delete our reference to our partner efi item and then + * free ourselves. Since we're freeing ourselves we must + * return -1 to keep the transaction code from further referencing + * this item. + */ +STATIC xfs_lsn_t +xfs_efd_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + + /* + * If we got a log I/O error, it's always the case that the LR with the + * EFI got unpinned and freed before the EFD got aborted. + */ + if (!(lip->li_flags & XFS_LI_ABORTED)) + xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); + + xfs_efd_item_free(efdp); + return (xfs_lsn_t)-1; +} + +/* + * There isn't much you can do to push on an efd item. It is simply + * stuck waiting for the log to be flushed to disk. + */ +STATIC void +xfs_efd_item_push( + struct xfs_log_item *lip) +{ +} + +/* + * The EFD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. 
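xfs_efi_release() is a countdown: each EFD subtracts the number of extents it covered, and whichever caller drives efi_next_extent to zero frees the EFI. A standalone sketch of the same pattern using C11 atomics in place of atomic_sub_and_test(); the extent counts are invented.

/*
 * Reference-countdown sketch: the caller that brings the count to
 * zero is the one that frees the object.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct efi {
	atomic_int remaining;	/* extents still to be accounted for */
};

static void efi_release(struct efi *efip, int nextents)
{
	/* fetch_sub returns the old value; old == nextents means we hit zero */
	if (atomic_fetch_sub(&efip->remaining, nextents) == nextents) {
		printf("last reference dropped, freeing EFI\n");
		free(efip);
	}
}

int main(void)
{
	struct efi *efip = malloc(sizeof(*efip));

	if (!efip)
		return 1;
	atomic_init(&efip->remaining, 5);	/* EFI logged 5 extents */

	efi_release(efip, 2);	/* first EFD freed 2 extents */
	efi_release(efip, 3);	/* second EFD frees the rest and the EFI */
	return 0;
}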
The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_efd_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all efd log items. + */ +static struct xfs_item_ops xfs_efd_item_ops = { + .iop_size = xfs_efd_item_size, + .iop_format = xfs_efd_item_format, + .iop_pin = xfs_efd_item_pin, + .iop_unpin = xfs_efd_item_unpin, + .iop_trylock = xfs_efd_item_trylock, + .iop_unlock = xfs_efd_item_unlock, + .iop_committed = xfs_efd_item_committed, + .iop_push = xfs_efd_item_push, + .iop_committing = xfs_efd_item_committing +}; + +/* + * Allocate and initialize an efd item with the given number of extents. + */ +struct xfs_efd_log_item * +xfs_efd_init( + struct xfs_mount *mp, + struct xfs_efi_log_item *efip, + uint nextents) + +{ + struct xfs_efd_log_item *efdp; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efd_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efdp = kmem_zalloc(size, KM_SLEEP); + } else { + efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = nextents; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return efdp; +} diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h new file mode 100644 index 00000000..375f68e4 --- /dev/null +++ b/fs/xfs/xfs_extfree_item.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTFREE_ITEM_H__ +#define __XFS_EXTFREE_ITEM_H__ + +struct xfs_mount; +struct kmem_zone; + +typedef struct xfs_extent { + xfs_dfsbno_t ext_start; + xfs_extlen_t ext_len; +} xfs_extent_t; + +/* + * Since an xfs_extent_t has types (start:64, len: 32) + * there are different alignments on 32 bit and 64 bit kernels. + * So we provide the different variants for use by a + * conversion routine. + */ + +typedef struct xfs_extent_32 { + __uint64_t ext_start; + __uint32_t ext_len; +} __attribute__((packed)) xfs_extent_32_t; + +typedef struct xfs_extent_64 { + __uint64_t ext_start; + __uint32_t ext_len; + __uint32_t ext_pad; +} xfs_extent_64_t; + +/* + * This is the structure used to lay out an efi log item in the + * log. The efi_extents field is a variable size array whose + * size is given by efi_nextents. 
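The _32/_64 extent variants exist because a {64-bit, 32-bit} pair occupies 12 bytes in the packed 32-bit layout but 16 bytes with natural 64-bit alignment, and log recovery may see records written by either flavour of kernel. A standalone demonstration of that size difference; the struct names are illustrative mirrors of the typedefs above.

/*
 * Why both xfs_extent_32_t and xfs_extent_64_t are needed: the same
 * two fields have two different sizes depending on alignment rules.
 */
#include <stdio.h>
#include <stdint.h>

struct extent_packed {		/* mirrors xfs_extent_32_t */
	uint64_t start;
	uint32_t len;
} __attribute__((packed));

struct extent_padded {		/* mirrors xfs_extent_64_t */
	uint64_t start;
	uint32_t len;
	uint32_t pad;
};

int main(void)
{
	printf("packed layout: %zu bytes\n", sizeof(struct extent_packed));
	printf("padded layout: %zu bytes\n", sizeof(struct extent_padded));
	/* Log records written by one kernel flavour may be replayed by the
	 * other, so recovery converts both sizes explicitly. */
	return 0;
}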
+ */ +typedef struct xfs_efi_log_format { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_t; + +typedef struct xfs_efi_log_format_32 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_32_t efi_extents[1]; /* array of extents to free */ +} __attribute__((packed)) xfs_efi_log_format_32_t; + +typedef struct xfs_efi_log_format_64 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_64_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_64_t; + +/* + * This is the structure used to lay out an efd log item in the + * log. The efd_extents array is a variable size array whose + * size is given by efd_nextents; + */ +typedef struct xfs_efd_log_format { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_t; + +typedef struct xfs_efd_log_format_32 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_32_t efd_extents[1]; /* array of extents freed */ +} __attribute__((packed)) xfs_efd_log_format_32_t; + +typedef struct xfs_efd_log_format_64 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_64_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_64_t; + + +#ifdef __KERNEL__ + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_EFI_MAX_FAST_EXTENTS 16 + +/* + * Define EFI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_EFI_RECOVERED 1 +#define XFS_EFI_COMMITTED 2 + +/* + * This is the "extent free intention" log item. It is used + * to log the fact that some extents need to be free. It is + * used in conjunction with the "extent free done" log item + * described below. + */ +typedef struct xfs_efi_log_item { + xfs_log_item_t efi_item; + atomic_t efi_next_extent; + unsigned long efi_flags; /* misc flags */ + xfs_efi_log_format_t efi_format; +} xfs_efi_log_item_t; + +/* + * This is the "extent free done" log item. It is used to log + * the fact that some extents earlier mentioned in an efi item + * have been freed. + */ +typedef struct xfs_efd_log_item { + xfs_log_item_t efd_item; + xfs_efi_log_item_t *efd_efip; + uint efd_next_extent; + xfs_efd_log_format_t efd_format; +} xfs_efd_log_item_t; + +/* + * Max number of extents in fast allocation path. 
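The one-element efi_extents/efd_extents arrays above are the usual trick for variable-length log records: the item is over-allocated so the trailing array can hold the full efi_nextents/efd_nextents entries. A minimal sketch of the resulting size rule, mirroring the sizing logic in xfs_efd_init() and xfs_efd_item_format() shown earlier in the patch; the helper name is hypothetical and not part of the patch:

/*
 * Illustrative only: bytes needed for an in-core EFD item carrying
 * "nextents" extents.  The format structure already embeds one
 * xfs_extent_t, hence the "- 1".  Items that fit the fast path
 * (nextents <= XFS_EFD_MAX_FAST_EXTENTS) come from xfs_efd_zone,
 * larger ones are allocated from the heap at this size.
 */
static inline uint
example_efd_item_bytes(uint nextents)
{
	return sizeof(xfs_efd_log_item_t) +
	       (nextents - 1) * sizeof(xfs_extent_t);
}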
+ */ +#define XFS_EFD_MAX_FAST_EXTENTS 16 + +extern struct kmem_zone *xfs_efi_zone; +extern struct kmem_zone *xfs_efd_zone; + +xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); +xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, + uint); +int xfs_efi_copy_format(xfs_log_iovec_t *buf, + xfs_efi_log_format_t *dst_efi_fmt); +void xfs_efi_item_free(xfs_efi_log_item_t *); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c new file mode 100644 index 00000000..9124425b --- /dev/null +++ b/fs/xfs/xfs_filestream.c @@ -0,0 +1,824 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_bmap_btree.h" +#include "xfs_inum.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ag.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_bmap.h" +#include "xfs_alloc.h" +#include "xfs_utils.h" +#include "xfs_mru_cache.h" +#include "xfs_filestream.h" +#include "xfs_trace.h" + +#ifdef XFS_FILESTREAMS_TRACE + +ktrace_t *xfs_filestreams_trace_buf; + +STATIC void +xfs_filestreams_trace( + xfs_mount_t *mp, /* mount point */ + int type, /* type of trace */ + const char *func, /* source function */ + int line, /* source line number */ + __psunsigned_t arg0, + __psunsigned_t arg1, + __psunsigned_t arg2, + __psunsigned_t arg3, + __psunsigned_t arg4, + __psunsigned_t arg5) +{ + ktrace_enter(xfs_filestreams_trace_buf, + (void *)(__psint_t)(type | (line << 16)), + (void *)func, + (void *)(__psunsigned_t)current_pid(), + (void *)mp, + (void *)(__psunsigned_t)arg0, + (void *)(__psunsigned_t)arg1, + (void *)(__psunsigned_t)arg2, + (void *)(__psunsigned_t)arg3, + (void *)(__psunsigned_t)arg4, + (void *)(__psunsigned_t)arg5, + NULL, NULL, NULL, NULL, NULL, NULL); +} + +#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0) +#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0) +#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0) +#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0) +#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) +#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) +#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ + xfs_filestreams_trace(mp, t, __func__, __LINE__, \ + (__psunsigned_t)a0, (__psunsigned_t)a1, \ + (__psunsigned_t)a2, (__psunsigned_t)a3, \ + (__psunsigned_t)a4, (__psunsigned_t)a5) + +#define TRACE_AG_SCAN(mp, ag, ag2) \ + TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2); +#define TRACE_AG_PICK1(mp, max_ag, maxfree) \ + TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree); +#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \ + TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \ + cnt, free, scan, flag) +#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \ + TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, 
ag2, cnt2) +#define TRACE_FREE(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt) +#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt) +#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \ + TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt) +#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \ + TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt) +#define TRACE_ORPHAN(mp, ip, ag) \ + TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag); + + +#else +#define TRACE_AG_SCAN(mp, ag, ag2) +#define TRACE_AG_PICK1(mp, max_ag, maxfree) +#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) +#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) +#define TRACE_FREE(mp, ip, pip, ag, cnt) +#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) +#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) +#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) +#define TRACE_ORPHAN(mp, ip, ag) +#endif + +static kmem_zone_t *item_zone; + +/* + * Structure for associating a file or a directory with an allocation group. + * The parent directory pointer is only needed for files, but since there will + * generally be vastly more files than directories in the cache, using the same + * data structure simplifies the code with very little memory overhead. + */ +typedef struct fstrm_item +{ + xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ + xfs_inode_t *ip; /* inode self-pointer. */ + xfs_inode_t *pip; /* Parent directory inode pointer. */ +} fstrm_item_t; + +/* + * Allocation group filestream associations are tracked with per-ag atomic + * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a + * particular AG already has active filestreams associated with it. The mount + * point's m_peraglock is used to protect these counters from per-ag array + * re-allocation during a growfs operation. When xfs_growfs_data_private() is + * about to reallocate the array, it calls xfs_filestream_flush() with the + * m_peraglock held in write mode. + * + * Since xfs_mru_cache_flush() guarantees that all the free functions for all + * the cache elements have finished executing before it returns, it's safe for + * the free functions to use the atomic counters without m_peraglock protection. + * This allows the implementation of xfs_fstrm_free_func() to be agnostic about + * whether it was called with the m_peraglock held in read mode, write mode or + * not held at all. The race condition this addresses is the following: + * + * - The work queue scheduler fires and pulls a filestream directory cache + * element off the LRU end of the cache for deletion, then gets pre-empted. + * - A growfs operation grabs the m_peraglock in write mode, flushes all the + * remaining items from the cache and reallocates the mount point's per-ag + * array, resetting all the counters to zero. + * - The work queue thread resumes and calls the free function for the element + * it started cleaning up earlier. In the process it decrements the + * filestreams counter for an AG that now has no references. + * + * With a shrinkfs feature, the above scenario could panic the system. + * + * All other uses of the following macros should be protected by either the + * m_peraglock held in read mode, or the cache's internal locking exposed by the + * interval between a call to xfs_mru_cache_lookup() and a call to + * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode + * when new elements are added to the cache. 
+ * + * Combined, these locking rules ensure that no associations will ever exist in + * the cache that reference per-ag array elements that have since been + * reallocated. + */ +static int +xfs_filestream_peek_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static int +xfs_filestream_get_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_inc_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static void +xfs_filestream_put_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + atomic_dec(&pag->pagf_fstrms); + xfs_perag_put(pag); +} + +/* + * Scan the AGs starting at startag looking for an AG that isn't in use and has + * at least minlen blocks free. + */ +static int +_xfs_filestream_pick_ag( + xfs_mount_t *mp, + xfs_agnumber_t startag, + xfs_agnumber_t *agp, + int flags, + xfs_extlen_t minlen) +{ + int streams, max_streams; + int err, trylock, nscan; + xfs_extlen_t longest, free, minfree, maxfree = 0; + xfs_agnumber_t ag, max_ag = NULLAGNUMBER; + struct xfs_perag *pag; + + /* 2% of an AG's blocks must be free for it to be chosen. */ + minfree = mp->m_sb.sb_agblocks / 50; + + ag = startag; + *agp = NULLAGNUMBER; + + /* For the first pass, don't sleep trying to init the per-AG. */ + trylock = XFS_ALLOC_FLAG_TRYLOCK; + + for (nscan = 0; 1; nscan++) { + pag = xfs_perag_get(mp, ag); + TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms)); + + if (!pag->pagf_init) { + err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); + if (err && !trylock) { + xfs_perag_put(pag); + return err; + } + } + + /* Might fail sometimes during the 1st pass with trylock set. */ + if (!pag->pagf_init) + goto next_ag; + + /* Keep track of the AG with the most free blocks. */ + if (pag->pagf_freeblks > maxfree) { + maxfree = pag->pagf_freeblks; + max_streams = atomic_read(&pag->pagf_fstrms); + max_ag = ag; + } + + /* + * The AG reference count does two things: it enforces mutual + * exclusion when examining the suitability of an AG in this + * loop, and it guards against two filestreams being established + * in the same AG as each other. + */ + if (xfs_filestream_get_ag(mp, ag) > 1) { + xfs_filestream_put_ag(mp, ag); + goto next_ag; + } + + longest = xfs_alloc_longest_free_extent(mp, pag); + if (((minlen && longest >= minlen) || + (!minlen && pag->pagf_freeblks >= minfree)) && + (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || + (flags & XFS_PICK_LOWSPACE))) { + + /* Break out, retaining the reference on the AG. */ + free = pag->pagf_freeblks; + streams = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + *agp = ag; + break; + } + + /* Drop the reference on this AG, it's not usable. */ + xfs_filestream_put_ag(mp, ag); +next_ag: + xfs_perag_put(pag); + /* Move to the next AG, wrapping to AG 0 if necessary. */ + if (++ag >= mp->m_sb.sb_agcount) + ag = 0; + + /* If a full pass of the AGs hasn't been done yet, continue. */ + if (ag != startag) + continue; + + /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ + if (trylock != 0) { + trylock = 0; + continue; + } + + /* Finally, if lowspace wasn't set, set it for the 3rd pass. 
*/ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + continue; + } + + /* + * Take the AG with the most free space, regardless of whether + * it's already in use by another filestream. + */ + if (max_ag != NULLAGNUMBER) { + xfs_filestream_get_ag(mp, max_ag); + TRACE_AG_PICK1(mp, max_ag, maxfree); + streams = max_streams; + free = maxfree; + *agp = max_ag; + break; + } + + /* take AG 0 if none matched */ + TRACE_AG_PICK1(mp, max_ag, maxfree); + *agp = 0; + return 0; + } + + TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); + + return 0; +} + +/* + * Set the allocation group number for a file or a directory, updating inode + * references and per-AG references as appropriate. + */ +static int +_xfs_filestream_update_ag( + xfs_inode_t *ip, + xfs_inode_t *pip, + xfs_agnumber_t ag) +{ + int err = 0; + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t old_ag; + xfs_inode_t *old_pip; + + /* + * Either ip is a regular file and pip is a directory, or ip is a + * directory and pip is NULL. + */ + ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && + (pip->i_d.di_mode & S_IFDIR)) || + ((ip->i_d.di_mode & S_IFDIR) && !pip))); + + mp = ip->i_mount; + cache = mp->m_filestream; + + item = xfs_mru_cache_lookup(cache, ip->i_ino); + if (item) { + ASSERT(item->ip == ip); + old_ag = item->ag; + item->ag = ag; + old_pip = item->pip; + item->pip = pip; + xfs_mru_cache_done(cache); + + /* + * If the AG has changed, drop the old ref and take a new one, + * effectively transferring the reference from old to new AG. + */ + if (ag != old_ag) { + xfs_filestream_put_ag(mp, old_ag); + xfs_filestream_get_ag(mp, ag); + } + + /* + * If ip is a file and its pip has changed, drop the old ref and + * take a new one. + */ + if (pip && pip != old_pip) { + IRELE(old_pip); + IHOLD(pip); + } + + TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), + ag, xfs_filestream_peek_ag(mp, ag)); + return 0; + } + + item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); + if (!item) + return ENOMEM; + + item->ag = ag; + item->ip = ip; + item->pip = pip; + + err = xfs_mru_cache_insert(cache, ip->i_ino, item); + if (err) { + kmem_zone_free(item_zone, item); + return err; + } + + /* Take a reference on the AG. */ + xfs_filestream_get_ag(mp, ag); + + /* + * Take a reference on the inode itself regardless of whether it's a + * regular file or a directory. + */ + IHOLD(ip); + + /* + * In the case of a regular file, take a reference on the parent inode + * as well to ensure it remains in-core. + */ + if (pip) + IHOLD(pip); + + TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), + ag, xfs_filestream_peek_ag(mp, ag)); + + return 0; +} + +/* xfs_fstrm_free_func(): callback for freeing cached stream items. */ +STATIC void +xfs_fstrm_free_func( + unsigned long ino, + void *data) +{ + fstrm_item_t *item = (fstrm_item_t *)data; + xfs_inode_t *ip = item->ip; + + ASSERT(ip->i_ino == ino); + + xfs_iflags_clear(ip, XFS_IFILESTREAM); + + /* Drop the reference taken on the AG when the item was added. */ + xfs_filestream_put_ag(ip->i_mount, item->ag); + + TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, + xfs_filestream_peek_ag(ip->i_mount, item->ag)); + + /* + * _xfs_filestream_update_ag() always takes a reference on the inode + * itself, whether it's a file or a directory. Release it here. + * This can result in the inode being freed and so we must + * not hold any inode locks when freeing filesstreams objects + * otherwise we can deadlock here. 
+ */ + IRELE(ip); + + /* + * In the case of a regular file, _xfs_filestream_update_ag() also + * takes a ref on the parent inode to keep it in-core. Release that + * too. + */ + if (item->pip) + IRELE(item->pip); + + /* Finally, free the memory allocated for the item. */ + kmem_zone_free(item_zone, item); +} + +/* + * xfs_filestream_init() is called at xfs initialisation time to set up the + * memory zone that will be used for filestream data structure allocation. + */ +int +xfs_filestream_init(void) +{ + item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); + if (!item_zone) + return -ENOMEM; + + return 0; +} + +/* + * xfs_filestream_uninit() is called at xfs termination time to destroy the + * memory zone that was used for filestream data structure allocation. + */ +void +xfs_filestream_uninit(void) +{ + kmem_zone_destroy(item_zone); +} + +/* + * xfs_filestream_mount() is called when a file system is mounted with the + * filestream option. It is responsible for allocating the data structures + * needed to track the new file system's file streams. + */ +int +xfs_filestream_mount( + xfs_mount_t *mp) +{ + int err; + unsigned int lifetime, grp_count; + + /* + * The filestream timer tunable is currently fixed within the range of + * one second to four minutes, with five seconds being the default. The + * group count is somewhat arbitrary, but it'd be nice to adhere to the + * timer tunable to within about 10 percent. This requires at least 10 + * groups. + */ + lifetime = xfs_fstrm_centisecs * 10; + grp_count = 10; + + err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, + xfs_fstrm_free_func); + + return err; +} + +/* + * xfs_filestream_unmount() is called when a file system that was mounted with + * the filestream option is unmounted. It drains the data structures created + * to track the file system's file streams and frees all the memory that was + * allocated. + */ +void +xfs_filestream_unmount( + xfs_mount_t *mp) +{ + xfs_mru_cache_destroy(mp->m_filestream); +} + +/* + * Return the AG of the filestream the file or directory belongs to, or + * NULLAGNUMBER otherwise. + */ +xfs_agnumber_t +xfs_filestream_lookup_ag( + xfs_inode_t *ip) +{ + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t ag; + int ref; + + if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { + ASSERT(0); + return NULLAGNUMBER; + } + + cache = ip->i_mount->m_filestream; + item = xfs_mru_cache_lookup(cache, ip->i_ino); + if (!item) { + TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); + return NULLAGNUMBER; + } + + ASSERT(ip == item->ip); + ag = item->ag; + ref = xfs_filestream_peek_ag(ip->i_mount, ag); + xfs_mru_cache_done(cache); + + TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); + return ag; +} + +/* + * xfs_filestream_associate() should only be called to associate a regular file + * with its parent directory. Calling it with a child directory isn't + * appropriate because filestreams don't apply to entire directory hierarchies. + * Creating a file in a child directory of an existing filestream directory + * starts a new filestream with its own allocation group association. + * + * Returns < 0 on error, 0 if successful association occurred, > 0 if + * we failed to get an association because of locking issues. 
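For orientation, a hypothetical caller would handle the three documented return ranges like this. Sketch only, not part of the patch: the helper name is invented, and the use of xfs_iflags_set() with XFS_IFILESTREAM is an assumption based on the flag-clearing done in xfs_fstrm_free_func() above:

/*
 * Illustrative caller only: < 0 is a hard error, 0 means the
 * association was made, > 0 means we lost the lock race and simply
 * fall back to normal allocation.
 */
STATIC int
example_start_filestream(xfs_inode_t *pip, xfs_inode_t *ip)
{
	int error = xfs_filestream_associate(pip, ip);

	if (error < 0)
		return -error;		/* hand the error back */
	if (error == 0)
		xfs_iflags_set(ip, XFS_IFILESTREAM);	/* remember it */
	/* error > 0: lock contention, quietly do without a filestream */
	return 0;
}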
+ */ +int +xfs_filestream_associate( + xfs_inode_t *pip, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + fstrm_item_t *item; + xfs_agnumber_t ag, rotorstep, startag; + int err = 0; + + ASSERT(pip->i_d.di_mode & S_IFDIR); + ASSERT(ip->i_d.di_mode & S_IFREG); + if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) + return -EINVAL; + + mp = pip->i_mount; + cache = mp->m_filestream; + + /* + * We have a problem, Houston. + * + * Taking the iolock here violates inode locking order - we already + * hold the ilock. Hence if we block getting this lock we may never + * wake. Unfortunately, that means if we can't get the lock, we're + * screwed in terms of getting a stream association - we can't spin + * waiting for the lock because someone else is waiting on the lock we + * hold and we cannot drop that as we are in a transaction here. + * + * Lucky for us, this inversion is not a problem because it's a + * directory inode that we are trying to lock here. + * + * So, if we can't get the iolock without sleeping then just give up + */ + if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) + return 1; + + /* If the parent directory is already in the cache, use its AG. */ + item = xfs_mru_cache_lookup(cache, pip->i_ino); + if (item) { + ASSERT(item->ip == pip); + ag = item->ag; + xfs_mru_cache_done(cache); + + TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); + err = _xfs_filestream_update_ag(ip, pip, ag); + + goto exit; + } + + /* + * Set the starting AG using the rotor for inode32, otherwise + * use the directory inode's AG. + */ + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + rotorstep = xfs_rotorstep; + startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } else + startag = XFS_INO_TO_AGNO(mp, pip->i_ino); + + /* Pick a new AG for the parent inode starting at startag. */ + err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); + if (err || ag == NULLAGNUMBER) + goto exit_did_pick; + + /* Associate the parent inode with the AG. */ + err = _xfs_filestream_update_ag(pip, NULL, ag); + if (err) + goto exit_did_pick; + + /* Associate the file inode with the AG. */ + err = _xfs_filestream_update_ag(ip, pip, ag); + if (err) + goto exit_did_pick; + + TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); + +exit_did_pick: + /* + * If _xfs_filestream_pick_ag() returned a valid AG, remove the + * reference it took on it, since the file and directory will have taken + * their own now if they were successfully cached. + */ + if (ag != NULLAGNUMBER) + xfs_filestream_put_ag(mp, ag); + +exit: + xfs_iunlock(pip, XFS_IOLOCK_EXCL); + return -err; +} + +/* + * Pick a new allocation group for the current file and its file stream. This + * function is called by xfs_bmap_filestreams() with the mount point's per-ag + * lock held. + */ +int +xfs_filestream_new_ag( + xfs_bmalloca_t *ap, + xfs_agnumber_t *agp) +{ + int flags, err; + xfs_inode_t *ip, *pip = NULL; + xfs_mount_t *mp; + xfs_mru_cache_t *cache; + xfs_extlen_t minlen; + fstrm_item_t *dir, *file; + xfs_agnumber_t ag = NULLAGNUMBER; + + ip = ap->ip; + mp = ip->i_mount; + cache = mp->m_filestream; + minlen = ap->alen; + *agp = NULLAGNUMBER; + + /* + * Look for the file in the cache, removing it if it's found. Doing + * this allows it to be held across the dir lookup that follows. 
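The MRU cache calls used throughout this file follow a strict pairing: a successful xfs_mru_cache_lookup() returns with the cache's internal lock held, and the caller must release it with xfs_mru_cache_done() once it has finished examining the item. A compressed sketch of the pattern, mirroring xfs_filestream_lookup_ag() above; the helper name is hypothetical:

/*
 * Illustrative lookup pattern only, not part of the patch.
 */
STATIC xfs_agnumber_t
example_cached_ag(xfs_mru_cache_t *cache, xfs_inode_t *ip)
{
	fstrm_item_t	*item;
	xfs_agnumber_t	ag = NULLAGNUMBER;

	item = xfs_mru_cache_lookup(cache, ip->i_ino);
	if (item) {
		ag = item->ag;			/* safe: cache lock is held */
		xfs_mru_cache_done(cache);	/* always pair with lookup */
	}
	return ag;
}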
+ */ + file = xfs_mru_cache_remove(cache, ip->i_ino); + if (file) { + ASSERT(ip == file->ip); + + /* Save the file's parent inode and old AG number for later. */ + pip = file->pip; + ag = file->ag; + + /* Look for the file's directory in the cache. */ + dir = xfs_mru_cache_lookup(cache, pip->i_ino); + if (dir) { + ASSERT(pip == dir->ip); + + /* + * If the directory has already moved on to a new AG, + * use that AG as the new AG for the file. Don't + * forget to twiddle the AG refcounts to match the + * movement. + */ + if (dir->ag != file->ag) { + xfs_filestream_put_ag(mp, file->ag); + xfs_filestream_get_ag(mp, dir->ag); + *agp = file->ag = dir->ag; + } + + xfs_mru_cache_done(cache); + } + + /* + * Put the file back in the cache. If this fails, the free + * function needs to be called to tidy up in the same way as if + * the item had simply expired from the cache. + */ + err = xfs_mru_cache_insert(cache, ip->i_ino, file); + if (err) { + xfs_fstrm_free_func(ip->i_ino, file); + return err; + } + + /* + * If the file's AG was moved to the directory's new AG, there's + * nothing more to be done. + */ + if (*agp != NULLAGNUMBER) { + TRACE_MOVEAG(mp, ip, pip, + ag, xfs_filestream_peek_ag(mp, ag), + *agp, xfs_filestream_peek_ag(mp, *agp)); + return 0; + } + } + + /* + * If the file's parent directory is known, take its iolock in exclusive + * mode to prevent two sibling files from racing each other to migrate + * themselves and their parent to different AGs. + * + * Note that we lock the parent directory iolock inside the child + * iolock here. That's fine as we never hold both parent and child + * iolock in any other place. This is different from the ilock, + * which requires locking of the child after the parent for namespace + * operations. + */ + if (pip) + xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + + /* + * A new AG needs to be found for the file. If the file's parent + * directory is also known, it will be moved to the new AG as well to + * ensure that files created inside it in future use the new AG. + */ + ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; + flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | + (ap->low ? XFS_PICK_LOWSPACE : 0); + + err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); + if (err || *agp == NULLAGNUMBER) + goto exit; + + /* + * If the file wasn't found in the file cache, then its parent directory + * inode isn't known. For this to have happened, the file must either + * be pre-existing, or it was created long enough ago that its cache + * entry has expired. This isn't the sort of usage that the filestreams + * allocator is trying to optimise, so there's no point trying to track + * its new AG somehow in the filestream data structures. + */ + if (!pip) { + TRACE_ORPHAN(mp, ip, *agp); + goto exit; + } + + /* Associate the parent inode with the AG. */ + err = _xfs_filestream_update_ag(pip, NULL, *agp); + if (err) + goto exit; + + /* Associate the file inode with the AG. */ + err = _xfs_filestream_update_ag(ip, pip, *agp); + if (err) + goto exit; + + TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, + *agp, xfs_filestream_peek_ag(mp, *agp)); + +exit: + /* + * If _xfs_filestream_pick_ag() returned a valid AG, remove the + * reference it took on it, since the file and directory will have taken + * their own now if they were successfully cached. 
+ */ + if (*agp != NULLAGNUMBER) + xfs_filestream_put_ag(mp, *agp); + else + *agp = 0; + + if (pip) + xfs_iunlock(pip, XFS_IOLOCK_EXCL); + + return err; +} + +/* + * Remove an association between an inode and a filestream object. + * Typically this is done on last close of an unlinked file. + */ +void +xfs_filestream_deassociate( + xfs_inode_t *ip) +{ + xfs_mru_cache_t *cache = ip->i_mount->m_filestream; + + xfs_mru_cache_delete(cache, ip->i_ino); +} diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h new file mode 100644 index 00000000..09dd9af4 --- /dev/null +++ b/fs/xfs/xfs_filestream.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FILESTREAM_H__ +#define __XFS_FILESTREAM_H__ + +#ifdef __KERNEL__ + +struct xfs_mount; +struct xfs_inode; +struct xfs_perag; +struct xfs_bmalloca; + +#ifdef XFS_FILESTREAMS_TRACE +#define XFS_FSTRM_KTRACE_INFO 1 +#define XFS_FSTRM_KTRACE_AGSCAN 2 +#define XFS_FSTRM_KTRACE_AGPICK1 3 +#define XFS_FSTRM_KTRACE_AGPICK2 4 +#define XFS_FSTRM_KTRACE_UPDATE 5 +#define XFS_FSTRM_KTRACE_FREE 6 +#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7 +#define XFS_FSTRM_KTRACE_ASSOCIATE 8 +#define XFS_FSTRM_KTRACE_MOVEAG 9 +#define XFS_FSTRM_KTRACE_ORPHAN 10 + +#define XFS_FSTRM_KTRACE_SIZE 16384 +extern ktrace_t *xfs_filestreams_trace_buf; + +#endif + +/* allocation selection flags */ +typedef enum xfs_fstrm_alloc { + XFS_PICK_USERDATA = 1, + XFS_PICK_LOWSPACE = 2, +} xfs_fstrm_alloc_t; + +/* prototypes for filestream.c */ +int xfs_filestream_init(void); +void xfs_filestream_uninit(void); +int xfs_filestream_mount(struct xfs_mount *mp); +void xfs_filestream_unmount(struct xfs_mount *mp); +xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); +int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); +void xfs_filestream_deassociate(struct xfs_inode *ip); +int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); + + +/* filestreams for the inode? */ +static inline int +xfs_inode_is_filestream( + struct xfs_inode *ip) +{ + return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || + xfs_iflags_test(ip, XFS_IFILESTREAM) || + (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); +} + +#endif /* __KERNEL__ */ + +#endif /* __XFS_FILESTREAM_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h new file mode 100644 index 00000000..8f6fc1a9 --- /dev/null +++ b/fs/xfs/xfs_fs.h @@ -0,0 +1,501 @@ +/* + * Copyright (c) 1995-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FS_H__ +#define __XFS_FS_H__ + +/* + * SGI's XFS filesystem's major stuff (constants, structures) + */ + +/* + * Direct I/O attribute record used with XFS_IOC_DIOINFO + * d_miniosz is the min xfer size, xfer size multiple and file seek offset + * alignment. + */ +#ifndef HAVE_DIOATTR +struct dioattr { + __u32 d_mem; /* data buffer memory alignment */ + __u32 d_miniosz; /* min xfer size */ + __u32 d_maxiosz; /* max xfer size */ +}; +#endif + +/* + * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. + */ +#ifndef HAVE_FSXATTR +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#endif + +/* + * Flags for the bs_xflags/fsx_xflags field + * There should be a one-to-one correspondence between these flags and the + * XFS_DIFLAG_s. + */ +#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + +/* + * Structure for XFS_IOC_GETBMAP. + * On input, fill in bmv_offset and bmv_length of the first structure + * to indicate the area of interest in the file, and bmv_entries with + * the number of array elements given back. The first structure is + * updated on return to give the offset and length for the next call. + */ +#ifndef HAVE_GETBMAP +struct getbmap { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output) */ +}; +#endif + +/* + * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries + * are used exactly as in the getbmap structure. The getbmapx structure + * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field + * is only used for the first structure. It contains input flags + * specifying XFS_IOC_GETBMAPX actions. 
The bmv_oflags field is filled + * in by the XFS_IOC_GETBMAPX command for each returned structure after + * the first. + */ +#ifndef HAVE_GETBMAPX +struct getbmapx { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output). */ + __s32 bmv_iflags; /* input flags (1st structure) */ + __s32 bmv_oflags; /* output flags (after 1st structure)*/ + __s32 bmv_unused1; /* future use */ + __s32 bmv_unused2; /* future use */ +}; +#endif + +/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */ +#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ +#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ +#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ +#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ +#define BMV_IF_VALID \ + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ + BMV_IF_DELALLOC|BMV_IF_NO_HOLES) + +/* bmv_oflags values - returned for each non-header segment */ +#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ +#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ +#define BMV_OF_LAST 0x4 /* segment is the last in the file */ + +/* + * Structure for XFS_IOC_FSSETDM. + * For use by backup and restore programs to set the XFS on-disk inode + * fields di_dmevmask and di_dmstate. These must be set to exactly and + * only values previously obtained via xfs_bulkstat! (Specifically the + * xfs_bstat_t fields bs_dmevmask and bs_dmstate.) + */ +#ifndef HAVE_FSDMIDATA +struct fsdmidata { + __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ + __u16 fsd_padding; + __u16 fsd_dmstate; /* corresponds to di_dmstate */ +}; +#endif + +/* + * File segment locking set data type for 64 bit access. + * Also used for all the RESV/FREE interfaces. 
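To make the RESV/FREE calling convention concrete, a user-space preallocation sketch is shown below. It uses the xfs_flock64 structure defined just after this comment and the XFS_IOC_RESVSP64 ioctl defined later in this header; the install location of the header and the helper name are assumptions, not part of the patch:

/* Illustrative user-space sketch only. */
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* assumed install location of this header */

/* Reserve (preallocate) len bytes at offset off without writing data. */
static int example_prealloc(int fd, long long off, long long len)
{
	xfs_flock64_t fl;

	memset(&fl, 0, sizeof(fl));
	fl.l_whence = SEEK_SET;	/* l_start is an absolute file offset */
	fl.l_start  = off;
	fl.l_len    = len;	/* l_len == 0 would mean "to end of file" */

	return ioctl(fd, XFS_IOC_RESVSP64, &fl);
}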
+ */ +typedef struct xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} xfs_flock64_t; + +/* + * Output for XFS_IOC_FSGEOMETRY_V1 + */ +typedef struct xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} xfs_fsop_geom_v1_t; + +/* + * Output for XFS_IOC_FSGEOMETRY + */ +typedef struct xfs_fsop_geom { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ + __u32 logsunit; /* log stripe unit, bytes */ +} xfs_fsop_geom_t; + +/* Output for XFS_FS_COUNTS */ +typedef struct xfs_fsop_counts { + __u64 freedata; /* free data section blocks */ + __u64 freertx; /* free rt extents */ + __u64 freeino; /* free inodes */ + __u64 allocino; /* total allocated inodes */ +} xfs_fsop_counts_t; + +/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */ +typedef struct xfs_fsop_resblks { + __u64 resblks; + __u64 resblks_avail; +} xfs_fsop_resblks_t; + +#define XFS_FSOP_GEOM_VERSION 0 + +#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */ +#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */ +#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */ +#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */ +#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */ +#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */ +#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */ +#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ +#define XFS_FSOP_GEOM_FLAGS_SECTOR 
0x0200 /* sector sizes >1BB */ +#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ + + +/* + * Minimum and maximum sizes need for growth checks + */ +#define XFS_MIN_AG_BLOCKS 64 +#define XFS_MIN_LOG_BLOCKS 512ULL +#define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL) +#define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL) + +/* keep the maximum size under 2^31 by a small amount */ +#define XFS_MAX_LOG_BYTES \ + ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) + +/* + * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT + */ +typedef struct xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} xfs_growfs_data_t; + +typedef struct xfs_growfs_log { + __u32 newblocks; /* new log size, fsblocks */ + __u32 isint; /* 1 if new log is internal */ +} xfs_growfs_log_t; + +typedef struct xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} xfs_growfs_rt_t; + + +/* + * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE + */ +typedef struct xfs_bstime { + time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} xfs_bstime_t; + +typedef struct xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + xfs_bstime_t bs_atime; /* access time */ + xfs_bstime_t bs_mtime; /* modify time */ + xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + __u16 bs_projid_hi; /* higher part of project id */ + unsigned char bs_pad[10]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} xfs_bstat_t; + +/* + * The user-level BulkStat Request interface structure. + */ +typedef struct xfs_fsop_bulkreq { + __u64 __user *lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + void __user *ubuffer;/* user buffer for inode desc. */ + __s32 __user *ocount; /* output count pointer */ +} xfs_fsop_bulkreq_t; + + +/* + * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS). + */ +typedef struct xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} xfs_inogrp_t; + + +/* + * Error injection. + */ +typedef struct xfs_error_injection { + __s32 fd; + __s32 errtag; +} xfs_error_injection_t; + + +/* + * The user-level Handle Request interface structure. 
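The handle interfaces let backup and restore tools refer to inodes without holding paths open: XFS_IOC_PATH_TO_HANDLE converts a pathname into an opaque handle, and XFS_IOC_OPEN_BY_HANDLE later turns such a handle back into an open descriptor. A minimal sketch of the first step, using the xfs_fsop_handlereq structure defined just below; the header location, the zero-initialisation, and the convention of issuing the ioctl on a descriptor opened somewhere on the target filesystem are assumptions:

/* Illustrative user-space sketch only, not part of the patch. */
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* assumed install location of this header */

static int example_path_to_handle(int mntfd, const char *path,
				  void *hbuf, __u32 *hlen)
{
	struct xfs_fsop_handlereq hreq;

	memset(&hreq, 0, sizeof(hreq));
	hreq.path     = (void *)path;	/* user pathname in */
	hreq.ohandle  = hbuf;		/* opaque handle out */
	hreq.ohandlen = hlen;		/* handle length out */

	return ioctl(mntfd, XFS_IOC_PATH_TO_HANDLE, &hreq);
}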
+ */ +typedef struct xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + void __user *path; /* user pathname */ + __u32 oflags; /* open flags */ + void __user *ihandle;/* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + void __user *ohandle;/* user buffer for handle */ + __u32 __user *ohandlen;/* user buffer length */ +} xfs_fsop_handlereq_t; + +/* + * Compound structures for passing args through Handle Request interfaces + * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and + * XFS_IOC_ATTRMULTI_BY_HANDLE + */ + +typedef struct xfs_fsop_setdm_handlereq { + struct xfs_fsop_handlereq hreq; /* handle information */ + struct fsdmidata __user *data; /* DMAPI data */ +} xfs_fsop_setdm_handlereq_t; + +typedef struct xfs_attrlist_cursor { + __u32 opaque[4]; +} xfs_attrlist_cursor_t; + +typedef struct xfs_fsop_attrlist_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to use */ + __u32 buflen; /* length of buffer supplied */ + void __user *buffer; /* returned names */ +} xfs_fsop_attrlist_handlereq_t; + +typedef struct xfs_attr_multiop { + __u32 am_opcode; +#define ATTR_OP_GET 1 /* return the indicated attr's value */ +#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ +#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ + __s32 am_error; + void __user *am_attrname; + void __user *am_attrvalue; + __u32 am_length; + __u32 am_flags; +} xfs_attr_multiop_t; + +typedef struct xfs_fsop_attrmulti_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + struct xfs_attr_multiop __user *ops; /* attr_multi data */ +} xfs_fsop_attrmulti_handlereq_t; + +/* + * per machine unique filesystem identifier types. + */ +typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ + +typedef struct xfs_fid { + __u16 fid_len; /* length of remainder */ + __u16 fid_pad; + __u32 fid_gen; /* generation number */ + __u64 fid_ino; /* 64 bits inode number */ +} xfs_fid_t; + +typedef struct xfs_handle { + union { + __s64 align; /* force alignment of ha_fid */ + xfs_fsid_t _ha_fsid; /* unique file system identifier */ + } ha_u; + xfs_fid_t ha_fid; /* file system specific file ID */ +} xfs_handle_t; +#define ha_fsid ha_u._ha_fsid + +#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \ + - (char *) &(handle)) \ + + (handle).ha_fid.fid_len) + +/* + * Flags for going down operation + */ +#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* + * ioctl commands that are used by Linux filesystems + */ +#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS +#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS +#define XFS_IOC_GETVERSION FS_IOC_GETVERSION + +/* + * ioctl commands that replace IRIX fcntl()'s + * For 'documentation' purposed more than anything else, + * the "cmd #" field reflects the IRIX fcntl number. 
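Among the commands defined just below, XFS_IOC_GETBMAPX is the one whose calling convention takes the most explaining, so here is a user-space sketch. It uses struct getbmapx and the BMV_IF_* flags defined earlier in this header: slot 0 of the array is the request header (offset, length, total entry count), and on return bmv_entries says how many extent records follow it. The header location, buffer sizing and helper name are assumptions:

/* Illustrative user-space sketch only, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* assumed install location of this header */

#define EX_NEXTENTS	32

static int example_dump_extents(int fd)
{
	struct getbmapx map[EX_NEXTENTS + 1];	/* slot 0 is the header */
	int i;

	memset(map, 0, sizeof(map));
	map[0].bmv_offset = 0;			/* start of file */
	map[0].bmv_length = -1;			/* map through end of file */
	map[0].bmv_count  = EX_NEXTENTS + 1;	/* entries incl. header */
	map[0].bmv_iflags = BMV_IF_PREALLOC;	/* flag unwritten extents */

	if (ioctl(fd, XFS_IOC_GETBMAPX, map) < 0)
		return -1;

	for (i = 1; i <= map[0].bmv_entries; i++)
		printf("%lld blocks at file block %lld -> disk block %lld\n",
		       (long long)map[i].bmv_length,
		       (long long)map[i].bmv_offset,
		       (long long)map[i].bmv_block);
	return 0;
}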
+ */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) +#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) +#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) +#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) +#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) +#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) +#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64) +#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap) +#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr) +/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ +/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ +#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) +#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) + +/* + * ioctl commands that replace IRIX syssgi()'s + */ +#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1) +#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 102, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq) +#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq) +#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext) +#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data) +#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log) +#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt) +#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts) +#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks) +#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks) +#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) +#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) +/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ +/* XFS_IOC_FREEZE -- FIFREEZE 119 */ +/* XFS_IOC_THAW -- FITHAW 120 */ +#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) +#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) +#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) +#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t) +/* XFS_IOC_GETFSUUID ---------- deprecated 140 */ + + +#ifndef HAVE_BBMACROS +/* + * Block I/O parameterization. A basic block (BB) is the lowest size of + * filesystem allocation, and must equal 512. Length units given to bio + * routines are in BB's. 
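Since the basic-block macros that follow are a frequent source of off-by-512 confusion, the arithmetic is spelled out here. A basic block (BB) is fixed at 512 bytes, so with BBSHIFT = 9 every conversion is a pure shift: one direction rounds a byte count up or truncates it to whole basic blocks, the other multiplies back to bytes. A self-contained restatement, not the kernel macros themselves; the round-up form is the conventional definition and an assumption here:

/* Illustrative restatement of the BB conversions, not part of the patch. */
#include <stdint.h>

#define EX_BBSHIFT	9
#define EX_BBSIZE	(1 << EX_BBSHIFT)	/* 512 bytes per basic block */

static inline uint64_t ex_btobb(uint64_t bytes)	/* round up, like BTOBB */
{
	return (bytes + EX_BBSIZE - 1) >> EX_BBSHIFT;
}

static inline uint64_t ex_btobbt(uint64_t bytes)	/* truncate, like BTOBBT */
{
	return bytes >> EX_BBSHIFT;
}

static inline uint64_t ex_bbtob(uint64_t bbs)	/* back to bytes, like BBTOB */
{
	return bbs << EX_BBSHIFT;
}

/* Example: 1300 bytes span 3 basic blocks rounded up, 2 when truncated. */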
+ */ +#define BBSHIFT 9 +#define BBSIZE (1<> BBSHIFT) +#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT) +#define BBTOB(bbs) ((bbs) << BBSHIFT) +#endif + +#endif /* __XFS_FS_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c new file mode 100644 index 00000000..9153d2c7 --- /dev/null +++ b/fs/xfs/xfs_fsops.c @@ -0,0 +1,671 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_btree.h" +#include "xfs_error.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_fsops.h" +#include "xfs_itable.h" +#include "xfs_trans_space.h" +#include "xfs_rtalloc.h" +#include "xfs_rw.h" +#include "xfs_filestream.h" +#include "xfs_trace.h" + +/* + * File system operations + */ + +int +xfs_fs_geometry( + xfs_mount_t *mp, + xfs_fsop_geom_t *geo, + int new_version) +{ + + memset(geo, 0, sizeof(*geo)); + + geo->blocksize = mp->m_sb.sb_blocksize; + geo->rtextsize = mp->m_sb.sb_rextsize; + geo->agblocks = mp->m_sb.sb_agblocks; + geo->agcount = mp->m_sb.sb_agcount; + geo->logblocks = mp->m_sb.sb_logblocks; + geo->sectsize = mp->m_sb.sb_sectsize; + geo->inodesize = mp->m_sb.sb_inodesize; + geo->imaxpct = mp->m_sb.sb_imax_pct; + geo->datablocks = mp->m_sb.sb_dblocks; + geo->rtblocks = mp->m_sb.sb_rblocks; + geo->rtextents = mp->m_sb.sb_rextents; + geo->logstart = mp->m_sb.sb_logstart; + ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid)); + memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid)); + if (new_version >= 2) { + geo->sunit = mp->m_sb.sb_unit; + geo->swidth = mp->m_sb.sb_width; + } + if (new_version >= 3) { + geo->version = XFS_FSOP_GEOM_VERSION; + geo->flags = + (xfs_sb_version_hasattr(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_ATTR : 0) | + (xfs_sb_version_hasnlink(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_NLINK : 0) | + (xfs_sb_version_hasquota(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | + (xfs_sb_version_hasalign(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | + (xfs_sb_version_hasdalign(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | + (xfs_sb_version_hasshared(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SHARED : 0) | + (xfs_sb_version_hasextflgbit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | + (xfs_sb_version_hasdirv2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | + (xfs_sb_version_hassector(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | + (xfs_sb_version_hasasciici(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) | + (xfs_sb_version_haslazysbcount(&mp->m_sb) ? 
+ XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | + (xfs_sb_version_hasattr2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); + geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? + mp->m_sb.sb_logsectsize : BBSIZE; + geo->rtsectsize = mp->m_sb.sb_blocksize; + geo->dirblocksize = mp->m_dirblksize; + } + if (new_version >= 4) { + geo->flags |= + (xfs_sb_version_haslogv2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_LOGV2 : 0); + geo->logsunit = mp->m_sb.sb_logsunit; + } + return 0; +} + +static int +xfs_growfs_data_private( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_data_t *in) /* growfs data input struct */ +{ + xfs_agf_t *agf; + xfs_agi_t *agi; + xfs_agnumber_t agno; + xfs_extlen_t agsize; + xfs_extlen_t tmpsize; + xfs_alloc_rec_t *arec; + struct xfs_btree_block *block; + xfs_buf_t *bp; + int bucket; + int dpct; + int error; + xfs_agnumber_t nagcount; + xfs_agnumber_t nagimax = 0; + xfs_rfsblock_t nb, nb_mod; + xfs_rfsblock_t new; + xfs_rfsblock_t nfree; + xfs_agnumber_t oagcount; + int pct; + xfs_trans_t *tp; + + nb = in->newblocks; + pct = in->imaxpct; + if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) + return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) + return error; + dpct = pct - mp->m_sb.sb_imax_pct; + bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), + BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + if (!bp) + return EIO; + xfs_buf_relse(bp); + + new = nb; /* use new as a temporary here */ + nb_mod = do_div(new, mp->m_sb.sb_agblocks); + nagcount = new + (nb_mod != 0); + if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { + nagcount--; + nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; + if (nb < mp->m_sb.sb_dblocks) + return XFS_ERROR(EINVAL); + } + new = nb - mp->m_sb.sb_dblocks; + oagcount = mp->m_sb.sb_agcount; + + /* allocate the new per-ag structures */ + if (nagcount > oagcount) { + error = xfs_initialize_perag(mp, nagcount, &nagimax); + if (error) + return error; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); + tp->t_flags |= XFS_TRANS_RESERVE; + if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), + XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { + xfs_trans_cancel(tp, 0); + return error; + } + + /* + * Write new AG headers to disk. Non-transactional, but written + * synchronously so they are completed prior to the growfs transaction + * being logged. 
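The nb/nb_mod/nagcount arithmetic at the top of xfs_growfs_data_private() above is easier to see with numbers: the requested size is divided by the fixed AG size, and a partial AG at the end is kept only if it is at least XFS_MIN_AG_BLOCKS (64 blocks) long, otherwise the grow is trimmed back to whole AGs. An illustrative restatement with worked examples; the helper is not part of the patch:

/*
 * Illustrative only: the effective rule for the new AG count.
 *
 * Example: agblocks = 1000000, newblocks = 2500032
 *	-> 2 whole AGs plus a 500032-block tail (>= 64), so 3 AGs.
 * Example: agblocks = 1000000, newblocks = 2000010
 *	-> the 10-block tail is below XFS_MIN_AG_BLOCKS, so 2 AGs and
 *	   the grow request is trimmed to 2000000 blocks.
 */
#include <stdint.h>

static uint64_t example_new_ag_count(uint64_t newblocks, uint64_t agblocks)
{
	uint64_t nag  = newblocks / agblocks;
	uint64_t tail = newblocks % agblocks;

	if (tail >= 64)			/* XFS_MIN_AG_BLOCKS */
		nag++;
	return nag;
}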
+ */ + nfree = 0; + for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + /* + * AG freelist header block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + agf = XFS_BUF_TO_AGF(bp); + memset(agf, 0, mp->m_sb.sb_sectsize); + agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); + agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); + agf->agf_seqno = cpu_to_be32(agno); + if (agno == nagcount - 1) + agsize = + nb - + (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); + else + agsize = mp->m_sb.sb_agblocks; + agf->agf_length = cpu_to_be32(agsize); + agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); + agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); + agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + agf->agf_flfirst = 0; + agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); + agf->agf_flcount = 0; + tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); + agf->agf_freeblks = cpu_to_be32(tmpsize); + agf->agf_longest = cpu_to_be32(tmpsize); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * AG inode header block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + agi = XFS_BUF_TO_AGI(bp); + memset(agi, 0, mp->m_sb.sb_sectsize); + agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); + agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); + agi->agi_seqno = cpu_to_be32(agno); + agi->agi_length = cpu_to_be32(agsize); + agi->agi_count = 0; + agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); + agi->agi_level = cpu_to_be32(1); + agi->agi_freecount = 0; + agi->agi_newino = cpu_to_be32(NULLAGINO); + agi->agi_dirino = cpu_to_be32(NULLAGINO); + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) + agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * BNO btree root block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); + block = XFS_BUF_TO_BLOCK(bp); + memset(block, 0, mp->m_sb.sb_blocksize); + block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); + block->bb_level = 0; + block->bb_numrecs = cpu_to_be16(1); + block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_blockcount = cpu_to_be32( + agsize - be32_to_cpu(arec->ar_startblock)); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * CNT btree root block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); + block = XFS_BUF_TO_BLOCK(bp); + memset(block, 0, mp->m_sb.sb_blocksize); + block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); + block->bb_level = 0; + block->bb_numrecs = cpu_to_be16(1); + block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_blockcount = cpu_to_be32( + agsize - be32_to_cpu(arec->ar_startblock)); + nfree += be32_to_cpu(arec->ar_blockcount); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * INO btree root block + */ + bp = 
xfs_buf_get(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); + block = XFS_BUF_TO_BLOCK(bp); + memset(block, 0, mp->m_sb.sb_blocksize); + block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); + block->bb_level = 0; + block->bb_numrecs = 0; + block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + } + xfs_trans_agblocks_delta(tp, nfree); + /* + * There are new blocks in the old last a.g. + */ + if (new) { + /* + * Change the agi length. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) { + goto error0; + } + ASSERT(bp); + agi = XFS_BUF_TO_AGI(bp); + be32_add_cpu(&agi->agi_length, new); + ASSERT(nagcount == oagcount || + be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); + xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); + /* + * Change agf length. + */ + error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp); + if (error) { + goto error0; + } + ASSERT(bp); + agf = XFS_BUF_TO_AGF(bp); + be32_add_cpu(&agf->agf_length, new); + ASSERT(be32_to_cpu(agf->agf_length) == + be32_to_cpu(agi->agi_length)); + + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + /* + * Free the new space. + */ + error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, + be32_to_cpu(agf->agf_length) - new), new); + if (error) { + goto error0; + } + } + + /* + * Update changed superblock fields transactionally. These are not + * seen by the rest of the world until the transaction commit applies + * them atomically to the superblock. + */ + if (nagcount > oagcount) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); + if (nb > mp->m_sb.sb_dblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, + nb - mp->m_sb.sb_dblocks); + if (nfree) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); + if (dpct) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + error = xfs_trans_commit(tp, 0); + if (error) + return error; + + /* New allocation groups fully initialized, so update mount struct */ + if (nagimax) + mp->m_maxagi = nagimax; + if (mp->m_sb.sb_imax_pct) { + __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; + do_div(icount, 100); + mp->m_maxicount = icount << mp->m_sb.sb_inopblog; + } else + mp->m_maxicount = 0; + xfs_set_low_space_thresholds(mp); + + /* update secondary superblocks. */ + for (agno = 1; agno < nagcount; agno++) { + error = xfs_read_buf(mp, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) { + xfs_warn(mp, + "error %d reading secondary superblock for ag %d", + error, agno); + break; + } + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); + /* + * If we get an error writing out the alternate superblocks, + * just issue a warning and continue. The real work is + * already done and committed. 
+ */ + if (!(error = xfs_bwrite(mp, bp))) { + continue; + } else { + xfs_warn(mp, + "write error %d updating secondary superblock for ag %d", + error, agno); + break; /* no point in continuing */ + } + } + return 0; + + error0: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; +} + +static int +xfs_growfs_log_private( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_log_t *in) /* growfs log input struct */ +{ + xfs_extlen_t nb; + + nb = in->newblocks; + if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES)) + return XFS_ERROR(EINVAL); + if (nb == mp->m_sb.sb_logblocks && + in->isint == (mp->m_sb.sb_logstart != 0)) + return XFS_ERROR(EINVAL); + /* + * Moving the log is hard, need new interfaces to sync + * the log first, hold off all activity while moving it. + * Can have shorter or longer log in the same space, + * or transform internal to external log or vice versa. + */ + return XFS_ERROR(ENOSYS); +} + +/* + * protected versions of growfs function acquire and release locks on the mount + * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, + * XFS_IOC_FSGROWFSRT + */ + + +int +xfs_growfs_data( + xfs_mount_t *mp, + xfs_growfs_data_t *in) +{ + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + if (!mutex_trylock(&mp->m_growlock)) + return XFS_ERROR(EWOULDBLOCK); + error = xfs_growfs_data_private(mp, in); + mutex_unlock(&mp->m_growlock); + return error; +} + +int +xfs_growfs_log( + xfs_mount_t *mp, + xfs_growfs_log_t *in) +{ + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + if (!mutex_trylock(&mp->m_growlock)) + return XFS_ERROR(EWOULDBLOCK); + error = xfs_growfs_log_private(mp, in); + mutex_unlock(&mp->m_growlock); + return error; +} + +/* + * exported through ioctl XFS_IOC_FSCOUNTS + */ + +int +xfs_fs_counts( + xfs_mount_t *mp, + xfs_fsop_counts_t *cnt) +{ + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + spin_lock(&mp->m_sb_lock); + cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + cnt->freertx = mp->m_sb.sb_frextents; + cnt->freeino = mp->m_sb.sb_ifree; + cnt->allocino = mp->m_sb.sb_icount; + spin_unlock(&mp->m_sb_lock); + return 0; +} + +/* + * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS + * + * xfs_reserve_blocks is called to set m_resblks + * in the in-core mount table. The number of unused reserved blocks + * is kept in m_resblks_avail. + * + * Reserve the requested number of blocks if available. Otherwise return + * as many as possible to satisfy the request. The actual number + * reserved are returned in outval + * + * A null inval pointer indicates that only the current reserved blocks + * available should be returned no settings are changed. + */ + +int +xfs_reserve_blocks( + xfs_mount_t *mp, + __uint64_t *inval, + xfs_fsop_resblks_t *outval) +{ + __int64_t lcounter, delta, fdblks_delta; + __uint64_t request; + + /* If inval is null, report current values and return */ + if (inval == (__uint64_t *)NULL) { + if (!outval) + return EINVAL; + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + return 0; + } + + request = *inval; + + /* + * With per-cpu counters, this becomes an interesting + * problem. we needto work out if we are freeing or allocation + * blocks first, then we can do the modification as necessary. + * + * We do this under the m_sb_lock so that if we are near + * ENOSPC, we will hold out any changes while we work out + * what to do. 
This means that the amount of free space can + * change while we do this, so we need to retry if we end up + * trying to reserve more space than is available. + * + * We also use the xfs_mod_incore_sb() interface so that we + * don't have to care about whether per cpu counter are + * enabled, disabled or even compiled in.... + */ +retry: + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, 0); + + /* + * If our previous reservation was larger than the current value, + * then move any unused blocks back to the free pool. + */ + fdblks_delta = 0; + if (mp->m_resblks > request) { + lcounter = mp->m_resblks_avail - request; + if (lcounter > 0) { /* release unused blocks */ + fdblks_delta = lcounter; + mp->m_resblks_avail -= lcounter; + } + mp->m_resblks = request; + } else { + __int64_t free; + + free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + if (!free) + goto out; /* ENOSPC and fdblks_delta = 0 */ + + delta = request - mp->m_resblks; + lcounter = free - delta; + if (lcounter < 0) { + /* We can't satisfy the request, just get what we can */ + mp->m_resblks += free; + mp->m_resblks_avail += free; + fdblks_delta = -free; + } else { + fdblks_delta = -delta; + mp->m_resblks = request; + mp->m_resblks_avail += delta; + } + } +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + spin_unlock(&mp->m_sb_lock); + + if (fdblks_delta) { + /* + * If we are putting blocks back here, m_resblks_avail is + * already at its max so this will put it in the free pool. + * + * If we need space, we'll either succeed in getting it + * from the free block count or we'll get an enospc. If + * we get a ENOSPC, it means things changed while we were + * calculating fdblks_delta and so we should try again to + * see if there is anything left to reserve. + * + * Don't set the reserved flag here - we don't want to reserve + * the extra reserve blocks from the reserve..... + */ + int error; + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + fdblks_delta, 0); + if (error == ENOSPC) + goto retry; + } + return 0; +} + +/* + * Dump a transaction into the log that contains no real change. This is needed + * to be able to make the log dirty or stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty state back up + * into the VFS and then periodic inode flushing will prevent log covering from + * making progress. Hence we log a field in the superblock instead and use a + * synchronous transaction to ensure the superblock is immediately unpinned + * and can be written back. 
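+ *
+ * Illustrative caller sketch (an assumption for illustration, not part
+ * of this file): the periodic sync code is expected to pair this with a
+ * "does the log need covering" check, roughly
+ *
+ *     if (xfs_log_need_covered(mp))
+ *             error = xfs_fs_log_dummy(mp);
+ *
+ * where xfs_log_need_covered() is assumed here purely as an example.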
+ */ +int +xfs_fs_log_dummy( + xfs_mount_t *mp) +{ + xfs_trans_t *tp; + int error; + + tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + /* log the UUID because it is an unchanging field */ + xfs_mod_sb(tp, XFS_SB_UUID); + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); +} + +int +xfs_fs_goingdown( + xfs_mount_t *mp, + __uint32_t inflags) +{ + switch (inflags) { + case XFS_FSOP_GOING_FLAGS_DEFAULT: { + struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); + + if (sb && !IS_ERR(sb)) { + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + thaw_bdev(sb->s_bdev, sb); + } + + break; + } + case XFS_FSOP_GOING_FLAGS_LOGFLUSH: + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + break; + case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH: + xfs_force_shutdown(mp, + SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR); + break; + default: + return XFS_ERROR(EINVAL); + } + + return 0; +} diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h new file mode 100644 index 00000000..1b6a98b6 --- /dev/null +++ b/fs/xfs/xfs_fsops.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FSOPS_H__ +#define __XFS_FSOPS_H__ + +extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion); +extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); +extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); +extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); +extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, + xfs_fsop_resblks_t *outval); +extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); + +#endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c new file mode 100644 index 00000000..84ebeec1 --- /dev/null +++ b/fs/xfs/xfs_ialloc.c @@ -0,0 +1,1563 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_bmap.h" + + +/* + * Allocation group level functions. + */ +static inline int +xfs_ialloc_cluster_alignment( + xfs_alloc_arg_t *args) +{ + if (xfs_sb_version_hasalign(&args->mp->m_sb) && + args->mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) + return args->mp->m_sb.sb_inoalignmt; + return 1; +} + +/* + * Lookup a record by ino in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + xfs_lookup_t dir, /* <=, >=, == */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = 0; + cur->bc_rec.i.ir_free = 0; + return xfs_btree_lookup(cur, dir, stat); +} + +/* + * Update the record referred to by cur to the value given. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_inobt_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec) /* btree record */ +{ + union xfs_btree_rec rec; + + rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); + rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + rec.inobt.ir_free = cpu_to_be64(irec->ir_free); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec, /* btree record */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + } + return error; +} + +/* + * Verify that the number of free inodes in the AGI is correct. + */ +#ifdef DEBUG +STATIC int +xfs_check_agi_freecount( + struct xfs_btree_cur *cur, + struct xfs_agi *agi) +{ + if (cur->bc_nlevels == 1) { + xfs_inobt_rec_incore_t rec; + int freecount = 0; + int error; + int i; + + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + + do { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + + if (i) { + freecount += rec.ir_freecount; + error = xfs_btree_increment(cur, 0, &i); + if (error) + return error; + } + } while (i == 1); + + if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) + ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + } + return 0; +} +#else +#define xfs_check_agi_freecount(cur, agi) 0 +#endif + +/* + * Initialise a new set of inodes. 
+ */ +STATIC void +xfs_ialloc_inode_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_agblock_t length, + unsigned int gen) +{ + struct xfs_buf *fbuf; + struct xfs_dinode *free; + int blks_per_cluster, nbufs, ninodes; + int version; + int i, j; + xfs_daddr_t d; + + /* + * Loop over the new block(s), filling in the inodes. + * For small block sizes, manipulate the inodes in buffers + * which are multiples of the blocks size. + */ + if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { + blks_per_cluster = 1; + nbufs = length; + ninodes = mp->m_sb.sb_inopblock; + } else { + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / + mp->m_sb.sb_blocksize; + nbufs = length / blks_per_cluster; + ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; + } + + /* + * Figure out what version number to use in the inodes we create. + * If the superblock version has caught up to the one that supports + * the new inode format, then use the new inode version. Otherwise + * use the old version so that old kernels will continue to be + * able to use the file system. + */ + if (xfs_sb_version_hasnlink(&mp->m_sb)) + version = 2; + else + version = 1; + + for (j = 0; j < nbufs; j++) { + /* + * Get the block. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); + fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * blks_per_cluster, + XBF_LOCK); + ASSERT(fbuf); + ASSERT(!XFS_BUF_GETERROR(fbuf)); + + /* + * Initialize all inodes in this buffer and then log them. + * + * XXX: It would be much better if we had just one transaction + * to log a whole cluster of inodes instead of all the + * individual transactions causing a lot of log traffic. + */ + xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); + for (i = 0; i < ninodes; i++) { + int ioffset = i << mp->m_sb.sb_inodelog; + uint isize = sizeof(struct xfs_dinode); + + free = xfs_make_iptr(mp, fbuf, i); + free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + free->di_version = version; + free->di_gen = cpu_to_be32(gen); + free->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); + } + xfs_trans_inode_alloc_buf(tp, fbuf); + } +} + +/* + * Allocate new inodes in the allocation group specified by agbp. + * Return 0 for success, else error code. + */ +STATIC int /* error code or 0 */ +xfs_ialloc_ag_alloc( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* alloc group buffer */ + int *alloc) +{ + xfs_agi_t *agi; /* allocation group header */ + xfs_alloc_arg_t args; /* allocation argument structure */ + xfs_btree_cur_t *cur; /* inode btree cursor */ + xfs_agnumber_t agno; + int error; + int i; + xfs_agino_t newino; /* new first inode's number */ + xfs_agino_t newlen; /* new number of inodes */ + xfs_agino_t thisino; /* current inode number, for loop */ + int isaligned = 0; /* inode allocation at stripe unit */ + /* boundary */ + struct xfs_perag *pag; + + args.tp = tp; + args.mp = tp->t_mountp; + + /* + * Locking will ensure that we don't have two callers in here + * at one time. + */ + newlen = XFS_IALLOC_INODES(args.mp); + if (args.mp->m_maxicount && + args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) + return XFS_ERROR(ENOSPC); + args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); + /* + * First try to allocate inodes contiguous with the last-allocated + * chunk of inodes. If the filesystem is striped, this will fill + * an entire stripe unit with inodes. 
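+ *
+ * Descriptive note: the target computed below is the block immediately
+ * following the previously allocated chunk,
+ *
+ *     args.agbno = XFS_AGINO_TO_AGBNO(mp, newino) + XFS_IALLOC_BLOCKS(mp)
+ *
+ * and it is attempted as an exact XFS_ALLOCTYPE_THIS_BNO allocation, so
+ * a miss here simply falls through to the XFS_ALLOCTYPE_NEAR_BNO path
+ * further down.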
+ */ + agi = XFS_BUF_TO_AGI(agbp); + newino = be32_to_cpu(agi->agi_newino); + agno = be32_to_cpu(agi->agi_seqno); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + XFS_IALLOC_BLOCKS(args.mp); + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.mod = args.total = args.wasdel = args.isfl = + args.userdata = args.minalignslop = 0; + args.prod = 1; + + /* + * We need to take into account alignment here to ensure that + * we don't modify the free list if we fail to have an exact + * block. If we don't have an exact match, and every oher + * attempt allocation attempt fails, we'll end up cancelling + * a dirty transaction and shutting down. + * + * For an exact allocation, alignment must be 1, + * however we need to take cluster alignment into account when + * fixing up the freelist. Use the minalignslop field to + * indicate that extra blocks might be required for alignment, + * but not to use them in the actual exact allocation. + */ + args.alignment = 1; + args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; + + /* Allow space for the inode btree to split. */ + args.minleft = args.mp->m_in_maxlevels - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } else + args.fsbno = NULLFSBLOCK; + + if (unlikely(args.fsbno == NULLFSBLOCK)) { + /* + * Set the alignment for the allocation. + * If stripe alignment is turned on then align at stripe unit + * boundary. + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size + * pieces, so don't need alignment anyway. + */ + isaligned = 0; + if (args.mp->m_sinoalign) { + ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); + args.alignment = args.mp->m_dalign; + isaligned = 1; + } else + args.alignment = xfs_ialloc_cluster_alignment(&args); + /* + * Need to figure out where to allocate the inode blocks. + * Ideally they should be spaced out through the a.g. + * For now, just allocate blocks up front. + */ + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + /* + * Allocate a fixed-size extent of inodes. + */ + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.mod = args.total = args.wasdel = args.isfl = + args.userdata = args.minalignslop = 0; + args.prod = 1; + /* + * Allow space for the inode btree to split. + */ + args.minleft = args.mp->m_in_maxlevels - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + + /* + * If stripe alignment is turned on, then try again with cluster + * alignment. + */ + if (isaligned && args.fsbno == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = xfs_ialloc_cluster_alignment(&args); + if ((error = xfs_alloc_vextent(&args))) + return error; + } + + if (args.fsbno == NULLFSBLOCK) { + *alloc = 0; + return 0; + } + ASSERT(args.len == args.minlen); + + /* + * Stamp and write the inode buffers. + * + * Seed the new inode cluster with a random generation number. This + * prevents short-term reuse of generation numbers if a chunk is + * freed and then immediately reallocated. We use random numbers + * rather than a linear progression to prevent the next generation + * number from being easily guessable. + */ + xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, + random32()); + + /* + * Convert the results. 
+ */ + newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + be32_add_cpu(&agi->agi_count, newlen); + be32_add_cpu(&agi->agi_freecount, newlen); + pag = xfs_perag_get(args.mp, agno); + pag->pagi_freecount += newlen; + xfs_perag_put(pag); + agi->agi_newino = cpu_to_be32(newino); + + /* + * Insert records describing the new inode chunk into the btree. + */ + cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno); + for (thisino = newino; + thisino < newino + newlen; + thisino += XFS_INODES_PER_CHUNK) { + cur->bc_rec.i.ir_startino = thisino; + cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; + cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; + error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 0); + error = xfs_btree_insert(cur, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 1); + } + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + /* + * Log allocation group header fields + */ + xfs_ialloc_log_agi(tp, agbp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO); + /* + * Modify/log superblock values for inode count and inode free count. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen); + *alloc = 1; + return 0; +} + +STATIC xfs_agnumber_t +xfs_ialloc_next_ag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + + spin_lock(&mp->m_agirotor_lock); + agno = mp->m_agirotor; + if (++mp->m_agirotor == mp->m_maxagi) + mp->m_agirotor = 0; + spin_unlock(&mp->m_agirotor_lock); + + return agno; +} + +/* + * Select an allocation group to look for a free inode in, based on the parent + * inode and then mode. Return the allocation group buffer. + */ +STATIC xfs_buf_t * /* allocation group buffer */ +xfs_ialloc_ag_select( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent directory inode number */ + mode_t mode, /* bits set to indicate file type */ + int okalloc) /* ok to allocate more space */ +{ + xfs_buf_t *agbp; /* allocation group header buffer */ + xfs_agnumber_t agcount; /* number of ag's in the filesystem */ + xfs_agnumber_t agno; /* current ag number */ + int flags; /* alloc buffer locking flags */ + xfs_extlen_t ineed; /* blocks needed for inode allocation */ + xfs_extlen_t longest = 0; /* longest extent available */ + xfs_mount_t *mp; /* mount point structure */ + int needspace; /* file mode implies space allocated */ + xfs_perag_t *pag; /* per allocation group data */ + xfs_agnumber_t pagno; /* parent (starting) ag number */ + + /* + * Files of these types need at least one block if length > 0 + * (and they won't fit in the inode, but that's hard to figure out). + */ + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); + mp = tp->t_mountp; + agcount = mp->m_maxagi; + if (S_ISDIR(mode)) + pagno = xfs_ialloc_next_ag(mp); + else { + pagno = XFS_INO_TO_AGNO(mp, parent); + if (pagno >= agcount) + pagno = 0; + } + ASSERT(pagno < agcount); + /* + * Loop through allocation groups, looking for one with a little + * free space in it. Note we don't look for free inodes, exactly. + * Instead, we include whether there is a need to allocate inodes + * to mean that blocks must be allocated for them, + * if none are currently free. 
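+ *
+ * Descriptive note: the scan below makes at most two passes.  The first
+ * pass uses XFS_ALLOC_FLAG_TRYLOCK so that a busy AGF is skipped rather
+ * than waited on; if the scan wraps back around to the starting AG,
+ * flags is cleared and a second, blocking pass is made before giving up.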
+ */ + agno = pagno; + flags = XFS_ALLOC_FLAG_TRYLOCK; + for (;;) { + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_init) { + if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { + agbp = NULL; + goto nextag; + } + } else + agbp = NULL; + + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto unlock_nextag; + } + + /* + * Is there enough free space for the file plus a block + * of inodes (if we need to allocate some)? + */ + ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp); + if (ineed && !pag->pagf_init) { + if (agbp == NULL && + xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { + agbp = NULL; + goto nextag; + } + (void)xfs_alloc_pagf_init(mp, tp, agno, flags); + } + if (!ineed || pag->pagf_init) { + if (ineed && !(longest = pag->pagf_longest)) + longest = pag->pagf_flcount > 0; + if (!ineed || + (pag->pagf_freeblks >= needspace + ineed && + longest >= ineed && + okalloc)) { + if (agbp == NULL && + xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { + agbp = NULL; + goto nextag; + } + xfs_perag_put(pag); + return agbp; + } + } +unlock_nextag: + if (agbp) + xfs_trans_brelse(tp, agbp); +nextag: + xfs_perag_put(pag); + /* + * No point in iterating over the rest, if we're shutting + * down. + */ + if (XFS_FORCED_SHUTDOWN(mp)) + return NULL; + agno++; + if (agno >= agcount) + agno = 0; + if (agno == pagno) { + if (flags == 0) + return NULL; + flags = 0; + } + } +} + +/* + * Try to retrieve the next record to the left/right from the current one. + */ +STATIC int +xfs_ialloc_next_rec( + struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + if (left) + error = xfs_btree_decrement(cur, 0, &i); + else + error = xfs_btree_increment(cur, 0, &i); + + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + + return 0; +} + +STATIC int +xfs_ialloc_get_rec( + struct xfs_btree_cur *cur, + xfs_agino_t agino, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + + return 0; +} + +/* + * Visible inode allocation functions. + */ + +/* + * Allocate an inode on disk. + * Mode is used to tell whether the new inode will need space, and whether + * it is a directory. + * + * The arguments IO_agbp and alloc_done are defined to work within + * the constraint of one allocation per transaction. + * xfs_dialloc() is designed to be called twice if it has to do an + * allocation to make more free inodes. On the first call, + * IO_agbp should be set to NULL. If an inode is available, + * i.e., xfs_dialloc() did not need to do an allocation, an inode + * number is returned. In this case, IO_agbp would be set to the + * current ag_buf and alloc_done set to false. + * If an allocation needed to be done, xfs_dialloc would return + * the current ag_buf in IO_agbp and set alloc_done to true. + * The caller should then commit the current transaction, allocate a new + * transaction, and call xfs_dialloc() again, passing in the previous + * value of IO_agbp. IO_agbp should be held across the transactions. + * Since the agbp is locked across the two calls, the second call is + * guaranteed to have a free inode available. 
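+ *
+ * Illustrative caller sketch (hedged pseudo-code, not a real caller in
+ * this file; error handling omitted, "done" stands for alloc_done):
+ *
+ *     agbp = NULL;
+ *     error = xfs_dialloc(tp, parent, mode, okalloc, &agbp, &done, &ino);
+ *     if (!error && done) {
+ *             ... commit tp, allocate a new transaction ntp ...
+ *             error = xfs_dialloc(ntp, parent, mode, okalloc, &agbp,
+ *                                 &done, &ino);
+ *     }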
+ * + * Once we successfully pick an inode its number is returned and the + * on-disk data structures are updated. The inode itself is not read + * in, since doing so would break ordering constraints with xfs_reclaim. + */ +int +xfs_dialloc( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent inode (directory) */ + mode_t mode, /* mode bits for new inode */ + int okalloc, /* ok to allocate more space */ + xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ + boolean_t *alloc_done, /* true if we needed to replenish + inode freelist */ + xfs_ino_t *inop) /* inode number allocated */ +{ + xfs_agnumber_t agcount; /* number of allocation groups */ + xfs_buf_t *agbp; /* allocation group header's buffer */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agi_t *agi; /* allocation group header structure */ + xfs_btree_cur_t *cur; /* inode allocation btree cursor */ + int error; /* error return value */ + int i; /* result code */ + int ialloced; /* inode allocation status */ + int noroom = 0; /* no space for inode blk allocation */ + xfs_ino_t ino; /* fs-relative inode to be returned */ + /* REFERENCED */ + int j; /* result code */ + xfs_mount_t *mp; /* file system mount structure */ + int offset; /* index of inode in chunk */ + xfs_agino_t pagino; /* parent's AG relative inode # */ + xfs_agnumber_t pagno; /* parent's AG number */ + xfs_inobt_rec_incore_t rec; /* inode allocation record */ + xfs_agnumber_t tagno; /* testing allocation group number */ + xfs_btree_cur_t *tcur; /* temp cursor */ + xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ + struct xfs_perag *pag; + + + if (*IO_agbp == NULL) { + /* + * We do not have an agbp, so select an initial allocation + * group for inode allocation. + */ + agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + /* + * Couldn't find an allocation group satisfying the + * criteria, give up. + */ + if (!agbp) { + *inop = NULLFSINO; + return 0; + } + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + } else { + /* + * Continue where we left off before. In this case, we + * know that the allocation group has free inodes. + */ + agbp = *IO_agbp; + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(be32_to_cpu(agi->agi_freecount) > 0); + } + mp = tp->t_mountp; + agcount = mp->m_sb.sb_agcount; + agno = be32_to_cpu(agi->agi_seqno); + tagno = agno; + pagno = XFS_INO_TO_AGNO(mp, parent); + pagino = XFS_INO_TO_AGINO(mp, parent); + + /* + * If we have already hit the ceiling of inode blocks then clear + * okalloc so we scan all available agi structures for a free + * inode. + */ + + if (mp->m_maxicount && + mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + noroom = 1; + okalloc = 0; + } + + /* + * Loop until we find an allocation group that either has free inodes + * or in which we can allocate some inodes. Iterate through the + * allocation groups upward, wrapping at the end. + */ + *alloc_done = B_FALSE; + while (!agi->agi_freecount) { + /* + * Don't do anything if we're not supposed to allocate + * any blocks, just go on to the next ag. + */ + if (okalloc) { + /* + * Try to allocate some new inodes in the allocation + * group. 
+ */ + if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) { + xfs_trans_brelse(tp, agbp); + if (error == ENOSPC) { + *inop = NULLFSINO; + return 0; + } else + return error; + } + if (ialloced) { + /* + * We successfully allocated some inodes, return + * the current context to the caller so that it + * can commit the current transaction and call + * us again where we left off. + */ + ASSERT(be32_to_cpu(agi->agi_freecount) > 0); + *alloc_done = B_TRUE; + *IO_agbp = agbp; + *inop = NULLFSINO; + return 0; + } + } + /* + * If it failed, give up on this ag. + */ + xfs_trans_brelse(tp, agbp); + /* + * Go on to the next ag: get its ag header. + */ +nextag: + if (++tagno == agcount) + tagno = 0; + if (tagno == agno) { + *inop = NULLFSINO; + return noroom ? ENOSPC : 0; + } + pag = xfs_perag_get(mp, tagno); + if (pag->pagi_inodeok == 0) { + xfs_perag_put(pag); + goto nextag; + } + error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); + xfs_perag_put(pag); + if (error) + goto nextag; + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + } + /* + * Here with an allocation group that has a free inode. + * Reset agno since we may have chosen a new ag in the + * loop above. + */ + agno = tagno; + *IO_agbp = NULL; + pag = xfs_perag_get(mp, agno); + + restart_pagno: + cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); + /* + * If pagino is 0 (this is the root inode allocation) use newino. + * This must work because we've just allocated some. + */ + if (!pagino) + pagino = be32_to_cpu(agi->agi_newino); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + /* + * If in the same AG as the parent, try to get near the parent. + */ + if (pagno == agno) { + int doneleft; /* done, to the left */ + int doneright; /* done, to the right */ + int searchdistance = 10; + + error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_inobt_get_rec(cur, &rec, &j); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + if (rec.ir_freecount > 0) { + /* + * Found a free inode in the same chunk + * as the parent, done. + */ + goto alloc_inode; + } + + + /* + * In the same AG as parent, but parent's chunk is full. + */ + + /* duplicate the cursor, search left & right simultaneously */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * Skip to last blocks looked up if same parent inode. + */ + if (pagino != NULLAGINO && + pag->pagl_pagino == pagino && + pag->pagl_leftrec != NULLAGINO && + pag->pagl_rightrec != NULLAGINO) { + error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, + &trec, &doneleft, 1); + if (error) + goto error1; + + error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, + &rec, &doneright, 0); + if (error) + goto error1; + } else { + /* search left with tcur, back up 1 record */ + error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); + if (error) + goto error1; + + /* search right with cur, go forward 1 record. */ + error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); + if (error) + goto error1; + } + + /* + * Loop until we find an inode chunk with a free inode. 
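+ *
+ * Descriptive note: cur walks right from the parent's chunk while the
+ * duplicated cursor tcur walks left; whichever side yields the closer
+ * chunk with free inodes wins.  After searchdistance steps without a
+ * hit, the current positions are cached in pagl_leftrec/pagl_rightrec
+ * and the code falls back to the newino/whole-AG search below.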
+ */ + while (!doneleft || !doneright) { + int useleft; /* using left inode chunk this time */ + + if (!--searchdistance) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto newino; + } + + /* figure out the closer block if both are valid. */ + if (!doneleft && !doneright) { + useleft = pagino - + (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) < + rec.ir_startino - pagino; + } else { + useleft = !doneleft; + } + + /* free inodes to the left? */ + if (useleft && trec.ir_freecount) { + rec = trec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = tcur; + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* free inodes to the right? */ + if (!useleft && rec.ir_freecount) { + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* get next record to check */ + if (useleft) { + error = xfs_ialloc_next_rec(tcur, &trec, + &doneleft, 1); + } else { + error = xfs_ialloc_next_rec(cur, &rec, + &doneright, 0); + } + if (error) + goto error1; + } + + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; + } + + /* + * In a different AG from the parent. + * See if the most recently allocated block has any free. + */ +newino: + if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); + if (error) + goto error0; + + if (i == 1) { + error = xfs_inobt_get_rec(cur, &rec, &j); + if (error) + goto error0; + + if (j == 1 && rec.ir_freecount > 0) { + /* + * The last chunk allocated in the group + * still has a free inode. 
+ */ + goto alloc_inode; + } + } + } + + /* + * None left in the last group, search the whole AG + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + for (;;) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (rec.ir_freecount > 0) + break; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + +alloc_inode: + offset = xfs_ialloc_find_free(&rec.ir_free); + ASSERT(offset >= 0); + ASSERT(offset < XFS_INODES_PER_CHUNK); + ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + error = xfs_inobt_update(cur, &rec); + if (error) + goto error0; + be32_add_cpu(&agi->agi_freecount, -1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag->pagi_freecount--; + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + xfs_perag_put(pag); + *inop = ino; + return 0; +error1: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); + return error; +} + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int +xfs_difree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + xfs_bmap_free_t *flist, /* extents to free */ + int *delete, /* set if inode cluster was deleted */ + xfs_ino_t *first_ino) /* first inode in deleted cluster */ +{ + /* REFERENCED */ + xfs_agblock_t agbno; /* block number containing inode */ + xfs_buf_t *agbp; /* buffer containing allocation group header */ + xfs_agino_t agino; /* inode number relative to allocation group */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agi_t *agi; /* allocation group header */ + xfs_btree_cur_t *cur; /* inode btree cursor */ + int error; /* error return value */ + int i; /* result code */ + int ilen; /* inodes in an inode cluster */ + xfs_mount_t *mp; /* mount structure for filesystem */ + int off; /* offset of inode in inode chunk */ + xfs_inobt_rec_incore_t rec; /* btree record */ + struct xfs_perag *pag; + + mp = tp->t_mountp; + + /* + * Break up inode number into its components. + */ + agno = XFS_INO_TO_AGNO(mp, inode); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", + __func__, agno, mp->m_sb.sb_agcount); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agino = XFS_INO_TO_AGINO(mp, inode); + if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + __func__, (unsigned long long)inode, + (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", + __func__, agbno, mp->m_sb.sb_agblocks); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + /* + * Get the allocation group header. 
+ */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", + __func__, error); + return error; + } + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(agbno < be32_to_cpu(agi->agi_length)); + /* + * Initialize the cursor. + */ + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + /* + * Look for the entry describing this inode. + */ + if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { + xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.", + __func__, error); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) { + xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", + __func__, error); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Get the offset in the inode chunk. + */ + off = agino - rec.ir_startino; + ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); + ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off))); + /* + * Mark the inode free & increment the count. + */ + rec.ir_free |= XFS_INOBT_MASK(off); + rec.ir_freecount++; + + /* + * When an inode cluster is free, it becomes eligible for removal + */ + if (!(mp->m_flags & XFS_MOUNT_IKEEP) && + (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { + + *delete = 1; + *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + + /* + * Remove the inode cluster from the AGI B+Tree, adjust the + * AGI and Superblock inode counts, and mark the disk space + * to be freed when the transaction is committed. + */ + ilen = XFS_IALLOC_INODES(mp); + be32_add_cpu(&agi->agi_count, -ilen); + be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount -= ilen - 1; + xfs_perag_put(pag); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); + + if ((error = xfs_btree_delete(cur, &i))) { + xfs_warn(mp, "%s: xfs_btree_delete returned error %d.", + __func__, error); + goto error0; + } + + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, + agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), + XFS_IALLOC_BLOCKS(mp), flist, mp); + } else { + *delete = 0; + + error = xfs_inobt_update(cur, &rec); + if (error) { + xfs_warn(mp, "%s: xfs_inobt_update returned error %d.", + __func__, error); + goto error0; + } + + /* + * Change the inode free counts and log the ag/sb changes. 
+ */ + be32_add_cpu(&agi->agi_freecount, 1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount++; + xfs_perag_put(pag); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); + } + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +STATIC int +xfs_imap_lookup( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + xfs_agblock_t agbno, + xfs_agblock_t *chunk_agbno, + xfs_agblock_t *offset_agbno, + int flags) +{ + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + int error; + int i; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_alert(mp, + "%s: xfs_ialloc_read_agi() returned error %d, agno %d", + __func__, error, agno); + return error; + } + + /* + * Lookup the inode record for the given agino. If the record cannot be + * found, then it's an invalid inode number and we should abort. Once + * we have a record, we need to ensure it contains the inode number + * we are looking up. + */ + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); + if (!error) { + if (i) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (!error && i == 0) + error = EINVAL; + } + + xfs_trans_brelse(tp, agbp); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + if (error) + return error; + + /* check that the returned record contains the required inode */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + return EINVAL; + + /* for untrusted inodes check it is allocated first */ + if ((flags & XFS_IGET_UNTRUSTED) && + (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) + return EINVAL; + + *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); + *offset_agbno = agbno - *chunk_agbno; + return 0; +} + +/* + * Return the location of the inode in imap, for mapping it into a buffer. + */ +int +xfs_imap( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags) /* flags for inode btree lookup */ +{ + xfs_agblock_t agbno; /* block number of inode in the alloc group */ + xfs_agino_t agino; /* inode number within alloc group */ + xfs_agnumber_t agno; /* allocation group number */ + int blks_per_cluster; /* num blocks per inode cluster */ + xfs_agblock_t chunk_agbno; /* first block in inode chunk */ + xfs_agblock_t cluster_agbno; /* first block in inode cluster */ + int error; /* error code */ + int offset; /* index of inode in its buffer */ + int offset_agbno; /* blks from chunk start to inode */ + + ASSERT(ino != NULLFSINO); + + /* + * Split up the inode number into its parts. + */ + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || + ino != XFS_AGINO_TO_INO(mp, agno, agino)) { +#ifdef DEBUG + /* + * Don't output diagnostic information for untrusted inodes + * as they can be invalid without implying corruption. 
+ */ + if (flags & XFS_IGET_UNTRUSTED) + return XFS_ERROR(EINVAL); + if (agno >= mp->m_sb.sb_agcount) { + xfs_alert(mp, + "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", + __func__, agno, mp->m_sb.sb_agcount); + } + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_alert(mp, + "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", + __func__, (unsigned long long)agbno, + (unsigned long)mp->m_sb.sb_agblocks); + } + if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_alert(mp, + "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", + __func__, ino, + XFS_AGINO_TO_INO(mp, agno, agino)); + } + xfs_stack_trace(); +#endif /* DEBUG */ + return XFS_ERROR(EINVAL); + } + + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + + /* + * For bulkstat and handle lookups, we have an untrusted inode number + * that we have to verify is valid. We cannot do this just by reading + * the inode buffer as it may have been unlinked and removed leaving + * inodes in stale state on disk. Hence we have to do a btree lookup + * in all cases where an untrusted inode number is passed. + */ + if (flags & XFS_IGET_UNTRUSTED) { + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); + if (error) + return error; + goto out_map; + } + + /* + * If the inode cluster size is the same as the blocksize or + * smaller we get to the buffer by simple arithmetics. + */ + if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { + offset = XFS_INO_TO_OFFSET(mp, ino); + ASSERT(offset < mp->m_sb.sb_inopblock); + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap->im_len = XFS_FSB_TO_BB(mp, 1); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + return 0; + } + + /* + * If the inode chunks are aligned then use simple maths to + * find the location. Otherwise we have to do a btree + * lookup to find the location. + */ + if (mp->m_inoalign_mask) { + offset_agbno = agbno & mp->m_inoalign_mask; + chunk_agbno = agbno - offset_agbno; + } else { + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); + if (error) + return error; + } + +out_map: + ASSERT(agbno >= chunk_agbno); + cluster_agbno = chunk_agbno + + ((offset_agbno / blks_per_cluster) * blks_per_cluster); + offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + + XFS_INO_TO_OFFSET(mp, ino); + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); + imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + + /* + * If the inode number maps to a block outside the bounds + * of the file system then return NULL rather than calling + * read_buf and panicing when we get an error from the + * driver. + */ + if ((imap->im_blkno + imap->im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { + xfs_alert(mp, + "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)", + __func__, (unsigned long long) imap->im_blkno, + (unsigned long long) imap->im_len, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); + return XFS_ERROR(EINVAL); + } + return 0; +} + +/* + * Compute and fill in value of m_in_maxlevels. 
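+ *
+ * Worked example (hypothetical values, for illustration only): with
+ * maxleafents = 4096, minleafrecs = 16 and minnoderecs = 16 the code
+ * below computes
+ *
+ *     maxblocks = ceil(4096 / 16) = 256   (leaf level)
+ *     256 -> ceil(256 / 16) = 16          (level 2)
+ *     16  -> ceil(16 / 16)  = 1           (level 3)
+ *
+ * so the loop exits with m_in_maxlevels = 3.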
+ */ +void +xfs_ialloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> + XFS_INODES_PER_CHUNK_LOG; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_in_maxlevels = level; +} + +/* + * Log specified fields for the ag hdr (inode section) + */ +void +xfs_ialloc_log_agi( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* allocation group header buffer */ + int fields) /* bitmask of fields to log */ +{ + int first; /* first byte number */ + int last; /* last byte number */ + static const short offsets[] = { /* field starting offsets */ + /* keep in sync with bit definitions */ + offsetof(xfs_agi_t, agi_magicnum), + offsetof(xfs_agi_t, agi_versionnum), + offsetof(xfs_agi_t, agi_seqno), + offsetof(xfs_agi_t, agi_length), + offsetof(xfs_agi_t, agi_count), + offsetof(xfs_agi_t, agi_root), + offsetof(xfs_agi_t, agi_level), + offsetof(xfs_agi_t, agi_freecount), + offsetof(xfs_agi_t, agi_newino), + offsetof(xfs_agi_t, agi_dirino), + offsetof(xfs_agi_t, agi_unlinked), + sizeof(xfs_agi_t) + }; +#ifdef DEBUG + xfs_agi_t *agi; /* allocation group header */ + + agi = XFS_BUF_TO_AGI(bp); + ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); +#endif + /* + * Compute byte offsets for the first and last fields. + */ + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); + /* + * Log the allocation group inode header buffer. + */ + xfs_trans_log_buf(tp, bp, first, last); +} + +#ifdef DEBUG +STATIC void +xfs_check_agi_unlinked( + struct xfs_agi *agi) +{ + int i; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ASSERT(agi->agi_unlinked[i]); +} +#else +#define xfs_check_agi_unlinked(agi) +#endif + +/* + * Read in the allocation group header (inode allocation section) + */ +int +xfs_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ +{ + struct xfs_agi *agi; /* allocation group header */ + int agi_ok; /* agi is consistent */ + int error; + + ASSERT(agno != NULLAGNUMBER); + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, bpp); + if (error) + return error; + + ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); + agi = XFS_BUF_TO_AGI(*bpp); + + /* + * Validate the magic number of the agi block. 
+ */ + agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && + be32_to_cpu(agi->agi_seqno) == agno; + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, + mp, agi); + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EFSCORRUPTED); + } + + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); + + xfs_check_agi_unlinked(agi); + return 0; +} + +int +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ +{ + struct xfs_agi *agi; /* allocation group header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + error = xfs_read_agi(mp, tp, agno, bpp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(*bpp); + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_init) { + pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); + pag->pagi_count = be32_to_cpu(agi->agi_count); + pag->pagi_init = 1; + } + + /* + * It's possible for these to be out of sync if + * we are in the middle of a forced shutdown. + */ + ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || + XFS_FORCED_SHUTDOWN(mp)); + xfs_perag_put(pag); + return 0; +} + +/* + * Read in the agi to initialise the per-ag data in the mount structure + */ +int +xfs_ialloc_pagi_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno) /* allocation group number */ +{ + xfs_buf_t *bp = NULL; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h new file mode 100644 index 00000000..bb538547 --- /dev/null +++ b/fs/xfs/xfs_ialloc.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IALLOC_H__ +#define __XFS_IALLOC_H__ + +struct xfs_buf; +struct xfs_dinode; +struct xfs_imap; +struct xfs_mount; +struct xfs_trans; + +/* + * Allocation parameters for inode allocation. + */ +#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos +#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks + +/* + * Move inodes in clusters of this size. + */ +#define XFS_INODE_BIG_CLUSTER_SIZE 8192 +#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size + +/* + * Make an inode pointer out of the buffer/offset. + */ +static inline struct xfs_dinode * +xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) +{ + return (xfs_dinode_t *) + (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); +} + +/* + * Find a free (set) bit in the inode bitmask. 
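+ *
+ * Descriptive note: xfs_lowbit64() yields the index of the lowest set
+ * bit, so e.g. a free mask of 0x8 maps to inode offset 3 within the
+ * chunk; an all-zero mask would yield -1, which callers only avoid by
+ * using this on chunks known to have free inodes (xfs_dialloc() asserts
+ * offset >= 0).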
+ */ +static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) +{ + return xfs_lowbit64(*fp); +} + + +/* + * Allocate an inode on disk. + * Mode is used to tell whether the new inode will need space, and whether + * it is a directory. + * + * To work within the constraint of one allocation per transaction, + * xfs_dialloc() is designed to be called twice if it has to do an + * allocation to make more free inodes. If an inode is + * available without an allocation, agbp would be set to the current + * agbp and alloc_done set to false. + * If an allocation needed to be done, agbp would be set to the + * inode header of the allocation group and alloc_done set to true. + * The caller should then commit the current transaction and allocate a new + * transaction. xfs_dialloc() should then be called again with + * the agbp value returned from the previous call. + * + * Once we successfully pick an inode its number is returned and the + * on-disk data structures are updated. The inode itself is not read + * in, since doing so would break ordering constraints with xfs_reclaim. + * + * *agbp should be set to NULL on the first call, *alloc_done set to FALSE. + */ +int /* error */ +xfs_dialloc( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent inode (directory) */ + mode_t mode, /* mode bits for new inode */ + int okalloc, /* ok to allocate more space */ + struct xfs_buf **agbp, /* buf for a.g. inode header */ + boolean_t *alloc_done, /* an allocation was done to replenish + the free inodes */ + xfs_ino_t *inop); /* inode number allocated */ + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int /* error */ +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *delete, /* set if inode cluster was deleted */ + xfs_ino_t *first_ino); /* first inode in deleted cluster */ + +/* + * Return the location of the inode in imap, for mapping it into a buffer. + */ +int +xfs_imap( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags); /* flags for inode btree lookup */ + +/* + * Compute and fill in value of m_in_maxlevels. 
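+ *
+ * A hypothetical worked example of the computation done in
+ * xfs_ialloc_compute_maxlevels() (the numbers are illustrative only,
+ * not real geometry): with 1,000,000 possible leaf records and a
+ * minimum of 100 records per leaf or node block, the tree needs
+ * 1,000,000 / 100 = 10,000 leaf blocks, then 10,000 / 100 = 100 node
+ * blocks, then 100 / 100 = 1 root block, so m_in_maxlevels ends up 3.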
+ */ +void +xfs_ialloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Log specified fields for the ag hdr (inode section) + */ +void +xfs_ialloc_log_agi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* allocation group header buffer */ + int fields); /* bitmask of fields to log */ + +/* + * Read in the allocation group header (inode allocation section) + */ +int /* error */ +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp); /* allocation group hdr buf */ + +/* + * Read in the allocation group header to initialise the per-ag data + * in the mount structure + */ +int +xfs_ialloc_pagi_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno); /* allocation group number */ + +/* + * Lookup a record by ino in the btree given by cur. + */ +int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, + xfs_lookup_t dir, int *stat); + +/* + * Get the data from the pointed-to record. + */ +extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, int *stat); + +#endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c new file mode 100644 index 00000000..16921f55 --- /dev/null +++ b/fs/xfs/xfs_ialloc_btree.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_error.h" + + +STATIC int +xfs_inobt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_inobt_mnr[level != 0]; +} + +STATIC struct xfs_btree_cur * +xfs_inobt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno); +} + +STATIC void +xfs_inobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + + agi->agi_root = nptr->s; + be32_add_cpu(&agi->agi_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); +} + +STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int length, + int *stat) +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + xfs_agblock_t sbno = be32_to_cpu(start->s); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.minlen = 1; + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + + error = xfs_alloc_vextent(&args); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + + new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); + *stat = 1; + return 0; +} + +STATIC int +xfs_inobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + xfs_fsblock_t fsbno; + int error; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_free_extent(cur->bc_tp, fsbno, 1); + if (error) + return error; + + xfs_trans_binval(cur->bc_tp, bp); + return error; +} + +STATIC int +xfs_inobt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_inobt_mxr[level != 0]; +} + +STATIC void +xfs_inobt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->inobt.ir_startino = rec->inobt.ir_startino; +} + +STATIC void +xfs_inobt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + rec->inobt.ir_startino = key->inobt.ir_startino; +} + +STATIC void +xfs_inobt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); + rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); +} + +/* + * initial value of ptr for lookup + */ +STATIC void +xfs_inobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + 
union xfs_btree_ptr *ptr) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + + ptr->s = agi->agi_root; +} + +STATIC __int64_t +xfs_inobt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be32_to_cpu(key->inobt.ir_startino) - + cur->bc_rec.i.ir_startino; +} + +#ifdef DEBUG +STATIC int +xfs_inobt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->inobt.ir_startino) < + be32_to_cpu(k2->inobt.ir_startino); +} + +STATIC int +xfs_inobt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= + be32_to_cpu(r2->inobt.ir_startino); +} +#endif /* DEBUG */ + +#ifdef XFS_BTREE_TRACE +ktrace_t *xfs_inobt_trace_buf; + +STATIC void +xfs_inobt_trace_enter( + struct xfs_btree_cur *cur, + const char *func, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) +{ + ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type, + (void *)func, (void *)s, NULL, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); +} + +STATIC void +xfs_inobt_trace_cursor( + struct xfs_btree_cur *cur, + __uint32_t *s0, + __uint64_t *l0, + __uint64_t *l1) +{ + *s0 = cur->bc_private.a.agno; + *l0 = cur->bc_rec.i.ir_startino; + *l1 = cur->bc_rec.i.ir_free; +} + +STATIC void +xfs_inobt_trace_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + __uint64_t *l0, + __uint64_t *l1) +{ + *l0 = be32_to_cpu(key->inobt.ir_startino); + *l1 = 0; +} + +STATIC void +xfs_inobt_trace_record( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + __uint64_t *l0, + __uint64_t *l1, + __uint64_t *l2) +{ + *l0 = be32_to_cpu(rec->inobt.ir_startino); + *l1 = be32_to_cpu(rec->inobt.ir_freecount); + *l2 = be64_to_cpu(rec->inobt.ir_free); +} +#endif /* XFS_BTREE_TRACE */ + +static const struct xfs_btree_ops xfs_inobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_inobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + +#ifdef DEBUG + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, +#endif + +#ifdef XFS_BTREE_TRACE + .trace_enter = xfs_inobt_trace_enter, + .trace_cursor = xfs_inobt_trace_cursor, + .trace_key = xfs_inobt_trace_key, + .trace_record = xfs_inobt_trace_record, +#endif +}; + +/* + * Allocate a new inode btree cursor. 
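+ *
+ * Typical usage, sketched here for illustration only (the real call
+ * sites live elsewhere in the tree): read the AGI buffer, build the
+ * cursor, do the btree work, then release the cursor, e.g.
+ *
+ *	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ *	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ *	... xfs_inobt_lookup() / xfs_inobt_get_rec() ...
+ *	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);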
+ */ +struct xfs_btree_cur * /* new inode btree cursor */ +xfs_inobt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agi structure */ + xfs_agnumber_t agno) /* allocation group number */ +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + cur->bc_btnum = XFS_BTNUM_INO; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_inobt_ops; + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; +} + +/* + * Calculate number of records in an inobt btree block. + */ +int +xfs_inobt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_INOBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_inobt_rec_t); + return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); +} diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h new file mode 100644 index 00000000..f782ad0c --- /dev/null +++ b/fs/xfs/xfs_ialloc_btree.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IALLOC_BTREE_H__ +#define __XFS_IALLOC_BTREE_H__ + +/* + * Inode map on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * There is a btree for the inode map per allocation group. + */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ + +typedef __uint64_t xfs_inofree_t; +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) + +static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) +{ + return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; +} + +/* + * Data record structure + */ +typedef struct xfs_inobt_rec { + __be32 ir_startino; /* starting inode number */ + __be32 ir_freecount; /* count of free inodes (set bits) */ + __be64 ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +typedef struct xfs_inobt_rec_incore { + xfs_agino_t ir_startino; /* starting inode number */ + __int32_t ir_freecount; /* count of free inodes (set bits) */ + xfs_inofree_t ir_free; /* free inode mask */ +} xfs_inobt_rec_incore_t; + + +/* + * Key structure + */ +typedef struct xfs_inobt_key { + __be32 ir_startino; /* starting inode number */ +} xfs_inobt_key_t; + +/* btree pointer type */ +typedef __be32 xfs_inobt_ptr_t; + +/* + * block numbers in the AG. + */ +#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) +#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) + +/* + * Btree block header size depends on a superblock flag. 
+ * + * (not quite yet, but soon) + */ +#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_INOBT_REC_ADDR(mp, block, index) \ + ((xfs_inobt_rec_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_inobt_rec_t)))) + +#define XFS_INOBT_KEY_ADDR(mp, block, index) \ + ((xfs_inobt_key_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_inobt_key_t))) + +#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_inobt_ptr_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_inobt_key_t) + \ + ((index) - 1) * sizeof(xfs_inobt_ptr_t))) + +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); +extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); + +#endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c new file mode 100644 index 00000000..ca752f05 --- /dev/null +++ b/fs/xfs/xfs_iget.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_acl.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_trans_priv.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_btree_trace.h" +#include "xfs_trace.h" + + +/* + * Define xfs inode iolock lockdep classes. We need to ensure that all active + * inodes are considered the same for lockdep purposes, including inodes that + * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to + * guarantee the locks are considered the same when there are multiple lock + * initialisation siteѕ. Also, define a reclaimable inode class so it is + * obvious in lockdep reports which class the report is against. + */ +static struct lock_class_key xfs_iolock_active; +struct lock_class_key xfs_iolock_reclaimable; + +/* + * Allocate and initialise an xfs_inode. + */ +STATIC struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. 
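+ *
+ * (With KM_SLEEP the zone allocator is expected to block and retry
+ * rather than fail, so the NULL check below is effectively defensive
+ * until KM_MAYFAIL is actually used.)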
+ */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + ASSERT(atomic_read(&ip->i_iocount) == 0); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(completion_done(&ip->i_flush)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_update_core = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + ip->i_size = 0; + ip->i_new_size = 0; + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_zone_free(xfs_inode_zone, ip); +} + +void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + /* + * Only if we are shutting down the fs will we see an + * inode still in the AIL. If it is there, we should remove + * it to prevent a use-after-free from occurring. + */ + xfs_log_item_t *lip = &ip->i_itemp->ili_item; + struct xfs_ail *ailp = lip->li_ailp; + + ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || + XFS_FORCED_SHUTDOWN(ip->i_mount)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_iocount) == 0); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(completion_done(&ip->i_flush)); + + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found it the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. 
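+ *
+ * (The freeing side in xfs_inode_free() above zeroes ip->i_ino and
+ * sets XFS_IRECLAIM under the same ip->i_flags_lock before queueing
+ * the RCU callback, so taking that lock and re-checking the inode
+ * number here is what makes the stale-entry detection reliable.)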
+ */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = EAGAIN; + goto out_error; + } + + /* We've got a live one. 
*/ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. + */ + if (radix_tree_preload(GFP_KERNEL)) { + error = EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, XFS_INEW); + + /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. 
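+ *
+ * A minimal illustrative call (hypothetical, with no transaction and a
+ * shared inode lock; drop the lock and the reference when finished):
+ *
+ *	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
+ *	if (!error) {
+ *		... examine ip ...
+ *		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ *		IRELE(ip);
+ *	}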
+ */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* reject inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); + /* + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_inode(ip); + return 0; + +out_error_or_again: + if (error == EAGAIN) { + delay(1); + goto again; + } + xfs_perag_put(pag); + return error; +} + +/* + * This is a wrapper routine around the xfs_ilock() routine + * used to centralize some grungy code. It is used in places + * that wish to lock the inode solely for reading the extents. + * The reason these places can't just call xfs_ilock(SHARED) + * is that the inode lock also guards to bringing in of the + * extents from disk for a file in b-tree format. If the inode + * is in b-tree format, then we need to lock the inode exclusively + * until the extents are read in. Locking it exclusively all + * the time would limit our parallelism unnecessarily, though. + * What we do instead is check to see if the extents have been + * read in yet, and only lock the inode exclusively if they + * have not. + * + * The function returns a value which should be given to the + * corresponding xfs_iunlock_map_shared(). This value is + * the mode in which the lock was actually taken. + */ +uint +xfs_ilock_map_shared( + xfs_inode_t *ip) +{ + uint lock_mode; + + if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && + ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + lock_mode = XFS_ILOCK_EXCL; + } else { + lock_mode = XFS_ILOCK_SHARED; + } + + xfs_ilock(ip, lock_mode); + + return lock_mode; +} + +/* + * This is simply the unlock routine to go with xfs_ilock_map_shared(). + * All it does is call xfs_iunlock() with the given lock_mode. + */ +void +xfs_iunlock_map_shared( + xfs_inode_t *ip, + unsigned int lock_mode) +{ + xfs_iunlock(ip, lock_mode); +} + +/* + * The xfs inode contains 2 locks: a multi-reader lock called the + * i_iolock and a multi-reader lock called the i_lock. This routine + * allows either or both of the locks to be obtained. + * + * The 2 locks should always be ordered so that the IO lock is + * obtained first in order to prevent deadlock. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. 
It can be: + * XFS_IOLOCK_SHARED, + * XFS_IOLOCK_EXCL, + * XFS_ILOCK_SHARED, + * XFS_ILOCK_EXCL, + * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, + * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, + * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, + * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + */ +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + + trace_xfs_ilock(ip, lock_flags, _RET_IP_); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; + } + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. 
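+ *
+ * For example (illustrative only), a caller that took both locks
+ * exclusively drops them with the same flags:
+ *
+ *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ *	... modify the inode ...
+ *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);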
+ * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY | + XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + + if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) && + !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) { + /* + * Let the AIL know that this item has been unlocked in case + * it is in the AIL and anyone is waiting on it. Don't do + * this if the caller has asked us not to. + */ + xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp, + (xfs_log_item_t*)(ip->i_itemp)); + } + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. + */ +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} + +#ifdef DEBUG +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); + } + + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); + } + + ASSERT(0); + return 0; +} +#endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c new file mode 100644 index 00000000..57152799 --- /dev/null +++ b/fs/xfs/xfs_inode.c @@ -0,0 +1,4145 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_buf_item.h" +#include "xfs_inode_item.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_utils.h" +#include "xfs_quota.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +kmem_zone_t *xfs_ifork_zone; +kmem_zone_t *xfs_inode_zone; + +/* + * Used in xfs_itruncate(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); +STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); +STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); +STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + +#ifdef DEBUG +/* + * Make sure that the extents in the given memory buffer + * are valid. + */ +STATIC void +xfs_validate_extents( + xfs_ifork_t *ifp, + int nrecs, + xfs_exntfmt_t fmt) +{ + xfs_bmbt_irec_t irec; + xfs_bmbt_rec_host_t rec; + int i; + + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + rec.l0 = get_unaligned(&ep->l0); + rec.l1 = get_unaligned(&ep->l1); + xfs_bmbt_get_all(&rec, &irec); + if (fmt == XFS_EXTFMT_NOSTATE) + ASSERT(irec.br_state == XFS_EXT_NORM); + } +} +#else /* DEBUG */ +#define xfs_validate_extents(ifp, nrecs, fmt) +#endif /* DEBUG */ + +/* + * Check that none of the inode's in the buffer have a next + * unlinked field of 0. + */ +#if defined(DEBUG) +void +xfs_inobp_check( + xfs_mount_t *mp, + xfs_buf_t *bp) +{ + int i; + int j; + xfs_dinode_t *dip; + + j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + + for (i = 0; i < j; i++) { + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + i * mp->m_sb.sb_inodesize); + if (!dip->di_next_unlinked) { + xfs_alert(mp, + "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.", + bp); + ASSERT(dip->di_next_unlinked); + } + } +} +#endif + +/* + * Find the buffer associated with the given inode map + * We do basic validation checks on the buffer once it has been + * retrieved from disk. + */ +STATIC int +xfs_imap_to_bp( + xfs_mount_t *mp, + xfs_trans_t *tp, + struct xfs_imap *imap, + xfs_buf_t **bpp, + uint buf_flags, + uint iget_flags) +{ + int error; + int i; + int ni; + xfs_buf_t *bp; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, buf_flags, &bp); + if (error) { + if (error != EAGAIN) { + xfs_warn(mp, + "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); + } else { + ASSERT(buf_flags & XBF_TRYLOCK); + } + return error; + } + + /* + * Validate the magic number and version of every inode in the buffer + * (if DEBUG kernel) or the first inode in the buffer, otherwise. 
+ */ +#ifdef DEBUG + ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; +#else /* usual case */ + ni = 1; +#endif + + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (iget_flags & XFS_IGET_UNTRUSTED) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EINVAL); + } + XFS_CORRUPTION_ERROR("xfs_imap_to_bp", + XFS_ERRLEVEL_HIGH, mp, dip); +#ifdef DEBUG + xfs_emerg(mp, + "bad inode magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)imap->im_blkno, i, + be16_to_cpu(dip->di_magic)); + ASSERT(0); +#endif + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + } + + xfs_inobp_check(mp, bp); + + /* + * Mark the buffer as an inode buffer now that it looks good + */ + XFS_BUF_SET_VTYPE(bp, B_FS_INO); + + *bpp = bp; + return 0; +} + +/* + * This routine is called to map an inode number within a file + * system to the buffer containing the on-disk version of the + * inode. It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dip parameter + * it returns a pointer to the on-disk inode within that buffer. + * + * If a non-zero error is returned, then the contents of bpp and + * dipp are undefined. + * + * Use xfs_imap() to determine the size and location of the + * buffer to read from disk. + */ +int +xfs_inotobp( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + xfs_dinode_t **dipp, + xfs_buf_t **bpp, + int *offset, + uint imap_flags) +{ + struct xfs_imap imap; + xfs_buf_t *bp; + int error; + + imap.im_blkno = 0; + error = xfs_imap(mp, tp, ino, &imap, imap_flags); + if (error) + return error; + + error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); + if (error) + return error; + + *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); + *bpp = bp; + *offset = imap.im_boffset; + return 0; +} + + +/* + * This routine is called to map an inode to the buffer containing + * the on-disk version of the inode. It returns a pointer to the + * buffer containing the on-disk inode in the bpp parameter, and in + * the dip parameter it returns a pointer to the on-disk inode within + * that buffer. + * + * If a non-zero error is returned, then the contents of bpp and + * dipp are undefined. + * + * The inode is expected to already been mapped to its buffer and read + * in once, thus we can use the mapping information stored in the inode + * rather than calling xfs_imap(). This allows us to avoid the overhead + * of looking at the inode btree for small block file systems + * (see xfs_imap()). + */ +int +xfs_itobp( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dinode_t **dipp, + xfs_buf_t **bpp, + uint buf_flags) +{ + xfs_buf_t *bp; + int error; + + ASSERT(ip->i_imap.im_blkno != 0); + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0); + if (error) + return error; + + if (!bp) { + ASSERT(buf_flags & XBF_TRYLOCK); + ASSERT(tp == NULL); + *bpp = NULL; + return EAGAIN; + } + + *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + *bpp = bp; + return 0; +} + +/* + * Move inode type and inode format specific information from the + * on-disk inode to the in-core inode. For fifos, devs, and sockets + * this means set if_rdev to the proper value. 
For files, directories, + * and symlinks this means to bring in the in-line data or extent + * pointers. For a file in B-tree format, only the root is immediately + * brought in-core. The rest will be in-lined in if_extents when it + * is first referenced (see xfs_iread_extents()). + */ +STATIC int +xfs_iformat( + xfs_inode_t *ip, + xfs_dinode_t *dip) +{ + xfs_attr_shortform_t *atp; + int size; + int error; + xfs_fsize_t di_size; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + error = 0; + + if (unlikely(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks))) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", + (unsigned long long)ip->i_ino, + (int)(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents)), + (unsigned long long) + be64_to_cpu(dip->di_nblocks)); + XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { + xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", + (unsigned long long)ip->i_ino, + dip->di_forkoff); + XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { + XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ip->i_d.di_size = 0; + ip->i_size = 0; + ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); + break; + + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (dip->di_format) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (local format for regular file).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(4)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + di_size = be64_to_cpu(dip->di_size); + if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %Ld for local inode).", + (unsigned long long) ip->i_ino, + (long long) di_size); + XFS_CORRUPTION_ERROR("xfs_iformat(5)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + size = (int)di_size; + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + break; + default: + XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + break; + + default: + XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + if (error) { + return error; + } + if (!XFS_DFORK_Q(dip)) + return 0; + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + ip->i_afp->if_ext_max = + 
XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); + size = be16_to_cpu(atp->hdr.totsize); + + if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad attr fork size %Ld).", + (unsigned long long) ip->i_ino, + (long long) size); + XFS_CORRUPTION_ERROR("xfs_iformat(8)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + error = XFS_ERROR(EFSCORRUPTED); + break; + } + if (error) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + xfs_idestroy_fork(ip, XFS_DATA_FORK); + } + return error; +} + +/* + * The file is in-lined in the on-disk inode. + * If it fits into if_inline_data, then copy + * it there, otherwise allocate a buffer for it + * and copy the data there. Either way, set + * if_data to point at the data. + * If we allocate a buffer for the data, make + * sure that its size is a multiple of 4 and + * record the real size in i_real_bytes. + */ +STATIC int +xfs_iformat_local( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork, + int size) +{ + xfs_ifork_t *ifp; + int real_size; + + /* + * If the size is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %d for local fork, size = %d).", + (unsigned long long) ip->i_ino, size, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); + XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ifp = XFS_IFORK_PTR(ip, whichfork); + real_size = 0; + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + if (size) + memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFINLINE; + return 0; +} + +/* + * The file consists of a set of extents all + * of which fit into the on-disk inode. + * If there are few enough extents to fit into + * the if_inline_ext, then copy them there. + * Otherwise allocate a buffer for them and copy + * them into it. Either way, set if_extents + * to point at the extents. + */ +STATIC int +xfs_iformat_extents( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmbt_rec_t *dp; + xfs_ifork_t *ifp; + int nex; + int size; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + nex = XFS_DFORK_NEXTENTS(dip, whichfork); + size = nex * (uint)sizeof(xfs_bmbt_rec_t); + + /* + * If the number of extents is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. 
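+ *
+ * (For scale: each on-disk extent record is 16 bytes -- two __be64
+ * words -- so e.g. a hypothetical fork holding 3 extents needs
+ * size = 48 bytes, which must fit within XFS_DFORK_SIZE() below.)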
+ */ + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", + (unsigned long long) ip->i_ino, nex); + XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_real_bytes = 0; + if (nex == 0) + ifp->if_u1.if_extents = NULL; + else if (nex <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + else + xfs_iext_add(ifp, 0, nex); + + ifp->if_bytes = size; + if (size) { + dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); + for (i = 0; i < nex; i++, dp++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + ep->l0 = get_unaligned_be64(&dp->l0); + ep->l1 = get_unaligned_be64(&dp->l1); + } + XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); + if (whichfork != XFS_DATA_FORK || + XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) + if (unlikely(xfs_check_nostate_extents( + ifp, 0, nex))) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + } + ifp->if_flags |= XFS_IFEXTENTS; + return 0; +} + +/* + * The file has too many extents to fit into + * the inode, so they are in B-tree format. + * Allocate a buffer for the root of the B-tree + * and copy the root into it. The i_extents + * field will remain NULL until all of the + * extents are read in (when they are needed). + */ +STATIC int +xfs_iformat_btree( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmdr_block_t *dfp; + xfs_ifork_t *ifp; + /* REFERENCED */ + int nrecs; + int size; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); + + /* + * blow out if -- fork has less extents than can fit in + * fork (fork shouldn't be a btree format), root btree + * block has more records than can fit into the fork, + * or the number of extents is greater than the number of + * blocks. + */ + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max + || XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) + || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_broot_bytes = size; + ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ASSERT(ifp->if_broot != NULL); + /* + * Copy and convert from the on-disk structure + * to the in-memory structure. 
+ */ + xfs_bmdr_to_bmbt(ip->i_mount, dfp, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + ifp->if_broot, size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFBROOT; + + return 0; +} + +STATIC void +xfs_dinode_from_disk( + xfs_icdinode_t *to, + xfs_dinode_t *from) +{ + to->di_magic = be16_to_cpu(from->di_magic); + to->di_mode = be16_to_cpu(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = be16_to_cpu(from->di_onlink); + to->di_uid = be32_to_cpu(from->di_uid); + to->di_gid = be32_to_cpu(from->di_gid); + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = be16_to_cpu(from->di_flushiter); + to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); + to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); + to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); + to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); + to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); + to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); + to->di_size = be64_to_cpu(from->di_size); + to->di_nblocks = be64_to_cpu(from->di_nblocks); + to->di_extsize = be32_to_cpu(from->di_extsize); + to->di_nextents = be32_to_cpu(from->di_nextents); + to->di_anextents = be16_to_cpu(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); + to->di_flags = be16_to_cpu(from->di_flags); + to->di_gen = be32_to_cpu(from->di_gen); +} + +void +xfs_dinode_to_disk( + xfs_dinode_t *to, + xfs_icdinode_t *from) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = cpu_to_be16(from->di_flushiter); + to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); + to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); + to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); + to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); + to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); + to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); +} + +STATIC uint +_xfs_dic2xflags( + __uint16_t di_flags) +{ + uint flags = 0; + + if (di_flags & XFS_DIFLAG_ANY) { + if (di_flags & XFS_DIFLAG_REALTIME) + flags |= XFS_XFLAG_REALTIME; + if (di_flags & XFS_DIFLAG_PREALLOC) + flags |= XFS_XFLAG_PREALLOC; + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= XFS_XFLAG_IMMUTABLE; + if 
(di_flags & XFS_DIFLAG_APPEND) + flags |= XFS_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= XFS_XFLAG_SYNC; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= XFS_XFLAG_NOATIME; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= XFS_XFLAG_NODUMP; + if (di_flags & XFS_DIFLAG_RTINHERIT) + flags |= XFS_XFLAG_RTINHERIT; + if (di_flags & XFS_DIFLAG_PROJINHERIT) + flags |= XFS_XFLAG_PROJINHERIT; + if (di_flags & XFS_DIFLAG_NOSYMLINKS) + flags |= XFS_XFLAG_NOSYMLINKS; + if (di_flags & XFS_DIFLAG_EXTSIZE) + flags |= XFS_XFLAG_EXTSIZE; + if (di_flags & XFS_DIFLAG_EXTSZINHERIT) + flags |= XFS_XFLAG_EXTSZINHERIT; + if (di_flags & XFS_DIFLAG_NODEFRAG) + flags |= XFS_XFLAG_NODEFRAG; + if (di_flags & XFS_DIFLAG_FILESTREAM) + flags |= XFS_XFLAG_FILESTREAM; + } + + return flags; +} + +uint +xfs_ip2xflags( + xfs_inode_t *ip) +{ + xfs_icdinode_t *dic = &ip->i_d; + + return _xfs_dic2xflags(dic->di_flags) | + (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); +} + +uint +xfs_dic2xflags( + xfs_dinode_t *dip) +{ + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | + (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); +} + +/* + * Read the disk inode attributes into the in-core inode structure. + */ +int +xfs_iread( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_inode_t *ip, + uint iget_flags) +{ + xfs_buf_t *bp; + xfs_dinode_t *dip; + int error; + + /* + * Fill in the location information in the in-core inode. + */ + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); + if (error) + return error; + + /* + * Get pointers to the on-disk inode and the buffer containing it. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, + XBF_LOCK, iget_flags); + if (error) + return error; + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + + /* + * If we got something that isn't an inode it means someone + * (nfs or dmi) has a stale handle. + */ + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { +#ifdef DEBUG + xfs_alert(mp, + "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", + __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); +#endif /* DEBUG */ + error = XFS_ERROR(EINVAL); + goto out_brelse; + } + + /* + * If the on-disk inode is already linked to a directory + * entry, copy all of the inode into the in-core inode. + * xfs_iformat() handles copying in the inode format + * specific information. + * Otherwise, just get the truly permanent information. + */ + if (dip->di_mode) { + xfs_dinode_from_disk(&ip->i_d, dip); + error = xfs_iformat(ip, dip); + if (error) { +#ifdef DEBUG + xfs_alert(mp, "%s: xfs_iformat() returned error %d", + __func__, error); +#endif /* DEBUG */ + goto out_brelse; + } + } else { + ip->i_d.di_magic = be16_to_cpu(dip->di_magic); + ip->i_d.di_version = dip->di_version; + ip->i_d.di_gen = be32_to_cpu(dip->di_gen); + ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + /* + * Make sure to pull in the mode here as well in + * case the inode is released without being used. + * This ensures that xfs_inactive() will see that + * the inode is already free and not try to mess + * with the uninitialized part of it. + */ + ip->i_d.di_mode = 0; + /* + * Initialize the per-fork minima and maxima for a new + * inode here. xfs_iformat will do it for old inodes. + */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + } + + /* + * The inode format changed when we moved the link count and + * made it 32 bits long. If this is an old format inode, + * convert it in memory to look like a new one. 
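+ * (A version 1 inode only carries the 16 bit di_onlink count; the
+ * version 2 layout widened it to the 32 bit di_nlink and added the
+ * project id fields, which is what the conversion below fills in.)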
If it gets + * flushed to disk we will convert back before flushing or + * logging it. We zero out the new projid field and the old link + * count field. We'll handle clearing the pad field (the remains + * of the old uuid field) when we actually convert the inode to + * the new format. We don't change the version number so that we + * can distinguish this from a real new format inode. + */ + if (ip->i_d.di_version == 1) { + ip->i_d.di_nlink = ip->i_d.di_onlink; + ip->i_d.di_onlink = 0; + xfs_set_projid(ip, 0); + } + + ip->i_delayed_blks = 0; + ip->i_size = ip->i_d.di_size; + + /* + * Mark the buffer containing the inode as something to keep + * around for a while. This helps to keep recently accessed + * meta-data in-core longer. + */ + xfs_buf_set_ref(bp, XFS_INO_REF); + + /* + * Use xfs_trans_brelse() to release the buffer containing the + * on-disk inode, because it was acquired with xfs_trans_read_buf() + * in xfs_itobp() above. If tp is NULL, this is just a normal + * brelse(). If we're within a transaction, then xfs_trans_brelse() + * will only release the buffer if it is not dirty within the + * transaction. It will be OK to release the buffer in this case, + * because inodes on disk are never destroyed and we will be + * locking the new in-core inode before putting it in the hash + * table where other processes can find it. Thus we don't have + * to worry about the inode being changed just because we released + * the buffer. + */ + out_brelse: + xfs_trans_brelse(tp, bp); + return error; +} + +/* + * Read in extents from a btree-format inode. + * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. + */ +int +xfs_iread_extents( + xfs_trans_t *tp, + xfs_inode_t *ip, + int whichfork) +{ + int error; + xfs_ifork_t *ifp; + xfs_extnum_t nextents; + + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + nextents = XFS_IFORK_NEXTENTS(ip, whichfork); + ifp = XFS_IFORK_PTR(ip, whichfork); + + /* + * We know that the size is valid (it's checked in iformat_btree) + */ + ifp->if_bytes = ifp->if_real_bytes = 0; + ifp->if_flags |= XFS_IFEXTENTS; + xfs_iext_add(ifp, 0, nextents); + error = xfs_bmap_read_extents(tp, ip, whichfork); + if (error) { + xfs_iext_destroy(ifp); + ifp->if_flags &= ~XFS_IFEXTENTS; + return error; + } + xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + return 0; +} + +/* + * Allocate an inode on disk and return a copy of its in-core version. + * The in-core inode is locked exclusively. Set mode, nlink, and rdev + * appropriately within the inode. The uid and gid for the inode are + * set according to the contents of the given cred structure. + * + * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() + * has a free inode available, call xfs_iget() + * to obtain the in-core version of the allocated inode. Finally, + * fill in the inode and log its initial contents. In this case, + * ialloc_context would be set to NULL and call_again set to false. + * + * If xfs_dialloc() does not have an available inode, + * it will replenish its supply by doing an allocation. Since we can + * only do one allocation within a transaction without deadlocks, we + * must commit the current transaction before returning the inode itself. + * In this case, therefore, we will set call_again to true and return. 
+ * The caller should then commit the current transaction, start a new + * transaction, and call xfs_ialloc() again to actually get the inode. + * + * To ensure that some other process does not grab the inode that + * was allocated during the first call to xfs_ialloc(), this routine + * also returns the [locked] bp pointing to the head of the freelist + * as ialloc_context. The caller should hold this buffer across + * the commit and pass it back into this routine on the second call. + * + * If we are allocating quota inodes, we do not have a parent inode + * to attach to or associate with (i.e. pip == NULL) because they + * are not linked into the directory structure - they are attached + * directly to the superblock - and so have no parent. + */ +int +xfs_ialloc( + xfs_trans_t *tp, + xfs_inode_t *pip, + mode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, + int okalloc, + xfs_buf_t **ialloc_context, + boolean_t *call_again, + xfs_inode_t **ipp) +{ + xfs_ino_t ino; + xfs_inode_t *ip; + uint flags; + int error; + timespec_t tv; + int filestreams = 0; + + /* + * Call the space management code to pick + * the on-disk inode to be allocated. + */ + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + ialloc_context, call_again, &ino); + if (error) + return error; + if (*call_again || ino == NULLFSINO) { + *ipp = NULL; + return 0; + } + ASSERT(*ialloc_context == NULL); + + /* + * Get the in-core inode with the lock held exclusively. + * This is because we're setting fields here we need + * to prevent others from looking at until we're done. + */ + error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, + XFS_ILOCK_EXCL, &ip); + if (error) + return error; + ASSERT(ip != NULL); + + ip->i_d.di_mode = (__uint16_t)mode; + ip->i_d.di_onlink = 0; + ip->i_d.di_nlink = nlink; + ASSERT(ip->i_d.di_nlink == nlink); + ip->i_d.di_uid = current_fsuid(); + ip->i_d.di_gid = current_fsgid(); + xfs_set_projid(ip, prid); + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + + /* + * If the superblock version is up to where we support new format + * inodes and this is currently an old format inode, then change + * the inode version number now. This way we only do the conversion + * here rather than here and in the flush/logging code. + */ + if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && + ip->i_d.di_version == 1) { + ip->i_d.di_version = 2; + /* + * We've already zeroed the old link count, the projid field, + * and the pad field. + */ + } + + /* + * Project ids won't be stored on disk if we are using a version 1 inode. + */ + if ((prid != 0) && (ip->i_d.di_version == 1)) + xfs_bump_ino_vers2(tp, ip); + + if (pip && XFS_INHERIT_GID(pip)) { + ip->i_d.di_gid = pip->i_d.di_gid; + if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { + ip->i_d.di_mode |= S_ISGID; + } + } + + /* + * If the group ID of the new file does not match the effective group + * ID or one of the supplementary group IDs, the S_ISGID bit is cleared + * (and only if the irix_sgid_inherit compatibility variable is set). 
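+ * (in_group_p() below checks the new group against the caller's fsgid
+ * and supplementary groups; irix_sgid_inherit is the XFS sysctl that
+ * enables this behaviour.)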
+ */ + if ((irix_sgid_inherit) && + (ip->i_d.di_mode & S_ISGID) && + (!in_group_p((gid_t)ip->i_d.di_gid))) { + ip->i_d.di_mode &= ~S_ISGID; + } + + ip->i_d.di_size = 0; + ip->i_size = 0; + ip->i_d.di_nextents = 0; + ASSERT(ip->i_d.di_nblocks == 0); + + nanotime(&tv); + ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + ip->i_d.di_atime = ip->i_d.di_mtime; + ip->i_d.di_ctime = ip->i_d.di_mtime; + + /* + * di_gen will have been taken care of in xfs_iread. + */ + ip->i_d.di_extsize = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_dmstate = 0; + ip->i_d.di_flags = 0; + flags = XFS_ILOG_CORE; + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_u2.if_rdev = rdev; + ip->i_df.if_flags = 0; + flags |= XFS_ILOG_DEV; + break; + case S_IFREG: + /* + * we can't set up filestreams until after the VFS inode + * is set up properly. + */ + if (pip && xfs_inode_is_filestream(pip)) + filestreams = 1; + /* fall through */ + case S_IFDIR: + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { + uint di_flags = 0; + + if ((mode & S_IFMT) == S_IFDIR) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } else if ((mode & S_IFMT) == S_IFREG) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } + if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + ip->i_d.di_flags |= di_flags; + } + /* FALLTHROUGH */ + case S_IFLNK: + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_flags = XFS_IFEXTENTS; + ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; + ip->i_df.if_u1.if_extents = NULL; + break; + default: + ASSERT(0); + } + /* + * Attribute fork settings for new inode. + */ + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_anextents = 0; + + /* + * Log the new values stuffed into the inode. + */ + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, flags); + + /* now that we have an i_mode we can setup inode ops and unlock */ + xfs_setup_inode(ip); + + /* now we have set up the vfs inode we can associate the filestream */ + if (filestreams) { + error = xfs_filestream_associate(pip, ip); + if (error < 0) + return -error; + if (!error) + xfs_iflags_set(ip, XFS_IFILESTREAM); + } + + *ipp = ip; + return 0; +} + +/* + * Check to make sure that there are no blocks allocated to the + * file beyond the size of the file. We don't check this for + * files with fixed size extents or real time extents, but we + * at least do it for regular files. 
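+ * This is a DEBUG-only sanity check: it maps the range beyond the new
+ * size with xfs_bmapi() and asserts that it comes back as a single
+ * hole.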
+ */ +#ifdef DEBUG +void +xfs_isize_check( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_fsize_t isize) +{ + xfs_fileoff_t map_first; + int nimaps; + xfs_bmbt_irec_t imaps[2]; + + if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) + return; + + if (XFS_IS_REALTIME_INODE(ip)) + return; + + if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + return; + + nimaps = 2; + map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + /* + * The filesystem could be shutting down, so bmapi may return + * an error. + */ + if (xfs_bmapi(NULL, ip, map_first, + (XFS_B_TO_FSB(mp, + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - + map_first), + XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, + NULL)) + return; + ASSERT(nimaps == 1); + ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); +} +#endif /* DEBUG */ + +/* + * Calculate the last possible buffered byte in a file. This must + * include data that was buffered beyond the EOF by the write code. + * This also needs to deal with overflowing the xfs_fsize_t type + * which can happen for sizes near the limit. + * + * We also need to take into account any blocks beyond the EOF. It + * may be the case that they were buffered by a write which failed. + * In that case the pages will still be in memory, but the inode size + * will never have been updated. + */ +STATIC xfs_fsize_t +xfs_file_last_byte( + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_fsize_t last_byte; + xfs_fileoff_t last_block; + xfs_fileoff_t size_last_block; + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); + + mp = ip->i_mount; + /* + * Only check for blocks beyond the EOF if the extents have + * been read in. This eliminates the need for the inode lock, + * and it also saves us from looking when it really isn't + * necessary. + */ + if (ip->i_df.if_flags & XFS_IFEXTENTS) { + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) { + last_block = 0; + } + } else { + last_block = 0; + } + size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size); + last_block = XFS_FILEOFF_MAX(last_block, size_last_block); + + last_byte = XFS_FSB_TO_B(mp, last_block); + if (last_byte < 0) { + return XFS_MAXIOFFSET(mp); + } + last_byte += (1 << mp->m_writeio_log); + if (last_byte < 0) { + return XFS_MAXIOFFSET(mp); + } + return last_byte; +} + +/* + * Start the truncation of the file to new_size. The new size + * must be smaller than the current size. This routine will + * clear the buffer and page caches of file data in the removed + * range, and xfs_itruncate_finish() will remove the underlying + * disk blocks. + * + * The inode must have its I/O lock locked EXCLUSIVELY, and it + * must NOT have the inode lock held at all. This is because we're + * calling into the buffer/page cache code and we can't hold the + * inode lock when we do so. + * + * We need to wait for any direct I/Os in flight to complete before we + * proceed with the truncate. This is needed to prevent the extents + * being read or written by the direct I/Os from being removed while the + * I/O is in flight as there is no other method of synchronising + * direct I/O with the truncate operation. Also, because we hold + * the IOLOCK in exclusive mode, we prevent new direct I/Os from being + * started until the truncate completes and drops the lock. Essentially, + * the xfs_ioend_wait() call forms an I/O barrier that provides strict + * ordering between direct I/Os and the truncate operation. 
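+ *
+ * A typical caller therefore looks roughly like this (a sketch only,
+ * with locking and transaction setup elided, not a verbatim caller):
+ *
+ *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ *	error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, new_size);
+ *	... reserve a permanent transaction, take XFS_ILOCK_EXCL ...
+ *	error = xfs_itruncate_finish(&tp, ip, new_size, XFS_DATA_FORK, sync);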
+ * + * The flags parameter can have either the value XFS_ITRUNC_DEFINITE + * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used + * in the case that the caller is locking things out of order and + * may not be able to call xfs_itruncate_finish() with the inode lock + * held without dropping the I/O lock. If the caller must drop the + * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() + * must be called again with all the same restrictions as the initial + * call. + */ +int +xfs_itruncate_start( + xfs_inode_t *ip, + uint flags, + xfs_fsize_t new_size) +{ + xfs_fsize_t last_byte; + xfs_off_t toss_start; + xfs_mount_t *mp; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT((new_size == 0) || (new_size <= ip->i_size)); + ASSERT((flags == XFS_ITRUNC_DEFINITE) || + (flags == XFS_ITRUNC_MAYBE)); + + mp = ip->i_mount; + + /* wait for the completion of any pending DIOs */ + if (new_size == 0 || new_size < ip->i_size) + xfs_ioend_wait(ip); + + /* + * Call toss_pages or flushinval_pages to get rid of pages + * overlapping the region being removed. We have to use + * the less efficient flushinval_pages in the case that the + * caller may not be able to finish the truncate without + * dropping the inode's I/O lock. Make sure + * to catch any pages brought in by buffers overlapping + * the EOF by searching out beyond the isize by our + * block size. We round new_size up to a block boundary + * so that we don't toss things on the same block as + * new_size but before it. + * + * Before calling toss_page or flushinval_pages, make sure to + * call remapf() over the same region if the file is mapped. + * This frees up mapped file references to the pages in the + * given range and for the flushinval_pages case it ensures + * that we get the latest mapped changes flushed out. + */ + toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); + toss_start = XFS_FSB_TO_B(mp, toss_start); + if (toss_start < 0) { + /* + * The place to start tossing is beyond our maximum + * file size, so there is no way that the data extended + * out there. + */ + return 0; + } + last_byte = xfs_file_last_byte(ip); + trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte); + if (last_byte > toss_start) { + if (flags & XFS_ITRUNC_DEFINITE) { + xfs_tosspages(ip, toss_start, + -1, FI_REMAPF_LOCKED); + } else { + error = xfs_flushinval_pages(ip, toss_start, + -1, FI_REMAPF_LOCKED); + } + } + +#ifdef DEBUG + if (new_size == 0) { + ASSERT(VN_CACHED(VFS_I(ip)) == 0); + } +#endif + return error; +} + +/* + * Shrink the file to the given new_size. The new size must be smaller than + * the current size. This will free up the underlying blocks in the removed + * range after a call to xfs_itruncate_start() or xfs_atruncate_start(). + * + * The transaction passed to this routine must have made a permanent log + * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the + * given transaction and start new ones, so make sure everything involved in + * the transaction is tidy before calling here. Some transaction will be + * returned to the caller to be committed. The incoming transaction must + * already include the inode, and both inode locks must be held exclusively. + * The inode must also be "held" within the transaction. On return the inode + * will be "held" within the returned transaction. This routine does NOT + * require any disk space to be reserved for it within the transaction. 
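+ *
+ * Internally this routine rolls the transaction: each pass through the
+ * loop below calls xfs_bunmapi()/xfs_bmap_finish(), then duplicates the
+ * transaction with xfs_trans_dup(), commits the old one and re-reserves
+ * log space with xfs_trans_reserve(), so the caller always gets a live
+ * transaction back in *tp.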
+ * + * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it + * indicates the fork which is to be truncated. For the attribute fork we only + * support truncation to size 0. + * + * We use the sync parameter to indicate whether or not the first transaction + * we perform might have to be synchronous. For the attr fork, it needs to be + * so if the unlink of the inode is not yet known to be permanent in the log. + * This keeps us from freeing and reusing the blocks of the attribute fork + * before the unlink of the inode becomes permanent. + * + * For the data fork, we normally have to run synchronously if we're being + * called out of the inactive path or we're being called out of the create path + * where we're truncating an existing file. Either way, the truncate needs to + * be sync so blocks don't reappear in the file with altered data in case of a + * crash. wsync filesystems can run the first case async because anything that + * shrinks the inode has to run sync so by the time we're called here from + * inactive, the inode size is permanently set to 0. + * + * Calls from the truncate path always need to be sync unless we're in a wsync + * filesystem and the file has already been unlinked. + * + * The caller is responsible for correctly setting the sync parameter. It gets + * too hard for us to guess here which path we're being called out of just + * based on inode state. + * + * If we get an error, we must return with the inode locked and linked into the + * current transaction. This keeps things simple for the higher level code, + * because it always knows that the inode is locked and held in the transaction + * that returns to it whether errors occur or not. We don't mark the inode + * dirty on error so that transactions can be easily aborted if possible. + */ +int +xfs_itruncate_finish( + xfs_trans_t **tp, + xfs_inode_t *ip, + xfs_fsize_t new_size, + int fork, + int sync) +{ + xfs_fsblock_t first_block; + xfs_fileoff_t first_unmap_block; + xfs_fileoff_t last_block; + xfs_filblks_t unmap_len=0; + xfs_mount_t *mp; + xfs_trans_t *ntp; + int done; + int committed; + xfs_bmap_free_t free_list; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT((new_size == 0) || (new_size <= ip->i_size)); + ASSERT(*tp != NULL); + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(ip->i_transp == *tp); + ASSERT(ip->i_itemp != NULL); + ASSERT(ip->i_itemp->ili_lock_flags == 0); + + + ntp = *tp; + mp = (ntp)->t_mountp; + ASSERT(! XFS_NOT_DQATTACHED(mp, ip)); + + /* + * We only support truncating the entire attribute fork. + */ + if (fork == XFS_ATTR_FORK) { + new_size = 0LL; + } + first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); + trace_xfs_itruncate_finish_start(ip, new_size); + + /* + * The first thing we do is set the size to new_size permanently + * on disk. This way we don't have to worry about anyone ever + * being able to look at the data being freed even in the face + * of a crash. What we're getting around here is the case where + * we free a block, it is allocated to another file, it is written + * to, and then we crash. If the new data gets written to the + * file but the log buffers containing the free and reallocation + * don't, then we'd end up with garbage in the blocks being freed. + * As long as we make the new_size permanent before actually + * freeing any blocks it doesn't matter if they get written to. + * + * The callers must signal into us whether or not the size + * setting here must be synchronous. 
There are a few cases + * where it doesn't have to be synchronous. Those cases + * occur if the file is unlinked and we know the unlink is + * permanent or if the blocks being truncated are guaranteed + * to be beyond the inode eof (regardless of the link count) + * and the eof value is permanent. Both of these cases occur + * only on wsync-mounted filesystems. In those cases, we're + * guaranteed that no user will ever see the data in the blocks + * that are being truncated so the truncate can run async. + * In the free beyond eof case, the file may wind up with + * more blocks allocated to it than it needs if we crash + * and that won't get fixed until the next time the file + * is re-opened and closed but that's ok as that shouldn't + * be too many blocks. + * + * However, we can't just make all wsync xactions run async + * because there's one call out of the create path that needs + * to run sync where it's truncating an existing file to size + * 0 whose size is > 0. + * + * It's probably possible to come up with a test in this + * routine that would correctly distinguish all the above + * cases from the values of the function parameters and the + * inode state but for sanity's sake, I've decided to let the + * layers above just tell us. It's simpler to correctly figure + * out in the layer above exactly under what conditions we + * can run async and I think it's easier for others read and + * follow the logic in case something has to be changed. + * cscope is your friend -- rcc. + * + * The attribute fork is much simpler. + * + * For the attribute fork we allow the caller to tell us whether + * the unlink of the inode that led to this call is yet permanent + * in the on disk log. If it is not and we will be freeing extents + * in this inode then we make the first transaction synchronous + * to make sure that the unlink is permanent by the time we free + * the blocks. + */ + if (fork == XFS_DATA_FORK) { + if (ip->i_d.di_nextents > 0) { + /* + * If we are not changing the file size then do + * not update the on-disk file size - we may be + * called from xfs_inactive_free_eofblocks(). If we + * update the on-disk file size and then the system + * crashes before the contents of the file are + * flushed to disk then the files may be full of + * holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + } + } + } + + /* + * Since it is possible for space to become allocated beyond + * the end of the file (in a crash where the space is allocated + * but the inode size is not yet updated), simply remove any + * blocks which show up between the new EOF and the maximum + * possible file size. If the first block to be removed is + * beyond the maximum file size (ie it is the same as last_block), + * then there is nothing to do. + */ + last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + ASSERT(first_unmap_block <= last_block); + done = 0; + if (last_block == first_unmap_block) { + done = 1; + } else { + unmap_len = last_block - first_unmap_block + 1; + } + while (!done) { + /* + * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() + * will tell us whether it freed the entire range or + * not. If this is a synchronous mount (wsync), + * then we can tell bunmapi to keep all the + * transactions asynchronous since the unlink + * transaction that made this inode inactive has + * already hit the disk. 
There's no danger of + * the freed blocks being reused, there being a + * crash, and the reused blocks suddenly reappearing + * in this file with garbage in them once recovery + * runs. + */ + xfs_bmap_init(&free_list, &first_block); + error = xfs_bunmapi(ntp, ip, + first_unmap_block, unmap_len, + xfs_bmapi_aflag(fork), + XFS_ITRUNC_MAX_EXTENTS, + &first_block, &free_list, + &done); + if (error) { + /* + * If the bunmapi call encounters an error, + * return to the caller where the transaction + * can be properly aborted. We just need to + * make sure we're not holding any resources + * that we were not when we came in. + */ + xfs_bmap_cancel(&free_list); + return error; + } + + /* + * Duplicate the transaction that has the permanent + * reservation and commit the old transaction. + */ + error = xfs_bmap_finish(tp, &free_list, &committed); + ntp = *tp; + if (committed) + xfs_trans_ijoin(ntp, ip); + + if (error) { + /* + * If the bmap finish call encounters an error, return + * to the caller where the transaction can be properly + * aborted. We just need to make sure we're not + * holding any resources that we were not when we came + * in. + * + * Aborting from this point might lose some blocks in + * the file system, but oh well. + */ + xfs_bmap_cancel(&free_list); + return error; + } + + if (committed) { + /* + * Mark the inode dirty so it will be logged and + * moved forward in the log as part of every commit. + */ + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + } + + ntp = xfs_trans_dup(ntp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + + xfs_trans_ijoin(ntp, ip); + + if (error) + return error; + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + error = xfs_trans_reserve(ntp, 0, + XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) + return error; + } + /* + * Only update the size in the case of the data fork, but + * always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + if (fork == XFS_DATA_FORK) { + xfs_isize_check(mp, ip, new_size); + /* + * If we are not changing the file size then do + * not update the on-disk file size - we may be + * called from xfs_inactive_free_eofblocks(). If we + * update the on-disk file size and then the system + * crashes before the contents of the file are + * flushed to disk then the files may be full of + * holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + } + } + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + ASSERT((new_size != 0) || + (fork == XFS_ATTR_FORK) || + (ip->i_delayed_blks == 0)); + ASSERT((new_size != 0) || + (fork == XFS_ATTR_FORK) || + (ip->i_d.di_nextents == 0)); + trace_xfs_itruncate_finish_end(ip, new_size); + return 0; +} + +/* + * This is called when the inode's link count goes to 0. + * We place the on-disk inode on a list in the AGI. It + * will be pulled from this list when the inode is freed. + */ +int +xfs_iunlink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agino_t agino; + short bucket_index; + int offset; + int error; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + ASSERT(ip->i_transp == tp); + + mp = tp->t_mountp; + + /* + * Get the agi buffer first. It ensures lock ordering + * on the list. 
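+ * The AGI carries an array of XFS_AGI_UNLINKED_BUCKETS list heads; the
+ * bucket is chosen as agino % XFS_AGI_UNLINKED_BUCKETS below, and the
+ * chain itself runs through each inode's on-disk di_next_unlinked
+ * field.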
+ */ + error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + if (error) + return error; + agi = XFS_BUF_TO_AGI(agibp); + + /* + * Get the index into the agi hash table for the + * list this inode will go on. + */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(agi->agi_unlinked[bucket_index]); + ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + + if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { + /* + * There is already another inode in the bucket we need + * to add ourselves to. Add us at the front of the list. + * Here we put the head pointer into our next pointer, + * and then we fall through to point the head at us. + */ + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + if (error) + return error; + + ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); + /* both on-disk, don't endian flip twice */ + dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } + + /* + * Point the bucket head pointer at the inode being inserted. + */ + ASSERT(agino != 0); + agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + return 0; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +STATIC int +xfs_iunlink_remove( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_ino_t next_ino; + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agnumber_t agno; + xfs_agino_t agino; + xfs_agino_t next_agino; + xfs_buf_t *last_ibp; + xfs_dinode_t *last_dip = NULL; + short bucket_index; + int offset, last_offset = 0; + int error; + + mp = tp->t_mountp; + agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + + /* + * Get the agi buffer first. It ensures lock ordering + * on the list. + */ + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(agibp); + + /* + * Get the index into the agi hash table for the + * list this inode will go on. + */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); + ASSERT(agi->agi_unlinked[bucket_index]); + + if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { + /* + * We're at the head of the list. Get the inode's + * on-disk buffer to see if there is anyone after us + * on the list. Only modify our next pointer if it + * is not already NULLAGINO. This saves us the overhead + * of dealing with the buffer when there is no need to + * change it. 
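+ * (The list is singly linked on disk, so when we are not at the head
+ * the else branch below has to walk the chain with xfs_inotobp() to
+ * find our predecessor.)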
+ */ + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + if (error) { + xfs_warn(mp, "%s: xfs_itobp() returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(dip->di_next_unlinked); + ASSERT(next_agino != 0); + if (next_agino != NULLAGINO) { + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the bucket head pointer at the next inode. + */ + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + } else { + /* + * We need to search the list for the inode being freed. + */ + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + last_ibp = NULL; + while (next_agino != agino) { + /* + * If the last inode wasn't the one pointing to + * us, then release its buffer since we're not + * going to do anything with it. + */ + if (last_ibp != NULL) { + xfs_trans_brelse(tp, last_ibp); + } + next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + error = xfs_inotobp(mp, tp, next_ino, &last_dip, + &last_ibp, &last_offset, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_inotobp() returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(last_dip->di_next_unlinked); + ASSERT(next_agino != NULLAGINO); + ASSERT(next_agino != 0); + } + /* + * Now last_ibp points to the buffer previous to us on + * the unlinked list. Pull us from the list. + */ + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + if (error) { + xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(dip->di_next_unlinked); + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + if (next_agino != NULLAGINO) { + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the previous inode on the list to the next inode. + */ + last_dip->di_next_unlinked = cpu_to_be32(next_agino); + ASSERT(next_agino != 0); + offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, last_ibp); + xfs_trans_log_buf(tp, last_ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, last_ibp); + } + return 0; +} + +/* + * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. 
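+ * Otherwise a later flush of an overlooked incore inode could write its
+ * now-stale contents over the freed (and possibly already reallocated)
+ * cluster on disk.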
+ */ +STATIC void +xfs_ifree_cluster( + xfs_inode_t *free_ip, + xfs_trans_t *tp, + xfs_ino_t inum) +{ + xfs_mount_t *mp = free_ip->i_mount; + int blks_per_cluster; + int nbufs; + int ninodes; + int i, j; + xfs_daddr_t blkno; + xfs_buf_t *bp; + xfs_inode_t *ip; + xfs_inode_log_item_t *iip; + xfs_log_item_t *lip; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); + if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { + blks_per_cluster = 1; + ninodes = mp->m_sb.sb_inopblock; + nbufs = XFS_IALLOC_BLOCKS(mp); + } else { + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / + mp->m_sb.sb_blocksize; + ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; + nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; + } + + for (j = 0; j < nbufs; j++, inum += ninodes) { + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), + XFS_INO_TO_AGBNO(mp, inum)); + + /* + * We obtain and lock the backing buffer first in the process + * here, as we have to ensure that any dirty inode that we + * can't get the flush lock on is attached to the buffer. + * If we scan the in-memory inodes first, then buffer IO can + * complete before we get a lock on it, and hence we may fail + * to mark all the active inodes on the buffer stale. + */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * blks_per_cluster, + XBF_LOCK); + + /* + * Walk the inodes already attached to the buffer and mark them + * stale. These will all have the flush locks held, so an + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. + */ + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + while (lip) { + if (lip->li_type == XFS_LI_INODE) { + iip = (xfs_inode_log_item_t *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = xfs_istale_done; + xfs_trans_ail_copy_lsn(mp->m_ail, + &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + xfs_iflags_set(iip->ili_inode, XFS_ISTALE); + } + lip = lip->li_bio_list; + } + + + /* + * For each inode in memory attempt to add it to the inode + * buffer and set it up for being staled on buffer IO + * completion. This is safe as we've locked out tail pushing + * and flushing by locking the buffer. + * + * We have already marked every inode that was part of a + * transaction stale above, which means there is no point in + * even trying to lock them. + */ + for (i = 0; i < ninodes; i++) { +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, (inum + i))); + + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); + continue; + } + + /* + * because this is an RCU protected lookup, we could + * find a recently freed or even reallocated inode + * during the lookup. We need to check under the + * i_flags_lock for a valid inode here. Skip it if it + * is not valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum + i || + __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. 
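+ * (We cannot simply block in xfs_ilock() here because we already hold
+ * the cluster buffer locked, so we drop the RCU read lock, delay(1)
+ * and retry the lookup instead.)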
+ */ + if (ip != free_ip && + !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + rcu_read_unlock(); + + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); + + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ + iip = ip->i_itemp; + if (!iip || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + ip->i_update_core = 0; + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } + + iip->ili_last_fields = iip->ili_format.ilf_fields; + iip->ili_format.ilf_fields = 0; + iip->ili_logged = 1; + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + + xfs_buf_attach_iodone(bp, xfs_istale_done, + &iip->ili_item); + + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_binval(tp, bp); + } + + xfs_perag_put(pag); +} + +/* + * This is called to return an inode to the inode free list. + * The inode should already be truncated to 0 length and have + * no pages associated with it. This routine also assumes that + * the inode is already a part of the transaction. + * + * The on-disk copy of the inode will have been added to the list + * of unlinked inodes in the AGI. We need to remove the inode from + * that list atomically with respect to freeing it here. + */ +int +xfs_ifree( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_bmap_free_t *flist) +{ + int error; + int delete; + xfs_ino_t first_ino; + xfs_dinode_t *dip; + xfs_buf_t *ibp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_transp == tp); + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_d.di_anextents == 0); + ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || + ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); + ASSERT(ip->i_d.di_nblocks == 0); + + /* + * Pull the on-disk inode from the AGI unlinked list. + */ + error = xfs_iunlink_remove(tp, ip); + if (error != 0) { + return error; + } + + error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + if (error != 0) { + return error; + } + ip->i_d.di_mode = 0; /* mark incore inode as free */ + ip->i_d.di_flags = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + /* + * Bump the generation count so no one will be confused + * by reincarnations of this inode. + */ + ip->i_d.di_gen++; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); + if (error) + return error; + + /* + * Clear the on-disk di_mode. This is to prevent xfs_bulkstat + * from picking up this inode when it is reclaimed (its incore state + * initialzed but not flushed to disk yet). The in-core di_mode is + * already cleared and a corresponding transaction logged. + * The hack here just synchronizes the in-core to on-disk + * di_mode value in advance before the actual inode sync to disk. + * This is OK because the inode is already unlinked and would never + * change its di_mode again for this inode generation. + * This is a temporary hack that would require a proper fix + * in the future. + */ + dip->di_mode = 0; + + if (delete) { + xfs_ifree_cluster(ip, tp, first_ino); + } + + return 0; +} + +/* + * Reallocate the space for if_broot based on the number of records + * being added or deleted as indicated in rec_diff. 
Move the records + * and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by + * the caller. When growing this will create holes to be filled in + * by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then + * if we adding records one will be allocated. The caller must also + * not request that the number of records go below zero, although + * it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * ext_diff -- the change in the number of records, positive or negative, + * requested for the if_broot array. + */ +void +xfs_iroot_realloc( + xfs_inode_t *ip, + int rec_diff, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + int cur_max; + xfs_ifork_t *ifp; + struct xfs_btree_block *new_broot; + int new_max; + size_t new_size; + char *np; + char *op; + + /* + * Handle the degenerate case quietly. + */ + if (rec_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (rec_diff > 0) { + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (ifp->if_broot_bytes == 0) { + new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); + ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + ifp->if_broot_bytes = (int)new_size; + return; + } + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, + (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ + KM_SLEEP | KM_NOFS); + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + (int)new_size); + ifp->if_broot_bytes = (int)new_size; + ASSERT(ifp->if_broot_bytes <= + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); + return; + } + + /* + * rec_diff is less than 0. In this case, we are shrinking the + * if_broot buffer. It must already exist. If we go to zero + * records, just get rid of the root and clear the status bit. + */ + ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + ASSERT(new_max >= 0); + if (new_max > 0) + new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + else + new_size = 0; + if (new_size > 0) { + new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + /* + * First copy over the btree block header. + */ + memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); + } else { + new_broot = NULL; + ifp->if_flags &= ~XFS_IFBROOT; + } + + /* + * Only copy the records and pointers if there are any. + */ + if (new_max > 0) { + /* + * First copy the records. + */ + op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); + np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); + memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); + + /* + * Then copy the pointers. 
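+ * (The pointer array floats with the size of the root block, which is
+ * why XFS_BMAP_BROOT_PTR_ADDR() is given the old and the new byte
+ * counts below.)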
+ */ + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, + (int)new_size); + memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); + } + kmem_free(ifp->if_broot); + ifp->if_broot = new_broot; + ifp->if_broot_bytes = (int)new_size; + ASSERT(ifp->if_broot_bytes <= + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + return; +} + + +/* + * This is called when the amount of space needed for if_data + * is increased or decreased. The change in size is indicated by + * the number of bytes that need to be added or deleted in the + * byte_diff parameter. + * + * If the amount of space needed has decreased below the size of the + * inline buffer, then switch to using the inline buffer. Otherwise, + * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * to what is needed. + * + * ip -- the inode whose if_data area is changing + * byte_diff -- the change in the number of bytes, positive or negative, + * requested for the if_data array. + */ +void +xfs_idata_realloc( + xfs_inode_t *ip, + int byte_diff, + int whichfork) +{ + xfs_ifork_t *ifp; + int new_size; + int real_size; + + if (byte_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + new_size = (int)ifp->if_bytes + byte_diff; + ASSERT(new_size >= 0); + + if (new_size == 0) { + if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + kmem_free(ifp->if_u1.if_data); + } + ifp->if_u1.if_data = NULL; + real_size = 0; + } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { + /* + * If the valid extents/data can fit in if_inline_ext/data, + * copy them from the malloc'd vector and free it. + */ + if (ifp->if_u1.if_data == NULL) { + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + ASSERT(ifp->if_real_bytes != 0); + memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, + new_size); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } + real_size = 0; + } else { + /* + * Stuck with malloc/realloc. + * For inline data, the underlying buffer must be + * a multiple of 4 bytes in size so that it can be + * logged and stay on word boundaries. We enforce + * that here. + */ + real_size = roundup(new_size, 4); + if (ifp->if_u1.if_data == NULL) { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + /* + * Only do the realloc if the underlying size + * is really changing. + */ + if (ifp->if_real_bytes != real_size) { + ifp->if_u1.if_data = + kmem_realloc(ifp->if_u1.if_data, + real_size, + ifp->if_real_bytes, + KM_SLEEP | KM_NOFS); + } + } else { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, + ifp->if_bytes); + } + } + ifp->if_real_bytes = real_size; + ifp->if_bytes = new_size; + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); +} + +void +xfs_idestroy_fork( + xfs_inode_t *ip, + int whichfork) +{ + xfs_ifork_t *ifp; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (ifp->if_broot != NULL) { + kmem_free(ifp->if_broot); + ifp->if_broot = NULL; + } + + /* + * If the format is local, then we can't have an extents + * array so just look for an inline data array. If we're + * not local then we may or may not have an extents list, + * so check and free it up if we do. 
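+ * (Data still held in the inline if_inline_data/if_inline_ext buffers
+ * was never allocated separately, so only heap copies are freed here.)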
+ */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && + (ifp->if_u1.if_data != NULL)) { + ASSERT(ifp->if_real_bytes != 0); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + ifp->if_real_bytes = 0; + } + } else if ((ifp->if_flags & XFS_IFEXTENTS) && + ((ifp->if_flags & XFS_IFEXTIREC) || + ((ifp->if_u1.if_extents != NULL) && + (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { + ASSERT(ifp->if_real_bytes != 0); + xfs_iext_destroy(ifp); + } + ASSERT(ifp->if_u1.if_extents == NULL || + ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); + ASSERT(ifp->if_real_bytes == 0); + if (whichfork == XFS_ATTR_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } +} + +/* + * This is called to unpin an inode. The caller must have the inode locked + * in at least shared mode so that the buffer cannot be subsequently pinned + * once someone is waiting for it to be unpinned. + */ +static void +xfs_iunpin_nowait( + struct xfs_inode *ip) +{ + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + + /* Give the log a push to start the unpinning I/O */ + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); + +} + +void +xfs_iunpin_wait( + struct xfs_inode *ip) +{ + if (xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); + } +} + +/* + * xfs_iextents_copy() + * + * This is called to copy the REAL extents (as opposed to the delayed + * allocation extents) from the inode into the given buffer. It + * returns the number of bytes copied into the buffer. + * + * If there are no delayed allocation extents, then we can just + * memcpy() the extents into the buffer. Otherwise, we need to + * examine each extent in turn and skip those which are delayed. + */ +int +xfs_iextents_copy( + xfs_inode_t *ip, + xfs_bmbt_rec_t *dp, + int whichfork) +{ + int copied; + int i; + xfs_ifork_t *ifp; + int nrecs; + xfs_fsblock_t start_block; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(ifp->if_bytes > 0); + + nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); + ASSERT(nrecs > 0); + + /* + * There are some delayed allocation extents in the + * inode, so copy the extents one at a time and skip + * the delayed ones. There must be at least one + * non-delayed extent. + */ + copied = 0; + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + start_block = xfs_bmbt_get_startblock(ep); + if (isnullstartblock(start_block)) { + /* + * It's a delayed allocation extent, so skip it. + */ + continue; + } + + /* Translate to on disk format */ + put_unaligned(cpu_to_be64(ep->l0), &dp->l0); + put_unaligned(cpu_to_be64(ep->l1), &dp->l1); + dp++; + copied++; + } + ASSERT(copied != 0); + xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); + + return (copied * (uint)sizeof(xfs_bmbt_rec_t)); +} + +/* + * Each of the following cases stores data into the same region + * of the on-disk inode, so only one of them can be valid at + * any given time. While it is possible to have conflicting formats + * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is + * in EXTENTS format, this can only happen when the fork has + * changed formats after being modified but before being flushed. 
+ * In these cases, the format always takes precedence, because the + * format indicates the current state of the fork. + */ +/*ARGSUSED*/ +STATIC void +xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, + xfs_inode_log_item_t *iip, + int whichfork, + xfs_buf_t *bp) +{ + char *cp; + xfs_ifork_t *ifp; + xfs_mount_t *mp; +#ifdef XFS_TRANS_DEBUG + int first; +#endif + static const short brootflag[2] = + { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; + static const short dataflag[2] = + { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; + static const short extflag[2] = + { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + + if (!iip) + return; + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * This can happen if we gave up in iformat in an error path, + * for the attribute fork. + */ + if (!ifp) { + ASSERT(whichfork == XFS_ATTR_FORK); + return; + } + cp = XFS_DFORK_PTR(dip, whichfork); + mp = ip->i_mount; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + } + break; + + case XFS_DINODE_FMT_EXTENTS: + ASSERT((ifp->if_flags & XFS_IFEXTENTS) || + !(iip->ili_format.ilf_fields & extflag[whichfork])); + if ((iip->ili_format.ilf_fields & extflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(xfs_iext_get_ext(ifp, 0)); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, + whichfork); + } + break; + + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && + (ifp->if_broot_bytes > 0)) { + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes <= + (XFS_IFORK_SIZE(ip, whichfork) + + XFS_BROOT_SIZE_ADJ)); + xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, + (xfs_bmdr_block_t *)cp, + XFS_DFORK_SIZE(dip, mp, whichfork)); + } + break; + + case XFS_DINODE_FMT_DEV: + if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + ASSERT(whichfork == XFS_DATA_FORK); + xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); + } + break; + + case XFS_DINODE_FMT_UUID: + if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + ASSERT(whichfork == XFS_DATA_FORK); + memcpy(XFS_DFORK_DPTR(dip), + &ip->i_df.if_u2.if_uuid, + sizeof(uuid_t)); + } + break; + + default: + ASSERT(0); + break; + } +} + +STATIC int +xfs_iflush_cluster( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_mount_t *mp = ip->i_mount; + struct xfs_perag *pag; + unsigned long first_index, mask; + unsigned long inodes_per_cluster; + int ilist_size; + xfs_inode_t **ilist; + xfs_inode_t *iq; + int nr_found; + int clcount = 0; + int bufwasdelwri; + int i; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); + if (!ilist) + goto out_put; + + mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + rcu_read_lock(); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + first_index, inodes_per_cluster); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + iq = ilist[i]; + if (iq == ip) + continue; + + /* + * because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the 
lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino || + (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Do an un-protected check to see if the inode is dirty and + * is a candidate for flushing. These checks will be repeated + * later after the appropriate locks are acquired. + */ + if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + continue; + + /* + * Try to get locks. If any are unavailable or it is pinned, + * then this inode cannot be flushed and is skipped. + */ + + if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(iq)) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + if (xfs_ipincount(iq)) { + xfs_ifunlock(iq); + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + + /* + * arriving here means that this inode can be flushed. First + * re-check that it's dirty before flushing. + */ + if (!xfs_inode_clean(iq)) { + int error; + error = xfs_iflush_int(iq, bp); + if (error) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + goto cluster_corrupt_out; + } + clcount++; + } else { + xfs_ifunlock(iq); + } + xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + + if (clcount) { + XFS_STATS_INC(xs_icluster_flushcnt); + XFS_STATS_ADD(xs_icluster_flushinode, clcount); + } + +out_free: + rcu_read_unlock(); + kmem_free(ilist); +out_put: + xfs_perag_put(pag); + return 0; + + +cluster_corrupt_out: + /* + * Corruption detected in the clustering loop. Invalidate the + * inode buffer and shut down the filesystem. + */ + rcu_read_unlock(); + /* + * Clean up the buffer. If it was B_DELWRI, just release it -- + * brelse can handle it with no problems. If not, shut down the + * filesystem before releasing the buffer. + */ + bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + if (bufwasdelwri) + xfs_buf_relse(bp); + + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + + if (!bufwasdelwri) { + /* + * Just like incore_relse: if we have b_iodone functions, + * mark the buffer as an error and call them. Otherwise + * mark it as stale and brelse. + */ + if (XFS_BUF_IODONE_FUNC(bp)) { + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_ERROR(bp,EIO); + xfs_buf_ioend(bp, 0); + } else { + XFS_BUF_STALE(bp); + xfs_buf_relse(bp); + } + } + + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(iq); + kmem_free(ilist); + xfs_perag_put(pag); + return XFS_ERROR(EFSCORRUPTED); +} + +/* + * xfs_iflush() will write a modified inode's changes out to the + * inode's on disk home. The caller must have the inode lock held + * in at least shared mode and the inode flush completion must be + * active as well. The inode lock will still be held upon return from + * the call and the caller is free to unlock it. + * The inode flush will be completed when the inode reaches the disk. + * The flags indicate how the inode's buffer should be written out. + */ +int +xfs_iflush( + xfs_inode_t *ip, + uint flags) +{ + xfs_inode_log_item_t *iip; + xfs_buf_t *bp; + xfs_dinode_t *dip; + xfs_mount_t *mp; + int error; + + XFS_STATS_INC(xs_iflush_count); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(!completion_done(&ip->i_flush)); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > ip->i_df.if_ext_max); + + iip = ip->i_itemp; + mp = ip->i_mount; + + /* + * We can't flush the inode until it is unpinned, so wait for it if we + * are allowed to block. 
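The clustering loop in xfs_iflush_cluster() above is deliberately opportunistic: every neighbouring inode is taken with trylock only, and anything busy, pinned or clean is skipped rather than waited on, so piggy-backing extra inodes onto the write can never stall it. A userspace analogue of that pattern, with a single pthread mutex standing in for the separate inode and flush locks, might look like the sketch below; the lock handling is simplified relative to the kernel's.

#include <pthread.h>
#include <stdbool.h>

/* Minimal stand-in for a cached object that may be flushed opportunistically. */
struct cached_inode {
	pthread_mutex_t	lock;
	bool		dirty;
	int		pin_count;
};

/*
 * Try to piggy-back neighbours onto a write already being issued: take each
 * lock only with trylock, skip anything busy, pinned or clean, and never
 * block.  Returns how many neighbours were flushed.
 */
static int flush_neighbours(struct cached_inode **list, int n,
			    int (*flush)(struct cached_inode *))
{
	int i, flushed = 0;

	for (i = 0; i < n; i++) {
		struct cached_inode *iq = list[i];

		if (pthread_mutex_trylock(&iq->lock))
			continue;		/* busy: skip, do not wait */
		if (iq->dirty && iq->pin_count == 0 && flush(iq) == 0)
			flushed++;
		pthread_mutex_unlock(&iq->lock);
	}
	return flushed;
}

The primary inode is already committed to being written when this runs; the neighbours are strictly best-effort, which is why failing to lock one is never an error.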
We know no one new can pin it, because we are + * holding the inode lock shared and you need to hold it exclusively to + * pin the inode. + * + * If we are not allowed to block, force the log out asynchronously so + * that when we come back the inode will be unpinned. If other inodes + * in the same cluster are dirty, they will probably write the inode + * out for us if they occur after the log force completes. + */ + if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + xfs_ifunlock(ip); + return EAGAIN; + } + xfs_iunpin_wait(ip); + + /* + * For stale inodes we cannot rely on the backing buffer remaining + * stale in cache for the remaining life of the stale inode and so + * xfs_itobp() below may give us a buffer that no longer contains + * inodes below. We have to check this after ensuring the inode is + * unpinned so that it is safe to reclaim the stale inode after the + * flush call. + */ + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_ifunlock(ip); + return 0; + } + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this inode + * to disk, because the log record didn't make it to disk! + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + ip->i_update_core = 0; + if (iip) + iip->ili_format.ilf_fields = 0; + xfs_ifunlock(ip); + return XFS_ERROR(EIO); + } + + /* + * Get the buffer containing the on-disk inode. + */ + error = xfs_itobp(mp, NULL, ip, &dip, &bp, + (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK); + if (error || !bp) { + xfs_ifunlock(ip); + return error; + } + + /* + * First flush out the inode that xfs_iflush was called with. + */ + error = xfs_iflush_int(ip, bp); + if (error) + goto corrupt_out; + + /* + * If the buffer is pinned then push on the log now so we won't + * get stuck waiting in the write for too long. + */ + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); + + /* + * inode clustering: + * see if other inodes can be gathered into this write + */ + error = xfs_iflush_cluster(ip, bp); + if (error) + goto cluster_corrupt_out; + + if (flags & SYNC_WAIT) + error = xfs_bwrite(mp, bp); + else + xfs_bdwrite(mp, bp); + return error; + +corrupt_out: + xfs_buf_relse(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +cluster_corrupt_out: + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(ip); + return XFS_ERROR(EFSCORRUPTED); +} + + +STATIC int +xfs_iflush_int( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_inode_log_item_t *iip; + xfs_dinode_t *dip; + xfs_mount_t *mp; +#ifdef XFS_TRANS_DEBUG + int first; +#endif + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(!completion_done(&ip->i_flush)); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > ip->i_df.if_ext_max); + + iip = ip->i_itemp; + mp = ip->i_mount; + + /* set *dip = inode's place in the buffer */ + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + + /* + * Clear i_update_core before copying out the data. + * This is for coordination with our timestamp updates + * that don't hold the inode lock. They will always + * update the timestamps BEFORE setting i_update_core, + * so if we clear i_update_core after they set it we + * are guaranteed to see their updates to the timestamps. + * I believe that this depends on strongly ordered memory + * semantics, but we have that. We use the SYNCHRONIZE + * macro to make sure that the compiler does not reorder + * the i_update_core access below the data copy below. 
+ */ + ip->i_update_core = 0; + SYNCHRONIZE(); + + /* + * Make sure to get the latest timestamps from the Linux inode. + */ + xfs_synchronize_times(ip); + + if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, + mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", + __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, + mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", + __func__, ip->i_ino, ip, ip->i_d.di_magic); + goto corrupt_out; + } + if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad regular inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); + goto corrupt_out; + } + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad directory inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); + goto corrupt_out; + } + } + if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, + XFS_RANDOM_IFLUSH_5)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: detected corrupt incore inode %Lu, " + "total extents = %d, nblocks = %Ld, ptr 0x%p", + __func__, ip->i_ino, + ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_d.di_nblocks, ip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, + mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", + __func__, ip->i_ino, ip->i_d.di_forkoff, ip); + goto corrupt_out; + } + /* + * bump the flush iteration count, used to detect flushes which + * postdate a log record during recovery. + */ + + ip->i_d.di_flushiter++; + + /* + * Copy the dirty parts of the inode into the on-disk + * inode. We always copy out the core of the inode, + * because if the inode is dirty at all the core must + * be. + */ + xfs_dinode_to_disk(dip, &ip->i_d); + + /* Wrap, we never let the log put out DI_MAX_FLUSH */ + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) + ip->i_d.di_flushiter = 0; + + /* + * If this is really an old format inode and the superblock version + * has not been updated to support only new format inodes, then + * convert back to the old inode format. If the superblock version + * has been updated, then make the conversion permanent. + */ + ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); + if (ip->i_d.di_version == 1) { + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + /* + * Convert it back. + */ + ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); + dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); + } else { + /* + * The superblock version has already been bumped, + * so just make the conversion to the new inode + * format permanent. 
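The i_update_core handling near the top of xfs_iflush_int() above depends on store ordering: the flag is cleared before the incore data is copied out, while lock-free updaters store their timestamps before setting the flag, so a racing update either lands in this copy or leaves the flag set for a later flush. The sketch below is a rough userspace model of that discipline using C11 atomics, with an explicit fence playing the role the SYNCHRONIZE() barrier has above; it illustrates the ordering argument, not the kernel's actual barrier implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct shadow_inode {
	_Atomic uint64_t mtime_ns;	/* payload updated outside the lock */
	atomic_bool	 update_core;	/* "payload is dirty" flag */
};

/* Updater: publish the new timestamp first, then raise the dirty flag. */
static void touch_mtime(struct shadow_inode *ip, uint64_t now_ns)
{
	atomic_store_explicit(&ip->mtime_ns, now_ns, memory_order_relaxed);
	atomic_store_explicit(&ip->update_core, true, memory_order_release);
}

/* Flusher: clear the flag, fence, then snapshot the payload. */
static uint64_t flush_core(struct shadow_inode *ip)
{
	atomic_store_explicit(&ip->update_core, false, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* ~ SYNCHRONIZE() */
	return atomic_load_explicit(&ip->mtime_ns, memory_order_relaxed);
}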
+ */ + ip->i_d.di_version = 2; + dip->di_version = 2; + ip->i_d.di_onlink = 0; + dip->di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + memset(&(dip->di_pad[0]), 0, + sizeof(dip->di_pad)); + ASSERT(xfs_get_projid(ip) == 0); + } + } + + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); + if (XFS_IFORK_Q(ip)) + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); + xfs_inobp_check(mp, bp); + + /* + * We've recorded everything logged in the inode, so we'd + * like to clear the ilf_fields bits so we don't log and + * flush things unnecessarily. However, we can't stop + * logging all this information until the data we've copied + * into the disk buffer is written to disk. If we did we might + * overwrite the copy of the inode in the log with all the + * data after re-logging only part of it, and in the face of + * a crash we wouldn't have all the data we need to recover. + * + * What we do is move the bits to the ili_last_fields field. + * When logging the inode, these bits are moved back to the + * ilf_fields field. In the xfs_iflush_done() routine we + * clear ili_last_fields, since we know that the information + * those bits represent is permanently on disk. As long as + * the flush completes before the inode is logged again, then + * both ilf_fields and ili_last_fields will be cleared. + * + * We can play with the ilf_fields bits here, because the inode + * lock must be held exclusively in order to set bits there + * and the flush lock protects the ili_last_fields bits. + * Set ili_logged so the flush done + * routine can tell whether or not to look in the AIL. + * Also, store the current LSN of the inode so that we can tell + * whether the item has moved in the AIL from xfs_iflush_done(). + * In order to read the lsn we need the AIL lock, because + * it is a 64 bit value that cannot be read atomically. + */ + if (iip != NULL && iip->ili_format.ilf_fields != 0) { + iip->ili_last_fields = iip->ili_format.ilf_fields; + iip->ili_format.ilf_fields = 0; + iip->ili_logged = 1; + + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + + /* + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. + */ + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); + } else { + /* + * We're flushing an inode which is not in the AIL and has + * not been logged but has i_update_core set. For this + * case we can use a B_DELWRI flush and immediately drop + * the inode flush lock because we can avoid the whole + * AIL state thing. It's OK to drop the flush lock now, + * because we've already locked the buffer and to do anything + * you really need both. 
+ */ + if (iip != NULL) { + ASSERT(iip->ili_logged == 0); + ASSERT(iip->ili_last_fields == 0); + ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); + } + xfs_ifunlock(ip); + } + + return 0; + +corrupt_out: + return XFS_ERROR(EFSCORRUPTED); +} + +void +xfs_promote_inode( + struct xfs_inode *ip) +{ + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + + bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, + ip->i_imap.im_len, XBF_TRYLOCK); + if (!bp) + return; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + xfs_buf_delwri_promote(bp); + wake_up_process(ip->i_mount->m_ddev_targp->bt_task); + } + + xfs_buf_relse(bp); +} + +/* + * Return a pointer to the extent record at file index idx. + */ +xfs_bmbt_rec_host_t * +xfs_iext_get_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx) /* index of target extent */ +{ + ASSERT(idx >= 0); + ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + + if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { + return ifp->if_u1.if_ext_irec->er_extbuf; + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_ext_irec_t *erp; /* irec pointer */ + int erp_idx = 0; /* irec index */ + xfs_extnum_t page_idx = idx; /* ext index in target list */ + + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + return &erp->er_extbuf[page_idx]; + } else if (ifp->if_bytes) { + return &ifp->if_u1.if_extents[idx]; + } else { + return NULL; + } +} + +/* + * Insert new item(s) into the extent records for incore inode + * fork 'ifp'. 'count' new items are inserted at index 'idx'. + */ +void +xfs_iext_insert( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t i; /* extent record index */ + + trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); + + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_iext_add(ifp, idx, count); + for (i = idx; i < idx + count; i++, new++) + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be increased. The ext_diff parameter stores the + * number of new extents being added and the idx parameter contains + * the extent index where the new extents will be added. If the new + * extents are being appended, then we just need to (re)allocate and + * initialize the space. Otherwise, if the new extents are being + * inserted into the middle of the existing entries, a bit more work + * is required to make room for the new extents to be inserted. The + * caller is responsible for filling in the new extent entries upon + * return. + */ +void +xfs_iext_add( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin adding exts */ + int ext_diff) /* number of extents to add */ +{ + int byte_diff; /* new bytes being added */ + int new_size; /* size of extents after adding */ + xfs_extnum_t nextents; /* number of extents in file */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT((idx >= 0) && (idx <= nextents)); + byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); + new_size = ifp->if_bytes + byte_diff; + /* + * If the new number of extents (nextents + ext_diff) + * fits inside the inode, then continue to use the inline + * extent buffer. 
+ */ + if (nextents + ext_diff <= XFS_INLINE_EXTS) { + if (idx < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], + &ifp->if_u2.if_inline_ext[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); + } + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; + } + /* + * Otherwise use a linear (direct) extent list. + * If the extents are currently inside the inode, + * xfs_iext_realloc_direct will switch us from + * inline to direct extent allocation mode. + */ + else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, new_size); + if (idx < nextents) { + memmove(&ifp->if_u1.if_extents[idx + ext_diff], + &ifp->if_u1.if_extents[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); + } + } + /* Indirection array */ + else { + xfs_ext_irec_t *erp; + int erp_idx = 0; + int page_idx = idx; + + ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); + if (ifp->if_flags & XFS_IFEXTIREC) { + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); + } else { + xfs_iext_irec_init(ifp); + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = ifp->if_u1.if_ext_irec; + } + /* Extents fit in target extent page */ + if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { + if (page_idx < erp->er_extcount) { + memmove(&erp->er_extbuf[page_idx + ext_diff], + &erp->er_extbuf[page_idx], + (erp->er_extcount - page_idx) * + sizeof(xfs_bmbt_rec_t)); + memset(&erp->er_extbuf[page_idx], 0, byte_diff); + } + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + } + /* Insert a new extent page */ + else if (erp) { + xfs_iext_add_indirect_multi(ifp, + erp_idx, page_idx, ext_diff); + } + /* + * If extent(s) are being appended to the last page in + * the indirection array and the new extent(s) don't fit + * in the page, then erp is NULL and erp_idx is set to + * the next index needed in the indirection array. + */ + else { + int count = ext_diff; + + while (count) { + erp = xfs_iext_irec_new(ifp, erp_idx); + erp->er_extcount = count; + count -= MIN(count, (int)XFS_LINEAR_EXTS); + if (count) { + erp_idx++; + } + } + } + } + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being added to the indirection + * array and the new extents do not fit in the target extent list. The + * erp_idx parameter contains the irec index for the target extent list + * in the indirection array, and the idx parameter contains the extent + * index within the list. The number of extents being added is stored + * in the count parameter. 
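For the inline and direct cases, xfs_iext_add() above boils down to opening a zeroed hole in an array: shift the tail up with memmove() when the insertion point is not at the end, then clear the gap for the caller to fill. That step in isolation, over a simplified fixed-size record:

#include <string.h>
#include <stdint.h>

struct rec { uint64_t l0, l1; };

/*
 * Open a hole of 'count' records at index 'idx' in an array currently
 * holding 'nrecs' records; the backing capacity is assumed to be at least
 * nrecs + count.  The hole is zeroed and filled in by the caller.
 */
static void open_hole(struct rec *recs, int nrecs, int idx, int count)
{
	if (idx < nrecs)
		memmove(&recs[idx + count], &recs[idx],
			(nrecs - idx) * sizeof(struct rec));
	memset(&recs[idx], 0, count * sizeof(struct rec));
}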
+ * + * |-------| |-------| + * | | | | idx - number of extents before idx + * | idx | | count | + * | | | | count - number of extents being inserted at idx + * |-------| |-------| + * | count | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_add_indirect_multi( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* target extent irec index */ + xfs_extnum_t idx, /* index within target list */ + int count) /* new extents being added */ +{ + int byte_diff; /* new bytes being added */ + xfs_ext_irec_t *erp; /* pointer to irec entry */ + xfs_extnum_t ext_diff; /* number of extents to add */ + xfs_extnum_t ext_cnt; /* new extents still needed */ + xfs_extnum_t nex2; /* extents after idx + count */ + xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ + int nlists; /* number of irec's (lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex2 = erp->er_extcount - idx; + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* + * Save second part of target extent list + * (all extents past */ + if (nex2) { + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); + memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); + erp->er_extcount -= nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); + memset(&erp->er_extbuf[idx], 0, byte_diff); + } + + /* + * Add the new extents to the end of the target + * list, then allocate new irec record(s) and + * extent buffer(s) as needed to store the rest + * of the new extents. + */ + ext_cnt = count; + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); + if (ext_diff) { + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + while (ext_cnt) { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); + erp->er_extcount = ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + + /* Add nex2 extents back to indirection array */ + if (nex2) { + xfs_extnum_t ext_avail; + int i; + + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; + i = 0; + /* + * If nex2 extents fit in the current page, append + * nex2_ep after the new extents. + */ + if (nex2 <= ext_avail) { + i = erp->er_extcount; + } + /* + * Otherwise, check if space is available in the + * next page. + */ + else if ((erp_idx < nlists - 1) && + (nex2 <= (ext_avail = XFS_LINEAR_EXTS - + ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { + erp_idx++; + erp++; + /* Create a hole for nex2 extents */ + memmove(&erp->er_extbuf[nex2], erp->er_extbuf, + erp->er_extcount * sizeof(xfs_bmbt_rec_t)); + } + /* + * Final choice, create a new extent page for + * nex2 extents. + */ + else { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + } + memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); + kmem_free(nex2_ep); + erp->er_extcount += nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); + } +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be decreased. The ext_diff parameter stores the + * number of extents to be removed and the idx parameter contains + * the extent index where the extents will be removed from. + * + * If the amount of space needed has decreased below the linear + * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous + * extent array. 
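When new records overflow the current page, both the tail of xfs_iext_add() and xfs_iext_add_indirect_multi() above hand the remainder out to fresh pages one capacity-sized chunk at a time. Stripped of the allocation and er_extoff bookkeeping, the chunking loop is just the following, where EXTS_PER_PAGE is an assumed stand-in for XFS_LINEAR_EXTS:

#define EXTS_PER_PAGE	256	/* stand-in for XFS_LINEAR_EXTS */

/* Returns the number of new pages needed to absorb 'count' extra records. */
static int pages_for_overflow(int count)
{
	int pages = 0;

	while (count > 0) {
		int chunk = count < EXTS_PER_PAGE ? count : EXTS_PER_PAGE;

		/*
		 * A real implementation would allocate a page here and
		 * record 'chunk' as its er_extcount.
		 */
		count -= chunk;
		pages++;
	}
	return pages;
}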
Otherwise, use kmem_realloc() to adjust the + * size to what is needed. + */ +void +xfs_iext_remove( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff, /* number of extents to remove */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + trace_xfs_iext_remove(ip, idx, state, _RET_IP_); + + ASSERT(ext_diff > 0); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_iext_remove_indirect(ifp, idx, ext_diff); + } else if (ifp->if_real_bytes) { + xfs_iext_remove_direct(ifp, idx, ext_diff); + } else { + xfs_iext_remove_inline(ifp, idx, ext_diff); + } + ifp->if_bytes = new_size; +} + +/* + * This removes ext_diff extents from the inline buffer, beginning + * at extent index idx. + */ +void +xfs_iext_remove_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + int nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + ASSERT(idx < XFS_INLINE_EXTS); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(((nextents - ext_diff) > 0) && + (nextents - ext_diff) < XFS_INLINE_EXTS); + + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx], + &ifp->if_u2.if_inline_ext[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + } else { + memset(&ifp->if_u2.if_inline_ext[idx], 0, + ext_diff * sizeof(xfs_bmbt_rec_t)); + } +} + +/* + * This removes ext_diff extents from a linear (direct) extent list, + * beginning at extent index idx. If the extents are being removed + * from the end of the list (ie. truncate) then we just need to re- + * allocate the list to remove the extra space. Otherwise, if the + * extents are being removed from the middle of the existing extent + * entries, then we first need to move the extent records beginning + * at idx + ext_diff up in the list to overwrite the records being + * removed, then remove the extra space via kmem_realloc. + */ +void +xfs_iext_remove_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + new_size = ifp->if_bytes - + (ext_diff * sizeof(xfs_bmbt_rec_t)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + return; + } + /* Move extents up in the list (if needed) */ + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u1.if_extents[idx], + &ifp->if_u1.if_extents[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + } + memset(&ifp->if_u1.if_extents[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + /* + * Reallocate the direct extent list. If the extents + * will fit inside the inode then xfs_iext_realloc_direct + * will switch from direct to inline extent allocation + * mode for us. 
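xfs_iext_remove_inline() and xfs_iext_remove_direct() above close the gap left by removed records by copying the tail down and zeroing the slots that are no longer used. The same operation over a plain array:

#include <string.h>
#include <stdint.h>

struct rec { uint64_t l0, l1; };

/*
 * Remove 'count' records starting at 'idx' from an array holding 'nrecs'
 * records (idx + count <= nrecs assumed).  Returns the new record count.
 */
static int remove_range(struct rec *recs, int nrecs, int idx, int count)
{
	if (idx + count < nrecs)
		memmove(&recs[idx], &recs[idx + count],
			(nrecs - (idx + count)) * sizeof(struct rec));
	memset(&recs[nrecs - count], 0, count * sizeof(struct rec));
	return nrecs - count;
}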
+ */ + xfs_iext_realloc_direct(ifp, new_size); + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being removed from the + * indirection array and the extents being removed span multiple extent + * buffers. The idx parameter contains the file extent index where we + * want to begin removing extents, and the count parameter contains + * how many extents need to be removed. + * + * |-------| |-------| + * | nex1 | | | nex1 - number of extents before idx + * |-------| | count | + * | | | | count - number of extents being removed at idx + * | count | |-------| + * | | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_remove_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing extents */ + int count) /* number of extents to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int erp_idx = 0; /* indirection array index */ + xfs_extnum_t ext_cnt; /* extents left to remove */ + xfs_extnum_t ext_diff; /* extents to remove in current list */ + xfs_extnum_t nex1; /* number of extents before idx */ + xfs_extnum_t nex2; /* extents after idx + count */ + int page_idx = idx; /* index in target extent list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + ASSERT(erp != NULL); + nex1 = page_idx; + ext_cnt = count; + while (ext_cnt) { + nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); + ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); + /* + * Check for deletion of entire list; + * xfs_iext_irec_remove() updates extent offsets. + */ + if (ext_diff == erp->er_extcount) { + xfs_iext_irec_remove(ifp, erp_idx); + ext_cnt -= ext_diff; + nex1 = 0; + if (ext_cnt) { + ASSERT(erp_idx < ifp->if_real_bytes / + XFS_IEXT_BUFSZ); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex1 = 0; + continue; + } else { + break; + } + } + /* Move extents up (if needed) */ + if (nex2) { + memmove(&erp->er_extbuf[nex1], + &erp->er_extbuf[nex1 + ext_diff], + nex2 * sizeof(xfs_bmbt_rec_t)); + } + /* Zero out rest of page */ + memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - + ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); + /* Update remaining counters */ + erp->er_extcount -= ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); + ext_cnt -= ext_diff; + nex1 = 0; + erp_idx++; + erp++; + } + ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); + xfs_iext_irec_compact(ifp); +} + +/* + * Create, destroy, or resize a linear (direct) block of extents. 
+ */ +void +xfs_iext_realloc_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new size of extents */ +{ + int rnew_size; /* real new size of extents */ + + rnew_size = new_size; + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || + ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && + (new_size != ifp->if_real_bytes))); + + /* Free extent records */ + if (new_size == 0) { + xfs_iext_destroy(ifp); + } + /* Resize direct extent list and zero any new bytes */ + else if (ifp->if_real_bytes) { + /* Check if extents will fit inside the inode */ + if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { + xfs_iext_direct_to_inline(ifp, new_size / + (uint)sizeof(xfs_bmbt_rec_t)); + ifp->if_bytes = new_size; + return; + } + if (!is_power_of_2(new_size)){ + rnew_size = roundup_pow_of_two(new_size); + } + if (rnew_size != ifp->if_real_bytes) { + ifp->if_u1.if_extents = + kmem_realloc(ifp->if_u1.if_extents, + rnew_size, + ifp->if_real_bytes, KM_NOFS); + } + if (rnew_size > ifp->if_real_bytes) { + memset(&ifp->if_u1.if_extents[ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)], 0, + rnew_size - ifp->if_real_bytes); + } + } + /* + * Switch from the inline extent buffer to a direct + * extent list. Be sure to include the inline extent + * bytes in new_size. + */ + else { + new_size += ifp->if_bytes; + if (!is_power_of_2(new_size)) { + rnew_size = roundup_pow_of_two(new_size); + } + xfs_iext_inline_to_direct(ifp, rnew_size); + } + ifp->if_real_bytes = rnew_size; + ifp->if_bytes = new_size; +} + +/* + * Switch from linear (direct) extent records to inline buffer. + */ +void +xfs_iext_direct_to_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t nextents) /* number of extents in file */ +{ + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(nextents <= XFS_INLINE_EXTS); + /* + * The inline buffer was zeroed when we switched + * from inline to direct extent allocation mode, + * so we don't need to clear it here. + */ + memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, + nextents * sizeof(xfs_bmbt_rec_t)); + kmem_free(ifp->if_u1.if_extents); + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; +} + +/* + * Switch from inline buffer to linear (direct) extent records. + * new_size should already be rounded up to the next power of 2 + * by the caller (when appropriate), so use new_size as it is. + * However, since new_size may be rounded up, we can't update + * if_bytes here. It is the caller's responsibility to update + * if_bytes upon return. + */ +void +xfs_iext_inline_to_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* number of extents in file */ +{ + ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); + memset(ifp->if_u1.if_extents, 0, new_size); + if (ifp->if_bytes) { + memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, + ifp->if_bytes); + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_real_bytes = new_size; +} + +/* + * Resize an extent indirection array to new_size bytes. 
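xfs_iext_realloc_direct() above keeps the direct list's backing allocation at a power-of-two byte size, so repeated growth reallocates geometrically rather than once per record. The sizing policy on its own, with local helpers standing in for the kernel's is_power_of_2()/roundup_pow_of_two():

#include <stddef.h>

static int is_pow2(size_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static size_t roundup_pow2(size_t n)
{
	size_t p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/*
 * Pick the real allocation size backing 'new_size' payload bytes.
 * A zero new_size never reaches this point; that case frees the list.
 */
static size_t direct_alloc_size(size_t new_size)
{
	return is_pow2(new_size) ? new_size : roundup_pow2(new_size);
}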
+ */ +STATIC void +xfs_iext_realloc_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new indirection array size */ +{ + int nlists; /* number of irec's (ex lists) */ + int size; /* current indirection array size */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + size = nlists * sizeof(xfs_ext_irec_t); + ASSERT(ifp->if_real_bytes); + ASSERT((new_size >= 0) && (new_size != size)); + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else { + ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) + kmem_realloc(ifp->if_u1.if_ext_irec, + new_size, size, KM_NOFS); + } +} + +/* + * Switch from indirection array to linear (direct) extent allocations. + */ +STATIC void +xfs_iext_indirect_to_direct( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + int size; /* size of file extents */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + size = nextents * sizeof(xfs_bmbt_rec_t); + + xfs_iext_irec_compact_pages(ifp); + ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); + + ep = ifp->if_u1.if_ext_irec->er_extbuf; + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; + ifp->if_u1.if_extents = ep; + ifp->if_bytes = size; + if (nextents < XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, size); + } +} + +/* + * Free incore file extents. + */ +void +xfs_iext_destroy( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + if (ifp->if_flags & XFS_IFEXTIREC) { + int erp_idx; + int nlists; + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { + xfs_iext_irec_remove(ifp, erp_idx); + } + ifp->if_flags &= ~XFS_IFEXTIREC; + } else if (ifp->if_real_bytes) { + kmem_free(ifp->if_u1.if_extents); + } else if (ifp->if_bytes) { + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_u1.if_extents = NULL; + ifp->if_real_bytes = 0; + ifp->if_bytes = 0; +} + +/* + * Return a pointer to the extent record for file system block bno. 
+ */ +xfs_bmbt_rec_host_t * /* pointer to found extent record */ +xfs_iext_bno_to_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + xfs_extnum_t *idxp) /* index of target extent */ +{ + xfs_bmbt_rec_host_t *base; /* pointer to first extent */ + xfs_filblks_t blockcount = 0; /* number of blocks in extent */ + xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + int high; /* upper boundary in search */ + xfs_extnum_t idx = 0; /* index of target extent */ + int low; /* lower boundary in search */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_fileoff_t startoff = 0; /* start offset of extent */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *idxp = 0; + return NULL; + } + low = 0; + if (ifp->if_flags & XFS_IFEXTIREC) { + /* Find target extent list */ + int erp_idx = 0; + erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); + base = erp->er_extbuf; + high = erp->er_extcount - 1; + } else { + base = ifp->if_u1.if_extents; + high = nextents - 1; + } + /* Binary search extent records */ + while (low <= high) { + idx = (low + high) >> 1; + ep = base + idx; + startoff = xfs_bmbt_get_startoff(ep); + blockcount = xfs_bmbt_get_blockcount(ep); + if (bno < startoff) { + high = idx - 1; + } else if (bno >= startoff + blockcount) { + low = idx + 1; + } else { + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + *idxp = idx; + return ep; + } + } + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + if (bno >= startoff + blockcount) { + if (++idx == nextents) { + ep = NULL; + } else { + ep = xfs_iext_get_ext(ifp, idx); + } + } + *idxp = idx; + return ep; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record for filesystem block bno. Store the index of the + * target irec in *erp_idxp. + */ +xfs_ext_irec_t * /* pointer to found extent record */ +xfs_iext_bno_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + int *erp_idxp) /* irec index of target ext list */ +{ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + xfs_ext_irec_t *erp_next; /* next indirection array entry */ + int erp_idx; /* indirection array index */ + int nlists; /* number of extent irec's (lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; + if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { + high = erp_idx - 1; + } else if (erp_next && bno >= + xfs_bmbt_get_startoff(erp_next->er_extbuf)) { + low = erp_idx + 1; + } else { + break; + } + } + *erp_idxp = erp_idx; + return erp; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record at file extent index *idxp. Store the index of the + * target irec in *erp_idxp and store the page index of the target + * extent record in *idxp. 
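The lookup in xfs_iext_bno_to_ext() above is a binary search over records ordered by start offset, each treated as the half-open block range [startoff, startoff + blockcount). A simplified version of the search follows; the original additionally reports the index of the following extent when bno falls in a hole, a detail omitted here.

#include <stdint.h>
#include <stddef.h>

struct ext {
	uint64_t startoff;	/* first block covered */
	uint64_t blockcount;	/* number of blocks covered */
};

/*
 * Find the extent containing block 'bno', or NULL if it falls in a hole
 * or beyond the last extent.  '*idxp' is set to the last index probed.
 */
static struct ext *ext_for_bno(struct ext *exts, int nexts,
			       uint64_t bno, int *idxp)
{
	int low = 0, high = nexts - 1, idx = 0;

	while (low <= high) {
		idx = (low + high) / 2;
		if (bno < exts[idx].startoff)
			high = idx - 1;
		else if (bno >= exts[idx].startoff + exts[idx].blockcount)
			low = idx + 1;
		else {
			*idxp = idx;
			return &exts[idx];
		}
	}
	*idxp = idx;
	return NULL;
}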
+ */ +xfs_ext_irec_t * +xfs_iext_idx_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t *idxp, /* extent index (file -> page) */ + int *erp_idxp, /* pointer to target irec */ + int realloc) /* new bytes were just added */ +{ + xfs_ext_irec_t *prev; /* pointer to previous irec */ + xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ + int erp_idx; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + xfs_extnum_t page_idx = *idxp; /* extent index in target list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + ASSERT(page_idx >= 0); + ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + + /* Binary search extent irec's */ + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + prev = erp_idx > 0 ? erp - 1 : NULL; + if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && + realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { + high = erp_idx - 1; + } else if (page_idx > erp->er_extoff + erp->er_extcount || + (page_idx == erp->er_extoff + erp->er_extcount && + !realloc)) { + low = erp_idx + 1; + } else if (page_idx == erp->er_extoff + erp->er_extcount && + erp->er_extcount == XFS_LINEAR_EXTS) { + ASSERT(realloc); + page_idx = 0; + erp_idx++; + erp = erp_idx < nlists ? erp + 1 : NULL; + break; + } else { + page_idx -= erp->er_extoff; + break; + } + } + *idxp = page_idx; + *erp_idxp = erp_idx; + return(erp); +} + +/* + * Allocate and initialize an indirection array once the space needed + * for incore extents increases above XFS_IEXT_BUFSZ. + */ +void +xfs_iext_irec_init( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + + erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); + + if (nextents == 0) { + ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + } else if (!ifp->if_real_bytes) { + xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); + } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { + xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); + } + erp->er_extbuf = ifp->if_u1.if_extents; + erp->er_extcount = nextents; + erp->er_extoff = 0; + + ifp->if_flags |= XFS_IFEXTIREC; + ifp->if_real_bytes = XFS_IEXT_BUFSZ; + ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); + ifp->if_u1.if_ext_irec = erp; + + return; +} + +/* + * Allocate and initialize a new entry in the indirection array. + */ +xfs_ext_irec_t * +xfs_iext_irec_new( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* index for new irec */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* Resize indirection array */ + xfs_iext_realloc_indirect(ifp, ++nlists * + sizeof(xfs_ext_irec_t)); + /* + * Move records down in the array so the + * new page can use erp_idx. 
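xfs_iext_idx_to_irec() above finds the page holding file extent index idx by binary-searching the pages' er_extoff starting offsets and then rebasing idx into that page. Ignoring the realloc-time boundary cases it handles, the mapping reduces to:

struct irec {
	int er_extoff;		/* index of the page's first extent */
	int er_extcount;	/* extents currently held in the page */
};

/*
 * Map a file-wide extent index onto (page, index-within-page).
 * Assumes npages >= 1 and that 'idx' lies within the populated range.
 */
static int idx_to_page(const struct irec *pages, int npages,
		       int idx, int *page_idxp)
{
	int low = 0, high = npages - 1, p = 0;

	while (low <= high) {
		p = (low + high) / 2;
		if (idx < pages[p].er_extoff)
			high = p - 1;
		else if (idx >= pages[p].er_extoff + pages[p].er_extcount)
			low = p + 1;
		else
			break;
	}
	*page_idxp = p;
	return idx - pages[p].er_extoff;
}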
+ */ + erp = ifp->if_u1.if_ext_irec; + for (i = nlists - 1; i > erp_idx; i--) { + memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); + } + ASSERT(i == erp_idx); + + /* Initialize new extent record */ + erp = ifp->if_u1.if_ext_irec; + erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; + memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); + erp[erp_idx].er_extcount = 0; + erp[erp_idx].er_extoff = erp_idx > 0 ? + erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; + return (&erp[erp_idx]); +} + +/* + * Remove a record from the indirection array. + */ +void +xfs_iext_irec_remove( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* irec index to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + if (erp->er_extbuf) { + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, + -erp->er_extcount); + kmem_free(erp->er_extbuf); + } + /* Compact extent records */ + erp = ifp->if_u1.if_ext_irec; + for (i = erp_idx; i < nlists - 1; i++) { + memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); + } + /* + * Manually free the last extent record from the indirection + * array. A call to xfs_iext_realloc_indirect() with a size + * of zero would result in a call to xfs_iext_destroy() which + * would in turn call this function again, creating a nasty + * infinite loop. + */ + if (--nlists) { + xfs_iext_realloc_indirect(ifp, + nlists * sizeof(xfs_ext_irec_t)); + } else { + kmem_free(ifp->if_u1.if_ext_irec); + } + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; +} + +/* + * This is called to clean up large amounts of unused memory allocated + * by the indirection array. Before compacting anything though, verify + * that the indirection array is still needed and switch back to the + * linear extent list (or even the inline buffer) if possible. The + * compaction policy is as follows: + * + * Full Compaction: Extents fit into a single page (or inline buffer) + * Partial Compaction: Extents occupy less than 50% of allocated space + * No Compaction: Extents occupy at least 50% of allocated space + */ +void +xfs_iext_irec_compact( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (nextents == 0) { + xfs_iext_destroy(ifp); + } else if (nextents <= XFS_INLINE_EXTS) { + xfs_iext_indirect_to_direct(ifp); + xfs_iext_direct_to_inline(ifp, nextents); + } else if (nextents <= XFS_LINEAR_EXTS) { + xfs_iext_indirect_to_direct(ifp); + } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { + xfs_iext_irec_compact_pages(ifp); + } +} + +/* + * Combine extents from neighboring extent pages. 
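The policy in xfs_iext_irec_compact() above is a pure function of the extent count and the allocated capacity: collapse to the inline buffer, collapse to a single direct list, merge pages when occupancy drops under half, or leave things alone. As a small classifier, with assumed capacity constants standing in for XFS_INLINE_EXTS and XFS_LINEAR_EXTS:

enum compact_action {
	COMPACT_DESTROY,	/* no extents left at all */
	COMPACT_TO_INLINE,	/* fits in the inode's inline buffer */
	COMPACT_TO_DIRECT,	/* fits in one direct list */
	COMPACT_MERGE_PAGES,	/* under 50% occupancy: merge neighbours */
	COMPACT_NONE
};

#define INLINE_EXTS	2	/* stand-in for XFS_INLINE_EXTS */
#define EXTS_PER_PAGE	256	/* stand-in for XFS_LINEAR_EXTS */

static enum compact_action pick_compaction(int nextents, int npages)
{
	if (nextents == 0)
		return COMPACT_DESTROY;
	if (nextents <= INLINE_EXTS)
		return COMPACT_TO_INLINE;
	if (nextents <= EXTS_PER_PAGE)
		return COMPACT_TO_DIRECT;
	if (nextents < (npages * EXTS_PER_PAGE) / 2)
		return COMPACT_MERGE_PAGES;
	return COMPACT_NONE;
}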
+ */ +void +xfs_iext_irec_compact_pages( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ + int erp_idx = 0; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + while (erp_idx < nlists - 1) { + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp + 1; + if (erp_next->er_extcount <= + (XFS_LINEAR_EXTS - erp->er_extcount)) { + memcpy(&erp->er_extbuf[erp->er_extcount], + erp_next->er_extbuf, erp_next->er_extcount * + sizeof(xfs_bmbt_rec_t)); + erp->er_extcount += erp_next->er_extcount; + /* + * Free page before removing extent record + * so er_extoffs don't get modified in + * xfs_iext_irec_remove. + */ + kmem_free(erp_next->er_extbuf); + erp_next->er_extbuf = NULL; + xfs_iext_irec_remove(ifp, erp_idx + 1); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + } else { + erp_idx++; + } + } +} + +/* + * This is called to update the er_extoff field in the indirection + * array when extents have been added or removed from one of the + * extent lists. erp_idx contains the irec index to begin updating + * at and ext_diff contains the number of extents that were added + * or removed. + */ +void +xfs_iext_irec_update_extoffs( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* irec index to update */ + int ext_diff) /* number of new extents */ +{ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = erp_idx; i < nlists; i++) { + ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; + } +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h new file mode 100644 index 00000000..28b35964 --- /dev/null +++ b/fs/xfs/xfs_inode.h @@ -0,0 +1,600 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_H__ +#define __XFS_INODE_H__ + +struct posix_acl; +struct xfs_dinode; +struct xfs_inode; + +/* + * Fork identifiers. + */ +#define XFS_DATA_FORK 0 +#define XFS_ATTR_FORK 1 + +/* + * The following xfs_ext_irec_t struct introduces a second (top) level + * to the in-core extent allocation scheme. These structs are allocated + * in a contiguous block, creating an indirection array where each entry + * (irec) contains a pointer to a buffer of in-core extent records which + * it manages. Each extent buffer is 4k in size, since 4k is the system + * page size on Linux i386 and systems with larger page sizes don't seem + * to gain much, if anything, by using their native page size as the + * extent buffer size. Also, using 4k extent buffers everywhere provides + * a consistent interface for CXFS across different platforms. 
+ * + * There is currently no limit on the number of irec's (extent lists) + * allowed, so heavily fragmented files may require an indirection array + * which spans multiple system pages of memory. The number of extents + * which would require this amount of contiguous memory is very large + * and should not cause problems in the foreseeable future. However, + * if the memory needed for the contiguous array ever becomes a problem, + * it is possible that a third level of indirection may be required. + */ +typedef struct xfs_ext_irec { + xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ + xfs_extnum_t er_extoff; /* extent offset in file */ + xfs_extnum_t er_extcount; /* number of extents in page/block */ +} xfs_ext_irec_t; + +/* + * File incore extent information, present for each of data & attr forks. + */ +#define XFS_IEXT_BUFSZ 4096 +#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) +#define XFS_INLINE_EXTS 2 +#define XFS_INLINE_DATA 32 +typedef struct xfs_ifork { + int if_bytes; /* bytes in if_u1 */ + int if_real_bytes; /* bytes allocated in if_u1 */ + struct xfs_btree_block *if_broot; /* file's incore btree root */ + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ + unsigned char if_ext_max; /* max # of extent records */ + union { + xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ + xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ + char *if_data; /* inline file data */ + } if_u1; + union { + xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; + /* very small file extents */ + char if_inline_data[XFS_INLINE_DATA]; + /* very small file data */ + xfs_dev_t if_rdev; /* dev number if special */ + uuid_t if_uuid; /* mount point value */ + } if_u2; +} xfs_ifork_t; + +/* + * Inode location information. Stored in the inode and passed to + * xfs_imap_to_bp() to get a buffer and dinode for a given inode. + */ +struct xfs_imap { + xfs_daddr_t im_blkno; /* starting BB of inode chunk */ + ushort im_len; /* length in BBs of inode chunk */ + ushort im_boffset; /* inode offset in block in bytes */ +}; + +/* + * This is the xfs in-core inode structure. + * Most of the on-disk inode is embedded in the i_d field. + * + * The extent pointers/inline file space, however, are managed + * separately. The memory for this information is pointed to by + * the if_u1 unions depending on the type of the data. + * This is used to linearize the array of extents for fast in-core + * access. This is used until the file's number of extents + * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers + * are accessed through the buffer cache. + * + * Other state kept in the in-core inode is used for identification, + * locking, transactional updating, etc of the inode. + * + * Generally, we do not want to hold the i_rlock while holding the + * i_ilock. Hierarchy is i_iolock followed by i_rlock. + * + * xfs_iptr_t contains all the inode fields up to and including the + * i_mnext and i_mprev fields, it is used as a marker in the inode + * chain off the mount structure by xfs_sync calls. + */ + +typedef struct xfs_ictimestamp { + __int32_t t_sec; /* timestamp seconds */ + __int32_t t_nsec; /* timestamp nanoseconds */ +} xfs_ictimestamp_t; + +/* + * NOTE: This structure must be kept identical to struct xfs_dinode + * in xfs_dinode.h except for the endianness annotations. 
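The if_u1/if_u2 unions in xfs_ifork_t above give one fork three possible homes for its extent records: the tiny inline buffer inside the inode, a single heap-allocated direct list, or the indirection array of 4k pages. A much-reduced model of that layout, and of the lookup rule applied earlier by xfs_iext_get_ext(), just to make the three modes concrete (the names below are simplified, not the real fields):

#include <stdint.h>

#define INLINE_EXTS	2

struct rec { uint64_t l0, l1; };
struct page_ref { struct rec *buf; int extoff; int extcount; };

/* Simplified model of the three storage modes behind if_u1/if_u2. */
struct ext_fork {
	int		bytes;		/* payload bytes (if_bytes) */
	int		real_bytes;	/* heap bytes backing it (if_real_bytes) */
	unsigned char	uses_irec;	/* indirection array in use? */
	union {
		struct rec	*extents;	/* direct list */
		struct page_ref	*irec;		/* indirection array */
	} u1;
	struct rec	inline_ext[INLINE_EXTS]; /* if_u2 inline storage */
};

/* Where do the records actually live right now? */
static struct rec *fork_records(struct ext_fork *f)
{
	if (f->uses_irec)
		return f->u1.irec[0].buf;	/* first page only, for brevity */
	if (f->real_bytes)
		return f->u1.extents;		/* direct heap list */
	return f->inline_ext;			/* inline-in-inode buffer */
}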
+ */ +typedef struct xfs_icdinode { + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + __uint8_t di_pad[6]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ + xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + __uint32_t di_gen; /* generation number */ +} xfs_icdinode_t; + +/* + * Flags for xfs_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ + +/* + * Per-fork incore inode flags. + */ +#define XFS_IFINLINE 0x01 /* Inline data is read in */ +#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ +#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ + +/* + * Fork handling. + */ + +#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) + +#define XFS_IFORK_PTR(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + &(ip)->i_df : \ + (ip)->i_afp) +#define XFS_IFORK_DSIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount)) +#define XFS_IFORK_ASIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \ + 0) +#define XFS_IFORK_SIZE(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_IFORK_DSIZE(ip) : \ + XFS_IFORK_ASIZE(ip)) +#define XFS_IFORK_FORMAT(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_format : \ + (ip)->i_d.di_aformat) +#define XFS_IFORK_FMT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_format = (n)) : \ + ((ip)->i_d.di_aformat = (n))) +#define XFS_IFORK_NEXTENTS(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_nextents : \ + (ip)->i_d.di_anextents) +#define XFS_IFORK_NEXT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_nextents = (n)) : \ + ((ip)->i_d.di_anextents = (n))) + + + +#ifdef __KERNEL__ + +struct bhv_desc; +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_bmbt_irec; +struct xfs_inode_log_item; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot; + +typedef struct dm_attrs_s { + __uint32_t da_dmevmask; /* DMIG event mask */ + __uint16_t da_dmstate; /* DMIG state info */ + __uint16_t da_pad; /* DMIG extra padding */ +} dm_attrs_t; + +typedef struct xfs_inode { + /* Inode linking and identification information. 
*/ + struct xfs_mount *i_mount; /* fs mount struct ptr */ + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + + /* Inode location stuff */ + xfs_ino_t i_ino; /* inode number (agno/agino)*/ + struct xfs_imap i_imap; /* location for xfs_imap() */ + + /* Extent information. */ + xfs_ifork_t *i_afp; /* attribute fork pointer */ + xfs_ifork_t i_df; /* data fork */ + + /* Transaction and locking information. */ + struct xfs_trans *i_transp; /* ptr to owning transaction*/ + struct xfs_inode_log_item *i_itemp; /* logging information */ + mrlock_t i_lock; /* inode lock */ + mrlock_t i_iolock; /* inode IO lock */ + struct completion i_flush; /* inode flush completion q */ + atomic_t i_pincount; /* inode pin count */ + wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ + spinlock_t i_flags_lock; /* inode i_flags lock */ + /* Miscellaneous state. */ + unsigned short i_flags; /* see defined flags below */ + unsigned char i_update_core; /* timestamps/size is dirty */ + unsigned int i_delayed_blks; /* count of delay alloc blks */ + + xfs_icdinode_t i_d; /* most of ondisk inode */ + + xfs_fsize_t i_size; /* in-memory size */ + xfs_fsize_t i_new_size; /* size when write completes */ + atomic_t i_iocount; /* outstanding I/O count */ + + /* VFS inode */ + struct inode i_vnode; /* embedded VFS inode */ +} xfs_inode_t; + +#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ + (ip)->i_size : (ip)->i_d.di_size; + +/* Convert from vfs inode to xfs inode */ +static inline struct xfs_inode *XFS_I(struct inode *inode) +{ + return container_of(inode, struct xfs_inode, i_vnode); +} + +/* convert from xfs inode to vfs inode */ +static inline struct inode *VFS_I(struct xfs_inode *ip) +{ + return &ip->i_vnode; +} + +/* + * i_flags helper functions + */ +static inline void +__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + ip->i_flags |= flags; +} + +static inline void +xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + __xfs_iflags_set(ip, flags); + spin_unlock(&ip->i_flags_lock); +} + +static inline void +xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); +} + +static inline int +__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + return (ip->i_flags & flags); +} + +static inline int +xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + spin_lock(&ip->i_flags_lock); + ret = __xfs_iflags_test(ip, flags); + spin_unlock(&ip->i_flags_lock); + return ret; +} + +static inline int +xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (ret) + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + +/* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was chosen + * to retain compatibility with "old" filesystems). + */ +static inline prid_t +xfs_get_projid(struct xfs_inode *ip) +{ + return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; +} + +static inline void +xfs_set_projid(struct xfs_inode *ip, + prid_t projid) +{ + ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16); + ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); +} + +/* + * Manage the i_flush queue embedded in the inode. This completion + * queue synchronizes processes attempting to flush the in-core + * inode back to disk. 
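xfs_get_projid()/xfs_set_projid() above keep the 32-bit project ID in two 16-bit on-disk fields purely for compatibility with old filesystems; the packing is a shift and a mask in each direction. Standalone helpers with a round-trip check:

#include <stdint.h>
#include <assert.h>

static void projid_split(uint32_t projid, uint16_t *hi, uint16_t *lo)
{
	*hi = (uint16_t)(projid >> 16);
	*lo = (uint16_t)(projid & 0xffff);
}

static uint32_t projid_join(uint16_t hi, uint16_t lo)
{
	return ((uint32_t)hi << 16) | lo;
}

int main(void)
{
	uint16_t hi, lo;

	projid_split(0x12345678u, &hi, &lo);
	assert(hi == 0x1234 && lo == 0x5678);
	assert(projid_join(hi, lo) == 0x12345678u);
	return 0;
}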
+ */
+static inline void xfs_iflock(xfs_inode_t *ip)
+{
+	wait_for_completion(&ip->i_flush);
+}
+
+static inline int xfs_iflock_nowait(xfs_inode_t *ip)
+{
+	return try_wait_for_completion(&ip->i_flush);
+}
+
+static inline void xfs_ifunlock(xfs_inode_t *ip)
+{
+	complete(&ip->i_flush);
+}
+
+/*
+ * In-core inode flags.
+ */
+#define XFS_IRECLAIM		0x0001  /* started reclaiming this inode */
+#define XFS_ISTALE		0x0002  /* inode has been staled */
+#define XFS_IRECLAIMABLE	0x0004  /* inode can be reclaimed */
+#define XFS_INEW		0x0008  /* inode has just been allocated */
+#define XFS_IFILESTREAM		0x0010  /* inode is in a filestream directory */
+#define XFS_ITRUNCATED		0x0020  /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE	0x0040  /* dirty release already seen */
+
+/*
+ * Per-lifetime flags need to be reset when re-using a reclaimable inode during
+ * inode lookup. This prevents unintended behaviour on the new inode from
+ * occurring.
+ */
+#define XFS_IRECLAIM_RESET_FLAGS	\
+	(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
+	 XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \
+	 XFS_IFILESTREAM)
+
+/*
+ * Flags for inode locking.
+ * Bit ranges:	1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
+ *		1<<16 - 1<<32-1 -- lockdep annotation (integers)
+ */
+#define	XFS_IOLOCK_EXCL		(1<<0)
+#define	XFS_IOLOCK_SHARED	(1<<1)
+#define	XFS_ILOCK_EXCL		(1<<2)
+#define	XFS_ILOCK_SHARED	(1<<3)
+#define	XFS_IUNLOCK_NONOTIFY	(1<<4)
+
+#define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
+				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
+
+#define XFS_LOCK_FLAGS \
+	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
+	{ XFS_IOLOCK_SHARED,	"IOLOCK_SHARED" }, \
+	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
+	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
+	{ XFS_IUNLOCK_NONOTIFY,	"IUNLOCK_NONOTIFY" }
+
+
+/*
+ * Flags for lockdep annotations.
+ *
+ * XFS_LOCK_PARENT - for directory operations that require locking a
+ * parent directory inode and a child entry inode. The parent gets locked
+ * with this flag so it gets a lockdep subclass of 1 and the child entry
+ * lock will have a lockdep subclass of 0.
+ *
+ * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
+ * inodes do not participate in the normal lock order, and thus have their
+ * own subclasses.
+ *
+ * XFS_LOCK_INUMORDER - for locking several inodes at the same time
+ * with xfs_lock_inodes(). This flag is used as the starting subclass
+ * and each subsequent lock acquired will increment the subclass by one.
+ * So the first lock acquired will have a lockdep subclass of 4, the
+ * second lock will have a lockdep subclass of 5, and so on. It is
+ * the responsibility of the class builder to shift this to the correct
+ * portion of the lock_mode lockdep mask.
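The i_flush completion above behaves like a lock that is acquired by consuming the completed state (xfs_iflock/xfs_iflock_nowait) and released by completing it again (xfs_ifunlock). A rough user-space analogy with pthreads, purely illustrative and not kernel code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* "Locking" consumes the completed state, "unlocking" re-completes it. */
struct flush_lock {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	bool            done;	/* true when the inode is not being flushed */
};

static void iflock(struct flush_lock *fl)		/* cf. xfs_iflock */
{
	pthread_mutex_lock(&fl->lock);
	while (!fl->done)
		pthread_cond_wait(&fl->cond, &fl->lock);
	fl->done = false;
	pthread_mutex_unlock(&fl->lock);
}

static bool iflock_nowait(struct flush_lock *fl)	/* cf. xfs_iflock_nowait */
{
	bool got;

	pthread_mutex_lock(&fl->lock);
	got = fl->done;
	fl->done = false;
	pthread_mutex_unlock(&fl->lock);
	return got;
}

static void ifunlock(struct flush_lock *fl)		/* cf. xfs_ifunlock */
{
	pthread_mutex_lock(&fl->lock);
	fl->done = true;
	pthread_cond_signal(&fl->cond);
	pthread_mutex_unlock(&fl->lock);
}

int main(void)
{
	struct flush_lock fl = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, true
	};

	iflock(&fl);					/* start a "flush" */
	printf("second try succeeds? %d\n", iflock_nowait(&fl));	/* 0 */
	ifunlock(&fl);					/* flush finished */
	printf("after unlock?       %d\n", iflock_nowait(&fl));	/* 1 */
	return 0;
}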
+ */ +#define XFS_LOCK_PARENT 1 +#define XFS_LOCK_RTBITMAP 2 +#define XFS_LOCK_RTSUM 3 +#define XFS_LOCK_INUMORDER 4 + +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) + +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) + +#define XFS_IOLOCK_DEP_MASK 0x00ff0000 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) + +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) + +extern struct lock_class_key xfs_iolock_reclaimable; + +/* + * Flags for xfs_itruncate_start(). + */ +#define XFS_ITRUNC_DEFINITE 0x1 +#define XFS_ITRUNC_MAYBE 0x2 + +#define XFS_ITRUNC_FLAGS \ + { XFS_ITRUNC_DEFINITE, "DEFINITE" }, \ + { XFS_ITRUNC_MAYBE, "MAYBE" } + +/* + * For multiple groups support: if S_ISGID bit is set in the parent + * directory, group of new file is set to that of the parent, and + * new subdirectory gets S_ISGID bit from parent. + */ +#define XFS_INHERIT_GID(pip) \ + (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ + ((pip)->i_d.di_mode & S_ISGID)) + +/* + * xfs_iget.c prototypes. + */ +int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, + uint, uint, xfs_inode_t **); +void xfs_ilock(xfs_inode_t *, uint); +int xfs_ilock_nowait(xfs_inode_t *, uint); +void xfs_iunlock(xfs_inode_t *, uint); +void xfs_ilock_demote(xfs_inode_t *, uint); +int xfs_isilocked(xfs_inode_t *, uint); +uint xfs_ilock_map_shared(xfs_inode_t *); +void xfs_iunlock_map_shared(xfs_inode_t *, uint); +void xfs_inode_free(struct xfs_inode *ip); + +/* + * xfs_inode.c prototypes. 
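A small stand-alone sketch of how the shift/mask macros above pack per-lock lockdep subclasses into the upper bytes of the lock flags word; the constants are copied from the definitions above, but the program itself is user-space illustration only.

#include <stdio.h>

#define XFS_LOCK_PARENT		1

#define XFS_IOLOCK_SHIFT	16
#define XFS_ILOCK_SHIFT		24

#define XFS_IOLOCK_DEP_MASK	0x00ff0000
#define XFS_ILOCK_DEP_MASK	0xff000000

#define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
#define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)

int main(void)
{
	/* Lock a parent directory: the low bits carry the lock modes, the
	 * high bytes carry independent lockdep subclasses for iolock/ilock. */
	unsigned int flags = (1 << 0) /* IOLOCK_EXCL */ | (1 << 2) /* ILOCK_EXCL */
			   | (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
			   | (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT);

	printf("iolock subclass = %u, ilock subclass = %u\n",
	       XFS_IOLOCK_DEP(flags), XFS_ILOCK_DEP(flags));	/* 1 and 1 */
	return 0;
}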
+ */ +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_buf **, boolean_t *, xfs_inode_t **); + +uint xfs_ip2xflags(struct xfs_inode *); +uint xfs_dic2xflags(struct xfs_dinode *); +int xfs_ifree(struct xfs_trans *, xfs_inode_t *, + struct xfs_bmap_free *); +int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); +int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, + xfs_fsize_t, int, int); +int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); + +void xfs_iext_realloc(xfs_inode_t *, int, int); +void xfs_iunpin_wait(xfs_inode_t *); +int xfs_iflush(xfs_inode_t *, uint); +void xfs_promote_inode(struct xfs_inode *); +void xfs_lock_inodes(xfs_inode_t **, int, uint); +void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); + +void xfs_synchronize_times(xfs_inode_t *); +void xfs_mark_inode_dirty(xfs_inode_t *); +void xfs_mark_inode_dirty_sync(xfs_inode_t *); + +#define IHOLD(ip) \ +do { \ + ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ + ihold(VFS_I(ip)); \ + trace_xfs_ihold(ip, _THIS_IP_); \ +} while (0) + +#define IRELE(ip) \ +do { \ + trace_xfs_irele(ip, _THIS_IP_); \ + iput(VFS_I(ip)); \ +} while (0) + +#endif /* __KERNEL__ */ + +/* + * Flags for xfs_iget() + */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_UNTRUSTED 0x2 + +int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, + xfs_ino_t, struct xfs_dinode **, + struct xfs_buf **, int *, uint); +int xfs_itobp(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, struct xfs_dinode **, + struct xfs_buf **, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); +void xfs_dinode_to_disk(struct xfs_dinode *, + struct xfs_icdinode *); +void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_iroot_realloc(struct xfs_inode *, int, int); +int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); +int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int); + +xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); +void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t, + xfs_bmbt_irec_t *, int); +void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); +void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int); +void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int); +void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); +void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); +void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); +void xfs_iext_realloc_direct(xfs_ifork_t *, int); +void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); +void xfs_iext_inline_to_direct(xfs_ifork_t *, int); +void xfs_iext_destroy(xfs_ifork_t *); +xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *); +xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *); +xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int); +void xfs_iext_irec_init(xfs_ifork_t *); +xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int); +void xfs_iext_irec_remove(xfs_ifork_t *, int); +void xfs_iext_irec_compact(xfs_ifork_t *); +void xfs_iext_irec_compact_pages(xfs_ifork_t *); +void xfs_iext_irec_compact_full(xfs_ifork_t *); +void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); + +#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) + +#ifdef DEBUG +void xfs_isize_check(struct xfs_mount *, struct xfs_inode *, + xfs_fsize_t); +#else /* DEBUG */ 
+#define xfs_isize_check(mp, ip, isize) +#endif /* DEBUG */ + +#if defined(DEBUG) +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +#else +#define xfs_inobp_check(mp, bp) +#endif /* DEBUG */ + +extern struct kmem_zone *xfs_ifork_zone; +extern struct kmem_zone *xfs_inode_zone; +extern struct kmem_zone *xfs_ili_zone; + +#endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c new file mode 100644 index 00000000..391044c6 --- /dev/null +++ b/fs/xfs/xfs_inode_item.c @@ -0,0 +1,1060 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_error.h" +#include "xfs_trace.h" + + +kmem_zone_t *xfs_ili_zone; /* inode log item zone */ + +static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_inode_log_item, ili_item); +} + + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. + */ +STATIC uint +xfs_inode_item_size( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + uint nvecs = 2; + + /* + * Only log the data/extents/b-tree root if there is something + * left to log. 
+ */ + iip->ili_format.ilf_fields |= XFS_ILOG_CORE; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) && + (ip->i_d.di_nextents > 0) && + (ip->i_df.if_bytes > 0)) { + ASSERT(ip->i_df.if_u1.if_extents != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT; + } + break; + + case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) && + (ip->i_df.if_broot_bytes > 0)) { + ASSERT(ip->i_df.if_broot != NULL); + nvecs++; + } else { + ASSERT(!(iip->ili_format.ilf_fields & + XFS_ILOG_DBROOT)); +#ifdef XFS_TRANS_DEBUG + if (iip->ili_root_size > 0) { + ASSERT(iip->ili_root_size == + ip->i_df.if_broot_bytes); + ASSERT(memcmp(iip->ili_orig_root, + ip->i_df.if_broot, + iip->ili_root_size) == 0); + } else { + ASSERT(ip->i_df.if_broot_bytes == 0); + } +#endif + iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT; + } + break; + + case XFS_DINODE_FMT_LOCAL: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) && + (ip->i_df.if_bytes > 0)) { + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA; + } + break; + + case XFS_DINODE_FMT_DEV: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_UUID); + break; + + case XFS_DINODE_FMT_UUID: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_DEV); + break; + + default: + ASSERT(0); + break; + } + + /* + * If there are no attributes associated with this file, + * then there cannot be anything more to log. + * Clear all attribute-related log flags. + */ + if (!XFS_IFORK_Q(ip)) { + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + return nvecs; + } + + /* + * Log any necessary attribute data. 
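The counting in xfs_inode_item_size() boils down to: one iovec for the log format header, one for the inode core, plus one per fork that still has payload to log after the flag trimming above. A simplified stand-alone sketch of that accounting (not the kernel structures):

#include <stdio.h>
#include <stdbool.h>

/* Simplified model: header + core, plus one vector per fork with payload. */
static unsigned int inode_item_nvecs(bool log_data_fork, bool log_attr_fork)
{
	unsigned int nvecs = 2;		/* ilf header + inode core */

	if (log_data_fork)
		nvecs++;		/* data fork data/extents/b-tree root */
	if (log_attr_fork)
		nvecs++;		/* attr fork data/extents/b-tree root */
	return nvecs;
}

int main(void)
{
	printf("core only:  %u\n", inode_item_nvecs(false, false));	/* 2 */
	printf("core+data:  %u\n", inode_item_nvecs(true, false));	/* 3 */
	printf("everything: %u\n", inode_item_nvecs(true, true));	/* 4 */
	return 0;
}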
+ */ + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); + if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) && + (ip->i_d.di_anextents > 0) && + (ip->i_afp->if_bytes > 0)) { + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT; + } + break; + + case XFS_DINODE_FMT_BTREE: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); + if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) && + (ip->i_afp->if_broot_bytes > 0)) { + ASSERT(ip->i_afp->if_broot != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT; + } + break; + + case XFS_DINODE_FMT_LOCAL: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); + if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) && + (ip->i_afp->if_bytes > 0)) { + ASSERT(ip->i_afp->if_u1.if_data != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA; + } + break; + + default: + ASSERT(0); + break; + } + + return nvecs; +} + +/* + * xfs_inode_item_format_extents - convert in-core extents to on-disk form + * + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. In this case, we + * need to do this conversion before we write the extents into the log. Because + * we don't have the disk inode to write into here, we allocate a buffer and + * format the extents into it via xfs_iextents_copy(). We free the buffer in + * the unlock routine after the copy for the log has been made. + * + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only log on-disk extents + * here, so always use the physical fork size to determine the size of the + * buffer we need to allocate. + */ +STATIC void +xfs_inode_item_format_extents( + struct xfs_inode *ip, + struct xfs_log_iovec *vecp, + int whichfork, + int type) +{ + xfs_bmbt_rec_t *ext_buffer; + + ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); + if (whichfork == XFS_DATA_FORK) + ip->i_itemp->ili_extents_buf = ext_buffer; + else + ip->i_itemp->ili_aextents_buf = ext_buffer; + + vecp->i_addr = ext_buffer; + vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); + vecp->i_type = type; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given inode log item. It fills the first item with an inode + * log format structure, the second with the on-disk inode structure, + * and a possible third and/or fourth with the inode data/extents/b-tree + * root and inode attributes data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + uint nvecs; + size_t data_bytes; + xfs_mount_t *mp; + + vecp->i_addr = &iip->ili_format; + vecp->i_len = sizeof(xfs_inode_log_format_t); + vecp->i_type = XLOG_REG_TYPE_IFORMAT; + vecp++; + nvecs = 1; + + /* + * Clear i_update_core if the timestamps (or any other + * non-transactional modification) need flushing/logging + * and we're about to log them with the rest of the core. + * + * This is the same logic as xfs_iflush() but this code can't + * run at the same time as xfs_iflush because we're in commit + * processing here and so we have the inode lock held in + * exclusive mode. 
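A user-space sketch of the endian conversion that xfs_inode_item_format_extents()/xfs_iextents_copy() perform before the extents reach the log; the two-word record layout here is a stand-in and does not reproduce the real xfs_bmbt_rec packing of startoff/startblock/blockcount/state.

#define _DEFAULT_SOURCE		/* for htobe64() on glibc */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* In-core record: two 64-bit words in host byte order
 * (stand-in for xfs_bmbt_rec_host_t). */
struct rec_host { uint64_t l0, l1; };

/* Log/on-disk form: the same two words, big-endian. */
struct rec_disk { uint64_t l0, l1; };

/* Copy extents into a freshly allocated buffer in big-endian order,
 * loosely mirroring what the log vector gets from xfs_iextents_copy(). */
static struct rec_disk *copy_extents_for_log(const struct rec_host *in, int nrecs)
{
	struct rec_disk *out = malloc(nrecs * sizeof(*out));
	int i;

	if (!out)
		return NULL;
	for (i = 0; i < nrecs; i++) {
		out[i].l0 = htobe64(in[i].l0);
		out[i].l1 = htobe64(in[i].l1);
	}
	return out;
}

int main(void)
{
	struct rec_host ext[2] = { { 0x1122334455667788ULL, 0x1ULL },
				   { 0x2ULL, 0xaabbccddeeff0011ULL } };
	struct rec_disk *buf = copy_extents_for_log(ext, 2);

	if (buf) {
		/* Stored representation (byte-swapped on little-endian hosts). */
		printf("first logged word: %016llx\n",
		       (unsigned long long)buf[0].l0);
		free(buf);
	}
	return 0;
}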
Although it doesn't really matter + * for the timestamps if both routines were to grab the + * timestamps or not. That would be ok. + * + * We clear i_update_core before copying out the data. + * This is for coordination with our timestamp updates + * that don't hold the inode lock. They will always + * update the timestamps BEFORE setting i_update_core, + * so if we clear i_update_core after they set it we + * are guaranteed to see their updates to the timestamps + * either here. Likewise, if they set it after we clear it + * here, we'll see it either on the next commit of this + * inode or the next time the inode gets flushed via + * xfs_iflush(). This depends on strongly ordered memory + * semantics, but we have that. We use the SYNCHRONIZE + * macro to make sure that the compiler does not reorder + * the i_update_core access below the data copy below. + */ + if (ip->i_update_core) { + ip->i_update_core = 0; + SYNCHRONIZE(); + } + + /* + * Make sure to get the latest timestamps from the Linux inode. + */ + xfs_synchronize_times(ip); + + vecp->i_addr = &ip->i_d; + vecp->i_len = sizeof(struct xfs_icdinode); + vecp->i_type = XLOG_REG_TYPE_ICORE; + vecp++; + nvecs++; + iip->ili_format.ilf_fields |= XFS_ILOG_CORE; + + /* + * If this is really an old format inode, then we need to + * log it as such. This means that we have to copy the link + * count from the new field to the old. We don't have to worry + * about the new fields, because nothing trusts them as long as + * the old inode version number is there. If the superblock already + * has a new version number, then we don't bother converting back. + */ + mp = ip->i_mount; + ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); + if (ip->i_d.di_version == 1) { + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + /* + * Convert it back. + */ + ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); + ip->i_d.di_onlink = ip->i_d.di_nlink; + } else { + /* + * The superblock version has already been bumped, + * so just make the conversion to the new inode + * format permanent. + */ + ip->i_d.di_version = 2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + } + } + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) { + ASSERT(ip->i_df.if_bytes > 0); + ASSERT(ip->i_df.if_u1.if_extents != NULL); + ASSERT(ip->i_d.di_nextents > 0); + ASSERT(iip->ili_extents_buf == NULL); + ASSERT((ip->i_df.if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)) > 0); +#ifdef XFS_NATIVE_HOST + if (ip->i_d.di_nextents == ip->i_df.if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)) { + /* + * There are no delayed allocation + * extents, so just point to the + * real extents array. 
+ */ + vecp->i_addr = ip->i_df.if_u1.if_extents; + vecp->i_len = ip->i_df.if_bytes; + vecp->i_type = XLOG_REG_TYPE_IEXT; + } else +#endif + { + xfs_inode_item_format_extents(ip, vecp, + XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); + } + ASSERT(vecp->i_len <= ip->i_df.if_bytes); + iip->ili_format.ilf_dsize = vecp->i_len; + vecp++; + nvecs++; + } + break; + + case XFS_DINODE_FMT_BTREE: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { + ASSERT(ip->i_df.if_broot_bytes > 0); + ASSERT(ip->i_df.if_broot != NULL); + vecp->i_addr = ip->i_df.if_broot; + vecp->i_len = ip->i_df.if_broot_bytes; + vecp->i_type = XLOG_REG_TYPE_IBROOT; + vecp++; + nvecs++; + iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + } + break; + + case XFS_DINODE_FMT_LOCAL: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { + ASSERT(ip->i_df.if_bytes > 0); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + + vecp->i_addr = ip->i_df.if_u1.if_data; + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). + */ + data_bytes = roundup(ip->i_df.if_bytes, 4); + ASSERT((ip->i_df.if_real_bytes == 0) || + (ip->i_df.if_real_bytes == data_bytes)); + vecp->i_len = (int)data_bytes; + vecp->i_type = XLOG_REG_TYPE_ILOCAL; + vecp++; + nvecs++; + iip->ili_format.ilf_dsize = (unsigned)data_bytes; + } + break; + + case XFS_DINODE_FMT_DEV: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | + XFS_ILOG_DDATA | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + iip->ili_format.ilf_u.ilfu_rdev = + ip->i_df.if_u2.if_rdev; + } + break; + + case XFS_DINODE_FMT_UUID: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | + XFS_ILOG_DDATA | XFS_ILOG_DEV))); + if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + iip->ili_format.ilf_u.ilfu_uuid = + ip->i_df.if_u2.if_uuid; + } + break; + + default: + ASSERT(0); + break; + } + + /* + * If there are no attributes associated with the file, + * then we're done. + * Assert that no attribute-related log flags are set. + */ + if (!XFS_IFORK_Q(ip)) { + ASSERT(nvecs == lip->li_desc->lid_size); + iip->ili_format.ilf_size = nvecs; + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); + return; + } + + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); + if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { +#ifdef DEBUG + int nrecs = ip->i_afp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nrecs > 0); + ASSERT(nrecs == ip->i_d.di_anextents); + ASSERT(ip->i_afp->if_bytes > 0); + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + ASSERT(ip->i_d.di_anextents > 0); +#endif +#ifdef XFS_NATIVE_HOST + /* + * There are not delayed allocation extents + * for attributes, so just point at the array. 
+ */ + vecp->i_addr = ip->i_afp->if_u1.if_extents; + vecp->i_len = ip->i_afp->if_bytes; + vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; +#else + ASSERT(iip->ili_aextents_buf == NULL); + xfs_inode_item_format_extents(ip, vecp, + XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); +#endif + iip->ili_format.ilf_asize = vecp->i_len; + vecp++; + nvecs++; + } + break; + + case XFS_DINODE_FMT_BTREE: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); + if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { + ASSERT(ip->i_afp->if_broot_bytes > 0); + ASSERT(ip->i_afp->if_broot != NULL); + vecp->i_addr = ip->i_afp->if_broot; + vecp->i_len = ip->i_afp->if_broot_bytes; + vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; + vecp++; + nvecs++; + iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + } + break; + + case XFS_DINODE_FMT_LOCAL: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); + if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { + ASSERT(ip->i_afp->if_bytes > 0); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + + vecp->i_addr = ip->i_afp->if_u1.if_data; + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). + */ + data_bytes = roundup(ip->i_afp->if_bytes, 4); + ASSERT((ip->i_afp->if_real_bytes == 0) || + (ip->i_afp->if_real_bytes == data_bytes)); + vecp->i_len = (int)data_bytes; + vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; + vecp++; + nvecs++; + iip->ili_format.ilf_asize = (unsigned)data_bytes; + } + break; + + default: + ASSERT(0); + break; + } + + ASSERT(nvecs == lip->li_desc->lid_size); + iip->ili_format.ilf_size = nvecs; +} + + +/* + * This is called to pin the inode associated with the inode log + * item in memory so it cannot be written out. + */ +STATIC void +xfs_inode_item_pin( + struct xfs_log_item *lip) +{ + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + trace_xfs_inode_pin(ip, _RET_IP_); + atomic_inc(&ip->i_pincount); +} + + +/* + * This is called to unpin the inode associated with the inode log + * item which was previously pinned with a call to xfs_inode_item_pin(). + * + * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. + */ +STATIC void +xfs_inode_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + trace_xfs_inode_unpin(ip, _RET_IP_); + ASSERT(atomic_read(&ip->i_pincount) > 0); + if (atomic_dec_and_test(&ip->i_pincount)) + wake_up(&ip->i_ipin_wait); +} + +/* + * This is called to attempt to lock the inode associated with this + * inode log item, in preparation for the push routine which does the actual + * iflush. Don't sleep on the inode lock or the flush lock. + * + * If the flush lock is already held, indicating that the inode has + * been or is in the process of being flushed, then (ideally) we'd like to + * see if the inode's buffer is still incore, and if so give it a nudge. + * We delay doing so until the pushbuf routine, though, to avoid holding + * the AIL lock across a call to the blackhole which is the buffer cache. + * Also we don't want to sleep in any device strategy routines, which can happen + * if we do the subsequent bawrite in here. 
+ */ +STATIC uint +xfs_inode_item_trylock( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_ipincount(ip) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + return XFS_ITEM_LOCKED; + + if (!xfs_iflock_nowait(ip)) { + /* + * inode has already been flushed to the backing buffer, + * leave it locked in shared mode, pushbuf routine will + * unlock it. + */ + return XFS_ITEM_PUSHBUF; + } + + /* Stale items should force out the iclog */ + if (ip->i_flags & XFS_ISTALE) { + xfs_ifunlock(ip); + /* + * we hold the AIL lock - notify the unlock routine of this + * so it doesn't try to get the lock again. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); + return XFS_ITEM_PINNED; + } + +#ifdef DEBUG + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ASSERT(iip->ili_format.ilf_fields != 0); + ASSERT(iip->ili_logged == 0); + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + } +#endif + return XFS_ITEM_SUCCESS; +} + +/* + * Unlock the inode associated with the inode log item. + * Clear the fields of the inode and inode log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the inode. + */ +STATIC void +xfs_inode_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + unsigned short lock_flags; + + ASSERT(iip->ili_inode->i_itemp != NULL); + ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); + + /* + * Clear the transaction pointer in the inode. + */ + ip->i_transp = NULL; + + /* + * If the inode needed a separate buffer with which to log + * its extents, then free it now. + */ + if (iip->ili_extents_buf != NULL) { + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); + ASSERT(ip->i_d.di_nextents > 0); + ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); + ASSERT(ip->i_df.if_bytes > 0); + kmem_free(iip->ili_extents_buf); + iip->ili_extents_buf = NULL; + } + if (iip->ili_aextents_buf != NULL) { + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(ip->i_d.di_anextents > 0); + ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); + ASSERT(ip->i_afp->if_bytes > 0); + kmem_free(iip->ili_aextents_buf); + iip->ili_aextents_buf = NULL; + } + + lock_flags = iip->ili_lock_flags; + iip->ili_lock_flags = 0; + if (lock_flags) { + xfs_iunlock(iip->ili_inode, lock_flags); + IRELE(iip->ili_inode); + } +} + +/* + * This is called to find out where the oldest active copy of the inode log + * item in the on disk log resides now that the last log write of it completed + * at the given lsn. Since we always re-log all dirty data in an inode, the + * latest copy in the on disk log is the only one that matters. Therefore, + * simply return the given lsn. + * + * If the inode has been marked stale because the cluster is being freed, we + * don't want to (re-)insert this inode into the AIL. There is a race condition + * where the cluster buffer may be unpinned before the inode is inserted into + * the AIL during transaction committed processing. If the buffer is unpinned + * before the inode item has been committed and inserted, then it is possible + * for the buffer to be written and IO completes before the inode is inserted + * into the AIL. In that case, we'd be inserting a clean, stale inode into the + * AIL which will never get removed. 
It will, however, get reclaimed which + * triggers an assert in xfs_inode_free() complaining about freein an inode + * still in the AIL. + * + * To avoid this, just unpin the inode directly and return a LSN of -1 so the + * transaction committed code knows that it does not need to do any further + * processing on the item. + */ +STATIC xfs_lsn_t +xfs_inode_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_inode_item_unpin(lip, 0); + return -1; + } + return lsn; +} + +/* + * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK + * failed to get the inode flush lock but did get the inode locked SHARED. + * Here we're trying to see if the inode buffer is incore, and if so whether it's + * marked delayed write. If that's the case, we'll promote it and that will + * allow the caller to write the buffer by triggering the xfsbufd to run. + */ +STATIC bool +xfs_inode_item_pushbuf( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp; + bool ret = true; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); + + /* + * If a flush is not in progress anymore, chances are that the + * inode was taken off the AIL. So, just get out. + */ + if (completion_done(&ip->i_flush) || + !(lip->li_flags & XFS_LI_IN_AIL)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return true; + } + + bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, + iip->ili_format.ilf_len, XBF_TRYLOCK); + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!bp) + return true; + if (XFS_BUF_ISDELAYWRITE(bp)) + xfs_buf_delwri_promote(bp); + if (XFS_BUF_ISPINNED(bp)) + ret = false; + xfs_buf_relse(bp); + return ret; +} + +/* + * This is called to asynchronously write the inode associated with this + * inode log item out to disk. The inode will already have been locked by + * a successful call to xfs_inode_item_trylock(). + */ +STATIC void +xfs_inode_item_push( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); + ASSERT(!completion_done(&ip->i_flush)); + + /* + * Since we were able to lock the inode's flush lock and + * we found it on the AIL, the inode must be dirty. This + * is because the inode is removed from the AIL while still + * holding the flush lock in xfs_iflush_done(). Thus, if + * we found it in the AIL and were able to obtain the flush + * lock without sleeping, then there must not have been + * anyone in the process of flushing the inode. + */ + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || + iip->ili_format.ilf_fields != 0); + + /* + * Push the inode to it's backing buffer. This will not remove the + * inode from the AIL - a further push will be required to trigger a + * buffer push. However, this allows all the dirty inodes to be pushed + * to the buffer before it is pushed to disk. The buffer IO completion + * will pull the inode from the AIL, mark it clean and unlock the flush + * lock. + */ + (void) xfs_iflush(ip, SYNC_TRYLOCK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); +} + +/* + * XXX rcc - this one really has to do something. Probably needs + * to stamp in a new field in the incore inode. 
+ */ +STATIC void +xfs_inode_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + INODE_ITEM(lip)->ili_last_lsn = lsn; +} + +/* + * This is the ops vector shared by all buf log items. + */ +static struct xfs_item_ops xfs_inode_item_ops = { + .iop_size = xfs_inode_item_size, + .iop_format = xfs_inode_item_format, + .iop_pin = xfs_inode_item_pin, + .iop_unpin = xfs_inode_item_unpin, + .iop_trylock = xfs_inode_item_trylock, + .iop_unlock = xfs_inode_item_unlock, + .iop_committed = xfs_inode_item_committed, + .iop_push = xfs_inode_item_push, + .iop_pushbuf = xfs_inode_item_pushbuf, + .iop_committing = xfs_inode_item_committing +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + */ +void +xfs_inode_item_init( + struct xfs_inode *ip, + struct xfs_mount *mp) +{ + struct xfs_inode_log_item *iip; + + ASSERT(ip->i_itemp == NULL); + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); + + iip->ili_inode = ip; + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); + iip->ili_format.ilf_type = XFS_LI_INODE; + iip->ili_format.ilf_ino = ip->i_ino; + iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; + iip->ili_format.ilf_len = ip->i_imap.im_len; + iip->ili_format.ilf_boffset = ip->i_imap.im_boffset; +} + +/* + * Free the inode log item and any memory hanging off of it. + */ +void +xfs_inode_item_destroy( + xfs_inode_t *ip) +{ +#ifdef XFS_TRANS_DEBUG + if (ip->i_itemp->ili_root_size != 0) { + kmem_free(ip->i_itemp->ili_orig_root); + } +#endif + kmem_zone_free(xfs_ili_zone, ip->i_itemp); +} + + +/* + * This is the inode flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the inode is + * flushed to disk. It is responsible for removing the inode item + * from the AIL if it has not been re-logged, and unlocking the inode's + * flush lock. + * + * To reduce AIL lock traffic as much as possible, we scan the buffer log item + * list for other inodes that will run this function. We remove them from the + * buffer list so we can process all the inode IO completions in one AIL lock + * traversal. + */ +void +xfs_iflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip; + struct xfs_log_item *blip; + struct xfs_log_item *next; + struct xfs_log_item *prev; + struct xfs_ail *ailp = lip->li_ailp; + int need_ail = 0; + + /* + * Scan the buffer IO completions for other inodes being completed and + * attach them to the current inode log item. + */ + blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + prev = NULL; + while (blip != NULL) { + if (lip->li_cb != xfs_iflush_done) { + prev = blip; + blip = blip->li_bio_list; + continue; + } + + /* remove from list */ + next = blip->li_bio_list; + if (!prev) { + XFS_BUF_SET_FSPRIVATE(bp, next); + } else { + prev->li_bio_list = next; + } + + /* add to current list */ + blip->li_bio_list = lip->li_bio_list; + lip->li_bio_list = blip; + + /* + * while we have the item, do the unlocked check for needing + * the AIL lock. + */ + iip = INODE_ITEM(blip); + if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + blip = next; + } + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + /* + * We only want to pull the item from the AIL if it is + * actually there and its location in the log has not + * changed since we started the flush. 
Thus, we only bother + * if the ili_logged flag is set and the inode's lsn has not + * changed. First we check the lsn outside + * the lock since it's cheaper, and then we recheck while + * holding the lock before removing the inode from the AIL. + */ + if (need_ail) { + struct xfs_log_item *log_items[need_ail]; + int i = 0; + spin_lock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { + iip = INODE_ITEM(blip); + if (iip->ili_logged && + blip->li_lsn == iip->ili_flush_lsn) { + log_items[i++] = blip; + } + ASSERT(i <= need_ail); + } + /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ + xfs_trans_ail_delete_bulk(ailp, log_items, i); + } + + + /* + * clean up and unlock the flush lock now we are done. We can clear the + * ili_last_fields bits now that we know that the data corresponding to + * them is safely on disk. + */ + for (blip = lip; blip; blip = next) { + next = blip->li_bio_list; + blip->li_bio_list = NULL; + + iip = INODE_ITEM(blip); + iip->ili_logged = 0; + iip->ili_last_fields = 0; + xfs_ifunlock(iip->ili_inode); + } +} + +/* + * This is the inode flushing abort routine. It is called + * from xfs_iflush when the filesystem is shutting down to clean + * up the inode state. + * It is responsible for removing the inode item + * from the AIL if it has not been re-logged, and unlocking the inode's + * flush lock. + */ +void +xfs_iflush_abort( + xfs_inode_t *ip) +{ + xfs_inode_log_item_t *iip = ip->i_itemp; + + if (iip) { + struct xfs_ail *ailp = iip->ili_item.li_ailp; + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); + } else + spin_unlock(&ailp->xa_lock); + } + iip->ili_logged = 0; + /* + * Clear the ili_last_fields bits now that we know that the + * data corresponding to them is safely on disk. + */ + iip->ili_last_fields = 0; + /* + * Clear the inode logging fields so no more flushes are + * attempted. + */ + iip->ili_format.ilf_fields = 0; + } + /* + * Release the inode's flush lock since we're done with it. 
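The gather-then-batch shape of xfs_iflush_done() above (scan the buffer's item list without the AIL lock, then take the lock once for a bulk delete) can be illustrated with a plain singly linked list; nothing below is kernel API.

#include <stdio.h>
#include <stdbool.h>

/* Generic sketch of "scan once, then do one locked bulk operation". */
struct item {
	struct item *next;
	bool needs_removal;
	int id;
};

int main(void)
{
	struct item c = { NULL,  true, 3 };
	struct item b = { &c,   false, 2 };
	struct item a = { &b,    true, 1 };
	struct item *head = &a, *it;
	struct item *batch[8];
	int n = 0, i;

	/* Unlocked scan: decide which items need the expensive operation. */
	for (it = head; it; it = it->next)
		if (it->needs_removal)
			batch[n++] = it;

	/* One "locked" pass over just the gathered items (stand-in for
	 * taking the AIL lock once and calling the bulk-delete helper). */
	printf("bulk-removing %d items:", n);
	for (i = 0; i < n; i++)
		printf(" %d", batch[i]->id);
	printf("\n");
	return 0;
}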
+ */ + xfs_ifunlock(ip); +} + +void +xfs_istale_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); +} + +/* + * convert an xfs_inode_log_format struct from either 32 or 64 bit versions + * (which can have different field alignments) to the native version + */ +int +xfs_inode_item_format_convert( + xfs_log_iovec_t *buf, + xfs_inode_log_format_t *in_f) +{ + if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { + xfs_inode_log_format_32_t *in_f32 = buf->i_addr; + + in_f->ilf_type = in_f32->ilf_type; + in_f->ilf_size = in_f32->ilf_size; + in_f->ilf_fields = in_f32->ilf_fields; + in_f->ilf_asize = in_f32->ilf_asize; + in_f->ilf_dsize = in_f32->ilf_dsize; + in_f->ilf_ino = in_f32->ilf_ino; + /* copy biggest field of ilf_u */ + memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, + in_f32->ilf_u.ilfu_uuid.__u_bits, + sizeof(uuid_t)); + in_f->ilf_blkno = in_f32->ilf_blkno; + in_f->ilf_len = in_f32->ilf_len; + in_f->ilf_boffset = in_f32->ilf_boffset; + return 0; + } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ + xfs_inode_log_format_64_t *in_f64 = buf->i_addr; + + in_f->ilf_type = in_f64->ilf_type; + in_f->ilf_size = in_f64->ilf_size; + in_f->ilf_fields = in_f64->ilf_fields; + in_f->ilf_asize = in_f64->ilf_asize; + in_f->ilf_dsize = in_f64->ilf_dsize; + in_f->ilf_ino = in_f64->ilf_ino; + /* copy biggest field of ilf_u */ + memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, + in_f64->ilf_u.ilfu_uuid.__u_bits, + sizeof(uuid_t)); + in_f->ilf_blkno = in_f64->ilf_blkno; + in_f->ilf_len = in_f64->ilf_len; + in_f->ilf_boffset = in_f64->ilf_boffset; + return 0; + } + return EFSCORRUPTED; +} diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h new file mode 100644 index 00000000..d3dee61e --- /dev/null +++ b/fs/xfs/xfs_inode_item.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_ITEM_H__ +#define __XFS_INODE_ITEM_H__ + +/* + * This is the structure used to lay out an inode log item in the + * log. The size of the inline data/extents/b-tree root to be logged + * (if any) is indicated in the ilf_dsize field. Changes to this structure + * must be added on to the end. 
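A stand-alone sketch of selecting the decoder by iovec length, as xfs_inode_item_format_convert() above does for the 32- and 64-bit layouts defined just below; the structs here are simplified stand-ins whose only point is that packing changes sizeof.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-ins: same logical fields, different logged sizes. */
struct fmt_packed {			/* 32-bit era layout: no padding */
	uint16_t type;
	uint64_t ino;
} __attribute__((packed));

struct fmt_native {			/* natural alignment: padding before ino */
	uint16_t type;
	uint64_t ino;
};

/* Pick the decoder purely by the length of the logged region and
 * return 0 on success, as the conversion routine above does. */
static int convert(const void *buf, size_t len, struct fmt_native *out)
{
	if (len == sizeof(struct fmt_packed)) {
		const struct fmt_packed *in = buf;

		out->type = in->type;
		out->ino = in->ino;
		return 0;
	}
	if (len == sizeof(struct fmt_native)) {
		memcpy(out, buf, sizeof(*out));
		return 0;
	}
	return -1;			/* unexpected size: treat as corrupt */
}

int main(void)
{
	struct fmt_packed old = { 0x123, 42 };
	struct fmt_native native;

	printf("sizeof packed=%zu native=%zu\n",
	       sizeof(struct fmt_packed), sizeof(struct fmt_native));
	if (convert(&old, sizeof(old), &native) == 0)
		printf("decoded ino=%llu\n", (unsigned long long)native.ino);
	return 0;
}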
+ */ +typedef struct xfs_inode_log_format { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_t; + +typedef struct xfs_inode_log_format_32 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} __attribute__((packed)) xfs_inode_log_format_32_t; + +typedef struct xfs_inode_log_format_64 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_64_t; + +/* + * Flags for xfs_trans_log_inode flags field. + */ +#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ +#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ +#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ +#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ +#define XFS_ILOG_DEV 0x010 /* log the dev field */ +#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ +#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ +#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ + +#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ + XFS_ILOG_UUID | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT) + +#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT) + +#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ + XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ + XFS_ILOG_DEV | XFS_ILOG_UUID | \ + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +static inline int xfs_ilog_fbroot(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); +} + +static inline int xfs_ilog_fext(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); +} + +static inline int xfs_ilog_fdata(int w) +{ + return (w == XFS_DATA_FORK ? 
XFS_ILOG_DDATA : XFS_ILOG_ADATA); +} + +#ifdef __KERNEL__ + +struct xfs_buf; +struct xfs_bmbt_rec; +struct xfs_inode; +struct xfs_mount; + + +typedef struct xfs_inode_log_item { + xfs_log_item_t ili_item; /* common portion */ + struct xfs_inode *ili_inode; /* inode ptr */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ + unsigned short ili_lock_flags; /* lock flags */ + unsigned short ili_logged; /* flushed logged data */ + unsigned int ili_last_fields; /* fields when flushed */ + struct xfs_bmbt_rec *ili_extents_buf; /* array of logged + data exts */ + struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged + attr exts */ +#ifdef XFS_TRANS_DEBUG + int ili_root_size; + char *ili_orig_root; +#endif + xfs_inode_log_format_t ili_format; /* logged structure */ +} xfs_inode_log_item_t; + + +static inline int xfs_inode_clean(xfs_inode_t *ip) +{ + return (!ip->i_itemp || + !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && + !ip->i_update_core; +} + +extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); +extern void xfs_inode_item_destroy(struct xfs_inode *); +extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_iflush_abort(struct xfs_inode *); +extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, + xfs_inode_log_format_t *); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_INODE_ITEM_H__ */ diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h new file mode 100644 index 00000000..b8e4ee4e --- /dev/null +++ b/fs/xfs/xfs_inum.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INUM_H__ +#define __XFS_INUM_H__ + +/* + * Inode number format: + * low inopblog bits - offset in block + * next agblklog bits - block number in ag + * next agno_log bits - ag number + * high agno_log-agblklog-inopblog bits - 0 + */ + +typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */ + +/* + * Useful inode bits for this kernel. + * Used in some places where having 64-bits in the 32-bit kernels + * costs too much. 
+ */ +#if XFS_BIG_INUMS +typedef xfs_ino_t xfs_intino_t; +#else +typedef __uint32_t xfs_intino_t; +#endif + +#define NULLFSINO ((xfs_ino_t)-1) +#define NULLAGINO ((xfs_agino_t)-1) + +struct xfs_mount; + +#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) +#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog +#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog +#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log +#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log +#define XFS_INO_BITS(mp) \ + XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp) +#define XFS_INO_TO_AGNO(mp,i) \ + ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGINO(mp,i) \ + ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGBNO(mp,i) \ + (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ + XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) +#define XFS_INO_TO_OFFSET(mp,i) \ + ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_INO_TO_FSB(mp,i) \ + XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) +#define XFS_AGINO_TO_INO(mp,a,i) \ + (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) +#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) +#define XFS_AGINO_TO_OFFSET(mp,i) \ + ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ + ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) + +#if XFS_BIG_INUMS +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) +#else +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) +#endif +#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) + +#endif /* __XFS_INUM_H__ */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c new file mode 100644 index 00000000..091d82b9 --- /dev/null +++ b/fs/xfs/xfs_iomap.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
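A small sketch of the agno/agbno/offset packing implemented by the XFS_INO_* macros above; the log2 geometry values are made up for the example rather than read from a superblock.

#include <stdint.h>
#include <stdio.h>

/* Example geometry (made up, would normally come from the superblock):
 *   inopblog = 2  -> 4 inodes per block
 *   agblklog = 16 -> up to 65536 blocks per allocation group
 * so an AG-relative inode number (agino) uses 18 bits and the AG number
 * occupies the bits above that. */
#define INOPBLOG	2
#define AGBLKLOG	16
#define AGINO_BITS	(INOPBLOG + AGBLKLOG)

#define INO_MASK(k)	((1ULL << (k)) - 1)

static uint64_t agino_to_ino(uint32_t agno, uint32_t agino)
{
	return ((uint64_t)agno << AGINO_BITS) | agino;	/* cf. XFS_AGINO_TO_INO */
}

static void ino_decode(uint64_t ino)
{
	uint32_t agno  = ino >> AGINO_BITS;		/* cf. XFS_INO_TO_AGNO */
	uint32_t agino = ino & INO_MASK(AGINO_BITS);	/* cf. XFS_INO_TO_AGINO */
	uint32_t agbno = agino >> INOPBLOG;		/* cf. XFS_INO_TO_AGBNO */
	uint32_t off   = agino & INO_MASK(INOPBLOG);	/* cf. XFS_INO_TO_OFFSET */

	printf("ino %llu -> agno %u, agbno %u, offset %u\n",
	       (unsigned long long)ino, agno, agbno, off);
}

int main(void)
{
	/* Inode 3 in block 100 of allocation group 5. */
	uint64_t ino = agino_to_ino(5, (100u << INOPBLOG) | 3);

	ino_decode(ino);
	return 0;
}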
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_utils.h" +#include "xfs_iomap.h" +#include "xfs_trace.h" + + +#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ + << mp->m_writeio_log) +#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP + +STATIC int +xfs_iomap_eof_align_last_fsb( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_extlen_t extsize, + xfs_fileoff_t *last_fsb) +{ + xfs_fileoff_t new_last_fsb = 0; + xfs_extlen_t align; + int eof, error; + + if (XFS_IS_REALTIME_INODE(ip)) + ; + /* + * If mounted with the "-o swalloc" option, roundup the allocation + * request to a stripe width boundary if the file size is >= + * stripe width and we are allocating past the allocation eof. + */ + else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && + (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) + new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); + /* + * Roundup the allocation request to a stripe unit (m_dalign) boundary + * if the file size is >= stripe unit size, and we are allocating past + * the allocation eof. + */ + else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) + new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); + + /* + * Always round up the allocation request to an extent boundary + * (when file on a real-time subvolume or has di_extsize hint). + */ + if (extsize) { + if (new_last_fsb) + align = roundup_64(new_last_fsb, extsize); + else + align = extsize; + new_last_fsb = roundup_64(*last_fsb, align); + } + + if (new_last_fsb) { + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) + return error; + if (eof) + *last_fsb = new_last_fsb; + } + return 0; +} + +STATIC int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x\n", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return EFSCORRUPTED; +} + +int +xfs_iomap_write_direct( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap, + int nmaps) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t count_fsb, resaligned; + xfs_fsblock_t firstfsb; + xfs_extlen_t extsz, temp; + int nimaps; + int bmapi_flag; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. 
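A stand-alone sketch of the round-up logic in xfs_iomap_eof_align_last_fsb() above: align the requested end first to the stripe width, then keep it a multiple of the extent size hint. The block counts are example values only.

#include <stdint.h>
#include <stdio.h>

/* Round 'x' up to the next multiple of 'align' (cf. roundup_64()). */
static uint64_t roundup64(uint64_t x, uint64_t align)
{
	return ((x + align - 1) / align) * align;
}

int main(void)
{
	uint64_t last_fsb = 1000;	/* requested allocation end, in blocks */
	uint64_t swidth   = 128;	/* example stripe width, in blocks */
	uint64_t extsize  = 16;		/* example extent size hint, in blocks */
	uint64_t new_last_fsb, align;

	new_last_fsb = roundup64(last_fsb, swidth);	/* stripe alignment */
	align = roundup64(new_last_fsb, extsize);	/* keep an extsize multiple */
	new_last_fsb = roundup64(last_fsb, align);

	printf("last_fsb %llu -> %llu\n",
	       (unsigned long long)last_fsb, (unsigned long long)new_last_fsb);
	return 0;
}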
+ */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + return XFS_ERROR(error); + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + if ((offset + count) > ip->i_size) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + if (error) + goto error_out; + } else { + if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) + last_fsb = MIN(last_fsb, (xfs_fileoff_t) + imap->br_blockcount + + imap->br_startoff); + } + count_fsb = last_fsb - offset_fsb; + ASSERT(count_fsb > 0); + + resaligned = count_fsb; + if (unlikely(extsz)) { + if ((temp = do_mod(offset_fsb, extsz))) + resaligned += temp; + if ((temp = do_mod(resaligned, extsz))) + resaligned += extsz - temp; + } + + if (unlikely(rt)) { + resrtextents = qblocks = resaligned; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), resrtextents, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + /* + * Check for running out of space, note: need lock to return + */ + if (error) + xfs_trans_cancel(tp, 0); + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (error) + goto error_out; + + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip); + + bmapi_flag = XFS_BMAPI_WRITE; + if (offset < ip->i_size || extsz) + bmapi_flag |= XFS_BMAPI_PREALLOC; + + /* + * Issue the xfs_bmapi() call to allocate the blocks. + * + * From this point onwards we overwrite the imap pointer that the + * caller gave to us. + */ + xfs_bmap_init(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, + &firstfsb, 0, imap, &nimaps, &free_list); + if (error) + goto error0; + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto error0; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error_out; + + /* + * Copy any maps to caller's array and return any error. + */ + if (nimaps == 0) { + error = ENOSPC; + goto error_out; + } + + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { + error = xfs_alert_fsblock_zero(ip, imap); + goto error_out; + } + + return 0; + +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + +error_out: + return XFS_ERROR(error); +} + +/* + * If the caller is doing a write at the end of the file, then extend the + * allocation out to the file system's write iosize. We clean up any extra + * space left over when the file is closed in xfs_inactive(). + * + * If we find we already have delalloc preallocation beyond EOF, don't do more + * preallocation as it it not needed. 
+ */ +STATIC int +xfs_iomap_eof_want_preallocate( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap, + int nimaps, + int *prealloc) +{ + xfs_fileoff_t start_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstblock; + int n, error, imaps; + int found_delalloc = 0; + + *prealloc = 0; + if ((offset + count) <= ip->i_size) + return 0; + + /* + * If there are any real blocks past eof, then don't + * do any speculative allocation. + */ + start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + while (count_fsb > 0) { + imaps = nimaps; + firstblock = NULLFSBLOCK; + error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, + &firstblock, 0, imap, &imaps, NULL); + if (error) + return error; + for (n = 0; n < imaps; n++) { + if ((imap[n].br_startblock != HOLESTARTBLOCK) && + (imap[n].br_startblock != DELAYSTARTBLOCK)) + return 0; + start_fsb += imap[n].br_blockcount; + count_fsb -= imap[n].br_blockcount; + + if (imap[n].br_startblock == DELAYSTARTBLOCK) + found_delalloc = 1; + } + } + if (!found_delalloc) + *prealloc = 1; + return 0; +} + +/* + * If we don't have a user specified preallocation size, dynamically increase + * the preallocation size as the size of the file grows. Cap the maximum size + * at a single extent or less if the filesystem is near full. The closer the + * filesystem is to full, the smaller the maximum prealocation. + */ +STATIC xfs_fsblock_t +xfs_iomap_prealloc_size( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + xfs_fsblock_t alloc_blocks = 0; + + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + int shift = 0; + int64_t freesp; + + /* + * rounddown_pow_of_two() returns an undefined result + * if we pass in alloc_blocks = 0. Hence the "+ 1" to + * ensure we always pass in a non-zero value. + */ + alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; + alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, + rounddown_pow_of_two(alloc_blocks)); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + freesp = mp->m_sb.sb_fdblocks; + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; + } + if (shift) + alloc_blocks >>= shift; + } + + if (alloc_blocks < mp->m_writeio_blocks) + alloc_blocks = mp->m_writeio_blocks; + + return alloc_blocks; +} + +int +xfs_iomap_write_delay( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *ret_imap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_off_t aligned_offset; + xfs_fileoff_t ioalign; + xfs_fsblock_t firstblock; + xfs_extlen_t extsz; + int nimaps; + xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; + int prealloc, flushed = 0; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. 
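A simplified, user-space rendering of the xfs_iomap_prealloc_size() heuristic above: preallocate roughly the current file size (rounded down to a power of two), then halve it once per low-space threshold crossed. The 5%..1% thresholds stand in for the mp->m_low_space[] values and are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Largest power of two <= x (cf. rounddown_pow_of_two()); x must be non-zero. */
static uint64_t rounddown_pow2(uint64_t x)
{
	uint64_t p = 1;

	while (p <= x / 2)
		p <<= 1;
	return p;
}

/* Example-only sizing: start from the file size in blocks, then shrink
 * as free space drops below successive thresholds. */
static uint64_t prealloc_blocks(uint64_t size_blocks, uint64_t free_blocks,
				uint64_t total_blocks)
{
	uint64_t alloc = rounddown_pow2(size_blocks + 1);	/* "+ 1" avoids zero */
	int shift = 0;
	int pct;

	/* Hypothetical thresholds at 5%, 4%, 3%, 2% and 1% free space. */
	for (pct = 5; pct >= 1; pct--)
		if (free_blocks * 100 < (uint64_t)pct * total_blocks)
			shift++;
	if (shift)
		shift++;	/* crossing the first threshold costs an extra halving */
	return alloc >> shift;
}

int main(void)
{
	printf("plenty of space: %llu blocks\n",
	       (unsigned long long)prealloc_blocks(8192, 500000, 1000000));
	printf("2.5%% free:       %llu blocks\n",
	       (unsigned long long)prealloc_blocks(8192, 25000, 1000000));
	return 0;
}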
+ */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + return XFS_ERROR(error); + + extsz = xfs_get_extsz_hint(ip); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + + error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, + imap, XFS_WRITE_IMAPS, &prealloc); + if (error) + return error; + +retry: + if (prealloc) { + xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); + + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); + ioalign = XFS_B_TO_FSBT(mp, aligned_offset); + last_fsb = ioalign + alloc_blocks; + } else { + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + } + + if (prealloc || extsz) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + if (error) + return error; + } + + nimaps = XFS_WRITE_IMAPS; + firstblock = NULLFSBLOCK; + error = xfs_bmapi(NULL, ip, offset_fsb, + (xfs_filblks_t)(last_fsb - offset_fsb), + XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | + XFS_BMAPI_ENTIRE, &firstblock, 1, imap, + &nimaps, NULL); + switch (error) { + case 0: + case ENOSPC: + case EDQUOT: + break; + default: + return XFS_ERROR(error); + } + + /* + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For + * ENOSPC, * flush all other inodes with delalloc blocks to free up + * some of the excess reserved metadata space. For both cases, retry + * without EOF preallocation. + */ + if (nimaps == 0) { + trace_xfs_delalloc_enospc(ip, offset, count); + if (flushed) + return XFS_ERROR(error ? error : ENOSPC); + + if (error == ENOSPC) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inodes(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + } + + flushed = 1; + error = 0; + prealloc = 0; + goto retry; + } + + if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap[0]); + + *ret_imap = imap[0]; + return 0; +} + +/* + * Pass in a delayed allocate extent, convert it to real extents; + * return to the caller the extent we create which maps on top of + * the originating callers request. + * + * Called without a lock on the inode. + * + * We no longer bother to look at the incoming map - all we have to + * guarantee is that whatever we allocate fills the required range. + */ +int +xfs_iomap_write_allocate( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, last_block; + xfs_fileoff_t end_fsb, map_start_fsb; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + xfs_filblks_t count_fsb; + xfs_trans_t *tp; + int nimaps, committed; + int error = 0; + int nres; + + /* + * Make sure that the dquots are there. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return XFS_ERROR(error); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = imap->br_blockcount; + map_start_fsb = imap->br_startoff; + + XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + + while (count_fsb != 0) { + /* + * Set up a transaction with which to allocate the + * backing store for the file. Do allocations in a + * loop until we get some space in the range we are + * interested in. The other space that might be allocated + * is in the delayed allocation extent on which we sit + * but before our buffer starts. 
+ */ + + nimaps = 0; + while (nimaps == 0) { + tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + tp->t_flags |= XFS_TRANS_RESERVE; + nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_reserve(tp, nres, + XFS_WRITE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + xfs_bmap_init(&free_list, &first_block); + + /* + * it is possible that the extents have changed since + * we did the read call as we dropped the ilock for a + * while. We have to be careful about truncates or hole + * punchs here - we are not allowed to allocate + * non-delalloc blocks here. + * + * The only protection against truncation is the pages + * for the range we are being asked to convert are + * locked and hence a truncate will block on them + * first. + * + * As a result, if we go beyond the range we really + * need and hit an delalloc extent boundary followed by + * a hole while we have excess blocks in the map, we + * will fill the hole incorrectly and overrun the + * transaction reservation. + * + * Using a single map prevents this as we are forced to + * check each map we look for overlap with the desired + * range and abort as soon as we find it. Also, given + * that we only return a single map, having one beyond + * what we can return is probably a bit silly. + * + * We also need to check that we don't go beyond EOF; + * this is a truncate optimisation as a truncate sets + * the new file size before block on the pages we + * currently have locked under writeback. Because they + * are about to be tossed, we don't need to write them + * back.... + */ + nimaps = 1; + end_fsb = XFS_B_TO_FSB(mp, ip->i_size); + error = xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + if (error) + goto trans_cancel; + + last_block = XFS_FILEOFF_MAX(last_block, end_fsb); + if ((map_start_fsb + count_fsb) > last_block) { + count_fsb = last_block - map_start_fsb; + if (count_fsb == 0) { + error = EAGAIN; + goto trans_cancel; + } + } + + /* + * Go get the actual blocks. + * + * From this point onwards we overwrite the imap + * pointer that the caller gave to us. + */ + error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, + XFS_BMAPI_WRITE, &first_block, 1, + imap, &nimaps, &free_list); + if (error) + goto trans_cancel; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + /* + * See if we were able to allocate an extent that + * covers at least part of the callers request + */ + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + + if ((offset_fsb >= imap->br_startoff) && + (offset_fsb < (imap->br_startoff + + imap->br_blockcount))) { + XFS_STATS_INC(xs_xstrat_quick); + return 0; + } + + /* + * So far we have not mapped the requested part of the + * file, just surrounding data, try again. 
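[Editorial aside, not part of the patch] The allocation loop above clamps the range it converts so it never runs past the larger of the in-core EOF and the last offset that already has extents, and asks the caller to retry (EAGAIN) if the clamp leaves nothing to do. A small sketch of just that clamping step, with illustrative names and a plain error return in place of the XFS error codes:

#include <stdint.h>

static int clamp_to_eof(uint64_t map_start_fsb, uint64_t *count_fsb,
			uint64_t end_fsb,	/* in-core EOF, in blocks */
			uint64_t last_mapped)	/* last offset with extents */
{
	uint64_t last_block = last_mapped > end_fsb ? last_mapped : end_fsb;

	if (map_start_fsb + *count_fsb > last_block) {
		*count_fsb = last_block - map_start_fsb;
		if (*count_fsb == 0)
			return -1;	/* nothing left: caller should retry */
	}
	return 0;
}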
+ */ + count_fsb -= imap->br_blockcount; + map_start_fsb = imap->br_startoff + imap->br_blockcount; + } + +trans_cancel: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return XFS_ERROR(error); +} + +int +xfs_iomap_write_unwritten( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_filblks_t count_fsb; + xfs_filblks_t numblks_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + xfs_trans_t *tp; + xfs_bmbt_irec_t imap; + xfs_bmap_free_t free_list; + uint resblks; + int committed; + int error; + + trace_xfs_unwritten_convert(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); + + /* + * Reserve enough blocks in this transaction for two complete extent + * btree splits. We may be converting the middle part of an unwritten + * extent and in this case we will insert two new extents in the btree + * each of which could cause a full split. + * + * This reservation amount will be used in the first call to + * xfs_bmbt_split() to select an AG with enough space to satisfy the + * rest of the operation. + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + + do { + /* + * set up a transaction to convert the range of extents + * from unwritten to real. Do allocations in a loop until + * we have covered the range passed in. + * + * Note that we open code the transaction allocation here + * to pass KM_NOFS--we can't risk to recursing back into + * the filesystem here as we might be asked to write out + * the same inode that we complete here and might deadlock + * on the iolock. + */ + xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + /* + * Modify the unwritten extent state of the buffer. + */ + xfs_bmap_init(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, + 1, &imap, &nimaps, &free_list); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_bmap_finish(&(tp), &(free_list), &committed); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return XFS_ERROR(error); + + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap); + + if ((numblks_fsb = imap.br_blockcount) == 0) { + /* + * The numblks_fsb value should always get + * smaller, otherwise the loop is stuck. + */ + ASSERT(imap.br_blockcount); + break; + } + offset_fsb += numblks_fsb; + count_fsb -= numblks_fsb; + } while (count_fsb > 0); + + return 0; + +error_on_bmapi_transaction: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return XFS_ERROR(error); +} diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h new file mode 100644 index 00000000..80615760 --- /dev/null +++ b/fs/xfs/xfs_iomap.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2003-2005 Silicon Graphics, Inc. 
+ * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOMAP_H__ +#define __XFS_IOMAP_H__ + +struct xfs_inode; +struct xfs_bmbt_irec; + +extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int); +extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); + +#endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c new file mode 100644 index 00000000..751e94fe --- /dev/null +++ b/fs/xfs/xfs_itable.c @@ -0,0 +1,736 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_trace.h" + +STATIC int +xfs_internal_inum( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (xfs_sb_version_hasquota(&mp->m_sb) && + (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); +} + +/* + * Return stat information for one inode. + * Return 0 if ok, else errno. + */ +int +xfs_bulkstat_one_int( + struct xfs_mount *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... 
*/ +{ + struct xfs_icdinode *dic; /* dinode core info pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct inode *inode; + struct xfs_bstat *buf; /* return buffer */ + int error = 0; /* error value */ + + *stat = BULKSTAT_RV_NOTHING; + + if (!buffer || xfs_internal_inum(mp, ino)) + return XFS_ERROR(EINVAL); + + buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + if (!buf) + return XFS_ERROR(ENOMEM); + + error = xfs_iget(mp, NULL, ino, + XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip); + if (error) { + *stat = BULKSTAT_RV_NOTHING; + goto out_free; + } + + ASSERT(ip != NULL); + ASSERT(ip->i_imap.im_blkno != 0); + + dic = &ip->i_d; + inode = VFS_I(ip); + + /* xfs_iget returns the following without needing + * further change. + */ + buf->bs_nlink = dic->di_nlink; + buf->bs_projid_lo = dic->di_projid_lo; + buf->bs_projid_hi = dic->di_projid_hi; + buf->bs_ino = ino; + buf->bs_mode = dic->di_mode; + buf->bs_uid = dic->di_uid; + buf->bs_gid = dic->di_gid; + buf->bs_size = dic->di_size; + + /* + * We need to read the timestamps from the Linux inode because + * the VFS keeps writing directly into the inode structure instead + * of telling us about the updates. + */ + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + + buf->bs_xflags = xfs_ip2xflags(ip); + buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; + buf->bs_extents = dic->di_nextents; + buf->bs_gen = dic->di_gen; + memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); + buf->bs_dmevmask = dic->di_dmevmask; + buf->bs_dmstate = dic->di_dmstate; + buf->bs_aextents = dic->di_anextents; + buf->bs_forkoff = XFS_IFORK_BOFF(ip); + + switch (dic->di_format) { + case XFS_DINODE_FMT_DEV: + buf->bs_rdev = ip->i_df.if_u2.if_rdev; + buf->bs_blksize = BLKDEV_IOSIZE; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; + break; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + IRELE(ip); + + error = formatter(buffer, ubsize, ubused, buf); + + if (!error) + *stat = BULKSTAT_RV_DIDONE; + + out_free: + kmem_free(buf); + return error; +} + +/* Return 0 on success or positive error */ +STATIC int +xfs_bulkstat_one_fmt( + void __user *ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) +{ + if (ubsize < sizeof(*buffer)) + return XFS_ERROR(ENOMEM); + if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) + return XFS_ERROR(EFAULT); + if (ubused) + *ubused = sizeof(*buffer); + return 0; +} + +int +xfs_bulkstat_one( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt, ubused, stat); +} + +#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) + +/* + * Return stat information in bulk (by-inode) for the filesystem. 
+ */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastinop, /* last inode returned */ + int *ubcountp, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + size_t statstruct_size, /* sizeof struct filling */ + char __user *ubuffer, /* buffer with inode stats */ + int *done) /* 1 if there are more stats to get */ +{ + xfs_agblock_t agbno=0;/* allocation group block number */ + xfs_buf_t *agbp; /* agi header buffer */ + xfs_agi_t *agi; /* agi header data */ + xfs_agino_t agino; /* inode # in allocation group */ + xfs_agnumber_t agno; /* allocation group number */ + int chunkidx; /* current index into inode chunk */ + int clustidx; /* current index into inode cluster */ + xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ + int end_of_ag; /* set if we've seen the ag end */ + int error; /* error code */ + int fmterror;/* bulkstat formatter result */ + int i; /* loop index */ + int icount; /* count of inodes good in irbuf */ + size_t irbsize; /* size of irec buffer in bytes */ + xfs_ino_t ino; /* inode number (filesystem) */ + xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ + xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ + xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ + xfs_ino_t lastino; /* last inode number returned */ + int nbcluster; /* # of blocks in a cluster */ + int nicluster; /* # of inodes in a cluster */ + int nimask; /* mask for inode clusters */ + int nirbuf; /* size of irbuf */ + int rval; /* return value error code */ + int tmp; /* result value from btree calls */ + int ubcount; /* size of user's buffer */ + int ubleft; /* bytes left in user's buffer */ + char __user *ubufp; /* pointer into user's buffer */ + int ubelem; /* spaces used in user's buffer */ + int ubused; /* bytes used by formatter */ + xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ + + /* + * Get the last inode value, see if there's nothing to do. + */ + ino = (xfs_ino_t)*lastinop; + lastino = ino; + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + if (agno >= mp->m_sb.sb_agcount || + ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + *done = 1; + *ubcountp = 0; + return 0; + } + if (!ubcountp || *ubcountp <= 0) { + return EINVAL; + } + ubcount = *ubcountp; /* statstruct's */ + ubleft = ubcount * statstruct_size; /* bytes */ + *ubcountp = ubelem = 0; + *done = 0; + fmterror = 0; + ubufp = ubuffer; + nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? + mp->m_sb.sb_inopblock : + (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); + nimask = ~(nicluster - 1); + nbcluster = nicluster >> mp->m_sb.sb_inopblog; + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + if (!irbuf) + return ENOMEM; + + nirbuf = irbsize / sizeof(*irbuf); + + /* + * Loop over the allocation groups, starting from the last + * inode returned; 0 means start of the allocation group. + */ + rval = 0; + while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { + cond_resched(); + bp = NULL; + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) { + /* + * Skip this allocation group and go to the next one. + */ + agno++; + agino = 0; + continue; + } + agi = XFS_BUF_TO_AGI(agbp); + /* + * Allocate and initialize a btree cursor for ialloc btree. 
+ */ + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + irbp = irbuf; + irbufend = irbuf + nirbuf; + end_of_ag = 0; + /* + * If we're returning in the middle of an allocation group, + * we need to get the remainder of the chunk we're in. + */ + if (agino > 0) { + xfs_inobt_rec_incore_t r; + + /* + * Lookup the inode chunk that this inode lives in. + */ + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, + &tmp); + if (!error && /* no I/O error */ + tmp && /* lookup succeeded */ + /* got the record, should always work */ + !(error = xfs_inobt_get_rec(cur, &r, &i)) && + i == 1 && + /* this is the right chunk */ + agino < r.ir_startino + XFS_INODES_PER_CHUNK && + /* lastino was not last in chunk */ + (chunkidx = agino - r.ir_startino + 1) < + XFS_INODES_PER_CHUNK && + /* there are some left allocated */ + xfs_inobt_maskn(chunkidx, + XFS_INODES_PER_CHUNK - chunkidx) & + ~r.ir_free) { + /* + * Grab the chunk record. Mark all the + * uninteresting inodes (because they're + * before our start point) free. + */ + for (i = 0; i < chunkidx; i++) { + if (XFS_INOBT_MASK(i) & ~r.ir_free) + r.ir_freecount++; + } + r.ir_free |= xfs_inobt_maskn(0, chunkidx); + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; + irbp++; + agino = r.ir_startino + XFS_INODES_PER_CHUNK; + icount = XFS_INODES_PER_CHUNK - r.ir_freecount; + } else { + /* + * If any of those tests failed, bump the + * inode number (just in case). + */ + agino++; + icount = 0; + } + /* + * In any case, increment to the next record. + */ + if (!error) + error = xfs_btree_increment(cur, 0, &tmp); + } else { + /* + * Start of ag. Lookup the first inode chunk. + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); + icount = 0; + } + /* + * Loop through inode btree records in this ag, + * until we run out of inodes or space in the buffer. + */ + while (irbp < irbufend && icount < ubcount) { + xfs_inobt_rec_incore_t r; + + /* + * Loop as long as we're unable to read the + * inode btree. + */ + while (error) { + agino += XFS_INODES_PER_CHUNK; + if (XFS_AGINO_TO_AGBNO(mp, agino) >= + be32_to_cpu(agi->agi_length)) + break; + error = xfs_inobt_lookup(cur, agino, + XFS_LOOKUP_GE, &tmp); + cond_resched(); + } + /* + * If ran off the end of the ag either with an error, + * or the normal way, set end and stop collecting. + */ + if (error) { + end_of_ag = 1; + break; + } + + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { + end_of_ag = 1; + break; + } + + /* + * If this chunk has any allocated inodes, save it. + * Also start read-ahead now for this chunk. + */ + if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + /* + * Loop over all clusters in the next chunk. + * Do a readahead if there are any allocated + * inodes in that cluster. + */ + agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); + for (chunkidx = 0; + chunkidx < XFS_INODES_PER_CHUNK; + chunkidx += nicluster, + agbno += nbcluster) { + if (xfs_inobt_maskn(chunkidx, nicluster) + & ~r.ir_free) + xfs_btree_reada_bufs(mp, agno, + agbno, nbcluster); + } + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; + irbp++; + icount += XFS_INODES_PER_CHUNK - r.ir_freecount; + } + /* + * Set agino to after this chunk and bump the cursor. + */ + agino = r.ir_startino + XFS_INODES_PER_CHUNK; + error = xfs_btree_increment(cur, 0, &tmp); + cond_resched(); + } + /* + * Drop the btree buffers and the agi buffer. + * We can't hold any of the locks these represent + * when calling iget. 
+ */ + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + /* + * Now format all the good inodes into the user's buffer. + */ + irbufend = irbp; + for (irbp = irbuf; + irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { + /* + * Now process this chunk of inodes. + */ + for (agino = irbp->ir_startino, chunkidx = clustidx = 0; + XFS_BULKSTAT_UBLEFT(ubleft) && + irbp->ir_freecount < XFS_INODES_PER_CHUNK; + chunkidx++, clustidx++, agino++) { + ASSERT(chunkidx < XFS_INODES_PER_CHUNK); + /* + * Recompute agbno if this is the + * first inode of the cluster. + * + * Careful with clustidx. There can be + * multiple clusters per chunk, a single + * cluster per chunk or a cluster that has + * inodes represented from several different + * chunks (if blocksize is large). + * + * Because of this, the starting clustidx is + * initialized to zero in this loop but must + * later be reset after reading in the cluster + * buffer. + */ + if ((chunkidx & (nicluster - 1)) == 0) { + agbno = XFS_AGINO_TO_AGBNO(mp, + irbp->ir_startino) + + ((chunkidx & nimask) >> + mp->m_sb.sb_inopblog); + } + ino = XFS_AGINO_TO_INO(mp, agno, agino); + /* + * Skip if this inode is free. + */ + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { + lastino = ino; + continue; + } + /* + * Count used inodes as free so we can tell + * when the chunk is used up. + */ + irbp->ir_freecount++; + + /* + * Get the inode and fill in a single buffer. + */ + ubused = statstruct_size; + error = formatter(mp, ino, ubufp, ubleft, + &ubused, &fmterror); + if (fmterror == BULKSTAT_RV_NOTHING) { + if (error && error != ENOENT && + error != EINVAL) { + ubleft = 0; + rval = error; + break; + } + lastino = ino; + continue; + } + if (fmterror == BULKSTAT_RV_GIVEUP) { + ubleft = 0; + ASSERT(error); + rval = error; + break; + } + if (ubufp) + ubufp += ubused; + ubleft -= ubused; + ubelem++; + lastino = ino; + } + + cond_resched(); + } + + if (bp) + xfs_buf_relse(bp); + + /* + * Set up for the next loop iteration. + */ + if (XFS_BULKSTAT_UBLEFT(ubleft)) { + if (end_of_ag) { + agno++; + agino = 0; + } else + agino = XFS_INO_TO_AGINO(mp, lastino); + } else + break; + } + /* + * Done, we're either out of filesystem or space to put the data. + */ + kmem_free_large(irbuf); + *ubcountp = ubelem; + /* + * Found some inodes, return them now and return the error next time. + */ + if (ubelem) + rval = 0; + if (agno >= mp->m_sb.sb_agcount) { + /* + * If we ran out of filesystem, mark lastino as off + * the end of the filesystem, so the next call + * will return immediately. + */ + *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0); + *done = 1; + } else + *lastinop = (xfs_ino_t)lastino; + + return rval; +} + +/* + * Return stat information in bulk (by-inode) for the filesystem. + * Special case for non-sequential one inode bulkstat. + */ +int /* error status */ +xfs_bulkstat_single( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastinop, /* inode to return */ + char __user *buffer, /* buffer with inode stats */ + int *done) /* 1 if there are more stats to get */ +{ + int count; /* count value for bulkstat call */ + int error; /* return value */ + xfs_ino_t ino; /* filesystem inode number */ + int res; /* result from bs1 */ + + /* + * note that requesting valid inode numbers which are not allocated + * to inodes will most likely cause xfs_itobp to generate warning + * messages about bad magic numbers. This is ok. The fact that + * the inode isn't actually an inode is handled by the + * error check below. 
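[Editorial aside, not part of the patch] xfs_bulkstat() above is a cursor-driven iterator: *lastinop carries the last inode returned, *ubcountp returns how many stat structures were filled in, and *done is set once the whole filesystem has been walked. The sketch below shows how a hypothetical in-kernel caller would drive it, assuming the declarations from this file; the buffer handling is deliberately simplified and the function name is invented.

static int walk_all_inodes(struct xfs_mount *mp, char __user *ubuffer,
			   int ubcount)
{
	xfs_ino_t	lastino = 0;	/* 0 = start from the first inode */
	int		done = 0;
	int		error = 0;

	while (!done) {
		int count = ubcount;	/* slots available this pass */

		error = xfs_bulkstat(mp, &lastino, &count,
				     xfs_bulkstat_one, sizeof(xfs_bstat_t),
				     ubuffer, &done);
		if (error || count == 0)
			break;
		/*
		 * "count" entries of sizeof(xfs_bstat_t) were written to
		 * ubuffer; a real caller would consume them here before
		 * looping with the updated lastino cursor.
		 */
	}
	return error;
}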
Done this way to make the usual case faster + * at the expense of the error case. + */ + + ino = (xfs_ino_t)*lastinop; + error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res); + if (error) { + /* + * Special case way failed, do it the "long" way + * to see if that works. + */ + (*lastinop)--; + count = 1; + if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one, + sizeof(xfs_bstat_t), buffer, done)) + return error; + if (count == 0 || (xfs_ino_t)*lastinop != ino) + return error == EFSCORRUPTED ? + XFS_ERROR(EINVAL) : error; + else + return 0; + } + *done = 0; + return 0; +} + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written) /* # of bytes written */ +{ + if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer))) + return -EFAULT; + *written = count * sizeof(*buffer); + return 0; +} + +/* + * Return inode number table for the filesystem. + */ +int /* error status */ +xfs_inumbers( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastino, /* last inode returned */ + int *count, /* size of buffer/count returned */ + void __user *ubuffer,/* buffer with inode descriptions */ + inumbers_fmt_pf formatter) +{ + xfs_buf_t *agbp; + xfs_agino_t agino; + xfs_agnumber_t agno; + int bcount; + xfs_inogrp_t *buffer; + int bufidx; + xfs_btree_cur_t *cur; + int error; + xfs_inobt_rec_incore_t r; + int i; + xfs_ino_t ino; + int left; + int tmp; + + ino = (xfs_ino_t)*lastino; + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + left = *count; + *count = 0; + bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); + buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); + error = bufidx = 0; + cur = NULL; + agbp = NULL; + while (left > 0 && agno < mp->m_sb.sb_agcount) { + if (agbp == NULL) { + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) { + /* + * If we can't read the AGI of this ag, + * then just skip to the next one. + */ + ASSERT(cur == NULL); + agbp = NULL; + agno++; + agino = 0; + continue; + } + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, + &tmp); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + cur = NULL; + xfs_buf_relse(agbp); + agbp = NULL; + /* + * Move up the last inode in the current + * chunk. The lookup_ge will always get + * us the first inode in the next chunk. + */ + agino += XFS_INODES_PER_CHUNK - 1; + continue; + } + } + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { + xfs_buf_relse(agbp); + agbp = NULL; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = NULL; + agno++; + agino = 0; + continue; + } + agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; + buffer[bufidx].xi_startino = + XFS_AGINO_TO_INO(mp, agno, r.ir_startino); + buffer[bufidx].xi_alloccount = + XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_allocmask = ~r.ir_free; + bufidx++; + left--; + if (bufidx == bcount) { + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) { + error = XFS_ERROR(EFAULT); + break; + } + ubuffer += written; + *count += bufidx; + bufidx = 0; + } + if (left) { + error = xfs_btree_increment(cur, 0, &tmp); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + cur = NULL; + xfs_buf_relse(agbp); + agbp = NULL; + /* + * The agino value has already been bumped. + * Just try to skip up to it. 
+ */ + agino += XFS_INODES_PER_CHUNK; + continue; + } + } + } + if (!error) { + if (bufidx) { + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) + error = XFS_ERROR(EFAULT); + else + *count += bufidx; + } + *lastino = XFS_AGINO_TO_INO(mp, agno, agino); + } + kmem_free(buffer); + if (cur) + xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR)); + if (agbp) + xfs_buf_relse(agbp); + return error; +} diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h new file mode 100644 index 00000000..97295d91 --- /dev/null +++ b/fs/xfs/xfs_itable.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ITABLE_H__ +#define __XFS_ITABLE_H__ + +/* + * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat + * structures (by the dmi library). This is a pointer to a formatter function + * that will iget the inode and fill in the appropriate structure. + * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c + */ +typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + int *ubused, + int *stat); + +/* + * Values for stat return value. + */ +#define BULKSTAT_RV_NOTHING 0 +#define BULKSTAT_RV_DIDONE 1 +#define BULKSTAT_RV_GIVEUP 2 + +/* + * Return stat information in bulk (by-inode) for the filesystem. 
+ */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastino, /* last inode returned */ + int *count, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + size_t statstruct_size,/* sizeof struct that we're filling */ + char __user *ubuffer,/* buffer with inode stats */ + int *done); /* 1 if there are more stats to get */ + +int +xfs_bulkstat_single( + xfs_mount_t *mp, + xfs_ino_t *lastinop, + char __user *buffer, + int *done); + +typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ + void __user *ubuffer, /* buffer to write to */ + int ubsize, /* remaining user buffer sz */ + int *ubused, /* bytes used by formatter */ + const xfs_bstat_t *buffer); /* buffer to read from */ + +int +xfs_bulkstat_one_int( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + bulkstat_one_fmt_pf formatter, + int *ubused, + int *stat); + +int +xfs_bulkstat_one( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + int *ubused, + int *stat); + +typedef int (*inumbers_fmt_pf)( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int /* error status */ +xfs_inumbers( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *last, /* last inode returned */ + int *count, /* size of buffer/count returned */ + void __user *buffer, /* buffer with inode info */ + inumbers_fmt_pf formatter); + +#endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c new file mode 100644 index 00000000..41d5b8f2 --- /dev/null +++ b/fs/xfs/xfs_log.c @@ -0,0 +1,3767 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_log_recover.h" +#include "xfs_trans_priv.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_rw.h" +#include "xfs_trace.h" + +kmem_zone_t *xfs_log_ticket_zone; + +/* Local miscellaneous function prototypes */ +STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket, + xlog_in_core_t **, xfs_lsn_t *); +STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks); +STATIC int xlog_space_left(struct log *log, atomic64_t *head); +STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); +STATIC void xlog_dealloc_log(xlog_t *log); + +/* local state machine functions */ +STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); +STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog); +STATIC int xlog_state_get_iclog_space(xlog_t *log, + int len, + xlog_in_core_t **iclog, + xlog_ticket_t *ticket, + int *continued_write, + int *logoffsetp); +STATIC int xlog_state_release_iclog(xlog_t *log, + xlog_in_core_t *iclog); +STATIC void xlog_state_switch_iclogs(xlog_t *log, + xlog_in_core_t *iclog, + int eventual_size); +STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); + +/* local functions to manipulate grant head */ +STATIC int xlog_grant_log_space(xlog_t *log, + xlog_ticket_t *xtic); +STATIC void xlog_grant_push_ail(struct log *log, + int need_bytes); +STATIC void xlog_regrant_reserve_log_space(xlog_t *log, + xlog_ticket_t *ticket); +STATIC int xlog_regrant_write_log_space(xlog_t *log, + xlog_ticket_t *ticket); +STATIC void xlog_ungrant_log_space(xlog_t *log, + xlog_ticket_t *ticket); + +#if defined(DEBUG) +STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); +STATIC void xlog_verify_grant_tail(struct log *log); +STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, + int count, boolean_t syncing); +STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, + xfs_lsn_t tail_lsn); +#else +#define xlog_verify_dest_ptr(a,b) +#define xlog_verify_grant_tail(a) +#define xlog_verify_iclog(a,b,c,d) +#define xlog_verify_tail_lsn(a,b,c) +#endif + +STATIC int xlog_iclogs_empty(xlog_t *log); + +static void +xlog_grant_sub_space( + struct log *log, + atomic64_t *head, + int bytes) +{ + int64_t head_val = atomic64_read(head); + int64_t new, old; + + do { + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + space -= bytes; + if (space < 0) { + space += log->l_logsize; + cycle--; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); +} + +static void +xlog_grant_add_space( + struct log *log, + atomic64_t *head, + int bytes) +{ + int64_t head_val = atomic64_read(head); + int64_t new, old; + + do { + int tmp; + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + tmp = log->l_logsize - space; + 
if (tmp > bytes) + space += bytes; + else { + space = bytes - tmp; + cycle++; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); +} + +static void +xlog_tic_reset_res(xlog_ticket_t *tic) +{ + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + tic->t_res_num_ophdrs = 0; +} + +static void +xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) +{ + if (tic->t_res_num == XLOG_TIC_LEN_MAX) { + /* add to overflow and start again */ + tic->t_res_o_flow += tic->t_res_arr_sum; + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + } + + tic->t_res_arr[tic->t_res_num].r_len = len; + tic->t_res_arr[tic->t_res_num].r_type = type; + tic->t_res_arr_sum += len; + tic->t_res_num++; +} + +/* + * NOTES: + * + * 1. currblock field gets updated at startup and after in-core logs + * marked as with WANT_SYNC. + */ + +/* + * This routine is called when a user of a log manager ticket is done with + * the reservation. If the ticket was ever used, then a commit record for + * the associated transaction is written out as a log operation header with + * no data. The flag XLOG_TIC_INITED is set when the first write occurs with + * a given ticket. If the ticket was one with a permanent reservation, then + * a few operations are done differently. Permanent reservation tickets by + * default don't release the reservation. They just commit the current + * transaction with the belief that the reservation is still needed. A flag + * must be passed in before permanent reservations are actually released. + * When these type of tickets are not released, they need to be set into + * the inited state again. By doing this, a start record will be written + * out when the next write occurs. + */ +xfs_lsn_t +xfs_log_done( + struct xfs_mount *mp, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + uint flags) +{ + struct log *log = mp->m_log; + xfs_lsn_t lsn = 0; + + if (XLOG_FORCED_SHUTDOWN(log) || + /* + * If nothing was ever written, don't write out commit record. + * If we get an error, just continue and give back the log ticket. + */ + (((ticket->t_flags & XLOG_TIC_INITED) == 0) && + (xlog_commit_record(log, ticket, iclog, &lsn)))) { + lsn = (xfs_lsn_t) -1; + if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { + flags |= XFS_LOG_REL_PERM_RESERV; + } + } + + + if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || + (flags & XFS_LOG_REL_PERM_RESERV)) { + trace_xfs_log_done_nonperm(log, ticket); + + /* + * Release ticket if not permanent reservation or a specific + * request has been made to release a permanent reservation. + */ + xlog_ungrant_log_space(log, ticket); + xfs_log_ticket_put(ticket); + } else { + trace_xfs_log_done_perm(log, ticket); + + xlog_regrant_reserve_log_space(log, ticket); + /* If this ticket was a permanent reservation and we aren't + * trying to release it, reset the inited flags; so next time + * we write, a start record will be written out. + */ + ticket->t_flags |= XLOG_TIC_INITED; + } + + return lsn; +} + +/* + * Attaches a new iclog I/O completion callback routine during + * transaction commit. If the log is in error state, a non-zero + * return code is handed back and the caller is responsible for + * executing the callback at an appropriate time. 
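[Editorial aside, not part of the patch] xlog_grant_sub_space() and xlog_grant_add_space() above keep the grant head as a single 64-bit value packing a cycle count and a byte offset, and update it lock-free with an atomic compare-and-swap retry loop. The standalone sketch below shows the same pattern using C11 atomics; the plain shift/mask packing stands in for the kernel's xlog_crack_grant_head_val()/xlog_assign_grant_head_val() helpers, and the names are illustrative.

#include <stdatomic.h>
#include <stdint.h>

/* head holds <cycle:32 | bytes:32>; add "bytes", wrapping at logsize */
static void grant_add_space(_Atomic int64_t *head, int logsize, int bytes)
{
	int64_t old = atomic_load(head);
	int64_t new;

	do {
		int cycle = (int)(old >> 32);
		int space = (int)(old & 0xffffffff);

		space += bytes;
		if (space >= logsize) {		/* wrapped past the end */
			space -= logsize;
			cycle++;
		}
		new = ((int64_t)cycle << 32) | (uint32_t)space;
		/* retry if another CPU moved the head underneath us */
	} while (!atomic_compare_exchange_weak(head, &old, new));
}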
+ */ +int +xfs_log_notify( + struct xfs_mount *mp, + struct xlog_in_core *iclog, + xfs_log_callback_t *cb) +{ + int abortflg; + + spin_lock(&iclog->ic_callback_lock); + abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); + if (!abortflg) { + ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || + (iclog->ic_state == XLOG_STATE_WANT_SYNC)); + cb->cb_next = NULL; + *(iclog->ic_callback_tail) = cb; + iclog->ic_callback_tail = &(cb->cb_next); + } + spin_unlock(&iclog->ic_callback_lock); + return abortflg; +} + +int +xfs_log_release_iclog( + struct xfs_mount *mp, + struct xlog_in_core *iclog) +{ + if (xlog_state_release_iclog(mp->m_log, iclog)) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return EIO; + } + + return 0; +} + +/* + * 1. Reserve an amount of on-disk log space and return a ticket corresponding + * to the reservation. + * 2. Potentially, push buffers at tail of log to disk. + * + * Each reservation is going to reserve extra space for a log record header. + * When writes happen to the on-disk log, we don't subtract the length of the + * log record header from any reservation. By wasting space in each + * reservation, we prevent over allocation problems. + */ +int +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticket, + __uint8_t client, + uint flags, + uint t_type) +{ + struct log *log = mp->m_log; + struct xlog_ticket *internal_ticket; + int retval = 0; + + ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_try_logspace); + + + if (*ticket != NULL) { + ASSERT(flags & XFS_LOG_PERM_RESERV); + internal_ticket = *ticket; + + /* + * this is a new transaction on the ticket, so we need to + * change the transaction ID so that the next transaction has a + * different TID in the log. Just add one to the existing tid + * so that we can see chains of rolling transactions in the log + * easily. + */ + internal_ticket->t_tid++; + + trace_xfs_log_reserve(log, internal_ticket); + + xlog_grant_push_ail(log, internal_ticket->t_unit_res); + retval = xlog_regrant_write_log_space(log, internal_ticket); + } else { + /* may sleep if need to allocate more tickets */ + internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, + client, flags, + KM_SLEEP|KM_MAYFAIL); + if (!internal_ticket) + return XFS_ERROR(ENOMEM); + internal_ticket->t_trans_type = t_type; + *ticket = internal_ticket; + + trace_xfs_log_reserve(log, internal_ticket); + + xlog_grant_push_ail(log, + (internal_ticket->t_unit_res * + internal_ticket->t_cnt)); + retval = xlog_grant_log_space(log, internal_ticket); + } + + return retval; +} /* xfs_log_reserve */ + + +/* + * Mount a log filesystem + * + * mp - ubiquitous xfs mount point structure + * log_target - buftarg of on-disk log device + * blk_offset - Start block # where block size is 512 bytes (BBSIZE) + * num_bblocks - Number of BBSIZE blocks in on-disk log + * + * Return error or zero. + */ +int +xfs_log_mount( + xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + int error; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + xfs_notice(mp, "Mounting Filesystem"); + else { + xfs_notice(mp, +"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + } + + mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (IS_ERR(mp->m_log)) { + error = -PTR_ERR(mp->m_log); + goto out; + } + + /* + * Initialize the AIL now we have a log. 
+ */ + error = xfs_trans_ail_init(mp); + if (error) { + xfs_warn(mp, "AIL initialisation failed: error %d", error); + goto out_free_log; + } + mp->m_log->l_ailp = mp->m_ail; + + /* + * skip log recovery on a norecovery mount. pretend it all + * just worked. + */ + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); + + if (readonly) + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + error = xlog_recover(mp->m_log); + + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (error) { + xfs_warn(mp, "log mount/recovery failed: error %d", + error); + goto out_destroy_ail; + } + } + + /* Normal transactions can now occur */ + mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + + /* + * Now the log has been fully initialised and we know were our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. + */ + xlog_cil_init_post_recovery(mp->m_log); + + return 0; + +out_destroy_ail: + xfs_trans_ail_destroy(mp); +out_free_log: + xlog_dealloc_log(mp->m_log); +out: + return error; +} + +/* + * Finish the recovery of the file system. This is separate from + * the xfs_log_mount() call, because it depends on the code in + * xfs_mountfs() to read in the root and real-time bitmap inodes + * between calling xfs_log_mount() and here. + * + * mp - ubiquitous xfs mount point structure + */ +int +xfs_log_mount_finish(xfs_mount_t *mp) +{ + int error; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + error = xlog_recover_finish(mp->m_log); + else { + error = 0; + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + } + + return error; +} + +/* + * Final log writes as part of unmount. + * + * Mark the filesystem clean as unmount happens. Note that during relocation + * this routine needs to be executed as part of source-bag while the + * deallocation must not be done until source-end. + */ + +/* + * Unmount record used to have a string "Unmount filesystem--" in the + * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). + * We just write the magic number now since that particular field isn't + * currently architecture converted and "nUmount" is a bit foo. + * As far as I know, there weren't any dependencies on the old behaviour. + */ + +int +xfs_log_unmount_write(xfs_mount_t *mp) +{ + xlog_t *log = mp->m_log; + xlog_in_core_t *iclog; +#ifdef DEBUG + xlog_in_core_t *first_iclog; +#endif + xlog_ticket_t *tic = NULL; + xfs_lsn_t lsn; + int error; + + /* + * Don't write out unmount record on read-only mounts. + * Or, if we are doing a forced umount (typically because of IO errors). + */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); + +#ifdef DEBUG + first_iclog = iclog = log->l_iclog; + do { + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } + iclog = iclog->ic_next; + } while (iclog != first_iclog); +#endif + if (! 
(XLOG_FORCED_SHUTDOWN(log))) { + error = xfs_log_reserve(mp, 600, 1, &tic, + XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); + if (!error) { + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { + .magic = XLOG_UNMOUNT_TYPE, + }; + struct xfs_log_iovec reg = { + .i_addr = &magic, + .i_len = sizeof(magic), + .i_type = XLOG_REG_TYPE_UNMOUNT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + + /* remove inited flag */ + tic->t_flags = 0; + error = xlog_write(log, &vec, tic, &lsn, + NULL, XLOG_UNMOUNT_TRANS); + /* + * At this point, we're umounting anyway, + * so there's no point in transitioning log state + * to IOERROR. Just continue... + */ + } + + if (error) + xfs_alert(mp, "%s: unmount record failed", __func__); + + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + atomic_inc(&iclog->ic_refcnt); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + + spin_lock(&log->l_icloglock); + if (!(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY)) { + if (!XLOG_FORCED_SHUTDOWN(log)) { + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + } else { + spin_unlock(&log->l_icloglock); + } + if (tic) { + trace_xfs_log_umount_write(log, tic); + xlog_ungrant_log_space(log, tic); + xfs_log_ticket_put(tic); + } + } else { + /* + * We're already in forced_shutdown mode, couldn't + * even attempt to write out the unmount transaction. + * + * Go through the motions of sync'ing and releasing + * the iclog, even though no I/O will actually happen, + * we need to wait for other log I/Os that may already + * be in progress. Do this as a separate section of + * code so we'll know if we ever get stuck here that + * we're in this odd situation of trying to unmount + * a file system that went into forced_shutdown as + * the result of an unmount.. + */ + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + atomic_inc(&iclog->ic_refcnt); + + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + + spin_lock(&log->l_icloglock); + + if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE + || iclog->ic_state == XLOG_STATE_DIRTY + || iclog->ic_state == XLOG_STATE_IOERROR) ) { + + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + } + + return error; +} /* xfs_log_unmount_write */ + +/* + * Deallocate log structures for unmount/relocation. + * + * We need to stop the aild from running before we destroy + * and deallocate the log as the aild references the log. + */ +void +xfs_log_unmount(xfs_mount_t *mp) +{ + xfs_trans_ail_destroy(mp); + xlog_dealloc_log(mp->m_log); +} + +void +xfs_log_item_init( + struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops) +{ + item->li_mountp = mp; + item->li_ailp = mp->m_ail; + item->li_type = type; + item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); +} + +/* + * Write region vectors to log. The write happens using the space reservation + * of the ticket (tic). It is not a requirement that all writes for a given + * transaction occur with one call to xfs_log_write(). 
However, it is important + * to note that the transaction reservation code makes an assumption about the + * number of log headers a transaction requires that may be violated if you + * don't pass all the transaction vectors in one call.... + */ +int +xfs_log_write( + struct xfs_mount *mp, + struct xfs_log_iovec reg[], + int nentries, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn) +{ + struct log *log = mp->m_log; + int error; + struct xfs_log_vec vec = { + .lv_niovecs = nentries, + .lv_iovecp = reg, + }; + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return error; +} + +void +xfs_log_move_tail(xfs_mount_t *mp, + xfs_lsn_t tail_lsn) +{ + xlog_ticket_t *tic; + xlog_t *log = mp->m_log; + int need_bytes, free_bytes; + + if (XLOG_FORCED_SHUTDOWN(log)) + return; + + if (tail_lsn == 0) + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + + /* tail_lsn == 1 implies that we weren't passed a valid value. */ + if (tail_lsn != 1) + atomic64_set(&log->l_tail_lsn, tail_lsn); + + if (!list_empty_careful(&log->l_writeq)) { +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("Recovery problem"); +#endif + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(tic, &log->l_writeq, t_queue) { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + + if (free_bytes < tic->t_unit_res && tail_lsn != 1) + break; + tail_lsn = 0; + free_bytes -= tic->t_unit_res; + trace_xfs_log_regrant_write_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_write_lock); + } + + if (!list_empty_careful(&log->l_reserveq)) { +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("Recovery problem"); +#endif + spin_lock(&log->l_grant_reserve_lock); + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + list_for_each_entry(tic, &log->l_reserveq, t_queue) { + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + need_bytes = tic->t_unit_res*tic->t_cnt; + else + need_bytes = tic->t_unit_res; + if (free_bytes < need_bytes && tail_lsn != 1) + break; + tail_lsn = 0; + free_bytes -= need_bytes; + trace_xfs_log_grant_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_reserve_lock); + } +} + +/* + * Determine if we have a transaction that has gone to disk + * that needs to be covered. To begin the transition to the idle state + * firstly the log needs to be idle (no AIL and nothing in the iclogs). + * If we are then in a state where covering is needed, the caller is informed + * that dummy transactions are required to move the log into the idle state. + * + * Because this is called as part of the sync process, we should also indicate + * that dummy transactions should be issued in anything but the covered or + * idle states. This ensures that the log tail is accurately reflected in + * the log at the end of the sync, hence if a crash occurrs avoids replay + * of transactions where the metadata is already on disk. 
+ */ +int +xfs_log_need_covered(xfs_mount_t *mp) +{ + int needed = 0; + xlog_t *log = mp->m_log; + + if (!xfs_fs_writable(mp)) + return 0; + + spin_lock(&log->l_icloglock); + switch (log->l_covered_state) { + case XLOG_STATE_COVER_DONE: + case XLOG_STATE_COVER_DONE2: + case XLOG_STATE_COVER_IDLE: + break; + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + if (!xfs_ail_min_lsn(log->l_ailp) && + xlog_iclogs_empty(log)) { + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; + } + /* FALLTHRU */ + default: + needed = 1; + break; + } + spin_unlock(&log->l_icloglock); + return needed; +} + +/****************************************************************************** + * + * local routines + * + ****************************************************************************** + */ + +/* xfs_trans_tail_ail returns 0 when there is nothing in the list. + * The log manager must keep track of the last LR which was committed + * to disk. The lsn of this LR will become the new tail_lsn whenever + * xfs_trans_tail_ail returns 0. If we don't do this, we run into + * the situation where stuff could be written into the log but nothing + * was ever in the AIL when asked. Eventually, we panic since the + * tail hits the head. + * + * We may be holding the log iclog lock upon entering this routine. + */ +xfs_lsn_t +xlog_assign_tail_lsn( + struct xfs_mount *mp) +{ + xfs_lsn_t tail_lsn; + struct log *log = mp->m_log; + + tail_lsn = xfs_ail_min_lsn(mp->m_ail); + if (!tail_lsn) + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + + atomic64_set(&log->l_tail_lsn, tail_lsn); + return tail_lsn; +} + +/* + * Return the space in the log between the tail and the head. The head + * is passed in the cycle/bytes formal parms. In the special case where + * the reserve head has wrapped passed the tail, this calculation is no + * longer valid. In this case, just return 0 which means there is no space + * in the log. This works for all places where this function is called + * with the reserve head. Of course, if the write head were to ever + * wrap the tail, we should blow up. Rather than catch this case here, + * we depend on other ASSERTions in other parts of the code. XXXmiken + * + * This code also handles the case where the reservation head is behind + * the tail. The details of this case are described below, but the end + * result is that we return the size of the log as the amount of space left. + */ +STATIC int +xlog_space_left( + struct log *log, + atomic64_t *head) +{ + int free_bytes; + int tail_bytes; + int tail_cycle; + int head_cycle; + int head_bytes; + + xlog_crack_grant_head(head, &head_cycle, &head_bytes); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); + tail_bytes = BBTOB(tail_bytes); + if (tail_cycle == head_cycle && head_bytes >= tail_bytes) + free_bytes = log->l_logsize - (head_bytes - tail_bytes); + else if (tail_cycle + 1 < head_cycle) + return 0; + else if (tail_cycle < head_cycle) { + ASSERT(tail_cycle == (head_cycle - 1)); + free_bytes = tail_bytes - head_bytes; + } else { + /* + * The reservation head is behind the tail. + * In this case we just want to return the size of the + * log as the amount of space left. 
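[Editorial aside, not part of the patch] xlog_space_left() above computes the free space between the log tail and a grant head, where both positions are (cycle, bytes) pairs and the head is at most one cycle ahead of the tail in a healthy log. A compact standalone restatement of the case analysis, with illustrative names:

static int log_space_left(int logsize,
			  int tail_cycle, int tail_bytes,
			  int head_cycle, int head_bytes)
{
	if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
		return logsize - (head_bytes - tail_bytes); /* same pass */
	if (tail_cycle + 1 < head_cycle)
		return 0;			/* head lapped the tail: log full */
	if (tail_cycle < head_cycle)		/* head exactly one cycle ahead */
		return tail_bytes - head_bytes;
	/* head behind tail: treated as an error, report the whole log free */
	return logsize;
}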
+ */ + xfs_alert(log->l_mp, + "xlog_space_left: head behind tail\n" + " tail_cycle = %d, tail_bytes = %d\n" + " GH cycle = %d, GH bytes = %d", + tail_cycle, tail_bytes, head_cycle, head_bytes); + ASSERT(0); + free_bytes = log->l_logsize; + } + return free_bytes; +} + + +/* + * Log function which is called when an io completes. + * + * The log manager needs its own routine, in order to control what + * happens with the buffer after the write completes. + */ +void +xlog_iodone(xfs_buf_t *bp) +{ + xlog_in_core_t *iclog; + xlog_t *l; + int aborted; + + iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); + ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + aborted = 0; + l = iclog->ic_log; + + /* + * Race to shutdown the filesystem if we see an error. + */ + if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, + XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { + xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); + XFS_BUF_STALE(bp); + xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); + /* + * This flag will be propagated to the trans-committed + * callback routines to let them know that the log-commit + * didn't succeed. + */ + aborted = XFS_LI_ABORTED; + } else if (iclog->ic_state & XLOG_STATE_IOERROR) { + aborted = XFS_LI_ABORTED; + } + + /* log I/O is always issued ASYNC */ + ASSERT(XFS_BUF_ISASYNC(bp)); + xlog_state_done_syncing(iclog, aborted); + /* + * do not reference the buffer (bp) here as we could race + * with it being freed after writing the unmount record to the + * log. + */ + +} /* xlog_iodone */ + +/* + * Return size of each in-core log record buffer. + * + * All machines get 8 x 32kB buffers by default, unless tuned otherwise. + * + * If the filesystem blocksize is too large, we may need to choose a + * larger size since the directory code currently logs entire blocks. + */ + +STATIC void +xlog_get_iclog_buffer_size(xfs_mount_t *mp, + xlog_t *log) +{ + int size; + int xhdrs; + + if (mp->m_logbufs <= 0) + log->l_iclog_bufs = XLOG_MAX_ICLOGS; + else + log->l_iclog_bufs = mp->m_logbufs; + + /* + * Buffer size passed in from mount system call. + */ + if (mp->m_logbsize > 0) { + size = log->l_iclog_size = mp->m_logbsize; + log->l_iclog_size_log = 0; + while (size != 1) { + log->l_iclog_size_log++; + size >>= 1; + } + + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + /* # headers = size / 32k + * one header holds cycles from 32k of data + */ + + xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; + if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + log->l_iclog_hsize = xhdrs << BBSHIFT; + log->l_iclog_heads = xhdrs; + } else { + ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE); + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + } + goto done; + } + + /* All machines use 32kB buffers by default. */ + log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; + log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; + + /* the default log size is 16k or 32k which is one header sector */ + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + +done: + /* are we being asked to make the sizes selected above visible? */ + if (mp->m_logbufs == 0) + mp->m_logbufs = log->l_iclog_bufs; + if (mp->m_logbsize == 0) + mp->m_logbsize = log->l_iclog_size; +} /* xlog_get_iclog_buffer_size */ + + +/* + * This routine initializes some of the log structure for a given mount point. + * Its primary purpose is to fill in enough, so recovery can occur. However, + * some other stuff may be filled in too. 
+ */ +STATIC xlog_t * +xlog_alloc_log(xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + xlog_t *log; + xlog_rec_header_t *head; + xlog_in_core_t **iclogp; + xlog_in_core_t *iclog, *prev_iclog=NULL; + xfs_buf_t *bp; + int i; + int error = ENOMEM; + uint log2_size = 0; + + log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); + if (!log) { + xfs_warn(mp, "Log allocation failed: No memory!"); + goto out; + } + + log->l_mp = mp; + log->l_targ = log_target; + log->l_logsize = BBTOB(num_bblks); + log->l_logBBstart = blk_offset; + log->l_logBBsize = num_bblks; + log->l_covered_state = XLOG_STATE_COVER_IDLE; + log->l_flags |= XLOG_ACTIVE_RECOVERY; + + log->l_prev_block = -1; + /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); + log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ + xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); + xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); + INIT_LIST_HEAD(&log->l_reserveq); + INIT_LIST_HEAD(&log->l_writeq); + spin_lock_init(&log->l_grant_reserve_lock); + spin_lock_init(&log->l_grant_write_lock); + + error = EFSCORRUPTED; + if (xfs_sb_version_hassector(&mp->m_sb)) { + log2_size = mp->m_sb.sb_logsectlog; + if (log2_size < BBSHIFT) { + xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", + log2_size, BBSHIFT); + goto out_free_log; + } + + log2_size -= BBSHIFT; + if (log2_size > mp->m_sectbb_log) { + xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)", + log2_size, mp->m_sectbb_log); + goto out_free_log; + } + + /* for larger sector sizes, must have v2 or external log */ + if (log2_size && log->l_logBBstart > 0 && + !xfs_sb_version_haslogv2(&mp->m_sb)) { + xfs_warn(mp, + "log sector size (0x%x) invalid for configuration.", + log2_size); + goto out_free_log; + } + } + log->l_sectBBsize = 1 << log2_size; + + xlog_get_iclog_buffer_size(mp, log); + + error = ENOMEM; + bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + if (!bp) + goto out_free_log; + XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + log->l_xbuf = bp; + + spin_lock_init(&log->l_icloglock); + init_waitqueue_head(&log->l_flush_wait); + + /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ + ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); + + iclogp = &log->l_iclog; + /* + * The amount of memory to allocate for the iclog structure is + * rather funky due to the way the structure is defined. It is + * done this way so that we can use different sizes for machines + * with different amounts of memory. See the definition of + * xlog_in_core_t in xfs_log_priv.h for details. 
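+ * + * (For illustration: with the default of 8 iclog buffers the loop + * below allocates iclogs 0..7, chains each ic_next to the following + * iclog and finally points the last ic_next back at the first, so + * the iclogs form a fixed ring that is reused in order.)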
+ */ + ASSERT(log->l_iclog_size >= 4096); + for (i=0; i < log->l_iclog_bufs; i++) { + *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); + if (!*iclogp) + goto out_free_iclog; + + iclog = *iclogp; + iclog->ic_prev = prev_iclog; + prev_iclog = iclog; + + bp = xfs_buf_get_uncached(mp->m_logdev_targp, + log->l_iclog_size, 0); + if (!bp) + goto out_free_iclog; + if (!XFS_BUF_CPSEMA(bp)) + ASSERT(0); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + iclog->ic_bp = bp; + iclog->ic_data = bp->b_addr; +#ifdef DEBUG + log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); +#endif + head = &iclog->ic_header; + memset(head, 0, sizeof(xlog_rec_header_t)); + head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + head->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + head->h_size = cpu_to_be32(log->l_iclog_size); + /* new fields */ + head->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + + iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_log = log; + atomic_set(&iclog->ic_refcnt, 0); + spin_lock_init(&iclog->ic_callback_lock); + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; + + ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); + ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); + init_waitqueue_head(&iclog->ic_force_wait); + init_waitqueue_head(&iclog->ic_write_wait); + + iclogp = &iclog->ic_next; + } + *iclogp = log->l_iclog; /* complete ring */ + log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; + return log; + +out_free_iclog: + for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { + prev_iclog = iclog->ic_next; + if (iclog->ic_bp) + xfs_buf_free(iclog->ic_bp); + kmem_free(iclog); + } + spinlock_destroy(&log->l_icloglock); + xfs_buf_free(log->l_xbuf); +out_free_log: + kmem_free(log); +out: + return ERR_PTR(-error); +} /* xlog_alloc_log */ + + +/* + * Write out the commit record of a transaction associated with the given + * ticket. Return the lsn of the commit record. + */ +STATIC int +xlog_commit_record( + struct log *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp) +{ + struct xfs_mount *mp = log->l_mp; + int error; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = &reg, + }; + + ASSERT_ALWAYS(iclog); + error = xlog_write(log, &vec, ticket, commitlsnp, iclog, + XLOG_COMMIT_TRANS); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return error; +} + +/* + * Push on the buffer cache code if we ever use more than 75% of the on-disk + * log space. This code pushes on the lsn which would supposedly free up + * the 25% which we want to leave free. We may need to adopt a policy which + * pushes on an lsn which is further along in the log once we reach the high + * water mark. In this manner, we would be creating a low water mark.
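+ * + * (Worked example, illustrative numbers: for a 128MB log, l_logBBsize + * is 262144 basic blocks, so the threshold below works out to + * max(BTOBB(need_bytes), 262144 >> 2, 256) = 65536 blocks; once less + * than a quarter of the log is free we push the AIL to the current + * tail lsn plus 65536 blocks.)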
+ */ +STATIC void +xlog_grant_push_ail( + struct log *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn = 0; + xfs_lsn_t last_sync_lsn; + int free_blocks; + int free_bytes; + int threshold_block; + int threshold_cycle; + int free_threshold; + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. + */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks >= free_threshold) + return; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); + threshold_block += free_threshold; + if (threshold_block >= log->l_logBBsize) { + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; + } + threshold_lsn = xlog_assign_lsn(threshold_cycle, + threshold_block); + /* + * Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. Use a snapshot of the last sync lsn + * so that it doesn't change between the compare and the set. + */ + last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; + + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. + */ + if (!XLOG_FORCED_SHUTDOWN(log)) + xfs_ail_push(log->l_ailp, threshold_lsn); +} + +/* + * The bdstrat callback function for log bufs. This gives us a central + * place to trap bufs in case we get hit by a log I/O error and need to + * shutdown. Actually, in practice, even when we didn't get a log error, + * we transition the iclogs to IOERROR state *after* flushing all existing + * iclogs to disk. This is because we don't want any more new transactions to be + * started or completed afterwards. + */ +STATIC int +xlog_bdstrat( + struct xfs_buf *bp) +{ + struct xlog_in_core *iclog; + + iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); + if (iclog->ic_state & XLOG_STATE_IOERROR) { + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_STALE(bp); + xfs_buf_ioend(bp, 0); + /* + * It would seem logical to return EIO here, but we rely on + * the log state machine to propagate I/O errors instead of + * doing it here. + */ + return 0; + } + + bp->b_flags |= _XBF_RUN_QUEUES; + xfs_buf_iorequest(bp); + return 0; +} + +/* + * Flush out the in-core log (iclog) to the on-disk log in an asynchronous + * fashion. Previously, we should have moved the current iclog + * ptr in the log to point to the next available iclog. This allows further + * writes to continue while this code syncs out an iclog ready to go. + * Before an in-core log can be written out, the data section must be scanned + * to save away the 1st word of each BBSIZE block into the header. We replace + * it with the current cycle count. Each BBSIZE block is tagged with the + * cycle count because there is an implicit assumption that drives will + * guarantee that entire 512 byte blocks get written at once. In other words, + * we can't have part of a 512 byte block written and part not written. By + * tagging each block, we will know which blocks are valid when recovering + * after an unclean shutdown. + * + * This routine is single threaded on the iclog.
No other thread can be in + * this routine with the same iclog. Changing contents of iclog can there- + * fore be done without grabbing the state machine lock. Updating the global + * log will require grabbing the lock though. + * + * The entire log manager uses a logical block numbering scheme. Only + * log_sync (and then only bwrite()) know about the fact that the log may + * not start with block zero on a given device. The log block start offset + * is added immediately before calling bwrite(). + */ + +STATIC int +xlog_sync(xlog_t *log, + xlog_in_core_t *iclog) +{ + xfs_caddr_t dptr; /* pointer to byte sized element */ + xfs_buf_t *bp; + int i; + uint count; /* byte count of bwrite */ + uint count_init; /* initial count before roundup */ + int roundoff; /* roundoff to BB or stripe */ + int split = 0; /* split write into two regions */ + int error; + int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + + XFS_STATS_INC(xs_log_writes); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + + /* Add for LR header */ + count_init = log->l_iclog_hsize + iclog->ic_offset; + + /* Round out the log write size */ + if (v2 && log->l_mp->m_sb.sb_logsunit > 1) { + /* we have a v2 stripe unit to use */ + count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); + } else { + count = BBTOB(BTOBB(count_init)); + } + roundoff = count - count_init; + ASSERT(roundoff >= 0); + ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && + roundoff < log->l_mp->m_sb.sb_logsunit) + || + (log->l_mp->m_sb.sb_logsunit <= 1 && + roundoff < BBTOB(1))); + + /* move grant heads by roundoff in sync */ + xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); + xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); + + /* put cycle number in every block */ + xlog_pack_data(log, iclog, roundoff); + + /* real byte length */ + if (v2) { + iclog->ic_header.h_len = + cpu_to_be32(iclog->ic_offset + roundoff); + } else { + iclog->ic_header.h_len = + cpu_to_be32(iclog->ic_offset); + } + + bp = iclog->ic_bp; + ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); + XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); + + XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); + + /* Do we need to split this write into 2 parts? */ + if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); + count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); + iclog->ic_bwritecnt = 2; /* split into 2 writes */ + } else { + iclog->ic_bwritecnt = 1; + } + XFS_BUF_SET_COUNT(bp, count); + XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; + + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an external log device, flush the data device + * before flushing the log to make sure all meta data + * written back from the AIL actually made it to disk + * before writing out the new log tail LSN in the log buffer. + */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + XFS_BUF_ORDERED(bp); + } + + ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + xlog_verify_iclog(log, iclog, count, B_TRUE); + + /* account for log which doesn't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + /* + * Don't call xfs_bwrite here. 
We do log-syncs even when the filesystem + * is shutting down. + */ + XFS_BUF_WRITE(bp); + + if ((error = xlog_bdstrat(bp))) { + xfs_ioerror_alert("xlog_sync", log->l_mp, bp, + XFS_BUF_ADDR(bp)); + return error; + } + if (split) { + bp = iclog->ic_log->l_xbuf; + ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == + (unsigned long)1); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); + XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ + XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ + (__psint_t)count), split); + XFS_BUF_SET_FSPRIVATE(bp, iclog); + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + XFS_BUF_ORDERED(bp); + dptr = XFS_BUF_PTR(bp); + /* + * Bump the cycle numbers at the start of each block + * since this part of the buffer is at the start of + * a new cycle. Watch out for the header magic number + * case, though. + */ + for (i = 0; i < split; i += BBSIZE) { + be32_add_cpu((__be32 *)dptr, 1); + if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) + be32_add_cpu((__be32 *)dptr, 1); + dptr += BBSIZE; + } + + ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + /* account for internal log which doesn't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + XFS_BUF_WRITE(bp); + if ((error = xlog_bdstrat(bp))) { + xfs_ioerror_alert("xlog_sync (split)", log->l_mp, + bp, XFS_BUF_ADDR(bp)); + return error; + } + } + return 0; +} /* xlog_sync */ + + +/* + * Deallocate a log structure + */ +STATIC void +xlog_dealloc_log(xlog_t *log) +{ + xlog_in_core_t *iclog, *next_iclog; + int i; + + xlog_cil_destroy(log); + + /* + * always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. + */ + xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); + xfs_buf_free(log->l_xbuf); + + iclog = log->l_iclog; + for (i=0; i < log->l_iclog_bufs; i++) { + xfs_buf_free(iclog->ic_bp); + next_iclog = iclog->ic_next; + kmem_free(iclog); + iclog = next_iclog; + } + spinlock_destroy(&log->l_icloglock); + + log->l_mp->m_log = NULL; + kmem_free(log); +} /* xlog_dealloc_log */ + +/* + * Update counters atomically now that memcpy is done.
+ */ +/* ARGSUSED */ +static inline void +xlog_state_finish_copy(xlog_t *log, + xlog_in_core_t *iclog, + int record_cnt, + int copy_bytes) +{ + spin_lock(&log->l_icloglock); + + be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); + iclog->ic_offset += copy_bytes; + + spin_unlock(&log->l_icloglock); +} /* xlog_state_finish_copy */ + + + + +/* + * print out info relating to regions written which consume + * the reservation + */ +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) +{ + uint i; + uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); + + /* match with XLOG_REG_TYPE_* in xfs_log.h */ + static char *res_type_str[XLOG_REG_TYPE_MAX] = { + "bformat", + "bchunk", + "efi_format", + "efd_format", + "iformat", + "icore", + "iext", + "ibroot", + "ilocal", + "iattr_ext", + "iattr_broot", + "iattr_local", + "qformat", + "dquot", + "quotaoff", + "LR header", + "unmount", + "commit", + "trans header" + }; + static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { + "SETATTR_NOT_SIZE", + "SETATTR_SIZE", + "INACTIVE", + "CREATE", + "CREATE_TRUNC", + "TRUNCATE_FILE", + "REMOVE", + "LINK", + "RENAME", + "MKDIR", + "RMDIR", + "SYMLINK", + "SET_DMATTRS", + "GROWFS", + "STRAT_WRITE", + "DIOSTRAT", + "WRITE_SYNC", + "WRITEID", + "ADDAFORK", + "ATTRINVAL", + "ATRUNCATE", + "ATTR_SET", + "ATTR_RM", + "ATTR_FLAG", + "CLEAR_AGI_BUCKET", + "QM_SBCHANGE", + "DUMMY1", + "DUMMY2", + "QM_QUOTAOFF", + "QM_DQALLOC", + "QM_SETQLIM", + "QM_DQCLUSTER", + "QM_QINOCREATE", + "QM_QUOTAOFF_END", + "SB_UNIT", + "FSYNC_TS", + "GROWFSRT_ALLOC", + "GROWFSRT_ZERO", + "GROWFSRT_FREE", + "SWAPEXT" + }; + + xfs_warn(mp, + "xfs_log_write: reservation summary:\n" + " trans type = %s (%u)\n" + " unit res = %d bytes\n" + " current res = %d bytes\n" + " total reg = %u bytes (o/flow = %u bytes)\n" + " ophdrs = %u (ophdr space = %u bytes)\n" + " ophdr + reg = %u bytes\n" + " num regions = %u\n", + ((ticket->t_trans_type <= 0 || + ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? + "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), + ticket->t_trans_type, + ticket->t_unit_res, + ticket->t_curr_res, + ticket->t_res_arr_sum, ticket->t_res_o_flow, + ticket->t_res_num_ophdrs, ophdr_spc, + ticket->t_res_arr_sum + + ticket->t_res_o_flow + ophdr_spc, + ticket->t_res_num); + + for (i = 0; i < ticket->t_res_num; i++) { + uint r_type = ticket->t_res_arr[i].r_type; + xfs_warn(mp, "region[%u]: %s - %u bytes\n", i, + ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? + "bad-rtype" : res_type_str[r_type-1]), + ticket->t_res_arr[i].r_len); + } + + xfs_alert_tag(mp, XFS_PTAG_LOGRES, + "xfs_log_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +} + +/* + * Calculate the potential space needed by the log vector. Each region gets + * its own xlog_op_header_t and may need to be double word aligned. 
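+ * + * (Illustrative example: with XLOG_TIC_INITED still set and a single + * log vector carrying two regions of 28 and 100 bytes, the routine + * below counts three op headers (the start record plus one per + * region), so it returns 128 + 3 * sizeof(struct xlog_op_header) + * and bumps t_res_num_ophdrs by 3.)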
+ */ +static int +xlog_write_calc_vec_length( + struct xlog_ticket *ticket, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + int headers = 0; + int len = 0; + int i; + + /* acct for start rec of xact */ + if (ticket->t_flags & XLOG_TIC_INITED) + headers++; + + for (lv = log_vector; lv; lv = lv->lv_next) { + headers += lv->lv_niovecs; + + for (i = 0; i < lv->lv_niovecs; i++) { + struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + + len += vecp->i_len; + xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); + } + } + + ticket->t_res_num_ophdrs += headers; + len += headers * sizeof(struct xlog_op_header); + + return len; +} + +/* + * If first write for transaction, insert start record. We can't be trying to + * commit if we are inited. We can't have any "partial_copy" if we are inited. + */ +static int +xlog_write_start_rec( + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket) +{ + if (!(ticket->t_flags & XLOG_TIC_INITED)) + return 0; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_len = 0; + ophdr->oh_flags = XLOG_START_TRANS; + ophdr->oh_res2 = 0; + + ticket->t_flags &= ~XLOG_TIC_INITED; + + return sizeof(struct xlog_op_header); +} + +static xlog_op_header_t * +xlog_write_setup_ophdr( + struct log *log, + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket, + uint flags) +{ + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_res2 = 0; + + /* are we copying a commit or unmount record? */ + ophdr->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client ids. This + * makes sure that XFS doesn't generate them. Turn this into an EIO + * and shut down the filesystem. + */ + switch (ophdr->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_warn(log->l_mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + ophdr->oh_clientid, ticket); + return NULL; + } + + return ophdr; +} + +/* + * Set up the parameters of the region copy into the log. This has + * to handle region write split across multiple log buffers - this + * state is kept external to this function so that this code can + * be written in an obvious, self documenting manner.
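+ * + * (Illustrative example: a 6000 byte region with only 2048 bytes left + * in the current iclog is copied as a 2048 byte chunk flagged + * XLOG_CONTINUE_TRANS, the ticket is charged one extra op header for + * the continuation, and the remaining 3952 bytes go into the next + * iclog flagged XLOG_END_TRANS|XLOG_WAS_CONT_TRANS.)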
+ */ +static int +xlog_write_setup_copy( + struct xlog_ticket *ticket, + struct xlog_op_header *ophdr, + int space_available, + int space_required, + int *copy_off, + int *copy_len, + int *last_was_partial_copy, + int *bytes_consumed) +{ + int still_to_copy; + + still_to_copy = space_required - *bytes_consumed; + *copy_off = *bytes_consumed; + + if (still_to_copy <= space_available) { + /* write of region completes here */ + *copy_len = still_to_copy; + ophdr->oh_len = cpu_to_be32(*copy_len); + if (*last_was_partial_copy) + ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + *last_was_partial_copy = 0; + *bytes_consumed = 0; + return 0; + } + + /* partial write of region, needs extra log op header reservation */ + *copy_len = space_available; + ophdr->oh_len = cpu_to_be32(*copy_len); + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + if (*last_was_partial_copy) + ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; + *bytes_consumed += *copy_len; + (*last_was_partial_copy)++; + + /* account for new log op header */ + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_res_num_ophdrs++; + + return sizeof(struct xlog_op_header); +} + +static int +xlog_write_copy_finish( + struct log *log, + struct xlog_in_core *iclog, + uint flags, + int *record_cnt, + int *data_cnt, + int *partial_copy, + int *partial_copy_len, + int log_offset, + struct xlog_in_core **commit_iclog) +{ + if (*partial_copy) { + /* + * This iclog has already been marked WANT_SYNC by + * xlog_state_get_iclog_space. + */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + return xlog_state_release_iclog(log, iclog); + } + + *partial_copy = 0; + *partial_copy_len = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + /* no more space in this iclog - push it. */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + + spin_lock(&log->l_icloglock); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } + + return 0; +} + +/* + * Write some region out to in-core log + * + * This will be called when writing externally provided regions or when + * writing out a commit record for a given transaction. + * + * General algorithm: + * 1. Find total length of this write. This may include adding to the + * lengths passed in. + * 2. Check whether we violate the tickets reservation. + * 3. While writing to this iclog + * A. Reserve as much space in this iclog as can get + * B. If this is first write, save away start lsn + * C. While writing this region: + * 1. If first write of transaction, write start record + * 2. Write log operation header (header per region) + * 3. Find out if we can fit entire region into this iclog + * 4. Potentially, verify destination memcpy ptr + * 5. Memcpy (partial) region + * 6. If partial copy, release iclog; otherwise, continue + * copying more regions into current iclog + * 4. Mark want sync bit (in simulation mode) + * 5. Release iclog for potential flush to on-disk log. + * + * ERRORS: + * 1. Panic if reservation is overrun. This should never happen since + * reservation amounts are generated internal to the filesystem. + * NOTES: + * 1. Tickets are single threaded data structures. + * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the + * syncing routine. 
When a single log_write region needs to span + * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set + * on all log operation writes which don't contain the end of the + * region. The XLOG_END_TRANS bit is used for the in-core log + * operation which contains the end of the continued log_write region. + * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog, + * we don't really know exactly how much space will be used. As a result, + * we don't update ic_offset until the end when we know exactly how many + * bytes have been written out. + */ +int +xlog_write( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags) +{ + struct xlog_in_core *iclog = NULL; + struct xfs_log_iovec *vecp; + struct xfs_log_vec *lv; + int len; + int index; + int partial_copy = 0; + int partial_copy_len = 0; + int contwr = 0; + int record_cnt = 0; + int data_cnt = 0; + int error; + + *start_lsn = 0; + + len = xlog_write_calc_vec_length(ticket, log_vector); + if (log->l_cilp) { + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + } else + ticket->t_curr_res -= len; + + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); + + index = 0; + lv = log_vector; + vecp = lv->lv_iovecp; + while (lv && index < lv->lv_niovecs) { + void *ptr; + int log_offset; + + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; + + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = iclog->ic_datap + log_offset; + + /* start_lsn is the first lsn written to. That's all we need. */ + if (!*start_lsn) + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + + /* + * This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (lv && index < lv->lv_niovecs) { + struct xfs_log_iovec *reg = &vecp[index]; + struct xlog_op_header *ophdr; + int start_rec_copy; + int copy_len; + int copy_off; + + ASSERT(reg->i_len % sizeof(__int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + + start_rec_copy = xlog_write_start_rec(ptr, ticket); + if (start_rec_copy) { + record_cnt++; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + start_rec_copy); + } + + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + if (!ophdr) + return XFS_ERROR(EIO); + + xlog_write_adv_cnt(&ptr, &len, &log_offset, + sizeof(struct xlog_op_header)); + + len += xlog_write_setup_copy(ticket, ophdr, + iclog->ic_size-log_offset, + reg->i_len, + &copy_off, &copy_len, + &partial_copy, + &partial_copy_len); + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + + copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ?
copy_len : 0; + + error = xlog_write_copy_finish(log, iclog, flags, + &record_cnt, &data_cnt, + &partial_copy, + &partial_copy_len, + log_offset, + commit_iclog); + if (error) + return error; + + /* + * if we had a partial copy, we need to get more iclog + * space but we don't want to increment the region + * index because there is still more in this region to + * write. + * + * If we completed writing this region, and we flushed + * the iclog (indicated by resetting of the record + * count), then we also need to get more log space. If + * this was the last record, though, we are done and + * can just return. + */ + if (partial_copy) + break; + + if (++index == lv->lv_niovecs) { + lv = lv->lv_next; + index = 0; + if (lv) + vecp = lv->lv_iovecp; + } + if (record_cnt == 0) { + if (!lv) + return 0; + break; + } + } + } + + ASSERT(len == 0); + + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + return 0; +} + + +/***************************************************************************** + * + * State Machine functions + * + ***************************************************************************** + */ + +/* Clean iclogs starting from the head. This ordering must be + * maintained, so an iclog doesn't become ACTIVE beyond one that + * is SYNCING. This is also required to maintain the notion that we use + * an ordered wait queue to hold off would-be writers to the log when every + * iclog is trying to sync to disk. + * + * State Change: DIRTY -> ACTIVE + */ +STATIC void +xlog_state_clean_log(xlog_t *log) +{ + xlog_in_core_t *iclog; + int changed = 0; + + iclog = log->l_iclog; + do { + if (iclog->ic_state == XLOG_STATE_DIRTY) { + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + ASSERT(iclog->ic_callback == NULL); + /* + * If the number of ops in this iclog indicate it just + * contains the dummy transaction, we can + * change state into IDLE (the second time around). + * Otherwise we should change the state into + * NEED a dummy. + * We don't need to cover the dummy. + */ + if (!changed && + (be32_to_cpu(iclog->ic_header.h_num_logops) == + XLOG_COVER_OPS)) { + changed = 1; + } else { + /* + * We have two dirty iclogs so start over + * This could also be num of ops indicates + * this is not the dummy going out. + */ + changed = 2; + } + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; + } else if (iclog->ic_state == XLOG_STATE_ACTIVE) + /* do nothing */; + else + break; /* stop cleaning */ + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + + /* log is locked when we are called */ + /* + * Change state for the dummy log recording. + * We usually go to NEED. But we go to NEED2 if the changed indicates + * we are done writing the dummy record. + * If we are done with the second dummy record (DONE2), then + * we go to IDLE.
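+ * + * (Illustrative walk-through: an otherwise idle log therefore moves + * NEED -> DONE (in xfs_log_need_covered) -> NEED2 -> DONE2 -> IDLE, + * with one dummy transaction written for each NEED state.)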
+ */ + if (changed) { + switch (log->l_covered_state) { + case XLOG_STATE_COVER_IDLE: + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_NEED2; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE2: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_IDLE; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + default: + ASSERT(0); + } + } +} /* xlog_state_clean_log */ + +STATIC xfs_lsn_t +xlog_get_lowest_lsn( + xlog_t *log) +{ + xlog_in_core_t *lsn_log; + xfs_lsn_t lowest_lsn, lsn; + + lsn_log = log->l_iclog; + lowest_lsn = 0; + do { + if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { + lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); + if ((lsn && !lowest_lsn) || + (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { + lowest_lsn = lsn; + } + } + lsn_log = lsn_log->ic_next; + } while (lsn_log != log->l_iclog); + return lowest_lsn; +} + + +STATIC void +xlog_state_do_callback( + xlog_t *log, + int aborted, + xlog_in_core_t *ciclog) +{ + xlog_in_core_t *iclog; + xlog_in_core_t *first_iclog; /* used to know when we've + * processed all iclogs once */ + xfs_log_callback_t *cb, *cb_next; + int flushcnt = 0; + xfs_lsn_t lowest_lsn; + int ioerrors; /* counter: iclogs with errors */ + int loopdidcallbacks; /* flag: inner loop did callbacks*/ + int funcdidcallbacks; /* flag: function did callbacks */ + int repeats; /* for issuing console warnings if + * looping too many times */ + int wake = 0; + + spin_lock(&log->l_icloglock); + first_iclog = iclog = log->l_iclog; + ioerrors = 0; + funcdidcallbacks = 0; + repeats = 0; + + do { + /* + * Scan all iclogs starting with the one pointed to by the + * log. Reset this starting point each time the log is + * unlocked (during callbacks). + * + * Keep looping through iclogs until one full pass is made + * without running any callbacks. + */ + first_iclog = log->l_iclog; + iclog = log->l_iclog; + loopdidcallbacks = 0; + repeats++; + + do { + + /* skip all iclogs in the ACTIVE & DIRTY states */ + if (iclog->ic_state & + (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { + iclog = iclog->ic_next; + continue; + } + + /* + * Between marking a filesystem SHUTDOWN and stopping + * the log, we do flush all iclogs to disk (if there + * wasn't a log I/O error). So, we do want things to + * go smoothly in case of just a SHUTDOWN w/o a + * LOG_IO_ERROR. + */ + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Can only perform callbacks in order. Since + * this iclog is not in the DONE_SYNC/ + * DO_CALLBACK state, we skip the rest and + * just try to clean up. If we set our iclog + * to DO_CALLBACK, we will not process it when + * we retry since a previous iclog is in the + * CALLBACK and the state cannot change since + * we are holding the l_icloglock. + */ + if (!(iclog->ic_state & + (XLOG_STATE_DONE_SYNC | + XLOG_STATE_DO_CALLBACK))) { + if (ciclog && (ciclog->ic_state == + XLOG_STATE_DONE_SYNC)) { + ciclog->ic_state = XLOG_STATE_DO_CALLBACK; + } + break; + } + /* + * We now have an iclog that is in either the + * DO_CALLBACK or DONE_SYNC states. The other + * states (WANT_SYNC, SYNCING, or CALLBACK were + * caught by the above if and are going to + * clean (i.e. we aren't doing their callbacks) + * see the above if. + */ + + /* + * We will do one more check here to see if we + * have chased our tail around. 
+ */ + + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && + XFS_LSN_CMP(lowest_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { + iclog = iclog->ic_next; + continue; /* Leave this iclog for + * another thread */ + } + + iclog->ic_state = XLOG_STATE_CALLBACK; + + + /* + * update the last_sync_lsn before we drop the + * icloglock to ensure we are the only one that + * can update it. + */ + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); + + } else + ioerrors++; + + spin_unlock(&log->l_icloglock); + + /* + * Keep processing entries in the callback list until + * we come around and it is empty. We need to + * atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more + * callbacks being added. + */ + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + while (cb) { + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_callback = NULL; + spin_unlock(&iclog->ic_callback_lock); + + /* perform callbacks in the order given */ + for (; cb; cb = cb_next) { + cb_next = cb->cb_next; + cb->cb_func(cb->cb_arg, aborted); + } + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + } + + loopdidcallbacks++; + funcdidcallbacks++; + + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_callback == NULL); + spin_unlock(&iclog->ic_callback_lock); + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) + iclog->ic_state = XLOG_STATE_DIRTY; + + /* + * Transition from DIRTY to ACTIVE if applicable. + * NOP if STATE_IOERROR. + */ + xlog_state_clean_log(log); + + /* wake up threads waiting in xfs_log_force() */ + wake_up_all(&iclog->ic_force_wait); + + iclog = iclog->ic_next; + } while (first_iclog != iclog); + + if (repeats > 5000) { + flushcnt += repeats; + repeats = 0; + xfs_warn(log->l_mp, + "%s: possible infinite loop (%d iterations)", + __func__, flushcnt); + } + } while (!ioerrors && loopdidcallbacks); + + /* + * make one last gasp attempt to see if iclogs are being left in + * limbo.. + */ +#ifdef DEBUG + if (funcdidcallbacks) { + first_iclog = iclog = log->l_iclog; + do { + ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); + /* + * Terminate the loop if iclogs are found in states + * which will cause other threads to clean up iclogs. + * + * SYNCING - i/o completion will go through logs + * DONE_SYNC - interrupt thread should be waiting for + * l_icloglock + * IOERROR - give up hope all ye who enter here + */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_DONE_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR ) + break; + iclog = iclog->ic_next; + } while (first_iclog != iclog); + } +#endif + + if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + wake = 1; + spin_unlock(&log->l_icloglock); + + if (wake) + wake_up_all(&log->l_flush_wait); +} + + +/* + * Finish transitioning this iclog to the dirty state. + * + * Make sure that we completely execute this routine only when this is + * the last call to the iclog. There is a good chance that iclog flushes, + * when we reach the end of the physical log, get turned into 2 separate + * calls to bwrite. Hence, one iclog flush could generate two calls to this + * routine. By using the reference count bwritecnt, we guarantee that only + * the second completion goes through. 
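+ * + * (For example: a log write that wraps the physical end of the log is + * issued as two buffer writes with ic_bwritecnt set to 2; the first + * completion below merely drops ic_bwritecnt to 1 and returns, and + * only the second marks the iclog DONE_SYNC and runs the callback + * processing.)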
+ * + * Callbacks could take time, so they are done outside the scope of the + * global state machine log lock. + */ +STATIC void +xlog_state_done_syncing( + xlog_in_core_t *iclog, + int aborted) +{ + xlog_t *log = iclog->ic_log; + + spin_lock(&log->l_icloglock); + + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_IOERROR); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); + + + /* + * If we got an error, either on the first buffer, or in the case of + * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, + * and none should ever be attempted to be written to disk + * again. + */ + if (iclog->ic_state != XLOG_STATE_IOERROR) { + if (--iclog->ic_bwritecnt == 1) { + spin_unlock(&log->l_icloglock); + return; + } + iclog->ic_state = XLOG_STATE_DONE_SYNC; + } + + /* + * Someone could be sleeping prior to writing out the next + * iclog buffer, we wake them all, one will get to do the + * I/O, the others get to wait for the result. + */ + wake_up_all(&iclog->ic_write_wait); + spin_unlock(&log->l_icloglock); + xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ +} /* xlog_state_done_syncing */ + + +/* + * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must + * sleep. We wait on the flush queue on the head iclog as that should be + * the first iclog to complete flushing. Hence if all iclogs are syncing, + * we will wait here and all new writes will sleep until a sync completes. + * + * The in-core logs are used in a circular fashion. They are not used + * out-of-order even when an iclog past the head is free. + * + * return: + * * log_offset where xlog_write() can start writing into the in-core + * log's data space. + * * in-core log pointer to which xlog_write() should write. + * * boolean indicating this is a continued write to an in-core log. + * If this is the last write, then the in-core log's offset field + * needs to be incremented, depending on the amount of data which + * is copied. + */ +STATIC int +xlog_state_get_iclog_space(xlog_t *log, + int len, + xlog_in_core_t **iclogp, + xlog_ticket_t *ticket, + int *continued_write, + int *logoffsetp) +{ + int log_offset; + xlog_rec_header_t *head; + xlog_in_core_t *iclog; + int error; + +restart: + spin_lock(&log->l_icloglock); + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + + iclog = log->l_iclog; + if (iclog->ic_state != XLOG_STATE_ACTIVE) { + XFS_STATS_INC(xs_log_noiclogs); + + /* Wait for log writes to have flushed */ + xlog_wait(&log->l_flush_wait, &log->l_icloglock); + goto restart; + } + + head = &iclog->ic_header; + + atomic_inc(&iclog->ic_refcnt); /* prevents sync */ + log_offset = iclog->ic_offset; + + /* On the 1st write to an iclog, figure out lsn. This works + * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are + * committing to. If the offset is set, that's how many blocks + * must be written. + */ + if (log_offset == 0) { + ticket->t_curr_res -= log->l_iclog_hsize; + xlog_tic_add_region(ticket, + log->l_iclog_hsize, + XLOG_REG_TYPE_LRHEADER); + head->h_cycle = cpu_to_be32(log->l_curr_cycle); + head->h_lsn = cpu_to_be64( + xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); + ASSERT(log->l_curr_block >= 0); + } + + /* If there is enough room to write everything, then do it. Otherwise, + * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC + * bit is on, so this will get flushed out. 
Don't update ic_offset + * until you know exactly how many bytes get copied. Therefore, wait + * until later to update ic_offset. + * + * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * can fit into remaining data section. + */ + if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + + /* + * If I'm the only one writing to this iclog, sync it to disk. + * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + if (error) + return error; + } else { + spin_unlock(&log->l_icloglock); + } + goto restart; + } + + /* Do we have enough room to write the full amount in the remainder + * of this iclog? Or must we continue a write on the next iclog and + * mark this iclog as completely taken? In the case where we switch + * iclogs (to mark it taken), this particular iclog will release/sync + * to disk in xlog_write(). + */ + if (len <= iclog->ic_size - iclog->ic_offset) { + *continued_write = 0; + iclog->ic_offset += len; + } else { + *continued_write = 1; + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + } + *iclogp = iclog; + + ASSERT(iclog->ic_offset <= iclog->ic_size); + spin_unlock(&log->l_icloglock); + + *logoffsetp = log_offset; + return 0; +} /* xlog_state_get_iclog_space */ + +/* + * Atomically get the log space required for a log ticket. + * + * Once a ticket gets put onto the reserveq, it will only return after + * the needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. + * + * As tickets are only ever moved on and off the reserveq under the + * l_grant_reserve_lock, we only need to take that lock if we are going + * to add the ticket to the queue and sleep. We can avoid taking the lock if the + * ticket was never added to the reserveq because the t_queue list head will be + * empty and we hold the only reference to it so it can safely be checked + * unlocked. + */ +STATIC int +xlog_grant_log_space(xlog_t *log, + xlog_ticket_t *tic) +{ + int free_bytes; + int need_bytes; + +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("grant Recovery problem"); +#endif + + trace_xfs_log_grant_enter(log, tic); + + need_bytes = tic->t_unit_res; + if (tic->t_flags & XFS_LOG_PERM_RESERV) + need_bytes *= tic->t_ocnt; + + /* something is already sleeping; insert new transaction at end */ + if (!list_empty_careful(&log->l_reserveq)) { + spin_lock(&log->l_grant_reserve_lock); + /* recheck the queue now we are locked */ + if (list_empty(&log->l_reserveq)) { + spin_unlock(&log->l_grant_reserve_lock); + goto redo; + } + list_add_tail(&tic->t_queue, &log->l_reserveq); + + trace_xfs_log_grant_sleep1(log, tic); + + /* + * Gotta check this before going to sleep, while we're + * holding the grant lock. + */ + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + XFS_STATS_INC(xs_sleep_logspace); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + + /* + * If we got an error, and the filesystem is shutting down, + * we'll catch it down below. 
So just continue... + */ + trace_xfs_log_grant_wake1(log, tic); + } + +redo: + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return_unlocked; + + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + if (free_bytes < need_bytes) { + spin_lock(&log->l_grant_reserve_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_reserveq); + + trace_xfs_log_grant_sleep2(log, tic); + + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + + trace_xfs_log_grant_wake2(log, tic); + goto redo; + } + + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_reserve_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); + } + + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); + trace_xfs_log_grant_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +error_return_unlocked: + spin_lock(&log->l_grant_reserve_lock); +error_return: + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); + trace_xfs_log_grant_error(log, tic); + + /* + * If we are failing, make sure the ticket doesn't have any + * current reservations. We don't want to add this back when + * the ticket/transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return XFS_ERROR(EIO); +} /* xlog_grant_log_space */ + + +/* + * Replenish the byte reservation required by moving the grant write head. + * + * Similar to xlog_grant_log_space, the function is structured to have a lock + * free fast path. + */ +STATIC int +xlog_regrant_write_log_space(xlog_t *log, + xlog_ticket_t *tic) +{ + int free_bytes, need_bytes; + + tic->t_curr_res = tic->t_unit_res; + xlog_tic_reset_res(tic); + + if (tic->t_cnt > 0) + return 0; + +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("regrant Recovery problem"); +#endif + + trace_xfs_log_regrant_write_enter(log, tic); + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return_unlocked; + + /* If there are other waiters on the queue then give them a + * chance at logspace before us. Wake up the first waiters, + * if we do not wake up all the waiters then go to sleep waiting + * for more free space, otherwise try to get some space for + * this transaction. 
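+ * + * (Illustrative: with 20k of write grant space free and three queued + * tickets that each need 8k, only the first two are woken; because the + * walk stopped before the whole queue was satisfied, this ticket is + * added to the tail of l_writeq and sleeps as well.)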
+ */ + need_bytes = tic->t_unit_res; + if (!list_empty_careful(&log->l_writeq)) { + struct xlog_ticket *ntic; + + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(ntic, &log->l_writeq, t_queue) { + ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); + + if (free_bytes < ntic->t_unit_res) + break; + free_bytes -= ntic->t_unit_res; + wake_up(&ntic->t_wait); + } + + if (ntic != list_first_entry(&log->l_writeq, + struct xlog_ticket, t_queue)) { + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); + trace_xfs_log_regrant_write_sleep1(log, tic); + + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + trace_xfs_log_regrant_write_wake1(log, tic); + } else + spin_unlock(&log->l_grant_write_lock); + } + +redo: + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return_unlocked; + + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + if (free_bytes < need_bytes) { + spin_lock(&log->l_grant_write_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); + + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_regrant_write_sleep2(log, tic); + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + + trace_xfs_log_regrant_write_wake2(log, tic); + goto redo; + } + + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_write_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); + } + + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); + trace_xfs_log_regrant_write_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + + + error_return_unlocked: + spin_lock(&log->l_grant_write_lock); + error_return: + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); + trace_xfs_log_regrant_write_error(log, tic); + + /* + * If we are failing, make sure the ticket doesn't have any + * current reservations. We don't want to add this back when + * the ticket/transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return XFS_ERROR(EIO); +} /* xlog_regrant_write_log_space */ + + +/* The first cnt-1 times through here we don't need to + * move the grant write head because the permanent + * reservation has reserved cnt times the unit amount. + * Release part of current permanent unit reservation and + * reset current reservation to be one units worth. Also + * move grant reservation head forward. + */ +STATIC void +xlog_regrant_reserve_log_space(xlog_t *log, + xlog_ticket_t *ticket) +{ + trace_xfs_log_regrant_reserve_enter(log, ticket); + + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + xlog_grant_sub_space(log, &log->l_grant_reserve_head, + ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_grant_write_head, + ticket->t_curr_res); + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + + trace_xfs_log_regrant_reserve_sub(log, ticket); + + /* just return if we still have some of the pre-reserved space */ + if (ticket->t_cnt > 0) + return; + + xlog_grant_add_space(log, &log->l_grant_reserve_head, + ticket->t_unit_res); + + trace_xfs_log_regrant_reserve_exit(log, ticket); + + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); +} /* xlog_regrant_reserve_log_space */ + + +/* + * Give back the space left from a reservation. 
+ * + * All the information we need to make a correct determination of space left + * is present. For non-permanent reservations, things are quite easy. The + * count should have been decremented to zero. We only need to deal with the + * space remaining in the current reservation part of the ticket. If the + * ticket contains a permanent reservation, there may be left over space which + * needs to be released. A count of N means that N-1 refills of the current + * reservation can be done before we need to ask for more space. The first + * one goes to fill up the first current reservation. Once we run out of + * space, the count will stay at zero and the only space remaining will be + * in the current reservation field. + */ +STATIC void +xlog_ungrant_log_space(xlog_t *log, + xlog_ticket_t *ticket) +{ + int bytes; + + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + trace_xfs_log_ungrant_enter(log, ticket); + trace_xfs_log_ungrant_sub(log, ticket); + + /* + * If this is a permanent reservation ticket, we may be able to free + * up more space based on the remaining count. + */ + bytes = ticket->t_curr_res; + if (ticket->t_cnt > 0) { + ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); + bytes += ticket->t_unit_res*ticket->t_cnt; + } + + xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); + xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); + + trace_xfs_log_ungrant_exit(log, ticket); + + xfs_log_move_tail(log->l_mp, 1); +} /* xlog_ungrant_log_space */ + + +/* + * Flush iclog to disk if this is the last reference to the given iclog and + * the WANT_SYNC bit is set. + * + * When this function is entered, the iclog is not necessarily in the + * WANT_SYNC state. It may be sitting around waiting to get filled. + * + * + */ +STATIC int +xlog_state_release_iclog( + xlog_t *log, + xlog_in_core_t *iclog) +{ + int sync = 0; /* do we sync? */ + + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); + + ASSERT(atomic_read(&iclog->ic_refcnt) > 0); + if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) + return 0; + + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_WANT_SYNC); + + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); + sync++; + iclog->ic_state = XLOG_STATE_SYNCING; + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); + /* cycle incremented when incrementing curr_block */ + } + spin_unlock(&log->l_icloglock); + + /* + * We let the log lock go, so it's possible that we hit a log I/O + * error or some other SHUTDOWN condition that marks the iclog + * as XLOG_STATE_IOERROR before the bwrite. However, we know that + * this iclog has consistent data, so we ignore IOERROR + * flags after this point. + */ + if (sync) + return xlog_sync(log, iclog); + return 0; +} /* xlog_state_release_iclog */ + + +/* + * This routine will mark the current iclog in the ring as WANT_SYNC + * and move the current iclog pointer to the next iclog in the ring. + * When this routine is called from xlog_state_get_iclog_space(), the + * exact size of the iclog has not yet been determined. All we know is + * that we have run out of space in this log record.
+ */ +STATIC void +xlog_state_switch_iclogs(xlog_t *log, + xlog_in_core_t *iclog, + int eventual_size) +{ + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + if (!eventual_size) + eventual_size = iclog->ic_offset; + iclog->ic_state = XLOG_STATE_WANT_SYNC; + iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); + log->l_prev_block = log->l_curr_block; + log->l_prev_cycle = log->l_curr_cycle; + + /* roll log?: ic_offset changed later */ + log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); + + /* Round up to next log-sunit */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1) { + __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); + log->l_curr_block = roundup(log->l_curr_block, sunit_bb); + } + + if (log->l_curr_block >= log->l_logBBsize) { + log->l_curr_cycle++; + if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) + log->l_curr_cycle++; + log->l_curr_block -= log->l_logBBsize; + ASSERT(log->l_curr_block >= 0); + } + ASSERT(iclog == log->l_iclog); + log->l_iclog = iclog->ic_next; +} /* xlog_state_switch_iclogs */ + +/* + * Write out all data in the in-core log as of this exact moment in time. + * + * Data may be written to the in-core log during this call. However, + * we don't guarantee this data will be written out. A change from past + * implementation means this routine will *not* write out zero length LRs. + * + * Basically, we try and perform an intelligent scan of the in-core logs. + * If we determine there is no flushable data, we just return. There is no + * flushable data if: + * + * 1. the current iclog is active and has no data; the previous iclog + * is in the active or dirty state. + * 2. the current iclog is drity, and the previous iclog is in the + * active or dirty state. + * + * We may sleep if: + * + * 1. the current iclog is not in the active nor dirty state. + * 2. the current iclog dirty, and the previous iclog is not in the + * active nor dirty state. + * 3. the current iclog is active, and there is another thread writing + * to this particular iclog. + * 4. a) the current iclog is active and has no other writers + * b) when we return from flushing out this iclog, it is still + * not in the active nor dirty state. + */ +int +_xfs_log_force( + struct xfs_mount *mp, + uint flags, + int *log_flushed) +{ + struct log *log = mp->m_log; + struct xlog_in_core *iclog; + xfs_lsn_t lsn; + + XFS_STATS_INC(xs_log_force); + + if (log->l_cilp) + xlog_cil_force(log); + + spin_lock(&log->l_icloglock); + + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + + /* If the head iclog is not active nor dirty, we just attach + * ourselves to the head and go to sleep. + */ + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) { + /* + * If the head is dirty or (active and empty), then + * we need to look at the previous iclog. If the previous + * iclog is active or dirty we are done. There is nothing + * to sync out. Otherwise, we attach ourselves to the + * previous iclog and go to sleep. + */ + if (iclog->ic_state == XLOG_STATE_DIRTY || + (atomic_read(&iclog->ic_refcnt) == 0 + && iclog->ic_offset == 0)) { + iclog = iclog->ic_prev; + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) + goto no_sleep; + else + goto maybe_sleep; + } else { + if (atomic_read(&iclog->ic_refcnt) == 0) { + /* We are the only one with access to this + * iclog. Flush it out now. 
There should + * be a roundoff of zero to show that someone + * has already taken care of the roundoff from + * the previous sync. + */ + atomic_inc(&iclog->ic_refcnt); + lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + + if (xlog_state_release_iclog(log, iclog)) + return XFS_ERROR(EIO); + + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && + iclog->ic_state != XLOG_STATE_DIRTY) + goto maybe_sleep; + else + goto no_sleep; + } else { + /* Someone else is writing to this iclog. + * Use its call to flush out the data. However, + * the other thread may not force out this LR, + * so we mark it WANT_SYNC. + */ + xlog_state_switch_iclogs(log, iclog, 0); + goto maybe_sleep; + } + } + } + + /* By the time we come around again, the iclog could've been filled + * which would give it another lsn. If we have a new lsn, just + * return because the relevant data has been flushed. + */ +maybe_sleep: + if (flags & XFS_LOG_SYNC) { + /* + * We must check if we're shutting down here, before + * we wait, while we're holding the l_icloglock. + * Then we check again after waking up, in case our + * sleep was disturbed by a bad news. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + XFS_STATS_INC(xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); + if (log_flushed) + *log_flushed = 1; + } else { + +no_sleep: + spin_unlock(&log->l_icloglock); + } + return 0; +} + +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + uint flags) +{ + int error; + + error = _xfs_log_force(mp, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} + +/* + * Force the in-core log to disk for a specific LSN. + * + * Find in-core log with lsn. + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on a the sv attached to the + * specific in-core log. When given in-core log finally completes its + * write to disk, that thread will wake up all threads waiting on the + * sv. 
+ */ +int +_xfs_log_force_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) +{ + struct log *log = mp->m_log; + struct xlog_in_core *iclog; + int already_slept = 0; + + ASSERT(lsn != 0); + + XFS_STATS_INC(xs_log_force); + + if (log->l_cilp) { + lsn = xlog_cil_force_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + } + +try_again: + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + + do { + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + continue; + } + + if (iclog->ic_state == XLOG_STATE_DIRTY) { + spin_unlock(&log->l_icloglock); + return 0; + } + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. + * this is the first time we've looked at the correct + * iclog buf) and the buffer before us is going to + * be sync'ed. The reason for this is that if we + * are doing sync transactions here, by waiting for + * the previous I/O to complete, we can allow a few + * more transactions into this iclog before we close + * it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump + * up the refcnt so we can release the log (which + * drops the ref count). The state switch keeps new + * transaction commits from using this buffer. When + * the current commits finish writing into the buffer, + * the refcount will drop to zero and the buffer will + * go out then. + */ + if (!already_slept && + (iclog->ic_prev->ic_state & + (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + + XFS_STATS_INC(xs_log_force_sleep); + + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); + if (log_flushed) + *log_flushed = 1; + already_slept = 1; + goto try_again; + } + atomic_inc(&iclog->ic_refcnt); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + if (xlog_state_release_iclog(log, iclog)) + return XFS_ERROR(EIO); + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + } + + if ((flags & XFS_LOG_SYNC) && /* sleep */ + !(iclog->ic_state & + (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + /* + * Don't wait on completion if we know that we've + * gotten a log write error. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + XFS_STATS_INC(xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); + + if (log_flushed) + *log_flushed = 1; + } else { /* just return */ + spin_unlock(&log->l_icloglock); + } + + return 0; + } while (iclog != log->l_iclog); + + spin_unlock(&log->l_icloglock); + return 0; +} + +/* + * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force_lsn( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + + error = _xfs_log_force_lsn(mp, lsn, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} + +/* + * Called when we want to mark the current iclog as being ready to sync to + * disk. 
+ */ +STATIC void +xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) +{ + assert_spin_locked(&log->l_icloglock); + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + xlog_state_switch_iclogs(log, iclog, 0); + } else { + ASSERT(iclog->ic_state & + (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); + } +} + + +/***************************************************************************** + * + * TICKET functions + * + ***************************************************************************** + */ + +/* + * Free a used ticket when its refcount falls to zero. + */ +void +xfs_log_ticket_put( + xlog_ticket_t *ticket) +{ + ASSERT(atomic_read(&ticket->t_ref) > 0); + if (atomic_dec_and_test(&ticket->t_ref)) + kmem_zone_free(xfs_log_ticket_zone, ticket); +} + +xlog_ticket_t * +xfs_log_ticket_get( + xlog_ticket_t *ticket) +{ + ASSERT(atomic_read(&ticket->t_ref) > 0); + atomic_inc(&ticket->t_ref); + return ticket; +} + +/* + * Allocate and initialise a new log ticket. + */ +xlog_ticket_t * +xlog_ticket_alloc( + struct log *log, + int unit_bytes, + int cnt, + char client, + uint xflags, + int alloc_flags) +{ + struct xlog_ticket *tic; + uint num_headers; + int iclog_space; + + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); + if (!tic) + return NULL; + + /* + * Permanent reservations have up to 'cnt'-1 active log operations + * in the log. A unit in this case is the amount of space for one + * of these log operations. Normal reservations have a cnt of 1 + * and their unit amount is the total amount of space required. + * + * The following lines of code account for non-transaction data + * which occupy space in the on-disk log. + * + * Normal form of a transaction is: + * ... + * and then there are LR hdrs, split-recs and roundoff at end of syncs. + * + * We need to account for all the leadup data and trailer data + * around the transaction data. + * And then we need to account for the worst case in terms of using + * more space. + * The worst case will happen if: + * - the placement of the transaction happens to be such that the + * roundoff is at its maximum + * - the transaction data is synced before the commit record is synced + * i.e. | + * Therefore the commit record is in its own Log Record. + * This can happen as the commit record is called with its + * own region to xlog_write(). + * This then means that in the worst case, roundoff can happen for + * the commit-rec as well. + * The commit-rec is smaller than padding in this scenario and so it is + * not added separately. + */ + + /* for trans header */ + unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(xfs_trans_header_t); + + /* for start-rec */ + unit_bytes += sizeof(xlog_op_header_t); + + /* + * for LR headers - the space for data in an iclog is the size minus + * the space used for the headers. If we use the iclog size, then we + * undercalculate the number of headers required. + * + * Furthermore - the addition of op headers for split-recs might + * increase the space required enough to require more log and op + * headers, so take that into account too. + * + * IMPORTANT: This reservation makes the assumption that if this + * transaction is the first in an iclog and hence has the LR headers + * accounted to it, then the remaining space in the iclog is + * exclusively for this transaction. i.e. if the transaction is larger + * than the iclog, it will be the only thing in that iclog. + * Fundamentally, this means we must pass the entire log vector to + * xlog_write to guarantee this. 
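A worked, user-space version of the overhead accounting the code below performs, using stand-in sizes (12-byte op headers, a 16-byte transaction header, 32 KB iclogs with 512-byte headers, v1 roundoff); the real code takes these from sizeof() on the on-disk structures and from the superblock.

#include <stdio.h>

#define OPHDR_SZ        12
#define TRANSHDR_SZ     16
#define ICLOG_SIZE      (32 * 1024)
#define ICLOG_HSIZE     512
#define BBSIZE          512

#define howmany(x, y)   (((x) + ((y) - 1)) / (y))

int main(void)
{
        int unit_bytes = 100 * 1024;            /* hypothetical transaction payload */
        int iclog_space = ICLOG_SIZE - ICLOG_HSIZE;
        int num_headers;

        unit_bytes += OPHDR_SZ + TRANSHDR_SZ;   /* transaction header + its op header */
        unit_bytes += OPHDR_SZ;                 /* start record */

        num_headers = howmany(unit_bytes, iclog_space);
        unit_bytes += OPHDR_SZ * num_headers;   /* op headers for split regions */

        /* adding op headers may spill us into yet another log record */
        while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) {
                unit_bytes += OPHDR_SZ;
                num_headers++;
        }
        unit_bytes += ICLOG_HSIZE * num_headers;        /* one LR header per record */
        unit_bytes += ICLOG_HSIZE;                      /* commit record's own LR header */
        unit_bytes += 2 * BBSIZE;                       /* worst-case roundoff, v1 logs */

        printf("reserve %d bytes for a %d byte transaction\n", unit_bytes, 100 * 1024);
        return 0;
}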
+ */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + num_headers = howmany(unit_bytes, iclog_space); + + /* for split-recs - ophdrs added when data split over LRs */ + unit_bytes += sizeof(xlog_op_header_t) * num_headers; + + /* add extra header reservations if we overrun */ + while (!num_headers || + howmany(unit_bytes, iclog_space) > num_headers) { + unit_bytes += sizeof(xlog_op_header_t); + num_headers++; + } + unit_bytes += log->l_iclog_hsize * num_headers; + + /* for commit-rec LR header - note: padding will subsume the ophdr */ + unit_bytes += log->l_iclog_hsize; + + /* for roundoff padding for transaction data and one for commit record */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1) { + /* log su roundoff */ + unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; + } else { + /* BB roundoff */ + unit_bytes += 2*BBSIZE; + } + + atomic_set(&tic->t_ref, 1); + INIT_LIST_HEAD(&tic->t_queue); + tic->t_unit_res = unit_bytes; + tic->t_curr_res = unit_bytes; + tic->t_cnt = cnt; + tic->t_ocnt = cnt; + tic->t_tid = random32(); + tic->t_clientid = client; + tic->t_flags = XLOG_TIC_INITED; + tic->t_trans_type = 0; + if (xflags & XFS_LOG_PERM_RESERV) + tic->t_flags |= XLOG_TIC_PERM_RESERV; + init_waitqueue_head(&tic->t_wait); + + xlog_tic_reset_res(tic); + + return tic; +} + + +/****************************************************************************** + * + * Log debug routines + * + ****************************************************************************** + */ +#if defined(DEBUG) +/* + * Make sure that the destination ptr is within the valid data region of + * one of the iclogs. This uses backup pointers stored in a different + * part of the log in case we trash the log structure. + */ +void +xlog_verify_dest_ptr( + struct log *log, + char *ptr) +{ + int i; + int good_ptr = 0; + + for (i = 0; i < log->l_iclog_bufs; i++) { + if (ptr >= log->l_iclog_bak[i] && + ptr <= log->l_iclog_bak[i] + log->l_iclog_size) + good_ptr++; + } + + if (!good_ptr) + xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); +} + +/* + * Check to make sure the grant write head didn't just over lap the tail. If + * the cycles are the same, we can't be overlapping. Otherwise, make sure that + * the cycles differ by exactly one and check the byte count. + * + * This check is run unlocked, so can give false positives. Rather than assert + * on failures, use a warn-once flag and a panic tag to allow the admin to + * determine if they want to panic the machine when such an error occurs. For + * debug kernels this will have the same effect as using an assert but, unlinke + * an assert, it can be turned off at runtime. 
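A toy version of that unlocked overlap check, with invented head and tail values; BBTOB() converts 512-byte basic blocks to bytes, and the warn-once flag handling is omitted.

#include <stdio.h>

#define BBTOB(bb)       ((bb) << 9)

int main(void)
{
        int head_cycle = 12, head_bytes = 110592;       /* grant write head */
        int tail_cycle = 11, tail_blocks = 200;         /* log tail */

        if (head_cycle != tail_cycle) {
                if (head_cycle - 1 != tail_cycle)
                        printf("warn: head and tail cycles differ by more than one\n");
                if (head_bytes > BBTOB(tail_blocks))
                        printf("warn: grant write head has overlapped the tail\n");
        }
        return 0;
}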
+ */ +STATIC void +xlog_verify_grant_tail( + struct log *log) +{ + int tail_cycle, tail_blocks; + int cycle, space; + + xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); + if (tail_cycle != cycle) { + if (cycle - 1 != tail_cycle && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: cycle - 1 != tail_cycle", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + + if (space > BBTOB(tail_blocks) && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: space > BBTOB(tail_blocks)", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + } +} + +/* check if it will fit */ +STATIC void +xlog_verify_tail_lsn(xlog_t *log, + xlog_in_core_t *iclog, + xfs_lsn_t tail_lsn) +{ + int blocks; + + if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { + blocks = + log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); + if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); + } else { + ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); + + if (BLOCK_LSN(tail_lsn) == log->l_prev_block) + xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); + + blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; + if (blocks < BTOBB(iclog->ic_offset) + 1) + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); + } +} /* xlog_verify_tail_lsn */ + +/* + * Perform a number of checks on the iclog before writing to disk. + * + * 1. Make sure the iclogs are still circular + * 2. Make sure we have a good magic number + * 3. Make sure we don't have magic numbers in the data + * 4. Check fields of each log operation header for: + * A. Valid client identifier + * B. tid ptr value falls in valid ptr space (user space code) + * C. Length in log record header is correct according to the + * individual operation headers within record. + * 5. When a bwrite will occur within 5 blocks of the front of the physical + * log, check the preceding blocks of the physical log to make sure all + * the cycle numbers agree with the current cycle number. 
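A user-space sketch of check 3 from the list above: scan each 512-byte block of a stand-in buffer for the log header magic number. The byte-order conversion the real check performs is omitted, and the buffer contents are fabricated.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define BBSIZE                  512
#define XLOG_HEADER_MAGIC_NUM   0xFEEDbabe

int main(void)
{
        uint8_t buf[4 * BBSIZE];
        uint32_t magic = XLOG_HEADER_MAGIC_NUM;
        size_t off;

        memset(buf, 0, sizeof(buf));
        memcpy(&buf[2 * BBSIZE], &magic, sizeof(magic));        /* plant a bad block */

        for (off = BBSIZE; off < sizeof(buf); off += BBSIZE) {
                uint32_t word;

                memcpy(&word, &buf[off], sizeof(word));
                if (word == XLOG_HEADER_MAGIC_NUM)
                        printf("unexpected magic num in block %zu\n", off / BBSIZE);
        }
        return 0;
}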
+ */ +STATIC void +xlog_verify_iclog(xlog_t *log, + xlog_in_core_t *iclog, + int count, + boolean_t syncing) +{ + xlog_op_header_t *ophead; + xlog_in_core_t *icptr; + xlog_in_core_2_t *xhdr; + xfs_caddr_t ptr; + xfs_caddr_t base_ptr; + __psint_t field_offset; + __uint8_t clientid; + int len, i, j, k, op_len; + int idx; + + /* check validity of iclog pointers */ + spin_lock(&log->l_icloglock); + icptr = log->l_iclog; + for (i=0; i < log->l_iclog_bufs; i++) { + if (icptr == NULL) + xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); + icptr = icptr->ic_next; + } + if (icptr != log->l_iclog) + xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); + spin_unlock(&log->l_icloglock); + + /* check log magic numbers */ + if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) + xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); + + ptr = (xfs_caddr_t) &iclog->ic_header; + for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; + ptr += BBSIZE) { + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + xfs_emerg(log->l_mp, "%s: unexpected magic num", + __func__); + } + + /* check fields */ + len = be32_to_cpu(iclog->ic_header.h_num_logops); + ptr = iclog->ic_datap; + base_ptr = ptr; + ophead = (xlog_op_header_t *)ptr; + xhdr = iclog->ic_data; + for (i = 0; i < len; i++) { + ophead = (xlog_op_header_t *)ptr; + + /* clientid is only 1 byte */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + if (syncing == B_FALSE || (field_offset & 0x1ff)) { + clientid = ophead->oh_clientid; + } else { + idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + clientid = xlog_get_client_id( + xhdr[j].hic_xheader.xh_cycle_data[k]); + } else { + clientid = xlog_get_client_id( + iclog->ic_header.h_cycle_data[idx]); + } + } + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + xfs_warn(log->l_mp, + "%s: invalid clientid %d op 0x%p offset 0x%lx", + __func__, clientid, ophead, + (unsigned long)field_offset); + + /* check length */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + if (syncing == B_FALSE || (field_offset & 0x1ff)) { + op_len = be32_to_cpu(ophead->oh_len); + } else { + idx = BTOBBT((__psint_t)&ophead->oh_len - + (__psint_t)iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); + } else { + op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); + } + } + ptr += sizeof(xlog_op_header_t) + op_len; + } +} /* xlog_verify_iclog */ +#endif + +/* + * Mark all iclogs IOERROR. l_icloglock is held by the caller. + */ +STATIC int +xlog_state_ioerror( + xlog_t *log) +{ + xlog_in_core_t *iclog, *ic; + + iclog = log->l_iclog; + if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Mark all the incore logs IOERROR. + * From now on, no log flushes will result. + */ + ic = iclog; + do { + ic->ic_state = XLOG_STATE_IOERROR; + ic = ic->ic_next; + } while (ic != iclog); + return 0; + } + /* + * Return non-zero, if state transition has already happened. + */ + return 1; +} + +/* + * This is called from xfs_force_shutdown, when we're forcibly + * shutting down the filesystem, typically because of an IO error. + * Our main objectives here are to make sure that: + * a. 
the filesystem gets marked 'SHUTDOWN' for all interested + * parties to find out, 'atomically'. + * b. those who're sleeping on log reservations, pinned objects and + * other resources get woken up, and be told the bad news. + * c. nothing new gets queued up after (a) and (b) are done. + * d. if !logerror, flush the iclogs to disk, then seal them off + * for business. + * + * Note: for delayed logging the !logerror case needs to flush the regions + * held in memory out to the iclogs before flushing them to disk. This needs + * to be done before the log is marked as shutdown, otherwise the flush to the + * iclogs will fail. + */ +int +xfs_log_force_umount( + struct xfs_mount *mp, + int logerror) +{ + xlog_ticket_t *tic; + xlog_t *log; + int retval; + + log = mp->m_log; + + /* + * If this happens during log recovery, don't worry about + * locking; the log isn't open for business yet. + */ + if (!log || + log->l_flags & XLOG_ACTIVE_RECOVERY) { + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + return 0; + } + + /* + * Somebody could've already done the hard work for us. + * No need to get locks for this. + */ + if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { + ASSERT(XLOG_FORCED_SHUTDOWN(log)); + return 1; + } + retval = 0; + + /* + * Flush the in memory commit item list before marking the log as + * being shut down. We need to do it in this order to ensure all the + * completed transactions are flushed to disk with the xfs_log_force() + * call below. + */ + if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + xlog_cil_force(log); + + /* + * mark the filesystem and the as in a shutdown state and wake + * everybody up to tell them the bad news. + */ + spin_lock(&log->l_icloglock); + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + + /* + * This flag is sort of redundant because of the mount flag, but + * it's good to maintain the separation between the log and the rest + * of XFS. + */ + log->l_flags |= XLOG_IO_ERROR; + + /* + * If we hit a log error, we want to mark all the iclogs IOERROR + * while we're still holding the loglock. + */ + if (logerror) + retval = xlog_state_ioerror(log); + spin_unlock(&log->l_icloglock); + + /* + * We don't want anybody waiting for log reservations after this. That + * means we have to wake up everybody queued up on reserveq as well as + * writeq. In addition, we make sure in xlog_{re}grant_log_space that + * we don't enqueue anything once the SHUTDOWN flag is set, and this + * action is protected by the grant locks. + */ + spin_lock(&log->l_grant_reserve_lock); + list_for_each_entry(tic, &log->l_reserveq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_reserve_lock); + + spin_lock(&log->l_grant_write_lock); + list_for_each_entry(tic, &log->l_writeq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_write_lock); + + if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { + ASSERT(!logerror); + /* + * Force the incore logs to disk before shutting the + * log down completely. + */ + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + + spin_lock(&log->l_icloglock); + retval = xlog_state_ioerror(log); + spin_unlock(&log->l_icloglock); + } + /* + * Wake up everybody waiting on xfs_log_force. + * Callback all log item committed functions as if the + * log writes were completed. 
+ */ + xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); + +#ifdef XFSERRORDEBUG + { + xlog_in_core_t *iclog; + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + do { + ASSERT(iclog->ic_callback == 0); + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + spin_unlock(&log->l_icloglock); + } +#endif + /* return non-zero if log IOERROR transition had already happened */ + return retval; +} + +STATIC int +xlog_iclogs_empty(xlog_t *log) +{ + xlog_in_core_t *iclog; + + iclog = log->l_iclog; + do { + /* endianness does not matter here, zero is zero in + * any language. + */ + if (iclog->ic_header.h_num_logops) + return 0; + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + return 1; +} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h new file mode 100644 index 00000000..78c90399 --- /dev/null +++ b/fs/xfs/xfs_log.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_H__ +#define __XFS_LOG_H__ + +/* get lsn fields */ +#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) +#define BLOCK_LSN(lsn) ((uint)(lsn)) + +/* this is used in a spot where we might otherwise double-endian-flip */ +#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) + +#ifdef __KERNEL__ +/* + * By comparing each component, we don't have to worry about extra + * endian issues in treating two 32 bit numbers as one 64 bit number + */ +static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) +{ + if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) + return (CYCLE_LSN(lsn1)l_cilp is null so + * we can check this conditional to determine if we are doing delayed + * logging or not. + */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + log->l_cilp = NULL; + if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) + return 0; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_commit_wait); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (!log->l_cilp) + return; + + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + +/* + * Allocate a new ticket. Failing to get a new ticket makes it really hard to + * recover, so we don't allow failure here. 
Also, we allocate in a context that + * we don't want to be issuing transactions from, so we need to tell the + * allocation code this as well. + * + * We don't reserve any space for the ticket - we are going to steal whatever + * space we require from transactions as they commit. To ensure we reserve all + * the space required, we need to set the current reservation of the ticket to + * zero so that we know to steal the initial transaction overhead from the + * first transaction commit. + */ +static struct xlog_ticket * +xlog_cil_ticket_alloc( + struct log *log) +{ + struct xlog_ticket *tic; + + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, + KM_SLEEP|KM_NOFS); + tic->t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. + */ +void +xlog_cil_init_post_recovery( + struct log *log) +{ + if (!log->l_cilp) + return; + + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; + log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, + log->l_curr_block); +} + +/* + * Format log item into a flat buffers + * + * For delayed logging, we need to hold a formatted buffer containing all the + * changes on the log item. This enables us to relog the item in memory and + * write it out asynchronously without needing to relock the object that was + * modified at the time it gets written into the iclog. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and formats the vector for the item into the buffer. + * The buffer is then attached to the log item are then inserted into the + * Committed Item List for tracking until the next checkpoint is written out. + * + * We don't set up region headers during this process; we simply copy the + * regions into the flat buffer. We can do this because we still have to do a + * formatting step to write the regions into the iclog buffer. Writing the + * ophdrs during the iclog write means that we can support splitting large + * regions across iclog boundares without needing a change in the format of the + * item/region encapsulation. + * + * Hence what we need to do now is change the rewrite the vector array to point + * to the copied region inside the buffer we just allocated. This allows us to + * format the regions into the iclog as though they are being formatted + * directly out of the objects themselves. 
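The flattening step described above can be sketched in user space as copying every region into a single allocation and re-pointing the iovec array at the copies; the types and region contents below are invented for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct region { void *addr; int len; };

int main(void)
{
        char a[] = "inode core", b[] = "data fork";
        struct region vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        int i, len = 0;
        char *buf, *ptr;

        for (i = 0; i < 2; i++)                 /* total flat buffer length */
                len += vec[i].len;

        buf = malloc(len);
        if (!buf)
                return 1;

        ptr = buf;
        for (i = 0; i < 2; i++) {               /* copy, then re-point each region */
                memcpy(ptr, vec[i].addr, vec[i].len);
                vec[i].addr = ptr;
                ptr += vec[i].len;
        }

        printf("flattened %d bytes; region 1 now at offset %td\n",
               len, (char *)vec[1].addr - buf);
        free(buf);
        return 0;
}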
+ */ +static void +xlog_cil_format_items( + struct log *log, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + ASSERT(log_vector); + for (lv = log_vector; lv; lv = lv->lv_next) { + void *ptr; + int index; + int len = 0; + + /* build the vector array and calculate it's length */ + IOP_FORMAT(lv->lv_item, lv->lv_iovecp); + for (index = 0; index < lv->lv_niovecs; index++) + len += lv->lv_iovecp[index].i_len; + + lv->lv_buf_len = len; + lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); + ptr = lv->lv_buf; + + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + + memcpy(ptr, vec->i_addr, vec->i_len); + vec->i_addr = ptr; + ptr += vec->i_len; + } + ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + } +} + +/* + * Prepare the log item for insertion into the CIL. Calculate the difference in + * log space and vectors it will consume, and if it is a new item pin it as + * well. + */ +STATIC void +xfs_cil_prepare_item( + struct log *log, + struct xfs_log_vec *lv, + int *len, + int *diff_iovecs) +{ + struct xfs_log_vec *old = lv->lv_item->li_lv; + + if (old) { + /* existing lv on log item, space used is a delta */ + ASSERT(!list_empty(&lv->lv_item->li_cil)); + ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + + *len += lv->lv_buf_len - old->lv_buf_len; + *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; + kmem_free(old->lv_buf); + kmem_free(old); + } else { + /* new lv, must pin the log item */ + ASSERT(!lv->lv_item->li_lv); + ASSERT(list_empty(&lv->lv_item->li_cil)); + + *len += lv->lv_buf_len; + *diff_iovecs += lv->lv_niovecs; + IOP_PIN(lv->lv_item); + + } + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + /* + * If this is the first time the item is being committed to the + * CIL, store the sequence number on the log item so we can + * tell in future commits whether this is the first checkpoint + * the item is being committed into. + */ + if (!lv->lv_item->li_seq) + lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; +} + +/* + * Insert the log items into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we addded to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + */ +static void +xlog_cil_insert_items( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + struct xfs_log_vec *lv; + int len = 0; + int diff_iovecs = 0; + int iclog_space; + + ASSERT(log_vector); + + /* + * Do all the accounting aggregation and switching of log vectors + * around in a separate loop to the insertion of items into the CIL. + * Then we can do a separate loop to update the CIL within a single + * lock/unlock pair. This reduces the number of round trips on the CIL + * lock from O(nr_logvectors) to O(1) and greatly reduces the overall + * hold time for the transaction commit. + * + * If this is the first time the item is being placed into the CIL in + * this context, pin it so it can't be written to disk until the CIL is + * flushed to the iclog and the iclog written to disk. + * + * We can do this safely because the context can't checkpoint until we + * are done so it doesn't matter exactly how we update the CIL. 
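A back-of-the-envelope version of the "do we need more log record headers?" test made below: when a commit pushes the checkpoint across an iclog boundary, one LR header plus one op header is charged per record crossed. The sizes are stand-ins for the real iclog geometry.

#include <stdio.h>

#define ICLOG_SIZE      (32 * 1024)
#define ICLOG_HSIZE     512
#define OPHDR_SZ        12

int main(void)
{
        int iclog_space = ICLOG_SIZE - ICLOG_HSIZE;
        int space_used = 30000;         /* bytes already in this checkpoint */
        int len = 5000;                 /* bytes this commit adds */
        int hdrs = 0;

        if (len > 0 &&
            space_used / iclog_space != (space_used + len) / iclog_space) {
                hdrs = (len + iclog_space - 1) / iclog_space;
                hdrs *= ICLOG_HSIZE + OPHDR_SZ;
        }
        printf("charge %d extra header bytes to the checkpoint ticket\n", hdrs);
        return 0;
}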
+ */ + for (lv = log_vector; lv; lv = lv->lv_next) + xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); + + /* account for space used by new iovec headers */ + len += diff_iovecs * sizeof(xlog_op_header_t); + + spin_lock(&cil->xc_cil_lock); + + /* move the items to the tail of the CIL */ + for (lv = log_vector; lv; lv = lv->lv_next) + list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); + + ctx->nvecs += diff_iovecs; + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + /* first commit in checkpoint, steal the header reservation */ + ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + ticket->t_curr_res -= hdrs; + ASSERT(ticket->t_curr_res >= len); + } + ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); +} + +static void +xlog_cil_free_logvec( + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + kmem_free(lv->lv_buf); + kmem_free(lv); + lv = next; + } +} + +/* + * Mark all items committed and clear busy extents. We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + ctx->start_lsn, abort); + + xfs_alloc_busy_sort(&ctx->busy_extents); + xfs_alloc_busy_clear(mp, &ctx->busy_extents, + (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + + spin_lock(&ctx->cil->xc_cil_lock); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_cil_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + + if (!list_empty(&ctx->busy_extents)) { + ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + + xfs_discard_extents(mp, &ctx->busy_extents); + xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); + } + + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If @push_seq flag is zero, then it + * is a background flush and so we can chose to ignore it. Otherwise, if the + * current sequence is the same as @push_seq we need to do a flush. If + * @push_seq is less than the current sequence, then it has already been + * flushed and we don't need to do anything - the caller will wait for it to + * complete if necessary. + * + * @push_seq is a value rather than a flag because that allows us to do an + * unlocked check of the sequence number for a match. Hence we can allows log + * forces to run racily and not issue pushes for the same sequence twice. 
If we + * get a race between multiple pushes for the same sequence they will block on + * the first one and then abort, hence avoiding needless pushes. + */ +STATIC int +xlog_cil_push( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_lv; + int num_iovecs; + int len; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + + if (!cil) + return 0; + + ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + /* + * Lock out transaction commit, but don't block for background pushes + * unless we are well over the CIL space limit. See the definition of + * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic + * used here. + */ + if (!down_write_trylock(&cil->xc_ctx_lock)) { + if (!push_seq && + cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) + goto out_free_ticket; + down_write(&cil->xc_ctx_lock); + } + ctx = cil->xc_ctx; + + /* check if we've anything to push */ + if (list_empty(&cil->xc_cil)) + goto out_skip; + + /* check for spurious background flush */ + if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* check for a previously pushed seqeunce */ + if (push_seq && push_seq < cil->xc_ctx->sequence) + goto out_skip; + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_lv = 0; + num_iovecs = 0; + len = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + int i; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + + num_lv++; + num_iovecs += lv->lv_niovecs; + for (i = 0; i < lv->lv_niovecs; i++) + len += lv->lv_iovecp[i].i_len; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing lsit so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * mirror the new sequence into the cil structure so that we can do + * unlocked checks against the current sequence in log forces without + * risking deferencing a freed context pointer. + */ + cil->xc_current_sequence = new_ctx->sequence; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order. + * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. 
due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + */ + spin_lock(&cil->xc_cil_lock); + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). + * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head. + */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = &thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort_free_ticket; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for own own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + goto restart; + } + } + spin_unlock(&cil->xc_cil_lock); + + /* xfs_log_done always frees the ticket on error. */ + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_cil_lock); + ctx->commit_lsn = commit_lsn; + wake_up_all(&cil->xc_commit_wait); + spin_unlock(&cil->xc_cil_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); +out_free_ticket: + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort_free_ticket: + xfs_log_ticket_put(tic); +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return XFS_ERROR(EIO); +} + +/* + * Commit a transaction with the given vector to the Committed Item List. 
+ * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +void +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + /* + * do all the hard work of formatting items (including memory + * allocation) outside the CIL context lock. This prevents stalling CIL + * pushes when we are low on memory and a transaction commit spends a + * lot of time in memory reclaim. + */ + xlog_cil_format_items(log, log_vector); + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + if (commit_lsn) + *commit_lsn = log->l_cilp->xc_ctx->sequence; + + xlog_cil_insert_items(log, log_vector, tp->t_ticket); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, *commit_lsn, 0); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. 
+ * + * XXX: Initially, just push the CIL unconditionally and return whatever + * commit lsn is there. It'll be empty, so this is broken for now. + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct log *log, + xfs_lsn_t sequence) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + + ASSERT(sequence <= cil->xc_current_sequence); + + /* + * check to see if we need to force out the current context. + * xlog_cil_push() handles racing pushes for the same sequence, + * so no need to deal with it here. + */ + if (sequence == cil->xc_current_sequence) + xlog_cil_push(log, sequence); + + /* + * See if we can find a previous sequence still committing. + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ +restart: + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + if (ctx->sequence > sequence) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + goto restart; + } + if (ctx->sequence != sequence) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + spin_unlock(&cil->xc_cil_lock); + return commit_lsn; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) + return false; + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h new file mode 100644 index 00000000..2d3b6a49 --- /dev/null +++ b/fs/xfs/xfs_log_priv.h @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_PRIV_H__ +#define __XFS_LOG_PRIV_H__ + +struct xfs_buf; +struct log; +struct xlog_ticket; +struct xfs_mount; + +/* + * Macros, structures, prototypes for internal log manager use. 
+ */ + +#define XLOG_MIN_ICLOGS 2 +#define XLOG_MAX_ICLOGS 8 +#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ +#define XLOG_VERSION_1 1 +#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ +#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) +#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ +#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ +#define XLOG_MAX_RECORD_BSIZE (256*1024) +#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ +#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ +#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ +#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ + (log)->l_mp->m_sb.sb_logsunit) +#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) + +#define XLOG_HEADER_SIZE 512 + +#define XLOG_REC_SHIFT(log) \ + BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) +#define XLOG_TOTAL_REC_SHIFT(log) \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) + +static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) +{ + return ((xfs_lsn_t)cycle << 32) | block; +} + +static inline uint xlog_get_cycle(char *ptr) +{ + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + return be32_to_cpu(*((__be32 *)ptr + 1)); + else + return be32_to_cpu(*(__be32 *)ptr); +} + +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) + +#ifdef __KERNEL__ + +/* + * get client id from packed copy. + * + * this hack is here because the xlog_pack code copies four bytes + * of xlog_op_header containing the fields oh_clientid, oh_flags + * and oh_res2 into the packed copy. + * + * later on this four byte chunk is treated as an int and the + * client id is pulled out. + * + * this has endian issues, of course. + */ +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} + +/* + * In core log state + */ +#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ +#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ +#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ +#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ +#define XLOG_STATE_DO_CALLBACK \ + 0x0010 /* Process callback functions */ +#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ +#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ +#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ +#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ +#endif /* __KERNEL__ */ + +/* + * Flags to log operation header + * + * The first write of a new transaction will be preceded with a start + * record, XLOG_START_TRANS. Once a transaction is committed, a commit + * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into + * the remainder of the current active in-core log, it is split up into + * multiple regions. Each partial region will be marked with a + * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. 
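A quick user-space demonstration of the bit-packing helpers defined earlier in this header, xlog_assign_lsn() and xlog_get_client_id(): the cycle lives in the high 32 bits of an LSN and the block in the low 32, and the client id sits in the top byte of the packed op-header word. Endianness handling is omitted and the client id value here is made up.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static uint64_t assign_lsn(uint32_t cycle, uint32_t block)
{
        return ((uint64_t)cycle << 32) | block;         /* same shape as xlog_assign_lsn() */
}

int main(void)
{
        uint64_t lsn = assign_lsn(7, 4242);
        uint32_t packed_ophdr = 0x69000000u;            /* hypothetical client id 0x69 in the top byte */

        printf("lsn=0x%" PRIx64 " cycle=%" PRIu32 " block=%" PRIu32 " clientid=0x%" PRIx32 "\n",
               lsn, (uint32_t)(lsn >> 32), (uint32_t)lsn, packed_ophdr >> 24);
        return 0;
}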
+ * + */ +#define XLOG_START_TRANS 0x01 /* Start a new transaction */ +#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ +#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ +#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ +#define XLOG_END_TRANS 0x10 /* End a continued transaction */ +#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ + +#ifdef __KERNEL__ +/* + * Flags to log ticket + */ +#define XLOG_TIC_INITED 0x1 /* has been initialized */ +#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ + +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } + +#endif /* __KERNEL__ */ + +#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ + +/* + * Flags for log structure + */ +#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ + +typedef __uint32_t xlog_tid_t; + +#ifdef __KERNEL__ +/* + * Below are states for covering allocation transactions. + * By covering, we mean changing the h_tail_lsn in the last on-disk + * log write such that no allocation transactions will be re-done during + * recovery after a system crash. Recovery starts at the last on-disk + * log write. + * + * These states are used to insert dummy log entries to cover + * space allocation transactions which can undo non-transactional changes + * after a crash. Writes to a file with space + * already allocated do not result in any transactions. Allocations + * might include space beyond the EOF. So if we just push the EOF a + * little, the last transaction for the file could contain the wrong + * size. If there is no file system activity, after an allocation + * transaction, and the system crashes, the allocation transaction + * will get replayed and the file will be truncated. This could + * be hours/days/... after the allocation occurred. + * + * The fix for this is to do two dummy transactions when the + * system is idle. We need two dummy transaction because the h_tail_lsn + * in the log record header needs to point beyond the last possible + * non-dummy transaction. The first dummy changes the h_tail_lsn to + * the first transaction before the dummy. The second dummy causes + * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. + * + * These dummy transactions get committed when everything + * is idle (after there has been some activity). + * + * There are 5 states used to control this. + * + * IDLE -- no logging has been done on the file system or + * we are done covering previous transactions. + * NEED -- logging has occurred and we need a dummy transaction + * when the log becomes idle. + * DONE -- we were in the NEED state and have committed a dummy + * transaction. + * NEED2 -- we detected that a dummy transaction has gone to the + * on disk log with no other transactions. + * DONE2 -- we committed a dummy transaction when in the NEED2 state. + * + * There are two places where we switch states: + * + * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. + * We commit the dummy transaction and switch to DONE or DONE2, + * respectively. In all other states, we don't do anything. + * + * 2.) When we finish writing the on-disk log (xlog_state_clean_log). 
+ * + * No matter what state we are in, if this isn't the dummy + * transaction going out, the next state is NEED. + * So, if we aren't in the DONE or DONE2 states, the next state + * is NEED. We can't be finishing a write of the dummy record + * unless it was committed and the state switched to DONE or DONE2. + * + * If we are in the DONE state and this was a write of the + * dummy transaction, we move to NEED2. + * + * If we are in the DONE2 state and this was a write of the + * dummy transaction, we move to IDLE. + * + * + * Writing only one dummy transaction can get appended to + * one file space allocation. When this happens, the log recovery + * code replays the space allocation and a file could be truncated. + * This is why we have the NEED2 and DONE2 states before going idle. + */ + +#define XLOG_STATE_COVER_IDLE 0 +#define XLOG_STATE_COVER_NEED 1 +#define XLOG_STATE_COVER_DONE 2 +#define XLOG_STATE_COVER_NEED2 3 +#define XLOG_STATE_COVER_DONE2 4 + +#define XLOG_COVER_OPS 5 + + +/* Ticket reservation region accounting */ +#define XLOG_TIC_LEN_MAX 15 + +/* + * Reservation region + * As would be stored in xfs_log_iovec but without the i_addr which + * we don't care about. + */ +typedef struct xlog_res { + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ +} xlog_res_t; + +typedef struct xlog_ticket { + wait_queue_head_t t_wait; /* ticket wait queue */ + struct list_head t_queue; /* reserve/write queue */ + xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ + int t_curr_res; /* current reservation in bytes : 4 */ + int t_unit_res; /* unit reservation in bytes : 4 */ + char t_ocnt; /* original count : 1 */ + char t_cnt; /* current count : 1 */ + char t_clientid; /* who does this belong to; : 1 */ + char t_flags; /* properties of reservation : 1 */ + uint t_trans_type; /* transaction type : 4 */ + + /* reservation array fields */ + uint t_res_num; /* num in array : 4 */ + uint t_res_num_ophdrs; /* num op hdrs : 4 */ + uint t_res_arr_sum; /* array sum : 4 */ + uint t_res_o_flow; /* sum overflow : 4 */ + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ +} xlog_ticket_t; + +#endif + + +typedef struct xlog_op_header { + __be32 oh_tid; /* transaction id of operation : 4 b */ + __be32 oh_len; /* bytes in data region : 4 b */ + __u8 oh_clientid; /* who sent me this : 1 b */ + __u8 oh_flags; /* : 1 b */ + __u16 oh_res2; /* 32 bit align : 2 b */ +} xlog_op_header_t; + + +/* valid values for h_fmt */ +#define XLOG_FMT_UNKNOWN 0 +#define XLOG_FMT_LINUX_LE 1 +#define XLOG_FMT_LINUX_BE 2 +#define XLOG_FMT_IRIX_BE 3 + +/* our fmt */ +#ifdef XFS_NATIVE_HOST +#define XLOG_FMT XLOG_FMT_LINUX_BE +#else +#define XLOG_FMT XLOG_FMT_LINUX_LE +#endif + +typedef struct xlog_rec_header { + __be32 h_magicno; /* log record (LR) identifier : 4 */ + __be32 h_cycle; /* write cycle of log : 4 */ + __be32 h_version; /* LR version : 4 */ + __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ + __be64 h_lsn; /* lsn of this LR : 8 */ + __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ + __be32 h_chksum; /* may not be used; non-zero if used : 4 */ + __be32 h_prev_block; /* block number to previous LR : 4 */ + __be32 h_num_logops; /* number of log operations in this LR : 4 */ + __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + /* new fields */ + __be32 h_fmt; /* format of log record : 4 */ + uuid_t h_fs_uuid; /* uuid of FS : 16 */ + __be32 h_size; /* iclog size : 4 */ +} 
xlog_rec_header_t; + +typedef struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log : 4 */ + __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ +} xlog_rec_ext_header_t; + +#ifdef __KERNEL__ + +/* + * Quite misnamed, because this union lays out the actual on-disk log buffer. + */ +typedef union xlog_in_core2 { + xlog_rec_header_t hic_header; + xlog_rec_ext_header_t hic_xheader; + char hic_sector[XLOG_HEADER_SIZE]; +} xlog_in_core_2_t; + +/* + * - A log record header is 512 bytes. There is plenty of room to grow the + * xlog_rec_header_t into the reserved space. + * - ic_data follows, so a write to disk can start at the beginning of + * the iclog. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. + * - ic_next is the pointer to the next iclog in the ring. + * - ic_bp is a pointer to the buffer used to write this incore log to disk. + * - ic_log is a pointer back to the global log structure. + * - ic_callback is a linked list of callback function/argument pairs to be + * called after an iclog finishes writing. + * - ic_size is the full size of the header plus data. + * - ic_offset is the current number of bytes written to in this iclog. + * - ic_refcnt is bumped when someone is writing to the log. + * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callback_* + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. + */ +typedef struct xlog_in_core { + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; + struct xlog_in_core *ic_next; + struct xlog_in_core *ic_prev; + struct xfs_buf *ic_bp; + struct log *ic_log; + int ic_size; + int ic_offset; + int ic_bwritecnt; + unsigned short ic_state; + char *ic_datap; /* pointer to iclog data */ + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + xfs_log_callback_t *ic_callback; + xfs_log_callback_t **ic_callback_tail; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header +} xlog_in_core_t; + +/* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + xfs_log_callback_t log_cb; /* completion callback hook. 
*/ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct log *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + struct xfs_cil_ctx *xc_ctx; + struct rw_semaphore xc_ctx_lock; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; +}; + +/* + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. 
+ * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. + */ +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) + +/* + * The reservation head lsn is not made up of a cycle number and block number. + * Instead, it uses a cycle number and byte number. Logs don't expect to + * overflow 31 bits worth of byte offset, so using a byte number will mean + * that round off problems won't occur when releasing partial reservations. + */ +typedef struct log { + /* The following fields don't need locking */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ + struct xfs_buf *l_xbuf; /* extra buffer for log + * wrapping */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct list_head *l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ + int l_curr_cycle; /* Cycle number of log writes */ + int l_prev_cycle; /* Cycle number before last + * block increment */ + int l_curr_block; /* current logical log block */ + int l_prev_block; /* previous logical log block */ + + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + /* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. 
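To put numbers on the two limits defined just above: XLOG_CIL_SPACE_LIMIT is one eighth of the log and XLOG_CIL_HARD_SPACE_LIMIT is three sixteenths, so a hypothetical 128 MB log would start background CIL pushes at roughly 16 MB of aggregated checkpoint space and enforce a push at roughly 24 MB. A minimal user-space sketch of that arithmetic:

#include <stdio.h>

int main(void)
{
        long l_logsize   = 128L * 1024 * 1024;      /* assume a 128 MB log */
        long background  = l_logsize >> 3;          /* XLOG_CIL_SPACE_LIMIT */
        long hard        = 3 * (l_logsize >> 4);    /* XLOG_CIL_HARD_SPACE_LIMIT */

        printf("background push at %ld MB, hard limit at %ld MB\n",
               background >> 20, hard >> 20);       /* 16 MB and 24 MB */
        return 0;
}
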
+ */ + spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp; + struct list_head l_reserveq; + atomic64_t l_grant_reserve_head; + + spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp; + struct list_head l_writeq; + atomic64_t l_grant_write_head; + + /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG + char *l_iclog_bak[XLOG_MAX_ICLOGS]; +#endif + +} xlog_t; + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) + +/* common routines */ +extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); +extern int xlog_recover(xlog_t *log); +extern int xlog_recover_finish(xlog_t *log); +extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); + +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, + int count, char client, uint xflags, + int alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int xlog_write(struct log *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, uint flags); + +/* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct log *log); +void xlog_cil_init_post_recovery(struct log *log); +void xlog_cil_destroy(struct log *log); + +/* + * CIL force routines + */ +xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct log *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} + +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. 
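xlog_write_adv_cnt() above keeps a (pointer, remaining length, offset) triple in lock step as regions are copied into an iclog. The stand-alone sketch below reproduces the same bookkeeping against a local buffer; the kernel version relies on the GCC extension of arithmetic on void pointers, which is spelled out explicitly here.

#include <stdio.h>
#include <stddef.h>

static void adv_cnt(void **ptr, int *len, int *off, size_t bytes)
{
        *ptr = (char *)*ptr + bytes;    /* void-pointer arithmetic made explicit */
        *len -= bytes;
        *off += bytes;
}

int main(void)
{
        char buf[64];
        void *p  = buf;
        int len  = sizeof(buf);
        int off  = 0;

        adv_cnt(&p, &len, &off, 16);    /* account for a 16-byte region copy */
        printf("remaining=%d offset=%d\n", len, off);   /* remaining=48 offset=16 */
        return 0;
}
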
+ */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} +#endif /* __KERNEL__ */ + +#endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c new file mode 100644 index 00000000..b75fd67c --- /dev/null +++ b/fs/xfs/xfs_log_recover.c @@ -0,0 +1,3843 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_log_recover.h" +#include "xfs_extfree_item.h" +#include "xfs_trans_priv.h" +#include "xfs_quota.h" +#include "xfs_rw.h" +#include "xfs_utils.h" +#include "xfs_trace.h" + +STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); +STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); +#if defined(DEBUG) +STATIC void xlog_recover_check_summary(xlog_t *); +#else +#define xlog_recover_check_summary(log) +#endif + +/* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +/* + * Sector aligned buffer routines for buffer create/read/write/access + */ + +/* + * Verify the given count of basic blocks is valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid, 0 otherwise. + */ + +static inline int +xlog_buf_bbcount_valid( + xlog_t *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} + +/* + * Allocate a buffer to hold log data. The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. 
+ */ +STATIC xfs_buf_t * +xlog_get_bp( + xlog_t *log, + int nbblks) +{ + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return NULL; + } + + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to accommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accommodate this possibility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + + return xfs_buf_get_uncached(log->l_mp->m_logdev_targp, + BBTOB(nbblks), 0); +} + +STATIC void +xlog_put_bp( + xfs_buf_t *bp) +{ + xfs_buf_free(bp); +} + +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. + */ +STATIC xfs_caddr_t +xlog_align( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); + + ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); + return XFS_BUF_PTR(bp) + BBTOB(offset); +} + + +/* + * nbblks should be uint, but oh well. Just want to catch that 32-bit length. + */ +STATIC int +xlog_bread_noalign( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + int error; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; + } + + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); + + ASSERT(nbblks > 0); + ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); + XFS_BUF_READ(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); + + xfsbdstrat(log->l_mp, bp); + error = xfs_buf_iowait(bp); + if (error) + xfs_ioerror_alert("xlog_bread", log->l_mp, + bp, XFS_BUF_ADDR(bp)); + return error; +} + +STATIC int +xlog_bread( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp, + xfs_caddr_t *offset) +{ + int error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + if (error) + return error; + + *offset = xlog_align(log, blk_no, nbblks, bp); + return 0; +} + +/* + * Read at an offset into the buffer. Returns with the buffer in it's original + * state regardless of the result of the read. 
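The sizing logic in xlog_get_bp() above pads any multi-block request by one extra log sector, to absorb a non-sector-aligned starting block, and then rounds the total up to whole sectors. The sketch below reproduces just that arithmetic in user space; round_up() here is a simple stand-in for the kernel helper, and the example assumes a 4 KB log sector (8 basic blocks).

#include <stdio.h>

/* simple stand-in for the kernel's round_up() */
#define round_up(x, y)  ((((x) + (y) - 1) / (y)) * (y))

static int bp_size_bblks(int nbblks, int sect_bbsize)
{
        if (nbblks > 1 && sect_bbsize > 1)
                nbblks += sect_bbsize;          /* room for an unaligned start */
        return round_up(nbblks, sect_bbsize);
}

int main(void)
{
        /* a 5-block read on a log with 4 KB sectors (8 basic blocks each) */
        printf("%d\n", bp_size_bblks(5, 8));    /* prints 16 */
        return 0;
}
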
+ */ +STATIC int +xlog_bread_offset( + xlog_t *log, + xfs_daddr_t blk_no, /* block to read from */ + int nbblks, /* blocks to read */ + xfs_buf_t *bp, + xfs_caddr_t offset) +{ + xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); + int orig_len = bp->b_buffer_length; + int error, error2; + + error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); + if (error) + return error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + + /* must reset buffer pointer even on error */ + error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); + if (error) + return error; + return error2; +} + +/* + * Write out the buffer at the given block for the given number of blocks. + * The buffer is kept locked across the write and is returned locked. + * This can only be used for synchronous log writes. + */ +STATIC int +xlog_bwrite( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + int error; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; + } + + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); + + ASSERT(nbblks > 0); + ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_HOLD(bp); + XFS_BUF_PSEMA(bp, PRIBIO); + XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); + + if ((error = xfs_bwrite(log->l_mp, bp))) + xfs_ioerror_alert("xlog_bwrite", log->l_mp, + bp, XFS_BUF_ADDR(bp)); + return error; +} + +#ifdef DEBUG +/* + * dump debug superblock and log record information + */ +STATIC void +xlog_header_check_dump( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n", + __func__, &mp->m_sb.sb_uuid, XLOG_FMT); + xfs_debug(mp, " log : uuid = %pU, fmt = %d\n", + &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); +} +#else +#define xlog_header_check_dump(mp, head) +#endif + +/* + * check log record header for recovery + */ +STATIC int +xlog_header_check_recover( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + + /* + * IRIX doesn't write the h_fmt field and leaves it zeroed + * (XLOG_FMT_UNKNOWN). This stops us from trying to recover + * a dirty log created in IRIX. + */ + if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { + xfs_warn(mp, + "dirty log written in incompatible format - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(1)", + XFS_ERRLEVEL_HIGH, mp); + return XFS_ERROR(EFSCORRUPTED); + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xfs_warn(mp, + "dirty log entry has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(2)", + XFS_ERRLEVEL_HIGH, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * read the head block of the log and check the header + */ +STATIC int +xlog_header_check_mount( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + + if (uuid_is_nil(&head->h_fs_uuid)) { + /* + * IRIX doesn't write the h_fs_uuid or h_fmt fields. If + * h_fs_uuid is nil, we assume this log was last mounted + * by IRIX and continue. 
+ */ + xfs_warn(mp, "nil uuid in log - IRIX style log"); + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xfs_warn(mp, "log has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_mount", + XFS_ERRLEVEL_HIGH, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +STATIC void +xlog_recover_iodone( + struct xfs_buf *bp) +{ + if (XFS_BUF_GETERROR(bp)) { + /* + * We're not going to bother about retrying + * this during recovery. One strike! + */ + xfs_ioerror_alert("xlog_recover_iodone", + bp->b_target->bt_mount, bp, + XFS_BUF_ADDR(bp)); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + XFS_BUF_CLR_IODONE_FUNC(bp); + xfs_buf_ioend(bp, 0); +} + +/* + * This routine finds (to an approximation) the first block in the physical + * log which contains the given cycle. It uses a binary search algorithm. + * Note that the algorithm can not be perfect because the disk will not + * necessarily be perfect. + */ +STATIC int +xlog_find_cycle_start( + xlog_t *log, + xfs_buf_t *bp, + xfs_daddr_t first_blk, + xfs_daddr_t *last_blk, + uint cycle) +{ + xfs_caddr_t offset; + xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; + uint mid_cycle; + int error; + + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { + error = xlog_bread(log, mid_blk, 1, bp, &offset); + if (error) + return error; + mid_cycle = xlog_get_cycle(offset); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); + } + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; + + return 0; +} + +/* + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. + */ +STATIC int +xlog_find_verify_cycle( + xlog_t *log, + xfs_daddr_t start_blk, + int nbblks, + uint stop_on_cycle_no, + xfs_daddr_t *new_blk) +{ + xfs_daddr_t i, j; + uint cycle; + xfs_buf_t *bp; + xfs_daddr_t bufblks; + xfs_caddr_t buf = NULL; + int error = 0; + + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. We need to be able to read at least + * a log sector, or we're out of luck. + */ + bufblks = 1 << ffs(nbblks); + while (!(bp = xlog_get_bp(log, bufblks))) { + bufblks >>= 1; + if (bufblks < log->l_sectBBsize) + return ENOMEM; + } + + for (i = start_blk; i < start_blk + nbblks; i += bufblks) { + int bcount; + + bcount = min(bufblks, (start_blk + nbblks - i)); + + error = xlog_bread(log, i, bcount, bp, &buf); + if (error) + goto out; + + for (j = 0; j < bcount; j++) { + cycle = xlog_get_cycle(buf); + if (cycle == stop_on_cycle_no) { + *new_blk = i+j; + goto out; + } + + buf += BBSIZE; + } + } + + *new_blk = -1; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Potentially backup over partial log record write. + * + * In the typical case, last_blk is the number of the block directly after + * a good log record. 
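xlog_find_cycle_start() above narrows [first_blk, last_blk] until the two are adjacent, leaving last_blk at the first block stamped with the target cycle. A user-space model of the same bisection, with an array of cycle numbers standing in for single-block log reads (the array contents are made up for the example):

#include <stdio.h>

/* model of the log: one cycle number per basic block */
static const unsigned cycles[] = { 2, 2, 2, 2, 1, 1, 1, 1 };

static int find_cycle_start(int first, int last, unsigned cycle)
{
        while (last - first > 1) {
                int mid = (first + last) >> 1;  /* BLK_AVG() */

                if (cycles[mid] == cycle)
                        last = mid;     /* target cycle starts at or before mid */
                else
                        first = mid;    /* still inside the newer cycle */
        }
        return last;
}

int main(void)
{
        /* find the first block stamped with the older cycle 1 */
        printf("%d\n", find_cycle_start(0, 7, 1));      /* prints 4 */
        return 0;
}
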
Therefore, we subtract one to get the block number + * of the last block in the given buffer. extra_bblks contains the number + * of blocks we would have read on a previous read. This happens when the + * last log record is split over the end of the physical log. + * + * extra_bblks is the number of blocks potentially verified on a previous + * call to this routine. + */ +STATIC int +xlog_find_verify_log_record( + xlog_t *log, + xfs_daddr_t start_blk, + xfs_daddr_t *last_blk, + int extra_bblks) +{ + xfs_daddr_t i; + xfs_buf_t *bp; + xfs_caddr_t offset = NULL; + xlog_rec_header_t *head = NULL; + int error = 0; + int smallmem = 0; + int num_blks = *last_blk - start_blk; + int xhdrs; + + ASSERT(start_blk != 0 || *last_blk != start_blk); + + if (!(bp = xlog_get_bp(log, num_blks))) { + if (!(bp = xlog_get_bp(log, 1))) + return ENOMEM; + smallmem = 1; + } else { + error = xlog_bread(log, start_blk, num_blks, bp, &offset); + if (error) + goto out; + offset += ((num_blks - 1) << BBSHIFT); + } + + for (i = (*last_blk) - 1; i >= 0; i--) { + if (i < start_blk) { + /* valid log record not found */ + xfs_warn(log->l_mp, + "Log inconsistent (didn't find previous header)"); + ASSERT(0); + error = XFS_ERROR(EIO); + goto out; + } + + if (smallmem) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out; + } + + head = (xlog_rec_header_t *)offset; + + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno)) + break; + + if (!smallmem) + offset -= BBSIZE; + } + + /* + * We hit the beginning of the physical log & still no header. Return + * to caller. If caller can handle a return of -1, then this routine + * will be called again for the end of the physical log. + */ + if (i == -1) { + error = -1; + goto out; + } + + /* + * We have the final block of the good log (the first block + * of the log record _before_ the head. So we check the uuid. + */ + if ((error = xlog_header_check_mount(log->l_mp, head))) + goto out; + + /* + * We may have found a log record header before we expected one. + * last_blk will be the 1st block # with a given cycle #. We may end + * up reading an entire log record. In this case, we don't want to + * reset last_blk. Only when last_blk points in the middle of a log + * record do we update last_blk. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + uint h_size = be32_to_cpu(head->h_size); + + xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + } else { + xhdrs = 1; + } + + if (*last_blk - i + extra_bblks != + BTOBB(be32_to_cpu(head->h_len)) + xhdrs) + *last_blk = i; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Head is defined to be the point of the log where the next log write + * write could go. This means that incomplete LR writes at the end are + * eliminated when calculating the head. We aren't guaranteed that previous + * LR have complete transactions. We only know that a cycle number of + * current cycle number -1 won't be present in the log if we start writing + * from our current block number. + * + * last_blk contains the block number of the first block with a given + * cycle number. + * + * Return: zero if normal, non-zero if error. + */ +STATIC int +xlog_find_head( + xlog_t *log, + xfs_daddr_t *return_head_blk) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; + int num_scan_bblks; + uint first_half_cycle, last_half_cycle; + uint stop_on_cycle; + int error, log_bbnum = log->l_logBBsize; + + /* Is the end of the log device zeroed? 
*/ + if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { + *return_head_blk = first_blk; + + /* Is the whole lot zeroed? */ + if (!first_blk) { + /* Linux XFS shouldn't generate totally zeroed logs - + * mkfs etc write a dummy unmount record to a fresh + * log so we can store the uuid in there + */ + xfs_warn(log->l_mp, "totally zeroed log"); + } + + return 0; + } else if (error) { + xfs_warn(log->l_mp, "empty log check failed"); + return error; + } + + first_blk = 0; /* get cycle # of 1st block */ + bp = xlog_get_bp(log, 1); + if (!bp) + return ENOMEM; + + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto bp_err; + + first_half_cycle = xlog_get_cycle(offset); + + last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ + error = xlog_bread(log, last_blk, 1, bp, &offset); + if (error) + goto bp_err; + + last_half_cycle = xlog_get_cycle(offset); + ASSERT(last_half_cycle != 0); + + /* + * If the 1st half cycle number is equal to the last half cycle number, + * then the entire log is stamped with the same cycle number. In this + * case, head_blk can't be set to zero (which makes sense). The below + * math doesn't work out properly with head_blk equal to zero. Instead, + * we set it to log_bbnum which is an invalid block number, but this + * value makes the math correct. If head_blk doesn't changed through + * all the tests below, *head_blk is set to zero at the very end rather + * than log_bbnum. In a sense, log_bbnum and zero are the same block + * in a circular file. + */ + if (first_half_cycle == last_half_cycle) { + /* + * In this case we believe that the entire log should have + * cycle number last_half_cycle. We need to scan backwards + * from the end verifying that there are no holes still + * containing last_half_cycle - 1. If we find such a hole, + * then the start of that hole will be the new head. The + * simple case looks like + * x | x ... | x - 1 | x + * Another case that fits this picture would be + * x | x + 1 | x ... | x + * In this case the head really is somewhere at the end of the + * log, as one of the latest writes at the beginning was + * incomplete. + * One more case is + * x | x + 1 | x ... | x - 1 | x + * This is really the combination of the above two cases, and + * the head has to end up at the start of the x-1 hole at the + * end of the log. + * + * In the 256k log case, we will read from the beginning to the + * end of the log and search for cycle numbers equal to x-1. + * We don't worry about the x+1 blocks that we encounter, + * because we know that they cannot be the head since the log + * started with x. + */ + head_blk = log_bbnum; + stop_on_cycle = last_half_cycle - 1; + } else { + /* + * In this case we want to find the first block with cycle + * number matching last_half_cycle. We expect the log to be + * some variation on + * x + 1 ... | x ... | x + * The first block with cycle number x (last_half_cycle) will + * be where the new head belongs. First we do a binary search + * for the first occurrence of last_half_cycle. The binary + * search may not be totally accurate, so then we scan back + * from there looking for occurrences of last_half_cycle before + * us. If that backwards scan wraps around the beginning of + * the log, then we look for occurrences of last_half_cycle - 1 + * at the end of the log. The cases we're looking for look + * like + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot + * or + * <---------> less than scan distance + * x + 1 ... | x ... 
| x - 1 | x + * ^ we want to locate this spot + */ + stop_on_cycle = last_half_cycle; + if ((error = xlog_find_cycle_start(log, bp, first_blk, + &head_blk, last_half_cycle))) + goto bp_err; + } + + /* + * Now validate the answer. Scan back some number of maximum possible + * blocks and make sure each one has the expected cycle number. The + * maximum is determined by the total possible amount of buffering + * in the in-core log. The following number can be made tighter if + * we actually look at the block size of the filesystem. + */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + /* + * We are guaranteed that the entire check can be performed + * in one buffer. + */ + start_blk = head_blk - num_scan_bblks; + if ((error = xlog_find_verify_cycle(log, + start_blk, num_scan_bblks, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } else { /* need to read 2 parts of log */ + /* + * We are going to scan backwards in the log in two parts. + * First we scan the physical end of the log. In this part + * of the log, we are looking for blocks with cycle number + * last_half_cycle - 1. + * If we find one, then we know that the log starts there, as + * we've found a hole that didn't get written in going around + * the end of the physical log. The simple case for this is + * x + 1 ... | x ... | x - 1 | x + * <---------> less than scan distance + * If all of the blocks at the end of the log have cycle number + * last_half_cycle, then we check the blocks at the start of + * the log looking for occurrences of last_half_cycle. If we + * find one, then our current estimate for the location of the + * first occurrence of last_half_cycle is wrong and we move + * back to the hole we've found. This case looks like + * x + 1 ... | x | x + 1 | x ... + * ^ binary search stopped here + * Another case we need to handle that only occurs in 256k + * logs is + * x + 1 ... | x ... | x+1 | x ... + * ^ binary search stops here + * In a 256k log, the scan at the end of the log will see the + * x + 1 blocks. We need to skip past those since that is + * certainly not the head of the log. By searching for + * last_half_cycle-1 we accomplish that. + */ + ASSERT(head_blk <= INT_MAX && + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); + if ((error = xlog_find_verify_cycle(log, start_blk, + num_scan_bblks - (int)head_blk, + (stop_on_cycle - 1), &new_blk))) + goto bp_err; + if (new_blk != -1) { + head_blk = new_blk; + goto validate_head; + } + + /* + * Scan beginning of log now. The last part of the physical + * log is good. This scan needs to verify that it doesn't find + * the last_half_cycle. + */ + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_cycle(log, + start_blk, (int)head_blk, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } + +validate_head: + /* + * Now we need to make sure head_blk is not pointing to a block in + * the middle of a log record. 
+ */ + num_scan_bblks = XLOG_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ + + /* start ptr at last block ptr before head_blk */ + if ((error = xlog_find_verify_log_record(log, start_blk, + &head_blk, 0)) == -1) { + error = XFS_ERROR(EIO); + goto bp_err; + } else if (error) + goto bp_err; + } else { + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_log_record(log, start_blk, + &head_blk, 0)) == -1) { + /* We hit the beginning of the log during our search */ + start_blk = log_bbnum - (num_scan_bblks - head_blk); + new_blk = log_bbnum; + ASSERT(start_blk <= INT_MAX && + (xfs_daddr_t) log_bbnum-start_blk >= 0); + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_log_record(log, + start_blk, &new_blk, + (int)head_blk)) == -1) { + error = XFS_ERROR(EIO); + goto bp_err; + } else if (error) + goto bp_err; + if (new_blk != log_bbnum) + head_blk = new_blk; + } else if (error) + goto bp_err; + } + + xlog_put_bp(bp); + if (head_blk == log_bbnum) + *return_head_blk = 0; + else + *return_head_blk = head_blk; + /* + * When returning here, we have a good block number. Bad block + * means that during a previous crash, we didn't have a clean break + * from cycle number N to cycle number N-1. In this case, we need + * to find the first block with cycle number N-1. + */ + return 0; + + bp_err: + xlog_put_bp(bp); + + if (error) + xfs_warn(log->l_mp, "failed to find log head"); + return error; +} + +/* + * Find the sync block number or the tail of the log. + * + * This will be the block number of the last record to have its + * associated buffers synced to disk. Every log record header has + * a sync lsn embedded in it. LSNs hold block numbers, so it is easy + * to get a sync block number. The only concern is to figure out which + * log record header to believe. + * + * The following algorithm uses the log record header with the largest + * lsn. The entire log record does not need to be valid. We only care + * that the header is valid. + * + * We could speed up search by using current head_blk buffer, but it is not + * available. + */ +STATIC int +xlog_find_tail( + xlog_t *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk) +{ + xlog_rec_header_t *rhead; + xlog_op_header_t *op_head; + xfs_caddr_t offset = NULL; + xfs_buf_t *bp; + int error, i, found; + xfs_daddr_t umount_data_blk; + xfs_daddr_t after_umount_blk; + xfs_lsn_t tail_lsn; + int hblks; + + found = 0; + + /* + * Find previous log record + */ + if ((error = xlog_find_head(log, head_blk))) + return error; + + bp = xlog_get_bp(log, 1); + if (!bp) + return ENOMEM; + if (*head_blk == 0) { /* special case */ + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto done; + + if (xlog_get_cycle(offset) == 0) { + *tail_blk = 0; + /* leave all other log inited values alone */ + goto done; + } + } + + /* + * Search backwards looking for log record header block + */ + ASSERT(*head_blk < INT_MAX); + for (i = (int)(*head_blk) - 1; i >= 0; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { + found = 1; + break; + } + } + /* + * If we haven't found the log record header block, start looking + * again from the end of the physical log. XXXmiken: There should be + * a check here to make sure we didn't search more than N blocks in + * the previous code. 
+ */ + if (!found) { + for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (XLOG_HEADER_MAGIC_NUM == + be32_to_cpu(*(__be32 *)offset)) { + found = 2; + break; + } + } + } + if (!found) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + ASSERT(0); + return XFS_ERROR(EIO); + } + + /* find blk_no of tail of log */ + rhead = (xlog_rec_header_t *)offset; + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + + /* + * Reset log values according to the state of the log when we + * crashed. In the case where head_blk == 0, we bump curr_cycle + * one because the next write starts a new cycle rather than + * continuing the cycle of the last good log record. At this + * point we have guaranteed that all partial log records have been + * accounted for. Therefore, we know that the last good log record + * written was complete and ended exactly on the end boundary + * of the physical log. + */ + log->l_prev_block = i; + log->l_curr_block = (int)*head_blk; + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); + if (found == 2) + log->l_curr_cycle++; + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + + /* + * Look for unmount record. If we find it, then we know there + * was a clean unmount. Since 'i' could be the last block in + * the physical log, we convert to a log block before comparing + * to the head_blk. + * + * Save the current tail lsn to use to pass to + * xlog_clear_stale_blocks() below. We won't want to clear the + * unmount record if there is one, so we pass the lsn of the + * unmount record rather than the block after it. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rhead->h_size); + int h_version = be32_to_cpu(rhead->h_version); + + if ((h_version & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + } else { + hblks = 1; + } + } else { + hblks = 1; + } + after_umount_blk = (i + hblks + (int) + BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; + tail_lsn = atomic64_read(&log->l_tail_lsn); + if (*head_blk == after_umount_blk && + be32_to_cpu(rhead->h_num_logops) == 1) { + umount_data_blk = (i + hblks) % log->l_logBBsize; + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) + goto done; + + op_head = (xlog_op_header_t *)offset; + if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { + /* + * Set tail and last sync so that newly written + * log records will point recovery to after the + * current unmount record. + */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); + *tail_blk = after_umount_blk; + + /* + * Note that the unmount was clean. If the unmount + * was not clean, we need to know this to rebuild the + * superblock counters from the perag headers if we + * have a filesystem using non-persistent counters. + */ + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + } + } + + /* + * Make sure that there are no blocks in front of the head + * with the same cycle number as the head. 
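The hblks computation above determines how many 512-byte header sectors a log record carries: a version-2 log whose iclog size (h_size) exceeds XLOG_HEADER_CYCLE_SIZE needs one header sector per 32 KB of cycle data, rounded up. A stand-alone check of that arithmetic for a hypothetical 256 KB iclog:

#include <stdio.h>

#define XLOG_HEADER_CYCLE_SIZE  (32 * 1024)

static int rec_header_blocks(int v2, int h_size)
{
        int hblks = 1;

        if (v2 && h_size > XLOG_HEADER_CYCLE_SIZE) {
                hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
                if (h_size % XLOG_HEADER_CYCLE_SIZE)
                        hblks++;
        }
        return hblks;
}

int main(void)
{
        /* a 256 KB iclog on a v2 log carries 8 header sectors of cycle data */
        printf("%d\n", rec_header_blocks(1, 256 * 1024));       /* prints 8 */
        return 0;
}
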
This can happen + * because we allow multiple outstanding log writes concurrently, + * and the later writes might make it out before earlier ones. + * + * We use the lsn from before modifying it so that we'll never + * overwrite the unmount record after a clean unmount. + * + * Do this only if we are going to recover the filesystem + * + * NOTE: This used to say "if (!readonly)" + * However on Linux, we can & do recover a read-only filesystem. + * We only skip recovery if NORECOVERY is specified on mount, + * in which case we would not be here. + * + * But... if the -device- itself is readonly, just skip this. + * We can't recover this device anyway, so it won't matter. + */ + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) + error = xlog_clear_stale_blocks(log, tail_lsn); + +done: + xlog_put_bp(bp); + + if (error) + xfs_warn(log->l_mp, "failed to locate log tail"); + return error; +} + +/* + * Is the log zeroed at all? + * + * The last binary search should be changed to perform an X block read + * once X becomes small enough. You can then search linearly through + * the X blocks. This will cut down on the number of reads we need to do. + * + * If the log is partially zeroed, this routine will pass back the blkno + * of the first block with cycle number 0. It won't have a complete LR + * preceding it. + * + * Return: + * 0 => the log is completely written to + * -1 => use *blk_no as the first block of the log + * >0 => error has occurred + */ +STATIC int +xlog_find_zeroed( + xlog_t *log, + xfs_daddr_t *blk_no) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + uint first_cycle, last_cycle; + xfs_daddr_t new_blk, last_blk, start_blk; + xfs_daddr_t num_scan_bblks; + int error, log_bbnum = log->l_logBBsize; + + *blk_no = 0; + + /* check totally zeroed log */ + bp = xlog_get_bp(log, 1); + if (!bp) + return ENOMEM; + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto bp_err; + + first_cycle = xlog_get_cycle(offset); + if (first_cycle == 0) { /* completely zeroed log */ + *blk_no = 0; + xlog_put_bp(bp); + return -1; + } + + /* check partially zeroed log */ + error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); + if (error) + goto bp_err; + + last_cycle = xlog_get_cycle(offset); + if (last_cycle != 0) { /* log completely written to */ + xlog_put_bp(bp); + return 0; + } else if (first_cycle != 1) { + /* + * If the cycle of the last block is zero, the cycle of + * the first block must be 1. If it's not, maybe we're + * not looking at a log... Bail out. + */ + xfs_warn(log->l_mp, + "Log inconsistent or not a log (last==0, first!=1)"); + return XFS_ERROR(EINVAL); + } + + /* we have a partially zeroed log */ + last_blk = log_bbnum-1; + if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) + goto bp_err; + + /* + * Validate the answer. Because there is no way to guarantee that + * the entire log is made up of log records which are the same size, + * we scan over the defined maximum blocks. At this point, the maximum + * is not chosen to mean anything special. XXXmiken + */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + ASSERT(num_scan_bblks <= INT_MAX); + + if (last_blk < num_scan_bblks) + num_scan_bblks = last_blk; + start_blk = last_blk - num_scan_bblks; + + /* + * We search for any instances of cycle number 0 that occur before + * our current estimate of the head. What we're trying to detect is + * 1 ... | 0 | 1 | 0... 
+ * ^ binary search ends here + */ + if ((error = xlog_find_verify_cycle(log, start_blk, + (int)num_scan_bblks, 0, &new_blk))) + goto bp_err; + if (new_blk != -1) + last_blk = new_blk; + + /* + * Potentially backup over partial log record write. We don't need + * to search the end of the log because we know it is zero. + */ + if ((error = xlog_find_verify_log_record(log, start_blk, + &last_blk, 0)) == -1) { + error = XFS_ERROR(EIO); + goto bp_err; + } else if (error) + goto bp_err; + + *blk_no = last_blk; +bp_err: + xlog_put_bp(bp); + if (error) + return error; + return -1; +} + +/* + * These are simple subroutines used by xlog_clear_stale_blocks() below + * to initialize a buffer full of empty log record headers and write + * them into the log. + */ +STATIC void +xlog_add_record( + xlog_t *log, + xfs_caddr_t buf, + int cycle, + int block, + int tail_cycle, + int tail_block) +{ + xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; + + memset(buf, 0, BBSIZE); + recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + recp->h_cycle = cpu_to_be32(cycle); + recp->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); + recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); + recp->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); +} + +STATIC int +xlog_write_log_records( + xlog_t *log, + int cycle, + int start_block, + int blocks, + int tail_cycle, + int tail_block) +{ + xfs_caddr_t offset; + xfs_buf_t *bp; + int balign, ealign; + int sectbb = log->l_sectBBsize; + int end_block = start_block + blocks; + int bufblks; + int error = 0; + int i, j = 0; + + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ + bufblks = 1 << ffs(blocks); + while (!(bp = xlog_get_bp(log, bufblks))) { + bufblks >>= 1; + if (bufblks < sectbb) + return ENOMEM; + } + + /* We may need to do a read at the start to fill in part of + * the buffer in the starting sector not covered by the first + * write below. + */ + balign = round_down(start_block, sectbb); + if (balign != start_block) { + error = xlog_bread_noalign(log, start_block, 1, bp); + if (error) + goto out_put_bp; + + j = start_block - balign; + } + + for (i = start_block; i < end_block; i += bufblks) { + int bcount, endcount; + + bcount = min(bufblks, end_block - start_block); + endcount = bcount - j; + + /* We may need to do a read at the end to fill in part of + * the buffer in the final sector not covered by the write. + * If this is the same sector as the above read, skip it. + */ + ealign = round_down(end_block, sectbb); + if (j == 0 && (start_block + endcount > ealign)) { + offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); + error = xlog_bread_offset(log, ealign, sectbb, + bp, offset); + if (error) + break; + + } + + offset = xlog_align(log, start_block, endcount, bp); + for (; j < endcount; j++) { + xlog_add_record(log, offset, cycle, i+j, + tail_cycle, tail_block); + offset += BBSIZE; + } + error = xlog_bwrite(log, start_block, endcount, bp); + if (error) + break; + start_block += endcount; + j = 0; + } + + out_put_bp: + xlog_put_bp(bp); + return error; +} + +/* + * This routine is called to blow away any incomplete log writes out + * in front of the log head. 
We do this so that we won't become confused + * if we come up, write only a little bit more, and then crash again. + * If we leave the partial log records out there, this situation could + * cause us to think those partial writes are valid blocks since they + * have the current cycle number. We get rid of them by overwriting them + * with empty log records with the old cycle number rather than the + * current one. + * + * The tail lsn is passed in rather than taken from + * the log so that we will not write over the unmount record after a + * clean unmount in a 512 block log. Doing so would leave the log without + * any valid log records in it until a new one was written. If we crashed + * during that time we would not be able to recover. + */ +STATIC int +xlog_clear_stale_blocks( + xlog_t *log, + xfs_lsn_t tail_lsn) +{ + int tail_cycle, head_cycle; + int tail_block, head_block; + int tail_distance, max_distance; + int distance; + int error; + + tail_cycle = CYCLE_LSN(tail_lsn); + tail_block = BLOCK_LSN(tail_lsn); + head_cycle = log->l_curr_cycle; + head_block = log->l_curr_block; + + /* + * Figure out the distance between the new head of the log + * and the tail. We want to write over any blocks beyond the + * head that we may have written just before the crash, but + * we don't want to overwrite the tail of the log. + */ + if (head_cycle == tail_cycle) { + /* + * The tail is behind the head in the physical log, + * so the distance from the head to the tail is the + * distance from the head to the end of the log plus + * the distance from the beginning of the log to the + * tail. + */ + if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { + XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + tail_distance = tail_block + (log->l_logBBsize - head_block); + } else { + /* + * The head is behind the tail in the physical log, + * so the distance from the head to the tail is just + * the tail block minus the head block. + */ + if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ + XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + tail_distance = tail_block - head_block; + } + + /* + * If the head is right up against the tail, we can't clear + * anything. + */ + if (tail_distance <= 0) { + ASSERT(tail_distance == 0); + return 0; + } + + max_distance = XLOG_TOTAL_REC_SHIFT(log); + /* + * Take the smaller of the maximum amount of outstanding I/O + * we could have and the distance to the tail to clear out. + * We take the smaller so that we don't overwrite the tail and + * we don't waste all day writing from the head to the tail + * for no reason. + */ + max_distance = MIN(max_distance, tail_distance); + + if ((head_block + max_distance) <= log->l_logBBsize) { + /* + * We can stomp all the blocks we need to without + * wrapping around the end of the log. Just do it + * in a single write. Use the cycle number of the + * current cycle minus one so that the log will look like: + * n ... | n - 1 ... + */ + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, max_distance, tail_cycle, + tail_block); + if (error) + return error; + } else { + /* + * We need to wrap around the end of the physical log in + * order to clear all the blocks. Do it in two separate + * I/Os. The first write should be from the head to the + * end of the physical log, and it should use the current + * cycle number minus one just like above. 
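The distance calculation in xlog_clear_stale_blocks() above takes one of two shapes: when head and tail share a cycle number the tail is physically behind the head, so the clearable span wraps from the head through the end of the log and on to the tail; otherwise it is simply tail_block - head_block. The sketch below models just that arithmetic with made-up block numbers and omits the corruption checks:

#include <stdio.h>

static int tail_distance(int log_bbsize, int head_cycle, int head_block,
                         int tail_cycle, int tail_block)
{
        if (head_cycle == tail_cycle)
                /* tail physically behind the head: wrap through the log end */
                return tail_block + (log_bbsize - head_block);

        /* head_cycle == tail_cycle + 1: head physically behind the tail */
        return tail_block - head_block;
}

int main(void)
{
        /* 1000-block log, head at block 900, tail at block 100, same cycle */
        printf("%d\n", tail_distance(1000, 5, 900, 5, 100));    /* prints 200 */
        return 0;
}
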
+ */ + distance = log->l_logBBsize - head_block; + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, distance, tail_cycle, + tail_block); + + if (error) + return error; + + /* + * Now write the blocks at the start of the physical log. + * This writes the remainder of the blocks we want to clear. + * It uses the current cycle number since we're now on the + * same cycle as the head so that we get: + * n ... n ... | n - 1 ... + * ^^^^^ blocks we're writing + */ + distance = max_distance - (log->l_logBBsize - head_block); + error = xlog_write_log_records(log, head_cycle, 0, distance, + tail_cycle, tail_block); + if (error) + return error; + } + + return 0; +} + +/****************************************************************************** + * + * Log recover routines + * + ****************************************************************************** + */ + +STATIC xlog_recover_t * +xlog_recover_find_tid( + struct hlist_head *head, + xlog_tid_t tid) +{ + xlog_recover_t *trans; + struct hlist_node *n; + + hlist_for_each_entry(trans, n, head, r_list) { + if (trans->r_log_tid == tid) + return trans; + } + return NULL; +} + +STATIC void +xlog_recover_new_tid( + struct hlist_head *head, + xlog_tid_t tid, + xfs_lsn_t lsn) +{ + xlog_recover_t *trans; + + trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = lsn; + INIT_LIST_HEAD(&trans->r_itemq); + + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, head); +} + +STATIC void +xlog_recover_add_item( + struct list_head *head) +{ + xlog_recover_item_t *item; + + item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); +} + +STATIC int +xlog_recover_add_to_cont_trans( + struct log *log, + xlog_recover_t *trans, + xfs_caddr_t dp, + int len) +{ + xlog_recover_item_t *item; + xfs_caddr_t ptr, old_ptr; + int old_len; + + if (list_empty(&trans->r_itemq)) { + /* finish copying rest of trans header */ + xlog_recover_add_item(&trans->r_itemq); + ptr = (xfs_caddr_t) &trans->r_theader + + sizeof(xfs_trans_header_t) - len; + memcpy(ptr, dp, len); /* d, s, l */ + return 0; + } + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + + old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; + old_len = item->ri_buf[item->ri_cnt-1].i_len; + + ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); + memcpy(&ptr[old_len], dp, len); /* d, s, l */ + item->ri_buf[item->ri_cnt-1].i_len += len; + item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); + return 0; +} + +/* + * The next region to add is the start of a new region. It could be + * a whole region or it could be the first part of a new region. Because + * of this, the assumption here is that the type and size fields of all + * format structures fit into the first 32 bits of the structure. + * + * This works because all regions must be 32 bit aligned. Therefore, we + * either have both fields or we have neither field. In the case we have + * neither field, the data part of the region is zero length. We only have + * a log_op_header and can throw away the header since a new one will appear + * later. If we have at least 4 bytes, then we can determine how many regions + * will appear in the current log item. 
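+ *
+ * (For an inode log item, for instance, those first 32 bits are the
+ * ilf_type and ilf_size fields of the xfs_inode_log_format structure;
+ * ilf_size is what gets copied into ri_total below.)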
+ */ +STATIC int +xlog_recover_add_to_trans( + struct log *log, + xlog_recover_t *trans, + xfs_caddr_t dp, + int len) +{ + xfs_inode_log_format_t *in_f; /* any will do */ + xlog_recover_item_t *item; + xfs_caddr_t ptr; + + if (!len) + return 0; + if (list_empty(&trans->r_itemq)) { + /* we need to catch log corruptions here */ + if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { + xfs_warn(log->l_mp, "%s: bad header magic number", + __func__); + ASSERT(0); + return XFS_ERROR(EIO); + } + if (len == sizeof(xfs_trans_header_t)) + xlog_recover_add_item(&trans->r_itemq); + memcpy(&trans->r_theader, dp, len); /* d, s, l */ + return 0; + } + + ptr = kmem_alloc(len, KM_SLEEP); + memcpy(ptr, dp, len); + in_f = (xfs_inode_log_format_t *)ptr; + + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ + xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); + } + + if (item->ri_total == 0) { /* first region to be added */ + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xfs_warn(log->l_mp, + "bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + return XFS_ERROR(EIO); + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); + } + ASSERT(item->ri_total > item->ri_cnt); + /* Description region is ri_buf[0] */ + item->ri_buf[item->ri_cnt].i_addr = ptr; + item->ri_buf[item->ri_cnt].i_len = len; + item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); + return 0; +} + +/* + * Sort the log items in the transaction. Cancelled buffers need + * to be put first so they are processed before any items that might + * modify the buffers. If they are cancelled, then the modifications + * don't need to be replayed. + */ +STATIC int +xlog_recover_reorder_trans( + struct log *log, + xlog_recover_t *trans, + int pass) +{ + xlog_recover_item_t *item, *n; + LIST_HEAD(sort_list); + + list_splice_init(&trans->r_itemq, &sort_list); + list_for_each_entry_safe(item, n, &sort_list, ri_list) { + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &trans->r_itemq); + break; + } + case XFS_LI_INODE: + case XFS_LI_DQUOT: + case XFS_LI_QUOTAOFF: + case XFS_LI_EFD: + case XFS_LI_EFI: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &trans->r_itemq); + break; + default: + xfs_warn(log->l_mp, + "%s: unrecognized type of log operation", + __func__); + ASSERT(0); + return XFS_ERROR(EIO); + } + } + ASSERT(list_empty(&sort_list)); + return 0; +} + +/* + * Build up the table of buf cancel records so that we don't replay + * cancelled data in the second pass. For buffer records that are + * not cancel records, there is nothing to do here so we just return. + * + * If we get a cancel record which is already in the table, this indicates + * that the buffer was cancelled multiple times. In order to ensure + * that during pass 2 we keep the record in the table until we reach its + * last occurrence in the log, we keep a reference count in the cancel + * record in the table to tell us how many times we expect to see this + * record during the second pass. 
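+ *
+ * For example, if the same buffer is cancelled twice in the log,
+ * pass 1 leaves one record for it with bc_refcount == 2, and pass 2
+ * only removes the record after seeing the second cancel item.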
+ */ +STATIC int +xlog_recover_buffer_pass1( + struct log *log, + xlog_recover_item_t *item) +{ + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + /* + * If this isn't a cancel buffer item, then just return. + */ + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_buf_not_cancel(log, buf_f); + return 0; + } + + /* + * Insert an xfs_buf_cancel record into the hash table of them. + * If there is already an identical record, bump its reference count. + */ + bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == buf_f->blf_blkno && + bcp->bc_len == buf_f->blf_len) { + bcp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); + return 0; + } + } + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp->bc_blkno = buf_f->blf_blkno; + bcp->bc_len = buf_f->blf_len; + bcp->bc_refcount = 1; + list_add_tail(&bcp->bc_list, bucket); + + trace_xfs_log_recover_buf_cancel_add(log, buf_f); + return 0; +} + +/* + * Check to see whether the buffer being recovered has a corresponding + * entry in the buffer cancel record table. If it does then return 1 + * so that it will be cancelled, otherwise return 0. If the buffer is + * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement + * the refcount on the entry in the table and remove it from the table + * if this is the last reference. + * + * We remove the cancel record from the table when we encounter its + * last occurrence in the log so that if the same buffer is re-used + * again after its last cancellation we actually replay the changes + * made at that point. + */ +STATIC int +xlog_check_buffer_cancelled( + struct log *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + if (log->l_buf_cancel_table == NULL) { + /* + * There is nothing in the table built in pass one, + * so this buffer must not be cancelled. + */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return 0; + } + + /* + * Search for an entry in the cancel table that matches our buffer. + */ + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + goto found; + } + + /* + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. + */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return 0; + +found: + /* + * We've go a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + } + return 1; +} + +/* + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). + * + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. 
In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. + */ +STATIC int +xlog_recover_do_inode_buffer( + struct xfs_mount *mp, + xlog_recover_item_t *item, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + int i; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; + int next_unlinked_offset; + int inodes_per_buf; + xfs_agino_t *logged_nextp; + xfs_agino_t *buffer_nextp; + + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + + inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + for (i = 0; i < inodes_per_buf; i++) { + next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + + offsetof(xfs_dinode_t, di_next_unlinked); + + while (next_unlinked_offset >= + (reg_buf_offset + reg_buf_bytes)) { + /* + * The next di_next_unlinked field is beyond + * the current logged region. Find the next + * logged region that contains or is beyond + * the current di_next_unlinked field. + */ + bit += nbits; + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + + /* + * If there are no more logged regions in the + * buffer, then we're done. + */ + if (bit == -1) + return 0; + + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; + item_index++; + } + + /* + * If the current logged region starts after the current + * di_next_unlinked field, then move on to the next + * di_next_unlinked field. + */ + if (next_unlinked_offset < reg_buf_offset) + continue; + + ASSERT(item->ri_buf[item_index].i_addr != NULL); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + + /* + * The current logged region contains a copy of the + * current di_next_unlinked field. Extract its value + * and copy it to the buffer copy. + */ + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; + if (unlikely(*logged_nextp == 0)) { + xfs_alert(mp, + "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " + "Trying to replay bad (0) inode di_next_unlinked field.", + item, bp); + XFS_ERROR_REPORT("xlog_recover_do_inode_buf", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, + next_unlinked_offset); + *buffer_nextp = *logged_nextp; + } + + return 0; +} + +/* + * Perform a 'normal' buffer recovery. Each logged region of the + * buffer should be copied over the corresponding region in the + * given buffer. The bitmap in the buf log format structure indicates + * where to place the logged data. 
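+ *
+ * (Each map bit covers one XFS_BLF_CHUNK - normally 128 bytes - of
+ * the buffer, so, for illustration, bits 2 and 3 set in blf_data_map
+ * describe a 256 byte logged region starting at byte offset 256.)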
+ */ +STATIC void +xlog_recover_do_reg_buffer( + struct xfs_mount *mp, + xlog_recover_item_t *item, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + int i; + int bit; + int nbits; + int error; + + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + + bit = 0; + i = 1; /* 0 is the buf format structure */ + while (1) { + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + if (bit == -1) + break; + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + ASSERT(item->ri_buf[i].i_addr != NULL); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(XFS_BUF_COUNT(bp) >= + ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); + + /* + * Do a sanity check if this is a dquot buffer. Just checking + * the first dquot in the buffer should do. XXXThis is + * probably a good thing to do for other buf types also. + */ + error = 0; + if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + if (item->ri_buf[i].i_addr == NULL) { + xfs_alert(mp, + "XFS: NULL dquot in %s.", __func__); + goto next; + } + if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(mp, + "XFS: dquot too small (%d) in %s.", + item->ri_buf[i].i_len, __func__); + goto next; + } + error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr, + -1, 0, XFS_QMOPT_DOWARN, + "dquot_buf_recover"); + if (error) + goto next; + } + + memcpy(xfs_buf_offset(bp, + (uint)bit << XFS_BLF_SHIFT), /* dest */ + item->ri_buf[i].i_addr, /* source */ + nbits<<XFS_BLF_SHIFT); /* length */ + next: + i++; + bit += nbits; + } + + /* Shouldn't be any more regions */ + ASSERT(i == item->ri_total); +} +
+/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_qm_dqcheck( + struct xfs_mount *mp, + xfs_disk_dquot_t *ddq, + xfs_dqid_t id, + uint type, /* used only when IO_dorepair is true */ + uint flags, + char *str) +{ + xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; + int errs = 0; + + /* + * We can encounter an uninitialized dquot buffer for 2 reasons: + * 1. If we crash while deleting the quotainode(s), and those blks got + * used for user data. This is because we take the path of regular + * file deletion; however, the size field of quotainodes is never + * updated, so all the tricks that we play in itruncate_finish + * don't quite matter. + * + * 2. We don't play the quota buffers when there's a quotaoff logitem. + * But the allocation will be replayed so we'll end up with an + * uninitialized quota block. + * + * This is all fine; things are still consistent, and we haven't lost + * any quota information. Just don't complain about bad dquot blks.
+ */ + if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", + str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); + errs++; + } + if (ddq->d_version != XFS_DQUOT_VERSION) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", + str, id, ddq->d_version, XFS_DQUOT_VERSION); + errs++; + } + + if (ddq->d_flags != XFS_DQ_USER && + ddq->d_flags != XFS_DQ_PROJ && + ddq->d_flags != XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, unknown flags 0x%x", + str, id, ddq->d_flags); + errs++; + } + + if (id != -1 && id != be32_to_cpu(ddq->d_id)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : ondisk-dquot 0x%p, ID mismatch: " + "0x%x expected, found id 0x%x", + str, ddq, id, be32_to_cpu(ddq->d_id)); + errs++; + } + + if (!errs && ddq->d_id) { + if (ddq->d_blk_softlimit && + be64_to_cpu(ddq->d_bcount) >= + be64_to_cpu(ddq->d_blk_softlimit)) { + if (!ddq->d_btimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_ino_softlimit && + be64_to_cpu(ddq->d_icount) >= + be64_to_cpu(ddq->d_ino_softlimit)) { + if (!ddq->d_itimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_rtb_softlimit && + be64_to_cpu(ddq->d_rtbcount) >= + be64_to_cpu(ddq->d_rtb_softlimit)) { + if (!ddq->d_rtbtimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + } + + if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) + return errs; + + if (flags & XFS_QMOPT_DOWARN) + xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); + + /* + * Typically, a repair is only requested by quotacheck. + */ + ASSERT(id != -1); + ASSERT(flags & XFS_QMOPT_DQREPAIR); + memset(d, 0, sizeof(xfs_dqblk_t)); + + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_flags = type; + d->dd_diskdq.d_id = cpu_to_be32(id); + + return errs; +} + +/* + * Perform a dquot buffer recovery. + * Simple algorithm: if we have found a QUOTAOFF logitem of the same type + * (ie. USR or GRP), then just toss this buffer away; don't recover it. + * Else, treat it as a regular buffer and do recovery. + */ +STATIC void +xlog_recover_do_dquot_buffer( + xfs_mount_t *mp, + xlog_t *log, + xlog_recover_item_t *item, + xfs_buf_t *bp, + xfs_buf_log_format_t *buf_f) +{ + uint type; + + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (mp->m_qflags == 0) { + return; + } + + type = 0; + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) + type |= XFS_DQ_USER; + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) + type |= XFS_DQ_PROJ; + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) + type |= XFS_DQ_GROUP; + /* + * This type of quotas was turned off, so ignore this buffer + */ + if (log->l_quotaoffs_flag & type) + return; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); +} + +/* + * This routine replays a modification made to a buffer at runtime. + * There are actually two types of buffer, regular and inode, which + * are handled differently. 
Inode buffers are handled differently + * in that we only recover a specific set of data from them, namely + * the inode di_next_unlinked fields. This is because all other inode + * data is actually logged via inode records and any data we replay + * here which overlaps that may be stale. + * + * When meta-data buffers are freed at run time we log a buffer item + * with the XFS_BLF_CANCEL bit set to indicate that previous copies + * of the buffer in the log should not be replayed at recovery time. + * This is so that if the blocks covered by the buffer are reused for + * file data before we crash we don't end up replaying old, freed + * meta-data into a user's file. + * + * To handle the cancellation of buffer log items, we make two passes + * over the log during recovery. During the first we build a table of + * those buffers which have been cancelled, and during the second we + * only replay those buffers which do not have corresponding cancel + * records in the table. See xlog_recover_do_buffer_pass[1,2] above + * for more details on the implementation of the table of cancel records. + */ +STATIC int +xlog_recover_buffer_pass2( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + int error; + uint buf_flags; + + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); + return 0; + } + + trace_xfs_log_recover_buf_recover(log, buf_f); + + buf_flags = XBF_LOCK; + if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) + buf_flags |= XBF_MAPPED; + + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags); + if (XFS_BUF_ISERROR(bp)) { + xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, + bp, buf_f->blf_blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + return error; + } + + error = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + } else if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + } else { + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + } + if (error) + return XFS_ERROR(error); + + /* + * Perform delayed write on the buffer. Asynchronous writes will be + * slower when taking into account all the buffers to be flushed. + * + * Also make sure that only inode buffers with good sizes stay in + * the buffer cache. The kernel moves inodes in buffers of 1 block + * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * buffers in the log can be a different size if the log was generated + * by an older kernel using unclustered inode buffers or a newer kernel + * running with a different inode cluster size. Regardless, if the + * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) + * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the buffer out of the buffer cache so that the buffer won't + * overlap with future reads of those inodes. 
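+ *
+ * (For illustration: with a 4k block size and an 8k inode cluster,
+ * only 8k inode buffers may stay cached; a 4k inode buffer replayed
+ * from an older log is written out synchronously and marked stale
+ * below rather than being delay-written.)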
+ */ + if (XFS_DINODE_MAGIC == + be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && + (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, + (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { + XFS_BUF_STALE(bp); + error = xfs_bwrite(mp, bp); + } else { + ASSERT(bp->b_target->bt_mount == mp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); + } + + return (error); +} + +STATIC int +xlog_recover_inode_pass2( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_inode_log_format_t *in_f; + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + xfs_dinode_t *dip; + int len; + xfs_caddr_t src; + xfs_caddr_t dest; + int error; + int attr_index; + uint fields; + xfs_icdinode_t *dicp; + int need_free = 0; + + if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { + in_f = item->ri_buf[0].i_addr; + } else { + in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); + need_free = 1; + error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); + if (error) + goto error; + } + + /* + * Inode buffers can be freed, look out for it, + * and do not replay the inode. + */ + if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, + in_f->ilf_len, 0)) { + error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); + goto error; + } + trace_xfs_log_recover_inode_recover(log, in_f); + + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + XBF_LOCK); + if (XFS_BUF_ISERROR(bp)) { + xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, + bp, in_f->ilf_blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + goto error; + } + error = 0; + ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); + dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); + + /* + * Make sure the place we're flushing out to really looks + * like an inode! 
+ */ + if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", + XFS_ERRLEVEL_LOW, mp); + error = EFSCORRUPTED; + goto error; + } + dicp = item->ri_buf[1].i_addr; + if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", + __func__, item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", + XFS_ERRLEVEL_LOW, mp); + error = EFSCORRUPTED; + goto error; + } + + /* Skip replay when the on disk inode is newer than the log one */ + if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + xfs_buf_relse(bp); + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto error; + } + } + /* Take the opportunity to reset the flush iteration count */ + dicp->di_flushiter = 0; + + if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad regular inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = EFSCORRUPTED; + goto error; + } + } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE) && + (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad dir inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = EFSCORRUPTED; + goto error; + } + } + if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", + __func__, item, dip, bp, in_f->ilf_ino, + dicp->di_nextents + dicp->di_anextents, + dicp->di_nblocks); + error = EFSCORRUPTED; + goto error; + } + if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, + item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); + error = EFSCORRUPTED; + goto error; + } + if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr 0x%p", + __func__, item->ri_buf[1].i_len, item); + error = EFSCORRUPTED; + goto error; + } + + /* The core is in in-core format */ + xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); + + /* the rest is in on-disk format */ + if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { + 
memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), + item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), + item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + } + + fields = in_f->ilf_fields; + switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { + case XFS_ILOG_DEV: + xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); + break; + case XFS_ILOG_UUID: + memcpy(XFS_DFORK_DPTR(dip), + &in_f->ilf_u.ilfu_uuid, + sizeof(uuid_t)); + break; + } + + if (in_f->ilf_size == 2) + goto write_inode_buffer; + len = item->ri_buf[2].i_len; + src = item->ri_buf[2].i_addr; + ASSERT(in_f->ilf_size <= 4); + ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); + ASSERT(!(fields & XFS_ILOG_DFORK) || + (len == in_f->ilf_dsize)); + + switch (fields & XFS_ILOG_DFORK) { + case XFS_ILOG_DDATA: + case XFS_ILOG_DEXT: + memcpy(XFS_DFORK_DPTR(dip), src, len); + break; + + case XFS_ILOG_DBROOT: + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, + (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), + XFS_DFORK_DSIZE(dip, mp)); + break; + + default: + /* + * There are no data fork flags set. + */ + ASSERT((fields & XFS_ILOG_DFORK) == 0); + break; + } + + /* + * If we logged any attribute data, recover it. There may or + * may not have been any other non-core data logged in this + * transaction. + */ + if (in_f->ilf_fields & XFS_ILOG_AFORK) { + if (in_f->ilf_fields & XFS_ILOG_DFORK) { + attr_index = 3; + } else { + attr_index = 2; + } + len = item->ri_buf[attr_index].i_len; + src = item->ri_buf[attr_index].i_addr; + ASSERT(len == in_f->ilf_asize); + + switch (in_f->ilf_fields & XFS_ILOG_AFORK) { + case XFS_ILOG_ADATA: + case XFS_ILOG_AEXT: + dest = XFS_DFORK_APTR(dip); + ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); + memcpy(dest, src, len); + break; + + case XFS_ILOG_ABROOT: + dest = XFS_DFORK_APTR(dip); + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, + len, (xfs_bmdr_block_t*)dest, + XFS_DFORK_ASIZE(dip, mp)); + break; + + default: + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); + ASSERT(0); + xfs_buf_relse(bp); + error = EIO; + goto error; + } + } + +write_inode_buffer: + ASSERT(bp->b_target->bt_mount == mp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); +error: + if (need_free) + kmem_free(in_f); + return XFS_ERROR(error); +} + +/* + * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * structure, so that we know not to do any dquot item or dquot buffer recovery, + * of that type. + */ +STATIC int +xlog_recover_quotaoff_pass1( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; + ASSERT(qoff_f); + + /* + * The logitem format's flag tells us if this was user quotaoff, + * group/project quotaoff or both. + */ + if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_USER; + if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_PROJ; + if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_GROUP; + + return (0); +} + +/* + * Recover a dquot record + */ +STATIC int +xlog_recover_dquot_pass2( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + struct xfs_disk_dquot *ddq, *recddq; + int error; + xfs_dq_logformat_t *dq_f; + uint type; + + + /* + * Filesystems are required to send in quota flags at mount time. 
+ */ + if (mp->m_qflags == 0) + return (0); + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); + return XFS_ERROR(EIO); + } + if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", + item->ri_buf[1].i_len, __func__); + return XFS_ERROR(EIO); + } + + /* + * This type of quotas was turned off, so ignore this record. + */ + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return (0); + + /* + * At this point we know that quota was _not_ turned off. + * Since the mount flags are not indicating to us otherwise, this + * must mean that quota is on, and the dquot needs to be replayed. + * Remember that we may not have fully recovered the superblock yet, + * so we can't do the usual trick of looking at the SB quota bits. + * + * The other possibility, of course, is that the quota subsystem was + * removed since the last mount - ENOSYS. + */ + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2 (log copy)"); + if (error) + return XFS_ERROR(EIO); + ASSERT(dq_f->qlf_len == 1); + + error = xfs_read_buf(mp, mp->m_ddev_targp, + dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), + 0, &bp); + if (error) { + xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, + bp, dq_f->qlf_blkno); + return error; + } + ASSERT(bp); + ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + + /* + * At least the magic num portion should be on disk because this + * was among a chunk of dquots created earlier, and we did some + * minimal initialization then. + */ + error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2"); + if (error) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + memcpy(ddq, recddq, item->ri_buf[1].i_len); + + ASSERT(dq_f->qlf_size == 2); + ASSERT(bp->b_target->bt_mount == mp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); + + return (0); +} + +/* + * This routine is called to create an in-core extent free intent + * item from the efi format structure which was logged on disk. + * It allocates an in-core efi, copies the extents from the format + * structure into it, and adds the efi to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_efi_pass2( + xlog_t *log, + xlog_recover_item_t *item, + xfs_lsn_t lsn) +{ + int error; + xfs_mount_t *mp = log->l_mp; + xfs_efi_log_item_t *efip; + xfs_efi_log_format_t *efi_formatp; + + efi_formatp = item->ri_buf[0].i_addr; + + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), + &(efip->efi_format)))) { + xfs_efi_item_free(efip); + return error; + } + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); + return 0; +} + + +/* + * This routine is called when an efd format structure is found in + * a committed transaction in the log. It's purpose is to cancel + * the corresponding efi if it was still in the log. To do this + * it searches the AIL for the efi with an id equal to that in the + * efd format structure. If we find it, we remove the efi from the + * AIL and free it. 
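+ *
+ * (In other words, an efi whose matching efd made it into the log
+ * before the crash is dropped from the AIL here, while an efi with
+ * no efd survives and is finished later by
+ * xlog_recover_process_efis().)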
+ */ +STATIC int +xlog_recover_efd_pass2( + xlog_t *log, + xlog_recover_item_t *item) +{ + xfs_efd_log_format_t *efd_formatp; + xfs_efi_log_item_t *efip = NULL; + xfs_log_item_t *lip; + __uint64_t efi_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + efd_formatp = item->ri_buf[0].i_addr; + ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || + (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + efi_id = efd_formatp->efd_efi_id; + + /* + * Search for the efi with the id in the efd format structure + * in the AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_EFI) { + efip = (xfs_efi_log_item_t *)lip; + if (efip->efi_format.efi_id == efi_id) { + /* + * xfs_trans_ail_delete() drops the + * AIL lock. + */ + xfs_trans_ail_delete(ailp, lip); + xfs_efi_item_free(efip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* + * Free up any resources allocated by the transaction + * + * Remember that EFIs, EFDs, and IUNLINKs are handled later. + */ +STATIC void +xlog_recover_free_trans( + struct xlog_recover *trans) +{ + xlog_recover_item_t *item, *n; + int i; + + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); + /* Free the item itself */ + kmem_free(item->ri_buf); + kmem_free(item); + } + /* Free the transaction recover structure */ + kmem_free(trans); +} + +STATIC int +xlog_recover_commit_pass1( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + /* nothing to do in pass 1 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_commit_pass2( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, item); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, item); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. 
+ */ +STATIC int +xlog_recover_commit_trans( + struct log *log, + struct xlog_recover *trans, + int pass) +{ + int error = 0; + xlog_recover_item_t *item; + + hlist_del(&trans->r_list); + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) + return error; + + list_for_each_entry(item, &trans->r_itemq, ri_list) { + if (pass == XLOG_RECOVER_PASS1) + error = xlog_recover_commit_pass1(log, trans, item); + else + error = xlog_recover_commit_pass2(log, trans, item); + if (error) + return error; + } + + xlog_recover_free_trans(trans); + return 0; +} + +STATIC int +xlog_recover_unmount_trans( + struct log *log, + xlog_recover_t *trans) +{ + /* Do nothing now */ + xfs_warn(log->l_mp, "%s: Unmount LR", __func__); + return 0; +} + +/* + * There are two valid states of the r_state field. 0 indicates that the + * transaction structure is in a normal state. We have either seen the + * start of the transaction or the last operation we added was not a partial + * operation. If the last operation we added to the transaction was a + * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. + * + * NOTE: skip LRs with 0 data length. + */ +STATIC int +xlog_recover_process_data( + xlog_t *log, + struct hlist_head rhash[], + xlog_rec_header_t *rhead, + xfs_caddr_t dp, + int pass) +{ + xfs_caddr_t lp; + int num_logops; + xlog_op_header_t *ohead; + xlog_recover_t *trans; + xlog_tid_t tid; + int error; + unsigned long hash; + uint flags; + + lp = dp + be32_to_cpu(rhead->h_len); + num_logops = be32_to_cpu(rhead->h_num_logops); + + /* check the log format matches our own - else we can't recover */ + if (xlog_header_check_recover(log->l_mp, rhead)) + return (XFS_ERROR(EIO)); + + while ((dp < lp) && num_logops) { + ASSERT(dp + sizeof(xlog_op_header_t) <= lp); + ohead = (xlog_op_header_t *)dp; + dp += sizeof(xlog_op_header_t); + if (ohead->oh_clientid != XFS_TRANSACTION && + ohead->oh_clientid != XFS_LOG) { + xfs_warn(log->l_mp, "%s: bad clientid 0x%x", + __func__, ohead->oh_clientid); + ASSERT(0); + return (XFS_ERROR(EIO)); + } + tid = be32_to_cpu(ohead->oh_tid); + hash = XLOG_RHASH(tid); + trans = xlog_recover_find_tid(&rhash[hash], tid); + if (trans == NULL) { /* not found; add new tid */ + if (ohead->oh_flags & XLOG_START_TRANS) + xlog_recover_new_tid(&rhash[hash], tid, + be64_to_cpu(rhead->h_lsn)); + } else { + if (dp + be32_to_cpu(ohead->oh_len) > lp) { + xfs_warn(log->l_mp, "%s: bad length 0x%x", + __func__, be32_to_cpu(ohead->oh_len)); + WARN_ON(1); + return (XFS_ERROR(EIO)); + } + flags = ohead->oh_flags & ~XLOG_END_TRANS; + if (flags & XLOG_WAS_CONT_TRANS) + flags &= ~XLOG_CONTINUE_TRANS; + switch (flags) { + case XLOG_COMMIT_TRANS: + error = xlog_recover_commit_trans(log, + trans, pass); + break; + case XLOG_UNMOUNT_TRANS: + error = xlog_recover_unmount_trans(log, trans); + break; + case XLOG_WAS_CONT_TRANS: + error = xlog_recover_add_to_cont_trans(log, + trans, dp, + be32_to_cpu(ohead->oh_len)); + break; + case XLOG_START_TRANS: + xfs_warn(log->l_mp, "%s: bad transaction", + __func__); + ASSERT(0); + error = XFS_ERROR(EIO); + break; + case 0: + case XLOG_CONTINUE_TRANS: + error = xlog_recover_add_to_trans(log, trans, + dp, be32_to_cpu(ohead->oh_len)); + break; + default: + xfs_warn(log->l_mp, "%s: bad flag 0x%x", + __func__, flags); + ASSERT(0); + error = XFS_ERROR(EIO); + break; + } + if (error) + return error; + } + dp += be32_to_cpu(ohead->oh_len); + num_logops--; + } + return 0; +} + +/* + * Process an extent free intent item that was recovered from + * the log. 
We need to free the extents that it describes. + */ +STATIC int +xlog_recover_process_efi( + xfs_mount_t *mp, + xfs_efi_log_item_t *efip) +{ + xfs_efd_log_item_t *efdp; + xfs_trans_t *tp; + int i; + int error = 0; + xfs_extent_t *extp; + xfs_fsblock_t startblock_fsb; + + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + + /* + * First check the validity of the extents described by the + * EFI. If any are bad, then assume that all are bad and + * just toss the EFI. + */ + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, extp->ext_start)); + if ((startblock_fsb == 0) || + (extp->ext_len == 0) || + (startblock_fsb >= mp->m_sb.sb_dblocks) || + (extp->ext_len >= mp->m_sb.sb_agblocks)) { + /* + * This will pull the EFI from the AIL and + * free the memory associated with it. + */ + xfs_efi_release(efip, efip->efi_format.efi_nextents); + return XFS_ERROR(EIO); + } + } + + tp = xfs_trans_alloc(mp, 0); + error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + if (error) + goto abort_error; + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + if (error) + goto abort_error; + xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, + extp->ext_len); + } + + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + error = xfs_trans_commit(tp, 0); + return error; + +abort_error: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; +} + +/* + * When this is called, all of the EFIs which did not have + * corresponding EFDs should be in the AIL. What we do now + * is free the extents associated with each one. + * + * Since we process the EFIs in normal transactions, they + * will be removed at some point after the commit. This prevents + * us from just walking down the list processing each one. + * We'll use a flag in the EFI to skip those that we've already + * processed and use the AIL iteration mechanism's generation + * count to try to speed this up at least a bit. + * + * When we start, we know that the EFIs are the only things in + * the AIL. As we process them, however, other items are added + * to the AIL. Since everything added to the AIL must come after + * everything already in the AIL, we stop processing as soon as + * we see something other than an EFI in the AIL. + */ +STATIC int +xlog_recover_process_efis( + xlog_t *log) +{ + xfs_log_item_t *lip; + xfs_efi_log_item_t *efip; + int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; + + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + /* + * We're done when we see something other than an EFI. + * There should be no EFIs left in the AIL now. + */ + if (lip->li_type != XFS_LI_EFI) { +#ifdef DEBUG + for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) + ASSERT(lip->li_type != XFS_LI_EFI); +#endif + break; + } + + /* + * Skip EFIs that we've already processed. 
+ */ + efip = (xfs_efi_log_item_t *)lip; + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { + lip = xfs_trans_ail_cursor_next(ailp, &cur); + continue; + } + + spin_unlock(&ailp->xa_lock); + error = xlog_recover_process_efi(log->l_mp, efip); + spin_lock(&ailp->xa_lock); + if (error) + goto out; + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } +out: + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); + return error; +} + +/* + * This routine performs a transaction to null out a bad inode pointer + * in an agi unlinked inode hash bucket. + */ +STATIC void +xlog_recover_clear_agi_bucket( + xfs_mount_t *mp, + xfs_agnumber_t agno, + int bucket) +{ + xfs_trans_t *tp; + xfs_agi_t *agi; + xfs_buf_t *agibp; + int offset; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); + error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), + 0, 0, 0); + if (error) + goto out_abort; + + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) + goto out_abort; + + agi = XFS_BUF_TO_AGI(agibp); + agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + + error = xfs_trans_commit(tp, 0); + if (error) + goto out_error; + return; + +out_abort: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); +out_error: + xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); + return; +} + +STATIC xfs_agino_t +xlog_recover_process_one_iunlink( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino, + int bucket) +{ + struct xfs_buf *ibp; + struct xfs_dinode *dip; + struct xfs_inode *ip; + xfs_ino_t ino; + int error; + + ino = XFS_AGINO_TO_INO(mp, agno, agino); + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + goto fail; + + /* + * Get the on disk inode to find the next inode in the bucket. + */ + error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); + if (error) + goto fail_iput; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + + /* setup for the next pass */ + agino = be32_to_cpu(dip->di_next_unlinked); + xfs_buf_relse(ibp); + + /* + * Prevent any DMAPI event from being sent when the reference on + * the inode is dropped. + */ + ip->i_d.di_dmevmask = 0; + + IRELE(ip); + return agino; + + fail_iput: + IRELE(ip); + fail: + /* + * We can't read in the inode this bucket points to, or this inode + * is messed up. Just ditch this bucket of inodes. We will lose + * some inodes and space, but at least we won't hang. + * + * Call xlog_recover_clear_agi_bucket() to perform a transaction to + * clear the inode pointer in the bucket. + */ + xlog_recover_clear_agi_bucket(mp, agno, bucket); + return NULLAGINO; +} + +/* + * xlog_iunlink_recover + * + * This is called during recovery to process any inodes which + * we unlinked but not freed when the system crashed. These + * inodes will be on the lists in the AGI blocks. What we do + * here is scan all the AGIs and fully truncate and free any + * inodes found on the lists. Each inode is removed from the + * lists when it has been fully truncated and is freed. The + * freeing of the inode and its removal from the list must be + * atomic. 
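+ *
+ * (Each AGI carries XFS_AGI_UNLINKED_BUCKETS list heads; every bucket
+ * is a singly linked list of unlinked inodes chained through the
+ * on-disk di_next_unlinked field, which is why the loop below has to
+ * read each inode's buffer to find the next entry.)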
+ */ +STATIC void +xlog_recover_process_iunlinks( + xlog_t *log) +{ + xfs_mount_t *mp; + xfs_agnumber_t agno; + xfs_agi_t *agi; + xfs_buf_t *agibp; + xfs_agino_t agino; + int bucket; + int error; + uint mp_dmevmask; + + mp = log->l_mp; + + /* + * Prevent any DMAPI event from being sent while in this function. + */ + mp_dmevmask = mp->m_dmevmask; + mp->m_dmevmask = 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + /* + * Find the agi for this ag. + */ + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + /* + * AGI is b0rked. Don't process it. + * + * We should probably mark the filesystem as corrupt + * after we've recovered all the ag's we can.... + */ + continue; + } + /* + * Unlock the buffer so that it can be acquired in the normal + * course of the transaction to truncate and free each inode. + * Because we are not racing with anyone else here for the AGI + * buffer, we don't even need to hold it locked to read the + * initial unlinked bucket entries out of the buffer. We keep + * buffer reference though, so that it stays pinned in memory + * while we need the buffer. + */ + agi = XFS_BUF_TO_AGI(agibp); + xfs_buf_unlock(agibp); + + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { + agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (agino != NULLAGINO) { + agino = xlog_recover_process_one_iunlink(mp, + agno, agino, bucket); + } + } + xfs_buf_rele(agibp); + } + + mp->m_dmevmask = mp_dmevmask; +} + + +#ifdef DEBUG +STATIC void +xlog_pack_data_checksum( + xlog_t *log, + xlog_in_core_t *iclog, + int size) +{ + int i; + __be32 *up; + uint chksum = 0; + + up = (__be32 *)iclog->ic_datap; + /* divide length by 4 to get # words */ + for (i = 0; i < (size >> 2); i++) { + chksum ^= be32_to_cpu(*up); + up++; + } + iclog->ic_header.h_chksum = cpu_to_be32(chksum); +} +#else +#define xlog_pack_data_checksum(log, iclog, size) +#endif + +/* + * Stamp cycle number in every block + */ +void +xlog_pack_data( + xlog_t *log, + xlog_in_core_t *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + xlog_pack_data_checksum(log, iclog, size); + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) { + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } + } +} + +STATIC void +xlog_unpack_data( + xlog_rec_header_t *rhead, + xfs_caddr_t dp, + xlog_t *log) +{ + int i, j, k; + + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; + for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + dp += BBSIZE; + } + } +} + +STATIC int +xlog_valid_rec_header( + xlog_t *log, + 
xlog_rec_header_t *rhead, + xfs_daddr_t blkno) +{ + int hlen; + + if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) { + XFS_ERROR_REPORT("xlog_valid_rec_header(1)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (unlikely( + (!rhead->h_version || + (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { + xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", + __func__, be32_to_cpu(rhead->h_version)); + return XFS_ERROR(EIO); + } + + /* LR body must have data or it wouldn't have been written */ + hlen = be32_to_cpu(rhead->h_len); + if (unlikely( hlen <= 0 || hlen > INT_MAX )) { + XFS_ERROR_REPORT("xlog_valid_rec_header(2)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { + XFS_ERROR_REPORT("xlog_valid_rec_header(3)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Read the log from tail to head and process the log records found. + * Handle the two cases where the tail and head are in the same cycle + * and where the active portion of the log wraps around the end of + * the physical log separately. The pass parameter is passed through + * to the routines called to process the data and is not looked at + * here. + */ +STATIC int +xlog_do_recovery_pass( + xlog_t *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int pass) +{ + xlog_rec_header_t *rhead; + xfs_daddr_t blk_no; + xfs_caddr_t offset; + xfs_buf_t *hbp, *dbp; + int error = 0, h_size; + int bblks, split_bblks; + int hblks, split_hblks, wrapped_hblks; + struct hlist_head rhash[XLOG_RHASH_SIZE]; + + ASSERT(head_blk != tail_blk); + + /* + * Read the header of the tail block and get the iclog buffer size from + * h_size. Use this to tell how many sectors make up the log header. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + /* + * When using variable length iclogs, read first sector of + * iclog header and extract the header size from it. Get a + * new hbp that is the correct size. 
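+ *
+ * (For example, with a v2 log and an h_size of 64k, the record
+ * header spans two basic blocks (hblks = 2), so the single block
+ * buffer read first is released and a two block one allocated in
+ * its place.)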
+ */ + hbp = xlog_get_bp(log, 1); + if (!hbp) + return ENOMEM; + + error = xlog_bread(log, tail_blk, 1, hbp, &offset); + if (error) + goto bread_err1; + + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, tail_blk); + if (error) + goto bread_err1; + h_size = be32_to_cpu(rhead->h_size); + if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + xlog_put_bp(hbp); + hbp = xlog_get_bp(log, hblks); + } else { + hblks = 1; + } + } else { + ASSERT(log->l_sectBBsize == 1); + hblks = 1; + hbp = xlog_get_bp(log, 1); + h_size = XLOG_BIG_RECORD_BSIZE; + } + + if (!hbp) + return ENOMEM; + dbp = xlog_get_bp(log, BTOBB(h_size)); + if (!dbp) { + xlog_put_bp(hbp); + return ENOMEM; + } + + memset(rhash, 0, sizeof(rhash)); + if (tail_blk <= head_blk) { + for (blk_no = tail_blk; blk_no < head_blk; ) { + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) + goto bread_err2; + + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, blk_no); + if (error) + goto bread_err2; + + /* blocks in data section */ + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + error = xlog_bread(log, blk_no + hblks, bblks, dbp, + &offset); + if (error) + goto bread_err2; + + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, + rhash, rhead, offset, pass))) + goto bread_err2; + blk_no += bblks + hblks; + } + } else { + /* + * Perform recovery around the end of the physical log. + * When the head is not on the same cycle number as the tail, + * we can't do a sequential recovery as above. + */ + blk_no = tail_blk; + while (blk_no < log->l_logBBsize) { + /* + * Check for header wrapping around physical end-of-log + */ + offset = XFS_BUF_PTR(hbp); + split_hblks = 0; + wrapped_hblks = 0; + if (blk_no + hblks <= log->l_logBBsize) { + /* Read header in one read */ + error = xlog_bread(log, blk_no, hblks, hbp, + &offset); + if (error) + goto bread_err2; + } else { + /* This LR is split across physical log end */ + if (blk_no != log->l_logBBsize) { + /* some data before physical log end */ + ASSERT(blk_no <= INT_MAX); + split_hblks = log->l_logBBsize - (int)blk_no; + ASSERT(split_hblks > 0); + error = xlog_bread(log, blk_no, + split_hblks, hbp, + &offset); + if (error) + goto bread_err2; + } + + /* + * Note: this black magic still works with + * large sector sizes (non-512) only because: + * - we increased the buffer size originally + * by 1 sector giving us enough extra space + * for the second read; + * - the log start is guaranteed to be sector + * aligned; + * - we read the log end (LR header start) + * _first_, then the log start (LR header end) + * - order is important. + */ + wrapped_hblks = hblks - split_hblks; + error = xlog_bread_offset(log, 0, + wrapped_hblks, hbp, + offset + BBTOB(split_hblks)); + if (error) + goto bread_err2; + } + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, + split_hblks ? 
blk_no : 0); + if (error) + goto bread_err2; + + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + blk_no += hblks; + + /* Read in data for log record */ + if (blk_no + bblks <= log->l_logBBsize) { + error = xlog_bread(log, blk_no, bblks, dbp, + &offset); + if (error) + goto bread_err2; + } else { + /* This log record is split across the + * physical end of log */ + offset = XFS_BUF_PTR(dbp); + split_bblks = 0; + if (blk_no != log->l_logBBsize) { + /* some data is before the physical + * end of log */ + ASSERT(!wrapped_hblks); + ASSERT(blk_no <= INT_MAX); + split_bblks = + log->l_logBBsize - (int)blk_no; + ASSERT(split_bblks > 0); + error = xlog_bread(log, blk_no, + split_bblks, dbp, + &offset); + if (error) + goto bread_err2; + } + + /* + * Note: this black magic still works with + * large sector sizes (non-512) only because: + * - we increased the buffer size originally + * by 1 sector giving us enough extra space + * for the second read; + * - the log start is guaranteed to be sector + * aligned; + * - we read the log end (LR header start) + * _first_, then the log start (LR header end) + * - order is important. + */ + error = xlog_bread_offset(log, 0, + bblks - split_bblks, hbp, + offset + BBTOB(split_bblks)); + if (error) + goto bread_err2; + } + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, rhash, + rhead, offset, pass))) + goto bread_err2; + blk_no += bblks; + } + + ASSERT(blk_no >= log->l_logBBsize); + blk_no -= log->l_logBBsize; + + /* read first part of physical log */ + while (blk_no < head_blk) { + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) + goto bread_err2; + + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, blk_no); + if (error) + goto bread_err2; + + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + error = xlog_bread(log, blk_no+hblks, bblks, dbp, + &offset); + if (error) + goto bread_err2; + + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, rhash, + rhead, offset, pass))) + goto bread_err2; + blk_no += bblks + hblks; + } + } + + bread_err2: + xlog_put_bp(dbp); + bread_err1: + xlog_put_bp(hbp); + return error; +} + +/* + * Do the recovery of the log. We actually do this in two phases. + * The two passes are necessary in order to implement the function + * of cancelling a record written into the log. The first pass + * determines those things which have been cancelled, and the + * second pass replays log items normally except for those which + * have been cancelled. The handling of the replay and cancellations + * takes place in the log item type specific routines. + * + * The table of items which have cancel records in the log is allocated + * and freed at this level, since only here do we know when all of + * the log recovery has been completed. + */ +STATIC int +xlog_do_log_recovery( + xlog_t *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + int error, i; + + ASSERT(head_blk != tail_blk); + + /* + * First do a pass to find all of the cancelled buf log items. + * Store them in the buf_cancel_table for use in the second pass. 
+ */ + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), + KM_SLEEP); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + error = xlog_do_recovery_pass(log, head_blk, tail_blk, + XLOG_RECOVER_PASS1); + if (error != 0) { + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; + return error; + } + /* + * Then do a second pass to actually recover the items in the log. + * When it is complete free the table of buf cancel items. + */ + error = xlog_do_recovery_pass(log, head_blk, tail_blk, + XLOG_RECOVER_PASS2); +#ifdef DEBUG + if (!error) { + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); + } +#endif /* DEBUG */ + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; + + return error; +} + +/* + * Do the actual recovery + */ +STATIC int +xlog_do_recover( + xlog_t *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + int error; + xfs_buf_t *bp; + xfs_sb_t *sbp; + + /* + * First replay the images in the log. + */ + error = xlog_do_log_recovery(log, head_blk, tail_blk); + if (error) { + return error; + } + + XFS_bflush(log->l_mp->m_ddev_targp); + + /* + * If IO errors happened during recovery, bail out. + */ + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + return (EIO); + } + + /* + * We now update the tail_lsn since much of the recovery has completed + * and there may be space available to use. If there were no extent + * or iunlinks, we can free up the entire log and set the tail_lsn to + * be the last_sync_lsn. This was set in xlog_find_tail to be the + * lsn of the last known good LR on disk. If there are extent frees + * or iunlinks they will have some entries in the AIL; so we look at + * the AIL to determine how to set the tail_lsn. + */ + xlog_assign_tail_lsn(log->l_mp); + + /* + * Now that we've finished replaying all buffer and inode + * updates, re-read in the superblock. + */ + bp = xfs_getsb(log->l_mp, 0); + XFS_BUF_UNDONE(bp); + ASSERT(!(XFS_BUF_ISWRITE(bp))); + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + XFS_BUF_READ(bp); + XFS_BUF_UNASYNC(bp); + xfsbdstrat(log->l_mp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_ioerror_alert("xlog_do_recover", + log->l_mp, bp, XFS_BUF_ADDR(bp)); + ASSERT(0); + xfs_buf_relse(bp); + return error; + } + + /* Convert superblock from on-disk format */ + sbp = &log->l_mp->m_sb; + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); + ASSERT(xfs_sb_good_version(sbp)); + xfs_buf_relse(bp); + + /* We've re-read the superblock so re-initialize per-cpu counters */ + xfs_icsb_reinit_counters(log->l_mp); + + xlog_recover_check_summary(log); + + /* Normal transactions can now occur */ + log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + return 0; +} + +/* + * Perform recovery and re-initialize some log variables in xlog_find_tail. + * + * Return error or zero. + */ +int +xlog_recover( + xlog_t *log) +{ + xfs_daddr_t head_blk, tail_blk; + int error; + + /* find the tail of the log */ + if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) + return error; + + if (tail_blk != head_blk) { + /* There used to be a comment here: + * + * disallow recovery on read-only mounts. note -- mount + * checks for ENOSPC and turns it into an intelligent + * error message. + * ...but this is no longer true. Now, unless you specify + * NORECOVERY (in which case this function would never be + * called), we just go ahead and recover. 
We do this all + * under the vfs layer, so we can get away with it unless + * the device itself is read-only, in which case we fail. + */ + if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { + return error; + } + + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", + log->l_mp->m_logname ? log->l_mp->m_logname + : "internal"); + + error = xlog_do_recover(log, head_blk, tail_blk); + log->l_flags |= XLOG_RECOVERY_NEEDED; + } + return error; +} + +/* + * In the first part of recovery we replay inodes and buffers and build + * up the list of extent free items which need to be processed. Here + * we process the extent free items and clean up the on disk unlinked + * inode lists. This is separated from the first part of recovery so + * that the root and real-time bitmap inodes can be read in from disk in + * between the two stages. This is necessary so that we can free space + * in the real-time portion of the file system. + */ +int +xlog_recover_finish( + xlog_t *log) +{ + /* + * Now we're ready to do the transactions needed for the + * rest of recovery. Start with completing all the extent + * free intent records and then process the unlinked inode + * lists. At this point, we essentially run in normal mode + * except that we're still performing recovery actions + * rather than accepting new requests. + */ + if (log->l_flags & XLOG_RECOVERY_NEEDED) { + int error; + error = xlog_recover_process_efis(log); + if (error) { + xfs_alert(log->l_mp, "Failed to recover EFIs"); + return error; + } + /* + * Sync the log to get all the EFIs out of the AIL. + * This isn't absolutely necessary, but it helps in + * case the unlink transactions would have problems + * pushing the EFIs out of the way. + */ + xfs_log_force(log->l_mp, XFS_LOG_SYNC); + + xlog_recover_process_iunlinks(log); + + xlog_recover_check_summary(log); + + xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", + log->l_mp->m_logname ? log->l_mp->m_logname + : "internal"); + log->l_flags &= ~XLOG_RECOVERY_NEEDED; + } else { + xfs_info(log->l_mp, "Ending clean mount"); + } + return 0; +} + + +#if defined(DEBUG) +/* + * Read all of the agf and agi counters and check that they + * are consistent with the superblock counters. + */ +void +xlog_recover_check_summary( + xlog_t *log) +{ + xfs_mount_t *mp; + xfs_agf_t *agfp; + xfs_buf_t *agfbp; + xfs_buf_t *agibp; + xfs_agnumber_t agno; + __uint64_t freeblks; + __uint64_t itotal; + __uint64_t ifree; + int error; + + mp = log->l_mp; + + freeblks = 0LL; + itotal = 0LL; + ifree = 0LL; + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); + if (error) { + xfs_alert(mp, "%s agf read failed agno %d error %d", + __func__, agno, error); + } else { + agfp = XFS_BUF_TO_AGF(agfbp); + freeblks += be32_to_cpu(agfp->agf_freeblks) + + be32_to_cpu(agfp->agf_flcount); + xfs_buf_relse(agfbp); + } + + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + xfs_alert(mp, "%s agi read failed agno %d error %d", + __func__, agno, error); + } else { + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + + itotal += be32_to_cpu(agi->agi_count); + ifree += be32_to_cpu(agi->agi_freecount); + xfs_buf_relse(agibp); + } + } +} +#endif /* DEBUG */ diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h new file mode 100644 index 00000000..1c55ccbb --- /dev/null +++ b/fs/xfs/xfs_log_recover.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_RECOVER_H__ +#define __XFS_LOG_RECOVER_H__ + +/* + * Macros, structures, prototypes for internal log manager use. + */ + +#define XLOG_RHASH_BITS 4 +#define XLOG_RHASH_SIZE 16 +#define XLOG_RHASH_SHIFT 2 +#define XLOG_RHASH(tid) \ + ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) + +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) + + +/* + * item headers are in ri_buf[0]. Additional buffers follow. + */ +typedef struct xlog_recover_item { + struct list_head ri_list; + int ri_type; + int ri_cnt; /* count of regions found */ + int ri_total; /* total regions */ + xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ +} xlog_recover_item_t; + +struct xlog_tid; +typedef struct xlog_recover { + struct hlist_node r_list; + xlog_tid_t r_log_tid; /* log's transaction id */ + xfs_trans_header_t r_theader; /* trans header for partial */ + int r_state; /* not needed */ + xfs_lsn_t r_lsn; /* xact lsn */ + struct list_head r_itemq; /* q for items */ +} xlog_recover_t; + +#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) + +/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_RECOVER_PASS1 1 +#define XLOG_RECOVER_PASS2 2 + +#endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c new file mode 100644 index 00000000..9afdd497 --- /dev/null +++ b/fs/xfs/xfs_mount.c @@ -0,0 +1,2585 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_quota.h" +#include "xfs_fsops.h" +#include "xfs_utils.h" +#include "xfs_trace.h" + + +#ifdef HAVE_PERCPU_SB +STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, + int); +STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, + int); +STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); +#else + +#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) +#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) +#endif + +static const struct { + short offset; + short type; /* 0 = integer + * 1 = binary / string (no translation) + */ +} xfs_sb_info[] = { + { offsetof(xfs_sb_t, sb_magicnum), 0 }, + { offsetof(xfs_sb_t, sb_blocksize), 0 }, + { offsetof(xfs_sb_t, sb_dblocks), 0 }, + { offsetof(xfs_sb_t, sb_rblocks), 0 }, + { offsetof(xfs_sb_t, sb_rextents), 0 }, + { offsetof(xfs_sb_t, sb_uuid), 1 }, + { offsetof(xfs_sb_t, sb_logstart), 0 }, + { offsetof(xfs_sb_t, sb_rootino), 0 }, + { offsetof(xfs_sb_t, sb_rbmino), 0 }, + { offsetof(xfs_sb_t, sb_rsumino), 0 }, + { offsetof(xfs_sb_t, sb_rextsize), 0 }, + { offsetof(xfs_sb_t, sb_agblocks), 0 }, + { offsetof(xfs_sb_t, sb_agcount), 0 }, + { offsetof(xfs_sb_t, sb_rbmblocks), 0 }, + { offsetof(xfs_sb_t, sb_logblocks), 0 }, + { offsetof(xfs_sb_t, sb_versionnum), 0 }, + { offsetof(xfs_sb_t, sb_sectsize), 0 }, + { offsetof(xfs_sb_t, sb_inodesize), 0 }, + { offsetof(xfs_sb_t, sb_inopblock), 0 }, + { offsetof(xfs_sb_t, sb_fname[0]), 1 }, + { offsetof(xfs_sb_t, sb_blocklog), 0 }, + { offsetof(xfs_sb_t, sb_sectlog), 0 }, + { offsetof(xfs_sb_t, sb_inodelog), 0 }, + { offsetof(xfs_sb_t, sb_inopblog), 0 }, + { offsetof(xfs_sb_t, sb_agblklog), 0 }, + { offsetof(xfs_sb_t, sb_rextslog), 0 }, + { offsetof(xfs_sb_t, sb_inprogress), 0 }, + { offsetof(xfs_sb_t, sb_imax_pct), 0 }, + { offsetof(xfs_sb_t, sb_icount), 0 }, + { offsetof(xfs_sb_t, sb_ifree), 0 }, + { offsetof(xfs_sb_t, sb_fdblocks), 0 }, + { offsetof(xfs_sb_t, sb_frextents), 0 }, + { offsetof(xfs_sb_t, sb_uquotino), 0 }, + { offsetof(xfs_sb_t, sb_gquotino), 0 }, + { offsetof(xfs_sb_t, sb_qflags), 0 }, + { offsetof(xfs_sb_t, sb_flags), 0 }, + { offsetof(xfs_sb_t, sb_shared_vn), 0 }, + { offsetof(xfs_sb_t, sb_inoalignmt), 0 }, + { offsetof(xfs_sb_t, sb_unit), 0 }, + { offsetof(xfs_sb_t, sb_width), 0 }, + { offsetof(xfs_sb_t, sb_dirblklog), 0 }, + { offsetof(xfs_sb_t, sb_logsectlog), 0 }, + { offsetof(xfs_sb_t, sb_logsectsize),0 }, + { offsetof(xfs_sb_t, sb_logsunit), 0 }, + { offsetof(xfs_sb_t, sb_features2), 0 }, + { offsetof(xfs_sb_t, sb_bad_features2), 0 }, + { sizeof(xfs_sb_t), 0 } +}; + +static DEFINE_MUTEX(xfs_uuid_table_mutex); +static int xfs_uuid_table_size; +static uuid_t *xfs_uuid_table; + +/* + * See if the UUID is unique among mounted XFS filesystems. 
+ * Mount fails if UUID is nil or a FS with the same UUID is already mounted. + */ +STATIC int +xfs_uuid_mount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int hole, i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return 0; + + if (uuid_is_nil(uuid)) { + xfs_warn(mp, "Filesystem has nil UUID - can't mount"); + return XFS_ERROR(EINVAL); + } + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) { + hole = i; + continue; + } + if (uuid_equal(uuid, &xfs_uuid_table[i])) + goto out_duplicate; + } + + if (hole < 0) { + xfs_uuid_table = kmem_realloc(xfs_uuid_table, + (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), + xfs_uuid_table_size * sizeof(*xfs_uuid_table), + KM_SLEEP); + hole = xfs_uuid_table_size++; + } + xfs_uuid_table[hole] = *uuid; + mutex_unlock(&xfs_uuid_table_mutex); + + return 0; + + out_duplicate: + mutex_unlock(&xfs_uuid_table_mutex); + xfs_warn(mp, "Filesystem has duplicate UUID - can't mount"); + return XFS_ERROR(EINVAL); +} + +STATIC void +xfs_uuid_unmount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return; + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) + continue; + if (!uuid_equal(uuid, &xfs_uuid_table[i])) + continue; + memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); + break; + } + ASSERT(i < xfs_uuid_table_size); + mutex_unlock(&xfs_uuid_table_mutex); +} + + +/* + * Reference counting access wrappers to the perag structures. + * Because we never free per-ag structures, the only thing we + * have to protect against changes is the tree structure itself. + */ +struct xfs_perag * +xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put(struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +STATIC void +__xfs_free_perag( + struct rcu_head *head) +{ + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + + ASSERT(atomic_read(&pag->pag_ref) == 0); + kmem_free(pag); +} + +/* + * Free up the per-ag resources associated with the mount structure. 
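A typical caller of the reference-counted per-AG lookup above follows the get/put shape sketched here. It is modelled on loops that appear later in this file (for example xfs_initialize_perag_data); mp is assumed to be the struct xfs_mount in question and the loop body is left as a placeholder.

	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		pag = xfs_perag_get(mp, agno);		/* takes a reference */
		if (!pag)
			continue;			/* AG not initialised yet */
		/* ... read pag->pagf_freeblks, pag->pagi_count, ... */
		xfs_perag_put(pag);			/* drops the reference */
	}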
+ */ +STATIC void +xfs_free_perag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); + call_rcu(&pag->rcu_head, __xfs_free_perag); + } +} + +/* + * Check size of device based on the (data/realtime) block count. + * Note: this check is used by the growfs code as well as mount. + */ +int +xfs_sb_validate_fsb_count( + xfs_sb_t *sbp, + __uint64_t nblocks) +{ + ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); + ASSERT(sbp->sb_blocklog >= BBSHIFT); + +#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ + if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) + return EFBIG; +#else /* Limited by UINT_MAX of sectors */ + if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) + return EFBIG; +#endif + return 0; +} + +/* + * Check the validity of the SB found. + */ +STATIC int +xfs_mount_validate_sb( + xfs_mount_t *mp, + xfs_sb_t *sbp, + int flags) +{ + int loud = !(flags & XFS_MFSI_QUIET); + + /* + * If the log device and data device have the + * same device number, the log is internal. + * Consequently, the sb_logstart should be non-zero. If + * we have a zero sb_logstart in this case, we may be trying to mount + * a volume filesystem in a non-volume manner. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (loud) + xfs_warn(mp, "bad magic number"); + return XFS_ERROR(EWRONGFS); + } + + if (!xfs_sb_good_version(sbp)) { + if (loud) + xfs_warn(mp, "bad version"); + return XFS_ERROR(EWRONGFS); + } + + if (unlikely( + sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { + if (loud) + xfs_warn(mp, + "filesystem is marked as having an external log; " + "specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + if (unlikely( + sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { + if (loud) + xfs_warn(mp, + "filesystem is marked as having an internal log; " + "do not specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + /* + * More sanity checking. These were stolen directly from + * xfs_repair. 
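The size/log pairs that the sanity checks below insist on are plain power-of-two identities. A worked example with common mkfs defaults (the numbers are assumed, not taken from this patch):

	sb_blocksize = 4096  ->  sb_blocklog = 12   (4096 == 1 << 12)
	sb_inodesize = 256   ->  sb_inodelog = 8    (256  == 1 << 8)
	sb_sectsize  = 512   ->  sb_sectlog  = 9    (512  == 1 << 9)
	sb_inopblock = 4096 / 256 = 16, and sb_inopblog = sb_blocklog - sb_inodelog = 4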
+ */ + if (unlikely( + sbp->sb_agcount <= 0 || + sbp->sb_sectsize < XFS_MIN_SECTORSIZE || + sbp->sb_sectsize > XFS_MAX_SECTORSIZE || + sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || + sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || + sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || + sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || + sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || + sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || + sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || + sbp->sb_inodelog < XFS_DINODE_MIN_LOG || + sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || + (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || + (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { + if (loud) + xfs_warn(mp, "SB sanity check 1 failed"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Sanity check AG count, size fields against data size field + */ + if (unlikely( + sbp->sb_dblocks == 0 || + sbp->sb_dblocks > + (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || + sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * + sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { + if (loud) + xfs_warn(mp, "SB sanity check 2 failed"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + if (loud) { + xfs_warn(mp, + "File system with blocksize %d bytes. " + "Only pagesize (%ld) or less will currently work.", + sbp->sb_blocksize, PAGE_SIZE); + } + return XFS_ERROR(ENOSYS); + } + + /* + * Currently only very few inode sizes are supported. + */ + switch (sbp->sb_inodesize) { + case 256: + case 512: + case 1024: + case 2048: + break; + default: + if (loud) + xfs_warn(mp, "inode size of %d bytes not supported", + sbp->sb_inodesize); + return XFS_ERROR(ENOSYS); + } + + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || + xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { + if (loud) + xfs_warn(mp, + "file system too large to be mounted on this system."); + return XFS_ERROR(EFBIG); + } + + if (unlikely(sbp->sb_inprogress)) { + if (loud) + xfs_warn(mp, "file system busy"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Version 1 directory format has never worked on Linux. + */ + if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { + if (loud) + xfs_warn(mp, + "file system using version 1 directory format"); + return XFS_ERROR(ENOSYS); + } + + return 0; +} + +int +xfs_initialize_perag( + xfs_mount_t *mp, + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) +{ + xfs_agnumber_t index, max_metadata; + xfs_agnumber_t first_initialised = 0; + xfs_perag_t *pag; + xfs_agino_t agino; + xfs_ino_t ino; + xfs_sb_t *sbp = &mp->m_sb; + int error = -ENOMEM; + + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. 
+ */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + if (!first_initialised) + first_initialised = index; + + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) + goto out_unwind; + pag->pag_agno = index; + pag->pag_mount = mp; + spin_lock_init(&pag->pag_ici_lock); + mutex_init(&pag->pag_ici_reclaim_lock); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + spin_lock_init(&pag->pag_buf_lock); + pag->pag_buf_tree = RB_ROOT; + + if (radix_tree_preload(GFP_NOFS)) + goto out_unwind; + + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + BUG(); + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + error = -EEXIST; + goto out_unwind; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + } + + /* + * If we mount with the inode64 option, or no inode overflows + * the legacy 32-bit address space clear the inode32 option. + */ + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); + + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) + mp->m_flags |= XFS_MOUNT_32BITINODES; + else + mp->m_flags &= ~XFS_MOUNT_32BITINODES; + + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + /* + * Calculate how much should be reserved for inodes to meet + * the max inode percentage. + */ + if (mp->m_maxicount) { + __uint64_t icount; + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + icount += sbp->sb_agblocks - 1; + do_div(icount, sbp->sb_agblocks); + max_metadata = icount; + } else { + max_metadata = agcount; + } + + for (index = 0; index < agcount; index++) { + ino = XFS_AGINO_TO_INO(mp, index, agino); + if (ino > XFS_MAXINUMBER_32) { + index++; + break; + } + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + if (index < max_metadata) + pag->pagf_metadata = 1; + xfs_perag_put(pag); + } + } else { + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + xfs_perag_put(pag); + } + } + + if (maxagi) + *maxagi = index; + return 0; + +out_unwind: + kmem_free(pag); + for (; index > first_initialised; index--) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + kmem_free(pag); + } + return error; +} + +void +xfs_sb_from_disk( + xfs_sb_t *to, + xfs_dsb_t *from) +{ + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); + to->sb_blocksize = be32_to_cpu(from->sb_blocksize); + to->sb_dblocks = be64_to_cpu(from->sb_dblocks); + to->sb_rblocks = be64_to_cpu(from->sb_rblocks); + to->sb_rextents = be64_to_cpu(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = be64_to_cpu(from->sb_logstart); + to->sb_rootino = be64_to_cpu(from->sb_rootino); + to->sb_rbmino = be64_to_cpu(from->sb_rbmino); + to->sb_rsumino = be64_to_cpu(from->sb_rsumino); + to->sb_rextsize = be32_to_cpu(from->sb_rextsize); + to->sb_agblocks = be32_to_cpu(from->sb_agblocks); + to->sb_agcount = be32_to_cpu(from->sb_agcount); + to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); + to->sb_logblocks = be32_to_cpu(from->sb_logblocks); + to->sb_versionnum = be16_to_cpu(from->sb_versionnum); + to->sb_sectsize = be16_to_cpu(from->sb_sectsize); + to->sb_inodesize = be16_to_cpu(from->sb_inodesize); + to->sb_inopblock = be16_to_cpu(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = 
from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = be64_to_cpu(from->sb_icount); + to->sb_ifree = be64_to_cpu(from->sb_ifree); + to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); + to->sb_frextents = be64_to_cpu(from->sb_frextents); + to->sb_uquotino = be64_to_cpu(from->sb_uquotino); + to->sb_gquotino = be64_to_cpu(from->sb_gquotino); + to->sb_qflags = be16_to_cpu(from->sb_qflags); + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); + to->sb_unit = be32_to_cpu(from->sb_unit); + to->sb_width = be32_to_cpu(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); + to->sb_logsunit = be32_to_cpu(from->sb_logsunit); + to->sb_features2 = be32_to_cpu(from->sb_features2); + to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); +} + +/* + * Copy in core superblock to ondisk one. + * + * The fields argument is mask of superblock fields to copy. + */ +void +xfs_sb_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t fields) +{ + xfs_caddr_t to_ptr = (xfs_caddr_t)to; + xfs_caddr_t from_ptr = (xfs_caddr_t)from; + xfs_sb_field_t f; + int first; + int size; + + ASSERT(fields); + if (!fields) + return; + + while (fields) { + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + first = xfs_sb_info[f].offset; + size = xfs_sb_info[f + 1].offset - first; + + ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); + + if (size == 1 || xfs_sb_info[f].type == 1) { + memcpy(to_ptr + first, from_ptr + first, size); + } else { + switch (size) { + case 2: + *(__be16 *)(to_ptr + first) = + cpu_to_be16(*(__u16 *)(from_ptr + first)); + break; + case 4: + *(__be32 *)(to_ptr + first) = + cpu_to_be32(*(__u32 *)(from_ptr + first)); + break; + case 8: + *(__be64 *)(to_ptr + first) = + cpu_to_be64(*(__u64 *)(from_ptr + first)); + break; + default: + ASSERT(0); + } + } + + fields &= ~(1LL << f); + } +} + +/* + * xfs_readsb + * + * Does the initial read of the superblock. + */ +int +xfs_readsb(xfs_mount_t *mp, int flags) +{ + unsigned int sector_size; + xfs_buf_t *bp; + int error; + int loud = !(flags & XFS_MFSI_QUIET); + + ASSERT(mp->m_sb_bp == NULL); + ASSERT(mp->m_ddev_targp != NULL); + + /* + * Allocate a (locked) buffer to hold the superblock. + * This will be kept around at all times to optimize + * access to the superblock. + */ + sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + +reread: + bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + XFS_SB_DADDR, sector_size, 0); + if (!bp) { + if (loud) + xfs_warn(mp, "SB buffer read failed"); + return EIO; + } + + /* + * Initialize the mount structure from the superblock. + * But first do some basic consistency checking. + */ + xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); + error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); + if (error) { + if (loud) + xfs_warn(mp, "SB validate failed"); + goto release_buf; + } + + /* + * We must be able to do sector-sized and sector-aligned IO. 
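The field-mask driven conversion in xfs_sb_to_disk() above takes each field's size from the offset of the next xfs_sb_info[] entry, which is why that table ends with a { sizeof(xfs_sb_t), 0 } sentinel. A standalone sketch of the technique, with an invented structure and offset table:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct sb { uint64_t blocks; uint32_t magic; uint16_t version; uint16_t flags; };

static const size_t field_off[] = {
	offsetof(struct sb, blocks),	/* field 0 */
	offsetof(struct sb, magic),	/* field 1 */
	offsetof(struct sb, version),	/* field 2 */
	offsetof(struct sb, flags),	/* field 3 */
	sizeof(struct sb),		/* sentinel: bounds the last field */
};

/* copy one field between two images; its size is the next offset minus this one */
static void copy_field(struct sb *to, const struct sb *from, int f)
{
	size_t first = field_off[f];
	size_t size  = field_off[f + 1] - first;

	memcpy((char *)to + first, (const char *)from + first, size);
}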
+ */ + if (sector_size > mp->m_sb.sb_sectsize) { + if (loud) + xfs_warn(mp, "device supports %u byte sectors (not %u)", + sector_size, mp->m_sb.sb_sectsize); + error = ENOSYS; + goto release_buf; + } + + /* + * If device sector size is smaller than the superblock size, + * re-read the superblock so the buffer is correctly sized. + */ + if (sector_size < mp->m_sb.sb_sectsize) { + xfs_buf_relse(bp); + sector_size = mp->m_sb.sb_sectsize; + goto reread; + } + + /* Initialize per-cpu counters */ + xfs_icsb_reinit_counters(mp); + + mp->m_sb_bp = bp; + xfs_buf_unlock(bp); + return 0; + +release_buf: + xfs_buf_relse(bp); + return error; +} + + +/* + * xfs_mount_common + * + * Mount initialization code establishing various mount + * fields from the superblock associated with the given + * mount structure + */ +STATIC void +xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) +{ + mp->m_agfrotor = mp->m_agirotor = 0; + spin_lock_init(&mp->m_agirotor_lock); + mp->m_maxagi = mp->m_sb.sb_agcount; + mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; + mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; + mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; + mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; + mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; + mp->m_blockmask = sbp->sb_blocksize - 1; + mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; + mp->m_blockwmask = mp->m_blockwsize - 1; + + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; + mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; + + mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; + mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; + + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; + mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); + mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + sbp->sb_inopblock); + mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +STATIC int +xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + int error; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the information we need and populates the + * per-ag structures for us. 
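The shift fields set up in xfs_mount_common() above are just log2 differences. A worked example, assuming 4096-byte filesystem blocks and 512-byte basic blocks (BBSHIFT of 9) and taking XFS_NBBYLOG as the log2 of bits per byte (3):

	m_blkbb_log  = sb_blocklog - BBSHIFT     = 12 - 9 = 3, so one fs block is 8 basic blocks
	m_bsize      = XFS_FSB_TO_BB(mp, 1)      = 1 << 3 = 8
	m_blkbit_log = sb_blocklog + XFS_NBBYLOG = 12 + 3 = 15, i.e. 32768 bits per block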
+ */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = xfs_perag_get(mp, index); + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + /* + * Overwrite incore superblock counters with just-read data + */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = bfree + bfreelst + btree; + spin_unlock(&mp->m_sb_lock); + + /* Fixup the per-cpu counters as well. */ + xfs_icsb_reinit_counters(mp); + + return 0; +} + +/* + * Update alignment values based on mount options and sb values + */ +STATIC int +xfs_update_alignment(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + + if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + if (mp->m_flags & XFS_MOUNT_RETERR) { + xfs_warn(mp, "alignment check 1 failed"); + return XFS_ERROR(EINVAL); + } + mp->m_dalign = mp->m_swidth = 0; + } else { + /* + * Convert the stripe unit and width to FSBs. + */ + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { + if (mp->m_flags & XFS_MOUNT_RETERR) { + return XFS_ERROR(EINVAL); + } + xfs_warn(mp, + "stripe alignment turned off: sunit(%d)/swidth(%d) " + "incompatible with agsize(%d)", + mp->m_dalign, mp->m_swidth, + sbp->sb_agblocks); + + mp->m_dalign = 0; + mp->m_swidth = 0; + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); + } else { + if (mp->m_flags & XFS_MOUNT_RETERR) { + xfs_warn(mp, + "stripe alignment turned off: sunit(%d) less than bsize(%d)", + mp->m_dalign, + mp->m_blockmask +1); + return XFS_ERROR(EINVAL); + } + mp->m_swidth = 0; + } + } + + /* + * Update superblock with new values + * and log changes + */ + if (xfs_sb_version_hasdalign(sbp)) { + if (sbp->sb_unit != mp->m_dalign) { + sbp->sb_unit = mp->m_dalign; + mp->m_update_flags |= XFS_SB_UNIT; + } + if (sbp->sb_width != mp->m_swidth) { + sbp->sb_width = mp->m_swidth; + mp->m_update_flags |= XFS_SB_WIDTH; + } + } + } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && + xfs_sb_version_hasdalign(&mp->m_sb)) { + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; + } + + return 0; +} + +/* + * Set the maximum inode count for this filesystem + */ +STATIC void +xfs_set_maxicount(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + __uint64_t icount; + + if (sbp->sb_imax_pct) { + /* + * Make sure the maximum inode count is a multiple + * of the units we allocate inodes in. + */ + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + do_div(icount, mp->m_ialloc_blks); + mp->m_maxicount = (icount * mp->m_ialloc_blks) << + sbp->sb_inopblog; + } else { + mp->m_maxicount = 0; + } +} + +/* + * Set the default minimum read and write sizes unless + * already specified in a mount option. + * We use smaller I/O sizes when the file system + * is being used for NFS service (wsync mount option). 
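The default sizes described above come out of a max() of two log2 values in the code that follows. A worked example, assuming a 4k-block filesystem (sb_blocklog of 12) and the common value of 16 for XFS_READIO_LOG_LARGE:

	m_readio_log    = max(sb_blocklog, XFS_READIO_LOG_LARGE) = max(12, 16) = 16
	m_readio_blocks = 1 << (m_readio_log - sb_blocklog)      = 1 << 4      = 16 blocks, i.e. 64k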
+ */ +STATIC void +xfs_set_rw_sizes(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + int readio_log, writeio_log; + + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + if (mp->m_flags & XFS_MOUNT_WSYNC) { + readio_log = XFS_WSYNC_READIO_LOG; + writeio_log = XFS_WSYNC_WRITEIO_LOG; + } else { + readio_log = XFS_READIO_LOG_LARGE; + writeio_log = XFS_WRITEIO_LOG_LARGE; + } + } else { + readio_log = mp->m_readio_log; + writeio_log = mp->m_writeio_log; + } + + if (sbp->sb_blocklog > readio_log) { + mp->m_readio_log = sbp->sb_blocklog; + } else { + mp->m_readio_log = readio_log; + } + mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog); + if (sbp->sb_blocklog > writeio_log) { + mp->m_writeio_log = sbp->sb_blocklog; + } else { + mp->m_writeio_log = writeio_log; + } + mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); +} + +/* + * precalculate the low space thresholds for dynamic speculative preallocation. + */ +void +xfs_set_low_space_thresholds( + struct xfs_mount *mp) +{ + int i; + + for (i = 0; i < XFS_LOWSP_MAX; i++) { + __uint64_t space = mp->m_sb.sb_dblocks; + + do_div(space, 100); + mp->m_low_space[i] = space * (i + 1); + } +} + + +/* + * Set whether we're using inode alignment. + */ +STATIC void +xfs_set_inoalignment(xfs_mount_t *mp) +{ + if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; + else + mp->m_inoalign_mask = 0; + /* + * If we are using stripe alignment, check whether + * the stripe unit is a multiple of the inode alignment + */ + if (mp->m_dalign && mp->m_inoalign_mask && + !(mp->m_dalign & mp->m_inoalign_mask)) + mp->m_sinoalign = mp->m_dalign; + else + mp->m_sinoalign = 0; +} + +/* + * Check that the data (and log if separate) are an ok size. + */ +STATIC int +xfs_check_sizes(xfs_mount_t *mp) +{ + xfs_buf_t *bp; + xfs_daddr_t d; + + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { + xfs_warn(mp, "filesystem size mismatch detected"); + return XFS_ERROR(EFBIG); + } + bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + d - XFS_FSS_TO_BB(mp, 1), + BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + if (!bp) { + xfs_warn(mp, "last sector read failed"); + return EIO; + } + xfs_buf_relse(bp); + + if (mp->m_logdev_targp != mp->m_ddev_targp) { + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { + xfs_warn(mp, "log size mismatch detected"); + return XFS_ERROR(EFBIG); + } + bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_B(mp, 1), 0); + if (!bp) { + xfs_warn(mp, "log device read failed"); + return EIO; + } + xfs_buf_relse(bp); + } + return 0; +} + +/* + * Clear the quotaflags in memory and in the superblock. + */ +int +xfs_mount_reset_sbqflags( + struct xfs_mount *mp) +{ + int error; + struct xfs_trans *tp; + + mp->m_qflags = 0; + + /* + * It is OK to look at sb_qflags here in mount path, + * without m_sb_lock. 
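The low-space threshold loop in xfs_set_low_space_thresholds() above just precomputes one value per percent of the data blocks. For an assumed sb_dblocks of 1,000,000, space works out to 10,000 blocks per percent, so m_low_space[] holds 10,000, 20,000, 30,000 and so on, one entry per level up to XFS_LOWSP_MAX.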
+ */ + if (mp->m_sb.sb_qflags == 0) + return 0; + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = 0; + spin_unlock(&mp->m_sb_lock); + + /* + * If the fs is readonly, let the incore superblock run + * with quotas off but don't flush the update out to disk + */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + +#ifdef QUOTADEBUG + xfs_notice(mp, "Writing superblock quota changes"); +#endif + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_alert(mp, "%s: Superblock update failed!", __func__); + return error; + } + + xfs_mod_sb(tp, XFS_SB_QFLAGS); + return xfs_trans_commit(tp, 0); +} + +__uint64_t +xfs_default_resblks(xfs_mount_t *mp) +{ + __uint64_t resblks; + + /* + * We default to 5% or 8192 fsbs of space reserved, whichever is + * smaller. This is intended to cover concurrent allocation + * transactions when we initially hit enospc. These each require a 4 + * block reservation. Hence by default we cover roughly 2000 concurrent + * allocation reservations. + */ + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 8192); + return resblks; +} + +/* + * This function does the following on an initial mount of a file system: + * - reads the superblock from disk and init the mount struct + * - if we're a 32-bit kernel, do a size check on the superblock + * so we don't mount terabyte filesystems + * - init mount struct realtime fields + * - allocate inode hash table for fs + * - init directory manager + * - perform recovery and init the log manager + */ +int +xfs_mountfs( + xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + xfs_inode_t *rip; + __uint64_t resblks; + uint quotamount = 0; + uint quotaflags = 0; + int error = 0; + + xfs_mount_common(mp, sbp); + + /* + * Check for a mismatched features2 values. Older kernels + * read & wrote into the wrong sb offset for sb_features2 + * on some platforms due to xfs_sb_t not being 64bit size aligned + * when sb_features2 was added, which made older superblock + * reading/writing routines swap it as a 64-bit value. + * + * For backwards compatibility, we make both slots equal. + * + * If we detect a mismatched field, we OR the set bits into the + * existing features2 field in case it has already been modified; we + * don't want to lose any features. We then update the bad location + * with the ORed value so that older kernels will see any features2 + * flags, and mark the two fields as needing updates once the + * transaction subsystem is online. + */ + if (xfs_sb_has_mismatched_features2(sbp)) { + xfs_warn(mp, "correcting sb_features alignment problem"); + sbp->sb_features2 |= sbp->sb_bad_features2; + sbp->sb_bad_features2 = sbp->sb_features2; + mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; + + /* + * Re-check for ATTR2 in case it was found in bad_features2 + * slot. 
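The reserve pool default computed in xfs_default_resblks() above is min(sb_dblocks / 20, 8192), i.e. 5% of the data blocks capped at 8192 blocks. Two assumed examples: with sb_dblocks = 100,000 the pool is 5,000 blocks; with sb_dblocks = 1,000,000 the 50,000-block 5% figure is capped to 8192.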
+ */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + } + + if (xfs_sb_version_hasattr2(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_sb_version_removeattr2(&mp->m_sb); + mp->m_update_flags |= XFS_SB_FEATURES2; + + /* update sb_versionnum for the clearing of the morebits */ + if (!sbp->sb_features2) + mp->m_update_flags |= XFS_SB_VERSIONNUM; + } + + /* + * Check if sb_agblocks is aligned at stripe boundary + * If sb_agblocks is NOT aligned turn off m_dalign since + * allocator alignment is within an ag, therefore ag has + * to be aligned at stripe boundary. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + + xfs_alloc_compute_maxlevels(mp); + xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); + xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); + xfs_ialloc_compute_maxlevels(mp); + + xfs_set_maxicount(mp); + + mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); + + error = xfs_uuid_mount(mp); + if (error) + goto out; + + /* + * Set the minimum read and write sizes + */ + xfs_set_rw_sizes(mp); + + /* set the low space thresholds for dynamic preallocation */ + xfs_set_low_space_thresholds(mp); + + /* + * Set the inode cluster size. + * This may still be overridden by the file system + * block size if it is larger than the chosen cluster size. + */ + mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + + /* + * Set inode alignment fields + */ + xfs_set_inoalignment(mp); + + /* + * Check that the data (and log if separate) are an ok size. + */ + error = xfs_check_sizes(mp); + if (error) + goto out_remove_uuid; + + /* + * Initialize realtime fields in the mount structure + */ + error = xfs_rtmount_init(mp); + if (error) { + xfs_warn(mp, "RT mount failed"); + goto out_remove_uuid; + } + + /* + * Copies the low order bits of the timestamp and the randomly + * set "sequence" number out of a UUID. + */ + uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); + + mp->m_dmevmask = 0; /* not persistent; set after each mount */ + + xfs_dir_mount(mp); + + /* + * Initialize the attribute manager's entries. + */ + mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100; + + /* + * Initialize the precomputed transaction reservations values. + */ + xfs_trans_init(mp); + + /* + * Allocate and initialize the per-ag data. + */ + spin_lock_init(&mp->m_perag_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed per-ag init: %d", error); + goto out_remove_uuid; + } + + if (!sbp->sb_logblocks) { + xfs_warn(mp, "no log defined"); + XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto out_free_perag; + } + + /* + * log's mount-time initialization. Perform 1st part recovery if needed + */ + error = xfs_log_mount(mp, mp->m_logdev_targp, + XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), + XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); + if (error) { + xfs_warn(mp, "log mount failed"); + goto out_free_perag; + } + + /* + * Now the log is mounted, we know if it was an unclean shutdown or + * not. If it was, with the first phase of recovery has completed, we + * have consistent AG blocks on disk. We have not recovered EFIs yet, + * but they are recovered transactionally in the second recovery phase + * later. + * + * Hence we can safely re-initialise incore superblock counters from + * the per-ag data. 
These may not be correct if the filesystem was not + * cleanly unmounted, so we need to wait for recovery to finish before + * doing this. + * + * If the filesystem was cleanly unmounted, then we can trust the + * values in the superblock to be correct and we don't need to do + * anything here. + * + * If we are currently making the filesystem, the initialisation will + * fail as the perag data is in an undefined state. + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && + !mp->m_sb.sb_inprogress) { + error = xfs_initialize_perag_data(mp, sbp->sb_agcount); + if (error) + goto out_free_perag; + } + + /* + * Get and sanity-check the root inode. + * Save the pointer to it in the mount structure. + */ + error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); + if (error) { + xfs_warn(mp, "failed to read root inode"); + goto out_log_dealloc; + } + + ASSERT(rip != NULL); + + if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { + xfs_warn(mp, "corrupted root inode %llu: not a directory", + (unsigned long long)rip->i_ino); + xfs_iunlock(rip, XFS_ILOCK_EXCL); + XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, + mp); + error = XFS_ERROR(EFSCORRUPTED); + goto out_rele_rip; + } + mp->m_rootip = rip; /* save it */ + + xfs_iunlock(rip, XFS_ILOCK_EXCL); + + /* + * Initialize realtime inode pointers in the mount structure + */ + error = xfs_rtmount_inodes(mp); + if (error) { + /* + * Free up the root inode. + */ + xfs_warn(mp, "failed to read RT inodes"); + goto out_rele_rip; + } + + /* + * If this is a read-only mount defer the superblock updates until + * the next remount into writeable mode. Otherwise we would never + * perform the update e.g. for the root filesystem. + */ + if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + goto out_rtunmount; + } + } + + /* + * Initialise the XFS quota management subsystem for this mount + */ + if (XFS_IS_QUOTA_RUNNING(mp)) { + error = xfs_qm_newmount(mp, "amount, "aflags); + if (error) + goto out_rtunmount; + } else { + ASSERT(!XFS_IS_QUOTA_ON(mp)); + + /* + * If a file system had quotas running earlier, but decided to + * mount without -o uquota/pquota/gquota options, revoke the + * quotachecked license. + */ + if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { + xfs_notice(mp, "resetting quota flags"); + error = xfs_mount_reset_sbqflags(mp); + if (error) + return error; + } + } + + /* + * Finish recovering the file system. This part needed to be + * delayed until after the root and real-time bitmap inodes + * were consistently read in. + */ + error = xfs_log_mount_finish(mp); + if (error) { + xfs_warn(mp, "log mount finish failed"); + goto out_rtunmount; + } + + /* + * Complete the quota initialisation, post-log-replay component. + */ + if (quotamount) { + ASSERT(mp->m_qflags == 0); + mp->m_qflags = quotaflags; + + xfs_qm_mount_quotas(mp); + } + + /* + * Now we are mounted, reserve a small amount of unused space for + * privileged transactions. This is needed so that transaction + * space required for critical operations can dip into this pool + * when at ENOSPC. This is needed for operations like create with + * attr, unwritten extent conversion at ENOSPC, etc. Data allocations + * are not allowed to use this reserved space. + * + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. 
+ */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + resblks = xfs_default_resblks(mp); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + xfs_warn(mp, + "Unable to allocate reserve blocks. Continuing without reserve pool."); + } + + return 0; + + out_rtunmount: + xfs_rtunmount_inodes(mp); + out_rele_rip: + IRELE(rip); + out_log_dealloc: + xfs_log_unmount(mp); + out_free_perag: + xfs_free_perag(mp); + out_remove_uuid: + xfs_uuid_unmount(mp); + out: + return error; +} + +/* + * This flushes out the inodes,dquots and the superblock, unmounts the + * log and makes sure that incore structures are freed. + */ +void +xfs_unmountfs( + struct xfs_mount *mp) +{ + __uint64_t resblks; + int error; + + xfs_qm_unmount_quotas(mp); + xfs_rtunmount_inodes(mp); + IRELE(mp->m_rootip); + + /* + * We can potentially deadlock here if we have an inode cluster + * that has been freed has its buffer still pinned in memory because + * the transaction is still sitting in a iclog. The stale inodes + * on that buffer will have their flush locks held until the + * transaction hits the disk and the callbacks run. the inode + * flush takes the flush lock unconditionally and with nothing to + * push out the iclog we will never get that unlocked. hence we + * need to force the log first. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Do a delwri reclaim pass first so that as many dirty inodes are + * queued up for IO as possible. Then flush the buffers before making + * a synchronous path to catch all the remaining inodes are reclaimed. + * This makes the reclaim process as quick as possible by avoiding + * synchronous writeout and blocking on inodes already in the delwri + * state as much as possible. + */ + xfs_reclaim_inodes(mp, 0); + XFS_bflush(mp->m_ddev_targp); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + xfs_qm_unmount(mp); + + /* + * Flush out the log synchronously so that we know for sure + * that nothing is pinned. This is important because bflush() + * will skip pinned buffers. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Unreserve any blocks we have so that when we unmount we don't account + * the reserved free space as used. This is really only necessary for + * lazy superblock counting because it trusts the incore superblock + * counters to be absolutely correct on clean unmount. + * + * We don't bother correcting this elsewhere for lazy superblock + * counting because on mount of an unclean filesystem we reconstruct the + * correct counter value and this is irrelevant. + * + * For non-lazy counter filesystems, this doesn't matter at all because + * we only every apply deltas to the superblock and hence the incore + * value does not matter.... + */ + resblks = 0; + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + xfs_warn(mp, "Unable to free reserved block pool. " + "Freespace may not be correct on next mount."); + + error = xfs_log_sbcount(mp, 1); + if (error) + xfs_warn(mp, "Unable to update superblock counters. " + "Freespace may not be correct on next mount."); + xfs_unmountfs_writesb(mp); + + /* + * Make sure all buffers have been flushed and completed before + * unmounting the log. 
+ */ + error = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (error) + xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_wait_buftarg(mp->m_ddev_targp); + + xfs_log_unmount_write(mp); + xfs_log_unmount(mp); + xfs_uuid_unmount(mp); + +#if defined(DEBUG) + xfs_errortag_clearall(mp, 0); +#endif + xfs_free_perag(mp); +} + +int +xfs_fs_writable(xfs_mount_t *mp) +{ + return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || + (mp->m_flags & XFS_MOUNT_RDONLY)); +} + +/* + * xfs_log_sbcount + * + * Called either periodically to keep the on disk superblock values + * roughly up to date or from unmount to make sure the values are + * correct on a clean unmount. + * + * Note this code can be called during the process of freezing, so + * we may need to use the transaction allocator which does not not + * block when the transaction subsystem is in its frozen state. + */ +int +xfs_log_sbcount( + xfs_mount_t *mp, + uint sync) +{ + xfs_trans_t *tp; + int error; + + if (!xfs_fs_writable(mp)) + return 0; + + xfs_icsb_sync_counters(mp, 0); + + /* + * we don't need to do this if we are updating the superblock + * counters on every modification. + */ + if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; + + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); + if (sync) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + return error; +} + +int +xfs_unmountfs_writesb(xfs_mount_t *mp) +{ + xfs_buf_t *sbp; + int error = 0; + + /* + * skip superblock write if fs is read-only, or + * if we are doing a forced umount. + */ + if (!((mp->m_flags & XFS_MOUNT_RDONLY) || + XFS_FORCED_SHUTDOWN(mp))) { + + sbp = xfs_getsb(mp, 0); + + XFS_BUF_UNDONE(sbp); + XFS_BUF_UNREAD(sbp); + XFS_BUF_UNDELAYWRITE(sbp); + XFS_BUF_WRITE(sbp); + XFS_BUF_UNASYNC(sbp); + ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); + xfsbdstrat(mp, sbp); + error = xfs_buf_iowait(sbp); + if (error) + xfs_ioerror_alert("xfs_unmountfs_writesb", + mp, sbp, XFS_BUF_ADDR(sbp)); + xfs_buf_relse(sbp); + } + return error; +} + +/* + * xfs_mod_sb() can be used to copy arbitrary changes to the + * in-core superblock into the superblock buffer to be logged. + * It does not provide the higher level of locking that is + * needed to protect the in-core superblock from concurrent + * access. + */ +void +xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) +{ + xfs_buf_t *bp; + int first; + int last; + xfs_mount_t *mp; + xfs_sb_field_t f; + + ASSERT(fields); + if (!fields) + return; + mp = tp->t_mountp; + bp = xfs_trans_getsb(tp, mp, 0); + first = sizeof(xfs_sb_t); + last = 0; + + /* translate/copy */ + + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); + + /* find modified range */ + f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + last = xfs_sb_info[f + 1].offset - 1; + + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + first = xfs_sb_info[f].offset; + + xfs_trans_log_buf(tp, bp, first, last); +} + + +/* + * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply + * a delta to a specified field in the in-core superblock. Simply + * switch on the field indicated and apply the delta to that field. 
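xfs_mod_sb() above logs only the byte range actually dirtied: the lowest set field bit gives the first byte, and the table entry after the highest set field bit bounds the last byte. A standalone sketch of that range computation, using an invented four-field offset table and gcc-style bit helpers in place of xfs_lowbit64/xfs_highbit64 (the mask is assumed non-zero):

#include <stddef.h>
#include <stdint.h>

static const size_t field_off[] = { 0, 8, 12, 14, 16 };	/* field starts plus total size */

static void dirty_range(uint64_t fields, size_t *first, size_t *last)
{
	int lo = __builtin_ctzll(fields);		/* lowest dirty field  */
	int hi = 63 - __builtin_clzll(fields);		/* highest dirty field */

	*first = field_off[lo];
	*last  = field_off[hi + 1] - 1;			/* last byte of that field */
}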
+ * Fields are not allowed to dip below zero, so if the delta would + * do this do not apply it and return EINVAL. + * + * The m_sb_lock must be held when this routine is called. + */ +STATIC int +xfs_mod_incore_sb_unlocked( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) +{ + int scounter; /* short counter for 32 bit fields */ + long long lcounter; /* long counter for 64 bit fields */ + long long res_used, rem; + + /* + * With the in-core superblock spin lock held, switch + * on the indicated field. Apply the delta to the + * proper field. If the fields value would dip below + * 0, then do not apply the delta and return EINVAL. + */ + switch (field) { + case XFS_SBS_ICOUNT: + lcounter = (long long)mp->m_sb.sb_icount; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_icount = lcounter; + return 0; + case XFS_SBS_IFREE: + lcounter = (long long)mp->m_sb.sb_ifree; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_ifree = lcounter; + return 0; + case XFS_SBS_FDBLOCKS: + lcounter = (long long) + mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); + + if (delta > 0) { /* Putting blocks back */ + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + rem = delta - res_used; + mp->m_resblks_avail = mp->m_resblks; + lcounter += rem; + } + } else { /* Taking blocks away */ + lcounter += delta; + if (lcounter >= 0) { + mp->m_sb.sb_fdblocks = lcounter + + XFS_ALLOC_SET_ASIDE(mp); + return 0; + } + + /* + * We are out of blocks, use any available reserved + * blocks if were allowed to. + */ + if (!rsvd) + return XFS_ERROR(ENOSPC); + + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + return 0; + } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! 
" + "Consider increasing reserve pool size.", + mp->m_fsname); + return XFS_ERROR(ENOSPC); + } + + mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); + return 0; + case XFS_SBS_FREXTENTS: + lcounter = (long long)mp->m_sb.sb_frextents; + lcounter += delta; + if (lcounter < 0) { + return XFS_ERROR(ENOSPC); + } + mp->m_sb.sb_frextents = lcounter; + return 0; + case XFS_SBS_DBLOCKS: + lcounter = (long long)mp->m_sb.sb_dblocks; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_dblocks = lcounter; + return 0; + case XFS_SBS_AGCOUNT: + scounter = mp->m_sb.sb_agcount; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_agcount = scounter; + return 0; + case XFS_SBS_IMAX_PCT: + scounter = mp->m_sb.sb_imax_pct; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_imax_pct = scounter; + return 0; + case XFS_SBS_REXTSIZE: + scounter = mp->m_sb.sb_rextsize; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rextsize = scounter; + return 0; + case XFS_SBS_RBMBLOCKS: + scounter = mp->m_sb.sb_rbmblocks; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rbmblocks = scounter; + return 0; + case XFS_SBS_RBLOCKS: + lcounter = (long long)mp->m_sb.sb_rblocks; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rblocks = lcounter; + return 0; + case XFS_SBS_REXTENTS: + lcounter = (long long)mp->m_sb.sb_rextents; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rextents = lcounter; + return 0; + case XFS_SBS_REXTSLOG: + scounter = mp->m_sb.sb_rextslog; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + mp->m_sb.sb_rextslog = scounter; + return 0; + default: + ASSERT(0); + return XFS_ERROR(EINVAL); + } +} + +/* + * xfs_mod_incore_sb() is used to change a field in the in-core + * superblock structure by the specified delta. This modification + * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked() + * routine to do the work. + */ +int +xfs_mod_incore_sb( + struct xfs_mount *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) +{ + int status; + +#ifdef HAVE_PERCPU_SB + ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); +#endif + spin_lock(&mp->m_sb_lock); + status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + spin_unlock(&mp->m_sb_lock); + + return status; +} + +/* + * Change more than one field in the in-core superblock structure at a time. + * + * The fields and changes to those fields are specified in the array of + * xfs_mod_sb structures passed in. Either all of the specified deltas + * will be applied or none of them will. If any modified field dips below 0, + * then all modifications will be backed out and EINVAL will be returned. + * + * Note that this function may not be used for the superblock values that + * are tracked with the in-memory per-cpu counters - a direct call to + * xfs_icsb_modify_counters is required for these. + */ +int +xfs_mod_incore_sb_batch( + struct xfs_mount *mp, + xfs_mod_sb_t *msb, + uint nmsb, + int rsvd) +{ + xfs_mod_sb_t *msbp; + int error = 0; + + /* + * Loop through the array of mod structures and apply each individually. + * If any fail, then back out all those which have already been applied. 
+ * Do all of this within the scope of the m_sb_lock so that all of the + * changes will be atomic. + */ + spin_lock(&mp->m_sb_lock); + for (msbp = msb; msbp < (msb + nmsb); msbp++) { + ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || + msbp->msb_field > XFS_SBS_FDBLOCKS); + + error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, + msbp->msb_delta, rsvd); + if (error) + goto unwind; + } + spin_unlock(&mp->m_sb_lock); + return 0; + +unwind: + while (--msbp >= msb) { + error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, + -msbp->msb_delta, rsvd); + ASSERT(error == 0); + } + spin_unlock(&mp->m_sb_lock); + return error; +} + +/* + * xfs_getsb() is called to obtain the buffer for the superblock. + * The buffer is returned locked and read in from disk. + * The buffer should be released with a call to xfs_brelse(). + * + * If the flags parameter is BUF_TRYLOCK, then we'll only return + * the superblock buffer if it can be locked without sleeping. + * If it can't then we'll return NULL. + */ +xfs_buf_t * +xfs_getsb( + xfs_mount_t *mp, + int flags) +{ + xfs_buf_t *bp; + + ASSERT(mp->m_sb_bp != NULL); + bp = mp->m_sb_bp; + if (flags & XBF_TRYLOCK) { + if (!XFS_BUF_CPSEMA(bp)) { + return NULL; + } + } else { + XFS_BUF_PSEMA(bp, PRIBIO); + } + XFS_BUF_HOLD(bp); + ASSERT(XFS_BUF_ISDONE(bp)); + return bp; +} + +/* + * Used to free the superblock along various error paths. + */ +void +xfs_freesb( + struct xfs_mount *mp) +{ + struct xfs_buf *bp = mp->m_sb_bp; + + xfs_buf_lock(bp); + mp->m_sb_bp = NULL; + xfs_buf_relse(bp); +} + +/* + * Used to log changes to the superblock unit and width fields which could + * be altered by the mount options, as well as any potential sb_features2 + * fixup. Only the first superblock is updated. + */ +int +xfs_mount_log_sb( + xfs_mount_t *mp, + __int64_t fields) +{ + xfs_trans_t *tp; + int error; + + ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | + XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 | + XFS_SB_VERSIONNUM)); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_mod_sb(tp, fields); + error = xfs_trans_commit(tp, 0); + return error; +} + +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + struct xfs_mount *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + xfs_notice(mp, "%s required on read-only device.", message); + xfs_notice(mp, "write access unavailable, cannot proceed."); + return EROFS; + } + return 0; +} + +#ifdef HAVE_PERCPU_SB +/* + * Per-cpu incore superblock counters + * + * Simple concept, difficult implementation + * + * Basically, replace the incore superblock counters with a distributed per cpu + * counter for contended fields (e.g. free block count). + * + * Difficulties arise in that the incore sb is used for ENOSPC checking, and + * hence needs to be accurately read when we are running low on space. Hence + * there is a method to enable and disable the per-cpu counters based on how + * much "stuff" is available in them. + * + * Basically, a counter is enabled if there is enough free resource to justify + * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. 
a local + * ENOSPC), then we disable the counters to synchronise all callers and + * re-distribute the available resources. + * + * If, once we redistributed the available resources, we still get a failure, + * we disable the per-cpu counter and go through the slow path. + * + * The slow path is the current xfs_mod_incore_sb() function. This means that + * when we disable a per-cpu counter, we need to drain its resources back to + * the global superblock. We do this after disabling the counter to prevent + * more threads from queueing up on the counter. + * + * Essentially, this means that we still need a lock in the fast path to enable + * synchronisation between the global counters and the per-cpu counters. This + * is not a problem because the lock will be local to a CPU almost all the time + * and have little contention except when we get to ENOSPC conditions. + * + * Basically, this lock becomes a barrier that enables us to lock out the fast + * path while we do things like enabling and disabling counters and + * synchronising the counters. + * + * Locking rules: + * + * 1. m_sb_lock before picking up per-cpu locks + * 2. per-cpu locks always picked up via for_each_online_cpu() order + * 3. accurate counter sync requires m_sb_lock + per cpu locks + * 4. modifying per-cpu counters requires holding per-cpu lock + * 5. modifying global counters requires holding m_sb_lock + * 6. enabling or disabling a counter requires holding the m_sb_lock + * and _none_ of the per-cpu locks. + * + * Disabled counters are only ever re-enabled by a balance operation + * that results in more free resources per CPU than a given threshold. + * To ensure counters don't remain disabled, they are rebalanced when + * the global resource goes above a higher threshold (i.e. some hysteresis + * is present to prevent thrashing). + */ + +#ifdef CONFIG_HOTPLUG_CPU +/* + * hot-plug CPU notifier support. + * + * We need a notifier per filesystem as we need to be able to identify + * the filesystem to balance the counters out. This is achieved by + * having a notifier block embedded in the xfs_mount_t and doing pointer + * magic to get the mount pointer from the notifier block address. + */ +STATIC int +xfs_icsb_cpu_notify( + struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + xfs_icsb_cnts_t *cntp; + xfs_mount_t *mp; + + mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier); + cntp = (xfs_icsb_cnts_t *) + per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + /* Easy Case - initialize the area and locks, and + * then rebalance when online does everything else for us. */ + memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + xfs_icsb_lock(mp); + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + xfs_icsb_unlock(mp); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + /* Disable all the counters, then fold the dead cpu's + * count into the total on the global superblock and + * re-enable the counters. 
*/ + xfs_icsb_lock(mp); + spin_lock(&mp->m_sb_lock); + xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); + xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); + xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS); + + mp->m_sb.sb_icount += cntp->icsb_icount; + mp->m_sb.sb_ifree += cntp->icsb_ifree; + mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks; + + memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); + + xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); + spin_unlock(&mp->m_sb_lock); + xfs_icsb_unlock(mp); + break; + } + + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +int +xfs_icsb_init_counters( + xfs_mount_t *mp) +{ + xfs_icsb_cnts_t *cntp; + int i; + + mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t); + if (mp->m_sb_cnts == NULL) + return -ENOMEM; + +#ifdef CONFIG_HOTPLUG_CPU + mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; + mp->m_icsb_notifier.priority = 0; + register_hotcpu_notifier(&mp->m_icsb_notifier); +#endif /* CONFIG_HOTPLUG_CPU */ + + for_each_online_cpu(i) { + cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); + memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); + } + + mutex_init(&mp->m_icsb_mutex); + + /* + * start with all counters disabled so that the + * initial balance kicks us off correctly + */ + mp->m_icsb_counters = -1; + return 0; +} + +void +xfs_icsb_reinit_counters( + xfs_mount_t *mp) +{ + xfs_icsb_lock(mp); + /* + * start with all counters disabled so that the + * initial balance kicks us off correctly + */ + mp->m_icsb_counters = -1; + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + xfs_icsb_unlock(mp); +} + +void +xfs_icsb_destroy_counters( + xfs_mount_t *mp) +{ + if (mp->m_sb_cnts) { + unregister_hotcpu_notifier(&mp->m_icsb_notifier); + free_percpu(mp->m_sb_cnts); + } + mutex_destroy(&mp->m_icsb_mutex); +} + +STATIC void +xfs_icsb_lock_cntr( + xfs_icsb_cnts_t *icsbp) +{ + while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) { + ndelay(1000); + } +} + +STATIC void +xfs_icsb_unlock_cntr( + xfs_icsb_cnts_t *icsbp) +{ + clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags); +} + + +STATIC void +xfs_icsb_lock_all_counters( + xfs_mount_t *mp) +{ + xfs_icsb_cnts_t *cntp; + int i; + + for_each_online_cpu(i) { + cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); + xfs_icsb_lock_cntr(cntp); + } +} + +STATIC void +xfs_icsb_unlock_all_counters( + xfs_mount_t *mp) +{ + xfs_icsb_cnts_t *cntp; + int i; + + for_each_online_cpu(i) { + cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); + xfs_icsb_unlock_cntr(cntp); + } +} + +STATIC void +xfs_icsb_count( + xfs_mount_t *mp, + xfs_icsb_cnts_t *cnt, + int flags) +{ + xfs_icsb_cnts_t *cntp; + int i; + + memset(cnt, 0, sizeof(xfs_icsb_cnts_t)); + + if (!(flags & XFS_ICSB_LAZY_COUNT)) + xfs_icsb_lock_all_counters(mp); + + for_each_online_cpu(i) { + cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); + cnt->icsb_icount += cntp->icsb_icount; + cnt->icsb_ifree += cntp->icsb_ifree; + cnt->icsb_fdblocks += cntp->icsb_fdblocks; + } + + if (!(flags & XFS_ICSB_LAZY_COUNT)) + xfs_icsb_unlock_all_counters(mp); +} + +STATIC int +xfs_icsb_counter_disabled( + xfs_mount_t *mp, + xfs_sb_field_t field) +{ + ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); + return test_bit(field, &mp->m_icsb_counters); +} + +STATIC void +xfs_icsb_disable_counter( + xfs_mount_t *mp, + xfs_sb_field_t field) +{ + 
xfs_icsb_cnts_t cnt; + + ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); + + /* + * If we are already disabled, then there is nothing to do + * here. We check before locking all the counters to avoid + * the expensive lock operation when being called in the + * slow path and the counter is already disabled. This is + * safe because the only time we set or clear this state is under + * the m_icsb_mutex. + */ + if (xfs_icsb_counter_disabled(mp, field)) + return; + + xfs_icsb_lock_all_counters(mp); + if (!test_and_set_bit(field, &mp->m_icsb_counters)) { + /* drain back to superblock */ + + xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); + switch(field) { + case XFS_SBS_ICOUNT: + mp->m_sb.sb_icount = cnt.icsb_icount; + break; + case XFS_SBS_IFREE: + mp->m_sb.sb_ifree = cnt.icsb_ifree; + break; + case XFS_SBS_FDBLOCKS: + mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; + break; + default: + BUG(); + } + } + + xfs_icsb_unlock_all_counters(mp); +} + +STATIC void +xfs_icsb_enable_counter( + xfs_mount_t *mp, + xfs_sb_field_t field, + uint64_t count, + uint64_t resid) +{ + xfs_icsb_cnts_t *cntp; + int i; + + ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); + + xfs_icsb_lock_all_counters(mp); + for_each_online_cpu(i) { + cntp = per_cpu_ptr(mp->m_sb_cnts, i); + switch (field) { + case XFS_SBS_ICOUNT: + cntp->icsb_icount = count + resid; + break; + case XFS_SBS_IFREE: + cntp->icsb_ifree = count + resid; + break; + case XFS_SBS_FDBLOCKS: + cntp->icsb_fdblocks = count + resid; + break; + default: + BUG(); + break; + } + resid = 0; + } + clear_bit(field, &mp->m_icsb_counters); + xfs_icsb_unlock_all_counters(mp); +} + +void +xfs_icsb_sync_counters_locked( + xfs_mount_t *mp, + int flags) +{ + xfs_icsb_cnts_t cnt; + + xfs_icsb_count(mp, &cnt, flags); + + if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) + mp->m_sb.sb_icount = cnt.icsb_icount; + if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) + mp->m_sb.sb_ifree = cnt.icsb_ifree; + if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) + mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; +} + +/* + * Accurate update of per-cpu counters to incore superblock + */ +void +xfs_icsb_sync_counters( + xfs_mount_t *mp, + int flags) +{ + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, flags); + spin_unlock(&mp->m_sb_lock); +} + +/* + * Balance and enable/disable counters as necessary. + * + * Thresholds for re-enabling counters are somewhat magic. inode counts are + * chosen to be the same number as single on disk allocation chunk per CPU, and + * free blocks is something far enough zero that we aren't going thrash when we + * get near ENOSPC. We also need to supply a minimum we require per cpu to + * prevent looping endlessly when xfs_alloc_space asks for more than will + * be distributed to a single CPU but each CPU has enough blocks to be + * reenabled. + * + * Note that we can be called when counters are already disabled. + * xfs_icsb_disable_counter() optimises the counter locking in this case to + * prevent locking every per-cpu counter needlessly. 
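+ *
+ * As a worked example (for illustration only): with four CPUs online and
+ * sb_icount = 1030, xfs_icsb_balance_counter_locked() computes
+ *
+ *	count = 1030 / 4 = 257,  resid = 1030 % 4 = 2
+ *
+ * 257 is above the re-enable threshold of max(min_per_cpu, 64), so the
+ * counter is re-enabled with the first online CPU holding 259 and the
+ * other CPUs 257 each. Had sb_icount been only 200, the per-CPU share of
+ * 50 would fall below that threshold and the counter would stay disabled,
+ * i.e. modifications would keep serialising on the global superblock.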
+ */ + +#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 +#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ + (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) +STATIC void +xfs_icsb_balance_counter_locked( + xfs_mount_t *mp, + xfs_sb_field_t field, + int min_per_cpu) +{ + uint64_t count, resid; + int weight = num_online_cpus(); + uint64_t min = (uint64_t)min_per_cpu; + + /* disable counter and sync counter */ + xfs_icsb_disable_counter(mp, field); + + /* update counters - first CPU gets residual*/ + switch (field) { + case XFS_SBS_ICOUNT: + count = mp->m_sb.sb_icount; + resid = do_div(count, weight); + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) + return; + break; + case XFS_SBS_IFREE: + count = mp->m_sb.sb_ifree; + resid = do_div(count, weight); + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) + return; + break; + case XFS_SBS_FDBLOCKS: + count = mp->m_sb.sb_fdblocks; + resid = do_div(count, weight); + if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) + return; + break; + default: + BUG(); + count = resid = 0; /* quiet, gcc */ + break; + } + + xfs_icsb_enable_counter(mp, field, count, resid); +} + +STATIC void +xfs_icsb_balance_counter( + xfs_mount_t *mp, + xfs_sb_field_t fields, + int min_per_cpu) +{ + spin_lock(&mp->m_sb_lock); + xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); + spin_unlock(&mp->m_sb_lock); +} + +int +xfs_icsb_modify_counters( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) +{ + xfs_icsb_cnts_t *icsbp; + long long lcounter; /* long counter for 64 bit fields */ + int ret = 0; + + might_sleep(); +again: + preempt_disable(); + icsbp = this_cpu_ptr(mp->m_sb_cnts); + + /* + * if the counter is disabled, go to slow path + */ + if (unlikely(xfs_icsb_counter_disabled(mp, field))) + goto slow_path; + xfs_icsb_lock_cntr(icsbp); + if (unlikely(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock_cntr(icsbp); + goto slow_path; + } + + switch (field) { + case XFS_SBS_ICOUNT: + lcounter = icsbp->icsb_icount; + lcounter += delta; + if (unlikely(lcounter < 0)) + goto balance_counter; + icsbp->icsb_icount = lcounter; + break; + + case XFS_SBS_IFREE: + lcounter = icsbp->icsb_ifree; + lcounter += delta; + if (unlikely(lcounter < 0)) + goto balance_counter; + icsbp->icsb_ifree = lcounter; + break; + + case XFS_SBS_FDBLOCKS: + BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0); + + lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + lcounter += delta; + if (unlikely(lcounter < 0)) + goto balance_counter; + icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); + break; + default: + BUG(); + break; + } + xfs_icsb_unlock_cntr(icsbp); + preempt_enable(); + return 0; + +slow_path: + preempt_enable(); + + /* + * serialise with a mutex so we don't burn lots of cpu on + * the superblock lock. We still need to hold the superblock + * lock, however, when we modify the global structures. + */ + xfs_icsb_lock(mp); + + /* + * Now running atomically. + * + * If the counter is enabled, someone has beaten us to rebalancing. + * Drop the lock and try again in the fast path.... + */ + if (!(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock(mp); + goto again; + } + + /* + * The counter is currently disabled. Because we are + * running atomically here, we know a rebalance cannot + * be in progress. Hence we can go straight to operating + * on the global superblock. We do not call xfs_mod_incore_sb() + * here even though we need to get the m_sb_lock. Doing so + * will cause us to re-enter this function and deadlock. 
+ * Hence we get the m_sb_lock ourselves and then call + * xfs_mod_incore_sb_unlocked() as the unlocked path operates + * directly on the global counters. + */ + spin_lock(&mp->m_sb_lock); + ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + spin_unlock(&mp->m_sb_lock); + + /* + * Now that we've modified the global superblock, we + * may be able to re-enable the distributed counters + * (e.g. lots of space just got freed). After that + * we are done. + */ + if (ret != ENOSPC) + xfs_icsb_balance_counter(mp, field, 0); + xfs_icsb_unlock(mp); + return ret; + +balance_counter: + xfs_icsb_unlock_cntr(icsbp); + preempt_enable(); + + /* + * We may have multiple threads here if multiple per-cpu + * counters run dry at the same time. This will mean we can + * do more balances than strictly necessary but it is not + * the common slowpath case. + */ + xfs_icsb_lock(mp); + + /* + * running atomically. + * + * This will leave the counter in the correct state for future + * accesses. After the rebalance, we simply try again and our retry + * will either succeed through the fast path or slow path without + * another balance operation being required. + */ + xfs_icsb_balance_counter(mp, field, delta); + xfs_icsb_unlock(mp); + goto again; +} + +#endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h new file mode 100644 index 00000000..3d68bb26 --- /dev/null +++ b/fs/xfs/xfs_mount.h @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MOUNT_H__ +#define __XFS_MOUNT_H__ + +typedef struct xfs_trans_reservations { + uint tr_write; /* extent alloc trans */ + uint tr_itruncate; /* truncate trans */ + uint tr_rename; /* rename trans */ + uint tr_link; /* link trans */ + uint tr_remove; /* unlink trans */ + uint tr_symlink; /* symlink trans */ + uint tr_create; /* create trans */ + uint tr_mkdir; /* mkdir trans */ + uint tr_ifree; /* inode free trans */ + uint tr_ichange; /* inode update trans */ + uint tr_growdata; /* fs data section grow trans */ + uint tr_swrite; /* sync write inode trans */ + uint tr_addafork; /* cvt inode to attributed trans */ + uint tr_writeid; /* write setuid/setgid file */ + uint tr_attrinval; /* attr fork buffer invalidation */ + uint tr_attrset; /* set/create an attribute */ + uint tr_attrrm; /* remove an attribute */ + uint tr_clearagi; /* clear bad agi unlinked ino bucket */ + uint tr_growrtalloc; /* grow realtime allocations */ + uint tr_growrtzero; /* grow realtime zeroing */ + uint tr_growrtfree; /* grow realtime freeing */ +} xfs_trans_reservations_t; + +#ifndef __KERNEL__ + +#define xfs_daddr_to_agno(mp,d) \ + ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) +#define xfs_daddr_to_agbno(mp,d) \ + ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) + +#else /* __KERNEL__ */ + +#include "xfs_sync.h" + +struct log; +struct xfs_mount_args; +struct xfs_inode; +struct xfs_bmbt_irec; +struct xfs_bmap_free; +struct xfs_extdelta; +struct xfs_swapext; +struct xfs_mru_cache; +struct xfs_nameops; +struct xfs_ail; +struct xfs_quotainfo; + +#ifdef HAVE_PERCPU_SB + +/* + * Valid per-cpu incore superblock counters. Note that if you add new counters, + * you may need to define new counter disabled bit field descriptors as there + * are more possible fields in the superblock that can fit in a bitfield on a + * 32 bit platform. The XFS_SBS_* values for the current current counters just + * fit. 
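+ *
+ * The enabled/disabled state of each counter is kept as one bit per
+ * XFS_SBS_* field in mp->m_icsb_counters, so the helpers in xfs_mount.c
+ * boil down to the generic bit operations (sketch):
+ *
+ *	disabled = test_bit(field, &mp->m_icsb_counters);
+ *	test_and_set_bit(field, &mp->m_icsb_counters);	(disable)
+ *	clear_bit(field, &mp->m_icsb_counters);		(re-enable)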
+ */ +typedef struct xfs_icsb_cnts { + uint64_t icsb_fdblocks; + uint64_t icsb_ifree; + uint64_t icsb_icount; + unsigned long icsb_flags; +} xfs_icsb_cnts_t; + +#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ + +#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ + +extern int xfs_icsb_init_counters(struct xfs_mount *); +extern void xfs_icsb_reinit_counters(struct xfs_mount *); +extern void xfs_icsb_destroy_counters(struct xfs_mount *); +extern void xfs_icsb_sync_counters(struct xfs_mount *, int); +extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); +extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, + int64_t, int); + +#else +#define xfs_icsb_init_counters(mp) (0) +#define xfs_icsb_destroy_counters(mp) do { } while (0) +#define xfs_icsb_reinit_counters(mp) do { } while (0) +#define xfs_icsb_sync_counters(mp, flags) do { } while (0) +#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) +#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ + xfs_mod_incore_sb(mp, field, delta, rsvd) +#endif + +/* dynamic preallocation free space thresholds, 5% down to 1% */ +enum { + XFS_LOWSP_1_PCNT = 0, + XFS_LOWSP_2_PCNT, + XFS_LOWSP_3_PCNT, + XFS_LOWSP_4_PCNT, + XFS_LOWSP_5_PCNT, + XFS_LOWSP_MAX, +}; + +typedef struct xfs_mount { + struct super_block *m_super; + xfs_tid_t m_tid; /* next unused tid for fs */ + struct xfs_ail *m_ail; /* fs active log item list */ + xfs_sb_t m_sb; /* copy of fs superblock */ + spinlock_t m_sb_lock; /* sb counter lock */ + struct xfs_buf *m_sb_bp; /* buffer for superblock */ + char *m_fsname; /* filesystem name */ + int m_fsname_len; /* strlen of fs name */ + char *m_rtname; /* realtime device name */ + char *m_logname; /* external log device name */ + int m_bsize; /* fs logical block size */ + xfs_agnumber_t m_agfrotor; /* last ag where space found */ + xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ + spinlock_t m_agirotor_lock;/* .. 
and lock protecting it */ + xfs_agnumber_t m_maxagi; /* highest inode alloc group */ + uint m_readio_log; /* min read size log bytes */ + uint m_readio_blocks; /* min read size blocks */ + uint m_writeio_log; /* min write size log bytes */ + uint m_writeio_blocks; /* min write size blocks */ + struct log *m_log; /* log specific stuff */ + int m_logbufs; /* number of log buffers */ + int m_logbsize; /* size of each log buffer */ + uint m_rsumlevels; /* rt summary levels */ + uint m_rsumsize; /* size of rt summary, bytes */ + struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ + struct xfs_inode *m_rsumip; /* pointer to summary inode */ + struct xfs_inode *m_rootip; /* pointer to root directory */ + struct xfs_quotainfo *m_quotainfo; /* disk quota information */ + xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ + xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ + xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + __uint8_t m_blkbit_log; /* blocklog + NBBY */ + __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ + __uint8_t m_agno_log; /* log #ag's */ + __uint8_t m_agino_log; /* #bits for agino in inum */ + __uint16_t m_inode_cluster_size;/* min inode buf size */ + uint m_blockmask; /* sb_blocksize-1 */ + uint m_blockwsize; /* sb_blocksize in words */ + uint m_blockwmask; /* blockwsize-1 */ + uint m_alloc_mxr[2]; /* max alloc btree records */ + uint m_alloc_mnr[2]; /* min alloc btree records */ + uint m_bmap_dmxr[2]; /* max bmap btree records */ + uint m_bmap_dmnr[2]; /* min bmap btree records */ + uint m_inobt_mxr[2]; /* max inobt btree records */ + uint m_inobt_mnr[2]; /* min inobt btree records */ + uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ + uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ + uint m_in_maxlevels; /* max inobt btree levels. 
*/ + struct radix_tree_root m_perag_tree; /* per-ag accounting info */ + spinlock_t m_perag_lock; /* lock for m_perag_tree */ + struct mutex m_growlock; /* growfs mutex */ + int m_fixedfsid[2]; /* unchanged for life of FS */ + uint m_dmevmask; /* DMI events for this FS */ + __uint64_t m_flags; /* global mount flags */ + uint m_dir_node_ents; /* #entries in a dir danode */ + uint m_attr_node_ents; /* #entries in attr danode */ + int m_ialloc_inos; /* inodes in inode allocation */ + int m_ialloc_blks; /* blocks in inode allocation */ + int m_inoalign_mask;/* mask sb_inoalignmt if used */ + uint m_qflags; /* quota status flags */ + xfs_trans_reservations_t m_reservations;/* precomputed res values */ + __uint64_t m_maxicount; /* maximum inode count */ + __uint64_t m_maxioffset; /* maximum inode offset */ + __uint64_t m_resblks; /* total reserved blocks */ + __uint64_t m_resblks_avail;/* available reserved blocks */ + __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ + int m_dalign; /* stripe unit */ + int m_swidth; /* stripe width */ + int m_sinoalign; /* stripe unit inode alignment */ + int m_attr_magicpct;/* 37% of the blocksize */ + int m_dir_magicpct; /* 37% of the dir blocksize */ + __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ + int m_dirblksize; /* directory block sz--bytes */ + int m_dirblkfsbs; /* directory block sz--fsbs */ + xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ + xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ + xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ + uint m_chsize; /* size of next field */ + struct xfs_chash *m_chash; /* fs private inode per-cluster + * hash table */ + atomic_t m_active_trans; /* number trans frozen */ +#ifdef HAVE_PERCPU_SB + xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ + unsigned long m_icsb_counters; /* disabled per-cpu counters */ + struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ + struct mutex m_icsb_mutex; /* balancer sync lock */ +#endif + struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ + struct delayed_work m_sync_work; /* background sync work */ + struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct work_struct m_flush_work; /* background inode flush */ + __int64_t m_update_flags; /* sb flags we need to update + on the next remount,rw */ + struct shrinker m_inode_shrink; /* inode reclaim shrinker */ + int64_t m_low_space[XFS_LOWSP_MAX]; + /* low free space thresholds */ +} xfs_mount_t; + +/* + * Flags for m_flags. 
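+ * These are simple bit flags in the 64 bit m_flags field, tested and set
+ * with plain bitwise operations, e.g. (illustrative):
+ *
+ *	if (mp->m_flags & XFS_MOUNT_RDONLY)
+ *		return XFS_ERROR(EROFS);
+ *	mp->m_flags |= XFS_MOUNT_ATTR2;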
+ */ +#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops + must be synchronous except + for space allocations */ +#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ +#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) +#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem + operations, typically for + disk errors in metadata */ +#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ +#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to + user */ +#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment + allocations */ +#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ +#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ +#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ +#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ +#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above + * 32 bits in size */ +#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ +#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ +#define XFS_MOUNT_BARRIER (1ULL << 17) +#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/ +#define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width + * allocation */ +#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */ +#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ +#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred + * I/O size in stat() */ +#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams + allocator */ +#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ + + +/* + * Default minimum read and write sizes. + */ +#define XFS_READIO_LOG_LARGE 16 +#define XFS_WRITEIO_LOG_LARGE 16 + +/* + * Max and min values for mount-option defined I/O + * preallocation sizes. + */ +#define XFS_MAX_IO_LOG 30 /* 1G */ +#define XFS_MIN_IO_LOG PAGE_SHIFT + +/* + * Synchronous read and write sizes. This should be + * better for NFSv2 wsync filesystems. + */ +#define XFS_WSYNC_READIO_LOG 15 /* 32k */ +#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */ + +/* + * Allow large block sizes to be reported to userspace programs if the + * "largeio" mount option is used. + * + * If compatibility mode is specified, simply return the basic unit of caching + * so that we don't get inefficient read/modify/write I/O from user apps. + * Otherwise.... + * + * If the underlying volume is a stripe, then return the stripe width in bytes + * as the recommended I/O size. It is not a stripe and we've set a default + * buffered I/O size, return that, otherwise return the compat default. + */ +static inline unsigned long +xfs_preferred_iosize(xfs_mount_t *mp) +{ + if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE) + return PAGE_CACHE_SIZE; + return (mp->m_swidth ? + (mp->m_swidth << mp->m_sb.sb_blocklog) : + ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ? 
+ (1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) : + PAGE_CACHE_SIZE)); +} + +#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) + +#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ + ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) +#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) +void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, + int lnnum); +#define xfs_force_shutdown(m,f) \ + xfs_do_force_shutdown(m, f, __FILE__, __LINE__) + +#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ +#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ +#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ +#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ +#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ + +#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) +#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) + +/* + * Flags for xfs_mountfs + */ +#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ + +static inline xfs_agnumber_t +xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) +{ + xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + do_div(ld, mp->m_sb.sb_agblocks); + return (xfs_agnumber_t) ld; +} + +static inline xfs_agblock_t +xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) +{ + xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); +} + +/* + * perag get/put wrappers for ref counting + */ +struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); +struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, + int tag); +void xfs_perag_put(struct xfs_perag *pag); + +/* + * Per-cpu superblock locking functions + */ +#ifdef HAVE_PERCPU_SB +static inline void +xfs_icsb_lock(xfs_mount_t *mp) +{ + mutex_lock(&mp->m_icsb_mutex); +} + +static inline void +xfs_icsb_unlock(xfs_mount_t *mp) +{ + mutex_unlock(&mp->m_icsb_mutex); +} +#else +#define xfs_icsb_lock(mp) +#define xfs_icsb_unlock(mp) +#endif + +/* + * This structure is for use by the xfs_mod_incore_sb_batch() routine. 
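+ * For illustration, a caller wanting to apply several deltas atomically
+ * might build (sketch only; the deltas and locals are example names):
+ *
+ *	xfs_mod_sb_t	msb[2];
+ *
+ *	msb[0].msb_field = XFS_SBS_DBLOCKS;
+ *	msb[0].msb_delta = new_dblocks;
+ *	msb[1].msb_field = XFS_SBS_AGCOUNT;
+ *	msb[1].msb_delta = new_ags;
+ *	error = xfs_mod_incore_sb_batch(mp, msb, 2, 0);
+ *
+ * Either both deltas are applied under m_sb_lock or, on error, both are
+ * backed out.
+ *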
+ * xfs_growfs can specify a few fields which are more than int limit + */ +typedef struct xfs_mod_sb { + xfs_sb_field_t msb_field; /* Field to modify, see below */ + int64_t msb_delta; /* Change to make to specified field */ +} xfs_mod_sb_t; + +extern int xfs_log_sbcount(xfs_mount_t *, uint); +extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); +extern int xfs_mountfs(xfs_mount_t *mp); + +extern void xfs_unmountfs(xfs_mount_t *); +extern int xfs_unmountfs_writesb(xfs_mount_t *); +extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); +extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, + uint, int); +extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); +extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); +extern int xfs_readsb(xfs_mount_t *, int); +extern void xfs_freesb(xfs_mount_t *); +extern int xfs_fs_writable(xfs_mount_t *); +extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); + +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + +extern void xfs_set_low_space_thresholds(struct xfs_mount *); + +#endif /* __KERNEL__ */ + +extern void xfs_mod_sb(struct xfs_trans *, __int64_t); +extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, + xfs_agnumber_t *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); +extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); + +#endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c new file mode 100644 index 00000000..4aff5639 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.c @@ -0,0 +1,576 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_mru_cache.h" + +/* + * The MRU Cache data structure consists of a data store, an array of lists and + * a lock to protect its internal state. At initialisation time, the client + * supplies an element lifetime in milliseconds and a group count, as well as a + * function pointer to call when deleting elements. A data structure for + * queueing up work in the form of timed callbacks is also included. + * + * The group count controls how many lists are created, and thereby how finely + * the elements are grouped in time. When reaping occurs, all the elements in + * all the lists whose time has expired are deleted. + * + * To give an example of how this works in practice, consider a client that + * initialises an MRU Cache with a lifetime of ten seconds and a group count of + * five. Five internal lists will be created, each representing a two second + * period in time. When the first element is added, time zero for the data + * structure is initialised to the current time. + * + * All the elements added in the first two seconds are appended to the first + * list. Elements added in the third second go into the second list, and so on. 
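+ *
+ * Concretely, with the ten second / five group example above, the group
+ * index for an insert is computed as in _xfs_mru_cache_list_insert():
+ *
+ *	grp = (now - time_zero) / grp_time;
+ *	grp = (lru_grp + grp) % grp_count;
+ *
+ * so, with the LRU group still at list zero, an element added five seconds
+ * after time zero lands in the third list (grp = 2).
+ *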
+ * If an element is accessed at any point, it is removed from its list and + * inserted at the head of the current most-recently-used list. + * + * The reaper function will have nothing to do until at least twelve seconds + * have elapsed since the first element was added. The reason for this is that + * if it were called at t=11s, there could be elements in the first list that + * have only been inactive for nine seconds, so it still does nothing. If it is + * called anywhere between t=12 and t=14 seconds, it will delete all the + * elements that remain in the first list. It's therefore possible for elements + * to remain in the data store even after they've been inactive for up to + * (t + t/g) seconds, where t is the inactive element lifetime and g is the + * number of groups. + * + * The above example assumes that the reaper function gets called at least once + * every (t/g) seconds. If it is called less frequently, unused elements will + * accumulate in the reap list until the reaper function is eventually called. + * The current implementation uses work queue callbacks to carefully time the + * reaper function calls, so this should happen rarely, if at all. + * + * From a design perspective, the primary reason for the choice of a list array + * representing discrete time intervals is that it's only practical to reap + * expired elements in groups of some appreciable size. This automatically + * introduces a granularity to element lifetimes, so there's no point storing an + * individual timeout with each element that specifies a more precise reap time. + * The bonus is a saving of sizeof(long) bytes of memory per element stored. + * + * The elements could have been stored in just one list, but an array of + * counters or pointers would need to be maintained to allow them to be divided + * up into discrete time groups. More critically, the process of touching or + * removing an element would involve walking large portions of the entire list, + * which would have a detrimental effect on performance. The additional memory + * requirement for the array of list heads is minimal. + * + * When an element is touched or deleted, it needs to be removed from its + * current list. Doubly linked lists are used to make the list maintenance + * portion of these operations O(1). Since reaper timing can be imprecise, + * inserts and lookups can occur when there are no free lists available. When + * this happens, all the elements on the LRU list need to be migrated to the end + * of the reap list. To keep the list maintenance portion of these operations + * O(1) also, list tails need to be accessible without walking the entire list. + * This is the reason why doubly linked list heads are used. + */ + +/* + * An MRU Cache is a dynamic data structure that stores its elements in a way + * that allows efficient lookups, but also groups them into discrete time + * intervals based on insertion time. This allows elements to be efficiently + * and automatically reaped after a fixed period of inactivity. + * + * When a client data pointer is stored in the MRU Cache it needs to be added to + * both the data store and to one of the lists. It must also be possible to + * access each of these entries via the other, i.e. to: + * + * a) Walk a list, removing the corresponding data store entry for each item. + * b) Look up a data store entry, then access its list entry directly. + * + * To achieve both of these goals, each entry must contain both a list entry and + * a key, in addition to the user's data pointer. 
Note that it's not a good + * idea to have the client embed one of these structures at the top of their own + * data structure, because inserting the same item more than once would most + * likely result in a loop in one of the lists. That's a sure-fire recipe for + * an infinite loop in the code. + */ +typedef struct xfs_mru_cache_elem +{ + struct list_head list_node; + unsigned long key; + void *value; +} xfs_mru_cache_elem_t; + +static kmem_zone_t *xfs_mru_elem_zone; +static struct workqueue_struct *xfs_mru_reap_wq; + +/* + * When inserting, destroying or reaping, it's first necessary to update the + * lists relative to a particular time. In the case of destroying, that time + * will be well in the future to ensure that all items are moved to the reap + * list. In all other cases though, the time will be the current time. + * + * This function enters a loop, moving the contents of the LRU list to the reap + * list again and again until either a) the lists are all empty, or b) time zero + * has been advanced sufficiently to be within the immediate element lifetime. + * + * Case a) above is detected by counting how many groups are migrated and + * stopping when they've all been moved. Case b) is detected by monitoring the + * time_zero field, which is updated as each group is migrated. + * + * The return value is the earliest time that more migration could be needed, or + * zero if there's no need to schedule more work because the lists are empty. + */ +STATIC unsigned long +_xfs_mru_cache_migrate( + xfs_mru_cache_t *mru, + unsigned long now) +{ + unsigned int grp; + unsigned int migrated = 0; + struct list_head *lru_list; + + /* Nothing to do if the data store is empty. */ + if (!mru->time_zero) + return 0; + + /* While time zero is older than the time spanned by all the lists. */ + while (mru->time_zero <= now - mru->grp_count * mru->grp_time) { + + /* + * If the LRU list isn't empty, migrate its elements to the tail + * of the reap list. + */ + lru_list = mru->lists + mru->lru_grp; + if (!list_empty(lru_list)) + list_splice_init(lru_list, mru->reap_list.prev); + + /* + * Advance the LRU group number, freeing the old LRU list to + * become the new MRU list; advance time zero accordingly. + */ + mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count; + mru->time_zero += mru->grp_time; + + /* + * If reaping is so far behind that all the elements on all the + * lists have been migrated to the reap list, it's now empty. + */ + if (++migrated == mru->grp_count) { + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; + } + } + + /* Find the first non-empty list from the LRU end. */ + for (grp = 0; grp < mru->grp_count; grp++) { + + /* Check the grp'th list from the LRU end. */ + lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count); + if (!list_empty(lru_list)) + return mru->time_zero + + (mru->grp_count + grp) * mru->grp_time; + } + + /* All the lists must be empty. */ + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; +} + +/* + * When inserting or doing a lookup, an element needs to be inserted into the + * MRU list. The lists must be migrated first to ensure that they're + * up-to-date, otherwise the new element could be given a shorter lifetime in + * the cache than it should. + */ +STATIC void +_xfs_mru_cache_list_insert( + xfs_mru_cache_t *mru, + xfs_mru_cache_elem_t *elem) +{ + unsigned int grp = 0; + unsigned long now = jiffies; + + /* + * If the data store is empty, initialise time zero, leave grp set to + * zero and start the work queue timer if necessary. 
Otherwise, set grp + * to the number of group times that have elapsed since time zero. + */ + if (!_xfs_mru_cache_migrate(mru, now)) { + mru->time_zero = now; + if (!mru->queued) { + mru->queued = 1; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, + mru->grp_count * mru->grp_time); + } + } else { + grp = (now - mru->time_zero) / mru->grp_time; + grp = (mru->lru_grp + grp) % mru->grp_count; + } + + /* Insert the element at the tail of the corresponding list. */ + list_add_tail(&elem->list_node, mru->lists + grp); +} + +/* + * When destroying or reaping, all the elements that were migrated to the reap + * list need to be deleted. For each element this involves removing it from the + * data store, removing it from the reap list, calling the client's free + * function and deleting the element from the element zone. + * + * We get called holding the mru->lock, which we drop and then reacquire. + * Sparse need special help with this to tell it we know what we are doing. + */ +STATIC void +_xfs_mru_cache_clear_reap_list( + xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock) + +{ + xfs_mru_cache_elem_t *elem, *next; + struct list_head tmp; + + INIT_LIST_HEAD(&tmp); + list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) { + + /* Remove the element from the data store. */ + radix_tree_delete(&mru->store, elem->key); + + /* + * remove to temp list so it can be freed without + * needing to hold the lock + */ + list_move(&elem->list_node, &tmp); + } + spin_unlock(&mru->lock); + + list_for_each_entry_safe(elem, next, &tmp, list_node) { + + /* Remove the element from the reap list. */ + list_del_init(&elem->list_node); + + /* Call the client's free function with the key and value pointer. */ + mru->free_func(elem->key, elem->value); + + /* Free the element structure. */ + kmem_zone_free(xfs_mru_elem_zone, elem); + } + + spin_lock(&mru->lock); +} + +/* + * We fire the reap timer every group expiry interval so + * we always have a reaper ready to run. This makes shutdown + * and flushing of the reaper easy to do. Hence we need to + * keep when the next reap must occur so we can determine + * at each interval whether there is anything we need to do. + */ +STATIC void +_xfs_mru_cache_reap( + struct work_struct *work) +{ + xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work); + unsigned long now, next; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + next = _xfs_mru_cache_migrate(mru, jiffies); + _xfs_mru_cache_clear_reap_list(mru); + + mru->queued = next; + if ((mru->queued > 0)) { + now = jiffies; + if (next <= now) + next = 0; + else + next -= now; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, next); + } + + spin_unlock(&mru->lock); +} + +int +xfs_mru_cache_init(void) +{ + xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), + "xfs_mru_cache_elem"); + if (!xfs_mru_elem_zone) + goto out; + + xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); + if (!xfs_mru_reap_wq) + goto out_destroy_mru_elem_zone; + + return 0; + + out_destroy_mru_elem_zone: + kmem_zone_destroy(xfs_mru_elem_zone); + out: + return -ENOMEM; +} + +void +xfs_mru_cache_uninit(void) +{ + destroy_workqueue(xfs_mru_reap_wq); + kmem_zone_destroy(xfs_mru_elem_zone); +} + +/* + * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create() + * with the address of the pointer, a lifetime value in milliseconds, a group + * count and a free function to use when deleting elements. 
This function + * returns 0 if the initialisation was successful. + */ +int +xfs_mru_cache_create( + xfs_mru_cache_t **mrup, + unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func) +{ + xfs_mru_cache_t *mru = NULL; + int err = 0, grp; + unsigned int grp_time; + + if (mrup) + *mrup = NULL; + + if (!mrup || !grp_count || !lifetime_ms || !free_func) + return EINVAL; + + if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) + return EINVAL; + + if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + return ENOMEM; + + /* An extra list is needed to avoid reaping up to a grp_time early. */ + mru->grp_count = grp_count + 1; + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + + if (!mru->lists) { + err = ENOMEM; + goto exit; + } + + for (grp = 0; grp < mru->grp_count; grp++) + INIT_LIST_HEAD(mru->lists + grp); + + /* + * We use GFP_KERNEL radix tree preload and do inserts under a + * spinlock so GFP_ATOMIC is appropriate for the radix tree itself. + */ + INIT_RADIX_TREE(&mru->store, GFP_ATOMIC); + INIT_LIST_HEAD(&mru->reap_list); + spin_lock_init(&mru->lock); + INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap); + + mru->grp_time = grp_time; + mru->free_func = free_func; + + *mrup = mru; + +exit: + if (err && mru && mru->lists) + kmem_free(mru->lists); + if (err && mru) + kmem_free(mru); + + return err; +} + +/* + * Call xfs_mru_cache_flush() to flush out all cached entries, calling their + * free functions as they're deleted. When this function returns, the caller is + * guaranteed that all the free functions for all the elements have finished + * executing and the reaper is not running. + */ +static void +xfs_mru_cache_flush( + xfs_mru_cache_t *mru) +{ + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + if (mru->queued) { + spin_unlock(&mru->lock); + cancel_delayed_work_sync(&mru->work); + spin_lock(&mru->lock); + } + + _xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time); + _xfs_mru_cache_clear_reap_list(mru); + + spin_unlock(&mru->lock); +} + +void +xfs_mru_cache_destroy( + xfs_mru_cache_t *mru) +{ + if (!mru || !mru->lists) + return; + + xfs_mru_cache_flush(mru); + + kmem_free(mru->lists); + kmem_free(mru); +} + +/* + * To insert an element, call xfs_mru_cache_insert() with the data store, the + * element's key and the client data pointer. This function returns 0 on + * success or ENOMEM if memory for the data element couldn't be allocated. + */ +int +xfs_mru_cache_insert( + xfs_mru_cache_t *mru, + unsigned long key, + void *value) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return EINVAL; + + elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP); + if (!elem) + return ENOMEM; + + if (radix_tree_preload(GFP_KERNEL)) { + kmem_zone_free(xfs_mru_elem_zone, elem); + return ENOMEM; + } + + INIT_LIST_HEAD(&elem->list_node); + elem->key = key; + elem->value = value; + + spin_lock(&mru->lock); + + radix_tree_insert(&mru->store, key, elem); + radix_tree_preload_end(); + _xfs_mru_cache_list_insert(mru, elem); + + spin_unlock(&mru->lock); + + return 0; +} + +/* + * To remove an element without calling the free function, call + * xfs_mru_cache_remove() with the data store and the element's key. On success + * the client data pointer for the removed element is returned, otherwise this + * function will return a NULL pointer. 
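+ *
+ * For context, a minimal end-to-end use of this cache, including the
+ * remove/delete pair, looks roughly like the sketch below (example_free
+ * and the local variables are illustrative names, not taken from this code):
+ *
+ *	STATIC void example_free(unsigned long key, void *value)
+ *	{
+ *		kmem_free(value);
+ *	}
+ *
+ *	error = xfs_mru_cache_create(&mru, 10000, 5, example_free);
+ *	error = xfs_mru_cache_insert(mru, key, value);
+ *	value = xfs_mru_cache_lookup(mru, key);
+ *	if (value)
+ *		xfs_mru_cache_done(mru);	(lookup returns with the lock held)
+ *	xfs_mru_cache_delete(mru, key);		(or _remove() to free the value yourself)
+ *	xfs_mru_cache_destroy(mru);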
+ */ +void * +xfs_mru_cache_remove( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + void *value = NULL; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_delete(&mru->store, key); + if (elem) { + value = elem->value; + list_del(&elem->list_node); + } + + spin_unlock(&mru->lock); + + if (elem) + kmem_zone_free(xfs_mru_elem_zone, elem); + + return value; +} + +/* + * To remove and element and call the free function, call xfs_mru_cache_delete() + * with the data store and the element's key. + */ +void +xfs_mru_cache_delete( + xfs_mru_cache_t *mru, + unsigned long key) +{ + void *value = xfs_mru_cache_remove(mru, key); + + if (value) + mru->free_func(key, value); +} + +/* + * To look up an element using its key, call xfs_mru_cache_lookup() with the + * data store and the element's key. If found, the element will be moved to the + * head of the MRU list to indicate that it's been touched. + * + * The internal data structures are protected by a spinlock that is STILL HELD + * when this function returns. Call xfs_mru_cache_done() to release it. Note + * that it is not safe to call any function that might sleep in the interim. + * + * The implementation could have used reference counting to avoid this + * restriction, but since most clients simply want to get, set or test a member + * of the returned data structure, the extra per-element memory isn't warranted. + * + * If the element isn't found, this function returns NULL and the spinlock is + * released. xfs_mru_cache_done() should NOT be called when this occurs. + * + * Because sparse isn't smart enough to know about conditional lock return + * status, we need to help it get it right by annotating the path that does + * not release the lock. + */ +void * +xfs_mru_cache_lookup( + xfs_mru_cache_t *mru, + unsigned long key) +{ + xfs_mru_cache_elem_t *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_lookup(&mru->store, key); + if (elem) { + list_del(&elem->list_node); + _xfs_mru_cache_list_insert(mru, elem); + __release(mru_lock); /* help sparse not be stupid */ + } else + spin_unlock(&mru->lock); + + return elem ? elem->value : NULL; +} + +/* + * To release the internal data structure spinlock after having performed an + * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() + * with the data store pointer. + */ +void +xfs_mru_cache_done( + xfs_mru_cache_t *mru) __releases(mru->lock) +{ + spin_unlock(&mru->lock); +} diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h new file mode 100644 index 00000000..36dd3ec8 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MRU_CACHE_H__ +#define __XFS_MRU_CACHE_H__ + + +/* Function pointer type for callback to free a client's data pointer. */ +typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*); + +typedef struct xfs_mru_cache +{ + struct radix_tree_root store; /* Core storage data structure. */ + struct list_head *lists; /* Array of lists, one per grp. */ + struct list_head reap_list; /* Elements overdue for reaping. */ + spinlock_t lock; /* Lock to protect this struct. */ + unsigned int grp_count; /* Number of discrete groups. */ + unsigned int grp_time; /* Time period spanned by grps. */ + unsigned int lru_grp; /* Group containing time zero. */ + unsigned long time_zero; /* Time first element was added. */ + xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ + struct delayed_work work; /* Workqueue data for reaping. */ + unsigned int queued; /* work has been queued */ +} xfs_mru_cache_t; + +int xfs_mru_cache_init(void); +void xfs_mru_cache_uninit(void); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); +void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); +int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, + void *value); +void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); +void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_done(struct xfs_mru_cache *mru); + +#endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h new file mode 100644 index 00000000..a595f295 --- /dev/null +++ b/fs/xfs/xfs_quota.h @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_H__ +#define __XFS_QUOTA_H__ + +struct xfs_trans; + +/* + * The ondisk form of a dquot structure. + */ +#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ +#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ + +/* + * uid_t and gid_t are hard-coded to 32 bits in the inode. + * Hence, an 'id' in a dquot is 32 bits.. + */ +typedef __uint32_t xfs_dqid_t; + +/* + * Even though users may not have quota limits occupying all 64-bits, + * they may need 64-bit accounting. Hence, 64-bit quota-counters, + * and quota-limits. This is a waste in the common case, but hey ... + */ +typedef __uint64_t xfs_qcnt_t; +typedef __uint16_t xfs_qwarncnt_t; + +/* + * This is the main portion of the on-disk representation of quota + * information for a user. 
This is the q_core of the xfs_dquot_t that + * is kept in kernel memory. We pad this with some more expansion room + * to construct the on disk structure. + */ +typedef struct xfs_disk_dquot { + __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ + __u8 d_version; /* dquot version */ + __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __be32 d_id; /* user,project,group id */ + __be64 d_blk_hardlimit;/* absolute limit on disk blks */ + __be64 d_blk_softlimit;/* preferred limit on disk blks */ + __be64 d_ino_hardlimit;/* maximum # allocated inodes */ + __be64 d_ino_softlimit;/* preferred inode limit */ + __be64 d_bcount; /* disk blocks owned by the user */ + __be64 d_icount; /* inodes owned by the user */ + __be32 d_itimer; /* zero if within inode limits if not, + this is when we refuse service */ + __be32 d_btimer; /* similar to above; for disk blocks */ + __be16 d_iwarns; /* warnings issued wrt num inodes */ + __be16 d_bwarns; /* warnings issued wrt disk blocks */ + __be32 d_pad0; /* 64 bit align */ + __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ + __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ + __be64 d_rtbcount; /* realtime blocks owned */ + __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ + __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ + __be16 d_pad; +} xfs_disk_dquot_t; + +/* + * This is what goes on disk. This is separated from the xfs_disk_dquot because + * carrying the unnecessary padding would be a waste of memory. + */ +typedef struct xfs_dqblk { + xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ + char dd_fill[32]; /* filling for posterity */ +} xfs_dqblk_t; + +/* + * flags for q_flags field in the dquot. + */ +#define XFS_DQ_USER 0x0001 /* a user quota */ +#define XFS_DQ_PROJ 0x0002 /* project quota */ +#define XFS_DQ_GROUP 0x0004 /* a group quota */ +#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ +#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */ +#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */ + +#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) + +#define XFS_DQ_FLAGS \ + { XFS_DQ_USER, "USER" }, \ + { XFS_DQ_PROJ, "PROJ" }, \ + { XFS_DQ_GROUP, "GROUP" }, \ + { XFS_DQ_DIRTY, "DIRTY" }, \ + { XFS_DQ_WANT, "WANT" }, \ + { XFS_DQ_INACTIVE, "INACTIVE" } + +/* + * In the worst case, when both user and group quotas are on, + * we can have a max of three dquots changing in a single transaction. + */ +#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3) + + +/* + * These are the structures used to lay out dquots and quotaoff + * records on the log. Quite similar to those of inodes. + */ + +/* + * log format struct for dquots. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + */ +typedef struct xfs_dq_logformat { + __uint16_t qlf_type; /* dquot log item type */ + __uint16_t qlf_size; /* size of this item */ + xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ + __int64_t qlf_blkno; /* blkno of dquot buffer */ + __int32_t qlf_len; /* len of dquot buffer */ + __uint32_t qlf_boffset; /* off of dquot in buffer */ +} xfs_dq_logformat_t; + +/* + * log format struct for QUOTAOFF records. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer + * to the first and ensures that the first logitem is taken out of the AIL + * only when the last one is securely committed. 
+ */ +typedef struct xfs_qoff_logformat { + unsigned short qf_type; /* quotaoff log item type */ + unsigned short qf_size; /* size of this item */ + unsigned int qf_flags; /* USR and/or GRP */ + char qf_pad[12]; /* padding for future */ +} xfs_qoff_logformat_t; + + +/* + * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. + */ +#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ +#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ +#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ +#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ +#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ +#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ +#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ + +/* + * Quota Accounting/Enforcement flags + */ +#define XFS_ALL_QUOTA_ACCT \ + (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) +#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) + +#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) +#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) + +/* + * Incore only flags for quotaoff - these bits get cleared when quota(s) + * are in the process of getting turned off. These flags are in m_qflags but + * never in sb_qflags. + */ +#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ + +/* + * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees + * quota will be not be switched off as long as that inode lock is held. + */ +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ + XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) + +/* + * Flags to tell various functions what to do. Not all of these are meaningful + * to a single function. None of these XFS_QMOPT_* flags are meant to have + * persistent values (ie. their values can and will change between versions) + */ +#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ +#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ +#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ +#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ +#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ +#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ + +/* + * flags to xfs_trans_mod_dquot to indicate which field needs to be + * modified. 
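+ * These bits sit above the XFS_QMOPT_* option flags defined earlier, so the two sets can be carried together in one flags word without colliding.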
+ */ +#define XFS_QMOPT_RES_REGBLKS 0x0010000 +#define XFS_QMOPT_RES_RTBLKS 0x0020000 +#define XFS_QMOPT_BCOUNT 0x0040000 +#define XFS_QMOPT_ICOUNT 0x0080000 +#define XFS_QMOPT_RTBCOUNT 0x0100000 +#define XFS_QMOPT_DELBCOUNT 0x0200000 +#define XFS_QMOPT_DELRTBCOUNT 0x0400000 +#define XFS_QMOPT_RES_INOS 0x0800000 + +/* + * flags for dqalloc. + */ +#define XFS_QMOPT_INHERIT 0x1000000 + +/* + * flags to xfs_trans_mod_dquot. + */ +#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS +#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS +#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS +#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT +#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT +#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT +#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT +#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT + + +#define XFS_QMOPT_QUOTALL \ + (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) +#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + +#ifdef __KERNEL__ +/* + * This check is done typically without holding the inode lock; + * that may seem racy, but it is harmless in the context that it is used. + * The inode cannot go inactive as long a reference is kept, and + * therefore if dquot(s) were attached, they'll stay consistent. + * If, for example, the ownership of the inode changes while + * we didn't have the inode locked, the appropriate dquot(s) will be + * attached atomically. + */ +#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ + (ip)->i_udquot == NULL) || \ + (XFS_IS_OQUOTA_ON(mp) && \ + (ip)->i_gdquot == NULL)) + +#define XFS_QM_NEED_QUOTACHECK(mp) \ + ((XFS_IS_UQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ + (XFS_IS_GQUOTA_ON(mp) && \ + ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \ + (XFS_IS_PQUOTA_ON(mp) && \ + ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT)))) + +#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ + XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ + XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ + XFS_GQUOTA_ACCT) + + +/* + * The structure kept inside the xfs_trans_t keep track of dquot changes + * within a transaction and apply them later. 
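+ * One xfs_dqtrx is kept per dquot touched by the transaction; the accumulated deltas are applied at commit time by xfs_trans_apply_dquot_deltas().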
+ */ +typedef struct xfs_dqtrx { + struct xfs_dquot *qt_dquot; /* the dquot this refers to */ + ulong qt_blk_res; /* blks reserved on a dquot */ + ulong qt_blk_res_used; /* blks used from the reservation */ + ulong qt_ino_res; /* inode reserved on a dquot */ + ulong qt_ino_res_used; /* inodes used from the reservation */ + long qt_bcount_delta; /* dquot blk count changes */ + long qt_delbcnt_delta; /* delayed dquot blk count changes */ + long qt_icount_delta; /* dquot inode count changes */ + ulong qt_rtblk_res; /* # blks reserved on a dquot */ + ulong qt_rtblk_res_used;/* # blks used from reservation */ + long qt_rtbcount_delta;/* dquot realtime blk changes */ + long qt_delrtb_delta; /* delayed RT blk count changes */ +} xfs_dqtrx_t; + +#ifdef CONFIG_XFS_QUOTA +extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); +extern void xfs_trans_free_dqinfo(struct xfs_trans *); +extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, + uint, long); +extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); +extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); +extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, + struct xfs_inode *, long, long, uint); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, long, long, uint); + +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint, + struct xfs_dquot **, struct xfs_dquot **); +extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *); +extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); +extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, + struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); +extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *, uint); +extern int xfs_qm_dqattach(struct xfs_inode *, uint); +extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); +extern void xfs_qm_dqdetach(struct xfs_inode *); +extern void xfs_qm_dqrele(struct xfs_dquot *); +extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); +extern int xfs_qm_sync(struct xfs_mount *, int); +extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); +extern void xfs_qm_mount_quotas(struct xfs_mount *); +extern void xfs_qm_unmount(struct xfs_mount *); +extern void xfs_qm_unmount_quotas(struct xfs_mount *); + +#else +static inline int +xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, + uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp) +{ + *udqp = NULL; + *gdqp = NULL; + return 0; +} +#define xfs_trans_dup_dqinfo(tp, tp2) +#define xfs_trans_free_dqinfo(tp) +#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) +#define xfs_trans_apply_dquot_deltas(tp) +#define xfs_trans_unreserve_and_mod_dquots(tp) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, long nblks, long nions, uint flags) +{ + return 0; +} +#define xfs_qm_vop_create_dqattach(tp, ip, u, g) +#define xfs_qm_vop_rename_dqattach(it) (0) +#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) +#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0) +#define xfs_qm_dqattach(ip, fl) (0) +#define 
xfs_qm_dqattach_locked(ip, fl) (0) +#define xfs_qm_dqdetach(ip) +#define xfs_qm_dqrele(d) +#define xfs_qm_statvfs(ip, s) +static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) +{ + return 0; +} +#define xfs_qm_newmount(mp, a, b) (0) +#define xfs_qm_mount_quotas(mp) +#define xfs_qm_unmount(mp) +#define xfs_qm_unmount_quotas(mp) +#endif /* CONFIG_XFS_QUOTA */ + +#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ + xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) +#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \ + xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ + f | XFS_QMOPT_RES_REGBLKS) + +extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, + xfs_dqid_t, uint, uint, char *); +extern int xfs_mount_reset_sbqflags(struct xfs_mount *); + +#endif /* __KERNEL__ */ +#endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c new file mode 100644 index 00000000..77a59891 --- /dev/null +++ b/fs/xfs/xfs_rename.c @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_trans_space.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + + +/* + * Enter all inodes for a rename transaction into a sorted array. + */ +STATIC void +xfs_sort_for_rename( + xfs_inode_t *dp1, /* in: old (source) directory inode */ + xfs_inode_t *dp2, /* in: new (target) directory inode */ + xfs_inode_t *ip1, /* in: inode of old entry */ + xfs_inode_t *ip2, /* in: inode of new entry, if it + already exists, NULL otherwise. */ + xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ + int *num_inodes) /* out: number of inodes in array */ +{ + xfs_inode_t *temp; + int i, j; + + /* + * i_tab contains a list of pointers to inodes. We initialize + * the table here & we'll sort it. We will then use it to + * order the acquisition of the inode locks. + * + * Note that the table may contain duplicates. e.g., dp1 == dp2. + */ + i_tab[0] = dp1; + i_tab[1] = dp2; + i_tab[2] = ip1; + if (ip2) { + *num_inodes = 4; + i_tab[3] = ip2; + } else { + *num_inodes = 3; + i_tab[3] = NULL; + } + + /* + * Sort the elements via bubble sort. (Remember, there are at + * most 4 elements to sort, so this is adequate.) 
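+ * Sorting by inode number gives xfs_lock_inodes() a single, consistent lock ordering, which avoids ABBA deadlocks between concurrent renames.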
+ */ + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { + temp = i_tab[j]; + i_tab[j] = i_tab[j-1]; + i_tab[j-1] = temp; + } + } + } +} + +/* + * xfs_rename + */ +int +xfs_rename( + xfs_inode_t *src_dp, + struct xfs_name *src_name, + xfs_inode_t *src_ip, + xfs_inode_t *target_dp, + struct xfs_name *target_name, + xfs_inode_t *target_ip) +{ + xfs_trans_t *tp = NULL; + xfs_mount_t *mp = src_dp->i_mount; + int new_parent; /* moving to a new dir */ + int src_is_directory; /* src_name is a directory */ + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + xfs_inode_t *inodes[4]; + int spaceres; + int num_inodes; + + trace_xfs_rename(src_dp, target_dp, src_name, target_name); + + new_parent = (src_dp != target_dp); + src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); + + if (src_is_directory) { + /* + * Check for link count overflow on target_dp + */ + if (target_ip == NULL && new_parent && + target_dp->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto std_return; + } + } + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + inodes, &num_inodes); + + xfs_bmap_init(&free_list, &first_block); + tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); + if (error == ENOSPC) { + spaceres = 0; + error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); + } + if (error) { + xfs_trans_cancel(tp, 0); + goto std_return; + } + + /* + * Attach the dquots to the inodes + */ + error = xfs_qm_vop_rename_dqattach(inodes); + if (error) { + xfs_trans_cancel(tp, cancel_flags); + goto std_return; + } + + /* + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + */ + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * Join all the inodes to the transaction. From this point on, + * we can rely on either trans_commit or trans_cancel to unlock + * them. + */ + xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); + if (error) + goto error_return; + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. 
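+ * (The actual link count bump happens further down via xfs_bumplink(), once the new entry has been created.)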
+ */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, &first_block, + &free_list, spaceres); + if (error == ENOSPC) + goto error_return; + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + error = xfs_bumplink(tp, target_dp); + if (error) + goto abort_return; + } + } else { /* target_ip != NULL */ + /* + * If target exists and it's a directory, check that both + * target and source are directories and that target can be + * destroyed, or that neither is a directory. + */ + if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + /* + * Make sure target dir is empty. + */ + if (!(xfs_dir_isempty(target_ip)) || + (target_ip->i_d.di_nlink > 2)) { + error = XFS_ERROR(EEXIST); + goto error_return; + } + } + + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; + } + } /* target_ip != NULL */ + + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, + &first_block, &free_list, spaceres); + ASSERT(error != EEXIST); + if (error) + goto abort_return; + } + + /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) + goto abort_return; + } + + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * rename transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + goto std_return; + } + + /* + * trans_commit will unlock src_ip, target_ip & decrement + * the vnode references. + */ + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c new file mode 100644 index 00000000..8f76fdff --- /dev/null +++ b/fs/xfs/xfs_rtalloc.c @@ -0,0 +1,2325 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_fsops.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_inode_item.h" +#include "xfs_trans_space.h" +#include "xfs_utils.h" +#include "xfs_trace.h" +#include "xfs_buf.h" + + +/* + * Prototypes for internal functions. + */ + + +STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *); +STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int, + xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *); +STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_extlen_t, int, xfs_rtblock_t *, int *); +STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_rtblock_t, xfs_rtblock_t *); +STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_rtblock_t, xfs_rtblock_t *); +STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int, + xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *); +STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_extlen_t, int); +STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int, + xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *); + +/* + * Internal functions. + */ + +/* + * Allocate space to the bitmap or summary file, and zero it, for growfs. 
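+ * Space is added one bmap extent at a time; each new extent is then zeroed one block per transaction in the loop below.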
+ */ +STATIC int /* error */ +xfs_growfs_rt_alloc( + xfs_mount_t *mp, /* file system mount point */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + xfs_inode_t *ip) /* inode (bitmap/summary) */ +{ + xfs_fileoff_t bno; /* block number in file */ + xfs_buf_t *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock; /* first block allocated in xaction */ + xfs_bmap_free_t flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + xfs_bmbt_irec_t map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ + + /* + * Allocate space to the file, as necessary. + */ + while (oblocks < nblocks) { + int cancelflags = 0; + xfs_trans_t *tp; + + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); + resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); + /* + * Reserve space & log for one extent added to the file. + */ + if ((error = xfs_trans_reserve(tp, resblks, + XFS_GROWRTALLOC_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_DEFAULT_PERM_LOG_COUNT))) + goto error_cancel; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + /* + * Lock the inode. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&flist, &firstblock); + /* + * Allocate blocks to the bitmap file. + */ + nmap = 1; + cancelflags |= XFS_TRANS_ABORT; + error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); + if (!error && nmap < 1) + error = XFS_ERROR(ENOSPC); + if (error) + goto error_cancel; + /* + * Free any blocks freed up in the transaction, then commit. + */ + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error; + /* + * Now we need to clear the allocated blocks. + * Do this one block per transaction, to keep it simple. + */ + cancelflags = 0; + for (bno = map.br_startoff, fsbno = map.br_startblock; + bno < map.br_startoff + map.br_blockcount; + bno++, fsbno++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); + /* + * Reserve log for one block zeroing. + */ + if ((error = xfs_trans_reserve(tp, 0, + XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) + goto error_cancel; + /* + * Lock the bitmap inode. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + /* + * Get a buffer for the block. + */ + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0); + if (bp == NULL) { + error = XFS_ERROR(EIO); +error_cancel: + xfs_trans_cancel(tp, cancelflags); + goto error; + } + memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); + /* + * Commit the transaction. + */ + error = xfs_trans_commit(tp, 0); + if (error) + goto error; + } + /* + * Go on to the next extent, if any. + */ + oblocks = map.br_startoff + map.br_blockcount; + } + return 0; +error: + return error; +} + +/* + * Attempt to allocate an extent minlen<=len<=maxlen starting from + * bitmap block bbno. If we don't get maxlen then use prod to trim + * the length, if given. Returns error; returns starting block in *rtblock. + * The lengths are all in rtextents. 
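+ * If nothing suitable is found, no error is returned; *rtblock is set to NULLRTBLOCK and *nextp to the next block worth trying.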
+ */ +STATIC int /* error */ +xfs_rtallocate_extent_block( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_rtblock_t *nextp, /* out: next block to try */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + xfs_rtblock_t besti; /* best rtblock found so far */ + xfs_rtblock_t bestlen; /* best length found so far */ + xfs_rtblock_t end; /* last rtblock in chunk */ + int error; /* error value */ + xfs_rtblock_t i; /* current rtblock trying */ + xfs_rtblock_t next; /* next rtblock to try */ + int stat; /* status from internal calls */ + + /* + * Loop over all the extents starting in this bitmap block, + * looking for one that's long enough. + */ + for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0, + end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; + i <= end; + i++) { + /* + * See if there's a free extent of maxlen starting at i. + * If it's not so then next will contain the first non-free. + */ + error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat); + if (error) { + return error; + } + if (stat) { + /* + * i for maxlen is all free, allocate and return that. + */ + error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp, + rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = i; + return 0; + } + /* + * In the case where we have a variable-sized allocation + * request, figure out how big this free piece is, + * and if it's big enough for the minimum, and the best + * so far, remember it. + */ + if (minlen < maxlen) { + xfs_rtblock_t thislen; /* this extent size */ + + thislen = next - i; + if (thislen >= minlen && thislen > bestlen) { + besti = i; + bestlen = thislen; + } + } + /* + * If not done yet, find the start of the next free space. + */ + if (next < end) { + error = xfs_rtfind_forw(mp, tp, next, end, &i); + if (error) { + return error; + } + } else + break; + } + /* + * Searched the whole thing & didn't find a maxlen free extent. + */ + if (minlen < maxlen && besti != -1) { + xfs_extlen_t p; /* amount to trim length by */ + + /* + * If size should be a multiple of prod, make that so. + */ + if (prod > 1 && (p = do_mod(bestlen, prod))) + bestlen -= p; + /* + * Allocate besti for bestlen & return that. + */ + error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb); + if (error) { + return error; + } + *len = bestlen; + *rtblock = besti; + return 0; + } + /* + * Allocation failed. Set *nextp to the next block to try. + */ + *nextp = next; + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, starting at block + * bno. If we don't get maxlen then use prod to trim the length, if given. + * Returns error; returns starting block in *rtblock. + * The lengths are all in rtextents. 
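+ * Only an extent starting exactly at bno is considered; on failure *rtblock is set to NULLRTBLOCK and no error is returned.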
+ */ +STATIC int /* error */ +xfs_rtallocate_extent_exact( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + xfs_extlen_t i; /* extent length trimmed due to prod */ + int isfree; /* extent is free */ + xfs_rtblock_t next; /* next block to try (dummy) */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + /* + * Check if the range in question (for maxlen) is free. + */ + error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree); + if (error) { + return error; + } + if (isfree) { + /* + * If it is, allocate it and return success. + */ + error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = bno; + return 0; + } + /* + * If not, allocate what there is, if it's at least minlen. + */ + maxlen = next - bno; + if (maxlen < minlen) { + /* + * Failed, return failure status. + */ + *rtblock = NULLRTBLOCK; + return 0; + } + /* + * Trim off tail of extent, if prod is specified. + */ + if (prod > 1 && (i = maxlen % prod)) { + maxlen -= i; + if (maxlen < minlen) { + /* + * Now we can't do it, return failure status. + */ + *rtblock = NULLRTBLOCK; + return 0; + } + } + /* + * Allocate what we can and return it. + */ + error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = bno; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, starting as near + * to bno as possible. If we don't get maxlen then use prod to trim + * the length, if given. The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_near( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int any; /* any useful extents from summary */ + xfs_rtblock_t bbno; /* bitmap block number */ + int error; /* error value */ + int i; /* bitmap block offset (loop control) */ + int j; /* secondary loop control */ + int log2len; /* log2 of minlen */ + xfs_rtblock_t n; /* next block to try */ + xfs_rtblock_t r; /* result block */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + /* + * If the block number given is off the end, silently set it to + * the last block. + */ + if (bno >= mp->m_sb.sb_rextents) + bno = mp->m_sb.sb_rextents - 1; + /* + * Try the exact allocation first. + */ + error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len, + rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If the exact allocation worked, return that. 
+ */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + bbno = XFS_BITTOBLOCK(mp, bno); + i = 0; + ASSERT(minlen != 0); + log2len = xfs_highbit32(minlen); + /* + * Loop over all bitmap blocks (bbno + i is current block). + */ + for (;;) { + /* + * Get summary information of extents of all useful levels + * starting in this bitmap block. + */ + error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1, + bbno + i, rbpp, rsb, &any); + if (error) { + return error; + } + /* + * If there are any useful extents starting here, try + * allocating one. + */ + if (any) { + /* + * On the positive side of the starting location. + */ + if (i >= 0) { + /* + * Try to allocate an extent starting in + * this block. + */ + error = xfs_rtallocate_extent_block(mp, tp, + bbno + i, minlen, maxlen, len, &n, rbpp, + rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return it. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + /* + * On the negative side of the starting location. + */ + else { /* i < 0 */ + /* + * Loop backwards through the bitmap blocks from + * the starting point-1 up to where we are now. + * There should be an extent which ends in this + * bitmap block and is long enough. + */ + for (j = -1; j > i; j--) { + /* + * Grab the summary information for + * this bitmap block. + */ + error = xfs_rtany_summary(mp, tp, + log2len, mp->m_rsumlevels - 1, + bbno + j, rbpp, rsb, &any); + if (error) { + return error; + } + /* + * If there's no extent given in the + * summary that means the extent we + * found must carry over from an + * earlier block. If there is an + * extent given, we've already tried + * that allocation, don't do it again. + */ + if (any) + continue; + error = xfs_rtallocate_extent_block(mp, + tp, bbno + j, minlen, maxlen, + len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it works, return the extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + /* + * There weren't intervening bitmap blocks + * with a long enough extent, or the + * allocation didn't work for some reason + * (i.e. it's a little * too short). + * Try to allocate from the summary block + * that we found. + */ + error = xfs_rtallocate_extent_block(mp, tp, + bbno + i, minlen, maxlen, len, &n, rbpp, + rsb, prod, &r); + if (error) { + return error; + } + /* + * If it works, return the extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + } + /* + * Loop control. If we were on the positive side, and there's + * still more blocks on the negative side, go there. + */ + if (i > 0 && (int)bbno - i >= 0) + i = -i; + /* + * If positive, and no more negative, but there are more + * positive, go there. + */ + else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1) + i++; + /* + * If negative or 0 (just started), and there are positive + * blocks to go, go there. The 0 case moves to block 1. + */ + else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1) + i = 1 - i; + /* + * If negative or 0 and there are more negative blocks, + * go there. + */ + else if (i <= 0 && (int)bbno + i > 0) + i--; + /* + * Must be done. Return failure. + */ + else + break; + } + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, with no position + * specified. If we don't get maxlen then use prod to trim + * the length, if given. The lengths are all in rtextents. 
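+ * Two passes are made over the summary: the first accepts only extents of at least maxlen, the second (if needed) accepts anything down to minlen.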
+ */ +STATIC int /* error */ +xfs_rtallocate_extent_size( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + int i; /* bitmap block number */ + int l; /* level number (loop control) */ + xfs_rtblock_t n; /* next block to be tried */ + xfs_rtblock_t r; /* result block number */ + xfs_suminfo_t sum; /* summary information for extents */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(maxlen != 0); + + /* + * Loop over all the levels starting with maxlen. + * At each level, look at all the bitmap blocks, to see if there + * are extents starting there that are long enough (>= maxlen). + * Note, only on the initial level can the allocation fail if + * the summary says there's an extent. + */ + for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { + /* + * Loop over all the bitmap blocks. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, maxlen, + maxlen, len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. + */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Didn't find any maxlen blocks. Try smaller ones, unless + * we're asking for a fixed size extent. + */ + if (minlen > --maxlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + ASSERT(minlen != 0); + ASSERT(maxlen != 0); + + /* + * Loop over sizes, from maxlen down to minlen. + * This time, when we do the allocations, allow smaller ones + * to succeed. + */ + for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { + /* + * Loop over all the bitmap blocks, try an allocation + * starting in that block. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary information for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * If nothing there, go on to next. + */ + if (!sum) + continue; + /* + * Try the allocation. Make sure the specified + * minlen/maxlen are in the possible range for + * this summary level. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, + XFS_RTMAX(minlen, 1 << l), + XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), + len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. + */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Got nothing, return failure. 
+ */ + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Mark an extent specified by start and len allocated. + * Updates all the summary information as well as the bitmap. + */ +STATIC int /* error */ +xfs_rtallocate_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* start block to allocate */ + xfs_extlen_t len, /* length to allocate */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the allocated extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block allocated > end */ + xfs_rtblock_t preblock; /* first block allocated < start */ + + end = start + len - 1; + /* + * Assume we're allocating out of the middle of a free extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of free extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) { + return error; + } + /* + * Decrement the summary information corresponding to the entire + * (old) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + /* + * If there are blocks not being allocated at the front of the + * old extent, add summary data for them to be free. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being allocated at the end of the + * old extent, add summary data for them to be free. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Modify the bitmap to mark this extent allocated. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 0); + return error; +} + +/* + * Return whether there are any free extents in the size range given + * by low and high, for the bitmap block bbno. + */ +STATIC int /* error */ +xfs_rtany_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + int *stat) /* out: any good extents here? */ +{ + int error; /* error value */ + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ + + /* + * Loop over logs of extent sizes. Order is irrelevant. + */ + for (log = low; log <= high; log++) { + /* + * Get one summary datum. + */ + error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + if (error) { + return error; + } + /* + * If there are any, return success. + */ + if (sum) { + *stat = 1; + return 0; + } + } + /* + * Found nothing, return failure. + */ + *stat = 0; + return 0; +} + +/* + * Get a buffer for the bitmap or summary file block specified. + * The buffer is returned read and locked. 
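+ * Callers either release it again with xfs_trans_brelse() or log it as part of their transaction.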
+ */ +STATIC int /* error */ +xfs_rtbuf_get( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t block, /* block number in bitmap or summary */ + int issum, /* is summary not bitmap */ + xfs_buf_t **bpp) /* output: buffer for the block */ +{ + xfs_buf_t *bp; /* block buffer, result */ + xfs_daddr_t d; /* disk addr of block */ + int error; /* error value */ + xfs_fsblock_t fsb; /* fs block number for block */ + xfs_inode_t *ip; /* bitmap or summary inode */ + + ip = issum ? mp->m_rsumip : mp->m_rbmip; + /* + * Map from the file offset (block) and inode number to the + * file system block. + */ + error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); + if (error) { + return error; + } + ASSERT(fsb != NULLFSBLOCK); + /* + * Convert to disk address for buffer cache. + */ + d = XFS_FSB_TO_DADDR(mp, fsb); + /* + * Read the buffer. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, 0, &bp); + if (error) { + return error; + } + ASSERT(bp && !XFS_BUF_GETERROR(bp)); + *bpp = bp; + return 0; +} + +#ifdef DEBUG +/* + * Check that the given extent (block range) is allocated already. + */ +STATIC int /* error */ +xfs_rtcheck_alloc_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* out: 1 for allocated, 0 for not */ +{ + xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ + + return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat); +} +#endif + +/* + * Check that the given range is either all allocated (val = 0) or + * all free (val = 1). + */ +STATIC int /* error */ +xfs_rtcheck_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtblock_t *new, /* out: first block not matching */ + int *stat) /* out: 1 for matches, 0 for not */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zero's; 1 (free) => all one's. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not examined. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + /* + * Mask of relevant bits. + */ + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Compute difference between actual and desired value. 
+ */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *new = start + i; + *stat = 0; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ val)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Mask of relevant bits. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } else + i = len; + } + /* + * Successful, return. + */ + xfs_trans_brelse(tp, bp); + *new = start + i; + *stat = 1; + return 0; +} + +/* + * Copy and transform the summary file, given the old and new + * parameters in the mount structures. + */ +STATIC int /* error */ +xfs_rtcopy_summary( + xfs_mount_t *omp, /* old file system mount point */ + xfs_mount_t *nmp, /* new file system mount point */ + xfs_trans_t *tp) /* transaction pointer */ +{ + xfs_rtblock_t bbno; /* bitmap block number */ + xfs_buf_t *bp; /* summary buffer */ + int error; /* error return value */ + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ + xfs_fsblock_t sumbno; /* summary block number */ + + bp = NULL; + for (log = omp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = omp->m_sb.sb_rbmblocks - 1; + (xfs_srtblock_t)bbno >= 0; + bbno--) { + error = xfs_rtget_summary(omp, tp, log, bbno, &bp, + &sumbno, &sum); + if (error) + return error; + if (sum == 0) + continue; + error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, + &bp, &sumbno); + if (error) + return error; + error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, + &bp, &sumbno); + if (error) + return error; + ASSERT(sum > 0); + } + } + return 0; +} + +/* + * Searching backward from start to limit, find the first block whose + * allocated/free state is different from start's. 
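+ * The allocate and free paths use this to find the start of the surrounding extent so the summary counts can be updated for the right extent sizes.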
+ */ +STATIC int /* error */ +xfs_rtfind_back( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t firstbit; /* first useful bit in the word */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = start - limit + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit < XFS_NBWORD - 1) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << + firstbit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = bit - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i = bit - firstbit + 1; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the previous one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. 
+ */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if (len - i) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_NBWORD - (len - i); + mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start - i + 1; + return 0; +} + +/* + * Searching forward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +STATIC int /* error */ +xfs_rtfind_forw( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in the word */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = limit - start + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit) { + /* + * Calculate last (rightmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *rtblock = start + i - 1; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the previous one. 
+ */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the previous word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Calculate mask for all the relevant bits in this word. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start + i - 1; + return 0; +} + +/* + * Mark an extent specified by start and len freed. + * Updates all the summary information as well as the bitmap. + */ +STATIC int /* error */ +xfs_rtfree_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to free */ + xfs_extlen_t len, /* length to free */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block freed > end */ + xfs_rtblock_t preblock; /* first block freed < start */ + + end = start + len - 1; + /* + * Modify the bitmap to mark this extent freed. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 1); + if (error) { + return error; + } + /* + * Assume we're freeing out of the middle of an allocated extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of allocated extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) + return error; + /* + * If there are blocks not being freed at the front of the + * old extent, add summary data for them to be allocated. 
+ */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being freed at the end of the + * old extent, add summary data for them to be allocated. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Increment the summary information corresponding to the entire + * (new) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + return error; +} + +/* + * Read and return the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + */ +STATIC int /* error */ +xfs_rtget_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + xfs_buf_t *bp; /* buffer for summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information & copy it out. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sum = *sp; + /* + * Drop the buffer if we're not asked to remember it. + */ + if (!rbpp) + xfs_trans_brelse(tp, bp); + return 0; +} + +/* + * Set the given range of bitmap bits to the given value. + * Do whatever I/O and logging is required. + */ +STATIC int /* error */ +xfs_rtmodify_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to modify */ + xfs_extlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtword_t *first; /* first used word in the buffer */ + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number.
+ */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block, and point to its data. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + first = b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zeroes; 1 (free) => all ones. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not changed and mask of relevant bits. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + i = lastbit - bit; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Set the word value correctly. + */ + *b = val; + i += XFS_NBWORD; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Compute a mask of relevant bits. + */ + bit = 0; + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + b++; + } + /* + * Log any remaining changed bytes. + */ + if (b > first) + xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp - 1)); + return 0; +} + +/* + * Read and modify the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. 
+ */ +STATIC int /* error */ +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_buf_t *bp; /* buffer for the summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information, modify and log it. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sp += delta; + xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)), + (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1)); + return 0; +} + +/* + * Visible (exported) functions. + */ + +/* + * Grow the realtime area of the filesystem. + */ +int +xfs_growfs_rt( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_rt_t *in) /* growfs rt input struct */ +{ + xfs_rtblock_t bmbno; /* bitmap block number */ + xfs_buf_t *bp; /* temporary buffer */ + int error; /* error return value */ + xfs_mount_t *nmp; /* new (fake) mount structure */ + xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ + xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ + xfs_drtbno_t nrextents; /* new number of realtime extents */ + uint8_t nrextslog; /* new log2 of sb_rextents */ + xfs_extlen_t nrsumblocks; /* new number of summary blocks */ + uint nrsumlevels; /* new rt summary levels */ + uint nrsumsize; /* new size of rt summary, bytes */ + xfs_sb_t *nsbp; /* new superblock */ + xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ + xfs_extlen_t rsumblocks; /* current number of rt summary blks */ + xfs_sb_t *sbp; /* old superblock */ + xfs_fsblock_t sumbno; /* summary block number */ + + sbp = &mp->m_sb; + /* + * Initial error checking. + */ + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || + (nrblocks = in->newblocks) <= sbp->sb_rblocks || + (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) + return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) + return error; + /* + * Read in the last block of the device, make sure it exists. + */ + bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, nrblocks - 1), + XFS_FSB_TO_B(mp, 1), 0); + if (!bp) + return EIO; + xfs_buf_relse(bp); + + /* + * Calculate new parameters. These are the final values to be reached. 
+ */ + nrextents = nrblocks; + do_div(nrextents, in->extsize); + nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); + nrextslog = xfs_highbit32(nrextents); + nrsumlevels = nrextslog + 1; + nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * New summary size can't be more than half the size of + * the log. This prevents us from getting a log overflow, + * since we'll log basically the whole summary file at once. + */ + if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) + return XFS_ERROR(EINVAL); + /* + * Get the old block counts for bitmap and summary inodes. + * These can't change since other growfs callers are locked out. + */ + rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size); + rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size); + /* + * Allocate space to the bitmap and summary files, as necessary. + */ + error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); + if (error) + return error; + error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); + if (error) + return error; + /* + * Allocate a new (fake) mount/sb. + */ + nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); + /* + * Loop over the bitmap blocks. + * We will do everything one bitmap block at a time. + * Skip the current block if it is exactly full. + * This also deals with the case where there were no rtextents before. + */ + for (bmbno = sbp->sb_rbmblocks - + ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); + bmbno < nrbmblocks; + bmbno++) { + xfs_trans_t *tp; + int cancelflags = 0; + + *nmp = *mp; + nsbp = &nmp->m_sb; + /* + * Calculate new sb and mount fields for this round. + */ + nsbp->sb_rextsize = in->extsize; + nsbp->sb_rbmblocks = bmbno + 1; + nsbp->sb_rblocks = + XFS_RTMIN(nrblocks, + nsbp->sb_rbmblocks * NBBY * + nsbp->sb_blocksize * nsbp->sb_rextsize); + nsbp->sb_rextents = nsbp->sb_rblocks; + do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + ASSERT(nsbp->sb_rextents != 0); + nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; + nrsumsize = + (uint)sizeof(xfs_suminfo_t) * nrsumlevels * + nsbp->sb_rbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * Start a transaction, get the log reservation. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); + if ((error = xfs_trans_reserve(tp, 0, + XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) + goto error_cancel; + /* + * Lock out other callers by grabbing the bitmap inode lock. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + /* + * Update the bitmap inode's size. + */ + mp->m_rbmip->i_d.di_size = + nsbp->sb_rbmblocks * nsbp->sb_blocksize; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + cancelflags |= XFS_TRANS_ABORT; + /* + * Get the summary inode into the transaction. + */ + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + /* + * Update the summary inode's size. + */ + mp->m_rsumip->i_d.di_size = nmp->m_rsumsize; + xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); + /* + * Copy summary data from old to new sizes. + * Do this when the real size (not block-aligned) changes. 
+ */ + if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks || + mp->m_rsumlevels != nmp->m_rsumlevels) { + error = xfs_rtcopy_summary(mp, nmp, tp); + if (error) + goto error_cancel; + } + /* + * Update superblock fields. + */ + if (nsbp->sb_rextsize != sbp->sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nsbp->sb_rextsize - sbp->sb_rextsize); + if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nsbp->sb_rbmblocks - sbp->sb_rbmblocks); + if (nsbp->sb_rblocks != sbp->sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nsbp->sb_rblocks - sbp->sb_rblocks); + if (nsbp->sb_rextents != sbp->sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + if (nsbp->sb_rextslog != sbp->sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nsbp->sb_rextslog - sbp->sb_rextslog); + /* + * Free new extent. + */ + bp = NULL; + error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, + nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); + if (error) { +error_cancel: + xfs_trans_cancel(tp, cancelflags); + break; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + /* + * Update mp values into the real mp structure. + */ + mp->m_rsumlevels = nrsumlevels; + mp->m_rsumsize = nrsumsize; + + error = xfs_trans_commit(tp, 0); + if (error) + break; + } + + /* + * Free the fake mp structure. + */ + kmem_free(nmp); + + return error; +} + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + xfs_mount_t *mp = tp->t_mountp; + int error; /* error value */ + xfs_rtblock_t r; /* result allocated block */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp; /* summary file block buffer */ + + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + ASSERT(minlen > 0 && minlen <= maxlen); + + /* + * If prod is set then figure out what to do to minlen and maxlen. + */ + if (prod > 1) { + xfs_extlen_t i; + + if ((i = maxlen % prod)) + maxlen -= i; + if ((i = minlen % prod)) + minlen += prod - i; + if (maxlen < minlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + } + + sumbp = NULL; + /* + * Allocate by size, or near another block, or exactly at some block. + */ + switch (type) { + case XFS_ALLOCTYPE_ANY_AG: + error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, + &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + default: + error = EIO; + ASSERT(0); + } + if (error) + return error; + + /* + * If it worked, update the superblock. 
+ */ + if (r != NULLRTBLOCK) { + long slen = (long)*len; + + ASSERT(*len >= minlen && *len <= maxlen); + if (wasdel) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); + else + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); + } + *rtblock = r; + return 0; +} + +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len) /* length of extent freed */ +{ + int error; /* error value */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp; /* summary file block buffer */ + + mp = tp->t_mountp; + /* + * Synchronize by locking the bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + +#if defined(__KERNEL__) && defined(DEBUG) + /* + * Check to see that this whole range is currently allocated. + */ + { + int stat; /* result from checking range */ + + error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat); + if (error) { + return error; + } + ASSERT(stat); + } +#endif + sumbp = NULL; + /* + * Free the range of realtime blocks. + */ + error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); + if (error) { + return error; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* + * If we've now freed all the blocks, reset the file sequence + * number to 0. + */ + if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + mp->m_sb.sb_rextents) { + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + } + return 0; +} + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + xfs_buf_t *bp; /* buffer for last block of subvolume */ + xfs_daddr_t d; /* address of last block of subvolume */ + xfs_sb_t *sbp; /* filesystem superblock copy in mount */ + + sbp = &mp->m_sb; + if (sbp->sb_rblocks == 0) + return 0; + if (mp->m_rtdev_targp == NULL) { + xfs_warn(mp, + "Filesystem has a realtime volume, use rtdev=device option"); + return XFS_ERROR(ENODEV); + } + mp->m_rsumlevels = sbp->sb_rextslog + 1; + mp->m_rsumsize = + (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels * + sbp->sb_rbmblocks; + mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize); + mp->m_rbmip = mp->m_rsumip = NULL; + /* + * Check that the realtime section is an ok size. + */ + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { + xfs_warn(mp, "realtime mount -- %llu != %llu", + (unsigned long long) XFS_BB_TO_FSB(mp, d), + (unsigned long long) mp->m_sb.sb_rblocks); + return XFS_ERROR(EFBIG); + } + bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_B(mp, 1), 0); + if (!bp) { + xfs_warn(mp, "realtime device size check failed"); + return EIO; + } + xfs_buf_relse(bp); + return 0; +} + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. 
+ */ +int /* error */ +xfs_rtmount_inodes( + xfs_mount_t *mp) /* file system mount structure */ +{ + int error; /* error return value */ + xfs_sb_t *sbp; + + sbp = &mp->m_sb; + if (sbp->sb_rbmino == NULLFSINO) + return 0; + error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); + if (error) + return error; + ASSERT(mp->m_rbmip != NULL); + ASSERT(sbp->sb_rsumino != NULLFSINO); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); + if (error) { + IRELE(mp->m_rbmip); + return error; + } + ASSERT(mp->m_rsumip != NULL); + return 0; +} + +void +xfs_rtunmount_inodes( + struct xfs_mount *mp) +{ + if (mp->m_rbmip) + IRELE(mp->m_rbmip); + if (mp->m_rsumip) + IRELE(mp->m_rsumip); +} + +/* + * Pick an extent for allocation at the start of a new realtime file. + * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... + */ +int /* error */ +xfs_rtpick_extent( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick) /* result rt extent */ +{ + xfs_rtblock_t b; /* result block */ + int log2; /* log of sequence number */ + __uint64_t resid; /* residual after log removed */ + __uint64_t seq; /* sequence number of file creation */ + __uint64_t *seqp; /* pointer to seqno in inode */ + + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *seqp = 0; + } + seq = *seqp; + if ((log2 = xfs_highbit64(seq)) == -1) + b = 0; + else { + resid = seq - (1ULL << log2); + b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >> + (log2 + 1); + if (b >= mp->m_sb.sb_rextents) + b = do_mod(b, mp->m_sb.sb_rextents); + if (b + len > mp->m_sb.sb_rextents) + b = mp->m_sb.sb_rextents - len; + } + *seqp = seq + 1; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + *pick = b; + return 0; +} diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h new file mode 100644 index 00000000..09e1f4f3 --- /dev/null +++ b/fs/xfs/xfs_rtalloc.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_RTALLOC_H__ +#define __XFS_RTALLOC_H__ + +struct xfs_mount; +struct xfs_trans; + +/* Min and max rt extent sizes, specified in bytes */ +#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ +#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ +#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ + +/* + * Constants for bit manipulations. 
+ */ +#define XFS_NBBYLOG 3 /* log2(NBBY) */ +#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) +#define XFS_NBWORD (1 << XFS_NBWORDLOG) +#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) + +#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) +#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) +#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) +#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) + +/* + * Summary and bit manipulation macros. + */ +#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) +#define XFS_SUMOFFSTOBLOCK(mp,s) \ + (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) +#define XFS_SUMPTR(mp,bp,so) \ + ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \ + (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) + +#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) +#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) +#define XFS_BITTOWORD(mp,bi) \ + ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) + +#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) +#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) + +#define XFS_RTLOBIT(w) xfs_lowbit32(w) +#define XFS_RTHIBIT(w) xfs_highbit32(w) + +#if XFS_BIG_BLKNOS +#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) +#else +#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) +#endif + + +#ifdef __KERNEL__ + +#ifdef CONFIG_XFS_RT +/* + * Function prototypes for exported functions. + */ + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock); /* out: start block allocated */ + +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len); /* length of extent freed */ + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + struct xfs_mount *mp); /* file system mount structure */ +void +xfs_rtunmount_inodes( + struct xfs_mount *mp); + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. + */ +int /* error */ +xfs_rtmount_inodes( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Pick an extent for allocation at the start of a new realtime file. + * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... + */ +int /* error */ +xfs_rtpick_extent( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick); /* result rt extent */ + +/* + * Grow the realtime area of the filesystem. 
+ */ +int +xfs_growfs_rt( + struct xfs_mount *mp, /* file system mount structure */ + xfs_growfs_rt_t *in); /* user supplied growfs struct */ + +#else +# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) +# define xfs_rtfree_extent(t,b,l) (ENOSYS) +# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) +# define xfs_growfs_rt(mp,in) (ENOSYS) +static inline int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + if (mp->m_sb.sb_rblocks == 0) + return 0; + + xfs_warn(mp, "Not built with CONFIG_XFS_RT"); + return ENOSYS; +} +# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtunmount_inodes(m) +#endif /* CONFIG_XFS_RT */ + +#endif /* __KERNEL__ */ + +#endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c new file mode 100644 index 00000000..d6d6fdfe --- /dev/null +++ b/fs/xfs/xfs_rw.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_rw.h" + +/* + * Force a shutdown of the filesystem instantly while keeping + * the filesystem consistent. We don't do an unmount here; just shutdown + * the shop, make sure that absolutely nothing persistent happens to + * this filesystem after this point. + */ +void +xfs_do_force_shutdown( + xfs_mount_t *mp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + + logerror = flags & SHUTDOWN_LOG_IO_ERROR; + + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_notice(mp, + "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + __func__, flags, lnnum, fname, __return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations to tell them + * the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & SHUTDOWN_CORRUPT_INCORE) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, + "Corruption of in-memory data detected. Shutting down filesystem"); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) + xfs_stack_trace(); + } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. 
Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); + } + } + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_alert(mp, + "Please umount the filesystem and rectify the problem(s)"); + } +} + +/* + * Prints out an ALERT message about I/O error. + */ +void +xfs_ioerror_alert( + char *func, + struct xfs_mount *mp, + xfs_buf_t *bp, + xfs_daddr_t blkno) +{ + xfs_alert(mp, + "I/O error occurred: meta-data dev %s block 0x%llx" + " (\"%s\") error %d buf count %zd", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)blkno, func, + XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); +} + +/* + * This isn't an absolute requirement, but it is + * just a good idea to call xfs_read_buf instead of + * directly doing a read_buf call. For one, we shouldn't + * be doing this disk read if we are in SHUTDOWN state anyway, + * so this stops that from happening. Secondly, this does all + * the error checking stuff and the brelse if appropriate for + * the caller, so the code can be a little leaner. + */ + +int +xfs_read_buf( + struct xfs_mount *mp, + xfs_buftarg_t *target, + xfs_daddr_t blkno, + int len, + uint flags, + xfs_buf_t **bpp) +{ + xfs_buf_t *bp; + int error; + + if (!flags) + flags = XBF_LOCK | XBF_MAPPED; + + bp = xfs_buf_read(target, blkno, len, flags); + if (!bp) + return XFS_ERROR(EIO); + error = XFS_BUF_GETERROR(bp); + if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { + *bpp = bp; + } else { + *bpp = NULL; + if (error) { + xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); + } else { + error = XFS_ERROR(EIO); + } + if (bp) { + XFS_BUF_UNDONE(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_STALE(bp); + /* + * brelse clears B_ERROR and b_error + */ + xfs_buf_relse(bp); + } + } + return (error); +} + +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h new file mode 100644 index 00000000..11c41ec6 --- /dev/null +++ b/fs/xfs/xfs_rw.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_RW_H__ +#define __XFS_RW_H__ + +struct xfs_buf; +struct xfs_inode; +struct xfs_mount; + +/* + * Convert the given file system block to a disk block. + * We have to treat it differently based on whether the + * file is a real time file or not, because the bmap code + * does. + */ +static inline xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? 
\ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} + +/* + * Prototypes for functions in xfs_rw.c. + */ +extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, + xfs_daddr_t blkno, int len, uint flags, + struct xfs_buf **bpp); +extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, + xfs_buf_t *bp, xfs_daddr_t blkno); +extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); + +#endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h new file mode 100644 index 00000000..1eb2ba58 --- /dev/null +++ b/fs/xfs/xfs_sb.h @@ -0,0 +1,543 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SB_H__ +#define __XFS_SB_H__ + +/* + * Super block + * Fits into a sector-sized buffer at address 0 of each allocation group. + * Only the first of these is ever updated except during growfs. + */ + +struct xfs_buf; +struct xfs_mount; + +#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ +#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ +#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ +#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ +#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_NUMBITS 0x000f +#define XFS_SB_VERSION_ALLFBITS 0xfff0 +#define XFS_SB_VERSION_SASHFBITS 0xf000 +#define XFS_SB_VERSION_REALFBITS 0x0ff0 +#define XFS_SB_VERSION_ATTRBIT 0x0010 +#define XFS_SB_VERSION_NLINKBIT 0x0020 +#define XFS_SB_VERSION_QUOTABIT 0x0040 +#define XFS_SB_VERSION_ALIGNBIT 0x0080 +#define XFS_SB_VERSION_DALIGNBIT 0x0100 +#define XFS_SB_VERSION_SHAREDBIT 0x0200 +#define XFS_SB_VERSION_LOGV2BIT 0x0400 +#define XFS_SB_VERSION_SECTORBIT 0x0800 +#define XFS_SB_VERSION_EXTFLGBIT 0x1000 +#define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ +#define XFS_SB_VERSION_MOREBITSBIT 0x8000 +#define XFS_SB_VERSION_OKSASHFBITS \ + (XFS_SB_VERSION_EXTFLGBIT | \ + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_BORGBIT) +#define XFS_SB_VERSION_OKREALFBITS \ + (XFS_SB_VERSION_ATTRBIT | \ + XFS_SB_VERSION_NLINKBIT | \ + XFS_SB_VERSION_QUOTABIT | \ + XFS_SB_VERSION_ALIGNBIT | \ + XFS_SB_VERSION_DALIGNBIT | \ + XFS_SB_VERSION_SHAREDBIT | \ + XFS_SB_VERSION_LOGV2BIT | \ + XFS_SB_VERSION_SECTORBIT | \ + XFS_SB_VERSION_MOREBITSBIT) +#define XFS_SB_VERSION_OKREALBITS \ + (XFS_SB_VERSION_NUMBITS | \ + XFS_SB_VERSION_OKREALFBITS | \ + XFS_SB_VERSION_OKSASHFBITS) + +/* + * There are two words to hold XFS "feature" bits: the original + * word, sb_versionnum, and sb_features2. Whenever a bit is set in + * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. + * + * These defines represent bits in sb_features2. 
+ */ +#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ +#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 +#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ +#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 +#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ +#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ +#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ + +#define XFS_SB_VERSION2_OKREALFBITS \ + (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT) +#define XFS_SB_VERSION2_OKSASHFBITS \ + (0) +#define XFS_SB_VERSION2_OKREALBITS \ + (XFS_SB_VERSION2_OKREALFBITS | \ + XFS_SB_VERSION2_OKSASHFBITS ) + +/* + * Superblock - in core version. Must match the ondisk version below. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_sb { + __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __uint32_t sb_blocksize; /* logical block size, bytes */ + xfs_drfsbno_t sb_dblocks; /* number of data blocks */ + xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */ + xfs_drtbno_t sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + xfs_dfsbno_t sb_logstart; /* starting block of log if internal */ + xfs_ino_t sb_rootino; /* root inode number */ + xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ + xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ + xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ + xfs_agblock_t sb_agblocks; /* size of an allocation group */ + xfs_agnumber_t sb_agcount; /* number of allocation groups */ + xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ + xfs_extlen_t sb_logblocks; /* number of log blocks */ + __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ + __uint16_t sb_sectsize; /* volume sector size, bytes */ + __uint16_t sb_inodesize; /* inode size, bytes */ + __uint16_t sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __uint8_t sb_blocklog; /* log2 of sb_blocksize */ + __uint8_t sb_sectlog; /* log2 of sb_sectsize */ + __uint8_t sb_inodelog; /* log2 of sb_inodesize */ + __uint8_t sb_inopblog; /* log2 of sb_inopblock */ + __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __uint8_t sb_rextslog; /* log2 of sb_rextents */ + __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ + __uint8_t sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __uint64_t sb_icount; /* allocated inodes */ + __uint64_t sb_ifree; /* free inodes */ + __uint64_t sb_fdblocks; /* free data blocks */ + __uint64_t sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + xfs_ino_t sb_uquotino; /* user quota inode */ + xfs_ino_t sb_gquotino; /* group quota inode */ + __uint16_t sb_qflags; /* quota flags */ + __uint8_t sb_flags; /* misc. 
flags */ + __uint8_t sb_shared_vn; /* shared version number */ + xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __uint32_t sb_unit; /* stripe or raid unit */ + __uint32_t sb_width; /* stripe or raid width */ + __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ + __uint8_t sb_logsectlog; /* log2 of the log sector size */ + __uint16_t sb_logsectsize; /* sector size for the log, bytes */ + __uint32_t sb_logsunit; /* stripe unit size for the log */ + __uint32_t sb_features2; /* additional feature bits */ + + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __uint32_t sb_bad_features2; + + /* must be padded to 64 bit alignment */ +} xfs_sb_t; + +/* + * Superblock - on disk version. Must match the in core version above. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_dsb { + __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __be32 sb_blocksize; /* logical block size, bytes */ + __be64 sb_dblocks; /* number of data blocks */ + __be64 sb_rblocks; /* number of realtime blocks */ + __be64 sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + __be64 sb_logstart; /* starting block of log if internal */ + __be64 sb_rootino; /* root inode number */ + __be64 sb_rbmino; /* bitmap inode for realtime extents */ + __be64 sb_rsumino; /* summary inode for rt bitmap */ + __be32 sb_rextsize; /* realtime extent size, blocks */ + __be32 sb_agblocks; /* size of an allocation group */ + __be32 sb_agcount; /* number of allocation groups */ + __be32 sb_rbmblocks; /* number of rt bitmap blocks */ + __be32 sb_logblocks; /* number of log blocks */ + __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ + __be16 sb_sectsize; /* volume sector size, bytes */ + __be16 sb_inodesize; /* inode size, bytes */ + __be16 sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __u8 sb_blocklog; /* log2 of sb_blocksize */ + __u8 sb_sectlog; /* log2 of sb_sectsize */ + __u8 sb_inodelog; /* log2 of sb_inodesize */ + __u8 sb_inopblog; /* log2 of sb_inopblock */ + __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __u8 sb_rextslog; /* log2 of sb_rextents */ + __u8 sb_inprogress; /* mkfs is in progress, don't mount */ + __u8 sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __be64 sb_icount; /* allocated inodes */ + __be64 sb_ifree; /* free inodes */ + __be64 sb_fdblocks; /* free data blocks */ + __be64 sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + __be64 sb_uquotino; /* user quota inode */ + __be64 sb_gquotino; /* group quota inode */ + __be16 sb_qflags; /* quota flags */ + __u8 sb_flags; /* misc. 
flags */ + __u8 sb_shared_vn; /* shared version number */ + __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __be32 sb_unit; /* stripe or raid unit */ + __be32 sb_width; /* stripe or raid width */ + __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ + __u8 sb_logsectlog; /* log2 of the log sector size */ + __be16 sb_logsectsize; /* sector size for the log, bytes */ + __be32 sb_logsunit; /* stripe unit size for the log */ + __be32 sb_features2; /* additional feature bits */ + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __be32 sb_bad_features2; + + /* must be padded to 64 bit alignment */ +} xfs_dsb_t; + +/* + * Sequence number values for the fields. + */ +typedef enum { + XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, + XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, + XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, + XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, + XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, + XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, + XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, + XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, + XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, + XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, + XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, + XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, + XFS_SBS_FIELDCOUNT +} xfs_sb_field_t; + +/* + * Mask values, defined based on the xfs_sb_field_t values. + * Only define the ones we're using. + */ +#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) +#define XFS_SB_UUID XFS_SB_MVAL(UUID) +#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) +#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) +#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) +#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) +#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) +#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) +#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) +#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) +#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) +#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) +#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) +#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) +#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) +#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) +#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) +#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) +#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) +#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) +#define XFS_SB_MOD_BITS \ + (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ + XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ + XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ + XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ + XFS_SB_BAD_FEATURES2) + + +/* + * Misc. Flags - warning - these will be cleared by xfs_repair unless + * a feature bit is set when the flag is used. + */ +#define XFS_SBF_NOFLAGS 0x00 /* no flags set */ +#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ + +/* + * define max. 
shared version we can interoperate with + */ +#define XFS_SB_MAX_SHARED_VN 0 + +#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) + +static inline int xfs_sb_good_version(xfs_sb_t *sbp) +{ + /* We always support version 1-3 */ + if (sbp->sb_versionnum >= XFS_SB_VERSION_1 && + sbp->sb_versionnum <= XFS_SB_VERSION_3) + return 1; + + /* We support version 4 if all feature bits are supported */ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) { + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) + return 0; + +#ifdef __KERNEL__ + if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) + return 0; +#else + if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) && + sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) + return 0; +#endif + + return 1; + } + + return 0; +} + +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp) +{ + return (sbp->sb_bad_features2 != sbp->sb_features2); +} + +static inline unsigned xfs_sb_version_tonew(unsigned v) +{ + if (v == XFS_SB_VERSION_1) + return XFS_SB_VERSION_4; + + if (v == XFS_SB_VERSION_2) + return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; + + return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT | + XFS_SB_VERSION_NLINKBIT; +} + +static inline unsigned xfs_sb_version_toold(unsigned v) +{ + if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) + return 0; + if (v & XFS_SB_VERSION_NLINKBIT) + return XFS_SB_VERSION_3; + if (v & XFS_SB_VERSION_ATTRBIT) + return XFS_SB_VERSION_2; + return XFS_SB_VERSION_1; +} + +static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) +{ + return sbp->sb_versionnum == XFS_SB_VERSION_2 || + sbp->sb_versionnum == XFS_SB_VERSION_3 || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); +} + +static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) +{ + if (sbp->sb_versionnum == XFS_SB_VERSION_1) + sbp->sb_versionnum = XFS_SB_VERSION_2; + else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; + else + sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; +} + +static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) +{ + return sbp->sb_versionnum == XFS_SB_VERSION_3 || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); +} + +static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) +{ + if (sbp->sb_versionnum <= XFS_SB_VERSION_2) + sbp->sb_versionnum = XFS_SB_VERSION_3; + else + sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT; +} + +static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); +} + +static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) +{ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; + else + sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) | + XFS_SB_VERSION_QUOTABIT; +} + +static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); +} + +static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); +} + 
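/*
 * Aside (illustration only, not part of this patch): every predicate in
 * this header reduces to the same pattern -- require a v4 superblock,
 * test a bit in sb_versionnum, and consult sb_features2 only after
 * XFS_SB_VERSION_MOREBITSBIT vouches that the second feature word is
 * valid.  The sketch below is a self-contained, hypothetical stand-in:
 * the struct, function names, and main() are invented for illustration,
 * and only the bit values mirror the definitions in this header.
 */
#include <stdint.h>
#include <stdio.h>

#define SB_VERSION_NUMBITS	0x000f
#define SB_VERSION_4		4
#define SB_VERSION_MOREBITSBIT	0x8000
#define SB_VERSION2_ATTR2BIT	0x00000008

struct sb_bits {
	uint16_t	versionnum;	/* primary feature word */
	uint32_t	features2;	/* secondary feature word */
};

/* v4 superblock that has opted in to the second feature word? */
static int sb_bits_hasmorebits(const struct sb_bits *sb)
{
	return (sb->versionnum & SB_VERSION_NUMBITS) == SB_VERSION_4 &&
	       (sb->versionnum & SB_VERSION_MOREBITSBIT);
}

/* sb_features2 bits are meaningful only when MOREBITSBIT is set. */
static int sb_bits_hasattr2(const struct sb_bits *sb)
{
	return sb_bits_hasmorebits(sb) &&
	       (sb->features2 & SB_VERSION2_ATTR2BIT);
}

int main(void)
{
	struct sb_bits sb = {
		.versionnum = SB_VERSION_4 | SB_VERSION_MOREBITSBIT,
		.features2  = SB_VERSION2_ATTR2BIT,
	};

	/* prints "attr2: 1"; clearing MOREBITSBIT would make it 0 */
	printf("attr2: %d\n", sb_bits_hasattr2(&sb));
	return 0;
}
/*
 * Gating sb_features2 behind MOREBITSBIT keeps older superblocks, which
 * never wrote the second word, from being misread as advertising newer
 * features.
 */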
+static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); +} + +static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); +} + +static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); +} + +static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); +} + +static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); +} + +static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); +} + +static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); +} + +/* + * sb_features2 bit version macros. + * + * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro: + * + * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp) + * ((xfs_sb_version_hasmorebits(sbp) && + * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT) + */ + +static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) +{ + return xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT); +} + +static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) +{ + return xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); +} + +static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; +} + +static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) +{ + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; +} + +static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) +{ + return xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); +} + +/* + * end of superblock version macros + */ + +#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ +#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)XFS_BUF_PTR(bp)) + +#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) +#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ + xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) +#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ + XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) + +/* + * File system sector to basic block conversions. + */ +#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) + +/* + * File system block to basic block conversions. + */ +#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) +#define XFS_BB_TO_FSB(mp,bb) \ + (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) +#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) +#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1)) + +/* + * File system block to byte conversions. 
+ */ +#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSB(mp,b) \ + ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) + +#endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c new file mode 100644 index 00000000..efc147f0 --- /dev/null +++ b/fs/xfs/xfs_trans.c @@ -0,0 +1,2026 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_trans_priv.h" +#include "xfs_trans_space.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" + +kmem_zone_t *xfs_trans_zone; +kmem_zone_t *xfs_log_item_desc_zone; + + +/* + * Various log reservation values. + * + * These are based on the size of the file system block because that is what + * most transactions manipulate. Each adds in an additional 128 bytes per + * item logged to try to account for the overhead of the transaction mechanism. + * + * Note: Most of the reservations underestimate the number of allocation + * groups into which they could free extents in the xfs_bmap_finish() call. + * This is because the number in the worst case is quite high and quite + * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * extents in only a single AG at a time. This will require changes to the + * EFI code as well, however, so that the EFI for the extents not freed is + * logged again in each transaction. See SGI PV #261917. + * + * Reservation functions here avoid a huge stack in xfs_trans_init due to + * register overflow from temporaries in the calculations. + */ + + +/* + * In a write transaction we can allocate a maximum of 2 + * extents. 
This gives: + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join: + * the agfs of the ags containing the blocks: 2 * sector size + * the agfls of the ags containing the blocks: 2 * sector size + * the super block free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_write_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + + XFS_ALLOCFREE_LOG_COUNT(mp, 2))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); +} + +/* + * In truncating a file we free up to two extents at once. We can modify: + * the inode being truncated: inode size + * the inode's bmap btree: (max depth + 1) * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_itruncate_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + + 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), + (4 * mp->m_sb.sb_sectsize + + 4 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 4) + + 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + + 128 * 5 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +} + +/* + * In renaming a files we can modify: + * the four inodes involved: 4 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets + * of bmap blocks) giving: + * the agf for the ags in which the blocks live: 3 * sector size + * the agfl for the ags in which the blocks live: 3 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_rename_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((4 * mp->m_sb.sb_inodesize + + 2 * XFS_DIROP_LOG_RES(mp) + + 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), + (3 * mp->m_sb.sb_sectsize + + 3 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 3) + + 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); +} + +/* + * For creating a link to an inode: + * the parent directory inode: inode size + * the linked inode: inode size + * the directory btree could split: (max depth + v2) * dir block 
size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free some bmap blocks giving: + * the agf for the ag in which the blocks live: sector size + * the agfl for the ag in which the blocks live: sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_link_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_DIROP_LOG_RES(mp) + + 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), + (mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +} + +/* + * For removing a directory entry we can modify: + * the parent directory inode: inode size + * the removed inode: inode size + * the directory btree could join: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free the dir and bmap blocks giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_remove_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_DIROP_LOG_RES(mp) + + 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); +} + +/* + * For symlink we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: 1 block + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * the blocks for the symlink: 1 kB + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_symlink_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, 1) + + XFS_DIROP_LOG_RES(mp) + + 1024 + + 128 * (4 + XFS_DIROP_LOG_COUNT(mp))), + (2 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + + XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +} + +/* + * For create we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: block size + * the superblock for the nlink flag: sector size + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the 
allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, 1) + + XFS_DIROP_LOG_RES(mp) + + 128 * (3 + XFS_DIROP_LOG_COUNT(mp))), + (3 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + + XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); +} + +/* + * Making a new directory is the same as creating a new file. + */ +STATIC uint +xfs_calc_mkdir_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp); +} + +/* + * In freeing an inode we can modify: + * the inode being freed: inode size + * the super block free inode counter: sector size + * the agi hash list and counters: sector size + * the inode btree entry: block size + * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_ifree_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, 1) + + MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), + XFS_INODE_CLUSTER_SIZE(mp)) + + 128 * 5 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +} + +/* + * When only changing the inode we log the inode and possibly the superblock + * We also add a bit of slop for the transaction stuff. + */ +STATIC uint +xfs_calc_ichange_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + 512; + +} + +/* + * Growing the data section of the filesystem. + * superblock + * agi and agf + * allocation btrees + */ +STATIC uint +xfs_calc_growdata_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_sectsize * 3 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the first set of transactions (ALLOC) we allocate space to the + * bitmap or summary files. + * superblock: sector size + * agf of the ag from which the extent is allocated: sector size + * bmap btree for bitmap/summary inode: max depth * blocksize + * bitmap/summary inode: inode size + * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize + */ +STATIC uint +xfs_calc_growrtalloc_reservation( + struct xfs_mount *mp) +{ + return 2 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + + mp->m_sb.sb_inodesize + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the second set of transactions (ZERO) we zero the new metadata blocks. + * one bitmap/summary block: blocksize + */ +STATIC uint +xfs_calc_growrtzero_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_blocksize + 128; +} + +/* + * Growing the rt section of the filesystem. + * In the third set of transactions (FREE) we update metadata without + * allocating any new blocks. 
+ * superblock: sector size + * bitmap inode: inode size + * summary inode: inode size + * one bitmap block: blocksize + * summary blocks: new summary size + */ +STATIC uint +xfs_calc_growrtfree_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_inodesize + + mp->m_sb.sb_blocksize + + mp->m_rsumsize + + 128 * 5; +} + +/* + * Logging the inode modification timestamp on a synchronous write. + * inode + */ +STATIC uint +xfs_calc_swrite_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_inodesize + 128; +} + +/* + * Logging the inode mode bits when writing a setuid/setgid file + * inode + */ +STATIC uint +xfs_calc_writeid_reservation(xfs_mount_t *mp) +{ + return mp->m_sb.sb_inodesize + 128; +} + +/* + * Converting the inode from non-attributed to attributed. + * the inode being converted: inode size + * agf block and superblock (for block allocation) + * the new block (directory sized) + * bmap blocks for the new directory block + * allocation btrees + */ +STATIC uint +xfs_calc_addafork_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize * 2 + + mp->m_dirblksize + + XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); +} + +/* + * Removing the attribute fork of a file + * the inode being truncated: inode size + * the inode's bmap btree: max depth * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrinval_reservation( + struct xfs_mount *mp) +{ + return MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), + (4 * mp->m_sb.sb_sectsize + + 4 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 4) + + 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); +} + +/* + * Setting an attribute. + * the inode getting the attribute + * the superblock for allocations + * the agfs extents are allocated from + * the attribute btree * max depth + * the inode allocation btree + * Since attribute transaction space is dependent on the size of the attribute, + * the calculation is done partially at mount time and partially at runtime. + */ +STATIC uint +xfs_calc_attrset_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + + 128 * (2 + XFS_DA_NODE_MAXDEPTH); +} + +/* + * Removing an attribute. 
+ * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrrm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + 128 * (1 + XFS_DA_NODE_MAXDEPTH + + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); +} + +/* + * Clearing a bad agino number in an agi hash bucket. + */ +STATIC uint +xfs_calc_clear_agi_bucket_reservation( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_sectsize + 128; +} + +/* + * Initialize the precomputed transaction reservation values + * in the mount structure. + */ +void +xfs_trans_init( + struct xfs_mount *mp) +{ + struct xfs_trans_reservations *resp = &mp->m_reservations; + + resp->tr_write = xfs_calc_write_reservation(mp); + resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); + resp->tr_rename = xfs_calc_rename_reservation(mp); + resp->tr_link = xfs_calc_link_reservation(mp); + resp->tr_remove = xfs_calc_remove_reservation(mp); + resp->tr_symlink = xfs_calc_symlink_reservation(mp); + resp->tr_create = xfs_calc_create_reservation(mp); + resp->tr_mkdir = xfs_calc_mkdir_reservation(mp); + resp->tr_ifree = xfs_calc_ifree_reservation(mp); + resp->tr_ichange = xfs_calc_ichange_reservation(mp); + resp->tr_growdata = xfs_calc_growdata_reservation(mp); + resp->tr_swrite = xfs_calc_swrite_reservation(mp); + resp->tr_writeid = xfs_calc_writeid_reservation(mp); + resp->tr_addafork = xfs_calc_addafork_reservation(mp); + resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); + resp->tr_attrset = xfs_calc_attrset_reservation(mp); + resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); + resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); + resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); + resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); + resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); +} + +/* + * This routine is called to allocate a transaction structure. + * The type parameter indicates the type of the transaction. These + * are enumerated in xfs_trans.h. + * + * Dynamically allocate the transaction structure from the transaction + * zone, initialize it, and return it to the caller. + */ +xfs_trans_t * +xfs_trans_alloc( + xfs_mount_t *mp, + uint type) +{ + xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + return _xfs_trans_alloc(mp, type, KM_SLEEP); +} + +xfs_trans_t * +_xfs_trans_alloc( + xfs_mount_t *mp, + uint type, + uint memflags) +{ + xfs_trans_t *tp; + + atomic_inc(&mp->m_active_trans); + + tp = kmem_zone_zalloc(xfs_trans_zone, memflags); + tp->t_magic = XFS_TRANS_MAGIC; + tp->t_type = type; + tp->t_mountp = mp; + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); + return tp; +} + +/* + * Free the transaction structure. If there is more clean up + * to do when the structure is freed, add it here. 
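 * Note that this only releases the transaction structure itself (its busy
 * extent list, dquot info and the m_active_trans count); unlocking and
 * unpinning the attached log items is the caller's job, via
 * xfs_trans_free_items() or one of the committed paths below.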
+ */ +STATIC void +xfs_trans_free( + struct xfs_trans *tp) +{ + xfs_alloc_busy_sort(&tp->t_busy); + xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); + + atomic_dec(&tp->t_mountp->m_active_trans); + xfs_trans_free_dqinfo(tp); + kmem_zone_free(xfs_trans_zone, tp); +} + +/* + * This is called to create a new transaction which will share the + * permanent log reservation of the given transaction. The remaining + * unused block and rt extent reservations are also inherited. This + * implies that the original transaction is no longer allowed to allocate + * blocks. Locks and log items, however, are no inherited. They must + * be added to the new transaction explicitly. + */ +xfs_trans_t * +xfs_trans_dup( + xfs_trans_t *tp) +{ + xfs_trans_t *ntp; + + ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + + /* + * Initialize the new transaction structure. + */ + ntp->t_magic = XFS_TRANS_MAGIC; + ntp->t_type = tp->t_type; + ntp->t_mountp = tp->t_mountp; + INIT_LIST_HEAD(&ntp->t_items); + INIT_LIST_HEAD(&ntp->t_busy); + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(tp->t_ticket != NULL); + + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); + ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); + ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; + tp->t_blk_res = tp->t_blk_res_used; + ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; + tp->t_rtx_res = tp->t_rtx_res_used; + ntp->t_pflags = tp->t_pflags; + + xfs_trans_dup_dqinfo(tp, ntp); + + atomic_inc(&tp->t_mountp->m_active_trans); + return ntp; +} + +/* + * This is called to reserve free disk blocks and log space for the + * given transaction. This must be done before allocating any resources + * within the transaction. + * + * This will return ENOSPC if there are not enough blocks available. + * It will sleep waiting for available log space. + * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which + * is used by long running transactions. If any one of the reservations + * fails then they will all be backed out. + * + * This does not do quota reservations. That typically is done by the + * caller afterwards. + */ +int +xfs_trans_reserve( + xfs_trans_t *tp, + uint blocks, + uint logspace, + uint rtextents, + uint flags, + uint logcount) +{ + int log_flags; + int error = 0; + int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + + /* Mark this thread as being in a transaction */ + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + + /* + * Attempt to reserve the needed disk blocks by decrementing + * the number needed from the number available. This will + * fail if the count would go below zero. + */ + if (blocks > 0) { + error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, + -((int64_t)blocks), rsvd); + if (error != 0) { + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + return (XFS_ERROR(ENOSPC)); + } + tp->t_blk_res += blocks; + } + + /* + * Reserve the log space needed for this transaction. 
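	 * A permanent reservation (XFS_TRANS_PERM_LOG_RES maps to
	 * XFS_LOG_PERM_RESERV below) can be re-used by the following
	 * transactions of a rolling sequence (see xfs_trans_dup() and
	 * xfs_trans_roll()); an ordinary reservation is returned to the
	 * log when this transaction commits.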
+ */ + if (logspace > 0) { + ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace)); + ASSERT((tp->t_log_count == 0) || + (tp->t_log_count == logcount)); + if (flags & XFS_TRANS_PERM_LOG_RES) { + log_flags = XFS_LOG_PERM_RESERV; + tp->t_flags |= XFS_TRANS_PERM_LOG_RES; + } else { + ASSERT(tp->t_ticket == NULL); + ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + log_flags = 0; + } + + error = xfs_log_reserve(tp->t_mountp, logspace, logcount, + &tp->t_ticket, + XFS_TRANSACTION, log_flags, tp->t_type); + if (error) { + goto undo_blocks; + } + tp->t_log_res = logspace; + tp->t_log_count = logcount; + } + + /* + * Attempt to reserve the needed realtime extents by decrementing + * the number needed from the number available. This will + * fail if the count would go below zero. + */ + if (rtextents > 0) { + error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, + -((int64_t)rtextents), rsvd); + if (error) { + error = XFS_ERROR(ENOSPC); + goto undo_log; + } + tp->t_rtx_res += rtextents; + } + + return 0; + + /* + * Error cases jump to one of these labels to undo any + * reservations which have already been performed. + */ +undo_log: + if (logspace > 0) { + if (flags & XFS_TRANS_PERM_LOG_RES) { + log_flags = XFS_LOG_REL_PERM_RESERV; + } else { + log_flags = 0; + } + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + tp->t_ticket = NULL; + tp->t_log_res = 0; + tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; + } + +undo_blocks: + if (blocks > 0) { + xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, + (int64_t)blocks, rsvd); + tp->t_blk_res = 0; + } + + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + return error; +} + +/* + * Record the indicated change to the given field for application + * to the file system's superblock when the transaction commits. + * For now, just store the change in the transaction structure. + * + * Mark the transaction structure to indicate that the superblock + * needs to be updated before committing. + * + * Because we may not be keeping track of allocated/free inodes and + * used filesystem blocks in the superblock, we do not mark the + * superblock dirty in this transaction if we modify these fields. + * We still need to update the transaction deltas so that they get + * applied to the incore superblock, but we don't want them to + * cause the superblock to get locked and logged if these are the + * only fields in the superblock that the transaction modifies. + */ +void +xfs_trans_mod_sb( + xfs_trans_t *tp, + uint field, + int64_t delta) +{ + uint32_t flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY); + xfs_mount_t *mp = tp->t_mountp; + + switch (field) { + case XFS_TRANS_SB_ICOUNT: + tp->t_icount_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_IFREE: + tp->t_ifree_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_FDBLOCKS: + /* + * Track the number of blocks allocated in the + * transaction. Make sure it does not exceed the + * number reserved. + */ + if (delta < 0) { + tp->t_blk_res_used += (uint)-delta; + ASSERT(tp->t_blk_res_used <= tp->t_blk_res); + } + tp->t_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_RES_FDBLOCKS: + /* + * The allocation has already been applied to the + * in-core superblock's counter. This should only + * be applied to the on-disk superblock. 
+ */ + ASSERT(delta < 0); + tp->t_res_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; + break; + case XFS_TRANS_SB_FREXTENTS: + /* + * Track the number of blocks allocated in the + * transaction. Make sure it does not exceed the + * number reserved. + */ + if (delta < 0) { + tp->t_rtx_res_used += (uint)-delta; + ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res); + } + tp->t_frextents_delta += delta; + break; + case XFS_TRANS_SB_RES_FREXTENTS: + /* + * The allocation has already been applied to the + * in-core superblock's counter. This should only + * be applied to the on-disk superblock. + */ + ASSERT(delta < 0); + tp->t_res_frextents_delta += delta; + break; + case XFS_TRANS_SB_DBLOCKS: + ASSERT(delta > 0); + tp->t_dblocks_delta += delta; + break; + case XFS_TRANS_SB_AGCOUNT: + ASSERT(delta > 0); + tp->t_agcount_delta += delta; + break; + case XFS_TRANS_SB_IMAXPCT: + tp->t_imaxpct_delta += delta; + break; + case XFS_TRANS_SB_REXTSIZE: + tp->t_rextsize_delta += delta; + break; + case XFS_TRANS_SB_RBMBLOCKS: + tp->t_rbmblocks_delta += delta; + break; + case XFS_TRANS_SB_RBLOCKS: + tp->t_rblocks_delta += delta; + break; + case XFS_TRANS_SB_REXTENTS: + tp->t_rextents_delta += delta; + break; + case XFS_TRANS_SB_REXTSLOG: + tp->t_rextslog_delta += delta; + break; + default: + ASSERT(0); + return; + } + + tp->t_flags |= flags; +} + +/* + * xfs_trans_apply_sb_deltas() is called from the commit code + * to bring the superblock buffer into the current transaction + * and modify it as requested by earlier calls to xfs_trans_mod_sb(). + * + * For now we just look at each field allowed to change and change + * it if necessary. + */ +STATIC void +xfs_trans_apply_sb_deltas( + xfs_trans_t *tp) +{ + xfs_dsb_t *sbp; + xfs_buf_t *bp; + int whole = 0; + + bp = xfs_trans_getsb(tp, tp->t_mountp, 0); + sbp = XFS_BUF_TO_SBP(bp); + + /* + * Check that superblock mods match the mods made to AGF counters. 
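	 * That is, every block this transaction added to or removed from
	 * the superblock free-block count (t_fdblocks_delta plus
	 * t_res_fdblocks_delta) must be accounted for by the per-AG
	 * freelist, free-space and btree deltas recorded while the AGF
	 * buffers were being modified.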
+ */ + ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == + (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + + tp->t_ag_btree_delta)); + + /* + * Only update the superblock counters if we are logging them + */ + if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) { + if (tp->t_icount_delta) + be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); + if (tp->t_ifree_delta) + be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta); + if (tp->t_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta); + if (tp->t_res_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); + } + + if (tp->t_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); + if (tp->t_res_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); + + if (tp->t_dblocks_delta) { + be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); + whole = 1; + } + if (tp->t_agcount_delta) { + be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta); + whole = 1; + } + if (tp->t_imaxpct_delta) { + sbp->sb_imax_pct += tp->t_imaxpct_delta; + whole = 1; + } + if (tp->t_rextsize_delta) { + be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta); + whole = 1; + } + if (tp->t_rbmblocks_delta) { + be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta); + whole = 1; + } + if (tp->t_rblocks_delta) { + be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); + whole = 1; + } + if (tp->t_rextents_delta) { + be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta); + whole = 1; + } + if (tp->t_rextslog_delta) { + sbp->sb_rextslog += tp->t_rextslog_delta; + whole = 1; + } + + if (whole) + /* + * Log the whole thing, the fields are noncontiguous. + */ + xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1); + else + /* + * Since all the modifiable fields are contiguous, we + * can get away with this. + */ + xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), + offsetof(xfs_dsb_t, sb_frextents) + + sizeof(sbp->sb_frextents) - 1); +} + +/* + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations + * and apply superblock counter changes to the in-core superblock. The + * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT + * applied to the in-core superblock. The idea is that that has already been + * done. + * + * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). + * However, we have to ensure that we only modify each superblock field only + * once because the application of the delta values may not be atomic. That can + * lead to ENOSPC races occurring if we have two separate modifcations of the + * free space counter to put back the entire reservation and then take away + * what we used. + * + * If we are not logging superblock counters, then the inode allocated/free and + * used block counts are not updated in the on disk superblock. In this case, + * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we + * still need to update the incore superblock with the changes. 
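 * As a worked example: a transaction that reserved 16 blocks
 * (t_blk_res == 16) and actually allocated 10 of them carries
 * t_fdblocks_delta == -10, so the code below applies a single change of
 * 16 + (-10) == +6 to the free-block counter, rather than giving back 16
 * and then taking 10 away, which is exactly the transient ENOSPC window
 * described above. (Numbers are illustrative only.)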
+ */ +void +xfs_trans_unreserve_and_mod_sb( + xfs_trans_t *tp) +{ + xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ + xfs_mod_sb_t *msbp; + xfs_mount_t *mp = tp->t_mountp; + /* REFERENCED */ + int error; + int rsvd; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; + int64_t idelta = 0; + int64_t ifreedelta = 0; + + msbp = msb; + rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + + /* calculate deltas */ + if (tp->t_blk_res > 0) + blkdelta = tp->t_blk_res; + if ((tp->t_fdblocks_delta != 0) && + (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY))) + blkdelta += tp->t_fdblocks_delta; + + if (tp->t_rtx_res > 0) + rtxdelta = tp->t_rtx_res; + if ((tp->t_frextents_delta != 0) && + (tp->t_flags & XFS_TRANS_SB_DIRTY)) + rtxdelta += tp->t_frextents_delta; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY)) { + idelta = tp->t_icount_delta; + ifreedelta = tp->t_ifree_delta; + } + + /* apply the per-cpu counters */ + if (blkdelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + blkdelta, rsvd); + if (error) + goto out; + } + + if (idelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, + idelta, rsvd); + if (error) + goto out_undo_fdblocks; + } + + if (ifreedelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, + ifreedelta, rsvd); + if (error) + goto out_undo_icount; + } + + /* apply remaining deltas */ + if (rtxdelta != 0) { + msbp->msb_field = XFS_SBS_FREXTENTS; + msbp->msb_delta = rtxdelta; + msbp++; + } + + if (tp->t_flags & XFS_TRANS_SB_DIRTY) { + if (tp->t_dblocks_delta != 0) { + msbp->msb_field = XFS_SBS_DBLOCKS; + msbp->msb_delta = tp->t_dblocks_delta; + msbp++; + } + if (tp->t_agcount_delta != 0) { + msbp->msb_field = XFS_SBS_AGCOUNT; + msbp->msb_delta = tp->t_agcount_delta; + msbp++; + } + if (tp->t_imaxpct_delta != 0) { + msbp->msb_field = XFS_SBS_IMAX_PCT; + msbp->msb_delta = tp->t_imaxpct_delta; + msbp++; + } + if (tp->t_rextsize_delta != 0) { + msbp->msb_field = XFS_SBS_REXTSIZE; + msbp->msb_delta = tp->t_rextsize_delta; + msbp++; + } + if (tp->t_rbmblocks_delta != 0) { + msbp->msb_field = XFS_SBS_RBMBLOCKS; + msbp->msb_delta = tp->t_rbmblocks_delta; + msbp++; + } + if (tp->t_rblocks_delta != 0) { + msbp->msb_field = XFS_SBS_RBLOCKS; + msbp->msb_delta = tp->t_rblocks_delta; + msbp++; + } + if (tp->t_rextents_delta != 0) { + msbp->msb_field = XFS_SBS_REXTENTS; + msbp->msb_delta = tp->t_rextents_delta; + msbp++; + } + if (tp->t_rextslog_delta != 0) { + msbp->msb_field = XFS_SBS_REXTSLOG; + msbp->msb_delta = tp->t_rextslog_delta; + msbp++; + } + } + + /* + * If we need to change anything, do it. + */ + if (msbp > msb) { + error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, + (uint)(msbp - msb), rsvd); + if (error) + goto out_undo_ifreecount; + } + + return; + +out_undo_ifreecount: + if (ifreedelta) + xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); +out_undo_icount: + if (idelta) + xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); +out_undo_fdblocks: + if (blkdelta) + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); +out: + ASSERT(error == 0); + return; +} + +/* + * Add the given log item to the transaction's list of log items. + * + * The log item will now point to its new descriptor with its li_desc field. 
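 * The usual callers are the join helpers, e.g. xfs_trans_ijoin() and
 * xfs_trans_bjoin(), which attach an inode or buffer log item to the
 * transaction.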
+ */ +void +xfs_trans_add_item( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_log_item_desc *lidp; + + ASSERT(lip->li_mountp = tp->t_mountp); + ASSERT(lip->li_ailp = tp->t_mountp->m_ail); + + lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); + + lidp->lid_item = lip; + lidp->lid_flags = 0; + lidp->lid_size = 0; + list_add_tail(&lidp->lid_trans, &tp->t_items); + + lip->li_desc = lidp; +} + +STATIC void +xfs_trans_free_item_desc( + struct xfs_log_item_desc *lidp) +{ + list_del_init(&lidp->lid_trans); + kmem_zone_free(xfs_log_item_desc_zone, lidp); +} + +/* + * Unlink and free the given descriptor. + */ +void +xfs_trans_del_item( + struct xfs_log_item *lip) +{ + xfs_trans_free_item_desc(lip->li_desc); + lip->li_desc = NULL; +} + +/* + * Unlock all of the items of a transaction and free all the descriptors + * of that transaction. + */ +void +xfs_trans_free_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn, + int flags) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + IOP_COMMITTING(lip, commit_lsn); + if (flags & XFS_TRANS_ABORT) + lip->li_flags |= XFS_LI_ABORTED; + IOP_UNLOCK(lip); + + xfs_trans_free_item_desc(lidp); + } +} + +/* + * Unlock the items associated with a transaction. + * + * Items which were not logged should be freed. Those which were logged must + * still be tracked so they can be unpinned when the transaction commits. + */ +STATIC void +xfs_trans_unlock_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + IOP_COMMITTING(lip, commit_lsn); + IOP_UNLOCK(lip); + + /* + * Free the descriptor if the item is not dirty + * within this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + xfs_trans_free_item_desc(lidp); + } +} + +/* + * Total up the number of log iovecs needed to commit this + * transaction. The transaction itself needs one for the + * transaction header. Ask each dirty item in turn how many + * it needs to get the total. + */ +static uint +xfs_trans_count_vecs( + struct xfs_trans *tp) +{ + int nvecs; + struct xfs_log_item_desc *lidp; + + nvecs = 1; + + /* In the non-debug case we need to start bailing out if we + * didn't find a log_item here, return zero and let trans_commit + * deal with it. + */ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return 0; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + /* + * Skip items which aren't dirty in this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + lidp->lid_size = IOP_SIZE(lidp->lid_item); + nvecs += lidp->lid_size; + } + + return nvecs; +} + +/* + * Fill in the vector with pointers to data to be logged + * by this transaction. The transaction header takes + * the first vector, and then each dirty item takes the + * number of vectors it indicated it needed in xfs_trans_count_vecs(). + * + * As each item fills in the entries it needs, also pin the item + * so that it cannot be flushed out until the log write completes. 
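 * As an illustration, if a transaction dirtied two items reporting sizes
 * of 2 and 3 from IOP_SIZE(), xfs_trans_count_vecs() above returns
 * 1 + 2 + 3 == 6 and the layout built here is:
 *	log_vector[0]		the transaction header
 *	log_vector[1..2]	the first item's regions
 *	log_vector[3..5]	the second item's regions
 * (The item sizes are made up; real values come from each item's
 * IOP_SIZE() implementation.)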
+ */ +static void +xfs_trans_fill_vecs( + struct xfs_trans *tp, + struct xfs_log_iovec *log_vector) +{ + struct xfs_log_item_desc *lidp; + struct xfs_log_iovec *vecp; + uint nitems; + + /* + * Skip over the entry for the transaction header, we'll + * fill that in at the end. + */ + vecp = log_vector + 1; + + nitems = 0; + ASSERT(!list_empty(&tp->t_items)); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* + * The item may be marked dirty but not log anything. This can + * be used to get called when a transaction is committed. + */ + if (lidp->lid_size) + nitems++; + IOP_FORMAT(lidp->lid_item, vecp); + vecp += lidp->lid_size; + IOP_PIN(lidp->lid_item); + } + + /* + * Now that we've counted the number of items in this transaction, fill + * in the transaction header. Note that the transaction header does not + * have a log item. + */ + tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_header.th_type = tp->t_type; + tp->t_header.th_num_items = nitems; + log_vector->i_addr = (xfs_caddr_t)&tp->t_header; + log_vector->i_len = sizeof(xfs_trans_header_t); + log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; +} + +/* + * The committed item processing consists of calling the committed routine of + * each logged item, updating the item's position in the AIL if necessary, and + * unpinning each item. If the committed routine returns -1, then do nothing + * further with the item because it may have been freed. + * + * Since items are unlocked when they are copied to the incore log, it is + * possible for two transactions to be completing and manipulating the same + * item simultaneously. The AIL lock will protect the lsn field of each item. + * The value of this field can never go backwards. + * + * We unpin the items after repositioning them in the AIL, because otherwise + * they could be immediately flushed and we'd have to race with the flusher + * trying to pull the item from the AIL as we add it. + */ +static void +xfs_trans_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, + int aborted) +{ + xfs_lsn_t item_lsn; + struct xfs_ail *ailp; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* item_lsn of -1 means the item needs no further processing */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + return; + + /* + * If the returned lsn is greater than what it contained before, update + * the location of the item in the AIL. If it is not, then do nothing. + * Items can never move backwards in the AIL. + * + * While the new lsn should usually be greater, it is possible that a + * later transaction completing simultaneously with an earlier one + * using the same item could complete first with a higher lsn. This + * would cause the earlier transaction to fail the test below. + */ + ailp = lip->li_ailp; + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { + /* + * This will set the item's lsn to item_lsn and update the + * position of the item in the AIL. + * + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(ailp, lip, item_lsn); + } else { + spin_unlock(&ailp->xa_lock); + } + + /* + * Now that we've repositioned the item in the AIL, unpin it so it can + * be flushed. Pass information about buffer stale state down from the + * log item flags, if anyone else stales the buffer we do not want to + * pay any attention to it. 
+ */ + IOP_UNPIN(lip, 0); +} + +/* + * This is typically called by the LM when a transaction has been fully + * committed to disk. It needs to unpin the items which have + * been logged by the transaction and update their positions + * in the AIL if necessary. + * + * This also gets called when the transactions didn't get written out + * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. + */ +STATIC void +xfs_trans_committed( + void *arg, + int abortflag) +{ + struct xfs_trans *tp = arg; + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); + xfs_trans_free_item_desc(lidp); + } + + xfs_trans_free(tp); +} + +static inline void +xfs_log_item_batch_insert( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t commit_lsn) +{ + int i; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ + xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); + + for (i = 0; i < nr_items; i++) + IOP_UNPIN(log_items[i], 0); +} + +/* + * Bulk operation version of xfs_trans_committed that takes a log vector of + * items to insert into the AIL. This uses bulk AIL insertion techniques to + * minimise lock traffic. + * + * If we are called with the aborted flag set, it is because a log write during + * a CIL checkpoint commit has failed. In this case, all the items in the + * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which + * means that checkpoint commit abort handling is treated exactly the same + * as an iclog write error even though we haven't started any IO yet. Hence in + * this case all we need to do is IOP_COMMITTED processing, followed by an + * IOP_UNPIN(aborted) call. + * + * The AIL cursor is used to optimise the insert process. If commit_lsn is not + * at the end of the AIL, the insert cursor avoids the need to walk + * the AIL to find the insertion point on every xfs_log_item_batch_insert() + * call. This saves a lot of needless list walking and is a net win, even + * though it slightly increases that amount of AIL lock traffic to set it up + * and tear it down. + */ +void +xfs_trans_committed_bulk( + struct xfs_ail *ailp, + struct xfs_log_vec *log_vector, + xfs_lsn_t commit_lsn, + int aborted) +{ +#define LOG_ITEM_BATCH_SIZE 32 + struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; + struct xfs_log_vec *lv; + struct xfs_ail_cursor cur; + int i = 0; + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); + spin_unlock(&ailp->xa_lock); + + /* unpin all the log items */ + for (lv = log_vector; lv; lv = lv->lv_next ) { + struct xfs_log_item *lip = lv->lv_item; + xfs_lsn_t item_lsn; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* item_lsn of -1 means the item needs no further processing */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + continue; + + /* + * if we are aborting the operation, no point in inserting the + * object into the AIL as we are in a shutdown situation. + */ + if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); + IOP_UNPIN(lip, 1); + continue; + } + + if (item_lsn != commit_lsn) { + + /* + * Not a bulk update option due to unusual item_lsn. + * Push into AIL immediately, rechecking the lsn once + * we have the ail lock. Then unpin the item. This does + * not affect the AIL cursor the bulk insert path is + * using. 
+ */ + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) + xfs_trans_ail_update(ailp, lip, item_lsn); + else + spin_unlock(&ailp->xa_lock); + IOP_UNPIN(lip, 0); + continue; + } + + /* Item is a candidate for bulk AIL insert. */ + log_items[i++] = lv->lv_item; + if (i >= LOG_ITEM_BATCH_SIZE) { + xfs_log_item_batch_insert(ailp, &cur, log_items, + LOG_ITEM_BATCH_SIZE, commit_lsn); + i = 0; + } + } + + /* make sure we insert the remainder! */ + if (i) + xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); +} + +/* + * Called from the trans_commit code when we notice that the filesystem is in + * the middle of a forced shutdown. + * + * When we are called here, we have already pinned all the items in the + * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called + * so we can simply walk the items in the transaction, unpin them with an abort + * flag and then free the items. Note that unpinning the items can result in + * them being freed immediately, so we need to use a safe list traversal method + * here. + */ +STATIC void +xfs_trans_uncommit( + struct xfs_trans *tp, + uint flags) +{ + struct xfs_log_item_desc *lidp, *n; + + list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) { + if (lidp->lid_flags & XFS_LID_DIRTY) + IOP_UNPIN(lidp->lid_item, 1); + } + + xfs_trans_unreserve_and_mod_sb(tp); + xfs_trans_unreserve_and_mod_dquots(tp); + + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free(tp); +} + +/* + * Format the transaction direct to the iclog. This isolates the physical + * transaction commit operation from the logical operation and hence allows + * other methods to be introduced without affecting the existing commit path. + */ +static int +xfs_trans_commit_iclog( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + int shutdown; + int error; + int log_flags = 0; + struct xlog_in_core *commit_iclog; +#define XFS_TRANS_LOGVEC_COUNT 16 + struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; + struct xfs_log_iovec *log_vector; + uint nvec; + + + /* + * Ask each log item how many log_vector entries it will + * need so we can figure out how many to allocate. + * Try to avoid the kmem_alloc() call in the common case + * by using a vector from the stack when it fits. + */ + nvec = xfs_trans_count_vecs(tp); + if (nvec == 0) { + return ENOMEM; /* triggers a shutdown! */ + } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { + log_vector = log_vector_fast; + } else { + log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec * + sizeof(xfs_log_iovec_t), + KM_SLEEP); + } + + /* + * Fill in the log_vector and pin the logged items, and + * then write the transaction to the log. + */ + xfs_trans_fill_vecs(tp, log_vector); + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); + + /* + * The transaction is committed incore here, and can go out to disk + * at any time after this call. However, all the items associated + * with the transaction are still locked and pinned in memory. + */ + *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); + + tp->t_commit_lsn = *commit_lsn; + trace_xfs_trans_commit_lsn(tp); + + if (nvec > XFS_TRANS_LOGVEC_COUNT) + kmem_free(log_vector); + + /* + * If we got a log write error. 
Unpin the logitems that we + * had pinned, clean up, free trans structure, and return error. + */ + if (error || *commit_lsn == -1) { + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); + return XFS_ERROR(EIO); + } + + /* + * Once the transaction has committed, unused + * reservations need to be released and changes to + * the superblock need to be reflected in the in-core + * version. Do that now. + */ + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Tell the LM to call the transaction completion routine + * when the log write with LSN commit_lsn completes (e.g. + * when the transaction commit really hits the on-disk log). + * After this call we cannot reference tp, because the call + * can happen at any time and the call will free the transaction + * structure pointed to by tp. The only case where we call + * the completion routine (xfs_trans_committed) directly is + * if the log is turned off on a debug kernel or we're + * running in simulation mode (the log is explicitly turned + * off). + */ + tp->t_logcb.cb_func = xfs_trans_committed; + tp->t_logcb.cb_arg = tp; + + /* + * We need to pass the iclog buffer which was used for the + * transaction commit record into this function, and attach + * the callback to it. The callback must be attached before + * the items are unlocked to avoid racing with other threads + * waiting for an item to unlock. + */ + shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb)); + + /* + * Mark this thread as no longer being in a transaction + */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + /* + * Once all the items of the transaction have been copied + * to the in core log and the callback is attached, the + * items can be unlocked. + * + * This will free descriptors pointing to items which were + * not logged since there is nothing more to do with them. + * For items which were logged, we will keep pointers to them + * so they can be unpinned after the transaction commits to disk. + * This will also stamp each modified meta-data item with + * the commit lsn of this transaction for dependency tracking + * purposes. + */ + xfs_trans_unlock_items(tp, *commit_lsn); + + /* + * If we detected a log error earlier, finish committing + * the transaction now (unpin log items, etc). + * + * Order is critical here, to avoid using the transaction + * pointer after its been freed (by xfs_trans_committed + * either here now, or as a callback). We cannot do this + * step inside xfs_log_notify as was done earlier because + * of this issue. + */ + if (shutdown) + xfs_trans_committed(tp, XFS_LI_ABORTED); + + /* + * Now that the xfs_trans_committed callback has been attached, + * and the items are released we can finally allow the iclog to + * go to disk. + */ + return xfs_log_release_iclog(mp, commit_iclog); +} + +/* + * Walk the log items and allocate log vector structures for + * each item large enough to fit all the vectors they require. + * Note that this format differs from the old log vector format in + * that there is no transaction header in these log vectors. + */ +STATIC struct xfs_log_vec * +xfs_trans_alloc_log_vecs( + xfs_trans_t *tp) +{ + struct xfs_log_item_desc *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; + + + /* Bail out if we didn't find a log item. 
*/ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return NULL; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_vec *new_lv; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* Skip items that do not have any vectors for writing */ + lidp->lid_size = IOP_SIZE(lidp->lid_item); + if (!lidp->lid_size) + continue; + + new_lv = kmem_zalloc(sizeof(*new_lv) + + lidp->lid_size * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = lidp->lid_size; + new_lv->lv_item = lidp->lid_item; + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; + } + + return ret_lv; +} + +static int +xfs_trans_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct xfs_log_vec *log_vector; + + /* + * Get each log item to allocate a vector structure for + * the log item to to pass to the log write code. The + * CIL commit code will format the vector and save it away. + */ + log_vector = xfs_trans_alloc_log_vecs(tp); + if (!log_vector) + return ENOMEM; + + xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); + + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free(tp); + return 0; +} + +/* + * xfs_trans_commit + * + * Commit the given transaction to the log a/synchronously. + * + * XFS disk error handling mechanism is not based on a typical + * transaction abort mechanism. Logically after the filesystem + * gets marked 'SHUTDOWN', we can't let any new transactions + * be durable - ie. committed to disk - because some metadata might + * be inconsistent. In such cases, this returns an error, and the + * caller may assume that all locked objects joined to the transaction + * have already been unlocked as if the commit had succeeded. + * Do not reference the transaction structure after this call. + */ +int +_xfs_trans_commit( + struct xfs_trans *tp, + uint flags, + int *log_flushed) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_lsn_t commit_lsn = -1; + int error = 0; + int log_flags = 0; + int sync = tp->t_flags & XFS_TRANS_SYNC; + + /* + * Determine whether this commit is releasing a permanent + * log reservation or not. + */ + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } + + /* + * If there is nothing to be logged by the transaction, + * then unlock all of the items associated with the + * transaction and free the transaction structure. + * Also make sure to return any reserved blocks to + * the free pool. + */ + if (!(tp->t_flags & XFS_TRANS_DIRTY)) + goto out_unreserve; + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + ASSERT(tp->t_ticket != NULL); + + /* + * If we need to update the superblock, then do it now. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); + + if (mp->m_flags & XFS_MOUNT_DELAYLOG) + error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); + else + error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + + if (error == ENOMEM) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + /* + * If the transaction needs to be synchronous, then force the + * log out now and wait for it. 
+ */ + if (sync) { + if (!error) { + error = _xfs_log_force_lsn(mp, commit_lsn, + XFS_LOG_SYNC, log_flushed); + } + XFS_STATS_INC(xs_trans_sync); + } else { + XFS_STATS_INC(xs_trans_async); + } + + return error; + +out_unreserve: + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * It is indeed possible for the transaction to be not dirty but + * the dqinfo portion to be. All that means is that we have some + * (non-persistent) quota reservations that need to be unreserved. + */ + xfs_trans_unreserve_and_mod_dquots(tp); + if (tp->t_ticket) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + if (commit_lsn == -1 && !error) + error = XFS_ERROR(EIO); + } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free(tp); + + XFS_STATS_INC(xs_trans_empty); + return error; +} + +/* + * Unlock all of the transaction's items and free the transaction. + * The transaction must not have modified any of its items, because + * there is no way to restore them to their previous state. + * + * If the transaction has made a log reservation, make sure to release + * it as well. + */ +void +xfs_trans_cancel( + xfs_trans_t *tp, + int flags) +{ + int log_flags; + xfs_mount_t *mp = tp->t_mountp; + + /* + * See if the caller is being too lazy to figure out if + * the transaction really needs an abort. + */ + if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY)) + flags &= ~XFS_TRANS_ABORT; + /* + * See if the caller is relying on us to shut down the + * filesystem. This happens in paths where we detect + * corruption and decide to give up. + */ + if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { + XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } +#ifdef DEBUG + if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) + ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD)); + } +#endif + xfs_trans_unreserve_and_mod_sb(tp); + xfs_trans_unreserve_and_mod_dquots(tp); + + if (tp->t_ticket) { + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } else { + log_flags = 0; + } + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + } + + /* mark this thread as no longer being in a transaction */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free(tp); +} + +/* + * Roll from one trans in the sequence of PERMANENT transactions to + * the next: permanent transactions are only flushed out when + * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * as possible to let chunks of it go to the log. So we commit the + * chunk we've been working on and get a new transaction to continue. + */ +int +xfs_trans_roll( + struct xfs_trans **tpp, + struct xfs_inode *dp) +{ + struct xfs_trans *trans; + unsigned int logres, count; + int error; + + /* + * Ensure that the inode is always logged. + */ + trans = *tpp; + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + + /* + * Copy the critical parameters from one trans to the next. + */ + logres = trans->t_log_res; + count = trans->t_log_count; + *tpp = xfs_trans_dup(trans); + + /* + * Commit the current transaction. + * If this commit failed, then it'd just unlock those items that + * are not marked ihold. 
That also means that a filesystem shutdown + * is in progress. The caller takes the responsibility to cancel + * the duplicate transaction that gets returned. + */ + error = xfs_trans_commit(trans, 0); + if (error) + return (error); + + trans = *tpp; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(trans->t_ticket); + + + /* + * Reserve space in the log for th next transaction. + * This also pushes items in the "AIL", the list of logged items, + * out to disk if they are taking up space at the tail of the log + * that we want to use. This requires that either nothing be locked + * across this call, or that anything that is locked be logged in + * the prior and the next transactions. + */ + error = xfs_trans_reserve(trans, 0, logres, 0, + XFS_TRANS_PERM_LOG_RES, count); + /* + * Ensure that the inode is in the new transaction and locked. + */ + if (error) + return error; + + xfs_trans_ijoin(trans, dp); + return 0; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h new file mode 100644 index 00000000..53597f4d --- /dev/null +++ b/fs/xfs/xfs_trans.h @@ -0,0 +1,506 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_H__ +#define __XFS_TRANS_H__ + +struct xfs_log_item; + +/* + * This is the structure written in the log at the head of + * every transaction. It identifies the type and id of the + * transaction, and contains the number of items logged by + * the transaction so we know how many to expect during recovery. + * + * Do not change the below structure without redoing the code in + * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). + */ +typedef struct xfs_trans_header { + uint th_magic; /* magic number */ + uint th_type; /* transaction type */ + __int32_t th_tid; /* transaction id (unused) */ + uint th_num_items; /* num items logged by trans */ +} xfs_trans_header_t; + +#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ + +/* + * Log item types. + */ +#define XFS_LI_EFI 0x1236 +#define XFS_LI_EFD 0x1237 +#define XFS_LI_IUNLINK 0x1238 +#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ +#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ +#define XFS_LI_DQUOT 0x123d +#define XFS_LI_QUOTAOFF 0x123e + +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" } + +/* + * Transaction types. Used to distinguish types of buffers. 
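+ *
+ * For example (illustrative only), the type is chosen when the
+ * transaction is allocated:
+ *
+ *	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+ *
+ * and is recorded in tp->t_type, so that tools such as xfs_logprint(8)
+ * (referenced below) can put a name to the operation that produced a
+ * given set of log items.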
+ */ +#define XFS_TRANS_SETATTR_NOT_SIZE 1 +#define XFS_TRANS_SETATTR_SIZE 2 +#define XFS_TRANS_INACTIVE 3 +#define XFS_TRANS_CREATE 4 +#define XFS_TRANS_CREATE_TRUNC 5 +#define XFS_TRANS_TRUNCATE_FILE 6 +#define XFS_TRANS_REMOVE 7 +#define XFS_TRANS_LINK 8 +#define XFS_TRANS_RENAME 9 +#define XFS_TRANS_MKDIR 10 +#define XFS_TRANS_RMDIR 11 +#define XFS_TRANS_SYMLINK 12 +#define XFS_TRANS_SET_DMATTRS 13 +#define XFS_TRANS_GROWFS 14 +#define XFS_TRANS_STRAT_WRITE 15 +#define XFS_TRANS_DIOSTRAT 16 +/* 17 was XFS_TRANS_WRITE_SYNC */ +#define XFS_TRANS_WRITEID 18 +#define XFS_TRANS_ADDAFORK 19 +#define XFS_TRANS_ATTRINVAL 20 +#define XFS_TRANS_ATRUNCATE 21 +#define XFS_TRANS_ATTR_SET 22 +#define XFS_TRANS_ATTR_RM 23 +#define XFS_TRANS_ATTR_FLAG 24 +#define XFS_TRANS_CLEAR_AGI_BUCKET 25 +#define XFS_TRANS_QM_SBCHANGE 26 +/* + * Dummy entries since we use the transaction type to index into the + * trans_type[] in xlog_recover_print_trans_head() + */ +#define XFS_TRANS_DUMMY1 27 +#define XFS_TRANS_DUMMY2 28 +#define XFS_TRANS_QM_QUOTAOFF 29 +#define XFS_TRANS_QM_DQALLOC 30 +#define XFS_TRANS_QM_SETQLIM 31 +#define XFS_TRANS_QM_DQCLUSTER 32 +#define XFS_TRANS_QM_QINOCREATE 33 +#define XFS_TRANS_QM_QUOTAOFF_END 34 +#define XFS_TRANS_SB_UNIT 35 +#define XFS_TRANS_FSYNC_TS 36 +#define XFS_TRANS_GROWFSRT_ALLOC 37 +#define XFS_TRANS_GROWFSRT_ZERO 38 +#define XFS_TRANS_GROWFSRT_FREE 39 +#define XFS_TRANS_SWAPEXT 40 +#define XFS_TRANS_SB_COUNT 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_TYPE_MAX 42 +/* new transaction types need to be reflected in xfs_logprint(8) */ + +#define XFS_TRANS_TYPES \ + { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ + { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ + { XFS_TRANS_INACTIVE, "INACTIVE" }, \ + { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ + { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ + { XFS_TRANS_REMOVE, "REMOVE" }, \ + { XFS_TRANS_LINK, "LINK" }, \ + { XFS_TRANS_RENAME, "RENAME" }, \ + { XFS_TRANS_MKDIR, "MKDIR" }, \ + { XFS_TRANS_RMDIR, "RMDIR" }, \ + { XFS_TRANS_SYMLINK, "SYMLINK" }, \ + { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ + { XFS_TRANS_GROWFS, "GROWFS" }, \ + { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ + { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ + { XFS_TRANS_WRITEID, "WRITEID" }, \ + { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ + { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ + { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ + { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ + { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ + { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ + { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ + { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ + { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ + { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ + { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ + { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ + { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ + { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ + { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ + { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ + { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ + { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ + { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ + { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ + { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } + +/* + * This structure is used to track log items associated with + * a transaction. 
It points to the log item and keeps some + * flags to track the state of the log item. It also tracks + * the amount of space needed to log the item it describes + * once we get to commit processing (see xfs_trans_commit()). + */ +struct xfs_log_item_desc { + struct xfs_log_item *lid_item; + ushort lid_size; + unsigned char lid_flags; + struct list_head lid_trans; +}; + +#define XFS_LID_DIRTY 0x1 + +#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ +/* + * Values for t_flags. + */ +#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ +#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ +#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ +#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ +#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ +#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ + +/* + * Values for call flags parameter. + */ +#define XFS_TRANS_RELEASE_LOG_RES 0x4 +#define XFS_TRANS_ABORT 0x8 + +/* + * Field values for xfs_trans_mod_sb. + */ +#define XFS_TRANS_SB_ICOUNT 0x00000001 +#define XFS_TRANS_SB_IFREE 0x00000002 +#define XFS_TRANS_SB_FDBLOCKS 0x00000004 +#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 +#define XFS_TRANS_SB_FREXTENTS 0x00000010 +#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 +#define XFS_TRANS_SB_DBLOCKS 0x00000040 +#define XFS_TRANS_SB_AGCOUNT 0x00000080 +#define XFS_TRANS_SB_IMAXPCT 0x00000100 +#define XFS_TRANS_SB_REXTSIZE 0x00000200 +#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 +#define XFS_TRANS_SB_RBLOCKS 0x00000800 +#define XFS_TRANS_SB_REXTENTS 0x00001000 +#define XFS_TRANS_SB_REXTSLOG 0x00002000 + + +/* + * Per-extent log reservation for the allocation btree changes + * involved in freeing or allocating an extent. + * 2 trees * (2 blocks/level * max depth - 1) * block size + */ +#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) +#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ + ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + +/* + * Per-directory log reservation for any directory change. + * dir blocks: (1 btree block per level + data block + free block) * dblock size + * bmap btree: (levels + 2) * max depth * block size + * v2 directory blocks can be fragmented below the dirblksize down to the fsb + * size, so account for that in the DAENTER macros. 
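+ *
+ * Rough worked example for the per-extent XFS_ALLOCFREE_LOG_RES() macro
+ * above (illustrative only; it assumes a 4096 byte filesystem block and
+ * XFS_AG_MAXLEVELS(mp) == 3, neither of which is implied here):
+ *
+ *	XFS_ALLOCFREE_LOG_RES(mp, 1)   = 2 * ((2 * 3 - 1) * 4096) = 40960
+ *	XFS_ALLOCFREE_LOG_COUNT(mp, 1) = 2 * (2 * 3 - 1)          = 10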
+ */ +#define XFS_DIROP_LOG_RES(mp) \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) +#define XFS_DIROP_LOG_COUNT(mp) \ + (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ + XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) + + +#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) +#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) +#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) +#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) +#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) +#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) +#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) +#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) +#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) +#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) +#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) +#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) +#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) +#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) +#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) +/* + * Logging the inode timestamps on an fsync -- same as SWRITE + * as long as SWRITE logs the entire inode core + */ +#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) +#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) +#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) +#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) +#define XFS_ATTRSET_LOG_RES(mp, ext) \ + ((mp)->m_reservations.tr_attrset + \ + (ext * (mp)->m_sb.sb_sectsize) + \ + (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ + (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) +#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) +#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) + + +/* + * Various log count values. + */ +#define XFS_DEFAULT_LOG_COUNT 1 +#define XFS_DEFAULT_PERM_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_INACTIVE_LOG_COUNT 2 +#define XFS_CREATE_LOG_COUNT 2 +#define XFS_MKDIR_LOG_COUNT 3 +#define XFS_SYMLINK_LOG_COUNT 3 +#define XFS_REMOVE_LOG_COUNT 2 +#define XFS_LINK_LOG_COUNT 2 +#define XFS_RENAME_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT 2 +#define XFS_ADDAFORK_LOG_COUNT 2 +#define XFS_ATTRINVAL_LOG_COUNT 1 +#define XFS_ATTRSET_LOG_COUNT 3 +#define XFS_ATTRRM_LOG_COUNT 3 + +/* + * Here we centralize the specification of XFS meta-data buffer + * reference count values. This determine how hard the buffer + * cache tries to hold onto the buffer. 
+ */ +#define XFS_AGF_REF 4 +#define XFS_AGI_REF 4 +#define XFS_AGFL_REF 3 +#define XFS_INO_BTREE_REF 3 +#define XFS_ALLOC_BTREE_REF 2 +#define XFS_BMAP_BTREE_REF 2 +#define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 +#define XFS_ATTR_BTREE_REF 1 +#define XFS_DQUOT_REF 1 + +#ifdef __KERNEL__ + +struct xfs_buf; +struct xfs_buftarg; +struct xfs_efd_log_item; +struct xfs_efi_log_item; +struct xfs_inode; +struct xfs_item_ops; +struct xfs_log_iovec; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot_acct; +struct xfs_busy_extent; + +typedef struct xfs_log_item { + struct list_head li_ail; /* AIL pointers */ + xfs_lsn_t li_lsn; /* last on-disk lsn */ + struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ + struct xfs_mount *li_mountp; /* ptr to fs mount */ + struct xfs_ail *li_ailp; /* ptr to AIL */ + uint li_type; /* item type */ + uint li_flags; /* misc flags */ + struct xfs_log_item *li_bio_list; /* buffer item list */ + void (*li_cb)(struct xfs_buf *, + struct xfs_log_item *); + /* buffer item iodone */ + /* callback func */ + struct xfs_item_ops *li_ops; /* function list */ + + /* delayed logging */ + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + xfs_lsn_t li_seq; /* CIL commit seq */ +} xfs_log_item_t; + +#define XFS_LI_IN_AIL 0x1 +#define XFS_LI_ABORTED 0x2 + +#define XFS_LI_FLAGS \ + { XFS_LI_IN_AIL, "IN_AIL" }, \ + { XFS_LI_ABORTED, "ABORTED" } + +typedef struct xfs_item_ops { + uint (*iop_size)(xfs_log_item_t *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); + void (*iop_pin)(xfs_log_item_t *); + void (*iop_unpin)(xfs_log_item_t *, int remove); + uint (*iop_trylock)(xfs_log_item_t *); + void (*iop_unlock)(xfs_log_item_t *); + xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_push)(xfs_log_item_t *); + bool (*iop_pushbuf)(xfs_log_item_t *); + void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); +} xfs_item_ops_t; + +#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) +#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) +#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) +#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) +#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) +#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) +#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) +#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) +#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) +#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) + +/* + * Return values for the IOP_TRYLOCK() routines. + */ +#define XFS_ITEM_SUCCESS 0 +#define XFS_ITEM_PINNED 1 +#define XFS_ITEM_LOCKED 2 +#define XFS_ITEM_PUSHBUF 3 + +/* + * This is the type of function which can be given to xfs_trans_callback() + * to be called upon the transaction's commit to disk. + */ +typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *); + +/* + * This is the structure maintained for every active transaction. 
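+ *
+ * For example (illustrative only, "blocks_used" is just a stand-in
+ * name), the t_*_delta fields below accumulate superblock changes
+ * staged with
+ *
+ *	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, -((int64_t)blocks_used));
+ *
+ * which records a free-block count change in t_fdblocks_delta; the
+ * commit code above then applies the accumulated deltas to the
+ * superblock.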
+ */ +typedef struct xfs_trans { + unsigned int t_magic; /* magic number */ + xfs_log_callback_t t_logcb; /* log callback struct */ + unsigned int t_type; /* transaction type */ + unsigned int t_log_res; /* amt of log space resvd */ + unsigned int t_log_count; /* count for perm log res */ + unsigned int t_blk_res; /* # of blocks resvd */ + unsigned int t_blk_res_used; /* # of resvd blocks used */ + unsigned int t_rtx_res; /* # of rt extents resvd */ + unsigned int t_rtx_res_used; /* # of resvd rt extents used */ + struct xlog_ticket *t_ticket; /* log mgr ticket */ + xfs_lsn_t t_lsn; /* log seq num of start of + * transaction. */ + xfs_lsn_t t_commit_lsn; /* log seq num of end of + * transaction. */ + struct xfs_mount *t_mountp; /* ptr to fs mount struct */ + struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ + unsigned int t_flags; /* misc flags */ + int64_t t_icount_delta; /* superblock icount change */ + int64_t t_ifree_delta; /* superblock ifree change */ + int64_t t_fdblocks_delta; /* superblock fdblocks chg */ + int64_t t_res_fdblocks_delta; /* on-disk only chg */ + int64_t t_frextents_delta;/* superblock freextents chg*/ + int64_t t_res_frextents_delta; /* on-disk only chg */ +#ifdef DEBUG + int64_t t_ag_freeblks_delta; /* debugging counter */ + int64_t t_ag_flist_delta; /* debugging counter */ + int64_t t_ag_btree_delta; /* debugging counter */ +#endif + int64_t t_dblocks_delta;/* superblock dblocks change */ + int64_t t_agcount_delta;/* superblock agcount change */ + int64_t t_imaxpct_delta;/* superblock imaxpct change */ + int64_t t_rextsize_delta;/* superblock rextsize chg */ + int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */ + int64_t t_rblocks_delta;/* superblock rblocks change */ + int64_t t_rextents_delta;/* superblocks rextents chg */ + int64_t t_rextslog_delta;/* superblocks rextslog chg */ + struct list_head t_items; /* log item descriptors */ + xfs_trans_header_t t_header; /* header for in-log trans */ + struct list_head t_busy; /* list of busy extents */ + unsigned long t_pflags; /* saved process flags state */ +} xfs_trans_t; + +/* + * XFS transaction mechanism exported interfaces that are + * actually macros. + */ +#define xfs_trans_get_log_res(tp) ((tp)->t_log_res) +#define xfs_trans_get_log_count(tp) ((tp)->t_log_count) +#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) +#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) + +#ifdef DEBUG +#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) +#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) +#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) +#else +#define xfs_trans_agblocks_delta(tp, d) +#define xfs_trans_agflist_delta(tp, d) +#define xfs_trans_agbtree_delta(tp, d) +#endif + +/* + * XFS transaction mechanism exported interfaces. 
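+ *
+ * A minimal sketch of how these interfaces are typically strung together
+ * (illustrative only: error handling is trimmed, "ip" stands for some
+ * locked inode, and the remove-style reservation is just an example):
+ *
+ *	tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+ *	error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
+ *				  XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+ *	if (error) {
+ *		xfs_trans_cancel(tp, 0);
+ *		return error;
+ *	}
+ *	xfs_trans_ijoin(tp, ip);
+ *	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ *	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);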
+ */ +xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); +xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint); +xfs_trans_t *xfs_trans_dup(xfs_trans_t *); +int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, + uint, uint); +void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); +struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t, + int, uint); +int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *, + struct xfs_buftarg *, xfs_daddr_t, int, uint, + struct xfs_buf **); +struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); + +void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); +void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); +void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); +void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); +struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); +void xfs_efi_release(struct xfs_efi_log_item *, uint); +void xfs_trans_log_efi_extent(xfs_trans_t *, + struct xfs_efi_log_item *, + xfs_fsblock_t, + xfs_extlen_t); +struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *, + struct xfs_efi_log_item *, + uint); +void xfs_trans_log_efd_extent(xfs_trans_t *, + struct xfs_efd_log_item *, + xfs_fsblock_t, + xfs_extlen_t); +int _xfs_trans_commit(xfs_trans_t *, + uint flags, + int *); +#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) +void xfs_trans_cancel(xfs_trans_t *, int); +int xfs_trans_ail_init(struct xfs_mount *); +void xfs_trans_ail_destroy(struct xfs_mount *); + +extern kmem_zone_t *xfs_trans_zone; +extern kmem_zone_t *xfs_log_item_desc_zone; + +#endif /* __KERNEL__ */ + +void xfs_trans_init(struct xfs_mount *); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); + +#endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c new file mode 100644 index 00000000..a4c281bf --- /dev/null +++ b/fs/xfs/xfs_trans_ail.c @@ -0,0 +1,890 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2008 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" + +#ifdef DEBUG +/* + * Check that the list is sorted as it should be. + */ +STATIC void +xfs_ail_check( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *prev_lip; + + if (list_empty(&ailp->xa_ail)) + return; + + /* + * Check the next and previous entries are valid. + */ + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + + prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + + +#ifdef XFS_TRANS_DEBUG + /* + * Walk the list checking lsn ordering, and that every entry has the + * XFS_LI_IN_AIL flag set. This is really expensive, so only do it + * when specifically debugging the transaction subsystem. + */ + prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = lip; + } +#endif /* XFS_TRANS_DEBUG */ +} +#else /* !DEBUG */ +#define xfs_ail_check(a,l) +#endif /* DEBUG */ + +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. + */ +static xfs_log_item_t * +xfs_ail_min( + struct xfs_ail *ailp) +{ + if (list_empty(&ailp->xa_ail)) + return NULL; + + return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); +} + + /* + * Return a pointer to the last item in the AIL. If the AIL is empty, then + * return NULL. + */ +static xfs_log_item_t * +xfs_ail_max( + struct xfs_ail *ailp) +{ + if (list_empty(&ailp->xa_ail)) + return NULL; + + return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail); +} + +/* + * Return a pointer to the item which follows the given item in the AIL. If + * the given item is the last item in the list, then return NULL. + */ +static xfs_log_item_t * +xfs_ail_next( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + if (lip->li_ail.next == &ailp->xa_ail) + return NULL; + + return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); +} + +/* + * This is called by the log manager code to determine the LSN of the tail of + * the log. This is exactly the LSN of the first item in the AIL. If the AIL + * is empty, then this function returns 0. + * + * We need the AIL lock in order to get a coherent read of the lsn of the last + * item in the AIL. + */ +xfs_lsn_t +xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + xfs_lsn_t lsn = 0; + xfs_log_item_t *lip; + + spin_lock(&ailp->xa_lock); + lip = xfs_ail_min(ailp); + if (lip) + lsn = lip->li_lsn; + spin_unlock(&ailp->xa_lock); + + return lsn; +} + +/* + * Return the maximum lsn held in the AIL, or zero if the AIL is empty. 
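+ *
+ * (For illustration: xfs_ail_push_all() further down is effectively
+ * xfs_ail_push(ailp, xfs_ail_max_lsn(ailp)), i.e. "push everything
+ * currently in the AIL", while xfs_ail_min_lsn() above is what the log
+ * code uses to find the current tail of the log.)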
+ */
+static xfs_lsn_t
+xfs_ail_max_lsn(
+ struct xfs_ail *ailp)
+{
+ xfs_lsn_t lsn = 0;
+ xfs_log_item_t *lip;
+
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_ail_max(ailp);
+ if (lip)
+ lsn = lip->li_lsn;
+ spin_unlock(&ailp->xa_lock);
+
+ return lsn;
+}
+
+/*
+ * AIL traversal cursor initialisation.
+ *
+ * The cursor keeps track of where our current traversal is up
+ * to by tracking the next item in the list for us. However, for
+ * this to be safe, removing an object from the AIL needs to invalidate
+ * any cursor that points to it. Hence the traversal cursor needs to
+ * be linked to the struct xfs_ail so that deletion can search all the
+ * active cursors for invalidation.
+ *
+ * We don't link the push cursor because it is embedded in the struct
+ * xfs_ail and hence easily findable.
+ */
+STATIC void
+xfs_trans_ail_cursor_init(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ cur->item = NULL;
+ if (cur == &ailp->xa_cursors)
+ return;
+
+ cur->next = ailp->xa_cursors.next;
+ ailp->xa_cursors.next = cur;
+}
+
+/*
+ * Set the cursor to the next item, because when we look
+ * up the cursor the current item may have been freed.
+ */
+STATIC void
+xfs_trans_ail_cursor_set(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item *lip)
+{
+ if (lip)
+ cur->item = xfs_ail_next(ailp, lip);
+}
+
+/*
+ * Get the next item in the traversal and advance the cursor.
+ * If the cursor was invalidated (indicated by a lip of 1),
+ * restart the traversal.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_next(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ struct xfs_log_item *lip = cur->item;
+
+ if ((__psint_t)lip & 1)
+ lip = xfs_ail_min(ailp);
+ xfs_trans_ail_cursor_set(ailp, cur, lip);
+ return lip;
+}
+
+/*
+ * Now that the traversal is complete, we need to remove the cursor
+ * from the list of traversing cursors. Avoid removing the embedded
+ * push cursor, but use the fact it is always present to make the
+ * list deletion simple.
+ */
+void
+xfs_trans_ail_cursor_done(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *done)
+{
+ struct xfs_ail_cursor *prev = NULL;
+ struct xfs_ail_cursor *cur;
+
+ done->item = NULL;
+ if (done == &ailp->xa_cursors)
+ return;
+ prev = &ailp->xa_cursors;
+ for (cur = prev->next; cur; prev = cur, cur = prev->next) {
+ if (cur == done) {
+ prev->next = cur->next;
+ break;
+ }
+ }
+ ASSERT(cur);
+}
+
+/*
+ * Invalidate any cursor that is pointing to this item. This is
+ * called when an item is removed from the AIL. Any cursor pointing
+ * to this object is now invalid and the traversal needs to be
+ * terminated so it doesn't reference a freed object. We set the
+ * cursor item to a value of 1 so we can distinguish between an
+ * invalidation and the end of the list when getting the next item
+ * from the cursor.
+ */
+STATIC void
+xfs_trans_ail_cursor_clear(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_ail_cursor *cur;
+
+ /* need to search all cursors */
+ for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
+ if (cur->item == lip)
+ cur->item = (struct xfs_log_item *)
+ ((__psint_t)cur->item | 1);
+ }
+}
+
+/*
+ * Initialise the cursor to the first item in the AIL with the given @lsn.
+ * This searches the list from lowest LSN to highest. Pass a @lsn of zero
+ * to initialise the cursor to the first item in the AIL.
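+ *
+ * A typical traversal then looks roughly like this (sketch only, "cur"
+ * being a struct xfs_ail_cursor owned by the caller):
+ *
+ *	lip = xfs_trans_ail_cursor_first(ailp, &cur, lsn);
+ *	while (lip != NULL) {
+ *		... process lip ...
+ *		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ *	}
+ *	xfs_trans_ail_cursor_done(ailp, &cur);
+ *
+ * with ailp->xa_lock held around the cursor calls, as xfsaild_push()
+ * below does.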
+ */ +xfs_log_item_t * +xfs_trans_ail_cursor_first( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + xfs_trans_ail_cursor_init(ailp, cur); + lip = xfs_ail_min(ailp); + if (lsn == 0) + goto out; + + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) + goto out; + } + lip = NULL; +out: + xfs_trans_ail_cursor_set(ailp, cur, lip); + return lip; +} + +/* + * Initialise the cursor to the last item in the AIL with the given @lsn. + * This searches the list from highest LSN to lowest. If there is no item with + * the value of @lsn, then it sets the cursor to the last item with an LSN lower + * than @lsn. + */ +static struct xfs_log_item * +__xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) + return lip; + } + return NULL; +} + +/* + * Initialise the cursor to the last item in the AIL with the given @lsn. + * This searches the list from highest LSN to lowest. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_trans_ail_cursor_init(ailp, cur); + cur->item = __xfs_trans_ail_cursor_last(ailp, lsn); + return cur->item; +} + +/* + * splice the log item list into the AIL at the given LSN. We splice to the + * tail of the given LSN to maintain insert order for push traversals. The + * cursor is optional, allowing repeated updates to the same LSN to avoid + * repeated traversals. + */ +static void +xfs_ail_splice( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct list_head *list, + xfs_lsn_t lsn) +{ + struct xfs_log_item *lip = cur ? cur->item : NULL; + struct xfs_log_item *next_lip; + + /* + * Get a new cursor if we don't have a placeholder or the existing one + * has been invalidated. + */ + if (!lip || (__psint_t)lip & 1) { + lip = __xfs_trans_ail_cursor_last(ailp, lsn); + + if (!lip) { + /* The list is empty, so just splice and return. */ + if (cur) + cur->item = NULL; + list_splice(list, &ailp->xa_ail); + return; + } + } + + /* + * Our cursor points to the item we want to insert _after_, so we have + * to update the cursor to point to the end of the list we are splicing + * in so that it points to the correct location for the next splice. + * i.e. before the splice + * + * lsn -> lsn -> lsn + x -> lsn + x ... + * ^ + * | cursor points here + * + * After the splice we have: + * + * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ... + * ^ ^ + * | cursor points here | needs to move here + * + * So we set the cursor to the last item in the list to be spliced + * before we execute the splice, resulting in the cursor pointing to + * the correct item after the splice occurs. + */ + if (cur) { + next_lip = list_entry(list->prev, struct xfs_log_item, li_ail); + cur->item = next_lip; + } + list_splice(list, &lip->li_ail); +} + +/* + * Delete the given item from the AIL. Return a pointer to the item. 
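+ *
+ * Deleting an item also invalidates, via xfs_trans_ail_cursor_clear(),
+ * any traversal cursor that still points at it, so a traversal that
+ * later resumes through such a cursor restarts from the head of the AIL
+ * instead of dereferencing a freed entry.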
+ */ +static void +xfs_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_ail_check(ailp, lip); + list_del(&lip->li_ail); + xfs_trans_ail_cursor_clear(ailp, lip); +} + +static long +xfsaild_push( + struct xfs_ail *ailp) +{ + xfs_mount_t *mp = ailp->xa_mount; + struct xfs_ail_cursor *cur = &ailp->xa_cursors; + xfs_log_item_t *lip; + xfs_lsn_t lsn; + xfs_lsn_t target; + long tout = 10; + int flush_log = 0; + int stuck = 0; + int count = 0; + int push_xfsbufd = 0; + + spin_lock(&ailp->xa_lock); + target = ailp->xa_target; + xfs_trans_ail_cursor_init(ailp, cur); + lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn); + if (!lip || XFS_FORCED_SHUTDOWN(mp)) { + /* + * AIL is empty or our push has reached the end. + */ + xfs_trans_ail_cursor_done(ailp, cur); + spin_unlock(&ailp->xa_lock); + goto out_done; + } + + XFS_STATS_INC(xs_push_ail); + + /* + * While the item we are looking at is below the given threshold + * try to flush it out. We'd like not to stop until we've at least + * tried to push on everything in the AIL with an LSN less than + * the given threshold. + * + * However, we will stop after a certain number of pushes and wait + * for a reduced timeout to fire before pushing further. This + * prevents use from spinning when we can't do anything or there is + * lots of contention on the AIL lists. + */ + lsn = lip->li_lsn; + while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { + int lock_result; + /* + * If we can lock the item without sleeping, unlock the AIL + * lock and flush the item. Then re-grab the AIL lock so we + * can look for the next item on the AIL. List changes are + * handled by the AIL lookup functions internally + * + * If we can't lock the item, either its holder will flush it + * or it is already being flushed or it is being relogged. In + * any of these case it is being taken care of and we can just + * skip to the next item in the list. + */ + lock_result = IOP_TRYLOCK(lip); + spin_unlock(&ailp->xa_lock); + switch (lock_result) { + case XFS_ITEM_SUCCESS: + XFS_STATS_INC(xs_push_ail_success); + IOP_PUSH(lip); + ailp->xa_last_pushed_lsn = lsn; + break; + + case XFS_ITEM_PUSHBUF: + XFS_STATS_INC(xs_push_ail_pushbuf); + + if (!IOP_PUSHBUF(lip)) { + stuck++; + flush_log = 1; + } else { + ailp->xa_last_pushed_lsn = lsn; + } + push_xfsbufd = 1; + break; + + case XFS_ITEM_PINNED: + XFS_STATS_INC(xs_push_ail_pinned); + stuck++; + flush_log = 1; + break; + + case XFS_ITEM_LOCKED: + XFS_STATS_INC(xs_push_ail_locked); + stuck++; + break; + + default: + ASSERT(0); + break; + } + + spin_lock(&ailp->xa_lock); + /* should we bother continuing? */ + if (XFS_FORCED_SHUTDOWN(mp)) + break; + ASSERT(mp->m_log); + + count++; + + /* + * Are there too many items we can't do anything with? + * If we we are skipping too many items because we can't flush + * them or they are already being flushed, we back off and + * given them time to complete whatever operation is being + * done. i.e. remove pressure from the AIL while we can't make + * progress so traversals don't slow down further inserts and + * removals to/from the AIL. + * + * The value of 100 is an arbitrary magic number based on + * observation. 
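+ *
+ * (In other words: if more than 100 of the items visited in this pass
+ * could not be pushed, give up on the pass; the timeout handling after
+ * the loop then sleeps longer, 20ms instead of the default 10ms, when
+ * over 90% of what was visited was stuck.)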
+ */ + if (stuck > 100) + break; + + lip = xfs_trans_ail_cursor_next(ailp, cur); + if (lip == NULL) + break; + lsn = lip->li_lsn; + } + xfs_trans_ail_cursor_done(ailp, cur); + spin_unlock(&ailp->xa_lock); + + if (flush_log) { + /* + * If something we need to push out was pinned, then + * push out the log so it will become unpinned and + * move forward in the AIL. + */ + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, 0); + } + + if (push_xfsbufd) { + /* we've got delayed write buffers to flush */ + wake_up_process(mp->m_ddev_targp->bt_task); + } + + /* assume we have more work to do in a short while */ +out_done: + if (!count) { + /* We're past our target or empty, so idle */ + ailp->xa_last_pushed_lsn = 0; + + tout = 50; + } else if (XFS_LSN_CMP(lsn, target) >= 0) { + /* + * We reached the target so wait a bit longer for I/O to + * complete and remove pushed items from the AIL before we + * start the next scan from the start of the AIL. + */ + tout = 50; + ailp->xa_last_pushed_lsn = 0; + } else if ((stuck * 100) / count > 90) { + /* + * Either there is a lot of contention on the AIL or we + * are stuck due to operations in progress. "Stuck" in this + * case is defined as >90% of the items we tried to push + * were stuck. + * + * Backoff a bit more to allow some I/O to complete before + * continuing from where we were. + */ + tout = 20; + } + + return tout; +} + +static int +xfsaild( + void *data) +{ + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + + while (!kthread_should_stop()) { + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + + try_to_freeze(); + + tout = xfsaild_push(ailp); + } + + return 0; +} + +/* + * This routine is called to move the tail of the AIL forward. It does this by + * trying to flush items in the AIL whose lsns are below the given + * threshold_lsn. + * + * The push is run asynchronously in a workqueue, which means the caller needs + * to handle waiting on the async flush for space to become available. + * We don't want to interrupt any push that is in progress, hence we only queue + * work if we set the pushing bit approriately. + * + * We do this unlocked - we only need to know whether there is anything in the + * AIL at the time we are called. We don't need to access the contents of + * any of the objects, so the lock is not needed. + */ +void +xfs_ail_push( + struct xfs_ail *ailp, + xfs_lsn_t threshold_lsn) +{ + xfs_log_item_t *lip; + + lip = xfs_ail_min(ailp); + if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) || + XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0) + return; + + /* + * Ensure that the new target is noticed in push code before it clears + * the XFS_AIL_PUSHING_BIT. + */ + smp_wmb(); + xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); + smp_wmb(); + + wake_up_process(ailp->xa_task); +} + +/* + * Push out all items in the AIL immediately + */ +void +xfs_ail_push_all( + struct xfs_ail *ailp) +{ + xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp); + + if (threshold_lsn) + xfs_ail_push(ailp, threshold_lsn); +} + +/* + * This is to be called when an item is unlocked that may have + * been in the AIL. It will wake up the first member of the AIL + * wait list if this item's unlocking might allow it to progress. 
+ * If the item is in the AIL, then we need to get the AIL lock + * while doing our checking so we don't race with someone going + * to sleep waiting for this event in xfs_trans_push_ail(). + */ +void +xfs_trans_unlocked_item( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *min_lip; + + /* + * If we're forcibly shutting down, we may have + * unlocked log items arbitrarily. The last thing + * we want to do is to move the tail of the log + * over some potentially valid data. + */ + if (!(lip->li_flags & XFS_LI_IN_AIL) || + XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { + return; + } + + /* + * This is the one case where we can call into xfs_ail_min() + * without holding the AIL lock because we only care about the + * case where we are at the tail of the AIL. If the object isn't + * at the tail, it doesn't matter what result we get back. This + * is slightly racy because since we were just unlocked, we could + * go to sleep between the call to xfs_ail_min and the call to + * xfs_log_move_tail, have someone else lock us, commit to us disk, + * move us out of the tail of the AIL, and then we wake up. However, + * the call to xfs_log_move_tail() doesn't do anything if there's + * not enough free space to wake people up so we're safe calling it. + */ + min_lip = xfs_ail_min(ailp); + + if (min_lip == lip) + xfs_log_move_tail(ailp->xa_mount, 1); +} /* xfs_trans_unlocked_item */ + +/* + * xfs_trans_ail_update - bulk AIL insertion operation. + * + * @xfs_trans_ail_update takes an array of log items that all need to be + * positioned at the same LSN in the AIL. If an item is not in the AIL, it will + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. If we move the first item in the AIL, update the log tail to + * match the new minimum LSN in the AIL. + * + * This function takes the AIL lock once to execute the update operations on + * all the items in the array, and as such should not be called with the AIL + * lock held. As a result, once we have the AIL lock, we need to check each log + * item LSN to confirm it needs to be moved forward in the AIL. + * + * To optimise the insert operation, we delete all the items from the AIL in + * the first pass, moving them into a temporary list, then splice the temporary + * list into the correct position in the AIL. This avoids needing to do an + * insert operation on every item. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. + */ +void +xfs_trans_ail_update_bulk( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_log_item_t *mlip; + xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; + LIST_HEAD(tmp); + + mlip = xfs_ail_min(ailp); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (lip->li_flags & XFS_LI_IN_AIL) { + /* check if we really need to move the item */ + if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) + continue; + + xfs_ail_delete(ailp, lip); + if (mlip == lip) + mlip_changed = 1; + } else { + lip->li_flags |= XFS_LI_IN_AIL; + } + lip->li_lsn = lsn; + list_add(&lip->li_ail, &tmp); + } + + xfs_ail_splice(ailp, cur, &tmp, lsn); + + if (!mlip_changed) { + spin_unlock(&ailp->xa_lock); + return; + } + + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. 
This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip->li_lsn; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} + +/* + * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL + * + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to + * removed from the AIL. The caller is already holding the AIL lock, and done + * all the checks necessary to ensure the items passed in via @log_items are + * ready for deletion. This includes checking that the items are in the AIL. + * + * For each log item to be removed, unlink it from the AIL, clear the IN_AIL + * flag from the item and reset the item's lsn to 0. If we remove the first + * item in the AIL, update the log tail to match the new minimum LSN in the + * AIL. + * + * This function will not drop the AIL lock until all items are removed from + * the AIL to minimise the amount of lock traffic on the AIL. This does not + * greatly increase the AIL hold time, but does significantly reduce the amount + * of traffic on the lock, especially during IO completion. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. + */ +void +xfs_trans_ail_delete_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items) __releases(ailp->xa_lock) +{ + xfs_log_item_t *mlip; + xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; + + mlip = xfs_ail_min(ailp); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + struct xfs_mount *mp = ailp->xa_mount; + + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + return; + } + + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + if (mlip == lip) + mlip_changed = 1; + } + + if (!mlip_changed) { + spin_unlock(&ailp->xa_lock); + return; + } + + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. It is possible we've emptied the + * AIL here, so if that is the case, pass an LSN of 0 to the tail move. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip ? mlip->li_lsn : 0; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} + +/* + * The active item list (AIL) is a doubly linked list of log + * items sorted by ascending lsn. The base of the list is + * a forw/back pointer pair embedded in the xfs mount structure. + * The base is initialized with both pointers pointing to the + * base. This case always needs to be distinguished, because + * the base has no lsn to look at. We almost always insert + * at the end of the list, so on inserts we search from the + * end of the list to find where the new item belongs. + */ + +/* + * Initialize the doubly linked list to point only to itself. 
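+ *
+ * (Beyond the historical list initialisation described above, the
+ * function below also allocates the per-mount struct xfs_ail, sets up
+ * its list head and lock, and starts the xfsaild thread;
+ * xfs_trans_ail_destroy() stops the thread and frees the structure.)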
+ */ +int +xfs_trans_ail_init( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp; + + ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); + if (!ailp) + return ENOMEM; + + ailp->xa_mount = mp; + INIT_LIST_HEAD(&ailp->xa_ail); + spin_lock_init(&ailp->xa_lock); + + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); + if (IS_ERR(ailp->xa_task)) + goto out_free_ailp; + + mp->m_ail = ailp; + return 0; + +out_free_ailp: + kmem_free(ailp); + return ENOMEM; +} + +void +xfs_trans_ail_destroy( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp = mp->m_ail; + + kthread_stop(ailp->xa_task); + kmem_free(ailp); +} diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c new file mode 100644 index 00000000..03b3b7f8 --- /dev/null +++ b/fs/xfs/xfs_trans_buf.c @@ -0,0 +1,879 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_trace.h" + +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. + */ +STATIC struct xfs_buf * +xfs_trans_buf_item_match( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int len) +{ + struct xfs_log_item_desc *lidp; + struct xfs_buf_log_item *blip; + + len = BBTOB(len); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + blip = (struct xfs_buf_log_item *)lidp->lid_item; + if (blip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_TARGET(blip->bli_buf) == target && + XFS_BUF_ADDR(blip->bli_buf) == blkno && + XFS_BUF_COUNT(blip->bli_buf) == len) + return blip->bli_buf; + } + + return NULL; +} + +/* + * Add the locked buffer to the transaction. + * + * The buffer must be locked, and it cannot be associated with any + * transaction. + * + * If the buffer does not yet have a buf log item associated with it, + * then allocate one for it. Then add the buf item to the transaction. + */ +STATIC void +_xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp, + int reset_recur) +{ + struct xfs_buf_log_item *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). 
+ */ + xfs_buf_item_init(bp, tp->t_mountp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + if (reset_recur) + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &bip->bli_item); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * in xfs_trans_get_buf() and friends above. + */ + XFS_BUF_SET_FSPRIVATE2(bp, tp); + +} + +void +xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + _xfs_trans_bjoin(tp, bp, 0); + trace_xfs_trans_bjoin(bp->b_fspriv); +} + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it is already locked + * within the transaction, just increment its lock recursion count + * and return a pointer to it. + * + * If the transaction pointer is NULL, make this just a normal + * get_buf() call. + */ +xfs_buf_t * +xfs_trans_get_buf(xfs_trans_t *tp, + xfs_buftarg_t *target_dev, + xfs_daddr_t blkno, + int len, + uint flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + if (flags == 0) + flags = XBF_LOCK | XBF_MAPPED; + + /* + * Default to a normal get_buf() call if the tp is NULL. + */ + if (tp == NULL) + return xfs_buf_get(target_dev, blkno, len, + flags | XBF_DONT_BLOCK); + + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. + */ + bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); + if (bp != NULL) { + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) + XFS_BUF_SUPER_STALE(bp); + + /* + * If the buffer is stale then it was binval'ed + * since last read. This doesn't matter since the + * caller isn't allowed to use the data anyway. + */ + else if (XFS_BUF_ISSTALE(bp)) + ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); + + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + trace_xfs_trans_get_buf_recur(bip); + return (bp); + } + + /* + * We always specify the XBF_DONT_BLOCK flag within a transaction + * so that get_buf does not try to push out a delayed write buffer + * which might cause another transaction to take place (if the + * buffer was delayed alloc). Such recursive transactions can + * easily deadlock with our current transaction as well as cause + * us to run out of stack space. + */ + bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK); + if (bp == NULL) { + return NULL; + } + + ASSERT(!XFS_BUF_GETERROR(bp)); + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_get_buf(bp->b_fspriv); + return (bp); +} + +/* + * Get and lock the superblock buffer of this file system for the + * given transaction. + * + * We don't need to use incore_match() here, because the superblock + * buffer is a private buffer which we keep a pointer to in the + * mount structure. + */ +xfs_buf_t * +xfs_trans_getsb(xfs_trans_t *tp, + struct xfs_mount *mp, + int flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + /* + * Default to just trying to lock the superblock buffer + * if tp is NULL. 
+ */ + if (tp == NULL) { + return (xfs_getsb(mp, flags)); + } + + /* + * If the superblock buffer already has this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. + */ + bp = mp->m_sb_bp; + if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) { + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + trace_xfs_trans_getsb_recur(bip); + return (bp); + } + + bp = xfs_getsb(mp, flags); + if (bp == NULL) + return NULL; + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_getsb(bp->b_fspriv); + return (bp); +} + +#ifdef DEBUG +xfs_buftarg_t *xfs_error_target; +int xfs_do_error; +int xfs_req_num; +int xfs_error_mod = 33; +#endif + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it has not yet been + * read in, read it from disk. If it is already locked + * within the transaction and already read in, just increment its + * lock recursion count and return a pointer to it. + * + * If the transaction pointer is NULL, make this just a normal + * read_buf() call. + */ +int +xfs_trans_read_buf( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_buftarg_t *target, + xfs_daddr_t blkno, + int len, + uint flags, + xfs_buf_t **bpp) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + int error; + + if (flags == 0) + flags = XBF_LOCK | XBF_MAPPED; + + /* + * Default to a normal get_buf() call if the tp is NULL. + */ + if (tp == NULL) { + bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + if (!bp) + return (flags & XBF_TRYLOCK) ? + EAGAIN : XFS_ERROR(ENOMEM); + + if (XFS_BUF_GETERROR(bp) != 0) { + xfs_ioerror_alert("xfs_trans_read_buf", mp, + bp, blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + return error; + } +#ifdef DEBUG + if (xfs_do_error) { + if (xfs_error_target == target) { + if (((xfs_req_num++) % xfs_error_mod) == 0) { + xfs_buf_relse(bp); + xfs_debug(mp, "Returning error!"); + return XFS_ERROR(EIO); + } + } + } +#endif + if (XFS_FORCED_SHUTDOWN(mp)) + goto shutdown_abort; + *bpp = bp; + return 0; + } + + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. If it is already read in we just increment + * the lock recursion count and return the buffer to the caller. + * If the buffer is not yet read in, then we read it in, increment + * the lock recursion count, and return it to the caller. + */ + bp = xfs_trans_buf_item_match(tp, target, blkno, len); + if (bp != NULL) { + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT((XFS_BUF_ISERROR(bp)) == 0); + if (!(XFS_BUF_ISDONE(bp))) { + trace_xfs_trans_read_buf_io(bp, _RET_IP_); + ASSERT(!XFS_BUF_ISASYNC(bp)); + XFS_BUF_READ(bp); + xfsbdstrat(tp->t_mountp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_ioerror_alert("xfs_trans_read_buf", mp, + bp, blkno); + xfs_buf_relse(bp); + /* + * We can gracefully recover from most read + * errors. Ones we can't are those that happen + * after the transaction's already dirty. + */ + if (tp->t_flags & XFS_TRANS_DIRTY) + xfs_force_shutdown(tp->t_mountp, + SHUTDOWN_META_IO_ERROR); + return error; + } + } + /* + * We never locked this buf ourselves, so we shouldn't + * brelse it either. Just get out. 
+ */ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + *bpp = NULL; + return XFS_ERROR(EIO); + } + + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + bip->bli_recur++; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + trace_xfs_trans_read_buf_recur(bip); + *bpp = bp; + return 0; + } + + /* + * We always specify the XBF_DONT_BLOCK flag within a transaction + * so that get_buf does not try to push out a delayed write buffer + * which might cause another transaction to take place (if the + * buffer was delayed alloc). Such recursive transactions can + * easily deadlock with our current transaction as well as cause + * us to run out of stack space. + */ + bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + if (bp == NULL) { + *bpp = NULL; + return (flags & XBF_TRYLOCK) ? + 0 : XFS_ERROR(ENOMEM); + } + if (XFS_BUF_GETERROR(bp) != 0) { + XFS_BUF_SUPER_STALE(bp); + error = XFS_BUF_GETERROR(bp); + + xfs_ioerror_alert("xfs_trans_read_buf", mp, + bp, blkno); + if (tp->t_flags & XFS_TRANS_DIRTY) + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + return error; + } +#ifdef DEBUG + if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) { + if (xfs_error_target == target) { + if (((xfs_req_num++) % xfs_error_mod) == 0) { + xfs_force_shutdown(tp->t_mountp, + SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + xfs_debug(mp, "Returning trans error!"); + return XFS_ERROR(EIO); + } + } + } +#endif + if (XFS_FORCED_SHUTDOWN(mp)) + goto shutdown_abort; + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_read_buf(bp->b_fspriv); + + *bpp = bp; + return 0; + +shutdown_abort: + /* + * the theory here is that buffer is good but we're + * bailing out because the filesystem is being forcibly + * shut down. So we should leave the b_flags alone since + * the buffer's not staled and just get out. + */ +#if defined(DEBUG) + if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) + xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); +#endif + ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != + (XBF_STALE|XBF_DELWRI)); + + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + xfs_buf_relse(bp); + *bpp = NULL; + return XFS_ERROR(EIO); +} + + +/* + * Release the buffer bp which was previously acquired with one of the + * xfs_trans_... buffer allocation routines if the buffer has not + * been modified within this transaction. If the buffer is modified + * within this transaction, do decrement the recursion count but do + * not release the buffer even if the count goes to 0. If the buffer is not + * modified within the transaction, decrement the recursion count and + * release the buffer if the recursion count goes to 0. + * + * If the buffer is to be released and it was not modified before + * this transaction began, then free the buf_log_item associated with it. + * + * If the transaction pointer is NULL, make this just a normal + * brelse() call. + */ +void +xfs_trans_brelse(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + xfs_log_item_t *lip; + + /* + * Default to a normal brelse() call if the tp is NULL. + */ + if (tp == NULL) { + ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + /* + * If there's a buf log item attached to the buffer, + * then let the AIL know that the buffer is being + * unlocked. 
+ */ + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (lip->li_type == XFS_LI_BUF) { + bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); + xfs_trans_unlocked_item(bip->bli_item.li_ailp, + lip); + } + } + xfs_buf_relse(bp); + return; + } + + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_brelse(bip); + + /* + * If the release is just for a recursive lock, + * then decrement the count and return. + */ + if (bip->bli_recur > 0) { + bip->bli_recur--; + return; + } + + /* + * If the buffer is dirty within this transaction, we can't + * release it until we commit. + */ + if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) + return; + + /* + * If the buffer has been invalidated, then we can't release + * it until the transaction commits to disk unless it is re-dirtied + * as part of this transaction. This prevents us from pulling + * the item from the AIL before we should. + */ + if (bip->bli_flags & XFS_BLI_STALE) + return; + + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + + /* + * Free up the log item descriptor tracking the released item. + */ + xfs_trans_del_item(&bip->bli_item); + + /* + * Clear the hold flag in the buf log item if it is set. + * We wouldn't want the next user of the buffer to + * get confused. + */ + if (bip->bli_flags & XFS_BLI_HOLD) { + bip->bli_flags &= ~XFS_BLI_HOLD; + } + + /* + * Drop our reference to the buf log item. + */ + atomic_dec(&bip->bli_refcount); + + /* + * If the buf item is not tracking data in the log, then + * we must free it before releasing the buffer back to the + * free pool. Before releasing the buffer to the free pool, + * clear the transaction pointer in b_fsprivate2 to dissolve + * its relation to this transaction. + */ + if (!xfs_buf_item_dirty(bip)) { +/*** + ASSERT(bp->b_pincount == 0); +***/ + ASSERT(atomic_read(&bip->bli_refcount) == 0); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); + xfs_buf_item_relse(bp); + bip = NULL; + } + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + + /* + * If we've still got a buf log item on the buffer, then + * tell the AIL that the buffer is being unlocked. + */ + if (bip != NULL) { + xfs_trans_unlocked_item(bip->bli_item.li_ailp, + (xfs_log_item_t*)bip); + } + + xfs_buf_relse(bp); + return; +} + +/* + * Mark the buffer as not needing to be unlocked when the buf item's + * IOP_UNLOCK() routine is called. The buffer must already be locked + * and associated with the given transaction. + */ +/* ARGSUSED */ +void +xfs_trans_bhold(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_flags |= XFS_BLI_HOLD; + trace_xfs_trans_bhold(bip); +} + +/* + * Cancel the previous buffer hold request made on this buffer + * for this transaction. 
+ */ +void +xfs_trans_bhold_release(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT(bip->bli_flags & XFS_BLI_HOLD); + bip->bli_flags &= ~XFS_BLI_HOLD; + + trace_xfs_trans_bhold_release(bip); +} + +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf(xfs_trans_t *tp, + xfs_buf_t *bp, + uint first, + uint last) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); + ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) || + (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks)); + + /* + * Mark the buffer as needing to be written out eventually, + * and set its iodone function to remove the buffer's buf log + * item from the AIL and free it when the buffer is flushed + * to disk. See xfs_buf_attach_iodone() for more details + * on li_cb and xfs_buf_iodone_callbacks(). + * If we end up aborting this transaction, we trap this buffer + * inside the b_bdstrat callback so that this won't get written to + * disk. + */ + XFS_BUF_DELAYWRITE(bp); + XFS_BUF_DONE(bp); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); + bip->bli_item.li_cb = xfs_buf_iodone; + + trace_xfs_trans_log_buf(bip); + + /* + * If we invalidated the buffer within this transaction, then + * cancel the invalidation now that we're dirtying the buffer + * again. There are no races with the code in xfs_buf_item_unpin(), + * because we have a reference to the buffer this entire time. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + bip->bli_flags &= ~XFS_BLI_STALE; + ASSERT(XFS_BUF_ISSTALE(bp)); + XFS_BUF_UNSTALE(bp); + bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; + } + + tp->t_flags |= XFS_TRANS_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + bip->bli_flags |= XFS_BLI_LOGGED; + xfs_buf_item_log(bip, first, last); +} + + +/* + * This called to invalidate a buffer that is being used within + * a transaction. Typically this is because the blocks in the + * buffer are being freed, so we need to prevent it from being + * written out when we're done. Allowing it to be written again + * might overwrite data in the free blocks if they are reallocated + * to a file. + * + * We prevent the buffer from being written out by clearing the + * B_DELWRI flag. We can't always + * get rid of the buf log item at this point, though, because + * the buffer may still be pinned by another transaction. If that + * is the case, then we'll wait until the buffer is committed to + * disk for the last time (we can tell by the ref count) and + * free it in xfs_buf_item_unpin(). 
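xfs_trans_log_buf() above records a byte range, and the buf log item turns it into per-chunk dirty bits so only the touched regions of the buffer end up in the log. The stand-alone sketch below shows that translation; the 128-byte chunk size echoes XFS_BLF_CHUNK, but toy_log_range and the single-word map are simplifications invented here.

#include <stdint.h>
#include <stdio.h>

#define CHUNK    128                  /* bytes tracked per dirty bit (cf. XFS_BLF_CHUNK) */
#define BUF_SIZE 4096                 /* toy buffer: 32 chunks, one 32-bit map word */

static uint32_t dirty_map;

/* Mark bytes first..last (inclusive, buffer-relative) as needing to be logged. */
static void toy_log_range(unsigned first, unsigned last)
{
    unsigned first_bit = first / CHUNK;
    unsigned last_bit  = last / CHUNK;
    unsigned i;

    for (i = first_bit; i <= last_bit; i++)
        dirty_map |= 1u << i;
}

int main(void)
{
    toy_log_range(0, 3);        /* touches only chunk 0 */
    toy_log_range(500, 700);    /* spans chunks 3..5 */
    printf("dirty map: 0x%08x\n", dirty_map);
    return 0;
}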
Until it is cleaned up we + * will keep the buffer locked so that the buffer and buf log item + * are not reused. + */ +void +xfs_trans_binval( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_binval(bip); + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * If the buffer is already invalidated, then + * just return. + */ + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); + ASSERT(tp->t_flags & XFS_TRANS_DIRTY); + return; + } + + /* + * Clear the dirty bit in the buffer and set the STALE flag + * in the buf log item. The STALE flag will be used in + * xfs_buf_item_unpin() to determine if it should clean up + * when the last reference to the buf item is given up. + * We set the XFS_BLF_CANCEL flag in the buf log format structure + * and log the buf item. This will be used at recovery time + * to determine that copies of the buffer in the log before + * this should not be replayed. + * We mark the item descriptor and the transaction dirty so + * that we'll hold the buffer until after the commit. + * + * Since we're invalidating the buffer, we also clear the state + * about which parts of the buffer have been logged. We also + * clear the flag indicating that this is an inode buffer since + * the data in the buffer will no longer be valid. + * + * We set the stale bit in the buffer as well since we're getting + * rid of it. + */ + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_STALE(bp); + bip->bli_flags |= XFS_BLI_STALE; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->bli_format.blf_flags |= XFS_BLF_CANCEL; + memset((char *)(bip->bli_format.blf_data_map), 0, + (bip->bli_format.blf_map_size * sizeof(uint))); + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY; +} + +/* + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. + * + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. + */ +void +xfs_trans_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_INODE_BUF; +} + +/* + * This call is used to indicate that the buffer is going to + * be staled and was an inode buffer. This means it gets + * special processing during unpin - where any inodes + * associated with the buffer should be removed from ail. 
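The XFS_BLF_CANCEL flag set by xfs_trans_binval() above tells log recovery not to replay earlier copies of a freed buffer. Real recovery does this in separate passes; the stand-alone sketch below only illustrates the idea of a cancel record suppressing replay, using an invented record layout.

#include <stdio.h>

/* Toy log record: either "write block N" or "block N was cancelled (freed)". */
struct rec {
    int blkno;
    int cancel;    /* plays the role of XFS_BLF_CANCEL */
};

int main(void)
{
    /* Records in log order: block 7 is written, then invalidated. */
    struct rec log[] = {
        { .blkno = 7, .cancel = 0 },
        { .blkno = 9, .cancel = 0 },
        { .blkno = 7, .cancel = 1 },
    };
    int n = sizeof(log) / sizeof(log[0]);
    int i, j;

    /* Replay pass: skip any buffer record that is followed by a cancel
     * for the same block, since those blocks may have been reused. */
    for (i = 0; i < n; i++) {
        int cancelled = 0;

        if (log[i].cancel)
            continue;
        for (j = i + 1; j < n; j++)
            if (log[j].blkno == log[i].blkno && log[j].cancel)
                cancelled = 1;
        printf("block %d: %s\n", log[i].blkno,
               cancelled ? "skipped (cancelled)" : "replayed");
    }
    return 0;
}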
+ * There is also special processing during recovery, + * any replay of the inodes in the buffer needs to be + * prevented as the buffer may have been reused. + */ +void +xfs_trans_stale_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_STALE_INODE; + bip->bli_item.li_cb = xfs_buf_iodone; +} + +/* + * Mark the buffer as being one which contains newly allocated + * inodes. We need to make sure that even if this buffer is + * relogged as an 'inode buf' we still recover all of the inode + * images in the face of a crash. This works in coordination with + * xfs_buf_item_committed() to ensure that the buffer remains in the + * AIL at its original location even after it has been relogged. + */ +/* ARGSUSED */ +void +xfs_trans_inode_alloc_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; +} + + +/* + * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of + * dquots. However, unlike in inode buffer recovery, dquot buffers get + * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). + * The only thing that makes dquot buffers different from regular + * buffers is that we must not replay dquot bufs when recovering + * if a _corresponding_ quotaoff has happened. We also have to distinguish + * between usr dquot bufs and grp dquot bufs, because usr and grp quotas + * can be turned off independently. + */ +/* ARGSUSED */ +void +xfs_trans_dquot_buf( + xfs_trans_t *tp, + xfs_buf_t *bp, + uint type) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_format.blf_flags |= type; +} diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c new file mode 100644 index 00000000..f7590f5b --- /dev/null +++ b/fs/xfs/xfs_trans_extfree.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_extfree_item.h" + +/* + * This routine is called to allocate an "extent free intention" + * log item that will hold nextents worth of extents. The + * caller must use all nextents extents, because we are not + * flexible about this at all. + */ +xfs_efi_log_item_t * +xfs_trans_get_efi(xfs_trans_t *tp, + uint nextents) +{ + xfs_efi_log_item_t *efip; + + ASSERT(tp != NULL); + ASSERT(nextents > 0); + + efip = xfs_efi_init(tp->t_mountp, nextents); + ASSERT(efip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &efip->efi_item); + return efip; +} + +/* + * This routine is called to indicate that the described + * extent is to be logged as needing to be freed. It should + * be called once for each extent to be freed. + */ +void +xfs_trans_log_efi_extent(xfs_trans_t *tp, + xfs_efi_log_item_t *efip, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) +{ + uint next_extent; + xfs_extent_t *extp; + + tp->t_flags |= XFS_TRANS_DIRTY; + efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; + ASSERT(next_extent < efip->efi_format.efi_nextents); + extp = &(efip->efi_format.efi_extents[next_extent]); + extp->ext_start = start_block; + extp->ext_len = ext_len; +} + + +/* + * This routine is called to allocate an "extent free done" + * log item that will hold nextents worth of extents. The + * caller must use all nextents extents, because we are not + * flexible about this at all. + */ +xfs_efd_log_item_t * +xfs_trans_get_efd(xfs_trans_t *tp, + xfs_efi_log_item_t *efip, + uint nextents) +{ + xfs_efd_log_item_t *efdp; + + ASSERT(tp != NULL); + ASSERT(nextents > 0); + + efdp = xfs_efd_init(tp->t_mountp, efip, nextents); + ASSERT(efdp != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &efdp->efd_item); + return efdp; +} + +/* + * This routine is called to indicate that the described + * extent is to be logged as having been freed. It should + * be called once for each extent freed. + */ +void +xfs_trans_log_efd_extent(xfs_trans_t *tp, + xfs_efd_log_item_t *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) +{ + uint next_extent; + xfs_extent_t *extp; + + tp->t_flags |= XFS_TRANS_DIRTY; + efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = start_block; + extp->ext_len = ext_len; + efdp->efd_next_extent++; +} diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c new file mode 100644 index 00000000..048b0c68 --- /dev/null +++ b/fs/xfs/xfs_trans_inode.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans_priv.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" + +#ifdef XFS_TRANS_DEBUG +STATIC void +xfs_trans_inode_broot_debug( + xfs_inode_t *ip); +#else +#define xfs_trans_inode_broot_debug(ip) +#endif + +/* + * Add a locked inode to the transaction. + * + * The inode must be locked, and it cannot be associated with any transaction. + */ +void +xfs_trans_ijoin( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + xfs_inode_log_item_t *iip; + + ASSERT(ip->i_transp == NULL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (ip->i_itemp == NULL) + xfs_inode_item_init(ip, ip->i_mount); + iip = ip->i_itemp; + ASSERT(iip->ili_lock_flags == 0); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &iip->ili_item); + + xfs_trans_inode_broot_debug(ip); + + /* + * Initialize i_transp so we can find it with xfs_inode_incore() + * in xfs_trans_iget() above. + */ + ip->i_transp = tp; +} + +/* + * Add a locked inode to the transaction. + * + * + * Grabs a reference to the inode which will be dropped when the transaction + * is committed. The inode will also be unlocked at that point. The inode + * must be locked, and it cannot be associated with any transaction. + */ +void +xfs_trans_ijoin_ref( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint lock_flags) +{ + xfs_trans_ijoin(tp, ip); + IHOLD(ip); + ip->i_itemp->ili_lock_flags = lock_flags; +} + +/* + * Transactional inode timestamp update. Requires the inode to be locked and + * joined to the transaction supplied. Relies on the transaction subsystem to + * track dirty state and update/writeback the inode accordingly. + */ +void +xfs_trans_ichgtime( + struct xfs_trans *tp, + struct xfs_inode *ip, + int flags) +{ + struct inode *inode = VFS_I(ip); + timespec_t tv; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_transp == tp); + + tv = current_fs_time(inode->i_sb); + + if ((flags & XFS_ICHGTIME_MOD) && + !timespec_equal(&inode->i_mtime, &tv)) { + inode->i_mtime = tv; + } + if ((flags & XFS_ICHGTIME_CHG) && + !timespec_equal(&inode->i_ctime, &tv)) { + inode->i_ctime = tv; + } +} + +/* + * This is called to mark the fields indicated in fieldmask as needing + * to be logged when the transaction is committed. The inode must + * already be associated with the given transaction. + * + * The values for fieldmask are defined in xfs_inode_item.h. 
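xfs_trans_ichgtime() above only assigns a new mtime/ctime when the value actually differs, so an unchanged timestamp does not dirty the inode. A small stand-alone version of that check, with an invented update_time() helper:

#include <stdio.h>
#include <time.h>

/* Update *field to *now only if it actually changed, and report whether
 * anything was modified, mirroring the timespec_equal() checks above. */
static int update_time(struct timespec *field, const struct timespec *now)
{
    if (field->tv_sec == now->tv_sec && field->tv_nsec == now->tv_nsec)
        return 0;
    *field = *now;
    return 1;
}

int main(void)
{
    struct timespec mtime = { 0, 0 }, now;

    clock_gettime(CLOCK_REALTIME, &now);
    printf("first update changed mtime:  %d\n", update_time(&mtime, &now));
    printf("second update changed mtime: %d\n", update_time(&mtime, &now));
    return 0;
}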
We always + * log all of the core inode if any of it has changed, and we always log + * all of the inline data/extents/b-tree root if any of them has changed. + */ +void +xfs_trans_log_inode( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint flags) +{ + ASSERT(ip->i_transp == tp); + ASSERT(ip->i_itemp != NULL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + tp->t_flags |= XFS_TRANS_DIRTY; + ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * Always OR in the bits from the ili_last_fields field. + * This is to coordinate with the xfs_iflush() and xfs_iflush_done() + * routines in the eventual clearing of the ilf_fields bits. + * See the big comment in xfs_iflush() for an explanation of + * this coordination mechanism. + */ + flags |= ip->i_itemp->ili_last_fields; + ip->i_itemp->ili_format.ilf_fields |= flags; +} + +#ifdef XFS_TRANS_DEBUG +/* + * Keep track of the state of the inode btree root to make sure we + * log it properly. + */ +STATIC void +xfs_trans_inode_broot_debug( + xfs_inode_t *ip) +{ + xfs_inode_log_item_t *iip; + + ASSERT(ip->i_itemp != NULL); + iip = ip->i_itemp; + if (iip->ili_root_size != 0) { + ASSERT(iip->ili_orig_root != NULL); + kmem_free(iip->ili_orig_root); + iip->ili_root_size = 0; + iip->ili_orig_root = NULL; + } + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + ASSERT((ip->i_df.if_broot != NULL) && + (ip->i_df.if_broot_bytes > 0)); + iip->ili_root_size = ip->i_df.if_broot_bytes; + iip->ili_orig_root = + (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); + memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), + iip->ili_root_size); + } +} +#endif diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h new file mode 100644 index 00000000..fe2e3cbc --- /dev/null +++ b/fs/xfs/xfs_trans_priv.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_PRIV_H__ +#define __XFS_TRANS_PRIV_H__ + +struct xfs_log_item; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; + +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); + +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, int aborted); +/* + * AIL traversal cursor. + * + * Rather than using a generation number for detecting changes in the ail, use + * a cursor that is protected by the ail lock. The aild cursor exists in the + * struct xfs_ail, but other traversals can declare it on the stack and link it + * to the ail list. + * + * When an object is deleted from or moved int the AIL, the cursor list is + * searched to see if the object is a designated cursor item. 
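The AIL cursor scheme described above invalidates any registered cursor that points at an item being removed, forcing that traversal to restart instead of following a stale pointer. A stand-alone sketch of the same idea, with invented cursor_register() and cursor_clear_item() helpers:

#include <stddef.h>
#include <stdio.h>

/* Toy traversal cursor: remembers the next item to visit.  When that item
 * is deleted, the cursor is cleared so the scan restarts from the head. */
struct cursor {
    struct cursor *next;        /* list of active cursors */
    int *item;                  /* position in the scanned data, or NULL */
};

static struct cursor *cursors;  /* all cursors currently registered */

static void cursor_register(struct cursor *cur, int *item)
{
    cur->item = item;
    cur->next = cursors;
    cursors = cur;
}

/* Called whenever an item is removed from the scanned structure. */
static void cursor_clear_item(int *item)
{
    struct cursor *cur;

    for (cur = cursors; cur != NULL; cur = cur->next)
        if (cur->item == item)
            cur->item = NULL;   /* traversal must restart */
}

int main(void)
{
    int items[3] = { 10, 20, 30 };
    struct cursor cur = { NULL, NULL };

    cursor_register(&cur, &items[1]);   /* paused at the middle item */
    cursor_clear_item(&items[1]);       /* that item gets deleted */
    printf("cursor %s\n", cur.item ? "still valid" : "invalidated, restart scan");
    return 0;
}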
If it is, it is + * deleted from the cursor so that the next time the cursor is used traversal + * will return to the start. + * + * This means a traversal colliding with a removal will cause a restart of the + * list scan, rather than any insertion or deletion anywhere in the list. The + * low bit of the item pointer is set if the cursor has been invalidated so + * that we can tell the difference between invalidation and reaching the end + * of the list to trigger traversal restarts. + */ +struct xfs_ail_cursor { + struct xfs_ail_cursor *next; + struct xfs_log_item *item; +}; + +/* + * Private AIL structures. + * + * Eventually we need to drive the locking in here as well. + */ +struct xfs_ail { + struct xfs_mount *xa_mount; + struct task_struct *xa_task; + struct list_head xa_ail; + xfs_lsn_t xa_target; + struct xfs_ail_cursor xa_cursors; + spinlock_t xa_lock; + xfs_lsn_t xa_last_pushed_lsn; +}; + +/* + * From xfs_trans_ail.c + */ +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + +void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items) + __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) __releases(ailp->xa_lock) +{ + xfs_trans_ail_delete_bulk(ailp, &lip, 1); +} + +void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push_all(struct xfs_ail *); +xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); + +void xfs_trans_unlocked_item(struct xfs_ail *, + xfs_log_item_t *); + +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); + +#if BITS_PER_LONG != 64 +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + spin_lock(&ailp->xa_lock); + *dst = *src; + spin_unlock(&ailp->xa_lock); +} +#else +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); + *dst = *src; +} +#endif +#endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h new file mode 100644 index 00000000..7d2c920d --- /dev/null +++ b/fs/xfs/xfs_trans_space.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_SPACE_H__ +#define __XFS_TRANS_SPACE_H__ + +/* + * Components of space reservations. + */ +#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ + (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) +#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) +#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\ + (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ + XFS_EXTENTADD_SPACE_RES(mp,w)) +#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1) +#define XFS_DAENTER_DBS(mp,w) \ + (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0)) +#define XFS_DAENTER_BLOCKS(mp,w) \ + (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w)) +#define XFS_DAENTER_BMAP1B(mp,w) \ + XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w) +#define XFS_DAENTER_BMAPS(mp,w) \ + (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w)) +#define XFS_DAENTER_SPACE_RES(mp,w) \ + (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w)) +#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w) +#define XFS_DIRENTER_MAX_SPLIT(mp,nl) 1 +#define XFS_DIRENTER_SPACE_RES(mp,nl) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \ + XFS_DIRENTER_MAX_SPLIT(mp,nl)) +#define XFS_DIRREMOVE_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) +#define XFS_IALLOC_SPACE_RES(mp) \ + (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1) + +/* + * Space reservation values for various transactions. + */ +#define XFS_ADDAFORK_SPACE_RES(mp) \ + ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) +#define XFS_ATTRRM_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK) +/* This macro is not used - see inline code in xfs_attr_set */ +#define XFS_ATTRSET_SPACE_RES(mp, v) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v)) +#define XFS_CREATE_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_DIOSTRAT_SPACE_RES(mp, v) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) +#define XFS_GROWFS_SPACE_RES(mp) \ + (2 * XFS_AG_MAXLEVELS(mp)) +#define XFS_GROWFSRT_SPACE_RES(mp,b) \ + ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) +#define XFS_LINK_SPACE_RES(mp,nl) \ + XFS_DIRENTER_SPACE_RES(mp,nl) +#define XFS_MKDIR_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_QM_DQALLOC_SPACE_RES(mp) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \ + XFS_DQUOT_CLUSTER_SIZE_FSB) +#define XFS_QM_QINOCREATE_SPACE_RES(mp) \ + XFS_IALLOC_SPACE_RES(mp) +#define XFS_REMOVE_SPACE_RES(mp) \ + XFS_DIRREMOVE_SPACE_RES(mp) +#define XFS_RENAME_SPACE_RES(mp,nl) \ + (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) + +#endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h new file mode 100644 index 00000000..65584b55 --- /dev/null +++ b/fs/xfs/xfs_types.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
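Most of the reservation macros above reduce to round-up division: the number of extents being added is divided, rounding up, into groups that can share one worst-case btree split. The sketch below walks that arithmetic with invented geometry values (the mxr/mnr and level numbers are placeholders, not real filesystem parameters):

#include <stdio.h>

/* Generic round-up division: how many groups of size n cover b items. */
static unsigned ceil_div(unsigned b, unsigned n)
{
    return (b + n - 1) / n;
}

int main(void)
{
    /* Invented stand-ins for (mp)->m_alloc_mxr[0] and (mp)->m_alloc_mnr[0]. */
    unsigned alloc_mxr0 = 340, alloc_mnr0 = 170;
    unsigned max_contig = alloc_mxr0 - alloc_mnr0;  /* extents sharing one block */
    unsigned extentadd  = 8;      /* stand-in for XFS_BM_MAXLEVELS(mp,w) - 1 */
    unsigned nextents   = 1000;   /* extents this operation may add */

    /* Mirrors the XFS_NEXTENTADD_SPACE_RES shape: blocks reserved for btree growth. */
    unsigned res = ceil_div(nextents, max_contig) * extentadd;

    printf("%u new extents -> reserve %u blocks\n", nextents, res);
    return 0;
}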
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TYPES_H__ +#define __XFS_TYPES_H__ + +#ifdef __KERNEL__ + +/* + * Additional type declarations for XFS + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef signed short int __int16_t; +typedef unsigned short int __uint16_t; +typedef signed int __int32_t; +typedef unsigned int __uint32_t; +typedef signed long long int __int64_t; +typedef unsigned long long int __uint64_t; + +typedef enum { B_FALSE,B_TRUE } boolean_t; +typedef __uint32_t prid_t; /* project ID */ +typedef __uint32_t inst_t; /* an instruction */ + +typedef __s64 xfs_off_t; /* type */ +typedef unsigned long long xfs_ino_t; /* type */ +typedef __s64 xfs_daddr_t; /* type */ +typedef char * xfs_caddr_t; /* type */ +typedef __u32 xfs_dev_t; +typedef __u32 xfs_nlink_t; + +/* __psint_t is the same size as a pointer */ +#if (BITS_PER_LONG == 32) +typedef __int32_t __psint_t; +typedef __uint32_t __psunsigned_t; +#elif (BITS_PER_LONG == 64) +typedef __int64_t __psint_t; +typedef __uint64_t __psunsigned_t; +#else +#error BITS_PER_LONG must be 32 or 64 +#endif + +#endif /* __KERNEL__ */ + +typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef __uint32_t xfs_agnumber_t; /* allocation group number */ +typedef __int32_t xfs_extnum_t; /* # of extents in a file */ +typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef __int64_t xfs_fsize_t; /* bytes in a file */ +typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ + +typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */ + +typedef __int64_t xfs_lsn_t; /* log sequence number */ +typedef __int32_t xfs_tid_t; /* transaction identifier */ + +typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ +typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ + +/* + * These types are 64 bits on disk but are either 32 or 64 bits in memory. + * Disk based types: + */ +typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */ +typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */ +typedef __uint64_t xfs_dfiloff_t; /* block number in a file */ +typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */ + +/* + * Memory based types are conditional. 
+ */ +#if XFS_BIG_BLKNOS +typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +#else +typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +#endif +typedef __uint64_t xfs_fileoff_t; /* block number in a file */ +typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ +typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ + +/* + * Null values for the types. + */ +#define NULLDFSBNO ((xfs_dfsbno_t)-1) +#define NULLDRFSBNO ((xfs_drfsbno_t)-1) +#define NULLDRTBNO ((xfs_drtbno_t)-1) +#define NULLDFILOFF ((xfs_dfiloff_t)-1) + +#define NULLFSBLOCK ((xfs_fsblock_t)-1) +#define NULLRFSBLOCK ((xfs_rfsblock_t)-1) +#define NULLRTBLOCK ((xfs_rtblock_t)-1) +#define NULLFILEOFF ((xfs_fileoff_t)-1) + +#define NULLAGBLOCK ((xfs_agblock_t)-1) +#define NULLAGNUMBER ((xfs_agnumber_t)-1) +#define NULLEXTNUM ((xfs_extnum_t)-1) + +#define NULLCOMMITLSN ((xfs_lsn_t)-1) + +/* + * Max values for extlen, extnum, aextnum. + */ +#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ +#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ +#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ + +/* + * Min numbers of data/attr fork btree root pointers. + */ +#define MINDBTPTRS 3 +#define MINABTPTRS 2 + +/* + * MAXNAMELEN is the length (including the terminating null) of + * the longest permissible file (component) name. + */ +#define MAXNAMELEN 256 + +typedef enum { + XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi +} xfs_lookup_t; + +typedef enum { + XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, + XFS_BTNUM_MAX +} xfs_btnum_t; + +struct xfs_name { + const unsigned char *name; + int len; +}; + +#endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c new file mode 100644 index 00000000..8b32d1a4 --- /dev/null +++ b/fs/xfs/xfs_utils.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_itable.h" +#include "xfs_utils.h" + + +/* + * Allocates a new inode from disk and return a pointer to the + * incore copy. This routine will internally commit the current + * transaction and allocate a new one if the Space Manager needed + * to do an allocation to replenish the inode free-list. + * + * This routine is designed to be called from xfs_create and + * xfs_create_dir. + * + */ +int +xfs_dir_ialloc( + xfs_trans_t **tpp, /* input: current transaction; + output: may be a new transaction. */ + xfs_inode_t *dp, /* directory within whose allocate + the inode. */ + mode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, /* project id */ + int okalloc, /* ok to allocate new space */ + xfs_inode_t **ipp, /* pointer to inode; it will be + locked. */ + int *committed) + +{ + xfs_trans_t *tp; + xfs_trans_t *ntp; + xfs_inode_t *ip; + xfs_buf_t *ialloc_context = NULL; + boolean_t call_again = B_FALSE; + int code; + uint log_res; + uint log_count; + void *dqinfo; + uint tflags; + + tp = *tpp; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* + * xfs_ialloc will return a pointer to an incore inode if + * the Space Manager has an available inode on the free + * list. Otherwise, it will do an allocation and replenish + * the freelist. Since we can only do one allocation per + * transaction without deadlocks, we will need to commit the + * current transaction and start a new one. We will then + * need to call xfs_ialloc again to get the inode. + * + * If xfs_ialloc did an allocation to replenish the freelist, + * it returns the bp containing the head of the freelist as + * ialloc_context. We will hold a lock on it across the + * transaction commit so that no other process can steal + * the inode(s) that we've just allocated. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, + &ialloc_context, &call_again, &ip); + + /* + * Return an error if we were unable to allocate a new inode. + * This should only happen if we run out of space on disk or + * encounter a disk error. + */ + if (code) { + *ipp = NULL; + return code; + } + if (!call_again && (ip == NULL)) { + *ipp = NULL; + return XFS_ERROR(ENOSPC); + } + + /* + * If call_again is set, then we were unable to get an + * inode in one operation. We need to commit the current + * transaction and call xfs_ialloc() again. It is guaranteed + * to succeed the second time. + */ + if (call_again) { + + /* + * Normally, xfs_trans_commit releases all the locks. + * We call bhold to hang on to the ialloc_context across + * the commit. Holding this buffer prevents any other + * processes from doing any allocations in this + * allocation group. + */ + xfs_trans_bhold(tp, ialloc_context); + /* + * Save the log reservation so we can use + * them in the next transaction. 
+ */ + log_res = xfs_trans_get_log_res(tp); + log_count = xfs_trans_get_log_count(tp); + + /* + * We want the quota changes to be associated with the next + * transaction, NOT this one. So, detach the dqinfo from this + * and attach it to the next transaction. + */ + dqinfo = NULL; + tflags = 0; + if (tp->t_dqinfo) { + dqinfo = (void *)tp->t_dqinfo; + tp->t_dqinfo = NULL; + tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; + tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); + } + + ntp = xfs_trans_dup(tp); + code = xfs_trans_commit(tp, 0); + tp = ntp; + if (committed != NULL) { + *committed = 1; + } + /* + * If we get an error during the commit processing, + * release the buffer that is still held and return + * to the caller. + */ + if (code) { + xfs_buf_relse(ialloc_context); + if (dqinfo) { + tp->t_dqinfo = dqinfo; + xfs_trans_free_dqinfo(tp); + } + *tpp = ntp; + *ipp = NULL; + return code; + } + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + code = xfs_trans_reserve(tp, 0, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); + /* + * Re-attach the quota info that we detached from prev trx. + */ + if (dqinfo) { + tp->t_dqinfo = dqinfo; + tp->t_flags |= tflags; + } + + if (code) { + xfs_buf_relse(ialloc_context); + *tpp = ntp; + *ipp = NULL; + return code; + } + xfs_trans_bjoin(tp, ialloc_context); + + /* + * Call ialloc again. Since we've locked out all + * other allocations in this allocation group, + * this call should always succeed. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, + okalloc, &ialloc_context, &call_again, &ip); + + /* + * If we get an error at this point, return to the caller + * so that the current transaction can be aborted. + */ + if (code) { + *tpp = tp; + *ipp = NULL; + return code; + } + ASSERT ((!call_again) && (ip != NULL)); + + } else { + if (committed != NULL) { + *committed = 0; + } + } + + *ipp = ip; + *tpp = tp; + + return 0; +} + +/* + * Decrement the link count on an inode & log the change. + * If this causes the link count to go to zero, initiate the + * logging activity required to truncate a file. + */ +int /* error */ +xfs_droplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + int error; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT (ip->i_d.di_nlink > 0); + ip->i_d.di_nlink--; + drop_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = 0; + if (ip->i_d.di_nlink == 0) { + /* + * We're dropping the last link to this file. + * Move the on-disk inode to the AGI unlinked list. + * From xfs_inactive() we will pull the inode from + * the list and free it. + */ + error = xfs_iunlink(tp, ip); + } + return error; +} + +/* + * This gets called when the inode's version needs to be changed from 1 to 2. + * Currently this happens when the nlink field overflows the old 16-bit value + * or when chproj is called to change the project for the first time. + * As a side effect the superblock version will also get rev'd + * to contain the NLINK bit. 
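xfs_dir_ialloc() above may be told by xfs_ialloc() to commit and call again once the inode freelist has been replenished, holding the freelist buffer across the commit so nobody else can take the new inodes. The stand-alone sketch below shows only the call-again control flow, with an invented toy_alloc() in place of the real allocator:

#include <stdio.h>

/* Toy allocator that sometimes has to refill a free list first.  The first
 * call may set need_retry, in which case the caller "commits" and calls
 * again; the second call then succeeds from the refilled list. */
static int freelist = 0;

static void toy_alloc(int *out, int *need_retry)
{
    if (freelist == 0) {
        freelist = 4;        /* refill; caller must retry in a fresh step */
        *need_retry = 1;
        return;
    }
    freelist--;
    *out = 42;               /* the "inode" we handed out */
    *need_retry = 0;
}

int main(void)
{
    int ino = -1, need_retry = 0;

    toy_alloc(&ino, &need_retry);
    if (need_retry) {
        printf("freelist was refilled; committing and retrying\n");
        toy_alloc(&ino, &need_retry);
    }
    printf("allocated %d (retry needed again: %d)\n", ino, need_retry);
    return 0;
}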
+ */ +void +xfs_bump_ino_vers2( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_d.di_version == 1); + + ip->i_d.di_version = 2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + mp = tp->t_mountp; + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + xfs_sb_version_addnlink(&mp->m_sb); + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, XFS_SB_VERSIONNUM); + } else { + spin_unlock(&mp->m_sb_lock); + } + } + /* Caller must log the inode */ +} + +/* + * Increment the link count on an inode & log the change. + */ +int +xfs_bumplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + if (ip->i_d.di_nlink >= XFS_MAXLINK) + return XFS_ERROR(EMLINK); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT(ip->i_d.di_nlink > 0); + ip->i_d.di_nlink++; + inc_nlink(VFS_I(ip)); + if ((ip->i_d.di_version == 1) && + (ip->i_d.di_nlink > XFS_MAXLINK_1)) { + /* + * The inode has increased its number of links beyond + * what can fit in an old format inode. It now needs + * to be converted to a version 2 inode with a 32 bit + * link count. If this is the first inode in the file + * system to do this, then we need to bump the superblock + * version number as well. + */ + xfs_bump_ino_vers2(tp, ip); + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h new file mode 100644 index 00000000..456fca31 --- /dev/null +++ b/fs/xfs/xfs_utils.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_UTILS_H__ +#define __XFS_UTILS_H__ + +extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, + xfs_dev_t, prid_t, int, xfs_inode_t **, int *); +extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); +extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); +extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); + +#endif /* __XFS_UTILS_H__ */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c new file mode 100644 index 00000000..59509ae0 --- /dev/null +++ b/fs/xfs/xfs_vnodeops.c @@ -0,0 +1,2852 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_itable.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_acl.h" +#include "xfs_attr.h" +#include "xfs_rw.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_rtalloc.h" +#include "xfs_trans_space.h" +#include "xfs_log_priv.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" + +int +xfs_setattr( + struct xfs_inode *ip, + struct iattr *iattr, + int flags) +{ + xfs_mount_t *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int mask = iattr->ia_valid; + xfs_trans_t *tp; + int code; + uint lock_flags; + uint commit_flags=0; + uid_t uid=0, iuid=0; + gid_t gid=0, igid=0; + struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; + int need_iolock = 1; + + trace_xfs_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + code = -inode_change_ok(inode, iattr); + if (code) + return code; + + olddquot1 = olddquot2 = NULL; + udqp = gdqp = NULL; + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { + uint qflags = 0; + + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { + uid = iattr->ia_uid; + qflags |= XFS_QMOPT_UQUOTA; + } else { + uid = ip->i_d.di_uid; + } + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { + gid = iattr->ia_gid; + qflags |= XFS_QMOPT_GQUOTA; + } else { + gid = ip->i_d.di_gid; + } + + /* + * We take a reference when we initialize udqp and gdqp, + * so it is important that we never blindly double trip on + * the same variable. See xfs_create() for an example. + */ + ASSERT(udqp == NULL); + ASSERT(gdqp == NULL); + code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), + qflags, &udqp, &gdqp); + if (code) + return code; + } + + /* + * For the other attributes, we acquire the inode lock and + * first do an error checking pass. + */ + tp = NULL; + lock_flags = XFS_ILOCK_EXCL; + if (flags & XFS_ATTR_NOLOCK) + need_iolock = 0; + if (!(mask & ATTR_SIZE)) { + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + commit_flags = 0; + code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), + 0, 0, 0); + if (code) { + lock_flags = 0; + goto error_return; + } + } else { + if (need_iolock) + lock_flags |= XFS_IOLOCK_EXCL; + } + + xfs_ilock(ip, lock_flags); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * These IDs could have changed since we last looked at them. 
+ * But, we're assured that if the ownership did change + * while we didn't have the inode locked, inode's dquot(s) + * would have changed also. + */ + iuid = ip->i_d.di_uid; + igid = ip->i_d.di_gid; + gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; + uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; + + /* + * Do a quota reservation only if uid/gid is actually + * going to change. + */ + if (XFS_IS_QUOTA_RUNNING(mp) && + ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || + (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { + ASSERT(tp); + code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, + capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_return; + } + } + + /* + * Truncate file. Must have write permission and not be a directory. + */ + if (mask & ATTR_SIZE) { + /* Short circuit the truncate case for zero length files */ + if (iattr->ia_size == 0 && + ip->i_size == 0 && ip->i_d.di_nextents == 0) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + lock_flags &= ~XFS_ILOCK_EXCL; + if (mask & ATTR_CTIME) { + inode->i_mtime = inode->i_ctime = + current_fs_time(inode->i_sb); + xfs_mark_inode_dirty_sync(ip); + } + code = 0; + goto error_return; + } + + if (S_ISDIR(ip->i_d.di_mode)) { + code = XFS_ERROR(EISDIR); + goto error_return; + } else if (!S_ISREG(ip->i_d.di_mode)) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + + /* + * Make sure that the dquots are attached to the inode. + */ + code = xfs_qm_dqattach_locked(ip, 0); + if (code) + goto error_return; + + /* + * Now we can make the changes. Before we join the inode + * to the transaction, if ATTR_SIZE is set then take care of + * the part of the truncation that must be done without the + * inode lock. This needs to be done before joining the inode + * to the transaction, because the inode cannot be unlocked + * once it is a part of the transaction. + */ + if (iattr->ia_size > ip->i_size) { + /* + * Do the first part of growing a file: zero any data + * in the last block that is beyond the old EOF. We + * need to do this before the inode is joined to the + * transaction to modify the i_size. + */ + code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); + if (code) + goto error_return; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + lock_flags &= ~XFS_ILOCK_EXCL; + + /* + * We are going to log the inode size change in this + * transaction so any previous writes that are beyond the on + * disk EOF and the new EOF that have not been written out need + * to be written here. If we do not write the data out, we + * expose ourselves to the null files problem. + * + * Only flush from the on disk size to the smaller of the in + * memory file size or the new size as that's the range we + * really care about here and prevents waiting for other data + * not within the range we care about here. 
+ */ + if (ip->i_size != ip->i_d.di_size && + iattr->ia_size > ip->i_d.di_size) { + code = xfs_flush_pages(ip, + ip->i_d.di_size, iattr->ia_size, + XBF_ASYNC, FI_NONE); + if (code) + goto error_return; + } + + /* wait for all I/O to complete */ + xfs_ioend_wait(ip); + + code = -block_truncate_page(inode->i_mapping, iattr->ia_size, + xfs_get_blocks); + if (code) + goto error_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); + code = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (code) + goto error_return; + + truncate_setsize(inode, iattr->ia_size); + + commit_flags = XFS_TRANS_RELEASE_LOG_RES; + lock_flags |= XFS_ILOCK_EXCL; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip); + + /* + * Only change the c/mtime if we are changing the size + * or we are explicitly asked to change it. This handles + * the semantic difference between truncate() and ftruncate() + * as implemented in the VFS. + * + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME + * is a special case where we need to update the times despite + * not having these flags set. For all other operations the + * VFS set these flags explicitly if it wants a timestamp + * update. + */ + if (iattr->ia_size != ip->i_size && + (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + iattr->ia_ctime = iattr->ia_mtime = + current_fs_time(inode->i_sb); + mask |= ATTR_CTIME | ATTR_MTIME; + } + + if (iattr->ia_size > ip->i_size) { + ip->i_d.di_size = iattr->ia_size; + ip->i_size = iattr->ia_size; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } else if (iattr->ia_size <= ip->i_size || + (iattr->ia_size == 0 && ip->i_d.di_nextents)) { + /* + * signal a sync transaction unless + * we're truncating an already unlinked + * file on a wsync filesystem + */ + code = xfs_itruncate_finish(&tp, ip, iattr->ia_size, + XFS_DATA_FORK, + ((ip->i_d.di_nlink != 0 || + !(mp->m_flags & XFS_MOUNT_WSYNC)) + ? 1 : 0)); + if (code) + goto abort_return; + /* + * Truncated "down", so we're removing references + * to old data here - if we now delay flushing for + * a long time, we expose ourselves unduly to the + * notorious NULL files problem. So, we mark this + * vnode and flush it when the file is closed, and + * do not wait the usual (long) time for writeout. + */ + xfs_iflags_set(ip, XFS_ITRUNCATED); + } + } else if (tp) { + xfs_trans_ijoin(tp, ip); + } + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) { + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + } + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (iuid != uid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(mask & ATTR_UID); + ASSERT(udqp); + olddquot1 = xfs_qm_vop_chown(tp, ip, + &ip->i_udquot, udqp); + } + ip->i_d.di_uid = uid; + inode->i_uid = uid; + } + if (igid != gid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(!XFS_IS_PQUOTA_ON(mp)); + ASSERT(mask & ATTR_GID); + ASSERT(gdqp); + olddquot2 = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } + ip->i_d.di_gid = gid; + inode->i_gid = gid; + } + } + + /* + * Change file access modes. 
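The chown handling above clears the setuid and setgid bits unless the caller has CAP_FSETID. The same mode fix-up as a small stand-alone program (chown_fixup_mode is an invented helper name):

#include <stdio.h>
#include <sys/stat.h>

/* Strip the setuid/setgid bits on ownership change unless the caller is
 * allowed to keep them (the CAP_FSETID case above). */
static mode_t chown_fixup_mode(mode_t mode, int has_fsetid)
{
    if (!has_fsetid)
        mode &= ~(S_ISUID | S_ISGID);
    return mode;
}

int main(void)
{
    mode_t mode = S_IFREG | S_ISUID | 0755;

    printf("without CAP_FSETID: %o\n", (unsigned)chown_fixup_mode(mode, 0));
    printf("with CAP_FSETID:    %o\n", (unsigned)chown_fixup_mode(mode, 1));
    return 0;
}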
+ */ + if (mask & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; + } + + /* + * Change file access or modified times. + */ + if (mask & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + ip->i_update_core = 1; + } + if (mask & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + ip->i_update_core = 1; + } + if (mask & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + ip->i_update_core = 1; + } + + /* + * And finally, log the inode core if any attribute in it + * has been changed. + */ + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE| + ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + * This is slightly sub-optimal in that truncates require + * two sync transactions instead of one for wsync filesystems. + * One for the truncate and one for the timestamps since we + * don't want to change the timestamps unless we're sure the + * truncate worked. Truncates are less than 1% of the laddis + * mix so this probably isn't worth the trouble to optimize. + */ + code = 0; + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + code = xfs_trans_commit(tp, commit_flags); + + xfs_iunlock(ip, lock_flags); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + xfs_qm_dqrele(olddquot1); + xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (code) + return code; + + /* + * XXX(hch): Updating the ACL entries is not atomic vs the i_mode + * update. We could avoid this with linked transactions + * and passing down the transaction pointer all the way + * to attr_set. No previous user of the generic + * Posix ACL code seems to care about this issue either. + */ + if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { + code = -xfs_acl_chmod(inode); + if (code) + return XFS_ERROR(code); + } + + return 0; + + abort_return: + commit_flags |= XFS_TRANS_ABORT; + error_return: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + if (tp) { + xfs_trans_cancel(tp, commit_flags); + } + if (lock_flags != 0) { + xfs_iunlock(ip, lock_flags); + } + return code; +} + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 2 extents back from + * bmapi. 
+ */ +#define SYMLINK_MAPS 2 + +STATIC int +xfs_readlink_bmap( + xfs_inode_t *ip, + char *link) +{ + xfs_mount_t *mp = ip->i_mount; + int pathlen = ip->i_d.di_size; + int nmaps = SYMLINK_MAPS; + xfs_bmbt_irec_t mval[SYMLINK_MAPS]; + xfs_daddr_t d; + int byte_cnt; + int n; + xfs_buf_t *bp; + int error = 0; + + error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, + mval, &nmaps, NULL); + if (error) + goto out; + + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), + XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); + error = XFS_BUF_GETERROR(bp); + if (error) { + xfs_ioerror_alert("xfs_readlink", + ip->i_mount, bp, XFS_BUF_ADDR(bp)); + xfs_buf_relse(bp); + goto out; + } + if (pathlen < byte_cnt) + byte_cnt = pathlen; + pathlen -= byte_cnt; + + memcpy(link, XFS_BUF_PTR(bp), byte_cnt); + xfs_buf_relse(bp); + } + + link[ip->i_d.di_size] = '\0'; + error = 0; + + out: + return error; +} + +int +xfs_readlink( + xfs_inode_t *ip, + char *link) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = 0; + + trace_xfs_readlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + pathlen = ip->i_d.di_size; + if (!pathlen) + goto out; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + error = XFS_ERROR(EFSCORRUPTED); + goto out; + } + + + if (ip->i_df.if_flags & XFS_IFINLINE) { + memcpy(link, ip->i_df.if_u1.if_data, pathlen); + link[pathlen] = '\0'; + } else { + error = xfs_readlink_bmap(ip, link); + } + + out: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Flags for xfs_free_eofblocks + */ +#define XFS_FREE_EOF_TRYLOCK (1<<0) + +/* + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. + */ +STATIC int +xfs_free_eofblocks( + xfs_mount_t *mp, + xfs_inode_t *ip, + int flags) +{ + xfs_trans_t *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + xfs_bmbt_irec_t imap; + + /* + * Figure out if there are any blocks beyond the end + * of the file. If not, then there is nothing to do. + */ + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); + last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + if (last_fsb <= end_fsb) + return 0; + map_len = last_fsb - end_fsb; + + nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, + NULL, 0, &imap, &nimaps, NULL); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!error && (nimaps != 0) && + (imap.br_startblock != HOLESTARTBLOCK || + ip->i_delayed_blks)) { + /* + * Attach the dquots to the inode up front. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* + * There are blocks after the end of file. + * Free them up now by truncating the file to + * its current size. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + /* + * Do the xfs_itruncate_start() call before + * reserving any log space because + * itruncate_start will call into the buffer + * cache and we can't + * do that within a transaction. 
+ */ + if (flags & XFS_FREE_EOF_TRYLOCK) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + xfs_trans_cancel(tp, 0); + return 0; + } + } else { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + } + error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, + ip->i_size); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + error = xfs_trans_reserve(tp, 0, + XFS_ITRUNCATE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + error = xfs_itruncate_finish(&tp, ip, + ip->i_size, + XFS_DATA_FORK, + 0); + /* + * If we get an error at this point we + * simply don't bother truncating the file. + */ + if (error) { + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + } else { + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES); + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL); + } + return error; +} + +/* + * Free a symlink that has blocks associated with it. + */ +STATIC int +xfs_inactive_symlink_rmt( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[SYMLINK_MAPS]; + int nmaps; + xfs_trans_t *ntp; + int size; + xfs_trans_t *tp; + + tp = *tpp; + mp = ip->i_mount; + ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + *tpp = NULL; + return error; + } + /* + * Lock the inode, fix the size, and join it to the transaction. + * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_ijoin(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. + */ + done = 0; + xfs_bmap_init(&free_list, &first_block); + nmaps = ARRAY_SIZE(mval); + if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), + XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, + &free_list))) + goto error0; + /* + * Invalidate the block(s). + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done))) + goto error1; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) + goto error1; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. 
+ */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Get a new, empty transaction to return to our caller. + */ + ntp = xfs_trans_dup(tp); + /* + * Commit the transaction containing extent freeing and EFDs. + * If we get an error on the commit here or on the reserve below, + * we need to unlock the inode since the new transaction doesn't + * have the inode attached. + */ + error = xfs_trans_commit(tp, 0); + tp = ntp; + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + + /* + * Remove the memory for extent descriptions (just bookkeeping). + */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + /* + * Put an itruncate log reservation in the new transaction + * for our caller. + */ + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + /* + * Return with the inode locked but not joined to the transaction. + */ + *tpp = tp; + return 0; + + error1: + xfs_bmap_cancel(&free_list); + error0: + /* + * Have to come here with the inode locked and either + * (held and in the transaction) or (not in the transaction). + * If the inode isn't held then cancel would iput it, but + * that's wrong since this is inactive and the vnode ref + * count is 0 already. + * Cancel won't do anything to the inode if held, but it still + * needs to be locked until the cancel is done, if it was + * joined to the transaction. + */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + *tpp = NULL; + return error; + +} + +STATIC int +xfs_inactive_symlink_local( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + int error; + + ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip)); + /* + * We're freeing a symlink which fit into + * the inode. Just free the memory used + * to hold the old symlink. + */ + error = xfs_trans_reserve(*tpp, 0, + XFS_ITRUNCATE_LOG_RES(ip->i_mount), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + + if (error) { + xfs_trans_cancel(*tpp, 0); + *tpp = NULL; + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + /* + * Zero length symlinks _can_ exist. 
+ */ + if (ip->i_df.if_bytes > 0) { + xfs_idata_realloc(ip, + -(ip->i_df.if_bytes), + XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + } + return 0; +} + +STATIC int +xfs_inactive_attrs( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + xfs_trans_t *tp; + int error; + xfs_mount_t *mp; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + tp = *tpp; + mp = ip->i_mount; + ASSERT(ip->i_d.di_forkoff != 0); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto error_unlock; + + error = xfs_attr_inactive(ip); + if (error) + goto error_unlock; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, 0, + XFS_IFREE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_INACTIVE_LOG_COUNT); + if (error) + goto error_cancel; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + ASSERT(ip->i_d.di_anextents == 0); + + *tpp = tp; + return 0; + +error_cancel: + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); +error_unlock: + *tpp = NULL; + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + +int +xfs_release( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + int error; + + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + return 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + if (!XFS_FORCED_SHUTDOWN(mp)) { + int truncated; + + /* + * If we are using filestreams, and we have an unlinked + * file that we are processing the last close on, then nothing + * will be able to reopen and write to this file. Purge this + * inode from the filestreams cache so that it doesn't delay + * teardown of the inode. + */ + if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticeable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) + xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + } + } + + if (ip->i_d.di_nlink == 0) + return 0; + + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * outstanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. 
Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, + XFS_FREE_EOF_TRYLOCK); + if (error) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } + return 0; +} + +/* + * xfs_inactive + * + * This is called when the vnode reference count for the vnode + * goes to zero. If the file has been unlinked, then it must + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. + */ +int +xfs_inactive( + xfs_inode_t *ip) +{ + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + xfs_trans_t *tp; + xfs_mount_t *mp; + int error; + int truncate; + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) { + ASSERT(ip->i_df.if_real_bytes == 0); + ASSERT(ip->i_df.if_broot_bytes == 0); + return VN_INACTIVE_CACHE; + } + + /* + * Only do a truncate if it's a regular file with + * some actual space in it. It's OK to look at the + * inode's fields without the lock because we're the + * only one with a reference to the inode. + */ + truncate = ((ip->i_d.di_nlink == 0) && + ((ip->i_d.di_size != 0) || (ip->i_size != 0) || + (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && + ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); + + mp = ip->i_mount; + + error = 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + goto out; + + if (ip->i_d.di_nlink != 0) { + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS) && + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || + (ip->i_delayed_blks != 0)))) { + error = xfs_free_eofblocks(mp, ip, 0); + if (error) + return VN_INACTIVE_CACHE; + } + goto out; + } + + ASSERT(ip->i_d.di_nlink == 0); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return VN_INACTIVE_CACHE; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + if (truncate) { + /* + * Do the xfs_itruncate_start() call before + * reserving any log space because itruncate_start + * will call into the buffer cache and we can't + * do that within a transaction. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return VN_INACTIVE_CACHE; + } + + error = xfs_trans_reserve(tp, 0, + XFS_ITRUNCATE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + /* Don't call itruncate_cleanup */ + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return VN_INACTIVE_CACHE; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + /* + * normally, we have to run xfs_itruncate_finish sync. + * But if filesystem is wsync and we're in the inactive + * path, then we know that nlink == 0, and that the + * xaction that made nlink == 0 is permanently committed + * since xfs_remove runs as a synchronous transaction. 
+ */ + error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, + (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0)); + + if (error) { + xfs_trans_cancel(tp, + XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + return VN_INACTIVE_CACHE; + } + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) { + + /* + * If we get an error while cleaning up a + * symlink we bail out. + */ + error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ? + xfs_inactive_symlink_rmt(ip, &tp) : + xfs_inactive_symlink_local(ip, &tp); + + if (error) { + ASSERT(tp == NULL); + return VN_INACTIVE_CACHE; + } + + xfs_trans_ijoin(tp, ip); + } else { + error = xfs_trans_reserve(tp, 0, + XFS_IFREE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_INACTIVE_LOG_COUNT); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return VN_INACTIVE_CACHE; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip); + } + + /* + * If there are attributes associated with the file + * then blow them away now. The code calls a routine + * that recursively deconstructs the attribute fork. + * We need to just commit the current transaction + * because we can't use it for xfs_attr_inactive(). + */ + if (ip->i_d.di_anextents > 0) { + error = xfs_inactive_attrs(ip, &tp); + /* + * If we got an error, the transaction is already + * cancelled, and the inode is unlocked. Just get out. + */ + if (error) + return VN_INACTIVE_CACHE; + } else if (ip->i_afp) { + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + } + + /* + * Free the inode. + */ + xfs_bmap_init(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. + */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_notice(mp, "%s: xfs_ifree returned error %d", + __func__, error); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } else { + /* + * Credit the quota account(s). The inode is gone. + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + __func__, error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_notice(mp, "%s: xfs_trans_commit returned error %d", + __func__, error); + } + + /* + * Release the dquots held by inode, if any. + */ + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + + out: + return VN_INACTIVE_CACHE; +} + +/* + * Lookups up an inode from "name". If ci_name is not NULL, then a CI match + * is allowed, otherwise it has to be an exact match. If a CI match is found, + * ci_name->name will point to a the actual name (caller must free) or + * will be set to NULL if an exact match is found. 
+ */ +int +xfs_lookup( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t **ipp, + struct xfs_name *ci_name) +{ + xfs_ino_t inum; + int error; + uint lock_mode; + + trace_xfs_lookup(dp, name); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); + + lock_mode = xfs_ilock_map_shared(dp); + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); + xfs_iunlock_map_shared(dp, lock_mode); + + if (error) + goto out; + + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); + if (error) + goto out_free_name; + + return 0; + +out_free_name: + if (ci_name) + kmem_free(ci_name->name); +out: + *ipp = NULL; + return error; +} + +int +xfs_create( + xfs_inode_t *dp, + struct xfs_name *name, + mode_t mode, + xfs_dev_t rdev, + xfs_inode_t **ipp) +{ + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + boolean_t unlock_dp_on_error = B_FALSE; + uint cancel_flags; + int committed; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + uint resblks; + uint log_res; + uint log_count; + + trace_xfs_create(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = xfs_get_projid(dp); + else + prid = XFS_PROJID_DEFAULT; + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); + if (error) + return error; + + if (is_dir) { + rdev = 0; + resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + log_res = XFS_MKDIR_LOG_RES(mp); + log_count = XFS_MKDIR_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); + } else { + resblks = XFS_CREATE_SPACE_RES(mp, name->len); + log_res = XFS_CREATE_LOG_RES(mp); + log_count = XFS_CREATE_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); + } + + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * Initially assume that the file does not exist and + * reserve the resources for that case. If that is not + * the case we'll drop the one we have and get a more + * appropriate transaction later. + */ + error = xfs_trans_reserve(tp, resblks, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); + if (error == ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(dp); + error = xfs_trans_reserve(tp, resblks, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); + } + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, 0, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = B_TRUE; + + /* + * Check for directory link count overflow. + */ + if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto out_trans_cancel; + } + + xfs_bmap_init(&free_list, &first_block); + + /* + * Reserve disk quota and the inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_canenter(tp, dp, name, resblks); + if (error) + goto out_trans_cancel; + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 
2 : 1, rdev, + prid, resblks > 0, &ip, &committed); + if (error) { + if (error == ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + /* + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = B_FALSE; + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks ? + resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + if (error) { + ASSERT(error != ENOSPC); + goto out_trans_abort; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (is_dir) { + error = xfs_dir_init(tp, ip, dp); + if (error) + goto out_bmap_cancel; + + error = xfs_bumplink(tp, dp); + if (error) + goto out_bmap_cancel; + } + + /* + * If this is a synchronous mount, make sure that the + * create transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + *ipp = ip; + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + if (ip) + IRELE(ip); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +#ifdef DEBUG +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; +#endif + +/* + * Bump the subclass so xfs_lock_inodes() acquires each lock with + * a different value + */ +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) +{ + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; + + return lock_mode; +} + +/* + * The following routine will lock n inodes in exclusive mode. + * We assume the caller calls us with the inodes in i_ino order. + * + * We need to detect deadlock where an inode that we lock + * is in the AIL and we start waiting for another inode that is locked + * by a thread in a long running transaction (such as truncate). This can + * result in deadlock since the long running trans might need to wait + * for the inode we just locked in order to push the tail and free space + * in the log. 
+ */ +void +xfs_lock_inodes( + xfs_inode_t **ips, + int inodes, + uint lock_mode) +{ + int attempts = 0, i, j, try_lock; + xfs_log_item_t *lp; + + ASSERT(ips && (inodes >= 2)); /* we need at least two */ + + try_lock = 0; + i = 0; + +again: + for (; i < inodes; i++) { + ASSERT(ips[i]); + + if (i && (ips[i] == ips[i-1])) /* Already locked */ + continue; + + /* + * If try_lock is not set yet, make sure all locked inodes + * are not in the AIL. + * If any are, set try_lock to be used later. + */ + + if (!try_lock) { + for (j = (i - 1); j >= 0 && !try_lock; j--) { + lp = (xfs_log_item_t *)ips[j]->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + try_lock++; + } + } + } + + /* + * If any of the previous locks we have locked is in the AIL, + * we must TRY to get the second and subsequent locks. If + * we can't get any, we must release all we have + * and try again. + */ + + if (try_lock) { + /* try_lock must be 0 if i is 0. */ + /* + * try_lock means we have an inode locked + * that is in the AIL. + */ + ASSERT(i != 0); + if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { + attempts++; + + /* + * Unlock all previous guys and try again. + * xfs_iunlock will try to push the tail + * if the inode is in the AIL. + */ + + for(j = i - 1; j >= 0; j--) { + + /* + * Check to see if we've already + * unlocked this one. + * Not the first one going back, + * and the inode ptr is the same. + */ + if ((j != (i - 1)) && ips[j] == + ips[j+1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } + + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif + } + i = 0; + try_lock = 0; + goto again; + } + } else { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + } + } + +#ifdef DEBUG + if (attempts) { + if (attempts < 5) xfs_small_retries++; + else if (attempts < 100) xfs_middle_retries++; + else xfs_lots_retries++; + } else { + xfs_locked_n++; + } +#endif +} + +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. + */ +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) +{ + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; + + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + ASSERT(ip0->i_ino != ip1->i_ino); + + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; + } + + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + + /* + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. 
+ */ + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + } +} + +int +xfs_remove( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t *ip) +{ + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp = NULL; + int is_dir = S_ISDIR(ip->i_d.di_mode); + int error = 0; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int link_zero; + uint resblks; + uint log_count; + + trace_xfs_remove(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(dp, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(ip, 0); + if (error) + goto std_return; + + if (is_dir) { + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + log_count = XFS_DEFAULT_LOG_COUNT; + } else { + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + log_count = XFS_REMOVE_LOG_COUNT; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. + */ + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, log_count); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, log_count); + } + if (error) { + ASSERT(error != ENOSPC); + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + if (!xfs_dir_isempty(ip)) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + } + + xfs_bmap_init(&free_list, &first_block); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) { + ASSERT(error != ENOENT); + goto out_bmap_cancel; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (is_dir) { + /* + * Drop the link from ip's "..". + */ + error = xfs_droplink(tp, dp); + if (error) + goto out_bmap_cancel; + + /* + * Drop the "." link from ip to self. + */ + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + + /* + * Drop the link from dp to ip. + */ + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; + + /* + * Determine if this is the last link while + * we are in the transaction. + */ + link_zero = (ip->i_d.di_nlink == 0); + + /* + * If this is a synchronous mount, make sure that the + * remove transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto std_return; + + /* + * If we are using filestreams, kill the stream association. + * If the file is still open it may get a new one but that + * will get killed on last close in xfs_close() so we don't + * have to worry about that. + */ + if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +int +xfs_link( + xfs_inode_t *tdp, + xfs_inode_t *sip, + struct xfs_name *target_name) +{ + xfs_mount_t *mp = tdp->i_mount; + xfs_trans_t *tp; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int resblks; + + trace_xfs_link(tdp, target_name); + + ASSERT(!S_ISDIR(sip->i_d.di_mode)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(sip, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(tdp, 0); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + + xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); + + /* + * If the source has too many links, we can't make any more to it. + */ + if (sip->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto error_return; + } + + /* + * If we are using project inheritance, we only allow hard link + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; + } + + error = xfs_dir_canenter(tp, tdp, target_name, resblks); + if (error) + goto error_return; + + xfs_bmap_init(&free_list, &first_block); + + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto abort_return; + xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); + + error = xfs_bumplink(tp, sip); + if (error) + goto abort_return; + + /* + * If this is a synchronous mount, make sure that the + * link transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto abort_return; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +int +xfs_symlink( + xfs_inode_t *dp, + struct xfs_name *link_name, + const char *target_path, + mode_t mode, + xfs_inode_t **ipp) +{ + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp; + xfs_inode_t *ip; + int error; + int pathlen; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + boolean_t unlock_dp_on_error = B_FALSE; + uint cancel_flags; + int committed; + xfs_fileoff_t first_fsb; + xfs_filblks_t fs_blocks; + int nmaps; + xfs_bmbt_irec_t mval[SYMLINK_MAPS]; + xfs_daddr_t d; + const char *cur_chunk; + int byte_cnt; + int n; + xfs_buf_t *bp; + prid_t prid; + struct xfs_dquot *udqp, *gdqp; + uint resblks; + + *ipp = NULL; + error = 0; + ip = NULL; + tp = NULL; + + trace_xfs_symlink(dp, link_name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Check component lengths of the target path name. + */ + pathlen = strlen(target_path); + if (pathlen >= MAXPATHLEN) /* total string too long */ + return XFS_ERROR(ENAMETOOLONG); + + udqp = gdqp = NULL; + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = xfs_get_projid(dp); + else + prid = XFS_PROJID_DEFAULT; + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * The symlink will fit into the inode data fork? + * There can't be any attributes so we get the whole variable part. + */ + if (pathlen <= XFS_LITINO(mp)) + fs_blocks = 0; + else + fs_blocks = XFS_B_TO_FSB(mp, pathlen); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + if (error == ENOSPC && fs_blocks == 0) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = B_TRUE; + + /* + * Check whether the directory allows new symlinks or not. + */ + if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + error = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Reserve disk quota : blocks and inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); + if (error) + goto error_return; + + /* + * Check for ability to enter directory entry, if no space reserved. + */ + error = xfs_dir_canenter(tp, dp, link_name, resblks); + if (error) + goto error_return; + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + xfs_bmap_init(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. 
+ */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto error_return; + goto error1; + } + + /* + * An error after we've joined dp to the transaction will result in the + * transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = B_FALSE; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format. + */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + first_fsb = 0; + nmaps = SYMLINK_MAPS; + + error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, + &first_block, resblks, mval, &nmaps, + &free_list); + if (error) + goto error2; + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + ASSERT(bp && !XFS_BUF_GETERROR(bp)); + if (pathlen < byte_cnt) { + byte_cnt = pathlen; + } + pathlen -= byte_cnt; + + memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt); + cur_chunk += byte_cnt; + + xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); + } + } + + /* + * Create the directory entry for the symlink. + */ + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto error2; + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error2; + } + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + *ipp = ip; + return 0; + + error2: + IRELE(ip); + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + std_return: + return error; +} + +int +xfs_set_dmattrs( + xfs_inode_t *ip, + u_int evmask, + u_int16_t state) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + + ip->i_d.di_dmevmask = evmask; + ip->i_d.di_dmstate = state; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp, 0); + + return error; +} + +/* + * xfs_alloc_file_space() + * This routine allocates disk space for the given file. + * + * If alloc_type == 0, this request is for an ALLOCSP type + * request which will change the file size. In this case, no + * DMAPI event will be generated by the call. A TRUNCATE event + * will be generated later by xfs_setattr. + * + * If alloc_type != 0, this request is for a RESVSP type + * request, and a DMAPI DM_EVENT_WRITE will be generated if the + * lower block boundary byte address is less than the file's + * length. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +STATIC int +xfs_alloc_file_space( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t len, + int alloc_type, + int attr_flags) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_off_t count; + xfs_filblks_t allocated_fsb; + xfs_filblks_t allocatesize_fsb; + xfs_extlen_t extsz, temp; + xfs_fileoff_t startoffset_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + int bmapi_flag; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmbt_irec_t imaps[1], *imapp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + trace_xfs_alloc_file_space(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + if (len <= 0) + return XFS_ERROR(EINVAL); + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + count = len; + imapp = &imaps[0]; + nimaps = 1; + bmapi_flag = XFS_BMAPI_WRITE | alloc_type; + startoffset_fsb = XFS_B_TO_FSBT(mp, offset); + allocatesize_fsb = XFS_B_TO_FSB(mp, count); + + /* + * Allocate file space until done or until there is an error + */ + while (allocatesize_fsb && !error) { + xfs_fileoff_t s, e; + + /* + * Determine space reservations for data/realtime. 
+ */ + if (unlikely(extsz)) { + s = startoffset_fsb; + do_div(s, extsz); + s *= extsz; + e = startoffset_fsb + allocatesize_fsb; + if ((temp = do_mod(startoffset_fsb, extsz))) + e += temp; + if ((temp = do_mod(e, extsz))) + e += extsz - temp; + } else { + s = 0; + e = allocatesize_fsb; + } + + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + if (unlikely(rt)) { + resrtextents = qblocks = resblks; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), resrtextents, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + /* + * Check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, + 0, quota_flag); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip); + + /* + * Issue the xfs_bmapi() call to allocate the blocks + */ + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bmapi(tp, ip, startoffset_fsb, + allocatesize_fsb, bmapi_flag, + &firstfsb, 0, imapp, &nimaps, + &free_list); + if (error) { + goto error0; + } + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) { + break; + } + + allocated_fsb = imapp->br_blockcount; + + if (nimaps == 0) { + error = XFS_ERROR(ENOSPC); + break; + } + + startoffset_fsb += allocated_fsb; + allocatesize_fsb -= allocated_fsb; + } + + return error; + +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Zero file bytes between startoff and endoff inclusive. + * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. + */ +STATIC int +xfs_zero_remaining_bytes( + xfs_inode_t *ip, + xfs_off_t startoff, + xfs_off_t endoff) +{ + xfs_bmbt_irec_t imap; + xfs_fileoff_t offset_fsb; + xfs_off_t lastoffset; + xfs_off_t offset; + xfs_buf_t *bp; + xfs_mount_t *mp = ip->i_mount; + int nimap; + int error = 0; + + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. 
The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= ip->i_size) + return 0; + + if (endoff > ip->i_size) + endoff = ip->i_size; + + bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp, + mp->m_sb.sb_blocksize, XBF_DONT_BLOCK); + if (!bp) + return XFS_ERROR(ENOMEM); + + for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimap = 1; + error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, + NULL, 0, &imap, &nimap, NULL); + if (error || nimap < 1) + break; + ASSERT(imap.br_blockcount >= 1); + ASSERT(imap.br_startoff == offset_fsb); + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; + if (lastoffset > endoff) + lastoffset = endoff; + if (imap.br_startblock == HOLESTARTBLOCK) + continue; + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + if (imap.br_state == XFS_EXT_UNWRITTEN) + continue; + XFS_BUF_UNDONE(bp); + XFS_BUF_UNWRITE(bp); + XFS_BUF_READ(bp); + XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); + xfsbdstrat(mp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", + mp, bp, XFS_BUF_ADDR(bp)); + break; + } + memset(XFS_BUF_PTR(bp) + + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); + XFS_BUF_UNDONE(bp); + XFS_BUF_UNREAD(bp); + XFS_BUF_WRITE(bp); + xfsbdstrat(mp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", + mp, bp, XFS_BUF_ADDR(bp)); + break; + } + } + xfs_buf_free(bp); + return error; +} + +/* + * xfs_free_file_space() + * This routine frees disk space for the given file. + * + * This routine is only called by xfs_change_file_space + * for an UNRESVSP type call. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +STATIC int +xfs_free_file_space( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t len, + int attr_flags) +{ + int committed; + int done; + xfs_fileoff_t endoffset_fsb; + int error; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + xfs_bmbt_irec_t imap; + xfs_off_t ioffset; + xfs_extlen_t mod=0; + xfs_mount_t *mp; + int nimap; + uint resblks; + uint rounding; + int rt; + xfs_fileoff_t startoffset_fsb; + xfs_trans_t *tp; + int need_iolock = 1; + + mp = ip->i_mount; + + trace_xfs_free_file_space(ip); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + error = 0; + if (len <= 0) /* if nothing being freed */ + return error; + rt = XFS_IS_REALTIME_INODE(ip); + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + + if (attr_flags & XFS_ATTR_NOLOCK) + need_iolock = 0; + if (need_iolock) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + /* wait for the completion of any pending DIOs */ + xfs_ioend_wait(ip); + } + + rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + ioffset = offset & ~(rounding - 1); + + if (VN_CACHED(VFS_I(ip)) != 0) { + error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); + if (error) + goto out_unlock_iolock; + } + + /* + * Need to zero the stuff we're not freeing, on disk. + * If it's a realtime file & can't use unwritten extents then we + * actually need to zero the extent edges. Otherwise xfs_bunmapi + * will take care of it for us. 
+ */ + if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + nimap = 1; + error = xfs_bmapi(NULL, ip, startoffset_fsb, + 1, 0, NULL, 0, &imap, &nimap, NULL); + if (error) + goto out_unlock_iolock; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } + nimap = 1; + error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, + 1, 0, NULL, 0, &imap, &nimap, NULL); + if (error) + goto out_unlock_iolock; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && (mod != mp->m_sb.sb_rextsize)) + endoffset_fsb -= mod; + } + } + if ((done = (endoffset_fsb <= startoffset_fsb))) + /* + * One contiguous piece to clear + */ + error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); + else { + /* + * Some full blocks, possibly two pieces to clear + */ + if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) + error = xfs_zero_remaining_bytes(ip, offset, + XFS_FSB_TO_B(mp, startoffset_fsb) - 1); + if (!error && + XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) + error = xfs_zero_remaining_bytes(ip, + XFS_FSB_TO_B(mp, endoffset_fsb), + offset + len - 1); + } + + /* + * free file space until done or until there is an error + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + while (!error && !done) { + + /* + * allocate and setup the transaction. Allow this + * transaction to dip into the reserve blocks to ensure + * the freeing of the space succeeds at ENOSPC. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, + resblks, + XFS_WRITE_LOG_RES(mp), + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + + /* + * check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, + ip->i_udquot, ip->i_gdquot, + resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip); + + /* + * issue the bunmapi() call to free the blocks + */ + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, + 0, 2, &firstfsb, &free_list, &done); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + out_unlock_iolock: + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + + error0: + xfs_bmap_cancel(&free_list); + error1: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) : + XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_change_file_space() + * This routine allocates or frees disk space for the given file. + * The user specified parameters are checked for alignment and size + * limitations. 
+ * + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_change_file_space( + xfs_inode_t *ip, + int cmd, + xfs_flock64_t *bf, + xfs_off_t offset, + int attr_flags) +{ + xfs_mount_t *mp = ip->i_mount; + int clrprealloc; + int error; + xfs_fsize_t fsize; + int setprealloc; + xfs_off_t startoffset; + xfs_off_t llen; + xfs_trans_t *tp; + struct iattr iattr; + int prealloc_type; + + if (!S_ISREG(ip->i_d.di_mode)) + return XFS_ERROR(EINVAL); + + switch (bf->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + bf->l_start += offset; + break; + case 2: /*SEEK_END*/ + bf->l_start += ip->i_size; + break; + default: + return XFS_ERROR(EINVAL); + } + + llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; + + if ( (bf->l_start < 0) + || (bf->l_start > XFS_MAXIOFFSET(mp)) + || (bf->l_start + llen < 0) + || (bf->l_start + llen > XFS_MAXIOFFSET(mp))) + return XFS_ERROR(EINVAL); + + bf->l_whence = 0; + + startoffset = bf->l_start; + fsize = ip->i_size; + + /* + * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve + * file space. + * These calls do NOT zero the data space allocated to the file, + * nor do they change the file size. + * + * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file + * space. + * These calls cause the new file data to be zeroed and the file + * size to be changed. + */ + setprealloc = clrprealloc = 0; + prealloc_type = XFS_BMAPI_PREALLOC; + + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + prealloc_type |= XFS_BMAPI_CONVERT; + xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); + /* FALLTHRU */ + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + error = xfs_alloc_file_space(ip, startoffset, bf->l_len, + prealloc_type, attr_flags); + if (error) + return error; + setprealloc = 1; + break; + + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if ((error = xfs_free_file_space(ip, startoffset, bf->l_len, + attr_flags))) + return error; + break; + + case XFS_IOC_ALLOCSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP: + case XFS_IOC_FREESP64: + if (startoffset > fsize) { + error = xfs_alloc_file_space(ip, fsize, + startoffset - fsize, 0, attr_flags); + if (error) + break; + } + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = startoffset; + + error = xfs_setattr(ip, &iattr, attr_flags); + + if (error) + return error; + + clrprealloc = 1; + break; + + default: + ASSERT(0); + return XFS_ERROR(EINVAL); + } + + /* + * update the inode timestamp, mode, and prealloc flag bits + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); + + if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp), + 0, 0, 0))) { + /* ASSERT(0); */ + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip); + + if ((attr_flags & XFS_ATTR_DMI) == 0) { + ip->i_d.di_mode &= ~S_ISUID; + + /* + * Note that we don't have to worry about mandatory + * file locking being disabled here because we only + * clear the S_ISGID bit if the Group execute bit is + * on, but if it was on then mandatory locking wouldn't + * have been enabled. 
+ */ + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } + if (setprealloc) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + else if (clrprealloc) + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + if (attr_flags & XFS_ATTR_SYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h new file mode 100644 index 00000000..3bcd2335 --- /dev/null +++ b/fs/xfs/xfs_vnodeops.h @@ -0,0 +1,63 @@ +#ifndef _XFS_VNODEOPS_H +#define _XFS_VNODEOPS_H 1 + +struct attrlist_cursor_kern; +struct file; +struct iattr; +struct inode; +struct iovec; +struct kiocb; +struct pipe_inode_info; +struct uio; +struct xfs_inode; +struct xfs_iomap; + + +int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags); +#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ +#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ +#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ +#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ +#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */ + +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_release(struct xfs_inode *ip); +int xfs_inactive(struct xfs_inode *ip); +int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name); +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, + xfs_dev_t rdev, struct xfs_inode **ipp); +int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *ip); +int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, + struct xfs_name *target_name); +int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, + xfs_off_t *offset, filldir_t filldir); +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, mode_t mode, struct xfs_inode **ipp); +int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); +int xfs_change_file_space(struct xfs_inode *ip, int cmd, + xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); +int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, + struct xfs_inode *src_ip, struct xfs_inode *target_dp, + struct xfs_name *target_name, struct xfs_inode *target_ip); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, + int flags, struct attrlist_cursor_kern *cursor); +int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int flags, struct xfs_iomap *iomapp, int *niomaps); +void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, + xfs_off_t last, int fiopt); +int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, + xfs_off_t last, int fiopt); +int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, + xfs_off_t last, uint64_t flags, int fiopt); +int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); + +int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); + +#endif /* _XFS_VNODEOPS_H */ diff --git a/fs/yaffs2/Kconfig b/fs/yaffs2/Kconfig new file mode 100644 index 
00000000..63541405 --- /dev/null +++ b/fs/yaffs2/Kconfig @@ -0,0 +1,161 @@ +# +# YAFFS file system configurations +# + +config YAFFS_FS + tristate "YAFFS2 file system support" + default n + depends on MTD_BLOCK + select YAFFS_YAFFS1 + select YAFFS_YAFFS2 + help + YAFFS2, or Yet Another Flash Filing System, is a filing system + optimised for NAND Flash chips. + + To compile the YAFFS2 file system support as a module, choose M + here: the module will be called yaffs2. + + If unsure, say N. + + Further information on YAFFS2 is available at + . + +config YAFFS_YAFFS1 + bool "512 byte / page devices" + depends on YAFFS_FS + default y + help + Enable YAFFS1 support -- yaffs for 512 byte / page devices + + Not needed for 2K-page devices. + + If unsure, say Y. + +config YAFFS_9BYTE_TAGS + bool "Use older-style on-NAND data format with pageStatus byte" + depends on YAFFS_YAFFS1 + default n + help + + Older-style on-NAND data format has a "pageStatus" byte to record + chunk/page state. This byte is zero when the page is discarded. + Choose this option if you have existing on-NAND data using this + format that you need to continue to support. New data written + also uses the older-style format. Note: Use of this option + generally requires that MTD's oob layout be adjusted to use the + older-style format. See notes on tags formats and MTD versions + in yaffs_mtdif1.c. + + If unsure, say N. + +config YAFFS_DOES_ECC + bool "Lets Yaffs do its own ECC" + depends on YAFFS_FS && YAFFS_YAFFS1 && !YAFFS_9BYTE_TAGS + default n + help + This enables Yaffs to use its own ECC functions instead of using + the ones from the generic MTD-NAND driver. + + If unsure, say N. + +config YAFFS_ECC_WRONG_ORDER + bool "Use the same ecc byte order as Steven Hill's nand_ecc.c" + depends on YAFFS_FS && YAFFS_DOES_ECC && !YAFFS_9BYTE_TAGS + default n + help + This makes yaffs_ecc.c use the same ecc byte order as Steven + Hill's nand_ecc.c. If not set, then you get the same ecc byte + order as SmartMedia. + + If unsure, say N. + +config YAFFS_YAFFS2 + bool "2048 byte (or larger) / page devices" + depends on YAFFS_FS + default y + help + Enable YAFFS2 support -- yaffs for >= 2K bytes per page devices + + If unsure, say Y. + +config YAFFS_AUTO_YAFFS2 + bool "Autoselect yaffs2 format" + depends on YAFFS_YAFFS2 + default y + help + Without this, you need to explicitely use yaffs2 as the file + system type. With this, you can say "yaffs" and yaffs or yaffs2 + will be used depending on the device page size (yaffs on + 512-byte page devices, yaffs2 on 2K page devices). + + If unsure, say Y. + +config YAFFS_DISABLE_TAGS_ECC + bool "Disable YAFFS from doing ECC on tags by default" + depends on YAFFS_FS && YAFFS_YAFFS2 + default n + help + This defaults Yaffs to using its own ECC calculations on tags instead of + just relying on the MTD. + This behavior can also be overridden with tags_ecc_on and + tags_ecc_off mount options. + + If unsure, say N. + +config YAFFS_ALWAYS_CHECK_CHUNK_ERASED + bool "Force chunk erase check" + depends on YAFFS_FS + default n + help + Normally YAFFS only checks chunks before writing until an erased + chunk is found. This helps to detect any partially written + chunks that might have happened due to power loss. + + Enabling this forces on the test that chunks are erased in flash + before writing to them. This takes more time but is potentially + a bit more secure. + + Suggest setting Y during development and ironing out driver + issues etc. Suggest setting to N if you want faster writing. + + If unsure, say Y. 
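As an illustration only (not part of the patch), a typical selection of the options above for a board with 2 KiB-or-larger page NAND, assuming CONFIG_MTD_BLOCK is already enabled, might appear in a kernel .config roughly as follows; the exact values are assumptions based on the defaults stated in the help texts:

CONFIG_MTD_BLOCK=y
CONFIG_YAFFS_FS=y
CONFIG_YAFFS_YAFFS1=y
# CONFIG_YAFFS_9BYTE_TAGS is not set
# CONFIG_YAFFS_DOES_ECC is not set
CONFIG_YAFFS_YAFFS2=y
CONFIG_YAFFS_AUTO_YAFFS2=y
# CONFIG_YAFFS_DISABLE_TAGS_ECC is not set
# CONFIG_YAFFS_ALWAYS_CHECK_CHUNK_ERASED is not set

With CONFIG_YAFFS_AUTO_YAFFS2 enabled as above, mounting with the generic "yaffs" filesystem type selects yaffs or yaffs2 according to the device page size, as described in that option's help text.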
+ +config YAFFS_EMPTY_LOST_AND_FOUND + bool "Empty lost and found on boot" + depends on YAFFS_FS + default n + help + If this is enabled then the contents of lost and found is + automatically dumped at mount. + + If unsure, say N. + +config YAFFS_DISABLE_BLOCK_REFRESHING + bool "Disable yaffs2 block refreshing" + depends on YAFFS_FS + default n + help + If this is set, then block refreshing is disabled. + Block refreshing infrequently refreshes the oldest block in + a yaffs2 file system. This mechanism helps to refresh flash to + mitigate against data loss. This is particularly useful for MLC. + + If unsure, say N. + +config YAFFS_DISABLE_BACKGROUND + bool "Disable yaffs2 background processing" + depends on YAFFS_FS + default n + help + If this is set, then background processing is disabled. + Background processing makes many foreground activities faster. + + If unsure, say N. + +config YAFFS_XATTR + bool "Enable yaffs2 xattr support" + depends on YAFFS_FS + default y + help + If this is set then yaffs2 will provide xattr support. + If unsure, say Y. diff --git a/fs/yaffs2/Makefile b/fs/yaffs2/Makefile new file mode 100644 index 00000000..e63a28aa --- /dev/null +++ b/fs/yaffs2/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the linux YAFFS filesystem routines. +# + +obj-$(CONFIG_YAFFS_FS) += yaffs.o + +yaffs-y := yaffs_ecc.o yaffs_vfs.o yaffs_guts.o yaffs_checkptrw.o +yaffs-y += yaffs_packedtags1.o yaffs_packedtags2.o yaffs_nand.o +yaffs-y += yaffs_tagscompat.o yaffs_tagsvalidity.o +yaffs-y += yaffs_mtdif.o yaffs_mtdif1.o yaffs_mtdif2.o +yaffs-y += yaffs_nameval.o yaffs_attribs.o +yaffs-y += yaffs_allocator.o +yaffs-y += yaffs_yaffs1.o +yaffs-y += yaffs_yaffs2.o +yaffs-y += yaffs_bitmap.o +yaffs-y += yaffs_verify.o + diff --git a/fs/yaffs2/yaffs_allocator.c b/fs/yaffs2/yaffs_allocator.c new file mode 100644 index 00000000..f9cd5bec --- /dev/null +++ b/fs/yaffs2/yaffs_allocator.c @@ -0,0 +1,396 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_allocator.h" +#include "yaffs_guts.h" +#include "yaffs_trace.h" +#include "yportenv.h" + +#ifdef CONFIG_YAFFS_KMALLOC_ALLOCATOR + +void yaffs_deinit_raw_tnodes_and_objs(struct yaffs_dev *dev) +{ + dev = dev; +} + +void yaffs_init_raw_tnodes_and_objs(struct yaffs_dev *dev) +{ + dev = dev; +} + +struct yaffs_tnode *yaffs_alloc_raw_tnode(struct yaffs_dev *dev) +{ + return (struct yaffs_tnode *)kmalloc(dev->tnode_size, GFP_NOFS); +} + +void yaffs_free_raw_tnode(struct yaffs_dev *dev, struct yaffs_tnode *tn) +{ + dev = dev; + kfree(tn); +} + +void yaffs_init_raw_objs(struct yaffs_dev *dev) +{ + dev = dev; +} + +void yaffs_deinit_raw_objs(struct yaffs_dev *dev) +{ + dev = dev; +} + +struct yaffs_obj *yaffs_alloc_raw_obj(struct yaffs_dev *dev) +{ + dev = dev; + return (struct yaffs_obj *)kmalloc(sizeof(struct yaffs_obj)); +} + +void yaffs_free_raw_obj(struct yaffs_dev *dev, struct yaffs_obj *obj) +{ + + dev = dev; + kfree(obj); +} + +#else + +struct yaffs_tnode_list { + struct yaffs_tnode_list *next; + struct yaffs_tnode *tnodes; +}; + +struct yaffs_obj_list { + struct yaffs_obj_list *next; + struct yaffs_obj *objects; +}; + +struct yaffs_allocator { + int n_tnodes_created; + struct yaffs_tnode *free_tnodes; + int n_free_tnodes; + struct yaffs_tnode_list *alloc_tnode_list; + + int n_obj_created; + struct yaffs_obj *free_objs; + int n_free_objects; + + struct yaffs_obj_list *allocated_obj_list; +}; + +static void yaffs_deinit_raw_tnodes(struct yaffs_dev *dev) +{ + + struct yaffs_allocator *allocator = + (struct yaffs_allocator *)dev->allocator; + + struct yaffs_tnode_list *tmp; + + if (!allocator) { + YBUG(); + return; + } + + while (allocator->alloc_tnode_list) { + tmp = allocator->alloc_tnode_list->next; + + kfree(allocator->alloc_tnode_list->tnodes); + kfree(allocator->alloc_tnode_list); + allocator->alloc_tnode_list = tmp; + + } + + allocator->free_tnodes = NULL; + allocator->n_free_tnodes = 0; + allocator->n_tnodes_created = 0; +} + +static void yaffs_init_raw_tnodes(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator = dev->allocator; + + if (allocator) { + allocator->alloc_tnode_list = NULL; + allocator->free_tnodes = NULL; + allocator->n_free_tnodes = 0; + allocator->n_tnodes_created = 0; + } else { + YBUG(); + } +} + +static int yaffs_create_tnodes(struct yaffs_dev *dev, int n_tnodes) +{ + struct yaffs_allocator *allocator = + (struct yaffs_allocator *)dev->allocator; + int i; + struct yaffs_tnode *new_tnodes; + u8 *mem; + struct yaffs_tnode *curr; + struct yaffs_tnode *next; + struct yaffs_tnode_list *tnl; + + if (!allocator) { + YBUG(); + return YAFFS_FAIL; + } + + if (n_tnodes < 1) + return YAFFS_OK; + + /* make these things */ + + new_tnodes = kmalloc(n_tnodes * dev->tnode_size, GFP_NOFS); + mem = (u8 *) new_tnodes; + + if (!new_tnodes) { + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs: Could not allocate Tnodes"); + return YAFFS_FAIL; + } + + /* New hookup for wide tnodes */ + for (i = 0; i < n_tnodes - 1; i++) { + curr = (struct yaffs_tnode *)&mem[i * dev->tnode_size]; + next = (struct yaffs_tnode *)&mem[(i + 1) * dev->tnode_size]; + curr->internal[0] = next; + } + + curr = (struct yaffs_tnode *)&mem[(n_tnodes - 1) * dev->tnode_size]; + curr->internal[0] = allocator->free_tnodes; + allocator->free_tnodes = (struct yaffs_tnode *)mem; + + allocator->n_free_tnodes += n_tnodes; + allocator->n_tnodes_created += n_tnodes; + + /* Now add this bunch of tnodes to a list for freeing up. 
+ * NB If we can't add this to the management list it isn't fatal + * but it just means we can't free this bunch of tnodes later. + */ + + tnl = kmalloc(sizeof(struct yaffs_tnode_list), GFP_NOFS); + if (!tnl) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Could not add tnodes to management list"); + return YAFFS_FAIL; + } else { + tnl->tnodes = new_tnodes; + tnl->next = allocator->alloc_tnode_list; + allocator->alloc_tnode_list = tnl; + } + + yaffs_trace(YAFFS_TRACE_ALLOCATE,"Tnodes added"); + + return YAFFS_OK; +} + +struct yaffs_tnode *yaffs_alloc_raw_tnode(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator = + (struct yaffs_allocator *)dev->allocator; + struct yaffs_tnode *tn = NULL; + + if (!allocator) { + YBUG(); + return NULL; + } + + /* If there are none left make more */ + if (!allocator->free_tnodes) + yaffs_create_tnodes(dev, YAFFS_ALLOCATION_NTNODES); + + if (allocator->free_tnodes) { + tn = allocator->free_tnodes; + allocator->free_tnodes = allocator->free_tnodes->internal[0]; + allocator->n_free_tnodes--; + } + + return tn; +} + +/* FreeTnode frees up a tnode and puts it back on the free list */ +void yaffs_free_raw_tnode(struct yaffs_dev *dev, struct yaffs_tnode *tn) +{ + struct yaffs_allocator *allocator = dev->allocator; + + if (!allocator) { + YBUG(); + return; + } + + if (tn) { + tn->internal[0] = allocator->free_tnodes; + allocator->free_tnodes = tn; + allocator->n_free_tnodes++; + } + dev->checkpoint_blocks_required = 0; /* force recalculation */ +} + +static void yaffs_init_raw_objs(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator = dev->allocator; + + if (allocator) { + allocator->allocated_obj_list = NULL; + allocator->free_objs = NULL; + allocator->n_free_objects = 0; + } else { + YBUG(); + } +} + +static void yaffs_deinit_raw_objs(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator = dev->allocator; + struct yaffs_obj_list *tmp; + + if (!allocator) { + YBUG(); + return; + } + + while (allocator->allocated_obj_list) { + tmp = allocator->allocated_obj_list->next; + kfree(allocator->allocated_obj_list->objects); + kfree(allocator->allocated_obj_list); + + allocator->allocated_obj_list = tmp; + } + + allocator->free_objs = NULL; + allocator->n_free_objects = 0; + allocator->n_obj_created = 0; +} + +static int yaffs_create_free_objs(struct yaffs_dev *dev, int n_obj) +{ + struct yaffs_allocator *allocator = dev->allocator; + + int i; + struct yaffs_obj *new_objs; + struct yaffs_obj_list *list; + + if (!allocator) { + YBUG(); + return YAFFS_FAIL; + } + + if (n_obj < 1) + return YAFFS_OK; + + /* make these things */ + new_objs = kmalloc(n_obj * sizeof(struct yaffs_obj), GFP_NOFS); + list = kmalloc(sizeof(struct yaffs_obj_list), GFP_NOFS); + + if (!new_objs || !list) { + if (new_objs) { + kfree(new_objs); + new_objs = NULL; + } + if (list) { + kfree(list); + list = NULL; + } + yaffs_trace(YAFFS_TRACE_ALLOCATE, + "Could not allocate more objects"); + return YAFFS_FAIL; + } + + /* Hook them into the free list */ + for (i = 0; i < n_obj - 1; i++) { + new_objs[i].siblings.next = + (struct list_head *)(&new_objs[i + 1]); + } + + new_objs[n_obj - 1].siblings.next = (void *)allocator->free_objs; + allocator->free_objs = new_objs; + allocator->n_free_objects += n_obj; + allocator->n_obj_created += n_obj; + + /* Now add this bunch of Objects to a list for freeing up. 
*/ + + list->objects = new_objs; + list->next = allocator->allocated_obj_list; + allocator->allocated_obj_list = list; + + return YAFFS_OK; +} + +struct yaffs_obj *yaffs_alloc_raw_obj(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj = NULL; + struct yaffs_allocator *allocator = dev->allocator; + + if (!allocator) { + YBUG(); + return obj; + } + + /* If there are none left make more */ + if (!allocator->free_objs) + yaffs_create_free_objs(dev, YAFFS_ALLOCATION_NOBJECTS); + + if (allocator->free_objs) { + obj = allocator->free_objs; + allocator->free_objs = + (struct yaffs_obj *)(allocator->free_objs->siblings.next); + allocator->n_free_objects--; + } + + return obj; +} + +void yaffs_free_raw_obj(struct yaffs_dev *dev, struct yaffs_obj *obj) +{ + + struct yaffs_allocator *allocator = dev->allocator; + + if (!allocator) + YBUG(); + else { + /* Link into the free list. */ + obj->siblings.next = (struct list_head *)(allocator->free_objs); + allocator->free_objs = obj; + allocator->n_free_objects++; + } +} + +void yaffs_deinit_raw_tnodes_and_objs(struct yaffs_dev *dev) +{ + if (dev->allocator) { + yaffs_deinit_raw_tnodes(dev); + yaffs_deinit_raw_objs(dev); + + kfree(dev->allocator); + dev->allocator = NULL; + } else { + YBUG(); + } +} + +void yaffs_init_raw_tnodes_and_objs(struct yaffs_dev *dev) +{ + struct yaffs_allocator *allocator; + + if (!dev->allocator) { + allocator = kmalloc(sizeof(struct yaffs_allocator), GFP_NOFS); + if (allocator) { + dev->allocator = allocator; + yaffs_init_raw_tnodes(dev); + yaffs_init_raw_objs(dev); + } + } else { + YBUG(); + } +} + +#endif diff --git a/fs/yaffs2/yaffs_allocator.h b/fs/yaffs2/yaffs_allocator.h new file mode 100644 index 00000000..4d5f2aec --- /dev/null +++ b/fs/yaffs2/yaffs_allocator.h @@ -0,0 +1,30 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_ALLOCATOR_H__ +#define __YAFFS_ALLOCATOR_H__ + +#include "yaffs_guts.h" + +void yaffs_init_raw_tnodes_and_objs(struct yaffs_dev *dev); +void yaffs_deinit_raw_tnodes_and_objs(struct yaffs_dev *dev); + +struct yaffs_tnode *yaffs_alloc_raw_tnode(struct yaffs_dev *dev); +void yaffs_free_raw_tnode(struct yaffs_dev *dev, struct yaffs_tnode *tn); + +struct yaffs_obj *yaffs_alloc_raw_obj(struct yaffs_dev *dev); +void yaffs_free_raw_obj(struct yaffs_dev *dev, struct yaffs_obj *obj); + +#endif diff --git a/fs/yaffs2/yaffs_attribs.c b/fs/yaffs2/yaffs_attribs.c new file mode 100644 index 00000000..9b47d376 --- /dev/null +++ b/fs/yaffs2/yaffs_attribs.c @@ -0,0 +1,124 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_guts.h" +#include "yaffs_attribs.h" + +void yaffs_load_attribs(struct yaffs_obj *obj, struct yaffs_obj_hdr *oh) +{ + obj->yst_uid = oh->yst_uid; + obj->yst_gid = oh->yst_gid; + obj->yst_atime = oh->yst_atime; + obj->yst_mtime = oh->yst_mtime; + obj->yst_ctime = oh->yst_ctime; + obj->yst_rdev = oh->yst_rdev; +} + +void yaffs_load_attribs_oh(struct yaffs_obj_hdr *oh, struct yaffs_obj *obj) +{ + oh->yst_uid = obj->yst_uid; + oh->yst_gid = obj->yst_gid; + oh->yst_atime = obj->yst_atime; + oh->yst_mtime = obj->yst_mtime; + oh->yst_ctime = obj->yst_ctime; + oh->yst_rdev = obj->yst_rdev; + +} + +void yaffs_load_current_time(struct yaffs_obj *obj, int do_a, int do_c) +{ + obj->yst_mtime = Y_CURRENT_TIME; + if (do_a) + obj->yst_atime = obj->yst_mtime; + if (do_c) + obj->yst_ctime = obj->yst_mtime; +} + +void yaffs_attribs_init(struct yaffs_obj *obj, u32 gid, u32 uid, u32 rdev) +{ + yaffs_load_current_time(obj, 1, 1); + obj->yst_rdev = rdev; + obj->yst_uid = uid; + obj->yst_gid = gid; +} + +loff_t yaffs_get_file_size(struct yaffs_obj *obj) +{ + YCHAR *alias = NULL; + obj = yaffs_get_equivalent_obj(obj); + + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + return obj->variant.file_variant.file_size; + case YAFFS_OBJECT_TYPE_SYMLINK: + alias = obj->variant.symlink_variant.alias; + if (!alias) + return 0; + return strnlen(alias, YAFFS_MAX_ALIAS_LENGTH); + default: + return 0; + } +} + +int yaffs_set_attribs(struct yaffs_obj *obj, struct iattr *attr) +{ + unsigned int valid = attr->ia_valid; + + if (valid & ATTR_MODE) + obj->yst_mode = attr->ia_mode; + if (valid & ATTR_UID) + obj->yst_uid = attr->ia_uid; + if (valid & ATTR_GID) + obj->yst_gid = attr->ia_gid; + + if (valid & ATTR_ATIME) + obj->yst_atime = Y_TIME_CONVERT(attr->ia_atime); + if (valid & ATTR_CTIME) + obj->yst_ctime = Y_TIME_CONVERT(attr->ia_ctime); + if (valid & ATTR_MTIME) + obj->yst_mtime = Y_TIME_CONVERT(attr->ia_mtime); + + if (valid & ATTR_SIZE) + yaffs_resize_file(obj, attr->ia_size); + + yaffs_update_oh(obj, NULL, 1, 0, 0, NULL); + + return YAFFS_OK; + +} + +int yaffs_get_attribs(struct yaffs_obj *obj, struct iattr *attr) +{ + unsigned int valid = 0; + + attr->ia_mode = obj->yst_mode; + valid |= ATTR_MODE; + attr->ia_uid = obj->yst_uid; + valid |= ATTR_UID; + attr->ia_gid = obj->yst_gid; + valid |= ATTR_GID; + + Y_TIME_CONVERT(attr->ia_atime) = obj->yst_atime; + valid |= ATTR_ATIME; + Y_TIME_CONVERT(attr->ia_ctime) = obj->yst_ctime; + valid |= ATTR_CTIME; + Y_TIME_CONVERT(attr->ia_mtime) = obj->yst_mtime; + valid |= ATTR_MTIME; + + attr->ia_size = yaffs_get_file_size(obj); + valid |= ATTR_SIZE; + + attr->ia_valid = valid; + + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_attribs.h b/fs/yaffs2/yaffs_attribs.h new file mode 100644 index 00000000..33d541d6 --- /dev/null +++ b/fs/yaffs2/yaffs_attribs.h @@ -0,0 +1,28 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. 
+ */ + +#ifndef __YAFFS_ATTRIBS_H__ +#define __YAFFS_ATTRIBS_H__ + +#include "yaffs_guts.h" + +void yaffs_load_attribs(struct yaffs_obj *obj, struct yaffs_obj_hdr *oh); +void yaffs_load_attribs_oh(struct yaffs_obj_hdr *oh, struct yaffs_obj *obj); +void yaffs_attribs_init(struct yaffs_obj *obj, u32 gid, u32 uid, u32 rdev); +void yaffs_load_current_time(struct yaffs_obj *obj, int do_a, int do_c); +int yaffs_set_attribs(struct yaffs_obj *obj, struct iattr *attr); +int yaffs_get_attribs(struct yaffs_obj *obj, struct iattr *attr); + +#endif diff --git a/fs/yaffs2/yaffs_bitmap.c b/fs/yaffs2/yaffs_bitmap.c new file mode 100644 index 00000000..7df42cd0 --- /dev/null +++ b/fs/yaffs2/yaffs_bitmap.c @@ -0,0 +1,98 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_bitmap.h" +#include "yaffs_trace.h" +/* + * Chunk bitmap manipulations + */ + +static inline u8 *yaffs_block_bits(struct yaffs_dev *dev, int blk) +{ + if (blk < dev->internal_start_block || blk > dev->internal_end_block) { + yaffs_trace(YAFFS_TRACE_ERROR, + "BlockBits block %d is not valid", + blk); + YBUG(); + } + return dev->chunk_bits + + (dev->chunk_bit_stride * (blk - dev->internal_start_block)); +} + +void yaffs_verify_chunk_bit_id(struct yaffs_dev *dev, int blk, int chunk) +{ + if (blk < dev->internal_start_block || blk > dev->internal_end_block || + chunk < 0 || chunk >= dev->param.chunks_per_block) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Chunk Id (%d:%d) invalid", + blk, chunk); + YBUG(); + } +} + +void yaffs_clear_chunk_bits(struct yaffs_dev *dev, int blk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + + memset(blk_bits, 0, dev->chunk_bit_stride); +} + +void yaffs_clear_chunk_bit(struct yaffs_dev *dev, int blk, int chunk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + + yaffs_verify_chunk_bit_id(dev, blk, chunk); + + blk_bits[chunk / 8] &= ~(1 << (chunk & 7)); +} + +void yaffs_set_chunk_bit(struct yaffs_dev *dev, int blk, int chunk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + + yaffs_verify_chunk_bit_id(dev, blk, chunk); + + blk_bits[chunk / 8] |= (1 << (chunk & 7)); +} + +int yaffs_check_chunk_bit(struct yaffs_dev *dev, int blk, int chunk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + yaffs_verify_chunk_bit_id(dev, blk, chunk); + + return (blk_bits[chunk / 8] & (1 << (chunk & 7))) ? 1 : 0; +} + +int yaffs_still_some_chunks(struct yaffs_dev *dev, int blk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + int i; + for (i = 0; i < dev->chunk_bit_stride; i++) { + if (*blk_bits) + return 1; + blk_bits++; + } + return 0; +} + +int yaffs_count_chunk_bits(struct yaffs_dev *dev, int blk) +{ + u8 *blk_bits = yaffs_block_bits(dev, blk); + int i; + int n = 0; + + for (i = 0; i < dev->chunk_bit_stride; i++, blk_bits++) + n += hweight8(*blk_bits); + + return n; +} diff --git a/fs/yaffs2/yaffs_bitmap.h b/fs/yaffs2/yaffs_bitmap.h new file mode 100644 index 00000000..cf9ea58d --- /dev/null +++ b/fs/yaffs2/yaffs_bitmap.h @@ -0,0 +1,33 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. 
+ * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +/* + * Chunk bitmap manipulations + */ + +#ifndef __YAFFS_BITMAP_H__ +#define __YAFFS_BITMAP_H__ + +#include "yaffs_guts.h" + +void yaffs_verify_chunk_bit_id(struct yaffs_dev *dev, int blk, int chunk); +void yaffs_clear_chunk_bits(struct yaffs_dev *dev, int blk); +void yaffs_clear_chunk_bit(struct yaffs_dev *dev, int blk, int chunk); +void yaffs_set_chunk_bit(struct yaffs_dev *dev, int blk, int chunk); +int yaffs_check_chunk_bit(struct yaffs_dev *dev, int blk, int chunk); +int yaffs_still_some_chunks(struct yaffs_dev *dev, int blk); +int yaffs_count_chunk_bits(struct yaffs_dev *dev, int blk); + +#endif diff --git a/fs/yaffs2/yaffs_checkptrw.c b/fs/yaffs2/yaffs_checkptrw.c new file mode 100644 index 00000000..4e40f437 --- /dev/null +++ b/fs/yaffs2/yaffs_checkptrw.c @@ -0,0 +1,415 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_checkptrw.h" +#include "yaffs_getblockinfo.h" + +static int yaffs2_checkpt_space_ok(struct yaffs_dev *dev) +{ + int blocks_avail = dev->n_erased_blocks - dev->param.n_reserved_blocks; + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "checkpt blocks_avail = %d", blocks_avail); + + return (blocks_avail <= 0) ? 0 : 1; +} + +static int yaffs_checkpt_erase(struct yaffs_dev *dev) +{ + int i; + + if (!dev->param.erase_fn) + return 0; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "checking blocks %d to %d", + dev->internal_start_block, dev->internal_end_block); + + for (i = dev->internal_start_block; i <= dev->internal_end_block; i++) { + struct yaffs_block_info *bi = yaffs_get_block_info(dev, i); + if (bi->block_state == YAFFS_BLOCK_STATE_CHECKPOINT) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "erasing checkpt block %d", i); + + dev->n_erasures++; + + if (dev->param. 
+ erase_fn(dev, + i - dev->block_offset /* realign */ )) { + bi->block_state = YAFFS_BLOCK_STATE_EMPTY; + dev->n_erased_blocks++; + dev->n_free_chunks += + dev->param.chunks_per_block; + } else { + dev->param.bad_block_fn(dev, i); + bi->block_state = YAFFS_BLOCK_STATE_DEAD; + } + } + } + + dev->blocks_in_checkpt = 0; + + return 1; +} + +static void yaffs2_checkpt_find_erased_block(struct yaffs_dev *dev) +{ + int i; + int blocks_avail = dev->n_erased_blocks - dev->param.n_reserved_blocks; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "allocating checkpt block: erased %d reserved %d avail %d next %d ", + dev->n_erased_blocks, dev->param.n_reserved_blocks, + blocks_avail, dev->checkpt_next_block); + + if (dev->checkpt_next_block >= 0 && + dev->checkpt_next_block <= dev->internal_end_block && + blocks_avail > 0) { + + for (i = dev->checkpt_next_block; i <= dev->internal_end_block; + i++) { + struct yaffs_block_info *bi = + yaffs_get_block_info(dev, i); + if (bi->block_state == YAFFS_BLOCK_STATE_EMPTY) { + dev->checkpt_next_block = i + 1; + dev->checkpt_cur_block = i; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "allocating checkpt block %d", i); + return; + } + } + } + yaffs_trace(YAFFS_TRACE_CHECKPOINT, "out of checkpt blocks"); + + dev->checkpt_next_block = -1; + dev->checkpt_cur_block = -1; +} + +static void yaffs2_checkpt_find_block(struct yaffs_dev *dev) +{ + int i; + struct yaffs_ext_tags tags; + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "find next checkpt block: start: blocks %d next %d", + dev->blocks_in_checkpt, dev->checkpt_next_block); + + if (dev->blocks_in_checkpt < dev->checkpt_max_blocks) + for (i = dev->checkpt_next_block; i <= dev->internal_end_block; + i++) { + int chunk = i * dev->param.chunks_per_block; + int realigned_chunk = chunk - dev->chunk_offset; + + dev->param.read_chunk_tags_fn(dev, realigned_chunk, + NULL, &tags); + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "find next checkpt block: search: block %d oid %d seq %d eccr %d", + i, tags.obj_id, tags.seq_number, + tags.ecc_result); + + if (tags.seq_number == YAFFS_SEQUENCE_CHECKPOINT_DATA) { + /* Right kind of block */ + dev->checkpt_next_block = tags.obj_id; + dev->checkpt_cur_block = i; + dev->checkpt_block_list[dev-> + blocks_in_checkpt] = i; + dev->blocks_in_checkpt++; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "found checkpt block %d", i); + return; + } + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, "found no more checkpt blocks"); + + dev->checkpt_next_block = -1; + dev->checkpt_cur_block = -1; +} + +int yaffs2_checkpt_open(struct yaffs_dev *dev, int writing) +{ + + dev->checkpt_open_write = writing; + + /* Got the functions we need? 
*/ + if (!dev->param.write_chunk_tags_fn || + !dev->param.read_chunk_tags_fn || + !dev->param.erase_fn || !dev->param.bad_block_fn) + return 0; + + if (writing && !yaffs2_checkpt_space_ok(dev)) + return 0; + + if (!dev->checkpt_buffer) + dev->checkpt_buffer = + kmalloc(dev->param.total_bytes_per_chunk, GFP_NOFS); + if (!dev->checkpt_buffer) + return 0; + + dev->checkpt_page_seq = 0; + dev->checkpt_byte_count = 0; + dev->checkpt_sum = 0; + dev->checkpt_xor = 0; + dev->checkpt_cur_block = -1; + dev->checkpt_cur_chunk = -1; + dev->checkpt_next_block = dev->internal_start_block; + + /* Erase all the blocks in the checkpoint area */ + if (writing) { + memset(dev->checkpt_buffer, 0, dev->data_bytes_per_chunk); + dev->checkpt_byte_offs = 0; + return yaffs_checkpt_erase(dev); + } else { + int i; + /* Set to a value that will kick off a read */ + dev->checkpt_byte_offs = dev->data_bytes_per_chunk; + /* A checkpoint block list of 1 checkpoint block per 16 block is (hopefully) + * going to be way more than we need */ + dev->blocks_in_checkpt = 0; + dev->checkpt_max_blocks = + (dev->internal_end_block - dev->internal_start_block) / 16 + + 2; + dev->checkpt_block_list = + kmalloc(sizeof(int) * dev->checkpt_max_blocks, GFP_NOFS); + if (!dev->checkpt_block_list) + return 0; + + for (i = 0; i < dev->checkpt_max_blocks; i++) + dev->checkpt_block_list[i] = -1; + } + + return 1; +} + +int yaffs2_get_checkpt_sum(struct yaffs_dev *dev, u32 * sum) +{ + u32 composite_sum; + composite_sum = (dev->checkpt_sum << 8) | (dev->checkpt_xor & 0xFF); + *sum = composite_sum; + return 1; +} + +static int yaffs2_checkpt_flush_buffer(struct yaffs_dev *dev) +{ + int chunk; + int realigned_chunk; + + struct yaffs_ext_tags tags; + + if (dev->checkpt_cur_block < 0) { + yaffs2_checkpt_find_erased_block(dev); + dev->checkpt_cur_chunk = 0; + } + + if (dev->checkpt_cur_block < 0) + return 0; + + tags.is_deleted = 0; + tags.obj_id = dev->checkpt_next_block; /* Hint to next place to look */ + tags.chunk_id = dev->checkpt_page_seq + 1; + tags.seq_number = YAFFS_SEQUENCE_CHECKPOINT_DATA; + tags.n_bytes = dev->data_bytes_per_chunk; + if (dev->checkpt_cur_chunk == 0) { + /* First chunk we write for the block? 
Set block state to + checkpoint */ + struct yaffs_block_info *bi = + yaffs_get_block_info(dev, dev->checkpt_cur_block); + bi->block_state = YAFFS_BLOCK_STATE_CHECKPOINT; + dev->blocks_in_checkpt++; + } + + chunk = + dev->checkpt_cur_block * dev->param.chunks_per_block + + dev->checkpt_cur_chunk; + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "checkpoint wite buffer nand %d(%d:%d) objid %d chId %d", + chunk, dev->checkpt_cur_block, dev->checkpt_cur_chunk, + tags.obj_id, tags.chunk_id); + + realigned_chunk = chunk - dev->chunk_offset; + + dev->n_page_writes++; + + dev->param.write_chunk_tags_fn(dev, realigned_chunk, + dev->checkpt_buffer, &tags); + dev->checkpt_byte_offs = 0; + dev->checkpt_page_seq++; + dev->checkpt_cur_chunk++; + if (dev->checkpt_cur_chunk >= dev->param.chunks_per_block) { + dev->checkpt_cur_chunk = 0; + dev->checkpt_cur_block = -1; + } + memset(dev->checkpt_buffer, 0, dev->data_bytes_per_chunk); + + return 1; +} + +int yaffs2_checkpt_wr(struct yaffs_dev *dev, const void *data, int n_bytes) +{ + int i = 0; + int ok = 1; + + u8 *data_bytes = (u8 *) data; + + if (!dev->checkpt_buffer) + return 0; + + if (!dev->checkpt_open_write) + return -1; + + while (i < n_bytes && ok) { + dev->checkpt_buffer[dev->checkpt_byte_offs] = *data_bytes; + dev->checkpt_sum += *data_bytes; + dev->checkpt_xor ^= *data_bytes; + + dev->checkpt_byte_offs++; + i++; + data_bytes++; + dev->checkpt_byte_count++; + + if (dev->checkpt_byte_offs < 0 || + dev->checkpt_byte_offs >= dev->data_bytes_per_chunk) + ok = yaffs2_checkpt_flush_buffer(dev); + } + + return i; +} + +int yaffs2_checkpt_rd(struct yaffs_dev *dev, void *data, int n_bytes) +{ + int i = 0; + int ok = 1; + struct yaffs_ext_tags tags; + + int chunk; + int realigned_chunk; + + u8 *data_bytes = (u8 *) data; + + if (!dev->checkpt_buffer) + return 0; + + if (dev->checkpt_open_write) + return -1; + + while (i < n_bytes && ok) { + + if (dev->checkpt_byte_offs < 0 || + dev->checkpt_byte_offs >= dev->data_bytes_per_chunk) { + + if (dev->checkpt_cur_block < 0) { + yaffs2_checkpt_find_block(dev); + dev->checkpt_cur_chunk = 0; + } + + if (dev->checkpt_cur_block < 0) + ok = 0; + else { + chunk = dev->checkpt_cur_block * + dev->param.chunks_per_block + + dev->checkpt_cur_chunk; + + realigned_chunk = chunk - dev->chunk_offset; + + dev->n_page_reads++; + + /* read in the next chunk */ + dev->param.read_chunk_tags_fn(dev, + realigned_chunk, + dev-> + checkpt_buffer, + &tags); + + if (tags.chunk_id != (dev->checkpt_page_seq + 1) + || tags.ecc_result > YAFFS_ECC_RESULT_FIXED + || tags.seq_number != + YAFFS_SEQUENCE_CHECKPOINT_DATA) + ok = 0; + + dev->checkpt_byte_offs = 0; + dev->checkpt_page_seq++; + dev->checkpt_cur_chunk++; + + if (dev->checkpt_cur_chunk >= + dev->param.chunks_per_block) + dev->checkpt_cur_block = -1; + } + } + + if (ok) { + *data_bytes = + dev->checkpt_buffer[dev->checkpt_byte_offs]; + dev->checkpt_sum += *data_bytes; + dev->checkpt_xor ^= *data_bytes; + dev->checkpt_byte_offs++; + i++; + data_bytes++; + dev->checkpt_byte_count++; + } + } + + return i; +} + +int yaffs_checkpt_close(struct yaffs_dev *dev) +{ + + if (dev->checkpt_open_write) { + if (dev->checkpt_byte_offs != 0) + yaffs2_checkpt_flush_buffer(dev); + } else if (dev->checkpt_block_list) { + int i; + for (i = 0; + i < dev->blocks_in_checkpt + && dev->checkpt_block_list[i] >= 0; i++) { + int blk = dev->checkpt_block_list[i]; + struct yaffs_block_info *bi = NULL; + if (dev->internal_start_block <= blk + && blk <= dev->internal_end_block) + bi = yaffs_get_block_info(dev, blk); + if (bi && 
bi->block_state == YAFFS_BLOCK_STATE_EMPTY) + bi->block_state = YAFFS_BLOCK_STATE_CHECKPOINT; + else { + /* Todo this looks odd... */ + } + } + kfree(dev->checkpt_block_list); + dev->checkpt_block_list = NULL; + } + + dev->n_free_chunks -= + dev->blocks_in_checkpt * dev->param.chunks_per_block; + dev->n_erased_blocks -= dev->blocks_in_checkpt; + + yaffs_trace(YAFFS_TRACE_CHECKPOINT,"checkpoint byte count %d", + dev->checkpt_byte_count); + + if (dev->checkpt_buffer) { + /* free the buffer */ + kfree(dev->checkpt_buffer); + dev->checkpt_buffer = NULL; + return 1; + } else { + return 0; + } +} + +int yaffs2_checkpt_invalidate_stream(struct yaffs_dev *dev) +{ + /* Erase the checkpoint data */ + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "checkpoint invalidate of %d blocks", + dev->blocks_in_checkpt); + + return yaffs_checkpt_erase(dev); +} diff --git a/fs/yaffs2/yaffs_checkptrw.h b/fs/yaffs2/yaffs_checkptrw.h new file mode 100644 index 00000000..361c6067 --- /dev/null +++ b/fs/yaffs2/yaffs_checkptrw.h @@ -0,0 +1,33 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_CHECKPTRW_H__ +#define __YAFFS_CHECKPTRW_H__ + +#include "yaffs_guts.h" + +int yaffs2_checkpt_open(struct yaffs_dev *dev, int writing); + +int yaffs2_checkpt_wr(struct yaffs_dev *dev, const void *data, int n_bytes); + +int yaffs2_checkpt_rd(struct yaffs_dev *dev, void *data, int n_bytes); + +int yaffs2_get_checkpt_sum(struct yaffs_dev *dev, u32 * sum); + +int yaffs_checkpt_close(struct yaffs_dev *dev); + +int yaffs2_checkpt_invalidate_stream(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yaffs_ecc.c b/fs/yaffs2/yaffs_ecc.c new file mode 100644 index 00000000..e95a8069 --- /dev/null +++ b/fs/yaffs2/yaffs_ecc.c @@ -0,0 +1,298 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * This code implements the ECC algorithm used in SmartMedia. + * + * The ECC comprises 22 bits of parity information and is stuffed into 3 bytes. + * The two unused bit are set to 1. + * The ECC can correct single bit errors in a 256-byte page of data. Thus, two such ECC + * blocks are used on a 512-byte NAND page. + * + */ + +/* Table generated by gen-ecc.c + * Using a table means we do not have to calculate p1..p4 and p1'..p4' + * for each byte of data. These are instead provided in a table in bits7..2. + * Bit 0 of each entry indicates whether the entry has an odd or even parity, and therefore + * this bytes influence on the line parity. 
+ */ + +#include "yportenv.h" + +#include "yaffs_ecc.h" + +static const unsigned char column_parity_table[] = { + 0x00, 0x55, 0x59, 0x0c, 0x65, 0x30, 0x3c, 0x69, + 0x69, 0x3c, 0x30, 0x65, 0x0c, 0x59, 0x55, 0x00, + 0x95, 0xc0, 0xcc, 0x99, 0xf0, 0xa5, 0xa9, 0xfc, + 0xfc, 0xa9, 0xa5, 0xf0, 0x99, 0xcc, 0xc0, 0x95, + 0x99, 0xcc, 0xc0, 0x95, 0xfc, 0xa9, 0xa5, 0xf0, + 0xf0, 0xa5, 0xa9, 0xfc, 0x95, 0xc0, 0xcc, 0x99, + 0x0c, 0x59, 0x55, 0x00, 0x69, 0x3c, 0x30, 0x65, + 0x65, 0x30, 0x3c, 0x69, 0x00, 0x55, 0x59, 0x0c, + 0xa5, 0xf0, 0xfc, 0xa9, 0xc0, 0x95, 0x99, 0xcc, + 0xcc, 0x99, 0x95, 0xc0, 0xa9, 0xfc, 0xf0, 0xa5, + 0x30, 0x65, 0x69, 0x3c, 0x55, 0x00, 0x0c, 0x59, + 0x59, 0x0c, 0x00, 0x55, 0x3c, 0x69, 0x65, 0x30, + 0x3c, 0x69, 0x65, 0x30, 0x59, 0x0c, 0x00, 0x55, + 0x55, 0x00, 0x0c, 0x59, 0x30, 0x65, 0x69, 0x3c, + 0xa9, 0xfc, 0xf0, 0xa5, 0xcc, 0x99, 0x95, 0xc0, + 0xc0, 0x95, 0x99, 0xcc, 0xa5, 0xf0, 0xfc, 0xa9, + 0xa9, 0xfc, 0xf0, 0xa5, 0xcc, 0x99, 0x95, 0xc0, + 0xc0, 0x95, 0x99, 0xcc, 0xa5, 0xf0, 0xfc, 0xa9, + 0x3c, 0x69, 0x65, 0x30, 0x59, 0x0c, 0x00, 0x55, + 0x55, 0x00, 0x0c, 0x59, 0x30, 0x65, 0x69, 0x3c, + 0x30, 0x65, 0x69, 0x3c, 0x55, 0x00, 0x0c, 0x59, + 0x59, 0x0c, 0x00, 0x55, 0x3c, 0x69, 0x65, 0x30, + 0xa5, 0xf0, 0xfc, 0xa9, 0xc0, 0x95, 0x99, 0xcc, + 0xcc, 0x99, 0x95, 0xc0, 0xa9, 0xfc, 0xf0, 0xa5, + 0x0c, 0x59, 0x55, 0x00, 0x69, 0x3c, 0x30, 0x65, + 0x65, 0x30, 0x3c, 0x69, 0x00, 0x55, 0x59, 0x0c, + 0x99, 0xcc, 0xc0, 0x95, 0xfc, 0xa9, 0xa5, 0xf0, + 0xf0, 0xa5, 0xa9, 0xfc, 0x95, 0xc0, 0xcc, 0x99, + 0x95, 0xc0, 0xcc, 0x99, 0xf0, 0xa5, 0xa9, 0xfc, + 0xfc, 0xa9, 0xa5, 0xf0, 0x99, 0xcc, 0xc0, 0x95, + 0x00, 0x55, 0x59, 0x0c, 0x65, 0x30, 0x3c, 0x69, + 0x69, 0x3c, 0x30, 0x65, 0x0c, 0x59, 0x55, 0x00, +}; + + +/* Calculate the ECC for a 256-byte block of data */ +void yaffs_ecc_cacl(const unsigned char *data, unsigned char *ecc) +{ + unsigned int i; + + unsigned char col_parity = 0; + unsigned char line_parity = 0; + unsigned char line_parity_prime = 0; + unsigned char t; + unsigned char b; + + for (i = 0; i < 256; i++) { + b = column_parity_table[*data++]; + col_parity ^= b; + + if (b & 0x01) { /* odd number of bits in the byte */ + line_parity ^= i; + line_parity_prime ^= ~i; + } + } + + ecc[2] = (~col_parity) | 0x03; + + t = 0; + if (line_parity & 0x80) + t |= 0x80; + if (line_parity_prime & 0x80) + t |= 0x40; + if (line_parity & 0x40) + t |= 0x20; + if (line_parity_prime & 0x40) + t |= 0x10; + if (line_parity & 0x20) + t |= 0x08; + if (line_parity_prime & 0x20) + t |= 0x04; + if (line_parity & 0x10) + t |= 0x02; + if (line_parity_prime & 0x10) + t |= 0x01; + ecc[1] = ~t; + + t = 0; + if (line_parity & 0x08) + t |= 0x80; + if (line_parity_prime & 0x08) + t |= 0x40; + if (line_parity & 0x04) + t |= 0x20; + if (line_parity_prime & 0x04) + t |= 0x10; + if (line_parity & 0x02) + t |= 0x08; + if (line_parity_prime & 0x02) + t |= 0x04; + if (line_parity & 0x01) + t |= 0x02; + if (line_parity_prime & 0x01) + t |= 0x01; + ecc[0] = ~t; + +#ifdef CONFIG_YAFFS_ECC_WRONG_ORDER + /* Swap the bytes into the wrong order */ + t = ecc[0]; + ecc[0] = ecc[1]; + ecc[1] = t; +#endif +} + +/* Correct the ECC on a 256 byte block of data */ + +int yaffs_ecc_correct(unsigned char *data, unsigned char *read_ecc, + const unsigned char *test_ecc) +{ + unsigned char d0, d1, d2; /* deltas */ + + d0 = read_ecc[0] ^ test_ecc[0]; + d1 = read_ecc[1] ^ test_ecc[1]; + d2 = read_ecc[2] ^ test_ecc[2]; + + if ((d0 | d1 | d2) == 0) + return 0; /* no error */ + + if (((d0 ^ (d0 >> 1)) & 0x55) == 0x55 && + ((d1 ^ (d1 >> 1)) & 0x55) == 0x55 && + 
((d2 ^ (d2 >> 1)) & 0x54) == 0x54) { + /* Single bit (recoverable) error in data */ + + unsigned byte; + unsigned bit; + +#ifdef CONFIG_YAFFS_ECC_WRONG_ORDER + /* swap the bytes to correct for the wrong order */ + unsigned char t; + + t = d0; + d0 = d1; + d1 = t; +#endif + + bit = byte = 0; + + if (d1 & 0x80) + byte |= 0x80; + if (d1 & 0x20) + byte |= 0x40; + if (d1 & 0x08) + byte |= 0x20; + if (d1 & 0x02) + byte |= 0x10; + if (d0 & 0x80) + byte |= 0x08; + if (d0 & 0x20) + byte |= 0x04; + if (d0 & 0x08) + byte |= 0x02; + if (d0 & 0x02) + byte |= 0x01; + + if (d2 & 0x80) + bit |= 0x04; + if (d2 & 0x20) + bit |= 0x02; + if (d2 & 0x08) + bit |= 0x01; + + data[byte] ^= (1 << bit); + + return 1; /* Corrected the error */ + } + + if ((hweight8(d0) + hweight8(d1) + hweight8(d2)) == 1) { + /* Reccoverable error in ecc */ + + read_ecc[0] = test_ecc[0]; + read_ecc[1] = test_ecc[1]; + read_ecc[2] = test_ecc[2]; + + return 1; /* Corrected the error */ + } + + /* Unrecoverable error */ + + return -1; + +} + +/* + * ECCxxxOther does ECC calcs on arbitrary n bytes of data + */ +void yaffs_ecc_calc_other(const unsigned char *data, unsigned n_bytes, + struct yaffs_ecc_other *ecc_other) +{ + unsigned int i; + + unsigned char col_parity = 0; + unsigned line_parity = 0; + unsigned line_parity_prime = 0; + unsigned char b; + + for (i = 0; i < n_bytes; i++) { + b = column_parity_table[*data++]; + col_parity ^= b; + + if (b & 0x01) { + /* odd number of bits in the byte */ + line_parity ^= i; + line_parity_prime ^= ~i; + } + + } + + ecc_other->col_parity = (col_parity >> 2) & 0x3f; + ecc_other->line_parity = line_parity; + ecc_other->line_parity_prime = line_parity_prime; +} + +int yaffs_ecc_correct_other(unsigned char *data, unsigned n_bytes, + struct yaffs_ecc_other *read_ecc, + const struct yaffs_ecc_other *test_ecc) +{ + unsigned char delta_col; /* column parity delta */ + unsigned delta_line; /* line parity delta */ + unsigned delta_line_prime; /* line parity delta */ + unsigned bit; + + delta_col = read_ecc->col_parity ^ test_ecc->col_parity; + delta_line = read_ecc->line_parity ^ test_ecc->line_parity; + delta_line_prime = + read_ecc->line_parity_prime ^ test_ecc->line_parity_prime; + + if ((delta_col | delta_line | delta_line_prime) == 0) + return 0; /* no error */ + + if (delta_line == ~delta_line_prime && + (((delta_col ^ (delta_col >> 1)) & 0x15) == 0x15)) { + /* Single bit (recoverable) error in data */ + + bit = 0; + + if (delta_col & 0x20) + bit |= 0x04; + if (delta_col & 0x08) + bit |= 0x02; + if (delta_col & 0x02) + bit |= 0x01; + + if (delta_line >= n_bytes) + return -1; + + data[delta_line] ^= (1 << bit); + + return 1; /* corrected */ + } + + if ((hweight32(delta_line) + + hweight32(delta_line_prime) + + hweight8(delta_col)) == 1) { + /* Reccoverable error in ecc */ + + *read_ecc = *test_ecc; + return 1; /* corrected */ + } + + /* Unrecoverable error */ + + return -1; +} diff --git a/fs/yaffs2/yaffs_ecc.h b/fs/yaffs2/yaffs_ecc.h new file mode 100644 index 00000000..b0c461d6 --- /dev/null +++ b/fs/yaffs2/yaffs_ecc.h @@ -0,0 +1,44 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. 
+ * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +/* + * This code implements the ECC algorithm used in SmartMedia. + * + * The ECC comprises 22 bits of parity information and is stuffed into 3 bytes. + * The two unused bit are set to 1. + * The ECC can correct single bit errors in a 256-byte page of data. Thus, two such ECC + * blocks are used on a 512-byte NAND page. + * + */ + +#ifndef __YAFFS_ECC_H__ +#define __YAFFS_ECC_H__ + +struct yaffs_ecc_other { + unsigned char col_parity; + unsigned line_parity; + unsigned line_parity_prime; +}; + +void yaffs_ecc_cacl(const unsigned char *data, unsigned char *ecc); +int yaffs_ecc_correct(unsigned char *data, unsigned char *read_ecc, + const unsigned char *test_ecc); + +void yaffs_ecc_calc_other(const unsigned char *data, unsigned n_bytes, + struct yaffs_ecc_other *ecc); +int yaffs_ecc_correct_other(unsigned char *data, unsigned n_bytes, + struct yaffs_ecc_other *read_ecc, + const struct yaffs_ecc_other *test_ecc); +#endif diff --git a/fs/yaffs2/yaffs_getblockinfo.h b/fs/yaffs2/yaffs_getblockinfo.h new file mode 100644 index 00000000..d87acbde --- /dev/null +++ b/fs/yaffs2/yaffs_getblockinfo.h @@ -0,0 +1,35 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_GETBLOCKINFO_H__ +#define __YAFFS_GETBLOCKINFO_H__ + +#include "yaffs_guts.h" +#include "yaffs_trace.h" + +/* Function to manipulate block info */ +static inline struct yaffs_block_info *yaffs_get_block_info(struct yaffs_dev + *dev, int blk) +{ + if (blk < dev->internal_start_block || blk > dev->internal_end_block) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>> yaffs: get_block_info block %d is not valid", + blk); + YBUG(); + } + return &dev->block_info[blk - dev->internal_start_block]; +} + +#endif diff --git a/fs/yaffs2/yaffs_guts.c b/fs/yaffs2/yaffs_guts.c new file mode 100644 index 00000000..f4ae9dee --- /dev/null +++ b/fs/yaffs2/yaffs_guts.c @@ -0,0 +1,5164 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yportenv.h" +#include "yaffs_trace.h" + +#include "yaffs_guts.h" +#include "yaffs_tagsvalidity.h" +#include "yaffs_getblockinfo.h" + +#include "yaffs_tagscompat.h" + +#include "yaffs_nand.h" + +#include "yaffs_yaffs1.h" +#include "yaffs_yaffs2.h" +#include "yaffs_bitmap.h" +#include "yaffs_verify.h" + +#include "yaffs_nand.h" +#include "yaffs_packedtags2.h" + +#include "yaffs_nameval.h" +#include "yaffs_allocator.h" + +#include "yaffs_attribs.h" + +/* Note YAFFS_GC_GOOD_ENOUGH must be <= YAFFS_GC_PASSIVE_THRESHOLD */ +#define YAFFS_GC_GOOD_ENOUGH 2 +#define YAFFS_GC_PASSIVE_THRESHOLD 4 + +#include "yaffs_ecc.h" + +/* Forward declarations */ + +static int yaffs_wr_data_obj(struct yaffs_obj *in, int inode_chunk, + const u8 * buffer, int n_bytes, int use_reserve); + + + +/* Function to calculate chunk and offset */ + +static void yaffs_addr_to_chunk(struct yaffs_dev *dev, loff_t addr, + int *chunk_out, u32 * offset_out) +{ + int chunk; + u32 offset; + + chunk = (u32) (addr >> dev->chunk_shift); + + if (dev->chunk_div == 1) { + /* easy power of 2 case */ + offset = (u32) (addr & dev->chunk_mask); + } else { + /* Non power-of-2 case */ + + loff_t chunk_base; + + chunk /= dev->chunk_div; + + chunk_base = ((loff_t) chunk) * dev->data_bytes_per_chunk; + offset = (u32) (addr - chunk_base); + } + + *chunk_out = chunk; + *offset_out = offset; +} + +/* Function to return the number of shifts for a power of 2 greater than or + * equal to the given number + * Note we don't try to cater for all possible numbers and this does not have to + * be hellishly efficient. + */ + +static u32 calc_shifts_ceiling(u32 x) +{ + int extra_bits; + int shifts; + + shifts = extra_bits = 0; + + while (x > 1) { + if (x & 1) + extra_bits++; + x >>= 1; + shifts++; + } + + if (extra_bits) + shifts++; + + return shifts; +} + +/* Function to return the number of shifts to get a 1 in bit 0 + */ + +static u32 calc_shifts(u32 x) +{ + u32 shifts; + + shifts = 0; + + if (!x) + return 0; + + while (!(x & 1)) { + x >>= 1; + shifts++; + } + + return shifts; +} + +/* + * Temporary buffer manipulations. + */ + +static int yaffs_init_tmp_buffers(struct yaffs_dev *dev) +{ + int i; + u8 *buf = (u8 *) 1; + + memset(dev->temp_buffer, 0, sizeof(dev->temp_buffer)); + + for (i = 0; buf && i < YAFFS_N_TEMP_BUFFERS; i++) { + dev->temp_buffer[i].line = 0; /* not in use */ + dev->temp_buffer[i].buffer = buf = + kmalloc(dev->param.total_bytes_per_chunk, GFP_NOFS); + } + + return buf ? YAFFS_OK : YAFFS_FAIL; +} + +u8 *yaffs_get_temp_buffer(struct yaffs_dev * dev, int line_no) +{ + int i, j; + + dev->temp_in_use++; + if (dev->temp_in_use > dev->max_temp) + dev->max_temp = dev->temp_in_use; + + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) { + if (dev->temp_buffer[i].line == 0) { + dev->temp_buffer[i].line = line_no; + if ((i + 1) > dev->max_temp) { + dev->max_temp = i + 1; + for (j = 0; j <= i; j++) + dev->temp_buffer[j].max_line = + dev->temp_buffer[j].line; + } + + return dev->temp_buffer[i].buffer; + } + } + + yaffs_trace(YAFFS_TRACE_BUFFERS, + "Out of temp buffers at line %d, other held by lines:", + line_no); + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) + yaffs_trace(YAFFS_TRACE_BUFFERS," %d", dev->temp_buffer[i].line); + + /* + * If we got here then we have to allocate an unmanaged one + * This is not good. 
+ */ + + dev->unmanaged_buffer_allocs++; + return kmalloc(dev->data_bytes_per_chunk, GFP_NOFS); + +} + +void yaffs_release_temp_buffer(struct yaffs_dev *dev, u8 * buffer, int line_no) +{ + int i; + + dev->temp_in_use--; + + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) { + if (dev->temp_buffer[i].buffer == buffer) { + dev->temp_buffer[i].line = 0; + return; + } + } + + if (buffer) { + /* assume it is an unmanaged one. */ + yaffs_trace(YAFFS_TRACE_BUFFERS, + "Releasing unmanaged temp buffer in line %d", + line_no); + kfree(buffer); + dev->unmanaged_buffer_deallocs++; + } + +} + +/* + * Determine if we have a managed buffer. + */ +int yaffs_is_managed_tmp_buffer(struct yaffs_dev *dev, const u8 * buffer) +{ + int i; + + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) { + if (dev->temp_buffer[i].buffer == buffer) + return 1; + } + + for (i = 0; i < dev->param.n_caches; i++) { + if (dev->cache[i].data == buffer) + return 1; + } + + if (buffer == dev->checkpt_buffer) + return 1; + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs: unmaged buffer detected."); + return 0; +} + +/* + * Functions for robustisizing TODO + * + */ + +static void yaffs_handle_chunk_wr_ok(struct yaffs_dev *dev, int nand_chunk, + const u8 * data, + const struct yaffs_ext_tags *tags) +{ + dev = dev; + nand_chunk = nand_chunk; + data = data; + tags = tags; +} + +static void yaffs_handle_chunk_update(struct yaffs_dev *dev, int nand_chunk, + const struct yaffs_ext_tags *tags) +{ + dev = dev; + nand_chunk = nand_chunk; + tags = tags; +} + +void yaffs_handle_chunk_error(struct yaffs_dev *dev, + struct yaffs_block_info *bi) +{ + if (!bi->gc_prioritise) { + bi->gc_prioritise = 1; + dev->has_pending_prioritised_gc = 1; + bi->chunk_error_strikes++; + + if (bi->chunk_error_strikes > 3) { + bi->needs_retiring = 1; /* Too many stikes, so retire this */ + yaffs_trace(YAFFS_TRACE_ALWAYS, "yaffs: Block struck out"); + + } + } +} + +static void yaffs_handle_chunk_wr_error(struct yaffs_dev *dev, int nand_chunk, + int erased_ok) +{ + int flash_block = nand_chunk / dev->param.chunks_per_block; + struct yaffs_block_info *bi = yaffs_get_block_info(dev, flash_block); + + yaffs_handle_chunk_error(dev, bi); + + if (erased_ok) { + /* Was an actual write failure, so mark the block for retirement */ + bi->needs_retiring = 1; + yaffs_trace(YAFFS_TRACE_ERROR | YAFFS_TRACE_BAD_BLOCKS, + "**>> Block %d needs retiring", flash_block); + } + + /* Delete the chunk */ + yaffs_chunk_del(dev, nand_chunk, 1, __LINE__); + yaffs_skip_rest_of_block(dev); +} + +/* + * Verification code + */ + +/* + * Simple hash function. Needs to have a reasonable spread + */ + +static inline int yaffs_hash_fn(int n) +{ + n = abs(n); + return n % YAFFS_NOBJECT_BUCKETS; +} + +/* + * Access functions to useful fake objects. + * Note that root might have a presence in NAND if permissions are set. 
+ */ + +struct yaffs_obj *yaffs_root(struct yaffs_dev *dev) +{ + return dev->root_dir; +} + +struct yaffs_obj *yaffs_lost_n_found(struct yaffs_dev *dev) +{ + return dev->lost_n_found; +} + +/* + * Erased NAND checking functions + */ + +int yaffs_check_ff(u8 * buffer, int n_bytes) +{ + /* Horrible, slow implementation */ + while (n_bytes--) { + if (*buffer != 0xFF) + return 0; + buffer++; + } + return 1; +} + +static int yaffs_check_chunk_erased(struct yaffs_dev *dev, int nand_chunk) +{ + int retval = YAFFS_OK; + u8 *data = yaffs_get_temp_buffer(dev, __LINE__); + struct yaffs_ext_tags tags; + int result; + + result = yaffs_rd_chunk_tags_nand(dev, nand_chunk, data, &tags); + + if (tags.ecc_result > YAFFS_ECC_RESULT_NO_ERROR) + retval = YAFFS_FAIL; + + if (!yaffs_check_ff(data, dev->data_bytes_per_chunk) || + tags.chunk_used) { + yaffs_trace(YAFFS_TRACE_NANDACCESS, "Chunk %d not erased", nand_chunk); + retval = YAFFS_FAIL; + } + + yaffs_release_temp_buffer(dev, data, __LINE__); + + return retval; + +} + +static int yaffs_verify_chunk_written(struct yaffs_dev *dev, + int nand_chunk, + const u8 * data, + struct yaffs_ext_tags *tags) +{ + int retval = YAFFS_OK; + struct yaffs_ext_tags temp_tags; + u8 *buffer = yaffs_get_temp_buffer(dev, __LINE__); + int result; + + result = yaffs_rd_chunk_tags_nand(dev, nand_chunk, buffer, &temp_tags); + if (memcmp(buffer, data, dev->data_bytes_per_chunk) || + temp_tags.obj_id != tags->obj_id || + temp_tags.chunk_id != tags->chunk_id || + temp_tags.n_bytes != tags->n_bytes) + retval = YAFFS_FAIL; + + yaffs_release_temp_buffer(dev, buffer, __LINE__); + + return retval; +} + + +int yaffs_check_alloc_available(struct yaffs_dev *dev, int n_chunks) +{ + int reserved_chunks; + int reserved_blocks = dev->param.n_reserved_blocks; + int checkpt_blocks; + + checkpt_blocks = yaffs_calc_checkpt_blocks_required(dev); + + reserved_chunks = + ((reserved_blocks + checkpt_blocks) * dev->param.chunks_per_block); + + return (dev->n_free_chunks > (reserved_chunks + n_chunks)); +} + +static int yaffs_find_alloc_block(struct yaffs_dev *dev) +{ + int i; + + struct yaffs_block_info *bi; + + if (dev->n_erased_blocks < 1) { + /* Hoosterman we've got a problem. + * Can't get space to gc + */ + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: no more erased blocks" ); + + return -1; + } + + /* Find an empty block. 
*/ + + for (i = dev->internal_start_block; i <= dev->internal_end_block; i++) { + dev->alloc_block_finder++; + if (dev->alloc_block_finder < dev->internal_start_block + || dev->alloc_block_finder > dev->internal_end_block) { + dev->alloc_block_finder = dev->internal_start_block; + } + + bi = yaffs_get_block_info(dev, dev->alloc_block_finder); + + if (bi->block_state == YAFFS_BLOCK_STATE_EMPTY) { + bi->block_state = YAFFS_BLOCK_STATE_ALLOCATING; + dev->seq_number++; + bi->seq_number = dev->seq_number; + dev->n_erased_blocks--; + yaffs_trace(YAFFS_TRACE_ALLOCATE, + "Allocated block %d, seq %d, %d left" , + dev->alloc_block_finder, dev->seq_number, + dev->n_erased_blocks); + return dev->alloc_block_finder; + } + } + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs tragedy: no more erased blocks, but there should have been %d", + dev->n_erased_blocks); + + return -1; +} + +static int yaffs_alloc_chunk(struct yaffs_dev *dev, int use_reserver, + struct yaffs_block_info **block_ptr) +{ + int ret_val; + struct yaffs_block_info *bi; + + if (dev->alloc_block < 0) { + /* Get next block to allocate off */ + dev->alloc_block = yaffs_find_alloc_block(dev); + dev->alloc_page = 0; + } + + if (!use_reserver && !yaffs_check_alloc_available(dev, 1)) { + /* Not enough space to allocate unless we're allowed to use the reserve. */ + return -1; + } + + if (dev->n_erased_blocks < dev->param.n_reserved_blocks + && dev->alloc_page == 0) + yaffs_trace(YAFFS_TRACE_ALLOCATE, "Allocating reserve"); + + /* Next page please.... */ + if (dev->alloc_block >= 0) { + bi = yaffs_get_block_info(dev, dev->alloc_block); + + ret_val = (dev->alloc_block * dev->param.chunks_per_block) + + dev->alloc_page; + bi->pages_in_use++; + yaffs_set_chunk_bit(dev, dev->alloc_block, dev->alloc_page); + + dev->alloc_page++; + + dev->n_free_chunks--; + + /* If the block is full set the state to full */ + if (dev->alloc_page >= dev->param.chunks_per_block) { + bi->block_state = YAFFS_BLOCK_STATE_FULL; + dev->alloc_block = -1; + } + + if (block_ptr) + *block_ptr = bi; + + return ret_val; + } + + yaffs_trace(YAFFS_TRACE_ERROR, "!!!!!!!!! Allocator out !!!!!!!!!!!!!!!!!" ); + + return -1; +} + +static int yaffs_get_erased_chunks(struct yaffs_dev *dev) +{ + int n; + + n = dev->n_erased_blocks * dev->param.chunks_per_block; + + if (dev->alloc_block > 0) + n += (dev->param.chunks_per_block - dev->alloc_page); + + return n; + +} + +/* + * yaffs_skip_rest_of_block() skips over the rest of the allocation block + * if we don't want to write to it. + */ +void yaffs_skip_rest_of_block(struct yaffs_dev *dev) +{ + if (dev->alloc_block > 0) { + struct yaffs_block_info *bi = + yaffs_get_block_info(dev, dev->alloc_block); + if (bi->block_state == YAFFS_BLOCK_STATE_ALLOCATING) { + bi->block_state = YAFFS_BLOCK_STATE_FULL; + dev->alloc_block = -1; + } + } +} + +static int yaffs_write_new_chunk(struct yaffs_dev *dev, + const u8 * data, + struct yaffs_ext_tags *tags, int use_reserver) +{ + int attempts = 0; + int write_ok = 0; + int chunk; + + yaffs2_checkpt_invalidate(dev); + + do { + struct yaffs_block_info *bi = 0; + int erased_ok = 0; + + chunk = yaffs_alloc_chunk(dev, use_reserver, &bi); + if (chunk < 0) { + /* no space */ + break; + } + + /* First check this chunk is erased, if it needs + * checking. The checking policy (unless forced + * always on) is as follows: + * + * Check the first page we try to write in a block. + * If the check passes then we don't need to check any + * more. If the check fails, we check again... 
+ * If the block has been erased, we don't need to check. + * + * However, if the block has been prioritised for gc, + * then we think there might be something odd about + * this block and stop using it. + * + * Rationale: We should only ever see chunks that have + * not been erased if there was a partially written + * chunk due to power loss. This checking policy should + * catch that case with very few checks and thus save a + * lot of checks that are most likely not needed. + * + * Mods to the above + * If an erase check fails or the write fails we skip the + * rest of the block. + */ + + /* let's give it a try */ + attempts++; + + if (dev->param.always_check_erased) + bi->skip_erased_check = 0; + + if (!bi->skip_erased_check) { + erased_ok = yaffs_check_chunk_erased(dev, chunk); + if (erased_ok != YAFFS_OK) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>> yaffs chunk %d was not erased", + chunk); + + /* If not erased, delete this one, + * skip rest of block and + * try another chunk */ + yaffs_chunk_del(dev, chunk, 1, __LINE__); + yaffs_skip_rest_of_block(dev); + continue; + } + } + + write_ok = yaffs_wr_chunk_tags_nand(dev, chunk, data, tags); + + if (!bi->skip_erased_check) + write_ok = + yaffs_verify_chunk_written(dev, chunk, data, tags); + + if (write_ok != YAFFS_OK) { + /* Clean up aborted write, skip to next block and + * try another chunk */ + yaffs_handle_chunk_wr_error(dev, chunk, erased_ok); + continue; + } + + bi->skip_erased_check = 1; + + /* Copy the data into the robustification buffer */ + yaffs_handle_chunk_wr_ok(dev, chunk, data, tags); + + } while (write_ok != YAFFS_OK && + (yaffs_wr_attempts <= 0 || attempts <= yaffs_wr_attempts)); + + if (!write_ok) + chunk = -1; + + if (attempts > 1) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>> yaffs write required %d attempts", + attempts); + dev->n_retired_writes += (attempts - 1); + } + + return chunk; +} + +/* + * Block retiring for handling a broken block. 
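+ *
+ * The sequence below first tries yaffs_mark_bad(); if that fails it falls
+ * back to erasing the block and writing a chunk tagged with
+ * YAFFS_SEQUENCE_BAD_BLOCK as an in-band bad block marker. Either way the
+ * block ends up in the DEAD state and is never allocated again.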
+ */ + +static void yaffs_retire_block(struct yaffs_dev *dev, int flash_block) +{ + struct yaffs_block_info *bi = yaffs_get_block_info(dev, flash_block); + + yaffs2_checkpt_invalidate(dev); + + yaffs2_clear_oldest_dirty_seq(dev, bi); + + if (yaffs_mark_bad(dev, flash_block) != YAFFS_OK) { + if (yaffs_erase_block(dev, flash_block) != YAFFS_OK) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs: Failed to mark bad and erase block %d", + flash_block); + } else { + struct yaffs_ext_tags tags; + int chunk_id = + flash_block * dev->param.chunks_per_block; + + u8 *buffer = yaffs_get_temp_buffer(dev, __LINE__); + + memset(buffer, 0xff, dev->data_bytes_per_chunk); + yaffs_init_tags(&tags); + tags.seq_number = YAFFS_SEQUENCE_BAD_BLOCK; + if (dev->param.write_chunk_tags_fn(dev, chunk_id - + dev->chunk_offset, + buffer, + &tags) != YAFFS_OK) + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs: Failed to write bad block marker to block %d", + flash_block); + + yaffs_release_temp_buffer(dev, buffer, __LINE__); + } + } + + bi->block_state = YAFFS_BLOCK_STATE_DEAD; + bi->gc_prioritise = 0; + bi->needs_retiring = 0; + + dev->n_retired_blocks++; +} + +/*---------------- Name handling functions ------------*/ + +static u16 yaffs_calc_name_sum(const YCHAR * name) +{ + u16 sum = 0; + u16 i = 1; + + const YUCHAR *bname = (const YUCHAR *)name; + if (bname) { + while ((*bname) && (i < (YAFFS_MAX_NAME_LENGTH / 2))) { + + /* 0x1f mask is case insensitive */ + sum += ((*bname) & 0x1f) * i; + i++; + bname++; + } + } + return sum; +} + +void yaffs_set_obj_name(struct yaffs_obj *obj, const YCHAR * name) +{ +#ifndef CONFIG_YAFFS_NO_SHORT_NAMES + memset(obj->short_name, 0, sizeof(obj->short_name)); + if (name && + strnlen(name, YAFFS_SHORT_NAME_LENGTH + 1) <= + YAFFS_SHORT_NAME_LENGTH) + strcpy(obj->short_name, name); + else + obj->short_name[0] = _Y('\0'); +#endif + obj->sum = yaffs_calc_name_sum(name); +} + +void yaffs_set_obj_name_from_oh(struct yaffs_obj *obj, + const struct yaffs_obj_hdr *oh) +{ +#ifdef CONFIG_YAFFS_AUTO_UNICODE + YCHAR tmp_name[YAFFS_MAX_NAME_LENGTH + 1]; + memset(tmp_name, 0, sizeof(tmp_name)); + yaffs_load_name_from_oh(obj->my_dev, tmp_name, oh->name, + YAFFS_MAX_NAME_LENGTH + 1); + yaffs_set_obj_name(obj, tmp_name); +#else + yaffs_set_obj_name(obj, oh->name); +#endif +} + +/*-------------------- TNODES ------------------- + + * List of spare tnodes + * The list is hooked together using the first pointer + * in the tnode. 
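+ *
+ * Level 0 tnodes pack their entries as tnode_width-bit fields in a u32 array
+ * (see yaffs_load_tnode_0() and yaffs_get_group_base() below). As a worked
+ * example with a hypothetical 18-bit tnode_width, entry 5 starts at bit 90,
+ * i.e. bit 26 of word 2, and its top 12 bits spill over into word 3.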
+ */ + +struct yaffs_tnode *yaffs_get_tnode(struct yaffs_dev *dev) +{ + struct yaffs_tnode *tn = yaffs_alloc_raw_tnode(dev); + if (tn) { + memset(tn, 0, dev->tnode_size); + dev->n_tnodes++; + } + + dev->checkpoint_blocks_required = 0; /* force recalculation */ + + return tn; +} + +/* FreeTnode frees up a tnode and puts it back on the free list */ +static void yaffs_free_tnode(struct yaffs_dev *dev, struct yaffs_tnode *tn) +{ + yaffs_free_raw_tnode(dev, tn); + dev->n_tnodes--; + dev->checkpoint_blocks_required = 0; /* force recalculation */ +} + +static void yaffs_deinit_tnodes_and_objs(struct yaffs_dev *dev) +{ + yaffs_deinit_raw_tnodes_and_objs(dev); + dev->n_obj = 0; + dev->n_tnodes = 0; +} + +void yaffs_load_tnode_0(struct yaffs_dev *dev, struct yaffs_tnode *tn, + unsigned pos, unsigned val) +{ + u32 *map = (u32 *) tn; + u32 bit_in_map; + u32 bit_in_word; + u32 word_in_map; + u32 mask; + + pos &= YAFFS_TNODES_LEVEL0_MASK; + val >>= dev->chunk_grp_bits; + + bit_in_map = pos * dev->tnode_width; + word_in_map = bit_in_map / 32; + bit_in_word = bit_in_map & (32 - 1); + + mask = dev->tnode_mask << bit_in_word; + + map[word_in_map] &= ~mask; + map[word_in_map] |= (mask & (val << bit_in_word)); + + if (dev->tnode_width > (32 - bit_in_word)) { + bit_in_word = (32 - bit_in_word); + word_in_map++;; + mask = + dev->tnode_mask >> ( /*dev->tnode_width - */ bit_in_word); + map[word_in_map] &= ~mask; + map[word_in_map] |= (mask & (val >> bit_in_word)); + } +} + +u32 yaffs_get_group_base(struct yaffs_dev *dev, struct yaffs_tnode *tn, + unsigned pos) +{ + u32 *map = (u32 *) tn; + u32 bit_in_map; + u32 bit_in_word; + u32 word_in_map; + u32 val; + + pos &= YAFFS_TNODES_LEVEL0_MASK; + + bit_in_map = pos * dev->tnode_width; + word_in_map = bit_in_map / 32; + bit_in_word = bit_in_map & (32 - 1); + + val = map[word_in_map] >> bit_in_word; + + if (dev->tnode_width > (32 - bit_in_word)) { + bit_in_word = (32 - bit_in_word); + word_in_map++;; + val |= (map[word_in_map] << bit_in_word); + } + + val &= dev->tnode_mask; + val <<= dev->chunk_grp_bits; + + return val; +} + +/* ------------------- End of individual tnode manipulation -----------------*/ + +/* ---------Functions to manipulate the look-up tree (made up of tnodes) ------ + * The look up tree is represented by the top tnode and the number of top_level + * in the tree. 0 means only the level 0 tnode is in the tree. + */ + +/* FindLevel0Tnode finds the level 0 tnode, if one exists. */ +struct yaffs_tnode *yaffs_find_tnode_0(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct, + u32 chunk_id) +{ + struct yaffs_tnode *tn = file_struct->top; + u32 i; + int required_depth; + int level = file_struct->top_level; + + dev = dev; + + /* Check sane level and chunk Id */ + if (level < 0 || level > YAFFS_TNODES_MAX_LEVEL) + return NULL; + + if (chunk_id > YAFFS_MAX_CHUNK_ID) + return NULL; + + /* First check we're tall enough (ie enough top_level) */ + + i = chunk_id >> YAFFS_TNODES_LEVEL0_BITS; + required_depth = 0; + while (i) { + i >>= YAFFS_TNODES_INTERNAL_BITS; + required_depth++; + } + + if (required_depth > file_struct->top_level) + return NULL; /* Not tall enough, so we can't find it */ + + /* Traverse down to level 0 */ + while (level > 0 && tn) { + tn = tn->internal[(chunk_id >> + (YAFFS_TNODES_LEVEL0_BITS + + (level - 1) * + YAFFS_TNODES_INTERNAL_BITS)) & + YAFFS_TNODES_INTERNAL_MASK]; + level--; + } + + return tn; +} + +/* AddOrFindLevel0Tnode finds the level 0 tnode if it exists, otherwise first expands the tree. 
+ * This happens in two steps: + * 1. If the tree isn't tall enough, then make it taller. + * 2. Scan down the tree towards the level 0 tnode adding tnodes if required. + * + * Used when modifying the tree. + * + * If the tn argument is NULL, then a fresh tnode will be added otherwise the specified tn will + * be plugged into the ttree. + */ + +struct yaffs_tnode *yaffs_add_find_tnode_0(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct, + u32 chunk_id, + struct yaffs_tnode *passed_tn) +{ + int required_depth; + int i; + int l; + struct yaffs_tnode *tn; + + u32 x; + + /* Check sane level and page Id */ + if (file_struct->top_level < 0 + || file_struct->top_level > YAFFS_TNODES_MAX_LEVEL) + return NULL; + + if (chunk_id > YAFFS_MAX_CHUNK_ID) + return NULL; + + /* First check we're tall enough (ie enough top_level) */ + + x = chunk_id >> YAFFS_TNODES_LEVEL0_BITS; + required_depth = 0; + while (x) { + x >>= YAFFS_TNODES_INTERNAL_BITS; + required_depth++; + } + + if (required_depth > file_struct->top_level) { + /* Not tall enough, gotta make the tree taller */ + for (i = file_struct->top_level; i < required_depth; i++) { + + tn = yaffs_get_tnode(dev); + + if (tn) { + tn->internal[0] = file_struct->top; + file_struct->top = tn; + file_struct->top_level++; + } else { + yaffs_trace(YAFFS_TRACE_ERROR, "yaffs: no more tnodes"); + return NULL; + } + } + } + + /* Traverse down to level 0, adding anything we need */ + + l = file_struct->top_level; + tn = file_struct->top; + + if (l > 0) { + while (l > 0 && tn) { + x = (chunk_id >> + (YAFFS_TNODES_LEVEL0_BITS + + (l - 1) * YAFFS_TNODES_INTERNAL_BITS)) & + YAFFS_TNODES_INTERNAL_MASK; + + if ((l > 1) && !tn->internal[x]) { + /* Add missing non-level-zero tnode */ + tn->internal[x] = yaffs_get_tnode(dev); + if (!tn->internal[x]) + return NULL; + } else if (l == 1) { + /* Looking from level 1 at level 0 */ + if (passed_tn) { + /* If we already have one, then release it. */ + if (tn->internal[x]) + yaffs_free_tnode(dev, + tn-> + internal[x]); + tn->internal[x] = passed_tn; + + } else if (!tn->internal[x]) { + /* Don't have one, none passed in */ + tn->internal[x] = yaffs_get_tnode(dev); + if (!tn->internal[x]) + return NULL; + } + } + + tn = tn->internal[x]; + l--; + } + } else { + /* top is level 0 */ + if (passed_tn) { + memcpy(tn, passed_tn, + (dev->tnode_width * YAFFS_NTNODES_LEVEL0) / 8); + yaffs_free_tnode(dev, passed_tn); + } + } + + return tn; +} + +static int yaffs_tags_match(const struct yaffs_ext_tags *tags, int obj_id, + int chunk_obj) +{ + return (tags->chunk_id == chunk_obj && + tags->obj_id == obj_id && !tags->is_deleted) ? 
1 : 0; + +} + +static int yaffs_find_chunk_in_group(struct yaffs_dev *dev, int the_chunk, + struct yaffs_ext_tags *tags, int obj_id, + int inode_chunk) +{ + int j; + + for (j = 0; the_chunk && j < dev->chunk_grp_size; j++) { + if (yaffs_check_chunk_bit + (dev, the_chunk / dev->param.chunks_per_block, + the_chunk % dev->param.chunks_per_block)) { + + if (dev->chunk_grp_size == 1) + return the_chunk; + else { + yaffs_rd_chunk_tags_nand(dev, the_chunk, NULL, + tags); + if (yaffs_tags_match(tags, obj_id, inode_chunk)) { + /* found it; */ + return the_chunk; + } + } + } + the_chunk++; + } + return -1; +} + +static int yaffs_find_chunk_in_file(struct yaffs_obj *in, int inode_chunk, + struct yaffs_ext_tags *tags) +{ + /*Get the Tnode, then get the level 0 offset chunk offset */ + struct yaffs_tnode *tn; + int the_chunk = -1; + struct yaffs_ext_tags local_tags; + int ret_val = -1; + + struct yaffs_dev *dev = in->my_dev; + + if (!tags) { + /* Passed a NULL, so use our own tags space */ + tags = &local_tags; + } + + tn = yaffs_find_tnode_0(dev, &in->variant.file_variant, inode_chunk); + + if (tn) { + the_chunk = yaffs_get_group_base(dev, tn, inode_chunk); + + ret_val = + yaffs_find_chunk_in_group(dev, the_chunk, tags, in->obj_id, + inode_chunk); + } + return ret_val; +} + +static int yaffs_find_del_file_chunk(struct yaffs_obj *in, int inode_chunk, + struct yaffs_ext_tags *tags) +{ + /* Get the Tnode, then get the level 0 offset chunk offset */ + struct yaffs_tnode *tn; + int the_chunk = -1; + struct yaffs_ext_tags local_tags; + + struct yaffs_dev *dev = in->my_dev; + int ret_val = -1; + + if (!tags) { + /* Passed a NULL, so use our own tags space */ + tags = &local_tags; + } + + tn = yaffs_find_tnode_0(dev, &in->variant.file_variant, inode_chunk); + + if (tn) { + + the_chunk = yaffs_get_group_base(dev, tn, inode_chunk); + + ret_val = + yaffs_find_chunk_in_group(dev, the_chunk, tags, in->obj_id, + inode_chunk); + + /* Delete the entry in the filestructure (if found) */ + if (ret_val != -1) + yaffs_load_tnode_0(dev, tn, inode_chunk, 0); + } + + return ret_val; +} + +int yaffs_put_chunk_in_file(struct yaffs_obj *in, int inode_chunk, + int nand_chunk, int in_scan) +{ + /* NB in_scan is zero unless scanning. + * For forward scanning, in_scan is > 0; + * for backward scanning in_scan is < 0 + * + * nand_chunk = 0 is a dummy insert to make sure the tnodes are there. + */ + + struct yaffs_tnode *tn; + struct yaffs_dev *dev = in->my_dev; + int existing_cunk; + struct yaffs_ext_tags existing_tags; + struct yaffs_ext_tags new_tags; + unsigned existing_serial, new_serial; + + if (in->variant_type != YAFFS_OBJECT_TYPE_FILE) { + /* Just ignore an attempt at putting a chunk into a non-file during scanning + * If it is not during Scanning then something went wrong! + */ + if (!in_scan) { + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy:attempt to put data chunk into a non-file" + ); + YBUG(); + } + + yaffs_chunk_del(dev, nand_chunk, 1, __LINE__); + return YAFFS_OK; + } + + tn = yaffs_add_find_tnode_0(dev, + &in->variant.file_variant, + inode_chunk, NULL); + if (!tn) + return YAFFS_FAIL; + + if (!nand_chunk) + /* Dummy insert, bail now */ + return YAFFS_OK; + + existing_cunk = yaffs_get_group_base(dev, tn, inode_chunk); + + if (in_scan != 0) { + /* If we're scanning then we need to test for duplicates + * NB This does not need to be efficient since it should only ever + * happen when the power fails during a write, then only one + * chunk should ever be affected. 
+ * + * Correction for YAFFS2: This could happen quite a lot and we need to think about efficiency! TODO + * Update: For backward scanning we don't need to re-read tags so this is quite cheap. + */ + + if (existing_cunk > 0) { + /* NB Right now existing chunk will not be real chunk_id if the chunk group size > 1 + * thus we have to do a FindChunkInFile to get the real chunk id. + * + * We have a duplicate now we need to decide which one to use: + * + * Backwards scanning YAFFS2: The old one is what we use, dump the new one. + * Forward scanning YAFFS2: The new one is what we use, dump the old one. + * YAFFS1: Get both sets of tags and compare serial numbers. + */ + + if (in_scan > 0) { + /* Only do this for forward scanning */ + yaffs_rd_chunk_tags_nand(dev, + nand_chunk, + NULL, &new_tags); + + /* Do a proper find */ + existing_cunk = + yaffs_find_chunk_in_file(in, inode_chunk, + &existing_tags); + } + + if (existing_cunk <= 0) { + /*Hoosterman - how did this happen? */ + + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: existing chunk < 0 in scan" + ); + + } + + /* NB The deleted flags should be false, otherwise the chunks will + * not be loaded during a scan + */ + + if (in_scan > 0) { + new_serial = new_tags.serial_number; + existing_serial = existing_tags.serial_number; + } + + if ((in_scan > 0) && + (existing_cunk <= 0 || + ((existing_serial + 1) & 3) == new_serial)) { + /* Forward scanning. + * Use new + * Delete the old one and drop through to update the tnode + */ + yaffs_chunk_del(dev, existing_cunk, 1, + __LINE__); + } else { + /* Backward scanning or we want to use the existing one + * Use existing. + * Delete the new one and return early so that the tnode isn't changed + */ + yaffs_chunk_del(dev, nand_chunk, 1, __LINE__); + return YAFFS_OK; + } + } + + } + + if (existing_cunk == 0) + in->n_data_chunks++; + + yaffs_load_tnode_0(dev, tn, inode_chunk, nand_chunk); + + return YAFFS_OK; +} + +static void yaffs_soft_del_chunk(struct yaffs_dev *dev, int chunk) +{ + struct yaffs_block_info *the_block; + unsigned block_no; + + yaffs_trace(YAFFS_TRACE_DELETION, "soft delete chunk %d", chunk); + + block_no = chunk / dev->param.chunks_per_block; + the_block = yaffs_get_block_info(dev, block_no); + if (the_block) { + the_block->soft_del_pages++; + dev->n_free_chunks++; + yaffs2_update_oldest_dirty_seq(dev, block_no, the_block); + } +} + +/* SoftDeleteWorker scans backwards through the tnode tree and soft deletes all the chunks in the file. + * All soft deleting does is increment the block's softdelete count and pulls the chunk out + * of the tnode. + * Thus, essentially this is the same as DeleteWorker except that the chunks are soft deleted. + */ + +static int yaffs_soft_del_worker(struct yaffs_obj *in, struct yaffs_tnode *tn, + u32 level, int chunk_offset) +{ + int i; + int the_chunk; + int all_done = 1; + struct yaffs_dev *dev = in->my_dev; + + if (tn) { + if (level > 0) { + + for (i = YAFFS_NTNODES_INTERNAL - 1; all_done && i >= 0; + i--) { + if (tn->internal[i]) { + all_done = + yaffs_soft_del_worker(in, + tn->internal + [i], + level - 1, + (chunk_offset + << + YAFFS_TNODES_INTERNAL_BITS) + + i); + if (all_done) { + yaffs_free_tnode(dev, + tn->internal + [i]); + tn->internal[i] = NULL; + } else { + /* Hoosterman... how could this happen? */ + } + } + } + return (all_done) ? 
1 : 0; + } else if (level == 0) { + + for (i = YAFFS_NTNODES_LEVEL0 - 1; i >= 0; i--) { + the_chunk = yaffs_get_group_base(dev, tn, i); + if (the_chunk) { + /* Note this does not find the real chunk, only the chunk group. + * We make an assumption that a chunk group is not larger than + * a block. + */ + yaffs_soft_del_chunk(dev, the_chunk); + yaffs_load_tnode_0(dev, tn, i, 0); + } + + } + return 1; + + } + + } + + return 1; + +} + +static void yaffs_remove_obj_from_dir(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev = obj->my_dev; + struct yaffs_obj *parent; + + yaffs_verify_obj_in_dir(obj); + parent = obj->parent; + + yaffs_verify_dir(parent); + + if (dev && dev->param.remove_obj_fn) + dev->param.remove_obj_fn(obj); + + list_del_init(&obj->siblings); + obj->parent = NULL; + + yaffs_verify_dir(parent); +} + +void yaffs_add_obj_to_dir(struct yaffs_obj *directory, struct yaffs_obj *obj) +{ + if (!directory) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: Trying to add an object to a null pointer directory" + ); + YBUG(); + return; + } + if (directory->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: Trying to add an object to a non-directory" + ); + YBUG(); + } + + if (obj->siblings.prev == NULL) { + /* Not initialised */ + YBUG(); + } + + yaffs_verify_dir(directory); + + yaffs_remove_obj_from_dir(obj); + + /* Now add it */ + list_add(&obj->siblings, &directory->variant.dir_variant.children); + obj->parent = directory; + + if (directory == obj->my_dev->unlinked_dir + || directory == obj->my_dev->del_dir) { + obj->unlinked = 1; + obj->my_dev->n_unlinked_files++; + obj->rename_allowed = 0; + } + + yaffs_verify_dir(directory); + yaffs_verify_obj_in_dir(obj); +} + +static int yaffs_change_obj_name(struct yaffs_obj *obj, + struct yaffs_obj *new_dir, + const YCHAR * new_name, int force, int shadows) +{ + int unlink_op; + int del_op; + + struct yaffs_obj *existing_target; + + if (new_dir == NULL) + new_dir = obj->parent; /* use the old directory */ + + if (new_dir->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: yaffs_change_obj_name: new_dir is not a directory" + ); + YBUG(); + } + + /* TODO: Do we need this different handling for YAFFS2 and YAFFS1?? */ + if (obj->my_dev->param.is_yaffs2) + unlink_op = (new_dir == obj->my_dev->unlinked_dir); + else + unlink_op = (new_dir == obj->my_dev->unlinked_dir + && obj->variant_type == YAFFS_OBJECT_TYPE_FILE); + + del_op = (new_dir == obj->my_dev->del_dir); + + existing_target = yaffs_find_by_name(new_dir, new_name); + + /* If the object is a file going into the unlinked directory, + * then it is OK to just stuff it in since duplicate names are allowed. + * else only proceed if the new name does not exist and if we're putting + * it into a directory. + */ + if ((unlink_op || + del_op || + force || + (shadows > 0) || + !existing_target) && + new_dir->variant_type == YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_set_obj_name(obj, new_name); + obj->dirty = 1; + + yaffs_add_obj_to_dir(new_dir, obj); + + if (unlink_op) + obj->unlinked = 1; + + /* If it is a deletion then we mark it as a shrink for gc purposes. 
*/ + if (yaffs_update_oh(obj, new_name, 0, del_op, shadows, NULL) >= + 0) + return YAFFS_OK; + } + + return YAFFS_FAIL; +} + +/*------------------------ Short Operations Cache ---------------------------------------- + * In many situations where there is no high level buffering a lot of + * reads might be short sequential reads, and a lot of writes may be short + * sequential writes. eg. scanning/writing a jpeg file. + * In these cases, a short read/write cache can provide a huge perfomance + * benefit with dumb-as-a-rock code. + * In Linux, the page cache provides read buffering and the short op cache + * provides write buffering. + * + * There are a limited number (~10) of cache chunks per device so that we don't + * need a very intelligent search. + */ + +static int yaffs_obj_cache_dirty(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev = obj->my_dev; + int i; + struct yaffs_cache *cache; + int n_caches = obj->my_dev->param.n_caches; + + for (i = 0; i < n_caches; i++) { + cache = &dev->cache[i]; + if (cache->object == obj && cache->dirty) + return 1; + } + + return 0; +} + +static void yaffs_flush_file_cache(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev = obj->my_dev; + int lowest = -99; /* Stop compiler whining. */ + int i; + struct yaffs_cache *cache; + int chunk_written = 0; + int n_caches = obj->my_dev->param.n_caches; + + if (n_caches > 0) { + do { + cache = NULL; + + /* Find the dirty cache for this object with the lowest chunk id. */ + for (i = 0; i < n_caches; i++) { + if (dev->cache[i].object == obj && + dev->cache[i].dirty) { + if (!cache + || dev->cache[i].chunk_id < + lowest) { + cache = &dev->cache[i]; + lowest = cache->chunk_id; + } + } + } + + if (cache && !cache->locked) { + /* Write it out and free it up */ + + chunk_written = + yaffs_wr_data_obj(cache->object, + cache->chunk_id, + cache->data, + cache->n_bytes, 1); + cache->dirty = 0; + cache->object = NULL; + } + + } while (cache && chunk_written > 0); + + if (cache) + /* Hoosterman, disk full while writing cache out. */ + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: no space during cache write"); + + } + +} + +/*yaffs_flush_whole_cache(dev) + * + * + */ + +void yaffs_flush_whole_cache(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + int n_caches = dev->param.n_caches; + int i; + + /* Find a dirty object in the cache and flush it... + * until there are no further dirty objects. + */ + do { + obj = NULL; + for (i = 0; i < n_caches && !obj; i++) { + if (dev->cache[i].object && dev->cache[i].dirty) + obj = dev->cache[i].object; + + } + if (obj) + yaffs_flush_file_cache(obj); + + } while (obj); + +} + +/* Grab us a cache chunk for use. + * First look for an empty one. + * Then look for the least recently used non-dirty one. + * Then look for the least recently used dirty one...., flush and look again. + */ +static struct yaffs_cache *yaffs_grab_chunk_worker(struct yaffs_dev *dev) +{ + int i; + + if (dev->param.n_caches > 0) { + for (i = 0; i < dev->param.n_caches; i++) { + if (!dev->cache[i].object) + return &dev->cache[i]; + } + } + + return NULL; +} + +static struct yaffs_cache *yaffs_grab_chunk_cache(struct yaffs_dev *dev) +{ + struct yaffs_cache *cache; + struct yaffs_obj *the_obj; + int usage; + int i; + int pushout; + + if (dev->param.n_caches > 0) { + /* Try find a non-dirty one... */ + + cache = yaffs_grab_chunk_worker(dev); + + if (!cache) { + /* They were all dirty, find the last recently used object and flush + * its cache, then find again. 
+			 * NB what's here is not very accurate: we actually flush the object
+			 * that owns the least recently used page.
+			 */
+
+			/* With locking we can't assume we can use entry zero */
+
+			the_obj = NULL;
+			usage = -1;
+			cache = NULL;
+			pushout = -1;
+
+			for (i = 0; i < dev->param.n_caches; i++) {
+				if (dev->cache[i].object &&
+				    !dev->cache[i].locked &&
+				    (dev->cache[i].last_use < usage
+				     || !cache)) {
+					usage = dev->cache[i].last_use;
+					the_obj = dev->cache[i].object;
+					cache = &dev->cache[i];
+					pushout = i;
+				}
+			}
+
+			if (!cache || cache->dirty) {
+				/* Flush and try again */
+				yaffs_flush_file_cache(the_obj);
+				cache = yaffs_grab_chunk_worker(dev);
+			}
+
+		}
+		return cache;
+	} else {
+		return NULL;
+	}
+}
+
+/* Find a cached chunk */
+static struct yaffs_cache *yaffs_find_chunk_cache(const struct yaffs_obj *obj,
+						  int chunk_id)
+{
+	struct yaffs_dev *dev = obj->my_dev;
+	int i;
+	if (dev->param.n_caches > 0) {
+		for (i = 0; i < dev->param.n_caches; i++) {
+			if (dev->cache[i].object == obj &&
+			    dev->cache[i].chunk_id == chunk_id) {
+				dev->cache_hits++;
+
+				return &dev->cache[i];
+			}
+		}
+	}
+	return NULL;
+}
+
+/* Mark the chunk for the least recently used algorithm */
+static void yaffs_use_cache(struct yaffs_dev *dev, struct yaffs_cache *cache,
+			    int is_write)
+{
+
+	if (dev->param.n_caches > 0) {
+		if (dev->cache_last_use < 0 || dev->cache_last_use > 100000000) {
+			/* Reset the cache usages */
+			int i;
+			for (i = 1; i < dev->param.n_caches; i++)
+				dev->cache[i].last_use = 0;
+
+			dev->cache_last_use = 0;
+		}
+
+		dev->cache_last_use++;
+
+		cache->last_use = dev->cache_last_use;
+
+		if (is_write)
+			cache->dirty = 1;
+	}
+}
+
+/* Invalidate a single cache page.
+ * Do this when a whole page gets written,
+ * ie the short cache for this page is no longer valid.
+ */
+static void yaffs_invalidate_chunk_cache(struct yaffs_obj *object, int chunk_id)
+{
+	if (object->my_dev->param.n_caches > 0) {
+		struct yaffs_cache *cache =
+		    yaffs_find_chunk_cache(object, chunk_id);
+
+		if (cache)
+			cache->object = NULL;
+	}
+}
+
+/* Invalidate all the cache pages associated with this object
+ * Do this whenever the file is deleted or resized.
+ */
+static void yaffs_invalidate_whole_cache(struct yaffs_obj *in)
+{
+	int i;
+	struct yaffs_dev *dev = in->my_dev;
+
+	if (dev->param.n_caches > 0) {
+		/* Invalidate it. */
+		for (i = 0; i < dev->param.n_caches; i++) {
+			if (dev->cache[i].object == in)
+				dev->cache[i].object = NULL;
+		}
+	}
+}
+
+static void yaffs_unhash_obj(struct yaffs_obj *obj)
+{
+	int bucket;
+	struct yaffs_dev *dev = obj->my_dev;
+
+	/* If it is still linked into the bucket list, free from the list */
+	if (!list_empty(&obj->hash_link)) {
+		list_del_init(&obj->hash_link);
+		bucket = yaffs_hash_fn(obj->obj_id);
+		dev->obj_bucket[bucket].count--;
+	}
+}
+
+/* FreeObject frees up an Object and puts it back on the free list */
+static void yaffs_free_obj(struct yaffs_obj *obj)
+{
+	struct yaffs_dev *dev = obj->my_dev;
+
+	yaffs_trace(YAFFS_TRACE_OS, "FreeObject %p inode %p",
+		obj, obj->my_inode);
+
+	if (!obj)
+		YBUG();
+	if (obj->parent)
+		YBUG();
+	if (!list_empty(&obj->siblings))
+		YBUG();
+
+	if (obj->my_inode) {
+		/* We're still hooked up to a cached inode.
+ * Don't delete now, but mark for later deletion + */ + obj->defered_free = 1; + return; + } + + yaffs_unhash_obj(obj); + + yaffs_free_raw_obj(dev, obj); + dev->n_obj--; + dev->checkpoint_blocks_required = 0; /* force recalculation */ +} + +void yaffs_handle_defered_free(struct yaffs_obj *obj) +{ + if (obj->defered_free) + yaffs_free_obj(obj); +} + +static int yaffs_generic_obj_del(struct yaffs_obj *in) +{ + + /* First off, invalidate the file's data in the cache, without flushing. */ + yaffs_invalidate_whole_cache(in); + + if (in->my_dev->param.is_yaffs2 && (in->parent != in->my_dev->del_dir)) { + /* Move to the unlinked directory so we have a record that it was deleted. */ + yaffs_change_obj_name(in, in->my_dev->del_dir, _Y("deleted"), 0, + 0); + + } + + yaffs_remove_obj_from_dir(in); + yaffs_chunk_del(in->my_dev, in->hdr_chunk, 1, __LINE__); + in->hdr_chunk = 0; + + yaffs_free_obj(in); + return YAFFS_OK; + +} + +static void yaffs_soft_del_file(struct yaffs_obj *obj) +{ + if (obj->deleted && + obj->variant_type == YAFFS_OBJECT_TYPE_FILE && !obj->soft_del) { + if (obj->n_data_chunks <= 0) { + /* Empty file with no duplicate object headers, + * just delete it immediately */ + yaffs_free_tnode(obj->my_dev, + obj->variant.file_variant.top); + obj->variant.file_variant.top = NULL; + yaffs_trace(YAFFS_TRACE_TRACING, + "yaffs: Deleting empty file %d", + obj->obj_id); + yaffs_generic_obj_del(obj); + } else { + yaffs_soft_del_worker(obj, + obj->variant.file_variant.top, + obj->variant. + file_variant.top_level, 0); + obj->soft_del = 1; + } + } +} + +/* Pruning removes any part of the file structure tree that is beyond the + * bounds of the file (ie that does not point to chunks). + * + * A file should only get pruned when its size is reduced. + * + * Before pruning, the chunks must be pulled from the tree and the + * level 0 tnode entries must be zeroed out. + * Could also use this for file deletion, but that's probably better handled + * by a special case. + * + * This function is recursive. For levels > 0 the function is called again on + * any sub-tree. For level == 0 we just check if the sub-tree has data. + * If there is no data in a subtree then it is pruned. + */ + +static struct yaffs_tnode *yaffs_prune_worker(struct yaffs_dev *dev, + struct yaffs_tnode *tn, u32 level, + int del0) +{ + int i; + int has_data; + + if (tn) { + has_data = 0; + + if (level > 0) { + for (i = 0; i < YAFFS_NTNODES_INTERNAL; i++) { + if (tn->internal[i]) { + tn->internal[i] = + yaffs_prune_worker(dev, + tn->internal[i], + level - 1, + (i == + 0) ? del0 : 1); + } + + if (tn->internal[i]) + has_data++; + } + } else { + int tnode_size_u32 = dev->tnode_size / sizeof(u32); + u32 *map = (u32 *) tn; + + for (i = 0; !has_data && i < tnode_size_u32; i++) { + if (map[i]) + has_data++; + } + } + + if (has_data == 0 && del0) { + /* Free and return NULL */ + + yaffs_free_tnode(dev, tn); + tn = NULL; + } + + } + + return tn; + +} + +static int yaffs_prune_tree(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct) +{ + int i; + int has_data; + int done = 0; + struct yaffs_tnode *tn; + + if (file_struct->top_level > 0) { + file_struct->top = + yaffs_prune_worker(dev, file_struct->top, + file_struct->top_level, 0); + + /* Now we have a tree with all the non-zero branches NULL but the height + * is the same as it was. + * Let's see if we can trim internal tnodes to shorten the tree. 
+ * We can do this if only the 0th element in the tnode is in use + * (ie all the non-zero are NULL) + */ + + while (file_struct->top_level && !done) { + tn = file_struct->top; + + has_data = 0; + for (i = 1; i < YAFFS_NTNODES_INTERNAL; i++) { + if (tn->internal[i]) + has_data++; + } + + if (!has_data) { + file_struct->top = tn->internal[0]; + file_struct->top_level--; + yaffs_free_tnode(dev, tn); + } else { + done = 1; + } + } + } + + return YAFFS_OK; +} + +/*-------------------- End of File Structure functions.-------------------*/ + +/* AllocateEmptyObject gets us a clean Object. Tries to make allocate more if we run out */ +static struct yaffs_obj *yaffs_alloc_empty_obj(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj = yaffs_alloc_raw_obj(dev); + + if (obj) { + dev->n_obj++; + + /* Now sweeten it up... */ + + memset(obj, 0, sizeof(struct yaffs_obj)); + obj->being_created = 1; + + obj->my_dev = dev; + obj->hdr_chunk = 0; + obj->variant_type = YAFFS_OBJECT_TYPE_UNKNOWN; + INIT_LIST_HEAD(&(obj->hard_links)); + INIT_LIST_HEAD(&(obj->hash_link)); + INIT_LIST_HEAD(&obj->siblings); + + /* Now make the directory sane */ + if (dev->root_dir) { + obj->parent = dev->root_dir; + list_add(&(obj->siblings), + &dev->root_dir->variant.dir_variant.children); + } + + /* Add it to the lost and found directory. + * NB Can't put root or lost-n-found in lost-n-found so + * check if lost-n-found exists first + */ + if (dev->lost_n_found) + yaffs_add_obj_to_dir(dev->lost_n_found, obj); + + obj->being_created = 0; + } + + dev->checkpoint_blocks_required = 0; /* force recalculation */ + + return obj; +} + +static int yaffs_find_nice_bucket(struct yaffs_dev *dev) +{ + int i; + int l = 999; + int lowest = 999999; + + /* Search for the shortest list or one that + * isn't too long. + */ + + for (i = 0; i < 10 && lowest > 4; i++) { + dev->bucket_finder++; + dev->bucket_finder %= YAFFS_NOBJECT_BUCKETS; + if (dev->obj_bucket[dev->bucket_finder].count < lowest) { + lowest = dev->obj_bucket[dev->bucket_finder].count; + l = dev->bucket_finder; + } + + } + + return l; +} + +static int yaffs_new_obj_id(struct yaffs_dev *dev) +{ + int bucket = yaffs_find_nice_bucket(dev); + + /* Now find an object value that has not already been taken + * by scanning the list. 
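+	 * Candidate ids are bucket + k * YAFFS_NOBJECT_BUCKETS (k >= 1), so every
+	 * candidate hashes back to the same bucket and only that one bucket list
+	 * needs to be scanned for collisions.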
+ */ + + int found = 0; + struct list_head *i; + + u32 n = (u32) bucket; + + /* yaffs_check_obj_hash_sane(); */ + + while (!found) { + found = 1; + n += YAFFS_NOBJECT_BUCKETS; + if (1 || dev->obj_bucket[bucket].count > 0) { + list_for_each(i, &dev->obj_bucket[bucket].list) { + /* If there is already one in the list */ + if (i && list_entry(i, struct yaffs_obj, + hash_link)->obj_id == n) { + found = 0; + } + } + } + } + + return n; +} + +static void yaffs_hash_obj(struct yaffs_obj *in) +{ + int bucket = yaffs_hash_fn(in->obj_id); + struct yaffs_dev *dev = in->my_dev; + + list_add(&in->hash_link, &dev->obj_bucket[bucket].list); + dev->obj_bucket[bucket].count++; +} + +struct yaffs_obj *yaffs_find_by_number(struct yaffs_dev *dev, u32 number) +{ + int bucket = yaffs_hash_fn(number); + struct list_head *i; + struct yaffs_obj *in; + + list_for_each(i, &dev->obj_bucket[bucket].list) { + /* Look if it is in the list */ + if (i) { + in = list_entry(i, struct yaffs_obj, hash_link); + if (in->obj_id == number) { + + /* Don't tell the VFS about this one if it is defered free */ + if (in->defered_free) + return NULL; + + return in; + } + } + } + + return NULL; +} + +struct yaffs_obj *yaffs_new_obj(struct yaffs_dev *dev, int number, + enum yaffs_obj_type type) +{ + struct yaffs_obj *the_obj = NULL; + struct yaffs_tnode *tn = NULL; + + if (number < 0) + number = yaffs_new_obj_id(dev); + + if (type == YAFFS_OBJECT_TYPE_FILE) { + tn = yaffs_get_tnode(dev); + if (!tn) + return NULL; + } + + the_obj = yaffs_alloc_empty_obj(dev); + if (!the_obj) { + if (tn) + yaffs_free_tnode(dev, tn); + return NULL; + } + + if (the_obj) { + the_obj->fake = 0; + the_obj->rename_allowed = 1; + the_obj->unlink_allowed = 1; + the_obj->obj_id = number; + yaffs_hash_obj(the_obj); + the_obj->variant_type = type; + yaffs_load_current_time(the_obj, 1, 1); + + switch (type) { + case YAFFS_OBJECT_TYPE_FILE: + the_obj->variant.file_variant.file_size = 0; + the_obj->variant.file_variant.scanned_size = 0; + the_obj->variant.file_variant.shrink_size = ~0; /* max */ + the_obj->variant.file_variant.top_level = 0; + the_obj->variant.file_variant.top = tn; + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + INIT_LIST_HEAD(&the_obj->variant.dir_variant.children); + INIT_LIST_HEAD(&the_obj->variant.dir_variant.dirty); + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + case YAFFS_OBJECT_TYPE_HARDLINK: + case YAFFS_OBJECT_TYPE_SPECIAL: + /* No action required */ + break; + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* todo this should not happen */ + break; + } + } + + return the_obj; +} + +static struct yaffs_obj *yaffs_create_fake_dir(struct yaffs_dev *dev, + int number, u32 mode) +{ + + struct yaffs_obj *obj = + yaffs_new_obj(dev, number, YAFFS_OBJECT_TYPE_DIRECTORY); + if (obj) { + obj->fake = 1; /* it is fake so it might have no NAND presence... */ + obj->rename_allowed = 0; /* ... and we're not allowed to rename it... */ + obj->unlink_allowed = 0; /* ... or unlink it */ + obj->deleted = 0; + obj->unlinked = 0; + obj->yst_mode = mode; + obj->my_dev = dev; + obj->hdr_chunk = 0; /* Not a valid chunk. 
*/ + } + + return obj; + +} + + +static void yaffs_init_tnodes_and_objs(struct yaffs_dev *dev) +{ + int i; + + dev->n_obj = 0; + dev->n_tnodes = 0; + + yaffs_init_raw_tnodes_and_objs(dev); + + for (i = 0; i < YAFFS_NOBJECT_BUCKETS; i++) { + INIT_LIST_HEAD(&dev->obj_bucket[i].list); + dev->obj_bucket[i].count = 0; + } +} + +struct yaffs_obj *yaffs_find_or_create_by_number(struct yaffs_dev *dev, + int number, + enum yaffs_obj_type type) +{ + struct yaffs_obj *the_obj = NULL; + + if (number > 0) + the_obj = yaffs_find_by_number(dev, number); + + if (!the_obj) + the_obj = yaffs_new_obj(dev, number, type); + + return the_obj; + +} + +YCHAR *yaffs_clone_str(const YCHAR * str) +{ + YCHAR *new_str = NULL; + int len; + + if (!str) + str = _Y(""); + + len = strnlen(str, YAFFS_MAX_ALIAS_LENGTH); + new_str = kmalloc((len + 1) * sizeof(YCHAR), GFP_NOFS); + if (new_str) { + strncpy(new_str, str, len); + new_str[len] = 0; + } + return new_str; + +} +/* + *yaffs_update_parent() handles fixing a directories mtime and ctime when a new + * link (ie. name) is created or deleted in the directory. + * + * ie. + * create dir/a : update dir's mtime/ctime + * rm dir/a: update dir's mtime/ctime + * modify dir/a: don't update dir's mtimme/ctime + * + * This can be handled immediately or defered. Defering helps reduce the number + * of updates when many files in a directory are changed within a brief period. + * + * If the directory updating is defered then yaffs_update_dirty_dirs must be + * called periodically. + */ + +static void yaffs_update_parent(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev; + if (!obj) + return; + dev = obj->my_dev; + obj->dirty = 1; + yaffs_load_current_time(obj, 0, 1); + if (dev->param.defered_dir_update) { + struct list_head *link = &obj->variant.dir_variant.dirty; + + if (list_empty(link)) { + list_add(link, &dev->dirty_dirs); + yaffs_trace(YAFFS_TRACE_BACKGROUND, + "Added object %d to dirty directories", + obj->obj_id); + } + + } else { + yaffs_update_oh(obj, NULL, 0, 0, 0, NULL); + } +} + +void yaffs_update_dirty_dirs(struct yaffs_dev *dev) +{ + struct list_head *link; + struct yaffs_obj *obj; + struct yaffs_dir_var *d_s; + union yaffs_obj_var *o_v; + + yaffs_trace(YAFFS_TRACE_BACKGROUND, "Update dirty directories"); + + while (!list_empty(&dev->dirty_dirs)) { + link = dev->dirty_dirs.next; + list_del_init(link); + + d_s = list_entry(link, struct yaffs_dir_var, dirty); + o_v = list_entry(d_s, union yaffs_obj_var, dir_variant); + obj = list_entry(o_v, struct yaffs_obj, variant); + + yaffs_trace(YAFFS_TRACE_BACKGROUND, "Update directory %d", + obj->obj_id); + + if (obj->dirty) + yaffs_update_oh(obj, NULL, 0, 0, 0, NULL); + } +} + +/* + * Mknod (create) a new object. + * equiv_obj only has meaning for a hard link; + * alias_str only has meaning for a symlink. + * rdev only has meaning for devices (a subset of special objects) + */ + +static struct yaffs_obj *yaffs_create_obj(enum yaffs_obj_type type, + struct yaffs_obj *parent, + const YCHAR * name, + u32 mode, + u32 uid, + u32 gid, + struct yaffs_obj *equiv_obj, + const YCHAR * alias_str, u32 rdev) +{ + struct yaffs_obj *in; + YCHAR *str = NULL; + + struct yaffs_dev *dev = parent->my_dev; + + /* Check if the entry exists. If it does then fail the call since we don't want a dup. 
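+	 * Duplicate names are only tolerated in the unlinked/deleted fake
+	 * directories (see yaffs_change_obj_name()); in an ordinary directory the
+	 * name must be unique, hence the yaffs_find_by_name() check below.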
*/ + if (yaffs_find_by_name(parent, name)) + return NULL; + + if (type == YAFFS_OBJECT_TYPE_SYMLINK) { + str = yaffs_clone_str(alias_str); + if (!str) + return NULL; + } + + in = yaffs_new_obj(dev, -1, type); + + if (!in) { + if (str) + kfree(str); + return NULL; + } + + if (in) { + in->hdr_chunk = 0; + in->valid = 1; + in->variant_type = type; + + in->yst_mode = mode; + + yaffs_attribs_init(in, gid, uid, rdev); + + in->n_data_chunks = 0; + + yaffs_set_obj_name(in, name); + in->dirty = 1; + + yaffs_add_obj_to_dir(parent, in); + + in->my_dev = parent->my_dev; + + switch (type) { + case YAFFS_OBJECT_TYPE_SYMLINK: + in->variant.symlink_variant.alias = str; + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + in->variant.hardlink_variant.equiv_obj = equiv_obj; + in->variant.hardlink_variant.equiv_id = + equiv_obj->obj_id; + list_add(&in->hard_links, &equiv_obj->hard_links); + break; + case YAFFS_OBJECT_TYPE_FILE: + case YAFFS_OBJECT_TYPE_DIRECTORY: + case YAFFS_OBJECT_TYPE_SPECIAL: + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* do nothing */ + break; + } + + if (yaffs_update_oh(in, name, 0, 0, 0, NULL) < 0) { + /* Could not create the object header, fail the creation */ + yaffs_del_obj(in); + in = NULL; + } + + yaffs_update_parent(parent); + } + + return in; +} + +struct yaffs_obj *yaffs_create_file(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid) +{ + return yaffs_create_obj(YAFFS_OBJECT_TYPE_FILE, parent, name, mode, + uid, gid, NULL, NULL, 0); +} + +struct yaffs_obj *yaffs_create_dir(struct yaffs_obj *parent, const YCHAR * name, + u32 mode, u32 uid, u32 gid) +{ + return yaffs_create_obj(YAFFS_OBJECT_TYPE_DIRECTORY, parent, name, + mode, uid, gid, NULL, NULL, 0); +} + +struct yaffs_obj *yaffs_create_special(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid, u32 rdev) +{ + return yaffs_create_obj(YAFFS_OBJECT_TYPE_SPECIAL, parent, name, mode, + uid, gid, NULL, NULL, rdev); +} + +struct yaffs_obj *yaffs_create_symlink(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid, const YCHAR * alias) +{ + return yaffs_create_obj(YAFFS_OBJECT_TYPE_SYMLINK, parent, name, mode, + uid, gid, NULL, alias, 0); +} + +/* yaffs_link_obj returns the object id of the equivalent object.*/ +struct yaffs_obj *yaffs_link_obj(struct yaffs_obj *parent, const YCHAR * name, + struct yaffs_obj *equiv_obj) +{ + /* Get the real object in case we were fed a hard link as an equivalent object */ + equiv_obj = yaffs_get_equivalent_obj(equiv_obj); + + if (yaffs_create_obj + (YAFFS_OBJECT_TYPE_HARDLINK, parent, name, 0, 0, 0, + equiv_obj, NULL, 0)) { + return equiv_obj; + } else { + return NULL; + } + +} + + + +/*------------------------- Block Management and Page Allocation ----------------*/ + +static int yaffs_init_blocks(struct yaffs_dev *dev) +{ + int n_blocks = dev->internal_end_block - dev->internal_start_block + 1; + + dev->block_info = NULL; + dev->chunk_bits = NULL; + + dev->alloc_block = -1; /* force it to get a new one */ + + /* If the first allocation strategy fails, thry the alternate one */ + dev->block_info = + kmalloc(n_blocks * sizeof(struct yaffs_block_info), GFP_NOFS); + if (!dev->block_info) { + dev->block_info = + vmalloc(n_blocks * sizeof(struct yaffs_block_info)); + dev->block_info_alt = 1; + } else { + dev->block_info_alt = 0; + } + + if (dev->block_info) { + /* Set up dynamic blockinfo stuff. Round up bytes. 
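+		 * chunk_bit_stride is the number of bytes of chunk-in-use bitmap
+		 * kept per block, e.g. 64 chunks per block -> (64 + 7) / 8 = 8 bytes.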
*/ + dev->chunk_bit_stride = (dev->param.chunks_per_block + 7) / 8; + dev->chunk_bits = + kmalloc(dev->chunk_bit_stride * n_blocks, GFP_NOFS); + if (!dev->chunk_bits) { + dev->chunk_bits = + vmalloc(dev->chunk_bit_stride * n_blocks); + dev->chunk_bits_alt = 1; + } else { + dev->chunk_bits_alt = 0; + } + } + + if (dev->block_info && dev->chunk_bits) { + memset(dev->block_info, 0, + n_blocks * sizeof(struct yaffs_block_info)); + memset(dev->chunk_bits, 0, dev->chunk_bit_stride * n_blocks); + return YAFFS_OK; + } + + return YAFFS_FAIL; +} + +static void yaffs_deinit_blocks(struct yaffs_dev *dev) +{ + if (dev->block_info_alt && dev->block_info) + vfree(dev->block_info); + else if (dev->block_info) + kfree(dev->block_info); + + dev->block_info_alt = 0; + + dev->block_info = NULL; + + if (dev->chunk_bits_alt && dev->chunk_bits) + vfree(dev->chunk_bits); + else if (dev->chunk_bits) + kfree(dev->chunk_bits); + dev->chunk_bits_alt = 0; + dev->chunk_bits = NULL; +} + +void yaffs_block_became_dirty(struct yaffs_dev *dev, int block_no) +{ + struct yaffs_block_info *bi = yaffs_get_block_info(dev, block_no); + + int erased_ok = 0; + + /* If the block is still healthy erase it and mark as clean. + * If the block has had a data failure, then retire it. + */ + + yaffs_trace(YAFFS_TRACE_GC | YAFFS_TRACE_ERASE, + "yaffs_block_became_dirty block %d state %d %s", + block_no, bi->block_state, + (bi->needs_retiring) ? "needs retiring" : ""); + + yaffs2_clear_oldest_dirty_seq(dev, bi); + + bi->block_state = YAFFS_BLOCK_STATE_DIRTY; + + /* If this is the block being garbage collected then stop gc'ing this block */ + if (block_no == dev->gc_block) + dev->gc_block = 0; + + /* If this block is currently the best candidate for gc then drop as a candidate */ + if (block_no == dev->gc_dirtiest) { + dev->gc_dirtiest = 0; + dev->gc_pages_in_use = 0; + } + + if (!bi->needs_retiring) { + yaffs2_checkpt_invalidate(dev); + erased_ok = yaffs_erase_block(dev, block_no); + if (!erased_ok) { + dev->n_erase_failures++; + yaffs_trace(YAFFS_TRACE_ERROR | YAFFS_TRACE_BAD_BLOCKS, + "**>> Erasure failed %d", block_no); + } + } + + if (erased_ok && + ((yaffs_trace_mask & YAFFS_TRACE_ERASE) + || !yaffs_skip_verification(dev))) { + int i; + for (i = 0; i < dev->param.chunks_per_block; i++) { + if (!yaffs_check_chunk_erased + (dev, block_no * dev->param.chunks_per_block + i)) { + yaffs_trace(YAFFS_TRACE_ERROR, + ">>Block %d erasure supposedly OK, but chunk %d not erased", + block_no, i); + } + } + } + + if (erased_ok) { + /* Clean it up... 
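+		 * The block goes back to EMPTY, its counters are reset and
+		 * skip_erased_check is set because a freshly erased block does not
+		 * need to be re-verified before its first write.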
*/ + bi->block_state = YAFFS_BLOCK_STATE_EMPTY; + bi->seq_number = 0; + dev->n_erased_blocks++; + bi->pages_in_use = 0; + bi->soft_del_pages = 0; + bi->has_shrink_hdr = 0; + bi->skip_erased_check = 1; /* Clean, so no need to check */ + bi->gc_prioritise = 0; + yaffs_clear_chunk_bits(dev, block_no); + + yaffs_trace(YAFFS_TRACE_ERASE, + "Erased block %d", block_no); + } else { + /* We lost a block of free space */ + dev->n_free_chunks -= dev->param.chunks_per_block; + yaffs_retire_block(dev, block_no); + yaffs_trace(YAFFS_TRACE_ERROR | YAFFS_TRACE_BAD_BLOCKS, + "**>> Block %d retired", block_no); + } +} + + + +static int yaffs_gc_block(struct yaffs_dev *dev, int block, int whole_block) +{ + int old_chunk; + int new_chunk; + int mark_flash; + int ret_val = YAFFS_OK; + int i; + int is_checkpt_block; + int matching_chunk; + int max_copies; + + int chunks_before = yaffs_get_erased_chunks(dev); + int chunks_after; + + struct yaffs_ext_tags tags; + + struct yaffs_block_info *bi = yaffs_get_block_info(dev, block); + + struct yaffs_obj *object; + + is_checkpt_block = (bi->block_state == YAFFS_BLOCK_STATE_CHECKPOINT); + + yaffs_trace(YAFFS_TRACE_TRACING, + "Collecting block %d, in use %d, shrink %d, whole_block %d", + block, bi->pages_in_use, bi->has_shrink_hdr, + whole_block); + + /*yaffs_verify_free_chunks(dev); */ + + if (bi->block_state == YAFFS_BLOCK_STATE_FULL) + bi->block_state = YAFFS_BLOCK_STATE_COLLECTING; + + bi->has_shrink_hdr = 0; /* clear the flag so that the block can erase */ + + dev->gc_disable = 1; + + if (is_checkpt_block || !yaffs_still_some_chunks(dev, block)) { + yaffs_trace(YAFFS_TRACE_TRACING, + "Collecting block %d that has no chunks in use", + block); + yaffs_block_became_dirty(dev, block); + } else { + + u8 *buffer = yaffs_get_temp_buffer(dev, __LINE__); + + yaffs_verify_blk(dev, bi, block); + + max_copies = (whole_block) ? dev->param.chunks_per_block : 5; + old_chunk = block * dev->param.chunks_per_block + dev->gc_chunk; + + for ( /* init already done */ ; + ret_val == YAFFS_OK && + dev->gc_chunk < dev->param.chunks_per_block && + (bi->block_state == YAFFS_BLOCK_STATE_COLLECTING) && + max_copies > 0; dev->gc_chunk++, old_chunk++) { + if (yaffs_check_chunk_bit(dev, block, dev->gc_chunk)) { + + /* This page is in use and might need to be copied off */ + + max_copies--; + + mark_flash = 1; + + yaffs_init_tags(&tags); + + yaffs_rd_chunk_tags_nand(dev, old_chunk, + buffer, &tags); + + object = yaffs_find_by_number(dev, tags.obj_id); + + yaffs_trace(YAFFS_TRACE_GC_DETAIL, + "Collecting chunk in block %d, %d %d %d ", + dev->gc_chunk, tags.obj_id, + tags.chunk_id, tags.n_bytes); + + if (object && !yaffs_skip_verification(dev)) { + if (tags.chunk_id == 0) + matching_chunk = + object->hdr_chunk; + else if (object->soft_del) + matching_chunk = old_chunk; /* Defeat the test */ + else + matching_chunk = + yaffs_find_chunk_in_file + (object, tags.chunk_id, + NULL); + + if (old_chunk != matching_chunk) + yaffs_trace(YAFFS_TRACE_ERROR, + "gc: page in gc mismatch: %d %d %d %d", + old_chunk, + matching_chunk, + tags.obj_id, + tags.chunk_id); + + } + + if (!object) { + yaffs_trace(YAFFS_TRACE_ERROR, + "page %d in gc has no object: %d %d %d ", + old_chunk, + tags.obj_id, tags.chunk_id, + tags.n_bytes); + } + + if (object && + object->deleted && + object->soft_del && tags.chunk_id != 0) { + /* Data chunk in a soft deleted file, throw it away + * It's a soft deleted data chunk, + * No need to copy this, just forget about it and + * fix up the object. 
+ */ + + /* Free chunks already includes softdeleted chunks. + * How ever this chunk is going to soon be really deleted + * which will increment free chunks. + * We have to decrement free chunks so this works out properly. + */ + dev->n_free_chunks--; + bi->soft_del_pages--; + + object->n_data_chunks--; + + if (object->n_data_chunks <= 0) { + /* remeber to clean up the object */ + dev->gc_cleanup_list[dev-> + n_clean_ups] + = tags.obj_id; + dev->n_clean_ups++; + } + mark_flash = 0; + } else if (0) { + /* Todo object && object->deleted && object->n_data_chunks == 0 */ + /* Deleted object header with no data chunks. + * Can be discarded and the file deleted. + */ + object->hdr_chunk = 0; + yaffs_free_tnode(object->my_dev, + object-> + variant.file_variant. + top); + object->variant.file_variant.top = NULL; + yaffs_generic_obj_del(object); + + } else if (object) { + /* It's either a data chunk in a live file or + * an ObjectHeader, so we're interested in it. + * NB Need to keep the ObjectHeaders of deleted files + * until the whole file has been deleted off + */ + tags.serial_number++; + + dev->n_gc_copies++; + + if (tags.chunk_id == 0) { + /* It is an object Id, + * We need to nuke the shrinkheader flags first + * Also need to clean up shadowing. + * We no longer want the shrink_header flag since its work is done + * and if it is left in place it will mess up scanning. + */ + + struct yaffs_obj_hdr *oh; + oh = (struct yaffs_obj_hdr *) + buffer; + + oh->is_shrink = 0; + tags.extra_is_shrink = 0; + + oh->shadows_obj = 0; + oh->inband_shadowed_obj_id = 0; + tags.extra_shadows = 0; + + /* Update file size */ + if (object->variant_type == + YAFFS_OBJECT_TYPE_FILE) { + oh->file_size = + object->variant. + file_variant. + file_size; + tags.extra_length = + oh->file_size; + } + + yaffs_verify_oh(object, oh, + &tags, 1); + new_chunk = + yaffs_write_new_chunk(dev, + (u8 *) + oh, + &tags, + 1); + } else { + new_chunk = + yaffs_write_new_chunk(dev, + buffer, + &tags, + 1); + } + + if (new_chunk < 0) { + ret_val = YAFFS_FAIL; + } else { + + /* Ok, now fix up the Tnodes etc. */ + + if (tags.chunk_id == 0) { + /* It's a header */ + object->hdr_chunk = + new_chunk; + object->serial = + tags.serial_number; + } else { + /* It's a data chunk */ + int ok; + ok = yaffs_put_chunk_in_file(object, tags.chunk_id, new_chunk, 0); + } + } + } + + if (ret_val == YAFFS_OK) + yaffs_chunk_del(dev, old_chunk, + mark_flash, __LINE__); + + } + } + + yaffs_release_temp_buffer(dev, buffer, __LINE__); + + } + + yaffs_verify_collected_blk(dev, bi, block); + + if (bi->block_state == YAFFS_BLOCK_STATE_COLLECTING) { + /* + * The gc did not complete. Set block state back to FULL + * because checkpointing does not restore gc. + */ + bi->block_state = YAFFS_BLOCK_STATE_FULL; + } else { + /* The gc completed. */ + /* Do any required cleanups */ + for (i = 0; i < dev->n_clean_ups; i++) { + /* Time to delete the file too */ + object = + yaffs_find_by_number(dev, dev->gc_cleanup_list[i]); + if (object) { + yaffs_free_tnode(dev, + object->variant. 
+ file_variant.top); + object->variant.file_variant.top = NULL; + yaffs_trace(YAFFS_TRACE_GC, + "yaffs: About to finally delete object %d", + object->obj_id); + yaffs_generic_obj_del(object); + object->my_dev->n_deleted_files--; + } + + } + + chunks_after = yaffs_get_erased_chunks(dev); + if (chunks_before >= chunks_after) + yaffs_trace(YAFFS_TRACE_GC, + "gc did not increase free chunks before %d after %d", + chunks_before, chunks_after); + dev->gc_block = 0; + dev->gc_chunk = 0; + dev->n_clean_ups = 0; + } + + dev->gc_disable = 0; + + return ret_val; +} + +/* + * FindBlockForgarbageCollection is used to select the dirtiest block (or close enough) + * for garbage collection. + */ + +static unsigned yaffs_find_gc_block(struct yaffs_dev *dev, + int aggressive, int background) +{ + int i; + int iterations; + unsigned selected = 0; + int prioritised = 0; + int prioritised_exist = 0; + struct yaffs_block_info *bi; + int threshold; + + /* First let's see if we need to grab a prioritised block */ + if (dev->has_pending_prioritised_gc && !aggressive) { + dev->gc_dirtiest = 0; + bi = dev->block_info; + for (i = dev->internal_start_block; + i <= dev->internal_end_block && !selected; i++) { + + if (bi->gc_prioritise) { + prioritised_exist = 1; + if (bi->block_state == YAFFS_BLOCK_STATE_FULL && + yaffs_block_ok_for_gc(dev, bi)) { + selected = i; + prioritised = 1; + } + } + bi++; + } + + /* + * If there is a prioritised block and none was selected then + * this happened because there is at least one old dirty block gumming + * up the works. Let's gc the oldest dirty block. + */ + + if (prioritised_exist && + !selected && dev->oldest_dirty_block > 0) + selected = dev->oldest_dirty_block; + + if (!prioritised_exist) /* None found, so we can clear this */ + dev->has_pending_prioritised_gc = 0; + } + + /* If we're doing aggressive GC then we are happy to take a less-dirty block, and + * search harder. + * else (we're doing a leasurely gc), then we only bother to do this if the + * block has only a few pages in use. + */ + + if (!selected) { + int pages_used; + int n_blocks = + dev->internal_end_block - dev->internal_start_block + 1; + if (aggressive) { + threshold = dev->param.chunks_per_block; + iterations = n_blocks; + } else { + int max_threshold; + + if (background) + max_threshold = dev->param.chunks_per_block / 2; + else + max_threshold = dev->param.chunks_per_block / 8; + + if (max_threshold < YAFFS_GC_PASSIVE_THRESHOLD) + max_threshold = YAFFS_GC_PASSIVE_THRESHOLD; + + threshold = background ? 
(dev->gc_not_done + 2) * 2 : 0; + if (threshold < YAFFS_GC_PASSIVE_THRESHOLD) + threshold = YAFFS_GC_PASSIVE_THRESHOLD; + if (threshold > max_threshold) + threshold = max_threshold; + + iterations = n_blocks / 16 + 1; + if (iterations > 100) + iterations = 100; + } + + for (i = 0; + i < iterations && + (dev->gc_dirtiest < 1 || + dev->gc_pages_in_use > YAFFS_GC_GOOD_ENOUGH); i++) { + dev->gc_block_finder++; + if (dev->gc_block_finder < dev->internal_start_block || + dev->gc_block_finder > dev->internal_end_block) + dev->gc_block_finder = + dev->internal_start_block; + + bi = yaffs_get_block_info(dev, dev->gc_block_finder); + + pages_used = bi->pages_in_use - bi->soft_del_pages; + + if (bi->block_state == YAFFS_BLOCK_STATE_FULL && + pages_used < dev->param.chunks_per_block && + (dev->gc_dirtiest < 1 + || pages_used < dev->gc_pages_in_use) + && yaffs_block_ok_for_gc(dev, bi)) { + dev->gc_dirtiest = dev->gc_block_finder; + dev->gc_pages_in_use = pages_used; + } + } + + if (dev->gc_dirtiest > 0 && dev->gc_pages_in_use <= threshold) + selected = dev->gc_dirtiest; + } + + /* + * If nothing has been selected for a while, try selecting the oldest dirty + * because that's gumming up the works. + */ + + if (!selected && dev->param.is_yaffs2 && + dev->gc_not_done >= (background ? 10 : 20)) { + yaffs2_find_oldest_dirty_seq(dev); + if (dev->oldest_dirty_block > 0) { + selected = dev->oldest_dirty_block; + dev->gc_dirtiest = selected; + dev->oldest_dirty_gc_count++; + bi = yaffs_get_block_info(dev, selected); + dev->gc_pages_in_use = + bi->pages_in_use - bi->soft_del_pages; + } else { + dev->gc_not_done = 0; + } + } + + if (selected) { + yaffs_trace(YAFFS_TRACE_GC, + "GC Selected block %d with %d free, prioritised:%d", + selected, + dev->param.chunks_per_block - dev->gc_pages_in_use, + prioritised); + + dev->n_gc_blocks++; + if (background) + dev->bg_gcs++; + + dev->gc_dirtiest = 0; + dev->gc_pages_in_use = 0; + dev->gc_not_done = 0; + if (dev->refresh_skip > 0) + dev->refresh_skip--; + } else { + dev->gc_not_done++; + yaffs_trace(YAFFS_TRACE_GC, + "GC none: finder %d skip %d threshold %d dirtiest %d using %d oldest %d%s", + dev->gc_block_finder, dev->gc_not_done, threshold, + dev->gc_dirtiest, dev->gc_pages_in_use, + dev->oldest_dirty_block, background ? " bg" : ""); + } + + return selected; +} + +/* New garbage collector + * If we're very low on erased blocks then we do aggressive garbage collection + * otherwise we do "leasurely" garbage collection. + * Aggressive gc looks further (whole array) and will accept less dirty blocks. + * Passive gc only inspects smaller areas and will only accept more dirty blocks. + * + * The idea is to help clear out space in a more spread-out manner. + * Dunno if it really does anything useful. + */ +static int yaffs_check_gc(struct yaffs_dev *dev, int background) +{ + int aggressive = 0; + int gc_ok = YAFFS_OK; + int max_tries = 0; + int min_erased; + int erased_chunks; + int checkpt_block_adjust; + + if (dev->param.gc_control && (dev->param.gc_control(dev) & 1) == 0) + return YAFFS_OK; + + if (dev->gc_disable) { + /* Bail out so we don't get recursive gc */ + return YAFFS_OK; + } + + /* This loop should pass the first time. + * We'll only see looping here if the collection does not increase space. 
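 * For example, with n_reserved_blocks = 5 and two blocks currently needed
 * for checkpoint data, min_erased works out to 5 + 2 + 1 = 8, so the pass
 * turns aggressive as soon as fewer than 8 erased blocks remain.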
+ */ + + do { + max_tries++; + + checkpt_block_adjust = yaffs_calc_checkpt_blocks_required(dev); + + min_erased = + dev->param.n_reserved_blocks + checkpt_block_adjust + 1; + erased_chunks = + dev->n_erased_blocks * dev->param.chunks_per_block; + + /* If we need a block soon then do aggressive gc. */ + if (dev->n_erased_blocks < min_erased) + aggressive = 1; + else { + if (!background + && erased_chunks > (dev->n_free_chunks / 4)) + break; + + if (dev->gc_skip > 20) + dev->gc_skip = 20; + if (erased_chunks < dev->n_free_chunks / 2 || + dev->gc_skip < 1 || background) + aggressive = 0; + else { + dev->gc_skip--; + break; + } + } + + dev->gc_skip = 5; + + /* If we don't already have a block being gc'd then see if we should start another */ + + if (dev->gc_block < 1 && !aggressive) { + dev->gc_block = yaffs2_find_refresh_block(dev); + dev->gc_chunk = 0; + dev->n_clean_ups = 0; + } + if (dev->gc_block < 1) { + dev->gc_block = + yaffs_find_gc_block(dev, aggressive, background); + dev->gc_chunk = 0; + dev->n_clean_ups = 0; + } + + if (dev->gc_block > 0) { + dev->all_gcs++; + if (!aggressive) + dev->passive_gc_count++; + + yaffs_trace(YAFFS_TRACE_GC, + "yaffs: GC n_erased_blocks %d aggressive %d", + dev->n_erased_blocks, aggressive); + + gc_ok = yaffs_gc_block(dev, dev->gc_block, aggressive); + } + + if (dev->n_erased_blocks < (dev->param.n_reserved_blocks) + && dev->gc_block > 0) { + yaffs_trace(YAFFS_TRACE_GC, + "yaffs: GC !!!no reclaim!!! n_erased_blocks %d after try %d block %d", + dev->n_erased_blocks, max_tries, + dev->gc_block); + } + } while ((dev->n_erased_blocks < dev->param.n_reserved_blocks) && + (dev->gc_block > 0) && (max_tries < 2)); + + return aggressive ? gc_ok : YAFFS_OK; +} + +/* + * yaffs_bg_gc() + * Garbage collects. Intended to be called from a background thread. + * Returns non-zero if at least half the free chunks are erased. 
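 * For example, on a device with 64 chunks per block, 10 erased blocks and
 * 1000 free chunks: 10 * 64 = 640 erased chunks, which is more than
 * 1000 / 2, so the call returns non-zero and the caller can back off.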
+ */ +int yaffs_bg_gc(struct yaffs_dev *dev, unsigned urgency) +{ + int erased_chunks = dev->n_erased_blocks * dev->param.chunks_per_block; + + yaffs_trace(YAFFS_TRACE_BACKGROUND, "Background gc %u", urgency); + + yaffs_check_gc(dev, 1); + return erased_chunks > dev->n_free_chunks / 2; +} + +/*-------------------- Data file manipulation -----------------*/ + +static int yaffs_rd_data_obj(struct yaffs_obj *in, int inode_chunk, u8 * buffer) +{ + int nand_chunk = yaffs_find_chunk_in_file(in, inode_chunk, NULL); + + if (nand_chunk >= 0) + return yaffs_rd_chunk_tags_nand(in->my_dev, nand_chunk, + buffer, NULL); + else { + yaffs_trace(YAFFS_TRACE_NANDACCESS, + "Chunk %d not found zero instead", + nand_chunk); + /* get sane (zero) data if you read a hole */ + memset(buffer, 0, in->my_dev->data_bytes_per_chunk); + return 0; + } + +} + +void yaffs_chunk_del(struct yaffs_dev *dev, int chunk_id, int mark_flash, + int lyn) +{ + int block; + int page; + struct yaffs_ext_tags tags; + struct yaffs_block_info *bi; + + if (chunk_id <= 0) + return; + + dev->n_deletions++; + block = chunk_id / dev->param.chunks_per_block; + page = chunk_id % dev->param.chunks_per_block; + + if (!yaffs_check_chunk_bit(dev, block, page)) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Deleting invalid chunk %d", chunk_id); + + bi = yaffs_get_block_info(dev, block); + + yaffs2_update_oldest_dirty_seq(dev, block, bi); + + yaffs_trace(YAFFS_TRACE_DELETION, + "line %d delete of chunk %d", + lyn, chunk_id); + + if (!dev->param.is_yaffs2 && mark_flash && + bi->block_state != YAFFS_BLOCK_STATE_COLLECTING) { + + yaffs_init_tags(&tags); + + tags.is_deleted = 1; + + yaffs_wr_chunk_tags_nand(dev, chunk_id, NULL, &tags); + yaffs_handle_chunk_update(dev, chunk_id, &tags); + } else { + dev->n_unmarked_deletions++; + } + + /* Pull out of the management area. + * If the whole block became dirty, this will kick off an erasure. + */ + if (bi->block_state == YAFFS_BLOCK_STATE_ALLOCATING || + bi->block_state == YAFFS_BLOCK_STATE_FULL || + bi->block_state == YAFFS_BLOCK_STATE_NEEDS_SCANNING || + bi->block_state == YAFFS_BLOCK_STATE_COLLECTING) { + dev->n_free_chunks++; + + yaffs_clear_chunk_bit(dev, block, page); + + bi->pages_in_use--; + + if (bi->pages_in_use == 0 && + !bi->has_shrink_hdr && + bi->block_state != YAFFS_BLOCK_STATE_ALLOCATING && + bi->block_state != YAFFS_BLOCK_STATE_NEEDS_SCANNING) { + yaffs_block_became_dirty(dev, block); + } + + } + +} + +static int yaffs_wr_data_obj(struct yaffs_obj *in, int inode_chunk, + const u8 * buffer, int n_bytes, int use_reserve) +{ + /* Find old chunk Need to do this to get serial number + * Write new one and patch into tree. + * Invalidate old tags. + */ + + int prev_chunk_id; + struct yaffs_ext_tags prev_tags; + + int new_chunk_id; + struct yaffs_ext_tags new_tags; + + struct yaffs_dev *dev = in->my_dev; + + yaffs_check_gc(dev, 0); + + /* Get the previous chunk at this location in the file if it exists. + * If it does not exist then put a zero into the tree. This creates + * the tnode now, rather than later when it is harder to clean up. + */ + prev_chunk_id = yaffs_find_chunk_in_file(in, inode_chunk, &prev_tags); + if (prev_chunk_id < 1 && + !yaffs_put_chunk_in_file(in, inode_chunk, 0, 0)) + return 0; + + /* Set up new tags */ + yaffs_init_tags(&new_tags); + + new_tags.chunk_id = inode_chunk; + new_tags.obj_id = in->obj_id; + new_tags.serial_number = + (prev_chunk_id > 0) ? 
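/*
 * Illustrative sketch (not driver code): yaffs_chunk_del() above maps a flat
 * NAND chunk number onto a (block, page) pair with a plain divide/modulo.
 * The same arithmetic, as a standalone round trip:
 *
 *   struct chunk_addr { int block; int page; };
 *
 *   static struct chunk_addr chunk_to_addr(int chunk_id, int chunks_per_block)
 *   {
 *           struct chunk_addr a;
 *           a.block = chunk_id / chunks_per_block;
 *           a.page  = chunk_id % chunks_per_block;
 *           return a;
 *   }
 *
 *   static int addr_to_chunk(struct chunk_addr a, int chunks_per_block)
 *   {
 *           return a.block * chunks_per_block + a.page;
 *   }
 *
 * With 64 chunks per block, chunk 1000 is block 15, page 40.
 */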
prev_tags.serial_number + 1 : 1; + new_tags.n_bytes = n_bytes; + + if (n_bytes < 1 || n_bytes > dev->param.total_bytes_per_chunk) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Writing %d bytes to chunk!!!!!!!!!", + n_bytes); + YBUG(); + } + + new_chunk_id = + yaffs_write_new_chunk(dev, buffer, &new_tags, use_reserve); + + if (new_chunk_id > 0) { + yaffs_put_chunk_in_file(in, inode_chunk, new_chunk_id, 0); + + if (prev_chunk_id > 0) + yaffs_chunk_del(dev, prev_chunk_id, 1, __LINE__); + + yaffs_verify_file_sane(in); + } + return new_chunk_id; + +} + + + +static int yaffs_do_xattrib_mod(struct yaffs_obj *obj, int set, + const YCHAR * name, const void *value, int size, + int flags) +{ + struct yaffs_xattr_mod xmod; + + int result; + + xmod.set = set; + xmod.name = name; + xmod.data = value; + xmod.size = size; + xmod.flags = flags; + xmod.result = -ENOSPC; + + result = yaffs_update_oh(obj, NULL, 0, 0, 0, &xmod); + + if (result > 0) + return xmod.result; + else + return -ENOSPC; +} + +static int yaffs_apply_xattrib_mod(struct yaffs_obj *obj, char *buffer, + struct yaffs_xattr_mod *xmod) +{ + int retval = 0; + int x_offs = sizeof(struct yaffs_obj_hdr); + struct yaffs_dev *dev = obj->my_dev; + int x_size = dev->data_bytes_per_chunk - sizeof(struct yaffs_obj_hdr); + + char *x_buffer = buffer + x_offs; + + if (xmod->set) + retval = + nval_set(x_buffer, x_size, xmod->name, xmod->data, + xmod->size, xmod->flags); + else + retval = nval_del(x_buffer, x_size, xmod->name); + + obj->has_xattr = nval_hasvalues(x_buffer, x_size); + obj->xattr_known = 1; + + xmod->result = retval; + + return retval; +} + +static int yaffs_do_xattrib_fetch(struct yaffs_obj *obj, const YCHAR * name, + void *value, int size) +{ + char *buffer = NULL; + int result; + struct yaffs_ext_tags tags; + struct yaffs_dev *dev = obj->my_dev; + int x_offs = sizeof(struct yaffs_obj_hdr); + int x_size = dev->data_bytes_per_chunk - sizeof(struct yaffs_obj_hdr); + + char *x_buffer; + + int retval = 0; + + if (obj->hdr_chunk < 1) + return -ENODATA; + + /* If we know that the object has no xattribs then don't do all the + * reading and parsing. 
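 * The attributes themselves live in the header chunk, packed as name/value
 * pairs in the space left after the object header: x_offs is
 * sizeof(struct yaffs_obj_hdr) and x_size is data_bytes_per_chunk - x_offs,
 * exactly as computed above. Caching has_xattr/xattr_known therefore saves
 * a NAND read of the header chunk on every lookup against an attribute-free
 * object.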
+ */ + if (obj->xattr_known && !obj->has_xattr) { + if (name) + return -ENODATA; + else + return 0; + } + + buffer = (char *)yaffs_get_temp_buffer(dev, __LINE__); + if (!buffer) + return -ENOMEM; + + result = + yaffs_rd_chunk_tags_nand(dev, obj->hdr_chunk, (u8 *) buffer, &tags); + + if (result != YAFFS_OK) + retval = -ENOENT; + else { + x_buffer = buffer + x_offs; + + if (!obj->xattr_known) { + obj->has_xattr = nval_hasvalues(x_buffer, x_size); + obj->xattr_known = 1; + } + + if (name) + retval = nval_get(x_buffer, x_size, name, value, size); + else + retval = nval_list(x_buffer, x_size, value, size); + } + yaffs_release_temp_buffer(dev, (u8 *) buffer, __LINE__); + return retval; +} + +int yaffs_set_xattrib(struct yaffs_obj *obj, const YCHAR * name, + const void *value, int size, int flags) +{ + return yaffs_do_xattrib_mod(obj, 1, name, value, size, flags); +} + +int yaffs_remove_xattrib(struct yaffs_obj *obj, const YCHAR * name) +{ + return yaffs_do_xattrib_mod(obj, 0, name, NULL, 0, 0); +} + +int yaffs_get_xattrib(struct yaffs_obj *obj, const YCHAR * name, void *value, + int size) +{ + return yaffs_do_xattrib_fetch(obj, name, value, size); +} + +int yaffs_list_xattrib(struct yaffs_obj *obj, char *buffer, int size) +{ + return yaffs_do_xattrib_fetch(obj, NULL, buffer, size); +} + +static void yaffs_check_obj_details_loaded(struct yaffs_obj *in) +{ + u8 *chunk_data; + struct yaffs_obj_hdr *oh; + struct yaffs_dev *dev; + struct yaffs_ext_tags tags; + int result; + int alloc_failed = 0; + + if (!in) + return; + + dev = in->my_dev; + + if (in->lazy_loaded && in->hdr_chunk > 0) { + in->lazy_loaded = 0; + chunk_data = yaffs_get_temp_buffer(dev, __LINE__); + + result = + yaffs_rd_chunk_tags_nand(dev, in->hdr_chunk, chunk_data, + &tags); + oh = (struct yaffs_obj_hdr *)chunk_data; + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + yaffs_set_obj_name_from_oh(in, oh); + + if (in->variant_type == YAFFS_OBJECT_TYPE_SYMLINK) { + in->variant.symlink_variant.alias = + yaffs_clone_str(oh->alias); + if (!in->variant.symlink_variant.alias) + alloc_failed = 1; /* Not returned to caller */ + } + + yaffs_release_temp_buffer(dev, chunk_data, __LINE__); + } +} + +static void yaffs_load_name_from_oh(struct yaffs_dev *dev, YCHAR * name, + const YCHAR * oh_name, int buff_size) +{ +#ifdef CONFIG_YAFFS_AUTO_UNICODE + if (dev->param.auto_unicode) { + if (*oh_name) { + /* It is an ASCII name, do an ASCII to + * unicode conversion */ + const char *ascii_oh_name = (const char *)oh_name; + int n = buff_size - 1; + while (n > 0 && *ascii_oh_name) { + *name = *ascii_oh_name; + name++; + ascii_oh_name++; + n--; + } + } else { + strncpy(name, oh_name + 1, buff_size - 1); + } + } else { +#else + { +#endif + strncpy(name, oh_name, buff_size - 1); + } +} + +static void yaffs_load_oh_from_name(struct yaffs_dev *dev, YCHAR * oh_name, + const YCHAR * name) +{ +#ifdef CONFIG_YAFFS_AUTO_UNICODE + + int is_ascii; + YCHAR *w; + + if (dev->param.auto_unicode) { + + is_ascii = 1; + w = name; + + /* Figure out if the name will fit in ascii character set */ + while (is_ascii && *w) { + if ((*w) & 0xff00) + is_ascii = 0; + w++; + } + + if (is_ascii) { + /* It is an ASCII name, so do a unicode to ascii conversion */ + char *ascii_oh_name = (char *)oh_name; + int n = YAFFS_MAX_NAME_LENGTH - 1; + while (n > 0 && *name) { + *ascii_oh_name = *name; + name++; + ascii_oh_name++; + n--; + } + } else { + /* It is a unicode name, so save starting at the second YCHAR */ + *oh_name = 0; + strncpy(oh_name + 1, name, + YAFFS_MAX_NAME_LENGTH 
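/*
 * Illustrative sketch (not driver code): the "does this name fit in ASCII"
 * test used just above for auto-unicode storage, as a standalone helper.
 * YCHAR is assumed to be a 16-bit type here; any character with bits set
 * above 0xff forces the name to be stored as unicode from oh_name[1],
 * otherwise it is narrowed to ASCII starting at byte zero.
 *
 *   static int name_fits_ascii(const unsigned short *name)
 *   {
 *           while (*name) {
 *                   if (*name & 0xff00)
 *                           return 0;
 *                   name++;
 *           }
 *           return 1;
 *   }
 */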
- 2); + } + } else { +#else + { +#endif + strncpy(oh_name, name, YAFFS_MAX_NAME_LENGTH - 1); + } + +} + +/* UpdateObjectHeader updates the header on NAND for an object. + * If name is not NULL, then that new name is used. + */ +int yaffs_update_oh(struct yaffs_obj *in, const YCHAR * name, int force, + int is_shrink, int shadows, struct yaffs_xattr_mod *xmod) +{ + + struct yaffs_block_info *bi; + + struct yaffs_dev *dev = in->my_dev; + + int prev_chunk_id; + int ret_val = 0; + int result = 0; + + int new_chunk_id; + struct yaffs_ext_tags new_tags; + struct yaffs_ext_tags old_tags; + const YCHAR *alias = NULL; + + u8 *buffer = NULL; + YCHAR old_name[YAFFS_MAX_NAME_LENGTH + 1]; + + struct yaffs_obj_hdr *oh = NULL; + + strcpy(old_name, _Y("silly old name")); + + if (!in->fake || in == dev->root_dir || + force || xmod) { + + yaffs_check_gc(dev, 0); + yaffs_check_obj_details_loaded(in); + + buffer = yaffs_get_temp_buffer(in->my_dev, __LINE__); + oh = (struct yaffs_obj_hdr *)buffer; + + prev_chunk_id = in->hdr_chunk; + + if (prev_chunk_id > 0) { + result = yaffs_rd_chunk_tags_nand(dev, prev_chunk_id, + buffer, &old_tags); + + yaffs_verify_oh(in, oh, &old_tags, 0); + + memcpy(old_name, oh->name, sizeof(oh->name)); + memset(buffer, 0xFF, sizeof(struct yaffs_obj_hdr)); + } else { + memset(buffer, 0xFF, dev->data_bytes_per_chunk); + } + + oh->type = in->variant_type; + oh->yst_mode = in->yst_mode; + oh->shadows_obj = oh->inband_shadowed_obj_id = shadows; + + yaffs_load_attribs_oh(oh, in); + + if (in->parent) + oh->parent_obj_id = in->parent->obj_id; + else + oh->parent_obj_id = 0; + + if (name && *name) { + memset(oh->name, 0, sizeof(oh->name)); + yaffs_load_oh_from_name(dev, oh->name, name); + } else if (prev_chunk_id > 0) { + memcpy(oh->name, old_name, sizeof(oh->name)); + } else { + memset(oh->name, 0, sizeof(oh->name)); + } + + oh->is_shrink = is_shrink; + + switch (in->variant_type) { + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* Should not happen */ + break; + case YAFFS_OBJECT_TYPE_FILE: + oh->file_size = + (oh->parent_obj_id == YAFFS_OBJECTID_DELETED + || oh->parent_obj_id == + YAFFS_OBJECTID_UNLINKED) ? 0 : in-> + variant.file_variant.file_size; + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + oh->equiv_id = in->variant.hardlink_variant.equiv_id; + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + alias = in->variant.symlink_variant.alias; + if (!alias) + alias = _Y("no alias"); + strncpy(oh->alias, alias, YAFFS_MAX_ALIAS_LENGTH); + oh->alias[YAFFS_MAX_ALIAS_LENGTH] = 0; + break; + } + + /* process any xattrib modifications */ + if (xmod) + yaffs_apply_xattrib_mod(in, (char *)buffer, xmod); + + /* Tags */ + yaffs_init_tags(&new_tags); + in->serial++; + new_tags.chunk_id = 0; + new_tags.obj_id = in->obj_id; + new_tags.serial_number = in->serial; + + /* Add extra info for file header */ + + new_tags.extra_available = 1; + new_tags.extra_parent_id = oh->parent_obj_id; + new_tags.extra_length = oh->file_size; + new_tags.extra_is_shrink = oh->is_shrink; + new_tags.extra_equiv_id = oh->equiv_id; + new_tags.extra_shadows = (oh->shadows_obj > 0) ? 1 : 0; + new_tags.extra_obj_type = in->variant_type; + + yaffs_verify_oh(in, oh, &new_tags, 1); + + /* Create new chunk in NAND */ + new_chunk_id = + yaffs_write_new_chunk(dev, buffer, &new_tags, + (prev_chunk_id > 0) ? 
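/*
 * Illustrative sketch (not driver code) of the rewrite sequence this call
 * belongs to. The NAND reserve is only dipped into when an old header
 * already exists, because deleting that old chunk straight afterwards hands
 * a chunk back:
 *
 *   use_reserve = (prev_chunk_id > 0);
 *   new_chunk = write the rebuilt header plus its tags to a fresh chunk;
 *   if (new_chunk >= 0) {
 *           record new_chunk as the object's header chunk;
 *           if (prev_chunk_id > 0)
 *                   delete the old header chunk;
 *   }
 */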
1 : 0); + + if (new_chunk_id >= 0) { + + in->hdr_chunk = new_chunk_id; + + if (prev_chunk_id > 0) { + yaffs_chunk_del(dev, prev_chunk_id, 1, + __LINE__); + } + + if (!yaffs_obj_cache_dirty(in)) + in->dirty = 0; + + /* If this was a shrink, then mark the block that the chunk lives on */ + if (is_shrink) { + bi = yaffs_get_block_info(in->my_dev, + new_chunk_id / + in->my_dev->param. + chunks_per_block); + bi->has_shrink_hdr = 1; + } + + } + + ret_val = new_chunk_id; + + } + + if (buffer) + yaffs_release_temp_buffer(dev, buffer, __LINE__); + + return ret_val; +} + +/*--------------------- File read/write ------------------------ + * Read and write have very similar structures. + * In general the read/write has three parts to it + * An incomplete chunk to start with (if the read/write is not chunk-aligned) + * Some complete chunks + * An incomplete chunk to end off with + * + * Curve-balls: the first chunk might also be the last chunk. + */ + +int yaffs_file_rd(struct yaffs_obj *in, u8 * buffer, loff_t offset, int n_bytes) +{ + + int chunk; + u32 start; + int n_copy; + int n = n_bytes; + int n_done = 0; + struct yaffs_cache *cache; + + struct yaffs_dev *dev; + + dev = in->my_dev; + + while (n > 0) { + /* chunk = offset / dev->data_bytes_per_chunk + 1; */ + /* start = offset % dev->data_bytes_per_chunk; */ + yaffs_addr_to_chunk(dev, offset, &chunk, &start); + chunk++; + + /* OK now check for the curveball where the start and end are in + * the same chunk. + */ + if ((start + n) < dev->data_bytes_per_chunk) + n_copy = n; + else + n_copy = dev->data_bytes_per_chunk - start; + + cache = yaffs_find_chunk_cache(in, chunk); + + /* If the chunk is already in the cache or it is less than a whole chunk + * or we're using inband tags then use the cache (if there is caching) + * else bypass the cache. + */ + if (cache || n_copy != dev->data_bytes_per_chunk + || dev->param.inband_tags) { + if (dev->param.n_caches > 0) { + + /* If we can't find the data in the cache, then load it up. */ + + if (!cache) { + cache = + yaffs_grab_chunk_cache(in->my_dev); + cache->object = in; + cache->chunk_id = chunk; + cache->dirty = 0; + cache->locked = 0; + yaffs_rd_data_obj(in, chunk, + cache->data); + cache->n_bytes = 0; + } + + yaffs_use_cache(dev, cache, 0); + + cache->locked = 1; + + memcpy(buffer, &cache->data[start], n_copy); + + cache->locked = 0; + } else { + /* Read into the local buffer then copy.. */ + + u8 *local_buffer = + yaffs_get_temp_buffer(dev, __LINE__); + yaffs_rd_data_obj(in, chunk, local_buffer); + + memcpy(buffer, &local_buffer[start], n_copy); + + yaffs_release_temp_buffer(dev, local_buffer, + __LINE__); + } + + } else { + + /* A full chunk. Read directly into the supplied buffer. 
*/ + yaffs_rd_data_obj(in, chunk, buffer); + + } + + n -= n_copy; + offset += n_copy; + buffer += n_copy; + n_done += n_copy; + + } + + return n_done; +} + +int yaffs_do_file_wr(struct yaffs_obj *in, const u8 * buffer, loff_t offset, + int n_bytes, int write_trhrough) +{ + + int chunk; + u32 start; + int n_copy; + int n = n_bytes; + int n_done = 0; + int n_writeback; + int start_write = offset; + int chunk_written = 0; + u32 n_bytes_read; + u32 chunk_start; + + struct yaffs_dev *dev; + + dev = in->my_dev; + + while (n > 0 && chunk_written >= 0) { + yaffs_addr_to_chunk(dev, offset, &chunk, &start); + + if (chunk * dev->data_bytes_per_chunk + start != offset || + start >= dev->data_bytes_per_chunk) { + yaffs_trace(YAFFS_TRACE_ERROR, + "AddrToChunk of offset %d gives chunk %d start %d", + (int)offset, chunk, start); + } + chunk++; /* File pos to chunk in file offset */ + + /* OK now check for the curveball where the start and end are in + * the same chunk. + */ + + if ((start + n) < dev->data_bytes_per_chunk) { + n_copy = n; + + /* Now folks, to calculate how many bytes to write back.... + * If we're overwriting and not writing to then end of file then + * we need to write back as much as was there before. + */ + + chunk_start = ((chunk - 1) * dev->data_bytes_per_chunk); + + if (chunk_start > in->variant.file_variant.file_size) + n_bytes_read = 0; /* Past end of file */ + else + n_bytes_read = + in->variant.file_variant.file_size - + chunk_start; + + if (n_bytes_read > dev->data_bytes_per_chunk) + n_bytes_read = dev->data_bytes_per_chunk; + + n_writeback = + (n_bytes_read > + (start + n)) ? n_bytes_read : (start + n); + + if (n_writeback < 0 + || n_writeback > dev->data_bytes_per_chunk) + YBUG(); + + } else { + n_copy = dev->data_bytes_per_chunk - start; + n_writeback = dev->data_bytes_per_chunk; + } + + if (n_copy != dev->data_bytes_per_chunk + || dev->param.inband_tags) { + /* An incomplete start or end chunk (or maybe both start and end chunk), + * or we're using inband tags, so we want to use the cache buffers. + */ + if (dev->param.n_caches > 0) { + struct yaffs_cache *cache; + /* If we can't find the data in the cache, then load the cache */ + cache = yaffs_find_chunk_cache(in, chunk); + + if (!cache + && yaffs_check_alloc_available(dev, 1)) { + cache = yaffs_grab_chunk_cache(dev); + cache->object = in; + cache->chunk_id = chunk; + cache->dirty = 0; + cache->locked = 0; + yaffs_rd_data_obj(in, chunk, + cache->data); + } else if (cache && + !cache->dirty && + !yaffs_check_alloc_available(dev, + 1)) { + /* Drop the cache if it was a read cache item and + * no space check has been made for it. + */ + cache = NULL; + } + + if (cache) { + yaffs_use_cache(dev, cache, 1); + cache->locked = 1; + + memcpy(&cache->data[start], buffer, + n_copy); + + cache->locked = 0; + cache->n_bytes = n_writeback; + + if (write_trhrough) { + chunk_written = + yaffs_wr_data_obj + (cache->object, + cache->chunk_id, + cache->data, + cache->n_bytes, 1); + cache->dirty = 0; + } + + } else { + chunk_written = -1; /* fail the write */ + } + } else { + /* An incomplete start or end chunk (or maybe both start and end chunk) + * Read into the local buffer then copy, then copy over and write back. 
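 * For example, with 2048-byte chunks, overwriting 50 bytes at offset 100
 * within a chunk that already holds 500 bytes of file data gives
 * n_bytes_read = 500 and start + n = 150, so the write-back size worked out
 * earlier (n_writeback) is 500 and the untouched tail of the old data is
 * written back along with the new bytes.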
+ */ + + u8 *local_buffer = + yaffs_get_temp_buffer(dev, __LINE__); + + yaffs_rd_data_obj(in, chunk, local_buffer); + + memcpy(&local_buffer[start], buffer, n_copy); + + chunk_written = + yaffs_wr_data_obj(in, chunk, + local_buffer, + n_writeback, 0); + + yaffs_release_temp_buffer(dev, local_buffer, + __LINE__); + + } + + } else { + /* A full chunk. Write directly from the supplied buffer. */ + + chunk_written = + yaffs_wr_data_obj(in, chunk, buffer, + dev->data_bytes_per_chunk, 0); + + /* Since we've overwritten the cached data, we better invalidate it. */ + yaffs_invalidate_chunk_cache(in, chunk); + } + + if (chunk_written >= 0) { + n -= n_copy; + offset += n_copy; + buffer += n_copy; + n_done += n_copy; + } + + } + + /* Update file object */ + + if ((start_write + n_done) > in->variant.file_variant.file_size) + in->variant.file_variant.file_size = (start_write + n_done); + + in->dirty = 1; + + return n_done; +} + +int yaffs_wr_file(struct yaffs_obj *in, const u8 * buffer, loff_t offset, + int n_bytes, int write_trhrough) +{ + yaffs2_handle_hole(in, offset); + return yaffs_do_file_wr(in, buffer, offset, n_bytes, write_trhrough); +} + +/* ---------------------- File resizing stuff ------------------ */ + +static void yaffs_prune_chunks(struct yaffs_obj *in, int new_size) +{ + + struct yaffs_dev *dev = in->my_dev; + int old_size = in->variant.file_variant.file_size; + + int last_del = 1 + (old_size - 1) / dev->data_bytes_per_chunk; + + int start_del = 1 + (new_size + dev->data_bytes_per_chunk - 1) / + dev->data_bytes_per_chunk; + int i; + int chunk_id; + + /* Delete backwards so that we don't end up with holes if + * power is lost part-way through the operation. + */ + for (i = last_del; i >= start_del; i--) { + /* NB this could be optimised somewhat, + * eg. 
could retrieve the tags and write them without + * using yaffs_chunk_del + */ + + chunk_id = yaffs_find_del_file_chunk(in, i, NULL); + if (chunk_id > 0) { + if (chunk_id < + (dev->internal_start_block * + dev->param.chunks_per_block) + || chunk_id >= + ((dev->internal_end_block + + 1) * dev->param.chunks_per_block)) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Found daft chunk_id %d for %d", + chunk_id, i); + } else { + in->n_data_chunks--; + yaffs_chunk_del(dev, chunk_id, 1, __LINE__); + } + } + } + +} + +void yaffs_resize_file_down(struct yaffs_obj *obj, loff_t new_size) +{ + int new_full; + u32 new_partial; + struct yaffs_dev *dev = obj->my_dev; + + yaffs_addr_to_chunk(dev, new_size, &new_full, &new_partial); + + yaffs_prune_chunks(obj, new_size); + + if (new_partial != 0) { + int last_chunk = 1 + new_full; + u8 *local_buffer = yaffs_get_temp_buffer(dev, __LINE__); + + /* Rewrite the last chunk with its new size and zero pad */ + yaffs_rd_data_obj(obj, last_chunk, local_buffer); + memset(local_buffer + new_partial, 0, + dev->data_bytes_per_chunk - new_partial); + + yaffs_wr_data_obj(obj, last_chunk, local_buffer, + new_partial, 1); + + yaffs_release_temp_buffer(dev, local_buffer, __LINE__); + } + + obj->variant.file_variant.file_size = new_size; + + yaffs_prune_tree(dev, &obj->variant.file_variant); +} + +int yaffs_resize_file(struct yaffs_obj *in, loff_t new_size) +{ + struct yaffs_dev *dev = in->my_dev; + int old_size = in->variant.file_variant.file_size; + + yaffs_flush_file_cache(in); + yaffs_invalidate_whole_cache(in); + + yaffs_check_gc(dev, 0); + + if (in->variant_type != YAFFS_OBJECT_TYPE_FILE) + return YAFFS_FAIL; + + if (new_size == old_size) + return YAFFS_OK; + + if (new_size > old_size) { + yaffs2_handle_hole(in, new_size); + in->variant.file_variant.file_size = new_size; + } else { + /* new_size < old_size */ + yaffs_resize_file_down(in, new_size); + } + + /* Write a new object header to reflect the resize. + * show we've shrunk the file, if need be + * Do this only if the file is not in the deleted directories + * and is not shadowed. + */ + if (in->parent && + !in->is_shadowed && + in->parent->obj_id != YAFFS_OBJECTID_UNLINKED && + in->parent->obj_id != YAFFS_OBJECTID_DELETED) + yaffs_update_oh(in, NULL, 0, 0, 0, NULL); + + return YAFFS_OK; +} + +int yaffs_flush_file(struct yaffs_obj *in, int update_time, int data_sync) +{ + int ret_val; + if (in->dirty) { + yaffs_flush_file_cache(in); + if (data_sync) /* Only sync data */ + ret_val = YAFFS_OK; + else { + if (update_time) + yaffs_load_current_time(in, 0, 0); + + ret_val = (yaffs_update_oh(in, NULL, 0, 0, 0, NULL) >= + 0) ? YAFFS_OK : YAFFS_FAIL; + } + } else { + ret_val = YAFFS_OK; + } + + return ret_val; + +} + + +/* yaffs_del_file deletes the whole file data + * and the inode associated with the file. + * It does not delete the links associated with the file. 
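 * A rough sketch of the decision made below (not driver code):
 *
 *   if the file has no data chunks left   -> free its tnode tree and delete
 *                                            the object immediately
 *   else if it is not yet unlinked        -> rename it into the deleted or
 *                                            unlinked pseudo-directory first
 *   else                                  -> mark it deleted and soft-delete
 *                                            its chunks, leaving gc to
 *                                            reclaim the space lazily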
+ */ +static int yaffs_unlink_file_if_needed(struct yaffs_obj *in) +{ + + int ret_val; + int del_now = 0; + struct yaffs_dev *dev = in->my_dev; + + if (!in->my_inode) + del_now = 1; + + if (del_now) { + ret_val = + yaffs_change_obj_name(in, in->my_dev->del_dir, + _Y("deleted"), 0, 0); + yaffs_trace(YAFFS_TRACE_TRACING, + "yaffs: immediate deletion of file %d", + in->obj_id); + in->deleted = 1; + in->my_dev->n_deleted_files++; + if (dev->param.disable_soft_del || dev->param.is_yaffs2) + yaffs_resize_file(in, 0); + yaffs_soft_del_file(in); + } else { + ret_val = + yaffs_change_obj_name(in, in->my_dev->unlinked_dir, + _Y("unlinked"), 0, 0); + } + + return ret_val; +} + +int yaffs_del_file(struct yaffs_obj *in) +{ + int ret_val = YAFFS_OK; + int deleted; /* Need to cache value on stack if in is freed */ + struct yaffs_dev *dev = in->my_dev; + + if (dev->param.disable_soft_del || dev->param.is_yaffs2) + yaffs_resize_file(in, 0); + + if (in->n_data_chunks > 0) { + /* Use soft deletion if there is data in the file. + * That won't be the case if it has been resized to zero. + */ + if (!in->unlinked) + ret_val = yaffs_unlink_file_if_needed(in); + + deleted = in->deleted; + + if (ret_val == YAFFS_OK && in->unlinked && !in->deleted) { + in->deleted = 1; + deleted = 1; + in->my_dev->n_deleted_files++; + yaffs_soft_del_file(in); + } + return deleted ? YAFFS_OK : YAFFS_FAIL; + } else { + /* The file has no data chunks so we toss it immediately */ + yaffs_free_tnode(in->my_dev, in->variant.file_variant.top); + in->variant.file_variant.top = NULL; + yaffs_generic_obj_del(in); + + return YAFFS_OK; + } +} + +int yaffs_is_non_empty_dir(struct yaffs_obj *obj) +{ + return (obj && + obj->variant_type == YAFFS_OBJECT_TYPE_DIRECTORY) && + !(list_empty(&obj->variant.dir_variant.children)); +} + +static int yaffs_del_dir(struct yaffs_obj *obj) +{ + /* First check that the directory is empty. */ + if (yaffs_is_non_empty_dir(obj)) + return YAFFS_FAIL; + + return yaffs_generic_obj_del(obj); +} + +static int yaffs_del_symlink(struct yaffs_obj *in) +{ + if (in->variant.symlink_variant.alias) + kfree(in->variant.symlink_variant.alias); + in->variant.symlink_variant.alias = NULL; + + return yaffs_generic_obj_del(in); +} + +static int yaffs_del_link(struct yaffs_obj *in) +{ + /* remove this hardlink from the list assocaited with the equivalent + * object + */ + list_del_init(&in->hard_links); + return yaffs_generic_obj_del(in); +} + +int yaffs_del_obj(struct yaffs_obj *obj) +{ + int ret_val = -1; + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + ret_val = yaffs_del_file(obj); + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + if (!list_empty(&obj->variant.dir_variant.dirty)) { + yaffs_trace(YAFFS_TRACE_BACKGROUND, + "Remove object %d from dirty directories", + obj->obj_id); + list_del_init(&obj->variant.dir_variant.dirty); + } + return yaffs_del_dir(obj); + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + ret_val = yaffs_del_symlink(obj); + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + ret_val = yaffs_del_link(obj); + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + ret_val = yaffs_generic_obj_del(obj); + break; + case YAFFS_OBJECT_TYPE_UNKNOWN: + ret_val = 0; + break; /* should not happen. 
*/ + } + + return ret_val; +} + +static int yaffs_unlink_worker(struct yaffs_obj *obj) +{ + + int del_now = 0; + + if (!obj->my_inode) + del_now = 1; + + if (obj) + yaffs_update_parent(obj->parent); + + if (obj->variant_type == YAFFS_OBJECT_TYPE_HARDLINK) { + return yaffs_del_link(obj); + } else if (!list_empty(&obj->hard_links)) { + /* Curve ball: We're unlinking an object that has a hardlink. + * + * This problem arises because we are not strictly following + * The Linux link/inode model. + * + * We can't really delete the object. + * Instead, we do the following: + * - Select a hardlink. + * - Unhook it from the hard links + * - Move it from its parent directory (so that the rename can work) + * - Rename the object to the hardlink's name. + * - Delete the hardlink + */ + + struct yaffs_obj *hl; + struct yaffs_obj *parent; + int ret_val; + YCHAR name[YAFFS_MAX_NAME_LENGTH + 1]; + + hl = list_entry(obj->hard_links.next, struct yaffs_obj, + hard_links); + + yaffs_get_obj_name(hl, name, YAFFS_MAX_NAME_LENGTH + 1); + parent = hl->parent; + + list_del_init(&hl->hard_links); + + yaffs_add_obj_to_dir(obj->my_dev->unlinked_dir, hl); + + ret_val = yaffs_change_obj_name(obj, parent, name, 0, 0); + + if (ret_val == YAFFS_OK) + ret_val = yaffs_generic_obj_del(hl); + + return ret_val; + + } else if (del_now) { + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + return yaffs_del_file(obj); + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + list_del_init(&obj->variant.dir_variant.dirty); + return yaffs_del_dir(obj); + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + return yaffs_del_symlink(obj); + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + return yaffs_generic_obj_del(obj); + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + case YAFFS_OBJECT_TYPE_UNKNOWN: + default: + return YAFFS_FAIL; + } + } else if (yaffs_is_non_empty_dir(obj)) { + return YAFFS_FAIL; + } else { + return yaffs_change_obj_name(obj, obj->my_dev->unlinked_dir, + _Y("unlinked"), 0, 0); + } +} + +static int yaffs_unlink_obj(struct yaffs_obj *obj) +{ + + if (obj && obj->unlink_allowed) + return yaffs_unlink_worker(obj); + + return YAFFS_FAIL; + +} + +int yaffs_unlinker(struct yaffs_obj *dir, const YCHAR * name) +{ + struct yaffs_obj *obj; + + obj = yaffs_find_by_name(dir, name); + return yaffs_unlink_obj(obj); +} + +/* Note: + * If old_name is NULL then we take old_dir as the object to be renamed. + */ +int yaffs_rename_obj(struct yaffs_obj *old_dir, const YCHAR * old_name, + struct yaffs_obj *new_dir, const YCHAR * new_name) +{ + struct yaffs_obj *obj = NULL; + struct yaffs_obj *existing_target = NULL; + int force = 0; + int result; + struct yaffs_dev *dev; + + if (!old_dir || old_dir->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) + YBUG(); + if (!new_dir || new_dir->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) + YBUG(); + + dev = old_dir->my_dev; + +#ifdef CONFIG_YAFFS_CASE_INSENSITIVE + /* Special case for case insemsitive systems. + * While look-up is case insensitive, the name isn't. 
+ * Therefore we might want to change x.txt to X.txt + */ + if (old_dir == new_dir && + old_name && new_name && + strcmp(old_name, new_name) == 0) + force = 1; +#endif + + if (strnlen(new_name, YAFFS_MAX_NAME_LENGTH + 1) > + YAFFS_MAX_NAME_LENGTH) + /* ENAMETOOLONG */ + return YAFFS_FAIL; + + if(old_name) + obj = yaffs_find_by_name(old_dir, old_name); + else{ + obj = old_dir; + old_dir = obj->parent; + } + + + if (obj && obj->rename_allowed) { + + /* Now do the handling for an existing target, if there is one */ + + existing_target = yaffs_find_by_name(new_dir, new_name); + if (yaffs_is_non_empty_dir(existing_target)){ + return YAFFS_FAIL; /* ENOTEMPTY */ + } else if (existing_target && existing_target != obj) { + /* Nuke the target first, using shadowing, + * but only if it isn't the same object. + * + * Note we must disable gc otherwise it can mess up the shadowing. + * + */ + dev->gc_disable = 1; + yaffs_change_obj_name(obj, new_dir, new_name, force, + existing_target->obj_id); + existing_target->is_shadowed = 1; + yaffs_unlink_obj(existing_target); + dev->gc_disable = 0; + } + + result = yaffs_change_obj_name(obj, new_dir, new_name, 1, 0); + + yaffs_update_parent(old_dir); + if (new_dir != old_dir) + yaffs_update_parent(new_dir); + + return result; + } + return YAFFS_FAIL; +} + +/*----------------------- Initialisation Scanning ---------------------- */ + +void yaffs_handle_shadowed_obj(struct yaffs_dev *dev, int obj_id, + int backward_scanning) +{ + struct yaffs_obj *obj; + + if (!backward_scanning) { + /* Handle YAFFS1 forward scanning case + * For YAFFS1 we always do the deletion + */ + + } else { + /* Handle YAFFS2 case (backward scanning) + * If the shadowed object exists then ignore. + */ + obj = yaffs_find_by_number(dev, obj_id); + if (obj) + return; + } + + /* Let's create it (if it does not exist) assuming it is a file so that it can do shrinking etc. + * We put it in unlinked dir to be cleaned up after the scanning + */ + obj = + yaffs_find_or_create_by_number(dev, obj_id, YAFFS_OBJECT_TYPE_FILE); + if (!obj) + return; + obj->is_shadowed = 1; + yaffs_add_obj_to_dir(dev->unlinked_dir, obj); + obj->variant.file_variant.shrink_size = 0; + obj->valid = 1; /* So that we don't read any other info for this file */ + +} + +void yaffs_link_fixup(struct yaffs_dev *dev, struct yaffs_obj *hard_list) +{ + struct yaffs_obj *hl; + struct yaffs_obj *in; + + while (hard_list) { + hl = hard_list; + hard_list = (struct yaffs_obj *)(hard_list->hard_links.next); + + in = yaffs_find_by_number(dev, + hl->variant. + hardlink_variant.equiv_id); + + if (in) { + /* Add the hardlink pointers */ + hl->variant.hardlink_variant.equiv_obj = in; + list_add(&hl->hard_links, &in->hard_links); + } else { + /* Todo Need to report/handle this better. + * Got a problem... hardlink to a non-existant object + */ + hl->variant.hardlink_variant.equiv_obj = NULL; + INIT_LIST_HEAD(&hl->hard_links); + + } + } +} + +static void yaffs_strip_deleted_objs(struct yaffs_dev *dev) +{ + /* + * Sort out state of unlinked and deleted objects after scanning. 
+ */ + struct list_head *i; + struct list_head *n; + struct yaffs_obj *l; + + if (dev->read_only) + return; + + /* Soft delete all the unlinked files */ + list_for_each_safe(i, n, + &dev->unlinked_dir->variant.dir_variant.children) { + if (i) { + l = list_entry(i, struct yaffs_obj, siblings); + yaffs_del_obj(l); + } + } + + list_for_each_safe(i, n, &dev->del_dir->variant.dir_variant.children) { + if (i) { + l = list_entry(i, struct yaffs_obj, siblings); + yaffs_del_obj(l); + } + } + +} + +/* + * This code iterates through all the objects making sure that they are rooted. + * Any unrooted objects are re-rooted in lost+found. + * An object needs to be in one of: + * - Directly under deleted, unlinked + * - Directly or indirectly under root. + * + * Note: + * This code assumes that we don't ever change the current relationships between + * directories: + * root_dir->parent == unlinked_dir->parent == del_dir->parent == NULL + * lost-n-found->parent == root_dir + * + * This fixes the problem where directories might have inadvertently been deleted + * leaving the object "hanging" without being rooted in the directory tree. + */ + +static int yaffs_has_null_parent(struct yaffs_dev *dev, struct yaffs_obj *obj) +{ + return (obj == dev->del_dir || + obj == dev->unlinked_dir || obj == dev->root_dir); +} + +static void yaffs_fix_hanging_objs(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + struct yaffs_obj *parent; + int i; + struct list_head *lh; + struct list_head *n; + int depth_limit; + int hanging; + + if (dev->read_only) + return; + + /* Iterate through the objects in each hash entry, + * looking at each object. + * Make sure it is rooted. + */ + + for (i = 0; i < YAFFS_NOBJECT_BUCKETS; i++) { + list_for_each_safe(lh, n, &dev->obj_bucket[i].list) { + if (lh) { + obj = + list_entry(lh, struct yaffs_obj, hash_link); + parent = obj->parent; + + if (yaffs_has_null_parent(dev, obj)) { + /* These directories are not hanging */ + hanging = 0; + } else if (!parent + || parent->variant_type != + YAFFS_OBJECT_TYPE_DIRECTORY) { + hanging = 1; + } else if (yaffs_has_null_parent(dev, parent)) { + hanging = 0; + } else { + /* + * Need to follow the parent chain to see if it is hanging. + */ + hanging = 0; + depth_limit = 100; + + while (parent != dev->root_dir && + parent->parent && + parent->parent->variant_type == + YAFFS_OBJECT_TYPE_DIRECTORY + && depth_limit > 0) { + parent = parent->parent; + depth_limit--; + } + if (parent != dev->root_dir) + hanging = 1; + } + if (hanging) { + yaffs_trace(YAFFS_TRACE_SCAN, + "Hanging object %d moved to lost and found", + obj->obj_id); + yaffs_add_obj_to_dir(dev->lost_n_found, + obj); + } + } + } + } +} + +/* + * Delete directory contents for cleaning up lost and found. + */ +static void yaffs_del_dir_contents(struct yaffs_obj *dir) +{ + struct yaffs_obj *obj; + struct list_head *lh; + struct list_head *n; + + if (dir->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) + YBUG(); + + list_for_each_safe(lh, n, &dir->variant.dir_variant.children) { + if (lh) { + obj = list_entry(lh, struct yaffs_obj, siblings); + if (obj->variant_type == YAFFS_OBJECT_TYPE_DIRECTORY) + yaffs_del_dir_contents(obj); + + yaffs_trace(YAFFS_TRACE_SCAN, + "Deleting lost_found object %d", + obj->obj_id); + + /* Need to use UnlinkObject since Delete would not handle + * hardlinked objects correctly. 
+ */ + yaffs_unlink_obj(obj); + } + } + +} + +static void yaffs_empty_l_n_f(struct yaffs_dev *dev) +{ + yaffs_del_dir_contents(dev->lost_n_found); +} + + +struct yaffs_obj *yaffs_find_by_name(struct yaffs_obj *directory, + const YCHAR * name) +{ + int sum; + + struct list_head *i; + YCHAR buffer[YAFFS_MAX_NAME_LENGTH + 1]; + + struct yaffs_obj *l; + + if (!name) + return NULL; + + if (!directory) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: yaffs_find_by_name: null pointer directory" + ); + YBUG(); + return NULL; + } + if (directory->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "tragedy: yaffs_find_by_name: non-directory" + ); + YBUG(); + } + + sum = yaffs_calc_name_sum(name); + + list_for_each(i, &directory->variant.dir_variant.children) { + if (i) { + l = list_entry(i, struct yaffs_obj, siblings); + + if (l->parent != directory) + YBUG(); + + yaffs_check_obj_details_loaded(l); + + /* Special case for lost-n-found */ + if (l->obj_id == YAFFS_OBJECTID_LOSTNFOUND) { + if (!strcmp(name, YAFFS_LOSTNFOUND_NAME)) + return l; + } else if (l->sum == sum + || l->hdr_chunk <= 0) { + /* LostnFound chunk called Objxxx + * Do a real check + */ + yaffs_get_obj_name(l, buffer, + YAFFS_MAX_NAME_LENGTH + 1); + if (strncmp + (name, buffer, YAFFS_MAX_NAME_LENGTH) == 0) + return l; + } + } + } + + return NULL; +} + +/* GetEquivalentObject dereferences any hard links to get to the + * actual object. + */ + +struct yaffs_obj *yaffs_get_equivalent_obj(struct yaffs_obj *obj) +{ + if (obj && obj->variant_type == YAFFS_OBJECT_TYPE_HARDLINK) { + /* We want the object id of the equivalent object, not this one */ + obj = obj->variant.hardlink_variant.equiv_obj; + yaffs_check_obj_details_loaded(obj); + } + return obj; +} + +/* + * A note or two on object names. + * * If the object name is missing, we then make one up in the form objnnn + * + * * ASCII names are stored in the object header's name field from byte zero + * * Unicode names are historically stored starting from byte zero. + * + * Then there are automatic Unicode names... + * The purpose of these is to save names in a way that can be read as + * ASCII or Unicode names as appropriate, thus allowing a Unicode and ASCII + * system to share files. + * + * These automatic unicode are stored slightly differently... + * - If the name can fit in the ASCII character space then they are saved as + * ascii names as per above. + * - If the name needs Unicode then the name is saved in Unicode + * starting at oh->name[1]. + + */ +static void yaffs_fix_null_name(struct yaffs_obj *obj, YCHAR * name, + int buffer_size) +{ + /* Create an object name if we could not find one. 
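 * For example, object id 427 has its digits built least-significant first
 * into num_string, giving "427", and with the objnnn-style prefix the
 * fallback name becomes "obj427".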
*/ + if (strnlen(name, YAFFS_MAX_NAME_LENGTH) == 0) { + YCHAR local_name[20]; + YCHAR num_string[20]; + YCHAR *x = &num_string[19]; + unsigned v = obj->obj_id; + num_string[19] = 0; + while (v > 0) { + x--; + *x = '0' + (v % 10); + v /= 10; + } + /* make up a name */ + strcpy(local_name, YAFFS_LOSTNFOUND_PREFIX); + strcat(local_name, x); + strncpy(name, local_name, buffer_size - 1); + } +} + +int yaffs_get_obj_name(struct yaffs_obj *obj, YCHAR * name, int buffer_size) +{ + memset(name, 0, buffer_size * sizeof(YCHAR)); + + yaffs_check_obj_details_loaded(obj); + + if (obj->obj_id == YAFFS_OBJECTID_LOSTNFOUND) { + strncpy(name, YAFFS_LOSTNFOUND_NAME, buffer_size - 1); + } +#ifndef CONFIG_YAFFS_NO_SHORT_NAMES + else if (obj->short_name[0]) { + strcpy(name, obj->short_name); + } +#endif + else if (obj->hdr_chunk > 0) { + int result; + u8 *buffer = yaffs_get_temp_buffer(obj->my_dev, __LINE__); + + struct yaffs_obj_hdr *oh = (struct yaffs_obj_hdr *)buffer; + + memset(buffer, 0, obj->my_dev->data_bytes_per_chunk); + + if (obj->hdr_chunk > 0) { + result = yaffs_rd_chunk_tags_nand(obj->my_dev, + obj->hdr_chunk, + buffer, NULL); + } + yaffs_load_name_from_oh(obj->my_dev, name, oh->name, + buffer_size); + + yaffs_release_temp_buffer(obj->my_dev, buffer, __LINE__); + } + + yaffs_fix_null_name(obj, name, buffer_size); + + return strnlen(name, YAFFS_MAX_NAME_LENGTH); +} + +int yaffs_get_obj_length(struct yaffs_obj *obj) +{ + /* Dereference any hard linking */ + obj = yaffs_get_equivalent_obj(obj); + + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) + return obj->variant.file_variant.file_size; + if (obj->variant_type == YAFFS_OBJECT_TYPE_SYMLINK) { + if (!obj->variant.symlink_variant.alias) + return 0; + return strnlen(obj->variant.symlink_variant.alias, + YAFFS_MAX_ALIAS_LENGTH); + } else { + /* Only a directory should drop through to here */ + return obj->my_dev->data_bytes_per_chunk; + } +} + +int yaffs_get_obj_link_count(struct yaffs_obj *obj) +{ + int count = 0; + struct list_head *i; + + if (!obj->unlinked) + count++; /* the object itself */ + + list_for_each(i, &obj->hard_links) + count++; /* add the hard links; */ + + return count; +} + +int yaffs_get_obj_inode(struct yaffs_obj *obj) +{ + obj = yaffs_get_equivalent_obj(obj); + + return obj->obj_id; +} + +unsigned yaffs_get_obj_type(struct yaffs_obj *obj) +{ + obj = yaffs_get_equivalent_obj(obj); + + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + return DT_REG; + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + return DT_DIR; + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + return DT_LNK; + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + return DT_REG; + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + if (S_ISFIFO(obj->yst_mode)) + return DT_FIFO; + if (S_ISCHR(obj->yst_mode)) + return DT_CHR; + if (S_ISBLK(obj->yst_mode)) + return DT_BLK; + if (S_ISSOCK(obj->yst_mode)) + return DT_SOCK; + default: + return DT_REG; + break; + } +} + +YCHAR *yaffs_get_symlink_alias(struct yaffs_obj *obj) +{ + obj = yaffs_get_equivalent_obj(obj); + if (obj->variant_type == YAFFS_OBJECT_TYPE_SYMLINK) + return yaffs_clone_str(obj->variant.symlink_variant.alias); + else + return yaffs_clone_str(_Y("")); +} + +/*--------------------------- Initialisation code -------------------------- */ + +static int yaffs_check_dev_fns(const struct yaffs_dev *dev) +{ + + /* Common functions, gotta have */ + if (!dev->param.erase_fn || !dev->param.initialise_flash_fn) + return 0; + +#ifdef CONFIG_YAFFS_YAFFS2 + + /* Can use the "with tags" style interface for yaffs1 or yaffs2 */ + if 
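/*
 * Illustrative note (not driver code): the check below accepts exactly one
 * of two driver interface styles. Field names are the ones used in the test
 * itself.
 *
 *   tags style (yaffs1 or yaffs2):
 *           write_chunk_tags_fn, read_chunk_tags_fn, bad_block_fn and
 *           query_block_fn are all provided, and the raw write_chunk_fn /
 *           read_chunk_fn hooks are left NULL.
 *
 *   spare style (yaffs1 only):
 *           write_chunk_fn and read_chunk_fn are provided, and none of the
 *           tags or bad-block hooks are set.
 *
 * Anything in between is rejected so a device is never half-driven.
 */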
(dev->param.write_chunk_tags_fn && + dev->param.read_chunk_tags_fn && + !dev->param.write_chunk_fn && + !dev->param.read_chunk_fn && + dev->param.bad_block_fn && dev->param.query_block_fn) + return 1; +#endif + + /* Can use the "spare" style interface for yaffs1 */ + if (!dev->param.is_yaffs2 && + !dev->param.write_chunk_tags_fn && + !dev->param.read_chunk_tags_fn && + dev->param.write_chunk_fn && + dev->param.read_chunk_fn && + !dev->param.bad_block_fn && !dev->param.query_block_fn) + return 1; + + return 0; /* bad */ +} + +static int yaffs_create_initial_dir(struct yaffs_dev *dev) +{ + /* Initialise the unlinked, deleted, root and lost and found directories */ + + dev->lost_n_found = dev->root_dir = NULL; + dev->unlinked_dir = dev->del_dir = NULL; + + dev->unlinked_dir = + yaffs_create_fake_dir(dev, YAFFS_OBJECTID_UNLINKED, S_IFDIR); + + dev->del_dir = + yaffs_create_fake_dir(dev, YAFFS_OBJECTID_DELETED, S_IFDIR); + + dev->root_dir = + yaffs_create_fake_dir(dev, YAFFS_OBJECTID_ROOT, + YAFFS_ROOT_MODE | S_IFDIR); + dev->lost_n_found = + yaffs_create_fake_dir(dev, YAFFS_OBJECTID_LOSTNFOUND, + YAFFS_LOSTNFOUND_MODE | S_IFDIR); + + if (dev->lost_n_found && dev->root_dir && dev->unlinked_dir + && dev->del_dir) { + yaffs_add_obj_to_dir(dev->root_dir, dev->lost_n_found); + return YAFFS_OK; + } + + return YAFFS_FAIL; +} + +int yaffs_guts_initialise(struct yaffs_dev *dev) +{ + int init_failed = 0; + unsigned x; + int bits; + + yaffs_trace(YAFFS_TRACE_TRACING, "yaffs: yaffs_guts_initialise()" ); + + /* Check stuff that must be set */ + + if (!dev) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs: Need a device" + ); + return YAFFS_FAIL; + } + + dev->internal_start_block = dev->param.start_block; + dev->internal_end_block = dev->param.end_block; + dev->block_offset = 0; + dev->chunk_offset = 0; + dev->n_free_chunks = 0; + + dev->gc_block = 0; + + if (dev->param.start_block == 0) { + dev->internal_start_block = dev->param.start_block + 1; + dev->internal_end_block = dev->param.end_block + 1; + dev->block_offset = 1; + dev->chunk_offset = dev->param.chunks_per_block; + } + + /* Check geometry parameters. */ + + if ((!dev->param.inband_tags && dev->param.is_yaffs2 && + dev->param.total_bytes_per_chunk < 1024) || + (!dev->param.is_yaffs2 && + dev->param.total_bytes_per_chunk < 512) || + (dev->param.inband_tags && !dev->param.is_yaffs2) || + dev->param.chunks_per_block < 2 || + dev->param.n_reserved_blocks < 2 || + dev->internal_start_block <= 0 || + dev->internal_end_block <= 0 || + dev->internal_end_block <= + (dev->internal_start_block + dev->param.n_reserved_blocks + 2) + ) { + /* otherwise it is too small */ + yaffs_trace(YAFFS_TRACE_ALWAYS, + "NAND geometry problems: chunk size %d, type is yaffs%s, inband_tags %d ", + dev->param.total_bytes_per_chunk, + dev->param.is_yaffs2 ? "2" : "", + dev->param.inband_tags); + return YAFFS_FAIL; + } + + if (yaffs_init_nand(dev) != YAFFS_OK) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "InitialiseNAND failed"); + return YAFFS_FAIL; + } + + /* Sort out space for inband tags, if required */ + if (dev->param.inband_tags) + dev->data_bytes_per_chunk = + dev->param.total_bytes_per_chunk - + sizeof(struct yaffs_packed_tags2_tags_only); + else + dev->data_bytes_per_chunk = dev->param.total_bytes_per_chunk; + + /* Got the right mix of functions? 
*/ + if (!yaffs_check_dev_fns(dev)) { + /* Function missing */ + yaffs_trace(YAFFS_TRACE_ALWAYS, + "device function(s) missing or wrong"); + + return YAFFS_FAIL; + } + + if (dev->is_mounted) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "device already mounted"); + return YAFFS_FAIL; + } + + /* Finished with most checks. One or two more checks happen later on too. */ + + dev->is_mounted = 1; + + /* OK now calculate a few things for the device */ + + /* + * Calculate all the chunk size manipulation numbers: + */ + x = dev->data_bytes_per_chunk; + /* We always use dev->chunk_shift and dev->chunk_div */ + dev->chunk_shift = calc_shifts(x); + x >>= dev->chunk_shift; + dev->chunk_div = x; + /* We only use chunk mask if chunk_div is 1 */ + dev->chunk_mask = (1 << dev->chunk_shift) - 1; + + /* + * Calculate chunk_grp_bits. + * We need to find the next power of 2 > than internal_end_block + */ + + x = dev->param.chunks_per_block * (dev->internal_end_block + 1); + + bits = calc_shifts_ceiling(x); + + /* Set up tnode width if wide tnodes are enabled. */ + if (!dev->param.wide_tnodes_disabled) { + /* bits must be even so that we end up with 32-bit words */ + if (bits & 1) + bits++; + if (bits < 16) + dev->tnode_width = 16; + else + dev->tnode_width = bits; + } else { + dev->tnode_width = 16; + } + + dev->tnode_mask = (1 << dev->tnode_width) - 1; + + /* Level0 Tnodes are 16 bits or wider (if wide tnodes are enabled), + * so if the bitwidth of the + * chunk range we're using is greater than 16 we need + * to figure out chunk shift and chunk_grp_size + */ + + if (bits <= dev->tnode_width) + dev->chunk_grp_bits = 0; + else + dev->chunk_grp_bits = bits - dev->tnode_width; + + dev->tnode_size = (dev->tnode_width * YAFFS_NTNODES_LEVEL0) / 8; + if (dev->tnode_size < sizeof(struct yaffs_tnode)) + dev->tnode_size = sizeof(struct yaffs_tnode); + + dev->chunk_grp_size = 1 << dev->chunk_grp_bits; + + if (dev->param.chunks_per_block < dev->chunk_grp_size) { + /* We have a problem because the soft delete won't work if + * the chunk group size > chunks per block. + * This can be remedied by using larger "virtual blocks". + */ + yaffs_trace(YAFFS_TRACE_ALWAYS, "chunk group too large"); + + return YAFFS_FAIL; + } + + /* OK, we've finished verifying the device, lets continue with initialisation */ + + /* More device initialisation */ + dev->all_gcs = 0; + dev->passive_gc_count = 0; + dev->oldest_dirty_gc_count = 0; + dev->bg_gcs = 0; + dev->gc_block_finder = 0; + dev->buffered_block = -1; + dev->doing_buffered_block_rewrite = 0; + dev->n_deleted_files = 0; + dev->n_bg_deletions = 0; + dev->n_unlinked_files = 0; + dev->n_ecc_fixed = 0; + dev->n_ecc_unfixed = 0; + dev->n_tags_ecc_fixed = 0; + dev->n_tags_ecc_unfixed = 0; + dev->n_erase_failures = 0; + dev->n_erased_blocks = 0; + dev->gc_disable = 0; + dev->has_pending_prioritised_gc = 1; /* Assume the worst for now, will get fixed on first GC */ + INIT_LIST_HEAD(&dev->dirty_dirs); + dev->oldest_dirty_seq = 0; + dev->oldest_dirty_block = 0; + + /* Initialise temporary buffers and caches. 
*/ + if (!yaffs_init_tmp_buffers(dev)) + init_failed = 1; + + dev->cache = NULL; + dev->gc_cleanup_list = NULL; + + if (!init_failed && dev->param.n_caches > 0) { + int i; + void *buf; + int cache_bytes = + dev->param.n_caches * sizeof(struct yaffs_cache); + + if (dev->param.n_caches > YAFFS_MAX_SHORT_OP_CACHES) + dev->param.n_caches = YAFFS_MAX_SHORT_OP_CACHES; + + dev->cache = kmalloc(cache_bytes, GFP_NOFS); + + buf = (u8 *) dev->cache; + + if (dev->cache) + memset(dev->cache, 0, cache_bytes); + + for (i = 0; i < dev->param.n_caches && buf; i++) { + dev->cache[i].object = NULL; + dev->cache[i].last_use = 0; + dev->cache[i].dirty = 0; + dev->cache[i].data = buf = + kmalloc(dev->param.total_bytes_per_chunk, GFP_NOFS); + } + if (!buf) + init_failed = 1; + + dev->cache_last_use = 0; + } + + dev->cache_hits = 0; + + if (!init_failed) { + dev->gc_cleanup_list = + kmalloc(dev->param.chunks_per_block * sizeof(u32), + GFP_NOFS); + if (!dev->gc_cleanup_list) + init_failed = 1; + } + + if (dev->param.is_yaffs2) + dev->param.use_header_file_size = 1; + + if (!init_failed && !yaffs_init_blocks(dev)) + init_failed = 1; + + yaffs_init_tnodes_and_objs(dev); + + if (!init_failed && !yaffs_create_initial_dir(dev)) + init_failed = 1; + + if (!init_failed) { + /* Now scan the flash. */ + if (dev->param.is_yaffs2) { + if (yaffs2_checkpt_restore(dev)) { + yaffs_check_obj_details_loaded(dev->root_dir); + yaffs_trace(YAFFS_TRACE_CHECKPOINT | YAFFS_TRACE_MOUNT, + "yaffs: restored from checkpoint" + ); + } else { + + /* Clean up the mess caused by an aborted checkpoint load + * and scan backwards. + */ + yaffs_deinit_blocks(dev); + + yaffs_deinit_tnodes_and_objs(dev); + + dev->n_erased_blocks = 0; + dev->n_free_chunks = 0; + dev->alloc_block = -1; + dev->alloc_page = -1; + dev->n_deleted_files = 0; + dev->n_unlinked_files = 0; + dev->n_bg_deletions = 0; + + if (!init_failed && !yaffs_init_blocks(dev)) + init_failed = 1; + + yaffs_init_tnodes_and_objs(dev); + + if (!init_failed + && !yaffs_create_initial_dir(dev)) + init_failed = 1; + + if (!init_failed && !yaffs2_scan_backwards(dev)) + init_failed = 1; + } + } else if (!yaffs1_scan(dev)) { + init_failed = 1; + } + + yaffs_strip_deleted_objs(dev); + yaffs_fix_hanging_objs(dev); + if (dev->param.empty_lost_n_found) + yaffs_empty_l_n_f(dev); + } + + if (init_failed) { + /* Clean up the mess */ + yaffs_trace(YAFFS_TRACE_TRACING, + "yaffs: yaffs_guts_initialise() aborted."); + + yaffs_deinitialise(dev); + return YAFFS_FAIL; + } + + /* Zero out stats */ + dev->n_page_reads = 0; + dev->n_page_writes = 0; + dev->n_erasures = 0; + dev->n_gc_copies = 0; + dev->n_retired_writes = 0; + + dev->n_retired_blocks = 0; + + yaffs_verify_free_chunks(dev); + yaffs_verify_blocks(dev); + + /* Clean up any aborted checkpoint data */ + if (!dev->is_checkpointed && dev->blocks_in_checkpt > 0) + yaffs2_checkpt_invalidate(dev); + + yaffs_trace(YAFFS_TRACE_TRACING, + "yaffs: yaffs_guts_initialise() done."); + return YAFFS_OK; + +} + +void yaffs_deinitialise(struct yaffs_dev *dev) +{ + if (dev->is_mounted) { + int i; + + yaffs_deinit_blocks(dev); + yaffs_deinit_tnodes_and_objs(dev); + if (dev->param.n_caches > 0 && dev->cache) { + + for (i = 0; i < dev->param.n_caches; i++) { + if (dev->cache[i].data) + kfree(dev->cache[i].data); + dev->cache[i].data = NULL; + } + + kfree(dev->cache); + dev->cache = NULL; + } + + kfree(dev->gc_cleanup_list); + + for (i = 0; i < YAFFS_N_TEMP_BUFFERS; i++) + kfree(dev->temp_buffer[i].buffer); + + dev->is_mounted = 0; + + if (dev->param.deinitialise_flash_fn) + 
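/*
 * Illustrative note (not driver code): the short-op cache set up in
 * yaffs_guts_initialise() above, and freed again here in
 * yaffs_deinitialise(), costs roughly
 *
 *   n_caches * sizeof(struct yaffs_cache)     for the descriptors, plus
 *   n_caches * total_bytes_per_chunk          for the per-cache data buffers,
 *
 * with n_caches clamped to YAFFS_MAX_SHORT_OP_CACHES. For example, ten
 * caches on a device with 2048-byte chunks reserve 10 * 2048 = 20 KiB of
 * buffer space.
 */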
dev->param.deinitialise_flash_fn(dev); + } +} + +int yaffs_count_free_chunks(struct yaffs_dev *dev) +{ + int n_free = 0; + int b; + + struct yaffs_block_info *blk; + + blk = dev->block_info; + for (b = dev->internal_start_block; b <= dev->internal_end_block; b++) { + switch (blk->block_state) { + case YAFFS_BLOCK_STATE_EMPTY: + case YAFFS_BLOCK_STATE_ALLOCATING: + case YAFFS_BLOCK_STATE_COLLECTING: + case YAFFS_BLOCK_STATE_FULL: + n_free += + (dev->param.chunks_per_block - blk->pages_in_use + + blk->soft_del_pages); + break; + default: + break; + } + blk++; + } + + return n_free; +} + +int yaffs_get_n_free_chunks(struct yaffs_dev *dev) +{ + /* This is what we report to the outside world */ + + int n_free; + int n_dirty_caches; + int blocks_for_checkpt; + int i; + + n_free = dev->n_free_chunks; + n_free += dev->n_deleted_files; + + /* Now count the number of dirty chunks in the cache and subtract those */ + + for (n_dirty_caches = 0, i = 0; i < dev->param.n_caches; i++) { + if (dev->cache[i].dirty) + n_dirty_caches++; + } + + n_free -= n_dirty_caches; + + n_free -= + ((dev->param.n_reserved_blocks + 1) * dev->param.chunks_per_block); + + /* Now we figure out how much to reserve for the checkpoint and report that... */ + blocks_for_checkpt = yaffs_calc_checkpt_blocks_required(dev); + + n_free -= (blocks_for_checkpt * dev->param.chunks_per_block); + + if (n_free < 0) + n_free = 0; + + return n_free; + +} diff --git a/fs/yaffs2/yaffs_guts.h b/fs/yaffs2/yaffs_guts.h new file mode 100644 index 00000000..307eba28 --- /dev/null +++ b/fs/yaffs2/yaffs_guts.h @@ -0,0 +1,915 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_GUTS_H__ +#define __YAFFS_GUTS_H__ + +#include "yportenv.h" + +#define YAFFS_OK 1 +#define YAFFS_FAIL 0 + +/* Give us a Y=0x59, + * Give us an A=0x41, + * Give us an FF=0xFF + * Give us an S=0x53 + * And what have we got... 
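To make the free-space accounting in yaffs_get_n_free_chunks() above concrete, here is a worked example with hypothetical numbers (the geometry and counters are illustrative, not taken from this patch): chunks_per_block = 64, n_reserved_blocks = 5, n_free_chunks = 10000, n_deleted_files = 20, three dirty short-op cache entries, and a checkpoint estimated at 4 blocks.

      10000   n_free_chunks
    +    20   chunks that come back when deleted files finish cleanup
    -     3   dirty cache entries still to be flushed
    -   384   (5 + 1) reserved blocks x 64 chunks per block
    -   256   4 checkpoint blocks x 64 chunks per block
    =  9377   chunks reported to the outside world (clamped at 0 if negative)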
+ */ +#define YAFFS_MAGIC 0x5941FF53 + +#define YAFFS_NTNODES_LEVEL0 16 +#define YAFFS_TNODES_LEVEL0_BITS 4 +#define YAFFS_TNODES_LEVEL0_MASK 0xf + +#define YAFFS_NTNODES_INTERNAL (YAFFS_NTNODES_LEVEL0 / 2) +#define YAFFS_TNODES_INTERNAL_BITS (YAFFS_TNODES_LEVEL0_BITS - 1) +#define YAFFS_TNODES_INTERNAL_MASK 0x7 +#define YAFFS_TNODES_MAX_LEVEL 6 + +#ifndef CONFIG_YAFFS_NO_YAFFS1 +#define YAFFS_BYTES_PER_SPARE 16 +#define YAFFS_BYTES_PER_CHUNK 512 +#define YAFFS_CHUNK_SIZE_SHIFT 9 +#define YAFFS_CHUNKS_PER_BLOCK 32 +#define YAFFS_BYTES_PER_BLOCK (YAFFS_CHUNKS_PER_BLOCK*YAFFS_BYTES_PER_CHUNK) +#endif + +#define YAFFS_MIN_YAFFS2_CHUNK_SIZE 1024 +#define YAFFS_MIN_YAFFS2_SPARE_SIZE 32 + +#define YAFFS_MAX_CHUNK_ID 0x000FFFFF + +#define YAFFS_ALLOCATION_NOBJECTS 100 +#define YAFFS_ALLOCATION_NTNODES 100 +#define YAFFS_ALLOCATION_NLINKS 100 + +#define YAFFS_NOBJECT_BUCKETS 256 + +#define YAFFS_OBJECT_SPACE 0x40000 +#define YAFFS_MAX_OBJECT_ID (YAFFS_OBJECT_SPACE -1) + +#define YAFFS_CHECKPOINT_VERSION 4 + +#ifdef CONFIG_YAFFS_UNICODE +#define YAFFS_MAX_NAME_LENGTH 127 +#define YAFFS_MAX_ALIAS_LENGTH 79 +#else +#define YAFFS_MAX_NAME_LENGTH 255 +#define YAFFS_MAX_ALIAS_LENGTH 159 +#endif + +#define YAFFS_SHORT_NAME_LENGTH 15 + +/* Some special object ids for pseudo objects */ +#define YAFFS_OBJECTID_ROOT 1 +#define YAFFS_OBJECTID_LOSTNFOUND 2 +#define YAFFS_OBJECTID_UNLINKED 3 +#define YAFFS_OBJECTID_DELETED 4 + +/* Pseudo object ids for checkpointing */ +#define YAFFS_OBJECTID_SB_HEADER 0x10 +#define YAFFS_OBJECTID_CHECKPOINT_DATA 0x20 +#define YAFFS_SEQUENCE_CHECKPOINT_DATA 0x21 + +#define YAFFS_MAX_SHORT_OP_CACHES 20 + +#define YAFFS_N_TEMP_BUFFERS 6 + +/* We limit the number attempts at sucessfully saving a chunk of data. + * Small-page devices have 32 pages per block; large-page devices have 64. + * Default to something in the order of 5 to 10 blocks worth of chunks. + */ +#define YAFFS_WR_ATTEMPTS (5*64) + +/* Sequence numbers are used in YAFFS2 to determine block allocation order. + * The range is limited slightly to help distinguish bad numbers from good. + * This also allows us to perhaps in the future use special numbers for + * special purposes. + * EFFFFF00 allows the allocation of 8 blocks per second (~1Mbytes) for 15 years, + * and is a larger number than the lifetime of a 2GB device. + */ +#define YAFFS_LOWEST_SEQUENCE_NUMBER 0x00001000 +#define YAFFS_HIGHEST_SEQUENCE_NUMBER 0xEFFFFF00 + +/* Special sequence number for bad block that failed to be marked bad */ +#define YAFFS_SEQUENCE_BAD_BLOCK 0xFFFF0000 + +/* ChunkCache is used for short read/write operations.*/ +struct yaffs_cache { + struct yaffs_obj *object; + int chunk_id; + int last_use; + int dirty; + int n_bytes; /* Only valid if the cache is dirty */ + int locked; /* Can't push out or flush while locked. */ + u8 *data; +}; + +/* Tags structures in RAM + * NB This uses bitfield. Bitfields should not straddle a u32 boundary otherwise + * the structure size will get blown out. 
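The tnode constants above fix how a chunk index within a file is resolved: the low YAFFS_TNODES_LEVEL0_BITS (4) bits pick a slot in a level-0 tnode and every internal level above it consumes YAFFS_TNODES_INTERNAL_BITS (3) more bits, up to YAFFS_TNODES_MAX_LEVEL. The user-space sketch below only illustrates that bit decomposition; it is not the core's lookup code (yaffs_find_tnode_0(), declared later in this header, is the real entry point).

#include <stdio.h>

/* Illustrative only: split a chunk index into per-level tnode slots,
 * mirroring the level-0 and internal bit widths defined above. */
#define LEVEL0_BITS   4
#define LEVEL0_MASK   0xf
#define INTERNAL_BITS 3
#define INTERNAL_MASK 0x7

int main(void)
{
	unsigned chunk = 0x1234;	/* hypothetical chunk index within a file */
	unsigned rest = chunk >> LEVEL0_BITS;
	int level = 1;

	printf("level 0 slot: %u\n", chunk & LEVEL0_MASK);
	while (rest) {
		printf("level %d slot: %u\n", level++, rest & INTERNAL_MASK);
		rest >>= INTERNAL_BITS;
	}
	return 0;
}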
+ */ + +#ifndef CONFIG_YAFFS_NO_YAFFS1 +struct yaffs_tags { + unsigned chunk_id:20; + unsigned serial_number:2; + unsigned n_bytes_lsb:10; + unsigned obj_id:18; + unsigned ecc:12; + unsigned n_bytes_msb:2; +}; + +union yaffs_tags_union { + struct yaffs_tags as_tags; + u8 as_bytes[8]; +}; + +#endif + +/* Stuff used for extended tags in YAFFS2 */ + +enum yaffs_ecc_result { + YAFFS_ECC_RESULT_UNKNOWN, + YAFFS_ECC_RESULT_NO_ERROR, + YAFFS_ECC_RESULT_FIXED, + YAFFS_ECC_RESULT_UNFIXED +}; + +enum yaffs_obj_type { + YAFFS_OBJECT_TYPE_UNKNOWN, + YAFFS_OBJECT_TYPE_FILE, + YAFFS_OBJECT_TYPE_SYMLINK, + YAFFS_OBJECT_TYPE_DIRECTORY, + YAFFS_OBJECT_TYPE_HARDLINK, + YAFFS_OBJECT_TYPE_SPECIAL +}; + +#define YAFFS_OBJECT_TYPE_MAX YAFFS_OBJECT_TYPE_SPECIAL + +struct yaffs_ext_tags { + + unsigned validity0; + unsigned chunk_used; /* Status of the chunk: used or unused */ + unsigned obj_id; /* If 0 then this is not part of an object (unused) */ + unsigned chunk_id; /* If 0 then this is a header, else a data chunk */ + unsigned n_bytes; /* Only valid for data chunks */ + + /* The following stuff only has meaning when we read */ + enum yaffs_ecc_result ecc_result; + unsigned block_bad; + + /* YAFFS 1 stuff */ + unsigned is_deleted; /* The chunk is marked deleted */ + unsigned serial_number; /* Yaffs1 2-bit serial number */ + + /* YAFFS2 stuff */ + unsigned seq_number; /* The sequence number of this block */ + + /* Extra info if this is an object header (YAFFS2 only) */ + + unsigned extra_available; /* There is extra info available if this is not zero */ + unsigned extra_parent_id; /* The parent object */ + unsigned extra_is_shrink; /* Is it a shrink header? */ + unsigned extra_shadows; /* Does this shadow another object? */ + + enum yaffs_obj_type extra_obj_type; /* What object type? */ + + unsigned extra_length; /* Length if it is a file */ + unsigned extra_equiv_id; /* Equivalent object Id if it is a hard link */ + + unsigned validity1; + +}; + +/* Spare structure for YAFFS1 */ +struct yaffs_spare { + u8 tb0; + u8 tb1; + u8 tb2; + u8 tb3; + u8 page_status; /* set to 0 to delete the chunk */ + u8 block_status; + u8 tb4; + u8 tb5; + u8 ecc1[3]; + u8 tb6; + u8 tb7; + u8 ecc2[3]; +}; + +/*Special structure for passing through to mtd */ +struct yaffs_nand_spare { + struct yaffs_spare spare; + int eccres1; + int eccres2; +}; + +/* Block data in RAM */ + +enum yaffs_block_state { + YAFFS_BLOCK_STATE_UNKNOWN = 0, + + YAFFS_BLOCK_STATE_SCANNING, + /* Being scanned */ + + YAFFS_BLOCK_STATE_NEEDS_SCANNING, + /* The block might have something on it (ie it is allocating or full, perhaps empty) + * but it needs to be scanned to determine its true state. + * This state is only valid during scanning. + * NB We tolerate empty because the pre-scanner might be incapable of deciding + * However, if this state is returned on a YAFFS2 device, then we expect a sequence number + */ + + YAFFS_BLOCK_STATE_EMPTY, + /* This block is empty */ + + YAFFS_BLOCK_STATE_ALLOCATING, + /* This block is partially allocated. + * At least one page holds valid data. + * This is the one currently being used for page + * allocation. Should never be more than one of these. + * If a block is only partially allocated at mount it is treated as full. + */ + + YAFFS_BLOCK_STATE_FULL, + /* All the pages in this block have been allocated. + * If a block was only partially allocated when mounted we treat + * it as fully allocated. + */ + + YAFFS_BLOCK_STATE_DIRTY, + /* The block was full and now all chunks have been deleted. + * Erase me, reuse me. 
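The bitfield warning above is what keeps struct yaffs_tags at exactly 8 bytes, matching union yaffs_tags_union's as_bytes[8]; yaffs_mtdif1.c later in this patch enforces the same invariant with compile_time_assertion(sizeof(struct yaffs_tags) == 8). A small host-side sketch (a local copy of the same bitfield layout, purely illustrative) demonstrates the packing:

#include <assert.h>

/* Local copy of the yaffs1 tag bitfields: 20 + 2 + 10 = 32 bits in the
 * first word and 18 + 12 + 2 = 32 bits in the second, so nothing
 * straddles a u32 boundary and the struct stays at two words. */
struct tags_copy {
	unsigned chunk_id:20;
	unsigned serial_number:2;
	unsigned n_bytes_lsb:10;
	unsigned obj_id:18;
	unsigned ecc:12;
	unsigned n_bytes_msb:2;
};

int main(void)
{
	assert(sizeof(struct tags_copy) == 8);
	return 0;
}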
+ */ + + YAFFS_BLOCK_STATE_CHECKPOINT, + /* This block is assigned to holding checkpoint data. */ + + YAFFS_BLOCK_STATE_COLLECTING, + /* This block is being garbage collected */ + + YAFFS_BLOCK_STATE_DEAD + /* This block has failed and is not in use */ +}; + +#define YAFFS_NUMBER_OF_BLOCK_STATES (YAFFS_BLOCK_STATE_DEAD + 1) + +struct yaffs_block_info { + + int soft_del_pages:10; /* number of soft deleted pages */ + int pages_in_use:10; /* number of pages in use */ + unsigned block_state:4; /* One of the above block states. NB use unsigned because enum is sometimes an int */ + u32 needs_retiring:1; /* Data has failed on this block, need to get valid data off */ + /* and retire the block. */ + u32 skip_erased_check:1; /* If this is set we can skip the erased check on this block */ + u32 gc_prioritise:1; /* An ECC check or blank check has failed on this block. + It should be prioritised for GC */ + u32 chunk_error_strikes:3; /* How many times we've had ecc etc failures on this block and tried to reuse it */ + +#ifdef CONFIG_YAFFS_YAFFS2 + u32 has_shrink_hdr:1; /* This block has at least one shrink object header */ + u32 seq_number; /* block sequence number for yaffs2 */ +#endif + +}; + +/* -------------------------- Object structure -------------------------------*/ +/* This is the object structure as stored on NAND */ + +struct yaffs_obj_hdr { + enum yaffs_obj_type type; + + /* Apply to everything */ + int parent_obj_id; + u16 sum_no_longer_used; /* checksum of name. No longer used */ + YCHAR name[YAFFS_MAX_NAME_LENGTH + 1]; + + /* The following apply to directories, files, symlinks - not hard links */ + u32 yst_mode; /* protection */ + + u32 yst_uid; + u32 yst_gid; + u32 yst_atime; + u32 yst_mtime; + u32 yst_ctime; + + /* File size applies to files only */ + int file_size; + + /* Equivalent object id applies to hard links only. */ + int equiv_id; + + /* Alias is for symlinks only. */ + YCHAR alias[YAFFS_MAX_ALIAS_LENGTH + 1]; + + u32 yst_rdev; /* device stuff for block and char devices (major/min) */ + + u32 win_ctime[2]; + u32 win_atime[2]; + u32 win_mtime[2]; + + u32 inband_shadowed_obj_id; + u32 inband_is_shrink; + + u32 reserved[2]; + int shadows_obj; /* This object header shadows the specified object if > 0 */ + + /* is_shrink applies to object headers written when we shrink the file (ie resize) */ + u32 is_shrink; + +}; + +/*--------------------------- Tnode -------------------------- */ + +struct yaffs_tnode { + struct yaffs_tnode *internal[YAFFS_NTNODES_INTERNAL]; +}; + +/*------------------------ Object -----------------------------*/ +/* An object can be one of: + * - a directory (no data, has children links + * - a regular file (data.... not prunes :->). + * - a symlink [symbolic link] (the alias). + * - a hard link + */ + +struct yaffs_file_var { + u32 file_size; + u32 scanned_size; + u32 shrink_size; + int top_level; + struct yaffs_tnode *top; +}; + +struct yaffs_dir_var { + struct list_head children; /* list of child links */ + struct list_head dirty; /* Entry for list of dirty directories */ +}; + +struct yaffs_symlink_var { + YCHAR *alias; +}; + +struct yaffs_hardlink_var { + struct yaffs_obj *equiv_obj; + u32 equiv_id; +}; + +union yaffs_obj_var { + struct yaffs_file_var file_variant; + struct yaffs_dir_var dir_variant; + struct yaffs_symlink_var symlink_variant; + struct yaffs_hardlink_var hardlink_variant; +}; + +struct yaffs_obj { + u8 deleted:1; /* This should only apply to unlinked files. 
*/ + u8 soft_del:1; /* it has also been soft deleted */ + u8 unlinked:1; /* An unlinked file. The file should be in the unlinked directory. */ + u8 fake:1; /* A fake object has no presence on NAND. */ + u8 rename_allowed:1; /* Some objects are not allowed to be renamed. */ + u8 unlink_allowed:1; + u8 dirty:1; /* the object needs to be written to flash */ + u8 valid:1; /* When the file system is being loaded up, this + * object might be created before the data + * is available (ie. file data records appear before the header). + */ + u8 lazy_loaded:1; /* This object has been lazy loaded and is missing some detail */ + + u8 defered_free:1; /* For Linux kernel. Object is removed from NAND, but is + * still in the inode cache. Free of object is defered. + * until the inode is released. + */ + u8 being_created:1; /* This object is still being created so skip some checks. */ + u8 is_shadowed:1; /* This object is shadowed on the way to being renamed. */ + + u8 xattr_known:1; /* We know if this has object has xattribs or not. */ + u8 has_xattr:1; /* This object has xattribs. Valid if xattr_known. */ + + u8 serial; /* serial number of chunk in NAND. Cached here */ + u16 sum; /* sum of the name to speed searching */ + + struct yaffs_dev *my_dev; /* The device I'm on */ + + struct list_head hash_link; /* list of objects in this hash bucket */ + + struct list_head hard_links; /* all the equivalent hard linked objects */ + + /* directory structure stuff */ + /* also used for linking up the free list */ + struct yaffs_obj *parent; + struct list_head siblings; + + /* Where's my object header in NAND? */ + int hdr_chunk; + + int n_data_chunks; /* Number of data chunks attached to the file. */ + + u32 obj_id; /* the object id value */ + + u32 yst_mode; + +#ifndef CONFIG_YAFFS_NO_SHORT_NAMES + YCHAR short_name[YAFFS_SHORT_NAME_LENGTH + 1]; +#endif + +#ifdef CONFIG_YAFFS_WINCE + u32 win_ctime[2]; + u32 win_mtime[2]; + u32 win_atime[2]; +#else + u32 yst_uid; + u32 yst_gid; + u32 yst_atime; + u32 yst_mtime; + u32 yst_ctime; +#endif + + u32 yst_rdev; + + void *my_inode; + + enum yaffs_obj_type variant_type; + + union yaffs_obj_var variant; + +}; + +struct yaffs_obj_bucket { + struct list_head list; + int count; +}; + +/* yaffs_checkpt_obj holds the definition of an object as dumped + * by checkpointing. + */ + +struct yaffs_checkpt_obj { + int struct_type; + u32 obj_id; + u32 parent_id; + int hdr_chunk; + enum yaffs_obj_type variant_type:3; + u8 deleted:1; + u8 soft_del:1; + u8 unlinked:1; + u8 fake:1; + u8 rename_allowed:1; + u8 unlink_allowed:1; + u8 serial; + int n_data_chunks; + u32 size_or_equiv_obj; +}; + +/*--------------------- Temporary buffers ---------------- + * + * These are chunk-sized working buffers. Each device has a few + */ + +struct yaffs_buffer { + u8 *buffer; + int line; /* track from whence this buffer was allocated */ + int max_line; +}; + +/*----------------- Device ---------------------------------*/ + +struct yaffs_param { + const YCHAR *name; + + /* + * Entry parameters set up way early. Yaffs sets up the rest. + * The structure should be zeroed out before use so that unused + * and defualt values are zero. 
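struct yaffs_obj above carries hash_link precisely so that every live object can sit in one of YAFFS_NOBJECT_BUCKETS buckets of struct yaffs_obj_bucket (the per-device obj_bucket array appears further down in struct yaffs_dev). The sketch below shows an object-id lookup over those buckets; the simple modulo hash is an assumption about how yaffs_find_by_number() behaves, not something stated in this header.

#include <linux/list.h>
#include "yaffs_guts.h"

/* Illustrative lookup of an object by id via the hash buckets. */
static struct yaffs_obj *find_by_number_sketch(struct yaffs_dev *dev, u32 number)
{
	struct yaffs_obj_bucket *bucket =
	    &dev->obj_bucket[number % YAFFS_NOBJECT_BUCKETS];
	struct list_head *i;

	list_for_each(i, &bucket->list) {
		struct yaffs_obj *obj =
		    list_entry(i, struct yaffs_obj, hash_link);
		if (obj->obj_id == number)
			return obj;
	}
	return NULL;
}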
+ */ + + int inband_tags; /* Use unband tags */ + u32 total_bytes_per_chunk; /* Should be >= 512, does not need to be a power of 2 */ + int chunks_per_block; /* does not need to be a power of 2 */ + int spare_bytes_per_chunk; /* spare area size */ + int start_block; /* Start block we're allowed to use */ + int end_block; /* End block we're allowed to use */ + int n_reserved_blocks; /* We want this tuneable so that we can reduce */ + /* reserved blocks on NOR and RAM. */ + + int n_caches; /* If <= 0, then short op caching is disabled, else + * the number of short op caches (don't use too many). + * 10 to 20 is a good bet. + */ + int use_nand_ecc; /* Flag to decide whether or not to use NANDECC on data (yaffs1) */ + int no_tags_ecc; /* Flag to decide whether or not to do ECC on packed tags (yaffs2) */ + + int is_yaffs2; /* Use yaffs2 mode on this device */ + + int empty_lost_n_found; /* Auto-empty lost+found directory on mount */ + + int refresh_period; /* How often we should check to do a block refresh */ + + /* Checkpoint control. Can be set before or after initialisation */ + u8 skip_checkpt_rd; + u8 skip_checkpt_wr; + + int enable_xattr; /* Enable xattribs */ + + /* NAND access functions (Must be set before calling YAFFS) */ + + int (*write_chunk_fn) (struct yaffs_dev * dev, + int nand_chunk, const u8 * data, + const struct yaffs_spare * spare); + int (*read_chunk_fn) (struct yaffs_dev * dev, + int nand_chunk, u8 * data, + struct yaffs_spare * spare); + int (*erase_fn) (struct yaffs_dev * dev, int flash_block); + int (*initialise_flash_fn) (struct yaffs_dev * dev); + int (*deinitialise_flash_fn) (struct yaffs_dev * dev); + +#ifdef CONFIG_YAFFS_YAFFS2 + int (*write_chunk_tags_fn) (struct yaffs_dev * dev, + int nand_chunk, const u8 * data, + const struct yaffs_ext_tags * tags); + int (*read_chunk_tags_fn) (struct yaffs_dev * dev, + int nand_chunk, u8 * data, + struct yaffs_ext_tags * tags); + int (*bad_block_fn) (struct yaffs_dev * dev, int block_no); + int (*query_block_fn) (struct yaffs_dev * dev, int block_no, + enum yaffs_block_state * state, + u32 * seq_number); +#endif + + /* The remove_obj_fn function must be supplied by OS flavours that + * need it. + * yaffs direct uses it to implement the faster readdir. + * Linux uses it to protect the directory during unlocking. + */ + void (*remove_obj_fn) (struct yaffs_obj * obj); + + /* Callback to mark the superblock dirty */ + void (*sb_dirty_fn) (struct yaffs_dev * dev); + + /* Callback to control garbage collection. */ + unsigned (*gc_control) (struct yaffs_dev * dev); + + /* Debug control flags. Don't use unless you know what you're doing */ + int use_header_file_size; /* Flag to determine if we should use file sizes from the header */ + int disable_lazy_load; /* Disable lazy loading on this device */ + int wide_tnodes_disabled; /* Set to disable wide tnodes */ + int disable_soft_del; /* yaffs 1 only: Set to disable the use of softdeletion. */ + + int defered_dir_update; /* Set to defer directory updates */ + +#ifdef CONFIG_YAFFS_AUTO_UNICODE + int auto_unicode; +#endif + int always_check_erased; /* Force chunk erased check always on */ +}; + +struct yaffs_dev { + struct yaffs_param param; + + /* Context storage. Holds extra OS specific data for this device */ + + void *os_context; + void *driver_context; + + struct list_head dev_list; + + /* Runtime parameters. Set up by YAFFS. 
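struct yaffs_param above is the whole contract between the yaffs core and a NAND driver: geometry, policy flags and the flash access callbacks. The sketch below shows how an MTD-backed yaffs2 device might be wired up before yaffs_guts_initialise() is called, using the nandmtd2_*/nandmtd_* functions added later in this patch; the geometry numbers are invented for the example and the real setup lives in the yaffs VFS glue, so treat this as illustrative only.

#include <linux/mtd/mtd.h>
#include "yaffs_guts.h"
#include "yaffs_mtdif.h"
#include "yaffs_mtdif2.h"

/* Illustrative wiring of a yaffs2 device over MTD (assumes
 * CONFIG_YAFFS_YAFFS2 so the tags-based callbacks exist). */
static void setup_dev_sketch(struct yaffs_dev *dev, struct mtd_info *mtd)
{
	dev->driver_context = mtd;	/* recovered later via yaffs_dev_to_mtd() */

	dev->param.total_bytes_per_chunk = 2048;	/* example 2KiB-page NAND */
	dev->param.chunks_per_block = 64;
	dev->param.n_reserved_blocks = 5;
	dev->param.start_block = 0;
	dev->param.end_block = 1023;
	dev->param.is_yaffs2 = 1;
	dev->param.n_caches = 10;

	dev->param.write_chunk_tags_fn = nandmtd2_write_chunk_tags;
	dev->param.read_chunk_tags_fn = nandmtd2_read_chunk_tags;
	dev->param.bad_block_fn = nandmtd2_mark_block_bad;
	dev->param.query_block_fn = nandmtd2_query_block;
	dev->param.erase_fn = nandmtd_erase_block;
	dev->param.initialise_flash_fn = nandmtd_initialise;
}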
*/ + int data_bytes_per_chunk; + + /* Non-wide tnode stuff */ + u16 chunk_grp_bits; /* Number of bits that need to be resolved if + * the tnodes are not wide enough. + */ + u16 chunk_grp_size; /* == 2^^chunk_grp_bits */ + + /* Stuff to support wide tnodes */ + u32 tnode_width; + u32 tnode_mask; + u32 tnode_size; + + /* Stuff for figuring out file offset to chunk conversions */ + u32 chunk_shift; /* Shift value */ + u32 chunk_div; /* Divisor after shifting: 1 for power-of-2 sizes */ + u32 chunk_mask; /* Mask to use for power-of-2 case */ + + int is_mounted; + int read_only; + int is_checkpointed; + + /* Stuff to support block offsetting to support start block zero */ + int internal_start_block; + int internal_end_block; + int block_offset; + int chunk_offset; + + /* Runtime checkpointing stuff */ + int checkpt_page_seq; /* running sequence number of checkpoint pages */ + int checkpt_byte_count; + int checkpt_byte_offs; + u8 *checkpt_buffer; + int checkpt_open_write; + int blocks_in_checkpt; + int checkpt_cur_chunk; + int checkpt_cur_block; + int checkpt_next_block; + int *checkpt_block_list; + int checkpt_max_blocks; + u32 checkpt_sum; + u32 checkpt_xor; + + int checkpoint_blocks_required; /* Number of blocks needed to store current checkpoint set */ + + /* Block Info */ + struct yaffs_block_info *block_info; + u8 *chunk_bits; /* bitmap of chunks in use */ + unsigned block_info_alt:1; /* was allocated using alternative strategy */ + unsigned chunk_bits_alt:1; /* was allocated using alternative strategy */ + int chunk_bit_stride; /* Number of bytes of chunk_bits per block. + * Must be consistent with chunks_per_block. + */ + + int n_erased_blocks; + int alloc_block; /* Current block being allocated off */ + u32 alloc_page; + int alloc_block_finder; /* Used to search for next allocation block */ + + /* Object and Tnode memory management */ + void *allocator; + int n_obj; + int n_tnodes; + + int n_hardlinks; + + struct yaffs_obj_bucket obj_bucket[YAFFS_NOBJECT_BUCKETS]; + u32 bucket_finder; + + int n_free_chunks; + + /* Garbage collection control */ + u32 *gc_cleanup_list; /* objects to delete at the end of a GC. */ + u32 n_clean_ups; + + unsigned has_pending_prioritised_gc; /* We think this device might have pending prioritised gcs */ + unsigned gc_disable; + unsigned gc_block_finder; + unsigned gc_dirtiest; + unsigned gc_pages_in_use; + unsigned gc_not_done; + unsigned gc_block; + unsigned gc_chunk; + unsigned gc_skip; + + /* Special directories */ + struct yaffs_obj *root_dir; + struct yaffs_obj *lost_n_found; + + /* Buffer areas for storing data to recover from write failures TODO + * u8 buffered_data[YAFFS_CHUNKS_PER_BLOCK][YAFFS_BYTES_PER_CHUNK]; + * struct yaffs_spare buffered_spare[YAFFS_CHUNKS_PER_BLOCK]; + */ + + int buffered_block; /* Which block is buffered here? */ + int doing_buffered_block_rewrite; + + struct yaffs_cache *cache; + int cache_last_use; + + /* Stuff for background deletion and unlinked files. */ + struct yaffs_obj *unlinked_dir; /* Directory where unlinked and deleted files live. */ + struct yaffs_obj *del_dir; /* Directory where deleted objects are sent to disappear. */ + struct yaffs_obj *unlinked_deletion; /* Current file being background deleted. */ + int n_deleted_files; /* Count of files awaiting deletion; */ + int n_unlinked_files; /* Count of unlinked files. */ + int n_bg_deletions; /* Count of background deletions. 
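The chunk_shift/chunk_div/chunk_mask fields above exist so the core can turn a file offset into a chunk index plus an offset inside that chunk. The helper below is reconstructed from the field comments rather than copied from the patch: a pure shift-and-mask when the chunk size is a power of 2, otherwise a divide by chunk_div followed by a remainder computed from data_bytes_per_chunk.

#include "yaffs_guts.h"

/* Illustrative offset-to-chunk conversion; assumes
 * data_bytes_per_chunk == chunk_div << chunk_shift. */
static void addr_to_chunk_sketch(struct yaffs_dev *dev, loff_t addr,
				 int *chunk_out, u32 *offset_out)
{
	int chunk = (int)(addr >> dev->chunk_shift);

	if (dev->chunk_div == 1) {
		/* Power-of-2 chunk size: the mask gives the in-chunk offset. */
		*offset_out = (u32)(addr & dev->chunk_mask);
	} else {
		/* General case: finish the division, then take the remainder. */
		chunk /= dev->chunk_div;
		*offset_out = (u32)(addr -
				    (loff_t)chunk * dev->data_bytes_per_chunk);
	}
	*chunk_out = chunk;
}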
*/ + + /* Temporary buffer management */ + struct yaffs_buffer temp_buffer[YAFFS_N_TEMP_BUFFERS]; + int max_temp; + int temp_in_use; + int unmanaged_buffer_allocs; + int unmanaged_buffer_deallocs; + + /* yaffs2 runtime stuff */ + unsigned seq_number; /* Sequence number of currently allocating block */ + unsigned oldest_dirty_seq; + unsigned oldest_dirty_block; + + /* Block refreshing */ + int refresh_skip; /* A skip down counter. Refresh happens when this gets to zero. */ + + /* Dirty directory handling */ + struct list_head dirty_dirs; /* List of dirty directories */ + + /* Statistcs */ + u32 n_page_writes; + u32 n_page_reads; + u32 n_erasures; + u32 n_erase_failures; + u32 n_gc_copies; + u32 all_gcs; + u32 passive_gc_count; + u32 oldest_dirty_gc_count; + u32 n_gc_blocks; + u32 bg_gcs; + u32 n_retired_writes; + u32 n_retired_blocks; + u32 n_ecc_fixed; + u32 n_ecc_unfixed; + u32 n_tags_ecc_fixed; + u32 n_tags_ecc_unfixed; + u32 n_deletions; + u32 n_unmarked_deletions; + u32 refresh_count; + u32 cache_hits; + +}; + +/* The CheckpointDevice structure holds the device information that changes at runtime and + * must be preserved over unmount/mount cycles. + */ +struct yaffs_checkpt_dev { + int struct_type; + int n_erased_blocks; + int alloc_block; /* Current block being allocated off */ + u32 alloc_page; + int n_free_chunks; + + int n_deleted_files; /* Count of files awaiting deletion; */ + int n_unlinked_files; /* Count of unlinked files. */ + int n_bg_deletions; /* Count of background deletions. */ + + /* yaffs2 runtime stuff */ + unsigned seq_number; /* Sequence number of currently allocating block */ + +}; + +struct yaffs_checkpt_validity { + int struct_type; + u32 magic; + u32 version; + u32 head; +}; + +struct yaffs_shadow_fixer { + int obj_id; + int shadowed_id; + struct yaffs_shadow_fixer *next; +}; + +/* Structure for doing xattr modifications */ +struct yaffs_xattr_mod { + int set; /* If 0 then this is a deletion */ + const YCHAR *name; + const void *data; + int size; + int flags; + int result; +}; + +/*----------------------- YAFFS Functions -----------------------*/ + +int yaffs_guts_initialise(struct yaffs_dev *dev); +void yaffs_deinitialise(struct yaffs_dev *dev); + +int yaffs_get_n_free_chunks(struct yaffs_dev *dev); + +int yaffs_rename_obj(struct yaffs_obj *old_dir, const YCHAR * old_name, + struct yaffs_obj *new_dir, const YCHAR * new_name); + +int yaffs_unlinker(struct yaffs_obj *dir, const YCHAR * name); +int yaffs_del_obj(struct yaffs_obj *obj); + +int yaffs_get_obj_name(struct yaffs_obj *obj, YCHAR * name, int buffer_size); +int yaffs_get_obj_length(struct yaffs_obj *obj); +int yaffs_get_obj_inode(struct yaffs_obj *obj); +unsigned yaffs_get_obj_type(struct yaffs_obj *obj); +int yaffs_get_obj_link_count(struct yaffs_obj *obj); + +/* File operations */ +int yaffs_file_rd(struct yaffs_obj *obj, u8 * buffer, loff_t offset, + int n_bytes); +int yaffs_wr_file(struct yaffs_obj *obj, const u8 * buffer, loff_t offset, + int n_bytes, int write_trhrough); +int yaffs_resize_file(struct yaffs_obj *obj, loff_t new_size); + +struct yaffs_obj *yaffs_create_file(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid); + +int yaffs_flush_file(struct yaffs_obj *obj, int update_time, int data_sync); + +/* Flushing and checkpointing */ +void yaffs_flush_whole_cache(struct yaffs_dev *dev); + +int yaffs_checkpoint_save(struct yaffs_dev *dev); +int yaffs_checkpoint_restore(struct yaffs_dev *dev); + +/* Directory operations */ +struct yaffs_obj *yaffs_create_dir(struct 
yaffs_obj *parent, const YCHAR * name, + u32 mode, u32 uid, u32 gid); +struct yaffs_obj *yaffs_find_by_name(struct yaffs_obj *the_dir, + const YCHAR * name); +struct yaffs_obj *yaffs_find_by_number(struct yaffs_dev *dev, u32 number); + +/* Link operations */ +struct yaffs_obj *yaffs_link_obj(struct yaffs_obj *parent, const YCHAR * name, + struct yaffs_obj *equiv_obj); + +struct yaffs_obj *yaffs_get_equivalent_obj(struct yaffs_obj *obj); + +/* Symlink operations */ +struct yaffs_obj *yaffs_create_symlink(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid, const YCHAR * alias); +YCHAR *yaffs_get_symlink_alias(struct yaffs_obj *obj); + +/* Special inodes (fifos, sockets and devices) */ +struct yaffs_obj *yaffs_create_special(struct yaffs_obj *parent, + const YCHAR * name, u32 mode, u32 uid, + u32 gid, u32 rdev); + +int yaffs_set_xattrib(struct yaffs_obj *obj, const YCHAR * name, + const void *value, int size, int flags); +int yaffs_get_xattrib(struct yaffs_obj *obj, const YCHAR * name, void *value, + int size); +int yaffs_list_xattrib(struct yaffs_obj *obj, char *buffer, int size); +int yaffs_remove_xattrib(struct yaffs_obj *obj, const YCHAR * name); + +/* Special directories */ +struct yaffs_obj *yaffs_root(struct yaffs_dev *dev); +struct yaffs_obj *yaffs_lost_n_found(struct yaffs_dev *dev); + +void yaffs_handle_defered_free(struct yaffs_obj *obj); + +void yaffs_update_dirty_dirs(struct yaffs_dev *dev); + +int yaffs_bg_gc(struct yaffs_dev *dev, unsigned urgency); + +/* Debug dump */ +int yaffs_dump_obj(struct yaffs_obj *obj); + +void yaffs_guts_test(struct yaffs_dev *dev); + +/* A few useful functions to be used within the core files*/ +void yaffs_chunk_del(struct yaffs_dev *dev, int chunk_id, int mark_flash, + int lyn); +int yaffs_check_ff(u8 * buffer, int n_bytes); +void yaffs_handle_chunk_error(struct yaffs_dev *dev, + struct yaffs_block_info *bi); + +u8 *yaffs_get_temp_buffer(struct yaffs_dev *dev, int line_no); +void yaffs_release_temp_buffer(struct yaffs_dev *dev, u8 * buffer, int line_no); + +struct yaffs_obj *yaffs_find_or_create_by_number(struct yaffs_dev *dev, + int number, + enum yaffs_obj_type type); +int yaffs_put_chunk_in_file(struct yaffs_obj *in, int inode_chunk, + int nand_chunk, int in_scan); +void yaffs_set_obj_name(struct yaffs_obj *obj, const YCHAR * name); +void yaffs_set_obj_name_from_oh(struct yaffs_obj *obj, + const struct yaffs_obj_hdr *oh); +void yaffs_add_obj_to_dir(struct yaffs_obj *directory, struct yaffs_obj *obj); +YCHAR *yaffs_clone_str(const YCHAR * str); +void yaffs_link_fixup(struct yaffs_dev *dev, struct yaffs_obj *hard_list); +void yaffs_block_became_dirty(struct yaffs_dev *dev, int block_no); +int yaffs_update_oh(struct yaffs_obj *in, const YCHAR * name, + int force, int is_shrink, int shadows, + struct yaffs_xattr_mod *xop); +void yaffs_handle_shadowed_obj(struct yaffs_dev *dev, int obj_id, + int backward_scanning); +int yaffs_check_alloc_available(struct yaffs_dev *dev, int n_chunks); +struct yaffs_tnode *yaffs_get_tnode(struct yaffs_dev *dev); +struct yaffs_tnode *yaffs_add_find_tnode_0(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct, + u32 chunk_id, + struct yaffs_tnode *passed_tn); + +int yaffs_do_file_wr(struct yaffs_obj *in, const u8 * buffer, loff_t offset, + int n_bytes, int write_trhrough); +void yaffs_resize_file_down(struct yaffs_obj *obj, loff_t new_size); +void yaffs_skip_rest_of_block(struct yaffs_dev *dev); + +int yaffs_count_free_chunks(struct yaffs_dev *dev); + +struct yaffs_tnode 
*yaffs_find_tnode_0(struct yaffs_dev *dev, + struct yaffs_file_var *file_struct, + u32 chunk_id); + +u32 yaffs_get_group_base(struct yaffs_dev *dev, struct yaffs_tnode *tn, + unsigned pos); + +int yaffs_is_non_empty_dir(struct yaffs_obj *obj); +#endif diff --git a/fs/yaffs2/yaffs_linux.h b/fs/yaffs2/yaffs_linux.h new file mode 100644 index 00000000..3b508cbc --- /dev/null +++ b/fs/yaffs2/yaffs_linux.h @@ -0,0 +1,41 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_LINUX_H__ +#define __YAFFS_LINUX_H__ + +#include "yportenv.h" + +struct yaffs_linux_context { + struct list_head context_list; /* List of these we have mounted */ + struct yaffs_dev *dev; + struct super_block *super; + struct task_struct *bg_thread; /* Background thread for this device */ + int bg_running; + struct mutex gross_lock; /* Gross locking mutex*/ + u8 *spare_buffer; /* For mtdif2 use. Don't know the size of the buffer + * at compile time so we have to allocate it. + */ + struct list_head search_contexts; + void (*put_super_fn) (struct super_block * sb); + + struct task_struct *readdir_process; + unsigned mount_id; +}; + +#define yaffs_dev_to_lc(dev) ((struct yaffs_linux_context *)((dev)->os_context)) +#define yaffs_dev_to_mtd(dev) ((struct mtd_info *)((dev)->driver_context)) + +#endif diff --git a/fs/yaffs2/yaffs_mtdif.c b/fs/yaffs2/yaffs_mtdif.c new file mode 100644 index 00000000..7cf53b3d --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif.c @@ -0,0 +1,54 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yportenv.h" + +#include "yaffs_mtdif.h" + +#include "linux/mtd/mtd.h" +#include "linux/types.h" +#include "linux/time.h" +#include "linux/mtd/nand.h" + +#include "yaffs_linux.h" + +int nandmtd_erase_block(struct yaffs_dev *dev, int block_no) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + u32 addr = + ((loff_t) block_no) * dev->param.total_bytes_per_chunk + * dev->param.chunks_per_block; + struct erase_info ei; + + int retval = 0; + + ei.mtd = mtd; + ei.addr = addr; + ei.len = dev->param.total_bytes_per_chunk * dev->param.chunks_per_block; + ei.time = 1000; + ei.retries = 2; + ei.callback = NULL; + ei.priv = (u_long) dev; + + retval = mtd->erase(mtd, &ei); + + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; +} + +int nandmtd_initialise(struct yaffs_dev *dev) +{ + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_mtdif.h b/fs/yaffs2/yaffs_mtdif.h new file mode 100644 index 00000000..66650741 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif.h @@ -0,0 +1,23 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. 
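struct yaffs_linux_context above gathers the Linux-side state for one mount and is reached through the two opaque pointers in struct yaffs_dev via the yaffs_dev_to_lc()/yaffs_dev_to_mtd() macros. A sketch of attaching such a context follows; the allocation and the fields touched are illustrative, and the real VFS glue also sets up the list heads, background thread and the rest.

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/mtd/mtd.h>
#include "yaffs_guts.h"
#include "yaffs_linux.h"

/* Illustrative only: hang a Linux context and the backing MTD off a
 * yaffs_dev so the accessor macros above can recover them. */
static int attach_context_sketch(struct yaffs_dev *dev,
				 struct super_block *sb, struct mtd_info *mtd)
{
	struct yaffs_linux_context *lc = kzalloc(sizeof(*lc), GFP_KERNEL);

	if (!lc)
		return -ENOMEM;

	lc->dev = dev;
	lc->super = sb;
	mutex_init(&lc->gross_lock);

	dev->os_context = lc;		/* yaffs_dev_to_lc(dev) now returns lc */
	dev->driver_context = mtd;	/* yaffs_dev_to_mtd(dev) now returns mtd */
	return 0;
}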
+ * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_MTDIF_H__ +#define __YAFFS_MTDIF_H__ + +#include "yaffs_guts.h" + +int nandmtd_erase_block(struct yaffs_dev *dev, int block_no); +int nandmtd_initialise(struct yaffs_dev *dev); +#endif diff --git a/fs/yaffs2/yaffs_mtdif1.c b/fs/yaffs2/yaffs_mtdif1.c new file mode 100644 index 00000000..51083695 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif1.c @@ -0,0 +1,330 @@ +/* + * YAFFS: Yet another FFS. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * This module provides the interface between yaffs_nand.c and the + * MTD API. This version is used when the MTD interface supports the + * 'mtd_oob_ops' style calls to read_oob and write_oob, circa 2.6.17, + * and we have small-page NAND device. + * + * These functions are invoked via function pointers in yaffs_nand.c. + * This replaces functionality provided by functions in yaffs_mtdif.c + * and the yaffs_tags compatability functions in yaffs_tagscompat.c that are + * called in yaffs_mtdif.c when the function pointers are NULL. + * We assume the MTD layer is performing ECC (use_nand_ecc is true). + */ + +#include "yportenv.h" +#include "yaffs_trace.h" +#include "yaffs_guts.h" +#include "yaffs_packedtags1.h" +#include "yaffs_tagscompat.h" /* for yaffs_calc_tags_ecc */ +#include "yaffs_linux.h" + +#include "linux/kernel.h" +#include "linux/version.h" +#include "linux/types.h" +#include "linux/mtd/mtd.h" + +#ifndef CONFIG_YAFFS_9BYTE_TAGS +# define YTAG1_SIZE 8 +#else +# define YTAG1_SIZE 9 +#endif + +/* Write a chunk (page) of data to NAND. + * + * Caller always provides ExtendedTags data which are converted to a more + * compact (packed) form for storage in NAND. A mini-ECC runs over the + * contents of the tags meta-data; used to valid the tags when read. + * + * - Pack ExtendedTags to packed_tags1 form + * - Compute mini-ECC for packed_tags1 + * - Write data and packed tags to NAND. + * + * Note: Due to the use of the packed_tags1 meta-data which does not include + * a full sequence number (as found in the larger packed_tags2 form) it is + * necessary for Yaffs to re-write a chunk/page (just once) to mark it as + * discarded and dirty. This is not ideal: newer NAND parts are supposed + * to be written just once. When Yaffs performs this operation, this + * function is called with a NULL data pointer -- calling MTD write_oob + * without data is valid usage (2.6.17). + * + * Any underlying MTD error results in YAFFS_FAIL. + * Returns YAFFS_OK or YAFFS_FAIL. 
+ */ +int nandmtd1_write_chunk_tags(struct yaffs_dev *dev, + int nand_chunk, const u8 * data, + const struct yaffs_ext_tags *etags) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int chunk_bytes = dev->data_bytes_per_chunk; + loff_t addr = ((loff_t) nand_chunk) * chunk_bytes; + struct mtd_oob_ops ops; + struct yaffs_packed_tags1 pt1; + int retval; + + /* we assume that packed_tags1 and struct yaffs_tags are compatible */ + compile_time_assertion(sizeof(struct yaffs_packed_tags1) == 12); + compile_time_assertion(sizeof(struct yaffs_tags) == 8); + + yaffs_pack_tags1(&pt1, etags); + yaffs_calc_tags_ecc((struct yaffs_tags *)&pt1); + + /* When deleting a chunk, the upper layer provides only skeletal + * etags, one with is_deleted set. However, we need to update the + * tags, not erase them completely. So we use the NAND write property + * that only zeroed-bits stick and set tag bytes to all-ones and + * zero just the (not) deleted bit. + */ +#ifndef CONFIG_YAFFS_9BYTE_TAGS + if (etags->is_deleted) { + memset(&pt1, 0xff, 8); + /* clear delete status bit to indicate deleted */ + pt1.deleted = 0; + } +#else + ((u8 *) & pt1)[8] = 0xff; + if (etags->is_deleted) { + memset(&pt1, 0xff, 8); + /* zero page_status byte to indicate deleted */ + ((u8 *) & pt1)[8] = 0; + } +#endif + + memset(&ops, 0, sizeof(ops)); + ops.mode = MTD_OOB_AUTO; + ops.len = (data) ? chunk_bytes : 0; + ops.ooblen = YTAG1_SIZE; + ops.datbuf = (u8 *) data; + ops.oobbuf = (u8 *) & pt1; + + retval = mtd->write_oob(mtd, addr, &ops); + if (retval) { + yaffs_trace(YAFFS_TRACE_MTD, + "write_oob failed, chunk %d, mtd error %d", + nand_chunk, retval); + } + return retval ? YAFFS_FAIL : YAFFS_OK; +} + +/* Return with empty ExtendedTags but add ecc_result. + */ +static int rettags(struct yaffs_ext_tags *etags, int ecc_result, int retval) +{ + if (etags) { + memset(etags, 0, sizeof(*etags)); + etags->ecc_result = ecc_result; + } + return retval; +} + +/* Read a chunk (page) from NAND. + * + * Caller expects ExtendedTags data to be usable even on error; that is, + * all members except ecc_result and block_bad are zeroed. + * + * - Check ECC results for data (if applicable) + * - Check for blank/erased block (return empty ExtendedTags if blank) + * - Check the packed_tags1 mini-ECC (correct if necessary/possible) + * - Convert packed_tags1 to ExtendedTags + * - Update ecc_result and block_bad members to refect state. + * + * Returns YAFFS_OK or YAFFS_FAIL. + */ +int nandmtd1_read_chunk_tags(struct yaffs_dev *dev, + int nand_chunk, u8 * data, + struct yaffs_ext_tags *etags) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int chunk_bytes = dev->data_bytes_per_chunk; + loff_t addr = ((loff_t) nand_chunk) * chunk_bytes; + int eccres = YAFFS_ECC_RESULT_NO_ERROR; + struct mtd_oob_ops ops; + struct yaffs_packed_tags1 pt1; + int retval; + int deleted; + + memset(&ops, 0, sizeof(ops)); + ops.mode = MTD_OOB_AUTO; + ops.len = (data) ? chunk_bytes : 0; + ops.ooblen = YTAG1_SIZE; + ops.datbuf = data; + ops.oobbuf = (u8 *) & pt1; + + /* Read page and oob using MTD. + * Check status and determine ECC result. + */ + retval = mtd->read_oob(mtd, addr, &ops); + if (retval) { + yaffs_trace(YAFFS_TRACE_MTD, + "read_oob failed, chunk %d, mtd error %d", + nand_chunk, retval); + } + + switch (retval) { + case 0: + /* no error */ + break; + + case -EUCLEAN: + /* MTD's ECC fixed the data */ + eccres = YAFFS_ECC_RESULT_FIXED; + dev->n_ecc_fixed++; + break; + + case -EBADMSG: + /* MTD's ECC could not fix the data */ + dev->n_ecc_unfixed++; + /* fall into... 
*/ + default: + rettags(etags, YAFFS_ECC_RESULT_UNFIXED, 0); + etags->block_bad = (mtd->block_isbad) (mtd, addr); + return YAFFS_FAIL; + } + + /* Check for a blank/erased chunk. + */ + if (yaffs_check_ff((u8 *) & pt1, 8)) { + /* when blank, upper layers want ecc_result to be <= NO_ERROR */ + return rettags(etags, YAFFS_ECC_RESULT_NO_ERROR, YAFFS_OK); + } +#ifndef CONFIG_YAFFS_9BYTE_TAGS + /* Read deleted status (bit) then return it to it's non-deleted + * state before performing tags mini-ECC check. pt1.deleted is + * inverted. + */ + deleted = !pt1.deleted; + pt1.deleted = 1; +#else + deleted = (yaffs_count_bits(((u8 *) & pt1)[8]) < 7); +#endif + + /* Check the packed tags mini-ECC and correct if necessary/possible. + */ + retval = yaffs_check_tags_ecc((struct yaffs_tags *)&pt1); + switch (retval) { + case 0: + /* no tags error, use MTD result */ + break; + case 1: + /* recovered tags-ECC error */ + dev->n_tags_ecc_fixed++; + if (eccres == YAFFS_ECC_RESULT_NO_ERROR) + eccres = YAFFS_ECC_RESULT_FIXED; + break; + default: + /* unrecovered tags-ECC error */ + dev->n_tags_ecc_unfixed++; + return rettags(etags, YAFFS_ECC_RESULT_UNFIXED, YAFFS_FAIL); + } + + /* Unpack the tags to extended form and set ECC result. + * [set should_be_ff just to keep yaffs_unpack_tags1 happy] + */ + pt1.should_be_ff = 0xFFFFFFFF; + yaffs_unpack_tags1(etags, &pt1); + etags->ecc_result = eccres; + + /* Set deleted state */ + etags->is_deleted = deleted; + return YAFFS_OK; +} + +/* Mark a block bad. + * + * This is a persistant state. + * Use of this function should be rare. + * + * Returns YAFFS_OK or YAFFS_FAIL. + */ +int nandmtd1_mark_block_bad(struct yaffs_dev *dev, int block_no) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int blocksize = dev->param.chunks_per_block * dev->data_bytes_per_chunk; + int retval; + + yaffs_trace(YAFFS_TRACE_BAD_BLOCKS, + "marking block %d bad", block_no); + + retval = mtd->block_markbad(mtd, (loff_t) blocksize * block_no); + return (retval) ? YAFFS_FAIL : YAFFS_OK; +} + +/* Check any MTD prerequists. + * + * Returns YAFFS_OK or YAFFS_FAIL. + */ +static int nandmtd1_test_prerequists(struct mtd_info *mtd) +{ + /* 2.6.18 has mtd->ecclayout->oobavail */ + /* 2.6.21 has mtd->ecclayout->oobavail and mtd->oobavail */ + int oobavail = mtd->ecclayout->oobavail; + + if (oobavail < YTAG1_SIZE) { + yaffs_trace(YAFFS_TRACE_ERROR, + "mtd device has only %d bytes for tags, need %d", + oobavail, YTAG1_SIZE); + return YAFFS_FAIL; + } + return YAFFS_OK; +} + +/* Query for the current state of a specific block. + * + * Examine the tags of the first chunk of the block and return the state: + * - YAFFS_BLOCK_STATE_DEAD, the block is marked bad + * - YAFFS_BLOCK_STATE_NEEDS_SCANNING, the block is in use + * - YAFFS_BLOCK_STATE_EMPTY, the block is clean + * + * Always returns YAFFS_OK. + */ +int nandmtd1_query_block(struct yaffs_dev *dev, int block_no, + enum yaffs_block_state *state_ptr, u32 * seq_ptr) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int chunk_num = block_no * dev->param.chunks_per_block; + loff_t addr = (loff_t) chunk_num * dev->data_bytes_per_chunk; + struct yaffs_ext_tags etags; + int state = YAFFS_BLOCK_STATE_DEAD; + int seqnum = 0; + int retval; + + /* We don't yet have a good place to test for MTD config prerequists. + * Do it here as we are called during the initial scan. 
+ */ + if (nandmtd1_test_prerequists(mtd) != YAFFS_OK) + return YAFFS_FAIL; + + retval = nandmtd1_read_chunk_tags(dev, chunk_num, NULL, &etags); + etags.block_bad = (mtd->block_isbad) (mtd, addr); + if (etags.block_bad) { + yaffs_trace(YAFFS_TRACE_BAD_BLOCKS, + "block %d is marked bad", block_no); + state = YAFFS_BLOCK_STATE_DEAD; + } else if (etags.ecc_result != YAFFS_ECC_RESULT_NO_ERROR) { + /* bad tags, need to look more closely */ + state = YAFFS_BLOCK_STATE_NEEDS_SCANNING; + } else if (etags.chunk_used) { + state = YAFFS_BLOCK_STATE_NEEDS_SCANNING; + seqnum = etags.seq_number; + } else { + state = YAFFS_BLOCK_STATE_EMPTY; + } + + *state_ptr = state; + *seq_ptr = seqnum; + + /* query always succeeds */ + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_mtdif1.h b/fs/yaffs2/yaffs_mtdif1.h new file mode 100644 index 00000000..07ce4524 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif1.h @@ -0,0 +1,29 @@ +/* + * YAFFS: Yet another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_MTDIF1_H__ +#define __YAFFS_MTDIF1_H__ + +int nandmtd1_write_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + const u8 * data, + const struct yaffs_ext_tags *tags); + +int nandmtd1_read_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + u8 * data, struct yaffs_ext_tags *tags); + +int nandmtd1_mark_block_bad(struct yaffs_dev *dev, int block_no); + +int nandmtd1_query_block(struct yaffs_dev *dev, int block_no, + enum yaffs_block_state *state, u32 * seq_number); + +#endif diff --git a/fs/yaffs2/yaffs_mtdif2.c b/fs/yaffs2/yaffs_mtdif2.c new file mode 100644 index 00000000..d1643df2 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif2.c @@ -0,0 +1,225 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* mtd interface for YAFFS2 */ + +#include "yportenv.h" +#include "yaffs_trace.h" + +#include "yaffs_mtdif2.h" + +#include "linux/mtd/mtd.h" +#include "linux/types.h" +#include "linux/time.h" + +#include "yaffs_packedtags2.h" + +#include "yaffs_linux.h" + +/* NB For use with inband tags.... + * We assume that the data buffer is of size total_bytes_per_chunk so that we can also + * use it to load the tags. + */ +int nandmtd2_write_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + const u8 * data, + const struct yaffs_ext_tags *tags) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + struct mtd_oob_ops ops; + int retval = 0; + + loff_t addr; + + struct yaffs_packed_tags2 pt; + + int packed_tags_size = + dev->param.no_tags_ecc ? sizeof(pt.t) : sizeof(pt); + void *packed_tags_ptr = + dev->param.no_tags_ecc ? 
(void *)&pt.t : (void *)&pt; + + yaffs_trace(YAFFS_TRACE_MTD, + "nandmtd2_write_chunk_tags chunk %d data %p tags %p", + nand_chunk, data, tags); + + addr = ((loff_t) nand_chunk) * dev->param.total_bytes_per_chunk; + + /* For yaffs2 writing there must be both data and tags. + * If we're using inband tags, then the tags are stuffed into + * the end of the data buffer. + */ + if (!data || !tags) + BUG(); + else if (dev->param.inband_tags) { + struct yaffs_packed_tags2_tags_only *pt2tp; + pt2tp = + (struct yaffs_packed_tags2_tags_only *)(data + + dev-> + data_bytes_per_chunk); + yaffs_pack_tags2_tags_only(pt2tp, tags); + } else { + yaffs_pack_tags2(&pt, tags, !dev->param.no_tags_ecc); + } + + ops.mode = MTD_OOB_AUTO; + ops.ooblen = (dev->param.inband_tags) ? 0 : packed_tags_size; + ops.len = dev->param.total_bytes_per_chunk; + ops.ooboffs = 0; + ops.datbuf = (u8 *) data; + ops.oobbuf = (dev->param.inband_tags) ? NULL : packed_tags_ptr; + retval = mtd->write_oob(mtd, addr, &ops); + + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; +} + +int nandmtd2_read_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + u8 * data, struct yaffs_ext_tags *tags) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + struct mtd_oob_ops ops; + + size_t dummy; + int retval = 0; + int local_data = 0; + + loff_t addr = ((loff_t) nand_chunk) * dev->param.total_bytes_per_chunk; + + struct yaffs_packed_tags2 pt; + + int packed_tags_size = + dev->param.no_tags_ecc ? sizeof(pt.t) : sizeof(pt); + void *packed_tags_ptr = + dev->param.no_tags_ecc ? (void *)&pt.t : (void *)&pt; + + yaffs_trace(YAFFS_TRACE_MTD, + "nandmtd2_read_chunk_tags chunk %d data %p tags %p", + nand_chunk, data, tags); + + if (dev->param.inband_tags) { + + if (!data) { + local_data = 1; + data = yaffs_get_temp_buffer(dev, __LINE__); + } + + } + + if (dev->param.inband_tags || (data && !tags)) + retval = mtd->read(mtd, addr, dev->param.total_bytes_per_chunk, + &dummy, data); + else if (tags) { + ops.mode = MTD_OOB_AUTO; + ops.ooblen = packed_tags_size; + ops.len = data ? 
dev->data_bytes_per_chunk : packed_tags_size; + ops.ooboffs = 0; + ops.datbuf = data; + ops.oobbuf = yaffs_dev_to_lc(dev)->spare_buffer; + retval = mtd->read_oob(mtd, addr, &ops); + } + + if (dev->param.inband_tags) { + if (tags) { + struct yaffs_packed_tags2_tags_only *pt2tp; + pt2tp = + (struct yaffs_packed_tags2_tags_only *)&data[dev-> + data_bytes_per_chunk]; + yaffs_unpack_tags2_tags_only(tags, pt2tp); + } + } else { + if (tags) { + memcpy(packed_tags_ptr, + yaffs_dev_to_lc(dev)->spare_buffer, + packed_tags_size); + yaffs_unpack_tags2(tags, &pt, !dev->param.no_tags_ecc); + } + } + + if (local_data) + yaffs_release_temp_buffer(dev, data, __LINE__); + + if (tags && retval == -EBADMSG + && tags->ecc_result == YAFFS_ECC_RESULT_NO_ERROR) { + tags->ecc_result = YAFFS_ECC_RESULT_UNFIXED; + dev->n_ecc_unfixed++; + } + if (tags && retval == -EUCLEAN + && tags->ecc_result == YAFFS_ECC_RESULT_NO_ERROR) { + tags->ecc_result = YAFFS_ECC_RESULT_FIXED; + dev->n_ecc_fixed++; + } + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; +} + +int nandmtd2_mark_block_bad(struct yaffs_dev *dev, int block_no) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int retval; + yaffs_trace(YAFFS_TRACE_MTD, + "nandmtd2_mark_block_bad %d", block_no); + + retval = + mtd->block_markbad(mtd, + block_no * dev->param.chunks_per_block * + dev->param.total_bytes_per_chunk); + + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; + +} + +int nandmtd2_query_block(struct yaffs_dev *dev, int block_no, + enum yaffs_block_state *state, u32 * seq_number) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(dev); + int retval; + + yaffs_trace(YAFFS_TRACE_MTD, "nandmtd2_query_block %d", block_no); + retval = + mtd->block_isbad(mtd, + block_no * dev->param.chunks_per_block * + dev->param.total_bytes_per_chunk); + + if (retval) { + yaffs_trace(YAFFS_TRACE_MTD, "block is bad"); + + *state = YAFFS_BLOCK_STATE_DEAD; + *seq_number = 0; + } else { + struct yaffs_ext_tags t; + nandmtd2_read_chunk_tags(dev, block_no * + dev->param.chunks_per_block, NULL, &t); + + if (t.chunk_used) { + *seq_number = t.seq_number; + *state = YAFFS_BLOCK_STATE_NEEDS_SCANNING; + } else { + *seq_number = 0; + *state = YAFFS_BLOCK_STATE_EMPTY; + } + } + yaffs_trace(YAFFS_TRACE_MTD, + "block is bad seq %d state %d", *seq_number, *state); + + if (retval == 0) + return YAFFS_OK; + else + return YAFFS_FAIL; +} + diff --git a/fs/yaffs2/yaffs_mtdif2.h b/fs/yaffs2/yaffs_mtdif2.h new file mode 100644 index 00000000..d8211261 --- /dev/null +++ b/fs/yaffs2/yaffs_mtdif2.h @@ -0,0 +1,29 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. 
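When dev->param.inband_tags is set, nandmtd2_write_chunk_tags() and nandmtd2_read_chunk_tags() above keep the packed tags at the tail of the data buffer rather than in the OOB area, which is why those paths require a buffer of total_bytes_per_chunk bytes. The helper below (its name is invented) simply restates the pt2tp pointer arithmetic from the code above:

#include "yaffs_guts.h"
#include "yaffs_packedtags2.h"

/* Illustrative inband-tags layout: data_bytes_per_chunk bytes of file
 * data followed by a yaffs_packed_tags2_tags_only record, all within one
 * total_bytes_per_chunk sized buffer. */
static void place_inband_tags_sketch(struct yaffs_dev *dev, u8 *chunk_buf,
				     const struct yaffs_ext_tags *tags)
{
	struct yaffs_packed_tags2_tags_only *pt2tp =
	    (struct yaffs_packed_tags2_tags_only *)(chunk_buf +
						    dev->data_bytes_per_chunk);

	yaffs_pack_tags2_tags_only(pt2tp, tags);
}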
+ */ + +#ifndef __YAFFS_MTDIF2_H__ +#define __YAFFS_MTDIF2_H__ + +#include "yaffs_guts.h" +int nandmtd2_write_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + const u8 * data, + const struct yaffs_ext_tags *tags); +int nandmtd2_read_chunk_tags(struct yaffs_dev *dev, int nand_chunk, + u8 * data, struct yaffs_ext_tags *tags); +int nandmtd2_mark_block_bad(struct yaffs_dev *dev, int block_no); +int nandmtd2_query_block(struct yaffs_dev *dev, int block_no, + enum yaffs_block_state *state, u32 * seq_number); + +#endif diff --git a/fs/yaffs2/yaffs_nameval.c b/fs/yaffs2/yaffs_nameval.c new file mode 100644 index 00000000..daa36f98 --- /dev/null +++ b/fs/yaffs2/yaffs_nameval.c @@ -0,0 +1,201 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * This simple implementation of a name-value store assumes a small number of values and fits + * into a small finite buffer. + * + * Each attribute is stored as a record: + * sizeof(int) bytes record size. + * strnlen+1 bytes name null terminated. + * nbytes value. + * ---------- + * total size stored in record size + * + * This code has not been tested with unicode yet. + */ + +#include "yaffs_nameval.h" + +#include "yportenv.h" + +static int nval_find(const char *xb, int xb_size, const YCHAR * name, + int *exist_size) +{ + int pos = 0; + int size; + + memcpy(&size, xb, sizeof(int)); + while (size > 0 && (size < xb_size) && (pos + size < xb_size)) { + if (strncmp + ((YCHAR *) (xb + pos + sizeof(int)), name, size) == 0) { + if (exist_size) + *exist_size = size; + return pos; + } + pos += size; + if (pos < xb_size - sizeof(int)) + memcpy(&size, xb + pos, sizeof(int)); + else + size = 0; + } + if (exist_size) + *exist_size = 0; + return -1; +} + +static int nval_used(const char *xb, int xb_size) +{ + int pos = 0; + int size; + + memcpy(&size, xb + pos, sizeof(int)); + while (size > 0 && (size < xb_size) && (pos + size < xb_size)) { + pos += size; + if (pos < xb_size - sizeof(int)) + memcpy(&size, xb + pos, sizeof(int)); + else + size = 0; + } + return pos; +} + +int nval_del(char *xb, int xb_size, const YCHAR * name) +{ + int pos = nval_find(xb, xb_size, name, NULL); + int size; + + if (pos >= 0 && pos < xb_size) { + /* Find size, shift rest over this record, then zero out the rest of buffer */ + memcpy(&size, xb + pos, sizeof(int)); + memcpy(xb + pos, xb + pos + size, xb_size - (pos + size)); + memset(xb + (xb_size - size), 0, size); + return 0; + } else { + return -ENODATA; + } +} + +int nval_set(char *xb, int xb_size, const YCHAR * name, const char *buf, + int bsize, int flags) +{ + int pos; + int namelen = strnlen(name, xb_size); + int reclen; + int size_exist = 0; + int space; + int start; + + pos = nval_find(xb, xb_size, name, &size_exist); + + if (flags & XATTR_CREATE && pos >= 0) + return -EEXIST; + if (flags & XATTR_REPLACE && pos < 0) + return -ENODATA; + + start = nval_used(xb, xb_size); + space = xb_size - start + size_exist; + + reclen = (sizeof(int) + namelen + 1 + bsize); + + if (reclen > space) + return -ENOSPC; + + if (pos >= 0) { + nval_del(xb, xb_size, name); + start = nval_used(xb, xb_size); + } + + pos = start; + + memcpy(xb + pos, &reclen, sizeof(int)); + pos += 
sizeof(int); + strncpy((YCHAR *) (xb + pos), name, reclen); + pos += (namelen + 1); + memcpy(xb + pos, buf, bsize); + return 0; +} + +int nval_get(const char *xb, int xb_size, const YCHAR * name, char *buf, + int bsize) +{ + int pos = nval_find(xb, xb_size, name, NULL); + int size; + + if (pos >= 0 && pos < xb_size) { + + memcpy(&size, xb + pos, sizeof(int)); + pos += sizeof(int); /* advance past record length */ + size -= sizeof(int); + + /* Advance over name string */ + while (xb[pos] && size > 0 && pos < xb_size) { + pos++; + size--; + } + /*Advance over NUL */ + pos++; + size--; + + if (size <= bsize) { + memcpy(buf, xb + pos, size); + return size; + } + + } + if (pos >= 0) + return -ERANGE; + else + return -ENODATA; +} + +int nval_list(const char *xb, int xb_size, char *buf, int bsize) +{ + int pos = 0; + int size; + int name_len; + int ncopied = 0; + int filled = 0; + + memcpy(&size, xb + pos, sizeof(int)); + while (size > sizeof(int) && size <= xb_size && (pos + size) < xb_size + && !filled) { + pos += sizeof(int); + size -= sizeof(int); + name_len = strnlen((YCHAR *) (xb + pos), size); + if (ncopied + name_len + 1 < bsize) { + memcpy(buf, xb + pos, name_len * sizeof(YCHAR)); + buf += name_len; + *buf = '\0'; + buf++; + if (sizeof(YCHAR) > 1) { + *buf = '\0'; + buf++; + } + ncopied += (name_len + 1); + } else { + filled = 1; + } + pos += size; + if (pos < xb_size - sizeof(int)) + memcpy(&size, xb + pos, sizeof(int)); + else + size = 0; + } + return ncopied; +} + +int nval_hasvalues(const char *xb, int xb_size) +{ + return nval_used(xb, xb_size) > 0; +} diff --git a/fs/yaffs2/yaffs_nameval.h b/fs/yaffs2/yaffs_nameval.h new file mode 100644 index 00000000..2bb02b62 --- /dev/null +++ b/fs/yaffs2/yaffs_nameval.h @@ -0,0 +1,28 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __NAMEVAL_H__ +#define __NAMEVAL_H__ + +#include "yportenv.h" + +int nval_del(char *xb, int xb_size, const YCHAR * name); +int nval_set(char *xb, int xb_size, const YCHAR * name, const char *buf, + int bsize, int flags); +int nval_get(const char *xb, int xb_size, const YCHAR * name, char *buf, + int bsize); +int nval_list(const char *xb, int xb_size, char *buf, int bsize); +int nval_hasvalues(const char *xb, int xb_size); +#endif diff --git a/fs/yaffs2/yaffs_nand.c b/fs/yaffs2/yaffs_nand.c new file mode 100644 index 00000000..e816cabf --- /dev/null +++ b/fs/yaffs2/yaffs_nand.c @@ -0,0 +1,127 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
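Given the record format described at the top of yaffs_nameval.c above (a leading int record size, the NUL-terminated name, then the value bytes), each attribute costs sizeof(int) + strlen(name) + 1 + value_size bytes. A worked example with hypothetical content: storing the name "user.note" (9 characters) with a 4-byte value on a 32-bit little-endian build takes 4 + 10 + 4 = 18 bytes and lays out as:

    offset  0: 12 00 00 00                                record size (18 == 0x12)
    offset  4: 'u' 's' 'e' 'r' '.' 'n' 'o' 't' 'e' 00     name plus terminating NUL
    offset 14: four value bytes
    offset 18: size field of the next record, or zero when the rest of the buffer is unused

nval_used() and nval_find() walk the buffer by repeatedly reading that leading size field until it is zero or runs off the end.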
+ */ + +#include "yaffs_nand.h" +#include "yaffs_tagscompat.h" +#include "yaffs_tagsvalidity.h" + +#include "yaffs_getblockinfo.h" + +int yaffs_rd_chunk_tags_nand(struct yaffs_dev *dev, int nand_chunk, + u8 * buffer, struct yaffs_ext_tags *tags) +{ + int result; + struct yaffs_ext_tags local_tags; + + int realigned_chunk = nand_chunk - dev->chunk_offset; + + dev->n_page_reads++; + + /* If there are no tags provided, use local tags to get prioritised gc working */ + if (!tags) + tags = &local_tags; + + if (dev->param.read_chunk_tags_fn) + result = + dev->param.read_chunk_tags_fn(dev, realigned_chunk, buffer, + tags); + else + result = yaffs_tags_compat_rd(dev, + realigned_chunk, buffer, tags); + if (tags && tags->ecc_result > YAFFS_ECC_RESULT_NO_ERROR) { + + struct yaffs_block_info *bi; + bi = yaffs_get_block_info(dev, + nand_chunk / + dev->param.chunks_per_block); + yaffs_handle_chunk_error(dev, bi); + } + + return result; +} + +int yaffs_wr_chunk_tags_nand(struct yaffs_dev *dev, + int nand_chunk, + const u8 * buffer, struct yaffs_ext_tags *tags) +{ + + dev->n_page_writes++; + + nand_chunk -= dev->chunk_offset; + + if (tags) { + tags->seq_number = dev->seq_number; + tags->chunk_used = 1; + if (!yaffs_validate_tags(tags)) { + yaffs_trace(YAFFS_TRACE_ERROR, "Writing uninitialised tags"); + YBUG(); + } + yaffs_trace(YAFFS_TRACE_WRITE, + "Writing chunk %d tags %d %d", + nand_chunk, tags->obj_id, tags->chunk_id); + } else { + yaffs_trace(YAFFS_TRACE_ERROR, "Writing with no tags"); + YBUG(); + } + + if (dev->param.write_chunk_tags_fn) + return dev->param.write_chunk_tags_fn(dev, nand_chunk, buffer, + tags); + else + return yaffs_tags_compat_wr(dev, nand_chunk, buffer, tags); +} + +int yaffs_mark_bad(struct yaffs_dev *dev, int block_no) +{ + block_no -= dev->block_offset; + + if (dev->param.bad_block_fn) + return dev->param.bad_block_fn(dev, block_no); + else + return yaffs_tags_compat_mark_bad(dev, block_no); +} + +int yaffs_query_init_block_state(struct yaffs_dev *dev, + int block_no, + enum yaffs_block_state *state, + u32 * seq_number) +{ + block_no -= dev->block_offset; + + if (dev->param.query_block_fn) + return dev->param.query_block_fn(dev, block_no, state, + seq_number); + else + return yaffs_tags_compat_query_block(dev, block_no, + state, seq_number); +} + +int yaffs_erase_block(struct yaffs_dev *dev, int flash_block) +{ + int result; + + flash_block -= dev->block_offset; + + dev->n_erasures++; + + result = dev->param.erase_fn(dev, flash_block); + + return result; +} + +int yaffs_init_nand(struct yaffs_dev *dev) +{ + if (dev->param.initialise_flash_fn) + return dev->param.initialise_flash_fn(dev); + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_nand.h b/fs/yaffs2/yaffs_nand.h new file mode 100644 index 00000000..543f1987 --- /dev/null +++ b/fs/yaffs2/yaffs_nand.h @@ -0,0 +1,38 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. 
+ */ + +#ifndef __YAFFS_NAND_H__ +#define __YAFFS_NAND_H__ +#include "yaffs_guts.h" + +int yaffs_rd_chunk_tags_nand(struct yaffs_dev *dev, int nand_chunk, + u8 * buffer, struct yaffs_ext_tags *tags); + +int yaffs_wr_chunk_tags_nand(struct yaffs_dev *dev, + int nand_chunk, + const u8 * buffer, struct yaffs_ext_tags *tags); + +int yaffs_mark_bad(struct yaffs_dev *dev, int block_no); + +int yaffs_query_init_block_state(struct yaffs_dev *dev, + int block_no, + enum yaffs_block_state *state, + unsigned *seq_number); + +int yaffs_erase_block(struct yaffs_dev *dev, int flash_block); + +int yaffs_init_nand(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yaffs_packedtags1.c b/fs/yaffs2/yaffs_packedtags1.c new file mode 100644 index 00000000..a77f0954 --- /dev/null +++ b/fs/yaffs2/yaffs_packedtags1.c @@ -0,0 +1,53 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_packedtags1.h" +#include "yportenv.h" + +void yaffs_pack_tags1(struct yaffs_packed_tags1 *pt, + const struct yaffs_ext_tags *t) +{ + pt->chunk_id = t->chunk_id; + pt->serial_number = t->serial_number; + pt->n_bytes = t->n_bytes; + pt->obj_id = t->obj_id; + pt->ecc = 0; + pt->deleted = (t->is_deleted) ? 0 : 1; + pt->unused_stuff = 0; + pt->should_be_ff = 0xFFFFFFFF; + +} + +void yaffs_unpack_tags1(struct yaffs_ext_tags *t, + const struct yaffs_packed_tags1 *pt) +{ + static const u8 all_ff[] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff + }; + + if (memcmp(all_ff, pt, sizeof(struct yaffs_packed_tags1))) { + t->block_bad = 0; + if (pt->should_be_ff != 0xFFFFFFFF) + t->block_bad = 1; + t->chunk_used = 1; + t->obj_id = pt->obj_id; + t->chunk_id = pt->chunk_id; + t->n_bytes = pt->n_bytes; + t->ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + t->is_deleted = (pt->deleted) ? 0 : 1; + t->serial_number = pt->serial_number; + } else { + memset(t, 0, sizeof(struct yaffs_ext_tags)); + } +} diff --git a/fs/yaffs2/yaffs_packedtags1.h b/fs/yaffs2/yaffs_packedtags1.h new file mode 100644 index 00000000..d6861ff5 --- /dev/null +++ b/fs/yaffs2/yaffs_packedtags1.h @@ -0,0 +1,39 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +/* This is used to pack YAFFS1 tags, not YAFFS2 tags. 
*/ + +#ifndef __YAFFS_PACKEDTAGS1_H__ +#define __YAFFS_PACKEDTAGS1_H__ + +#include "yaffs_guts.h" + +struct yaffs_packed_tags1 { + unsigned chunk_id:20; + unsigned serial_number:2; + unsigned n_bytes:10; + unsigned obj_id:18; + unsigned ecc:12; + unsigned deleted:1; + unsigned unused_stuff:1; + unsigned should_be_ff; + +}; + +void yaffs_pack_tags1(struct yaffs_packed_tags1 *pt, + const struct yaffs_ext_tags *t); +void yaffs_unpack_tags1(struct yaffs_ext_tags *t, + const struct yaffs_packed_tags1 *pt); +#endif diff --git a/fs/yaffs2/yaffs_packedtags2.c b/fs/yaffs2/yaffs_packedtags2.c new file mode 100644 index 00000000..8e7fea3d --- /dev/null +++ b/fs/yaffs2/yaffs_packedtags2.c @@ -0,0 +1,196 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_packedtags2.h" +#include "yportenv.h" +#include "yaffs_trace.h" +#include "yaffs_tagsvalidity.h" + +/* This code packs a set of extended tags into a binary structure for + * NAND storage. + */ + +/* Some of the information is "extra" stuff which can be packed in to + * speed up scanning. + * This is defined by having the EXTRA_HEADER_INFO_FLAG set. + */ + +/* Extra flags applied to chunk_id */ + +#define EXTRA_HEADER_INFO_FLAG 0x80000000 +#define EXTRA_SHRINK_FLAG 0x40000000 +#define EXTRA_SHADOWS_FLAG 0x20000000 +#define EXTRA_SPARE_FLAGS 0x10000000 + +#define ALL_EXTRA_FLAGS 0xF0000000 + +/* Also, the top 4 bits of the object Id are set to the object type.
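+ * + * For example (illustrative values, not taken from the patch): a file header + * chunk for object 0x101 whose parent is object 0x002 would be packed with + * chunk_id = EXTRA_HEADER_INFO_FLAG | 0x002 and + * obj_id = 0x101 | (YAFFS_OBJECT_TYPE_FILE << EXTRA_OBJECT_TYPE_SHIFT), + * while n_bytes carries the file length; see yaffs_pack_tags2_tags_only().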
*/ +#define EXTRA_OBJECT_TYPE_SHIFT (28) +#define EXTRA_OBJECT_TYPE_MASK ((0x0F) << EXTRA_OBJECT_TYPE_SHIFT) + +static void yaffs_dump_packed_tags2_tags_only(const struct + yaffs_packed_tags2_tags_only *ptt) +{ + yaffs_trace(YAFFS_TRACE_MTD, + "packed tags obj %d chunk %d byte %d seq %d", + ptt->obj_id, ptt->chunk_id, ptt->n_bytes, ptt->seq_number); +} + +static void yaffs_dump_packed_tags2(const struct yaffs_packed_tags2 *pt) +{ + yaffs_dump_packed_tags2_tags_only(&pt->t); +} + +static void yaffs_dump_tags2(const struct yaffs_ext_tags *t) +{ + yaffs_trace(YAFFS_TRACE_MTD, + "ext.tags eccres %d blkbad %d chused %d obj %d chunk%d byte %d del %d ser %d seq %d", + t->ecc_result, t->block_bad, t->chunk_used, t->obj_id, + t->chunk_id, t->n_bytes, t->is_deleted, t->serial_number, + t->seq_number); + +} + +void yaffs_pack_tags2_tags_only(struct yaffs_packed_tags2_tags_only *ptt, + const struct yaffs_ext_tags *t) +{ + ptt->chunk_id = t->chunk_id; + ptt->seq_number = t->seq_number; + ptt->n_bytes = t->n_bytes; + ptt->obj_id = t->obj_id; + + if (t->chunk_id == 0 && t->extra_available) { + /* Store the extra header info instead */ + /* We save the parent object in the chunk_id */ + ptt->chunk_id = EXTRA_HEADER_INFO_FLAG | t->extra_parent_id; + if (t->extra_is_shrink) + ptt->chunk_id |= EXTRA_SHRINK_FLAG; + if (t->extra_shadows) + ptt->chunk_id |= EXTRA_SHADOWS_FLAG; + + ptt->obj_id &= ~EXTRA_OBJECT_TYPE_MASK; + ptt->obj_id |= (t->extra_obj_type << EXTRA_OBJECT_TYPE_SHIFT); + + if (t->extra_obj_type == YAFFS_OBJECT_TYPE_HARDLINK) + ptt->n_bytes = t->extra_equiv_id; + else if (t->extra_obj_type == YAFFS_OBJECT_TYPE_FILE) + ptt->n_bytes = t->extra_length; + else + ptt->n_bytes = 0; + } + + yaffs_dump_packed_tags2_tags_only(ptt); + yaffs_dump_tags2(t); +} + +void yaffs_pack_tags2(struct yaffs_packed_tags2 *pt, + const struct yaffs_ext_tags *t, int tags_ecc) +{ + yaffs_pack_tags2_tags_only(&pt->t, t); + + if (tags_ecc) + yaffs_ecc_calc_other((unsigned char *)&pt->t, + sizeof(struct + yaffs_packed_tags2_tags_only), + &pt->ecc); +} + +void yaffs_unpack_tags2_tags_only(struct yaffs_ext_tags *t, + struct yaffs_packed_tags2_tags_only *ptt) +{ + + memset(t, 0, sizeof(struct yaffs_ext_tags)); + + yaffs_init_tags(t); + + if (ptt->seq_number != 0xFFFFFFFF) { + t->block_bad = 0; + t->chunk_used = 1; + t->obj_id = ptt->obj_id; + t->chunk_id = ptt->chunk_id; + t->n_bytes = ptt->n_bytes; + t->is_deleted = 0; + t->serial_number = 0; + t->seq_number = ptt->seq_number; + + /* Do extra header info stuff */ + + if (ptt->chunk_id & EXTRA_HEADER_INFO_FLAG) { + t->chunk_id = 0; + t->n_bytes = 0; + + t->extra_available = 1; + t->extra_parent_id = + ptt->chunk_id & (~(ALL_EXTRA_FLAGS)); + t->extra_is_shrink = + (ptt->chunk_id & EXTRA_SHRINK_FLAG) ? 1 : 0; + t->extra_shadows = + (ptt->chunk_id & EXTRA_SHADOWS_FLAG) ? 
1 : 0; + t->extra_obj_type = + ptt->obj_id >> EXTRA_OBJECT_TYPE_SHIFT; + t->obj_id &= ~EXTRA_OBJECT_TYPE_MASK; + + if (t->extra_obj_type == YAFFS_OBJECT_TYPE_HARDLINK) + t->extra_equiv_id = ptt->n_bytes; + else + t->extra_length = ptt->n_bytes; + } + } + + yaffs_dump_packed_tags2_tags_only(ptt); + yaffs_dump_tags2(t); + +} + +void yaffs_unpack_tags2(struct yaffs_ext_tags *t, struct yaffs_packed_tags2 *pt, + int tags_ecc) +{ + + enum yaffs_ecc_result ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + + if (pt->t.seq_number != 0xFFFFFFFF && tags_ecc) { + /* Chunk is in use and we need to do ECC */ + + struct yaffs_ecc_other ecc; + int result; + yaffs_ecc_calc_other((unsigned char *)&pt->t, + sizeof(struct + yaffs_packed_tags2_tags_only), + &ecc); + result = + yaffs_ecc_correct_other((unsigned char *)&pt->t, + sizeof(struct + yaffs_packed_tags2_tags_only), + &pt->ecc, &ecc); + switch (result) { + case 0: + ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + break; + case 1: + ecc_result = YAFFS_ECC_RESULT_FIXED; + break; + case -1: + ecc_result = YAFFS_ECC_RESULT_UNFIXED; + break; + default: + ecc_result = YAFFS_ECC_RESULT_UNKNOWN; + } + } + + yaffs_unpack_tags2_tags_only(t, &pt->t); + + t->ecc_result = ecc_result; + + yaffs_dump_packed_tags2(pt); + yaffs_dump_tags2(t); +} diff --git a/fs/yaffs2/yaffs_packedtags2.h b/fs/yaffs2/yaffs_packedtags2.h new file mode 100644 index 00000000..f3296697 --- /dev/null +++ b/fs/yaffs2/yaffs_packedtags2.h @@ -0,0 +1,47 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +/* This is used to pack YAFFS2 tags, not YAFFS1tags. */ + +#ifndef __YAFFS_PACKEDTAGS2_H__ +#define __YAFFS_PACKEDTAGS2_H__ + +#include "yaffs_guts.h" +#include "yaffs_ecc.h" + +struct yaffs_packed_tags2_tags_only { + unsigned seq_number; + unsigned obj_id; + unsigned chunk_id; + unsigned n_bytes; +}; + +struct yaffs_packed_tags2 { + struct yaffs_packed_tags2_tags_only t; + struct yaffs_ecc_other ecc; +}; + +/* Full packed tags with ECC, used for oob tags */ +void yaffs_pack_tags2(struct yaffs_packed_tags2 *pt, + const struct yaffs_ext_tags *t, int tags_ecc); +void yaffs_unpack_tags2(struct yaffs_ext_tags *t, struct yaffs_packed_tags2 *pt, + int tags_ecc); + +/* Only the tags part (no ECC for use with inband tags */ +void yaffs_pack_tags2_tags_only(struct yaffs_packed_tags2_tags_only *pt, + const struct yaffs_ext_tags *t); +void yaffs_unpack_tags2_tags_only(struct yaffs_ext_tags *t, + struct yaffs_packed_tags2_tags_only *pt); +#endif diff --git a/fs/yaffs2/yaffs_tagscompat.c b/fs/yaffs2/yaffs_tagscompat.c new file mode 100644 index 00000000..7578075d --- /dev/null +++ b/fs/yaffs2/yaffs_tagscompat.c @@ -0,0 +1,422 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_guts.h" +#include "yaffs_tagscompat.h" +#include "yaffs_ecc.h" +#include "yaffs_getblockinfo.h" +#include "yaffs_trace.h" + +static void yaffs_handle_rd_data_error(struct yaffs_dev *dev, int nand_chunk); + + +/********** Tags ECC calculations *********/ + +void yaffs_calc_ecc(const u8 * data, struct yaffs_spare *spare) +{ + yaffs_ecc_cacl(data, spare->ecc1); + yaffs_ecc_cacl(&data[256], spare->ecc2); +} + +void yaffs_calc_tags_ecc(struct yaffs_tags *tags) +{ + /* Calculate an ecc */ + + unsigned char *b = ((union yaffs_tags_union *)tags)->as_bytes; + unsigned i, j; + unsigned ecc = 0; + unsigned bit = 0; + + tags->ecc = 0; + + for (i = 0; i < 8; i++) { + for (j = 1; j & 0xff; j <<= 1) { + bit++; + if (b[i] & j) + ecc ^= bit; + } + } + + tags->ecc = ecc; + +} + +int yaffs_check_tags_ecc(struct yaffs_tags *tags) +{ + unsigned ecc = tags->ecc; + + yaffs_calc_tags_ecc(tags); + + ecc ^= tags->ecc; + + if (ecc && ecc <= 64) { + /* TODO: Handle the failure better. Retire? */ + unsigned char *b = ((union yaffs_tags_union *)tags)->as_bytes; + + ecc--; + + b[ecc / 8] ^= (1 << (ecc & 7)); + + /* Now recalc the ecc */ + yaffs_calc_tags_ecc(tags); + + return 1; /* recovered error */ + } else if (ecc) { + /* Weird ecc failure value */ + /* TODO: Need to do something here */ + return -1; /* unrecovered error */ + } + + return 0; +} + +/********** Tags **********/ + +static void yaffs_load_tags_to_spare(struct yaffs_spare *spare_ptr, + struct yaffs_tags *tags_ptr) +{ + union yaffs_tags_union *tu = (union yaffs_tags_union *)tags_ptr; + + yaffs_calc_tags_ecc(tags_ptr); + + spare_ptr->tb0 = tu->as_bytes[0]; + spare_ptr->tb1 = tu->as_bytes[1]; + spare_ptr->tb2 = tu->as_bytes[2]; + spare_ptr->tb3 = tu->as_bytes[3]; + spare_ptr->tb4 = tu->as_bytes[4]; + spare_ptr->tb5 = tu->as_bytes[5]; + spare_ptr->tb6 = tu->as_bytes[6]; + spare_ptr->tb7 = tu->as_bytes[7]; +} + +static void yaffs_get_tags_from_spare(struct yaffs_dev *dev, + struct yaffs_spare *spare_ptr, + struct yaffs_tags *tags_ptr) +{ + union yaffs_tags_union *tu = (union yaffs_tags_union *)tags_ptr; + int result; + + tu->as_bytes[0] = spare_ptr->tb0; + tu->as_bytes[1] = spare_ptr->tb1; + tu->as_bytes[2] = spare_ptr->tb2; + tu->as_bytes[3] = spare_ptr->tb3; + tu->as_bytes[4] = spare_ptr->tb4; + tu->as_bytes[5] = spare_ptr->tb5; + tu->as_bytes[6] = spare_ptr->tb6; + tu->as_bytes[7] = spare_ptr->tb7; + + result = yaffs_check_tags_ecc(tags_ptr); + if (result > 0) + dev->n_tags_ecc_fixed++; + else if (result < 0) + dev->n_tags_ecc_unfixed++; +} + +static void yaffs_spare_init(struct yaffs_spare *spare) +{ + memset(spare, 0xFF, sizeof(struct yaffs_spare)); +} + +static int yaffs_wr_nand(struct yaffs_dev *dev, + int nand_chunk, const u8 * data, + struct yaffs_spare *spare) +{ + if (nand_chunk < dev->param.start_block * dev->param.chunks_per_block) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>> yaffs chunk %d is not valid", + nand_chunk); + return YAFFS_FAIL; + } + + return dev->param.write_chunk_fn(dev, nand_chunk, data, spare); +} + +static int yaffs_rd_chunk_nand(struct yaffs_dev *dev, + int nand_chunk, + u8 * data, + struct yaffs_spare *spare, + enum yaffs_ecc_result *ecc_result, + int correct_errors) +{ + int ret_val; + struct yaffs_spare local_spare; + + if (!spare && data) { + /* If we don't have a real spare, then we use a local one.
*/ + /* Need this for the calculation of the ecc */ + spare = &local_spare; + } + + if (!dev->param.use_nand_ecc) { + ret_val = + dev->param.read_chunk_fn(dev, nand_chunk, data, spare); + if (data && correct_errors) { + /* Do ECC correction */ + /* Todo handle any errors */ + int ecc_result1, ecc_result2; + u8 calc_ecc[3]; + + yaffs_ecc_cacl(data, calc_ecc); + ecc_result1 = + yaffs_ecc_correct(data, spare->ecc1, calc_ecc); + yaffs_ecc_cacl(&data[256], calc_ecc); + ecc_result2 = + yaffs_ecc_correct(&data[256], spare->ecc2, + calc_ecc); + + if (ecc_result1 > 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>yaffs ecc error fix performed on chunk %d:0", + nand_chunk); + dev->n_ecc_fixed++; + } else if (ecc_result1 < 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>yaffs ecc error unfixed on chunk %d:0", + nand_chunk); + dev->n_ecc_unfixed++; + } + + if (ecc_result2 > 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>yaffs ecc error fix performed on chunk %d:1", + nand_chunk); + dev->n_ecc_fixed++; + } else if (ecc_result2 < 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>yaffs ecc error unfixed on chunk %d:1", + nand_chunk); + dev->n_ecc_unfixed++; + } + + if (ecc_result1 || ecc_result2) { + /* We had a data problem on this page */ + yaffs_handle_rd_data_error(dev, nand_chunk); + } + + if (ecc_result1 < 0 || ecc_result2 < 0) + *ecc_result = YAFFS_ECC_RESULT_UNFIXED; + else if (ecc_result1 > 0 || ecc_result2 > 0) + *ecc_result = YAFFS_ECC_RESULT_FIXED; + else + *ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + } + } else { + /* Must allocate enough memory for spare+2*sizeof(int) */ + /* for ecc results from device. */ + struct yaffs_nand_spare nspare; + + memset(&nspare, 0, sizeof(nspare)); + + ret_val = dev->param.read_chunk_fn(dev, nand_chunk, data, + (struct yaffs_spare *) + &nspare); + memcpy(spare, &nspare, sizeof(struct yaffs_spare)); + if (data && correct_errors) { + if (nspare.eccres1 > 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>mtd ecc error fix performed on chunk %d:0", + nand_chunk); + } else if (nspare.eccres1 < 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>mtd ecc error unfixed on chunk %d:0", + nand_chunk); + } + + if (nspare.eccres2 > 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>mtd ecc error fix performed on chunk %d:1", + nand_chunk); + } else if (nspare.eccres2 < 0) { + yaffs_trace(YAFFS_TRACE_ERROR, + "**>>mtd ecc error unfixed on chunk %d:1", + nand_chunk); + } + + if (nspare.eccres1 || nspare.eccres2) { + /* We had a data problem on this page */ + yaffs_handle_rd_data_error(dev, nand_chunk); + } + + if (nspare.eccres1 < 0 || nspare.eccres2 < 0) + *ecc_result = YAFFS_ECC_RESULT_UNFIXED; + else if (nspare.eccres1 > 0 || nspare.eccres2 > 0) + *ecc_result = YAFFS_ECC_RESULT_FIXED; + else + *ecc_result = YAFFS_ECC_RESULT_NO_ERROR; + + } + } + return ret_val; +} + +/* + * Functions for robustisizing + */ + +static void yaffs_handle_rd_data_error(struct yaffs_dev *dev, int nand_chunk) +{ + int flash_block = nand_chunk / dev->param.chunks_per_block; + + /* Mark the block for retirement */ + yaffs_get_block_info(dev, + flash_block + dev->block_offset)->needs_retiring = + 1; + yaffs_trace(YAFFS_TRACE_ERROR | YAFFS_TRACE_BAD_BLOCKS, + "**>>Block %d marked for retirement", + flash_block); + + /* TODO: + * Just do a garbage collection on the affected block + * then retire the block + * NB recursion + */ +} + +int yaffs_tags_compat_wr(struct yaffs_dev *dev, + int nand_chunk, + const u8 * data, const struct yaffs_ext_tags *ext_tags) +{ + struct yaffs_spare spare; + struct yaffs_tags tags; + + 
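/* Build a YAFFS1-style spare for this chunk: either mark the page as + * deleted, or pack obj_id, chunk_id, the split byte count and serial + * number into the tag bytes, adding a software ECC over the data when + * the MTD is not doing ECC itself, then write the chunk out. + */ +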
yaffs_spare_init(&spare); + + if (ext_tags->is_deleted) + spare.page_status = 0; + else { + tags.obj_id = ext_tags->obj_id; + tags.chunk_id = ext_tags->chunk_id; + + tags.n_bytes_lsb = ext_tags->n_bytes & 0x3ff; + + if (dev->data_bytes_per_chunk >= 1024) + tags.n_bytes_msb = (ext_tags->n_bytes >> 10) & 3; + else + tags.n_bytes_msb = 3; + + tags.serial_number = ext_tags->serial_number; + + if (!dev->param.use_nand_ecc && data) + yaffs_calc_ecc(data, &spare); + + yaffs_load_tags_to_spare(&spare, &tags); + + } + + return yaffs_wr_nand(dev, nand_chunk, data, &spare); +} + +int yaffs_tags_compat_rd(struct yaffs_dev *dev, + int nand_chunk, + u8 * data, struct yaffs_ext_tags *ext_tags) +{ + + struct yaffs_spare spare; + struct yaffs_tags tags; + enum yaffs_ecc_result ecc_result = YAFFS_ECC_RESULT_UNKNOWN; + + static struct yaffs_spare spare_ff; + static int init; + + if (!init) { + memset(&spare_ff, 0xFF, sizeof(spare_ff)); + init = 1; + } + + if (yaffs_rd_chunk_nand(dev, nand_chunk, data, &spare, &ecc_result, 1)) { + /* ext_tags may be NULL */ + if (ext_tags) { + + int deleted = + (hweight8(spare.page_status) < 7) ? 1 : 0; + + ext_tags->is_deleted = deleted; + ext_tags->ecc_result = ecc_result; + ext_tags->block_bad = 0; /* We're reading it */ + /* therefore it is not a bad block */ + ext_tags->chunk_used = + (memcmp(&spare_ff, &spare, sizeof(spare_ff)) != + 0) ? 1 : 0; + + if (ext_tags->chunk_used) { + yaffs_get_tags_from_spare(dev, &spare, &tags); + + ext_tags->obj_id = tags.obj_id; + ext_tags->chunk_id = tags.chunk_id; + ext_tags->n_bytes = tags.n_bytes_lsb; + + if (dev->data_bytes_per_chunk >= 1024) + ext_tags->n_bytes |= + (((unsigned)tags. + n_bytes_msb) << 10); + + ext_tags->serial_number = tags.serial_number; + } + } + + return YAFFS_OK; + } else { + return YAFFS_FAIL; + } +} + +int yaffs_tags_compat_mark_bad(struct yaffs_dev *dev, int flash_block) +{ + + struct yaffs_spare spare; + + memset(&spare, 0xff, sizeof(struct yaffs_spare)); + + spare.block_status = 'Y'; + + yaffs_wr_nand(dev, flash_block * dev->param.chunks_per_block, NULL, + &spare); + yaffs_wr_nand(dev, flash_block * dev->param.chunks_per_block + 1, + NULL, &spare); + + return YAFFS_OK; + +} + +int yaffs_tags_compat_query_block(struct yaffs_dev *dev, + int block_no, + enum yaffs_block_state *state, + u32 * seq_number) +{ + + struct yaffs_spare spare0, spare1; + static struct yaffs_spare spare_ff; + static int init; + enum yaffs_ecc_result dummy; + + if (!init) { + memset(&spare_ff, 0xFF, sizeof(spare_ff)); + init = 1; + } + + *seq_number = 0; + + yaffs_rd_chunk_nand(dev, block_no * dev->param.chunks_per_block, NULL, + &spare0, &dummy, 1); + yaffs_rd_chunk_nand(dev, block_no * dev->param.chunks_per_block + 1, + NULL, &spare1, &dummy, 1); + + if (hweight8(spare0.block_status & spare1.block_status) < 7) + *state = YAFFS_BLOCK_STATE_DEAD; + else if (memcmp(&spare_ff, &spare0, sizeof(spare_ff)) == 0) + *state = YAFFS_BLOCK_STATE_EMPTY; + else + *state = YAFFS_BLOCK_STATE_NEEDS_SCANNING; + + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_tagscompat.h b/fs/yaffs2/yaffs_tagscompat.h new file mode 100644 index 00000000..8cd35dcd --- /dev/null +++ b/fs/yaffs2/yaffs_tagscompat.h @@ -0,0 +1,36 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. 
+ * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_TAGSCOMPAT_H__ +#define __YAFFS_TAGSCOMPAT_H__ + +#include "yaffs_guts.h" +int yaffs_tags_compat_wr(struct yaffs_dev *dev, + int nand_chunk, + const u8 * data, const struct yaffs_ext_tags *tags); +int yaffs_tags_compat_rd(struct yaffs_dev *dev, + int nand_chunk, + u8 * data, struct yaffs_ext_tags *tags); +int yaffs_tags_compat_mark_bad(struct yaffs_dev *dev, int block_no); +int yaffs_tags_compat_query_block(struct yaffs_dev *dev, + int block_no, + enum yaffs_block_state *state, + u32 * seq_number); + +void yaffs_calc_tags_ecc(struct yaffs_tags *tags); +int yaffs_check_tags_ecc(struct yaffs_tags *tags); +int yaffs_count_bits(u8 byte); + +#endif diff --git a/fs/yaffs2/yaffs_tagsvalidity.c b/fs/yaffs2/yaffs_tagsvalidity.c new file mode 100644 index 00000000..4358d79d --- /dev/null +++ b/fs/yaffs2/yaffs_tagsvalidity.c @@ -0,0 +1,27 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_tagsvalidity.h" + +void yaffs_init_tags(struct yaffs_ext_tags *tags) +{ + memset(tags, 0, sizeof(struct yaffs_ext_tags)); + tags->validity0 = 0xAAAAAAAA; + tags->validity1 = 0x55555555; +} + +int yaffs_validate_tags(struct yaffs_ext_tags *tags) +{ + return (tags->validity0 == 0xAAAAAAAA && tags->validity1 == 0x55555555); + +} diff --git a/fs/yaffs2/yaffs_tagsvalidity.h b/fs/yaffs2/yaffs_tagsvalidity.h new file mode 100644 index 00000000..36a021fc --- /dev/null +++ b/fs/yaffs2/yaffs_tagsvalidity.h @@ -0,0 +1,23 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_TAGS_VALIDITY_H__ +#define __YAFFS_TAGS_VALIDITY_H__ + +#include "yaffs_guts.h" + +void yaffs_init_tags(struct yaffs_ext_tags *tags); +int yaffs_validate_tags(struct yaffs_ext_tags *tags); +#endif diff --git a/fs/yaffs2/yaffs_trace.h b/fs/yaffs2/yaffs_trace.h new file mode 100644 index 00000000..6273dbf9 --- /dev/null +++ b/fs/yaffs2/yaffs_trace.h @@ -0,0 +1,57 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. 
+ * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YTRACE_H__ +#define __YTRACE_H__ + +extern unsigned int yaffs_trace_mask; +extern unsigned int yaffs_wr_attempts; + +/* + * Tracing flags. + * The flags masked in YAFFS_TRACE_ALWAYS are always traced. + */ + +#define YAFFS_TRACE_OS 0x00000002 +#define YAFFS_TRACE_ALLOCATE 0x00000004 +#define YAFFS_TRACE_SCAN 0x00000008 +#define YAFFS_TRACE_BAD_BLOCKS 0x00000010 +#define YAFFS_TRACE_ERASE 0x00000020 +#define YAFFS_TRACE_GC 0x00000040 +#define YAFFS_TRACE_WRITE 0x00000080 +#define YAFFS_TRACE_TRACING 0x00000100 +#define YAFFS_TRACE_DELETION 0x00000200 +#define YAFFS_TRACE_BUFFERS 0x00000400 +#define YAFFS_TRACE_NANDACCESS 0x00000800 +#define YAFFS_TRACE_GC_DETAIL 0x00001000 +#define YAFFS_TRACE_SCAN_DEBUG 0x00002000 +#define YAFFS_TRACE_MTD 0x00004000 +#define YAFFS_TRACE_CHECKPOINT 0x00008000 + +#define YAFFS_TRACE_VERIFY 0x00010000 +#define YAFFS_TRACE_VERIFY_NAND 0x00020000 +#define YAFFS_TRACE_VERIFY_FULL 0x00040000 +#define YAFFS_TRACE_VERIFY_ALL 0x000F0000 + +#define YAFFS_TRACE_SYNC 0x00100000 +#define YAFFS_TRACE_BACKGROUND 0x00200000 +#define YAFFS_TRACE_LOCK 0x00400000 +#define YAFFS_TRACE_MOUNT 0x00800000 + +#define YAFFS_TRACE_ERROR 0x40000000 +#define YAFFS_TRACE_BUG 0x80000000 +#define YAFFS_TRACE_ALWAYS 0xF0000000 + +#endif diff --git a/fs/yaffs2/yaffs_verify.c b/fs/yaffs2/yaffs_verify.c new file mode 100644 index 00000000..738c7f69 --- /dev/null +++ b/fs/yaffs2/yaffs_verify.c @@ -0,0 +1,535 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_verify.h" +#include "yaffs_trace.h" +#include "yaffs_bitmap.h" +#include "yaffs_getblockinfo.h" +#include "yaffs_nand.h" + +int yaffs_skip_verification(struct yaffs_dev *dev) +{ + dev = dev; + return !(yaffs_trace_mask & + (YAFFS_TRACE_VERIFY | YAFFS_TRACE_VERIFY_FULL)); +} + +static int yaffs_skip_full_verification(struct yaffs_dev *dev) +{ + dev = dev; + return !(yaffs_trace_mask & (YAFFS_TRACE_VERIFY_FULL)); +} + +static int yaffs_skip_nand_verification(struct yaffs_dev *dev) +{ + dev = dev; + return !(yaffs_trace_mask & (YAFFS_TRACE_VERIFY_NAND)); +} + +static const char *block_state_name[] = { + "Unknown", + "Needs scanning", + "Scanning", + "Empty", + "Allocating", + "Full", + "Dirty", + "Checkpoint", + "Collecting", + "Dead" +}; + +void yaffs_verify_blk(struct yaffs_dev *dev, struct yaffs_block_info *bi, int n) +{ + int actually_used; + int in_use; + + if (yaffs_skip_verification(dev)) + return; + + /* Report illegal runtime states */ + if (bi->block_state >= YAFFS_NUMBER_OF_BLOCK_STATES) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Block %d has undefined state %d", + n, bi->block_state); + + switch (bi->block_state) { + case YAFFS_BLOCK_STATE_UNKNOWN: + case YAFFS_BLOCK_STATE_SCANNING: + case YAFFS_BLOCK_STATE_NEEDS_SCANNING: + yaffs_trace(YAFFS_TRACE_VERIFY, + "Block %d has bad run-state %s", + n, block_state_name[bi->block_state]); + } + + /* Check pages in use and soft deletions are legal */ + + actually_used = bi->pages_in_use - bi->soft_del_pages; + + if (bi->pages_in_use < 0 + || bi->pages_in_use > dev->param.chunks_per_block + || bi->soft_del_pages < 0 + || bi->soft_del_pages > dev->param.chunks_per_block + || actually_used < 0 || actually_used > dev->param.chunks_per_block) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Block %d has illegal values pages_in_used %d soft_del_pages %d", + n, bi->pages_in_use, bi->soft_del_pages); + + /* Check chunk bitmap legal */ + in_use = yaffs_count_chunk_bits(dev, n); + if (in_use != bi->pages_in_use) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Block %d has inconsistent values pages_in_use %d counted chunk bits %d", + n, bi->pages_in_use, in_use); + +} + +void yaffs_verify_collected_blk(struct yaffs_dev *dev, + struct yaffs_block_info *bi, int n) +{ + yaffs_verify_blk(dev, bi, n); + + /* After collection the block should be in the erased state */ + + if (bi->block_state != YAFFS_BLOCK_STATE_COLLECTING && + bi->block_state != YAFFS_BLOCK_STATE_EMPTY) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Block %d is in state %d after gc, should be erased", + n, bi->block_state); + } +} + +void yaffs_verify_blocks(struct yaffs_dev *dev) +{ + int i; + int state_count[YAFFS_NUMBER_OF_BLOCK_STATES]; + int illegal_states = 0; + + if (yaffs_skip_verification(dev)) + return; + + memset(state_count, 0, sizeof(state_count)); + + for (i = dev->internal_start_block; i <= dev->internal_end_block; i++) { + struct yaffs_block_info *bi = yaffs_get_block_info(dev, i); + yaffs_verify_blk(dev, bi, i); + + if (bi->block_state < YAFFS_NUMBER_OF_BLOCK_STATES) + state_count[bi->block_state]++; + else + illegal_states++; + } + + yaffs_trace(YAFFS_TRACE_VERIFY, "Block summary"); + + yaffs_trace(YAFFS_TRACE_VERIFY, + "%d blocks have illegal states", + illegal_states); + if (state_count[YAFFS_BLOCK_STATE_ALLOCATING] > 1) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Too many allocating blocks"); + + for (i = 0; i < YAFFS_NUMBER_OF_BLOCK_STATES; i++) + yaffs_trace(YAFFS_TRACE_VERIFY, + "%s %d blocks", + block_state_name[i], state_count[i]); + + if (dev->blocks_in_checkpt != 
state_count[YAFFS_BLOCK_STATE_CHECKPOINT]) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Checkpoint block count wrong dev %d count %d", + dev->blocks_in_checkpt, + state_count[YAFFS_BLOCK_STATE_CHECKPOINT]); + + if (dev->n_erased_blocks != state_count[YAFFS_BLOCK_STATE_EMPTY]) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Erased block count wrong dev %d count %d", + dev->n_erased_blocks, + state_count[YAFFS_BLOCK_STATE_EMPTY]); + + if (state_count[YAFFS_BLOCK_STATE_COLLECTING] > 1) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Too many collecting blocks %d (max is 1)", + state_count[YAFFS_BLOCK_STATE_COLLECTING]); +} + +/* + * Verify the object header. oh must be valid, but obj and tags may be NULL in which + * case those tests will not be performed. + */ +void yaffs_verify_oh(struct yaffs_obj *obj, struct yaffs_obj_hdr *oh, + struct yaffs_ext_tags *tags, int parent_check) +{ + if (obj && yaffs_skip_verification(obj->my_dev)) + return; + + if (!(tags && obj && oh)) { + yaffs_trace(YAFFS_TRACE_VERIFY, + "Verifying object header tags %p obj %p oh %p", + tags, obj, oh); + return; + } + + if (oh->type <= YAFFS_OBJECT_TYPE_UNKNOWN || + oh->type > YAFFS_OBJECT_TYPE_MAX) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header type is illegal value 0x%x", + tags->obj_id, oh->type); + + if (tags->obj_id != obj->obj_id) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header mismatch obj_id %d", + tags->obj_id, obj->obj_id); + + /* + * Check that the object's parent ids match if parent_check requested. + * + * Tests do not apply to the root object. + */ + + if (parent_check && tags->obj_id > 1 && !obj->parent) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header mismatch parent_id %d obj->parent is NULL", + tags->obj_id, oh->parent_obj_id); + + if (parent_check && obj->parent && + oh->parent_obj_id != obj->parent->obj_id && + (oh->parent_obj_id != YAFFS_OBJECTID_UNLINKED || + obj->parent->obj_id != YAFFS_OBJECTID_DELETED)) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header mismatch parent_id %d parent_obj_id %d", + tags->obj_id, oh->parent_obj_id, + obj->parent->obj_id); + + if (tags->obj_id > 1 && oh->name[0] == 0) /* Null name */ + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header name is NULL", + obj->obj_id); + + if (tags->obj_id > 1 && ((u8) (oh->name[0])) == 0xff) /* Trashed name */ + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d header name is 0xFF", + obj->obj_id); +} + +void yaffs_verify_file(struct yaffs_obj *obj) +{ + int required_depth; + int actual_depth; + u32 last_chunk; + u32 x; + u32 i; + struct yaffs_dev *dev; + struct yaffs_ext_tags tags; + struct yaffs_tnode *tn; + u32 obj_id; + + if (!obj) + return; + + if (yaffs_skip_verification(obj->my_dev)) + return; + + dev = obj->my_dev; + obj_id = obj->obj_id; + + /* Check file size is consistent with tnode depth */ + last_chunk = + obj->variant.file_variant.file_size / dev->data_bytes_per_chunk + 1; + x = last_chunk >> YAFFS_TNODES_LEVEL0_BITS; + required_depth = 0; + while (x > 0) { + x >>= YAFFS_TNODES_INTERNAL_BITS; + required_depth++; + } + + actual_depth = obj->variant.file_variant.top_level; + + /* Check that the chunks in the tnode tree are all correct. + * We do this by scanning through the tnode tree and + * checking the tags for every chunk match. 
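+ * + * That is: for every chunk index i from 1 to last_chunk that the tnode + * tree maps to a NAND chunk, the tags read back from NAND must carry + * obj_id equal to this object's id and chunk_id == i; any mismatch is + * reported via YAFFS_TRACE_VERIFY.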
+ */ + + if (yaffs_skip_nand_verification(dev)) + return; + + for (i = 1; i <= last_chunk; i++) { + tn = yaffs_find_tnode_0(dev, &obj->variant.file_variant, i); + + if (tn) { + u32 the_chunk = yaffs_get_group_base(dev, tn, i); + if (the_chunk > 0) { + yaffs_rd_chunk_tags_nand(dev, the_chunk, NULL, + &tags); + if (tags.obj_id != obj_id || tags.chunk_id != i) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Object %d chunk_id %d NAND mismatch chunk %d tags (%d:%d)", + obj_id, i, the_chunk, + tags.obj_id, tags.chunk_id); + } + } + } +} + +void yaffs_verify_link(struct yaffs_obj *obj) +{ + if (obj && yaffs_skip_verification(obj->my_dev)) + return; + + /* Verify sane equivalent object */ +} + +void yaffs_verify_symlink(struct yaffs_obj *obj) +{ + if (obj && yaffs_skip_verification(obj->my_dev)) + return; + + /* Verify symlink string */ +} + +void yaffs_verify_special(struct yaffs_obj *obj) +{ + if (obj && yaffs_skip_verification(obj->my_dev)) + return; +} + +void yaffs_verify_obj(struct yaffs_obj *obj) +{ + struct yaffs_dev *dev; + + u32 chunk_min; + u32 chunk_max; + + u32 chunk_id_ok; + u32 chunk_in_range; + u32 chunk_wrongly_deleted; + u32 chunk_valid; + + if (!obj) + return; + + if (obj->being_created) + return; + + dev = obj->my_dev; + + if (yaffs_skip_verification(dev)) + return; + + /* Check sane object header chunk */ + + chunk_min = dev->internal_start_block * dev->param.chunks_per_block; + chunk_max = + (dev->internal_end_block + 1) * dev->param.chunks_per_block - 1; + + chunk_in_range = (((unsigned)(obj->hdr_chunk)) >= chunk_min && + ((unsigned)(obj->hdr_chunk)) <= chunk_max); + chunk_id_ok = chunk_in_range || (obj->hdr_chunk == 0); + chunk_valid = chunk_in_range && + yaffs_check_chunk_bit(dev, + obj->hdr_chunk / dev->param.chunks_per_block, + obj->hdr_chunk % dev->param.chunks_per_block); + chunk_wrongly_deleted = chunk_in_range && !chunk_valid; + + if (!obj->fake && (!chunk_id_ok || chunk_wrongly_deleted)) + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d has chunk_id %d %s %s", + obj->obj_id, obj->hdr_chunk, + chunk_id_ok ? "" : ",out of range", + chunk_wrongly_deleted ? 
",marked as deleted" : ""); + + if (chunk_valid && !yaffs_skip_nand_verification(dev)) { + struct yaffs_ext_tags tags; + struct yaffs_obj_hdr *oh; + u8 *buffer = yaffs_get_temp_buffer(dev, __LINE__); + + oh = (struct yaffs_obj_hdr *)buffer; + + yaffs_rd_chunk_tags_nand(dev, obj->hdr_chunk, buffer, &tags); + + yaffs_verify_oh(obj, oh, &tags, 1); + + yaffs_release_temp_buffer(dev, buffer, __LINE__); + } + + /* Verify it has a parent */ + if (obj && !obj->fake && (!obj->parent || obj->parent->my_dev != dev)) { + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d has parent pointer %p which does not look like an object", + obj->obj_id, obj->parent); + } + + /* Verify parent is a directory */ + if (obj->parent + && obj->parent->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d's parent is not a directory (type %d)", + obj->obj_id, obj->parent->variant_type); + } + + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + yaffs_verify_file(obj); + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + yaffs_verify_symlink(obj); + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + yaffs_verify_dir(obj); + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + yaffs_verify_link(obj); + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + yaffs_verify_special(obj); + break; + case YAFFS_OBJECT_TYPE_UNKNOWN: + default: + yaffs_trace(YAFFS_TRACE_VERIFY, + "Obj %d has illegaltype %d", + obj->obj_id, obj->variant_type); + break; + } +} + +void yaffs_verify_objects(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + int i; + struct list_head *lh; + + if (yaffs_skip_verification(dev)) + return; + + /* Iterate through the objects in each hash entry */ + + for (i = 0; i < YAFFS_NOBJECT_BUCKETS; i++) { + list_for_each(lh, &dev->obj_bucket[i].list) { + if (lh) { + obj = + list_entry(lh, struct yaffs_obj, hash_link); + yaffs_verify_obj(obj); + } + } + } +} + +void yaffs_verify_obj_in_dir(struct yaffs_obj *obj) +{ + struct list_head *lh; + struct yaffs_obj *list_obj; + + int count = 0; + + if (!obj) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "No object to verify"); + YBUG(); + return; + } + + if (yaffs_skip_verification(obj->my_dev)) + return; + + if (!obj->parent) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "Object does not have parent" ); + YBUG(); + return; + } + + if (obj->parent->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "Parent is not directory"); + YBUG(); + } + + /* Iterate through the objects in each hash entry */ + + list_for_each(lh, &obj->parent->variant.dir_variant.children) { + if (lh) { + list_obj = list_entry(lh, struct yaffs_obj, siblings); + yaffs_verify_obj(list_obj); + if (obj == list_obj) + count++; + } + } + + if (count != 1) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Object in directory %d times", + count); + YBUG(); + } +} + +void yaffs_verify_dir(struct yaffs_obj *directory) +{ + struct list_head *lh; + struct yaffs_obj *list_obj; + + if (!directory) { + YBUG(); + return; + } + + if (yaffs_skip_full_verification(directory->my_dev)) + return; + + if (directory->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Directory has wrong type: %d", + directory->variant_type); + YBUG(); + } + + /* Iterate through the objects in each hash entry */ + + list_for_each(lh, &directory->variant.dir_variant.children) { + if (lh) { + list_obj = list_entry(lh, struct yaffs_obj, siblings); + if (list_obj->parent != directory) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Object in directory list has wrong parent %p", + list_obj->parent); + YBUG(); + } + 
yaffs_verify_obj_in_dir(list_obj); + } + } +} + +static int yaffs_free_verification_failures; + +void yaffs_verify_free_chunks(struct yaffs_dev *dev) +{ + int counted; + int difference; + + if (yaffs_skip_verification(dev)) + return; + + counted = yaffs_count_free_chunks(dev); + + difference = dev->n_free_chunks - counted; + + if (difference) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Freechunks verification failure %d %d %d", + dev->n_free_chunks, counted, difference); + yaffs_free_verification_failures++; + } +} + +int yaffs_verify_file_sane(struct yaffs_obj *in) +{ + in = in; + return YAFFS_OK; +} + diff --git a/fs/yaffs2/yaffs_verify.h b/fs/yaffs2/yaffs_verify.h new file mode 100644 index 00000000..cc6f8899 --- /dev/null +++ b/fs/yaffs2/yaffs_verify.h @@ -0,0 +1,43 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_VERIFY_H__ +#define __YAFFS_VERIFY_H__ + +#include "yaffs_guts.h" + +void yaffs_verify_blk(struct yaffs_dev *dev, struct yaffs_block_info *bi, + int n); +void yaffs_verify_collected_blk(struct yaffs_dev *dev, + struct yaffs_block_info *bi, int n); +void yaffs_verify_blocks(struct yaffs_dev *dev); + +void yaffs_verify_oh(struct yaffs_obj *obj, struct yaffs_obj_hdr *oh, + struct yaffs_ext_tags *tags, int parent_check); +void yaffs_verify_file(struct yaffs_obj *obj); +void yaffs_verify_link(struct yaffs_obj *obj); +void yaffs_verify_symlink(struct yaffs_obj *obj); +void yaffs_verify_special(struct yaffs_obj *obj); +void yaffs_verify_obj(struct yaffs_obj *obj); +void yaffs_verify_objects(struct yaffs_dev *dev); +void yaffs_verify_obj_in_dir(struct yaffs_obj *obj); +void yaffs_verify_dir(struct yaffs_obj *directory); +void yaffs_verify_free_chunks(struct yaffs_dev *dev); + +int yaffs_verify_file_sane(struct yaffs_obj *obj); + +int yaffs_skip_verification(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yaffs_vfs.c b/fs/yaffs2/yaffs_vfs.c new file mode 100644 index 00000000..d5b87531 --- /dev/null +++ b/fs/yaffs2/yaffs_vfs.c @@ -0,0 +1,2792 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * Acknowledgements: + * Luc van OostenRyck for numerous patches. + * Nick Bane for numerous patches. + * Nick Bane for 2.5/2.6 integration. + * Andras Toth for mknod rdev issue. + * Michael Fischer for finding the problem with inode inconsistency. + * Some code bodily lifted from JFFS + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * + * This is the file system front-end to YAFFS that hooks it up to + * the VFS. + * + * Special notes: + * >> 2.4: sb->u.generic_sbp points to the struct yaffs_dev associated with + * this superblock + * >> 2.6: sb->s_fs_info points to the struct yaffs_dev associated with this + * superblock + * >> inode->u.generic_ip points to the associated struct yaffs_obj. 
+ */ + +/* + * NB There are two variants of Linux VFS glue code. This variant supports + * a single version and should not include any multi-version code. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#define UnlockPage(p) unlock_page(p) +#define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) + +#define yaffs_devname(sb, buf) bdevname(sb->s_bdev, buf) + +#define YPROC_ROOT NULL + +#define Y_INIT_TIMER(a) init_timer_on_stack(a) + +#define WRITE_SIZE_STR "writesize" +#define WRITE_SIZE(mtd) ((mtd)->writesize) + +static uint32_t YCALCBLOCKS(uint64_t partition_size, uint32_t block_size) +{ + uint64_t result = partition_size; + do_div(result, block_size); + return (uint32_t) result; +} + +#include +#include + +#include "yportenv.h" +#include "yaffs_trace.h" +#include "yaffs_guts.h" +#include "yaffs_attribs.h" + +#include "yaffs_linux.h" + +#include "yaffs_mtdif.h" +#include "yaffs_mtdif1.h" +#include "yaffs_mtdif2.h" + +unsigned int yaffs_trace_mask = YAFFS_TRACE_BAD_BLOCKS | YAFFS_TRACE_ALWAYS; +unsigned int yaffs_wr_attempts = YAFFS_WR_ATTEMPTS; +unsigned int yaffs_auto_checkpoint = 1; +unsigned int yaffs_gc_control = 1; +unsigned int yaffs_bg_enable = 1; + +/* Module Parameters */ +module_param(yaffs_trace_mask, uint, 0644); +module_param(yaffs_wr_attempts, uint, 0644); +module_param(yaffs_auto_checkpoint, uint, 0644); +module_param(yaffs_gc_control, uint, 0644); +module_param(yaffs_bg_enable, uint, 0644); + + +#define yaffs_inode_to_obj_lv(iptr) ((iptr)->i_private) +#define yaffs_inode_to_obj(iptr) ((struct yaffs_obj *)(yaffs_inode_to_obj_lv(iptr))) +#define yaffs_dentry_to_obj(dptr) yaffs_inode_to_obj((dptr)->d_inode) +#define yaffs_super_to_dev(sb) ((struct yaffs_dev *)sb->s_fs_info) + +#define update_dir_time(dir) do {\ + (dir)->i_ctime = (dir)->i_mtime = CURRENT_TIME; \ + } while(0) + + +static unsigned yaffs_gc_control_callback(struct yaffs_dev *dev) +{ + return yaffs_gc_control; +} + +static void yaffs_gross_lock(struct yaffs_dev *dev) +{ + yaffs_trace(YAFFS_TRACE_LOCK, "yaffs locking %p", current); + mutex_lock(&(yaffs_dev_to_lc(dev)->gross_lock)); + yaffs_trace(YAFFS_TRACE_LOCK, "yaffs locked %p", current); +} + +static void yaffs_gross_unlock(struct yaffs_dev *dev) +{ + yaffs_trace(YAFFS_TRACE_LOCK, "yaffs unlocking %p", current); + mutex_unlock(&(yaffs_dev_to_lc(dev)->gross_lock)); +} + +static void yaffs_fill_inode_from_obj(struct inode *inode, + struct yaffs_obj *obj); + +static struct inode *yaffs_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + struct yaffs_obj *obj; + struct yaffs_dev *dev = yaffs_super_to_dev(sb); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_iget for %lu", ino); + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + /* NB This is called as a side effect of other functions, but + * we had to release the lock to prevent deadlocks, so + * need to lock again. 
+ */ + + yaffs_gross_lock(dev); + + obj = yaffs_find_by_number(dev, inode->i_ino); + + yaffs_fill_inode_from_obj(inode, obj); + + yaffs_gross_unlock(dev); + + unlock_new_inode(inode); + return inode; +} + +struct inode *yaffs_get_inode(struct super_block *sb, int mode, int dev, + struct yaffs_obj *obj) +{ + struct inode *inode; + + if (!sb) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_get_inode for NULL super_block!!"); + return NULL; + + } + + if (!obj) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_get_inode for NULL object!!"); + return NULL; + + } + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_get_inode for object %d", + obj->obj_id); + + inode = yaffs_iget(sb, obj->obj_id); + if (IS_ERR(inode)) + return NULL; + + /* NB Side effect: iget calls back to yaffs_read_inode(). */ + /* iget also increments the inode's i_count */ + /* NB You can't be holding gross_lock or deadlock will happen! */ + + return inode; +} + +static int yaffs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + + struct yaffs_obj *obj = NULL; + struct yaffs_dev *dev; + + struct yaffs_obj *parent = yaffs_inode_to_obj(dir); + + int error = -ENOSPC; + uid_t uid = current->cred->fsuid; + gid_t gid = + (dir->i_mode & S_ISGID) ? dir->i_gid : current->cred->fsgid; + + if ((dir->i_mode & S_ISGID) && S_ISDIR(mode)) + mode |= S_ISGID; + + if (parent) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_mknod: parent object %d type %d", + parent->obj_id, parent->variant_type); + } else { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_mknod: could not get parent object"); + return -EPERM; + } + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_mknod: making oject for %s, mode %x dev %x", + dentry->d_name.name, mode, rdev); + + dev = parent->my_dev; + + yaffs_gross_lock(dev); + + switch (mode & S_IFMT) { + default: + /* Special (socket, fifo, device...) */ + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod: making special"); + obj = + yaffs_create_special(parent, dentry->d_name.name, mode, uid, + gid, old_encode_dev(rdev)); + break; + case S_IFREG: /* file */ + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod: making file"); + obj = yaffs_create_file(parent, dentry->d_name.name, mode, uid, + gid); + break; + case S_IFDIR: /* directory */ + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod: making directory"); + obj = yaffs_create_dir(parent, dentry->d_name.name, mode, + uid, gid); + break; + case S_IFLNK: /* symlink */ + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod: making symlink"); + obj = NULL; /* Do we ever get here? 
*/ + break; + } + + /* Can not call yaffs_get_inode() with gross lock held */ + yaffs_gross_unlock(dev); + + if (obj) { + inode = yaffs_get_inode(dir->i_sb, mode, rdev, obj); + d_instantiate(dentry, inode); + update_dir_time(dir); + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_mknod created object %d count = %d", + obj->obj_id, atomic_read(&inode->i_count)); + error = 0; + yaffs_fill_inode_from_obj(dir, parent); + } else { + yaffs_trace(YAFFS_TRACE_OS, "yaffs_mknod failed making object"); + error = -ENOMEM; + } + + return error; +} + +static int yaffs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return yaffs_mknod(dir, dentry, mode | S_IFDIR, 0); +} + +static int yaffs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *n) +{ + return yaffs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int yaffs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct yaffs_obj *obj = NULL; + struct yaffs_obj *link = NULL; + struct yaffs_dev *dev; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_link"); + + obj = yaffs_inode_to_obj(inode); + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + if (!S_ISDIR(inode->i_mode)) /* Don't link directories */ + link = + yaffs_link_obj(yaffs_inode_to_obj(dir), dentry->d_name.name, + obj); + + if (link) { + old_dentry->d_inode->i_nlink = yaffs_get_obj_link_count(obj); + d_instantiate(dentry, old_dentry->d_inode); + atomic_inc(&old_dentry->d_inode->i_count); + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_link link count %d i_count %d", + old_dentry->d_inode->i_nlink, + atomic_read(&old_dentry->d_inode->i_count)); + } + + yaffs_gross_unlock(dev); + + if (link) { + update_dir_time(dir); + return 0; + } + + return -EPERM; +} + +static int yaffs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + uid_t uid = current->cred->fsuid; + gid_t gid = + (dir->i_mode & S_ISGID) ? 
dir->i_gid : current->cred->fsgid; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_symlink"); + + dev = yaffs_inode_to_obj(dir)->my_dev; + yaffs_gross_lock(dev); + obj = yaffs_create_symlink(yaffs_inode_to_obj(dir), dentry->d_name.name, + S_IFLNK | S_IRWXUGO, uid, gid, symname); + yaffs_gross_unlock(dev); + + if (obj) { + struct inode *inode; + + inode = yaffs_get_inode(dir->i_sb, obj->yst_mode, 0, obj); + d_instantiate(dentry, inode); + update_dir_time(dir); + yaffs_trace(YAFFS_TRACE_OS, "symlink created OK"); + return 0; + } else { + yaffs_trace(YAFFS_TRACE_OS, "symlink not created"); + } + + return -ENOMEM; +} + +static struct dentry *yaffs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *n) +{ + struct yaffs_obj *obj; + struct inode *inode = NULL; + + struct yaffs_dev *dev = yaffs_inode_to_obj(dir)->my_dev; + + if (current != yaffs_dev_to_lc(dev)->readdir_process) + yaffs_gross_lock(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_lookup for %d:%s", + yaffs_inode_to_obj(dir)->obj_id, dentry->d_name.name); + + obj = yaffs_find_by_name(yaffs_inode_to_obj(dir), dentry->d_name.name); + + obj = yaffs_get_equivalent_obj(obj); /* in case it was a hardlink */ + + /* Can't hold gross lock when calling yaffs_get_inode() */ + if (current != yaffs_dev_to_lc(dev)->readdir_process) + yaffs_gross_unlock(dev); + + if (obj) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_lookup found %d", obj->obj_id); + + inode = yaffs_get_inode(dir->i_sb, obj->yst_mode, 0, obj); + + if (inode) { + yaffs_trace(YAFFS_TRACE_OS, "yaffs_loookup dentry"); + d_add(dentry, inode); + /* return dentry; */ + return NULL; + } + + } else { + yaffs_trace(YAFFS_TRACE_OS, "yaffs_lookup not found"); + + } + + d_add(dentry, inode); + + return NULL; +} + +static int yaffs_unlink(struct inode *dir, struct dentry *dentry) +{ + int ret_val; + + struct yaffs_dev *dev; + struct yaffs_obj *obj; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_unlink %d:%s", + (int)(dir->i_ino), dentry->d_name.name); + obj = yaffs_inode_to_obj(dir); + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + ret_val = yaffs_unlinker(obj, dentry->d_name.name); + + if (ret_val == YAFFS_OK) { + dentry->d_inode->i_nlink--; + dir->i_version++; + yaffs_gross_unlock(dev); + mark_inode_dirty(dentry->d_inode); + update_dir_time(dir); + return 0; + } + yaffs_gross_unlock(dev); + return -ENOTEMPTY; +} + +static int yaffs_sync_object(struct file *file, int datasync) +{ + + struct yaffs_obj *obj; + struct yaffs_dev *dev; + struct dentry *dentry = file->f_path.dentry; + + obj = yaffs_dentry_to_obj(dentry); + + dev = obj->my_dev; + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_SYNC, "yaffs_sync_object"); + yaffs_gross_lock(dev); + yaffs_flush_file(obj, 1, datasync); + yaffs_gross_unlock(dev); + return 0; +} +/* + * The VFS layer already does all the dentry stuff for rename. + * + * NB: POSIX says you can rename an object over an old object of the same name + */ +static int yaffs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct yaffs_dev *dev; + int ret_val = YAFFS_FAIL; + struct yaffs_obj *target; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_rename"); + dev = yaffs_inode_to_obj(old_dir)->my_dev; + + yaffs_gross_lock(dev); + + /* Check if the target is an existing directory that is not empty. 
*/ + target = yaffs_find_by_name(yaffs_inode_to_obj(new_dir), + new_dentry->d_name.name); + + if (target && target->variant_type == YAFFS_OBJECT_TYPE_DIRECTORY && + !list_empty(&target->variant.dir_variant.children)) { + + yaffs_trace(YAFFS_TRACE_OS, "target is non-empty dir"); + + ret_val = YAFFS_FAIL; + } else { + /* Now does unlinking internally using shadowing mechanism */ + yaffs_trace(YAFFS_TRACE_OS, "calling yaffs_rename_obj"); + + ret_val = yaffs_rename_obj(yaffs_inode_to_obj(old_dir), + old_dentry->d_name.name, + yaffs_inode_to_obj(new_dir), + new_dentry->d_name.name); + } + yaffs_gross_unlock(dev); + + if (ret_val == YAFFS_OK) { + if (target) { + new_dentry->d_inode->i_nlink--; + mark_inode_dirty(new_dentry->d_inode); + } + + update_dir_time(old_dir); + if (old_dir != new_dir) + update_dir_time(new_dir); + return 0; + } else { + return -ENOTEMPTY; + } +} + +static int yaffs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_setattr of object %d", + yaffs_inode_to_obj(inode)->obj_id); + + /* Fail if a requested resize >= 2GB */ + if (attr->ia_valid & ATTR_SIZE && (attr->ia_size >> 31)) + error = -EINVAL; + + if (error == 0) + error = inode_change_ok(inode, attr); + if (error == 0) { + int result; + if (!error) { + setattr_copy(inode, attr); + yaffs_trace(YAFFS_TRACE_OS, "inode_setattr called"); + if (attr->ia_valid & ATTR_SIZE) { + truncate_setsize(inode, attr->ia_size); + inode->i_blocks = (inode->i_size + 511) >> 9; + } + } + dev = yaffs_inode_to_obj(inode)->my_dev; + if (attr->ia_valid & ATTR_SIZE) { + yaffs_trace(YAFFS_TRACE_OS, "resize to %d(%x)", + (int)(attr->ia_size), + (int)(attr->ia_size)); + } + yaffs_gross_lock(dev); + result = yaffs_set_attribs(yaffs_inode_to_obj(inode), attr); + if (result == YAFFS_OK) { + error = 0; + } else { + error = -EPERM; + } + yaffs_gross_unlock(dev); + + } + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_setattr done returning %d", error); + + return error; +} + +#ifdef CONFIG_YAFFS_XATTR +static int yaffs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + struct yaffs_obj *obj = yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_setxattr of object %d", obj->obj_id); + + if (error == 0) { + int result; + dev = obj->my_dev; + yaffs_gross_lock(dev); + result = yaffs_set_xattrib(obj, name, value, size, flags); + if (result == YAFFS_OK) + error = 0; + else if (result < 0) + error = result; + yaffs_gross_unlock(dev); + + } + yaffs_trace(YAFFS_TRACE_OS, "yaffs_setxattr done returning %d", error); + + return error; +} + +static ssize_t yaffs_getxattr(struct dentry * dentry, const char *name, void *buff, + size_t size) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + struct yaffs_obj *obj = yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_getxattr \"%s\" from object %d", + name, obj->obj_id); + + if (error == 0) { + dev = obj->my_dev; + yaffs_gross_lock(dev); + error = yaffs_get_xattrib(obj, name, buff, size); + yaffs_gross_unlock(dev); + + } + yaffs_trace(YAFFS_TRACE_OS, "yaffs_getxattr done returning %d", error); + + return error; +} + +static int yaffs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + struct yaffs_obj *obj = 
yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_removexattr of object %d", obj->obj_id); + + if (error == 0) { + int result; + dev = obj->my_dev; + yaffs_gross_lock(dev); + result = yaffs_remove_xattrib(obj, name); + if (result == YAFFS_OK) + error = 0; + else if (result < 0) + error = result; + yaffs_gross_unlock(dev); + + } + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_removexattr done returning %d", error); + + return error; +} + +static ssize_t yaffs_listxattr(struct dentry * dentry, char *buff, size_t size) +{ + struct inode *inode = dentry->d_inode; + int error = 0; + struct yaffs_dev *dev; + struct yaffs_obj *obj = yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_listxattr of object %d", obj->obj_id); + + if (error == 0) { + dev = obj->my_dev; + yaffs_gross_lock(dev); + error = yaffs_list_xattrib(obj, buff, size); + yaffs_gross_unlock(dev); + + } + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_listxattr done returning %d", error); + + return error; +} + +#endif + +static const struct inode_operations yaffs_dir_inode_operations = { + .create = yaffs_create, + .lookup = yaffs_lookup, + .link = yaffs_link, + .unlink = yaffs_unlink, + .symlink = yaffs_symlink, + .mkdir = yaffs_mkdir, + .rmdir = yaffs_unlink, + .mknod = yaffs_mknod, + .rename = yaffs_rename, + .setattr = yaffs_setattr, +#ifdef CONFIG_YAFFS_XATTR + .setxattr = yaffs_setxattr, + .getxattr = yaffs_getxattr, + .listxattr = yaffs_listxattr, + .removexattr = yaffs_removexattr, +#endif +}; +/*-----------------------------------------------------------------*/ +/* Directory search context allows us to unlock access to yaffs during + * filldir without causing problems with the directory being modified. + * This is similar to the tried and tested mechanism used in yaffs direct. + * + * A search context iterates along a doubly linked list of siblings in the + * directory. If the iterating object is deleted then this would corrupt + * the list iteration, likely causing a crash. The search context avoids + * this by using the remove_obj_fn to move the search context to the + * next object before the object is deleted. + * + * Many readdirs (and thus seach conexts) may be alive simulateously so + * each struct yaffs_dev has a list of these. + * + * A seach context lives for the duration of a readdir. + * + * All these functions must be called while yaffs is locked. + */ + +struct yaffs_search_context { + struct yaffs_dev *dev; + struct yaffs_obj *dir_obj; + struct yaffs_obj *next_return; + struct list_head others; +}; + +/* + * yaffs_new_search() creates a new search context, initialises it and + * adds it to the device's search context list. + * + * Called at start of readdir. + */ +static struct yaffs_search_context *yaffs_new_search(struct yaffs_obj *dir) +{ + struct yaffs_dev *dev = dir->my_dev; + struct yaffs_search_context *sc = + kmalloc(sizeof(struct yaffs_search_context), GFP_NOFS); + if (sc) { + sc->dir_obj = dir; + sc->dev = dev; + if (list_empty(&sc->dir_obj->variant.dir_variant.children)) + sc->next_return = NULL; + else + sc->next_return = + list_entry(dir->variant.dir_variant.children.next, + struct yaffs_obj, siblings); + INIT_LIST_HEAD(&sc->others); + list_add(&sc->others, &(yaffs_dev_to_lc(dev)->search_contexts)); + } + return sc; +} + +/* + * yaffs_search_end() disposes of a search context and cleans up. 
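The pattern described above (a cursor that the object-removal hook advances before the object goes away) reduced to a self-contained sketch on an ordinary circular doubly linked list; every name here is illustrative and none of it is yaffs API:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	struct node *prev, *next;   /* circular list; "head" is a sentinel */
	char name[16];
};

struct cursor {
	struct node *head;          /* list being walked */
	struct node *next_return;   /* next node to hand out, or NULL */
};

static void cursor_advance(struct cursor *c)
{
	if (c->next_return)
		c->next_return = (c->next_return->next == c->head) ?
				NULL : c->next_return->next;
}

/* Removal advances any cursor parked on the victim first, which is the
 * same job yaffs_remove_obj_callback() does for search contexts. */
static void node_remove(struct node *victim, struct cursor *c)
{
	if (c->next_return == victim)
		cursor_advance(c);
	victim->prev->next = victim->next;
	victim->next->prev = victim->prev;
	free(victim);
}

int main(void)
{
	struct node head = { &head, &head, "" };
	struct cursor c = { &head, NULL };
	const char *names[] = { "a", "b", "c" };

	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		strcpy(n->name, names[i]);
		n->prev = head.prev;
		n->next = &head;
		head.prev->next = n;
		head.prev = n;
	}

	c.next_return = head.next;            /* walk starts at "a" */
	printf("%s\n", c.next_return->name);  /* hand out "a" */
	cursor_advance(&c);                   /* cursor now rests on "b" */

	/* "b" is deleted while the cursor is parked on it, as can happen
	 * while yaffs drops its lock around filldir(): the cursor moves on
	 * to "c" instead of pointing at freed memory. */
	node_remove(c.next_return, &c);

	while (c.next_return) {
		printf("%s\n", c.next_return->name);  /* prints "c" */
		cursor_advance(&c);
	}
	return 0;
}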
+ */ +static void yaffs_search_end(struct yaffs_search_context *sc) +{ + if (sc) { + list_del(&sc->others); + kfree(sc); + } +} + +/* + * yaffs_search_advance() moves a search context to the next object. + * Called when the search iterates or when an object removal causes + * the search context to be moved to the next object. + */ +static void yaffs_search_advance(struct yaffs_search_context *sc) +{ + if (!sc) + return; + + if (sc->next_return == NULL || + list_empty(&sc->dir_obj->variant.dir_variant.children)) + sc->next_return = NULL; + else { + struct list_head *next = sc->next_return->siblings.next; + + if (next == &sc->dir_obj->variant.dir_variant.children) + sc->next_return = NULL; /* end of list */ + else + sc->next_return = + list_entry(next, struct yaffs_obj, siblings); + } +} + +/* + * yaffs_remove_obj_callback() is called when an object is unlinked. + * We check open search contexts and advance any which are currently + * on the object being iterated. + */ +static void yaffs_remove_obj_callback(struct yaffs_obj *obj) +{ + + struct list_head *i; + struct yaffs_search_context *sc; + struct list_head *search_contexts = + &(yaffs_dev_to_lc(obj->my_dev)->search_contexts); + + /* Iterate through the directory search contexts. + * If any are currently on the object being removed, then advance + * the search context to the next object to prevent a hanging pointer. + */ + list_for_each(i, search_contexts) { + if (i) { + sc = list_entry(i, struct yaffs_search_context, others); + if (sc->next_return == obj) + yaffs_search_advance(sc); + } + } + +} + +static int yaffs_readdir(struct file *f, void *dirent, filldir_t filldir) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + struct yaffs_search_context *sc; + struct inode *inode = f->f_dentry->d_inode; + unsigned long offset, curoffs; + struct yaffs_obj *l; + int ret_val = 0; + + char name[YAFFS_MAX_NAME_LENGTH + 1]; + + obj = yaffs_dentry_to_obj(f->f_dentry); + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + yaffs_dev_to_lc(dev)->readdir_process = current; + + offset = f->f_pos; + + sc = yaffs_new_search(obj); + if (!sc) { + ret_val = -ENOMEM; + goto out; + } + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readdir: starting at %d", (int)offset); + + if (offset == 0) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readdir: entry . ino %d", + (int)inode->i_ino); + yaffs_gross_unlock(dev); + if (filldir(dirent, ".", 1, offset, inode->i_ino, DT_DIR) < 0) { + yaffs_gross_lock(dev); + goto out; + } + yaffs_gross_lock(dev); + offset++; + f->f_pos++; + } + if (offset == 1) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readdir: entry .. ino %d", + (int)f->f_dentry->d_parent->d_inode->i_ino); + yaffs_gross_unlock(dev); + if (filldir(dirent, "..", 2, offset, + f->f_dentry->d_parent->d_inode->i_ino, + DT_DIR) < 0) { + yaffs_gross_lock(dev); + goto out; + } + yaffs_gross_lock(dev); + offset++; + f->f_pos++; + } + + curoffs = 1; + + /* If the directory has changed since the open or last call to + readdir, rewind to after the 2 canned entries. 
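Positions 0 and 1 are the synthetic "." and ".." entries emitted above, so real entries begin at offset 2; yaffs_unlink() bumps dir->i_version on every removal, and a mismatch with the f->f_version saved on the previous pass means the sibling list has changed under this directory stream, so the walk restarts at the first real entry rather than trusting a stale position.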
*/ + if (f->f_version != inode->i_version) { + offset = 2; + f->f_pos = offset; + f->f_version = inode->i_version; + } + + while (sc->next_return) { + curoffs++; + l = sc->next_return; + if (curoffs >= offset) { + int this_inode = yaffs_get_obj_inode(l); + int this_type = yaffs_get_obj_type(l); + + yaffs_get_obj_name(l, name, YAFFS_MAX_NAME_LENGTH + 1); + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readdir: %s inode %d", + name, yaffs_get_obj_inode(l)); + + yaffs_gross_unlock(dev); + + if (filldir(dirent, + name, + strlen(name), + offset, this_inode, this_type) < 0) { + yaffs_gross_lock(dev); + goto out; + } + + yaffs_gross_lock(dev); + + offset++; + f->f_pos++; + } + yaffs_search_advance(sc); + } + +out: + yaffs_search_end(sc); + yaffs_dev_to_lc(dev)->readdir_process = NULL; + yaffs_gross_unlock(dev); + + return ret_val; +} + +static const struct file_operations yaffs_dir_operations = { + .read = generic_read_dir, + .readdir = yaffs_readdir, + .fsync = yaffs_sync_object, + .llseek = generic_file_llseek, +}; + + + +static int yaffs_file_flush(struct file *file, fl_owner_t id) +{ + struct yaffs_obj *obj = yaffs_dentry_to_obj(file->f_dentry); + + struct yaffs_dev *dev = obj->my_dev; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_flush object %d (%s)", + obj->obj_id, obj->dirty ? "dirty" : "clean"); + + yaffs_gross_lock(dev); + + yaffs_flush_file(obj, 1, 0); + + yaffs_gross_unlock(dev); + + return 0; +} + +static const struct file_operations yaffs_file_operations = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .flush = yaffs_file_flush, + .fsync = yaffs_sync_object, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .llseek = generic_file_llseek, +}; + + +/* ExportFS support */ +static struct inode *yaffs2_nfs_get_inode(struct super_block *sb, uint64_t ino, + uint32_t generation) +{ + return yaffs_iget(sb, ino); +} + +static struct dentry *yaffs2_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + yaffs2_nfs_get_inode); +} + +static struct dentry *yaffs2_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + yaffs2_nfs_get_inode); +} + +struct dentry *yaffs2_get_parent(struct dentry *dentry) +{ + + struct super_block *sb = dentry->d_inode->i_sb; + struct dentry *parent = ERR_PTR(-ENOENT); + struct inode *inode; + unsigned long parent_ino; + struct yaffs_obj *d_obj; + struct yaffs_obj *parent_obj; + + d_obj = yaffs_inode_to_obj(dentry->d_inode); + + if (d_obj) { + parent_obj = d_obj->parent; + if (parent_obj) { + parent_ino = yaffs_get_obj_inode(parent_obj); + inode = yaffs_iget(sb, parent_ino); + + if (IS_ERR(inode)) { + parent = ERR_CAST(inode); + } else { + parent = d_obtain_alias(inode); + if (!IS_ERR(parent)) { + parent = ERR_PTR(-ENOMEM); + iput(inode); + } + } + } + } + + return parent; +} + +/* Just declare a zero structure as a NULL value implies + * using the default functions of exportfs. 
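Leaving the encode side NULL picks up the default ino+generation handle format, so knfsd and the open-by-handle syscalls can hand a stable handle back to yaffs later; the decode path runs through generic_fh_to_dentry()/generic_fh_to_parent() below, which call yaffs2_nfs_get_inode() and hence yaffs_iget(). A user-space sketch of that round trip, assuming a kernel and glibc new enough to expose name_to_handle_at()/open_by_handle_at() and a hypothetical /mnt/flash mount:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#ifndef MAX_HANDLE_SZ
#define MAX_HANDLE_SZ 128
#endif

int main(void)
{
	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	int mount_id, mnt_fd, fd;

	fh->handle_bytes = MAX_HANDLE_SZ;

	/* Encode a handle for a (hypothetical) file on the yaffs mount. */
	if (name_to_handle_at(AT_FDCWD, "/mnt/flash/log.txt",
			      fh, &mount_id, 0) == -1) {
		perror("name_to_handle_at");
		return 1;
	}

	/* Much later, or from another process: reopen purely from the
	 * handle. In the kernel this is the path that reaches
	 * yaffs2_fh_to_dentry() -> yaffs2_nfs_get_inode() -> yaffs_iget().
	 * Requires CAP_DAC_READ_SEARCH. */
	mnt_fd = open("/mnt/flash", O_RDONLY | O_DIRECTORY);
	fd = open_by_handle_at(mnt_fd, fh, O_RDONLY);
	if (fd == -1)
		perror("open_by_handle_at");
	else
		close(fd);

	free(fh);
	return 0;
}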
+ */ + +static struct export_operations yaffs_export_ops = { + .fh_to_dentry = yaffs2_fh_to_dentry, + .fh_to_parent = yaffs2_fh_to_parent, + .get_parent = yaffs2_get_parent, +}; + + +/*-----------------------------------------------------------------*/ + +static int yaffs_readlink(struct dentry *dentry, char __user * buffer, + int buflen) +{ + unsigned char *alias; + int ret; + + struct yaffs_dev *dev = yaffs_dentry_to_obj(dentry)->my_dev; + + yaffs_gross_lock(dev); + + alias = yaffs_get_symlink_alias(yaffs_dentry_to_obj(dentry)); + + yaffs_gross_unlock(dev); + + if (!alias) + return -ENOMEM; + + ret = vfs_readlink(dentry, buffer, buflen, alias); + kfree(alias); + return ret; +} + +static void *yaffs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + unsigned char *alias; + void *ret; + struct yaffs_dev *dev = yaffs_dentry_to_obj(dentry)->my_dev; + + yaffs_gross_lock(dev); + + alias = yaffs_get_symlink_alias(yaffs_dentry_to_obj(dentry)); + yaffs_gross_unlock(dev); + + if (!alias) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + nd_set_link(nd, alias); + ret = (void *)alias; +out: + return ret; +} + +void yaffs_put_link(struct dentry *dentry, struct nameidata *nd, void *alias) +{ + kfree(alias); +} + + +static void yaffs_unstitch_obj(struct inode *inode, struct yaffs_obj *obj) +{ + /* Clear the association between the inode and + * the struct yaffs_obj. + */ + obj->my_inode = NULL; + yaffs_inode_to_obj_lv(inode) = NULL; + + /* If the object freeing was deferred, then the real + * free happens now. + * This should fix the inode inconsistency problem. + */ + yaffs_handle_defered_free(obj); +} + +/* yaffs_evict_inode combines into one operation what was previously done in + * yaffs_clear_inode() and yaffs_delete_inode() + * + */ +static void yaffs_evict_inode(struct inode *inode) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + int deleteme = 0; + + obj = yaffs_inode_to_obj(inode); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_evict_inode: ino %d, count %d %s", + (int)inode->i_ino, + atomic_read(&inode->i_count), + obj ? "object exists" : "null object"); + + if (!inode->i_nlink && !is_bad_inode(inode)) + deleteme = 1; + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + if (deleteme && obj) { + dev = obj->my_dev; + yaffs_gross_lock(dev); + yaffs_del_obj(obj); + yaffs_gross_unlock(dev); + } + if (obj) { + dev = obj->my_dev; + yaffs_gross_lock(dev); + yaffs_unstitch_obj(inode, obj); + yaffs_gross_unlock(dev); + } + +} + +static void yaffs_touch_super(struct yaffs_dev *dev) +{ + struct super_block *sb = yaffs_dev_to_lc(dev)->super; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_touch_super() sb = %p", sb); + if (sb) + sb->s_dirt = 1; +} + +static int yaffs_readpage_nolock(struct file *f, struct page *pg) +{ + /* Lifted from jffs2 */ + + struct yaffs_obj *obj; + unsigned char *pg_buf; + int ret; + + struct yaffs_dev *dev; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_readpage_nolock at %08x, size %08x", + (unsigned)(pg->index << PAGE_CACHE_SHIFT), + (unsigned)PAGE_CACHE_SIZE); + + obj = yaffs_dentry_to_obj(f->f_dentry); + + dev = obj->my_dev; + + BUG_ON(!PageLocked(pg)); + + pg_buf = kmap(pg); + /* FIXME: Can kmap fail? 
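It cannot: in mainline kernels of this vintage kmap() never returns NULL. For a lowmem page it simply returns the existing direct mapping, and for a highmem page it may sleep waiting for a free pkmap slot but still comes back with a valid address, so the missing error check is harmless; kmap_atomic() would be the variant to reach for if this path ever had to run in atomic context.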
*/ + + yaffs_gross_lock(dev); + + ret = yaffs_file_rd(obj, pg_buf, + pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE); + + yaffs_gross_unlock(dev); + + if (ret >= 0) + ret = 0; + + if (ret) { + ClearPageUptodate(pg); + SetPageError(pg); + } else { + SetPageUptodate(pg); + ClearPageError(pg); + } + + flush_dcache_page(pg); + kunmap(pg); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_readpage_nolock done"); + return ret; +} + +static int yaffs_readpage_unlock(struct file *f, struct page *pg) +{ + int ret = yaffs_readpage_nolock(f, pg); + UnlockPage(pg); + return ret; +} + +static int yaffs_readpage(struct file *f, struct page *pg) +{ + int ret; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_readpage"); + ret = yaffs_readpage_unlock(f, pg); + yaffs_trace(YAFFS_TRACE_OS, "yaffs_readpage done"); + return ret; +} + +/* writepage inspired by/stolen from smbfs */ + +static int yaffs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct yaffs_dev *dev; + struct address_space *mapping = page->mapping; + struct inode *inode; + unsigned long end_index; + char *buffer; + struct yaffs_obj *obj; + int n_written = 0; + unsigned n_bytes; + loff_t i_size; + + if (!mapping) + BUG(); + inode = mapping->host; + if (!inode) + BUG(); + i_size = i_size_read(inode); + + end_index = i_size >> PAGE_CACHE_SHIFT; + + if (page->index < end_index) + n_bytes = PAGE_CACHE_SIZE; + else { + n_bytes = i_size & (PAGE_CACHE_SIZE - 1); + + if (page->index > end_index || !n_bytes) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_writepage at %08x, inode size = %08x!!!", + (unsigned)(page->index << PAGE_CACHE_SHIFT), + (unsigned)inode->i_size); + yaffs_trace(YAFFS_TRACE_OS, + " -> don't care!!"); + + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + return 0; + } + } + + if (n_bytes != PAGE_CACHE_SIZE) + zero_user_segment(page, n_bytes, PAGE_CACHE_SIZE); + + get_page(page); + + buffer = kmap(page); + + obj = yaffs_inode_to_obj(inode); + dev = obj->my_dev; + yaffs_gross_lock(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_writepage at %08x, size %08x", + (unsigned)(page->index << PAGE_CACHE_SHIFT), n_bytes); + yaffs_trace(YAFFS_TRACE_OS, + "writepag0: obj = %05x, ino = %05x", + (int)obj->variant.file_variant.file_size, (int)inode->i_size); + + n_written = yaffs_wr_file(obj, buffer, + page->index << PAGE_CACHE_SHIFT, n_bytes, 0); + + yaffs_touch_super(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "writepag1: obj = %05x, ino = %05x", + (int)obj->variant.file_variant.file_size, (int)inode->i_size); + + yaffs_gross_unlock(dev); + + kunmap(page); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + put_page(page); + + return (n_written == n_bytes) ? 0 : -ENOSPC; +} + +/* Space holding and freeing is done to ensure we have space available for + * write_begin/end. + * For now we just assume few parallel writes and check against a small + * number. + * Todo: need to do this with a counter to handle parallel reads better. + */ + +static ssize_t yaffs_hold_space(struct file *f) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + + int n_free_chunks; + + obj = yaffs_dentry_to_obj(f->f_dentry); + + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + n_free_chunks = yaffs_get_n_free_chunks(dev); + + yaffs_gross_unlock(dev); + + return (n_free_chunks > 20) ? 
1 : 0; +} + +static void yaffs_release_space(struct file *f) +{ + struct yaffs_obj *obj; + struct yaffs_dev *dev; + + obj = yaffs_dentry_to_obj(f->f_dentry); + + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + yaffs_gross_unlock(dev); +} + +static int yaffs_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *pg = NULL; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + int ret = 0; + int space_held = 0; + + /* Get a page */ + pg = grab_cache_page_write_begin(mapping, index, flags); + + *pagep = pg; + if (!pg) { + ret = -ENOMEM; + goto out; + } + yaffs_trace(YAFFS_TRACE_OS, + "start yaffs_write_begin index %d(%x) uptodate %d", + (int)index, (int)index, Page_Uptodate(pg) ? 1 : 0); + + /* Get fs space */ + space_held = yaffs_hold_space(filp); + + if (!space_held) { + ret = -ENOSPC; + goto out; + } + + /* Update page if required */ + + if (!Page_Uptodate(pg)) + ret = yaffs_readpage_nolock(filp, pg); + + if (ret) + goto out; + + /* Happy path return */ + yaffs_trace(YAFFS_TRACE_OS, "end yaffs_write_begin - ok"); + + return 0; + +out: + yaffs_trace(YAFFS_TRACE_OS, + "end yaffs_write_begin fail returning %d", ret); + if (space_held) + yaffs_release_space(filp); + if (pg) { + unlock_page(pg); + page_cache_release(pg); + } + return ret; +} + +static ssize_t yaffs_file_write(struct file *f, const char *buf, size_t n, + loff_t * pos) +{ + struct yaffs_obj *obj; + int n_written, ipos; + struct inode *inode; + struct yaffs_dev *dev; + + obj = yaffs_dentry_to_obj(f->f_dentry); + + dev = obj->my_dev; + + yaffs_gross_lock(dev); + + inode = f->f_dentry->d_inode; + + if (!S_ISBLK(inode->i_mode) && f->f_flags & O_APPEND) + ipos = inode->i_size; + else + ipos = *pos; + + if (!obj) + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_write: hey obj is null!"); + else + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_write about to write writing %u(%x) bytes to object %d at %d(%x)", + (unsigned)n, (unsigned)n, obj->obj_id, ipos, ipos); + + n_written = yaffs_wr_file(obj, buf, ipos, n, 0); + + yaffs_touch_super(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_write: %d(%x) bytes written", + (unsigned)n, (unsigned)n); + + if (n_written > 0) { + ipos += n_written; + *pos = ipos; + if (ipos > inode->i_size) { + inode->i_size = ipos; + inode->i_blocks = (ipos + 511) >> 9; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_file_write size updated to %d bytes, %d blocks", + ipos, (int)(inode->i_blocks)); + } + + } + yaffs_gross_unlock(dev); + return (n_written == 0) && (n > 0) ? 
-ENOSPC : n_written; +} + +static int yaffs_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pg, void *fsdadata) +{ + int ret = 0; + void *addr, *kva; + uint32_t offset_into_page = pos & (PAGE_CACHE_SIZE - 1); + + kva = kmap(pg); + addr = kva + offset_into_page; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_write_end addr %p pos %x n_bytes %d", + addr, (unsigned)pos, copied); + + ret = yaffs_file_write(filp, addr, copied, &pos); + + if (ret != copied) { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_write_end not same size ret %d copied %d", + ret, copied); + SetPageError(pg); + } + + kunmap(pg); + + yaffs_release_space(filp); + unlock_page(pg); + page_cache_release(pg); + return ret; +} + +static int yaffs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct yaffs_dev *dev = yaffs_dentry_to_obj(dentry)->my_dev; + struct super_block *sb = dentry->d_sb; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_statfs"); + + yaffs_gross_lock(dev); + + buf->f_type = YAFFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_namelen = 255; + + if (dev->data_bytes_per_chunk & (dev->data_bytes_per_chunk - 1)) { + /* Do this if chunk size is not a power of 2 */ + + uint64_t bytes_in_dev; + uint64_t bytes_free; + + bytes_in_dev = + ((uint64_t) + ((dev->param.end_block - dev->param.start_block + + 1))) * ((uint64_t) (dev->param.chunks_per_block * + dev->data_bytes_per_chunk)); + + do_div(bytes_in_dev, sb->s_blocksize); /* bytes_in_dev becomes the number of blocks */ + buf->f_blocks = bytes_in_dev; + + bytes_free = ((uint64_t) (yaffs_get_n_free_chunks(dev))) * + ((uint64_t) (dev->data_bytes_per_chunk)); + + do_div(bytes_free, sb->s_blocksize); + + buf->f_bfree = bytes_free; + + } else if (sb->s_blocksize > dev->data_bytes_per_chunk) { + + buf->f_blocks = + (dev->param.end_block - dev->param.start_block + 1) * + dev->param.chunks_per_block / + (sb->s_blocksize / dev->data_bytes_per_chunk); + buf->f_bfree = + yaffs_get_n_free_chunks(dev) / + (sb->s_blocksize / dev->data_bytes_per_chunk); + } else { + buf->f_blocks = + (dev->param.end_block - dev->param.start_block + 1) * + dev->param.chunks_per_block * + (dev->data_bytes_per_chunk / sb->s_blocksize); + + buf->f_bfree = + yaffs_get_n_free_chunks(dev) * + (dev->data_bytes_per_chunk / sb->s_blocksize); + } + + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_bavail = buf->f_bfree; + + yaffs_gross_unlock(dev); + return 0; +} + +static void yaffs_flush_inodes(struct super_block *sb) +{ + struct inode *iptr; + struct yaffs_obj *obj; + + list_for_each_entry(iptr, &sb->s_inodes, i_sb_list) { + obj = yaffs_inode_to_obj(iptr); + if (obj) { + yaffs_trace(YAFFS_TRACE_OS, + "flushing obj %d", obj->obj_id); + yaffs_flush_file(obj, 1, 0); + } + } +} + +static void yaffs_flush_super(struct super_block *sb, int do_checkpoint) +{ + struct yaffs_dev *dev = yaffs_super_to_dev(sb); + if (!dev) + return; + + yaffs_flush_inodes(sb); + yaffs_update_dirty_dirs(dev); + yaffs_flush_whole_cache(dev); + if (do_checkpoint) + yaffs_checkpoint_save(dev); +} + +static unsigned yaffs_bg_gc_urgency(struct yaffs_dev *dev) +{ + unsigned erased_chunks = + dev->n_erased_blocks * dev->param.chunks_per_block; + struct yaffs_linux_context *context = yaffs_dev_to_lc(dev); + unsigned scattered = 0; /* Free chunks not in an erased block */ + + if (erased_chunks < dev->n_free_chunks) + scattered = (dev->n_free_chunks - erased_chunks); + + if (!context->bg_running) + return 0; + else if (scattered < (dev->param.chunks_per_block * 2)) + return 0; + else 
if (erased_chunks > dev->n_free_chunks / 2) + return 0; + else if (erased_chunks > dev->n_free_chunks / 4) + return 1; + else + return 2; +} + +static int yaffs_do_sync_fs(struct super_block *sb, int request_checkpoint) +{ + + struct yaffs_dev *dev = yaffs_super_to_dev(sb); + unsigned int oneshot_checkpoint = (yaffs_auto_checkpoint & 4); + unsigned gc_urgent = yaffs_bg_gc_urgency(dev); + int do_checkpoint; + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_SYNC | YAFFS_TRACE_BACKGROUND, + "yaffs_do_sync_fs: gc-urgency %d %s %s%s", + gc_urgent, + sb->s_dirt ? "dirty" : "clean", + request_checkpoint ? "checkpoint requested" : "no checkpoint", + oneshot_checkpoint ? " one-shot" : ""); + + yaffs_gross_lock(dev); + do_checkpoint = ((request_checkpoint && !gc_urgent) || + oneshot_checkpoint) && !dev->is_checkpointed; + + if (sb->s_dirt || do_checkpoint) { + yaffs_flush_super(sb, !dev->is_checkpointed && do_checkpoint); + sb->s_dirt = 0; + if (oneshot_checkpoint) + yaffs_auto_checkpoint &= ~4; + } + yaffs_gross_unlock(dev); + + return 0; +} + +/* + * yaffs background thread functions . + * yaffs_bg_thread_fn() the thread function + * yaffs_bg_start() launches the background thread. + * yaffs_bg_stop() cleans up the background thread. + * + * NB: + * The thread should only run after the yaffs is initialised + * The thread should be stopped before yaffs is unmounted. + * The thread should not do any writing while the fs is in read only. + */ + +void yaffs_background_waker(unsigned long data) +{ + wake_up_process((struct task_struct *)data); +} + +static int yaffs_bg_thread_fn(void *data) +{ + struct yaffs_dev *dev = (struct yaffs_dev *)data; + struct yaffs_linux_context *context = yaffs_dev_to_lc(dev); + unsigned long now = jiffies; + unsigned long next_dir_update = now; + unsigned long next_gc = now; + unsigned long expires; + unsigned int urgency; + + int gc_result; + struct timer_list timer; + + yaffs_trace(YAFFS_TRACE_BACKGROUND, + "yaffs_background starting for dev %p", (void *)dev); + + set_freezable(); + while (context->bg_running) { + yaffs_trace(YAFFS_TRACE_BACKGROUND, "yaffs_background"); + + if (kthread_should_stop()) + break; + + if (try_to_freeze()) + continue; + + yaffs_gross_lock(dev); + + now = jiffies; + + if (time_after(now, next_dir_update) && yaffs_bg_enable) { + yaffs_update_dirty_dirs(dev); + next_dir_update = now + HZ; + } + + if (time_after(now, next_gc) && yaffs_bg_enable) { + if (!dev->is_checkpointed) { + urgency = yaffs_bg_gc_urgency(dev); + gc_result = yaffs_bg_gc(dev, urgency); + if (urgency > 1) + next_gc = now + HZ / 20 + 1; + else if (urgency > 0) + next_gc = now + HZ / 10 + 1; + else + next_gc = now + HZ * 2; + } else { + /* + * gc not running so set to next_dir_update + * to cut down on wake ups + */ + next_gc = next_dir_update; + } + } + yaffs_gross_unlock(dev); + expires = next_dir_update; + if (time_before(next_gc, expires)) + expires = next_gc; + if (time_before(expires, now)) + expires = now + HZ; + + Y_INIT_TIMER(&timer); + timer.expires = expires + 1; + timer.data = (unsigned long)current; + timer.function = yaffs_background_waker; + + set_current_state(TASK_INTERRUPTIBLE); + add_timer(&timer); + schedule(); + del_timer_sync(&timer); + } + + return 0; +} + +static int yaffs_bg_start(struct yaffs_dev *dev) +{ + int retval = 0; + struct yaffs_linux_context *context = yaffs_dev_to_lc(dev); + + if (dev->read_only) + return -1; + + context->bg_running = 1; + + context->bg_thread = kthread_run(yaffs_bg_thread_fn, + (void *)dev, "yaffs-bg-%d", + context->mount_id); + + 
if (IS_ERR(context->bg_thread)) { + retval = PTR_ERR(context->bg_thread); + context->bg_thread = NULL; + context->bg_running = 0; + } + return retval; +} + +static void yaffs_bg_stop(struct yaffs_dev *dev) +{ + struct yaffs_linux_context *ctxt = yaffs_dev_to_lc(dev); + + ctxt->bg_running = 0; + + if (ctxt->bg_thread) { + kthread_stop(ctxt->bg_thread); + ctxt->bg_thread = NULL; + } +} + +static void yaffs_write_super(struct super_block *sb) +{ + unsigned request_checkpoint = (yaffs_auto_checkpoint >= 2); + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_SYNC | YAFFS_TRACE_BACKGROUND, + "yaffs_write_super%s", + request_checkpoint ? " checkpt" : ""); + + yaffs_do_sync_fs(sb, request_checkpoint); + +} + +static int yaffs_sync_fs(struct super_block *sb, int wait) +{ + unsigned request_checkpoint = (yaffs_auto_checkpoint >= 1); + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_SYNC, + "yaffs_sync_fs%s", request_checkpoint ? " checkpt" : ""); + + yaffs_do_sync_fs(sb, request_checkpoint); + + return 0; +} + + +static LIST_HEAD(yaffs_context_list); +struct mutex yaffs_context_lock; + + + +struct yaffs_options { + int inband_tags; + int skip_checkpoint_read; + int skip_checkpoint_write; + int no_cache; + int tags_ecc_on; + int tags_ecc_overridden; + int lazy_loading_enabled; + int lazy_loading_overridden; + int empty_lost_and_found; + int empty_lost_and_found_overridden; +}; + +#define MAX_OPT_LEN 30 +static int yaffs_parse_options(struct yaffs_options *options, + const char *options_str) +{ + char cur_opt[MAX_OPT_LEN + 1]; + int p; + int error = 0; + + /* Parse through the options which is a comma seperated list */ + + while (options_str && *options_str && !error) { + memset(cur_opt, 0, MAX_OPT_LEN + 1); + p = 0; + + while (*options_str == ',') + options_str++; + + while (*options_str && *options_str != ',') { + if (p < MAX_OPT_LEN) { + cur_opt[p] = *options_str; + p++; + } + options_str++; + } + + if (!strcmp(cur_opt, "inband-tags")) { + options->inband_tags = 1; + } else if (!strcmp(cur_opt, "tags-ecc-off")) { + options->tags_ecc_on = 0; + options->tags_ecc_overridden = 1; + } else if (!strcmp(cur_opt, "tags-ecc-on")) { + options->tags_ecc_on = 1; + options->tags_ecc_overridden = 1; + } else if (!strcmp(cur_opt, "lazy-loading-off")) { + options->lazy_loading_enabled = 0; + options->lazy_loading_overridden = 1; + } else if (!strcmp(cur_opt, "lazy-loading-on")) { + options->lazy_loading_enabled = 1; + options->lazy_loading_overridden = 1; + } else if (!strcmp(cur_opt, "empty-lost-and-found-off")) { + options->empty_lost_and_found = 0; + options->empty_lost_and_found_overridden = 1; + } else if (!strcmp(cur_opt, "empty-lost-and-found-on")) { + options->empty_lost_and_found = 1; + options->empty_lost_and_found_overridden = 1; + } else if (!strcmp(cur_opt, "no-cache")) { + options->no_cache = 1; + } else if (!strcmp(cur_opt, "no-checkpoint-read")) { + options->skip_checkpoint_read = 1; + } else if (!strcmp(cur_opt, "no-checkpoint-write")) { + options->skip_checkpoint_write = 1; + } else if (!strcmp(cur_opt, "no-checkpoint")) { + options->skip_checkpoint_read = 1; + options->skip_checkpoint_write = 1; + } else { + printk(KERN_INFO "yaffs: Bad mount option \"%s\"\n", + cur_opt); + error = 1; + } + } + + return error; +} + +static struct address_space_operations yaffs_file_address_operations = { + .readpage = yaffs_readpage, + .writepage = yaffs_writepage, + .write_begin = yaffs_write_begin, + .write_end = yaffs_write_end, +}; + + + +static const struct inode_operations yaffs_file_inode_operations = { + .setattr = 
yaffs_setattr, +#ifdef CONFIG_YAFFS_XATTR + .setxattr = yaffs_setxattr, + .getxattr = yaffs_getxattr, + .listxattr = yaffs_listxattr, + .removexattr = yaffs_removexattr, +#endif +}; + +static const struct inode_operations yaffs_symlink_inode_operations = { + .readlink = yaffs_readlink, + .follow_link = yaffs_follow_link, + .put_link = yaffs_put_link, + .setattr = yaffs_setattr, +#ifdef CONFIG_YAFFS_XATTR + .setxattr = yaffs_setxattr, + .getxattr = yaffs_getxattr, + .listxattr = yaffs_listxattr, + .removexattr = yaffs_removexattr, +#endif +}; + +static void yaffs_fill_inode_from_obj(struct inode *inode, + struct yaffs_obj *obj) +{ + if (inode && obj) { + + /* Check mode against the variant type and attempt to repair if broken. */ + u32 mode = obj->yst_mode; + switch (obj->variant_type) { + case YAFFS_OBJECT_TYPE_FILE: + if (!S_ISREG(mode)) { + obj->yst_mode &= ~S_IFMT; + obj->yst_mode |= S_IFREG; + } + + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + if (!S_ISLNK(mode)) { + obj->yst_mode &= ~S_IFMT; + obj->yst_mode |= S_IFLNK; + } + + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + if (!S_ISDIR(mode)) { + obj->yst_mode &= ~S_IFMT; + obj->yst_mode |= S_IFDIR; + } + + break; + case YAFFS_OBJECT_TYPE_UNKNOWN: + case YAFFS_OBJECT_TYPE_HARDLINK: + case YAFFS_OBJECT_TYPE_SPECIAL: + default: + /* TODO? */ + break; + } + + inode->i_flags |= S_NOATIME; + + inode->i_ino = obj->obj_id; + inode->i_mode = obj->yst_mode; + inode->i_uid = obj->yst_uid; + inode->i_gid = obj->yst_gid; + + inode->i_rdev = old_decode_dev(obj->yst_rdev); + + inode->i_atime.tv_sec = (time_t) (obj->yst_atime); + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_sec = (time_t) obj->yst_mtime; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_sec = (time_t) obj->yst_ctime; + inode->i_ctime.tv_nsec = 0; + inode->i_size = yaffs_get_obj_length(obj); + inode->i_blocks = (inode->i_size + 511) >> 9; + + inode->i_nlink = yaffs_get_obj_link_count(obj); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_fill_inode mode %x uid %d gid %d size %d count %d", + inode->i_mode, inode->i_uid, inode->i_gid, + (int)inode->i_size, atomic_read(&inode->i_count)); + + switch (obj->yst_mode & S_IFMT) { + default: /* fifo, device or socket */ + init_special_inode(inode, obj->yst_mode, + old_decode_dev(obj->yst_rdev)); + break; + case S_IFREG: /* file */ + inode->i_op = &yaffs_file_inode_operations; + inode->i_fop = &yaffs_file_operations; + inode->i_mapping->a_ops = + &yaffs_file_address_operations; + break; + case S_IFDIR: /* directory */ + inode->i_op = &yaffs_dir_inode_operations; + inode->i_fop = &yaffs_dir_operations; + break; + case S_IFLNK: /* symlink */ + inode->i_op = &yaffs_symlink_inode_operations; + break; + } + + yaffs_inode_to_obj_lv(inode) = obj; + + obj->my_inode = inode; + + } else { + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_fill_inode invalid parameters"); + } +} + +static void yaffs_put_super(struct super_block *sb) +{ + struct yaffs_dev *dev = yaffs_super_to_dev(sb); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_put_super"); + + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_BACKGROUND, + "Shutting down yaffs background thread"); + yaffs_bg_stop(dev); + yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_BACKGROUND, + "yaffs background thread shut down"); + + yaffs_gross_lock(dev); + + yaffs_flush_super(sb, 1); + + if (yaffs_dev_to_lc(dev)->put_super_fn) + yaffs_dev_to_lc(dev)->put_super_fn(sb); + + yaffs_deinitialise(dev); + + yaffs_gross_unlock(dev); + mutex_lock(&yaffs_context_lock); + list_del_init(&(yaffs_dev_to_lc(dev)->context_list)); + mutex_unlock(&yaffs_context_lock); + 
+ if (yaffs_dev_to_lc(dev)->spare_buffer) { + kfree(yaffs_dev_to_lc(dev)->spare_buffer); + yaffs_dev_to_lc(dev)->spare_buffer = NULL; + } + + kfree(dev); +} + +static void yaffs_mtd_put_super(struct super_block *sb) +{ + struct mtd_info *mtd = yaffs_dev_to_mtd(yaffs_super_to_dev(sb)); + + if (mtd->sync) + mtd->sync(mtd); + + put_mtd_device(mtd); +} + +static const struct super_operations yaffs_super_ops = { + .statfs = yaffs_statfs, + .put_super = yaffs_put_super, + .evict_inode = yaffs_evict_inode, + .sync_fs = yaffs_sync_fs, + .write_super = yaffs_write_super, +}; + +static struct super_block *yaffs_internal_read_super(int yaffs_version, + struct super_block *sb, + void *data, int silent) +{ + int n_blocks; + struct inode *inode = NULL; + struct dentry *root; + struct yaffs_dev *dev = 0; + char devname_buf[BDEVNAME_SIZE + 1]; + struct mtd_info *mtd; + int err; + char *data_str = (char *)data; + struct yaffs_linux_context *context = NULL; + struct yaffs_param *param; + + int read_only = 0; + + struct yaffs_options options; + + unsigned mount_id; + int found; + struct yaffs_linux_context *context_iterator; + struct list_head *l; + + sb->s_magic = YAFFS_MAGIC; + sb->s_op = &yaffs_super_ops; + sb->s_flags |= MS_NOATIME; + + read_only = ((sb->s_flags & MS_RDONLY) != 0); + + sb->s_export_op = &yaffs_export_ops; + + if (!sb) + printk(KERN_INFO "yaffs: sb is NULL\n"); + else if (!sb->s_dev) + printk(KERN_INFO "yaffs: sb->s_dev is NULL\n"); + else if (!yaffs_devname(sb, devname_buf)) + printk(KERN_INFO "yaffs: devname is NULL\n"); + else + printk(KERN_INFO "yaffs: dev is %d name is \"%s\" %s\n", + sb->s_dev, + yaffs_devname(sb, devname_buf), read_only ? "ro" : "rw"); + + if (!data_str) + data_str = ""; + + printk(KERN_INFO "yaffs: passed flags \"%s\"\n", data_str); + + memset(&options, 0, sizeof(options)); + + if (yaffs_parse_options(&options, data_str)) { + /* Option parsing failed */ + return NULL; + } + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_read_super: Using yaffs%d", yaffs_version); + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_read_super: block size %d", (int)(sb->s_blocksize)); + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Attempting MTD mount of %u.%u,\"%s\"", + MAJOR(sb->s_dev), MINOR(sb->s_dev), + yaffs_devname(sb, devname_buf)); + + /* Check it's an mtd device..... 
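A yaffs mount is expected to sit on an mtdblock device (e.g. /dev/mtdblock3 for MTD partition 3), so the superblock's device major must be MTD_BLOCK_MAJOR and the minor doubles as the MTD index handed to get_mtd_device() just below.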
*/ + if (MAJOR(sb->s_dev) != MTD_BLOCK_MAJOR) + return NULL; /* This isn't an mtd device */ + + /* Get the device */ + mtd = get_mtd_device(NULL, MINOR(sb->s_dev)); + if (!mtd) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device #%u doesn't appear to exist", + MINOR(sb->s_dev)); + return NULL; + } + /* Check it's NAND */ + if (mtd->type != MTD_NANDFLASH) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device is not NAND it's type %d", + mtd->type); + return NULL; + } + + yaffs_trace(YAFFS_TRACE_OS, " erase %p", mtd->erase); + yaffs_trace(YAFFS_TRACE_OS, " read %p", mtd->read); + yaffs_trace(YAFFS_TRACE_OS, " write %p", mtd->write); + yaffs_trace(YAFFS_TRACE_OS, " readoob %p", mtd->read_oob); + yaffs_trace(YAFFS_TRACE_OS, " writeoob %p", mtd->write_oob); + yaffs_trace(YAFFS_TRACE_OS, " block_isbad %p", mtd->block_isbad); + yaffs_trace(YAFFS_TRACE_OS, " block_markbad %p", mtd->block_markbad); + yaffs_trace(YAFFS_TRACE_OS, " %s %d", WRITE_SIZE_STR, WRITE_SIZE(mtd)); + yaffs_trace(YAFFS_TRACE_OS, " oobsize %d", mtd->oobsize); + yaffs_trace(YAFFS_TRACE_OS, " erasesize %d", mtd->erasesize); + yaffs_trace(YAFFS_TRACE_OS, " size %lld", mtd->size); + +#ifdef CONFIG_YAFFS_AUTO_YAFFS2 + + if (yaffs_version == 1 && WRITE_SIZE(mtd) >= 2048) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "auto selecting yaffs2"); + yaffs_version = 2; + } + + /* Added NCB 26/5/2006 for completeness */ + if (yaffs_version == 2 && !options.inband_tags + && WRITE_SIZE(mtd) == 512) { + yaffs_trace(YAFFS_TRACE_ALWAYS, "auto selecting yaffs1"); + yaffs_version = 1; + } +#endif + + if (yaffs_version == 2) { + /* Check for version 2 style functions */ + if (!mtd->erase || + !mtd->block_isbad || + !mtd->block_markbad || + !mtd->read || + !mtd->write || !mtd->read_oob || !mtd->write_oob) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device does not support required functions"); + return NULL; + } + + if ((WRITE_SIZE(mtd) < YAFFS_MIN_YAFFS2_CHUNK_SIZE || + mtd->oobsize < YAFFS_MIN_YAFFS2_SPARE_SIZE) && + !options.inband_tags) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device does not have the right page sizes"); + return NULL; + } + } else { + /* Check for V1 style functions */ + if (!mtd->erase || + !mtd->read || + !mtd->write || !mtd->read_oob || !mtd->write_oob) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device does not support required functions"); + return NULL; + } + + if (WRITE_SIZE(mtd) < YAFFS_BYTES_PER_CHUNK || + mtd->oobsize != YAFFS_BYTES_PER_SPARE) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "MTD device does not support have the right page sizes"); + return NULL; + } + } + + /* OK, so if we got here, we have an MTD that's NAND and looks + * like it has the right capabilities + * Set the struct yaffs_dev up for mtd + */ + + if (!read_only && !(mtd->flags & MTD_WRITEABLE)) { + read_only = 1; + printk(KERN_INFO + "yaffs: mtd is read only, setting superblock read only"); + sb->s_flags |= MS_RDONLY; + } + + dev = kmalloc(sizeof(struct yaffs_dev), GFP_KERNEL); + context = kmalloc(sizeof(struct yaffs_linux_context), GFP_KERNEL); + + if (!dev || !context) { + if (dev) + kfree(dev); + if (context) + kfree(context); + dev = NULL; + context = NULL; + } + + if (!dev) { + /* Deep shit could not allocate device structure */ + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs_read_super failed trying to allocate yaffs_dev"); + return NULL; + } + memset(dev, 0, sizeof(struct yaffs_dev)); + param = &(dev->param); + + memset(context, 0, sizeof(struct yaffs_linux_context)); + dev->os_context = context; + INIT_LIST_HEAD(&(context->context_list)); + context->dev = dev; + 
context->super = sb; + + dev->read_only = read_only; + + sb->s_fs_info = dev; + + dev->driver_context = mtd; + param->name = mtd->name; + + /* Set up the memory size parameters.... */ + + n_blocks = + YCALCBLOCKS(mtd->size, + (YAFFS_CHUNKS_PER_BLOCK * YAFFS_BYTES_PER_CHUNK)); + + param->start_block = 0; + param->end_block = n_blocks - 1; + param->chunks_per_block = YAFFS_CHUNKS_PER_BLOCK; + param->total_bytes_per_chunk = YAFFS_BYTES_PER_CHUNK; + param->n_reserved_blocks = 5; + param->n_caches = (options.no_cache) ? 0 : 10; + param->inband_tags = options.inband_tags; + +#ifdef CONFIG_YAFFS_DISABLE_LAZY_LOAD + param->disable_lazy_load = 1; +#endif +#ifdef CONFIG_YAFFS_XATTR + param->enable_xattr = 1; +#endif + if (options.lazy_loading_overridden) + param->disable_lazy_load = !options.lazy_loading_enabled; + +#ifdef CONFIG_YAFFS_DISABLE_TAGS_ECC + param->no_tags_ecc = 1; +#endif + +#ifdef CONFIG_YAFFS_DISABLE_BACKGROUND +#else + param->defered_dir_update = 1; +#endif + + if (options.tags_ecc_overridden) + param->no_tags_ecc = !options.tags_ecc_on; + +#ifdef CONFIG_YAFFS_EMPTY_LOST_AND_FOUND + param->empty_lost_n_found = 1; +#endif + +#ifdef CONFIG_YAFFS_DISABLE_BLOCK_REFRESHING + param->refresh_period = 0; +#else + param->refresh_period = 500; +#endif + +#ifdef CONFIG_YAFFS_ALWAYS_CHECK_CHUNK_ERASED + param->always_check_erased = 1; +#endif + + if (options.empty_lost_and_found_overridden) + param->empty_lost_n_found = options.empty_lost_and_found; + + /* ... and the functions. */ + if (yaffs_version == 2) { + param->write_chunk_tags_fn = nandmtd2_write_chunk_tags; + param->read_chunk_tags_fn = nandmtd2_read_chunk_tags; + param->bad_block_fn = nandmtd2_mark_block_bad; + param->query_block_fn = nandmtd2_query_block; + yaffs_dev_to_lc(dev)->spare_buffer = + kmalloc(mtd->oobsize, GFP_NOFS); + param->is_yaffs2 = 1; + param->total_bytes_per_chunk = mtd->writesize; + param->chunks_per_block = mtd->erasesize / mtd->writesize; + n_blocks = YCALCBLOCKS(mtd->size, mtd->erasesize); + + param->start_block = 0; + param->end_block = n_blocks - 1; + } else { + /* use the MTD interface in yaffs_mtdif1.c */ + param->write_chunk_tags_fn = nandmtd1_write_chunk_tags; + param->read_chunk_tags_fn = nandmtd1_read_chunk_tags; + param->bad_block_fn = nandmtd1_mark_block_bad; + param->query_block_fn = nandmtd1_query_block; + param->is_yaffs2 = 0; + } + /* ... and common functions */ + param->erase_fn = nandmtd_erase_block; + param->initialise_flash_fn = nandmtd_initialise; + + yaffs_dev_to_lc(dev)->put_super_fn = yaffs_mtd_put_super; + + param->sb_dirty_fn = yaffs_touch_super; + param->gc_control = yaffs_gc_control_callback; + + yaffs_dev_to_lc(dev)->super = sb; + +#ifndef CONFIG_YAFFS_DOES_ECC + param->use_nand_ecc = 1; +#endif + + param->skip_checkpt_rd = options.skip_checkpoint_read; + param->skip_checkpt_wr = options.skip_checkpoint_write; + + mutex_lock(&yaffs_context_lock); + /* Get a mount id */ + found = 0; + for (mount_id = 0; !found; mount_id++) { + found = 1; + list_for_each(l, &yaffs_context_list) { + context_iterator = + list_entry(l, struct yaffs_linux_context, + context_list); + if (context_iterator->mount_id == mount_id) + found = 0; + } + } + context->mount_id = mount_id; + + list_add_tail(&(yaffs_dev_to_lc(dev)->context_list), + &yaffs_context_list); + mutex_unlock(&yaffs_context_lock); + + /* Directory search handling... 
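This seeds the per-mount list of live readdir cursors and registers yaffs_remove_obj_callback() with the core, so that deleting an object advances any search context currently parked on it (the mechanism described above yaffs_readdir()).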
*/ + INIT_LIST_HEAD(&(yaffs_dev_to_lc(dev)->search_contexts)); + param->remove_obj_fn = yaffs_remove_obj_callback; + + mutex_init(&(yaffs_dev_to_lc(dev)->gross_lock)); + + yaffs_gross_lock(dev); + + err = yaffs_guts_initialise(dev); + + yaffs_trace(YAFFS_TRACE_OS, + "yaffs_read_super: guts initialised %s", + (err == YAFFS_OK) ? "OK" : "FAILED"); + + if (err == YAFFS_OK) + yaffs_bg_start(dev); + + if (!context->bg_thread) + param->defered_dir_update = 0; + + /* Release lock before yaffs_get_inode() */ + yaffs_gross_unlock(dev); + + /* Create root inode */ + if (err == YAFFS_OK) + inode = yaffs_get_inode(sb, S_IFDIR | 0755, 0, yaffs_root(dev)); + + if (!inode) + return NULL; + + inode->i_op = &yaffs_dir_inode_operations; + inode->i_fop = &yaffs_dir_operations; + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_read_super: got root inode"); + + root = d_alloc_root(inode); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_read_super: d_alloc_root done"); + + if (!root) { + iput(inode); + return NULL; + } + sb->s_root = root; + sb->s_dirt = !dev->is_checkpointed; + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs_read_super: is_checkpointed %d", + dev->is_checkpointed); + + yaffs_trace(YAFFS_TRACE_OS, "yaffs_read_super: done"); + return sb; +} + +static int yaffs_internal_read_super_mtd(struct super_block *sb, void *data, + int silent) +{ + return yaffs_internal_read_super(1, sb, data, silent) ? 0 : -EINVAL; +} + +static int yaffs_read_super(struct file_system_type *fs, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + + return get_sb_bdev(fs, flags, dev_name, data, + yaffs_internal_read_super_mtd, mnt); +} + +static struct file_system_type yaffs_fs_type = { + .owner = THIS_MODULE, + .name = "yaffs", + .get_sb = yaffs_read_super, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +#ifdef CONFIG_YAFFS_YAFFS2 + +static int yaffs2_internal_read_super_mtd(struct super_block *sb, void *data, + int silent) +{ + return yaffs_internal_read_super(2, sb, data, silent) ? 0 : -EINVAL; +} + +static int yaffs2_read_super(struct file_system_type *fs, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) +{ + return get_sb_bdev(fs, flags, dev_name, data, + yaffs2_internal_read_super_mtd, mnt); +} + +static struct file_system_type yaffs2_fs_type = { + .owner = THIS_MODULE, + .name = "yaffs2", + .get_sb = yaffs2_read_super, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#endif /* CONFIG_YAFFS_YAFFS2 */ + +static struct proc_dir_entry *my_proc_entry; + +static char *yaffs_dump_dev_part0(char *buf, struct yaffs_dev *dev) +{ + struct yaffs_param *param = &dev->param; + buf += sprintf(buf, "start_block........... %d\n", param->start_block); + buf += sprintf(buf, "end_block............. %d\n", param->end_block); + buf += sprintf(buf, "total_bytes_per_chunk. %d\n", + param->total_bytes_per_chunk); + buf += sprintf(buf, "use_nand_ecc.......... %d\n", + param->use_nand_ecc); + buf += sprintf(buf, "no_tags_ecc........... %d\n", param->no_tags_ecc); + buf += sprintf(buf, "is_yaffs2............. %d\n", param->is_yaffs2); + buf += sprintf(buf, "inband_tags........... %d\n", param->inband_tags); + buf += sprintf(buf, "empty_lost_n_found.... %d\n", + param->empty_lost_n_found); + buf += sprintf(buf, "disable_lazy_load..... %d\n", + param->disable_lazy_load); + buf += sprintf(buf, "refresh_period........ %d\n", + param->refresh_period); + buf += sprintf(buf, "n_caches.............. %d\n", param->n_caches); + buf += sprintf(buf, "n_reserved_blocks..... 
%d\n", + param->n_reserved_blocks); + buf += sprintf(buf, "always_check_erased... %d\n", + param->always_check_erased); + + return buf; +} + +static char *yaffs_dump_dev_part1(char *buf, struct yaffs_dev *dev) +{ + buf += + sprintf(buf, "data_bytes_per_chunk.. %d\n", + dev->data_bytes_per_chunk); + buf += sprintf(buf, "chunk_grp_bits........ %d\n", dev->chunk_grp_bits); + buf += sprintf(buf, "chunk_grp_size........ %d\n", dev->chunk_grp_size); + buf += + sprintf(buf, "n_erased_blocks....... %d\n", dev->n_erased_blocks); + buf += + sprintf(buf, "blocks_in_checkpt..... %d\n", dev->blocks_in_checkpt); + buf += sprintf(buf, "\n"); + buf += sprintf(buf, "n_tnodes.............. %d\n", dev->n_tnodes); + buf += sprintf(buf, "n_obj................. %d\n", dev->n_obj); + buf += sprintf(buf, "n_free_chunks......... %d\n", dev->n_free_chunks); + buf += sprintf(buf, "\n"); + buf += sprintf(buf, "n_page_writes......... %u\n", dev->n_page_writes); + buf += sprintf(buf, "n_page_reads.......... %u\n", dev->n_page_reads); + buf += sprintf(buf, "n_erasures............ %u\n", dev->n_erasures); + buf += sprintf(buf, "n_gc_copies........... %u\n", dev->n_gc_copies); + buf += sprintf(buf, "all_gcs............... %u\n", dev->all_gcs); + buf += + sprintf(buf, "passive_gc_count...... %u\n", dev->passive_gc_count); + buf += + sprintf(buf, "oldest_dirty_gc_count. %u\n", + dev->oldest_dirty_gc_count); + buf += sprintf(buf, "n_gc_blocks........... %u\n", dev->n_gc_blocks); + buf += sprintf(buf, "bg_gcs................ %u\n", dev->bg_gcs); + buf += + sprintf(buf, "n_retired_writes...... %u\n", dev->n_retired_writes); + buf += + sprintf(buf, "n_retired_blocks...... %u\n", dev->n_retired_blocks); + buf += sprintf(buf, "n_ecc_fixed........... %u\n", dev->n_ecc_fixed); + buf += sprintf(buf, "n_ecc_unfixed......... %u\n", dev->n_ecc_unfixed); + buf += + sprintf(buf, "n_tags_ecc_fixed...... %u\n", dev->n_tags_ecc_fixed); + buf += + sprintf(buf, "n_tags_ecc_unfixed.... %u\n", + dev->n_tags_ecc_unfixed); + buf += sprintf(buf, "cache_hits............ %u\n", dev->cache_hits); + buf += + sprintf(buf, "n_deleted_files....... %u\n", dev->n_deleted_files); + buf += + sprintf(buf, "n_unlinked_files...... %u\n", dev->n_unlinked_files); + buf += sprintf(buf, "refresh_count......... %u\n", dev->refresh_count); + buf += sprintf(buf, "n_bg_deletions........ %u\n", dev->n_bg_deletions); + + return buf; +} + +static int yaffs_proc_read(char *page, + char **start, + off_t offset, int count, int *eof, void *data) +{ + struct list_head *item; + char *buf = page; + int step = offset; + int n = 0; + + /* Get proc_file_read() to step 'offset' by one on each sucessive call. + * We use 'offset' (*ppos) to indicate where we are in dev_list. + * This also assumes the user has posted a read buffer large + * enough to hold the complete output; but that's life in /proc. + */ + + *(int *)start = 1; + + /* Print header first */ + if (step == 0) + buf += sprintf(buf, "YAFFS built:" __DATE__ " " __TIME__ "\n"); + else if (step == 1) + buf += sprintf(buf, "\n"); + else { + step -= 2; + + mutex_lock(&yaffs_context_lock); + + /* Locate and print the Nth entry. Order N-squared but N is small. 
*/ + list_for_each(item, &yaffs_context_list) { + struct yaffs_linux_context *dc = + list_entry(item, struct yaffs_linux_context, + context_list); + struct yaffs_dev *dev = dc->dev; + + if (n < (step & ~1)) { + n += 2; + continue; + } + if ((step & 1) == 0) { + buf += + sprintf(buf, "\nDevice %d \"%s\"\n", n, + dev->param.name); + buf = yaffs_dump_dev_part0(buf, dev); + } else { + buf = yaffs_dump_dev_part1(buf, dev); + } + + break; + } + mutex_unlock(&yaffs_context_lock); + } + + return buf - page < count ? buf - page : count; +} + + +/** + * Set the verbosity of the warnings and error messages. + * + * Note that the names can only be a..z or _ with the current code. + */ + +static struct { + char *mask_name; + unsigned mask_bitfield; +} mask_flags[] = { + {"allocate", YAFFS_TRACE_ALLOCATE}, + {"always", YAFFS_TRACE_ALWAYS}, + {"background", YAFFS_TRACE_BACKGROUND}, + {"bad_blocks", YAFFS_TRACE_BAD_BLOCKS}, + {"buffers", YAFFS_TRACE_BUFFERS}, + {"bug", YAFFS_TRACE_BUG}, + {"checkpt", YAFFS_TRACE_CHECKPOINT}, + {"deletion", YAFFS_TRACE_DELETION}, + {"erase", YAFFS_TRACE_ERASE}, + {"error", YAFFS_TRACE_ERROR}, + {"gc_detail", YAFFS_TRACE_GC_DETAIL}, + {"gc", YAFFS_TRACE_GC}, + {"lock", YAFFS_TRACE_LOCK}, + {"mtd", YAFFS_TRACE_MTD}, + {"nandaccess", YAFFS_TRACE_NANDACCESS}, + {"os", YAFFS_TRACE_OS}, + {"scan_debug", YAFFS_TRACE_SCAN_DEBUG}, + {"scan", YAFFS_TRACE_SCAN}, + {"mount", YAFFS_TRACE_MOUNT}, + {"tracing", YAFFS_TRACE_TRACING}, + {"sync", YAFFS_TRACE_SYNC}, + {"write", YAFFS_TRACE_WRITE}, + {"verify", YAFFS_TRACE_VERIFY}, + {"verify_nand", YAFFS_TRACE_VERIFY_NAND}, + {"verify_full", YAFFS_TRACE_VERIFY_FULL}, + {"verify_all", YAFFS_TRACE_VERIFY_ALL}, + {"all", 0xffffffff}, + {"none", 0}, + {NULL, 0}, +}; + +#define MAX_MASK_NAME_LENGTH 40 +static int yaffs_proc_write_trace_options(struct file *file, const char *buf, + unsigned long count, void *data) +{ + unsigned rg = 0, mask_bitfield; + char *end; + char *mask_name; + const char *x; + char substring[MAX_MASK_NAME_LENGTH + 1]; + int i; + int done = 0; + int add, len = 0; + int pos = 0; + + rg = yaffs_trace_mask; + + while (!done && (pos < count)) { + done = 1; + while ((pos < count) && isspace(buf[pos])) + pos++; + + switch (buf[pos]) { + case '+': + case '-': + case '=': + add = buf[pos]; + pos++; + break; + + default: + add = ' '; + break; + } + mask_name = NULL; + + mask_bitfield = simple_strtoul(buf + pos, &end, 0); + + if (end > buf + pos) { + mask_name = "numeral"; + len = end - (buf + pos); + pos += len; + done = 0; + } else { + for (x = buf + pos, i = 0; + (*x == '_' || (*x >= 'a' && *x <= 'z')) && + i < MAX_MASK_NAME_LENGTH; x++, i++, pos++) + substring[i] = *x; + substring[i] = '\0'; + + for (i = 0; mask_flags[i].mask_name != NULL; i++) { + if (strcmp(substring, mask_flags[i].mask_name) + == 0) { + mask_name = mask_flags[i].mask_name; + mask_bitfield = + mask_flags[i].mask_bitfield; + done = 0; + break; + } + } + } + + if (mask_name != NULL) { + done = 0; + switch (add) { + case '-': + rg &= ~mask_bitfield; + break; + case '+': + rg |= mask_bitfield; + break; + case '=': + rg = mask_bitfield; + break; + default: + rg |= mask_bitfield; + break; + } + } + } + + yaffs_trace_mask = rg | YAFFS_TRACE_ALWAYS; + + printk(KERN_DEBUG "new trace = 0x%08X\n", yaffs_trace_mask); + + if (rg & YAFFS_TRACE_ALWAYS) { + for (i = 0; mask_flags[i].mask_name != NULL; i++) { + char flag; + flag = ((rg & mask_flags[i].mask_bitfield) == + mask_flags[i].mask_bitfield) ? 
'+' : '-'; + printk(KERN_DEBUG "%c%s\n", flag, + mask_flags[i].mask_name); + } + } + + return count; +} + +static int yaffs_proc_write(struct file *file, const char *buf, + unsigned long count, void *data) +{ + return yaffs_proc_write_trace_options(file, buf, count, data); +} + +/* Stuff to handle installation of file systems */ +struct file_system_to_install { + struct file_system_type *fst; + int installed; +}; + +static struct file_system_to_install fs_to_install[] = { + {&yaffs_fs_type, 0}, + {&yaffs2_fs_type, 0}, + {NULL, 0} +}; + +static int __init init_yaffs_fs(void) +{ + int error = 0; + struct file_system_to_install *fsinst; + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs built " __DATE__ " " __TIME__ " Installing."); + +#ifdef CONFIG_YAFFS_ALWAYS_CHECK_CHUNK_ERASED + yaffs_trace(YAFFS_TRACE_ALWAYS, + "\n\nYAFFS-WARNING CONFIG_YAFFS_ALWAYS_CHECK_CHUNK_ERASED selected.\n\n\n"); +#endif + + mutex_init(&yaffs_context_lock); + + /* Install the proc_fs entries */ + my_proc_entry = create_proc_entry("yaffs", + S_IRUGO | S_IFREG, YPROC_ROOT); + + if (my_proc_entry) { + my_proc_entry->write_proc = yaffs_proc_write; + my_proc_entry->read_proc = yaffs_proc_read; + my_proc_entry->data = NULL; + } else { + return -ENOMEM; + } + + + /* Now add the file system entries */ + + fsinst = fs_to_install; + + while (fsinst->fst && !error) { + error = register_filesystem(fsinst->fst); + if (!error) + fsinst->installed = 1; + fsinst++; + } + + /* Any errors? uninstall */ + if (error) { + fsinst = fs_to_install; + + while (fsinst->fst) { + if (fsinst->installed) { + unregister_filesystem(fsinst->fst); + fsinst->installed = 0; + } + fsinst++; + } + } + + return error; +} + +static void __exit exit_yaffs_fs(void) +{ + + struct file_system_to_install *fsinst; + + yaffs_trace(YAFFS_TRACE_ALWAYS, + "yaffs built " __DATE__ " " __TIME__ " removing."); + + remove_proc_entry("yaffs", YPROC_ROOT); + + fsinst = fs_to_install; + + while (fsinst->fst) { + if (fsinst->installed) { + unregister_filesystem(fsinst->fst); + fsinst->installed = 0; + } + fsinst++; + } +} + +module_init(init_yaffs_fs) + module_exit(exit_yaffs_fs) + + MODULE_DESCRIPTION("YAFFS2 - a NAND specific flash file system"); +MODULE_AUTHOR("Charles Manning, Aleph One Ltd., 2002-2010"); +MODULE_LICENSE("GPL"); diff --git a/fs/yaffs2/yaffs_yaffs1.c b/fs/yaffs2/yaffs_yaffs1.c new file mode 100644 index 00000000..9eb60308 --- /dev/null +++ b/fs/yaffs2/yaffs_yaffs1.c @@ -0,0 +1,433 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "yaffs_yaffs1.h" +#include "yportenv.h" +#include "yaffs_trace.h" +#include "yaffs_bitmap.h" +#include "yaffs_getblockinfo.h" +#include "yaffs_nand.h" +#include "yaffs_attribs.h" + +int yaffs1_scan(struct yaffs_dev *dev) +{ + struct yaffs_ext_tags tags; + int blk; + int result; + + int chunk; + int c; + int deleted; + enum yaffs_block_state state; + struct yaffs_obj *hard_list = NULL; + struct yaffs_block_info *bi; + u32 seq_number; + struct yaffs_obj_hdr *oh; + struct yaffs_obj *in; + struct yaffs_obj *parent; + + int alloc_failed = 0; + + struct yaffs_shadow_fixer *shadow_fixers = NULL; + + u8 *chunk_data; + + yaffs_trace(YAFFS_TRACE_SCAN, + "yaffs1_scan starts intstartblk %d intendblk %d...", + dev->internal_start_block, dev->internal_end_block); + + chunk_data = yaffs_get_temp_buffer(dev, __LINE__); + + dev->seq_number = YAFFS_LOWEST_SEQUENCE_NUMBER; + + /* Scan all the blocks to determine their state */ + bi = dev->block_info; + for (blk = dev->internal_start_block; blk <= dev->internal_end_block; + blk++) { + yaffs_clear_chunk_bits(dev, blk); + bi->pages_in_use = 0; + bi->soft_del_pages = 0; + + yaffs_query_init_block_state(dev, blk, &state, &seq_number); + + bi->block_state = state; + bi->seq_number = seq_number; + + if (bi->seq_number == YAFFS_SEQUENCE_BAD_BLOCK) + bi->block_state = state = YAFFS_BLOCK_STATE_DEAD; + + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, + "Block scanning block %d state %d seq %d", + blk, state, seq_number); + + if (state == YAFFS_BLOCK_STATE_DEAD) { + yaffs_trace(YAFFS_TRACE_BAD_BLOCKS, + "block %d is bad", blk); + } else if (state == YAFFS_BLOCK_STATE_EMPTY) { + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, "Block empty "); + dev->n_erased_blocks++; + dev->n_free_chunks += dev->param.chunks_per_block; + } + bi++; + } + + /* For each block.... */ + for (blk = dev->internal_start_block; + !alloc_failed && blk <= dev->internal_end_block; blk++) { + + cond_resched(); + + bi = yaffs_get_block_info(dev, blk); + state = bi->block_state; + + deleted = 0; + + /* For each chunk in each block that needs scanning.... */ + for (c = 0; !alloc_failed && c < dev->param.chunks_per_block && + state == YAFFS_BLOCK_STATE_NEEDS_SCANNING; c++) { + /* Read the tags and decide what to do */ + chunk = blk * dev->param.chunks_per_block + c; + + result = yaffs_rd_chunk_tags_nand(dev, chunk, NULL, + &tags); + + /* Let's have a good look at this chunk... */ + + if (tags.ecc_result == YAFFS_ECC_RESULT_UNFIXED + || tags.is_deleted) { + /* YAFFS1 only... + * A deleted chunk + */ + deleted++; + dev->n_free_chunks++; + /*T((" %d %d deleted\n",blk,c)); */ + } else if (!tags.chunk_used) { + /* An unassigned chunk in the block + * This means that either the block is empty or + * this is the one being allocated from + */ + + if (c == 0) { + /* We're looking at the first chunk in the block so the block is unused */ + state = YAFFS_BLOCK_STATE_EMPTY; + dev->n_erased_blocks++; + } else { + /* this is the block being allocated from */ + yaffs_trace(YAFFS_TRACE_SCAN, + " Allocating from %d %d", + blk, c); + state = YAFFS_BLOCK_STATE_ALLOCATING; + dev->alloc_block = blk; + dev->alloc_page = c; + dev->alloc_block_finder = blk; + /* Set block finder here to encourage the allocator to go forth from here. */ + + } + + dev->n_free_chunks += + (dev->param.chunks_per_block - c); + } else if (tags.chunk_id > 0) { + /* chunk_id > 0 so it is a data chunk... 
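+				 * (For illustration, assuming 2048-byte data chunks: a chunk tagged
+				 * chunk_id = 3 carrying n_bytes = 100 ends at
+				 * endpos = (3 - 1) * 2048 + 100 = 4196, so scanned_size, and
+				 * file_size when use_header_file_size is not set, is advanced to
+				 * at least 4196 below.)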
*/ + unsigned int endpos; + + yaffs_set_chunk_bit(dev, blk, c); + bi->pages_in_use++; + + in = yaffs_find_or_create_by_number(dev, + tags.obj_id, + YAFFS_OBJECT_TYPE_FILE); + /* PutChunkIntoFile checks for a clash (two data chunks with + * the same chunk_id). + */ + + if (!in) + alloc_failed = 1; + + if (in) { + if (!yaffs_put_chunk_in_file + (in, tags.chunk_id, chunk, 1)) + alloc_failed = 1; + } + + endpos = + (tags.chunk_id - + 1) * dev->data_bytes_per_chunk + + tags.n_bytes; + if (in + && in->variant_type == + YAFFS_OBJECT_TYPE_FILE + && in->variant.file_variant.scanned_size < + endpos) { + in->variant.file_variant.scanned_size = + endpos; + if (!dev->param.use_header_file_size) { + in->variant. + file_variant.file_size = + in->variant. + file_variant.scanned_size; + } + + } + /* T((" %d %d data %d %d\n",blk,c,tags.obj_id,tags.chunk_id)); */ + } else { + /* chunk_id == 0, so it is an ObjectHeader. + * Thus, we read in the object header and make the object + */ + yaffs_set_chunk_bit(dev, blk, c); + bi->pages_in_use++; + + result = yaffs_rd_chunk_tags_nand(dev, chunk, + chunk_data, + NULL); + + oh = (struct yaffs_obj_hdr *)chunk_data; + + in = yaffs_find_by_number(dev, tags.obj_id); + if (in && in->variant_type != oh->type) { + /* This should not happen, but somehow + * Wev'e ended up with an obj_id that has been reused but not yet + * deleted, and worse still it has changed type. Delete the old object. + */ + + yaffs_del_obj(in); + + in = 0; + } + + in = yaffs_find_or_create_by_number(dev, + tags.obj_id, + oh->type); + + if (!in) + alloc_failed = 1; + + if (in && oh->shadows_obj > 0) { + + struct yaffs_shadow_fixer *fixer; + fixer = + kmalloc(sizeof + (struct yaffs_shadow_fixer), + GFP_NOFS); + if (fixer) { + fixer->next = shadow_fixers; + shadow_fixers = fixer; + fixer->obj_id = tags.obj_id; + fixer->shadowed_id = + oh->shadows_obj; + yaffs_trace(YAFFS_TRACE_SCAN, + " Shadow fixer: %d shadows %d", + fixer->obj_id, + fixer->shadowed_id); + + } + + } + + if (in && in->valid) { + /* We have already filled this one. We have a duplicate and need to resolve it. */ + + unsigned existing_serial = in->serial; + unsigned new_serial = + tags.serial_number; + + if (((existing_serial + 1) & 3) == + new_serial) { + /* Use new one - destroy the exisiting one */ + yaffs_chunk_del(dev, + in->hdr_chunk, + 1, __LINE__); + in->valid = 0; + } else { + /* Use existing - destroy this one. */ + yaffs_chunk_del(dev, chunk, 1, + __LINE__); + } + } + + if (in && !in->valid && + (tags.obj_id == YAFFS_OBJECTID_ROOT || + tags.obj_id == + YAFFS_OBJECTID_LOSTNFOUND)) { + /* We only load some info, don't fiddle with directory structure */ + in->valid = 1; + in->variant_type = oh->type; + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + in->hdr_chunk = chunk; + in->serial = tags.serial_number; + + } else if (in && !in->valid) { + /* we need to load this info */ + + in->valid = 1; + in->variant_type = oh->type; + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + in->hdr_chunk = chunk; + in->serial = tags.serial_number; + + yaffs_set_obj_name_from_oh(in, oh); + in->dirty = 0; + + /* directory stuff... 
+ * hook up to parent + */ + + parent = + yaffs_find_or_create_by_number + (dev, oh->parent_obj_id, + YAFFS_OBJECT_TYPE_DIRECTORY); + if (!parent) + alloc_failed = 1; + if (parent && parent->variant_type == + YAFFS_OBJECT_TYPE_UNKNOWN) { + /* Set up as a directory */ + parent->variant_type = + YAFFS_OBJECT_TYPE_DIRECTORY; + INIT_LIST_HEAD(&parent-> + variant.dir_variant.children); + } else if (!parent + || parent->variant_type != + YAFFS_OBJECT_TYPE_DIRECTORY) { + /* Hoosterman, another problem.... + * We're trying to use a non-directory as a directory + */ + + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: attempting to use non-directory as a directory in scan. Put in lost+found." + ); + parent = dev->lost_n_found; + } + + yaffs_add_obj_to_dir(parent, in); + + if (0 && (parent == dev->del_dir || + parent == + dev->unlinked_dir)) { + in->deleted = 1; /* If it is unlinked at start up then it wants deleting */ + dev->n_deleted_files++; + } + /* Note re hardlinks. + * Since we might scan a hardlink before its equivalent object is scanned + * we put them all in a list. + * After scanning is complete, we should have all the objects, so we run through this + * list and fix up all the chains. + */ + + switch (in->variant_type) { + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* Todo got a problem */ + break; + case YAFFS_OBJECT_TYPE_FILE: + if (dev->param. + use_header_file_size) + + in->variant. + file_variant.file_size + = oh->file_size; + + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + in->variant. + hardlink_variant.equiv_id = + oh->equiv_id; + in->hard_links.next = + (struct list_head *) + hard_list; + hard_list = in; + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + in->variant.symlink_variant. + alias = + yaffs_clone_str(oh->alias); + if (!in->variant. + symlink_variant.alias) + alloc_failed = 1; + break; + } + + } + } + } + + if (state == YAFFS_BLOCK_STATE_NEEDS_SCANNING) { + /* If we got this far while scanning, then the block is fully allocated. */ + state = YAFFS_BLOCK_STATE_FULL; + } + + if (state == YAFFS_BLOCK_STATE_ALLOCATING) { + /* If the block was partially allocated then treat it as fully allocated. */ + state = YAFFS_BLOCK_STATE_FULL; + dev->alloc_block = -1; + } + + bi->block_state = state; + + /* Now let's see if it was dirty */ + if (bi->pages_in_use == 0 && + !bi->has_shrink_hdr && + bi->block_state == YAFFS_BLOCK_STATE_FULL) { + yaffs_block_became_dirty(dev, blk); + } + + } + + /* Ok, we've done all the scanning. + * Fix up the hard link chains. + * We should now have scanned all the objects, now it's time to add these + * hardlinks. + */ + + yaffs_link_fixup(dev, hard_list); + + /* Fix up any shadowed objects */ + { + struct yaffs_shadow_fixer *fixer; + struct yaffs_obj *obj; + + while (shadow_fixers) { + fixer = shadow_fixers; + shadow_fixers = fixer->next; + /* Complete the rename transaction by deleting the shadowed object + * then setting the object header to unshadowed. 
+ */ + obj = yaffs_find_by_number(dev, fixer->shadowed_id); + if (obj) + yaffs_del_obj(obj); + + obj = yaffs_find_by_number(dev, fixer->obj_id); + + if (obj) + yaffs_update_oh(obj, NULL, 1, 0, 0, NULL); + + kfree(fixer); + } + } + + yaffs_release_temp_buffer(dev, chunk_data, __LINE__); + + if (alloc_failed) + return YAFFS_FAIL; + + yaffs_trace(YAFFS_TRACE_SCAN, "yaffs1_scan ends"); + + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_yaffs1.h b/fs/yaffs2/yaffs_yaffs1.h new file mode 100644 index 00000000..db23e049 --- /dev/null +++ b/fs/yaffs2/yaffs_yaffs1.h @@ -0,0 +1,22 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_YAFFS1_H__ +#define __YAFFS_YAFFS1_H__ + +#include "yaffs_guts.h" +int yaffs1_scan(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yaffs_yaffs2.c b/fs/yaffs2/yaffs_yaffs2.c new file mode 100644 index 00000000..33397af7 --- /dev/null +++ b/fs/yaffs2/yaffs_yaffs2.c @@ -0,0 +1,1598 @@ +/* + * YAFFS: Yet Another Flash File System. A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "yaffs_guts.h" +#include "yaffs_trace.h" +#include "yaffs_yaffs2.h" +#include "yaffs_checkptrw.h" +#include "yaffs_bitmap.h" +#include "yaffs_nand.h" +#include "yaffs_getblockinfo.h" +#include "yaffs_verify.h" +#include "yaffs_attribs.h" + +/* + * Checkpoints are really no benefit on very small partitions. + * + * To save space on small partitions don't bother with checkpoints unless + * the partition is at least this big. + */ +#define YAFFS_CHECKPOINT_MIN_BLOCKS 60 + +#define YAFFS_SMALL_HOLE_THRESHOLD 4 + +/* + * Oldest Dirty Sequence Number handling. + */ + +/* yaffs_calc_oldest_dirty_seq() + * yaffs2_find_oldest_dirty_seq() + * Calculate the oldest dirty sequence number if we don't know it. + */ +void yaffs_calc_oldest_dirty_seq(struct yaffs_dev *dev) +{ + int i; + unsigned seq; + unsigned block_no = 0; + struct yaffs_block_info *b; + + if (!dev->param.is_yaffs2) + return; + + /* Find the oldest dirty sequence number. */ + seq = dev->seq_number + 1; + b = dev->block_info; + for (i = dev->internal_start_block; i <= dev->internal_end_block; i++) { + if (b->block_state == YAFFS_BLOCK_STATE_FULL && + (b->pages_in_use - b->soft_del_pages) < + dev->param.chunks_per_block && b->seq_number < seq) { + seq = b->seq_number; + block_no = i; + } + b++; + } + + if (block_no) { + dev->oldest_dirty_seq = seq; + dev->oldest_dirty_block = block_no; + } + +} + +void yaffs2_find_oldest_dirty_seq(struct yaffs_dev *dev) +{ + if (!dev->param.is_yaffs2) + return; + + if (!dev->oldest_dirty_seq) + yaffs_calc_oldest_dirty_seq(dev); +} + +/* + * yaffs_clear_oldest_dirty_seq() + * Called when a block is erased or marked bad. (ie. when its seq_number + * becomes invalid). 
If the value matches the oldest then we clear + * dev->oldest_dirty_seq to force its recomputation. + */ +void yaffs2_clear_oldest_dirty_seq(struct yaffs_dev *dev, + struct yaffs_block_info *bi) +{ + + if (!dev->param.is_yaffs2) + return; + + if (!bi || bi->seq_number == dev->oldest_dirty_seq) { + dev->oldest_dirty_seq = 0; + dev->oldest_dirty_block = 0; + } +} + +/* + * yaffs2_update_oldest_dirty_seq() + * Update the oldest dirty sequence number whenever we dirty a block. + * Only do this if the oldest_dirty_seq is actually being tracked. + */ +void yaffs2_update_oldest_dirty_seq(struct yaffs_dev *dev, unsigned block_no, + struct yaffs_block_info *bi) +{ + if (!dev->param.is_yaffs2) + return; + + if (dev->oldest_dirty_seq) { + if (dev->oldest_dirty_seq > bi->seq_number) { + dev->oldest_dirty_seq = bi->seq_number; + dev->oldest_dirty_block = block_no; + } + } +} + +int yaffs_block_ok_for_gc(struct yaffs_dev *dev, struct yaffs_block_info *bi) +{ + + if (!dev->param.is_yaffs2) + return 1; /* disqualification only applies to yaffs2. */ + + if (!bi->has_shrink_hdr) + return 1; /* can gc */ + + yaffs2_find_oldest_dirty_seq(dev); + + /* Can't do gc of this block if there are any blocks older than this one that have + * discarded pages. + */ + return (bi->seq_number <= dev->oldest_dirty_seq); +} + +/* + * yaffs2_find_refresh_block() + * periodically finds the oldest full block by sequence number for refreshing. + * Only for yaffs2. + */ +u32 yaffs2_find_refresh_block(struct yaffs_dev * dev) +{ + u32 b; + + u32 oldest = 0; + u32 oldest_seq = 0; + + struct yaffs_block_info *bi; + + if (!dev->param.is_yaffs2) + return oldest; + + /* + * If refresh period < 10 then refreshing is disabled. + */ + if (dev->param.refresh_period < 10) + return oldest; + + /* + * Fix broken values. + */ + if (dev->refresh_skip > dev->param.refresh_period) + dev->refresh_skip = dev->param.refresh_period; + + if (dev->refresh_skip > 0) + return oldest; + + /* + * Refresh skip is now zero. + * We'll do a refresh this time around.... + * Update the refresh skip and find the oldest block. 
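+ * (Concretely: each time the refresh_skip countdown reaches zero it is
+ * reloaded from param.refresh_period and the FULL block with the lowest
+ * seq_number is picked for refreshing; a refresh_period below 10 disables
+ * refreshing altogether.)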
+ */ + dev->refresh_skip = dev->param.refresh_period; + dev->refresh_count++; + bi = dev->block_info; + for (b = dev->internal_start_block; b <= dev->internal_end_block; b++) { + + if (bi->block_state == YAFFS_BLOCK_STATE_FULL) { + + if (oldest < 1 || bi->seq_number < oldest_seq) { + oldest = b; + oldest_seq = bi->seq_number; + } + } + bi++; + } + + if (oldest > 0) { + yaffs_trace(YAFFS_TRACE_GC, + "GC refresh count %d selected block %d with seq_number %d", + dev->refresh_count, oldest, oldest_seq); + } + + return oldest; +} + +int yaffs2_checkpt_required(struct yaffs_dev *dev) +{ + int nblocks; + + if (!dev->param.is_yaffs2) + return 0; + + nblocks = dev->internal_end_block - dev->internal_start_block + 1; + + return !dev->param.skip_checkpt_wr && + !dev->read_only && (nblocks >= YAFFS_CHECKPOINT_MIN_BLOCKS); +} + +int yaffs_calc_checkpt_blocks_required(struct yaffs_dev *dev) +{ + int retval; + + if (!dev->param.is_yaffs2) + return 0; + + if (!dev->checkpoint_blocks_required && yaffs2_checkpt_required(dev)) { + /* Not a valid value so recalculate */ + int n_bytes = 0; + int n_blocks; + int dev_blocks = + (dev->param.end_block - dev->param.start_block + 1); + + n_bytes += sizeof(struct yaffs_checkpt_validity); + n_bytes += sizeof(struct yaffs_checkpt_dev); + n_bytes += dev_blocks * sizeof(struct yaffs_block_info); + n_bytes += dev_blocks * dev->chunk_bit_stride; + n_bytes += + (sizeof(struct yaffs_checkpt_obj) + + sizeof(u32)) * (dev->n_obj); + n_bytes += (dev->tnode_size + sizeof(u32)) * (dev->n_tnodes); + n_bytes += sizeof(struct yaffs_checkpt_validity); + n_bytes += sizeof(u32); /* checksum */ + + /* Round up and add 2 blocks to allow for some bad blocks, so add 3 */ + + n_blocks = + (n_bytes / + (dev->data_bytes_per_chunk * + dev->param.chunks_per_block)) + 3; + + dev->checkpoint_blocks_required = n_blocks; + } + + retval = dev->checkpoint_blocks_required - dev->blocks_in_checkpt; + if (retval < 0) + retval = 0; + return retval; +} + +/*--------------------- Checkpointing --------------------*/ + +static int yaffs2_wr_checkpt_validity_marker(struct yaffs_dev *dev, int head) +{ + struct yaffs_checkpt_validity cp; + + memset(&cp, 0, sizeof(cp)); + + cp.struct_type = sizeof(cp); + cp.magic = YAFFS_MAGIC; + cp.version = YAFFS_CHECKPOINT_VERSION; + cp.head = (head) ? 1 : 0; + + return (yaffs2_checkpt_wr(dev, &cp, sizeof(cp)) == sizeof(cp)) ? 1 : 0; +} + +static int yaffs2_rd_checkpt_validity_marker(struct yaffs_dev *dev, int head) +{ + struct yaffs_checkpt_validity cp; + int ok; + + ok = (yaffs2_checkpt_rd(dev, &cp, sizeof(cp)) == sizeof(cp)); + + if (ok) + ok = (cp.struct_type == sizeof(cp)) && + (cp.magic == YAFFS_MAGIC) && + (cp.version == YAFFS_CHECKPOINT_VERSION) && + (cp.head == ((head) ? 1 : 0)); + return ok ? 
1 : 0; +} + +static void yaffs2_dev_to_checkpt_dev(struct yaffs_checkpt_dev *cp, + struct yaffs_dev *dev) +{ + cp->n_erased_blocks = dev->n_erased_blocks; + cp->alloc_block = dev->alloc_block; + cp->alloc_page = dev->alloc_page; + cp->n_free_chunks = dev->n_free_chunks; + + cp->n_deleted_files = dev->n_deleted_files; + cp->n_unlinked_files = dev->n_unlinked_files; + cp->n_bg_deletions = dev->n_bg_deletions; + cp->seq_number = dev->seq_number; + +} + +static void yaffs_checkpt_dev_to_dev(struct yaffs_dev *dev, + struct yaffs_checkpt_dev *cp) +{ + dev->n_erased_blocks = cp->n_erased_blocks; + dev->alloc_block = cp->alloc_block; + dev->alloc_page = cp->alloc_page; + dev->n_free_chunks = cp->n_free_chunks; + + dev->n_deleted_files = cp->n_deleted_files; + dev->n_unlinked_files = cp->n_unlinked_files; + dev->n_bg_deletions = cp->n_bg_deletions; + dev->seq_number = cp->seq_number; +} + +static int yaffs2_wr_checkpt_dev(struct yaffs_dev *dev) +{ + struct yaffs_checkpt_dev cp; + u32 n_bytes; + u32 n_blocks = + (dev->internal_end_block - dev->internal_start_block + 1); + + int ok; + + /* Write device runtime values */ + yaffs2_dev_to_checkpt_dev(&cp, dev); + cp.struct_type = sizeof(cp); + + ok = (yaffs2_checkpt_wr(dev, &cp, sizeof(cp)) == sizeof(cp)); + + /* Write block info */ + if (ok) { + n_bytes = n_blocks * sizeof(struct yaffs_block_info); + ok = (yaffs2_checkpt_wr(dev, dev->block_info, n_bytes) == + n_bytes); + } + + /* Write chunk bits */ + if (ok) { + n_bytes = n_blocks * dev->chunk_bit_stride; + ok = (yaffs2_checkpt_wr(dev, dev->chunk_bits, n_bytes) == + n_bytes); + } + return ok ? 1 : 0; + +} + +static int yaffs2_rd_checkpt_dev(struct yaffs_dev *dev) +{ + struct yaffs_checkpt_dev cp; + u32 n_bytes; + u32 n_blocks = + (dev->internal_end_block - dev->internal_start_block + 1); + + int ok; + + ok = (yaffs2_checkpt_rd(dev, &cp, sizeof(cp)) == sizeof(cp)); + if (!ok) + return 0; + + if (cp.struct_type != sizeof(cp)) + return 0; + + yaffs_checkpt_dev_to_dev(dev, &cp); + + n_bytes = n_blocks * sizeof(struct yaffs_block_info); + + ok = (yaffs2_checkpt_rd(dev, dev->block_info, n_bytes) == n_bytes); + + if (!ok) + return 0; + n_bytes = n_blocks * dev->chunk_bit_stride; + + ok = (yaffs2_checkpt_rd(dev, dev->chunk_bits, n_bytes) == n_bytes); + + return ok ? 1 : 0; +} + +static void yaffs2_obj_checkpt_obj(struct yaffs_checkpt_obj *cp, + struct yaffs_obj *obj) +{ + + cp->obj_id = obj->obj_id; + cp->parent_id = (obj->parent) ? 
obj->parent->obj_id : 0; + cp->hdr_chunk = obj->hdr_chunk; + cp->variant_type = obj->variant_type; + cp->deleted = obj->deleted; + cp->soft_del = obj->soft_del; + cp->unlinked = obj->unlinked; + cp->fake = obj->fake; + cp->rename_allowed = obj->rename_allowed; + cp->unlink_allowed = obj->unlink_allowed; + cp->serial = obj->serial; + cp->n_data_chunks = obj->n_data_chunks; + + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) + cp->size_or_equiv_obj = obj->variant.file_variant.file_size; + else if (obj->variant_type == YAFFS_OBJECT_TYPE_HARDLINK) + cp->size_or_equiv_obj = obj->variant.hardlink_variant.equiv_id; +} + +static int taffs2_checkpt_obj_to_obj(struct yaffs_obj *obj, + struct yaffs_checkpt_obj *cp) +{ + + struct yaffs_obj *parent; + + if (obj->variant_type != cp->variant_type) { + yaffs_trace(YAFFS_TRACE_ERROR, + "Checkpoint read object %d type %d chunk %d does not match existing object type %d", + cp->obj_id, cp->variant_type, cp->hdr_chunk, + obj->variant_type); + return 0; + } + + obj->obj_id = cp->obj_id; + + if (cp->parent_id) + parent = yaffs_find_or_create_by_number(obj->my_dev, + cp->parent_id, + YAFFS_OBJECT_TYPE_DIRECTORY); + else + parent = NULL; + + if (parent) { + if (parent->variant_type != YAFFS_OBJECT_TYPE_DIRECTORY) { + yaffs_trace(YAFFS_TRACE_ALWAYS, + "Checkpoint read object %d parent %d type %d chunk %d Parent type, %d, not directory", + cp->obj_id, cp->parent_id, + cp->variant_type, cp->hdr_chunk, + parent->variant_type); + return 0; + } + yaffs_add_obj_to_dir(parent, obj); + } + + obj->hdr_chunk = cp->hdr_chunk; + obj->variant_type = cp->variant_type; + obj->deleted = cp->deleted; + obj->soft_del = cp->soft_del; + obj->unlinked = cp->unlinked; + obj->fake = cp->fake; + obj->rename_allowed = cp->rename_allowed; + obj->unlink_allowed = cp->unlink_allowed; + obj->serial = cp->serial; + obj->n_data_chunks = cp->n_data_chunks; + + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) + obj->variant.file_variant.file_size = cp->size_or_equiv_obj; + else if (obj->variant_type == YAFFS_OBJECT_TYPE_HARDLINK) + obj->variant.hardlink_variant.equiv_id = cp->size_or_equiv_obj; + + if (obj->hdr_chunk > 0) + obj->lazy_loaded = 1; + return 1; +} + +static int yaffs2_checkpt_tnode_worker(struct yaffs_obj *in, + struct yaffs_tnode *tn, u32 level, + int chunk_offset) +{ + int i; + struct yaffs_dev *dev = in->my_dev; + int ok = 1; + + if (tn) { + if (level > 0) { + + for (i = 0; i < YAFFS_NTNODES_INTERNAL && ok; i++) { + if (tn->internal[i]) { + ok = yaffs2_checkpt_tnode_worker(in, + tn-> + internal + [i], + level - + 1, + (chunk_offset + << + YAFFS_TNODES_INTERNAL_BITS) + + i); + } + } + } else if (level == 0) { + u32 base_offset = + chunk_offset << YAFFS_TNODES_LEVEL0_BITS; + ok = (yaffs2_checkpt_wr + (dev, &base_offset, + sizeof(base_offset)) == sizeof(base_offset)); + if (ok) + ok = (yaffs2_checkpt_wr + (dev, tn, + dev->tnode_size) == dev->tnode_size); + } + } + + return ok; + +} + +static int yaffs2_wr_checkpt_tnodes(struct yaffs_obj *obj) +{ + u32 end_marker = ~0; + int ok = 1; + + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) { + ok = yaffs2_checkpt_tnode_worker(obj, + obj->variant.file_variant.top, + obj->variant.file_variant. + top_level, 0); + if (ok) + ok = (yaffs2_checkpt_wr + (obj->my_dev, &end_marker, + sizeof(end_marker)) == sizeof(end_marker)); + } + + return ok ? 
1 : 0; +} + +static int yaffs2_rd_checkpt_tnodes(struct yaffs_obj *obj) +{ + u32 base_chunk; + int ok = 1; + struct yaffs_dev *dev = obj->my_dev; + struct yaffs_file_var *file_stuct_ptr = &obj->variant.file_variant; + struct yaffs_tnode *tn; + int nread = 0; + + ok = (yaffs2_checkpt_rd(dev, &base_chunk, sizeof(base_chunk)) == + sizeof(base_chunk)); + + while (ok && (~base_chunk)) { + nread++; + /* Read level 0 tnode */ + + tn = yaffs_get_tnode(dev); + if (tn) { + ok = (yaffs2_checkpt_rd(dev, tn, dev->tnode_size) == + dev->tnode_size); + } else { + ok = 0; + } + + if (tn && ok) + ok = yaffs_add_find_tnode_0(dev, + file_stuct_ptr, + base_chunk, tn) ? 1 : 0; + + if (ok) + ok = (yaffs2_checkpt_rd + (dev, &base_chunk, + sizeof(base_chunk)) == sizeof(base_chunk)); + + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "Checkpoint read tnodes %d records, last %d. ok %d", + nread, base_chunk, ok); + + return ok ? 1 : 0; +} + +static int yaffs2_wr_checkpt_objs(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + struct yaffs_checkpt_obj cp; + int i; + int ok = 1; + struct list_head *lh; + + /* Iterate through the objects in each hash entry, + * dumping them to the checkpointing stream. + */ + + for (i = 0; ok && i < YAFFS_NOBJECT_BUCKETS; i++) { + list_for_each(lh, &dev->obj_bucket[i].list) { + if (lh) { + obj = + list_entry(lh, struct yaffs_obj, hash_link); + if (!obj->defered_free) { + yaffs2_obj_checkpt_obj(&cp, obj); + cp.struct_type = sizeof(cp); + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "Checkpoint write object %d parent %d type %d chunk %d obj addr %p", + cp.obj_id, cp.parent_id, + cp.variant_type, cp.hdr_chunk, obj); + + ok = (yaffs2_checkpt_wr + (dev, &cp, + sizeof(cp)) == sizeof(cp)); + + if (ok + && obj->variant_type == + YAFFS_OBJECT_TYPE_FILE) + ok = yaffs2_wr_checkpt_tnodes + (obj); + } + } + } + } + + /* Dump end of list */ + memset(&cp, 0xFF, sizeof(struct yaffs_checkpt_obj)); + cp.struct_type = sizeof(cp); + + if (ok) + ok = (yaffs2_checkpt_wr(dev, &cp, sizeof(cp)) == sizeof(cp)); + + return ok ? 1 : 0; +} + +static int yaffs2_rd_checkpt_objs(struct yaffs_dev *dev) +{ + struct yaffs_obj *obj; + struct yaffs_checkpt_obj cp; + int ok = 1; + int done = 0; + struct yaffs_obj *hard_list = NULL; + + while (ok && !done) { + ok = (yaffs2_checkpt_rd(dev, &cp, sizeof(cp)) == sizeof(cp)); + if (cp.struct_type != sizeof(cp)) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "struct size %d instead of %d ok %d", + cp.struct_type, (int)sizeof(cp), ok); + ok = 0; + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "Checkpoint read object %d parent %d type %d chunk %d ", + cp.obj_id, cp.parent_id, cp.variant_type, + cp.hdr_chunk); + + if (ok && cp.obj_id == ~0) { + done = 1; + } else if (ok) { + obj = + yaffs_find_or_create_by_number(dev, cp.obj_id, + cp.variant_type); + if (obj) { + ok = taffs2_checkpt_obj_to_obj(obj, &cp); + if (!ok) + break; + if (obj->variant_type == YAFFS_OBJECT_TYPE_FILE) { + ok = yaffs2_rd_checkpt_tnodes(obj); + } else if (obj->variant_type == + YAFFS_OBJECT_TYPE_HARDLINK) { + obj->hard_links.next = + (struct list_head *)hard_list; + hard_list = obj; + } + } else { + ok = 0; + } + } + } + + if (ok) + yaffs_link_fixup(dev, hard_list); + + return ok ? 
1 : 0; +} + +static int yaffs2_wr_checkpt_sum(struct yaffs_dev *dev) +{ + u32 checkpt_sum; + int ok; + + yaffs2_get_checkpt_sum(dev, &checkpt_sum); + + ok = (yaffs2_checkpt_wr(dev, &checkpt_sum, sizeof(checkpt_sum)) == + sizeof(checkpt_sum)); + + if (!ok) + return 0; + + return 1; +} + +static int yaffs2_rd_checkpt_sum(struct yaffs_dev *dev) +{ + u32 checkpt_sum0; + u32 checkpt_sum1; + int ok; + + yaffs2_get_checkpt_sum(dev, &checkpt_sum0); + + ok = (yaffs2_checkpt_rd(dev, &checkpt_sum1, sizeof(checkpt_sum1)) == + sizeof(checkpt_sum1)); + + if (!ok) + return 0; + + if (checkpt_sum0 != checkpt_sum1) + return 0; + + return 1; +} + +static int yaffs2_wr_checkpt_data(struct yaffs_dev *dev) +{ + int ok = 1; + + if (!yaffs2_checkpt_required(dev)) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "skipping checkpoint write"); + ok = 0; + } + + if (ok) + ok = yaffs2_checkpt_open(dev, 1); + + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "write checkpoint validity"); + ok = yaffs2_wr_checkpt_validity_marker(dev, 1); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "write checkpoint device"); + ok = yaffs2_wr_checkpt_dev(dev); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "write checkpoint objects"); + ok = yaffs2_wr_checkpt_objs(dev); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "write checkpoint validity"); + ok = yaffs2_wr_checkpt_validity_marker(dev, 0); + } + + if (ok) + ok = yaffs2_wr_checkpt_sum(dev); + + if (!yaffs_checkpt_close(dev)) + ok = 0; + + if (ok) + dev->is_checkpointed = 1; + else + dev->is_checkpointed = 0; + + return dev->is_checkpointed; +} + +static int yaffs2_rd_checkpt_data(struct yaffs_dev *dev) +{ + int ok = 1; + + if (!dev->param.is_yaffs2) + ok = 0; + + if (ok && dev->param.skip_checkpt_rd) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "skipping checkpoint read"); + ok = 0; + } + + if (ok) + ok = yaffs2_checkpt_open(dev, 0); /* open for read */ + + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint validity"); + ok = yaffs2_rd_checkpt_validity_marker(dev, 1); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint device"); + ok = yaffs2_rd_checkpt_dev(dev); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint objects"); + ok = yaffs2_rd_checkpt_objs(dev); + } + if (ok) { + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint validity"); + ok = yaffs2_rd_checkpt_validity_marker(dev, 0); + } + + if (ok) { + ok = yaffs2_rd_checkpt_sum(dev); + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "read checkpoint checksum %d", ok); + } + + if (!yaffs_checkpt_close(dev)) + ok = 0; + + if (ok) + dev->is_checkpointed = 1; + else + dev->is_checkpointed = 0; + + return ok ? 
1 : 0; + +} + +void yaffs2_checkpt_invalidate(struct yaffs_dev *dev) +{ + if (dev->is_checkpointed || dev->blocks_in_checkpt > 0) { + dev->is_checkpointed = 0; + yaffs2_checkpt_invalidate_stream(dev); + } + if (dev->param.sb_dirty_fn) + dev->param.sb_dirty_fn(dev); +} + +int yaffs_checkpoint_save(struct yaffs_dev *dev) +{ + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "save entry: is_checkpointed %d", + dev->is_checkpointed); + + yaffs_verify_objects(dev); + yaffs_verify_blocks(dev); + yaffs_verify_free_chunks(dev); + + if (!dev->is_checkpointed) { + yaffs2_checkpt_invalidate(dev); + yaffs2_wr_checkpt_data(dev); + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT | YAFFS_TRACE_MOUNT, + "save exit: is_checkpointed %d", + dev->is_checkpointed); + + return dev->is_checkpointed; +} + +int yaffs2_checkpt_restore(struct yaffs_dev *dev) +{ + int retval; + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "restore entry: is_checkpointed %d", + dev->is_checkpointed); + + retval = yaffs2_rd_checkpt_data(dev); + + if (dev->is_checkpointed) { + yaffs_verify_objects(dev); + yaffs_verify_blocks(dev); + yaffs_verify_free_chunks(dev); + } + + yaffs_trace(YAFFS_TRACE_CHECKPOINT, + "restore exit: is_checkpointed %d", + dev->is_checkpointed); + + return retval; +} + +int yaffs2_handle_hole(struct yaffs_obj *obj, loff_t new_size) +{ + /* if new_size > old_file_size. + * We're going to be writing a hole. + * If the hole is small then write zeros otherwise write a start of hole marker. + */ + + loff_t old_file_size; + int increase; + int small_hole; + int result = YAFFS_OK; + struct yaffs_dev *dev = NULL; + + u8 *local_buffer = NULL; + + int small_increase_ok = 0; + + if (!obj) + return YAFFS_FAIL; + + if (obj->variant_type != YAFFS_OBJECT_TYPE_FILE) + return YAFFS_FAIL; + + dev = obj->my_dev; + + /* Bail out if not yaffs2 mode */ + if (!dev->param.is_yaffs2) + return YAFFS_OK; + + old_file_size = obj->variant.file_variant.file_size; + + if (new_size <= old_file_size) + return YAFFS_OK; + + increase = new_size - old_file_size; + + if (increase < YAFFS_SMALL_HOLE_THRESHOLD * dev->data_bytes_per_chunk && + yaffs_check_alloc_available(dev, YAFFS_SMALL_HOLE_THRESHOLD + 1)) + small_hole = 1; + else + small_hole = 0; + + if (small_hole) + local_buffer = yaffs_get_temp_buffer(dev, __LINE__); + + if (local_buffer) { + /* fill hole with zero bytes */ + int pos = old_file_size; + int this_write; + int written; + memset(local_buffer, 0, dev->data_bytes_per_chunk); + small_increase_ok = 1; + + while (increase > 0 && small_increase_ok) { + this_write = increase; + if (this_write > dev->data_bytes_per_chunk) + this_write = dev->data_bytes_per_chunk; + written = + yaffs_do_file_wr(obj, local_buffer, pos, this_write, + 0); + if (written == this_write) { + pos += this_write; + increase -= this_write; + } else { + small_increase_ok = 0; + } + } + + yaffs_release_temp_buffer(dev, local_buffer, __LINE__); + + /* If we were out of space then reverse any chunks we've added */ + if (!small_increase_ok) + yaffs_resize_file_down(obj, old_file_size); + } + + if (!small_increase_ok && + obj->parent && + obj->parent->obj_id != YAFFS_OBJECTID_UNLINKED && + obj->parent->obj_id != YAFFS_OBJECTID_DELETED) { + /* Write a hole start header with the old file size */ + yaffs_update_oh(obj, NULL, 0, 1, 0, NULL); + } + + return result; + +} + +struct yaffs_block_index { + int seq; + int block; +}; + +static int yaffs2_ybicmp(const void *a, const void *b) +{ + int aseq = ((struct yaffs_block_index *)a)->seq; + int bseq = ((struct yaffs_block_index *)b)->seq; + int ablock = 
((struct yaffs_block_index *)a)->block; + int bblock = ((struct yaffs_block_index *)b)->block; + if (aseq == bseq) + return ablock - bblock; + else + return aseq - bseq; +} + +int yaffs2_scan_backwards(struct yaffs_dev *dev) +{ + struct yaffs_ext_tags tags; + int blk; + int block_iter; + int start_iter; + int end_iter; + int n_to_scan = 0; + + int chunk; + int result; + int c; + int deleted; + enum yaffs_block_state state; + struct yaffs_obj *hard_list = NULL; + struct yaffs_block_info *bi; + u32 seq_number; + struct yaffs_obj_hdr *oh; + struct yaffs_obj *in; + struct yaffs_obj *parent; + int n_blocks = dev->internal_end_block - dev->internal_start_block + 1; + int is_unlinked; + u8 *chunk_data; + + int file_size; + int is_shrink; + int found_chunks; + int equiv_id; + int alloc_failed = 0; + + struct yaffs_block_index *block_index = NULL; + int alt_block_index = 0; + + yaffs_trace(YAFFS_TRACE_SCAN, + "yaffs2_scan_backwards starts intstartblk %d intendblk %d...", + dev->internal_start_block, dev->internal_end_block); + + dev->seq_number = YAFFS_LOWEST_SEQUENCE_NUMBER; + + block_index = kmalloc(n_blocks * sizeof(struct yaffs_block_index), + GFP_NOFS); + + if (!block_index) { + block_index = + vmalloc(n_blocks * sizeof(struct yaffs_block_index)); + alt_block_index = 1; + } + + if (!block_index) { + yaffs_trace(YAFFS_TRACE_SCAN, + "yaffs2_scan_backwards() could not allocate block index!" + ); + return YAFFS_FAIL; + } + + dev->blocks_in_checkpt = 0; + + chunk_data = yaffs_get_temp_buffer(dev, __LINE__); + + /* Scan all the blocks to determine their state */ + bi = dev->block_info; + for (blk = dev->internal_start_block; blk <= dev->internal_end_block; + blk++) { + yaffs_clear_chunk_bits(dev, blk); + bi->pages_in_use = 0; + bi->soft_del_pages = 0; + + yaffs_query_init_block_state(dev, blk, &state, &seq_number); + + bi->block_state = state; + bi->seq_number = seq_number; + + if (bi->seq_number == YAFFS_SEQUENCE_CHECKPOINT_DATA) + bi->block_state = state = YAFFS_BLOCK_STATE_CHECKPOINT; + if (bi->seq_number == YAFFS_SEQUENCE_BAD_BLOCK) + bi->block_state = state = YAFFS_BLOCK_STATE_DEAD; + + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, + "Block scanning block %d state %d seq %d", + blk, state, seq_number); + + if (state == YAFFS_BLOCK_STATE_CHECKPOINT) { + dev->blocks_in_checkpt++; + + } else if (state == YAFFS_BLOCK_STATE_DEAD) { + yaffs_trace(YAFFS_TRACE_BAD_BLOCKS, + "block %d is bad", blk); + } else if (state == YAFFS_BLOCK_STATE_EMPTY) { + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, "Block empty "); + dev->n_erased_blocks++; + dev->n_free_chunks += dev->param.chunks_per_block; + } else if (state == YAFFS_BLOCK_STATE_NEEDS_SCANNING) { + + /* Determine the highest sequence number */ + if (seq_number >= YAFFS_LOWEST_SEQUENCE_NUMBER && + seq_number < YAFFS_HIGHEST_SEQUENCE_NUMBER) { + + block_index[n_to_scan].seq = seq_number; + block_index[n_to_scan].block = blk; + + n_to_scan++; + + if (seq_number >= dev->seq_number) + dev->seq_number = seq_number; + } else { + /* TODO: Nasty sequence number! */ + yaffs_trace(YAFFS_TRACE_SCAN, + "Block scanning block %d has bad sequence number %d", + blk, seq_number); + + } + } + bi++; + } + + yaffs_trace(YAFFS_TRACE_SCAN, "%d blocks to be sorted...", n_to_scan); + + cond_resched(); + + /* Sort the blocks by sequence number */ + sort(block_index, n_to_scan, sizeof(struct yaffs_block_index), + yaffs2_ybicmp, NULL); + + cond_resched(); + + yaffs_trace(YAFFS_TRACE_SCAN, "...done"); + + /* Now scan the blocks looking at the data. 
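+	 * (The sorted index is walked from the highest sequence number down, and
+	 * the chunks inside each block are read from the last chunk backwards, so
+	 * the newest copy of an object header or data chunk is always met first
+	 * and any stale copy met later can simply be discarded.)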
*/ + start_iter = 0; + end_iter = n_to_scan - 1; + yaffs_trace(YAFFS_TRACE_SCAN_DEBUG, "%d blocks to scan", n_to_scan); + + /* For each block.... backwards */ + for (block_iter = end_iter; !alloc_failed && block_iter >= start_iter; + block_iter--) { + /* Cooperative multitasking! This loop can run for so + long that watchdog timers expire. */ + cond_resched(); + + /* get the block to scan in the correct order */ + blk = block_index[block_iter].block; + + bi = yaffs_get_block_info(dev, blk); + + state = bi->block_state; + + deleted = 0; + + /* For each chunk in each block that needs scanning.... */ + found_chunks = 0; + for (c = dev->param.chunks_per_block - 1; + !alloc_failed && c >= 0 && + (state == YAFFS_BLOCK_STATE_NEEDS_SCANNING || + state == YAFFS_BLOCK_STATE_ALLOCATING); c--) { + /* Scan backwards... + * Read the tags and decide what to do + */ + + chunk = blk * dev->param.chunks_per_block + c; + + result = yaffs_rd_chunk_tags_nand(dev, chunk, NULL, + &tags); + + /* Let's have a good look at this chunk... */ + + if (!tags.chunk_used) { + /* An unassigned chunk in the block. + * If there are used chunks after this one, then + * it is a chunk that was skipped due to failing the erased + * check. Just skip it so that it can be deleted. + * But, more typically, We get here when this is an unallocated + * chunk and his means that either the block is empty or + * this is the one being allocated from + */ + + if (found_chunks) { + /* This is a chunk that was skipped due to failing the erased check */ + } else if (c == 0) { + /* We're looking at the first chunk in the block so the block is unused */ + state = YAFFS_BLOCK_STATE_EMPTY; + dev->n_erased_blocks++; + } else { + if (state == + YAFFS_BLOCK_STATE_NEEDS_SCANNING + || state == + YAFFS_BLOCK_STATE_ALLOCATING) { + if (dev->seq_number == + bi->seq_number) { + /* this is the block being allocated from */ + + yaffs_trace(YAFFS_TRACE_SCAN, + " Allocating from %d %d", + blk, c); + + state = + YAFFS_BLOCK_STATE_ALLOCATING; + dev->alloc_block = blk; + dev->alloc_page = c; + dev-> + alloc_block_finder = + blk; + } else { + /* This is a partially written block that is not + * the current allocation block. + */ + + yaffs_trace(YAFFS_TRACE_SCAN, + "Partially written block %d detected", + blk); + } + } + } + + dev->n_free_chunks++; + + } else if (tags.ecc_result == YAFFS_ECC_RESULT_UNFIXED) { + yaffs_trace(YAFFS_TRACE_SCAN, + " Unfixed ECC in chunk(%d:%d), chunk ignored", + blk, c); + + dev->n_free_chunks++; + + } else if (tags.obj_id > YAFFS_MAX_OBJECT_ID || + tags.chunk_id > YAFFS_MAX_CHUNK_ID || + (tags.chunk_id > 0 + && tags.n_bytes > dev->data_bytes_per_chunk) + || tags.seq_number != bi->seq_number) { + yaffs_trace(YAFFS_TRACE_SCAN, + "Chunk (%d:%d) with bad tags:obj = %d, chunk_id = %d, n_bytes = %d, ignored", + blk, c, tags.obj_id, + tags.chunk_id, tags.n_bytes); + + dev->n_free_chunks++; + + } else if (tags.chunk_id > 0) { + /* chunk_id > 0 so it is a data chunk... 
*/ + unsigned int endpos; + u32 chunk_base = + (tags.chunk_id - + 1) * dev->data_bytes_per_chunk; + + found_chunks = 1; + + yaffs_set_chunk_bit(dev, blk, c); + bi->pages_in_use++; + + in = yaffs_find_or_create_by_number(dev, + tags.obj_id, + YAFFS_OBJECT_TYPE_FILE); + if (!in) { + /* Out of memory */ + alloc_failed = 1; + } + + if (in && + in->variant_type == YAFFS_OBJECT_TYPE_FILE + && chunk_base < + in->variant.file_variant.shrink_size) { + /* This has not been invalidated by a resize */ + if (!yaffs_put_chunk_in_file + (in, tags.chunk_id, chunk, -1)) { + alloc_failed = 1; + } + + /* File size is calculated by looking at the data chunks if we have not + * seen an object header yet. Stop this practice once we find an object header. + */ + endpos = chunk_base + tags.n_bytes; + + if (!in->valid && /* have not got an object header yet */ + in->variant.file_variant. + scanned_size < endpos) { + in->variant.file_variant. + scanned_size = endpos; + in->variant.file_variant. + file_size = endpos; + } + + } else if (in) { + /* This chunk has been invalidated by a resize, or a past file deletion + * so delete the chunk*/ + yaffs_chunk_del(dev, chunk, 1, + __LINE__); + + } + } else { + /* chunk_id == 0, so it is an ObjectHeader. + * Thus, we read in the object header and make the object + */ + found_chunks = 1; + + yaffs_set_chunk_bit(dev, blk, c); + bi->pages_in_use++; + + oh = NULL; + in = NULL; + + if (tags.extra_available) { + in = yaffs_find_or_create_by_number(dev, + tags. + obj_id, + tags. + extra_obj_type); + if (!in) + alloc_failed = 1; + } + + if (!in || + (!in->valid && dev->param.disable_lazy_load) + || tags.extra_shadows || (!in->valid + && (tags.obj_id == + YAFFS_OBJECTID_ROOT + || tags. + obj_id == + YAFFS_OBJECTID_LOSTNFOUND))) + { + + /* If we don't have valid info then we need to read the chunk + * TODO In future we can probably defer reading the chunk and + * living with invalid data until needed. + */ + + result = yaffs_rd_chunk_tags_nand(dev, + chunk, + chunk_data, + NULL); + + oh = (struct yaffs_obj_hdr *)chunk_data; + + if (dev->param.inband_tags) { + /* Fix up the header if they got corrupted by inband tags */ + oh->shadows_obj = + oh->inband_shadowed_obj_id; + oh->is_shrink = + oh->inband_is_shrink; + } + + if (!in) { + in = yaffs_find_or_create_by_number(dev, tags.obj_id, oh->type); + if (!in) + alloc_failed = 1; + } + + } + + if (!in) { + /* TODO Hoosterman we have a problem! */ + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: Could not make object for object %d at chunk %d during scan", + tags.obj_id, chunk); + continue; + } + + if (in->valid) { + /* We have already filled this one. + * We have a duplicate that will be discarded, but + * we first have to suck out resize info if it is a file. + */ + + if ((in->variant_type == + YAFFS_OBJECT_TYPE_FILE) && ((oh + && + oh-> + type + == + YAFFS_OBJECT_TYPE_FILE) + || + (tags. + extra_available + && + tags. + extra_obj_type + == + YAFFS_OBJECT_TYPE_FILE))) + { + u32 this_size = + (oh) ? oh-> + file_size : + tags.extra_length; + u32 parent_obj_id = + (oh) ? oh->parent_obj_id : + tags.extra_parent_id; + + is_shrink = + (oh) ? oh-> + is_shrink : + tags.extra_is_shrink; + + /* If it is deleted (unlinked at start also means deleted) + * we treat the file size as being zeroed at this point. + */ + if (parent_obj_id == + YAFFS_OBJECTID_DELETED + || parent_obj_id == + YAFFS_OBJECTID_UNLINKED) { + this_size = 0; + is_shrink = 1; + } + + if (is_shrink + && in->variant.file_variant. + shrink_size > this_size) + in->variant. 
+ file_variant. + shrink_size = + this_size; + + if (is_shrink) + bi->has_shrink_hdr = 1; + + } + /* Use existing - destroy this one. */ + yaffs_chunk_del(dev, chunk, 1, + __LINE__); + + } + + if (!in->valid && in->variant_type != + (oh ? oh->type : tags.extra_obj_type)) + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: Bad object type, %d != %d, for object %d at chunk %d during scan", + oh ? + oh->type : tags.extra_obj_type, + in->variant_type, tags.obj_id, + chunk); + + if (!in->valid && + (tags.obj_id == YAFFS_OBJECTID_ROOT || + tags.obj_id == + YAFFS_OBJECTID_LOSTNFOUND)) { + /* We only load some info, don't fiddle with directory structure */ + in->valid = 1; + + if (oh) { + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + in->lazy_loaded = 0; + } else { + in->lazy_loaded = 1; + } + in->hdr_chunk = chunk; + + } else if (!in->valid) { + /* we need to load this info */ + + in->valid = 1; + in->hdr_chunk = chunk; + + if (oh) { + in->variant_type = oh->type; + + in->yst_mode = oh->yst_mode; + yaffs_load_attribs(in, oh); + + if (oh->shadows_obj > 0) + yaffs_handle_shadowed_obj + (dev, + oh->shadows_obj, + 1); + + yaffs_set_obj_name_from_oh(in, + oh); + parent = + yaffs_find_or_create_by_number + (dev, oh->parent_obj_id, + YAFFS_OBJECT_TYPE_DIRECTORY); + + file_size = oh->file_size; + is_shrink = oh->is_shrink; + equiv_id = oh->equiv_id; + + } else { + in->variant_type = + tags.extra_obj_type; + parent = + yaffs_find_or_create_by_number + (dev, tags.extra_parent_id, + YAFFS_OBJECT_TYPE_DIRECTORY); + file_size = tags.extra_length; + is_shrink = + tags.extra_is_shrink; + equiv_id = tags.extra_equiv_id; + in->lazy_loaded = 1; + + } + in->dirty = 0; + + if (!parent) + alloc_failed = 1; + + /* directory stuff... + * hook up to parent + */ + + if (parent && parent->variant_type == + YAFFS_OBJECT_TYPE_UNKNOWN) { + /* Set up as a directory */ + parent->variant_type = + YAFFS_OBJECT_TYPE_DIRECTORY; + INIT_LIST_HEAD(&parent-> + variant.dir_variant.children); + } else if (!parent + || parent->variant_type != + YAFFS_OBJECT_TYPE_DIRECTORY) { + /* Hoosterman, another problem.... + * We're trying to use a non-directory as a directory + */ + + yaffs_trace(YAFFS_TRACE_ERROR, + "yaffs tragedy: attempting to use non-directory as a directory in scan. Put in lost+found." + ); + parent = dev->lost_n_found; + } + + yaffs_add_obj_to_dir(parent, in); + + is_unlinked = (parent == dev->del_dir) + || (parent == dev->unlinked_dir); + + if (is_shrink) { + /* Mark the block as having a shrink header */ + bi->has_shrink_hdr = 1; + } + + /* Note re hardlinks. + * Since we might scan a hardlink before its equivalent object is scanned + * we put them all in a list. + * After scanning is complete, we should have all the objects, so we run + * through this list and fix up all the chains. + */ + + switch (in->variant_type) { + case YAFFS_OBJECT_TYPE_UNKNOWN: + /* Todo got a problem */ + break; + case YAFFS_OBJECT_TYPE_FILE: + + if (in->variant. + file_variant.scanned_size < + file_size) { + /* This covers the case where the file size is greater + * than where the data is + * This will happen if the file is resized to be larger + * than its current data extents. + */ + in->variant. + file_variant. + file_size = + file_size; + in->variant. + file_variant. + scanned_size = + file_size; + } + + if (in->variant.file_variant. + shrink_size > file_size) + in->variant. + file_variant. + shrink_size = + file_size; + + break; + case YAFFS_OBJECT_TYPE_HARDLINK: + if (!is_unlinked) { + in->variant. + hardlink_variant. 
+ equiv_id = equiv_id; + in->hard_links.next = + (struct list_head *) + hard_list; + hard_list = in; + } + break; + case YAFFS_OBJECT_TYPE_DIRECTORY: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SPECIAL: + /* Do nothing */ + break; + case YAFFS_OBJECT_TYPE_SYMLINK: + if (oh) { + in->variant. + symlink_variant. + alias = + yaffs_clone_str(oh-> + alias); + if (!in->variant. + symlink_variant. + alias) + alloc_failed = + 1; + } + break; + } + + } + + } + + } /* End of scanning for each chunk */ + + if (state == YAFFS_BLOCK_STATE_NEEDS_SCANNING) { + /* If we got this far while scanning, then the block is fully allocated. */ + state = YAFFS_BLOCK_STATE_FULL; + } + + bi->block_state = state; + + /* Now let's see if it was dirty */ + if (bi->pages_in_use == 0 && + !bi->has_shrink_hdr && + bi->block_state == YAFFS_BLOCK_STATE_FULL) { + yaffs_block_became_dirty(dev, blk); + } + + } + + yaffs_skip_rest_of_block(dev); + + if (alt_block_index) + vfree(block_index); + else + kfree(block_index); + + /* Ok, we've done all the scanning. + * Fix up the hard link chains. + * We should now have scanned all the objects, now it's time to add these + * hardlinks. + */ + yaffs_link_fixup(dev, hard_list); + + yaffs_release_temp_buffer(dev, chunk_data, __LINE__); + + if (alloc_failed) + return YAFFS_FAIL; + + yaffs_trace(YAFFS_TRACE_SCAN, "yaffs2_scan_backwards ends"); + + return YAFFS_OK; +} diff --git a/fs/yaffs2/yaffs_yaffs2.h b/fs/yaffs2/yaffs_yaffs2.h new file mode 100644 index 00000000..e1a9287f --- /dev/null +++ b/fs/yaffs2/yaffs_yaffs2.h @@ -0,0 +1,39 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. + * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YAFFS_YAFFS2_H__ +#define __YAFFS_YAFFS2_H__ + +#include "yaffs_guts.h" + +void yaffs_calc_oldest_dirty_seq(struct yaffs_dev *dev); +void yaffs2_find_oldest_dirty_seq(struct yaffs_dev *dev); +void yaffs2_clear_oldest_dirty_seq(struct yaffs_dev *dev, + struct yaffs_block_info *bi); +void yaffs2_update_oldest_dirty_seq(struct yaffs_dev *dev, unsigned block_no, + struct yaffs_block_info *bi); +int yaffs_block_ok_for_gc(struct yaffs_dev *dev, struct yaffs_block_info *bi); +u32 yaffs2_find_refresh_block(struct yaffs_dev *dev); +int yaffs2_checkpt_required(struct yaffs_dev *dev); +int yaffs_calc_checkpt_blocks_required(struct yaffs_dev *dev); + +void yaffs2_checkpt_invalidate(struct yaffs_dev *dev); +int yaffs2_checkpt_save(struct yaffs_dev *dev); +int yaffs2_checkpt_restore(struct yaffs_dev *dev); + +int yaffs2_handle_hole(struct yaffs_obj *obj, loff_t new_size); +int yaffs2_scan_backwards(struct yaffs_dev *dev); + +#endif diff --git a/fs/yaffs2/yportenv.h b/fs/yaffs2/yportenv.h new file mode 100644 index 00000000..81834254 --- /dev/null +++ b/fs/yaffs2/yportenv.h @@ -0,0 +1,70 @@ +/* + * YAFFS: Yet another Flash File System . A NAND-flash specific file system. + * + * Copyright (C) 2002-2010 Aleph One Ltd. 
+ * for Toby Churchill Ltd and Brightstar Engineering + * + * Created by Charles Manning + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 as + * published by the Free Software Foundation. + * + * Note: Only YAFFS headers are LGPL, YAFFS C code is covered by GPL. + */ + +#ifndef __YPORTENV_LINUX_H__ +#define __YPORTENV_LINUX_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define YCHAR char +#define YUCHAR unsigned char +#define _Y(x) x + +#define YAFFS_LOSTNFOUND_NAME "lost+found" +#define YAFFS_LOSTNFOUND_PREFIX "obj" + + +#define YAFFS_ROOT_MODE 0755 +#define YAFFS_LOSTNFOUND_MODE 0700 + +#define Y_CURRENT_TIME CURRENT_TIME.tv_sec +#define Y_TIME_CONVERT(x) (x).tv_sec + +#define compile_time_assertion(assertion) \ + ({ int x = __builtin_choose_expr(assertion, 0, (void)0); (void) x; }) + + +#ifndef Y_DUMP_STACK +#define Y_DUMP_STACK() dump_stack() +#endif + +#define yaffs_trace(msk, fmt, ...) do { \ + if(yaffs_trace_mask & (msk)) \ + printk(KERN_DEBUG "yaffs: " fmt "\n", ##__VA_ARGS__); \ +} while(0) + +#ifndef YBUG +#define YBUG() do {\ + yaffs_trace(YAFFS_TRACE_BUG,\ + "bug " __FILE__ " %d",\ + __LINE__);\ + Y_DUMP_STACK();\ +} while (0) +#endif + +#endif -- cgit v1.2.3
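
As a stand-alone illustration (a sketch, not part of the patch above): the /proc/yaffs write handler yaffs_proc_write_trace_options accepts whitespace-separated commands of the form "+name", "-name", "=name", or a number with the same prefixes, where "+" ORs the named bits into yaffs_trace_mask, "-" clears them, "=" replaces the whole mask, and YAFFS_TRACE_ALWAYS is forced back on afterwards. The user-space C sketch below models that grammar; its flag names and bit values are assumptions made up for the example, not the kernel's YAFFS_TRACE_* constants.

/* trace_mask_demo.c - user-space model of the yaffs trace-mask grammar.
 * Sketch only: the mask names and bit values below are made up for the demo.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TRACE_ALWAYS 0x40000000u	/* stand-in for YAFFS_TRACE_ALWAYS */

static const struct {
	const char *name;
	unsigned bits;
} flags[] = {
	{"scan", 0x00000010u},	/* stand-in for YAFFS_TRACE_SCAN */
	{"gc", 0x00000100u},	/* stand-in for YAFFS_TRACE_GC   */
	{"all", 0xffffffffu},
	{"none", 0x00000000u},
	{NULL, 0}
};

static unsigned apply_bits(unsigned mask, char op, unsigned bits)
{
	switch (op) {
	case '-':
		return mask & ~bits;
	case '=':
		return bits;
	default:		/* '+' or no prefix: OR the bits in */
		return mask | bits;
	}
}

/* Apply one whitespace-separated command string to the current mask. */
static unsigned apply_trace_cmds(unsigned mask, const char *buf)
{
	while (*buf) {
		char op = ' ';
		char name[41];
		int i, n = 0;

		while (isspace((unsigned char)*buf))
			buf++;
		if (*buf == '+' || *buf == '-' || *buf == '=')
			op = *buf++;

		if (isdigit((unsigned char)*buf)) {
			char *end;
			unsigned bits = (unsigned)strtoul(buf, &end, 0);

			buf = end;
			mask = apply_bits(mask, op, bits);
			continue;
		}

		while ((*buf == '_' || (*buf >= 'a' && *buf <= 'z')) && n < 40)
			name[n++] = *buf++;
		name[n] = '\0';
		if (n == 0)
			break;	/* nothing recognisable left */

		for (i = 0; flags[i].name; i++)
			if (strcmp(name, flags[i].name) == 0) {
				mask = apply_bits(mask, op, flags[i].bits);
				break;
			}
	}
	return mask | TRACE_ALWAYS;	/* "always" can never be switched off */
}

int main(void)
{
	unsigned mask = TRACE_ALWAYS;

	/* e.g. echo "+scan +gc" > /proc/yaffs, later followed by "-gc" */
	mask = apply_trace_cmds(mask, "+scan +gc");
	mask = apply_trace_cmds(mask, "-gc");
	printf("trace mask = 0x%08x\n", mask);
	return 0;
}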